From cc621e1829f61df96473cd04fb8ba27a65c99e13 Mon Sep 17 00:00:00 2001
From: Jeremy Andrews
Date: Sun, 26 Sep 2021 12:50:43 -0500
Subject: Issue #3003 - Move libaom to libs/

---
 libs/libaom/src/.clang-format | 148 +
 libs/libaom/src/.cmake-format.py | 102 +
 libs/libaom/src/.gitattributes | 18 +
 libs/libaom/src/.mailmap | 91 +
 libs/libaom/src/AUTHORS | 260 +
 libs/libaom/src/CHANGELOG | 51 +
 libs/libaom/src/CMakeLists.txt | 768 ++
 libs/libaom/src/LICENSE | 27 +
 libs/libaom/src/PATENTS | 108 +
 libs/libaom/src/README.md | 665 +
 libs/libaom/src/Sample.cfg | 35 +
 libs/libaom/src/aom/aom.h | 109 +
 libs/libaom/src/aom/aom_codec.h | 478 +
 libs/libaom/src/aom/aom_decoder.h | 257 +
 libs/libaom/src/aom/aom_encoder.h | 1136 ++
 libs/libaom/src/aom/aom_frame_buffer.h | 84 +
 libs/libaom/src/aom/aom_image.h | 430 +
 libs/libaom/src/aom/aom_integer.h | 103 +
 libs/libaom/src/aom/aomcx.h | 1774 +++
 libs/libaom/src/aom/aomdx.h | 397 +
 libs/libaom/src/aom/exports_com | 41 +
 libs/libaom/src/aom/exports_dec | 8 +
 libs/libaom/src/aom/exports_enc | 17 +
 libs/libaom/src/aom/exports_test | 4 +
 libs/libaom/src/aom/internal/aom_codec_internal.h | 381 +
 libs/libaom/src/aom/internal/aom_image_internal.h | 93 +
 libs/libaom/src/aom/src/aom_codec.c | 157 +
 libs/libaom/src/aom/src/aom_decoder.c | 137 +
 libs/libaom/src/aom/src/aom_encoder.c | 302 +
 libs/libaom/src/aom/src/aom_image.c | 395 +
 libs/libaom/src/aom/src/aom_integer.c | 105 +
 libs/libaom/src/aom_dsp/aom_convolve.c | 239 +
 libs/libaom/src/aom_dsp/aom_dsp.cmake | 422 +
 libs/libaom/src/aom_dsp/aom_dsp_common.h | 101 +
 libs/libaom/src/aom_dsp/aom_dsp_rtcd.c | 18 +
 libs/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl | 1785 +++
 libs/libaom/src/aom_dsp/aom_filter.h | 56 +
 libs/libaom/src/aom_dsp/aom_simd.h | 38 +
 libs/libaom/src/aom_dsp/aom_simd_inline.h | 21 +
 libs/libaom/src/aom_dsp/arm/avg_neon.c | 74 +
 libs/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c | 451 +
 libs/libaom/src/aom_dsp/arm/fwd_txfm_neon.c | 316 +
 libs/libaom/src/aom_dsp/arm/hadamard_neon.c | 183 +
 libs/libaom/src/aom_dsp/arm/intrapred_neon.c | 590 +
 libs/libaom/src/aom_dsp/arm/loopfilter_neon.c | 927 ++
 libs/libaom/src/aom_dsp/arm/sad4d_neon.c | 226 +
 libs/libaom/src/aom_dsp/arm/sad_neon.c | 224 +
 libs/libaom/src/aom_dsp/arm/sse_neon.c | 487 +
 libs/libaom/src/aom_dsp/arm/subpel_variance_neon.c | 131 +
 libs/libaom/src/aom_dsp/arm/subtract_neon.c | 81 +
 libs/libaom/src/aom_dsp/arm/sum_neon.h | 37 +
 libs/libaom/src/aom_dsp/arm/variance_neon.c | 401 +
 libs/libaom/src/aom_dsp/avg.c | 486 +
 libs/libaom/src/aom_dsp/binary_codes_reader.c | 56 +
 libs/libaom/src/aom_dsp/binary_codes_reader.h | 44 +
 libs/libaom/src/aom_dsp/binary_codes_writer.c | 138 +
 libs/libaom/src/aom_dsp/binary_codes_writer.h | 65 +
 libs/libaom/src/aom_dsp/bitreader.c | 41 +
 libs/libaom/src/aom_dsp/bitreader.h | 228 +
 libs/libaom/src/aom_dsp/bitreader_buffer.c | 116 +
 libs/libaom/src/aom_dsp/bitreader_buffer.h | 53 +
 libs/libaom/src/aom_dsp/bitwriter.c | 31 +
 libs/libaom/src/aom_dsp/bitwriter.h | 118 +
 libs/libaom/src/aom_dsp/bitwriter_buffer.c | 141 +
 libs/libaom/src/aom_dsp/bitwriter_buffer.h | 55 +
 libs/libaom/src/aom_dsp/blend.h | 45 +
 libs/libaom/src/aom_dsp/blend_a64_hmask.c | 71 +
 libs/libaom/src/aom_dsp/blend_a64_mask.c | 349 +
 libs/libaom/src/aom_dsp/blend_a64_vmask.c | 73 +
 libs/libaom/src/aom_dsp/blk_sse_sum.c | 26 +
 libs/libaom/src/aom_dsp/entcode.c | 49 +
 libs/libaom/src/aom_dsp/entcode.h | 41 +
 libs/libaom/src/aom_dsp/entdec.c | 247 +
 libs/libaom/src/aom_dsp/entdec.h | 81 +
 libs/libaom/src/aom_dsp/entenc.c | 423 +
 libs/libaom/src/aom_dsp/entenc.h | 85 +
 libs/libaom/src/aom_dsp/fastssim.c | 487 +
 libs/libaom/src/aom_dsp/fft.c | 219 +
 libs/libaom/src/aom_dsp/fft_common.h | 1050 ++
 libs/libaom/src/aom_dsp/fwd_txfm.c | 229 +
 libs/libaom/src/aom_dsp/grain_synthesis.c | 1408 ++
 libs/libaom/src/aom_dsp/grain_synthesis.h | 192 +
 libs/libaom/src/aom_dsp/grain_table.c | 334 +
 libs/libaom/src/aom_dsp/grain_table.h | 102 +
 libs/libaom/src/aom_dsp/intrapred.c | 792 ++
 libs/libaom/src/aom_dsp/intrapred_common.h | 47 +
 libs/libaom/src/aom_dsp/loopfilter.c | 929 ++
 .../src/aom_dsp/mips/aom_convolve8_horiz_msa.c | 693 +
 .../src/aom_dsp/mips/aom_convolve8_vert_msa.c | 699 +
 .../src/aom_dsp/mips/aom_convolve_copy_msa.c | 248 +
 libs/libaom/src/aom_dsp/mips/aom_convolve_msa.h | 79 +
 libs/libaom/src/aom_dsp/mips/common_dspr2.c | 31 +
 libs/libaom/src/aom_dsp/mips/common_dspr2.h | 51 +
 libs/libaom/src/aom_dsp/mips/convolve2_dspr2.c | 1031 ++
 .../src/aom_dsp/mips/convolve2_horiz_dspr2.c | 681 +
 .../libaom/src/aom_dsp/mips/convolve2_vert_dspr2.c | 237 +
 libs/libaom/src/aom_dsp/mips/convolve8_dspr2.c | 222 +
 .../src/aom_dsp/mips/convolve8_horiz_dspr2.c | 879 ++
 .../libaom/src/aom_dsp/mips/convolve8_vert_dspr2.c | 361 +
 .../src/aom_dsp/mips/convolve_common_dspr2.h | 48 +
 libs/libaom/src/aom_dsp/mips/intrapred16_dspr2.c | 327 +
 libs/libaom/src/aom_dsp/mips/intrapred4_dspr2.c | 82 +
 libs/libaom/src/aom_dsp/mips/intrapred8_dspr2.c | 150 +
 libs/libaom/src/aom_dsp/mips/intrapred_msa.c | 550 +
 libs/libaom/src/aom_dsp/mips/loopfilter_16_msa.c | 1488 +++
 libs/libaom/src/aom_dsp/mips/loopfilter_4_msa.c | 147 +
 libs/libaom/src/aom_dsp/mips/loopfilter_8_msa.c | 333 +
 .../src/aom_dsp/mips/loopfilter_filters_dspr2.c | 328 +
 .../src/aom_dsp/mips/loopfilter_filters_dspr2.h | 736 ++
 .../src/aom_dsp/mips/loopfilter_macros_dspr2.h | 437 +
 .../src/aom_dsp/mips/loopfilter_masks_dspr2.h | 357 +
 libs/libaom/src/aom_dsp/mips/loopfilter_mb_dspr2.c | 590 +
 .../src/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c | 734 ++
 .../src/aom_dsp/mips/loopfilter_mb_vert_dspr2.c | 758 ++
 libs/libaom/src/aom_dsp/mips/loopfilter_msa.h | 251 +
 libs/libaom/src/aom_dsp/mips/macros_msa.h | 2058 +++
 libs/libaom/src/aom_dsp/mips/sad_msa.c | 800 ++
 .../src/aom_dsp/mips/sub_pixel_variance_msa.c | 1792 +++
 libs/libaom/src/aom_dsp/mips/subtract_msa.c | 266 +
 libs/libaom/src/aom_dsp/mips/variance_msa.c | 633 +
 libs/libaom/src/aom_dsp/noise_model.c | 1654 +++
 libs/libaom/src/aom_dsp/noise_model.h | 323 +
 libs/libaom/src/aom_dsp/noise_util.c | 223 +
 libs/libaom/src/aom_dsp/noise_util.h | 68 +
 libs/libaom/src/aom_dsp/prob.h | 670 +
 libs/libaom/src/aom_dsp/psnr.c | 439 +
 libs/libaom/src/aom_dsp/psnr.h | 93 +
 libs/libaom/src/aom_dsp/psnrhvs.c | 277 +
 libs/libaom/src/aom_dsp/quantize.c | 466 +
 libs/libaom/src/aom_dsp/quantize.h | 124 +
 libs/libaom/src/aom_dsp/recenter.h | 61 +
 libs/libaom/src/aom_dsp/sad.c | 319 +
 libs/libaom/src/aom_dsp/sad_av1.c | 264 +
 libs/libaom/src/aom_dsp/simd/v128_intrinsics.h | 346 +
 libs/libaom/src/aom_dsp/simd/v128_intrinsics_arm.h | 973 ++
 libs/libaom/src/aom_dsp/simd/v128_intrinsics_c.h | 903 ++
 libs/libaom/src/aom_dsp/simd/v128_intrinsics_x86.h | 657 +
 libs/libaom/src/aom_dsp/simd/v256_intrinsics.h | 377 +
 libs/libaom/src/aom_dsp/simd/v256_intrinsics_arm.h | 17 +
 libs/libaom/src/aom_dsp/simd/v256_intrinsics_c.h | 968 ++
 .../libaom/src/aom_dsp/simd/v256_intrinsics_v128.h | 876 ++
 libs/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h | 750 ++
 libs/libaom/src/aom_dsp/simd/v64_intrinsics.h | 234 +
 libs/libaom/src/aom_dsp/simd/v64_intrinsics_arm.h | 684 +
 libs/libaom/src/aom_dsp/simd/v64_intrinsics_c.h | 982 ++
 libs/libaom/src/aom_dsp/simd/v64_intrinsics_x86.h | 491 +
 libs/libaom/src/aom_dsp/sse.c | 54 +
 libs/libaom/src/aom_dsp/ssim.c | 441 +
 libs/libaom/src/aom_dsp/ssim.h | 87 +
 libs/libaom/src/aom_dsp/subtract.c | 55 +
 libs/libaom/src/aom_dsp/sum_squares.c | 73 +
 libs/libaom/src/aom_dsp/txfm_common.h | 91 +
 libs/libaom/src/aom_dsp/variance.c | 1483 +++
 libs/libaom/src/aom_dsp/variance.h | 129 +
 libs/libaom/src/aom_dsp/vmaf.c | 159 +
 libs/libaom/src/aom_dsp/vmaf.h | 27 +
 .../src/aom_dsp/x86/adaptive_quantize_avx2.c | 244 +
 .../src/aom_dsp/x86/adaptive_quantize_sse2.c | 633 +
 libs/libaom/src/aom_dsp/x86/aom_asm_stubs.c | 95 +
 .../src/aom_dsp/x86/aom_convolve_copy_sse2.asm | 297 +
 .../src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm | 613 +
 .../x86/aom_high_subpixel_bilinear_sse2.asm | 367 +
 .../src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 1441 +++
 .../src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c | 569 +
 .../src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c | 770 ++
 .../src/aom_dsp/x86/aom_subpixel_8t_sse2.asm | 615 +
 .../src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm | 870 ++
 .../src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm | 295 +
 .../aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm | 267 +
 libs/libaom/src/aom_dsp/x86/avg_intrin_avx2.c | 504 +
 libs/libaom/src/aom_dsp/x86/avg_intrin_sse2.c | 512 +
 .../src/aom_dsp/x86/bitdepth_conversion_avx2.h | 32 +
 .../src/aom_dsp/x86/bitdepth_conversion_sse2.h | 35 +
 libs/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c | 36 +
 libs/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c | 1374 ++
 libs/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c | 1560 +++
 libs/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c | 285 +
 libs/libaom/src/aom_dsp/x86/blend_mask_sse4.h | 237 +
 libs/libaom/src/aom_dsp/x86/blend_sse4.h | 191 +
 libs/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c | 185 +
 libs/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c | 138 +
 libs/libaom/src/aom_dsp/x86/common_avx2.h | 147 +
 libs/libaom/src/aom_dsp/x86/convolve.h | 203 +
 libs/libaom/src/aom_dsp/x86/convolve_avx2.h | 463 +
 .../src/aom_dsp/x86/convolve_common_intrin.h | 31 +
 libs/libaom/src/aom_dsp/x86/convolve_sse2.h | 121 +
 libs/libaom/src/aom_dsp/x86/convolve_sse4_1.h | 53 +
 libs/libaom/src/aom_dsp/x86/fft_avx2.c | 74 +
 libs/libaom/src/aom_dsp/x86/fft_sse2.c | 167 +
 libs/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h | 544 +
 libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c | 39 +
 libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h | 160 +
 .../src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 379 +
 .../aom_dsp/x86/highbd_adaptive_quantize_avx2.c | 457 +
 .../aom_dsp/x86/highbd_adaptive_quantize_sse2.c | 732 ++
 libs/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c | 1323 ++
 libs/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c | 351 +
 .../libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c | 251 +
 .../src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm | 259 +
 .../libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c | 984 ++
 .../src/aom_dsp/x86/highbd_loopfilter_avx2.c | 66 +
 .../src/aom_dsp/x86/highbd_loopfilter_sse2.c | 1698 +++
 .../src/aom_dsp/x86/highbd_quantize_intrin_avx2.c | 160 +
 .../src/aom_dsp/x86/highbd_quantize_intrin_sse2.c | 206 +
 libs/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm | 296 +
 libs/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm | 442 +
 .../x86/highbd_subpel_variance_impl_sse2.asm | 1024 ++
 libs/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c | 267 +
 libs/libaom/src/aom_dsp/x86/highbd_variance_avx2.c | 140 +
 .../src/aom_dsp/x86/highbd_variance_impl_sse2.asm | 318 +
 libs/libaom/src/aom_dsp/x86/highbd_variance_sse2.c | 842 ++
 libs/libaom/src/aom_dsp/x86/highbd_variance_sse4.c | 216 +
 libs/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm | 608 +
 libs/libaom/src/aom_dsp/x86/intrapred_avx2.c | 4895 +++++++
 libs/libaom/src/aom_dsp/x86/intrapred_sse2.c | 1411 ++
 libs/libaom/src/aom_dsp/x86/intrapred_ssse3.c | 1695 +++
 libs/libaom/src/aom_dsp/x86/intrapred_x86.h | 38 +
 libs/libaom/src/aom_dsp/x86/inv_wht_sse2.asm | 107 +
 libs/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c | 238 +
 libs/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c | 192 +
 libs/libaom/src/aom_dsp/x86/loopfilter_sse2.c | 2100 +++
 libs/libaom/src/aom_dsp/x86/lpf_common_sse2.h | 495 +
 libs/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c | 266 +
 .../src/aom_dsp/x86/masked_sad_intrin_avx2.c | 389 +
 .../src/aom_dsp/x86/masked_sad_intrin_ssse3.c | 402 +
 .../src/aom_dsp/x86/masked_sad_intrin_ssse3.h | 33 +
 .../src/aom_dsp/x86/masked_variance_intrin_ssse3.c | 1067 ++
 .../src/aom_dsp/x86/masked_variance_intrin_ssse3.h | 92 +
 libs/libaom/src/aom_dsp/x86/mem_sse2.h | 42 +
 libs/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h | 58 +
 libs/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h | 54 +
 libs/libaom/src/aom_dsp/x86/obmc_sad_avx2.c | 270 +
 libs/libaom/src/aom_dsp/x86/obmc_sad_sse4.c | 268 +
 libs/libaom/src/aom_dsp/x86/obmc_variance_avx2.c | 190 +
 libs/libaom/src/aom_dsp/x86/obmc_variance_sse4.c | 381 +
 .../libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm | 464 +
 libs/libaom/src/aom_dsp/x86/quantize_sse2.c | 125 +
 libs/libaom/src/aom_dsp/x86/quantize_ssse3.c | 192 +
 .../src/aom_dsp/x86/quantize_ssse3_x86_64.asm | 302 +
 libs/libaom/src/aom_dsp/x86/quantize_x86.h | 202 +
 libs/libaom/src/aom_dsp/x86/sad4d_avx2.c | 106 +
 libs/libaom/src/aom_dsp/x86/sad4d_sse2.asm | 428 +
 libs/libaom/src/aom_dsp/x86/sad_avx2.c | 189 +
 libs/libaom/src/aom_dsp/x86/sad_highbd_avx2.c | 699 +
 libs/libaom/src/aom_dsp/x86/sad_impl_avx2.c | 159 +
 libs/libaom/src/aom_dsp/x86/sad_sse2.asm | 353 +
 libs/libaom/src/aom_dsp/x86/sse_avx2.c | 384 +
 libs/libaom/src/aom_dsp/x86/sse_sse4.c | 353 +
 libs/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm | 222 +
 .../src/aom_dsp/x86/subpel_variance_sse2.asm | 1470 +++
 libs/libaom/src/aom_dsp/x86/subtract_avx2.c | 108 +
 libs/libaom/src/aom_dsp/x86/subtract_sse2.asm | 146 +
 libs/libaom/src/aom_dsp/x86/sum_squares_avx2.c | 248 +
 libs/libaom/src/aom_dsp/x86/sum_squares_sse2.c | 366 +
 libs/libaom/src/aom_dsp/x86/sum_squares_sse2.h | 22 +
 libs/libaom/src/aom_dsp/x86/synonyms.h | 118 +
 libs/libaom/src/aom_dsp/x86/synonyms_avx2.h | 79 +
 libs/libaom/src/aom_dsp/x86/transpose_sse2.h | 420 +
 libs/libaom/src/aom_dsp/x86/txfm_common_avx2.h | 360 +
 libs/libaom/src/aom_dsp/x86/txfm_common_sse2.h | 33 +
 libs/libaom/src/aom_dsp/x86/variance_avx2.c | 526 +
 libs/libaom/src/aom_dsp/x86/variance_impl_avx2.c | 814 ++
 libs/libaom/src/aom_dsp/x86/variance_impl_ssse3.c | 129 +
 libs/libaom/src/aom_dsp/x86/variance_sse2.c | 757 ++
 libs/libaom/src/aom_mem/aom_mem.c | 84 +
 libs/libaom/src/aom_mem/aom_mem.cmake | 29 +
 libs/libaom/src/aom_mem/aom_mem.h | 74 +
 libs/libaom/src/aom_mem/include/aom_mem_intrnl.h | 29 +
 libs/libaom/src/aom_ports/aom_once.h | 104 +
 libs/libaom/src/aom_ports/aom_ports.cmake | 92 +
 libs/libaom/src/aom_ports/aom_timer.h | 111 +
 libs/libaom/src/aom_ports/arm.h | 41 +
 libs/libaom/src/aom_ports/arm_cpudetect.c | 150 +
 libs/libaom/src/aom_ports/bitops.h | 78 +
 libs/libaom/src/aom_ports/emmintrin_compat.h | 56 +
 libs/libaom/src/aom_ports/emms.asm | 41 +
 libs/libaom/src/aom_ports/mem.h | 99 +
 libs/libaom/src/aom_ports/mem_ops.h | 228 +
 libs/libaom/src/aom_ports/mem_ops_aligned.h | 173 +
 libs/libaom/src/aom_ports/msvc.h | 75 +
 libs/libaom/src/aom_ports/ppc.h | 30 +
 libs/libaom/src/aom_ports/ppc_cpudetect.c | 82 +
 libs/libaom/src/aom_ports/sanitizer.h | 38 +
 libs/libaom/src/aom_ports/system_state.h | 23 +
 libs/libaom/src/aom_ports/x86.h | 375 +
 libs/libaom/src/aom_ports/x86_abi_support.asm | 402 +
 libs/libaom/src/aom_scale/aom_scale.cmake | 45 +
 libs/libaom/src/aom_scale/aom_scale.h | 23 +
 libs/libaom/src/aom_scale/aom_scale_rtcd.c | 18 +
 libs/libaom/src/aom_scale/aom_scale_rtcd.pl | 55 +
 libs/libaom/src/aom_scale/generic/aom_scale.c | 506 +
 libs/libaom/src/aom_scale/generic/gen_scalers.c | 201 +
 libs/libaom/src/aom_scale/generic/yv12config.c | 269 +
 libs/libaom/src/aom_scale/generic/yv12extend.c | 477 +
 .../src/aom_scale/mips/dspr2/yv12extend_dspr2.c | 142 +
 libs/libaom/src/aom_scale/yv12config.h | 159 +
 libs/libaom/src/aom_util/aom_thread.c | 212 +
 libs/libaom/src/aom_util/aom_thread.h | 364 +
 libs/libaom/src/aom_util/aom_util.cmake | 31 +
 libs/libaom/src/aom_util/debug_util.c | 279 +
 libs/libaom/src/aom_util/debug_util.h | 69 +
 libs/libaom/src/aom_util/endian_inl.h | 122 +
 libs/libaom/src/apps/aomdec.c | 1024 ++
 libs/libaom/src/apps/aomenc.c | 2752 ++++
 libs/libaom/src/apps/aomenc.h | 65 +
 libs/libaom/src/av1/av1.cmake | 580 +
 libs/libaom/src/av1/av1_cx_iface.c | 2936 +++++
 libs/libaom/src/av1/av1_dx_iface.c | 1397 ++
 libs/libaom/src/av1/av1_iface_common.h | 144 +
 libs/libaom/src/av1/common/alloccommon.c | 309 +
 libs/libaom/src/av1/common/alloccommon.h | 56 +
 libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.c | 4271 ++++++
 libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.h | 154 +
 libs/libaom/src/av1/common/arm/av1_txfm_neon.c | 30 +
 .../src/av1/common/arm/blend_a64_hmask_neon.c | 134 +
 .../src/av1/common/arm/blend_a64_vmask_neon.c | 141 +
 libs/libaom/src/av1/common/arm/cfl_neon.c | 588 +
 libs/libaom/src/av1/common/arm/convolve_neon.c | 1593 +++
 libs/libaom/src/av1/common/arm/convolve_neon.h | 228 +
 libs/libaom/src/av1/common/arm/jnt_convolve_neon.c | 1739 +++
 libs/libaom/src/av1/common/arm/mem_neon.h | 539 +
 libs/libaom/src/av1/common/arm/reconinter_neon.c | 86 +
 libs/libaom/src/av1/common/arm/selfguided_neon.c | 1590 +++
 libs/libaom/src/av1/common/arm/transpose_neon.h | 602 +
 libs/libaom/src/av1/common/arm/warp_plane_neon.c | 714 +
 .../src/av1/common/arm/wiener_convolve_neon.c | 530 +
 libs/libaom/src/av1/common/av1_common_int.h | 1557 +++
 libs/libaom/src/av1/common/av1_inv_txfm1d.c | 1841 +++
 libs/libaom/src/av1/common/av1_inv_txfm1d.h | 61 +
 libs/libaom/src/av1/common/av1_inv_txfm1d_cfg.h | 47 +
 libs/libaom/src/av1/common/av1_inv_txfm2d.c | 504 +
 libs/libaom/src/av1/common/av1_loopfilter.c | 790 ++
 libs/libaom/src/av1/common/av1_loopfilter.h | 208 +
 libs/libaom/src/av1/common/av1_rtcd.c | 22 +
 libs/libaom/src/av1/common/av1_rtcd_defs.pl | 496 +
 libs/libaom/src/av1/common/av1_txfm.c | 161 +
 libs/libaom/src/av1/common/av1_txfm.h | 234 +
 libs/libaom/src/av1/common/blockd.c | 102 +
 libs/libaom/src/av1/common/blockd.h | 1296 ++
 libs/libaom/src/av1/common/cdef.c | 388 +
 libs/libaom/src/av1/common/cdef.h | 52 +
 libs/libaom/src/av1/common/cdef_block.c | 253 +
 libs/libaom/src/av1/common/cdef_block.h | 58 +
 libs/libaom/src/av1/common/cdef_block_avx2.c | 14 +
 libs/libaom/src/av1/common/cdef_block_neon.c | 14 +
 libs/libaom/src/av1/common/cdef_block_simd.h | 915 ++
 libs/libaom/src/av1/common/cdef_block_sse2.c | 14 +
 libs/libaom/src/av1/common/cdef_block_sse4.c | 14 +
 libs/libaom/src/av1/common/cdef_block_ssse3.c | 14 +
 libs/libaom/src/av1/common/cfl.c | 436 +
 libs/libaom/src/av1/common/cfl.h | 288 +
 libs/libaom/src/av1/common/common.h | 63 +
 libs/libaom/src/av1/common/common_data.h | 446 +
 libs/libaom/src/av1/common/convolve.c | 1274 ++
 libs/libaom/src/av1/common/convolve.h | 136 +
 libs/libaom/src/av1/common/debugmodes.c | 113 +
 libs/libaom/src/av1/common/entropy.c | 179 +
 libs/libaom/src/av1/common/entropy.h | 181 +
 libs/libaom/src/av1/common/entropymode.c | 1103 ++
 libs/libaom/src/av1/common/entropymode.h | 212 +
 libs/libaom/src/av1/common/entropymv.c | 67 +
 libs/libaom/src/av1/common/entropymv.h | 104 +
 libs/libaom/src/av1/common/enums.h | 678 +
 libs/libaom/src/av1/common/filter.h | 279 +
 libs/libaom/src/av1/common/frame_buffers.c | 98 +
 libs/libaom/src/av1/common/frame_buffers.h | 60 +
 libs/libaom/src/av1/common/idct.c | 322 +
 libs/libaom/src/av1/common/idct.h | 51 +
 libs/libaom/src/av1/common/loopfiltermask.c | 1458 +++
 libs/libaom/src/av1/common/mv.h | 354 +
 libs/libaom/src/av1/common/mvref_common.c | 1511 +++
 libs/libaom/src/av1/common/mvref_common.h | 341 +
 libs/libaom/src/av1/common/obmc.h | 89 +
 libs/libaom/src/av1/common/obu_util.c | 154 +
 libs/libaom/src/av1/common/obu_util.h | 47 +
 libs/libaom/src/av1/common/odintrin.c | 541 +
 libs/libaom/src/av1/common/odintrin.h | 96 +
 libs/libaom/src/av1/common/ppc/cfl_ppc.c | 152 +
 libs/libaom/src/av1/common/pred_common.c | 501 +
 libs/libaom/src/av1/common/pred_common.h | 374 +
 libs/libaom/src/av1/common/quant_common.c | 12875 +++++++++++++++++++
 libs/libaom/src/av1/common/quant_common.h | 83 +
 libs/libaom/src/av1/common/reconinter.c | 1426 ++
 libs/libaom/src/av1/common/reconinter.h | 414 +
 libs/libaom/src/av1/common/reconintra.c | 1704 +++
 libs/libaom/src/av1/common/reconintra.h | 151 +
 libs/libaom/src/av1/common/resize.c | 1455 +++
 libs/libaom/src/av1/common/resize.h | 115 +
 libs/libaom/src/av1/common/restoration.c | 1566 +++
 libs/libaom/src/av1/common/restoration.h | 380 +
 libs/libaom/src/av1/common/scale.c | 128 +
 libs/libaom/src/av1/common/scale.h | 69 +
 libs/libaom/src/av1/common/scan.c | 2048 +++
 libs/libaom/src/av1/common/scan.h | 55 +
 libs/libaom/src/av1/common/seg_common.c | 91 +
 libs/libaom/src/av1/common/seg_common.h | 104 +
 libs/libaom/src/av1/common/thread_common.c | 930 ++
 libs/libaom/src/av1/common/thread_common.h | 122 +
 libs/libaom/src/av1/common/tile_common.c | 239 +
 libs/libaom/src/av1/common/tile_common.h | 75 +
 libs/libaom/src/av1/common/timing.c | 92 +
 libs/libaom/src/av1/common/timing.h | 55 +
 libs/libaom/src/av1/common/token_cdfs.h | 3555 +++++
 libs/libaom/src/av1/common/txb_common.c | 458 +
 libs/libaom/src/av1/common/txb_common.h | 442 +
 libs/libaom/src/av1/common/warped_motion.c | 1073 ++
 libs/libaom/src/av1/common/warped_motion.h | 186 +
 .../av1/common/x86/av1_convolve_horiz_rs_sse4.c | 228 +
 .../src/av1/common/x86/av1_convolve_scale_sse4.c | 498 +
 libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c | 1949 +++
 libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h | 71 +
 .../libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c | 2956 +++++
 .../libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h | 232 +
 libs/libaom/src/av1/common/x86/av1_txfm_sse2.h | 317 +
 libs/libaom/src/av1/common/x86/av1_txfm_sse4.c | 21 +
 libs/libaom/src/av1/common/x86/av1_txfm_sse4.h | 72 +
 libs/libaom/src/av1/common/x86/cfl_avx2.c | 495 +
 libs/libaom/src/av1/common/x86/cfl_simd.h | 246 +
 libs/libaom/src/av1/common/x86/cfl_sse2.c | 89 +
 libs/libaom/src/av1/common/x86/cfl_ssse3.c | 397 +
 libs/libaom/src/av1/common/x86/convolve_2d_avx2.c | 317 +
 libs/libaom/src/av1/common/x86/convolve_2d_sse2.c | 471 +
 libs/libaom/src/av1/common/x86/convolve_avx2.c | 439 +
 libs/libaom/src/av1/common/x86/convolve_sse2.c | 338 +
 libs/libaom/src/av1/common/x86/filterintra_sse4.c | 71 +
 .../src/av1/common/x86/highbd_convolve_2d_avx2.c | 326 +
 .../src/av1/common/x86/highbd_convolve_2d_sse2.c | 191 +
 .../src/av1/common/x86/highbd_convolve_2d_sse4.c | 425 +
 .../src/av1/common/x86/highbd_convolve_2d_ssse3.c | 217 +
 .../src/av1/common/x86/highbd_inv_txfm_avx2.c | 4246 ++++++
 .../src/av1/common/x86/highbd_inv_txfm_sse4.c | 5821 +++++++++
 .../src/av1/common/x86/highbd_jnt_convolve_avx2.c | 859 ++
 .../src/av1/common/x86/highbd_jnt_convolve_sse4.c | 387 +
 .../src/av1/common/x86/highbd_txfm_utility_sse4.h | 132 +
 .../src/av1/common/x86/highbd_warp_plane_sse4.c | 632 +
 .../av1/common/x86/highbd_wiener_convolve_avx2.c | 245 +
 .../av1/common/x86/highbd_wiener_convolve_ssse3.c | 202 +
 libs/libaom/src/av1/common/x86/intra_edge_sse4.c | 318 +
 libs/libaom/src/av1/common/x86/jnt_convolve_avx2.c | 917 ++
 libs/libaom/src/av1/common/x86/jnt_convolve_sse2.c | 615 +
 .../libaom/src/av1/common/x86/jnt_convolve_ssse3.c | 231 +
 libs/libaom/src/av1/common/x86/reconinter_avx2.c | 620 +
 libs/libaom/src/av1/common/x86/reconinter_sse4.c | 153 +
 libs/libaom/src/av1/common/x86/reconinter_ssse3.c | 116 +
 libs/libaom/src/av1/common/x86/selfguided_avx2.c | 724 ++
 libs/libaom/src/av1/common/x86/selfguided_sse4.c | 662 +
 libs/libaom/src/av1/common/x86/warp_plane_avx2.c | 1318 ++
 libs/libaom/src/av1/common/x86/warp_plane_sse2.c | 88 +
 libs/libaom/src/av1/common/x86/warp_plane_sse4.c | 963 +
 .../src/av1/common/x86/wiener_convolve_avx2.c | 242 +
 .../src/av1/common/x86/wiener_convolve_sse2.c | 199 +
 libs/libaom/src/av1/decoder/accounting.c | 138 +
 libs/libaom/src/av1/decoder/accounting.h | 82 +
 libs/libaom/src/av1/decoder/decodeframe.c | 5326 ++++++++
 libs/libaom/src/av1/decoder/decodeframe.h | 87 +
 libs/libaom/src/av1/decoder/decodemv.c | 1575 +++
 libs/libaom/src/av1/decoder/decodemv.h | 33 +
 libs/libaom/src/av1/decoder/decoder.c | 539 +
 libs/libaom/src/av1/decoder/decoder.h | 331 +
 libs/libaom/src/av1/decoder/decodetxb.c | 379 +
 libs/libaom/src/av1/decoder/decodetxb.h | 32 +
 libs/libaom/src/av1/decoder/detokenize.c | 78 +
 libs/libaom/src/av1/decoder/detokenize.h | 29 +
 libs/libaom/src/av1/decoder/dthread.h | 51 +
 libs/libaom/src/av1/decoder/inspection.c | 154 +
 libs/libaom/src/av1/decoder/inspection.h | 91 +
 libs/libaom/src/av1/decoder/obu.c | 1085 ++
 libs/libaom/src/av1/decoder/obu.h | 31 +
 libs/libaom/src/av1/encoder/aq_complexity.c | 185 +
 libs/libaom/src/av1/encoder/aq_complexity.h | 37 +
 libs/libaom/src/av1/encoder/aq_cyclicrefresh.c | 501 +
 libs/libaom/src/av1/encoder/aq_cyclicrefresh.h | 132 +
 libs/libaom/src/av1/encoder/aq_variance.c | 205 +
 libs/libaom/src/av1/encoder/aq_variance.h | 33 +
 .../src/av1/encoder/arm/neon/av1_error_neon.c | 85 +
 .../src/av1/encoder/arm/neon/quantize_neon.c | 215 +
 libs/libaom/src/av1/encoder/av1_fwd_txfm1d.c | 1885 +++
 libs/libaom/src/av1/encoder/av1_fwd_txfm1d.h | 49 +
 libs/libaom/src/av1/encoder/av1_fwd_txfm1d_cfg.h | 19 +
 libs/libaom/src/av1/encoder/av1_fwd_txfm2d.c | 429 +
 libs/libaom/src/av1/encoder/av1_multi_thread.c | 70 +
 libs/libaom/src/av1/encoder/av1_multi_thread.h | 21 +
 libs/libaom/src/av1/encoder/av1_quantize.c | 789 ++
 libs/libaom/src/av1/encoder/av1_quantize.h | 163 +
 libs/libaom/src/av1/encoder/bitstream.c | 3925 ++++++
 libs/libaom/src/av1/encoder/bitstream.h | 48 +
 libs/libaom/src/av1/encoder/block.h | 575 +
 libs/libaom/src/av1/encoder/blockiness.c | 142 +
 libs/libaom/src/av1/encoder/cnn.c | 1144 ++
 libs/libaom/src/av1/encoder/cnn.h | 197 +
 libs/libaom/src/av1/encoder/compound_type.c | 1508 +++
 libs/libaom/src/av1/encoder/compound_type.h | 48 +
 libs/libaom/src/av1/encoder/context_tree.c | 268 +
 libs/libaom/src/av1/encoder/context_tree.h | 97 +
 libs/libaom/src/av1/encoder/corner_detect.c | 37 +
 libs/libaom/src/av1/encoder/corner_detect.h | 22 +
 libs/libaom/src/av1/encoder/corner_match.c | 194 +
 libs/libaom/src/av1/encoder/corner_match.h | 33 +
 libs/libaom/src/av1/encoder/cost.c | 46 +
 libs/libaom/src/av1/encoder/cost.h | 51 +
 libs/libaom/src/av1/encoder/dwt.c | 155 +
 libs/libaom/src/av1/encoder/dwt.h | 25 +
 libs/libaom/src/av1/encoder/enc_enums.h | 255 +
 libs/libaom/src/av1/encoder/encode_strategy.c | 1322 ++
 libs/libaom/src/av1/encoder/encode_strategy.h | 64 +
 libs/libaom/src/av1/encoder/encodeframe.c | 6475 ++++++++++
 libs/libaom/src/av1/encoder/encodeframe.h | 49 +
 libs/libaom/src/av1/encoder/encodemb.c | 805 ++
 libs/libaom/src/av1/encoder/encodemb.h | 145 +
 libs/libaom/src/av1/encoder/encodemv.c | 270 +
 libs/libaom/src/av1/encoder/encodemv.h | 76 +
 libs/libaom/src/av1/encoder/encoder.c | 7187 +++++++++++
 libs/libaom/src/av1/encoder/encoder.h | 1965 +++
 libs/libaom/src/av1/encoder/encodetxb.c | 2261 ++++
 libs/libaom/src/av1/encoder/encodetxb.h | 101 +
 libs/libaom/src/av1/encoder/ethread.c | 729 ++
 libs/libaom/src/av1/encoder/ethread.h | 54 +
 libs/libaom/src/av1/encoder/extend.c | 151 +
 libs/libaom/src/av1/encoder/extend.h | 29 +
 libs/libaom/src/av1/encoder/firstpass.c | 1065 ++
 libs/libaom/src/av1/encoder/firstpass.h | 196 +
 libs/libaom/src/av1/encoder/global_motion.c | 1014 +
 libs/libaom/src/av1/encoder/global_motion.h | 101 +
 libs/libaom/src/av1/encoder/gop_structure.c | 311 +
 libs/libaom/src/av1/encoder/gop_structure.h | 43 +
 libs/libaom/src/av1/encoder/grain_test_vectors.h | 781 ++
 libs/libaom/src/av1/encoder/hash.c | 125 +
 libs/libaom/src/av1/encoder/hash.h | 53 +
 libs/libaom/src/av1/encoder/hash_motion.c | 491 +
 libs/libaom/src/av1/encoder/hash_motion.h | 101 +
 libs/libaom/src/av1/encoder/hybrid_fwd_txfm.c | 308 +
 libs/libaom/src/av1/encoder/hybrid_fwd_txfm.h | 31 +
 libs/libaom/src/av1/encoder/interp_search.c | 753 ++
 libs/libaom/src/av1/encoder/interp_search.h | 85 +
 libs/libaom/src/av1/encoder/intra_mode_search.c | 2132 +++
 libs/libaom/src/av1/encoder/intra_mode_search.h | 63 +
 libs/libaom/src/av1/encoder/k_means_template.h | 123 +
 libs/libaom/src/av1/encoder/level.c | 1184 ++
 libs/libaom/src/av1/encoder/level.h | 211 +
 libs/libaom/src/av1/encoder/lookahead.c | 205 +
 libs/libaom/src/av1/encoder/lookahead.h | 122 +
 libs/libaom/src/av1/encoder/mathutils.h | 359 +
 libs/libaom/src/av1/encoder/mcomp.c | 3391 +++++
 libs/libaom/src/av1/encoder/mcomp.h | 329 +
 libs/libaom/src/av1/encoder/mips/msa/error_msa.c | 109 +
 libs/libaom/src/av1/encoder/mips/msa/fdct4x4_msa.c | 46 +
 .../src/av1/encoder/mips/msa/temporal_filter_msa.c | 286 +
 libs/libaom/src/av1/encoder/misc_model_weights.h | 696 +
 libs/libaom/src/av1/encoder/ml.c | 156 +
 libs/libaom/src/av1/encoder/ml.h | 82 +
 .../src/av1/encoder/mode_prune_model_weights.h | 185 +
 libs/libaom/src/av1/encoder/model_rd.h | 275 +
 libs/libaom/src/av1/encoder/motion_search_facade.c | 861 ++
 libs/libaom/src/av1/encoder/motion_search_facade.h | 76 +
 libs/libaom/src/av1/encoder/mv_prec.c | 430 +
 libs/libaom/src/av1/encoder/mv_prec.h | 48 +
 libs/libaom/src/av1/encoder/nonrd_pickmode.c | 2182 ++++
 libs/libaom/src/av1/encoder/palette.c | 154 +
 libs/libaom/src/av1/encoder/palette.h | 96 +
 .../libaom/src/av1/encoder/partition_cnn_weights.h | 2139 +++
 .../src/av1/encoder/partition_model_weights.h | 5646 ++++++++
 libs/libaom/src/av1/encoder/partition_strategy.c | 1288 ++
 libs/libaom/src/av1/encoder/partition_strategy.h | 222 +
 libs/libaom/src/av1/encoder/pass2_strategy.c | 2895 +++++
 libs/libaom/src/av1/encoder/pass2_strategy.h | 76 +
 libs/libaom/src/av1/encoder/pickcdef.c | 587 +
 libs/libaom/src/av1/encoder/picklpf.c | 285 +
 libs/libaom/src/av1/encoder/picklpf.h | 30 +
 libs/libaom/src/av1/encoder/pickrst.c | 1768 +++
 libs/libaom/src/av1/encoder/pickrst.h | 66 +
 libs/libaom/src/av1/encoder/pustats.h | 198 +
 libs/libaom/src/av1/encoder/random.h | 29 +
 libs/libaom/src/av1/encoder/ransac.c | 820 ++
 libs/libaom/src/av1/encoder/ransac.h | 31 +
 libs/libaom/src/av1/encoder/ratectrl.c | 2117 +++
 libs/libaom/src/av1/encoder/ratectrl.h | 324 +
 libs/libaom/src/av1/encoder/rd.c | 1332 ++
 libs/libaom/src/av1/encoder/rd.h | 370 +
 libs/libaom/src/av1/encoder/rdopt.c | 5505 ++++++++
 libs/libaom/src/av1/encoder/rdopt.h | 244 +
 libs/libaom/src/av1/encoder/rdopt_data_defs.h | 294 +
 libs/libaom/src/av1/encoder/rdopt_utils.h | 652 +
 libs/libaom/src/av1/encoder/reconinter_enc.c | 407 +
 libs/libaom/src/av1/encoder/reconinter_enc.h | 72 +
 libs/libaom/src/av1/encoder/segmentation.c | 251 +
 libs/libaom/src/av1/encoder/segmentation.h | 38 +
 libs/libaom/src/av1/encoder/speed_features.c | 1322 ++
 libs/libaom/src/av1/encoder/speed_features.h | 1034 ++
 libs/libaom/src/av1/encoder/svc_layercontext.c | 288 +
 libs/libaom/src/av1/encoder/svc_layercontext.h | 99 +
 libs/libaom/src/av1/encoder/temporal_filter.c | 1338 ++
 libs/libaom/src/av1/encoder/temporal_filter.h | 87 +
 libs/libaom/src/av1/encoder/tokenize.c | 242 +
 libs/libaom/src/av1/encoder/tokenize.h | 71 +
 libs/libaom/src/av1/encoder/tpl_model.c | 1189 ++
 libs/libaom/src/av1/encoder/tpl_model.h | 47 +
 libs/libaom/src/av1/encoder/tune_vmaf.c | 794 ++
 libs/libaom/src/av1/encoder/tune_vmaf.h | 32 +
 .../src/av1/encoder/tx_prune_model_weights.h | 3320 +++++
 libs/libaom/src/av1/encoder/tx_search.c | 3602 ++++++
 libs/libaom/src/av1/encoder/tx_search.h | 79 +
 .../src/av1/encoder/use_flat_gop_model_params.h | 233 +
 libs/libaom/src/av1/encoder/var_based_part.c | 1006 ++
 libs/libaom/src/av1/encoder/var_based_part.h | 45 +
 libs/libaom/src/av1/encoder/wedge_utils.c | 125 +
 .../src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c | 1417 ++
 .../src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c | 2814 ++++
 .../src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c | 364 +
 .../libaom/src/av1/encoder/x86/av1_fwd_txfm_avx2.h | 96 +
 .../libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.c | 2891 +++++
 .../libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.h | 119 +
 .../src/av1/encoder/x86/av1_highbd_quantize_avx2.c | 137 +
 .../src/av1/encoder/x86/av1_highbd_quantize_sse4.c | 195 +
 .../libaom/src/av1/encoder/x86/av1_quantize_avx2.c | 445 +
 .../libaom/src/av1/encoder/x86/av1_quantize_sse2.c | 189 +
 .../av1/encoder/x86/av1_quantize_ssse3_x86_64.asm | 204 +
 .../src/av1/encoder/x86/av1_ssim_opt_x86_64.asm | 222 +
 libs/libaom/src/av1/encoder/x86/av1_txfm1d_sse4.h | 144 +
 .../libaom/src/av1/encoder/x86/corner_match_avx2.c | 81 +
 .../libaom/src/av1/encoder/x86/corner_match_sse4.c | 105 +
 libs/libaom/src/av1/encoder/x86/dct_sse2.asm | 82 +
 libs/libaom/src/av1/encoder/x86/encodetxb_avx2.c | 122 +
 libs/libaom/src/av1/encoder/x86/encodetxb_sse2.c | 505 +
 libs/libaom/src/av1/encoder/x86/encodetxb_sse4.c | 84 +
 .../libaom/src/av1/encoder/x86/error_intrin_avx2.c | 141 +
 libs/libaom/src/av1/encoder/x86/error_sse2.asm | 88 +
 libs/libaom/src/av1/encoder/x86/hash_sse42.c | 51 +
 .../encoder/x86/highbd_block_error_intrin_avx2.c | 63 +
 .../encoder/x86/highbd_block_error_intrin_sse2.c | 73 +
 .../src/av1/encoder/x86/highbd_fwd_txfm_avx2.c | 3167 +++++
 .../src/av1/encoder/x86/highbd_fwd_txfm_sse4.c | 2604 ++++
 libs/libaom/src/av1/encoder/x86/ml_sse3.c | 244 +
 libs/libaom/src/av1/encoder/x86/pickrst_avx2.c | 1084 ++
 libs/libaom/src/av1/encoder/x86/pickrst_sse4.c | 833 ++
 libs/libaom/src/av1/encoder/x86/rdopt_avx2.c | 256 +
 libs/libaom/src/av1/encoder/x86/rdopt_sse4.c | 275 +
 .../src/av1/encoder/x86/temporal_filter_avx2.c | 284 +
 .../av1/encoder/x86/temporal_filter_constants.h | 407 +
 .../src/av1/encoder/x86/temporal_filter_sse2.c | 262 +
 .../src/av1/encoder/x86/temporal_filter_sse4.c | 2044 +++
 libs/libaom/src/av1/encoder/x86/wedge_utils_avx2.c | 215 +
 libs/libaom/src/av1/encoder/x86/wedge_utils_sse2.c | 254 +
 libs/libaom/src/av1/exports_com | 2 +
 libs/libaom/src/av1/exports_dec | 3 +
 libs/libaom/src/av1/exports_enc | 2 +
 libs/libaom/src/av1/exports_ident | 2 +
 libs/libaom/src/av1/exports_test | 2 +
 libs/libaom/src/build/.gitattributes | 2 +
 libs/libaom/src/build/cmake/aom_config.c.template | 13 +
 .../src/build/cmake/aom_config_defaults.cmake | 193 +
 libs/libaom/src/build/cmake/aom_configure.cmake | 399 +
 .../src/build/cmake/aom_experiment_deps.cmake | 28 +
 libs/libaom/src/build/cmake/aom_install.cmake | 96 +
 libs/libaom/src/build/cmake/aom_optimization.cmake | 240 +
 libs/libaom/src/build/cmake/compiler_flags.cmake | 373 +
 libs/libaom/src/build/cmake/compiler_tests.cmake | 179 +
 libs/libaom/src/build/cmake/cpu.cmake | 82 +
 libs/libaom/src/build/cmake/dist.cmake | 64 +
 libs/libaom/src/build/cmake/exports.cmake | 74 +
 libs/libaom/src/build/cmake/exports_sources.cmake | 35 +
 .../cmake/generate_aom_config_templates.cmake | 92 +
 libs/libaom/src/build/cmake/generate_exports.cmake | 66 +
 libs/libaom/src/build/cmake/ios-Info.plist | 37 +
 libs/libaom/src/build/cmake/iosbuild.sh | 384 +
 libs/libaom/src/build/cmake/msvc_runtime.cmake | 37 +
 libs/libaom/src/build/cmake/pkg_config.cmake | 62 +
 libs/libaom/src/build/cmake/rtcd.pl | 467 +
 libs/libaom/src/build/cmake/sanitizers.cmake | 46 +
 .../build/cmake/toolchains/arm-ios-common.cmake | 26 +
 .../cmake/toolchains/arm64-android-clang.cmake | 48 +
 .../src/build/cmake/toolchains/arm64-ios.cmake | 23 +
 .../build/cmake/toolchains/arm64-linux-gcc.cmake | 36 +
 .../build/cmake/toolchains/arm64-mingw-gcc.cmake | 29 +
 .../src/build/cmake/toolchains/armv7-ios.cmake | 31 +
 .../build/cmake/toolchains/armv7-linux-gcc.cmake | 40 +
 .../build/cmake/toolchains/armv7-mingw-gcc.cmake | 29 +
 .../src/build/cmake/toolchains/armv7s-ios.cmake | 31 +
 .../cmake/toolchains/ios-simulator-common.cmake | 23 +
 .../build/cmake/toolchains/mips32-linux-gcc.cmake | 77 +
 .../build/cmake/toolchains/mips64-linux-gcc.cmake | 54 +
 .../src/build/cmake/toolchains/ppc-linux-gcc.cmake | 29 +
 .../build/cmake/toolchains/x86-ios-simulator.cmake | 28 +
 .../src/build/cmake/toolchains/x86-linux.cmake | 19 +
 .../src/build/cmake/toolchains/x86-macos.cmake | 18 +
 .../src/build/cmake/toolchains/x86-mingw-gcc.cmake | 28 +
 .../cmake/toolchains/x86_64-ios-simulator.cmake | 25 +
 .../build/cmake/toolchains/x86_64-mingw-gcc.cmake | 26 +
 libs/libaom/src/build/cmake/util.cmake | 172 +
 libs/libaom/src/build/cmake/version.cmake | 65 +
 libs/libaom/src/build/cmake/version.pl | 112 +
 libs/libaom/src/codereview.settings | 4 +
 libs/libaom/src/common/args.c | 343 +
 libs/libaom/src/common/args.h | 71 +
 libs/libaom/src/common/av1_config.c | 511 +
 libs/libaom/src/common/av1_config.h | 86 +
 libs/libaom/src/common/ivfdec.c | 110 +
 libs/libaom/src/common/ivfdec.h | 29 +
 libs/libaom/src/common/ivfenc.c | 52 +
 libs/libaom/src/common/ivfenc.h | 34 +
 libs/libaom/src/common/md5_utils.c | 249 +
 libs/libaom/src/common/md5_utils.h | 49 +
 libs/libaom/src/common/obudec.c | 486 +
 libs/libaom/src/common/obudec.h | 48 +
 libs/libaom/src/common/rawenc.c | 96 +
 libs/libaom/src/common/rawenc.h | 32 +
 libs/libaom/src/common/tools_common.c | 508 +
 libs/libaom/src/common/tools_common.h | 264 +
 libs/libaom/src/common/video_common.h | 25 +
 libs/libaom/src/common/video_reader.c | 127 +
 libs/libaom/src/common/video_reader.h | 60 +
 libs/libaom/src/common/video_writer.c | 83 +
 libs/libaom/src/common/video_writer.h | 47 +
 libs/libaom/src/common/warnings.c | 97 +
 libs/libaom/src/common/warnings.h | 34 +
 libs/libaom/src/common/webmdec.cc | 248 +
 libs/libaom/src/common/webmdec.h | 71 +
 libs/libaom/src/common/webmenc.cc | 173 +
 libs/libaom/src/common/webmenc.h | 60 +
 libs/libaom/src/common/y4menc.c | 107 +
 libs/libaom/src/common/y4menc.h | 39 +
 libs/libaom/src/common/y4minput.c | 1153 ++
 libs/libaom/src/common/y4minput.h | 75 +
 libs/libaom/src/docs.cmake | 257 +
 libs/libaom/src/examples/analyzer.cc | 723 ++
 libs/libaom/src/examples/aom_cx_set_ref.c | 383 +
 libs/libaom/src/examples/av1_dec_fuzzer.cc | 67 +
 libs/libaom/src/examples/build_av1_dec_fuzzer.sh | 70 +
 libs/libaom/src/examples/decode_to_md5.c | 131 +
 libs/libaom/src/examples/decode_with_drops.c | 146 +
 libs/libaom/src/examples/encoder_util.c | 136 +
 libs/libaom/src/examples/encoder_util.h | 33 +
 libs/libaom/src/examples/inspect.c | 958 ++
 .../src/examples/lightfield_bitstream_parsing.c | 414 +
 libs/libaom/src/examples/lightfield_decoder.c | 364 +
 libs/libaom/src/examples/lightfield_encoder.c | 522 +
 .../src/examples/lightfield_tile_list_decoder.c | 227 +
 libs/libaom/src/examples/lossless_encoder.c | 138 +
 libs/libaom/src/examples/noise_model.c | 432 +
 libs/libaom/src/examples/resize_util.c | 125 +
 libs/libaom/src/examples/scalable_decoder.c | 185 +
 libs/libaom/src/examples/scalable_encoder.c | 289 +
 libs/libaom/src/examples/set_maps.c | 208 +
 libs/libaom/src/examples/simple_decoder.c | 146 +
 libs/libaom/src/examples/simple_encoder.c | 249 +
 libs/libaom/src/examples/svc_encoder_rtc.c | 907 ++
 libs/libaom/src/examples/twopass_encoder.c | 250 +
 libs/libaom/src/keywords.dox | 51 +
 libs/libaom/src/libs.doxy_template | 1260 ++
 libs/libaom/src/mainpage.dox | 52 +
 libs/libaom/src/stats/aomstats.c | 106 +
 libs/libaom/src/stats/aomstats.h | 44 +
 libs/libaom/src/stats/rate_hist.c | 271 +
 libs/libaom/src/stats/rate_hist.h | 41 +
 libs/libaom/src/test/accounting_test.cc | 75 +
 libs/libaom/src/test/acm_random.h | 85 +
 libs/libaom/src/test/active_map_test.cc | 103 +
 libs/libaom/src/test/altref_test.cc | 98 +
 libs/libaom/src/test/aom_integer_test.cc | 177 +
 libs/libaom/src/test/aomcx_set_ref.sh | 58 +
 libs/libaom/src/test/aomdec.sh | 147 +
 libs/libaom/src/test/aomenc.sh | 269 +
 libs/libaom/src/test/aq_segment_test.cc | 97 +
 libs/libaom/src/test/arf_freq_test.cc | 225 +
 libs/libaom/src/test/av1_common_int_test.cc | 22 +
 libs/libaom/src/test/av1_config_test.cc | 164 +
 libs/libaom/src/test/av1_convolve_2d_test.cc | 261 +
 libs/libaom/src/test/av1_convolve_2d_test_util.cc | 708 +
 libs/libaom/src/test/av1_convolve_2d_test_util.h | 120 +
 libs/libaom/src/test/av1_convolve_scale_test.cc | 532 +
 .../src/test/av1_encoder_parms_get_to_decoder.cc | 160 +
 libs/libaom/src/test/av1_ext_tile_test.cc | 215 +
 libs/libaom/src/test/av1_fwd_txfm1d_test.cc | 105 +
 libs/libaom/src/test/av1_fwd_txfm2d_test.cc | 583 +
 libs/libaom/src/test/av1_highbd_iht_test.cc | 362 +
 .../src/test/av1_horz_only_frame_superres_test.cc | 365 +
 libs/libaom/src/test/av1_inv_txfm1d_test.cc | 157 +
 libs/libaom/src/test/av1_inv_txfm2d_test.cc | 422 +
 libs/libaom/src/test/av1_nn_predict_test.cc | 217 +
 libs/libaom/src/test/av1_quantize_test.cc | 239 +
 libs/libaom/src/test/av1_round_shift_array_test.cc | 130 +
 libs/libaom/src/test/av1_txfm_test.cc | 371 +
 libs/libaom/src/test/av1_txfm_test.h | 135 +
 libs/libaom/src/test/av1_wedge_utils_test.cc | 391 +
 libs/libaom/src/test/avg_test.cc | 291 +
 libs/libaom/src/test/best_encode.sh | 103 +
 libs/libaom/src/test/binary_codes_test.cc | 83 +
 libs/libaom/src/test/blend_a64_mask_1d_test.cc | 340 +
 libs/libaom/src/test/blend_a64_mask_test.cc | 620 +
 libs/libaom/src/test/blockd_test.cc | 122 +
 libs/libaom/src/test/boolcoder_test.cc | 173 +
 libs/libaom/src/test/borders_test.cc | 85 +
 libs/libaom/src/test/cdef_test.cc | 426 +
 libs/libaom/src/test/cfl_test.cc | 585 +
 libs/libaom/src/test/clear_system_state.h | 31 +
 libs/libaom/src/test/cnn_test.cc | 2496 ++++
 libs/libaom/src/test/codec_factory.h | 173 +
 libs/libaom/src/test/coding_path_sync.cc | 206 +
 libs/libaom/src/test/comp_avg_pred_test.cc | 80 +
 libs/libaom/src/test/comp_avg_pred_test.h | 569 +
 libs/libaom/src/test/comp_mask_variance_test.cc | 577 +
 libs/libaom/src/test/convolve_round_test.cc | 184 +
 libs/libaom/src/test/convolve_test.cc | 885 ++
 libs/libaom/src/test/corner_match_test.cc | 144 +
 libs/libaom/src/test/cpu_speed_test.cc | 180 +
 libs/libaom/src/test/datarate_test.cc | 373 +
 libs/libaom/src/test/datarate_test.h | 157 +
 libs/libaom/src/test/decode_api_test.cc | 55 +
 libs/libaom/src/test/decode_multithreaded_test.cc | 185 +
 libs/libaom/src/test/decode_perf_test.cc | 247 +
 libs/libaom/src/test/decode_test_driver.cc | 114 +
 libs/libaom/src/test/decode_test_driver.h | 165 +
 libs/libaom/src/test/decode_to_md5.sh | 77 +
 libs/libaom/src/test/decode_with_drops.sh | 68 +
 libs/libaom/src/test/divu_small_test.cc | 41 +
 libs/libaom/src/test/dr_prediction_test.cc | 474 +
 libs/libaom/src/test/dump_obu.sh | 70 +
 libs/libaom/src/test/ec_test.cc | 160 +
 libs/libaom/src/test/edge_detect_test.cc | 409 +
 libs/libaom/src/test/encode_api_test.cc | 73 +
 libs/libaom/src/test/encode_perf_test.cc | 184 +
 libs/libaom/src/test/encode_test_driver.cc | 297 +
 libs/libaom/src/test/encode_test_driver.h | 265 +
 libs/libaom/src/test/encodetxb_test.cc | 263 +
 libs/libaom/src/test/end_to_end_test.cc | 211 +
 libs/libaom/src/test/error_block_test.cc | 289 +
 libs/libaom/src/test/error_resilience_test.cc | 459 +
 libs/libaom/src/test/ethread_test.cc | 275 +
 libs/libaom/src/test/examples.sh | 29 +
 libs/libaom/src/test/external_frame_buffer_test.cc | 540 +
 libs/libaom/src/test/fdct4x4_test.cc | 124 +
 libs/libaom/src/test/fft_test.cc | 257 +
 libs/libaom/src/test/film_grain_table_test.cc | 250 +
 libs/libaom/src/test/filterintra_test.cc | 136 +
 libs/libaom/src/test/frame_error_test.cc | 164 +
 libs/libaom/src/test/frame_size_tests.cc | 78 +
 libs/libaom/src/test/function_equivalence_test.h | 71 +
 libs/libaom/src/test/fwd_kf_test.cc | 116 +
 libs/libaom/src/test/fwht4x4_test.cc | 100 +
 libs/libaom/src/test/gf_pyr_height_test.cc | 156 +
 libs/libaom/src/test/gviz_api.py | 1087 ++
 libs/libaom/src/test/hadamard_test.cc | 261 +
 libs/libaom/src/test/hash_test.cc | 134 +
 libs/libaom/src/test/hbd_metrics_test.cc | 240 +
 libs/libaom/src/test/hiprec_convolve_test.cc | 68 +
 libs/libaom/src/test/hiprec_convolve_test_util.cc | 350 +
 libs/libaom/src/test/hiprec_convolve_test_util.h | 95 +
 libs/libaom/src/test/horver_correlation_test.cc | 148 +
 libs/libaom/src/test/horz_superres_test.cc | 406 +
 libs/libaom/src/test/i420_video_source.h | 34 +
 libs/libaom/src/test/intra_edge_test.cc | 337 +
 libs/libaom/src/test/intrabc_test.cc | 170 +
 libs/libaom/src/test/intrapred_test.cc | 273 +
 libs/libaom/src/test/invalid_file_test.cc | 159 +
 libs/libaom/src/test/ivf_video_source.h | 114 +
 libs/libaom/src/test/level_test.cc | 157 +
 libs/libaom/src/test/lightfield_test.sh | 115 +
 libs/libaom/src/test/log2_test.cc | 50 +
 libs/libaom/src/test/lossless_test.cc | 126 +
 libs/libaom/src/test/lpf_test.cc | 645 +
 libs/libaom/src/test/masked_sad_test.cc | 495 +
 libs/libaom/src/test/masked_variance_test.cc | 514 +
 libs/libaom/src/test/md5_helper.h | 76 +
 libs/libaom/src/test/metadata_test.cc | 337 +
 libs/libaom/src/test/metrics_template.html | 422 +
 libs/libaom/src/test/monochrome_test.cc | 130 +
 libs/libaom/src/test/motion_vector_test.cc | 107 +
 libs/libaom/src/test/noise_model_test.cc | 1343 ++
 libs/libaom/src/test/obmc_sad_test.cc | 268 +
 libs/libaom/src/test/obmc_variance_test.cc | 397 +
 libs/libaom/src/test/pickrst_test.cc | 534 +
 libs/libaom/src/test/qm_test.cc | 81 +
 libs/libaom/src/test/quantize_func_test.cc | 547 +
 libs/libaom/src/test/reconinter_test.cc | 259 +
 libs/libaom/src/test/register_state_check.h | 148 +
 libs/libaom/src/test/resize_test.cc | 644 +
 libs/libaom/src/test/rt_end_to_end_test.cc | 174 +
 libs/libaom/src/test/run_encodes.sh | 39 +
 libs/libaom/src/test/sad_test.cc | 1981 +++
 libs/libaom/src/test/sb_multipass_test.cc | 153 +
 libs/libaom/src/test/scalability_test.cc | 81 +
 libs/libaom/src/test/scan_test.cc | 133 +
 libs/libaom/src/test/segment_binarization_sync.cc | 61 +
 libs/libaom/src/test/selfguided_filter_test.cc | 420 +
 libs/libaom/src/test/set_maps.sh | 52 +
 libs/libaom/src/test/simd_avx2_test.cc | 15 +
 libs/libaom/src/test/simd_cmp_avx2.cc | 15 +
 libs/libaom/src/test/simd_cmp_impl.h | 2171 ++++
 libs/libaom/src/test/simd_cmp_neon.cc | 17 +
 libs/libaom/src/test/simd_cmp_sse2.cc | 18 +
 libs/libaom/src/test/simd_cmp_sse4.cc | 18 +
 libs/libaom/src/test/simd_cmp_ssse3.cc | 18 +
 libs/libaom/src/test/simd_impl.h | 1143 ++
 libs/libaom/src/test/simd_neon_test.cc | 17 +
 libs/libaom/src/test/simd_sse2_test.cc | 18 +
 libs/libaom/src/test/simd_sse4_test.cc | 18 +
 libs/libaom/src/test/simd_ssse3_test.cc | 18 +
 libs/libaom/src/test/simple_decoder.sh | 58 +
 libs/libaom/src/test/simple_encoder.sh | 53 +
 libs/libaom/src/test/subtract_test.cc | 252 +
 libs/libaom/src/test/sum_squares_test.cc | 839 ++
 libs/libaom/src/test/superframe_test.cc | 110 +
 libs/libaom/src/test/svc_datarate_test.cc | 609 +
 .../src/test/temporal_filter_planewise_test.cc | 242 +
 libs/libaom/src/test/temporal_filter_yuv_test.cc | 841 ++
 libs/libaom/src/test/test-data.sha1 | 559 +
 libs/libaom/src/test/test.cmake | 471 +
 .../src/test/test_data_download_worker.cmake | 46 +
 libs/libaom/src/test/test_data_util.cmake | 650 +
 libs/libaom/src/test/test_intra_pred_speed.cc | 1467 +++
 libs/libaom/src/test/test_libaom.cc | 74 +
 libs/libaom/src/test/test_runner.cmake | 28 +
 libs/libaom/src/test/test_vector_test.cc | 174 +
 libs/libaom/src/test/test_vectors.cc | 263 +
 libs/libaom/src/test/test_vectors.h | 26 +
 libs/libaom/src/test/tile_independence_test.cc | 173 +
 libs/libaom/src/test/time_stamp_test.cc | 105 +
 libs/libaom/src/test/tools_common.sh | 477 +
 libs/libaom/src/test/transform_test_base.h | 345 +
 libs/libaom/src/test/twopass_encoder.sh | 54 +
 libs/libaom/src/test/util.h | 53 +
 libs/libaom/src/test/variance_test.cc | 2410 ++++
 libs/libaom/src/test/video_source.h | 259 +
 libs/libaom/src/test/visual_metrics.py | 466 +
 libs/libaom/src/test/warp_filter_test.cc | 67 +
 libs/libaom/src/test/warp_filter_test_util.cc | 483 +
 libs/libaom/src/test/warp_filter_test_util.h | 107 +
 libs/libaom/src/test/webm_video_source.h | 96 +
 libs/libaom/src/test/wiener_test.cc | 587 +
 libs/libaom/src/test/y4m_test.cc | 180 +
 libs/libaom/src/test/y4m_video_source.h | 125 +
 libs/libaom/src/test/yuv_video_source.h | 123 +
 libs/libaom/src/third_party/fastfeat/LICENSE | 30 +
 libs/libaom/src/third_party/fastfeat/README.libaom | 40 +
 libs/libaom/src/third_party/fastfeat/fast.c | 22 +
 libs/libaom/src/third_party/fastfeat/fast.h | 20 +
 libs/libaom/src/third_party/fastfeat/fast_9.c | 5911 +++++++++
 libs/libaom/src/third_party/fastfeat/nonmax.c | 121 +
 .../src/third_party/googletest/README.libaom | 17 +
 .../third_party/googletest/src/googletest/CHANGES | 157 +
 .../googletest/src/googletest/CMakeLists.txt | 331 +
 .../third_party/googletest/src/googletest/CONTRIBUTORS | 37 +
 .../third_party/googletest/src/googletest/LICENSE | 28 +
 .../googletest/src/googletest/README.md | 341 +
 .../src/googletest/cmake/Config.cmake.in | 9 +
 .../googletest/src/googletest/cmake/gtest.pc.in | 9 +
 .../src/googletest/cmake/gtest_main.pc.in | 10 +
 .../src/googletest/cmake/internal_utils.cmake | 318 +
 .../googletest/include/gtest/gtest-death-test.h | 342 +
 .../src/googletest/include/gtest/gtest-matchers.h | 769 ++
 .../src/googletest/include/gtest/gtest-message.h | 217 +
 .../googletest/include/gtest/gtest-param-test.h | 507 +
 .../src/googletest/include/gtest/gtest-printers.h | 925 ++
 .../src/googletest/include/gtest/gtest-spi.h | 245 +
 .../src/googletest/include/gtest/gtest-test-part.h | 183 +
 .../googletest/include/gtest/gtest-typed-test.h | 337 +
 .../src/googletest/include/gtest/gtest.h | 2454 ++++
 .../src/googletest/include/gtest/gtest_pred_impl.h | 277 +
 .../src/googletest/include/gtest/gtest_prod.h | 61 +
 .../include/gtest/internal/custom/README.md | 56 +
 .../include/gtest/internal/custom/gtest-port.h | 37 +
 .../include/gtest/internal/custom/gtest-printers.h | 42 +
 .../include/gtest/internal/custom/gtest.h | 37 +
 .../gtest/internal/gtest-death-test-internal.h | 301 +
 .../include/gtest/internal/gtest-filepath.h | 208 +
 .../include/gtest/internal/gtest-internal.h | 1441 +++
 .../include/gtest/internal/gtest-param-util.h | 922 ++
 .../include/gtest/internal/gtest-port-arch.h | 111 +
 .../googletest/include/gtest/internal/gtest-port.h | 2232 ++++
 .../include/gtest/internal/gtest-string.h | 171 +
 .../include/gtest/internal/gtest-type-util.h | 184 +
 .../googletest/src/googletest/src/gtest-all.cc | 48 +
 .../src/googletest/src/gtest-death-test.cc | 1614 +++
 .../src/googletest/src/gtest-filepath.cc | 377 +
 .../src/googletest/src/gtest-internal-inl.h | 1213 ++
 .../src/googletest/src/gtest-matchers.cc | 97 +
 .../googletest/src/googletest/src/gtest-port.cc | 1361 ++
 .../src/googletest/src/gtest-printers.cc | 400 +
 .../src/googletest/src/gtest-test-part.cc | 107 +
 .../src/googletest/src/gtest-typed-test.cc | 117 +
 .../googletest/src/googletest/src/gtest.cc | 6240 +++++++++
 .../googletest/src/googletest/src/gtest_main.cc | 52 +
 libs/libaom/src/third_party/libwebm/AUTHORS.TXT | 4 +
 libs/libaom/src/third_party/libwebm/Android.mk | 17 +
 libs/libaom/src/third_party/libwebm/LICENSE.TXT | 30 +
 libs/libaom/src/third_party/libwebm/PATENTS.TXT | 23 +
 libs/libaom/src/third_party/libwebm/README.libaom | 20 +
 .../src/third_party/libwebm/common/file_util.cc | 93 +
 .../src/third_party/libwebm/common/file_util.h | 44 +
 .../src/third_party/libwebm/common/hdr_util.cc | 220 +
 .../src/third_party/libwebm/common/hdr_util.h | 71 +
 .../src/third_party/libwebm/common/webmids.h | 193 +
 .../src/third_party/libwebm/mkvmuxer/mkvmuxer.cc | 4221 ++++++
 .../src/third_party/libwebm/mkvmuxer/mkvmuxer.h | 1924 +++
 .../third_party/libwebm/mkvmuxer/mkvmuxertypes.h | 28 +
 .../third_party/libwebm/mkvmuxer/mkvmuxerutil.cc | 743 ++
 .../third_party/libwebm/mkvmuxer/mkvmuxerutil.h | 115 +
 .../src/third_party/libwebm/mkvmuxer/mkvwriter.cc | 92 +
 .../src/third_party/libwebm/mkvmuxer/mkvwriter.h | 51 +
 .../src/third_party/libwebm/mkvparser/mkvparser.cc | 8071 ++++++++++++
 .../src/third_party/libwebm/mkvparser/mkvparser.h | 1147 ++
 .../src/third_party/libwebm/mkvparser/mkvreader.cc | 135 +
 .../src/third_party/libwebm/mkvparser/mkvreader.h | 45 +
 libs/libaom/src/third_party/libyuv/README.libaom | 15 +
 .../third_party/libyuv/include/libyuv/basic_types.h | 119 +
 .../src/third_party/libyuv/include/libyuv/compare.h | 79 +
 .../src/third_party/libyuv/include/libyuv/convert.h | 246 +
 .../libyuv/include/libyuv/convert_argb.h | 232 +
 .../libyuv/include/libyuv/convert_from.h | 182 +
 .../libyuv/include/libyuv/convert_from_argb.h | 191 +
 .../src/third_party/libyuv/include/libyuv/cpu_id.h | 82 +
 .../libyuv/include/libyuv/mjpeg_decoder.h | 193 +
 .../libyuv/include/libyuv/planar_functions.h | 454 +
 .../src/third_party/libyuv/include/libyuv/rotate.h | 118 +
 .../libyuv/include/libyuv/rotate_argb.h | 34 +
 .../third_party/libyuv/include/libyuv/rotate_row.h | 139 +
 .../src/third_party/libyuv/include/libyuv/row.h | 1857 +++
 .../src/third_party/libyuv/include/libyuv/scale.h | 104 +
 .../third_party/libyuv/include/libyuv/scale_argb.h | 58 +
 .../third_party/libyuv/include/libyuv/scale_row.h | 479 +
 .../third_party/libyuv/include/libyuv/version.h | 17 +
 .../libyuv/include/libyuv/video_common.h | 183 +
 .../src/third_party/libyuv/source/compare.cc | 373 +
 .../third_party/libyuv/source/compare_common.cc | 42 +
 .../src/third_party/libyuv/source/compare_gcc.cc | 152 +
 .../src/third_party/libyuv/source/compare_neon.cc | 65 +
 .../third_party/libyuv/source/compare_neon64.cc | 63 +
 .../src/third_party/libyuv/source/compare_win.cc | 229 +
 .../src/third_party/libyuv/source/convert.cc | 1389 ++
 .../src/third_party/libyuv/source/convert_argb.cc | 1155 ++
 .../src/third_party/libyuv/source/convert_from.cc | 1348 ++
 .../third_party/libyuv/source/convert_from_argb.cc | 1301 ++
 .../src/third_party/libyuv/source/convert_jpeg.cc | 392 +
 .../third_party/libyuv/source/convert_to_argb.cc | 306 +
 .../third_party/libyuv/source/convert_to_i420.cc | 339 +
 .../libaom/src/third_party/libyuv/source/cpu_id.cc | 307 +
 .../src/third_party/libyuv/source/mjpeg_decoder.cc | 572 +
 .../third_party/libyuv/source/mjpeg_validate.cc | 101 +
 .../third_party/libyuv/source/planar_functions.cc | 2555 ++++
 .../libaom/src/third_party/libyuv/source/rotate.cc | 496 +
 .../src/third_party/libyuv/source/rotate_any.cc | 55 +
 .../src/third_party/libyuv/source/rotate_argb.cc | 205 +
 .../src/third_party/libyuv/source/rotate_common.cc | 92 +
 .../src/third_party/libyuv/source/rotate_gcc.cc | 493 +
 .../src/third_party/libyuv/source/rotate_mips.cc | 484 +
 .../src/third_party/libyuv/source/rotate_neon.cc | 535 +
 .../src/third_party/libyuv/source/rotate_neon64.cc | 543 +
 .../src/third_party/libyuv/source/rotate_win.cc | 248 +
 .../src/third_party/libyuv/source/row_any.cc | 680 +
 .../src/third_party/libyuv/source/row_common.cc | 2576 ++++
 .../src/third_party/libyuv/source/row_gcc.cc | 5475 ++++++++
 .../src/third_party/libyuv/source/row_mips.cc | 911 ++
 .../src/third_party/libyuv/source/row_neon.cc | 3084 +++++
 .../src/third_party/libyuv/source/row_neon64.cc | 3087 +++++
 .../src/third_party/libyuv/source/row_win.cc | 6331 +++++++++
 .../src/third_party/libyuv/source/row_x86.asm | 146 +
 libs/libaom/src/third_party/libyuv/source/scale.cc | 1689 +++
 .../src/third_party/libyuv/source/scale_any.cc | 200 +
 .../src/third_party/libyuv/source/scale_argb.cc | 853 ++
 .../src/third_party/libyuv/source/scale_common.cc | 1137 ++
 .../src/third_party/libyuv/source/scale_gcc.cc | 1089 ++
 .../src/third_party/libyuv/source/scale_mips.cc | 654 +
 .../src/third_party/libyuv/source/scale_neon.cc | 1037 ++
 .../src/third_party/libyuv/source/scale_neon64.cc | 1042 ++
 .../src/third_party/libyuv/source/scale_win.cc | 1354 ++
 .../src/third_party/libyuv/source/video_common.cc | 64 +
 .../src/third_party/libyuv/source/x86inc.asm | 1136 ++
 libs/libaom/src/third_party/vector/LICENSE | 19 +
 libs/libaom/src/third_party/vector/README.libaom | 16 +
 libs/libaom/src/third_party/vector/vector.c | 540 +
 libs/libaom/src/third_party/vector/vector.h | 138 +
 libs/libaom/src/third_party/x86inc/LICENSE | 18 +
 libs/libaom/src/third_party/x86inc/README.libaom | 20 +
 libs/libaom/src/third_party/x86inc/x86inc.asm | 1649 +++
 libs/libaom/src/tools/aggregate_entropy_stats.py | 39 +
 libs/libaom/src/tools/aom_entropy_optimizer.c | 761 ++
 libs/libaom/src/tools/cpplint.py | 4756 +++++++
 libs/libaom/src/tools/diff.py | 132 +
 libs/libaom/src/tools/dump_obu.cc | 164 +
 libs/libaom/src/tools/gen_authors.sh | 10 +
 libs/libaom/src/tools/gen_constrained_tokenset.py | 120 +
 libs/libaom/src/tools/inspect-cli.js | 39 +
 libs/libaom/src/tools/inspect-post.js | 1 +
 libs/libaom/src/tools/intersect-diffs.py | 78 +
 libs/libaom/src/tools/lint-hunks.py | 146 +
 libs/libaom/src/tools/obu_parser.cc | 190 +
 libs/libaom/src/tools/obu_parser.h | 27 +
 .../src/tools/txfm_analyzer/txfm_gen_code.cc | 580 +
 libs/libaom/src/tools/txfm_analyzer/txfm_graph.cc | 943 ++
 libs/libaom/src/tools/txfm_analyzer/txfm_graph.h | 160 +
 libs/libaom/src/tools/wrap-commit-msg.py | 72 +
 libs/libaom/src/usage.dox | 109 +
 libs/libaom/src/usage_cx.dox | 9 +
 libs/libaom/src/usage_dx.dox | 22 +
 1093 files changed, 527771 insertions(+)
 create mode 100644 libs/libaom/src/.clang-format
 create mode 100644 libs/libaom/src/.cmake-format.py
 create mode 100644 libs/libaom/src/.gitattributes
 create mode 100644 libs/libaom/src/.mailmap
 create mode 100644 libs/libaom/src/AUTHORS
 create mode 100644 libs/libaom/src/CHANGELOG
 create mode 100644 libs/libaom/src/CMakeLists.txt
 create mode 100644 libs/libaom/src/LICENSE
 create mode 100644 libs/libaom/src/PATENTS
 create mode 100644 libs/libaom/src/README.md
 create mode 100644 libs/libaom/src/Sample.cfg
 create mode 100644 libs/libaom/src/aom/aom.h
 create mode 100644 libs/libaom/src/aom/aom_codec.h
 create mode 100644 libs/libaom/src/aom/aom_decoder.h
 create mode 100644 libs/libaom/src/aom/aom_encoder.h
 create mode 100644 libs/libaom/src/aom/aom_frame_buffer.h
 create mode 100644 libs/libaom/src/aom/aom_image.h
 create mode 100644 libs/libaom/src/aom/aom_integer.h
 create mode 100644 libs/libaom/src/aom/aomcx.h
 create mode 100644 libs/libaom/src/aom/aomdx.h
 create mode 100644 libs/libaom/src/aom/exports_com
 create mode 100644 libs/libaom/src/aom/exports_dec
 create mode 100644 libs/libaom/src/aom/exports_enc
 create mode 100644 libs/libaom/src/aom/exports_test
 create mode 100644 libs/libaom/src/aom/internal/aom_codec_internal.h
 create mode 100644 libs/libaom/src/aom/internal/aom_image_internal.h
 create mode 100644 libs/libaom/src/aom/src/aom_codec.c
 create mode 100644 libs/libaom/src/aom/src/aom_decoder.c
 create mode 100644 libs/libaom/src/aom/src/aom_encoder.c
 create mode 100644 libs/libaom/src/aom/src/aom_image.c
 create mode 100644 libs/libaom/src/aom/src/aom_integer.c
 create mode 100644 libs/libaom/src/aom_dsp/aom_convolve.c
 create mode 100644 libs/libaom/src/aom_dsp/aom_dsp.cmake
 create mode 100644 libs/libaom/src/aom_dsp/aom_dsp_common.h
 create mode 100644 libs/libaom/src/aom_dsp/aom_dsp_rtcd.c
 create mode 100644 libs/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl
 create mode 100644 libs/libaom/src/aom_dsp/aom_filter.h
 create mode 100644 libs/libaom/src/aom_dsp/aom_simd.h
 create mode 100644 libs/libaom/src/aom_dsp/aom_simd_inline.h
 create mode 100644 libs/libaom/src/aom_dsp/arm/avg_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/fwd_txfm_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/hadamard_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/intrapred_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/loopfilter_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/sad4d_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/sad_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/sse_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/subpel_variance_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/subtract_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/arm/sum_neon.h
 create mode 100644 libs/libaom/src/aom_dsp/arm/variance_neon.c
 create mode 100644 libs/libaom/src/aom_dsp/avg.c
 create mode 100644 libs/libaom/src/aom_dsp/binary_codes_reader.c
 create mode 100644 libs/libaom/src/aom_dsp/binary_codes_reader.h
 create mode 100644 libs/libaom/src/aom_dsp/binary_codes_writer.c
 create mode 100644 libs/libaom/src/aom_dsp/binary_codes_writer.h
 create mode 100644 libs/libaom/src/aom_dsp/bitreader.c
 create mode 100644 libs/libaom/src/aom_dsp/bitreader.h
 create mode 100644 libs/libaom/src/aom_dsp/bitreader_buffer.c
 create mode 100644 libs/libaom/src/aom_dsp/bitreader_buffer.h
 create mode 100644 libs/libaom/src/aom_dsp/bitwriter.c
 create mode 100644 libs/libaom/src/aom_dsp/bitwriter.h
 create mode 100644 libs/libaom/src/aom_dsp/bitwriter_buffer.c
 create mode 100644 libs/libaom/src/aom_dsp/bitwriter_buffer.h
 create mode 100644 libs/libaom/src/aom_dsp/blend.h
 create mode 100644 libs/libaom/src/aom_dsp/blend_a64_hmask.c
 create mode 100644 libs/libaom/src/aom_dsp/blend_a64_mask.c
 create mode 100644 libs/libaom/src/aom_dsp/blend_a64_vmask.c
 create mode 100644 libs/libaom/src/aom_dsp/blk_sse_sum.c
 create mode 100644 libs/libaom/src/aom_dsp/entcode.c
 create mode 100644 libs/libaom/src/aom_dsp/entcode.h
 create mode 100644 libs/libaom/src/aom_dsp/entdec.c
 create mode 100644 libs/libaom/src/aom_dsp/entdec.h
 create mode 100644 libs/libaom/src/aom_dsp/entenc.c
 create mode 100644 libs/libaom/src/aom_dsp/entenc.h
 create mode 100644 libs/libaom/src/aom_dsp/fastssim.c
 create mode 100644 libs/libaom/src/aom_dsp/fft.c
 create mode 100644 libs/libaom/src/aom_dsp/fft_common.h
 create mode 100644 libs/libaom/src/aom_dsp/fwd_txfm.c
 create mode 100644 libs/libaom/src/aom_dsp/grain_synthesis.c
 create mode 100644 libs/libaom/src/aom_dsp/grain_synthesis.h
libs/libaom/src/aom_dsp/grain_table.c create mode 100644 libs/libaom/src/aom_dsp/grain_table.h create mode 100644 libs/libaom/src/aom_dsp/intrapred.c create mode 100644 libs/libaom/src/aom_dsp/intrapred_common.h create mode 100644 libs/libaom/src/aom_dsp/loopfilter.c create mode 100644 libs/libaom/src/aom_dsp/mips/aom_convolve8_horiz_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/aom_convolve8_vert_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/aom_convolve_copy_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/aom_convolve_msa.h create mode 100644 libs/libaom/src/aom_dsp/mips/common_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/common_dspr2.h create mode 100644 libs/libaom/src/aom_dsp/mips/convolve2_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/convolve2_horiz_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/convolve2_vert_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/convolve8_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/convolve8_horiz_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/convolve8_vert_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/convolve_common_dspr2.h create mode 100644 libs/libaom/src/aom_dsp/mips/intrapred16_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/intrapred4_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/intrapred8_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/intrapred_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_16_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_4_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_8_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.h create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_macros_dspr2.h create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_masks_dspr2.h create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_mb_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_mb_vert_dspr2.c create mode 100644 libs/libaom/src/aom_dsp/mips/loopfilter_msa.h create mode 100644 libs/libaom/src/aom_dsp/mips/macros_msa.h create mode 100644 libs/libaom/src/aom_dsp/mips/sad_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/sub_pixel_variance_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/subtract_msa.c create mode 100644 libs/libaom/src/aom_dsp/mips/variance_msa.c create mode 100644 libs/libaom/src/aom_dsp/noise_model.c create mode 100644 libs/libaom/src/aom_dsp/noise_model.h create mode 100644 libs/libaom/src/aom_dsp/noise_util.c create mode 100644 libs/libaom/src/aom_dsp/noise_util.h create mode 100644 libs/libaom/src/aom_dsp/prob.h create mode 100644 libs/libaom/src/aom_dsp/psnr.c create mode 100644 libs/libaom/src/aom_dsp/psnr.h create mode 100644 libs/libaom/src/aom_dsp/psnrhvs.c create mode 100644 libs/libaom/src/aom_dsp/quantize.c create mode 100644 libs/libaom/src/aom_dsp/quantize.h create mode 100644 libs/libaom/src/aom_dsp/recenter.h create mode 100644 libs/libaom/src/aom_dsp/sad.c create mode 100644 libs/libaom/src/aom_dsp/sad_av1.c create mode 100644 libs/libaom/src/aom_dsp/simd/v128_intrinsics.h create mode 100644 libs/libaom/src/aom_dsp/simd/v128_intrinsics_arm.h create mode 100644 libs/libaom/src/aom_dsp/simd/v128_intrinsics_c.h create mode 100644 libs/libaom/src/aom_dsp/simd/v128_intrinsics_x86.h create mode 100644 
libs/libaom/src/aom_dsp/simd/v256_intrinsics.h create mode 100644 libs/libaom/src/aom_dsp/simd/v256_intrinsics_arm.h create mode 100644 libs/libaom/src/aom_dsp/simd/v256_intrinsics_c.h create mode 100644 libs/libaom/src/aom_dsp/simd/v256_intrinsics_v128.h create mode 100644 libs/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h create mode 100644 libs/libaom/src/aom_dsp/simd/v64_intrinsics.h create mode 100644 libs/libaom/src/aom_dsp/simd/v64_intrinsics_arm.h create mode 100644 libs/libaom/src/aom_dsp/simd/v64_intrinsics_c.h create mode 100644 libs/libaom/src/aom_dsp/simd/v64_intrinsics_x86.h create mode 100644 libs/libaom/src/aom_dsp/sse.c create mode 100644 libs/libaom/src/aom_dsp/ssim.c create mode 100644 libs/libaom/src/aom_dsp/ssim.h create mode 100644 libs/libaom/src/aom_dsp/subtract.c create mode 100644 libs/libaom/src/aom_dsp/sum_squares.c create mode 100644 libs/libaom/src/aom_dsp/txfm_common.h create mode 100644 libs/libaom/src/aom_dsp/variance.c create mode 100644 libs/libaom/src/aom_dsp/variance.h create mode 100644 libs/libaom/src/aom_dsp/vmaf.c create mode 100644 libs/libaom/src/aom_dsp/vmaf.h create mode 100644 libs/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/aom_asm_stubs.c create mode 100644 libs/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm create mode 100644 libs/libaom/src/aom_dsp/x86/avg_intrin_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/avg_intrin_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h create mode 100644 libs/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/blend_mask_sse4.h create mode 100644 libs/libaom/src/aom_dsp/x86/blend_sse4.h create mode 100644 libs/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/common_avx2.h create mode 100644 libs/libaom/src/aom_dsp/x86/convolve.h create mode 100644 libs/libaom/src/aom_dsp/x86/convolve_avx2.h create mode 100644 libs/libaom/src/aom_dsp/x86/convolve_common_intrin.h create mode 100644 libs/libaom/src/aom_dsp/x86/convolve_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/convolve_sse4_1.h create mode 100644 libs/libaom/src/aom_dsp/x86/fft_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/fft_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c create mode 
100644 libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_loopfilter_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_variance_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_variance_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/highbd_variance_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/intrapred_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/intrapred_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/intrapred_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/intrapred_x86.h create mode 100644 libs/libaom/src/aom_dsp/x86/inv_wht_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/loopfilter_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/lpf_common_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.h create mode 100644 libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.h create mode 100644 libs/libaom/src/aom_dsp/x86/mem_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h create mode 100644 libs/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h create mode 100644 libs/libaom/src/aom_dsp/x86/obmc_sad_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/obmc_sad_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/obmc_variance_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/obmc_variance_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm create mode 100644 libs/libaom/src/aom_dsp/x86/quantize_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/quantize_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm create mode 100644 libs/libaom/src/aom_dsp/x86/quantize_x86.h create mode 100644 libs/libaom/src/aom_dsp/x86/sad4d_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sad4d_sse2.asm create mode 100644 
libs/libaom/src/aom_dsp/x86/sad_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sad_highbd_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sad_impl_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sad_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/sse_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sse_sse4.c create mode 100644 libs/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm create mode 100644 libs/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/subtract_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/subtract_sse2.asm create mode 100644 libs/libaom/src/aom_dsp/x86/sum_squares_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sum_squares_sse2.c create mode 100644 libs/libaom/src/aom_dsp/x86/sum_squares_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/synonyms.h create mode 100644 libs/libaom/src/aom_dsp/x86/synonyms_avx2.h create mode 100644 libs/libaom/src/aom_dsp/x86/transpose_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/txfm_common_avx2.h create mode 100644 libs/libaom/src/aom_dsp/x86/txfm_common_sse2.h create mode 100644 libs/libaom/src/aom_dsp/x86/variance_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/variance_impl_avx2.c create mode 100644 libs/libaom/src/aom_dsp/x86/variance_impl_ssse3.c create mode 100644 libs/libaom/src/aom_dsp/x86/variance_sse2.c create mode 100644 libs/libaom/src/aom_mem/aom_mem.c create mode 100644 libs/libaom/src/aom_mem/aom_mem.cmake create mode 100644 libs/libaom/src/aom_mem/aom_mem.h create mode 100644 libs/libaom/src/aom_mem/include/aom_mem_intrnl.h create mode 100644 libs/libaom/src/aom_ports/aom_once.h create mode 100644 libs/libaom/src/aom_ports/aom_ports.cmake create mode 100644 libs/libaom/src/aom_ports/aom_timer.h create mode 100644 libs/libaom/src/aom_ports/arm.h create mode 100644 libs/libaom/src/aom_ports/arm_cpudetect.c create mode 100644 libs/libaom/src/aom_ports/bitops.h create mode 100644 libs/libaom/src/aom_ports/emmintrin_compat.h create mode 100644 libs/libaom/src/aom_ports/emms.asm create mode 100644 libs/libaom/src/aom_ports/mem.h create mode 100644 libs/libaom/src/aom_ports/mem_ops.h create mode 100644 libs/libaom/src/aom_ports/mem_ops_aligned.h create mode 100644 libs/libaom/src/aom_ports/msvc.h create mode 100644 libs/libaom/src/aom_ports/ppc.h create mode 100644 libs/libaom/src/aom_ports/ppc_cpudetect.c create mode 100644 libs/libaom/src/aom_ports/sanitizer.h create mode 100644 libs/libaom/src/aom_ports/system_state.h create mode 100644 libs/libaom/src/aom_ports/x86.h create mode 100644 libs/libaom/src/aom_ports/x86_abi_support.asm create mode 100644 libs/libaom/src/aom_scale/aom_scale.cmake create mode 100644 libs/libaom/src/aom_scale/aom_scale.h create mode 100644 libs/libaom/src/aom_scale/aom_scale_rtcd.c create mode 100644 libs/libaom/src/aom_scale/aom_scale_rtcd.pl create mode 100644 libs/libaom/src/aom_scale/generic/aom_scale.c create mode 100644 libs/libaom/src/aom_scale/generic/gen_scalers.c create mode 100644 libs/libaom/src/aom_scale/generic/yv12config.c create mode 100644 libs/libaom/src/aom_scale/generic/yv12extend.c create mode 100644 libs/libaom/src/aom_scale/mips/dspr2/yv12extend_dspr2.c create mode 100644 libs/libaom/src/aom_scale/yv12config.h create mode 100644 libs/libaom/src/aom_util/aom_thread.c create mode 100644 libs/libaom/src/aom_util/aom_thread.h create mode 100644 libs/libaom/src/aom_util/aom_util.cmake create mode 100644 libs/libaom/src/aom_util/debug_util.c create mode 100644 
libs/libaom/src/aom_util/debug_util.h create mode 100644 libs/libaom/src/aom_util/endian_inl.h create mode 100644 libs/libaom/src/apps/aomdec.c create mode 100644 libs/libaom/src/apps/aomenc.c create mode 100644 libs/libaom/src/apps/aomenc.h create mode 100644 libs/libaom/src/av1/av1.cmake create mode 100644 libs/libaom/src/av1/av1_cx_iface.c create mode 100644 libs/libaom/src/av1/av1_dx_iface.c create mode 100644 libs/libaom/src/av1/av1_iface_common.h create mode 100644 libs/libaom/src/av1/common/alloccommon.c create mode 100644 libs/libaom/src/av1/common/alloccommon.h create mode 100644 libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.c create mode 100644 libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.h create mode 100644 libs/libaom/src/av1/common/arm/av1_txfm_neon.c create mode 100644 libs/libaom/src/av1/common/arm/blend_a64_hmask_neon.c create mode 100644 libs/libaom/src/av1/common/arm/blend_a64_vmask_neon.c create mode 100644 libs/libaom/src/av1/common/arm/cfl_neon.c create mode 100644 libs/libaom/src/av1/common/arm/convolve_neon.c create mode 100644 libs/libaom/src/av1/common/arm/convolve_neon.h create mode 100644 libs/libaom/src/av1/common/arm/jnt_convolve_neon.c create mode 100644 libs/libaom/src/av1/common/arm/mem_neon.h create mode 100644 libs/libaom/src/av1/common/arm/reconinter_neon.c create mode 100644 libs/libaom/src/av1/common/arm/selfguided_neon.c create mode 100644 libs/libaom/src/av1/common/arm/transpose_neon.h create mode 100644 libs/libaom/src/av1/common/arm/warp_plane_neon.c create mode 100644 libs/libaom/src/av1/common/arm/wiener_convolve_neon.c create mode 100644 libs/libaom/src/av1/common/av1_common_int.h create mode 100644 libs/libaom/src/av1/common/av1_inv_txfm1d.c create mode 100644 libs/libaom/src/av1/common/av1_inv_txfm1d.h create mode 100644 libs/libaom/src/av1/common/av1_inv_txfm1d_cfg.h create mode 100644 libs/libaom/src/av1/common/av1_inv_txfm2d.c create mode 100644 libs/libaom/src/av1/common/av1_loopfilter.c create mode 100644 libs/libaom/src/av1/common/av1_loopfilter.h create mode 100644 libs/libaom/src/av1/common/av1_rtcd.c create mode 100644 libs/libaom/src/av1/common/av1_rtcd_defs.pl create mode 100644 libs/libaom/src/av1/common/av1_txfm.c create mode 100644 libs/libaom/src/av1/common/av1_txfm.h create mode 100644 libs/libaom/src/av1/common/blockd.c create mode 100644 libs/libaom/src/av1/common/blockd.h create mode 100644 libs/libaom/src/av1/common/cdef.c create mode 100644 libs/libaom/src/av1/common/cdef.h create mode 100644 libs/libaom/src/av1/common/cdef_block.c create mode 100644 libs/libaom/src/av1/common/cdef_block.h create mode 100644 libs/libaom/src/av1/common/cdef_block_avx2.c create mode 100644 libs/libaom/src/av1/common/cdef_block_neon.c create mode 100644 libs/libaom/src/av1/common/cdef_block_simd.h create mode 100644 libs/libaom/src/av1/common/cdef_block_sse2.c create mode 100644 libs/libaom/src/av1/common/cdef_block_sse4.c create mode 100644 libs/libaom/src/av1/common/cdef_block_ssse3.c create mode 100644 libs/libaom/src/av1/common/cfl.c create mode 100644 libs/libaom/src/av1/common/cfl.h create mode 100644 libs/libaom/src/av1/common/common.h create mode 100644 libs/libaom/src/av1/common/common_data.h create mode 100644 libs/libaom/src/av1/common/convolve.c create mode 100644 libs/libaom/src/av1/common/convolve.h create mode 100644 libs/libaom/src/av1/common/debugmodes.c create mode 100644 libs/libaom/src/av1/common/entropy.c create mode 100644 libs/libaom/src/av1/common/entropy.h create mode 100644 
libs/libaom/src/av1/common/entropymode.c create mode 100644 libs/libaom/src/av1/common/entropymode.h create mode 100644 libs/libaom/src/av1/common/entropymv.c create mode 100644 libs/libaom/src/av1/common/entropymv.h create mode 100644 libs/libaom/src/av1/common/enums.h create mode 100644 libs/libaom/src/av1/common/filter.h create mode 100644 libs/libaom/src/av1/common/frame_buffers.c create mode 100644 libs/libaom/src/av1/common/frame_buffers.h create mode 100644 libs/libaom/src/av1/common/idct.c create mode 100644 libs/libaom/src/av1/common/idct.h create mode 100644 libs/libaom/src/av1/common/loopfiltermask.c create mode 100644 libs/libaom/src/av1/common/mv.h create mode 100644 libs/libaom/src/av1/common/mvref_common.c create mode 100644 libs/libaom/src/av1/common/mvref_common.h create mode 100644 libs/libaom/src/av1/common/obmc.h create mode 100644 libs/libaom/src/av1/common/obu_util.c create mode 100644 libs/libaom/src/av1/common/obu_util.h create mode 100644 libs/libaom/src/av1/common/odintrin.c create mode 100644 libs/libaom/src/av1/common/odintrin.h create mode 100644 libs/libaom/src/av1/common/ppc/cfl_ppc.c create mode 100644 libs/libaom/src/av1/common/pred_common.c create mode 100644 libs/libaom/src/av1/common/pred_common.h create mode 100644 libs/libaom/src/av1/common/quant_common.c create mode 100644 libs/libaom/src/av1/common/quant_common.h create mode 100644 libs/libaom/src/av1/common/reconinter.c create mode 100644 libs/libaom/src/av1/common/reconinter.h create mode 100644 libs/libaom/src/av1/common/reconintra.c create mode 100644 libs/libaom/src/av1/common/reconintra.h create mode 100644 libs/libaom/src/av1/common/resize.c create mode 100644 libs/libaom/src/av1/common/resize.h create mode 100644 libs/libaom/src/av1/common/restoration.c create mode 100644 libs/libaom/src/av1/common/restoration.h create mode 100644 libs/libaom/src/av1/common/scale.c create mode 100644 libs/libaom/src/av1/common/scale.h create mode 100644 libs/libaom/src/av1/common/scan.c create mode 100644 libs/libaom/src/av1/common/scan.h create mode 100644 libs/libaom/src/av1/common/seg_common.c create mode 100644 libs/libaom/src/av1/common/seg_common.h create mode 100644 libs/libaom/src/av1/common/thread_common.c create mode 100644 libs/libaom/src/av1/common/thread_common.h create mode 100644 libs/libaom/src/av1/common/tile_common.c create mode 100644 libs/libaom/src/av1/common/tile_common.h create mode 100644 libs/libaom/src/av1/common/timing.c create mode 100644 libs/libaom/src/av1/common/timing.h create mode 100644 libs/libaom/src/av1/common/token_cdfs.h create mode 100644 libs/libaom/src/av1/common/txb_common.c create mode 100644 libs/libaom/src/av1/common/txb_common.h create mode 100644 libs/libaom/src/av1/common/warped_motion.c create mode 100644 libs/libaom/src/av1/common/warped_motion.h create mode 100644 libs/libaom/src/av1/common/x86/av1_convolve_horiz_rs_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h create mode 100644 libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c create mode 100644 libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h create mode 100644 libs/libaom/src/av1/common/x86/av1_txfm_sse2.h create mode 100644 libs/libaom/src/av1/common/x86/av1_txfm_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/av1_txfm_sse4.h create mode 100644 libs/libaom/src/av1/common/x86/cfl_avx2.c create mode 100644 
libs/libaom/src/av1/common/x86/cfl_simd.h create mode 100644 libs/libaom/src/av1/common/x86/cfl_sse2.c create mode 100644 libs/libaom/src/av1/common/x86/cfl_ssse3.c create mode 100644 libs/libaom/src/av1/common/x86/convolve_2d_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/convolve_2d_sse2.c create mode 100644 libs/libaom/src/av1/common/x86/convolve_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/convolve_sse2.c create mode 100644 libs/libaom/src/av1/common/x86/filterintra_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h create mode 100644 libs/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_wiener_convolve_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/highbd_wiener_convolve_ssse3.c create mode 100644 libs/libaom/src/av1/common/x86/intra_edge_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/jnt_convolve_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/jnt_convolve_sse2.c create mode 100644 libs/libaom/src/av1/common/x86/jnt_convolve_ssse3.c create mode 100644 libs/libaom/src/av1/common/x86/reconinter_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/reconinter_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/reconinter_ssse3.c create mode 100644 libs/libaom/src/av1/common/x86/selfguided_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/selfguided_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/warp_plane_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/warp_plane_sse2.c create mode 100644 libs/libaom/src/av1/common/x86/warp_plane_sse4.c create mode 100644 libs/libaom/src/av1/common/x86/wiener_convolve_avx2.c create mode 100644 libs/libaom/src/av1/common/x86/wiener_convolve_sse2.c create mode 100644 libs/libaom/src/av1/decoder/accounting.c create mode 100644 libs/libaom/src/av1/decoder/accounting.h create mode 100644 libs/libaom/src/av1/decoder/decodeframe.c create mode 100644 libs/libaom/src/av1/decoder/decodeframe.h create mode 100644 libs/libaom/src/av1/decoder/decodemv.c create mode 100644 libs/libaom/src/av1/decoder/decodemv.h create mode 100644 libs/libaom/src/av1/decoder/decoder.c create mode 100644 libs/libaom/src/av1/decoder/decoder.h create mode 100644 libs/libaom/src/av1/decoder/decodetxb.c create mode 100644 libs/libaom/src/av1/decoder/decodetxb.h create mode 100644 libs/libaom/src/av1/decoder/detokenize.c create mode 100644 libs/libaom/src/av1/decoder/detokenize.h create mode 100644 libs/libaom/src/av1/decoder/dthread.h create mode 100644 libs/libaom/src/av1/decoder/inspection.c create mode 100644 libs/libaom/src/av1/decoder/inspection.h create mode 100644 libs/libaom/src/av1/decoder/obu.c create mode 100644 libs/libaom/src/av1/decoder/obu.h create mode 100644 libs/libaom/src/av1/encoder/aq_complexity.c create mode 100644 libs/libaom/src/av1/encoder/aq_complexity.h create mode 100644 
libs/libaom/src/av1/encoder/aq_cyclicrefresh.c create mode 100644 libs/libaom/src/av1/encoder/aq_cyclicrefresh.h create mode 100644 libs/libaom/src/av1/encoder/aq_variance.c create mode 100644 libs/libaom/src/av1/encoder/aq_variance.h create mode 100644 libs/libaom/src/av1/encoder/arm/neon/av1_error_neon.c create mode 100644 libs/libaom/src/av1/encoder/arm/neon/quantize_neon.c create mode 100644 libs/libaom/src/av1/encoder/av1_fwd_txfm1d.c create mode 100644 libs/libaom/src/av1/encoder/av1_fwd_txfm1d.h create mode 100644 libs/libaom/src/av1/encoder/av1_fwd_txfm1d_cfg.h create mode 100644 libs/libaom/src/av1/encoder/av1_fwd_txfm2d.c create mode 100644 libs/libaom/src/av1/encoder/av1_multi_thread.c create mode 100644 libs/libaom/src/av1/encoder/av1_multi_thread.h create mode 100644 libs/libaom/src/av1/encoder/av1_quantize.c create mode 100644 libs/libaom/src/av1/encoder/av1_quantize.h create mode 100644 libs/libaom/src/av1/encoder/bitstream.c create mode 100644 libs/libaom/src/av1/encoder/bitstream.h create mode 100644 libs/libaom/src/av1/encoder/block.h create mode 100644 libs/libaom/src/av1/encoder/blockiness.c create mode 100644 libs/libaom/src/av1/encoder/cnn.c create mode 100644 libs/libaom/src/av1/encoder/cnn.h create mode 100644 libs/libaom/src/av1/encoder/compound_type.c create mode 100644 libs/libaom/src/av1/encoder/compound_type.h create mode 100644 libs/libaom/src/av1/encoder/context_tree.c create mode 100644 libs/libaom/src/av1/encoder/context_tree.h create mode 100644 libs/libaom/src/av1/encoder/corner_detect.c create mode 100644 libs/libaom/src/av1/encoder/corner_detect.h create mode 100644 libs/libaom/src/av1/encoder/corner_match.c create mode 100644 libs/libaom/src/av1/encoder/corner_match.h create mode 100644 libs/libaom/src/av1/encoder/cost.c create mode 100644 libs/libaom/src/av1/encoder/cost.h create mode 100644 libs/libaom/src/av1/encoder/dwt.c create mode 100644 libs/libaom/src/av1/encoder/dwt.h create mode 100644 libs/libaom/src/av1/encoder/enc_enums.h create mode 100644 libs/libaom/src/av1/encoder/encode_strategy.c create mode 100644 libs/libaom/src/av1/encoder/encode_strategy.h create mode 100644 libs/libaom/src/av1/encoder/encodeframe.c create mode 100644 libs/libaom/src/av1/encoder/encodeframe.h create mode 100644 libs/libaom/src/av1/encoder/encodemb.c create mode 100644 libs/libaom/src/av1/encoder/encodemb.h create mode 100644 libs/libaom/src/av1/encoder/encodemv.c create mode 100644 libs/libaom/src/av1/encoder/encodemv.h create mode 100644 libs/libaom/src/av1/encoder/encoder.c create mode 100644 libs/libaom/src/av1/encoder/encoder.h create mode 100644 libs/libaom/src/av1/encoder/encodetxb.c create mode 100644 libs/libaom/src/av1/encoder/encodetxb.h create mode 100644 libs/libaom/src/av1/encoder/ethread.c create mode 100644 libs/libaom/src/av1/encoder/ethread.h create mode 100644 libs/libaom/src/av1/encoder/extend.c create mode 100644 libs/libaom/src/av1/encoder/extend.h create mode 100644 libs/libaom/src/av1/encoder/firstpass.c create mode 100644 libs/libaom/src/av1/encoder/firstpass.h create mode 100644 libs/libaom/src/av1/encoder/global_motion.c create mode 100644 libs/libaom/src/av1/encoder/global_motion.h create mode 100644 libs/libaom/src/av1/encoder/gop_structure.c create mode 100644 libs/libaom/src/av1/encoder/gop_structure.h create mode 100644 libs/libaom/src/av1/encoder/grain_test_vectors.h create mode 100644 libs/libaom/src/av1/encoder/hash.c create mode 100644 libs/libaom/src/av1/encoder/hash.h create mode 100644 
libs/libaom/src/av1/encoder/hash_motion.c create mode 100644 libs/libaom/src/av1/encoder/hash_motion.h create mode 100644 libs/libaom/src/av1/encoder/hybrid_fwd_txfm.c create mode 100644 libs/libaom/src/av1/encoder/hybrid_fwd_txfm.h create mode 100644 libs/libaom/src/av1/encoder/interp_search.c create mode 100644 libs/libaom/src/av1/encoder/interp_search.h create mode 100644 libs/libaom/src/av1/encoder/intra_mode_search.c create mode 100644 libs/libaom/src/av1/encoder/intra_mode_search.h create mode 100644 libs/libaom/src/av1/encoder/k_means_template.h create mode 100644 libs/libaom/src/av1/encoder/level.c create mode 100644 libs/libaom/src/av1/encoder/level.h create mode 100644 libs/libaom/src/av1/encoder/lookahead.c create mode 100644 libs/libaom/src/av1/encoder/lookahead.h create mode 100644 libs/libaom/src/av1/encoder/mathutils.h create mode 100644 libs/libaom/src/av1/encoder/mcomp.c create mode 100644 libs/libaom/src/av1/encoder/mcomp.h create mode 100644 libs/libaom/src/av1/encoder/mips/msa/error_msa.c create mode 100644 libs/libaom/src/av1/encoder/mips/msa/fdct4x4_msa.c create mode 100644 libs/libaom/src/av1/encoder/mips/msa/temporal_filter_msa.c create mode 100644 libs/libaom/src/av1/encoder/misc_model_weights.h create mode 100644 libs/libaom/src/av1/encoder/ml.c create mode 100644 libs/libaom/src/av1/encoder/ml.h create mode 100644 libs/libaom/src/av1/encoder/mode_prune_model_weights.h create mode 100644 libs/libaom/src/av1/encoder/model_rd.h create mode 100644 libs/libaom/src/av1/encoder/motion_search_facade.c create mode 100644 libs/libaom/src/av1/encoder/motion_search_facade.h create mode 100644 libs/libaom/src/av1/encoder/mv_prec.c create mode 100644 libs/libaom/src/av1/encoder/mv_prec.h create mode 100644 libs/libaom/src/av1/encoder/nonrd_pickmode.c create mode 100644 libs/libaom/src/av1/encoder/palette.c create mode 100644 libs/libaom/src/av1/encoder/palette.h create mode 100644 libs/libaom/src/av1/encoder/partition_cnn_weights.h create mode 100644 libs/libaom/src/av1/encoder/partition_model_weights.h create mode 100644 libs/libaom/src/av1/encoder/partition_strategy.c create mode 100644 libs/libaom/src/av1/encoder/partition_strategy.h create mode 100644 libs/libaom/src/av1/encoder/pass2_strategy.c create mode 100644 libs/libaom/src/av1/encoder/pass2_strategy.h create mode 100644 libs/libaom/src/av1/encoder/pickcdef.c create mode 100644 libs/libaom/src/av1/encoder/picklpf.c create mode 100644 libs/libaom/src/av1/encoder/picklpf.h create mode 100644 libs/libaom/src/av1/encoder/pickrst.c create mode 100644 libs/libaom/src/av1/encoder/pickrst.h create mode 100644 libs/libaom/src/av1/encoder/pustats.h create mode 100644 libs/libaom/src/av1/encoder/random.h create mode 100644 libs/libaom/src/av1/encoder/ransac.c create mode 100644 libs/libaom/src/av1/encoder/ransac.h create mode 100644 libs/libaom/src/av1/encoder/ratectrl.c create mode 100644 libs/libaom/src/av1/encoder/ratectrl.h create mode 100644 libs/libaom/src/av1/encoder/rd.c create mode 100644 libs/libaom/src/av1/encoder/rd.h create mode 100644 libs/libaom/src/av1/encoder/rdopt.c create mode 100644 libs/libaom/src/av1/encoder/rdopt.h create mode 100644 libs/libaom/src/av1/encoder/rdopt_data_defs.h create mode 100644 libs/libaom/src/av1/encoder/rdopt_utils.h create mode 100644 libs/libaom/src/av1/encoder/reconinter_enc.c create mode 100644 libs/libaom/src/av1/encoder/reconinter_enc.h create mode 100644 libs/libaom/src/av1/encoder/segmentation.c create mode 100644 libs/libaom/src/av1/encoder/segmentation.h create mode 100644 
libs/libaom/src/av1/encoder/speed_features.c create mode 100644 libs/libaom/src/av1/encoder/speed_features.h create mode 100644 libs/libaom/src/av1/encoder/svc_layercontext.c create mode 100644 libs/libaom/src/av1/encoder/svc_layercontext.h create mode 100644 libs/libaom/src/av1/encoder/temporal_filter.c create mode 100644 libs/libaom/src/av1/encoder/temporal_filter.h create mode 100644 libs/libaom/src/av1/encoder/tokenize.c create mode 100644 libs/libaom/src/av1/encoder/tokenize.h create mode 100644 libs/libaom/src/av1/encoder/tpl_model.c create mode 100644 libs/libaom/src/av1/encoder/tpl_model.h create mode 100644 libs/libaom/src/av1/encoder/tune_vmaf.c create mode 100644 libs/libaom/src/av1/encoder/tune_vmaf.h create mode 100644 libs/libaom/src/av1/encoder/tx_prune_model_weights.h create mode 100644 libs/libaom/src/av1/encoder/tx_search.c create mode 100644 libs/libaom/src/av1/encoder/tx_search.h create mode 100644 libs/libaom/src/av1/encoder/use_flat_gop_model_params.h create mode 100644 libs/libaom/src/av1/encoder/var_based_part.c create mode 100644 libs/libaom/src/av1/encoder/var_based_part.h create mode 100644 libs/libaom/src/av1/encoder/wedge_utils.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_avx2.h create mode 100644 libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.h create mode 100644 libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_quantize_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_quantize_sse2.c create mode 100644 libs/libaom/src/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm create mode 100644 libs/libaom/src/av1/encoder/x86/av1_ssim_opt_x86_64.asm create mode 100644 libs/libaom/src/av1/encoder/x86/av1_txfm1d_sse4.h create mode 100644 libs/libaom/src/av1/encoder/x86/corner_match_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/corner_match_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/dct_sse2.asm create mode 100644 libs/libaom/src/av1/encoder/x86/encodetxb_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/encodetxb_sse2.c create mode 100644 libs/libaom/src/av1/encoder/x86/encodetxb_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/error_intrin_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/error_sse2.asm create mode 100644 libs/libaom/src/av1/encoder/x86/hash_sse42.c create mode 100644 libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_sse2.c create mode 100644 libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/ml_sse3.c create mode 100644 libs/libaom/src/av1/encoder/x86/pickrst_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/pickrst_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/rdopt_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/rdopt_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/temporal_filter_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/temporal_filter_constants.h create mode 100644 
libs/libaom/src/av1/encoder/x86/temporal_filter_sse2.c create mode 100644 libs/libaom/src/av1/encoder/x86/temporal_filter_sse4.c create mode 100644 libs/libaom/src/av1/encoder/x86/wedge_utils_avx2.c create mode 100644 libs/libaom/src/av1/encoder/x86/wedge_utils_sse2.c create mode 100644 libs/libaom/src/av1/exports_com create mode 100644 libs/libaom/src/av1/exports_dec create mode 100644 libs/libaom/src/av1/exports_enc create mode 100644 libs/libaom/src/av1/exports_ident create mode 100644 libs/libaom/src/av1/exports_test create mode 100644 libs/libaom/src/build/.gitattributes create mode 100644 libs/libaom/src/build/cmake/aom_config.c.template create mode 100644 libs/libaom/src/build/cmake/aom_config_defaults.cmake create mode 100644 libs/libaom/src/build/cmake/aom_configure.cmake create mode 100644 libs/libaom/src/build/cmake/aom_experiment_deps.cmake create mode 100644 libs/libaom/src/build/cmake/aom_install.cmake create mode 100644 libs/libaom/src/build/cmake/aom_optimization.cmake create mode 100644 libs/libaom/src/build/cmake/compiler_flags.cmake create mode 100644 libs/libaom/src/build/cmake/compiler_tests.cmake create mode 100644 libs/libaom/src/build/cmake/cpu.cmake create mode 100644 libs/libaom/src/build/cmake/dist.cmake create mode 100644 libs/libaom/src/build/cmake/exports.cmake create mode 100644 libs/libaom/src/build/cmake/exports_sources.cmake create mode 100644 libs/libaom/src/build/cmake/generate_aom_config_templates.cmake create mode 100644 libs/libaom/src/build/cmake/generate_exports.cmake create mode 100644 libs/libaom/src/build/cmake/ios-Info.plist create mode 100644 libs/libaom/src/build/cmake/iosbuild.sh create mode 100644 libs/libaom/src/build/cmake/msvc_runtime.cmake create mode 100644 libs/libaom/src/build/cmake/pkg_config.cmake create mode 100644 libs/libaom/src/build/cmake/rtcd.pl create mode 100644 libs/libaom/src/build/cmake/sanitizers.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/arm-ios-common.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/arm64-android-clang.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/arm64-ios.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/arm64-linux-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/arm64-mingw-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/armv7-ios.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/armv7-linux-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/armv7-mingw-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/armv7s-ios.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/ios-simulator-common.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/mips32-linux-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/mips64-linux-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/ppc-linux-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/x86-ios-simulator.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/x86-linux.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/x86-macos.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/x86-mingw-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/x86_64-ios-simulator.cmake create mode 100644 libs/libaom/src/build/cmake/toolchains/x86_64-mingw-gcc.cmake create mode 100644 libs/libaom/src/build/cmake/util.cmake create mode 100644 libs/libaom/src/build/cmake/version.cmake create mode 100644 
libs/libaom/src/build/cmake/version.pl create mode 100644 libs/libaom/src/codereview.settings create mode 100644 libs/libaom/src/common/args.c create mode 100644 libs/libaom/src/common/args.h create mode 100644 libs/libaom/src/common/av1_config.c create mode 100644 libs/libaom/src/common/av1_config.h create mode 100644 libs/libaom/src/common/ivfdec.c create mode 100644 libs/libaom/src/common/ivfdec.h create mode 100644 libs/libaom/src/common/ivfenc.c create mode 100644 libs/libaom/src/common/ivfenc.h create mode 100644 libs/libaom/src/common/md5_utils.c create mode 100644 libs/libaom/src/common/md5_utils.h create mode 100644 libs/libaom/src/common/obudec.c create mode 100644 libs/libaom/src/common/obudec.h create mode 100644 libs/libaom/src/common/rawenc.c create mode 100644 libs/libaom/src/common/rawenc.h create mode 100644 libs/libaom/src/common/tools_common.c create mode 100644 libs/libaom/src/common/tools_common.h create mode 100644 libs/libaom/src/common/video_common.h create mode 100644 libs/libaom/src/common/video_reader.c create mode 100644 libs/libaom/src/common/video_reader.h create mode 100644 libs/libaom/src/common/video_writer.c create mode 100644 libs/libaom/src/common/video_writer.h create mode 100644 libs/libaom/src/common/warnings.c create mode 100644 libs/libaom/src/common/warnings.h create mode 100644 libs/libaom/src/common/webmdec.cc create mode 100644 libs/libaom/src/common/webmdec.h create mode 100644 libs/libaom/src/common/webmenc.cc create mode 100644 libs/libaom/src/common/webmenc.h create mode 100644 libs/libaom/src/common/y4menc.c create mode 100644 libs/libaom/src/common/y4menc.h create mode 100644 libs/libaom/src/common/y4minput.c create mode 100644 libs/libaom/src/common/y4minput.h create mode 100644 libs/libaom/src/docs.cmake create mode 100644 libs/libaom/src/examples/analyzer.cc create mode 100644 libs/libaom/src/examples/aom_cx_set_ref.c create mode 100644 libs/libaom/src/examples/av1_dec_fuzzer.cc create mode 100644 libs/libaom/src/examples/build_av1_dec_fuzzer.sh create mode 100644 libs/libaom/src/examples/decode_to_md5.c create mode 100644 libs/libaom/src/examples/decode_with_drops.c create mode 100644 libs/libaom/src/examples/encoder_util.c create mode 100644 libs/libaom/src/examples/encoder_util.h create mode 100644 libs/libaom/src/examples/inspect.c create mode 100644 libs/libaom/src/examples/lightfield_bitstream_parsing.c create mode 100644 libs/libaom/src/examples/lightfield_decoder.c create mode 100644 libs/libaom/src/examples/lightfield_encoder.c create mode 100644 libs/libaom/src/examples/lightfield_tile_list_decoder.c create mode 100644 libs/libaom/src/examples/lossless_encoder.c create mode 100644 libs/libaom/src/examples/noise_model.c create mode 100644 libs/libaom/src/examples/resize_util.c create mode 100644 libs/libaom/src/examples/scalable_decoder.c create mode 100644 libs/libaom/src/examples/scalable_encoder.c create mode 100644 libs/libaom/src/examples/set_maps.c create mode 100644 libs/libaom/src/examples/simple_decoder.c create mode 100644 libs/libaom/src/examples/simple_encoder.c create mode 100644 libs/libaom/src/examples/svc_encoder_rtc.c create mode 100644 libs/libaom/src/examples/twopass_encoder.c create mode 100644 libs/libaom/src/keywords.dox create mode 100644 libs/libaom/src/libs.doxy_template create mode 100644 libs/libaom/src/mainpage.dox create mode 100644 libs/libaom/src/stats/aomstats.c create mode 100644 libs/libaom/src/stats/aomstats.h create mode 100644 libs/libaom/src/stats/rate_hist.c create mode 100644 
libs/libaom/src/stats/rate_hist.h create mode 100644 libs/libaom/src/test/accounting_test.cc create mode 100644 libs/libaom/src/test/acm_random.h create mode 100644 libs/libaom/src/test/active_map_test.cc create mode 100644 libs/libaom/src/test/altref_test.cc create mode 100644 libs/libaom/src/test/aom_integer_test.cc create mode 100644 libs/libaom/src/test/aomcx_set_ref.sh create mode 100644 libs/libaom/src/test/aomdec.sh create mode 100644 libs/libaom/src/test/aomenc.sh create mode 100644 libs/libaom/src/test/aq_segment_test.cc create mode 100644 libs/libaom/src/test/arf_freq_test.cc create mode 100644 libs/libaom/src/test/av1_common_int_test.cc create mode 100644 libs/libaom/src/test/av1_config_test.cc create mode 100644 libs/libaom/src/test/av1_convolve_2d_test.cc create mode 100644 libs/libaom/src/test/av1_convolve_2d_test_util.cc create mode 100644 libs/libaom/src/test/av1_convolve_2d_test_util.h create mode 100644 libs/libaom/src/test/av1_convolve_scale_test.cc create mode 100644 libs/libaom/src/test/av1_encoder_parms_get_to_decoder.cc create mode 100644 libs/libaom/src/test/av1_ext_tile_test.cc create mode 100644 libs/libaom/src/test/av1_fwd_txfm1d_test.cc create mode 100644 libs/libaom/src/test/av1_fwd_txfm2d_test.cc create mode 100644 libs/libaom/src/test/av1_highbd_iht_test.cc create mode 100644 libs/libaom/src/test/av1_horz_only_frame_superres_test.cc create mode 100644 libs/libaom/src/test/av1_inv_txfm1d_test.cc create mode 100644 libs/libaom/src/test/av1_inv_txfm2d_test.cc create mode 100644 libs/libaom/src/test/av1_nn_predict_test.cc create mode 100644 libs/libaom/src/test/av1_quantize_test.cc create mode 100644 libs/libaom/src/test/av1_round_shift_array_test.cc create mode 100644 libs/libaom/src/test/av1_txfm_test.cc create mode 100644 libs/libaom/src/test/av1_txfm_test.h create mode 100644 libs/libaom/src/test/av1_wedge_utils_test.cc create mode 100644 libs/libaom/src/test/avg_test.cc create mode 100644 libs/libaom/src/test/best_encode.sh create mode 100644 libs/libaom/src/test/binary_codes_test.cc create mode 100644 libs/libaom/src/test/blend_a64_mask_1d_test.cc create mode 100644 libs/libaom/src/test/blend_a64_mask_test.cc create mode 100644 libs/libaom/src/test/blockd_test.cc create mode 100644 libs/libaom/src/test/boolcoder_test.cc create mode 100644 libs/libaom/src/test/borders_test.cc create mode 100644 libs/libaom/src/test/cdef_test.cc create mode 100644 libs/libaom/src/test/cfl_test.cc create mode 100644 libs/libaom/src/test/clear_system_state.h create mode 100644 libs/libaom/src/test/cnn_test.cc create mode 100644 libs/libaom/src/test/codec_factory.h create mode 100644 libs/libaom/src/test/coding_path_sync.cc create mode 100644 libs/libaom/src/test/comp_avg_pred_test.cc create mode 100644 libs/libaom/src/test/comp_avg_pred_test.h create mode 100644 libs/libaom/src/test/comp_mask_variance_test.cc create mode 100644 libs/libaom/src/test/convolve_round_test.cc create mode 100644 libs/libaom/src/test/convolve_test.cc create mode 100644 libs/libaom/src/test/corner_match_test.cc create mode 100644 libs/libaom/src/test/cpu_speed_test.cc create mode 100644 libs/libaom/src/test/datarate_test.cc create mode 100644 libs/libaom/src/test/datarate_test.h create mode 100644 libs/libaom/src/test/decode_api_test.cc create mode 100644 libs/libaom/src/test/decode_multithreaded_test.cc create mode 100644 libs/libaom/src/test/decode_perf_test.cc create mode 100644 libs/libaom/src/test/decode_test_driver.cc create mode 100644 libs/libaom/src/test/decode_test_driver.h create mode 100644 
libs/libaom/src/test/decode_to_md5.sh create mode 100644 libs/libaom/src/test/decode_with_drops.sh create mode 100644 libs/libaom/src/test/divu_small_test.cc create mode 100644 libs/libaom/src/test/dr_prediction_test.cc create mode 100644 libs/libaom/src/test/dump_obu.sh create mode 100644 libs/libaom/src/test/ec_test.cc create mode 100644 libs/libaom/src/test/edge_detect_test.cc create mode 100644 libs/libaom/src/test/encode_api_test.cc create mode 100644 libs/libaom/src/test/encode_perf_test.cc create mode 100644 libs/libaom/src/test/encode_test_driver.cc create mode 100644 libs/libaom/src/test/encode_test_driver.h create mode 100644 libs/libaom/src/test/encodetxb_test.cc create mode 100644 libs/libaom/src/test/end_to_end_test.cc create mode 100644 libs/libaom/src/test/error_block_test.cc create mode 100644 libs/libaom/src/test/error_resilience_test.cc create mode 100644 libs/libaom/src/test/ethread_test.cc create mode 100644 libs/libaom/src/test/examples.sh create mode 100644 libs/libaom/src/test/external_frame_buffer_test.cc create mode 100644 libs/libaom/src/test/fdct4x4_test.cc create mode 100644 libs/libaom/src/test/fft_test.cc create mode 100644 libs/libaom/src/test/film_grain_table_test.cc create mode 100644 libs/libaom/src/test/filterintra_test.cc create mode 100644 libs/libaom/src/test/frame_error_test.cc create mode 100644 libs/libaom/src/test/frame_size_tests.cc create mode 100644 libs/libaom/src/test/function_equivalence_test.h create mode 100644 libs/libaom/src/test/fwd_kf_test.cc create mode 100644 libs/libaom/src/test/fwht4x4_test.cc create mode 100644 libs/libaom/src/test/gf_pyr_height_test.cc create mode 100644 libs/libaom/src/test/gviz_api.py create mode 100644 libs/libaom/src/test/hadamard_test.cc create mode 100644 libs/libaom/src/test/hash_test.cc create mode 100644 libs/libaom/src/test/hbd_metrics_test.cc create mode 100644 libs/libaom/src/test/hiprec_convolve_test.cc create mode 100644 libs/libaom/src/test/hiprec_convolve_test_util.cc create mode 100644 libs/libaom/src/test/hiprec_convolve_test_util.h create mode 100644 libs/libaom/src/test/horver_correlation_test.cc create mode 100644 libs/libaom/src/test/horz_superres_test.cc create mode 100644 libs/libaom/src/test/i420_video_source.h create mode 100644 libs/libaom/src/test/intra_edge_test.cc create mode 100644 libs/libaom/src/test/intrabc_test.cc create mode 100644 libs/libaom/src/test/intrapred_test.cc create mode 100644 libs/libaom/src/test/invalid_file_test.cc create mode 100644 libs/libaom/src/test/ivf_video_source.h create mode 100644 libs/libaom/src/test/level_test.cc create mode 100644 libs/libaom/src/test/lightfield_test.sh create mode 100644 libs/libaom/src/test/log2_test.cc create mode 100644 libs/libaom/src/test/lossless_test.cc create mode 100644 libs/libaom/src/test/lpf_test.cc create mode 100644 libs/libaom/src/test/masked_sad_test.cc create mode 100644 libs/libaom/src/test/masked_variance_test.cc create mode 100644 libs/libaom/src/test/md5_helper.h create mode 100644 libs/libaom/src/test/metadata_test.cc create mode 100644 libs/libaom/src/test/metrics_template.html create mode 100644 libs/libaom/src/test/monochrome_test.cc create mode 100644 libs/libaom/src/test/motion_vector_test.cc create mode 100644 libs/libaom/src/test/noise_model_test.cc create mode 100644 libs/libaom/src/test/obmc_sad_test.cc create mode 100644 libs/libaom/src/test/obmc_variance_test.cc create mode 100644 libs/libaom/src/test/pickrst_test.cc create mode 100644 libs/libaom/src/test/qm_test.cc create mode 100644 
libs/libaom/src/test/quantize_func_test.cc create mode 100644 libs/libaom/src/test/reconinter_test.cc create mode 100644 libs/libaom/src/test/register_state_check.h create mode 100644 libs/libaom/src/test/resize_test.cc create mode 100644 libs/libaom/src/test/rt_end_to_end_test.cc create mode 100644 libs/libaom/src/test/run_encodes.sh create mode 100644 libs/libaom/src/test/sad_test.cc create mode 100644 libs/libaom/src/test/sb_multipass_test.cc create mode 100644 libs/libaom/src/test/scalability_test.cc create mode 100644 libs/libaom/src/test/scan_test.cc create mode 100644 libs/libaom/src/test/segment_binarization_sync.cc create mode 100644 libs/libaom/src/test/selfguided_filter_test.cc create mode 100644 libs/libaom/src/test/set_maps.sh create mode 100644 libs/libaom/src/test/simd_avx2_test.cc create mode 100644 libs/libaom/src/test/simd_cmp_avx2.cc create mode 100644 libs/libaom/src/test/simd_cmp_impl.h create mode 100644 libs/libaom/src/test/simd_cmp_neon.cc create mode 100644 libs/libaom/src/test/simd_cmp_sse2.cc create mode 100644 libs/libaom/src/test/simd_cmp_sse4.cc create mode 100644 libs/libaom/src/test/simd_cmp_ssse3.cc create mode 100644 libs/libaom/src/test/simd_impl.h create mode 100644 libs/libaom/src/test/simd_neon_test.cc create mode 100644 libs/libaom/src/test/simd_sse2_test.cc create mode 100644 libs/libaom/src/test/simd_sse4_test.cc create mode 100644 libs/libaom/src/test/simd_ssse3_test.cc create mode 100644 libs/libaom/src/test/simple_decoder.sh create mode 100644 libs/libaom/src/test/simple_encoder.sh create mode 100644 libs/libaom/src/test/subtract_test.cc create mode 100644 libs/libaom/src/test/sum_squares_test.cc create mode 100644 libs/libaom/src/test/superframe_test.cc create mode 100644 libs/libaom/src/test/svc_datarate_test.cc create mode 100644 libs/libaom/src/test/temporal_filter_planewise_test.cc create mode 100644 libs/libaom/src/test/temporal_filter_yuv_test.cc create mode 100644 libs/libaom/src/test/test-data.sha1 create mode 100644 libs/libaom/src/test/test.cmake create mode 100644 libs/libaom/src/test/test_data_download_worker.cmake create mode 100644 libs/libaom/src/test/test_data_util.cmake create mode 100644 libs/libaom/src/test/test_intra_pred_speed.cc create mode 100644 libs/libaom/src/test/test_libaom.cc create mode 100644 libs/libaom/src/test/test_runner.cmake create mode 100644 libs/libaom/src/test/test_vector_test.cc create mode 100644 libs/libaom/src/test/test_vectors.cc create mode 100644 libs/libaom/src/test/test_vectors.h create mode 100644 libs/libaom/src/test/tile_independence_test.cc create mode 100644 libs/libaom/src/test/time_stamp_test.cc create mode 100644 libs/libaom/src/test/tools_common.sh create mode 100644 libs/libaom/src/test/transform_test_base.h create mode 100644 libs/libaom/src/test/twopass_encoder.sh create mode 100644 libs/libaom/src/test/util.h create mode 100644 libs/libaom/src/test/variance_test.cc create mode 100644 libs/libaom/src/test/video_source.h create mode 100644 libs/libaom/src/test/visual_metrics.py create mode 100644 libs/libaom/src/test/warp_filter_test.cc create mode 100644 libs/libaom/src/test/warp_filter_test_util.cc create mode 100644 libs/libaom/src/test/warp_filter_test_util.h create mode 100644 libs/libaom/src/test/webm_video_source.h create mode 100644 libs/libaom/src/test/wiener_test.cc create mode 100644 libs/libaom/src/test/y4m_test.cc create mode 100644 libs/libaom/src/test/y4m_video_source.h create mode 100644 libs/libaom/src/test/yuv_video_source.h create mode 100644 
libs/libaom/src/third_party/fastfeat/LICENSE create mode 100644 libs/libaom/src/third_party/fastfeat/README.libaom create mode 100644 libs/libaom/src/third_party/fastfeat/fast.c create mode 100644 libs/libaom/src/third_party/fastfeat/fast.h create mode 100644 libs/libaom/src/third_party/fastfeat/fast_9.c create mode 100644 libs/libaom/src/third_party/fastfeat/nonmax.c create mode 100644 libs/libaom/src/third_party/googletest/README.libaom create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/CHANGES create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/CMakeLists.txt create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/CONTRIBUTORS create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/LICENSE create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/README.md create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/cmake/Config.cmake.in create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest.pc.in create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest_main.pc.in create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/cmake/internal_utils.cmake create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-message.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-printers.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-spi.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_prod.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h create mode 100644 
libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-all.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-death-test.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-filepath.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-internal-inl.h create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-matchers.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-port.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-printers.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-test-part.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest-typed-test.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest.cc create mode 100644 libs/libaom/src/third_party/googletest/src/googletest/src/gtest_main.cc create mode 100644 libs/libaom/src/third_party/libwebm/AUTHORS.TXT create mode 100644 libs/libaom/src/third_party/libwebm/Android.mk create mode 100644 libs/libaom/src/third_party/libwebm/LICENSE.TXT create mode 100644 libs/libaom/src/third_party/libwebm/PATENTS.TXT create mode 100644 libs/libaom/src/third_party/libwebm/README.libaom create mode 100644 libs/libaom/src/third_party/libwebm/common/file_util.cc create mode 100644 libs/libaom/src/third_party/libwebm/common/file_util.h create mode 100644 libs/libaom/src/third_party/libwebm/common/hdr_util.cc create mode 100644 libs/libaom/src/third_party/libwebm/common/hdr_util.h create mode 100644 libs/libaom/src/third_party/libwebm/common/webmids.h create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.cc create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.h create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxertypes.h create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.h create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.cc create mode 100644 libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.h create mode 100644 libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.cc create mode 100644 libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.h create mode 100644 libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.cc create mode 100644 libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.h create mode 100644 libs/libaom/src/third_party/libyuv/README.libaom create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/basic_types.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/compare.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/convert.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/convert_argb.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/convert_from.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/convert_from_argb.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/cpu_id.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/mjpeg_decoder.h create mode 100644 
libs/libaom/src/third_party/libyuv/include/libyuv/planar_functions.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/rotate.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/rotate_argb.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/rotate_row.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/row.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/scale.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/scale_argb.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/scale_row.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/version.h create mode 100644 libs/libaom/src/third_party/libyuv/include/libyuv/video_common.h create mode 100644 libs/libaom/src/third_party/libyuv/source/compare.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/compare_common.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/compare_gcc.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/compare_neon.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/compare_neon64.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/compare_win.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert_argb.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert_from.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert_from_argb.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert_jpeg.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert_to_argb.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/convert_to_i420.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/cpu_id.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/mjpeg_decoder.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/mjpeg_validate.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/planar_functions.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_any.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_argb.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_common.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_gcc.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_mips.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_neon.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_neon64.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/rotate_win.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_any.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_common.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_gcc.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_mips.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_neon.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_neon64.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_win.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/row_x86.asm create mode 100644 libs/libaom/src/third_party/libyuv/source/scale.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_any.cc create mode 100644 
libs/libaom/src/third_party/libyuv/source/scale_argb.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_common.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_gcc.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_mips.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_neon.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_neon64.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/scale_win.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/video_common.cc create mode 100644 libs/libaom/src/third_party/libyuv/source/x86inc.asm create mode 100644 libs/libaom/src/third_party/vector/LICENSE create mode 100644 libs/libaom/src/third_party/vector/README.libaom create mode 100644 libs/libaom/src/third_party/vector/vector.c create mode 100644 libs/libaom/src/third_party/vector/vector.h create mode 100644 libs/libaom/src/third_party/x86inc/LICENSE create mode 100644 libs/libaom/src/third_party/x86inc/README.libaom create mode 100644 libs/libaom/src/third_party/x86inc/x86inc.asm create mode 100644 libs/libaom/src/tools/aggregate_entropy_stats.py create mode 100644 libs/libaom/src/tools/aom_entropy_optimizer.c create mode 100644 libs/libaom/src/tools/cpplint.py create mode 100644 libs/libaom/src/tools/diff.py create mode 100644 libs/libaom/src/tools/dump_obu.cc create mode 100644 libs/libaom/src/tools/gen_authors.sh create mode 100644 libs/libaom/src/tools/gen_constrained_tokenset.py create mode 100644 libs/libaom/src/tools/inspect-cli.js create mode 100644 libs/libaom/src/tools/inspect-post.js create mode 100644 libs/libaom/src/tools/intersect-diffs.py create mode 100644 libs/libaom/src/tools/lint-hunks.py create mode 100644 libs/libaom/src/tools/obu_parser.cc create mode 100644 libs/libaom/src/tools/obu_parser.h create mode 100644 libs/libaom/src/tools/txfm_analyzer/txfm_gen_code.cc create mode 100644 libs/libaom/src/tools/txfm_analyzer/txfm_graph.cc create mode 100644 libs/libaom/src/tools/txfm_analyzer/txfm_graph.h create mode 100644 libs/libaom/src/tools/wrap-commit-msg.py create mode 100644 libs/libaom/src/usage.dox create mode 100644 libs/libaom/src/usage_cx.dox create mode 100644 libs/libaom/src/usage_dx.dox diff --git a/libs/libaom/src/.clang-format b/libs/libaom/src/.clang-format new file mode 100644 index 000000000..a37882007 --- /dev/null +++ b/libs/libaom/src/.clang-format @@ -0,0 +1,148 @@ +--- +Language: Cpp +# BasedOnStyle: Google +# Generated with clang-format 7.0.1 +AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: true +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + 
SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^<ext/.*\.h>' + Priority: 2 + - Regex: '^<.*\.h>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IndentCaseLabels: true +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Right +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + BasedOnStyle: google + - Language: TextProto + Delimiters: + - pb + - PB + - proto + - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + CanonicalDelimiter: '' + BasedOnStyle: google +ReflowComments: true +SortIncludes: false +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Auto +TabWidth: 8 +UseTab: Never +... 
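+# Usage sketch (an assumption, not part of the upstream file: a clang-format
+# 7.x binary, matching the "Generated with" note above, is on PATH). The
+# -style=file option makes clang-format walk up from the file being formatted
+# until it finds this configuration, so sources anywhere in the tree can be
+# formatted in place, e.g.:
+#   $ clang-format -i -style=file aom_dsp/aom_convolve.c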
+ diff --git a/libs/libaom/src/.cmake-format.py b/libs/libaom/src/.cmake-format.py new file mode 100644 index 000000000..7b0e4f08d --- /dev/null +++ b/libs/libaom/src/.cmake-format.py @@ -0,0 +1,102 @@ +# Generated with cmake-format 0.5.1 +# How wide to allow formatted cmake files +line_width = 80 + +# How many spaces to tab for indent +tab_size = 2 + +# If arglists are longer than this, break them always +max_subargs_per_line = 10 + +# If true, separate flow control names from their parentheses with a space +separate_ctrl_name_with_space = False + +# If true, separate function names from parentheses with a space +separate_fn_name_with_space = False + +# If a statement is wrapped to more than one line, then dangle the closing +# parenthesis on its own line +dangle_parens = False + +# What character to use for bulleted lists +bullet_char = '*' + +# What character to use as punctuation after numerals in an enumerated list +enum_char = '.' + +# What style line endings to use in the output. +line_ending = u'unix' + +# Format command names consistently as 'lower' or 'upper' case +command_case = u'lower' + +# Format keywords consistently as 'lower' or 'upper' case +keyword_case = u'unchanged' + +# Specify structure for custom cmake functions +additional_commands = { + "foo": { + "flags": [ + "BAR", + "BAZ" + ], + "kwargs": { + "HEADERS": "*", + "DEPENDS": "*", + "SOURCES": "*" + } + } +} + +# A list of command names which should always be wrapped +always_wrap = [] + +# Specify the order of wrapping algorithms during successive reflow attempts +algorithm_order = [0, 1, 2, 3, 4] + +# If true, the argument lists which are known to be sortable will be sorted +# lexicographically +autosort = False + +# enable comment markup parsing and reflow +enable_markup = True + +# If comment markup is enabled, don't reflow the first comment block in +# each listfile. Use this to preserve formatting of your +# copyright/license statements. +first_comment_is_literal = False + +# If comment markup is enabled, don't reflow any comment block which matches this +# (regex) pattern. Default is `None` (disabled). +literal_comment_pattern = None + +# Regular expression to match preformat fences in comments +# default=r'^\s*([`~]{3}[`~]*)(.*)$' +fence_pattern = u'^\\s*([`~]{3}[`~]*)(.*)$' + +# Regular expression to match rulers in comments +# default=r'^\s*[^\w\s]{3}.*[^\w\s]{3}$' +ruler_pattern = u'^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$' + +# If true, emit the unicode byte-order mark (BOM) at the start of the file +emit_byteorder_mark = False + +# If a comment line starts with at least this many consecutive hash characters, +# then don't lstrip() them off. This allows for lazy hash rulers where the first +# hash char is not separated by space +hashruler_min_length = 10 + +# If true, then insert a space between the first hash char and remaining hash +# chars in a hash ruler, and normalize its length to fill the column +canonicalize_hashrulers = True + +# Specify the encoding of the input file. Defaults to utf-8. +input_encoding = u'utf-8' + +# Specify the encoding of the output file. Defaults to utf-8. Note that cmake +# only claims to support utf-8 so be careful when using anything else +output_encoding = u'utf-8' + +# A dictionary containing any per-command configuration overrides. Currently +# only `command_case` is supported. 
+per_command = {} diff --git a/libs/libaom/src/.gitattributes b/libs/libaom/src/.gitattributes new file mode 100644 index 000000000..ffc6912a9 --- /dev/null +++ b/libs/libaom/src/.gitattributes @@ -0,0 +1,18 @@ +*.[chs] filter=fixtabswsp +*.[ch]pp filter=fixtabswsp +*.[ch]xx filter=fixtabswsp +*.asm filter=fixtabswsp +*.php filter=fixtabswsp +*.pl filter=fixtabswsp +*.sh filter=fixtabswsp +*.txt filter=fixwsp +[Mm]akefile filter=fixwsp +*.mk filter=fixwsp +*.rc -crlf +*.ds[pw] -crlf +*.bat -crlf +*.mmp -crlf +*.dpj -crlf +*.pjt -crlf +*.vcp -crlf +*.inf -crlf diff --git a/libs/libaom/src/.mailmap b/libs/libaom/src/.mailmap new file mode 100644 index 000000000..30fae4de7 --- /dev/null +++ b/libs/libaom/src/.mailmap @@ -0,0 +1,91 @@ +Adrian Grange +Aℓex Converse +Aℓex Converse +Alexis Ballier +Alpha Lam +Andrey Norkin +Angie Chiang +Arild Fuldseth +Arild Fuldseth +Bohan Li +Changjun Yang +Chi Yo Tsai +Chi Yo Tsai +Chm +Damon Shen +Daniele Castagna +Deb Mukherjee +Elliott Karpilovsky +Emil Keyder +Erik Niemeyer +Frederic Barbier +Fyodor Kyslov +Grant Hsu +Guillaume Martres +Guillaume Martres +Guillaume Martres +Guillaume Martres +Hangyu Kuang +Hui Su +Iole Moccagatta +Jacky Chen +James Zern +Jean-Marc Valin +Jim Bankoski +Johann Koenig +Johann Koenig +Johann Koenig +Johann Koenig +John Koleszar +Joshua Litt +Lokeshwar Reddy B +Logan Goldberg +Luc Trudeau +Luc Trudeau +Marco Paniconi +Marco Paniconi +Michael Bebenita +Michael Horowitz +Mingliang Chen +Monty Montgomery +Nathan E. Egge +Nathan E. Egge +Pascal Massimino +Pascal Massimino +Paul Wilkins +Peng Bin +Peng Bin +Peter de Rivaz +Ralph Giles +Ralph Giles +Remya Prakasan +Roger Zhou +Ronald S. Bultje +Ryan Lei +Ryan Lei +Ryan Lei +Sachin Kumar Garg +Sai Deng +Sami Pietilä +Sarah Parker +Tamar Levy +Tamar Levy +Tero Rintaluoma +Thomas Davies Thomas +Timothy B. Terriberry +Timothy B. Terriberry +Timothy B. Terriberry Tim Terriberry +Tom Finegan +Tom Finegan +Tristan Matthews +Venkat Sanampudi +Wei-Ting Lin +Wei-Ting Lin +Wenyao Liu +Yaowu Xu +Yaowu Xu +Yaowu Xu +Yaowu Xu +Yaowu Xu +Zhipin Deng +Zoe Liu diff --git a/libs/libaom/src/AUTHORS b/libs/libaom/src/AUTHORS new file mode 100644 index 000000000..f61026fc0 --- /dev/null +++ b/libs/libaom/src/AUTHORS @@ -0,0 +1,260 @@ +# This file is automatically generated from the git commit history +# by tools/gen_authors.sh. 
+ +Aamir Anis +Aaron Watry +Aasaipriya +Abo Talib Mahfoodh +Adrian Grange +Ahmad Sharif +Akshata Jadhav +Alexander Bokov +Alexander Voronov +Aℓex Converse +Alexis Ballier +Alok Ahuja +Alpha Lam +A.Mahfoodh +Ami Fischman +Andoni Morales Alastruey +Andres Mejia +Andrew Russell +Andrey Norkin +Angie Chiang +Aniket Dhok +Ankur Saxena +Arild Fuldseth +Aron Rosenberg +Attila Nagy +Bohan Li +Brennan Shacklett +Brion Vibber +Bruno Berthier +Changjun Yang +Charles 'Buck' Krasic +Cheng Chen +Cherma Rajan A +Chi Yo Tsai +Chm +Christian Duvivier +Cyril Concolato +Dake He +Damon Shen +Dandan Ding +Daniele Castagna +Daniel Kang +Daniel Max Valenzuela +Danil Chapovalov +David Barker +David Major +David Michael Barr +David Turner +Deb Mukherjee +Deepa K G +Deng +Di Chen +Dim Temp +Dmitry Kovalev +Dominic Symes +Dragan Mrdjan +Ed Baker +Edward Hervey +Ehsan Akhgari +Elliott Karpilovsky +Emil Keyder +Erik Niemeyer +Fabio Pedretti +Fangwen Fu +Fergus Simpson +Frank Bossen +Frank Galligan +Frederic Barbier +Fredrik Söderquist +Fritz Koenig +Fyodor Kyslov +Gaute Strokkenes +Geza Lore +Ghislain MARY +Giuseppe Scrivano +Gordana Cmiljanovic +Grant Hsu +Guillaume Martres +Guillermo Ballester Valor +Hamsalekha S +Hangyu Kuang +Hanno Böck +Harish Mahendrakar +Henrik Lundin +Hien Ho +Hui Su +Ilie Halip +Ilya Brailovskiy +Imdad Sardharwalla +iole moccagatta +Ivan Krasin +Ivan Maltz +Jacek Caban +Jack Haughton +Jacky Chen +James Berry +James Yu +James Zern +Jan Gerber +Jan Kratochvil +Janne Salonen +Jayasanker J +Jean-Marc Valin +Jean-Yves Avenard +Jeff Faust +Jeff Muizelaar +Jeff Petkau +Jerome Jiang +Jia Jia +Jian Zhou +Jim Bankoski +Jingning Han +Joe Young +Joey Parrish +Johann Koenig +John Koleszar +Johnny Klonaris +John Stark +Jonathan Matthews +Joshua Bleecher Snyder +Joshua Litt +Julia Robson +Justin Clift +Justin Lebar +Katsuhisa Yuasa +KO Myung-Hun +Krishna Malladi +Kyle Siefring +Larisa Markeeva +Lawrence Velázquez +Lester Lu +Linfeng Zhang +Logan Goldberg +Lokeshwar Reddy B +Lou Quillio +Luca Barbato +Luc Trudeau +Makoto Kato +Mans Rullgard +Marco Paniconi +Mark Mentovai +Martin Ettl +Martin Storsjo +Matthew Heaney +Matthieu Vaudano +Mattias Hansson +Maxym Dmytrychenko +Michael Bebenita +Michael Horowitz +Michael Kohler +Michelle Findlay-Olynyk +Mike Frysinger +Mike Hommey +Mikhal Shemer +Minghai Shang +Mingliang Chen +Mirko Bonadei +Monty Montgomery +Morton Jonuschat +Mufaddal Chakera +Nathan E. Egge +Neil Birkbeck +Nico Weber +Nithya V S +Ola Hugosson +Oleg Nalivayko +Parag Salasakar +Pascal Massimino +Patrik Westin +Paul Wilkins +Pavel Frolov +Pavol Rusnak +Paweł Hajdan +Peng Bin +Pengchong Jin +Peter Boström +Peter de Rivaz +Philip Jägenstedt +Priit Laes +Rafael Ávila de Espíndola +Rafaël Carré +Ralph Giles +Ranjit Kumar Tulabandu +Ravi Chaudhary +Remya Prakasan +Remy Foray +Rob Bradford +Robert-André Mauchin +RogerZhou +Rohit Athavale +Ronald S. Bultje +Rostislav Pehlivanov +Ruiling Song +Rui Ueyama +Rupert Swarbrick +Ryan Lei +Ryan Overbeck +Sachin Kumar Garg +Sai Deng +Sami Pietilä +Sarah Parker +Sasi Inguva +Satish Kumar Suman +Scott Graham +Scott LaVarnway +Sean DuBois +Sean McGovern +Sean Purser-Haskell +Sebastien Alaiwan +Sergey Kolomenkin +Sergey Ulanov +Shimon Doodkin +Shunyao Li +SmilingWolf +Soo-Chul Han +Stanislav Vitvitskyy +Stefan Holmer +Steinar Midtskogen +Suman Sunkara +Taekhyun Kim +Takanori MATSUURA +Tamar Levy +Tao Bai +Tarek AMARA +Tero Rintaluoma +Thijs Vermeir +Thomas Daede +Thomas Davies Thomas +Tim Kopp +Timothy B. 
Terriberry +Timo Witte +Todd Nguyen +Tom Anderson +Tom Finegan +Tristan Matthews +Umang Saini +Urvang Joshi +Venkat Sanampudi +Victoria Zhislina +Vignesh Venkatasubramanian +Vishesh +Wan-Teh Chang +Wei-Ting Lin +Wenyao Liu +Xing Jin +Xin Zhao +Yaowu Xu +Yaowu Xu +Yi Luo +Yongzhe Wang +Yue Chen +Yunqing Wang +Yury Gitman +Yushin Cho +Zhijie Yang +zhipin deng +Zoe Liu diff --git a/libs/libaom/src/CHANGELOG b/libs/libaom/src/CHANGELOG new file mode 100644 index 000000000..11da097af --- /dev/null +++ b/libs/libaom/src/CHANGELOG @@ -0,0 +1,51 @@ +2021-02-09 v2.0.2 + This release includes several bug fixes. + + - Bug fixes: + Issue 2643: Modify the assertion in temporal filter intrinsics. + + Issue 2648: Fix unit test ThreadTestLarge.EncoderResultTest/49 + assertion failure. + + Issue 2869: Add -Wimplicit-function-declaration as C flag only. + + Issue 2878: Avoid memset in the av1_filter_intra_predictor module + functions. + + Issue 2903: Fix a typo bug in apply_temporal_filter_planewise. + + Call av1_setup_frame_size() when dropping a frame in the + encode_frame_to_data_rate() function in av1/encoder/encoder.c. + +2020-11-25 v2.0.1 + This release includes two bug fixes. + + - Bug fixes: + Issue 2723: Fix crash in chroma_check() when generating a monochrome + encoded stream in real-time mode. + + Issue 2833: Fix crash on some input when reduced still picture header is + used in real-time mode and speed >=7. + +2020-05-07 v2.0.0 "Applejack" + First official release of libaom. + This release includes new real-time mode and SVC support. + + - Upgrading: + AOM_SET_POSTPROC, AOM_CODEC_CAP_POSTPROC and AOM_CODEC_USE_POSTPROC are + removed. + + AOM_SET_DBG_* is removed. + + Multi-resolution encoding is removed. + + put_frame and put_slice callbacks are removed. + + - Enhancements: + Full-sweep document update for codec controls. + +2018-06-28 v1.0.0 + AOMedia Codec Workgroup Approved version 1.0 + +2016-04-07 v0.1.0 "AOMedia Codec 1" + This release is the first Alliance for Open Media codec. diff --git a/libs/libaom/src/CMakeLists.txt b/libs/libaom/src/CMakeLists.txt new file mode 100644 index 000000000..84c8995a5 --- /dev/null +++ b/libs/libaom/src/CMakeLists.txt @@ -0,0 +1,768 @@ +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +cmake_minimum_required(VERSION 3.5) +project(AOM C CXX) + +if(NOT EMSCRIPTEN) + if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE + "Release" + CACHE STRING "Build type: Debug, Release, RelWithDebInfo or MinSizeRel" + FORCE) + endif() +endif() + +set(AOM_ROOT "${CMAKE_CURRENT_SOURCE_DIR}") +set(AOM_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}") + +if("${AOM_ROOT}" STREQUAL "${AOM_CONFIG_DIR}") + message( + FATAL_ERROR "Building from within the aom source tree is not supported.\n" + "Hint: Run these commands\n" + "$ rm -rf CMakeCache.txt CMakeFiles\n" + "$ mkdir -p ../aom_build\n" "$ cd ../aom_build\n" + "And re-run CMake from the aom_build directory.") +endif() + +# Updating version info. 
+# https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info +set(SO_VERSION 2) +set(SO_FILE_VERSION 2.0.2) + +include("${AOM_ROOT}/build/cmake/aom_configure.cmake") +include("${AOM_ROOT}/aom_dsp/aom_dsp.cmake") +include("${AOM_ROOT}/aom_mem/aom_mem.cmake") +include("${AOM_ROOT}/aom_ports/aom_ports.cmake") +include("${AOM_ROOT}/aom_scale/aom_scale.cmake") +include("${AOM_ROOT}/aom_util/aom_util.cmake") +include("${AOM_ROOT}/av1/av1.cmake") +include("${AOM_ROOT}/build/cmake/aom_install.cmake") +include("${AOM_ROOT}/build/cmake/sanitizers.cmake") +include("${AOM_ROOT}/build/cmake/util.cmake") +include("${AOM_ROOT}/test/test.cmake") + +list(APPEND AOM_RTCD_SOURCES + "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h" + "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h" + "${AOM_CONFIG_DIR}/config/av1_rtcd.h" + "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" + "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" + "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl" + "${AOM_ROOT}/av1/common/av1_rtcd.c" + "${AOM_ROOT}/build/cmake/rtcd.pl") + +list(APPEND AOM_LIBWEBM_SOURCES + "${AOM_ROOT}/third_party/libwebm/common/hdr_util.cc" + "${AOM_ROOT}/third_party/libwebm/common/hdr_util.h" + "${AOM_ROOT}/third_party/libwebm/common/webmids.h" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.cc" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxer.h" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxertypes.h" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvmuxerutil.h" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.cc" + "${AOM_ROOT}/third_party/libwebm/mkvmuxer/mkvwriter.h" + "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.cc" + "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvparser.h" + "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.cc" + "${AOM_ROOT}/third_party/libwebm/mkvparser/mkvreader.h") + +list(APPEND AOM_LIBYUV_SOURCES + "${AOM_ROOT}/third_party/libyuv/include/libyuv/basic_types.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_argb.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/convert_from.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/cpu_id.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/planar_functions.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/rotate.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/row.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale.h" + "${AOM_ROOT}/third_party/libyuv/include/libyuv/scale_row.h" + "${AOM_ROOT}/third_party/libyuv/source/cpu_id.cc" + "${AOM_ROOT}/third_party/libyuv/source/planar_functions.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_any.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_common.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_gcc.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_mips.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_neon.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_neon64.cc" + "${AOM_ROOT}/third_party/libyuv/source/row_win.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_any.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_common.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_gcc.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_mips.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_neon.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_neon64.cc" + "${AOM_ROOT}/third_party/libyuv/source/scale_win.cc") + 
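+
+# A worked reading of the versioning scheme above, assuming a non-MSVC shared
+# build (this chain is inferred from SO_VERSION=2, SO_FILE_VERSION=2.0.2 and
+# the libtool-style comments near the shared-library setup further down, not
+# from an actual build):
+#   libaom.so        -> libaom.so.2        (linker name)
+#   libaom.so.2      -> libaom.so.2.0.2    (soname recorded by dependents)
+#   libaom.so.2.0.2                        (the actual DSO for release 2.0.2)
+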
+list(APPEND AOM_SOURCES + "${AOM_CONFIG_DIR}/config/aom_config.c" + "${AOM_CONFIG_DIR}/config/aom_config.h" + "${AOM_ROOT}/aom/aom.h" + "${AOM_ROOT}/aom/aom_codec.h" + "${AOM_ROOT}/aom/aom_decoder.h" + "${AOM_ROOT}/aom/aom_encoder.h" + "${AOM_ROOT}/aom/aom_frame_buffer.h" + "${AOM_ROOT}/aom/aom_image.h" + "${AOM_ROOT}/aom/aom_integer.h" + "${AOM_ROOT}/aom/aomcx.h" + "${AOM_ROOT}/aom/aomdx.h" + "${AOM_ROOT}/aom/internal/aom_codec_internal.h" + "${AOM_ROOT}/aom/internal/aom_image_internal.h" + "${AOM_ROOT}/aom/src/aom_codec.c" + "${AOM_ROOT}/aom/src/aom_decoder.c" + "${AOM_ROOT}/aom/src/aom_encoder.c" + "${AOM_ROOT}/aom/src/aom_image.c" + "${AOM_ROOT}/aom/src/aom_integer.c") + +list(APPEND AOM_COMMON_APP_UTIL_SOURCES + "${AOM_ROOT}/common/args.c" + "${AOM_ROOT}/common/args.h" + "${AOM_ROOT}/common/av1_config.c" + "${AOM_ROOT}/common/av1_config.h" + "${AOM_ROOT}/common/md5_utils.c" + "${AOM_ROOT}/common/md5_utils.h" + "${AOM_ROOT}/common/tools_common.c" + "${AOM_ROOT}/common/tools_common.h" + "${AOM_ROOT}/common/video_common.h" + "${AOM_ROOT}/common/rawenc.c" + "${AOM_ROOT}/common/rawenc.h" + "${AOM_ROOT}/common/y4menc.c" + "${AOM_ROOT}/common/y4menc.h") + +list(APPEND AOM_DECODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/ivfdec.c" + "${AOM_ROOT}/common/ivfdec.h" "${AOM_ROOT}/common/obudec.c" + "${AOM_ROOT}/common/obudec.h" "${AOM_ROOT}/common/video_reader.c" + "${AOM_ROOT}/common/video_reader.h") + +list(APPEND AOM_ENCODER_APP_UTIL_SOURCES + "${AOM_ROOT}/common/ivfenc.c" + "${AOM_ROOT}/common/ivfenc.h" + "${AOM_ROOT}/common/video_writer.c" + "${AOM_ROOT}/common/video_writer.h" + "${AOM_ROOT}/common/warnings.c" + "${AOM_ROOT}/common/warnings.h" + "${AOM_ROOT}/common/y4minput.c" + "${AOM_ROOT}/common/y4minput.h" + "${AOM_ROOT}/examples/encoder_util.h" + "${AOM_ROOT}/examples/encoder_util.c") + +list(APPEND AOM_ENCODER_STATS_SOURCES "${AOM_ROOT}/stats/aomstats.c" + "${AOM_ROOT}/stats/aomstats.h" "${AOM_ROOT}/stats/rate_hist.c" + "${AOM_ROOT}/stats/rate_hist.h") + +list(APPEND AOM_VERSION_SOURCES "${AOM_CONFIG_DIR}/config/aom_version.h") + +list(APPEND AOM_WEBM_DECODER_SOURCES "${AOM_ROOT}/common/webmdec.cc" + "${AOM_ROOT}/common/webmdec.h") + +list(APPEND AOM_WEBM_ENCODER_SOURCES "${AOM_ROOT}/common/webmenc.cc" + "${AOM_ROOT}/common/webmenc.h") + +include_directories(${AOM_ROOT} ${AOM_CONFIG_DIR} ${AOM_ROOT}/apps + ${AOM_ROOT}/common ${AOM_ROOT}/examples ${AOM_ROOT}/stats) + +# Targets +add_library(aom_version ${AOM_VERSION_SOURCES}) +add_dummy_source_file_to_target(aom_version c) +add_custom_command(OUTPUT "${AOM_CONFIG_DIR}/config/aom_version.h" + COMMAND ${CMAKE_COMMAND} ARGS + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DGIT_EXECUTABLE=${GIT_EXECUTABLE} + -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P + "${AOM_ROOT}/build/cmake/version.cmake" + COMMENT "Writing aom_version.h" + VERBATIM) + +add_custom_target(aom_version_check + COMMAND ${CMAKE_COMMAND} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DGIT_EXECUTABLE=${GIT_EXECUTABLE} + -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P + "${AOM_ROOT}/build/cmake/version.cmake" + COMMENT "Updating version info if necessary." + VERBATIM) + +if(BUILD_SHARED_LIBS AND NOT MSVC) + # Generate version file immediately for non-MSVC shared builds: The version + # string is needed for the aom target. 
+ execute_process(COMMAND ${CMAKE_COMMAND} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DGIT_EXECUTABLE=${GIT_EXECUTABLE} + -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P + "${AOM_ROOT}/build/cmake/version.cmake") +endif() + +add_dependencies(aom_version aom_version_check) + +# TODO(tomfinegan): Move rtcd target setup where it belongs for each rtcd +# source. +add_rtcd_build_step("${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" + "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h" + "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" "aom_dsp_rtcd") +add_rtcd_build_step("${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" + "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" "aom_scale_rtcd") +add_rtcd_build_step("${AOM_ROOT}/av1/common/av1_rtcd_defs.pl" + "${AOM_CONFIG_DIR}/config/av1_rtcd.h" + "${AOM_ROOT}/av1/common/av1_rtcd.c" "av1_rtcd") + +add_library(aom_rtcd OBJECT ${AOM_RTCD_SOURCES}) +add_dependencies(aom_rtcd aom_version) + +if(ENABLE_EXAMPLES) + add_library(aom_encoder_stats OBJECT ${AOM_ENCODER_STATS_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_encoder_stats) +endif() + +add_library(aom ${AOM_SOURCES} $<TARGET_OBJECTS:aom_rtcd>) +if(BUILD_SHARED_LIBS) + add_library(aom_static STATIC ${AOM_SOURCES} $<TARGET_OBJECTS:aom_rtcd>) + set_target_properties(aom_static PROPERTIES OUTPUT_NAME aom) + + if(NOT MSVC) + # Extract version string and set VERSION/SOVERSION for the aom target. + extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h" + aom_version_triple) + + # Strip any trailing version information, if present. + string(FIND "${aom_version_triple}" "-" dash_pos) + if(NOT dash_pos EQUAL -1) + string(SUBSTRING "${aom_version_triple}" 0 ${dash_pos} aom_version_triple) + endif() + + # cmake-format: off + # VERSION is embedded in the .so file name. + # libaom.so -> libaom.so.SOVERSION + # libaom.so.SOVERSION -> libaom.so.VERSION + # libaom.so.VERSION + # cmake-format: on + set_target_properties(aom PROPERTIES SOVERSION ${SO_VERSION}) + set_target_properties(aom PROPERTIES VERSION ${SO_FILE_VERSION}) + endif() +endif() + +if(NOT MSVC AND NOT APPLE) + target_link_libraries(aom ${AOM_LIB_LINK_TYPE} m) + if(BUILD_SHARED_LIBS) + target_link_libraries(aom_static ${AOM_LIB_LINK_TYPE} m) + endif() +endif() + +# List of object and static library targets. +set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_rtcd aom_mem aom_scale aom) +if(BUILD_SHARED_LIBS) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_static) +endif() + +# Setup dependencies. +setup_aom_dsp_targets() +setup_aom_mem_targets() +setup_aom_ports_targets() +setup_aom_util_targets() +setup_aom_scale_targets() +setup_av1_targets() + +# Make all library targets depend on aom_rtcd to make sure it builds first. +foreach(aom_lib ${AOM_LIB_TARGETS}) + if(NOT "${aom_lib}" STREQUAL "aom_rtcd") + add_dependencies(${aom_lib} aom_rtcd) + endif() +endforeach() + +# Generate C/C++ stub files containing the function usage_exit(). Users of the +# aom_common_app_util library must define this function. This is a convenience +# to allow omission of the function from applications that might want to use +# other pieces of the util support without defining usage_exit(). +file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.c" "void usage_exit(void) {}") +file(WRITE "${AOM_GEN_SRC_DIR}/usage_exit.cc" + "extern \"C\" void usage_exit(void) {}") + +# +# Application and application support targets. 
+# +if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS) + add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES}) + if(CONFIG_AV1_DECODER) + add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES}) + # obudec depends on internal headers that require *rtcd.h + add_dependencies(aom_decoder_app_util aom_rtcd) + endif() + if(CONFIG_AV1_ENCODER) + add_library(aom_encoder_app_util OBJECT ${AOM_ENCODER_APP_UTIL_SOURCES}) + endif() +endif() + +if((CONFIG_AV1_DECODER OR CONFIG_AV1_ENCODER) AND ENABLE_EXAMPLES) + add_executable(resize_util "${AOM_ROOT}/examples/resize_util.c" + $<TARGET_OBJECTS:aom_common_app_util>) + list(APPEND AOM_APP_TARGETS resize_util) +endif() + +if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES) + add_executable(aomdec "${AOM_ROOT}/apps/aomdec.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + add_executable(decode_to_md5 "${AOM_ROOT}/examples/decode_to_md5.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + add_executable(decode_with_drops "${AOM_ROOT}/examples/decode_with_drops.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + add_executable(simple_decoder "${AOM_ROOT}/examples/simple_decoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + add_executable(scalable_decoder "${AOM_ROOT}/examples/scalable_decoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + + if(CONFIG_ANALYZER) + add_executable(analyzer "${AOM_ROOT}/examples/analyzer.cc" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + target_link_libraries(analyzer ${AOM_LIB_LINK_TYPE} ${wxWidgets_LIBRARIES}) + list(APPEND AOM_APP_TARGETS analyzer) + list(APPEND AOM_DECODER_EXAMPLE_TARGETS analyzer) + endif() + + if(CONFIG_INSPECTION) + add_executable(inspect "${AOM_ROOT}/examples/inspect.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + list(APPEND AOM_DECODER_EXAMPLE_TARGETS inspect) + + if(EMSCRIPTEN) + add_preproc_definition(_POSIX_SOURCE) + append_link_flag_to_target("inspect" "--emrun") + append_link_flag_to_target("inspect" "-s USE_PTHREADS=0") + append_link_flag_to_target("inspect" "-s WASM=1") + append_link_flag_to_target("inspect" "-s MODULARIZE=1") + append_link_flag_to_target("inspect" "-s ALLOW_MEMORY_GROWTH=1") + append_link_flag_to_target( + "inspect" "-s \'EXTRA_EXPORTED_RUNTIME_METHODS=[\"UTF8ToString\"]\'") + append_link_flag_to_target("inspect" + "-s EXPORT_NAME=\"\'DecoderModule\'\"") + append_link_flag_to_target("inspect" "--memory-init-file 0") + + if("${CMAKE_BUILD_TYPE}" STREQUAL "") + + # Default to -O3 when no build type is specified. + append_compiler_flag("-O3") + endif() + + em_link_post_js(inspect "${AOM_ROOT}/tools/inspect-post.js") + endif() + endif() + + # Maintain a list of decoder example targets. + list(APPEND AOM_DECODER_EXAMPLE_TARGETS aomdec decode_to_md5 decode_with_drops + scalable_decoder simple_decoder) + + # Add decoder examples to the app targets list. + list(APPEND AOM_APP_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS}) +endif() + +if(CONFIG_AV1_ENCODER) + if(ENABLE_EXAMPLES) + add_executable(aomenc "${AOM_ROOT}/apps/aomenc.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util> + $<TARGET_OBJECTS:aom_encoder_stats>) + add_executable(lossless_encoder "${AOM_ROOT}/examples/lossless_encoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + add_executable(set_maps "${AOM_ROOT}/examples/set_maps.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + add_executable(simple_encoder "${AOM_ROOT}/examples/simple_encoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + add_executable(twopass_encoder "${AOM_ROOT}/examples/twopass_encoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + add_executable(noise_model "${AOM_ROOT}/examples/noise_model.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + add_executable(scalable_encoder "${AOM_ROOT}/examples/scalable_encoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + + add_executable(svc_encoder_rtc "${AOM_ROOT}/examples/svc_encoder_rtc.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + + # Maintain a list of encoder example targets. 
+ list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model + set_maps simple_encoder scalable_encoder twopass_encoder + svc_encoder_rtc) + endif() + + if(ENABLE_TOOLS) + if(CONFIG_ENTROPY_STATS AND NOT BUILD_SHARED_LIBS) + + # TODO(tomfinegan): Sort out why a simple link command with + # aom_entropy_optimizer.c won't work on macos, but dragging in all the + # helper machinery allows the link to succeed. + add_executable(aom_entropy_optimizer + "${AOM_GEN_SRC_DIR}/usage_exit.c" + "${AOM_ROOT}/tools/aom_entropy_optimizer.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + + # Maintain a list of encoder tool targets. + list(APPEND AOM_ENCODER_TOOL_TARGETS aom_entropy_optimizer) + endif() + endif() + + # Add encoder examples and tools to the targets list. + list(APPEND AOM_APP_TARGETS ${AOM_ENCODER_EXAMPLE_TARGETS} + ${AOM_ENCODER_TOOL_TARGETS}) + + if(CONFIG_TUNE_VMAF) + find_library(VMAF libvmaf.a vmaf) + if(NOT VMAF) + message(FATAL_ERROR "VMAF library not found.") + endif() + message("-- Found VMAF library: " ${VMAF}) + set_target_properties(aom PROPERTIES LINKER_LANGUAGE CXX) + if(BUILD_SHARED_LIBS) + set_target_properties(aom_static PROPERTIES LINKER_LANGUAGE CXX) + endif() + target_link_libraries(aom PRIVATE ${VMAF}) + endif() +endif() + +if(ENABLE_EXAMPLES) + + # Maintain a separate variable listing only the examples to facilitate + # installation of example programs into an examples sub directory of + # $AOM_DIST_DIR/bin when building the dist target. + list(APPEND AOM_EXAMPLE_TARGETS ${AOM_DECODER_EXAMPLE_TARGETS} + ${AOM_ENCODER_EXAMPLE_TARGETS}) +endif() + +if(ENABLE_TOOLS) + if(CONFIG_AV1_DECODER) + add_executable(dump_obu "${AOM_GEN_SRC_DIR}/usage_exit.cc" + "${AOM_ROOT}/tools/dump_obu.cc" + "${AOM_ROOT}/tools/obu_parser.cc" + "${AOM_ROOT}/tools/obu_parser.h" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + + list(APPEND AOM_TOOL_TARGETS dump_obu) + list(APPEND AOM_APP_TARGETS dump_obu) + + # Maintain a separate variable listing only the examples to facilitate + # installation of example programs into a tools sub directory of + # $AOM_DIST_DIR/bin when building the dist target. 
+ list(APPEND AOM_TOOL_TARGETS ${AOM_DECODER_TOOL_TARGETS} + ${AOM_ENCODER_TOOL_TARGETS}) + endif() +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) + add_executable(aom_cx_set_ref "${AOM_ROOT}/examples/aom_cx_set_ref.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + list(APPEND AOM_EXAMPLE_TARGETS aom_cx_set_ref) + list(APPEND AOM_APP_TARGETS aom_cx_set_ref) +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER) + add_executable(lightfield_encoder "${AOM_ROOT}/examples/lightfield_encoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util>) + list(APPEND AOM_EXAMPLE_TARGETS lightfield_encoder) + list(APPEND AOM_APP_TARGETS lightfield_encoder) +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER) + add_executable(lightfield_tile_list_decoder + "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + list(APPEND AOM_EXAMPLE_TARGETS lightfield_tile_list_decoder) + list(APPEND AOM_APP_TARGETS lightfield_tile_list_decoder) +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_DECODER) + add_executable(lightfield_decoder "${AOM_ROOT}/examples/lightfield_decoder.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + list(APPEND AOM_EXAMPLE_TARGETS lightfield_decoder) + list(APPEND AOM_APP_TARGETS lightfield_decoder) +endif() + +if(ENABLE_EXAMPLES AND CONFIG_AV1_ENCODER AND CONFIG_AV1_DECODER) + add_executable(lightfield_bitstream_parsing + "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c" + $<TARGET_OBJECTS:aom_common_app_util> + $<TARGET_OBJECTS:aom_encoder_app_util> + $<TARGET_OBJECTS:aom_decoder_app_util>) + list(APPEND AOM_EXAMPLE_TARGETS lightfield_bitstream_parsing) + list(APPEND AOM_APP_TARGETS lightfield_bitstream_parsing) +endif() + +foreach(aom_app ${AOM_APP_TARGETS}) + target_link_libraries(${aom_app} ${AOM_LIB_LINK_TYPE} aom) +endforeach() + +if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS) + if(CONFIG_LIBYUV) + add_library(yuv OBJECT ${AOM_LIBYUV_SOURCES}) + if(NOT MSVC) + target_compile_options(yuv PRIVATE -Wno-unused-parameter) + endif() + include_directories("${AOM_ROOT}/third_party/libyuv/include") + + # Add to existing targets. + foreach(aom_app ${AOM_APP_TARGETS}) + target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:yuv>) + set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX) + endforeach() + endif() + + if(CONFIG_WEBM_IO) + add_library(webm OBJECT ${AOM_LIBWEBM_SOURCES}) + include_directories("${AOM_ROOT}/third_party/libwebm") + target_compile_definitions(webm PRIVATE __STDC_CONSTANT_MACROS) + target_compile_definitions(webm PRIVATE __STDC_LIMIT_MACROS) + + if(NOT MSVC) + target_compile_options(webm PRIVATE -Wno-shadow) + endif() + + # Add to existing targets. + if(CONFIG_AV1_DECODER) + target_sources(aom_decoder_app_util PRIVATE ${AOM_WEBM_DECODER_SOURCES}) + endif() + + if(CONFIG_AV1_ENCODER) + target_sources(aom_encoder_app_util PRIVATE ${AOM_WEBM_ENCODER_SOURCES}) + endif() + + foreach(aom_app ${AOM_APP_TARGETS}) + target_sources(${aom_app} PRIVATE $<TARGET_OBJECTS:webm>) + set_property(TARGET ${aom_app} PROPERTY LINKER_LANGUAGE CXX) + endforeach() + endif() +endif() + +if(ENABLE_TESTS) + + # Create test_libaom target and the targets it depends on. + setup_aom_test_targets() +endif() + +if(HAVE_PTHREAD_H AND CONFIG_MULTITHREAD) + find_package(Threads) + target_link_libraries(aom ${AOM_LIB_LINK_TYPE} Threads::Threads) + if(BUILD_SHARED_LIBS) + target_link_libraries(aom_static ${AOM_LIB_LINK_TYPE} Threads::Threads) + endif() +endif() + +if(XCODE) + + # TODO(tomfinegan): Make sure target has no C++ files before doing this as + # it's not necessary in that case. + if(CONFIG_LIBYUV OR CONFIG_WEBM_IO) + + # The Xcode generator does not obey LINKER_LANGUAGE. 
Because of this issue, + # anything that looks like a C++ file needs to be in any target that Xcode + # will link when the target contains a C++ dependency. Without this, Xcode + # will try to link with the C linker, which always ends badly when a + # dependency actually includes C++. + + # Note: LINKER_LANGUAGE is explicitly set to C++ for all targets touched + # here; it really is the Xcode generator's fault, or just a deficiency in + # Xcode itself. + foreach(aom_app ${AOM_APP_TARGETS}) + add_dummy_source_file_to_target("${aom_app}" "cc") + endforeach() + endif() +endif() + +if(ENABLE_EXAMPLES AND "${CMAKE_GENERATOR}" MATCHES "Makefiles$") + + # For historical purposes place the example binaries in the example directory. + file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/examples") + + foreach(target ${AOM_EXAMPLE_TARGETS}) + if(NOT "${target}" MATCHES "aomdec\|aomenc") + set_target_properties(${target} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${AOM_CONFIG_DIR}/examples") + endif() + endforeach() + + if(ENABLE_TOOLS AND AOM_TOOL_TARGETS) + + # The same expectation is true for tool targets. + file(MAKE_DIRECTORY "${AOM_CONFIG_DIR}/tools") + set_target_properties(${AOM_TOOL_TARGETS} + PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "${AOM_CONFIG_DIR}/tools") + endif() +endif() + +if(BUILD_SHARED_LIBS) + include("${AOM_ROOT}/build/cmake/exports.cmake") + setup_exports_target() +endif() + +# Handle user supplied compile and link flags last to ensure they're obeyed. +set_user_flags() + +# Aomedia documentation rule. +if(ENABLE_DOCS) + include(FindDoxygen) + if(DOXYGEN_FOUND) + include("${AOM_ROOT}/docs.cmake") + setup_documentation_targets() + else() + message("--- Cannot find doxygen, ENABLE_DOCS turned off.") + set(ENABLE_DOCS OFF) + endif() +endif() + +# Aomedia dist rule. +if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES) + list(APPEND AOM_DIST_APPS $<TARGET_FILE:aomdec>) +endif() +if(CONFIG_AV1_ENCODER AND ENABLE_EXAMPLES) + list(APPEND AOM_DIST_APPS $<TARGET_FILE:aomenc>) +endif() + +if(ENABLE_EXAMPLES) + foreach(example ${AOM_EXAMPLE_TARGETS}) + list(APPEND AOM_DIST_EXAMPLES $<TARGET_FILE:${example}>) + endforeach() +endif() + +if(ENABLE_TOOLS) + foreach(tool ${AOM_TOOL_TARGETS}) + list(APPEND AOM_DIST_TOOLS $<TARGET_FILE:${tool}>) + endforeach() +endif() + +if(NOT AOM_DIST_DIR) + set(AOM_DIST_DIR "${AOM_CONFIG_DIR}/dist") +endif() + +add_custom_target(dist + COMMAND ${CMAKE_COMMAND} + -DAOM_ROOT=${AOM_ROOT} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_DIST_DIR=${AOM_DIST_DIR} + -DAOM_DIST_APPS="${AOM_DIST_APPS}" + -DAOM_DIST_EXAMPLES="${AOM_DIST_EXAMPLES}" + -DAOM_DIST_TOOLS="${AOM_DIST_TOOLS}" + -DAOM_DIST_INCLUDES="${AOM_INSTALL_INCS}" + -DAOM_DIST_LIBS=$<TARGET_FILE:aom> + -DENABLE_DOCS=${ENABLE_DOCS} -P + "${AOM_ROOT}/build/cmake/dist.cmake" + DEPENDS ${AOM_INSTALL_BINS} ${AOM_INSTALL_LIBS} + ${AOM_INSTALL_INCS} ${AOM_EXAMPLE_TARGETS} + ${AOM_TOOL_TARGETS}) + +if(ENABLE_DOCS) + add_dependencies(dist docs) +endif() + +# Collect all variables containing libaom source files. +get_cmake_property(all_cmake_vars VARIABLES) +foreach(var ${all_cmake_vars}) + if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_" + AND NOT "${var}" MATCHES "_APP_\|DOXYGEN\|LIBWEBM\|LIBYUV\|_PKG_\|TEST") + list(APPEND aom_source_vars ${var}) + endif() +endforeach() + +# Libaom_srcs.txt generation. +set(libaom_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_srcs.txt") +file(WRITE "${libaom_srcs_txt_file}" "# This file is generated. DO NOT EDIT.\n") + +# Static source file list first. 
+foreach(aom_source_var ${aom_source_vars}) + foreach(file ${${aom_source_var}}) + if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}") + string(REPLACE "${AOM_ROOT}/" "" file "${file}") + file(APPEND "${libaom_srcs_txt_file}" "${file}\n") + endif() + endforeach() +endforeach() + +file(APPEND "${libaom_srcs_txt_file}" + "# Files below this line are generated by the libaom build system.\n") +foreach(aom_source_var ${aom_source_vars}) + foreach(file ${${aom_source_var}}) + if("${file}" MATCHES "${AOM_CONFIG_DIR}") + string(REPLACE "${AOM_CONFIG_DIR}/" "" file "${file}") + file(APPEND "${libaom_srcs_txt_file}" "${file}\n") + endif() + endforeach() +endforeach() + +# Libaom_srcs.gni generation. +set(libaom_srcs_gni_file "${AOM_CONFIG_DIR}/libaom_srcs.gni") +file(WRITE "${libaom_srcs_gni_file}" "# This file is generated. DO NOT EDIT.\n") + +foreach(aom_source_var ${aom_source_vars}) + if("${${aom_source_var}}" MATCHES "${AOM_ROOT}") + string(TOLOWER ${aom_source_var} aom_source_var_lowercase) + file(APPEND "${libaom_srcs_gni_file}" "\n${aom_source_var_lowercase} = [\n") + endif() + + foreach(file ${${aom_source_var}}) + if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}") + string(REPLACE "${AOM_ROOT}" "//third_party/libaom/source/libaom" file + "${file}") + file(APPEND "${libaom_srcs_gni_file}" " \"${file}\",\n") + endif() + endforeach() + + if("${${aom_source_var}}" MATCHES "${AOM_ROOT}") + file(APPEND "${libaom_srcs_gni_file}" "]\n") + endif() +endforeach() + +file(APPEND "${libaom_srcs_gni_file}" + "\n# Files below this line are generated by the libaom build system.\n") + +foreach(aom_source_var ${aom_source_vars}) + if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}") + string(TOLOWER ${aom_source_var} aom_source_var_lowercase) + file(APPEND "${libaom_srcs_gni_file}" + "\n${aom_source_var_lowercase}_gen = [\n") + endif() + foreach(file ${${aom_source_var}}) + if(NOT "${file}" MATCHES "${AOM_ROOT}") + string(REPLACE "${AOM_CONFIG_DIR}" "//third_party/libaom/source/libaom" + file "${file}") + file(APPEND "${libaom_srcs_gni_file}" " \"${file}\",\n") + endif() + endforeach() + + if("${${aom_source_var}}" MATCHES "${AOM_CONFIG_DIR}") + file(APPEND "${libaom_srcs_gni_file}" "]\n") + endif() +endforeach() + +# Generate aom.pc and setup install rule. +setup_aom_install_targets() diff --git a/libs/libaom/src/LICENSE b/libs/libaom/src/LICENSE new file mode 100644 index 000000000..fc340c376 --- /dev/null +++ b/libs/libaom/src/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2016, Alliance for Open Media. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + diff --git a/libs/libaom/src/PATENTS b/libs/libaom/src/PATENTS new file mode 100644 index 000000000..493f61637 --- /dev/null +++ b/libs/libaom/src/PATENTS @@ -0,0 +1,108 @@ +Alliance for Open Media Patent License 1.0 + +1. License Terms. + +1.1. Patent License. Subject to the terms and conditions of this License, each + Licensor, on behalf of itself and successors in interest and assigns, + grants Licensee a non-sublicensable, perpetual, worldwide, non-exclusive, + no-charge, royalty-free, irrevocable (except as expressly stated in this + License) patent license to its Necessary Claims to make, use, sell, offer + for sale, import or distribute any Implementation. + +1.2. Conditions. + +1.2.1. Availability. As a condition to the grant of rights to Licensee to make, + sell, offer for sale, import or distribute an Implementation under + Section 1.1, Licensee must make its Necessary Claims available under + this License, and must reproduce this License with any Implementation + as follows: + + a. For distribution in source code, by including this License in the + root directory of the source code with its Implementation. + + b. For distribution in any other form (including binary, object form, + and/or hardware description code (e.g., HDL, RTL, Gate Level Netlist, + GDSII, etc.)), by including this License in the documentation, legal + notices, and/or other written materials provided with the + Implementation. + +1.2.2. Additional Conditions. This license is directly from Licensor to + Licensee. Licensee acknowledges as a condition of benefiting from it + that no rights from Licensor are received from suppliers, distributors, + or otherwise in connection with this License. + +1.3. Defensive Termination. If any Licensee, its Affiliates, or its agents + initiates patent litigation or files, maintains, or voluntarily + participates in a lawsuit against another entity or any person asserting + that any Implementation infringes Necessary Claims, any patent licenses + granted under this License directly to the Licensee are immediately + terminated as of the date of the initiation of action unless 1) that suit + was in response to a corresponding suit regarding an Implementation first + brought against an initiating entity, or 2) that suit was brought to + enforce the terms of this License (including intervention in a third-party + action by a Licensee). + +1.4. Disclaimers. The Reference Implementation and Specification are provided + "AS IS" and without warranty. The entire risk as to implementing or + otherwise using the Reference Implementation or Specification is assumed + by the implementer and user. Licensor expressly disclaims any warranties + (express, implied, or otherwise), including implied warranties of + merchantability, non-infringement, fitness for a particular purpose, or + title, related to the material. 
IN NO EVENT WILL LICENSOR BE LIABLE TO + ANY OTHER PARTY FOR LOST PROFITS OR ANY FORM OF INDIRECT, SPECIAL, + INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER FROM ANY CAUSES OF + ACTION OF ANY KIND WITH RESPECT TO THIS LICENSE, WHETHER BASED ON BREACH + OF CONTRACT, TORT (INCLUDING NEGLIGENCE), OR OTHERWISE, AND WHETHER OR + NOT THE OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +2. Definitions. + +2.1. Affiliate. "Affiliate" means an entity that directly or indirectly + Controls, is Controlled by, or is under common Control of that party. + +2.2. Control. "Control" means direct or indirect control of more than 50% of + the voting power to elect directors of that corporation, or for any other + entity, the power to direct management of such entity. + +2.3. Decoder. "Decoder" means any decoder that conforms fully with all + non-optional portions of the Specification. + +2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can + be decoded by a Decoder only to the extent it produces such a bitstream. + +2.5. Final Deliverable. "Final Deliverable" means the final version of a + deliverable approved by the Alliance for Open Media as a Final + Deliverable. + +2.6. Implementation. "Implementation" means any implementation, including the + Reference Implementation, that is an Encoder and/or a Decoder. An + Implementation also includes components of an Implementation only to the + extent they are used as part of an Implementation. + +2.7. License. "License" means this license. + +2.8. Licensee. "Licensee" means any person or entity who exercises patent + rights granted under this License. + +2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers + for sale, imports or distributes any Implementation, or (ii) a person + or entity that has a licensing obligation to the Implementation as a + result of its membership and/or participation in the Alliance for Open + Media working group that developed the Specification. + +2.10. Necessary Claims. "Necessary Claims" means all claims of patents or + patent applications, (a) that currently or at any time in the future, + are owned or controlled by the Licensor, and (b) (i) would be an + Essential Claim as defined by the W3C Policy as of February 5, 2004 + (https://www.w3.org/Consortium/Patent-Policy-20040205/#def-essential) + as if the Specification was a W3C Recommendation; or (ii) are infringed + by the Reference Implementation. + +2.11. Reference Implementation. "Reference Implementation" means an Encoder + and/or Decoder released by the Alliance for Open Media as a Final + Deliverable. + +2.12. Specification. "Specification" means the specification designated by + the Alliance for Open Media as a Final Deliverable for which this + License was issued. + diff --git a/libs/libaom/src/README.md b/libs/libaom/src/README.md new file mode 100644 index 000000000..cf057ae6c --- /dev/null +++ b/libs/libaom/src/README.md @@ -0,0 +1,665 @@ +# AV1 Codec Library + +## Contents +1. 
[Building the library and applications](#building-the-library-and-applications) + - [Prerequisites](#prerequisites) + - [Get the code](#get-the-code) + - [Basics](#basic-build) + - [Configuration options](#configuration-options) + - [Dylib builds](#dylib-builds) + - [Debugging](#debugging) + - [Cross compiling](#cross-compiling) + - [Sanitizer support](#sanitizers) + - [MSVC builds](#microsoft-visual-studio-builds) + - [Xcode builds](#xcode-builds) + - [Emscripten builds](#emscripten-builds) + - [Extra Build Flags](#extra-build-flags) + - [Build with VMAF support](#build-with-vmaf) +2. [Testing the library](#testing-the-av1-codec) + - [Basics](#testing-basics) + - [Unit tests](#1_unit-tests) + - [Example tests](#2_example-tests) + - [Encoder tests](#3_encoder-tests) + - [IDE hosted tests](#ide-hosted-tests) + - [Downloading test data](#downloading-the-test-data) + - [Adding a new test data file](#adding-a-new-test-data-file) + - [Additional test data](#additional-test-data) + - [Sharded testing](#sharded-testing) + - [Running tests directly](#1_running-test_libaom-directly) + - [Running tests via CMake](#2_running-the-tests-via-the-cmake-build) +3. [Coding style](#coding-style) +4. [Submitting patches](#submitting-patches) + - [Login cookie](#login-cookie) + - [Contributor agreement](#contributor-agreement) + - [Testing your code](#testing-your-code) + - [Commit message hook](#commit-message-hook) + - [Upload your change](#upload-your-change) + - [Incorporating Reviewer Comments](#incorporating-reviewer-comments) + - [Submitting your change](#submitting-your-change) + - [Viewing change status](#viewing-the-status-of-uploaded-changes) +5. [Support](#support) +6. [Bug reports](#bug-reports) + +## Building the library and applications + +### Prerequisites + + 1. [CMake](https://cmake.org) version 3.5 or higher. + 2. [Git](https://git-scm.com/). + 3. [Perl](https://www.perl.org/). + 4. For x86 targets, [yasm](http://yasm.tortall.net/), which is preferred, or a + recent version of [nasm](http://www.nasm.us/). If you download yasm with + the intention to work with Visual Studio, please download win32.exe or + win64.exe and rename it to yasm.exe. DO NOT download or use vsyasm.exe. + 5. Building the documentation requires [doxygen](http://doxygen.org). + 6. Building the unit tests requires [Python](https://www.python.org/). + 7. Emscripten builds require the portable + [EMSDK](https://kripken.github.io/emscripten-site/index.html). + +### Get the code + +The AV1 library source code is stored in the Alliance for Open Media Git +repository: + +~~~ + $ git clone https://aomedia.googlesource.com/aom + # By default, the above command stores the source in the aom directory: + $ cd aom +~~~ + +### Basic build + +CMake replaces the configure step typical of many projects. Running CMake will +produce configuration and build files for the currently selected CMake +generator. For most systems the default generator is Unix Makefiles. The basic +form of a makefile build is the following: + +~~~ + $ cmake path/to/aom + $ make +~~~ + +The above will generate a makefile build that produces the AV1 library and +applications for the current host system after the make step completes +successfully. The compiler chosen varies by host platform, but a general rule +applies: On systems where cc and c++ are present in $PATH at the time CMake is +run the generated build will use cc and c++ by default. + +### Configuration options + +The AV1 codec library has a great many configuration options. 
These come in two +varieties: + + 1. Build system configuration options. These have the form `ENABLE_FEATURE`. + 2. AV1 codec configuration options. These have the form `CONFIG_FEATURE`. + +Both types of options are set at the time CMake is run. The following example +enables ccache and disables the AV1 encoder: + +~~~ + $ cmake path/to/aom -DENABLE_CCACHE=1 -DCONFIG_AV1_ENCODER=0 + $ make +~~~ + +The available configuration options are too numerous to list here. Build system +configuration options can be found at the top of the CMakeLists.txt file found +in the root of the AV1 repository, and AV1 codec configuration options can +currently be found in the file `build/cmake/aom_config_defaults.cmake`. + +### Dylib builds + +A dylib (shared object) build of the AV1 codec library can be enabled via the +CMake built in variable `BUILD_SHARED_LIBS`: + +~~~ + $ cmake path/to/aom -DBUILD_SHARED_LIBS=1 + $ make +~~~ + +This is currently only supported on non-Windows targets. + +### Debugging + +Depending on the generator used, there are multiple ways of going about +debugging AV1 components. For single configuration generators like the Unix +Makefiles generator, setting `CMAKE_BUILD_TYPE` to Debug is sufficient: + +~~~ + $ cmake path/to/aom -DCMAKE_BUILD_TYPE=Debug +~~~ + +For Xcode, mainly because configuration controls for Xcode builds are buried two +configuration windows deep and must be set for each subproject within the Xcode +IDE individually, `CMAKE_CONFIGURATION_TYPES` should be set to Debug: + +~~~ + $ cmake path/to/aom -G Xcode -DCMAKE_CONFIGURATION_TYPES=Debug +~~~ + +For Visual Studio the in-IDE configuration controls should be used. Simply set +the IDE project configuration to Debug to allow for stepping through the code. + +In addition to the above, it can sometimes be useful to debug only C and C++ +code. To disable all assembly code and intrinsics set `AOM_TARGET_CPU` to +generic at generation time: + +~~~ + $ cmake path/to/aom -DAOM_TARGET_CPU=generic +~~~ + +### Cross compiling + +For the purposes of building the AV1 codec and applications and relative to the +scope of this guide, all builds for architectures differing from the native host +architecture will be considered cross compiles. The AV1 CMake build handles +cross compiling via the use of toolchain files included in the AV1 repository. +The toolchain files available at the time of this writing are: + + - arm64-ios.cmake + - arm64-linux-gcc.cmake + - arm64-mingw-gcc.cmake + - armv7-ios.cmake + - armv7-linux-gcc.cmake + - armv7-mingw-gcc.cmake + - armv7s-ios.cmake + - mips32-linux-gcc.cmake + - mips64-linux-gcc.cmake + - x86-ios-simulator.cmake + - x86-linux.cmake + - x86-macos.cmake + - x86-mingw-gcc.cmake + - x86\_64-ios-simulator.cmake + - x86\_64-mingw-gcc.cmake + +The following example demonstrates use of the x86-macos.cmake toolchain file on +an x86\_64 macOS host: + +~~~ + $ cmake path/to/aom \ + -DCMAKE_TOOLCHAIN_FILE=path/to/aom/build/cmake/toolchains/x86-macos.cmake + $ make +~~~ + +To build for an unlisted target, creation of a new toolchain file is the best +solution. The existing toolchain files can be used as a starting point for a new +toolchain file since each one exposes the basic requirements for toolchain files +as used in the AV1 codec build.
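+
+To make those requirements concrete, the sketch below shows the general shape of
+such a file. It is illustrative only: the file name and cross-compiler names are
+hypothetical, and the real files under `build/cmake/toolchains/` set additional
+variables the AV1 build expects, so start from one of those rather than from
+this sketch.
+
+~~~
+ # hypothetical-arm64-linux-gcc.cmake -- a minimal, illustrative sketch only.
+ # Pin the target system and processor so CMake enters cross-compile mode.
+ set(CMAKE_SYSTEM_NAME Linux)
+ set(CMAKE_SYSTEM_PROCESSOR aarch64)
+ # Point CMake at the cross tools (these tool names are assumptions).
+ set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
+ set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
+~~~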
+ +As a temporary workaround, an unoptimized AV1 configuration that builds only C +and C++ sources can be produced using the following commands: + +~~~ + $ cmake path/to/aom -DAOM_TARGET_CPU=generic + $ make +~~~ + +In addition to the above, it's important to note that the toolchain files +suffixed with gcc behave differently than the others. These toolchain files +attempt to obey the $CROSS environment variable. + +### Sanitizers + +Sanitizer integration is built into the CMake build system. To enable a +sanitizer, add `-DSANITIZE=<type>` to the CMake command line. For example, to +enable address sanitizer: + +~~~ + $ cmake path/to/aom -DSANITIZE=address + $ make +~~~ + +Sanitizers available vary by platform, target, and compiler. Consult your +compiler documentation to determine which, if any, are available. + +### Microsoft Visual Studio builds + +Building the AV1 codec library in Microsoft Visual Studio is supported. Visual +Studio 2017 (15.0) or later is required. The following example demonstrates +generating projects and a solution for the Microsoft IDE: + +~~~ + # This does not require a bash shell; Command Prompt (cmd.exe) is fine. + # This assumes the build host is a Windows x64 computer. + + # To build with Visual Studio 2019 for the x64 target: + $ cmake path/to/aom -G "Visual Studio 16 2019" + $ cmake --build . + + # To build with Visual Studio 2019 for the 32-bit x86 target: + $ cmake path/to/aom -G "Visual Studio 16 2019" -A Win32 + $ cmake --build . + + # To build with Visual Studio 2017 for the x64 target: + $ cmake path/to/aom -G "Visual Studio 15 2017" -T host=x64 -A x64 + $ cmake --build . + + # To build with Visual Studio 2017 for the 32-bit x86 target: + $ cmake path/to/aom -G "Visual Studio 15 2017" -T host=x64 + $ cmake --build . +~~~ + +NOTE: The build system targets Windows 7 or later by compiling files with +`-D_WIN32_WINNT=0x0601`. + +### Xcode builds + +Building the AV1 codec library in Xcode is supported. The following example +demonstrates generating an Xcode project: + +~~~ + $ cmake path/to/aom -G Xcode +~~~ + +### Emscripten builds + +Building the AV1 codec library with Emscripten is supported. Typically this is +used to hook into the AOMAnalyzer GUI application. These instructions focus on +using the inspector with AOMAnalyzer, but all tools can be built with +Emscripten. + +It is assumed here that you have already downloaded and installed the EMSDK, +installed and activated at least one toolchain, and set up your environment +appropriately using the emsdk\_env script. + +1. Download [AOMAnalyzer](https://people.xiph.org/~mbebenita/analyzer/). + +2. Configure the build: + +~~~ + $ cmake path/to/aom \ + -DENABLE_CCACHE=1 \ + -DAOM_TARGET_CPU=generic \ + -DENABLE_DOCS=0 \ + -DENABLE_TESTS=0 \ + -DCONFIG_ACCOUNTING=1 \ + -DCONFIG_INSPECTION=1 \ + -DCONFIG_MULTITHREAD=0 \ + -DCONFIG_RUNTIME_CPU_DETECT=0 \ + -DCONFIG_WEBM_IO=0 \ + -DCMAKE_TOOLCHAIN_FILE=path/to/emsdk-portable/.../Emscripten.cmake +~~~ + +3. Build it: run make if that's your generator of choice: + +~~~ + $ make inspect +~~~ + +4. Run the analyzer: + +~~~ + # inspect.js is in the examples sub directory of the directory in which you + # executed cmake. + $ path/to/AOMAnalyzer path/to/examples/inspect.js path/to/av1/input/file +~~~ + +### Extra build flags + +Three variables allow for passing of additional flags to the build system.
+ +- AOM\_EXTRA\_C\_FLAGS +- AOM\_EXTRA\_CXX\_FLAGS +- AOM\_EXTRA\_EXE\_LINKER\_FLAGS + +The build system attempts to ensure the flags passed through the above variables +are passed to the tools last, in order to allow them to override default +behavior. These flags can be used, for example, to enable asserts in a release +build: + +~~~ + $ cmake path/to/aom \ + -DCMAKE_BUILD_TYPE=Release \ + -DAOM_EXTRA_C_FLAGS=-UNDEBUG \ + -DAOM_EXTRA_CXX_FLAGS=-UNDEBUG +~~~ + +### Build with VMAF support + +After installing +[libvmaf.a](https://github.com/Netflix/vmaf/blob/master/resource/doc/libvmaf.md), +you can use it with the encoder: + +~~~ + $ cmake path/to/aom -DCONFIG_TUNE_VMAF=1 +~~~ + +Please note that the default VMAF model +("/usr/local/share/model/vmaf_v0.6.1.pkl") +will be used unless you set the following flag when running the encoder: + +~~~ + # --vmaf-model-path=path/to/model +~~~ + +## Testing the AV1 codec + +### Testing basics + +There are several methods of testing the AV1 codec. All of these methods require +the presence of the AV1 source code and a working build of the AV1 library and +applications. + +#### 1. Unit tests: + +The unit tests can be run at build time: + +~~~ + # Before running the make command the LIBAOM_TEST_DATA_PATH environment + # variable should be set to avoid downloading the test files to the + # cmake build configuration directory. + $ cmake path/to/aom + # Note: The AV1 CMake build creates many test targets. Running make + # with multiple jobs will speed up the test run significantly. + $ make runtests +~~~ + +#### 2. Example tests: + +The example tests require a bash shell and can be run in the following manner: + +~~~ + # See the note about LIBAOM_TEST_DATA_PATH above. + $ cmake path/to/aom + $ make + # It's best to build the testdata target using many make jobs. + # Running it like this will verify and download (if necessary) + # one at a time, which takes a while. + $ make testdata + $ path/to/aom/test/examples.sh --bin-path examples +~~~ + +#### 3. Encoder tests: + +When making a change to the encoder, run the encoder tests to confirm that your +change has a positive or negligible impact on encode quality. When running these +tests, the build configuration should be changed to enable internal encoder +statistics: + +~~~ + $ cmake path/to/aom -DCONFIG_INTERNAL_STATS=1 + $ make +~~~ + +The repository contains scripts intended to make running these tests as simple +as possible. The following example demonstrates creating a set of baseline clips +for comparison to results produced after making your change to libaom: + +~~~ + # This will encode all Y4M files in the current directory using the + # settings specified to create the encoder baseline statistical data: + $ cd path/to/test/inputs + # This command line assumes that run_encodes.sh, its helper script + # best_encode.sh, and the aomenc you intend to test are all within a + # directory in your PATH. + $ run_encodes.sh 200 500 50 baseline +~~~ + +After making your change and creating the baseline clips, you'll need to run +encodes that include your change(s) to confirm that things are working as +intended: + +~~~ + # This will encode all Y4M files in the current directory using the + # settings specified to create the statistical data for your change: + $ cd path/to/test/inputs + # This command line assumes that run_encodes.sh, its helper script + # best_encode.sh, and the aomenc you intend to test are all within a + # directory in your PATH.
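+ # Note: the numeric arguments match the baseline run above; only the final
+ # label differs, which keeps the two statistical data sets directly
+ # comparable.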
+ $ run_encodes.sh 200 500 50 mytweak +~~~ + +After creating both data sets, you can use `test/visual_metrics.py` to generate a +report that can be viewed in a web browser: + +~~~ + $ visual_metrics.py metrics_template.html "*stt" baseline mytweak \ + > mytweak.html +~~~ + +You can view the report by opening mytweak.html in a web browser. + + +### IDE hosted tests + +By default, the generated project files created by CMake will not include the +runtests and testdata rules when generating for IDEs like Microsoft Visual +Studio and Xcode. This is done to avoid intolerably long build cycles in the +IDEs; IDE behavior is to build all targets when selecting the build project +options in MSVS and Xcode. To enable the test rules in IDEs, the +`ENABLE_IDE_TEST_HOSTING` variable must be enabled at CMake generation time: + +~~~ + # This example uses Xcode. To get a list of the generators + # available, run cmake with the -G argument missing its + # value. + $ cmake path/to/aom -DENABLE_IDE_TEST_HOSTING=1 -G Xcode +~~~ + +### Downloading the test data + +The fastest and easiest way to obtain the test data is to use CMake to generate +a build using the Unix Makefiles generator, and then to build only the testdata +rule: + +~~~ + $ cmake path/to/aom -G "Unix Makefiles" + # 28 is used because there are 28 test files as of this writing. + $ make -j28 testdata +~~~ + +The above make command will only download and verify the test data. + +### Adding a new test data file + +First, add the new test data file to the `aom-test-data` bucket of the +`aomedia-testing` project on Google Cloud Platform. You may need to ask someone +with the necessary access permissions to do this for you. + +NOTE: When a new test data file is added to the `aom-test-data` bucket, its +"Public access" is initially "Not public". We need to change its +"Public access" to "Public" by using the following +[`gsutil`](https://cloud.google.com/storage/docs/gsutil_install) command: +~~~ + $ gsutil acl ch -g all:R gs://aom-test-data/test-data-file-name +~~~ +This command grants the `AllUsers` group READ access to the file named +"test-data-file-name" in the `aom-test-data` bucket. + +Once the new test data file has been added to `aom-test-data`, create a CL to +add the name of the new test data file to `test/test_data_util.cmake` and add +the SHA1 checksum of the new test data file to `test/test-data.sha1`. (The SHA1 +checksum of a file can be calculated by running the `sha1sum` command on the +file.) + +### Additional test data + +The test data mentioned above is strictly intended for unit testing. + +Additional input data for testing the encoder can be obtained from: +https://media.xiph.org/video/derf/ + +### Sharded testing + +The AV1 codec library unit tests are built upon gtest, which supports sharding of +test jobs. Sharded test runs can be achieved in a couple of ways. + +#### 1. Running test\_libaom directly: + +~~~ + # Set the environment variable GTEST_TOTAL_SHARDS to control the number of + # shards. + $ export GTEST_TOTAL_SHARDS=10 + # (GTEST shard indexing is 0 based). + $ seq 0 $(( $GTEST_TOTAL_SHARDS - 1 )) \ + | xargs -n 1 -P 0 -I{} env GTEST_SHARD_INDEX={} ./test_libaom +~~~ + +To create a test shard for each CPU core available on the current system, set +`GTEST_TOTAL_SHARDS` to the number of CPU cores on your system minus one. + +#### 2. Running the tests via the CMake build: + +~~~ + # For IDE based builds, ENABLE_IDE_TEST_HOSTING must be enabled. See + # the IDE hosted tests section above for more information.
If the IDE + # supports building targets concurrently, tests will be sharded by default. + + # For make and ninja builds the -j parameter controls the number of shards + # at test run time. This example will run the tests using 10 shards via + # make. + $ make -j10 runtests +~~~ + +The maximum number of test targets that can run concurrently is determined by +the number of CPUs on the system where the build is configured as detected by +CMake. A system with 24 cores can run 24 test shards using a value of 24 with +the `-j` parameter. When CMake is unable to detect the number of cores, 10 shards +is the default maximum value. + +## Coding style + +We are using the Google C Coding Style defined by the +[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). + +The coding style used by this project is enforced with clang-format using the +configuration contained in the +[.clang-format](https://chromium.googlesource.com/webm/aom/+/master/.clang-format) +file in the root of the repository. + +You can download clang-format using your system's package manager, or directly +from [llvm.org](http://llvm.org/releases/download.html). You can also view the +[documentation](https://clang.llvm.org/docs/ClangFormat.html) on llvm.org. +Output from clang-format varies by clang-format version; for best results, your +version should match the one used on Jenkins. You can find the clang-format +version by reading the comment in the `.clang-format` file linked above. + +Before pushing changes for review, you can format your code with: + +~~~ + # Apply clang-format to modified .c, .h and .cc files + $ clang-format -i --style=file \ + $(git diff --name-only --diff-filter=ACMR '*.[hc]' '*.cc') +~~~ + +Check the .clang-format file for the version used to generate it if there is any +difference between your local formatting and the review system. + +Some Git installations have clang-format integration. Here are some examples: + +~~~ + # Apply clang-format to all staged changes: + $ git clang-format + + # Clang format all staged and unstaged changes: + $ git clang-format -f + + # Clang format all staged and unstaged changes interactively: + $ git clang-format -f -p +~~~ + +## Submitting patches + +We manage the submission of patches using the +[Gerrit](https://www.gerritcodereview.com/) code review tool. This tool +implements a workflow on top of the Git version control system to ensure that +all changes get peer reviewed and tested prior to their distribution. + +### Login cookie + +Browse to [AOMedia Git index](https://aomedia.googlesource.com/) and log in with +your account (Gmail credentials, for example). Next, follow the +`Generate Password` link at the top of the page. You’ll be given +instructions for creating a cookie to use with our Git repos. + +### Contributor agreement + +You will be required to execute a +[contributor agreement](http://aomedia.org/license) to ensure that the AOMedia +Project has the right to distribute your changes. + +### Testing your code + +The testing basics are covered in the [testing section](#testing-the-av1-codec) +above. + +In addition to the local tests, many more (e.g. asan, tsan, valgrind) will run +through Jenkins instances upon upload to gerrit. + +### Commit message hook + +Gerrit requires that each submission include a unique Change-Id. You can assign +one manually using git commit --amend, but it’s easier to automate it with the +commit-msg hook provided by Gerrit. + +Copy commit-msg to the `.git/hooks` directory of your local repo.
Here's an +example: + +~~~ + $ curl -Lo aom/.git/hooks/commit-msg https://chromium-review.googlesource.com/tools/hooks/commit-msg + + # Next, ensure that the downloaded commit-msg script is executable: + $ chmod u+x aom/.git/hooks/commit-msg +~~~ + +See the Gerrit +[documentation](https://gerrit-review.googlesource.com/Documentation/user-changeid.html) +for more information. + +### Upload your change + +The command line to upload your patch looks like this: + +~~~ + $ git push https://aomedia-review.googlesource.com/aom HEAD:refs/for/master +~~~ + +### Incorporating reviewer comments + +If you previously uploaded a change to Gerrit and the Approver has asked for +changes, follow these steps: + +1. Edit the files to make the changes the reviewer has requested. +2. Recommit your edits using the --amend flag, for example: + +~~~ + $ git commit -a --amend +~~~ + +3. Use the same git push command as above to upload to Gerrit again for another + review cycle. + +In general, you should not rebase your changes when doing updates in response to +review. Doing so can make it harder to follow the evolution of your change in +the diff view. + +### Submitting your change + +Once your change has been Approved and Verified, you can “submit” it through the +Gerrit UI. This will usually automatically rebase your change onto the branch +specified. + +Sometimes this can’t be done automatically. If you run into this problem, you +must rebase your changes manually: + +~~~ + $ git fetch + $ git rebase origin/branchname +~~~ + +If there are any conflicts, resolve them as you normally would with Git. When +you’re done, reupload your change. + +### Viewing the status of uploaded changes + +To check the status of a change that you uploaded, open +[Gerrit](https://aomedia-review.googlesource.com/), sign in, and click My > +Changes. + +## Support + +This library is an open source project supported by its community. Please +email aomediacodec@jointdevelopment.kavi.com for help. + +## Bug reports + +Bug reports can be filed in the Alliance for Open Media +[issue tracker](https://bugs.chromium.org/p/aomedia/issues/list). diff --git a/libs/libaom/src/Sample.cfg b/libs/libaom/src/Sample.cfg new file mode 100644 index 000000000..d5dbe6641 --- /dev/null +++ b/libs/libaom/src/Sample.cfg @@ -0,0 +1,35 @@ +#sample config file +super_block_size = 128 # super block size.
0, 64 or 128 +max_partition_size = 128 # max partition size(8, 16, 32, 64, 128) +min_partition_size = 4 # min partition size(4, 8, 16, 32, 64) +disable_rect_partition_type = 0 # disable rectangle partition type +disable_ab_partition_type = 0 # disable AB partition type +disable_1to4_partition_type = 0 # disable 1 to 4 and 4 to 1 partition type +disable_intra_angle_delta = 0 # disable intra angle delta +disable_paeth_intra = 0 # disable paeth intra +disable_smooth_intra = 0 # disable intra smooth mode +disable_intra_edge_filter = 0 # disable intra edge filter +disable_filter_intra = 0 # disable filter intra +disable_intrabc = 0 # disable Intra Block Copy +disable_cfl = 0 # disable chroma from luma prediction +disable_palette = 0 # disable Palette +disable_flip_idtx = 0 # disable flip and identity transform +disable_tx_64x64 = 0 # disable 64x64 transform +reduced_tx_type_set = 0 # use reduced transform type set +reduced_reference_set = 0 # use reduced reference frame set +disable_obmc = 0 # disable OBMC +disable_warp_motion = 0 # disable Warped Motion +disable_global_motion = 0 # disable global motion +disable_ref_frame_mv = 0 # disable ref mv +disable_dual_filter = 0 # disable dual interpolation filter +disable_one_sided_comp = 0 # disable one sided compound mode +disable_masked_comp = 0 # disable masked compound prediction +disable_diff_wtd_comp = 0 # disable difference weighted compound mode +disable_inter_inter_wedge = 0 # disable inter/inter wedge comp +disable_dist_wtd_comp = 0 # disable distant weighted compound mode +disable_inter_intra_comp = 0 # disable inter/intra compound mode. +disable_inter_intra_wedge = 0 # disable inter/intra wedge comp +disable_smooth_inter_intra = 0 # disable smooth inter/intra +disable_cdef = 0 # disable CDEF filter +disable_lr = 0 # disable Loop Restoration Filter +disable_trellis_quant = 0 # disable trellis quantization \ No newline at end of file diff --git a/libs/libaom/src/aom/aom.h b/libs/libaom/src/aom/aom.h new file mode 100644 index 000000000..c591dc9a4 --- /dev/null +++ b/libs/libaom/src/aom/aom.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\defgroup aom AOM + * \ingroup codecs + * AOM is aom's newest video compression algorithm that uses motion + * compensated prediction, Discrete Cosine Transform (DCT) coding of the + * prediction error signal and context dependent entropy coding techniques + * based on arithmetic principles. It features: + * - YUV 4:2:0 image format + * - Macro-block based coding (16x16 luma plus two 8x8 chroma) + * - 1/4 (1/8) pixel accuracy motion compensated prediction + * - 4x4 DCT transform + * - 128 level linear quantizer + * - In loop deblocking filter + * - Context-based entropy coding + * + * @{ + */ +/*!\file + * \brief Provides controls common to both the AOM encoder and decoder. 
+ */ +#ifndef AOM_AOM_AOM_H_ +#define AOM_AOM_AOM_H_ + +#include "aom/aom_codec.h" +#include "aom/aom_image.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Control functions + * + * The set of macros defines the control functions of the AOM interface + */ +enum aom_com_control_id { + /* TODO(https://crbug.com/aomedia/2671): The encoder overlaps the range of + * these values for its control ids, see the NOTEs in aom/aomcx.h. These + * should be migrated to something like the AOM_DECODER_CTRL_ID_START range + * next time we're ready to break the ABI. + */ + AV1_GET_REFERENCE = 128, /**< get a pointer to a reference frame, + av1_ref_frame_t* parameter */ + AV1_SET_REFERENCE = 129, /**< write a frame into a reference buffer, + av1_ref_frame_t* parameter */ + AV1_COPY_REFERENCE = 130, /**< get a copy of reference frame from the decoder, + av1_ref_frame_t* parameter */ + AOM_COMMON_CTRL_ID_MAX, + + AV1_GET_NEW_FRAME_IMAGE = + 192, /**< get a pointer to the new frame, aom_image_t* parameter */ + AV1_COPY_NEW_FRAME_IMAGE = 193, /**< copy the new frame to an external buffer, + aom_image_t* parameter */ + + AOM_DECODER_CTRL_ID_START = 256 +}; + +/*!\brief AV1 specific reference frame data struct + * + * Define the data struct to access av1 reference frames. + */ +typedef struct av1_ref_frame { + int idx; /**< frame index to get (input) */ + int use_external_ref; /**< Directly use external ref buffer(decoder only) */ + aom_image_t img; /**< img structure to populate (output) */ +} av1_ref_frame_t; + +/*!\cond */ +/*!\brief aom decoder control function parameter type + * + * Defines the data type that each AOM decoder control function requires. + * + * \note For each control ID "X", a macro-define of + * AOM_CTRL_X is provided. It is used at compile time to determine + * if the control ID is supported by the libaom library available, + * when the libaom version cannot be controlled. + */ +AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *) +#define AOM_CTRL_AV1_GET_REFERENCE + +AOM_CTRL_USE_TYPE(AV1_SET_REFERENCE, av1_ref_frame_t *) +#define AOM_CTRL_AV1_SET_REFERENCE + +AOM_CTRL_USE_TYPE(AV1_COPY_REFERENCE, av1_ref_frame_t *) +#define AOM_CTRL_AV1_COPY_REFERENCE + +AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *) +#define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE + +AOM_CTRL_USE_TYPE(AV1_COPY_NEW_FRAME_IMAGE, aom_image_t *) +#define AOM_CTRL_AV1_COPY_NEW_FRAME_IMAGE + +/*!\endcond */ +/*! @} - end defgroup aom */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOM_H_ diff --git a/libs/libaom/src/aom/aom_codec.h b/libs/libaom/src/aom/aom_codec.h new file mode 100644 index 000000000..75f6a1af2 --- /dev/null +++ b/libs/libaom/src/aom/aom_codec.h @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\defgroup codec Common Algorithm Interface + * This abstraction allows applications to easily support multiple video + * formats with minimal code duplication. This section describes the interface + * common to all codecs (both encoders and decoders).
+ * @{ + */ + +/*!\file + * \brief Describes the codec algorithm interface to applications. + * + * This file describes the interface between an application and a + * video codec algorithm. + * + * An application instantiates a specific codec instance by using + * aom_codec_init() and a pointer to the algorithm's interface structure: + *     <pre>
+ *     my_app.c:
+ *       extern aom_codec_iface_t my_codec;
+ *       {
+ *           aom_codec_ctx_t algo;
+ *           res = aom_codec_init(&algo, &my_codec);
+ *       }
+ *     </pre>
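+ *
+ *     If initialization fails, res will hold a value other than
+ *     #AOM_CODEC_OK. A minimal check, as a sketch only (the printf call and
+ *     message are illustrative, not part of the API; the error-to-string
+ *     helper is declared later in this file):
+ *     <pre>
+ *       if (res != AOM_CODEC_OK)
+ *           printf("init failed: %s\n", aom_codec_err_to_string(res));
+ *     </pre>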
+ * + * Once initialized, the instance is managed using other functions from + * the aom_codec_* family. + */ +#ifndef AOM_AOM_AOM_CODEC_H_ +#define AOM_AOM_AOM_CODEC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_image.h" +#include "aom/aom_integer.h" + +/*!\brief Decorator indicating a function is deprecated */ +#ifndef AOM_DEPRECATED +#if defined(__GNUC__) && __GNUC__ +#define AOM_DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) +#define AOM_DEPRECATED +#else +#define AOM_DEPRECATED +#endif +#endif /* AOM_DEPRECATED */ + +#ifndef AOM_DECLSPEC_DEPRECATED +#if defined(__GNUC__) && __GNUC__ +#define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */ +#elif defined(_MSC_VER) +/*!\brief \copydoc #AOM_DEPRECATED */ +#define AOM_DECLSPEC_DEPRECATED __declspec(deprecated) +#else +#define AOM_DECLSPEC_DEPRECATED /**< \copydoc #AOM_DEPRECATED */ +#endif +#endif /* AOM_DECLSPEC_DEPRECATED */ + +/*!\brief Decorator indicating a function is potentially unused */ +#ifdef AOM_UNUSED +#elif defined(__GNUC__) || defined(__clang__) +#define AOM_UNUSED __attribute__((unused)) +#else +#define AOM_UNUSED +#endif + +/*!\brief Decorator indicating that given struct/union/enum is packed */ +#ifndef ATTRIBUTE_PACKED +#if defined(__GNUC__) && __GNUC__ +#define ATTRIBUTE_PACKED __attribute__((packed)) +#elif defined(_MSC_VER) +#define ATTRIBUTE_PACKED +#else +#define ATTRIBUTE_PACKED +#endif +#endif /* ATTRIBUTE_PACKED */ + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define AOM_CODEC_ABI_VERSION (5 + AOM_IMAGE_ABI_VERSION) /**<\hideinitializer*/ + +/*!\brief Algorithm return codes */ +typedef enum { + /*!\brief Operation completed without error */ + AOM_CODEC_OK, + + /*!\brief Unspecified error */ + AOM_CODEC_ERROR, + + /*!\brief Memory operation failed */ + AOM_CODEC_MEM_ERROR, + + /*!\brief ABI version mismatch */ + AOM_CODEC_ABI_MISMATCH, + + /*!\brief Algorithm does not have required capability */ + AOM_CODEC_INCAPABLE, + + /*!\brief The given bitstream is not supported. + * + * The bitstream was unable to be parsed at the highest level. The decoder + * is unable to proceed. This error \ref SHOULD be treated as fatal to the + * stream. */ + AOM_CODEC_UNSUP_BITSTREAM, + + /*!\brief Encoded bitstream uses an unsupported feature + * + * The decoder does not implement a feature required by the encoder. This + * return code should only be used for features that prevent future + * pictures from being properly decoded. This error \ref MAY be treated as + * fatal to the stream or \ref MAY be treated as fatal to the current GOP. + */ + AOM_CODEC_UNSUP_FEATURE, + + /*!\brief The coded data for this stream is corrupt or incomplete + * + * There was a problem decoding the current frame. This return code + * should only be used for failures that prevent future pictures from + * being properly decoded. This error \ref MAY be treated as fatal to the + * stream or \ref MAY be treated as fatal to the current GOP. If decoding + * is continued for the current GOP, artifacts may be present. + */ + AOM_CODEC_CORRUPT_FRAME, + + /*!\brief An application-supplied parameter is not valid. + * + */ + AOM_CODEC_INVALID_PARAM, + + /*!\brief An iterator reached the end of list. + * + */ + AOM_CODEC_LIST_END + +} aom_codec_err_t; + +/*! 
\brief Codec capabilities bitfield + * + * Each codec advertises the capabilities it supports as part of its + * ::aom_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported. + * + * The available flags are specified by AOM_CODEC_CAP_* defines. + */ +typedef long aom_codec_caps_t; +#define AOM_CODEC_CAP_DECODER 0x1 /**< Is a decoder */ +#define AOM_CODEC_CAP_ENCODER 0x2 /**< Is an encoder */ + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. + * + * The available flags are specified by AOM_CODEC_USE_* defines. + */ +typedef long aom_codec_flags_t; + +/*!\brief Time Stamp Type + * + * An integer, which when multiplied by the stream's time base, provides + * the absolute time of a sample. + */ +typedef int64_t aom_codec_pts_t; + +/*!\brief Codec interface structure. + * + * Contains function pointers and other data private to the codec + * implementation. This structure is opaque to the application. Common + * functions used with this structure: + * - aom_codec_iface_name: get the name of the codec + * - aom_codec_get_caps: returns the capabilities of the codec (see + * aom_encoder.h for more details) + * - aom_codec_enc_config_default: generate the default config to use + * when initializing the encoder + * - aom_codec_dec_init, aom_codec_enc_init: initialize the codec context + * structure (see documentation on aom_codec_ctx for more information). + */ +typedef const struct aom_codec_iface aom_codec_iface_t; + +/*!\brief Codec private data structure. + * + * Contains data private to the codec implementation. This structure is opaque + * to the application. + */ +typedef struct aom_codec_priv aom_codec_priv_t; + +/*!\brief Iterator + * + * Opaque storage used for iterating over lists. + */ +typedef const void *aom_codec_iter_t; + +/*!\brief Codec context structure + * + * All codecs \ref MUST support this context structure fully. In general, + * this data should be considered private to the codec algorithm, and + * not be manipulated or examined by the calling application. Applications + * may reference the 'name' member to get a printable description of the + * algorithm. + */ +typedef struct aom_codec_ctx { + const char *name; /**< Printable interface name */ + aom_codec_iface_t *iface; /**< Interface pointers */ + aom_codec_err_t err; /**< Last returned error */ + const char *err_detail; /**< Detailed info, if available */ + aom_codec_flags_t init_flags; /**< Flags passed at init time */ + union { + /**< Decoder Configuration Pointer */ + const struct aom_codec_dec_cfg *dec; + /**< Encoder Configuration Pointer */ + const struct aom_codec_enc_cfg *enc; + const void *raw; + } config; /**< Configuration pointer aliasing union */ + aom_codec_priv_t *priv; /**< Algorithm private storage */ +} aom_codec_ctx_t; + +/*!\brief Bit depth for codec + * * + * This enumeration determines the bit depth of the codec. + */ +typedef enum aom_bit_depth { + AOM_BITS_8 = 8, /**< 8 bits */ + AOM_BITS_10 = 10, /**< 10 bits */ + AOM_BITS_12 = 12, /**< 12 bits */ +} aom_bit_depth_t; + +/*!\brief Superblock size selection. + * + * Defines the superblock size used for encoding. The superblock size can + * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically + * selected by the encoder for each frame. + */ +typedef enum aom_superblock_size { + AOM_SUPERBLOCK_SIZE_64X64, /**< Always use 64x64 superblocks. 
*/ + AOM_SUPERBLOCK_SIZE_128X128, /**< Always use 128x128 superblocks. */ + AOM_SUPERBLOCK_SIZE_DYNAMIC /**< Select superblock size dynamically. */ +} aom_superblock_size_t; + +/* + * Library Version Number Interface + * + * For example, see the following sample return values: + * aom_codec_version() (1<<16 | 2<<8 | 3) + * aom_codec_version_str() "v1.2.3-rc1-16-gec6a1ba" + * aom_codec_version_extra_str() "rc1-16-gec6a1ba" + */ + +/*!\brief Return the version information (as an integer) + * + * Returns a packed encoding of the library version number. This will only + * include + * the major.minor.patch component of the version number. Note that this encoded + * value should be accessed through the macros provided, as the encoding may + * change + * in the future. + * + */ +int aom_codec_version(void); + +/*!\brief Return the version major number */ +#define aom_codec_version_major() ((aom_codec_version() >> 16) & 0xff) + +/*!\brief Return the version minor number */ +#define aom_codec_version_minor() ((aom_codec_version() >> 8) & 0xff) + +/*!\brief Return the version patch number */ +#define aom_codec_version_patch() ((aom_codec_version() >> 0) & 0xff) + +/*!\brief Return the version information (as a string) + * + * Returns a printable string containing the full library version number. This + * may + * contain additional text following the three digit version number, as to + * indicate + * release candidates, prerelease versions, etc. + * + */ +const char *aom_codec_version_str(void); + +/*!\brief Return the version information (as a string) + * + * Returns a printable "extra string". This is the component of the string + * returned + * by aom_codec_version_str() following the three digit version number. + * + */ +const char *aom_codec_version_extra_str(void); + +/*!\brief Return the build configuration + * + * Returns a printable string containing an encoded version of the build + * configuration. This may be useful to aom support. + * + */ +const char *aom_codec_build_config(void); + +/*!\brief Return the name for a given interface + * + * Returns a human readable string for name of the given codec interface. + * + * \param[in] iface Interface pointer + * + */ +const char *aom_codec_iface_name(aom_codec_iface_t *iface); + +/*!\brief Convert error number to printable string + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] err Error number. + * + */ +const char *aom_codec_err_to_string(aom_codec_err_t err); + +/*!\brief Retrieve error synopsis for codec context + * + * Returns a human readable string for the last error returned by the + * algorithm. The returned error will be one line and will not contain + * any newline characters. + * + * + * \param[in] ctx Pointer to this instance's context. + * + */ +const char *aom_codec_error(aom_codec_ctx_t *ctx); + +/*!\brief Retrieve detailed error information for codec context + * + * Returns a human readable string providing detailed information about + * the last error. + * + * \param[in] ctx Pointer to this instance's context. + * + * \retval NULL + * No detailed information is available. + */ +const char *aom_codec_error_detail(aom_codec_ctx_t *ctx); + +/* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all codecs. + * They represent the base case functionality expected of all codecs. 
+ */ + +/*!\brief Destroy a codec instance + * + * Destroys a codec context, freeing any associated memory buffers. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval #AOM_CODEC_OK + * The codec algorithm initialized. + * \retval #AOM_CODEC_MEM_ERROR + * Memory allocation failed. + */ +aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx); + +/*!\brief Get the capabilities of an algorithm. + * + * Retrieves the capabilities bitfield from the algorithm's interface. + * + * \param[in] iface Pointer to the algorithm interface + * + */ +aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface); + +/*!\name Codec Control + * + * The aom_codec_control function exchanges algorithm specific data with the + * codec instance. Additionally, the macro AOM_CODEC_CONTROL_TYPECHECKED is + * provided, which will type-check the parameter against the control ID before + * calling aom_codec_control - note that this macro requires the control ID + * to be directly encoded in it, e.g., + * AOM_CODEC_CONTROL_TYPECHECKED(&ctx, AOME_SET_CPUUSED, 8). + * + * The codec control IDs can be found in aom.h, aomcx.h, and aomdx.h + * (defined as aom_com_control_id, aome_enc_control_id, and aom_dec_control_id). + * @{ + */ +/*!\brief Algorithm Control + * + * aom_codec_control takes a context, a control ID, and a third parameter + * (with varying type). If the context is non-null and an error occurs, + * ctx->err will be set to the same value as the return value. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] ctrl_id Algorithm specific control identifier + * + * \retval #AOM_CODEC_OK + * The control request was processed. + * \retval #AOM_CODEC_ERROR + * The control request was not processed. + * \retval #AOM_CODEC_INVALID_PARAM + * The data was not valid. + */ +aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...); + +/*!\brief aom_codec_control wrapper macro (adds type-checking, less flexible) + * + * This macro allows for type safe conversions across the variadic parameter + * to aom_codec_control(). However, it requires the explicit control ID + * be passed in (it cannot be passed in via a variable) -- otherwise a compiler + * error will occur. After the type checking, it calls aom_codec_control. + */ +#define AOM_CODEC_CONTROL_TYPECHECKED(ctx, id, data) \ + aom_codec_control_typechecked_##id(ctx, id, data) /**<\hideinitializer*/ + +/*!\brief Creates typechecking mechanisms for aom_codec_control + * + * It defines a static function with the correctly typed arguments as a wrapper + * to the type-unsafe aom_codec_control function. It also creates a typedef + * for each type. + */ +#define AOM_CTRL_USE_TYPE(id, typ) \ + static aom_codec_err_t aom_codec_control_typechecked_##id( \ + aom_codec_ctx_t *, int, typ) AOM_UNUSED; \ + static aom_codec_err_t aom_codec_control_typechecked_##id( \ + aom_codec_ctx_t *ctx, int ctrl, typ data) { \ + return aom_codec_control(ctx, ctrl, data); \ + } /**<\hideinitializer*/ \ + typedef typ aom_codec_control_type_##id; +/*!@} end Codec Control group */ + +/*!\brief OBU types. */ +typedef enum ATTRIBUTE_PACKED { + OBU_SEQUENCE_HEADER = 1, + OBU_TEMPORAL_DELIMITER = 2, + OBU_FRAME_HEADER = 3, + OBU_TILE_GROUP = 4, + OBU_METADATA = 5, + OBU_FRAME = 6, + OBU_REDUNDANT_FRAME_HEADER = 7, + OBU_TILE_LIST = 8, + OBU_PADDING = 15, +} OBU_TYPE; + +/*!\brief OBU metadata types. 
*/ +typedef enum { + OBU_METADATA_TYPE_AOM_RESERVED_0 = 0, + OBU_METADATA_TYPE_HDR_CLL = 1, + OBU_METADATA_TYPE_HDR_MDCV = 2, + OBU_METADATA_TYPE_SCALABILITY = 3, + OBU_METADATA_TYPE_ITUT_T35 = 4, + OBU_METADATA_TYPE_TIMECODE = 5, +} OBU_METADATA_TYPE; + +/*!\brief Returns string representation of OBU_TYPE. + * + * \param[in] type The OBU_TYPE to convert to string. + */ +const char *aom_obu_type_to_string(OBU_TYPE type); + +/*!@} - end defgroup codec*/ +#ifdef __cplusplus +} +#endif +#endif // AOM_AOM_AOM_CODEC_H_ diff --git a/libs/libaom/src/aom/aom_decoder.h b/libs/libaom/src/aom/aom_decoder.h new file mode 100644 index 000000000..5ce7c7b10 --- /dev/null +++ b/libs/libaom/src/aom/aom_decoder.h @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AOM_AOM_DECODER_H_ +#define AOM_AOM_AOM_DECODER_H_ + +/*!\defgroup decoder Decoder Algorithm Interface + * \ingroup codec + * This abstraction allows applications using this decoder to easily support + * multiple video formats with minimal code duplication. This section describes + * the interface common to all decoders. + * @{ + */ + +/*!\file + * \brief Describes the decoder algorithm interface to applications. + * + * This file describes the interface between an application and a + * video decoder algorithm. + * + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_codec.h" +#include "aom/aom_frame_buffer.h" + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define AOM_DECODER_ABI_VERSION \ + (6 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/ + +/*! \brief Decoder capabilities bitfield + * + * Each decoder advertises the capabilities it supports as part of its + * ::aom_codec_iface_t interface structure. Capabilities are extra interfaces + * or functionality, and are not required to be supported by a decoder. + * + * The available flags are specified by AOM_CODEC_CAP_* defines. + */ +/*!brief Can support external frame buffers */ +#define AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x200000 + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow for + * proper memory allocation. + * + * The available flags are specified by AOM_CODEC_USE_* defines. + */ + +/*!\brief Stream properties + * + * This structure is used to query or set properties of the decoded + * stream. 
+ */ +typedef struct aom_codec_stream_info { + unsigned int w; /**< Width (or 0 for unknown/default) */ + unsigned int h; /**< Height (or 0 for unknown/default) */ + unsigned int is_kf; /**< Current frame is a keyframe */ + unsigned int number_spatial_layers; /**< Number of spatial layers */ + unsigned int number_temporal_layers; /**< Number of temporal layers */ + unsigned int is_annexb; /**< Is Bitstream in Annex-B format */ +} aom_codec_stream_info_t; + +/* REQUIRED FUNCTIONS + * + * The following functions are required to be implemented for all decoders. + * They represent the base case functionality expected of all decoders. + */ + +/*!\brief Initialization Configurations + * + * This structure is used to pass init time configuration options to the + * decoder. + */ +typedef struct aom_codec_dec_cfg { + unsigned int threads; /**< Maximum number of threads to use, default 1 */ + unsigned int w; /**< Width */ + unsigned int h; /**< Height */ + unsigned int allow_lowbitdepth; /**< Allow use of low-bitdepth coding path */ +} aom_codec_dec_cfg_t; /**< alias for struct aom_codec_dec_cfg */ + +/*!\brief Initialize a decoder instance + * + * Initializes a decoder context using the given interface. Applications + * should call the aom_codec_dec_init convenience macro instead of this + * function directly, to ensure that the ABI version number parameter + * is properly initialized. + * + * If the library was configured with cmake -DCONFIG_MULTITHREAD=0, this + * call is not thread safe and should be guarded with a lock if being used + * in a multithreaded context. + * + * \param[in] ctx Pointer to this instance's context. + * \param[in] iface Pointer to the algorithm interface to use. + * \param[in] cfg Configuration to use, if known. May be NULL. + * \param[in] flags Bitfield of AOM_CODEC_USE_* flags + * \param[in] ver ABI version number. Must be set to + * AOM_DECODER_ABI_VERSION + * \retval #AOM_CODEC_OK + * The decoder algorithm initialized. + * \retval #AOM_CODEC_MEM_ERROR + * Memory allocation failed. + */ +aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx, + aom_codec_iface_t *iface, + const aom_codec_dec_cfg_t *cfg, + aom_codec_flags_t flags, int ver); + +/*!\brief Convenience macro for aom_codec_dec_init_ver() + * + * Ensures the ABI version parameter is properly set. + */ +#define aom_codec_dec_init(ctx, iface, cfg, flags) \ + aom_codec_dec_init_ver(ctx, iface, cfg, flags, AOM_DECODER_ABI_VERSION) + +/*!\brief Parse stream info from a buffer + * + * Performs high level parsing of the bitstream. Construction of a decoder + * context is not necessary. Can be used to determine if the bitstream is + * of the proper format, and to extract information from the stream. + * + * \param[in] iface Pointer to the algorithm interface + * \param[in] data Pointer to a block of data to parse + * \param[in] data_sz Size of the data buffer + * \param[in,out] si Pointer to stream info to update. The is_annexb + * member \ref MUST be properly initialized. This + * function sets the rest of the members. + * + * \retval #AOM_CODEC_OK + * Bitstream is parsable and stream information updated. + * \retval #AOM_CODEC_INVALID_PARAM + * One of the arguments is invalid, for example a NULL pointer. + * \retval #AOM_CODEC_UNSUP_BITSTREAM + * The decoder didn't recognize the coded data, or the + * buffer was too short. 
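+ *
+ * As a usage sketch only (the data buffer and error handling are the
+ * application's responsibility); note that is_annexb must be initialized
+ * before the call, as described above:
+ * <pre>
+ *     aom_codec_stream_info_t si;
+ *     si.is_annexb = 0;
+ *     if (aom_codec_peek_stream_info(iface, data, data_sz, &si) ==
+ *         AOM_CODEC_OK) {
+ *       // si.w, si.h and si.is_kf now describe the coded stream.
+ *     }
+ * </pre>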
+ */ +aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface, + const uint8_t *data, size_t data_sz, + aom_codec_stream_info_t *si); + +/*!\brief Return information about the current stream. + * + * Returns information about the stream that has been parsed during decoding. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] si Pointer to stream info to update. + * + * \retval #AOM_CODEC_OK + * Bitstream is parsable and stream information updated. + * \retval #AOM_CODEC_INVALID_PARAM + * One of the arguments is invalid, for example a NULL pointer. + * \retval #AOM_CODEC_UNSUP_BITSTREAM + * The decoder couldn't parse the submitted data. + */ +aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx, + aom_codec_stream_info_t *si); + +/*!\brief Decode data + * + * Processes a buffer of coded data. Encoded data \ref MUST be passed in DTS + * (decode time stamp) order. Frames produced will always be in PTS + * (presentation time stamp) order. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] data Pointer to this block of new coded data. + * \param[in] data_sz Size of the coded data, in bytes. + * \param[in] user_priv Application specific data to associate with + * this frame. + * + * \return Returns #AOM_CODEC_OK if the coded data was processed completely + * and future pictures can be decoded without error. Otherwise, + * see the descriptions of the other error codes in ::aom_codec_err_t + * for recoverability capabilities. + */ +aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data, + size_t data_sz, void *user_priv); + +/*!\brief Decoded frames iterator + * + * Iterates over a list of the frames available for display. The iterator + * storage should be initialized to NULL to start the iteration. Iteration is + * complete when this function returns NULL. + * + * The list of available frames becomes valid upon completion of the + * aom_codec_decode call, and remains valid until the next call to + * aom_codec_decode. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an image, if one is ready for display. Frames + * produced will always be in PTS (presentation time stamp) order. + */ +aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter); + +/*!\defgroup cap_external_frame_buffer External Frame Buffer Functions + * + * The following function is required to be implemented for all decoders + * that advertise the AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability. + * Calling this function for codecs that don't advertise this capability + * will result in an error code being returned, usually AOM_CODEC_INCAPABLE. + * @{ + */ + +/*!\brief Pass in external frame buffers for the decoder to use. + * + * Registers functions to be called when libaom needs a frame buffer + * to decode the current frame and a function to be called when libaom does + * not internally reference the frame buffer. This set function must + * be called before the first call to decode or libaom will assume the + * default behavior of allocating frame buffers internally. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cb_get Pointer to the get callback function + * \param[in] cb_release Pointer to the release callback function + * \param[in] cb_priv Callback's private data + * + * \retval #AOM_CODEC_OK + * External frame buffers will be used by libaom. 
+ * \retval #AOM_CODEC_INVALID_PARAM + * One or more of the callbacks were NULL. + * \retval #AOM_CODEC_ERROR + * Decoder context not initialized. + * \retval #AOM_CODEC_INCAPABLE + * Algorithm not capable of using external frame buffers. + * + * \note + * When decoding AV1, the application may be required to pass in at least + * #AOM_MAXIMUM_WORK_BUFFERS external frame buffers. + */ +aom_codec_err_t aom_codec_set_frame_buffer_functions( + aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get, + aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv); + +/*!@} - end defgroup cap_external_frame_buffer */ + +/*!@} - end defgroup decoder*/ +#ifdef __cplusplus +} +#endif +#endif // AOM_AOM_AOM_DECODER_H_ diff --git a/libs/libaom/src/aom/aom_encoder.h b/libs/libaom/src/aom/aom_encoder.h new file mode 100644 index 000000000..a494c17a4 --- /dev/null +++ b/libs/libaom/src/aom/aom_encoder.h @@ -0,0 +1,1136 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AOM_AOM_ENCODER_H_ +#define AOM_AOM_AOM_ENCODER_H_ + +/*!\defgroup encoder Encoder Algorithm Interface + * \ingroup codec + * This abstraction allows applications using this encoder to easily support + * multiple video formats with minimal code duplication. This section describes + * the interface common to all encoders. + * @{ + */ + +/*!\file + * \brief Describes the encoder algorithm interface to applications. + * + * This file describes the interface between an application and a + * video encoder algorithm. + * + */ +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_codec.h" + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define AOM_ENCODER_ABI_VERSION \ + (8 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/ + +/*! \brief Encoder capabilities bitfield + * + * Each encoder advertises the capabilities it supports as part of its + * ::aom_codec_iface_t interface structure. Capabilities are extra + * interfaces or functionality, and are not required to be supported + * by an encoder. + * + * The available flags are specified by AOM_CODEC_CAP_* defines. + */ +#define AOM_CODEC_CAP_PSNR 0x10000 /**< Can issue PSNR packets */ + +/*! Can support input images at greater than 8 bitdepth. + */ +#define AOM_CODEC_CAP_HIGHBITDEPTH 0x40000 + +/*! \brief Initialization-time Feature Enabling + * + * Certain codec features must be known at initialization time, to allow + * for proper memory allocation. + * + * The available flags are specified by AOM_CODEC_USE_* defines. + */ +#define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */ +/*!\brief Make the encoder output one partition at a time. 
*/ +#define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */ + +/*!\brief Generic fixed size buffer structure + * + * This structure is able to hold a reference to any fixed size buffer. + */ +typedef struct aom_fixed_buf { + void *buf; /**< Pointer to the data. Does NOT own the data! */ + size_t sz; /**< Length of the buffer, in chars */ +} aom_fixed_buf_t; /**< alias for struct aom_fixed_buf */ + +/*!\brief Compressed Frame Flags + * + * This type represents a bitfield containing information about a compressed + * frame that may be useful to an application. The most significant 16 bits + * can be used by an algorithm to provide additional detail, for example to + * support frame types that are codec specific (MPEG-1 D-frames for example) + */ +typedef uint32_t aom_codec_frame_flags_t; +#define AOM_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */ +/*!\brief frame can be dropped without affecting the stream (no future frame + * depends on this one) */ +#define AOM_FRAME_IS_DROPPABLE 0x2 +/*!\brief this is an INTRA_ONLY frame */ +#define AOM_FRAME_IS_INTRAONLY 0x10 +/*!\brief this is an S-frame */ +#define AOM_FRAME_IS_SWITCH 0x20 +/*!\brief this is an error-resilient frame */ +#define AOM_FRAME_IS_ERROR_RESILIENT 0x40 +/*!\brief this is a key-frame dependent recovery-point frame */ +#define AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT 0x80 + +/*!\brief Error Resilient flags + * + * These flags define which error resilient features to enable in the + * encoder. The flags are specified through the + * aom_codec_enc_cfg::g_error_resilient variable. + */ +typedef uint32_t aom_codec_er_flags_t; +/*!\brief Improve resiliency against losses of whole frames */ +#define AOM_ERROR_RESILIENT_DEFAULT 0x1 + +/*!\brief Encoder output packet variants + * + * This enumeration lists the different kinds of data packets that can be + * returned by calls to aom_codec_get_cx_data(). Algorithms \ref MAY + * extend this list to provide additional functionality. + */ +enum aom_codec_cx_pkt_kind { + AOM_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ + AOM_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ + AOM_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ + AOM_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ + AOM_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ +}; + +/*!\brief Encoder output packet + * + * This structure contains the different kinds of output data the encoder + * may produce while compressing a frame. + */ +typedef struct aom_codec_cx_pkt { + enum aom_codec_cx_pkt_kind kind; /**< packet variant */ + union { + struct { + void *buf; /**< compressed data buffer */ + size_t sz; /**< length of compressed data */ + /*!\brief time stamp to show frame (in timebase units) */ + aom_codec_pts_t pts; + /*!\brief duration to show frame (in timebase units) */ + unsigned long duration; + aom_codec_frame_flags_t flags; /**< flags for this frame */ + /*!\brief the partition id defines the decoding order of the partitions. + * Only applicable when "output partition" mode is enabled. 
First + * partition has id 0.*/ + int partition_id; + /*!\brief size of the visible frame in this packet */ + size_t vis_frame_size; + } frame; /**< data for compressed frame packet */ + aom_fixed_buf_t twopass_stats; /**< data for two-pass packet */ + aom_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ + struct aom_psnr_pkt { + unsigned int samples[4]; /**< Number of samples, total/y/u/v */ + uint64_t sse[4]; /**< sum squared error, total/y/u/v */ + double psnr[4]; /**< PSNR, total/y/u/v */ + } psnr; /**< data for PSNR packet */ + aom_fixed_buf_t raw; /**< data for arbitrary packets */ + + /* This packet size is fixed to allow codecs to extend this + * interface without having to manage storage for raw packets, + * i.e., if it's smaller than 128 bytes, you can store in the + * packet list directly. + */ + char pad[128 - sizeof(enum aom_codec_cx_pkt_kind)]; /**< fixed sz */ + } data; /**< packet data */ +} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */ + +/*!\brief Rational Number + * + * This structure holds a fractional value. + */ +typedef struct aom_rational { + int num; /**< fraction numerator */ + int den; /**< fraction denominator */ +} aom_rational_t; /**< alias for struct aom_rational */ + +/*!\brief Multi-pass Encoding Pass */ +enum aom_enc_pass { + AOM_RC_ONE_PASS, /**< Single pass mode */ + AOM_RC_FIRST_PASS, /**< First pass of multi-pass mode */ + AOM_RC_LAST_PASS /**< Final pass of multi-pass mode */ +}; + +/*!\brief Rate control mode */ +enum aom_rc_mode { + AOM_VBR, /**< Variable Bit Rate (VBR) mode */ + AOM_CBR, /**< Constant Bit Rate (CBR) mode */ + AOM_CQ, /**< Constrained Quality (CQ) mode */ + AOM_Q, /**< Constant Quality (Q) mode */ +}; + +/*!\brief Keyframe placement mode. + * + * This enumeration determines whether keyframes are placed automatically by + * the encoder or whether this behavior is disabled. Older releases of this + * SDK were implemented such that AOM_KF_FIXED meant keyframes were disabled. + * This name is confusing for this behavior, so the new symbols to be used + * are AOM_KF_AUTO and AOM_KF_DISABLED. + */ +enum aom_kf_mode { + AOM_KF_FIXED, /**< deprecated, implies AOM_KF_DISABLED */ + AOM_KF_AUTO, /**< Encoder determines optimal placement automatically */ + AOM_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ +}; + +/*!\brief Encoder Config Options + * + * This type allows to enumerate and control flags defined for encoder control + * via config file at runtime. 
+ */ +typedef struct cfg_options { + /*!\brief Indicate init by cfg file + * 0 or 1 + */ + unsigned int init_by_cfg_file; + /*!\brief Superblock size + * 0, 64 or 128 + */ + unsigned int super_block_size; + /*!\brief max partition size + * 8, 16, 32, 64, 128 + */ + unsigned int max_partition_size; + /*!\brief min partition size + * 8, 16, 32, 64, 128 + */ + unsigned int min_partition_size; + /*!\brief disable AB Shape partition type + * + */ + unsigned int disable_ab_partition_type; + /*!\brief disable rectangular partition type + * + */ + unsigned int disable_rect_partition_type; + /*!\brief disable 1:4/4:1 partition type + * + */ + unsigned int disable_1to4_partition_type; + /*!\brief disable flip and identity transform type + * + */ + unsigned int disable_flip_idtx; + /*!\brief disable CDEF filter + * + */ + unsigned int disable_cdef; + /*!\brief disable Loop Restoration Filter + * + */ + unsigned int disable_lr; + /*!\brief disable OBMC + * + */ + unsigned int disable_obmc; + /*!\brief disable Warped Motion + * + */ + unsigned int disable_warp_motion; + /*!\brief disable global motion + * + */ + unsigned int disable_global_motion; + /*!\brief disable dist weighted compound + * + */ + unsigned int disable_dist_wtd_comp; + /*!\brief disable diff weighted compound + * + */ + unsigned int disable_diff_wtd_comp; + /*!\brief disable inter/intra compound + * + */ + unsigned int disable_inter_intra_comp; + /*!\brief disable masked compound + * + */ + unsigned int disable_masked_comp; + /*!\brief disable one sided compound + * + */ + unsigned int disable_one_sided_comp; + /*!\brief disable Palette + * + */ + unsigned int disable_palette; + /*!\brief disable Intra Block Copy + * + */ + unsigned int disable_intrabc; + /*!\brief disable chroma from luma + * + */ + unsigned int disable_cfl; + /*!\brief disable intra smooth mode + * + */ + unsigned int disable_smooth_intra; + /*!\brief disable filter intra + * + */ + unsigned int disable_filter_intra; + /*!\brief disable dual filter + * + */ + unsigned int disable_dual_filter; + /*!\brief disable intra angle delta + * + */ + unsigned int disable_intra_angle_delta; + /*!\brief disable intra edge filter + * + */ + unsigned int disable_intra_edge_filter; + /*!\brief disable 64x64 transform + * + */ + unsigned int disable_tx_64x64; + /*!\brief disable smooth inter/intra + * + */ + unsigned int disable_smooth_inter_intra; + /*!\brief disable inter/inter wedge comp + * + */ + unsigned int disable_inter_inter_wedge; + /*!\brief disable inter/intra wedge comp + * + */ + unsigned int disable_inter_intra_wedge; + /*!\brief disable paeth intra + * + */ + unsigned int disable_paeth_intra; + /*!\brief disable trellis quantization + * + */ + unsigned int disable_trellis_quant; + /*!\brief disable ref frame MV + * + */ + unsigned int disable_ref_frame_mv; + /*!\brief use reduced reference frame set + * + */ + unsigned int reduced_reference_set; + /*!\brief use reduced transform type set + * + */ + unsigned int reduced_tx_type_set; +} cfg_options_t; + +/*!\brief Encoded Frame Flags + * + * This type indicates a bitfield to be passed to aom_codec_encode(), defining + * per-frame boolean values. By convention, bits common to all codecs will be + * named AOM_EFLAG_*, and bits specific to an algorithm will be named + * /algo/_eflag_*. The lower order 16 bits are reserved for common use. 
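+ *
+ * As an illustrative sketch (ctx, img and pts are assumed to be set up by
+ * the caller as in a normal encode loop), a frame can be forced to be a
+ * keyframe by passing #AOM_EFLAG_FORCE_KF to aom_codec_encode():
+ * \code
+ *   aom_enc_frame_flags_t flags = AOM_EFLAG_FORCE_KF;
+ *   aom_codec_err_t res = aom_codec_encode(&ctx, img, pts, 1, flags);
+ *   // res != AOM_CODEC_OK indicates the frame was not accepted.
+ * \endcode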
+ */ +typedef long aom_enc_frame_flags_t; +#define AOM_EFLAG_FORCE_KF (1 << 0) /**< Force this frame to be a keyframe */ + +/*!\brief Encoder configuration structure + * + * This structure contains the encoder settings that have common representations + * across all codecs. This doesn't imply that all codecs support all features, + * however. + */ +typedef struct aom_codec_enc_cfg { + /* + * generic settings (g) + */ + + /*!\brief Algorithm specific "usage" value + * + * Algorithms may define multiple values for usage, which may convey the + * intent of how the application intends to use the stream. If this value + * is non-zero, consult the documentation for the codec to determine its + * meaning. + */ + unsigned int g_usage; + + /*!\brief Maximum number of threads to use + * + * For multi-threaded implementations, use no more than this number of + * threads. The codec may use fewer threads than allowed. The value + * 0 is equivalent to the value 1. + */ + unsigned int g_threads; + + /*!\brief Bitstream profile to use + * + * Some codecs support a notion of multiple bitstream profiles. Typically + * this maps to a set of features that are turned on or off. Often the + * profile to use is determined by the features of the intended decoder. + * Consult the documentation for the codec to determine the valid values + * for this parameter, or set to zero for a sane default. + */ + unsigned int g_profile; /**< profile of bitstream to use */ + + /*!\brief Width of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. + */ + unsigned int g_w; + + /*!\brief Height of the frame + * + * This value identifies the presentation resolution of the frame, + * in pixels. Note that the frames passed as input to the encoder must + * have this resolution. Frames will be presented by the decoder in this + * resolution, independent of any spatial resampling the encoder may do. + */ + unsigned int g_h; + + /*!\brief Max number of frames to encode + * + */ + unsigned int g_limit; + + /*!\brief Forced maximum width of the frame + * + * If this value is non-zero then it is used to force the maximum frame + * width written in write_sequence_header(). + */ + unsigned int g_forced_max_frame_width; + + /*!\brief Forced maximum height of the frame + * + * If this value is non-zero then it is used to force the maximum frame + * height written in write_sequence_header(). + */ + unsigned int g_forced_max_frame_height; + + /*!\brief Bit-depth of the codec + * + * This value identifies the bit_depth of the codec, + * Only certain bit-depths are supported as identified in the + * aom_bit_depth_t enum. + */ + aom_bit_depth_t g_bit_depth; + + /*!\brief Bit-depth of the input frames + * + * This value identifies the bit_depth of the input frames in bits. + * Note that the frames passed as input to the encoder must have + * this bit-depth. + */ + unsigned int g_input_bit_depth; + + /*!\brief Stream timebase units + * + * Indicates the smallest interval of time, in seconds, used by the stream. + * For fixed frame rate material, or variable frame rate material where + * frames are timed at a multiple of a given clock (ex: video capture), + * the \ref RECOMMENDED method is to set the timebase to the reciprocal + * of the frame rate (ex: 1001/30000 for 29.970 Hz NTSC). 
This allows the + * pts to correspond to the frame number, which can be handy. For + * re-encoding video from containers with absolute time timestamps, the + * \ref RECOMMENDED method is to set the timebase to that of the parent + * container or multimedia framework (ex: 1/1000 for ms, as in FLV). + */ + struct aom_rational g_timebase; + + /*!\brief Enable error resilient modes. + * + * The error resilient bitfield indicates to the encoder which features + * it should enable to take measures for streaming over lossy or noisy + * links. + */ + aom_codec_er_flags_t g_error_resilient; + + /*!\brief Multi-pass Encoding Mode + * + * This value should be set to the current phase for multi-pass encoding. + * For single pass, set to #AOM_RC_ONE_PASS. + */ + enum aom_enc_pass g_pass; + + /*!\brief Allow lagged encoding + * + * If set, this value allows the encoder to consume a number of input + * frames before producing output frames. This allows the encoder to + * base decisions for the current frame on future frames. This does + * increase the latency of the encoding pipeline, so it is not appropriate + * in all situations (ex: realtime encoding). + * + * Note that this is a maximum value -- the encoder may produce frames + * sooner than the given limit. Set this value to 0 to disable this + * feature. + */ + unsigned int g_lag_in_frames; + + /* + * rate control settings (rc) + */ + + /*!\brief Temporal resampling configuration, if supported by the codec. + * + * Temporal resampling allows the codec to "drop" frames as a strategy to + * meet its target data rate. This can cause temporal discontinuities in + * the encoded video, which may appear as stuttering during playback. This + * trade-off is often acceptable, but for many applications is not. It can + * be disabled in these cases. + * + * Note that not all codecs support this feature. All aom AVx codecs do. + * For other codecs, consult the documentation for that algorithm. + * + * This threshold is described as a percentage of the target data buffer. + * When the data buffer falls below this percentage of fullness, a + * dropped frame is indicated. Set the threshold to zero (0) to disable + * this feature. + */ + unsigned int rc_dropframe_thresh; + + /*!\brief Mode for spatial resampling, if supported by the codec. + * + * Spatial resampling allows the codec to compress a lower resolution + * version of the frame, which is then upscaled by the decoder to the + * correct presentation resolution. This increases visual quality at + * low data rates, at the expense of CPU time on the encoder/decoder. + */ + unsigned int rc_resize_mode; + + /*!\brief Frame resize denominator. + * + * The denominator for resize to use, assuming 8 as the numerator. + * + * Valid denominators are 8 - 16 for now. + */ + unsigned int rc_resize_denominator; + + /*!\brief Keyframe resize denominator. + * + * The denominator for resize to use, assuming 8 as the numerator. + * + * Valid denominators are 8 - 16 for now. + */ + unsigned int rc_resize_kf_denominator; + + /*!\brief Frame super-resolution scaling mode. + * + * Similar to spatial resampling, frame super-resolution integrates + * upscaling after the encode/decode process. Taking control of upscaling and + * using restoration filters should allow it to outperform normal resizing. + * + * Valid values are 0 to 4 as defined in enum SUPERRES_MODE. + */ + unsigned int rc_superres_mode; + + /*!\brief Frame super-resolution denominator. + * + * The denominator for superres to use. 
If fixed it will only change if the + * cumulative scale change over resizing and superres is greater than 1/2; + * this forces superres to reduce scaling. + * + * Valid denominators are 8 to 16. + * + * Used only by SUPERRES_FIXED. + */ + unsigned int rc_superres_denominator; + + /*!\brief Keyframe super-resolution denominator. + * + * The denominator for superres to use. If fixed it will only change if the + * cumulative scale change over resizing and superres is greater than 1/2; + * this forces superres to reduce scaling. + * + * Valid denominators are 8 - 16 for now. + */ + unsigned int rc_superres_kf_denominator; + + /*!\brief Frame super-resolution q threshold. + * + * The q level threshold after which superres is used. + * Valid values are 1 to 63. + * + * Used only by SUPERRES_QTHRESH + */ + unsigned int rc_superres_qthresh; + + /*!\brief Keyframe super-resolution q threshold. + * + * The q level threshold after which superres is used for key frames. + * Valid values are 1 to 63. + * + * Used only by SUPERRES_QTHRESH + */ + unsigned int rc_superres_kf_qthresh; + + /*!\brief Rate control algorithm to use. + * + * Indicates whether the end usage of this stream is to be streamed over + * a bandwidth constrained link, indicating that Constant Bit Rate (CBR) + * mode should be used, or whether it will be played back on a high + * bandwidth link, as from a local disk, where higher variations in + * bitrate are acceptable. + */ + enum aom_rc_mode rc_end_usage; + + /*!\brief Two-pass stats buffer. + * + * A buffer containing all of the stats packets produced in the first + * pass, concatenated. + */ + aom_fixed_buf_t rc_twopass_stats_in; + + /*!\brief first pass mb stats buffer. + * + * A buffer containing all of the first pass mb stats packets produced + * in the first pass, concatenated. + */ + aom_fixed_buf_t rc_firstpass_mb_stats_in; + + /*!\brief Target data rate + * + * Target bandwidth to use for this stream, in kilobits per second. + */ + unsigned int rc_target_bitrate; + + /* + * quantizer settings + */ + + /*!\brief Minimum (Best Quality) Quantizer + * + * The quantizer is the most direct control over the quality of the + * encoded image. The range of valid values for the quantizer is codec + * specific. Consult the documentation for the codec to determine the + * values to use. To determine the range programmatically, call + * aom_codec_enc_config_default() with a usage value of 0. + */ + unsigned int rc_min_quantizer; + + /*!\brief Maximum (Worst Quality) Quantizer + * + * The quantizer is the most direct control over the quality of the + * encoded image. The range of valid values for the quantizer is codec + * specific. Consult the documentation for the codec to determine the + * values to use. To determine the range programmatically, call + * aom_codec_enc_config_default() with a usage value of 0. + */ + unsigned int rc_max_quantizer; + + /* + * bitrate tolerance + */ + + /*!\brief Rate control adaptation undershoot control + * + * This value, expressed as a percentage of the target bitrate, + * controls the maximum allowed adaptation speed of the codec. + * This factor controls the maximum amount of bits that can + * be subtracted from the target bitrate in order to compensate + * for prior overshoot. + * + * Valid values in the range 0-1000. + */ + unsigned int rc_undershoot_pct; + + /*!\brief Rate control adaptation overshoot control + * + * This value, expressed as a percentage of the target bitrate, + * controls the maximum allowed adaptation speed of the codec. 
+ * This factor controls the maximum amount of bits that can + * be added to the target bitrate in order to compensate for + * prior undershoot. + * + * Valid values in the range 0-1000. + */ + unsigned int rc_overshoot_pct; + + /* + * decoder buffer model parameters + */ + + /*!\brief Decoder Buffer Size + * + * This value indicates the amount of data that may be buffered by the + * decoding application. Note that this value is expressed in units of + * time (milliseconds). For example, a value of 5000 indicates that the + * client will buffer (at least) 5000ms worth of encoded data. Use the + * target bitrate (#rc_target_bitrate) to convert to bits/bytes, if + * necessary. + */ + unsigned int rc_buf_sz; + + /*!\brief Decoder Buffer Initial Size + * + * This value indicates the amount of data that will be buffered by the + * decoding application prior to beginning playback. This value is + * expressed in units of time (milliseconds). Use the target bitrate + * (#rc_target_bitrate) to convert to bits/bytes, if necessary. + */ + unsigned int rc_buf_initial_sz; + + /*!\brief Decoder Buffer Optimal Size + * + * This value indicates the amount of data that the encoder should try + * to maintain in the decoder's buffer. This value is expressed in units + * of time (milliseconds). Use the target bitrate (#rc_target_bitrate) + * to convert to bits/bytes, if necessary. + */ + unsigned int rc_buf_optimal_sz; + + /* + * 2 pass rate control parameters + */ + + /*!\brief Two-pass mode CBR/VBR bias + * + * Bias, expressed on a scale of 0 to 100, for determining target size + * for the current frame. The value 0 indicates the optimal CBR mode + * value should be used. The value 100 indicates the optimal VBR mode + * value should be used. Values in between indicate which way the + * encoder should "lean." + */ + unsigned int rc_2pass_vbr_bias_pct; + + /*!\brief Two-pass mode per-GOP minimum bitrate + * + * This value, expressed as a percentage of the target bitrate, indicates + * the minimum bitrate to be used for a single GOP (aka "section") + */ + unsigned int rc_2pass_vbr_minsection_pct; + + /*!\brief Two-pass mode per-GOP maximum bitrate + * + * This value, expressed as a percentage of the target bitrate, indicates + * the maximum bitrate to be used for a single GOP (aka "section") + */ + unsigned int rc_2pass_vbr_maxsection_pct; + + /* + * keyframing settings (kf) + */ + + /*!\brief Option to enable forward reference key frame + * + */ + int fwd_kf_enabled; + + /*!\brief Keyframe placement mode + * + * This value indicates whether the encoder should place keyframes at a + * fixed interval, or determine the optimal placement automatically + * (as governed by the #kf_min_dist and #kf_max_dist parameters) + */ + enum aom_kf_mode kf_mode; + + /*!\brief Keyframe minimum interval + * + * This value, expressed as a number of frames, prevents the encoder from + * placing a keyframe nearer than kf_min_dist to the previous keyframe. At + * least kf_min_dist frames non-keyframes will be coded before the next + * keyframe. Set kf_min_dist equal to kf_max_dist for a fixed interval. + */ + unsigned int kf_min_dist; + + /*!\brief Keyframe maximum interval + * + * This value, expressed as a number of frames, forces the encoder to code + * a keyframe if one has not been coded in the last kf_max_dist frames. + * A value of 0 implies all frames will be keyframes. Set kf_min_dist + * equal to kf_max_dist for a fixed interval. 
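+ *
+ * For example, a fixed two-second keyframe interval at 30 fps could be
+ * requested as follows (an illustrative sketch; cfg is assumed to be an
+ * aom_codec_enc_cfg_t obtained from aom_codec_enc_config_default()):
+ * \code
+ *   cfg.kf_mode = AOM_KF_AUTO;
+ *   cfg.kf_min_dist = 60;  // 2 seconds at 30 fps
+ *   cfg.kf_max_dist = 60;  // min == max gives a fixed interval
+ * \endcode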
+ */ + unsigned int kf_max_dist; + + /*!\brief sframe interval + * + * This value, expressed as a number of frames, forces the encoder to code + * an S-Frame every sframe_dist frames. + */ + unsigned int sframe_dist; + + /*!\brief sframe insertion mode + * + * This value must be set to 1 or 2, and tells the encoder how to insert + * S-Frames. It will only have an effect if sframe_dist != 0. + * + * If altref is enabled: + * - if sframe_mode == 1, the considered frame will be made into an + * S-Frame only if it is an altref frame + * - if sframe_mode == 2, the next altref frame will be made into an + * S-Frame. + * + * Otherwise: the considered frame will be made into an S-Frame. + */ + unsigned int sframe_mode; + + /*!\brief Tile coding mode + * + * This value indicates the tile coding mode. + * A value of 0 implies a normal non-large-scale tile coding. A value of 1 + * implies a large-scale tile coding. + */ + unsigned int large_scale_tile; + + /*!\brief Monochrome mode + * + * If this is nonzero, the encoder will generate a monochrome stream + * with no chroma planes. + */ + unsigned int monochrome; + + /*!\brief full_still_picture_hdr + * + * If this is nonzero, the encoder will generate a full header even for + * still picture encoding. if zero, a reduced header is used for still + * picture. This flag has no effect when a regular video with more than + * a single frame is encoded. + */ + unsigned int full_still_picture_hdr; + + /*!\brief Bitstream syntax mode + * + * This value indicates the bitstream syntax mode. + * A value of 0 indicates bitstream is saved as Section 5 bitstream. A value + * of 1 indicates the bitstream is saved in Annex-B format + */ + unsigned int save_as_annexb; + + /*!\brief Number of explicit tile widths specified + * + * This value indicates the number of tile widths specified + * A value of 0 implies no tile widths are specified. + * Tile widths are given in the array tile_widths[] + */ + int tile_width_count; + + /*!\brief Number of explicit tile heights specified + * + * This value indicates the number of tile heights specified + * A value of 0 implies no tile heights are specified. + * Tile heights are given in the array tile_heights[] + */ + int tile_height_count; + +/*!\brief Maximum number of tile widths in tile widths array + * + * This define gives the maximum number of elements in the tile_widths array. + */ +#define MAX_TILE_WIDTHS 64 // maximum tile width array length + + /*!\brief Array of specified tile widths + * + * This array specifies tile widths (and may be empty) + * The number of widths specified is given by tile_width_count + */ + int tile_widths[MAX_TILE_WIDTHS]; + +/*!\brief Maximum number of tile heights in tile heights array. + * + * This define gives the maximum number of elements in the tile_heights array. + */ +#define MAX_TILE_HEIGHTS 64 // maximum tile height array length + + /*!\brief Array of specified tile heights + * + * This array specifies tile heights (and may be empty) + * The number of heights specified is given by tile_height_count + */ + int tile_heights[MAX_TILE_HEIGHTS]; + + /*!\brief Whether encoder should use fixed QP offsets. + * + * If a value of 1 is provided, encoder will use fixed QP offsets for frames + * at different levels of the pyramid. + * - If 'fixed_qp_offsets' is also provided, encoder will use the given + * offsets + * - If not, encoder will select the fixed offsets based on the cq-level + * provided. 
+ *   If a value of 0 is provided and fixed_qp_offsets are not provided,
+ *   encoder will NOT use fixed QP offsets.
+ *   Note: This option is only relevant for --end-usage=q.
+ */
+  unsigned int use_fixed_qp_offsets;
+
+/*!\brief Number of fixed QP offsets
+ *
+ * This defines the number of elements in the fixed_qp_offsets array.
+ */
+#define FIXED_QP_OFFSET_COUNT 5
+
+  /*!\brief Array of fixed QP offsets
+   *
+   * This array specifies fixed QP offsets (range: 0 to 63) for frames at
+   * different levels of the pyramid. It is a comma-separated list of 5 values:
+   * - QP offset for keyframe
+   * - QP offset for ALTREF frame
+   * - QP offset for 1st level internal ARF
+   * - QP offset for 2nd level internal ARF
+   * - QP offset for 3rd level internal ARF
+   * Notes:
+   * - QP offset for leaf level frames is not explicitly specified. These
+   *   frames use the worst quality allowed (--cq-level).
+   * - This option is only relevant for --end-usage=q.
+   */
+  int fixed_qp_offsets[FIXED_QP_OFFSET_COUNT];
+
+  /*!\brief Options defined per config file
+   */
+  cfg_options_t encoder_cfg;
+} aom_codec_enc_cfg_t; /**< alias for struct aom_codec_enc_cfg */
+
+/*!\brief Initialize an encoder instance
+ *
+ * Initializes an encoder context using the given interface. Applications
+ * should call the aom_codec_enc_init convenience macro instead of this
+ * function directly, to ensure that the ABI version number parameter
+ * is properly initialized.
+ *
+ * If the library was configured with --disable-multithread, this call
+ * is not thread safe and should be guarded with a lock if being used
+ * in a multithreaded context.
+ *
+ * \param[in]    ctx     Pointer to this instance's context.
+ * \param[in]    iface   Pointer to the algorithm interface to use.
+ * \param[in]    cfg     Configuration to use, if known.
+ * \param[in]    flags   Bitfield of AOM_CODEC_USE_* flags
+ * \param[in]    ver     ABI version number. Must be set to
+ *                       AOM_ENCODER_ABI_VERSION
+ * \retval #AOM_CODEC_OK
+ *     The encoder algorithm was initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ *     Memory allocation failed.
+ */
+aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx,
+                                       aom_codec_iface_t *iface,
+                                       const aom_codec_enc_cfg_t *cfg,
+                                       aom_codec_flags_t flags, int ver);
+
+/*!\brief Convenience macro for aom_codec_enc_init_ver()
+ *
+ * Ensures the ABI version parameter is properly set.
+ */
+#define aom_codec_enc_init(ctx, iface, cfg, flags) \
+  aom_codec_enc_init_ver(ctx, iface, cfg, flags, AOM_ENCODER_ABI_VERSION)
+
+/*!\brief Get the default configuration for a usage.
+ *
+ * Initializes an encoder configuration structure with default values. Supports
+ * the notion of "usages" so that an algorithm may offer different default
+ * settings depending on the user's intended goal. This function \ref SHOULD
+ * be called by all applications to initialize the configuration structure
+ * before specializing the configuration with application specific values.
+ *
+ * \param[in]    iface     Pointer to the algorithm interface to use.
+ * \param[out]   cfg       Configuration buffer to populate.
+ * \param[in]    usage     Algorithm specific usage value. For AV1, must be
+ *                         set to AOM_USAGE_GOOD_QUALITY (0) or
+ *                         AOM_USAGE_REALTIME (1).
+ *
+ * \retval #AOM_CODEC_OK
+ *     The configuration was populated.
+ * \retval #AOM_CODEC_INCAPABLE
+ *     Interface is not an encoder interface.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ *     A parameter was NULL, or the usage value was not recognized.
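+ *
+ * A minimal usage sketch (iface is assumed to be a valid encoder interface,
+ * e.g. the value returned by aom_codec_av1_cx()):
+ * \code
+ *   aom_codec_enc_cfg_t cfg;
+ *   aom_codec_ctx_t ctx;
+ *   if (aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_GOOD_QUALITY) !=
+ *       AOM_CODEC_OK)
+ *     return -1;  // error handling is caller-defined
+ *   cfg.g_w = 640;
+ *   cfg.g_h = 480;
+ *   cfg.g_timebase.num = 1;
+ *   cfg.g_timebase.den = 30;
+ *   cfg.rc_target_bitrate = 1000;  // kilobits per second
+ *   if (aom_codec_enc_init(&ctx, iface, &cfg, 0) != AOM_CODEC_OK)
+ *     return -1;
+ * \endcode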
+ */ +aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface, + aom_codec_enc_cfg_t *cfg, + unsigned int usage); + +/*!\brief Set or change configuration + * + * Reconfigures an encoder instance according to the given configuration. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] cfg Configuration buffer to use + * + * \retval #AOM_CODEC_OK + * The configuration was populated. + * \retval #AOM_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #AOM_CODEC_INVALID_PARAM + * A parameter was NULL, or the usage value was not recognized. + */ +aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx, + const aom_codec_enc_cfg_t *cfg); + +/*!\brief Get global stream headers + * + * Retrieves a stream level global header packet, if supported by the codec. + * Calls to this function should be deferred until all configuration information + * has been passed to libaom. Otherwise the global header data may be + * invalidated by additional configuration changes. + * + * The AV1 implementation of this function returns an OBU. The OBU returned is + * in Low Overhead Bitstream Format. Specifically, the obu_has_size_field bit is + * set, and the buffer contains the obu_size field for the returned OBU. + * + * \param[in] ctx Pointer to this instance's context + * + * \retval NULL + * Encoder does not support global header, or an error occurred while + * generating the global header. + * + * \retval Non-NULL + * Pointer to buffer containing global header packet. The caller owns the + * memory associated with this buffer, and must free the 'buf' member of the + * aom_fixed_buf_t as well as the aom_fixed_buf_t pointer. Memory returned + * must be freed via call to free(). + */ +aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx); + +/*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */ +#define AOM_USAGE_GOOD_QUALITY (0) +/*!\brief usage parameter analogous to AV1 REALTIME mode. */ +#define AOM_USAGE_REALTIME (1) + +/*!\brief Encode a frame + * + * Encodes a video frame at the given "presentation time." The presentation + * time stamp (PTS) \ref MUST be strictly increasing. + * + * When the last frame has been passed to the encoder, this function should + * continue to be called, with the img parameter set to NULL. This will + * signal the end-of-stream condition to the encoder and allow it to encode + * any held buffers. Encoding is complete when aom_codec_encode() is called + * and aom_codec_get_cx_data() returns no data. + * + * \param[in] ctx Pointer to this instance's context + * \param[in] img Image data to encode, NULL to flush. + * \param[in] pts Presentation time stamp, in timebase units. + * \param[in] duration Duration to show frame, in timebase units. + * \param[in] flags Flags to use for encoding this frame. + * + * \retval #AOM_CODEC_OK + * The configuration was populated. + * \retval #AOM_CODEC_INCAPABLE + * Interface is not an encoder interface. + * \retval #AOM_CODEC_INVALID_PARAM + * A parameter was NULL, the image format is unsupported, etc. + */ +aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img, + aom_codec_pts_t pts, unsigned long duration, + aom_enc_frame_flags_t flags); + +/*!\brief Set compressed data output buffer + * + * Sets the buffer that the codec should output the compressed data + * into. This call effectively sets the buffer pointer returned in the + * next AOM_CODEC_CX_FRAME_PKT packet. Subsequent packets will be + * appended into this buffer. 
The buffer is preserved across frames, + * so applications must periodically call this function after flushing + * the accumulated compressed data to disk or to the network to reset + * the pointer to the buffer's head. + * + * `pad_before` bytes will be skipped before writing the compressed + * data, and `pad_after` bytes will be appended to the packet. The size + * of the packet will be the sum of the size of the actual compressed + * data, pad_before, and pad_after. The padding bytes will be preserved + * (not overwritten). + * + * Note that calling this function does not guarantee that the returned + * compressed data will be placed into the specified buffer. In the + * event that the encoded data will not fit into the buffer provided, + * the returned packet \ref MAY point to an internal buffer, as it would + * if this call were never used. In this event, the output packet will + * NOT have any padding, and the application must free space and copy it + * to the proper place. This is of particular note in configurations + * that may output multiple packets for a single encoded frame (e.g., lagged + * encoding) or if the application does not reset the buffer periodically. + * + * Applications may restore the default behavior of the codec providing + * the compressed data buffer by calling this function with a NULL + * buffer. + * + * Applications \ref MUSTNOT call this function during iteration of + * aom_codec_get_cx_data(). + * + * \param[in] ctx Pointer to this instance's context + * \param[in] buf Buffer to store compressed data into + * \param[in] pad_before Bytes to skip before writing compressed data + * \param[in] pad_after Bytes to skip after writing compressed data + * + * \retval #AOM_CODEC_OK + * The buffer was set successfully. + * \retval #AOM_CODEC_INVALID_PARAM + * A parameter was NULL, the image format is unsupported, etc. + */ +aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx, + const aom_fixed_buf_t *buf, + unsigned int pad_before, + unsigned int pad_after); + +/*!\brief Encoded data iterator + * + * Iterates over a list of data packets to be passed from the encoder to the + * application. The different kinds of packets available are enumerated in + * #aom_codec_cx_pkt_kind. + * + * #AOM_CODEC_CX_FRAME_PKT packets should be passed to the application's + * muxer. Multiple compressed frames may be in the list. + * #AOM_CODEC_STATS_PKT packets should be appended to a global buffer. + * + * The application \ref MUST silently ignore any packet kinds that it does + * not recognize or support. + * + * The data buffers returned from this function are only guaranteed to be + * valid until the application makes another call to any aom_codec_* function. + * + * \param[in] ctx Pointer to this instance's context + * \param[in,out] iter Iterator storage, initialized to NULL + * + * \return Returns a pointer to an output data packet (compressed frame data, + * two-pass statistics, etc.) or NULL to signal end-of-list. + * + */ +const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx, + aom_codec_iter_t *iter); + +/*!\brief Get Preview Frame + * + * Returns an image that can be used as a preview. Shows the image as it would + * exist at the decompressor. The application \ref MUST NOT write into this + * image buffer. + * + * \param[in] ctx Pointer to this instance's context + * + * \return Returns a pointer to a preview image, or NULL if no image is + * available. 
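+ *
+ * An illustrative sketch of one encode iteration (ctx, img and pts are
+ * assumed to be initialized by the caller; write_packet() stands in for a
+ * hypothetical application muxer):
+ * \code
+ *   aom_codec_iter_t iter = NULL;
+ *   const aom_codec_cx_pkt_t *pkt;
+ *   if (aom_codec_encode(&ctx, img, pts, 1, 0) != AOM_CODEC_OK)
+ *     return -1;
+ *   while ((pkt = aom_codec_get_cx_data(&ctx, &iter)) != NULL) {
+ *     if (pkt->kind == AOM_CODEC_CX_FRAME_PKT)
+ *       write_packet(pkt->data.frame.buf, pkt->data.frame.sz);
+ *   }
+ *   const aom_image_t *preview = aom_codec_get_preview_frame(&ctx);
+ *   // preview is NULL when no image is available.
+ * \endcode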
+ *
+ */
+const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx);
+
+/*!@} - end defgroup encoder*/
+#ifdef __cplusplus
+}
+#endif
+#endif  // AOM_AOM_AOM_ENCODER_H_
diff --git a/libs/libaom/src/aom/aom_frame_buffer.h b/libs/libaom/src/aom/aom_frame_buffer.h
new file mode 100644
index 000000000..a715645a7
--- /dev/null
+++ b/libs/libaom/src/aom/aom_frame_buffer.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_AOM_FRAME_BUFFER_H_
+#define AOM_AOM_AOM_FRAME_BUFFER_H_
+
+/*!\file
+ * \brief Describes the decoder external frame buffer interface.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom/aom_integer.h"
+
+/*!\brief The maximum number of work buffers used by libaom.
+ *  Supports up to 8 threads decoding video in parallel.
+ *  Each thread will use one work buffer.
+ * TODO(hkuang): Add support to set number of worker threads dynamically.
+ */
+#define AOM_MAXIMUM_WORK_BUFFERS 8
+
+/*!\brief The maximum number of reference buffers that an AV1 encoder may use.
+ */
+#define AOM_MAXIMUM_REF_BUFFERS 8
+
+/*!\brief External frame buffer
+ *
+ * This structure holds allocated frame buffers used by the decoder.
+ */
+typedef struct aom_codec_frame_buffer {
+  uint8_t *data; /**< Pointer to the data buffer */
+  size_t size;   /**< Size of data in bytes */
+  void *priv;    /**< Frame's private data */
+} aom_codec_frame_buffer_t;
+
+/*!\brief get frame buffer callback prototype
+ *
+ * This callback is invoked by the decoder to retrieve data for the frame
+ * buffer in order for the decode call to complete. The callback must
+ * allocate at least min_size in bytes and assign it to fb->data. The callback
+ * must zero out all the data allocated. Then the callback must set fb->size
+ * to the allocated size. The application does not need to align the allocated
+ * data. The callback is triggered when the decoder needs a frame buffer to
+ * decode a compressed image into. This function may be called more than once
+ * for every call to aom_codec_decode. The application may set fb->priv to
+ * some data which will be passed back in the aom_image_t and the release
+ * function call. |fb| is guaranteed to not be NULL. On success the callback
+ * must return 0. On failure the callback must return a value less than 0.
+ *
+ * \param[in] priv         Callback's private data
+ * \param[in] min_size     Size in bytes needed by the buffer
+ * \param[in,out] fb       Pointer to aom_codec_frame_buffer_t
+ */
+typedef int (*aom_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size,
+                                            aom_codec_frame_buffer_t *fb);
+
+/*!\brief release frame buffer callback prototype
+ *
+ * This callback is invoked by the decoder when the frame buffer is not
+ * referenced by any other buffers. |fb| is guaranteed to not be NULL. On
+ * success the callback must return 0. On failure the callback must return
+ * a value less than 0.
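+ *
+ * An illustrative sketch of a matching callback pair (the my_* functions
+ * are hypothetical application code; calloc/free come from <stdlib.h>):
+ * \code
+ *   static int my_get_fb(void *priv, size_t min_size,
+ *                        aom_codec_frame_buffer_t *fb) {
+ *     (void)priv;
+ *     fb->data = (uint8_t *)calloc(min_size, 1);  // must be zeroed
+ *     if (!fb->data) return -1;
+ *     fb->size = min_size;
+ *     fb->priv = NULL;
+ *     return 0;
+ *   }
+ *   static int my_release_fb(void *priv, aom_codec_frame_buffer_t *fb) {
+ *     (void)priv;
+ *     free(fb->data);
+ *     fb->data = NULL;
+ *     return 0;
+ *   }
+ * \endcode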
+ * + * \param[in] priv Callback's private data + * \param[in] fb Pointer to aom_codec_frame_buffer_t + */ +typedef int (*aom_release_frame_buffer_cb_fn_t)(void *priv, + aom_codec_frame_buffer_t *fb); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOM_FRAME_BUFFER_H_ diff --git a/libs/libaom/src/aom/aom_image.h b/libs/libaom/src/aom/aom_image.h new file mode 100644 index 000000000..bb6973f9c --- /dev/null +++ b/libs/libaom/src/aom/aom_image.h @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Describes the aom image descriptor and associated operations + * + */ +#ifndef AOM_AOM_AOM_IMAGE_H_ +#define AOM_AOM_AOM_IMAGE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom/aom_integer.h" + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define AOM_IMAGE_ABI_VERSION (9) /**<\hideinitializer*/ + +#define AOM_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */ +#define AOM_IMG_FMT_UV_FLIP 0x200 /**< V plane precedes U in memory. */ +/** 0x400 used to signal alpha channel, skipping for backwards compatibility. */ +#define AOM_IMG_FMT_HIGHBITDEPTH 0x800 /**< Image uses 16bit framebuffer. 
*/
+
+/*!\brief List of supported image formats */
+typedef enum aom_img_fmt {
+  AOM_IMG_FMT_NONE,
+  AOM_IMG_FMT_YV12 =
+      AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP | 1, /**< planar YVU */
+  AOM_IMG_FMT_I420 = AOM_IMG_FMT_PLANAR | 2,
+  AOM_IMG_FMT_AOMYV12 = AOM_IMG_FMT_PLANAR | AOM_IMG_FMT_UV_FLIP |
+                        3, /**< planar 4:2:0 format with aom color space */
+  AOM_IMG_FMT_AOMI420 = AOM_IMG_FMT_PLANAR | 4,
+  AOM_IMG_FMT_I422 = AOM_IMG_FMT_PLANAR | 5,
+  AOM_IMG_FMT_I444 = AOM_IMG_FMT_PLANAR | 6,
+  AOM_IMG_FMT_I42016 = AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH,
+  AOM_IMG_FMT_YV1216 = AOM_IMG_FMT_YV12 | AOM_IMG_FMT_HIGHBITDEPTH,
+  AOM_IMG_FMT_I42216 = AOM_IMG_FMT_I422 | AOM_IMG_FMT_HIGHBITDEPTH,
+  AOM_IMG_FMT_I44416 = AOM_IMG_FMT_I444 | AOM_IMG_FMT_HIGHBITDEPTH,
+} aom_img_fmt_t; /**< alias for enum aom_img_fmt */
+
+/*!\brief List of supported color primaries */
+typedef enum aom_color_primaries {
+  AOM_CICP_CP_RESERVED_0 = 0,   /**< For future use */
+  AOM_CICP_CP_BT_709 = 1,       /**< BT.709 */
+  AOM_CICP_CP_UNSPECIFIED = 2,  /**< Unspecified */
+  AOM_CICP_CP_RESERVED_3 = 3,   /**< For future use */
+  AOM_CICP_CP_BT_470_M = 4,     /**< BT.470 System M (historical) */
+  AOM_CICP_CP_BT_470_B_G = 5,   /**< BT.470 System B, G (historical) */
+  AOM_CICP_CP_BT_601 = 6,       /**< BT.601 */
+  AOM_CICP_CP_SMPTE_240 = 7,    /**< SMPTE 240 */
+  AOM_CICP_CP_GENERIC_FILM =
+      8, /**< Generic film (color filters using illuminant C) */
+  AOM_CICP_CP_BT_2020 = 9,      /**< BT.2020, BT.2100 */
+  AOM_CICP_CP_XYZ = 10,         /**< SMPTE 428 (CIE 1931 XYZ) */
+  AOM_CICP_CP_SMPTE_431 = 11,   /**< SMPTE RP 431-2 */
+  AOM_CICP_CP_SMPTE_432 = 12,   /**< SMPTE EG 432-1 */
+  AOM_CICP_CP_RESERVED_13 = 13, /**< For future use (values 13 - 21) */
+  AOM_CICP_CP_EBU_3213 = 22,    /**< EBU Tech. 3213-E */
+  AOM_CICP_CP_RESERVED_23 = 23  /**< For future use (values 23 - 255) */
+} aom_color_primaries_t; /**< alias for enum aom_color_primaries */
+
+/*!\brief List of supported transfer functions */
+typedef enum aom_transfer_characteristics {
+  AOM_CICP_TC_RESERVED_0 = 0,  /**< For future use */
+  AOM_CICP_TC_BT_709 = 1,      /**< BT.709 */
+  AOM_CICP_TC_UNSPECIFIED = 2, /**< Unspecified */
+  AOM_CICP_TC_RESERVED_3 = 3,  /**< For future use */
+  AOM_CICP_TC_BT_470_M = 4,    /**< BT.470 System M (historical) */
+  AOM_CICP_TC_BT_470_B_G = 5,  /**< BT.470 System B, G (historical) */
+  AOM_CICP_TC_BT_601 = 6,      /**< BT.601 */
+  AOM_CICP_TC_SMPTE_240 = 7,   /**< SMPTE 240 M */
+  AOM_CICP_TC_LINEAR = 8,      /**< Linear */
+  AOM_CICP_TC_LOG_100 = 9,     /**< Logarithmic (100 : 1 range) */
+  AOM_CICP_TC_LOG_100_SQRT10 =
+      10,                          /**< Logarithmic (100 * Sqrt(10) : 1 range) */
+  AOM_CICP_TC_IEC_61966 = 11,      /**< IEC 61966-2-4 */
+  AOM_CICP_TC_BT_1361 = 12,        /**< BT.1361 */
+  AOM_CICP_TC_SRGB = 13,           /**< sRGB or sYCC */
+  AOM_CICP_TC_BT_2020_10_BIT = 14, /**< BT.2020 10-bit systems */
+  AOM_CICP_TC_BT_2020_12_BIT = 15, /**< BT.2020 12-bit systems */
+  AOM_CICP_TC_SMPTE_2084 = 16,     /**< SMPTE ST 2084, ITU BT.2100 PQ */
+  AOM_CICP_TC_SMPTE_428 = 17,      /**< SMPTE ST 428 */
+  AOM_CICP_TC_HLG = 18,            /**< BT.2100 HLG, ARIB STD-B67 */
+  AOM_CICP_TC_RESERVED_19 = 19     /**< For future use (values 19-255) */
+} aom_transfer_characteristics_t; /**< alias for enum
+                                       aom_transfer_characteristics */
+
+/*!\brief List of supported matrix coefficients */
+typedef enum aom_matrix_coefficients {
+  AOM_CICP_MC_IDENTITY = 0,    /**< Identity matrix */
+  AOM_CICP_MC_BT_709 = 1,      /**< BT.709 */
+  AOM_CICP_MC_UNSPECIFIED = 2, /**< Unspecified */
+  AOM_CICP_MC_RESERVED_3 = 3,  /**< For future use */
+  AOM_CICP_MC_FCC = 4,         /**< US FCC 73.628 */
+  AOM_CICP_MC_BT_470_B_G = 5,  /**< BT.470 System B, G (historical) */
+  AOM_CICP_MC_BT_601 = 6,      /**< BT.601 */
+  AOM_CICP_MC_SMPTE_240 = 7,   /**< SMPTE 240 M */
+  AOM_CICP_MC_SMPTE_YCGCO = 8, /**< YCgCo */
+  AOM_CICP_MC_BT_2020_NCL =
+      9, /**< BT.2020 non-constant luminance, BT.2100 YCbCr */
+  AOM_CICP_MC_BT_2020_CL = 10, /**< BT.2020 constant luminance */
+  AOM_CICP_MC_SMPTE_2085 = 11, /**< SMPTE ST 2085 YDzDx */
+  AOM_CICP_MC_CHROMAT_NCL =
+      12, /**< Chromaticity-derived non-constant luminance */
+  AOM_CICP_MC_CHROMAT_CL = 13, /**< Chromaticity-derived constant luminance */
+  AOM_CICP_MC_ICTCP = 14,      /**< BT.2100 ICtCp */
+  AOM_CICP_MC_RESERVED_15 = 15 /**< For future use (values 15-255) */
+} aom_matrix_coefficients_t;
+
+/*!\brief List of supported color range */
+typedef enum aom_color_range {
+  AOM_CR_STUDIO_RANGE = 0, /**< Y [16..235], UV [16..240] */
+  AOM_CR_FULL_RANGE = 1    /**< YUV/RGB [0..255] */
+} aom_color_range_t; /**< alias for enum aom_color_range */
+
+/*!\brief List of chroma sample positions */
+typedef enum aom_chroma_sample_position {
+  AOM_CSP_UNKNOWN = 0,   /**< Unknown */
+  AOM_CSP_VERTICAL = 1,  /**< Horizontally co-located with luma(0, 0) sample,
+                              between two vertical samples */
+  AOM_CSP_COLOCATED = 2, /**< Co-located with luma(0, 0) sample */
+  AOM_CSP_RESERVED = 3   /**< Reserved value */
+} aom_chroma_sample_position_t; /**< alias for enum
+                                     aom_chroma_sample_position */
+
+/*!\brief List of insert flags for Metadata
+ *
+ * These flags control how the library treats metadata during encode.
+ *
+ * While encoding, when metadata is added to an aom_image via
+ * aom_img_add_metadata(), the flag passed along with the metadata will
+ * determine where the metadata OBU will be placed in the encoded OBU stream.
+ * Metadata will be emitted into the output stream within the next temporal
+ * unit if it satisfies the specified insertion flag.
+ *
+ * During decoding, when the library encounters a metadata OBU, it is always
+ * flagged as AOM_MIF_ANY_FRAME and emitted with the next output aom_image.
+ */
+typedef enum aom_metadata_insert_flags {
+  AOM_MIF_NON_KEY_FRAME = 0, /**< Adds metadata if it's not keyframe */
+  AOM_MIF_KEY_FRAME = 1,     /**< Adds metadata only if it's a keyframe */
+  AOM_MIF_ANY_FRAME = 2      /**< Adds metadata to any type of frame */
+} aom_metadata_insert_flags_t;
+
+/*!\brief Array of aom_metadata structs for an image. */
+typedef struct aom_metadata_array aom_metadata_array_t;
+
+/*!\brief Metadata payload.
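+ *
+ * An illustrative sketch of building and freeing a payload (the type value
+ * 4 corresponds to ITU-T T.35 metadata in the AV1 specification; the
+ * payload bytes here are placeholders):
+ * \code
+ *   const uint8_t payload[] = { 0xB5, 0x00, 0x3C };
+ *   aom_metadata_t *md =
+ *       aom_img_metadata_alloc(4, payload, sizeof(payload), AOM_MIF_ANY_FRAME);
+ *   if (md) aom_img_metadata_free(md);
+ * \endcode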
*/ +typedef struct aom_metadata { + uint32_t type; /**< Metadata type */ + uint8_t *payload; /**< Metadata payload data */ + size_t sz; /**< Metadata payload size */ + aom_metadata_insert_flags_t insert_flag; /**< Metadata insertion flag */ +} aom_metadata_t; + +/**\brief Image Descriptor */ +typedef struct aom_image { + aom_img_fmt_t fmt; /**< Image Format */ + aom_color_primaries_t cp; /**< CICP Color Primaries */ + aom_transfer_characteristics_t tc; /**< CICP Transfer Characteristics */ + aom_matrix_coefficients_t mc; /**< CICP Matrix Coefficients */ + int monochrome; /**< Whether image is monochrome */ + aom_chroma_sample_position_t csp; /**< chroma sample position */ + aom_color_range_t range; /**< Color Range */ + + /* Image storage dimensions */ + unsigned int w; /**< Stored image width */ + unsigned int h; /**< Stored image height */ + unsigned int bit_depth; /**< Stored image bit-depth */ + + /* Image display dimensions */ + unsigned int d_w; /**< Displayed image width */ + unsigned int d_h; /**< Displayed image height */ + + /* Image intended rendering dimensions */ + unsigned int r_w; /**< Intended rendering image width */ + unsigned int r_h; /**< Intended rendering image height */ + + /* Chroma subsampling info */ + unsigned int x_chroma_shift; /**< subsampling order, X */ + unsigned int y_chroma_shift; /**< subsampling order, Y */ + +/* Image data pointers. */ +#define AOM_PLANE_PACKED 0 /**< To be used for all packed formats */ +#define AOM_PLANE_Y 0 /**< Y (Luminance) plane */ +#define AOM_PLANE_U 1 /**< U (Chroma) plane */ +#define AOM_PLANE_V 2 /**< V (Chroma) plane */ + unsigned char *planes[3]; /**< pointer to the top left pixel for each plane */ + int stride[3]; /**< stride between rows for each plane */ + size_t sz; /**< data size */ + + int bps; /**< bits per sample (for packed formats) */ + + int temporal_id; /**< Temporal layer Id of image */ + int spatial_id; /**< Spatial layer Id of image */ + + /*!\brief The following member may be set by the application to associate + * data with this image. + */ + void *user_priv; + + /* The following members should be treated as private. */ + unsigned char *img_data; /**< private */ + int img_data_owner; /**< private */ + int self_allocd; /**< private */ + + aom_metadata_array_t + *metadata; /**< Metadata payloads associated with the image. */ + + void *fb_priv; /**< Frame buffer data associated with the image. */ +} aom_image_t; /**< alias for struct aom_image */ + +/*!\brief Open a descriptor, allocating storage for the underlying image + * + * Returns a descriptor for storing an image of the given format. The + * storage for the image is allocated on the heap. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image (stride). + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align); + +/*!\brief Open a descriptor, using existing storage for the underlying image + * + * Returns a descriptor for storing an image of the given format. 
The + * storage for the image has been allocated elsewhere, and a descriptor is + * desired to "wrap" that storage. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of each row in the image + * (stride). + * \param[in] img_data Storage to use for the image + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, + unsigned int d_h, unsigned int align, + unsigned char *img_data); + +/*!\brief Open a descriptor, allocating storage for the underlying image with a + * border + * + * Returns a descriptor for storing an image of the given format and its + * borders. The storage for the image is allocated on the heap. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image (stride). + * \param[in] size_align Alignment, in pixels, of the image width and height. + * \param[in] border A border that is padded on four sides of the image. + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align, + unsigned int size_align, + unsigned int border); + +/*!\brief Set the rectangle identifying the displayed portion of the image + * + * Updates the displayed rectangle (aka viewport) on the image surface to + * match the specified coordinates and size. + * + * \param[in] img Image descriptor + * \param[in] x leftmost column + * \param[in] y topmost row + * \param[in] w width + * \param[in] h height + * \param[in] border A border that is padded on four sides of the image. + * + * \return 0 if the requested rectangle is valid, nonzero otherwise. + */ +int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, + unsigned int w, unsigned int h, unsigned int border); + +/*!\brief Flip the image vertically (top for bottom) + * + * Adjusts the image descriptor's pointers and strides to make the image + * be referenced upside-down. + * + * \param[in] img Image descriptor + */ +void aom_img_flip(aom_image_t *img); + +/*!\brief Close an image descriptor + * + * Frees all allocated storage associated with an image descriptor. + * + * \param[in] img Image descriptor + */ +void aom_img_free(aom_image_t *img); + +/*!\brief Get the width of a plane + * + * Get the width of a plane of an image + * + * \param[in] img Image descriptor + * \param[in] plane Plane index + */ +int aom_img_plane_width(const aom_image_t *img, int plane); + +/*!\brief Get the height of a plane + * + * Get the height of a plane of an image + * + * \param[in] img Image descriptor + * \param[in] plane Plane index + */ +int aom_img_plane_height(const aom_image_t *img, int plane); + +/*!\brief Add metadata to image. + * + * Adds metadata to aom_image_t. 
+ * Function makes a copy of the provided data parameter. + * Metadata insertion point is controlled by insert_flag. + * + * \param[in] img Image descriptor + * \param[in] type Metadata type + * \param[in] data Metadata contents + * \param[in] sz Metadata contents size + * \param[in] insert_flag Metadata insert flag + */ +int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data, + size_t sz, aom_metadata_insert_flags_t insert_flag); + +/*!\brief Return a metadata payload stored within the image metadata array. + * + * Gets the metadata (aom_metadata_t) at the indicated index in the image + * metadata array. + * + * \param[in] img Pointer to image descriptor to get metadata from + * \param[in] index Metadata index to get from metadata array + * + * \return Returns a const pointer to the selected metadata, if img and/or index + * is invalid, it returns NULL. + */ +const aom_metadata_t *aom_img_get_metadata(const aom_image_t *img, + size_t index); + +/*!\brief Return the number of metadata blocks within the image. + * + * Gets the number of metadata blocks contained within the provided image + * metadata array. + * + * \param[in] img Pointer to image descriptor to get metadata number + * from. + * + * \return Returns the size of the metadata array. If img or metadata is NULL, + * it returns 0. + */ +size_t aom_img_num_metadata(const aom_image_t *img); + +/*!\brief Remove metadata from image. + * + * Removes all metadata in image metadata list and sets metadata list pointer + * to NULL. + * + * \param[in] img Image descriptor + */ +void aom_img_remove_metadata(aom_image_t *img); + +/*!\brief Allocate memory for aom_metadata struct. + * + * Allocates storage for the metadata payload, sets its type and copies the + * payload data into the aom_metadata struct. A metadata payload buffer of size + * sz is allocated and sz bytes are copied from data into the payload buffer. + * + * \param[in] type Metadata type + * \param[in] data Metadata data pointer + * \param[in] sz Metadata size + * \param[in] insert_flag Metadata insert flag + */ +aom_metadata_t *aom_img_metadata_alloc(uint32_t type, const uint8_t *data, + size_t sz, + aom_metadata_insert_flags_t insert_flag); + +/*!\brief Free metadata struct. + * + * Free metadata struct and its buffer. + * + * \param[in] metadata Metadata struct pointer + */ +void aom_img_metadata_free(aom_metadata_t *metadata); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOM_IMAGE_H_ diff --git a/libs/libaom/src/aom/aom_integer.h b/libs/libaom/src/aom/aom_integer.h new file mode 100644 index 000000000..113671e82 --- /dev/null +++ b/libs/libaom/src/aom/aom_integer.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_AOM_AOM_INTEGER_H_
+#define AOM_AOM_AOM_INTEGER_H_
+
+/* get ptrdiff_t, size_t, wchar_t, NULL */
+#include <stddef.h>
+
+#if defined(_MSC_VER)
+#define AOM_FORCE_INLINE __forceinline
+#define AOM_INLINE __inline
+#else
+#define AOM_FORCE_INLINE __inline__ __attribute__((always_inline))
+#define AOM_INLINE inline
+#endif
+
+#if defined(AOM_EMULATE_INTTYPES)
+typedef signed char int8_t;
+typedef signed short int16_t;
+typedef signed int int32_t;
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+
+#ifndef _UINTPTR_T_DEFINED
+typedef size_t uintptr_t;
+#endif
+
+#else
+
+/* Most platforms have the C99 standard integer types. */
+
+#if defined(__cplusplus)
+#if !defined(__STDC_FORMAT_MACROS)
+#define __STDC_FORMAT_MACROS
+#endif
+#if !defined(__STDC_LIMIT_MACROS)
+#define __STDC_LIMIT_MACROS
+#endif
+#endif  // __cplusplus
+
+#include <stdint.h>
+
+#endif
+
+/* VS2010 defines stdint.h, but not inttypes.h */
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#define PRId64 "I64d"
+#else
+#include <inttypes.h>
+#endif
+
+#if !defined(INT8_MAX)
+#define INT8_MAX 127
+#endif
+
+#if !defined(INT32_MAX)
+#define INT32_MAX 2147483647
+#endif
+
+#if !defined(INT32_MIN)
+#define INT32_MIN (-2147483647 - 1)
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif  // __cplusplus
+
+// Returns size of uint64_t when encoded using LEB128.
+size_t aom_uleb_size_in_bytes(uint64_t value);
+
+// Returns 0 on success, -1 on decode failure.
+// On success, 'value' stores the decoded LEB128 value and 'length' stores
+// the number of bytes decoded.
+int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value,
+                    size_t *length);
+
+// Encodes LEB128 integer. Returns 0 when successful, and -1 upon failure.
+int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value,
+                    size_t *coded_size);
+
+// Encodes LEB128 integer to size specified. Returns 0 when successful, and -1
+// upon failure.
+// Note: This will write exactly pad_to_size bytes; if the value cannot be
+// encoded in this many bytes, then this will fail.
+int aom_uleb_encode_fixed_size(uint64_t value, size_t available,
+                               size_t pad_to_size, uint8_t *coded_value,
+                               size_t *coded_size);
+
+#if defined(__cplusplus)
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // AOM_AOM_AOM_INTEGER_H_
diff --git a/libs/libaom/src/aom/aomcx.h b/libs/libaom/src/aom/aomcx.h
new file mode 100644
index 000000000..051d33e7b
--- /dev/null
+++ b/libs/libaom/src/aom/aomcx.h
@@ -0,0 +1,1774 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOMCX_H_
+#define AOM_AOM_AOMCX_H_
+
+/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
+ * \ingroup aom
+ *
+ * @{
+ */
+#include "aom/aom.h"
+#include "aom/aom_encoder.h"
+
+/*!\file
+ * \brief Provides definitions for using the AOM or AV1 encoder algorithm
+ * within the aom Codec Interface.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\name Algorithm interface for AV1
+ *
+ * This interface provides the capability to encode raw AV1 streams.
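+ *
+ * A minimal sketch of obtaining the interface and opening an encoder (cfg
+ * is assumed to be a populated aom_codec_enc_cfg_t):
+ * \code
+ *   aom_codec_iface_t *iface = aom_codec_av1_cx();
+ *   aom_codec_ctx_t ctx;
+ *   aom_codec_err_t res = aom_codec_enc_init(&ctx, iface, &cfg, 0);
+ * \endcode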
+ * @{ + */ +extern aom_codec_iface_t aom_codec_av1_cx_algo; +extern aom_codec_iface_t *aom_codec_av1_cx(void); +/*!@} - end algorithm interface member group*/ + +/* + * Algorithm Flags + */ + +/*!\brief Don't reference the last frame + * + * When this flag is set, the encoder will not use the last frame as a + * predictor. When not set, the encoder will choose whether to use the + * last frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_LAST (1 << 16) +/*!\brief Don't reference the last2 frame + * + * When this flag is set, the encoder will not use the last2 frame as a + * predictor. When not set, the encoder will choose whether to use the + * last2 frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_LAST2 (1 << 17) +/*!\brief Don't reference the last3 frame + * + * When this flag is set, the encoder will not use the last3 frame as a + * predictor. When not set, the encoder will choose whether to use the + * last3 frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_LAST3 (1 << 18) +/*!\brief Don't reference the golden frame + * + * When this flag is set, the encoder will not use the golden frame as a + * predictor. When not set, the encoder will choose whether to use the + * golden frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_GF (1 << 19) + +/*!\brief Don't reference the alternate reference frame + * + * When this flag is set, the encoder will not use the alt ref frame as a + * predictor. When not set, the encoder will choose whether to use the + * alt ref frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_ARF (1 << 20) +/*!\brief Don't reference the bwd reference frame + * + * When this flag is set, the encoder will not use the bwd ref frame as a + * predictor. When not set, the encoder will choose whether to use the + * bwd ref frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_BWD (1 << 21) +/*!\brief Don't reference the alt2 reference frame + * + * When this flag is set, the encoder will not use the alt2 ref frame as a + * predictor. When not set, the encoder will choose whether to use the + * alt2 ref frame or not automatically. + */ +#define AOM_EFLAG_NO_REF_ARF2 (1 << 22) + +/*!\brief Don't update the last frame + * + * When this flag is set, the encoder will not update the last frame with + * the contents of the current frame. + */ +#define AOM_EFLAG_NO_UPD_LAST (1 << 23) + +/*!\brief Don't update the golden frame + * + * When this flag is set, the encoder will not update the golden frame with + * the contents of the current frame. + */ +#define AOM_EFLAG_NO_UPD_GF (1 << 24) + +/*!\brief Don't update the alternate reference frame + * + * When this flag is set, the encoder will not update the alt ref frame with + * the contents of the current frame. + */ +#define AOM_EFLAG_NO_UPD_ARF (1 << 25) +/*!\brief Disable entropy update + * + * When this flag is set, the encoder will not update its internal entropy + * model based on the entropy of this frame. + */ +#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 26) +/*!\brief Disable ref frame mvs + * + * When this flag is set, the encoder will not allow frames to + * be encoded using mfmv. + */ +#define AOM_EFLAG_NO_REF_FRAME_MVS (1 << 27) +/*!\brief Enable error resilient frame + * + * When this flag is set, the encoder will code frames as error + * resilient. + */ +#define AOM_EFLAG_ERROR_RESILIENT (1 << 28) +/*!\brief Enable s frame mode + * + * When this flag is set, the encoder will code frames as an + * s frame. 
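+ *
+ * Per-frame flags such as this one are OR-ed together and passed in the
+ * flags argument of aom_codec_encode(). A non-normative sketch (assumes an
+ * initialized encoder ctx, an input image img, and a timestamp pts):
+ * \code
+ * aom_codec_encode(&ctx, img, pts, 1,  // duration of 1 timebase unit
+ *                  AOM_EFLAG_SET_S_FRAME | AOM_EFLAG_ERROR_RESILIENT);
+ * \endcode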
+ */
+#define AOM_EFLAG_SET_S_FRAME (1 << 29)
+/*!\brief Force primary_ref_frame to PRIMARY_REF_NONE
+ *
+ * When this flag is set, the encoder will set a frame's primary_ref_frame
+ * to PRIMARY_REF_NONE
+ */
+#define AOM_EFLAG_SET_PRIMARY_REF_NONE (1 << 30)
+
+/*!\brief AVx encoder control functions
+ *
+ * This set of macros defines the control functions available for the AVx
+ * encoder interface.
+ *
+ * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...)
+ */
+enum aome_enc_control_id {
+ /*!\brief Codec control function to set which reference frame the encoder
+ * can use, int parameter.
+ */
+ AOME_USE_REFERENCE = 7,
+
+ /*!\brief Codec control function to pass an ROI map to encoder, aom_roi_map_t*
+ * parameter.
+ */
+ AOME_SET_ROI_MAP = 8,
+
+ /*!\brief Codec control function to pass an active map to encoder,
+ * aom_active_map_t* parameter.
+ */
+ AOME_SET_ACTIVEMAP = 9,
+
+ /* NOTE: enum 10 unused */
+
+ /*!\brief Codec control function to set encoder scaling mode,
+ * aom_scaling_mode_t* parameter.
+ */
+ AOME_SET_SCALEMODE = 11,
+
+ /*!\brief Codec control function to set encoder spatial layer id, unsigned int
+ * parameter.
+ */
+ AOME_SET_SPATIAL_LAYER_ID = 12,
+
+ /*!\brief Codec control function to set encoder internal speed settings,
+ * int parameter
+ *
+ * Changes to this value influence the complexity of algorithms used in the
+ * encoding process; values greater than 0 increase encoder speed at the
+ * expense of quality.
+ *
+ * Valid range: 0..8. 0 runs the slowest, and 8 runs the fastest;
+ * quality improves as speed decreases (since more compression
+ * possibilities are explored).
+ */
+ AOME_SET_CPUUSED = 13,
+
+ /*!\brief Codec control function to enable automatic creation and use of
+ * alt-ref frames, unsigned int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AOME_SET_ENABLEAUTOALTREF = 14,
+
+ /* NOTE: enum 15 unused */
+
+ /*!\brief Codec control function to set sharpness, unsigned int parameter.
+ */
+ AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2, // 16
+
+ /*!\brief Codec control function to set the threshold for MBs treated static,
+ * unsigned int parameter
+ */
+ AOME_SET_STATIC_THRESHOLD = 17,
+
+ /* NOTE: enum 18 unused */
+
+ /*!\brief Codec control function to get last quantizer chosen by the encoder,
+ * int* parameter
+ *
+ * Return value uses internal quantizer scale defined by the codec.
+ */
+ AOME_GET_LAST_QUANTIZER = AOME_SET_STATIC_THRESHOLD + 2, // 19
+
+ /*!\brief Codec control function to get last quantizer chosen by the encoder,
+ * int* parameter
+ *
+ * Return value uses the 0..63 scale as used by the rc_*_quantizer config
+ * parameters.
+ */
+ AOME_GET_LAST_QUANTIZER_64 = 20,
+
+ /*!\brief Codec control function to set the maximum number of frames used to
+ * create an arf, unsigned int parameter
+ */
+ AOME_SET_ARNR_MAXFRAMES = 21,
+
+ /*!\brief Codec control function to set the filter strength for the arf,
+ * unsigned int parameter
+ */
+ AOME_SET_ARNR_STRENGTH = 22,
+
+ /* NOTE: enum 23 unused */
+
+ /*!\brief Codec control function to set visual tuning, aom_tune_metric (int)
+ * parameter
+ */
+ AOME_SET_TUNING = AOME_SET_ARNR_STRENGTH + 2, // 24
+
+ /*!\brief Codec control function to set constrained / constant quality level,
+ * unsigned int parameter
+ *
+ * Valid range: 0..63
+ *
+ * \attention For this value to be used aom_codec_enc_cfg_t::rc_end_usage
+ * must be set to #AOM_CQ or #AOM_Q.
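+ *
+ * For example (a non-normative sketch; cfg and ctx are set up as in
+ * aom_encoder.h, and 30 is a hypothetical level):
+ * \code
+ * cfg.rc_end_usage = AOM_Q;  // constant-quality mode
+ * aom_codec_enc_init(&ctx, aom_codec_av1_cx(), &cfg, 0);
+ * aom_codec_control(&ctx, AOME_SET_CQ_LEVEL, 30);
+ * \endcode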
+ */ + AOME_SET_CQ_LEVEL = 25, + + /*!\brief Codec control function to set max data rate for intra frames, + * unsigned int parameter + * + * This value controls additional clamping on the maximum size of a + * keyframe. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * unlimited, or no additional clamping beyond the codec's built-in + * algorithm. + * + * For example, to allocate no more than 4.5 frames worth of bitrate + * to a keyframe, set this to 450. + */ + AOME_SET_MAX_INTRA_BITRATE_PCT = 26, + + /*!\brief Codec control function to set number of spatial layers, int + * parameter + */ + AOME_SET_NUMBER_SPATIAL_LAYERS = 27, + + /*!\brief Codec control function to set max data rate for inter frames, + * unsigned int parameter + * + * This value controls additional clamping on the maximum size of an + * inter frame. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * unlimited, or no additional clamping beyond the codec's built-in + * algorithm. + * + * For example, to allow no more than 4.5 frames worth of bitrate + * to an inter frame, set this to 450. + */ + AV1E_SET_MAX_INTER_BITRATE_PCT = AOME_SET_MAX_INTRA_BITRATE_PCT + 2, // 28 + + /*!\brief Boost percentage for Golden Frame in CBR mode, unsigned int + * parameter + * + * This value controls the amount of boost given to Golden Frame in + * CBR mode. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * the feature is off, i.e., no golden frame boost in CBR mode and + * average bitrate target is used. + * + * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * than average frame, set this to 100. + */ + AV1E_SET_GF_CBR_BOOST_PCT = 29, + + /* NOTE: enum 30 unused */ + + /*!\brief Codec control function to set lossless encoding mode, unsigned int + * parameter + * + * AV1 can operate in lossless encoding mode, in which the bitstream + * produced will be able to decode and reconstruct a perfect copy of + * input source. + * + * - 0 = normal coding mode, may be lossy (default) + * - 1 = lossless coding mode + */ + AV1E_SET_LOSSLESS = AV1E_SET_GF_CBR_BOOST_PCT + 2, // 31 + + /*!\brief Codec control function to enable the row based multi-threading + * of the encoder, unsigned int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ROW_MT = 32, + + /*!\brief Codec control function to set number of tile columns. unsigned int + * parameter + * + * In encoding and decoding, AV1 allows an input image frame be partitioned + * into separate vertical tile columns, which can be encoded or decoded + * independently. This enables easy implementation of parallel encoding and + * decoding. The parameter for this control describes the number of tile + * columns (in log2 units), which has a valid range of [0, 6]: + * \verbatim + 0 = 1 tile column + 1 = 2 tile columns + 2 = 4 tile columns + ..... + n = 2**n tile columns + \endverbatim + * By default, the value is 0, i.e. one single column tile for entire image. + */ + AV1E_SET_TILE_COLUMNS = 33, + + /*!\brief Codec control function to set number of tile rows, unsigned int + * parameter + * + * In encoding and decoding, AV1 allows an input image frame be partitioned + * into separate horizontal tile rows, which can be encoded or decoded + * independently. 
The parameter for this control describes the number of tile
+ * rows (in log2 units), which has a valid range of [0, 6]:
+ * \verbatim
+ 0 = 1 tile row
+ 1 = 2 tile rows
+ 2 = 4 tile rows
+ .....
+ n = 2**n tile rows
+ \endverbatim
+ * By default, the value is 0, i.e. a single tile row for the entire image.
+ */
+ AV1E_SET_TILE_ROWS = 34,
+
+ /*!\brief Codec control function to enable RDO modulated by frame temporal
+ * dependency, unsigned int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_TPL_MODEL = 35,
+
+ /*!\brief Codec control function to enable temporal filtering on key frame,
+ * unsigned int parameter
+ *
+ * - 0 = disable
+ * - 1 = enable (default)
+ */
+ AV1E_SET_ENABLE_KEYFRAME_FILTERING = 36,
+
+ /*!\brief Codec control function to enable frame parallel decoding feature,
+ * unsigned int parameter
+ *
+ * AV1 has a bitstream feature to reduce decoding dependency between frames
+ * by turning off backward update of probability context used in encoding
+ * and decoding. This allows staged parallel processing of more than one
+ * video frame in the decoder. This control function provides a means to
+ * turn this feature on or off for bitstreams produced by the encoder.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_FRAME_PARALLEL_DECODING = 37,
+
+ /*!\brief Codec control function to enable error_resilient_mode, int parameter
+ *
+ * AV1 has a bitstream feature to guarantee parseability of a frame
+ * by turning on the error_resilient_decoding mode, even though the
+ * reference buffers are unreliable or not received.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_ERROR_RESILIENT_MODE = 38,
+
+ /*!\brief Codec control function to enable s_frame_mode, int parameter
+ *
+ * AV1 has a bitstream feature to designate certain frames as S-frames,
+ * from where we can switch to a different stream,
+ * even though the reference buffers may not be exactly identical.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_S_FRAME_MODE = 39,
+
+ /*!\brief Codec control function to set adaptive quantization mode, unsigned
+ * int parameter
+ *
+ * AV1 has a segment-based feature that allows the encoder to adaptively
+ * change the quantization parameter for each segment within a frame to
+ * improve the subjective quality. This control makes the encoder operate in
+ * one of the several supported AQ modes.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_AQ_MODE = 40,
+
+ /*!\brief Codec control function to enable/disable periodic Q boost, unsigned
+ * int parameter
+ *
+ * One AV1 encoder speed feature is to enable quality boost by lowering
+ * frame level Q periodically. This control function provides a means to
+ * turn on/off this feature.
+ *
+ * - 0 = disable (default)
+ * - 1 = enable
+ */
+ AV1E_SET_FRAME_PERIODIC_BOOST = 41,
+
+ /*!\brief Codec control function to set noise sensitivity, unsigned int
+ * parameter
+ *
+ * - 0 = disable (default)
+ * - 1 = enable (Y only)
+ */
+ AV1E_SET_NOISE_SENSITIVITY = 42,
+
+ /*!\brief Codec control function to set content type, aom_tune_content
+ * parameter
+ *
+ * - AOM_CONTENT_DEFAULT = Regular video content (default)
+ * - AOM_CONTENT_SCREEN = Screen capture content
+ */
+ AV1E_SET_TUNE_CONTENT = 43,
+
+ /*!\brief Codec control function to set CDF update mode, unsigned int
+ * parameter
+ *
+ * - 0: no update
+ * - 1: update on every frame (default)
+ * - 2: selectively update
+ */
+ AV1E_SET_CDF_UPDATE_MODE = 44,
+
+ /*!\brief Codec control function to set color space info, int parameter
+ *
+ * - 0 = For future use
+ * - 1 = BT.709
+ * - 2 = Unspecified (default)
+ * - 3 = For future use
+ * - 4 = BT.470 System M (historical)
+ * - 5 = BT.470 System B, G (historical)
+ * - 6 = BT.601
+ * - 7 = SMPTE 240
+ * - 8 = Generic film (color filters using illuminant C)
+ * - 9 = BT.2020, BT.2100
+ * - 10 = SMPTE 428 (CIE 1931 XYZ)
+ * - 11 = SMPTE RP 431-2
+ * - 12 = SMPTE EG 432-1
+ * - 13..21 = For future use
+ * - 22 = EBU Tech. 3213-E
+ * - 23 = For future use
+ */
+ AV1E_SET_COLOR_PRIMARIES = 45,
+
+ /*!\brief Codec control function to set transfer function info, int parameter
+ *
+ * - 0 = For future use
+ * - 1 = BT.709
+ * - 2 = Unspecified (default)
+ * - 3 = For future use
+ * - 4 = BT.470 System M (historical)
+ * - 5 = BT.470 System B, G (historical)
+ * - 6 = BT.601
+ * - 7 = SMPTE 240 M
+ * - 8 = Linear
+ * - 9 = Logarithmic (100 : 1 range)
+ * - 10 = Logarithmic (100 * Sqrt(10) : 1 range)
+ * - 11 = IEC 61966-2-4
+ * - 12 = BT.1361
+ * - 13 = sRGB or sYCC
+ * - 14 = BT.2020 10-bit systems
+ * - 15 = BT.2020 12-bit systems
+ * - 16 = SMPTE ST 2084, ITU BT.2100 PQ
+ * - 17 = SMPTE ST 428
+ * - 18 = BT.2100 HLG, ARIB STD-B67
+ * - 19 = For future use
+ */
+ AV1E_SET_TRANSFER_CHARACTERISTICS = 46,
+
+ /*!\brief Codec control function to set matrix coefficients info, int
+ * parameter
+ *
+ * - 0 = Identity matrix
+ * - 1 = BT.709
+ * - 2 = Unspecified (default)
+ * - 3 = For future use
+ * - 4 = US FCC 73.628
+ * - 5 = BT.470 System B, G (historical)
+ * - 6 = BT.601
+ * - 7 = SMPTE 240 M
+ * - 8 = YCgCo
+ * - 9 = BT.2020 non-constant luminance, BT.2100 YCbCr
+ * - 10 = BT.2020 constant luminance
+ * - 11 = SMPTE ST 2085 YDzDx
+ * - 12 = Chromaticity-derived non-constant luminance
+ * - 13 = Chromaticity-derived constant luminance
+ * - 14 = BT.2100 ICtCp
+ * - 15 = For future use
+ */
+ AV1E_SET_MATRIX_COEFFICIENTS = 47,
+
+ /*!\brief Codec control function to set chroma 4:2:0 sample position info,
+ * aom_chroma_sample_position_t parameter
+ *
+ * AOM_CSP_UNKNOWN is default
+ */
+ AV1E_SET_CHROMA_SAMPLE_POSITION = 48,
+
+ /*!\brief Codec control function to set minimum interval between GF/ARF
+ * frames, unsigned int parameter
+ *
+ * By default the value is set as 4.
+ */
+ AV1E_SET_MIN_GF_INTERVAL = 49,
+
+ /*!\brief Codec control function to set maximum interval between GF/ARF
+ * frames, unsigned int parameter
+ *
+ * By default the value is set as 16.
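+ *
+ * For example (a non-normative sketch, assuming an initialized encoder
+ * ctx), the GF/ARF interval can be bounded from both sides:
+ * \code
+ * aom_codec_control(&ctx, AV1E_SET_MIN_GF_INTERVAL, 4);
+ * aom_codec_control(&ctx, AV1E_SET_MAX_GF_INTERVAL, 16);
+ * \endcode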
+ */ + AV1E_SET_MAX_GF_INTERVAL = 50, + + /*!\brief Codec control function to get an active map back from the encoder, + aom_active_map_t* parameter + */ + AV1E_GET_ACTIVEMAP = 51, + + /*!\brief Codec control function to set color range bit, int parameter + * + * - 0 = Limited range, 16..235 or HBD equivalent (default) + * - 1 = Full range, 0..255 or HBD equivalent + */ + AV1E_SET_COLOR_RANGE = 52, + + /*!\brief Codec control function to set intended rendering image size, + * int32_t[2] parameter + * + * By default, this is identical to the image size in pixels. + */ + AV1E_SET_RENDER_SIZE = 53, + + /*!\brief Control to set target sequence level index for a certain operating + * point(OP), int parameter + * Possible values are in the form of "ABxy"(pad leading zeros if less than + * 4 digits). + * - AB: OP index. + * - xy: Target level index for the OP. Can be values 0~23(corresponding to + * level 2.0 ~ 7.3) or 24(keep level stats only for level monitoring) or + * 31(maximum level parameter, no level-based constraints). + * + * E.g.: + * - "0" means target level index 0 for the 0th OP; + * - "111" means target level index 11 for the 1st OP; + * - "1021" means target level index 21 for the 10th OP. + * + * If the target level is not specified for an OP, the maximum level parameter + * of 31 is used as default. + */ + AV1E_SET_TARGET_SEQ_LEVEL_IDX = 54, + + /*!\brief Codec control function to get sequence level index for each + * operating point. int* parameter. There can be at most 32 operating points. + * The results will be written into a provided integer array of sufficient + * size. + */ + AV1E_GET_SEQ_LEVEL_IDX = 55, + + /*!\brief Codec control function to set intended superblock size, unsigned int + * parameter + * + * By default, the superblock size is determined separately for each + * frame by the encoder. + */ + AV1E_SET_SUPERBLOCK_SIZE = 56, + + /*!\brief Codec control function to enable automatic set and use of + * bwd-pred frames, unsigned int parameter + * + * - 0 = disable (default) + * - 1 = enable + */ + AOME_SET_ENABLEAUTOBWDREF = 57, + + /*!\brief Codec control function to encode with CDEF, unsigned int parameter + * + * CDEF is the constrained directional enhancement filter which is an + * in-loop filter aiming to remove coding artifacts + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_CDEF = 58, + + /*!\brief Codec control function to encode with Loop Restoration Filter, + * unsigned int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_RESTORATION = 59, + + /*!\brief Codec control function to force video mode, unsigned int parameter + * + * - 0 = do not force video mode (default) + * - 1 = force video mode even for a single frame + */ + AV1E_SET_FORCE_VIDEO_MODE = 60, + + /*!\brief Codec control function to predict with OBMC mode, unsigned int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_OBMC = 61, + + /*!\brief Codec control function to encode without trellis quantization, + * unsigned int parameter + * + * - 0 = apply trellis quantization (default) + * - 1 = do not apply trellis quantization + * - 2 = disable trellis quantization in rd search + * - 3 = disable trellis quantization in estimate yrd + */ + AV1E_SET_DISABLE_TRELLIS_QUANT = 62, + + /*!\brief Codec control function to encode with quantisation matrices, + * unsigned int parameter + * + * AOM can operate with default quantisation matrices dependent on + * quantisation level and block type. 
+ * + * - 0 = disable (default) + * - 1 = enable + */ + AV1E_SET_ENABLE_QM = 63, + + /*!\brief Codec control function to set the min quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the minimum level of flatness from which the matrices + * are determined. + * + * By default, the encoder sets this minimum at half the available + * range. + */ + AV1E_SET_QM_MIN = 64, + + /*!\brief Codec control function to set the max quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the maximum level of flatness possible. + * + * By default, the encoder sets this maximum at the top of the + * available range. + */ + AV1E_SET_QM_MAX = 65, + + /*!\brief Codec control function to set the min quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the flatness for luma (Y). + * + * By default, the encoder sets this minimum at half the available + * range. + */ + AV1E_SET_QM_Y = 66, + + /*!\brief Codec control function to set the min quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the flatness for chroma (U). + * + * By default, the encoder sets this minimum at half the available + * range. + */ + AV1E_SET_QM_U = 67, + + /*!\brief Codec control function to set the min quant matrix flatness, + * unsigned int parameter + * + * AOM can operate with different ranges of quantisation matrices. + * As quantisation levels increase, the matrices get flatter. This + * control sets the flatness for chrome (V). + * + * By default, the encoder sets this minimum at half the available + * range. + */ + AV1E_SET_QM_V = 68, + + /* NOTE: enum 69 unused */ + + /*!\brief Codec control function to set a maximum number of tile groups, + * unsigned int parameter + * + * This will set the maximum number of tile groups. This will be + * overridden if an MTU size is set. The default value is 1. + */ + AV1E_SET_NUM_TG = 70, + + /*!\brief Codec control function to set an MTU size for a tile group, unsigned + * int parameter + * + * This will set the maximum number of bytes in a tile group. This can be + * exceeded only if a single tile is larger than this amount. + * + * By default, the value is 0, in which case a fixed number of tile groups + * is used. + */ + AV1E_SET_MTU = 71, + + /* NOTE: enum 72 unused */ + + /*!\brief Codec control function to enable/disable rectangular partitions, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_RECT_PARTITIONS = 73, + + /*!\brief Codec control function to enable/disable AB partitions, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_AB_PARTITIONS = 74, + + /*!\brief Codec control function to enable/disable 1:4 and 4:1 partitions, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_1TO4_PARTITIONS = 75, + + /*!\brief Codec control function to set min partition size, int parameter + * + * min_partition_size is applied to both width and height of the partition. 
+ * i.e, both width and height of a partition can not be smaller than + * the min_partition_size, except the partition at the picture boundary. + * + * Valid values: [4, 8, 16, 32, 64, 128]. The default value is 4 for + * 4x4. + */ + AV1E_SET_MIN_PARTITION_SIZE = 76, + + /*!\brief Codec control function to set max partition size, int parameter + * + * max_partition_size is applied to both width and height of the partition. + * i.e, both width and height of a partition can not be larger than + * the max_partition_size. + * + * Valid values:[4, 8, 16, 32, 64, 128] The default value is 128 for + * 128x128. + */ + AV1E_SET_MAX_PARTITION_SIZE = 77, + + /*!\brief Codec control function to turn on / off intra edge filter + * at sequence level, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_INTRA_EDGE_FILTER = 78, + + /*!\brief Codec control function to turn on / off frame order hint (int + * parameter). Affects: joint compound mode, motion field motion vector, + * ref frame sign bias + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_ORDER_HINT = 79, + + /*!\brief Codec control function to turn on / off 64-length transforms, int + * parameter + * + * This will enable or disable usage of length 64 transforms in any + * direction. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_TX64 = 80, + + /*!\brief Codec control function to turn on / off flip and identity + * transforms, int parameter + * + * This will enable or disable usage of flip and identity transform + * types in any direction. If enabled, this includes: + * - FLIPADST_DCT + * - DCT_FLIPADST + * - FLIPADST_FLIPADST + * - ADST_FLIPADST + * - FLIPADST_ADST + * - IDTX + * - V_DCT + * - H_DCT + * - V_ADST + * - H_ADST + * - V_FLIPADST + * - H_FLIPADST + * + * Valid values: + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_FLIP_IDTX = 81, + + /* Note: enum value 82 unused */ + + /*!\brief Codec control function to turn on / off dist-wtd compound mode + * at sequence level, int parameter + * + * This will enable or disable distance-weighted compound mode. + * \attention If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced + * to 0. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_DIST_WTD_COMP = 83, + + /*!\brief Codec control function to turn on / off ref frame mvs (mfmv) usage + * at sequence level, int parameter + * + * \attention If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced + * to 0. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_REF_FRAME_MVS = 84, + + /*!\brief Codec control function to set temporal mv prediction + * enabling/disabling at frame level, int parameter + * + * \attention If AV1E_SET_ENABLE_REF_FRAME_MVS is 0, then this flag is + * forced to 0. 
+ * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ALLOW_REF_FRAME_MVS = 85, + + /*!\brief Codec control function to turn on / off dual interpolation filter + * for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable + */ + AV1E_SET_ENABLE_DUAL_FILTER = 86, + + /*!\brief Codec control function to turn on / off delta quantization in chroma + * planes usage for a sequence, int parameter + * + * - 0 = disable (default) + * - 1 = enable + */ + AV1E_SET_ENABLE_CHROMA_DELTAQ = 87, + + /*!\brief Codec control function to turn on / off masked compound usage + * (wedge and diff-wtd compound modes) for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_MASKED_COMP = 88, + + /*!\brief Codec control function to turn on / off one sided compound usage + * for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_ONESIDED_COMP = 89, + + /*!\brief Codec control function to turn on / off interintra compound + * for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_INTERINTRA_COMP = 90, + + /*!\brief Codec control function to turn on / off smooth inter-intra + * mode for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_SMOOTH_INTERINTRA = 91, + + /*!\brief Codec control function to turn on / off difference weighted + * compound, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_DIFF_WTD_COMP = 92, + + /*!\brief Codec control function to turn on / off interinter wedge + * compound, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_INTERINTER_WEDGE = 93, + + /*!\brief Codec control function to turn on / off interintra wedge + * compound, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_INTERINTRA_WEDGE = 94, + + /*!\brief Codec control function to turn on / off global motion usage + * for a sequence, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_GLOBAL_MOTION = 95, + + /*!\brief Codec control function to turn on / off warped motion usage + * at sequence level, int parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_WARPED_MOTION = 96, + + /*!\brief Codec control function to turn on / off warped motion usage + * at frame level, int parameter + * + * \attention If AV1E_SET_ENABLE_WARPED_MOTION is 0, then this flag is + * forced to 0. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ALLOW_WARPED_MOTION = 97, + + /*!\brief Codec control function to turn on / off filter intra usage at + * sequence level, int parameter + * + * \attention If AV1E_SET_ENABLE_FILTER_INTRA is 0, then this flag is + * forced to 0. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_FILTER_INTRA = 98, + + /*!\brief Codec control function to turn on / off smooth intra modes usage, + * int parameter + * + * This will enable or disable usage of smooth, smooth_h and smooth_v intra + * modes. 
+ * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_SMOOTH_INTRA = 99, + + /*!\brief Codec control function to turn on / off Paeth intra mode usage, int + * parameter + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_PAETH_INTRA = 100, + + /*!\brief Codec control function to turn on / off CFL uv intra mode usage, int + * parameter + * + * This will enable or disable usage of chroma-from-luma intra mode. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_CFL_INTRA = 101, + + /*!\brief Codec control function to turn on / off frame superresolution, int + * parameter + * + * \attention If AV1E_SET_ENABLE_SUPERRES is 0, then this flag is forced to 0. + * + * - 0 = disable + * - 1 = enable (default) + */ + AV1E_SET_ENABLE_SUPERRES = 102, + + /*!\brief Codec control function to turn on / off overlay frames for + * filtered ALTREF frames, int parameter + * + * This will enable or disable coding of overlay frames for filtered ALTREF + * frames. When set to 0, overlay frames are not used but show existing frame + * is used to display the filtered ALTREF frame as is. As a result the decoded + * frame rate remains the same as the display frame rate. The default is 1. + */ + AV1E_SET_ENABLE_OVERLAY = 103, + + /*!\brief Codec control function to turn on/off palette mode, int parameter */ + AV1E_SET_ENABLE_PALETTE = 104, + + /*!\brief Codec control function to turn on/off intra block copy mode, int + parameter */ + AV1E_SET_ENABLE_INTRABC = 105, + + /*!\brief Codec control function to turn on/off intra angle delta, int + parameter */ + AV1E_SET_ENABLE_ANGLE_DELTA = 106, + + /*!\brief Codec control function to set the delta q mode, unsigned int + * parameter + * + * AV1 supports a delta q mode feature, that allows modulating q per + * superblock. + * + * - 0 = deltaq signaling off + * - 1 = use modulation to maximize objective quality (default) + * - 2 = use modulation to maximize perceptual quality + */ + AV1E_SET_DELTAQ_MODE = 107, + + /*!\brief Codec control function to turn on/off loopfilter modulation + * when delta q modulation is enabled, unsigned int parameter. + * + * \attention AV1 only supports loopfilter modulation when delta q + * modulation is enabled as well. + */ + AV1E_SET_DELTALF_MODE = 108, + + /*!\brief Codec control function to set the single tile decoding mode, + * unsigned int parameter + * + * \attention Only applicable if large scale tiling is on. + * + * - 0 = single tile decoding is off + * - 1 = single tile decoding is on (default) + */ + AV1E_SET_SINGLE_TILE_DECODING = 109, + + /*!\brief Codec control function to enable the extreme motion vector unit + * test, unsigned int parameter + * + * - 0 = off + * - 1 = MAX_EXTREME_MV + * - 2 = MIN_EXTREME_MV + * + * \note This is only used in motion vector unit test. + */ + AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST = 110, + + /*!\brief Codec control function to signal picture timing info in the + * bitstream, aom_timing_info_type_t parameter. Default is + * AOM_TIMING_UNSPECIFIED. 
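+ *
+ * For example (a non-normative sketch, assuming an initialized encoder
+ * ctx):
+ * \code
+ * aom_codec_control(&ctx, AV1E_SET_TIMING_INFO_TYPE, AOM_TIMING_EQUAL);
+ * \endcode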
+ */ + AV1E_SET_TIMING_INFO_TYPE = 111, + + /*!\brief Codec control function to add film grain parameters (one of several + * preset types) info in the bitstream, int parameter + * + Valid range: 0..16, 0 is unknown, 1..16 are test vectors + */ + AV1E_SET_FILM_GRAIN_TEST_VECTOR = 112, + + /*!\brief Codec control function to set the path to the film grain parameters, + * const char* parameter + */ + AV1E_SET_FILM_GRAIN_TABLE = 113, + + /*!\brief Sets the noise level, int parameter */ + AV1E_SET_DENOISE_NOISE_LEVEL = 114, + + /*!\brief Sets the denoisers block size, unsigned int parameter */ + AV1E_SET_DENOISE_BLOCK_SIZE = 115, + + /*!\brief Sets the chroma subsampling x value, unsigned int parameter */ + AV1E_SET_CHROMA_SUBSAMPLING_X = 116, + + /*!\brief Sets the chroma subsampling y value, unsigned int parameter */ + AV1E_SET_CHROMA_SUBSAMPLING_Y = 117, + + /*!\brief Control to use a reduced tx type set, int parameter */ + AV1E_SET_REDUCED_TX_TYPE_SET = 118, + + /*!\brief Control to use dct only for intra modes, int parameter */ + AV1E_SET_INTRA_DCT_ONLY = 119, + + /*!\brief Control to use dct only for inter modes, int parameter */ + AV1E_SET_INTER_DCT_ONLY = 120, + + /*!\brief Control to use default tx type only for intra modes, int parameter + */ + AV1E_SET_INTRA_DEFAULT_TX_ONLY = 121, + + /*!\brief Control to use adaptive quantize_b, int parameter */ + AV1E_SET_QUANT_B_ADAPT = 122, + + /*!\brief Control to select maximum height for the GF group pyramid structure, + * unsigned int parameter + * + * Valid range: 0..4 + */ + AV1E_SET_GF_MAX_PYRAMID_HEIGHT = 123, + + /*!\brief Control to select maximum reference frames allowed per frame, int + * parameter + * + * Valid range: 3..7 + */ + AV1E_SET_MAX_REFERENCE_FRAMES = 124, + + /*!\brief Control to use reduced set of single and compound references, int + parameter */ + AV1E_SET_REDUCED_REFERENCE_SET = 125, + + /* NOTE: enums 126-139 unused */ + /* NOTE: Need a gap in enum values to avoud conflict with 128, 129, 130 */ + + /*!\brief Control to set frequency of the cost updates for coefficients, + * unsigned int parameter + * + * - 0 = update at SB level (default) + * - 1 = update at SB row level in tile + * - 2 = update at tile level + * - 3 = turn off + */ + AV1E_SET_COEFF_COST_UPD_FREQ = 140, + + /*!\brief Control to set frequency of the cost updates for mode, unsigned int + * parameter + * + * - 0 = update at SB level (default) + * - 1 = update at SB row level in tile + * - 2 = update at tile level + * - 3 = turn off + */ + AV1E_SET_MODE_COST_UPD_FREQ = 141, + + /*!\brief Control to set frequency of the cost updates for motion vectors, + * unsigned int parameter + * + * - 0 = update at SB level (default) + * - 1 = update at SB row level in tile + * - 2 = update at tile level + * - 3 = turn off + */ + AV1E_SET_MV_COST_UPD_FREQ = 142, + + /*!\brief Control to set bit mask that specifies which tier each of the 32 + * possible operating points conforms to, unsigned int parameter + * + * - 0 = main tier (default) + * - 1 = high tier + */ + AV1E_SET_TIER_MASK = 143, + + /*!\brief Control to set minimum compression ratio, unsigned int parameter + * Take integer values. If non-zero, encoder will try to keep the compression + * ratio of each frame to be higher than the given value divided by 100. + * E.g. 850 means minimum compression ratio of 8.5. 
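+ *
+ * A non-normative sketch (assuming an initialized encoder ctx) requesting
+ * at least an 8.5:1 compression ratio per frame:
+ * \code
+ * aom_codec_control(&ctx, AV1E_SET_MIN_CR, 850);
+ * \endcode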
+ */ + AV1E_SET_MIN_CR = 144, + + /* NOTE: enums 145-149 unused */ + + /*!\brief Codec control function to set the layer id, aom_svc_layer_id_t* + * parameter + */ + AV1E_SET_SVC_LAYER_ID = 150, + + /*!\brief Codec control function to set SVC paramaeters, aom_svc_params_t* + * parameter + */ + AV1E_SET_SVC_PARAMS = 151, + + /*!\brief Codec control function to set reference frame config: + * the ref_idx and the refresh flags for each buffer slot. + * aom_svc_ref_frame_config_t* parameter + */ + AV1E_SET_SVC_REF_FRAME_CONFIG = 152, + + /*!\brief Codec control function to set the path to the VMAF model used when + * tuning the encoder for VMAF, const char* parameter + */ + AV1E_SET_VMAF_MODEL_PATH = 153, + + /*!\brief Codec control function to enable EXT_TILE_DEBUG in AV1 encoder, + * unsigned int parameter + * + * - 0 = disable (default) + * - 1 = enable + * + * \note This is only used in lightfield example test. + */ + AV1E_ENABLE_EXT_TILE_DEBUG = 154, + + /*!\brief Codec control function to enable the superblock multipass unit test + * in AV1 to ensure that the encoder does not leak state between different + * passes. unsigned int parameter. + * + * - 0 = disable (default) + * - 1 = enable + * + * \note This is only used in sb_multipass unit test. + */ + AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST = 155, + + /*!\brief Control to select minimum height for the GF group pyramid structure, + * unsigned int parameter + * + * Valid values: 0..4 + */ + AV1E_SET_GF_MIN_PYRAMID_HEIGHT = 156, +}; + +/*!\brief aom 1-D scaling mode + * + * This set of constants define 1-D aom scaling modes + */ +typedef enum aom_scaling_mode_1d { + AOME_NORMAL = 0, + AOME_FOURFIVE = 1, + AOME_THREEFIVE = 2, + AOME_ONETWO = 3 +} AOM_SCALING_MODE; + +/*!\brief Max number of segments + * + * This is the limit of number of segments allowed within a frame. + * + * Currently same as "MAX_SEGMENTS" in AV1, the maximum that AV1 supports. + * + */ +#define AOM_MAX_SEGMENTS 8 + +/*!\brief aom region of interest map + * + * These defines the data structures for the region of interest map + * + * TODO(yaowu): create a unit test for ROI map related APIs + * + */ +typedef struct aom_roi_map { + /*! An id between 0 and 7 for each 8x8 region within a frame. */ + unsigned char *roi_map; + unsigned int rows; /**< Number of rows. */ + unsigned int cols; /**< Number of columns. */ + int delta_q[AOM_MAX_SEGMENTS]; /**< Quantizer deltas. */ + int delta_lf[AOM_MAX_SEGMENTS]; /**< Loop filter deltas. */ + /*! Static breakout threshold for each segment. 
*/
+ unsigned int static_threshold[AOM_MAX_SEGMENTS];
+} aom_roi_map_t;
+
+/*!\brief aom active region map
+ *
+ * This defines the data structure for the active region map
+ *
+ */
+
+typedef struct aom_active_map {
+ /*!\brief specify on (1) or off (0) for each 16x16 region within a frame */
+ unsigned char *active_map;
+ unsigned int rows; /**< number of rows */
+ unsigned int cols; /**< number of cols */
+} aom_active_map_t;
+
+/*!\brief aom image scaling mode
+ *
+ * This defines the data structure for image scaling mode
+ *
+ */
+typedef struct aom_scaling_mode {
+ AOM_SCALING_MODE h_scaling_mode; /**< horizontal scaling mode */
+ AOM_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */
+} aom_scaling_mode_t;
+
+/*!\brief AV1 encoder content type */
+typedef enum {
+ AOM_CONTENT_DEFAULT,
+ AOM_CONTENT_SCREEN,
+ AOM_CONTENT_INVALID
+} aom_tune_content;
+
+/*!\brief AV1 encoder timing info type signaling */
+typedef enum {
+ AOM_TIMING_UNSPECIFIED,
+ AOM_TIMING_EQUAL,
+ AOM_TIMING_DEC_MODEL
+} aom_timing_info_type_t;
+
+/*!\brief Model tuning parameters
+ *
+ * Changes the encoder to tune for certain types of input material.
+ *
+ */
+typedef enum {
+ AOM_TUNE_PSNR = 0,
+ AOM_TUNE_SSIM = 1,
+ /* NOTE: enums 2 and 3 unused */
+ AOM_TUNE_VMAF_WITH_PREPROCESSING = 4,
+ AOM_TUNE_VMAF_WITHOUT_PREPROCESSING = 5,
+ AOM_TUNE_VMAF_MAX_GAIN = 6
+} aom_tune_metric;
+
+#define AOM_MAX_LAYERS 32 /**< Max number of layers */
+#define AOM_MAX_SS_LAYERS 4 /**< Max number of spatial layers */
+#define AOM_MAX_TS_LAYERS 8 /**< Max number of temporal layers */
+
+/*!\brief Struct for spatial and temporal layer ID */
+typedef struct aom_svc_layer_id {
+ int spatial_layer_id; /**< Spatial layer ID */
+ int temporal_layer_id; /**< Temporal layer ID */
+} aom_svc_layer_id_t;
+
+/*!\brief Parameter type for SVC */
+typedef struct aom_svc_params {
+ int number_spatial_layers; /**< Number of spatial layers */
+ int number_temporal_layers; /**< Number of temporal layers */
+ int max_quantizers[AOM_MAX_LAYERS]; /**< Max Q for each layer */
+ int min_quantizers[AOM_MAX_LAYERS]; /**< Min Q for each layer */
+ int scaling_factor_num[AOM_MAX_SS_LAYERS]; /**< Scaling factor-numerator */
+ int scaling_factor_den[AOM_MAX_SS_LAYERS]; /**< Scaling factor-denominator */
+ /*! Target bitrate for each layer */
+ int layer_target_bitrate[AOM_MAX_LAYERS];
+ /*! Frame rate factor for each temporal layer */
+ int framerate_factor[AOM_MAX_TS_LAYERS];
+} aom_svc_params_t;
+
+/*!\brief Parameters for setting ref frame config */
+typedef struct aom_svc_ref_frame_config {
+ // 7 references: LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2),
+ // GOLDEN_FRAME(3), BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
+ int reference[7]; /**< Reference flag for each of the 7 references. */
+ /*! Buffer slot index for each of 7 references. */
+ int ref_idx[7];
+ int refresh[8]; /**< Refresh flag for each of the 8 slots. */
+} aom_svc_ref_frame_config_t;
+
+/*!\cond */
+/*!\brief Encoder control function parameter type
+ *
+ * Defines the data types that AOME/AV1E control functions take.
+ *
+ * \note Additional common controls are defined in aom.h.
+ *
+ * \note For each control ID "X", a macro-define of
+ * AOM_CTRL_X is provided. It is used at compile time to determine
+ * if the control ID is supported by the available libaom library,
+ * when the libaom version cannot be controlled.
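+ *
+ * For example (a non-normative sketch), code built against multiple libaom
+ * versions can guard a control on its macro:
+ * \code
+ * #ifdef AOM_CTRL_AV1E_SET_ROW_MT
+ *   aom_codec_control(&ctx, AV1E_SET_ROW_MT, 1);
+ * #endif
+ * \endcode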
+ */ +AOM_CTRL_USE_TYPE(AOME_USE_REFERENCE, int) +#define AOM_CTRL_AOME_USE_REFERENCE + +AOM_CTRL_USE_TYPE(AOME_SET_ROI_MAP, aom_roi_map_t *) +#define AOM_CTRL_AOME_SET_ROI_MAP + +AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *) +#define AOM_CTRL_AOME_SET_ACTIVEMAP + +AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *) +#define AOM_CTRL_AOME_SET_SCALEMODE + +AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, unsigned int) +#define AOM_CTRL_AOME_SET_SPATIAL_LAYER_ID + +AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int) +#define AOM_CTRL_AOME_SET_CPUUSED + +AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int) +#define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF + +AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOBWDREF, unsigned int) +#define AOM_CTRL_AOME_SET_ENABLEAUTOBWDREF + +AOM_CTRL_USE_TYPE(AOME_SET_SHARPNESS, unsigned int) +#define AOM_CTRL_AOME_SET_SHARPNESS + +AOM_CTRL_USE_TYPE(AOME_SET_STATIC_THRESHOLD, unsigned int) +#define AOM_CTRL_AOME_SET_STATIC_THRESHOLD + +AOM_CTRL_USE_TYPE(AOME_SET_ARNR_MAXFRAMES, unsigned int) +#define AOM_CTRL_AOME_SET_ARNR_MAXFRAMES + +AOM_CTRL_USE_TYPE(AOME_SET_ARNR_STRENGTH, unsigned int) +#define AOM_CTRL_AOME_SET_ARNR_STRENGTH + +AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */ +#define AOM_CTRL_AOME_SET_TUNING + +AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int) +#define AOM_CTRL_AOME_SET_CQ_LEVEL + +AOM_CTRL_USE_TYPE(AV1E_SET_ROW_MT, unsigned int) +#define AOM_CTRL_AV1E_SET_ROW_MT + +AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, unsigned int) +#define AOM_CTRL_AV1E_SET_TILE_COLUMNS + +AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, unsigned int) +#define AOM_CTRL_AV1E_SET_TILE_ROWS + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TPL_MODEL, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_TPL_MODEL + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_KEYFRAME_FILTERING, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_KEYFRAME_FILTERING + +AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER, int *) +#define AOM_CTRL_AOME_GET_LAST_QUANTIZER + +AOM_CTRL_USE_TYPE(AOME_GET_LAST_QUANTIZER_64, int *) +#define AOM_CTRL_AOME_GET_LAST_QUANTIZER_64 + +AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTRA_BITRATE_PCT, unsigned int) +#define AOM_CTRL_AOME_SET_MAX_INTRA_BITRATE_PCT + +AOM_CTRL_USE_TYPE(AOME_SET_MAX_INTER_BITRATE_PCT, unsigned int) +#define AOM_CTRL_AOME_SET_MAX_INTER_BITRATE_PCT + +AOM_CTRL_USE_TYPE(AOME_SET_NUMBER_SPATIAL_LAYERS, int) +#define AOME_CTRL_AOME_SET_NUMBER_SPATIAL_LAYERS + +AOM_CTRL_USE_TYPE(AV1E_SET_GF_CBR_BOOST_PCT, unsigned int) +#define AOM_CTRL_AV1E_SET_GF_CBR_BOOST_PCT + +AOM_CTRL_USE_TYPE(AV1E_SET_LOSSLESS, unsigned int) +#define AOM_CTRL_AV1E_SET_LOSSLESS + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CDEF, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_CDEF + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RESTORATION, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_RESTORATION + +AOM_CTRL_USE_TYPE(AV1E_SET_FORCE_VIDEO_MODE, unsigned int) +#define AOM_CTRL_AV1E_SET_FORCE_VIDEO_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_OBMC, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_OBMC + +AOM_CTRL_USE_TYPE(AV1E_SET_DISABLE_TRELLIS_QUANT, unsigned int) +#define AOM_CTRL_AV1E_SET_DISABLE_TRELLIS_QUANT + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_QM, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_QM + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_8X8, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_DIST_8X8 + +AOM_CTRL_USE_TYPE(AV1E_SET_QM_MIN, unsigned int) +#define AOM_CTRL_AV1E_SET_QM_MIN + +AOM_CTRL_USE_TYPE(AV1E_SET_QM_MAX, unsigned int) +#define AOM_CTRL_AV1E_SET_QM_MAX + +AOM_CTRL_USE_TYPE(AV1E_SET_QM_Y, unsigned int) +#define 
AOM_CTRL_AV1E_SET_QM_Y + +AOM_CTRL_USE_TYPE(AV1E_SET_QM_U, unsigned int) +#define AOM_CTRL_AV1E_SET_QM_U + +AOM_CTRL_USE_TYPE(AV1E_SET_QM_V, unsigned int) +#define AOM_CTRL_AV1E_SET_QM_V + +AOM_CTRL_USE_TYPE(AV1E_SET_NUM_TG, unsigned int) +#define AOM_CTRL_AV1E_SET_NUM_TG + +AOM_CTRL_USE_TYPE(AV1E_SET_MTU, unsigned int) +#define AOM_CTRL_AV1E_SET_MTU + +AOM_CTRL_USE_TYPE(AV1E_SET_TIMING_INFO_TYPE, int) /* aom_timing_info_type_t */ +#define AOM_CTRL_AV1E_SET_TIMING_INFO_TYPE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RECT_PARTITIONS, int) +#define AOM_CTRL_AV1E_SET_ENABLE_RECT_PARTITIONS + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_AB_PARTITIONS, int) +#define AOM_CTRL_AV1E_SET_ENABLE_AB_PARTITIONS + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_1TO4_PARTITIONS, int) +#define AOM_CTRL_AV1E_SET_ENABLE_1TO4_PARTITIONS + +AOM_CTRL_USE_TYPE(AV1E_SET_MIN_PARTITION_SIZE, int) +#define AOM_CTRL_AV1E_SET_MIN_PARTITION_SIZE + +AOM_CTRL_USE_TYPE(AV1E_SET_MAX_PARTITION_SIZE, int) +#define AOM_CTRL_AV1E_SET_MAX_PARTITION_SIZE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTRA_EDGE_FILTER + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ORDER_HINT, int) +#define AOM_CTRL_AV1E_SET_ENABLE_ORDER_HINT + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TX64, int) +#define AOM_CTRL_AV1E_SET_ENABLE_TX64 + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FLIP_IDTX, int) +#define AOM_CTRL_AV1E_SET_ENABLE_FLIP_IDTX + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_WTD_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_DIST_WTD_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_REF_FRAME_MVS, int) +#define AOM_CTRL_AV1E_SET_ENABLE_REF_FRAME_MVS + +AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_REF_FRAME_MVS, int) +#define AOM_CTRL_AV1E_SET_ALLOW_REF_FRAME_MVS + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DUAL_FILTER, int) +#define AOM_CTRL_AV1E_SET_ENABLE_DUAL_FILTER + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CHROMA_DELTAQ, int) +#define AOM_CTRL_AV1E_SET_ENABLE_CHROMA_DELTAQ + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_MASKED_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_MASKED_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ONESIDED_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_ONESIDED_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTRA_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTRA_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_SMOOTH_INTERINTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIFF_WTD_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_DIFF_WTD_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTER_WEDGE, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTER_WEDGE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTRA_WEDGE, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTRA_WEDGE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_GLOBAL_MOTION, int) +#define AOM_CTRL_AV1E_SET_ENABLE_GLOBAL_MOTION + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_WARPED_MOTION, int) +#define AOM_CTRL_AV1E_SET_ENABLE_WARPED_MOTION + +AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_WARPED_MOTION, int) +#define AOM_CTRL_AV1E_SET_ALLOW_WARPED_MOTION + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FILTER_INTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_FILTER_INTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SMOOTH_INTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_SMOOTH_INTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PAETH_INTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_PAETH_INTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CFL_INTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_CFL_INTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SUPERRES, int) +#define AOM_CTRL_AV1E_SET_ENABLE_SUPERRES + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_OVERLAY, int) +#define 
AOM_CTRL_AV1E_SET_ENABLE_OVERLAY + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PALETTE, int) +#define AOM_CTRL_AV1E_SET_ENABLE_PALETTE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRABC, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTRABC + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ANGLE_DELTA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_ANGLE_DELTA + +AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int) +#define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING + +AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, int) +#define AOM_CTRL_AV1E_SET_ERROR_RESILIENT_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, int) +#define AOM_CTRL_AV1E_SET_S_FRAME_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int) +#define AOM_CTRL_AV1E_SET_AQ_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_DELTAQ_MODE, unsigned int) +#define AOM_CTRL_AV1E_SET_DELTAQ_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_DELTALF_MODE, unsigned int) +#define AOM_CTRL_AV1E_SET_DELTALF_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PERIODIC_BOOST, unsigned int) +#define AOM_CTRL_AV1E_SET_FRAME_PERIODIC_BOOST + +AOM_CTRL_USE_TYPE(AV1E_SET_NOISE_SENSITIVITY, unsigned int) +#define AOM_CTRL_AV1E_SET_NOISE_SENSITIVITY + +AOM_CTRL_USE_TYPE(AV1E_SET_TUNE_CONTENT, int) /* aom_tune_content */ +#define AOM_CTRL_AV1E_SET_TUNE_CONTENT + +AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_PRIMARIES, int) +#define AOM_CTRL_AV1E_SET_COLOR_PRIMARIES + +AOM_CTRL_USE_TYPE(AV1E_SET_TRANSFER_CHARACTERISTICS, int) +#define AOM_CTRL_AV1E_SET_TRANSFER_CHARACTERISTICS + +AOM_CTRL_USE_TYPE(AV1E_SET_MATRIX_COEFFICIENTS, int) +#define AOM_CTRL_AV1E_SET_MATRIX_COEFFICIENTS + +AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SAMPLE_POSITION, int) +#define AOM_CTRL_AV1E_SET_CHROMA_SAMPLE_POSITION + +AOM_CTRL_USE_TYPE(AV1E_SET_MIN_GF_INTERVAL, unsigned int) +#define AOM_CTRL_AV1E_SET_MIN_GF_INTERVAL + +AOM_CTRL_USE_TYPE(AV1E_SET_MAX_GF_INTERVAL, unsigned int) +#define AOM_CTRL_AV1E_SET_MAX_GF_INTERVAL + +AOM_CTRL_USE_TYPE(AV1E_GET_ACTIVEMAP, aom_active_map_t *) +#define AOM_CTRL_AV1E_GET_ACTIVEMAP + +AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int) +#define AOM_CTRL_AV1E_SET_COLOR_RANGE + +#define AOM_CTRL_AV1E_SET_RENDER_SIZE +AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *) + +AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int) +#define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE + +AOM_CTRL_USE_TYPE(AV1E_GET_SEQ_LEVEL_IDX, int *) +#define AOM_CTRL_AV1E_GET_SEQ_LEVEL_IDX + +AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int) +#define AOM_CTRL_AV1E_SET_SINGLE_TILE_DECODING + +AOM_CTRL_USE_TYPE(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) +#define AOM_CTRL_AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST + +AOM_CTRL_USE_TYPE(AV1E_ENABLE_EXT_TILE_DEBUG, unsigned int) +#define AOM_CTRL_AV1E_ENABLE_EXT_TILE_DEBUG + +AOM_CTRL_USE_TYPE(AV1E_SET_VMAF_MODEL_PATH, const char *) +#define AOM_CTRL_AV1E_SET_VMAF_MODEL_PATH + +AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, int) +#define AOM_CTRL_AV1E_SET_FILM_GRAIN_TEST_VECTOR + +AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TABLE, const char *) +#define AOM_CTRL_AV1E_SET_FILM_GRAIN_TABLE + +AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, unsigned int) +#define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE + +AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_NOISE_LEVEL, int) +#define AOM_CTRL_AV1E_SET_DENOISE_NOISE_LEVEL + +AOM_CTRL_USE_TYPE(AV1E_SET_DENOISE_BLOCK_SIZE, unsigned int) +#define AOM_CTRL_AV1E_SET_DENOISE_BLOCK_SIZE + +AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_X, unsigned int) +#define AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_X + +AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_Y, unsigned int) +#define 
AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_Y + +AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_TX_TYPE_SET, int) +#define AOM_CTRL_AV1E_SET_REDUCED_TX_TYPE_SET + +AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DCT_ONLY, int) +#define AOM_CTRL_AV1E_SET_INTRA_DCT_ONLY + +AOM_CTRL_USE_TYPE(AV1E_SET_INTER_DCT_ONLY, int) +#define AOM_CTRL_AV1E_SET_INTER_DCT_ONLY + +AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DEFAULT_TX_ONLY, int) +#define AOM_CTRL_AV1E_SET_INTRA_DEFAULT_TX_ONLY + +AOM_CTRL_USE_TYPE(AV1E_SET_QUANT_B_ADAPT, int) +#define AOM_CTRL_AV1E_SET_QUANT_B_ADAPT + +AOM_CTRL_USE_TYPE(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, unsigned int) +#define AOM_CTRL_AV1E_SET_GF_MIN_PYRAMID_HEIGHT + +AOM_CTRL_USE_TYPE(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, unsigned int) +#define AOM_CTRL_AV1E_SET_GF_MAX_PYRAMID_HEIGHT + +AOM_CTRL_USE_TYPE(AV1E_SET_MAX_REFERENCE_FRAMES, int) +#define AOM_CTRL_AV1E_SET_MAX_REFERENCE_FRAMES + +AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_REFERENCE_SET, int) +#define AOM_CTRL_AV1E_SET_REDUCED_REFERENCE_SET + +AOM_CTRL_USE_TYPE(AV1E_SET_COEFF_COST_UPD_FREQ, unsigned int) +#define AOM_CTRL_AV1E_SET_COEFF_COST_UPD_FREQ + +AOM_CTRL_USE_TYPE(AV1E_SET_MODE_COST_UPD_FREQ, unsigned int) +#define AOM_CTRL_AV1E_SET_MODE_COST_UPD_FREQ + +AOM_CTRL_USE_TYPE(AV1E_SET_MV_COST_UPD_FREQ, unsigned int) +#define AOM_CTRL_AV1E_SET_MV_COST_UPD_FREQ + +AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_SEQ_LEVEL_IDX, int) +#define AOM_CTRL_AV1E_SET_TARGET_SEQ_LEVEL_IDX + +AOM_CTRL_USE_TYPE(AV1E_SET_TIER_MASK, unsigned int) +#define AOM_CTRL_AV1E_SET_TIER_MASK + +AOM_CTRL_USE_TYPE(AV1E_SET_MIN_CR, unsigned int) +#define AOM_CTRL_AV1E_SET_MIN_CR + +AOM_CTRL_USE_TYPE(AV1E_SET_SVC_LAYER_ID, aom_svc_layer_id_t *) +#define AOME_CTRL_AV1E_SET_SVC_LAYER_ID + +AOM_CTRL_USE_TYPE(AV1E_SET_SVC_PARAMS, aom_svc_params_t *) +#define AOME_CTRL_AV1E_SET_SVC_PARAMS + +AOM_CTRL_USE_TYPE(AV1E_SET_SVC_REF_FRAME_CONFIG, aom_svc_ref_frame_config_t *) +#define AOME_CTRL_AV1E_SET_SVC_REF_FRAME_CONFIG + +AOM_CTRL_USE_TYPE(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, unsigned int) +#define AOM_CTRL_AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST + +/*!\endcond */ +/*! @} - end defgroup aom_encoder */ +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOMCX_H_ diff --git a/libs/libaom/src/aom/aomdx.h b/libs/libaom/src/aom/aomdx.h new file mode 100644 index 000000000..8cd5de395 --- /dev/null +++ b/libs/libaom/src/aom/aomdx.h @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\defgroup aom_decoder AOMedia AOM/AV1 Decoder + * \ingroup aom + * + * @{ + */ +/*!\file + * \brief Provides definitions for using AOM or AV1 within the aom Decoder + * interface. + */ +#ifndef AOM_AOM_AOMDX_H_ +#define AOM_AOM_AOMDX_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Include controls common to both the encoder and decoder */ +#include "aom/aom.h" + +/*!\name Algorithm interface for AV1 + * + * This interface provides the capability to decode AV1 streams. 
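+ *
+ * A minimal decode sketch (non-normative; buf/buf_size are hypothetical and
+ * hold one compressed temporal unit):
+ * \code
+ * aom_codec_ctx_t ctx;
+ * aom_codec_dec_init(&ctx, aom_codec_av1_dx(), NULL, 0);
+ * aom_codec_decode(&ctx, buf, buf_size, NULL);
+ * aom_codec_iter_t iter = NULL;
+ * aom_image_t *img = aom_codec_get_frame(&ctx, &iter);
+ * \endcode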
+ * @{ + */ +extern aom_codec_iface_t aom_codec_av1_dx_algo; +extern aom_codec_iface_t *aom_codec_av1_dx(void); +/*!@} - end algorithm interface member group*/ + +/** Data structure that stores bit accounting for debug + */ +typedef struct Accounting Accounting; + +#ifndef AOM_INSPECTION_H_ +/** Callback that inspects decoder frame data. + */ +typedef void (*aom_inspect_cb)(void *decoder, void *ctx); + +#endif + +/*!\brief Structure to hold inspection callback and context. + * + * Defines a structure to hold the inspection callback function and calling + * context. + */ +typedef struct aom_inspect_init { + /*! Inspection callback. */ + aom_inspect_cb inspect_cb; + + /*! Inspection context. */ + void *inspect_ctx; +} aom_inspect_init; + +/*!\brief Structure to collect a buffer index when inspecting. + * + * Defines a structure to hold the buffer and return an index + * when calling decode from inspect. This enables us to decode + * non showable sub frames. + */ +typedef struct { + /*! Pointer for new position in compressed buffer after decoding 1 OBU. */ + const unsigned char *buf; + /*! Index into reference buffer array to see result of decoding 1 OBU. */ + int idx; + /*! Is a show existing frame. */ + int show_existing; +} Av1DecodeReturn; + +/*!\brief Structure to hold a tile's start address and size in the bitstream. + * + * Defines a structure to hold a tile's start address and size in the bitstream. + */ +typedef struct aom_tile_data { + /*! Tile data size. */ + size_t coded_tile_data_size; + /*! Tile's start address. */ + const void *coded_tile_data; + /*! Extra size information. */ + size_t extra_size; +} aom_tile_data; + +/*!\brief Structure to hold the external reference frame pointer. + * + * Define a structure to hold the external reference frame pointer. + */ +typedef struct av1_ext_ref_frame { + /*! Start pointer of external references. */ + aom_image_t *img; + /*! Number of available external references. */ + int num; +} av1_ext_ref_frame_t; + +/*!\enum aom_dec_control_id + * \brief AOM decoder control functions + * + * This set of macros define the control functions available for the AOM + * decoder interface. + * + * \sa #aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) + */ +enum aom_dec_control_id { + /*!\brief Codec control function to get info on which reference frames were + * updated by the last decode, int* parameter + */ + AOMD_GET_LAST_REF_UPDATES = AOM_DECODER_CTRL_ID_START, + + /*!\brief Codec control function to check if the indicated frame is + corrupted, int* parameter + */ + AOMD_GET_FRAME_CORRUPTED, + + /*!\brief Codec control function to get info on which reference frames were + * used by the last decode, int* parameter + */ + AOMD_GET_LAST_REF_USED, + + /*!\brief Codec control function to get the dimensions that the current + * frame is decoded at, int* parameter. This may be different to the + * intended display size for the frame as specified in the wrapper or frame + * header (see AV1D_GET_DISPLAY_SIZE). + */ + AV1D_GET_FRAME_SIZE, + + /*!\brief Codec control function to get the current frame's intended display + * dimensions (as specified in the wrapper or frame header), int* parameter. + * This may be different to the decoded dimensions of this frame (see + * AV1D_GET_FRAME_SIZE). 
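+ *
+ * For example (a non-normative sketch, after at least one frame has been
+ * decoded):
+ * \code
+ * int dims[2];  // receives width, height
+ * aom_codec_control(&ctx, AV1D_GET_DISPLAY_SIZE, dims);
+ * \endcode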
+ */
+  AV1D_GET_DISPLAY_SIZE,
+
+  /*!\brief Codec control function to get the bit depth of the stream,
+   * unsigned int* parameter
+   */
+  AV1D_GET_BIT_DEPTH,
+
+  /*!\brief Codec control function to get the image format of the stream,
+   * aom_img_fmt_t* parameter
+   */
+  AV1D_GET_IMG_FORMAT,
+
+  /*!\brief Codec control function to get the size of the tile, unsigned int*
+     parameter */
+  AV1D_GET_TILE_SIZE,
+
+  /*!\brief Codec control function to get the tile count in a tile list,
+   * unsigned int* parameter
+   */
+  AV1D_GET_TILE_COUNT,
+
+  /*!\brief Codec control function to set the byte alignment of the planes in
+   * the reference buffers, int parameter
+   *
+   * Valid values are powers of 2, from 32 to 1024. A value of 0 sets
+   * legacy alignment: the Y plane is aligned to 32 bytes, the U plane
+   * directly follows the Y plane, and the V plane directly follows the U
+   * plane. Default value is 0.
+   */
+  AV1_SET_BYTE_ALIGNMENT,
+
+  /*!\brief Codec control function to invert the tile decoding order so that
+   * tiles are decoded from right to left, int parameter
+   *
+   * The function is used in a test to confirm the decoding independence of
+   * tile columns. The function may also be used in applications where this
+   * decoding order is desired.
+   *
+   * TODO(yaowu): Rework the unit test that uses this control, and in a future
+   * release, this test-only control shall be removed.
+   */
+  AV1_INVERT_TILE_DECODE_ORDER,
+
+  /*!\brief Codec control function to set the skip loop filter flag, int
+   * parameter
+   *
+   * Valid values are integers. The decoder will skip the loop filter
+   * when its value is set to nonzero. If the loop filter is skipped, the
+   * decoder may accumulate decode artifacts. The default value is 0.
+   */
+  AV1_SET_SKIP_LOOP_FILTER,
+
+  /*!\brief Codec control function to retrieve a pointer to the Accounting
+   * struct, takes Accounting** as parameter
+   *
+   * If called before a frame has been decoded, this returns AOM_CODEC_ERROR.
+   * The caller should ensure that AOM_CODEC_OK is returned before attempting
+   * to dereference the Accounting pointer.
+   *
+   * \attention When compiled without --enable-accounting, this returns
+   * AOM_CODEC_INCAPABLE.
+   */
+  AV1_GET_ACCOUNTING,
+
+  /*!\brief Codec control function to get last decoded frame quantizer,
+   * int* parameter
+   *
+   * Returned value uses internal quantizer scale defined by the codec.
+   */
+  AOMD_GET_LAST_QUANTIZER,
+
+  /*!\brief Codec control function to set the range of tile decoding, int
+   * parameter
+   *
+   * A value greater than or equal to zero indicates that only the specific
+   * row/column is decoded. A value of -1 indicates that the whole row/column
+   * is decoded. As a special case, when both values are -1, the whole frame
+   * is decoded.
+   */
+  AV1_SET_DECODE_TILE_ROW,
+  AV1_SET_DECODE_TILE_COL,
+
+  /*!\brief Codec control function to set the tile coding mode, unsigned int
+   * parameter
+   *
+   * - 0 = tiles are coded in normal tile mode
+   * - 1 = tiles are coded in large-scale tile mode
+   */
+  AV1_SET_TILE_MODE,
+
+  /*!\brief Codec control function to get the frame header information of an
+   * encoded frame, aom_tile_data* parameter
+   */
+  AV1D_GET_FRAME_HEADER_INFO,
+
+  /*!\brief Codec control function to get the start address and size of a
+   * tile in the coded bitstream, aom_tile_data* parameter.
+   */
+  AV1D_GET_TILE_DATA,
+
+  /*!\brief Codec control function to set the external references' pointers in
+   * the decoder, av1_ext_ref_frame_t* parameter.
+   *
+   * This is used while decoding the tile list OBU in large-scale tile coding
+   * mode.
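+ *
+ * An illustrative sketch of the call (ext_imgs and num_ext are
+ * hypothetical, application-owned names):
+ *     <pre>
+ *     av1_ext_ref_frame_t refs = { ext_imgs, num_ext };
+ *     aom_codec_control(&ctx, AV1D_SET_EXT_REF_PTR, &refs);
+ *     </pre>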
+ */
+  AV1D_SET_EXT_REF_PTR,
+
+  /*!\brief Codec control function to enable the ext-tile software debug and
+   * testing code in the decoder, unsigned int parameter
+   */
+  AV1D_EXT_TILE_DEBUG,
+
+  /*!\brief Codec control function to enable the row based multi-threading of
+   * decoding, unsigned int parameter
+   *
+   * - 0 = disabled
+   * - 1 = enabled (default)
+   */
+  AV1D_SET_ROW_MT,
+
+  /*!\brief Codec control function to indicate whether the bitstream is in
+   * Annex B format, unsigned int parameter
+   */
+  AV1D_SET_IS_ANNEXB,
+
+  /*!\brief Codec control function to indicate which operating point to use,
+   * int parameter
+   *
+   * A scalable stream may define multiple operating points, each of which
+   * defines a set of temporal and spatial layers to be processed. The
+   * operating point index may take a value between 0 and
+   * operating_points_cnt_minus_1 (which is at most 31).
+   */
+  AV1D_SET_OPERATING_POINT,
+
+  /*!\brief Codec control function to indicate whether to output one frame per
+   * temporal unit (the default), or one frame per spatial layer, int parameter
+   *
+   * In a scalable stream, each temporal unit corresponds to a single "frame"
+   * of video, and within a temporal unit there may be multiple spatial layers
+   * with different versions of that frame.
+   * For video playback, only the highest-quality version (within the
+   * selected operating point) is needed, but for some use cases it is useful
+   * to have access to multiple versions of a frame when they are available.
+   */
+  AV1D_SET_OUTPUT_ALL_LAYERS,
+
+  /*!\brief Codec control function to set an aom_inspect_cb callback that is
+   * invoked each time a frame is decoded, aom_inspect_init* parameter
+   *
+   * \attention When compiled without --enable-inspection, this
+   * returns AOM_CODEC_INCAPABLE.
+   */
+  AV1_SET_INSPECTION_CALLBACK,
+
+  /*!\brief Codec control function to set the skip film grain flag, int
+   * parameter
+   *
+   * Valid values are integers. The decoder will skip the film grain when its
+   * value is set to nonzero. The default value is 0.
+   */
+  AV1D_SET_SKIP_FILM_GRAIN,
+
+  AOM_DECODER_CTRL_ID_MAX,
+};
+
+/*!\cond */
+/*!\brief AOM decoder control function parameter type
+ *
+ * Defines the data types that AOMD control functions take.
+ *
+ * \note Additional common controls are defined in aom.h.
+ *
+ * \note For each control ID "X", a macro-define of
+ * AOM_CTRL_X is provided. It is used at compile time to determine
+ * whether the control ID is supported by the libaom library being
+ * linked against, in cases where the libaom version cannot be
+ * controlled.
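+ * For example (a usage sketch, not part of this header), code that must
+ * build against multiple libaom versions can test for a control at
+ * compile time:
+ *     <pre>
+ *     #ifdef AOM_CTRL_AV1D_SET_ROW_MT
+ *     aom_codec_control(&ctx, AV1D_SET_ROW_MT, 1);
+ *     #endif
+ *     </pre>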
+ */ +AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_UPDATES, int *) +#define AOM_CTRL_AOMD_GET_LAST_REF_UPDATES + +AOM_CTRL_USE_TYPE(AOMD_GET_FRAME_CORRUPTED, int *) +#define AOM_CTRL_AOMD_GET_FRAME_CORRUPTED + +AOM_CTRL_USE_TYPE(AOMD_GET_LAST_REF_USED, int *) +#define AOM_CTRL_AOMD_GET_LAST_REF_USED + +AOM_CTRL_USE_TYPE(AOMD_GET_LAST_QUANTIZER, int *) +#define AOM_CTRL_AOMD_GET_LAST_QUANTIZER + +AOM_CTRL_USE_TYPE(AV1D_GET_DISPLAY_SIZE, int *) +#define AOM_CTRL_AV1D_GET_DISPLAY_SIZE + +AOM_CTRL_USE_TYPE(AV1D_GET_BIT_DEPTH, unsigned int *) +#define AOM_CTRL_AV1D_GET_BIT_DEPTH + +AOM_CTRL_USE_TYPE(AV1D_GET_IMG_FORMAT, aom_img_fmt_t *) +#define AOM_CTRL_AV1D_GET_IMG_FORMAT + +AOM_CTRL_USE_TYPE(AV1D_GET_TILE_SIZE, unsigned int *) +#define AOM_CTRL_AV1D_GET_TILE_SIZE + +AOM_CTRL_USE_TYPE(AV1D_GET_TILE_COUNT, unsigned int *) +#define AOM_CTRL_AV1D_GET_TILE_COUNT + +AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_SIZE, int *) +#define AOM_CTRL_AV1D_GET_FRAME_SIZE + +AOM_CTRL_USE_TYPE(AV1_INVERT_TILE_DECODE_ORDER, int) +#define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER + +AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **) +#define AOM_CTRL_AV1_GET_ACCOUNTING + +AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int) +#define AOM_CTRL_AV1_SET_DECODE_TILE_ROW + +AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int) +#define AOM_CTRL_AV1_SET_DECODE_TILE_COL + +AOM_CTRL_USE_TYPE(AV1_SET_TILE_MODE, unsigned int) +#define AOM_CTRL_AV1_SET_TILE_MODE + +AOM_CTRL_USE_TYPE(AV1D_GET_FRAME_HEADER_INFO, aom_tile_data *) +#define AOM_CTRL_AV1D_GET_FRAME_HEADER_INFO + +AOM_CTRL_USE_TYPE(AV1D_GET_TILE_DATA, aom_tile_data *) +#define AOM_CTRL_AV1D_GET_TILE_DATA + +AOM_CTRL_USE_TYPE(AV1D_SET_EXT_REF_PTR, av1_ext_ref_frame_t *) +#define AOM_CTRL_AV1D_SET_EXT_REF_PTR + +AOM_CTRL_USE_TYPE(AV1D_EXT_TILE_DEBUG, unsigned int) +#define AOM_CTRL_AV1D_EXT_TILE_DEBUG + +AOM_CTRL_USE_TYPE(AV1D_SET_ROW_MT, unsigned int) +#define AOM_CTRL_AV1D_SET_ROW_MT + +AOM_CTRL_USE_TYPE(AV1D_SET_SKIP_FILM_GRAIN, int) +#define AOM_CTRL_AV1D_SET_SKIP_FILM_GRAIN + +AOM_CTRL_USE_TYPE(AV1D_SET_IS_ANNEXB, unsigned int) +#define AOM_CTRL_AV1D_SET_IS_ANNEXB + +AOM_CTRL_USE_TYPE(AV1D_SET_OPERATING_POINT, int) +#define AOM_CTRL_AV1D_SET_OPERATING_POINT + +AOM_CTRL_USE_TYPE(AV1D_SET_OUTPUT_ALL_LAYERS, int) +#define AOM_CTRL_AV1D_SET_OUTPUT_ALL_LAYERS + +AOM_CTRL_USE_TYPE(AV1_SET_INSPECTION_CALLBACK, aom_inspect_init *) +#define AOM_CTRL_AV1_SET_INSPECTION_CALLBACK +/*!\endcond */ +/*! 
@} - end defgroup aom_decoder */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_AOMDX_H_ diff --git a/libs/libaom/src/aom/exports_com b/libs/libaom/src/aom/exports_com new file mode 100644 index 000000000..6f796f5db --- /dev/null +++ b/libs/libaom/src/aom/exports_com @@ -0,0 +1,41 @@ +text aom_codec_build_config +text aom_codec_control +text aom_codec_destroy +text aom_codec_err_to_string +text aom_codec_error +text aom_codec_error_detail +text aom_codec_get_caps +text aom_codec_iface_name +text aom_codec_version +text aom_codec_version_extra_str +text aom_codec_version_str +text aom_free +text aom_img_add_metadata +text aom_img_alloc +text aom_img_alloc_with_border +text aom_img_flip +text aom_img_free +text aom_img_get_metadata +text aom_img_metadata_array_free +text aom_img_metadata_array_alloc +text aom_img_metadata_free +text aom_img_metadata_alloc +text aom_img_num_metadata +text aom_img_plane_height +text aom_img_plane_width +text aom_img_remove_metadata +text aom_img_set_rect +text aom_img_wrap +text aom_malloc +text aom_rb_bytes_read +text aom_rb_read_bit +text aom_rb_read_literal +text aom_rb_read_uvlc +text aom_uleb_decode +text aom_uleb_encode +text aom_uleb_encode_fixed_size +text aom_uleb_size_in_bytes +text aom_wb_bytes_written +text aom_wb_write_bit +text aom_wb_write_literal +text aom_wb_write_unsigned_literal diff --git a/libs/libaom/src/aom/exports_dec b/libs/libaom/src/aom/exports_dec new file mode 100644 index 000000000..ffff023dd --- /dev/null +++ b/libs/libaom/src/aom/exports_dec @@ -0,0 +1,8 @@ +text aom_codec_dec_init_ver +text aom_codec_decode +text aom_codec_get_frame +text aom_codec_get_stream_info +text aom_codec_peek_stream_info +text aom_codec_set_frame_buffer_functions +text aom_obu_type_to_string +text aom_read_obu_header diff --git a/libs/libaom/src/aom/exports_enc b/libs/libaom/src/aom/exports_enc new file mode 100644 index 000000000..1473d9d2b --- /dev/null +++ b/libs/libaom/src/aom/exports_enc @@ -0,0 +1,17 @@ +text aom_codec_enc_config_default +text aom_codec_enc_config_set +text aom_codec_enc_init_ver +text aom_codec_encode +text aom_codec_get_cx_data +text aom_codec_get_global_headers +text aom_codec_get_preview_frame +text aom_codec_set_cx_data_buf +text aom_film_grain_table_append +text aom_film_grain_table_free +text aom_film_grain_table_write +text aom_flat_block_finder_init +text aom_flat_block_finder_run +text aom_noise_model_init +text aom_noise_model_get_grain_parameters +text aom_noise_model_save_latest +text aom_noise_model_update diff --git a/libs/libaom/src/aom/exports_test b/libs/libaom/src/aom/exports_test new file mode 100644 index 000000000..452a532ce --- /dev/null +++ b/libs/libaom/src/aom/exports_test @@ -0,0 +1,4 @@ +text aom_copy_metadata_to_frame_buffer +text aom_dsp_rtcd +text aom_remove_metadata_from_frame_buffer +text aom_scale_rtcd diff --git a/libs/libaom/src/aom/internal/aom_codec_internal.h b/libs/libaom/src/aom/internal/aom_codec_internal.h new file mode 100644 index 000000000..efe09acc9 --- /dev/null +++ b/libs/libaom/src/aom/internal/aom_codec_internal.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes the decoder algorithm interface for algorithm
+ * implementations.
+ *
+ * This file defines the private structures and data types that are only
+ * relevant to implementing an algorithm, as opposed to using it.
+ *
+ * To create a decoder algorithm class, an interface structure is put
+ * into the global namespace:
+ *     <pre>
+ *     my_codec.c:
+ *       aom_codec_iface_t my_codec = {
+ *           "My Codec v1.0",
+ *           AOM_CODEC_ALG_ABI_VERSION,
+ *           ...
+ *       };
+ *     </pre>
+ *
+ * An application instantiates a specific decoder instance by using
+ * aom_codec_init() and a pointer to the algorithm's interface structure:
+ *     <pre>
+ *     my_app.c:
+ *       extern aom_codec_iface_t my_codec;
+ *       {
+ *           aom_codec_ctx_t algo;
+ *           res = aom_codec_init(&algo, &my_codec);
+ *       }
+ *     </pre>
+ *
+ * Once initialized, the instance is managed using other functions from
+ * the aom_codec_* family.
+ */
+#ifndef AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#define AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
+#include "../aom_decoder.h"
+#include "../aom_encoder.h"
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields in structures.
+ */
+#define AOM_CODEC_INTERNAL_ABI_VERSION (7) /**<\hideinitializer*/
+
+typedef struct aom_codec_alg_priv aom_codec_alg_priv_t;
+
+/*!\brief init function pointer prototype
+ *
+ * Performs algorithm-specific initialization of the decoder context. This
+ * function is called by the generic aom_codec_init() wrapper function, so
+ * plugins implementing this interface may trust the input parameters to be
+ * properly initialized.
+ *
+ * \param[in] ctx  Pointer to this instance's context
+ * \retval #AOM_CODEC_OK
+ *     The input stream was recognized and decoder initialized.
+ * \retval #AOM_CODEC_MEM_ERROR
+ *     Memory operation failed.
+ */
+typedef aom_codec_err_t (*aom_codec_init_fn_t)(aom_codec_ctx_t *ctx);
+
+/*!\brief destroy function pointer prototype
+ *
+ * Performs algorithm-specific destruction of the decoder context. This
+ * function is called by the generic aom_codec_destroy() wrapper function,
+ * so plugins implementing this interface may trust the input parameters
+ * to be properly initialized.
+ *
+ * \param[in] ctx  Pointer to this instance's context
+ * \retval #AOM_CODEC_OK
+ *     The codec instance was destroyed.
+ * \retval #AOM_CODEC_MEM_ERROR
+ *     Memory operation failed.
+ */
+typedef aom_codec_err_t (*aom_codec_destroy_fn_t)(aom_codec_alg_priv_t *ctx);
+
+/*!\brief parse stream info function pointer prototype
+ *
+ * Performs high level parsing of the bitstream. This function is called by the
+ * generic aom_codec_peek_stream_info() wrapper function, so plugins
+ * implementing this interface may trust the input parameters to be properly
+ * initialized.
+ *
+ * \param[in]     data     Pointer to a block of data to parse
+ * \param[in]     data_sz  Size of the data buffer
+ * \param[in,out] si       Pointer to stream info to update. The is_annexb
+ *                         member \ref MUST be properly initialized. This
+ *                         function sets the rest of the members.
+ *
+ * \retval #AOM_CODEC_OK
+ *     Bitstream is parsable and stream information updated
+ */
+typedef aom_codec_err_t (*aom_codec_peek_si_fn_t)(const uint8_t *data,
+                                                  size_t data_sz,
+                                                  aom_codec_stream_info_t *si);
+
+/*!\brief Return information about the current stream.
+ *
+ * Returns information about the stream that has been parsed during decoding.
+ *
+ * \param[in]     ctx  Pointer to this instance's context
+ * \param[in,out] si   Pointer to stream info to update
+ *
+ * \retval #AOM_CODEC_OK
+ *     Bitstream is parsable and stream information updated
+ */
+typedef aom_codec_err_t (*aom_codec_get_si_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                 aom_codec_stream_info_t *si);
+
+/*!\brief control function pointer prototype
+ *
+ * This function is used to exchange algorithm specific data with the decoder
+ * instance. This can be used to implement features specific to a particular
+ * algorithm.
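+ *
+ * As a sketch of what an implementation might look like (the bit_depth
+ * field is hypothetical; real handlers unpack their own arguments from
+ * the va_list):
+ *     <pre>
+ *     static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
+ *                                               va_list args) {
+ *       unsigned int *const depth = va_arg(args, unsigned int *);
+ *       if (!depth) return AOM_CODEC_INVALID_PARAM;
+ *       *depth = ctx->bit_depth;  // hypothetical field of the private struct
+ *       return AOM_CODEC_OK;
+ *     }
+ *     </pre>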
+ *
+ * This function is called by the generic aom_codec_control() wrapper
+ * function, so plugins implementing this interface may trust the input
+ * parameters to be properly initialized. However, this interface does not
+ * provide type safety for the exchanged data or assign meanings to the
+ * control IDs. Those details should be specified in the algorithm's
+ * header file. In particular, the ctrl_id parameter is guaranteed to exist
+ * in the algorithm's control mapping table, and the data parameter may be
+ * NULL.
+ *
+ * \param[in]     ctx      Pointer to this instance's context
+ * \param[in]     ctrl_id  Algorithm specific control identifier
+ * \param[in,out] data     Data to exchange with algorithm instance.
+ *
+ * \retval #AOM_CODEC_OK
+ *     The internal state data was deserialized.
+ */
+typedef aom_codec_err_t (*aom_codec_control_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                  va_list ap);
+
+/*!\brief control function pointer mapping
+ *
+ * This structure stores the mapping between control identifiers and
+ * implementing functions. Each algorithm provides a list of these
+ * mappings. This list is searched by the aom_codec_control() wrapper
+ * function to determine which function to invoke. The special
+ * value {0, NULL} is used to indicate end-of-list, and must be
+ * present. The special value {0, <non-null>} can be used as a catch-all
+ * mapping. This implies that ctrl_id values chosen by the algorithm
+ * \ref MUST be non-zero.
+ */
+typedef const struct aom_codec_ctrl_fn_map {
+  int ctrl_id;
+  aom_codec_control_fn_t fn;
+} aom_codec_ctrl_fn_map_t;
+
+/*!\brief decode data function pointer prototype
+ *
+ * Processes a buffer of coded data. This function is called by the generic
+ * aom_codec_decode() wrapper function, so plugins implementing this interface
+ * may trust the input parameters to be properly initialized.
+ *
+ * \param[in] ctx      Pointer to this instance's context
+ * \param[in] data     Pointer to this block of new coded data.
+ * \param[in] data_sz  Size of the coded data, in bytes.
+ *
+ * \return Returns #AOM_CODEC_OK if the coded data was processed completely
+ *         and future pictures can be decoded without error. Otherwise,
+ *         see the descriptions of the other error codes in ::aom_codec_err_t
+ *         for recoverability capabilities.
+ */
+typedef aom_codec_err_t (*aom_codec_decode_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                 const uint8_t *data,
+                                                 size_t data_sz,
+                                                 void *user_priv);
+
+/*!\brief Decoded frames iterator
+ *
+ * Iterates over a list of the frames available for display. The iterator
+ * storage should be initialized to NULL to start the iteration. Iteration is
+ * complete when this function returns NULL.
+ *
+ * The list of available frames becomes valid upon completion of the
+ * aom_codec_decode call, and remains valid until the next call to
+ * aom_codec_decode.
+ *
+ * \param[in]     ctx   Pointer to this instance's context
+ * \param[in,out] iter  Iterator storage, initialized to NULL
+ *
+ * \return Returns a pointer to an image, if one is ready for display. Frames
+ *         produced will always be in PTS (presentation time stamp) order.
+ */
+typedef aom_image_t *(*aom_codec_get_frame_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                 aom_codec_iter_t *iter);
+
+/*!\brief Pass in external frame buffers for the decoder to use.
+ *
+ * Registers functions to be called when libaom needs a frame buffer
+ * to decode the current frame and a function to be called when libaom does
+ * not internally reference the frame buffer.
This set function must
+ * be called before the first call to decode, or libaom will assume the
+ * default behavior of allocating frame buffers internally.
+ *
+ * \param[in] ctx         Pointer to this instance's context
+ * \param[in] cb_get      Pointer to the get callback function
+ * \param[in] cb_release  Pointer to the release callback function
+ * \param[in] cb_priv     Callback's private data
+ *
+ * \retval #AOM_CODEC_OK
+ *     External frame buffers will be used by libaom.
+ * \retval #AOM_CODEC_INVALID_PARAM
+ *     One or more of the callbacks were NULL.
+ * \retval #AOM_CODEC_ERROR
+ *     Decoder context not initialized, or algorithm not capable of
+ *     using external frame buffers.
+ *
+ * \note
+ * When decoding AV1, the application may be required to pass in at least
+ * #AOM_MAXIMUM_WORK_BUFFERS external frame
+ * buffers.
+ */
+typedef aom_codec_err_t (*aom_codec_set_fb_fn_t)(
+    aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv);
+
+typedef aom_codec_err_t (*aom_codec_encode_fn_t)(aom_codec_alg_priv_t *ctx,
+                                                 const aom_image_t *img,
+                                                 aom_codec_pts_t pts,
+                                                 unsigned long duration,
+                                                 aom_enc_frame_flags_t flags);
+typedef const aom_codec_cx_pkt_t *(*aom_codec_get_cx_data_fn_t)(
+    aom_codec_alg_priv_t *ctx, aom_codec_iter_t *iter);
+
+typedef aom_codec_err_t (*aom_codec_enc_config_set_fn_t)(
+    aom_codec_alg_priv_t *ctx, const aom_codec_enc_cfg_t *cfg);
+typedef aom_fixed_buf_t *(*aom_codec_get_global_headers_fn_t)(
+    aom_codec_alg_priv_t *ctx);
+
+typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)(
+    aom_codec_alg_priv_t *ctx);
+
+/*!\brief Decoder algorithm interface
+ *
+ * All decoders \ref MUST expose a variable of this type.
+ */
+struct aom_codec_iface {
+  const char *name;                   /**< Identification String */
+  int abi_version;                    /**< Implemented ABI version */
+  aom_codec_caps_t caps;              /**< Decoder capabilities */
+  aom_codec_init_fn_t init;           /**< \copydoc ::aom_codec_init_fn_t */
+  aom_codec_destroy_fn_t destroy;     /**< \copydoc ::aom_codec_destroy_fn_t */
+  aom_codec_ctrl_fn_map_t *ctrl_maps; /**< \copydoc ::aom_codec_ctrl_fn_map_t */
+  struct aom_codec_dec_iface {
+    aom_codec_peek_si_fn_t peek_si; /**< \copydoc ::aom_codec_peek_si_fn_t */
+    aom_codec_get_si_fn_t get_si;   /**< \copydoc ::aom_codec_get_si_fn_t */
+    aom_codec_decode_fn_t decode;   /**< \copydoc ::aom_codec_decode_fn_t */
+    aom_codec_get_frame_fn_t
+        get_frame;                   /**< \copydoc ::aom_codec_get_frame_fn_t */
+    aom_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::aom_codec_set_fb_fn_t */
+  } dec;
+  struct aom_codec_enc_iface {
+    int cfg_count;
+    const aom_codec_enc_cfg_t *cfgs; /**< \copydoc ::aom_codec_enc_cfg_t */
+    aom_codec_encode_fn_t encode;    /**< \copydoc ::aom_codec_encode_fn_t */
+    aom_codec_get_cx_data_fn_t
+        get_cx_data; /**< \copydoc ::aom_codec_get_cx_data_fn_t */
+    aom_codec_enc_config_set_fn_t
+        cfg_set; /**< \copydoc ::aom_codec_enc_config_set_fn_t */
+    aom_codec_get_global_headers_fn_t
+        get_glob_hdrs; /**< \copydoc ::aom_codec_get_global_headers_fn_t */
+    aom_codec_get_preview_frame_fn_t
+        get_preview; /**< \copydoc ::aom_codec_get_preview_frame_fn_t */
+  } enc;
+};
+
+/*!\brief Instance private storage
+ *
+ * This structure is allocated by the algorithm's init function. It can be
+ * extended in one of two ways. First, a second, algorithm specific structure
+ * can be allocated and the priv member pointed to it.
Alternatively, this
+ * structure can be made the first member of the algorithm specific structure,
+ * and the pointer cast to the proper type.
+ */
+struct aom_codec_priv {
+  const char *err_detail;
+  aom_codec_flags_t init_flags;
+  struct {
+    aom_fixed_buf_t cx_data_dst_buf;
+    unsigned int cx_data_pad_before;
+    unsigned int cx_data_pad_after;
+    aom_codec_cx_pkt_t cx_data_pkt;
+  } enc;
+};
+
+#define CAST(id, arg) va_arg((arg), aom_codec_control_type_##id)
+
+/* CODEC_INTERFACE convenience macro
+ *
+ * By convention, each codec interface is a struct with extern linkage, where
+ * the symbol is suffixed with _algo. A getter function is also defined to
+ * return a pointer to the struct, since in some cases it's easier to work
+ * with text symbols than data symbols (see issue #169). This function has
+ * the same name as the struct, less the _algo suffix. The CODEC_INTERFACE
+ * macro is provided to define this getter function automatically.
+ */
+#define CODEC_INTERFACE(id)                          \
+  aom_codec_iface_t *id(void) { return &id##_algo; } \
+  aom_codec_iface_t id##_algo
+
+/* Internal Utility Functions
+ *
+ * The following functions are intended to be used inside algorithms as
+ * utilities for manipulating aom_codec_* data structures.
+ */
+struct aom_codec_pkt_list {
+  unsigned int cnt;
+  unsigned int max;
+  struct aom_codec_cx_pkt pkts[1];
+};
+
+#define aom_codec_pkt_list_decl(n)     \
+  union {                              \
+    struct aom_codec_pkt_list head;    \
+    struct {                           \
+      struct aom_codec_pkt_list head;  \
+      struct aom_codec_cx_pkt pkts[n]; \
+    } alloc;                           \
+  }
+
+#define aom_codec_pkt_list_init(m) \
+  (m)->alloc.head.cnt = 0,         \
+  (m)->alloc.head.max = sizeof((m)->alloc.pkts) / sizeof((m)->alloc.pkts[0])
+
+int aom_codec_pkt_list_add(struct aom_codec_pkt_list *,
+                           const struct aom_codec_cx_pkt *);
+
+const aom_codec_cx_pkt_t *aom_codec_pkt_list_get(
+    struct aom_codec_pkt_list *list, aom_codec_iter_t *iter);
+
+#include <stdio.h>
+#include <setjmp.h>
+
+struct aom_internal_error_info {
+  aom_codec_err_t error_code;
+  int has_detail;
+  char detail[80];
+  int setjmp;  // Boolean: whether 'jmp' is valid.
+  jmp_buf jmp;
+};
+
+#define CLANG_ANALYZER_NORETURN
+#if defined(__has_feature)
+#if __has_feature(attribute_analyzer_noreturn)
+#undef CLANG_ANALYZER_NORETURN
+#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
+#endif
+#endif
+
+void aom_internal_error(struct aom_internal_error_info *info,
+                        aom_codec_err_t error, const char *fmt,
+                        ...) CLANG_ANALYZER_NORETURN;
+
+void aom_merge_corrupted_flag(int *corrupted, int value);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_INTERNAL_AOM_CODEC_INTERNAL_H_
diff --git a/libs/libaom/src/aom/internal/aom_image_internal.h b/libs/libaom/src/aom/internal/aom_image_internal.h
new file mode 100644
index 000000000..7f2fd1891
--- /dev/null
+++ b/libs/libaom/src/aom/internal/aom_image_internal.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes the internal functions associated with the aom image
+ * descriptor.
+ * + */ +#ifndef AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_ +#define AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_ + +#include "aom/aom_image.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Array of aom_metadata structs for an image. */ +struct aom_metadata_array { + size_t sz; /* Number of metadata structs in the list */ + aom_metadata_t **metadata_array; /* Array of metadata structs */ +}; + +/*!\brief Alloc memory for aom_metadata_array struct. + * + * Allocate memory for aom_metadata_array struct. + * If sz is 0 the aom_metadata_array structs internal buffer list will be NULL, + * but the aom_metadata_array struct itself will still be allocated. + * Returns a pointer to the allocated struct or NULL on failure. + * + * \param[in] sz Size of internal metadata list buffer + */ +aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz); + +/*!\brief Free metadata array struct. + * + * Free metadata array struct and all metadata structs inside. + * + * \param[in] arr Metadata array struct pointer + */ +void aom_img_metadata_array_free(aom_metadata_array_t *arr); + +typedef void *(*aom_alloc_img_data_cb_fn_t)(void *priv, size_t size); + +/*!\brief Open a descriptor, allocating storage for the underlying image by + * using the provided callback function. + * + * Returns a descriptor for storing an image of the given format. The storage + * for the image is allocated by using the provided callback function. Unlike + * aom_img_alloc(), the returned descriptor does not own the storage for the + * image. The caller is responsible for freeing the storage for the image. + * + * Note: If the callback function is invoked and succeeds, + * aom_img_alloc_with_cb() is guaranteed to succeed. Therefore, if + * aom_img_alloc_with_cb() fails, the caller is assured that no storage was + * allocated. + * + * \param[in] img Pointer to storage for descriptor. If this parameter + * is NULL, the storage for the descriptor will be + * allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] align Alignment, in bytes, of the image buffer and + * each row in the image (stride). + * \param[in] alloc_cb Callback function used to allocate storage for the + * image. + * \param[in] cb_priv The first argument ('priv') for the callback + * function. + * + * \return Returns a pointer to the initialized image descriptor. If the img + * parameter is non-null, the value of the img parameter will be + * returned. + */ +aom_image_t *aom_img_alloc_with_cb(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align, + aom_alloc_img_data_cb_fn_t alloc_cb, + void *cb_priv); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_INTERNAL_AOM_IMAGE_INTERNAL_H_ diff --git a/libs/libaom/src/aom/src/aom_codec.c b/libs/libaom/src/aom/src/aom_codec.c new file mode 100644 index 000000000..196ab8354 --- /dev/null +++ b/libs/libaom/src/aom/src/aom_codec.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <stdarg.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "aom/aom_integer.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
+
+int aom_codec_version(void) { return VERSION_PACKED; }
+
+const char *aom_codec_version_str(void) { return VERSION_STRING_NOSP; }
+
+const char *aom_codec_version_extra_str(void) { return VERSION_EXTRA; }
+
+const char *aom_codec_iface_name(aom_codec_iface_t *iface) {
+  return iface ? iface->name : "<invalid interface>";
+}
+
+const char *aom_codec_err_to_string(aom_codec_err_t err) {
+  switch (err) {
+    case AOM_CODEC_OK: return "Success";
+    case AOM_CODEC_ERROR: return "Unspecified internal error";
+    case AOM_CODEC_MEM_ERROR: return "Memory allocation error";
+    case AOM_CODEC_ABI_MISMATCH: return "ABI version mismatch";
+    case AOM_CODEC_INCAPABLE:
+      return "Codec does not implement requested capability";
+    case AOM_CODEC_UNSUP_BITSTREAM:
+      return "Bitstream not supported by this decoder";
+    case AOM_CODEC_UNSUP_FEATURE:
+      return "Bitstream required feature not supported by this decoder";
+    case AOM_CODEC_CORRUPT_FRAME: return "Corrupt frame detected";
+    case AOM_CODEC_INVALID_PARAM: return "Invalid parameter";
+    case AOM_CODEC_LIST_END: return "End of iterated list";
+  }
+
+  return "Unrecognized error code";
+}
+
+const char *aom_codec_error(aom_codec_ctx_t *ctx) {
+  return (ctx) ? aom_codec_err_to_string(ctx->err)
+               : aom_codec_err_to_string(AOM_CODEC_INVALID_PARAM);
+}
+
+const char *aom_codec_error_detail(aom_codec_ctx_t *ctx) {
+  if (ctx && ctx->err)
+    return ctx->priv ? ctx->priv->err_detail : ctx->err_detail;
+
+  return NULL;
+}
+
+aom_codec_err_t aom_codec_destroy(aom_codec_ctx_t *ctx) {
+  aom_codec_err_t res;
+
+  if (!ctx)
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (!ctx->iface || !ctx->priv)
+    res = AOM_CODEC_ERROR;
+  else {
+    ctx->iface->destroy((aom_codec_alg_priv_t *)ctx->priv);
+
+    ctx->iface = NULL;
+    ctx->name = NULL;
+    ctx->priv = NULL;
+    res = AOM_CODEC_OK;
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_caps_t aom_codec_get_caps(aom_codec_iface_t *iface) {
+  return (iface) ? iface->caps : 0;
+}
+
+aom_codec_err_t aom_codec_control(aom_codec_ctx_t *ctx, int ctrl_id, ...) {
+  aom_codec_err_t res;
+
+  if (!ctx || !ctrl_id)
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (!ctx->iface || !ctx->priv || !ctx->iface->ctrl_maps)
+    res = AOM_CODEC_ERROR;
+  else {
+    aom_codec_ctrl_fn_map_t *entry;
+
+    res = AOM_CODEC_ERROR;
+
+    for (entry = ctx->iface->ctrl_maps; entry && entry->fn; entry++) {
+      if (!entry->ctrl_id || entry->ctrl_id == ctrl_id) {
+        va_list ap;
+
+        va_start(ap, ctrl_id);
+        res = entry->fn((aom_codec_alg_priv_t *)ctx->priv, ap);
+        va_end(ap);
+        break;
+      }
+    }
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+void aom_internal_error(struct aom_internal_error_info *info,
+                        aom_codec_err_t error, const char *fmt, ...)
{
+  va_list ap;
+
+  info->error_code = error;
+  info->has_detail = 0;
+
+  if (fmt) {
+    size_t sz = sizeof(info->detail);
+
+    info->has_detail = 1;
+    va_start(ap, fmt);
+    vsnprintf(info->detail, sz - 1, fmt, ap);
+    va_end(ap);
+    info->detail[sz - 1] = '\0';
+  }
+
+  if (info->setjmp) longjmp(info->jmp, info->error_code);
+}
+
+void aom_merge_corrupted_flag(int *corrupted, int value) {
+  *corrupted |= value;
+}
+
+const char *aom_obu_type_to_string(OBU_TYPE type) {
+  switch (type) {
+    case OBU_SEQUENCE_HEADER: return "OBU_SEQUENCE_HEADER";
+    case OBU_TEMPORAL_DELIMITER: return "OBU_TEMPORAL_DELIMITER";
+    case OBU_FRAME_HEADER: return "OBU_FRAME_HEADER";
+    case OBU_REDUNDANT_FRAME_HEADER: return "OBU_REDUNDANT_FRAME_HEADER";
+    case OBU_FRAME: return "OBU_FRAME";
+    case OBU_TILE_GROUP: return "OBU_TILE_GROUP";
+    case OBU_METADATA: return "OBU_METADATA";
+    case OBU_TILE_LIST: return "OBU_TILE_LIST";
+    case OBU_PADDING: return "OBU_PADDING";
+    default: break;
+  }
+  return "<Invalid OBU Type>";
+}
diff --git a/libs/libaom/src/aom/src/aom_decoder.c b/libs/libaom/src/aom/src/aom_decoder.c
new file mode 100644
index 000000000..49fff2635
--- /dev/null
+++ b/libs/libaom/src/aom/src/aom_decoder.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap decoder algorithms.
+ *
+ */
+#include <string.h>
+#include "aom/internal/aom_codec_internal.h"
+
+#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var)
+
+static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) {
+  return (aom_codec_alg_priv_t *)ctx->priv;
+}
+
+aom_codec_err_t aom_codec_dec_init_ver(aom_codec_ctx_t *ctx,
+                                       aom_codec_iface_t *iface,
+                                       const aom_codec_dec_cfg_t *cfg,
+                                       aom_codec_flags_t flags, int ver) {
+  aom_codec_err_t res;
+
+  if (ver != AOM_DECODER_ABI_VERSION)
+    res = AOM_CODEC_ABI_MISMATCH;
+  else if (!ctx || !iface)
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION)
+    res = AOM_CODEC_ABI_MISMATCH;
+  else if (!(iface->caps & AOM_CODEC_CAP_DECODER))
+    res = AOM_CODEC_INCAPABLE;
+  else {
+    memset(ctx, 0, sizeof(*ctx));
+    ctx->iface = iface;
+    ctx->name = iface->name;
+    ctx->priv = NULL;
+    ctx->init_flags = flags;
+    ctx->config.dec = cfg;
+
+    res = ctx->iface->init(ctx);
+    if (res) {
+      ctx->err_detail = ctx->priv ?
ctx->priv->err_detail : NULL;
+      aom_codec_destroy(ctx);
+    }
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_err_t aom_codec_peek_stream_info(aom_codec_iface_t *iface,
+                                           const uint8_t *data, size_t data_sz,
+                                           aom_codec_stream_info_t *si) {
+  aom_codec_err_t res;
+
+  if (!iface || !data || !data_sz || !si) {
+    res = AOM_CODEC_INVALID_PARAM;
+  } else {
+    /* Set default/unknown values */
+    si->w = 0;
+    si->h = 0;
+
+    res = iface->dec.peek_si(data, data_sz, si);
+  }
+
+  return res;
+}
+
+aom_codec_err_t aom_codec_get_stream_info(aom_codec_ctx_t *ctx,
+                                          aom_codec_stream_info_t *si) {
+  aom_codec_err_t res;
+
+  if (!ctx || !si) {
+    res = AOM_CODEC_INVALID_PARAM;
+  } else if (!ctx->iface || !ctx->priv) {
+    res = AOM_CODEC_ERROR;
+  } else {
+    /* Set default/unknown values */
+    si->w = 0;
+    si->h = 0;
+
+    res = ctx->iface->dec.get_si(get_alg_priv(ctx), si);
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+aom_codec_err_t aom_codec_decode(aom_codec_ctx_t *ctx, const uint8_t *data,
+                                 size_t data_sz, void *user_priv) {
+  aom_codec_err_t res;
+
+  if (!ctx)
+    res = AOM_CODEC_INVALID_PARAM;
+  else if (!ctx->iface || !ctx->priv)
+    res = AOM_CODEC_ERROR;
+  else {
+    res = ctx->iface->dec.decode(get_alg_priv(ctx), data, data_sz, user_priv);
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
+
+aom_image_t *aom_codec_get_frame(aom_codec_ctx_t *ctx, aom_codec_iter_t *iter) {
+  aom_image_t *img;
+
+  if (!ctx || !iter || !ctx->iface || !ctx->priv)
+    img = NULL;
+  else
+    img = ctx->iface->dec.get_frame(get_alg_priv(ctx), iter);
+
+  return img;
+}
+
+aom_codec_err_t aom_codec_set_frame_buffer_functions(
+    aom_codec_ctx_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  aom_codec_err_t res;
+
+  if (!ctx || !cb_get || !cb_release) {
+    res = AOM_CODEC_INVALID_PARAM;
+  } else if (!ctx->iface || !ctx->priv) {
+    res = AOM_CODEC_ERROR;
+  } else if (!(ctx->iface->caps & AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+    res = AOM_CODEC_INCAPABLE;
+  } else {
+    res = ctx->iface->dec.set_fb_fn(get_alg_priv(ctx), cb_get, cb_release,
+                                    cb_priv);
+  }
+
+  return SAVE_STATUS(ctx, res);
+}
diff --git a/libs/libaom/src/aom/src/aom_encoder.c b/libs/libaom/src/aom/src/aom_encoder.c
new file mode 100644
index 000000000..bb51c9388
--- /dev/null
+++ b/libs/libaom/src/aom/src/aom_encoder.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Provides the high level interface to wrap encoder algorithms.
+ *
+ */
+#include "config/aom_config.h"
+
+#if HAVE_FEXCEPT
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <fenv.h>
+#endif
+
+#include <limits.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#define SAVE_STATUS(ctx, var) (ctx ?
(ctx->err = var) : var) + +static aom_codec_alg_priv_t *get_alg_priv(aom_codec_ctx_t *ctx) { + return (aom_codec_alg_priv_t *)ctx->priv; +} + +aom_codec_err_t aom_codec_enc_init_ver(aom_codec_ctx_t *ctx, + aom_codec_iface_t *iface, + const aom_codec_enc_cfg_t *cfg, + aom_codec_flags_t flags, int ver) { + aom_codec_err_t res; + + if (ver != AOM_ENCODER_ABI_VERSION) + res = AOM_CODEC_ABI_MISMATCH; + else if (!ctx || !iface || !cfg) + res = AOM_CODEC_INVALID_PARAM; + else if (iface->abi_version != AOM_CODEC_INTERNAL_ABI_VERSION) + res = AOM_CODEC_ABI_MISMATCH; + else if (!(iface->caps & AOM_CODEC_CAP_ENCODER)) + res = AOM_CODEC_INCAPABLE; + else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR)) + res = AOM_CODEC_INCAPABLE; + else { + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx); + + if (res) { + ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; + aom_codec_destroy(ctx); + } + } + + return SAVE_STATUS(ctx, res); +} + +aom_codec_err_t aom_codec_enc_config_default(aom_codec_iface_t *iface, + aom_codec_enc_cfg_t *cfg, + unsigned int usage) { + aom_codec_err_t res; + int i; + + if (!iface || !cfg) + res = AOM_CODEC_INVALID_PARAM; + else if (!(iface->caps & AOM_CODEC_CAP_ENCODER)) + res = AOM_CODEC_INCAPABLE; + else { + res = AOM_CODEC_INVALID_PARAM; + + for (i = 0; i < iface->enc.cfg_count; ++i) { + if (iface->enc.cfgs[i].g_usage == usage) { + *cfg = iface->enc.cfgs[i]; + res = AOM_CODEC_OK; + break; + } + } + } + /* default values */ + if (cfg) { + memset(&cfg->encoder_cfg, 0, sizeof(cfg->encoder_cfg)); + cfg->encoder_cfg.super_block_size = 0; // Dynamic + cfg->encoder_cfg.max_partition_size = 128; + cfg->encoder_cfg.min_partition_size = 4; + cfg->encoder_cfg.disable_trellis_quant = 3; + } + return res; +} + +#if ARCH_X86 || ARCH_X86_64 +/* On X86, disable the x87 unit's internal 80 bit precision for better + * consistency with the SSE unit's 64 bit precision. 
+ */ +#include "aom_ports/x86.h" +#define FLOATING_POINT_SET_PRECISION \ + unsigned short x87_orig_mode = x87_set_double_precision(); +#define FLOATING_POINT_RESTORE_PRECISION x87_set_control_word(x87_orig_mode); +#else +#define FLOATING_POINT_SET_PRECISION +#define FLOATING_POINT_RESTORE_PRECISION +#endif // ARCH_X86 || ARCH_X86_64 + +#if HAVE_FEXCEPT && CONFIG_DEBUG +#define FLOATING_POINT_SET_EXCEPTIONS \ + const int float_excepts = \ + feenableexcept(FE_DIVBYZERO | FE_UNDERFLOW | FE_OVERFLOW); +#define FLOATING_POINT_RESTORE_EXCEPTIONS \ + fedisableexcept(FE_ALL_EXCEPT); \ + feenableexcept(float_excepts); +#else +#define FLOATING_POINT_SET_EXCEPTIONS +#define FLOATING_POINT_RESTORE_EXCEPTIONS +#endif // HAVE_FEXCEPT && CONFIG_DEBUG + +/* clang-format off */ +#define FLOATING_POINT_INIT \ + do { \ + FLOATING_POINT_SET_PRECISION \ + FLOATING_POINT_SET_EXCEPTIONS + +#define FLOATING_POINT_RESTORE \ + FLOATING_POINT_RESTORE_EXCEPTIONS \ + FLOATING_POINT_RESTORE_PRECISION \ + } while (0); +/* clang-format on */ + +aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img, + aom_codec_pts_t pts, unsigned long duration, + aom_enc_frame_flags_t flags) { + aom_codec_err_t res = AOM_CODEC_OK; + + if (!ctx || (img && !duration)) + res = AOM_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + res = AOM_CODEC_ERROR; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + res = AOM_CODEC_INCAPABLE; + else { + /* Execute in a normalized floating point environment, if the platform + * requires it. + */ + FLOATING_POINT_INIT + res = ctx->iface->enc.encode(get_alg_priv(ctx), img, pts, duration, flags); + FLOATING_POINT_RESTORE + } + + return SAVE_STATUS(ctx, res); +} + +const aom_codec_cx_pkt_t *aom_codec_get_cx_data(aom_codec_ctx_t *ctx, + aom_codec_iter_t *iter) { + const aom_codec_cx_pkt_t *pkt = NULL; + + if (ctx) { + if (!iter) + ctx->err = AOM_CODEC_INVALID_PARAM; + else if (!ctx->iface || !ctx->priv) + ctx->err = AOM_CODEC_ERROR; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + ctx->err = AOM_CODEC_INCAPABLE; + else + pkt = ctx->iface->enc.get_cx_data(get_alg_priv(ctx), iter); + } + + if (pkt && pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + // If the application has specified a destination area for the + // compressed data, and the codec has not placed the data there, + // and it fits, copy it. 
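+    // (Layout note: after the copy below, the packet points at dst_buf and
+    // its reported size covers [pad_before bytes][frame data][pad_after
+    // bytes].)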
+ aom_codec_priv_t *const priv = ctx->priv; + char *const dst_buf = (char *)priv->enc.cx_data_dst_buf.buf; + + if (dst_buf && pkt->data.raw.buf != dst_buf && + pkt->data.raw.sz + priv->enc.cx_data_pad_before + + priv->enc.cx_data_pad_after <= + priv->enc.cx_data_dst_buf.sz) { + aom_codec_cx_pkt_t *modified_pkt = &priv->enc.cx_data_pkt; + + memcpy(dst_buf + priv->enc.cx_data_pad_before, pkt->data.raw.buf, + pkt->data.raw.sz); + *modified_pkt = *pkt; + modified_pkt->data.raw.buf = dst_buf; + modified_pkt->data.raw.sz += + priv->enc.cx_data_pad_before + priv->enc.cx_data_pad_after; + pkt = modified_pkt; + } + + if (dst_buf == pkt->data.raw.buf) { + priv->enc.cx_data_dst_buf.buf = dst_buf + pkt->data.raw.sz; + priv->enc.cx_data_dst_buf.sz -= pkt->data.raw.sz; + } + } + + return pkt; +} + +aom_codec_err_t aom_codec_set_cx_data_buf(aom_codec_ctx_t *ctx, + const aom_fixed_buf_t *buf, + unsigned int pad_before, + unsigned int pad_after) { + if (!ctx || !ctx->priv) return AOM_CODEC_INVALID_PARAM; + + if (buf) { + ctx->priv->enc.cx_data_dst_buf = *buf; + ctx->priv->enc.cx_data_pad_before = pad_before; + ctx->priv->enc.cx_data_pad_after = pad_after; + } else { + ctx->priv->enc.cx_data_dst_buf.buf = NULL; + ctx->priv->enc.cx_data_dst_buf.sz = 0; + ctx->priv->enc.cx_data_pad_before = 0; + ctx->priv->enc.cx_data_pad_after = 0; + } + + return AOM_CODEC_OK; +} + +const aom_image_t *aom_codec_get_preview_frame(aom_codec_ctx_t *ctx) { + aom_image_t *img = NULL; + + if (ctx) { + if (!ctx->iface || !ctx->priv) + ctx->err = AOM_CODEC_ERROR; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + ctx->err = AOM_CODEC_INCAPABLE; + else if (!ctx->iface->enc.get_preview) + ctx->err = AOM_CODEC_INCAPABLE; + else + img = ctx->iface->enc.get_preview(get_alg_priv(ctx)); + } + + return img; +} + +aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx) { + aom_fixed_buf_t *buf = NULL; + + if (ctx) { + if (!ctx->iface || !ctx->priv) + ctx->err = AOM_CODEC_ERROR; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + ctx->err = AOM_CODEC_INCAPABLE; + else if (!ctx->iface->enc.get_glob_hdrs) + ctx->err = AOM_CODEC_INCAPABLE; + else + buf = ctx->iface->enc.get_glob_hdrs(get_alg_priv(ctx)); + } + + return buf; +} + +aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx, + const aom_codec_enc_cfg_t *cfg) { + aom_codec_err_t res; + + if (!ctx || !ctx->iface || !ctx->priv || !cfg) + res = AOM_CODEC_INVALID_PARAM; + else if (!(ctx->iface->caps & AOM_CODEC_CAP_ENCODER)) + res = AOM_CODEC_INCAPABLE; + else + res = ctx->iface->enc.cfg_set(get_alg_priv(ctx), cfg); + + return SAVE_STATUS(ctx, res); +} + +int aom_codec_pkt_list_add(struct aom_codec_pkt_list *list, + const struct aom_codec_cx_pkt *pkt) { + if (list->cnt < list->max) { + list->pkts[list->cnt++] = *pkt; + return 0; + } + + return 1; +} + +const aom_codec_cx_pkt_t *aom_codec_pkt_list_get( + struct aom_codec_pkt_list *list, aom_codec_iter_t *iter) { + const aom_codec_cx_pkt_t *pkt; + + if (!(*iter)) { + *iter = list->pkts; + } + + pkt = (const aom_codec_cx_pkt_t *)*iter; + + if ((size_t)(pkt - list->pkts) < list->cnt) + *iter = pkt + 1; + else + pkt = NULL; + + return pkt; +} diff --git a/libs/libaom/src/aom/src/aom_image.c b/libs/libaom/src/aom/src/aom_image.c new file mode 100644 index 000000000..cd0b5ed83 --- /dev/null +++ b/libs/libaom/src/aom/src/aom_image.c @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom/internal/aom_image_internal.h"
+#include "aom_mem/aom_mem.h"
+
+static INLINE unsigned int align_image_dimension(unsigned int d,
+                                                 unsigned int subsampling,
+                                                 unsigned int size_align) {
+  unsigned int align;
+
+  align = (1 << subsampling) - 1;
+  align = (size_align - 1 > align) ? (size_align - 1) : align;
+  return ((d + align) & ~align);
+}
+
+static aom_image_t *img_alloc_helper(
+    aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, unsigned int d_h,
+    unsigned int buf_align, unsigned int stride_align, unsigned int size_align,
+    unsigned int border, unsigned char *img_data,
+    aom_alloc_img_data_cb_fn_t alloc_cb, void *cb_priv) {
+  /* NOTE: In this function, bit_depth is either 8 or 16 (if
+   * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12.
+   */
+  unsigned int h, w, s, xcs, ycs, bps, bit_depth;
+  unsigned int stride_in_bytes;
+
+  /* Treat align==0 like align==1 */
+  if (!buf_align) buf_align = 1;
+
+  /* Validate alignment (must be power of 2) */
+  if (buf_align & (buf_align - 1)) goto fail;
+
+  /* Treat align==0 like align==1 */
+  if (!stride_align) stride_align = 1;
+
+  /* Validate alignment (must be power of 2) */
+  if (stride_align & (stride_align - 1)) goto fail;
+
+  /* Treat align==0 like align==1 */
+  if (!size_align) size_align = 1;
+
+  /* Validate alignment (must be power of 2) */
+  if (size_align & (size_align - 1)) goto fail;
+
+  /* Get sample size for this format */
+  switch (fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_AOMYV12: bps = 12; break;
+    case AOM_IMG_FMT_I422: bps = 16; break;
+    case AOM_IMG_FMT_I444: bps = 24; break;
+    case AOM_IMG_FMT_YV1216:
+    case AOM_IMG_FMT_I42016: bps = 24; break;
+    case AOM_IMG_FMT_I42216: bps = 32; break;
+    case AOM_IMG_FMT_I44416: bps = 48; break;
+    default: bps = 16; break;
+  }
+
+  bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
+
+  /* Get chroma shift values for this format */
+  switch (fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_AOMYV12:
+    case AOM_IMG_FMT_I422:
+    case AOM_IMG_FMT_I42016:
+    case AOM_IMG_FMT_YV1216:
+    case AOM_IMG_FMT_I42216: xcs = 1; break;
+    default: xcs = 0; break;
+  }
+
+  switch (fmt) {
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_AOMI420:
+    case AOM_IMG_FMT_AOMYV12:
+    case AOM_IMG_FMT_YV1216:
+    case AOM_IMG_FMT_I42016: ycs = 1; break;
+    default: ycs = 0; break;
+  }
+
+  /* Calculate storage sizes given the chroma subsampling */
+  w = align_image_dimension(d_w, xcs, size_align);
+  h = align_image_dimension(d_h, ycs, size_align);
+
+  s = (fmt & AOM_IMG_FMT_PLANAR) ?
w : bps * w / bit_depth; + s = (s + 2 * border + stride_align - 1) & ~(stride_align - 1); + stride_in_bytes = s * bit_depth / 8; + + /* Allocate the new image */ + if (!img) { + img = (aom_image_t *)calloc(1, sizeof(aom_image_t)); + + if (!img) goto fail; + + img->self_allocd = 1; + } else { + memset(img, 0, sizeof(aom_image_t)); + } + + img->img_data = img_data; + + if (!img_data) { + const uint64_t alloc_size = + (fmt & AOM_IMG_FMT_PLANAR) + ? (uint64_t)(h + 2 * border) * stride_in_bytes * bps / bit_depth + : (uint64_t)(h + 2 * border) * stride_in_bytes; + + if (alloc_size != (size_t)alloc_size) goto fail; + + if (alloc_cb) { + const size_t padded_alloc_size = (size_t)alloc_size + buf_align - 1; + img->img_data = (uint8_t *)alloc_cb(cb_priv, padded_alloc_size); + if (img->img_data) { + img->img_data = (uint8_t *)aom_align_addr(img->img_data, buf_align); + } + img->img_data_owner = 0; + } else { + img->img_data = (uint8_t *)aom_memalign(buf_align, (size_t)alloc_size); + img->img_data_owner = 1; + } + img->sz = (size_t)alloc_size; + } + + if (!img->img_data) goto fail; + + img->fmt = fmt; + img->bit_depth = bit_depth; + // aligned width and aligned height + img->w = w; + img->h = h; + img->x_chroma_shift = xcs; + img->y_chroma_shift = ycs; + img->bps = bps; + + /* Calculate strides */ + img->stride[AOM_PLANE_Y] = stride_in_bytes; + img->stride[AOM_PLANE_U] = img->stride[AOM_PLANE_V] = stride_in_bytes >> xcs; + + /* Default viewport to entire image. (This aom_img_set_rect call always + * succeeds.) */ + aom_img_set_rect(img, 0, 0, d_w, d_h, border); + return img; + +fail: + aom_img_free(img); + return NULL; +} + +aom_image_t *aom_img_alloc(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align) { + return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL, NULL, + NULL); +} + +aom_image_t *aom_img_alloc_with_cb(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align, + aom_alloc_img_data_cb_fn_t alloc_cb, + void *cb_priv) { + return img_alloc_helper(img, fmt, d_w, d_h, align, align, 1, 0, NULL, + alloc_cb, cb_priv); +} + +aom_image_t *aom_img_wrap(aom_image_t *img, aom_img_fmt_t fmt, unsigned int d_w, + unsigned int d_h, unsigned int stride_align, + unsigned char *img_data) { + /* Set buf_align = 1. It is ignored by img_alloc_helper because img_data is + * not NULL. */ + return img_alloc_helper(img, fmt, d_w, d_h, 1, stride_align, 1, 0, img_data, + NULL, NULL); +} + +aom_image_t *aom_img_alloc_with_border(aom_image_t *img, aom_img_fmt_t fmt, + unsigned int d_w, unsigned int d_h, + unsigned int align, + unsigned int size_align, + unsigned int border) { + return img_alloc_helper(img, fmt, d_w, d_h, align, align, size_align, border, + NULL, NULL, NULL); +} + +int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y, + unsigned int w, unsigned int h, unsigned int border) { + unsigned char *data; + + if (x + w <= img->w && y + h <= img->h) { + img->d_w = w; + img->d_h = h; + + x += border; + y += border; + + /* Calculate plane pointers */ + if (!(img->fmt & AOM_IMG_FMT_PLANAR)) { + img->planes[AOM_PLANE_PACKED] = + img->img_data + x * img->bps / 8 + y * img->stride[AOM_PLANE_PACKED]; + } else { + const int bytes_per_sample = + (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; + data = img->img_data; + + img->planes[AOM_PLANE_Y] = + data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y]; + data += (img->h + 2 * border) * img->stride[AOM_PLANE_Y]; + + unsigned int uv_border_h = border >> img->y_chroma_shift; + unsigned int uv_x = x >> img->x_chroma_shift; + unsigned int uv_y = y >> img->y_chroma_shift; + if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) { + img->planes[AOM_PLANE_U] = + data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; + data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + img->stride[AOM_PLANE_U]; + img->planes[AOM_PLANE_V] = + data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; + } else { + img->planes[AOM_PLANE_V] = + data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; + data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + img->stride[AOM_PLANE_V]; + img->planes[AOM_PLANE_U] = + data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; + } + } + return 0; + } + return -1; +} + +void aom_img_flip(aom_image_t *img) { + /* Note: In the calculation pointer adjustment calculation, we want the + * rhs to be promoted to a signed type. Section 6.3.1.8 of the ISO C99 + * standard indicates that if the adjustment parameter is unsigned, the + * stride parameter will be promoted to unsigned, causing errors when + * the lhs is a larger type than the rhs. + */ + img->planes[AOM_PLANE_Y] += (signed)(img->d_h - 1) * img->stride[AOM_PLANE_Y]; + img->stride[AOM_PLANE_Y] = -img->stride[AOM_PLANE_Y]; + + img->planes[AOM_PLANE_U] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * + img->stride[AOM_PLANE_U]; + img->stride[AOM_PLANE_U] = -img->stride[AOM_PLANE_U]; + + img->planes[AOM_PLANE_V] += (signed)((img->d_h >> img->y_chroma_shift) - 1) * + img->stride[AOM_PLANE_V]; + img->stride[AOM_PLANE_V] = -img->stride[AOM_PLANE_V]; +} + +void aom_img_free(aom_image_t *img) { + if (img) { + aom_img_remove_metadata(img); + if (img->img_data && img->img_data_owner) aom_free(img->img_data); + + if (img->self_allocd) free(img); + } +} + +int aom_img_plane_width(const aom_image_t *img, int plane) { + if (plane > 0 && img->x_chroma_shift > 0) + return (img->d_w + 1) >> img->x_chroma_shift; + else + return img->d_w; +} + +int aom_img_plane_height(const aom_image_t *img, int plane) { + if (plane > 0 && img->y_chroma_shift > 0) + return (img->d_h + 1) >> img->y_chroma_shift; + else + return img->d_h; +} + +aom_metadata_t *aom_img_metadata_alloc( + uint32_t type, const uint8_t *data, size_t sz, + aom_metadata_insert_flags_t insert_flag) { + if (!data || sz == 0) return NULL; + aom_metadata_t *metadata = (aom_metadata_t *)malloc(sizeof(aom_metadata_t)); + if (!metadata) return NULL; + metadata->type = type; + metadata->payload = (uint8_t *)malloc(sz); + if (!metadata->payload) { + free(metadata); + return NULL; + } + memcpy(metadata->payload, data, sz); + metadata->sz = sz; + metadata->insert_flag = insert_flag; + return metadata; +} + +void aom_img_metadata_free(aom_metadata_t *metadata) { + if (metadata) { + if (metadata->payload) free(metadata->payload); + free(metadata); + } +} + +aom_metadata_array_t *aom_img_metadata_array_alloc(size_t sz) { + aom_metadata_array_t *arr = + (aom_metadata_array_t *)calloc(1, sizeof(aom_metadata_array_t)); + if (!arr) return NULL; + if (sz > 0) { + arr->metadata_array = + (aom_metadata_t **)calloc(sz, sizeof(aom_metadata_t *)); + if (!arr->metadata_array) { + aom_img_metadata_array_free(arr); + return NULL; + } + arr->sz = sz; + } + return arr; +} + +void 
aom_img_metadata_array_free(aom_metadata_array_t *arr) {
+  if (arr) {
+    if (arr->metadata_array) {
+      for (size_t i = 0; i < arr->sz; i++) {
+        aom_img_metadata_free(arr->metadata_array[i]);
+      }
+      free(arr->metadata_array);
+    }
+    free(arr);
+  }
+}
+
+int aom_img_add_metadata(aom_image_t *img, uint32_t type, const uint8_t *data,
+                         size_t sz, aom_metadata_insert_flags_t insert_flag) {
+  if (!img) return -1;
+  if (!img->metadata) {
+    img->metadata = aom_img_metadata_array_alloc(0);
+    if (!img->metadata) return -1;
+  }
+  aom_metadata_t *metadata =
+      aom_img_metadata_alloc(type, data, sz, insert_flag);
+  if (!metadata) goto fail;
+  if (!img->metadata->metadata_array) {
+    img->metadata->metadata_array =
+        (aom_metadata_t **)calloc(1, sizeof(metadata));
+    if (!img->metadata->metadata_array || img->metadata->sz != 0) {
+      aom_img_metadata_free(metadata);
+      goto fail;
+    }
+  } else {
+    // Grow through a temporary so the existing array is neither leaked nor
+    // dereferenced through a NULL pointer if realloc fails.
+    aom_metadata_t **new_array =
+        (aom_metadata_t **)realloc(img->metadata->metadata_array,
+                                   (img->metadata->sz + 1) * sizeof(metadata));
+    if (!new_array) {
+      aom_img_metadata_free(metadata);
+      goto fail;
+    }
+    img->metadata->metadata_array = new_array;
+  }
+  img->metadata->metadata_array[img->metadata->sz] = metadata;
+  img->metadata->sz++;
+  return 0;
+fail:
+  aom_img_metadata_array_free(img->metadata);
+  img->metadata = NULL;
+  return -1;
+}
+
+void aom_img_remove_metadata(aom_image_t *img) {
+  if (img && img->metadata) {
+    aom_img_metadata_array_free(img->metadata);
+    img->metadata = NULL;
+  }
+}
+
+const aom_metadata_t *aom_img_get_metadata(const aom_image_t *img,
+                                           size_t index) {
+  if (!img) return NULL;
+  const aom_metadata_array_t *array = img->metadata;
+  if (array && index < array->sz) {
+    return array->metadata_array[index];
+  }
+  return NULL;
+}
+
+size_t aom_img_num_metadata(const aom_image_t *img) {
+  if (!img || !img->metadata) return 0;
+  return img->metadata->sz;
+}
diff --git a/libs/libaom/src/aom/src/aom_integer.c b/libs/libaom/src/aom/src/aom_integer.c
new file mode 100644
index 000000000..7edfd0de8
--- /dev/null
+++ b/libs/libaom/src/aom/src/aom_integer.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+static const size_t kMaximumLeb128Size = 8;
+static const uint8_t kLeb128ByteMask = 0x7f;  // Binary: 01111111
+
+// Disallow values larger than 32-bits to ensure consistent behavior on 32 and
+// 64 bit targets: value is typically used to determine buffer allocation size
+// when decoded.
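For orientation before the helpers that follow: LEB128 stores a value in 7-bit groups, least-significant group first, with the high bit of each byte flagging continuation. A minimal round-trip sketch (illustrative only, not part of the patch; it uses the declarations from aom/aom_integer.h):

#include <stdint.h>
#include "aom/aom_integer.h"

// Encoding 0x100000 (1 MiB) emits three bytes, least-significant group first:
//   0x80 0x80 0x40  ->  0x00 | (0x00 << 7) | (0x40 << 14) == 0x100000
void leb128_example(void) {
  uint8_t buf[8];
  size_t coded_size = 0;
  aom_uleb_encode(0x100000, sizeof(buf), buf, &coded_size);  // coded_size == 3
  uint64_t value = 0;
  size_t length = 0;
  aom_uleb_decode(buf, coded_size, &value, &length);  // value == 0x100000
}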
+static const uint64_t kMaximumLeb128Value = UINT32_MAX; + +size_t aom_uleb_size_in_bytes(uint64_t value) { + size_t size = 0; + do { + ++size; + } while ((value >>= 7) != 0); + return size; +} + +int aom_uleb_decode(const uint8_t *buffer, size_t available, uint64_t *value, + size_t *length) { + if (buffer && value) { + *value = 0; + for (size_t i = 0; i < kMaximumLeb128Size && i < available; ++i) { + const uint8_t decoded_byte = *(buffer + i) & kLeb128ByteMask; + *value |= ((uint64_t)decoded_byte) << (i * 7); + if ((*(buffer + i) >> 7) == 0) { + if (length) { + *length = i + 1; + } + + // Fail on values larger than 32-bits to ensure consistent behavior on + // 32 and 64 bit targets: value is typically used to determine buffer + // allocation size. + if (*value > UINT32_MAX) return -1; + + return 0; + } + } + } + + // If we get here, either the buffer/value pointers were invalid, + // or we ran over the available space + return -1; +} + +int aom_uleb_encode(uint64_t value, size_t available, uint8_t *coded_value, + size_t *coded_size) { + const size_t leb_size = aom_uleb_size_in_bytes(value); + if (value > kMaximumLeb128Value || leb_size > kMaximumLeb128Size || + leb_size > available || !coded_value || !coded_size) { + return -1; + } + + for (size_t i = 0; i < leb_size; ++i) { + uint8_t byte = value & 0x7f; + value >>= 7; + + if (value != 0) byte |= 0x80; // Signal that more bytes follow. + + *(coded_value + i) = byte; + } + + *coded_size = leb_size; + return 0; +} + +int aom_uleb_encode_fixed_size(uint64_t value, size_t available, + size_t pad_to_size, uint8_t *coded_value, + size_t *coded_size) { + if (value > kMaximumLeb128Value || !coded_value || !coded_size || + available < pad_to_size || pad_to_size > kMaximumLeb128Size) { + return -1; + } + const uint64_t limit = 1ULL << (7 * pad_to_size); + if (value >= limit) { + // Can't encode 'value' within 'pad_to_size' bytes + return -1; + } + + for (size_t i = 0; i < pad_to_size; ++i) { + uint8_t byte = value & 0x7f; + value >>= 7; + + if (i < pad_to_size - 1) byte |= 0x80; // Signal that more bytes follow. + + *(coded_value + i) = byte; + } + + assert(value == 0); + + *coded_size = pad_to_size; + return 0; +} diff --git a/libs/libaom/src/aom_dsp/aom_convolve.c b/libs/libaom/src/aom_dsp/aom_convolve.c new file mode 100644 index 000000000..7879b88f6 --- /dev/null +++ b/libs/libaom/src/aom_dsp/aom_convolve.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+  return sum;
+}
+
+static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
+                                      const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+  return sum;
+}
+
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *x_filters, int x0_q4,
+                           int x_step_q4, int w, int h) {
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (int y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (int x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      const int sum = horz_scalar_product(src_x, x_filter);
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *y_filters, int y0_q4,
+                          int y_step_q4, int w, int h) {
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (int x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (int y = 0; y < h; ++y) {
+      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
+      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: This assumes that the filter table is 256-byte aligned.
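+  // Descriptive comment (added for clarity, not upstream): the table holds
+  // SUBPEL_SHIFTS == 16 kernels of SUBPEL_TAPS == 8 int16_t taps each, i.e.
+  // 16 * 8 * sizeof(int16_t) == 256 bytes, so masking off the low eight
+  // address bits of any kernel pointer inside the table recovers its base.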
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + w, h); +} + +void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, + w, h); +} + +void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int filter_x_stride, const int16_t *filter_y, + int filter_y_stride, int w, int h) { + int r; + + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE int highbd_vert_scalar_product(const uint16_t *a, + ptrdiff_t a_stride, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; + return sum; +} + +static INLINE int highbd_horz_scalar_product(const uint16_t *a, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; +} + +static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int sum = highbd_horz_scalar_product(src_x, x_filter); + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter); + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +void aom_highbd_convolve8_horiz_c(const 
uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, bd); +} + +void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + int r; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w * sizeof(uint16_t)); + src += src_stride; + dst += dst_stride; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/aom_dsp.cmake b/libs/libaom/src/aom_dsp/aom_dsp.cmake new file mode 100644 index 000000000..f1b61f010 --- /dev/null +++ b/libs/libaom/src/aom_dsp/aom_dsp.cmake @@ -0,0 +1,422 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
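The first thing in this file is CMake's include-once guard; a generic sketch of the idiom (guard variable name hypothetical):

# Pattern: guard a .cmake module against repeated inclusion.
if(MY_MODULE_CMAKE_)
  return()
endif()
set(MY_MODULE_CMAKE_ 1)

The real guard below uses the variable AOM_AOM_DSP_AOM_DSP_CMAKE_.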
+# +if(AOM_AOM_DSP_AOM_DSP_CMAKE_) + return() +endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_ +set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1) + +list(APPEND AOM_DSP_COMMON_SOURCES + "${AOM_ROOT}/aom_dsp/aom_convolve.c" + "${AOM_ROOT}/aom_dsp/aom_dsp_common.h" + "${AOM_ROOT}/aom_dsp/aom_filter.h" + "${AOM_ROOT}/aom_dsp/aom_simd.h" + "${AOM_ROOT}/aom_dsp/aom_simd_inline.h" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.h" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" + "${AOM_ROOT}/aom_dsp/blend.h" + "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c" + "${AOM_ROOT}/aom_dsp/blend_a64_mask.c" + "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c" + "${AOM_ROOT}/aom_dsp/entcode.c" + "${AOM_ROOT}/aom_dsp/entcode.h" + "${AOM_ROOT}/aom_dsp/fft.c" + "${AOM_ROOT}/aom_dsp/fft_common.h" + "${AOM_ROOT}/aom_dsp/intrapred.c" + "${AOM_ROOT}/aom_dsp/intrapred_common.h" + "${AOM_ROOT}/aom_dsp/loopfilter.c" + "${AOM_ROOT}/aom_dsp/prob.h" + "${AOM_ROOT}/aom_dsp/recenter.h" + "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/subtract.c" + "${AOM_ROOT}/aom_dsp/txfm_common.h" + "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h" + "${AOM_ROOT}/aom_dsp/avg.c") + +list(APPEND AOM_DSP_COMMON_ASM_SSE2 + "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_asm_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/intrapred_asm_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm") + +list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" + "${AOM_ROOT}/aom_dsp/x86/convolve.h" + "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h" + "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_sse2.h") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_DSP_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c") +endif() + +list(APPEND AOM_DSP_COMMON_ASM_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm") + +list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_DSP_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c") +endif() + +list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1 + 
"${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c") + +list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/common_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/avg_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_DSP_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c") +endif() + +list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" + "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" + "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c") + +list(APPEND AOM_DSP_COMMON_INTRIN_DSPR2 + "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h") + +list(APPEND AOM_DSP_COMMON_INTRIN_MSA + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h" + "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c" + "${AOM_ROOT}/aom_dsp/mips/macros_msa.h") + +if(CONFIG_AV1_DECODER) + list(APPEND AOM_DSP_DECODER_SOURCES + "${AOM_ROOT}/aom_dsp/binary_codes_reader.c" + "${AOM_ROOT}/aom_dsp/binary_codes_reader.h" + "${AOM_ROOT}/aom_dsp/bitreader.c" + "${AOM_ROOT}/aom_dsp/bitreader.h" "${AOM_ROOT}/aom_dsp/entdec.c" + "${AOM_ROOT}/aom_dsp/entdec.h" + "${AOM_ROOT}/aom_dsp/grain_synthesis.c" + "${AOM_ROOT}/aom_dsp/grain_synthesis.h") +endif() + +if(CONFIG_AV1_ENCODER) + list(APPEND AOM_DSP_ENCODER_SOURCES + "${AOM_ROOT}/aom_dsp/binary_codes_writer.c" + "${AOM_ROOT}/aom_dsp/binary_codes_writer.h" + "${AOM_ROOT}/aom_dsp/bitwriter.c" + "${AOM_ROOT}/aom_dsp/bitwriter.h" + "${AOM_ROOT}/aom_dsp/blk_sse_sum.c" + "${AOM_ROOT}/aom_dsp/entenc.c" + "${AOM_ROOT}/aom_dsp/entenc.h" + "${AOM_ROOT}/aom_dsp/fwd_txfm.c" + "${AOM_ROOT}/aom_dsp/grain_table.c" + "${AOM_ROOT}/aom_dsp/grain_table.h" + "${AOM_ROOT}/aom_dsp/noise_model.c" + "${AOM_ROOT}/aom_dsp/noise_model.h" + "${AOM_ROOT}/aom_dsp/noise_util.c" + "${AOM_ROOT}/aom_dsp/noise_util.h" + "${AOM_ROOT}/aom_dsp/psnr.c" + "${AOM_ROOT}/aom_dsp/psnr.h" + "${AOM_ROOT}/aom_dsp/quantize.c" + "${AOM_ROOT}/aom_dsp/quantize.h" + "${AOM_ROOT}/aom_dsp/sad.c" + "${AOM_ROOT}/aom_dsp/sse.c" + "${AOM_ROOT}/aom_dsp/sad_av1.c" + "${AOM_ROOT}/aom_dsp/sum_squares.c" + "${AOM_ROOT}/aom_dsp/variance.c" + 
"${AOM_ROOT}/aom_dsp/variance.h") + + list(APPEND AOM_DSP_ENCODER_ASM_SSE2 + "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm") + + list(APPEND AOM_DSP_ENCODER_ASM_SSE2_X86_64 + "${AOM_ROOT}/aom_dsp/x86/ssim_sse2_x86_64.asm") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h" + "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c") + if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c") + endif() + + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm" + "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") + + list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c") + + list(APPEND AOM_DSP_ENCODER_AVX_ASM_X86_64 + "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h" + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/masked_sad4d_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h" + "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") + + if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE4_1 + 
"${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c") + endif() + + list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c" + "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c" + "${AOM_ROOT}/aom_dsp/arm/avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/sse_neon.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c" + "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c" + "${AOM_ROOT}/aom_dsp/mips/variance_msa.c" + "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c") + + if(CONFIG_INTERNAL_STATS) + list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c" + "${AOM_ROOT}/aom_dsp/psnrhvs.c" "${AOM_ROOT}/aom_dsp/ssim.c" + "${AOM_ROOT}/aom_dsp/ssim.h") + endif() + + if(CONFIG_TUNE_VMAF) + list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/vmaf.c" + "${AOM_ROOT}/aom_dsp/vmaf.h") + endif() +endif() + +# Creates aom_dsp build targets. Must not be called until after libaom target +# has been created. +function(setup_aom_dsp_targets) + add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES}) + list(APPEND AOM_LIB_TARGETS aom_dsp_common) + create_dummy_source_file("aom_av1" "c" "dummy_source_file") + add_library(aom_dsp OBJECT "${dummy_source_file}") + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + list(APPEND AOM_LIB_TARGETS aom_dsp) + + # Not all generators support libraries consisting only of object files. Add a + # dummy source file to the aom_dsp target. + add_dummy_source_file_to_target("aom_dsp" "c") + + if(CONFIG_AV1_DECODER) + add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES}) + list(APPEND AOM_LIB_TARGETS aom_dsp_decoder) + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + endif() + + if(CONFIG_AV1_ENCODER) + add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES}) + list(APPEND AOM_LIB_TARGETS aom_dsp_encoder) + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + endif() + + if(HAVE_SSE2) + add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2") + add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSE2") + + if(CONFIG_AV1_ENCODER) + if("${AOM_TARGET_CPU}" STREQUAL "x86_64") + list(APPEND AOM_DSP_ENCODER_ASM_SSE2 ${AOM_DSP_ENCODER_ASM_SSE2_X86_64}) + endif() + add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2") + add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSE2") + endif() + endif() + + if(HAVE_SSSE3) + add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3") + add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSSE3") + + if(CONFIG_AV1_ENCODER) + if("${AOM_TARGET_CPU}" STREQUAL "x86_64") + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 + ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}) + endif() + add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3") + add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSSE3") + endif() + endif() + + if(HAVE_SSE4_1) + add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSE4_1") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSE4_1") + endif() + endif() + + if(HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL 
"x86_64") + if(CONFIG_AV1_ENCODER) + add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64") + endif() + endif() + + if(HAVE_AVX2) + add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_AVX2") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_AVX2") + endif() + endif() + + if(HAVE_NEON) + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" + "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" + "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_NEON") + endif() + endif() + + if(HAVE_DSPR2) + add_intrinsics_object_library("" "dspr2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_DSPR2") + endif() + + if(HAVE_MSA) + add_intrinsics_object_library("" "msa" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_MSA") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("" "msa" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_MSA") + endif() + endif() + + target_sources(aom PRIVATE $) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $) + endif() + + # Pass the new lib targets up to the parent scope instance of + # $AOM_LIB_TARGETS. + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction() diff --git a/libs/libaom/src/aom_dsp/aom_dsp_common.h b/libs/libaom/src/aom_dsp/aom_dsp_common.h new file mode 100644 index 000000000..150d35dd1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/aom_dsp_common.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_ +#define AOM_AOM_DSP_AOM_DSP_COMMON_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef MAX_SB_SIZE +#define MAX_SB_SIZE 128 +#endif // ndef MAX_SB_SIZE + +#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y)) +#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y)) +#define AOMSIGN(x) ((x) < 0 ? -1 : 0) + +#define NELEMENTS(x) (int)(sizeof(x) / sizeof(x[0])) + +#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') + +#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0) + +/* Left shifting a negative value became undefined behavior in C99 (downgraded + from merely implementation-defined in C89). This should still compile to the + correct thing on any two's-complement machine, but avoid ubsan warnings.*/ +#define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift))) + +// These can be used to give a hint about branch outcomes. +// This can have an effect, even if your target processor has a +// good branch predictor, as these hints can affect basic block +// ordering by the compiler. +#ifdef __GNUC__ +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) +#else +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) +#endif + +typedef uint8_t qm_val_t; +#define AOM_QM_BITS 5 + +// Note: +// tran_low_t is the datatype used for final transform coefficients. 
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+
+static INLINE uint8_t clip_pixel(int val) {
+  return (val > 255) ? 255 : (val < 0) ? 0 : val;
+}
+
+static INLINE int clamp(int value, int low, int high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE double fclamp(double value, double low, double high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
+  switch (bd) {
+    case 8:
+    default: return (uint16_t)clamp(val, 0, 255);
+    case 10: return (uint16_t)clamp(val, 0, 1023);
+    case 12: return (uint16_t)clamp(val, 0, 4095);
+  }
+}
+
+// The result of this branchless code is equivalent to (value < 0 ? 0 : value)
+// or max(0, value) and might be faster in some cases.
+// Care should be taken, since right-shifting a negative value of signed type
+// is implementation-defined in the C standards (most compilers produce the
+// arithmetic shift this code relies on).
+static INLINE unsigned int negative_to_zero(int value) {
+  return value & ~(value >> (sizeof(value) * 8 - 1));
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_AOM_DSP_COMMON_H_
diff --git a/libs/libaom/src/aom_dsp/aom_dsp_rtcd.c b/libs/libaom/src/aom_dsp/aom_dsp_rtcd.c
new file mode 100644
index 000000000..1514bd64e
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/aom_dsp_rtcd.c
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#define RTCD_C
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/aom_once.h"
+
+void aom_dsp_rtcd() { aom_once(setup_rtcd_internal); }
diff --git a/libs/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl b/libs/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl
new file mode 100644
index 000000000..b7d5a41ba
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -0,0 +1,1785 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
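A note on the two helpers used throughout this file: add_proto declares a function signature and its plain-C fallback, while specialize lists the SIMD flavors for which hand-optimized versions exist. Roughly (hypothetical function name for illustration):

# add_proto qw/void aom_foo/, "uint8_t *dst, int stride";
# specialize qw/aom_foo sse2 neon/;

This expands in the generated aom_dsp_rtcd.h to declarations for aom_foo_c, aom_foo_sse2, and aom_foo_neon, plus an RTCD_EXTERN function pointer named aom_foo; setup_rtcd_internal() (run once via aom_dsp_rtcd(), see aom_dsp_rtcd.c above) points it at the best variant the CPU supports.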
+##
+sub aom_dsp_forward_decls() {
+print <<EOF
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+EOF
+}
+forward_decls qw/aom_dsp_forward_decls/;
+
+# optimizations which depend on multiple features
+$avx2_ssse3 = '';
+if ((aom_config("HAVE_AVX2") eq "yes") && (aom_config("HAVE_SSSE3") eq "yes")) {
+  $avx2_ssse3 = 'avx2';
+}
+
+# functions that are 64 bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+  $mmx_x86_64 = 'mmx';
+  $sse2_x86_64 = 'sse2';
+  $ssse3_x86_64 = 'ssse3';
+  $avx_x86_64 = 'avx';
+  $avx2_x86_64 = 'avx2';
+}
+
+@block_widths = (4, 8, 16, 32, 64, 128);
+
+@block_sizes = ();
+foreach $w (@block_widths) {
+  foreach $h (@block_widths) {
+    push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w);
+  }
+}
+push @block_sizes, [4, 16];
+push @block_sizes, [16, 4];
+push @block_sizes, [8, 32];
+push @block_sizes, [32, 8];
+push @block_sizes, [16, 64];
+push @block_sizes, [64, 16];
+
+@tx_dims = (2, 4, 8, 16, 32, 64);
+@tx_sizes = ();
+foreach $w (@tx_dims) {
+  push @tx_sizes, [$w, $w];
+  foreach $h (@tx_dims) {
+    push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 2*$h || $h == 2*$w));
+    push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w));
+  }
+}
+
+@pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/;
+
+#
+# Intra prediction
+#
+
+foreach (@tx_sizes) {
+  ($w, $h) = @$_;
+  foreach $pred_name (@pred_names) {
+    add_proto "void", "aom_${pred_name}_predictor_${w}x${h}",
+              "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+      add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
+                "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+    }
+  }
+}
+
+specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_top_predictor_4x8 sse2/;
+specialize qw/aom_dc_top_predictor_4x16 sse2/;
+specialize qw/aom_dc_top_predictor_8x4 sse2/;
+specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_top_predictor_8x16 sse2/;
+specialize qw/aom_dc_top_predictor_8x32 sse2/;
+specialize qw/aom_dc_top_predictor_16x4 sse2/;
+specialize qw/aom_dc_top_predictor_16x8 sse2/;
+specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
+
+specialize qw/aom_dc_top_predictor_16x32 sse2/;
+specialize qw/aom_dc_top_predictor_16x64 sse2/;
+specialize qw/aom_dc_top_predictor_32x8 sse2/;
+specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_left_predictor_4x8 sse2/;
+specialize qw/aom_dc_left_predictor_4x16 sse2/;
+specialize qw/aom_dc_left_predictor_8x4 sse2/;
+specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_left_predictor_8x16 sse2/;
+specialize qw/aom_dc_left_predictor_8x32 sse2/;
+specialize qw/aom_dc_left_predictor_16x4 sse2/;
+specialize qw/aom_dc_left_predictor_16x8 sse2/;
+specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_left_predictor_16x32 sse2/;
+specialize qw/aom_dc_left_predictor_16x64 sse2/;
+specialize qw/aom_dc_left_predictor_32x8 sse2/;
+specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_128_predictor_4x8 sse2/;
+specialize qw/aom_dc_128_predictor_4x16 sse2/;
+specialize qw/aom_dc_128_predictor_8x4 sse2/;
+specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_128_predictor_8x16 sse2/;
+specialize qw/aom_dc_128_predictor_8x32 sse2/;
+specialize qw/aom_dc_128_predictor_16x4 sse2/;
+specialize qw/aom_dc_128_predictor_16x8 sse2/;
+specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_128_predictor_16x32 sse2/;
+specialize qw/aom_dc_128_predictor_16x64 sse2/;
+specialize qw/aom_dc_128_predictor_32x8 sse2/;
+specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
+specialize
qw/aom_dc_128_predictor_64x64 sse2 avx2/; +specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/; +specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/; +specialize qw/aom_v_predictor_4x4 neon msa sse2/; +specialize qw/aom_v_predictor_4x8 sse2/; +specialize qw/aom_v_predictor_4x16 sse2/; +specialize qw/aom_v_predictor_8x4 sse2/; +specialize qw/aom_v_predictor_8x8 neon msa sse2/; +specialize qw/aom_v_predictor_8x16 sse2/; +specialize qw/aom_v_predictor_8x32 sse2/; +specialize qw/aom_v_predictor_16x4 sse2/; +specialize qw/aom_v_predictor_16x8 sse2/; +specialize qw/aom_v_predictor_16x16 neon msa sse2/; +specialize qw/aom_v_predictor_16x32 sse2/; +specialize qw/aom_v_predictor_16x64 sse2/; +specialize qw/aom_v_predictor_32x8 sse2/; +specialize qw/aom_v_predictor_32x16 sse2 avx2/; +specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/; +specialize qw/aom_v_predictor_32x64 sse2 avx2/; +specialize qw/aom_v_predictor_64x64 sse2 avx2/; +specialize qw/aom_v_predictor_64x32 sse2 avx2/; +specialize qw/aom_v_predictor_64x16 sse2 avx2/; +specialize qw/aom_h_predictor_4x8 sse2/; +specialize qw/aom_h_predictor_4x16 sse2/; +specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/; +specialize qw/aom_h_predictor_8x4 sse2/; +specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/; +specialize qw/aom_h_predictor_8x16 sse2/; +specialize qw/aom_h_predictor_8x32 sse2/; +specialize qw/aom_h_predictor_16x4 sse2/; +specialize qw/aom_h_predictor_16x8 sse2/; +specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/; +specialize qw/aom_h_predictor_16x32 sse2/; +specialize qw/aom_h_predictor_16x64 sse2/; +specialize qw/aom_h_predictor_32x8 sse2/; +specialize qw/aom_h_predictor_32x16 sse2/; +specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/; +specialize qw/aom_h_predictor_32x64 sse2/; +specialize qw/aom_h_predictor_64x64 sse2/; +specialize qw/aom_h_predictor_64x32 sse2/; +specialize qw/aom_h_predictor_64x16 sse2/; +specialize qw/aom_paeth_predictor_4x4 ssse3/; +specialize qw/aom_paeth_predictor_4x8 ssse3/; +specialize qw/aom_paeth_predictor_4x16 ssse3/; +specialize qw/aom_paeth_predictor_8x4 ssse3/; +specialize qw/aom_paeth_predictor_8x8 ssse3/; +specialize qw/aom_paeth_predictor_8x16 ssse3/; +specialize qw/aom_paeth_predictor_8x32 ssse3/; +specialize qw/aom_paeth_predictor_16x4 ssse3/; +specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/; +specialize qw/aom_paeth_predictor_32x8 ssse3/; +specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/; +specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/; +specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/; +specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x8 ssse3/; +specialize qw/aom_paeth_predictor_16x16 ssse3/; +specialize qw/aom_paeth_predictor_16x32 ssse3/; +specialize qw/aom_paeth_predictor_32x16 ssse3/; +specialize qw/aom_paeth_predictor_32x32 ssse3/; +specialize qw/aom_smooth_predictor_4x4 ssse3/; +specialize qw/aom_smooth_predictor_4x8 ssse3/; +specialize qw/aom_smooth_predictor_4x16 ssse3/; +specialize qw/aom_smooth_predictor_8x4 ssse3/; +specialize qw/aom_smooth_predictor_8x8 ssse3/; +specialize qw/aom_smooth_predictor_8x16 ssse3/; +specialize qw/aom_smooth_predictor_8x32 ssse3/; +specialize qw/aom_smooth_predictor_16x4 ssse3/; +specialize 
qw/aom_smooth_predictor_16x8 ssse3/; +specialize qw/aom_smooth_predictor_16x16 ssse3/; +specialize qw/aom_smooth_predictor_16x32 ssse3/; +specialize qw/aom_smooth_predictor_16x64 ssse3/; +specialize qw/aom_smooth_predictor_32x8 ssse3/; +specialize qw/aom_smooth_predictor_32x16 ssse3/; +specialize qw/aom_smooth_predictor_32x32 ssse3/; +specialize qw/aom_smooth_predictor_32x64 ssse3/; +specialize qw/aom_smooth_predictor_64x64 ssse3/; +specialize qw/aom_smooth_predictor_64x32 ssse3/; +specialize qw/aom_smooth_predictor_64x16 ssse3/; + +specialize qw/aom_smooth_v_predictor_4x4 ssse3/; +specialize qw/aom_smooth_v_predictor_4x8 ssse3/; +specialize qw/aom_smooth_v_predictor_4x16 ssse3/; +specialize qw/aom_smooth_v_predictor_8x4 ssse3/; +specialize qw/aom_smooth_v_predictor_8x8 ssse3/; +specialize qw/aom_smooth_v_predictor_8x16 ssse3/; +specialize qw/aom_smooth_v_predictor_8x32 ssse3/; +specialize qw/aom_smooth_v_predictor_16x4 ssse3/; +specialize qw/aom_smooth_v_predictor_16x8 ssse3/; +specialize qw/aom_smooth_v_predictor_16x16 ssse3/; +specialize qw/aom_smooth_v_predictor_16x32 ssse3/; +specialize qw/aom_smooth_v_predictor_16x64 ssse3/; +specialize qw/aom_smooth_v_predictor_32x8 ssse3/; +specialize qw/aom_smooth_v_predictor_32x16 ssse3/; +specialize qw/aom_smooth_v_predictor_32x32 ssse3/; +specialize qw/aom_smooth_v_predictor_32x64 ssse3/; +specialize qw/aom_smooth_v_predictor_64x64 ssse3/; +specialize qw/aom_smooth_v_predictor_64x32 ssse3/; +specialize qw/aom_smooth_v_predictor_64x16 ssse3/; + +specialize qw/aom_smooth_h_predictor_4x4 ssse3/; +specialize qw/aom_smooth_h_predictor_4x8 ssse3/; +specialize qw/aom_smooth_h_predictor_4x16 ssse3/; +specialize qw/aom_smooth_h_predictor_8x4 ssse3/; +specialize qw/aom_smooth_h_predictor_8x8 ssse3/; +specialize qw/aom_smooth_h_predictor_8x16 ssse3/; +specialize qw/aom_smooth_h_predictor_8x32 ssse3/; +specialize qw/aom_smooth_h_predictor_16x4 ssse3/; +specialize qw/aom_smooth_h_predictor_16x8 ssse3/; +specialize qw/aom_smooth_h_predictor_16x16 ssse3/; +specialize qw/aom_smooth_h_predictor_16x32 ssse3/; +specialize qw/aom_smooth_h_predictor_16x64 ssse3/; +specialize qw/aom_smooth_h_predictor_32x8 ssse3/; +specialize qw/aom_smooth_h_predictor_32x16 ssse3/; +specialize qw/aom_smooth_h_predictor_32x32 ssse3/; +specialize qw/aom_smooth_h_predictor_32x64 ssse3/; +specialize qw/aom_smooth_h_predictor_64x64 ssse3/; +specialize qw/aom_smooth_h_predictor_64x32 ssse3/; +specialize qw/aom_smooth_h_predictor_64x16 ssse3/; + +# TODO(yunqingwang): optimize rectangular DC_PRED to replace division +# by multiply and shift. 
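The TODO above refers to the usual reciprocal-multiply trick. A sketch (illustrative only, not the library's implementation) for the 4x8 case, where DC_PRED averages 4 above plus 8 left pixels, i.e. divides by 12:

/* The sum of twelve 8-bit pixels is at most 12 * 255 == 3060, and
 * 2731 == ceil(2^15 / 12), so the identity
 *   (sum + 6) / 12 == ((sum + 6) * 2731) >> 15
 * holds exactly over the whole input range (it is exact for any
 * numerator below 8192), replacing the division with a multiply-shift. */
static int dc_avg_4x8(int sum_above_plus_left) {
  return ((sum_above_plus_left + 6) * 2731) >> 15;
}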
+specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/; +specialize qw/aom_dc_predictor_4x8 sse2/; +specialize qw/aom_dc_predictor_4x16 sse2/; +specialize qw/aom_dc_predictor_8x4 sse2/; +specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/; +specialize qw/aom_dc_predictor_8x16 sse2/; +specialize qw/aom_dc_predictor_8x32 sse2/; +specialize qw/aom_dc_predictor_16x4 sse2/; +specialize qw/aom_dc_predictor_16x8 sse2/; +specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/; +specialize qw/aom_dc_predictor_16x32 sse2/; +specialize qw/aom_dc_predictor_16x64 sse2/; +specialize qw/aom_dc_predictor_32x8 sse2/; +specialize qw/aom_dc_predictor_32x16 sse2 avx2/; +specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/; +specialize qw/aom_dc_predictor_32x64 sse2 avx2/; +specialize qw/aom_dc_predictor_64x64 sse2 avx2/; +specialize qw/aom_dc_predictor_64x32 sse2 avx2/; +specialize qw/aom_dc_predictor_64x16 sse2 avx2/; +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + specialize qw/aom_highbd_v_predictor_4x4 sse2/; + specialize qw/aom_highbd_v_predictor_4x8 sse2/; + specialize qw/aom_highbd_v_predictor_8x4 sse2/; + specialize qw/aom_highbd_v_predictor_8x8 sse2/; + specialize qw/aom_highbd_v_predictor_8x16 sse2/; + specialize qw/aom_highbd_v_predictor_16x8 sse2/; + specialize qw/aom_highbd_v_predictor_16x16 sse2/; + specialize qw/aom_highbd_v_predictor_16x32 sse2/; + specialize qw/aom_highbd_v_predictor_32x16 sse2/; + specialize qw/aom_highbd_v_predictor_32x32 sse2/; + + # TODO(yunqingwang): optimize rectangular DC_PRED to replace division + # by multiply and shift. + specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_predictor_8x4 sse2/;; + specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/;; + specialize qw/aom_highbd_dc_predictor_8x16 sse2/;; + specialize qw/aom_highbd_dc_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_64x64 neon/; + + specialize qw/aom_highbd_h_predictor_4x4 sse2/; + specialize qw/aom_highbd_h_predictor_4x8 sse2/; + specialize qw/aom_highbd_h_predictor_8x4 sse2/; + specialize qw/aom_highbd_h_predictor_8x8 sse2/; + specialize qw/aom_highbd_h_predictor_8x16 sse2/; + specialize qw/aom_highbd_h_predictor_16x8 sse2/; + specialize qw/aom_highbd_h_predictor_16x16 sse2/; + specialize qw/aom_highbd_h_predictor_16x32 sse2/; + specialize qw/aom_highbd_h_predictor_32x16 sse2/; + specialize qw/aom_highbd_h_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/; + specialize qw/aom_highbd_dc_128_predictor_8x16 sse2/; + specialize 
qw/aom_highbd_dc_left_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_128_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_128_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_128_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_128_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/; +} +# +# Sub Pixel Filters +# +add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h"; +add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; + +specialize qw/aom_convolve_copy sse2 /; +specialize qw/aom_convolve8_horiz sse2 ssse3/, "$avx2_ssse3"; +specialize qw/aom_convolve8_vert sse2 ssse3/, "$avx2_ssse3"; + +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd"; + specialize qw/aom_highbd_convolve_copy sse2 avx2/; + + add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; + specialize qw/aom_highbd_convolve8_horiz sse2 avx2/; + + add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; + specialize qw/aom_highbd_convolve8_vert sse2 avx2/; +} + +# +# Loopfilter +# +add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_14 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_14_dual sse2/; + +add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_6 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_8 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const 
uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_8_dual sse2/; + +add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_4 sse2 neon/; + +add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_4_dual sse2/; + +add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_14 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_14_dual sse2/; + +add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_6 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_6_dual sse2/; + +add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_8 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_8_dual sse2/; + +add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_4 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_4_dual sse2/; + +add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_6_dual sse2/; + +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_vertical_14 sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_vertical_8 sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, 
int bd"; + specialize qw/aom_highbd_lpf_vertical_6 sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_vertical_6_dual sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_vertical_4 sse2/; + + add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_horizontal_14 sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1,int bd"; + specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_horizontal_6 sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_horizontal_8 sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/; + + add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; + specialize qw/aom_highbd_lpf_horizontal_4 sse2/; + + add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; + specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/; +} + +# +# Encoder functions. 
+# + +# +# Forward transform +# +if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){ + add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/aom_fdct4x4 neon sse2/; + + add_proto qw/void aom_fdct4x4_lp/, "const int16_t *input, int16_t *output, int stride"; + specialize qw/aom_fdct4x4_lp neon sse2/; + + add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/aom_fdct8x8 neon sse2/, "$ssse3_x86_64"; + # High bit depth + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/aom_highbd_fdct8x8 sse2/; + } + # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation) + add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output"; + + add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft4x4_float sse2/; + + add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft8x8_float avx2 sse2/; + + add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft16x16_float avx2 sse2/; + + add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft32x32_float avx2 sse2/; + + add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output"; + + add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft4x4_float sse2/; + + add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft8x8_float avx2 sse2/; + + add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft16x16_float avx2 sse2/; + + add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft32x32_float avx2 sse2/; +} # CONFIG_AV1_ENCODER + +# +# Quantization +# +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64"; + + add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_adaptive sse2 avx2/; + + add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64"; + + add_proto qw/void aom_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t 
*round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_32x32_adaptive sse2/; + + add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_64x64 ssse3/; + + add_proto qw/void aom_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_64x64_adaptive sse2/; +} # CONFIG_AV1_ENCODER + +if (aom_config("CONFIG_AV1_ENCODER") eq "yes" && aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b sse2 avx2/; + + add_proto qw/void aom_highbd_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_adaptive sse2 avx2/; + + add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_32x32 sse2/; + + add_proto qw/void aom_highbd_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_32x32_adaptive sse2 avx2/; + + add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_64x64 sse2/; + + add_proto qw/void aom_highbd_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const 
int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_64x64_adaptive sse2/; +} # CONFIG_AV1_ENCODER + +# +# Alpha blending with mask +# +add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params"; +specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/; +add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh"; +add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; +add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; +specialize "aom_blend_a64_mask", qw/sse4_1 avx2/; +specialize "aom_blend_a64_hmask", qw/sse4_1 neon/; +specialize "aom_blend_a64_vmask", qw/sse4_1 neon/; + +if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd"; + add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; + add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; + add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd"; + specialize "aom_highbd_blend_a64_mask", qw/sse4_1/; + specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/; + specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/; + specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 avx2/; +} + +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + # + # Block subtraction + # + add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; + specialize qw/aom_subtract_block neon msa sse2 avx2/; + + add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; + specialize qw/aom_sse sse4_1 avx2 neon/; + + add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum"; + specialize qw/aom_get_blk_sse_sum sse2 avx2/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; + specialize 
qw/aom_highbd_subtract_block sse2/; + + add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height"; + specialize qw/aom_highbd_sse sse4_1 avx2 neon/; + } + + if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + # + # Sum of Squares + # + add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; + specialize qw/aom_sum_squares_2d_i16 sse2 avx2/; + + add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; + specialize qw/aom_sum_squares_i16 sse2/; + + add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height"; + specialize qw/aom_var_2d_u8 sse2 avx2/; + + add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height"; + specialize qw/aom_var_2d_u16 sse2 avx2/; + } + + # + # Single block SAD / Single block Avg SAD + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + add_proto qw/unsigned int/, "aom_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param"; + } + + specialize qw/aom_sad128x128 avx2 sse2/; + specialize qw/aom_sad128x64 avx2 sse2/; + specialize qw/aom_sad64x128 avx2 sse2/; + specialize qw/aom_sad64x64 avx2 neon msa sse2/; + specialize qw/aom_sad64x32 avx2 msa sse2/; + specialize qw/aom_sad32x64 avx2 msa sse2/; + specialize qw/aom_sad32x32 avx2 neon msa sse2/; + specialize qw/aom_sad32x16 avx2 msa sse2/; + specialize qw/aom_sad16x32 msa sse2/; + specialize qw/aom_sad16x16 neon msa sse2/; + specialize qw/aom_sad16x8 neon msa sse2/; + specialize qw/aom_sad8x16 neon msa sse2/; + specialize qw/aom_sad8x8 neon msa sse2/; + specialize qw/aom_sad8x4 msa sse2/; + specialize qw/aom_sad4x8 msa sse2/; + specialize qw/aom_sad4x4 neon msa sse2/; + + specialize qw/aom_sad128x128_avg avx2 sse2/; + specialize qw/aom_sad128x64_avg avx2 sse2/; + specialize qw/aom_sad64x128_avg avx2 sse2/; + specialize qw/aom_sad64x64_avg avx2 msa sse2/; + specialize qw/aom_sad64x32_avg avx2 msa sse2/; + specialize qw/aom_sad32x64_avg avx2 msa sse2/; + specialize qw/aom_sad32x32_avg avx2 msa sse2/; + specialize qw/aom_sad32x16_avg avx2 msa sse2/; + specialize qw/aom_sad16x32_avg msa sse2/; + specialize qw/aom_sad16x16_avg msa sse2/; + specialize qw/aom_sad16x8_avg msa sse2/; + specialize qw/aom_sad8x16_avg msa sse2/; + specialize qw/aom_sad8x8_avg msa sse2/; + specialize qw/aom_sad8x4_avg msa sse2/; + specialize qw/aom_sad4x8_avg msa sse2/; + specialize qw/aom_sad4x4_avg msa sse2/; + + specialize qw/aom_sad4x16 sse2/; + specialize qw/aom_sad16x4 sse2/; + specialize qw/aom_sad8x32 sse2/; + specialize qw/aom_sad32x8 sse2/; + specialize qw/aom_sad16x64 sse2/; + specialize qw/aom_sad64x16 sse2/; + + specialize qw/aom_sad4x16_avg sse2/; + specialize qw/aom_sad16x4_avg sse2/; + specialize qw/aom_sad8x32_avg sse2/; + specialize qw/aom_sad32x8_avg sse2/; + specialize qw/aom_sad16x64_avg sse2/; + specialize qw/aom_sad64x16_avg sse2/; + + specialize qw/aom_dist_wtd_sad128x128_avg ssse3/; + specialize qw/aom_dist_wtd_sad128x64_avg ssse3/; + specialize qw/aom_dist_wtd_sad64x128_avg ssse3/; + specialize 
qw/aom_dist_wtd_sad64x64_avg ssse3/; + specialize qw/aom_dist_wtd_sad64x32_avg ssse3/; + specialize qw/aom_dist_wtd_sad32x64_avg ssse3/; + specialize qw/aom_dist_wtd_sad32x32_avg ssse3/; + specialize qw/aom_dist_wtd_sad32x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x32_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x8_avg ssse3/; + specialize qw/aom_dist_wtd_sad8x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad8x8_avg ssse3/; + specialize qw/aom_dist_wtd_sad8x4_avg ssse3/; + specialize qw/aom_dist_wtd_sad4x8_avg ssse3/; + specialize qw/aom_dist_wtd_sad4x4_avg ssse3/; + + specialize qw/aom_dist_wtd_sad4x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x4_avg ssse3/; + specialize qw/aom_dist_wtd_sad8x32_avg ssse3/; + specialize qw/aom_dist_wtd_sad32x8_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x64_avg ssse3/; + specialize qw/aom_dist_wtd_sad64x16_avg ssse3/; + + add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad16xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad32xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad64xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad128xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + + specialize qw/aom_sad4xh sse2/; + specialize qw/aom_sad8xh sse2/; + specialize qw/aom_sad16xh sse2/; + specialize qw/aom_sad32xh sse2/; + specialize qw/aom_sad64xh sse2/; + specialize qw/aom_sad128xh sse2/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + if ($w != 128 && $h != 128 && $w != 4) { + specialize "aom_highbd_sad${w}x${h}", qw/sse2/; + specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/; + } + add_proto qw/unsigned int/, "aom_highbd_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param"; + } + specialize qw/aom_highbd_sad128x128 avx2/; + specialize qw/aom_highbd_sad128x64 avx2/; + specialize qw/aom_highbd_sad64x128 avx2/; + specialize qw/aom_highbd_sad64x64 avx2 sse2/; + specialize qw/aom_highbd_sad64x32 avx2 sse2/; + specialize qw/aom_highbd_sad32x64 avx2 sse2/; + specialize qw/aom_highbd_sad32x32 avx2 sse2/; + specialize qw/aom_highbd_sad32x16 avx2 sse2/; + specialize qw/aom_highbd_sad16x32 avx2 sse2/; + specialize qw/aom_highbd_sad16x16 avx2 sse2/; + specialize qw/aom_highbd_sad16x8 avx2 sse2/; + specialize qw/aom_highbd_sad8x4 sse2/; + specialize qw/aom_highbd_sad4x8 sse2/; + specialize qw/aom_highbd_sad4x4 sse2/; + + specialize qw/aom_highbd_sad128x128_avg avx2/; + specialize qw/aom_highbd_sad128x64_avg avx2/; + specialize qw/aom_highbd_sad64x128_avg avx2/; + specialize qw/aom_highbd_sad64x64_avg 
avx2 sse2/; + specialize qw/aom_highbd_sad64x32_avg avx2 sse2/; + specialize qw/aom_highbd_sad32x64_avg avx2 sse2/; + specialize qw/aom_highbd_sad32x32_avg avx2 sse2/; + specialize qw/aom_highbd_sad32x16_avg avx2 sse2/; + specialize qw/aom_highbd_sad16x32_avg avx2 sse2/; + specialize qw/aom_highbd_sad16x16_avg avx2 sse2/; + specialize qw/aom_highbd_sad16x8_avg avx2 sse2/; + specialize qw/aom_highbd_sad8x4_avg sse2/; + specialize qw/aom_highbd_sad4x8_avg sse2/; + specialize qw/aom_highbd_sad4x4_avg sse2/; + + specialize qw/aom_highbd_sad4x16 sse2/; + specialize qw/aom_highbd_sad16x4 avx2 sse2/; + specialize qw/aom_highbd_sad8x32 sse2/; + specialize qw/aom_highbd_sad32x8 avx2 sse2/; + specialize qw/aom_highbd_sad16x64 avx2 sse2/; + specialize qw/aom_highbd_sad64x16 avx2 sse2/; + + specialize qw/aom_highbd_sad4x16_avg sse2/; + specialize qw/aom_highbd_sad16x4_avg avx2 sse2/; + specialize qw/aom_highbd_sad8x32_avg sse2/; + specialize qw/aom_highbd_sad32x8_avg avx2 sse2/; + specialize qw/aom_highbd_sad16x64_avg avx2 sse2/; + specialize qw/aom_highbd_sad64x16_avg avx2 sse2/; + } + # + # Masked SAD + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask"; + specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/; + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask"; + specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/; + } + } + + # + # OBMC SAD + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; + if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { + specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/; + } + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; + if (! 
(($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { + specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/; + } + } + } + + # + # Multi-block SAD, comparing a reference to N independent blocks + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_sad${w}x${h}x4d_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, const uint8_t *second_pred, uint32_t *sad_array"; + add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[]"; + } + + specialize qw/aom_sad128x128x4d avx2 sse2/; + specialize qw/aom_sad128x64x4d avx2 sse2/; + specialize qw/aom_sad64x128x4d avx2 sse2/; + specialize qw/aom_sad64x64x4d avx2 neon msa sse2/; + specialize qw/aom_sad64x32x4d avx2 msa sse2/; + specialize qw/aom_sad64x16x4d avx2 sse2/; + specialize qw/aom_sad32x64x4d avx2 msa sse2/; + specialize qw/aom_sad32x32x4d avx2 neon msa sse2/; + specialize qw/aom_sad32x16x4d avx2 msa sse2/; + specialize qw/aom_sad32x8x4d avx2 sse2/; + specialize qw/aom_sad16x64x4d sse2/; + specialize qw/aom_sad16x32x4d msa sse2/; + specialize qw/aom_sad16x16x4d neon msa sse2/; + specialize qw/aom_sad16x8x4d msa sse2/; + + specialize qw/aom_sad8x16x4d msa sse2/; + specialize qw/aom_sad8x8x4d msa sse2/; + specialize qw/aom_sad8x4x4d msa sse2/; + specialize qw/aom_sad4x16x4d msa sse2/; + specialize qw/aom_sad4x8x4d msa sse2/; + specialize qw/aom_sad4x4x4d msa sse2/; + + specialize qw/aom_sad4x32x4d sse2/; + specialize qw/aom_sad4x16x4d sse2/; + specialize qw/aom_sad16x4x4d sse2/; + specialize qw/aom_sad8x32x4d sse2/; + specialize qw/aom_sad32x8x4d sse2/; + specialize qw/aom_sad64x16x4d sse2/; + + specialize qw/aom_sad128x128x4d_avg sse2/; + specialize qw/aom_sad128x64x4d_avg sse2/; + specialize qw/aom_sad64x128x4d_avg sse2/; + specialize qw/aom_sad64x64x4d_avg sse2/; + specialize qw/aom_sad64x32x4d_avg sse2/; + specialize qw/aom_sad64x16x4d_avg sse2/; + specialize qw/aom_sad32x64x4d_avg sse2/; + specialize qw/aom_sad32x32x4d_avg sse2/; + specialize qw/aom_sad32x16x4d_avg sse2/; + specialize qw/aom_sad32x8x4d_avg sse2/; + specialize qw/aom_sad16x64x4d_avg sse2/; + specialize qw/aom_sad16x32x4d_avg sse2/; + specialize qw/aom_sad16x16x4d_avg sse2/; + specialize qw/aom_sad16x8x4d_avg sse2/; + + specialize qw/aom_sad8x16x4d_avg sse2/; + specialize qw/aom_sad8x8x4d_avg sse2/; + specialize qw/aom_sad8x4x4d_avg sse2/; + specialize qw/aom_sad4x16x4d_avg sse2/; + specialize qw/aom_sad4x8x4d_avg sse2/; + specialize qw/aom_sad4x4x4d_avg sse2/; + + specialize qw/aom_sad4x32x4d_avg sse2/; + specialize qw/aom_sad4x16x4d_avg sse2/; + specialize qw/aom_sad16x4x4d_avg sse2/; + specialize qw/aom_sad8x32x4d_avg sse2/; + specialize qw/aom_sad32x8x4d_avg sse2/; + specialize qw/aom_sad64x16x4d_avg sse2/; + + specialize qw/aom_masked_sad128x128x4d ssse3/; + specialize qw/aom_masked_sad128x64x4d ssse3/; + specialize qw/aom_masked_sad64x128x4d ssse3/; + specialize qw/aom_masked_sad64x64x4d ssse3/; + specialize qw/aom_masked_sad64x32x4d ssse3/; + specialize qw/aom_masked_sad64x16x4d ssse3/; + specialize qw/aom_masked_sad32x64x4d ssse3/; + specialize qw/aom_masked_sad32x32x4d ssse3/; + specialize qw/aom_masked_sad32x16x4d ssse3/; + specialize qw/aom_masked_sad32x8x4d ssse3/; + specialize 
qw/aom_masked_sad16x64x4d ssse3/; + specialize qw/aom_masked_sad16x32x4d ssse3/; + specialize qw/aom_masked_sad16x16x4d ssse3/; + specialize qw/aom_masked_sad16x8x4d ssse3/; + + specialize qw/aom_masked_sad8x16x4d ssse3/; + specialize qw/aom_masked_sad8x8x4d ssse3/; + specialize qw/aom_masked_sad8x4x4d ssse3/; + specialize qw/aom_masked_sad4x16x4d ssse3/; + specialize qw/aom_masked_sad4x8x4d ssse3/; + specialize qw/aom_masked_sad4x4x4d ssse3/; + + specialize qw/aom_masked_sad4x32x4d ssse3/; + specialize qw/aom_masked_sad4x16x4d ssse3/; + specialize qw/aom_masked_sad16x4x4d ssse3/; + specialize qw/aom_masked_sad8x32x4d ssse3/; + specialize qw/aom_masked_sad32x8x4d ssse3/; + specialize qw/aom_masked_sad64x16x4d ssse3/; + # + # Multi-block SAD, comparing a reference to N independent blocks + # + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; + if ($w != 128 && $h != 128) { + specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/; + } + } + specialize qw/aom_highbd_sad128x128x4d avx2/; + specialize qw/aom_highbd_sad128x64x4d avx2/; + specialize qw/aom_highbd_sad64x128x4d avx2/; + specialize qw/aom_highbd_sad64x64x4d sse2 avx2/; + specialize qw/aom_highbd_sad64x32x4d sse2 avx2/; + specialize qw/aom_highbd_sad32x64x4d sse2 avx2/; + specialize qw/aom_highbd_sad32x32x4d sse2 avx2/; + specialize qw/aom_highbd_sad32x16x4d sse2 avx2/; + specialize qw/aom_highbd_sad16x32x4d sse2 avx2/; + specialize qw/aom_highbd_sad16x16x4d sse2 avx2/; + specialize qw/aom_highbd_sad16x8x4d sse2 avx2/; + specialize qw/aom_highbd_sad8x16x4d sse2/; + specialize qw/aom_highbd_sad8x8x4d sse2/; + specialize qw/aom_highbd_sad8x4x4d sse2/; + specialize qw/aom_highbd_sad4x8x4d sse2/; + specialize qw/aom_highbd_sad4x4x4d sse2/; + + specialize qw/aom_highbd_sad4x16x4d sse2/; + specialize qw/aom_highbd_sad16x4x4d avx2 sse2/; + specialize qw/aom_highbd_sad8x32x4d sse2/; + specialize qw/aom_highbd_sad32x8x4d avx2 sse2/; + specialize qw/aom_highbd_sad16x64x4d avx2 sse2/; + specialize qw/aom_highbd_sad64x16x4d avx2 sse2/; + } + # + # Avg + # + add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p"; + specialize qw/aom_avg_8x8 sse2 neon/; + + add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p"; + specialize qw/aom_avg_4x4 sse2 neon/; + + add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + specialize qw/aom_minmax_8x8 sse2/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p"; + add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p"; + add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + } + + add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height"; + specialize qw/aom_int_pro_row sse2/; + + add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width"; + specialize qw/aom_int_pro_col sse2/; + + add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl"; + # TODO(kyslov@) bring back SSE2 by extending it to 128 block size + #specialize qw/aom_vector_var sse2/; + + # + # Hadamard transform and SATD for implementing the temporal dependency model + # + add_proto qw/void aom_hadamard_8x8/, "const 
int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_hadamard_8x8 sse2 neon/; + + add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_hadamard_16x16 avx2 sse2 neon/; + + add_proto qw/void aom_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_hadamard_32x32 avx2 sse2/; + + add_proto qw/void aom_hadamard_lp_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; + specialize qw/aom_hadamard_lp_8x8 sse2 neon/; + + add_proto qw/void aom_hadamard_lp_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; + specialize qw/aom_hadamard_lp_16x16 avx2 neon/; + + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_highbd_hadamard_8x8 avx2/; + + add_proto qw/void aom_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_highbd_hadamard_16x16 avx2/; + + add_proto qw/void aom_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/aom_highbd_hadamard_32x32 avx2/; + } + add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length"; + specialize qw/aom_satd avx2/; + + add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length"; + specialize qw/aom_satd_lp avx2 neon/; + + + # + # Structured Similarity (SSIM) + # + if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") { + add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64"; + + add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64"; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + } + } +} # CONFIG_AV1_ENCODER + +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + + # + # Specialty Variance + # + add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + specialize qw/aom_get16x16var neon msa/; + specialize qw/aom_get8x8var sse2 neon msa/; + + + add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + + specialize qw/aom_mse16x16 sse2 avx2 neon msa/; + specialize qw/aom_mse16x8 sse2 
msa/; + specialize qw/aom_mse8x16 sse2 msa/; + specialize qw/aom_mse8x8 sse2 msa/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach $bd (8, 10, 12) { + add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + + specialize "aom_highbd_${bd}_mse16x16", qw/sse2/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2/; + } + } + + # + # + # + add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search"; + specialize qw/aom_upsampled_pred sse2/; + + add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search"; + specialize qw/aom_comp_avg_upsampled_pred sse2/; + + add_proto qw/void aom_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search"; + specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3/; + + add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int subpel_search"; + specialize qw/aom_comp_mask_upsampled_pred sse2/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search"; + specialize qw/aom_highbd_upsampled_pred sse2/; + + add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search"; + specialize 
qw/aom_highbd_comp_avg_upsampled_pred sse2/; + + add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search"; + specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/; + } + + # + # + # + add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *"; + add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; + + specialize qw/aom_get_mb_ss sse2 msa/; + specialize qw/aom_get4x4sse_cs neon msa/; + + # + # Variance / Subpixel Variance / Subpixel Avg Variance + # + add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param"; + } + specialize qw/aom_variance128x128 sse2 avx2 neon /; + specialize qw/aom_variance128x64 sse2 avx2 /; + specialize qw/aom_variance64x128 sse2 avx2 /; + specialize qw/aom_variance64x64 sse2 avx2 neon msa/; + specialize qw/aom_variance64x32 sse2 avx2 neon msa/; + specialize qw/aom_variance32x64 sse2 avx2 neon msa/; + specialize qw/aom_variance32x32 sse2 avx2 neon msa/; + specialize qw/aom_variance32x16 sse2 avx2 msa/; + specialize qw/aom_variance16x32 sse2 avx2 msa/; + specialize qw/aom_variance16x16 sse2 avx2 neon msa/; + specialize qw/aom_variance16x8 sse2 avx2 neon msa/; + specialize qw/aom_variance8x16 sse2 neon msa/; + specialize qw/aom_variance8x8 sse2 neon msa/; + specialize qw/aom_variance8x4 sse2 msa/; + specialize qw/aom_variance4x8 sse2 msa/; + specialize qw/aom_variance4x4 sse2 msa/; + + specialize qw/aom_sub_pixel_variance128x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance128x64 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x64 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x16 avx2 msa sse2 ssse3/; + 
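+  #
+  # The sub-pixel variants in this run first apply a bilinear filter at
+  # (xoffset, yoffset), given in eighth-pel steps, and then measure variance
+  # against the reference. A sketch of the underlying reference computation
+  # (illustrative, mirroring the usual _c implementation: the returned value
+  # is the SSE minus sum^2 / (w * h), i.e. the unnormalized variance):
+  #
+  #   uint32_t variance(const uint8_t *src, int src_stride,
+  #                     const uint8_t *ref, int ref_stride,
+  #                     int w, int h, uint32_t *sse) {
+  #     int64_t sum = 0;
+  #     uint64_t sq = 0;
+  #     for (int i = 0; i < h; ++i)
+  #       for (int j = 0; j < w; ++j) {
+  #         const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
+  #         sum += d;
+  #         sq += d * d;
+  #       }
+  #     *sse = (uint32_t)sq;
+  #     return (uint32_t)(sq - (uint64_t)(((int64_t)sum * sum) / (w * h)));
+  #   }
+  #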
specialize qw/aom_sub_pixel_variance16x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x16 avx2 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x8 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x8 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/; + + specialize qw/aom_sub_pixel_avg_variance128x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance128x64 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x64 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x16 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; + + specialize qw/aom_variance4x16 sse2/; + specialize qw/aom_variance16x4 sse2 avx2/; + specialize qw/aom_variance8x32 sse2/; + specialize qw/aom_variance32x8 sse2 avx2/; + specialize qw/aom_variance16x64 sse2 avx2/; + specialize qw/aom_variance64x16 sse2 avx2/; + + specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x4 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x64 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/; + + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4 ssse3/; + + specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 ssse3/; + specialize 
qw/aom_dist_wtd_sub_pixel_avg_variance8x32 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/; + + specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128 ssse3/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach $bd (8, 10, 12) { + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + if ($w != 128 && $h != 128 && $w != 4 && $h != 4) { + specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2"; + } + # TODO(david.barker): When ext-partition-types is enabled, we currently + # don't have vectorized 4x16 highbd variance functions + if ($w == 4 && $h == 4) { + specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1"; + } + if ($w != 128 && $h != 128 && $w != 4) { + specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/; + } + if ($w == 4 && $h == 4) { + specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1"; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1"; + } + + add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param"; + } + } + } + # + # Masked Variance / Masked Subpixel Variance + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; + specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/; + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach $bd ("_8_", "_10_", "_12_") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; 
+ specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/; + } + } + } + + # + # OBMC Variance / OBMC Subpixel Variance + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/; + specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/; + } + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + foreach $bd ("_", "_10_", "_12_") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/; + } + } + } + + add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; + + add_proto qw/uint32_t 
aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; + + add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; + + # + # Comp Avg + # + add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; + + add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param"; + specialize qw/aom_dist_wtd_comp_avg_pred ssse3/; + + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + + add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance128x128 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance128x64 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x128 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x64 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x32 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x64 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x32 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x16 sse2/; + + 
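+    # Note on the highbd variants in this block: the uint8_t pointers in the
+    # prototypes actually carry 16-bit samples. Under the usual libaom
+    # convention a caller wraps its uint16_t buffers with CONVERT_TO_BYTEPTR
+    # and the kernel unwraps them again; a hypothetical call (illustrative
+    # only, with made-up buffers):
+    #
+    #   uint16_t src[32 * 16], ref[32 * 16];  /* 12-bit samples */
+    #   unsigned int sse;
+    #   unsigned int var = aom_highbd_12_variance32x16(
+    #       CONVERT_TO_BYTEPTR(src), 32, CONVERT_TO_BYTEPTR(ref), 32, &sse);
+    #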
add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x32 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x16 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x8 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance8x16 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance8x8 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance128x128 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance128x64 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x128 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x64 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x32 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x64 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x32 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x16 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x32 sse2 avx2/; + + add_proto 
qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x16 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x8 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance8x16 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance8x8 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance128x128 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance128x64 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x128 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x64 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x32 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x64 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x32 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x16 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x32 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x16 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x8 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance8x16 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance8x8 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/void aom_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/void aom_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/void aom_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + + add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_mse16x16 sse2/; + + add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_mse8x8 sse2/; + + add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_mse16x16 sse2/; + + add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_mse8x8 sse2/; + + add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, 
const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_mse16x16 sse2/; + + add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_mse8x8 sse2/; + + add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; + + add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param"; + specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2/; + } + # + # Subpixel Variance + # + if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/; + + add_proto qw/uint32_t 
aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const 
uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
+
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  }
+
+
+  add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+  specialize qw/aom_comp_mask_pred ssse3 avx2/;
+
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+    specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
+  }
+
+} # CONFIG_AV1_ENCODER
+
+1;
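[Editorial note] The add_proto/specialize pairs above are libaom's RTCD ("run-time CPU detection") definitions: add_proto declares the signature of a DSP function, and specialize names the SIMD flavors that exist for it (entries with no specialize line get only the portable C version). The C below is a hedged sketch of the kind of dispatch the RTCD generator derives from one such pair; the real aom_dsp_rtcd.h produced at build time differs in detail, and the HAS_SSE2 value and setup_rtcd_internal shape here are illustrative assumptions patterned on the generated code, not part of this patch:

#include <stdint.h>

/* Illustrative CPU-capability flag; the generated headers define the real ones. */
#define HAS_SSE2 0x02

/* The generator emits a _c prototype plus one prototype per specialized ISA... */
unsigned int aom_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride,
                                     const uint8_t *ref_ptr, int recon_stride,
                                     unsigned int *sse);
unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src_ptr,
                                        int source_stride,
                                        const uint8_t *ref_ptr,
                                        int recon_stride, unsigned int *sse);

/* ...plus a function pointer that is bound once, at startup, to the best
 * implementation the running CPU supports. */
unsigned int (*aom_highbd_8_mse16x16)(const uint8_t *src_ptr,
                                      int source_stride,
                                      const uint8_t *ref_ptr, int recon_stride,
                                      unsigned int *sse);

static void setup_rtcd_internal(int flags) {
  aom_highbd_8_mse16x16 = aom_highbd_8_mse16x16_c; /* portable fallback */
  if (flags & HAS_SSE2) aom_highbd_8_mse16x16 = aom_highbd_8_mse16x16_sse2;
}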
diff --git a/libs/libaom/src/aom_dsp/aom_filter.h b/libs/libaom/src/aom_dsp/aom_filter.h
new file mode 100644
index 000000000..00686ac38
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/aom_filter.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_FILTER_H_
+#define AOM_AOM_DSP_AOM_FILTER_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+
+#define SCALE_SUBPEL_BITS 10
+#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
+#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
+#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
+#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
+
+#define RS_SUBPEL_BITS 6
+#define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1)
+#define RS_SCALE_SUBPEL_BITS 14
+#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
+#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
+#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1))
+
+typedef int16_t InterpKernel[SUBPEL_TAPS];
+
+#define BIL_SUBPEL_BITS 3
+#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
+
+// 2 tap bilinear filters
+static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
+  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_AOM_FILTER_H_
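[Editorial note] In aom_filter.h above, each row of bilinear_filters_2t sums to 128 = (1 << FILTER_BITS), so a filtered value is normalized with a rounding shift of FILTER_BITS. A minimal usage sketch, assuming the header's macros are in scope; bilinear_sample is a hypothetical helper for illustration, not part of the patch:

#include <stdint.h>

/* Filter one pixel pair with the 2-tap bilinear kernel selected by a
 * 3-bit subpel offset (0..7); the two taps always sum to 128. */
static uint8_t bilinear_sample(const uint8_t *src, int subpel) {
  const uint8_t *k = bilinear_filters_2t[subpel];
  const int acc = src[0] * k[0] + src[1] * k[1];
  return (uint8_t)((acc + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
}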
diff --git a/libs/libaom/src/aom_dsp/aom_simd.h b/libs/libaom/src/aom_dsp/aom_simd.h
new file mode 100644
index 000000000..ab950ca55
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/aom_simd.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_SIMD_H_
+#define AOM_AOM_DSP_AOM_SIMD_H_
+
+#include <stdint.h>
+
+#if defined(_WIN32)
+#include <intrin.h>
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_simd_inline.h"
+
+#define SIMD_CHECK 1  // Sanity checks in C equivalents
+
+#if HAVE_NEON
+#include "simd/v256_intrinsics_arm.h"
+// VS compiling for 32 bit targets does not support vector types in
+// structs as arguments, which makes the v256 type of the intrinsics
+// hard to support, so optimizations for this target are disabled.
+#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
+#include "simd/v256_intrinsics_x86.h"
+#else
+#include "simd/v256_intrinsics.h"
+#endif
+
+#endif  // AOM_AOM_DSP_AOM_SIMD_H_
diff --git a/libs/libaom/src/aom_dsp/aom_simd_inline.h b/libs/libaom/src/aom_dsp/aom_simd_inline.h
new file mode 100644
index 000000000..eb333f6f6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/aom_simd_inline.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_
+#define AOM_AOM_DSP_AOM_SIMD_INLINE_H_
+
+#include "aom/aom_integer.h"
+
+#ifndef SIMD_INLINE
+#define SIMD_INLINE static AOM_FORCE_INLINE
+#endif
+
+#endif  // AOM_AOM_DSP_AOM_SIMD_INLINE_H_
diff --git a/libs/libaom/src/aom_dsp/arm/avg_neon.c b/libs/libaom/src/aom_dsp/arm/avg_neon.c
new file mode 100644
index 000000000..af3769edf
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/arm/avg_neon.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
+  const uint8x16_t b = load_unaligned_u8q(a, a_stride);
+  const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b));
+#if defined(__aarch64__)
+  const uint32_t d = vaddlvq_u16(c);
+  return (d + 8) >> 4;
+#else
+  const uint32x2_t d = horizontal_add_u16x8(c);
+  return vget_lane_u32(vrshr_n_u32(d, 4), 0);
+#endif
+}
+
+unsigned int aom_avg_8x8_neon(const uint8_t *a, int a_stride) {
+  uint16x8_t sum;
+  uint32x2_t d;
+  uint8x8_t b = vld1_u8(a);
+  a += a_stride;
+  uint8x8_t c = vld1_u8(a);
+  a += a_stride;
+  sum = vaddl_u8(b, c);
+
+  for (int i = 0; i < 6; ++i) {
+    const uint8x8_t e = vld1_u8(a);
+    a += a_stride;
+    sum = vaddw_u8(sum, e);
+  }
+
+  d = horizontal_add_u16x8(sum);
+
+  return vget_lane_u32(vrshr_n_u32(d, 6), 0);
+}
+
+int aom_satd_lp_neon(const int16_t *coeff, int length) {
+  const int16x4_t zero = vdup_n_s16(0);
+  int32x4_t accum = vdupq_n_s32(0);
+
+  do {
+    const int16x8_t src0 = vld1q_s16(coeff);
+    const int16x8_t src8 = vld1q_s16(coeff + 8);
+    accum = vabal_s16(accum, vget_low_s16(src0), zero);
+    accum = vabal_s16(accum, vget_high_s16(src0), zero);
+    accum = vabal_s16(accum, vget_low_s16(src8), zero);
+    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  {
+    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int satd = vget_lane_s32(s1, 0);
+    return satd;
+  }
+}
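[Editorial note] aom_avg_4x4_neon and aom_avg_8x8_neon above compute the rounded mean of a pixel block; the NEON rounding shift vrshr_n_u32(d, n) is (d + (1 << (n - 1))) >> n. A scalar restatement of the 8x8 arithmetic, useful for cross-checking the NEON path (avg_8x8_c is a hypothetical name, not part of the patch):

#include <stdint.h>

/* Scalar reference for the rounded 8x8 average computed by aom_avg_8x8_neon:
 * sum the 64 pixels, then apply the rounding shift (sum + 32) >> 6. */
static unsigned int avg_8x8_c(const uint8_t *a, int a_stride) {
  unsigned int sum = 0;
  for (int r = 0; r < 8; ++r, a += a_stride) {
    for (int c = 0; c < 8; ++c) sum += a[c];
  }
  return (sum + 32) >> 6;
}

The blend_a64_mask_neon.c file that follows implements AV1's alpha-64 mask blend, conceptually dst = (mask * src0 + (64 - mask) * src1 + 32) >> 6, applied to the intermediate compound-prediction buffer, with the round_offset/round_bits steps undoing the convolve-stage scaling.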
diff --git a/libs/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c b/libs/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c
new file mode 100644
index 000000000..e7f08a5fd
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/arm/blend_a64_mask_neon.c
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
+                            const int16x8_t v_maxval, int16x8_t *res) {
+  int32x4_t im_res_low, im_res_high;
+  const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);
+
+  im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
+  im_res_low =
+      vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));
+
+  im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
+  im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
+                          vget_high_s16(src_1));
+
+  *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
+                      vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
+}
+
+static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
+                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+                             int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
+                             int16x8_t mask3, const int16x8_t v_maxval,
+                             const uint16x8_t vec_round_offset,
+                             const int16x8_t vec_round_bits) {
+  int16x8_t src0_0, src0_1, src0_2, src0_3;
+  int16x8_t src1_0, src1_1, src1_2, src1_3;
+  int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;
+
+  load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1,
+               &src0_2, &src0_3);
+  load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1,
+               &src1_2, &src1_3);
+
+  blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
+  blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
+  blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
+  blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);
+
+  uint16x8_t im_res1_0 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
+  uint16x8_t im_res1_1 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
+  uint16x8_t im_res1_2 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
+  uint16x8_t im_res1_3 =
+      vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);
+
+  im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
+  im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
+  im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
+  im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);
+
+  vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
+  vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
+  vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
+  vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
+}
+
+static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
+                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
+                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+                             int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
+                             int16x4_t mask3, const int16x8_t v_maxval,
+                             const uint16x8_t vec_round_offset,
+                             const int16x8_t vec_round_bits) {
+  int16x8_t src0_0, src0_1;
+  int16x8_t src1_0, src1_1;
+  uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0),
+             tu3 = vdupq_n_u64(0);
+  int16x8_t mask0_1, mask2_3;
+  int16x8_t res0, res1;
+
+  load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
+  load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
+
+  src0_0
= vreinterpretq_s16_u64(tu0); + src0_1 = vreinterpretq_s16_u64(tu1); + + src1_0 = vreinterpretq_s16_u64(tu2); + src1_1 = vreinterpretq_s16_u64(tu3); + + mask0_1 = vcombine_s16(mask0, mask1); + mask2_3 = vcombine_s16(mask2, mask3); + + blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0); + blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1); + + uint16x8_t im_res_0 = + vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset); + uint16x8_t im_res_1 = + vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset); + + src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits); + src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits); + + uint8x8_t res_0 = vqmovun_s16(src0_0); + uint8x8_t res_1 = vqmovun_s16(src0_1); + + vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0), + 0); + vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0), + 1); + vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1), + 0); + vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1), + 1); +} + +void aom_lowbd_blend_a64_d16_mask_neon( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + int i = 0; + const int bd = 8; + int w_tmp = w; + const uint8_t *mask_tmp = mask; + const CONV_BUF_TYPE *src0_tmp = src0; + const CONV_BUF_TYPE *src1_tmp = src1; + uint8_t *dst_tmp = dst; + + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + uint8x8_t s0, s1, s2, s3; + uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0), + tu3 = vdup_n_u32(0); + uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; + int16x8_t mask0, mask1, mask2, mask3; + int16x8_t mask4, mask5, mask6, mask7; + int32x4_t m0_32, m1_32, m2_32, m3_32; + int32x4_t m4_32, m5_32, m6_32, m7_32; + uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l; + uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l; + int16x4_t mask0_low, mask1_low, mask2_low, mask3_low; + const uint16x4_t vec_zero = vdup_n_u16(0); + const uint16_t offset = round_offset - (1 << (round_bits - 1)); + const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA); + const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits); + const uint16x8_t vec_offset = vdupq_n_u16(offset); + + if (subw == 0 && subh == 0) { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3); + + mask0 = vmovl_s8(vreinterpret_s8_u8(s0)); + mask1 = vmovl_s8(vreinterpret_s8_u8(s1)); + mask2 = vmovl_s8(vreinterpret_s8_u8(s2)); + mask3 = vmovl_s8(vreinterpret_s8_u8(s3)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + + w_tmp -= 8; + mask_tmp += 8; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (4 * mask_stride) - w; + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * 
src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1); + + mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); + mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1))); + + mask0_low = vget_low_s16(mask0); + mask1_low = vget_high_s16(mask0); + mask2_low = vget_low_s16(mask1); + mask3_low = vget_high_s16(mask1); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (4 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } while (i < h); + } + } else if (subw == 1 && subh == 1) { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + + mask0 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1))); + mask1 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3))); + mask2 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5))); + mask3 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7))); + + mask4 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t0), vget_high_u8(t1))); + mask5 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t2), vget_high_u8(t3))); + mask6 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t4), vget_high_u8(t5))); + mask7 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t6), vget_high_u8(t7))); + + m0_32 = vpaddlq_s16(mask0); + m1_32 = vpaddlq_s16(mask1); + m2_32 = vpaddlq_s16(mask2); + m3_32 = vpaddlq_s16(mask3); + + m4_32 = vpaddlq_s16(mask4); + m5_32 = vpaddlq_s16(mask5); + m6_32 = vpaddlq_s16(mask6); + m7_32 = vpaddlq_s16(mask7); + + mask0 = + vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2)); + mask1 = + vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2)); + mask2 = + vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2)); + mask3 = + vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + + w_tmp -= 8; + mask_tmp += 16; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (8 * mask_stride) - (2 * w); + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, + &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l); + + mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l)); + mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l)); + mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l)); + mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l)); + + m0_32 = vpaddlq_s16(mask0); + m1_32 = vpaddlq_s16(mask1); + m2_32 = vpaddlq_s16(mask2); + m3_32 = vpaddlq_s16(mask3); + + mask0_low = vqrshrn_n_s32(m0_32, 2); + mask1_low = vqrshrn_n_s32(m1_32, 2); + mask2_low = vqrshrn_n_s32(m2_32, 2); + mask3_low = vqrshrn_n_s32(m3_32, 2); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (8 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } 
while (i < h); + } + } else if (subw == 1 && subh == 0) { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3); + + mask0 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0)))); + mask1 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1)))); + mask2 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2)))); + mask3 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3)))); + + mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); + mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); + mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1)); + mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + w_tmp -= 8; + mask_tmp += 16; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (4 * mask_stride) - (2 * w); + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, + &mask3_l); + + mask0 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero)); + mask1 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero)); + mask2 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero)); + mask3 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero)); + + mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1))); + mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1))); + mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1))); + mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1))); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (4 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } while (i < h); + } + } else { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, + &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l); + + mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l)); + mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l)); + mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l)); + mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l)); + + mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); + mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); + mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1)); + mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + + w_tmp -= 8; + mask_tmp += 8; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (8 * mask_stride) - w; + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1); + load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2, + &tu3); + + s0 = vreinterpret_u8_u32(tu0); + s1 = vreinterpret_u8_u32(tu1); + s2 = vreinterpret_u8_u32(tu2); + s3 = vreinterpret_u8_u32(tu3); + + 
mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
+        mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
+
+        mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
+        mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
+
+        mask0_low = vget_low_s16(mask0);
+        mask1_low = vget_high_s16(mask0);
+        mask2_low = vget_low_s16(mask1);
+        mask3_low = vget_high_s16(mask1);
+
+        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
+                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
+                  v_maxval, vec_offset, vec_round_bits);
+
+        i += 4;
+        mask_tmp += (8 * mask_stride);
+        dst_tmp += (4 * dst_stride);
+        src0_tmp += (4 * src0_stride);
+        src1_tmp += (4 * src1_stride);
+      } while (i < h);
+    }
+  }
+}
diff --git a/libs/libaom/src/aom_dsp/arm/fwd_txfm_neon.c b/libs/libaom/src/aom_dsp/arm/fwd_txfm_neon.c
new file mode 100644
index 000000000..ce9352347
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/arm/fwd_txfm_neon.c
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+static void aom_fdct4x4_helper(const int16_t *input, int stride,
+                               int16x4_t *input_0, int16x4_t *input_1,
+                               int16x4_t *input_2, int16x4_t *input_3) {
+  *input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+  *input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+  *input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+  *input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+  // If the very first value != 0, then add 1.
+  if (input[0] != 0) {
+    const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+    *input_0 = vadd_s16(*input_0, one);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    const int16x8_t input_01 = vcombine_s16(*input_0, *input_1);
+    const int16x8_t input_32 = vcombine_s16(*input_3, *input_2);
+
+    // in_0 +/- in_3, in_1 +/- in_2
+    const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+    const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+    // step_0 +/- step_1, step_2 +/- step_3
+    const int16x4_t s_0 = vget_low_s16(s_01);
+    const int16x4_t s_1 = vget_high_s16(s_01);
+    const int16x4_t s_2 = vget_high_s16(s_32);
+    const int16x4_t s_3 = vget_low_s16(s_32);
+
+    // (s_0 +/- s_1) * cospi_16_64
+    // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
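+    // (Editorial note, not in the original file:) vaddl/vsubl below widen to
+    // 32 bits before the cospi_16_64 multiply so the 4-point DCT butterfly
+    // stays exact; out_0/out_2 come from (s_0 +/- s_1) * cospi_16_64, while
+    // out_1/out_3 rotate (s_2, s_3) by cospi_8_64/cospi_24_64, each product
+    // rounded back to 16 bits with vrshrn_n_s32(..., DCT_CONST_BITS).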
+ const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); + const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); + const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64); + const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64); + + // fdct_round_shift + int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); + int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); + + // s_3 * cospi_8_64 + s_2 * cospi_24_64 + // s_3 * cospi_24_64 - s_2 * cospi_8_64 + const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64); + const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64); + + const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64); + const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64); + + // fdct_round_shift + int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); + int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); + + transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); + + *input_0 = out_0; + *input_1 = out_1; + *input_2 = out_2; + *input_3 = out_3; + } +} + +void aom_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, + int stride) { + // input[M * stride] * 16 + int16x4_t input_0, input_1, input_2, input_3; + + aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3); + + // Not quite a rounding shift. Only add 1 despite shifting by 2. + const int16x8_t one = vdupq_n_s16(1); + int16x8_t out_01 = vcombine_s16(input_0, input_1); + int16x8_t out_23 = vcombine_s16(input_2, input_3); + out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); + out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); + store_s16q_to_tran_low(final_output + 0 * 8, out_01); + store_s16q_to_tran_low(final_output + 1 * 8, out_23); +} + +void aom_fdct4x4_lp_neon(const int16_t *input, int16_t *final_output, + int stride) { + // input[M * stride] * 16 + int16x4_t input_0, input_1, input_2, input_3; + + aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3); + + // Not quite a rounding shift. Only add 1 despite shifting by 2. 
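+  // (Editorial note, not in the original file:) the bias below yields
+  // (x + 1) >> 2 rather than the true rounding (x + 2) >> 2, presumably to
+  // keep the output bit-exact with the scalar C reference.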
+ const int16x8_t one = vdupq_n_s16(1); + int16x8_t out_01 = vcombine_s16(input_0, input_1); + int16x8_t out_23 = vcombine_s16(input_2, input_3); + out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); + out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); + vst1q_s16(final_output + 0 * 8, out_01); + vst1q_s16(final_output + 1 * 8, out_23); +} + +void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { + // stage 1 + int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + for (int i = 0; i < 2; ++i) { + int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; + const int16x8_t v_s0 = vaddq_s16(input_0, input_7); + const int16x8_t v_s1 = vaddq_s16(input_1, input_6); + const int16x8_t v_s2 = vaddq_s16(input_2, input_5); + const int16x8_t v_s3 = vaddq_s16(input_3, input_4); + const int16x8_t v_s4 = vsubq_s16(input_3, input_4); + const int16x8_t v_s5 = vsubq_s16(input_2, input_5); + const int16x8_t v_s6 = vsubq_s16(input_1, input_6); + const int16x8_t v_s7 = vsubq_s16(input_0, input_7); + // fdct4(step, step); + int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); + int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); + int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); + int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); + // fdct4(step, step); + int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out_6 = vcombine_s16(f, 
h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } + // transpose 8x8 + { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7)); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t 
r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + input_0 = r01_s16.val[0]; + input_1 = r01_s16.val[1]; + input_2 = r23_s16.val[0]; + input_3 = r23_s16.val[1]; + input_4 = r45_s16.val[0]; + input_5 = r45_s16.val[1]; + input_6 = r67_s16.val[0]; + input_7 = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } // for + { + // from aom_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); + const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); + const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); + const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); + const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); + const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); + const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); + const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); + input_0 = vhsubq_s16(input_0, sign_in0); + input_1 = vhsubq_s16(input_1, sign_in1); + input_2 = vhsubq_s16(input_2, sign_in2); + input_3 = vhsubq_s16(input_3, sign_in3); + input_4 = vhsubq_s16(input_4, sign_in4); + input_5 = vhsubq_s16(input_5, sign_in5); + input_6 = vhsubq_s16(input_6, sign_in6); + input_7 = vhsubq_s16(input_7, sign_in7); + // store results + vst1q_s16(&final_output[0 * 8], input_0); + vst1q_s16(&final_output[1 * 8], input_1); + vst1q_s16(&final_output[2 * 8], input_2); + vst1q_s16(&final_output[3 * 8], input_3); + vst1q_s16(&final_output[4 * 8], input_4); + vst1q_s16(&final_output[5 * 8], input_5); + vst1q_s16(&final_output[6 * 8], input_6); + vst1q_s16(&final_output[7 * 8], input_7); + } +} + +void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + { + const int32x4_t a = vpaddlq_s16(sum); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); + output[1] = 0; + } +} diff --git a/libs/libaom/src/aom_dsp/arm/hadamard_neon.c b/libs/libaom/src/aom_dsp/arm/hadamard_neon.c new file mode 100644 index 000000000..929792ab3 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/hadamard_neon.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/common/arm/mem_neon.h" +#include "av1/common/arm/transpose_neon.h" + +static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + const int16x8_t b0 = vaddq_s16(*a0, *a1); + const int16x8_t b1 = vsubq_s16(*a0, *a1); + const int16x8_t b2 = vaddq_s16(*a2, *a3); + const int16x8_t b3 = vsubq_s16(*a2, *a3); + const int16x8_t b4 = vaddq_s16(*a4, *a5); + const int16x8_t b5 = vsubq_s16(*a4, *a5); + const int16x8_t b6 = vaddq_s16(*a6, *a7); + const int16x8_t b7 = vsubq_s16(*a6, *a7); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + const int16x8_t c4 = vaddq_s16(b4, b6); + const int16x8_t c5 = vaddq_s16(b5, b7); + const int16x8_t c6 = vsubq_s16(b4, b6); + const int16x8_t c7 = vsubq_s16(b5, b7); + + *a0 = vaddq_s16(c0, c4); + *a1 = vsubq_s16(c2, c6); + *a2 = vsubq_s16(c0, c4); + *a3 = vaddq_s16(c2, c6); + *a4 = vaddq_s16(c3, c7); + *a5 = vsubq_s16(c3, c7); + *a6 = vsubq_s16(c1, c5); + *a7 = vaddq_s16(c1, c5); +} + +void aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x8_t a0 = vld1q_s16(src_diff); + int16x8_t a1 = vld1q_s16(src_diff + src_stride); + int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + // Skip the second transpose because it is not required. + + store_s16q_to_tran_low(coeff + 0, a0); + store_s16q_to_tran_low(coeff + 8, a1); + store_s16q_to_tran_low(coeff + 16, a2); + store_s16q_to_tran_low(coeff + 24, a3); + store_s16q_to_tran_low(coeff + 32, a4); + store_s16q_to_tran_low(coeff + 40, a5); + store_s16q_to_tran_low(coeff + 48, a6); + store_s16q_to_tran_low(coeff + 56, a7); +} + +void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16x8_t a0 = vld1q_s16(src_diff); + int16x8_t a1 = vld1q_s16(src_diff + src_stride); + int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + // Skip the second transpose because it is not required. + + vst1q_s16(coeff + 0, a0); + vst1q_s16(coeff + 8, a1); + vst1q_s16(coeff + 16, a2); + vst1q_s16(coeff + 24, a3); + vst1q_s16(coeff + 32, a4); + vst1q_s16(coeff + 40, a5); + vst1q_s16(coeff + 48, a6); + vst1q_s16(coeff + 56, a7); +} + +void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first.
*/ + aom_hadamard_lp_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, + coeff + 0); + /* Top right. */ + aom_hadamard_lp_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, + coeff + 64); + /* Bottom left. */ + aom_hadamard_lp_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, + coeff + 128); + /* Bottom right. */ + aom_hadamard_lp_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, + coeff + 192); + + for (int i = 0; i < 64; i += 8) { + const int16x8_t a0 = vld1q_s16(coeff + 0); + const int16x8_t a1 = vld1q_s16(coeff + 64); + const int16x8_t a2 = vld1q_s16(coeff + 128); + const int16x8_t a3 = vld1q_s16(coeff + 192); + + const int16x8_t b0 = vhaddq_s16(a0, a1); + const int16x8_t b1 = vhsubq_s16(a0, a1); + const int16x8_t b2 = vhaddq_s16(a2, a3); + const int16x8_t b3 = vhsubq_s16(a2, a3); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + + vst1q_s16(coeff + 0, c0); + vst1q_s16(coeff + 64, c1); + vst1q_s16(coeff + 128, c2); + vst1q_s16(coeff + 192, c3); + + coeff += 8; + } +} + +void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. */ + aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); + /* Bottom left. */ + aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); + /* Bottom right. */ + aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); + + for (int i = 0; i < 64; i += 8) { + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192); + + const int16x8_t b0 = vhaddq_s16(a0, a1); + const int16x8_t b1 = vhsubq_s16(a0, a1); + const int16x8_t b2 = vhaddq_s16(a2, a3); + const int16x8_t b3 = vhsubq_s16(a2, a3); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 64, c1); + store_s16q_to_tran_low(coeff + 128, c2); + store_s16q_to_tran_low(coeff + 192, c3); + + coeff += 8; + } +} diff --git a/libs/libaom/src/aom_dsp/arm/intrapred_neon.c b/libs/libaom/src/aom_dsp/arm/intrapred_neon.c new file mode 100644 index 000000000..c85b1e910 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/intrapred_neon.c @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +//------------------------------------------------------------------------------ +// DC 4x4 + +// 'do_above' and 'do_left' facilitate branch removal when inlined.
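+// The DC value is the rounded average of whichever neighbors are available:
+// with both edges, vrshrn_n_u16(sum, 3) computes (sum + 4) >> 3, a rounded
+// divide by the 8 contributing samples (e.g. above = {4,4,4,4} and
+// left = {8,8,8,8} give sum = 48 and dc0 = (48 + 4) >> 3 = 6); with a single
+// edge the shift drops to 2, and with neither edge the predictor falls back
+// to the 8-bit mid-grey 0x80. vld1_u8 reads 8 bytes although only 4
+// neighbors are needed; the surplus lanes are harmless because only lane 0
+// of dc0 is broadcast into the block.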
+static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_top = vcombine_u16(p1, p1); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_left = vcombine_u16(p1, p1); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 3); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 2); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 2); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 4; ++i) { + vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); + } + } +} + +void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_4x4(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_4x4(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_4x4(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_4x4(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
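+// Same scheme as dc_4x4, but the 8 samples per edge need one extra
+// vpadd_u16 stage to fold the partial sums, and the rounding shift grows to
+// 4 (a rounded divide by 16) when both edges contribute, or 3 for one edge.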
+static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_top = vcombine_u16(p2, p2); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_left = vcombine_u16(p2, p2); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 4); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 3); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 3); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 8; ++i) { + vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc)); + } + } +} + +void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_8x8(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_8x8(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_8x8(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_8x8(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
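+// For 16-wide edges the row is loaded as a uint8x16_t and reduced with
+// vpaddlq_u8 plus a cross-half vadd_u16 before the pairwise cascade; the
+// rounding shift is 5 (divide by 32) with both edges, 4 with one.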
+static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A = vld1q_u8(above); // top row + const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_top = vcombine_u16(p3, p3); + } + + if (do_left) { + const uint8x16_t L = vld1q_u8(left); // left row + const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_left = vcombine_u16(p3, p3); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 5); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 4); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 4); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 16; ++i) { + vst1q_u8(dst + i * stride, dc); + } + } +} + +void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_16x16(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_16x16(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_16x16(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_16x16(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. 
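+// The 32-wide case sums two 16-byte loads per edge (vaddq_u16 of their
+// vpaddlq_u8 results) before the same reduction; the rounding shift reaches
+// 6 (divide by 64) when both edges are present, 5 with a single edge.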
+static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A0 = vld1q_u8(above); // top row + const uint8x16_t A1 = vld1q_u8(above + 16); + const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top + const uint16x8_t p1 = vpaddlq_u8(A1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_top = vcombine_u16(p5, p5); + } + + if (do_left) { + const uint8x16_t L0 = vld1q_u8(left); // left row + const uint8x16_t L1 = vld1q_u8(left + 16); + const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left + const uint16x8_t p1 = vpaddlq_u8(L1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_left = vcombine_u16(p5, p5); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 6); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 5); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 5); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 32; ++i) { + vst1q_u8(dst + i * stride, dc); + vst1q_u8(dst + i * stride + 16, dc); + } + } +} + +void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_32x32(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_32x32(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_32x32(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_32x32(dst, stride, NULL, NULL, 0, 0); +} + +// ----------------------------------------------------------------------------- + +void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XABCD_u8 = vld1_u8(above - 1); + const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); + const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); + const uint32x2_t zero = vdup_n_u32(0); + const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); + const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); + const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); + const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); + const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); + const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); + const uint8_t D = vget_lane_u8(XABCD_u8, 4); + const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); + const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); + const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); + const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); + const uint32x2_t r3 = vreinterpret_u32_u8(avg2); + const uint32x2_t r2 = 
vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); + const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); + vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); + vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); + vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); +} + +void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint32x2_t d0u32 = vdup_n_u32(0); + (void)left; + + d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); + for (i = 0; i < 4; i++, dst += stride) + vst1_lane_u32((uint32_t *)dst, d0u32, 0); +} + +void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x8_t d0u8 = vdup_n_u8(0); + (void)left; + + d0u8 = vld1_u8(above); + for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); +} + +void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); +} + +void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + q1u8 = vld1q_u8(above + 16); + for (i = 0; i < 32; i++, dst += stride) { + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + } +} + +void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d1u32 = vdup_n_u32(0); + (void)above; + + d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); +} + +void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint64x1_t d1u64 = vdup_n_u64(0); + (void)above; + + d1u64 = vld1_u64((const uint64_t *)left); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); + vst1_u8(dst, d0u8); +} + +void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j; + uint8x8_t d2u8 = vdup_n_u8(0); + 
uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + dst += stride; + } +} + +void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + for (k = 0; k < 2; k++, left += 16) { + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + } + } +} + +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + const uint16_t *above, + const uint16_t *left) { + assert(bw >= 4); + assert(IS_POWER_OF_TWO(bw)); + int expected_dc, sum = 0; + const int count = bw * 2; + uint32x4_t sum_q = vdupq_n_u32(0); + uint32x2_t sum_d; + uint16_t *dst_1; + if (bw >= 8) { + for (int i = 0; i < bw; i += 8) { + sum_q = vpadalq_u16(sum_q, vld1q_u16(above)); + sum_q = vpadalq_u16(sum_q, vld1q_u16(left)); + above += 8; + left += 8; + } + sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); + sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); + expected_dc = (sum + (count >> 1)) / count; + const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc); + for (int r = 0; r < bw; r++) { + dst_1 = dst; + for (int i = 0; i < bw; i += 8) { + vst1q_u16(dst_1, dc); + dst_1 += 8; + } + dst += stride; + } + } else { // 4x4 + sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left)); + sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); + sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); + expected_dc = (sum + (count >> 1)) / count; + const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc); + for (int r = 0; r < bw; r++) { + vst1_u16(dst, dc); + dst += stride; + } + } +} + +#define intra_pred_highbd_sized_neon(type, width) \ + void aom_highbd_##type##_predictor_##width##x##width##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + highbd_##type##_predictor(dst, stride, width, above, 
left); \ + } + +#define intra_pred_square(type) \ + intra_pred_highbd_sized_neon(type, 4); \ + intra_pred_highbd_sized_neon(type, 8); \ + intra_pred_highbd_sized_neon(type, 16); \ + intra_pred_highbd_sized_neon(type, 32); \ + intra_pred_highbd_sized_neon(type, 64); + +intra_pred_square(dc); +#undef intra_pred_square diff --git a/libs/libaom/src/aom_dsp/arm/loopfilter_neon.c b/libs/libaom/src/aom_dsp/arm/loopfilter_neon.c new file mode 100644 index 000000000..aafac8966 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/loopfilter_neon.c @@ -0,0 +1,927 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "av1/common/arm/mem_neon.h" +#include "av1/common/arm/transpose_neon.h" + +static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0, const uint8_t blimit, + const uint8_t limit) { + // Calculate mask values for four samples + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + uint8x8_t mask_8x8, temp_8x8; + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); + + mask_8x8 = vabd_u8(p3q3, p2q2); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1)); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + +static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0, + const uint8_t blimit, const uint8_t limit) { + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + const uint16x4_t blimit_16x4 = vdup_n_u16(blimit); + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + uint8x8_t mask_8x8, temp_8x8; + + mask_8x8 = vabd_u8(p1q1, p0q0); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 =
vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + +static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2, + uint8x8_t p1q1, uint8x8_t p0q0) { + const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 + uint8x8_t flat_8x8, temp_8x8; + + flat_8x8 = vabd_u8(p1q1, p0q0); + flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); + flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0)); + flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); + flat_8x8 = vand_u8(flat_8x8, temp_8x8); + + return flat_8x8; +} + +static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0) { + const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 + uint8x8_t flat_8x8, temp_8x8; + + flat_8x8 = vabd_u8(p1q1, p0q0); + flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); + flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); + flat_8x8 = vand_u8(flat_8x8, temp_8x8); + + return flat_8x8; +} + +static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0, const uint8_t blimit, + const uint8_t limit) { + // Calculate mask3 values for four samples + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + uint8x8_t mask_8x8, temp_8x8; + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); + + mask_8x8 = vabd_u8(p2q2, p1q1); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + +static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4, + uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4, + out_f14_pq5; + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8, flat2_8x8; + uint8x8_t q0p0, q1p1, q2p2; + + // Calculate filter masks + mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = 
veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + hev_8x8 = vmvn_s8(hev_8x8); + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + // reverse p and q + q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); + { + // filter 8 + uint16x8_t out_pq0, out_pq1, out_pq2; + out = vaddl_u8(*p3q3, *p2q2); + out = vaddw_u8(out, *p1q1); + out = vaddw_u8(out, *p0q0); + + out = vaddw_u8(out, q0p0); + out_pq1 = vaddw_u8(out, *p3q3); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + out_pq2 = vaddw_u8(out_pq2, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p1q1); + out_pq1 = vaddw_u8(out_pq1, q1p1); + + out_pq0 = vaddw_u8(out, *p0q0); + out_pq0 = vaddw_u8(out_pq0, q1p1); + out_pq0 = vaddw_u8(out_pq0, q2p2); + + out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); + out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); + } + { + // filter 14 + uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5; + uint16x8_t p6q6_2, p6q6_temp, qp_sum; + uint8x8_t qp_rev; + + out = vaddw_u8(out, *p4q4); + out = vaddw_u8(out, *p5q5); + out = vaddw_u8(out, *p6q6); + + out_pq5 = vaddw_u8(out, *p4q4); + out_pq4 = vaddw_u8(out_pq5, *p3q3); + out_pq3 = vaddw_u8(out_pq4, *p2q2); + + out_pq5 = vaddw_u8(out_pq5, *p5q5); + out_pq4 = vaddw_u8(out_pq4, *p5q5); + + out_pq0 = vaddw_u8(out, *p1q1); + out_pq1 = vaddw_u8(out_pq0, *p2q2); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + + out_pq0 = vaddw_u8(out_pq0, *p0q0); + out_pq1 = vaddw_u8(out_pq1, *p0q0); + + out_pq1 = vaddw_u8(out_pq1, *p6q6); + p6q6_2 = vaddl_u8(*p6q6, *p6q6); + out_pq2 = vaddq_u16(out_pq2, p6q6_2); + p6q6_temp = vaddw_u8(p6q6_2, *p6q6); + out_pq3 = vaddq_u16(out_pq3, p6q6_temp); + p6q6_temp = vaddw_u8(p6q6_temp, *p6q6); + out_pq4 = vaddq_u16(out_pq4, p6q6_temp); + p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2); + out_pq5 = vaddq_u16(out_pq5, p6q6_temp); + + out_pq4 = vaddw_u8(out_pq4, q1p1); + 
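+ // out_pq0..out_pq5 accumulate the tap sums for output rows p0..p5 (and the
+ // mirrored q rows); the vrev64-swapped registers (q0p0, q1p1, q2p2, qp_rev)
+ // feed q-side taps into the p-side lanes of the packed p|q vectors before
+ // vrshrn_n_u16(out_pqN, 4) applies the rounded divide by 16.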
+ qp_sum = vaddl_u8(q2p2, q1p1); + out_pq3 = vaddq_u16(out_pq3, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq2 = vaddq_u16(out_pq2, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq1 = vaddq_u16(out_pq1, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq0 = vaddq_u16(out_pq0, qp_sum); + + out_pq0 = vaddw_u8(out_pq0, q0p0); + + out_f14_pq0 = vrshrn_n_u16(out_pq0, 4); + out_f14_pq1 = vrshrn_n_u16(out_pq1, 4); + out_f14_pq2 = vrshrn_n_u16(out_pq2, 4); + out_f14_pq3 = vrshrn_n_u16(out_pq3, 4); + out_f14_pq4 = vrshrn_n_u16(out_pq4, 4); + out_f14_pq5 = vrshrn_n_u16(out_pq5, 4); + } + { + uint8x8_t filter4_cond, filter8_cond, filter14_cond; + filter8_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter8_cond); + filter14_cond = vand_u8(filter8_cond, flat2_8x8); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + + // filter14 outputs + *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0); + *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1); + *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2); + *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3); + *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4); + *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5); + } +} + +static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8; + + // Calculate filter masks + mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + 
filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + hev_8x8 = vmvn_s8(hev_8x8); + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + { + // filter 8 + uint16x8_t out_pq0, out_pq1, out_pq2; + uint8x8_t q0p0, q1p1, q2p2; + + out = vaddl_u8(*p3q3, *p2q2); + out = vaddw_u8(out, *p1q1); + out = vaddw_u8(out, *p0q0); + + // reverse p and q + q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); + + out = vaddw_u8(out, q0p0); + out_pq1 = vaddw_u8(out, *p3q3); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + out_pq2 = vaddw_u8(out_pq2, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p1q1); + out_pq1 = vaddw_u8(out_pq1, q1p1); + + out_pq0 = vaddw_u8(out, *p0q0); + out_pq0 = vaddw_u8(out_pq0, q1p1); + out_pq0 = vaddw_u8(out_pq0, q2p2); + + out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); + out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); + } + { + uint8x8_t filter4_cond, filter8_cond; + filter8_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter8_cond); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + } +} + +static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, + const uint8_t blimit, const uint8_t limit, + const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f6_pq0, out_f6_pq1; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8; + + // Calculate filter masks + mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = 
vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vbic_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + { + // filter 6 + uint16x8_t out_pq0, out_pq1; + uint8x8_t pq_rev; + + out = vaddl_u8(*p0q0, *p1q1); + out = vaddq_u16(out, out); + out = vaddw_u8(out, *p2q2); + + pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + out = vaddw_u8(out, pq_rev); + + out_pq0 = vaddw_u8(out, pq_rev); + pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + out_pq0 = vaddw_u8(out_pq0, pq_rev); + + out_pq1 = vaddw_u8(out, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p2q2); + + out_f6_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f6_pq1 = vrshrn_n_u16(out_pq1, 3); + } + { + uint8x8_t filter4_cond, filter6_cond; + filter6_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter6_cond); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter6 outputs + *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0); + *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1); + } +} + +static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t mask_8x8, temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + // Calculate filter mask + mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 
= vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vbic_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); +} + +void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x16_t row0, row1, row2, row3; + uint8x8_t pxp3, p6p2, p5p1, p4p0; + uint8x8_t q0q4, q1q5, q2q6, q3qy; + uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3; + uint32x2_t pq_rev; + uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6; + + // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + load_u8_8x16(src - 8, stride, &row0, &row1, &row2, &row3); + + pxp3 = vget_low_u8(row0); + p6p2 = vget_low_u8(row1); + p5p1 = vget_low_u8(row2); + p4p0 = vget_low_u8(row3); + transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); + + q0q4 = vget_high_u8(row0); + q1q5 = vget_high_u8(row1); + q2q6 = vget_high_u8(row2); + q3qy = vget_high_u8(row3); + transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy)); + pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5)); + p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4)); + p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6)); + p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev); + + p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); + p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); + p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); + p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]); + p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); + p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); + p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); + + lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, + *thresh); + + pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3)); + p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1)); + p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0)); + p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2)); + + pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]); + p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]); + p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]); + p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]); + + q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); + q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); + q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); + q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]); + transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); + + pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]); + p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); + p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); + 
p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); + transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); + + row0 = vcombine_u8(pxp3, q0q4); + row1 = vcombine_u8(p6p2, q1q5); + row2 = vcombine_u8(p5p1, q2q6); + row3 = vcombine_u8(p4p0, q3qy); + + store_u8_8x16(src - 8, stride, row0, row1, row2, row3); +} + +void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p2q2_p1q1, p3q3_p0q0; + uint32x2_t pq_rev; + uint8x8_t p3q0, p2q1, p1q2, p0q3; + uint8x8_t p0q0, p1q1, p2q2, p3q3; + + // row0: p3 p2 p1 p0 | q0 q1 q2 q3 + // row1: p3 p2 p1 p0 | q0 q1 q2 q3 + // row2: p3 p2 p1 p0 | q0 q1 q2 q3 + // row3: p3 p2 p1 p0 | q0 q1 q2 q3 + load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3); + + transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3)); + p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); + + p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); + p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); + + lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); + p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); + + p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); + p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); + transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); + + store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3); +} + +void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p2q2_p1q1, pxqy_p0q0; + uint32x2_t pq_rev; + uint8x8_t pxq0, p2q1, p1q2, p0qy; + uint8x8_t p0q0, p1q1, p2q2, pxqy; + + // row0: px p2 p1 p0 | q0 q1 q2 qy + // row1: px p2 p1 p0 | q0 q1 q2 qy + // row2: px p2 p1 p0 | q0 q1 q2 qy + // row3: px p2 p1 p0 | q0 q1 q2 qy + load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy); + + transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy)); + pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); + + p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); + p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]); + + lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); + pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); + + p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); + p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]); + transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); + + store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy); +} + +void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, 
const uint8_t *thresh) { + uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0; + uint32x2_t pq_rev; + uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), UNINITIALIZED_IS_SAFE(q0q1); + uint8x8_t p0q0, p1q1; + + // row0: p1 p0 | q0 q1 + // row1: p1 p0 | q0 q1 + // row2: p1 p0 | q0 q1 + // row3: p1 p0 | q0 q1 + load_unaligned_u8_4x4(src - 2, stride, (uint32x2_t *)&p1p0, + (uint32x2_t *)&q0q1); + + transpose_u8_4x4(&p1p0, &q0q1); + + p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1)); + + pq_rev = vrev64_u32(p1q0_p0q1.val[1]); + p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev); + + p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]); + p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]); + + lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); + + p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0)); + + p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]); + q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1])); + + transpose_u8_4x4(&p1p0, &q0q1); + + store_unaligned_u8_4x1(src - 2, p1p0, 0); + store_unaligned_u8_4x1((src - 2) + 1 * stride, q0q1, 0); + store_unaligned_u8_4x1((src - 2) + 2 * stride, p1p0, 1); + store_unaligned_u8_4x1((src - 2) + 3 * stride, q0q1, 1); +} + +void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, UNINITIALIZED_IS_SAFE(p6q6); + + load_u8_4x1(src - 7 * stride, &p6q6, 0); + load_u8_4x1(src - 6 * stride, &p5q5, 0); + load_u8_4x1(src - 5 * stride, &p4q4, 0); + load_u8_4x1(src - 4 * stride, &p3q3, 0); + load_u8_4x1(src - 3 * stride, &p2q2, 0); + load_u8_4x1(src - 2 * stride, &p1q1, 0); + load_u8_4x1(src - 1 * stride, &p0q0, 0); + load_u8_4x1(src + 0 * stride, &p0q0, 1); + load_u8_4x1(src + 1 * stride, &p1q1, 1); + load_u8_4x1(src + 2 * stride, &p2q2, 1); + load_u8_4x1(src + 3 * stride, &p3q3, 1); + load_u8_4x1(src + 4 * stride, &p4q4, 1); + load_u8_4x1(src + 5 * stride, &p5q5, 1); + load_u8_4x1(src + 6 * stride, &p6q6, 1); + + lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, + *thresh); + + store_u8_4x1(src - 6 * stride, p5q5, 0); + store_u8_4x1(src - 5 * stride, p4q4, 0); + store_u8_4x1(src - 4 * stride, p3q3, 0); + store_u8_4x1(src - 3 * stride, p2q2, 0); + store_u8_4x1(src - 2 * stride, p1q1, 0); + store_u8_4x1(src - 1 * stride, p0q0, 0); + store_u8_4x1(src + 0 * stride, p0q0, 1); + store_u8_4x1(src + 1 * stride, p1q1, 1); + store_u8_4x1(src + 2 * stride, p2q2, 1); + store_u8_4x1(src + 3 * stride, p3q3, 1); + store_u8_4x1(src + 4 * stride, p4q4, 1); + store_u8_4x1(src + 5 * stride, p5q5, 1); +} + +void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2, p3q3; + + p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride))); + p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); + p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), + vreinterpret_u32_u8(p0q0), 1)); + p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), + vreinterpret_u32_u8(p1q1), 1)); + p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), + vreinterpret_u32_u8(p2q2), 1)); + p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride), + vreinterpret_u32_u8(p3q3), 1)); + + lpf_8_neon(&p3q3, 
&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0); + vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); + vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); + vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); + vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); + vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); + vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); + vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1); +} + +void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2; + + p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); + p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), + vreinterpret_u32_u8(p0q0), 1)); + p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), + vreinterpret_u32_u8(p1q1), 1)); + p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), + vreinterpret_u32_u8(p2q2), 1)); + + lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); + vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); + vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); + vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); + vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); + vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); +} + +void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, UNINITIALIZED_IS_SAFE(p1q1); + + load_u8_4x1(src - 2 * stride, &p1q1, 0); + load_u8_4x1(src - 1 * stride, &p0q0, 0); + load_u8_4x1(src + 0 * stride, &p0q0, 1); + load_u8_4x1(src + 1 * stride, &p1q1, 1); + + lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); + + store_u8_4x1(src - 2 * stride, p1q1, 0); + store_u8_4x1(src - 1 * stride, p0q0, 0); + store_u8_4x1(src + 0 * stride, p0q0, 1); + store_u8_4x1(src + 1 * stride, p1q1, 1); +} diff --git a/libs/libaom/src/aom_dsp/arm/sad4d_neon.c b/libs/libaom/src/aom_dsp/arm/sad4d_neon.c new file mode 100644 index 000000000..606950ab2 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/sad4d_neon.c @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, +// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo +// and vec_sum_ref_hi. +static void sad_neon_64(const uint8x16_t vec_src_00, + const uint8x16_t vec_src_16, + const uint8x16_t vec_src_32, + const uint8x16_t vec_src_48, const uint8_t *ref, + uint16x8_t *vec_sum_ref_lo, + uint16x8_t *vec_sum_ref_hi) { + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); +} + +// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, +// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
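+// For reference, each vabal_u8() call in these helpers is a widening +// absolute-difference accumulation; an illustrative scalar equivalent (a +// sketch, not part of the build) is: +//   for (int k = 0; k < 8; ++k) sum[k] += abs(src[k] - ref[k]);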
+static void sad_neon_32(const uint8x16_t vec_src_00, + const uint8x16_t vec_src_16, const uint8_t *ref, + uint16x8_t *vec_sum_ref_lo, + uint16x8_t *vec_sum_ref_hi) { + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); +} + +void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const uint8x16_t vec_src_48 = vld1q_u8(src + 48); + + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, + &vec_sum_ref0_lo, &vec_sum_ref0_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, + &vec_sum_ref1_lo, &vec_sum_ref1_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, + &vec_sum_ref2_lo, &vec_sum_ref2_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, + &vec_sum_ref3_lo, &vec_sum_ref3_hi); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} + +void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + + sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, + &vec_sum_ref0_hi); + sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, + &vec_sum_ref1_hi); + sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, + &vec_sum_ref2_hi); + sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, + &vec_sum_ref3_hi); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 
+= ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} + +void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref0 = vld1q_u8(ref0); + const uint8x16_t vec_ref1 = vld1q_u8(ref1); + const uint8x16_t vec_ref2 = vld1q_u8(ref2); + const uint8x16_t vec_ref3 = vld1q_u8(ref3); + + vec_sum_ref0_lo = + vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); + vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref0)); + vec_sum_ref1_lo = + vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); + vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref1)); + vec_sum_ref2_lo = + vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); + vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref2)); + vec_sum_ref3_lo = + vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); + vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref3)); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} diff --git a/libs/libaom/src/aom_dsp/arm/sad_neon.c b/libs/libaom/src/aom_dsp/arm/sad_neon.c new file mode 100644 index 000000000..a39de91d6 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/sad_neon.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 15; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x2_t d1; + uint64x1_t d3; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 3; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + d1 = vpaddl_u16(vget_low_u16(q12)); + d3 = vpaddl_u32(d1); + + return vget_lane_u32(vreinterpret_u32_u64(d3), 0); +} + +unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x16_t q0, q4; + uint16x8_t q12, q13; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + + for (i = 0; i < 7; i++) { + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } + + q12 = vaddq_u16(q12, q13); + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} +static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { + const uint32x4_t a = vpaddlq_u16(vec_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const uint8x16_t vec_src_48 = vld1q_u8(src
+ 48); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); + } + return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); +} + +unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + } + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref = vld1q_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum_lo = + vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref)); + vec_accum_hi = + vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); + } + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum = vdupq_n_u16(0); + + for (i = 0; i < 8; ++i) { + const uint8x8_t vec_src = vld1_u8(src); + const uint8x8_t vec_ref = vld1_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); + } + return horizontal_add_16x8(vec_accum); +} diff --git a/libs/libaom/src/aom_dsp/arm/sse_neon.c b/libs/libaom/src/aom_dsp/arm/sse_neon.c new file mode 100644 index 000000000..06b81cc3d --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/sse_neon.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +static INLINE uint32_t sse_W16x1_neon(uint8x16_t q2, uint8x16_t q3) { + const uint16_t sse1 = 0; + const uint16x8_t q1 = vld1q_dup_u16(&sse1); + + uint32_t sse; + + uint8x16_t q4 = vabdq_u8(q2, q3); // diff = abs(a[x] - b[x]) + uint8x8_t d0 = vget_low_u8(q4); + uint8x8_t d1 = vget_high_u8(q4); + + uint16x8_t q6 = vmlal_u8(q1, d0, d0); + uint16x8_t q7 = vmlal_u8(q1, d1, d1); + + uint32x4_t q8 = vaddl_u16(vget_low_u16(q6), vget_high_u16(q6)); + uint32x4_t q9 = vaddl_u16(vget_low_u16(q7), vget_high_u16(q7)); + + uint32x2_t d4 = vadd_u32(vget_low_u32(q8), vget_high_u32(q8)); + uint32x2_t d5 = vadd_u32(vget_low_u32(q9), vget_high_u32(q9)); + + uint32x2_t d6 = vadd_u32(d4, d5); + + sse = vget_lane_u32(d6, 0); + sse += vget_lane_u32(d6, 1); + + return sse; +} + +int64_t aom_sse_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + const uint8x16_t q0 = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + }; + int addinc, x, y; + uint8x8_t d0, d1, d2, d3; + uint8_t dx; + uint8x16_t q2, q3, q4, q5; + uint32_t sse = 0; + uint8x8x2_t tmp, tmp2; + + switch (width) { + case 4: + for (y = 0; y < height; y += 4) { + d0 = vld1_u8(a); // load 4 data + a += a_stride; + d1 = vld1_u8(a); + a += a_stride; + d2 = vld1_u8(a); + a += a_stride; + d3 = vld1_u8(a); + a += a_stride; + tmp = vzip_u8(d0, d1); + tmp2 = vzip_u8(d2, d3); + q2 = vcombine_u8(tmp.val[0], tmp2.val[0]); // make a 16 data vector + + d0 = vld1_u8(b); + b += b_stride; + d1 = vld1_u8(b); + b += b_stride; + d2 = vld1_u8(b); + b += b_stride; + d3 = vld1_u8(b); + b += b_stride; + tmp = vzip_u8(d0, d1); + tmp2 = vzip_u8(d2, d3); + q3 = vcombine_u8(tmp.val[0], tmp2.val[0]); + + sse += sse_W16x1_neon(q2, q3); + } + break; + case 8: + for (y = 0; y < height; y += 2) { + d0 = vld1_u8(a); // load 8 data + d1 = vld1_u8(a + a_stride); + q2 = vcombine_u8(d0, d1); // make a 16 data vector + + d0 = vld1_u8(b); + d1 = vld1_u8(b + b_stride); + q3 = vcombine_u8(d0, d1); + + sse += sse_W16x1_neon(q2, q3); + + a += 2 * a_stride; + b += 2 * b_stride; + } + break; + case 16: + for (y = 0; y < height; y++) { + q2 = vld1q_u8(a); + q3 = vld1q_u8(b); + + sse += sse_W16x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 32: + for (y = 0; y < height; y++) { + q2 = vld1q_u8(a); + q3 = vld1q_u8(b); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 16); + q3 = vld1q_u8(b + 16); + + sse += sse_W16x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 64: + for (y = 0; y < height; y++) { + q2 = vld1q_u8(a); + q3 = vld1q_u8(b); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 16); + q3 = vld1q_u8(b + 16); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 32); + q3 = vld1q_u8(b + 32); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 48); + q3 = vld1q_u8(b + 48); + + sse += sse_W16x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 128: + for (y = 0; y < height; y++) { + q2 = vld1q_u8(a); + q3 = vld1q_u8(b); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 16); + q3 = vld1q_u8(b + 16); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 32); + q3 = vld1q_u8(b + 32); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 48); +
q3 = vld1q_u8(b + 48); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 64); + q3 = vld1q_u8(b + 64); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 80); + q3 = vld1q_u8(b + 80); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 96); + q3 = vld1q_u8(b + 96); + + sse += sse_W16x1_neon(q2, q3); + + q2 = vld1q_u8(a + 112); + q3 = vld1q_u8(b + 112); + + sse += sse_W16x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + default: + for (y = 0; y < height; y++) { + x = width; + while (x > 0) { + addinc = width - x; + q2 = vld1q_u8(a + addinc); + q3 = vld1q_u8(b + addinc); + if (x < 16) { + dx = x; + q4 = vld1q_dup_u8(&dx); + q5 = vcltq_u8(q0, q4); + q2 = vandq_u8(q2, q5); + q3 = vandq_u8(q3, q5); + } + sse += sse_W16x1_neon(q2, q3); + x -= 16; + } + a += a_stride; + b += b_stride; + } + } + return (int64_t)sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE uint32_t highbd_sse_W8x1_neon(uint16x8_t q2, uint16x8_t q3) { + uint32_t sse; + const uint32_t sse1 = 0; + const uint32x4_t q1 = vld1q_dup_u32(&sse1); + + uint16x8_t q4 = vabdq_u16(q2, q3); // diff = abs(a[x] - b[x]) + uint16x4_t d0 = vget_low_u16(q4); + uint16x4_t d1 = vget_high_u16(q4); + + uint32x4_t q6 = vmlal_u16(q1, d0, d0); + uint32x4_t q7 = vmlal_u16(q1, d1, d1); + + uint32x2_t d4 = vadd_u32(vget_low_u32(q6), vget_high_u32(q6)); + uint32x2_t d5 = vadd_u32(vget_low_u32(q7), vget_high_u32(q7)); + + uint32x2_t d6 = vadd_u32(d4, d5); + + sse = vget_lane_u32(d6, 0); + sse += vget_lane_u32(d6, 1); + + return sse; +} + +int64_t aom_highbd_sse_neon(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + const uint16x8_t q0 = { 0, 1, 2, 3, 4, 5, 6, 7 }; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int x, y; + int addinc; + uint16x4_t d0, d1, d2, d3; + uint16_t dx; + uint16x8_t q2, q3, q4, q5; + + switch (width) { + case 4: + for (y = 0; y < height; y += 2) { + d0 = vld1_u16(a); // load 4 data + a += a_stride; + d1 = vld1_u16(a); + a += a_stride; + + d2 = vld1_u16(b); + b += b_stride; + d3 = vld1_u16(b); + b += b_stride; + q2 = vcombine_u16(d0, d1); // make a 8 data vector + q3 = vcombine_u16(d2, d3); + + sse += highbd_sse_W8x1_neon(q2, q3); + } + break; + case 8: + for (y = 0; y < height; y++) { + q2 = vld1q_u16(a); + q3 = vld1q_u16(b); + + sse += highbd_sse_W8x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 16: + for (y = 0; y < height; y++) { + q2 = vld1q_u16(a); + q3 = vld1q_u16(b); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 8); + q3 = vld1q_u16(b + 8); + + sse += highbd_sse_W8x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 32: + for (y = 0; y < height; y++) { + q2 = vld1q_u16(a); + q3 = vld1q_u16(b); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 8); + q3 = vld1q_u16(b + 8); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 16); + q3 = vld1q_u16(b + 16); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 24); + q3 = vld1q_u16(b + 24); + + sse += highbd_sse_W8x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 64: + for (y = 0; y < height; y++) { + q2 = vld1q_u16(a); + q3 = vld1q_u16(b); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 8); + q3 = vld1q_u16(b + 8); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 16); + q3 = vld1q_u16(b + 16); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 24); + q3 = vld1q_u16(b + 24); + + sse 
+= highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 32); + q3 = vld1q_u16(b + 32); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 40); + q3 = vld1q_u16(b + 40); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 48); + q3 = vld1q_u16(b + 48); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 56); + q3 = vld1q_u16(b + 56); + + sse += highbd_sse_W8x1_neon(q2, q3); + + a += a_stride; + b += b_stride; + } + break; + case 128: + for (y = 0; y < height; y++) { + q2 = vld1q_u16(a); + q3 = vld1q_u16(b); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 8); + q3 = vld1q_u16(b + 8); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 16); + q3 = vld1q_u16(b + 16); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 24); + q3 = vld1q_u16(b + 24); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 32); + q3 = vld1q_u16(b + 32); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 40); + q3 = vld1q_u16(b + 40); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 48); + q3 = vld1q_u16(b + 48); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 56); + q3 = vld1q_u16(b + 56); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 64); + q3 = vld1q_u16(b + 64); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 72); + q3 = vld1q_u16(b + 72); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 80); + q3 = vld1q_u16(b + 80); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 88); + q3 = vld1q_u16(b + 88); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 96); + q3 = vld1q_u16(b + 96); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 104); + q3 = vld1q_u16(b + 104); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 112); + q3 = vld1q_u16(b + 112); + + sse += highbd_sse_W8x1_neon(q2, q3); + + q2 = vld1q_u16(a + 120); + q3 = vld1q_u16(b + 120); + + sse += highbd_sse_W8x1_neon(q2, q3); + a += a_stride; + b += b_stride; + } + break; + default: + + for (y = 0; y < height; y++) { + x = width; + while (x > 0) { + addinc = width - x; + q2 = vld1q_u16(a + addinc); + q3 = vld1q_u16(b + addinc); + if (x < 8) { + dx = x; + q4 = vld1q_dup_u16(&dx); + q5 = vcltq_u16(q0, q4); + q2 = vandq_u16(q2, q5); + q3 = vandq_u16(q3, q5); + } + sse += highbd_sse_W8x1_neon(q2, q3); + x -= 8; + } + a += a_stride; + b += b_stride; + } + } + return (int64_t)sse; +} +#endif diff --git a/libs/libaom/src/aom_dsp/arm/subpel_variance_neon.c b/libs/libaom/src/aom_dsp/arm/subpel_variance_neon.c new file mode 100644 index 000000000..cf618eee7 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/subpel_variance_neon.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/variance.h" + +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + const uint8x8_t f0 = vmov_n_u8(filter[0]); + const uint8x8_t f1 = vmov_n_u8(filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); + const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); + const uint16x8_t a = vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(&output_ptr[0], out); + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + const uint8x8_t f0 = vmov_n_u8(filter[0]); + const uint8x8_t f1 = vmov_n_u8(filter[1]); + unsigned int i, j; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 16) { + const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); + const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); + const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); + const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); + const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); + const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); + const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); + const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); + vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); + } + // Next row...
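+ // For reference, each output pixel above is the two-tap bilinear value + // (src_0 * filter[0] + src_1 * filter[1] + (1 << (FILTER_BITS - 1))) >> + // FILTER_BITS: vmull_u8/vmlal_u8 form the products, and the rounding + // narrowing shift vrshrn_n_u16 applies the rounding and scaling.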
+ src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); + DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); + + var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, + bilinear_filters_2t[xoffset]); + var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, + bilinear_filters_2t[yoffset]); + return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse); +} + +unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); + DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, + bilinear_filters_2t[xoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, + bilinear_filters_2t[yoffset]); + return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse); +} + +unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); + DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, + bilinear_filters_2t[xoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, + bilinear_filters_2t[yoffset]); + return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse); +} + +unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); + DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, + bilinear_filters_2t[xoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, + bilinear_filters_2t[yoffset]); + return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse); +} diff --git a/libs/libaom/src/aom_dsp/arm/subtract_neon.c b/libs/libaom/src/aom_dsp/arm/subtract_neon.c new file mode 100644 index 000000000..28f5ace8e --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/subtract_neon.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <arm_neon.h> + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +void aom_subtract_block_neon(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + int r, c; + + if (cols > 16) { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; c += 32) { + const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); + const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); + const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); + const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); + const uint16x8_t v_diff_lo_00 = + vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); + const uint16x8_t v_diff_hi_00 = + vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); + const uint16x8_t v_diff_lo_16 = + vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); + const uint16x8_t v_diff_hi_16 = + vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 8) { + for (r = 0; r < rows; ++r) { + const uint8x16_t v_src = vld1q_u8(&src[0]); + const uint8x16_t v_pred = vld1q_u8(&pred[0]); + const uint16x8_t v_diff_lo = + vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); + const uint16x8_t v_diff_hi = + vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 4) { + for (r = 0; r < rows; ++r) { + const uint8x8_t v_src = vld1_u8(&src[0]); + const uint8x8_t v_pred = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; + + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } +} diff --git a/libs/libaom/src/aom_dsp/arm/sum_neon.h b/libs/libaom/src/aom_dsp/arm/sum_neon.h new file mode 100644 index 000000000..809e51ce1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/sum_neon.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE uint32x2_t horizontal_add_u16x8(const uint16x8_t a) { + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); +} diff --git a/libs/libaom/src/aom_dsp/arm/variance_neon.c b/libs/libaom/src/aom_dsp/arm/variance_neon.c new file mode 100644 index 000000000..d4107ce0d --- /dev/null +++ b/libs/libaom/src/aom_dsp/arm/variance_neon.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +// w * h must be less than 2048 or local variable v_sum may overflow.
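+// The variance functions below call this helper and then apply +// variance = sse - sum * sum / (w * h); since w * h is a power of two the +// division is a right shift, e.g. for 8x8 (w * h = 64 = 2^6): +//   return *sse - ((sum * sum) >> 6);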
+static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, + int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = + vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); + v_sse_hi = + vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); +} + +void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); +} + +unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); + return *sse - ((sum * sum) >> 6); +} + +unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); + return *sse - (((unsigned int)((int64_t)sum * sum)) >> 8); +} + +unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +} + +unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); + variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride, + 32, 32, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); +} + +unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); +} + +unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), + b_stride, 64, 16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), + b_stride, 64, 16, 
&sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); +} + +unsigned int aom_variance128x128_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + sum1 = sse1 = 0; + for (int i = 0; i < 16; i++) { + variance_neon_w8(a + (8 * i * a_stride), a_stride, b + (8 * i * b_stride), + b_stride, 128, 8, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + } + + *sse = sse1; + + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 14); +} + +unsigned int aom_variance16x8_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int aom_variance8x16_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, unsigned int *sse) { + int i; + uint8x8_t d0u8, d2u8, d4u8, d6u8; + int16x4_t d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t 
d0s64, d1s64; + uint16x8_t q11u16, q12u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d2u8, d6u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int64x1_t d0s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + q7s32 = vdupq_n_s32(0); + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { // mse16x16_neon_loop + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q7s32 = vmlal_s16(q7s32, d22s16, d22s16); + q8s32 = vmlal_s16(q8s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q7s32 = vmlal_s16(q7s32, d26s16, d26s16); + q8s32 = vmlal_s16(q8s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); 
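+ // The four 32-bit squared-error accumulators have been folded pairwise; + // the add and pairwise-long reductions below collapse them into the single + // 32-bit total that is both stored to *sse and returned.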
+ q10s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q10s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} + +unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride) { + int16x4_t d22s16, d24s16, d26s16, d28s16; + int64x1_t d0s64; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d1u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d5u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d3u8 = vld1_u8(src_ptr); + d7u8 = vld1_u8(ref_ptr); + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d1u8, d5u8); + q13u16 = vsubl_u8(d2u8, d6u8); + q14u16 = vsubl_u8(d3u8, d7u8); + + d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); + d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); + d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); + d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + + q7s32 = vmull_s16(d22s16, d22s16); + q8s32 = vmull_s16(d24s16, d24s16); + q9s32 = vmull_s16(d26s16, d26s16); + q10s32 = vmull_s16(d28s16, d28s16); + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q9s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q9s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} diff --git a/libs/libaom/src/aom_dsp/avg.c b/libs/libaom/src/aom_dsp/avg.c new file mode 100644 index 000000000..7386296fd --- /dev/null +++ b/libs/libaom/src/aom_dsp/avg.c @@ -0,0 +1,486 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdlib.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_ports/mem.h" + +void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + int i, j; + *min = 255; + *max = 0; + for (i = 0; i < 8; ++i, s += p, d += dp) { + for (j = 0; j < 8; ++j) { + int diff = abs(s[j] - d[j]); + *min = diff < *min ? diff : *min; + *max = diff > *max ?
diff : *max; + } + } +} + +unsigned int aom_avg_4x4_c(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 4; ++i, s += p) + for (j = 0; j < 4; sum += s[j], ++j) { + } + + return (sum + 8) >> 4; +} + +unsigned int aom_avg_8x8_c(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 8; ++i, s += p) + for (j = 0; j < 8; sum += s[j], ++j) { + } + + return (sum + 32) >> 6; +} + +#if CONFIG_AV1_HIGHBITDEPTH +unsigned int aom_highbd_avg_8x8_c(const uint8_t *s8, int p) { + int i, j; + int sum = 0; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + for (i = 0; i < 8; ++i, s += p) + for (j = 0; j < 8; sum += s[j], ++j) { + } + + return (sum + 32) >> 6; +} + +unsigned int aom_highbd_avg_4x4_c(const uint8_t *s8, int p) { + int i, j; + int sum = 0; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + for (i = 0; i < 4; ++i, s += p) + for (j = 0; j < 4; sum += s[j], ++j) { + } + + return (sum + 8) >> 4; +} + +void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, + int dp, int *min, int *max) { + int i, j; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + const uint16_t *d = CONVERT_TO_SHORTPTR(d8); + *min = 255; + *max = 0; + for (i = 0; i < 8; ++i, s += p, d += dp) { + for (j = 0; j < 8; ++j) { + int diff = abs(s[j] - d[j]); + *min = diff < *min ? diff : *min; + *max = diff > *max ? diff : *max; + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// src_diff: first pass, 9 bit, dynamic range [-255, 255] +// second pass, 12 bit, dynamic range [-2040, 2040] +static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int16_t c0 = b0 + b2; + int16_t c1 = b1 + b3; + int16_t c2 = b0 - b2; + int16_t c3 = b1 - b3; + int16_t c4 = b4 + b6; + int16_t c5 = b5 + b7; + int16_t c6 = b4 - b6; + int16_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// The order of the output coeff of the hadamard is not important. For +// optimization purposes the final transpose may be skipped. 
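+// For reference, the 8x8 transform below is separable: hadamard_col8() is +// applied once along columns and once along rows, and each 1-D pass can grow +// magnitudes by up to 8x, taking 9-bit inputs to the 15-bit output range +// [-16320, 16320] noted in the per-stage comments.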
+void aom_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + int16_t buffer[64]; + int16_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit + // dynamic range [-255, 255] + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + // buffer2: 15 bit + // dynamic range [-16320, 16320] + ++tmp_buf; + } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; +} + +void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t buffer[64]; + int16_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (int idx = 0; idx < 8; ++idx) { + hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit + // dynamic range [-255, 255] + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (int idx = 0; idx < 8; ++idx) { + hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + // buffer2: 15 bit + // dynamic range [-16320, 16320] + ++tmp_buf; + } + + for (int idx = 0; idx < 64; ++idx) coeff[idx] = buffer2[idx]; +} + +// In place 16x16 2D Hadamard transform +void aom_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + // coeff: 15 bit, dynamic range [-16320, 16320] + for (idx = 0; idx < 64; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; + + tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 1; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void aom_hadamard_lp_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + for (int idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_hadamard_lp_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + for (int idx = 0; idx < 64; ++idx) { + int16_t a0 = coeff[0]; + int16_t a1 = coeff[64]; + int16_t a2 = coeff[128]; + int16_t a3 = coeff[192]; + + int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + int16_t b3 = (a2 - a3) >> 1; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void aom_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + aom_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 15 bit, dynamic range [-16320, 16320] + for (idx = 0; idx < 256; ++idx) { + 
tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 2; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 2; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void hadamard_highbd_col8_first_pass(const int16_t *src_diff, + ptrdiff_t src_stride, + int16_t *coeff) { + int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int16_t c0 = b0 + b2; + int16_t c1 = b1 + b3; + int16_t c2 = b0 - b2; + int16_t c3 = b1 - b3; + int16_t c4 = b4 + b6; + int16_t c5 = b5 + b7; + int16_t c6 = b4 - b6; + int16_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// src_diff: 16 bit, dynamic range [-32760, 32760] +// coeff: 19 bit +static void hadamard_highbd_col8_second_pass(const int16_t *src_diff, + ptrdiff_t src_stride, + int32_t *coeff) { + int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int32_t c0 = b0 + b2; + int32_t c1 = b1 + b3; + int32_t c2 = b0 - b2; + int32_t c3 = b1 - b3; + int32_t c4 = b4 + b6; + int32_t c5 = b5 + b7; + int32_t c6 = b4 - b6; + int32_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// The order of the output coeff of the hadamard is not important. For +// optimization purposes the final transpose may be skipped. 
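+//
+// Illustrative bound check for the high-bitdepth path (a sketch, not part of
+// the upstream file): 12-bit input gives residuals in [-4095, 4095], so the
+// first pass peaks at 8 * 4095 = 32760, which still fits int16_t, while the
+// second pass peaks at 8 * 32760 = 262080, which does not; that is why
+// hadamard_highbd_col8_second_pass() widens its output to int32_t.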
+void aom_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + int16_t buffer[64]; + int32_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // src_diff: 13 bit + // buffer: 16 bit, dynamic range [-32760, 32760] + hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf); + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // buffer: 16 bit + // buffer2: 19 bit, dynamic range [-262080, 262080] + hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx); + ++tmp_buf; + } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; +} + +// In place 16x16 2D Hadamard transform +void aom_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + // coeff: 19 bit, dynamic range [-262080, 262080] + for (idx = 0; idx < 64; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; + + tran_low_t b0 = (a0 + a1) >> 1; + tran_low_t b1 = (a0 - a1) >> 1; + tran_low_t b2 = (a2 + a3) >> 1; + tran_low_t b3 = (a2 - a3) >> 1; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void aom_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + aom_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 20 bit + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; + tran_low_t b1 = (a0 - a1) >> 2; + tran_low_t b2 = (a2 + a3) >> 2; + tran_low_t b3 = (a2 - a3) >> 2; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// coeff: 16 bits, dynamic range [-32640, 32640]. +// length: value range {16, 64, 256, 1024}. +int aom_satd_c(const tran_low_t *coeff, int length) { + int i; + int satd = 0; + for (i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] + return satd; +} + +int aom_satd_lp_c(const int16_t *coeff, int length) { + int satd = 0; + for (int i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] + return satd; +} + +// Integer projection onto row vectors. +// height: value range {16, 32, 64, 128}. +void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, + const int ref_stride, const int height) { + int idx; + const int norm_factor = height >> 1; + for (idx = 0; idx < 16; ++idx) { + int i; + hbuf[idx] = 0; + // hbuf[idx]: 14 bit, dynamic range [0, 32640]. + for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; + // hbuf[idx]: 9 bit, dynamic range [0, 1020]. + hbuf[idx] /= norm_factor; + ++ref; + } +} + +// width: value range {16, 32, 64, 128}. 
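+// aom_int_pro_col_c() sums `width` consecutive pixels from a single row.
+// Worst case (illustrative): width 128 of all-255 pixels gives
+// 128 * 255 = 32640, which still fits the int16_t return type. A hedged
+// usage note: together with aom_int_pro_row_c() above, these projections can
+// be compared with aom_vector_var_c() below as a cheap 1-D proxy for a full
+// 2-D SAD in coarse motion estimation.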
+int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) { + int idx; + int16_t sum = 0; + // sum: 14 bit, dynamic range [0, 32640] + for (idx = 0; idx < width; ++idx) sum += ref[idx]; + return sum; +} + +// ref: [0 - 510] +// src: [0 - 510] +// bwl: {2, 3, 4, 5} +int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) { + int i; + int width = 4 << bwl; + int sse = 0, mean = 0, var; + + for (i = 0; i < width; ++i) { + int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits. + mean += diff; // mean: dynamic range 16 bits. + sse += diff * diff; // sse: dynamic range 26 bits. + } + + // (mean * mean): dynamic range 31 bits. + var = sse - ((mean * mean) >> (bwl + 2)); + return var; +} diff --git a/libs/libaom/src/aom_dsp/binary_codes_reader.c b/libs/libaom/src/aom_dsp/binary_codes_reader.c new file mode 100644 index 000000000..7cd903d82 --- /dev/null +++ b/libs/libaom/src/aom_dsp/binary_codes_reader.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/binary_codes_reader.h" +#include "aom_dsp/recenter.h" +#include "av1/common/common.h" + +uint16_t aom_read_primitive_quniform_(aom_reader *r, + uint16_t n ACCT_STR_PARAM) { + if (n <= 1) return 0; + const int l = get_msb(n) + 1; + const int m = (1 << l) - n; + const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME); + return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME); +} + +// Decode finite subexponential code that for a symbol v in [0, n-1] with +// parameter k +uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, + uint16_t k ACCT_STR_PARAM) { + int i = 0; + int mk = 0; + + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + + if (n <= mk + 3 * a) { + return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk; + } + + if (!aom_read_bit(r, ACCT_STR_NAME)) { + return aom_read_literal(r, b, ACCT_STR_NAME) + mk; + } + + i = i + 1; + mk += a; + } + + assert(0); + return 0; +} + +uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, + uint16_t ref ACCT_STR_PARAM) { + return inv_recenter_finite_nonneg( + n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME)); +} diff --git a/libs/libaom/src/aom_dsp/binary_codes_reader.h b/libs/libaom/src/aom_dsp/binary_codes_reader.h new file mode 100644 index 000000000..d218f0619 --- /dev/null +++ b/libs/libaom/src/aom_dsp/binary_codes_reader.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_
+#define AOM_AOM_DSP_BINARY_CODES_READER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitreader_buffer.h"
+
+#define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \
+  aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \
+  aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \
+  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
+
+uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM);
+uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
+                                       uint16_t k ACCT_STR_PARAM);
+uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
+                                          uint16_t ref ACCT_STR_PARAM);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_BINARY_CODES_READER_H_
diff --git a/libs/libaom/src/aom_dsp/binary_codes_writer.c b/libs/libaom/src/aom_dsp/binary_codes_writer.c
new file mode 100644
index 000000000..adf1c1304
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/binary_codes_writer.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/recenter.h"
+#include "aom_ports/bitops.h"
+#include "av1/common/common.h"
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits].
+// mag_bits is number of bits for magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate magnitude
+// and 1 more bit for the sign if non-zero.
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
+                                   unsigned int abs_bits) {
+  if (v == 0) {
+    aom_write_bit(w, 0);
+  } else {
+    const int x = abs(v);
+    const int s = v < 0;
+    aom_write_bit(w, 1);
+    aom_write_bit(w, s);
+    aom_write_literal(w, x - 1, abs_bits);
+  }
+}
+
+int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) {
+  return (v == 0 ? 1 : abs_bits + 2);
+}
+
+// Encodes a value v in [0, n-1] quasi-uniformly
+void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) {
+  if (n <= 1) return;
+  const int l = get_msb(n) + 1;
+  const int m = (1 << l) - n;
+  if (v < m) {
+    aom_write_literal(w, v, l - 1);
+  } else {
+    aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+    aom_write_bit(w, (v - m) & 1);
+  }
+}
+
+int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
+  if (n <= 1) return 0;
+  const int l = get_msb(n) + 1;
+  const int m = (1 << l) - n;
+  return v < m ? l - 1 : l;
+}
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
+                                   uint16_t v) {
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (n <= mk + 3 * a) {
+      aom_write_primitive_quniform(w, n - mk, v - mk);
+      break;
+    } else {
+      int t = (v >= mk + a);
+      aom_write_bit(w, t);
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        aom_write_literal(w, v - mk, b);
+        break;
+      }
+    }
+  }
+}
+
+int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) {
+  int count = 0;
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (n <= mk + 3 * a) {
+      count += aom_count_primitive_quniform(n - mk, v - mk);
+      break;
+    } else {
+      int t = (v >= mk + a);
+      count++;
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        count += b;
+        break;
+      }
+    }
+  }
+  return count;
+}
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+// based on a reference ref also in [0, n-1].
+// Recenters symbol around r first and then uses a finite subexponential code.
+void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
+                                      uint16_t ref, uint16_t v) {
+  aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
+                                             uint16_t k, int16_t ref,
+                                             int16_t v) {
+  ref += n - 1;
+  v += n - 1;
+  const uint16_t scaled_n = (n << 1) - 1;
+  aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v);
+}
+
+int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
+                                     uint16_t v) {
+  return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
+                                            int16_t v) {
+  ref += n - 1;
+  v += n - 1;
+  const uint16_t scaled_n = (n << 1) - 1;
+  return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v);
+}
diff --git a/libs/libaom/src/aom_dsp/binary_codes_writer.h b/libs/libaom/src/aom_dsp/binary_codes_writer.h
new file mode 100644
index 000000000..5ec866213
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/binary_codes_writer.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_
+#define AOM_AOM_DSP_BINARY_CODES_WRITER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/bitwriter_buffer.h"
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits]
+// mag_bits is number of bits for magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate magnitude
+// and 1 more bit for the sign if non-zero.
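+//
+// Worked example (illustrative, not from the upstream sources): with
+// mag_bits = 3 and v = -5 the writer emits "1" (non-zero), then "1"
+// (negative), then |v| - 1 = 4 as a 3-bit literal, i.e. mag_bits + 2 = 5 bits
+// in total, matching aom_count_primitive_symmetric(); v = 0 costs a single
+// "0" bit.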
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v, + unsigned int mag_bits); + +// Encodes a value v in [0, n-1] quasi-uniformly +void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v); + +// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k +void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, + uint16_t v); + +// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k +// based on a reference ref also in [0, n-1]. +void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, + uint16_t ref, uint16_t v); + +// Finite subexponential code that codes a symbol v in [-(n-1), n-1] with +// parameter k based on a reference ref also in [-(n-1), n-1]. +void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, + uint16_t k, int16_t ref, + int16_t v); + +// Functions that counts bits for the above primitives +int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits); +int aom_count_primitive_quniform(uint16_t n, uint16_t v); +int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v); +int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, + uint16_t v); +int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, + int16_t v); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_BINARY_CODES_WRITER_H_ diff --git a/libs/libaom/src/aom_dsp/bitreader.c b/libs/libaom/src/aom_dsp/bitreader.c new file mode 100644 index 000000000..4c70a9171 --- /dev/null +++ b/libs/libaom/src/aom_dsp/bitreader.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/bitreader.h" + +int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size) { + if (size && !buffer) { + return 1; + } + r->buffer_end = buffer + size; + r->buffer = buffer; + od_ec_dec_init(&r->ec, buffer, (uint32_t)size); +#if CONFIG_ACCOUNTING + r->accounting = NULL; +#endif + return 0; +} + +const uint8_t *aom_reader_find_begin(aom_reader *r) { return r->buffer; } + +const uint8_t *aom_reader_find_end(aom_reader *r) { return r->buffer_end; } + +uint32_t aom_reader_tell(const aom_reader *r) { return od_ec_dec_tell(&r->ec); } + +uint32_t aom_reader_tell_frac(const aom_reader *r) { + return od_ec_dec_tell_frac(&r->ec); +} + +int aom_reader_has_overflowed(const aom_reader *r) { + const uint32_t tell_bits = aom_reader_tell(r); + const uint32_t tell_bytes = (tell_bits + 7) >> 3; + return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer); +} diff --git a/libs/libaom/src/aom_dsp/bitreader.h b/libs/libaom/src/aom_dsp/bitreader.h new file mode 100644 index 000000000..a8b3f55ef --- /dev/null +++ b/libs/libaom/src/aom_dsp/bitreader.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITREADER_H_
+#define AOM_AOM_DSP_BITREADER_H_
+
+#include <assert.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomdx.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/entdec.h"
+#include "aom_dsp/prob.h"
+#include "av1/common/odintrin.h"
+
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#define ACCT_STR_NAME acct_str
+#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
+#define ACCT_STR_ARG(s) , s
+#else
+#define ACCT_STR_PARAM
+#define ACCT_STR_ARG(s)
+#endif
+
+#define aom_read(r, prob, ACCT_STR_NAME) \
+  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_bit(r, ACCT_STR_NAME) \
+  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
+  aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_literal(r, bits, ACCT_STR_NAME) \
+  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \
+  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
+  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct aom_reader {
+  const uint8_t *buffer;
+  const uint8_t *buffer_end;
+  od_ec_dec ec;
+#if CONFIG_ACCOUNTING
+  Accounting *accounting;
+#endif
+  uint8_t allow_update_cdf;
+};
+
+typedef struct aom_reader aom_reader;
+
+int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size);
+
+const uint8_t *aom_reader_find_begin(aom_reader *r);
+
+const uint8_t *aom_reader_find_end(aom_reader *r);
+
+// Returns true if the bit reader has tried to decode more data from the buffer
+// than was actually provided.
+int aom_reader_has_overflowed(const aom_reader *r);
+
+// Returns the position in the bit reader in bits.
+uint32_t aom_reader_tell(const aom_reader *r);
+
+// Returns the position in the bit reader in 1/8th bits.
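+//
+// Accounting note (illustrative): the value comes from od_ec_dec_tell_frac(),
+// so aom_reader_tell_frac(r) / 8.0 is the number of bits the arithmetic
+// decoder has consumed, to sub-bit precision; the CONFIG_ACCOUNTING hooks
+// below charge the delta between successive calls to each decoded symbol.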
+uint32_t aom_reader_tell_frac(const aom_reader *r); + +#if CONFIG_ACCOUNTING +static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) { + if (r->accounting != NULL) { + uint32_t tell_frac; + tell_frac = aom_reader_tell_frac(r); + aom_accounting_record(r->accounting, ACCT_STR_NAME, + tell_frac - r->accounting->last_tell_frac); + r->accounting->last_tell_frac = tell_frac; + } +} + +static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) { + if (r->accounting != NULL) { + r->accounting->syms.num_multi_syms += !is_binary; + r->accounting->syms.num_binary_syms += !!is_binary; + } +} +#endif + +static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { + int p = (0x7FFFFF - (prob << 15) + prob) >> 8; + int bit = od_ec_decode_bool_q15(&r->ec, p); + +#if CONFIG_BITSTREAM_DEBUG + { + int i; + int ref_bit, ref_nsymbs; + aom_cdf_prob ref_cdf[16]; + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = aom_bitstream_queue_get_frame_read(); + bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs); + if (ref_nsymbs != 2) { + fprintf(stderr, + "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs " + "%d queue_r %d\n", + frame_idx, 2, ref_nsymbs, queue_r); + assert(0); + } + if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) || + (ref_cdf[1] != 32767)) { + fprintf(stderr, + "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d", + frame_idx, p, 32767, ref_cdf[0]); + for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); + fprintf(stderr, "} queue_r %d\n", queue_r); + assert(0); + } + if (bit != ref_bit) { + fprintf(stderr, + "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d " + "queue_r %d\n", + frame_idx, bit, ref_bit, queue_r); + assert(0); + } + } +#endif + +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); + aom_update_symb_counts(r, 1); +#endif + return bit; +} + +static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) { + int ret; + ret = aom_read(r, 128, NULL); // aom_prob_half +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); +#endif + return ret; +} + +static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) { + int literal = 0, bit; + + for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit; +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); +#endif + return literal; +} + +static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf, + int nsymbs ACCT_STR_PARAM) { + int symb; + assert(cdf != NULL); + symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs); + +#if CONFIG_BITSTREAM_DEBUG + { + int i; + int cdf_error = 0; + int ref_symb, ref_nsymbs; + aom_cdf_prob ref_cdf[16]; + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = aom_bitstream_queue_get_frame_read(); + bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs); + if (nsymbs != ref_nsymbs) { + fprintf(stderr, + "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d " + "queue_r %d\n", + frame_idx, nsymbs, ref_nsymbs, queue_r); + cdf_error = 0; + assert(0); + } else { + for (i = 0; i < nsymbs; ++i) + if (cdf[i] != ref_cdf[i]) cdf_error = 1; + } + if (cdf_error) { + fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx, + cdf[0]); + for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]); + fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]); + for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); + 
fprintf(stderr, "} queue_r %d\n", queue_r);
+      assert(0);
+    }
+    if (symb != ref_symb) {
+      fprintf(
+          stderr,
+          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
+          frame_idx, symb, ref_symb, queue_r);
+      assert(0);
+    }
+  }
+#endif
+
+#if CONFIG_ACCOUNTING
+  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+  aom_update_symb_counts(r, (nsymbs == 2));
+#endif
+  return symb;
+}
+
+static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
+                                   int nsymbs ACCT_STR_PARAM) {
+  int ret;
+  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
+  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
+  return ret;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_BITREADER_H_
diff --git a/libs/libaom/src/aom_dsp/bitreader_buffer.c b/libs/libaom/src/aom_dsp/bitreader_buffer.c
new file mode 100644
index 000000000..d79feea6a
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/bitreader_buffer.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/recenter.h"
+#include "aom_ports/bitops.h"
+
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) {
+  return (rb->bit_offset + 7) >> 3;
+}
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
+  const uint32_t off = rb->bit_offset;
+  const uint32_t p = off >> 3;
+  const int q = 7 - (int)(off & 0x7);
+  if (rb->bit_buffer + p < rb->bit_buffer_end) {
+    const int bit = (rb->bit_buffer[p] >> q) & 1;
+    rb->bit_offset = off + 1;
+    return bit;
+  } else {
+    if (rb->error_handler) rb->error_handler(rb->error_handler_data);
+    return 0;
+  }
+}
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
+  assert(bits <= 31);
+  int value = 0, bit;
+  for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
+  return value;
+}
+
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb,
+                                      int bits) {
+  assert(bits <= 32);
+  uint32_t value = 0;
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    value |= (uint32_t)aom_rb_read_bit(rb) << bit;
+  return value;
+}
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
+  const int nbits = sizeof(unsigned) * 8 - bits - 1;
+  const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
+  return ((int)value) >> nbits;
+}
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
+  int leading_zeros = 0;
+  while (leading_zeros < 32 && !aom_rb_read_bit(rb)) ++leading_zeros;
+  // Maximum 32 bits.
+  if (leading_zeros == 32) return UINT32_MAX;
+  const uint32_t base = (1u << leading_zeros) - 1;
+  const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
+  return base + value;
+}
+
+static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb,
+                                               uint16_t n) {
+  if (n <= 1) return 0;
+  const int l = get_msb(n) + 1;
+  const int m = (1 << l) - n;
+  const int v = aom_rb_read_literal(rb, l - 1);
+  return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb);
+}
+
+static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb,
+                                                uint16_t n, uint16_t k) {
+  int i = 0;
+  int mk = 0;
+
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+
+    if (n <= mk + 3 * a) {
+      return aom_rb_read_primitive_quniform(rb, n - mk) + mk;
+    }
+
+    if (!aom_rb_read_bit(rb)) {
+      return aom_rb_read_literal(rb, b) + mk;
+    }
+
+    i = i + 1;
+    mk += a;
+  }
+
+  assert(0);
+  return 0;
+}
+
+static uint16_t aom_rb_read_primitive_refsubexpfin(
+    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) {
+  return inv_recenter_finite_nonneg(n, ref,
+                                    aom_rb_read_primitive_subexpfin(rb, n, k));
+}
+
+int16_t aom_rb_read_signed_primitive_refsubexpfin(
+    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) {
+  ref += n - 1;
+  const uint16_t scaled_n = (n << 1) - 1;
+  return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
+}
diff --git a/libs/libaom/src/aom_dsp/bitreader_buffer.h b/libs/libaom/src/aom_dsp/bitreader_buffer.h
new file mode 100644
index 000000000..359fbe519
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/bitreader_buffer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_
+#define AOM_AOM_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*aom_rb_error_handler)(void *data);
+
+struct aom_read_bit_buffer {
+  const uint8_t *bit_buffer;
+  const uint8_t *bit_buffer_end;
+  uint32_t bit_offset;
+
+  void *error_handler_data;
+  aom_rb_error_handler error_handler;
+};
+
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
+
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits);
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb);
+
+int16_t aom_rb_read_signed_primitive_refsubexpfin(
+    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_BITREADER_BUFFER_H_
diff --git a/libs/libaom/src/aom_dsp/bitwriter.c b/libs/libaom/src/aom_dsp/bitwriter.c
new file mode 100644
index 000000000..41fcc5175
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/bitwriter.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include "aom_dsp/bitwriter.h"
+
+void aom_start_encode(aom_writer *w, uint8_t *source) {
+  w->buffer = source;
+  w->pos = 0;
+  od_ec_enc_init(&w->ec, 62025);
+}
+
+int aom_stop_encode(aom_writer *w) {
+  int nb_bits;
+  uint32_t bytes;
+  unsigned char *data;
+  data = od_ec_enc_done(&w->ec, &bytes);
+  nb_bits = od_ec_enc_tell(&w->ec);
+  memcpy(w->buffer, data, bytes);
+  w->pos = bytes;
+  od_ec_enc_clear(&w->ec);
+  return nb_bits;
+}
diff --git a/libs/libaom/src/aom_dsp/bitwriter.h b/libs/libaom/src/aom_dsp/bitwriter.h
new file mode 100644
index 000000000..4e77a1794
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/bitwriter.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITWRITER_H_
+#define AOM_AOM_DSP_BITWRITER_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/entenc.h"
+#include "aom_dsp/prob.h"
+
+#if CONFIG_RD_DEBUG
+#include "av1/common/blockd.h"
+#include "av1/encoder/cost.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct aom_writer {
+  unsigned int pos;
+  uint8_t *buffer;
+  od_ec_enc ec;
+  uint8_t allow_update_cdf;
+};
+
+typedef struct aom_writer aom_writer;
+
+typedef struct TOKEN_STATS {
+  int cost;
+#if CONFIG_RD_DEBUG
+  int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE];
+#endif
+} TOKEN_STATS;
+
+static INLINE void init_token_stats(TOKEN_STATS *token_stats) {
+#if CONFIG_RD_DEBUG
+  int r, c;
+  for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
+    for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
+      token_stats->txb_coeff_cost_map[r][c] = 0;
+    }
+  }
+#endif
+  token_stats->cost = 0;
+}
+
+void aom_start_encode(aom_writer *w, uint8_t *buffer);
+
+int aom_stop_encode(aom_writer *w);
+
+static INLINE void aom_write(aom_writer *w, int bit, int probability) {
+  int p = (0x7FFFFF - (probability << 15) + probability) >> 8;
+#if CONFIG_BITSTREAM_DEBUG
+  aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
+  /*int queue_r = 0;
+  int frame_idx_r = 0;
+  int queue_w = bitstream_queue_get_write();
+  int frame_idx_w = aom_bitstream_queue_get_frame_write();
+  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+            frame_idx_w, queue_w);
+  }*/
+  bitstream_queue_push(bit, cdf, 2);
+#endif
+
+  od_ec_encode_bool_q15(&w->ec, bit, p);
+}
+
+static INLINE void aom_write_bit(aom_writer *w, int bit) {
+  aom_write(w, bit, 128);  // aom_prob_half
+}
+
+static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
+  int bit;
+
+  for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit));
+}
+
+static INLINE void aom_write_cdf(aom_writer *w, int symb,
+                                 const aom_cdf_prob *cdf, int nsymbs) {
+#if CONFIG_BITSTREAM_DEBUG
+  /*int queue_r = 0;
+  int frame_idx_r = 0;
+  int queue_w = bitstream_queue_get_write();
+  int frame_idx_w = aom_bitstream_queue_get_frame_write();
+  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+            frame_idx_w, queue_w);
+  }*/
+  bitstream_queue_push(symb, cdf, nsymbs);
+#endif
+
+  od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
+}
+
+static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
+                                    int nsymbs) {
+  aom_write_cdf(w, symb, cdf, nsymbs);
+  if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_BITWRITER_H_
diff --git a/libs/libaom/src/aom_dsp/bitwriter_buffer.c b/libs/libaom/src/aom_dsp/bitwriter_buffer.c
new file mode 100644
index 000000000..7d0ab9486
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/bitwriter_buffer.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_dsp/recenter.h"
+#include "aom_ports/bitops.h"
+
+int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) {
+  return (wb->bit_offset % CHAR_BIT == 0);
+}
+
+uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
+  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
+}
+
+void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
+  const int off = (int)wb->bit_offset;
+  const int p = off / CHAR_BIT;
+  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+  if (q == CHAR_BIT - 1) {
+    // Zero next char and write bit
+    wb->bit_buffer[p] = bit << q;
+  } else {
+    wb->bit_buffer[p] &= ~(1 << q);
+    wb->bit_buffer[p] |= bit << q;
+  }
+  wb->bit_offset = off + 1;
+}
+
+void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) {
+  // Do not zero bytes but overwrite existing values
+  const int off = (int)wb->bit_offset;
+  const int p = off / CHAR_BIT;
+  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+  wb->bit_buffer[p] &= ~(1 << q);
+  wb->bit_buffer[p] |= bit << q;
+  wb->bit_offset = off + 1;
+}
+
+void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
+  assert(bits <= 31);
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
+                                   uint32_t data, int bits) {
+  assert(bits <= 32);
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    aom_wb_write_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
+                              int bits) {
+  int bit;
+  for (bit = bits - 1; bit >= 0; bit--)
+    aom_wb_overwrite_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
+                                     int bits) {
+  aom_wb_write_literal(wb, data, bits + 1);
+}
+
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) {
+  int64_t shift_val = ++v;
+  int leading_zeroes = 1;
+
+  assert(shift_val > 0);
+
+  while (shift_val >>= 1) leading_zeroes += 2;
+
+  aom_wb_write_literal(wb, 0, leading_zeroes >> 1);
+  aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1);
+}
+
+static void wb_write_primitive_quniform(struct aom_write_bit_buffer *wb,
+                                        uint16_t n, uint16_t v) {
+  if (n <= 1) return;
+  const int l =
get_msb(n) + 1; + const int m = (1 << l) - n; + if (v < m) { + aom_wb_write_literal(wb, v, l - 1); + } else { + aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1); + aom_wb_write_bit(wb, (v - m) & 1); + } +} + +static void wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, uint16_t v) { + int i = 0; + int mk = 0; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + wb_write_primitive_quniform(wb, n - mk, v - mk); + break; + } else { + int t = (v >= mk + a); + aom_wb_write_bit(wb, t); + if (t) { + i = i + 1; + mk += a; + } else { + aom_wb_write_literal(wb, v - mk, b); + break; + } + } + } +} + +static void wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + uint16_t ref, uint16_t v) { + wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v)); +} + +void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + int16_t ref, int16_t v) { + ref += n - 1; + v += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v); +} diff --git a/libs/libaom/src/aom_dsp/bitwriter_buffer.h b/libs/libaom/src/aom_dsp/bitwriter_buffer.h new file mode 100644 index 000000000..fd10e01bb --- /dev/null +++ b/libs/libaom/src/aom_dsp/bitwriter_buffer.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_ +#define AOM_AOM_DSP_BITWRITER_BUFFER_H_ + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct aom_write_bit_buffer { + uint8_t *bit_buffer; + uint32_t bit_offset; +}; + +int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb); + +uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb); + +void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit); + +void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit); + +void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits); + +void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb, + uint32_t data, int bits); + +void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, + int bits); + +void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, + int bits); + +void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v); + +void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + int16_t ref, int16_t v); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_BITWRITER_BUFFER_H_ diff --git a/libs/libaom/src/aom_dsp/blend.h b/libs/libaom/src/aom_dsp/blend.h new file mode 100644 index 000000000..fd87dc181 --- /dev/null +++ b/libs/libaom/src/aom_dsp/blend.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_BLEND_H_ +#define AOM_AOM_DSP_BLEND_H_ + +#include "aom_ports/mem.h" + +// Various blending functions and macros. +// See also the aom_blend_* functions in aom_dsp_rtcd.h + +// Alpha blending with alpha values from the range [0, 64], where 64 +// means use the first input and 0 means use the second input. + +#define AOM_BLEND_A64_ROUND_BITS 6 +#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64 + +#define AOM_BLEND_A64(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \ + AOM_BLEND_A64_ROUND_BITS) + +// Alpha blending with alpha values from the range [0, 256], where 256 +// means use the first input and 0 means use the second input. +#define AOM_BLEND_A256_ROUND_BITS 8 +#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256 + +#define AOM_BLEND_A256(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \ + AOM_BLEND_A256_ROUND_BITS) + +// Blending by averaging. +#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) + +#define DIFF_FACTOR_LOG2 4 +#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2) + +#endif // AOM_AOM_DSP_BLEND_H_ diff --git a/libs/libaom/src/aom_dsp/blend_a64_hmask.c b/libs/libaom/src/aom_dsp/blend_a64_hmask.c new file mode 100644 index 000000000..e9e38ef96 --- /dev/null +++ b/libs/libaom/src/aom_dsp/blend_a64_hmask.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
+                           const uint8_t *src0, uint32_t src0_stride,
+                           const uint8_t *src1, uint32_t src1_stride,
+                           const uint8_t *mask, int w, int h) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = AOM_BLEND_A64(
+          mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
+    }
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
+                                  const uint8_t *src0_8, uint32_t src0_stride,
+                                  const uint8_t *src1_8, uint32_t src1_stride,
+                                  const uint8_t *mask, int w, int h, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+  (void)bd;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = AOM_BLEND_A64(
+          mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
+    }
+  }
+}
+#endif
diff --git a/libs/libaom/src/aom_dsp/blend_a64_mask.c b/libs/libaom/src/aom_dsp/blend_a64_mask.c
new file mode 100644
index 000000000..32f2dc6d8
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/blend_a64_mask.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+// NOTE(david.barker): The input and output of aom_blend_a64_d16_mask_c() are
+// in a higher intermediate precision, and will later be rounded down to pixel
+// precision.
+// Thus, in order to avoid double-rounding, we want to use normal right shifts
+// within this function, not ROUND_POWER_OF_TWO.
+// This works because of the identity:
+// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z)
+//
+// In contrast, the output of the non-d16 functions will not be further rounded,
+// so we *should* use ROUND_POWER_OF_TWO there.
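+//
+// Numeric check of the identity (illustrative): with x = 110, y = 2, z = 3,
+// ROUND_POWER_OF_TWO(110 >> 2, 3) = (27 + 4) >> 3 = 3 and
+// ROUND_POWER_OF_TWO(110, 2 + 3) = (110 + 16) >> 5 = 3 agree, while rounding
+// twice, ROUND_POWER_OF_TWO(ROUND_POWER_OF_TWO(110, 2), 3) = (28 + 4) >> 3
+// = 4, shows the double-rounding bias that the plain shifts avoid.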
+ +void aom_lowbd_blend_a64_d16_mask_c( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + int i, j; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int32_t res; + const int m = mask[i * mask_stride + j]; + res = ((m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int32_t res; + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + res = ((m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + res = ((m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_d16_mask_c( + uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, 
src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + // excerpt from clip_pixel_highbd() + // set saturation_value to (1 << bd) - 1 + unsigned int saturation_value; + switch (bd) { + case 8: + default: saturation_value = 255; break; + case 10: saturation_value = 1023; break; + case 12: saturation_value = 4095; break; + } + + if (subw == 0 && subh == 0) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = mask[j]; + res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (subw == 1 && subh == 1) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = ROUND_POWER_OF_TWO( + mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] + + mask[mask_stride + 2 * j + 1], + 2); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (subw == 1 && subh == 0) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Blending with alpha mask. Mask values come from the range [0, 64], +// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can +// be the same as dst, or dst can be different from both sources. 
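+//
+// Worked example (illustrative): AOM_BLEND_A64(48, 200, 100) =
+// (48 * 200 + 16 * 100 + 32) >> 6 = 11232 >> 6 = 175, i.e. 48/64 = 3/4 of
+// the way from src1 toward src0; m = 64 selects src0 exactly and m = 0
+// selects src1 exactly.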
+ +void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subw, int subh) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int w, int h, int subw, int subh, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[(2 * i) 
* mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/blend_a64_vmask.c b/libs/libaom/src/aom_dsp/blend_a64_vmask.c new file mode 100644 index 000000000..c938bb33a --- /dev/null +++ b/libs/libaom/src/aom_dsp/blend_a64_vmask.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "config/aom_dsp_rtcd.h" + +void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + for (i = 0; i < h; ++i) { + const int m = mask[i]; + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + for (i = 0; i < h; ++i) { + const int m = mask[i]; + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} +#endif diff --git a/libs/libaom/src/aom_dsp/blk_sse_sum.c b/libs/libaom/src/aom_dsp/blk_sse_sum.c new file mode 100644 index 000000000..d76c3f87b --- /dev/null +++ b/libs/libaom/src/aom_dsp/blk_sse_sum.c @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "config/aom_dsp_rtcd.h" + +void aom_get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + *x_sum = 0; + *x2_sum = 0; + for (int i = 0; i < bh; ++i) { + for (int j = 0; j < bw; ++j) { + const int val = data[j]; + *x_sum += val; + *x2_sum += val * val; + } + data += stride; + } +} diff --git a/libs/libaom/src/aom_dsp/entcode.c b/libs/libaom/src/aom_dsp/entcode.c new file mode 100644 index 000000000..aad96c6fc --- /dev/null +++ b/libs/libaom/src/aom_dsp/entcode.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/entcode.h" + +/*Given the current total integer number of bits used and the current value of + rng, computes the fraction number of bits used to OD_BITRES precision. + This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac(). + nbits_total: The number of whole bits currently used, i.e., the value + returned by od_ec_enc_tell() or od_ec_dec_tell(). + rng: The current value of rng from either the encoder or decoder state. + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) { + uint32_t nbits; + int l; + int i; + /*To handle the non-integral number of bits still left in the encoder/decoder + state, we compute the worst-case number of bits of val that must be + encoded to ensure that the value is inside the range for any possible + subsequent bits. + The computation here is independent of val itself (the decoder does not + even track that value), even though the real number of bits used after + od_ec_enc_done() may be 1 smaller if rng is a power of two and the + corresponding trailing bits of val are all zeros. + If we did try to track that special case, then coding a value with a + probability of 1/(1 << n) might sometimes appear to use more than n bits. + This may help explain the surprising result that a newly initialized + encoder or decoder claims to have used 1 bit.*/ + nbits = nbits_total << OD_BITRES; + l = 0; + for (i = OD_BITRES; i-- > 0;) { + int b; + rng = rng * rng >> 15; + b = (int)(rng >> 16); + l = l << 1 | b; + rng >>= b; + } + return nbits - l; +} diff --git a/libs/libaom/src/aom_dsp/entcode.h b/libs/libaom/src/aom_dsp/entcode.h new file mode 100644 index 000000000..751887921 --- /dev/null +++ b/libs/libaom/src/aom_dsp/entcode.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_ENTCODE_H_ +#define AOM_AOM_DSP_ENTCODE_H_ + +#include <limits.h> +#include <stddef.h> +#include "av1/common/odintrin.h" +#include "aom_dsp/prob.h" + +#define EC_PROB_SHIFT 6 +#define EC_MIN_PROB 4 // must be <= (1<<EC_PROB_SHIFT)/16 + +/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic + on a larger type, you can speed up the decoder by using it here.*/ +typedef uint32_t od_ec_window; + +#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT) + +/*The resolution of fractional-precision bit usage measurements, i.e., + 3 => 1/8th bits.*/ +#define OD_BITRES (3) + +#define OD_ICDF AOM_ICDF + +/*See entcode.c for further documentation.*/ + +OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total, + uint32_t rng); + +#endif // AOM_AOM_DSP_ENTCODE_H_ diff --git a/libs/libaom/src/aom_dsp/entdec.c b/libs/libaom/src/aom_dsp/entdec.c new file mode 100644 index 000000000..da43e8a39 --- /dev/null +++ b/libs/libaom/src/aom_dsp/entdec.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include "aom_dsp/entdec.h" +#include "aom_dsp/prob.h" + +/*A range decoder. + This is an entropy decoder based upon \cite{Mar79}, which is itself a + rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}. + It is very similar to arithmetic encoding, except that encoding is done with + digits in any base, instead of with bits, and so it is faster when using + larger bases (i.e.: a byte). + The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$ + is the base, longer than the theoretical optimum, but to my knowledge there + is no published justification for this claim. + This only seems true when using near-infinite precision arithmetic so that + the process is carried out with no rounding errors. + + An excellent description of implementation details is available at + http://www.arturocampos.com/ac_range.html + A recent work \cite{MNW98} which proposes several changes to arithmetic + encoding for efficiency actually re-discovers many of the principles + behind range encoding, and presents a good theoretical analysis of them. + + End of stream is handled by writing out the smallest number of bits that + ensures that the stream will be correctly decoded regardless of the value of + any subsequent bits. + od_ec_dec_tell() can be used to determine how many bits were needed to decode + all the symbols thus far; other data can be packed in the remaining bits of + the input buffer. + @PHDTHESIS{Pas76, + author="Richard Clark Pasco", + title="Source coding algorithms for fast data compression", + school="Dept. of Electrical Engineering, Stanford University", + address="Stanford, CA", + month=May, + year=1976, + URL="http://www.richpasco.org/scaffdc.pdf" + } + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video & Data Recording Conference", + year=1979, + address="Southampton", + month=Jul, + URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H.
Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" + }*/ + +/*This is meant to be a large, positive constant that can still be efficiently + loaded as an immediate (on platforms like ARM, for example). + Even relatively modest values like 100 would work fine.*/ +#define OD_EC_LOTS_OF_BITS (0x4000) + +/*The return value of od_ec_dec_tell does not change across an od_ec_dec_refill + call.*/ +static void od_ec_dec_refill(od_ec_dec *dec) { + int s; + od_ec_window dif; + int16_t cnt; + const unsigned char *bptr; + const unsigned char *end; + dif = dec->dif; + cnt = dec->cnt; + bptr = dec->bptr; + end = dec->end; + s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15); + for (; s >= 0 && bptr < end; s -= 8, bptr++) { + /*Each time a byte is inserted into the window (dif), bptr advances and cnt + is incremented by 8, so the total number of consumed bits (the return + value of od_ec_dec_tell) does not change.*/ + assert(s <= OD_EC_WINDOW_SIZE - 8); + dif ^= (od_ec_window)bptr[0] << s; + cnt += 8; + } + if (bptr >= end) { + /*We've reached the end of the buffer. It is perfectly valid for us to need + to fill the window with additional bits past the end of the buffer (and + this happens in normal operation). These bits should all just be taken + as zero. But we cannot increment bptr past 'end' (this is undefined + behavior), so we start to increment dec->tell_offs. We also don't want + to keep testing bptr against 'end', so we set cnt to OD_EC_LOTS_OF_BITS + and adjust dec->tell_offs so that the total number of unconsumed bits in + the window (dec->cnt - dec->tell_offs) does not change. This effectively + puts lots of zero bits into the window, and means we won't try to refill + it from the buffer for a very long time (at which point we'll put lots + of zero bits into the window again).*/ + dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt; + cnt = OD_EC_LOTS_OF_BITS; + } + dec->dif = dif; + dec->cnt = cnt; + dec->bptr = bptr; +} + +/*Takes updated dif and range values, renormalizes them so that + 32768 <= rng < 65536 (reading more bytes from the stream into dif if + necessary), and stores them back in the decoder context. + dif: The new value of dif. + rng: The new value of the range. + ret: The value to return. + Return: ret. + This allows the compiler to jump to this function via a tail-call.*/ +static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng, + int ret) { + int d; + assert(rng <= 65535U); + /*The number of leading zeros in the 16-bit binary representation of rng.*/ + d = 16 - OD_ILOG_NZ(rng); + /*d bits in dec->dif are consumed.*/ + dec->cnt -= d; + /*This is equivalent to shifting in 1's instead of 0's.*/ + dec->dif = ((dif + 1) << d) - 1; + dec->rng = rng << d; + if (dec->cnt < 0) od_ec_dec_refill(dec); + return ret; +} + +/*Initializes the decoder. + buf: The input buffer to use. + storage: The size in bytes of the input buffer.*/ +void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, + uint32_t storage) { + dec->buf = buf; + dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8); + dec->end = buf + storage; + dec->bptr = buf; + dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1; + dec->rng = 0x8000; + dec->cnt = -15; + od_ec_dec_refill(dec); +} + +/*Decode a single binary value. + f: The probability that the bit is one, scaled by 32768. 
+ Return: The value decoded (0 or 1).*/ +int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { + od_ec_window dif; + od_ec_window vw; + unsigned r; + unsigned r_new; + unsigned v; + int ret; + assert(0 < f); + assert(f < 32768U); + dif = dec->dif; + r = dec->rng; + assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r); + assert(32768U <= r); + v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); + v += EC_MIN_PROB; + vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); + ret = 1; + r_new = v; + if (dif >= vw) { + r_new = r - v; + dif -= vw; + ret = 0; + } + return od_ec_dec_normalize(dec, dif, r_new, ret); +} + +/*Decodes a symbol given an inverse cumulative distribution function (CDF) + table in Q15. + icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range + [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]). + The values must be monotonically non-increasing, and icdf[nsyms - 1] + must be 0. + nsyms: The number of symbols in the alphabet. + This should be at most 16. + Return: The decoded symbol s.*/ +int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) { + od_ec_window dif; + unsigned r; + unsigned c; + unsigned u; + unsigned v; + int ret; + (void)nsyms; + dif = dec->dif; + r = dec->rng; + const int N = nsyms - 1; + + assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r); + assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP)); + assert(32768U <= r); + assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0); + c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16)); + v = r; + ret = -1; + do { + u = v; + v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT - CDF_SHIFT)); + v += EC_MIN_PROB * (N - ret); + } while (c < v); + assert(v < u); + assert(u <= r); + r = u - v; + dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); + return od_ec_dec_normalize(dec, dif, r, ret); +} + +/*Returns the number of bits "used" by the decoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Return: The number of bits. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +int od_ec_dec_tell(const od_ec_dec *dec) { + /*There is a window of bits stored in dec->dif. The difference + (dec->bptr - dec->buf) tells us how many bytes have been read into this + window. The difference (dec->cnt - dec->tell_offs) tells us how many of + the bits in that window remain unconsumed.*/ + return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs); +} + +/*Returns the number of bits "used" by the decoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) { + return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng); +} diff --git a/libs/libaom/src/aom_dsp/entdec.h b/libs/libaom/src/aom_dsp/entdec.h new file mode 100644 index 000000000..c74616777 --- /dev/null +++ b/libs/libaom/src/aom_dsp/entdec.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_ENTDEC_H_ +#define AOM_AOM_DSP_ENTDEC_H_ +#include <stddef.h> +#include "aom_dsp/entcode.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct od_ec_dec od_ec_dec; + +#if defined(OD_ACCOUNTING) && OD_ACCOUNTING +#define OD_ACC_STR , char *acc_str +#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str) +#else +#define OD_ACC_STR +#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb) +#endif + +/*The entropy decoder context.*/ +struct od_ec_dec { + /*The start of the current input buffer.*/ + const unsigned char *buf; + /*An offset used to keep track of tell after reaching the end of the stream. + This is constant throughout most of the decoding process, but becomes + important once we hit the end of the buffer and stop incrementing bptr + (and instead pretend cnt has lots of bits).*/ + int32_t tell_offs; + /*The end of the current input buffer.*/ + const unsigned char *end; + /*The read pointer for the entropy-coded bits.*/ + const unsigned char *bptr; + /*The difference between the high end of the current range, (low + rng), and + the coded value, minus 1. + This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the + decoder only uses the top 16 bits of the window to decode the next symbol. + As we shift up during renormalization, if we don't have enough bits left in + the window to fill the top 16, we'll read in more bits of the coded + value.*/ + od_ec_window dif; + /*The number of values in the current range.*/ + uint16_t rng; + /*The number of bits of data in the current value.*/ + int16_t cnt; +}; + +/*See entdec.c for further documentation.*/ + +void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec, + const uint16_t *cdf, int nsyms) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) + OD_ARG_NONNULL(1); + +OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) + OD_ARG_NONNULL(1); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_ENTDEC_H_ diff --git a/libs/libaom/src/aom_dsp/entenc.c b/libs/libaom/src/aom_dsp/entenc.c new file mode 100644 index 000000000..2fd4493ea --- /dev/null +++ b/libs/libaom/src/aom_dsp/entenc.c @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <assert.h> +#include "aom_dsp/entenc.h" +#include "aom_dsp/prob.h" + +#if OD_MEASURE_EC_OVERHEAD +#if !defined(M_LOG2E) +#define M_LOG2E (1.4426950408889634073599246810019) +#endif +#define OD_LOG2(x) (M_LOG2E * log(x)) +#endif // OD_MEASURE_EC_OVERHEAD + +/*A range encoder. + See entdec.c and the references for implementation details \cite{Mar79,MNW98}. + + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video \& Data Recording Conference", + year=1979, + address="Southampton", + month=Jul, + URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H. Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" + }*/ + +/*Takes updated low and range values, renormalizes them so that + 32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if + necessary), and stores them back in the encoder context. + low: The new value of low. + rng: The new value of the range.*/ +static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low, + unsigned rng) { + int d; + int c; + int s; + c = enc->cnt; + assert(rng <= 65535U); + /*The number of leading zeros in the 16-bit binary representation of rng.*/ + d = 16 - OD_ILOG_NZ(rng); + s = c + d; + /*TODO: Right now we flush every time we have at least one byte available. + Instead we should use an od_ec_window and flush right before we're about to + shift bits off the end of the window. + For a 32-bit window this is about the same amount of work, but for a 64-bit + window it should be a fair win.*/ + if (s >= 0) { + uint16_t *buf; + uint32_t storage; + uint32_t offs; + unsigned m; + buf = enc->precarry_buf; + storage = enc->precarry_storage; + offs = enc->offs; + if (offs + 2 > storage) { + storage = 2 * storage + 2; + buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage); + if (buf == NULL) { + enc->error = -1; + enc->offs = 0; + return; + } + enc->precarry_buf = buf; + enc->precarry_storage = storage; + } + c += 16; + m = (1 << c) - 1; + if (s >= 8) { + assert(offs < storage); + buf[offs++] = (uint16_t)(low >> c); + low &= m; + c -= 8; + m >>= 8; + } + assert(offs < storage); + buf[offs++] = (uint16_t)(low >> c); + s = c + d - 24; + low &= m; + enc->offs = offs; + } + enc->low = low << d; + enc->rng = rng << d; + enc->cnt = s; +} + +/*Initializes the encoder.
+ size: The initial size of the buffer, in bytes.*/ +void od_ec_enc_init(od_ec_enc *enc, uint32_t size) { + od_ec_enc_reset(enc); + enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size); + enc->storage = size; + if (size > 0 && enc->buf == NULL) { + enc->storage = 0; + enc->error = -1; + } + enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size); + enc->precarry_storage = size; + if (size > 0 && enc->precarry_buf == NULL) { + enc->precarry_storage = 0; + enc->error = -1; + } +} + +/*Reinitializes the encoder.*/ +void od_ec_enc_reset(od_ec_enc *enc) { + enc->offs = 0; + enc->low = 0; + enc->rng = 0x8000; + /*This is initialized to -9 so that it crosses zero after we've accumulated + one byte + one carry bit.*/ + enc->cnt = -9; + enc->error = 0; +#if OD_MEASURE_EC_OVERHEAD + enc->entropy = 0; + enc->nb_symbols = 0; +#endif +} + +/*Frees the buffers used by the encoder.*/ +void od_ec_enc_clear(od_ec_enc *enc) { + free(enc->precarry_buf); + free(enc->buf); +} + +/*Encodes a symbol given its frequency in Q15. + fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come + before the + one to be encoded. + fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and + including + the one to be encoded.*/ +static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s, + int nsyms) { + od_ec_window l; + unsigned r; + unsigned u; + unsigned v; + l = enc->low; + r = enc->rng; + assert(32768U <= r); + assert(fh <= fl); + assert(fl <= 32768U); + assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0); + const int N = nsyms - 1; + if (fl < CDF_PROB_TOP) { + u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT - CDF_SHIFT)) + + EC_MIN_PROB * (N - (s - 1)); + v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT - CDF_SHIFT)) + + EC_MIN_PROB * (N - (s + 0)); + l += r - u; + r = u - v; + } else { + r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT - CDF_SHIFT)) + + EC_MIN_PROB * (N - (s + 0)); + } + od_ec_enc_normalize(enc, l, r); +#if OD_MEASURE_EC_OVERHEAD + enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP); + enc->nb_symbols++; +#endif +} + +/*Encode a single binary value. + val: The value to encode (0 or 1). + f: The probability that the val is one, scaled by 32768.*/ +void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { + od_ec_window l; + unsigned r; + unsigned v; + assert(0 < f); + assert(f < 32768U); + l = enc->low; + r = enc->rng; + assert(32768U <= r); + v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); + v += EC_MIN_PROB; + if (val) l += r - v; + r = val ? v : r - v; + od_ec_enc_normalize(enc, l, r); +#if OD_MEASURE_EC_OVERHEAD + enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.); + enc->nb_symbols++; +#endif +} + +/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15. + s: The index of the symbol to encode. + icdf: 32768 minus the CDF, such that symbol s falls in the range + [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]). + The values must be monotonically decreasing, and icdf[nsyms - 1] must + be 0. + nsyms: The number of symbols in the alphabet. + This should be at most 16.*/ +void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf, + int nsyms) { + (void)nsyms; + assert(s >= 0); + assert(s < nsyms); + assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP)); + od_ec_encode_q15(enc, s > 0 ?
icdf[s - 1] : OD_ICDF(0), icdf[s], s, nsyms); +} + +/*Overwrites a few bits at the very start of an existing stream, after they + have already been encoded. + This makes it possible to have a few flags up front, where it is easy for + decoders to access them without parsing the whole stream, even if their + values are not determined until late in the encoding process, without having + to buffer all the intermediate symbols in the encoder. + In order for this to work, at least nbits bits must have already been encoded + using probabilities that are an exact power of two. + The encoder can verify the number of encoded bits is sufficient, but cannot + check this latter condition. + val: The bits to encode (in the least nbits significant bits). + They will be decoded in order from most-significant to least. + nbits: The number of bits to overwrite. + This must be no more than 8.*/ +void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) { + int shift; + unsigned mask; + assert(nbits >= 0); + assert(nbits <= 8); + assert(val < 1U << nbits); + shift = 8 - nbits; + mask = ((1U << nbits) - 1) << shift; + if (enc->offs > 0) { + /*The first byte has been finalized.*/ + enc->precarry_buf[0] = + (uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift); + } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) { + /*The first byte has yet to be output.*/ + enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) | + (od_ec_window)val << (16 + enc->cnt + shift); + } else { + /*The encoder hasn't even encoded _nbits of data yet.*/ + enc->error = -1; + } +} + +#if OD_MEASURE_EC_OVERHEAD +#include +#endif + +/*Indicates that there are no more symbols to encode. + All remaining output bytes are flushed to the output buffer. + od_ec_enc_reset() should be called before using the encoder again. + bytes: Returns the size of the encoded data in the returned buffer. + Return: A pointer to the start of the final buffer, or NULL if there was an + encoding error.*/ +unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) { + unsigned char *out; + uint32_t storage; + uint16_t *buf; + uint32_t offs; + od_ec_window m; + od_ec_window e; + od_ec_window l; + int c; + int s; + if (enc->error) return NULL; +#if OD_MEASURE_EC_OVERHEAD + { + uint32_t tell; + /* Don't count the 1 bit we lose to raw bits as overhead. 
*/ + tell = od_ec_enc_tell(enc) - 1; + fprintf(stderr, "overhead: %f%%\n", + 100 * (tell - enc->entropy) / enc->entropy); + fprintf(stderr, "efficiency: %f bits/symbol\n", + (double)tell / enc->nb_symbols); + } +#endif + /*We output the minimum number of bits that ensures that the symbols encoded + thus far will be decoded correctly regardless of the bits that follow.*/ + l = enc->low; + c = enc->cnt; + s = 10; + m = 0x3FFF; + e = ((l + m) & ~m) | (m + 1); + s += c; + offs = enc->offs; + buf = enc->precarry_buf; + if (s > 0) { + unsigned n; + storage = enc->precarry_storage; + if (offs + ((s + 7) >> 3) > storage) { + storage = storage * 2 + ((s + 7) >> 3); + buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage); + if (buf == NULL) { + enc->error = -1; + return NULL; + } + enc->precarry_buf = buf; + enc->precarry_storage = storage; + } + n = (1 << (c + 16)) - 1; + do { + assert(offs < storage); + buf[offs++] = (uint16_t)(e >> (c + 16)); + e &= n; + s -= 8; + c -= 8; + n >>= 8; + } while (s > 0); + } + /*Make sure there's enough room for the entropy-coded bits.*/ + out = enc->buf; + storage = enc->storage; + c = OD_MAXI((s + 7) >> 3, 0); + if (offs + c > storage) { + storage = offs + c; + out = (unsigned char *)realloc(out, sizeof(*out) * storage); + if (out == NULL) { + enc->error = -1; + return NULL; + } + enc->buf = out; + enc->storage = storage; + } + *nbytes = offs; + /*Perform carry propagation.*/ + assert(offs <= storage); + out = out + storage - offs; + c = 0; + while (offs > 0) { + offs--; + c = buf[offs] + c; + out[offs] = (unsigned char)c; + c >>= 8; + } + /*Note: Unless there's an allocation error, if you keep encoding into the + current buffer and call this function again later, everything will work + just fine (you won't get a new packet out, but you will get a single + buffer with the new data appended to the old). + However, this function is O(N) where N is the amount of data coded so far, + so calling it more than once for a given packet is a bad idea.*/ + return out; +} + +/*Returns the number of bits "used" by the encoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Warning: The value returned by this function can decrease compared to an + earlier call, even after encoding more data, if there is an encoding error + (i.e., a failure to allocate enough space for the output buffer). + Return: The number of bits. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +int od_ec_enc_tell(const od_ec_enc *enc) { + /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra + bit, which we reserve for terminating the stream.*/ + return (enc->cnt + 10) + enc->offs * 8; +} + +/*Returns the number of bits "used" by the encoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Warning: The value returned by this function can decrease compared to an + earlier call, even after encoding more data, if there is an encoding error + (i.e., a failure to allocate enough space for the output buffer). + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) { + return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng); +} + +/*Saves an entropy coder checkpoint to dst.
+ This allows an encoder to reverse a series of entropy coder + decisions if it decides that the information would have been + better coded some other way.*/ +void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) { + OD_COPY(dst, src, 1); +} + +/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint. + This can only be used to restore from checkpoints earlier in the target + state's history: you cannot switch backwards and forwards or otherwise + switch to a state which isn't a causal ancestor of the current state. + Restore is also incompatible with patching the initial bits, as the + changes will remain in the restored version.*/ +void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) { + unsigned char *buf; + uint32_t storage; + uint16_t *precarry_buf; + uint32_t precarry_storage; + assert(dst->storage >= src->storage); + assert(dst->precarry_storage >= src->precarry_storage); + buf = dst->buf; + storage = dst->storage; + precarry_buf = dst->precarry_buf; + precarry_storage = dst->precarry_storage; + OD_COPY(dst, src, 1); + dst->buf = buf; + dst->storage = storage; + dst->precarry_buf = precarry_buf; + dst->precarry_storage = precarry_storage; +} diff --git a/libs/libaom/src/aom_dsp/entenc.h b/libs/libaom/src/aom_dsp/entenc.h new file mode 100644 index 000000000..3551d4250 --- /dev/null +++ b/libs/libaom/src/aom_dsp/entenc.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_ENTENC_H_ +#define AOM_AOM_DSP_ENTENC_H_ +#include <stddef.h> +#include "aom_dsp/entcode.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct od_ec_enc od_ec_enc; + +#define OD_MEASURE_EC_OVERHEAD (0) + +/*The entropy encoder context.*/ +struct od_ec_enc { + /*Buffered output.
+ This contains only the raw bits until the final call to od_ec_enc_done(), + where all the arithmetic-coded data gets prepended to it.*/ + unsigned char *buf; + /*The size of the buffer.*/ + uint32_t storage; + /*A buffer for output bytes with their associated carry flags.*/ + uint16_t *precarry_buf; + /*The size of the pre-carry buffer.*/ + uint32_t precarry_storage; + /*The offset at which the next entropy-coded byte will be written.*/ + uint32_t offs; + /*The low end of the current range.*/ + od_ec_window low; + /*The number of values in the current range.*/ + uint16_t rng; + /*The number of bits of data in the current value.*/ + int16_t cnt; + /*Nonzero if an error occurred.*/ + int error; +#if OD_MEASURE_EC_OVERHEAD + double entropy; + int nb_symbols; +#endif +}; + +/*See entenc.c for further documentation.*/ + +void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1); +void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1); +void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1); + +void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15) + OD_ARG_NONNULL(1); +void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(3); + +void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) + OD_ARG_NONNULL(1); + +void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc, + uint32_t *nbytes) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) + OD_ARG_NONNULL(1); + +void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src); +void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_ENTENC_H_ diff --git a/libs/libaom/src/aom_dsp/fastssim.c b/libs/libaom/src/aom_dsp/fastssim.c new file mode 100644 index 000000000..3804519b3 --- /dev/null +++ b/libs/libaom/src/aom_dsp/fastssim.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * This code was originally written by: Nathan E. Egge, at the Daala + * project. + */ +#include <assert.h> +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/ssim.h" +#include "aom_ports/system_state.h" + +typedef struct fs_level fs_level; +typedef struct fs_ctx fs_ctx; + +#define SSIM_C1 (255 * 255 * 0.01 * 0.01) +#define SSIM_C2 (255 * 255 * 0.03 * 0.03) +#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01) +#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01) +#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03) +#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03) + +#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b)) +#define FS_MAXI(_a, _b) ((_a) > (_b) ?
(_a) : (_b)) + +struct fs_level { + uint32_t *im1; + uint32_t *im2; + double *ssim; + int w; + int h; +}; + +struct fs_ctx { + fs_level *level; + int nlevels; + unsigned *col_buf; +}; + +static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { + unsigned char *data; + size_t data_size; + int lw; + int lh; + int l; + lw = (_w + 1) >> 1; + lh = (_h + 1) >> 1; + data_size = + _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); + for (l = 0; l < _nlevels; l++) { + size_t im_size; + size_t level_size; + im_size = lw * (size_t)lh; + level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); + level_size += sizeof(*_ctx->level[l].ssim) - 1; + level_size /= sizeof(*_ctx->level[l].ssim); + level_size += im_size; + level_size *= sizeof(*_ctx->level[l].ssim); + data_size += level_size; + lw = (lw + 1) >> 1; + lh = (lh + 1) >> 1; + } + data = (unsigned char *)malloc(data_size); + _ctx->level = (fs_level *)data; + _ctx->nlevels = _nlevels; + data += _nlevels * sizeof(*_ctx->level); + lw = (_w + 1) >> 1; + lh = (_h + 1) >> 1; + for (l = 0; l < _nlevels; l++) { + size_t im_size; + size_t level_size; + _ctx->level[l].w = lw; + _ctx->level[l].h = lh; + im_size = lw * (size_t)lh; + level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); + level_size += sizeof(*_ctx->level[l].ssim) - 1; + level_size /= sizeof(*_ctx->level[l].ssim); + level_size *= sizeof(*_ctx->level[l].ssim); + _ctx->level[l].im1 = (uint32_t *)data; + _ctx->level[l].im2 = _ctx->level[l].im1 + im_size; + data += level_size; + _ctx->level[l].ssim = (double *)data; + data += im_size * sizeof(*_ctx->level[l].ssim); + lw = (lw + 1) >> 1; + lh = (lh + 1) >> 1; + } + _ctx->col_buf = (unsigned *)data; +} + +static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } + +static void fs_downsample_level(fs_ctx *_ctx, int _l) { + const uint32_t *src1; + const uint32_t *src2; + uint32_t *dst1; + uint32_t *dst2; + int w2; + int h2; + int w; + int h; + int i; + int j; + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + dst1 = _ctx->level[_l].im1; + dst2 = _ctx->level[_l].im2; + w2 = _ctx->level[_l - 1].w; + h2 = _ctx->level[_l - 1].h; + src1 = _ctx->level[_l - 1].im1; + src2 = _ctx->level[_l - 1].im2; + for (j = 0; j < h; j++) { + int j0offs; + int j1offs; + j0offs = 2 * j * w2; + j1offs = FS_MINI(2 * j + 1, h2) * w2; + for (i = 0; i < w; i++) { + int i0; + int i1; + i0 = 2 * i; + i1 = FS_MINI(i0 + 1, w2); + dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] + + src1[j1offs + i0] + src1[j1offs + i1]; + dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] + + src2[j1offs + i0] + src2[j1offs + i1]; + } + } +} + +static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, + int _s1ystride, const uint8_t *_src2, + int _s2ystride, int _w, int _h, uint32_t shift, + int buf_is_hbd) { + uint32_t *dst1; + uint32_t *dst2; + int w; + int h; + int i; + int j; + w = _ctx->level[0].w; + h = _ctx->level[0].h; + dst1 = _ctx->level[0].im1; + dst2 = _ctx->level[0].im2; + for (j = 0; j < h; j++) { + int j0; + int j1; + j0 = 2 * j; + j1 = FS_MINI(j0 + 1, _h); + for (i = 0; i < w; i++) { + int i0; + int i1; + i0 = 2 * i; + i1 = FS_MINI(i0 + 1, _w); + if (!buf_is_hbd) { + dst1[j * w + i] = + _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] + + _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1]; + dst2[j * w + i] = + _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] + + _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1]; + } else { + uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1); + uint16_t *src2s = 
CONVERT_TO_SHORTPTR(_src2); + dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) + + (src1s[j0 * _s1ystride + i1] >> shift) + + (src1s[j1 * _s1ystride + i0] >> shift) + + (src1s[j1 * _s1ystride + i1] >> shift); + dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) + + (src2s[j0 * _s2ystride + i1] >> shift) + + (src2s[j1 * _s2ystride + i0] >> shift) + + (src2s[j1 * _s2ystride + i1] >> shift); + } + } + } +} + +static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { + unsigned *col_sums_x; + unsigned *col_sums_y; + uint32_t *im1; + uint32_t *im2; + double *ssim; + double c1; + int w; + int h; + int j0offs; + int j1offs; + int i; + int j; + double ssim_c1 = SSIM_C1; + + if (bit_depth == 10) ssim_c1 = SSIM_C1_10; + if (bit_depth == 12) ssim_c1 = SSIM_C1_12; + + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + col_sums_x = _ctx->col_buf; + col_sums_y = col_sums_x + w; + im1 = _ctx->level[_l].im1; + im2 = _ctx->level[_l].im2; + for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i]; + for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i]; + for (j = 1; j < 4; j++) { + j1offs = FS_MINI(j, h - 1) * w; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + } + ssim = _ctx->level[_l].ssim; + c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); + for (j = 0; j < h; j++) { + unsigned mux; + unsigned muy; + int i0; + int i1; + mux = 5 * col_sums_x[0]; + muy = 5 * col_sums_y[0]; + for (i = 1; i < 4; i++) { + i1 = FS_MINI(i, w - 1); + mux += col_sums_x[i1]; + muy += col_sums_y[i1]; + } + for (i = 0; i < w; i++) { + ssim[j * w + i] *= (2 * mux * (double)muy + c1) / + (mux * (double)mux + muy * (double)muy + c1); + if (i + 1 < w) { + i0 = FS_MAXI(0, i - 4); + i1 = FS_MINI(i + 4, w - 1); + mux += col_sums_x[i1] - col_sums_x[i0]; + muy += col_sums_y[i1] - col_sums_y[i0]; + } + } + if (j + 1 < h) { + j0offs = FS_MAXI(0, j - 4) * w; + for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; + j1offs = FS_MINI(j + 4, h - 1) * w; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + } + } +} + +#define FS_COL_SET(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] = gx * (double)gx; \ + col_sums_gy2[(_col)] = gy * (double)gy; \ + col_sums_gxgy[(_col)] = gx * (double)gy; \ + } while (0) + +#define FS_COL_ADD(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] += gx * (double)gx; \ + col_sums_gy2[(_col)] += gy * (double)gy; \ + col_sums_gxgy[(_col)] += gx * (double)gy; \ + } while (0) + +#define FS_COL_SUB(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] -= gx * (double)gx; \ + col_sums_gy2[(_col)] -= gy * (double)gy; \ + col_sums_gxgy[(_col)] -= gx * (double)gy; \ + } while (0) + +#define FS_COL_COPY(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \ + } while (0) + +#define FS_COL_HALVE(_col1,
_col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \ + } while (0) + +#define FS_COL_DOUBLE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \ + } while (0) + +static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { + uint32_t *im1; + uint32_t *im2; + unsigned *gx_buf; + unsigned *gy_buf; + double *ssim; + double col_sums_gx2[8]; + double col_sums_gy2[8]; + double col_sums_gxgy[8]; + double c2; + int stride; + int w; + int h; + int i; + int j; + double ssim_c2 = SSIM_C2; + if (bit_depth == 10) ssim_c2 = SSIM_C2_10; + if (bit_depth == 12) ssim_c2 = SSIM_C2_12; + + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + im1 = _ctx->level[_l].im1; + im2 = _ctx->level[_l].im2; + ssim = _ctx->level[_l].ssim; + gx_buf = _ctx->col_buf; + stride = w + 8; + gy_buf = gx_buf + 8 * stride; + memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf)); + c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104; + for (j = 0; j < h + 4; j++) { + if (j < h - 1) { + for (i = 0; i < w - 1; i++) { + unsigned g1; + unsigned g2; + unsigned gx; + unsigned gy; + g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]); + g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]); + gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); + g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]); + g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]); + gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); + gx_buf[(j & 7) * stride + i + 4] = gx; + gy_buf[(j & 7) * stride + i + 4] = gy; + } + } else { + memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf)); + memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf)); + } + if (j >= 4) { + int k; + col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0; + col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0; + col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] = + col_sums_gxgy[0] = 0; + for (i = 4; i < 8; i++) { + FS_COL_SET(i, -1, 0); + FS_COL_ADD(i, 0, 0); + for (k = 1; k < 8 - i; k++) { + FS_COL_DOUBLE(i, i); + FS_COL_ADD(i, -k - 1, 0); + FS_COL_ADD(i, k, 0); + } + } + for (i = 0; i < w; i++) { + double mugx2; + double mugy2; + double mugxgy; + mugx2 = col_sums_gx2[0]; + for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k]; + mugy2 = col_sums_gy2[0]; + for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k]; + mugxgy = col_sums_gxgy[0]; + for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k]; + ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2); + if (i + 1 < w) { + FS_COL_SET(0, -1, 1); + FS_COL_ADD(0, 0, 1); + FS_COL_SUB(2, -3, 2); + FS_COL_SUB(2, 2, 2); + FS_COL_HALVE(1, 2); + FS_COL_SUB(3, -4, 3); + FS_COL_SUB(3, 3, 3); + FS_COL_HALVE(2, 3); + FS_COL_COPY(3, 4); + FS_COL_DOUBLE(4, 5); + FS_COL_ADD(4, -4, 5); + FS_COL_ADD(4, 3, 5); + FS_COL_DOUBLE(5, 6); + FS_COL_ADD(5, -3, 6); + FS_COL_ADD(5, 2, 6); + FS_COL_DOUBLE(6, 7); + FS_COL_ADD(6, -2, 7); + FS_COL_ADD(6, 1, 7); + FS_COL_SET(7, -1, 8); + FS_COL_ADD(7, 0, 8); + } + } + } + } +} + +#define FS_NLEVELS (4) + +/*These weights were derived from the default weights found in Wang's original + Matlab implementation: {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}.
+ We drop the finest scale and renormalize the rest to sum to 1.*/ + +static const double FS_WEIGHTS[FS_NLEVELS] = { + 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625 +}; + +static double fs_average(fs_ctx *_ctx, int _l) { + double *ssim; + double ret; + int w; + int h; + int i; + int j; + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + ssim = _ctx->level[_l].ssim; + ret = 0; + for (j = 0; j < h; j++) + for (i = 0; i < w; i++) ret += ssim[j * w + i]; + return pow(ret / (w * h), FS_WEIGHTS[_l]); +} + +static double convert_ssim_db(double _ssim, double _weight) { + assert(_weight >= _ssim); + if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB; + return 10 * (log10(_weight) - log10(_weight - _ssim)); +} + +static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, + int _dystride, int _w, int _h, uint32_t _bd, + uint32_t _shift, int buf_is_hbd) { + fs_ctx ctx; + double ret; + int l; + ret = 1; + fs_ctx_init(&ctx, _w, _h, FS_NLEVELS); + fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift, + buf_is_hbd); + for (l = 0; l < FS_NLEVELS - 1; l++) { + fs_calc_structure(&ctx, l, _bd); + ret *= fs_average(&ctx, l); + fs_downsample_level(&ctx, l + 1); + } + fs_calc_structure(&ctx, l, _bd); + fs_apply_luminance(&ctx, l, _bd); + ret *= fs_average(&ctx, l); + fs_ctx_clear(&ctx); + return ret; +} + +double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd) { + double ssimv; + uint32_t bd_shift = 0; + aom_clear_system_state(); + assert(bd >= in_bd); + assert(source->flags == dest->flags); + int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH; + bd_shift = bd - in_bd; + + *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer, + dest->y_stride, source->y_crop_width, + source->y_crop_height, in_bd, bd_shift, buf_is_hbd); + *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, bd_shift, buf_is_hbd); + *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, bd_shift, buf_is_hbd); + ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v)); + return convert_ssim_db(ssimv, 1.0); +} diff --git a/libs/libaom/src/aom_dsp/fft.c b/libs/libaom/src/aom_dsp/fft.c new file mode 100644 index 000000000..0ba71cfb3 --- /dev/null +++ b/libs/libaom/src/aom_dsp/fft.c @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +static INLINE void simple_transpose(const float *A, float *B, int n) { + for (int y = 0; y < n; y++) { + for (int x = 0; x < n; x++) { + B[y * n + x] = A[x * n + y]; + } + } +} + +// The 1d transform is real to complex and packs the complex results in +// a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real +// components, followed by the n/2 - 1 imaginary components). After the +// transform is done on the rows, the first n/2 + 1 columns are real, and +// the remaining are the imaginary components. After the transform on the +// columns, the region of [0, n/2]x[0, n/2] contains the real part of +// fft of the real columns. The real part of the 2d fft also includes the +// imaginary part of transformed imaginary columns. This function assembles +// the correct outputs while putting the real and imaginary components +// next to each other. +static INLINE void unpack_2d_output(const float *col_fft, float *output, + int n) { + for (int y = 0; y <= n / 2; ++y) { + const int y2 = y + n / 2; + const int y_extra = y2 > n / 2 && y2 < n; + + for (int x = 0; x <= n / 2; ++x) { + const int x2 = x + n / 2; + const int x_extra = x2 > n / 2 && x2 < n; + output[2 * (y * n + x)] = + col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); + output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) + + (x_extra ? col_fft[y * n + x2] : 0); + if (y_extra) { + output[2 * ((n - y) * n + x)] = + col_fft[y * n + x] + + (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); + output[2 * ((n - y) * n + x) + 1] = + -(y_extra ? col_fft[y2 * n + x] : 0) + + (x_extra ? col_fft[y * n + x2] : 0); + } + } + } +} + +void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, + aom_fft_unpack_func_t unpack, int vec_size) { + for (int x = 0; x < n; x += vec_size) { + tform(input + x, output + x, n); + } + transpose(output, temp, n); + + for (int x = 0; x < n; x += vec_size) { + tform(temp + x, output + x, n); + } + transpose(output, temp, n); + + unpack(temp, output, n); +} + +static INLINE void store_float(float *output, float input) { *output = input; } +static INLINE float add_float(float a, float b) { return a + b; } +static INLINE float sub_float(float a, float b) { return a - b; } +static INLINE float mul_float(float a, float b) { return a * b; } + +GEN_FFT_2(void, float, float, float, *, store_float); +GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float, + sub_float); +GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float); +GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float); +GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float); + +void aom_fft2x2_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft4x4_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft8x8_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft16x16_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 
16, aom_fft1d_16_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft32x32_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi, + aom_fft_1d_func_t ifft_multi, + aom_fft_transpose_func_t transpose, int vec_size) { + // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft + // and get real outputs. + for (int y = 0; y <= n / 2; ++y) { + output[y * n] = input[2 * y * n]; + output[y * n + 1] = input[2 * (y * n + n / 2)]; + } + for (int y = n / 2 + 1; y < n; ++y) { + output[y * n] = input[2 * (y - n / 2) * n + 1]; + output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1]; + } + + for (int i = 0; i < 2; i += vec_size) { + ifft_multi(output + i, temp + i, n); + } + + // For the other columns, since we don't have a full ifft for complex inputs + // we have to split them into the real and imaginary counterparts. + // Pack the real component, then the imaginary components. + for (int y = 0; y < n; ++y) { + for (int x = 1; x < n / 2; ++x) { + output[y * n + (x + 1)] = input[2 * (y * n + x)]; + } + for (int x = 1; x < n / 2; ++x) { + output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1]; + } + } + for (int y = 2; y < vec_size; y++) { + fft_single(output + y, temp + y, n); + } + // This is the part that can be sped up with SIMD + for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) { + fft_multi(output + y, temp + y, n); + } + + // Put the 0 and n/2 th results in the correct place. + for (int x = 0; x < n; ++x) { + output[x] = temp[x * n]; + output[(n / 2) * n + x] = temp[x * n + 1]; + } + // This rearranges and transposes. + for (int y = 1; y < n / 2; ++y) { + // Fill in the real columns + for (int x = 0; x <= n / 2; ++x) { + output[x + y * n] = + temp[(y + 1) + x * n] + + ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0); + } + for (int x = n / 2 + 1; x < n; ++x) { + output[x + y * n] = temp[(y + 1) + (n - x) * n] - + temp[(y + n / 2) + ((n - x) + n / 2) * n]; + } + // Fill in the imag columns + for (int x = 0; x <= n / 2; ++x) { + output[x + (y + n / 2) * n] = + temp[(y + n / 2) + x * n] - + ((x > 0 && x < n / 2) ? 
temp[(y + 1) + (x + n / 2) * n] : 0);
+    }
+    for (int x = n / 2 + 1; x < n; ++x) {
+      output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
+                                    temp[(y + n / 2) + (n - x) * n];
+    }
+  }
+  for (int y = 0; y < n; y += vec_size) {
+    ifft_multi(output + y, temp + y, n);
+  }
+  transpose(temp, output, n);
+}
+
+GEN_IFFT_2(void, float, float, float, *, store_float);
+GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
+           sub_float);
+GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
+           sub_float, mul_float);
+GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
+            sub_float, mul_float);
+GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
+            sub_float, mul_float);
+
+void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
+                  aom_ifft1d_2_float, simple_transpose, 1);
+}
+
+void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
+                  aom_ifft1d_4_float, simple_transpose, 1);
+}
+
+void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
+                  aom_ifft1d_8_float, simple_transpose, 1);
+}
+
+void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+                  aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
+}
+
+void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
+  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+                  aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
+}
diff --git a/libs/libaom/src/aom_dsp/fft_common.h b/libs/libaom/src/aom_dsp/fft_common.h
new file mode 100644
index 000000000..5137331ae
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/fft_common.h
@@ -0,0 +1,1050 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FFT_COMMON_H_
+#define AOM_AOM_DSP_FFT_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief A function pointer for computing 1d fft and ifft.
+ *
+ * The function will point to an implementation for a specific transform size,
+ * and may perform the transforms using vectorized instructions.
+ *
+ * For a non-vectorized forward transform of size n, the input and output
+ * buffers will be size n. The output takes advantage of conjugate symmetry and
+ * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
+ * (r_{j}, i_{j}) is the complex output for index j.
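+ * For example, for n = 4 the packed forward output is [r_0, r_1, r_2, i_1]:
+ * n/2 + 1 = 3 real components followed by n/2 - 1 = 1 imaginary component.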
+ *
+ * An inverse transform will assume that the complex "input" is packed
+ * similarly. Its output will be real.
+ *
+ * Non-vectorized transforms (e.g., on a single row) would use a stride of 1.
+ *
+ * Vectorized implementations are parallelized along the columns so that the
+ * fft can be performed on multiple columns at a time. In such cases the data
+ * block for input and output is typically square (n x n) and the stride will
+ * correspond to the spacing between rows. At minimum, the input size must be
+ * n x simd_vector_length.
+ *
+ * \param[in]  input   Input buffer. See above for size restrictions.
+ * \param[out] output  Output buffer. See above for size restrictions.
+ * \param[in]  stride  The spacing, in number of elements, between rows (or,
+ *                     for a non-vectorized transform, between elements).
+ */
+typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
+                                  int stride);
+
+// Declare some of the forward non-vectorized transforms, which are used in
+// some of the vectorized implementations.
+void aom_fft1d_4_float(const float *input, float *output, int stride);
+void aom_fft1d_8_float(const float *input, float *output, int stride);
+void aom_fft1d_16_float(const float *input, float *output, int stride);
+void aom_fft1d_32_float(const float *input, float *output, int stride);
+
+/*!\brief Function pointer for transposing a matrix of floats.
+ *
+ * \param[in]  input   Input buffer (size n x n)
+ * \param[out] output  Output buffer (size n x n)
+ * \param[in]  n       Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
+                                         int n);
+
+/*!\brief Function pointer for re-arranging intermediate 2d transform results.
+ *
+ * After re-arrangement, the real and imaginary components will be packed
+ * tightly next to each other.
+ *
+ * \param[in]  input   Input buffer (size n x n)
+ * \param[out] output  Output buffer (size 2 x n x n)
+ * \param[in]  n       Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
+
+/*!\brief Performs a 2d fft with the given functions.
+ *
+ * This generator function allows for multiple different implementations of 2d
+ * fft with different vector operations, without having to redefine the main
+ * body multiple times.
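+ *
+ * As a usage sketch: the plain C wrappers in fft.c (e.g. aom_fft8x8_float_c)
+ * call this generator with the scalar 1d transforms and vec_size = 1, so a
+ * caller only needs to size the buffers as described above:
+ *
+ *   float input[8 * 8];       // 8x8 block of samples, filled by the caller
+ *   float temp[8 * 8];        // scratch space
+ *   float output[2 * 8 * 8];  // receives interleaved real/imag results
+ *   aom_fft8x8_float_c(input, temp, output);
+ *   // output[2 * (y * 8 + x)] and output[2 * (y * 8 + x) + 1] now hold the
+ *   // real and imaginary parts of frequency bin (x, y).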
+ * + * \param[in] input Input buffer to run the transform on (size n x n) + * \param[out] temp Working buffer for computing the transform (size n x n) + * \param[out] output Output buffer (size 2 x n x n) + * \param[in] tform Forward transform function + * \param[in] transpose Transpose function (for n x n matrix) + * \param[in] unpack Unpack function used to massage outputs to correct form + * \param[in] vec_size Vector size (the transform is done vec_size units at + * a time) + */ +void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, + aom_fft_unpack_func_t unpack, int vec_size); + +/*!\brief Perform a 2d inverse fft with the given helper functions + * + * \param[in] input Input buffer to run the transform on (size 2 x n x n) + * \param[out] temp Working buffer for computations (size 2 x n x n) + * \param[out] output Output buffer (size n x n) + * \param[in] fft_single Forward transform function (non vectorized) + * \param[in] fft_multi Forward transform function (vectorized) + * \param[in] ifft_multi Inverse transform function (vectorized) + * \param[in] transpose Transpose function (for n x n matrix) + * \param[in] vec_size Vector size (the transform is done vec_size + * units at a time) + */ +void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi, + aom_fft_1d_func_t ifft_multi, + aom_fft_transpose_func_t transpose, int vec_size); +#ifdef __cplusplus +} +#endif + +// The macros below define 1D fft/ifft for different data types and for +// different simd vector intrinsic types. + +#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \ + ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + store(output + 0 * stride, i0 + i1); \ + store(output + 1 * stride, i0 - i1); \ + } + +#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \ + ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC w0 = add(i0, i2); \ + const T_VEC w1 = sub(i0, i2); \ + const T_VEC w2 = add(i1, i3); \ + const T_VEC w3 = sub(i1, i3); \ + store(output + 0 * stride, add(w0, w2)); \ + store(output + 1 * stride, w1); \ + store(output + 2 * stride, sub(w0, w2)); \ + store(output + 3 * stride, sub(kWeight0, w3)); \ + } + +#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \ + ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC w0 = add(i0, i4); \ + const T_VEC w1 = sub(i0, i4); \ + const T_VEC w2 = add(i2, i6); \ + const T_VEC w3 = sub(i2, i6); \ + const T_VEC w4 = add(w0, w2); \ + const T_VEC w5 = sub(w0, w2); \ + const T_VEC w7 = add(i1, i5); \ + const T_VEC 
w8 = sub(i1, i5); \ + const T_VEC w9 = add(i3, i7); \ + const T_VEC w10 = sub(i3, i7); \ + const T_VEC w11 = add(w7, w9); \ + const T_VEC w12 = sub(w7, w9); \ + store(output + 0 * stride, add(w4, w11)); \ + store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \ + store(output + 2 * stride, w5); \ + store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \ + store(output + 4 * stride, sub(w4, w11)); \ + store(output + 5 * stride, \ + sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \ + store(output + 6 * stride, sub(kWeight0, w12)); \ + store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \ + } + +#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC w0 = add(i0, i8); \ + const T_VEC w1 = sub(i0, i8); \ + const T_VEC w2 = add(i4, i12); \ + const T_VEC w3 = sub(i4, i12); \ + const T_VEC w4 = add(w0, w2); \ + const T_VEC w5 = sub(w0, w2); \ + const T_VEC w7 = add(i2, i10); \ + const T_VEC w8 = sub(i2, i10); \ + const T_VEC w9 = add(i6, i14); \ + const T_VEC w10 = sub(i6, i14); \ + const T_VEC w11 = add(w7, w9); \ + const T_VEC w12 = sub(w7, w9); \ + const T_VEC w14 = add(w4, w11); \ + const T_VEC w15 = sub(w4, w11); \ + const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \ + sub(sub(kWeight0, w3), \ + mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \ + sub(w3, mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w19 = add(i1, i9); \ + const T_VEC w20 = sub(i1, i9); \ + const T_VEC w21 = add(i5, i13); \ + const T_VEC w22 = sub(i5, i13); \ + const T_VEC w23 = add(w19, w21); \ + const T_VEC w24 = sub(w19, w21); \ + const T_VEC w26 = add(i3, i11); \ + const T_VEC w27 = sub(i3, i11); \ + const T_VEC w28 = add(i7, i15); \ + const T_VEC w29 = sub(i7, i15); \ + const T_VEC w30 = add(w26, w28); \ + const T_VEC w31 = sub(w26, w28); \ + const T_VEC w33 = add(w23, w30); \ + const T_VEC w34 = sub(w23, w30); \ + const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \ + sub(sub(kWeight0, w22), \ + mul(kWeight2, add(w29, w27))) }; \ + const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \ + sub(w22, mul(kWeight2, add(w29, w27))) }; \ + store(output + 0 * stride, add(w14, w33)); \ + store(output + 1 * stride, \ + add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \ + store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \ + store(output + 3 * stride, \ + add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \ + 
store(output + 4 * stride, w15); \ + store(output + 5 * stride, \ + add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \ + mul(kWeight3, w37[1])))); \ + store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \ + store(output + 7 * stride, \ + add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \ + mul(kWeight4, w35[1])))); \ + store(output + 8 * stride, sub(w14, w33)); \ + store(output + 9 * stride, \ + add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \ + store(output + 10 * stride, \ + sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \ + store(output + 11 * stride, \ + add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \ + store(output + 12 * stride, sub(kWeight0, w34)); \ + store(output + 13 * stride, \ + sub(sub(kWeight0, w18[1]), \ + sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \ + store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \ + store(output + 15 * stride, \ + sub(sub(kWeight0, w16[1]), \ + sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \ + } + +#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC kWeight5 = constant(0.980785f); \ + const T_VEC kWeight6 = constant(0.19509f); \ + const T_VEC kWeight7 = constant(0.83147f); \ + const T_VEC kWeight8 = constant(0.55557f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC i16 = load(input + 16 * stride); \ + const T_VEC i17 = load(input + 17 * stride); \ + const T_VEC i18 = load(input + 18 * stride); \ + const T_VEC i19 = load(input + 19 * stride); \ + const T_VEC i20 = load(input + 20 * stride); \ + const T_VEC i21 = load(input + 21 * stride); \ + const T_VEC i22 = load(input + 22 * stride); \ + const T_VEC i23 = load(input + 23 * stride); \ + const T_VEC i24 = load(input + 24 * stride); \ + const T_VEC i25 = load(input + 25 * stride); \ + const T_VEC i26 = load(input + 26 * stride); \ + const T_VEC i27 = load(input + 27 * stride); \ + const T_VEC i28 = load(input + 28 * stride); \ + const T_VEC i29 = load(input + 29 * stride); \ + const T_VEC i30 = load(input + 30 * stride); \ + const T_VEC i31 = load(input + 31 * stride); \ + const T_VEC w0 = add(i0, i16); \ + const T_VEC w1 = sub(i0, i16); \ + const T_VEC w2 = add(i8, i24); \ + const T_VEC w3 = sub(i8, i24); \ + const T_VEC w4 = add(w0, w2); \ + const T_VEC w5 = sub(w0, w2); \ + const T_VEC w7 = add(i4, i20); \ + const T_VEC w8 = sub(i4, i20); \ + const T_VEC w9 = add(i12, i28); \ + const T_VEC w10 = sub(i12, i28); \ + const T_VEC w11 = add(w7, w9); \ + const T_VEC w12 = sub(w7, 
w9); \ + const T_VEC w14 = add(w4, w11); \ + const T_VEC w15 = sub(w4, w11); \ + const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \ + sub(sub(kWeight0, w3), \ + mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \ + sub(w3, mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w19 = add(i2, i18); \ + const T_VEC w20 = sub(i2, i18); \ + const T_VEC w21 = add(i10, i26); \ + const T_VEC w22 = sub(i10, i26); \ + const T_VEC w23 = add(w19, w21); \ + const T_VEC w24 = sub(w19, w21); \ + const T_VEC w26 = add(i6, i22); \ + const T_VEC w27 = sub(i6, i22); \ + const T_VEC w28 = add(i14, i30); \ + const T_VEC w29 = sub(i14, i30); \ + const T_VEC w30 = add(w26, w28); \ + const T_VEC w31 = sub(w26, w28); \ + const T_VEC w33 = add(w23, w30); \ + const T_VEC w34 = sub(w23, w30); \ + const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \ + sub(sub(kWeight0, w22), \ + mul(kWeight2, add(w29, w27))) }; \ + const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \ + sub(w22, mul(kWeight2, add(w29, w27))) }; \ + const T_VEC w38 = add(w14, w33); \ + const T_VEC w39 = sub(w14, w33); \ + const T_VEC w40[2] = { \ + add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \ + add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \ + }; \ + const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \ + sub(sub(kWeight0, w12), \ + mul(kWeight2, add(w31, w24))) }; \ + const T_VEC w42[2] = { \ + add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \ + add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \ + }; \ + const T_VEC w44[2] = { \ + add(w18[0], \ + sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \ + sub(sub(kWeight0, w18[1]), \ + sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \ + }; \ + const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \ + sub(w12, mul(kWeight2, add(w31, w24))) }; \ + const T_VEC w46[2] = { \ + add(w16[0], \ + sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \ + sub(sub(kWeight0, w16[1]), \ + sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \ + }; \ + const T_VEC w47 = add(i1, i17); \ + const T_VEC w48 = sub(i1, i17); \ + const T_VEC w49 = add(i9, i25); \ + const T_VEC w50 = sub(i9, i25); \ + const T_VEC w51 = add(w47, w49); \ + const T_VEC w52 = sub(w47, w49); \ + const T_VEC w54 = add(i5, i21); \ + const T_VEC w55 = sub(i5, i21); \ + const T_VEC w56 = add(i13, i29); \ + const T_VEC w57 = sub(i13, i29); \ + const T_VEC w58 = add(w54, w56); \ + const T_VEC w59 = sub(w54, w56); \ + const T_VEC w61 = add(w51, w58); \ + const T_VEC w62 = sub(w51, w58); \ + const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \ + sub(sub(kWeight0, w50), \ + mul(kWeight2, add(w57, w55))) }; \ + const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \ + sub(w50, mul(kWeight2, add(w57, w55))) }; \ + const T_VEC w66 = add(i3, i19); \ + const T_VEC w67 = sub(i3, i19); \ + const T_VEC w68 = add(i11, i27); \ + const T_VEC w69 = sub(i11, i27); \ + const T_VEC w70 = add(w66, w68); \ + const T_VEC w71 = sub(w66, w68); \ + const T_VEC w73 = add(i7, i23); \ + const T_VEC w74 = sub(i7, i23); \ + const T_VEC w75 = add(i15, i31); \ + const T_VEC w76 = sub(i15, i31); \ + const T_VEC w77 = add(w73, w75); \ + const T_VEC w78 = sub(w73, w75); \ + const T_VEC w80 = add(w70, w77); \ + const T_VEC w81 = sub(w70, w77); \ + const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \ + sub(sub(kWeight0, w69), \ + mul(kWeight2, add(w76, w74))) }; \ + const 
T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \ + sub(w69, mul(kWeight2, add(w76, w74))) }; \ + const T_VEC w85 = add(w61, w80); \ + const T_VEC w86 = sub(w61, w80); \ + const T_VEC w87[2] = { \ + add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \ + add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \ + }; \ + const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \ + sub(sub(kWeight0, w59), \ + mul(kWeight2, add(w78, w71))) }; \ + const T_VEC w89[2] = { \ + add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \ + add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \ + }; \ + const T_VEC w91[2] = { \ + add(w65[0], \ + sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \ + sub(sub(kWeight0, w65[1]), \ + sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \ + }; \ + const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \ + sub(w59, mul(kWeight2, add(w78, w71))) }; \ + const T_VEC w93[2] = { \ + add(w63[0], \ + sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \ + sub(sub(kWeight0, w63[1]), \ + sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \ + }; \ + store(output + 0 * stride, add(w38, w85)); \ + store(output + 1 * stride, \ + add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \ + store(output + 2 * stride, \ + add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \ + store(output + 3 * stride, \ + add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \ + store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \ + store(output + 5 * stride, \ + add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1])))); \ + store(output + 6 * stride, \ + add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \ + store(output + 7 * stride, \ + add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \ + store(output + 8 * stride, w39); \ + store(output + 9 * stride, \ + add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \ + mul(kWeight5, w93[1])))); \ + store(output + 10 * stride, \ + add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \ + mul(kWeight3, w92[1])))); \ + store(output + 11 * stride, \ + add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \ + mul(kWeight7, w91[1])))); \ + store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \ + store(output + 13 * stride, \ + add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \ + mul(kWeight8, w89[1])))); \ + store(output + 14 * stride, \ + add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \ + mul(kWeight4, w88[1])))); \ + store(output + 15 * stride, \ + add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \ + mul(kWeight6, w87[1])))); \ + store(output + 16 * stride, sub(w38, w85)); \ + store(output + 17 * stride, \ + add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \ + store(output + 18 * stride, \ + add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \ + store(output + 19 * stride, \ + add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \ + store(output + 20 * stride, \ + sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \ + store(output + 21 * stride, \ + add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \ + store(output + 22 * stride, \ + add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \ + store(output + 23 * stride, \ + add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \ + store(output + 24 * stride, sub(kWeight0, w86)); \ + store(output + 25 * stride, \ 
+ sub(sub(kWeight0, w46[1]), \ + sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \ + store(output + 26 * stride, \ + sub(sub(kWeight0, w45[1]), \ + sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \ + store(output + 27 * stride, \ + sub(sub(kWeight0, w44[1]), \ + sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \ + store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \ + store(output + 29 * stride, \ + sub(sub(kWeight0, w42[1]), \ + sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \ + store(output + 30 * stride, \ + sub(sub(kWeight0, w41[1]), \ + sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \ + store(output + 31 * stride, \ + sub(sub(kWeight0, w40[1]), \ + sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \ + } + +#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) \ + ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + store(output + 0 * stride, i0 + i1); \ + store(output + 1 * stride, i0 - i1); \ + } + +#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \ + ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC w2 = add(i0, i2); \ + const T_VEC w3 = sub(i0, i2); \ + const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; \ + const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; \ + store(output + 0 * stride, add(w2, w4[0])); \ + store(output + 1 * stride, add(w3, w5[1])); \ + store(output + 2 * stride, sub(w2, w4[0])); \ + store(output + 3 * stride, sub(w3, w5[1])); \ + } + +#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC w6 = add(i0, i4); \ + const T_VEC w7 = sub(i0, i4); \ + const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \ + const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \ + const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; \ + const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \ + const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \ + const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \ + const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \ + const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \ + const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \ + const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \ + const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; \ + const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \ + const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \ + const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \ + store(output + 0 * stride, add(w10[0], w18[0])); \ + store(output + 1 * stride, \ + add(w12[0], mul(kWeight2, 
add(w20[0], w20[1])))); \ + store(output + 2 * stride, add(w11[0], w19[1])); \ + store(output + 3 * stride, \ + sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \ + store(output + 4 * stride, sub(w10[0], w18[0])); \ + store(output + 5 * stride, \ + add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \ + mul(kWeight2, w20[1])))); \ + store(output + 6 * stride, sub(w11[0], w19[1])); \ + store(output + 7 * stride, \ + add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \ + } + +#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC w14 = add(i0, i8); \ + const T_VEC w15 = sub(i0, i8); \ + const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \ + const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \ + const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \ + const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \ + const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \ + const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \ + const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \ + const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \ + const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \ + const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \ + const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \ + const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \ + const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \ + const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \ + const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \ + const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \ + const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \ + add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \ + const T_VEC w33[2] = { add(w20[0], \ + sub(sub(kWeight0, mul(kWeight2, w28[0])), \ + mul(kWeight2, w28[1]))), \ + add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \ + const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \ + const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \ + const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \ + sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \ + const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \ + add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \ + const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \ + const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \ 
+ const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \ + const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \ + const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ + const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ + const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ + const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ + const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \ + const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \ + const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \ + const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \ + const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \ + const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \ + const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \ + const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \ + const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \ + const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \ + const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \ + add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \ + const T_VEC w57[2] = { add(w44[0], \ + sub(sub(kWeight0, mul(kWeight2, w52[0])), \ + mul(kWeight2, w52[1]))), \ + add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \ + const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \ + const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \ + const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \ + sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \ + const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \ + add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \ + store(output + 0 * stride, add(w30[0], w54[0])); \ + store(output + 1 * stride, \ + add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \ + store(output + 2 * stride, \ + add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \ + store(output + 3 * stride, \ + add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \ + store(output + 4 * stride, add(w31[0], w55[1])); \ + store(output + 5 * stride, \ + sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \ + store(output + 6 * stride, \ + sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \ + store(output + 7 * stride, \ + sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \ + store(output + 8 * stride, sub(w30[0], w54[0])); \ + store(output + 9 * stride, \ + add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \ + mul(kWeight4, w56[1])))); \ + store(output + 10 * stride, \ + add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \ + mul(kWeight2, w58[1])))); \ + store(output + 11 * stride, \ + add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \ + mul(kWeight3, w60[1])))); \ + store(output + 12 * stride, sub(w31[0], w55[1])); \ + store(output + 13 * stride, \ + add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \ + store(output + 14 * stride, \ + add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \ + store(output + 15 * stride, \ + add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \ + } +#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 
= constant(0.382683f); \ + const T_VEC kWeight5 = constant(0.980785f); \ + const T_VEC kWeight6 = constant(0.19509f); \ + const T_VEC kWeight7 = constant(0.83147f); \ + const T_VEC kWeight8 = constant(0.55557f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC i16 = load(input + 16 * stride); \ + const T_VEC i17 = load(input + 17 * stride); \ + const T_VEC i18 = load(input + 18 * stride); \ + const T_VEC i19 = load(input + 19 * stride); \ + const T_VEC i20 = load(input + 20 * stride); \ + const T_VEC i21 = load(input + 21 * stride); \ + const T_VEC i22 = load(input + 22 * stride); \ + const T_VEC i23 = load(input + 23 * stride); \ + const T_VEC i24 = load(input + 24 * stride); \ + const T_VEC i25 = load(input + 25 * stride); \ + const T_VEC i26 = load(input + 26 * stride); \ + const T_VEC i27 = load(input + 27 * stride); \ + const T_VEC i28 = load(input + 28 * stride); \ + const T_VEC i29 = load(input + 29 * stride); \ + const T_VEC i30 = load(input + 30 * stride); \ + const T_VEC i31 = load(input + 31 * stride); \ + const T_VEC w30 = add(i0, i16); \ + const T_VEC w31 = sub(i0, i16); \ + const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \ + const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \ + const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \ + const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \ + const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \ + const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \ + const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \ + const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \ + const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \ + const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \ + const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ + const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ + const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ + const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ + const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \ + const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \ + const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \ + add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \ + const T_VEC w49[2] = { add(w36[0], \ + sub(sub(kWeight0, mul(kWeight2, w44[0])), \ + mul(kWeight2, w44[1]))), \ + add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \ + const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \ + const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \ + const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ + sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ + const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ + 
add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ + const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \ + const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \ + const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \ + const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \ + const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \ + const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \ + const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \ + const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \ + const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \ + const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \ + const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \ + const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \ + const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \ + const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \ + const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \ + const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \ + const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \ + const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \ + const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \ + add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \ + const T_VEC w73[2] = { add(w60[0], \ + sub(sub(kWeight0, mul(kWeight2, w68[0])), \ + mul(kWeight2, w68[1]))), \ + add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \ + const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \ + const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \ + const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ + sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ + const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ + add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ + const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \ + const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \ + const T_VEC w80[2] = { \ + add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \ + add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \ + }; \ + const T_VEC w81[2] = { \ + add(w48[0], \ + sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \ + add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \ + }; \ + const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \ + add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \ + const T_VEC w83[2] = { add(w50[0], \ + sub(sub(kWeight0, mul(kWeight2, w74[0])), \ + mul(kWeight2, w74[1]))), \ + add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \ + const T_VEC w84[2] = { \ + add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \ + add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \ + }; \ + const T_VEC w85[2] = { \ + add(w52[0], \ + sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \ + add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \ + }; \ + const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \ + const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \ + const T_VEC w88[2] = { \ + sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \ + add(w49[1], \ + sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \ + }; \ + const T_VEC w89[2] = { \ + add(w49[0], sub(mul(kWeight4, w73[0]), 
mul(kWeight3, w73[1]))), \ + add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \ + }; \ + const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ + sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ + const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ + add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ + const T_VEC w92[2] = { \ + sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ + add(w53[1], \ + sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \ + }; \ + const T_VEC w93[2] = { \ + add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ + add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \ + }; \ + const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \ + const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \ + const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \ + const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \ + const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \ + const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \ + const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \ + const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \ + const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \ + const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \ + const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \ + const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \ + const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \ + const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \ + const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \ + const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \ + const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \ + const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \ + const T_VEC w112[2] = { \ + add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \ + add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \ + }; \ + const T_VEC w113[2] = { \ + add(w100[0], \ + sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \ + add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \ + }; \ + const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \ + const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \ + const T_VEC w116[2] = { \ + sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ + sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ + }; \ + const T_VEC w117[2] = { \ + add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ + add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ + }; \ + const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \ + const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \ + const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \ + const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \ + const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \ + const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \ + const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \ + const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \ + const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \ + const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \ + const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \ + const T_VEC w129[2] = { sub(i15, i1), 
sub(sub(kWeight0, i31), i17) }; \ + const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \ + const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \ + const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \ + const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \ + const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \ + const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \ + const T_VEC w136[2] = { \ + add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \ + add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \ + }; \ + const T_VEC w137[2] = { \ + add(w124[0], \ + sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \ + add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \ + }; \ + const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \ + const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \ + const T_VEC w140[2] = { \ + sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ + sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ + }; \ + const T_VEC w141[2] = { \ + add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ + add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ + }; \ + const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \ + const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \ + const T_VEC w144[2] = { \ + add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \ + add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \ + }; \ + const T_VEC w145[2] = { \ + add(w112[0], \ + sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \ + add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \ + }; \ + const T_VEC w146[2] = { \ + add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \ + add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \ + }; \ + const T_VEC w147[2] = { \ + add(w114[0], \ + sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \ + add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \ + }; \ + const T_VEC w148[2] = { \ + add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \ + add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \ + }; \ + const T_VEC w149[2] = { \ + add(w116[0], \ + sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \ + add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \ + }; \ + const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \ + const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \ + const T_VEC w152[2] = { \ + sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ + add(w113[1], \ + sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \ + }; \ + const T_VEC w153[2] = { \ + add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ + add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \ + }; \ + const T_VEC w154[2] = { \ + sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ + sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ + }; \ + const T_VEC w155[2] = { \ + add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ + add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ + }; \ + const T_VEC w156[2] = { \ + sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \ + add(w117[1], \ + sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \ + }; \ + const T_VEC w157[2] = { \ + add(w117[0], sub(mul(kWeight3, 
w141[0]), mul(kWeight4, w141[1]))), \ + add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \ + }; \ + store(output + 0 * stride, add(w78[0], w142[0])); \ + store(output + 1 * stride, \ + add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \ + store(output + 2 * stride, \ + add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \ + store(output + 3 * stride, \ + add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \ + store(output + 4 * stride, \ + add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \ + store(output + 5 * stride, \ + add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \ + store(output + 6 * stride, \ + add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \ + store(output + 7 * stride, \ + add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \ + store(output + 8 * stride, add(w79[0], w143[1])); \ + store(output + 9 * stride, \ + sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ + store(output + 10 * stride, \ + sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ + store(output + 11 * stride, \ + sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ + store(output + 12 * stride, \ + sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ + store(output + 13 * stride, \ + sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ + store(output + 14 * stride, \ + sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ + store(output + 15 * stride, \ + sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ + store(output + 16 * stride, sub(w78[0], w142[0])); \ + store(output + 17 * stride, \ + add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \ + mul(kWeight6, w144[1])))); \ + store(output + 18 * stride, \ + add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \ + mul(kWeight4, w146[1])))); \ + store(output + 19 * stride, \ + add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \ + mul(kWeight8, w148[1])))); \ + store(output + 20 * stride, \ + add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \ + mul(kWeight2, w150[1])))); \ + store(output + 21 * stride, \ + add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \ + mul(kWeight7, w152[1])))); \ + store(output + 22 * stride, \ + add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \ + mul(kWeight3, w154[1])))); \ + store(output + 23 * stride, \ + add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \ + mul(kWeight5, w156[1])))); \ + store(output + 24 * stride, sub(w79[0], w143[1])); \ + store(output + 25 * stride, \ + add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ + store(output + 26 * stride, \ + add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ + store(output + 27 * stride, \ + add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ + store(output + 28 * stride, \ + add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ + store(output + 29 * stride, \ + add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ + store(output + 30 * stride, \ + add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ + store(output + 31 * stride, \ + add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ + } + +#endif // AOM_AOM_DSP_FFT_COMMON_H_ diff --git a/libs/libaom/src/aom_dsp/fwd_txfm.c b/libs/libaom/src/aom_dsp/fwd_txfm.c new file mode 100644 index 000000000..3d3044415 --- /dev/null +++ b/libs/libaom/src/aom_dsp/fwd_txfm.c @@ -0,0 +1,229 @@ +/* + * 
Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "aom_dsp/txfm_common.h"
+#include "config/aom_dsp_rtcd.h"
+
+void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is, the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  // We need an intermediate buffer between passes.
+  tran_low_t intermediate[4 * 4];
+  const tran_low_t *in_low = NULL;
+  tran_low_t *out = intermediate;
+  // Do the two transform/transpose passes
+  for (int pass = 0; pass < 2; ++pass) {
+    tran_high_t in_high[4];    // canbe16
+    tran_high_t step[4];       // canbe16
+    tran_high_t temp1, temp2;  // needs32
+    for (int i = 0; i < 4; ++i) {
+      // Load inputs.
+      if (pass == 0) {
+        in_high[0] = input[0 * stride] * 16;
+        in_high[1] = input[1 * stride] * 16;
+        in_high[2] = input[2 * stride] * 16;
+        in_high[3] = input[3 * stride] * 16;
+        if (i == 0 && in_high[0]) {
+          ++in_high[0];
+        }
+      } else {
+        assert(in_low != NULL);
+        in_high[0] = in_low[0 * 4];
+        in_high[1] = in_low[1 * 4];
+        in_high[2] = in_low[2 * 4];
+        in_high[3] = in_low[3 * 4];
+        ++in_low;
+      }
+      // Transform.
+      step[0] = in_high[0] + in_high[3];
+      step[1] = in_high[1] + in_high[2];
+      step[2] = in_high[1] - in_high[2];
+      step[3] = in_high[0] - in_high[3];
+      temp1 = (step[0] + step[1]) * cospi_16_64;
+      temp2 = (step[0] - step[1]) * cospi_16_64;
+      out[0] = (tran_low_t)fdct_round_shift(temp1);
+      out[2] = (tran_low_t)fdct_round_shift(temp2);
+      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+      out[1] = (tran_low_t)fdct_round_shift(temp1);
+      out[3] = (tran_low_t)fdct_round_shift(temp2);
+      // Do next column (which is a transposed row in second/horizontal pass)
+      ++input;
+      out += 4;
+    }
+    // Setup in/out for next pass.
+    in_low = intermediate;
+    out = output;
+  }
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j)
+      output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+  }
+}
+
+void aom_fdct4x4_lp_c(const int16_t *input, int16_t *output, int stride) {
+  // The 2D transform is done with two passes which are actually pretty
+  // similar. In the first one, we transform the columns and transpose
+  // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+  // is, the transposed rows) and transpose the results (so that it goes back
+  // in normal/row positions).
+  // We need an intermediate buffer between passes.
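+  // This variant mirrors aom_fdct4x4_c above: the first pass pre-scales the
+  // inputs by 16 (bumping the very first sample by one when it is nonzero)
+  // and the final results are rounded with (x + 1) >> 2. The difference is
+  // that plain int16_t/int32_t storage is used instead of
+  // tran_low_t/tran_high_t.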
+ int16_t intermediate[4 * 4]; + const int16_t *in_low = NULL; + int16_t *out = intermediate; + // Do the two transform/transpose passes + for (int pass = 0; pass < 2; ++pass) { + int32_t in_high[4]; // canbe16 + int32_t step[4]; // canbe16 + int32_t temp1, temp2; // needs32 + for (int i = 0; i < 4; ++i) { + // Load inputs. + if (pass == 0) { + in_high[0] = input[0 * stride] * 16; + in_high[1] = input[1 * stride] * 16; + in_high[2] = input[2 * stride] * 16; + in_high[3] = input[3 * stride] * 16; + if (i == 0 && in_high[0]) { + ++in_high[0]; + } + } else { + assert(in_low != NULL); + in_high[0] = in_low[0 * 4]; + in_high[1] = in_low[1 * 4]; + in_high[2] = in_low[2 * 4]; + in_high[3] = in_low[3 * 4]; + ++in_low; + } + // Transform. + step[0] = in_high[0] + in_high[3]; + step[1] = in_high[1] + in_high[2]; + step[2] = in_high[1] - in_high[2]; + step[3] = in_high[0] - in_high[3]; + temp1 = (step[0] + step[1]) * (int32_t)cospi_16_64; + temp2 = (step[0] - step[1]) * (int32_t)cospi_16_64; + out[0] = (int16_t)fdct_round_shift(temp1); + out[2] = (int16_t)fdct_round_shift(temp2); + temp1 = step[2] * (int32_t)cospi_24_64 + step[3] * (int32_t)cospi_8_64; + temp2 = -step[2] * (int32_t)cospi_8_64 + step[3] * (int32_t)cospi_24_64; + out[1] = (int16_t)fdct_round_shift(temp1); + out[3] = (int16_t)fdct_round_shift(temp2); + // Do next column (which is a transposed row in second/horizontal pass) + ++input; + out += 4; + } + // Setup in/out for next pass. + in_low = intermediate; + out = output; + } + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) + output[j + i * 4] = (output[j + i * 4] + 1) >> 2; + } +} + +void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { + int i, j; + tran_low_t intermediate[64]; + int pass; + tran_low_t *output = intermediate; + const tran_low_t *in = NULL; + + // Transform columns + for (pass = 0; pass < 2; ++pass) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 + + for (i = 0; i < 8; i++) { + // stage 1 + if (pass == 0) { + s0 = (input[0 * stride] + input[7 * stride]) * 4; + s1 = (input[1 * stride] + input[6 * stride]) * 4; + s2 = (input[2 * stride] + input[5 * stride]) * 4; + s3 = (input[3 * stride] + input[4 * stride]) * 4; + s4 = (input[3 * stride] - input[4 * stride]) * 4; + s5 = (input[2 * stride] - input[5 * stride]) * 4; + s6 = (input[1 * stride] - input[6 * stride]) * 4; + s7 = (input[0 * stride] - input[7 * stride]) * 4; + ++input; + } else { + s0 = in[0 * 8] + in[7 * 8]; + s1 = in[1 * 8] + in[6 * 8]; + s2 = in[2 * 8] + in[5 * 8]; + s3 = in[3 * 8] + in[4 * 8]; + s4 = in[3 * 8] - in[4 * 8]; + s5 = in[2 * 8] - in[5 * 8]; + s6 = in[1 * 8] - in[6 * 8]; + s7 = in[0 * 8] - in[7 * 8]; + ++in; + } + + // fdct4(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + output[0] = (tran_low_t)fdct_round_shift(t0); + output[2] = (tran_low_t)fdct_round_shift(t2); + output[4] = (tran_low_t)fdct_round_shift(t1); + output[6] = (tran_low_t)fdct_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = fdct_round_shift(t0); + t3 = fdct_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * 
cospi_12_64 + x1 * -cospi_20_64;
+      t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+      output[1] = (tran_low_t)fdct_round_shift(t0);
+      output[3] = (tran_low_t)fdct_round_shift(t2);
+      output[5] = (tran_low_t)fdct_round_shift(t1);
+      output[7] = (tran_low_t)fdct_round_shift(t3);
+      output += 8;
+    }
+    in = intermediate;
+    output = final_output;
+  }
+
+  // Rows
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+                          int stride) {
+  aom_fdct8x8_c(input, final_output, stride);
+}
+#endif
diff --git a/libs/libaom/src/aom_dsp/grain_synthesis.c b/libs/libaom/src/aom_dsp/grain_synthesis.c
new file mode 100644
index 000000000..626eb76af
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/grain_synthesis.c
@@ -0,0 +1,1408 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain parameters and film grain synthesis
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "aom_dsp/grain_synthesis.h"
+#include "aom_mem/aom_mem.h"
+
+// Samples with Gaussian distribution in the range of [-2048, 2047] (12 bits)
+// with zero mean and standard deviation of about 512. The values should be
+// divided by 4 for the 10-bit range and by 16 for the 8-bit range.
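+// Concretely, generate_luma_grain_block() below shifts each selected sample
+// right (with rounding) by gauss_sec_shift = 12 - bit_depth +
+// grain_scale_shift, so with grain_scale_shift == 0 this is >> 4 (divide by
+// 16) for 8-bit video and >> 2 (divide by 4) for 10-bit video.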
+static const int gaussian_sequence[2048] = { + 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, + 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, + 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, + -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, + 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, + 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, + 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, + 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, + 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, + 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, + 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, + -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, + 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, + 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, + -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, + -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, + -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, + -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, + 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, + 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, + 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, + -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, + -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, + -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, + 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, + 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, + 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, + -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, + 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, + -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, + 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, + -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, + 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, + -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, + -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, + -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, + -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, + -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, + 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, + 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, + -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, + -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, + 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, + 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, + -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, + 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, + 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, + -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, + 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, + -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, + 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, + -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, + -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, + 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, + -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, + -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, + 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, + 264, 880, 
528, -24, -184, 116, 448, -144, 828, 524, 212, + -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, + 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, + 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, + 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, + -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, + -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, + -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, + 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, + -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, + -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, + -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, + -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, + -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, + 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, + -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, + -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, + 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, + -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, + -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, + -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, + 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, + -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, + 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, + 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, + 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, + -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, + -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, + 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, + 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, + -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, + -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, + -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, + -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, + 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, + 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, + 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, + 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, + 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, + 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, + 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, + -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, + 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, + -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, + -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, + -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, + 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, + -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, + -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, + 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, + 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, + 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, + 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, + 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, + 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, + 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, + -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, + -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, + -1268, 416, -324, -80, 920, 
160, 228, 724, 32, -516, 64, + 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, + -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, + -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, + 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, + -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, + 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, + 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, + 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, + -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, + 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, + -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, + 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, + 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, + 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, + 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, + -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, + -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, + 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, + -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, + 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, + 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, + 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, + -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, + -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, + 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, + 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, + 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, + -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, + -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, + 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, + -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, + -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, + -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, + 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, + -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, + 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, + -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, + 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, + -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, + 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, + 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, + 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, + 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, + -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208, + -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156, + -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, + -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432, + 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, + 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, + 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, + 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, + -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, + 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, + -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384, + 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, + 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, + -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, + 
-672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, + -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, + -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, + 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, + -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, + -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, + -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288, + -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, + 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, + 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, + -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, + -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, + 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, + 428, -484 +}; + +static const int gauss_bits = 11; + +static int luma_subblock_size_y = 32; +static int luma_subblock_size_x = 32; + +static int chroma_subblock_size_y = 16; +static int chroma_subblock_size_x = 16; + +static const int min_luma_legal_range = 16; +static const int max_luma_legal_range = 235; + +static const int min_chroma_legal_range = 16; +static const int max_chroma_legal_range = 240; + +static int scaling_lut_y[256]; +static int scaling_lut_cb[256]; +static int scaling_lut_cr[256]; + +static int grain_min; +static int grain_max; + +static uint16_t random_register = 0; // random number generator register + +static void init_arrays(const aom_film_grain_t *params, int luma_stride, + int chroma_stride, int ***pred_pos_luma_p, + int ***pred_pos_chroma_p, int **luma_grain_block, + int **cb_grain_block, int **cr_grain_block, + int **y_line_buf, int **cb_line_buf, int **cr_line_buf, + int **y_col_buf, int **cb_col_buf, int **cr_col_buf, + int luma_grain_samples, int chroma_grain_samples, + int chroma_subsamp_y, int chroma_subsamp_x) { + memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256); + memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256); + memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256); + + int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (params->num_y_points > 0) ++num_pos_chroma; + + int **pred_pos_luma; + int **pred_pos_chroma; + + pred_pos_luma = (int **)aom_malloc(sizeof(*pred_pos_luma) * num_pos_luma); + + for (int row = 0; row < num_pos_luma; row++) { + pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3); + } + + pred_pos_chroma = + (int **)aom_malloc(sizeof(*pred_pos_chroma) * num_pos_chroma); + + for (int row = 0; row < num_pos_chroma; row++) { + pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3); + } + + int pos_ar_index = 0; + + for (int row = -params->ar_coeff_lag; row < 0; row++) { + for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1; + col++) { + pred_pos_luma[pos_ar_index][0] = row; + pred_pos_luma[pos_ar_index][1] = col; + pred_pos_luma[pos_ar_index][2] = 0; + + pred_pos_chroma[pos_ar_index][0] = row; + pred_pos_chroma[pos_ar_index][1] = col; + pred_pos_chroma[pos_ar_index][2] = 0; + ++pos_ar_index; + } + } + + for (int col = -params->ar_coeff_lag; col < 0; col++) { + pred_pos_luma[pos_ar_index][0] = 0; + pred_pos_luma[pos_ar_index][1] = col; + pred_pos_luma[pos_ar_index][2] = 0; + + pred_pos_chroma[pos_ar_index][0] = 0; + pred_pos_chroma[pos_ar_index][1] = col; + pred_pos_chroma[pos_ar_index][2] = 0; + + ++pos_ar_index; + } + + if (params->num_y_points > 0) { + pred_pos_chroma[pos_ar_index][0] = 0; + pred_pos_chroma[pos_ar_index][1] = 0; + 
pred_pos_chroma[pos_ar_index][2] = 1; + } + + *pred_pos_luma_p = pred_pos_luma; + *pred_pos_chroma_p = pred_pos_chroma; + + *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2); + *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride * + (2 >> chroma_subsamp_y)); + *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride * + (2 >> chroma_subsamp_y)); + + *y_col_buf = + (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2); + *cb_col_buf = + (int *)aom_malloc(sizeof(**cb_col_buf) * + (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) * + (2 >> chroma_subsamp_x)); + *cr_col_buf = + (int *)aom_malloc(sizeof(**cr_col_buf) * + (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) * + (2 >> chroma_subsamp_x)); + + *luma_grain_block = + (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples); + *cb_grain_block = + (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples); + *cr_grain_block = + (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples); +} + +static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma, + int ***pred_pos_chroma, int **luma_grain_block, + int **cb_grain_block, int **cr_grain_block, + int **y_line_buf, int **cb_line_buf, + int **cr_line_buf, int **y_col_buf, int **cb_col_buf, + int **cr_col_buf) { + int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (params->num_y_points > 0) ++num_pos_chroma; + + for (int row = 0; row < num_pos_luma; row++) { + aom_free((*pred_pos_luma)[row]); + } + aom_free(*pred_pos_luma); + + for (int row = 0; row < num_pos_chroma; row++) { + aom_free((*pred_pos_chroma)[row]); + } + aom_free((*pred_pos_chroma)); + + aom_free(*y_line_buf); + + aom_free(*cb_line_buf); + + aom_free(*cr_line_buf); + + aom_free(*y_col_buf); + + aom_free(*cb_col_buf); + + aom_free(*cr_col_buf); + + aom_free(*luma_grain_block); + + aom_free(*cb_grain_block); + + aom_free(*cr_grain_block); +} + +// get a number between 0 and 2^bits - 1 +static INLINE int get_random_number(int bits) { + uint16_t bit; + bit = ((random_register >> 0) ^ (random_register >> 1) ^ + (random_register >> 3) ^ (random_register >> 12)) & + 1; + random_register = (random_register >> 1) | (bit << 15); + return (random_register >> (16 - bits)) & ((1 << bits) - 1); +} + +static void init_random_generator(int luma_line, uint16_t seed) { + // same for the picture + + uint16_t msb = (seed >> 8) & 255; + uint16_t lsb = seed & 255; + + random_register = (msb << 8) + lsb; + + // changes for each row + int luma_num = luma_line >> 5; + + random_register ^= ((luma_num * 37 + 178) & 255) << 8; + random_register ^= ((luma_num * 173 + 105) & 255); +} + +// Return 0 for success, -1 for failure +static int generate_luma_grain_block( + const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block, + int luma_block_size_y, int luma_block_size_x, int luma_grain_stride, + int left_pad, int top_pad, int right_pad, int bottom_pad) { + if (params->num_y_points == 0) { + memset(luma_grain_block, 0, + sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride); + return 0; + } + + int bit_depth = params->bit_depth; + int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; + + int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + int rounding_offset = (1 << (params->ar_coeff_shift - 1)); + + for (int i = 0; i < luma_block_size_y; i++) + for (int j = 0; j < luma_block_size_x; j++) + luma_grain_block[i 
* luma_grain_stride + j] = + (gaussian_sequence[get_random_number(gauss_bits)] + + ((1 << gauss_sec_shift) >> 1)) >> + gauss_sec_shift; + + for (int i = top_pad; i < luma_block_size_y - bottom_pad; i++) + for (int j = left_pad; j < luma_block_size_x - right_pad; j++) { + int wsum = 0; + for (int pos = 0; pos < num_pos_luma; pos++) { + wsum = wsum + params->ar_coeffs_y[pos] * + luma_grain_block[(i + pred_pos_luma[pos][0]) * + luma_grain_stride + + j + pred_pos_luma[pos][1]]; + } + luma_grain_block[i * luma_grain_stride + j] = + clamp(luma_grain_block[i * luma_grain_stride + j] + + ((wsum + rounding_offset) >> params->ar_coeff_shift), + grain_min, grain_max); + } + return 0; +} + +// Return 0 for success, -1 for failure +static int generate_chroma_grain_blocks( + const aom_film_grain_t *params, + // int** pred_pos_luma, + int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block, + int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y, + int chroma_block_size_x, int chroma_grain_stride, int left_pad, int top_pad, + int right_pad, int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) { + int bit_depth = params->bit_depth; + int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; + + int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + if (params->num_y_points > 0) ++num_pos_chroma; + int rounding_offset = (1 << (params->ar_coeff_shift - 1)); + int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride; + + if (params->num_cb_points || params->chroma_scaling_from_luma) { + init_random_generator(7 << 5, params->random_seed); + + for (int i = 0; i < chroma_block_size_y; i++) + for (int j = 0; j < chroma_block_size_x; j++) + cb_grain_block[i * chroma_grain_stride + j] = + (gaussian_sequence[get_random_number(gauss_bits)] + + ((1 << gauss_sec_shift) >> 1)) >> + gauss_sec_shift; + } else { + memset(cb_grain_block, 0, + sizeof(*cb_grain_block) * chroma_grain_block_size); + } + + if (params->num_cr_points || params->chroma_scaling_from_luma) { + init_random_generator(11 << 5, params->random_seed); + + for (int i = 0; i < chroma_block_size_y; i++) + for (int j = 0; j < chroma_block_size_x; j++) + cr_grain_block[i * chroma_grain_stride + j] = + (gaussian_sequence[get_random_number(gauss_bits)] + + ((1 << gauss_sec_shift) >> 1)) >> + gauss_sec_shift; + } else { + memset(cr_grain_block, 0, + sizeof(*cr_grain_block) * chroma_grain_block_size); + } + + for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++) + for (int j = left_pad; j < chroma_block_size_x - right_pad; j++) { + int wsum_cb = 0; + int wsum_cr = 0; + for (int pos = 0; pos < num_pos_chroma; pos++) { + if (pred_pos_chroma[pos][2] == 0) { + wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * + cb_grain_block[(i + pred_pos_chroma[pos][0]) * + chroma_grain_stride + + j + pred_pos_chroma[pos][1]]; + wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * + cr_grain_block[(i + pred_pos_chroma[pos][0]) * + chroma_grain_stride + + j + pred_pos_chroma[pos][1]]; + } else if (pred_pos_chroma[pos][2] == 1) { + int av_luma = 0; + int luma_coord_y = ((i - top_pad) << chroma_subsamp_y) + top_pad; + int luma_coord_x = ((j - left_pad) << chroma_subsamp_x) + left_pad; + + for (int k = luma_coord_y; k < luma_coord_y + chroma_subsamp_y + 1; + k++) + for (int l = luma_coord_x; l < luma_coord_x + chroma_subsamp_x + 1; + l++) + av_luma += luma_grain_block[k * luma_grain_stride + l]; + + av_luma = + (av_luma + ((1 << (chroma_subsamp_y + chroma_subsamp_x)) >> 1)) >> + (chroma_subsamp_y + 
chroma_subsamp_x);
+
+          wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma;
+          wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma;
+        } else {
+          fprintf(
+              stderr,
+              "Grain synthesis: prediction between two chroma components is "
+              "not supported!");
+          return -1;
+        }
+      }
+      if (params->num_cb_points || params->chroma_scaling_from_luma)
+        cb_grain_block[i * chroma_grain_stride + j] =
+            clamp(cb_grain_block[i * chroma_grain_stride + j] +
+                      ((wsum_cb + rounding_offset) >> params->ar_coeff_shift),
+                  grain_min, grain_max);
+      if (params->num_cr_points || params->chroma_scaling_from_luma)
+        cr_grain_block[i * chroma_grain_stride + j] =
+            clamp(cr_grain_block[i * chroma_grain_stride + j] +
+                      ((wsum_cr + rounding_offset) >> params->ar_coeff_shift),
+                  grain_min, grain_max);
+    }
+  return 0;
+}
+
+static void init_scaling_function(const int scaling_points[][2], int num_points,
+                                  int scaling_lut[]) {
+  if (num_points == 0) return;
+
+  for (int i = 0; i < scaling_points[0][0]; i++)
+    scaling_lut[i] = scaling_points[0][1];
+
+  for (int point = 0; point < num_points - 1; point++) {
+    int delta_y = scaling_points[point + 1][1] - scaling_points[point][1];
+    int delta_x = scaling_points[point + 1][0] - scaling_points[point][0];
+
+    int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
+
+    for (int x = 0; x < delta_x; x++) {
+      scaling_lut[scaling_points[point][0] + x] =
+          scaling_points[point][1] + (int)((x * delta + 32768) >> 16);
+    }
+  }
+
+  for (int i = scaling_points[num_points - 1][0]; i < 256; i++)
+    scaling_lut[i] = scaling_points[num_points - 1][1];
+}
+
+// Function that extracts samples from a LUT (and interpolates intermediate
+// values for 10- and 12-bit video)
+static int scale_LUT(int *scaling_lut, int index, int bit_depth) {
+  int x = index >> (bit_depth - 8);
+
+  if (!(bit_depth - 8) || x == 255)
+    return scaling_lut[x];
+  else
+    return scaling_lut[x] + (((scaling_lut[x + 1] - scaling_lut[x]) *
+                                  (index & ((1 << (bit_depth - 8)) - 1)) +
+                              (1 << (bit_depth - 9))) >>
+                             (bit_depth - 8));
+}
+
+static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma,
+                               uint8_t *cb, uint8_t *cr, int luma_stride,
+                               int chroma_stride, int *luma_grain,
+                               int *cb_grain, int *cr_grain,
+                               int luma_grain_stride, int chroma_grain_stride,
+                               int half_luma_height, int half_luma_width,
+                               int bit_depth, int chroma_subsamp_y,
+                               int chroma_subsamp_x, int mc_identity) {
+  int cb_mult = params->cb_mult - 128;            // fixed scale
+  int cb_luma_mult = params->cb_luma_mult - 128;  // fixed scale
+  int cb_offset = params->cb_offset - 256;
+
+  int cr_mult = params->cr_mult - 128;            // fixed scale
+  int cr_luma_mult = params->cr_luma_mult - 128;  // fixed scale
+  int cr_offset = params->cr_offset - 256;
+
+  int rounding_offset = (1 << (params->scaling_shift - 1));
+
+  int apply_y = params->num_y_points > 0 ? 1 : 0;
+  int apply_cb =
+      (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
+  int apply_cr =
+      (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ?
1 : 0; + + if (params->chroma_scaling_from_luma) { + cb_mult = 0; // fixed scale + cb_luma_mult = 64; // fixed scale + cb_offset = 0; + + cr_mult = 0; // fixed scale + cr_luma_mult = 64; // fixed scale + cr_offset = 0; + } + + int min_luma, max_luma, min_chroma, max_chroma; + + if (params->clip_to_restricted_range) { + min_luma = min_luma_legal_range; + max_luma = max_luma_legal_range; + + if (mc_identity) { + min_chroma = min_luma_legal_range; + max_chroma = max_luma_legal_range; + } else { + min_chroma = min_chroma_legal_range; + max_chroma = max_chroma_legal_range; + } + } else { + min_luma = min_chroma = 0; + max_luma = max_chroma = 255; + } + + for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) { + for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) { + int average_luma = 0; + if (chroma_subsamp_x) { + average_luma = (luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x)] + + luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x) + 1] + + 1) >> + 1; + } else { + average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j]; + } + + if (apply_cb) { + cb[i * chroma_stride + j] = clamp( + cb[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cb, + clamp(((average_luma * cb_luma_mult + + cb_mult * cb[i * chroma_stride + j]) >> + 6) + + cb_offset, + 0, (256 << (bit_depth - 8)) - 1), + 8) * + cb_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + + if (apply_cr) { + cr[i * chroma_stride + j] = clamp( + cr[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cr, + clamp(((average_luma * cr_luma_mult + + cr_mult * cr[i * chroma_stride + j]) >> + 6) + + cr_offset, + 0, (256 << (bit_depth - 8)) - 1), + 8) * + cr_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + } + } + + if (apply_y) { + for (int i = 0; i < (half_luma_height << 1); i++) { + for (int j = 0; j < (half_luma_width << 1); j++) { + luma[i * luma_stride + j] = + clamp(luma[i * luma_stride + j] + + ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], 8) * + luma_grain[i * luma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_luma, max_luma); + } + } + } +} + +static void add_noise_to_block_hbd( + const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr, + int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain, + int *cr_grain, int luma_grain_stride, int chroma_grain_stride, + int half_luma_height, int half_luma_width, int bit_depth, + int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) { + int cb_mult = params->cb_mult - 128; // fixed scale + int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale + // offset value depends on the bit depth + int cb_offset = (params->cb_offset << (bit_depth - 8)) - (1 << bit_depth); + + int cr_mult = params->cr_mult - 128; // fixed scale + int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale + // offset value depends on the bit depth + int cr_offset = (params->cr_offset << (bit_depth - 8)) - (1 << bit_depth); + + int rounding_offset = (1 << (params->scaling_shift - 1)); + + int apply_y = params->num_y_points > 0 ? 1 : 0; + int apply_cb = + (params->num_cb_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1 + : 0; + int apply_cr = + (params->num_cr_points > 0 || params->chroma_scaling_from_luma) > 0 ? 
1 + : 0; + + if (params->chroma_scaling_from_luma) { + cb_mult = 0; // fixed scale + cb_luma_mult = 64; // fixed scale + cb_offset = 0; + + cr_mult = 0; // fixed scale + cr_luma_mult = 64; // fixed scale + cr_offset = 0; + } + + int min_luma, max_luma, min_chroma, max_chroma; + + if (params->clip_to_restricted_range) { + min_luma = min_luma_legal_range << (bit_depth - 8); + max_luma = max_luma_legal_range << (bit_depth - 8); + + if (mc_identity) { + min_chroma = min_luma_legal_range << (bit_depth - 8); + max_chroma = max_luma_legal_range << (bit_depth - 8); + } else { + min_chroma = min_chroma_legal_range << (bit_depth - 8); + max_chroma = max_chroma_legal_range << (bit_depth - 8); + } + } else { + min_luma = min_chroma = 0; + max_luma = max_chroma = (256 << (bit_depth - 8)) - 1; + } + + for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) { + for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) { + int average_luma = 0; + if (chroma_subsamp_x) { + average_luma = (luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x)] + + luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x) + 1] + + 1) >> + 1; + } else { + average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j]; + } + + if (apply_cb) { + cb[i * chroma_stride + j] = clamp( + cb[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cb, + clamp(((average_luma * cb_luma_mult + + cb_mult * cb[i * chroma_stride + j]) >> + 6) + + cb_offset, + 0, (256 << (bit_depth - 8)) - 1), + bit_depth) * + cb_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + if (apply_cr) { + cr[i * chroma_stride + j] = clamp( + cr[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cr, + clamp(((average_luma * cr_luma_mult + + cr_mult * cr[i * chroma_stride + j]) >> + 6) + + cr_offset, + 0, (256 << (bit_depth - 8)) - 1), + bit_depth) * + cr_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + } + } + + if (apply_y) { + for (int i = 0; i < (half_luma_height << 1); i++) { + for (int j = 0; j < (half_luma_width << 1); j++) { + luma[i * luma_stride + j] = + clamp(luma[i * luma_stride + j] + + ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], + bit_depth) * + luma_grain[i * luma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_luma, max_luma); + } + } + } +} + +static void copy_rect(uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int width, int height, + int use_high_bit_depth) { + int hbd_coeff = use_high_bit_depth ? 
2 : 1; + while (height) { + memcpy(dst, src, width * sizeof(uint8_t) * hbd_coeff); + src += src_stride; + dst += dst_stride; + --height; + } + return; +} + +static void copy_area(int *src, int src_stride, int *dst, int dst_stride, + int width, int height) { + while (height) { + memcpy(dst, src, width * sizeof(*src)); + src += src_stride; + dst += dst_stride; + --height; + } + return; +} + +static void extend_even(uint8_t *dst, int dst_stride, int width, int height, + int use_high_bit_depth) { + if ((width & 1) == 0 && (height & 1) == 0) return; + if (use_high_bit_depth) { + uint16_t *dst16 = (uint16_t *)dst; + int dst16_stride = dst_stride / 2; + if (width & 1) { + for (int i = 0; i < height; ++i) + dst16[i * dst16_stride + width] = dst16[i * dst16_stride + width - 1]; + } + width = (width + 1) & (~1); + if (height & 1) { + memcpy(&dst16[height * dst16_stride], &dst16[(height - 1) * dst16_stride], + sizeof(*dst16) * width); + } + } else { + if (width & 1) { + for (int i = 0; i < height; ++i) + dst[i * dst_stride + width] = dst[i * dst_stride + width - 1]; + } + width = (width + 1) & (~1); + if (height & 1) { + memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride], + sizeof(*dst) * width); + } + } +} + +static void ver_boundary_overlap(int *left_block, int left_stride, + int *right_block, int right_stride, + int *dst_block, int dst_stride, int width, + int height) { + if (width == 1) { + while (height) { + *dst_block = clamp((*left_block * 23 + *right_block * 22 + 16) >> 5, + grain_min, grain_max); + left_block += left_stride; + right_block += right_stride; + dst_block += dst_stride; + --height; + } + return; + } else if (width == 2) { + while (height) { + dst_block[0] = clamp((27 * left_block[0] + 17 * right_block[0] + 16) >> 5, + grain_min, grain_max); + dst_block[1] = clamp((17 * left_block[1] + 27 * right_block[1] + 16) >> 5, + grain_min, grain_max); + left_block += left_stride; + right_block += right_stride; + dst_block += dst_stride; + --height; + } + return; + } +} + +static void hor_boundary_overlap(int *top_block, int top_stride, + int *bottom_block, int bottom_stride, + int *dst_block, int dst_stride, int width, + int height) { + if (height == 1) { + while (width) { + *dst_block = clamp((*top_block * 23 + *bottom_block * 22 + 16) >> 5, + grain_min, grain_max); + ++top_block; + ++bottom_block; + ++dst_block; + --width; + } + return; + } else if (height == 2) { + while (width) { + dst_block[0] = clamp((27 * top_block[0] + 17 * bottom_block[0] + 16) >> 5, + grain_min, grain_max); + dst_block[dst_stride] = clamp((17 * top_block[top_stride] + + 27 * bottom_block[bottom_stride] + 16) >> + 5, + grain_min, grain_max); + ++top_block; + ++bottom_block; + ++dst_block; + --width; + } + return; + } +} + +int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, + aom_image_t *dst) { + uint8_t *luma, *cb, *cr; + int height, width, luma_stride, chroma_stride; + int use_high_bit_depth = 0; + int chroma_subsamp_x = 0; + int chroma_subsamp_y = 0; + int mc_identity = src->mc == AOM_CICP_MC_IDENTITY ? 
1 : 0; + + switch (src->fmt) { + case AOM_IMG_FMT_AOMI420: + case AOM_IMG_FMT_I420: + use_high_bit_depth = 0; + chroma_subsamp_x = 1; + chroma_subsamp_y = 1; + break; + case AOM_IMG_FMT_I42016: + use_high_bit_depth = 1; + chroma_subsamp_x = 1; + chroma_subsamp_y = 1; + break; + // case AOM_IMG_FMT_444A: + case AOM_IMG_FMT_I444: + use_high_bit_depth = 0; + chroma_subsamp_x = 0; + chroma_subsamp_y = 0; + break; + case AOM_IMG_FMT_I44416: + use_high_bit_depth = 1; + chroma_subsamp_x = 0; + chroma_subsamp_y = 0; + break; + case AOM_IMG_FMT_I422: + use_high_bit_depth = 0; + chroma_subsamp_x = 1; + chroma_subsamp_y = 0; + break; + case AOM_IMG_FMT_I42216: + use_high_bit_depth = 1; + chroma_subsamp_x = 1; + chroma_subsamp_y = 0; + break; + default: // unknown input format + fprintf(stderr, "Film grain error: input format is not supported!"); + return -1; + } + + assert(params->bit_depth == src->bit_depth); + + dst->fmt = src->fmt; + dst->bit_depth = src->bit_depth; + + dst->r_w = src->r_w; + dst->r_h = src->r_h; + dst->d_w = src->d_w; + dst->d_h = src->d_h; + + dst->cp = src->cp; + dst->tc = src->tc; + dst->mc = src->mc; + + dst->monochrome = src->monochrome; + dst->csp = src->csp; + dst->range = src->range; + + dst->x_chroma_shift = src->x_chroma_shift; + dst->y_chroma_shift = src->y_chroma_shift; + + dst->temporal_id = src->temporal_id; + dst->spatial_id = src->spatial_id; + + width = src->d_w % 2 ? src->d_w + 1 : src->d_w; + height = src->d_h % 2 ? src->d_h + 1 : src->d_h; + + copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y], + dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w, + src->d_h, use_high_bit_depth); + // Note that dst is already assumed to be aligned to even. + extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w, + src->d_h, use_high_bit_depth); + + if (!src->monochrome) { + copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U], + dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U], + width >> chroma_subsamp_x, height >> chroma_subsamp_y, + use_high_bit_depth); + + copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V], + dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V], + width >> chroma_subsamp_x, height >> chroma_subsamp_y, + use_high_bit_depth); + } + + luma = dst->planes[AOM_PLANE_Y]; + cb = dst->planes[AOM_PLANE_U]; + cr = dst->planes[AOM_PLANE_V]; + + // luma and chroma strides in samples + luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth; + chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth; + + return av1_add_film_grain_run( + params, luma, cb, cr, height, width, luma_stride, chroma_stride, + use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); +} + +int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, + uint8_t *cb, uint8_t *cr, int height, int width, + int luma_stride, int chroma_stride, + int use_high_bit_depth, int chroma_subsamp_y, + int chroma_subsamp_x, int mc_identity) { + int **pred_pos_luma; + int **pred_pos_chroma; + int *luma_grain_block; + int *cb_grain_block; + int *cr_grain_block; + + int *y_line_buf; + int *cb_line_buf; + int *cr_line_buf; + + int *y_col_buf; + int *cb_col_buf; + int *cr_col_buf; + + random_register = params->random_seed; + + int left_pad = 3; + int right_pad = 3; // padding to offset for AR coefficients + int top_pad = 3; + int bottom_pad = 0; + + int ar_padding = 3; // maximum lag used for stabilization of AR coefficients + + luma_subblock_size_y = 32; + luma_subblock_size_x = 32; + + chroma_subblock_size_y = 
luma_subblock_size_y >> chroma_subsamp_y; + chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x; + + // Initial padding is only needed for generation of + // film grain templates (to stabilize the AR process) + // Only a 64x64 luma and 32x32 chroma part of a template + // is used later for adding grain, padding can be discarded + + int luma_block_size_y = + top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad; + int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 + + 2 * ar_padding + right_pad; + + int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding + + chroma_subblock_size_y * 2 + bottom_pad; + int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding + + chroma_subblock_size_x * 2 + + (2 >> chroma_subsamp_x) * ar_padding + right_pad; + + int luma_grain_stride = luma_block_size_x; + int chroma_grain_stride = chroma_block_size_x; + + int overlap = params->overlap_flag; + int bit_depth = params->bit_depth; + + const int grain_center = 128 << (bit_depth - 8); + grain_min = 0 - grain_center; + grain_max = grain_center - 1; + + init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma, + &pred_pos_chroma, &luma_grain_block, &cb_grain_block, + &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf, + &y_col_buf, &cb_col_buf, &cr_col_buf, + luma_block_size_y * luma_block_size_x, + chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y, + chroma_subsamp_x); + + if (generate_luma_grain_block(params, pred_pos_luma, luma_grain_block, + luma_block_size_y, luma_block_size_x, + luma_grain_stride, left_pad, top_pad, right_pad, + bottom_pad)) + return -1; + + if (generate_chroma_grain_blocks( + params, + // pred_pos_luma, + pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block, + luma_grain_stride, chroma_block_size_y, chroma_block_size_x, + chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad, + chroma_subsamp_y, chroma_subsamp_x)) + return -1; + + init_scaling_function(params->scaling_points_y, params->num_y_points, + scaling_lut_y); + + if (params->chroma_scaling_from_luma) { + memcpy(scaling_lut_cb, scaling_lut_y, sizeof(*scaling_lut_y) * 256); + memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256); + } else { + init_scaling_function(params->scaling_points_cb, params->num_cb_points, + scaling_lut_cb); + init_scaling_function(params->scaling_points_cr, params->num_cr_points, + scaling_lut_cr); + } + for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) { + init_random_generator(y * 2, params->random_seed); + + for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) { + int offset_y = get_random_number(8); + int offset_x = (offset_y >> 4) & 15; + offset_y &= 15; + + int luma_offset_y = left_pad + 2 * ar_padding + (offset_y << 1); + int luma_offset_x = top_pad + 2 * ar_padding + (offset_x << 1); + + int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding + + offset_y * (2 >> chroma_subsamp_y); + int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding + + offset_x * (2 >> chroma_subsamp_x); + + if (overlap && x) { + ver_boundary_overlap( + y_col_buf, 2, + luma_grain_block + luma_offset_y * luma_grain_stride + + luma_offset_x, + luma_grain_stride, y_col_buf, 2, 2, + AOMMIN(luma_subblock_size_y + 2, height - (y << 1))); + + ver_boundary_overlap( + cb_col_buf, 2 >> chroma_subsamp_x, + cb_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x, + chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x, + 2 >> 
chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + + ver_boundary_overlap( + cr_col_buf, 2 >> chroma_subsamp_x, + cr_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x, + chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + + int i = y ? 1 : 0; + + if (use_high_bit_depth) { + add_noise_to_block_hbd( + params, + (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1), + (uint16_t *)cb + + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + (uint16_t *)cr + + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + luma_stride, chroma_stride, y_col_buf + i * 4, + cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + 2, (2 - chroma_subsamp_x), + AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1, + bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } else { + add_noise_to_block( + params, luma + ((y + i) << 1) * luma_stride + (x << 1), + cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + luma_stride, chroma_stride, y_col_buf + i * 4, + cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + 2, (2 - chroma_subsamp_x), + AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1, + bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } + } + + if (overlap && y) { + if (x) { + hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2, + y_line_buf + (x << 1), luma_stride, 2, 2); + + hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x, + cb_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_y); + + hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x, + cr_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_y); + } + + hor_boundary_overlap( + y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, + luma_grain_block + luma_offset_y * luma_grain_stride + + luma_offset_x + (x ? 2 : 0), + luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, + AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1), + width - ((x ? x + 1 : 0) << 1)), + 2); + + hor_boundary_overlap( + cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + cb_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + chroma_grain_stride, + cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x - + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x), + 2 >> chroma_subsamp_y); + + hor_boundary_overlap( + cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + cr_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + chroma_grain_stride, + cr_line_buf + ((x ? 
x + 1 : 0) << (1 - chroma_subsamp_x)),
+            chroma_stride,
+            AOMMIN(chroma_subblock_size_x -
+                       ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
+                   (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
+            2 >> chroma_subsamp_y);
+
+        if (use_high_bit_depth) {
+          add_noise_to_block_hbd(
+              params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1),
+              (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              luma_stride, chroma_stride, y_line_buf + (x << 1),
+              cb_line_buf + (x << (1 - chroma_subsamp_x)),
+              cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
+              chroma_stride, 1,
+              AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
+              chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+        } else {
+          add_noise_to_block(
+              params, luma + (y << 1) * luma_stride + (x << 1),
+              cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
+                  (x << ((1 - chroma_subsamp_x))),
+              luma_stride, chroma_stride, y_line_buf + (x << 1),
+              cb_line_buf + (x << (1 - chroma_subsamp_x)),
+              cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
+              chroma_stride, 1,
+              AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
+              chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+        }
+      }
+
+      int i = overlap && y ? 1 : 0;
+      int j = overlap && x ? 1 : 0;
+
+      if (use_high_bit_depth) {
+        add_noise_to_block_hbd(
+            params,
+            (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
+            (uint16_t *)cb +
+                ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            (uint16_t *)cr +
+                ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            luma_stride, chroma_stride,
+            luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
+                luma_offset_x + (j << 1),
+            cb_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            cr_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            luma_grain_stride, chroma_grain_stride,
+            AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
+            AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
+            chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+      } else {
+        add_noise_to_block(
+            params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
+            cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
+                ((x + j) << (1 - chroma_subsamp_x)),
+            luma_stride, chroma_stride,
+            luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
+                luma_offset_x + (j << 1),
+            cb_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            cr_grain_block +
+                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
+                    chroma_grain_stride +
+                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
+            luma_grain_stride, chroma_grain_stride,
+            AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
+            AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
+            chroma_subsamp_y, chroma_subsamp_x, mc_identity);
+      }
+
+      if (overlap) {
+        if (x) {
+          // Copy overlapped column buffer to line buffer
+          copy_area(y_col_buf + (luma_subblock_size_y << 1), 2,
y_line_buf + (x << 1), luma_stride, 2, 2); + + copy_area( + cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)), + 2 >> chroma_subsamp_x, + cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride, + 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); + + copy_area( + cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)), + 2 >> chroma_subsamp_x, + cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride, + 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); + } + + // Copy grain to the line buffer for overlap with a bottom block + copy_area( + luma_grain_block + + (luma_offset_y + luma_subblock_size_y) * luma_grain_stride + + luma_offset_x + ((x ? 2 : 0)), + luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, + AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2); + + copy_area(cb_grain_block + + (chroma_offset_y + chroma_subblock_size_y) * + chroma_grain_stride + + chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0), + chroma_grain_stride, + cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x, + ((width - (x << 1)) >> chroma_subsamp_x)) - + (x ? 2 >> chroma_subsamp_x : 0), + 2 >> chroma_subsamp_y); + + copy_area(cr_grain_block + + (chroma_offset_y + chroma_subblock_size_y) * + chroma_grain_stride + + chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0), + chroma_grain_stride, + cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x, + ((width - (x << 1)) >> chroma_subsamp_x)) - + (x ? 2 >> chroma_subsamp_x : 0), + 2 >> chroma_subsamp_y); + + // Copy grain to the column buffer for overlap with the next block to + // the right + + copy_area(luma_grain_block + luma_offset_y * luma_grain_stride + + luma_offset_x + luma_subblock_size_x, + luma_grain_stride, y_col_buf, 2, 2, + AOMMIN(luma_subblock_size_y + 2, height - (y << 1))); + + copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + chroma_subblock_size_x, + chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + + copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + chroma_subblock_size_x, + chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + } + } + } + + dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block, + &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf, + &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf); + return 0; +} diff --git a/libs/libaom/src/aom_dsp/grain_synthesis.h b/libs/libaom/src/aom_dsp/grain_synthesis.h new file mode 100644 index 000000000..9155b3903 --- /dev/null +++ b/libs/libaom/src/aom_dsp/grain_synthesis.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+/*!\file
+ * \brief Describes film grain parameters and film grain synthesis
+ *
+ */
+#ifndef AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
+#define AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom/aom_image.h"
+
+/*!\brief Structure containing film grain synthesis parameters for a frame
+ *
+ * This structure contains input parameters for film grain synthesis.
+ */
+typedef struct {
+  // This structure is compared element-by-element in the function
+  // av1_check_grain_params_equiv: this function must be updated if any changes
+  // are made to this structure.
+  int apply_grain;
+
+  int update_parameters;
+
+  // 8 bit values
+  int scaling_points_y[14][2];
+  int num_y_points;  // value: 0..14
+
+  // 8 bit values
+  int scaling_points_cb[10][2];
+  int num_cb_points;  // value: 0..10
+
+  // 8 bit values
+  int scaling_points_cr[10][2];
+  int num_cr_points;  // value: 0..10
+
+  int scaling_shift;  // values: 8..11
+
+  int ar_coeff_lag;  // values: 0..3
+
+  // 8 bit values
+  int ar_coeffs_y[24];
+  int ar_coeffs_cb[25];
+  int ar_coeffs_cr[25];
+
+  // Shift value: AR coeffs range
+  // 6: [-2, 2)
+  // 7: [-1, 1)
+  // 8: [-0.5, 0.5)
+  // 9: [-0.25, 0.25)
+  int ar_coeff_shift;  // values: 6..9
+
+  int cb_mult;       // 8 bits
+  int cb_luma_mult;  // 8 bits
+  int cb_offset;     // 9 bits
+
+  int cr_mult;       // 8 bits
+  int cr_luma_mult;  // 8 bits
+  int cr_offset;     // 9 bits
+
+  int overlap_flag;
+
+  int clip_to_restricted_range;
+
+  unsigned int bit_depth;  // video bit depth
+
+  int chroma_scaling_from_luma;
+
+  int grain_scale_shift;
+
+  uint16_t random_seed;
+  // This structure is compared element-by-element in the function
+  // av1_check_grain_params_equiv: this function must be updated if any changes
+  // are made to this structure.
+} aom_film_grain_t;
+
+/*!\brief Check if two film grain parameters structs are equivalent
+ *
+ * Check if two film grain parameters are equal, except for the
+ * update_parameters and random_seed elements, which are ignored.
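+ * Scaling points are compared only up to num_y_points/num_cb_points/
+ * num_cr_points entries, and the AR coefficient arrays only up to the
+ * first 2 * ar_coeff_lag * (ar_coeff_lag + 1) entries.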
+ *
+ * \param[in]    pa        The first set of parameters to compare
+ * \param[in]    pb        The second set of parameters to compare
+ * \return       Returns 1 if the params are equivalent, 0 otherwise
+ */
+static INLINE int av1_check_grain_params_equiv(
+    const aom_film_grain_t *const pa, const aom_film_grain_t *const pb) {
+  if (pa->apply_grain != pb->apply_grain) return 0;
+  // Don't compare update_parameters
+
+  if (pa->num_y_points != pb->num_y_points) return 0;
+  if (memcmp(pa->scaling_points_y, pb->scaling_points_y,
+             pa->num_y_points * 2 * sizeof(*pa->scaling_points_y)) != 0)
+    return 0;
+
+  if (pa->num_cb_points != pb->num_cb_points) return 0;
+  if (memcmp(pa->scaling_points_cb, pb->scaling_points_cb,
+             pa->num_cb_points * 2 * sizeof(*pa->scaling_points_cb)) != 0)
+    return 0;
+
+  if (pa->num_cr_points != pb->num_cr_points) return 0;
+  if (memcmp(pa->scaling_points_cr, pb->scaling_points_cr,
+             pa->num_cr_points * 2 * sizeof(*pa->scaling_points_cr)) != 0)
+    return 0;
+
+  if (pa->scaling_shift != pb->scaling_shift) return 0;
+  if (pa->ar_coeff_lag != pb->ar_coeff_lag) return 0;
+
+  const int num_pos = 2 * pa->ar_coeff_lag * (pa->ar_coeff_lag + 1);
+  if (memcmp(pa->ar_coeffs_y, pb->ar_coeffs_y,
+             num_pos * sizeof(*pa->ar_coeffs_y)) != 0)
+    return 0;
+  if (memcmp(pa->ar_coeffs_cb, pb->ar_coeffs_cb,
+             num_pos * sizeof(*pa->ar_coeffs_cb)) != 0)
+    return 0;
+  if (memcmp(pa->ar_coeffs_cr, pb->ar_coeffs_cr,
+             num_pos * sizeof(*pa->ar_coeffs_cr)) != 0)
+    return 0;
+
+  if (pa->ar_coeff_shift != pb->ar_coeff_shift) return 0;
+
+  if (pa->cb_mult != pb->cb_mult) return 0;
+  if (pa->cb_luma_mult != pb->cb_luma_mult) return 0;
+  if (pa->cb_offset != pb->cb_offset) return 0;
+
+  if (pa->cr_mult != pb->cr_mult) return 0;
+  if (pa->cr_luma_mult != pb->cr_luma_mult) return 0;
+  if (pa->cr_offset != pb->cr_offset) return 0;
+
+  if (pa->overlap_flag != pb->overlap_flag) return 0;
+  if (pa->clip_to_restricted_range != pb->clip_to_restricted_range) return 0;
+  if (pa->bit_depth != pb->bit_depth) return 0;
+  if (pa->chroma_scaling_from_luma != pb->chroma_scaling_from_luma) return 0;
+  if (pa->grain_scale_shift != pb->grain_scale_shift) return 0;
+
+  return 1;
+}
+
+/*!\brief Add film grain
+ *
+ * Add film grain to an image
+ *
+ * Returns 0 for success, -1 for failure
+ *
+ * \param[in]    grain_params        Grain parameters
+ * \param[in]    luma                luma plane
+ * \param[in]    cb                  cb plane
+ * \param[in]    cr                  cr plane
+ * \param[in]    height              luma plane height
+ * \param[in]    width               luma plane width
+ * \param[in]    luma_stride         luma plane stride
+ * \param[in]    chroma_stride       chroma plane stride
+ * \param[in]    use_high_bit_depth  Nonzero if the planes hold 16-bit samples
+ * \param[in]    chroma_subsamp_y    Chroma subsampling in y direction (0 or 1)
+ * \param[in]    chroma_subsamp_x    Chroma subsampling in x direction (0 or 1)
+ * \param[in]    mc_identity         Nonzero for identity matrix coefficients
+ */
+int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
+                           uint8_t *cb, uint8_t *cr, int height, int width,
+                           int luma_stride, int chroma_stride,
+                           int use_high_bit_depth, int chroma_subsamp_y,
+                           int chroma_subsamp_x, int mc_identity);
+
+/*!\brief Add film grain
+ *
+ * Add film grain to an image
+ *
+ * Returns 0 for success, -1 for failure
+ *
+ * \param[in]    grain_params     Grain parameters
+ * \param[in]    src              Source image
+ * \param[out]   dst              Resulting image with grain
+ */
+int av1_add_film_grain(const aom_film_grain_t *grain_params,
+                       const aom_image_t *src, aom_image_t *dst);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
diff --git a/libs/libaom/src/aom_dsp/grain_table.c b/libs/libaom/src/aom_dsp/grain_table.c
new file mode 100644
index 000000000..e03f04d5d
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/grain_table.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief This file has the implementation details of the grain table.
+ *
+ * The file format is an ASCII representation for readability and
+ * editability. Array parameters are separated from the non-array
+ * parameters and prefixed with a few characters to make for easy
+ * localization with a parameter set. Each entry is prefixed with "E"
+ * and the other parameters are only specified if "update-parms" is
+ * non-zero.
+ *
+ * filmgrn1
+ * E <start-time> <end-time> <apply-grain> <random-seed> <update-parms>
+ *  p <ar_coeff_lag> <ar_coeff_shift> <grain_scale_shift> ...
+ *  sY <num_y_points> <point_0_x> <point_0_y> ...
+ *  sCb <num_cb_points> <point_0_x> <point_0_y> ...
+ *  sCr <num_cr_points> <point_0_x> <point_0_y> ...
+ *  cY <ar_coeff_y_0> ....
+ *  cCb <ar_coeff_cb_0> ....
+ *  cCr <ar_coeff_cr_0> ....
+ * E <start-time> ...
+ */
+#include <inttypes.h>
+#include <string.h>
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/grain_table.h"
+#include "aom_mem/aom_mem.h"
+
+static const char kFileMagic[8] = "filmgrn1";
+
+static void grain_table_entry_read(FILE *file,
+                                   struct aom_internal_error_info *error_info,
+                                   aom_film_grain_table_entry_t *entry) {
+  aom_film_grain_t *pars = &entry->params;
+  int num_read =
+      fscanf(file, "E %" PRId64 " %" PRId64 " %d %hd %d\n", &entry->start_time,
+             &entry->end_time, &pars->apply_grain, &pars->random_seed,
+             &pars->update_parameters);
+  if (num_read == 0 && feof(file)) return;
+  if (num_read != 5) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR,
+                       "Unable to read entry header. Read %d != 5", num_read);
+    return;
+  }
+  if (pars->update_parameters) {
+    num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n",
+                      &pars->ar_coeff_lag, &pars->ar_coeff_shift,
+                      &pars->grain_scale_shift, &pars->scaling_shift,
+                      &pars->chroma_scaling_from_luma, &pars->overlap_flag,
+                      &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset,
+                      &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset);
+    if (num_read != 12) {
+      aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read entry params.
Read %d != 12", + num_read); + return; + } + if (!fscanf(file, "\tsY %d ", &pars->num_y_points)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read num y points"); + return; + } + for (int i = 0; i < pars->num_y_points; ++i) { + if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0], + &pars->scaling_points_y[i][1])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read y scaling points"); + return; + } + } + if (!fscanf(file, "\n\tsCb %d", &pars->num_cb_points)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read num cb points"); + return; + } + for (int i = 0; i < pars->num_cb_points; ++i) { + if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0], + &pars->scaling_points_cb[i][1])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read cb scaling points"); + return; + } + } + if (!fscanf(file, "\n\tsCr %d", &pars->num_cr_points)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read num cr points"); + return; + } + for (int i = 0; i < pars->num_cr_points; ++i) { + if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0], + &pars->scaling_points_cr[i][1])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read cr scaling points"); + return; + } + } + + fscanf(file, "\n\tcY"); + const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + for (int i = 0; i < n; ++i) { + if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Y coeffs"); + return; + } + } + fscanf(file, "\n\tcCb"); + for (int i = 0; i <= n; ++i) { + if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Cb coeffs"); + return; + } + } + fscanf(file, "\n\tcCr"); + for (int i = 0; i <= n; ++i) { + if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Cr coeffs"); + return; + } + } + fscanf(file, "\n"); + } +} + +static void grain_table_entry_write(FILE *file, + aom_film_grain_table_entry_t *entry) { + const aom_film_grain_t *pars = &entry->params; + fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n", entry->start_time, + entry->end_time, pars->apply_grain, pars->random_seed, + pars->update_parameters); + if (pars->update_parameters) { + fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n", + pars->ar_coeff_lag, pars->ar_coeff_shift, pars->grain_scale_shift, + pars->scaling_shift, pars->chroma_scaling_from_luma, + pars->overlap_flag, pars->cb_mult, pars->cb_luma_mult, + pars->cb_offset, pars->cr_mult, pars->cr_luma_mult, + pars->cr_offset); + fprintf(file, "\tsY %d ", pars->num_y_points); + for (int i = 0; i < pars->num_y_points; ++i) { + fprintf(file, " %d %d", pars->scaling_points_y[i][0], + pars->scaling_points_y[i][1]); + } + fprintf(file, "\n\tsCb %d", pars->num_cb_points); + for (int i = 0; i < pars->num_cb_points; ++i) { + fprintf(file, " %d %d", pars->scaling_points_cb[i][0], + pars->scaling_points_cb[i][1]); + } + fprintf(file, "\n\tsCr %d", pars->num_cr_points); + for (int i = 0; i < pars->num_cr_points; ++i) { + fprintf(file, " %d %d", pars->scaling_points_cr[i][0], + pars->scaling_points_cr[i][1]); + } + fprintf(file, "\n\tcY"); + const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + for (int i = 0; i < n; ++i) { + fprintf(file, " %d", pars->ar_coeffs_y[i]); + } + fprintf(file, "\n\tcCb"); + for (int i = 0; i <= n; ++i) { + fprintf(file, " %d", pars->ar_coeffs_cb[i]); + } + 
fprintf(file, "\n\tcCr");
+    for (int i = 0; i <= n; ++i) {
+      fprintf(file, " %d", pars->ar_coeffs_cr[i]);
+    }
+    fprintf(file, "\n");
+  }
+}
+
+void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp,
+                                 int64_t end_time,
+                                 const aom_film_grain_t *grain) {
+  if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) {
+    aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail));
+    memset(new_tail, 0, sizeof(*new_tail));
+    if (t->tail) t->tail->next = new_tail;
+    if (!t->head) t->head = new_tail;
+    t->tail = new_tail;
+
+    new_tail->start_time = time_stamp;
+    new_tail->end_time = end_time;
+    new_tail->params = *grain;
+  } else {
+    t->tail->end_time = AOMMAX(t->tail->end_time, end_time);
+    t->tail->start_time = AOMMIN(t->tail->start_time, time_stamp);
+  }
+}
+
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+                                int64_t end_time, int erase,
+                                aom_film_grain_t *grain) {
+  aom_film_grain_table_entry_t *entry = t->head;
+  aom_film_grain_table_entry_t *prev_entry = 0;
+  uint16_t random_seed = grain ? grain->random_seed : 0;
+  if (grain) memset(grain, 0, sizeof(*grain));
+
+  while (entry) {
+    aom_film_grain_table_entry_t *next = entry->next;
+    if (time_stamp >= entry->start_time && time_stamp < entry->end_time) {
+      if (grain) {
+        *grain = entry->params;
+        if (time_stamp != 0) grain->random_seed = random_seed;
+      }
+      if (!erase) return 1;
+
+      const int64_t entry_end_time = entry->end_time;
+      if (time_stamp <= entry->start_time && end_time >= entry->end_time) {
+        if (t->tail == entry) t->tail = prev_entry;
+        if (prev_entry) {
+          prev_entry->next = entry->next;
+        } else {
+          t->head = entry->next;
+        }
+        aom_free(entry);
+      } else if (time_stamp <= entry->start_time &&
+                 end_time < entry->end_time) {
+        entry->start_time = end_time;
+      } else if (time_stamp > entry->start_time &&
+                 end_time >= entry->end_time) {
+        entry->end_time = time_stamp;
+      } else {
+        aom_film_grain_table_entry_t *new_entry =
+            aom_malloc(sizeof(*new_entry));
+        new_entry->next = entry->next;
+        new_entry->start_time = end_time;
+        new_entry->end_time = entry->end_time;
+        new_entry->params = entry->params;
+        entry->next = new_entry;
+        entry->end_time = time_stamp;
+        if (t->tail == entry) t->tail = new_entry;
+      }
+      // If segments aren't aligned, delete from the beginning of subsequent
+      // segments. Use the saved entry_end_time here: 'entry' may have been
+      // freed or had its end_time rewritten above.
+      if (end_time > entry_end_time) {
+        aom_film_grain_table_lookup(t, entry_end_time, end_time, 1, 0);
+      }
+      return 1;
+    }
+    prev_entry = entry;
+    entry = next;
+  }
+  return 0;
+}
+
+aom_codec_err_t aom_film_grain_table_read(
+    aom_film_grain_table_t *t, const char *filename,
+    struct aom_internal_error_info *error_info) {
+  FILE *file = fopen(filename, "rb");
+  if (!file) {
+    aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s",
+                       filename);
+    return error_info->error_code;
+  }
+  error_info->error_code = AOM_CODEC_OK;
+
+  // Read in one extra character as there should be white space after
+  // the header.
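+  // For illustration, a table written by aom_film_grain_table_write() with a
+  // single entry begins like this (example values only):
+  //   filmgrn1
+  //   E 0 417083 1 7391 1
+  //       p 3 6 0 8 0 1 128 192 256 128 192 256
+  //       sY 2  0 64 255 64
+  //   ...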
+ char magic[9]; + if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read (or invalid) file magic"); + fclose(file); + return error_info->error_code; + } + + aom_film_grain_table_entry_t *prev_entry = 0; + while (!feof(file)) { + aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry)); + memset(entry, 0, sizeof(*entry)); + grain_table_entry_read(file, error_info, entry); + entry->next = 0; + + if (prev_entry) prev_entry->next = entry; + if (!t->head) t->head = entry; + t->tail = entry; + prev_entry = entry; + + if (error_info->error_code != AOM_CODEC_OK) break; + } + + fclose(file); + return error_info->error_code; +} + +aom_codec_err_t aom_film_grain_table_write( + const aom_film_grain_table_t *t, const char *filename, + struct aom_internal_error_info *error_info) { + error_info->error_code = AOM_CODEC_OK; + + FILE *file = fopen(filename, "wb"); + if (!file) { + aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s", + filename); + return error_info->error_code; + } + + if (!fwrite(kFileMagic, 8, 1, file)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to write file magic"); + fclose(file); + return error_info->error_code; + } + + fprintf(file, "\n"); + aom_film_grain_table_entry_t *entry = t->head; + while (entry) { + grain_table_entry_write(file, entry); + entry = entry->next; + } + fclose(file); + return error_info->error_code; +} + +void aom_film_grain_table_free(aom_film_grain_table_t *t) { + aom_film_grain_table_entry_t *entry = t->head; + while (entry) { + aom_film_grain_table_entry_t *next = entry->next; + aom_free(entry); + entry = next; + } + memset(t, 0, sizeof(*t)); +} diff --git a/libs/libaom/src/aom_dsp/grain_table.h b/libs/libaom/src/aom_dsp/grain_table.h new file mode 100644 index 000000000..a8ac50730 --- /dev/null +++ b/libs/libaom/src/aom_dsp/grain_table.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief A table mapping from time to corresponding film grain parameters. + * + * In order to apply grain synthesis in the decoder, the film grain parameters + * need to be signalled in the encoder. The film grain parameters are time + * varying, and for two-pass encoding (and denoiser implementation flexibility) + * it is common to denoise the video and do parameter estimation before encoding + * the denoised video. + * + * The film grain table is used to provide this flexibility and is used as a + * parameter that is passed to the encoder. + * + * Further, if regraining is to be done in say a single pass mode, or in two + * pass within the encoder (before frames are added to the lookahead buffer), + * this data structure can be used to keep track of on-the-fly estimated grain + * parameters, that are then extracted from the table before the encoded frame + * is written. 
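+ *
+ * A minimal usage sketch (illustrative only; pts_start, pts_end and
+ * grain_params are placeholder names, and error handling is omitted):
+ *
+ *   aom_film_grain_table_t table = { 0, 0 };
+ *   aom_film_grain_table_append(&table, pts_start, pts_end, &grain_params);
+ *
+ *   aom_film_grain_t out = { 0 };
+ *   if (aom_film_grain_table_lookup(&table, pts_start, pts_end, 0, &out)) {
+ *     // 'out' now holds the parameters covering [pts_start, pts_end).
+ *   }
+ *   aom_film_grain_table_free(&table);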
+ */
+#ifndef AOM_AOM_DSP_GRAIN_TABLE_H_
+#define AOM_AOM_DSP_GRAIN_TABLE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom_dsp/grain_synthesis.h"
+#include "aom/internal/aom_codec_internal.h"
+
+typedef struct aom_film_grain_table_entry_t {
+  aom_film_grain_t params;
+  int64_t start_time;
+  int64_t end_time;
+  struct aom_film_grain_table_entry_t *next;
+} aom_film_grain_table_entry_t;
+
+typedef struct {
+  aom_film_grain_table_entry_t *head;
+  aom_film_grain_table_entry_t *tail;
+} aom_film_grain_table_t;
+
+/*!\brief Add a mapping from [time_stamp, end_time) to the given grain
+ * parameters
+ *
+ * \param[in/out] table      The grain table
+ * \param[in]     time_stamp The start time stamp
+ * \param[in]     end_time   The end time stamp
+ * \param[in]     grain      The grain parameters
+ */
+void aom_film_grain_table_append(aom_film_grain_table_t *table,
+                                 int64_t time_stamp, int64_t end_time,
+                                 const aom_film_grain_t *grain);
+
+/*!\brief Look-up (and optionally erase) the grain parameters for the given
+ * time
+ *
+ * \param[in]  table      The grain table
+ * \param[in]  time_stamp The start time stamp
+ * \param[in]  end_time   The end time stamp
+ * \param[in]  erase      Whether the time segment can be deleted
+ * \param[out] grain      The output grain parameters
+ * \return     Returns 1 if grain params were found for the given time,
+ *             0 otherwise
+ */
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+                                int64_t end_time, int erase,
+                                aom_film_grain_t *grain);
+
+/*!\brief Reads the grain table from a file.
+ *
+ * \param[out] table      The grain table
+ * \param[in]  filename   The file to read from
+ * \param[in]  error_info Error info for tracking errors
+ */
+aom_codec_err_t aom_film_grain_table_read(
+    aom_film_grain_table_t *table, const char *filename,
+    struct aom_internal_error_info *error_info);
+
+/*!\brief Writes the grain table to a file.
+ *
+ * \param[in]  table      The table to write
+ * \param[in]  filename   The file to write to
+ * \param[in]  error_info Error info for tracking errors
+ */
+aom_codec_err_t aom_film_grain_table_write(
+    const aom_film_grain_table_t *t, const char *filename,
+    struct aom_internal_error_info *error_info);
+
+/*!\brief Frees all entries in the grain table and resets it to empty */
+void aom_film_grain_table_free(aom_film_grain_table_t *t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AOM_AOM_DSP_GRAIN_TABLE_H_
diff --git a/libs/libaom/src/aom_dsp/intrapred.c b/libs/libaom/src/aom_dsp/intrapred.c
new file mode 100644
index 000000000..72ccfd835
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/intrapred.c
@@ -0,0 +1,792 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/intrapred_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+
+static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left) {
+  int r;
+  (void)left;
+
+  for (r = 0; r < bh; r++) {
+    memcpy(dst, above, bw);
+    dst += stride;
+  }
+}
+
+static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                               const uint8_t *above, const uint8_t *left) {
+  int r;
+  (void)above;
+
+  for (r = 0; r < bh; r++) {
+    memset(dst, left[r], bw);
+    dst += stride;
+  }
+}
+
+static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }
+
+static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
+                                              uint16_t top_left) {
+  const int base = top + left - top_left;
+  const int p_left = abs_diff(base, left);
+  const int p_top = abs_diff(base, top);
+  const int p_top_left = abs_diff(base, top_left);
+
+  // Return nearest to base of left, top and top_left.
+  return (p_left <= p_top && p_left <= p_top_left)
+             ? left
+             : (p_top <= p_top_left) ? top : top_left;
+}
+
+static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+                                   int bh, const uint8_t *above,
+                                   const uint8_t *left) {
+  int r, c;
+  const uint8_t ytop_left = above[-1];
+
+  for (r = 0; r < bh; r++) {
+    for (c = 0; c < bw; c++)
+      dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left);
+    dst += stride;
+  }
+}
+
+// Some basic checks on weights for smooth predictor.
+#define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \
+                                 pred_scale)                          \
+  assert(weights_w[0] < weights_scale);                               \
+  assert(weights_h[0] < weights_scale);                               \
+  assert(weights_scale - weights_w[bw - 1] < weights_scale);          \
+  assert(weights_scale - weights_h[bh - 1] < weights_scale);          \
+  assert(pred_scale < 31)  // ensures no overflow when calculating predictor.
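+// divide_round() below divides by 2^bits with rounding to nearest rather than
+// truncating, e.g. divide_round(384, 8) == 2 (384 / 256 = 1.5 rounds up),
+// whereas 384 >> 8 == 1.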
+ +#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits)) + +static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + // scale = 2 * 2^sm_weight_log2_scale + const int log2_scale = 1 + sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale, + log2_scale + sizeof(*dst)); + int r; + for (r = 0; r < bh; ++r) { + int c; + for (c = 0; c < bw; ++c) { + const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred }; + const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r], + sm_weights_w[c], scale - sm_weights_w[c] }; + uint32_t this_pred = 0; + int i; + assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]); + for (i = 0; i < 4; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint8_t *const sm_weights = sm_weight_arrays + bh; + // scale = 2^sm_weight_log2_scale + const int log2_scale = sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bh; r++) { + int c; + for (c = 0; c < bw; ++c) { + const uint8_t pixels[] = { above[c], below_pred }; + const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[r]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights = sm_weight_arrays + bw; + // scale = 2^sm_weight_log2_scale + const int log2_scale = sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bh; r++) { + int c; + for (c = 0; c < bw; ++c) { + const uint8_t pixels[] = { left[r], right_pred }; + const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[c]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + int r; + (void)above; + (void)left; + + for (r = 0; r < bh; r++) { + memset(dst, 128, bw); + dst += stride; + } +} + +static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void)above; + + for (i = 0; i < bh; i++) sum += left[i]; 
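+  // Round-to-nearest average of the left neighboring column.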
+ expected_dc = (sum + (bh >> 1)) / bh; + + for (r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void)left; + + for (i = 0; i < bw; i++) sum += above[i]; + expected_dc = (sum + (bw >> 1)) / bw; + + for (r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; + const int count = bw + bh; + + for (i = 0; i < bw; i++) { + sum += above[i]; + } + for (i = 0; i < bh; i++) { + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier, int shift2) { + const int interm = num >> shift1; + return interm * multiplier >> shift2; +} + +// The constants (multiplier and shifts) for a given block size are obtained +// as follows: +// - Let sum_w_h = block width + block height. +// - Shift 'sum_w_h' right until we reach an odd number. Let the number of +// shifts for that block size be called 'shift1' (see the parameter in +// dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2 +// possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect +// block]. +// - Find multipliers for (i) dividing by 3, and (ii) dividing by 5, +// using the "Algorithm 1" in: +// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632 +// by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd +// shift will be 16, regardless of the block size. + +// Note: For low bitdepth, assembly code may be optimized by using smaller +// constants for smaller block sizes, where the range of the 'sum' is +// restricted to fewer bits. 
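+// Worked example (illustrative): for a 4x8 block, bw + bh = 12 = 3 << 2, so
+// shift1 = 2 and d = 3. With all 12 border samples equal to 100, sum = 1200,
+// the rounded numerator is 1206, and (1206 >> 2) * 0x5556 >> 16 =
+// 301 * 21846 >> 16 = 100, which matches (1200 + 6) / 12.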
+ +#define DC_MULTIPLIER_1X2 0x5556 +#define DC_MULTIPLIER_1X4 0x3334 + +#define DC_SHIFT2 16 + +static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left, int shift1, + int multiplier) { + int sum = 0; + + for (int i = 0; i < bw; i++) { + sum += above[i]; + } + for (int i = 0; i < bh; i++) { + sum += left[i]; + } + + const int expected_dc = divide_using_multiply_shift( + sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2); + assert(expected_dc < (1 << 8)); + + for (int r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } +} + +#undef DC_SHIFT2 + +void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4); +} + +void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2); +} + +void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2); +} + +#undef DC_MULTIPLIER_1X2 +#undef DC_MULTIPLIER_1X4 + +static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)left; + (void)bd; 
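+  // Vertical prediction: replicate the row of above neighbors down the block.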
+ for (r = 0; r < bh; r++) { + memcpy(dst, above, bw * sizeof(uint16_t)); + dst += stride; + } +} + +static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)above; + (void)bd; + for (r = 0; r < bh; r++) { + aom_memset16(dst, left[r], bw); + dst += stride; + } +} + +static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + const uint16_t ytop_left = above[-1]; + (void)bd; + + for (r = 0; r < bh; r++) { + for (c = 0; c < bw; c++) + dst[c] = paeth_predictor_single(left[r], above[c], ytop_left); + dst += stride; + } +} + +static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + // scale = 2 * 2^sm_weight_log2_scale + const int log2_scale = 1 + sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale, + log2_scale + sizeof(*dst)); + int r; + for (r = 0; r < bh; ++r) { + int c; + for (c = 0; c < bw; ++c) { + const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred }; + const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r], + sm_weights_w[c], scale - sm_weights_w[c] }; + uint32_t this_pred = 0; + int i; + assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]); + for (i = 0; i < 4; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint8_t *const sm_weights = sm_weight_arrays + bh; + // scale = 2^sm_weight_log2_scale + const int log2_scale = sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bh; r++) { + int c; + for (c = 0; c < bw; ++c) { + const uint16_t pixels[] = { above[c], below_pred }; + const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[r]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights = sm_weight_arrays + bw; + // scale = 2^sm_weight_log2_scale + const int log2_scale = sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bh; r++) { + int c; + for (c = 0; c < bw; ++c) { + const uint16_t pixels[] = { left[r], right_pred }; + const 
uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[c]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = divide_round(this_pred, log2_scale); + } + dst += stride; + } +} + +static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)above; + (void)left; + + for (r = 0; r < bh; r++) { + aom_memset16(dst, 128 << (bd - 8), bw); + dst += stride; + } +} + +static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void)above; + (void)bd; + + for (i = 0; i < bh; i++) sum += left[i]; + expected_dc = (sum + (bh >> 1)) / bh; + + for (r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void)left; + (void)bd; + + for (i = 0; i < bw; i++) sum += above[i]; + expected_dc = (sum + (bw >> 1)) / bw; + + for (r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + const int count = bw + bh; + (void)bd; + + for (i = 0; i < bw; i++) { + sum += above[i]; + } + for (i = 0; i < bh; i++) { + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +// Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but +// assume 2nd shift of 17 bits instead of 16. +// Note: Strictly speaking, 2nd shift needs to be 17 only when: +// - bit depth == 12, and +// - bw + bh is divisible by 5 (as opposed to divisible by 3). +// All other cases can use half the multipliers with a shift of 16 instead. +// This special optimization can be used when writing assembly code. +#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB +// Note: This constant is odd, but a smaller even constant (0x199a) with the +// appropriate shift should work for neon in 8/10-bit. 
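+// Sanity check (illustrative): 0xAAAB / 2^17 = 43691 / 131072 ~= 1/3 and
+// 0x6667 / 2^17 = 26215 / 131072 ~= 1/5, which is accurate enough for the
+// 12-bit worst-case sums.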
+#define HIGHBD_DC_MULTIPLIER_1X4 0x6667 + +#define HIGHBD_DC_SHIFT2 17 + +static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd, + int shift1, uint32_t multiplier) { + int sum = 0; + (void)bd; + + for (int i = 0; i < bw; i++) { + sum += above[i]; + } + for (int i = 0; i < bh; i++) { + sum += left[i]; + } + + const int expected_dc = divide_using_multiply_shift( + sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2); + assert(expected_dc < (1 << bd)); + + for (int r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +#undef HIGHBD_DC_SHIFT2 + +void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const 
uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5, + HIGHBD_DC_MULTIPLIER_1X2); +} + +#undef HIGHBD_DC_MULTIPLIER_1X2 +#undef HIGHBD_DC_MULTIPLIER_1X4 + +// This serves as a wrapper function, so that all the prediction functions +// can be unified and accessed as a pointer array. Note that the boundary +// above and left are not necessarily used all the time. +#define intra_pred_sized(type, width, height) \ + void aom_##type##_predictor_##width##x##height##_c( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \ + const uint8_t *left) { \ + type##_predictor(dst, stride, width, height, above, left); \ + } + +#define intra_pred_highbd_sized(type, width, height) \ + void aom_highbd_##type##_predictor_##width##x##height##_c( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \ + } + +/* clang-format off */ +#define intra_pred_rectangular(type) \ + intra_pred_sized(type, 4, 8) \ + intra_pred_sized(type, 8, 4) \ + intra_pred_sized(type, 8, 16) \ + intra_pred_sized(type, 16, 8) \ + intra_pred_sized(type, 16, 32) \ + intra_pred_sized(type, 32, 16) \ + intra_pred_sized(type, 32, 64) \ + intra_pred_sized(type, 64, 32) \ + intra_pred_sized(type, 4, 16) \ + intra_pred_sized(type, 16, 4) \ + intra_pred_sized(type, 8, 32) \ + intra_pred_sized(type, 32, 8) \ + intra_pred_sized(type, 16, 64) \ + intra_pred_sized(type, 64, 16) \ + intra_pred_highbd_sized(type, 4, 8) \ + intra_pred_highbd_sized(type, 8, 4) \ + intra_pred_highbd_sized(type, 8, 16) \ + intra_pred_highbd_sized(type, 16, 8) \ + intra_pred_highbd_sized(type, 16, 32) \ + intra_pred_highbd_sized(type, 32, 16) \ + intra_pred_highbd_sized(type, 32, 64) \ + intra_pred_highbd_sized(type, 64, 32) \ + intra_pred_highbd_sized(type, 4, 16) \ + intra_pred_highbd_sized(type, 16, 4) \ + intra_pred_highbd_sized(type, 8, 32) \ + intra_pred_highbd_sized(type, 32, 8) \ + intra_pred_highbd_sized(type, 16, 64) \ + intra_pred_highbd_sized(type, 64, 16) +#define intra_pred_above_4x4(type) \ + intra_pred_sized(type, 8, 8) \ + intra_pred_sized(type, 16, 16) \ + intra_pred_sized(type, 32, 32) \ + intra_pred_sized(type, 64, 64) \ + intra_pred_highbd_sized(type, 4, 4) \ + intra_pred_highbd_sized(type, 8, 8) \ + intra_pred_highbd_sized(type, 16, 16) \ + intra_pred_highbd_sized(type, 32, 32) \ + intra_pred_highbd_sized(type, 64, 64) \ + intra_pred_rectangular(type) +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 4, 4) \ + intra_pred_above_4x4(type) +#define intra_pred_square(type) \ + intra_pred_sized(type, 4, 4) \ + intra_pred_sized(type, 8, 8) \ + intra_pred_sized(type, 16, 16) \ + intra_pred_sized(type, 32, 32) \ + intra_pred_sized(type, 64, 64) \ + intra_pred_highbd_sized(type, 4, 4) \ + intra_pred_highbd_sized(type, 8, 8) \ + intra_pred_highbd_sized(type, 16, 16) \ + intra_pred_highbd_sized(type, 32, 32) \ + intra_pred_highbd_sized(type, 64, 64) + +intra_pred_allsizes(v) +intra_pred_allsizes(h) +intra_pred_allsizes(smooth) +intra_pred_allsizes(smooth_v) +intra_pred_allsizes(smooth_h) +intra_pred_allsizes(paeth) +intra_pred_allsizes(dc_128) +intra_pred_allsizes(dc_left) +intra_pred_allsizes(dc_top) +intra_pred_square(dc) +/* clang-format on */ +#undef 
intra_pred_allsizes diff --git a/libs/libaom/src/aom_dsp/intrapred_common.h b/libs/libaom/src/aom_dsp/intrapred_common.h new file mode 100644 index 000000000..3ec62a86e --- /dev/null +++ b/libs/libaom/src/aom_dsp/intrapred_common.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_ +#define AOM_AOM_DSP_INTRAPRED_COMMON_H_ + +#include "config/aom_config.h" + +// Weights are quadratic from '1' to '1 / block_size', scaled by +// 2^sm_weight_log2_scale. +static const int sm_weight_log2_scale = 8; + +// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST]) +#define MAX_BLOCK_DIM 64 + +/* clang-format off */ +static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { + // Unused, because we always offset by bs, which is at least 2. + 0, 0, + // bs = 2 + 255, 128, + // bs = 4 + 255, 149, 85, 64, + // bs = 8 + 255, 197, 146, 105, 73, 50, 37, 32, + // bs = 16 + 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, + // bs = 32 + 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, + 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, + // bs = 64 + 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, + 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, + 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, + 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4, +}; +/* clang-format on */ + +#endif // AOM_AOM_DSP_INTRAPRED_COMMON_H_ diff --git a/libs/libaom/src/aom_dsp/loopfilter.c b/libs/libaom/src/aom_dsp/loopfilter.c new file mode 100644 index 000000000..903ebcd7c --- /dev/null +++ b/libs/libaom/src/aom_dsp/loopfilter.c @@ -0,0 +1,929 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+static INLINE int8_t signed_char_clamp(int t) {
+  return (int8_t)clamp(t, -128, 127);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int16_t signed_char_clamp_high(int t, int bd) {
+  switch (bd) {
+    case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
+    case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
+    case 8:
+    default: return (int16_t)clamp(t, -128, 128 - 1);
+  }
+}
+#endif
+
+// should we apply any filter at all: 11111111 yes, 00000000 no
+static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
+                                  uint8_t p0, uint8_t q0, uint8_t q1) {
+  int8_t mask = 0;
+  mask |= (abs(p1 - p0) > limit) * -1;
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
+                                 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
+                                 uint8_t q1, uint8_t q2, uint8_t q3) {
+  int8_t mask = 0;
+  mask |= (abs(p3 - p2) > limit) * -1;
+  mask |= (abs(p2 - p1) > limit) * -1;
+  mask |= (abs(p1 - p0) > limit) * -1;
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(q2 - q1) > limit) * -1;
+  mask |= (abs(q3 - q2) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit,
+                                         uint8_t p2, uint8_t p1, uint8_t p0,
+                                         uint8_t q0, uint8_t q1, uint8_t q2) {
+  int8_t mask = 0;
+  mask |= (abs(p2 - p1) > limit) * -1;
+  mask |= (abs(p1 - p0) > limit) * -1;
+  mask |= (abs(q1 - q0) > limit) * -1;
+  mask |= (abs(q2 - q1) > limit) * -1;
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
+                                       uint8_t p0, uint8_t q0, uint8_t q1,
+                                       uint8_t q2) {
+  int8_t mask = 0;
+  mask |= (abs(p1 - p0) > thresh) * -1;
+  mask |= (abs(q1 - q0) > thresh) * -1;
+  mask |= (abs(p2 - p0) > thresh) * -1;
+  mask |= (abs(q2 - q0) > thresh) * -1;
+  return ~mask;
+}
+
+static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
+                                uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
+                                uint8_t q2, uint8_t q3) {
+  int8_t mask = 0;
+  mask |= (abs(p1 - p0) > thresh) * -1;
+  mask |= (abs(q1 - q0) > thresh) * -1;
+  mask |= (abs(p2 - p0) > thresh) * -1;
+  mask |= (abs(q2 - q0) > thresh) * -1;
+  mask |= (abs(p3 - p0) > thresh) * -1;
+  mask |= (abs(q3 - q0) > thresh) * -1;
+  return ~mask;
+}
+
+// is there high edge variance internal edge: 11111111 yes, 00000000 no
+static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
+                              uint8_t q0, uint8_t q1) {
+  int8_t hev = 0;
+  hev |= (abs(p1 - p0) > thresh) * -1;
+  hev |= (abs(q1 - q0) > thresh) * -1;
+  return hev;
+}
+
+static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
+                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
+  int8_t filter1, filter2;
+
+  const int8_t ps1 = (int8_t)(*op1 ^ 0x80);
+  const int8_t ps0 = (int8_t)(*op0 ^ 0x80);
+  const int8_t qs0 = (int8_t)(*oq0 ^ 0x80);
+  const int8_t qs1 = (int8_t)(*oq1 ^ 0x80);
+  const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
+
+  // add outer taps if we have high edge variance
+  int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
+
+  // inner taps
+  filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
+
+  // save bottom 3 bits so that we round one side +4 and the other +3
+  // if it equals 4 we'll
set to adjust by -1 to account for the fact + // we'd round 3 the other way + filter1 = signed_char_clamp(filter + 4) >> 3; + filter2 = signed_char_clamp(filter + 3) >> 3; + + *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80); + *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80); + + // outer tap adjustments + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80); + *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80); +} + +void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint8_t p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p]; + const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); + filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); + ++s; + } +} + +void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint8_t p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1]; + const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); + filter4(mask, *thresh, s - 2, s - 1, s, s + 1); + s += pitch; + } +} + +void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat, + uint8_t *op2, uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) { + if (flat && mask) { + const uint8_t p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2; + + // 5-tap filter [1, 2, 2, 2, 1] + *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); + *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); + *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); + *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); + } else { + filter4(mask, thresh, op1, op0, oq0, oq1); + } +} + +static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, + uint8_t *op3, uint8_t *op2, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3) { + if (flat && mask) { + const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 
+ p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + filter4(mask, thresh, op1, op0, oq0, oq1); + } +} + +void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p]; + + const int8_t mask = + filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2); + const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); + filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p); + ++s; + } +} + +void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1); +} + +void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p); + ++s; + } +} + +void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + for (i = 0; i < count; ++i) { + const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2]; + const int8_t mask = + filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2); + const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); + filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2); + s += pitch; + } +} + +void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + int count = 4; + + for (i = 0; i < count; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, 
q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, + s + 3); + s += pitch; + } +} + +void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} + +static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op6, uint8_t *op5, + uint8_t *op4, uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, uint8_t *oq0, + uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, + uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) { + if (flat2 && flat && mask) { + const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, + p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, + q5 = *oq5, q6 = *oq6; + + // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] + *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, + 4); + *op4 = ROUND_POWER_OF_TWO( + p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); + *op3 = ROUND_POWER_OF_TWO( + p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); + *op2 = ROUND_POWER_OF_TWO( + p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, + 4); + *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + + q0 + q1 + q2 + q3 + q4, + 4); + *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + + q0 * 2 + q1 + q2 + q3 + q4 + q5, + 4); + *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + + q1 * 2 + q2 + q3 + q4 + q5 + q6, + 4); + *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + + q2 * 2 + q3 + q4 + q5 + q6 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, + 4); + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} + +static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count) { + int i; + int step = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
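+  // Each iteration filters one pixel column, reading up to 7 pixels on each
+  // side of the edge (the 13-tap path when both flatness masks pass).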
+ for (i = 0; i < step * count; ++i) { + const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p], + p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p], + q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); + + filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p); + ++s; + } +} + +void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); +} + +void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1); + mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1); +} + +static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { + int i; + + for (i = 0; i < count; ++i) { + const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3], + p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4], + q5 = s[5], q6 = s[6]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); + + filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3, + s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6); + s += p; + } +} + +void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4); +} + +void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4); + mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4); +} + +#if CONFIG_AV1_HIGHBITDEPTH +// Should we apply any filter at all: 11111111 yes, 00000000 no ? +static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +// Should we apply any filter at all: 11111111 yes, 00000000 no ? 
+static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, + uint16_t p3, uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, uint16_t q1, + uint16_t q2, uint16_t q3, int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p3 - p2) > limit16) * -1; + mask |= (abs(p2 - p1) > limit16) * -1; + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(q2 - q1) > limit16) * -1; + mask |= (abs(q3 - q2) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit, + uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, + int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p2 - p1) > limit16) * -1; + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(q2 - q1) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, + uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, + uint16_t q2, int bd) { + int8_t mask = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p1 - p0) > thresh16) * -1; + mask |= (abs(q1 - q0) > thresh16) * -1; + mask |= (abs(p2 - p0) > thresh16) * -1; + mask |= (abs(q2 - q0) > thresh16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, uint16_t q3, + int bd) { + int8_t mask = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p1 - p0) > thresh16) * -1; + mask |= (abs(q1 - q0) > thresh16) * -1; + mask |= (abs(p2 - p0) > thresh16) * -1; + mask |= (abs(q2 - q0) > thresh16) * -1; + mask |= (abs(p3 - p0) > thresh16) * -1; + mask |= (abs(q3 - q0) > thresh16) * -1; + return ~mask; +} + +// Is there high edge variance internal edge: +// 11111111_11111111 yes, 00000000_00000000 no ? +static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, int bd) { + int16_t hev = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + hev |= (abs(p1 - p0) > thresh16) * -1; + hev |= (abs(q1 - q0) > thresh16) * -1; + return hev; +} + +static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + int bd) { + int16_t filter1, filter2; + // ^0x80 equivalent to subtracting 0x80 from the values to turn them + // into -128 to +127 instead of 0 to 255. + int shift = bd - 8; + const int16_t ps1 = (int16_t)*op1 - (0x80 << shift); + const int16_t ps0 = (int16_t)*op0 - (0x80 << shift); + const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift); + const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift); + const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); + + // Add outer taps if we have high edge variance. + int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev; + + // Inner taps. + filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; + + // Save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way. 
+ filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; + filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; + + *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift); + *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift); + + // Outer tap adjustments. + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift); + *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); +} + +void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const int8_t mask = + highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); + highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_4_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint16_t p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1]; + const int8_t mask = + highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); + highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_4_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, + uint16_t *op2, uint16_t *op1, uint16_t *op0, + uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, + int bd) { + if (flat && mask) { + const uint16_t p2 = *op2, p1 = *op1, p0 = *op0; + const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2; + + // 5-tap filter [1, 2, 2, 2, 1] + *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); + *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); + *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); + *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); + } else { + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + } +} + +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, + uint16_t *op3, uint16_t *op2, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + uint16_t *oq2, uint16_t *oq3, int bd) { + if (flat && mask) { + const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); 
+ *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + } +} + +void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, + s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p]; + + const int8_t mask = + highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd); + const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); + highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_6_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_8_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + for (i = 0; i < count; ++i) { + const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2]; + const int8_t mask = + highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd); + const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); + highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2, + bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_6_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd); + 
aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + int count = 4; + + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, + s + 2, s + 3, bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_8_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint16_t *op6, uint16_t *op5, + uint16_t *op4, uint16_t *op3, uint16_t *op2, + uint16_t *op1, uint16_t *op0, uint16_t *oq0, + uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, + uint16_t *oq4, uint16_t *oq5, uint16_t *oq6, + int bd) { + if (flat2 && flat && mask) { + const uint16_t p6 = *op6; + const uint16_t p5 = *op5; + const uint16_t p4 = *op4; + const uint16_t p3 = *op3; + const uint16_t p2 = *op2; + const uint16_t p1 = *op1; + const uint16_t p0 = *op0; + const uint16_t q0 = *oq0; + const uint16_t q1 = *oq1; + const uint16_t q2 = *oq2; + const uint16_t q3 = *oq3; + const uint16_t q4 = *oq4; + const uint16_t q5 = *oq5; + const uint16_t q6 = *oq6; + + // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] + *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, + 4); + *op4 = ROUND_POWER_OF_TWO( + p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); + *op3 = ROUND_POWER_OF_TWO( + p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); + *op2 = ROUND_POWER_OF_TWO( + p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, + 4); + *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + + q0 + q1 + q2 + q3 + q4, + 4); + *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + + q0 * 2 + q1 + q2 + q3 + q4 + q5, + 4); + *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + + q1 * 2 + q2 + q3 + q4 + q5 + q6, + 4); + *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + + q2 * 2 + q3 + q4 + q5 + q6 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, + 4); + } else { + highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + bd); + } +} + +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count, + int bd) { + int i; + int step = 4; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
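+  // This mirrors the 8-bit mb_lpf_horizontal_edge_w() one-for-one; only the
+  // sample type (uint16_t) and the << (bd - 8) threshold scaling differ.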
+ for (i = 0; i < step * count; ++i) { + const uint16_t p3 = s[-4 * p]; + const uint16_t p2 = s[-3 * p]; + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const uint16_t q2 = s[2 * p]; + const uint16_t q3 = s[3 * p]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + + const int8_t flat2 = + highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], + s[5 * p], s[6 * p], bd); + + highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd); +} + +void aom_highbd_lpf_horizontal_14_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd); + highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd); +} + +static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count, + int bd) { + int i; + + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4]; + const uint16_t p2 = s[-3]; + const uint16_t p1 = s[-2]; + const uint16_t p0 = s[-1]; + const uint16_t q0 = s[0]; + const uint16_t q1 = s[1]; + const uint16_t q2 = s[2]; + const uint16_t q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat2 = + highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd); + + highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, + s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, + s + 6, bd); + s += p; + } +} + +void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd); +} + +void aom_highbd_lpf_vertical_14_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd); + highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + 4, bd); +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/mips/aom_convolve8_horiz_msa.c b/libs/libaom/src/aom_dsp/mips/aom_convolve8_horiz_msa.c new file mode 100644 index 000000000..c8ab61249 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/aom_convolve8_horiz_msa.c @@ -0,0 +1,693 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v8i16 filt, out0, out1;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out0, out1);
+  SRARI_H2_SH(out0, out1, FILTER_BITS);
+  SAT_SH2_SH(out0, out1, 7);
+  out = PCKEV_XORI128_UB(out0, out1);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 filt0, filt1, filt2, filt3;
+  v16i8 src0, src1, src2, src3;
+  v16u8 mask0, mask1, mask2, mask3, out;
+  v8i16 filt, out0, out1, out2, out3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[16]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  src += (4 * src_stride);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out0, out1);
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filt0, filt1, filt2, filt3, out2, out3);
+  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
+  SAT_SH4_SH(out0, out1, out2, out3, 7);
+  out = PCKEV_XORI128_UB(out0, out1);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+  dst += (4 * dst_stride);
+  out = PCKEV_XORI128_UB(out2, out3);
+  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (4 == height) {
+    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+  } else if (8 == height) {
+    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
+  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
+  v8i16 filt, out0, out1, out2, out3;
+
+  mask0 = LD_UB(&mc_filt_mask_arr[0]);
+  src -= 3;
+
+  /* rearranging filter */
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  mask1 = mask0 + 2;
+  mask2 = mask0 + 4;
+  mask3 = mask0 + 6;
+
+  LD_SB4(src, src_stride, src0, src1, src2, src3);
+  XORI_B4_128_SB(src0, src1, src2, src3);
+  HORIZ_8TAP_8WID_4VECS_FILT(src0,
src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1, out2, + out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); +} + +static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (2 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); 
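+    // src0/src2 hold bytes 0..15 and 16..31 of the row; the sldi below
+    // effectively splices them so that src1 covers bytes 8..23, the shifted
+    // window the 8-tap filter needs to produce outputs 8..15.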
+ src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + + src0 = LD_SB(src + 32); + src2 = LD_SB(src + 48); + src3 = LD_SB(src + 56); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst + 32); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 48); + dst += dst_stride; + } +} + +static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, FILTER_BITS); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 vec0, vec1, vec2, vec3, filt0; + 
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16i8 res0, res1, res2, res3; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); + ST8x4_UB(src0, src1, dst, dst_stride); +} + +static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask, out0, out1; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, 
vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_SB4(src, src_stride, src0, src1, src2, src3); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); + } +} + +static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + loop_cnt = (height >> 2) - 1; + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + + for (; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + } +} + +static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + 
v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height >> 1; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + PCKEV_ST_SB(out6, out7, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src4 = LD_SB(src + 32); + src6 = LD_SB(src + 48); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + PCKEV_ST_SB(out4, out5, dst + 32); + PCKEV_ST_SB(out6, out7, dst + 48); + dst += dst_stride; + } +} + +void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0) { + switch (w) { + case 4: + common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 8: + common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 16: + common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + 
&filt_hor[3], h); + break; + case 32: + common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 64: + common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 8: + common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 16: + common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 32: + common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 64: + common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/libs/libaom/src/aom_dsp/mips/aom_convolve8_vert_msa.c b/libs/libaom/src/aom_dsp/mips/aom_convolve8_vert_msa.c new file mode 100644 index 000000000..2c3bc084c --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/aom_convolve8_vert_msa.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/aom_convolve_msa.h"
+
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
+  v16i8 src10998, filt0, filt1, filt2, filt3;
+  v16u8 out;
+  v8i16 filt, out10, out32;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
+             src4332, src6554);
+  XORI_B3_128_SB(src2110, src4332, src6554);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
+    XORI_B2_128_SB(src8776, src10998);
+    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
+                                filt1, filt2, filt3);
+    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
+                                filt1, filt2, filt3);
+    SRARI_H2_SH(out10, out32, FILTER_BITS);
+    SAT_SH2_SH(out10, out32, 7);
+    out = PCKEV_XORI128_UB(out10, out32);
+    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+    dst += (4 * dst_stride);
+
+    src2110 = src6554;
+    src4332 = src8776;
+    src6554 = src10998;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
+  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
+  v16u8 tmp0, tmp1;
+  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+  src -= (3 * src_stride);
+
+  filt = LD_SH(filter);
+  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
+
+  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
+  src += (7 * src_stride);
+  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
+             src54_r, src21_r);
+  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    LD_SB4(src, src_stride, src7, src8, src9, src10);
+    XORI_B4_128_SB(src7, src8, src9, src10);
+    src += (4 * src_stride);
+
+    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
+               src87_r, src98_r, src109_r);
+    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
+                                 filt1, filt2, filt3);
+    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
+                                 filt1, filt2, filt3);
+    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
+                                 filt1, filt2, filt3);
+    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
+                                 filt1, filt2, filt3);
+    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
+    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
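+    // PCKEV_XORI128_UB packs the two 8x16-bit results down to bytes and
+    // xors with 128, undoing the XORI_B*_128 signed-domain conversion
+    // applied to the source rows on load.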
+ tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, + tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } +} + +static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, + int32_t width) { + const 
uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src_tmp += (4 * src_stride); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 32); +} + +static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 
64); +} + +static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; + v16u8 filt0; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + src8 = LD_SB(src); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); +} + +static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = 
LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src5 = LD_UB(src + 16); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); + src += (4 * src_stride); + + DOTP_UB2_UH(vec0, vec1, filt0, 
filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB4(src, 16, src0, src3, src6, src9); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(src + 48, src_stride, src10, src11); + src += (2 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_ST_SB(tmp4, tmp5, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); + + ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + 
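/* Last 16 of the 64 columns for this row pair: the ILVR_B2_UB / ILVL_B2_UB
+ * interleaves above pair each pixel with the pixel one row below, so every
+ * DOTP_UB2_UH lane is a two-tap sum against the tap pair splatted into
+ * filt0; SRARI_H2_UH below rounds the 16-bit sums by FILTER_BITS and
+ * PCKEV_ST_SB packs them back to 8-bit pixels. */
+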
SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
+ PCKEV_ST_SB(tmp4, tmp5, dst + 48);
+
+ DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
+ SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
+ PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
+ dst += (2 * dst_stride);
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
+void aom_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ int8_t cnt, filt_ver[8];
+
+ assert(y_step_q4 == 16);
+ assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+ for (cnt = 8; cnt--;) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ /* A bilinear filter has non-zero taps only at positions 3 and 4, so a
+ * zero first tap pair selects the faster 2-tap kernels; otherwise the
+ * full 8-tap kernels are used. */
+ if (((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ default:
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 16:
+ common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 32:
+ common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ default:
+ aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h);
+ break;
+ }
+ }
+}
diff --git a/libs/libaom/src/aom_dsp/mips/aom_convolve_copy_msa.c b/libs/libaom/src/aom_dsp/mips/aom_convolve_copy_msa.c
new file mode 100644
index 000000000..f7f116f4d
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/mips/aom_convolve_copy_msa.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include "aom_dsp/mips/macros_msa.h"
+
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ for (cnt = height >> 3; cnt--;) {
+ LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 4) {
+ for (cnt = (height / 4); cnt--;) {
+ LD_UB4(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 2) {
+ for (cnt = (height / 2); cnt--;) {
+ LD_UB2(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+
+ SD(out0, dst);
+ dst += dst_stride;
+ SD(out1, dst);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height, int32_t width) {
+ int32_t cnt, loop_cnt;
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
+ src7);
+ src_tmp += (8 * src_stride);
+
+ ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
+ dst_stride);
+ dst_tmp += (8 * dst_stride);
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12);
cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); + dst += (8 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width32_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width64_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); +} + +void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int32_t filter_x_stride, + const int16_t *filter_y, int32_t filter_y_stride, + int32_t w, int32_t h) { + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + + switch (w) { + case 4: { + uint32_t cnt, tmp; + /* 1 word storage */ + for (cnt = h; cnt--;) { + tmp = LW(src); + SW(tmp, dst); + src += src_stride; + dst += dst_stride; + } + break; + } + case 8: { + copy_width8_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + copy_width16_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_width32_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_width64_msa(src, src_stride, dst, dst_stride, h); + break; + } + default: { + uint32_t cnt; + for (cnt = h; cnt--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/libs/libaom/src/aom_dsp/mips/aom_convolve_msa.h b/libs/libaom/src/aom_dsp/mips/aom_convolve_msa.h new 
file mode 100644 index 000000000..852415c20 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/aom_convolve_msa.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ +#define AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/aom_filter.h" + +extern const uint8_t mc_filt_mask_arr[16 * 3]; + +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp_dpadd_0, tmp_dpadd_1; \ + \ + tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ + tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ + tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ + \ + tmp_dpadd_0; \ + }) + +#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ + DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ + ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ + } + +#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1, out2, out3) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ + res4_m, res5_m, res6_m, res7_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ + res4_m, res5_m, 
res6_m, res7_m); \
+ ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
+ res7_m, out0, out1, out2, out3); \
+ }
+
+#endif // AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
diff --git a/libs/libaom/src/aom_dsp/mips/common_dspr2.c b/libs/libaom/src/aom_dsp/mips/common_dspr2.c
new file mode 100644
index 000000000..00ab75dc3
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/mips/common_dspr2.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/mips/common_dspr2.h"
+
+#if HAVE_DSPR2
+uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
+uint8_t *aom_ff_cropTbl;
+
+void aom_dsputil_static_init(void) {
+ int i;
+
+ for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i;
+
+ for (i = 0; i < CROP_WIDTH; i++) {
+ aom_ff_cropTbl_a[i] = 0;
+ aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
+ }
+
+ aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH];
+}
+
+#endif
diff --git a/libs/libaom/src/aom_dsp/mips/common_dspr2.h b/libs/libaom/src/aom_dsp/mips/common_dspr2.h
new file mode 100644
index 000000000..c42188d62
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/mips/common_dspr2.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if HAVE_DSPR2
+#define CROP_WIDTH 512
+
+extern uint8_t *aom_ff_cropTbl; // From "aom_dsp/mips/intrapred4_dspr2.c"
+
+static INLINE void prefetch_load(const unsigned char *src) {
+ __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store(unsigned char *dst) {
+ __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
+}
+
+static INLINE void prefetch_load_streamed(const unsigned char *src) {
+ __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src));
+}
+
+/* prefetch data for store */
+static INLINE void prefetch_store_streamed(unsigned char *dst) {
+ __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst));
+}
+#endif // #if HAVE_DSPR2
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
diff --git a/libs/libaom/src/aom_dsp/mips/convolve2_dspr2.c b/libs/libaom/src/aom_dsp/mips/convolve2_dspr2.c
new file mode 100644
index 000000000..08bf1ab30
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/mips/convolve2_dspr2.c
@@ -0,0 +1,1031 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_transposed_dspr2(
+ const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ uint8_t *dst_ptr;
+ int32_t Temp1, Temp2;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ /* The two center taps (filter_x0[3] and filter_x0[4]) are read as a single
+ * 32-bit word so dpa.w.ph can apply both in one dot product. */
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ dst_ptr = dst;
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2. pixel */
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "balign %[tp2], %[tp1], 3 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* odd 1. pixel */
+ "lbux %[tp1], %[Temp1](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp2] \n\t"
+ "preceu.ph.qbl %[p2], %[tp2] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* odd 2. pixel */
+ "lbux %[tp2], %[Temp2](%[cm]) \n\t"
+ "mtlo %[vector4a], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
+ "extp %[Temp2], $ac2, 31 \n\t"
+
+ /* clamp */
+ "lbux %[p1], %[Temp1](%[cm]) \n\t"
+ "lbux %[p2], %[Temp2](%[cm]) \n\t"
+
+ /* store bytes */
+ "sb %[tp1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p1], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[tp2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ "sb %[p2], 0(%[dst_ptr]) \n\t"
+ "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
+
+ : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
+ [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
+ : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
+ [src] "r"(src), [dst_stride] "r"(dst_stride));
+
+ /* Next row...
*/ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_8_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[Temp1], %[p3](%[cm]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[Temp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. 
pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr), + [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. 
pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... 
*/ + src_ptr += src_stride; + dst_ptr += 1; + } +} + +static void convolve_bi_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. 
pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. 
pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... 
*/
+ src_ptr += src_stride;
+ dst_ptr += 1;
+ }
+}
+
+/* Plain C fallback for widths that the DSPr2 kernels above do not cover. */
+void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter, int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int sum = 0;
+
+ sum += src[x] * filter[3];
+ sum += src[x + 1] * filter[4];
+
+ dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+
+ src += src_stride;
+ dst += 1;
+ }
+}
+
+void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter, int w,
+ int h) {
+ uint32_t pos = 38;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r"(pos));
+
+ /* prefetch data to cache memory */
+ prefetch_load(src);
+ prefetch_load(src + 32);
+
+ switch (w) {
+ case 4:
+ convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ case 8:
+ convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ case 16:
+ case 32:
+ convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h, (w / 16));
+ break;
+ case 64:
+ prefetch_load(src + 32);
+ convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
+ filter, h);
+ break;
+ default:
+ convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
+ h);
+ break;
+ }
+}
+#endif
diff --git a/libs/libaom/src/aom_dsp/mips/convolve2_horiz_dspr2.c b/libs/libaom/src/aom_dsp/mips/convolve2_horiz_dspr2.c
new file mode 100644
index 000000000..097da73ca
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/mips/convolve2_horiz_dspr2.c
@@ -0,0 +1,681 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/mips/convolve_common_dspr2.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+#if HAVE_DSPR2
+static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ const int16_t *filter_x0, int32_t h) {
+ int32_t y;
+ uint8_t *cm = aom_ff_cropTbl;
+ int32_t Temp1, Temp2, Temp3, Temp4;
+ uint32_t vector4a = 64;
+ uint32_t tp1, tp2;
+ uint32_t p1, p2;
+ const int16_t *filter = &filter_x0[3];
+ uint32_t filter45;
+
+ filter45 = ((const int32_t *)filter)[0];
+
+ for (y = h; y--;) {
+ /* prefetch data to cache memory */
+ prefetch_load(src + src_stride);
+ prefetch_load(src + src_stride + 32);
+ prefetch_store(dst + dst_stride);
+
+ __asm__ __volatile__(
+ "ulw %[tp1], 0(%[src]) \n\t"
+ "ulw %[tp2], 4(%[src]) \n\t"
+
+ /* even 1. pixel */
+ "mtlo %[vector4a], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "preceu.ph.qbr %[p1], %[tp1] \n\t"
+ "preceu.ph.qbl %[p2], %[tp1] \n\t"
+ "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
+ "extp %[Temp1], $ac3, 31 \n\t"
+
+ /* even 2.
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[p1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[p2], 3(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint32_t st0, st1; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. 
pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[p1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. 
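pixel */

+ /* Editor's note: the recurring "lbux %[stN], %[TempN](%[cm])" is a
+  * table-driven clamp.  Assuming aom_ff_cropTbl is set up as in the
+  * companion common_dspr2.c (cm[i] == 0 for i < 0, i for 0..255, 255 above,
+  * with padding on both sides), each extp + lbux pair is the scalar
+  *
+  *   int v = acc >> 7;  // acc was seeded with the rounding bias 64 (mtlo)
+  *   st = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // lbux through cm[]
+  */

+ /* odd 6.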
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. 
pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. 
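pixel */

+ /* Editor's note: this 64-wide kernel is the 16-wide kernel above with the
+  * chunk loop fixed at four 16-pixel chunks per row (c = 0..3) and extra
+  * prefetches at src + 64 and dst + 32; the per-pixel arithmetic is
+  * identical.
+  */

+ /* odd 6.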
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(x_step_q4 == 16); + + prefetch_load((const uint8_t *)filter_x); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 8: + convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 16: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); + break; + case 32: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/libs/libaom/src/aom_dsp/mips/convolve2_vert_dspr2.c b/libs/libaom/src/aom_dsp/mips/convolve2_vert_dspr2.c new file mode 100644 index 000000000..40abfd89e --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/convolve2_vert_dspr2.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
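*/

+ /* Editor's sketch, not from the original patch: per output pixel this
+  * vertical 2-tap pass computes, assuming FILTER_BITS == 7,
+  *
+  *   static uint8_t convolve_bi_vert_px_c(const uint8_t *src, int stride,
+  *                                        const int16_t *filter_y) {
+  *     int sum =
+  *         (src[0] * filter_y[3] + src[stride] * filter_y[4] + 64) >> 7;
+  *     return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+  *   }
+  *
+  * The assembly spreads four adjacent columns across the four DSP
+  * accumulators $ac0..$ac3 so their multiply-accumulates overlap.
+  */

+ /* Next row...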
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
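*/

+ /* Editor's note: aom_convolve2_vert_dspr2 below routes widths 4 through 32
+  * to convolve_bi_vert_4_dspr2 (whose inner loop already steps x by 4),
+  * keeps this unrolled variant for w == 64, and falls back to
+  * aom_convolve8_vert_c for any other width.
+  */

+ /* Next row...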
*/ + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(y_step_q4 == 16); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); + break; + case 64: + prefetch_store(dst + 32); + convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); + break; + default: + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/libs/libaom/src/aom_dsp/mips/convolve8_dspr2.c b/libs/libaom/src/aom_dsp/mips/convolve8_dspr2.c new file mode 100644 index 000000000..af54b4264 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/convolve8_dspr2.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: { + uint32_t tp1; + + /* 1 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], (%[src]) \n\t" + "sw %[tp1], (%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 8: { + uint32_t tp1, tp2; + + /* 2 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 16: { + uint32_t tp1, tp2, tp3, tp4; + + /* 4 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + 
"ulw %[tp4], 12(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 32: { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + /* 8 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 64: { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + prefetch_load(src + 64); + prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_load(src + src_stride + 64); + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 36(%[src]) \n\t" + "ulw %[tp3], 40(%[src]) \n\t" + "ulw %[tp4], 44(%[src]) \n\t" + "ulw %[tp5], 48(%[src]) \n\t" + "ulw %[tp6], 52(%[src]) \n\t" + "ulw %[tp7], 56(%[src]) \n\t" + "ulw %[tp8], 60(%[src]) \n\t" + + "sw %[tp1], 32(%[dst]) \n\t" /* store */ + "sw %[tp2], 36(%[dst]) \n\t" /* store */ + "sw %[tp3], 40(%[dst]) \n\t" /* store */ + "sw %[tp4], 44(%[dst]) \n\t" /* store */ + "sw %[tp5], 48(%[dst]) \n\t" /* store */ + "sw %[tp6], 52(%[dst]) \n\t" /* store */ + "sw %[tp7], 56(%[dst]) \n\t" /* store */ + "sw %[tp8], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + default: + for (y = h; y--;) { + for (x = 0; x < w; ++x) { + dst[x] = src[x]; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/libs/libaom/src/aom_dsp/mips/convolve8_horiz_dspr2.c 
b/libs/libaom/src/aom_dsp/mips/convolve8_horiz_dspr2.c new file mode 100644 index 000000000..f9c6879ab --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/convolve8_horiz_dspr2.c @@ -0,0 +1,879 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. 
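pixel */

+ /* Editor's sketch, not from the original patch: unlike the 2-tap kernels in
+  * convolve2_*_dspr2.c, this file consumes all eight taps as four packed
+  * halfword pairs (vector1b..vector4b).  One output pixel in scalar C,
+  * assuming FILTER_BITS == 7 and src already rewound by 3 as the caller
+  * does:
+  *
+  *   static uint8_t convolve8_px_c(const uint8_t *src,
+  *                                 const int16_t *filter_x0) {
+  *     int sum = 64;  // rounding bias
+  *     for (int k = 0; k < 8; ++k) sum += src[k] * filter_x0[k];
+  *     sum >>= 7;
+  *     return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+  *   }
+  */

+ /* odd 2.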
pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[tn1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[n2], 3(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. 
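pixel */

+ /* Editor's note: every row iteration in these kernels prefetches the next
+  * row's source at offsets 0 and +32 and the next destination row before
+  * the arithmetic starts, hiding memory latency behind the accumulator
+  * work.
+  */

+ /* even 4.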
pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[n1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... 
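*/

+ /* Editor's note: the even/odd blocks above form a three-stage software
+  * pipeline across the DSP accumulators: while one accumulator's result is
+  * extracted (extp), clamped (lbux) and stored (sb) for pixel n, the
+  * dpa.w.ph multiply-accumulates for the next pixels are already in flight
+  * on the other accumulators.
+  */

+ /* Next row...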
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. 
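pixel */

+ /* Editor's note: p1..p5 act as a sliding register window over the unpacked
+  * source halfwords; each ulw/preceu.ph.qb pair refills a register whose
+  * pixels the filter has already consumed, so sixteen outputs are produced
+  * per chunk while each source word is loaded only once per phase.
+  */

+ /* even 5.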
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. 
pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. 
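pixel */

+ /* Editor's note: the ODD-pixel half reruns the even-pixel schedule on
+  * words loaded one byte later (offsets 1, 5, 9, 13, 17, 21), i.e. the same
+  * filter window shifted right by one pixel; even results land at
+  * dst[0,2,...,14] and odd results at dst[1,3,...,15].
+  */

+ /* odd 8.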
pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. 
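pixel */

+ /* Editor's note: aom_convolve8_horiz_dspr2 at the end of this file reads
+  * the first two taps as one word and, if ((const int32_t *)filter_x)[0]
+  * == 0, hands the call to the 2-tap path aom_convolve2_horiz_dspr2;
+  * otherwise it rewinds src by 3 and dispatches on w (4/8/16/32/64),
+  * falling back to aom_convolve8_horiz_c for other widths.
+  */

+ /* odd 1.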
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. 
pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + if (((const int32_t *)filter_x)[0] == 0) { + aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + prefetch_load((const uint8_t *)filter_x); + src -= 3; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 8: + convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 16: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); + break; + case 32: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + default: + aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} +#endif diff --git a/libs/libaom/src/aom_dsp/mips/convolve8_vert_dspr2.c b/libs/libaom/src/aom_dsp/mips/convolve8_vert_dspr2.c new file mode 100644 index 000000000..201e66427 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/convolve8_vert_dspr2.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) 
\n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... 
*/ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, 
%[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + if (((const int32_t *)filter_y)[0] == 0) { + aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h); + break; + case 64: + prefetch_store(dst + 32); + convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); + break; + default: + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} + +#endif diff --git a/libs/libaom/src/aom_dsp/mips/convolve_common_dspr2.h b/libs/libaom/src/aom_dsp/mips/convolve_common_dspr2.h new file mode 100644 index 000000000..e5d48a884 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/convolve_common_dspr2.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
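Both dispatchers reinterpret the eight int16 taps as four packed int32 words, so ((const int32_t *)filter)[0] == 0 says taps 0 and 1 are both zero, which for these filter banks marks the bilinear kernel and routes to the cheap 2-tap path, while the 0x800000 assertion rejects the identity filter (tap 3 == 128, tap 2 == 0). A sketch of that packing, assuming the little-endian lane order this code relies on (function names are mine):

    #include <stdint.h>

    /* Taps (2k, 2k+1) share packed[k]; the low halfword is the even tap. */
    static int filter_is_2tap(const int16_t *filter) {
      return ((const int32_t *)filter)[0] == 0; /* taps 0 and 1 both zero */
    }

    static int filter_is_copy(const int16_t *filter) {
      /* {0, 0, 0, 128, ...}: packed[1] == 128 << 16 == 0x800000 */
      return ((const int32_t *)filter)[1] == 0x800000;
    }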
+ */ + +#ifndef AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ +#define AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ + +#include + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h); + +void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ diff --git a/libs/libaom/src/aom_dsp/mips/intrapred16_dspr2.c b/libs/libaom/src/aom_dsp/mips/intrapred16_dspr2.c new file mode 100644 index 000000000..7c221ae89 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/intrapred16_dspr2.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + + (void)above; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + "lb %[tmp9], 8(%[left]) \n\t" + "lb %[tmp10], 9(%[left]) \n\t" + "lb %[tmp11], 10(%[left]) \n\t" + "lb %[tmp12], 11(%[left]) \n\t" + "lb %[tmp13], 12(%[left]) \n\t" + "lb %[tmp14], 13(%[left]) \n\t" + "lb %[tmp15], 14(%[left]) \n\t" + "lb %[tmp16], 15(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + "replv.qb %[tmp9], %[tmp9] \n\t" + "replv.qb %[tmp10], %[tmp10] \n\t" + "replv.qb %[tmp11], %[tmp11] \n\t" + "replv.qb %[tmp12], %[tmp12] \n\t" + "replv.qb %[tmp13], %[tmp13] \n\t" + "replv.qb %[tmp14], %[tmp14] \n\t" + "replv.qb %[tmp15], %[tmp15] \n\t" + "replv.qb %[tmp16], %[tmp16] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "sw %[tmp1], 8(%[dst]) \n\t" + "sw %[tmp1], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "sw %[tmp2], 8(%[dst]) \n\t" + "sw %[tmp2], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + 
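The H predictor being unrolled here is a two-line loop in portable C: each row of the block is left[r] replicated (replv.qb broadcasts one byte into all four byte lanes of a word, and four sw stores then cover 16 pixels). Reference version:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Horizontal (H) prediction: row r of a bs x bs block is left[r]. */
    static void h_predictor_ref(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *left, int bs) {
      for (int r = 0; r < bs; ++r) {
        memset(dst, left[r], bs);
        dst += stride;
      }
    }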
"sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "sw %[tmp3], 8(%[dst]) \n\t" + "sw %[tmp3], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "sw %[tmp4], 8(%[dst]) \n\t" + "sw %[tmp4], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "sw %[tmp5], 8(%[dst]) \n\t" + "sw %[tmp5], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "sw %[tmp6], 8(%[dst]) \n\t" + "sw %[tmp6], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "sw %[tmp7], 8(%[dst]) \n\t" + "sw %[tmp7], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + "sw %[tmp8], 8(%[dst]) \n\t" + "sw %[tmp8], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp9], (%[dst]) \n\t" + "sw %[tmp9], 4(%[dst]) \n\t" + "sw %[tmp9], 8(%[dst]) \n\t" + "sw %[tmp9], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp10], (%[dst]) \n\t" + "sw %[tmp10], 4(%[dst]) \n\t" + "sw %[tmp10], 8(%[dst]) \n\t" + "sw %[tmp10], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp11], (%[dst]) \n\t" + "sw %[tmp11], 4(%[dst]) \n\t" + "sw %[tmp11], 8(%[dst]) \n\t" + "sw %[tmp11], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp12], (%[dst]) \n\t" + "sw %[tmp12], 4(%[dst]) \n\t" + "sw %[tmp12], 8(%[dst]) \n\t" + "sw %[tmp12], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp13], (%[dst]) \n\t" + "sw %[tmp13], 4(%[dst]) \n\t" + "sw %[tmp13], 8(%[dst]) \n\t" + "sw %[tmp13], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp14], (%[dst]) \n\t" + "sw %[tmp14], 4(%[dst]) \n\t" + "sw %[tmp14], 8(%[dst]) \n\t" + "sw %[tmp14], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp15], (%[dst]) \n\t" + "sw %[tmp15], 4(%[dst]) \n\t" + "sw %[tmp15], 8(%[dst]) \n\t" + "sw %[tmp15], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp16], (%[dst]) \n\t" + "sw %[tmp16], 4(%[dst]) \n\t" + "sw %[tmp16], 8(%[dst]) \n\t" + "sw %[tmp16], 12(%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9), + [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12), + [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15), + [tmp16] "=&r"(tmp16) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void aom_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, left2; + + __asm__ __volatile__( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" 
+ "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "lw %[above1], 8(%[above]) \n\t" + "lw %[above2], 12(%[above]) \n\t" + "lw %[left1], 8(%[left]) \n\t" + "lw %[left2], 12(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" + "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addiu %[average], %[average], 16 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 5 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], 
%[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1), + [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1), + [above_r1] "=&r"(above_r1), [above2] "=&r"(above2), + [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/intrapred4_dspr2.c b/libs/libaom/src/aom_dsp/mips/intrapred4_dspr2.c new file mode 100644 index 000000000..0a21979c7 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/intrapred4_dspr2.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
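All the DSPr2 DC predictors in these files compute the same quantity: the rounded mean of the bs above samples and bs left samples, flood-filled over the block. The packed addu.ph sums keep two 16-bit lane totals, and the srl/addu.ph pair folds them before the final shift. Reference version (for bs == 16 the divide is exactly the (sum + 16) >> 5 seen above):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void dc_predictor_ref(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left,
                                 int bs) {
      int sum = bs; /* rounding bias */
      for (int i = 0; i < bs; ++i) sum += above[i] + left[i];
      const uint8_t dc = (uint8_t)(sum / (2 * bs));
      for (int r = 0; r < bs; ++r) {
        memset(dst, dc, bs);
        dst += stride;
      }
    }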
+ */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4; + (void)above; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "sw %[tmp1], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; + + __asm__ __volatile__( + "lw %[above_c], (%[above]) \n\t" + "lw %[left_c], (%[left]) \n\t" + + "preceu.ph.qbl %[above_l], %[above_c] \n\t" + "preceu.ph.qbr %[above_r], %[above_c] \n\t" + "preceu.ph.qbl %[left_l], %[left_c] \n\t" + "preceu.ph.qbr %[left_r], %[left_c] \n\t" + + "addu.ph %[average], %[above_r], %[above_l] \n\t" + "addu.ph %[average], %[average], %[left_l] \n\t" + "addu.ph %[average], %[average], %[left_r] \n\t" + "addiu %[average], %[average], 4 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 3 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + + : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l), + [above_r] "=&r"(above_r), [left_c] "=&r"(left_c), + [left_l] "=&r"(left_l), [left_r] "=&r"(left_r), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/intrapred8_dspr2.c b/libs/libaom/src/aom_dsp/mips/intrapred8_dspr2.c new file mode 100644 index 000000000..d42a77c80 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/intrapred8_dspr2.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + (void)above; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; + + __asm__ __volatile__( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "preceu.ph.qbl %[above_l2], %[above2] \n\t" + "preceu.ph.qbr %[above_r2], %[above2] \n\t" + "preceu.ph.qbl %[left_l2], %[left2] \n\t" + "preceu.ph.qbr %[left_r2], %[left2] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addu.ph %[average], %[average], %[above_l2] \n\t" + "addu.ph %[average], %[average], %[above_r2] \n\t" + "addu.ph %[average], %[average], %[left_l2] \n\t" + "addu.ph %[average], %[average], %[left_r2] \n\t" + + "addiu %[average], %[average], 8 \n\t" + + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 4 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], 
%[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1), + [above_r1] "=&r"(above_r1), [left1] "=&r"(left1), + [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1), + [above2] "=&r"(above2), [above_l2] "=&r"(above_l2), + [above_r2] "=&r"(above_r2), [left2] "=&r"(left2), + [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/intrapred_msa.c b/libs/libaom/src/aom_dsp/mips/intrapred_msa.c new file mode 100644 index 000000000..9f25cc1ca --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/intrapred_msa.c @@ -0,0 +1,550 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/macros_msa.h" + +#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ + { \ + out0 = __msa_subs_u_h(out0, in0); \ + out1 = __msa_subs_u_h(out1, in1); \ + } + +static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t src_data; + + src_data = LW(src); + + SW4(src_data, src_data, src_data, src_data, dst, dst_stride); +} + +static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint32_t src_data1, src_data2; + + src_data1 = LW(src); + src_data2 = LW(src + 4); + + for (row = 8; row--;) { + SW(src_data1, dst); + SW(src_data2, (dst + 4)); + dst += dst_stride; + } +} + +static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src0; + + src0 = LD_UB(src); + + for (row = 16; row--;) { + ST_UB(src0, dst); + dst += dst_stride; + } +} + +static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src1, src2; + + src1 = LD_UB(src); + src2 = LD_UB(src + 16); + + for (row = 32; row--;) { + ST_UB2(src1, src2, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t out0, out1, out2, out3; + + out0 = src[0] * 0x01010101; + out1 = src[1] * 0x01010101; + out2 = src[2] * 0x01010101; + out3 = src[3] * 0x01010101; + + SW4(out0, out1, out2, out3, dst, dst_stride); +} + +static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + out0 = src[0] * 0x0101010101010101ull; + out1 = src[1] * 0x0101010101010101ull; + out2 = src[2] * 0x0101010101010101ull; + out3 = src[3] * 0x0101010101010101ull; + out4 = src[4] * 0x0101010101010101ull; + out5 = src[5] * 0x0101010101010101ull; + out6 = src[6] * 0x0101010101010101ull; + out7 = src[7] * 0x0101010101010101ull; + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); +} + +static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 4; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 8; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB2(src0, src0, dst, 16); + dst += dst_stride; + ST_UB2(src1, src1, dst, 16); + dst += dst_stride; + ST_UB2(src2, src2, dst, 16); + dst += dst_stride; + ST_UB2(src3, src3, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0, val1; + v16i8 store, src = { 0 }; + v8u16 sum_h; 
+ v4u32 sum_w; + v2u64 sum_d; + + val0 = LW(src_top); + val1 = LW(src_left); + INSERT_W2_SB(val0, val1, src); + sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0; + v16i8 store, data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + + val0 = LW(src); + data = (v16i8)__msa_insert_w((v4i32)data, 0, val0); + sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_w((v4i32)store, 0); + + SW4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0, val1; + v16i8 store; + v16u8 src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src_top); + val1 = LD(src_left); + INSERT_D2_UB(val0, val1, src); + sum_h = __msa_hadd_u_h(src, src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0; + v16i8 store; + v16u8 data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src); + data = (v16u8)__msa_insert_d((v2i64)data, 0, val0); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { + uint64_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_d((v2i64)store, 0); + + SD4(out, out, out, out, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + v16u8 top, left, out; + v8u16 sum_h, sum_top, sum_left; + v4u32 sum_w; + v2u64 sum_d; + + top = LD_UB(src_top); + left = LD_UB(src_left); + HADD_UB2_UH(top, left, sum_top, sum_left); + sum_h = sum_top + sum_left; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + 
ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + v16u8 data, out; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + data = LD_UB(src); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { + const v16u8 out = (v16u8)__msa_ldi_b(128); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 top0, top1, left0, left1, out; + v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src_top, 16, top0, top1); + LD_UB2(src_left, 16, left0, left1); + HADD_UB2_UH(top0, top1, sum_top0, sum_top1); + HADD_UB2_UH(left0, left1, sum_left0, sum_left1); + sum_h = sum_top0 + sum_top1; + sum_h += sum_left0 + sum_left1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 data0, data1, out; + v8u16 sum_h, sum_data0, sum_data1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src, 16, data0, data1); + HADD_UB2_UH(data0, data1, sum_data0, sum_data1); + sum_h = sum_data0 + sum_data1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t row; + const v16u8 out = (v16u8)__msa_ldi_b(128); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_4x4_msa(above, dst, y_stride); +} + +void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_8x8_msa(above, dst, y_stride); +} + +void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { 
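One kernel serves both DC_TOP and DC_LEFT in the wrappers below: intra_predict_dc_tl_* averages a single bs-sample border, and the wrapper passes either above or left. Its scalar core (illustrative):

    #include <stdint.h>

    /* Rounded mean of one bs-sample border; for bs == 16 this matches the
       srari-by-4 in intra_predict_dc_tl_16x16_msa. */
    static uint8_t dc_one_border(const uint8_t *border, int bs) {
      int sum = bs / 2; /* rounding bias */
      for (int i = 0; i < bs; ++i) sum += border[i];
      return (uint8_t)(sum / bs);
    }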
+ (void)left; + + intra_predict_vert_16x16_msa(above, dst, y_stride); +} + +void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_32x32_msa(above, dst, y_stride); +} + +void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_4x4_msa(left, dst, y_stride); +} + +void aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_8x8_msa(left, dst, y_stride); +} + +void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_16x16_msa(left, dst, y_stride); +} + +void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_32x32_msa(left, dst, y_stride); +} + +void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_4x4_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_32x32_msa(above, left, dst, y_stride); +} + +void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_4x4_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_8x8_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_16x16_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_32x32_msa(above, dst, y_stride); +} + +void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_4x4_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_8x8_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_16x16_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_32x32_msa(left, dst, y_stride); +} + +void aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_4x4_msa(dst, y_stride); +} + +void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const 
uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_8x8_msa(dst, y_stride); +} + +void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_16x16_msa(dst, y_stride); +} + +void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_32x32_msa(dst, y_stride); +} diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_16_msa.c b/libs/libaom/src/aom_dsp/mips/loopfilter_16_msa.c new file mode 100644 index 000000000..38a10e9b2 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_16_msa.c @@ -0,0 +1,1488 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_ports/mem.h" +#include "aom_dsp/mips/loopfilter_msa.h" + +int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = 
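aom_hz_lpf_t4_and_t8_16w is the usual two-tier deblocker: the 4-tap filter output stands where the flat mask is off, and the stronger smoother (AOM_FILTER8) replaces it where the mask is on; __msa_test_bz_v supplies the all-lanes-off early exit, and __msa_bmnz_v does the per-byte select. The smoother's shape, shown for the p2 output only and assuming the standard 8-pixel-support tap layout (AOM_FILTER8 itself is defined elsewhere):

    #include <stdint.h>

    /* One AOM_FILTER8-style output: a weighted average over p3..q0 with
       round-to-nearest; the other five outputs shift the tap layout. */
    static uint8_t filter8_p2(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                              uint8_t q0) {
      return (uint8_t)((3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3);
    }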
__msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 96); + + LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7); + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + src -= 3 * pitch; + ST_UB4(p2, p1, p0, q0, src, pitch); + src += (4 * pitch); + ST_UB2(q1, q2, src, pitch); + } else { + src -= 7 * pitch; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += pitch; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += pitch; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4); + + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l 
-= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += pitch; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += pitch; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q2 */ + filter8 = 
LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += pitch; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += pitch; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += pitch; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + } +} + +static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { + DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); + uint8_t early_exit = 0; + + (void)count; + + early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); + + if (0 == early_exit) { + aom_hz_lpf_t16_16w(src, pitch, filter48); + } +} + +static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + if (1 == count) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + uint64_t dword0, dword1; + v16u8 flat2, mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 p0_filter16, p1_filter16; + v8i16 p2_filter8, p1_filter8, p0_filter8; + v8i16 q0_filter8, q1_filter8, q2_filter8; + v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + v8u16 tmp0, tmp1, tmp2; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = 
(v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch); + } else { + /* convert 8 bit input data into 16 bit */ + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + /* load 16 vector elements */ + LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4); + LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch); + SD(q1_d, src + pitch); + SD(q2_d, src + 2 * pitch); + } else { + /* LSB(right) 8 pixel operation */ + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5, + zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r, + q7_r); + + tmp0 = p7_r << 3; + tmp0 -= p7_r; + tmp0 += p6_r; + tmp0 += q0_r; + + src -= 7 * pitch; + + /* calculation of p6 and p5 */ + tmp1 = p6_r + p5_r + p4_r + p3_r; + tmp1 += (p2_r + p1_r + p0_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp0 = p5_r - p6_r + q1_r - p7_r; + tmp1 += tmp0; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p4 and p3 */ + tmp0 = p4_r - p5_r + q2_r - p7_r; + tmp2 = p3_r - p4_r + q3_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 
0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p2 and p1 */ + tmp0 = p2_r - p3_r + q4_r - p7_r; + tmp2 = p1_r - p2_r + q5_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p0 and q0 */ + tmp0 = (p0_r - p1_r) + (q6_r - p7_r); + tmp2 = (q7_r - p0_r) + (q0_r - p7_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q1 and q2 */ + tmp0 = q7_r - q0_r + q1_r - p6_r; + tmp2 = q7_r - q1_r + q2_r - p5_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q3 and q4 */ + tmp0 = (q7_r - q2_r) + (q3_r - p4_r); + tmp2 = (q7_r - q3_r) + (q4_r - p3_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q5 and q6 */ + tmp0 = (q7_r - q4_r) + (q5_r - p2_r); + tmp2 = (q7_r - q5_r) + (q6_r - p1_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + } + } + } else { + mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, + count); + } +} + +void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); +} + +void 
aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
+                               const uint8_t *b_limit_ptr,
+                               const uint8_t *limit_ptr,
+                               const uint8_t *thresh_ptr) {
+  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
+}
+
+static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
+  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
+  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
+         p1_org, p0_org);
+  /* 8x8 transpose */
+  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
+                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
+  /* 8x8 transpose */
+  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
+             tmp0, tmp1, tmp2, tmp3);
+  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
+  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
+  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
+  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
+  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
+
+  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
+  output += (8 * out_pitch);
+  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
+}
+
+static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
+                                   uint8_t *output, int32_t out_pitch) {
+  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+
+  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
+  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
+  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
+                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
+  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
+}
+
+static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
+                            int32_t out_pitch) {
+  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+  v4i32 tmp2, tmp3;
+
+  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
+  input += (8 * in_pitch);
+  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14,
+         row15);
+
+  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
+                      p5, p4, p3, p2, p1, p0);
+
+  /* transpose 16x8 matrix into 8x16 */
+  /* total: 8 intermediate registers and 32 instructions */
+  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
+  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
+  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
+  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
+  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
+  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
+  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
+  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
+
+  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
+  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
+  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
+
+  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
+  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
+  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
+
+  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
+  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
+  q4 =
(v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0); + tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5); + q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3); + q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4); + tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6); + q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch_org, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + /* convert 16 bit output data into 8 bit */ + p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r); + p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r); + p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r); + q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r); + q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r); + q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16i8 zero = { 0 }; + v16u8 filter8, flat, flat2; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, 
q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 tmp0_r, tmp1_r; + v8i16 r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST8x1_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST8x1_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST8x1_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST8x1_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); 
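+    /* note: each tap here follows the same running-sum scheme: tmp0_r is
+     * the sliding-window delta (one sample enters the 15-tap window, one
+     * leaves), tmp1_r accumulates it, and __msa_srari_h applies the rounded
+     * narrowing shift, i.e. out = (sum + 8) >> 4. */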
+ r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST8x1_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST8x1_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST8x1_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST8x1_UB(q6, src); + + return 0; + } +} + +void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); + + early_exit = + aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); + } + } +} + +int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, 
p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src_org -= 2; + ST4x8_UB(vec2, vec3, src_org, pitch); + src_org += 8 * pitch; + ST4x8_UB(vec4, vec5, src_org, pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, 
vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += 
q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)(tmp1_l), 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += 
q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + + return 0; + } +} + +void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + + early_exit = + aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_4_msa.c b/libs/libaom/src/aom_dsp/mips/loopfilter_4_msa.c new file mode 100644 index 000000000..dc0a97764 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_4_msa.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/loopfilter_msa.h" + +void aom_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint64_t p1_d, p0_d, q0_d, q1_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); +} + +void aom_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + + ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); +} + +void aom_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 mask, hev, flat, limit, thresh, b_limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v8i16 vec0, vec1, vec2, vec3; + + LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); +} + +void aom_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat; + v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, 
row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13, + row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); + ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); + + src -= 2; + + ST4x8_UB(tmp2, tmp3, src, pitch); + src += (8 * pitch); + ST4x8_UB(tmp4, tmp5, src, pitch); +} diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_8_msa.c b/libs/libaom/src/aom_dsp/mips/loopfilter_8_msa.c new file mode 100644 index 000000000..dc203e79c --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_8_msa.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/mips/loopfilter_msa.h" + +void aom_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + src -= 3 * pitch; + + SD4(p2_d, p1_d, p0_d, q0_d, src, pitch); + src += (4 * pitch); + SD(q1_d, src); + src += pitch; + SD(q2_d, src); + } +} + +void aom_lpf_horizontal_8_dual_msa( + uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh0); + tmp = (v16u8)__msa_fill_b(*thresh1); + thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh); 
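+  /* both 8-pixel edges are filtered in a single 16-lane pass: each scalar
+   * limit is splatted across a vector with __msa_fill_b and the two vectors
+   * are packed by __msa_ilvr_d, so lanes 0..7 carry the edge-0 limits and
+   * lanes 8..15 the edge-1 limits; b_limit and limit are packed the same
+   * way below. */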
+
+  b_limit = (v16u8)__msa_fill_b(*b_limit0);
+  tmp = (v16u8)__msa_fill_b(*b_limit1);
+  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);
+
+  limit = (v16u8)__msa_fill_b(*limit0);
+  tmp = (v16u8)__msa_fill_b(*limit1);
+  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                     q1_out);
+
+  if (__msa_test_bz_v(flat)) {
+    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+               q3_r);
+    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+
+    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
+    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
+    AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
+                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* store pixel values */
+    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    src -= 3 * pitch;
+
+    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
+    src += (4 * pitch);
+    ST_UB2(q1_out, q2_out, src, pitch);
+    src += (2 * pitch);
+  }
+}
+
+void aom_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
+                            const uint8_t *b_limit_ptr,
+                            const uint8_t *limit_ptr,
+                            const uint8_t *thresh_ptr) {
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p1_out, p0_out, q0_out, q1_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v16u8 zero = { 0 };
+  v8i16 vec0, vec1, vec2, vec3, vec4;
+
+  /* load vector elements */
+  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
+
+  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1,
+                     q2, q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
+  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
+  limit = (v16u8)__msa_fill_b(*limit_ptr);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                     q1_out);
+
+  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
+
+  if (__msa_test_bz_v(flat)) {
+    /* Store 4 pixels p1 - q1 */
+    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+
+    src -= 2;
+    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+    src += 4 * pitch;
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero,
+               q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
+    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
+                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
+    /* convert 16 bit output data into 8 bit */
+    PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
+                p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
+                p0_filt8_r, q0_filt8_r);
+    PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
+                q2_filt8_r);
+
+    /* store pixel values */
+    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
+    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
+    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
+    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
+    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
+    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
+
+    /* Store 6 pixels p2 - q2 */
+    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+    vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
+
+    src -= 3;
+    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec4, 0, src + 4, pitch);
+    src += (4 * pitch);
+    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+    ST2x4_UB(vec4, 4, src + 4, pitch);
+  }
+}
+
+void aom_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
+                                 const uint8_t *b_limit0, const uint8_t *limit0,
+                                 const uint8_t *thresh0,
+                                 const uint8_t *b_limit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  uint8_t *temp_src;
+  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+  v16u8 p1_out, p0_out, q0_out, q1_out;
+  v16u8 flat, mask, hev, thresh, b_limit, limit;
+  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
+  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
+  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
+  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  v16u8 zero = { 0 };
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+  temp_src = src - 4;
+
+  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
+  temp_src += (8 * pitch);
+  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
+
+  /* transpose 16x8 matrix into 8x16 */
+  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
+                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
+                      q3);
+
+  thresh = (v16u8)__msa_fill_b(*thresh0);
+  vec0 = (v8i16)__msa_fill_b(*thresh1);
+  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);
+
+  b_limit = (v16u8)__msa_fill_b(*b_limit0);
+  vec0 = (v8i16)__msa_fill_b(*b_limit1);
+  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);
+
+  limit = (v16u8)__msa_fill_b(*limit0);
+  vec0 = (v8i16)__msa_fill_b(*limit1);
+  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                     q1_out);
+
+  if (__msa_test_bz_v(flat)) {
+    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
+    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    ILVRL_H2_SH(vec1, vec0, vec4, vec5);
+
+    src -= 2;
+    ST4x8_UB(vec2, vec3, src, pitch);
+    src += 8 * pitch;
+    ST4x8_UB(vec4, vec5, src, pitch);
+  } else {
+    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
+               zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
+               q3_r);
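+    /* filter8 path: the pixels are zero-extended to 16 bits (right halves
+     * here, left halves below) so that AOM_FILTER8 can form its sums without
+     * overflow; PCKEV_B4_SH/PCKEV_B2_SH then narrow the results back to
+     * 8 bits. */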
+ AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + + /* filter8 */ + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 4, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 4, src + 4, pitch); + } +} diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.c b/libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.c new file mode 100644 index 000000000..8c41278be --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/mips/common_dspr2.h"
+#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
+#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
+#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
+#include "aom_mem/aom_mem.h"
+
+#if HAVE_DSPR2
+void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
+                                const uint8_t *blimit, const uint8_t *limit,
+                                const uint8_t *thresh) {
+  uint8_t i;
+  uint32_t mask;
+  uint32_t hev;
+  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+  uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+  uint32_t thresh_vec, flimit_vec, limit_vec;
+  uint32_t uflimit, ulimit, uthresh;
+
+  uflimit = *blimit;
+  ulimit = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__(
+      "replv.qb %[thresh_vec], %[uthresh] \n\t"
+      "replv.qb %[flimit_vec], %[uflimit] \n\t"
+      "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+        [limit_vec] "=r"(limit_vec)
+      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+  /* prefetch data for store */
+  prefetch_store(s);
+
+  /* loop filter designed to work using chars so that we can make maximum use
+     of 8 bit simd instructions. */
+  for (i = 0; i < 2; i++) {
+    sm1 = s - (pitch << 2);
+    s0 = sm1 + pitch;
+    s1 = s0 + pitch;
+    s2 = s - pitch;
+    s3 = s;
+    s4 = s + pitch;
+    s5 = s4 + pitch;
+    s6 = s5 + pitch;
+
+    __asm__ __volatile__(
+        "lw %[p1], (%[s1]) \n\t"
+        "lw %[p2], (%[s2]) \n\t"
+        "lw %[p3], (%[s3]) \n\t"
+        "lw %[p4], (%[s4]) \n\t"
+
+        : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
+        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+       mask will be zero and filtering is not needed */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+      __asm__ __volatile__(
+          "lw %[pm1], (%[sm1]) \n\t"
+          "lw %[p0], (%[s0]) \n\t"
+          "lw %[p5], (%[s5]) \n\t"
+          "lw %[p6], (%[s6]) \n\t"
+
+          : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
+          : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));
+
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
+                            p6, thresh_vec, &hev, &mask);
+
+      /* if mask == 0 then filtering is not needed */
+      if (mask) {
+        /* filtering */
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+        __asm__ __volatile__(
+            "sw %[p1], (%[s1]) \n\t"
+            "sw %[p2], (%[s2]) \n\t"
+            "sw %[p3], (%[s3]) \n\t"
+            "sw %[p4], (%[s4]) \n\t"
+
+            :
+            : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
+              [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
+      }
+    }
+
+    s = s + 4;
+  }
+}
+
+void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
+                              const uint8_t *blimit, const uint8_t *limit,
+                              const uint8_t *thresh) {
+  uint8_t i;
+  uint32_t mask, hev;
+  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+  uint8_t *s1, *s2, *s3, *s4;
+  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+  uint32_t thresh_vec, flimit_vec, limit_vec;
+  uint32_t uflimit, ulimit, uthresh;
+
+  uflimit = *blimit;
+  ulimit = *limit;
+  uthresh = *thresh;
+
+  /* create quad-byte */
+  __asm__ __volatile__(
+      "replv.qb %[thresh_vec], %[uthresh] \n\t"
+      "replv.qb %[flimit_vec], %[uflimit] \n\t"
+      "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
+        [limit_vec] "=r"(limit_vec)
+      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
+
+  /* prefetch data for store */
+  prefetch_store(s + pitch);
+
+  for (i = 0; i < 2; i++) {
+    s1 = s;
+    s2 = s + pitch;
+    s3 = s2 + pitch;
+    s4 = s3 + pitch;
+    s = s4 + pitch;
+
+    /* load quad-byte vectors
+     * memory is 4 byte aligned
+     */
+    p2 = *((uint32_t *)(s1 - 4));
+    p6 = *((uint32_t *)(s1));
+    p1 = *((uint32_t *)(s2 - 4));
+    p5 = *((uint32_t *)(s2));
+    p0 = *((uint32_t *)(s3 - 4));
+    p4 = *((uint32_t *)(s3));
+    pm1 = *((uint32_t *)(s4 - 4));
+    p3 = *((uint32_t *)(s4));
+
+    /* transpose pm1, p0, p1, p2 */
+    __asm__ __volatile__(
+        "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+        "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+        "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+        "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+        "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+        "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+        "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+        "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+        "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+        "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+        "append %[p1], %[sec3], 16 \n\t"
+        "append %[pm1], %[sec4], 16 \n\t"
+
+        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+          [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
+          [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+        :);
+
+    /* transpose p3, p4, p5, p6 */
+    __asm__ __volatile__(
+        "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+        "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+        "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+        "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+        "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+        "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+        "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+        "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+        "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+        "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+        "append %[p5], %[sec3], 16 \n\t"
+        "append %[p3], %[sec4], 16 \n\t"
+
+        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
+          [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
+          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
+        :);
+
+    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+     * mask will be zero and filtering is not needed
+     */
+    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
+                            p6, thresh_vec, &hev, &mask);
+
+      /* if mask == 0 then filtering is not needed */
+      if (mask) {
+        /* filtering */
+        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+        /* unpack processed 4x4 neighborhood
+         * don't use transpose on output data
+         * because memory isn't aligned
+         */
+        __asm__ __volatile__(
+            "sb %[p4], 1(%[s4]) \n\t"
+            "sb %[p3], 0(%[s4]) \n\t"
+            "sb %[p2], -1(%[s4]) \n\t"
+            "sb %[p1], -2(%[s4]) \n\t"
+
+            :
+            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+              [s4] "r"(s4));
+
+        __asm__ __volatile__(
+            "srl %[p4], %[p4], 8 \n\t"
+            "srl %[p3], %[p3], 8 \n\t"
+            "srl %[p2], %[p2], 8 \n\t"
+            "srl %[p1], %[p1], 8 \n\t"
+
+            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+            :);
+
+        __asm__ __volatile__(
+            "sb %[p4], 1(%[s3]) \n\t"
+            "sb %[p3], 0(%[s3]) \n\t"
+            "sb %[p2], -1(%[s3]) \n\t"
+            "sb %[p1], -2(%[s3]) \n\t"
+
+            : [p1] "+r"(p1)
+            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));
+
+        __asm__ __volatile__(
+            "srl %[p4], %[p4], 8 \n\t"
+            "srl %[p3], %[p3], 8 \n\t"
+            "srl %[p2], %[p2], 8 \n\t"
+            "srl %[p1], %[p1], 8 \n\t"
+
+            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
+            :);
+
+        __asm__ __volatile__(
+            "sb %[p4], 1(%[s2]) \n\t"
+            "sb %[p3], 0(%[s2]) \n\t"
+            "sb %[p2], -1(%[s2]) \n\t"
+            "sb %[p1], -2(%[s2]) \n\t"
+
+            :
+            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+
[s2] "r"(s2)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s1] "r"(s1)); + } + } + } +} + +void aom_lpf_horizontal_4_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_horizontal_8_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); + aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh); +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.h b/libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.h new file mode 100644 index 000000000..28f0dc35a --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_filters_dspr2.h @@ -0,0 +1,736 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+/* inputs & outputs are quad-byte vectors */
+static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
+                                uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
+  int32_t aom_filter_l, aom_filter_r;
+  int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+  int32_t subr_r, subr_l;
+  uint32_t t1, t2, HWM, t3;
+  uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+  int32_t vps1, vps0, vqs0, vqs1;
+  int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+  uint32_t N128;
+
+  N128 = 0x80808080;
+  t1 = 0x03000300;
+  t2 = 0x04000400;
+  t3 = 0x01000100;
+  HWM = 0xFF00FF00;
+
+  vps0 = (*ps0) ^ N128;
+  vps1 = (*ps1) ^ N128;
+  vqs0 = (*qs0) ^ N128;
+  vqs1 = (*qs1) ^ N128;
+
+  /* use halfword pairs instead of quad-bytes because of accuracy */
+  vps0_l = vps0 & HWM;
+  vps0_r = vps0 << 8;
+  vps0_r = vps0_r & HWM;
+
+  vps1_l = vps1 & HWM;
+  vps1_r = vps1 << 8;
+  vps1_r = vps1_r & HWM;
+
+  vqs0_l = vqs0 & HWM;
+  vqs0_r = vqs0 << 8;
+  vqs0_r = vqs0_r & HWM;
+
+  vqs1_l = vqs1 & HWM;
+  vqs1_r = vqs1 << 8;
+  vqs1_r = vqs1_r & HWM;
+
+  mask_l = mask & HWM;
+  mask_r = mask << 8;
+  mask_r = mask_r & HWM;
+
+  hev_l = hev & HWM;
+  hev_r = hev << 8;
+  hev_r = hev_r & HWM;
+
+  __asm__ __volatile__(
+      /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+      "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+      /* qs0 - ps0 */
+      "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+      "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+      /* aom_filter &= hev; */
+      "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
+      "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
+
+      /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+      "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+      "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+      "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+      "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+      "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+      "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+      "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+
+      /* aom_filter &= mask; */
+      "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
+      "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
+
+      : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
+        [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+        [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+      : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+        [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+        [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+        [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+        [HWM] "r"(HWM));
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3 */
+  __asm__ __volatile__(
+      /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
+      "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
+      "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
+
+      /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
+      "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
+      "addq_s.ph %[Filter2_r],
%[aom_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r)); + + __asm__ __volatile__( + /* (aom_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* aom_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *ps0 = vps0 ^ N128; + *ps1 = vps1 ^ N128; + *qs0 = vqs0 ^ N128; + *qs1 = vqs1 ^ N128; +} + +static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1, + uint32_t ps0, uint32_t qs0, uint32_t qs1, + uint32_t *p1_f0, uint32_t *p0_f0, + uint32_t *q0_f0, uint32_t *q1_f0) { + int32_t aom_filter_l, aom_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (ps0) ^ N128; + vps1 = (ps1) ^ N128; + vqs0 = (qs0) ^ N128; + vqs1 = (qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = 
vqs1_r & HWM;
+
+  mask_l = mask & HWM;
+  mask_r = mask << 8;
+  mask_r = mask_r & HWM;
+
+  hev_l = hev & HWM;
+  hev_r = hev << 8;
+  hev_r = hev_r & HWM;
+
+  __asm__ __volatile__(
+      /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
+      "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+      "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+      /* qs0 - ps0 */
+      "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+      "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+      /* aom_filter &= hev; */
+      "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
+      "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
+
+      /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
+      "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+      "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+      "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+      "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+      "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+      "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+      "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
+      "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
+
+      /* aom_filter &= mask; */
+      "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
+      "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
+
+      : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
+        [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
+        [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
+      : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
+        [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
+        [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
+        [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
+        [HWM] "r"(HWM));
+
+  /* save bottom 3 bits so that we round one side +4 and the other +3 */
+  __asm__ __volatile__(
+      /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
+      "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
+      "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
+
+      /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
+      "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
+      "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t"
+      "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+      "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+      "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+      "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+      "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+      "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+      /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
+      "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+      "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+      /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
+      "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+      "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+      : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
+        [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
+        [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
+        [vqs0_r] "+r"(vqs0_r)
+      : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
+        [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
+
+  __asm__ __volatile__(
+      /* (aom_filter += 1) >>= 1 */
+      "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+      "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+      /* aom_filter &= ~hev; */
+      "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+      "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+      /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
+      "addq_s.ph %[vps1_l],
%[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *p0_f0 = vps0 ^ N128; + *p1_f0 = vps1 ^ N128; + *q0_f0 = vqs0 ^ N128; + *q1_f0 = vqs1 ^ N128; +} + +static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1, + uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] 
\n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); + + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; +} + +static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, uint32_t *op2_f1, + uint32_t *op1_f1, uint32_t *op0_f1, + uint32_t *oq0_f1, uint32_t *oq1_f1, + uint32_t *oq2_f1) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] 
"=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); + + *op2_f1 = res_op2; + *op1_f1 = res_op1; + *op0_f1 = res_op0; + *oq0_f1 = res_oq0; + *oq1_f1 = res_oq1; + *oq2_f1 = res_oq2; +} + +static INLINE void wide_mbfilter_dspr2( + uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3, + uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6, + uint32_t *oq7) { + const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; + uint32_t tmp; + uint32_t add_p6toq6; + uint32_t u32Eight = 0x00080008; + + __asm__ __volatile__( + /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 + which is used most of the time */ + "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" + + : [add_p6toq6] "=&r"(add_p6toq6) + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), + [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [u32Eight] "r"(u32Eight)); + + __asm__ __volatile__( + /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + + p3 + p2 + p1 + p0 + q0, 4) */ + "shll.ph %[tmp], %[p7], 3 \n\t" + "subu.ph %[res_op6], %[tmp], %[p7] \n\t" + "addu.ph %[res_op6], %[res_op6], %[p6] \n\t" + "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q1] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q2] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q3] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q4] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q5] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q6] \n\t" + "shrl.ph %[res_op6], %[res_op6], 4 \n\t" + + /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + + p2 + p1 + p0 + q0 + q1, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op5], %[tmp], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p5] \n\t" + "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q2] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q3] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q4] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q5] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q6] \n\t" + "shrl.ph 
%[res_op5], %[res_op5], 4 \n\t" + + /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + + p1 + p0 + q0 + q1 + q2, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op4], %[tmp], %[p7] \n\t" + "addu.ph %[res_op4], %[res_op4], %[p4] \n\t" + "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q3] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q4] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q5] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q6] \n\t" + "shrl.ph %[res_op4], %[res_op4], 4 \n\t" + + /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + + p1 + p0 + q0 + q1 + q2 + q3, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op3], %[tmp], %[p3] \n\t" + "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q4] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q5] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q6] \n\t" + "shrl.ph %[res_op3], %[res_op3], 4 \n\t" + + /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + + p0 + q0 + q1 + q2 + q3 + q4, 4) */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p7] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q5] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q6] \n\t" + "shrl.ph %[res_op2], %[res_op2], 4 \n\t" + + /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + + p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op1], %[tmp], %[p1] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q6] \n\t" + "shrl.ph %[res_op1], %[res_op1], 4 \n\t" + + /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */ + "addu.ph %[res_op0], %[p7], %[p0] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" + "shrl.ph %[res_op0], %[res_op0], 4 \n\t" + + : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5), + [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp) + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [add_p6toq6] "r"(add_p6toq6)); + + *op6 = res_op6; + *op5 = res_op5; + *op4 = res_op4; + *op3 = res_op3; + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + + __asm__ __volatile__( + /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ + "addu.ph %[res_oq0], %[q7], %[q0] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t" + + /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq1], %[tmp], %[q1] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t" + + /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + + q3 + q4 + q5 + q6 + q7 * 3, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq2], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t" + "subu.ph 
%[res_oq2], %[res_oq2], %[p6] \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t" + + /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 + + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq3], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t" + "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t" + + /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 + + q4 * 2 + q5 + q6 + q7 * 5, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq4], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t" + "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t" + + /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 + + q5 * 2 + q6 + q7 * 6, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq5], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t" + "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t" + + /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + + q4 + q5 + q6 * 2 + q7 * 7, 4) */ + "shll.ph %[tmp], %[q7], 3 \n\t" + "subu.ph %[res_oq6], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" + "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" + + : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5), + [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3), + [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1), + [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp) + : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2), + [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6), + [add_p6toq6] "r"(add_p6toq6)); + + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; + *oq3 = res_oq3; + *oq4 = res_oq4; + *oq5 = res_oq5; + *oq6 = res_oq6; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_macros_dspr2.h b/libs/libaom/src/aom_dsp/mips/loopfilter_macros_dspr2.h new file mode 100644 index 000000000..62295d69d --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_macros_dspr2.h @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+#define STORE_F0()                                                        \
+  {                                                                       \
+    __asm__ __volatile__(                                                 \
+        "sb %[q1_f0], 1(%[s4]) \n\t"                                      \
+        "sb %[q0_f0], 0(%[s4]) \n\t"                                      \
+        "sb %[p0_f0], -1(%[s4]) \n\t"                                     \
+        "sb %[p1_f0], -2(%[s4]) \n\t"                                     \
+                                                                          \
+        :                                                                 \
+        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0),     \
+          [p1_f0] "r"(p1_f0), [s4] "r"(s4));                              \
+                                                                          \
+    __asm__ __volatile__(                                                 \
+        "srl %[q1_f0], %[q1_f0], 8 \n\t"                                  \
+        "srl %[q0_f0], %[q0_f0], 8 \n\t"                                  \
+        "srl %[p0_f0], %[p0_f0], 8 \n\t"                                  \
+        "srl %[p1_f0], %[p1_f0], 8 \n\t"                                  \
+                                                                          \
+        : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0),  \
+          [p1_f0] "+r"(p1_f0)                                             \
+        :);                                                               \
+                                                                          \
+    __asm__ __volatile__(                                                 \
+        "sb %[q1_f0], 1(%[s3]) \n\t"                                      \
+        "sb %[q0_f0], 0(%[s3]) \n\t"                                      \
+        "sb %[p0_f0], -1(%[s3]) \n\t"                                     \
+        "sb %[p1_f0], -2(%[s3]) \n\t"                                     \
+                                                                          \
+        : [p1_f0] "+r"(p1_f0)                                             \
+        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3),           \
+          [p0_f0] "r"(p0_f0));                                            \
+                                                                          \
+    __asm__ __volatile__(                                                 \
+        "srl %[q1_f0], %[q1_f0], 8 \n\t"                                  \
+        "srl %[q0_f0], %[q0_f0], 8 \n\t"                                  \
+        "srl %[p0_f0], %[p0_f0], 8 \n\t"                                  \
+        "srl %[p1_f0], %[p1_f0], 8 \n\t"                                  \
+                                                                          \
+        : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0),  \
+          [p1_f0] "+r"(p1_f0)                                             \
+        :);                                                               \
+                                                                          \
+    __asm__ __volatile__(                                                 \
+        "sb %[q1_f0], 1(%[s2]) \n\t"                                      \
+        "sb %[q0_f0], 0(%[s2]) \n\t"                                      \
+        "sb %[p0_f0], -1(%[s2]) \n\t"                                     \
+        "sb %[p1_f0], -2(%[s2]) \n\t"                                     \
+                                                                          \
+        :                                                                 \
+        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0),     \
+          [p1_f0] "r"(p1_f0), [s2] "r"(s2));                              \
+                                                                          \
+    __asm__ __volatile__(                                                 \
+        "srl %[q1_f0], %[q1_f0], 8 \n\t"                                  \
+        "srl %[q0_f0], %[q0_f0], 8 \n\t"                                  \
+        "srl %[p0_f0], %[p0_f0], 8 \n\t"                                  \
+        "srl %[p1_f0], %[p1_f0], 8 \n\t"                                  \
+                                                                          \
+        : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0),  \
+          [p1_f0] "+r"(p1_f0)                                             \
+        :);                                                               \
+                                                                          \
+    __asm__ __volatile__(                                                 \
+        "sb %[q1_f0], 1(%[s1]) \n\t"                                      \
+        "sb %[q0_f0], 0(%[s1]) \n\t"                                      \
+        "sb %[p0_f0], -1(%[s1]) \n\t"                                     \
+        "sb %[p1_f0], -2(%[s1]) \n\t"                                     \
+                                                                          \
+        :                                                                 \
+        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0),     \
+          [p1_f0] "r"(p1_f0), [s1] "r"(s1));                              \
+  }
+
+#define STORE_F1()                                                          \
+  {                                                                         \
+    __asm__ __volatile__(                                                   \
+        "sb %[q2_r], 2(%[s4]) \n\t"                                         \
+        "sb %[q1_r], 1(%[s4]) \n\t"                                         \
+        "sb %[q0_r], 0(%[s4]) \n\t"                                         \
+        "sb %[p0_r], -1(%[s4]) \n\t"                                        \
+        "sb %[p1_r], -2(%[s4]) \n\t"                                        \
+        "sb %[p2_r], -3(%[s4]) \n\t"                                        \
+                                                                            \
+        :                                                                   \
+        : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r),             \
+          [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r),             \
+          [s4] "r"(s4));                                                    \
+                                                                            \
+    __asm__ __volatile__(                                                   \
+        "srl %[q2_r], %[q2_r], 16 \n\t"                                     \
+        "srl %[q1_r], %[q1_r], 16 \n\t"                                     \
+        "srl %[q0_r], %[q0_r], 16 \n\t"                                     \
+        "srl %[p0_r], %[p0_r], 16 \n\t"                                     \
+        "srl %[p1_r], %[p1_r], 16 \n\t"                                     \
+        "srl %[p2_r], %[p2_r], 16 \n\t"                                     \
+                                                                            \
+        : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r),          \
+          [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r)           \
+        :);                                                                 \
+                                                                            \
+    __asm__ __volatile__(                                                   \
+        "sb %[q2_r], 2(%[s3]) \n\t"                                         \
+        "sb %[q1_r], 1(%[s3]) \n\t"                                         \
+        "sb %[q0_r], 0(%[s3]) \n\t"                                         \
+        "sb %[p0_r], -1(%[s3]) \n\t"                                        \
+        "sb %[p1_r], -2(%[s3]) \n\t"                                        \
+        "sb %[p2_r], -3(%[s3]) \n\t"                                        \
+                                                                            \
+        :                                                                   \
+        : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r),             \
+          [p0_r] "r"(p0_r), [p1_r]
"r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + \ + : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \ + [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \ + } + +#define STORE_F2() \ + { \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s4]) \n\t" \ + "sb %[q5_r], 5(%[s4]) \n\t" \ + "sb %[q4_r], 4(%[s4]) \n\t" \ + "sb %[q3_r], 3(%[s4]) \n\t" \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + "sb %[p3_r], -4(%[s4]) \n\t" \ + "sb %[p4_r], -5(%[s4]) \n\t" \ + "sb %[p5_r], -6(%[s4]) \n\t" \ + "sb %[p6_r], -7(%[s4]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_r], %[q6_r], 16 \n\t" \ + "srl %[q5_r], %[q5_r], 16 \n\t" \ + "srl %[q4_r], %[q4_r], 16 \n\t" \ + "srl %[q3_r], %[q3_r], 16 \n\t" \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + "srl %[p3_r], %[p3_r], 16 \n\t" \ + "srl %[p4_r], %[p4_r], 16 \n\t" \ + "srl %[p5_r], %[p5_r], 16 \n\t" \ + "srl %[p6_r], %[p6_r], 16 \n\t" \ + \ + : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \ + [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \ + [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \ + [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \ + [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s3]) \n\t" \ + "sb %[q5_r], 5(%[s3]) \n\t" \ + "sb %[q4_r], 4(%[s3]) \n\t" \ + "sb %[q3_r], 3(%[s3]) \n\t" \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + "sb %[p3_r], -4(%[s3]) \n\t" \ + "sb %[p4_r], -5(%[s3]) \n\t" \ + "sb %[p5_r], -6(%[s3]) \n\t" \ + "sb %[p6_r], -7(%[s3]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] 
"r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s2]) \n\t" \ + "sb %[q5_l], 5(%[s2]) \n\t" \ + "sb %[q4_l], 4(%[s2]) \n\t" \ + "sb %[q3_l], 3(%[s2]) \n\t" \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + "sb %[p3_l], -4(%[s2]) \n\t" \ + "sb %[p4_l], -5(%[s2]) \n\t" \ + "sb %[p5_l], -6(%[s2]) \n\t" \ + "sb %[p6_l], -7(%[s2]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_l], %[q6_l], 16 \n\t" \ + "srl %[q5_l], %[q5_l], 16 \n\t" \ + "srl %[q4_l], %[q4_l], 16 \n\t" \ + "srl %[q3_l], %[q3_l], 16 \n\t" \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + "srl %[p3_l], %[p3_l], 16 \n\t" \ + "srl %[p4_l], %[p4_l], 16 \n\t" \ + "srl %[p5_l], %[p5_l], 16 \n\t" \ + "srl %[p6_l], %[p6_l], 16 \n\t" \ + \ + : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \ + [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \ + [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \ + [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \ + [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s1]) \n\t" \ + "sb %[q5_l], 5(%[s1]) \n\t" \ + "sb %[q4_l], 4(%[s1]) \n\t" \ + "sb %[q3_l], 3(%[s1]) \n\t" \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + "sb %[p3_l], -4(%[s1]) \n\t" \ + "sb %[p4_l], -5(%[s1]) \n\t" \ + "sb %[p5_l], -6(%[s1]) \n\t" \ + "sb %[p6_l], -7(%[s1]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \ + } + +#define PACK_LEFT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ + "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ + "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ + "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ + "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ + "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ + "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ + "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ + \ + : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \ + [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \ + [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } + +#define PACK_LEFT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ + "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ + "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ + "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ + "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ + "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ + "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ + "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ + \ + : [p7_l] "=&r"(p7_l), [p6_l] 
"=&r"(p6_l), [p5_l] "=&r"(p5_l), \ + [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \ + [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } + +#define PACK_RIGHT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ + "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ + "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ + "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ + "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ + "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ + "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ + "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ + \ + : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \ + [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \ + [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } + +#define PACK_RIGHT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ + "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ + "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ + "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ + "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ + "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ + "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ + "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ + \ + : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \ + [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \ + [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } + +#define COMBINE_LEFT_RIGHT_0TO2() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ + "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ + "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ + "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ + "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ + "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ + \ + : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \ + [q1] "=&r"(q1), [q2] "=&r"(q2) \ + : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \ + [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \ + [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \ + [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \ + } + +#define COMBINE_LEFT_RIGHT_3TO6() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ + "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ + "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ + "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ + "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ + "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ + "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ + "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ + \ + : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \ + [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \ + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \ + [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \ + [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \ + [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \ + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \ + [q6_r] "r"(q6_r)); \ + } + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_masks_dspr2.h b/libs/libaom/src/aom_dsp/mips/loopfilter_masks_dspr2.h new file mode 100644 index 000000000..a0f57f386 
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/mips/loopfilter_masks_dspr2.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+#define AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if HAVE_DSPR2
+/* processing 4 pixels at the same time
+ * compute hev and mask in the same function */
+static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+                                         uint32_t p1, uint32_t p0, uint32_t p3,
+                                         uint32_t p2, uint32_t q0, uint32_t q1,
+                                         uint32_t q2, uint32_t q3,
+                                         uint32_t thresh, uint32_t *hev,
+                                         uint32_t *mask) {
+  uint32_t c, r, r3, r_k;
+  uint32_t s1, s2, s3;
+  uint32_t ones = 0xFFFFFFFF;
+  uint32_t hev1;
+
+  __asm__ __volatile__(
+      /* mask |= (abs(p3 - p2) > limit) */
+      "subu_s.qb %[c], %[p3], %[p2] \n\t"
+      "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+      "or %[r_k], %[r_k], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+      "or %[r], $0, %[c] \n\t"
+
+      /* mask |= (abs(p2 - p1) > limit) */
+      "subu_s.qb %[c], %[p2], %[p1] \n\t"
+      "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+      "or %[r_k], %[r_k], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+      "or %[r], %[r], %[c] \n\t"
+
+      /* mask |= (abs(p1 - p0) > limit)
+       * hev |= (abs(p1 - p0) > thresh)
+       */
+      "subu_s.qb %[c], %[p1], %[p0] \n\t"
+      "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+      "or %[r_k], %[r_k], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+      "or %[r3], $0, %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+      "or %[r], %[r], %[c] \n\t"
+
+      /* mask |= (abs(q1 - q0) > limit)
+       * hev |= (abs(q1 - q0) > thresh)
+       */
+      "subu_s.qb %[c], %[q1], %[q0] \n\t"
+      "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+      "or %[r_k], %[r_k], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+      "or %[r3], %[r3], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+      "or %[r], %[r], %[c] \n\t"
+
+      /* mask |= (abs(q2 - q1) > limit) */
+      "subu_s.qb %[c], %[q2], %[q1] \n\t"
+      "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+      "or %[r_k], %[r_k], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+      "or %[r], %[r], %[c] \n\t"
+      "sll %[r3], %[r3], 24 \n\t"
+
+      /* mask |= (abs(q3 - q2) > limit) */
+      "subu_s.qb %[c], %[q3], %[q2] \n\t"
+      "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+      "or %[r_k], %[r_k], %[c] \n\t"
+      "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+      "or %[r], %[r], %[c] \n\t"
+
+      : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
+      : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
+        [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
+        [thresh] "r"(thresh));
+
+  __asm__ __volatile__(
+      /* abs(p0 - q0) */
+      "subu_s.qb %[c], %[p0], %[q0] \n\t"
+      "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+      "wrdsp %[r3] \n\t"
+      "or %[s1], %[r_k], %[c] \n\t"
+
+      /* abs(p1 - q1) */
+      "subu_s.qb %[c], %[p1], %[q1] \n\t"
+      "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+      "pick.qb %[hev1], %[ones], $0 \n\t"
+      "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+      "or
%[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; +} + +static INLINE void filter_hev_mask_flatmask4_dspr2( + uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0, + uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) { + uint32_t c, r, r3, r_k, r_flat; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t hev1; + uint32_t flat1; + + __asm__ __volatile__( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + * flat |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + * flat |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + /* look at stall here */ + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + 
"subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3), + [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); + + __asm__ __volatile__( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; + *flat = flat1; +} + +static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t q4, uint32_t *flat2) { + uint32_t c, r, r_k, r_flat; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t flat1, flat3; + + __asm__ __volatile__( + /* flat |= (abs(p4 - p0) > thresh) */ + "subu_s.qb %[c], %[p4], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* flat |= (abs(q4 - q0) > thresh) */ + "subu_s.qb %[c], %[q4], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + "wrdsp %[r] \n\t" + "pick.qb %[flat3], $0, %[ones] \n\t" + + /* flat |= (abs(p1 - p0) > thresh) */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* flat |= (abs(q1 - q0) > thresh) */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or 
%[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ + "and %[flat1], %[flat3], %[flat1] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat), + [flat1] "=&r"(flat1), [flat3] "=&r"(flat3) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4), + [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); + + *flat2 = flat1; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_mb_dspr2.c b/libs/libaom/src/aom_dsp/mips/loopfilter_mb_dspr2.c new file mode 100644 index 000000000..b67ccfe9d --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_mb_dspr2.c @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdlib.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +void aom_lpf_horizontal_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint32_t mask; + uint32_t hev, flat; + uint8_t i; + uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p3, p2, p1, p0, q0, q1, q2, q3; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + for (i = 0; i < 2; i++) { + sp3 = s - (pitch << 2); + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + + __asm__ __volatile__( + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0)); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if ((flat != 0) && (mask != 0)) { + /* filtering */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l,
&q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + 
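+      /* Note: each 32-bit register holds four adjacent pixels, so the
+       * filtered values are written back one byte lane at a time: lanes 0
+       * and 1 come from the "right" (lower-half) results, lanes 2 and 3
+       * from the "left" results. The srl instructions below move the next
+       * lane into the low byte for the following sb stores (16 bits per
+       * lane for the packed halves, 8 bits per lane for the *_f0 values). */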
__asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } + + s = s + 4; + } +} + +void aom_lpf_vertical_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p3, p2, p1, p0, q3, q2, q1, q0; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], 
%[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat != 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], 
%[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/libs/libaom/src/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c new file mode 100644 index 000000000..34733e42e --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -0,0 +1,734 @@ +/* + * 
Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdlib.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int count) { + uint32_t mask; + uint32_t hev, flat, flat2; + uint8_t i; + uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; + uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + for (i = 0; i < (2 * count); i++) { + sp7 = s - (pitch << 3); + sp6 = sp7 + pitch; + sp5 = sp6 + pitch; + sp4 = sp5 + pitch; + sp3 = sp4 + pitch; + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + sq4 = sq3 + pitch; + sq5 = sq4 + pitch; + sq6 = sq5 + pitch; + sq7 = sq6 + pitch; + + __asm__ __volatile__( + "lw %[p7], (%[sp7]) \n\t" + "lw %[p6], (%[sp6]) \n\t" + "lw %[p5], (%[sp5]) \n\t" + "lw %[p4], (%[sp4]) \n\t" + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7)); + + __asm__ __volatile__( + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + "lw %[q4], (%[sq4]) \n\t" + "lw %[q5], (%[sq5]) \n\t" + "lw %[q6], (%[sq6]) \n\t" + "lw %[q7], (%[sq7]) \n\t" + + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0]
"r"(sq0), + [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7)); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + COMBINE_LEFT_RIGHT_0TO2() + COMBINE_LEFT_RIGHT_3TO6() + + __asm__ __volatile__( + "sw %[p6], (%[sp6]) \n\t" + "sw %[p5], (%[sp5]) \n\t" + "sw %[p4], (%[sp4]) \n\t" + "sw %[p3], (%[sp3]) \n\t" + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + + : + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6), + [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sw %[q6], (%[sq6]) \n\t" + "sw %[q5], (%[sq5]) \n\t" + "sw %[q4], (%[sq4]) \n\t" + "sw %[q3], (%[sq3]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + + : + : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6), + [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2), + [sq1] "r"(sq1), [sq0] "r"(sq0)); + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0+f1 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) 
\n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] 
"+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 + f2 */ + /* f0 function */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* f1 function */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); + + /* f2 function */ + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__( + "sb %[p6_r], (%[sp6]) \n\t" + "sb %[p5_r], (%[sp5]) \n\t" + "sb %[p4_r], (%[sp4]) \n\t" + "sb %[p3_r], (%[sp3]) \n\t" + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), + [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + "sb %[q3_r], (%[sq3]) \n\t" + "sb %[q4_r], (%[sq4]) \n\t" + "sb %[q5_r], (%[sq5]) \n\t" + "sb %[q6_r], (%[sq6]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r_f1], (%[sp2]) \n\t" + "sb %[p1_r_f1], (%[sp1]) \n\t" + "sb %[p0_r_f1], (%[sp0]) \n\t" + "sb %[q0_r_f1], (%[sq0]) \n\t" + "sb %[q1_r_f1], (%[sq1]) \n\t" + "sb %[q2_r_f1], (%[sq2]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb 
%[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r), + [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), + [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p6_r], +1(%[sp6]) \n\t" + "sb %[p5_r], +1(%[sp5]) \n\t" + "sb %[p4_r], +1(%[sp4]) \n\t" + "sb %[p3_r], +1(%[sp3]) \n\t" + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + "sb %[q3_r], +1(%[sq3]) \n\t" + "sb %[q4_r], +1(%[sq4]) \n\t" + "sb %[q5_r], +1(%[sq5]) \n\t" + "sb %[q6_r], +1(%[sq6]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r_f1], +1(%[sp2]) \n\t" + "sb %[p1_r_f1], +1(%[sp1]) \n\t" + "sb %[p0_r_f1], +1(%[sp0]) \n\t" + "sb %[q0_r_f1], +1(%[sq0]) \n\t" + "sb %[q1_r_f1], +1(%[sq1]) \n\t" + "sb %[q2_r_f1], +1(%[sq2]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] 
"r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p6_l], +2(%[sp6]) \n\t" + "sb %[p5_l], +2(%[sp5]) \n\t" + "sb %[p4_l], +2(%[sp4]) \n\t" + "sb %[p3_l], +2(%[sp3]) \n\t" + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + "sb %[q3_l], +2(%[sq3]) \n\t" + "sb %[q4_l], +2(%[sq4]) \n\t" + "sb %[q5_l], +2(%[sq5]) \n\t" + "sb %[q6_l], +2(%[sq6]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l_f1], +2(%[sp2]) \n\t" + "sb %[p1_l_f1], +2(%[sp1]) \n\t" + "sb %[p0_l_f1], +2(%[sp0]) \n\t" + "sb %[q0_l_f1], +2(%[sq0]) \n\t" + "sb %[q1_l_f1], +2(%[sq1]) \n\t" + "sb %[q2_l_f1], +2(%[sq2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); + + __asm__ __volatile__( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + 
[q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__( + "sb %[p6_l], +3(%[sp6]) \n\t" + "sb %[p5_l], +3(%[sp5]) \n\t" + "sb %[p4_l], +3(%[sp4]) \n\t" + "sb %[p3_l], +3(%[sp3]) \n\t" + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + "sb %[q3_l], +3(%[sq3]) \n\t" + "sb %[q4_l], +3(%[sq4]) \n\t" + "sb %[q5_l], +3(%[sq5]) \n\t" + "sb %[q6_l], +3(%[sq6]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3), + [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6)); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l_f1], +3(%[sp2]) \n\t" + "sb %[p1_l_f1], +3(%[sp1]) \n\t" + "sb %[p0_l_f1], +3(%[sp0]) \n\t" + "sb %[q0_l_f1], +3(%[sq0]) \n\t" + "sb %[q1_l_f1], +3(%[sq1]) \n\t" + "sb %[q2_l_f1], +3(%[sq2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } + + s = s + 4; + } +} + +void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); +} + +void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2); +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/libs/libaom/src/aom_dsp/mips/loopfilter_mb_vert_dspr2.c new file mode 100644 index 000000000..3d3f1ec97 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_mb_vert_dspr2.c @@ -0,0 +1,758 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdlib.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +void aom_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat, flat2; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[p4], -8(%[s1]) \n\t" + "lw %[p5], -8(%[s2]) \n\t" + "lw %[p6], -8(%[s3]) \n\t" + "lw %[p7], -8(%[s4]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + __asm__ __volatile__( + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + "lw %[q7], +4(%[s1]) \n\t" + "lw %[q6], +4(%[s2]) \n\t" + "lw %[q5], +4(%[s3]) \n\t" + "lw %[q4], +4(%[s4]) \n\t" + + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3]
"=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p7, p6, p5, p4 + original (when loaded from memory) + register -8 -7 -6 -5 + p4 p4_0 p4_1 p4_2 p4_3 + p5 p5_0 p5_1 p5_2 p5_3 + p6 p6_0 p6_1 p6_2 p6_3 + p7 p7_0 p7_1 p7_2 p7_3 + + after transpose + register + p4 p7_3 p6_3 p5_3 p4_3 + p5 p7_2 p6_2 p5_2 p4_2 + p6 p7_1 p6_1 p5_1 p4_1 + p7 p7_0 p6_0 p5_0 p4_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p4], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" + "precr.qb.ph %[prim4], %[p6], %[p7] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p4], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p6], %[p7], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p7], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6), + [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q4, q5, q6, q7 + original (when loaded from memory) + register +5 +6 +7 +8 + q7 q7_0 q7_1 q7_2 q7_3 + q6 q6_0 q6_1 q6_2 q6_3 + q5 q5_0 q5_1 q5_2 q5_3 + q4 q4_0 q4_1 q4_2 q4_3 + + after transpose + register + q7 q4_3 q5_3 q6_3 q7_3 + q6 q4_2 q5_2 q6_2 q7_2 + q5 q4_1 q5_1 q6_1 q7_1 + q4 q4_0 q5_0 q6_0 q7_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" + "precr.qb.ph %[prim2], %[q7], %[q6] \n\t" + "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" + "precr.qb.ph %[prim4], %[q5], %[q4] \n\t" + + "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q7], %[q6], %[sec3] \n\t" + "precrq.ph.w %[q5], %[q4], %[sec4] \n\t" + "append %[q6], %[sec3], 16 \n\t" + "append %[q4], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5), + [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + flatmask5(p7,
p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + STORE_F2() + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), 
[p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0+f1+f2 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + PACK_LEFT_0TO3() + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); + + PACK_RIGHT_0TO3() + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); + + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__( + "sb %[p6_r], -7(%[s4]) \n\t" + "sb %[p5_r], -6(%[s4]) \n\t" + "sb %[p4_r], -5(%[s4]) \n\t" + "sb %[p3_r], -4(%[s4]) \n\t" + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s4] "r"(s4)); + + __asm__ __volatile__( + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + "sb %[q3_r], +3(%[s4]) \n\t" + "sb %[q4_r], +4(%[s4]) \n\t" + "sb %[q5_r], +5(%[s4]) \n\t" + "sb %[q6_r], +6(%[s4]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] 
"r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s4] "r"(s4)); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r_f1], -3(%[s4]) \n\t" + "sb %[p1_r_f1], -2(%[s4]) \n\t" + "sb %[p0_r_f1], -1(%[s4]) \n\t" + "sb %[q0_r_f1], (%[s4]) \n\t" + "sb %[q1_r_f1], +1(%[s4]) \n\t" + "sb %[q2_r_f1], +2(%[s4]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), + [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), + [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p6_r], -7(%[s3]) \n\t" + "sb %[p5_r], -6(%[s3]) \n\t" + "sb %[p4_r], -5(%[s3]) \n\t" + "sb %[p3_r], -4(%[s3]) \n\t" + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s3] "r"(s3)); + + __asm__ __volatile__( + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + "sb %[q3_r], +3(%[s3]) \n\t" + "sb %[q4_r], +4(%[s3]) \n\t" + "sb %[q5_r], +5(%[s3]) \n\t" + "sb %[q6_r], +6(%[s3]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s3] "r"(s3)); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r_f1], -3(%[s3]) \n\t" + "sb %[p1_r_f1], -2(%[s3]) \n\t" + "sb %[p0_r_f1], -1(%[s3]) \n\t" + "sb %[q0_r_f1], (%[s3]) \n\t" + "sb %[q1_r_f1], +1(%[s3]) \n\t" + "sb %[q2_r_f1], +2(%[s3]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { 
+ __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p6_l], -7(%[s2]) \n\t" + "sb %[p5_l], -6(%[s2]) \n\t" + "sb %[p4_l], -5(%[s2]) \n\t" + "sb %[p3_l], -4(%[s2]) \n\t" + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s2] "r"(s2)); + + __asm__ __volatile__( + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + "sb %[q3_l], +3(%[s2]) \n\t" + "sb %[q4_l], +4(%[s2]) \n\t" + "sb %[q5_l], +5(%[s2]) \n\t" + "sb %[q6_l], +6(%[s2]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s2] "r"(s2)); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l_f1], -3(%[s2]) \n\t" + "sb %[p1_l_f1], -2(%[s2]) \n\t" + "sb %[p0_l_f1], -1(%[s2]) \n\t" + "sb %[q0_l_f1], (%[s2]) \n\t" + "sb %[q1_l_f1], +1(%[s2]) \n\t" + "sb %[q2_l_f1], +2(%[s2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); + + __asm__ __volatile__( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask 
& flat & flat2 & 0xFF000000) { + __asm__ __volatile__( + "sb %[p6_l], -7(%[s1]) \n\t" + "sb %[p5_l], -6(%[s1]) \n\t" + "sb %[p4_l], -5(%[s1]) \n\t" + "sb %[p3_l], -4(%[s1]) \n\t" + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s1] "r"(s1)); + + __asm__ __volatile__( + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], 1(%[s1]) \n\t" + "sb %[q2_l], 2(%[s1]) \n\t" + "sb %[q3_l], 3(%[s1]) \n\t" + "sb %[q4_l], 4(%[s1]) \n\t" + "sb %[q5_l], 5(%[s1]) \n\t" + "sb %[q6_l], 6(%[s1]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s1] "r"(s1)); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l_f1], -3(%[s1]) \n\t" + "sb %[p1_l_f1], -2(%[s1]) \n\t" + "sb %[p0_l_f1], -1(%[s1]) \n\t" + "sb %[q0_l_f1], (%[s1]) \n\t" + "sb %[q1_l_f1], +1(%[s1]) \n\t" + "sb %[q2_l_f1], +2(%[s1]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/libs/libaom/src/aom_dsp/mips/loopfilter_msa.h b/libs/libaom/src/aom_dsp/mips/loopfilter_msa.h new file mode 100644 index 000000000..54b0bb4bd --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/loopfilter_msa.h @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ +#define AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" + +#define AOM_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt = filt & (v16i8)hev_in; \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + /* combine left and right part */ \ + filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ + \ + filt = filt & (v16i8)mask_in; \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } + +#define AOM_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + \ + filt = filt & (v16i8)hev_in; \ + \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ + filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ + filt_l += q0_sub_p0_l; \ + filt_l = __msa_sat_s_h(filt_l, 7); \ + \ + filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ + filt = filt & (v16i8)mask_in; \ + \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, 
filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } + +#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp_flat4 = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp_flat4 < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } + +#define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + { \ + v16u8 tmp_flat5, zero_in = { 0 }; \ + v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ + v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ + \ + tmp_flat5 = __msa_ori_b(zero_in, 1); \ + p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ + q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ + p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ + q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ + p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ + q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ + p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ + q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ + \ + p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ + flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ + flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ + p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ + flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ + p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ + flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ + \ + flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ + flat2_out = __msa_xori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } + +#define AOM_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + { \ + v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ + \ + tmp_filt8_2 = p2_in + p1_in + p0_in; \ + tmp_filt8_0 = p3_in << 1; \ + \ + tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ + tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = q2_in + q1_in + q0_in; \ + tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ + tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ + tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ + \ + tmp_filt8_0 = q2_in + q3_in; \ + tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ + tmp_filt8_1 = q3_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_0 = tmp_filt8_2 + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = tmp_filt8_0 - p2_in; \ + tmp_filt8_0 = q1_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ + q1_filt8_out = 
(v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + } + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ + p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ + p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ + q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ + q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ + q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ + p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ + p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = thresh_in < (v16u8)flat_out; \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m >>= 1; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ + \ + mask_out = b_limit_in < p0_asub_q0_m; \ + mask_out = __msa_max_u_b(flat_out, mask_out); \ + p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ + \ + mask_out = limit_in < (v16u8)mask_out; \ + mask_out = __msa_xori_b(mask_out, 0xff); \ + } +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ diff --git a/libs/libaom/src/aom_dsp/mips/macros_msa.h b/libs/libaom/src/aom_dsp/mips/macros_msa.h new file mode 100644 index 000000000..9bfc27147 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/macros_msa.h @@ -0,0 +1,2058 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_ +#define AOM_AOM_DSP_MIPS_MACROS_MSA_H_ + +#include <msa.h> + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) + +#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) + +#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + +#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SW(...)
ST_W(v4i32, __VA_ARGS__) + +#if (__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m); \ + val1_m = LW(psrc_m + 4); \ + \ + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint64_t val_m = (val); \ + \ + __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } +#else // !(__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m_combined = 0; \ + \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m_combined = (uint64_t)(val1_m); \ + val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ + val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ + \ + val_m_combined; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" 
\ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m1 = (uint8_t *)(pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_m1); \ + SW(val1_m, pdst_m1 + 4); \ + } +#endif // (__mips_isa_rev >= 6) + +/* Description : Load 4 words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1, out2, out3 + Details : Load word in 'out0' from (psrc) + Load word in 'out1' from (psrc + stride) + Load word in 'out2' from (psrc + 2 * stride) + Load word in 'out3' from (psrc + 3 * stride) +*/ +#define LW4(psrc, stride, out0, out1, out2, out3) \ + { \ + out0 = LW((psrc)); \ + out1 = LW((psrc) + stride); \ + out2 = LW((psrc) + 2 * stride); \ + out3 = LW((psrc) + 3 * stride); \ + } + +/* Description : Load double words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load double word in 'out0' from (psrc) + Load double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) \ + { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ + } +#define LD4(psrc, stride, out0, out1, out2, out3) \ + { \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ + } + +/* Description : Store 4 words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store word from 'in0' to (pdst) + Store word from 'in1' to (pdst + stride) + Store word from 'in2' to (pdst + 2 * stride) + Store word from 'in3' to (pdst + 3 * stride) +*/ +#define SW4(in0, in1, in2, in3, pdst, stride) \ + { \ + SW(in0, (pdst)) \ + SW(in1, (pdst) + stride); \ + SW(in2, (pdst) + 2 * stride); \ + SW(in3, (pdst) + 3 * stride); \ + } + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4(in0, in1, in2, in3, pdst, stride) \ + { \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + SD(in3, (pdst) + 3 * stride); \ + } + +/* Description : Load vectors with 16 byte elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) + +#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + } +#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) + +#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + } +#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) +#define LD_SB5(...) 
LD_B5(v16i8, __VA_ARGS__) + +#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ + { \ + LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + } +#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) + +#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } +#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) + +/* Description : Load vectors with 8 halfword elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load 8 halfword elements in 'out0' from (psrc) + Load 8 halfword elements in 'out1' from (psrc + stride) +*/ +#define LD_H2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_H(RTYPE, (psrc)); \ + out1 = LD_H(RTYPE, (psrc) + (stride)); \ + } +#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) + +#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_H2(RTYPE, (psrc), stride, out0, out1); \ + LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) + +#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } +#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) + +#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + out7); \ + LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + out13, out14, out15); \ + } +#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) + +/* Description : Load 4x4 block of signed halfword elements from 1D source + data into 4 vectors (Each vector with 4 signed halfwords) + Arguments : Input - psrc + Outputs - out0, out1, out2, out3 +*/ +#define LD4x4_SH(psrc, out0, out1, out2, out3) \ + { \ + out0 = LD_SH(psrc); \ + out2 = LD_SH(psrc + 8); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + } + +/* Description : Load 2 vectors of signed word elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - signed word +*/ +#define LD_SW2(psrc, stride, out0, out1) \ + { \ + out0 = LD_SW((psrc)); \ + out1 = LD_SW((psrc) + stride); \ + } + +/* Description : Store vectors of 16 byte elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } +#define ST_UB8(...) 
ST_B8(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } +#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) + +#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_H2(RTYPE, in0, in1, (pdst), stride); \ + ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) + +#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ + ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } +#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) + +/* Description : Store vectors of word elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 word elements from 'in0' to (pdst) + Store 4 word elements from 'in1' to (pdst + stride) +*/ +#define ST_SW2(in0, in1, pdst, stride) \ + { \ + ST_SW(in0, (pdst)); \ + ST_SW(in1, (pdst) + stride); \ + } + +/* Description : Store 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Details : Index 'stidx' halfword element from 'in' vector is copied to + the GP register and stored to (pdst) + Index 'stidx+1' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + stride) + Index 'stidx+2' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 2 * stride) + Index 'stidx+3' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 3 * stride) +*/ +#define ST2x4_UB(in, stidx, pdst, stride) \ + { \ + uint16_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ + out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ + out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ + out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ + \ + SH(out0_m, pblk_2x4_m); \ + SH(out1_m, pblk_2x4_m + stride); \ + SH(out2_m, pblk_2x4_m + 2 * stride); \ + SH(out3_m, pblk_2x4_m + 3 * stride); \ + } + +/* Description : Store 4x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 word element from 'in' vector is copied to the GP + register and stored to (pdst) + Index 1 word element from 'in' vector is copied to the GP + register and stored to (pdst + stride) +*/ +#define ST4x2_UB(in, pdst, stride) \ + { \ + uint32_t out0_m, out1_m; \ + uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in, 0); \ + out1_m = __msa_copy_u_w((v4i32)in, 1); \ + \ + SW(out0_m, pblk_4x2_m); \ + SW(out1_m, pblk_4x2_m + stride); \ + } + +/* Description : Store 4x4 byte block to destination memory from input vector + Arguments : Inputs - in0, in1, pdst, stride + Details : 'Idx0' word element from input vector 'in0' is copied to the + GP register and stored to (pdst) + 'Idx1' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + stride) + 'Idx2' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 2 * stride) + 'Idx3' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST4x4_UB(in0, in1, idx0, 
idx1, idx2, idx3, pdst, stride) \ + { \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ + out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ + out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ + out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ + } +#define ST4x8_UB(in0, in1, pdst, stride) \ + { \ + uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ + \ + ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ + ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ + } + +/* Description : Store 8x1 byte block to destination memory from input vector + Arguments : Inputs - in, pdst + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) +*/ +#define ST8x1_UB(in, pdst) \ + { \ + uint64_t out0_m; \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + SD(out0_m, pdst); \ + } + +/* Description : Store 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in' vector is copied to the + GP register and stored to (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) \ + { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ + } + +/* Description : Store 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to the + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST8x4_UB(in0, in1, pdst, stride) \ + { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ + } + +/* Description : average with rounding (in0 + in1 + 1) / 2. + Arguments : Inputs - in0, in1, in2, in3, + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned byte element from 'in0' vector is added with + each unsigned byte element from 'in1' vector. Then the average + with rounding is calculated and written to 'out0' +*/ +#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ + out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ + } +#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) + +#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ + } +#define AVER_UB4_UB(...) 
AVER_UB4(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide with zero + Arguments : Inputs - in0, in1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'zero_m' vector are slid into 'in0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ + { \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ + } +#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) + +#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ + slide_val) \ + { \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ + } +#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide + Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + { \ + out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ + } +#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) +#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) + +#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ + out2, slide_val) \ + { \ + SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ + } +#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) +#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) + +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) +#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) +#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) + +#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ + out3) \ + { \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ + } +#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) +#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Unsigned byte elements from 'mult0' are multiplied with + unsigned byte elements from 'cnst0' producing a result + twice the size of input i.e. unsigned halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ + out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ + } +#define DOTP_UB2_UH(...) 
DOTP_UB2(v8u16, __VA_ARGS__) + +#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ + } +#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) + +#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ + } +#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) + +#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) + +/* Description : Dot product of word vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed word elements from 'mult0' are multiplied with + signed word elements from 'cnst0' producing a result + twice the size of input i.e. signed double word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ + } +#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) + +/* Description : Dot product & addition of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. 
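+                 (Per halfword lane i, the adjacent products are
+                 mult0[2*i] * cnst0[2*i] and mult0[2*i+1] * cnst0[2*i+1].)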
+ The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ + } +#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) + +#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product & addition of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ + } +#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) + +/* Description : Dot product & addition of double word vector elements + Arguments : Inputs - mult0, mult1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed word element from 'mult0' is multiplied with itself + producing an intermediate result twice the size of input + i.e. signed double word + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ + out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ + } +#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) + +/* Description : Minimum values between unsigned elements of + either vector are copied to the output vector + Arguments : Inputs - in0, in1, min_vec + Outputs - in place operation + Return Type - as per RTYPE + Details : Minimum of unsigned halfword element values from 'in0' and + 'min_vec' are written to output vector 'in0' +*/ +#define MIN_UH2(RTYPE, in0, in1, min_vec) \ + { \ + in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ + in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ + } +#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) + +#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ + { \ + MIN_UH2(RTYPE, in0, in1, min_vec); \ + MIN_UH2(RTYPE, in2, in3, min_vec); \ + } +#define MIN_UH4_UH(...) 
MIN_UH4(v8u16, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + between 0 & 255 + Arguments : Input - in + Output - out_m + Return Type - signed halfword +*/ +#define CLIP_SH_0_255(in) \ + ({ \ + v8i16 max_m = __msa_ldi_h(255); \ + v8i16 out_m; \ + \ + out_m = __msa_maxi_s_h((v8i16)in, 0); \ + out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ + out_m; \ + }) +#define CLIP_SH2_0_255(in0, in1) \ + { \ + in0 = CLIP_SH_0_255(in0); \ + in1 = CLIP_SH_0_255(in1); \ + } +#define CLIP_SH4_0_255(in0, in1, in2, in3) \ + { \ + CLIP_SH2_0_255(in0, in1); \ + CLIP_SH2_0_255(in2, in3); \ + } + +/* Description : Horizontal addition of 4 signed word elements of input vector + Arguments : Input - in (signed word vector) + Output - sum_m (i32 sum) + Return Type - signed word (GP) + Details : 4 signed word elements of 'in' vector are added together and + the resulting integer sum is returned +*/ +#define HADD_SW_S32(in) \ + ({ \ + v2i64 res0_m, res1_m; \ + int32_t sum_m; \ + \ + res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + res1_m = __msa_splati_d(res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ + sum_m; \ + }) + +/* Description : Horizontal addition of 8 unsigned halfword elements + Arguments : Inputs - in (unsigned halfword vector) + Outputs - sum_m (u32 sum) + Return Type - unsigned word + Details : 8 unsigned halfword elements of input vector are added + together and the resulting integer sum is returned +*/ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 res_m; \ + v2u64 res0_m, res1_m; \ + uint32_t sum_m; \ + \ + res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + res0_m = __msa_hadd_u_d(res_m, res_m); \ + res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ + sum_m; \ + }) + +/* Description : Horizontal addition of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is added to + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HADD_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) + +#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + HADD_UB2(RTYPE, in0, in1, out0, out1); \ + HADD_UB2(RTYPE, in2, in3, out2, out3); \ + } +#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) + +/* Description : Horizontal subtraction of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is subtracted from + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) + +/* Description : SAD (Sum of Absolute Difference) + Arguments : Inputs - in0, in1, ref0, ref1 + Outputs - sad_m (halfword vector) + Return Type - unsigned halfword + Details : Absolute difference of all the byte elements from 'in0' with + 'ref0' is calculated and preserved in 'diff0'. Then even-odd + pairs are added together to generate 8 halfword results. 
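+                 (Illustratively, sad_m[i] = |in0[2*i] - ref0[2*i]| +
+                 |in0[2*i+1] - ref0[2*i+1]| plus the matching terms from
+                 'in1'/'ref1', for lanes i = 0..7.)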
+*/ +#define SAD_UB2_UH(in0, in1, ref0, ref1) \ + ({ \ + v16u8 diff0_m, diff1_m; \ + v8u16 sad_m = { 0 }; \ + \ + diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ + diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ + \ + sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ + sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ + \ + sad_m; \ + }) + +/* Description : Horizontal subtraction of signed halfword vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed odd halfword element from 'in0' is subtracted from + even signed halfword element from 'in0' (pairwise) and the + word result is written to 'out0' +*/ +#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ + } +#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) + +/* Description : Set element n input vector to GPR value + Arguments : Inputs - in0, in1, in2, in3 + Output - out + Return Type - as per RTYPE + Details : Set element 0 in vector 'out' to value specified in 'in0' +*/ +#define INSERT_W2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + } +#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) + +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ + } +#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) +#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) + +#define INSERT_D2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ + } +#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) +#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) + +/* Description : Interleave even byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + } +#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) +#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) + +/* Description : Interleave even halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ + } +#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) +#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) +#define ILVEV_H2_SW(...) 
ILVEV_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave even word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ + out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ + } +#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) + +/* Description : Interleave even double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ + out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ + } +#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) + +/* Description : Interleave left half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of byte elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) +#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) +#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) +#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) + +#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) +#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) +#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) + +/* Description : Interleave left half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) +#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave left half of word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of word elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) +#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements of 'in0' and 'in1' are interleaved + and written to out0. 
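+                 For example (editorial illustration, lane 0 first in
+                 little-endian lane order):
+                   in0  = { a0, a1, ..., a15 }
+                   in1  = { b0, b1, ..., b15 }
+                   out0 = { b0, a0, b1, a1, b2, a2, b3, a3,
+                            b4, a4, b5, a5, b6, a6, b7, a7 }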
+*/ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) + +#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \ + out5, out6, out7) \ + { \ + ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3); \ + ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \ + out6, out7); \ + } +#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) +#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) + +#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) +#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) + +#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of double word elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ + } +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ + } +#define ILVR_D3_SB(...) 
ILVR_D3(v16i8, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+                out2, out3)                                                \
+  {                                                                        \
+    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
+    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
+  }
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements from 'in0' and 'in1' are
+                 interleaved and written to 'out0'; the left halves are
+                 interleaved and written to 'out1'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
+  {                                                     \
+    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+  }
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1)           \
+  {                                                     \
+    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+    out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
+  }
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1)           \
+  {                                                     \
+    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
+    out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
+  }
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Saturate the unsigned halfword element values to the max
+                 unsigned value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned halfword element from 'in0' is saturated to the
+                 value generated with (sat_val + 1) bit range.
+                 The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val)            \
+  {                                                  \
+    in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
+    in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
+  }
+#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
+
+#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
+  {                                                 \
+    SAT_UH2(RTYPE, in0, in1, sat_val);              \
+    SAT_UH2(RTYPE, in2, in3, sat_val);              \
+  }
+#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Saturate the signed halfword element values to the max
+                 signed value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each signed halfword element from 'in0' is saturated to the
+                 value generated with (sat_val + 1) bit range
+                 The results are written in place
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val)            \
+  {                                                  \
+    in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
+    in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
+  }
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
+  {                                                 \
+    SAT_SH2(RTYPE, in0, in1, sat_val);              \
+    SAT_SH2(RTYPE, in2, in3, sat_val);              \
+  }
+#define SAT_SH4_SH(...)
SAT_SH4(v8i16, __VA_ARGS__) + +/* Description : Indexed halfword element values are replicated to all + elements in output vector + Arguments : Inputs - in, idx0, idx1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'idx0' element value from 'in' vector is replicated to all + elements in 'out0' vector + Valid index range for halfword operation is 0-7 +*/ +#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ + out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ + } +#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) + +#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \ + { \ + SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ + SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ + } +#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) +#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' are copied to the left half of + 'out0' & even byte elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ + } +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) + +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) +#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) +#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) + +/* Description : Pack even halfword elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' are copied to the left half of + 'out0' & even halfword elements of 'in1' are copied to the + right half of 'out0'. +*/ +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ + } +#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) +#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) + +#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double elements of 'in0' are copied to the left half of + 'out0' & even double elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ + } +#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) +#define PCKEV_D2_SH(...) 
PCKEV_D2(v8i16, __VA_ARGS__) + +#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) + +/* Description : Each byte element is logically xor'ed with immediate 128 + Arguments : Inputs - in0, in1 + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned byte element from input vector 'in0' is + logically xor'ed with 128 and the result is stored in-place. +*/ +#define XORI_B2_128(RTYPE, in0, in1) \ + { \ + in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ + in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ + } +#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) +#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) + +#define XORI_B3_128(RTYPE, in0, in1, in2) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ + } +#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) + +#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + XORI_B2_128(RTYPE, in2, in3); \ + } +#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) +#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) + +#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ + { \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B3_128(RTYPE, in4, in5, in6); \ + } +#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) + +/* Description : Average of signed halfword elements -> (a + b) / 2 + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each signed halfword element from 'in0' is added to each + signed halfword element of 'in1' with full precision resulting + in one extra bit in the result. The result is then divided by + 2 and written to 'out0' +*/ +#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ + out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ + out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ + } +#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) + +/* Description : Addition of signed halfword elements and signed saturation + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'in0' are added to signed + halfword elements of 'in1'. The result is then signed saturated + between halfword data type range +*/ +#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ + } +#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) + +#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) + +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + the result is written in-place. 
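+                 Illustrative use (editorial sketch, assuming v8i16 inputs):
+
+                   v8i16 a = { 1, 2, 3, 4, 5, 6, 7, 8 };
+                   v8i16 b = a, c = a, d = a;
+                   SLLI_4V(a, b, c, d, 2);  // every lane is multiplied by 4
+
+                 Pairing with SRA_4V(a, b, c, d, 2) restores the original
+                 lanes provided no lane overflowed.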
+*/ +#define SLLI_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ + } + +/* Description : Arithmetic shift right all elements of vector + (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is a GP variable. +*/ +#define SRA_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ + } + +/* Description : Shift right arithmetic rounded words + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the number of bits in the corresponding element in the vector + 'shift'. The last discarded bit is added to shifted value for + rounding and the result is written in-place. + 'shift' is a vector. +*/ +#define SRAR_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ + in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ + } + +#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRAR_W2(RTYPE, in0, in1, shift) \ + SRAR_W2(RTYPE, in2, in3, shift) \ + } +#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) + +/* Description : Shift right arithmetic rounded (immediate) + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the value in 'shift'. The last discarded bit is added to the + shifted value for rounding and the result is written in-place. + 'shift' is an immediate value. +*/ +#define SRARI_H2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ + in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ + } +#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) +#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) + +#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_H2(RTYPE, in0, in1, shift); \ + SRARI_H2(RTYPE, in2, in3, shift); \ + } +#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) +#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) + +#define SRARI_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ + } +#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) + +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ + } +#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) + +/* Description : Logical shift right all elements of vector (immediate) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is an immediate value. +*/ +#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \ + { \ + out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ + out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ + out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ + out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ + } +#define SRLI_H4_SH(...) 
SRLI_H4(v8i16, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ + } +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Addition of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. +*/ +#define ADD2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ + } +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Subtraction of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in1' is subtracted from 'in0' and result is + written to 'out0'. +*/ +#define SUB2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + } +#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ + } + +/* Description : Sign extend halfword elements from right half of the vector + Arguments : Input - in (halfword vector) + Output - out (sign extended word vector) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved with same vector 'in0' to generate + 4 word elements keeping sign intact +*/ +#define UNPCK_R_SH_SW(in, out) \ + { \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h((v8i16)in, 0); \ + out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ + } + +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Input - in (unsigned byte vector) + Outputs - out0, out1 (unsigned halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH(in, out0, out1) \ + { \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ + } + +/* Description : Sign extend halfword elements from input vector and return + the result in pair of vectors + Arguments : Input - in (halfword vector) + Outputs - out0, out1 (sign extended word vectors) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 4 signed word elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 4 signed word elements in 'out1' +*/ +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + v8i16 tmp_m; \ + \ + tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ + } + +/* Description : Butterfly of 4 input vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Details : Butterfly operation +*/ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ + } + +/* Description : Butterfly of 8 input vectors + Arguments : 
Inputs  - in0 ...  in7
+                 Outputs - out0 ... out7
+   Details     : Butterfly operation
+*/
+#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+                    out3, out4, out5, out6, out7)                             \
+  {                                                                           \
+    out0 = in0 + in7;                                                         \
+    out1 = in1 + in6;                                                         \
+    out2 = in2 + in5;                                                         \
+    out3 = in3 + in4;                                                         \
+                                                                              \
+    out4 = in3 - in4;                                                         \
+    out5 = in2 - in5;                                                         \
+    out6 = in1 - in6;                                                         \
+    out7 = in0 - in7;                                                         \
+  }
+
+/* Description : Butterfly of 16 input vectors
+   Arguments   : Inputs  - in0 ...  in15
+                 Outputs - out0 ... out15
+   Details     : Butterfly operation
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,  \
+                     in11, in12, in13, in14, in15, out0, out1, out2, out3,    \
+                     out4, out5, out6, out7, out8, out9, out10, out11, out12, \
+                     out13, out14, out15)                                     \
+  {                                                                           \
+    out0 = in0 + in15;                                                        \
+    out1 = in1 + in14;                                                        \
+    out2 = in2 + in13;                                                        \
+    out3 = in3 + in12;                                                        \
+    out4 = in4 + in11;                                                        \
+    out5 = in5 + in10;                                                        \
+    out6 = in6 + in9;                                                         \
+    out7 = in7 + in8;                                                         \
+                                                                              \
+    out8 = in7 - in8;                                                         \
+    out9 = in6 - in9;                                                         \
+    out10 = in5 - in10;                                                       \
+    out11 = in4 - in11;                                                       \
+    out12 = in3 - in12;                                                       \
+    out13 = in2 - in13;                                                       \
+    out14 = in1 - in14;                                                       \
+    out15 = in0 - in15;                                                       \
+  }
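+
+/* Editorial note (illustrative sketch, not part of the upstream source):
+   each BUTTERFLY_N macro forms mirrored sums followed by mirrored
+   differences, the first stage of the usual DCT/Hadamard decomposition.
+   For example, BUTTERFLY_4 applied to scalar lanes in0..in3 = 1, 2, 3, 4
+   yields out0..out3 = 5, 5, -1, -3. A scalar equivalent for one lane:
+
+     void butterfly4(const int16_t in[4], int16_t out[4]) {
+       out[0] = in[0] + in[3];
+       out[1] = in[1] + in[2];
+       out[2] = in[1] - in[2];
+       out[3] = in[0] - in[3];
+     }
+*/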
+
+/* Description : Transpose input 8x8 byte block
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,   \
+                        out1, out2, out3, out4, out5, out6, out7)              \
+  {                                                                            \
+    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
+    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
+                                                                               \
+    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
+               tmp3_m);                                                        \
+    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                               \
+    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                               \
+    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                               \
+    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                               \
+    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                               \
+    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                               \
+  }
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+                           in8, in9, in10, in11, in12, in13, in14, in15
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - unsigned byte
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
+                            in10, in11, in12, in13, in14, in15, out0, out1,   \
+                            out2, out3, out4, out5, out6, out7)               \
+  {                                                                           \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
+    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
+                                                                              \
+    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
+    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
+    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
+    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
+                                                                              \
+    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
+    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
+    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
+    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
+    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
+    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
+    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
+    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
+                                                                              \
+    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
+    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+                                                                              \
+    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
+    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
+    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+                                                                              \
+    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
+    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+                                                                              \
+    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
+    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
+    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
+  }
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                                    \
+    v8i16 s0_m, s1_m;                                                  \
+                                                                       \
+    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                        \
+    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                               \
+    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
+    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
+  }
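+
+/* Editorial note (illustrative sketch, not part of the upstream source):
+   the 4x4 transposes here operate on a matrix held row-wise in the low
+   (right) halves of the input vectors. A scalar equivalent for reference:
+
+     void transpose4x4(const int16_t in[4][4], int16_t out[4][4]) {
+       for (int r = 0; r < 4; ++r)
+         for (int c = 0; c < 4; ++c) out[c][r] = in[r][c];
+     }
+*/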
+
+/* Description : Transpose 4x8 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+                           out2, out3, out4, out5, out6, out7)                 \
+  {                                                                            \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
+    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                      \
+    v8i16 zero_m = { 0 };                                                      \
+                                                                               \
+    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
+               tmp3_n);                                                        \
+    ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                               \
+    ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                               \
+                                                                               \
+    out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
+    out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
+    out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
+    out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
+                                                                               \
+    out4 = zero_m;                                                             \
+    out5 = zero_m;                                                             \
+    out6 = zero_m;                                                             \
+    out7 = zero_m;                                                             \
+  }
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                                    \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
+                                                                       \
+    ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                    \
+    ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                    \
+    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);            \
+    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);            \
+  }
+
+/* Description : Transpose 8x8 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - as per RTYPE
+*/
+#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+                       out1, out2, out3, out4, out5, out6, out7)            \
+  {                                                                         \
+    v8i16 s0_m, s1_m;                                                       \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
+                                                                            \
+    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
+    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                \
+    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
+    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                \
+    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
+    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                \
+    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
+    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                \
+    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
+             tmp7_m, out0, out2, out4, out6);                               \
+    out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);              \
+    out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);              \
+    out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);              \
+    out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);              \
+  }
+#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
+
+/* Description : Transpose 4x4 block with word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed word
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                                    \
+    v4i32 s0_m, s1_m, s2_m, s3_m;                                      \
+                                                                       \
+    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                 \
+    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                 \
+                                                                       \
+    out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);              \
+    out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);              \
+    out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);              \
+    out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);              \
+  }
+
+/* Description : Add block 4x4
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Least significant 4 bytes from each input vector are added to
+                 the destination bytes, clipped between 0-255 and stored.
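+                 Editorial scalar sketch of the same operation ('res' stands
+                 for the 16 residual values carried in in0..in3; not part of
+                 the upstream source):
+
+                   for (int r = 0; r < 4; ++r)
+                     for (int c = 0; c < 4; ++c) {
+                       int v = pdst[r * stride + c] + res[r][c];
+                       pdst[r * stride + c] = v < 0 ? 0 : (v > 255 ? 255 : v);
+                     }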
+*/ +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ + { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + v16i8 zero_m = { 0 }; \ + \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ + } + +/* Description : Pack even elements of input vectors & xor with 128 + Arguments : Inputs - in0, in1 + Output - out_m + Return Type - unsigned byte + Details : Signed byte even elements from 'in0' and 'in1' are packed + together in one vector and the resulting vector is xor'ed with + 128 to shift the range from signed to unsigned byte +*/ +#define PCKEV_XORI128_UB(in0, in1) \ + ({ \ + v16u8 out_m; \ + \ + out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ + out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ + out_m; \ + }) + +/* Description : Converts inputs to unsigned bytes, interleave, average & store + as 8x4 unsigned byte block + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, + pdst, stride +*/ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ + pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ + } + +/* Description : Pack even byte elements and store byte vector in destination + memory + Arguments : Inputs - in0, in1, pdst +*/ +#define PCKEV_ST_SB(in0, in1, pdst) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ + ST_SB(tmp_m, (pdst)); \ + } + +/* Description : Horizontal 2 tap filter kernel code + Arguments : Inputs - in0, in1, mask, coeff, shift +*/ +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ + ({ \ + v16i8 tmp0_m; \ + v8u16 tmp1_m; \ + \ + tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ + tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ + tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ + \ + tmp1_m; \ + }) +#endif // AOM_AOM_DSP_MIPS_MACROS_MSA_H_ diff --git a/libs/libaom/src/aom_dsp/mips/sad_msa.c b/libs/libaom/src/aom_dsp/mips/sad_msa.c new file mode 100644 index 000000000..58cdd80d9 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/sad_msa.c @@ -0,0 +1,800 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/macros_msa.h" + +#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ + } +#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__) + +static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + diff = __msa_asub_u_b(src, ref); + sad += __msa_hadd_u_h(diff, diff); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t sad = 0; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0 
= { 0 }; + v8u16 sad1 = { 0 }; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = HADD_UH_U32(sad0); + sad += HADD_UH_U32(sad1); + + return sad; +} + +static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, src); + src_ptr += (4 * src_stride); + + LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref0_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad0 += __msa_hadd_u_h(diff, diff); + + LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref1_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref2_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref3_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref0_ptr += (4 * ref_stride); + LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7); + ref1_ptr += (4 * ref_stride); + LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11); + ref2_ptr += (4 * ref_stride); + LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15); + ref3_ptr += (4 * ref_stride); + + PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + sad0 += SAD_UB2_UH(src0, src1, 
ref0, ref1); + + PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1); + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + v16u8 src, ref0, ref1, ref2, ref3, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref0 = LD_UB(ref0_ptr); + ref0_ptr += ref_stride; + ref1 = LD_UB(ref1_ptr); + ref1_ptr += ref_stride; + ref2 = LD_UB(ref2_ptr); + ref2_ptr += ref_stride; + ref3 = LD_UB(ref3_ptr); + ref3_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref1); + sad1 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref2); + sad2 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref3); + sad3 += __msa_hadd_u_h(diff, diff); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref0 = LD_UB(ref0_ptr); + ref0_ptr += ref_stride; + ref1 = LD_UB(ref1_ptr); + ref1_ptr += ref_stride; + ref2 = LD_UB(ref2_ptr); + ref2_ptr += ref_stride; + ref3 = LD_UB(ref3_ptr); + ref3_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref1); + sad1 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref2); + sad2 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref3); + sad3 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + + LD_UB2(ref0_ptr, 16, ref0, ref1); + ref0_ptr += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref1_ptr, 16, ref0, ref1); + ref1_ptr += ref_stride; + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref2_ptr, 16, ref0, ref1); + ref2_ptr += ref_stride; + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref3_ptr, 16, ref0, ref1); + ref3_ptr += ref_stride; + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t 
ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0_0 = { 0 }; + v8u16 sad0_1 = { 0 }; + v8u16 sad1_0 = { 0 }; + v8u16 sad1_1 = { 0 }; + v8u16 sad2_0 = { 0 }; + v8u16 sad2_1 = { 0 }; + v8u16 sad3_0 = { 0 }; + v8u16 sad3_1 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + + LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3); + ref0_ptr += ref_stride; + sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3); + ref1_ptr += ref_stride; + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3); + ref2_ptr += ref_stride; + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3); + ref3_ptr += ref_stride; + sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad_array[0] = HADD_UH_U32(sad0_0); + sad_array[0] += HADD_UH_U32(sad0_1); + sad_array[1] = HADD_UH_U32(sad1_0); + sad_array[1] += HADD_UH_U32(sad1_1); + sad_array[2] = HADD_UH_U32(sad2_0); + sad_array[2] += HADD_UH_U32(sad2_1); + sad_array[3] = HADD_UH_U32(sad3_0); + sad_array[3] += HADD_UH_U32(sad3_1); +} + +static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff, pred, comp; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + comp = __msa_aver_u_b(pred, ref); + diff = __msa_asub_u_b(src, comp); + sad += __msa_hadd_u_h(diff, diff); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 diff0, diff1, pred0, pred1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1); + sad += SAD_UB2_UH(src0, src1, diff0, diff1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3, comp0, comp1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 
3); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += (4 * 16); + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += (4 * 16); + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 comp0, comp1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6); + LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7); + ref += (4 * ref_stride); + + LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6); + LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7); + sec_pred += (4 * 32); + + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1); + sad += SAD_UB2_UH(src4, src5, comp0, comp1); + AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1); + sad += SAD_UB2_UH(src6, src7, comp0, comp1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 comp0, comp1, comp2, comp3; + v16u8 pred0, pred1, pred2, pred3; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v4u32 sad; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += 
ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + } + + sad = __msa_hadd_u_w(sad0, sad0); + sad += __msa_hadd_u_w(sad1, sad1); + + return HADD_SW_S32(sad); +} + +#define AOM_SAD_4xHEIGHT_MSA(height) \ + uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_8xHEIGHT_MSA(height) \ + uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_16xHEIGHT_MSA(height) \ + uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_32xHEIGHT_MSA(height) \ + uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_64xHEIGHT_MSA(height) \ + uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_4xHEIGHTx4D_MSA(height) \ + void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_8xHEIGHTx4D_MSA(height) \ + void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_16xHEIGHTx4D_MSA(height) \ + void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_32xHEIGHTx4D_MSA(height) \ + void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_64xHEIGHTx4D_MSA(height) \ + void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_AVGSAD_4xHEIGHT_MSA(height) \ + uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { 
\ + return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_8xHEIGHT_MSA(height) \ + uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { \ + return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_16xHEIGHT_MSA(height) \ + uint32_t aom_sad16x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_32xHEIGHT_MSA(height) \ + uint32_t aom_sad32x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_64xHEIGHT_MSA(height) \ + uint32_t aom_sad64x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +/* clang-format off */ +// 64x64 +AOM_SAD_64xHEIGHT_MSA(64) +AOM_SAD_64xHEIGHTx4D_MSA(64) +AOM_AVGSAD_64xHEIGHT_MSA(64) + +// 64x32 +AOM_SAD_64xHEIGHT_MSA(32) +AOM_SAD_64xHEIGHTx4D_MSA(32) +AOM_AVGSAD_64xHEIGHT_MSA(32) + +// 32x64 +AOM_SAD_32xHEIGHT_MSA(64) +AOM_SAD_32xHEIGHTx4D_MSA(64) +AOM_AVGSAD_32xHEIGHT_MSA(64) + +// 32x32 +AOM_SAD_32xHEIGHT_MSA(32) +AOM_SAD_32xHEIGHTx4D_MSA(32) +AOM_AVGSAD_32xHEIGHT_MSA(32) + +// 32x16 +AOM_SAD_32xHEIGHT_MSA(16) +AOM_SAD_32xHEIGHTx4D_MSA(16) +AOM_AVGSAD_32xHEIGHT_MSA(16) + +// 16x32 +AOM_SAD_16xHEIGHT_MSA(32) +AOM_SAD_16xHEIGHTx4D_MSA(32) +AOM_AVGSAD_16xHEIGHT_MSA(32) + +// 16x16 +AOM_SAD_16xHEIGHT_MSA(16) +AOM_SAD_16xHEIGHTx4D_MSA(16) +AOM_AVGSAD_16xHEIGHT_MSA(16) + +// 16x8 +AOM_SAD_16xHEIGHT_MSA(8) +AOM_SAD_16xHEIGHTx4D_MSA(8) +AOM_AVGSAD_16xHEIGHT_MSA(8) + +// 8x16 +AOM_SAD_8xHEIGHT_MSA(16) +AOM_SAD_8xHEIGHTx4D_MSA(16) +AOM_AVGSAD_8xHEIGHT_MSA(16) + +// 8x8 +AOM_SAD_8xHEIGHT_MSA(8) +AOM_SAD_8xHEIGHTx4D_MSA(8) +AOM_AVGSAD_8xHEIGHT_MSA(8) + +// 8x4 +AOM_SAD_8xHEIGHT_MSA(4) +AOM_SAD_8xHEIGHTx4D_MSA(4) +AOM_AVGSAD_8xHEIGHT_MSA(4) + +// 4x8 +AOM_SAD_4xHEIGHT_MSA(8) +AOM_SAD_4xHEIGHTx4D_MSA(8) +AOM_AVGSAD_4xHEIGHT_MSA(8) + +// 4x4 +AOM_SAD_4xHEIGHT_MSA(4) +AOM_SAD_4xHEIGHTx4D_MSA(4) +AOM_AVGSAD_4xHEIGHT_MSA(4) + /* clang-format on */ diff --git a/libs/libaom/src/aom_dsp/mips/sub_pixel_variance_msa.c b/libs/libaom/src/aom_dsp/mips/sub_pixel_variance_msa.c new file mode 100644 index 000000000..810b6efaa --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/sub_pixel_variance_msa.c @@ -0,0 +1,1792 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/variance.h" + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } + +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + sse - (((int64_t)diff * diff) >> shift) + +static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t height, + int32_t *diff) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 pred, src = { 0 }; + v16u8 ref = { 0 }; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t height, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src, ref, pred; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred 
+= 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1, pred0, pred1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1, pred0, pred1; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + 
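+    /* Each CALC_MSE_AVG_B pass accumulates squared byte differences into the
+     * SSE accumulator var and signed differences into avg0/avg1; the caller
+     * reduces these to sse and sum, and variance follows as sse - sum*sum/N. */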
CALC_MSE_AVG_B(src1, ref1, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v8i16 avg2 = { 0 }; + v8i16 avg3 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 32; ht_cnt--;) { + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + vec += __msa_hadd_s_w(avg2, avg2); + vec += __msa_hadd_s_w(avg3, avg3); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 filt0, 
ref = { 0 }; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); + src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); + CALC_MSE_AVG_B(src0, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 filt0, out, ref0, ref1, ref2, ref3; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 dst0, dst1, dst2, dst3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + 
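+    /* VSHF_B2_UH gathers adjacent source bytes into pairs per mask so that
+     * the DOTP_UB4_UH calls below apply the two-tap bilinear filter filt0;
+     * SRARI_H4_UH then rounds the 16-bit products down by FILTER_BITS. */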
VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, dst0, var, avg); + CALC_MSE_AVG_B(src1, dst1, var, avg); + CALC_MSE_AVG_B(src2, dst2, var, avg); + CALC_MSE_AVG_B(src3, dst3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4, out; + v16u8 src10_r, src32_r, src21_r, src43_r; + v16u8 ref = { 0 }; + v16u8 src2110, src4332; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + v8u16 tmp0, tmp1; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 ref0, ref1, ref2, ref3; + v8u16 vec0, vec1, vec2, vec3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = 
(v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1, out2, out3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + src0 = src4; + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, 
dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out, ref = { 0 }; + v16u8 filt_vt, filt_hz, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4; + v8u16 tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt_vt, filt_hz, vec0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, 
filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3; + v8u16 tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + LD_UB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + CALC_MSE_AVG_B(src2, ref2, var, avg); + CALC_MSE_AVG_B(src3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t 
height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 out, pred, filt0, ref = { 0 }; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); + out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 out, pred, filt0; + v16u8 ref0, ref1, ref2, ref3; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, 
vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); + + pred = LD_UB(sec_pred); + sec_pred += 16; + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); + pred = LD_UB(sec_pred); + sec_pred += 16; + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 dst0, dst1, dst2, dst3; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 pred0, pred1, pred2, pred3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst += (4 * dst_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1, + tmp2, tmp3); + AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1, + tmp2, tmp3); + + CALC_MSE_AVG_B(tmp0, dst0, var, avg); + CALC_MSE_AVG_B(tmp1, dst1, var, avg); + CALC_MSE_AVG_B(tmp2, dst2, var, avg); + CALC_MSE_AVG_B(tmp3, dst3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, + sec_pred, filter, height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const 
uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 src10_r, src32_r, src21_r, src43_r; + v16u8 out, pred, ref = { 0 }; + v16u8 src2110, src4332, filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + v8u16 tmp0, tmp1; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, filt0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) 
{ + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1, out2, out3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + src0 = src4; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, + sec_pred, filter, height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t 
*filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 out, pred, ref = { 0 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 pred0, pred1, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, 
(v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); + AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v16u8 out0, out1, out2, out3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + LD_UB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + 
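All of the width-specific helpers above share one contract: the return value is the block's sum of squared differences (SSE), and the signed sum of pixel differences is stored through *diff; the VARIANCE_*Wx*H macros then combine the two as sse - sum*sum/N (the *_avg_* variants first average the source against second_pred). A minimal scalar sketch of that contract, for reference only — scalar_sse_diff is a hypothetical name, not a function in this patch:

#include <stdint.h>

/* Scalar model of the sse/diff contract implemented by the MSA helpers. */
static uint32_t scalar_sse_diff(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t width, int32_t height, int32_t *diff) {
  uint32_t sse = 0;
  int32_t sum = 0;
  int32_t i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int32_t d = src[j] - ref[j]; /* signed pixel difference */
      sum += d;                          /* accumulated into *diff */
      sse += (uint32_t)(d * d);          /* accumulated into the return value */
    }
    src += src_stride;
    ref += ref_stride;
  }
  *diff = sum; /* VARIANCE_WxH(sse, sum, shift) == sse - (sum * sum >> shift) */
  return sse;
}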
+static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); +#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); + +#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_sub_pixel_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_msa( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ 
+ \ + return var; \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64) +/* clang-format on */ + +#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, ht, &diff); \ + } \ + } \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32) +/* clang-format on */ + +uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + int32_t xoffset, int32_t yoffset, + const uint8_t *ref_ptr, + int32_t ref_stride, uint32_t *sse, + const uint8_t *sec_pred) { + int32_t diff; + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; + + if (yoffset) { + if (xoffset) { + *sse = sub_pixel_avg_sse_diff_32width_hv_msa( + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, + v_filter, 64, &diff); + } else { + *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + v_filter, 64, &diff); + } + } else { + if (xoffset) { + *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + h_filter, 64, &diff); + } else { + *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride, + sec_pred, &diff); + } + } + + return VARIANCE_32Wx64H(*sse, diff); +} + +#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ + uint32_t aom_sub_pixel_avg_variance64x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t 
*h_filter = bilinear_filters_2t[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32) +AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64) +/* clang-format on */ diff --git a/libs/libaom/src/aom_dsp/mips/subtract_msa.c b/libs/libaom/src/aom_dsp/mips/subtract_msa.c new file mode 100644 index 000000000..bfed773ac --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/subtract_msa.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/macros_msa.h" + +static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t src0, src1, src2, src3; + uint32_t pred0, pred1, pred2, pred3; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(pred0, pred1, pred2, pred3, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride)); +} + +static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t loop_cnt; + uint64_t src0, src1, pred0, pred1; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 4; loop_cnt--;) { + LD2(src_ptr, src_stride, src0, src1); + src_ptr += (2 * src_stride); + LD2(pred_ptr, pred_stride, pred0, pred1); + pred_ptr += (2 * pred_stride); + + INSERT_D2_SB(src0, src1, src); + INSERT_D2_SB(pred0, pred1, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff_ptr, diff_stride); + diff_ptr += (2 * diff_stride); + } +} + +static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + int8_t count; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (count = 2; 
count--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6, + pred7); + pred += (8 * pred_stride); + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + } +} + +static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 8; loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src += src_stride; + LD_SB2(src, 16, src2, src3); + src += src_stride; + LD_SB2(src, 16, src4, src5); + src += src_stride; + LD_SB2(src, 16, src6, src7); + src += src_stride; + + LD_SB2(pred, 16, pred0, pred1); + pred += pred_stride; + LD_SB2(pred, 16, pred2, pred3); + pred += pred_stride; + LD_SB2(pred, 16, pred4, pred5); + pred += pred_stride; + LD_SB2(pred, 16, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + } +} + +static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, 
src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 32; loop_cnt--;) { + LD_SB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_SB4(src, 16, src4, src5, src6, src7); + src += src_stride; + + LD_SB4(pred, 16, pred0, pred1, pred2, pred3); + pred += pred_stride; + LD_SB4(pred, 16, pred4, pred5, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + } +} + +void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + if (rows == cols) { + switch (rows) { + case 4: + sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 8: + sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 16: + sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 32: + sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 64: + sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + default: + aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } + } else { + aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + } +} diff --git a/libs/libaom/src/aom_dsp/mips/variance_msa.c b/libs/libaom/src/aom_dsp/mips/variance_msa.c new file mode 100644 index 000000000..065c09ac5 --- /dev/null +++ b/libs/libaom/src/aom_dsp/mips/variance_msa.c @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/mips/macros_msa.h" + +#define CALC_MSE_B(src, ref, var) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + } + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } + +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + sse - (((int64_t)diff * diff) >> shift) + +static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + int32_t ht_cnt; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src, ref; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t 
ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + 
v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v8i16 avg2 = { 0 }; + v8i16 avg3 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 32; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + vec += __msa_hadd_s_w(avg2, avg2); + vec += __msa_hadd_s_w(avg3, avg3); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t get_mb_ss_msa(const int16_t *src) { + uint32_t sum, cnt; + v8i16 src0, src1, src2, src3; + v4i32 src0_l, src1_l, src2_l, src3_l; + v4i32 src0_r, src1_r, src2_r, src3_r; + v2i64 sq_src_l = { 0 }; + v2i64 sq_src_r = { 0 }; + + for (cnt = 8; cnt--;) { + LD_SH4(src, 8, src0, src1, src2, src3); + src += 4 * 8; + + UNPCK_SH_SW(src0, src0_l, src0_r); + UNPCK_SH_SW(src1, src1_l, src1_r); + UNPCK_SH_SW(src2, src2_l, src2_r); + UNPCK_SH_SW(src3, src3_l, src3_r); + + DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r); + } + + sq_src_l += __msa_splati_d(sq_src_l, 1); + sq_src_r += __msa_splati_d(sq_src_r, 1); + + sum = __msa_copy_s_d(sq_src_l, 0); + sum += __msa_copy_s_d(sq_src_r, 0); + + return sum; +} + +static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_B(src, ref, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src, ref; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); 
+ ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = height >> 1; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src2, ref2, var); + CALC_MSE_B(src1, ref1, var); + CALC_MSE_B(src3, ref3, var); + + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src2, ref2, var); + CALC_MSE_B(src1, ref1, var); + CALC_MSE_B(src3, ref3, var); + } + + return HADD_SW_S32(var); +} + +uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride) { + uint32_t err = 0; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16i8 src = { 0 }; + v16i8 ref = { 0 }; + v16u8 src_vec0, src_vec1; + v8i16 diff0, diff1; + v4i32 err0 = { 0 }; + v4i32 err1 = { 0 }; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(ref0, ref1, ref2, ref3, ref); + ILVRL_B2_UB(src, ref, src_vec0, src_vec1); + HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1); + DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1); + err = HADD_SW_S32(err0); + err += HADD_SW_S32(err1); + + return err; +} + +#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); +#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define 
VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); + +#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define AOM_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_VARIANCE_WDXHT_MSA(4, 4) +AOM_VARIANCE_WDXHT_MSA(4, 8) + +AOM_VARIANCE_WDXHT_MSA(8, 4) +AOM_VARIANCE_WDXHT_MSA(8, 8) +AOM_VARIANCE_WDXHT_MSA(8, 16) + +AOM_VARIANCE_WDXHT_MSA(16, 8) +AOM_VARIANCE_WDXHT_MSA(16, 16) +AOM_VARIANCE_WDXHT_MSA(16, 32) + +AOM_VARIANCE_WDXHT_MSA(32, 16) +AOM_VARIANCE_WDXHT_MSA(32, 32) +/* clang-format on */ + +uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_32Wx64H(*sse, diff); +} + +uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx32H(*sse, diff); +} + +uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx64H(*sse, diff); +} + +uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse) { + *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8); + + return *sse; +} + +uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8); + + return *sse; +} + +uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum); +} + +void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum); +} + +uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); } diff --git a/libs/libaom/src/aom_dsp/noise_model.c b/libs/libaom/src/aom_dsp/noise_model.c new file mode 100644 index 000000000..c7a0003a8 --- /dev/null +++ 
b/libs/libaom/src/aom_dsp/noise_model.c
@@ -0,0 +1,1654 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
+#include "av1/encoder/mathutils.h"
+
+#define kLowPolyNumParams 3
+
+static const int kMaxLag = 4;
+
+// Defines a function that can be used to obtain the mean of a block for the
+// provided data type (uint8_t, or uint16_t)
+#define GET_BLOCK_MEAN(INT_TYPE, suffix)                                    \
+  static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h, \
+                                        int stride, int x_o, int y_o,       \
+                                        int block_size) {                   \
+    const int max_h = AOMMIN(h - y_o, block_size);                          \
+    const int max_w = AOMMIN(w - x_o, block_size);                          \
+    double block_mean = 0;                                                  \
+    for (int y = 0; y < max_h; ++y) {                                       \
+      for (int x = 0; x < max_w; ++x) {                                     \
+        block_mean += data[(y_o + y) * stride + x_o + x];                   \
+      }                                                                     \
+    }                                                                       \
+    return block_mean / (max_w * max_h);                                    \
+  }
+
+GET_BLOCK_MEAN(uint8_t, lowbd);
+GET_BLOCK_MEAN(uint16_t, highbd);
+
+static INLINE double get_block_mean(const uint8_t *data, int w, int h,
+                                    int stride, int x_o, int y_o,
+                                    int block_size, int use_highbd) {
+  if (use_highbd)
+    return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o,
+                                 y_o, block_size);
+  return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size);
+}
+
+// Defines a function that can be used to obtain the variance of a block
+// for the provided data type (uint8_t, or uint16_t)
+#define GET_NOISE_VAR(INT_TYPE, suffix)                                  \
+  static double get_noise_var_##suffix(                                 \
+      const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w, \
+      int h, int x_o, int y_o, int block_size_x, int block_size_y) {     \
+    const int max_h = AOMMIN(h - y_o, block_size_y);                     \
+    const int max_w = AOMMIN(w - x_o, block_size_x);                     \
+    double noise_var = 0;                                                \
+    double noise_mean = 0;                                               \
+    for (int y = 0; y < max_h; ++y) {                                    \
+      for (int x = 0; x < max_w; ++x) {                                  \
+        double noise = (double)data[(y_o + y) * stride + x_o + x] -      \
+                       denoised[(y_o + y) * stride + x_o + x];           \
+        noise_mean += noise;                                             \
+        noise_var += noise * noise;                                      \
+      }                                                                  \
+    }                                                                    \
+    noise_mean /= (max_w * max_h);                                       \
+    return noise_var / (max_w * max_h) - noise_mean * noise_mean;        \
+  }
+
+GET_NOISE_VAR(uint8_t, lowbd);
+GET_NOISE_VAR(uint16_t, highbd);
+
+static INLINE double get_noise_var(const uint8_t *data,
+                                   const uint8_t *denoised, int stride, int w,
+                                   int h, int x_o, int y_o, int block_size_x,
+                                   int block_size_y, int use_highbd) {
+  if (use_highbd)
+    return get_noise_var_highbd((const uint16_t *)data,
+                                (const uint16_t *)denoised, stride, w, h, x_o,
+                                y_o, block_size_x, block_size_y);
+  return get_noise_var_lowbd(data, denoised, stride, w, h, x_o, y_o,
+                             block_size_x, block_size_y);
+}
+
+static void equation_system_clear(aom_equation_system_t *eqns) {
+  const int n = eqns->n;
+  memset(eqns->A, 0, sizeof(*eqns->A) * n * n);
+  memset(eqns->x, 0, sizeof(*eqns->x) * n);
+  memset(eqns->b, 0, sizeof(*eqns->b) * n);
+}
+
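+// The aom_equation_system_t helpers here maintain the normal equations of a
+// linear least-squares fit: for observations y ~= sum_i c_i * x_i, every
+// sample contributes A += x * x^T and b += y * x, and the coefficients come
+// from solving A * c = b. A minimal usage sketch for n = 2 (illustrative
+// only; the sample loop is pseudocode, not part of this file):
+//
+//   aom_equation_system_t eqns;
+//   equation_system_init(&eqns, 2);  // fit y ~= c0 * x0 + c1 * x1
+//   for each sample (x0, x1, y) {
+//     eqns.A[0] += x0 * x0;  eqns.A[1] += x0 * x1;
+//     eqns.A[2] += x1 * x0;  eqns.A[3] += x1 * x1;
+//     eqns.b[0] += x0 * y;   eqns.b[1] += x1 * y;
+//   }
+//   equation_system_solve(&eqns);    // coefficients land in eqns.x
+//   equation_system_free(&eqns);
+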
+static void equation_system_copy(aom_equation_system_t *dst,
+                                 const aom_equation_system_t *src) {
+  const int n = dst->n;
+  memcpy(dst->A, src->A, sizeof(*dst->A) * n * n);
+  memcpy(dst->x, src->x, sizeof(*dst->x) * n);
+  memcpy(dst->b, src->b, sizeof(*dst->b) * n);
+}
+
+static int equation_system_init(aom_equation_system_t *eqns, int n) {
+  eqns->A = (double *)aom_malloc(sizeof(*eqns->A) * n * n);
+  eqns->b = (double *)aom_malloc(sizeof(*eqns->b) * n);
+  eqns->x = (double *)aom_malloc(sizeof(*eqns->x) * n);
+  eqns->n = n;
+  if (!eqns->A || !eqns->b || !eqns->x) {
+    fprintf(stderr, "Failed to allocate system of equations of size %d\n", n);
+    aom_free(eqns->A);
+    aom_free(eqns->b);
+    aom_free(eqns->x);
+    memset(eqns, 0, sizeof(*eqns));
+    return 0;
+  }
+  equation_system_clear(eqns);
+  return 1;
+}
+
+static int equation_system_solve(aom_equation_system_t *eqns) {
+  const int n = eqns->n;
+  double *b = (double *)aom_malloc(sizeof(*b) * n);
+  double *A = (double *)aom_malloc(sizeof(*A) * n * n);
+  int ret = 0;
+  if (A == NULL || b == NULL) {
+    fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", n, n);
+    aom_free(b);
+    aom_free(A);
+    return 0;
+  }
+  memcpy(A, eqns->A, sizeof(*eqns->A) * n * n);
+  memcpy(b, eqns->b, sizeof(*eqns->b) * n);
+  ret = linsolve(n, A, eqns->n, b, eqns->x);
+  aom_free(b);
+  aom_free(A);
+
+  if (ret == 0) {
+    return 0;
+  }
+  return 1;
+}
+
+static void equation_system_add(aom_equation_system_t *dest,
+                                aom_equation_system_t *src) {
+  const int n = dest->n;
+  int i, j;
+  for (i = 0; i < n; ++i) {
+    for (j = 0; j < n; ++j) {
+      dest->A[i * n + j] += src->A[i * n + j];
+    }
+    dest->b[i] += src->b[i];
+  }
+}
+
+static void equation_system_free(aom_equation_system_t *eqns) {
+  if (!eqns) return;
+  aom_free(eqns->A);
+  aom_free(eqns->b);
+  aom_free(eqns->x);
+  memset(eqns, 0, sizeof(*eqns));
+}
+
+static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) {
+  equation_system_clear(&solver->eqns);
+  solver->num_equations = 0;
+  solver->total = 0;
+}
+
+static void noise_strength_solver_add(aom_noise_strength_solver_t *dest,
+                                      aom_noise_strength_solver_t *src) {
+  equation_system_add(&dest->eqns, &src->eqns);
+  dest->num_equations += src->num_equations;
+  dest->total += src->total;
+}
+
+// Return the number of coefficients required for the given parameters
+static int num_coeffs(const aom_noise_model_params_t params) {
+  const int n = 2 * params.lag + 1;
+  switch (params.shape) {
+    case AOM_NOISE_SHAPE_DIAMOND: return params.lag * (params.lag + 1);
+    case AOM_NOISE_SHAPE_SQUARE: return (n * n) / 2;
+  }
+  return 0;
+}
+
+static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) {
+  const int kNumBins = 20;
+  if (!equation_system_init(&state->eqns, n)) {
+    fprintf(stderr, "Failed to initialize noise state with size %d\n", n);
+    return 0;
+  }
+  state->ar_gain = 1.0;
+  state->num_observations = 0;
+  return aom_noise_strength_solver_init(&state->strength_solver, kNumBins,
+                                        bit_depth);
+}
+
+static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) {
+  const double kTolerance = 1e-6;
+  const int last = eqns->n - 1;
+  // Set all of the AR coefficients to zero, but try to solve for correlation
+  // with the luma channel
+  memset(eqns->x, 0, sizeof(*eqns->x) * eqns->n);
+  if (fabs(eqns->A[last * eqns->n + last]) > kTolerance) {
+    eqns->x[last] = eqns->b[last] / eqns->A[last * eqns->n + last];
+  }
+}
+
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
+  if (!lut) return 0;
+
lut->num_points = 0; + lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points)); + if (!lut->points) return 0; + lut->num_points = num_points; + memset(lut->points, 0, sizeof(*lut->points) * num_points); + return 1; +} + +void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) { + if (!lut) return; + aom_free(lut->points); + memset(lut, 0, sizeof(*lut)); +} + +double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut, + double x) { + int i = 0; + // Constant extrapolation for x < x_0. + if (x < lut->points[0][0]) return lut->points[0][1]; + for (i = 0; i < lut->num_points - 1; ++i) { + if (x >= lut->points[i][0] && x <= lut->points[i + 1][0]) { + const double a = + (x - lut->points[i][0]) / (lut->points[i + 1][0] - lut->points[i][0]); + return lut->points[i + 1][1] * a + lut->points[i][1] * (1.0 - a); + } + } + // Constant extrapolation for x > x_{n-1} + return lut->points[lut->num_points - 1][1]; +} + +static double noise_strength_solver_get_bin_index( + const aom_noise_strength_solver_t *solver, double value) { + const double val = + fclamp(value, solver->min_intensity, solver->max_intensity); + const double range = solver->max_intensity - solver->min_intensity; + return (solver->num_bins - 1) * (val - solver->min_intensity) / range; +} + +static double noise_strength_solver_get_value( + const aom_noise_strength_solver_t *solver, double x) { + const double bin = noise_strength_solver_get_bin_index(solver, x); + const int bin_i0 = (int)floor(bin); + const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1); + const double a = bin - bin_i0; + return (1.0 - a) * solver->eqns.x[bin_i0] + a * solver->eqns.x[bin_i1]; +} + +void aom_noise_strength_solver_add_measurement( + aom_noise_strength_solver_t *solver, double block_mean, double noise_std) { + const double bin = noise_strength_solver_get_bin_index(solver, block_mean); + const int bin_i0 = (int)floor(bin); + const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1); + const double a = bin - bin_i0; + const int n = solver->num_bins; + solver->eqns.A[bin_i0 * n + bin_i0] += (1.0 - a) * (1.0 - a); + solver->eqns.A[bin_i1 * n + bin_i0] += a * (1.0 - a); + solver->eqns.A[bin_i1 * n + bin_i1] += a * a; + solver->eqns.A[bin_i0 * n + bin_i1] += a * (1.0 - a); + solver->eqns.b[bin_i0] += (1.0 - a) * noise_std; + solver->eqns.b[bin_i1] += a * noise_std; + solver->total += noise_std; + solver->num_equations++; +} + +int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) { + // Add regularization proportional to the number of constraints + const int n = solver->num_bins; + const double kAlpha = 2.0 * (double)(solver->num_equations) / n; + int result = 0; + double mean = 0; + + // Do this in a non-destructive manner so it is not confusing to the caller + double *old_A = solver->eqns.A; + double *A = (double *)aom_malloc(sizeof(*A) * n * n); + if (!A) { + fprintf(stderr, "Unable to allocate copy of A\n"); + return 0; + } + memcpy(A, old_A, sizeof(*A) * n * n); + + for (int i = 0; i < n; ++i) { + const int i_lo = AOMMAX(0, i - 1); + const int i_hi = AOMMIN(n - 1, i + 1); + A[i * n + i_lo] -= kAlpha; + A[i * n + i] += 2 * kAlpha; + A[i * n + i_hi] -= kAlpha; + } + + // Small regularization to give average noise strength + mean = solver->total / solver->num_equations; + for (int i = 0; i < n; ++i) { + A[i * n + i] += 1.0 / 8192.; + solver->eqns.b[i] += mean / 8192.; + } + solver->eqns.A = A; + result = equation_system_solve(&solver->eqns); + solver->eqns.A = old_A; + + aom_free(A); + return 
result;
+}
+
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+                                   int num_bins, int bit_depth) {
+  if (!solver) return 0;
+  memset(solver, 0, sizeof(*solver));
+  solver->num_bins = num_bins;
+  solver->min_intensity = 0;
+  solver->max_intensity = (1 << bit_depth) - 1;
+  solver->total = 0;
+  solver->num_equations = 0;
+  return equation_system_init(&solver->eqns, num_bins);
+}
+
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) {
+  if (!solver) return;
+  equation_system_free(&solver->eqns);
+}
+
+double aom_noise_strength_solver_get_center(
+    const aom_noise_strength_solver_t *solver, int i) {
+  const double range = solver->max_intensity - solver->min_intensity;
+  const int n = solver->num_bins;
+  return ((double)i) / (n - 1) * range + solver->min_intensity;
+}
+
+// Computes the residual if a point were to be removed from the lut. This is
+// calculated as the area between the output of the solver and the line segment
+// that would be formed between [x_{i - 1}, x_{i + 1}).
+static void update_piecewise_linear_residual(
+    const aom_noise_strength_solver_t *solver,
+    const aom_noise_strength_lut_t *lut, double *residual, int start, int end) {
+  const double dx = 255. / solver->num_bins;
+  for (int i = AOMMAX(start, 1); i < AOMMIN(end, lut->num_points - 1); ++i) {
+    const int lower = AOMMAX(0, (int)floor(noise_strength_solver_get_bin_index(
+                                    solver, lut->points[i - 1][0])));
+    const int upper = AOMMIN(solver->num_bins - 1,
+                             (int)ceil(noise_strength_solver_get_bin_index(
+                                 solver, lut->points[i + 1][0])));
+    double r = 0;
+    for (int j = lower; j <= upper; ++j) {
+      const double x = aom_noise_strength_solver_get_center(solver, j);
+      if (x < lut->points[i - 1][0]) continue;
+      if (x >= lut->points[i + 1][0]) continue;
+      const double y = solver->eqns.x[j];
+      const double a = (x - lut->points[i - 1][0]) /
+                       (lut->points[i + 1][0] - lut->points[i - 1][0]);
+      const double estimate_y =
+          lut->points[i - 1][1] * (1.0 - a) + lut->points[i + 1][1] * a;
+      r += fabs(y - estimate_y);
+    }
+    residual[i] = r * dx;
+  }
+}
+
+int aom_noise_strength_solver_fit_piecewise(
+    const aom_noise_strength_solver_t *solver, int max_output_points,
+    aom_noise_strength_lut_t *lut) {
+  // The tolerance is normalized to give consistent results between
+  // different bit-depths.
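+  // For example, at 8 bits max_intensity is 255 and the tolerance below works
+  // out to 0.00625; at 10 bits the absolute tolerance is about 4x larger, so
+  // the fit behaves the same relative to the wider intensity range.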
+ const double kTolerance = solver->max_intensity * 0.00625 / 255.0; + if (!aom_noise_strength_lut_init(lut, solver->num_bins)) { + fprintf(stderr, "Failed to init lut\n"); + return 0; + } + for (int i = 0; i < solver->num_bins; ++i) { + lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i); + lut->points[i][1] = solver->eqns.x[i]; + } + if (max_output_points < 0) { + max_output_points = solver->num_bins; + } + + double *residual = aom_malloc(solver->num_bins * sizeof(*residual)); + memset(residual, 0, sizeof(*residual) * solver->num_bins); + + update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins); + + // Greedily remove points if there are too many or if it doesn't hurt local + // approximation (never remove the end points) + while (lut->num_points > 2) { + int min_index = 1; + for (int j = 1; j < lut->num_points - 1; ++j) { + if (residual[j] < residual[min_index]) { + min_index = j; + } + } + const double dx = + lut->points[min_index + 1][0] - lut->points[min_index - 1][0]; + const double avg_residual = residual[min_index] / dx; + if (lut->num_points <= max_output_points && avg_residual > kTolerance) { + break; + } + + const int num_remaining = lut->num_points - min_index - 1; + memmove(lut->points + min_index, lut->points + min_index + 1, + sizeof(lut->points[0]) * num_remaining); + lut->num_points--; + + update_piecewise_linear_residual(solver, lut, residual, min_index - 1, + min_index + 1); + } + aom_free(residual); + return 1; +} + +int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder, + int block_size, int bit_depth, int use_highbd) { + const int n = block_size * block_size; + aom_equation_system_t eqns; + double *AtA_inv = 0; + double *A = 0; + int x = 0, y = 0, i = 0, j = 0; + block_finder->A = NULL; + block_finder->AtA_inv = NULL; + + if (!equation_system_init(&eqns, kLowPolyNumParams)) { + fprintf(stderr, "Failed to init equation system for block_size=%d\n", + block_size); + return 0; + } + + AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams * + sizeof(*AtA_inv)); + A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A)); + if (AtA_inv == NULL || A == NULL) { + fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n", + block_size); + aom_free(AtA_inv); + aom_free(A); + equation_system_free(&eqns); + return 0; + } + + block_finder->A = A; + block_finder->AtA_inv = AtA_inv; + block_finder->block_size = block_size; + block_finder->normalization = (1 << bit_depth) - 1; + block_finder->use_highbd = use_highbd; + + for (y = 0; y < block_size; ++y) { + const double yd = ((double)y - block_size / 2.) / (block_size / 2.); + for (x = 0; x < block_size; ++x) { + const double xd = ((double)x - block_size / 2.) / (block_size / 2.); + const double coords[3] = { yd, xd, 1 }; + const int row = y * block_size + x; + A[kLowPolyNumParams * row + 0] = yd; + A[kLowPolyNumParams * row + 1] = xd; + A[kLowPolyNumParams * row + 2] = 1; + + for (i = 0; i < kLowPolyNumParams; ++i) { + for (j = 0; j < kLowPolyNumParams; ++j) { + eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j]; + } + } + } + } + + // Lazy inverse using existing equation solver. 
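+  // Solving eqns.A * x = e_i for each standard basis vector e_i gives the
+  // i-th column of inverse(eqns.A), so the kLowPolyNumParams small solves
+  // below recover all of AtA_inv without a dedicated inversion routine. For
+  // example, the first pass sets eqns.b = {1, 0, 0} and the solution eqns.x
+  // becomes the first column of AtA_inv.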
+  for (i = 0; i < kLowPolyNumParams; ++i) {
+    memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams);
+    eqns.b[i] = 1;
+    equation_system_solve(&eqns);
+
+    for (j = 0; j < kLowPolyNumParams; ++j) {
+      AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j];
+    }
+  }
+  equation_system_free(&eqns);
+  return 1;
+}
+
+void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) {
+  if (!block_finder) return;
+  aom_free(block_finder->A);
+  aom_free(block_finder->AtA_inv);
+  memset(block_finder, 0, sizeof(*block_finder));
+}
+
+void aom_flat_block_finder_extract_block(
+    const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
+    int w, int h, int stride, int offsx, int offsy, double *plane,
+    double *block) {
+  const int block_size = block_finder->block_size;
+  const int n = block_size * block_size;
+  const double *A = block_finder->A;
+  const double *AtA_inv = block_finder->AtA_inv;
+  double plane_coords[kLowPolyNumParams];
+  double AtA_inv_b[kLowPolyNumParams];
+  int xi, yi, i;
+
+  if (block_finder->use_highbd) {
+    const uint16_t *const data16 = (const uint16_t *const)data;
+    for (yi = 0; yi < block_size; ++yi) {
+      const int y = clamp(offsy + yi, 0, h - 1);
+      for (xi = 0; xi < block_size; ++xi) {
+        const int x = clamp(offsx + xi, 0, w - 1);
+        block[yi * block_size + xi] =
+            ((double)data16[y * stride + x]) / block_finder->normalization;
+      }
+    }
+  } else {
+    for (yi = 0; yi < block_size; ++yi) {
+      const int y = clamp(offsy + yi, 0, h - 1);
+      for (xi = 0; xi < block_size; ++xi) {
+        const int x = clamp(offsx + xi, 0, w - 1);
+        block[yi * block_size + xi] =
+            ((double)data[y * stride + x]) / block_finder->normalization;
+      }
+    }
+  }
+  multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams);
+  multiply_mat(AtA_inv, AtA_inv_b, plane_coords, kLowPolyNumParams,
+               kLowPolyNumParams, 1);
+  multiply_mat(A, plane_coords, plane, n, kLowPolyNumParams, 1);
+
+  for (i = 0; i < n; ++i) {
+    block[i] -= plane[i];
+  }
+}
+
+typedef struct {
+  int index;
+  float score;
+} index_and_score_t;
+
+static int compare_scores(const void *a, const void *b) {
+  const float diff =
+      ((index_and_score_t *)a)->score - ((index_and_score_t *)b)->score;
+  if (diff < 0)
+    return -1;
+  else if (diff > 0)
+    return 1;
+  return 0;
+}
+
+int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
+                              const uint8_t *const data, int w, int h,
+                              int stride, uint8_t *flat_blocks) {
+  // The gradient-based features used in this code are based on:
+  // A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise
+  // correlation for improved video denoising," 2012 19th IEEE International
+  // Conference on Image Processing (ICIP).
+  // The thresholds are more lenient to allow for correct grain modeling
+  // in extreme cases.
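+  //
+  // For the 2x2 gradient covariance matrix G = [Gxx, Gxy; Gxy, Gyy] built
+  // below, the eigenvalues follow from the characteristic polynomial:
+  //   e1, e2 = (trace +/- sqrt(trace^2 - 4 * det)) / 2
+  // with trace = Gxx + Gyy and det = Gxx * Gyy - Gxy * Gxy. A flat block has
+  // little gradient energy in every direction, so both eigenvalues are small
+  // and the trace, the spectral norm (e1) and the ratio e1 / e2 can all be
+  // thresholded.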
+ const int block_size = block_finder->block_size; + const int n = block_size * block_size; + const double kTraceThreshold = 0.15 / (32 * 32); + const double kRatioThreshold = 1.25; + const double kNormThreshold = 0.08 / (32 * 32); + const double kVarThreshold = 0.005 / (double)n; + const int num_blocks_w = (w + block_size - 1) / block_size; + const int num_blocks_h = (h + block_size - 1) / block_size; + int num_flat = 0; + int bx = 0, by = 0; + double *plane = (double *)aom_malloc(n * sizeof(*plane)); + double *block = (double *)aom_malloc(n * sizeof(*block)); + index_and_score_t *scores = (index_and_score_t *)aom_malloc( + num_blocks_w * num_blocks_h * sizeof(*scores)); + if (plane == NULL || block == NULL || scores == NULL) { + fprintf(stderr, "Failed to allocate memory for block of size %d\n", n); + aom_free(plane); + aom_free(block); + aom_free(scores); + return -1; + } + +#ifdef NOISE_MODEL_LOG_SCORE + fprintf(stderr, "score = ["); +#endif + for (by = 0; by < num_blocks_h; ++by) { + for (bx = 0; bx < num_blocks_w; ++bx) { + // Compute gradient covariance matrix. + double Gxx = 0, Gxy = 0, Gyy = 0; + double var = 0; + double mean = 0; + int xi, yi; + aom_flat_block_finder_extract_block(block_finder, data, w, h, stride, + bx * block_size, by * block_size, + plane, block); + + for (yi = 1; yi < block_size - 1; ++yi) { + for (xi = 1; xi < block_size - 1; ++xi) { + const double gx = (block[yi * block_size + xi + 1] - + block[yi * block_size + xi - 1]) / + 2; + const double gy = (block[yi * block_size + xi + block_size] - + block[yi * block_size + xi - block_size]) / + 2; + Gxx += gx * gx; + Gxy += gx * gy; + Gyy += gy * gy; + + mean += block[yi * block_size + xi]; + var += block[yi * block_size + xi] * block[yi * block_size + xi]; + } + } + mean /= (block_size - 2) * (block_size - 2); + + // Normalize gradients by block_size. + Gxx /= ((block_size - 2) * (block_size - 2)); + Gxy /= ((block_size - 2) * (block_size - 2)); + Gyy /= ((block_size - 2) * (block_size - 2)); + var = var / ((block_size - 2) * (block_size - 2)) - mean * mean; + + { + const double trace = Gxx + Gyy; + const double det = Gxx * Gyy - Gxy * Gxy; + const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.; + const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.; + const double norm = e1; // Spectral norm + const double ratio = (e1 / AOMMAX(e2, 1e-6)); + const int is_flat = (trace < kTraceThreshold) && + (ratio < kRatioThreshold) && + (norm < kNormThreshold) && (var > kVarThreshold); + // The following weights are used to combine the above features to give + // a sigmoid score for flatness. If the input was normalized to [0,100] + // the magnitude of these values would be close to 1 (e.g., weights + // corresponding to variance would be a factor of 10000x smaller). + // The weights are given in the following order: + // [{var}, {ratio}, {trace}, {norm}, offset] + // with one of the most discriminative being simply the variance. + const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 }; + double sum_weights = weights[0] * var + weights[1] * ratio + + weights[2] * trace + weights[3] * norm + + weights[4]; + // clamp the value to [-25.0, 100.0] to prevent overflow + sum_weights = fclamp(sum_weights, -25.0, 100.0); + const float score = (float)(1.0 / (1 + exp(-sum_weights))); + flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0; + scores[by * num_blocks_w + bx].score = var > kVarThreshold ? 
score : 0;
+        scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx;
+#ifdef NOISE_MODEL_LOG_SCORE
+        fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm,
+                is_flat);
+#endif
+        num_flat += is_flat;
+      }
+    }
+#ifdef NOISE_MODEL_LOG_SCORE
+    fprintf(stderr, "\n");
+#endif
+  }
+#ifdef NOISE_MODEL_LOG_SCORE
+  fprintf(stderr, "];\n");
+#endif
+  // Find the top-scored blocks (most likely to be flat) and set the flat
+  // blocks to be the union of the thresholded results and the top 10th
+  // percentile of the scored results.
+  qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores);
+  const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100;
+  const float score_threshold = scores[top_nth_percentile].score;
+  for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) {
+    if (scores[i].score >= score_threshold) {
+      num_flat += flat_blocks[scores[i].index] == 0;
+      flat_blocks[scores[i].index] |= 1;
+    }
+  }
+  aom_free(block);
+  aom_free(plane);
+  aom_free(scores);
+  return num_flat;
+}
+
+int aom_noise_model_init(aom_noise_model_t *model,
+                         const aom_noise_model_params_t params) {
+  const int n = num_coeffs(params);
+  const int lag = params.lag;
+  const int bit_depth = params.bit_depth;
+  int x = 0, y = 0, i = 0, c = 0;
+
+  memset(model, 0, sizeof(*model));
+  if (params.lag < 1) {
+    fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag);
+    return 0;
+  }
+  if (params.lag > kMaxLag) {
+    fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag,
+            kMaxLag);
+    return 0;
+  }
+
+  memcpy(&model->params, &params, sizeof(params));
+  for (c = 0; c < 3; ++c) {
+    if (!noise_state_init(&model->combined_state[c], n + (c > 0), bit_depth)) {
+      fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
+      aom_noise_model_free(model);
+      return 0;
+    }
+    if (!noise_state_init(&model->latest_state[c], n + (c > 0), bit_depth)) {
+      fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
+      aom_noise_model_free(model);
+      return 0;
+    }
+  }
+  model->n = n;
+  model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n);
+
+  for (y = -lag; y <= 0; ++y) {
+    const int max_x = y == 0 ? -1 : lag;
+    for (x = -lag; x <= max_x; ++x) {
+      switch (params.shape) {
+        case AOM_NOISE_SHAPE_DIAMOND:
+          if (abs(x) <= y + lag) {
+            model->coords[i][0] = x;
+            model->coords[i][1] = y;
+            ++i;
+          }
+          break;
+        case AOM_NOISE_SHAPE_SQUARE:
+          model->coords[i][0] = x;
+          model->coords[i][1] = y;
+          ++i;
+          break;
+        default:
+          fprintf(stderr, "Invalid shape\n");
+          aom_noise_model_free(model);
+          return 0;
+      }
+    }
+  }
+  assert(i == n);
+  return 1;
+}
+
+void aom_noise_model_free(aom_noise_model_t *model) {
+  int c = 0;
+  if (!model) return;
+
+  aom_free(model->coords);
+  for (c = 0; c < 3; ++c) {
+    equation_system_free(&model->latest_state[c].eqns);
+    equation_system_free(&model->combined_state[c].eqns);
+
+    equation_system_free(&model->latest_state[c].strength_solver.eqns);
+    equation_system_free(&model->combined_state[c].strength_solver.eqns);
+  }
+  memset(model, 0, sizeof(*model));
+}
+
+// Extracts the neighborhood defined by coords around point (x, y) from
+// the difference between the data and denoised images. Also extracts the
+// entry (possibly downsampled) for (x, y) in the alt_data (e.g., luma).
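+// Each call fills one least-squares row for the AR model
+//   residual(x, y) ~= sum_i coeffs[i] * residual(x + coords[i][0],
+//                                                y + coords[i][1])
+// (plus, when fitting chroma, one extra term for the co-located luma
+// residual) and returns residual(x, y) itself as the right-hand-side value
+// for that row.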
+#define EXTRACT_AR_ROW(INT_TYPE, suffix) \ + static double extract_ar_row_##suffix( \ + int(*coords)[2], int num_coords, const INT_TYPE *const data, \ + const INT_TYPE *const denoised, int stride, int sub_log2[2], \ + const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised, \ + int alt_stride, int x, int y, double *buffer) { \ + for (int i = 0; i < num_coords; ++i) { \ + const int x_i = x + coords[i][0], y_i = y + coords[i][1]; \ + buffer[i] = \ + (double)data[y_i * stride + x_i] - denoised[y_i * stride + x_i]; \ + } \ + const double val = \ + (double)data[y * stride + x] - denoised[y * stride + x]; \ + \ + if (alt_data && alt_denoised) { \ + double avg_data = 0, avg_denoised = 0; \ + int num_samples = 0; \ + for (int dy_i = 0; dy_i < (1 << sub_log2[1]); dy_i++) { \ + const int y_up = (y << sub_log2[1]) + dy_i; \ + for (int dx_i = 0; dx_i < (1 << sub_log2[0]); dx_i++) { \ + const int x_up = (x << sub_log2[0]) + dx_i; \ + avg_data += alt_data[y_up * alt_stride + x_up]; \ + avg_denoised += alt_denoised[y_up * alt_stride + x_up]; \ + num_samples++; \ + } \ + } \ + buffer[num_coords] = (avg_data - avg_denoised) / num_samples; \ + } \ + return val; \ + } + +EXTRACT_AR_ROW(uint8_t, lowbd); +EXTRACT_AR_ROW(uint16_t, highbd); + +static int add_block_observations( + aom_noise_model_t *noise_model, int c, const uint8_t *const data, + const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2], + const uint8_t *const alt_data, const uint8_t *const alt_denoised, + int alt_stride, const uint8_t *const flat_blocks, int block_size, + int num_blocks_w, int num_blocks_h) { + const int lag = noise_model->params.lag; + const int num_coords = noise_model->n; + const double normalization = (1 << noise_model->params.bit_depth) - 1; + double *A = noise_model->latest_state[c].eqns.A; + double *b = noise_model->latest_state[c].eqns.b; + double *buffer = (double *)aom_malloc(sizeof(*buffer) * (num_coords + 1)); + const int n = noise_model->latest_state[c].eqns.n; + + if (!buffer) { + fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1); + return 0; + } + for (int by = 0; by < num_blocks_h; ++by) { + const int y_o = by * (block_size >> sub_log2[1]); + for (int bx = 0; bx < num_blocks_w; ++bx) { + const int x_o = bx * (block_size >> sub_log2[0]); + if (!flat_blocks[by * num_blocks_w + bx]) { + continue; + } + int y_start = + (by > 0 && flat_blocks[(by - 1) * num_blocks_w + bx]) ? 0 : lag; + int x_start = + (bx > 0 && flat_blocks[by * num_blocks_w + bx - 1]) ? 0 : lag; + int y_end = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]), + block_size >> sub_log2[1]); + int x_end = AOMMIN( + (w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]) - lag, + (bx + 1 < num_blocks_w && flat_blocks[by * num_blocks_w + bx + 1]) + ? (block_size >> sub_log2[0]) + : ((block_size >> sub_log2[0]) - lag)); + for (int y = y_start; y < y_end; ++y) { + for (int x = x_start; x < x_end; ++x) { + const double val = + noise_model->params.use_highbd + ? 
extract_ar_row_highbd(noise_model->coords, num_coords, + (const uint16_t *const)data, + (const uint16_t *const)denoised, + stride, sub_log2, + (const uint16_t *const)alt_data, + (const uint16_t *const)alt_denoised, + alt_stride, x + x_o, y + y_o, buffer) + : extract_ar_row_lowbd(noise_model->coords, num_coords, data, + denoised, stride, sub_log2, alt_data, + alt_denoised, alt_stride, x + x_o, + y + y_o, buffer); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + A[i * n + j] += + (buffer[i] * buffer[j]) / (normalization * normalization); + } + b[i] += (buffer[i] * val) / (normalization * normalization); + } + noise_model->latest_state[c].num_observations++; + } + } + } + } + aom_free(buffer); + return 1; +} + +static void add_noise_std_observations( + aom_noise_model_t *noise_model, int c, const double *coeffs, + const uint8_t *const data, const uint8_t *const denoised, int w, int h, + int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride, + const uint8_t *const flat_blocks, int block_size, int num_blocks_w, + int num_blocks_h) { + const int num_coords = noise_model->n; + aom_noise_strength_solver_t *noise_strength_solver = + &noise_model->latest_state[c].strength_solver; + + const aom_noise_strength_solver_t *noise_strength_luma = + &noise_model->latest_state[0].strength_solver; + const double luma_gain = noise_model->latest_state[0].ar_gain; + const double noise_gain = noise_model->latest_state[c].ar_gain; + for (int by = 0; by < num_blocks_h; ++by) { + const int y_o = by * (block_size >> sub_log2[1]); + for (int bx = 0; bx < num_blocks_w; ++bx) { + const int x_o = bx * (block_size >> sub_log2[0]); + if (!flat_blocks[by * num_blocks_w + bx]) { + continue; + } + const int num_samples_h = + AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]), + block_size >> sub_log2[1]); + const int num_samples_w = + AOMMIN((w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]), + (block_size >> sub_log2[0])); + // Make sure that we have a reasonable amount of samples to consider the + // block + if (num_samples_w * num_samples_h > block_size) { + const double block_mean = get_block_mean( + alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride, + x_o << sub_log2[0], y_o << sub_log2[1], block_size, + noise_model->params.use_highbd); + const double noise_var = get_noise_var( + data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o, + y_o, block_size >> sub_log2[0], block_size >> sub_log2[1], + noise_model->params.use_highbd); + // We want to remove the part of the noise that came from being + // correlated with luma. Note that the noise solver for luma must + // have already been run. + const double luma_strength = + c > 0 ? luma_gain * noise_strength_solver_get_value( + noise_strength_luma, block_mean) + : 0; + const double corr = c > 0 ? coeffs[num_coords] : 0; + // Chroma noise: + // N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2) + // The uncorrelated component: + // uncorr_var = noise_var - (corr * luma_strength)^2 + // But don't allow fully correlated noise (hence the max), since the + // synthesis cannot model it. + const double uncorr_std = sqrt( + AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2))); + // After we've removed correlation with luma, undo the gain that will + // come from running the IIR filter. 
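+        // The synthesized grain is driving noise passed through the AR (IIR)
+        // filter, which scales its standard deviation by roughly ar_gain; so
+        // dividing the measured strength by noise_gain here means synthesis
+        // ends up reproducing approximately uncorr_std.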
+        const double adjusted_strength = uncorr_std / noise_gain;
+        aom_noise_strength_solver_add_measurement(
+            noise_strength_solver, block_mean, adjusted_strength);
+      }
+    }
+  }
+}
+
+// Return true if the noise estimate appears to be different from the combined
+// (multi-frame) estimate. The difference is measured by checking whether the
+// AR coefficients have diverged (using a threshold on normalized cross
+// correlation), or whether the noise strength has changed.
+static int is_noise_model_different(aom_noise_model_t *const noise_model) {
+  // These thresholds are kind of arbitrary and will likely need further tuning
+  // (or exported as parameters). The threshold on noise strength is a weighted
+  // difference between the noise strength histograms.
+  const double kCoeffThreshold = 0.9;
+  const double kStrengthThreshold =
+      0.005 * (1 << (noise_model->params.bit_depth - 8));
+  for (int c = 0; c < 1; ++c) {
+    const double corr =
+        aom_normalized_cross_correlation(noise_model->latest_state[c].eqns.x,
+                                         noise_model->combined_state[c].eqns.x,
+                                         noise_model->combined_state[c].eqns.n);
+    if (corr < kCoeffThreshold) return 1;
+
+    const double dx =
+        1.0 / noise_model->latest_state[c].strength_solver.num_bins;
+
+    const aom_equation_system_t *latest_eqns =
+        &noise_model->latest_state[c].strength_solver.eqns;
+    const aom_equation_system_t *combined_eqns =
+        &noise_model->combined_state[c].strength_solver.eqns;
+    double diff = 0;
+    double total_weight = 0;
+    for (int j = 0; j < latest_eqns->n; ++j) {
+      double weight = 0;
+      for (int i = 0; i < latest_eqns->n; ++i) {
+        weight += latest_eqns->A[i * latest_eqns->n + j];
+      }
+      weight = sqrt(weight);
+      diff += weight * fabs(latest_eqns->x[j] - combined_eqns->x[j]);
+      total_weight += weight;
+    }
+    if (diff * dx / total_weight > kStrengthThreshold) return 1;
+  }
+  return 0;
+}
+
+static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) {
+  const int ret = equation_system_solve(&state->eqns);
+  state->ar_gain = 1.0;
+  if (!ret) return ret;
+
+  // Update the AR gain from the equation system as it will be used to fit
+  // the noise strength as a function of intensity. In the Yule-Walker
+  // equations, the diagonal should be the variance of the correlated noise.
+  // In the case of the least squares estimate, there will be some variability
+  // in the diagonal. So use the mean of the diagonal as the estimate of
+  // overall variance (this works for least squares or Yule-Walker
+  // formulation).
+  double var = 0;
+  const int n = state->eqns.n;
+  for (int i = 0; i < (state->eqns.n - is_chroma); ++i) {
+    var += state->eqns.A[i * n + i] / state->num_observations;
+  }
+  var /= (n - is_chroma);
+
+  // Keep track of E(Y^2) = <b, x> + E(X^2).
+  // In the case that we are using chroma and have an estimate of correlation
+  // with luma, we adjust that estimate slightly to remove the correlated bits
+  // by subtracting out the last column of A scaled by our correlation estimate
+  // from b: E(y^2) = <b - A(:, end) * x(end), x>.
+  double sum_covar = 0;
+  for (int i = 0; i < state->eqns.n - is_chroma; ++i) {
+    double bi = state->eqns.b[i];
+    if (is_chroma) {
+      bi -= state->eqns.A[i * n + (n - 1)] * state->eqns.x[n - 1];
+    }
+    sum_covar += (bi * state->eqns.x[i]) / state->num_observations;
+  }
+  // Now, get an estimate of the variance of the uncorrelated noise signal and
+  // use it to determine the gain of the AR filter.
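+  // The gain is sqrt(var / noise_var), clamped to at least 1. As a
+  // hypothetical example, var = 2.0 with sum_covar = 1.5 leaves
+  // noise_var = 0.5, so ar_gain = sqrt(2.0 / 0.5) = 2.0.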
+ const double noise_var = AOMMAX(var - sum_covar, 1e-6); + state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6))); + return ret; +} + +aom_noise_status_t aom_noise_model_update( + aom_noise_model_t *const noise_model, const uint8_t *const data[3], + const uint8_t *const denoised[3], int w, int h, int stride[3], + int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) { + const int num_blocks_w = (w + block_size - 1) / block_size; + const int num_blocks_h = (h + block_size - 1) / block_size; + int y_model_different = 0; + int num_blocks = 0; + int i = 0, channel = 0; + + if (block_size <= 1) { + fprintf(stderr, "block_size = %d must be > 1\n", block_size); + return AOM_NOISE_STATUS_INVALID_ARGUMENT; + } + + if (block_size < noise_model->params.lag * 2 + 1) { + fprintf(stderr, "block_size = %d must be >= %d\n", block_size, + noise_model->params.lag * 2 + 1); + return AOM_NOISE_STATUS_INVALID_ARGUMENT; + } + + // Clear the latest equation system + for (i = 0; i < 3; ++i) { + equation_system_clear(&noise_model->latest_state[i].eqns); + noise_model->latest_state[i].num_observations = 0; + noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver); + } + + // Check that we have enough flat blocks + for (i = 0; i < num_blocks_h * num_blocks_w; ++i) { + if (flat_blocks[i]) { + num_blocks++; + } + } + + if (num_blocks <= 1) { + fprintf(stderr, "Not enough flat blocks to update noise estimate\n"); + return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS; + } + + for (channel = 0; channel < 3; ++channel) { + int no_subsampling[2] = { 0, 0 }; + const uint8_t *alt_data = channel > 0 ? data[0] : 0; + const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0; + int *sub = channel > 0 ? chroma_sub_log2 : no_subsampling; + const int is_chroma = channel != 0; + if (!data[channel] || !denoised[channel]) break; + if (!add_block_observations(noise_model, channel, data[channel], + denoised[channel], w, h, stride[channel], sub, + alt_data, alt_denoised, stride[0], flat_blocks, + block_size, num_blocks_w, num_blocks_h)) { + fprintf(stderr, "Adding block observation failed\n"); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + + if (!ar_equation_system_solve(&noise_model->latest_state[channel], + is_chroma)) { + if (is_chroma) { + set_chroma_coefficient_fallback_soln( + &noise_model->latest_state[channel].eqns); + } else { + fprintf(stderr, "Solving latest noise equation system failed %d!\n", + channel); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + } + + add_noise_std_observations( + noise_model, channel, noise_model->latest_state[channel].eqns.x, + data[channel], denoised[channel], w, h, stride[channel], sub, alt_data, + stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h); + + if (!aom_noise_strength_solver_solve( + &noise_model->latest_state[channel].strength_solver)) { + fprintf(stderr, "Solving latest noise strength failed!\n"); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + + // Check noise characteristics and return if error. + if (channel == 0 && + noise_model->combined_state[channel].strength_solver.num_equations > + 0 && + is_noise_model_different(noise_model)) { + y_model_different = 1; + } + + // Don't update the combined stats if the y model is different. 
+ if (y_model_different) continue; + + noise_model->combined_state[channel].num_observations += + noise_model->latest_state[channel].num_observations; + equation_system_add(&noise_model->combined_state[channel].eqns, + &noise_model->latest_state[channel].eqns); + if (!ar_equation_system_solve(&noise_model->combined_state[channel], + is_chroma)) { + if (is_chroma) { + set_chroma_coefficient_fallback_soln( + &noise_model->combined_state[channel].eqns); + } else { + fprintf(stderr, "Solving combined noise equation system failed %d!\n", + channel); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + } + + noise_strength_solver_add( + &noise_model->combined_state[channel].strength_solver, + &noise_model->latest_state[channel].strength_solver); + + if (!aom_noise_strength_solver_solve( + &noise_model->combined_state[channel].strength_solver)) { + fprintf(stderr, "Solving combined noise strength failed!\n"); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + } + + return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE + : AOM_NOISE_STATUS_OK; +} + +void aom_noise_model_save_latest(aom_noise_model_t *noise_model) { + for (int c = 0; c < 3; c++) { + equation_system_copy(&noise_model->combined_state[c].eqns, + &noise_model->latest_state[c].eqns); + equation_system_copy(&noise_model->combined_state[c].strength_solver.eqns, + &noise_model->latest_state[c].strength_solver.eqns); + noise_model->combined_state[c].strength_solver.num_equations = + noise_model->latest_state[c].strength_solver.num_equations; + noise_model->combined_state[c].num_observations = + noise_model->latest_state[c].num_observations; + noise_model->combined_state[c].ar_gain = + noise_model->latest_state[c].ar_gain; + } +} + +int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, + aom_film_grain_t *film_grain) { + if (noise_model->params.lag > 3) { + fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag); + return 0; + } + uint16_t random_seed = film_grain->random_seed; + memset(film_grain, 0, sizeof(*film_grain)); + film_grain->random_seed = random_seed; + + film_grain->apply_grain = 1; + film_grain->update_parameters = 1; + + film_grain->ar_coeff_lag = noise_model->params.lag; + + // Convert the scaling functions to 8 bit values + aom_noise_strength_lut_t scaling_points[3]; + aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0); + aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1); + aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2); + + // Both the domain and the range of the scaling functions in the film_grain + // are normalized to 8-bit (e.g., they are implicitly scaled during grain + // synthesis). 
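+  // E.g. (hypothetical values): at bit_depth = 10 the divisor below is
+  // 1 << 2 = 4, so a fitted point (840.0, 13.2) maps to (210.0, 3.3) in the
+  // 8-bit domain.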
+ const double strength_divisor = 1 << (noise_model->params.bit_depth - 8); + double max_scaling_value = 1e-4; + for (int c = 0; c < 3; ++c) { + for (int i = 0; i < scaling_points[c].num_points; ++i) { + scaling_points[c].points[i][0] = + AOMMIN(255, scaling_points[c].points[i][0] / strength_divisor); + scaling_points[c].points[i][1] = + AOMMIN(255, scaling_points[c].points[i][1] / strength_divisor); + max_scaling_value = + AOMMAX(scaling_points[c].points[i][1], max_scaling_value); + } + } + + // Scaling_shift values are in the range [8,11] + const int max_scaling_value_log2 = + clamp((int)floor(log2(max_scaling_value) + 1), 2, 5); + film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2); + + const double scale_factor = 1 << (8 - max_scaling_value_log2); + film_grain->num_y_points = scaling_points[0].num_points; + film_grain->num_cb_points = scaling_points[1].num_points; + film_grain->num_cr_points = scaling_points[2].num_points; + + int(*film_grain_scaling[3])[2] = { + film_grain->scaling_points_y, + film_grain->scaling_points_cb, + film_grain->scaling_points_cr, + }; + for (int c = 0; c < 3; c++) { + for (int i = 0; i < scaling_points[c].num_points; ++i) { + film_grain_scaling[c][i][0] = (int)(scaling_points[c].points[i][0] + 0.5); + film_grain_scaling[c][i][1] = clamp( + (int)(scale_factor * scaling_points[c].points[i][1] + 0.5), 0, 255); + } + } + aom_noise_strength_lut_free(scaling_points + 0); + aom_noise_strength_lut_free(scaling_points + 1); + aom_noise_strength_lut_free(scaling_points + 2); + + // Convert the ar_coeffs into 8-bit values + const int n_coeff = noise_model->combined_state[0].eqns.n; + double max_coeff = 1e-4, min_coeff = -1e-4; + double y_corr[2] = { 0, 0 }; + double avg_luma_strength = 0; + for (int c = 0; c < 3; c++) { + aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns; + for (int i = 0; i < n_coeff; ++i) { + max_coeff = AOMMAX(max_coeff, eqns->x[i]); + min_coeff = AOMMIN(min_coeff, eqns->x[i]); + } + // Since the correlation between luma/chroma was computed in an already + // scaled space, we adjust it in the un-scaled space. + aom_noise_strength_solver_t *solver = + &noise_model->combined_state[c].strength_solver; + // Compute a weighted average of the strength for the channel. 
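+    // That is, average_strength = sum_i(w_i * x_i) / sum_i(w_i) with
+    // w_i = sqrt(sum_j A[i][j]), so bins backed by more observations
+    // contribute more to the average.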
+ double average_strength = 0, total_weight = 0; + for (int i = 0; i < solver->eqns.n; ++i) { + double w = 0; + for (int j = 0; j < solver->eqns.n; ++j) { + w += solver->eqns.A[i * solver->eqns.n + j]; + } + w = sqrt(w); + average_strength += solver->eqns.x[i] * w; + total_weight += w; + } + if (total_weight == 0) + average_strength = 1; + else + average_strength /= total_weight; + if (c == 0) { + avg_luma_strength = average_strength; + } else { + y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength; + max_coeff = AOMMAX(max_coeff, y_corr[c - 1]); + min_coeff = AOMMIN(min_coeff, y_corr[c - 1]); + } + } + // Shift value: AR coeffs range (values 6-9) + // 6: [-2, 2), 7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25) + film_grain->ar_coeff_shift = + clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))), + 6, 9); + double scale_ar_coeff = 1 << film_grain->ar_coeff_shift; + int *ar_coeffs[3] = { + film_grain->ar_coeffs_y, + film_grain->ar_coeffs_cb, + film_grain->ar_coeffs_cr, + }; + for (int c = 0; c < 3; ++c) { + aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns; + for (int i = 0; i < n_coeff; ++i) { + ar_coeffs[c][i] = + clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127); + } + if (c > 0) { + ar_coeffs[c][n_coeff] = + clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127); + } + } + + // At the moment, the noise modeling code assumes that the chroma scaling + // functions are a function of luma. + film_grain->cb_mult = 128; // 8 bits + film_grain->cb_luma_mult = 192; // 8 bits + film_grain->cb_offset = 256; // 9 bits + + film_grain->cr_mult = 128; // 8 bits + film_grain->cr_luma_mult = 192; // 8 bits + film_grain->cr_offset = 256; // 9 bits + + film_grain->chroma_scaling_from_luma = 0; + film_grain->grain_scale_shift = 0; + film_grain->overlap_flag = 1; + return 1; +} + +static void pointwise_multiply(const float *a, float *b, int n) { + for (int i = 0; i < n; ++i) { + b[i] *= a[i]; + } +} + +static float *get_half_cos_window(int block_size) { + float *window_function = + (float *)aom_malloc(block_size * block_size * sizeof(*window_function)); + for (int y = 0; y < block_size; ++y) { + const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2); + for (int x = 0; x < block_size; ++x) { + const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2); + window_function[y * block_size + x] = (float)(cos_yd * cos_xd); + } + } + return window_function; +} + +#define DITHER_AND_QUANTIZE(INT_TYPE, suffix) \ + static void dither_and_quantize_##suffix( \ + float *result, int result_stride, INT_TYPE *denoised, int w, int h, \ + int stride, int chroma_sub_w, int chroma_sub_h, int block_size, \ + float block_normalization) { \ + for (int y = 0; y < (h >> chroma_sub_h); ++y) { \ + for (int x = 0; x < (w >> chroma_sub_w); ++x) { \ + const int result_idx = \ + (y + (block_size >> chroma_sub_h)) * result_stride + x + \ + (block_size >> chroma_sub_w); \ + INT_TYPE new_val = (INT_TYPE)AOMMIN( \ + AOMMAX(result[result_idx] * block_normalization + 0.5f, 0), \ + block_normalization); \ + const float err = \ + -(((float)new_val) / block_normalization - result[result_idx]); \ + denoised[y * stride + x] = new_val; \ + if (x + 1 < (w >> chroma_sub_w)) { \ + result[result_idx + 1] += err * 7.0f / 16.0f; \ + } \ + if (y + 1 < (h >> chroma_sub_h)) { \ + if (x > 0) { \ + result[result_idx + result_stride - 1] += err * 3.0f / 16.0f; \ + } \ + result[result_idx + result_stride] += err * 5.0f / 16.0f; \ + if (x + 1 < (w >> chroma_sub_w)) { \ + 
result[result_idx + result_stride + 1] += err * 1.0f / 16.0f; \ + } \ + } \ + } \ + } \ + } + +DITHER_AND_QUANTIZE(uint8_t, lowbd); +DITHER_AND_QUANTIZE(uint16_t, highbd); + +int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], + int w, int h, int stride[3], int chroma_sub[2], + float *noise_psd[3], int block_size, int bit_depth, + int use_highbd) { + float *plane = NULL, *block = NULL, *window_full = NULL, + *window_chroma = NULL; + double *block_d = NULL, *plane_d = NULL; + struct aom_noise_tx_t *tx_full = NULL; + struct aom_noise_tx_t *tx_chroma = NULL; + const int num_blocks_w = (w + block_size - 1) / block_size; + const int num_blocks_h = (h + block_size - 1) / block_size; + const int result_stride = (num_blocks_w + 2) * block_size; + const int result_height = (num_blocks_h + 2) * block_size; + float *result = NULL; + int init_success = 1; + aom_flat_block_finder_t block_finder_full; + aom_flat_block_finder_t block_finder_chroma; + const float kBlockNormalization = (float)((1 << bit_depth) - 1); + if (chroma_sub[0] != chroma_sub[1]) { + fprintf(stderr, + "aom_wiener_denoise_2d doesn't handle different chroma " + "subsampling"); + return 0; + } + init_success &= aom_flat_block_finder_init(&block_finder_full, block_size, + bit_depth, use_highbd); + result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride * + sizeof(*result)); + plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane)); + block = + (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block)); + block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d)); + plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d)); + window_full = get_half_cos_window(block_size); + tx_full = aom_noise_tx_malloc(block_size); + + if (chroma_sub[0] != 0) { + init_success &= aom_flat_block_finder_init(&block_finder_chroma, + block_size >> chroma_sub[0], + bit_depth, use_highbd); + window_chroma = get_half_cos_window(block_size >> chroma_sub[0]); + tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]); + } else { + window_chroma = window_full; + tx_chroma = tx_full; + } + + init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) && + (plane_d != NULL) && (block != NULL) && (block_d != NULL) && + (window_full != NULL) && (window_chroma != NULL) && + (result != NULL); + for (int c = init_success ? 0 : 3; c < 3; ++c) { + float *window_function = c == 0 ? window_full : window_chroma; + aom_flat_block_finder_t *block_finder = &block_finder_full; + const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0; + const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0; + struct aom_noise_tx_t *tx = + (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full; + if (!data[c] || !denoised[c]) continue; + if (c > 0 && chroma_sub[0] != 0) { + block_finder = &block_finder_chroma; + } + memset(result, 0, sizeof(*result) * result_stride * result_height); + // Do overlapped block processing (half overlapped). The block rows can + // easily be done in parallel + for (int offsy = 0; offsy < (block_size >> chroma_sub_h); + offsy += (block_size >> chroma_sub_h) / 2) { + for (int offsx = 0; offsx < (block_size >> chroma_sub_w); + offsx += (block_size >> chroma_sub_w) / 2) { + // Pad the boundary when processing each block-set. 
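+      // Because the half-cosine window is applied both before the forward
+      // transform and again at composition, each output pixel accumulates
+      // squared-window contributions from the four half-overlapped passes,
+      // which sum to one, so no further renormalization is needed.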
+ for (int by = -1; by < num_blocks_h; ++by) { + for (int bx = -1; bx < num_blocks_w; ++bx) { + const int pixels_per_block = + (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h); + aom_flat_block_finder_extract_block( + block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h, + stride[c], bx * (block_size >> chroma_sub_w) + offsx, + by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d); + for (int j = 0; j < pixels_per_block; ++j) { + block[j] = (float)block_d[j]; + plane[j] = (float)plane_d[j]; + } + pointwise_multiply(window_function, block, pixels_per_block); + aom_noise_tx_forward(tx, block); + aom_noise_tx_filter(tx, noise_psd[c]); + aom_noise_tx_inverse(tx, block); + + // Apply window function to the plane approximation (we will apply + // it to the sum of plane + block when composing the results). + pointwise_multiply(window_function, plane, pixels_per_block); + + for (int y = 0; y < (block_size >> chroma_sub_h); ++y) { + const int y_result = + y + (by + 1) * (block_size >> chroma_sub_h) + offsy; + for (int x = 0; x < (block_size >> chroma_sub_w); ++x) { + const int x_result = + x + (bx + 1) * (block_size >> chroma_sub_w) + offsx; + result[y_result * result_stride + x_result] += + (block[y * (block_size >> chroma_sub_w) + x] + + plane[y * (block_size >> chroma_sub_w) + x]) * + window_function[y * (block_size >> chroma_sub_w) + x]; + } + } + } + } + } + } + if (use_highbd) { + dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c], + w, h, stride[c], chroma_sub_w, chroma_sub_h, + block_size, kBlockNormalization); + } else { + dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h, + stride[c], chroma_sub_w, chroma_sub_h, + block_size, kBlockNormalization); + } + } + aom_free(result); + aom_free(plane); + aom_free(block); + aom_free(plane_d); + aom_free(block_d); + aom_free(window_full); + + aom_noise_tx_free(tx_full); + + aom_flat_block_finder_free(&block_finder_full); + if (chroma_sub[0] != 0) { + aom_flat_block_finder_free(&block_finder_chroma); + aom_free(window_chroma); + aom_noise_tx_free(tx_chroma); + } + return init_success; +} + +struct aom_denoise_and_model_t { + int block_size; + int bit_depth; + float noise_level; + + // Size of current denoised buffer and flat_block buffer + int width; + int height; + int y_stride; + int uv_stride; + int num_blocks_w; + int num_blocks_h; + + // Buffers for image and noise_psd allocated on the fly + float *noise_psd[3]; + uint8_t *denoised[3]; + uint8_t *flat_blocks; + + aom_flat_block_finder_t flat_block_finder; + aom_noise_model_t noise_model; +}; + +struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth, + int block_size, + float noise_level) { + struct aom_denoise_and_model_t *ctx = + (struct aom_denoise_and_model_t *)aom_malloc( + sizeof(struct aom_denoise_and_model_t)); + if (!ctx) { + fprintf(stderr, "Unable to allocate denoise_and_model struct\n"); + return NULL; + } + memset(ctx, 0, sizeof(*ctx)); + + ctx->block_size = block_size; + ctx->noise_level = noise_level; + ctx->bit_depth = bit_depth; + + ctx->noise_psd[0] = + aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size); + ctx->noise_psd[1] = + aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size); + ctx->noise_psd[2] = + aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size); + if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) { + fprintf(stderr, "Unable to allocate noise PSD buffers\n"); + aom_denoise_and_model_free(ctx); + return NULL; + } + return ctx; +} + 
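+// A minimal usage sketch of the public entry points above (illustrative
+// only; get_next_frame is a hypothetical caller-side helper, not part of
+// libaom). On success the input buffer is overwritten with the denoised
+// frame and `grain` holds the estimated film grain parameters:
+//
+//   struct aom_denoise_and_model_t *ctx =
+//       aom_denoise_and_model_alloc(/*bit_depth=*/8, /*block_size=*/32,
+//                                   /*noise_level=*/2.5f);
+//   aom_film_grain_t grain = { 0 };
+//   YV12_BUFFER_CONFIG *frame;
+//   while (ctx && (frame = get_next_frame()) != NULL) {
+//     if (!aom_denoise_and_model_run(ctx, frame, &grain)) break;
+//     if (grain.apply_grain) {
+//       // Signal `grain` in the bitstream for synthesis at decode time.
+//     }
+//   }
+//   aom_denoise_and_model_free(ctx);
+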
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) {
+  aom_free(ctx->flat_blocks);
+  for (int i = 0; i < 3; ++i) {
+    aom_free(ctx->denoised[i]);
+    aom_free(ctx->noise_psd[i]);
+  }
+  aom_noise_model_free(&ctx->noise_model);
+  aom_flat_block_finder_free(&ctx->flat_block_finder);
+  aom_free(ctx);
+}
+
+static int denoise_and_model_realloc_if_necessary(
+    struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) {
+  if (ctx->width == sd->y_width && ctx->height == sd->y_height &&
+      ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride)
+    return 1;
+  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  const int block_size = ctx->block_size;
+
+  ctx->width = sd->y_width;
+  ctx->height = sd->y_height;
+  ctx->y_stride = sd->y_stride;
+  ctx->uv_stride = sd->uv_stride;
+
+  for (int i = 0; i < 3; ++i) {
+    aom_free(ctx->denoised[i]);
+    ctx->denoised[i] = NULL;
+  }
+  aom_free(ctx->flat_blocks);
+  ctx->flat_blocks = NULL;
+
+  ctx->denoised[0] = aom_malloc((sd->y_stride * sd->y_height) << use_highbd);
+  ctx->denoised[1] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+  ctx->denoised[2] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+  if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) {
+    fprintf(stderr, "Unable to allocate denoise buffers\n");
+    return 0;
+  }
+  ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size;
+  ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size;
+  ctx->flat_blocks = aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h);
+
+  aom_flat_block_finder_free(&ctx->flat_block_finder);
+  if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size,
+                                  ctx->bit_depth, use_highbd)) {
+    fprintf(stderr, "Unable to init flat block finder\n");
+    return 0;
+  }
+
+  const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+                                            ctx->bit_depth, use_highbd };
+  aom_noise_model_free(&ctx->noise_model);
+  if (!aom_noise_model_init(&ctx->noise_model, params)) {
+    fprintf(stderr, "Unable to init noise model\n");
+    return 0;
+  }
+
+  // Simply use a flat PSD (although we could use the flat blocks to estimate
+  // an actual noise PSD).
+  const float y_noise_level =
+      aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level);
+  const float uv_noise_level = aom_noise_psd_get_default_value(
+      ctx->block_size >> sd->subsampling_x, ctx->noise_level);
+  for (int i = 0; i < block_size * block_size; ++i) {
+    ctx->noise_psd[0][i] = y_noise_level;
+    ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level;
+  }
+  return 1;
+}
+
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+                              YV12_BUFFER_CONFIG *sd,
+                              aom_film_grain_t *film_grain) {
+  const int block_size = ctx->block_size;
+  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  uint8_t *raw_data[3] = {
+    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer,
+    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer,
+    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer,
+  };
+  const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] };
+  int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride };
+  int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y };
+
+  if (!denoise_and_model_realloc_if_necessary(ctx, sd)) {
+    fprintf(stderr, "Unable to realloc buffers\n");
+    return 0;
+  }
+
+  aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width,
+                            sd->y_height, strides[0], ctx->flat_blocks);
+
+  if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height,
+                             strides, chroma_sub_log2, ctx->noise_psd,
+                             block_size, ctx->bit_depth, use_highbd)) {
+    fprintf(stderr, "Unable to denoise image\n");
+    return 0;
+  }
+
+  const aom_noise_status_t status = aom_noise_model_update(
+      &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised,
+      sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks,
+      block_size);
+  int have_noise_estimate = 0;
+  if (status == AOM_NOISE_STATUS_OK) {
+    have_noise_estimate = 1;
+  } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) {
+    aom_noise_model_save_latest(&ctx->noise_model);
+    have_noise_estimate = 1;
+  } else {
+    // Unable to update noise model; proceed if we have a previous estimate.
+    have_noise_estimate =
+        (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0);
+  }
+
+  film_grain->apply_grain = 0;
+  if (have_noise_estimate) {
+    if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) {
+      fprintf(stderr, "Unable to get grain parameters.\n");
+      return 0;
+    }
+    if (!film_grain->random_seed) {
+      film_grain->random_seed = 7391;
+    }
+    memcpy(raw_data[0], ctx->denoised[0],
+           (strides[0] * sd->y_height) << use_highbd);
+    memcpy(raw_data[1], ctx->denoised[1],
+           (strides[1] * sd->uv_height) << use_highbd);
+    memcpy(raw_data[2], ctx->denoised[2],
+           (strides[2] * sd->uv_height) << use_highbd);
+  }
+  return 1;
+}
diff --git a/libs/libaom/src/aom_dsp/noise_model.h b/libs/libaom/src/aom_dsp/noise_model.h
new file mode 100644
index 000000000..5e7de9bf2
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/noise_model.h
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_NOISE_MODEL_H_
+#define AOM_AOM_DSP_NOISE_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#include <stdint.h>
+#include "aom_dsp/grain_synthesis.h"
+#include "aom_scale/yv12config.h"
+
+/*!\brief Wrapper of data required to represent linear system of eqns and soln.
+ */
+typedef struct {
+  double *A;
+  double *b;
+  double *x;
+  int n;
+} aom_equation_system_t;
+
+/*!\brief Representation of a piecewise linear curve
+ *
+ * Holds n points as (x, y) pairs, that store the curve.
+ */
+typedef struct {
+  double (*points)[2];
+  int num_points;
+} aom_noise_strength_lut_t;
+
+/*!\brief Init the noise strength lut with the given number of points*/
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points);
+
+/*!\brief Frees the noise strength lut.
+ */
+void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut);
+
+/*!\brief Evaluate the lut at the point x.
+ *
+ * \param[in] lut The lut data.
+ * \param[in] x   The coordinate to evaluate the lut.
+ */
+double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
+                                   double x);
+
+/*!\brief Helper struct to model noise strength as a function of intensity.
+ *
+ * Internally, this structure holds a representation of a linear system
+ * of equations that models noise strength (standard deviation) as a
+ * function of intensity. The mapping is initially stored using a
+ * piecewise representation with evenly spaced bins that cover the entire
+ * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a
+ * constraint of the form:
+ *   y_{i} (1 - a) + y_{i+1} a = y
+ * where y_{i} is the value of bin i and x_{i} <= x <= x_{i+1} and
+ * a = (x - x_{i}) / (x_{i+1} - x_{i}). The equation system holds the
+ * corresponding normal equations.
+ *
+ * As there may be missing data, the solution is regularized to get a
+ * complete set of values for the bins. A reduced representation after
+ * solving can be obtained by getting the corresponding noise_strength_lut_t.
+ */
+typedef struct {
+  aom_equation_system_t eqns;
+  double min_intensity;
+  double max_intensity;
+  int num_bins;
+  int num_equations;
+  double total;
+} aom_noise_strength_solver_t;
+
+/*!\brief Initializes the noise solver with the given number of bins.
+ *
+ * Returns 0 if initialization fails.
+ *
+ * \param[in] solver    The noise solver to be initialized.
+ * \param[in] num_bins  Number of bins to use in the internal representation.
+ * \param[in] bit_depth The bit depth used to derive {min,max}_intensity.
+ */
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+                                   int num_bins, int bit_depth);
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver);
+
+/*!\brief Gets the x coordinate of bin i.
+ *
+ * \param[in] i  The bin whose coordinate to query.
+ */
+double aom_noise_strength_solver_get_center(
+    const aom_noise_strength_solver_t *solver, int i);
+
+/*!\brief Add an observation of the block mean intensity to its noise strength.
+ *
+ * \param[in] block_mean The average block intensity.
+ * \param[in] noise_std  The observed noise strength.
+ */
+void aom_noise_strength_solver_add_measurement(
+    aom_noise_strength_solver_t *solver, double block_mean, double noise_std);
+
+/*!\brief Solves the current set of equations for the noise strength. */
+int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver);
+
+/*!\brief Fits a reduced piecewise linear lut to the internal solution.
+ *
+ * \param[in]  max_num_points The maximum number of output points
+ * \param[out] lut            The output piecewise linear lut.
+ */
+int aom_noise_strength_solver_fit_piecewise(
+    const aom_noise_strength_solver_t *solver, int max_num_points,
+    aom_noise_strength_lut_t *lut);
+
+/*!\brief Helper for holding precomputed data for finding flat blocks.
+ *
+ * Internally a block is modeled with a low-order polynomial model. A
+ * planar model would be a bunch of equations like:
+ *   <[y_i x_i 1], [a_1, a_2, a_3]> = b_i
+ * for each point in the block. The system matrix A with row i as [y_i x_i 1]
+ * is maintained as is the inverse, inv(A'*A), so that the plane parameters
+ * can be fit for each block.
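+ *
+ * The least-squares plane parameters for a block with pixel values b are
+ * then [a_1 a_2 a_3]' = inv(A'*A) * A' * b, a single (precomputable)
+ * matrix product per block, since A depends only on the block size.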
+ */ +typedef struct { + double *AtA_inv; + double *A; + int num_params; // The number of parameters used for internal low-order model + int block_size; // The block size the finder was initialized with + double normalization; // Normalization factor (1 / (2^(bit_depth) - 1)) + int use_highbd; // Whether input data should be interpreted as uint16 +} aom_flat_block_finder_t; + +/*!\brief Init the block_finder with the given block size, bit_depth */ +int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder, + int block_size, int bit_depth, int use_highbd); +void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder); + +/*!\brief Helper to extract a block and low order "planar" model. */ +void aom_flat_block_finder_extract_block( + const aom_flat_block_finder_t *block_finder, const uint8_t *const data, + int w, int h, int stride, int offsx, int offsy, double *plane, + double *block); + +/*!\brief Runs the flat block finder on the input data. + * + * Find flat blocks in the input image data. Returns a map of + * flat_blocks, where the value of flat_blocks map will be non-zero + * when a block is determined to be flat. A higher value indicates a bigger + * confidence in the decision. + */ +int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder, + const uint8_t *const data, int w, int h, + int stride, uint8_t *flat_blocks); + +// The noise shape indicates the allowed coefficients in the AR model. +enum { + AOM_NOISE_SHAPE_DIAMOND = 0, + AOM_NOISE_SHAPE_SQUARE = 1 +} UENUM1BYTE(aom_noise_shape); + +// The parameters of the noise model include the shape type, lag, the +// bit depth of the input images provided, and whether the input images +// will be using uint16 (or uint8) representation. +typedef struct { + aom_noise_shape shape; + int lag; + int bit_depth; + int use_highbd; +} aom_noise_model_params_t; + +/*!\brief State of a noise model estimate for a single channel. + * + * This contains a system of equations that can be used to solve + * for the auto-regressive coefficients as well as a noise strength + * solver that can be used to model noise strength as a function of + * intensity. + */ +typedef struct { + aom_equation_system_t eqns; + aom_noise_strength_solver_t strength_solver; + int num_observations; // The number of observations in the eqn system + double ar_gain; // The gain of the current AR filter +} aom_noise_state_t; + +/*!\brief Complete model of noise for a planar video + * + * This includes a noise model for the latest frame and an aggregated + * estimate over all previous frames that had similar parameters. + */ +typedef struct { + aom_noise_model_params_t params; + aom_noise_state_t combined_state[3]; // Combined state per channel + aom_noise_state_t latest_state[3]; // Latest state per channel + int (*coords)[2]; // Offsets (x,y) of the coefficient samples + int n; // Number of parameters (size of coords) + int bit_depth; +} aom_noise_model_t; + +/*!\brief Result of a noise model update. */ +enum { + AOM_NOISE_STATUS_OK = 0, + AOM_NOISE_STATUS_INVALID_ARGUMENT, + AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS, + AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, + AOM_NOISE_STATUS_INTERNAL_ERROR, +} UENUM1BYTE(aom_noise_status_t); + +/*!\brief Initializes a noise model with the given parameters. + * + * Returns 0 on failure. + */ +int aom_noise_model_init(aom_noise_model_t *model, + const aom_noise_model_params_t params); +void aom_noise_model_free(aom_noise_model_t *model); + +/*!\brief Updates the noise model with a new frame observation. 
+ *
+ * Updates the noise model with measurements from the given input frame and a
+ * denoised variant of it. Noise is sampled from flat blocks using the flat
+ * block map.
+ *
+ * Returns a noise_status indicating if the update was successful. If the
+ * update was successful, the combined_state is updated with measurements from
+ * the provided frame. If status is OK or DIFFERENT_NOISE_TYPE, the latest
+ * noise state will be updated with measurements from the provided frame.
+ *
+ * \param[in,out] noise_model    The noise model to be updated
+ * \param[in] data               Raw frame data
+ * \param[in] denoised           Denoised frame data.
+ * \param[in] w                  Frame width
+ * \param[in] h                  Frame height
+ * \param[in] strides            Stride of the planes
+ * \param[in] chroma_sub_log2    Chroma subsampling for planes != 0.
+ * \param[in] flat_blocks        A map to blocks that have been determined flat
+ * \param[in] block_size         The size of blocks.
+ */
+aom_noise_status_t aom_noise_model_update(
+    aom_noise_model_t *const noise_model, const uint8_t *const data[3],
+    const uint8_t *const denoised[3], int w, int h, int strides[3],
+    int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size);
+
+/*!\brief Save the "latest" estimate into the "combined" estimate.
+ *
+ * This is meant to be called when the noise modeling detected a change
+ * in parameters (or for example, if a user wanted to reset estimation at
+ * a shot boundary).
+ */
+void aom_noise_model_save_latest(aom_noise_model_t *noise_model);
+
+/*!\brief Converts the noise_model parameters to the corresponding
+ * grain_parameters.
+ *
+ * The noise structs in this file are suitable for estimation (e.g., using
+ * floats), but the grain parameters in the bitstream are quantized. This
+ * function does the conversion by selecting the correct quantization levels.
+ */
+int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
+                                         aom_film_grain_t *film_grain);
+
+/*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd.
+ *
+ * \param[in] data               Raw frame data
+ * \param[out] denoised          Denoised frame data
+ * \param[in] w                  Frame width
+ * \param[in] h                  Frame height
+ * \param[in] stride             Stride of the planes
+ * \param[in] chroma_sub_log2    Chroma subsampling for planes != 0.
+ * \param[in] noise_psd          The power spectral density of the noise
+ * \param[in] block_size         The size of blocks
+ * \param[in] bit_depth          Bit depth of the image
+ * \param[in] use_highbd         If true, uint8 pointers are interpreted as
+ *                               uint16 and stride is measured in uint16.
+ *                               This must be true when bit_depth >= 10.
+ */
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+                          int w, int h, int stride[3], int chroma_sub_log2[2],
+                          float *noise_psd[3], int block_size, int bit_depth,
+                          int use_highbd);
+
+struct aom_denoise_and_model_t;
+
+/*!\brief Denoise the buffer and model the residual noise.
+ *
+ * This is meant to be called sequentially on input frames. The input buffer
+ * is denoised and the residual noise is modelled. The current noise estimate
+ * is populated in film_grain. Returns true on success. The grain.apply_grain
+ * parameter will be true when the input buffer was successfully denoised and
+ * grain was modelled. Returns false on error.
+ *
+ * \param[in] ctx          Struct allocated with aom_denoise_and_model_alloc
+ *                         that holds some buffers for denoising and the
+ *                         current noise estimate.
+ * \param[in,out] buf      The raw input buffer to be denoised.
+ * \param[out] grain       Output film grain parameters
+ */
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+                              YV12_BUFFER_CONFIG *buf,
+                              aom_film_grain_t *grain);
+
+/*!\brief Allocates a context that can be used for denoising and noise
+ * modeling.
+ *
+ * \param[in] bit_depth   Bit depth of buffers this will be run on.
+ * \param[in] block_size  Block size for noise modeling and flat block
+ *                        estimation
+ * \param[in] noise_level The noise_level (2.5 for moderate noise, and 5 for
+ *                        higher levels of noise)
+ */
+struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
+                                                            int block_size,
+                                                            float noise_level);
+
+/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc
+ */
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // AOM_AOM_DSP_NOISE_MODEL_H_
diff --git a/libs/libaom/src/aom_dsp/noise_util.c b/libs/libaom/src/aom_dsp/noise_util.c
new file mode 100644
index 000000000..7e7e380c6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/noise_util.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+float aom_noise_psd_get_default_value(int block_size, float factor) {
+  return (factor * factor / 10000) * block_size * block_size / 8;
+}
+
+// Internal representation of noise transform. It keeps track of the
+// transformed data and a temporary working buffer to use during the
+// transform.
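+//
+// The expected call order, as used by aom_wiener_denoise_2d in noise_model.c,
+// is aom_noise_tx_forward(), then aom_noise_tx_filter() against a noise PSD,
+// then aom_noise_tx_inverse().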
+struct aom_noise_tx_t { + float *tx_block; + float *temp; + int block_size; + void (*fft)(const float *, float *, float *); + void (*ifft)(const float *, float *, float *); +}; + +struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) { + struct aom_noise_tx_t *noise_tx = + (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t)); + if (!noise_tx) return NULL; + memset(noise_tx, 0, sizeof(*noise_tx)); + switch (block_size) { + case 2: + noise_tx->fft = aom_fft2x2_float; + noise_tx->ifft = aom_ifft2x2_float; + break; + case 4: + noise_tx->fft = aom_fft4x4_float; + noise_tx->ifft = aom_ifft4x4_float; + break; + case 8: + noise_tx->fft = aom_fft8x8_float; + noise_tx->ifft = aom_ifft8x8_float; + break; + case 16: + noise_tx->fft = aom_fft16x16_float; + noise_tx->ifft = aom_ifft16x16_float; + break; + case 32: + noise_tx->fft = aom_fft32x32_float; + noise_tx->ifft = aom_ifft32x32_float; + break; + default: + aom_free(noise_tx); + fprintf(stderr, "Unsupported block size %d\n", block_size); + return NULL; + } + noise_tx->block_size = block_size; + noise_tx->tx_block = (float *)aom_memalign( + 32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size); + noise_tx->temp = (float *)aom_memalign( + 32, 2 * sizeof(*noise_tx->temp) * block_size * block_size); + if (!noise_tx->tx_block || !noise_tx->temp) { + aom_noise_tx_free(noise_tx); + return NULL; + } + // Clear the buffers up front. Some outputs of the forward transform are + // real only (the imaginary component will never be touched) + memset(noise_tx->tx_block, 0, + 2 * sizeof(*noise_tx->tx_block) * block_size * block_size); + memset(noise_tx->temp, 0, + 2 * sizeof(*noise_tx->temp) * block_size * block_size); + return noise_tx; +} + +void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) { + noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block); +} + +void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) { + const int block_size = noise_tx->block_size; + const float kBeta = 1.1f; + const float kEps = 1e-6f; + for (int y = 0; y < block_size; ++y) { + for (int x = 0; x < block_size; ++x) { + int i = y * block_size + x; + float *c = noise_tx->tx_block + 2 * i; + const float c0 = AOMMAX((float)fabs(c[0]), 1e-8f); + const float c1 = AOMMAX((float)fabs(c[1]), 1e-8f); + const float p = c0 * c0 + c1 * c1; + if (p > kBeta * psd[i] && p > 1e-6) { + noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps); + noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps); + } else { + noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta; + noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta; + } + } + } +} + +void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) { + const int n = noise_tx->block_size * noise_tx->block_size; + noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data); + for (int i = 0; i < n; ++i) { + data[i] /= n; + } +} + +void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx, + float *psd) { + const int block_size = noise_tx->block_size; + for (int yb = 0; yb < block_size; ++yb) { + for (int xb = 0; xb <= block_size / 2; ++xb) { + float *c = noise_tx->tx_block + 2 * (yb * block_size + xb); + psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1]; + } + } +} + +void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) { + if (!noise_tx) return; + aom_free(noise_tx->tx_block); + aom_free(noise_tx->temp); + aom_free(noise_tx); +} + +double aom_normalized_cross_correlation(const double *a, const double *b, + int n) { + double c = 0; + 
+  double a_len = 0;
+  double b_len = 0;
+  for (int i = 0; i < n; ++i) {
+    a_len += a[i] * a[i];
+    b_len += b[i] * b[i];
+    c += a[i] * b[i];
+  }
+  return c / (sqrt(a_len) * sqrt(b_len));
+}
+
+int aom_noise_data_validate(const double *data, int w, int h) {
+  const double kVarianceThreshold = 2;
+  const double kMeanThreshold = 2;
+
+  int x = 0, y = 0;
+  int ret_value = 1;
+  double var = 0, mean = 0;
+  double *mean_x, *mean_y, *var_x, *var_y;
+
+  // Check that noise variance is not increasing in x or y
+  // and that the data is zero mean.
+  mean_x = (double *)aom_malloc(sizeof(*mean_x) * w);
+  var_x = (double *)aom_malloc(sizeof(*var_x) * w);
+  mean_y = (double *)aom_malloc(sizeof(*mean_y) * h);
+  var_y = (double *)aom_malloc(sizeof(*var_y) * h);
+
+  memset(mean_x, 0, sizeof(*mean_x) * w);
+  memset(var_x, 0, sizeof(*var_x) * w);
+  memset(mean_y, 0, sizeof(*mean_y) * h);
+  memset(var_y, 0, sizeof(*var_y) * h);
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      const double d = data[y * w + x];
+      var_x[x] += d * d;
+      var_y[y] += d * d;
+      mean_x[x] += d;
+      mean_y[y] += d;
+      var += d * d;
+      mean += d;
+    }
+  }
+  mean /= (w * h);
+  var = var / (w * h) - mean * mean;
+
+  for (y = 0; y < h; ++y) {
+    mean_y[y] /= h;
+    var_y[y] = var_y[y] / h - mean_y[y] * mean_y[y];
+    if (fabs(var_y[y] - var) >= kVarianceThreshold) {
+      fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var);
+      ret_value = 0;
+      break;
+    }
+    if (fabs(mean_y[y] - mean) >= kMeanThreshold) {
+      fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean);
+      ret_value = 0;
+      break;
+    }
+  }
+
+  for (x = 0; x < w; ++x) {
+    mean_x[x] /= w;
+    var_x[x] = var_x[x] / w - mean_x[x] * mean_x[x];
+    if (fabs(var_x[x] - var) >= kVarianceThreshold) {
+      fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var);
+      ret_value = 0;
+      break;
+    }
+    if (fabs(mean_x[x] - mean) >= kMeanThreshold) {
+      fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], mean);
+      ret_value = 0;
+      break;
+    }
+  }
+
+  aom_free(mean_x);
+  aom_free(mean_y);
+  aom_free(var_x);
+  aom_free(var_y);
+
+  return ret_value;
+}
diff --git a/libs/libaom/src/aom_dsp/noise_util.h b/libs/libaom/src/aom_dsp/noise_util.h
new file mode 100644
index 000000000..2284a171a
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/noise_util.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_NOISE_UTIL_H_
+#define AOM_AOM_DSP_NOISE_UTIL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// aom_noise_tx_t is an abstraction of a transform that is used for denoising.
+// It is meant to be lightweight and does hold the transformed data (as
+// the user should not be manipulating the transformed data directly).
+struct aom_noise_tx_t;
+
+// Allocates and returns an aom_noise_tx_t useful for denoising the given
+// block_size. The resulting aom_noise_tx_t should be free'd with
+// aom_noise_tx_free.
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size);
+void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx);
+
+// Transforms the internal data and holds it in the aom_noise_tx's internal
+// buffer. For compatibility with existing SIMD implementations, "data" must
+// be 32-byte aligned.
+void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx,
+                          const float *data);
+
+// Filters aom_noise_tx's internal data using the provided noise power spectral
+// density. The PSD must be at least block_size * block_size and should be
+// populated with a constant or via estimates taken from
+// aom_noise_tx_add_energy.
+void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx,
+                         const float *psd);
+
+// Performs an inverse transform using the internal transform data.
+// For compatibility with existing SIMD implementations, "data" must be 32-byte
+// aligned.
+void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data);
+
+// Aggregates the power of the buffered transform data into the psd buffer.
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx,
+                             float *psd);
+
+// Returns a default value suitable for denoising a transform of the given
+// block_size. The noise "factor" determines the strength of the noise to
+// be removed. A value of about 2.5 can be used for moderate denoising,
+// while a value of 5.0 can be used for a high level of denoising.
+float aom_noise_psd_get_default_value(int block_size, float factor);
+
+// Computes normalized cross correlation of two vectors a and b of length n.
+double aom_normalized_cross_correlation(const double *a, const double *b,
+                                        int n);
+
+// Validates the correlated noise in the data buffer of size (w, h).
+int aom_noise_data_validate(const double *data, int w, int h);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // AOM_AOM_DSP_NOISE_UTIL_H_
diff --git a/libs/libaom/src/aom_dsp/prob.h b/libs/libaom/src/aom_dsp/prob.h
new file mode 100644
index 000000000..ea5e4cb34
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/prob.h
@@ -0,0 +1,670 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_PROB_H_
+#define AOM_AOM_DSP_PROB_H_
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/entcode.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint16_t aom_cdf_prob;
+
+#define CDF_SIZE(x) ((x) + 1)
+#define CDF_PROB_BITS 15
+#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
+#define CDF_INIT_TOP 32768
+#define CDF_SHIFT (15 - CDF_PROB_BITS)
+/*The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative
+  probability (an "inverse" CDF).
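+  (For example, with CDF_PROB_BITS = 15, CDF_PROB_TOP is 32768, so a
+  cumulative probability of 12000 is stored as 32768 - 12000 = 20768;
+  applying AOM_ICDF again recovers 12000.)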
+ This function converts from one representation to the other (and is its own + inverse).*/ +#define AOM_ICDF(x) (CDF_PROB_TOP - (x)) + +#if CDF_SHIFT == 0 + +#define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF4(a0, a1, a2) \ + AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF5(a0, a1, a2, a3) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF6(a0, a1, a2, a3, a4) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \ + a14) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14), \ + AOM_ICDF(CDF_PROB_TOP), 0 + +#else +#define AOM_CDF2(a0) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 2) + \ + ((CDF_INIT_TOP - 2) >> 1)) / \ + ((CDF_INIT_TOP - 2)) + \ + 1) \ + , AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF3(a0, a1) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \ + ((CDF_INIT_TOP - 3) >> 1)) / \ + ((CDF_INIT_TOP - 3)) + \ + 1) \ + , 
\ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \ + ((CDF_INIT_TOP - 3) >> 1)) / \ + ((CDF_INIT_TOP - 3)) + \ + 2), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF4(a0, a1, a2) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \ + ((CDF_INIT_TOP - 4) >> 1)) / \ + ((CDF_INIT_TOP - 4)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \ + ((CDF_INIT_TOP - 4) >> 1)) / \ + ((CDF_INIT_TOP - 4)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \ + ((CDF_INIT_TOP - 4) >> 1)) / \ + ((CDF_INIT_TOP - 4)) + \ + 3), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF5(a0, a1, a2, a3) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ + ((CDF_INIT_TOP - 5) >> 1)) / \ + ((CDF_INIT_TOP - 5)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ + ((CDF_INIT_TOP - 5) >> 1)) / \ + ((CDF_INIT_TOP - 5)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ + ((CDF_INIT_TOP - 5) >> 1)) / \ + ((CDF_INIT_TOP - 5)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ + ((CDF_INIT_TOP - 5) >> 1)) / \ + ((CDF_INIT_TOP - 5)) + \ + 4), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF6(a0, a1, a2, a3, a4) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 5), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 6), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / 
\ + ((CDF_INIT_TOP - 8)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 7), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 8), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 9), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + 
((CDF_INIT_TOP - 11)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 10), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 11), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 11), \ + 
AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 12), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 11), \ + AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 12), \ + AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 13), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 11), \ + 
AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 12), \ + AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 13), \ + AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 14), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \ + a14) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 11), \ + AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 12), \ + AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 13), \ + AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 14), \ + AOM_ICDF((((a14)-15) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 15), \ + AOM_ICDF(CDF_PROB_TOP), 0 + +#endif + +static INLINE uint8_t get_prob(unsigned int num, unsigned int den) { + assert(den != 0); + { + const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); + // (p > 255) ? 255 : (p < 1) ? 1 : p; + const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); + return (uint8_t)clipped_prob; + } +} + +static INLINE void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) { + int rate; + int i, tmp; + + static const int nsymbs2speed[17] = { 0, 0, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2 }; + assert(nsymbs < 17); + rate = 3 + (cdf[nsymbs] > 15) + (cdf[nsymbs] > 31) + + nsymbs2speed[nsymbs]; // + get_msb(nsymbs); + tmp = AOM_ICDF(0); + + // Single loop (faster) + for (i = 0; i < nsymbs - 1; ++i) { + tmp = (i == val) ? 
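/* cdf[] holds inverted cumulative probabilities (AOM_ICDF). Each bin is nudged toward the inverted one-hot CDF of the coded symbol: AOM_ICDF(0) for bins below val, 0 from val upward. The step is (target - cdf[i]) >> rate, and rate grows with the symbol count kept in cdf[nsymbs], so adaptation slows as more symbols are observed. */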
0 : tmp; + if (tmp < cdf[i]) { + cdf[i] -= ((cdf[i] - tmp) >> rate); + } else { + cdf[i] += ((tmp - cdf[i]) >> rate); + } + } + cdf[nsymbs] += (cdf[nsymbs] < 32); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_PROB_H_ diff --git a/libs/libaom/src/aom_dsp/psnr.c b/libs/libaom/src/aom_dsp/psnr.c new file mode 100644 index 000000000..c66dd52d0 --- /dev/null +++ b/libs/libaom/src/aom_dsp/psnr.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <math.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/psnr.h" +#include "aom_scale/yv12config.h" + +double aom_sse_to_psnr(double samples, double peak, double sse) { + if (sse > 0.0) { + const double psnr = 10.0 * log10(samples * peak * peak / sse); + return psnr > MAX_PSNR ? MAX_PSNR : psnr; + } else { + return MAX_PSNR; + } +} + +static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, unsigned int *sse, + int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h, uint64_t *sse, int64_t *sum) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t tsum = 0; + uint64_t tsse = 0; + for (int i = 0; i < h; ++i) { + int32_t lsum = 0; + for (int j = 0; j < w; ++j) { + const int diff = a[j] - b[j]; + lsum += diff; + tsse += (uint32_t)(diff * diff); + } + tsum += lsum; + a += a_stride; + b += b_stride; + } + *sum = tsum; + *sse = tsse; +} + +static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, + &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + const int dw = width % 16; + const int dh = height % 16; + int64_t total_sse = 0; + unsigned int sse = 0; + int sum = 0; + int x, y; + + if (dw > 0) { + encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, + height, &sse, &sum); + total_sse += sse; + } + + if (dh > 0) { + encoder_variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh, + &sse, &sum); + total_sse += sse; + } + + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + aom_mse16x16(pa, a_stride, pb, b_stride, &sse); + total_sse += sse; + + pa += 16; + pb += 16; + } + + a += 16 * a_stride; + b += 16 * b_stride; + } + + return total_sse; +} + +#if
CONFIG_AV1_HIGHBITDEPTH +static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height, unsigned int input_shift) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t total_sse = 0; + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + int64_t diff; + diff = (a[x] >> input_shift) - (b[x] >> input_shift); + total_sse += diff * diff; + } + a += a_stride; + b += b_stride; + } + return total_sse; +} + +static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int64_t total_sse = 0; + int x, y; + const int dw = width % 16; + const int dh = height % 16; + unsigned int sse = 0; + int sum = 0; + if (dw > 0) { + encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height, &sse, &sum); + total_sse += sse; + } + if (dh > 0) { + encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh, &sse, &sum); + total_sse += sse; + } + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + aom_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); + total_sse += sse; + pa += 16; + pb += 16; + } + a += 16 * a_stride; + b += 16 * b_stride; + } + return total_sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height) { + return aom_var_2d_u8(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + width, height) / + (width * height); +} + +uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height) { + return aom_var_2d_u8(a->u_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, width, height) / + (width * height); +} + +uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height) { + return aom_var_2d_u8(a->v_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, width, height) / + (width * height); +} + +int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, + width, height); +} + +int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + + return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} + +int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride, + b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride, + width, height); +} + +int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + + return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, 
a->uv_stride, + b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride, + width, height); +} + +int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + + return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +#if CONFIG_AV1_HIGHBITDEPTH +uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height) { + return aom_var_2d_u16(a->y_buffer + vstart * a->y_stride + hstart, + a->y_stride, width, height) / + (width * height); +} + +uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height) { + return aom_var_2d_u16(a->u_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, width, height) / + (width * height); +} + +uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height) { + return aom_var_2d_u16(a->v_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, width, height) / + (width * height); +} + +int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse( + a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height); +} + +int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} + +int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse(a->u_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, + b->u_buffer + vstart * b->uv_stride + hstart, + b->uv_stride, width, height); +} + +int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse(a->v_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, + b->v_buffer + vstart * b->uv_stride + hstart, + b->uv_stride, width, height); +} + +int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int plane, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + switch (plane) { + 
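/* plane index convention: 0 = Y, 1 = U, 2 = V */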
case 0: return aom_highbd_get_y_sse(a, b); + case 1: return aom_highbd_get_u_sse(a, b); + case 2: return aom_highbd_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } + } else { + switch (plane) { + case 0: return aom_get_y_sse(a, b); + case 1: return aom_get_u_sse(a, b); + case 2: return aom_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } + } +#else + (void)highbd; + switch (plane) { + case 0: return aom_get_y_sse(a, b); + case 1: return aom_get_u_sse(a, b); + case 2: return aom_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } +#endif +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + uint32_t bit_depth, uint32_t in_bit_depth) { + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; + const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; + const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; + const double peak = (double)((1 << in_bit_depth) - 1); + const unsigned int input_shift = bit_depth - in_bit_depth; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + uint64_t sse; + if (a->flags & YV12_FLAG_HIGHBITDEPTH) { + if (input_shift) { + sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i], + b_strides[i], w, h, input_shift); + } else { + sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i], + b_strides[i], w, h); + } + } else { + sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, + h); + } + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = + aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); +} +#endif + +void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr) { + static const double peak = 255.0; + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; + const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; + const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + const uint64_t sse = + get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h); + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = + aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); +} diff --git a/libs/libaom/src/aom_dsp/psnr.h b/libs/libaom/src/aom_dsp/psnr.h new file mode 100644 index 000000000..7f40b8b57 --- /dev/null +++ b/libs/libaom/src/aom_dsp/psnr.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_PSNR_H_ +#define AOM_AOM_DSP_PSNR_H_ + +#include "aom_scale/yv12config.h" + +#define MAX_PSNR 100.0 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + double psnr[4]; // total/y/u/v + uint64_t sse[4]; // total/y/u/v + uint32_t samples[4]; // total/y/u/v +} PSNR_STATS; + +/*!\brief Converts SSE to PSNR + * + * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR). + * + * \param[in] samples Number of samples + * \param[in] peak Max sample value + * \param[in] sse Sum of squared errors + */ +double aom_sse_to_psnr(double samples, double peak, double sse); +uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height); +uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height); +uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width, + int vstart, int height); +int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int plane, int highbd); +#if CONFIG_AV1_HIGHBITDEPTH +uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height); +uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height); +uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + unsigned int bit_depth, unsigned int in_bit_depth); +#endif +void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr); + +double aom_psnrhvs(const
YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *phvs_y, + double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd); +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AOM_DSP_PSNR_H_ diff --git a/libs/libaom/src/aom_dsp/psnrhvs.c b/libs/libaom/src/aom_dsp/psnrhvs.c new file mode 100644 index 000000000..69a1d99bf --- /dev/null +++ b/libs/libaom/src/aom_dsp/psnrhvs.c @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * This code was originally written by: Gregory Maxwell, at the Daala + * project. + */ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/psnr.h" +#include "aom_dsp/ssim.h" +#include "aom_ports/system_state.h" + +static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { + int i, j; + (void)xstride; + aom_fdct8x8(x, y, ystride); + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; +} + +static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { + int i, j; + (void)xstride; + aom_highbd_fdct8x8(x, y, ystride); + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; +} + +/* Normalized inverse quantization matrix for 8x8 DCT at the point of + * transparency.
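(Here "transparency" means the quantization error sits right at the threshold of visibility. csf_cb420 and csf_cr420 further below are the corresponding tables for the 4:2:0 chroma planes.)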
This is not the JPEG based matrix from the paper, + this one gives a slightly higher MOS agreement.*/ +static const double csf_y[8][8] = { + { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, + 0.678296995242, 0.466224900598, 0.3265091542 }, + { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, + 0.868920337363, 0.61280991668, 0.436405793551 }, + { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, + 0.670882927016, 0.501731932449, 0.372504254596 }, + { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554, + 0.48309405692, 0.380429446972, 0.295774038565 }, + { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676, + 0.352889268808, 0.283006984131, 0.226951348204 }, + { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, + 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 }, + { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, + 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 }, + { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565, + 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 } +}; +static const double csf_cb420[8][8] = { + { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, + 0.898018824055, 0.74725392039, 0.615105596242 }, + { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, + 1.17428548929, 0.996404342439, 0.830890433625 }, + { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, + 0.960060382087, 0.849823426169, 0.731221236837 }, + { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099, + 0.751437590932, 0.685398513368, 0.608694761374 }, + { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187, + 0.605503172737, 0.55002013668, 0.495804539034 }, + { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, + 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 }, + { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, + 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 }, + { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, + 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 } +}; +static const double csf_cr420[8][8] = { + { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, + 0.867069376285, 0.721500455585, 0.593906509971 }, + { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, + 1.13381474809, 0.962064122248, 0.802254508198 }, + { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, + 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 }, + { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, + 0.725539939514, 0.661776842059, 0.587716619023 }, + { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286, + 0.584635025748, 0.531064164893, 0.478717061273 }, + { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514, + 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 }, + { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, + 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 }, + { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, + 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 } +}; + +static double convert_score_db(double _score, double _weight, int16_t pix_max) { + assert(_score * _weight >= 0.0); + + if (_weight * _score 
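/* a (near-)zero weighted score counts as lossless; returning early here also avoids taking log10 of (almost) zero below */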
< pix_max * pix_max * 1e-10) return MAX_PSNR; + return 10 * (log10(pix_max * pix_max) - log10(_weight * _score)); +} + +static double calc_psnrhvs(const unsigned char *src, int _systride, + const unsigned char *dst, int _dystride, double _par, + int _w, int _h, int _step, const double _csf[8][8], + uint32_t _shift, int buf_is_hbd, int16_t pix_max, + int luma) { + double ret; + const uint8_t *_src8 = src; + const uint8_t *_dst8 = dst; + const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst); + DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]); + DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]); + double mask[8][8]; + int pixels; + int x; + int y; + float sum1; + float sum2; + float delt; + (void)_par; + ret = pixels = 0; + sum1 = sum2 = delt = 0.0f; + for (y = 0; y < _h; y++) { + for (x = 0; x < _w; x++) { + if (!buf_is_hbd) { + sum1 += _src8[y * _systride + x]; + sum2 += _dst8[y * _dystride + x]; + } else { + sum1 += _src16[y * _systride + x] >> _shift; + sum2 += _dst16[y * _dystride + x] >> _shift; + } + } + } + if (luma) delt = (sum1 - sum2) / (_w * _h); + /*In the PSNR-HVS-M paper[1] the authors describe the construction of + their masking table as "we have used the quantization table for the + color component Y of JPEG [6] that has been also obtained on the + basis of CSF. Note that the values in quantization table JPEG have + been normalized and then squared." Their CSF matrix (from PSNR-HVS) + was also constructed from the JPEG matrices. I can not find any obvious + scheme of normalizing to produce their table, but if I multiply their + CSF by 0.3885746225901003 and square the result I get their masking table. + I have no idea where this constant comes from, but deviating from it + too greatly hurts MOS agreement. + + [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli, + Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking + of DCT basis functions", CD-ROM Proceedings of the Third + International Workshop on Video Processing and Quality Metrics for Consumer + Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p. + + Suggested in aomedia issue#2363: + 0.3885746225901003 is a reciprocal of the maximum coefficient (2.573509) + of the old JPEG based matrix from the paper. Since you are not using that, + divide by actual maximum coefficient. 
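That is exactly what the loop below does: mask[x][y] = (_csf[x][y] / _csf[1][0])^2, with _csf[1][0] the largest entry in each of the three tables.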
*/ + for (x = 0; x < 8; x++) + for (y = 0; y < 8; y++) + mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]); + for (y = 0; y < _h - 7; y += _step) { + for (x = 0; x < _w - 7; x += _step) { + int i; + int j; + int n = 0; + double s_gx = 0; + double s_gy = 0; + double g = 0; + double s_gmean = 0; + double s_gvar = 0; + double s_mask = 0; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + if (!buf_is_hbd) { + dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)]; + dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)]; + } else { + dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift; + dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift; + } + dct_d[i * 8 + j] += (int)(delt + 0.5f); + } + } + for (i = 1; i < 7; i++) { + for (j = 1; j < 7; j++) { + s_gx = (dct_s[(i - 1) * 8 + j - 1] * 3 - + dct_s[(i - 1) * 8 + j + 1] * 3 + dct_s[i * 8 + j - 1] * 10 - + dct_s[i * 8 + j + 1] * 10 + dct_s[(i + 1) * 8 + j - 1] * 3 - + dct_s[(i + 1) * 8 + j + 1] * 3) / + (pix_max * 16.f); + s_gy = (dct_s[(i - 1) * 8 + j - 1] * 3 - + dct_s[(i + 1) * 8 + j - 1] * 3 + dct_s[(i - 1) * 8 + j] * 10 - + dct_s[(i + 1) * 8 + j] * 10 + dct_s[(i - 1) * 8 + j + 1] * 3 - + dct_s[(i + 1) * 8 + j + 1] * 3) / + (pix_max * 16.f); + g = sqrt(s_gx * s_gx + s_gy * s_gy); + if (g > 0.1f) n++; + s_gmean += g; + } + } + s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f; + if (!buf_is_hbd) { + od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } else { + hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } + for (i = 0; i < 8; i++) + for (j = (i == 0); j < 8; j++) + s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j]; + s_mask = sqrt(s_mask * s_gvar) / 8.f; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + double err; + err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j])); + if (i != 0 || j != 0) + err = err < s_mask / mask[i][j] ? 
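/* dead zone from contrast masking: coefficient differences below s_mask / mask[i][j] are treated as invisible */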
0 : err - s_mask / mask[i][j]; + ret += (err * _csf[i][j]) * (err * _csf[i][j]); + pixels++; + } + } + } + } + if (pixels <= 0) return 0; + ret /= pixels; + ret += 0.04 * delt * delt; + return ret; +} + +double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst, + double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs, + uint32_t bd, uint32_t in_bd) { + double psnrhvs; + const double par = 1.0; + const int step = 7; + uint32_t bd_shift = 0; + aom_clear_system_state(); + assert(bd == 8 || bd == 10 || bd == 12); + assert(bd >= in_bd); + assert(src->flags == dst->flags); + const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH; + + int16_t pix_max = 255; + if (in_bd == 10) + pix_max = 1023; + else if (in_bd == 12) + pix_max = 4095; + + bd_shift = bd - in_bd; + + *y_psnrhvs = + calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, + par, src->y_crop_width, src->y_crop_height, step, csf_y, + bd_shift, buf_is_hbd, pix_max, 1); + *u_psnrhvs = + calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + par, src->uv_crop_width, src->uv_crop_height, step, + csf_cb420, bd_shift, buf_is_hbd, pix_max, 0); + *v_psnrhvs = + calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + par, src->uv_crop_width, src->uv_crop_height, step, + csf_cr420, bd_shift, buf_is_hbd, pix_max, 0); + psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs)); + return convert_score_db(psnrhvs, 1.0, pix_max); +} diff --git a/libs/libaom/src/aom_dsp/quantize.c b/libs/libaom/src/aom_dsp/quantize.c new file mode 100644 index 000000000..edd4d9648 --- /dev/null +++ b/libs/libaom/src/aom_dsp/quantize.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "av1/encoder/av1_quantize.h" + +void aom_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int i, non_zero_count = (int)n_coeffs, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + int prescan_add[2]; + for (i = 0; i < 2; ++i) + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? 
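/* per-coefficient quantization-matrix weight; (1 << AOM_QM_BITS) is the identity weight when no QM is supplied */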
qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int prescan_add_val = prescan_add[rc != 0]; + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif // SKIP_EOB_FACTOR_ADJUST + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32; + + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + int64_t tmp = + clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp *= wt; + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); // quantization + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + + if (tmp32) { + eob = i; +#if SKIP_EOB_FACTOR_ADJUST + if (first == -1) first = i; +#endif // SKIP_EOB_FACTOR_ADJUST + } + } + } +#if SKIP_EOB_FACTOR_ADJUST + if (eob >= 0 && first == eob) { + const int rc = scan[eob]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + eob = -1; + } + } + } +#endif // SKIP_EOB_FACTOR_ADJUST + *eob_ptr = eob + 1; +} + +void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, + const int log_scale) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int i, non_zero_count = (int)n_coeffs, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS))) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
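+ // (Only the first non_zero_count coefficients are visited; the trailing ones pruned by the pre-scan sit inside the zero-bin and would quantize to zero anyway.)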
+ for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32; + + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + int64_t tmp = + clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp *= wt; + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); // quantization + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + + if (tmp32) eob = i; + } + } + *eob_ptr = eob + 1; +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + (void)iscan; + int i, non_zero_count = (int)n_coeffs, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + int prescan_add[2]; + for (i = 0; i < 2; ++i) + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int prescan_add_val = prescan_add[rc != 0]; + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif // SKIP_EOB_FACTOR_ADJUST + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const qm_val_t iwt = iqm_ptr != NULL ? 
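/* inverse-QM weight, folded into the effective dequantizer just below */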
iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) { + eob = i; +#if SKIP_EOB_FACTOR_ADJUST + if (first == -1) first = eob; +#endif // SKIP_EOB_FACTOR_ADJUST + } + } + } +#if SKIP_EOB_FACTOR_ADJUST + if (eob >= 0 && first == eob) { + const int rc = scan[eob]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + eob = -1; + } + } + } +#endif // SKIP_EOB_FACTOR_ADJUST + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_b_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + int i, eob = -1; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int dequant; + int idx_arr[4096]; + (void)iscan; + int idx = 0; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) || + coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS))) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = idx_arr[i]; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +/* These functions should only be called when quantisation matrices + are not used. 
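They forward to the *_helper_c routines above with qm_ptr and iqm_ptr set to NULL, so every per-coefficient weight becomes the identity (1 << AOM_QM_BITS).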
*/ +void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, NULL, NULL, 0); +} + +void aom_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, NULL, NULL, 1); +} + +void aom_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, NULL, NULL, 2); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); +} + +void aom_highbd_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); +} + +void aom_highbd_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t 
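/* dequantized output */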
*dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); +} + +void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); +} + +void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 0); +} + +void aom_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 1); +} + +void aom_highbd_quantize_b_64x64_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 2); +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/quantize.h b/libs/libaom/src/aom_dsp/quantize.h new file mode 100644 index 000000000..395631814 --- /dev/null +++ b/libs/libaom/src/aom_dsp/quantize.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_QUANTIZE_H_ +#define AOM_AOM_DSP_QUANTIZE_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void aom_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_highbd_quantize_b_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_highbd_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_highbd_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); +#endif // CONFIG_AV1_HIGHBITDEPTH + +void 
aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, + const int log_scale); + +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_quantize_b_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); +#endif // CONFIG_AV1_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_QUANTIZE_H_ diff --git a/libs/libaom/src/aom_dsp/recenter.h b/libs/libaom/src/aom_dsp/recenter.h new file mode 100644 index 000000000..b3fd41290 --- /dev/null +++ b/libs/libaom/src/aom_dsp/recenter.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_RECENTER_H_
+#define AOM_AOM_DSP_RECENTER_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+// Inverse recenters a non-negative literal v around a reference r
+static INLINE uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
+  if (v > (r << 1))
+    return v;
+  else if ((v & 1) == 0)
+    return (v >> 1) + r;
+  else
+    return r - ((v + 1) >> 1);
+}
+
+// Inverse recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static INLINE uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r,
+                                                  uint16_t v) {
+  if ((r << 1) <= n) {
+    return inv_recenter_nonneg(r, v);
+  } else {
+    return n - 1 - inv_recenter_nonneg(n - 1 - r, v);
+  }
+}
+
+// Recenters a non-negative literal v around a reference r
+static INLINE uint16_t recenter_nonneg(uint16_t r, uint16_t v) {
+  if (v > (r << 1))
+    return v;
+  else if (v >= r)
+    return ((v - r) << 1);
+  else
+    return ((r - v) << 1) - 1;
+}
+
+// Recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static INLINE uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r,
+                                              uint16_t v) {
+  if ((r << 1) <= n) {
+    return recenter_nonneg(r, v);
+  } else {
+    return recenter_nonneg(n - 1 - r, n - 1 - v);
+  }
+}
+
+#endif  // AOM_AOM_DSP_RECENTER_H_
diff --git a/libs/libaom/src/aom_dsp/sad.c b/libs/libaom/src/aom_dsp/sad.c
new file mode 100644
index 000000000..8ddc683d6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/sad.c
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+/* Sum the difference between every corresponding element of the buffers.
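+   (SAD, sum of absolute differences.) For a width x height block this is
+   SAD = sum over y,x of |a[y * a_stride + x] - b[y * b_stride + x]|; for
+   example, two 4x4 blocks that differ by 1 at exactly three positions
+   give a SAD of 3.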
*/ +static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + sad += abs(a[x] - b[x]); + } + + a += a_stride; + b += b_stride; + } + return sad; +} + +#define sadMxh(m) \ + unsigned int aom_sad##m##xh_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, int width, \ + int height) { \ + return sad(a, a_stride, b, b_stride, width, height); \ + } + +#define sadMxN(m, n) \ + unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint8_t comp_pred[m * n]; \ + aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ + return sad(src, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_dist_wtd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, \ + ref_stride, jcp_param); \ + return sad(src, src_stride, comp_pred, m, m, n); \ + } + +// Calculate sad against 4 reference locations and store each in sad_array +#define sadMxNx4D(m, n) \ + void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = \ + aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ + } \ + } \ + void aom_sad##m##x##n##x4d_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \ + int ref_stride, const uint8_t *second_pred, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = aom_sad##m##x##n##_avg_c(src, src_stride, ref_array[i], \ + ref_stride, second_pred); \ + } \ + } + +// 128x128 +sadMxN(128, 128); +sadMxNx4D(128, 128); + +// 128x64 +sadMxN(128, 64); +sadMxNx4D(128, 64); + +// 64x128 +sadMxN(64, 128); +sadMxNx4D(64, 128); + +// 64x64 +sadMxN(64, 64); +sadMxNx4D(64, 64); + +// 64x32 +sadMxN(64, 32); +sadMxNx4D(64, 32); + +// 32x64 +sadMxN(32, 64); +sadMxNx4D(32, 64); + +// 32x32 +sadMxN(32, 32); +sadMxNx4D(32, 32); + +// 32x16 +sadMxN(32, 16); +sadMxNx4D(32, 16); + +// 16x32 +sadMxN(16, 32); +sadMxNx4D(16, 32); + +// 16x16 +sadMxN(16, 16); +sadMxNx4D(16, 16); + +// 16x8 +sadMxN(16, 8); +sadMxNx4D(16, 8); + +// 8x16 +sadMxN(8, 16); +sadMxNx4D(8, 16); + +// 8x8 +sadMxN(8, 8); +sadMxNx4D(8, 8); + +// 8x4 +sadMxN(8, 4); +sadMxNx4D(8, 4); + +// 4x8 +sadMxN(4, 8); +sadMxNx4D(4, 8); + +// 4x4 +sadMxN(4, 4); +sadMxNx4D(4, 4); + +sadMxh(128); +sadMxh(64); +sadMxh(32); +sadMxh(16); +sadMxh(8); +sadMxh(4); + +sadMxN(4, 16); +sadMxNx4D(4, 16); +sadMxN(16, 4); +sadMxNx4D(16, 4); +sadMxN(8, 32); +sadMxNx4D(8, 32); +sadMxN(32, 8); +sadMxNx4D(32, 8); +sadMxN(16, 64); +sadMxNx4D(16, 64); +sadMxN(64, 16); +sadMxNx4D(64, 16); + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { 
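+    // Same row-by-row accumulation as sad() above, but over 16-bit samples:
+    // a8/b8 were reinterpreted via CONVERT_TO_SHORTPTR, so a, b and the
+    // strides applied below are in uint16_t units.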
+ for (x = 0; x < width; x++) { + sad += abs(a[x] - b[x]); + } + + a += a_stride; + b += b_stride; + } + return sad; +} + +static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + sad += abs(a[x] - b[x]); + } + + a += a_stride; + b += b_stride; + } + return sad; +} + +#define highbd_sadMxN(m, n) \ + unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int aom_highbd_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint16_t comp_pred[m * n]; \ + uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); \ + aom_highbd_comp_avg_pred(comp_pred8, second_pred, m, n, ref, ref_stride); \ + return highbd_sadb(src, src_stride, comp_pred8, m, m, n); \ + } \ + unsigned int aom_highbd_dist_wtd_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t comp_pred[m * n]; \ + uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); \ + aom_highbd_dist_wtd_comp_avg_pred(comp_pred8, second_pred, m, n, ref, \ + ref_stride, jcp_param); \ + return highbd_sadb(src, src_stride, comp_pred8, m, m, n); \ + } + +#define highbd_sadMxNx4D(m, n) \ + void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } + +// 128x128 +highbd_sadMxN(128, 128); +highbd_sadMxNx4D(128, 128); + +// 128x64 +highbd_sadMxN(128, 64); +highbd_sadMxNx4D(128, 64); + +// 64x128 +highbd_sadMxN(64, 128); +highbd_sadMxNx4D(64, 128); + +// 64x64 +highbd_sadMxN(64, 64); +highbd_sadMxNx4D(64, 64); + +// 64x32 +highbd_sadMxN(64, 32); +highbd_sadMxNx4D(64, 32); + +// 32x64 +highbd_sadMxN(32, 64); +highbd_sadMxNx4D(32, 64); + +// 32x32 +highbd_sadMxN(32, 32); +highbd_sadMxNx4D(32, 32); + +// 32x16 +highbd_sadMxN(32, 16); +highbd_sadMxNx4D(32, 16); + +// 16x32 +highbd_sadMxN(16, 32); +highbd_sadMxNx4D(16, 32); + +// 16x16 +highbd_sadMxN(16, 16); +highbd_sadMxNx4D(16, 16); + +// 16x8 +highbd_sadMxN(16, 8); +highbd_sadMxNx4D(16, 8); + +// 8x16 +highbd_sadMxN(8, 16); +highbd_sadMxNx4D(8, 16); + +// 8x8 +highbd_sadMxN(8, 8); +highbd_sadMxNx4D(8, 8); + +// 8x4 +highbd_sadMxN(8, 4); +highbd_sadMxNx4D(8, 4); + +// 4x8 +highbd_sadMxN(4, 8); +highbd_sadMxNx4D(4, 8); + +// 4x4 +highbd_sadMxN(4, 4); +highbd_sadMxNx4D(4, 4); + +highbd_sadMxN(4, 16); +highbd_sadMxNx4D(4, 16); +highbd_sadMxN(16, 4); +highbd_sadMxNx4D(16, 4); +highbd_sadMxN(8, 32); +highbd_sadMxNx4D(8, 32); +highbd_sadMxN(32, 8); +highbd_sadMxNx4D(32, 8); +highbd_sadMxN(16, 64); +highbd_sadMxNx4D(16, 64); +highbd_sadMxN(64, 16); +highbd_sadMxNx4D(64, 16); +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/sad_av1.c b/libs/libaom/src/aom_dsp/sad_av1.c new file mode 100644 index 000000000..467518163 --- /dev/null +++ b/libs/libaom/src/aom_dsp/sad_av1.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2017, Alliance for Open 
Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+// SAD between src and the 6-bit mask blend of a and b, where
+// AOM_BLEND_A64(m, a, b) = (m * a + (64 - m) * b + 32) >> 6.
+static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride,
+                                      const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride,
+                                      const uint8_t *m, int m_stride, int width,
+                                      int height) {
+  int y, x;
+  unsigned int sad = 0;
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
+      sad += abs(pred - src[x]);
+    }
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+  return sad;
+}
+
+#define MASKSADMxN(m, n)                                                       \
+  unsigned int aom_masked_sad##m##x##n##_c(                                    \
+      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
+      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,          \
+      int invert_mask) {                                                       \
+    if (!invert_mask)                                                          \
+      return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \
+                        msk_stride, m, n);                                     \
+    else                                                                       \
+      return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
+                        msk_stride, m, n);                                     \
+  }                                                                            \
+  void aom_masked_sad##m##x##n##x4d_c(                                         \
+      const uint8_t *src, int src_stride, const uint8_t *ref[],                \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,          \
+      int msk_stride, int invert_mask, unsigned sads[]) {                      \
+    if (!invert_mask)                                                          \
+      for (int i = 0; i < 4; i++) {                                            \
+        sads[i] = masked_sad(src, src_stride, ref[i], ref_stride, second_pred, \
+                             m, msk, msk_stride, m, n);                        \
+      }                                                                        \
+    else                                                                       \
+      for (int i = 0; i < 4; i++) {                                            \
+        sads[i] = masked_sad(src, src_stride, second_pred, m, ref[i],          \
+                             ref_stride, msk, msk_stride, m, n);               \
+      }                                                                        \
+  }
+
+/* clang-format off */
+MASKSADMxN(128, 128)
+MASKSADMxN(128, 64)
+MASKSADMxN(64, 128)
+MASKSADMxN(64, 64)
+MASKSADMxN(64, 32)
+MASKSADMxN(32, 64)
+MASKSADMxN(32, 32)
+MASKSADMxN(32, 16)
+MASKSADMxN(16, 32)
+MASKSADMxN(16, 16)
+MASKSADMxN(16, 8)
+MASKSADMxN(8, 16)
+MASKSADMxN(8, 8)
+MASKSADMxN(8, 4)
+MASKSADMxN(4, 8)
+MASKSADMxN(4, 4)
+MASKSADMxN(4, 16)
+MASKSADMxN(16, 4)
+MASKSADMxN(8, 32)
+MASKSADMxN(32, 8)
+MASKSADMxN(16, 64)
+MASKSADMxN(64, 16)
+/* clang-format on */
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE
+    unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
+                                   const uint8_t *a8, int a_stride,
+                                   const uint8_t *b8, int b_stride,
+                                   const uint8_t *m, int m_stride, int width,
+                                   int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) {
+      const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
+      sad += abs(pred - src[x]);
+    }
+
+    src += src_stride;
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+
+  return sad;
+}
+
+#define HIGHBD_MASKSADMXN(m, n)                                  \
+  unsigned int aom_highbd_masked_sad##m##x##n##_c(               \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8,  \
+      int ref_stride, const uint8_t *second_pred8, const 
uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return highbd_masked_sad(src8, src_stride, ref8, ref_stride, \ + second_pred8, m, msk, msk_stride, m, n); \ + else \ + return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \ + ref_stride, msk, msk_stride, m, n); \ + } + +HIGHBD_MASKSADMXN(128, 128) +HIGHBD_MASKSADMXN(128, 64) +HIGHBD_MASKSADMXN(64, 128) +HIGHBD_MASKSADMXN(64, 64) +HIGHBD_MASKSADMXN(64, 32) +HIGHBD_MASKSADMXN(32, 64) +HIGHBD_MASKSADMXN(32, 32) +HIGHBD_MASKSADMXN(32, 16) +HIGHBD_MASKSADMXN(16, 32) +HIGHBD_MASKSADMXN(16, 16) +HIGHBD_MASKSADMXN(16, 8) +HIGHBD_MASKSADMXN(8, 16) +HIGHBD_MASKSADMXN(8, 8) +HIGHBD_MASKSADMXN(8, 4) +HIGHBD_MASKSADMXN(4, 8) +HIGHBD_MASKSADMXN(4, 4) +HIGHBD_MASKSADMXN(4, 16) +HIGHBD_MASKSADMXN(16, 4) +HIGHBD_MASKSADMXN(8, 32) +HIGHBD_MASKSADMXN(32, 8) +HIGHBD_MASKSADMXN(16, 64) +HIGHBD_MASKSADMXN(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH + +// pre: predictor being evaluated +// wsrc: target weighted prediction (has been *4096 to keep precision) +// mask: 2d weights (scaled by 4096) +static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); + + pre += pre_stride; + wsrc += width; + mask += width; + } + + return sad; +} + +#define OBMCSADMxN(m, n) \ + unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *mask) { \ + return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } + +/* clang-format off */ +OBMCSADMxN(128, 128) +OBMCSADMxN(128, 64) +OBMCSADMxN(64, 128) +OBMCSADMxN(64, 64) +OBMCSADMxN(64, 32) +OBMCSADMxN(32, 64) +OBMCSADMxN(32, 32) +OBMCSADMxN(32, 16) +OBMCSADMxN(16, 32) +OBMCSADMxN(16, 16) +OBMCSADMxN(16, 8) +OBMCSADMxN(8, 16) +OBMCSADMxN(8, 8) +OBMCSADMxN(8, 4) +OBMCSADMxN(4, 8) +OBMCSADMxN(4, 4) +OBMCSADMxN(4, 16) +OBMCSADMxN(16, 4) +OBMCSADMxN(8, 32) +OBMCSADMxN(32, 8) +OBMCSADMxN(16, 64) +OBMCSADMxN(64, 16) +/* clang-format on */ + +#if CONFIG_AV1_HIGHBITDEPTH + static INLINE + unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); + + pre += pre_stride; + wsrc += width; + mask += width; + } + + return sad; +} + +#define HIGHBD_OBMCSADMXN(m, n) \ + unsigned int aom_highbd_obmc_sad##m##x##n##_c( \ + const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } + +/* clang-format off */ +HIGHBD_OBMCSADMXN(128, 128) +HIGHBD_OBMCSADMXN(128, 64) +HIGHBD_OBMCSADMXN(64, 128) +HIGHBD_OBMCSADMXN(64, 64) +HIGHBD_OBMCSADMXN(64, 32) +HIGHBD_OBMCSADMXN(32, 64) +HIGHBD_OBMCSADMXN(32, 32) +HIGHBD_OBMCSADMXN(32, 16) +HIGHBD_OBMCSADMXN(16, 32) +HIGHBD_OBMCSADMXN(16, 16) +HIGHBD_OBMCSADMXN(16, 8) +HIGHBD_OBMCSADMXN(8, 16) +HIGHBD_OBMCSADMXN(8, 8) +HIGHBD_OBMCSADMXN(8, 4) +HIGHBD_OBMCSADMXN(4, 8) +HIGHBD_OBMCSADMXN(4, 4) +HIGHBD_OBMCSADMXN(4, 16) +HIGHBD_OBMCSADMXN(16, 4) +HIGHBD_OBMCSADMXN(8, 32) +HIGHBD_OBMCSADMXN(32, 8) +HIGHBD_OBMCSADMXN(16, 64) +HIGHBD_OBMCSADMXN(64, 16) +/* clang-format on */ +#endif // 
CONFIG_AV1_HIGHBITDEPTH
diff --git a/libs/libaom/src/aom_dsp/simd/v128_intrinsics.h b/libs/libaom/src/aom_dsp/simd/v128_intrinsics.h
new file mode 100644
index 000000000..218a7a618
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/simd/v128_intrinsics.h
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v128 v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); }
+SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); }
+SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); }
+SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) {
+  return c_v128_from_64(hi, lo);
+}
+SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) {
+  return c_v128_from_v64(hi, lo);
+}
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  return c_v128_from_32(a, b, c, d);
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+  return c_v128_load_unaligned(p);
+}
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+  return c_v128_load_aligned(p);
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
+  c_v128_store_unaligned(p, a);
+}
+SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
+  c_v128_store_aligned(p, a);
+}
+
+SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+  return c_v128_align(a, b, c);
+}
+
+SIMD_INLINE v128 v128_zero(void) { return c_v128_zero(); }
+SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
+SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
+SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
+SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
+
+SIMD_INLINE c_sad128_internal v128_sad_u8_init(void) {
+  return c_v128_sad_u8_init();
+}
+SIMD_INLINE c_sad128_internal v128_sad_u8(c_sad128_internal s, v128 a,
+                                          v128 b) {
+  return c_v128_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v128_sad_u8_sum(c_sad128_internal s) {
+  return c_v128_sad_u8_sum(s);
+}
+SIMD_INLINE c_ssd128_internal v128_ssd_u8_init(void) {
+  return c_v128_ssd_u8_init();
+}
+SIMD_INLINE c_ssd128_internal v128_ssd_u8(c_ssd128_internal s, v128 a,
+                                          v128 b) {
+  return c_v128_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v128_ssd_u8_sum(c_ssd128_internal s) {
+  return c_v128_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+  return c_v128_dotp_su8(a, b);
+}
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+  return c_v128_dotp_s16(a, b);
+}
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+  return c_v128_dotp_s32(a, b);
+}
+SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
+
+SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
+SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); }
+SIMD_INLINE v128 v128_and(v128 a, v128 b) { return 
c_v128_and(a, b); } +SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); } + +SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); } +SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); } +SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); } +SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); } +SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); } +SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); } +SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); } +SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); } +SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); } +SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); } +SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); } +SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); } +SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); } +SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); } +SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); } +SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); } +SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); } +SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); } +SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); } + +SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); } +SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { + return c_v128_mullo_s16(a, b); +} +SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { + return c_v128_mulhi_s16(a, b); +} +SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { + return c_v128_mullo_s32(a, b); +} +SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); } +SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); } + +SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); } +SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { + return c_v128_blend_8(a, b, c); +} + +SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); } +SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); } +SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { + return c_v128_rdavg_u16(a, b); +} +SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); } +SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); } +SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); } +SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); } +SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); } +SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); } +SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); } +SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); } +SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); } + +SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); } +SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); } +SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); } +SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); } +SIMD_INLINE v128 
v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); } +SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); } +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); } +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); } +SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); } +SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); } +SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); } +SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { + return c_v128_unziplo_8(a, b); +} +SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { + return c_v128_unziphi_8(a, b); +} +SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { + return c_v128_unziplo_16(a, b); +} +SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { + return c_v128_unziphi_16(a, b); +} +SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { + return c_v128_unziplo_32(a, b); +} +SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { + return c_v128_unziphi_32(a, b); +} +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); } +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return c_v128_unpacklo_u8_s16(a); +} +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return c_v128_unpackhi_u8_s16(a); +} +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); } +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return c_v128_unpacklo_s8_s16(a); +} +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return c_v128_unpackhi_s8_s16(a); +} +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return c_v128_pack_s32_s16(a, b); +} +SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { + return c_v128_pack_s32_u16(a, b); +} +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return c_v128_pack_s16_u8(a, b); +} +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return c_v128_pack_s16_s8(a, b); +} +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); } +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); } +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return c_v128_unpacklo_u16_s32(a); +} +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return c_v128_unpacklo_s16_s32(a); +} +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return c_v128_unpackhi_u16_s32(a); +} +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return c_v128_unpackhi_s16_s32(a); +} +SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) { + return c_v128_shuffle_8(a, pattern); +} + +SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); } +SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); } +SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); } +SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { + return c_v128_cmpgt_s16(a, b); +} +SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { + return c_v128_cmplt_s16(a, b); +} +SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { + return c_v128_cmpgt_s32(a, b); +} +SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { + return c_v128_cmplt_s32(a, b); +} +SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); } + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return c_v128_shl_8(a, c); +} +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return c_v128_shr_u8(a, c); +} +SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned 
int c) { + return c_v128_shr_s8(a, c); +} +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return c_v128_shl_16(a, c); +} +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return c_v128_shr_u16(a, c); +} +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return c_v128_shr_s16(a, c); +} +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return c_v128_shl_32(a, c); +} +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return c_v128_shr_u32(a, c); +} +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return c_v128_shr_s32(a, c); +} +SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { + return c_v128_shl_64(a, c); +} +SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { + return c_v128_shr_u64(a, c); +} +SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { + return c_v128_shr_s64(a, c); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + return c_v128_shr_n_byte(a, n); +} +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + return c_v128_shl_n_byte(a, n); +} +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) { + return c_v128_shl_n_8(a, n); +} +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) { + return c_v128_shl_n_16(a, n); +} +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) { + return c_v128_shl_n_32(a, n); +} +SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) { + return c_v128_shl_n_64(a, n); +} +SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) { + return c_v128_shr_n_u8(a, n); +} +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) { + return c_v128_shr_n_u16(a, n); +} +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) { + return c_v128_shr_n_u32(a, n); +} +SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) { + return c_v128_shr_n_u64(a, n); +} +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) { + return c_v128_shr_n_s8(a, n); +} +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) { + return c_v128_shr_n_s16(a, n); +} +SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) { + return c_v128_shr_n_s32(a, n); +} +SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) { + return c_v128_shr_n_s64(a, n); +} + +typedef uint32_t sad128_internal_u16; +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { + return c_v128_sad_u16_init(); +} +SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, + v128 b) { + return c_v128_sad_u16(s, a, b); +} +SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { + return c_v128_sad_u16_sum(s); +} + +typedef uint64_t ssd128_internal_s16; +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { + return c_v128_ssd_s16_init(); +} +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, + v128 b) { + return c_v128_ssd_s16(s, a, b); +} +SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { + return c_v128_ssd_s16_sum(s); +} + +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v128_intrinsics_arm.h b/libs/libaom/src/aom_dsp/simd/v128_intrinsics_arm.h new file mode 100644 index 000000000..2d497f4c0 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v128_intrinsics_arm.h @@ -0,0 +1,973 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/simd/v64_intrinsics_arm.h"
+
+typedef int64x2_t v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) {
+  return v64_low_u32(vget_low_s64(a));
+}
+
+SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); }
+
+SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }
+
+SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
+
+SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
+  return vcombine_s64((int64x1_t)b, (int64x1_t)a);
+}
+
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b));
+}
+
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+  return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p));
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+  return v128_load_aligned(p);
+}
+
+SIMD_INLINE void v128_store_aligned(void *p, v128 r) {
+  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
+  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
+}
+
+SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+// The following functions require an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+  return c ? vreinterpretq_s64_s8(
+                 vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
+           : b;
+#else
+  return c < 8 ? 
v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c), + v64_align(v128_high_v64(b), v128_low_v64(b), c)) + : v128_from_v64( + v64_align(v128_high_v64(a), v128_low_v64(a), c - 8), + v64_align(v128_low_v64(a), v128_high_v64(b), c - 8)); +#endif +} + +SIMD_INLINE v128 v128_zero(void) { return vreinterpretq_s64_u8(vdupq_n_u8(0)); } + +SIMD_INLINE v128 v128_ones(void) { + return vreinterpretq_s64_u8(vdupq_n_u8(-1)); +} + +SIMD_INLINE v128 v128_dup_8(uint8_t x) { + return vreinterpretq_s64_u8(vdupq_n_u8(x)); +} + +SIMD_INLINE v128 v128_dup_16(uint16_t x) { + return vreinterpretq_s64_u16(vdupq_n_u16(x)); +} + +SIMD_INLINE v128 v128_dup_32(uint32_t x) { + return vreinterpretq_s64_u32(vdupq_n_u32(x)); +} + +SIMD_INLINE v128 v128_dup_64(uint64_t x) { + return vreinterpretq_s64_u64(vdupq_n_u64(x)); +} + +SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { + int16x8_t t1 = vmulq_s16( + vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))), + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b))))); + int16x8_t t2 = vmulq_s16( + vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))), + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b))))); +#if defined(__aarch64__) + return vaddlvq_s16(t1) + vaddlvq_s16(t2); +#else + int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2))); + return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t); +#endif +} + +SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { + return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) + + v64_dotp_s16(vget_low_s64(a), vget_low_s64(b)); +} + +SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { + int64x2_t t = vpaddlq_s32( + vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); + return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t); +} + +SIMD_INLINE uint64_t v128_hadd_u8(v128 x) { +#if defined(__aarch64__) + return vaddlvq_u8(vreinterpretq_u8_s64(x)); +#else + uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x)))); + return vget_lane_s32( + vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); +#endif +} + +SIMD_INLINE v128 v128_padd_s16(v128 a) { + return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a))); +} + +SIMD_INLINE v128 v128_padd_u8(v128 a) { + return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a))); +} + +typedef struct { + sad64_internal hi, lo; +} sad128_internal; + +SIMD_INLINE sad128_internal v128_sad_u8_init(void) { + sad128_internal s; + s.hi = s.lo = vdupq_n_u16(0); + return s; +} + +/* Implementation dependent return value. Result must be finalised with + v128_sad_u8_sum(). + The result for more than 32 v128_sad_u8() calls is undefined. */ +SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { + sad128_internal r; + r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); + r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); + return r; +} + +SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { +#if defined(__aarch64__) + return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo); +#else + uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo))); + return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t)); +#endif +} + +typedef struct { + ssd64_internal hi, lo; +} ssd128_internal; + +SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { + ssd128_internal s; + s.hi = s.lo = v64_ssd_u8_init(); + return s; +} + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_u8_sum(). 
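+ * Typical accumulate-then-finalise usage is (a sketch; p and q are
+ * illustrative names for two 16-byte pixel buffers):
+ *   ssd128_internal s = v128_ssd_u8_init();
+ *   s = v128_ssd_u8(s, v128_load_unaligned(p), v128_load_unaligned(q));
+ *   uint32_t ssd = v128_ssd_u8_sum(s);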
*/ +SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { + ssd128_internal r; + r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); + r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); + return r; +} + +SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { + return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo)); +} + +SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); } + +SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); } + +SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); } + +SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); } + +SIMD_INLINE v128 v128_add_8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_add_16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_add_32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y))); +} + +SIMD_INLINE v128 v128_add_64(v128 x, v128 y) { + return vreinterpretq_s64_u64( + vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y))); +} + +SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) { + return vreinterpretq_s64_s32( + vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); } + +SIMD_INLINE v128 v128_abs_s16(v128 x) { + return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x))); +} + +SIMD_INLINE v128 v128_abs_s8(v128 x) { + return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x))); +} + +SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { + return vreinterpretq_s64_s32( + vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b))); +} + +SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { + return vreinterpretq_s64_s16( + vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))); +} + +SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { +#if defined(__aarch64__) + return vreinterpretq_s64_s16(vuzp2q_s16( + 
vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)), + vreinterpret_s16_s64(vget_low_s64(b)))), + vreinterpretq_s16_s32( + vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))))); +#else + return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)), + v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b))); +#endif +} + +SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { + return vreinterpretq_s64_s32( + vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); +} + +SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { +#if defined(__aarch64__) + int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)), + vreinterpret_s16_s64(vget_low_s64(b))); + int32x4_t t2 = + vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)); + return vreinterpretq_s64_s32(vpaddq_s32(t1, t2)); +#else + return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)), + v64_madd_s16(vget_low_s64(a), vget_low_s64(b))); +#endif +} + +SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { +#if defined(__aarch64__) + int16x8_t t1 = vmulq_s16( + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))), + vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b)))); + int16x8_t t2 = vmulq_s16( + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))), + vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b)))); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2))); +#else + return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)), + v64_madd_us8(vget_low_s64(a), vget_low_s64(b))); +#endif +} + +SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); +} + +SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); +} + +SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE uint32_t v128_movemask_8(v128 a) { + a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0))); +#if defined(__aarch64__) + uint8x16_t m = + vandq_u8(vreinterpretq_u8_s64(a), + vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))); + return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8); +#else + uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8( + vandq_u8(vreinterpretq_u8_s64(a), + vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)))))); + return v64_low_u32( + v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m))); +#endif +} + +SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { + c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0))); + return v128_or(v128_and(b, c), v128_andn(a, c)); +} + +SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vmaxq_s8(vreinterpretq_s8_s64(x), 
vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) { + return vreinterpretq_s64_s32( + vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) { + return vreinterpretq_s64_s32( + vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); +#else + uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[0]); +#endif +} + +SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); +#else + uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[1]); +#endif +} + +SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) { + uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1])); +} + +SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u16( + vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); +#else + int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); + return vreinterpretq_s64_s16(r.val[0]); +#endif +} + +SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u16( + vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); +#else + int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); + return vreinterpretq_s64_s16(r.val[1]); +#endif +} + +SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) { + uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); + return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1])); +} + +SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u32( + vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); +#else + int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); + return vreinterpretq_s64_s32(r.val[0]); +#endif +} + +SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u32( + vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); +#else + int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); + return vreinterpretq_s64_s32(r.val[1]); +#endif +} + +SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) { + uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)); + return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1])); +} + +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { + return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b)); +} + +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { + return v128_from_v64(vget_high_s64((int64x2_t)a), + vget_high_s64((int64x2_t)b)); +} + +SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vuzp1q_u8(vreinterpretq_u8_s64(y), 
vreinterpretq_u8_s64(x))); +#else + uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[0]); +#endif +} + +SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); +#else + uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[1]); +#endif +} + +SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u16( + vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); +#else + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); + return vreinterpretq_s64_u16(r.val[0]); +#endif +} + +SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u16( + vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); +#else + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); + return vreinterpretq_s64_u16(r.val[1]); +#endif +} + +SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u32( + vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); +#else + uint32x4x2_t r = + vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); + return vreinterpretq_s64_u32(r.val[0]); +#endif +} + +SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u32( + vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); +#else + uint32x4x2_t r = + vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); + return vreinterpretq_s64_u32(r.val[1]); +#endif +} + +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { + return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a))); +} + +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { + return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a))); +} + +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))), + vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b)))); +} + +SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))), + vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b)))); +} + +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))), + vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b)))); +} + +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))), + vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b)))); +} + +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { + return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a))); +} + +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { + return 
vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a))); +} + +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return vreinterpretq_s64_u32( + vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return vreinterpretq_s64_s32( + vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return vreinterpretq_s64_u32( + vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return vreinterpretq_s64_s32( + vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern))); +#else + uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)), + vget_high_u8(vreinterpretq_u8_s64(x)) } }; + return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8( + p, vreinterpret_u8_s64(vget_high_s64(pattern)))), + (uint64_t)vreinterpret_s64_u8(vtbl2_u8( + p, vreinterpret_u8_s64(vget_low_s64(pattern))))); +#endif +} + +SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return (c > 7) ? v128_zero() + : vreinterpretq_s64_u8( + vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(c))); +} + +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return (c > 7) ? v128_zero() + : vreinterpretq_s64_u8( + vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(-c))); +} + +SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { + return (c > 7) ? v128_ones() + : vreinterpretq_s64_s8( + vshlq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(-c))); +} + +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return (c > 15) ? v128_zero() + : vreinterpretq_s64_u16( + vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c))); +} + +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return (c > 15) ? v128_zero() + : vreinterpretq_s64_u16( + vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c))); +} + +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return (c > 15) ? 
v128_ones() + : vreinterpretq_s64_s16( + vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c))); +} + +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return (c > 31) ? v128_zero() + : vreinterpretq_s64_u32( + vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c))); +} + +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return (c > 31) ? v128_zero() + : vreinterpretq_s64_u32( + vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c))); +} + +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return (c > 31) ? v128_ones() + : vreinterpretq_s64_s32( + vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c))); +} + +SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { + return (c > 63) ? v128_zero() + : vreinterpretq_s64_u64( + vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(c))); +} + +SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { + return (c > 63) ? v128_zero() + : vreinterpretq_s64_u64( + vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-c))); +} + +SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { + return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-c)); +} + +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) + +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + return n < 8 + ? v128_from_64( + (uint64_t)vorr_u64( + vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), + n * 8), + vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), + (8 - n) * 8)), + (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), + n * 8)) + : (n == 8 ? v128_from_64( + (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0) + : v128_from_64((uint64_t)vshl_n_u64( + vreinterpret_u64_s64(vget_low_s64(a)), + (n - 8) * 8), + 0)); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + return n == 0 + ? a + : (n < 8 + ? v128_from_64( + (uint64_t)vshr_n_u64( + vreinterpret_u64_s64(vget_high_s64(a)), n * 8), + (uint64_t)vorr_u64( + vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), + n * 8), + vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), + (8 - n) * 8))) + : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64( + vget_high_s64(a))) + : v128_from_64(0, (uint64_t)vshr_n_u64( + vreinterpret_u64_s64( + vget_high_s64(a)), + (n - 8) * 8)))); +} + +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)) : a; +} + +SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)) : a; +} + +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)) : a; +} + +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { + return c ? 
vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) { + return c ? vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c)) + : a; +} + +SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) { + return c ? vshrq_n_s64(a, c) : a; +} + +#else + +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + if (n < 8) + return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n), + v64_shr_n_byte(v128_low_v64(a), 8 - n)), + v64_shl_n_byte(v128_low_v64(a), n)); + else + return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero()); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + if (n < 8) + return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n), + v64_or(v64_shr_n_byte(v128_low_v64(a), n), + v64_shl_n_byte(v128_high_v64(a), 8 - n))); + else + return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8)); +} + +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { + return v128_shl_8(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { + return v128_shr_u8(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { + return v128_shr_s8(a, c); +} + +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { + return v128_shl_16(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { + return v128_shr_u16(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { + return v128_shr_s16(a, c); +} + +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { + return v128_shl_32(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { + return v128_shr_u32(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { + return v128_shr_s32(a, c); +} + +SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) { + return v128_shl_64(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) { + return v128_shr_u64(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) { + return v128_shr_s64(a, c); +} + +#endif + +typedef uint32x4_t sad128_internal_u16; + +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { + return vdupq_n_u32(0); +} + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u16_sum(). */ +SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, + v128 b) { + return vaddq_u32( + s, vpaddlq_u16(vsubq_u16( + vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)), + vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b))))); +} + +SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { + uint64x2_t t = vpaddlq_u32(s); + return (uint32_t)(uint64_t)vget_high_u64(t) + + (uint32_t)(uint64_t)vget_low_u64(t); +} + +typedef v128 ssd128_internal_s16; +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_s16_sum(). 
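+ *
+ * A sketch of the intended init/accumulate/sum usage (the buffers a and b
+ * and the count n are assumptions, not part of this API):
+ *
+ *   ssd128_internal_s16 acc = v128_ssd_s16_init();
+ *   for (unsigned int i = 0; i < n; i++)  // a, b: int16_t[8 * n]
+ *     acc = v128_ssd_s16(acc, v128_load_unaligned(a + 8 * i),
+ *                        v128_load_unaligned(b + 8 * i));
+ *   uint64_t ssd = v128_ssd_s16_sum(acc);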
*/
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+ v128 b) {
+ v128 d = v128_sub_16(a, b);
+ d = v128_madd_s16(d, d);
+ return v128_add_64(
+ s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d))));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+ return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
diff --git a/libs/libaom/src/aom_dsp/simd/v128_intrinsics_c.h b/libs/libaom/src/aom_dsp/simd/v128_intrinsics_c.h
new file mode 100644
index 000000000..466a41e10
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/simd/v128_intrinsics_c.h
@@ -0,0 +1,903 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
+
+typedef union {
+ uint8_t u8[16];
+ uint16_t u16[8];
+ uint32_t u32[4];
+ uint64_t u64[2];
+ int8_t s8[16];
+ int16_t s16[8];
+ int32_t s32[4];
+ int64_t s64[2];
+ c_v64 v64[2];
+} c_v128;
+
+SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }
+
+SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }
+
+SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
+ c_v128 t;
+ t.u64[1] = hi;
+ t.u64[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
+ c_v128 t;
+ t.v64[1] = hi;
+ t.v64[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
+ uint32_t d) {
+ c_v128 t;
+ t.u32[3] = a;
+ t.u32[2] = b;
+ t.u32[1] = c;
+ t.u32[0] = d;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
+ c_v128 t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 16; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 15) {
+ fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
+ abort();
+ }
+ return c_v128_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 16; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
+ if (SIMD_CHECK && (uintptr_t)p & 15) {
+ fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
+ abort();
+ }
+ c_v128_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v128 c_v128_zero(void) {
+ c_v128 t;
+ t.u64[1] = t.u64[0] = 0;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_8(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_16(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_32(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
+ c_v128 t;
+
t.u64[1] = t.u64[0] = x; + return t; +} + +SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) { + return c_v64_dotp_su8(a.v64[1], b.v64[1]) + + c_v64_dotp_su8(a.v64[0], b.v64[0]); +} + +SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) { + return c_v64_dotp_s16(a.v64[1], b.v64[1]) + + c_v64_dotp_s16(a.v64[0], b.v64[0]); +} + +SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) { + // 32 bit products, 64 bit sum + return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) + + (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) + + (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) + + (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]); +} + +SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) { + return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]); +} + +typedef struct { + uint32_t val; + int count; +} c_sad128_internal; + +SIMD_INLINE c_sad128_internal c_v128_sad_u8_init(void) { + c_sad128_internal t; + t.val = t.count = 0; + return t; +} + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u8_sum(). The result for more than 32 v128_sad_u8() calls is + * undefined. */ +SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a, + c_v128 b) { + int c; + for (c = 0; c < 16; c++) + s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + s.count++; + if (SIMD_CHECK && s.count > 32) { + fprintf(stderr, + "Error: sad called 32 times returning an undefined result\n"); + abort(); + } + return s; +} + +SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s.val; } + +typedef uint32_t c_ssd128_internal; + +SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init(void) { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_u8_sum(). */ +SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a, + c_v128 b) { + int c; + for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; } + +SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]), + c_v64_or(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]), + c_v64_xor(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]), + c_v64_and(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]), + c_v64_andn(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]), + c_v64_add_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]), + c_v64_add_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]), + c_v64_sadd_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]), + c_v64_sadd_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]), + c_v64_sadd_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]), + 
c_v64_add_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
+ // Two's complement overflow (silences sanitizers)
+ return c_v128_from_64(
+ a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
+ : a.v64[1].u64 + b.v64[1].u64,
+ a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
+ : a.v64[0].u64 + b.v64[0].u64);
+}
+
+SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
+ c_v128 t;
+ t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
+ t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
+ t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
+ t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
+ c_v128 t;
+ t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
+ t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
+ t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
+ t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
+ t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
+ t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
+ t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
+ t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
+ c_v64_sub_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
+ c_v64_ssub_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
+ c_v64_ssub_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
+ c_v64_sub_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
+ c_v64_ssub_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
+ c_v64_ssub_u16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
+ c_v64_sub_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
+ // Two's complement underflow (silences sanitizers)
+ return c_v128_from_64(
+ a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
+ : a.v64[1].u64 - b.v64[1].u64,
+ a.v64[0].u64 < b.v64[0].u64 ?
a.v64[0].u64 + ~b.v64[0].u64 + 1 + : a.v64[0].u64 - b.v64[0].u64); +} + +SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) { + return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) { + return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) { + c_v64 lo_bits = c_v64_mullo_s16(a, b); + c_v64 hi_bits = c_v64_mulhi_s16(a, b); + return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits), + c_v64_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]), + c_v64_mullo_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]), + c_v64_mulhi_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]), + c_v64_mullo_s32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]), + c_v64_madd_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]), + c_v64_madd_us8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]), + c_v64_avg_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]), + c_v64_rdavg_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]), + c_v64_rdavg_u16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]), + c_v64_avg_u16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]), + c_v64_min_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]), + c_v64_max_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]), + c_v64_min_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) { + return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | + ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | + ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | + ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | + ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | + ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | + ((a.s8[0] < 0) << 0); +} + +SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) { + c_v128 t; + for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? 
b.u8[i] : a.u8[i]; + return t; +} + +SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]), + c_v64_max_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]), + c_v64_min_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]), + c_v64_max_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c]; + return t; +} + +SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c]; + return t; +} + +SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]), + c_v64_ziplo_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]), + c_v64_ziplo_8(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]), + c_v64_ziplo_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]), + c_v64_ziplo_16(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]), + c_v64_ziplo_32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]), + c_v64_ziplo_32(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) { + return c_v128_from_v64(a.v64[0], b.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) { + return c_v128_from_v64(a.v64[1], b.v64[1]); +} + +SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b)); +} + +SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b)); +} + +SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b)); +} + +SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u8[15] = b.u8[15]; + t.u8[14] = b.u8[13]; + t.u8[13] = b.u8[11]; + t.u8[12] = b.u8[9]; + t.u8[11] = b.u8[7]; + t.u8[10] = b.u8[5]; + t.u8[9] = b.u8[3]; + t.u8[8] = b.u8[1]; + t.u8[7] = a.u8[15]; + t.u8[6] = a.u8[13]; + t.u8[5] = a.u8[11]; + t.u8[4] = a.u8[9]; + t.u8[3] = a.u8[7]; + t.u8[2] = a.u8[5]; + t.u8[1] = a.u8[3]; + t.u8[0] = a.u8[1]; + } else { + t.u8[15] = a.u8[14]; + t.u8[14] = a.u8[12]; + t.u8[13] = a.u8[10]; + t.u8[12] = a.u8[8]; + t.u8[11] = a.u8[6]; + t.u8[10] = a.u8[4]; + t.u8[9] = a.u8[2]; + t.u8[8] = a.u8[0]; + t.u8[7] = b.u8[14]; + t.u8[6] = b.u8[12]; + t.u8[5] = b.u8[10]; + t.u8[4] = b.u8[8]; + t.u8[3] = b.u8[6]; + t.u8[2] = b.u8[4]; + t.u8[1] = b.u8[2]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1) + : _c_v128_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v128_unzip_8(b, a, 0) + : _c_v128_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u16[7] = b.u16[7]; + t.u16[6] = b.u16[5]; + t.u16[5] = b.u16[3]; + t.u16[4] = b.u16[1]; + t.u16[3] = a.u16[7]; + t.u16[2] = a.u16[5]; + t.u16[1] = a.u16[3]; + t.u16[0] = a.u16[1]; + } else { + t.u16[7] = a.u16[6]; + t.u16[6] = a.u16[4]; + t.u16[5] = a.u16[2]; + t.u16[4] = a.u16[0]; + t.u16[3] = b.u16[6]; + t.u16[2] = b.u16[4]; + t.u16[1] = b.u16[2]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1) + : _c_v128_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0) + : _c_v128_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u32[3] = b.u32[3]; + t.u32[2] = b.u32[1]; + t.u32[1] = a.u32[3]; + t.u32[0] = a.u32[1]; + } else { + t.u32[3] = a.u32[2]; + t.u32[2] = a.u32[0]; + t.u32[1] = b.u32[2]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1) + : _c_v128_unzip_32(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0) + : _c_v128_unzip_32(b, a, 1); +} + +SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]), + c_v64_unpacklo_u8_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]), + c_v64_unpacklo_u8_s16(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]), + c_v64_unpacklo_s8_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]), + c_v64_unpacklo_s8_s16(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]), + c_v64_pack_s32_s16(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]), + c_v64_pack_s32_u16(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]), + c_v64_pack_s16_u8(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]), + c_v64_pack_s16_s8(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a)); +} + +SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]), + c_v64_unpacklo_u16_s32(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) { + return 
c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]), + c_v64_unpacklo_s16_s32(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]), + c_v64_unpacklo_u16_s32(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]), + c_v64_unpacklo_s16_s32(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) { + c_v128 t; + int c; + for (c = 0; c < 16; c++) + t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15) + : pattern.u8[c] & 15]; + + return t; +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]), + c_v64_cmpgt_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]), + c_v64_cmplt_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]), + c_v64_cmpeq_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]), + c_v64_cmpgt_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]), + c_v64_cmplt_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]), + c_v64_cmpeq_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]); + return t; +} + +SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]); + return t; +} + +SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]); + return t; +} + +SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) { + if (n == 0) return a; + if (n < 8) + return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n), + c_v64_shr_n_byte(a.v64[0], 8 - n)), + c_v64_shl_n_byte(a.v64[0], n)); + else + return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero()); +} + +SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) { + if (n == 0) return a; + if (n < 8) + return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n), + c_v64_or(c_v64_shr_n_byte(a.v64[0], n), + c_v64_shl_n_byte(a.v64[1], 8 - n))); + else + return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8)); +} + +SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) { + if (SIMD_CHECK && c > 15) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? 
c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c)) + : b; +} + +SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c), + c_v64_shr_u16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c), + c_v64_shr_s16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c), + c_v64_shr_u32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) { + return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c), + c_v64_shr_s32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) { + a.v64[1].u64 <<= c; + a.v64[0].u64 <<= c; + return c_v128_from_v64(a.v64[1], a.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) { + a.v64[1].u64 >>= c; + a.v64[0].u64 >>= c; + return c_v128_from_v64(a.v64[1], a.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) { + a.v64[1].s64 >>= c; + a.v64[0].s64 >>= c; + return c_v128_from_v64(a.v64[1], a.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) { + return c_v128_shl_8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) { + return c_v128_shl_16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) { + return c_v128_shl_32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) { + return c_v128_shl_64(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) { + return c_v128_shr_u8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) { + return c_v128_shr_u16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) { + return c_v128_shr_u32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) { + return c_v128_shr_u64(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) { + return c_v128_shr_s8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) { + return c_v128_shr_s16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) { + return c_v128_shr_s32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) { + return c_v128_shr_s64(a, n); +} + +typedef uint32_t c_sad128_internal_u16; + +SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init(void) { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u16_sum(). 
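+ *
+ * Each call accumulates the absolute differences of eight uint16 lanes.
+ * An illustrative row-wise sketch (r0, r1, rows and stride are
+ * assumptions, not part of this API):
+ *
+ *   sad128_internal_u16 acc = v128_sad_u16_init();
+ *   for (int i = 0; i < rows; i++)  // r0, r1: rows of 8 uint16_t each
+ *     acc = v128_sad_u16(acc, v128_load_unaligned(r0 + i * stride),
+ *                        v128_load_unaligned(r1 + i * stride));
+ *   uint32_t sad = v128_sad_u16_sum(acc);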
*/
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
+ c_v128 a, c_v128 b) {
+ int c;
+ for (c = 0; c < 8; c++)
+ s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd128_internal_s16;
+
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init(void) { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
+ c_v128 a, c_v128 b) {
+ int c;
+ for (c = 0; c < 8; c++)
+ s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
+ (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
+ return s;
+}
+
+SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
diff --git a/libs/libaom/src/aom_dsp/simd/v128_intrinsics_x86.h b/libs/libaom/src/aom_dsp/simd/v128_intrinsics_x86.h
new file mode 100644
index 000000000..c404015ef
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/simd/v128_intrinsics_x86.h
@@ -0,0 +1,657 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
+
+#include <stdint.h>
+#include "aom_dsp/simd/v64_intrinsics_x86.h"
+
+typedef __m128i v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) {
+ return (uint32_t)_mm_cvtsi128_si32(a);
+}
+
+SIMD_INLINE v64 v128_low_v64(v128 a) {
+ return _mm_unpacklo_epi64(a, v64_zero());
+}
+
+SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
+
+SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
+ return _mm_unpacklo_epi64(b, a);
+}
+
+SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
+ return v128_from_v64(v64_from_64(a), v64_from_64(b));
+}
+
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ return _mm_set_epi32(a, b, c, d);
+}
+
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+ return _mm_load_si128((__m128i *)p);
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+#if defined(__SSSE3__)
+ return _mm_lddqu_si128((__m128i *)p);
+#else
+ return _mm_loadu_si128((__m128i *)p);
+#endif
+}
+
+SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
+ _mm_store_si128((__m128i *)p, a);
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
+ _mm_storeu_si128((__m128i *)p, a);
+}
+
+// The following function requires an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+#if defined(__SSSE3__)
+SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
+ return c ? _mm_alignr_epi8(a, b, c) : b;
+}
+#else
+#define v128_align(a, b, c) \
+ ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
+#endif
+#else
+#if defined(__SSSE3__)
+#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
+#else
+#define v128_align(a, b, c) \
+ ((c) ?
_mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) +#endif +#endif + +SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); } + +SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); } + +SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); } + +SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); } + +SIMD_INLINE v128 v128_dup_64(uint64_t x) { + // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers + return _mm_set_epi32((uint32_t)(x >> 32), (uint32_t)x, (uint32_t)(x >> 32), + (uint32_t)x); +} + +SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); } + +SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); } + +SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); } + +SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); } + +SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); } + +SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); } + +SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); } + +SIMD_INLINE v128 v128_padd_s16(v128 a) { + return _mm_madd_epi16(a, _mm_set1_epi16(1)); +} + +SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); } + +SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); } + +SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); } + +SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); } + +SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); } + +SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); } + +SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); } + +SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); } + +SIMD_INLINE v128 v128_abs_s16(v128 a) { +#if defined(__SSSE3__) + return _mm_abs_epi16(a); +#else + return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); +#endif +} + +SIMD_INLINE v128 v128_abs_s8(v128 a) { +#if defined(__SSSE3__) + return _mm_abs_epi8(a); +#else + v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); + return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); +#endif +} + +SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { + return _mm_unpacklo_epi8(b, a); +} + +SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { + return _mm_unpackhi_epi8(b, a); +} + +SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { + return _mm_unpacklo_epi16(b, a); +} + +SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { + return _mm_unpackhi_epi16(b, a); +} + +SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { + return _mm_unpacklo_epi32(b, a); +} + +SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { + return _mm_unpackhi_epi32(b, a); +} + +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { + return _mm_unpacklo_epi64(b, a); +} + +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { + return _mm_unpackhi_epi64(b, a); +} + +SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } + +SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } + +SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } + +SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { + return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8)); +} + +SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { +#if defined(__SSSE3__) +#ifdef __x86_64__ + v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL); +#else + v128 order = 
_mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200); +#endif + return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), + _mm_shuffle_epi8(a, order)); +#else + return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); +#endif +} + +SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { + return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)); +} + +SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { +#if defined(__SSSE3__) +#ifdef __x86_64__ + v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL); +#else + v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100); +#endif + return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), + _mm_shuffle_epi8(a, order)); +#else + return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); +#endif +} + +SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { + return _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1))); +} + +SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { + return _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0))); +} + +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return _mm_unpackhi_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return _mm_packs_epi32(b, a); +} + +SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_packus_epi32(b, a); +#else + return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)), + v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b))); +#endif +} + +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return _mm_packus_epi16(b, a); +} + +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return _mm_packs_epi16(b, a); +} + +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return _mm_unpackhi_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(x, pattern); +#else + v128 output; + unsigned char *input = (unsigned char *)&x; + unsigned char *index = (unsigned char *)&pattern; + char *selected = (char *)&output; + int counter; + + for (counter = 0; counter < 16; counter++) { + selected[counter] = input[index[counter] & 15]; + } + + return output; +#endif +} + +SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { + v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b)); + v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b)); + 
v128 t = v128_add_32(t1, t2);
+ t = v128_add_32(t, _mm_srli_si128(t, 8));
+ t = v128_add_32(t, _mm_srli_si128(t, 4));
+ return (int32_t)v128_low_u32(t);
+}
+
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+ v128 r = _mm_madd_epi16(a, b);
+#if defined(__SSE4_1__) && defined(__x86_64__)
+ v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
+ _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
+ return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
+#else
+ return (int64_t)_mm_cvtsi128_si32(r) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+#endif
+}
+
+SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
+ v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
+ return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
+}
+
+typedef v128 sad128_internal;
+
+SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ v128_sad_u8_sum().
+ The result for more than 32 v128_sad_u8() calls is undefined. */
+SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+ return _mm_add_epi64(s, _mm_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+ return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
+}
+
+typedef int32_t ssd128_internal;
+
+SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_u8_sum(). */
+SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+ v128 z = _mm_setzero_si128();
+ v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z));
+ v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z));
+ v128 rl = _mm_madd_epi16(l, l);
+ v128 rh = _mm_madd_epi16(h, h);
+ v128 r = _mm_add_epi32(rl, rh);
+ r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
+ r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
+ return s + _mm_cvtsi128_si32(r);
+}
+
+SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }
+
+SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
+
+SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }
+
+SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }
+
+SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }
+
+SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
+ v64 lo_bits = v64_mullo_s16(a, b);
+ v64 hi_bits = v64_mulhi_s16(a, b);
+ return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
+ v64_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
+ return _mm_mullo_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+ return _mm_mulhi_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_mullo_epi32(a, b);
+#else
+ return _mm_unpacklo_epi32(
+ _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
+ _mm_shuffle_epi32(
+ _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
+#endif
+}
+
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+ v128 r = v128_mullo_s32(a, b);
+ return (int64_t)_mm_cvtsi128_si32(r) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+}
+
+SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
+
+SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b)
{ +#if defined(__SSSE3__) + return _mm_maddubs_epi16(a, b); +#else + return _mm_packs_epi32( + _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)), + _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8))); +#endif +} + +SIMD_INLINE v128 v128_padd_u8(v128 a) { + return v128_madd_us8(a, _mm_set1_epi8(1)); +} + +SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); } + +SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { + return _mm_sub_epi8(_mm_avg_epu8(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1))); +} + +SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { + return _mm_sub_epi16(_mm_avg_epu16(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1))); +} + +SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); } + +SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); } + +SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); } + +SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#else + v128 mask = _mm_cmplt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); } + +SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { +#if defined(__SSE4_1__) + return _mm_blendv_epi8(a, b, c); +#else + c = _mm_cmplt_epi8(c, v128_zero()); + return v128_or(v128_and(b, c), v128_andn(a, c)); +#endif +} + +SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#else + v128 mask = _mm_cmplt_epi8(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); } + +SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); } + +SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_min_epi32(a, b); +#else + v128 mask = _mm_cmplt_epi32(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_max_epi32(a, b); +#else + v128 mask = _mm_cmplt_epi32(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); } + +SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); } + +SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { + return _mm_cmpgt_epi16(a, b); +} + +SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { + return _mm_cmplt_epi16(a, b); +} + +SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { + return _mm_cmpgt_epi32(a, b); +} + +SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { + return _mm_cmplt_epi32(a, b); +} + +SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); } + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)), + _mm_sll_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)), + _mm_srl_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v128 
v128_shr_s8(v128 a, unsigned int c) { + __m128i x = _mm_cvtsi32_si128(c + 8); + return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x), + _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x)); +} + +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return _mm_sll_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return _mm_srl_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return _mm_sra_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return _mm_sll_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return _mm_srl_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return _mm_sra_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { + return _mm_sll_epi64(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { + return _mm_srl_epi64(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { + // _mm_sra_epi64 is missing in gcc? + return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c, + (int64_t)v64_u64(v128_low_v64(a)) >> c); + // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127) +#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127) +#define v128_shl_n_8(a, c) \ + _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c)) +#define v128_shr_n_u8(a, c) \ + _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c)) +#define v128_shr_n_s8(a, c) \ + _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \ + _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8)) +#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c) +#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c) +#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c) +#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c) +#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c) +#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c) +#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c) +#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c) +#define v128_shr_n_s64(a, c) \ + v128_shr_s64(a, c) // _mm_srai_epi64 missing in gcc? + +typedef v128 sad128_internal_u16; + +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return v128_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u16_sum(). */ +SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, + v128 b) { +#if defined(__SSE4_1__) + v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b)); +#else + v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)), + v128_xor(b, v128_dup_16(32768))); + t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)), + v128_or(v128_and(a, t), v128_andn(b, t))); +#endif + return v128_add_32( + s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t))); +} + +SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { + return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) + + v128_low_u32(v128_shr_n_byte(s, 8)) + + v128_low_u32(v128_shr_n_byte(s, 12)); +} + +typedef v128 ssd128_internal_s16; + +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_s16_sum(). 
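+ *
+ * For instance, the sum of squared errors of an 8x8 int16 residual could
+ * be accumulated row by row (src, pred and stride are assumptions, not
+ * part of this API):
+ *
+ *   ssd128_internal_s16 acc = v128_ssd_s16_init();
+ *   for (int r = 0; r < 8; r++)  // one v128 holds a row of 8 int16_t
+ *     acc = v128_ssd_s16(acc, v128_load_unaligned(src + r * stride),
+ *                        v128_load_unaligned(pred + r * stride));
+ *   uint64_t sse = v128_ssd_s16_sum(acc);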
*/
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+                                             v128 b) {
+  v128 d = v128_sub_16(a, b);
+  d = v128_madd_s16(d, d);
+  return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()),
+                                    _mm_unpacklo_epi32(d, v128_zero())));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
+
+#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
diff --git a/libs/libaom/src/aom_dsp/simd/v256_intrinsics.h b/libs/libaom/src/aom_dsp/simd/v256_intrinsics.h
new file mode 100644
index 000000000..17e36eed6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/simd/v256_intrinsics.h
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+#include "aom_dsp/simd/v128_intrinsics.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v256 v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
+SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); }
+SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
+SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+  return c_v256_from_v128(hi, lo);
+}
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+  return c_v256_from_64(a, b, c, d);
+}
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+  return c_v256_from_v64(a, b, c, d);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+  return c_v256_load_unaligned(p);
+}
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+  return c_v256_load_aligned(p);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+  c_v256_store_unaligned(p, a);
+}
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+  c_v256_store_aligned(p, a);
+}
+
+SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) {
+  return c_v256_align(a, b, c);
+}
+
+SIMD_INLINE v256 v256_zero(void) { return c_v256_zero(); }
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
+
+SIMD_INLINE c_sad256_internal v256_sad_u8_init(void) {
+  return c_v256_sad_u8_init();
+}
+SIMD_INLINE c_sad256_internal v256_sad_u8(c_sad256_internal s, v256 a, v256 b) {
+  return c_v256_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u8_sum(c_sad256_internal s) {
+  return c_v256_sad_u8_sum(s);
+}
+SIMD_INLINE c_ssd256_internal v256_ssd_u8_init(void) {
+  return c_v256_ssd_u8_init();
+}
+SIMD_INLINE c_ssd256_internal v256_ssd_u8(c_ssd256_internal s, v256 a, v256 b) {
+  return c_v256_ssd_u8(s, a, b);
+}
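+/* Example (illustrative only, not part of upstream libaom): the SAD and
+   SSD accumulators above follow an init / accumulate / finalise contract.
+   A hypothetical 32-wide block SAD written against these wrappers, with h
+   assumed small enough for the documented call-count limit:
+
+     static uint32_t sad_32xh(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride, int h) {
+       c_sad256_internal s = v256_sad_u8_init();
+       for (int i = 0; i < h; i++)
+         s = v256_sad_u8(s, v256_load_unaligned(src + i * src_stride),
+                         v256_load_unaligned(ref + i * ref_stride));
+       return v256_sad_u8_sum(s);  // finalise; s alone is not the SAD
+     }
+*/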
+SIMD_INLINE uint32_t v256_ssd_u8_sum(c_ssd256_internal s) { + return c_v256_ssd_u8_sum(s); +} + +SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16_init(void) { + return c_v256_ssd_s16_init(); +} +SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16(c_ssd256_internal_s16 s, v256 a, + v256 b) { + return c_v256_ssd_s16(s, a, b); +} +SIMD_INLINE uint64_t v256_ssd_s16_sum(c_ssd256_internal_s16 s) { + return c_v256_ssd_s16_sum(s); +} + +SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { + return c_v256_dotp_su8(a, b); +} +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + return c_v256_dotp_s16(a, b); +} +SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { + return c_v256_dotp_s32(a, b); +} +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); } + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); } +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); } +SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); } +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); } + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); } +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); } +SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); } +SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); } +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); } +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); } +SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); } +SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); } +SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); } +SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); } +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); } +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); } +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); } +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); } +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); } +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); } +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); } +SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); } +SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); } + +SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); } +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return c_v256_mullo_s16(a, b); +} +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return c_v256_mulhi_s16(a, b); +} +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return c_v256_mullo_s32(a, b); +} +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); } +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); } + +SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); } +SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { + return c_v256_blend_8(a, b, c); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); } +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); } +SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { + return c_v256_rdavg_u16(a, b); +} +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); } +SIMD_INLINE v256 
v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); } +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); } +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); } +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); } +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); } +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); } +SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); } +SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); } + +SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); } +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); } +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); } +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); } +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); } +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); } +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); } +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); } +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return c_v256_ziplo_128(a, b); +} +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return c_v256_ziphi_128(a, b); +} +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); } +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); } +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); } +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return c_v256_unziplo_8(a, b); +} +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return c_v256_unziphi_8(a, b); +} +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return c_v256_unziplo_16(a, b); +} +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return c_v256_unziphi_16(a, b); +} +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return c_v256_unziplo_32(a, b); +} +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return c_v256_unziphi_32(a, b); +} +SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { + return c_v256_unziplo_64(a, b); +} +SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { + return c_v256_unziphi_64(a, b); +} +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); } +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return c_v256_unpacklo_u8_s16(a); +} +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return c_v256_unpackhi_u8_s16(a); +} +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); } +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return c_v256_unpacklo_s8_s16(a); +} +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return c_v256_unpackhi_s8_s16(a); +} +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return c_v256_pack_s32_s16(a, b); +} +SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { + return c_v256_pack_s32_u16(a, b); +} +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return c_v256_pack_s16_u8(a, b); +} +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return c_v256_pack_s16_s8(a, b); +} +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return c_v256_unpack_u16_s32(a); +} +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return c_v256_unpack_s16_s32(a); +} +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return c_v256_unpacklo_u16_s32(a); +} +SIMD_INLINE v256 
v256_unpacklo_s16_s32(v256 a) { + return c_v256_unpacklo_s16_s32(a); +} +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return c_v256_unpackhi_u16_s32(a); +} +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return c_v256_unpackhi_s16_s32(a); +} +SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { + return c_v256_shuffle_8(a, pattern); +} +SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) { + return c_v256_wideshuffle_8(a, b, pattern); +} +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return c_v256_pshuffle_8(a, pattern); +} + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); } +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); } +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); } +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return c_v256_cmpgt_s16(a, b); +} +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return c_v256_cmplt_s16(a, b); +} +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); } +SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); } + +SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { + return c_v256_cmpgt_s32(a, b); +} +SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { + return c_v256_cmplt_s32(a, b); +} +SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { + return c_v256_shl_8(a, c); +} +SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { + return c_v256_shr_u8(a, c); +} +SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { + return c_v256_shr_s8(a, c); +} +SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { + return c_v256_shl_16(a, c); +} +SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { + return c_v256_shr_u16(a, c); +} +SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { + return c_v256_shr_s16(a, c); +} +SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { + return c_v256_shl_32(a, c); +} +SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { + return c_v256_shr_u32(a, c); +} +SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { + return c_v256_shr_s32(a, c); +} +SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { + return c_v256_shl_64(a, c); +} +SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { + return c_v256_shr_u64(a, c); +} +SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { + return c_v256_shr_s64(a, c); +} + +SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) { + return c_v256_shr_n_byte(a, n); +} +SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) { + return c_v256_shl_n_byte(a, n); +} +SIMD_INLINE v256 v256_shl_n_8(v256 a, unsigned int n) { + return c_v256_shl_n_8(a, n); +} +SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) { + return c_v256_shl_n_16(a, n); +} +SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) { + return c_v256_shl_n_32(a, n); +} +SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) { + return c_v256_shl_n_64(a, n); +} +SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) { + return c_v256_shr_n_u8(a, n); +} +SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) { + return c_v256_shr_n_u16(a, n); +} +SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) { + return c_v256_shr_n_u32(a, n); +} +SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) { + return c_v256_shr_n_u64(a, n); +} +SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) { + return c_v256_shr_n_s8(a, n); +} +SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) { + return c_v256_shr_n_s16(a, n); +} +SIMD_INLINE 
v256 v256_shr_n_s32(v256 a, unsigned int n) { + return c_v256_shr_n_s32(a, n); +} +SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) { + return c_v256_shr_n_s64(a, n); +} + +SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) { + return c_v256_shr_n_word(a, n); +} +SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) { + return c_v256_shl_n_word(a, n); +} + +typedef uint32_t sad256_internal_u16; +SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { + return c_v256_sad_u16_init(); +} +SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, + v256 b) { + return c_v256_sad_u16(s, a, b); +} +SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { + return c_v256_sad_u16_sum(s); +} + +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v256_intrinsics_arm.h b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_arm.h new file mode 100644 index 000000000..bd86ea172 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_arm.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ + +#include "aom_dsp/simd/v256_intrinsics_v128.h" + +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v256_intrinsics_c.h b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_c.h new file mode 100644 index 000000000..8127ee356 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_c.h @@ -0,0 +1,968 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
+
+typedef union {
+  uint8_t u8[32];
+  uint16_t u16[16];
+  uint32_t u32[8];
+  uint64_t u64[4];
+  int8_t s8[32];
+  int16_t s16[16];
+  int32_t s32[8];
+  int64_t s64[4];
+  c_v64 v64[4];
+  c_v128 v128[2];
+} c_v256;
+
+SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
+
+SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }
+
+SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
+
+SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
+
+SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
+  c_v256 t;
+  t.v128[1] = hi;
+  t.v128[0] = lo;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
+                                  uint64_t d) {
+  c_v256 t;
+  t.u64[3] = a;
+  t.u64[2] = b;
+  t.u64[1] = c;
+  t.u64[0] = d;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
+  c_v256 t;
+  t.u64[3] = a.u64;
+  t.u64[2] = b.u64;
+  t.u64[1] = c.u64;
+  t.u64[0] = d.u64;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
+  c_v256 t;
+  uint8_t *pp = (uint8_t *)p;
+  uint8_t *q = (uint8_t *)&t;
+  int c;
+  for (c = 0; c < 32; c++) q[c] = pp[c];
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
+  if (SIMD_CHECK && (uintptr_t)p & 31) {
+    fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
+    abort();
+  }
+  return c_v256_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
+  uint8_t *pp = (uint8_t *)p;
+  uint8_t *q = (uint8_t *)&a;
+  int c;
+  for (c = 0; c < 32; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
+  if (SIMD_CHECK && (uintptr_t)p & 31) {
+    fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
+    abort();
+  }
+  c_v256_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v256 c_v256_zero() {
+  c_v256 t;
+  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
+  c_v256 t;
+  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
+  c_v256 t;
+  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
+  c_v256 t;
+  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
+  c_v256 t;
+  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x;
+  return t;
+}
+
+SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
+  return c_v128_dotp_su8(a.v128[1], b.v128[1]) +
+         c_v128_dotp_su8(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
+  return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
+         c_v128_dotp_s16(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
+  return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
+         c_v128_dotp_s32(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
+  return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
+}
+
+typedef struct {
+  uint32_t val;
+  int count;
+} c_sad256_internal;
+
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init(void) {
+  c_sad256_internal t;
+  t.val = t.count = 0;
+  return t;
+}
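+/* Example (illustrative only, not part of upstream libaom): the aligned
+   load/store helpers above abort under SIMD_CHECK unless the pointer is
+   32-byte aligned.  One portable way for a caller to satisfy that is to
+   over-allocate and round the pointer up (memcpy is from <string.h>):
+
+     uint8_t raw[32 + 31];
+     uint8_t *p = (uint8_t *)(((uintptr_t)raw + 31) & ~(uintptr_t)31);
+     memcpy(p, src, 32);                 // src: any 32 source bytes
+     c_v256 v = c_v256_load_aligned(p);  // safe: p is 32-byte aligned
+*/
+
+/* Implementation dependent return value.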
Result must be finalised with + v256_sad_u8_sum(). + The result for more than 16 v256_sad_u8() calls is undefined. */ +SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a, + c_v256 b) { + int c; + for (c = 0; c < 32; c++) + s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + s.count++; + if (SIMD_CHECK && s.count > 32) { + fprintf(stderr, + "Error: sad called 32 times returning an undefined result\n"); + abort(); + } + return s; +} + +SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s.val; } + +typedef uint32_t c_ssd256_internal; + +SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). */ +SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a, + c_v256 b) { + int c; + for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; } + +SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]), + c_v128_or(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]), + c_v128_xor(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]), + c_v128_and(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]), + c_v128_andn(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]), + c_v128_add_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]), + c_v128_add_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]), + c_v128_sadd_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]), + c_v128_sadd_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]), + c_v128_sadd_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]), + c_v128_add_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]), + c_v128_add_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]), + c_v128_sub_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) { + c_v256 t; + for (int i = 0; i < 16; i++) + t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1]; + return t; +} + +SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) { + c_v256 t; + t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; + t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; + t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; + t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; + t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9]; + t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11]; + t.s32[6] = (int32_t)a.s16[12] + 
(int32_t)a.s16[13]; + t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15]; + return t; +} + +SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]), + c_v128_sub_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]), + c_v128_ssub_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]), + c_v128_ssub_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]), + c_v128_sub_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]), + c_v128_ssub_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]), + c_v128_ssub_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]), + c_v128_sub_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) { + return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) { + return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) { + c_v128 lo_bits = c_v128_mullo_s16(a, b); + c_v128 hi_bits = c_v128_mulhi_s16(a, b); + return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits), + c_v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]), + c_v128_mullo_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]), + c_v128_mulhi_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]), + c_v128_mullo_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]), + c_v128_madd_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]), + c_v128_madd_us8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]), + c_v128_avg_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]), + c_v128_rdavg_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]), + c_v128_rdavg_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]), + c_v128_avg_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]), + c_v128_min_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) { + return 
c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]), + c_v128_max_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]), + c_v128_min_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) { + return ((a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) | + ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) | + ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) | + ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) | + ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) | + ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) | + ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) | + ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) | + ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | + ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | + ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | + ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | + ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | + ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | + ((a.s8[0] < 0) << 0); +} + +SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) { + c_v256 t; + for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i]; + return t; +} + +SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]), + c_v128_max_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]), + c_v128_min_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]), + c_v128_max_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]), + c_v128_min_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]), + c_v128_max_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]), + c_v128_ziplo_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]), + c_v128_ziplo_8(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]), + c_v128_ziplo_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]), + c_v128_ziplo_16(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]), + c_v128_ziplo_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]), + c_v128_ziplo_32(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]), + c_v128_ziplo_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]), + c_v128_ziplo_64(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) { + 
return c_v256_from_v128(a.v128[0], b.v128[0]); +} + +SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) { + return c_v256_from_v128(a.v128[1], b.v128[1]); +} + +SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b)); +} + +SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b)); +} + +SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b)); +} + +SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) { + c_v256 t; + int i; + if (mode) { + for (i = 0; i < 16; i++) { + t.u8[i] = a.u8[i * 2 + 1]; + t.u8[i + 16] = b.u8[i * 2 + 1]; + } + } else { + for (i = 0; i < 16; i++) { + t.u8[i] = b.u8[i * 2]; + t.u8[i + 16] = a.u8[i * 2]; + } + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1) + : _c_v256_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0) + : _c_v256_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) { + c_v256 t; + int i; + if (mode) { + for (i = 0; i < 8; i++) { + t.u16[i] = a.u16[i * 2 + 1]; + t.u16[i + 8] = b.u16[i * 2 + 1]; + } + } else { + for (i = 0; i < 8; i++) { + t.u16[i] = b.u16[i * 2]; + t.u16[i + 8] = a.u16[i * 2]; + } + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1) + : _c_v256_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0) + : _c_v256_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) { + c_v256 t; + if (mode) { + t.u32[7] = b.u32[7]; + t.u32[6] = b.u32[5]; + t.u32[5] = b.u32[3]; + t.u32[4] = b.u32[1]; + t.u32[3] = a.u32[7]; + t.u32[2] = a.u32[5]; + t.u32[1] = a.u32[3]; + t.u32[0] = a.u32[1]; + } else { + t.u32[7] = a.u32[6]; + t.u32[6] = a.u32[4]; + t.u32[5] = a.u32[2]; + t.u32[4] = a.u32[0]; + t.u32[3] = b.u32[6]; + t.u32[2] = b.u32[4]; + t.u32[1] = b.u32[2]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1) + : _c_v256_unzip_32(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0) + : _c_v256_unzip_32(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) { + c_v256 t; + if (mode) { + t.u64[3] = b.u64[3]; + t.u64[2] = b.u64[1]; + t.u64[1] = a.u64[3]; + t.u64[0] = a.u64[1]; + } else { + t.u64[3] = a.u64[2]; + t.u64[2] = a.u64[0]; + t.u64[1] = b.u64[2]; + t.u64[0] = b.u64[0]; + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1) + : _c_v256_unzip_64(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v256_unzip_64(b, a, 0)
+                           : _c_v256_unzip_64(b, a, 1);
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
+                          c_v128_unpacklo_u8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
+                          c_v128_unpacklo_u8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]),
+                          c_v128_unpacklo_s8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]),
+                          c_v128_unpacklo_s8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
+                          c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
+                          c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
+                          c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
+                          c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
+                          c_v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
+                          c_v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
+                          c_v128_unpacklo_u16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
+                          c_v128_unpacklo_s16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
+                          c_v128_unpacklo_u16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
+  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
+                          c_v128_unpacklo_s16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
+  c_v256 t;
+  int c;
+  for (c = 0; c < 32; c++)
+    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+                                     : pattern.u8[c] & 31];
+
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
+  c_v256 t;
+  int c;
+  for (c = 0; c < 32; c++)
+    t.u8[c] = (pattern.u8[c] < 32
+                   ? b.u8
+                   : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+                                             : pattern.u8[c] & 31];
+  return t;
+}
+
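+/* Example (illustrative only, not part of upstream libaom): with
+   CONFIG_BIG_ENDIAN unset, c_v256_shuffle_8() fills each output byte
+   from a.u8[pattern.u8[i] & 31], so a descending index pattern reverses
+   the byte order of a vector:
+
+     static c_v256 reverse_bytes(c_v256 a) {
+       c_v256 p;
+       for (int i = 0; i < 32; i++) p.u8[i] = (uint8_t)(31 - i);
+       return c_v256_shuffle_8(a, p);  // result u8[i] == a.u8[31 - i]
+     }
+*/
+
+// Pairwise / dual-lane shuffle: shuffle two 128-bit lanes.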
+SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) { + return c_v256_from_v128( + c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)), + c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern))); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]), + c_v128_cmpgt_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]), + c_v128_cmplt_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]), + c_v128_cmpeq_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]), + c_v128_cmpgt_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]), + c_v128_cmplt_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]), + c_v128_cmpeq_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]), + c_v128_cmpgt_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]), + c_v128_cmplt_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]), + c_v128_cmpeq_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) { + if (n == 0) return a; + if (n < 16) + return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n), + c_v128_shr_n_byte(a.v128[0], 16 - n)), + c_v128_shl_n_byte(a.v128[0], n)); + else if (n > 16) + return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16), + c_v128_zero()); + else + return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero()); +} + +SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) { + if (n == 0) return a; + if (n < 16) + return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n), + c_v128_or(c_v128_shr_n_byte(a.v128[0], n), + c_v128_shl_n_byte(a.v128[1], 16 - n))); + else if (n > 16) + return c_v256_from_v128(c_v128_zero(), + c_v128_shr_n_byte(a.v128[1], n - 16)); + else + return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a)); +} + +SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) { + if (SIMD_CHECK && c > 31) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? 
c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
+           : b;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
+                          c_v128_shl_8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
+                          c_v128_shr_u8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
+                          c_v128_shr_s8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
+                          c_v128_shl_16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
+                          c_v128_shr_u16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
+                          c_v128_shr_s16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
+                          c_v128_shl_32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
+                          c_v128_shr_u32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) {
+  return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
+                          c_v128_shr_s32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  if (SIMD_CHECK && n > 63) {
+    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
+    abort();
+  }
+  t.s64[3] = a.s64[3] >> n;
+  t.s64[2] = a.s64[2] >> n;
+  t.s64[1] = a.s64[1] >> n;
+  t.s64[0] = a.s64[0] >> n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  if (SIMD_CHECK && n > 63) {
+    fprintf(stderr, "Error: undefined u64 shift right %d\n", n);
+    abort();
+  }
+  t.u64[3] = a.u64[3] >> n;
+  t.u64[2] = a.u64[2] >> n;
+  t.u64[1] = a.u64[1] >> n;
+  t.u64[0] = a.u64[0] >> n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  if (SIMD_CHECK && n > 63) {
+    fprintf(stderr, "Error: undefined u64 shift left %d\n", n);
+    abort();
+  }
+  t.u64[3] = a.u64[3] << n;
+  t.u64[2] = a.u64[2] << n;
+  t.u64[1] = a.u64[1] << n;
+  t.u64[0] = a.u64[0] << n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
+  return c_v256_shl_8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) {
+  return c_v256_shl_16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) {
+  return c_v256_shl_32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) {
+  return c_v256_shl_64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
+  return c_v256_shr_u8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) {
+  return c_v256_shr_u16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) {
+  return c_v256_shr_u32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) {
+  return c_v256_shr_u64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
+  return c_v256_shr_s8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) {
+  return c_v256_shr_s16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) {
+  return c_v256_shr_s32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a,
unsigned int n) { + return c_v256_shr_s64(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) { + return c_v256_shr_n_byte(a, 2 * n); +} +SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) { + return c_v256_shl_n_byte(a, 2 * n); +} + +typedef uint32_t c_sad256_internal_u16; + +SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u16_sum(). */ +SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s, + c_v256 a, c_v256 b) { + int c; + for (c = 0; c < 16; c++) + s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c]; + return s; +} + +SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; } + +typedef uint64_t c_ssd256_internal_s16; + +SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_s16_sum(). */ +SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s, + c_v256 a, c_v256 b) { + int c; + for (c = 0; c < 16; c++) + s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) * + (int32_t)(int16_t)(a.s16[c] - b.s16[c]); + return s; +} + +SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; } + +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v256_intrinsics_v128.h b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_v128.h new file mode 100644 index 000000000..0d2266754 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_v128.h @@ -0,0 +1,876 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ + +#if HAVE_NEON +#include "aom_dsp/simd/v128_intrinsics_arm.h" +#elif HAVE_SSE2 +#include "aom_dsp/simd/v128_intrinsics_x86.h" +#else +#include "aom_dsp/simd/v128_intrinsics.h" +#endif + +#if HAVE_NEON +typedef int64x2x2_t v256; +#else +typedef struct { + v128 val[2]; +} v256; +#endif + +SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); } + +SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); } + +SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); } + +SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; } + +SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; } + +SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { + v256 t; + t.val[1] = hi; + t.val[0] = lo; + return t; +} + +SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d)); +} + +SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { + return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); +} + +SIMD_INLINE v256 v256_load_unaligned(const void *p) { + return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16), + v128_load_unaligned(p)); +} + +SIMD_INLINE v256 v256_load_aligned(const void *p) { + return v256_from_v128(v128_load_aligned((uint8_t *)p + 16), + v128_load_aligned(p)); +} + +SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { + v128_store_unaligned(p, a.val[0]); + v128_store_unaligned((uint8_t *)p + 16, a.val[1]); +} + +SIMD_INLINE void v256_store_aligned(void *p, v256 a) { + v128_store_aligned(p, a.val[0]); + v128_store_aligned((uint8_t *)p + 16, a.val[1]); +} + +SIMD_INLINE v256 v256_zero(void) { + return v256_from_v128(v128_zero(), v128_zero()); +} + +SIMD_INLINE v256 v256_dup_8(uint8_t x) { + v128 t = v128_dup_8(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_16(uint16_t x) { + v128 t = v128_dup_16(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_32(uint32_t x) { + v128 t = v128_dup_32(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_64(uint64_t x) { + v128 t = v128_dup_64(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { + return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]); +} + +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]); +} + +SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { + return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]); +} + +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { + return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]); +} + +typedef struct { + sad128_internal val[2]; +} sad256_internal; + +SIMD_INLINE sad256_internal v256_sad_u8_init(void) { + sad256_internal t; + t.val[1] = v128_sad_u8_init(); + t.val[0] = v128_sad_u8_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u8_sum(). + The result for more than 16 v256_sad_u8() calls is undefined. 
*/ +SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { + sad256_internal t; + t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]); + return t; +} + +SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { + return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]); +} + +typedef struct { + ssd128_internal val[2]; +} ssd256_internal; + +SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) { + ssd256_internal t; + t.val[1] = v128_ssd_u8_init(); + t.val[0] = v128_ssd_u8_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). */ +SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { + ssd256_internal t; + t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]); + return t; +} + +SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { + return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]); +} + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { + return v256_from_v128(v128_or(a.val[1], b.val[1]), + v128_or(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { + return v256_from_v128(v128_xor(a.val[1], b.val[1]), + v128_xor(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_and(v256 a, v256 b) { + return v256_from_v128(v128_and(a.val[1], b.val[1]), + v128_and(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { + return v256_from_v128(v128_andn(a.val[1], b.val[1]), + v128_andn(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { + return v256_from_v128(v128_add_8(a.val[1], b.val[1]), + v128_add_8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { + return v256_from_v128(v128_add_16(a.val[1], b.val[1]), + v128_add_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { + return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]), + v128_sadd_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { + return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]), + v128_sadd_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { + return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]), + v128_sadd_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { + return v256_from_v128(v128_add_32(a.val[1], b.val[1]), + v128_add_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { + return v256_from_v128(v128_add_64(a.val[1], b.val[1]), + v128_add_64(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_padd_u8(v256 a) { + return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0])); +} + +SIMD_INLINE v256 v256_padd_s16(v256 a) { + return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0])); +} + +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { + return v256_from_v128(v128_sub_8(a.val[1], b.val[1]), + v128_sub_8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { + return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]), + v128_ssub_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { + return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]), + v128_ssub_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { + return v256_from_v128(v128_sub_16(a.val[1], b.val[1]), + v128_sub_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { + return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]), 
+ v128_ssub_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { + return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]), + v128_ssub_u16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { + return v256_from_v128(v128_sub_32(a.val[1], b.val[1]), + v128_sub_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { + return v256_from_v128(v128_sub_64(a.val[1], b.val[1]), + v128_sub_64(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_abs_s16(v256 a) { + return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0])); +} + +SIMD_INLINE v256 v256_abs_s8(v256 a) { + return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0])); +} + +SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { + v128 lo_bits = v128_mullo_s16(a, b); + v128 hi_bits = v128_mulhi_s16(a, b); + return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), + v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]), + v128_mullo_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]), + v128_mulhi_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]), + v128_mullo_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { + return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]), + v128_madd_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { + return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]), + v128_madd_us8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { + return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]), + v128_avg_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { + return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]), + v128_rdavg_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { + return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]), + v128_rdavg_u16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { + return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]), + v128_avg_u16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { + return v256_from_v128(v128_min_u8(a.val[1], b.val[1]), + v128_min_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { + return v256_from_v128(v128_max_u8(a.val[1], b.val[1]), + v128_max_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { + return v256_from_v128(v128_min_s8(a.val[1], b.val[1]), + v128_min_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE uint32_t v256_movemask_8(v256 a) { + return (v128_movemask_8(v256_high_v128(a)) << 16) | + v128_movemask_8(v256_low_v128(a)); +} + +SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { + return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]), + v128_blend_8(a.val[0], b.val[0], c.val[0])); +} + +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { + return v256_from_v128(v128_max_s8(a.val[1], b.val[1]), + v128_max_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { + return v256_from_v128(v128_min_s16(a.val[1], b.val[1]), + v128_min_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { + return v256_from_v128(v128_max_s16(a.val[1], b.val[1]), + v128_max_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 
v256_min_s32(v256 a, v256 b) { + return v256_from_v128(v128_min_s32(a.val[1], b.val[1]), + v128_min_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { + return v256_from_v128(v128_max_s32(a.val[1], b.val[1]), + v128_max_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]), + v128_ziplo_8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]), + v128_ziplo_8(a.val[1], b.val[1])); +} + +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]), + v128_ziplo_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]), + v128_ziplo_16(a.val[1], b.val[1])); +} + +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]), + v128_ziplo_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]), + v128_ziplo_32(a.val[1], b.val[1])); +} + +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]), + v128_ziplo_64(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]), + v128_ziplo_64(a.val[1], b.val[1])); +} + +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return v256_from_v128(a.val[0], b.val[0]); +} + +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return v256_from_v128(a.val[1], b.val[1]); +} + +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); +} + +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); +} + +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); +} + +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]), + v128_unziplo_8(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]), + v128_unziphi_8(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]), + v128_unziplo_16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]), + v128_unziphi_16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]), + v128_unziplo_32(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]), + v128_unziphi_32(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { +#if HAVE_SSE2 + return v256_from_v128( + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), + _mm_castsi128_pd(a.val[1]), 0)), + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), + _mm_castsi128_pd(b.val[1]), 0))); +#else + return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]), + v128_low_v64(b.val[1]), v128_low_v64(b.val[0])); +#endif +} + +SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { +#if HAVE_SSE2 + return v256_from_v128( + 
_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), + _mm_castsi128_pd(a.val[1]), 3)), + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), + _mm_castsi128_pd(b.val[1]), 3))); +#else + return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]), + v128_high_v64(b.val[1]), v128_high_v64(b.val[0])); +#endif +} + +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]), + v128_unpacklo_u8_s16(a.val[0])); +} + +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]), + v128_unpacklo_u8_s16(a.val[1])); +} + +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]), + v128_unpacklo_s8_s16(a.val[0])); +} + +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]), + v128_unpacklo_s8_s16(a.val[1])); +} + +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]), + v128_pack_s32_s16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { + return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]), + v128_pack_s32_u16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]), + v128_pack_s16_u8(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]), + v128_pack_s16_s8(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a)); +} + +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]), + v128_unpacklo_u16_s32(a.val[0])); +} + +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]), + v128_unpacklo_s16_s32(a.val[0])); +} + +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]), + v128_unpacklo_u16_s32(a.val[1])); +} + +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]), + v128_unpacklo_s16_s32(a.val[1])); +} + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]), + v128_cmpgt_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]), + v128_cmplt_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]), + v128_cmpeq_8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]), + v128_cmpgt_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]), + v128_cmplt_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]), + 
v128_cmpeq_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]), + v128_cmpgt_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]), + v128_cmplt_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]), + v128_cmpeq_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) { +#if HAVE_NEON +#if defined(__aarch64__) + uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]), + vreinterpretq_u8_s64(x.val[1]) } }; + return v256_from_v128( + vreinterpretq_s64_u8(vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))), + vreinterpretq_s64_u8( + vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[0])))); +#else + uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])), + vget_high_u8(vreinterpretq_u8_s64(x.val[0])), + vget_low_u8(vreinterpretq_u8_s64(x.val[1])), + vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } }; + return v256_from_64( + (uint64_t)vreinterpret_s64_u8( + vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))), + (uint64_t)vreinterpret_s64_u8( + vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))), + (uint64_t)vreinterpret_s64_u8( + vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))), + (uint64_t)vreinterpret_s64_u8( + vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0]))))); +#endif +#else + v128 c16 = v128_dup_8(16); + v128 maskhi = v128_cmplt_s8(pattern.val[1], c16); + v128 masklo = v128_cmplt_s8(pattern.val[0], c16); + return v256_from_v128( + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)), + v128_shuffle_8(x.val[0], pattern.val[1]), maskhi), + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)), + v128_shuffle_8(x.val[0], pattern.val[0]), masklo)); +#endif +} + +SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) { +#if HAVE_NEON +#if defined(__aarch64__) + uint8x16x4_t p = { { + vreinterpretq_u8_s64(y.val[0]), + vreinterpretq_u8_s64(y.val[1]), + vreinterpretq_u8_s64(x.val[0]), + vreinterpretq_u8_s64(x.val[1]), + } }; + return v256_from_v128( + vreinterpretq_s64_u8(vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))), + vreinterpretq_s64_u8( + vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[0])))); +#else + v256 c32 = v256_dup_8(32); + v256 p32 = v256_sub_8(pattern, c32); + uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])), + vget_high_u8(vreinterpretq_u8_s64(x.val[0])), + vget_low_u8(vreinterpretq_u8_s64(x.val[1])), + vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } }; + uint8x8x4_t q = { { vget_low_u8(vreinterpretq_u8_s64(y.val[0])), + vget_high_u8(vreinterpretq_u8_s64(y.val[0])), + vget_low_u8(vreinterpretq_u8_s64(y.val[1])), + vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } }; + v256 r1 = + v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8( + p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + p, vreinterpret_u8_s64(vget_low_s64(p32.val[0]))))); + v256 r2 = + v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8( + q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + q, 
vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0]))))); + return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32)); +#endif +#else + v128 c16 = v128_dup_8(16); + v128 c32 = v128_dup_8(32); + v128 c48 = v128_dup_8(48); + v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]); + v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]); + v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]); + v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]); + v256 r1 = v256_from_v128( + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)), + v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)), + maskhi48), + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)), + v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)), + masklo48)); + v256 r2 = v256_from_v128( + v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)), + v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16), + v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)), + v128_shuffle_8(y.val[0], pattern.val[0]), masklo16)); + return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern)); +#endif +} + +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return v256_from_v128( + v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)), + v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern))); +} + +SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v256_shl_n_byte(a, n) \ + ((n) < 16 ? 
v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n), \ + v128_shr_n_byte(a.val[0], 16 - (n))), \ + v128_shl_n_byte(a.val[0], (n))) \ + : v256_from_v128( \ + (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \ + v128_zero())) + +#define v256_shr_n_byte(a, n) \ + (n == 0 \ + ? a \ + : ((n) < 16 \ + ? v256_from_v128(v128_shr_n_byte(a.val[1], n), \ + v128_or(v128_shr_n_byte(a.val[0], n), \ + v128_shl_n_byte(a.val[1], 16 - (n)))) \ + : v256_from_v128( \ + v128_zero(), \ + (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))) + +#define v256_align(a, b, c) \ + ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) + +#define v256_shl_n_8(a, n) \ + v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n)) +#define v256_shl_n_16(a, n) \ + v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n)) +#define v256_shl_n_32(a, n) \ + v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n)) +#define v256_shl_n_64(a, n) \ + v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n)) +#define v256_shr_n_u8(a, n) \ + v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n)) +#define v256_shr_n_u16(a, n) \ + v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n)) +#define v256_shr_n_u32(a, n) \ + v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n)) +#define v256_shr_n_u64(a, n) \ + v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n)) +#define v256_shr_n_s8(a, n) \ + v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n)) +#define v256_shr_n_s16(a, n) \ + v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n)) +#define v256_shr_n_s32(a, n) \ + v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n)) +#define v256_shr_n_s64(a, n) \ + v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n)) + +#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) +#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) + +typedef struct { + sad128_internal_u16 val[2]; +} sad256_internal_u16; + +SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { + sad256_internal_u16 t; + t.val[1] = v128_sad_u16_init(); + t.val[0] = v128_sad_u16_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u16_sum(). + The result for more than 16 v256_sad_u16() calls is undefined. */ +SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, + v256 b) { + sad256_internal_u16 t; + t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]); + return t; +} + +SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { + return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]); +} + +typedef struct { + ssd128_internal_s16 val[2]; +} ssd256_internal_s16; + +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { + ssd256_internal_s16 t; + t.val[1] = v128_ssd_s16_init(); + t.val[0] = v128_ssd_s16_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_s16_sum(). 
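+ *
+ * Usage sketch (illustrative only; src, ref and n are placeholder names
+ * for int16_t buffers and an element count, not part of this API):
+ *   ssd256_internal_s16 acc = v256_ssd_s16_init();
+ *   for (int i = 0; i < n; i += 16)
+ *     acc = v256_ssd_s16(acc, v256_load_unaligned(src + i),
+ *                        v256_load_unaligned(ref + i));
+ *   uint64_t ssd = v256_ssd_s16_sum(acc);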
 */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  ssd256_internal_s16 t;
+  t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
+  return t;
+}
+
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
+}
+
+#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
diff --git a/libs/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h
new file mode 100644
index 000000000..5983cb80c
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/simd/v256_intrinsics_x86.h
@@ -0,0 +1,750 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
+
+#if !defined(__AVX2__)
+
+#include "aom_dsp/simd/v256_intrinsics_v128.h"
+
+#else
+
+// The __m256i type seems to cause problems for g++'s mangling prior to
+// version 5, but adding -fabi-version=0 fixes this.
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \
+    defined(__AVX2__) && defined(__cplusplus)
+#pragma GCC optimize "-fabi-version=0"
+#endif
+
+#include <immintrin.h>
+
+#include "aom_dsp/simd/v128_intrinsics_x86.h"
+
+typedef __m256i v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) {
+  return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
+}
+
+SIMD_INLINE v64 v256_low_v64(v256 a) {
+  return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
+}
+
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) {
+  return _mm256_extracti128_si256(a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
+  // gcc seems to be missing _mm256_set_m128i()
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+  return _mm256_set_epi64x(a, b, c, d);
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+  return _mm256_load_si256((const __m256i *)p);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+  return _mm256_loadu_si256((const __m256i *)p);
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+  _mm256_store_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+  _mm256_storeu_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
+
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); }
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
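+  // 32 independent 8-bit additions; lanes wrap on overflow, in contrast to
+  // the saturating v256_sadd_u8()/v256_sadd_s8() below.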
return _mm256_add_epi8(a, b); } + +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); } + +SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); } + +SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); } + +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { + return _mm256_adds_epi16(a, b); +} + +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); } + +SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); } + +SIMD_INLINE v256 v256_padd_u8(v256 a) { + return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1)); +} + +SIMD_INLINE v256 v256_padd_s16(v256 a) { + return _mm256_madd_epi16(a, _mm256_set1_epi16(1)); +} + +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); } + +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); } + +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); } + +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); } + +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { + return _mm256_subs_epi16(a, b); +} + +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { + return _mm256_subs_epu16(a, b); +} + +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); } + +SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); } + +SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); } + +SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); } + +// AVX doesn't have the direct intrinsics to zip/unzip 8, 16, 32 bit +// lanes of lower or upper halves of a 256bit vector because the +// unpack/pack intrinsics operate on the 256 bit input vector as 2 +// independent 128 bit vectors. 
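+//
+// The implementations below therefore first reorder each input's 64-bit
+// quadwords with _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0)),
+// turning q3:q2:q1:q0 into q3:q1:q2:q0. After that shuffle, lane 0 holds
+// q0 and q2 and lane 1 holds q1 and q3, so a per-lane unpacklo/unpackhi of
+// the two permuted inputs interleaves their original low (respectively
+// high) 128-bit halves across the full 256-bit register.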
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { + return _mm256_unpacklo_epi8( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { + return _mm256_unpackhi_epi8( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { + return _mm256_unpacklo_epi16( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { + return _mm256_unpackhi_epi16( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { + return _mm256_unpacklo_epi32( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { + return _mm256_unpackhi_epi32( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { + return _mm256_unpacklo_epi64( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { + return _mm256_unpackhi_epi64( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); +} + +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return _mm256_permute2x128_si256(a, b, 0x02); +} + +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return _mm256_permute2x128_si256(a, b, 0x13); +} + +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); +} + +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); +} + +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); +} + +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1)); +} + +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2)); +} + +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), + _mm256_castsi256_ps(a), + _MM_SHUFFLE(3, 1, 3, 1))), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), + _mm256_castsi256_ps(a), + _MM_SHUFFLE(2, 0, 2, 0))), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b), + _mm256_castsi256_pd(a), 15)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) 
{ + return _mm256_permute4x64_epi64( + _mm256_castpd_si256( + _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return _mm256_cvtepu8_epi16(a); } + +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return _mm256_unpacklo_epi8( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); +} + +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return _mm256_unpackhi_epi8( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); +} + +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return _mm256_srai_epi16( + _mm256_unpacklo_epi8( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 8); +} + +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return _mm256_srai_epi16( + _mm256_unpackhi_epi8( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 8); +} + +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return _mm256_cvtepu16_epi32(a); +} + +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return _mm256_cvtepi16_epi32(a); +} + +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return _mm256_unpacklo_epi16( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); +} + +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return _mm256_srai_epi32( + _mm256_unpacklo_epi16( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 16); +} + +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return _mm256_unpackhi_epi16( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); +} + +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return _mm256_srai_epi32( + _mm256_unpackhi_epi16( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 16); +} + +SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { + return _mm256_blendv_epi8( + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern), + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern), + _mm256_cmpgt_epi8(v256_dup_8(16), pattern)); +} + +SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) { + v256 c32 = v256_dup_8(32); + v256 p32 = v256_sub_8(pattern, c32); + v256 r1 = _mm256_blendv_epi8( + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32), + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32), + _mm256_cmpgt_epi8(v256_dup_8(48), pattern)); + v256 r2 = _mm256_blendv_epi8( + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern), + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern), + _mm256_cmpgt_epi8(v256_dup_8(16), pattern)); + return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, 
pattern)); +} + +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return _mm256_shuffle_epi8(a, pattern); +} + +SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { + v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b)); + v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b)); + t1 = _mm256_add_epi32(t1, t2); + v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0), + _mm256_extracti128_si256(t1, 1)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 8)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 4)); + return (int32_t)v128_low_u32(t); +} + +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + v256 r = _mm256_madd_epi16(a, b); +#if defined(__x86_64__) + v128 t; + r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), + _mm256_cvtepi32_epi64(v256_low_v128(r))); + t = v256_low_v128(_mm256_add_epi64( + r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); + return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); +#else + v128 l = v256_low_v128(r); + v128 h = v256_high_v128(r); + return (int64_t)_mm_cvtsi128_si32(l) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + + (int64_t)_mm_cvtsi128_si32(h) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); +#endif +} + +SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { + v256 r = _mm256_mullo_epi32(a, b); +#if defined(__x86_64__) + v128 t; + r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), + _mm256_cvtepi32_epi64(v256_low_v128(r))); + t = v256_low_v128(_mm256_add_epi64( + r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); + return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); +#else + v128 l = v256_low_v128(r); + v128 h = v256_high_v128(r); + return (int64_t)_mm_cvtsi128_si32(l) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + + (int64_t)_mm_cvtsi128_si32(h) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); +#endif +} + +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { + v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256()); + v128 lo = v256_low_v128(t); + v128 hi = v256_high_v128(t); + lo = v128_add_32(lo, hi); + return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo)); +} + +typedef v256 sad256_internal; + +SIMD_INLINE sad256_internal v256_sad_u8_init(void) { + return _mm256_setzero_si256(); +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u8_sum(). + The result for more than 32 v256_sad_u8() calls is undefined. */ +SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { + return _mm256_add_epi64(s, _mm256_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { + v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); + return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); +} + +typedef v256 ssd256_internal; + +SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) { + return _mm256_setzero_si256(); +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). 
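+ *
+ * The accumulate/finalise pattern matches v256_sad_u8() above, e.g. (with
+ * hypothetical uint8_t buffers src and ref, n a multiple of 32):
+ *   ssd256_internal acc = v256_ssd_u8_init();
+ *   for (int i = 0; i < n; i += 32)
+ *     acc = v256_ssd_u8(acc, v256_load_unaligned(src + i),
+ *                       v256_load_unaligned(ref + i));
+ *   uint32_t ssd = v256_ssd_u8_sum(acc);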
*/ +SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { + v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()), + _mm256_unpacklo_epi8(b, _mm256_setzero_si256())); + v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(b, _mm256_setzero_si256())); + v256 rl = _mm256_madd_epi16(l, l); + v256 rh = _mm256_madd_epi16(h, h); + v128 c = _mm_cvtsi32_si128(32); + rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8)); + rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4)); + rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8)); + rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4)); + return _mm256_add_epi64( + s, + _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c)); +} + +SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { + v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); + return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); +} + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); } + +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); } + +SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); } + +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); } + +SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) { + v128 lo_bits = v128_mullo_s16(a, b); + v128 hi_bits = v128_mulhi_s16(a, b); + return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), + v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return _mm256_mullo_epi16(a, b); +} + +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return _mm256_mulhi_epi16(a, b); +} + +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return _mm256_mullo_epi32(a, b); +} + +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { + return _mm256_madd_epi16(a, b); +} + +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { + return _mm256_maddubs_epi16(a, b); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); } + +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { + return _mm256_sub_epi8( + _mm256_avg_epu8(a, b), + _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1))); +} + +SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { + return _mm256_sub_epi16( + _mm256_avg_epu16(a, b), + _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1))); +} + +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); } + +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); } + +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); } + +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); } + +SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return _mm256_movemask_epi8(a); } + +SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { + return _mm256_blendv_epi8(a, b, c); +} + +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); } + +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); } + +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); } + +SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); } + +SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); } + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { + return _mm256_cmpgt_epi8(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { + return _mm256_cmpgt_epi8(b, a); +} + +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) 
{ + return _mm256_cmpeq_epi8(a, b); +} + +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return _mm256_cmpgt_epi16(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return _mm256_cmpgt_epi16(b, a); +} + +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { + return _mm256_cmpeq_epi16(a, b); +} + +SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { + return _mm256_cmpgt_epi32(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { + return _mm256_cmpgt_epi32(b, a); +} + +SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { + return _mm256_cmpeq_epi32(a, b); +} + +SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { + return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)), + _mm256_sll_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { + return _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> c)), + _mm256_srl_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { + __m128i x = _mm_cvtsi32_si128(c + 8); + return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x), + _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x)); +} + +SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { + return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { + return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { + return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { + return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { + return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { + return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { + return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { + return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { +#if defined(__AVX512VL__) + return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c)); +#else + return v256_from_v128(v128_shr_s64(v256_high_v128(a), c), + v128_shr_s64(v256_low_v128(a), c)); +#endif +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +// _mm256_slli_si256 works on 128 bit lanes and can't be used +#define v256_shl_n_byte(a, n) \ + ((n) < 16 ? v256_from_v128( \ + v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \ + v128_shl_n_byte(v256_low_v128(a), n)) \ + : _mm256_inserti128_si256( \ + _mm256_setzero_si256(), \ + v128_shl_n_byte(v256_low_v128(a), (n)-16), 1)) + +// _mm256_srli_si256 works on 128 bit lanes and can't be used +#define v256_shr_n_byte(a, n) \ + ((n) < 16 \ + ? _mm256_alignr_epi8( \ + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \ + : ((n) == 16 \ + ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3) \ + : _mm256_inserti128_si256( \ + _mm256_setzero_si256(), \ + v128_align(v256_high_v128(a), v256_high_v128(a), n), 0))) + +// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used +#define v256_align(a, b, c) \ + ((c) ? 
v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) + +#define v256_shl_n_8(a, c) \ + _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \ + _mm256_slli_epi16(a, c)) +#define v256_shr_n_u8(a, c) \ + _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c)) +#define v256_shr_n_s8(a, c) \ + _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \ + _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8)) +#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c) +#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c) +#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c) +#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c) +#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c) +#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c) +#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c) +#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c) +#define v256_shr_n_s64(a, c) \ + v256_shr_s64((a), (c)) // _mm256_srai_epi64 broken in gcc? +#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) +#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) + +typedef v256 sad256_internal_u16; + +SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { return v256_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v256_sad_u16_sum(). */ +SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, + v256 b) { +#if defined(__SSE4_1__) + v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b)); +#else + v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)), + v256_xor(b, v256_dup_16(32768))); + t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)), + v256_or(v256_and(a, t), v256_andn(b, t))); +#endif + return v256_add_32( + s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t))); +} + +SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { + v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s)); + return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) + + v128_low_u32(v128_shr_n_byte(t, 8)) + + v128_low_u32(v128_shr_n_byte(t, 12)); +} + +typedef v256 ssd256_internal_s16; + +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { return v256_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_s16_sum(). */ +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, + v256 b) { + v256 d = v256_sub_16(a, b); + d = v256_madd_s16(d, d); + return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()), + _mm256_unpacklo_epi32(d, v256_zero()))); +} + +SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { + v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s)); + return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t)); +} + +#endif + +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v64_intrinsics.h b/libs/libaom/src/aom_dsp/simd/v64_intrinsics.h new file mode 100644 index 000000000..7079949cd --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v64_intrinsics.h @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v64 v64;
+
+SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); }
+SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); }
+SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); }
+SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); }
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+  return c_v64_from_32(x, y);
+}
+SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); }
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+  return c_v64_from_16(a, b, c, d);
+}
+
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+  return c_u32_load_unaligned(p);
+}
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+  return c_u32_load_aligned(p);
+}
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+  c_u32_store_unaligned(p, a);
+}
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+  c_u32_store_aligned(p, a);
+}
+
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+  return c_v64_load_unaligned(p);
+}
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+  return c_v64_load_aligned(p);
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
+  c_v64_store_unaligned(p, a);
+}
+SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
+  c_v64_store_aligned(p, a);
+}
+
+SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
+  return c_v64_align(a, b, c);
+}
+
+SIMD_INLINE v64 v64_zero(void) { return c_v64_zero(); }
+SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
+SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
+SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
+
+SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
+SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); }
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); }
+SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
+SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
+SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
+SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); }
+SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); }
+SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); }
+SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); }
+SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); }
+SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
+SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }
+SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); }
+
+SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); }
+SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); }
+SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); }
+SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); }
+SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b)
{ return c_v64_ziplo_32(a, b); } +SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); } +SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); } +SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); } +SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); } +SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); } +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); } +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); } +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); } +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); } +SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { + return c_v64_pack_s32_s16(a, b); +} +SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) { + return c_v64_pack_s32_u16(a, b); +} +SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { + return c_v64_pack_s16_u8(a, b); +} +SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { + return c_v64_pack_s16_s8(a, b); +} +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { + return c_v64_unpacklo_u16_s32(a); +} +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { + return c_v64_unpacklo_s16_s32(a); +} +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { + return c_v64_unpackhi_u16_s32(a); +} +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { + return c_v64_unpackhi_s16_s32(a); +} +SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) { + return c_v64_shuffle_8(a, pattern); +} + +SIMD_INLINE c_sad64_internal v64_sad_u8_init(void) { + return c_v64_sad_u8_init(); +} +SIMD_INLINE c_sad64_internal v64_sad_u8(c_sad64_internal s, v64 a, v64 b) { + return c_v64_sad_u8(s, a, b); +} +SIMD_INLINE uint32_t v64_sad_u8_sum(c_sad64_internal s) { + return c_v64_sad_u8_sum(s); +} +SIMD_INLINE c_ssd64_internal v64_ssd_u8_init(void) { + return c_v64_ssd_u8_init(); +} +SIMD_INLINE c_ssd64_internal v64_ssd_u8(c_ssd64_internal s, v64 a, v64 b) { + return c_v64_ssd_u8(s, a, b); +} +SIMD_INLINE uint32_t v64_ssd_u8_sum(c_ssd64_internal s) { + return c_v64_ssd_u8_sum(s); +} +SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); } +SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); } +SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); } +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); } + +SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); } +SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); } +SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); } +SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); } + +SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); } +SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); } +SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); } +SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); } +SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); } + +SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); } +SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); } +SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); } +SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); } +SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); } +SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); } +SIMD_INLINE 
v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); } +SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); } +SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); } +SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); } +SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); } +SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); } +SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); } +SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); } +SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); } + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); } +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); } +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); } +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); } +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) { + return c_v64_shr_u16(a, n); +} +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) { + return c_v64_shr_s16(a, n); +} +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); } +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) { + return c_v64_shr_u32(a, n); +} +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) { + return c_v64_shr_s32(a, n); +} +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) { + return c_v64_shr_n_byte(a, n); +} +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) { + return c_v64_shl_n_byte(a, n); +} +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { + return c_v64_shl_n_8(a, c); +} +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { + return c_v64_shr_n_u8(a, c); +} +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { + return c_v64_shr_n_s8(a, c); +} +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { + return c_v64_shl_n_16(a, c); +} +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return c_v64_shr_n_u16(a, c); +} +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return c_v64_shr_n_s16(a, c); +} +SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { + return c_v64_shl_n_32(a, c); +} +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return c_v64_shr_n_u32(a, c); +} +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return c_v64_shr_n_s32(a, c); +} + +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v64_intrinsics_arm.h b/libs/libaom/src/aom_dsp/simd/v64_intrinsics_arm.h new file mode 100644 index 000000000..a4ecdf4b5 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v64_intrinsics_arm.h @@ -0,0 +1,684 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/simd/v64_intrinsics_arm.h"
+#include "aom_ports/arm.h"
+
+#ifdef AOM_INCOMPATIBLE_GCC
+#error Incompatible gcc
+#endif
+
+typedef int64x1_t v64;
+
+SIMD_INLINE uint32_t v64_low_u32(v64 a) {
+  return vget_lane_u32(vreinterpret_u32_s64(a), 0);
+}
+
+SIMD_INLINE uint32_t v64_high_u32(v64 a) {
+  return vget_lane_u32(vreinterpret_u32_s64(a), 1);
+}
+
+SIMD_INLINE int32_t v64_low_s32(v64 a) {
+  return vget_lane_s32(vreinterpret_s32_s64(a), 0);
+}
+
+SIMD_INLINE int32_t v64_high_s32(v64 a) {
+  return vget_lane_s32(vreinterpret_s32_s64(a), 1);
+}
+
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+  return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 |
+                     d);
+}
+
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+  return vcreate_s64((uint64_t)x << 32 | y);
+}
+
+SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
+
+SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
+
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+  return *((uint32_t *)p);
+}
+
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+  return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0);
+}
+
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+  *((uint32_t *)p) = a;
+}
+
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+#if defined(__clang__)
+  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+                0);
+#elif defined(__CC_ARM)
+  *((__packed uint32_t *)p) = a;
+#elif defined(__GNUC__)
+  struct Unaligned32Struct {
+    uint32_t value;
+    uint8_t dummy;  // To make the size non-power-of-two.
+  } __attribute__((__packed__));
+  ((struct Unaligned32Struct *)p)->value = a;
+#else
+  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
+                0);
+#endif
+}
+
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+  return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
+}
+
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+  return v64_load_aligned(p);
+}
+
+SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
+  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
+  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
+}
+
+// The following function requires an immediate.
+// Some compilers will check this if it's optimising, others won't.
+SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+  return c ? vreinterpret_s64_s8(
+                 vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
+           : b;
+#else
+  return c ?
v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8)) + : b; +#endif +} + +SIMD_INLINE v64 v64_zero(void) { return vreinterpret_s64_u8(vdup_n_u8(0)); } + +SIMD_INLINE v64 v64_dup_8(uint8_t x) { + return vreinterpret_s64_u8(vdup_n_u8(x)); +} + +SIMD_INLINE v64 v64_dup_16(uint16_t x) { + return vreinterpret_s64_u16(vdup_n_u16(x)); +} + +SIMD_INLINE v64 v64_dup_32(uint32_t x) { + return vreinterpret_s64_u32(vdup_n_u32(x)); +} + +SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) { + int16x8_t t = + vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)), + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y)))); +#if defined(__aarch64__) + return vaddlvq_s16(t); +#else + int64x2_t r = vpaddlq_s32(vpaddlq_s16(t)); + return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r)); +#endif +} + +SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) { +#if defined(__aarch64__) + return vaddlvq_s32( + vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +#else + int64x2_t r = + vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); + return (int64_t)(vget_high_s64(r) + vget_low_s64(r)); +#endif +} + +SIMD_INLINE uint64_t v64_hadd_u8(v64 x) { +#if defined(__aarch64__) + return vaddlv_u8(vreinterpret_u8_s64(x)); +#else + return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x)))); +#endif +} + +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { + return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a))); +} + +typedef uint16x8_t sad64_internal; + +SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return vdupq_n_u16(0); } + +// Implementation dependent return value. Result must be finalised with +// v64_sad_u8_sum(). +SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { + return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); +} + +SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { +#if defined(__aarch64__) + return vaddlvq_u16(s); +#else + uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s)); + return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r)); +#endif +} + +typedef uint32x4_t ssd64_internal; + +SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return vdupq_n_u32(0); } + +// Implementation dependent return value. Result must be finalised with +// v64_ssd_u8_sum(). 
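+// (vabd_u8 computes the per-byte absolute difference, vmull_u8(t, t)
+// squares it into 16 bits, and vpaddlq_u16 widens adjacent pairs into the
+// 32-bit accumulator, so no per-call intermediate can overflow.)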
+SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { + uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); + return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t))); +} + +SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { +#if defined(__aarch64__) + return vaddvq_u32(s); +#else + uint64x2_t t = vpaddlq_u32(s); + return vget_lane_u32( + vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); +#endif +} + +SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); } + +SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); } + +SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); } + +SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); } + +SIMD_INLINE v64 v64_add_8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_add_16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_add_32(v64 x, v64 y) { + return vreinterpret_s64_u32( + vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y))); +} + +SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) { + return vreinterpret_s64_s32( + vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); +} + +SIMD_INLINE v64 v64_abs_s16(v64 x) { + return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x))); +} + +SIMD_INLINE v64 v64_abs_s8(v64 x) { + return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x))); +} + +SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) { +#if defined(__aarch64__) + int16x8_t t = vreinterpretq_s16_s32( + vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); + return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t))); +#else + return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32( + vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16))); +#endif +} + +SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) { + return vreinterpret_s64_s32( + vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); +} + +SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) { + int32x4_t t = 
vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)); + return vreinterpret_s64_s32( + vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))), + vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t))))); +} + +SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) { + int16x8_t t = + vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))), + vmovl_s8(vreinterpret_s8_s64(y))); + return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t))); +} + +SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); +} + +SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); +} + +SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u8( + vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); +#else + uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[0]); +#endif +} + +SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u8( + vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); +#else + uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[1]); +#endif +} + +SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u16( + vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); +#else + int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); + return vreinterpret_s64_s16(r.val[0]); +#endif +} + +SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u16( + vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); +#else + int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); + return vreinterpret_s64_s16(r.val[1]); +#endif +} + +SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u32( + vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x))); +#else + int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); + return vreinterpret_s64_s32(r.val[0]); +#endif +} + +SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u32( + vzip2_u32(vreinterpret_u32_s64(y), 
vreinterpret_u32_s64(x))); +#else + int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); + return vreinterpret_s64_s32(r.val[1]); +#endif +} + +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { + return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a)))); +} + +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { + return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a)))); +} + +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { + return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a)))); +} + +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { + return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a)))); +} + +SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) { + return vreinterpret_s64_s16(vqmovn_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); +} + +SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) { + return vreinterpret_s64_u16(vqmovun_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); +} + +SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) { + return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); +} + +SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) { + return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); +} + +SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u8( + vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); +#else + uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[0]); +#endif +} + +SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u8( + vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); +#else + uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[1]); +#endif +} + +SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u16( + vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); +#else + uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); + return vreinterpret_s64_u16(r.val[0]); +#endif +} + +SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u16( + vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); +#else + uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); + return vreinterpret_s64_u16(r.val[1]); +#endif +} + +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) { + return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x)))); +} + +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) { + return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x)))); +} + +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) { + return vreinterpret_s64_s32( + vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x)))); +} + +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) { + return vreinterpret_s64_u32( + vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x)))); +} + +SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { + return vreinterpret_s64_u8( + vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern))); +} + +SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vclt_s8(vreinterpret_s8_s64(x), 
vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { + return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c))); +} + +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { + return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c))); +} + +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { + return vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c))); +} + +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { + return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c))); +} + +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { + return vreinterpret_s64_u16( + vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c))); +} + +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { + return vreinterpret_s64_s16( + vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c))); +} + +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { + return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c))); +} + +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { + return vreinterpret_s64_u32( + vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c))); +} + +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { + return vreinterpret_s64_s32( + vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c))); +} + +// The following functions require an immediate. +// Some compilers will check this during optimisation, others wont. +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) + +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { + return vshl_n_s64(a, c * 8); +} + +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { + return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a; +} + +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { + return c ? vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { + return c ? vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { + return c ? vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { + return c ? vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return c ? vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return c ? vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { + return c ? vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return c ? vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c)) : a; +} + +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return c ? 
vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c)) : a; +} + +#else + +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { + return v64_from_64(v64_u64(a) << c * 8); +} + +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { + return v64_from_64(v64_u64(a) >> c * 8); +} + +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); } + +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); } + +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); } + +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); } + +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return v64_shr_u16(a, c); +} + +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return v64_shr_s16(a, c); +} + +SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); } + +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return v64_shr_u32(a, c); +} + +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return v64_shr_s32(a, c); +} + +#endif + +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v64_intrinsics_c.h b/libs/libaom/src/aom_dsp/simd/v64_intrinsics_c.h new file mode 100644 index 000000000..b84f243c4 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v64_intrinsics_c.h @@ -0,0 +1,982 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ + +/* Note: This implements the intrinsics in plain, unoptimised C. + Intended for reference, porting or debugging. 
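+   When SIMD_CHECK is enabled, these reference helpers additionally print an +   error and abort() on contract violations (unaligned loads and stores, +   out-of-range shift amounts and shuffle indices) instead of silently +   diverging from the optimised back ends.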
*/ + +#include <stdio.h> +#include <stdlib.h> + +#include "config/aom_config.h" + +typedef union { + uint8_t u8[8]; + uint16_t u16[4]; + uint32_t u32[2]; + uint64_t u64; + int8_t s8[8]; + int16_t s16[4]; + int32_t s32[2]; + int64_t s64; +} c_v64; + +SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { + return a.u32[!!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) { + return a.u32[!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { + return a.s32[!!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) { + return a.s32[!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) { + c_v64 t; + t.u32[!CONFIG_BIG_ENDIAN] = x; + t.u32[!!CONFIG_BIG_ENDIAN] = y; + return t; +} + +SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) { + c_v64 t; + t.u64 = x; + return t; +} + +SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; } + +SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c, + uint16_t d) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + t.u16[0] = a; + t.u16[1] = b; + t.u16[2] = c; + t.u16[3] = d; + } else { + t.u16[3] = a; + t.u16[2] = b; + t.u16[1] = c; + t.u16[0] = d; + } + return t; +} + +SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) { + uint32_t t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 4; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) { + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&a; + int c; + for (c = 0; c < 4; c++) pp[c] = q[c]; +} + +SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 3) { + fprintf(stderr, "Error: Unaligned u32 load at %p\n", p); + abort(); + } + return c_u32_load_unaligned(p); +} + +SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) { + if (SIMD_CHECK && (uintptr_t)p & 3) { + fprintf(stderr, "Error: Unaligned u32 store at %p\n", p); + abort(); + } + c_u32_store_unaligned(p, a); +} + +SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) { + c_v64 t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 8; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 7) { + fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p); + abort(); + } + return c_v64_load_unaligned(p); +} + +SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) { + uint8_t *q = (uint8_t *)p; + uint8_t *r = (uint8_t *)&a; + int c; + for (c = 0; c < 8; c++) q[c] = r[c]; +} + +SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) { + if (SIMD_CHECK && (uintptr_t)p & 7) { + fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p); + abort(); + } + c_v64_store_unaligned(p, a); +} + +SIMD_INLINE c_v64 c_v64_zero(void) { + c_v64 t; + t.u64 = 0; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) { + c_v64 t; + t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] = + t.u8[7] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) { + c_v64 t; + t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) { + c_v64 t; + t.u32[0] = t.u32[1] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] + b.u8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] + b.u16[c]); + return t; +} + +SIMD_INLINE c_v64
c_v64_sadd_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.u8[c] = (int16_t)a.u8[c] + (int16_t)b.u8[c] > 255 + ? 255 + : (int16_t)a.u8[c] + (int16_t)b.u8[c] < 0 + ? 0 + : (int16_t)a.u8[c] + (int16_t)b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.s8[c] = (int16_t)a.s8[c] + (int16_t)b.s8[c] > 127 + ? 127 + : (int16_t)a.s8[c] + (int16_t)b.s8[c] < -128 + ? -128 + : (int16_t)a.s8[c] + (int16_t)b.s8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767 + ? 32767 + : (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768 + ? -32768 + : (int32_t)a.s16[c] + (int32_t)b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) { + c_v64 t; + t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]); + t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] - b.u8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) { + int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c]; + t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d); + } + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] - b.u16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768 + ? -32768 + : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767 + ? 32767 + : (int32_t)a.s16[c] - (int32_t)b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.u16[c] = + (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) { + c_v64 t; + t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]); + t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.u16[c] = (uint16_t)((int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.u8[c] = (uint8_t)((int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c]); + return t; +} + +SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u8[7] = a.u8[7]; + t.u8[6] = b.u8[7]; + t.u8[5] = a.u8[6]; + t.u8[4] = b.u8[6]; + t.u8[3] = a.u8[5]; + t.u8[2] = b.u8[5]; + t.u8[1] = a.u8[4]; + t.u8[0] = b.u8[4]; + } else { + t.u8[7] = a.u8[3]; + t.u8[6] = b.u8[3]; + t.u8[5] = a.u8[2]; + t.u8[4] = b.u8[2]; + t.u8[3] = a.u8[1]; + t.u8[2] = b.u8[1]; + t.u8[1] = a.u8[0]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u16[3] = a.u16[3]; + t.u16[2] = b.u16[3]; + t.u16[1] = a.u16[2]; + t.u16[0] = b.u16[2]; + } else { + t.u16[3] = a.u16[1]; + t.u16[2] = b.u16[1]; + t.u16[1] = a.u16[0]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u32[1] = a.u32[1]; + t.u32[0] = b.u32[1]; + } else { + t.u32[1] = a.u32[0]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u8[7] = b.u8[7]; + t.u8[6] = b.u8[5]; + t.u8[5] = b.u8[3]; + t.u8[4] = b.u8[1]; + t.u8[3] = a.u8[7]; + t.u8[2] = a.u8[5]; + t.u8[1] = a.u8[3]; + t.u8[0] = a.u8[1]; + } else { + t.u8[7] = a.u8[6]; + t.u8[6] = a.u8[4]; + t.u8[5] = a.u8[2]; + t.u8[4] = a.u8[0]; + t.u8[3] = b.u8[6]; + t.u8[2] = b.u8[4]; + t.u8[1] = b.u8[2]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u16[3] = b.u16[3]; + t.u16[2] = b.u16[1]; + t.u16[1] = a.u16[3]; + t.u16[0] = a.u16[1]; + } else { + t.u16[3] = a.u16[2]; + t.u16[2] = a.u16[0]; + t.u16[1] = b.u16[2]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1) + : _c_v64_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? 
_c_v64_unzip_16(b, a, 0) + : _c_v64_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.u8[3 + endian]; + t.s16[2] = (int16_t)a.u8[2 + endian]; + t.s16[1] = (int16_t)a.u8[1 + endian]; + t.s16[0] = (int16_t)a.u8[0 + endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.u8[7 - endian]; + t.s16[2] = (int16_t)a.u8[6 - endian]; + t.s16[1] = (int16_t)a.u8[5 - endian]; + t.s16[0] = (int16_t)a.u8[4 - endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.s8[3 + endian]; + t.s16[2] = (int16_t)a.s8[2 + endian]; + t.s16[1] = (int16_t)a.s8[1 + endian]; + t.s16[0] = (int16_t)a.s8[0 + endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.s8[7 - endian]; + t.s16[2] = (int16_t)a.s8[6 - endian]; + t.s16[1] = (int16_t)a.s8[5 - endian]; + t.s16[0] = (int16_t)a.s8[4 - endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1]; + t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0]; + t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1]; + t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u16[3] = a.s32[1] > 65535 ? 65535 : a.s32[1] < 0 ? 0 : a.s32[1]; + t.u16[2] = a.s32[0] > 65535 ? 65535 : a.s32[0] < 0 ? 0 : a.s32[0]; + t.u16[1] = b.s32[1] > 65535 ? 65535 : b.s32[1] < 0 ? 0 : b.s32[1]; + t.u16[0] = b.s32[0] > 65535 ? 65535 : b.s32[0] < 0 ? 0 : b.s32[0]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3]; + t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2]; + t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1]; + t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0]; + t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3]; + t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2]; + t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1]; + t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u8[7] = (uint8_t)(a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3]); + t.u8[6] = (uint8_t)(a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2]); + t.u8[5] = (uint8_t)(a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1]); + t.u8[4] = (uint8_t)(a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0]); + t.u8[3] = (uint8_t)(b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3]); + t.u8[2] = (uint8_t)(b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2]); + t.u8[1] = (uint8_t)(b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1]); + t.u8[0] = (uint8_t)(b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 
128 : b.s16[0]); + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) { + if (SIMD_CHECK && (pattern.u8[c] & ~7)) { + fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n", + pattern.u8[c], c); + abort(); + } + t.u8[c] = + a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7]; + } + return t; +} + +SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) { + return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] + + a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] + + a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0]; +} + +SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) { + return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) + + (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]); +} + +SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) { + return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] + + a.u8[0]; +} + +SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) { + return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0]; +} + +typedef struct { + uint32_t val; + int count; +} c_sad64_internal; + +SIMD_INLINE c_sad64_internal c_v64_sad_u8_init(void) { + c_sad64_internal t; + t.val = t.count = 0; + return t; +} + +/* Implementation dependent return value. Result must be finalised with + v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is + undefined. */ +SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a, + c_v64 b) { + int c; + for (c = 0; c < 8; c++) + s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + s.count++; + if (SIMD_CHECK && s.count > 32) { + fprintf(stderr, + "Error: sad called more than 32 times, result is undefined\n"); + abort(); + } + return s; +} + +SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s.val; } + +typedef uint32_t c_ssd64_internal; + +/* Implementation dependent return value. Result must be finalised with + * v64_ssd_u8_sum().
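+   Unlike the SAD accumulator there is no call-count guard here; as a rough +   worked bound (an observation, not a documented limit), each call adds at +   most 8 * 255^2 = 520200, so the uint32_t accumulator cannot overflow +   before roughly 2^32 / 520200 ~= 8256 worst-case calls.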
*/ +SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init(void) { return 0; } + +SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a, + c_v64 b) { + int c; + for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; } + +SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 | b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 ^ b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 & b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 & ~b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16; + return t; +} + +SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) { + c_v64 t; + t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]); + t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) { + c_v64 t; + t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1]; + t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3]; + return t; +} + +SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) { + c_v64 t; + int32_t u; + u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1]; + t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3]; + t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5]; + t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7]; + t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + return t; +} + +SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? 
a.s16[c] : b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined u8 shift left %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.s8[c] = (int8_t)(a.u8[c] << n); + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined u8 shift right %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined s8 shift right %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: Undefined u16 shift left %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] << n); + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: Undefined u16 shift right %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: undefined s16 shift right %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined u32 shift left %d\n", n); + abort(); + } + t.u32[1] = a.u32[1] << n; + t.u32[0] = a.u32[0] << n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined u32 shift right %d\n", n); + abort(); + } + t.u32[1] = a.u32[1] >> n; + t.u32[0] = a.u32[0] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined s32 shift right %d\n", n); + abort(); + } + t.s32[1] = a.s32[1] >> n; + t.s32[0] = a.s32[0] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) { + c_v64 t; + t.u64 = x.u64 >> i * 8; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) { + c_v64 t; + t.u64 = x.u64 << i * 8; + return t; +} + +SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) { + if 
(SIMD_CHECK && c > 7) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b; +} + +SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) { + return c_v64_shl_8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) { + return c_v64_shr_u8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) { + return c_v64_shr_s8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) { + return c_v64_shl_16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) { + return c_v64_shr_u16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) { + return c_v64_shr_s16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) { + return c_v64_shl_32(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) { + return c_v64_shr_u32(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) { + return c_v64_shr_s32(a, c); +} + +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ diff --git a/libs/libaom/src/aom_dsp/simd/v64_intrinsics_x86.h b/libs/libaom/src/aom_dsp/simd/v64_intrinsics_x86.h new file mode 100644 index 000000000..1f273fe96 --- /dev/null +++ b/libs/libaom/src/aom_dsp/simd/v64_intrinsics_x86.h @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ + +#include <emmintrin.h> +#if defined(__SSSE3__) +#include <tmmintrin.h> +#endif +#if defined(__SSE4_1__) +#include <smmintrin.h> +#endif + +typedef __m128i v64; + +SIMD_INLINE uint32_t v64_low_u32(v64 a) { + return (uint32_t)_mm_cvtsi128_si32(a); +} + +SIMD_INLINE uint32_t v64_high_u32(v64 a) { + return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); +} + +SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); } + +SIMD_INLINE int32_t v64_high_s32(v64 a) { + return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); +} + +SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { + return _mm_packs_epi32( + _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d), + _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { + return _mm_set_epi32(0, 0, x, y); +} + +SIMD_INLINE v64 v64_from_64(uint64_t x) { +#ifdef __x86_64__ + return _mm_cvtsi64_si128(x); +#else + return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x); +#endif +} + +SIMD_INLINE uint64_t v64_u64(v64 x) { + return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32); +} + +SIMD_INLINE uint32_t u32_load_aligned(const void *p) { + return *((uint32_t *)p); +} + +SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { + return *((uint32_t *)p); +} + +SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { + *((uint32_t *)p) = a; +} + +SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { + *((uint32_t *)p) = a; +} + +SIMD_INLINE v64 v64_load_aligned(const void *p) { + return _mm_loadl_epi64((__m128i *)p); +} + +SIMD_INLINE v64 v64_load_unaligned(const void *p) { + return _mm_loadl_epi64((__m128i *)p); +} + +SIMD_INLINE void v64_store_aligned(void *p, v64 a) { + _mm_storel_epi64((__m128i *)p, a); +} + +SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { + _mm_storel_epi64((__m128i *)p, a); +} + +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) +#define v64_align(a, b, c) \ + ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b) +#else +#define v64_align(a, b, c) \ + ((c) ?
v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \ + : (b)) +#endif + +SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); } + +SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); } + +SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); } + +SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); } + +SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); } + +SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); } + +SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); } + +SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); } + +SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); } + +SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); } + +SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); } + +SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); } + +SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); } + +SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); } + +SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); } + +SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); } + +SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); } + +SIMD_INLINE v64 v64_abs_s16(v64 a) { +#if defined(__SSSE3__) + return _mm_abs_epi16(a); +#else + return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); +#endif +} + +SIMD_INLINE v64 v64_abs_s8(v64 a) { +#if defined(__SSSE3__) + return _mm_abs_epi8(a); +#else + v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); + return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); +#endif +} + +SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } + +SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8); +} + +SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } + +SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8); +} + +SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } + +SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8); +} + +SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packs_epi32(t, t); +} + +SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) { +#if defined(__SSE4_1__) + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packus_epi32(t, t); +#else + int32_t ah = v64_high_u32(a); + int32_t al = v64_low_u32(a); + int32_t bh = v64_high_u32(b); + int32_t bl = v64_low_u32(b); + return v64_from_16(ah > 65535 ? 65535 : ah < 0 ? 0 : ah, + al > 65535 ? 65535 : al < 0 ? 0 : al, + bh > 65535 ? 65535 : bh < 0 ? 0 : bh, + bl > 65535 ? 65535 : bl < 0 ? 
0 : bl); +#endif +} + +SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packus_epi16(t, t); +} + +SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packs_epi16(t, t); +} + +SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0f0d0b0907050301LL)); +#else + return _mm_packus_epi16( + _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)), + _mm_setzero_si128()); +#endif +} + +SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0e0c0a0806040200LL)); +#else + return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); +#endif +} + +SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0f0e0b0a07060302LL)); +#else + return _mm_packs_epi32( + _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)), + _mm_setzero_si128()); +#endif +} + +SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0d0c090805040100LL)); +#else + return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); +#endif +} + +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { + return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8); +} + +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { + return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8); +} + +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16); +} + +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { + return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8); +} + +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { + return _mm_srli_si128( + _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8); +} + +SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(x, pattern); +#else + v64 output; + unsigned char *input = (unsigned char *)&x; + unsigned char *index = (unsigned char *)&pattern; + char *selected = (char *)&output; + int counter; + + for (counter = 0; counter < 8; counter++) { + selected[counter] = input[index[counter]]; + } + + return output; +#endif +} + +SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { + __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), + _mm_unpacklo_epi8(b, _mm_setzero_si128())); + t = _mm_add_epi32(t, _mm_srli_si128(t, 8)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 4)); + return (int32_t)v64_low_u32(t); +} + +SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { + __m128i r = _mm_madd_epi16(a, b); +#if defined(__SSE4_1__) && defined(__x86_64__) + __m128i x = _mm_cvtepi32_epi64(r); + return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8))); +#else + return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + + (int64_t)_mm_cvtsi128_si32(r); +#endif +} + +SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { + return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128())); +} + +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { + return 
v64_dotp_s16(a, v64_dup_16(1)); +} + +typedef v64 sad64_internal; + +SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + v64_sad_u8_sum(). + The result for more than 32 v64_sad_u8() calls is undefined. */ +SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { + return _mm_add_epi64(s, _mm_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); } + +typedef v64 ssd64_internal; + +SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + * v64_ssd_u8_sum(). */ +SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { + v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b)); + v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b)); + v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h)); + return _mm_add_epi64( + s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4)))); +} + +SIMD_INLINE uint32_t v64_ssd_u8_sum(sad64_internal s) { return v64_low_u32(s); } + +SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); } + +SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); } + +SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); } + +SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); } + +SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); } + +SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); } + +SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_mullo_epi32(a, b); +#else + return _mm_unpacklo_epi32( + _mm_mul_epu32(a, b), + _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4))); +#endif +} + +SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); } + +SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_maddubs_epi16(a, b); +#else + __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)); + return _mm_packs_epi32(t, t); +#endif +} + +SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); } + +SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { + return _mm_sub_epi8(_mm_avg_epu8(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1))); +} + +SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { + return _mm_sub_epi16(_mm_avg_epu16(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1))); +} + +SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); } + +SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); } + +SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); } + +SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#else + v64 mask = _mm_cmplt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#else + v64 mask = _mm_cmplt_epi8(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); } + +SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); } + 
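+// The comparisons here return per-lane masks (all ones where true), so a +// hedged branchless-select sketch, using only helpers from this header, is: +//   v64 m = v64_cmpgt_s16(a, b);                    // lanes where a > b +//   v64 r = v64_or(v64_and(m, a), v64_andn(b, m));  // per-lane max(a, b) +// (Recall v64_andn(x, y) computes x & ~y.) +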
+SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); } + +SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); } + +SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); } + +SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); } + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)), + _mm_sll_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)), + _mm_srl_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { + return _mm_packs_epi16( + _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a); +} + +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { + return _mm_sll_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { + return _mm_srl_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { + return _mm_sra_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { + return _mm_sll_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { + return _mm_srl_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { + return _mm_sra_epi32(a, _mm_cvtsi32_si128(c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c) +#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8) +#define v64_shl_n_8(a, c) \ + _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c)) +#define v64_shr_n_u8(a, c) \ + _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c)) +#define v64_shr_n_s8(a, c) \ + _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a) +#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c) +#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c) +#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c) +#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c) +#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c) +#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c) + +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ diff --git a/libs/libaom/src/aom_dsp/sse.c b/libs/libaom/src/aom_dsp/sse.c new file mode 100644 index 000000000..16f6b58bd --- /dev/null +++ b/libs/libaom/src/aom_dsp/sse.c @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* Sum the difference between every corresponding element of the buffers. 
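+   (More precisely, this is the sum of squared differences: each per-pixel +   difference below is squared before being accumulated.)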
*/ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = abs(a[x] - b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} +#endif diff --git a/libs/libaom/src/aom_dsp/ssim.c b/libs/libaom/src/aom_dsp/ssim.c new file mode 100644 index 000000000..95b88887b --- /dev/null +++ b/libs/libaom/src/aom_dsp/ssim.c @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <math.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/ssim.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" + +void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 16; i++, s += sp, r += rp) { + for (j = 0; j < 16; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +#endif + +static const int64_t cc1 = 26634; // 64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // 64^2*(.03*255)^2 +static const int64_t cc1_10 = 428658; // 64^2*(.01*1023)^2 +static const int64_t cc2_10 = 3857925; // 64^2*(.03*1023)^2 +static const int64_t cc1_12 = 6868593; // 64^2*(.01*4095)^2 +static const int64_t cc2_12 = 61817334; // 64^2*(.03*4095)^2 + +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count, + uint32_t bd) {
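+  // Worked numbers for the 8-bit constants above (a reading aid): +  // cc1 = 4096 * (.01 * 255)^2 = 4096 * 6.5025 ~= 26634 and +  // cc2 = 4096 * (.03 * 255)^2 = 4096 * 58.5225 ~= 239708, where +  // 4096 = 64^2 = 1 << 12. The >> 12 below then leaves +  // c1 = count^2 * (.01 * max)^2, i.e. the constant for the sum (rather +  // than mean) form of SSIM over a count-pixel window.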
int64_t ssim_n, ssim_d; + int64_t c1, c2; + if (bd == 8) { + // scale the constants by number of pixels + c1 = (cc1 * count * count) >> 12; + c2 = (cc2 * count * count) >> 12; + } else if (bd == 10) { + c1 = (cc1_10 * count * count) >> 12; + c2 = (cc2_10 * count * count) >> 12; + } else if (bd == 12) { + c1 = (cc1_12 * count * count) >> 12; + c2 = (cc2_12 * count * count) >> 12; + } else { + c1 = c2 = 0; + assert(0); + } + + ssim_n = (2 * sum_s * sum_r + c1) * + ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + + ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * + ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + + (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + + return ssim_n * 1.0 / ssim_d; +} + +static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); +} + +static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t bd, uint32_t shift) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. +static double aom_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height, uint32_t bd, uint32_t shift) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +double aom_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight) { + double abc[3]; + for (int i = 0; i < 3; ++i) { + const int is_uv = i > 0; + abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i], + source->strides[is_uv], dest->strides[is_uv], + source->crop_widths[is_uv], source->crop_heights[is_uv]); + } + + *weight = 1; + return abc[0] * .8 + .1 * (abc[1] + abc[2]); +} + +// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity +// +// Re working out the math -> +// +// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / +// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) +// +// mean(x) = sum(x) / n +// +// cov(x,y) = 
+//
+// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
+//
+// ssim(x,y) =
+//   (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
+//   (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
+//    ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
+//     (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))
+//
+// factoring out n*n
+//
+// ssim(x,y) =
+//   (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
+//   (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
+//    (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
+//
+// Replace c1 with n*n * c1 for the final step that leads to this code:
+// The final step scales by 12 bits so we don't lose precision in the
+// constants.
+
+static double ssimv_similarity(const Ssimv *sv, int64_t n) {
+  // Scale the constants by number of pixels.
+  const int64_t c1 = (cc1 * n * n) >> 12;
+  const int64_t c2 = (cc2 * n * n) >> 12;
+
+  const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
+                   (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+
+  // Since these variables are unsigned sums, convert to double so
+  // math is done in double arithmetic.
+  const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+                   (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+                    n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+  return l * v;
+}
+
+// The first term of the ssim metric is a luminance factor.
+//
+// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1)
+//
+// This luminance factor is super sensitive to the dark side of luminance
+// values and completely insensitive on the white side. Check out two sets,
+// (1,3) and (250,252): the term gives 2*1*3/(1+9) = .60 and
+// 2*250*252/(250^2+252^2) => .99999997.
+//
+// As a result, this tweaked version of the calculation takes the luminance
+// as a percentage off from the peak possible:
+//
+//   255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count
+//
+static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
+  // Scale the constants by number of pixels.
+  const int64_t c1 = (cc1 * n * n) >> 12;
+  const int64_t c2 = (cc2 * n * n) >> 12;
+
+  const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
+  const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
+
+  // Since these variables are unsigned sums, convert to double so
+  // math is done in double arithmetic.
+  const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+                   (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+                    n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+  return l * v;
+}
+static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
+                        int img2_pitch, Ssimv *sv) {
+  aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r,
+                     &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr);
+}
+
+double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+                            int img2_pitch, int width, int height, Ssimv *sv2,
+                            Metrics *m, int do_inconsistency) {
+  double dssim_total = 0;
+  double ssim_total = 0;
+  double ssim2_total = 0;
+  double inconsistency_total = 0;
+  int i, j;
+  int c = 0;
+  double norm;
+  double old_ssim_total = 0;
+  aom_clear_system_state();
+  // We can sample points as frequently as we like; start with 1 per 4x4.
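+  // Editor's note (sketch of the sampling): unlike aom_ssim2() above, this
+  // loop visits every 4x4 offset, including windows that would cross the
+  // right/bottom edge; those edge windows keep an all-zero Ssimv. The total
+  // window count is c = (width / 4) * (height / 4); e.g. a 64x64 plane
+  // yields 16 * 16 = 256 samples, which is also what norm below divides by.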
+  for (i = 0; i < height;
+       i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+    for (j = 0; j < width; j += 4, ++c) {
+      Ssimv sv = { 0, 0, 0, 0, 0, 0 };
+      double ssim;
+      double ssim2;
+      double dssim;
+      uint32_t var_new;
+      uint32_t var_old;
+      uint32_t mean_new;
+      uint32_t mean_old;
+      double ssim_new;
+      double ssim_old;
+
+      // Not sure there's a great way to handle the edge pixels
+      // in ssim when using a window. Seems biased against edge pixels
+      // however you handle this. This uses only samples that are
+      // fully in the frame.
+      if (j + 8 <= width && i + 8 <= height) {
+        ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
+      }
+
+      ssim = ssimv_similarity(&sv, 64);
+      ssim2 = ssimv_similarity2(&sv, 64);
+
+      sv.ssim = ssim2;
+
+      // dssim is calculated to use as an actual error metric and
+      // is scaled up to the same range as sum square error.
+      // Since we are subsampling every 16th point, maybe this should be *16?
+      dssim = 255 * 255 * (1 - ssim2) / 2;
+
+      // Here I introduce a new error metric: consistency-weighted
+      // SSIM-inconsistency. This metric isolates frames where the
+      // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
+      // sharper or blurrier than the others. Higher values indicate a
+      // temporally inconsistent SSIM. There are two ideas at work:
+      //
+      // 1) 'SSIM-inconsistency': the total inconsistency value
+      // reflects how much SSIM values are changing between this
+      // source / reference frame pair and the previous pair.
+      //
+      // 2) 'consistency-weighted': weights de-emphasize areas in the
+      // frame where the scene content has changed. Changes in scene
+      // content are detected via changes in local variance and local
+      // mean.
+      //
+      // Thus the overall measure reflects how inconsistent the SSIM
+      // values are, over consistent regions of the frame.
+      //
+      // The metric has three terms:
+      //
+      // term 1 -> uses change in scene Variance to weight error score
+      //   2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
+      //   larger changes from one frame to the next mean we care
+      //   less about consistency.
+      //
+      // term 2 -> uses change in local scene luminance to weight error
+      //   2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
+      //   larger changes from one frame to the next mean we care
+      //   less about consistency.
+      //
+      // term 3 -> measures inconsistency in ssim scores between frames
+      //   1 - (2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+ssim(Fi-1)^2)).
+      //
+      // This term compares the ssim score for the same location in 2
+      // subsequent frames.
+      var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
+      var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
+      mean_new = sv.sum_s;
+      mean_old = sv2[c].sum_s;
+      ssim_new = sv.ssim;
+      ssim_old = sv2[c].ssim;
+
+      if (do_inconsistency) {
+        // We do the metric once for every 4x4 block in the image. Since
+        // we are scaling the error to SSE for use in a psnr calculation
+        // 1.0 = 4x4x255x255, the worst error we can possibly have.
+        static const double kScaling = 4. * 4 * 255 * 255;
+
+        // The constants have to be non-zero to avoid potential divide by
+        // zero issues; other than that, they provide a kind of weighting
+        // between the terms. No testing of what the right terms should be
+        // has been done.
+        static const double c1 = 1, c2 = 1, c3 = 1;
+
+        // This measures how much consistent variance is in two consecutive
+        // source frames. 1.0 means they have exactly the same variance.
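+        // Editor's note (numeric check, using the shown formula with c1 = 1):
+        // var_old == var_new == v gives (2*v*v + 1) / (v*v + v*v + 1) = 1.0,
+        // while var_old = 0, var_new = 100 gives 1 / 10001, i.e. nearly 0.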
+        const double variance_term =
+            (2.0 * var_old * var_new + c1) /
+            (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
+
+        // This measures how consistent the local means are between two
+        // consecutive frames. 1.0 means they have exactly the same mean.
+        const double mean_term =
+            (2.0 * mean_old * mean_new + c2) /
+            (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
+
+        // This measures how consistent the ssims of two
+        // consecutive frames are. 1.0 means they are exactly the same.
+        double ssim_term =
+            pow((2.0 * ssim_old * ssim_new + c3) /
+                    (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+                5);
+
+        double this_inconsistency;
+
+        // Floating point math sometimes makes this > 1 by a tiny bit.
+        // We want the metric to scale between 0 and 1.0 so we can convert
+        // it to an snr scaled value.
+        if (ssim_term > 1) ssim_term = 1;
+
+        // This converts the consistency metric to an inconsistency metric
+        // (so we can scale it like psnr) to something like sum square error.
+        // The reason for the variance and mean terms is the assumption that
+        // if there are big changes in the source we should penalize
+        // inconsistency in ssim scores a bit less, as it will be less
+        // visible to the user.
+        this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
+
+        this_inconsistency *= kScaling;
+        inconsistency_total += this_inconsistency;
+      }
+      sv2[c] = sv;
+      ssim_total += ssim;
+      ssim2_total += ssim2;
+      dssim_total += dssim;
+
+      old_ssim_total += ssim_old;
+    }
+    old_ssim_total += 0;
+  }
+
+  norm = 1. / (width / 4) / (height / 4);
+  ssim_total *= norm;
+  ssim2_total *= norm;
+  m->ssim2 = ssim2_total;
+  m->ssim = ssim_total;
+  if (old_ssim_total == 0) inconsistency_total = 0;
+
+  m->ssimc = inconsistency_total;
+
+  m->dssim = dssim_total;
+  return inconsistency_total;
+}
+
+double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                            const YV12_BUFFER_CONFIG *dest, double *weight,
+                            uint32_t bd, uint32_t in_bd) {
+  assert(bd >= in_bd);
+  const uint32_t shift = bd - in_bd;
+
+  double abc[3];
+  for (int i = 0; i < 3; ++i) {
+    const int is_uv = i > 0;
+    abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i],
+                              source->strides[is_uv], dest->strides[is_uv],
+                              source->crop_widths[is_uv],
+                              source->crop_heights[is_uv], in_bd, shift);
+  }
+
+  *weight = 1;
+  return abc[0] * .8 + .1 * (abc[1] + abc[2]);
+}
diff --git a/libs/libaom/src/aom_dsp/ssim.h b/libs/libaom/src/aom_dsp/ssim.h
new file mode 100644
index 000000000..55038f4c2
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/ssim.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SSIM_H_
+#define AOM_AOM_DSP_SSIM_H_
+
+#define MAX_SSIM_DB 100.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+
+// metrics used for calculating ssim, ssim2, dssim, and ssimc
+typedef struct {
+  // source sum (over 8x8 region)
+  uint32_t sum_s;
+
+  // reference sum (over 8x8 region)
+  uint32_t sum_r;
+
+  // source sum squared (over 8x8 region)
+  uint32_t sum_sq_s;
+
+  // reference sum squared (over 8x8 region)
+  uint32_t sum_sq_r;
+
+  // sum of source times reference (over 8x8 region)
+  uint32_t sum_sxr;
+
+  // calculated ssim score between source and reference
+  double ssim;
+} Ssimv;
+
+// metrics collected on a frame basis
+typedef struct {
+  // ssim consistency error metric (see code for explanation)
+  double ssimc;
+
+  // standard ssim
+  double ssim;
+
+  // revised ssim (see code for explanation)
+  double ssim2;
+
+  // ssim restated as an error metric like sse
+  double dssim;
+
+  // dssim converted to decibels
+  double dssimd;
+
+  // ssimc converted to decibels
+  double ssimcd;
+} Metrics;
+
+double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+                            int img2_pitch, int width, int height, Ssimv *sv2,
+                            Metrics *m, int do_inconsistency);
+
+double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                     const YV12_BUFFER_CONFIG *dest, double *weight);
+
+double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+                         double *ssim_u, double *ssim_v, uint32_t bd,
+                         uint32_t in_bd);
+
+double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                            const YV12_BUFFER_CONFIG *dest, double *weight,
+                            uint32_t bd, uint32_t in_bd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_SSIM_H_
diff --git a/libs/libaom/src/aom_dsp/subtract.c b/libs/libaom/src/aom_dsp/subtract.c
new file mode 100644
index 000000000..4f4e35597
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/subtract.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+void aom_subtract_block_c(int rows, int cols, int16_t *diff,
+                          ptrdiff_t diff_stride, const uint8_t *src,
+                          ptrdiff_t src_stride, const uint8_t *pred,
+                          ptrdiff_t pred_stride) {
+  int r, c;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src += src_stride;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
+                                 ptrdiff_t diff_stride, const uint8_t *src8,
+                                 ptrdiff_t src_stride, const uint8_t *pred8,
+                                 ptrdiff_t pred_stride, int bd) {
+  int r, c;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  (void)bd;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src += src_stride;
+  }
+}
+#endif
diff --git a/libs/libaom/src/aom_dsp/sum_squares.c b/libs/libaom/src/aom_dsp/sum_squares.c
new file mode 100644
index 000000000..d739a6083
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/sum_squares.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width,
+                                  int height) {
+  int r, c;
+  uint64_t ss = 0;
+
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      const int16_t v = src[c];
+      ss += v * v;
+    }
+    src += src_stride;
+  }
+
+  return ss;
+}
+
+uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
+  uint64_t ss = 0;
+  do {
+    const int16_t v = *src++;
+    ss += v * v;
+  } while (--n);
+
+  return ss;
+}
+
+uint64_t aom_var_2d_u8_c(uint8_t *src, int src_stride, int width, int height) {
+  int r, c;
+  uint64_t ss = 0, s = 0;
+
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      const uint8_t v = src[c];
+      ss += v * v;
+      s += v;
+    }
+    src += src_stride;
+  }
+
+  return (ss - s * s / (width * height));
+}
+
+uint64_t aom_var_2d_u16_c(uint8_t *src, int src_stride, int width, int height) {
+  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+  int r, c;
+  uint64_t ss = 0, s = 0;
+
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      const uint16_t v = srcp[c];
+      ss += v * v;
+      s += v;
+    }
+    srcp += src_stride;
+  }
+
+  return (ss - s * s / (width * height));
+}
diff --git a/libs/libaom/src/aom_dsp/txfm_common.h b/libs/libaom/src/aom_dsp/txfm_common.h
new file mode 100644
index 000000000..f13d69092
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/txfm_common.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_TXFM_COMMON_H_ +#define AOM_AOM_DSP_TXFM_COMMON_H_ + +#include "aom_dsp/aom_dsp_common.h" +#include "av1/common/enums.h" + +// Constants and Macros used by all idct/dct functions +#define DCT_CONST_BITS 14 +#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) + +#define UNIT_QUANT_SHIFT 2 +#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) + +typedef struct txfm_param { + // for both forward and inverse transforms + TX_TYPE tx_type; + TX_SIZE tx_size; + int lossless; + int bd; + // are the pixel buffers octets or shorts? This should collapse to + // bd==8 implies !is_hbd, but that's not certain right now. + int is_hbd; + TxSetType tx_set_type; + // for inverse transforms only + int eob; +} TxfmParam; + +// Constants: +// for (int i = 1; i< 32; ++i) +// printf("static const int cospi_%d_64 = %.0f;\n", i, +// round(16384 * cos(i*PI/64))); +// Note: sin(k*Pi/64) = cos((32-k)*Pi/64) +static const tran_high_t cospi_1_64 = 16364; +static const tran_high_t cospi_2_64 = 16305; +static const tran_high_t cospi_3_64 = 16207; +static const tran_high_t cospi_4_64 = 16069; +static const tran_high_t cospi_5_64 = 15893; +static const tran_high_t cospi_6_64 = 15679; +static const tran_high_t cospi_7_64 = 15426; +static const tran_high_t cospi_8_64 = 15137; +static const tran_high_t cospi_9_64 = 14811; +static const tran_high_t cospi_10_64 = 14449; +static const tran_high_t cospi_11_64 = 14053; +static const tran_high_t cospi_12_64 = 13623; +static const tran_high_t cospi_13_64 = 13160; +static const tran_high_t cospi_14_64 = 12665; +static const tran_high_t cospi_15_64 = 12140; +static const tran_high_t cospi_16_64 = 11585; +static const tran_high_t cospi_17_64 = 11003; +static const tran_high_t cospi_18_64 = 10394; +static const tran_high_t cospi_19_64 = 9760; +static const tran_high_t cospi_20_64 = 9102; +static const tran_high_t cospi_21_64 = 8423; +static const tran_high_t cospi_22_64 = 7723; +static const tran_high_t cospi_23_64 = 7005; +static const tran_high_t cospi_24_64 = 6270; +static const tran_high_t cospi_25_64 = 5520; +static const tran_high_t cospi_26_64 = 4756; +static const tran_high_t cospi_27_64 = 3981; +static const tran_high_t cospi_28_64 = 3196; +static const tran_high_t cospi_29_64 = 2404; +static const tran_high_t cospi_30_64 = 1606; +static const tran_high_t cospi_31_64 = 804; + +// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 +static const tran_high_t sinpi_1_9 = 5283; +static const tran_high_t sinpi_2_9 = 9929; +static const tran_high_t sinpi_3_9 = 13377; +static const tran_high_t sinpi_4_9 = 15212; + +// 16384 * sqrt(2) +static const tran_high_t Sqrt2 = 23170; +static const tran_high_t InvSqrt2 = 11585; + +static INLINE tran_high_t fdct_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return rv; +} + +#endif // AOM_AOM_DSP_TXFM_COMMON_H_ diff --git a/libs/libaom/src/aom_dsp/variance.c b/libs/libaom/src/aom_dsp/variance.c new file mode 100644 index 000000000..695f12a52 --- /dev/null +++ b/libs/libaom/src/aom_dsp/variance.c @@ -0,0 +1,1483 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/variance.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
+
+uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
+                            int b_stride) {
+  int distortion = 0;
+  int r, c;
+
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < 4; ++c) {
+      int diff = a[c] - b[c];
+      distortion += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  return distortion;
+}
+
+uint32_t aom_get_mb_ss_c(const int16_t *a) {
+  unsigned int i, sum = 0;
+
+  for (i = 0; i < 256; ++i) {
+    sum += a[i] * a[i];
+  }
+
+  return sum;
+}
+
+static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
+                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
+  int i, j;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+                          int b_stride, int w, int h) {
+  uint32_t sse;
+  int sum;
+  variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
+  return sse;
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to
+// implement the first pass of a 2-D separable filter.
+//
+// Produces int16_t output to retain precision for the next pass. Two filter
+// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
+// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
+// It defines the offset required to move from one input to the next.
+void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
+                                             unsigned int src_pixels_per_line,
+                                             unsigned int pixel_step,
+                                             unsigned int output_height,
+                                             unsigned int output_width,
+                                             const uint8_t *filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      b[j] = ROUND_POWER_OF_TWO(
+          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+
+      ++a;
+    }
+
+    a += src_pixels_per_line - output_width;
+    b += output_width;
+  }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to
+// implement the second pass of a 2-D separable filter.
+//
+// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride). It defines the offset required to move from one input
+// to the next. Output is 8-bit.
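+// Editor's note (worked example, assuming the library's 2-tap kernels where
+// the taps sum to 128 and FILTER_BITS == 7, e.g. bilinear_filters_2t[4] =
+// { 64, 64 }): for a[0] = 100 and a[pixel_step] = 200,
+//   ROUND_POWER_OF_TWO(100 * 64 + 200 * 64, 7) = (19200 + 64) >> 7 = 150,
+// i.e. the half-pel sample rounds to the midpoint of its two inputs.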
+void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
+                                              unsigned int src_pixels_per_line,
+                                              unsigned int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const uint8_t *filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      b[j] = ROUND_POWER_OF_TWO(
+          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+      ++a;
+    }
+
+    a += src_pixels_per_line - output_width;
+    b += output_width;
+  }
+}
+
+#define VAR(W, H) \
+  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                     const uint8_t *b, int b_stride, \
+                                     uint32_t *sse) { \
+    int sum; \
+    variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+  }
+
+#define SUBPIX_VAR(W, H) \
+  uint32_t aom_sub_pixel_variance##W##x##H##_c( \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+      const uint8_t *b, int b_stride, uint32_t *sse) { \
+    uint16_t fdata3[(H + 1) * W]; \
+    uint8_t temp2[H * W]; \
+  \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+                                            bilinear_filters_2t[xoffset]); \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+                                             bilinear_filters_2t[yoffset]); \
+  \
+    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
+  }
+
+#define SUBPIX_AVG_VAR(W, H) \
+  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+      const uint8_t *b, int b_stride, uint32_t *sse, \
+      const uint8_t *second_pred) { \
+    uint16_t fdata3[(H + 1) * W]; \
+    uint8_t temp2[H * W]; \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+  \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+                                            bilinear_filters_2t[xoffset]); \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+                                             bilinear_filters_2t[yoffset]); \
+  \
+    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+  \
+    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+  } \
+  uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+      const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+      const uint8_t *b, int b_stride, uint32_t *sse, \
+      const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+    uint16_t fdata3[(H + 1) * W]; \
+    uint8_t temp2[H * W]; \
+    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+  \
+    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+                                            bilinear_filters_2t[xoffset]); \
+    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+                                             bilinear_filters_2t[yoffset]); \
+  \
+    aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
+  \
+    return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
+  }
+
+/* Identical to the variance call except it takes an additional parameter, sum,
+ * and returns that value using pass-by-reference instead of returning
+ * sse - sum^2 / w*h
+ */
+#define GET_VAR(W, H) \
+  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
+                               const uint8_t *b, int b_stride, uint32_t *sse, \
+                               int *sum) { \
+    variance(a, a_stride, b, b_stride, W, H, sse, sum); \
+  }
+
+/* Identical to the variance call except it does not calculate the
+ * sse - sum^2 / w*h and returns sse in addition to modifying the passed-in
+ * variable.
+ */ +#define MSE(W, H) \ + uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse; \ + } + +/* All three forms of the variance are available in the same sizes. */ +#define VARIANCES(W, H) \ + VAR(W, H) \ + SUBPIX_VAR(W, H) \ + SUBPIX_AVG_VAR(W, H) + +VARIANCES(128, 128) +VARIANCES(128, 64) +VARIANCES(64, 128) +VARIANCES(64, 64) +VARIANCES(64, 32) +VARIANCES(32, 64) +VARIANCES(32, 32) +VARIANCES(32, 16) +VARIANCES(16, 32) +VARIANCES(16, 16) +VARIANCES(16, 8) +VARIANCES(8, 16) +VARIANCES(8, 8) +VARIANCES(8, 4) +VARIANCES(4, 8) +VARIANCES(4, 4) +VARIANCES(4, 2) +VARIANCES(2, 4) +VARIANCES(2, 2) +VARIANCES(4, 16) +VARIANCES(16, 4) +VARIANCES(8, 32) +VARIANCES(32, 8) +VARIANCES(16, 64) +VARIANCES(64, 16) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) + +void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +// Get pred block from up-sampled reference. +void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, int width, int height, + int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? 
dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + + if (!subpel_x_q3 && !subpel_y_q3) { + for (int i = 0; i < height; i++) { + memcpy(comp_pred, ref, width * sizeof(*comp_pred)); + comp_pred += width; + ref += ref_stride; + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL, + -1, width, height); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel, + 16, width, height); + } else { + DECLARE_ALIGNED(16, uint8_t, + temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, + width, intermediate_height); + aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), + MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, + width, height); + } +} + +void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + int i, j; + + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1); + } + comp_pred += width; + pred += width; + } +} + +void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + int tmp = pred[j] * bck_offset + ref[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint8_t)tmp; + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +void aom_dist_wtd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { + int i, j; + const int 
fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + + aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint8_t)tmp; + } + comp_pred += width; + pred += width; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t tsum = 0; + uint64_t tsse = 0; + for (int i = 0; i < h; ++i) { + int32_t lsum = 0; + for (int j = 0; j < w; ++j) { + const int diff = a[j] - b[j]; + lsum += diff; + tsse += (uint32_t)(diff * diff); + } + tsum += lsum; + a += a_stride; + b += b_stride; + } + *sum = tsum; + *sse = tsse; +} + +uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int w, int h) { + uint64_t sse; + int64_t sum; + highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum); + return sse; +} + +static void highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (uint32_t)sse_long; + *sum = (int)sum_long; +} + +static void highbd_10_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static void highbd_12_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ + uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +#define HIGHBD_GET_VAR(S) \ + void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } \ + \ + void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } \ + \ + void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } + +#define HIGHBD_MSE(W, H) \ + uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } + +void aom_highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + + ++src_ptr; + } + + // Next row... 
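+    // Editor's note: the inner loop's ++src_ptr has already advanced the
+    // pointer by output_width, so adding (src_pixels_per_line - output_width)
+    // lands exactly at the start of the next source row; e.g. with a stride
+    // of 32 and an 8-wide block, this adds 24.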
+ src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +void aom_highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; + } + + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } + +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t 
*second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \ + W, H, CONVERT_TO_BYTEPTR(temp2), W, \ + jcp_param); \ + \ + return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \ + W, H, CONVERT_TO_BYTEPTR(temp2), W, \ + jcp_param); \ + \ + return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + 
aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \ + W, H, CONVERT_TO_BYTEPTR(temp2), W, \ + jcp_param); \ + \ + return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } + +/* All three forms of the variance are available in the same sizes. */ +#define HIGHBD_VARIANCES(W, H) \ + HIGHBD_VAR(W, H) \ + HIGHBD_SUBPIX_VAR(W, H) \ + HIGHBD_SUBPIX_AVG_VAR(W, H) + +HIGHBD_VARIANCES(128, 128) +HIGHBD_VARIANCES(128, 64) +HIGHBD_VARIANCES(64, 128) +HIGHBD_VARIANCES(64, 64) +HIGHBD_VARIANCES(64, 32) +HIGHBD_VARIANCES(32, 64) +HIGHBD_VARIANCES(32, 32) +HIGHBD_VARIANCES(32, 16) +HIGHBD_VARIANCES(16, 32) +HIGHBD_VARIANCES(16, 16) +HIGHBD_VARIANCES(16, 8) +HIGHBD_VARIANCES(8, 16) +HIGHBD_VARIANCES(8, 8) +HIGHBD_VARIANCES(8, 4) +HIGHBD_VARIANCES(4, 8) +HIGHBD_VARIANCES(4, 4) +HIGHBD_VARIANCES(4, 2) +HIGHBD_VARIANCES(2, 4) +HIGHBD_VARIANCES(2, 2) +HIGHBD_VARIANCES(4, 16) +HIGHBD_VARIANCES(16, 4) +HIGHBD_VARIANCES(8, 32) +HIGHBD_VARIANCES(32, 8) +HIGHBD_VARIANCES(16, 64) +HIGHBD_VARIANCES(64, 16) + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) + +void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride) { + int i, j; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, + const struct AV1Common *const cm, int mi_row, + int mi_col, const MV *const mv, + uint8_t *comp_pred8, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? 
dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred8, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + + if (!subpel_x_q3 && !subpel_y_q3) { + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + for (int i = 0; i < height; i++) { + memcpy(comp_pred, ref, width * sizeof(*comp_pred)); + comp_pred += width; + ref += ref_stride; + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel, + 16, NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height, bd); + aom_highbd_convolve8_vert_c( + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), + MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, + bd); + } +} + +void aom_highbd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, int subpel_search) { + int i, j; + + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1); + } + comp_pred += width; + pred += width; + } +} + +void aom_highbd_dist_wtd_comp_avg_pred_c( + uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, + const uint8_t *ref8, int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + int tmp = pred[j] * bck_offset + ref[j] * fwd_offset; + tmp = 
ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint16_t)tmp; + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, + int subpel_search) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, + ref_stride, bd, subpel_search); + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint16_t)tmp; + } + comp_pred += width; + pred += width; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const uint8_t *mask, int mask_stride, + int invert_mask) { + int i, j; + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]); + } + comp_pred += width; + src0 += stride0; + src1 += stride1; + mask += mask_stride; + } +} + +void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask, + int subpel_search) { + if (subpel_x_q3 | subpel_y_q3) { + aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + ref = comp_pred; + ref_stride = width; + } + aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); +} + +#define MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ + invert_mask); \ + return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ + } + +MASK_SUBPIX_VAR(4, 4) +MASK_SUBPIX_VAR(4, 8) +MASK_SUBPIX_VAR(8, 4) +MASK_SUBPIX_VAR(8, 8) +MASK_SUBPIX_VAR(8, 16) +MASK_SUBPIX_VAR(16, 8) +MASK_SUBPIX_VAR(16, 16) +MASK_SUBPIX_VAR(16, 32) +MASK_SUBPIX_VAR(32, 16) 
+MASK_SUBPIX_VAR(32, 32) +MASK_SUBPIX_VAR(32, 64) +MASK_SUBPIX_VAR(64, 32) +MASK_SUBPIX_VAR(64, 64) +MASK_SUBPIX_VAR(64, 128) +MASK_SUBPIX_VAR(128, 64) +MASK_SUBPIX_VAR(128, 128) +MASK_SUBPIX_VAR(4, 16) +MASK_SUBPIX_VAR(16, 4) +MASK_SUBPIX_VAR(8, 32) +MASK_SUBPIX_VAR(32, 8) +MASK_SUBPIX_VAR(16, 64) +MASK_SUBPIX_VAR(64, 16) + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i, j; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + if (!invert_mask) + comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]); + else + comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]); + } + comp_pred += width; + pred += width; + ref += ref_stride; + mask += mask_stride; + } +} + +void aom_highbd_comp_mask_upsampled_pred( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width, + mask, mask_stride, invert_mask); +} + +#define HIGHBD_MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ + invert_mask); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref, ref_stride, sse); \ + } \ + \ + unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ + invert_mask); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref, ref_stride, sse); \ + } \ + \ + unsigned int 
aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ + invert_mask); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref, ref_stride, sse); \ + } + +HIGHBD_MASK_SUBPIX_VAR(4, 4) +HIGHBD_MASK_SUBPIX_VAR(4, 8) +HIGHBD_MASK_SUBPIX_VAR(8, 4) +HIGHBD_MASK_SUBPIX_VAR(8, 8) +HIGHBD_MASK_SUBPIX_VAR(8, 16) +HIGHBD_MASK_SUBPIX_VAR(16, 8) +HIGHBD_MASK_SUBPIX_VAR(16, 16) +HIGHBD_MASK_SUBPIX_VAR(16, 32) +HIGHBD_MASK_SUBPIX_VAR(32, 16) +HIGHBD_MASK_SUBPIX_VAR(32, 32) +HIGHBD_MASK_SUBPIX_VAR(32, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 32) +HIGHBD_MASK_SUBPIX_VAR(64, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 128) +HIGHBD_MASK_SUBPIX_VAR(128, 64) +HIGHBD_MASK_SUBPIX_VAR(128, 128) +HIGHBD_MASK_SUBPIX_VAR(4, 16) +HIGHBD_MASK_SUBPIX_VAR(16, 4) +HIGHBD_MASK_SUBPIX_VAR(8, 32) +HIGHBD_MASK_SUBPIX_VAR(32, 8) +HIGHBD_MASK_SUBPIX_VAR(16, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); + *sum += diff; + *sse += diff * diff; + } + + pre += pre_stride; + wsrc += w; + mask += w; + } +} + +#define OBMC_VAR(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +#define OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \ + W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ + } + +OBMC_VAR(4, 4) +OBMC_SUBPIX_VAR(4, 4) + +OBMC_VAR(4, 8) +OBMC_SUBPIX_VAR(4, 8) + +OBMC_VAR(8, 4) +OBMC_SUBPIX_VAR(8, 4) + +OBMC_VAR(8, 8) +OBMC_SUBPIX_VAR(8, 8) + +OBMC_VAR(8, 16) +OBMC_SUBPIX_VAR(8, 16) + +OBMC_VAR(16, 8) +OBMC_SUBPIX_VAR(16, 8) + +OBMC_VAR(16, 16) +OBMC_SUBPIX_VAR(16, 16) + +OBMC_VAR(16, 32) +OBMC_SUBPIX_VAR(16, 32) + +OBMC_VAR(32, 16) +OBMC_SUBPIX_VAR(32, 16) + +OBMC_VAR(32, 32) +OBMC_SUBPIX_VAR(32, 32) + +OBMC_VAR(32, 64) +OBMC_SUBPIX_VAR(32, 64) + +OBMC_VAR(64, 32) +OBMC_SUBPIX_VAR(64, 32) + +OBMC_VAR(64, 64) +OBMC_SUBPIX_VAR(64, 64) + +OBMC_VAR(64, 
128) +OBMC_SUBPIX_VAR(64, 128) + +OBMC_VAR(128, 64) +OBMC_SUBPIX_VAR(128, 64) + +OBMC_VAR(128, 128) +OBMC_SUBPIX_VAR(128, 128) + +OBMC_VAR(4, 16) +OBMC_SUBPIX_VAR(4, 16) +OBMC_VAR(16, 4) +OBMC_SUBPIX_VAR(16, 4) +OBMC_VAR(8, 32) +OBMC_SUBPIX_VAR(8, 32) +OBMC_VAR(32, 8) +OBMC_SUBPIX_VAR(32, 8) +OBMC_VAR(16, 64) +OBMC_SUBPIX_VAR(16, 64) +OBMC_VAR(64, 16) +OBMC_SUBPIX_VAR(64, 16) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + uint64_t *sse, int64_t *sum) { + int i, j; + uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); + *sum += diff; + *sse += diff * diff; + } + + pre += pre_stride; + wsrc += w; + mask += w; + } +} + +static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); +} + +static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HIGHBD_OBMC_VAR(W, H) \ + unsigned int aom_highbd_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + wsrc, mask, sse); \ + } \ + \ + unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } \ + \ + unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } + +HIGHBD_OBMC_VAR(4, 4) +HIGHBD_OBMC_SUBPIX_VAR(4, 4) + +HIGHBD_OBMC_VAR(4, 8) +HIGHBD_OBMC_SUBPIX_VAR(4, 8) + +HIGHBD_OBMC_VAR(8, 4) +HIGHBD_OBMC_SUBPIX_VAR(8, 4) + +HIGHBD_OBMC_VAR(8, 8) +HIGHBD_OBMC_SUBPIX_VAR(8, 8) + +HIGHBD_OBMC_VAR(8, 16) +HIGHBD_OBMC_SUBPIX_VAR(8, 16) + +HIGHBD_OBMC_VAR(16, 8) +HIGHBD_OBMC_SUBPIX_VAR(16, 8) + +HIGHBD_OBMC_VAR(16, 16) +HIGHBD_OBMC_SUBPIX_VAR(16, 16) + +HIGHBD_OBMC_VAR(16, 32) +HIGHBD_OBMC_SUBPIX_VAR(16, 32) + +HIGHBD_OBMC_VAR(32, 16) +HIGHBD_OBMC_SUBPIX_VAR(32, 16) + +HIGHBD_OBMC_VAR(32, 32) +HIGHBD_OBMC_SUBPIX_VAR(32, 32) + +HIGHBD_OBMC_VAR(32, 64) +HIGHBD_OBMC_SUBPIX_VAR(32, 64) + +HIGHBD_OBMC_VAR(64, 32) +HIGHBD_OBMC_SUBPIX_VAR(64, 32) + +HIGHBD_OBMC_VAR(64, 64) +HIGHBD_OBMC_SUBPIX_VAR(64, 64) + +HIGHBD_OBMC_VAR(64, 128) +HIGHBD_OBMC_SUBPIX_VAR(64, 128) + +HIGHBD_OBMC_VAR(128, 64) +HIGHBD_OBMC_SUBPIX_VAR(128, 64) + +HIGHBD_OBMC_VAR(128, 128) +HIGHBD_OBMC_SUBPIX_VAR(128, 128) + +HIGHBD_OBMC_VAR(4, 16) +HIGHBD_OBMC_SUBPIX_VAR(4, 16) +HIGHBD_OBMC_VAR(16, 4) +HIGHBD_OBMC_SUBPIX_VAR(16, 4) +HIGHBD_OBMC_VAR(8, 32) +HIGHBD_OBMC_SUBPIX_VAR(8, 32) +HIGHBD_OBMC_VAR(32, 8) +HIGHBD_OBMC_SUBPIX_VAR(32, 8) +HIGHBD_OBMC_VAR(16, 64) +HIGHBD_OBMC_SUBPIX_VAR(16, 64) +HIGHBD_OBMC_VAR(64, 16) +HIGHBD_OBMC_SUBPIX_VAR(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/variance.h b/libs/libaom/src/aom_dsp/variance.h new file mode 100644 index 000000000..4550c17b3 --- /dev/null +++ b/libs/libaom/src/aom_dsp/variance.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_VARIANCE_H_ +#define AOM_AOM_DSP_VARIANCE_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define FILTER_BITS 7 +#define FILTER_WEIGHT 128 + +typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); + +typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *second_pred); + +typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, + int b_stride, int n); + +typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *const b_array[], + int b_stride, unsigned int *sad_array); + +typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse); + +typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + unsigned int *sse); + +typedef unsigned int (*aom_subp_avg_variance_fn_t)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, unsigned int *sse, const uint8_t *second_pred); + +typedef unsigned int (*aom_dist_wtd_sad_avg_fn_t)( + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param); + +typedef unsigned int (*aom_dist_wtd_subp_avg_variance_fn_t)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, unsigned int *sse, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param); + +typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + const uint8_t *msk, int msk_stride, + int invert_mask); +typedef unsigned int (*aom_masked_subpixvariance_fn_t)( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, + const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse); + +void aom_highbd_comp_mask_upsampled_pred( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int bd, int subpel_search); + +typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, + const int32_t *wsrc, + const int32_t *msk); +typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred, + int pred_stride, + const int32_t *wsrc, + const int32_t *msk, + unsigned int *sse); +typedef unsigned int (*aom_obmc_subpixvariance_fn_t)( + const uint8_t *pred, int pred_stride, int xoffset, int yoffset, + const int32_t *wsrc, const int32_t *msk, unsigned int *sse); + +typedef struct aom_variance_vtable { + aom_sad_fn_t sdf; + aom_sad_avg_fn_t sdaf; + 
aom_variance_fn_t vf;
+  aom_subpixvariance_fn_t svf;
+  aom_subp_avg_variance_fn_t svaf;
+  aom_sad_multi_d_fn_t sdx4df;
+  aom_masked_sad_fn_t msdf;
+  aom_masked_subpixvariance_fn_t msvf;
+  aom_obmc_sad_fn_t osdf;
+  aom_obmc_variance_fn_t ovf;
+  aom_obmc_subpixvariance_fn_t osvf;
+  aom_dist_wtd_sad_avg_fn_t jsdaf;
+  aom_dist_wtd_subp_avg_variance_fn_t jsvaf;
+} aom_variance_fn_ptr_t;
+
+void aom_highbd_var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr8, uint16_t *output_ptr,
+    unsigned int src_pixels_per_line, int pixel_step,
+    unsigned int output_height, unsigned int output_width,
+    const uint8_t *filter);
+
+void aom_highbd_var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr, uint16_t *output_ptr,
+    unsigned int src_pixels_per_line, unsigned int pixel_step,
+    unsigned int output_height, unsigned int output_width,
+    const uint8_t *filter);
+
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+                          int b_stride, int w, int h);
+
+uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, int w, int h);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_DSP_VARIANCE_H_
diff --git a/libs/libaom/src/aom_dsp/vmaf.c b/libs/libaom/src/aom_dsp/vmaf.c
new file mode 100644
index 000000000..3a012e768
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/vmaf.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <libvmaf.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/vmaf.h"
+#include "aom_ports/system_state.h"
+
+typedef struct FrameData {
+  const YV12_BUFFER_CONFIG *source;
+  const YV12_BUFFER_CONFIG *distorted;
+  int frame_set;
+  int bit_depth;
+} FrameData;
+
+static void vmaf_fatal_error(const char *message) {
+  fprintf(stderr, "Fatal error: %s\n", message);
+  exit(EXIT_FAILURE);
+}
+
+// A callback function used to pass data to VMAF.
+// Returns 0 after reading a frame.
+// Returns 2 when there is no more frame to read.
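[The three-line comment above is the whole contract libvmaf's frame-reading hook has to satisfy: fill the ref/main planes, return 0 while frames remain, and return 2 once the stream is exhausted, with the stride given in bytes. As a minimal sketch of that contract, assuming a hypothetical single-frame GrayFrame holder (illustrative only, not part of this patch):

// Sketch of a conforming libvmaf read-frame callback. "GrayFrame" and
// "read_one_gray_frame" are hypothetical names used only for illustration.
typedef struct GrayFrame {
  const float *pixels;  // width * height luma samples
  int width, height;
  int consumed;         // becomes 1 once the frame has been handed over
} GrayFrame;

static int read_one_gray_frame(float *ref_data, float *main_data,
                               float *temp_data, int stride, void *user_data) {
  GrayFrame *f = (GrayFrame *)user_data;
  (void)temp_data;
  if (f->consumed) return 2;  // no more frames to read
  for (int row = 0; row < f->height; ++row) {
    for (int col = 0; col < f->width; ++col) {
      // The sketch scores a frame against itself; a real caller would fill
      // ref_data and main_data from two different buffers.
      ref_data[col] = main_data[col] = f->pixels[row * f->width + col];
    }
    ref_data += stride / sizeof(*ref_data);  // stride is in bytes
    main_data += stride / sizeof(*main_data);
  }
  f->consumed = 1;
  return 0;  // one frame delivered
}

The callback the patch actually installs, read_frame below, has the same shape for YV12 buffers, plus a scaling path that normalizes high-bit-depth samples to an 8-bit range.]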
+static int read_frame(float *ref_data, float *main_data, float *temp_data,
+                      int stride, void *user_data) {
+  FrameData *frames = (FrameData *)user_data;
+
+  if (!frames->frame_set) {
+    const int width = frames->source->y_width;
+    const int height = frames->source->y_height;
+    assert(width == frames->distorted->y_width);
+    assert(height == frames->distorted->y_height);
+
+    if (frames->bit_depth > 8) {
+      const float scale_factor = 1.0f / (float)(1 << (frames->bit_depth - 8));
+      uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(frames->source->y_buffer);
+      uint16_t *main_ptr = CONVERT_TO_SHORTPTR(frames->distorted->y_buffer);
+
+      for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+          ref_data[col] = scale_factor * (float)ref_ptr[col];
+        }
+        ref_ptr += frames->source->y_stride;
+        ref_data += stride / sizeof(*ref_data);
+      }
+
+      for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+          main_data[col] = scale_factor * (float)main_ptr[col];
+        }
+        main_ptr += frames->distorted->y_stride;
+        main_data += stride / sizeof(*main_data);
+      }
+    } else {
+      uint8_t *ref_ptr = frames->source->y_buffer;
+      uint8_t *main_ptr = frames->distorted->y_buffer;
+
+      for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+          ref_data[col] = (float)ref_ptr[col];
+        }
+        ref_ptr += frames->source->y_stride;
+        ref_data += stride / sizeof(*ref_data);
+      }
+
+      for (int row = 0; row < height; ++row) {
+        for (int col = 0; col < width; ++col) {
+          main_data[col] = (float)main_ptr[col];
+        }
+        main_ptr += frames->distorted->y_stride;
+        main_data += stride / sizeof(*main_data);
+      }
+    }
+    frames->frame_set = 1;
+    return 0;
+  }
+
+  (void)temp_data;
+  return 2;
+}
+
+void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *distorted, const int bit_depth,
+                   double *const vmaf) {
+  aom_clear_system_state();
+  const int width = source->y_width;
+  const int height = source->y_height;
+  FrameData frames = { source, distorted, 0, bit_depth };
+  char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p";
+  double vmaf_score;
+  const int ret =
+      compute_vmaf(&vmaf_score, fmt, width, height, read_frame,
+                   /*user_data=*/&frames, (char *)model_path,
+                   /*log_path=*/NULL, /*log_fmt=*/NULL, /*disable_clip=*/1,
+                   /*disable_avx=*/0, /*enable_transform=*/0,
+                   /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0,
+                   /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0,
+                   /*n_subsample=*/1, /*enable_conf_interval=*/0);
+  if (ret) vmaf_fatal_error("Failed to compute VMAF scores.");
+
+  aom_clear_system_state();
+  *vmaf = vmaf_score;
+}
+
+void aom_calc_vmaf_multi_frame(
+    void *user_data, const char *model_path,
+    int (*read_frame)(float *ref_data, float *main_data, float *temp_data,
+                      int stride_byte, void *user_data),
+    int frame_width, int frame_height, int bit_depth, double *vmaf) {
+  aom_clear_system_state();
+
+  char *fmt = bit_depth == 10 ? "yuv420p10le" : "yuv420p";
+  double vmaf_score;
+  const int ret = compute_vmaf(
+      &vmaf_score, fmt, frame_width, frame_height, read_frame,
+      /*user_data=*/user_data, (char *)model_path,
+      /*log_path=*/"vmaf_scores.xml", /*log_fmt=*/NULL, /*disable_clip=*/0,
+      /*disable_avx=*/0, /*enable_transform=*/0,
+      /*phone_model=*/0, /*do_psnr=*/0, /*do_ssim=*/0,
+      /*do_ms_ssim=*/0, /*pool_method=*/NULL, /*n_thread=*/0,
+      /*n_subsample=*/1, /*enable_conf_interval=*/0);
+  FILE *vmaf_log = fopen("vmaf_scores.xml", "r");
+  if (vmaf_log == NULL || ret) {
+    vmaf_fatal_error("Failed to compute VMAF scores.");
+  }
+
+  int frame_index = 0;
+  char buf[512];
+  while (fgets(buf, 511, vmaf_log) != NULL) {
+    if (memcmp(buf, "\t\t<frame ", 9) == 0) {
+      char *p = strstr(buf, "vmaf=");
+      if (p != NULL && p[5] == '"') {
+        char *p2 = strstr(&p[6], "\"");
+        *p2 = '\0';
+        const double score = atof(&p[6]);
+        if (score < 0.0 || score > 100.0) {
+          vmaf_fatal_error("Failed to compute VMAF scores.");
+        }
+        vmaf[frame_index++] = score;
+      }
+    }
+  }
+  fclose(vmaf_log);
+
+  aom_clear_system_state();
+}
diff --git a/libs/libaom/src/aom_dsp/vmaf.h b/libs/libaom/src/aom_dsp/vmaf.h
new file mode 100644
index 000000000..fb8bf4613
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/vmaf.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_VMAF_H_
+#define AOM_AOM_DSP_VMAF_H_
+
+#include "aom_scale/yv12config.h"
+
+void aom_calc_vmaf(const char *model_path, const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+                   double *vmaf);
+
+void aom_calc_vmaf_multi_frame(
+    void *user_data, const char *model_path,
+    int (*read_frame)(float *ref_data, float *main_data, float *temp_data,
+                      int stride_byte, void *user_data),
+    int frame_width, int frame_height, int bit_depth, double *vmaf);
+
+#endif  // AOM_AOM_DSP_VMAF_H_
diff --git a/libs/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c b/libs/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c
new file mode 100644
index 000000000..e33dff20c
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/av1_quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin,
+                                      const int16_t *round_ptr, __m256i *round,
+                                      const int16_t *quant_ptr, __m256i *quant,
+                                      const int16_t *dequant_ptr,
+                                      __m256i *dequant,
+                                      const int16_t *shift_ptr,
+                                      __m256i *shift) {
+  *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
+  *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+  *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+  *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+  *round = _mm256_permute4x64_epi64(*round, 0x54);
+  *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+  *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+  *dequant =
+      _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+  *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+  *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+  *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+  const __m256i coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr));
+  const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+  return _mm256_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void update_mask1_avx2(__m256i *cmp_mask,
+                                     const int16_t *iscan_ptr, int *is_found,
+                                     __m256i *mask) {
+  __m256i temp_mask = _mm256_setzero_si256();
+  if (_mm256_movemask_epi8(*cmp_mask)) {
+    __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr));
+    temp_mask = _mm256_and_si256(*cmp_mask, iscan);
+    *is_found = 1;
+  }
+  *mask = _mm256_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold,
+                                     const int16_t *iscan_ptr, int *is_found,
+                                     __m256i *mask) {
+  __m256i zero = _mm256_setzero_si256();
+  __m256i coeff[2], cmp_mask0, cmp_mask1;
+  coeff[0] = _mm256_unpacklo_epi16(*qcoeff, zero);
+  coeff[1] = _mm256_unpackhi_epi16(*qcoeff, zero);
+  coeff[0] = _mm256_slli_epi32(coeff[0], AOM_QM_BITS);
+  cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]);
+  coeff[1] = _mm256_slli_epi32(coeff[1], AOM_QM_BITS);
+  cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]);
+  cmp_mask0 =
+      _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8);
+  update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask);
+}
+
+static INLINE void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round,
+                                         const __m256i *quant,
+                                         const __m256i *shift) {
+  __m256i tmp, qcoeff;
+  qcoeff = _mm256_adds_epi16(*coeff, *round);
+  tmp = _mm256_mulhi_epi16(qcoeff, *quant);
+  qcoeff = _mm256_add_epi16(tmp, qcoeff);
+  *coeff = _mm256_mulhi_epi16(qcoeff, *shift);
+}
+
+static INLINE __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) {
+  return _mm256_mullo_epi16(qcoeff, dequant);
+}
+
+static INLINE void store_coefficients_avx2(__m256i coeff_vals,
+                                           tran_low_t *coeff_ptr) {
+  __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+  __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+  __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+  _mm256_store_si256((__m256i *)(coeff_ptr), coeff_vals_lo);
+  _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+}
+
+void aom_quantize_b_adaptive_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const
int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff, qcoeff; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + int prescan_add[2]; + int thresh[2]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff = load_coefficients_avx2(coeff_ptr); + qcoeff = _mm256_abs_epi16(coeff); + update_mask0_avx2(&qcoeff, threshold, iscan, &is_found0, &mask0); + __m256i temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); + zbin = _mm256_unpackhi_epi64(zbin, zbin); + cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); + update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + // Reinsert signs + qcoeff = _mm256_sign_epi16(qcoeff, coeff); + // Mask out zbin threshold coeffs + qcoeff = _mm256_and_si256(qcoeff, temp0); + store_coefficients_avx2(qcoeff, qcoeff_ptr); + coeff = calculate_dqcoeff_avx2(qcoeff, dequant); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + store_coefficients_avx2(coeff, dqcoeff_ptr); + } + + // AC only loop. 
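[Since the intrinsics above compress several steps into each lane, a scalar model of what one coefficient goes through may help; this is an illustrative sketch under the same parameter names, not code from the patch:

#include <stdint.h>
#include <stdlib.h>  // abs()

// One coefficient through the adaptive quantizer, modeling
// calculate_qcoeff_avx2 plus the zero-bin mask a single lane at a time.
static int16_t quantize_one_coeff(int16_t coeff, int16_t zbin, int16_t round,
                                  int16_t quant, int16_t shift) {
  const int abs_coeff = abs(coeff);
  // The SIMD path pre-subtracts 1 from zbin and uses a greater-than compare,
  // which is the same as zeroing everything strictly below the zero-bin.
  if (abs_coeff < zbin) return 0;
  int tmp = abs_coeff + round;        // _mm256_adds_epi16 (saturating there)
  tmp = ((tmp * quant) >> 16) + tmp;  // _mm256_mulhi_epi16 + add
  tmp = (tmp * shift) >> 16;          // second mulhi against the shift table
  return (int16_t)(coeff < 0 ? -tmp : tmp);  // reinsert the sign
}

The loop that follows runs this arithmetic sixteen 16-bit lanes at a time, skipping whole groups whose zero-bin compare mask is all zero.]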
+ while (index < n_coeffs) { + coeff = load_coefficients_avx2(coeff_ptr + index); + qcoeff = _mm256_abs_epi16(coeff); + update_mask0_avx2(&qcoeff, threshold, iscan + index, &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); + cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); + update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); + qcoeff = _mm256_sign_epi16(qcoeff, coeff); + qcoeff = _mm256_and_si256(qcoeff, temp0); + store_coefficients_avx2(qcoeff, qcoeff_ptr + index); + coeff = calculate_dqcoeff_avx2(qcoeff, dequant); + store_coefficients_avx2(coeff, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff0 = qcoeff_ptr[rc]; + if (qcoeff0) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff0 = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff0); + const int abs_coeff = (coeff0 ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/libs/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c b/libs/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c new file mode 100644 index 000000000..584cd671f --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/av1_quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+void aom_quantize_b_adaptive_sse2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan) {
+  int index = 16;
+  int non_zero_count = 0;
+  int non_zero_count_prescan_add_zero = 0;
+  int is_found0 = 0, is_found1 = 0;
+  int eob = -1;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i all_zero;
+  __m128i mask0 = zero, mask1 = zero;
+
+  int prescan_add[2];
+  int thresh[4];
+  const qm_val_t wt = (1 << AOM_QM_BITS);
+  for (int i = 0; i < 2; ++i) {
+    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+  }
+  thresh[2] = thresh[3] = thresh[1];
+  __m128i threshold[2];
+  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  int first = -1;
+#endif
+  // Setup global values.
+  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_coefficients(coeff_ptr);
+  coeff1 = load_coefficients(coeff_ptr + 8);
+
+  // Poor man's abs().
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+  update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+  threshold[0] = threshold[1];
+  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+  if (_mm_movemask_epi8(all_zero) == 0) {
+    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+    _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+  } else {
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+    shift = _mm_unpackhi_epi64(shift, shift);
+
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    // Reinsert signs
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    // Mask out zbin threshold coeffs
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr);
+    store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+    coeff0 = calculate_dqcoeff(qcoeff0,
dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr); + store_coefficients(coeff1, dqcoeff_ptr + 8); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + index); + store_coefficients(coeff1, dqcoeff_ptr + index + 8); + + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_quantize_b_32x32_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const 
int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + const int log_scale = 1; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i log_scale_vec = _mm_set1_epi16(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, log_scale_vec); + round = _mm_add_epi16(round, log_scale_vec); + zbin = _mm_srli_epi16(zbin, log_scale); + round = _mm_srli_epi16(round, log_scale); + zbin = _mm_sub_epi16(zbin, one); + + // Do DC and first 15 AC. 
+ coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, + &log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + 8, &log_scale); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, + dqcoeff_ptr + index, &log_scale); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8, &log_scale); + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_quantize_b_64x64_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const 
int16_t *scan, const int16_t *iscan) { + int index = 16; + const int log_scale = 2; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i log_scale_vec = _mm_set1_epi16(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, log_scale_vec); + round = _mm_add_epi16(round, log_scale_vec); + zbin = _mm_srli_epi16(zbin, log_scale); + round = _mm_srli_epi16(round, log_scale); + zbin = _mm_sub_epi16(zbin, one); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + 
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, + &log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + 8, &log_scale); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, + dqcoeff_ptr + index, &log_scale); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8, &log_scale); + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + 
qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/libs/libaom/src/aom_dsp/x86/aom_asm_stubs.c b/libs/libaom/src/aom_dsp/x86/aom_asm_stubs.c new file mode 100644 index 000000000..ce8285e43 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_asm_stubs.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve.h" + +#if HAVE_SSE2 +filter8_1dfunction aom_filter_block1d16_v8_sse2; +filter8_1dfunction aom_filter_block1d16_h8_sse2; +filter8_1dfunction aom_filter_block1d8_v8_sse2; +filter8_1dfunction aom_filter_block1d8_h8_sse2; +filter8_1dfunction aom_filter_block1d4_v8_sse2; +filter8_1dfunction aom_filter_block1d4_h8_sse2; +filter8_1dfunction aom_filter_block1d16_v4_sse2; +filter8_1dfunction aom_filter_block1d16_h4_sse2; + +filter8_1dfunction aom_filter_block1d8_h4_sse2; +filter8_1dfunction aom_filter_block1d8_v4_sse2; +filter8_1dfunction aom_filter_block1d4_h4_sse2; +filter8_1dfunction aom_filter_block1d4_v4_sse2; + +filter8_1dfunction aom_filter_block1d16_v2_sse2; +filter8_1dfunction aom_filter_block1d16_h2_sse2; +filter8_1dfunction aom_filter_block1d8_v2_sse2; +filter8_1dfunction aom_filter_block1d8_h2_sse2; +filter8_1dfunction aom_filter_block1d4_v2_sse2; +filter8_1dfunction aom_filter_block1d4_h2_sse2; + +// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); + +#if CONFIG_AV1_HIGHBITDEPTH +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; + +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2; + +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction 
aom_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; + +// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +#endif +#endif // HAVE_SSE2 diff --git a/libs/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm b/libs/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm new file mode 100644 index 000000000..7283c32b8 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_convolve_copy_sse2.asm @@ -0,0 +1,297 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro convolve_fn 1-2 +%ifidn %1, avg +%define AUX_XMM_REGS 4 +%else +%define AUX_XMM_REGS 0 +%endif +%ifidn %2, highbd +%define pavg pavgw +cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h, bd +%else +%define pavg pavgb +cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h +%endif + mov r4d, dword wm +%ifidn %2, highbd + shl r4d, 1 + shl srcq, 1 + shl src_strideq, 1 + shl dstq, 1 + shl dst_strideq, 1 +%else + cmp r4d, 4 + je .w4 +%endif + cmp r4d, 8 + je .w8 + cmp r4d, 16 + je .w16 + cmp r4d, 32 + je .w32 + + cmp r4d, 64 + je .w64 +%ifidn %2, highbd + cmp r4d, 128 + je .w128 + +.w256: + mov r4d, dword hm +.loop256: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + movu m0, [srcq+128] + movu m1, [srcq+128+16] + movu m2, [srcq+128+32] + movu m3, [srcq+128+48] +%ifidn %1, avg + pavg m0, [dstq+128] + pavg m1, [dstq+128+16] + pavg m2, [dstq+128+32] + pavg m3, [dstq+128+48] +%endif + mova [dstq+128 ], m0 + mova [dstq+128+16], m1 + mova [dstq+128+32], m2 + mova [dstq+128+48], m3 + movu m0, [srcq+128+64] + movu m1, [srcq+128+80] + movu m2, [srcq+128+96] + movu m3, [srcq+128+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+128+64] + pavg m1, [dstq+128+80] + pavg m2, [dstq+128+96] + pavg m3, [dstq+128+112] +%endif + mova [dstq+128+64], m0 + mova [dstq+128+80], m1 + mova 
[dstq+128+96], m2 + mova [dstq+128+112], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop256 + RET +%endif + +.w128: + mov r4d, dword hm +.loop128: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop128 + RET + +.w64: + mov r4d, dword hm +.loop64: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop64 + RET + +.w32: + mov r4d, dword hm +.loop32: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+src_strideq] + movu m3, [srcq+src_strideq+16] + lea srcq, [srcq+src_strideq*2] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq +16] + pavg m2, [dstq+dst_strideq] + pavg m3, [dstq+dst_strideq+16] +%endif + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+dst_strideq ], m2 + mova [dstq+dst_strideq+16], m3 + lea dstq, [dstq+dst_strideq*2] + sub r4d, 2 + jnz .loop32 + RET + +.w16: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop16: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+dst_strideq] + pavg m2, [dstq+dst_strideq*2] + pavg m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop16 + RET + +.w8: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop8: + movh m0, [srcq] + movh m1, [srcq+src_strideq] + movh m2, [srcq+src_strideq*2] + movh m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movh m4, [dstq] + movh m5, [dstq+dst_strideq] + movh m6, [dstq+dst_strideq*2] + movh m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movh [dstq ], m0 + movh [dstq+dst_strideq ], m1 + movh [dstq+dst_strideq*2], m2 + movh [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop8 + RET + +%ifnidn %2, highbd +.w4: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop4: + movd m0, [srcq] + movd m1, [srcq+src_strideq] + movd m2, [srcq+src_strideq*2] + movd m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movd m4, [dstq] + movd m5, [dstq+dst_strideq] + movd m6, [dstq+dst_strideq*2] + movd m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movd [dstq ], m0 + movd [dstq+dst_strideq ], m1 + movd [dstq+dst_strideq*2], m2 + movd [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop4 + RET +%endif +%endmacro + +INIT_XMM sse2 +convolve_fn copy +convolve_fn avg +convolve_fn copy, highbd diff --git a/libs/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm 
b/libs/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm new file mode 100644 index 000000000..b6f040791 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm @@ -0,0 +1,613 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. + +%macro HIGH_GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm6 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + punpcklwd xmm1, xmm7 + + movdqa k0k6, xmm0 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + movdqa k1k7, xmm1 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) + +%endm + +%macro HIGH_APPLY_FILTER_4 1 + punpcklwd xmm0, xmm6 ;two row in one register + punpcklwd xmm1, xmm7 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + + pmaddwd xmm0, k0k6 ;multiply the filter factors + pmaddwd xmm1, k1k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm3, k3k4 + + paddd xmm0, xmm1 ;sum + paddd xmm0, xmm2 + paddd xmm0, xmm3 + + paddd xmm0, krd ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movq [rdi], xmm0 +%endm + +%macro HIGH_GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + punpcklwd xmm0, xmm1 + punpckhwd xmm6, xmm7 + punpckhwd xmm2, xmm5 + punpckhwd xmm3, xmm4 + + movdqa k0k1, xmm0 ;store filter factors on stack + movdqa k6k7, xmm6 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) +%endm + +%macro LOAD_VERT_8 1 + movdqu 
xmm0, [rsi + %1] ;0 + movdqu xmm1, [rsi + rax + %1] ;1 + movdqu xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movdqu xmm7, [rsi + rdx * 2 + %1] ;7 + movdqu xmm2, [rsi + rax + %1] ;2 + movdqu xmm3, [rsi + rax * 2 + %1] ;3 + movdqu xmm4, [rsi + rdx + %1] ;4 + movdqu xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro HIGH_APPLY_FILTER_8 2 + movdqu temp, xmm4 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm1, xmm6 + punpcklwd xmm6, xmm7 + punpckhwd xmm1, xmm7 + movdqa xmm7, xmm2 + punpcklwd xmm2, xmm5 + punpckhwd xmm7, xmm5 + + movdqu xmm5, temp + movdqu temp, xmm4 + movdqa xmm4, xmm3 + punpcklwd xmm3, xmm5 + punpckhwd xmm4, xmm5 + movdqu xmm5, temp + + pmaddwd xmm0, k0k1 + pmaddwd xmm5, k0k1 + pmaddwd xmm6, k6k7 + pmaddwd xmm1, k6k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm7, k2k5 + pmaddwd xmm3, k3k4 + pmaddwd xmm4, k3k4 + + paddd xmm0, xmm6 + paddd xmm0, xmm2 + paddd xmm0, xmm3 + paddd xmm5, xmm1 + paddd xmm5, xmm7 + paddd xmm5, xmm4 + + paddd xmm0, krd ;rounding + paddd xmm5, krd + psrad xmm0, 7 ;shift + psrad xmm5, 7 + packssdw xmm0, xmm5 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movdqu xmm1, [rdi + %2] + pavgw xmm0, xmm1 +%endif + movdqu [rdi + %2], xmm0 +%endm + +SECTION .text + +;void aom_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] 
;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 0, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 
2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 0, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/libs/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/libs/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..a7152be57 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm @@ -0,0 +1,367 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro HIGH_GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklwd xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm5, rdx + movq xmm2, rcx + pshufd xmm5, xmm5, 0b + movdqa xmm1, xmm5 + psllw xmm5, xmm2 + psubw xmm5, xmm1 ;max value (for clamping) + pxor xmm2, xmm2 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_4 1 + + punpcklwd xmm0, xmm1 ;two row in one register + pmaddwd xmm0, xmm4 ;multiply the filter factors + + paddd xmm0, xmm3 ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, xmm5 + pmaxsw xmm0, xmm2 + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + + movq [rdi], xmm0 + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%macro HIGH_GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm6, [rdx] ;load filters + + pshuflw xmm7, xmm6, 11111111b ;k3 + pshufhw xmm6, xmm6, 0b ;k4 + psrldq xmm6, 8 + punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm3, rdx + movq xmm5, rcx + pshufd xmm3, xmm3, 0b + movdqa xmm1, xmm3 + psllw xmm3, xmm5 + psubw xmm3, xmm1 ;max value (for clamping) + pxor xmm5, xmm5 ;min value (for clamping) + + movdqa max, xmm3 + movdqa min, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_8 1 + movdqa xmm6, xmm0 + punpckhwd xmm6, xmm1 + punpcklwd xmm0, xmm1 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + + paddd xmm6, xmm4 ;rounding + paddd xmm0, xmm4 ;rounding + psrad xmm6, 7 ;shift + psrad xmm0, 7 ;shift + packssdw xmm0, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movdqu xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%macro HIGH_APPLY_FILTER_16 1 + movdqa xmm5, xmm0 + movdqa xmm6, xmm2 + punpckhwd xmm5, xmm1 + punpckhwd xmm6, xmm3 + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + pmaddwd xmm5, xmm7 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + pmaddwd xmm2, xmm7 + + paddd xmm5, xmm4 ;rounding + paddd xmm6, xmm4 + paddd xmm0, xmm4 + paddd xmm2, xmm4 + + psrad xmm5, 7 ;shift + psrad xmm6, 7 + psrad xmm0, 7 + psrad xmm2, 7 + + packssdw xmm0, xmm5 ;pack back to word + packssdw xmm2, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + pminsw xmm2, max + pmaxsw xmm2, min + +%if %1 + movdqu xmm1, [rdi] + movdqu xmm3, [rdi + 16] + pavgw xmm0, xmm1 + pavgw xmm2, xmm3 +%endif + movdqu [rdi], xmm0 ;store the result + movdqu [rdi + 16], xmm2 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +SECTION .text + +global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end 
prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + add rsp, 16 * 2 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm2, [rsi + 16] + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + add rsp, 16 * 2 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + add rsp, 16 * 2 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + add rsp, 16 * 2 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c new file mode 100644 index 000000000..94b5da171 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -0,0 +1,1441 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_ports/mem.h" + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx. +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a)); + *((uint32_t *)(output_ptr + stride)) = + _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_storel_epi64((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_store2_mi128(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_store_si128((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static void aom_filter_block1d4_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
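+ // (The taps were halved by the srai above so they fit in signed bytes for + // _mm256_maddubs_epi16; the rounding used below is therefore (+32) >> 6 + // rather than the usual (+64) >> 7 for 7-bit filters.)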
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + +static void aom_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg; + __m256i firstFilters, secondFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
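+ // The eight taps are applied in two groups of four: filt1Reg/firstFilters + // covers taps 0-3 and filt2Reg/secondFilters covers taps 4-7, with + // _mm256_hadds_epi16 folding the partial sums together.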
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 32 bits + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
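+ // (the loop above wrote two rows per iteration, so one row may remain)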
+ // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
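+ // Only the middle four taps (2..5, via secondFilters/thirdFilters) are + // applied in this kernel, which assumes the outer taps of the 8-tap + // filter are zero.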
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
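+ // After the pack below, each 16-bit lane of firstFilters..forthFilters + // holds one adjacent pair of byte taps (patterns 0x100, 0x302, 0x504, + // 0x706), matching the operand layout _mm256_maddubs_epi16 expects.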
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d16_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
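+ // Each 16-wide row is filtered as two overlapping 8-byte halves; the + // second half is loaded from src_ptr + 8 inside the loop.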
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. 
+ // process only 16 bytes + if (i > 0) { + __m256i srcReg1, srcReg12; + __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1; + + srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr)); + srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94); + + // filter the source buffer + srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg); + srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters); + srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32); + srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1); + srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, + _mm256_castsi256_si128(srcRegFilt1_1)); + } +} + +static void aom_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
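+ // Same scheme as the 8-wide 8-tap kernel, applied to both halves of each + // 16-wide row.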
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 
= _mm256_adds_epi16( + srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = + _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + srcRegFilt2_1 
= + _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i resReg23_34_lo, resReg45_56_lo; + __m256i resReglo, resReg; + __m256i secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += 
src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + +static void aom_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = 
_mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b11 = srcReg32b2; + srcReg32b2 = srcReg32b4; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); + + // shift by 6 bit each 16 bit + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1); + } +} + +static void aom_filter_block1d16_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi; + __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi; + __m256i resReglo, resReghi, resReg; + __m256i secondFilters, thirdFilters; + unsigned 
int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters); + resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters); + + // add and saturate the results together + resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReghi = _mm256_adds_epi16(resReghi, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + resReghi = _mm256_srai_epi16(resReghi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReghi); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg23_34_hi = srcReg45_56_hi; + srcReg4x = srcReg6x; + } +} + +static void aom_filter_block1d16_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t 
out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and 
add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + // add and saturate the results together + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = + _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = + _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + 
_mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7)); + + // shift by 6 bit each 16 bit + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); + } +} + +static void aom_filter_block1d4_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i srcReg2345_3456_lo; + __m256i resReglo, resReg; + __m256i firstFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters); + + resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 
6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + +#if HAVE_AVX2 && HAVE_SSSE3 +filter8_1dfunction aom_filter_block1d4_v8_ssse3; +filter8_1dfunction aom_filter_block1d16_v2_ssse3; +filter8_1dfunction aom_filter_block1d16_h2_ssse3; +filter8_1dfunction aom_filter_block1d8_v2_ssse3; +filter8_1dfunction aom_filter_block1d8_h2_ssse3; +filter8_1dfunction aom_filter_block1d4_v2_ssse3; +filter8_1dfunction aom_filter_block1d4_h2_ssse3; +#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3 +#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3 +#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3 +#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3 +#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3 +#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3 +#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3 +// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +#endif // HAVE_AVX2 && HAVE_SSSE3 diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c new file mode 100644 index 000000000..cff7f43ee --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c @@ -0,0 +1,569 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" +#include "aom_ports/mem.h" + +void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1, + srcRegFilt32b2_2; + __m128i srcReg32b1, srcReg32b2; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5); + __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); + __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_1_2, secondFilters); + d2 = _mm_madd_epi16(ss_2_2, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); + + // reading the next 16 bytes + // (part of it was read by the earlier load) + srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + + ss_2 = _mm_srli_si128(srcReg32b2, 2); + ss_4 = _mm_srli_si128(srcReg32b2, 4); + ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_1_1, secondFilters); + d2 = _mm_madd_epi16(ss_2_1, thirdFilters); + srcRegFilt32b2_1 = _mm_add_epi32(d1, d2); + + ss_1 = _mm_srli_si128(srcReg32b2, 3); + ss_3 = _mm_srli_si128(srcReg32b2, 5); + ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); + ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_1_2, secondFilters); + d2 = _mm_madd_epi16(ss_2_2, thirdFilters); + srcRegFilt32b2_2 = _mm_add_epi32(d1, d2); + + res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); + res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); + srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + 
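// (_mm_packus_epi16 also saturates: each signed 16 bit value is clamped + // to the unsigned byte range [0, 255] as it is packed down.) +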
srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_pixels_per_line; + + _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); + __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); + __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); + __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128()); + __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128()); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); + __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128()); + __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); + resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); + resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); + __m128i resReg45_lo_2 = 
_mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); + resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); + __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); + resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters); + resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters); + resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128()); + __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters); + resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128()); + __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters); + resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); + resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); + resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23_lo_1 = resReg45_lo_1; + resReg23_lo_2 = resReg45_lo_2; + resReg23_hi_1 = resReg45_hi_1; + resReg23_hi_2 = resReg45_hi_2; + resReg34_lo_1 = resReg56_lo_1; + resReg34_lo_2 = resReg56_lo_2; + resReg34_hi_1 = resReg56_hi_1; + resReg34_hi_2 = resReg56_hi_2; + srcReg4 = srcReg6; + } +} + +void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; + __m128i 
srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + __m128i d1 = _mm_madd_epi16(ss_2, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); + ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_3, secondFilters); + d2 = _mm_madd_epi16(ss_5, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg34_lo; + __m128i srcReg45_lo, srcReg56_lo; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_45_lo, resReg34_56_lo; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, 
_mm_setzero_si128()); + __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); + resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); + resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); + __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); + resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); + __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); + resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128()); + resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128()); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23_lo_1 = resReg45_lo_1; + resReg23_lo_2 = resReg45_lo_2; + resReg34_lo_1 = resReg56_lo_1; + resReg34_lo_2 = resReg56_lo_2; + srcReg4 = srcReg6; + } +} + +void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + 
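// Layout sketch for the unpacks above, with the 16 bit taps k0..k7 in + // filtersReg: tmp_0 holds k0 k1 k0 k1 k2 k3 k2 k3 and tmp_1 holds + // k4 k5 k4 k5 k6 k7 k6 k7, so the epi64 unpacks below broadcast the + // k2/k3 and k4/k5 tap pairs to every pair of 16 bit lanes. +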
secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); + + ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); + + __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3); + __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5); + + __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23, srcReg34, srcReg45, srcReg56; + __m128i resReg23_34, resReg45_56; + __m128i resReg23_34_45_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); + __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128()); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); + __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + tmp_0 = _mm_madd_epi16(resReg23, secondFilters); + tmp_1 = 
_mm_madd_epi16(resReg34, secondFilters); + resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128()); + __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128()); + + tmp_0 = _mm_madd_epi16(resReg45, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56, thirdFilters); + resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56); + + // shift by 6 bit each 16 bit + resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32); + resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_34_45_56 = + _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128()); + + src_ptr += src_stride; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56); + *((uint32_t *)(output_ptr + out_pitch)) = + _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23 = resReg45; + resReg34 = resReg56; + srcReg4 = srcReg6; + } +} diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c new file mode 100644 index 000000000..f64b821ea --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,770 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" + +// filters only for the 4_h8 convolution +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { 0, 1, 1, 2, 2, 3, + 3, 4, 2, 3, 3, 4, + 4, 5, 5, 6 }; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { 4, 5, 5, 6, 6, 7, + 7, 8, 6, 7, 7, 8, + 8, 9, 9, 10 }; + +// filters for 8_h8 and 16_h8 +DECLARE_ALIGNED(16, static const uint8_t, + filt1_global[16]) = { 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, 6, 6, 7, 7, 8 }; + +DECLARE_ALIGNED(16, static const uint8_t, + filt2_global[16]) = { 2, 3, 3, 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, 9, 9, 10 }; + +DECLARE_ALIGNED(16, static const uint8_t, + filt3_global[16]) = { 4, 5, 5, 6, 6, 7, 7, 8, + 8, 9, 9, 10, 10, 11, 11, 12 }; + +DECLARE_ALIGNED(16, static const uint8_t, + filt4_global[16]) = { 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 12, 13, 13, 14 }; + +DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, + 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = { + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, +}; + +// These are reused by the avx2 intrinsics. +filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3; + +static void aom_filter_block1d4_h4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
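+ // Note: the srai by 1 above halves the 7 bit filter taps so that, once + // packed to int8 below, the _mm_maddubs_epi16 products cannot overflow + // int16; rounding then uses (sum + 32) >> 6, i.e. round(sum / 64), in + // place of the full precision (sum + 64) >> 7. Rough scalar sketch: + // out[x] = clip8((k2*p[x-1] + k3*p[x] + k4*p[x+1] + k5*p[x+2] + 32) >> 6) + // where clip8 saturates to [0, 255].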
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); + filt1Reg = _mm_load_si128((__m128i const *)(filtd4)); + + for (i = output_height; i > 0; i -= 1) { + // load the 2 strides of source + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); + output_ptr += output_pitch; + } +} + +static void aom_filter_block1d4_v4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45, + srcReg6, srcReg56; + __m128i srcReg23_34_lo, srcReg45_56_lo; + __m128i srcReg2345_3456_lo, srcReg2345_3456_hi; + __m128i resReglo, resReghi; + __m128i firstFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. 
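+ // This _v4 kernel applies only the middle taps k2..k5 of the 8 tap + // filter (the outer taps are assumed zero here); the byte shuffle below + // broadcasts those four packed taps to every 32 bit lane so that one + // maddubs/hadds pair produces a full 4 tap sum per pixel.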
+ filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + + // have consecutive loads in the same 128 bit register + srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4); + + srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5); + + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6); + + // merge every two consecutive registers + srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56); + + srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); + srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters); + resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters); + + resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128()); + resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + resReglo = _mm_adds_epi16(resReglo, addFilterReg32); + resReghi = _mm_adds_epi16(resReghi, addFilterReg32); + resReglo = _mm_srai_epi16(resReglo, 6); + resReghi = _mm_srai_epi16(resReghi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReglo = _mm_packus_epi16(resReglo, resReglo); + resReghi = _mm_packus_epi16(resReghi, resReghi); + + src_ptr += src_stride; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReglo); + *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4 = srcReg6; + } +} + +void aom_filter_block1d4_h8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i firstFilters, secondFilters, shuffle1, shuffle2; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, srcReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
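+ // Unlike the 4 tap kernels above, this 8 tap path keeps the full 7 bit + // taps (there is no srai by 1) and rounds with (sum + 64) >> 7; + // addFilterReg64 holds 64 in each 16 bit lane. The min/max pairing in + // the loop below looks like the usual trick for accumulating four + // partial sums with saturating adds in an overflow safe order.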
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter into the first lane
+  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
+  // duplicate only the third 16 bits in the filter into the first lane
+  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
+  // duplicate only the second 16 bits in the filter into the second lane
+  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
+  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
+  // duplicate only the fourth 16 bits in the filter into the second lane
+  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
+  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
+
+  // load the local shuffle masks
+  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
+  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // extract the higher half of the lane
+    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
+    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
+
+    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
+
+    // add and saturate all the results together; the two middle partial sums
+    // are added smaller-first (min, then max) so intermediate saturation
+    // cannot distort the final sum
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift each 16-bit value right by 7 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink each 16-bit value to 8 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+    src_ptr += src_pixels_per_line;
+
+    // save only 4 bytes
+    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+static void aom_filter_block1d8_h4_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i addFilterReg32, filt2Reg, filt3Reg;
+  __m128i secondFilters, thirdFilters;
+  __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m128i srcReg32b1;
+  unsigned int i;
+  src_ptr -= 3;
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16-bit (short) values to 8-bit (byte) values, duplicating
+  // the same data into both lanes of the 128-bit register
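+  // (Editor's note on the idiom used below: _mm_shuffle_epi8 with
+  // _mm_set1_epi16(0x0302) broadcasts filter bytes 2 and 3 -- the (k2, k3)
+  // pair -- into every 16-bit lane, which is exactly the interleaved
+  // multiplier layout that _mm_maddubs_epi16 expects.)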
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across the 128-bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across the 128-bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
+  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
+
+  for (i = output_height; i > 0; i -= 1) {
+    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+    // filter the source buffer
+    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
+    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // round and shift each 16-bit value right by 6 bits
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink each 16-bit value to 8 bits
+    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+    src_ptr += src_pixels_per_line;
+
+    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+static void aom_filter_block1d8_v4_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+  __m128i srcReg23, srcReg34, srcReg45, srcReg56;
+  __m128i resReg23, resReg34, resReg45, resReg56;
+  __m128i resReg23_45, resReg34_56;
+  __m128i addFilterReg32, secondFilters, thirdFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) values to 8-bit (byte) values, duplicating
+  // the same data into both lanes of the 128-bit register
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across the 128-bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across the 128-bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  // multiply the source and destination strides by two
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
+
+  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+
+  // keep consecutive loads in the same 128-bit register
+  srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+  for (i = output_height; i > 1; i -= 2) {
+    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+
+    srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
+
+    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+    srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters);
+    resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters);
+    resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters);
+    resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters);
+
+    // add and saturate the results together
+    resReg23_45 = _mm_adds_epi16(resReg23, resReg45);
+    resReg34_56 = _mm_adds_epi16(resReg34, resReg56);
+
+    // round and shift each 16-bit value right by 6 bits
+    resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32);
+    resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32);
+    resReg23_45 = _mm_srai_epi16(resReg23_45, 6);
+    resReg34_56 = _mm_srai_epi16(resReg34_56, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+    resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128());
+    resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128());
+
+    src_ptr += src_stride;
+
+    _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
+    _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+    output_ptr += dst_stride;
+
+    // save part of the registers for the next strides
+    srcReg23 = srcReg45;
+    srcReg34 = srcReg56;
+    srcReg4 = srcReg6;
+  }
+}
+
+void aom_filter_block1d8_h8_intrin_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
+  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
+  __m128i addFilterReg64, filtersReg, minReg;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) values to 8-bit (byte) values, duplicating
+  // the same data into both lanes of the 128-bit register
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits (first and second byte)
+  // across the 128-bit register
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across the 128-bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across the 128-bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+  // across the 128-bit register
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
+  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
+  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
+  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
+
+  for (i = 0; i < output_height; i++) {
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+
+    // filter the source buffer
+    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
+    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
+
+    // filter the source buffer
+    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
+    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
+    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
+
+    // add and saturate all the results together; the two middle partial sums
+    // are added smaller-first (min, then max) so intermediate saturation
+    // cannot distort the final sum
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift each 16-bit value right by 7 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink each 16-bit value to 8 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr += src_pixels_per_line;
+
+    // save only 8 bytes
+    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+void aom_filter_block1d8_v8_intrin_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i addFilterReg64, filtersReg, minReg;
+  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
+  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+  __m128i srcReg8;
+  unsigned int i;
+
+  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
+  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) values to 8-bit (byte) values, duplicating
+  // the same data into both lanes of the 128-bit register
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the first 16 bits in the filter
+  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
+  // duplicate only the second 16 bits in the filter
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits in the filter
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits in the filter
+  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+
+  // load the first 7 rows of 8 bytes
+  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+  for (i = 0; i < output_height; i++) {
+    // load the last 8 bytes
+    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+    // merge the results together
+    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+    // merge the results together
+    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
+    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
+    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
+    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
+
+    // add and saturate the results together
+    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
+    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
+
+    // shift each 16-bit value right by 7 bits
+    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
+
+    // shrink each 16-bit value to 8 bits
+    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
+
+    src_ptr += src_pitch;
+
+    // shift down a row
+    srcReg1 = srcReg2;
+    srcReg2 = srcReg3;
+    srcReg3 = srcReg4;
+    srcReg4 = srcReg5;
+    srcReg5 = srcReg6;
+    srcReg6 = srcReg7;
+    srcReg7 = srcReg8;
+
+    // save only the 8 bytes of the convolve result
+    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
+
+    output_ptr += out_pitch;
+  }
+}
+
+static void aom_filter_block1d16_h4_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i addFilterReg32, filt2Reg, filt3Reg;
+  __m128i secondFilters, thirdFilters;
+  __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+  __m128i srcReg32b1, srcReg32b2;
+  unsigned int i;
+  src_ptr -= 3;
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16-bit (short) values to 8-bit (byte) values, duplicating
+  // the same data into both lanes of the 128-bit register
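+  // (Editor's note: this 16-wide horizontal kernel processes each row as two
+  // 8-pixel halves, reloading at src_ptr + 8 so the byte shuffles see the
+  // extra context bytes each half needs.)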
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across the 128-bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across the 128-bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
+  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
+
+  for (i = output_height; i > 0; i -= 1) {
+    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+    // filter the source buffer
+    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
+    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // load the next 16 bytes (part of which was read by the earlier load)
+    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+    // filter the source buffer
+    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg);
+    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+    // add and saturate the results together
+    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // round and shift each 16-bit value right by 6 bits
+    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+    srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+    src_ptr += src_pixels_per_line;
+
+    _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+    output_ptr += output_pitch;
+  }
+}
+
+static void aom_filter_block1d16_v4_ssse3(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m128i filtersReg;
+  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+  __m128i resReg23_45, resReg34_56;
+  __m128i addFilterReg32, secondFilters, thirdFilters;
+  unsigned int i;
+  ptrdiff_t src_stride, dst_stride;
+
+  addFilterReg32 = _mm_set1_epi16(32);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) values to 8-bit (byte) values, duplicating
+  // the same data into both lanes of the 128-bit register
+  filtersReg = _mm_srai_epi16(filtersReg, 1);
+  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across the 128-bit register
+  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across the 128-bit register
+  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  // multiply the source and destination strides by two
+  src_stride = src_pitch << 1;
+  dst_stride = out_pitch << 1;
+
+  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
+  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
+
+  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+
+  // keep consecutive loads in the same 128-bit register
+  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
+
+  for (i = output_height; i > 1; i -= 2) {
+    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
+
+    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
+    resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
+    resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
+    resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);
+
+    // add and saturate the results together
+    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+    // multiply 2 adjacent elements with the filter and add the result
+    resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
+    resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
+    resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
+    resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);
+
+    // add and saturate the results together
+    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
+    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
+
+    // round and shift each 16-bit value right by 6 bits
+    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
+    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
+    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
+    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
+    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
+
+    src_ptr += src_stride;
+
+    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
+    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+    output_ptr += dst_stride;
+
+    // save part of the registers for the next strides
+    srcReg23_lo = srcReg45_lo;
+    srcReg34_lo = srcReg56_lo;
+    srcReg23_hi = srcReg45_hi;
+    srcReg34_hi = srcReg56_hi;
+    srcReg4 = srcReg6;
+  }
+}
+
+filter8_1dfunction aom_filter_block1d16_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_h8_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+
+// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm
new file mode 100644
index 000000000..c88fc9ffb
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_sse2.asm
@@ -0,0 +1,615 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
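+; (Editor's sketch of the hazard the note above describes, using hypothetical
+; taps: with 8-bit samples of 255 under centre taps of, say, 35 and 114,
+; summing the two centre products first gives 255*35 + 255*114 = 37995, which
+; the signed-saturating add clamps to 32767; if the negative outer taps would
+; have brought the true sum back into range, that early clamp leaves a wrong
+; result. Folding each centre product into a partial sum that already carries
+; the negative taps keeps the intermediates in range.)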
+
+%macro GET_FILTERS_4 0
+    mov rdx, arg(5)  ;filter ptr
+    mov rcx, 0x0400040
+
+    movdqa xmm7, [rdx]  ;load filters
+    pshuflw xmm0, xmm7, 0b         ;k0
+    pshuflw xmm1, xmm7, 01010101b  ;k1
+    pshuflw xmm2, xmm7, 10101010b  ;k2
+    pshuflw xmm3, xmm7, 11111111b  ;k3
+    psrldq xmm7, 8
+    pshuflw xmm4, xmm7, 0b         ;k4
+    pshuflw xmm5, xmm7, 01010101b  ;k5
+    pshuflw xmm6, xmm7, 10101010b  ;k6
+    pshuflw xmm7, xmm7, 11111111b  ;k7
+
+    punpcklqdq xmm0, xmm1
+    punpcklqdq xmm2, xmm3
+    punpcklqdq xmm5, xmm4
+    punpcklqdq xmm6, xmm7
+
+    movdqa k0k1, xmm0
+    movdqa k2k3, xmm2
+    movdqa k5k4, xmm5
+    movdqa k6k7, xmm6
+
+    movq xmm6, rcx
+    pshufd xmm6, xmm6, 0
+    movdqa krd, xmm6
+
+    pxor xmm7, xmm7
+    movdqa zero, xmm7
+%endm
+
+%macro APPLY_FILTER_4 1
+    punpckldq xmm0, xmm1  ;two rows in one register
+    punpckldq xmm6, xmm7
+    punpckldq xmm2, xmm3
+    punpckldq xmm5, xmm4
+
+    punpcklbw xmm0, zero  ;unpack to word
+    punpcklbw xmm6, zero
+    punpcklbw xmm2, zero
+    punpcklbw xmm5, zero
+
+    pmullw xmm0, k0k1  ;multiply the filter factors
+    pmullw xmm6, k6k7
+    pmullw xmm2, k2k3
+    pmullw xmm5, k5k4
+
+    paddsw xmm0, xmm6  ;sum
+    movdqa xmm1, xmm0
+    psrldq xmm1, 8
+    paddsw xmm0, xmm1
+    paddsw xmm0, xmm2
+    psrldq xmm2, 8
+    paddsw xmm0, xmm5
+    psrldq xmm5, 8
+    paddsw xmm0, xmm2
+    paddsw xmm0, xmm5
+
+    paddsw xmm0, krd     ;rounding
+    psraw xmm0, 7        ;shift
+    packuswb xmm0, xmm0  ;pack to byte
+
+%if %1
+    movd xmm1, [rdi]
+    pavgb xmm0, xmm1
+%endif
+    movd [rdi], xmm0
+%endm
+
+%macro GET_FILTERS 0
+    mov rdx, arg(5)  ;filter ptr
+    mov rsi, arg(0)  ;src_ptr
+    mov rdi, arg(2)  ;output_ptr
+    mov rcx, 0x0400040
+
+    movdqa xmm7, [rdx]  ;load filters
+    pshuflw xmm0, xmm7, 0b         ;k0
+    pshuflw xmm1, xmm7, 01010101b  ;k1
+    pshuflw xmm2, xmm7, 10101010b  ;k2
+    pshuflw xmm3, xmm7, 11111111b  ;k3
+    pshufhw xmm4, xmm7, 0b         ;k4
+    pshufhw xmm5, xmm7, 01010101b  ;k5
+    pshufhw xmm6, xmm7, 10101010b  ;k6
+    pshufhw xmm7, xmm7, 11111111b  ;k7
+
+    punpcklwd xmm0, xmm0
+    punpcklwd xmm1, xmm1
+    punpcklwd xmm2, xmm2
+    punpcklwd xmm3, xmm3
+    punpckhwd xmm4, xmm4
+    punpckhwd xmm5, xmm5
+    punpckhwd xmm6, xmm6
+    punpckhwd xmm7, xmm7
+
+    movdqa k0, xmm0  ;store filter factors on stack
+    movdqa k1, xmm1
+    movdqa k2, xmm2
+    movdqa k3, xmm3
+    movdqa k4, xmm4
+    movdqa k5, xmm5
+    movdqa k6, xmm6
+    movdqa k7, xmm7
+
+    movq xmm6, rcx
+    pshufd xmm6, xmm6, 0
+    movdqa krd, xmm6  ;rounding
+
+    pxor xmm7, xmm7
+    movdqa zero, xmm7
+%endm
+
+%macro LOAD_VERT_8 1
+    movq xmm0, [rsi + %1]            ;0
+    movq xmm1, [rsi + rax + %1]      ;1
+    movq xmm6, [rsi + rdx * 2 + %1]  ;6
+    lea rsi, [rsi + rax]
+    movq xmm7, [rsi + rdx * 2 + %1]  ;7
+    movq xmm2, [rsi + rax + %1]      ;2
+    movq xmm3, [rsi + rax * 2 + %1]  ;3
+    movq xmm4, [rsi + rdx + %1]      ;4
+    movq xmm5, [rsi + rax * 4 + %1]  ;5
+%endm
+
+%macro APPLY_FILTER_8 2
+    punpcklbw xmm0, zero
+    punpcklbw xmm1, zero
+    punpcklbw xmm6, zero
+    punpcklbw xmm7, zero
+    punpcklbw xmm2, zero
+    punpcklbw xmm5, zero
+    punpcklbw xmm3, zero
+    punpcklbw xmm4, zero
+
+    pmullw xmm0, k0
+    pmullw xmm1, k1
+    pmullw xmm6, k6
+    pmullw xmm7, k7
+    pmullw xmm2, k2
+    pmullw xmm5, k5
+    pmullw xmm3, k3
+    pmullw xmm4, k4
+
+    paddsw xmm0, xmm1
+    paddsw xmm0, xmm6
+    paddsw xmm0, xmm7
+    paddsw xmm0, xmm2
+    paddsw xmm0, xmm5
+    paddsw xmm0, xmm3
+    paddsw xmm0, xmm4
+
+    paddsw xmm0, krd     ;rounding
+    psraw xmm0, 7        ;shift
+    packuswb xmm0, xmm0  ;pack back to byte
+%if %1
+    movq xmm1, [rdi + %2]
+    pavgb xmm0, xmm1
+%endif
+    movq [rdi + %2], xmm0
+%endm
+
+SECTION .text
+
+;void aom_filter_block1d4_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int out_pitch,
+;    unsigned int output_height,
+;    short *filter
+;)
+global sym(aom_filter_block1d4_v8_sse2) PRIVATE
+sym(aom_filter_block1d4_v8_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 16 * 6
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov rsi, arg(0)  ;src_ptr
+    mov rdi, arg(2)  ;output_ptr
+
+    movsxd rax, DWORD PTR arg(1)  ;pixels_per_line
+    movsxd rbx, DWORD PTR arg(3)  ;out_pitch
+    lea rdx, [rax + rax * 2]
+    movsxd rcx, DWORD PTR arg(4)  ;output_height
+
+.loop:
+    movd xmm0, [rsi]            ;load src: row 0
+    movd xmm1, [rsi + rax]      ;1
+    movd xmm6, [rsi + rdx * 2]  ;6
+    lea rsi, [rsi + rax]
+    movd xmm7, [rsi + rdx * 2]  ;7
+    movd xmm2, [rsi + rax]      ;2
+    movd xmm3, [rsi + rax * 2]  ;3
+    movd xmm4, [rsi + rdx]      ;4
+    movd xmm5, [rsi + rax * 4]  ;5
+
+    APPLY_FILTER_4 0
+
+    lea rdi, [rdi + rbx]
+    dec rcx
+    jnz .loop
+
+    add rsp, 16 * 6
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;void aom_filter_block1d8_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int out_pitch,
+;    unsigned int output_height,
+;    short *filter
+;)
+global sym(aom_filter_block1d8_v8_sse2) PRIVATE
+sym(aom_filter_block1d8_v8_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd rax, DWORD PTR arg(1)  ;pixels_per_line
+    movsxd rbx, DWORD PTR arg(3)  ;out_pitch
+    lea rdx, [rax + rax * 2]
+    movsxd rcx, DWORD PTR arg(4)  ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 0, 0
+
+    lea rdi, [rdi + rbx]
+    dec rcx
+    jnz .loop
+
+    add rsp, 16 * 10
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;void aom_filter_block1d16_v8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int src_pitch,
+;    unsigned char *output_ptr,
+;    unsigned int out_pitch,
+;    unsigned int output_height,
+;    short *filter
+;)
+global sym(aom_filter_block1d16_v8_sse2) PRIVATE
+sym(aom_filter_block1d16_v8_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd rax, DWORD PTR arg(1)  ;pixels_per_line
+    movsxd rbx, DWORD PTR arg(3)  ;out_pitch
+    lea rdx, [rax + rax * 2]
+    movsxd rcx, DWORD PTR arg(4)  ;output_height
+
+.loop:
+    LOAD_VERT_8 0
+    APPLY_FILTER_8 0, 0
+    sub rsi, rax
+
+    LOAD_VERT_8 8
+    APPLY_FILTER_8 0, 8
+    add rdi, rbx
+
+    dec rcx
+    jnz .loop
+
+    add rsp, 16 * 10
+    pop rsp
+    pop rbx
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;void aom_filter_block1d4_h8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    unsigned int output_pitch,
+;    unsigned int output_height,
+;    short *filter
+;)
+global sym(aom_filter_block1d4_h8_sse2) PRIVATE
+sym(aom_filter_block1d4_h8_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rsi
+    push rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 16 * 6
+    %define k0k1 [rsp + 16 * 0]
+    %define k2k3 [rsp + 16 * 1]
+    %define k5k4 [rsp + 16 * 2]
+    %define k6k7 [rsp + 16 * 3]
+    %define krd [rsp + 16 * 4]
+    %define zero [rsp + 16 * 5]
+
+    GET_FILTERS_4
+
+    mov rsi, arg(0)  ;src_ptr
+    mov rdi, arg(2)  ;output_ptr
+
+    movsxd rax, DWORD PTR arg(1)  ;pixels_per_line
+    movsxd rdx, DWORD PTR arg(3)  ;out_pitch
+    movsxd rcx, DWORD PTR arg(4)  ;output_height
+
+.loop:
+    movdqu xmm0, [rsi - 3]  ;load src
+
+    movdqa xmm1, xmm0
+    movdqa xmm6, xmm0
+    movdqa xmm7, xmm0
+    movdqa xmm2, xmm0
+    movdqa xmm3, xmm0
+    movdqa xmm5, xmm0
+    movdqa xmm4, xmm0
+
+    psrldq xmm1, 1
+    psrldq xmm6, 6
+    psrldq xmm7, 7
+    psrldq xmm2, 2
+    psrldq xmm3, 3
+    psrldq xmm5, 5
+    psrldq xmm4, 4
+
+    APPLY_FILTER_4 0
+
+    lea rsi, [rsi + rax]
+    lea rdi, [rdi + rdx]
+    dec rcx
+    jnz .loop
+
+    add rsp, 16 * 6
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;void aom_filter_block1d8_h8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    unsigned int output_pitch,
+;    unsigned int output_height,
+;    short *filter
+;)
+global sym(aom_filter_block1d8_h8_sse2) PRIVATE
+sym(aom_filter_block1d8_h8_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rsi
+    push rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd rax, DWORD PTR arg(1)  ;pixels_per_line
+    movsxd rdx, DWORD PTR arg(3)  ;out_pitch
+    movsxd rcx, DWORD PTR arg(4)  ;output_height
+
+.loop:
+    movdqu xmm0, [rsi - 3]  ;load src
+
+    movdqa xmm1, xmm0
+    movdqa xmm6, xmm0
+    movdqa xmm7, xmm0
+    movdqa xmm2, xmm0
+    movdqa xmm5, xmm0
+    movdqa xmm3, xmm0
+    movdqa xmm4, xmm0
+
+    psrldq xmm1, 1
+    psrldq xmm6, 6
+    psrldq xmm7, 7
+    psrldq xmm2, 2
+    psrldq xmm5, 5
+    psrldq xmm3, 3
+    psrldq xmm4, 4
+
+    APPLY_FILTER_8 0, 0
+
+    lea rsi, [rsi + rax]
+    lea rdi, [rdi + rdx]
+    dec rcx
+    jnz .loop
+
+    add rsp, 16 * 10
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+;void aom_filter_block1d16_h8_sse2
+;(
+;    unsigned char *src_ptr,
+;    unsigned int src_pixels_per_line,
+;    unsigned char *output_ptr,
+;    unsigned int output_pitch,
+;    unsigned int output_height,
+;    short *filter
+;)
+global sym(aom_filter_block1d16_h8_sse2) PRIVATE
+sym(aom_filter_block1d16_h8_sse2):
+    push rbp
+    mov rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push rsi
+    push rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub rsp, 16 * 10
+    %define k0 [rsp + 16 * 0]
+    %define k1 [rsp + 16 * 1]
+    %define k2 [rsp + 16 * 2]
+    %define k3 [rsp + 16 * 3]
+    %define k4 [rsp + 16 * 4]
+    %define k5 [rsp + 16 * 5]
+    %define k6 [rsp + 16 * 6]
+    %define k7 [rsp + 16 * 7]
+    %define krd [rsp + 16 * 8]
+    %define zero [rsp + 16 * 9]
+
+    GET_FILTERS
+
+    movsxd rax, DWORD PTR arg(1)  ;pixels_per_line
+    movsxd rdx, DWORD PTR arg(3)  ;out_pitch
+    movsxd rcx, DWORD PTR arg(4)  ;output_height
+
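+    ; (Editor's note: the loop below runs the 8-tap filter twice per row --
+    ; bytes 0..7 from [rsi - 3] and bytes 8..15 from [rsi + 5], i.e.
+    ; rsi - 3 + 8 -- writing the two halves at output offsets 0 and 8.)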
+.loop:
+    movdqu xmm0, [rsi - 3]  ;load src
+
+    movdqa xmm1, xmm0
+    movdqa xmm6, xmm0
+    movdqa xmm7, xmm0
+    movdqa xmm2, xmm0
+    movdqa xmm5, xmm0
+    movdqa xmm3, xmm0
+    movdqa xmm4, xmm0
+
+    psrldq xmm1, 1
+    psrldq xmm6, 6
+    psrldq xmm7, 7
+    psrldq xmm2, 2
+    psrldq xmm5, 5
+    psrldq xmm3, 3
+    psrldq xmm4, 4
+
+    APPLY_FILTER_8 0, 0
+
+    movdqu xmm0, [rsi + 5]  ;load src
+
+    movdqa xmm1, xmm0
+    movdqa xmm6, xmm0
+    movdqa xmm7, xmm0
+    movdqa xmm2, xmm0
+    movdqa xmm5, xmm0
+    movdqa xmm3, xmm0
+    movdqa xmm4, xmm0
+
+    psrldq xmm1, 1
+    psrldq xmm6, 6
+    psrldq xmm7, 7
+    psrldq xmm2, 2
+    psrldq xmm5, 5
+    psrldq xmm3, 3
+    psrldq xmm4, 4
+
+    APPLY_FILTER_8 0, 8
+
+    lea rsi, [rsi + rax]
+    lea rdi, [rdi + rdx]
+    dec rcx
+    jnz .loop
+
+    add rsp, 16 * 10
+    pop rsp
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
new file mode 100644
index 000000000..3ca7921b6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -0,0 +1,870 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_64: times 8 dw 64
+even_byte_mask: times 8 dw 0x00ff
+
+; %define USE_PMULHRSW
+; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
+; when using this instruction.
+;
+; The add order below (based on ffav1) must be followed to prevent
+; out-of-range intermediate sums.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
+
+SECTION .text
+%define LOCAL_VARS_SIZE 16*6
+
+%macro SETUP_LOCAL_VARS 0
+    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64
+
+    ; pmaddubsw has a higher latency on some platforms; this might be eased
+    ; by interleaving the instructions.
+    %define k0k1 [rsp + 16*0]
+    %define k2k3 [rsp + 16*1]
+    %define k4k5 [rsp + 16*2]
+    %define k6k7 [rsp + 16*3]
+    packsswb m4, m4
+    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
+    ; some platforms.
+ pshuflw m0, m4, 0b ;k0_k1 + pshuflw m1, m4, 01010101b ;k2_k3 + pshuflw m2, m4, 10101010b ;k4_k5 + pshuflw m3, m4, 11111111b ;k6_k7 + punpcklqdq m0, m0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + mova k0k1, m0 + mova k2k3, m1 + mova k4k5, m2 + mova k6k7, m3 +%if ARCH_X86_64 + %define krd m12 + %define tmp0 [rsp + 16*4] + %define tmp1 [rsp + 16*5] + mova krd, [GLOBAL(pw_64)] +%else + %define krd [rsp + 16*4] +%if CONFIG_PIC=0 + mova m6, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m6, m6 ;all ones + psrlw m6, 15 + psllw m6, 6 ;aka pw_64 +%endif + mova krd, m6 +%endif +%endm + +;------------------------------------------------------------------------------- +%if ARCH_X86_64 + %define LOCAL_VARS_SIZE_H4 0 +%else + %define LOCAL_VARS_SIZE_H4 16*4 +%endif + +%macro SUBPIX_HFILTER4 1 +cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + packsswb m4, m4 +%if ARCH_X86_64 + %define k0k1k4k5 m8 + %define k2k3k6k7 m9 + %define krd m10 + mova krd, [GLOBAL(pw_64)] + pshuflw k0k1k4k5, m4, 0b ;k0_k1 + pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 + pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 + pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 +%else + %define k0k1k4k5 [rsp + 16*0] + %define k2k3k6k7 [rsp + 16*1] + %define krd [rsp + 16*2] + pshuflw m6, m4, 0b ;k0_k1 + pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 + pshuflw m7, m4, 01010101b ;k2_k3 + pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 +%if CONFIG_PIC=0 + mova m1, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m1, m1 ;all ones + psrlw m1, 15 + psllw m1, 6 ;aka pw_64 +%endif + mova k0k1k4k5, m6 + mova k2k3k6k7, m7 + mova krd, m1 +%endif + dec heightd + +.loop: + ;Do two rows at once + movu m4, [srcq - 3] + movu m5, [srcq + sstrideq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + punpckhbw m3, m5, m5 + punpcklbw m5, m5 + palignr m0, m1, m4, 1 + pmaddubsw m0, k0k1k4k5 + palignr m1, m4, 5 + pmaddubsw m1, k2k3k6k7 + palignr m2, m3, m5, 1 + pmaddubsw m2, k0k1k4k5 + palignr m3, m5, 5 + pmaddubsw m3, k2k3k6k7 + punpckhqdq m4, m0, m2 + punpcklqdq m0, m2 + punpckhqdq m5, m1, m3 + punpcklqdq m1, m3 + paddsw m0, m4 + paddsw m1, m5 +%ifidn %1, h8_avg + movd m4, [dstq] + movd m5, [dstq + dstrideq] +%endif + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 2 + punpcklbw m4, m3 + paddsw m0, m4 +%endif + packuswb m0, m0 + psrldq m1, m0, 4 + +%ifidn %1, h8_avg + pavgb m0, m4 + pavgb m1, m5 +%endif + movd [dstq], m0 + movd [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m4, [srcq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + palignr m0, m1, m4, 1 + palignr m1, m4, 5 + pmaddubsw m0, k0k1k4k5 + pmaddubsw m1, k2k3k6k7 + psrldq m2, m0, 8 + psrldq m3, m1, 8 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + punpcklbw m4, m3 + paddsw m0, m4 +%endif + packuswb m0, m0 +%ifidn %1, h8_avg + movd m4, [dstq] + pavgb m0, m4 +%endif + movd [dstq], m0 +.done: + REP_RET +%endm + 
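+; (Editor's reference sketch, in C, of what each h8 kernel computes per
+; output pixel; a hedged reading of the code above, not part of the build:
+;   int sum = 64;                         // rounding term, krd
+;   for (int k = 0; k < 8; ++k)
+;     sum += src[x + k - 3] * filter[k];  // taps straddle the output pixel
+;   dst[x] = clamp(sum >> 7, 0, 255);
+; the SIMD code reorders the saturating adds so intermediates stay in range
+; while targeting the same result.)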
+;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER8 1 +cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + dec heightd + +.loop: + ;Do two rows at once + movu m0, [srcq - 3] + movu m4, [srcq + sstrideq - 3] + punpckhbw m1, m0, m0 + punpcklbw m0, m0 + palignr m5, m1, m0, 13 + pmaddubsw m5, k6k7 + palignr m2, m1, m0, 5 + palignr m3, m1, m0, 9 + palignr m1, m0, 1 + pmaddubsw m1, k0k1 + punpckhbw m6, m4, m4 + punpcklbw m4, m4 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + + palignr m7, m6, m4, 13 + palignr m0, m6, m4, 5 + pmaddubsw m7, k6k7 + paddsw m1, m3 + paddsw m2, m5 + paddsw m1, m2 +%ifidn %1, h8_avg + movh m2, [dstq] + movhps m2, [dstq + dstrideq] +%endif + palignr m5, m6, m4, 9 + palignr m6, m4, 1 + pmaddubsw m0, k2k3 + pmaddubsw m6, k0k1 + paddsw m1, krd + pmaddubsw m5, k4k5 + psraw m1, 7 + paddsw m0, m7 + paddsw m6, m5 + paddsw m6, m0 + paddsw m6, krd + psraw m6, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpcklbw m4, m3 + punpcklbw m5, m3 + paddsw m1, m4 + paddsw m6, m5 +%endif + packuswb m1, m6 +%ifidn %1, h8_avg + pavgb m1, m2 +%endif + movh [dstq], m1 + movhps [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m0, [srcq - 3] + punpckhbw m3, m0, m0 + punpcklbw m0, m0 + palignr m1, m3, m0, 1 + palignr m2, m3, m0, 5 + palignr m4, m3, m0, 13 + palignr m3, m0, 9 + pmaddubsw m1, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + pmaddubsw m4, k6k7 + paddsw m1, m3 + paddsw m4, m2 + paddsw m1, m4 + paddsw m1, krd + psraw m1, 7 +%ifidn %1, h8_add_src + pxor m6, m6 + movu m5, [srcq] + punpcklbw m5, m6 + paddsw m1, m5 +%endif + packuswb m1, m1 +%ifidn %1, h8_avg + movh m0, [dstq] + pavgb m1, m0 +%endif + movh [dstq], m1 +.done: + REP_RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER16 1 +cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +.loop: + prefetcht0 [srcq + 2 * sstrideq -3] + + movu m0, [srcq - 3] + movu m4, [srcq - 2] + pmaddubsw m0, k0k1 + pmaddubsw m4, k0k1 + movu m1, [srcq - 1] + movu m5, [srcq + 0] + pmaddubsw m1, k2k3 + pmaddubsw m5, k2k3 + movu m2, [srcq + 1] + movu m6, [srcq + 2] + pmaddubsw m2, k4k5 + pmaddubsw m6, k4k5 + movu m3, [srcq + 3] + movu m7, [srcq + 4] + pmaddubsw m3, k6k7 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m4, m6 + paddsw m5, m7 + paddsw m4, m5 + paddsw m0, krd + paddsw m4, krd + psraw m0, 7 + psraw m4, 7 +%ifidn %1, h8_add_src +%if ARCH_X86=1 && CONFIG_PIC=1 + pcmpeqb m2, m2 ;all ones + psrlw m2, 8 ;even_byte_mask +%else + mova m2, [GLOBAL(even_byte_mask)] +%endif + movu m5, [srcq] + mova m7, m5 + pand m5, m2 + psrlw m7, 8 + paddsw m0, m5 + paddsw m4, m7 +%endif + packuswb m0, m0 + packuswb m4, m4 + punpcklbw m0, m4 +%ifidn %1, h8_avg + pavgb m0, [dstq] +%endif + lea srcq, [srcq + sstrideq] + mova [dstq], m0 + lea dstq, [dstq + dstrideq] + dec heightd + jnz .loop + REP_RET +%endm + +INIT_XMM ssse3 +SUBPIX_HFILTER16 h8 +SUBPIX_HFILTER8 h8 +SUBPIX_HFILTER4 h8 + +;------------------------------------------------------------------------------- + 
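+; (Editor's note: through x86inc's cglobal name decoration, the three
+; instantiations above should emit aom_filter_block1d16_h8_ssse3,
+; aom_filter_block1d8_h8_ssse3 and aom_filter_block1d4_h8_ssse3 -- the
+; symbols declared as filter8_1dfunction in the C file earlier in this
+; patch.)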
+; TODO(Linfeng): Detect cpu type and choose the code with better performance. +%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 + +%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + %define NUM_GENERAL_REG_USED 9 +%else + %define NUM_GENERAL_REG_USED 6 +%endif + +%macro SUBPIX_VFILTER 2 +cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%ifidn %2, 8 + %define movx movh +%else + %define movx movd +%endif + + dec heightd + +%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + ;Do two rows at once + movx m0, [srcq ] ;A + movx m1, [src1q ] ;B + punpcklbw m0, m1 ;A B + movx m2, [srcq + sstrideq * 2 ] ;C + pmaddubsw m0, k0k1 + mova m6, m2 + movx m3, [src1q + sstrideq * 2] ;D + punpcklbw m2, m3 ;C D + pmaddubsw m2, k2k3 + movx m4, [srcq + sstrideq * 4 ] ;E + mova m7, m4 + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m4, k4k5 + punpcklbw m1, m6 ;A B next iter + movx m6, [srcq + sstride6q ] ;G + punpcklbw m5, m6 ;E F next iter + punpcklbw m3, m7 ;C D next iter + pmaddubsw m5, k4k5 + movx m7, [src1q + sstride6q ] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m6, k6k7 + pmaddubsw m3, k2k3 + pmaddubsw m1, k0k1 + paddsw m0, m4 + paddsw m2, m6 + movx m6, [srcq + sstrideq * 8 ] ;H next iter + punpcklbw m7, m6 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + paddsw m1, m5 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif + packuswb m0, m0 + + paddsw m3, m7 + paddsw m1, m3 + paddsw m1, krd + psraw m1, 7 +%ifidn %1, v8_add_src + movu m4, [src1q] + punpcklbw m4, m6 + paddsw m1, m4 +%endif + lea srcq, [srcq + sstrideq * 2 ] + lea src1q, [src1q + sstrideq * 2] + packuswb m1, m1 + +%ifidn %1, v8_avg + movx m2, [dstq] + pavgb m0, m2 +%endif + movx [dstq], m0 + add dstq, dst_stride +%ifidn %1, v8_avg + movx m3, [dstq] + pavgb m1, m3 +%endif + movx [dstq], m1 + add dstq, dst_stride + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + movx m6, [srcq + sstride6q ] ;G + punpcklbw m0, m1 ;A B + movx m7, [src1q + sstride6q ] ;H + pmaddubsw m0, k0k1 + movx m2, [srcq + sstrideq * 2 ] ;C + punpcklbw m6, m7 ;G H + movx m3, [src1q + sstrideq * 2] ;D + pmaddubsw m6, k6k7 + movx m4, [srcq + sstrideq * 4 ] ;E + punpcklbw m2, m3 ;C D + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + paddsw m2, m6 + paddsw m0, m4 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%else + ; ARCH_X86_64 + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2 ] + movx m2, [srcq] ;C + movx m3, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2 ] + movx m4, [srcq] ;E + movx m5, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2 ] + movx m6, [srcq] ;G + punpcklbw m0, m1 ;A B + punpcklbw m1, m2 ;A B next iter + punpcklbw m2, m3 ;C D + punpcklbw m3, m4 ;C D 
next iter + punpcklbw m4, m5 ;E F + punpcklbw m5, m6 ;E F next iter + +.loop: + ;Do two rows at once + movx m7, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2 ] + movx m14, [srcq] ;H next iter + punpcklbw m6, m7 ;G H + punpcklbw m7, m14 ;G H next iter + pmaddubsw m8, m0, k0k1 + pmaddubsw m9, m1, k0k1 + mova m0, m2 + mova m1, m3 + pmaddubsw m10, m2, k2k3 + pmaddubsw m11, m3, k2k3 + mova m2, m4 + mova m3, m5 + pmaddubsw m4, k4k5 + pmaddubsw m5, k4k5 + paddsw m8, m4 + paddsw m9, m5 + mova m4, m6 + mova m5, m7 + pmaddubsw m6, k6k7 + pmaddubsw m7, k6k7 + paddsw m10, m6 + paddsw m11, m7 + paddsw m8, m10 + paddsw m9, m11 + mova m6, m14 + paddsw m8, krd + paddsw m9, krd + psraw m8, 7 + psraw m9, 7 +%ifidn %2, 4 + packuswb m8, m8 + packuswb m9, m9 +%else + packuswb m8, m9 +%endif + +%ifidn %1, v8_avg + movx m7, [dstq] +%ifidn %2, 4 + movx m10, [dstq + dstrideq] + pavgb m9, m10 +%else + movhpd m7, [dstq + dstrideq] +%endif + pavgb m8, m7 +%endif + movx [dstq], m8 +%ifidn %2, 4 + movx [dstq + dstrideq], m9 +%else + movhpd [dstq + dstrideq], m8 +%endif + + lea dstq, [dstq + dstrideq * 2 ] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m7, [srcq + sstrideq] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + pmaddubsw m6, k6k7 + paddsw m0, m4 + paddsw m2, m6 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%endif ; ARCH_X86_64 + +.done: + REP_RET + +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_VFILTER16 1 +cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + lea src1q, [srcq + sstrideq] + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + movh m0, [srcq ] ;A + movh m1, [src1q ] ;B + movh m2, [srcq + sstrideq * 2 ] ;C + movh m3, [src1q + sstrideq * 2] ;D + movh m4, [srcq + sstrideq * 4 ] ;E + movh m5, [src1q + sstrideq * 4] ;F + + punpcklbw m0, m1 ;A B + movh m6, [srcq + sstride6q] ;G + punpcklbw m2, m3 ;C D + movh m7, [src1q + sstride6q] ;H + punpcklbw m4, m5 ;E F + pmaddubsw m0, k0k1 + movh m3, [srcq + 8] ;A + pmaddubsw m2, k2k3 + punpcklbw m6, m7 ;G H + movh m5, [srcq + sstrideq + 8] ;B + pmaddubsw m4, k4k5 + punpcklbw m3, m5 ;A B + movh m7, [srcq + sstrideq * 2 + 8] ;C + pmaddubsw m6, k6k7 + movh m5, [src1q + sstrideq * 2 + 8] ;D + punpcklbw m7, m5 ;C D + paddsw m2, m6 + pmaddubsw m3, k0k1 + movh m1, [srcq + sstrideq * 4 + 8] ;E + paddsw m0, m4 + pmaddubsw m7, k2k3 + movh m6, [src1q + sstrideq * 4 + 8] ;F + punpcklbw m1, m6 ;E F + paddsw m0, m2 + paddsw m0, krd + movh m2, [srcq + sstride6q + 8] ;G + pmaddubsw m1, k4k5 + movh m5, [src1q + sstride6q + 8] ;H + psraw m0, 7 + punpcklbw m2, m5 ;G H + pmaddubsw m2, k6k7 + paddsw m7, m2 + paddsw m3, m1 + paddsw m3, m7 + paddsw m3, krd + psraw m3, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down + mova m5, m4 + punpcklbw m4, m6 + punpckhbw m5, m6 + paddsw m0, m4 + paddsw m3, m5 +%endif + packuswb m0, m3 + + add srcq, sstrideq + add src1q, sstrideq +%ifidn %1, v8_avg + pavgb 
m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dst_stride + dec heightd + jnz .loop + REP_RET + +%else + ; ARCH_X86_64 + dec heightd + + movu m1, [srcq ] ;A + movu m3, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2] + punpcklbw m0, m1, m3 ;A B + punpckhbw m1, m3 ;A B + movu m5, [srcq] ;C + punpcklbw m2, m3, m5 ;A B next iter + punpckhbw m3, m5 ;A B next iter + mova tmp0, m2 ;store to stack + mova tmp1, m3 ;store to stack + movu m7, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2] + punpcklbw m4, m5, m7 ;C D + punpckhbw m5, m7 ;C D + movu m9, [srcq] ;E + punpcklbw m6, m7, m9 ;C D next iter + punpckhbw m7, m9 ;C D next iter + movu m11, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2] + punpcklbw m8, m9, m11 ;E F + punpckhbw m9, m11 ;E F + movu m2, [srcq] ;G + punpcklbw m10, m11, m2 ;E F next iter + punpckhbw m11, m2 ;E F next iter + +.loop: + ;Do two rows at once + pmaddubsw m13, m0, k0k1 + mova m0, m4 + pmaddubsw m14, m8, k4k5 + pmaddubsw m15, m4, k2k3 + mova m4, m8 + paddsw m13, m14 + movu m3, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2] + punpcklbw m14, m2, m3 ;G H + mova m8, m14 + pmaddubsw m14, k6k7 + paddsw m15, m14 + paddsw m13, m15 + paddsw m13, krd + psraw m13, 7 + + pmaddubsw m14, m1, k0k1 + pmaddubsw m1, m9, k4k5 + pmaddubsw m15, m5, k2k3 + paddsw m14, m1 + mova m1, m5 + mova m5, m9 + punpckhbw m2, m3 ;G H + mova m9, m2 + pmaddubsw m2, k6k7 + paddsw m15, m2 + paddsw m14, m15 + paddsw m14, krd + psraw m14, 7 + packuswb m13, m14 +%ifidn %1, v8_avg + pavgb m13, [dstq] +%endif + mova [dstq], m13 + + ; next iter + pmaddubsw m15, tmp0, k0k1 + pmaddubsw m14, m10, k4k5 + pmaddubsw m13, m6, k2k3 + paddsw m15, m14 + mova tmp0, m6 + mova m6, m10 + movu m2, [srcq] ;G next iter + punpcklbw m14, m3, m2 ;G H next iter + mova m10, m14 + pmaddubsw m14, k6k7 + paddsw m13, m14 + paddsw m15, m13 + paddsw m15, krd + psraw m15, 7 + + pmaddubsw m14, tmp1, k0k1 + mova tmp1, m7 + pmaddubsw m13, m7, k2k3 + mova m7, m11 + pmaddubsw m11, k4k5 + paddsw m14, m11 + punpckhbw m3, m2 ;G H next iter + mova m11, m3 + pmaddubsw m3, k6k7 + paddsw m13, m3 + paddsw m14, m13 + paddsw m14, krd + psraw m14, 7 + packuswb m15, m14 +%ifidn %1, v8_avg + pavgb m15, [dstq + dstrideq] +%endif + mova [dstq + dstrideq], m15 + lea dstq, [dstq + dstrideq * 2] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m3, [srcq + sstrideq] ;H + punpcklbw m6, m2, m3 ;G H + punpckhbw m2, m3 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m1, k0k1 + pmaddubsw m4, k2k3 + pmaddubsw m5, k2k3 + pmaddubsw m8, k4k5 + pmaddubsw m9, k4k5 + pmaddubsw m6, k6k7 + pmaddubsw m2, k6k7 + paddsw m0, m8 + paddsw m1, m9 + paddsw m4, m6 + paddsw m5, m2 + paddsw m0, m4 + paddsw m1, m5 + paddsw m0, krd + paddsw m1, krd + psraw m0, 7 + psraw m1, 7 + packuswb m0, m1 +%ifidn %1, v8_avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + +.done: + REP_RET + +%endif ; ARCH_X86_64 + +%endm + +INIT_XMM ssse3 +SUBPIX_VFILTER16 v8 +SUBPIX_VFILTER v8, 8 +SUBPIX_VFILTER v8, 4 diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..d0b4b2839 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm @@ -0,0 +1,295 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklqdq xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + pxor xmm2, xmm2 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + + punpckldq xmm0, xmm1 ;two row in one register + punpcklbw xmm0, xmm2 ;unpack to word + pmullw xmm0, xmm4 ;multiply the filter factors + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + + paddsw xmm0, xmm3 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + + pshuflw xmm6, xmm7, 11111111b ;k3 + pshufhw xmm7, xmm7, 0b ;k4 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + pxor xmm5, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + pmullw xmm2, xmm6 + pmullw xmm3, xmm7 + + paddsw xmm0, xmm1 + paddsw xmm2, xmm3 + + paddsw xmm0, xmm4 ;rounding + paddsw xmm2, xmm4 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +SECTION .text + +global sym(aom_filter_block1d4_v2_sse2) PRIVATE +sym(aom_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_sse2) PRIVATE +sym(aom_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_sse2) PRIVATE 
+sym(aom_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_sse2) PRIVATE +sym(aom_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_sse2) PRIVATE +sym(aom_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h2_sse2) PRIVATE +sym(aom_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm new file mode 100644 index 000000000..59edc49a9 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm @@ -0,0 +1,267 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movd xmm2, ecx ;rounding_shift + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movd xmm6, ecx ;rounding_shift + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + pmulhrsw xmm2, xmm6 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +SECTION .text + +global sym(aom_filter_block1d4_v2_ssse3) PRIVATE +sym(aom_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_ssse3) PRIVATE +sym(aom_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_ssse3) PRIVATE +sym(aom_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_ssse3) PRIVATE +sym(aom_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_ssse3) 
PRIVATE
+sym(aom_filter_block1d8_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]            ;load src
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 1
+
+    APPLY_FILTER_8 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+global sym(aom_filter_block1d16_h2_ssse3) PRIVATE
+sym(aom_filter_block1d16_h2_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    GET_PARAM
+.loop:
+    movdqu      xmm0, [rsi]            ;load src
+    movdqu      xmm1, [rsi + 1]
+    movdqa      xmm2, xmm0
+
+    APPLY_FILTER_16 0
+    jnz         .loop
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libaom/src/aom_dsp/x86/avg_intrin_avx2.c b/libs/libaom/src/aom_dsp/x86/avg_intrin_avx2.c
new file mode 100644
index 000000000..3bbffbd80
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/avg_intrin_avx2.c
@@ -0,0 +1,504 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_avx2.h"
+#include "aom_ports/mem.h"
+
+static void hadamard_col8x2_avx2(__m256i *in, int iter) {
+  __m256i a0 = in[0];
+  __m256i a1 = in[1];
+  __m256i a2 = in[2];
+  __m256i a3 = in[3];
+  __m256i a4 = in[4];
+  __m256i a5 = in[5];
+  __m256i a6 = in[6];
+  __m256i a7 = in[7];
+
+  __m256i b0 = _mm256_add_epi16(a0, a1);
+  __m256i b1 = _mm256_sub_epi16(a0, a1);
+  __m256i b2 = _mm256_add_epi16(a2, a3);
+  __m256i b3 = _mm256_sub_epi16(a2, a3);
+  __m256i b4 = _mm256_add_epi16(a4, a5);
+  __m256i b5 = _mm256_sub_epi16(a4, a5);
+  __m256i b6 = _mm256_add_epi16(a6, a7);
+  __m256i b7 = _mm256_sub_epi16(a6, a7);
+
+  a0 = _mm256_add_epi16(b0, b2);
+  a1 = _mm256_add_epi16(b1, b3);
+  a2 = _mm256_sub_epi16(b0, b2);
+  a3 = _mm256_sub_epi16(b1, b3);
+  a4 = _mm256_add_epi16(b4, b6);
+  a5 = _mm256_add_epi16(b5, b7);
+  a6 = _mm256_sub_epi16(b4, b6);
+  a7 = _mm256_sub_epi16(b5, b7);
+
+  if (iter == 0) {
+    b0 = _mm256_add_epi16(a0, a4);
+    b7 = _mm256_add_epi16(a1, a5);
+    b3 = _mm256_add_epi16(a2, a6);
+    b4 = _mm256_add_epi16(a3, a7);
+    b2 = _mm256_sub_epi16(a0, a4);
+    b6 = _mm256_sub_epi16(a1, a5);
+    b1 = _mm256_sub_epi16(a2, a6);
+    b5 = _mm256_sub_epi16(a3, a7);
+
+    a0 = _mm256_unpacklo_epi16(b0, b1);
+    a1 = _mm256_unpacklo_epi16(b2, b3);
+    a2 = _mm256_unpackhi_epi16(b0, b1);
+    a3 = _mm256_unpackhi_epi16(b2, b3);
+    a4 = _mm256_unpacklo_epi16(b4, b5);
+    a5 = _mm256_unpacklo_epi16(b6, b7);
+    a6 = _mm256_unpackhi_epi16(b4, b5);
+    a7 = _mm256_unpackhi_epi16(b6, b7);
+
+    b0 = _mm256_unpacklo_epi32(a0, a1);
+    b1 = _mm256_unpacklo_epi32(a4, a5);
+    b2 = _mm256_unpackhi_epi32(a0, a1);
+    b3 = _mm256_unpackhi_epi32(a4, a5);
+    b4 = _mm256_unpacklo_epi32(a2, a3);
+    b5 = _mm256_unpacklo_epi32(a6, a7);
+    b6 = _mm256_unpackhi_epi32(a2, a3);
+    b7 = _mm256_unpackhi_epi32(a6, a7);
+
+    in[0] = _mm256_unpacklo_epi64(b0, b1);
+    in[1] = _mm256_unpackhi_epi64(b0, b1);
+    in[2] = _mm256_unpacklo_epi64(b2, b3);
+
in[3] = _mm256_unpackhi_epi64(b2, b3); + in[4] = _mm256_unpacklo_epi64(b4, b5); + in[5] = _mm256_unpackhi_epi64(b4, b5); + in[6] = _mm256_unpacklo_epi64(b6, b7); + in[7] = _mm256_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm256_add_epi16(a0, a4); + in[7] = _mm256_add_epi16(a1, a5); + in[3] = _mm256_add_epi16(a2, a6); + in[4] = _mm256_add_epi16(a3, a7); + in[2] = _mm256_sub_epi16(a0, a4); + in[6] = _mm256_sub_epi16(a1, a5); + in[1] = _mm256_sub_epi16(a2, a6); + in[5] = _mm256_sub_epi16(a3, a7); + } +} + +static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m256i src[8]; + src[0] = _mm256_loadu_si256((const __m256i *)src_diff); + src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + + hadamard_col8x2_avx2(src, 0); + hadamard_col8x2_avx2(src, 1); + + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x31)); +} + +static INLINE void hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 2; ++idx) { + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + if (is_final) { + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + coeff += 16; + } else { + _mm256_storeu_si256((__m256i *)coeff16, 
_mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3)); + coeff16 += 16; + } + t_coeff += 16; + } +} + +void aom_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_avx2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_lp_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t *t_coeff = coeff; + for (int idx = 0; idx < 2; ++idx) { + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (int idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + _mm256_storeu_si256((__m256i *)coeff, _mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff + 192), _mm256_sub_epi16(b1, b3)); + coeff += 16; + t_coeff += 16; + } +} + +void aom_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. 
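+  // Editor's note: the 32x32 transform below is built from four 16x16
+  // sub-transforms; their outputs are then combined with one more butterfly
+  // stage (the add/sub pairs in the loop) plus an extra >> 2 normalization.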
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_avx2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 2); + b1 = _mm256_srai_epi16(b1, 2); + b2 = _mm256_srai_epi16(b2, 2); + b3 = _mm256_srai_epi16(b3, 2); + + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768); + + coeff += 16; + t_coeff += 16; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_hadamard_col8_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi32(a0, a1); + __m256i b1 = _mm256_sub_epi32(a0, a1); + __m256i b2 = _mm256_add_epi32(a2, a3); + __m256i b3 = _mm256_sub_epi32(a2, a3); + __m256i b4 = _mm256_add_epi32(a4, a5); + __m256i b5 = _mm256_sub_epi32(a4, a5); + __m256i b6 = _mm256_add_epi32(a6, a7); + __m256i b7 = _mm256_sub_epi32(a6, a7); + + a0 = _mm256_add_epi32(b0, b2); + a1 = _mm256_add_epi32(b1, b3); + a2 = _mm256_sub_epi32(b0, b2); + a3 = _mm256_sub_epi32(b1, b3); + a4 = _mm256_add_epi32(b4, b6); + a5 = _mm256_add_epi32(b5, b7); + a6 = _mm256_sub_epi32(b4, b6); + a7 = _mm256_sub_epi32(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi32(a0, a4); + b7 = _mm256_add_epi32(a1, a5); + b3 = _mm256_add_epi32(a2, a6); + b4 = _mm256_add_epi32(a3, a7); + b2 = _mm256_sub_epi32(a0, a4); + b6 = _mm256_sub_epi32(a1, a5); + b1 = _mm256_sub_epi32(a2, a6); + b5 = _mm256_sub_epi32(a3, a7); + + a0 = _mm256_unpacklo_epi32(b0, b1); + a1 = _mm256_unpacklo_epi32(b2, b3); + a2 = _mm256_unpackhi_epi32(b0, b1); + a3 = _mm256_unpackhi_epi32(b2, b3); + a4 = _mm256_unpacklo_epi32(b4, b5); + a5 = _mm256_unpacklo_epi32(b6, b7); + a6 = _mm256_unpackhi_epi32(b4, b5); + a7 = _mm256_unpackhi_epi32(b6, b7); + + b0 = _mm256_unpacklo_epi64(a0, a1); + b1 = _mm256_unpacklo_epi64(a4, a5); + b2 = _mm256_unpackhi_epi64(a0, a1); + b3 = _mm256_unpackhi_epi64(a4, a5); + b4 = _mm256_unpacklo_epi64(a2, a3); + b5 = _mm256_unpacklo_epi64(a6, a7); + b6 = _mm256_unpackhi_epi64(a2, a3); + b7 = _mm256_unpackhi_epi64(a6, a7); + + in[0] = _mm256_permute2x128_si256(b0, b1, 0x20); + in[1] = _mm256_permute2x128_si256(b0, b1, 0x31); + in[2] = _mm256_permute2x128_si256(b2, b3, 0x20); + in[3] = _mm256_permute2x128_si256(b2, b3, 0x31); + in[4] = _mm256_permute2x128_si256(b4, b5, 0x20); + in[5] = _mm256_permute2x128_si256(b4, b5, 0x31); + in[6] = _mm256_permute2x128_si256(b6, b7, 0x20); + in[7] = _mm256_permute2x128_si256(b6, b7, 0x31); + } else { + in[0] = _mm256_add_epi32(a0, a4); + in[7] = 
_mm256_add_epi32(a1, a5); + in[3] = _mm256_add_epi32(a2, a6); + in[4] = _mm256_add_epi32(a3, a7); + in[2] = _mm256_sub_epi32(a0, a4); + in[6] = _mm256_sub_epi32(a1, a5); + in[1] = _mm256_sub_epi32(a2, a6); + in[5] = _mm256_sub_epi32(a3, a7); + } +} + +void aom_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + __m128i src16[8]; + __m256i src32[8]; + + src16[0] = _mm_loadu_si128((const __m128i *)src_diff); + src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + + src32[0] = _mm256_cvtepi16_epi32(src16[0]); + src32[1] = _mm256_cvtepi16_epi32(src16[1]); + src32[2] = _mm256_cvtepi16_epi32(src16[2]); + src32[3] = _mm256_cvtepi16_epi32(src16[3]); + src32[4] = _mm256_cvtepi16_epi32(src16[4]); + src32[5] = _mm256_cvtepi16_epi32(src16[5]); + src32[6] = _mm256_cvtepi16_epi32(src16[6]); + src32[7] = _mm256_cvtepi16_epi32(src16[7]); + + highbd_hadamard_col8_avx2(src32, 0); + highbd_hadamard_col8_avx2(src32, 1); + + _mm256_storeu_si256((__m256i *)coeff, src32[0]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[1]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[2]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[3]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[4]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[5]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[6]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[7]); +} + +void aom_highbd_hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64); + } + + for (idx = 0; idx < 64; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 1); + b1 = _mm256_srai_epi32(b1, 1); + b2 = _mm256_srai_epi32(b2, 1); + b3 = _mm256_srai_epi32(b3, 1); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3); + + coeff += 8; + t_coeff += 8; + } +} + +void aom_highbd_hadamard_32x32_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = 
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + aom_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256); + } + + for (idx = 0; idx < 256; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 2); + b1 = _mm256_srai_epi32(b1, 2); + b2 = _mm256_srai_epi32(b2, 2); + b3 = _mm256_srai_epi32(b3, 2); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3); + + coeff += 8; + t_coeff += 8; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int aom_satd_avx2(const tran_low_t *coeff, int length) { + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 8, coeff += 8) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi32(src_line); + accum = _mm256_add_epi32(accum, abs); + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} + +int aom_satd_lp_avx2(const int16_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + + for (int i = 0; i < length; i += 16) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} diff --git a/libs/libaom/src/aom_dsp/x86/avg_intrin_sse2.c b/libs/libaom/src/aom_dsp/x86/avg_intrin_sse2.c new file mode 100644 index 000000000..260ca2ad1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/avg_intrin_sse2.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_ports/mem.h"
+
+void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+                         int *min, int *max) {
+  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+  u0 = _mm_setzero_si128();
+  // Row 0
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff0 = _mm_max_epi16(diff, negdiff);
+  // Row 1
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+  // Row 2
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 3
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 4
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 5
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 6
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 7
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+  *max = _mm_extract_epi16(maxabsdiff,
0); + + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); + *min = _mm_extract_epi16(minabsdiff, 0); +} + +unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 32) >> 6; +} + +unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 8) >> 4; +} + +static INLINE void hadamard_col8_sse2(__m128i *in, int iter) { + __m128i a0 = in[0]; + __m128i a1 = in[1]; + __m128i a2 = in[2]; + __m128i a3 = in[3]; + __m128i a4 = in[4]; + __m128i a5 = in[5]; + __m128i a6 = in[6]; + __m128i a7 = in[7]; + + __m128i b0 = _mm_add_epi16(a0, a1); + __m128i b1 = _mm_sub_epi16(a0, a1); + __m128i b2 = _mm_add_epi16(a2, a3); + __m128i b3 = _mm_sub_epi16(a2, a3); + __m128i b4 = _mm_add_epi16(a4, a5); + __m128i b5 = _mm_sub_epi16(a4, a5); + __m128i b6 = _mm_add_epi16(a6, a7); + __m128i b7 = _mm_sub_epi16(a6, a7); + + a0 = _mm_add_epi16(b0, b2); + a1 = _mm_add_epi16(b1, b3); + a2 = _mm_sub_epi16(b0, b2); + a3 = _mm_sub_epi16(b1, b3); + a4 = _mm_add_epi16(b4, b6); + a5 = _mm_add_epi16(b5, b7); + a6 = _mm_sub_epi16(b4, b6); + a7 = _mm_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm_add_epi16(a0, a4); + b7 = _mm_add_epi16(a1, a5); + b3 = _mm_add_epi16(a2, a6); + b4 = _mm_add_epi16(a3, a7); + b2 = _mm_sub_epi16(a0, a4); + b6 = _mm_sub_epi16(a1, a5); + b1 = _mm_sub_epi16(a2, a6); + b5 = _mm_sub_epi16(a3, a7); + + a0 = _mm_unpacklo_epi16(b0, b1); + a1 = _mm_unpacklo_epi16(b2, b3); + a2 = _mm_unpackhi_epi16(b0, b1); + a3 = _mm_unpackhi_epi16(b2, b3); + a4 = _mm_unpacklo_epi16(b4, b5); + a5 = _mm_unpacklo_epi16(b6, b7); + a6 = _mm_unpackhi_epi16(b4, b5); + a7 = _mm_unpackhi_epi16(b6, b7); + + b0 = _mm_unpacklo_epi32(a0, a1); + b1 = _mm_unpacklo_epi32(a4, a5); + b2 = 
_mm_unpackhi_epi32(a0, a1); + b3 = _mm_unpackhi_epi32(a4, a5); + b4 = _mm_unpacklo_epi32(a2, a3); + b5 = _mm_unpacklo_epi32(a6, a7); + b6 = _mm_unpackhi_epi32(a2, a3); + b7 = _mm_unpackhi_epi32(a6, a7); + + in[0] = _mm_unpacklo_epi64(b0, b1); + in[1] = _mm_unpackhi_epi64(b0, b1); + in[2] = _mm_unpacklo_epi64(b2, b3); + in[3] = _mm_unpackhi_epi64(b2, b3); + in[4] = _mm_unpacklo_epi64(b4, b5); + in[5] = _mm_unpackhi_epi64(b4, b5); + in[6] = _mm_unpacklo_epi64(b6, b7); + in[7] = _mm_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm_add_epi16(a0, a4); + in[7] = _mm_add_epi16(a1, a5); + in[3] = _mm_add_epi16(a2, a6); + in[4] = _mm_add_epi16(a3, a7); + in[2] = _mm_sub_epi16(a0, a4); + in[6] = _mm_sub_epi16(a1, a5); + in[1] = _mm_sub_epi16(a2, a6); + in[5] = _mm_sub_epi16(a3, a7); + } +} + +static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + if (is_final) { + store_tran_low(src[0], coeff); + coeff += 8; + store_tran_low(src[1], coeff); + coeff += 8; + store_tran_low(src[2], coeff); + coeff += 8; + store_tran_low(src[3], coeff); + coeff += 8; + store_tran_low(src[4], coeff); + coeff += 8; + store_tran_low(src[5], coeff); + coeff += 8; + store_tran_low(src[6], coeff); + coeff += 8; + store_tran_low(src[7], coeff); + } else { + int16_t *coeff16 = (int16_t *)coeff; + _mm_store_si128((__m128i *)coeff16, src[0]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[1]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[2]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[3]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[4]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[5]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[6]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[7]); + } +} + +void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_8x8_sse2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + _mm_store_si128((__m128i *)coeff, src[0]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[1]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[2]); + coeff += 8; + _mm_store_si128((__m128i 
*)coeff, src[3]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[4]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[5]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[6]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[7]); +} + +static INLINE void hadamard_16x16_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64), + 0); + } + + for (idx = 0; idx < 64; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 1); + b1 = _mm_srai_epi16(b1, 1); + b2 = _mm_srai_epi16(b2, 1); + b3 = _mm_srai_epi16(b3, 1); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + + if (is_final) { + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 64); + store_tran_low(coeff2, coeff + 128); + store_tran_low(coeff3, coeff + 192); + coeff += 8; + } else { + _mm_store_si128((__m128i *)coeff16, coeff0); + _mm_store_si128((__m128i *)(coeff16 + 64), coeff1); + _mm_store_si128((__m128i *)(coeff16 + 128), coeff2); + _mm_store_si128((__m128i *)(coeff16 + 192), coeff3); + coeff16 += 8; + } + + t_coeff += 8; + } +} + +void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_sse2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. 
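+  // Editor's note: this SSE2 path mirrors the AVX2 32x32 version earlier in
+  // the patch, but it works on 8 coefficients per register, so the
+  // combination loop below advances by 8 rather than 16 per iteration.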
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_sse2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 2); + b1 = _mm_srai_epi16(b1, 2); + b2 = _mm_srai_epi16(b2, 2); + b3 = _mm_srai_epi16(b3, 2); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 256); + + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + store_tran_low(coeff2, coeff + 512); + store_tran_low(coeff3, coeff + 768); + + coeff += 8; + t_coeff += 8; + } +} + +int aom_satd_sse2(const tran_low_t *coeff, int length) { + int i; + const __m128i zero = _mm_setzero_si128(); + __m128i accum = zero; + + for (i = 0; i < length; i += 8) { + const __m128i src_line = load_tran_low(coeff); + const __m128i inv = _mm_sub_epi16(zero, src_line); + const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) + const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); + const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero); + const __m128i sum = _mm_add_epi32(abs_lo, abs_hi); + accum = _mm_add_epi32(accum, sum); + coeff += 8; + } + + { // cascading summation of accum + __m128i hi = _mm_srli_si128(accum, 8); + accum = _mm_add_epi32(accum, hi); + hi = _mm_srli_epi64(accum, 32); + accum = _mm_add_epi32(accum, hi); + } + + return _mm_cvtsi128_si32(accum); +} + +void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, + const int ref_stride, const int height) { + int idx = 1; + __m128i zero = _mm_setzero_si128(); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); + __m128i s0 = _mm_unpacklo_epi8(src_line, zero); + __m128i s1 = _mm_unpackhi_epi8(src_line, zero); + __m128i t0, t1; + int height_1 = height - 1; + ref += ref_stride; + do { + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + idx += 2; + } while (idx < height_1); + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + if (height == 128) { + s0 = _mm_srai_epi16(s0, 6); + s1 = _mm_srai_epi16(s1, 6); + } else if (height == 64) { + s0 = _mm_srai_epi16(s0, 5); + s1 = _mm_srai_epi16(s1, 5); + } else if (height == 32) { + s0 = _mm_srai_epi16(s0, 4); + s1 = _mm_srai_epi16(s1, 4); + } else { + assert(height == 16); + s0 = _mm_srai_epi16(s0, 3); + s1 = _mm_srai_epi16(s1, 3); + } + + _mm_storeu_si128((__m128i *)hbuf, s0); + hbuf += 8; + 
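+  // Editor's note: s0 holds the vertically accumulated sums for pixel
+  // columns 0-7 and s1 for columns 8-15; the store below writes the second
+  // half of the 16-wide row projection.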
_mm_storeu_si128((__m128i *)hbuf, s1);
+}
+
+int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+  __m128i s0 = _mm_sad_epu8(src_line, zero);
+  __m128i s1;
+  int i;
+
+  for (i = 16; i < width; i += 16) {
+    ref += 16;
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    s1 = _mm_sad_epu8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, s1);
+  }
+
+  s1 = _mm_srli_si128(s0, 8);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  return _mm_extract_epi16(s0, 0);
+}
diff --git a/libs/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h b/libs/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h
new file mode 100644
index 000000000..85896e276
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE __m256i load_tran_low(const tran_low_t *a) {
+  const __m256i a_low = _mm256_loadu_si256((const __m256i *)a);
+  const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8));
+  return _mm256_packs_epi32(a_low, a_high);
+}
+
+static INLINE void store_tran_low(__m256i a, tran_low_t *b) {
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i a_hi = _mm256_mulhi_epi16(a, one);
+  const __m256i a_lo = _mm256_mullo_epi16(a, one);
+  const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi);
+  const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi);
+  _mm256_storeu_si256((__m256i *)b, a_1);
+  _mm256_storeu_si256((__m256i *)(b + 8), a_2);
+}
diff --git a/libs/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h b/libs/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h
new file mode 100644
index 000000000..42bb2d1d3
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Load 8 16 bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+  const __m128i a_low = _mm_load_si128((const __m128i *)a);
+  return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
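+// Editor's note: multiplying by one is a sign-extension idiom here;
+// _mm_mullo_epi16(a, 1) keeps the low words and _mm_mulhi_epi16(a, 1)
+// yields their sign words (0 or -1), so the unpacklo/unpackhi pair below
+// emits eight sign-extended 32-bit tran_low_t values.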
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_hi = _mm_mulhi_epi16(a, one); + const __m128i a_lo = _mm_mullo_epi16(a, one); + const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi); + const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi); + _mm_store_si128((__m128i *)(b), a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +} diff --git a/libs/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c b/libs/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c new file mode 100644 index 000000000..e0289abe1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom/aom_integer.h" + +#include "config/aom_dsp_rtcd.h" + +// To start out, just dispatch to the function using the 2D mask and +// pass mask stride as 0. This can be improved upon if necessary. + +void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, 0, w, h, 0, 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void aom_highbd_blend_a64_hmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride, + src1_8, src1_stride, mask, 0, w, h, 0, 0, + bd); +} +#endif diff --git a/libs/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c b/libs/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c new file mode 100644 index 000000000..95383d2fd --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c @@ -0,0 +1,1374 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <smmintrin.h>  // SSE4.1
+#include <immintrin.h>  // AVX2
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend_a64_d16_mask_w16_avx2(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
+    int shift) {
+  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+  const __m256i s0_0 = yy_loadu_256(src0);
+  const __m256i s1_0 = yy_loadu_256(src1);
+  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
+  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
+  res0_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+  res0_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+  __m256i res = _mm256_packus_epi16(res0, res0);
+  res = _mm256_permute4x64_epi64(res, 0xd8);
+  _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
+}
+
+static INLINE void blend_a64_d16_mask_w32_avx2(
+    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+    const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
+    const __m256i *v_maxval, int shift) {
+  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+  const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
+  const __m256i s0_0 = yy_loadu_256(src0);
+  const __m256i s0_1 = yy_loadu_256(src0 + 16);
+  const __m256i s1_0 = yy_loadu_256(src1);
+  const __m256i s1_1 = yy_loadu_256(src1 + 16);
+  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
+  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
+  __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
+                                      _mm256_unpacklo_epi16(*m1, max_minus_m1));
+  __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
+                                      _mm256_unpackhi_epi16(*m1, max_minus_m1));
+  res0_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+  res0_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+  res1_lo =
+      _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
+  res1_hi =
+      _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
+  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+  const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
+  __m256i res = _mm256_packus_epi16(res0, res1);
+  res = _mm256_permute4x64_epi64(res, 0xd8);
+  _mm256_storeu_si256((__m256i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride, int h,
+    const __m256i *round_offset, int shift) {
+  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+  for (int i = 0; i < h; ++i) {
+    const __m128i m = xx_loadu_128(mask);
+    const __m256i m0 = _mm256_cvtepu8_epi16(m);
+
+    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+                                shift);
+    mask +=
mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m = yy_loadu_256(mask + j); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m)); + const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1)); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m256i m_i00 = yy_loadu_256(mask); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + + blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j); + const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t 
mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(m_ac); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + j); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j); + + const __m256i m_ac = + _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac)); + const __m256i m1 = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1)); + + 
blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +void aom_lowbd_blend_a64_d16_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + const __m256i y_round_offset = _mm256_set1_epi32(round_offset); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else { + switch (w) { + case 4: + 
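+ // A 4- or 8-pixel row cannot fill a 256-bit vector, so the narrow + // cases reuse the 128-bit SSE4.1 helpers.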
aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } +} + +static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0)); + const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1)); + const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8); + const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w); + const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8); + return v_res; +} + +static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = yy_loadu_256(src0); + const __m256i v_s1_b = yy_loadu_256(src1); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m256i v_p1_w = + _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b), + _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits); + const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + +static INLINE void blend_a64_mask_sx_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m256i v_ral_b = yy_loadu_256(mask); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + + const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const 
uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ral_b = yy_loadu_256(mask + 2 * c); + const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c); + const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rvsbh_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2); + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i 
v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sx_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xff); + do { + const __m256i v_rl_b = yy_loadu_256(mask); + const __m256i v_al_b = + _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1)); + + const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256()); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_r0_b = yy_loadu_256(mask + 2 * c); + const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m256i v_al_b = + _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8)); + const __m256i v_ah_b = + _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8)); + + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, 
v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ra_b = yy_loadu_256(mask + c); + const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride); + const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += 
src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +static INLINE void blend_a64_mask_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_m0_b = yy_loadu_256(mask + c); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + do { + const __m128i v_m0_b = xx_loadu_128(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + default: + blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subw, int subh) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if 
(UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h, subw, subh); + } else { + if (subw & subh) { + blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (subw) { + blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (subh) { + blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else { + blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h); + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// aom_highbd_blend_a64_d16_mask_avx2() +////////////////////////////////////////////////////////////////////////////// + +static INLINE void highbd_blend_a64_d16_mask_w4_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + // Load 4x u16 pixels from each of 4 rows from each source + const __m256i s0 = _mm256_set_epi64x(*(uint64_t *)(src0 + 3 * src0_stride), + *(uint64_t *)(src0 + 2 * src0_stride), + *(uint64_t *)(src0 + 1 * src0_stride), + *(uint64_t *)(src0 + 0 * src0_stride)); + const __m256i s1 = _mm256_set_epi64x(*(uint64_t *)(src1 + 3 * src1_stride), + *(uint64_t *)(src1 + 2 * src1_stride), + *(uint64_t *)(src1 + 1 * src1_stride), + *(uint64_t *)(src1 + 0 * src1_stride)); + // Generate the inverse mask + const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0); + + // Multiply each mask by the respective source + const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0); + const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0); + const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs); + const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes. Later, packs does the same again which cancels this out with no need + // for a permute.
The intermediate values being reordered makes no difference. + + const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1); + const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1); + const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs); + const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs); + + const __m256i sumh = _mm256_add_epi32(mul0h, mul1h); + const __m256i suml = _mm256_add_epi32(mul0l, mul1l); + + const __m256i roundh = + _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift); + const __m256i roundl = + _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift); + + const __m256i pack = _mm256_packs_epi32(roundl, roundh); + const __m256i clip = + _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high); + + // _mm256_extract_epi64 doesn't exist on 32-bit x86, so do it the old-fashioned way: + const __m128i cliph = _mm256_extracti128_si256(clip, 1); + xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8)); + xx_storel_64(dst + 2 * dst_stride, cliph); + const __m128i clipl = _mm256_castsi256_si128(clip); + xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8)); + xx_storel_64(dst + 0 * dst_stride, clipl); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + do { + // Load 4x u8 pixels from each of 4 rows of the mask, pad each to u16 + const __m128i mask08 = _mm_set_epi32(*(uint32_t *)(mask + 3 * mask_stride), + *(uint32_t *)(mask + 2 * mask_stride), + *(uint32_t *)(mask + 1 * mask_stride), + *(uint32_t *)(mask + 0 * mask_stride)); + const __m256i mask0 = _mm256_cvtepu8_epi16(mask08); + + highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, &mask0, round_offset, shift, + clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + do { + // Load 8 pixels from each of 8 rows of mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m256i m0246 = + _mm256_set_epi64x(*(uint64_t *)(mask + 6 * mask_stride), + *(uint64_t *)(mask + 4 * mask_stride), + *(uint64_t *)(mask + 2 * mask_stride), + *(uint64_t *)(mask + 0 * mask_stride)); + const __m256i m1357 = + _mm256_set_epi64x(*(uint64_t *)(mask + 7 * mask_stride), + *(uint64_t *)(mask + 5 * mask_stride), + *(uint64_t *)(mask + 3 * mask_stride), + *(uint64_t *)(mask + 1 * mask_stride)); + const __m256i addrows = _mm256_adds_epu8(m0246, m1357); + const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b); + const __m256i mask0 = + _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2); + + highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1, +
src1_stride, &mask0, round_offset, shift, + clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, + const __m256i *mask0b, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + // Load 8x u16 pixels from each of 4 rows from each source + const __m256i s0a = + yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride); + const __m256i s0b = + yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride); + const __m256i s1a = + yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride); + const __m256i s1b = + yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride); + + // Generate inverse masks + const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a); + const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b); + + // Multiply sources by respective masks + const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a); + const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a); + const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes. Later, packs does the same again which cancels this out with no need + // for a permute. The intermediate values being reordered makes no difference. + + const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a); + const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a); + const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah); + const __m256i sumal = _mm256_add_epi32(mul0al, mul1al); + + const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b); + const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b); + const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b); + const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b); + const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh); + const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl); + + // Divide down each result, with rounding + const __m256i roundah = + _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift); + const __m256i roundal = + _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift); + const __m256i roundbh = + _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift); + const __m256i roundbl = + _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift); + + // Pack each i32 down to an i16 with saturation, then clip to valid range + const __m256i packa = _mm256_packs_epi32(roundal, roundah); + const __m256i clipa = + _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high); + const __m256i packb = _mm256_packs_epi32(roundbl, roundbh); + const __m256i clipb = + _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high); + + // Store
8x u16 pixels to each of 4 rows in the destination + yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa); + yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + do { + // Load 8x u8 pixels from each of 4 rows in the mask + const __m128i mask0a8 = + _mm_set_epi64x(*(uint64_t *)mask, *(uint64_t *)(mask + mask_stride)); + const __m128i mask0b8 = + _mm_set_epi64x(*(uint64_t *)(mask + 2 * mask_stride), + *(uint64_t *)(mask + 3 * mask_stride)); + const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8); + const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8); + + highbd_blend_a64_d16_mask_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + do { + // Load 16x u8 pixels from each of 8 rows in the mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m256i m02 = + yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride); + const __m256i m13 = + yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride); + const __m256i m0123 = + _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b); + const __m256i mask_0a = + _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2); + const __m256i m46 = + yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride); + const __m256i m57 = + yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride); + const __m256i m4567 = + _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b); + const __m256i mask_0b = + _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2); + + highbd_blend_a64_d16_mask_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, + &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, + const __m256i *mask0b, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + // Load 16x pixels from each of 2 rows from each source + const __m256i s0a = yy_loadu_256(src0); + const __m256i s0b = yy_loadu_256(src0 + src0_stride); + const __m256i s1a = yy_loadu_256(src1); + const __m256i s1b = yy_loadu_256(src1 + src1_stride); + + // Calculate inverse masks + const __m256i mask1a = 
_mm256_sub_epi16(*mask_max, *mask0a); + const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b); + + // Multiply each source by appropriate mask + const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a); + const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a); + const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes. Later, packs does the same again which cancels this out with no need + // for a permute. The intermediate values being reordered makes no difference. + + const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a); + const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a); + const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah); + const __m256i mulal = _mm256_add_epi32(mul0al, mul1al); + + const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b); + const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b); + const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b); + const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b); + const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh); + const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl); + + const __m256i resah = + _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift); + const __m256i resal = + _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift); + const __m256i resbh = + _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift); + const __m256i resbl = + _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift); + + // Signed saturating pack from i32 to i16: + const __m256i packa = _mm256_packs_epi32(resal, resah); + const __m256i packb = _mm256_packs_epi32(resbl, resbh); + + // Clip the values to the valid range + const __m256i clipa = + _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high); + const __m256i clipb = + _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high); + + // Store 16 pixels + yy_storeu_256(dst, clipa); + yy_storeu_256(dst + dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, int w, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + for (int i = 0; i < h; i += 2) { + for (int j = 0; j < w; j += 16) { + // Load 16x u8 alpha-mask values from each of two rows and pad to u16 + const __m128i masks_a8 = xx_loadu_128(mask + j); + const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j); + const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8); + const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8); + + highbd_blend_a64_d16_mask_w16_avx2( + dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride, + &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); + } + dst += dst_stride * 2; + src0 += src0_stride * 2; +
src1 += src1_stride * 2; + mask += mask_stride * 2; + } +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, int w, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; i += 2) { + for (int j = 0; j < w; j += 16) { + // Load 32x u8 alpha-mask values from each of four rows + // (saturating) add pairs of rows, then use madd to add adjacent values + // Finally, divide down each result with rounding + const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j); + const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j); + const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j); + const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j); + + const __m256i m01_8 = _mm256_adds_epu8(m0, m1); + const __m256i m23_8 = _mm256_adds_epu8(m2, m3); + + const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b); + const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b); + + const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2); + const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2); + + highbd_blend_a64_d16_mask_w16_avx2( + dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride, + &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); + } + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 4; + } +} + +void aom_highbd_blend_a64_d16_mask_avx2( + uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int32_t round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + const __m256i v_round_offset = _mm256_set1_epi32(round_offset); + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + + const __m256i clip_low = _mm256_set1_epi16(0); + const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1); + const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >= 16 + highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + + 
} else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >= 16 + highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + } else { + // Sub-sampling in only one axis doesn't seem to happen very much, so fall + // back to the vanilla C implementation instead of having all the optimised + // code for these. + aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, subw, + subh, conv_params, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c b/libs/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c new file mode 100644 index 000000000..4a368ef94 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c @@ -0,0 +1,1560 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <smmintrin.h>  // SSE4.1 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/blend_sse4.h" +#include "aom_dsp/x86/blend_mask_sse4.h" + +#include "config/aom_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int w, int h) { + (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int w, int h) { + (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_m0_b = xx_loadu_128(mask + c); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sx_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = 
_mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_r0_b = xx_loadu_128(mask + 2 * c); + const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); 
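+ // One output row consumes two mask rows (2:1 vertical subsampling).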
+ + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ra_b = xx_loadu_128(mask + c); + const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sx_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + (void)w; + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + 
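+ // Combined 2x2 mask downsample: add the two mask rows, gather even and odd + // columns with the shuffle, sum those, then divide by 4 with rounding + // ((x + 2) >> 2) before blending.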
const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + (void)w; + + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ral_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c); + const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16); + const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b); + const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b); + const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b); + const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b); + const __m128i v_rvsbl_w = + _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b); + const __m128i v_rvsbh_w = + _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b); + const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2); + const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subw, int subh) { + typedef void (*blend_fn)( + uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h); + + // Dimensions are: width_index X subx X suby + static const blend_fn 
blend[3][2][2] = { + { // w % 16 == 0 + { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 }, + { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 }, + { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } }, + { // w == 8 + { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 }, + { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } } + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h, subw, subh); + } else { + blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0, + src0_stride, src1, src1_stride, + mask, mask_stride, w, h); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, blend_4_b10); +} + +static void blend_a64_mask_b12_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_m0_b = xx_loadl_64(mask + c); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, 
const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_r_b = xx_loadu_128(mask + 2 * c); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t 
src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = xx_loadl_64(mask + c); + const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + 
src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + (void)w; + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, + blend_unit_fn blend) { + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 
+ c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b10); +} + +static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// +void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, + uint32_t src0_stride, + const uint8_t *src1_8, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int w, int h, + int subw, int subh, int bd) { + typedef void (*blend_fn)( + uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h); + + // Dimensions are: bd_index X width_index X subw X subh + static const blend_fn blend[2][2][2][2] = { + { // bd == 8 or 10 + { // w % 8 == 0 + { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 }, + { blend_a64_mask_b10_sx_w8n_sse4_1, + blend_a64_mask_b10_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 }, + { blend_a64_mask_b10_sx_w4_sse4_1, + blend_a64_mask_b10_sx_sy_w4_sse4_1 } } }, + { // bd == 12 + { // w % 8 == 0 + { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 }, + { blend_a64_mask_b12_sx_w8n_sse4_1, + blend_a64_mask_b12_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 }, + { blend_a64_mask_b12_sx_w4_sse4_1, + blend_a64_mask_b12_sx_sy_w4_sse4_1 } } } + }; + + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, mask_stride, w, h, subw, + subh, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0]( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, w, h); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void blend_a64_d16_mask_w16_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset, + const __m128i *v_maxval, int shift) { + const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0); + const 
__m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1); + const __m128i s0_0 = xx_loadu_128(src0); + const __m128i s0_1 = xx_loadu_128(src0 + 8); + const __m128i s1_0 = xx_loadu_128(src1); + const __m128i s1_1 = xx_loadu_128(src1 + 8); + __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0), + _mm_unpacklo_epi16(*m0, max_minus_m0)); + __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0), + _mm_unpackhi_epi16(*m0, max_minus_m0)); + __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1), + _mm_unpacklo_epi16(*m1, max_minus_m1)); + __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1), + _mm_unpackhi_epi16(*m1, max_minus_m1)); + res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift); + res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift); + res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift); + const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi); + const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi); + const __m128i res = _mm_packus_epi16(res0, res1); + + _mm_storeu_si128((__m128i *)(dst), res); +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m = xx_loadu_128(mask + j); + const __m128i m0 = _mm_cvtepu8_epi16(m); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); + const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); + + const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); + const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); + const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); + const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); + const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); + const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t 
src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b); + const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b); + const __m128i m0 = _mm_avg_epu16(m0_ac, zeros); + const __m128i m1 = _mm_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m128i m0 = _mm_cvtepu8_epi16(m_ac); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +void aom_lowbd_blend_a64_d16_mask_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 
+ mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + } else { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; + } + } +} + +////////////////////////////////////////////////////////////////////////////// +// aom_highbd_blend_a64_d16_mask_sse4_1() +////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, + const __m128i *mask0b, const __m128i *round_offset, int shift, + const __m128i *clip_low, const __m128i *clip_high, + const __m128i *mask_max) { + // Load 4 pixels from each of 4 rows from each source + const __m128i s0a = + _mm_set_epi64x(*(uint64_t *)src0, *(uint64_t *)(src0 + src0_stride)); + const __m128i s0b = _mm_set_epi64x(*(uint64_t *)(src0 + 2 * src0_stride), + *(uint64_t *)(src0 + 3 * src0_stride)); + const __m128i s1a = + _mm_set_epi64x(*(uint64_t *)(src1), *(uint64_t *)(src1 + src1_stride)); + const __m128i s1b = _mm_set_epi64x(*(uint64_t *)(src1 + 2 * src1_stride), + *(uint64_t *)(src1 + 3 * src1_stride)); + + // Generate the inverse masks + const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a); + const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b); + + // Multiply each mask by the respective source + const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); + const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); + const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); + const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); + const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); + const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); + const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); + const __m128i mul0bh = 
_mm_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); + const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); + const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); + const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); + const __m128i sumal = _mm_add_epi32(mul0al, mul1al); + const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); + const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); + + const __m128i roundah = + _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); + const __m128i roundbh = + _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); + const __m128i roundal = + _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); + const __m128i roundbl = + _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); + + const __m128i packa = _mm_packs_epi32(roundal, roundah); + const __m128i packb = _mm_packs_epi32(roundbl, roundbh); + + const __m128i clipa = + _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); + const __m128i clipb = + _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); + + xx_storel_64(dst, _mm_srli_si128(clipa, 8)); + xx_storel_64(dst + dst_stride, clipa); + xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8)); + xx_storel_64(dst + 3 * dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + do { + const __m128i mask0a8 = _mm_set_epi32(0, 0, *(uint32_t *)mask, + *(uint32_t *)(mask + mask_stride)); + const __m128i mask0b8 = + _mm_set_epi32(0, 0, *(uint32_t *)(mask + 2 * mask_stride), + *(uint32_t *)(mask + 3 * mask_stride)); + const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8); + const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8); + + highbd_blend_a64_d16_mask_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + do { + // Load 8 pixels from each of 8 rows of mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m128i m02 = _mm_set_epi64x(*(uint64_t *)(mask), + *(uint64_t *)(mask + 2 * mask_stride)); + const __m128i m13 = _mm_set_epi64x(*(uint64_t *)(mask + mask_stride), + *(uint64_t *)(mask + 3 * mask_stride)); + const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b); + const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2); + const __m128i m46 = _mm_set_epi64x(*(uint64_t *)(mask + 4 * 
mask_stride), + *(uint64_t *)(mask + 6 * mask_stride)); + const __m128i m57 = _mm_set_epi64x(*(uint64_t *)(mask + 5 * mask_stride), + *(uint64_t *)(mask + 7 * mask_stride)); + const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b); + const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2); + + highbd_blend_a64_d16_mask_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, + &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w8_sse4_1( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, + const __m128i *mask0b, const __m128i *round_offset, int shift, + const __m128i *clip_low, const __m128i *clip_high, + const __m128i *max_mask) { + // Load 8x pixels from each of 2 rows from each source + const __m128i s0a = xx_loadu_128(src0); + const __m128i s0b = xx_loadu_128(src0 + src0_stride); + const __m128i s1a = xx_loadu_128(src1); + const __m128i s1b = xx_loadu_128(src1 + src1_stride); + + // Generate inverse masks + const __m128i mask1a = _mm_sub_epi16(*max_mask, *mask0a); + const __m128i mask1b = _mm_sub_epi16(*max_mask, *mask0b); + + // Multiply sources by respective masks + const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); + const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); + const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); + + const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); + const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); + const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); + const __m128i sumal = _mm_add_epi32(mul0al, mul1al); + + const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); + const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); + const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); + const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); + const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); + const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); + + const __m128i roundah = + _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); + const __m128i roundal = + _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); + const __m128i roundbh = + _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); + const __m128i roundbl = + _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); + + const __m128i packa = _mm_packs_epi32(roundal, roundah); + const __m128i clipa = + _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); + const __m128i packb = _mm_packs_epi32(roundbl, roundbh); + const __m128i clipb = + _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); + + xx_storeu_128(dst, clipa); + xx_storeu_128(dst + dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *max_mask) { + do { + const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask)); + const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride)); + highbd_blend_a64_d16_mask_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, max_mask); + + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 2; + } while (h -= 2); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *max_mask) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + do { + const __m128i mask_thisrowa = xx_loadu_128(mask); + const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride); + const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride); + const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride); + const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa); + const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb); + const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b); + const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b); + const __m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2); + const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2); + + highbd_blend_a64_d16_mask_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa, + &mask_sb, round_offset, shift, clip_low, clip_high, max_mask); + + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 4; + } while (h -= 2); +} + +static INLINE void highbd_blend_a64_d16_mask_w16_sse4_1( + uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *round_offset, int shift, const __m128i *mask0l, + const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high, + const __m128i *mask_max) { + // Load 16x u16 pixels for this row from each src + const __m128i s0l = xx_loadu_128(src0); + const __m128i s0h = xx_loadu_128(src0 + 8); + const __m128i s1l = xx_loadu_128(src1); + const __m128i s1h = xx_loadu_128(src1 + 8); + + // Calculate inverse masks + const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h); + const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l); + + const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h); + const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h); + const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs); + const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs); + + const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h); + const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h); + const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs); + const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs); + + const __m128i mulhh = _mm_add_epi32(mul0h, mul1h); + const __m128i mulhl = _mm_add_epi32(mul0l, mul1l); + + const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l); + const __m128i 
mul2_lows = _mm_mullo_epi16(*mask0l, s0l); + const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs); + const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs); + + const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l); + const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l); + const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs); + const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs); + + const __m128i mullh = _mm_add_epi32(mul2h, mul3h); + const __m128i mulll = _mm_add_epi32(mul2l, mul3l); + + const __m128i reshh = + _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift); + const __m128i reshl = + _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift); + const __m128i reslh = + _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift); + const __m128i resll = + _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift); + + // Signed saturating pack from i32 to i16: + const __m128i packh = _mm_packs_epi32(reshl, reshh); + const __m128i packl = _mm_packs_epi32(resll, reslh); + + // Clip the values to the valid range + const __m128i cliph = + _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high); + const __m128i clipl = + _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high); + + // Store 16 pixels + xx_storeu_128(dst, clipl); + xx_storeu_128(dst + 8, cliph); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j += 16) { + // Load 16x u8 alpha-mask values and pad to u16 + const __m128i masks_u8 = xx_loadu_128(mask + j); + const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8); + const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8)); + + highbd_blend_a64_d16_mask_w16_sse4_1( + dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h, + clip_low, clip_high, mask_max); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); + const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); + + const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); + const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); + const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); + const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); + const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); + const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); + + highbd_blend_a64_d16_mask_w16_sse4_1( + dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h, 
+ clip_low, clip_high, mask_max); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride * 2; + } +} + +void aom_highbd_blend_a64_d16_mask_sse4_1( + uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int32_t round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + + const __m128i clip_low = _mm_set1_epi16(0); + const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1); + const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >=16 + highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >=16 + highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + } else { + // Sub-sampling in only one axis doesn't seem to happen very much, so fall + // back to the vanilla C implementation instead of having all the optimised + // code for these. + aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, subw, + subh, conv_params, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c b/libs/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c new file mode 100644 index 000000000..75fb1c5a9 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <smmintrin.h> // SSE4.1 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/blend_sse4.h" + +#include "config/aom_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, + uint32_t src0_stride, + const uint8_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 16) { + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h); + + // Dimension: width_index + static const 
blend_fn blend[9] = { + blend_a64_vmask_w16n_sse4_1, // w % 16 == 0 + aom_blend_a64_vmask_c, // w == 1 + aom_blend_a64_vmask_c, // w == 2 + NULL, // INVALID + blend_a64_vmask_w4_sse4_1, // w == 4 + NULL, // INVALID + NULL, // INVALID + NULL, // INVALID + blend_a64_vmask_w8_sse4_1, // w == 8 + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, + h); +} + +#if CONFIG_AV1_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_vmask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, blend_4_b10); +} + +static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, blend_4_b12); +} + +static INLINE void blend_a64_vmask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 8) { + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, w, h, blend_8_b10); +} + +static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, w, h, blend_8_b12); +} + 
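+// Note: the b10 and b12 kernels above differ only in the blend_unit_fn they
+// pass down (blend_4_b10/blend_8_b10 vs blend_4_b12/blend_8_b12); 8-bit
+// input shares the b10 kernels, as the dispatch below indexes on bd == 12
+// only. A minimal usage sketch, assuming hypothetical 10-bit buffers held
+// as uint16_t behind CONVERT_TO_BYTEPTR():
+//   aom_highbd_blend_a64_vmask_sse4_1(CONVERT_TO_BYTEPTR(dst16), dst_stride,
+//                                     CONVERT_TO_BYTEPTR(p0), stride0,
+//                                     CONVERT_TO_BYTEPTR(p1), stride1,
+//                                     vmask, w, h, 10);
+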
+////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_highbd_blend_a64_vmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h); + + // Dimensions are: bd_index X width_index + static const blend_fn blend[2][2] = { + { + // bd == 8 or 10 + blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b10_w4_sse4_1, // w == 4 + }, + { + // bd == 12 + blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b12_w4_sse4_1, // w == 4 + } + }; + + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, w, h, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, w, h); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/x86/blend_mask_sse4.h b/libs/libaom/src/aom_dsp/x86/blend_mask_sse4.h new file mode 100644 index 000000000..c071fdcfc --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blend_mask_sse4.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#include <smmintrin.h> // SSE4.1 + +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" + +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend_a64_d16_mask_w4_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadl_64(src0); + const __m128i s1 = xx_loadl_64(src1); + const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); + const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); + const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); + const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset); + const __m128i res_d = _mm_srai_epi32(res_c, shift); + const __m128i res_e = _mm_packs_epi32(res_d, res_d); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + xx_storel_32(dst, res); +} + +static INLINE void blend_a64_d16_mask_w8_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadu_128(src0); + const __m128i s1 = xx_loadu_128(src1); + __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1), + _mm_unpacklo_epi16(*m, max_minus_m)); + __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1), + _mm_unpackhi_epi16(*m, max_minus_m)); + res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift); + res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift); + const __m128i res_e = _mm_packs_epi32(res_lo, res_hi); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + _mm_storel_epi64((__m128i *)(dst), res); +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_32(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_64(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const 
uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_i1 = xx_loadu_128(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ diff --git a/libs/libaom/src/aom_dsp/x86/blend_sse4.h b/libs/libaom/src/aom_dsp/x86/blend_sse4.h new file mode 100644 index 000000000..8d9b32510 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blend_sse4.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
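
// Scalar model of the d16 mask blend kernels in this header (a sketch, not
// part of the patch). AOM_BLEND_A64_MAX_ALPHA is 64, so the weights m and
// 64 - m always sum to 64; round_offset and shift fold the compound (d16)
// rounding into one final shift, and the packs/packus pair clamps to 8 bits.
#include <stdint.h>

static uint8_t d16_mask_blend_pixel(int32_t s0, int32_t s1, int m,
                                    int32_t round_offset, int shift) {
  int32_t v = (s0 * m + s1 * (64 - m) - round_offset) >> shift;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Mask downsampling used by the subw/subh variants, with the same rounding
// the SIMD code gets from maddubs/adds plus the avg-with-zero trick:
static int mask_from_2x2(int m00, int m01, int m10, int m11) {
  return (m00 + m01 + m10 + m11 + 2) >> 2;  // subw == 1 && subh == 1
}
static int mask_from_2x1(int ma, int mb) {
  return (ma + mb + 1) >> 1;  // exactly one of subw/subh is 1
}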
+ */ + +#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_SSE4_H_ + +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +static const uint8_t g_blend_a64_mask_shuffle[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, +}; + +////////////////////////////////////////////////////////////////////////////// +// Common kernels +////////////////////////////////////////////////////////////////////////////// + +static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_w, const __m128i *v_m1_w) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_w, const __m128i *v_m1_w) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadu_128(src0); + const __m128i v_s1_b = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), + _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + +typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w); + +static INLINE 
__m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + // Interleave + const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); + + // Scale + const __m128i v_ssum_d = + _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} + +static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + // Interleave + const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); + const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); + const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); + + // Scale + const __m128i v_ssuml_d = + _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssumh_d = + _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} + +#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_ diff --git a/libs/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c b/libs/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c new file mode 100644 index 000000000..f7c0eb037 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
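
// Scalar model of blend_4/blend_8 and the b10/b12 variants in this header
// (a sketch, not part of the patch). AOM_BLEND_A64_ROUND_BITS is 6 and the
// mask weights satisfy m0 + m1 = AOM_BLEND_A64_MAX_ALPHA = 64, so every
// kernel computes a 64-weighted average with round-to-nearest:
#include <stdint.h>

static uint16_t blend_a64_pixel(uint16_t s0, uint16_t s1, int m0, int m1) {
  return (uint16_t)((s0 * m0 + s1 * m1 + 32) >> 6);
}

// The b12 kernels reach the same result in two steps: a shift by
// AOM_BLEND_A64_ROUND_BITS - 1 on the 32-bit madd sums followed by an
// averaging round by 1, so the intermediate fits in 16 bits before packing.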
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum,
+                                      int *x_sum, int64_t *x2_sum) {
+  __m256i sum_buffer, sse_buffer;
+  __m128i out_buffer;
+
+  // Accumulate the various elements of the register into its first element.
+  sum_buffer = _mm256_permute2f128_si256(regx_sum, regx_sum, 1);
+  regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+  regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 8));
+  regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 4));
+
+  sse_buffer = _mm256_permute2f128_si256(regx2_sum, regx2_sum, 1);
+  regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+  regx2_sum = _mm256_add_epi64(regx2_sum, _mm256_srli_si256(regx2_sum, 8));
+
+  out_buffer = _mm256_castsi256_si128(regx_sum);
+  *x_sum += _mm_cvtsi128_si32(out_buffer);
+  out_buffer = _mm256_castsi256_si128(regx2_sum);
+#if ARCH_X86_64
+  *x2_sum += _mm_cvtsi128_si64(out_buffer);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, out_buffer);
+    *x2_sum += tmp;
+  }
+#endif
+}
+
+static INLINE void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum) {
+  __m128i row1, row2, row3;
+  __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+      temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+  const int16_t *data_tmp = data;
+  __m256i one = _mm256_set1_epi16(1);
+  regx_sum = _mm256_setzero_si256();
+  regx2_sum = regx_sum;
+  sum_buffer = _mm256_setzero_si256();
+  sse_buffer = sum_buffer;
+
+  for (int j = 0; j < (bh >> 2); ++j) {
+    // Load 4 rows at a time.
+    row1 = _mm_loadl_epi64((__m128i const *)(data_tmp));
+    row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + stride));
+    row1 = _mm_unpacklo_epi64(row1, row2);
+    row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + 2 * stride));
+    row3 = _mm_loadl_epi64((__m128i const *)(data_tmp + 3 * stride));
+    row2 = _mm_unpacklo_epi64(row2, row3);
+    load_pixels =
+        _mm256_insertf128_si256(_mm256_castsi128_si256(row1), row2, 1);
+
+    row_sum_buffer = _mm256_madd_epi16(load_pixels, one);
+    row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);
+    sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer);
+    sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer);
+    data_tmp += 4 * stride;
+  }
+
+  // To prevent 32-bit variable overflow, unpack the elements to 64-bit.
+  temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256());
+  temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256());
+  sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2);
+  regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+  regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+
+  accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum);
+}
+
+static INLINE void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum) {
+  __m128i load_128bit, load_next_128bit;
+  __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+      temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+  const int16_t *data_tmp = data;
+  __m256i one = _mm256_set1_epi16(1);
+  regx_sum = _mm256_setzero_si256();
+  regx2_sum = regx_sum;
+  sum_buffer = _mm256_setzero_si256();
+  sse_buffer = sum_buffer;
+
+  for (int j = 0; j < (bh >> 1); ++j) {
+    // Load 2 rows at a time.
+ load_128bit = _mm_loadu_si128((__m128i const *)(data_tmp)); + load_next_128bit = _mm_loadu_si128((__m128i const *)(data_tmp + stride)); + load_pixels = _mm256_insertf128_si256(_mm256_castsi128_si256(load_128bit), + load_next_128bit, 1); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += 2 * stride; + } + + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +static INLINE void sse_sum_wd16_avx2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum, + int loop_count) { + __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, + temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; + const int16_t *data_tmp = data; + __m256i one = _mm256_set1_epi16(1); + regx_sum = _mm256_setzero_si256(); + regx2_sum = regx_sum; + sum_buffer = _mm256_setzero_si256(); + sse_buffer = sum_buffer; + + for (int i = 0; i < loop_count; ++i) { + data_tmp = data + 16 * i; + for (int j = 0; j < bh; ++j) { + load_pixels = _mm256_lddqu_si256((__m256i const *)(data_tmp)); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += stride; + } + } + + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +void aom_get_blk_sse_sum_avx2(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + *x_sum = 0; + *x2_sum = 0; + + if ((bh & 3) == 0) { + switch (bw) { + // For smaller block widths, compute multiple rows simultaneously. + case 4: sse_sum_wd4_avx2(data, stride, bh, x_sum, x2_sum); break; + case 8: sse_sum_wd8_avx2(data, stride, bh, x_sum, x2_sum); break; + case 16: + case 32: + sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4); + break; + case 64: + // 32-bit variables will overflow for 64 rows at a single time, so + // compute 32 rows at a time. + if (bh <= 32) { + sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4); + } else { + sse_sum_wd16_avx2(data, stride, 32, x_sum, x2_sum, bw >> 4); + sse_sum_wd16_avx2(data + 32 * stride, stride, 32, x_sum, x2_sum, + bw >> 4); + } + break; + + default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } + } else { + aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } +} diff --git a/libs/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c b/libs/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c new file mode 100644 index 000000000..ef0a024ee --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum) {
+  const int16_t *data_tmp = data;
+  __m128i temp_buffer1, temp_buffer2;
+  __m128i load_pixels_low, load_pixels_hi, sum_buffer, sse_buffer;
+  __m128i one = _mm_set1_epi16(1);
+  __m128i regx_sum = _mm_setzero_si128();
+  __m128i regx2_sum = regx_sum;
+
+  for (int j = 0; j < (bh >> 1); ++j) {
+    // Load 2 rows (8 pixels) at a time.
+    load_pixels_low = _mm_loadl_epi64((__m128i const *)(data_tmp));
+    load_pixels_hi = _mm_loadl_epi64((__m128i const *)(data_tmp + stride));
+    load_pixels_low = _mm_unpacklo_epi64(load_pixels_low, load_pixels_hi);
+    sum_buffer = _mm_madd_epi16(load_pixels_low, one);
+    sse_buffer = _mm_madd_epi16(load_pixels_low, load_pixels_low);
+    regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
+    regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
+    data_tmp += 2 * stride;
+  }
+
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
+  *x_sum = _mm_cvtsi128_si32(regx_sum);
+  temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
+  temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
+  regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
+  regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
+#if ARCH_X86_64
+  *x2_sum += _mm_cvtsi128_si64(regx2_sum);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
+    *x2_sum += tmp;
+  }
+#endif
+}
+
+static INLINE void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh,
+                                    int *x_sum, int64_t *x2_sum,
+                                    int loop_cycles) {
+  const int16_t *data_tmp;
+  __m128i temp_buffer1, temp_buffer2;
+  __m128i one = _mm_set1_epi16(1);
+  __m128i regx_sum = _mm_setzero_si128();
+  __m128i regx2_sum = regx_sum;
+  __m128i load_pixels, sum_buffer, sse_buffer;
+
+  for (int i = 0; i < loop_cycles; ++i) {
+    data_tmp = data + (8 * i);
+    for (int j = 0; j < bh; ++j) {
+      // Load 1 row (8 pixels) at a time.
+      load_pixels = _mm_loadu_si128((__m128i const *)(data_tmp));
+      sum_buffer = _mm_madd_epi16(load_pixels, one);
+      sse_buffer = _mm_madd_epi16(load_pixels, load_pixels);
+      regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
+      regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
+      data_tmp += stride;
+    }
+  }
+
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
+  regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
+  *x_sum += _mm_cvtsi128_si32(regx_sum);
+  temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
+  temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
+  regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
+  regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
+#if ARCH_X86_64
+  *x2_sum += _mm_cvtsi128_si64(regx2_sum);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
+    *x2_sum += tmp;
+  }
+#endif
+}
+
+// This function adds SSE2 support for the function 'aom_get_blk_sse_sum_c'.
+void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh,
+                              int *x_sum, int64_t *x2_sum) {
+  *x_sum = 0;
+  *x2_sum = 0;
+
+  if ((bh & 3) == 0) {
+    switch (bw) {
+      case 4: sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum); break;
+      case 8:
+      case 16:
+        sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+        break;
+      // For widths 32 and 64, the registers may overflow. So compute
+      // partial widths at a time.
+      case 32:
+        if (bh <= 32) {
+          sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+          break;
+        } else {
+          sse_sum_wd8_sse2(data, stride, 32, x_sum, x2_sum, bw >> 3);
+          sse_sum_wd8_sse2(data + 32 * stride, stride, 32, x_sum, x2_sum,
+                           bw >> 3);
+          break;
+        }
+
+      case 64:
+        if (bh <= 16) {
+          sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+          break;
+        } else {
+          for (int i = 0; i < bh; i += 16)
+            sse_sum_wd8_sse2(data + i * stride, stride, 16, x_sum, x2_sum,
+                             bw >> 3);
+          break;
+        }
+
+      default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+    }
+  } else {
+    aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+  }
+}
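
// Plain-C reference for what the SSE2 path above (and the AVX2 path in
// blk_sse_sum_avx2.c) computes; this mirrors the aom_get_blk_sse_sum_c
// fallback and is included here only as a readable sketch.
#include <stdint.h>

static void blk_sse_sum_ref(const int16_t *data, int stride, int bw, int bh,
                            int *x_sum, int64_t *x2_sum) {
  *x_sum = 0;
  *x2_sum = 0;
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      const int v = data[r * stride + c];
      *x_sum += v;
      *x2_sum += (int64_t)v * v;
    }
  }
}

// The width/height splitting in the SIMD versions exists because squared
// 16-bit samples approach 2^30, so only a limited number of per-pixel
// products can accumulate in a 32-bit lane before widening to 64 bits.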
diff --git a/libs/libaom/src/aom_dsp/x86/common_avx2.h b/libs/libaom/src/aom_dsp/x86/common_avx2.h
new file mode 100644
index 000000000..96fe4ebb6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/common_avx2.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_COMMON_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+// Note: in and out could have the same value
+static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
+  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+  // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+  // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+  // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+  // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+  // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+  // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+  // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+  // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+  // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+  // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+  // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+  // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+  // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+  // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+  // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+  // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+  // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+  // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+  // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+  // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+  // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+  // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+  // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+  // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+  // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+  // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+  // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+  // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
+  // c0 d0 e0 f0 c1 d1 e1
f1 c8 d8 e8 f8 c9 d9 e9 f9 + // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb + // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd + // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff + + tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); + tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); + tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); + tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); + tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); + tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); + tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); + tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); + + tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); + tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); + tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); + tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); + tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); + tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); + tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); + tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); + + // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 + // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 + // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a + // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b + // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c + // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d + // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e + // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f + + // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 + // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 + // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa + // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb + // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc + // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd + // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe + // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff + + out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 + out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 + out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); + out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); + out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); + out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); + out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); + out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); + + out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); + out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); + out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); + out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); + out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); + out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); + out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); + out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); +} +#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/convolve.h b/libs/libaom/src/aom_dsp/x86/convolve.h new file mode 100644 index 000000000..b4ff6975c --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/convolve.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
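+ */

// Net effect of mm256_transpose_16x16 above, as a scalar sketch (not part of
// the patch): treating in[] as a 16x16 matrix of 16-bit values, the three
// unpack stages (16-, 32- and 64-bit) plus the final cross-lane permutes
// produce the transpose. Unlike the SIMD version, this sketch does not
// support in == out.
#include <stdint.h>

static void transpose_16x16_ref(const int16_t in[16][16],
                                int16_t out[16][16]) {
  for (int r = 0; r < 16; ++r)
    for (int c = 0; c < 16; ++c) out[c][r] = in[r][c];
}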
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                uint8_t *output_ptr, ptrdiff_t out_pitch,
+                                uint32_t output_height, const int16_t *filter);
+
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void aom_convolve8_##name##_##opt( \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+      const int16_t *filter_y, int y_step_q4, int w, int h) { \
+    (void)filter_x; \
+    (void)x_step_q4; \
+    (void)filter_y; \
+    (void)y_step_q4; \
+    assert((-128 <= filter[3]) && (filter[3] <= 127)); \
+    assert(step_q4 == 16); \
+    if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+        (filter[2] | filter[5])) { \
+      while (w >= 16) { \
+        aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
+                                                 dst_stride, h, filter); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
+                                                dst_stride, h, filter); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
+                                                dst_stride, h, filter); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } else if (filter[0] | filter[1] | filter[2]) { \
+      while (w >= 16) { \
+        aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+                                                 dst_stride, h, filter); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+                                                dst_stride, h, filter); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+                                                dst_stride, h, filter); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } else { \
+      while (w >= 16) { \
+        aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
+                                                 dst_stride, h, filter); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
+                                                dst_stride, h, filter); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
+                                                dst_stride, h, filter); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } \
+    if (w) { \
+      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
+                               x_step_q4, filter_y, y_step_q4, w, h); \
+    } \
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
+                                       const ptrdiff_t src_pitch,
+                                       uint16_t *output_ptr,
+                                       ptrdiff_t out_pitch,
+                                       unsigned int output_height,
+                                       const int16_t *filter, int bd);
+
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void aom_highbd_convolve8_##name##_##opt( \
+      const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \
+      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+    if (step_q4 == 16 && filter[3] != 128) { \
+      if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+          (filter[2] | filter[5])) { \
+        while (w >= 16) { \
+          aom_highbd_filter_block1d16_##dir##4_##avg##opt( \
+              src_start, src_stride, dst, dst_stride, h, filter, bd);
\ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + aom_highbd_convolve8_##name##_c( \ + CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \ + dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } \ + } +#endif // CONFIG_AV1_HIGHBITDEPTH + +#endif // AOM_AOM_DSP_X86_CONVOLVE_H_ diff --git a/libs/libaom/src/aom_dsp/x86/convolve_avx2.h b/libs/libaom/src/aom_dsp/x86/convolve_avx2.h new file mode 100644 index 000000000..d516de5f2 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/convolve_avx2.h @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
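
// The FUN_CONV_1D/HIGH_FUN_CONV_1D wrappers above pick a kernel by testing
// which filter taps are zero. The same classification written out in plain C
// (illustrative helper, not part of the patch):
#include <stdint.h>

static int effective_taps(const int16_t *filter) {
  if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&
      (filter[2] | filter[5]))
    return 4;  // outer taps are zero: use the 4-tap kernels
  if (filter[0] | filter[1] | filter[2]) return 8;  // full 8-tap kernels
  return 2;  // only filter[3]/filter[4] remain: the bilinear 2-tap kernels
}

// Note that the 2-tap branches start from src rather than src_start, since
// no border pixels beyond the central pair are needed.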
+ */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ + +// filters for 16 +DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, + 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = { + 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, + 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255 +}; + +DECLARE_ALIGNED(32, static const uint8_t, + filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, + 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, + 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + +DECLARE_ALIGNED(32, static const uint8_t, + filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \ + for (i = 0; i < (im_h - 2); i += 2) { \ + __m256i data = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + data = _mm256_inserti128_si256( \ + data, \ + _mm_loadu_si128( \ + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ + 1); \ + \ + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } \ + \ + __m256i data_1 = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + \ + __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt); \ + \ + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + +#define CONVOLVE_SR_VERTICAL_FILTER_8TAP \ + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + \ + __m256i s[8]; \ + s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ + s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ + s[2] = 
_mm256_unpacklo_epi16(src_4, src_5); \ + \ + s[4] = _mm256_unpackhi_epi16(src_0, src_1); \ + s[5] = _mm256_unpackhi_epi16(src_2, src_3); \ + s[6] = _mm256_unpackhi_epi16(src_4, src_5); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ + \ + s[3] = _mm256_unpacklo_epi16(s6, s7); \ + s[7] = _mm256_unpackhi_epi16(s6, s7); \ + \ + __m256i res_a = convolve(s, coeffs_v); \ + __m256i res_b = convolve(s + 4, coeffs_v); \ + \ + res_a = \ + _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ + res_b = \ + _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ + \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + \ + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ + \ + const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ + \ + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ + if (w - j > 4) { \ + _mm_storel_epi64(p_0, res_0); \ + _mm_storel_epi64(p_1, res_1); \ + } else if (w == 4) { \ + xx_storel_32(p_0, res_0); \ + xx_storel_32(p_1, res_1); \ + } else { \ + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + \ + s[4] = s[5]; \ + s[5] = s[6]; \ + s[6] = s[7]; \ + } + +#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \ + for (i = 0; i < im_h; i += 2) { \ + __m256i data = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \ + if (i + 1 < im_h) \ + data = _mm256_inserti128_si256( \ + data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \ + src_h += (src_stride << 1); \ + __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \ + \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } + +#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \ + __m256i s[8]; \ + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + \ + s[0] = _mm256_unpacklo_epi16(s0, s1); \ + s[1] = _mm256_unpacklo_epi16(s2, s3); \ + s[2] = _mm256_unpacklo_epi16(s4, s5); \ + \ + s[4] = _mm256_unpackhi_epi16(s0, s1); \ + s[5] = _mm256_unpackhi_epi16(s2, s3); \ + s[6] = _mm256_unpackhi_epi16(s4, s5); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ + \ + s[3] = _mm256_unpacklo_epi16(s6, s7); \ + s[7] = _mm256_unpackhi_epi16(s6, s7); \ + \ + const __m256i res_a = 
convolve(s, coeffs_y); \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + \ + if (w - j > 4) { \ + const __m256i res_b = convolve(s + 4, coeffs_y); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ + \ + if (do_average) { \ + const __m256i data_ref_0 = load_line2_avx2( \ + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \ + const __m256i comp_avg_res = \ + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); \ + \ + const __m256i round_result = convolve_rounding( \ + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ + \ + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ + \ + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ + _mm_storel_epi64( \ + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ + } else { \ + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ + \ + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ + res_1); \ + } \ + } else { \ + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \ + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ + \ + if (do_average) { \ + const __m256i data_ref_0 = load_line2_avx2( \ + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \ + \ + const __m256i comp_avg_res = \ + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); \ + \ + const __m256i round_result = convolve_rounding( \ + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ + \ + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ + \ + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ + _mm_cvtsi128_si32(res_1); \ + \ + } else { \ + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ + \ + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ + res_1); \ + } \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + \ + s[4] = s[5]; \ + s[5] = s[6]; \ + s[6] = s[7]; \ + } +static INLINE void prepare_coeffs_lowbd( + const InterpFilterParams *const filter_params, const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *const filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); + + // right shift all filter co-efficients by 1 to reduce the bits required. + // This extra right shift will be taken care of at the end while rounding + // the result. 
+ // Since all filter co-efficients are even, this change will not affect the + // end result + assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), + _mm_set1_epi16((short)0xffff))); + + const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + + const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); +} + +static INLINE __m256i convolve_lowbd(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); + const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); + const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), + _mm256_add_epi16(res_23, res_67)); + + return res; +} + +static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = _mm256_add_epi16(res_45, res_23); + + return res; +} + +static INLINE __m256i convolve(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); + const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); + const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); + + const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), + _mm256_add_epi32(res_2, res_3)); + + return res; +} + +static INLINE __m256i convolve_4tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); + + const __m256i res = _mm256_add_epi32(res_1, res_2); + return res; +} + +static INLINE __m256i convolve_lowbd_x(const __m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[4]; + + s[0] = _mm256_shuffle_epi8(data, filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + s[2] = _mm256_shuffle_epi8(data, filt[2]); + s[3] = _mm256_shuffle_epi8(data, filt[3]); + + return convolve_lowbd(s, coeffs); +} + +static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[2]; + + s[0] = _mm256_shuffle_epi8(data, 
filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + + return convolve_lowbd_4tap(s, coeffs); +} + +static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, + const __m256i *const res, + const int do_average) { + __m256i d; + if (do_average) { + d = _mm256_load_si256((__m256i *)dst); + d = _mm256_add_epi32(d, *res); + d = _mm256_srai_epi32(d, 1); + } else { + d = *res; + } + _mm256_store_si256((__m256i *)dst, d); +} + +static INLINE __m256i comp_avg(const __m256i *const data_ref_0, + const __m256i *const res_unsigned, + const __m256i *const wt, + const int use_dist_wtd_comp_avg) { + __m256i res; + if (use_dist_wtd_comp_avg) { + const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); + const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); + + const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); + const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); + + const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + + res = _mm256_packs_epi32(res_lo, res_hi); + } else { + const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); + res = _mm256_srai_epi16(wt_res, 1); + } + return res; +} + +static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned, + const __m256i *const offset_const, + const __m256i *const round_const, + const int round_shift) { + const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); + const __m256i res_round = _mm256_srai_epi16( + _mm256_add_epi16(res_signed, *round_const), round_shift); + return res_round; +} + +static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0, + const __m256i *const res_unsigned, + const __m256i *const wt0, + const __m256i *const wt1, + const int use_dist_wtd_comp_avg) { + __m256i res; + if (use_dist_wtd_comp_avg) { + const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); + const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); + const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); + res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); + } else { + const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); + res = _mm256_srai_epi32(wt_res, 1); + } + return res; +} + +static INLINE __m256i highbd_convolve_rounding( + const __m256i *const res_unsigned, const __m256i *const offset_const, + const __m256i *const round_const, const int round_shift) { + const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); + const __m256i res_round = _mm256_srai_epi32( + _mm256_add_epi32(res_signed, *round_const), round_shift); + + return res_round; +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/convolve_common_intrin.h b/libs/libaom/src/aom_dsp/x86/convolve_common_intrin.h new file mode 100644 index 000000000..707bd2d78 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/convolve_common_intrin.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
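
// Scalar model of comp_avg()/highbd_comp_avg() above (a sketch, not part of
// the patch). With distance-weighted compound prediction the two references
// carry weights that sum to 1 << DIST_PRECISION_BITS; otherwise the two
// predictions are averaged with a simple halving shift.
#include <stdint.h>

static int32_t comp_avg_ref(int32_t ref, int32_t cur, int wt0, int wt1,
                            int dist_precision_bits, int use_dist_wtd) {
  if (use_dist_wtd) return (ref * wt0 + cur * wt1) >> dist_precision_bits;
  return (ref + cur) >> 1;
}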
+ */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, + const int do_average) { + __m128i d; + if (do_average) { + d = _mm_load_si128((__m128i *)dst); + d = _mm_add_epi32(d, *res); + d = _mm_srai_epi32(d, 1); + } else { + d = *res; + } + _mm_store_si128((__m128i *)dst, d); +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ diff --git a/libs/libaom/src/aom_dsp/x86/convolve_sse2.h b/libs/libaom/src/aom_dsp/x86/convolve_sse2.h new file mode 100644 index 000000000..385c7c7e1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/convolve_sse2.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m128i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeff = _mm_loadu_si128((__m128i *)filter); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm_shuffle_epi32(coeff, 0xff); +} + +static INLINE __m128i convolve(const __m128i *const s, + const __m128i *const coeffs) { + const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]); + const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]); + const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]); + const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]); + + const __m128i res = + _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3)); + + return res; +} + +static INLINE __m128i convolve_lo_x(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_lo_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_hi_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpackhi_epi8(s[4], 
_mm_setzero_si128()); + ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i comp_avg(const __m128i *const data_ref_0, + const __m128i *const res_unsigned, + const __m128i *const wt, + const int use_dist_wtd_avg) { + __m128i res; + if (use_dist_wtd_avg) { + const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned); + const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned); + + const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt); + const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt); + + const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + + res = _mm_packs_epi32(res_lo, res_hi); + } else { + const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned); + res = _mm_srai_epi16(wt_res, 1); + } + return res; +} + +static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned, + const __m128i *const offset_const, + const __m128i *const round_const, + const int round_shift) { + const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const); + const __m128i res_round = + _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift); + return res_round; +} + +static INLINE __m128i highbd_convolve_rounding_sse2( + const __m128i *const res_unsigned, const __m128i *const offset_const, + const __m128i *const round_const, const int round_shift) { + const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const); + const __m128i res_round = + _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift); + + return res_round; +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/convolve_sse4_1.h b/libs/libaom/src/aom_dsp/x86/convolve_sse4_1.h new file mode 100644 index 000000000..b1a3bb466 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/convolve_sse4_1.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
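The prepare_coeffs/convolve pair above evaluates an 8-tap filter as four pmaddwd operations on interleaved sample pairs; per output pixel this is equivalent to the scalar sketch below (illustrative only, with the rounding shift left to the caller as in the intrinsics version).

#include <stdint.h>

/* Scalar model of convolve(): each _mm_madd_epi16 contributes one
 * f[2k]*s[2k] + f[2k+1]*s[2k+1] term, exactly the pairs grouped here. */
static int32_t convolve_8tap_scalar(const int16_t s[8], const int16_t f[8]) {
  int32_t sum = 0;
  for (int k = 0; k < 8; k += 2)
    sum += (int32_t)f[k] * s[k] + (int32_t)f[k + 1] * s[k + 1];
  return sum;
}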
+ */ + +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void mult_add_store(CONV_BUF_TYPE *const dst, + const __m128i *const res, + const __m128i *const wt0, + const __m128i *const wt1, + const int do_average) { + __m128i d; + if (do_average) { + d = _mm_load_si128((__m128i *)dst); + d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1)); + d = _mm_srai_epi32(d, DIST_PRECISION_BITS); + } else { + d = *res; + } + _mm_store_si128((__m128i *)dst, d); +} + +static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, + const __m128i *const res_unsigned, + const __m128i *const wt0, + const __m128i *const wt1, + const int use_dist_wtd_avg) { + __m128i res; + if (use_dist_wtd_avg) { + const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0); + const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1); + + const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res); + res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS); + } else { + const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned); + res = _mm_srai_epi32(wt_res, 1); + } + return res; +} + +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ diff --git a/libs/libaom/src/aom_dsp/x86/fft_avx2.c b/libs/libaom/src/aom_dsp/x86/fft_avx2.c new file mode 100644 index 000000000..4cccc5f00 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/fft_avx2.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
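add_store (convolve_common_intrin.h) and mult_add_store above are the two compound-buffer write paths: a plain average versus a distance-weighted blend at DIST_PRECISION_BITS precision. Per 32-bit lane they amount to the sketch below; conv_buf_t stands in for CONV_BUF_TYPE and the weight convention is assumed to match comp_avg.

#include <stdint.h>

typedef int32_t conv_buf_t; /* stand-in for CONV_BUF_TYPE, assumed 32-bit */

/* Scalar model of add_store(): plain average with the buffer entry. */
static void add_store_scalar(conv_buf_t *dst, int32_t res, int do_average) {
  *dst = do_average ? (*dst + res) >> 1 : res;
}

/* Scalar model of mult_add_store(): distance-weighted blend; the weights
 * are assumed to sum to 1 << DIST_PRECISION_BITS (4 in AV1). */
static void mult_add_store_scalar(conv_buf_t *dst, int32_t res, int32_t w0,
                                  int32_t w1, int do_average) {
  *dst = do_average ? (*dst * w0 + res * w1) >> 4 : res;
}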
+ */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +extern void aom_transpose_float_sse2(const float *A, float *B, int n); +extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output, + int n); + +// Generate the 1d forward transforms for float using _mm256 +GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); + +void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +// Generate the 1d inverse transforms for float using _mm256 +GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); + +void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2, + aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8); +} + +void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_avx2, aom_ifft1d_16_avx2, + aom_transpose_float_sse2, 8); +} + +void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_avx2, aom_ifft1d_32_avx2, + aom_transpose_float_sse2, 8); +} diff --git a/libs/libaom/src/aom_dsp/x86/fft_sse2.c b/libs/libaom/src/aom_dsp/x86/fft_sse2.c new file mode 100644 index 000000000..6f20a3cc0 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/fft_sse2.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
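aom_fft_2d_gen composes each 2D transform from the generated 1D kernels by row-column decomposition; for real input the packed result is then expanded by the unpack routine (see aom_fft_unpack_2d_output_sse2 in fft_sse2.c below), which relies on the conjugate symmetry X[r][c] = conj(X[(n-r) mod n][(n-c) mod n]) of a real-input FFT. The shape of the composition, as a sketch with a simplified 1D kernel signature (hypothetical; the real helper also threads through the SIMD vector width):

/* Row-column sketch of an n x n 2D FFT built from a 1D kernel. */
static void fft_2d_sketch(const float *input, float *temp, float *output,
                          int n, void (*fft1d)(const float *, float *),
                          void (*transpose)(const float *, float *, int)) {
  for (int r = 0; r < n; ++r)   /* 1D FFT of every row */
    fft1d(input + r * n, temp + r * n);
  transpose(temp, output, n);   /* columns become rows */
  for (int r = 0; r < n; ++r)   /* 1D FFT of the former columns */
    fft1d(output + r * n, temp + r * n);
  transpose(temp, output, n);   /* restore the original orientation */
}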
+ */ + +#include <xmmintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +static INLINE void transpose4x4(const float *A, float *B, const int lda, + const int ldb) { + __m128 row1 = _mm_load_ps(&A[0 * lda]); + __m128 row2 = _mm_load_ps(&A[1 * lda]); + __m128 row3 = _mm_load_ps(&A[2 * lda]); + __m128 row4 = _mm_load_ps(&A[3 * lda]); + _MM_TRANSPOSE4_PS(row1, row2, row3, row4); + _mm_store_ps(&B[0 * ldb], row1); + _mm_store_ps(&B[1 * ldb], row2); + _mm_store_ps(&B[2 * ldb], row3); + _mm_store_ps(&B[3 * ldb], row4); +} + +void aom_transpose_float_sse2(const float *A, float *B, int n) { + for (int y = 0; y < n; y += 4) { + for (int x = 0; x < n; x += 4) { + transpose4x4(A + y * n + x, B + x * n + y, n, n); + } + } +} + +void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) { + const int n2 = n / 2; + output[0] = packed[0]; + output[1] = 0; + output[2 * (n2 * n)] = packed[n2 * n]; + output[2 * (n2 * n) + 1] = 0; + + output[2 * n2] = packed[n2]; + output[2 * n2 + 1] = 0; + output[2 * (n2 * n + n2)] = packed[n2 * n + n2]; + output[2 * (n2 * n + n2) + 1] = 0; + + for (int c = 1; c < n2; ++c) { + output[2 * (0 * n + c)] = packed[c]; + output[2 * (0 * n + c) + 1] = packed[c + n2]; + output[2 * (n2 * n + c) + 0] = packed[n2 * n + c]; + output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2]; + } + for (int r = 1; r < n2; ++r) { + output[2 * (r * n + 0)] = packed[r * n]; + output[2 * (r * n + 0) + 1] = packed[(r + n2) * n]; + output[2 * (r * n + n2) + 0] = packed[r * n + n2]; + output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2]; + + for (int c = 1; c < AOMMIN(n2, 4); ++c) { + output[2 * (r * n + c)] = + packed[r * n + c] - packed[(r + n2) * n + c + n2]; + output[2 * (r * n + c) + 1] = + packed[(r + n2) * n + c] + packed[r * n + c + n2]; + } + + for (int c = 4; c < n2; c += 4) { + __m128 real1 = _mm_load_ps(packed + r * n + c); + __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2); + __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c); + __m128 imag2 = _mm_load_ps(packed + r * n + c + n2); + real1 = _mm_sub_ps(real1, real2); + imag1 = _mm_add_ps(imag1, imag2); + _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1)); + _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1)); + } + + int r2 = r + n2; + int r3 = n - r2; + output[2 * (r2 * n + 0)] = packed[r3 * n]; + output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n]; + output[2 * (r2 * n + n2)] = packed[r3 * n + n2]; + output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2]; + for (int c = 1; c < AOMMIN(4, n2); ++c) { + output[2 * (r2 * n + c)] = + packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2]; + output[2 * (r2 * n + c) + 1] = + -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2]; + } + for (int c = 4; c < n2; c += 4) { + __m128 real1 = _mm_load_ps(packed + r3 * n + c); + __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2); + __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c); + __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2); + real1 = _mm_add_ps(real1, real2); + imag1 = _mm_sub_ps(imag2, imag1); + _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1)); + _mm_store_ps(output + 2 * (r2 * n + c + 2), + _mm_unpackhi_ps(real1, imag1)); + } + } +} + +// Generate definitions for 1d transforms using float and __mm128 +GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps); +GEN_FFT_8(static INLINE void, sse2, float, __m128, 
_mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); + +void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +// Generate definitions for 1d inverse transforms using float and mm128 +GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps); +GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); + +void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2, + aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4); +} + +void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2, + aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4); +} + +void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_sse2, aom_ifft1d_16_sse2, + aom_transpose_float_sse2, 4); +} + +void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_sse2, aom_ifft1d_32_sse2, + aom_transpose_float_sse2, 4); +} diff --git a/libs/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h b/libs/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h new file mode 100644 index 000000000..89fe1899b --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/fwd_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_ports/mem.h" + +// TODO(jingning) The high bit-depth functions need rework for performance. +// After we properly fix the high bit-depth function implementations, this +// file's dependency should be substantially simplified. +#if DCT_HIGH_BIT_DEPTH +#define ADD_EPI16 _mm_adds_epi16 +#define SUB_EPI16 _mm_subs_epi16 + +#else +#define ADD_EPI16 _mm_add_epi16 +#define SUB_EPI16 _mm_sub_epi16 +#endif + +static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, + __m128i *in1) { + // Constants + // These are the coefficients used for the multiplies. + // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), + // where cospi_N_64 = cos(N pi /64) + const __m128i k__cospi_A = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_B = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_C = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); + const __m128i k__cospi_D = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); + const __m128i k__cospi_E = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_F = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_G = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); + const __m128i k__cospi_H = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); + + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + // This second rounding constant saves doing some extra adds at the end + const __m128i k__DCT_CONST_ROUNDING2 = + _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); + const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + + // Load inputs. + *in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + *in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + *in1 = _mm_unpacklo_epi64( + *in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + *in0 = _mm_unpacklo_epi64( + *in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + // multiply by 16 to give some extra precision + *in0 = _mm_slli_epi16(*in0, 4); + *in1 = _mm_slli_epi16(*in1, 4); + // if (i == 0 && input[0]) input[0] += 1; + // add 1 to the upper left pixel if it is non-zero, which helps reduce + // the round-trip error + { + // The mask will only contain whether the first value is zero, all + // other comparison will fail as something shifted by 4 (above << 4) + // can never be equal to one. 
To increment in the non-zero case, we + // add the mask and one for the first element: + // - if zero, mask = -1, v = v - 1 + 1 = v + // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 + __m128i mask = _mm_cmpeq_epi16(*in0, k__nonzero_bias_a); + *in0 = _mm_add_epi16(*in0, mask); + *in0 = _mm_add_epi16(*in0, k__nonzero_bias_b); + } + // There are 4 total stages, alternating between an add/subtract stage + // followed by an multiply-and-add stage. + { + // Stage 1: Add/subtract + + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + const __m128i r0 = _mm_unpacklo_epi16(*in0, *in1); + const __m128i r1 = _mm_unpackhi_epi16(*in0, *in1); + // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] + // r1 = [iC i8 iD i9 iE iA iF iB] + const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); + const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); + // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] + // r3 = [iC i8 iD i9 iF iB iE iA] + + const __m128i t0 = _mm_add_epi16(r2, r3); + const __m128i t1 = _mm_sub_epi16(r2, r3); + // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] + // t1 = [aC a8 aD a9 aF aB aE aA] + + // Stage 2: multiply by constants (which gets us into 32 bits). + // The constants needed here are: + // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] + // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] + // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] + // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); + // Then add and right-shift to get back to 16-bit range + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // w0 = [b0 b1 b7 b6] + // w1 = [b8 b9 bF bE] + // w2 = [b4 b5 b3 b2] + // w3 = [bC bD bB bA] + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); + + // x0 = [b0 b1 b7 b6 b8 b9 bF bE] + // x1 = [b4 b5 b3 b2 bC bD bB bA] + *in0 = _mm_shuffle_epi32(x0, 0xD8); + *in1 = _mm_shuffle_epi32(x1, 0x8D); + // in0 = [b0 b1 b8 b9 b7 b6 bF bE] + // in1 = [b3 b2 bB bA b4 b5 bC bD] + } + { + // vertical DCTs finished. Now we do the horizontal DCTs. + // Stage 3: Add/subtract + + const __m128i t0 = ADD_EPI16(*in0, *in1); + const __m128i t1 = SUB_EPI16(*in0, *in1); + + // Stage 4: multiply by constants (which gets us into 32 bits). + { + // The constants needed here are: + // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] + // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] + // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] + // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); + const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); + const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); + // Then add and right-shift to get back to 16-bit range + // but this combines the final right-shift as well to save operations + // This unusual rounding operations is to maintain bit-accurate + // compatibility with the c version of this function which has two + // rounding steps in a row. 
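The combined constant works because of an exact identity: with r = DCT_CONST_ROUNDING = 1 << (DCT_CONST_BITS - 1), the two-step (((x + r) >> b) + 1) >> 2 equals (x + 3r) >> (b + 2), since the +1 of the second rounding is worth 1 << b = 2r at the wider precision. A small check of that equivalence (assumes arithmetic right shift, i.e. floor division, matching the intrinsics):

#include <assert.h>
#include <stdint.h>

static void check_rounding_fold(int32_t x, int b) {
  const int32_t r = 1 << (b - 1); /* DCT_CONST_ROUNDING for DCT_CONST_BITS b */
  const int32_t two_step = (((x + r) >> b) + 1) >> 2;
  const int32_t one_step = (x + 3 * r) >> (b + 2); /* ROUNDING2 path */
  assert(two_step == one_step);
}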
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); + // w0 = [o0 o4 o8 oC] + // w1 = [o2 o6 oA oE] + // w2 = [o1 o5 o9 oD] + // w3 = [o3 o7 oB oF] + // remember the o's are numbered according to the correct output location + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); + { + // x0 = [o0 o4 o8 oC o2 o6 oA oE] + // x1 = [o1 o5 o9 oD o3 o7 oB oF] + const __m128i y0 = _mm_unpacklo_epi16(x0, x1); + const __m128i y1 = _mm_unpackhi_epi16(x0, x1); + // y0 = [o0 o1 o4 o5 o8 o9 oC oD] + // y1 = [o2 o3 o6 o7 oA oB oE oF] + *in0 = _mm_unpacklo_epi32(y0, y1); + // in0 = [o0 o1 o2 o3 o4 o5 o6 o7] + *in1 = _mm_unpackhi_epi32(y0, y1); + // in1 = [o8 o9 oA oB oC oD oE oF] + } + } + } +} + +void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { + // This 2D transform implements 4 vertical 1D transforms followed + // by 4 horizontal 1D transforms. The multiplies and adds are as given + // by Chen, Smith and Fralick ('77). The commands for moving the data + // around have been minimized by hand. + // For the purposes of the comments, the 16 inputs are referred to at i0 + // through iF (in raster order), intermediate variables are a0, b0, c0 + // through f, and correspond to the in-place computations mapped to input + // locations. The outputs, o0 through oF are labeled according to the + // output locations. + __m128i in0, in1; + FDCT4x4_2D_HELPER(input, stride, &in0, &in1); + + // Post-condition (v + 1) >> 2 is now incorporated into previous + // add and right-shift commands. Only 2 store instructions needed + // because we are using the fact that 1/3 are stored just after 0/2. + storeu_output(&in0, output + 0 * 4); + storeu_output(&in1, output + 2 * 4); +} + +void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1; + FDCT4x4_2D_HELPER(input, stride, &in0, &in1); + _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); + _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); +} + +void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { + int pass; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. 
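Each pair constant below drives one pmaddwd so that a lane pair (a, b) becomes the plane rotation a * c0 + b * c1, followed by dct_const_round_shift. In scalar form (DCT_CONST_BITS is 14 in this codebase's fixed-point convention):

#include <stdint.h>

/* One pmaddwd butterfly per lane pair: pair_set_epi16(c0, c1) against
 * interleaved (a, b), then round-shift back into 16-bit range. */
static int16_t butterfly_scalar(int16_t a, int16_t b, int16_t c0, int16_t c1) {
  const int32_t rounding = 1 << (14 - 1); /* DCT_CONST_ROUNDING */
  return (int16_t)(((int32_t)a * c0 + (int32_t)b * c1 + rounding) >> 14);
}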
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + // Load input + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + // Pre-condition input (shift by two) + in0 = _mm_slli_epi16(in0, 2); + in1 = _mm_slli_epi16(in1, 2); + in2 = _mm_slli_epi16(in2, 2); + in3 = _mm_slli_epi16(in3, 2); + in4 = _mm_slli_epi16(in4, 2); + in5 = _mm_slli_epi16(in5, 2); + in6 = _mm_slli_epi16(in6, 2); + in7 = _mm_slli_epi16(in7, 2); + + // We do two passes, first the columns, then the rows. The results of the + // first pass are transposed so that the same column code can be reused. The + // results of the second pass are also transposed so that the rows (processed + // as columns) are put back in row positions. + for (pass = 0; pass < 2; pass++) { + // To store results of each pass before the transpose. 
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7; + // Add/subtract + const __m128i q0 = ADD_EPI16(in0, in7); + const __m128i q1 = ADD_EPI16(in1, in6); + const __m128i q2 = ADD_EPI16(in2, in5); + const __m128i q3 = ADD_EPI16(in3, in4); + const __m128i q4 = SUB_EPI16(in3, in4); + const __m128i q5 = SUB_EPI16(in2, in5); + const __m128i q6 = SUB_EPI16(in1, in6); + const __m128i q7 = SUB_EPI16(in0, in7); +#if DCT_HIGH_BIT_DEPTH + if (pass == 1) { + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } + } +#endif // DCT_HIGH_BIT_DEPTH + // Work on first four results + { + // Add/subtract + const __m128i r0 = ADD_EPI16(q0, q3); + const __m128i r1 = ADD_EPI16(q1, q2); + const __m128i r2 = SUB_EPI16(q1, q2); + const __m128i r3 = SUB_EPI16(q0, q3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res0 = _mm_packs_epi32(w0, w1); + res4 = _mm_packs_epi32(w2, w3); + res2 = _mm_packs_epi32(w4, w5); + res6 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); + const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); + const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); + const __m128i e3 = 
_mm_madd_epi16(d1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); + const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); + const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); + const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); + const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); + const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); + const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); + const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); + // Combine + const __m128i r0 = _mm_packs_epi32(s0, s1); + const __m128i r1 = _mm_packs_epi32(s2, s3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&r0, &r1); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // Add/subtract + const __m128i x0 = ADD_EPI16(q4, r0); + const __m128i x1 = SUB_EPI16(q4, r0); + const __m128i x2 = SUB_EPI16(q7, r1); + const __m128i x3 = ADD_EPI16(q7, r1); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res1 = _mm_packs_epi32(w0, w1); + res7 = _mm_packs_epi32(w2, w3); + res5 = _mm_packs_epi32(w4, w5); + res3 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + } + // Transpose the 8x8. 
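The three unpack stages that follow implement a full 8x8 transpose of 16-bit lanes: each stage doubles the interleaving granularity (16, then 32, then 64 bits), the standard log2(8) = 3 level SIMD transpose. Its net effect, in scalar form:

#include <stdint.h>

/* Scalar equivalent of the unpack-based transpose below. */
static void transpose_8x8_scalar(const int16_t in[8][8], int16_t out[8][8]) {
  for (int r = 0; r < 8; ++r)
    for (int c = 0; c < 8; ++c) out[c][r] = in[r][c];
}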
+ { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); + const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); + const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); + const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); + const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); + const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); + const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); + const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } + // Post-condition output and store it + { + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const __m128i sign_in0 = _mm_srai_epi16(in0, 15); + const __m128i sign_in1 = _mm_srai_epi16(in1, 15); + const __m128i sign_in2 = _mm_srai_epi16(in2, 15); + const __m128i sign_in3 = _mm_srai_epi16(in3, 15); + const __m128i sign_in4 = _mm_srai_epi16(in4, 15); + const __m128i sign_in5 = _mm_srai_epi16(in5, 15); + const __m128i sign_in6 = _mm_srai_epi16(in6, 15); + const __m128i sign_in7 = _mm_srai_epi16(in7, 15); + in0 = _mm_sub_epi16(in0, sign_in0); + in1 = _mm_sub_epi16(in1, sign_in1); + in2 = _mm_sub_epi16(in2, sign_in2); + in3 = _mm_sub_epi16(in3, sign_in3); + in4 = _mm_sub_epi16(in4, sign_in4); + in5 = _mm_sub_epi16(in5, sign_in5); + in6 = _mm_sub_epi16(in6, sign_in6); + in7 = _mm_sub_epi16(in7, sign_in7); + in0 = _mm_srai_epi16(in0, 1); + in1 = _mm_srai_epi16(in1, 1); + in2 = _mm_srai_epi16(in2, 1); + in3 = _mm_srai_epi16(in3, 1); + in4 = _mm_srai_epi16(in4, 1); + in5 = _mm_srai_epi16(in5, 1); + in6 = _mm_srai_epi16(in6, 1); + in7 = _mm_srai_epi16(in7, 1); + // store results + store_output(&in0, (output + 0 * 8)); + store_output(&in1, (output + 1 * 8)); + store_output(&in2, (output + 2 * 8)); + store_output(&in3, (output + 3 * 8)); + store_output(&in4, (output + 4 * 
8)); + store_output(&in5, (output + 5 * 8)); + store_output(&in6, (output + 6 * 8)); + store_output(&in7, (output + 7 * 8)); + } +} + +#undef ADD_EPI16 +#undef SUB_EPI16 diff --git a/libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c b/libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c new file mode 100644 index 000000000..0e4fb8046 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/fwd_txfm_sse2.h" + +#define DCT_HIGH_BIT_DEPTH 0 +#define FDCT4x4_2D_HELPER fdct4x4_helper +#define FDCT4x4_2D aom_fdct4x4_sse2 +#define FDCT4x4_2D_LP aom_fdct4x4_lp_sse2 +#define FDCT8x8_2D aom_fdct8x8_sse2 +#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" +#undef FDCT4x4_2D_HELPER +#undef FDCT4x4_2D +#undef FDCT4x4_2D_LP +#undef FDCT8x8_2D + +#if CONFIG_AV1_HIGHBITDEPTH + +#undef DCT_HIGH_BIT_DEPTH +#define DCT_HIGH_BIT_DEPTH 1 +#define FDCT8x8_2D aom_highbd_fdct8x8_sse2 +#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT +#undef FDCT8x8_2D + +#endif diff --git a/libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h b/libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h new file mode 100644 index 000000000..ab3cd9155 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
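The post-condition in FDCT8x8_2D above divides by two with (n - (n >> 15)) >> 1 rather than a bare shift because an arithmetic shift floors, while the C reference it must match truncates toward zero; n >> 15 is -1 exactly for negative 16-bit n, so subtracting it biases odd negative values up by one first. A quick check:

#include <assert.h>
#include <stdint.h>

static void check_div2_trunc(int16_t n) {
  const int16_t d = (int16_t)((n - (n >> 15)) >> 1); /* assumes arithmetic >> */
  assert(d == n / 2); /* C division truncates toward zero */
}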
+ */ + +#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { + __m128i buf0, buf1; + buf0 = _mm_mul_epu32(a, b); + a = _mm_srli_epi64(a, 32); + b = _mm_srli_epi64(b, 32); + buf1 = _mm_mul_epu32(a, b); + return _mm_add_epi64(buf0, buf1); +} + +static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { + __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm_unpacklo_epi64(buf0, buf1); +} + +static INLINE int check_epi16_overflow_x2(const __m128i *preg0, + const __m128i *preg1) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16(0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + cmp0 = _mm_or_si128(cmp0, cmp1); + return _mm_movemask_epi8(cmp0); +} + +static INLINE int check_epi16_overflow_x4(const __m128i *preg0, + const __m128i *preg1, + const __m128i *preg2, + const __m128i *preg3) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16(0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow), + _mm_cmpeq_epi16(*preg2, min_overflow)); + __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow), + _mm_cmpeq_epi16(*preg3, min_overflow)); + cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)); + return _mm_movemask_epi8(cmp0); +} + +static INLINE int check_epi16_overflow_x8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x12( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) res1 = 
check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + } + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) { + res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + if (!res0) { + res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19); + if (!res1) { + res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23); + if (!res0) { + res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27); + if (!res1) + res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31); + } + } + } + } + } + return res0 + res1; +} + +static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_store_si128((__m128i *)(dst_ptr), out0); + _mm_store_si128((__m128i *)(dst_ptr + 4), out1); +} + +static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_storeu_si128((__m128i *)(dst_ptr), out0); + _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/libs/libaom/src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm new file mode 100644 index 000000000..c1fb259a1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -0,0 +1,379 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
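The check_epi16_overflow_* helpers above work because the high-bitdepth build uses saturating adds (ADD_EPI16 is _mm_adds_epi16 there): any lane that clipped sits exactly at INT16_MAX or INT16_MIN, so comparing against those two sentinels flags potential overflow and the caller falls back to the exact C path (aom_highbd_fdct8x8_c). Per lane:

#include <stdint.h>

/* Scalar model of the per-lane test inside check_epi16_overflow_x2(). */
static int lane_maybe_overflowed(int16_t v) {
  return v == INT16_MAX || v == INT16_MIN; /* the 0x7fff / 0x8000 sentinels */
}

Note that a legal result can land on a sentinel value, so the test is conservative; a hit only routes the block to the slower exact implementation.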
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 +%endmacro + +TRANSFORM_COEFFS 11585, 11585 +TRANSFORM_COEFFS 15137, 6270 +TRANSFORM_COEFFS 16069, 3196 +TRANSFORM_COEFFS 9102, 13623 + +%macro STORE_OUTPUT 2 ; index, result + ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + ; _mm_store_si128((__m128i *)(dst_ptr), out0); + ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1); + pxor m11, m11 + pcmpgtw m11, m%2 + movdqa m12, m%2 + punpcklwd m%2, m11 + punpckhwd m12, m11 + mova [outputq + 4*%1 + 0], m%2 + mova [outputq + 4*%1 + 16], m12 +%endmacro + +SECTION .text + +%if ARCH_X86_64 +INIT_XMM ssse3 +cglobal fdct8x8, 3, 5, 13, input, output, stride + + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] + + lea r3, [2 * strideq] + lea r4, [4 * strideq] + mova m0, [inputq] + mova m1, [inputq + r3] + lea inputq, [inputq + r4] + mova m2, [inputq] + mova m3, [inputq + r3] + lea inputq, [inputq + r4] + mova m4, [inputq] + mova m5, [inputq + r3] + lea inputq, [inputq + r4] + mova m6, [inputq] + mova m7, [inputq + r3] + + ; left shift by 2 to increase forward transformation precision + psllw m0, 2 + psllw m1, 2 + psllw m2, 2 + psllw m3, 2 + psllw m4, 2 + psllw m5, 2 + psllw m6, 2 + psllw m7, 2 + + ; column transform + ; stage 1 + paddw m10, m0, m7 + psubw m0, m7 + + paddw m9, m1, m6 + psubw m1, m6 + + paddw m7, m2, m5 + psubw m2, m5 + + paddw m6, m3, m4 + psubw m3, m4 + + ; stage 2 + paddw m5, m9, m7 + psubw m9, m7 + + paddw m4, m10, m6 + psubw m10, m6 + + paddw m7, m1, m2 + psubw m1, m2 + + ; stage 3 + paddw m6, m4, m5 + psubw m4, m5 + + pmulhrsw m1, m12 + pmulhrsw m7, m12 + + ; sin(pi / 8), cos(pi / 8) + punpcklwd m2, m10, m9 + punpckhwd m10, m9 + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] + paddd m5, m8 + paddd m2, m8 + paddd m9, m8 + paddd m10, m8 + psrad m5, 14 + psrad m2, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m5, m9 + packssdw m2, m10 + + pmulhrsw m6, m12 + pmulhrsw m4, m12 + + paddw m9, m3, m1 + psubw m3, m1 + + paddw m10, m0, m7 + psubw m0, m7 + + ; stage 4 + ; sin(pi / 16), cos(pi / 16) + punpcklwd m1, m10, m9 + punpckhwd m10, m9 + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m1, m8 + paddd m9, m8 + paddd m10, m8 + psrad m7, 14 + psrad m1, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m7, m9 + packssdw m1, m10 + + ; sin(3 * pi / 16), cos(3 * pi / 16) + punpcklwd m11, m0, m3 + punpckhwd m0, m3 + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] + paddd m9, m8 + paddd m11, m8 + paddd m3, m8 + paddd m0, m8 + psrad m9, 14 + psrad m11, 14 + psrad m3, 14 + psrad m0, 14 + packssdw m9, m3 + packssdw m11, m0 + + ; transpose + ; stage 1 + punpcklwd m0, m6, m7 + punpcklwd m3, m5, m11 + punpckhwd m6, m7 + punpckhwd m5, m11 + punpcklwd m7, m4, m9 + punpcklwd m10, m2, m1 + punpckhwd m4, m9 + punpckhwd m2, m1 + + ; stage 2 + punpckldq m9, m0, m3 + punpckldq m1, m6, m5 + punpckhdq m0, m3 + punpckhdq 
m6, m5 + punpckldq m3, m7, m10 + punpckldq m5, m4, m2 + punpckhdq m7, m10 + punpckhdq m4, m2 + + ; stage 3 + punpcklqdq m10, m9, m3 + punpckhqdq m9, m3 + punpcklqdq m2, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m7, m6, m4 + punpckhqdq m6, m4 + + ; row transform + ; stage 1 + paddw m5, m10, m6 + psubw m10, m6 + + paddw m4, m9, m7 + psubw m9, m7 + + paddw m6, m2, m1 + psubw m2, m1 + + paddw m7, m0, m3 + psubw m0, m3 + + ;stage 2 + paddw m1, m5, m7 + psubw m5, m7 + + paddw m3, m4, m6 + psubw m4, m6 + + paddw m7, m9, m2 + psubw m9, m2 + + ; stage 3 + punpcklwd m6, m1, m3 + punpckhwd m1, m3 + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] + paddd m2, m8 + paddd m6, m8 + paddd m3, m8 + paddd m1, m8 + psrad m2, 14 + psrad m6, 14 + psrad m3, 14 + psrad m1, 14 + packssdw m2, m3 + packssdw m6, m1 + + pmulhrsw m7, m12 + pmulhrsw m9, m12 + + punpcklwd m3, m5, m4 + punpckhwd m5, m4 + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] + paddd m1, m8 + paddd m3, m8 + paddd m4, m8 + paddd m5, m8 + psrad m1, 14 + psrad m3, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m1, m4 + packssdw m3, m5 + + paddw m4, m0, m9 + psubw m0, m9 + + paddw m5, m10, m7 + psubw m10, m7 + + ; stage 4 + punpcklwd m9, m5, m4 + punpckhwd m5, m4 + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m9, m8 + paddd m4, m8 + paddd m5, m8 + psrad m7, 14 + psrad m9, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m7, m4 + packssdw m9, m5 + + punpcklwd m4, m10, m0 + punpckhwd m10, m0 + pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] + paddd m5, m8 + paddd m4, m8 + paddd m0, m8 + paddd m10, m8 + psrad m5, 14 + psrad m4, 14 + psrad m0, 14 + psrad m10, 14 + packssdw m5, m0 + packssdw m4, m10 + + ; transpose + ; stage 1 + punpcklwd m0, m2, m7 + punpcklwd m10, m1, m4 + punpckhwd m2, m7 + punpckhwd m1, m4 + punpcklwd m7, m6, m5 + punpcklwd m4, m3, m9 + punpckhwd m6, m5 + punpckhwd m3, m9 + + ; stage 2 + punpckldq m5, m0, m10 + punpckldq m9, m2, m1 + punpckhdq m0, m10 + punpckhdq m2, m1 + punpckldq m10, m7, m4 + punpckldq m1, m6, m3 + punpckhdq m7, m4 + punpckhdq m6, m3 + + ; stage 3 + punpcklqdq m4, m5, m10 + punpckhqdq m5, m10 + punpcklqdq m3, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m10, m9, m1 + punpckhqdq m9, m1 + punpcklqdq m7, m2, m6 + punpckhqdq m2, m6 + + psraw m1, m4, 15 + psraw m6, m5, 15 + psraw m8, m3, 15 + psraw m11, m0, 15 + + psubw m4, m1 + psubw m5, m6 + psubw m3, m8 + psubw m0, m11 + + psraw m4, 1 + psraw m5, 1 + psraw m3, 1 + psraw m0, 1 + + psraw m1, m10, 15 + psraw m6, m9, 15 + psraw m8, m7, 15 + psraw m11, m2, 15 + + psubw m10, m1 + psubw m9, m6 + psubw m7, m8 + psubw m2, m11 + + psraw m10, 1 + psraw m9, 1 + psraw m7, 1 + psraw m2, 1 + + STORE_OUTPUT 0, 4 + STORE_OUTPUT 8, 5 + STORE_OUTPUT 16, 3 + STORE_OUTPUT 24, 0 + STORE_OUTPUT 32, 10 + STORE_OUTPUT 40, 9 + STORE_OUTPUT 48, 7 + STORE_OUTPUT 56, 2 + + RET +%endif diff --git a/libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c new file mode 100644 index 000000000..c500b0a26 --- /dev/null +++ 
b/libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" + +#include "av1/encoder/av1_quantize.h" + +static INLINE void highbd_load_b_values_avx2( + const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, + __m256i *round, const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + __m256i *shift) { + *zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_sub_epi32(*zbin, _mm256_set1_epi32(1)); + *round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr)); + *quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr)); + *dequant = + _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr)); + *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)shift_ptr)); +} + +static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask, + const int16_t *iscan_ptr, + int *is_found, __m256i *mask) { + __m256i temp_mask = _mm256_setzero_si256(); + if (_mm256_movemask_epi8(*cmp_mask)) { + __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr)); + temp_mask = _mm256_and_si256(*cmp_mask, iscan); + *is_found = 1; + } + *mask = _mm256_max_epi16(temp_mask, *mask); +} + +static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1, + __m256i *threshold, + const int16_t *iscan_ptr, + int *is_found, __m256i *mask) { + __m256i coeff[2], cmp_mask0, cmp_mask1; + coeff[0] = _mm256_slli_epi32(*qcoeff0, AOM_QM_BITS); + cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm256_slli_epi32(*qcoeff1, AOM_QM_BITS); + cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]); + cmp_mask0 = + _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y, + __m256i *p, const int shift) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + + prod_lo = _mm256_srli_epi64(prod_lo, shift); + prod_hi = _mm256_srli_epi64(prod_hi, shift); + + prod_hi = _mm256_slli_epi64(prod_hi, 32); + *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa); +} + +static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff, + const __m256i *round, + const __m256i *quant, + const __m256i *shift, + const int *log_scale) { + __m256i tmp, qcoeff; + qcoeff = _mm256_add_epi32(*coeff, *round); + highbd_mul_shift_avx2(&qcoeff, quant, &tmp, 16); + qcoeff = _mm256_add_epi32(tmp, qcoeff); + highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale); +} + +static INLINE __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff, + __m256i dequant) { + return _mm256_mullo_epi32(qcoeff, dequant); +} + +static 
INLINE __m256i highbd_calculate_dqcoeff_log_scale_avx2( + __m256i qcoeff, __m256i dequant, const int log_scale) { + __m256i abs_coeff = _mm256_abs_epi32(qcoeff); + highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale); + return _mm256_sign_epi32(abs_coeff, qcoeff); +} + +static INLINE void highbd_store_coefficients_avx2(__m256i coeff0, + __m256i coeff1, + tran_low_t *coeff_ptr) { + _mm256_store_si256((__m256i *)(coeff_ptr), coeff0); + _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff1); +} + +void aom_highbd_quantize_b_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff0, qcoeff0, coeff1, qcoeff1; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + int prescan_add[2]; + int thresh[2]; + const int log_scale = 0; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + highbd_load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, + &quant, dequant_ptr, &dequant, quant_shift_ptr, + &shift); + + // Do DC and first 15 AC. 
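highbd_mul_shift_avx2 above performs a full 32x32 -> 64 bit multiply and right shift per lane; the even/odd lane split exists because _mm256_mul_epi32 only multiplies the even 32-bit element of each 64-bit half. For the non-negative operands the quantizer feeds it (absolute values plus rounding), it reduces to:

#include <stdint.h>

/* Scalar model of highbd_mul_shift_avx2() for non-negative inputs. */
static int32_t mul_shift_scalar(int32_t x, int32_t y, int shift) {
  return (int32_t)(((int64_t)x * (int64_t)y) >> shift);
}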
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, + &mask0); + __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm256_unpackhi_epi64(zbin, zbin); + __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + // Reinsert signs + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + // Mask out zbin threshold coeffs + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr); + coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr); + } + + // AC only loop. 
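The movemask test above implements a whole-group skip: when no magnitude in a group of 16 clears the zero-bin, every output in the group is zero and the multiply work is bypassed. A scalar sketch of the same dead-zone check (illustrative, not part of the patch):

    #include <stdint.h>

    /* Returns 1 when every coefficient falls inside the zero-bin dead zone.
     * The vector code compares abs(coeff) > zbin - 1, i.e. abs(coeff) >= zbin,
     * having pre-decremented zbin during setup. */
    static int group_in_dead_zone(const int32_t *coeff, int n,
                                  int32_t zbin_dc, int32_t zbin_ac) {
      for (int i = 0; i < n; ++i) {
        const int32_t zbin = (i == 0) ? zbin_dc : zbin_ac;
        const int32_t a = coeff[i] < 0 ? -coeff[i] : coeff[i];
        if (a > zbin - 1) return 0;  /* would survive quantization */
      }
      return 1;
    }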
+ while (index < n_coeffs) { + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index); + coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant); + coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_32x32_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; 
+ int is_found0 = 0, is_found1 = 0; + int eob = -1; + const int log_scale = 1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff0, qcoeff0, coeff1, qcoeff1; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + const __m256i one = _mm256_set1_epi32(1); + const __m256i log_scale_vec = _mm256_set1_epi32(log_scale); + int prescan_add[2]; + int thresh[2]; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); + round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr)); + quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr)); + dequant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr)); + shift = + _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_shift_ptr)); + + // Shift with rounding. + zbin = _mm256_add_epi32(zbin, log_scale_vec); + round = _mm256_add_epi32(round, log_scale_vec); + zbin = _mm256_srli_epi32(zbin, log_scale); + round = _mm256_srli_epi32(round, log_scale); + zbin = _mm256_sub_epi32(zbin, one); + + // Do DC and first 15 AC. + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, + &mask0); + __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm256_permute2x128_si256(zbin, zbin, 0x11); + __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_permute2x128_si256(round, round, 0x11); + quant = _mm256_permute2x128_si256(quant, quant, 0x11); + shift = _mm256_permute2x128_si256(shift, shift, 0x11); + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11); + } else { + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + round = _mm256_permute2x128_si256(round, round, 0x11); + quant = _mm256_permute2x128_si256(quant, quant, 0x11); + shift = _mm256_permute2x128_si256(shift, shift, 0x11); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + // Reinsert signs + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + // Mask out zbin threshold coeffs + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr); + coeff0 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, 
log_scale); + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11); + coeff1 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index); + coeff0 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale); + coeff1 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c new file mode 100644 
index 000000000..8f31f3596 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c @@ -0,0 +1,732 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" +#include "av1/encoder/av1_quantize.h" + +static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi64(a, sign); +} + +static INLINE void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y, + __m128i *p, const int shift) { + __m128i sign = _mm_srai_epi32(*y, 31); + __m128i sign_lo = _mm_unpacklo_epi32(sign, sign); + __m128i sign_hi = _mm_unpackhi_epi32(sign, sign); + __m128i abs_y = invert_sign_32_sse2(*y, sign); + __m128i prod_lo = _mm_mul_epu32(*x, abs_y); + __m128i prod_hi = _mm_srli_epi64(*x, 32); + const __m128i mult_hi = _mm_srli_epi64(abs_y, 32); + prod_hi = _mm_mul_epu32(prod_hi, mult_hi); + prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo); + prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi); + + prod_lo = _mm_srli_epi64(prod_lo, shift); + const __m128i mask = _mm_set_epi32(0, -1, 0, -1); + prod_lo = _mm_and_si128(prod_lo, mask); + prod_hi = _mm_srli_epi64(prod_hi, shift); + + prod_hi = _mm_slli_epi64(prod_hi, 32); + *p = _mm_or_si128(prod_lo, prod_hi); +} + +static INLINE void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round, + const __m128i *quant, + const __m128i *shift, + const int *log_scale) { + __m128i tmp, qcoeff; + qcoeff = _mm_add_epi32(*coeff, *round); + highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16); + qcoeff = _mm_add_epi32(tmp, qcoeff); + highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale); +} + +static INLINE void highbd_update_mask1(__m128i *cmp_mask0, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i temp_mask = _mm_setzero_si128(); + if (_mm_movemask_epi8(*cmp_mask0)) { + __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); + __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); + temp_mask = mask0; + *is_found = 1; + } + *mask = _mm_max_epi16(temp_mask, *mask); +} + +static INLINE void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, + __m128i *threshold, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i coeff[2], cmp_mask0, cmp_mask1; + + coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS); + cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS); + cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); + + cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1); + + highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant, + const int log_scale) { + __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31); + __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign); + highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale); + return invert_sign_32_sse2(abs_coeff, coeff_sign); +} + +void
aom_highbd_quantize_b_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 0; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 7 AC.
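The unpacklo/srai pairs above are the SSE2 substitute for _mm_cvtepi16_epi32, which needs SSE4.1: interleaving each 16-bit value with its own arithmetic sign mask widens it to 32 bits. A minimal sketch of the idiom (function name ours):

    #include <emmintrin.h>

    /* Widen the low four int16 lanes of v to int32 using SSE2 only. */
    static __m128i widen_low4_epi16(__m128i v) {
      const __m128i sign = _mm_srai_epi16(v, 15);  /* 0x0000 or 0xffff per lane */
      return _mm_unpacklo_epi16(v, sign);
    }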
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
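invert_sign_32_sse2 (from quantize_x86.h) relies on the two's-complement identity used throughout this loop: with s = a >> 31, (a ^ s) - s yields abs(a), and applying the same transform again with the saved mask restores the original sign. Scalar form of the identity (sketch):

    #include <stdint.h>

    /* Conditional negate: sign must be 0 or -1.
     * abs:     invert_sign32(a, a >> 31)
     * restore: invert_sign32(abs_value, original >> 31) */
    static int32_t invert_sign32(int32_t a, int32_t sign) {
      return (a ^ sign) - sign;
    }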
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_32x32_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 1; + int 
non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i log_scale_vec = _mm_set1_epi32(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + + // Shift with rounding. + zbin = _mm_add_epi32(zbin, log_scale_vec); + round = _mm_add_epi32(round, log_scale_vec); + zbin = _mm_srli_epi32(zbin, log_scale); + round = _mm_srli_epi32(round, log_scale); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 7 AC.
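A small point worth noting in the shift-with-rounding block above: for log_scale in {1, 2}, the rounding constant (1 << log_scale) >> 1 equals log_scale itself, which is why the code can reuse the broadcast log_scale_vec as the rounding bias before the right shift. Scalar equivalent (sketch):

    #include <stdint.h>

    /* ROUND_POWER_OF_TWO(v, n) == (v + ((1 << n) >> 1)) >> n; the bias is 1
     * for n == 1 and 2 for n == 2, i.e. equal to n in both cases. */
    static int32_t round_shift(int32_t v, int n) {
      return (v + ((1 << n) >> 1)) >> n;
    }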
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
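The threshold fed to highbd_update_mask0 combines the zero-bin with a dequant-scaled margin; coefficients below it are candidates for the end-of-block trim performed after the loop. A scalar sketch of the test (function name ours; EOB_FACTOR and AOM_QM_BITS as used in this file):

    #include <stdint.h>

    /* Sketch of the prescan test: abs(coeff) << AOM_QM_BITS is compared
     * against zbin * wt + ROUND_POWER_OF_TWO(dequant * EOB_FACTOR, 7) - 1,
     * matching the thresh[] setup and the shift in highbd_update_mask0. */
    static int passes_prescan(int32_t abs_coeff, int32_t zbin, int32_t dequant,
                              int eob_factor, int qm_bits) {
      const int32_t prescan_add = (dequant * eob_factor + 64) >> 7;
      const int64_t thresh = (int64_t)zbin * (1 << qm_bits) + prescan_add - 1;
      return ((int64_t)abs_coeff << qm_bits) > thresh;
    }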
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_64x64_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 2; + int 
non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i log_scale_vec = _mm_set1_epi32(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + + // Shift with rounding. + zbin = _mm_add_epi32(zbin, log_scale_vec); + round = _mm_add_epi32(round, log_scale_vec); + zbin = _mm_srli_epi32(zbin, log_scale); + round = _mm_srli_epi32(round, log_scale); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 7 AC.
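This 64x64 kernel differs from the 32x32 one above only in log_scale (2 instead of 1). Per coefficient, highbd_calculate_dqcoeff then reduces to the scalar sketch below: the scaled product is formed on the magnitude and the sign re-applied, mirroring the invert_sign_32_sse2 usage (illustrative name):

    #include <stdint.h>

    /* Scaled dequantization on the magnitude, sign restored afterwards. */
    static int32_t dequant_coeff_scalar(int32_t qcoeff, int32_t dequant,
                                        int log_scale) {
      const int32_t a = qcoeff < 0 ? -qcoeff : qcoeff;
      const int64_t dq = ((int64_t)a * dequant) >> log_scale;
      return qcoeff < 0 ? (int32_t)-dq : (int32_t)dq;
    }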
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
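How the masks turn into counts: highbd_update_mask1 keeps, per lane, the largest iscan (scan position) of any passing coefficient, and calculate_non_zero_count (declared in quantize_x86.h, not shown in this patch) reduces that to "highest passing scan position + 1". A hedged scalar model of that reduction, with passes[] standing in for the per-coefficient compare results:

    #include <stdint.h>

    /* Scalar model of the mask reduction; illustrative only. */
    static int non_zero_count_scalar(const int16_t *iscan, const int *passes,
                                     int n) {
      int max_pos = -1;
      for (int i = 0; i < n; ++i)
        if (passes[i] && iscan[i] > max_pos) max_pos = iscan[i];
      return max_pos + 1;
    }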
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c b/libs/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c new file mode 100644 index 000000000..b43a7d7b5 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c @@ -0,0 +1,1323 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <assert.h> +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" + +// ----------------------------------------------------------------------------- +// Copy and average + +static const uint8_t ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; +static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int width, int h, int bd) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + assert(width % 4 == 0); + if (width > 32) { // width = 64 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + _mm256_storeu_si256((__m256i *)(dst + 32), p2); + _mm256_storeu_si256((__m256i *)(dst + 48), p3); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 16) { // width = 32 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 8) { // width = 16 + __m256i p0, p1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + p1 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + + _mm256_storeu_si256((__m256i *)dst, p0); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (width > 4) { // width = 8 + __m128i p0, p1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + p1 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + + _mm_storeu_si128((__m128i *)dst, p0); + dst += dst_stride; + _mm_storeu_si128((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // width = 4 + __m128i p0, p1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + p1 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + + _mm_storel_epi64((__m128i *)dst, p0); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } +} + +void
av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + __m256i s[8], coeffs_y[4]; + + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m256i src6; + __m256i s01 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + __m256i s12 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + __m256i s23 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + __m256i s34 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + __m256i s45 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + __m256i s56 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi16(s01, s12); + s[1] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpacklo_epi16(s45, s56); + + s[4] = _mm256_unpackhi_epi16(s01, s12); + s[5] = _mm256_unpackhi_epi16(s23, s34); + s[6] = _mm256_unpackhi_epi16(s45, s56); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + const __m256i s67 = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + + const __m256i s78 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi16(s67, s78); + s[7] = _mm256_unpackhi_epi16(s67, s78); + + const __m256i res_a = convolve(s, coeffs_y); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i 
res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_16bit, 1)); + } else if (w == 4) { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } else { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + (void)subpel_y_qn; + (void)filter_params_y; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[4], coeffs_x[4]; + + const __m256i round_const_x = + _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits), + round_shift_bits); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + if (w - j > 4) { + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } else if (w == 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } else { + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } + } + } +} + +#define CONV8_ROUNDING_BITS (7) + +// ----------------------------------------------------------------------------- +// Horizontal and vertical filtering + +static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + +static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15, + 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15 }; + +static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; + +// ----------------------------------------------------------------------------- +// Horizontal Filtering + +static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); + const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); + const __m256i c = 
_mm256_permutevar8x32_epi32(*s, idx); + + p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 + p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 + p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 + p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 +} + +// Note: +// Shared by 8x2 and 16x1 block +static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, + __m256i *x /*x[8]*/) { + __m256i pp[8]; + pack_pixels(s0, pp); + pack_pixels(s1, &pp[4]); + x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); + x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); + x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); + x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); + x[4] = x[2]; + x[5] = x[3]; + x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); + x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); +} + +static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { + __m256i pp[8]; + __m256i s0; + s0 = _mm256_loadu_si256((const __m256i *)src); + pack_pixels(&s0, pp); + x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); + x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); + x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); + x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); +} + +static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, + __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); + pack_16_pixels(&s0, &s1, x); +} + +static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_pixels(&s0, &s1, x); +} + +// Note: +// Shared by horizontal and vertical filtering +static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p0 = _mm256_set1_epi32(0x03020100); + const __m256i p1 = _mm256_set1_epi32(0x07060504); + const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); + const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); + f[0] = _mm256_shuffle_epi8(hh, p0); + f[1] = _mm256_shuffle_epi8(hh, p1); + f[2] = _mm256_shuffle_epi8(hh, p2); + f[3] = _mm256_shuffle_epi8(hh, p3); +} + +static INLINE void pack_filters_4tap(const int16_t *filter, + __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i coeff = _mm256_broadcastsi128_si256(h); + + // coeffs 2 3 2 3 2 3 2 3 + f[0] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + f[1] = _mm256_shuffle_epi32(coeff, 0xaa); +} + +static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, + const __m256i *fil /*fil[4]*/, + __m256i *y) { + __m256i a, a0, a1; + + a0 = _mm256_madd_epi16(fil[0], sig[0]); + a1 = _mm256_madd_epi16(fil[3], sig[3]); + a = _mm256_add_epi32(a0, a1); + + a0 = _mm256_madd_epi16(fil[1], sig[1]); + a1 = _mm256_madd_epi16(fil[2], sig[2]); + + { + const __m256i min = _mm256_min_epi32(a0, a1); + a = _mm256_add_epi32(a, min); + } + { + const __m256i max = _mm256_max_epi32(a0, a1); + a = _mm256_add_epi32(a, max); + } + { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + a = _mm256_add_epi32(a, rounding); + *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); + } +} + +static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, + uint16_t *dst) { + const __m128i a0 = _mm256_castsi256_si128(*y); + const __m128i 
a1 = _mm256_extractf128_si256(*y, 1); + __m128i res = _mm_packus_epi32(a0, a1); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, res); +} + +static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static void aom_highbd_filter_block1d8_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void aom_highbd_filter_block1d16_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void aom_highbd_filter_block1d4_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i ff[2], s[2]; + uint32_t i; + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + + __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask); + __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3); + __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5); + + pack_filters_4tap(filter, ff); + src_ptr -= 3; + for (i = 0; i <= (height - 2); i += 2) { + __m256i row0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2])); + __m256i row1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2])); + + s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1); + s[1] = _mm256_alignr_epi8(s[0], s[0], 4); + + s[0] = _mm256_shuffle_epi8(s[0], mask); + s[1] = _mm256_shuffle_epi8(s[1], mask); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch], + _mm256_extracti128_si256(res, 1)); + } + if (height % 2 != 0) { + i = height - 1; + const __m256i row0_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2])); + const __m256i row0_1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6])); + + const __m256i r0 = + _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1); + + s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3); + s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + } +} + +static void aom_highbd_filter_block1d8_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i ff[2], s[2]; + uint32_t i = 0; + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + static const uint8_t shuffle_mask[32] = { 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15 }; + + __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask); + __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3); + __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5); + + pack_filters_4tap(filter, ff); + src_ptr -= 3; + + /* Horizontal filter */ + + for (i = 0; i <= (height - 2); i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]); + + const __m256i r0 = + _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = r0; + s[1] = _mm256_alignr_epi8(r1, r0, 4); + + __m256i res_even = convolve_4tap(s, ff); + res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding), + CONV8_ROUNDING_BITS); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + + __m256i res_odd = convolve_4tap(s, ff); + res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding), + CONV8_ROUNDING_BITS); + + __m256i res = _mm256_packs_epi32(res_even, res_odd); + res = _mm256_shuffle_epi8(res, mask); + + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res, 1)); + } + + if (height % 2 != 0) { + i = height - 1; + const __m256i row0_0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]); + const __m256i row0_1 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]); + + const __m256i r0 = + _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1); + + s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3); + s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4], + _mm256_extracti128_si256(res, 1)); + } +} + +static void aom_highbd_filter_block1d16_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8, + dst_pitch, height, filter, bd); +} + +// ----------------------------------------------------------------------------- +// 2-tap horizontal filtering + +static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p = _mm256_set1_epi32(0x09080706); + f[0] = _mm256_shuffle_epi8(hh, p); +} + +// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() +// the difference is s0/s1 specifies first and second rows or, +// first 16 samples and 8-sample shifted 16 samples 
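+// In either case the packed result is two registers of interleaved
+// (x[i], x[i+1]) 16-bit pairs, which line up with the duplicated
+// (filter[3], filter[4]) pair produced by pack_2t_filter() so that a
+// single _mm256_madd_epi16 evaluates the whole 2-tap filter.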
+static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, + __m256i *sig) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); + __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); + __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); + __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + r1 = _mm256_shuffle_epi8(r1, sf2); + sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); + sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); +} + +static INLINE void pack_8x2_2t_pixels(const uint16_t *src, + const ptrdiff_t pitch, __m256i *sig) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_16x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_8x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x0 = _mm256_shuffle_epi8(r0, sf2); + r0 = _mm256_permutevar8x32_epi32(r0, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); +} + +// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() +static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + __m256i x1 = _mm256_madd_epi16(sig[1], *f); + x0 = _mm256_add_epi32(x0, rounding); + x1 = _mm256_add_epi32(x1, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + x0 = _mm256_add_epi32(x0, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); +} + +static void aom_highbd_filter_block1d8_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_pixels(&res0, &max, dst_ptr); + } +} + +static void aom_highbd_filter_block1d16_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i 
ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +// ----------------------------------------------------------------------------- +// Vertical Filtering + +static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); + __m256i s1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); + __m256i s2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); + __m256i s3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); + __m256i s4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); + __m256i s5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); + __m256i s6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); + + s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); + s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); + s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); + s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); + s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); + + sig[0] = _mm256_unpacklo_epi16(s0, s1); + sig[4] = _mm256_unpackhi_epi16(s0, s1); + sig[1] = _mm256_unpacklo_epi16(s2, s3); + sig[5] = _mm256_unpackhi_epi16(s2, s3); + sig[2] = _mm256_unpacklo_epi16(s4, s5); + sig[6] = _mm256_unpackhi_epi16(s4, s5); + sig[8] = s6; +} + +static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + __m256i s0 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); + // base + 8th row + __m256i s1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); + __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); + __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + sig[3] = _mm256_unpacklo_epi16(s2, s3); + sig[7] = _mm256_unpackhi_epi16(s2, s3); + sig[8] = s1; +} + +static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_8x1_pixels(sig, f, y0); + filter_8x1_pixels(&sig[4], f, y1); +} + +static INLINE void update_pixels(__m256i *sig) { + int i; + for (i = 0; i < 3; ++i) { + sig[i] = sig[i + 1]; + sig[i + 4] = sig[i + 5]; + } +} + +static void aom_highbd_filter_block1d8_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i u0, u1, u2, u3; + // load 0-6 rows + const __m256i s0 = _mm256_loadu_si256((const 
__m256i *)src); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); + const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); + const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); + const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); + + u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low + u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high + + u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low + u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high + + sig[0] = _mm256_unpacklo_epi16(u0, u2); + sig[4] = _mm256_unpackhi_epi16(u0, u2); + + sig[8] = _mm256_unpacklo_epi16(u1, u3); + sig[12] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s2, s3, 0x20); + u1 = _mm256_permute2x128_si256(s2, s3, 0x31); + + u2 = _mm256_permute2x128_si256(s3, s4, 0x20); + u3 = _mm256_permute2x128_si256(s3, s4, 0x31); + + sig[1] = _mm256_unpacklo_epi16(u0, u2); + sig[5] = _mm256_unpackhi_epi16(u0, u2); + + sig[9] = _mm256_unpacklo_epi16(u1, u3); + sig[13] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s4, s5, 0x20); + u1 = _mm256_permute2x128_si256(s4, s5, 0x31); + + u2 = _mm256_permute2x128_si256(s5, s6, 0x20); + u3 = _mm256_permute2x128_si256(s5, s6, 0x31); + + sig[2] = _mm256_unpacklo_epi16(u0, u2); + sig[6] = _mm256_unpackhi_epi16(u0, u2); + + sig[10] = _mm256_unpacklo_epi16(u1, u3); + sig[14] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s6; +} + +static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); + // base + 8th row + const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); + + __m256i u0, u1, u2, u3; + u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); + u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); + + u2 = _mm256_permute2x128_si256(s7, s8, 0x20); + u3 = _mm256_permute2x128_si256(s7, s8, 0x31); + + sig[3] = _mm256_unpacklo_epi16(u0, u2); + sig[7] = _mm256_unpackhi_epi16(u0, u2); + + sig[11] = _mm256_unpacklo_epi16(u1, u3); + sig[15] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s8; +} + +static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + __m256i res[4]; + int i; + for (i = 0; i < 4; ++i) { + filter_8x1_pixels(&sig[i << 2], f, &res[i]); + } + + { + const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); + const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); + *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); + *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); + } +} + +static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); + p = _mm256_min_epi16(*y1, *mask); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static void update_16x9_pixels(__m256i *sig) { + update_pixels(&sig[0]); + update_pixels(&sig[8]); +} + +static void aom_highbd_filter_block1d16_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + 
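// pack_filters() splats the eight taps into four registers of repeated + // (tap[2i], tap[2i+1]) pairs, so each _mm256_madd_epi16 in the filter + // step folds two taps per 32-bit lane. +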
pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} + +static void aom_highbd_filter_block1d4_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + uint32_t i; + __m256i s[2], ff[2]; + + pack_filters_4tap(filter, ff); + + const uint16_t *data = src_ptr; + /* Vertical filter */ + { + __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch)); + __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch)); + + __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + + __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch)); + + __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + + s[0] = _mm256_unpacklo_epi16(s23, s34); + + for (i = 0; i < height; i += 2) { + data = &src_ptr[i * src_pitch]; + + __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch)); + __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch)); + + __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + + s[1] = _mm256_unpacklo_epi16(s45, s56); + + const __m256i res_a = convolve_4tap(s, ff); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel); + res_16bit = _mm256_max_epi32(res_16bit, zero); + res_16bit = _mm256_packs_epi32(res_16bit, res_16bit); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res_16bit)); + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res_16bit, 1)); + + s[0] = s[1]; + s4 = s6; + } + } +} + +static void aom_highbd_filter_block1d8_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + __m256i s[4], ff[2]; + uint32_t i; + pack_filters_4tap(filter, ff); + + const uint16_t *data = src_ptr; + /* Vertical filter */ + { + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch)); + + __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch)); + + __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + + s[0] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpackhi_epi16(s23, s34); + + for (i = 0; i < height; i += 2) { + data = &src_ptr[i * src_pitch]; + + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch)); + + __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + + s[1] = _mm256_unpacklo_epi16(s45, s56); + s[3] = _mm256_unpackhi_epi16(s45, s56); + + const __m256i res_a = convolve_4tap(s, ff); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + const __m256i res_b = convolve_4tap(s + 2, ff); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res_16bit, 1)); + + s[0] = s[1]; + s[2] = s[3]; + s4 = s6; + } + } +} + +static void aom_highbd_filter_block1d16_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + + aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8, + dst_pitch, height, filter, bd); +} + +// ----------------------------------------------------------------------------- +// 2-tap vertical filtering + +static void pack_16x2_init(const uint16_t *src, __m256i *sig) { + sig[2] = _mm256_loadu_si256((const __m256i *)src); +} + +static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // load the next row + const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); + sig[0] = _mm256_unpacklo_epi16(sig[2], u); + sig[1] = _mm256_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static void aom_highbd_filter_block1d16_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + 
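+// The 128-bit helpers below handle the same 2-tap vertical filtering for
+// 8-wide columns: sig[2] caches the most recently loaded row, so each
+// iteration loads one new row and interleaves it with the cached one.
+// As a rough scalar sketch of what aom_highbd_filter_block1d8_v2_avx2
+// computes (illustration only; f3/f4 are the two middle taps selected by
+// pack_8x1_2t_filter):
+//   for (r = 0; r < height; ++r)
+//     for (c = 0; c < 8; ++c) {
+//       int32_t v = filter[3] * src[r * src_pitch + c] +
+//                   filter[4] * src[(r + 1) * src_pitch + c];
+//       v = (v + (1 << (CONV8_ROUNDING_BITS - 1))) >> CONV8_ROUNDING_BITS;
+//       dst[r * dst_pitch + c] =
+//           (uint16_t)AOMMIN(AOMMAX(v, 0), (1 << bd) - 1);
+//     }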
+static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m128i p = _mm_set1_epi32(0x09080706); + f[0] = _mm_shuffle_epi8(h, p); +} + +static void pack_8x2_init(const uint16_t *src, __m128i *sig) { + sig[2] = _mm_loadu_si128((const __m128i *)src); +} + +static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, + __m128i *sig) { + // load the next row + const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); + sig[0] = _mm_unpacklo_epi16(sig[2], u); + sig[1] = _mm_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, + __m128i *y0, __m128i *y1) { + const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m128i x0 = _mm_madd_epi16(sig[0], *f); + __m128i x1 = _mm_madd_epi16(sig[1], *f); + x0 = _mm_add_epi32(x0, rounding); + x1 = _mm_add_epi32(x1, rounding); + *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + res = _mm_min_epi16(res, *mask); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void aom_highbd_filter_block1d8_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2 +#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2 +#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2 +#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2 + +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +#undef HIGHBD_FUNC diff --git a/libs/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c b/libs/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c new file mode 100644 index 000000000..a2bb28322 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <emmintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" + +// ----------------------------------------------------------------------------- + +void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg34_lo; + __m128i srcReg45_lo, srcReg56_lo; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_45_lo, resReg34_56_lo; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg64, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = dst_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); + + for (i = height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); + + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); + resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); + resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); + resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); + + resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); + + // shift by 7 bit each 32 bit + resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); + resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); + resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); + resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); + + // shrink to 16 bit each 32 bits, the first lane contains the first + // convolve result and the second lane contains the second convolve + // result + resReg23_45 = _mm_packs_epi32(resReg23_45_lo, _mm_setzero_si128()); + resReg34_56 = _mm_packs_epi32(resReg34_56_lo, _mm_setzero_si128()); + + resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); + resReg23_45 = _mm_min_epi16(resReg23_45, max); + resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); + resReg34_56 = 
_mm_min_epi16(resReg34_56, max); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)dst_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); + + dst_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg34_lo = srcReg56_lo; + srcReg4 = srcReg6; + } +} + +void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i addFilterReg64; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); + + __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); + __m128i ss_23 = _mm_unpacklo_epi32(srcReg32b1, ss_3_1); + __m128i ss_45 = _mm_unpacklo_epi32(ss_4_1, ss_5_1); + + ss_23 = _mm_madd_epi16(ss_23, secondFilters); + ss_45 = _mm_madd_epi16(ss_45, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(ss_23, ss_45); + + // shift by 7 bit each 32 bit + srcRegFilt32b1_1 = _mm_add_epi32(srcRegFilt32b1_1, addFilterReg64); + srcRegFilt32b1_1 = _mm_srai_epi32(srcRegFilt32b1_1, 7); + + srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); + + src_ptr += src_pitch; + + _mm_storel_epi64((__m128i *)dst_ptr, srcRegFilt32b1_1); + + dst_ptr += dst_pitch; + } +} + +void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg64, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = 
dst_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4); + + for (i = height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); + resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); + resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); + resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); + + resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters); + resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters); + resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters); + resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters); + + resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi); + + // shift by 7 bit each 32 bit + resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); + resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); + resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64); + resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64); + resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); + resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); + resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7); + resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7); + + // shrink to 16 bit each 32 bits, the first lane contains the first + // convolve result and the second lane contains the second convolve + // result + resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi); + + resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); + resReg23_45 = _mm_min_epi16(resReg23_45, max); + resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); + resReg34_56 = _mm_min_epi16(resReg34_56, max); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)dst_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); + + dst_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg23_hi = srcReg45_hi; + srcReg34_lo = srcReg56_lo; + srcReg34_hi = srcReg56_hi; + srcReg4 = srcReg6; + } +} + +void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i addFilterReg64; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; + __m128i srcReg32b1, srcReg32b2; + unsigned int i; + src_ptr -= 3; + addFilterReg64 = _mm_set1_epi32(64); + 
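// 64 == 1 << (FILTER_BITS - 1): rounding bias for the 7-bit arithmetic + // shift applied after the madd accumulation below. +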
filtersReg = _mm_loadu_si128((const __m128i *)filter); + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); + srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6)); + + __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4); + __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2); + + __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); + __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2); + __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6); + __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2); + __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2); + + d1 = _mm_madd_epi16(ss_3, secondFilters); + d2 = _mm_madd_epi16(ss_5, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + + // shift by 7 bit each 32 bit + res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64); + res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64); + res_lo_1 = _mm_srai_epi32(res_lo_1, 7); + res_hi_1 = _mm_srai_epi32(res_hi_1, 7); + + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1); + + srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); + + src_ptr += src_pitch; + + _mm_store_si128((__m128i *)dst_ptr, srcRegFilt32b1_1); + + dst_ptr += dst_pitch; + } +} + +void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), + dst_pitch, height, filter, bd); +} + +void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), + dst_pitch, height, filter, bd); +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c b/libs/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c new file mode 100644 index 000000000..a79350f5a --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + __m128i s[16], coeffs_y[4]; + + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, round_const_bits), round_shift_bits); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_bits), round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, 
clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + *((uint32_t *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } +} + +void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + (void)subpel_y_qn; + (void)filter_params_y; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m128i s[4], coeffs_x[4]; + + const __m128i round_const_x = + _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const int bits = FILTER_BITS - conv_params->round_0; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); + + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits), + round_shift_bits); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + res = _mm_min_epi16(res, clip_pixel); + res = _mm_max_epi16(res, zero); + + if (w - j > 4) { + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } else if (w == 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res); + } else { + *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res); + } + } + } + } +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm b/libs/libaom/src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm new file mode 100644 index 000000000..91b3d126c --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm @@ -0,0 +1,259 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 4 dd 16 +pw_32: times 4 dd 32 + +SECTION .text +INIT_XMM sse2 +cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + movq m2, [leftq] + paddw m0, m2 + pshuflw m1, m0, 0xe + paddw m0, m1 + pshuflw m1, m0, 0x1 + paddw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, one + mov oned, 0x00010001 + lea stride3q, [strideq*3] + movd m3, oned + pshufd m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_8)] + psrlw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m3, [aboveq+16] + mova m2, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_16)] + psrad m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + paddw m0, m2 + paddw m3, m4 + mova m2, [leftq] + mova m4, [leftq+16] + mova m5, [leftq+32] + mova m6, [leftq+48] + paddw m2, m4 + paddw m5, m6 + paddw m0, m3 + paddw m2, m5 + pxor m1, m1 + paddw m0, m2 + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_32)] + psrad m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16 ], m0 + mova [dstq +32 ], m0 + mova [dstq +48 ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16 ], m0 + mova [dstq+strideq*2+32 ], m0 + mova [dstq+strideq*2+48 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4+16 ], m0 + mova [dstq+strideq*4+32 ], m0 + mova [dstq+strideq*4+48 ], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m0 + mova [dstq+stride3q*2 +32], m0 + mova [dstq+stride3q*2 +48], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal 
highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m1 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + mova m2, [aboveq+32] + mova m3, [aboveq+48] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq +32], m2 + mova [dstq +48], m3 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*2 +32], m2 + mova [dstq+strideq*2 +48], m3 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+strideq*4 +32], m2 + mova [dstq+strideq*4 +48], m3 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m1 + mova [dstq+stride3q*2 +32], m2 + mova [dstq+stride3q*2 +48], m3 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET diff --git a/libs/libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c b/libs/libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c new file mode 100644 index 000000000..5a55736c4 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_intrapred_sse2.c @@ -0,0 +1,984 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); + dst += stride << 2; + left += 4; + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); +} + +void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); +} + +void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); + dst += stride << 3; + left += 8; + aom_highbd_h_predictor_8x8_sse2(dst, 
stride, above, left, bd); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + h_predictor_16x8(dst, stride, left); +} + +void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = 
_mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP, DC_LEFT, DC_128 + +// 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 4x8 + +static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +// Shared with DC 8xh +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + 
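/* Illustrative note, not in the upstream file: DC_LEFT for a 4x8 block averages only the eight left neighbors, so dc = (sum + 4) >> 3. For example, eight 10-bit pixels of value 700 sum to 5600, and (5600 + 4) >> 3 = 700. */ +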
const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sum = dc_sum_8(left); + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x8(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 8xh + +static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, const uint16_t *above) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + dc_store_8xh(dst, stride, height, &dc); +} + +void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 4, above); +} + +void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 8, above); +} + +void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 16, above); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 4, &dc); +} + +void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 8, &dc); +} + +// Shared with DC 16xh +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) 
{ + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 16, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + dc_store_8xh(dst, stride, height, &dc_dup); +} + +void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 4, bd); +} + +void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 8, bd); +} + +void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 16, bd); +} + +// ----------------------------------------------------------------------------- +// 16xh + +static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +// Shared with 32xh +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = 
dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 8, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 32xh + +static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + 
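/* Illustrative note, not in the upstream file: dc_sum_32() returns a 32-bit sum because 32 samples at 12-bit depth can reach 32 * 4095 = 131040, which overflows a 16-bit lane; (sum + 16) >> 5 is then the usual round-half-up average. */ +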
(void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above); + int i; + for (i = 0; i < 2; ++i) { + _mm_storel_epi64((__m128i *)dst, above_u16); + _mm_storel_epi64((__m128i *)(dst + stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); +} + +void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const 
uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24)); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_above = dc_sum_4(above); + const __m128i sum_left = dc_sum_8(left); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_left = dc_sum_4(left); + const __m128i sum_above = dc_sum_8(above); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); +} + +void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + 
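/* Illustrative note, not in the upstream file: for rectangular blocks the DC divisor w + h is not a power of two (8 + 16 = 24 here), so the average is computed in scalar math as dc = (sum + 12) / 24; a combined neighbor sum of 4000 gives (4000 + 12) / 24 = 167. */ +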
const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_8(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_32(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + 
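/* Illustrative note, not in the upstream file: a 32-wide row of uint16_t spans 64 bytes, so each row is written with four aligned 16-byte stores at element offsets 0, 8, 16 and 24, and the loop unrolls four rows per iteration. */ +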
_mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + } +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_loopfilter_avx2.c b/libs/libaom/src/aom_dsp/x86/highbd_loopfilter_avx2.c new file mode 100644 index 000000000..c954da94e --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_loopfilter_avx2.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/common_avx2.h" +#include "aom_dsp/x86/lpf_common_sse2.h" +#include "aom/aom_integer.h" + +void aom_highbd_lpf_horizontal_14_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0, + blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_14_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c 
b/libs/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c new file mode 100644 index 000000000..ea7dc6a9e --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -0,0 +1,1698 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/lpf_common_sse2.h" + +static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max, + __m128i *pixel) { + *pixel = _mm_min_epi16(*pixel, *max); + *pixel = _mm_max_epi16(*pixel, *min); +} + +static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); +} + +static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, + const uint8_t *t, int bd, __m128i *blt, + __m128i *lt, __m128i *thr, __m128i *t80_out) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); + *blt = _mm_slli_epi16(x, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); + *lt = _mm_slli_epi16(x, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); + *thr = _mm_slli_epi16(x, shift); + + *t80_out = _mm_set1_epi16(1 << (bd - 1)); +} + +static INLINE void get_limit_dual( + const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, + const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, + int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out, + __m128i *t80_out) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); + + __m128i x0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero); + __m128i x1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *blt_out = _mm_slli_epi16(x0, shift); + + x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero); + x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *lt_out = _mm_slli_epi16(x0, shift); + + x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero); + x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *thr_out = _mm_slli_epi16(x0, shift); + + *t80_out = _mm_set1_epi16(1 << (bd - 1)); +} + +static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, + __m128i *p, __m128i *q) { + int i; + for (i = 0; i < size; i++) { + p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch)); + } +} + +static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q, + const __m128i *l, const __m128i *bl, + __m128i *mask) { + __m128i abs_p0q0 = abs_diff16(p[0], q[0]); + __m128i abs_p1q1 = abs_diff16(p[1], q[1]); + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff 
= _mm_set1_epi16((short)0xFFFF); + + __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + int i; + for (i = 1; i < 4; ++i) { + max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1])); + max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1])); + } + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // return ~mask +} + +static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x, + __m128i *p1p0, __m128i *q1q0, + __m128i *abs_p1p0, __m128i *l, + __m128i *bl, __m128i *t, + __m128i *hev, __m128i *mask) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_set1_epi16((short)0xFFFF); + __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0; + __m128i max, max01, h; + + *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]); + *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]); + + abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0); + abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2 + + max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + *abs_p1p0 = abs_diff16(pq[0], pq[1]); + abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8); + max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0); + // mask |= (abs(*p1 - *p0) > limit) * -1; + // mask |= (abs(*q1 - *q0) > limit) * -1; + h = _mm_subs_epu16(max01, *t); + + *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); + // replicate for the further "merged variables" usage + *hev = _mm_unpacklo_epi64(*hev, *hev); + + max = _mm_max_epi16(max, max01); + int i; + for (i = 2; i < x; ++i) { + max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1])); + } + max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); + + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // ~mask +} + +static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq, + int start, int end, __m128i *flat) { + int i; + __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]), + abs_diff16(pq[start + 1], pq[0])); + + for (i = start + 2; i < end; ++i) { + max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0])); + } + max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); + + __m128i ft; + ft = _mm_subs_epu16(max, *th); + + const __m128i zero = _mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} + +static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p, + const __m128i *q, int start, int end, + __m128i *flat) { + int i; + __m128i max = + _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0])); + + for (i = start + 1; i < end; ++i) { + max = _mm_max_epi16(max, abs_diff16(p[i], p[0])); + max = _mm_max_epi16(max, abs_diff16(q[i], q[0])); + } + + __m128i ft; + ft = _mm_subs_epu16(max, *th); + + const __m128i zero = _mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} + +static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat, + __m128i *flat2, int bd) { + // check the distance 1,2,3 against 0 + __m128i th = _mm_set1_epi16(1); + th = _mm_slli_epi16(th, bd - 8); + flat_mask_internal(&th, pq, 1, 4, flat); + flat_mask_internal(&th, pq, 4, 7, flat2); +} + +static INLINE void 
highbd_flat_mask4_dual_sse2(const __m128i *p, + const __m128i *q, __m128i *flat, + __m128i *flat2, int bd) { + // check the distance 1,2,3 against 0 + __m128i th = _mm_set1_epi16(1); + th = _mm_slli_epi16(th, bd - 8); + flat_mask_internal_dual(&th, p, q, 1, 4, flat); + flat_mask_internal_dual(&th, p, q, 4, 7, flat2); +} + +static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, + __m128i *ps1ps0, __m128i *t80, + int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); + const __m128i pmin = _mm_subs_epi16(zero, *t80); + + const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4); + __m128i ps1ps0_work, qs1qs0_work, work; + __m128i filt, filter2filter1, filter2filt, filter1filt; + + ps1ps0_work = _mm_subs_epi16(*p1p0, *t80); + qs1qs0_work = _mm_subs_epi16(*q1q0, *t80); + + work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work); + pixel_clamp(&pmin, &pmax, &work); + filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev); + + filt = _mm_subs_epi16(filt, work); + filt = _mm_subs_epi16(filt, work); + filt = _mm_subs_epi16(filt, work); + // (aom_filter + 3 * (qs0 - ps0)) & mask + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm_and_si128(filt, *mask); + filt = _mm_unpacklo_epi64(filt, filt); + + filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */ + pixel_clamp(&pmin, &pmax, &filter2filter1); + filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */ + + filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1); + + // filt >> 1 + filt = _mm_adds_epi16(filt, one); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(*hev, filt); + + filter2filt = _mm_unpackhi_epi64(filter2filter1, filt); + filter1filt = _mm_unpacklo_epi64(filter2filter1, filt); + + qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt); + ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt); + + pixel_clamp(&pmin, &pmax, &qs1qs0_work); + pixel_clamp(&pmin, &pmax, &ps1ps0_work); + + *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80); + *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80); +} + +static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps, + __m128i *qs, const __m128i *mask, + const __m128i *th, int bd, + __m128i *t80) { + __m128i ps0 = _mm_subs_epi16(p[0], *t80); + __m128i ps1 = _mm_subs_epi16(p[1], *t80); + __m128i qs0 = _mm_subs_epi16(q[0], *t80); + __m128i qs1 = _mm_subs_epi16(q[1], *t80); + const __m128i one = _mm_set1_epi16(1); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); + + const __m128i zero = _mm_setzero_si128(); + const __m128i pmin = _mm_subs_epi16(zero, *t80); + __m128i filter = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filter); + + // hev_filter + __m128i hev; + const __m128i abs_p1p0 = abs_diff16(p[1], p[0]); + const __m128i abs_q1q0 = abs_diff16(q[1], q[0]); + __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); + h = _mm_subs_epu16(h, *th); + const __m128i ffff = _mm_cmpeq_epi16(h, h); + hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); + + filter = _mm_and_si128(filter, hev); + + const __m128i x = _mm_subs_epi16(qs0, ps0); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + pixel_clamp(&pmin, &pmax, &filter); + filter = _mm_and_si128(filter, *mask); + const __m128i t3 = _mm_set1_epi16(3); + const __m128i t4 = _mm_set1_epi16(4); + __m128i filter1 = 
_mm_adds_epi16(filter, t4); + __m128i filter2 = _mm_adds_epi16(filter, t3); + pixel_clamp(&pmin, &pmax, &filter1); + pixel_clamp(&pmin, &pmax, &filter2); + filter1 = _mm_srai_epi16(filter1, 3); + filter2 = _mm_srai_epi16(filter2, 3); + qs0 = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &qs0); + ps0 = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &ps0); + qs[0] = _mm_adds_epi16(qs0, *t80); + ps[0] = _mm_adds_epi16(ps0, *t80); + filter = _mm_adds_epi16(filter1, one); + filter = _mm_srai_epi16(filter, 1); + filter = _mm_andnot_si128(hev, filter); + qs1 = _mm_subs_epi16(qs1, filter); + pixel_clamp(&pmin, &pmax, &qs1); + ps1 = _mm_adds_epi16(ps1, filter); + pixel_clamp(&pmin, &pmax, &ps1); + qs[1] = _mm_adds_epi16(qs1, *t80); + ps[1] = _mm_adds_epi16(ps1, *t80); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( + __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt, + const unsigned char *lt, const unsigned char *thr, int bd) { + int i; + const __m128i zero = _mm_setzero_si128(); + __m128i blimit, limit, thresh; + __m128i t80; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80); + + for (i = 0; i < 7; i++) { + pq[i] = _mm_unpacklo_epi64(p[i], q[i]); + } + __m128i mask, hevhev; + __m128i p1p0, q1q0, abs_p1p0; + + highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hevhev, &mask); + + __m128i ps0ps1, qs0qs1; + // filter4 + highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd); + + __m128i flat, flat2; + highbd_flat_mask4_sse2(pq, &flat, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); + + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + flat2 = _mm_unpacklo_epi64(flat2, flat2); + + // flat and wide flat calculations + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3], flat_pq[3]; + __m128i flat2_p[6], flat2_q[6]; + __m128i flat2_pq[6]; + __m128i sum_p6, sum_p3; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + + __m128i work0, work0_0, work0_1, sum_p_0; + __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3])); + __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0])); + flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])); + + sum_p6 = _mm_add_epi16(pq[6], pq[6]); + sum_p3 = _mm_add_epi16(pq[3], pq[3]); + + sum_q = _mm_sub_epi16(sum_p_0, pq[5]); + sum_p = _mm_sub_epi16(sum_p_0, q[5]); + + work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); + work0_1 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); + + sum_lq = _mm_sub_epi16(sum_lp, pq[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + + work0 = _mm_add_epi16(sum_p3, pq[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + + sum_lp = 
_mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, pq[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq[3]); + work0 = _mm_add_epi16(sum_p3, pq[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0])); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, pq[3]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, pq[2]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, pq[1]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + } // flat2 + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // highbd_filter8 + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + + for (i = 0; i < 3; i++) { + pq[i] = _mm_andnot_si128(flat, pq[i]); + flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat_pq[i]); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + if (flat2_mask) { + for (i = 0; i < 6; i++) { + pq[i] = _mm_andnot_si128(flat2, pq[i]); + flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + } + } + } else { + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + } +} + +void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + __m128i p[7], q[7], pq[7]; + int i; + + for (i = 0; i < 7; i++) { + p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch)); 
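+ /* Illustrative note, not in the upstream file: p[i] holds the four pixels (i + 1) rows above the filtered edge and q[i] the four pixels i rows below it, each in the low 64 bits, so highbd_lpf_internal_14_sse2() can pack pq[i] = p[i] | q[i] into a single register. */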
+ } + + highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); + + for (i = 0; i < 6; i++) { + _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]); + _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8)); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( + __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0, + const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1, + const uint8_t *thr1, int bd) { + __m128i blimit, limit, thresh, t80; + const __m128i zero = _mm_setzero_si128(); + + get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh, + &t80); + __m128i mask; + highbd_filter_mask_dual(p, q, &limit, &blimit, &mask); + __m128i flat, flat2; + highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); + __m128i ps[2], qs[2]; + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80); + // flat and wide flat calculations + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); + __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3])); + __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); + sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp); + __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); + sum_q = _mm_add_epi16(sum_q, sum_lq); + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + flat_p[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); + flat_q[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3); + __m128i sum_p6 = _mm_add_epi16(p[6], p[6]); + __m128i sum_q6 = _mm_add_epi16(q[6], q[6]); + __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); + __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); + + sum_q = _mm_sub_epi16(sum_p_0, p[5]); + __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]); + + sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + flat_p[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); + flat_q[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); + + sum_lp = _mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, p[1]); + sum_p3 = _mm_add_epi16(sum_p3, p[3]); + sum_q3 = _mm_add_epi16(sum_q3, q[3]); + flat_p[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); + flat_q[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); + + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), + _mm_add_epi16(p[1], q[0]))), + 4); + flat2_q[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), + _mm_add_epi16(p[0], q[1]))), + 4); + + flat2_p[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), + 4); + flat2_q[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, 
p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, p[4]); + flat2_p[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), + 4); + flat2_q[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, p[3]); + flat2_p[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), + 4); + flat2_q[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, p[2]); + flat2_p[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), + 4); + flat2_q[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, p[1]); + flat2_p[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), + 4); + flat2_q[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), + 4); + } + // highbd_filter8 + int i; + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + p[2] = _mm_andnot_si128(flat, p[2]); + // p2 remains unchanged if !(flat && mask) + flat_p[2] = _mm_and_si128(flat, flat_p[2]); + // when (flat && mask) + p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm_andnot_si128(flat, q[2]); + flat_q[2] = _mm_and_si128(flat, flat_q[2]); + q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values + + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + // highbd_filter16 + if (flat2_mask) { + for (i = 0; i < 6; i++) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + } + } + } else { + p[0] = ps[0]; + q[0] = qs[0]; + p[1] = ps[1]; + q[1] = qs[1]; + } +} + +void aom_highbd_lpf_horizontal_14_dual_sse2( + uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p[7], q[7]; + int i; + load_highbd_pixel(s, 7, pitch, p, q); + + 
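/* Illustrative note, not in the upstream file: the dual path filters two adjacent 4-pixel edges at once; get_limit_dual() packs the first edge's thresholds into the low 64 bits and the second edge's into the high 64 bits, so each of the eight lanes sees its own blimit/limit/thresh. */ +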
highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1, + _limit1, _thresh1, bd); + + for (i = 0; i < 6; i++) { + _mm_storeu_si128((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( + __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, + __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit, + const uint8_t *_limit, const uint8_t *_thresh, int bd) { + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i pq[3]; + __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0; + __m128i flat_p1p0, flat_q0q1; + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + pq[2] = _mm_unpacklo_epi64(*p2, *q2); + + const __m128i zero = _mm_setzero_si128(); + const __m128i four = _mm_set1_epi16(4); + __m128i t80; + const __m128i one = _mm_set1_epi16(0x1); + + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + + // flat_mask + flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0); + flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; + + // op1 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + flat_p1p0 = _mm_srli_epi16(workp_b, 3); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]), + pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]), + pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(*q2, *q2); + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + flat_q0q1 = _mm_srli_epi16(workp_a, 3); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( + __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, + __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0, + const unsigned char *_thresh0, const unsigned 
char *_blimit1, + const unsigned char *_limit1, const unsigned char *_thresh1, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i blimit0, limit0, thresh0; + __m128i t80; + __m128i mask, flat, work; + __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1; + __m128i op1, op0, oq0, oq1; + const __m128i four = _mm_set1_epi16(4); + const __m128i one = _mm_set1_epi16(0x1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + abs_p2p1 = abs_diff16(*p2, *p1); + abs_p1p0 = abs_diff16(*p1, *p0); + abs_q1q0 = abs_diff16(*q1, *q0); + abs_q2q1 = abs_diff16(*q2, *q1); + + abs_p0q0 = abs_diff16(*p0, *q0); + abs_p1q1 = abs_diff16(*p1, *q1); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + + mask = _mm_max_epi16(abs_q2q1, mask); + work = _mm_max_epi16(abs_p1p0, abs_q1q0); + mask = _mm_max_epi16(work, mask); + mask = _mm_max_epi16(mask, abs_p2p1); + mask = _mm_subs_epu16(mask, limit0); + mask = _mm_cmpeq_epi16(mask, zero); + + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + + // flat_mask + flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0)); + flat = _mm_max_epi16(flat, work); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + + // op1 + workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0), + _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2 + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), + *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0); + workp_shft0 = _mm_add_epi16( + workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4 + op1 = _mm_srli_epi16(workp_shft0, 3); + + // op0 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1 + workp_a = + _mm_add_epi16(workp_a, + workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4 + op0 = _mm_srli_epi16(workp_a, 3); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2), + *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4 + workp_b = _mm_add_epi16(*q1, *q2); + workp_shft0 = _mm_add_epi16( + workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4 + oq0 = _mm_srli_epi16(workp_shft0, 3); + + // oq1 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1), + *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_b = _mm_add_epi16(*q2, *q2); + workp_shft1 = _mm_add_epi16( + workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 + oq1 = _mm_srli_epi16(workp_shft1, 3); + + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, 
op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; + } +} + +void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out; + + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + + highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out, + _blimit, _limit, _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8)); +} + +void aom_highbd_lpf_horizontal_6_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p2, p1, p0, q0, q1, q2; + + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + + highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, + _limit0, _thresh0, _blimit1, _limit1, + _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i pq[4]; + __m128i p1p0, q1q0, ps1ps0, qs1qs0; + __m128i work_a, opq2, flat_p1p0, flat_q0q1; + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + pq[2] = _mm_unpacklo_epi64(*p2, *q2); + pq[3] = _mm_unpacklo_epi64(*p3, *q3); + + __m128i abs_p1p0; + + const __m128i four = _mm_set1_epi16(4); + __m128i t80; + const __m128i one = _mm_set1_epi16(0x1); + + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + + // flat_mask4 + flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0])); + flat = _mm_max_epi16(abs_p1p0, flat); + flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { 
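
The movemask test just above is an any-lane guard: _mm_cmpeq_epi16(flat, zero)
is all ones only in lanes where flat is zero, so the result equals 0xffff
exactly when no column passed the flatness test and the wider filter can be
skipped. In scalar terms (illustrative sketch; flat_lane is a hypothetical
per-column view of the flat register):

    int all_zero = 1;
    for (int i = 0; i < 8; ++i) all_zero &= (flat_lane[i] == 0);
    if (!all_zero) { /* at least one column is flat: run the wide filter */ }
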
+ __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1; + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + // o*p2 + workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); + workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + workp_c = _mm_add_epi16(workp_a, workp_c); + + // o*p1 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); + workp_shft0 = _mm_add_epi16(workp_a, workp_b); + + // o*p0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); + workp_shft0 = _mm_add_epi16(workp_a, workp_b); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); + workp_a = _mm_add_epi16(workp_a, workp_b); + opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + + work_a = _mm_andnot_si128(flat, pq[2]); + *p2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_a, *p2); + *q2 = _mm_srli_si128(*p2, 8); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0, + const unsigned char *_limit0, const unsigned char *_thresh0, + const unsigned char *_blimit1, const unsigned char *_limit1, + const unsigned char *_thresh1, int bd) { + __m128i blimit0, limit0, thresh0; + __m128i t80; + __m128i mask, flat; + __m128i work_a, op2, oq2, op1, op0, oq0, oq1; + __m128i abs_p1q1, abs_p0q0, work0, work1, work2; + + const __m128i zero = _mm_setzero_si128(); + const __m128i four = _mm_set1_epi16(4); + const __m128i one = _mm_set1_epi16(0x1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + abs_p0q0 = abs_diff16(*p0, *q0); + abs_p1q1 = abs_diff16(*p1, *q1); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1; + + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + + work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1)); + work1 = + _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0)); // tbu 4 flat + work0 = _mm_max_epi16(work0, work1); + work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3)); + work2 = _mm_max_epi16(work2, work0); + mask = _mm_max_epi16(work2, mask); + + mask = _mm_subs_epu16(mask, 
limit0); + mask = _mm_cmpeq_epi16(mask, zero); + + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + + flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0)); + flat = _mm_max_epi16(work1, flat); + work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0)); + flat = _mm_max_epi16(work0, flat); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b; + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + // o*p2 + workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p1 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); + op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); + op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); + oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); + oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); + oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + + work_a = _mm_andnot_si128(flat, *q2); + *q2 = _mm_and_si128(flat, oq2); + *q2 = _mm_or_si128(work_a, *q2); + + work_a = _mm_andnot_si128(flat, *p2); + *p2 = _mm_and_si128(flat, op2); + *p2 = _mm_or_si128(work_a, *p2); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; + } +} + +void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0; + + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + + highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, + &p1p0, _blimit, _limit, _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), 
_mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); +} + +void aom_highbd_lpf_horizontal_8_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); + + highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, + _blimit0, _limit0, _thresh0, _blimit1, + _limit1, _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out, + __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i blimit, limit, thresh; + __m128i mask, hev; + __m128i p1p0, q1q0; + __m128i pq[2]; + + __m128i abs_p1p0; + + __m128i t80; + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + + highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps, + __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i blimit0, limit0, thresh0; + __m128i mask, flat; + __m128i p[2], q[2]; + + const __m128i zero = _mm_setzero_si128(); + __m128i abs_p0q0 = abs_diff16(*q0, *p0); + __m128i abs_p1q1 = abs_diff16(*q1, *p1); + + __m128i abs_p1p0 = abs_diff16(*p1, *p0); + __m128i abs_q1q0 = abs_diff16(*q1, *q0); + + const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); + const __m128i one = _mm_set1_epi16(1); + + __m128i t80; + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + // filter_mask and hev_mask + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + mask = _mm_max_epi16(flat, mask); + + mask = _mm_subs_epu16(mask, limit0); + mask = _mm_cmpeq_epi16(mask, zero); + + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + 
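
The mask assembled above encodes the AV1 loop-filter gate, evaluated for all
columns at once with saturating vector arithmetic. A scalar model of the
per-column condition (illustrative sketch; blimit and limit stand for the
per-lane thresholds expanded from the _blimit0/_limit0 and _blimit1/_limit1
pairs, and abs() is from <stdlib.h>):

    static int filter_mask(int p1, int p0, int q0, int q1, int blimit,
                           int limit) {
      return abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit &&
             abs(p1 - p0) <= limit && abs(q1 - q0) <= limit;
    }
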
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); +} + +void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p1p0, q1q0; + __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + + highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit, + _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); +} + +void aom_highbd_lpf_horizontal_4_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i ps[2], qs[2]; + + highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0, + _thresh0, _blimit1, _limit1, _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]); + _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]); + _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]); + _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]); +} + +void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i x0, x1, x2, x3, d0, d1, d2, d3; + __m128i p1p0, q1q0; + __m128i p1, q1; + + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + + highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3); + + highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit, + thresh, bd); + + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); + + // transpose from 8x4 to 4x8 + highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_4_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i ps[2], qs[2]; + + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p)); + + highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, + &d2, &d3); + + highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0, + 
thresh0, blimit1, limit1, thresh1, bd); + + highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2, + &d3, &d4, &d5, &d6, &d7); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); + _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); + _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); + _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); + _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); +} + +void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x3, x2, x1, x0, p0, q0; + __m128i p1p0, q1q0; + + x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); + x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); + + highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit, + limit, thresh, bd); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_6_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i p0, q0, p1, q1, p2, q2; + + x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p)); + x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p)); + x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p)); + + highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1, + &p0, &q0, &q1, &q2, &d6, &d7); + + highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, + _limit0, _thresh0, _blimit1, _limit1, + _thresh1, bd); + + highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); + _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); + _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); + _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); + _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); +} + +void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i p2, p1, p0, p3, q0; + __m128i q1q0, p1p0; + + p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p)); + p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p)); + p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p)); + p0 = 
_mm_loadu_si128((__m128i *)((s - 4) + 3 * p)); + + highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + // Loop filtering + highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, + &p1p0, blimit, limit, thresh, bd); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, + &d1, &d2, &d3); + + _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0); + _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1); + _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2); + _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_8_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + + x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p)); + x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p)); + x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p)); + x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p)); + x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p)); + x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p)); + x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p)); + + highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, + &d2, &d3, &d4, &d5, &d6, &d7); + + highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, + blimit0, limit0, thresh0, blimit1, limit1, + thresh1, bd); + + highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1, + &x2, &x3, &x4, &x5, &x6, &x7); + + _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0); + _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1); + _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2); + _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3); + _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4); + _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5); + _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6); + _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7); +} + +void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + __m128i q[7], p[7], pq[7]; + __m128i p6, p5, p4, p3; + __m128i p6_2, p5_2, p4_2, p3_2; + __m128i d0, d1, d2, d3; + __m128i d0_2, d1_2, d2_2, d3_2, d7_2; + + p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); + p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); + p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); + p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); + + highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4], + &p[3], &p[2], &p[1], &p[0]); + + p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); + p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + + highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2], + &q[3], &q[4], &q[5], &q[6], &d7_2); + + highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); + + highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2], + &pq[1], &pq[0], &d0, &d1, &d2, &d3); + + q[0] = _mm_srli_si128(pq[0], 8); + q[1] = _mm_srli_si128(pq[1], 8); + q[2] = _mm_srli_si128(pq[2], 8); + q[3] = _mm_srli_si128(pq[3], 8); + q[4] = _mm_srli_si128(pq[4], 8); + q[5] = _mm_srli_si128(pq[5], 8); + + 
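
As in the other vertical variants, the edge is filtered by transposing the
block so each pixel column becomes a register row, reusing the horizontal
14-tap path, and transposing the results back before the stores below. The
underlying data movement, scalar-style (illustrative sketch only):

    static void transpose_8x8_u16(const uint16_t in[8][8], uint16_t out[8][8]) {
      for (int r = 0; r < 8; ++r)
        for (int c = 0; c < 8; ++c) out[c][r] = in[r][c];
    }
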
highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], + &d7_2, &d0_2, &d1_2, &d2_2, &d3_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2); +} + +void aom_highbd_lpf_vertical_14_dual_sse2( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i q[7], p[7]; + __m128i p6, p5, p4, p3, p2, p1, p0, q0; + __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2; + __m128i d0, d7; + __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out; + + p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); + p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); + p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); + p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); + p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch)); + p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch)); + p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch)); + q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch)); + + highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6], + &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]); + + p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); + p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch)); + p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); + p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); + q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); + + highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2, + &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5], + &q[6], &d7); + + highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); + + highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0], + &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, + &d6_out, &d7_out); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out); + _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out); + _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out); + _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out); + _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out); + + highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7, + &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, + &d6_out, &d7_out); + + _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out); + _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out); +} diff --git 
a/libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 000000000..b9689202a
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i sign = _mm_srai_epi16(*p, 15);
+  const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+  const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+  int i;
+  for (i = 0; i < 5; ++i) {
+    qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+  }
+}
+
+static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr,
+                           const int16_t *quant_ptr, const int16_t *dequant_ptr,
+                           const int16_t *quant_shift_ptr, __m256i *qp) {
+  const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
+  const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+  const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
+  init_one_qp(&zbin, &qp[0]);
+  init_one_qp(&round, &qp[1]);
+  init_one_qp(&quant, &qp[2]);
+  init_one_qp(&dequant, &qp[3]);
+  init_one_qp(&quant_shift, &qp[4]);
+}
+
+// Note:
+// *x is multiplied by *y as eight parallel signed 32-bit multiplications;
+// each 64-bit product is shifted right by 16 and the low 32 bits of each
+// result are saved in *p.
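
In scalar terms, the helper defined next computes, per 32-bit lane
(illustrative sketch; the logical 64-bit shift used below behaves like an
arithmetic one here because the quantizer only feeds non-negative products
through this path):

    for (int i = 0; i < 8; ++i)
      p[i] = (int32_t)(((int64_t)x[i] * (int64_t)y[i]) >> 16);
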
+static INLINE void mm256_mul_shift_epi32(const __m256i *x, const __m256i *y, + __m256i *p) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + + prod_lo = _mm256_srli_epi64(prod_lo, 16); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16); + + prod_hi = _mm256_slli_epi64(prod_hi, 32); + *p = _mm256_or_si256(prod_lo, prod_hi); +} + +static INLINE void quantize(const __m256i *qp, __m256i *c, + const int16_t *iscan_ptr, tran_low_t *qcoeff, + tran_low_t *dqcoeff, __m256i *eob) { + const __m256i abs = _mm256_abs_epi32(*c); + const __m256i flag1 = _mm256_cmpgt_epi32(abs, qp[0]); + __m256i flag2 = _mm256_cmpeq_epi32(abs, qp[0]); + flag2 = _mm256_or_si256(flag1, flag2); + const int32_t nzflag = _mm256_movemask_epi8(flag2); + + if (LIKELY(nzflag)) { + __m256i q = _mm256_add_epi32(abs, qp[1]); + __m256i tmp; + mm256_mul_shift_epi32(&q, &qp[2], &tmp); + q = _mm256_add_epi32(tmp, q); + + mm256_mul_shift_epi32(&q, &qp[4], &q); + __m256i dq = _mm256_mullo_epi32(q, qp[3]); + + q = _mm256_sign_epi32(q, *c); + dq = _mm256_sign_epi32(dq, *c); + q = _mm256_and_si256(q, flag2); + dq = _mm256_and_si256(dq, flag2); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr); + const __m128i zr = _mm_setzero_si128(); + const __m128i lo = _mm_unpacklo_epi16(isc, zr); + const __m128i hi = _mm_unpackhi_epi16(isc, zr); + const __m256i iscan = + _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); + + const __m256i zero = _mm256_setzero_si256(); + const __m256i zc = _mm256_cmpeq_epi32(dq, zero); + const __m256i nz = _mm256_cmpeq_epi32(zc, zero); + __m256i cur_eob = _mm256_sub_epi32(iscan, nz); + cur_eob = _mm256_and_si256(cur_eob, nz); + *eob = _mm256_max_epi32(cur_eob, *eob); + } else { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + } +} + +void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + const unsigned int step = 8; + + __m256i qp[5], coeff; + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = 
_mm_max_epi16(_mm256_castsi256_si128(eob),
+                      _mm256_extractf128_si256(eob, 1));
+    *eob_ptr = _mm_extract_epi16(final_eob, 0);
+  }
+}
diff --git a/libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c
new file mode 100644
index 000000000..1764a4952
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
+                                const int16_t *zbin_ptr,
+                                const int16_t *round_ptr,
+                                const int16_t *quant_ptr,
+                                const int16_t *quant_shift_ptr,
+                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                                const int16_t *scan, const int16_t *iscan) {
+  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+  __m128i zbins[2];
+  __m128i nzbins[2];
+
+  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
+                           (int)zbin_ptr[0]);
+  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+  nzbins[0] = _mm_setzero_si128();
+  nzbins[1] = _mm_setzero_si128();
+  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+  (void)scan;
+
+  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+  // Pre-scan pass
+  for (i = ((int)count / 4) - 1; i >= 0; i--) {
+    __m128i coeffs, cmp1, cmp2;
+    int test;
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+    cmp1 = _mm_and_si128(cmp1, cmp2);
+    test = _mm_movemask_epi8(cmp1);
+    if (test == 0xffff)
+      non_zero_regs--;
+    else
+      break;
+  }
+
+  // Quantization pass:
+  for (i = 0; i < non_zero_regs; i++) {
+    __m128i coeffs, coeffs_sign, tmp1, tmp2;
+    int test;
+    int abs_coeff[4];
+    int coeff_sign[4];
+
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    coeffs_sign = _mm_srai_epi32(coeffs, 31);
+    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+    tmp1 = _mm_or_si128(tmp1, tmp2);
+    test = _mm_movemask_epi8(tmp1);
+    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+    for (j = 0; j < 4; j++) {
+      if (test & (1 << (4 * j))) {
+        int k = 4 * i + j;
+        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+        qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+        if (abs_qcoeff) eob_i = iscan[k] > eob_i ?
iscan[k] : eob_i; + } + } + } + *eob_ptr = eob_i + 1; +} + +void aom_highbd_quantize_b_32x32_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); + (void)scan; + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_b_64x64_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2); + (void)scan; + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob + 1; +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm b/libs/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm new file mode 100644 index 000000000..e0d22522d --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -0,0 +1,296 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_4x2x4 5-6 0 + movh m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + movhps m0, [srcq +%4*2] + movhps m4, [ref1q+%5*2] + movhps m5, [ref2q+%5*2] + movhps m6, [ref3q+%5*2] + movhps m7, [ref4q+%5*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + movu m2, [ref1q+%3*2] + movhps m0, [srcq +%4*2] + movhps m2, [ref1q+%5*2] + mova m3, m0 + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m4, m2 + + movu m2, [ref2q+%3*2] + mova m3, m0 + movhps m2, [ref2q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m5, m2 + + movu m2, [ref3q+%3*2] + mova m3, m0 + movhps m2, [ref3q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m6, m2 + + movu m2, [ref4q+%3*2] + mova m3, m0 + movhps m2, [ref4q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_8x2x4 5-6 0 + ; 1st 8 px + mova m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + mova m3, m0 + movu m2, [ref1q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif + + ; 2nd 8 px + mova m0, [srcq +(%4)*2] + mova m3, m0 + movu m2, [ref1q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endmacro + +; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_16x2x4 5-6 0 + HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) + HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 
+ 8), %6 +%endmacro + +; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_32x2x4 5-6 0 + HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) + HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6 +%endmacro + +; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_64x2x4 5-6 0 + HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) + HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6 +%endmacro + +; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 +%macro HIGH_SADNXN4D 2 +%if UNIX64 +cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + +; set m1 + push srcq + mov srcd, 0x00010001 + movd m1, srcd + pshufd m1, m1, 0x0 + pop srcq + + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + +; convert byte pointers to short pointers + shl srcq, 1 + shl ref2q, 1 + shl ref3q, 1 + shl ref4q, 1 + shl ref1q, 1 + + HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + ; N.B. HIGH_PROCESS outputs dwords (32 bits) + ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM + movhlps m0, m4 + movhlps m1, m5 + movhlps m2, m6 + movhlps m3, m7 + paddd m4, m0 + paddd m5, m1 + paddd m6, m2 + paddd m7, m3 + punpckldq m4, m5 + punpckldq m6, m7 + movhlps m0, m4 + movhlps m1, m6 + paddd m4, m0 + paddd m6, m1 + punpcklqdq m4, m6 + movifnidn r4, r4mp + movu [r4], m4 + RET +%endmacro + + +INIT_XMM sse2 +HIGH_SADNXN4D 64, 64 +HIGH_SADNXN4D 64, 32 +HIGH_SADNXN4D 32, 64 +HIGH_SADNXN4D 32, 32 +HIGH_SADNXN4D 32, 16 +HIGH_SADNXN4D 16, 32 +HIGH_SADNXN4D 16, 16 +HIGH_SADNXN4D 16, 8 +HIGH_SADNXN4D 8, 16 +HIGH_SADNXN4D 8, 8 +HIGH_SADNXN4D 8, 4 +HIGH_SADNXN4D 4, 8 +HIGH_SADNXN4D 4, 4 +HIGH_SADNXN4D 4, 16 +HIGH_SADNXN4D 16, 4 +HIGH_SADNXN4D 8, 32 +HIGH_SADNXN4D 32, 8 +HIGH_SADNXN4D 16, 64 +HIGH_SADNXN4D 64, 16 diff --git a/libs/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm b/libs/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm new file mode 100644 index 000000000..09e64d510 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm @@ -0,0 +1,442 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro HIGH_SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +; convert src, ref & second_pred to short ptrs (from byte ptrs) + shl srcq, 1 + shl refq, 1 +%if %4 == 1 + shl second_predq, 1 +%endif +%endmacro + +; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD64XN 1-2 0 + HIGH_SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + pxor m6, m6 + +.loop: + ; first half of each row + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + paddd m0, m1 + paddd m0, m3 + ; second half of each row + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq+64] + psubusw m5, m1 + psubusw m1, [srcq+64] + por m1, m5 + mova m5, [srcq+80] + psubusw m5, m2 + psubusw m2, [srcq+80] + por m2, m5 + mova m5, [srcq+96] + psubusw m5, m3 + psubusw m3, [srcq+96] + por m3, m5 + mova m5, [srcq+112] + psubusw m5, m4 + psubusw m4, [srcq+112] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 +HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 +HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 +HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 +HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2 +HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2 + +; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD32XN 1-2 0 + HIGH_SAD_FN 32, %1, 5, %2 + mov n_rowsd, %1 
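
These SAD loops compute |src - ref| for unsigned words without an
absolute-difference instruction: psubusw saturates to zero in one direction,
so OR-ing the two one-sided differences recovers the magnitude. A scalar
model (illustrative sketch; absdiff_u16 is a hypothetical helper):

    static uint16_t absdiff_u16(uint16_t a, uint16_t b) {
      const uint16_t d0 = (uint16_t)((a > b) ? (a - b) : 0); /* psubusw m5, m1 */
      const uint16_t d1 = (uint16_t)((b > a) ? (b - a) : 0); /* psubusw m1, m5 */
      return (uint16_t)(d0 | d1);                            /* por m1, m5 */
    }
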
+ pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD32XN 64 ; highbd_sad32x64_sse2 +HIGH_SAD32XN 32 ; highbd_sad32x32_sse2 +HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 +HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 +HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 +HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2 +HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2 + +; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD16XN 1-2 0 + HIGH_SAD_FN 16, %1, 5, %2 + mov n_rowsd, %1/2 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_strideq*2+16] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+16] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*2+16] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*2] + por m3, m5 + mova m5, [srcq+src_strideq*2+16] + psubusw m5, m4 + psubusw m4, [srcq+src_strideq*2+16] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD16XN 32 ; highbd_sad16x32_sse2 +HIGH_SAD16XN 16 ; highbd_sad16x16_sse2 +HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 +HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 +HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 +HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 +HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2 +HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2 +HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2 +HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2 + +; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD8XN 1-2 0 + HIGH_SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq*2] + movu m3, [refq+ref_strideq*4] + movu m4, [refq+ref_stride3q*2] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + 
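
In the _avg variants, pavgw folds the second predictor into the reference
before the difference is formed; each word lane becomes a rounded average.
Per sample (illustrative sketch):

    ref[i] = (uint16_t)((ref[i] + second_pred[i] + 1) >> 1); /* pavgw */
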
lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m2 + psubusw m2, [srcq+src_strideq*2] + por m2, m5 + mova m5, [srcq+src_strideq*4] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*4] + por m3, m5 + mova m5, [srcq+src_stride3q*2] + psubusw m5, m4 + psubusw m4, [srcq+src_stride3q*2] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD8XN 16 ; highbd_sad8x16_sse2 +HIGH_SAD8XN 8 ; highbd_sad8x8_sse2 +HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 +HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 +HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 +HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2 +HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2 + +; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD4XN 1-2 0 + HIGH_SAD_FN 4, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + pxor m6, m6 + +.loop: + movq m1, [refq] + movq m2, [refq+ref_strideq*2] + movq m3, [refq+ref_strideq*4] + movq m4, [refq+ref_stride3q*2] + punpcklwd m1, m3 + punpcklwd m2, m4 +%if %2 == 1 + movq m3, [second_predq+8*0] + movq m5, [second_predq+8*2] + punpcklwd m3, m5 + movq m4, [second_predq+8*1] + movq m5, [second_predq+8*3] + punpcklwd m4, m5 + lea second_predq, [second_predq+8*4] + pavgw m1, m3 + pavgw m2, m4 +%endif + movq m5, [srcq] + movq m3, [srcq+src_strideq*4] + punpcklwd m5, m3 + movdqa m3, m1 + psubusw m1, m5 + psubusw m5, m3 + por m1, m5 + movq m5, [srcq+src_strideq*2] + movq m4, [srcq+src_stride3q*2] + punpcklwd m5, m4 + movdqa m4, m2 + psubusw m2, m5 + psubusw m5, m4 + por m2, m5 + paddw m1, m2 + movdqa m2, m1 + punpcklwd m1, m6 + punpckhwd m2, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD4XN 16 ; highbd_sad4x16_sse2 +HIGH_SAD4XN 8 ; highbd_sad4x8_sse2 +HIGH_SAD4XN 4 ; highbd_sad4x4_sse2 +HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2 +HIGH_SAD4XN 8, 1 ; highbd_sad4x8_avg_sse2 +HIGH_SAD4XN 4, 1 ; highbd_sad4x4_avg_sse2 diff --git a/libs/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/libs/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm new file mode 100644 index 000000000..5c78933df --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -0,0 +1,1024 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 14 + times 8 dw 2 + times 8 dw 12 + times 8 dw 4 + times 8 dw 10 + times 8 dw 6 + times 16 dw 8 + times 8 dw 6 + times 8 dw 10 + times 8 dw 4 + times 8 dw 12 + times 8 dw 2 + times 8 dw 14 + +SECTION .text + +; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + mova %4, %3 ; make copies to manipulate to calc sum + mova %2, %1 ; use originals for calc sse + pmaddwd %3, %3 + paddw %4, %2 + pmaddwd %1, %1 + movhlps %2, %4 + paddd %6, %3 + paddw %4, %2 + pxor %2, %2 + pcmpgtw %2, %4 ; mask for 0 > %4 (sum) + punpcklwd %4, %2 ; sign-extend word to dword + paddd %6, %1 + paddd %5, %4 + +%endmacro + +%macro STORE_AND_RET 0 +%if mmsize == 16 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. + movhlps m3, m7 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + pshufd m4, m6, 0x1 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + movd [r1], m7 ; store sse + movd eax, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp + add srcq, src_stridemp +%else + lea srcq, [srcq + src_strideq*2] +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 + + +%if ARCH_X86_64 + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse + %define block_height heightd + %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse + %define block_height heightd + 
%endif + + %define bilin_filter bilin_filter_m + %endif +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + +%if %1 < 16 + sar block_height, 1 +%endif +%if %2 == 1 ; avg + shl sec_str, 1 +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq + 16] + mova m1, [dstq] + mova m3, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq+16] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq + src_strideq*2] + mova m1, [dstq] + mova m3, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET + +.x_zero_y_nonzero: + cmp y_offsetd, 8 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pavgw m0, m1 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. 
It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m4, m1 + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonzero: + cmp x_offsetd, 8 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + mova m2, [dstq] + mova m3, [dstq + 16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + mova m2, [dstq] + mova m3, [dstq + dst_strideq*2] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET + +.x_half_y_nonzero: + cmp y_offsetd, 8 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m1, m3 + mova m4, [dstq] + mova m5, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m2, m3 + mova m4, [dstq] + mova m5, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea 
srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m1, filter_rnd + paddw m1, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m1, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m4, filter_rnd + paddw m4, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m4, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m4, [secq] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0. We can reuse y_offset reg. 
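; [Editor's note] bilin_filter_m_sse2 stores eight (16 - frac, frac) filter
; pairs, one per subpel step; each entry is two 8-word rows (32 bytes), which
; is why the offset is scaled by filter_idx_shift = 5. Every filtering loop
; below evaluates the same scalar expression (a sketch, with frac in [0, 16)
; matching the pmullw/paddw/psrlw-by-4 sequence):
;
;   static inline uint16_t bilin_c(uint16_t a, uint16_t b, int frac) {
;     return (uint16_t)(((16 - frac) * a + frac * b + 8) >> 4);
;   }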
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + mova m4, [dstq] + mova m5, [dstq+16] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m2, [srcq+2] + movu m3, [srcq+src_strideq*2+2] + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 8 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. 
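; [Editor's note] The half-pel paths use pavgw instead of the filter because
; frac == 8 collapses the bilinear expression to a rounding average; in the
; notation of the sketch above: (8*a + 8*b + 8) >> 4 == (a + b + 1) >> 1.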
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+16] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m1, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m2, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonhalf: +; loading filter - this is same as in 8-bit depth +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [bilin_filter+y_offsetq] + mova m11, [bilin_filter+y_offsetq+16] + mova m12, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. 
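; [Editor's note] With both offsets fractional, the loop below is a two-pass
; bilinear filter: each row is first filtered horizontally, then two
; consecutive filtered rows are blended vertically with the y filter. One
; output pixel, as a scalar sketch reusing bilin_c from the note above:
;
;   uint16_t h0 = bilin_c(row0[x], row0[x + 1], xfrac);
;   uint16_t h1 = bilin_c(row1[x], row1[x + 1], xfrac);
;   uint16_t out = bilin_c(h0, h1, yfrac);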
+%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif +; end of load filter + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq+2] + movu m1, [srcq+16] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + movu m3, [srcq+16] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m1, filter_rnd + mova m2, [dstq] + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq + dst_strideq * 2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + INC_SRC_BY_SRC_STRIDE + movu m3, [srcq] + movu m5, [srcq+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m4, filter_rnd + mova m2, [dstq] + paddw m4, m3 + psrlw m0, 4 + psrlw m4, 4 + mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m4, [secq] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq + dst_strideq * 4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET +%endmacro + +INIT_XMM sse2 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/libs/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c b/libs/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c new file mode 100644 index 000000000..b72d1cf8b --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stddef.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
+                                    const uint16_t *src, ptrdiff_t src_stride,
+                                    const uint16_t *pred,
+                                    ptrdiff_t pred_stride);
+
+static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+  __m128i x0, x1, x2, x3;
+  int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+  u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+  u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+  u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+  u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+
+  v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride));
+  v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride));
+  v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride));
+  v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+
+  _mm_storel_epi64((__m128i *)store_diff, x0);
+  store_diff = (int64_t *)(diff + 1 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x1);
+  store_diff = (int64_t *)(diff + 2 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x2);
+  store_diff = (int64_t *)(diff + 3 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x3);
+}
+
+static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+  u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+  u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+  u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+  u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+  u4 = _mm_loadl_epi64((__m128i const *)(src + 4 * src_stride));
+  u5 = _mm_loadl_epi64((__m128i const *)(src + 5 * src_stride));
+  u6 = _mm_loadl_epi64((__m128i const *)(src + 6 * src_stride));
+  u7 = _mm_loadl_epi64((__m128i const *)(src + 7 * src_stride));
+
+  v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride));
+  v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride));
+  v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride));
+  v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride));
+  v4 = _mm_loadl_epi64((__m128i const *)(pred + 4 * pred_stride));
+  v5 = _mm_loadl_epi64((__m128i const *)(pred + 5 * pred_stride));
+  v6 = _mm_loadl_epi64((__m128i const *)(pred + 6 * pred_stride));
+  v7 = _mm_loadl_epi64((__m128i const *)(pred + 7 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+  x4 = _mm_sub_epi16(u4, v4);
+  x5 = _mm_sub_epi16(u5, v5);
+  x6 = _mm_sub_epi16(u6, v6);
+  x7 = _mm_sub_epi16(u7,
v7); + + _mm_storel_epi64((__m128i *)store_diff, x0); + store_diff = (int64_t *)(diff + 1 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x1); + store_diff = (int64_t *)(diff + 2 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x2); + store_diff = (int64_t *)(diff + 3 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x3); + store_diff = (int64_t *)(diff + 4 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x4); + store_diff = (int64_t *)(diff + 5 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x5); + store_diff = (int64_t *)(diff + 6 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x6); + store_diff = (int64_t *)(diff + 7 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x7); +} + +static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + __m128i x0, x1, x2, x3; + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); +} + +static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + x4 = _mm_sub_epi16(u4, v4); + x5 = _mm_sub_epi16(u5, v5); + x6 = _mm_sub_epi16(u6, v6); + x7 = _mm_sub_epi16(u7, v7); + + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + 
_mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); + _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4); + _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5); + _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6); + _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); +} + +#define STACK_V(h, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \ + pred + pred_stride * h, pred_stride); \ + } while (0) + +#define STACK_H(w, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \ + } while (0) + +#define SUBTRACT_FUN(size) \ + static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \ + const uint16_t *src, ptrdiff_t src_stride, \ + const uint16_t *pred, ptrdiff_t pred_stride) + +SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); } +SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); } +SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); } +SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); } +SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); } +SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); } +SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); } +SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); } +SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); } +SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); } +SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); } +SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); } +SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); } +SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); } +SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); } +SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); } +SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); } +SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); } + +static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { + if (rows == 4) { + if (cols == 4) return subtract_4x4; + if (cols == 8) return subtract_8x4; + if (cols == 16) return subtract_16x4; + } + if (rows == 8) { + if (cols == 4) return subtract_4x8; + if (cols == 8) return subtract_8x8; + if (cols == 16) return subtract_16x8; + if (cols == 32) return subtract_32x8; + } + if (rows == 16) { + if (cols == 4) return subtract_4x16; + if (cols == 8) return subtract_8x16; + if (cols == 16) return subtract_16x16; + if (cols == 32) return subtract_32x16; + if (cols == 64) return subtract_64x16; + } + if (rows == 32) { + if (cols == 8) return subtract_8x32; + if (cols == 16) return subtract_16x32; + if (cols == 32) return subtract_32x32; + if (cols == 64) return subtract_64x32; + } + if (rows == 64) { + if (cols == 16) return subtract_16x64; + if (cols == 32) return subtract_32x64; + if (cols == 64) return subtract_64x64; + if (cols == 128) return subtract_128x64; + } + if (rows == 128) { + if (cols == 64) return subtract_64x128; + if (cols == 128) return subtract_128x128; + } + assert(0); + return NULL; +} + +void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride, int bd) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + SubtractWxHFuncType func; + (void)bd; + + func = getSubtractFunc(rows, cols); + func(diff, diff_stride, src, src_stride, 
pred, pred_stride);
+}
diff --git a/libs/libaom/src/aom_dsp/x86/highbd_variance_avx2.c b/libs/libaom/src/aom_dsp/x86/highbd_variance_avx2.c
new file mode 100644
index 000000000..9b1b4c9de
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/highbd_variance_avx2.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>  // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+                                   const uint16_t *ref, int ref_stride,
+                                   uint32_t *sse, int *sum);
+
+void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
+                                const uint16_t *ref, int ref_stride,
+                                uint32_t *sse, int *sum) {
+  __m256i v_sum_d = _mm256_setzero_si256();
+  __m256i v_sse_d = _mm256_setzero_si256();
+  for (int i = 0; i < 8; i += 2) {
+    const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
+    const __m128i v_p_a1 =
+        _mm_loadu_si128((const __m128i *)(src + src_stride));
+    const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
+    const __m128i v_p_b1 =
+        _mm_loadu_si128((const __m128i *)(ref + ref_stride));
+    __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
+    __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
+    v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
+    v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
+    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+    src += src_stride * 2;
+    ref += ref_stride * 2;
+  }
+  __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
+  __m256i v_sum01 =
+      _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
+  __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
+  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+  *sum = _mm_extract_epi32(v_d, 0);
+  *sse = _mm_extract_epi32(v_d, 1);
+}
+
+void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
+                                  const uint16_t *ref, int ref_stride,
+                                  uint32_t *sse, int *sum) {
+  __m256i v_sum_d = _mm256_setzero_si256();
+  __m256i v_sse_d = _mm256_setzero_si256();
+  const __m256i one = _mm256_set1_epi16(1);
+  for (int i = 0; i < 16; ++i) {
+    const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
+    const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
+    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+    src += src_stride;
+    ref += ref_stride;
+  }
+  __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
+  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+  __m256i v_d_h =
_mm256_unpackhi_epi32(v_sum0, v_sse_d); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + *sum = _mm_extract_epi32(v_d, 0); + *sse = _mm_extract_epi32(v_d, 1); +} + +static void highbd_10_variance_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); +} + +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_avx2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +VAR_FN(128, 128, 16, 14); +VAR_FN(128, 64, 16, 13); +VAR_FN(64, 128, 16, 13); +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); +VAR_FN(16, 4, 16, 6); +VAR_FN(8, 32, 8, 8); +VAR_FN(32, 8, 8, 8); +VAR_FN(16, 64, 16, 10); +VAR_FN(64, 16, 16, 10); + +#undef VAR_FN diff --git a/libs/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm b/libs/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm new file mode 100644 index 000000000..0d954e178 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_variance_impl_sse2.asm @@ -0,0 +1,318 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +; + + +%include "aom_ports/x86_abi_support.asm" + +SECTION .text + +;unsigned int aom_highbd_calc16x16var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(aom_highbd_calc16x16var_sse2) PRIVATE +sym(aom_highbd_calc16x16var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+16] + prefetcht0 [rsi+rax] + prefetcht0 [rsi+rax+16] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + + prefetcht0 [rdi] + prefetcht0 [rdi+16] + prefetcht0 [rdi+rdx] + prefetcht0 [rdi+rdx+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +.var16loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 2 + jnz .var16loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int aom_highbd_calc8x8var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(aom_highbd_calc8x8var_sse2) PRIVATE +sym(aom_highbd_calc8x8var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in 
bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+rax] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + + prefetcht0 [rdi] + prefetcht0 [rdi+rdx] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 8 + +.var8loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rbx+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rdi+rdx*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + lea rbx, [rbx+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 4 + jnz .var8loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/libs/libaom/src/aom_dsp/x86/highbd_variance_sse2.c b/libs/libaom/src/aom_dsp/x86/highbd_variance_sse2.c new file mode 100644 index 000000000..b7d15f93e --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_variance_sse2.c @@ -0,0 +1,842 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
+
+typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+                                       const uint16_t *ref, int ref_stride,
+                                       uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+                                      const uint16_t *ref, int ref_stride,
+                                      uint32_t *sse, int *sum);
+
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+                                   const uint16_t *ref, int ref_stride, int w,
+                                   int h, uint32_t *sse, int *sum,
+                                   high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+             ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride, int w,
+                                    int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int32_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+             ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride, int w,
+                                    int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int32_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+             ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+#define HIGH_GET_VAR(S) \
+  void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                         const uint8_t *ref8, int ref_stride, \
+                                         uint32_t *sse, int *sum) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+                                       sum); \
+  } \
+ \
+  void aom_highbd_10_get##S##x##S##var_sse2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse, int *sum) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
+                                       sum); \
+    *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+    *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+  } \
+ \
+  void aom_highbd_12_get##S##x##S##var_sse2( \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+      int ref_stride, uint32_t *sse, int *sum) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t
*ref = CONVERT_TO_SHORTPTR(ref8); \ + aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + } + +HIGH_GET_VAR(16); +HIGH_GET_VAR(8); + +#undef HIGH_GET_VAR + +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } \ + \ + uint32_t aom_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? 
(uint32_t)var : 0; \
+  }
+
+VAR_FN(128, 128, 16, 14);
+VAR_FN(128, 64, 16, 13);
+VAR_FN(64, 128, 16, 13);
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+VAR_FN(8, 32, 8, 8);
+VAR_FN(32, 8, 8, 8);
+VAR_FN(16, 64, 16, 10);
+VAR_FN(64, 16, 16, 10);
+
+#undef VAR_FN
+
+unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                        const uint8_t *ref8, int ref_stride,
+                                        unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+                         aom_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+                          aom_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+                          aom_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                      const uint8_t *ref8, int ref_stride,
+                                      unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+                         aom_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+                          aom_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+                          aom_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+// The 2 unused parameters are placeholders for the PIC-enabled build.
+// These definitions are for functions defined in +// highbd_subpel_variance_impl_sse2.asm +#define DECL(w, opt) \ + int aom_highbd_sub_pixel_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, int height, \ + unsigned int *sse, void *unused0, void *unused); +#define DECLS(opt) \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(sse2); + +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = 0; \ + unsigned int sse = 0; \ + unsigned int sse2; \ + int row_rep = (w > 64) ? 2 : 1; \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src += wd_64 * 64; \ + dst += wd_64 * 64; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2, \ + NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int64_t var; \ + uint32_t sse; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = 0; \ + int row_rep = (w > 64) ? 2 : 1; \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src += wd_64 * 64; \ + dst += wd_64 * 64; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ + se += se2; \ + long_sse += sse; \ + if (w > wf) { \ + uint32_t sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int start_row; \ + uint32_t sse; \ + int se = 0; \ + int64_t var; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int row_rep = (w > 64) ? 2 : 1; \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + uint16_t *src_tmp = src + (start_row * src_stride); \ + uint16_t *dst_tmp = dst + (start_row * dst_stride); \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src_tmp += wd_64 * 64; \ + dst_tmp += wd_64 * 64; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + wf, src_stride, x_offset, y_offset, dst_tmp + wf, \ + dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + 2 * wf, src_stride, x_offset, y_offset, \ + dst_tmp + 2 * wf, dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + 3 * wf, src_stride, x_offset, y_offset, \ + dst_tmp + 3 * wf, dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); \ + FN(16, 4, 16, 4, 2, opt, (int64_t)); \ + FN(8, 32, 8, 3, 5, opt, (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t)) + +FNS(sse2); + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. 
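+// As with the non-avg versions above, each kernel declared below handles a
+// single wf-wide column strip; the FN() wrappers tile it across 1, 2, or 4
+// strips, accumulate the per-strip se/sse, and form
+// variance = sse - sum^2 / (w * h) (for 64x64, wlog2 + hlog2 = 12, i.e. a
+// division by 4096 pixels).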
+#define DECL(w, opt) \ + int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused); +#define DECLS(opt) \ + DECL(16, opt) \ + DECL(8, opt) + +DECLS(sse2); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \ + sec + wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int64_t var; \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \ + sec + wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int start_row; \ + int64_t var; \ + uint32_t sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ + w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + wf + (start_row * dst_stride), dst_stride, \ + sec + wf + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 2 * wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 2 * wf + (start_row * dst_stride), dst_stride, \ + sec + 2 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 3 * wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 3 * wf + (start_row * dst_stride), dst_stride, \ + sec + 3 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); \ + FN(16, 4, 16, 4, 2, opt, (int64_t)); \ + FN(8, 32, 8, 3, 5, opt, (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t)); + +FNS(sse2); + +#undef FNS +#undef FN + +void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, + const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred8, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? 
&cm->sf_identity : xd->block_ref_scale_factors[ref_num]; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred8, width, mv, + &inter_pred_params); + return; + } + } + + const InterpFilterParams *filter = av1_get_filter(subpel_search); + int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS; + if (!subpel_x_q3 && !subpel_y_q3) { + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + if (width >= 8) { + int i; + assert(!(width & 7)); + /*Read 8 pixels one row at a time.*/ + for (i = 0; i < height; i++) { + int j; + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + _mm_storeu_si128((__m128i *)comp_pred, s0); + comp_pred += 8; + ref += 8; + } + ref += ref_stride - width; + } + } else { + int i; + assert(!(width & 3)); + /*Read 4 pixels two rows at a time.*/ + for (i = 0; i < height; i += 2) { + __m128i s0 = _mm_loadl_epi64((const __m128i *)ref); + __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); + __m128i t0 = _mm_unpacklo_epi64(s0, s1); + _mm_storeu_si128((__m128i *)comp_pred, t0); + comp_pred += 8; + ref += 2 * ref_stride; + } + } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, + NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1); + uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) + ? 
temp + (filter_taps >> 1) * MAX_SB_SIZE + : temp; + uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz( + ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); + aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE, + comp_pred8, width, NULL, -1, kernel_y, 16, width, + height, bd); + } +} + +void aom_highbd_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); + /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ + assert(!(width * height & 7)); + int n = width * height >> 3; + for (int i = 0; i < n; i++) { + __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16); + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0)); + comp_pred16 += 8; + pred += 8; + } +} + +static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w0, + const __m128i *w1, + const __m128i *r, + void *const result) { + assert(DIST_PRECISION_BITS <= 4); + __m128i mult0 = _mm_mullo_epi16(*p0, *w0); + __m128i mult1 = _mm_mullo_epi16(*p1, *w1); + __m128i sum = _mm_adds_epu16(mult0, mult1); + __m128i round = _mm_adds_epu16(sum, *r); + __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS); + + xx_storeu_128(result, shift); +} + +void aom_highbd_dist_wtd_comp_avg_pred_sse2( + uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, + const uint8_t *ref8, int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + int i; + const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; + const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0); + const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + + if (width >= 8) { + // Read 8 pixels one row at a time + assert(!(width & 7)); + for (i = 0; i < height; ++i) { + int j; + for (j = 0; j < width; j += 8) { + __m128i p0 = xx_loadu_128(ref); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + + comp_pred += 8; + pred += 8; + ref += 8; + } + ref += ref_stride - width; + } + } else { + // Read 4 pixels two rows at a time + assert(!(width & 3)); + for (i = 0; i < height; i += 2) { + __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); + __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, 
&r, comp_pred); + + comp_pred += 8; + pred += 8; + ref += 2 * ref_stride; + } + } +} + +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, + int subpel_search) { + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + int n; + int i; + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd, subpel_search); + assert(!(width * height & 7)); + n = width * height >> 3; + + const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; + const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0); + const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred16); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); + + comp_pred16 += 8; + pred += 8; + } +} diff --git a/libs/libaom/src/aom_dsp/x86/highbd_variance_sse4.c b/libs/libaom/src/aom_dsp/x86/highbd_variance_sse4.c new file mode 100644 index 000000000..df5449a9d --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/highbd_variance_sse4.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <smmintrin.h> /* SSE4.1 */ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/variance.h" +#include "aom_dsp/aom_filter.h" + +static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + uint64_t *sse, int64_t *sum) { + __m128i u0, u1, u2, u3; + __m128i s0, s1, s2, s3; + __m128i t0, t1, x0, y0; + __m128i a0, a1, a2, a3; + __m128i b0, b1, b2, b3; + __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1); + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride)); + a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride)); + a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride)); + a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride)); + + b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride)); + b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride)); + b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride)); + b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride)); + + u0 = _mm_unpacklo_epi16(a0, a1); + u1 = _mm_unpacklo_epi16(a2, a3); + u2 = _mm_unpacklo_epi16(b0, b1); + u3 = _mm_unpacklo_epi16(b2, b3); + + s0 = _mm_sub_epi16(u0, u2); + s1 = _mm_sub_epi16(u1, u3); + + t0 = _mm_madd_epi16(s0, k_one_epi16); + t1 = _mm_madd_epi16(s1, k_one_epi16); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + y0 = _mm_hadd_epi32(s3, s3); + + t0 = _mm_madd_epi16(s0, s0); + t1 = _mm_madd_epi16(s1, s1); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + x0 = _mm_hadd_epi32(s3, s3); + + *sse = (uint64_t)_mm_extract_epi32(x0, 0); + *sum = (int64_t)_mm_extract_epi32(y0, 0); +} + +uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)local_sse; + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return (diff >= 0) ? (uint32_t)diff : 0; +} + +uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4); + sum = ROUND_POWER_OF_TWO(sum, 2); + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return (diff >= 0) ? (uint32_t)diff : 0; +} + +uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8); + sum = ROUND_POWER_OF_TWO(sum, 4); + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return diff >= 0 ?
(uint32_t)diff : 0; +} + +// Sub-pixel +uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride, + sse); +} + +uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); +} + +uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); +} + +// Sub-pixel average + +uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, + sse); +} + +uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); +} + +uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + 
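// Two-pass bilinear interpolation: the first pass filters src horizontally
+ // into fdata3 with the xoffset filter, and the second pass filters fdata3
+ // vertically into temp2 with the yoffset filter; temp3 then holds the
+ // average of temp2 and second_pred, on which the variance is computed.
+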
aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); +} diff --git a/libs/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm b/libs/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm new file mode 100644 index 000000000..0eb632326 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm @@ -0,0 +1,608 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pb_1: times 16 db 1 +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +dc_128: times 16 db 128 +pw2_4: times 8 dw 2 +pw2_8: times 8 dw 4 +pw2_16: times 8 dw 8 +pw2_32: times 8 dw 16 + +SECTION .text + +INIT_XMM sse2 +cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + movd m2, [leftq] + movd m0, [aboveq] + pxor m1, m1 + punpckldq m0, m2 + psadbw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [leftq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [aboveq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + paddw m0, [GLOBAL(pw_8)] + psraw m0, 4 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, 
m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movd m0, [GLOBAL(dc_128)] + movd [dstq ], m0 + movd [dstq+strideq ], m0 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq m0, [GLOBAL(dc_128)] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_16)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + + +INIT_XMM sse2 +cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_16x16, 4, 5, 3, 
dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + + +INIT_XMM sse2 +cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + psadbw m3, m1 + psadbw m4, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_32)] + psraw m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + mova m2, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above + movd m0, [aboveq] + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + DEFINE_ARGS dst, 
stride, stride3 + lea stride3q, [strideq*3] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left + movifnidn leftq, leftmp + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 + pshufd m1, m0, 0x1 + movd [dstq ], m0 + movd [dstq+strideq], m1 + pshufd m2, m0, 0x2 + lea dstq, [dstq+strideq*2] + pshufd m3, m0, 0x3 + movd [dstq ], m2 + movd [dstq+strideq], m3 + RET + +INIT_XMM sse2 +cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -2 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] + movq m0, [leftq ] + punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8 +.loop: + pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 + pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 + movq [dstq ], m1 + movq [dstq+strideq], m2 + pshuflw m1, m0, 0xaa + pshuflw m2, m0, 0xff + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 + inc lineq + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -4 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+strideq ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -8 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+16 ], m1 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16 ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2 ], m1 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET diff --git a/libs/libaom/src/aom_dsp/x86/intrapred_avx2.c 
b/libs/libaom/src/aom_dsp/x86/intrapred_avx2.c new file mode 100644 index 000000000..546ee74bb --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/intrapred_avx2.c @@ -0,0 +1,4895 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/x86/intrapred_x86.h" +#include "aom_dsp/x86/lpf_common_sse2.h" + +static INLINE __m256i dc_sum_64(const uint8_t *ref) { + const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i zero = _mm256_setzero_si256(); + __m256i y0 = _mm256_sad_epu8(x0, zero); + __m256i y1 = _mm256_sad_epu8(x1, zero); + y0 = _mm256_add_epi64(y0, y1); + __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1); + y0 = _mm256_add_epi64(u0, y0); + u0 = _mm256_unpackhi_epi64(y0, y0); + return _mm256_add_epi16(y0, u0); +} + +static INLINE __m256i dc_sum_32(const uint8_t *ref) { + const __m256i x = _mm256_loadu_si256((const __m256i *)ref); + const __m256i zero = _mm256_setzero_si256(); + __m256i y = _mm256_sad_epu8(x, zero); + __m256i u = _mm256_permute2x128_si256(y, y, 1); + y = _mm256_add_epi64(u, y); + u = _mm256_unpackhi_epi64(y, y); + return _mm256_add_epi16(y, u); +} + +static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + dst += stride; + } +} + +static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1, + int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r0); + _mm256_storeu_si256((__m256i *)(dst + 32), *r1); + dst += stride; + } +} + +static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + _mm256_storeu_si256((__m256i *)(dst + 32), *r); + dst += stride; + } +} + +static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, +}; + +static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = { + { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }, + { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 }, + { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 }, + { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 } +}; + +static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = { + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, + 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }, + { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }, + { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, + 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 }, + { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, + 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, + 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 } +}; + +static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, + 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, + 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, + 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff } +}; + +static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) { + __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + + r0 = _mm_unpacklo_epi16(x[0], x[1]); + r1 = _mm_unpacklo_epi16(x[2], x[3]); + r2 = _mm_unpacklo_epi16(x[4], x[5]); + r3 = _mm_unpacklo_epi16(x[6], x[7]); + + r4 = _mm_unpacklo_epi16(x[8], x[9]); + r5 = _mm_unpacklo_epi16(x[10], x[11]); + r6 = _mm_unpacklo_epi16(x[12], x[13]); + r7 = _mm_unpacklo_epi16(x[14], x[15]); + + r8 = _mm_unpacklo_epi32(r0, r1); + r9 = _mm_unpackhi_epi32(r0, r1); + r10 = _mm_unpacklo_epi32(r2, r3); + r11 = _mm_unpackhi_epi32(r2, r3); + + r12 = _mm_unpacklo_epi32(r4, r5); + r13 = _mm_unpackhi_epi32(r4, r5); + r14 = _mm_unpacklo_epi32(r6, r7); + r15 = _mm_unpackhi_epi32(r6, r7); + + r0 = _mm_unpacklo_epi64(r8, r9); + r1 = _mm_unpackhi_epi64(r8, r9); + r2 = _mm_unpacklo_epi64(r10, r11); + r3 = _mm_unpackhi_epi64(r10, r11); + + r4 = _mm_unpacklo_epi64(r12, r13); + r5 = _mm_unpackhi_epi64(r12, r13); + r6 = _mm_unpacklo_epi64(r14, r15); + r7 = _mm_unpackhi_epi64(r14, r15); + + d[0] = _mm_unpacklo_epi64(r0, r2); 
+ d[1] = _mm_unpacklo_epi64(r4, r6); + d[2] = _mm_unpacklo_epi64(r1, r3); + d[3] = _mm_unpacklo_epi64(r5, r7); + + d[4] = _mm_unpackhi_epi64(r0, r2); + d[5] = _mm_unpackhi_epi64(r4, r6); + d[6] = _mm_unpackhi_epi64(r1, r3); + d[7] = _mm_unpackhi_epi64(r5, r7); +} + +static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + + w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 + w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 + w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 40 50 41 51 42 52 43 53 + w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 60 70 61 71 62 72 63 73 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 +} + +static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + + w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 + w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 + w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53 + w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 + + w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17 + w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37 + w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57 + w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 + + d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 + d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 + + d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 + d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 +} + +static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + __m256i dd[16]; + w0 = _mm256_unpacklo_epi16(x[0], x[1]); + w1 = _mm256_unpacklo_epi16(x[2], x[3]); + w2 = _mm256_unpacklo_epi16(x[4], x[5]); + w3 = _mm256_unpacklo_epi16(x[6], x[7]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); // + ww1 = _mm256_unpacklo_epi32(w2, w3); // + + dd[0] = _mm256_unpacklo_epi64(ww0, ww1); + dd[1] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); // + ww1 = _mm256_unpackhi_epi32(w2, w3); // + + dd[2] = 
_mm256_unpacklo_epi64(ww0, ww1); + dd[3] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpackhi_epi16(x[0], x[1]); + w1 = _mm256_unpackhi_epi16(x[2], x[3]); + w2 = _mm256_unpackhi_epi16(x[4], x[5]); + w3 = _mm256_unpackhi_epi16(x[6], x[7]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); // + ww1 = _mm256_unpacklo_epi32(w2, w3); // + + dd[4] = _mm256_unpacklo_epi64(ww0, ww1); + dd[5] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); // + ww1 = _mm256_unpackhi_epi32(w2, w3); // + + dd[6] = _mm256_unpacklo_epi64(ww0, ww1); + dd[7] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpacklo_epi16(x[8], x[9]); + w1 = _mm256_unpacklo_epi16(x[10], x[11]); + w2 = _mm256_unpacklo_epi16(x[12], x[13]); + w3 = _mm256_unpacklo_epi16(x[14], x[15]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); + ww1 = _mm256_unpacklo_epi32(w2, w3); + + dd[8] = _mm256_unpacklo_epi64(ww0, ww1); + dd[9] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); + ww1 = _mm256_unpackhi_epi32(w2, w3); + + dd[10] = _mm256_unpacklo_epi64(ww0, ww1); + dd[11] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpackhi_epi16(x[8], x[9]); + w1 = _mm256_unpackhi_epi16(x[10], x[11]); + w2 = _mm256_unpackhi_epi16(x[12], x[13]); + w3 = _mm256_unpackhi_epi16(x[14], x[15]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); + ww1 = _mm256_unpacklo_epi32(w2, w3); + + dd[12] = _mm256_unpacklo_epi64(ww0, ww1); + dd[13] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); + ww1 = _mm256_unpackhi_epi32(w2, w3); + + dd[14] = _mm256_unpacklo_epi64(ww0, ww1); + dd[15] = _mm256_unpackhi_epi64(ww0, ww1); + + for (int i = 0; i < 8; i++) { + d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1); + d[i + 8] = _mm256_insertf128_si256(dd[i + 8], + _mm256_extracti128_si256(dd[i], 1), 0); + } +} + +void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum_left = _mm256_add_epi16(sum_left, thirtytwo); + sum_left = _mm256_srai_epi16(sum_left, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum_left, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t 
stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 32, dst, stride); +} + +// There are 32 rows together. This function does lines +// 0,1,2,3 and 16,17,18,19; the next call does +// 4,5,6,7 and 20,21,22,23, so four calls +// finish all 32 rows. +static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst, + ptrdiff_t stride) { + __m256i t[4]; + __m256i m = _mm256_setzero_si256(); + const __m256i inc = _mm256_set1_epi8(4); + int i; + + for (i = 0; i < 4; i++) { + t[i] = _mm256_shuffle_epi8(*row, m); + __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0); + __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11); + _mm256_storeu_si256((__m256i *)dst, r0); + _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1); + dst += stride; + m = _mm256_add_epi8(m, inc); + } +} + +void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m256i left_col = _mm256_loadu_si256((__m256i const *)left); + + __m256i u = _mm256_unpacklo_epi8(left_col, left_col); + + __m256i v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + u = _mm256_unpackhi_epi8(left_col, left_col); + + v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); +} + +// ----------------------------------------------------------------------------- +// Rectangle +void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i top_sum = dc_sum_32_sse2(above); + __m128i left_sum = dc_sum_16_sse2(left); + left_sum = _mm_add_epi16(top_sum, left_sum); + uint16_t sum = _mm_cvtsi128_si32(left_sum); + sum += 24; + sum /= 48; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_64(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 48; + sum /= 96; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = dc_sum_64(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 64; + sum /= 128; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 48; + sum /= 96; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t
*above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left)); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 40; + sum /= 80; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = _mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(left); + (void)above; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 64, dst, stride); +} + +void 
aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(left); + (void)above; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = _mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 16, dst, stride); +} + +void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 16, dst, stride); +} + +void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 64, dst, stride); +} + +void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 64, dst, stride); +} + +void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = 
_mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 32, dst, stride); +} + +void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// PAETH_PRED + +// Return 16 16-bit pixels in one row (__m256i) +static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i base = + _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft); + + __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left)); + __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top)); + __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft)); + + __m256i mask1 = _mm256_cmpgt_epi16(pl, pt); + mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl)); + __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl); + + pl = _mm256_andnot_si256(mask1, *left); + + ptl = _mm256_and_si256(mask2, *topleft); + pt = _mm256_andnot_si256(mask2, *top); + pt = _mm256_or_si256(pt, ptl); + pt = _mm256_and_si256(mask1, pt); + + return _mm256_or_si256(pt, pl); +} + +// Return 16 8-bit pixels in one row (__m128i) +static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i p0 = paeth_pred(left, top, topleft); + const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i p = _mm256_packus_epi16(p0, p1); + return _mm256_castsi256_si128(p); +} + +static INLINE __m256i get_top_vector(const uint8_t *above) { + const __m128i x = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t0 = _mm_unpacklo_epi8(x, zero); + const __m128i t1 = _mm_unpackhi_epi8(x, zero); + return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1); +} + +void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i x = _mm_loadl_epi64((const __m128i *)left); + const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 8; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +static INLINE __m256i get_left_vector(const uint8_t *left) { + const __m128i x = _mm_load_si128((const __m128i *)left); + return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); +} + +void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + 
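// Editor's aside (illustrative, not part of this patch): the masked selects
// in paeth_pred()/paeth_16x1_pred() above implement the scalar Paeth rule,
// including its tie-break order left, top, top-left:
//
//   static uint8_t paeth_scalar(uint8_t l, uint8_t t, uint8_t tl) {
//     const int base = l + t - tl;
//     const int pl = abs(base - l), pt = abs(base - t), ptl = abs(base - tl);
//     if (pl <= pt && pl <= ptl) return l;  // !mask1 -> keep left
//     return (pt <= ptl) ? t : tl;          // mask2 picks top vs top-left
//   }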
_mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + for (int j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (int i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +// Return 32 8-bit pixels in one row (__m256i) +static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, + const __m256i *top1, + const __m256i *topleft) { + __m256i p0 = paeth_pred(left, top0, topleft); + __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x0 = _mm256_packus_epi16(p0, p1); + + p0 = paeth_pred(left, top1, topleft); + p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x1 = _mm256_packus_epi16(p0, p1); + + return _mm256_permute2x128_si256(x0, x1, 0x20); +} + +void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl); + + _mm256_storeu_si256((__m256i *)dst, r); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16((short)0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + 
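// Editor's aside (not part of this patch): rep is a shuffle-control trick.
// It starts as 0x8000 in every 16-bit lane, i.e. byte indices {0x00, 0x80}:
// _mm256_shuffle_epi8 copies byte 0 of l into the low byte and writes 0 for
// the high byte (index bit 7 set), yielding left[0] zero-extended to 16 bits
// in every lane. get_left_vector() duplicated the 16 left pixels into both
// 128-bit halves, so the per-lane shuffle picks the same pixel in each half.
// Adding 1 per row advances the byte index, stepping through left[1],
// left[2], ... without reloading the left column.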
dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 2; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + 
const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i; + const __m256i l = get_left_vector(left); + __m256i rep = _mm256_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +#define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6) +#define PERM2x128(c0, c1) c0 + (c1 << 4) + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((N + 4) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff, c3f; + __m128i a_mbase_x, max_base_x128, base_inc128, mask128; + __m128i a0_128, a1_128; + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm_set1_epi16(above[max_base_x]); + max_base_x128 = _mm_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + + a0_128 = _mm_loadu_si128((__m128i *)(above + base)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + + if (upsample_above) { + a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]); + a1_128 = _mm_srli_si128(a0_128, 8); + + base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8, + base + 10, base + 12, base + 14); + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), + _mm256_set1_epi16(0x3f)), + 1); + } else { + base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4, + base + 5, base + 6, base + 7); + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_castsi128_si256(a0_128); + a1 = _mm256_castsi128_si256(a1_128); + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res1 = _mm256_castsi256_si128(res); + + mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128); + dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int 
max_base_x = ((N + 4) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff; + __m128i a_mbase_x, max_base_x128, base_inc128, mask128; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm_set1_epi16(above[max_base_x]); + max_base_x128 = _mm_set1_epi32(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + if (upsample_above) { + a0 = _mm256_permutevar8x32_epi32( + a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); + base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), + _mm256_set1_epi32(0x3f)), + 1); + } else { + base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + } + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + res1 = _mm256_castsi256_si128(res); + res1 = _mm_packus_epi32(res1, res1); + + mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128); + mask128 = _mm_packs_epi32(mask128, mask128); // goto 16 bit + dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); + x += dx; + } +} + +static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m128i dstvec[16]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((8 + N) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a0_1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi32(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res1, shift; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values + } + return; + } + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + 
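// Editor's aside (not part of this patch): when upsample_above is set, the
// caller has upsampled the top edge 2x by interpolation, so consecutive
// outputs sit 2 samples apart. The permutevar8x32 in the upsample branch
// below separates even samples into a0 and odd samples into a1, and
// base_inc is built with a stride of 2, so the same blend formula applies.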
a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + if (upsample_above) { + a0 = _mm256_permutevar8x32_epi32( + a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); + + a0_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); + a0_1 = _mm256_permutevar8x32_epi32( + a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1)); + + a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1); + a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1); + base_inc256 = + _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8, + base + 10, base + 12, base + 14); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), + _mm256_set1_epi32(0x3f)), + 1); + } else { + base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + } + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + res1 = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256); + mask256 = _mm256_packs_epi32( + mask256, _mm256_castsi128_si256( + _mm256_extracti128_si256(mask256, 1))); // goto 16 bit + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + dst[r] = _mm256_castsi256_si128(res1); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((8 + N) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res1, shift; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values + } + return; + } + + a0_x128 = _mm_loadu_si128((__m128i *)(above + base)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8)); + atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]); + atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]); + atmp2 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); + atmp3 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); + mask = + _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16), + _mm_set1_epi8(15)); + a1_x128 = 
_mm_blendv_epi8(atmp2, atmp3, mask); + + base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6, + base + 8, base + 10, base + 12, base + 14, + 0, 0, 0, 0, 0, 0, 0, 0); + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), + 1); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, 0, + 0, 0, 0, 0, 0, 0, 0); + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_castsi128_si256(a0_x128); + a1 = _mm256_castsi128_si256(a1_x128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256); + dst[r] = _mm256_castsi256_si128(res1); + x += dx; + } +} + +static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m128i dstvec[32]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((16 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 16 values + } + return; + } + __m256i shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + + int mdif = max_base_x - base; + if (mdif > 8) { + a0_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); + a1_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9))); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); 
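// Editor's aside (not part of this patch): this is why each block size has
// both a 16-bit and a 32-bit ("32bit_") variant, selected on bd < 12. For
// 8- and 10-bit pixels the whole expression fits in a 16-bit lane:
// above[x] * 32 + 16 <= 1023 * 32 + 16 = 32752, and the blended result
// stays below 65536, so the cheaper epi16 path with logical shifts is safe.
// At bd == 12, above[x] * 32 can reach 4095 * 32 = 131040, which already
// overflows 16 bits, hence the epi32 arithmetic in functions like this one.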
// a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, + base + 8, base + 9, base + 10, base + 11, + base + 12, base + 13, base + 14, base + 15); + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((16 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 16 values + } + return; + } + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + a0 = _mm256_loadu_si256((__m256i *)(above + base)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16bit values + + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, + base + 8, base + 9, base + 10, base + 11, + base + 12, base + 13, base + 14, base + 15); + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256); + x += dx; + } +} + +static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m256i dstvec[64]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final 
pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + dstvec[i + N] = a_mbase_x; + } + return; + } + + __m256i shift = + _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1); + + for (int j = 0; j < 32; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + res1 = a_mbase_x; + } else { + a0 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + j))); + a1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 1 + j))); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + if (mdif > 8) { + a0_1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 8 + j))); + a1_1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 9 + j))); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + } + if (!j) { + dstvec[r] = res1; + } else { + dstvec[r + N] = res1; + } + } + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = 
a_mbase_x; // save 32 values + dstvec[i + N] = a_mbase_x; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0; j < 32; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + res = a_mbase_x; + } else { + a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res = _mm256_blendv_epi8(a_mbase_x, res, mask256); + } + if (!j) { + dstvec[r] = res; + } else { + dstvec[r + N] = res; + } + } + x += dx; + } +} + +static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m256i dstvec[128]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]); + } +} + +static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, + int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + + __m128i a0_128, a0_1_128, a1_128, a1_1_128; + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); + a0 = _mm256_cvtepu16_epi32(a0_128); + a1 = _mm256_cvtepu16_epi32(a1_128); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = 
_mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + if (mdif > 8) { + a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j)); + a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j)); + a0_1 = _mm256_cvtepu16_epi32(a0_1_128); + a1_1 = _mm256_cvtepu16_epi32(a1_1_128); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + _mm256_storeu_si256((__m256i *)(dst + j), res1); + } + } + x += dx; + } +} + +static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); + } else { + a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 
7, base + j + 8,
+          base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+          base + j + 13, base + j + 14, base + j + 15);
+
+        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+        _mm256_storeu_si256((__m256i *)(dst + j), res);  // 16 16bit values
+      }
+    }
+    x += dx;
+  }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+                                      int bh, const uint16_t *above,
+                                      const uint16_t *left, int upsample_above,
+                                      int dx, int dy, int bd) {
+  (void)left;
+  (void)dy;
+
+  switch (bw) {
+    case 4:
+      highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
+                                       dx, bd);
+      break;
+    case 8:
+      highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
+                                       dx, bd);
+      break;
+    case 16:
+      highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
+                                        dx, bd);
+      break;
+    case 32:
+      highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
+                                        dx, bd);
+      break;
+    case 64:
+      if (bd < 12) {
+        highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
+                                          upsample_above, dx);
+      } else {
+        highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
+                                                upsample_above, dx);
+      }
+      break;
+    default: break;
+  }
+}
+
+static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
+                                      uint16_t *dst, ptrdiff_t pitchDst) {
+  __m256i r[16];
+  __m256i d[16];
+  for (int j = 0; j < 16; j++) {
+    r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
+  }
+  highbd_transpose16x16_avx2(r, d);
+  for (int j = 0; j < 16; j++) {
+    _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
+  }
+}
+
+static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
+                             uint16_t *dst, ptrdiff_t pitchDst, int width,
+                             int height) {
+  for (int j = 0; j < height; j += 16)
+    for (int i = 0; i < width; i += 16)
+      highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
+                                dst + j * pitchDst + i, pitchDst);
+}
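// Editor's aside -- a scalar model of the zone-1 kernels dispatched by
// av1_highbd_dr_prediction_z1_avx2 above (illustrative sketch; assumes
// upsample_above == 0, and dr_z1_scalar is a hypothetical name). x advances
// by dx 1/64-pel units per row; its integer part picks a reference pixel,
// the top 5 fraction bits blend it with its right neighbor, and positions
// past the end of the edge repeat the last pixel:

#include <stddef.h>
#include <stdint.h>

static void dr_z1_scalar(uint16_t *dst, ptrdiff_t stride, int w, int h,
                         const uint16_t *above, int dx) {
  const int max_base_x = w + h - 1;  // last readable index in above[]
  int x = dx;
  for (int r = 0; r < h; ++r, x += dx) {
    const int base = x >> 6;            // integer reference position
    const int shift = (x & 0x3f) >> 1;  // 5-bit fraction, 0..31
    for (int c = 0; c < w; ++c) {
      const int b = base + c;
      dst[r * stride + c] =
          (b < max_base_x) ? (uint16_t)((above[b] * 32 + 16 +
                                         (above[b + 1] - above[b]) * shift) >>
                                        5)
                           : above[max_base_x];
    }
  }
}

+
+static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
+    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+    const uint16_t *left, int upsample_above, int upsample_left, int dx,
+    int dy) {
+  const int min_base_x = -(1 << upsample_above);
+  const int min_base_y = -(1 << upsample_left);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  assert(dx > 0);
+  // pre-filter above pixels
+  // store in temp buffers:
+  //   above[x] * 32 + 16
+  //   above[x+1] - above[x]
+  // final pixels will be calculated as:
+  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+  __m256i a0_x, a1_x, a32, a16;
+  __m256i diff;
+  __m128i c3f, min_base_y128;
+
+  a16 = _mm256_set1_epi32(16);
+  c3f = _mm_set1_epi32(0x3f);
+  min_base_y128 = _mm_set1_epi32(min_base_y);
+
+  for (int r = 0; r < N; r++) {
+    __m256i b, res, shift;
+    __m128i resx, resy, resxy;
+    __m128i a0_x128, a1_x128;
+    int y = r + 1;
+    int base_x = (-y * dx) >> frac_bits_x;
+    int base_shift = 0;
+    if (base_x < (min_base_x - 1)) {
+      base_shift = (min_base_x - base_x) >> upsample_above;
+    }
+    int base_min_diff =
+        (min_base_x - base_x + upsample_above) >> upsample_above;
+    if (base_min_diff > 4) {
+      base_min_diff = 4;
+    } else {
+      if (base_min_diff < 0) base_min_diff = 0;
+    }
+
+    if (base_shift > 3) {
+      a0_x = _mm256_setzero_si256();
+      a1_x = _mm256_setzero_si256();
+      shift = _mm256_setzero_si256();
+    } else {
+      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+      if (upsample_above) {
+        a0_x128 =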
_mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx4[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128( + _mm_slli_epi32( + _mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 2); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + DECLARE_ALIGNED(32, int, base_y_c[4]); + r6 = _mm_set1_epi32(r << 6); + dy128 = _mm_set1_epi32(dy); + c1234 = _mm_setr_epi32(1, 2, 3, 4); + y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128)); + base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]]); + a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi32( + _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(res); + resx = _mm_packus_epi32(resx, resx); + + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi32(resy, resy); + + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_z2_Nx4_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16; + __m256i diff; + __m128i c3f, min_base_y128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - 
base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + a0_x128 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx4[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, + (3 << 6) - y * dx, 0, 0, 0, 0), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 2); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, + (3 << 6) - y * dx, 0, 0, 0, 0), + c3f), + 1)); + } + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + r6 = _mm_set1_epi16(r << 6); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0, + 0, 0); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm256_castsi256_si128(res); + resy = _mm256_extracti128_si256(res, 1); + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_32bit_z2_Nx8_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256; + __m256i diff; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi32(16); + c3f = 
_mm256_set1_epi32(0x3f); + min_base_y256 = _mm256_set1_epi32(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx = _mm_setzero_si128(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift)); + atmp0 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp1 = _mm_shuffle_epi8(a1_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp2 = _mm_shuffle_epi8( + a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + atmp3 = _mm_shuffle_epi8( + a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift], + _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32( + _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, + (3 << 6) - y * dx, (4 << 6) - y * dx, + (5 << 6) - y * dx, (6 << 6) - y * dx, + (7 << 6) - y * dx), + c3f), + 1); + } + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(_mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } + // y calc + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int, base_y_c[8]); + __m256i r6, c256, dy256, y_c256, base_y_c256, mask256; + r6 = _mm256_set1_epi32(r << 6); + dy256 = _mm256_set1_epi32(dy); + c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]])); + a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] 
+ 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1])); + + if (upsample_left) { + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f), + 1); + } else { + shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); + } + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy = _mm256_castsi256_si128(_mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resy = resx; + } + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_z2_Nx8_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i c3f, min_base_y128; + __m256i a0_x, a1_x, diff, a32, a16; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift)); + atmp0 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp1 = _mm_shuffle_epi8(a1_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp2 = _mm_shuffle_epi8( + a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + atmp3 = _mm_shuffle_epi8( + a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift], + _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16( + _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + a0_x128 = + 
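/* realign the shifted load so lane i >= base_shift again holds above[base_x + i] (see HighbdLoadMaskx); the lower lanes are superseded by the left-column result below */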
_mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } + + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi16(r << 6); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1], + left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm256_castsi256_si128(res); + resy = _mm256_extracti128_si256(res, 1); + + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_32bit_z2_HxW_avx2( + int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1; + __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8; + __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128; + DECLARE_ALIGNED(32, int, base_y_c[16]); + + a16 = _mm256_set1_epi32(16); + c1 = _mm256_srli_epi32(a16, 4); + c8 = _mm256_srli_epi32(a16, 1); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi32(0x3f); + dy256 = _mm256_set1_epi32(dy); + c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + c1234 = _mm256_add_epi32(c0123, c1); + + for (int r = 0; r < H; r++) { 
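+    // Row r is produced in 16-pixel chunks: resx interpolates along the
+    // above row, resy (only when base_x < min_base_x) along the left
+    // column, and HighbdBaseMask[base_min_diff] blends the two at the
+    // zone-2 boundary. Per lane this is, roughly, in scalar terms:
+    //   v = ref[base] * 32 + 16 + (ref[base + 1] - ref[base]) * shift;
+    //   dst = (uint16_t)(v >> 5);  // shift = (frac & 0x3f) >> 1
+    // The epi32 arithmetic is needed because this variant serves bd == 12,
+    // where 16-bit intermediates would overflow.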
+ __m256i b, res, shift, ydx; + __m256i resx[2], resy[2]; + __m256i resxy, j256, r6; + for (int j = 0; j < W; j += 16) { + j256 = _mm256_set1_epi32(j); + int y = r + 1; + ydx = _mm256_set1_epi32(y * dx); + + int base_x = ((j << 6) - y * dx) >> frac_bits_x; + int base_shift = 0; + if ((base_x) < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1); + } + int base_min_diff = (min_base_x - base_x); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx[0] = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + + r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx[0] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + } + int base_shift8 = 0; + if ((base_x + 8) < (min_base_x - 1)) { + base_shift8 = (min_base_x - (base_x + 8) - 1); + } + if (base_shift8 > 7) { + resx[1] = _mm256_setzero_si256(); + } else { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9)); + a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift8]); + a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift8]); + + a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128); + a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128); + + r6 = _mm256_slli_epi32( + _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + resx[1] = _mm256_add_epi32(a32, b); + resx[1] = _mm256_srli_epi32(resx[1], 5); + resx[1] = _mm256_packus_epi32( + resx[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1))); + } + resx[0] = + _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]), + 1); // 16 16bit values + + // y calc + resy[0] = _mm256_setzero_si256(); + if ((base_x < min_base_x)) { + __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256; + r6 = _mm256_set1_epi32(r << 6); + c256 = _mm256_add_epi32(j256, c1234); + y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + c256 = _mm256_add_epi32(c256, c8); + y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = 
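/* clamp lanes whose index fell below min_base_y to 0 */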
_mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]])); + a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1])); + + shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy[0] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], + left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], + left[base_y_c[14]], left[base_y_c[15]])); + a1_y = _mm256_cvtepu16_epi32( + _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1], + left[base_y_c[10] + 1], left[base_y_c[11] + 1], + left[base_y_c[12] + 1], left[base_y_c[13] + 1], + left[base_y_c[14] + 1], left[base_y_c[15] + 1])); + shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1); + + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy[1] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + resy[0] = + _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]), + 1); // 16 16bit values + } + + resxy = _mm256_blendv_epi8(resx[0], resy[0], + *(__m256i *)HighbdBaseMask[base_min_diff]); + _mm256_storeu_si256((__m256i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +static void highbd_dr_prediction_z2_HxW_avx2( + int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16, c3f, c1; + __m256i diff, min_base_y256, dy256, c1234, c0123; + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + + a16 = _mm256_set1_epi16(16); + c1 = _mm256_srli_epi16(a16, 4); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi16(0x3f); + dy256 = _mm256_set1_epi16(dy); + c0123 = + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + c1234 = _mm256_add_epi16(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift; + __m256i resx, resy, ydx; + __m256i resxy, j256, r6; + __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128; + int y = r + 1; + ydx = _mm256_set1_epi16((short)(y * dx)); + + for (int j = 0; j 
< W; j += 16) { + j256 = _mm256_set1_epi16(j); + int base_x = ((j << 6) - y * dx) >> frac_bits_x; + int base_shift = 0; + if ((base_x) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x)-1); + } + int base_min_diff = (min_base_x - base_x); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift < 8) { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } else { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + } + + int base_shift1 = 0; + if (base_shift > 8) { + base_shift1 = base_shift - 8; + } + if (base_shift1 < 8) { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9)); + a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift1]); + a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift1]); + + a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1); + } + r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); + shift = _mm256_srli_epi16( + _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + resx = _mm256_srli_epi16(res, 5); // 16 16-bit values + + // y calc + resy = _mm256_setzero_si256(); + __m256i a0_y, a1_y, shifty; + if ((base_x < min_base_x)) { + __m256i c256, y_c256, base_y_c256, mask256, mul16; + r6 = _mm256_set1_epi16(r << 6); + c256 = _mm256_add_epi16(j256, c1234); + mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), + _mm256_srli_epi16(min_base_y256, 1)); + y_c256 = _mm256_sub_epi16(r6, mul16); + base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + base_y_c256 = _mm256_add_epi16(base_y_c256, c1); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a1_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + + shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shifty); + res = 
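/* a[x] * 32 + 16 + (a[x+1] - a[x]) * shift */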
_mm256_add_epi16(a32, b); + resy = _mm256_srli_epi16(res, 5); + } + + resxy = _mm256_blendv_epi8(resx, resy, + *(__m256i *)HighbdBaseMask[base_min_diff]); + _mm256_storeu_si256((__m256i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + if (bd < 12) { + highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + case 8: + if (bd < 12) { + highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + default: + if (bd < 12) { + highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + } +} + +// Directional prediction, zone 3 functions +static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[4], d[4]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], + &dstvec[3], &d[0], &d[1], &d[2], &d[3]); + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); + return; +} + +static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[8], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], + &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[4], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + + highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + for (int i = 0; i < 8; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + } +} + +static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t 
*left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[8], d[4]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + + highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], + &d[0], &d[1], &d[2], &d[3]); + _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]); +} + +static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[8], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + highbd_transpose8x16_16x8_avx2(dstvec, d); + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_extracti128_si256(d[i - 8], 1)); + } +} + +static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 16; i += 8) { + highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], + &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], + &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], + &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], + &d[5 + i], &d[6 + i], &d[7 + i]); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); + } +} + +static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[4], d[4], d1; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + highbd_transpose4x16_avx2(dstvec, d); + for (int i = 0; i < 4; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + d1 = _mm256_bsrli_epi128(d[i], 8); + _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride), + _mm256_castsi256_si128(d1)); + _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), + _mm256_extracti128_si256(d[i], 1)); + _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride), + _mm256_extracti128_si256(d1, 1)); + } +} + +static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[16], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + highbd_transpose16x4_8x8_sse2(dstvec, d); + + _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); + _mm_storeu_si128((__m128i 
*)(dst + 0 * stride + 8), d[1]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]); +} + +static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + + for (int i = 0; i < 16; i += 8) { + highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); + } + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), + _mm256_extracti128_si256(d[i], 1)); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride), + _mm256_extracti128_si256(d[i], 1)); + } +} + +static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[32], d[32]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + + for (int i = 0; i < 32; i += 8) { + highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], + &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], + &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], + &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], + &d[5 + i], &d[6 + i], &d[7 + i]); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]); + } +} + +static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + + highbd_transpose16x16_avx2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]); + } +} + +static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[64], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + highbd_transpose16x16_avx2(dstvec, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]); + } + highbd_transpose16x16_avx2(dstvec + 16, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]); + } + 
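+  // dstvec[0..31] hold the left 16 columns of the 32 z1 rows and were
+  // transposed and stored above; assuming the 32xN internal layout, the
+  // right 16 columns sit in dstvec[32..63] and become rows 16..31 of the
+  // destination: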
highbd_transpose16x16_avx2(dstvec + 32, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]); + } + highbd_transpose16x16_avx2(dstvec + 48, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]); + } +} + +static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]); + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 64, 64); +} + +static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[32], d[32]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 32; i += 8) { + highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); + } + // store + for (int j = 0; j < 32; j += 16) { + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + j) * stride), + _mm256_castsi256_si128(d[(i + j)])); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8), + _mm256_castsi256_si128(d[(i + j) + 8])); + } + for (int i = 8; i < 16; i++) { + _mm256_storeu_si256( + (__m256i *)(dst + (i + j) * stride), + _mm256_inserti128_si256( + d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0)); + } + } +} + +static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[32], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 32; i += 16) { + highbd_transpose16x16_avx2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); + } + } +} + +static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + uint16_t dstT[64 * 32]; + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 32, 64); +} + +static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]); + highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd); + highbd_transpose(dstT, 32, dst, stride, 64, 32); + return; +} + +static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]); + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 16, 64); +} + +static void 
highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[64], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 64; i += 16) { + highbd_transpose16x16_avx2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); + } + } +} + +void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_left, + int dx, int dy, int bd) { + (void)above; + (void)dx; + + assert(dx == 1); + assert(dy > 0); + if (bw == bh) { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 8: + highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 16: + highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 32: + highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 64: + highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + } + } else { + if (bw < bh) { + if (bw + bw == bh) { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 32: + highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } else { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } + } else { + if (bh + bh == bw) { + switch (bh) { + case 4: + highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 32: + highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } else { + switch (bh) { + case 4: + highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } + } + } + return; +} + +// Low bit depth functions +static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 
0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, +}; + +static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, + { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, +}; + +static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = { + { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, + { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 }, + { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 }, + { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 }, + { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 }, + { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 }, + { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 }, + { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 } +}; +/* clang-format off */ +static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = { + { -1, 0, 0, 0, 0, 0, 0, 0}, + { -1, -1, 0, 0, 0, 0, 0, 0}, + { -1, -1, -1, 0, 0, 0, 0, 0}, + { -1, -1, -1, -1, 0, 0, 0, 0}, + { -1, -1, -1, -1, -1, 0, 0, 0}, + { -1, -1, -1, -1, -1, -1, 0, 0}, + { -1, -1, -1, -1, -1, -1, -1, 0}, + { -1, -1, -1, -1, -1, -1, -1, -1}, +}; +/* clang-format on */ +static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2( + int H, int W, __m128i *dst, const uint8_t *above, int upsample_above, + int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((W + H) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff, c3f; + __m128i a_mbase_x; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm_set1_epi8(above[max_base_x]); + c3f = _mm256_set1_epi16(0x3f); + + int 
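/* fixed-point position along the above row (frac_bits fractional bits), advanced by dx per output row */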
x = dx; + for (int r = 0; r < W; r++) { + __m256i b, res, shift; + __m128i res1, a0_128, a1_128; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base) >> upsample_above; + if (base_max_diff <= 0) { + for (int i = r; i < W; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + if (base_max_diff > H) base_max_diff = H; + a0_128 = _mm_loadu_si128((__m128i *)(above + base)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + + if (upsample_above) { + a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]); + a1_128 = _mm_srli_si128(a0_128, 8); + + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), + 1); + } else { + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + res = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // goto 8 bit + res1 = _mm256_castsi256_si128(res); // 16 8bit values + + dst[r] = + _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]); + x += dx; + } +} + +static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[16]; + + dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]); + } +} + +static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[32]; + + dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[64]; + + dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i a_mbase_x, diff, c3f; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi8(above[max_base_x]); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res16[2]; + __m128i a0_128, a1_128; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base); + if (base_max_diff <= 0) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + } + return; + } + if (base_max_diff > 32) base_max_diff = 32; + __m256i shift = + 
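/* (x & 0x3f) >> 1: 5-bit interpolation weight */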
_mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0, jj = 0; j < 32; j += 16, jj++) { + int mdiff = base_max_diff - j; + if (mdiff <= 0) { + res16[jj] = a_mbase_x; + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1)); + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res16[jj] = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // 16 8bit values + } + } + res16[1] = + _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]), + 1); // 32 8bit values + + dstvec[r] = _mm256_blendv_epi8( + a_mbase_x, res16[1], + *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values + x += dx; + } +} + +static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m256i dstvec[64]; + dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + } +} + +static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i a_mbase_x, diff, c3f; + __m128i max_base_x128, base_inc128, mask128; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi8(above[max_base_x]); + max_base_x128 = _mm_set1_epi8(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res; + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + __m128i a0_128, a1_128, res128; + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm_storeu_si128((__m128i *)(dst + j), + _mm256_castsi256_si128(a_mbase_x)); + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // 16 8bit values + + base_inc128 = + _mm_setr_epi8((uint8_t)(base + j), (uint8_t)(base + j + 1), + (uint8_t)(base + j + 2), (uint8_t)(base + j + 3), + (uint8_t)(base + j + 4), 
(uint8_t)(base + j + 5), + (uint8_t)(base + j + 6), (uint8_t)(base + j + 7), + (uint8_t)(base + j + 8), (uint8_t)(base + j + 9), + (uint8_t)(base + j + 10), (uint8_t)(base + j + 11), + (uint8_t)(base + j + 12), (uint8_t)(base + j + 13), + (uint8_t)(base + j + 14), (uint8_t)(base + j + 15)); + + mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128), + _mm_setzero_si128()); + res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), + _mm256_castsi256_si128(res), mask128); + _mm_storeu_si128((__m128i *)(dst + j), res128); + } + } + x += dx; + } +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy) { + (void)left; + (void)dy; + switch (bw) { + case 4: + dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 8: + dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 16: + dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 32: + dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 64: + dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + default: break; + } + return; +} + +static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i a0_x, a1_x, a32, a16, diff; + __m128i c3f, min_base_y128, c1234, dy128; + + a16 = _mm_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0); + dy128 = _mm_set1_epi16(dy); + + for (int r = 0; r < N; r++) { + __m128i b, res, shift, r6, ydx; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm_setzero_si128(); + a1_x = _mm_setzero_si128(); + shift = _mm_setzero_si128(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + ydx = _mm_set1_epi16(y * dx); + r6 = _mm_slli_epi16(c1234, 6); + + if (upsample_above) { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), + 1); + } else { + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 1); + + shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); + } + a0_x = _mm_cvtepu8_epi16(a0_x128); + a1_x = _mm_cvtepu8_epi16(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if 
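/* part of this row references the left column */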
(base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + __m128i y_c128, base_y_c128, mask128, c1234_; + c1234_ = _mm_srli_si128(c1234, 2); + r6 = _mm_set1_epi16(r << 6); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4)); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm_unpacklo_epi64(a0_x, a0_y); + a1_x = _mm_unpacklo_epi64(a1_x, a1_y); + shift = _mm_unpacklo_epi64(shift, shifty); + } + + diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res = _mm_add_epi16(a32, b); + res = _mm_srli_epi16(res, 5); + + resx = _mm_packus_epi16(res, res); + resy = _mm_srli_si128(resx, 4); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy); + dst += stride; + } +} + +static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i diff, a32, a16; + __m256i a0_x, a1_x; + __m128i a0_x128, a1_x128, min_base_y128, c3f; + __m128i c1234, dy128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy, r6, ydx; + + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + ydx = _mm_set1_epi16(y * dx); + r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6); + if (upsample_above) { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), + 
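/* the final >> 1 maps the 6-bit fraction to a 5-bit weight */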
1)); + } else { + a1_x128 = _mm_srli_si128(a0_x128, 1); + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); + + shift = _mm256_castsi128_si256( + _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1)); + } + a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128)); + a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128)); + } + + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + __m128i y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi16(r << 6); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + base_y_c128 = _mm_add_epi16( + base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4)); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm_packus_epi16(_mm256_castsi256_si128(res), + _mm256_castsi256_si128(res)); + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi16(resy, resy); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst, + ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123; + __m256i diff, min_base_y256, c3f, shifty, dy256, c1; + __m128i a0_x128, a1_x128; + + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + a16 = _mm256_set1_epi16(16); + c1 = _mm256_srli_epi16(a16, 4); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi16(0x3f); + dy256 = _mm256_set1_epi16(dy); + c0123 = + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + c1234 = _mm256_add_epi16(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift, j256, r6, ydx; + __m128i resx, resy; + __m128i resxy; + int y = r + 1; + ydx = _mm256_set1_epi16((uint16_t)(y * dx)); + + int base_x = (-y * dx) >> frac_bits_x; + for (int j = 0; j < W; j 
+= 16) { + j256 = _mm256_set1_epi16(j); + int base_shift = 0; + if ((base_x + j) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x + j) - 1); + } + int base_min_diff = (min_base_x - base_x - j); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift < 16) { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); + a1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); + + a0_x = _mm256_cvtepu8_epi16(a0_x128); + a1_x = _mm256_cvtepu8_epi16(a1_x128); + + r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); + shift = _mm256_srli_epi16( + _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16-bit values + resx = _mm256_castsi256_si128(_mm256_packus_epi16( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resx = _mm_setzero_si128(); + } + + // y calc + if (base_x < min_base_x) { + __m256i c256, y_c256, base_y_c256, mask256, mul16; + r6 = _mm256_set1_epi16(r << 6); + c256 = _mm256_add_epi16(j256, c1234); + mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), + _mm256_srli_epi16(min_base_y256, 1)); + y_c256 = _mm256_sub_epi16(r6, mul16); + + base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); + + base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256); + int16_t min_y = (int16_t)_mm_extract_epi16( + _mm256_extracti128_si256(base_y_c256, 1), 7); + int16_t max_y = + (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0); + int16_t offset_diff = max_y - min_y; + + if (offset_diff < 16) { + __m256i min_y256 = _mm256_set1_epi16(min_y); + + __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256); + __m128i base_y_offset128 = + _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0), + _mm256_extracti128_si256(base_y_offset, 1)); + + __m128i a0_y128 = _mm_maskload_epi32( + (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]); + __m128i a1_y128 = + _mm_maskload_epi32((int *)(left + min_y + 1), + *(__m128i *)LoadMaskz2[offset_diff / 4]); + a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128); + a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128); + a0_y = _mm256_cvtepu8_epi16(a0_y128); + a1_y = _mm256_cvtepu8_epi16(a1_y128); + } else { + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + base_y_c256 = _mm256_add_epi16(base_y_c256, c1); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a1_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + 
left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + } + shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shifty); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16-bit values + resy = _mm256_castsi256_si128(_mm256_packus_epi16( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resy = _mm_setzero_si128(); + } + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) { + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + case 8: + dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + default: + dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + break; + } + return; +} + +// z3 functions +static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) { + __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3; + w0 = _mm_unpacklo_epi8(x[0], x[1]); + w1 = _mm_unpacklo_epi8(x[2], x[3]); + w2 = _mm_unpackhi_epi8(x[0], x[1]); + w3 = _mm_unpackhi_epi8(x[2], x[3]); + + ww0 = _mm_unpacklo_epi16(w0, w1); + ww1 = _mm_unpacklo_epi16(w2, w3); + ww2 = _mm_unpackhi_epi16(w0, w1); + ww3 = _mm_unpackhi_epi16(w2, w3); + + w0 = _mm_unpacklo_epi32(ww0, ww1); + w2 = _mm_unpacklo_epi32(ww2, ww3); + w1 = _mm_unpackhi_epi32(ww0, ww1); + w3 = _mm_unpackhi_epi32(ww2, ww3); + + d[0] = _mm_unpacklo_epi64(w0, w2); + d[1] = _mm_unpackhi_epi64(w0, w2); + d[2] = _mm_unpacklo_epi64(w1, w3); + d[3] = _mm_unpackhi_epi64(w1, w3); + + d[4] = _mm_srli_si128(d[0], 8); + d[5] = _mm_srli_si128(d[1], 8); + d[6] = _mm_srli_si128(d[2], 8); + d[7] = _mm_srli_si128(d[3], 8); + + d[8] = _mm_srli_si128(d[0], 4); + d[9] = _mm_srli_si128(d[1], 4); + d[10] = _mm_srli_si128(d[2], 4); + d[11] = _mm_srli_si128(d[3], 4); + + d[12] = _mm_srli_si128(d[0], 12); + d[13] = _mm_srli_si128(d[1], 12); + d[14] = _mm_srli_si128(d[2], 12); + d[15] = _mm_srli_si128(d[3], 12); +} + +static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m256i w10, w11, w12, w13, w14, w15; + + w0 = _mm256_unpacklo_epi8(x[0], x[1]); + w1 = _mm256_unpacklo_epi8(x[2], x[3]); + w2 = _mm256_unpacklo_epi8(x[4], x[5]); + w3 = _mm256_unpacklo_epi8(x[6], x[7]); + + w8 = _mm256_unpacklo_epi8(x[8], x[9]); + w9 = _mm256_unpacklo_epi8(x[10], x[11]); + w10 = _mm256_unpacklo_epi8(x[12], x[13]); + w11 = _mm256_unpacklo_epi8(x[14], x[15]); + + w4 = _mm256_unpacklo_epi16(w0, w1); + w5 = _mm256_unpacklo_epi16(w2, w3); + w12 = _mm256_unpacklo_epi16(w8, w9); + w13 = _mm256_unpacklo_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[0] = _mm256_unpacklo_epi64(w6, w14); + d[1] 
= _mm256_unpackhi_epi64(w6, w14); + d[2] = _mm256_unpacklo_epi64(w7, w15); + d[3] = _mm256_unpackhi_epi64(w7, w15); + + w4 = _mm256_unpackhi_epi16(w0, w1); + w5 = _mm256_unpackhi_epi16(w2, w3); + w12 = _mm256_unpackhi_epi16(w8, w9); + w13 = _mm256_unpackhi_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[4] = _mm256_unpacklo_epi64(w6, w14); + d[5] = _mm256_unpackhi_epi64(w6, w14); + d[6] = _mm256_unpacklo_epi64(w7, w15); + d[7] = _mm256_unpackhi_epi64(w7, w15); + + // upper half + w0 = _mm256_unpackhi_epi8(x[0], x[1]); + w1 = _mm256_unpackhi_epi8(x[2], x[3]); + w2 = _mm256_unpackhi_epi8(x[4], x[5]); + w3 = _mm256_unpackhi_epi8(x[6], x[7]); + + w8 = _mm256_unpackhi_epi8(x[8], x[9]); + w9 = _mm256_unpackhi_epi8(x[10], x[11]); + w10 = _mm256_unpackhi_epi8(x[12], x[13]); + w11 = _mm256_unpackhi_epi8(x[14], x[15]); + + w4 = _mm256_unpacklo_epi16(w0, w1); + w5 = _mm256_unpacklo_epi16(w2, w3); + w12 = _mm256_unpacklo_epi16(w8, w9); + w13 = _mm256_unpacklo_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[8] = _mm256_unpacklo_epi64(w6, w14); + d[9] = _mm256_unpackhi_epi64(w6, w14); + d[10] = _mm256_unpacklo_epi64(w7, w15); + d[11] = _mm256_unpackhi_epi64(w7, w15); + + w4 = _mm256_unpackhi_epi16(w0, w1); + w5 = _mm256_unpackhi_epi16(w2, w3); + w12 = _mm256_unpackhi_epi16(w8, w9); + w13 = _mm256_unpackhi_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[12] = _mm256_unpacklo_epi64(w6, w14); + d[13] = _mm256_unpackhi_epi64(w6, w14); + d[14] = _mm256_unpacklo_epi64(w7, w15); + d[15] = _mm256_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(x[0], x[1]); + w1 = _mm_unpacklo_epi8(x[2], x[3]); + w2 = _mm_unpacklo_epi8(x[4], x[5]); + w3 = _mm_unpacklo_epi8(x[6], x[7]); + + w8 = _mm_unpacklo_epi8(x[8], x[9]); + w9 = _mm_unpacklo_epi8(x[10], x[11]); + w10 = _mm_unpacklo_epi8(x[12], x[13]); + w11 = _mm_unpacklo_epi8(x[14], x[15]); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[0] = _mm_unpacklo_epi64(w6, w14); + d[1] = _mm_unpackhi_epi64(w6, w14); + d[2] = _mm_unpacklo_epi64(w7, w15); + d[3] = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[4] = _mm_unpacklo_epi64(w6, w14); + d[5] = _mm_unpackhi_epi64(w6, w14); + d[6] = _mm_unpacklo_epi64(w7, w15); + d[7] = _mm_unpackhi_epi64(w7, w15); + + // upper half + w0 = _mm_unpackhi_epi8(x[0], x[1]); + w1 = _mm_unpackhi_epi8(x[2], x[3]); 
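+  // Editorial note (sketch, not upstream text): these transpose helpers all
+  // use the standard staged interleave -- unpack bytes, then 16-bit words,
+  // then 32-bit dwords, then 64-bit qwords -- so a 16x16 byte transpose
+  // takes log2(16) = 4 stages. The scalar effect, assuming in[]/out[] hold
+  // 16 rows of 16 bytes, is roughly:
+  //   for (int rr = 0; rr < 16; ++rr)
+  //     for (int cc = 0; cc < 16; ++cc)
+  //       out[cc][rr] = in[rr][cc];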
+ w2 = _mm_unpackhi_epi8(x[4], x[5]); + w3 = _mm_unpackhi_epi8(x[6], x[7]); + + w8 = _mm_unpackhi_epi8(x[8], x[9]); + w9 = _mm_unpackhi_epi8(x[10], x[11]); + w10 = _mm_unpackhi_epi8(x[12], x[13]); + w11 = _mm_unpackhi_epi8(x[14], x[15]); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[8] = _mm_unpacklo_epi64(w6, w14); + d[9] = _mm_unpackhi_epi64(w6, w14); + d[10] = _mm_unpacklo_epi64(w7, w15); + d[11] = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[12] = _mm_unpacklo_epi64(w6, w14); + d[13] = _mm_unpackhi_epi64(w6, w14); + d[14] = _mm_unpacklo_epi64(w7, w15); + d[15] = _mm_unpackhi_epi64(w7, w15); +} + +static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc, + uint8_t *dst, ptrdiff_t pitchDst) { + __m128i r[16]; + __m128i d[16]; + for (int j = 0; j < 16; j++) { + r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc)); + } + transpose16x16_sse2(r, d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]); + } +} + +static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, + ptrdiff_t pitchDst, int width, int height) { + for (int j = 0; j < height; j += 16) + for (int i = 0; i < width; i += 16) + transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, + dst + j * pitchDst + i, pitchDst); +} + +static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[4]; + + dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &d[0], &d[1], &d[2], &d[3]); + + *(uint32_t *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]); + *(uint32_t *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]); + *(uint32_t *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]); + *(uint32_t *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]); + return; +} + +static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy); + transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], + &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], + &d[3]); + + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8)); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8)); + _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8)); + _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]); + _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8)); +} + +static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[8]; + + 
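+  // Editorial note (sketch): zone 3 (180 < angle < 270) reuses the zone-1
+  // kernel: predict along the left edge into per-column vectors, then
+  // transpose into the destination, roughly
+  //   z1_predict(tmp, left, dy);              // tmp is a bh x bw scratch
+  //   dst[r * stride + c] = tmp[c * bh + r];  // for each (r, c)
+  // where tmp is a hypothetical scratch block; here the transpose helpers
+  // perform the same swap register-to-register.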
dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], + &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + for (int i = 0; i < 8; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[4]; + + dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy); + transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], + &d[1], &d[2], &d[3]); + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); +} + +static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy); + transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3, + dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d, + d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7); + for (int i = 0; i < 8; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), + _mm_srli_si128(d[i], 8)); + } +} + +static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy); + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy); + transpose4x16_sse2(dstvec, d); + for (int i = 0; i < 16; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[8]; + + dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy); + for (int i = 4; i < 8; i++) { + d[i] = _mm_setzero_si128(); + } + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 4; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[16], d[16]; + + dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy); + for (int i = 8; i < 16; i++) { + dstvec[i] = _mm256_setzero_si256(); + } + transpose16x32_avx2(dstvec, d); + + for (int 
i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), + _mm256_extracti128_si256(d[i], 1)); + } +} + +static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy); + + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + transpose16x8_8x16_sse2( + &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16], + &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16], + &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16], + &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16], + &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8], + &d[6 + 8], &d[7 + 8]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]); + } +} + +static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy); + transpose16x16_sse2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[32], d[32]; + + dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy); + transpose16x32_avx2(dstvec, d); + transpose16x32_avx2(dstvec + 16, d + 16); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), + _mm256_castsi256_si128(d[j])); + _mm_storeu_si128((__m128i *)(dst + j * stride + 16), + _mm256_castsi256_si128(d[j + 16])); + } + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), + _mm256_extracti128_si256(d[j], 1)); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), + _mm256_extracti128_si256(d[j + 16], 1)); + } +} + +static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]); + dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 64, 64); +} + +static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[16], d[16]; + + dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy); + transpose16x32_avx2(dstvec, d); + // store + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), + _mm256_castsi256_si128(d[j])); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), + _mm256_extracti128_si256(d[j], 1)); + } +} + +static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy); + for (int i = 0; i < 32; i += 16) { + 
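+    // Editorial note (sketch): blocks wider than 16 are transposed one
+    // 16x16 tile at a time; the tile built from predicted columns
+    // i .. i + 15 lands at byte offset i of each output row, roughly
+    //   dst[j * stride + i + k] = tile[k][j] for 0 <= j, k < 16.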
transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[64 * 32]; + dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 32, 64); +} + +static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[32 * 64]; + dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy); + transpose(dstT, 32, dst, stride, 64, 32); + return; +} + +static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[64 * 16]; + dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 16, 64); +} + +static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[64], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy); + for (int i = 0; i < 64; i += 16) { + transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { + (void)above; + (void)dx; + assert(dx == 1); + assert(dy > 0); + + if (bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 64: + dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + if (bw < bh) { + if (bw + bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + switch (bw) { + case 4: + dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } + } else { + if (bh + bh == bw) { + switch (bh) { + case 4: + dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + switch (bh) { + case 4: + dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy); + break; + } + } + } + } +} diff --git 
a/libs/libaom/src/aom_dsp/x86/intrapred_sse2.c b/libs/libaom/src/aom_dsp/x86/intrapred_sse2.c new file mode 100644 index 000000000..5afef68c3 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/intrapred_sse2.c @@ -0,0 +1,1411 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h>  // SSE2 +#include "aom_dsp/x86/intrapred_x86.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; i += 2) { + *(uint32_t *)dst = dc; + dst += stride; + *(uint32_t *)dst = dc; + dst += stride; + } +} + +static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_storel_epi64((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + dst += stride; + } +} + +static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + _mm_store_si128((__m128i *)(dst + 32), *row); + _mm_store_si128((__m128i *)(dst + 48), *row); + dst += stride; + } +} + +static INLINE __m128i dc_sum_4(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_unpacklo_epi8(x, zero); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_8(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_64(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32)); + __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x2 = _mm_sad_epu8(x2, zero); + x3 = _mm_sad_epu8(x3, zero); + x0 = _mm_add_epi16(x0, x1); + x2 = _mm_add_epi16(x2, x3); + x0 = _mm_add_epi16(x0, x2); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +#define DC_MULTIPLIER_1X2 0x5556 +#define DC_MULTIPLIER_1X4 0x3334 + +#define DC_SHIFT2 16 + +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier) { + const int interm = num >> shift1; + return interm * multiplier >> DC_SHIFT2; +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void 
aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 6; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + const uint32_t pred = _mm_cvtsi128_si32(row); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16_sse2(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + const uint32_t pred = _mm_cvtsi128_si32(row); + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 6; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16_sse2(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 12; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32_sse2(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_16_sse2(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_16_sse2(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 12; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32_sse2(left); + __m128i 
sum_above = dc_sum_16_sse2(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 24; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_64(left); + __m128i sum_above = dc_sum_16_sse2(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sum_left = dc_sum_8(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sum_left = dc_sum_16_sse2(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 24; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 64; + sum /= 128; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_32_sse2(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_16_sse2(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 16, 
dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16((int16_t)2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = _mm_cvtsi128_si32(sum_above); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16((int16_t)2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = _mm_cvtsi128_si32(sum_above); + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16_sse2(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16_sse2(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + 
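+  // Editorial note (worked example, not upstream text): the rectangular DC
+  // predictors above avoid a true division by (w + h). For 4x8,
+  // w + h = 12 and DC_MULTIPLIER_1X2 = 0x5556 ~= (1/3) * 2^16, so with
+  // all-255 edges:
+  //   sum    = 12 * 255 + 6 = 3066;         // +6 gives round-to-nearest
+  //   interm = 3066 >> 2    = 766;          // shift1 = 2 divides by 4
+  //   dc     = (766 * 0x5556) >> 16 = 255;  // ~= 766 / 3, i.e. sum / 12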
dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16_sse2(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16_sse2(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + 
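+  // Editorial note (sketch): with a power-of-two sample count no multiplier
+  // is needed; DC_TOP over 64 above samples is plain round-to-nearest,
+  //   dc = (sum_above + 32) >> 6,
+  // which the add/srai pair here computes in vector form.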
sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = _mm_cvtsi128_si32(sum_left); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16_sse2(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = _mm_cvtsi128_si32(sum_left); + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16((uint16_t)2); + sum_left = _mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16_sse2(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32_sse2(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16((uint16_t)2); + sum_left = 
_mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32_sse2(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16_sse2(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + 
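+  // Editorial note (sketch): DC_LEFT mirrors DC_TOP with the left column,
+  // dc = (sum_left + 32) >> 6 for 64 samples; the unpack/shufflelo/
+  // unpacklo_epi64 sequence below then broadcasts the low byte of dc to
+  // all 16 lanes of `row`.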
__m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32_sse2(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16_sse2(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + 
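+  // Editorial note (sketch): DC_128 ignores both edges and fills the block
+  // with the mid-range value 1 << (bitdepth - 1) = 128 for 8-bit content;
+  // the 4-wide variants above pack the same constant as 0x80808080.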
(void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 8, dst, stride); +} + +void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4xh(pred, 16, dst, stride); +} + +void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 4, dst, stride); +} + +void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t 
stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 64, dst, stride); +} + +static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + dst += stride; + } +} + +void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 8); +} + +void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 16); +} + +void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 64); +} + +static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32)); + const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48)); + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + _mm_store_si128((__m128i *)(dst + 32), row2); + _mm_store_si128((__m128i *)(dst + 48), row3); + dst += stride; + } +} + +void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 64); +} + +void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 32); +} + +void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 16); +} + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + left_col = _mm_unpackhi_epi64(left_col, left_col); + row0 = _mm_shufflelo_epi16(left_col, 0); + row1 = _mm_shufflelo_epi16(left_col, 0x55); + row2 = _mm_shufflelo_epi16(left_col, 0xaa); + row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); +} + +void 
aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); +} + +void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int count) { + (void)above; + for (int i = 0; i < count; ++i) { + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = 
_mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + left += 16; + } +} + +void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 1); +} + +void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 2); +} + +static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + dst += stride; + } +} + +static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflelo_epi16(*x, 0); + const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); + + row[0] = _mm_unpacklo_epi64(u0, u0); + row[1] = _mm_unpacklo_epi64(u1, u1); + row[2] = _mm_unpacklo_epi64(u2, u2); + row[3] = _mm_unpacklo_epi64(u3, u3); +} + +static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflehi_epi16(*x, 0); + const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); + + row[0] = _mm_unpackhi_epi64(u0, u0); + row[1] = _mm_unpackhi_epi64(u1, u1); + row[2] = _mm_unpackhi_epi64(u2, u2); + row[3] = _mm_unpackhi_epi64(u3, u3); +} + +// Process 16x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +// Process 16x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static 
INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); +} + +void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); +} + +static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int count) { + int i = 0; + do { + const __m128i left_col = _mm_load_si128((const __m128i *)left); + const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_lo, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p_lo, dst, stride); + dst += stride << 2; + + const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_hi, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p_hi, dst, stride); + dst += stride << 2; + + left += 16; + i++; + } while (i < count); +} + +void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 2); +} + +void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 4); +} + +static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + _mm_store_si128((__m128i *)(dst + 16), row[i]); + dst += stride; + } +} + +// Process 32x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +// Process 32x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} + +void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); + dst += 
stride << 2; + + left_col_8p = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} + +static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_32xh(dst, stride, left, 64); +} + +static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + 32), r0); + _mm_store_si128((__m128i *)(dst + 48), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + _mm_store_si128((__m128i *)(dst + stride + 32), r1); + _mm_store_si128((__m128i *)(dst + stride + 48), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 64); +} + +void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 32); +} + +void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 16); +} diff --git a/libs/libaom/src/aom_dsp/x86/intrapred_ssse3.c b/libs/libaom/src/aom_dsp/x86/intrapred_ssse3.c new file mode 100644 index 000000000..5a34ea0c8 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/intrapred_ssse3.c @@ -0,0 +1,1695 @@ +/* + * Copyright (c) 2017, Alliance for Open 
Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/intrapred_common.h" + +// ----------------------------------------------------------------------------- +// PAETH_PRED + +// Return 8 16-bit pixels in one row +static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, + const __m128i *topleft) { + const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft); + + __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left)); + __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top)); + __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft)); + + __m128i mask1 = _mm_cmpgt_epi16(pl, pt); + mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl)); + __m128i mask2 = _mm_cmpgt_epi16(pt, ptl); + + pl = _mm_andnot_si128(mask1, *left); + + ptl = _mm_and_si128(mask2, *topleft); + pt = _mm_andnot_si128(mask2, *top); + pt = _mm_or_si128(pt, ptl); + pt = _mm_and_si128(mask1, pt); + + return _mm_or_si128(pl, pt); +} + +void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, 
&tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + + for (int j = 0; j < 2; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +// Return 16 8-bit pixels in one row +static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, + const __m128i *top1, + const __m128i *topleft) { + const __m128i p0 = paeth_8x1_pred(left, top0, topleft); + const __m128i p1 = paeth_8x1_pred(left, top1, topleft); + return _mm_packus_epi16(p0, p1); +} + +void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const 
uint8_t *above, const uint8_t *left) { + __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + l = _mm_load_si128((const __m128i *)(left + 16)); + rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i t = _mm_load_si128((const __m128i *)above); + const 
__m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + + for (int j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + const __m128i l = _mm_loadl_epi64((const __m128i *)left); + __m128i l16; + + for (int i = 0; i < 8; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16((short)0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = 
paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + rep = _mm_set1_epi16((short)0x8000); + l = _mm_load_si128((const __m128i *)(left + 16)); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 2; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const 
__m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i; + const __m128i l = _mm_load_si128((const __m128i *)left); + __m128i rep = _mm_set1_epi16((short)0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +// ----------------------------------------------------------------------------- +// SMOOTH_PRED + +// pixels[0]: above and below_pred interleave vector +// pixels[1]: left vector +// pixels[2]: right_pred vector +static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + if (height == 4) + pixels[1] = 
_mm_cvtsi32_si128(((const uint32_t *)left)[0]); + else if (height == 8) + pixels[1] = _mm_loadl_epi64(((const __m128i *)left)); + else + pixels[1] = _mm_loadu_si128(((const __m128i *)left)); + + pixels[2] = _mm_set1_epi16((uint16_t)above[3]); + + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], second half for height = 16 only +// weight_h[3]: same as [1], second half for height = 16 only +// weight_w[0]: weights_w and scale - weights_w interleave vector +static INLINE void load_weight_w4(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]); + weight_h[0] = _mm_unpacklo_epi8(t, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + + if (height == 8) { + const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + } else if (height == 16) { + const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } +} + +static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh, + const __m128i *ww, int h, uint8_t *dst, + ptrdiff_t stride, int second_half) { + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i rep = second_half ? 
_mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); + __m128i d = _mm_set1_epi16(0x100); + + for (int i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s = _mm_madd_epi16(pixel[0], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixel[1], rep); + b = _mm_unpacklo_epi16(b, pixel[2]); + __m128i sum = _mm_madd_epi16(b, ww[0]); + + sum = _mm_add_epi32(s, sum); + sum = _mm_add_epi32(sum, round); + sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale); + + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = _mm_cvtsi128_si32(sum); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 4, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(sm_weight_arrays, 4, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0); +} + +void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 8, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(sm_weight_arrays, 8, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); +} + +void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 16, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(sm_weight_arrays, 16, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1); +} + +// pixels[0]: above and below_pred interleave vector, first half +// pixels[1]: above and below_pred interleave vector, second half +// pixels[2]: left vector +// pixels[3]: right_pred vector +// pixels[4]: above and below_pred interleave vector, first half +// pixels[5]: above and below_pred interleave vector, second half +// pixels[6]: left vector + 16 +// pixels[7]: right_pred vector +static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + const __m128i zero = _mm_setzero_si128(); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + __m128i d = _mm_loadl_epi64((const __m128i *)above); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); + pixels[1] = _mm_unpackhi_epi16(d, bp); + + pixels[3] = _mm_set1_epi16((uint16_t)above[7]); + + if (height == 4) { + pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + } else if (height == 8) { + pixels[2] = _mm_loadl_epi64((const __m128i *)left); + } else if (height == 16) { + pixels[2] = _mm_load_si128((const __m128i *)left); + } else { + pixels[2] = _mm_load_si128((const __m128i *)left); + pixels[4] = pixels[0]; + pixels[5] = pixels[1]; + pixels[6] = _mm_load_si128((const __m128i *)(left + 16)); + pixels[7] = pixels[3]; + } +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], offset 8 +// weight_h[3]: same as [1], offset 8 +// weight_h[4]: same as [0], offset 16 +// weight_h[5]: same as [1], offset 16 +// weight_h[6]: same as [0], offset 24 +// weight_h[7]: same as [1], offset 24 +// weight_w[0]: weights_w and scale - weights_w interleave vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half 
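+//
+// Illustrative use, modeled on the 8x32 caller further below (the weight
+// table is laid out so that the weights for a block dimension d start at
+// sm_weight_arrays + d, hence the offsets 4, 8, 16 and 32 read here):
+//   __m128i wh[8], ww[2];
+//   load_weight_w8(sm_weight_arrays, 32, wh, ww);  // fills wh[0..7], ww[0..1]
+//   smooth_pred_8xh(pixels, &wh[0], ww, 8, dst, stride, 0);  // rows 0-7
+//   dst += stride << 3;
+//   smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);  // rows 8-15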
+static INLINE void load_weight_w8(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + const int we_offset = height < 8 ? 4 : 8; + __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + + if (height == 4) { + we = _mm_srli_si128(we, 4); + __m128i tmp1 = _mm_unpacklo_epi8(we, zero); + __m128i tmp2 = _mm_sub_epi16(d, tmp1); + weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); + weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); + } else { + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + } + + if (height == 16) { + we = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(we, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } else if (height == 32) { + const __m128i weight_lo = + _mm_loadu_si128((const __m128i *)&weight_array[32]); + weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + const __m128i weight_hi = + _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + } +} + +static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, int h, uint8_t *dst, + ptrdiff_t stride, int second_half) { + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + + __m128i rep = second_half ? 
_mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); + __m128i d = _mm_set1_epi16(0x100); + + int i; + for (i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixels[2], rep); + b = _mm_unpacklo_epi16(b, pixels[3]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + + s0 = _mm_add_epi32(s0, sum0); + s0 = _mm_add_epi32(s0, round); + s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale); + + s1 = _mm_add_epi32(s1, sum1); + s1 = _mm_add_epi32(s1, round); + s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale); + + sum0 = _mm_packus_epi16(s0, s1); + sum0 = _mm_shuffle_epi8(sum0, gat); + _mm_storel_epi64((__m128i *)dst, sum0); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 4, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 4, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0); +} + +void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 8, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 8, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); +} + +void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 16, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 16, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1); +} + +void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[8]; + load_pixel_w8(above, left, 32, pixels); + + __m128i wh[8], ww[2]; + load_weight_w8(sm_weight_arrays, 32, wh, ww); + + smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1); + dst += stride << 3; + smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1); +} + +static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]); + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i top_right = + _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale)); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]); + const __m128i left_y = 
_mm_cvtsi32_si128((uint32_t)left[y]); + const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y); + __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left); + const __m128i wl_y = + _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0); + pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round); + pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x)); + const __m128i weights_x = + _mm_loadl_epi64((const __m128i *)(sm_weights_w + x)); + const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x); + const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero); + const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero); + + __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y); + __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y); + + const __m128i scale_m_weights_x = + _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero)); + const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right); + const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero); + const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero); + + pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl); + pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl); + + pred_lo = _mm_add_epi32(pred_lo, swxtr_lo); + pred_hi = _mm_add_epi32(pred_hi, swxtr_hi); + + pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale)); + pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale)); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; + } +} + +void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 4); +} + +void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 16); +} + +void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 64); +} + +void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t 
stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 16); +} + +void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 64); +} + +// ----------------------------------------------------------------------------- +// SMOOTH_V_PRED + +// pixels[0]: above and below_pred interleave vector +static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + const __m128i zero = _mm_setzero_si128(); + __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); +} + +// weights[0]: weights_h vector +// weights[1]: scale - weights_h vector +static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height, + __m128i *weights) { + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height == 4) { + const __m128i weight = + _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]); + weights[0] = _mm_unpacklo_epi8(weight, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + } else if (height == 8) { + const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]); + weights[0] = _mm_unpacklo_epi8(weight, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + } else { + const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weights[0] = _mm_unpacklo_epi8(weight, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + weights[2] = _mm_unpackhi_epi8(weight, zero); + weights[3] = _mm_sub_epi16(d, weights[2]); + } +} + +static INLINE void smooth_v_pred_4xh(const __m128i *pixel, + const __m128i *weight, int h, uint8_t *dst, + ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i d = _mm_set1_epi16(0x100); + + for (int i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i sum = _mm_madd_epi16(pixel[0], wh_sc); + sum = _mm_add_epi32(sum, pred_round); + sum = _mm_srai_epi32(sum, sm_weight_log2_scale); + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = _mm_cvtsi128_si32(sum); + dst += stride; + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels; + load_pixel_v_w4(above, left, 4, &pixels); + + __m128i weights[2]; + load_weight_v_w4(sm_weight_arrays, 4, weights); + + smooth_v_pred_4xh(&pixels, weights, 4, dst, stride); +} + +void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels; + load_pixel_v_w4(above, left, 8, &pixels); + + __m128i weights[2]; + load_weight_v_w4(sm_weight_arrays, 8, weights); + + smooth_v_pred_4xh(&pixels, weights, 8, dst, stride); +} + +void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels; + load_pixel_v_w4(above, left, 16, &pixels); + + __m128i weights[4]; + load_weight_v_w4(sm_weight_arrays, 16, weights); + + smooth_v_pred_4xh(&pixels, weights, 8, 
dst, stride); + dst += stride << 3; + smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride); +} + +// pixels[0]: above and below_pred interleave vector, first half +// pixels[1]: above and below_pred interleave vector, second half +static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + const __m128i zero = _mm_setzero_si128(); + __m128i d = _mm_loadl_epi64((const __m128i *)above); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); + pixels[1] = _mm_unpackhi_epi16(d, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], offset 8 +// weight_h[3]: same as [1], offset 8 +// weight_h[4]: same as [0], offset 16 +// weight_h[5]: same as [1], offset 16 +// weight_h[6]: same as [0], offset 24 +// weight_h[7]: same as [1], offset 24 +static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height, + __m128i *weight_h) { + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height < 16) { + const int offset = height < 8 ? 4 : 8; + const __m128i weight = + _mm_loadu_si128((const __m128i *)&weight_array[offset]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + } else if (height == 16) { + const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } else { + const __m128i weight_lo = + _mm_loadu_si128((const __m128i *)&weight_array[32]); + weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + const __m128i weight_hi = + _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + } +} + +static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh, + int h, uint8_t *dst, ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + __m128i d = _mm_set1_epi16(0x100); + + for (int i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + + s0 = _mm_add_epi32(s0, pred_round); + s0 = _mm_srai_epi32(s0, sm_weight_log2_scale); + + s1 = _mm_add_epi32(s1, pred_round); + s1 = _mm_srai_epi32(s1, sm_weight_log2_scale); + + __m128i sum01 = _mm_packus_epi16(s0, s1); + sum01 = _mm_shuffle_epi8(sum01, gat); + _mm_storel_epi64((__m128i *)dst, sum01); + dst += stride; + + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 4, pixels); + + __m128i wh[2]; + load_weight_v_w8(sm_weight_arrays, 4, wh); + + 
smooth_v_pred_8xh(pixels, wh, 4, dst, stride); +} + +void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 8, pixels); + + __m128i wh[2]; + load_weight_v_w8(sm_weight_arrays, 8, wh); + + smooth_v_pred_8xh(pixels, wh, 8, dst, stride); +} + +void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 16, pixels); + + __m128i wh[4]; + load_weight_v_w8(sm_weight_arrays, 16, wh); + + smooth_v_pred_8xh(pixels, wh, 8, dst, stride); + dst += stride << 3; + smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride); +} + +void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 32, pixels); + + __m128i wh[8]; + load_weight_v_w8(sm_weight_arrays, 32, wh); + + smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride); + dst += stride << 3; + smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride); + dst += stride << 3; + smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride); + dst += stride << 3; + smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride); +} + +static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i bottom_left = + _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = + _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1))); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]); + const __m128i scale_m_weights_y = + _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16); + const __m128i wl_y = + _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x)); + // 8 -> 16 + const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero); + const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y); + const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y); + // top_x * weights_y + scale_m_weights_y * bottom_left + __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y); + __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y); + + pred_lo = _mm_add_epi32(pred_lo, round); + pred_hi = _mm_add_epi32(pred_hi, round); + pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale); + pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; + } +} + +void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 4); +} + +void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t 
*above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 16); +} + +void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 64, 64); +} + +void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 64, 16); +} + +void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 64); +} + +// ----------------------------------------------------------------------------- +// SMOOTH_H_PRED + +// pixels[0]: left vector +// pixels[1]: right_pred vector +static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + if (height == 4) + pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + else if (height == 8) + pixels[0] = _mm_loadl_epi64(((const __m128i *)left)); + else + pixels[0] = _mm_loadu_si128(((const __m128i *)left)); + pixels[1] = _mm_set1_epi16((uint16_t)above[3]); +} + +// weights[0]: weights_w and scale - weights_w interleave vector +static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height, + __m128i *weights) { + (void)height; + const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]); + const __m128i zero = _mm_setzero_si128(); + + const __m128i weights_0 = _mm_unpacklo_epi8(t, zero); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i weights_1 = _mm_sub_epi16(d, weights_0); + weights[0] = _mm_unpacklo_epi16(weights_0, weights_1); +} + +static INLINE void smooth_h_pred_4xh(const __m128i *pixel, + const __m128i *weight, int h, uint8_t *dst, + ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i one = _mm_set1_epi16(1); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i rep = _mm_set1_epi16((short)0x8000); + + for (int i = 0; i < h; ++i) { + __m128i b = _mm_shuffle_epi8(pixel[0], rep); + b = _mm_unpacklo_epi16(b, pixel[1]); + __m128i sum = _mm_madd_epi16(b, weight[0]); + + sum = _mm_add_epi32(sum, pred_round); + sum = _mm_srai_epi32(sum, sm_weight_log2_scale); + + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = 
_mm_cvtsi128_si32(sum); + dst += stride; + + rep = _mm_add_epi16(rep, one); + } +} + +void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w4(above, left, 4, pixels); + + __m128i weights; + load_weight_h_w4(sm_weight_arrays, 4, &weights); + + smooth_h_pred_4xh(pixels, &weights, 4, dst, stride); +} + +void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w4(above, left, 8, pixels); + + __m128i weights; + load_weight_h_w4(sm_weight_arrays, 8, &weights); + + smooth_h_pred_4xh(pixels, &weights, 8, dst, stride); +} + +void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w4(above, left, 16, pixels); + + __m128i weights; + load_weight_h_w4(sm_weight_arrays, 8, &weights); + + smooth_h_pred_4xh(pixels, &weights, 8, dst, stride); + dst += stride << 3; + + pixels[0] = _mm_srli_si128(pixels[0], 8); + smooth_h_pred_4xh(pixels, &weights, 8, dst, stride); +} + +// pixels[0]: left vector +// pixels[1]: right_pred vector +// pixels[2]: left vector + 16 +// pixels[3]: right_pred vector +static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + pixels[1] = _mm_set1_epi16((uint16_t)above[7]); + + if (height == 4) { + pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + } else if (height == 8) { + pixels[0] = _mm_loadl_epi64((const __m128i *)left); + } else if (height == 16) { + pixels[0] = _mm_load_si128((const __m128i *)left); + } else { + pixels[0] = _mm_load_si128((const __m128i *)left); + pixels[2] = _mm_load_si128((const __m128i *)(left + 16)); + pixels[3] = pixels[1]; + } +} + +// weight_w[0]: weights_w and scale - weights_w interleave vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half +static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height, + __m128i *weight_w) { + (void)height; + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]); + const __m128i tmp1 = _mm_unpacklo_epi8(we, zero); + const __m128i tmp2 = _mm_sub_epi16(d, tmp1); + weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); + weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); +} + +static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww, + int h, uint8_t *dst, ptrdiff_t stride, + int second_half) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i one = _mm_set1_epi16(1); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + __m128i rep = second_half ? 
_mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); + + for (int i = 0; i < h; ++i) { + __m128i b = _mm_shuffle_epi8(pixels[0], rep); + b = _mm_unpacklo_epi16(b, pixels[1]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + + sum0 = _mm_add_epi32(sum0, pred_round); + sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale); + + sum1 = _mm_add_epi32(sum1, pred_round); + sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale); + + sum0 = _mm_packus_epi16(sum0, sum1); + sum0 = _mm_shuffle_epi8(sum0, gat); + _mm_storel_epi64((__m128i *)dst, sum0); + dst += stride; + + rep = _mm_add_epi16(rep, one); + } +} + +void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w8(above, left, 4, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 4, ww); + + smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0); +} + +void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w8(above, left, 8, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 8, ww); + + smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0); +} + +void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w8(above, left, 16, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 16, ww); + + smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1); +} + +void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[4]; + load_pixel_h_w8(above, left, 32, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 32, ww); + + smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1); + dst += stride << 3; + smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1); +} + +static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]); + const __m128i tr_ly = + _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i weights_x = + _mm_loadl_epi64((const __m128i *)(sm_weights_w + x)); + const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero); + const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw); + const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw); + const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw); + __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly); + __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly); + + pred_lo = _mm_add_epi32(pred_lo, pred_round); + pred_hi = _mm_add_epi32(pred_hi, 
pred_round); + + pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale); + pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; + } +} + +void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 4); +} + +void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 16); +} + +void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 64); +} + +void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 64, 64); +} + +void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 64, 16); +} diff --git a/libs/libaom/src/aom_dsp/x86/intrapred_x86.h b/libs/libaom/src/aom_dsp/x86/intrapred_x86.h new file mode 100644 index 000000000..b13f575a7 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/intrapred_x86.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_INTRAPRED_X86_H_ +#define AOM_AOM_DSP_X86_INTRAPRED_X86_H_ + +#include <emmintrin.h> // SSE2 +#include "aom/aom_integer.h" +#include "config/aom_config.h" + +static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { + __m128i x = _mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +#endif // AOM_AOM_DSP_X86_INTRAPRED_X86_H_ diff --git a/libs/libaom/src/aom_dsp/x86/inv_wht_sse2.asm b/libs/libaom/src/aom_dsp/x86/inv_wht_sse2.asm new file mode 100644 index 000000000..0bc841a7a --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/inv_wht_sse2.asm @@ -0,0 +1,107 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro REORDER_INPUTS 0 + ; a c d b to a b c d + SWAP 1, 3, 2 +%endmacro + +%macro TRANSFORM_COLS 0 + ; input: + ; m0 a + ; m1 b + ; m2 c + ; m3 d + paddw m0, m2 + psubw m3, m1 + + ; wide subtract + punpcklwd m4, m0 + punpcklwd m5, m3 + psrad m4, 16 + psrad m5, 16 + psubd m4, m5 + psrad m4, 1 + packssdw m4, m4 ; e + + psubw m5, m4, m1 ; b + psubw m4, m2 ; c + psubw m0, m5 + paddw m3, m4 + ; m0 a + SWAP 1, 5 ; m1 b + SWAP 2, 4 ; m2 c + ; m3 d +%endmacro + +%macro TRANSPOSE_4X4 0 + punpcklwd m0, m2 + punpcklwd m1, m3 + mova m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3 +%macro TRANSPOSE_4X4_WIDE 0 + mova m3, m0 + punpcklwd m0, m1 + punpckhwd m3, m1 + mova m2, m0 + punpcklwd m0, m3 + punpckhwd m2, m3 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero + movd m%3, [outputq] + movd m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%1, m%3 + paddw m%2, m%4 + packuswb m%1, m%5 + packuswb m%2, m%5 + movd [outputq], m%1 + movd [outputq + strideq], m%2 +%endmacro + +INIT_XMM sse2 +cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride + mova m0, [inputq + 0] + packssdw m0, [inputq + 16] + mova m1, [inputq + 32] + packssdw m1, [inputq + 48] + psraw m0, 2 + psraw m1, 2 + + TRANSPOSE_4X4_WIDE + REORDER_INPUTS + TRANSFORM_COLS + TRANSPOSE_4X4 + REORDER_INPUTS + TRANSFORM_COLS + + pxor m4, m4 + ADD_STORE_4P_2X 0, 1, 5, 6, 4 + lea outputq, [outputq + 2 * strideq] + ADD_STORE_4P_2X 2, 3, 5, 6, 4 + + RET diff --git a/libs/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c b/libs/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c new file mode 100644 index 000000000..2e3e2be10 --- /dev/null +++
b/libs/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 4); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; i += 4) { + __m128i x0 = xx_loadl_32(a + 0 * a_stride); + __m128i x1 = xx_loadl_32(a + 1 * a_stride); + __m128i x2 = xx_loadl_32(a + 2 * a_stride); + __m128i x3 = xx_loadl_32(a + 3 * a_stride); + __m128i x_lo = _mm_unpacklo_epi32(x0, x1); + __m128i x_hi = _mm_unpacklo_epi32(x2, x3); + + __m128i x = _mm_unpacklo_epi64(x_lo, x_hi); + + x0 = xx_loadl_32(b + 0 * b_stride); + x1 = xx_loadl_32(b + 1 * b_stride); + x2 = xx_loadl_32(b + 2 * b_stride); + x3 = xx_loadl_32(b + 3 * b_stride); + x_lo = _mm_unpacklo_epi32(x0, x1); + x_hi = _mm_unpacklo_epi32(x2, x3); + + __m128i y = _mm_unpacklo_epi64(x_lo, x_hi); + + __m128i sad4x4 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad4x4); + + a += 4 * a_stride; + b += 4 * b_stride; + } + + // At this point, we have two 32-bit partial SADs in bits [0:31] and [64:95].
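+ // Sum them by adding the upper 64-bit lane to the lower one: + // _mm_srli_si128(sad, 8) shifts the high lane down so a scalar add of the + // two _mm_cvtsi128_si32 results yields the final SAD.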
+ const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 8); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; i += 2) { + __m128i x0 = xx_loadl_64(a + 0 * a_stride); + __m128i x1 = xx_loadl_64(a + 1 * a_stride); + + __m128i x = _mm_unpacklo_epi64(x0, x1); + + x0 = xx_loadl_64(b + 0 * b_stride); + x1 = xx_loadl_64(b + 1 * b_stride); + + __m128i y = _mm_unpacklo_epi64(x0, x1); + + __m128i sad8x2 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad8x2); + + a += 2 * a_stride; + b += 2 * b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 16); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + __m128i x = xx_loadu_128(a); + __m128i y = xx_loadu_128(b); + + __m128i sad16x1 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad16x1); + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 32); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 2; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad32_half = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad32_half); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 64); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 4; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad64_quarter = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad64_quarter); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 128); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 8; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad64_quarter = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad64_quarter); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +#define dist_wtd_sadMxN_sse2(m, n) \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, 
ref, ref_stride, \ + jcp_param); \ + return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \ + } + +#define dist_wtd_sadMxN_avx2(m, n) \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ + return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n); \ + } + +/* clang-format off */ +dist_wtd_sadMxN_sse2(128, 128) +dist_wtd_sadMxN_sse2(128, 64) +dist_wtd_sadMxN_sse2(64, 128) +dist_wtd_sadMxN_sse2(64, 64) +dist_wtd_sadMxN_sse2(64, 32) +dist_wtd_sadMxN_sse2(32, 64) +dist_wtd_sadMxN_sse2(32, 32) +dist_wtd_sadMxN_sse2(32, 16) +dist_wtd_sadMxN_sse2(16, 32) +dist_wtd_sadMxN_sse2(16, 16) +dist_wtd_sadMxN_sse2(16, 8) +dist_wtd_sadMxN_sse2(8, 16) +dist_wtd_sadMxN_sse2(8, 8) +dist_wtd_sadMxN_sse2(8, 4) +dist_wtd_sadMxN_sse2(4, 8) +dist_wtd_sadMxN_sse2(4, 4) +dist_wtd_sadMxN_sse2(4, 16) +dist_wtd_sadMxN_sse2(16, 4) +dist_wtd_sadMxN_sse2(8, 32) +dist_wtd_sadMxN_sse2(32, 8) +dist_wtd_sadMxN_sse2(16, 64) +dist_wtd_sadMxN_sse2(64, 16) + /* clang-format on */ diff --git a/libs/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c b/libs/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c new file mode 100644 index 000000000..c8b02f556 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, + const __m128i *w, const __m128i *r, + void *const result) { + __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); + __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w); + __m128i round_lo = _mm_add_epi16(mult_lo, *r); + __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS); + + __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1); + __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w); + __m128i round_hi = _mm_add_epi16(mult_hi, *r); + __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS); + + xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi)); +} + +void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { + int i; + const uint8_t w0 = (uint8_t)jcp_param->fwd_offset; + const uint8_t w1 = (uint8_t)jcp_param->bck_offset; + const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, + w1, w0, w1, w0); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + if (width >= 16) { + // Read 16 pixels one row at a time + assert(!(width & 15)); + for (i = 0; i < height; ++i) { + int j; + for (j = 0; j < width; j += 16) { + __m128i p0 = xx_loadu_128(ref); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 16; + } + ref += ref_stride - width; + } + } else if (width >= 8) { + // Read 8 pixels two rows at a time + assert(!(width & 7)); + assert(!(width & 1)); + for (i = 0; i < height; i += 2) { + __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); + __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 2 * ref_stride; + } + } else { + // Read 4 pixels four rows at a time + assert(!(width & 3)); + assert(!(height & 3)); + for (i = 0; i < height; i += 4) { + const uint8_t *row0 = ref + 0 * ref_stride; + const uint8_t *row1 = ref + 1 * ref_stride; + const uint8_t *row2 = ref + 2 * ref_stride; + const uint8_t *row3 = ref + 3 * ref_stride; + + __m128i p0 = + _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1], + row1[2], row1[3], row2[0], row2[1], row2[2], row2[3], + row3[0], row3[1], row3[2], row3[3]); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 4 * ref_stride; + } + } +} + +void aom_dist_wtd_comp_avg_upsampled_pred_ssse3( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int
subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { + int n; + int i; + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); + /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ + assert(!(width * height & 15)); + n = width * height >> 4; + + const uint8_t w0 = (uint8_t)jcp_param->fwd_offset; + const uint8_t w1 = (uint8_t)jcp_param->bck_offset; + const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, + w1, w0, w1, w0); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred); + __m128i p1 = xx_loadu_128(pred); + + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + } +} + +#define DIST_WTD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_dist_wtd_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ + jcp_param); \ + \ + return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ + } + +DIST_WTD_SUBPIX_AVG_VAR(128, 128) +DIST_WTD_SUBPIX_AVG_VAR(128, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 128) +DIST_WTD_SUBPIX_AVG_VAR(64, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 64) +DIST_WTD_SUBPIX_AVG_VAR(32, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 32) +DIST_WTD_SUBPIX_AVG_VAR(16, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 8) +DIST_WTD_SUBPIX_AVG_VAR(8, 16) +DIST_WTD_SUBPIX_AVG_VAR(8, 8) +DIST_WTD_SUBPIX_AVG_VAR(8, 4) +DIST_WTD_SUBPIX_AVG_VAR(4, 8) +DIST_WTD_SUBPIX_AVG_VAR(4, 4) +DIST_WTD_SUBPIX_AVG_VAR(4, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 4) +DIST_WTD_SUBPIX_AVG_VAR(8, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 8) +DIST_WTD_SUBPIX_AVG_VAR(16, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 16) diff --git a/libs/libaom/src/aom_dsp/x86/loopfilter_sse2.c b/libs/libaom/src/aom_dsp/x86/loopfilter_sse2.c new file mode 100644 index 000000000..d534683fc --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/loopfilter_sse2.c @@ -0,0 +1,2100 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" +#include "aom_dsp/x86/lpf_common_sse2.h" + +static INLINE __m128i abs_diff(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); +} + +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them to 4x8 independently while flipping the second matrix horizontally. +// Used for 14 taps pq pairs creation +static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *q0p0, + __m128i *q1p1, __m128i *q2p2, + __m128i *q3p3, __m128i *q4p4, + __m128i *q5p5, __m128i *q6p6, + __m128i *q7p7) { + __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi8( + *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 + w3 = _mm_unpackhi_epi8( + *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ww2 = _mm_unpacklo_epi16( + w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 + ww3 = _mm_unpackhi_epi16( + w2, + w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 + + *q7p7 = _mm_unpacklo_epi32( + ww0, + _mm_srli_si128( + ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx + *q6p6 = _mm_unpackhi_epi32( + _mm_slli_si128(ww0, 4), + ww3); // 01 11 21 31 014 114 214 314 xx xx xx xx xx xx xx xx + *q5p5 = _mm_unpackhi_epi32( + ww0, + _mm_slli_si128( + ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx xx xx xx xx xx + *q4p4 = _mm_unpacklo_epi32( + _mm_srli_si128(ww0, 12), + ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx + *q3p3 = _mm_unpacklo_epi32( + ww1, + _mm_srli_si128( + ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx + *q2p2 = _mm_unpackhi_epi32( + _mm_slli_si128(ww1, 4), + ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx + *q1p1 = _mm_unpackhi_epi32( + ww1, + _mm_slli_si128( + ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx + *q0p0 = _mm_unpacklo_epi32( + _mm_srli_si128(ww1, 12), + ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx +} + +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them independently while flipping the second matrix horizontally. Used for 14 +// taps filter pq pairs inverse +static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *pq0, __m128i *pq1, + __m128i *pq2, __m128i *pq3) { + __m128i w10, w11, w12, w13; + __m128i w0, w1, w2, w3, w4, w5; + __m128i d0, d1, d2, d3; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50
60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w10 = _mm_unpacklo_epi8( + *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13 + w11 = _mm_unpacklo_epi8( + *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33 + w12 = _mm_unpacklo_epi8( + *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53 + w13 = _mm_unpacklo_epi8( + *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73 + + w4 = _mm_unpackhi_epi16( + w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpackhi_epi16( + w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + *pq0 = _mm_unpacklo_epi64(d0, d1); // pq + *pq1 = _mm_unpackhi_epi64(d0, d1); // pq + *pq2 = _mm_unpacklo_epi64(d2, d3); // pq + *pq3 = _mm_unpackhi_epi64(d2, d3); // pq +} + +static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, __m128i *ps1ps0) { + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i t3t4 = + _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8((char)0x80); + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = _mm_unpacklo_epi32(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter2filter1 = + _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + filter = _mm_unpacklo_epi32(filter, filter); + + filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter); + hev1 = _mm_srli_si128(filter2filter1, 8); + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, + __m128i *ps1ps0) { + const 
__m128i t3t4 = + _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8((char)0x80); + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = _mm_unpacklo_epi64(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + + hev1 = _mm_unpackhi_epi64(filter2filter1, filter); + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); + + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void lpf_internal_4_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; + __m128i mask, flat, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); + + /* (abs(q1 - q0), abs(p1 - p0) */ + flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = _mm_packs_epi16(hev, hev); + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */ + abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi32(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, 
zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4)); + + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} + +static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; + __m128i mask, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + /* (abs(q1 - q0), abs(p1 - p0) */ + __m128i flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = _mm_packs_epi16(hev, hev); + + /* const int8_t mask = filter_mask2(*limit, *blimit, */ + /* p1, p0, q0, q1); */ + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi64(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); + + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} + +void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + + __m128i qs1qs0, ps1ps0; + __m128i p1, p0, q0, q1; + + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); + + xx_storel_32(s - 1 * p, ps1ps0); + xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4)); + xx_storel_32(s + 0 * p, qs1qs0); + xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4)); +} + +void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + __m128i p1p0, q1q0; + __m128i p1, p0, q0, q1; + + const __m128i zero = _mm_setzero_si128(); + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + + __m128i x0, x1, x2, x3; + __m128i d0, d1, d2, d3; + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + + transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1); + + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0); + + // Transpose 8x4 to 4x8 + p1 = _mm_srli_si128(p1p0, 4); + q1 = _mm_srli_si128(q1q0, 4); + + transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, 
&d0, &d1, &d2, &d3); + + xx_storel_32(s + 0 * p - 2, d0); + xx_storel_32(s + 1 * p - 2, d1); + xx_storel_32(s + 2 * p - 2, d2); + xx_storel_32(s + 3 * p - 2, d3); +} + +static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { + xx_storel_32(s - (num + 1) * p, x); + xx_storel_32(s + num * p, _mm_srli_si128(x, 4)); +} + +static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i mask, hev, flat, flat2; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; + __m128i abs_p1p0; + + p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1); + q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + __m128i fe, ff, work; + abs_p1p0 = abs_diff(*q1p1, *q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8((char)0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0); + qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0); + // loopfilter done + + __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + __m128i work; + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p6, sum_q6; + __m128i sum_p3, sum_q3, res_p, res_q; + + p6_16 = _mm_unpacklo_epi8(*q6p6, zero); + p5_16 = _mm_unpacklo_epi8(*q5p5, zero); + p4_16 = _mm_unpacklo_epi8(*q4p4, zero); + p3_16 = 
_mm_unpacklo_epi8(*q3p3, zero); + p2_16 = _mm_unpacklo_epi8(*q2p2, zero); + p1_16 = _mm_unpacklo_epi8(*q1p1, zero); + p0_16 = _mm_unpacklo_epi8(*q0p0, zero); + q0_16 = _mm_unpackhi_epi8(*q0p0, zero); + q1_16 = _mm_unpackhi_epi8(*q1p1, zero); + q2_16 = _mm_unpackhi_epi8(*q2p2, zero); + q3_16 = _mm_unpackhi_epi8(*q3p3, zero); + q4_16 = _mm_unpackhi_epi8(*q4p4, zero); + q5_16 = _mm_unpackhi_epi8(*q5p5, zero); + q6_16 = _mm_unpackhi_epi8(*q6p6, zero); + pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, + _mm_add_epi16(_mm_add_epi16(p6_16, p0_16), + _mm_add_epi16(p1_16, q0_16))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, + _mm_add_epi16(_mm_add_epi16(q6_16, q0_16), + _mm_add_epi16(p0_16, q1_16))), + 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(p6_16, p6_16); + sum_q6 = _mm_add_epi16(q6_16, q6_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))), + 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + // work with flat2 + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat = _mm_unpacklo_epi64(flat, flat); + 
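// SSE2 has no byte-blend instruction, so the flat outputs are selected per + // byte with the andnot/and/or idiom: + // out = (~flat & narrow_filter_result) | (flat & flat_filter_result). +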
*q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), + 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_unpacklo_epi64(flat2, flat2); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); + } + } else { + *q0p0 
= qs0ps0; + *q1p1 = qs1ps1; + } +} + +static AOM_FORCE_INLINE void lpf_internal_14_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i mask, hev, flat, flat2; + __m128i flat2_pq[6], flat_pq[3]; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; + __m128i abs_p1p0; + + p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); + + __m128i fe, ff, work; + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + abs_p1p0 = abs_diff(*q1p1, *q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + fe = _mm_set1_epi8((char)0xfe); + ff = _mm_cmpeq_epi8(fe, fe); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0); + qs1ps1 = _mm_srli_si128(qs0ps0, 8); + // loopfilter done + + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pq_16[7]; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p6; + __m128i sum_p3; + + pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero); + pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero); + pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero); + pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero); + pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero); + pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero); + pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero); + q0_16 = _mm_srli_si128(pq_16[0], 8); + q1_16 = _mm_srli_si128(pq_16[1], 8); + q2_16 = _mm_srli_si128(pq_16[2], 8); + q3_16 = _mm_srli_si128(pq_16[3], 8); + q4_16 = _mm_srli_si128(pq_16[4], 8); + q5_16 = _mm_srli_si128(pq_16[5], 8); + + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + + __m128i work0, work0_0, work0_1, sum_p_0; + __m128i sum_p = 
_mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3])); + __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0])); + flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16)); + + sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]); + sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]); + + sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]); + sum_p = _mm_sub_epi16(sum_p_0, q5_16); + + work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]); + work0_1 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0]))); + + sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]); + sum_lp = _mm_sub_epi16(sum_lp, q2_16); + + work0 = _mm_add_epi16(sum_p3, pq_16[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]); + flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]); + + sum_lp = _mm_sub_epi16(sum_lp, q1_16); + sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]); + work0 = _mm_add_epi16(sum_p3, pq_16[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + flat2 = _mm_unpacklo_epi32(flat2, flat2); + + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_pq[0] = _mm_and_si128(flat, flat_pq[0]); + *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_pq[1] = _mm_and_si128(flat, flat_pq[1]); + *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]); + + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_pq[2] = _mm_and_si128(flat, flat_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16)); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]); + flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]); + + sum_p = _mm_sub_epi16(sum_p, q4_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + work0 = _mm_add_epi16( + sum_p6, 
_mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q3_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[3]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q2_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[2]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q1_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[1]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]); + *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]); + *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]); + *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]); + *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]); + *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]); + } + } else { + *q0p0 = qs0ps0; + *q1p1 = qs1ps1; + } +} + +void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p)); + q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p)); + q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p)); + q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p)); + + q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p)); + + q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p)); + + 
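An aside on the wide-filter arithmetic in the lpf_internal_14 routines above: the flat2 outputs are produced from running sums (pixelFilter_p/pixelFilter_q together with sum_p6/sum_q6) that are updated with one subtract and one add per output tap instead of re-summing all taps every time. A minimal scalar sketch of the same sliding-window idea, assuming a plain unweighted average (the real filter uses the weighted 16-bit sums and the >> 4 rounding shown above; sliding_average is a hypothetical helper, not library code):

#include <stdint.h>

/* out[k] = rounded average of the ntaps samples px[k] .. px[k + ntaps - 1].
   After the initial full sum, every further output costs one add and one
   subtract, mirroring the incremental updates of pixelFilter_p above. */
static void sliding_average(const uint8_t *px, int n, int ntaps,
                            uint8_t *out) {
  int sum = 0;
  for (int i = 0; i < ntaps; ++i) sum += px[i];  // full sum computed once
  out[0] = (uint8_t)((sum + ntaps / 2) / ntaps);
  for (int k = 1; k + ntaps <= n; ++k) {
    sum += px[k + ntaps - 1] - px[k - 1];  // slide the window by one sample
    out[k] = (uint8_t)((sum + ntaps / 2) / ntaps);
  }
}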
q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p)); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + store_buffer_horz_8(q0p0, p, 0, s); + store_buffer_horz_8(q1p1, p, 1, s); + store_buffer_horz_8(q2p2, p, 2, s); + store_buffer_horz_8(q3p3, p, 3, s); + store_buffer_horz_8(q4p4, p, 4, s); + store_buffer_horz_8(q5p5, p, 5, s); +} + +static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2( + __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16; + __m128i ps1ps0, qs1qs0; + + q2p2 = _mm_unpacklo_epi64(*p2, *q2); + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((char)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + + { + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + // SSE2 has no unsigned byte comparison, so instead of testing each + // input against the limit directly we take the element-wise maximum of + // all the abs(x - y) terms (and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2); + // a saturating subtract of the limit followed by a compare against zero + // then yields the mask. + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = abs_diff(q2p2, q1p1); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); + + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + } + + // 5-tap filter + // needed only if flat != 0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 =
_mm_unpacklo_epi8(*q2, zero); + + // op1 + workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16), + _mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2 + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), + p2_16); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), + 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_shft1 = _mm_srli_epi16(workp_a, 3); + + flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16), + p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_add_epi16(q1_16, q2_16); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16), + p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), + 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); + } +} + +static AOM_FORCE_INLINE void lpf_internal_6_sse2( + __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16; + __m128i ps1ps0, qs1qs0; + + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + *p1p0 = _mm_unpacklo_epi32(*p0, *p1); + *q1q0 = _mm_unpacklo_epi32(*q0, *q1); + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((char)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + { + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + // considering sse doesn't have unsigned elements comparison the idea is + // to find at least one case when X > limit, it means the corresponding + // mask bit is set. 
+ // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = abs_diff(q2p2, q1p1); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); + + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + } + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; + pq2_16 = _mm_unpacklo_epi8(q2p2, zero); + pq1_16 = _mm_unpacklo_epi8(q1p1, zero); + pq0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_srli_si128(pq0_16, 8); + q2_16 = _mm_srli_si128(pq2_16, 8); + + // op1 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + workp_b = _mm_srli_epi16(workp_b, 3); + + flat_p1p0 = _mm_packus_epi16(workp_b, workp_b); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16), + pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16), + pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + workp_a = _mm_srli_epi16(workp_a, 3); + + flat_q0q1 = _mm_packus_epi16(workp_a, workp_a); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + 
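The hev/mask computation just above (and in every filter variant in this file) leans on one identity the comments state tersely: SSE2 has no unsigned byte greater-than compare, so a > b is evaluated as saturating_sub(a, b) != 0, and "is any element over the limit" becomes "is the element-wise maximum over the limit". A self-contained sketch of the comparison half (gt_epu8 is a hypothetical helper name, not part of the library):

#include <emmintrin.h>  // SSE2

/* Returns 0xFF in every byte lane where a > b, lanes treated as unsigned.
   _mm_subs_epu8(a, b) saturates to zero exactly when a <= b, so comparing
   the difference against zero and inverting gives the unsigned compare. */
static __m128i gt_epu8(__m128i a, __m128i b) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);  // all-ones constant
  const __m128i diff = _mm_subs_epu8(a, b);       // 0 iff a <= b, per lane
  return _mm_xor_si128(_mm_cmpeq_epi8(diff, zero), ff);
}

The complementary test used to gate the 5-tap branch works the same way: _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero)) equals 0xffff exactly when all 16 bytes of flat are zero, so the expensive path is skipped whenever no lane needs it.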
*p1p0 = _mm_or_si128(ps1ps0, *p1p0); + } +} + +void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i p2, p1, p0, q0, q1, q2; + __m128i p1p0, q1q0; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + p2 = xx_loadl_32(s - 3 * p); + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + q2 = xx_loadl_32(s + 2 * p); + + lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + xx_storel_32(s - 1 * p, p1p0); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); + xx_storel_32(s + 0 * p, q1q0); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); +} + +void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i p2, p1, p0, q0, q1, q2; + __m128i p1p0, q1q0; + + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + + lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); +} + +static AOM_FORCE_INLINE void lpf_internal_8_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *blimit, __m128i *limit, __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_pq, opq2, pq2; + + q3p3 = _mm_unpacklo_epi32(*p3, *q3); + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0 + q1q0 = _mm_srli_si128(p1p0, 8); + + // filter_mask and hev_mask + + // considering sse doesn't have unsigned elements comparison the idea is to + // find at least one case when X > limit, it means the corresponding mask + // bit is set. 
+ // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((char)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + + // flat_mask4 + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft2 = _mm_add_epi16(workp_a, workp_b); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_c = _mm_add_epi16(workp_a, workp_b); + // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_d, workp_c); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_p1p0 = _mm_packus_epi16(workp_c, workp_c); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + // workp_shft0 = 
_mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + workp_c = _mm_add_epi16(workp_a, workp_b); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_c, workp_d); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_q0q1 = _mm_packus_epi16(workp_c, workp_c); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1); + workp_c = _mm_srli_epi16(workp_c, 3); + + opq2 = _mm_packus_epi16(workp_c, workp_c); + + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 4); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *blimit, __m128i *limit, __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_pq, opq2, pq2; + + q3p3 = _mm_unpacklo_epi64(*p3, *q3); + q2p2 = _mm_unpacklo_epi64(*p2, *q2); + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + { + // filter_mask and hev_mask + + // considering sse doesn't have unsigned elements comparison the idea is to + // find at least one case when X > limit, it means the corresponding mask + // bit is set. 
+ // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8((char)0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + + // flat_mask4 + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + } + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + + __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), 
q1_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + opq2 = _mm_packus_epi16(workp_shft2, workp_shft1); + + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 8); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i q1q0, p1p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + p3 = xx_loadl_32(s - 4 * p); + p2 = xx_loadl_32(s - 3 * p); + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + q2 = xx_loadl_32(s + 2 * p); + q3 = xx_loadl_32(s + 3 * p); + + lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + xx_storel_32(s - 1 * p, p1p0); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); + xx_storel_32(s + 0 * p, q1q0); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); + xx_storel_32(s - 3 * p, p2); + xx_storel_32(s + 2 * p, q2); +} + +void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + __m128i thresh = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)), + _mm_loadl_epi64((__m128i *)(s + 4 * p))); + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); + + q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)), + _mm_loadl_epi64((__m128i *)(s + 5 * p))); + + q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)), + _mm_loadl_epi64((__m128i *)(s + 6 * p))); + + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8)); + 
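The dual wrappers in this neighborhood all depend on the "merged variables" layout the comments keep referring to: each __m128i carries a p-side row in its low eight bytes and the matching q-side row in its high eight bytes, so a single filter pass updates both sides of the edge at once, and the stores peel the two halves back apart with _mm_srli_si128. A minimal standalone sketch of that packing and unpacking (a hypothetical demo, not library code):

#include <emmintrin.h>
#include <stdint.h>

int main(void) {
  uint8_t prow[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  uint8_t qrow[8] = { 9, 10, 11, 12, 13, 14, 15, 16 };
  // Pack: p-row into the low half, q-row into the high half.
  __m128i q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)prow),
                                    _mm_loadl_epi64((const __m128i *)qrow));
  // ... a filter pass would operate on both halves here ...
  // Unpack: store the low half, then shift the high half down and store it.
  uint8_t out[16];
  _mm_storel_epi64((__m128i *)out, q0p0);                           // p side
  _mm_storel_epi64((__m128i *)(out + 8), _mm_srli_si128(q0p0, 8));  // q side
  return 0;
}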
_mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8)); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8)); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8)); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8)); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8)); +} + +void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0; + + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + + lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); +} + +void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i p1, p0, q0, q1; + __m128i qs1qs0, ps1ps0; + + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + + __m128i l = _mm_unpacklo_epi64(blimit, limit); + + __m128i thresh0 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); + + __m128i thresh1 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); + + __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); + + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + + _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8)); +} + +void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, 
const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i p0, q0, q1, p1; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i qs1qs0, ps1ps0; + + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + + __m128i l = _mm_unpacklo_epi64(blimit, limit); + + __m128i thresh0 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); + + __m128i thresh1 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); + + __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); + + x0 = _mm_loadl_epi64((__m128i *)((s - 2))); + x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p)); + + transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0, + &q1); + + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + + p1 = _mm_srli_si128(ps1ps0, 8); + q1 = _mm_srli_si128(qs1qs0, 8); + + transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4, + &d5, &d6, &d7); + + xx_storel_32((s - 2 + 0 * p), d0); + xx_storel_32((s - 2 + 1 * p), d1); + xx_storel_32((s - 2 + 2 * p), d2); + xx_storel_32((s - 2 + 3 * p), d3); + xx_storel_32((s - 2 + 4 * p), d4); + xx_storel_32((s - 2 + 5 * p), d5); + xx_storel_32((s - 2 + 6 * p), d6); + xx_storel_32((s - 2 + 7 * p), d7); +} + +void aom_lpf_vertical_6_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x2, x1, x0, x3; + __m128i p0, q0; + __m128i p1p0, q1q0; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); + x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); + + transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + + lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); + + transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); + + xx_storel_32(s + 0 * p - 2, d0); + xx_storel_32(s + 1 * p - 2, d1); + xx_storel_32(s + 2 * p - 2, d2); + xx_storel_32(s + 3 * p - 2, d3); +} + +void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = 
_mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i p0, q0; + __m128i p1p0, q1q0; + __m128i d0d1, d2d3, d4d5, d6d7; + + x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p)); + + transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, + &d6d7); + + d1 = _mm_srli_si128(d0d1, 8); + d3 = _mm_srli_si128(d2d3, 8); + d5 = _mm_srli_si128(d4d5, 8); + d7 = _mm_srli_si128(d6d7, 8); + + lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + xx_storel_32((s - 2 + 0 * p), d0); + xx_storel_32((s - 2 + 1 * p), d1); + xx_storel_32((s - 2 + 2 * p), d2); + xx_storel_32((s - 2 + 3 * p), d3); + xx_storel_32((s - 2 + 4 * p), d4); + xx_storel_32((s - 2 + 5 * p), d5); + xx_storel_32((s - 2 + 6 * p), d6); + xx_storel_32((s - 2 + 7 * p), d7); +} + +void aom_lpf_vertical_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + + __m128i p0, q0; + __m128i x2, x1, x0, x3; + __m128i q1q0, p1p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p)); + x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p)); + + transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + // Loop filtering + lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, + &blimit, &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); + + transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1, + &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3); +} + +void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d1, d3, d5, d7; + __m128i q1q0, p1p0; + __m128i p1, q1; + __m128i d0d1, d2d3, d4d5, d6d7; + + x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p)); + x2 = 
_mm_loadl_epi64((__m128i *)(s - 4 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p)); + + transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, + &d6d7); + + d1 = _mm_srli_si128(d0d1, 8); + d3 = _mm_srli_si128(d2d3, 8); + d5 = _mm_srli_si128(d4d5, 8); + d7 = _mm_srli_si128(d6d7, 8); + + lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, + &q1q0, &p1p0, &blimit, &limit, &thresh); + + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); + + transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1, + &d2d3, &d4d5, &d6d7); + + _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1); + _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3); + _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5); + _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7); + _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8)); +} + +void aom_lpf_vertical_14_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x6, x5, x4, x3; + __m128i pq0, pq1, pq2, pq3; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); + + transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4, + &q5p5, &q6p6, &q7p7); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, + &q0p0, &pq0, &pq1, &pq2, &pq3); + _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3); +} + +void aom_lpf_vertical_14_dual_sse2( + unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x7, x6, x5, x4, x3, x2, x1, x0; + __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15; + __m128i q0, q1, q2, q3, q7; + __m128i p0p1, p2p3, p4p5, p6p7; + + __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + __m128i thresh = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * 
p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p)); + x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p)); + + transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3, + &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15); + + q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8)); + q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8)); + q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8)); + q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8)); + q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8)); + q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8)); + q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8)); + q7 = _mm_srli_si128(d14d15, 8); + + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); + + x0 = _mm_srli_si128(q0p0, 8); + x1 = _mm_srli_si128(q1p1, 8); + x2 = _mm_srli_si128(q2p2, 8); + x3 = _mm_srli_si128(q3p3, 8); + x4 = _mm_srli_si128(q4p4, 8); + x5 = _mm_srli_si128(q5p5, 8); + x6 = _mm_srli_si128(q6p6, 8); + + transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, + &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1, + &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7); + _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0); + _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1); + _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2); + _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3); +} diff --git a/libs/libaom/src/aom_dsp/x86/lpf_common_sse2.h b/libs/libaom/src/aom_dsp/x86/lpf_common_sse2.h new file mode 100644 index 000000000..6ed2cbfdf --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/lpf_common_sse2.h @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_config.h" + +static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5) { + __m128i w0, w1, w2, w3, w4, w5, ww0; + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + *d0 = _mm_unpacklo_epi64(ww0, w2); // 00 10 20 30 40 50 41 51 + *d1 = _mm_unpackhi_epi64(ww0, + _mm_srli_si128(w2, 4)); // 01 11 21 31 41 51 xx xx + + ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + *d2 = _mm_unpacklo_epi64(ww0, + _mm_srli_si128(w2, 8)); // 02 12 22 32 42 52 xx xx + + w3 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 xx xx xx xx + w4 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 xx xx xx xx + w5 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 xx xx xx xx + + *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 03 13 23 33 43 53 + + ww0 = _mm_unpacklo_epi32(w3, w4); // 04 14 24 34 05 15 25 35 + *d4 = _mm_unpacklo_epi64(ww0, w5); // 04 14 24 34 44 54 45 55 + *d5 = _mm_unpackhi_epi64(ww0, + _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx +} + +static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + __m128i zero = _mm_setzero_si128(); + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + + *d0 = _mm_unpacklo_epi64(ww0, zero); // 00 10 20 30 xx xx xx xx + *d1 = _mm_unpackhi_epi64(ww0, zero); // 01 11 21 31 xx xx xx xx + *d2 = _mm_unpacklo_epi64(ww1, zero); // 02 12 22 32 xx xx xx xx + *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 23 33 xx xx xx xx +} + +static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + __m128i w0, w1, ww2, ww3; + __m128i zero = _mm_setzero_si128(); + + w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 + w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 + + ww2 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww3 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + + *d4 = _mm_unpacklo_epi64(ww2, zero); // 04 14 24 34 xx xx xx xx + *d5 = _mm_unpackhi_epi64(ww2, zero); // 05 15 25 35 xx xx xx xx + *d6 = _mm_unpacklo_epi64(ww3, zero); // 06 16 26 36 xx xx xx xx + *d7 = _mm_unpackhi_epi64(ww3, zero); // 07 17 27 37 xx xx xx xx +} + +// here in and out pointers (x and d) should be different!
we don't store their +// values inside +static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // output + // 00 10 20 30 xx xx xx xx + // 01 11 21 31 xx xx xx xx + // 02 12 22 32 xx xx xx xx + // 03 13 23 33 xx xx xx xx + // 04 14 24 34 xx xx xx xx + // 05 15 25 35 xx xx xx xx + // 06 16 26 36 xx xx xx xx + // 07 17 27 37 xx xx xx xx + highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3); + highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7); +} + +static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + __m128i w0, w1, w2, w3, ww0, ww1; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 + w3 = _mm_unpacklo_epi16(*x6, *x7); // 60 70 61 71 62 72 63 73 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + *d0 = _mm_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + *d1 = _mm_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + *d2 = _mm_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 +} + +static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, ww0, ww1; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 + w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 46 56 47 57 + w3 = _mm_unpackhi_epi16(*x6, *x7); // 64 74 65 75 66 76 67 77 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww1 = _mm_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 + + *d4 = _mm_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 + *d5 = _mm_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 + + ww0 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + ww1 = _mm_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 + + *d6 = _mm_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 + *d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 +} + +// here in and out pointers (x and d) should be different! 
we don't store their +// values inside +static INLINE void highbd_transpose8x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, + __m128i *d7) { + highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3); + highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7); +} + +// here in and out pointers (x and d arrays) should be different! we don't store +// their values inside +static INLINE void highbd_transpose8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, + __m128i *d7) { + highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4, + d5, d6, d7); + highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1, + x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1, + d4 + 1, d5 + 1, d6 + 1, d7 + 1); +} + +// Low bit depth functions +static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + *d0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + + *d1 = _mm_srli_si128(*d0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(*d0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(*d0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, + __m128i *d5, __m128i *d6, + __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 
16 26 36 07 17 27 37 + + *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + *d1 = _mm_srli_si128(ww0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(ww0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(ww0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + *d5 = _mm_srli_si128(ww1, + 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + *d6 = _mm_srli_si128(ww1, + 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + *d7 = _mm_srli_si128(ww1, + 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0, + __m128i *d1, __m128i *d2, + __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + // output + // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx + // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx + // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx + + __m128i w0, w1, w2, w3, w4, w5; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d1 = _mm_srli_si128(*d0, 8); + *d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + *d3 = _mm_srli_si128(*d2, 8); +} + +static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0d1, + __m128i *d2d3, __m128i *d4d5, + __m128i *d6d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d2d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w6 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + w7 = 
_mm_unpackhi_epi16( + w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + + *d4d5 = _mm_unpacklo_epi32( + w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + *d6d7 = _mm_unpackhi_epi32( + w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 +} + +static INLINE void transpose16x8_8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, + __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, + __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpacklo_epi8(*x8, *x9); + w9 = _mm_unpacklo_epi8(*x10, *x11); + w10 = _mm_unpacklo_epi8(*x12, *x13); + w11 = _mm_unpacklo_epi8(*x14, *x15); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0 = _mm_unpacklo_epi64(w6, w14); + *d1 = _mm_unpackhi_epi64(w6, w14); + *d2 = _mm_unpacklo_epi64(w7, w15); + *d3 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d4 = _mm_unpacklo_epi64(w6, w14); + *d5 = _mm_unpackhi_epi64(w6, w14); + *d6 = _mm_unpacklo_epi64(w7, w15); + *d7 = _mm_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose8x16_16x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, + __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, + __m128i *d12d13, __m128i *d14d15) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpackhi_epi8(*x0, *x1); + w9 = _mm_unpackhi_epi8(*x2, *x3); + w10 = _mm_unpackhi_epi8(*x4, *x5); + w11 = _mm_unpackhi_epi8(*x6, *x7); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0d1 = _mm_unpacklo_epi64(w6, w14); + *d2d3 = _mm_unpackhi_epi64(w6, w14); + *d4d5 = _mm_unpacklo_epi64(w7, w15); + *d6d7 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d8d9 = _mm_unpacklo_epi64(w6, w14); + *d10d11 = _mm_unpackhi_epi64(w6, w14); 
+  *d12d13 = _mm_unpacklo_epi64(w7, w15);
+  *d14d15 = _mm_unpackhi_epi64(w7, w15);
+}
+
+#endif  // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/libs/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c b/libs/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c
new file mode 100644
index 000000000..8ef7ee0d7
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+#define MASK_SAD16XH_ONE_REF(idx)                             \
+  a = _mm_loadu_si128((const __m128i *)&ref##idx[x]);         \
+  data_l = _mm_unpacklo_epi8(a, b);                           \
+  mask_l = _mm_unpacklo_epi8(m, m_inv);                       \
+  pred_l = _mm_maddubs_epi16(data_l, mask_l);                 \
+  pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \
+                                                              \
+  data_r = _mm_unpackhi_epi8(a, b);                           \
+  mask_r = _mm_unpackhi_epi8(m, m_inv);                       \
+  pred_r = _mm_maddubs_epi16(data_r, mask_r);                 \
+  pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \
+                                                              \
+  pred = _mm_packus_epi16(pred_l, pred_r);                    \
+  res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+
+static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+                                       const uint8_t *a_ptr[], int a_stride,
+                                       const uint8_t *b_ptr, int b_stride,
+                                       const uint8_t *m_ptr, int m_stride,
+                                       int width, int height, int inv_mask,
+                                       unsigned sad_array[]) {
+  int x, y;
+  __m128i a;
+  __m128i data_l, data_r, mask_l, mask_r, pred_l, pred_r, pred;
+  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  __m128i res0 = _mm_setzero_si128();
+  __m128i res1 = _mm_setzero_si128();
+  __m128i res2 = _mm_setzero_si128();
+  __m128i res3 = _mm_setzero_si128();
+  const uint8_t *ref0 = a_ptr[0];
+  const uint8_t *ref1 = a_ptr[1];
+  const uint8_t *ref2 = a_ptr[2];
+  const uint8_t *ref3 = a_ptr[3];
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x += 16) {
+      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+      const __m128i m_copy = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+      __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+      __m128i m = inv_mask ? m_inv : m_copy;
+      m_inv = inv_mask ?
m_copy : m_inv; + + MASK_SAD16XH_ONE_REF(0) + MASK_SAD16XH_ONE_REF(1) + MASK_SAD16XH_ONE_REF(2) + MASK_SAD16XH_ONE_REF(3) + } + + src_ptr += src_stride; + ref0 += a_stride; + ref1 += a_stride; + ref2 += a_stride; + ref3 += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1), + _mm_unpackhi_epi32(res0, res1)); + res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3), + _mm_unpackhi_epi32(res2, res3)); + + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASK_SAD8XH_ONE_REF(idx) \ + const __m128i a##idx##0 = _mm_loadl_epi64((__m128i *)ref##idx); \ + const __m128i a##idx##1 = _mm_loadl_epi64((__m128i *)(ref##idx + a_stride)); \ + data_l = _mm_unpacklo_epi8(a##idx##0, b0); \ + mask_l = _mm_unpacklo_epi8(m, m_inv); \ + pred_l = _mm_maddubs_epi16(data_l, mask_l); \ + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \ + \ + data_r = _mm_unpacklo_epi8(a##idx##1, b1); \ + mask_r = _mm_unpackhi_epi8(m, m_inv); \ + pred_r = _mm_maddubs_epi16(data_r, mask_r); \ + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred_l, pred_r); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height, + int inv_mask, unsigned sad_array[]) { + const uint8_t *ref0 = ref_array[0]; + const uint8_t *ref1 = ref_array[1]; + const uint8_t *ref2 = ref_array[2]; + const uint8_t *ref3 = ref_array[3]; + __m128i data_l, data_r, pred_l, pred_r, mask_l, mask_r, pred; + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (int y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride))); + const __m128i b0 = _mm_loadl_epi64((__m128i *)b_ptr); + const __m128i b1 = _mm_loadl_epi64((__m128i *)(b_ptr + b_stride)); + const __m128i m0 = _mm_loadl_epi64((__m128i *)m_ptr); + const __m128i m1 = _mm_loadl_epi64((__m128i *)(m_ptr + m_stride)); + __m128i m_copy = _mm_unpacklo_epi64(m0, m1); + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ? 
m_copy : m_inv; + + MASK_SAD8XH_ONE_REF(0) + MASK_SAD8XH_ONE_REF(1) + MASK_SAD8XH_ONE_REF(2) + MASK_SAD8XH_ONE_REF(3) + + ref0 += 2 * a_stride; + ref1 += 2 * a_stride; + ref2 += 2 * a_stride; + ref3 += 2 * a_stride; + src_ptr += 2 * src_stride; + b_ptr += 2 * b_stride; + m_ptr += 2 * m_stride; + } + res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1), + _mm_unpackhi_epi32(res0, res1)); + res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3), + _mm_unpackhi_epi32(res2, res3)); + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASK_SAD4XH_ONE_REF(idx) \ + a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)ref##idx), \ + _mm_cvtsi32_si128(*(uint32_t *)&ref##idx[a_stride])); \ + data = _mm_unpacklo_epi8(a, b); \ + mask = _mm_unpacklo_epi8(m, m_inv); \ + pred = _mm_maddubs_epi16(data, mask); \ + pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height, + int inv_mask, unsigned sad_array[]) { + const uint8_t *ref0 = ref_array[0]; + const uint8_t *ref1 = ref_array[1]; + const uint8_t *ref2 = ref_array[2]; + const uint8_t *ref3 = ref_array[3]; + __m128i data, pred, mask; + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + __m128i a; + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (int y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(uint32_t *)src_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride])); + const __m128i b = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride])); + const __m128i m_copy = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride])); + + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ? 
m_copy : m_inv; + + MASK_SAD4XH_ONE_REF(0) + MASK_SAD4XH_ONE_REF(1) + MASK_SAD4XH_ONE_REF(2) + MASK_SAD4XH_ONE_REF(3) + + ref0 += 2 * a_stride; + ref1 += 2 * a_stride; + ref2 += 2 * a_stride; + ref3 += 2 * a_stride; + src_ptr += 2 * src_stride; + b_ptr += 2 * b_stride; + m_ptr += 2 * m_stride; + } + res0 = _mm_unpacklo_epi32(res0, res1); + res2 = _mm_unpacklo_epi32(res2, res3); + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASKSADMXN_SSSE3(m, n) \ + void aom_masked_sad##m##x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[]) { \ + masked_sadx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, \ + msk_stride, m, n, inv_mask, sad_array); \ + } + +#define MASKSAD8XN_SSSE3(n) \ + void aom_masked_sad8x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[]) { \ + aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + 8, msk, msk_stride, n, inv_mask, sad_array); \ + } + +#define MASKSAD4XN_SSSE3(n) \ + void aom_masked_sad4x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[]) { \ + aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + 4, msk, msk_stride, n, inv_mask, sad_array); \ + } + +MASKSADMXN_SSSE3(128, 128) +MASKSADMXN_SSSE3(128, 64) +MASKSADMXN_SSSE3(64, 128) +MASKSADMXN_SSSE3(64, 64) +MASKSADMXN_SSSE3(64, 32) +MASKSADMXN_SSSE3(32, 64) +MASKSADMXN_SSSE3(32, 32) +MASKSADMXN_SSSE3(32, 16) +MASKSADMXN_SSSE3(16, 32) +MASKSADMXN_SSSE3(16, 16) +MASKSADMXN_SSSE3(16, 8) +MASKSAD8XN_SSSE3(16) +MASKSAD8XN_SSSE3(8) +MASKSAD8XN_SSSE3(4) +MASKSAD4XN_SSSE3(8) +MASKSAD4XN_SSSE3(4) +MASKSAD4XN_SSSE3(16) +MASKSADMXN_SSSE3(16, 4) +MASKSAD8XN_SSSE3(32) +MASKSADMXN_SSSE3(32, 8) +MASKSADMXN_SSSE3(16, 64) +MASKSADMXN_SSSE3(64, 16) diff --git a/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c b/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c new file mode 100644 index 000000000..60f0ab339 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+static INLINE unsigned int masked_sad32xh_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int width, int height) {
+  int x, y;
+  __m256i res = _mm256_setzero_si256();
+  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_scale =
+      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x += 32) {
+      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+      const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
+      const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+      // Calculate 16 predicted pixels.
+      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+      // is 64 * 255, so we have plenty of space to add rounding constants.
+      const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+      const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+      __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+      pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+      const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+      const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+      __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+      pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+      const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+      res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+    }
+
+    src_ptr += src_stride;
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
+  res = _mm256_shuffle_epi32(res, 0xd8);
+  res = _mm256_permute4x64_epi64(res, 0xd8);
+  res = _mm256_hadd_epi32(res, res);
+  res = _mm256_hadd_epi32(res, res);
+  int32_t sad = _mm256_extract_epi32(res, 0);
+  return sad;
+}
+
+static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
+  __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));
+  __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));
+  __m256i a = _mm256_castsi128_si256(a0);
+  return _mm256_inserti128_si256(a, a1, 1);
+}
+
+static INLINE unsigned int masked_sad16xh_avx2(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+    int height) {
+  int y;
+  __m256i res = _mm256_setzero_si256();
+  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+  const __m256i round_scale =
+      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  for (y = 0; y < height; y += 2) {
+    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+    const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
+    const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+    // Calculate 16 predicted pixels.
+    // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+    // is 64 * 255, so we have plenty of space to add rounding constants.
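+    // A sketch of the arithmetic, assuming AOM_BLEND_A64_ROUND_BITS == 6:
+    // each predicted pixel is the A64 blend
+    //   pred = (m * a + (64 - m) * b + 32) >> 6.
+    // _mm256_maddubs_epi16 forms m * a + (64 - m) * b from the interleaved
+    // bytes, and _mm256_mulhrs_epi16 with round_scale = 1 << (15 - 6)
+    // computes (x * 2^9 + 2^14) >> 15 == (x + 32) >> 6, i.e. the rounded
+    // shift without a separate add.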
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b); + const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); + __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); + pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); + + const __m256i data_r = _mm256_unpackhi_epi8(a, b); + const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); + __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); + pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); + + const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); + res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + res = _mm256_shuffle_epi32(res, 0xd8); + res = _mm256_permute4x64_epi64(res, 0xd8); + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int32_t sad = _mm256_extract_epi32(res, 0); + return sad; +} + +static INLINE unsigned int aom_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define MASKSADMXN_AVX2(m, n) \ + unsigned int aom_masked_sad##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \ + msk, msk_stride, invert_mask, m, n); \ + } + +MASKSADMXN_AVX2(4, 4) +MASKSADMXN_AVX2(4, 8) +MASKSADMXN_AVX2(8, 4) +MASKSADMXN_AVX2(8, 8) +MASKSADMXN_AVX2(8, 16) +MASKSADMXN_AVX2(16, 8) +MASKSADMXN_AVX2(16, 16) +MASKSADMXN_AVX2(16, 32) +MASKSADMXN_AVX2(32, 16) +MASKSADMXN_AVX2(32, 32) +MASKSADMXN_AVX2(32, 64) +MASKSADMXN_AVX2(64, 32) +MASKSADMXN_AVX2(64, 64) +MASKSADMXN_AVX2(64, 128) +MASKSADMXN_AVX2(128, 64) +MASKSADMXN_AVX2(128, 128) +MASKSADMXN_AVX2(4, 16) +MASKSADMXN_AVX2(16, 4) +MASKSADMXN_AVX2(8, 32) +MASKSADMXN_AVX2(32, 8) +MASKSADMXN_AVX2(16, 64) +MASKSADMXN_AVX2(64, 16) + +static INLINE unsigned int highbd_masked_sad8xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const 
uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y += 2) { + const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); + const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); + const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + // Zero-extend mask to 16 bits + const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(m_ptr)), + _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)))); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. 
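+  // Each _mm256_hadd_epi32 sums adjacent 32-bit pairs within each 128-bit
+  // lane, so after two passes lane 0 holds the total of the low lane and
+  // lane 4 the total of the high lane; the two extracts below add them.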
+ res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return sad; +} + +static INLINE unsigned int highbd_masked_sad16xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int x, y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); + const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); + const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); + // Zero-extend mask to 16 bits + const __m256i m = + _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x])); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. 
+ res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return sad; +} + +static INLINE unsigned int aom_highbd_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define HIGHBD_MASKSADMXN_AVX2(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \ + second_pred8, msk, msk_stride, \ + invert_mask, m, n); \ + } + +HIGHBD_MASKSADMXN_AVX2(4, 4); +HIGHBD_MASKSADMXN_AVX2(4, 8); +HIGHBD_MASKSADMXN_AVX2(8, 4); +HIGHBD_MASKSADMXN_AVX2(8, 8); +HIGHBD_MASKSADMXN_AVX2(8, 16); +HIGHBD_MASKSADMXN_AVX2(16, 8); +HIGHBD_MASKSADMXN_AVX2(16, 16); +HIGHBD_MASKSADMXN_AVX2(16, 32); +HIGHBD_MASKSADMXN_AVX2(32, 16); +HIGHBD_MASKSADMXN_AVX2(32, 32); +HIGHBD_MASKSADMXN_AVX2(32, 64); +HIGHBD_MASKSADMXN_AVX2(64, 32); +HIGHBD_MASKSADMXN_AVX2(64, 64); +HIGHBD_MASKSADMXN_AVX2(64, 128); +HIGHBD_MASKSADMXN_AVX2(128, 64); +HIGHBD_MASKSADMXN_AVX2(128, 128); +HIGHBD_MASKSADMXN_AVX2(4, 16); +HIGHBD_MASKSADMXN_AVX2(16, 4); +HIGHBD_MASKSADMXN_AVX2(8, 32); +HIGHBD_MASKSADMXN_AVX2(32, 8); +HIGHBD_MASKSADMXN_AVX2(16, 64); +HIGHBD_MASKSADMXN_AVX2(64, 16); diff --git a/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c b/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c new file mode 100644 index 000000000..716827796 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+// For width a multiple of 16
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height);
+
+#define MASKSADMXN_SSSE3(m, n)                                                \
+  unsigned int aom_masked_sad##m##x##n##_ssse3(                               \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                 \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,         \
+      int msk_stride, int invert_mask) {                                      \
+    if (!invert_mask)                                                         \
+      return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+                              m, msk, msk_stride, m, n);                      \
+    else                                                                      \
+      return masked_sad_ssse3(src, src_stride, second_pred, m, ref,           \
+                              ref_stride, msk, msk_stride, m, n);             \
+  }
+
+#define MASKSAD8XN_SSSE3(n)                                                 \
+  unsigned int aom_masked_sad8x##n##_ssse3(                                 \
+      const uint8_t *src, int src_stride, const uint8_t *ref,               \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,       \
+      int msk_stride, int invert_mask) {                                    \
+    if (!invert_mask)                                                       \
+      return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,      \
+                                     second_pred, 8, msk, msk_stride, n);   \
+    else                                                                    \
+      return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref,  \
+                                     ref_stride, msk, msk_stride, n);       \
+  }
+
+#define MASKSAD4XN_SSSE3(n)                                                 \
+  unsigned int aom_masked_sad4x##n##_ssse3(                                 \
+      const uint8_t *src, int src_stride, const uint8_t *ref,               \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,       \
+      int msk_stride, int invert_mask) {                                    \
+    if (!invert_mask)                                                       \
+      return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,      \
+                                     second_pred, 4, msk, msk_stride, n);   \
+    else                                                                    \
+      return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref,  \
+                                     ref_stride, msk, msk_stride, n);       \
+  }
+
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+MASKSAD4XN_SSSE3(16)
+MASKSADMXN_SSSE3(16, 4)
+MASKSAD8XN_SSSE3(32)
+MASKSADMXN_SSSE3(32, 8)
+MASKSADMXN_SSSE3(16, 64)
+MASKSADMXN_SSSE3(64, 16)
+
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
+                                            int src_stride,
+                                            const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height) {
+  int x, y;
+  __m128i res = _mm_setzero_si128();
+  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x += 16) {
+      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+      const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+      // Calculate 16 predicted pixels.
+      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+      // is 64 * 255, so we have plenty of space to add rounding constants.
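+      // Here xx_roundn_epu16(x, AOM_BLEND_A64_ROUND_BITS) is the rounded
+      // shift (x + 32) >> 6 (assuming AOM_BLEND_A64_ROUND_BITS == 6), so
+      // each byte of 'pred' is (m * a + (64 - m) * b + 32) >> 6, and
+      // _mm_sad_epu8 then accumulates |pred - src| over the 16 bytes.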
+ const __m128i data_l = _mm_unpacklo_epi8(a, b); + const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi8(a, b); + const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_l, pred_r); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + int32_t sad = + _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); + return sad; +} + +unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr); + const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]); + const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr); + const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]); + const __m128i m = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), + _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi8(a0, b0); + const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpacklo_epi8(a1, b1); + const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_l, pred_r); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + int32_t sad = + _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); + return sad; +} + +unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (y = 0; y < height; y += 2) { + // Load two rows at a time, this seems to be a bit faster + // than four rows at a time in this case. 
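+    // The two 4-byte rows are packed into the low 8 bytes of each vector
+    // (the upper 8 bytes stay zero), so the single _mm_sad_epu8 below
+    // accumulates the SAD of both rows at once.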
+ const __m128i src = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(uint32_t *)src_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride])); + const __m128i a = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride])); + const __m128i b = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride])); + const __m128i m = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride])); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + const __m128i data = _mm_unpacklo_epi8(a, b); + const __m128i mask = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_16bit = _mm_maddubs_epi16(data, mask); + pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128()); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + // At this point, the SAD is stored in lane 0 of 'res' + int32_t sad = _mm_cvtsi128_si32(res); + return sad; +} + +// For width a multiple of 8 +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height); + +#define HIGHBD_MASKSADMXN_SSSE3(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \ + second_pred8, m, msk, msk_stride, m, n); \ + else \ + return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \ + ref_stride, msk, msk_stride, m, n); \ + } + +#define HIGHBD_MASKSAD4XN_SSSE3(n) \ + unsigned int aom_highbd_masked_sad4x##n##_ssse3( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \ + ref_stride, second_pred8, 4, msk, \ + msk_stride, n); \ + else \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ + ref8, ref_stride, msk, msk_stride, \ + n); \ + } + +HIGHBD_MASKSADMXN_SSSE3(128, 128) +HIGHBD_MASKSADMXN_SSSE3(128, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 128) +HIGHBD_MASKSADMXN_SSSE3(64, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 64) +HIGHBD_MASKSADMXN_SSSE3(32, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 16) +HIGHBD_MASKSADMXN_SSSE3(16, 32) +HIGHBD_MASKSADMXN_SSSE3(16, 16) +HIGHBD_MASKSADMXN_SSSE3(16, 8) +HIGHBD_MASKSADMXN_SSSE3(8, 16) +HIGHBD_MASKSADMXN_SSSE3(8, 8) +HIGHBD_MASKSADMXN_SSSE3(8, 4) +HIGHBD_MASKSAD4XN_SSSE3(8) +HIGHBD_MASKSAD4XN_SSSE3(4) +HIGHBD_MASKSAD4XN_SSSE3(16) +HIGHBD_MASKSADMXN_SSSE3(16, 4) +HIGHBD_MASKSADMXN_SSSE3(8, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 8) +HIGHBD_MASKSADMXN_SSSE3(16, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 16) + +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + 
const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int x, y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i one = _mm_set1_epi16(1); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + // Zero-extend mask to 16 bits + const __m128i m = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128()); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m128i pred = _mm_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); + res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. 
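+  // Two _mm_hadd_epi32 passes fold the four partial sums into lane 0,
+  // which _mm_cvtsi128_si32 then reads out.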
+ res = _mm_hadd_epi32(res, res); + res = _mm_hadd_epi32(res, res); + int sad = _mm_cvtsi128_si32(res); + return sad; +} + +unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int y; + __m128i res = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i one = _mm_set1_epi16(1); + + for (y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr), + _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride])); + const __m128i b = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr), + _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride])); + // Zero-extend mask to 16 bits + const __m128i m = _mm_unpacklo_epi8( + _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])), + _mm_setzero_si128()); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packs_epi32(pred_l, pred_r); + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); + res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + res = _mm_hadd_epi32(res, res); + res = _mm_hadd_epi32(res, res); + int sad = _mm_cvtsi128_si32(res); + return sad; +} diff --git a/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.h b/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.h new file mode 100644 index 000000000..cffbd9672 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height);
+
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+                                     const uint8_t *a_ptr, int a_stride,
+                                     const uint8_t *b_ptr, int b_stride,
+                                     const uint8_t *m_ptr, int m_stride,
+                                     int height);
+
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8,
+                                            int src_stride, const uint8_t *a8,
+                                            int a_stride, const uint8_t *b8,
+                                            int b_stride, const uint8_t *m_ptr,
+                                            int m_stride, int height);
+
+#endif  // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
diff --git a/libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c b/libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c
new file mode 100644
index 000000000..fa93f0df4
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -0,0 +1,1067 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+
+// For width a multiple of 16
+static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
+                            int yoffset, uint8_t *dst, int w, int h);
+
+static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
+                               int yoffset, uint8_t *dst, int h);
+
+static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
+                               int yoffset, uint8_t *dst, int h);
+
+// For width a multiple of 16
+static void masked_variance(const uint8_t *src_ptr, int src_stride,
+                            const uint8_t *a_ptr, int a_stride,
+                            const uint8_t *b_ptr, int b_stride,
+                            const uint8_t *m_ptr, int m_stride, int width,
+                            int height, unsigned int *sse, int *sum_);
+
+static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *a_ptr, const uint8_t *b_ptr,
+                               const uint8_t *m_ptr, int m_stride, int height,
+                               unsigned int *sse, int *sum_);
+
+static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
+                               const uint8_t *a_ptr, const uint8_t *b_ptr,
+                               const uint8_t *m_ptr, int m_stride, int height,
+                               unsigned int *sse, int *sum_);
+
+#define MASK_SUBPIX_VAR_SSSE3(W, H)                                   \
+  unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3(        \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,   \
+      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+      const uint8_t *msk, int msk_stride, int invert_mask,            \
+      unsigned int *sse) {                                            \
+    int sum;                                                          \
+    uint8_t temp[(H + 1) * W];                                        \
+                                                                      \
+    bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);   \
+                                                                      \
+    if (!invert_mask)                                                 \
+      masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,  \
+                      msk_stride, W, H, sse,
&sum); \ + else \ + masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } + +#define MASK_SUBPIX_VAR8XH_SSSE3(H) \ + unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * 8]; \ + \ + bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ + H, sse, &sum); \ + else \ + masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ + H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H)); \ + } + +#define MASK_SUBPIX_VAR4XH_SSSE3(H) \ + unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * 4]; \ + \ + bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ + H, sse, &sum); \ + else \ + masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ + H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + } + +MASK_SUBPIX_VAR_SSSE3(128, 128) +MASK_SUBPIX_VAR_SSSE3(128, 64) +MASK_SUBPIX_VAR_SSSE3(64, 128) +MASK_SUBPIX_VAR_SSSE3(64, 64) +MASK_SUBPIX_VAR_SSSE3(64, 32) +MASK_SUBPIX_VAR_SSSE3(32, 64) +MASK_SUBPIX_VAR_SSSE3(32, 32) +MASK_SUBPIX_VAR_SSSE3(32, 16) +MASK_SUBPIX_VAR_SSSE3(16, 32) +MASK_SUBPIX_VAR_SSSE3(16, 16) +MASK_SUBPIX_VAR_SSSE3(16, 8) +MASK_SUBPIX_VAR8XH_SSSE3(16) +MASK_SUBPIX_VAR8XH_SSSE3(8) +MASK_SUBPIX_VAR8XH_SSSE3(4) +MASK_SUBPIX_VAR4XH_SSSE3(8) +MASK_SUBPIX_VAR4XH_SSSE3(4) +MASK_SUBPIX_VAR4XH_SSSE3(16) +MASK_SUBPIX_VAR_SSSE3(16, 4) +MASK_SUBPIX_VAR8XH_SSSE3(32) +MASK_SUBPIX_VAR_SSSE3(32, 8) +MASK_SUBPIX_VAR_SSSE3(64, 16) +MASK_SUBPIX_VAR_SSSE3(16, 64) + +static INLINE __m128i filter_block(const __m128i a, const __m128i b, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi8(a, b); + v0 = _mm_maddubs_epi16(v0, filter); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + __m128i v1 = _mm_unpackhi_epi8(a, b); + v1 = _mm_maddubs_epi16(v1, filter); + v1 = xx_roundn_epu16(v1, FILTER_BITS); + + return _mm_packus_epi16(v0, v1); +} + +static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int w, int h) { + int i, j; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + _mm_storeu_si128((__m128i *)&b[j], x); + } + src += src_stride; + b += w; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); + __m128i z = _mm_alignr_epi8(y, x, 1); + _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z)); + } + src += src_stride; + b += w; + } + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i 
hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); + const __m128i z = _mm_alignr_epi8(y, x, 1); + const __m128i res = filter_block(x, z, hfilter_vec); + _mm_storeu_si128((__m128i *)&b[j], res); + } + + src += src_stride; + b += w; + } + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y)); + } + dst += w; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + const __m128i res = filter_block(x, y, vfilter_vec); + _mm_storeu_si128((__m128i *)&dst[j], res); + } + + dst += w; + } + } +} + +static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0, + const __m128i *a1, const __m128i *b1, + const __m128i *filter) { + __m128i v0 = _mm_unpacklo_epi8(*a0, *b0); + v0 = _mm_maddubs_epi16(v0, *filter); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + __m128i v1 = _mm_unpacklo_epi8(*a1, *b1); + v1 = _mm_maddubs_epi16(v1, *filter); + v1 = xx_roundn_epu16(v1, FILTER_BITS); + + return _mm_packus_epi16(v0, v1); +} + +static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h) { + int i; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + _mm_storel_epi64((__m128i *)b, x); + src += src_stride; + b += 8; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 1); + _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z)); + src += src_stride; + b += 8; + } + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h; i += 2) { + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 1); + const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 2; + b += 16; + } + // Handle i = h separately + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + + __m128i v0 = _mm_unpacklo_epi8(x0, z0); + v0 = _mm_maddubs_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)dst); + __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y)); + dst += 8; + } + } else { + const uint8_t *vfilter = 
bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; i += 2) { + const __m128i x = _mm_loadl_epi64((__m128i *)dst); + const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); + const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]); + const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 16; + } + } +} + +static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h) { + int i; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = xx_loadl_32((__m128i *)src); + xx_storel_32((__m128i *)b, x); + src += src_stride; + b += 4; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + __m128i z = _mm_srli_si128(x, 1); + xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z)); + src += src_stride; + b += 4; + } + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h; i += 4) { + const __m128i x0 = _mm_loadl_epi64((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 1); + const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]); + const __m128i z2 = _mm_srli_si128(x2, 1); + const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]); + const __m128i z3 = _mm_srli_si128(x3, 1); + + const __m128i a0 = _mm_unpacklo_epi32(x0, x1); + const __m128i b0 = _mm_unpacklo_epi32(z0, z1); + const __m128i a1 = _mm_unpacklo_epi32(x2, x3); + const __m128i b1 = _mm_unpacklo_epi32(z2, z3); + const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 4; + b += 16; + } + // Handle i = h separately + const __m128i x = _mm_loadl_epi64((__m128i *)src); + const __m128i z = _mm_srli_si128(x, 1); + + __m128i v0 = _mm_unpacklo_epi8(x, z); + v0 = _mm_maddubs_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = xx_loadl_32((__m128i *)dst); + __m128i y = xx_loadl_32((__m128i *)&dst[4]); + xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y)); + dst += 4; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; i += 4) { + const __m128i a = xx_loadl_32((__m128i *)dst); + const __m128i b = xx_loadl_32((__m128i *)&dst[4]); + const __m128i c = xx_loadl_32((__m128i *)&dst[8]); + const __m128i d = xx_loadl_32((__m128i *)&dst[12]); + const __m128i e = xx_loadl_32((__m128i *)&dst[16]); + + const __m128i a0 = _mm_unpacklo_epi32(a, b); + const __m128i b0 = _mm_unpacklo_epi32(b, c); + const __m128i a1 = _mm_unpacklo_epi32(c, d); + const __m128i b1 = _mm_unpacklo_epi32(d, e); + const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 16; + } + } +} + +static INLINE void accumulate_block(const __m128i *src, const __m128i *a, + const __m128i *b, const __m128i *m, + __m128i 
*sum, __m128i *sum_sq) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i m_inv = _mm_sub_epi8(mask_max, *m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m128i data_l = _mm_unpacklo_epi8(*a, *b); + const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi8(*a, *b); + const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi8(*src, zero); + const __m128i src_r = _mm_unpackhi_epi8(*src, zero); + const __m128i diff_l = _mm_sub_epi16(pred_l, src_l); + const __m128i diff_r = _mm_sub_epi16(pred_r, src_r); + + // Update partial sums and partial sums of squares + *sum = + _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one)); + *sum_sq = + _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l), + _mm_madd_epi16(diff_r, diff_r))); +} + +static void masked_variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, + int height, unsigned int *sse, int *sum_) { + int x, y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +static void masked_variance8xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_) { + int y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y += 2) { + __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), + _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); + + src_ptr += src_stride * 2; + a_ptr += 16; + b_ptr += 16; + m_ptr += m_stride * 2; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t 
*b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_) { + int y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y += 4) { + // Load four rows at a time + __m128i src = + _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride], + *(uint32_t *)&src_ptr[src_stride * 2], + *(uint32_t *)&src_ptr[src_stride * 3]); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = _mm_setr_epi32( + *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride], + *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); + + src_ptr += src_stride * 4; + a_ptr += 16; + b_ptr += 16; + m_ptr += m_stride * 4; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +#if CONFIG_AV1_HIGHBITDEPTH +// For width a multiple of 8 +static void highbd_bilinear_filter(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int w, int h); + +static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int h); + +// For width a multiple of 8 +static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, int a_stride, + const uint16_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, uint64_t *sse, + int *sum_); + +static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, + const uint16_t *b_ptr, + const uint8_t *m_ptr, int m_stride, + int height, int *sse, int *sum_); + +#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)sse64; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, 
temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \ + sum = ROUND_POWER_OF_TWO(sum, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \ + sum = ROUND_POWER_OF_TWO(sum, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)sse_; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + } \ + unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \ + sum = ROUND_POWER_OF_TWO(sum, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } \ + unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + int64_t var; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \ + sum = ROUND_POWER_OF_TWO(sum, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) + +static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi16(a, b); + v0 = _mm_madd_epi16(v0, filter); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + __m128i v1 = _mm_unpackhi_epi16(a, b); + v1 = _mm_madd_epi16(v1, filter); + v1 = xx_roundn_epu32(v1, FILTER_BITS); + + return _mm_packs_epi32(v0, v1); +} + +static void highbd_bilinear_filter(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int w, int h) { + int i, j; + // Horizontal filter + if (xoffset == 0) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + _mm_storeu_si128((__m128i *)&b[j], x); + } + src += src_stride; + b += w; + } + } else if (xoffset == 4) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); + __m128i z = _mm_alignr_epi8(y, x, 2); + _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z)); + } + src += src_stride; + b += w; + } + } else { + uint16_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); + const __m128i z = _mm_alignr_epi8(y, x, 2); + const __m128i res = highbd_filter_block(x, z, hfilter_vec); + _mm_storeu_si128((__m128i *)&b[j], res); + 
} + + src += src_stride; + b += w; + } + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y)); + } + dst += w; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + const __m128i res = highbd_filter_block(x, y, vfilter_vec); + _mm_storeu_si128((__m128i *)&dst[j], res); + } + + dst += w; + } + } +} + +static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0, + const __m128i *b0, + const __m128i *a1, + const __m128i *b1, + const __m128i *filter) { + __m128i v0 = _mm_unpacklo_epi16(*a0, *b0); + v0 = _mm_madd_epi16(v0, *filter); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + __m128i v1 = _mm_unpacklo_epi16(*a1, *b1); + v1 = _mm_madd_epi16(v1, *filter); + v1 = xx_roundn_epu32(v1, FILTER_BITS); + + return _mm_packs_epi32(v0, v1); +} + +static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int h) { + int i; + // Horizontal filter + if (xoffset == 0) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + _mm_storel_epi64((__m128i *)b, x); + src += src_stride; + b += 4; + } + } else if (xoffset == 4) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 2); + _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z)); + src += src_stride; + b += 4; + } + } else { + uint16_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); + for (i = 0; i < h; i += 2) { + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 2); + const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 2); + const __m128i res = + highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 2; + b += 8; + } + // Process i = h separately + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 2); + + __m128i v0 = _mm_unpacklo_epi16(x, z); + v0 = _mm_madd_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)dst); + __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y)); + dst += 4; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); + for (i = 0; i < h; i += 2) { + const __m128i x = _mm_loadl_epi64((__m128i *)dst); + const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); + const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]); + const __m128i res = + highbd_filter_block_2rows(&x, &y, &y, &z, 
&vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 8; + } + } +} + +static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, int a_stride, + const uint16_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, uint64_t *sse, + int *sum_) { + int x, y; + // Note on bit widths: + // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26, + // so this can be kept as four 32-bit values. + // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38, + // so this must be stored as two 64-bit values. + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + // Calculate 8 predicted pixels. + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi16(src, zero); + const __m128i src_r = _mm_unpackhi_epi16(src, zero); + __m128i diff_l = _mm_sub_epi32(pred_l, src_l); + __m128i diff_r = _mm_sub_epi32(pred_r, src_r); + + // Update partial sums and partial sums of squares + sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); + // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit + // field, but the range of values is only [-(2^12 - 1), 2^12 - 1]. + // So we can re-pack into 16-bit fields and use _mm_madd_epi16 + // to calculate the squares and partially sum them. + const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); + const __m128i prod = _mm_madd_epi16(tmp, tmp); + // Then we want to sign-extend to 64 bits and accumulate + const __m128i sign = _mm_srai_epi32(prod, 31); + const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign); + const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign); + sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, zero); + sum = _mm_hadd_epi32(sum, zero); + *sum_ = _mm_cvtsi128_si32(sum); + sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8)); + _mm_storel_epi64((__m128i *)sse, sum_sq); +} + +static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, + const uint16_t *b_ptr, + const uint8_t *m_ptr, int m_stride, + int height, int *sse, int *sum_) { + int y; + // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions). 
+ // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18 + // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30. + // So we can safely pack sum_sq into 32-bit fields, which is slightly more + // convenient. + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + for (y = 0; y < height; y += 2) { + __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = _mm_unpacklo_epi8( + _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])), + zero); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi16(src, zero); + const __m128i src_r = _mm_unpackhi_epi16(src, zero); + __m128i diff_l = _mm_sub_epi32(pred_l, src_l); + __m128i diff_r = _mm_sub_epi32(pred_r, src_r); + + // Update partial sums and partial sums of squares + sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); + const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); + const __m128i prod = _mm_madd_epi16(tmp, tmp); + sum_sq = _mm_add_epi32(sum_sq, prod); + + src_ptr += src_stride * 2; + a_ptr += 8; + b_ptr += 8; + m_ptr += m_stride * 2; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, zero); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width;
+  assert(height % 2 == 0);
+  int i = 0;
+  if (width == 8) {
+    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+                           mask, mask_stride);
+  } else if (width == 16) {
+    do {
+      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+      comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
+                              mask + mask_stride, comp_pred + width);
+      comp_pred += (width << 1);
+      src0 += (stride0 << 1);
+      src1 += (stride1 << 1);
+      mask += (mask_stride << 1);
+      i += 2;
+    } while (i < height);
+  } else {  // width == 32
+    assert(width == 32);
+    do {
+      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+      comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16);
+      comp_pred += (width);
+      src0 += (stride0);
+      src1 += (stride1);
+      mask += (mask_stride);
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.h b/libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.h
new file mode 100644
index 000000000..4faa098ac
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+
+static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0,
+                                           const uint8_t *src1,
+                                           const uint8_t *mask, uint8_t *dst) {
+  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i round_offset =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+  const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0));
+  const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1));
+  const __m128i aA = _mm_load_si128((const __m128i *)(mask));
+
+  const __m128i maA = _mm_sub_epi8(alpha_max, aA);
+
+  const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1);
+  const __m128i aaAL = _mm_unpacklo_epi8(aA, maA);
+  const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1);
+  const __m128i aaAH = _mm_unpackhi_epi8(aA, maA);
+
+  const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL);
+  const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH);
+
+  const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset);
+  const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset);
+  _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH));
+}
+
+static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
+                                          const uint8_t *src0, int stride0,
+                                          const uint8_t *src1, int stride1,
+                                          const uint8_t *mask,
+                                          int mask_stride) {
+  int i = 0;
+  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+  const __m128i round_offset =
+      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+  do {
+    // odd line A
+    const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0));
+    const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1));
+    const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask));
+    // even line B
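+    // (Both mask rows are packed into one register below: line A stays in
+    // the low eight bytes of 'a' while _mm_loadh_pi fills the high eight
+    // bytes with line B, so one unpacklo/unpackhi pair yields the blend
+    // factors for both lines.)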
+    const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0));
+    const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1));
+    const __m128i a = _mm_castps_si128(_mm_loadh_pi(
+        _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride)));
+
+    const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
+    const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
+
+    const __m128i ma = _mm_sub_epi8(alpha_max, a);
+    const __m128i aaA = _mm_unpacklo_epi8(a, ma);
+    const __m128i aaB = _mm_unpackhi_epi8(a, ma);
+
+    const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
+    const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
+    const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
+    const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
+    const __m128i round = _mm_packus_epi16(roundA, roundB);
+    // comp_pred's stride == width == 8
+    _mm_store_si128((__m128i *)(comp_pred), round);
+    comp_pred += (8 << 1);
+    src0 += (stride0 << 1);
+    src1 += (stride1 << 1);
+    mask += (mask_stride << 1);
+    i += 2;
+  } while (i < height);
+}
+
+#endif  // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
diff --git a/libs/libaom/src/aom_dsp/x86/mem_sse2.h b/libs/libaom/src/aom_dsp/x86/mem_sse2.h
new file mode 100644
index 000000000..6c821673e
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/mem_sse2.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
+#define AOM_AOM_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
+  return _mm_castps_si128(
+      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+
+static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
+                                                  const int byte_stride) {
+  return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 1 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 2 * byte_stride),
+                        *(const int32_t *)((int8_t *)src + 3 * byte_stride));
+}
+
+static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
+                                                  const int byte_stride) {
+  __m128i dst;
+  dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
+  dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
+  return dst;
+}
+
+#endif  // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/libs/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h b/libs/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h
new file mode 100644
index 000000000..5181e444c
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+
+static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
+                                    const int32_t *wsrc, const int32_t *mask,
+                                    unsigned int *const sse, int *const sum,
+                                    const int h) {
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n));
+    const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
+    const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
+
+    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+    n += 4;
+
+    if (n % 4 == 0) pre += pre_step;
+  } while (n < 4 * h);
+
+  *sum = xx_hsum_epi32_si32(v_sum_d);
+  *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+#endif  // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
diff --git a/libs/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h b/libs/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h
new file mode 100644
index 000000000..48486c6c4
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+
+static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  return _mm_cvtsi128_si32(v_d);
+}
+
+static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) {
+  v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
+#if ARCH_X86_64
+  return _mm_cvtsi128_si64(v_q);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, v_q);
+    return tmp;
+  }
+#endif
+}
+
+static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
+  const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
+  const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
+  const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
+  return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
+  const __m128i v_tmp_d =
+      _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+#endif  // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
diff --git a/libs/libaom/src/aom_dsp/x86/obmc_sad_avx2.c b/libs/libaom/src/aom_dsp/x86/obmc_sad_avx2.c
new file mode 100644
index 000000000..2aa2a0555
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/obmc_sad_avx2.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
+                                            const int pre_stride,
+                                            const int32_t *wsrc,
+                                            const int32_t *mask,
+                                            const int height) {
+  int n = 0;
+  __m256i v_sad_d = _mm256_setzero_si256();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+  do {
+    const __m128i v_p_b_0 = xx_loadl_32(pre);
+    const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
+    const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
+    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+    const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
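+    // (pmaddwd forms a0 * b0 + a1 * b1 from the two signed 16-bit halves
+    // of each 32-bit lane; with the high halves known to be zero, that
+    // reduces to the single 32-bit product pmulld would return.)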
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int obmc_sad_w8n_avx2( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_b = xx_loadl_64(pre + n); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); + const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); + + n += 8; + + if ((n & (width - 1)) == 0) pre += pre_step; + } while (n < width * height); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +#define OBMCSADWXH(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } + +OBMCSADWXH(128, 128) +OBMCSADWXH(128, 64) +OBMCSADWXH(64, 128) +OBMCSADWXH(64, 64) +OBMCSADWXH(64, 32) +OBMCSADWXH(32, 64) +OBMCSADWXH(32, 32) +OBMCSADWXH(32, 16) +OBMCSADWXH(16, 32) +OBMCSADWXH(16, 16) +OBMCSADWXH(16, 8) +OBMCSADWXH(8, 16) +OBMCSADWXH(8, 8) +OBMCSADWXH(8, 4) +OBMCSADWXH(4, 8) +OBMCSADWXH(4, 4) +OBMCSADWXH(4, 16) +OBMCSADWXH(16, 4) +OBMCSADWXH(8, 32) +OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + do { + const __m128i v_p_w_0 = 
xx_loadl_64(pre); + const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride); + const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1); + const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int hbd_obmc_sad_w8n_avx2( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n)); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
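+    // (The 15-bit bound still holds in this high bit-depth path: the
+    // samples behind CONVERT_TO_SHORTPTR are at most 12 bits, so the high
+    // 16-bit half of every 32-bit lane remains zero.)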
+    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+    // Rounded absolute difference
+    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+    n += 8;
+
+    if (n % width == 0) pre += pre_step;
+  } while (n < width * height);
+
+  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+  return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define HBD_OBMCSADWXH(w, h)                                           \
+  unsigned int aom_highbd_obmc_sad##w##x##h##_avx2(                    \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
+      const int32_t *mask) {                                           \
+    if (w == 4) {                                                      \
+      return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h);     \
+    } else {                                                           \
+      return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
+    }                                                                  \
+  }
+
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
diff --git a/libs/libaom/src/aom_dsp/x86/obmc_sad_sse4.c b/libs/libaom/src/aom_dsp/x86/obmc_sad_sse4.c
new file mode 100644
index 000000000..0338a8c77
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/obmc_sad_sse4.c
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
+                                                 const int pre_stride,
+                                                 const int32_t *wsrc,
+                                                 const int32_t *mask,
+                                                 const int height) {
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sad_d = _mm_setzero_si128();
+
+  do {
+    const __m128i v_p_b = xx_loadl_32(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
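+    // xx_roundn_epu32(v, 12) below is the same add-bias-then-shift sequence
+    // written out explicitly in the AVX2 version above: it computes
+    // (v + (1 << 11)) >> 12, an unsigned ROUND_POWER_OF_TWO of the
+    // absolute difference.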
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +static AOM_FORCE_INLINE unsigned int obmc_sad_w8n( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const int pre_step = pre_stride - width; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p1_b = xx_loadl_32(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_b = xx_loadl_32(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); + const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); + const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); + + // Rounded absolute difference + const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); + const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); + v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +#define OBMCSADWXH(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } + +OBMCSADWXH(128, 128) +OBMCSADWXH(128, 64) +OBMCSADWXH(64, 128) +OBMCSADWXH(64, 64) +OBMCSADWXH(64, 32) +OBMCSADWXH(32, 64) +OBMCSADWXH(32, 32) +OBMCSADWXH(32, 16) +OBMCSADWXH(16, 32) +OBMCSADWXH(16, 16) +OBMCSADWXH(16, 8) +OBMCSADWXH(8, 16) +OBMCSADWXH(8, 8) +OBMCSADWXH(8, 4) +OBMCSADWXH(4, 8) +OBMCSADWXH(4, 4) +OBMCSADWXH(4, 16) +OBMCSADWXH(16, 4) +OBMCSADWXH(8, 32) +OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + do { + const __m128i v_p_w = xx_loadl_64(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = 
_mm_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p1_w = xx_loadl_64(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_w = xx_loadl_64(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); + const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); + const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); + + // Rounded absolute difference + const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); + const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); + v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +#define HBD_OBMCSADWXH(w, h) \ + unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + if (w == 4) { \ + return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \ + } else { \ + return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \ + } \ + } + +HBD_OBMCSADWXH(128, 128) +HBD_OBMCSADWXH(128, 64) +HBD_OBMCSADWXH(64, 128) +HBD_OBMCSADWXH(64, 64) +HBD_OBMCSADWXH(64, 32) +HBD_OBMCSADWXH(32, 64) +HBD_OBMCSADWXH(32, 32) +HBD_OBMCSADWXH(32, 16) +HBD_OBMCSADWXH(16, 32) +HBD_OBMCSADWXH(16, 16) +HBD_OBMCSADWXH(16, 8) +HBD_OBMCSADWXH(8, 16) +HBD_OBMCSADWXH(8, 8) +HBD_OBMCSADWXH(8, 4) +HBD_OBMCSADWXH(4, 8) +HBD_OBMCSADWXH(4, 4) +HBD_OBMCSADWXH(4, 16) +HBD_OBMCSADWXH(16, 4) +HBD_OBMCSADWXH(8, 32) +HBD_OBMCSADWXH(32, 8) +HBD_OBMCSADWXH(16, 64) +HBD_OBMCSADWXH(64, 16) diff --git a/libs/libaom/src/aom_dsp/x86/obmc_variance_avx2.c b/libs/libaom/src/aom_dsp/x86/obmc_variance_avx2.c new file mode 100644 index 000000000..bfec0e8a8 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/obmc_variance_avx2.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2018, 
Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+                                     const int32_t *wsrc, const int32_t *mask,
+                                     unsigned int *const sse, int *const sum,
+                                     const int w, const int h) {
+  int n = 0, width, height = h;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+  __m128i v_d;
+  const uint8_t *pre_temp;
+  assert(w >= 8);
+  assert(IS_POWER_OF_TWO(w));
+  assert(IS_POWER_OF_TWO(h));
+  do {
+    width = w;
+    pre_temp = pre;
+    do {
+      const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);
+      const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+      const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+      const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+
+      // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+      // boundaries. We use pmaddwd, as it has lower latency on Haswell
+      // than pmulld but produces the same result with these inputs.
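+      // (pmaddwd multiplies the 16-bit lane pairs and adds adjacent
+      // products; because the upper 16 bits of every 32-bit lane are zero
+      // here, each pair reduces to pre * mask with no cross terms.)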
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d); + const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d); + + const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_tmp_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d); + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12); + const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d); + const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1); + + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 8; + n += 8; + width -= 8; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + v_d = _mm_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm_hadd_epi32(v_d, v_d); + *sum = _mm_cvtsi128_si32(v_d); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4)); +} + +static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + int n = 0, width, height = h; + __m256i v_d; + __m128i res0; + const uint8_t *pre_temp; + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + + assert(w >= 16); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + do { + width = w; + pre_temp = pre; + do { + const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp); + const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n)); + const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); + const __m256i v_m1_d = + _mm256_loadu_si256((__m256i const *)(mask + n + 8)); + const __m256i v_w1_d = + _mm256_loadu_si256((__m256i const *)(wsrc + n + 8)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); + const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8)); + + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d); + + const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31); + + const __m256i v_tmp0_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d); + const __m256i v_tmp1_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d); + + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12); + const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12); + + const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 16; + n += 16; + width -= 16; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + + v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm256_hadd_epi32(v_d, v_d); + res0 = _mm256_castsi256_si128(v_d); + res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1)); + *sum = _mm_cvtsi128_si32(res0); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4)); +} + 
+#define OBMCVARWXH(W, H)                                                \
+  unsigned int aom_obmc_variance##W##x##H##_avx2(                       \
+      const uint8_t *pre, int pre_stride, const int32_t *wsrc,          \
+      const int32_t *mask, unsigned int *sse) {                         \
+    int sum;                                                            \
+    if (W == 4) {                                                       \
+      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);      \
+    } else if (W == 8) {                                                \
+      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H);  \
+    } else {                                                            \
+      obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+    }                                                                   \
+                                                                        \
+    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));       \
+  }
+
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+OBMCVARWXH(4, 16)
+OBMCVARWXH(16, 4)
+OBMCVARWXH(8, 32)
+OBMCVARWXH(32, 8)
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
diff --git a/libs/libaom/src/aom_dsp/x86/obmc_variance_sse4.c b/libs/libaom/src/aom_dsp/x86/obmc_variance_sse4.c
new file mode 100644
index 000000000..aa73c392d
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/obmc_variance_sse4.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter);
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter);
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+                                     const int32_t *wsrc, const int32_t *mask,
+                                     unsigned int *const sse, int *const sum,
+                                     const int w, const int h) {
+  const int pre_step = pre_stride - w;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(w >= 8);
+  assert(IS_POWER_OF_TWO(w));
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_b = xx_loadl_32(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed
at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + + const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); + const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 8; + + if (n % w == 0) pre += pre_step; + } while (n < w * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +#define OBMCVARWXH(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + if (W == 4) { \ + obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ + } else { \ + obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +OBMCVARWXH(128, 128) +OBMCVARWXH(128, 64) +OBMCVARWXH(64, 128) +OBMCVARWXH(64, 64) +OBMCVARWXH(64, 32) +OBMCVARWXH(32, 64) +OBMCVARWXH(32, 32) +OBMCVARWXH(32, 16) +OBMCVARWXH(16, 32) +OBMCVARWXH(16, 16) +OBMCVARWXH(16, 8) +OBMCVARWXH(8, 16) +OBMCVARWXH(8, 8) +OBMCVARWXH(8, 4) +OBMCVARWXH(4, 8) +OBMCVARWXH(4, 4) +OBMCVARWXH(4, 16) +OBMCVARWXH(16, 4) +OBMCVARWXH(8, 32) +OBMCVARWXH(32, 8) +OBMCVARWXH(16, 64) +OBMCVARWXH(64, 16) + +#include "config/aom_dsp_rtcd.h" + +#define OBMC_SUBPIX_VAR(W, H) \ + uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \ + } + +OBMC_SUBPIX_VAR(128, 128) +OBMC_SUBPIX_VAR(128, 64) +OBMC_SUBPIX_VAR(64, 128) +OBMC_SUBPIX_VAR(64, 64) +OBMC_SUBPIX_VAR(64, 32) +OBMC_SUBPIX_VAR(32, 64) +OBMC_SUBPIX_VAR(32, 32) +OBMC_SUBPIX_VAR(32, 16) +OBMC_SUBPIX_VAR(16, 32) +OBMC_SUBPIX_VAR(16, 16) +OBMC_SUBPIX_VAR(16, 8) +OBMC_SUBPIX_VAR(8, 16) +OBMC_SUBPIX_VAR(8, 8) +OBMC_SUBPIX_VAR(8, 4) +OBMC_SUBPIX_VAR(4, 8) +OBMC_SUBPIX_VAR(4, 4) +OBMC_SUBPIX_VAR(4, 16) +OBMC_SUBPIX_VAR(16, 4) +OBMC_SUBPIX_VAR(8, 32) +OBMC_SUBPIX_VAR(32, 8) +OBMC_SUBPIX_VAR(16, 64) +OBMC_SUBPIX_VAR(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void hbd_obmc_variance_w4( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sum_d = 
_mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p_w = xx_loadl_64(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); + const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +static INLINE void hbd_obmc_variance_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w, + const int h) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - w; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p1_w = xx_loadl_64(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_w = xx_loadl_64(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); + const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. 
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + + const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); + const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 8; + + if (n % w == 0) pre += pre_step; + } while (n < w * h); + + *sum += xx_hsum_epi32_si64(v_sum_d); + *sse += xx_hsum_epi32_si64(v_sse_d); +} + +static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else if (w < 128 || h < 128) { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } else { + assert(w == 128 && h == 128); + + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, + 64); + pre8 += 64 * pre_stride; + wsrc += 64 * w; + mask += 64 * w; + h -= 64; + } while (h > 0); + } + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); +} + +static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + int max_pel_allowed_per_ovf = 512; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else if (w * h <= max_pel_allowed_per_ovf) { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } else { + int h_per_ovf = max_pel_allowed_per_ovf / w; + + assert(max_pel_allowed_per_ovf % w == 0); + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, + h_per_ovf); + pre8 += h_per_ovf * pre_stride; + wsrc += h_per_ovf * w; + mask += h_per_ovf * w; + h -= h_per_ovf; + } while (h > 0); + } + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HBD_OBMCVARWXH(W, H) \ + unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = 
(int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +HBD_OBMCVARWXH(128, 128) +HBD_OBMCVARWXH(128, 64) +HBD_OBMCVARWXH(64, 128) +HBD_OBMCVARWXH(64, 64) +HBD_OBMCVARWXH(64, 32) +HBD_OBMCVARWXH(32, 64) +HBD_OBMCVARWXH(32, 32) +HBD_OBMCVARWXH(32, 16) +HBD_OBMCVARWXH(16, 32) +HBD_OBMCVARWXH(16, 16) +HBD_OBMCVARWXH(16, 8) +HBD_OBMCVARWXH(8, 16) +HBD_OBMCVARWXH(8, 8) +HBD_OBMCVARWXH(8, 4) +HBD_OBMCVARWXH(4, 8) +HBD_OBMCVARWXH(4, 4) +HBD_OBMCVARWXH(4, 16) +HBD_OBMCVARWXH(16, 4) +HBD_OBMCVARWXH(8, 32) +HBD_OBMCVARWXH(32, 8) +HBD_OBMCVARWXH(16, 64) +HBD_OBMCVARWXH(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm b/libs/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm new file mode 100644 index 000000000..d6e15c4be --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm @@ -0,0 +1,464 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + + vzeroupper + +%ifnidn %1, b_32x32 + + ; Special case for ncoeff == 16, as it is frequent and we can save on + ; not setting up a loop. + cmp ncoeffmp, 16 + jne .generic + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Special case of ncoeff == 16 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +.single: + + movifnidn coeffq, coeffmp + movifnidn zbinq, zbinmp + mova m0, [zbinq] ; m0 = zbin + + ; Get DC and first 15 AC coeffs - in this special case, that is all. + ; coeff stored as 32bit numbers but we process them as 16 bit numbers + mova m9, [coeffq] + packssdw m9, [coeffq+16] ; m9 = c[i] + mova m10, [coeffq+32] + packssdw m10, [coeffq+48] ; m10 = c[i] + + mov r0, eobmp ; Output pointer + mov r1, qcoeffmp ; Output pointer + mov r2, dqcoeffmp ; Output pointer + + pxor m5, m5 ; m5 = dedicated zero + + pcmpeqw m4, m4 ; All word lanes -1 + paddw m0, m4 ; m0 = zbin - 1 + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + + ; Check if all coeffs are less than zbin. If yes, we just write zeros + ; to the outputs and we are done. 
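+  ; (por below merges the two zbin masks; ptest sets ZF only when the
+  ; merged mask is all zero, so jnz skips the zero-fill path whenever at
+  ; least one coefficient reached the zbin threshold.)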
+ por m14, m7, m12 + ptest m14, m14 + jnz .single_nonzero + + mova [r1 ], ymm5 + mova [r1+32], ymm5 + mova [r2 ], ymm5 + mova [r2+32], ymm5 + mov [r0], word 0 + + vzeroupper + RET + +.single_nonzero: + + ; Actual quantization of size 16 block - setup pointers, rounders, etc. + movifnidn r3, roundmp + movifnidn r4, quantmp + mov r6, dequantmp + mov r5, shiftmp + mova m1, [r3] ; m1 = round + mova m2, [r4] ; m2 = quant + mova m3, [r6] ; m3 = dequant + mova m4, [r5] ; m4 = shift + + mov r3, iscanmp + + DEFINE_ARGS eob, qcoeff, dqcoeff, iscan + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m8, m4 ; m8 = m8*qsh>>16 + punpckhqdq m4, m4 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + + ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [qcoeffq ], m11 + mova [qcoeffq+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [qcoeffq+32], m11 + mova [qcoeffq+48], m6 + + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q + + ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [dqcoeffq ], m11 + mova [dqcoeffq+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [dqcoeffq+32], m11 + mova [dqcoeffq+48], m6 + + mova m6, [iscanq] ; m6 = scan[i] + mova m11, [iscanq+16] ; m11 = scan[i] + + pcmpeqw m8, m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m13, m5 ; m13 = c[i] == 0 + psubw m6, m6, m7 ; m6 = scan[i] + 1 + psubw m11, m11, m12 ; m11 = scan[i] + 1 + pandn m8, m8, m6 ; m8 = max(eob) + pandn m13, m13, m11 ; m13 = max(eob) + pmaxsw m8, m8, m13 + + ; Horizontally accumulate/max eobs and write into [eob] memory pointer + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + movq rax, m8 + mov [eobq], ax + + vzeroupper + RET + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Generic case of ncoeff != 16 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +.generic: + +%endif ; %ifnidn %1, b_32x32 + +DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ + qcoeff, dqcoeff, dequant, eob, scan, iscan + + ; Actual quantization loop - setup pointers, rounders, etc. 
+ movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + movifnidn dequantq, dequantmp + mova m0, [zbinq] ; m0 = zbin + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant + mova m3, [dequantq] ; m3 = dequant + pcmpeqw m4, m4 ; All lanes -1 +%ifidn %1, b_32x32 + psubw m0, m4 + psubw m1, m4 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + paddw m0, m4 ; m0 = m0 + 1 + + mov r2, shiftmp + mov r3, qcoeffmp + mova m4, [r2] ; m4 = shift + mov r4, dqcoeffmp + mov r5, iscanmp + pxor m5, m5 ; m5 = dedicated zero + + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob + + + lea coeffq, [ coeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] + lea dqcoeffq, [dqcoeffq+ncoeffq*4] + + lea iscanq, [ iscanq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + ; coeff stored as 32bit numbers & require 16bit numbers + mova m9, [coeffq+ncoeffq*4+ 0] + packssdw m9, [coeffq+ncoeffq*4+16] + mova m10, [coeffq+ncoeffq*4+32] + packssdw m10, [coeffq+ncoeffq*4+48] + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + + ; Check if all coeffs are less than zbin. If yes, skip forward quickly. + por m14, m7, m12 + ptest m14, m14 + jnz .first_nonzero + + mova [qcoeffq+ncoeffq*4 ], ymm5 + mova [qcoeffq+ncoeffq*4+32], ymm5 + mova [dqcoeffq+ncoeffq*4 ], ymm5 + mova [dqcoeffq+ncoeffq*4+32], ymm5 + add ncoeffq, mmsize + + punpckhqdq m1, m1 + punpckhqdq m2, m2 + punpckhqdq m3, m3 + punpckhqdq m4, m4 + pxor m8, m8 + + jmp .ac_only_loop + +.first_nonzero: + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh + %endif + pmulhw m8, m4 ; m8 = m8*qsh>>16 + %ifidn %1, b_32x32 + psllw m8, 1 + psrlw m5, 15 + por m8, m5 + %endif + punpckhqdq m4, m4 + %ifidn %1, b_32x32 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif + pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif + + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [iscanq+ncoeffq*2] ; m6 = 
scan[i]
+  mova       m11, [iscanq+ncoeffq*2+16]      ; m11 = scan[i]
+  psubw      m6, m7                          ; m6 = scan[i] + 1
+  psubw      m11, m12                        ; m11 = scan[i] + 1
+  pandn      m8, m6                          ; m8 = max(eob)
+  pandn      m13, m11                        ; m13 = max(eob)
+  pmaxsw     m8, m13
+  add        ncoeffq, mmsize
+
+.ac_only_loop:
+
+  ; pack coeff from 32bit to 16bit array
+  mova       m9, [coeffq+ncoeffq*4+ 0]
+  packssdw   m9, [coeffq+ncoeffq*4+16]
+  mova       m10, [coeffq+ncoeffq*4+32]
+  packssdw   m10, [coeffq+ncoeffq*4+48]
+
+  pabsw      m6, m9                          ; m6 = abs(m9)
+  pabsw      m11, m10                        ; m11 = abs(m10)
+  pcmpgtw    m7, m6, m0                      ; m7 = c[i] >= zbin
+  pcmpgtw    m12, m11, m0                    ; m12 = c[i] >= zbin
+
+  ; Check if all coeffs are less than zbin. If yes, skip this iteration
+  ; and just write out zeros, since that is what the result would be.
+  por        m14, m7, m12
+  ptest      m14, m14
+  jnz .rest_nonzero
+
+  mova       [qcoeffq+ncoeffq*4+ 0], ymm5
+  mova       [qcoeffq+ncoeffq*4+32], ymm5
+  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
+  mova       [dqcoeffq+ncoeffq*4+32], ymm5
+
+  add        ncoeffq, mmsize
+  jnz .ac_only_loop
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov        r2, eobmp
+  pshufd     m7, m8, 0xe
+  pmaxsw     m8, m7
+  pshuflw    m7, m8, 0xe
+  pmaxsw     m8, m7
+  pshuflw    m7, m8, 0x1
+  pmaxsw     m8, m7
+  movq       rax, m8
+  mov        [r2], ax
+  vzeroupper
+  RET
+
+.rest_nonzero:
+  paddsw     m6, m1                          ; m6 += round
+  paddsw     m11, m1                         ; m11 += round
+  pmulhw     m14, m6, m2                     ; m14 = m6*q>>16
+  pmulhw     m13, m11, m2                    ; m13 = m11*q>>16
+  paddw      m14, m6                         ; m14 += m6
+  paddw      m13, m11                        ; m13 += m11
+  %ifidn %1, b_32x32
+  pmullw     m5, m14, m4                     ; store the lower 16 bits of m14*qsh
+  %endif
+  pmulhw     m14, m4                         ; m14 = m14*qsh>>16
+  %ifidn %1, b_32x32
+  psllw      m14, 1
+  psrlw      m5, 15
+  por        m14, m5
+  pmullw     m5, m13, m4                     ; store the lower 16 bits of m13*qsh
+  %endif
+  pmulhw     m13, m4                         ; m13 = m13*qsh>>16
+  %ifidn %1, b_32x32
+  psllw      m13, 1
+  psrlw      m5, 15
+  por        m13, m5
+  pxor       m5, m5                          ; reset m5 to zero register
+  %endif
+  psignw     m14, m9                         ; m14 = reinsert sign
+  psignw     m13, m10                        ; m13 = reinsert sign
+  pand       m14, m7
+  pand       m13, m12
+
+  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+  pcmpgtw    m6, m5, m14
+  punpckhwd  m6, m14, m6
+  pmovsxwd   m11, m14
+  mova       [qcoeffq+ncoeffq*4+ 0], m11
+  mova       [qcoeffq+ncoeffq*4+16], m6
+  pcmpgtw    m6, m5, m13
+  punpckhwd  m6, m13, m6
+  pmovsxwd   m11, m13
+  mova       [qcoeffq+ncoeffq*4+32], m11
+  mova       [qcoeffq+ncoeffq*4+48], m6
+
+%ifidn %1, b_32x32
+  pabsw      m14, m14
+  pabsw      m13, m13
+%endif
+  pmullw     m14, m3                         ; dqc[i] = qc[i] * q
+  pmullw     m13, m3                         ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw      m14, 1
+  psrlw      m13, 1
+  psignw     m14, m9
+  psignw     m13, m10
+%endif
+
+  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+  pcmpgtw    m6, m5, m14
+  punpckhwd  m6, m14, m6
+  pmovsxwd   m11, m14
+  mova       [dqcoeffq+ncoeffq*4+ 0], m11
+  mova       [dqcoeffq+ncoeffq*4+16], m6
+  pcmpgtw    m6, m5, m13
+  punpckhwd  m6, m13, m6
+  pmovsxwd   m11, m13
+  mova       [dqcoeffq+ncoeffq*4+32], m11
+  mova       [dqcoeffq+ncoeffq*4+48], m6
+
+  pcmpeqw    m14, m5                         ; m14 = c[i] == 0
+  pcmpeqw    m13, m5                         ; m13 = c[i] == 0
+  mova       m6, [iscanq+ncoeffq*2+ 0]       ; m6 = scan[i]
+  mova       m11, [iscanq+ncoeffq*2+16]      ; m11 = scan[i]
+  psubw      m6, m7                          ; m6 = scan[i] + 1
+  psubw      m11, m12                        ; m11 = scan[i] + 1
+  pandn      m14, m6                         ; m14 = max(eob)
+  pandn      m13, m11                        ; m13 = max(eob)
+  pmaxsw     m8, m14
+  pmaxsw     m8, m13
+  add        ncoeffq, mmsize
+  jnz .ac_only_loop
+
+  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
+  mov        r2, eobmp
+  pshufd     m7, m8, 0xe
+  pmaxsw     m8, m7
+  pshuflw    m7, m8, 0xe
+  pmaxsw     m8, m7
+  pshuflw    m7, m8, 0x1
+  pmaxsw     m8, m7
+  movq       rax, m8
+  mov        [r2], ax
+  vzeroupper
+  RET
+%endmacro
+
+INIT_XMM avx
+QUANTIZE_FN b, 9
+QUANTIZE_FN b_32x32, 9
diff --git a/libs/libaom/src/aom_dsp/x86/quantize_sse2.c b/libs/libaom/src/aom_dsp/x86/quantize_sse2.c
new file mode 100644
index 000000000..ebef1fbac
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/quantize_sse2.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
+                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                         uint16_t *eob_ptr, const int16_t *scan_ptr,
+                         const int16_t *iscan_ptr) {
+  const __m128i zero = _mm_setzero_si128();
+  int index = 16;
+
+  __m128i zbin, round, quant, dequant, shift;
+  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i eob, eob0;
+
+  (void)scan_ptr;
+
+  // Setup global values.
+  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+                dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+  // Do DC and first 15 AC.
+  coeff0 = load_coefficients(coeff_ptr);
+  coeff1 = load_coefficients(coeff_ptr + 8);
+
+  // Poor man's abs().
+  coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+  calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+  round = _mm_unpackhi_epi64(round, round);
+  quant = _mm_unpackhi_epi64(quant, quant);
+  shift = _mm_unpackhi_epi64(shift, shift);
+
+  calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+  // Reinsert signs
+  qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+  qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+  // Mask out zbin threshold coeffs
+  qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+  qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+  store_coefficients(qcoeff0, qcoeff_ptr);
+  store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+  coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+  dequant = _mm_unpackhi_epi64(dequant, dequant);
+  coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+  store_coefficients(coeff0, dqcoeff_ptr);
+  store_coefficients(coeff1, dqcoeff_ptr + 8);
+
+  eob =
+      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+
+  // AC only loop.
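+  // (By this point zbin, round, quant, shift and dequant have all been
+  // moved to their AC lanes via _mm_unpackhi_epi64, so each remaining
+  // group of 16 coefficients is processed with the AC constants only.)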
+  while (index < n_coeffs) {
+    coeff0 = load_coefficients(coeff_ptr + index);
+    coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+    coeff0_sign = _mm_srai_epi16(coeff0, 15);
+    coeff1_sign = _mm_srai_epi16(coeff1, 15);
+    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+    calculate_qcoeff(&qcoeff0, round, quant, shift);
+    calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+    store_coefficients(qcoeff0, qcoeff_ptr + index);
+    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+    coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+    store_coefficients(coeff0, dqcoeff_ptr + index);
+    store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+                        index, zero);
+    eob = _mm_max_epi16(eob, eob0);
+
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/libs/libaom/src/aom_dsp/x86/quantize_ssse3.c b/libs/libaom/src/aom_dsp/x86/quantize_ssse3.c
new file mode 100644
index 000000000..25980a055
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/quantize_ssse3.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round,
+                                          const __m128i quant,
+                                          const __m128i *shift) {
+  __m128i tmp, qcoeff, tmp1;
+  qcoeff = _mm_adds_epi16(*coeff, round);
+  tmp = _mm_mulhi_epi16(qcoeff, quant);
+  qcoeff = _mm_add_epi16(tmp, qcoeff);
+  tmp = _mm_mullo_epi16(qcoeff, *shift);
+  tmp = _mm_srli_epi16(tmp, 14);
+  tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+  tmp1 = _mm_slli_epi16(tmp1, 2);
+  *coeff = _mm_or_si128(tmp, tmp1);
+}
+
+static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff,
+                                                     const __m128i dequant,
+                                                     const __m128i zero,
+                                                     tran_low_t *dqcoeff) {
+  // Un-sign to bias rounding like C.
+  const __m128i coeff = _mm_abs_epi16(qcoeff);
+
+  const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
+  const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
+
+  const __m128i low = _mm_mullo_epi16(coeff, dequant);
+  const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+  __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+  __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+  // "Divide" by 4.
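+  // (The 64x64 quantizer works at log_scale == 2: the caller pre-scales
+  // zbin and round by 1/4, and the dequantized product is scaled back here
+  // with a plain >> 2 before the sign is restored via _mm_sign_epi32.)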
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2); + + dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); + dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +} + +void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i two = _mm_set1_epi16(2); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, all_zero; + __m128i eob = zero, eob0; + + (void)scan; + (void)n_coeffs; + + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, two); + round = _mm_add_epi16(round, two); + zbin = _mm_srli_epi16(zbin, 2); + round = _mm_srli_epi16(round, 2); + zbin = _mm_sub_epi16(zbin, one); + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + } + + // AC only loop. 
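+  // (Only the top-left 32x32 corner of a 64x64 transform block can hold
+  // nonzero coefficients, i.e. exactly 1024 values; that is why n_coeffs
+  // is ignored and the loop bound below is hard-coded.)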
+ for (index = 16; index < 1024; index += 16) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + continue; + } + calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); + calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, + dqcoeff_ptr + 8 + index); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libs/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm b/libs/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm new file mode 100644 index 000000000..fa616a6f1 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -0,0 +1,302 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + + ; actual quantize loop - setup pointers, rounders, etc. 
+ movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + movifnidn dequantq, dequantmp + mova m0, [zbinq] ; m0 = zbin + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, b_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m0, m5 + paddw m1, m5 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [dequantq] ; m3 = dequant + mov r2, shiftmp + psubw m0, [GLOBAL(pw_1)] + mova m4, [r2] ; m4 = shift + mov r3, qcoeffmp + mov r4, dqcoeffmp + mov r5, iscanmp + pxor m5, m5 ; m5 = dedicated zero + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob + lea coeffq, [ coeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] + lea dqcoeffq, [dqcoeffq+ncoeffq*4] + lea iscanq, [ iscanq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + ; coeff stored as 32bit numbers & require 16bit numbers + mova m9, [ coeffq+ncoeffq*4+ 0] + packssdw m9, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + packssdw m10, [ coeffq+ncoeffq*4+48] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh + %endif + pmulhw m8, m4 ; m8 = m8*qsh>>16 + %ifidn %1, b_32x32 + psllw m8, 1 + psrlw m5, 15 + por m8, m5 + %endif + punpckhqdq m4, m4 + %ifidn %1, b_32x32 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif + pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register + +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz 
.accumulate_eob + +.ac_only_loop: + ; pack coeff from 32bit to 16bit array + mova m9, [ coeffq+ncoeffq*4+ 0] + packssdw m9, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + packssdw m10, [ coeffq+ncoeffq*4+48] + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin +%ifidn %1, b_32x32 + pmovmskb r6d, m7 + pmovmskb r2d, m12 + or r6, r2 + jz .skip_iter +%endif + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m14, m6 ; m14 += m6 + paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh + %endif + pmulhw m14, m4 ; m14 = m14*qsh>>16 + %ifidn %1, b_32x32 + psllw m14, 1 + psrlw m5, 15 + por m14, m5 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif + pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m14, m7 + pand m13, m12 + + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pxor m11, m11 + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register + +%ifidn %1, b_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 + + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + +%ifidn %1, b_32x32 + jmp .accumulate_eob +.skip_iter: + mova [qcoeffq+ncoeffq*4+ 0], m5 + mova [qcoeffq+ncoeffq*4+16], m5 + mova [qcoeffq+ncoeffq*4+32], m5 + mova [qcoeffq+ncoeffq*4+48], m5 + mova [dqcoeffq+ncoeffq*4+ 0], m5 + mova [dqcoeffq+ncoeffq*4+16], m5 + mova [dqcoeffq+ncoeffq*4+32], m5 + mova [dqcoeffq+ncoeffq*4+48], m5 + add ncoeffq, mmsize + jl .ac_only_loop +%endif + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw r6, m8, 0 + mov [r2], r6 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FN b, 9 +QUANTIZE_FN b_32x32, 9 diff --git a/libs/libaom/src/aom_dsp/x86/quantize_x86.h b/libs/libaom/src/aom_dsp/x86/quantize_x86.h new file mode 100644 index 000000000..5b040a278 --- 
/dev/null
+++ b/libs/libaom/src/aom_dsp/x86/quantize_x86.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom/aom_integer.h"
+
+static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
+                                 const int16_t *round_ptr, __m128i *round,
+                                 const int16_t *quant_ptr, __m128i *quant,
+                                 const int16_t *dequant_ptr, __m128i *dequant,
+                                 const int16_t *shift_ptr, __m128i *shift) {
+  *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+  *round = _mm_load_si128((const __m128i *)round_ptr);
+  *quant = _mm_load_si128((const __m128i *)quant_ptr);
+  *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+  *shift = _mm_load_si128((const __m128i *)shift_ptr);
+}
+
+// With SSSE3 and later, abs() and sign() are preferred.
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
+  a = _mm_xor_si128(a, sign);
+  return _mm_sub_epi16(a, sign);
+}
+
+static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
+  a = _mm_xor_si128(a, sign);
+  return _mm_sub_epi32(a, sign);
+}
+
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
+                                    const __m128i quant, const __m128i shift) {
+  __m128i tmp, qcoeff;
+  qcoeff = _mm_adds_epi16(*coeff, round);
+  tmp = _mm_mulhi_epi16(qcoeff, quant);
+  qcoeff = _mm_add_epi16(tmp, qcoeff);
+  *coeff = _mm_mulhi_epi16(qcoeff, shift);
+}
+
+static INLINE void calculate_qcoeff_log_scale(__m128i *coeff,
+                                              const __m128i round,
+                                              const __m128i quant,
+                                              const __m128i *shift,
+                                              const int *log_scale) {
+  __m128i tmp, tmp1, qcoeff;
+  qcoeff = _mm_adds_epi16(*coeff, round);
+  tmp = _mm_mulhi_epi16(qcoeff, quant);
+  qcoeff = _mm_add_epi16(tmp, qcoeff);
+  tmp = _mm_mullo_epi16(qcoeff, *shift);
+  tmp = _mm_srli_epi16(tmp, (16 - *log_scale));
+  tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+  tmp1 = _mm_slli_epi16(tmp1, *log_scale);
+  *coeff = _mm_or_si128(tmp, tmp1);
+}
+
+static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
+  return _mm_mullo_epi16(qcoeff, dequant);
+}
+
+static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff,
+                                                         __m128i dequant,
+                                                         const __m128i zero,
+                                                         tran_low_t *dqcoeff,
+                                                         const int *log_scale) {
+  // calculate abs
+  __m128i coeff_sign = _mm_srai_epi16(qcoeff, 15);
+  __m128i coeff = invert_sign_sse2(qcoeff, coeff_sign);
+
+  const __m128i sign_0 = _mm_unpacklo_epi16(coeff_sign, zero);
+  const __m128i sign_1 = _mm_unpackhi_epi16(coeff_sign, zero);
+
+  const __m128i low = _mm_mullo_epi16(coeff, dequant);
+  const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+  __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+  __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+  dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, *log_scale);
+  dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, *log_scale);
+
+  dqcoeff32_0 = invert_sign_32_sse2(dqcoeff32_0, sign_0);
+  dqcoeff32_1 = invert_sign_32_sse2(dqcoeff32_1, sign_1);
+
+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
+// Scan 16 values
for eob reference in scan_ptr. Use masks (-1) from comparing +// to zbin to add 1 to the index in 'scan'. +static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const __m128i zbin_mask0, + const __m128i zbin_mask1, + const int16_t *scan_ptr, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i eob0, eob1; + // Add one to convert from indices to counts + scan0 = _mm_sub_epi16(scan0, zbin_mask0); + scan1 = _mm_sub_epi16(scan1, zbin_mask1); + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr)); + const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + return _mm_packs_epi32(coeff1, coeff2); +} + +static INLINE void store_coefficients(__m128i coeff_vals, + tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); +} + +static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i all_zero; + __m128i temp_mask = _mm_setzero_si128(); + all_zero = _mm_or_si128(*cmp_mask0, *cmp_mask1); + if (_mm_movemask_epi8(all_zero)) { + __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); + __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); + __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8)); + __m128i mask1 = _mm_and_si128(*cmp_mask1, iscan1); + temp_mask = _mm_max_epi16(mask0, mask1); + *is_found = 1; + } + *mask = _mm_max_epi16(temp_mask, *mask); +} + +static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, + __m128i *threshold, const int16_t *iscan_ptr, + int *is_found, __m128i *mask) { + __m128i zero = _mm_setzero_si128(); + __m128i coeff[4], cmp_mask0, cmp_mask1, cmp_mask2, cmp_mask3; + + coeff[0] = _mm_unpacklo_epi16(*qcoeff0, zero); + coeff[1] = _mm_unpackhi_epi16(*qcoeff0, zero); + coeff[2] = _mm_unpacklo_epi16(*qcoeff1, zero); + coeff[3] = _mm_unpackhi_epi16(*qcoeff1, zero); + + coeff[0] = _mm_slli_epi32(coeff[0], AOM_QM_BITS); + cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm_slli_epi32(coeff[1], AOM_QM_BITS); + cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); + coeff[2] = _mm_slli_epi32(coeff[2], AOM_QM_BITS); + cmp_mask2 = _mm_cmpgt_epi32(coeff[2], threshold[1]); + coeff[3] = _mm_slli_epi32(coeff[3], AOM_QM_BITS); + 
cmp_mask3 = _mm_cmpgt_epi32(coeff[3], threshold[1]);
+
+  cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+  cmp_mask1 = _mm_packs_epi32(cmp_mask2, cmp_mask3);
+
+  update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask);
+}
+
+static INLINE int calculate_non_zero_count(__m128i mask) {
+  __m128i mask0, mask1;
+  int non_zero_count = 0;
+  mask0 = _mm_unpackhi_epi64(mask, mask);
+  mask1 = _mm_max_epi16(mask0, mask);
+  mask0 = _mm_shuffle_epi32(mask1, 1);
+  mask0 = _mm_max_epi16(mask0, mask1);
+  mask1 = _mm_srli_epi32(mask0, 16);
+  mask0 = _mm_max_epi16(mask0, mask1);
+  non_zero_count = _mm_extract_epi16(mask0, 0) + 1;
+
+  return non_zero_count;
+}
diff --git a/libs/libaom/src/aom_dsp/x86/sad4d_avx2.c b/libs/libaom/src/aom_dsp/x86/sad4d_avx2.c
new file mode 100644
index 000000000..077125258
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/sad4d_avx2.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>  // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+void aom_sadMxNx4d_avx2(int M, int N, const uint8_t *src, int src_stride,
+                        const uint8_t *const ref[4], int ref_stride,
+                        uint32_t res[4]) {
+  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+  int i, j;
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+  sum_ref0 = _mm256_setzero_si256();
+  sum_ref2 = _mm256_setzero_si256();
+  sum_ref1 = _mm256_setzero_si256();
+  sum_ref3 = _mm256_setzero_si256();
+
+  for (i = 0; i < N; i++) {
+    for (j = 0; j < M; j += 32) {
+      // load src and all refs
+      src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
+      ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
+      ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
+      ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
+      ref3_reg = _mm256_loadu_si256((const __m256i *)(ref3 + j));
+
+      // sum of the absolute differences between every ref-i to src
+      ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+      ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+      ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+      ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+      // sum every ref-i
+      sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+      sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+      sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+      sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+    }
+    src += src_stride;
+    ref0 += ref_stride;
+    ref1 += ref_stride;
+    ref2 += ref_stride;
+    ref3 += ref_stride;
+  }
+  {
+    __m128i sum;
+    __m256i sum_mlow, sum_mhigh;
+    // in sum_ref-i the result is saved in the first 4 bytes
+    // the other 4 bytes are zeroed.
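+    // Viewed as 32-bit lanes that makes sum_ref0 = [a0 0 a1 0 | a2 0 a3 0],
+    // with a0..a3 the four 64-bit psadbw accumulators for ref-0.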
+    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
+    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
+    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
+
+    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
+    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
+    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
+
+    // merge every 64 bit from each sum_ref-i
+    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
+    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
+
+    // add the low 64 bit to the high 64 bit
+    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
+
+    // add the low 128 bit to the high 128 bit
+    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
+                        _mm256_extractf128_si256(sum_mlow, 1));
+
+    _mm_storeu_si128((__m128i *)(res), sum);
+  }
+}
+
+#define sadMxN_avx2(m, n)                                                      \
+  void aom_sad##m##x##n##x4d_avx2(const uint8_t *src, int src_stride,          \
+                                  const uint8_t *const ref[4], int ref_stride, \
+                                  uint32_t res[4]) {                           \
+    aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res);           \
+  }
+
+sadMxN_avx2(32, 8);
+sadMxN_avx2(32, 16);
+sadMxN_avx2(32, 32);
+sadMxN_avx2(32, 64);
+
+sadMxN_avx2(64, 16);
+sadMxN_avx2(64, 32);
+sadMxN_avx2(64, 64);
+sadMxN_avx2(64, 128);
+
+sadMxN_avx2(128, 64);
+sadMxN_avx2(128, 128);
diff --git a/libs/libaom/src/aom_dsp/x86/sad4d_sse2.asm b/libs/libaom/src/aom_dsp/x86/sad4d_sse2.asm
new file mode 100644
index 000000000..a9043742d
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/sad4d_sse2.asm
@@ -0,0 +1,428 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro AVG_4x2x4 2
+  movh                  m2, [second_predq]
+  movlhps               m2, m2
+  pavgb                 %1, m2
+  pavgb                 %2, m2
+  lea         second_predq, [second_predq+8]
+%endmacro
+; 'mflag' affects how the code works in an important way.
+;
+; When 'mflag' is false, 'src_strideq' resides in a register and
+; [srcq + src_strideq + offset] is a legal addressing form, so we can
+; simply use that form to access src memory and do not have to update
+; 'srcq' at each line. We only update 'srcq' every two lines, using a
+; compact LEA instruction like [srcq+src_strideq*2].
+;
+; When 'mflag' is true, 'src_strideq' resides in memory. We cannot use
+; the above form to access memory, so we have to update 'srcq' at each
+; line break. As we process two parts (first, second) together in each
+; macro function, the second part may also sit on the next line, which
+; means we may also need to add one 'src_strideq' to 'srcq' before
+; processing the second part.
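+;
+; As an illustrative sketch (not taken from the macros below, which pass
+; the offsets in as parameters):
+;   mflag == 0:  movd  m0, [srcq]
+;                movd  m1, [srcq+src_strideq]      ; stride is in a register
+;                lea   srcq, [srcq+src_strideq*2]  ; advance two lines at once
+;   mflag == 1:  movd  m0, [srcq]
+;                add   srcq, src_strideq           ; stride is a memory operand,
+;                movd  m1, [srcq]                  ; so advance line by line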
+
+%macro HANDLE_FIRST_OFFSET 2
+  %define first_offset %2
+  %if mflag == 0 && %1 == 1
+    %define first_offset (src_strideq + %2)
+  %endif
+%endmacro
+
+; first_extraline, second_extraline, in_line_offset
+%macro HANDLE_SECOND_OFFSET 3
+  %define second_offset %3
+  %if mflag && %1 == 0 && %2 == 1
+    add srcq, src_strideq
+  %endif
+  %if mflag == 0 && %2 == 1
+    %define second_offset (src_strideq + %3)
+  %endif
+%endmacro
+
+; Notes for line_ending:
+; 0 -- not a line ending
+; 1 -- line ending of an odd line [line numbers start from one]
+; 2 -- line ending of an even line
+; This is specifically designed to handle the case when src_strideq is
+; a memory position; in that case we cannot do the complex address
+; calculation using LEA, and fall back to a simple ADD instruction at
+; each line ending.
+%macro ADVANCE_END_OF_LINE 1
+  %if mflag
+    add srcq, src_strideq
+  %endif
+  %if mflag == 0 && %1 == 2
+    lea                 srcq, [srcq +src_strideq*2]
+  %endif
+
+  %if %1 == 2
+    lea                ref1q, [ref1q+ref_strideq*2]
+    lea                ref2q, [ref2q+ref_strideq*2]
+    lea                ref3q, [ref3q+ref_strideq*2]
+    lea                ref4q, [ref4q+ref_strideq*2]
+  %endif
+%endmacro
+
+; Please note that the second_offset of src is an in_line_offset,
+; so it is less than src_stride.
+; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, do_avg,
+;               {first, second}_extraline, line_ending
+%macro PROCESS_4x2x4 9
+  HANDLE_FIRST_OFFSET   %7, %2
+  movd                  m0, [srcq + first_offset]
+  HANDLE_SECOND_OFFSET  %7, %8, %4
+%if %1 == 1
+  movd                  m6, [ref1q+%3]
+  movd                  m4, [ref2q+%3]
+  movd                  m7, [ref3q+%3]
+  movd                  m5, [ref4q+%3]
+
+  movd                  m1, [srcq + second_offset]
+  movd                  m2, [ref1q+%5]
+  punpckldq             m0, m1
+  punpckldq             m6, m2
+  movd                  m1, [ref2q+%5]
+  movd                  m2, [ref3q+%5]
+  movd                  m3, [ref4q+%5]
+  punpckldq             m4, m1
+  punpckldq             m7, m2
+  punpckldq             m5, m3
+  movlhps               m0, m0
+  movlhps               m6, m4
+  movlhps               m7, m5
+%if %6 == 1
+  AVG_4x2x4             m6, m7
+%endif
+  psadbw                m6, m0
+  psadbw                m7, m0
+%else
+  movd                  m1, [ref1q+%3]
+  movd                  m5, [ref1q+%5]
+  movd                  m2, [ref2q+%3]
+  movd                  m4, [ref2q+%5]
+  punpckldq             m1, m5
+  punpckldq             m2, m4
+  movd                  m3, [ref3q+%3]
+  movd                  m5, [ref3q+%5]
+  punpckldq             m3, m5
+  movd                  m4, [ref4q+%3]
+  movd                  m5, [ref4q+%5]
+  punpckldq             m4, m5
+  movd                  m5, [srcq + second_offset]
+  punpckldq             m0, m5
+  movlhps               m0, m0
+  movlhps               m1, m2
+  movlhps               m3, m4
+%if %6 == 1
+  AVG_4x2x4             m1, m3
+%endif
+  psadbw                m1, m0
+  psadbw                m3, m0
+  paddd                 m6, m1
+  paddd                 m7, m3
+%endif
+%if %9 > 0
+  ADVANCE_END_OF_LINE %9
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, do_avg,
+;               {first,second}_extraline, line_ending
+%macro PROCESS_8x2x4 9
+  HANDLE_FIRST_OFFSET   %7, %2
+  movh                  m0, [srcq + first_offset]
+  HANDLE_SECOND_OFFSET  %7, %8, %4
+%if %1 == 1
+  movh                  m4, [ref1q+%3]
+  movh                  m5, [ref2q+%3]
+  movh                  m6, [ref3q+%3]
+  movh                  m7, [ref4q+%3]
+  movhps                m0, [srcq + second_offset]
+  movhps                m4, [ref1q+%5]
+  movhps                m5, [ref2q+%5]
+  movhps                m6, [ref3q+%5]
+  movhps                m7, [ref4q+%5]
+%if %6 == 1
+  movu                  m3, [second_predq]
+  pavgb                 m4, m3
+  pavgb                 m5, m3
+  pavgb                 m6, m3
+  pavgb                 m7, m3
+  lea         second_predq, [second_predq+mmsize]
+%endif
+  psadbw                m4, m0
+  psadbw                m5, m0
+  psadbw                m6, m0
+  psadbw                m7, m0
+%else
+  movh                  m1, [ref1q+%3]
+  movh                  m2, [ref2q+%3]
+  movhps                m0, [srcq + second_offset]
+  movhps                m1, [ref1q+%5]
+  movhps                m2, [ref2q+%5]
+%if %6 == 1
+  movu                  m3, [second_predq]
+  pavgb                 m1, m3
+  pavgb                 m2, m3
+%endif
+  psadbw                m1, m0
+  psadbw                m2, m0
+  paddd                 m4, m1
+  paddd                 m5, m2
+
+  movh                  m1, [ref3q+%3]
+  movhps                m1, [ref3q+%5]
+  movh                  m2, [ref4q+%3]
+  movhps                m2, [ref4q+%5]
+%if %6 == 1
+  pavgb                 m1, m3
+  pavgb                 m2, m3
+  lea         second_predq, [second_predq+mmsize]
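+  ; second_predq now points at the comp-pred samples for the next line
+  ; pair (mmsize bytes == 8 pixels x 2 rows, shared by all four refs)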
+%endif + psadbw m1, m0 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 +%endif +%if %9 > 0 + ADVANCE_END_OF_LINE %9 +%endif +%endmacro + +; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_16x2x4 9 + ; 1st 16 px + HANDLE_FIRST_OFFSET %7, %2 + mova m0, [srcq + first_offset] + HANDLE_SECOND_OFFSET %7, %8, %4 +%if %1 == 1 + movu m4, [ref1q+%3] + movu m5, [ref2q+%3] + movu m6, [ref3q+%3] + movu m7, [ref4q+%3] +%if %6 == 1 + movu m3, [second_predq] + pavgb m4, m3 + pavgb m5, m3 + pavgb m6, m3 + pavgb m7, m3 + lea second_predq, [second_predq+mmsize] +%endif + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else ; %1 == 1 + movu m1, [ref1q+%3] + movu m2, [ref2q+%3] +%if %6 == 1 + movu m3, [second_predq] + pavgb m1, m3 + pavgb m2, m3 +%endif + psadbw m1, m0 + psadbw m2, m0 + paddd m4, m1 + paddd m5, m2 + + movu m1, [ref3q+%3] + movu m2, [ref4q+%3] +%if %6 == 1 + pavgb m1, m3 + pavgb m2, m3 + lea second_predq, [second_predq+mmsize] +%endif + psadbw m1, m0 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 +%endif ; %1 == 1 + + ; 2nd 16 px + mova m0, [srcq + second_offset] + movu m1, [ref1q+%5] + movu m2, [ref2q+%5] + +%if %6 == 1 + movu m3, [second_predq] + pavgb m1, m3 + pavgb m2, m3 +%endif + psadbw m1, m0 + psadbw m2, m0 + paddd m4, m1 + paddd m5, m2 + + movu m1, [ref3q+%5] + movu m2, [ref4q+%5] + +%if %9 > 0 + ADVANCE_END_OF_LINE %9 +%endif + +%if %6 == 1 + pavgb m1, m3 + pavgb m2, m3 + lea second_predq, [second_predq+mmsize] +%endif + psadbw m1, m0 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 +%endmacro + +; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_32x2x4 9 + PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16, %6, %7, %7, %8 - %7 + PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6, %8, %8, %9 +%endmacro + +; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_64x2x4 9 + PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32, %6, %7, %7, %8 - %7 + PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6, %8, %8, %9 +%endmacro + +; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_128x2x4 9 + PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64, %6, %7, %7, %8 - %7 + PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6, %8, %8, %9 +%endmacro + +; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 +%macro SADNXN4D 2-3 0 +%if %3 == 0 +%if UNIX64 +cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif +%else ; avg + +%if UNIX64 +cglobal sad%1x%2x4d_avg, 6, 10, 8, src, src_stride, ref1, ref_stride, \ + second_pred, res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \ + second_pred, ref2, ref3 + %define src_strideq r1mp + %define src_strided r1mp +%endif +%endif + + %define mflag ((1 - UNIX64) & %3) + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + + PROCESS_%1x2x4 1, 0, 0, 0, ref_strideq, %3, 0, 1, 2 +%rep (%2-4)/2 + PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2 +%endrep + PROCESS_%1x2x4 0, 0, 0, 
0, ref_strideq, %3, 0, 1, 2 + +%if %3 == 0 + %define resultq r4 + %define resultmp r4mp +%else + %define resultq r5 + %define resultmp r5mp +%endif + +%if %1 > 4 + pslldq m5, 4 + pslldq m7, 4 + por m4, m5 + por m6, m7 + mova m5, m4 + mova m7, m6 + punpcklqdq m4, m6 + punpckhqdq m5, m7 + paddd m4, m5 + movifnidn resultq, resultmp + movu [resultq], m4 + RET +%else + pshufd m6, m6, 0x08 + pshufd m7, m7, 0x08 + movifnidn resultq, resultmp + movq [resultq+0], m6 + movq [resultq+8], m7 + RET +%endif +%endmacro + +INIT_XMM sse2 +SADNXN4D 128, 128 +SADNXN4D 128, 64 +SADNXN4D 64, 128 +SADNXN4D 64, 64 +SADNXN4D 64, 32 +SADNXN4D 32, 64 +SADNXN4D 32, 32 +SADNXN4D 32, 16 +SADNXN4D 16, 32 +SADNXN4D 16, 16 +SADNXN4D 16, 8 +SADNXN4D 8, 16 +SADNXN4D 8, 8 +SADNXN4D 8, 4 +SADNXN4D 4, 8 +SADNXN4D 4, 4 +SADNXN4D 4, 16 +SADNXN4D 16, 4 +SADNXN4D 8, 32 +SADNXN4D 32, 8 +SADNXN4D 16, 64 +SADNXN4D 64, 16 +SADNXN4D 128, 128, 1 +SADNXN4D 128, 64, 1 +SADNXN4D 64, 128, 1 +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 8, 4, 1 +SADNXN4D 4, 8, 1 +SADNXN4D 4, 4, 1 +SADNXN4D 4, 16, 1 +SADNXN4D 16, 4, 1 +SADNXN4D 8, 32, 1 +SADNXN4D 32, 8, 1 +SADNXN4D 16, 64, 1 +SADNXN4D 64, 16, 1 diff --git a/libs/libaom/src/aom_dsp/x86/sad_avx2.c b/libs/libaom/src/aom_dsp/x86/sad_avx2.c new file mode 100644 index 000000000..a50dba64a --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/sad_avx2.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+#define FSAD64_H(h)                                                           \
+  unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride) { \
+    int i, res;                                                               \
+    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
+    __m256i sum_sad = _mm256_setzero_si256();                                 \
+    __m256i sum_sad_h;                                                        \
+    __m128i sum_sad128;                                                       \
+    for (i = 0; i < h; i++) {                                                 \
+      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
+      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
+      sad1_reg = _mm256_sad_epu8(                                             \
+          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
+      sad2_reg = _mm256_sad_epu8(                                             \
+          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
+      sum_sad =                                                               \
+          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
+      ref_ptr += ref_stride;                                                  \
+      src_ptr += src_stride;                                                  \
+    }                                                                         \
+    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
+    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
+    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
+    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
+    res = _mm_cvtsi128_si32(sum_sad128);                                      \
+    _mm256_zeroupper();                                                       \
+    return res;                                                               \
+  }
+
+#define FSAD32_H(h)                                                           \
+  unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
+                                    const uint8_t *ref_ptr, int ref_stride) { \
+    int i, res;                                                               \
+    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
+    __m256i sum_sad = _mm256_setzero_si256();                                 \
+    __m256i sum_sad_h;                                                        \
+    __m128i sum_sad128;                                                       \
+    int ref2_stride = ref_stride << 1;                                        \
+    int src2_stride = src_stride << 1;                                        \
+    int max = h >> 1;                                                         \
+    for (i = 0; i < max; i++) {                                               \
+      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
+      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+      sad1_reg = _mm256_sad_epu8(                                             \
+          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
+      sad2_reg = _mm256_sad_epu8(                                             \
+          ref2_reg,                                                           \
+          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
+      sum_sad =                                                               \
+          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
+      ref_ptr += ref2_stride;                                                 \
+      src_ptr += src2_stride;                                                 \
+    }                                                                         \
+    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
+    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
+    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
+    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
+    res = _mm_cvtsi128_si32(sum_sad128);                                      \
+    _mm256_zeroupper();                                                       \
+    return res;                                                               \
+  }
+
+#define FSAD64  \
+  FSAD64_H(64); \
+  FSAD64_H(32);
+
+#define FSAD32  \
+  FSAD32_H(64); \
+  FSAD32_H(32); \
+  FSAD32_H(16);
+
+/* clang-format off */
+FSAD64
+FSAD32
+/* clang-format on */
+
+#undef FSAD64
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+
+#define FSADAVG64_H(h)                                                        \
+  unsigned int aom_sad64x##h##_avg_avx2(                                      \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
+      int ref_stride, const uint8_t *second_pred) {                           \
+    int i, res;                                                               \
+    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
+    __m256i sum_sad = _mm256_setzero_si256();                                 \
+    __m256i sum_sad_h;                                                        \
+    __m128i sum_sad128;                                                       \
+    for (i = 0; i < h; i++) {                                                 \
+      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
+      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
+      ref1_reg = _mm256_avg_epu8(                                             \
+          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
+      ref2_reg = _mm256_avg_epu8(                                             \
+          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+      sad1_reg = _mm256_sad_epu8(                                             \
+          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
+      sad2_reg = _mm256_sad_epu8(                                             \
+          ref2_reg,
_mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSADAVG32_H(h) \ + unsigned int aom_sad32x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSADAVG64 \ + FSADAVG64_H(64); \ + FSADAVG64_H(32); + +#define FSADAVG32 \ + FSADAVG32_H(64); \ + FSADAVG32_H(32); \ + FSADAVG32_H(16); + +/* clang-format off */ +FSADAVG64 +FSADAVG32 +/* clang-format on */ + +#undef FSADAVG64 +#undef FSADAVG32 +#undef FSADAVG64_H +#undef FSADAVG32_H diff --git a/libs/libaom/src/aom_dsp/x86/sad_highbd_avx2.c b/libs/libaom/src/aom_dsp/x86/sad_highbd_avx2.c new file mode 100644 index 000000000..2cff2e6a9 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/sad_highbd_avx2.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_ports/mem.h"
+
+// SAD
+static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) {
+  // input 8 32-bit summation
+  __m128i lo128, hi128;
+  __m256i u = _mm256_srli_si256(*v, 8);
+  u = _mm256_add_epi32(u, *v);
+
+  // 4 32-bit summation
+  hi128 = _mm256_extracti128_si256(u, 1);
+  lo128 = _mm256_castsi256_si128(u);
+  lo128 = _mm_add_epi32(hi128, lo128);
+
+  // 2 32-bit summation
+  hi128 = _mm_srli_si128(lo128, 4);
+  lo128 = _mm_add_epi32(lo128, hi128);
+
+  return (unsigned int)_mm_cvtsi128_si32(lo128);
+}
+
+static INLINE void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r,
+                                            __m256i *sad_acc) {
+  const __m256i zero = _mm256_setzero_si256();
+  int i;
+  for (i = 0; i < 4; i++) {
+    s[i] = _mm256_sub_epi16(s[i], r[i]);
+    s[i] = _mm256_abs_epi16(s[i]);
+  }
+
+  s[0] = _mm256_add_epi16(s[0], s[1]);
+  s[0] = _mm256_add_epi16(s[0], s[2]);
+  s[0] = _mm256_add_epi16(s[0], s[3]);
+
+  r[0] = _mm256_unpacklo_epi16(s[0], zero);
+  r[1] = _mm256_unpackhi_epi16(s[0], zero);
+
+  r[0] = _mm256_add_epi32(r[0], r[1]);
+  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
+}
+
+// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD.
+static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
+                           const uint16_t *ref_ptr, int ref_stride,
+                           const uint16_t *sec_ptr, __m256i *sad_acc) {
+  __m256i s[4], r[4];
+  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
+
+  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+
+  if (sec_ptr) {
+    r[0] =
+        _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+    r[1] = _mm256_avg_epu16(
+        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+    r[2] = _mm256_avg_epu16(
+        r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+    r[3] = _mm256_avg_epu16(
+        r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+  }
+  highbd_sad16x4_core_avx2(s, r, sad_acc);
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(int N,
+                                                             const uint8_t *src,
+                                                             int src_stride,
+                                                             const uint8_t *ref,
+                                                             int ref_stride) {
+  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+  int i;
+  __m256i sad = _mm256_setzero_si256();
+  for (i = 0; i < N; i += 4) {
+    sad16x4(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad);
+    src_ptr += src_stride << 2;
+    ref_ptr += ref_stride << 2;
+  }
+  return (unsigned int)get_sad_from_mm256_epi32(&sad);
+}
+
+static void sad32x4(const uint16_t *src_ptr, int src_stride,
+                    const uint16_t *ref_ptr, int ref_stride,
+                    const uint16_t *sec_ptr, __m256i *sad_acc) {
+  __m256i s[4], r[4];
+  int row_sections = 0;
+
+  while (row_sections < 2) {
+    s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+    s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+    s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+    s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
+
+    r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+    r[1] =
_mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); + + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 32 << 1; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + + row_sections += 1; + src_ptr += src_stride << 1; + ref_ptr += ref_stride << 1; + } +} + +static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + const int left_shift = 2; + int i; + + for (i = 0; i < N; i += 4) { + sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + } + return get_sad_from_mm256_epi32(&sad); +} + +static void sad64x2(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[4], r[4]; + int i; + for (i = 0; i < 2; i++) { + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 64; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} + +static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + const int left_shift = 1; + int i; + for (i = 0; i < N; i += 2) { + sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + } + return get_sad_from_mm256_epi32(&sad); +} + +static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[4], r[4]; + int i; + for (i = 0; i < 2; i++) { + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i 
*)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 64; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + src_ptr += 64; + ref_ptr += 64; + } +} + +static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + int row = 0; + while (row < N) { + sad128x1(srcp, refp, NULL, &sad); + srcp += src_stride; + refp += ref_stride; + row++; + } + return get_sad_from_mm256_epi32(&sad); +} + +#define highbd_sadMxN_avx2(m, n) \ + unsigned int aom_highbd_sad##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \ + } + +highbd_sadMxN_avx2(16, 4); +highbd_sadMxN_avx2(16, 8); +highbd_sadMxN_avx2(16, 16); +highbd_sadMxN_avx2(16, 32); +highbd_sadMxN_avx2(16, 64); + +highbd_sadMxN_avx2(32, 8); +highbd_sadMxN_avx2(32, 16); +highbd_sadMxN_avx2(32, 32); +highbd_sadMxN_avx2(32, 64); + +highbd_sadMxN_avx2(64, 16); +highbd_sadMxN_avx2(64, 32); +highbd_sadMxN_avx2(64, 64); +highbd_sadMxN_avx2(64, 128); + +highbd_sadMxN_avx2(128, 64); +highbd_sadMxN_avx2(128, 128); + +unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + + // Next 4 rows + srcp += src_stride << 2; + refp += ref_stride << 2; + secp += 64; + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 3; + uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 4; + uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + 
second_pred += 16 << left_shift; + sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 2) { + sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 32 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 4) { + sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 32 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 4; + uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 32 << left_shift; + sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 32 << left_shift; + sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 8) { + sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 64 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int 
aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 16) { + sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 64 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 64 << left_shift; + sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 6; + uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 64 << left_shift; + sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + int row = 0; + while (row < 64) { + sad128x1(srcp, refp, secp, &sad); + srcp += src_stride; + refp += ref_stride; + secp += 16 << 3; + row += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + unsigned int sum; + const int left_shift = 6; + + sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 128 << left_shift; + sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +// SAD 4D +// Combine 4 __m256i input vectors v to uint32_t result[4] +static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, + uint32_t *res) { + __m256i u0, u1, u2, u3; + const __m256i mask = yy_set1_64_from_32i(UINT32_MAX); + __m128i sad; + + // 8 32-bit summation + u0 = _mm256_srli_si256(v[0], 4); + u1 = _mm256_srli_si256(v[1], 4); + u2 = _mm256_srli_si256(v[2], 4); + u3 = _mm256_srli_si256(v[3], 4); + + u0 = _mm256_add_epi32(u0, v[0]); + u1 = _mm256_add_epi32(u1, v[1]); + u2 = _mm256_add_epi32(u2, v[2]); + u3 = _mm256_add_epi32(u3, v[3]); + + u0 = _mm256_and_si256(u0, mask); + u1 = _mm256_and_si256(u1, mask); + u2 = _mm256_and_si256(u2, mask); + u3 = _mm256_and_si256(u3, mask); + // 4 32-bit summation, evenly positioned + + u1 = _mm256_slli_si256(u1, 4); + u3 = _mm256_slli_si256(u3, 4); + + u0 = _mm256_or_si256(u0, u1); + u2 = _mm256_or_si256(u2, u3); + // 8 32-bit summation, interleaved + + 
u1 = _mm256_unpacklo_epi64(u0, u2); + u3 = _mm256_unpackhi_epi64(u0, u2); + + u0 = _mm256_add_epi32(u1, u3); + sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1), + _mm256_castsi256_si128(u0)); + _mm_storeu_si128((__m128i *)res, sad); +} + +static void convert_pointers(const uint8_t *const ref8[], + const uint16_t *ref[]) { + ref[0] = CONVERT_TO_SHORTPTR(ref8[0]); + ref[1] = CONVERT_TO_SHORTPTR(ref8[1]); + ref[2] = CONVERT_TO_SHORTPTR(ref8[2]); + ref[3] = CONVERT_TO_SHORTPTR(ref8[3]); +} + +static void init_sad(__m256i *s) { + s[0] = _mm256_setzero_si256(); + s[1] = _mm256_setzero_si256(); + s[2] = _mm256_setzero_si256(); + s[3] = _mm256_setzero_si256(); +} + +static AOM_FORCE_INLINE void aom_highbd_sad16xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_4_rows = 2; + int i, j; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + for (j = 0; j < N; j += 4) { + sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + srcp += src_stride << shift_for_4_rows; + refp[i] += ref_stride << shift_for_4_rows; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +static AOM_FORCE_INLINE void aom_highbd_sad32xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_4_rows = 2; + int i, r; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + for (r = 0; r < N; r += 4) { + sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + srcp += src_stride << shift_for_4_rows; + refp[i] += ref_stride << shift_for_4_rows; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +static AOM_FORCE_INLINE void aom_highbd_sad64xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_rows = 1; + int i, r; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + for (r = 0; r < N; r += 2) { + sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]); + srcp += src_stride << shift_for_rows; + refp[i] += ref_stride << shift_for_rows; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +static AOM_FORCE_INLINE void aom_highbd_sad128xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + int i, r; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + for (r = 0; r < N; r++) { + sad128x1(srcp, refp[i], NULL, &sad_vec[i]); + srcp += src_stride; + refp[i] += ref_stride; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +#define highbd_sadMxNx4d_avx2(m, n) \ + void aom_highbd_sad##m##x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \ + int ref_stride, 
uint32_t *sad_array) {                                                    \
+    aom_highbd_sad##m##xNx4d_avx2(n, src, src_stride, ref_array, ref_stride, \
+                                  sad_array);                                \
+  }
+
+highbd_sadMxNx4d_avx2(16, 4);
+highbd_sadMxNx4d_avx2(16, 8);
+highbd_sadMxNx4d_avx2(16, 16);
+highbd_sadMxNx4d_avx2(16, 32);
+highbd_sadMxNx4d_avx2(16, 64);
+
+highbd_sadMxNx4d_avx2(32, 8);
+highbd_sadMxNx4d_avx2(32, 16);
+highbd_sadMxNx4d_avx2(32, 32);
+highbd_sadMxNx4d_avx2(32, 64);
+
+highbd_sadMxNx4d_avx2(64, 16);
+highbd_sadMxNx4d_avx2(64, 32);
+highbd_sadMxNx4d_avx2(64, 64);
+highbd_sadMxNx4d_avx2(64, 128);
+
+highbd_sadMxNx4d_avx2(128, 64);
+highbd_sadMxNx4d_avx2(128, 128);
diff --git a/libs/libaom/src/aom_dsp/x86/sad_impl_avx2.c b/libs/libaom/src/aom_dsp/x86/sad_impl_avx2.c
new file mode 100644
index 000000000..f77a585b4
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/sad_impl_avx2.c
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
+  __m256i s1, s2, r1, r2;
+  __m256i sum = _mm256_setzero_si256();
+  __m128i sum_i128;
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    r1 = _mm256_loadu_si256((__m256i const *)ref_ptr);
+    r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+    s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr));
+    s2 = _mm256_sad_epu8(
+        r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+    sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2));
+    ref_ptr += ref_stride << 1;
+    src_ptr += src_stride << 1;
+  }
+
+  sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8));
+  sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1),
+                           _mm256_castsi256_si128(sum));
+  return _mm_cvtsi128_si32(sum_i128);
+}
+
+static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
+  unsigned int half_width = 32;
+  uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += half_width;
+  ref_ptr += half_width;
+  sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  return sum;
+}
+
+static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *ref_ptr, int ref_stride) {
+  uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += src_stride << 5;
+  ref_ptr += ref_stride << 5;
+  sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+  return sum;
+}
+
+unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride) {
+  unsigned int half_width = 64;
+  uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += half_width;
+  ref_ptr += half_width;
+  sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+  return sum;
+}
+
+unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *ref_ptr, int ref_stride) {
+  uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+  src_ptr += src_stride << 6;
+  ref_ptr += ref_stride << 6;
+  sum +=
sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int h, const uint8_t *second_pred, + const int second_pred_stride) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); + ref1_reg = _mm256_avg_epu8( + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); + ref2_reg = _mm256_avg_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + second_pred += second_pred_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + + return res; +} + +unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 64); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + second_pred += 64 << 6; + sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 64); + return sum; +} + +unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + unsigned int half_width = 64; + uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 128); + src_ptr += half_width; + ref_ptr += half_width; + second_pred += half_width; + sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 128); + return sum; +} + +unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, + ref_stride, second_pred); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + second_pred += 128 << 6; + sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, + second_pred); + return sum; +} diff --git a/libs/libaom/src/aom_dsp/x86/sad_sse2.asm b/libs/libaom/src/aom_dsp/x86/sad_sse2.asm new file mode 100644 index 000000000..3251b7655 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/sad_sse2.asm @@ -0,0 +1,353 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +%endmacro + +; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD128XN 1-2 0 + SAD_FN 128, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*4] + pavgb m2, [second_predq+mmsize*5] + pavgb m3, [second_predq+mmsize*6] + pavgb m4, [second_predq+mmsize*7] + lea second_predq, [second_predq+mmsize*8] +%endif + psadbw m1, [srcq+64] + psadbw m2, [srcq+80] + psadbw m3, [srcq+96] + psadbw m4, [srcq+112] + + add refq, ref_strideq + add srcq, src_strideq + + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + + sub n_rowsd, 1 + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD128XN 128 ; sad128x128_sse2 +SAD128XN 128, 1 ; sad128x128_avg_sse2 +SAD128XN 64 ; sad128x64_sse2 +SAD128XN 64, 1 ; sad128x64_avg_sse2 + + +; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD64XN 1-2 0 + SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + paddd m1, m2 + paddd m3, m4 + add refq, ref_strideq + paddd m0, m1 + add srcq, src_strideq + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD64XN 128 ; sad64x128_sse2 +SAD64XN 128, 1 ; sad64x128_avg_sse2 +SAD64XN 64 ; sad64x64_sse2 +SAD64XN 32 ; sad64x32_sse2 +SAD64XN 64, 1 ; sad64x64_avg_sse2 +SAD64XN 32, 1 ; 
sad64x32_avg_sse2 +SAD64XN 16 ; sad64x16_sse2 +SAD64XN 16, 1 ; sad64x16_avg_sse2 + +; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD32XN 1-2 0 + SAD_FN 32, %1, 5, %2 + mov n_rowsd, %1/2 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq] + movu m4, [refq+ref_strideq+16] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+src_strideq] + psadbw m4, [srcq+src_strideq+16] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD32XN 64 ; sad32x64_sse2 +SAD32XN 32 ; sad32x32_sse2 +SAD32XN 16 ; sad32x16_sse2 +SAD32XN 64, 1 ; sad32x64_avg_sse2 +SAD32XN 32, 1 ; sad32x32_avg_sse2 +SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 8 ; sad_32x8_sse2 +SAD32XN 8, 1 ; sad_32x8_avg_sse2 + +; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD16XN 1-2 0 + SAD_FN 16, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+src_strideq] + psadbw m3, [srcq+src_strideq*2] + psadbw m4, [srcq+src_stride3q] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD16XN 32 ; sad16x32_sse2 +SAD16XN 16 ; sad16x16_sse2 +SAD16XN 8 ; sad16x8_sse2 +SAD16XN 32, 1 ; sad16x32_avg_sse2 +SAD16XN 16, 1 ; sad16x16_avg_sse2 +SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 4 ; sad_16x4_sse2 +SAD16XN 4, 1 ; sad_16x4_avg_sse2 +SAD16XN 64 ; sad_16x64_sse2 +SAD16XN 64, 1 ; sad_16x64_avg_sse2 + +; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD8XN 1-2 0 + SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movh m1, [refq] + movhps m1, [refq+ref_strideq] + movh m2, [refq+ref_strideq*2] + movhps m2, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif + movh m3, [srcq] + movhps m3, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movhps m4, [srcq+src_stride3q] + psadbw m1, m3 + psadbw m2, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD8XN 16 ; sad8x16_sse2 +SAD8XN 8 ; sad8x8_sse2 +SAD8XN 4 ; sad8x4_sse2 +SAD8XN 16, 1 ; sad8x16_avg_sse2 +SAD8XN 8, 1 ; sad8x8_avg_sse2 +SAD8XN 4, 1 ; sad8x4_avg_sse2 +SAD8XN 32 ; sad_8x32_sse2 +SAD8XN 32, 1 ; sad_8x32_avg_sse2 + +; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD4XN 1-2 0 + SAD_FN 4, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + 
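+; Each iteration gathers four 4-byte rows into a single xmm register for
+; ref (m1) and src (m2), so one psadbw covers a whole 4x4 block.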
+.loop:
+  movd                  m1, [refq]
+  movd                  m2, [refq+ref_strideq]
+  movd                  m3, [refq+ref_strideq*2]
+  movd                  m4, [refq+ref_stride3q]
+  punpckldq             m1, m2
+  punpckldq             m3, m4
+  movlhps               m1, m3
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  lea         second_predq, [second_predq+mmsize*1]
+%endif
+  movd                  m2, [srcq]
+  movd                  m5, [srcq+src_strideq]
+  movd                  m4, [srcq+src_strideq*2]
+  movd                  m3, [srcq+src_stride3q]
+  punpckldq             m2, m5
+  punpckldq             m4, m3
+  movlhps               m2, m4
+  psadbw                m1, m2
+  lea                 refq, [refq+ref_strideq*4]
+  paddd                 m0, m1
+  lea                 srcq, [srcq+src_strideq*4]
+  dec              n_rowsd
+  jg .loop
+
+  movhlps               m1, m0
+  paddd                 m0, m1
+  movd                 eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD4XN  8    ; sad4x8_sse2
+SAD4XN  4    ; sad4x4_sse2
+SAD4XN  8, 1 ; sad4x8_avg_sse2
+SAD4XN  4, 1 ; sad4x4_avg_sse2
+SAD4XN 16    ; sad4x16_sse2
+SAD4XN 16, 1 ; sad4x16_avg_sse2
diff --git a/libs/libaom/src/aom_dsp/x86/sse_avx2.c b/libs/libaom/src/aom_dsp/x86/sse_avx2.c
new file mode 100644
index 000000000..e6ee2fcab
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/sse_avx2.c
@@ -0,0 +1,384 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
+                                const uint8_t *b) {
+  const __m256i v_a0 = yy_loadu_256(a);
+  const __m256i v_b0 = yy_loadu_256(b);
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
+  const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
+  const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
+  const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
+  const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
+  const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
+  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
+  int64_t sum;
+  __m256i zero = _mm256_setzero_si256();
+  const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
+  const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
+  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+                                         _mm256_extracti128_si256(sum_4x64, 1));
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) {
+  const __m256i sum0_4x64 =
+      _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32));
+  const __m256i sum1_4x64 =
+      _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1));
+  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+  *sum = _mm256_add_epi64(*sum, sum_4x64);
+}
+
+static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) {
+  int64_t sum;
+  const __m128i sum_2x64 =
_mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + + xx_storel_64(&sum, sum_1x64); + return sum; +} +#endif + +static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_a2 = xx_loadl_32(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_32(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_b2 = xx_loadl_32(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), + _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), + _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} +static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} +int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + sse_w4x4_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_a1 = xx_loadu_128(a + a_stride); + const __m128i v_b0 = xx_loadu_128(b); + const __m128i v_b1 = xx_loadu_128(b + b_stride); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { 
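+        // With 8-bit inputs a 16-bit difference is at most 255, so one
+        // _mm256_madd_epi16 adds at most 2 * 255^2 (about 2^17) per 32-bit
+        // lane; even a 128x128 block stays well below 2^31, and a single
+        // 32-bit accumulator covers the whole block. A 64-wide row is
+        // simply processed as two 32-wide halves.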
+ sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + sse_w32_avx2(&sum, a + 64, b + 64); + sse_w32_avx2(&sum, a + 96, b + 96); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: + if ((width & 0x07) == 0) { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + const uint8_t *a2 = a + i + (a_stride << 1); + const uint8_t *b2 = b + i + (b_stride << 1); + sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } + sse = summary_all_avx2(&sum); + break; + } + + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = yy_loadu_256(a); + const __m256i v_b_w = yy_loadu_256(b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_64(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_64(b + b_stride * 3); + const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), + _mm_unpacklo_epi64(v_a2, v_a3)); + const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), + _mm_unpacklo_epi64(v_b2, v_b3)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); + const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} +int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + 
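+      // The wider cases below differ from the 8-bit path: with inputs of up
+      // to 12 bits one madd result approaches 2 * 4095^2 (about 2^25) per
+      // 32-bit lane, so partial sums are kept in a local sum32 and widened
+      // into the 64-bit accumulator (summary_32_avx2) every 64/32/16 rows,
+      // just before the unsigned 32-bit lanes could wrap.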
break; + case 32: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16, b + 16); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 64; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 64: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 32; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 128: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4); + highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5); + highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6); + highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 16; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + default: + if (width & 0x7) { + do { + int i = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + const uint16_t *a2 = a + i + (a_stride << 1); + const uint16_t *b2 = b + i + (b_stride << 1); + highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); + summary_32_avx2(&sum32, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } else { + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + int i = 0; + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + l += 2; + } while (l < 8 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 8; + } while (y < height); + } + sse = summary_4x64_avx2(sum); + break; + } + return sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/x86/sse_sse4.c b/libs/libaom/src/aom_dsp/x86/sse_sse4.c new file mode 100644 index 000000000..5f95eb9ae --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/sse_sse4.c @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
+  int64_t sum;
+  const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
+  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
+  const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
+  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+  xx_storel_64(&sum, sum_1x64);
+  return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) {
+  const __m128i sum0 = _mm_cvtepu32_epi64(*sum32);
+  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8));
+  *sum64 = _mm_add_epi64(sum0, *sum64);
+  *sum64 = _mm_add_epi64(sum1, *sum64);
+}
+#endif
+
+static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
+                                  const uint8_t *b) {
+  const __m128i v_a0 = xx_loadu_128(a);
+  const __m128i v_b0 = xx_loadu_128(b);
+  const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
+  const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
+  const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
+  const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
+  const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
+  const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
+                                 const uint8_t *b, int b_stride, __m128i *sum) {
+  const __m128i v_a0 = xx_loadl_32(a);
+  const __m128i v_a1 = xx_loadl_32(a + a_stride);
+  const __m128i v_b0 = xx_loadl_32(b);
+  const __m128i v_b1 = xx_loadl_32(b + b_stride);
+  const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
+  const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
+                               __m128i *sum) {
+  const __m128i v_a0 = xx_loadl_64(a);
+  const __m128i v_b0 = xx_loadl_64(b);
+  const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
+  const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
+                       int b_stride, int width, int height) {
+  int y = 0;
+  int64_t sse = 0;
+  __m128i sum = _mm_setzero_si128();
+  switch (width) {
+    case 4:
+      do {
+        sse4x2_sse4_1(a, a_stride, b, b_stride, &sum);
+        a += a_stride << 1;
+        b += b_stride << 1;
+        y += 2;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 8:
+      do {
+        sse8_sse4_1(a, b, &sum);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 16:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 32:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16, b + 16);
+        a += a_stride;
+        b += b_stride;
+        y += 1;
+      } while (y < height);
+      sse = summary_all_sse4(&sum);
+      break;
+    case 64:
+      do {
+        sse_w16_sse4_1(&sum, a, b);
+        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+        sse_w16_sse4_1(&sum, a + 16 *
3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); + sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); + sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); + sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: + if (width & 0x07) { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + } + sse = summary_all_sse4(&sum); + break; + } + + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = xx_loadu_128(a); + const __m128i v_b_w = xx_loadu_128(b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 64; + } while (y < height); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 32: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 32; + } while (y < height); + xx_storel_64(&sse, _mm_add_epi64(sum, 
_mm_srli_si128(sum, 8))); + break; + case 64: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 16; + } while (y < height); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 128: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + default: + if (width & 0x7) { + do { + __m128i sum32 = _mm_setzero_si128(); + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + summary_32_sse4(&sum32, &sum); + } while (y < height); + } else { + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + } + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + } + return sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm b/libs/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm new file mode 100644 index 000000000..6d9b5a12f --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm @@ -0,0 +1,222 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+    paddusw         xmm15, xmm3  ; sum_s
+    paddusw         xmm14, xmm4  ; sum_r
+    movdqa          xmm1, xmm3
+    pmaddwd         xmm1, xmm1
+    paddd           xmm13, xmm1  ; sum_sq_s
+    movdqa          xmm2, xmm4
+    pmaddwd         xmm2, xmm2
+    paddd           xmm12, xmm2  ; sum_sq_r
+    pmaddwd         xmm3, xmm4
+    paddd           xmm11, xmm3  ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+    movdqa          xmm2,%1
+    punpckldq       %1,xmm0
+    punpckhdq       xmm2,xmm0
+    paddq           %1,xmm2
+    movdqa          xmm2,%1
+    punpcklqdq      %1,xmm0
+    punpckhqdq      xmm2,xmm0
+    paddq           %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+    movdqa          xmm1, %1
+    punpcklwd       %1,xmm0
+    punpckhwd       xmm1,xmm0
+    paddd           %1, xmm1
+    SUM_ACROSS_Q    %1
+%endmacro
+
+SECTION .text
+
+;void aom_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    uint32_t *sum_s,
+;    uint32_t *sum_r,
+;    uint32_t *sum_sq_s,
+;    uint32_t *sum_sq_r,
+;    uint32_t *sum_sxr);
+;
+; TODO: Use param passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0). We could easily fit
+; everything in sse2 without too much hassle, and can probably do better
+; estimates with psadbw or pavgb. At this point this is just meant to be a
+; first pass for calculating all the params needed for 16x16 ssim so we can
+; play with dssim as distortion in mode selection code.
+global sym(aom_ssim_parms_16x16_sse2) PRIVATE
+sym(aom_ssim_parms_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rsi,        arg(0) ;s
+    mov         rcx,        arg(1) ;sp
+    mov         rdi,        arg(2) ;r
+    mov         rax,        arg(3) ;rp
+
+    pxor        xmm0, xmm0
+    pxor        xmm15,xmm15  ;sum_s
+    pxor        xmm14,xmm14  ;sum_r
+    pxor        xmm13,xmm13  ;sum_sq_s
+    pxor        xmm12,xmm12  ;sum_sq_r
+    pxor        xmm11,xmm11  ;sum_sxr
+
+    mov         rdx, 16      ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movdqu      xmm5, [rsi]
+    movdqu      xmm6, [rdi]
+    movdqa      xmm3, xmm5
+    movdqa      xmm4, xmm6
+    punpckhbw   xmm3, xmm0 ; high_s
+    punpckhbw   xmm4, xmm0 ; high_r
+
+    TABULATE_SSIM
+
+    movdqa      xmm3, xmm5
+    movdqa      xmm4, xmm6
+    punpcklbw   xmm3, xmm0 ; low_s
+    punpcklbw   xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add         rsi, rcx   ; next s row
+    add         rdi, rax   ; next r row
+
+    dec         rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W xmm15
+    SUM_ACROSS_W xmm14
+    SUM_ACROSS_Q xmm13
+    SUM_ACROSS_Q xmm12
+    SUM_ACROSS_Q xmm11
+
+    mov         rdi,arg(4)
+    movd        [rdi], xmm15;
+    mov         rdi,arg(5)
+    movd        [rdi], xmm14;
+    mov         rdi,arg(6)
+    movd        [rdi], xmm13;
+    mov         rdi,arg(7)
+    movd        [rdi], xmm12;
+    mov         rdi,arg(8)
+    movd        [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void aom_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+;    uint32_t *sum_s,
+;    uint32_t *sum_r,
+;    uint32_t *sum_sq_s,
+;    uint32_t *sum_sq_r,
+;    uint32_t *sum_sxr);
+;
+; TODO: Use param passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0). We could easily fit
+; everything in sse2 without too much hassle, and can probably do better
+; estimates with psadbw or pavgb. At this point this is just meant to be a
+; first pass for calculating all the params needed for 8x8 ssim so we can
+; play with dssim as distortion in mode selection code.
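+; As a scalar reference, the routine below accumulates, over every pixel
+; pair (s, r) of the 8x8 block:
+;   sum_s += s; sum_r += r; sum_sq_s += s*s; sum_sq_r += r*r; sum_sxr += s*r;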
+global sym(aom_ssim_parms_8x8_sse2) PRIVATE
+sym(aom_ssim_parms_8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 15
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rsi,        arg(0) ;s
+    mov         rcx,        arg(1) ;sp
+    mov         rdi,        arg(2) ;r
+    mov         rax,        arg(3) ;rp
+
+    pxor        xmm0, xmm0
+    pxor        xmm15,xmm15  ;sum_s
+    pxor        xmm14,xmm14  ;sum_r
+    pxor        xmm13,xmm13  ;sum_sq_s
+    pxor        xmm12,xmm12  ;sum_sq_r
+    pxor        xmm11,xmm11  ;sum_sxr
+
+    mov         rdx, 8       ;row counter
+.NextRow:
+
+    ;grab source and reference pixels
+    movq        xmm3, [rsi]
+    movq        xmm4, [rdi]
+    punpcklbw   xmm3, xmm0 ; low_s
+    punpcklbw   xmm4, xmm0 ; low_r
+
+    TABULATE_SSIM
+
+    add         rsi, rcx   ; next s row
+    add         rdi, rax   ; next r row
+
+    dec         rdx        ; counter
+    jnz .NextRow
+
+    SUM_ACROSS_W xmm15
+    SUM_ACROSS_W xmm14
+    SUM_ACROSS_Q xmm13
+    SUM_ACROSS_Q xmm12
+    SUM_ACROSS_Q xmm11
+
+    mov         rdi,arg(4)
+    movd        [rdi], xmm15;
+    mov         rdi,arg(5)
+    movd        [rdi], xmm14;
+    mov         rdi,arg(6)
+    movd        [rdi], xmm13;
+    mov         rdi,arg(7)
+    movd        [rdi], xmm12;
+    mov         rdi,arg(8)
+    movd        [rdi], xmm11;
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/libs/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm b/libs/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm
new file mode 100644
index 000000000..cbf28901b
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm
@@ -0,0 +1,1470 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times  8 dw  8
+bilin_filter_m_sse2: times  8 dw 16
+                     times  8 dw  0
+                     times  8 dw 14
+                     times  8 dw  2
+                     times  8 dw 12
+                     times  8 dw  4
+                     times  8 dw 10
+                     times  8 dw  6
+                     times 16 dw  8
+                     times  8 dw  6
+                     times  8 dw 10
+                     times  8 dw  4
+                     times  8 dw 12
+                     times  8 dw  2
+                     times  8 dw 14
+
+bilin_filter_m_ssse3: times  8 db 16,  0
+                      times  8 db 14,  2
+                      times  8 db 12,  4
+                      times  8 db 10,  6
+                      times 16 db  8
+                      times  8 db  6, 10
+                      times  8 db  4, 12
+                      times  8 db  2, 14
+
+SECTION .text
+
+; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+;                               int x_offset, int y_offset,
+;                               const uint8_t *dst, ptrdiff_t dst_stride,
+;                               int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the sum of squared
+; errors (SSE) in the given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+  psubw                %3, %4
+  psubw                %1, %2
+  paddw                %5, %3
+  pmaddwd              %3, %3
+  paddw                %5, %1
+  pmaddwd              %1, %1
+  paddd                %6, %3
+  paddd                %6, %1
+%endmacro
+
+%macro STORE_AND_RET 1
+%if %1 > 4
+  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+  ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
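+  ; pcmpgtw against the zero register m5 yields 0xffff exactly in the words
+  ; where the sum in m6 is negative, i.e. the upper half of each
+  ; sign-extended dword; the punpcklwd/punpckhwd below interleave m6 with
+  ; that mask to widen word -> dword before the horizontal adds.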
+ pcmpgtw m5, m6 ; mask for 0 > x + movhlps m3, m7 + punpcklwd m4, m6, m5 + punpckhwd m6, m5 ; sign-extend m6 word->dword + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + pshufd m4, m6, 0x1 + movd [r1], m7 ; store sse + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%else ; 4xh + pshuflw m4, m6, 0xe + pshuflw m3, m7, 0xe + paddw m6, m4 + paddd m7, m3 + pcmpgtw m5, m6 ; mask for 0 > x + mov r1, ssem ; r1 = unsigned int *sse + punpcklwd m6, m5 ; sign-extend m6 word->dword + movd [r1], m7 ; store sse + pshuflw m4, m6, 0xe + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp +%else + add srcq, src_strideq +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%if cpuflag(ssse3) +%define bilin_filter_m bilin_filter_m_ssse3 +%define filter_idx_shift 4 +%else +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 +%endif +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses +; 11, not 13, if the registers are ordered correctly. May make a minor speed +; difference on Win64 + +%if ARCH_X86_64 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %define block_height heightd + %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, sec, sec_stride, \ + height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %define block_height heightd + %endif + %define bilin_filter bilin_filter_m + %endif +%endif + +%if %1 == 4 + %define movx movd +%else + %define movx movh +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar block_height, 1 +%if %2 == 1 ; avg + shl sec_str, 1 +%endif +%endif + + ; FIXME(rbultje) replace by jumptable? 
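+  ; Dispatch: x_offset and y_offset are subpel positions in 1/8-pel units.
+  ; Each one is either 0 (no filtering), 4 (an exact half, done with pavgb)
+  ; or anything else (a true bilinear filter), giving the 3x3 tree of
+  ; specialized loops below.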
+ test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [dstq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] +%endif + + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + +%if %2 == 1 ; avg +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET %1 + +.x_zero_y_nonzero: + cmp y_offsetd, 4 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq*2] +%else ; 4xh + movx m1, [srcq+src_strideq*2] + punpckldq m2, m1 +%endif + movx m1, [dstq] +%if %1 > 4 + movlhps m0, m2 +%else ; 4xh + punpckldq m0, m2 +%endif + movx m3, [dstq+dst_strideq] + pavgb m0, m2 + punpcklbw m1, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m4, [secq] + pavgb m0, m4 + punpcklbw m3, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq*2] + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET %1 + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] 
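+; (filter_y_a holds the 8 taps for the nearer source row, 16-2*off, and
+; filter_y_b the taps for the farther row, 2*off; each output pixel is
+; (a*row0 + b*row1 + 8) >> 4.)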
+%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq*2] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonzero: + cmp x_offsetd, 4 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m4, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 + movx m2, [srcq+src_strideq+1] + punpckldq m4, m2 +%endif + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + pavgb m0, m4 + punpcklbw m3, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m1, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m2, 
[srcq+src_strideq] + movx m1, [dstq] + pavgb m0, m4 + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET %1 + +.x_half_y_nonzero: + cmp y_offsetd, 4 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + pavgb m4, m3 + punpckhbw m3, m1, m5 + pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movx m2, [srcq] + movx m3, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + movx m1, [srcq+src_strideq] + punpckldq m2, m1 + movx m1, [srcq+src_strideq+1] + punpckldq m3, m1 +%endif + pavgb m2, m3 +%if %1 > 4 + movlhps m0, m2 + movhlps m4, m2 +%else ; 4xh + punpckldq m0, m2 + pshuflw m4, m2, 0xe +%endif + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq] + movx m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET %1 + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ;x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. 
We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [dstq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movx m1, [dstq] +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +;y_offset == 0. We can reuse y_offset reg. 
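+; On 32-bit PIC builds the address of bilin_filter_m was stashed in the
+; reused argument slot g_bilin_filterm during the prologue; adding it to the
+; pre-shifted offset register turns that register into a direct pointer to
+; the selected filter taps (filter_x_a, with filter_x_b 16 bytes beyond).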
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movx m1, [dstq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 4 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. 
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [dstq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. 
+ packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [secq] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + movx m1, [dstq] + paddw m4, m3 + movx m3, [dstq+dst_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 + pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonhalf: +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. 
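+; This is also why INC_SRC_BY_SRC_STRIDE exists: once src_strideq doubles as
+; the temporary here, the stride itself has to be reloaded from its stack
+; slot (src_stridemp) every time the source pointer is advanced.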
+%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + + INC_SRC_BY_SRC_STRIDE + + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 + packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [dstq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + + INC_SRC_BY_SRC_STRIDE + movx m4, [srcq] + movx m3, [srcq+1] + +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m3, [dstq+dst_strideq] + movx m1, [dstq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw 
m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movx m3, [dstq+dst_strideq] + paddw m2, m1 + movx m1, [dstq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd +%undef movx + STORE_AND_RET %1 +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse/2 version, rather than duplicating that code in the +; binary. + +INIT_XMM sse2 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/libs/libaom/src/aom_dsp/x86/subtract_avx2.c b/libs/libaom/src/aom_dsp/x86/subtract_avx2.c new file mode 100644 index 000000000..40831600a --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/subtract_avx2.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
+                                   const uint8_t *pred_ptr) {
+  __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
+  __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
+  __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+  __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+  __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+  __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+  const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+  const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+  _mm256_store_si256((__m256i *)(diff_ptr), d_0);
+  _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+static INLINE void subtract_block_16xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {
+    __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
+    __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
+    __m256i s_0 = _mm256_cvtepu8_epi16(s);
+    __m256i p_0 = _mm256_cvtepu8_epi16(p);
+    const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+    _mm256_store_si256((__m256i *)(diff_ptr), d_0);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static INLINE void subtract_block_32xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static INLINE void subtract_block_64xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+static INLINE void subtract_block_128xn_avx2(
+    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  for (int32_t j = 0; j < rows; ++j) {
+    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+    subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
+    subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
+    src_ptr += src_stride;
+    pred_ptr += pred_stride;
+    diff_ptr += diff_stride;
+  }
+}
+
+void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                             ptrdiff_t pred_stride) {
+  switch (cols) {
+    case 16:
+      subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                               src_stride, pred_ptr, pred_stride);
+      break;
+    case 32:
+      subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                               src_stride, pred_ptr, pred_stride);
+      break;
+    case 64:
+      subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                               src_stride, pred_ptr, pred_stride);
+      break;
+    case 128:
+      subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+                                src_stride, pred_ptr, pred_stride);
+      break;
+    default:
+      aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+                              src_stride, pred_ptr,
pred_stride); + break; + } +} diff --git a/libs/libaom/src/aom_dsp/x86/subtract_sse2.asm b/libs/libaom/src/aom_dsp/x86/subtract_sse2.asm new file mode 100644 index 000000000..1a75a234f --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/subtract_sse2.asm @@ -0,0 +1,146 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; void aom_subtract_block(int rows, int cols, +; int16_t *diff, ptrdiff_t diff_stride, +; const uint8_t *src, ptrdiff_t src_stride, +; const uint8_t *pred, ptrdiff_t pred_stride) + +INIT_XMM sse2 +cglobal subtract_block, 7, 7, 8, \ + rows, cols, diff, diff_stride, src, src_stride, \ + pred, pred_stride +%define pred_str colsq + pxor m7, m7 ; dedicated zero register + cmp colsd, 4 + je .case_4 + cmp colsd, 8 + je .case_8 + cmp colsd, 16 + je .case_16 + cmp colsd, 32 + je .case_32 + cmp colsd, 64 + je .case_64 + +%macro loop16 6 + mova m0, [srcq+%1] + mova m4, [srcq+%2] + mova m1, [predq+%3] + mova m5, [predq+%4] + punpckhbw m2, m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m0, m7 + punpcklbw m1, m7 + psubw m2, m3 + psubw m0, m1 + punpckhbw m1, m4, m7 + punpckhbw m3, m5, m7 + punpcklbw m4, m7 + punpcklbw m5, m7 + psubw m1, m3 + psubw m4, m5 + mova [diffq+mmsize*0+%5], m0 + mova [diffq+mmsize*1+%5], m2 + mova [diffq+mmsize*0+%6], m4 + mova [diffq+mmsize*1+%6], m1 +%endmacro + + mov pred_str, pred_stridemp +.loop_128: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize + loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + sub rowsd, 1 + jnz .loop_128 + RET + +.case_64: + mov pred_str, pred_stridemp +.loop_64: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_64 + RET + +.case_32: + mov pred_str, pred_stridemp +.loop_32: + loop16 0, mmsize, 0, mmsize, 0, 2*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_32 + RET + +.case_16: + mov pred_str, pred_stridemp +.loop_16: + loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 + lea diffq, [diffq+diff_strideq*4] + lea predq, [predq+pred_str*2] + lea srcq, [srcq+src_strideq*2] + sub rowsd, 2 + jg .loop_16 + RET + +%macro loop_h 0 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m1, [predq] + movh m3, [predq+pred_str] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + psubw m0, m1 + psubw m2, m3 + mova [diffq], m0 + mova [diffq+diff_strideq*2], m2 +%endmacro + +.case_8: + mov pred_str, pred_stridemp +.loop_8: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_8 + RET + +INIT_MMX +.case_4: + mov 
pred_str, pred_stridemp
+.loop_4:
+  loop_h
+  lea diffq, [diffq+diff_strideq*4]
+  lea srcq, [srcq+src_strideq*2]
+  lea predq, [predq+pred_str*2]
+  sub rowsd, 2
+  jg .loop_4
+  RET
diff --git a/libs/libaom/src/aom_dsp/x86/sum_squares_avx2.c b/libs/libaom/src/aom_dsp/x86/sum_squares_avx2.c
new file mode 100644
index 000000000..97d78b684
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/sum_squares_avx2.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
+                                                int width, int height) {
+  uint64_t result;
+  __m256i v_acc_q = _mm256_setzero_si256();
+  const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff);
+  for (int col = 0; col < height; col += 4) {
+    __m256i v_acc_d = _mm256_setzero_si256();
+    for (int row = 0; row < width; row += 16) {
+      const int16_t *tempsrc = src + row;
+      const __m256i v_val_0_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+      const __m256i v_val_1_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+      const __m256i v_val_2_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+      const __m256i v_val_3_w =
+          _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+
+      const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+      const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+      const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+      const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+
+      const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+      const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+      const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);
+
+      v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
+    }
+    v_acc_q =
+        _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
+    v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
+    src += 4 * stride;
+  }
+  __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
+  __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
+  __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);
+
+  result_64_2_int = _mm_add_epi64(
+      result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));
+
+  xx_storel_64(&result, result_64_2_int);
+
+  return result;
+}
+
+uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
+                                     int height) {
+  if (LIKELY(width == 4 && height == 4)) {
+    return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+  } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+    return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+  } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+    return
aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} + +// Accumulate sum of 16-bit elements in the vector +static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) { + __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); + __m128i vtmp2 = _mm256_castsi256_si128(vec_a); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 8); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 4); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 2); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + return _mm_extract_epi16(vtmp1, 0); +} + +// Accumulate sum of 32-bit elements in the vector +static AOM_INLINE int32_t mm256_accumulate_epi32(__m256i vec_a) { + __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); + __m128i vtmp2 = _mm256_castsi256_si128(vec_a); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 8); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 4); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + return _mm_cvtsi128_si32(vtmp1); +} + +uint64_t aom_var_2d_u8_avx2(uint8_t *src, int src_stride, int width, + int height) { + uint8_t *srcp; + uint64_t s = 0, ss = 0; + __m256i vzero = _mm256_setzero_si256(); + __m256i v_acc_sum = vzero; + __m256i v_acc_sqs = vzero; + int i, j; + + // Process 32 elements in a row + for (i = 0; i < width - 31; i += 32) { + srcp = src + i; + // Process 8 columns at a time + for (j = 0; j < height - 7; j += 8) { + __m256i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm256_loadu_si256((__m256i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc[k], vzero); + __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc[k], vzero); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0); + __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1); + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi16(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp); + __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc, vzero); + __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc, vzero); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0); + __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1); + + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi16(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = src; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint8_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} + +uint64_t aom_var_2d_u16_avx2(uint8_t *src, int src_stride, int width, + int height) { + uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp; + uint64_t s = 0, ss = 0; + __m256i vzero = _mm256_setzero_si256(); + __m256i v_acc_sum = vzero; + __m256i 
v_acc_sqs = vzero; + int i, j; + + // Process 16 elements in a row + for (i = 0; i < width - 15; i += 16) { + srcp = srcp1 + i; + // Process 8 columns at a time + for (j = 0; j < height - 8; j += 8) { + __m256i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm256_loadu_si256((__m256i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc[k], vzero); + __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc[k], vzero); + v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc[k], vsrc[k]); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi32(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp); + __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc, vzero); + __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc, vzero); + v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc, vsrc); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi32(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = srcp1; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint16_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} diff --git a/libs/libaom/src/aom_dsp/x86/sum_squares_sse2.c b/libs/libaom/src/aom_dsp/x86/sum_squares_sse2.c new file mode 100644 index 000000000..85b301a88 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/sum_squares_sse2.c @@ -0,0 +1,366 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
+  const __m128d ad = _mm_castsi128_pd(a);
+  return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b));
+}
+
+static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
+#if ARCH_X86_64
+  return (uint64_t)_mm_cvtsi128_si64(a);
+#else
+  {
+    uint64_t tmp;
+    _mm_storel_epi64((__m128i *)&tmp, a);
+    return tmp;
+  }
+#endif
+}
+
+static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src,
+                                               int stride) {
+  const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+  const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+  const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+  const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+  const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+  const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+
+  return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+}
+
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
+  const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
+  __m128i v_sum_d =
+      _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
+  v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8));
+  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
+}
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+                                         int height) {
+  int r = 0;
+  __m128i v_acc_q = _mm_setzero_si128();
+  do {
+    const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride);
+    v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d);
+    src += stride << 2;
+    r += 4;
+  } while (r < height);
+  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
+  __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
+                                   _mm_and_si128(v_acc_q, v_zext_mask_q));
+  v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
+  return xx_cvtsi128_si64(v_acc_64);
+}
+
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// aom_sum_squares_2d_i16_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
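+// (Inlining the nxn body would otherwise set up its larger stack frame and
+// register spills on every call, including the early-exit 4x4 path.)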
+__attribute__((noinline)) +#endif +uint64_t +aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, + int height) { + int r = 0; + + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); + __m128i v_acc_q = _mm_setzero_si128(); + + do { + __m128i v_acc_d = _mm_setzero_si128(); + int c = 0; + do { + const int16_t *b = src + c; + + const __m128i v_val_0_w = xx_load_128(b + 0 * stride); + const __m128i v_val_1_w = xx_load_128(b + 1 * stride); + const __m128i v_val_2_w = xx_load_128(b + 2 * stride); + const __m128i v_val_3_w = xx_load_128(b + 3 * stride); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + c += 8; + } while (c < width); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 4 * stride; + r += 4; + } while (r < height); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + return xx_cvtsi128_si64(v_acc_q); +} + +uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, + int height) { + // 4 elements per row only requires half an XMM register, so this + // must be a special case, but also note that over 75% of all calls + // are with size == 4, so it is also the common case. + if (LIKELY(width == 4 && height == 4)) { + return aom_sum_squares_2d_i16_4x4_sse2(src, stride); + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); + } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { + // Generic case + return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} + +////////////////////////////////////////////////////////////////////////////// +// 1D version +////////////////////////////////////////////////////////////////////////////// + +static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); + __m128i v_acc0_q = _mm_setzero_si128(); + __m128i v_acc1_q = _mm_setzero_si128(); + + const int16_t *const end = src + n; + + assert(n % 64 == 0); + + while (src < end) { + const __m128i v_val_0_w = xx_load_128(src); + const __m128i v_val_1_w = xx_load_128(src + 8); + const __m128i v_val_2_w = xx_load_128(src + 16); + const __m128i v_val_3_w = xx_load_128(src + 24); + const __m128i v_val_4_w = xx_load_128(src + 32); + const __m128i v_val_5_w = xx_load_128(src + 40); + const __m128i v_val_6_w = xx_load_128(src + 48); + const __m128i v_val_7_w = xx_load_128(src + 56); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i 
v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d); + + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q)); + v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32)); + + src += 64; + } + + v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q); + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); + return xx_cvtsi128_si64(v_acc0_q); +} + +uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { + if (n % 64 == 0) { + return aom_sum_squares_i16_64n_sse2(src, n); + } else if (n > 64) { + int k = n & ~(64 - 1); + return aom_sum_squares_i16_64n_sse2(src, k) + + aom_sum_squares_i16_c(src + k, n - k); + } else { + return aom_sum_squares_i16_c(src, n); + } +} + +// Accumulate sum of 16-bit elements in the vector +static AOM_INLINE int32_t mm_accumulate_epi16(__m128i vec_a) { + __m128i vtmp = _mm_srli_si128(vec_a, 8); + vec_a = _mm_add_epi16(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 4); + vec_a = _mm_add_epi16(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 2); + vec_a = _mm_add_epi16(vec_a, vtmp); + return _mm_extract_epi16(vec_a, 0); +} + +// Accumulate sum of 32-bit elements in the vector +static AOM_INLINE int32_t mm_accumulate_epi32(__m128i vec_a) { + __m128i vtmp = _mm_srli_si128(vec_a, 8); + vec_a = _mm_add_epi32(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 4); + vec_a = _mm_add_epi32(vec_a, vtmp); + return _mm_cvtsi128_si32(vec_a); +} + +uint64_t aom_var_2d_u8_sse2(uint8_t *src, int src_stride, int width, + int height) { + uint8_t *srcp; + uint64_t s = 0, ss = 0; + __m128i vzero = _mm_setzero_si128(); + __m128i v_acc_sum = vzero; + __m128i v_acc_sqs = vzero; + int i, j; + + // Process 16 elements in a row + for (i = 0; i < width - 15; i += 16) { + srcp = src + i; + // Process 8 columns at a time + for (j = 0; j < height - 7; j += 8) { + __m128i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm_loadu_si128((__m128i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m128i vsrc0 = _mm_unpacklo_epi8(vsrc[k], vzero); + __m128i vsrc1 = _mm_unpackhi_epi8(vsrc[k], vzero); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1); + + __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0); + __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1); + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi16(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m128i vsrc = _mm_loadu_si128((__m128i *)srcp); + __m128i vsrc0 = _mm_unpacklo_epi8(vsrc, vzero); + __m128i vsrc1 = _mm_unpackhi_epi8(vsrc, vzero); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1); + + __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0); + __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1); + + srcp += src_stride; + } 
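+    // Flush the 16-bit lanes one last time: the remainder loop above adds at
+    // most 7 rows (2 bytes per lane per row, <= 14 * 255), so no lane can
+    // have overflowed its int16_t range.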
+ + // Update total sum and clear the vectors + s += mm_accumulate_epi16(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = src; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint8_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} + +uint64_t aom_var_2d_u16_sse2(uint8_t *src, int src_stride, int width, + int height) { + uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp; + uint64_t s = 0, ss = 0; + __m128i vzero = _mm_setzero_si128(); + __m128i v_acc_sum = vzero; + __m128i v_acc_sqs = vzero; + int i, j; + + // Process 8 elements in a row + for (i = 0; i < width - 8; i += 8) { + srcp = srcp1 + i; + // Process 8 columns at a time + for (j = 0; j < height - 8; j += 8) { + __m128i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm_loadu_si128((__m128i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m128i vsrc0 = _mm_unpacklo_epi16(vsrc[k], vzero); + __m128i vsrc1 = _mm_unpackhi_epi16(vsrc[k], vzero); + v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum); + + __m128i vsqs0 = _mm_madd_epi16(vsrc[k], vsrc[k]); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi32(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m128i vsrc = _mm_loadu_si128((__m128i *)srcp); + __m128i vsrc0 = _mm_unpacklo_epi16(vsrc, vzero); + __m128i vsrc1 = _mm_unpackhi_epi16(vsrc, vzero); + v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum); + + __m128i vsqs0 = _mm_madd_epi16(vsrc, vsrc); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi32(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = srcp1; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint16_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} diff --git a/libs/libaom/src/aom_dsp/x86/sum_squares_sse2.h b/libs/libaom/src/aom_dsp/x86/sum_squares_sse2.h new file mode 100644 index 000000000..491e31cc5 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/sum_squares_sse2.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+
+uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
+                                         int width, int height);
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+                                         int height);
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
+
+#endif  // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/libs/libaom/src/aom_dsp/x86/synonyms.h b/libs/libaom/src/aom_dsp/x86/synonyms.h
new file mode 100644
index 000000000..2e99bee3e
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/synonyms.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_H_
+
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m128i xx_loadl_32(const void *a) {
+  int val;
+  memcpy(&val, a, sizeof(val));
+  return _mm_cvtsi32_si128(val);
+}
+
+static INLINE __m128i xx_loadl_64(const void *a) {
+  return _mm_loadl_epi64((const __m128i *)a);
+}
+
+static INLINE __m128i xx_load_128(const void *a) {
+  return _mm_load_si128((const __m128i *)a);
+}
+
+static INLINE __m128i xx_loadu_128(const void *a) {
+  return _mm_loadu_si128((const __m128i *)a);
+}
+
+static INLINE void xx_storel_32(void *const a, const __m128i v) {
+  const int val = _mm_cvtsi128_si32(v);
+  memcpy(a, &val, sizeof(val));
+}
+
+static INLINE void xx_storel_64(void *const a, const __m128i v) {
+  _mm_storel_epi64((__m128i *)a, v);
+}
+
+static INLINE void xx_store_128(void *const a, const __m128i v) {
+  _mm_store_si128((__m128i *)a, v);
+}
+
+static INLINE void xx_storeu_128(void *const a, const __m128i v) {
+  _mm_storeu_si128((__m128i *)a, v);
+}
+
+// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set_epi64x()
+// acting on 32-bit integers.
+static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  return _mm_set_epi32(0, e1, 0, e0);
+#else
+  return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
+#endif
+}
+
+// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set1_epi64x()
+// acting on a 32-bit integer.
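+// For example, xx_set1_64_from_32i(0xffffffff) yields 0x00000000ffffffff in
+// both 64-bit lanes, the zero-extension mask used by the sum-of-squares
+// kernels above.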
+static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+  return _mm_set_epi32(0, a, 0, a);
+#else
+  return _mm_set1_epi64x((uint32_t)a);
+#endif
+}
+
+static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
+  return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
+  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
+  const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
+  const __m128i v_tmp_d =
+      _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d);
+  return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+#endif  // AOM_AOM_DSP_X86_SYNONYMS_H_
diff --git a/libs/libaom/src/aom_dsp/x86/synonyms_avx2.h b/libs/libaom/src/aom_dsp/x86/synonyms_avx2.h
new file mode 100644
index 000000000..4d6ee6ad6
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/synonyms_avx2.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m256i yy_load_256(const void *a) {
+  return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE __m256i yy_loadu_256(const void *a) {
+  return _mm256_loadu_si256((const __m256i *)a);
+}
+
+static INLINE void yy_store_256(void *const a, const __m256i v) {
+  _mm256_store_si256((__m256i *)a, v);
+}
+
+static INLINE void yy_storeu_256(void *const a, const __m256i v) {
+  _mm256_storeu_si256((__m256i *)a, v);
+}
+
+// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm256_set1_epi64x()
+// acting on a 32-bit integer.
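+// The fallback below is only taken for 32-bit MSVC builds older than
+// VS 2015 (_MSC_VER < 1900); all other configurations use the intrinsic.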
+static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+  return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
+#else
+  return _mm256_set1_epi64x((uint32_t)a);
+#endif
+}
+
+// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
+// therefore define an equivalent function using a different intrinsic.
+// ([ hi ], [ lo ]) -> [ hi ][ lo ]
+static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+  __m128i mhi = _mm_loadu_si128((__m128i *)(hi));
+  __m128i mlo = _mm_loadu_si128((__m128i *)(lo));
+  return yy_set_m128i(mhi, mlo);
+}
+
+static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
+  _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
+  _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
+}
+
+static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
+  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
+  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
+}
+#endif  // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/libs/libaom/src/aom_dsp/x86/transpose_sse2.h b/libs/libaom/src/aom_dsp/x86/transpose_sse2.h
new file mode 100644
index 000000000..7ac692c78
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/transpose_sse2.h
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+  // Unpack 8 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // to:
+  // a0: 00 10 01 11 02 12 03 13
+  // a1: 20 30 21 31 22 32 23 33
+  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+  // Unpack 16 bit elements resulting in:
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  return _mm_unpacklo_epi16(a0, a1);
+}
+
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+                                      __m128i *const out) {
+  // Unpack 8 bit elements.
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // Unpack 16 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // Unpack 32 bit elements resulting in: + // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const __m128i c0 = _mm_unpacklo_epi32(b0, b2); + const __m128i c1 = _mm_unpackhi_epi32(b0, b2); + const __m128i c2 = _mm_unpacklo_epi32(b1, b3); + const __m128i c3 = _mm_unpackhi_epi32(b1, b3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(c0, c0); + out[1] = _mm_unpackhi_epi64(c0, c0); + out[2] = _mm_unpacklo_epi64(c1, c1); + out[3] = _mm_unpackhi_epi64(c1, c1); + out[4] = _mm_unpacklo_epi64(c2, c2); + out[5] = _mm_unpackhi_epi64(c2, c2); + out[6] = _mm_unpacklo_epi64(c3, c3); + out[7] = _mm_unpackhi_epi64(c3, c3); +} + +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi32(a0, a1); + out[1] = _mm_srli_si128(out[0], 8); + out[2] = _mm_unpackhi_epi32(a0, a1); + out[3] = _mm_srli_si128(out[2], 8); +} + +static INLINE void transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +static INLINE void transpose_16bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b2: 04 14 24 34 05 15 25 35 + // b4: 02 12 22 32 03 13 23 33 + // b6: 06 16 26 36 07 17 27 37 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 XX XX XX XX + // out[1]: 01 11 21 31 XX XX XX XX + // out[2]: 02 12 22 32 XX XX XX XX + // out[3]: 03 13 23 33 XX XX XX XX + // out[4]: 04 14 24 34 XX XX XX XX + // out[5]: 05 15 25 35 XX XX XX XX + // out[6]: 06 16 26 36 XX XX XX XX + // out[7]: 07 17 27 37 XX XX XX XX + const __m128i zeros = _mm_setzero_si128(); + out[0] = _mm_unpacklo_epi64(b0, zeros); + out[1] = _mm_unpackhi_epi64(b0, zeros); + out[2] = _mm_unpacklo_epi64(b4, zeros); + out[3] = _mm_unpackhi_epi64(b4, zeros); + out[4] = _mm_unpacklo_epi64(b2, zeros); + out[5] = _mm_unpackhi_epi64(b2, zeros); + out[6] = _mm_unpacklo_epi64(b6, zeros); + out[7] = _mm_unpackhi_epi64(b6, zeros); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +// Transpose in-place +static INLINE void transpose_16bit_16x16(__m128i *const left, + __m128i *const right) { + __m128i tbuf[8]; + transpose_16bit_8x8(left, left); + transpose_16bit_8x8(right, tbuf); + transpose_16bit_8x8(left + 8, right); + transpose_16bit_8x8(right + 8, right + 8); + + left[8] = tbuf[0]; + left[9] = tbuf[1]; + left[10] = tbuf[2]; + left[11] = tbuf[3]; + left[12] = tbuf[4]; + left[13] = tbuf[5]; + left[14] = tbuf[6]; + left[15] = tbuf[7]; +} + +static INLINE void transpose_32bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); +} + +static INLINE void transpose_32bit_4x4x2(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // in[4]: 04 05 06 07 + // in[5]: 14 15 16 17 + // in[6]: 24 25 26 27 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); + const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +static INLINE void transpose_32bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 04 05 06 07 + // in[2]: 10 11 12 13 + // in[3]: 14 15 16 17 + // in[4]: 20 21 22 23 + // in[5]: 24 25 26 27 + // in[6]: 30 31 32 33 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); + const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); + const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); + const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); + const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); + const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/txfm_common_avx2.h b/libs/libaom/src/aom_dsp/x86/txfm_common_avx2.h new file mode 100644 index 000000000..ea57c9f35 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/txfm_common_avx2.h @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+
+#include <immintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
+  return _mm256_set1_epi32(
+      (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
+static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
+                                   __m256i *in0, __m256i *in1, const __m256i _r,
+                                   const int32_t cos_bit) {
+  __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
+  __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
+  __m256i u0 = _mm256_madd_epi16(t0, w0);
+  __m256i u1 = _mm256_madd_epi16(t1, w0);
+  __m256i v0 = _mm256_madd_epi16(t0, w1);
+  __m256i v1 = _mm256_madd_epi16(t1, w1);
+
+  __m256i a0 = _mm256_add_epi32(u0, _r);
+  __m256i a1 = _mm256_add_epi32(u1, _r);
+  __m256i b0 = _mm256_add_epi32(v0, _r);
+  __m256i b1 = _mm256_add_epi32(v1, _r);
+
+  __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
+  __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
+  __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
+  __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
+
+  *in0 = _mm256_packs_epi32(c0, c1);
+  *in1 = _mm256_packs_epi32(d0, d1);
+}
+
+static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
+  const __m256i _in0 = *in0;
+  const __m256i _in1 = *in1;
+  *in0 = _mm256_adds_epi16(_in0, _in1);
+  *in1 = _mm256_subs_epi16(_in0, _in1);
+}
+
+static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) {
+  const __m256i _in0 = *in0;
+  const __m256i _in1 = *in1;
+  *in0 = _mm256_add_epi32(_in0, _in1);
+  *in1 = _mm256_sub_epi32(_in0, _in1);
+}
+
+static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1,
+                                             __m256i in0, __m256i in1) {
+  const __m256i _in0 = in0;
+  const __m256i _in1 = in1;
+  *out0 = _mm256_adds_epi16(_in0, _in1);
+  *out1 = _mm256_subs_epi16(_in0, _in1);
+}
+
+static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1,
+                                           __m256i in0, __m256i in1) {
+  const __m256i _in0 = in0;
+  const __m256i _in1 = in1;
+  *out0 = _mm256_add_epi32(_in0, _in1);
+  *out1 = _mm256_sub_epi32(_in0, _in1);
+}
+
+static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) {
+  return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+                                                   int stride, __m256i *out,
+                                                   int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = load_16bit_to_16bit_avx2(in + i * stride);
+  }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in,
+                                                        int stride,
+                                                        __m256i *out,
+                                                        int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride);
+  }
+}
+
+static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
+  const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
+  const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
+  return _mm256_permute4x64_epi64(b, 0xD8);
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
+                                                       int stride, __m256i *out,
+                                                       int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
+  }
+}
+
+static INLINE void transpose2_8x8_avx2(const __m256i *const in,
+                                       __m256i *const out) {
+  __m256i t[16], u[16];
+  // (1st, 2nd) ==> (lo, hi)
+  //   (0, 1)   ==>  (0, 1)
+  //   (2, 3)   ==>  (2, 3)
+  //   (4, 5)   ==>  (4, 5)
+  //   (6, 7)   ==>  (6, 7)
+  for (int i = 0; i < 4; i++) {
+    t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+    t[2 * i +
1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (int i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (int i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); + +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in, + __m256i *const out) { + const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]); + const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]); + const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]); + const __m256i a3 = _mm256_unpacklo_epi16(in[6], in[7]); + const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]); + const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]); + const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]); + const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]); + + const __m256i b0 = _mm256_unpacklo_epi32(a0, a1); + const __m256i b1 = _mm256_unpacklo_epi32(a2, a3); + const __m256i b2 = _mm256_unpacklo_epi32(a4, a5); + const __m256i b3 = _mm256_unpacklo_epi32(a6, a7); + const __m256i b4 = _mm256_unpackhi_epi32(a0, a1); + const __m256i b5 = _mm256_unpackhi_epi32(a2, a3); + const __m256i b6 = _mm256_unpackhi_epi32(a4, a5); + const __m256i b7 = _mm256_unpackhi_epi32(a6, a7); + + out[0] = _mm256_unpacklo_epi64(b0, b1); + out[1] = _mm256_unpackhi_epi64(b0, b1); + out[2] = _mm256_unpacklo_epi64(b4, b5); + out[3] = _mm256_unpackhi_epi64(b4, b5); + out[4] = _mm256_unpacklo_epi64(b2, b3); + out[5] = _mm256_unpackhi_epi64(b2, b3); + out[6] = _mm256_unpacklo_epi64(b6, b7); + out[7] = _mm256_unpackhi_epi64(b6, b7); +} + +static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { + if (bit < 0) { + bit = -bit; + __m256i round = _mm256_set1_epi16(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[i] = _mm256_adds_epi16(in[i], round); + in[i] = _mm256_srai_epi16(in[i], bit); + } + } else if (bit > 0) { + for 
(int i = 0; i < size; ++i) { + in[i] = _mm256_slli_epi16(in[i], bit); + } + } +} + +static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { + __m256i tmp, round; + round = _mm256_set1_epi32(1 << (bit - 1)); + tmp = _mm256_add_epi32(vec, round); + return _mm256_srai_epi32(tmp, bit); +} + +static INLINE void av1_round_shift_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_avx2(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm256_slli_epi32(input[i], -bit); + } + } +} + +static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit, + const int val) { + const __m256i sqrt2 = _mm256_set1_epi32(val); + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = av1_round_shift_32_avx2(input[i], bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits); + } + } else { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = _mm256_slli_epi32(input[i], -bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits); + } + } +} + +static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) { + const __m256i scale_rounding = + pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m256i b = _mm256_madd_epi16(a, scale_rounding); + return _mm256_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a, + int32_t *const b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, one); + const __m256i a_hi = _mm256_unpackhi_epi16(a, one); + const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); + const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31); + _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(b_lo)); + _mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi)); + _mm256_store_si256((__m256i *)(b + 64), temp); +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2( + const __m256i *const in, int32_t *const out, const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride); + } +} + +static INLINE void pack_reg(const __m128i *in1, const __m128i *in2, + __m256i *out) { + out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1); + out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1); + out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1); + out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1); + out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1); + out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1); + out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1); + out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1); +} + +static INLINE void extract_reg(const __m256i *in, __m128i *out1) { + out1[0] = _mm256_castsi256_si128(in[0]); + out1[1] = _mm256_castsi256_si128(in[1]); + out1[2] = _mm256_castsi256_si128(in[2]); + out1[3] = _mm256_castsi256_si128(in[3]); + out1[4] = _mm256_castsi256_si128(in[4]); + out1[5] = _mm256_castsi256_si128(in[5]); + out1[6] = 
_mm256_castsi256_si128(in[6]); + out1[7] = _mm256_castsi256_si128(in[7]); + + out1[8] = _mm256_extracti128_si256(in[0], 0x01); + out1[9] = _mm256_extracti128_si256(in[1], 0x01); + out1[10] = _mm256_extracti128_si256(in[2], 0x01); + out1[11] = _mm256_extracti128_si256(in[3], 0x01); + out1[12] = _mm256_extracti128_si256(in[4], 0x01); + out1[13] = _mm256_extracti128_si256(in[5], 0x01); + out1[14] = _mm256_extracti128_si256(in[6], 0x01); + out1[15] = _mm256_extracti128_si256(in[7], 0x01); +} + +#ifdef __cplusplus +} +#endif + +#endif  // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/txfm_common_sse2.h b/libs/libaom/src/aom_dsp/x86/txfm_common_sse2.h new file mode 100644 index 000000000..9c99eb93b --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/txfm_common_sse2.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ + +#include <emmintrin.h> +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#define pair_set_epi16(a, b) \ + _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))) + +// Reverse the 8 16 bit words in __m128i +static INLINE __m128i mm_reverse_epi16(const __m128i x) { + const __m128i a = _mm_shufflelo_epi16(x, 0x1b); + const __m128i b = _mm_shufflehi_epi16(a, 0x1b); + return _mm_shuffle_epi32(b, 0x4e); +} + +#define octa_set_epi16(a, b, c, d, e, f, g, h) \ + _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ + (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) + +#endif  // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/libs/libaom/src/aom_dsp/x86/variance_avx2.c b/libs/libaom/src/aom_dsp/x86/variance_avx2.c new file mode 100644 index 000000000..c4919ba9b --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/variance_avx2.c @@ -0,0 +1,526 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/masked_variance_intrin_ssse3.h" + +static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) { + return _mm_add_epi16(_mm256_castsi256_si128(val), + _mm256_extractf128_si256(val, 1)); +} + +static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) { + return _mm_add_epi32(_mm256_castsi256_si128(val), + _mm256_extractf128_si256(val, 1)); +} + +static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i adj_sub = _mm256_set1_epi16((short)0xff01); // (1,-1) + + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); + + // subtract adjacent elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + + // add to the running totals + *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); + *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); +} + +static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + return _mm_extract_epi32(res, 1); +} + +// handle pixels (<= 512) +static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); + const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64); + return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse); +} + +// handle 1024 pixels (32x32, 16x64, 64x16) +static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); + const __m128i vsum_64 = + _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), + _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); + return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse); +} + +static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { + const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); + const __m256i sum_hi = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); + return _mm256_add_epi32(sum_lo, sum_hi); +} + +// handle 2048 pixels (32x64, 64x32) +static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + vsum = sum_to_32bit_avx2(vsum); + const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); + return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); +} + +static INLINE void variance16_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int 
ref_stride, __m256i *const sse, __m256i *const sum) { + const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); + const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); + const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance32_kernel_avx2(const uint8_t *const src, + const uint8_t *const ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i s = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i += 2) { + variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + +static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src, ref, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance128_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum); + variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum); + src += src_stride; + ref += ref_stride; + } +} + +#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \ + unsigned int aom_variance##bw##x##bh##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m256i vsse = _mm256_setzero_si256(); \ + __m256i vsum; \ + variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ + const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512); +AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512); +AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512); +AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512); +AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024); + +AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512); +AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512); +AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024); +AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048); + +AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024); +AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048); + +#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \ + 
unsigned int aom_variance##bw##x##bh##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m256i vsse = _mm256_setzero_si256(); \ + __m256i vsum = _mm256_setzero_si256(); \ + for (int i = 0; i < (bh / uh); i++) { \ + __m256i vsum16; \ + variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \ + &vsum16); \ + vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \ + src += uh * src_stride; \ + ref += uh * ref_stride; \ + } \ + const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \ + const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \ + return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_LOOP_AVX2(64, 64, 12, 32); // 64x32 * ( 64/32) +AOM_VAR_LOOP_AVX2(64, 128, 13, 32); // 64x32 * (128/32) +AOM_VAR_LOOP_AVX2(128, 64, 13, 16); // 128x16 * ( 64/16) +AOM_VAR_LOOP_AVX2(128, 128, 14, 16); // 128x16 * (128/16) + +unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse); +unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse); + +unsigned int aom_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sseptr); + +#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } + +AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7); +AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6); +AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7); +AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6); +AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5); +AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6); +AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5); +AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4); +AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6); +AOM_SUB_PIXEL_VAR_AVX2(16, 32, 16, 4, 5); +AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4); +AOM_SUB_PIXEL_VAR_AVX2(16, 8, 16, 4, 3); +AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2); + +#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by 
capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } + +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7); +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4); + +static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); +} + +static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); +} + +static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, + const __m256i a, + uint8_t *comp_pred) { + const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS; + const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits)); + + const __m256i ma = _mm256_sub_epi8(alpha_max, a); + + const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1); + const __m256i aaAL = _mm256_unpacklo_epi8(a, ma); + const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1); + const __m256i aaAH = _mm256_unpackhi_epi8(a, ma); + + const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL); + const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH); + const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset); + const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset); + + const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH); + _mm256_storeu_si256((__m256i *)(comp_pred), roundA); +} + +void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const uint8_t *mask, int mask_stride, + int invert_mask) { + int i = 0; + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width; + if (width == 8) { + comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, + mask, mask_stride); + } else if (width == 16) { + do { + const __m256i sA0 = mm256_loadu2(src0 + stride0, src0); + const __m256i sA1 = mm256_loadu2(src1 + stride1, src1); + const __m256i aA = mm256_loadu2(mask + mask_stride, mask); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + const __m256i sB0 = mm256_loadu2(src0 + stride0, src0); + const __m256i sB1 = mm256_loadu2(src1 + stride1, src1); + const __m256i aB = mm256_loadu2(mask + mask_stride, mask); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + // comp_pred's stride == width == 16 + comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); + comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); + comp_pred += (16 << 2); + i += 4; + } while (i < height); + } else { // for width == 32 + do { + const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0)); + const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1)); + const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask)); + + const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0)); + const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1)); + const __m256i aB = + _mm256_lddqu_si256((const __m256i *)(mask + mask_stride)); + + comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); + comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); + comp_pred += (32 << 1); + + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); + } +} + +static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, + const __m256i s1, + const __m256i a) { + const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i a_inv = _mm256_sub_epi16(alpha_max, a); + + const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv); + const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo); + const __m256i pred_l = _mm256_srai_epi32( + _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1); + const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv); + const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi); + const __m256i pred_h = _mm256_srai_epi32( + _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i comp = _mm256_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width; + const __m256i zero = _mm256_setzero_si256(); + + if (width == 8) { + do { + const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0); + const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1); + + const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask); + const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8)); + + __m256i m = _mm256_castsi128_si256(m_l); + m = _mm256_insertf128_si256(m, m_h, 1); + const __m256i m_16 = _mm256_unpacklo_epi8(m, zero); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp)); + + _mm_storeu_si128((__m128i *)(comp_pred + width), + _mm256_extractf128_si256(comp, 1)); + + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + comp_pred += (width << 1); + i += 2; + } while (i < height); + } else if (width == 16) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1)); + const __m256i m_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 32) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16)); + + const __m256i m01_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + const __m256i m23_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16))); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16); + const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } +} diff --git a/libs/libaom/src/aom_dsp/x86/variance_impl_avx2.c b/libs/libaom/src/aom_dsp/x86/variance_impl_avx2.c new file mode 100644 index 000000000..f779270ae --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/variance_impl_avx2.c @@ -0,0 +1,814 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h>  // AVX2 + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" + +/* clang-format off */ +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, +}; +/* clang-format on */ + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + +#define MERGE_WITH_SRC(src_reg, reg) \ + exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ + exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); + +#define LOAD_SRC_DST \ + /* load source and destination */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); + +#define AVG_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + +#define MERGE_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ + MERGE_WITH_SRC(src_reg, src_next_reg) + +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* calculate sum */ \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ + /* calculate sse */ \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + +// final calculation to sum and sse +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, 
sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + +// Functions related to sub pixel variance width 16 +#define LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ + /* load source and destination of 2 rows and insert*/ \ + src_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \ + _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \ + dst_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \ + _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1); + +#define AVG_NEXT_SRC_INSERT(src_reg, size_stride) \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \ + _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1); \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + +#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride) \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \ + _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1); \ + MERGE_WITH_SRC(src_reg, src_next_reg) + +#define LOAD_SRC_NEXT_BYTE_INSERT \ + /* load source and another source from next row */ \ + src_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \ + _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \ + /* load source and next row source from 1 byte onwards */ \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \ + _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1); + +#define LOAD_DST_INSERT \ + dst_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \ + _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1); + +#define LOAD_SRC_MERGE_128BIT(filter) \ + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \ + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \ + __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); \ + __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); \ + __m128i filter_128bit = _mm256_castsi256_si128(filter); \ + __m128i pw8_128bit = _mm256_castsi256_si128(pw8); + +#define FILTER_SRC_128BIT(filter) \ + /* filter the source */ \ + src_lo = _mm_maddubs_epi16(src_lo, filter); \ + src_hi = _mm_maddubs_epi16(src_hi, filter); \ + \ + /* add 8 to source */ \ + src_lo = _mm_add_epi16(src_lo, pw8_128bit); \ + src_hi = _mm_add_epi16(src_hi, pw8_128bit); \ + \ + /* divide source by 16 */ \ + src_lo = _mm_srai_epi16(src_lo, 4); \ + src_hi = _mm_srai_epi16(src_hi, 4); + +unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if 
(y_offset == 0) { + for (i = 0; i < height; i++) { + LOAD_SRC_DST + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, src_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, src_stride) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + } + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // average between previous average to current average + src_avg = _mm256_avg_epu8(src_avg, src_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + // save current source average + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + // x_offset = 4 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, src_avg; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + // save current source average + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + MERGE_WITH_SRC(src_avg, src_reg) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; 
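+ // horizontally filter the newly loaded row, then average it with the previous filtered row (the y_offset == 4 vertical step)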
+ LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // merge previous pack to current pack source + MERGE_WITH_SRC(src_pack, src_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + } + CALC_SUM_AND_SSE + _mm256_zeroupper(); + return sum; +} + +unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { + __m256i src_next_reg; + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + AVG_NEXT_SRC_INSERT(src_reg, src_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + MERGE_NEXT_SRC_INSERT(src_reg, src_stride) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + } + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + LOAD_DST_INSERT + /* average between current and next stride source */ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride 
<< 1); + } + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { + __m256i src_next_reg, src_avg, src_temp; + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); + src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); + src_temp = _mm256_avg_epu8(src_avg, src_temp); + LOAD_DST_INSERT + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_temp, zero_reg) + // save current source average + src_avg = src_next_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride << 1; + src += src_stride << 1; + } + // last 2 rows processing happens here + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); + src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); + src_next_reg = _mm256_permute2x128_si256( + src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); + LOAD_DST_INSERT + src_avg = _mm256_avg_epu8(src_avg, src_next_reg); + MERGE_WITH_SRC(src_avg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + } else { + // x_offset = 4 and y_offset = bilin interpolation + __m256i filter, pw8, src_next_reg, src_avg, src_temp; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); + src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); + LOAD_DST_INSERT + MERGE_WITH_SRC(src_avg, src_temp) + // save current source average + src_avg = src_next_reg; + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride << 1; + src += src_stride << 1; + } + // last 2 rows processing happens here + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); + src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); + src_next_reg = _mm256_permute2x128_si256( + src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); + LOAD_DST_INSERT + MERGE_WITH_SRC(src_avg, src_next_reg) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + MERGE_NEXT_SRC_INSERT(src_reg, 1) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + LOAD_DST_INSERT + 
MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_next_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + src += src_stride << 1; + dst += dst_stride << 1; + } + // last 2 rows processing happens here + LOAD_SRC_MERGE_128BIT(filter) + LOAD_DST_INSERT + FILTER_SRC_128BIT(filter_128bit) + src_reg_0 = _mm_packus_epi16(src_lo, src_hi); + src_next_reg = _mm256_permute2x128_si256( + src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_next_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + } else { + // x_offset = bilin interpolation and y_offset = bilin interpolation + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + LOAD_DST_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); + // average between previous pack to the current + MERGE_WITH_SRC(src_pack, src_next_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride << 1; + dst += dst_stride << 1; + } + // last 2 rows processing happens here + LOAD_SRC_MERGE_128BIT(xfilter) + LOAD_DST_INSERT + FILTER_SRC_128BIT(filter_128bit) + src_reg_0 = _mm_packus_epi16(src_lo, src_hi); + src_next_reg = _mm256_permute2x128_si256( + src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); + MERGE_WITH_SRC(src_pack, src_next_reg) + FILTER_SRC(yfilter) + CALC_SUM_SSE_INSIDE_LOOP + } + } + CALC_SUM_AND_SSE + _mm256_zeroupper(); + return sum; +} + +unsigned int aom_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sse) { + __m256i sec_reg; + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height; i++) { + LOAD_SRC_DST + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + } else if (y_offset == 8) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, src_stride) + 
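// blend the vertically averaged rows with the second predictor +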
sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, src_stride) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + } + // x_offset = 8 and y_offset = 0 + } else if (x_offset == 8) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 8 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + // save current source average + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // average between previous average to current average + src_avg = _mm256_avg_epu8(src_avg, src_reg); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_avg = _mm256_avg_epu8(src_avg, sec_reg); + sec += sec_stride; + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + // x_offset = 8 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, src_avg; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + // save current source average + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + MERGE_WITH_SRC(src_avg, src_reg) + FILTER_SRC(filter) + src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_avg = _mm256_avg_epu8(src_avg, sec_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + sec += sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + 
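// widen the blended bytes to 16 bits before accumulating sum and SSE +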
MERGE_WITH_SRC(src_reg, zero_reg) + sec += sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = 8 + } else if (y_offset == 8) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_pack = _mm256_avg_epu8(src_pack, sec_reg); + sec += sec_stride; + MERGE_WITH_SRC(src_pack, zero_reg) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // merge previous pack to current pack source + MERGE_WITH_SRC(src_pack, src_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_pack = _mm256_avg_epu8(src_pack, sec_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + src_pack = src_reg; + sec += sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + } + CALC_SUM_AND_SSE + _mm256_zeroupper(); + return sum; +} diff --git a/libs/libaom/src/aom_dsp/x86/variance_impl_ssse3.c b/libs/libaom/src/aom_dsp/x86/variance_impl_ssse3.c new file mode 100644 index 000000000..66b0d7d84 --- /dev/null +++ b/libs/libaom/src/aom_dsp/x86/variance_impl_ssse3.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
+  // in computation using _mm_maddubs_epi16.
+  // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
+  const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
+  const __m128i r = _mm_set1_epi16(round);
+  const uint8_t f0 = filter[0] >> 1;
+  const uint8_t f1 = filter[1] >> 1;
+  const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
+                                        f0, f1, f0, f1, f0, f1);
+  unsigned int i, j;
+  (void)pixel_step;
+
+  if (output_width >= 8) {
+    for (i = 0; i < output_height; ++i) {
+      for (j = 0; j < output_width; j += 8) {
+        // load source
+        __m128i source_low = xx_loadl_64(a);
+        __m128i source_hi = xx_loadl_64(a + 1);
+
+        // unpack to:
+        // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+        //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+        __m128i source = _mm_unpacklo_epi8(source_low, source_hi);
+
+        // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
+        __m128i res = _mm_maddubs_epi16(source, filters);
+
+        // round
+        res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+        xx_storeu_128(b, res);
+
+        a += 8;
+        b += 8;
+      }
+
+      a += src_pixels_per_line - output_width;
+    }
+  } else {
+    const __m128i shuffle_mask =
+        _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+    for (i = 0; i < output_height; ++i) {
+      // load the source; only the first 5 values are meaningful:
+      // { a[0], a[1], a[2], a[3], a[4], xxxx }
+      __m128i source = xx_loadl_64(a);
+
+      // shuffle; only the first 8 values are useful:
+      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+      res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+      xx_storel_64(b, res);
+
+      a += src_pixels_per_line;
+      b += output_width;
+    }
+  }
+}
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+    unsigned int pixel_step, unsigned int output_height,
+    unsigned int output_width, const uint8_t *filter) {
+  const int16_t round = (1 << FILTER_BITS) >> 1;
+  const __m128i r = _mm_set1_epi32(round);
+  const __m128i filters =
+      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+                     filter[1], filter[0], filter[1]);
+  const __m128i shuffle_mask =
+      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+  const __m128i mask =
+      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; j += 4) {
+      // load source as:
+      // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
+      __m128i source1 = xx_loadl_64(a);
+      __m128i source2 = xx_loadl_64(a + pixel_step);
+      __m128i source = _mm_unpacklo_epi64(source1, source2);
+
+      // shuffle source to:
+      // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
+      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+      // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
+      __m128i res = _mm_madd_epi16(source_shuffle, filters);
+
+      // round
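+      // i.e. res = (res + 64) >> 7 when FILTER_BITS == 7, which removes the
+      // remaining filter scaling with round-to-nearest behaviour.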
+      res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
+
+      // shuffle to collect the low byte of each 32-bit result
+      res = _mm_shuffle_epi8(res, mask);
+
+      xx_storel_32(b, res);
+
+      a += 4;
+      b += 4;
+    }
+
+    a += src_pixels_per_line - output_width;
+  }
+}
diff --git a/libs/libaom/src/aom_dsp/x86/variance_sse2.c b/libs/libaom/src/aom_dsp/x86/variance_sse2.c
new file mode 100644
index 000000000..4e2b5a1aa
--- /dev/null
+++ b/libs/libaom/src/aom_dsp/x86/variance_sse2.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
+
+unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
+  __m128i vsum = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 32; ++i) {
+    const __m128i v = xx_loadu_128(src);
+    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+    src += 8;
+  }
+
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+  return _mm_cvtsi128_si32(vsum);
+}
+
+static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
+  const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
+  const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
+  return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
+}
+
+static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
+  const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
+  return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
+}
+
+// Accumulate four 32-bit values in val into one 32-bit value
+static INLINE unsigned int add32x4_sse2(__m128i val) {
+  val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
+  val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
+  return _mm_cvtsi128_si32(val);
+}
+
+// Accumulate eight 16-bit values in sum into four 32-bit values
+static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
+  const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
+  const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
+  return _mm_add_epi32(sum_lo, sum_hi);
+}
+
+static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
+                                        __m128i *const sse,
+                                        __m128i *const sum) {
+  const __m128i diff = _mm_sub_epi16(src, ref);
+  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
+  *sum = _mm_add_epi16(*sum, diff);
+}
+
+// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
+// Slightly faster than variance_final_256_pel_sse2()
+// The diff sum of 128 pixels can still fit in a 16-bit integer
+static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
+                                               unsigned int *const sse,
+                                               int *const sum) {
+  *sse = add32x4_sse2(vsse);
+
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); +} + +// Can handle 256 pixels' diff sum (such as 16x16) +static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + *sum += (int16_t)_mm_extract_epi16(vsum, 1); +} + +// Can handle 512 pixels' diff sum (such as 16x32 or 32x16) +static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_unpacklo_epi16(vsum, vsum); + vsum = _mm_srai_epi32(vsum, 16); + *sum = add32x4_sse2(vsum); +} + +// Can handle 1024 pixels' diff sum (such as 32x32) +static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = sum_to_32bit_sse2(vsum); + *sum = add32x4_sse2(vsum); +} + +static INLINE void variance4_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 256); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; i += 2) { + const __m128i s = load4x2_sse2(src, src_stride); + const __m128i r = load4x2_sse2(ref, ref_stride); + + variance_kernel_sse2(s, r, sse, sum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + +static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 128); // May overflow for larger height. + *sum = _mm_setzero_si128(); + *sse = _mm_setzero_si128(); + for (int i = 0; i < h; i++) { + const __m128i s = load8_8to16_sse2(src); + const __m128i r = load8_8to16_sse2(ref); + + variance_kernel_sse2(s, r, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance16_kernel_sse2(const uint8_t *const src, + const uint8_t *const ref, + __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + + variance_kernel_sse2(src0, ref0, sse, sum); + variance_kernel_sse2(src1, ref1, sse, sum); +} + +static INLINE void variance16_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 64); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src, ref, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance32_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 32); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. 
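+  // (The AOM_VAR_LOOP_SSE2 callers further below pass the same vsse
+  // accumulator in repeatedly, one strip of rows per call, so only the
+  // per-strip 16-bit row sum is reset here.)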
+ *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src + 0, ref + 0, sse, sum); + variance16_kernel_sse2(src + 16, ref + 16, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance64_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 16); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src + 0, ref + 0, sse, sum); + variance16_kernel_sse2(src + 16, ref + 16, sse, sum); + variance16_kernel_sse2(src + 32, ref + 32, sse, sum); + variance16_kernel_sse2(src + 48, ref + 48, sse, sum); + src += src_stride; + ref += ref_stride; + } +} + +static INLINE void variance128_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 8); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < 4; ++j) { + const int offset0 = j << 5; + const int offset1 = offset0 + 16; + variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum); + variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum); + } + src += src_stride; + ref += ref_stride; + } +} + +void aom_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m128i vsse, vsum; + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, sum); +} + +#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \ + unsigned int aom_variance##bw##x##bh##_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m128i vsse = _mm_setzero_si128(); \ + __m128i vsum; \ + int sum = 0; \ + variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ + variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \ + assert(sum <= 255 * bw * bh); \ + assert(sum >= -255 * bw * bh); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128); +AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128); +AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128); + +AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128); +AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128); +AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128); +AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256); + +AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128); +AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128); +AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256); +AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512); +AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024); + +AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256); +AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512); +AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024); + +#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \ + unsigned int aom_variance##bw##x##bh##_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m128i vsse = _mm_setzero_si128(); \ + __m128i vsum = _mm_setzero_si128(); \ + for (int i = 0; i < (bh / uh); ++i) { \ + __m128i vsum16; \ + variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \ + &vsum16); \ + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \ + src += (src_stride * uh); \ + ref += (ref_stride * uh); \ + } \ + *sse = add32x4_sse2(vsse); \ + int sum = add32x4_sse2(vsum); \ + assert(sum <= 255 * bw * bh); \ + assert(sum >= -255 * bw * bh); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> 
bits); \ + } + +AOM_VAR_LOOP_SSE2(32, 64, 11, 32); // 32x32 * ( 64/32 ) + +AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024); +AOM_VAR_LOOP_SSE2(64, 32, 11, 16); // 64x16 * ( 32/16 ) +AOM_VAR_LOOP_SSE2(64, 64, 12, 16); // 64x16 * ( 64/16 ) +AOM_VAR_LOOP_SSE2(64, 128, 13, 16); // 64x16 * ( 128/16 ) + +AOM_VAR_LOOP_SSE2(128, 64, 13, 8); // 128x8 * ( 64/8 ) +AOM_VAR_LOOP_SSE2(128, 128, 14, 8); // 128x8 * ( 128/8 ) + +unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in subpel_variance.asm +#define DECL(w, opt) \ + int aom_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ + void *unused0, void *unused) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(sse2); +DECLS(ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, 
opt, (int32_t), (int32_t)); \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) + +FNS(sse2); +FNS(ssse3); + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. +#define DECL(w, opt) \ + int aom_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(sse2); +DECLS(ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) + +FNS(sse2); +FNS(ssse3); + +#undef FNS +#undef FN + +void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, int width, int height, + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref, int ref_stride, + int subpel_search) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + 
const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+  // TODO(yunqing): The 2-tap case uses 4-tap functions since there is no
+  // SIMD for 2-tap yet.
+  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
+
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    if (width >= 16) {
+      int i;
+      assert(!(width & 15));
+      /*Read 16 pixels one row at a time.*/
+      for (i = 0; i < height; i++) {
+        int j;
+        for (j = 0; j < width; j += 16) {
+          xx_storeu_128(comp_pred, xx_loadu_128(ref));
+          comp_pred += 16;
+          ref += 16;
+        }
+        ref += ref_stride - width;
+      }
+    } else if (width >= 8) {
+      int i;
+      assert(!(width & 7));
+      assert(!(height & 1));
+      /*Read 8 pixels two rows at a time.*/
+      for (i = 0; i < height; i += 2) {
+        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
+        comp_pred += 16;
+        ref += 2 * ref_stride;
+      }
+    } else {
+      int i;
+      assert(!(width & 3));
+      assert(!(height & 3));
+      /*Read 4 pixels four rows at a time.*/
+      for (i = 0; i < height; i += 4) {
+        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
+        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
+        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
+        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
+        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+                                               _mm_unpacklo_epi32(row2, row3));
+        xx_storeu_128(comp_pred, reg);
+        comp_pred += 16;
+        ref += 4 * ref_stride;
+      }
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+                        width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+                       width, height);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+    uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+                                    ?
temp + (filter_taps >> 1) * MAX_SB_SIZE + : temp; + uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); + int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE, + kernel_x, 16, NULL, -1, width, intermediate_height); + aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1, + kernel_y, 16, width, height); + } +} + +void aom_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, int subpel_search) { + int n; + int i; + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); + /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ + assert(!(width * height & 15)); + n = width * height >> 4; + for (i = 0; i < n; i++) { + __m128i s0 = xx_loadu_128(comp_pred); + __m128i p0 = xx_loadu_128(pred); + xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0)); + comp_pred += 16; + pred += 16; + } +} + +void aom_comp_mask_upsampled_pred_sse2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int subpel_search) { + if (subpel_x_q3 | subpel_y_q3) { + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + ref = comp_pred; + ref_stride = width; + } + aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); +} + +static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, + const __m128i s1, + const __m128i a) { + const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i a_inv = _mm_sub_epi16(alpha_max, a); + + const __m128i s_lo = _mm_unpacklo_epi16(s0, s1); + const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv); + const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo); + const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i s_hi = _mm_unpackhi_epi16(s0, s1); + const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv); + const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi); + const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i comp = _mm_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? 
ref_stride : width;
+  const __m128i zero = _mm_setzero_si128();
+
+  if (width == 8) {
+    do {
+      const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+      const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+      const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);
+      const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);
+
+      const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
+
+      _mm_storeu_si128((__m128i *)comp_pred, comp);
+
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  } else if (width == 16) {
+    do {
+      const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+      const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));
+      const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+      const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));
+
+      const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);
+      const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+      const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+      const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+      const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+      _mm_storeu_si128((__m128i *)comp_pred, comp);
+      _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
+
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  } else if (width == 32) {
+    do {
+      for (int j = 0; j < 2; j++) {
+        const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16));
+        const __m128i s2 =
+            _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16));
+        const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16));
+        const __m128i s3 =
+            _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16));
+
+        const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16));
+        const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+        const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+        const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+        const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+        _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
+        _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+      }
+      src0 += stride0;
+      src1 += stride1;
+      mask += mask_stride;
+      comp_pred += width;
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/libs/libaom/src/aom_mem/aom_mem.c b/libs/libaom/src/aom_mem/aom_mem.c
new file mode 100644
index 000000000..e977b01d7
--- /dev/null
+++ b/libs/libaom/src/aom_mem/aom_mem.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_mem.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "include/aom_mem_intrnl.h"
+#include "aom/aom_integer.h"
+
+#if defined(AOM_MAX_ALLOCABLE_MEMORY)
+// Returns 0 in case of overflow of nmemb * size.
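+// For example, with the default 8 GB cap on 64-bit targets,
+// check_size_argument_overflow(1, 1 << 30) returns 1, while any nmemb * size
+// product that exceeds AOM_MAX_ALLOCABLE_MEMORY or that truncates when cast
+// to size_t returns 0.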
+static int check_size_argument_overflow(uint64_t nmemb, uint64_t size) {
+  const uint64_t total_size = nmemb * size;
+  if (nmemb == 0) return 1;
+  if (size > AOM_MAX_ALLOCABLE_MEMORY / nmemb) return 0;
+  if (total_size != (size_t)total_size) return 0;
+  return 1;
+}
+#endif
+
+static size_t GetAlignedMallocSize(size_t size, size_t align) {
+  return size + align - 1 + ADDRESS_STORAGE_SIZE;
+}
+
+static size_t *GetMallocAddressLocation(void *const mem) {
+  return ((size_t *)mem) - 1;
+}
+
+static void SetActualMallocAddress(void *const mem,
+                                   const void *const malloc_addr) {
+  size_t *const malloc_addr_location = GetMallocAddressLocation(mem);
+  *malloc_addr_location = (size_t)malloc_addr;
+}
+
+static void *GetActualMallocAddress(void *const mem) {
+  const size_t *const malloc_addr_location = GetMallocAddressLocation(mem);
+  return (void *)(*malloc_addr_location);
+}
+
+void *aom_memalign(size_t align, size_t size) {
+  void *x = NULL;
+  const size_t aligned_size = GetAlignedMallocSize(size, align);
+#if defined(AOM_MAX_ALLOCABLE_MEMORY)
+  if (!check_size_argument_overflow(1, aligned_size)) return NULL;
+#endif
+  void *const addr = malloc(aligned_size);
+  if (addr) {
+    x = aom_align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, align);
+    SetActualMallocAddress(x, addr);
+  }
+  return x;
+}
+
+void *aom_malloc(size_t size) { return aom_memalign(DEFAULT_ALIGNMENT, size); }
+
+void *aom_calloc(size_t num, size_t size) {
+  const size_t total_size = num * size;
+  void *const x = aom_malloc(total_size);
+  if (x) memset(x, 0, total_size);
+  return x;
+}
+
+void aom_free(void *memblk) {
+  if (memblk) {
+    void *addr = GetActualMallocAddress(memblk);
+    free(addr);
+  }
+}
+
+void *aom_memset16(void *dest, int val, size_t length) {
+  size_t i;
+  uint16_t *dest16 = (uint16_t *)dest;
+  for (i = 0; i < length; i++) *dest16++ = val;
+  return dest;
+}
diff --git a/libs/libaom/src/aom_mem/aom_mem.cmake b/libs/libaom/src/aom_mem/aom_mem.cmake
new file mode 100644
index 000000000..346588d2d
--- /dev/null
+++ b/libs/libaom/src/aom_mem/aom_mem.cmake
@@ -0,0 +1,29 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_MEM_AOM_MEM_CMAKE_)
+  return()
+endif() # AOM_AOM_MEM_AOM_MEM_CMAKE_
+set(AOM_AOM_MEM_AOM_MEM_CMAKE_ 1)
+
+list(APPEND AOM_MEM_SOURCES "${AOM_ROOT}/aom_mem/aom_mem.c"
+            "${AOM_ROOT}/aom_mem/aom_mem.h"
+            "${AOM_ROOT}/aom_mem/include/aom_mem_intrnl.h")
+
+# Creates the aom_mem build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_aom_mem_targets)
+  add_library(aom_mem OBJECT ${AOM_MEM_SOURCES})
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_mem PARENT_SCOPE)
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_mem>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_mem>)
+  endif()
+endfunction()
diff --git a/libs/libaom/src/aom_mem/aom_mem.h b/libs/libaom/src/aom_mem/aom_mem.h
new file mode 100644
index 000000000..bc5d8bca3
--- /dev/null
+++ b/libs/libaom/src/aom_mem/aom_mem.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_MEM_AOM_MEM_H_
+#define AOM_AOM_MEM_AOM_MEM_H_
+
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+#if defined(__uClinux__)
+#include <lddk.h>
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#ifndef AOM_MAX_ALLOCABLE_MEMORY
+#if SIZE_MAX > (1ULL << 32)
+#define AOM_MAX_ALLOCABLE_MEMORY 8589934592  // 8 GB
+#else
+// For 32-bit targets keep this below INT_MAX to avoid valgrind warnings.
+#define AOM_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16))
+#endif
+#endif
+
+void *aom_memalign(size_t align, size_t size);
+void *aom_malloc(size_t size);
+void *aom_calloc(size_t num, size_t size);
+void aom_free(void *memblk);
+void *aom_memset16(void *dest, int val, size_t length);
+
+/*returns an addr aligned to the byte boundary specified by align*/
+#define aom_align_addr(addr, align) \
+  (void *)(((uintptr_t)(addr) + ((align)-1)) & ~(uintptr_t)((align)-1))
+
+#include <string.h>
+
+#ifdef AOM_MEM_PLTFRM
+#include AOM_MEM_PLTFRM
+#endif
+
+#if CONFIG_DEBUG
+#define AOM_CHECK_MEM_ERROR(error_info, lval, expr)                         \
+  do {                                                                      \
+    lval = (expr);                                                          \
+    if (!lval)                                                              \
+      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,                   \
+                         "Failed to allocate " #lval " at %s:%d", __FILE__, \
+                         __LINE__);                                         \
+  } while (0)
+#else
+#define AOM_CHECK_MEM_ERROR(error_info, lval, expr)       \
+  do {                                                    \
+    lval = (expr);                                        \
+    if (!lval)                                            \
+      aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, \
+                         "Failed to allocate " #lval);    \
+  } while (0)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif  // AOM_AOM_MEM_AOM_MEM_H_
diff --git a/libs/libaom/src/aom_mem/include/aom_mem_intrnl.h b/libs/libaom/src/aom_mem/include/aom_mem_intrnl.h
new file mode 100644
index 000000000..2c9819de9
--- /dev/null
+++ b/libs/libaom/src/aom_mem/include/aom_mem_intrnl.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
+#define AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
+
+#include "config/aom_config.h"
+
+#define ADDRESS_STORAGE_SIZE sizeof(size_t)
+
+#ifndef DEFAULT_ALIGNMENT
+#if defined(VXWORKS)
+/*default addr alignment to use in calls to aom_* functions other than
+  aom_memalign*/
+#define DEFAULT_ALIGNMENT 32
+#else
+#define DEFAULT_ALIGNMENT (2 * sizeof(void *)) /* NOLINT */
+#endif
+#endif
+
+#endif  // AOM_AOM_MEM_INCLUDE_AOM_MEM_INTRNL_H_
diff --git a/libs/libaom/src/aom_ports/aom_once.h b/libs/libaom/src/aom_ports/aom_once.h
new file mode 100644
index 000000000..d1a031bf1
--- /dev/null
+++ b/libs/libaom/src/aom_ports/aom_once.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_AOM_ONCE_H_
+#define AOM_AOM_PORTS_AOM_ONCE_H_
+
+#include "config/aom_config.h"
+
+/* Implement a function wrapper to guarantee initialization
+ * thread-safety for library singletons.
+ *
+ * NOTE: This function uses static locks, and can only be
+ * used with one common argument per compilation unit. So
+ *
+ * file1.c:
+ *   aom_once(foo);
+ *   ...
+ *   aom_once(foo);
+ *
+ * file2.c:
+ *   aom_once(bar);
+ *
+ * will ensure foo() and bar() are each called only once, but in
+ *
+ * file1.c:
+ *   aom_once(foo);
+ *   aom_once(bar);
+ *
+ * bar() will never be called because the lock is used up
+ * by the call to foo().
+ */
+
+#if CONFIG_MULTITHREAD && defined(_WIN32)
+#include <windows.h>
+/* Declare a per-compilation-unit state variable to track the progress
+ * of calling func() only once. This must be at global scope because
+ * local initializers are not thread-safe in MSVC prior to Visual
+ * Studio 2015.
+ */
+static INIT_ONCE aom_init_once = INIT_ONCE_STATIC_INIT;
+
+static void aom_once(void (*func)(void)) {
+  BOOL pending;
+  InitOnceBeginInitialize(&aom_init_once, 0, &pending, NULL);
+  if (!pending) {
+    // Initialization has already completed.
+    return;
+  }
+  func();
+  InitOnceComplete(&aom_init_once, 0, NULL);
+}
+
+#elif CONFIG_MULTITHREAD && defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>
+static void aom_once(void (*func)(void)) {
+  static int done;
+
+  /* If the initialization is complete, return early. */
+  if (done) return;
+
+  /* Causes all other threads in the process to block themselves
+   * and give up their time slice.
+   */
+  DosEnterCritSec();
+
+  if (!done) {
+    func();
+    done = 1;
+  }
+
+  /* Restores normal thread dispatching for the current process. */
+  DosExitCritSec();
+}
+
+#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
+#include <pthread.h>
+static void aom_once(void (*func)(void)) {
+  static pthread_once_t lock = PTHREAD_ONCE_INIT;
+  pthread_once(&lock, func);
+}
+
+#else
+/* Default version that performs no synchronization. */
+
+static void aom_once(void (*func)(void)) {
+  static int done;
+
+  if (!done) {
+    func();
+    done = 1;
+  }
+}
+#endif
+
+#endif  // AOM_AOM_PORTS_AOM_ONCE_H_
diff --git a/libs/libaom/src/aom_ports/aom_ports.cmake b/libs/libaom/src/aom_ports/aom_ports.cmake
new file mode 100644
index 000000000..d57989654
--- /dev/null
+++ b/libs/libaom/src/aom_ports/aom_ports.cmake
@@ -0,0 +1,92 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_PORTS_AOM_PORTS_CMAKE_)
+  return()
+endif() # AOM_AOM_PORTS_AOM_PORTS_CMAKE_
+set(AOM_AOM_PORTS_AOM_PORTS_CMAKE_ 1)
+
+list(APPEND AOM_PORTS_INCLUDES
+            "${AOM_ROOT}/aom_ports/aom_once.h"
+            "${AOM_ROOT}/aom_ports/aom_timer.h"
+            "${AOM_ROOT}/aom_ports/bitops.h"
+            "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
+            "${AOM_ROOT}/aom_ports/mem.h"
+            "${AOM_ROOT}/aom_ports/mem_ops.h"
+            "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
+            "${AOM_ROOT}/aom_ports/msvc.h"
+            "${AOM_ROOT}/aom_ports/sanitizer.h"
+            "${AOM_ROOT}/aom_ports/system_state.h")
+
+list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/emms.asm")
+
+list(APPEND AOM_PORTS_INCLUDES_X86 "${AOM_ROOT}/aom_ports/x86_abi_support.asm")
+
+list(APPEND AOM_PORTS_SOURCES_ARM "${AOM_ROOT}/aom_ports/arm.h"
+            "${AOM_ROOT}/aom_ports/arm_cpudetect.c")
+
+list(APPEND AOM_PORTS_SOURCES_PPC "${AOM_ROOT}/aom_ports/ppc.h"
+            "${AOM_ROOT}/aom_ports/ppc_cpudetect.c")
+
+# For arm and x86 targets:
+#
+# * Creates the aom_ports build target, adds the includes in aom_ports to the
+#   target, and makes libaom depend on it.
+#
+# Otherwise:
+#
+# * Adds the includes in aom_ports to the libaom target.
+#
+# For all target platforms:
+#
+# * The libaom target must exist before this function is called.
function(setup_aom_ports_targets)
+  if("${AOM_TARGET_CPU}" MATCHES "^x86")
+    add_asm_library("aom_ports" "AOM_PORTS_ASM_X86")
+    set(aom_ports_has_symbols 1)
+  elseif("${AOM_TARGET_CPU}" MATCHES "arm")
+    add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_ARM})
+    set(aom_ports_has_symbols 1)
+  elseif("${AOM_TARGET_CPU}" MATCHES "ppc")
+    add_library(aom_ports OBJECT ${AOM_PORTS_SOURCES_PPC})
+    set(aom_ports_has_symbols 1)
+  endif()
+
+  if("${AOM_TARGET_CPU}" MATCHES "arm|ppc")
+    target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_ports>)
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_ports>)
+    endif()
+  endif()
+
+  if(aom_ports_has_symbols)
+    target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES})
+
+    if("${AOM_TARGET_CPU}" STREQUAL "x86"
+       OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+      target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES_X86})
+    endif()
+
+    set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+  else()
+    target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES})
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE ${AOM_PORTS_INCLUDES})
+    endif()
+
+    if("${AOM_TARGET_CPU}" STREQUAL "x86"
+       OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+      target_sources(aom PRIVATE ${AOM_PORTS_INCLUDES_X86})
+      if(BUILD_SHARED_LIBS)
+        target_sources(aom_static PRIVATE ${AOM_PORTS_INCLUDES_X86})
+      endif()
+    endif()
+  endif()
+endfunction()
diff --git a/libs/libaom/src/aom_ports/aom_timer.h b/libs/libaom/src/aom_ports/aom_timer.h
new file mode 100644
index 000000000..9b17b8983
--- /dev/null
+++ b/libs/libaom/src/aom_ports/aom_timer.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_AOM_TIMER_H_
+#define AOM_AOM_PORTS_AOM_TIMER_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#if CONFIG_OS_SUPPORT
+
+#if defined(_WIN32)
+/*
+ * Win32 specific includes
+ */
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#else
+/*
+ * POSIX specific includes
+ */
+#include <sys/time.h>
+
+/* timersub is not provided by msys at this time. */
+#ifndef timersub
+#define timersub(a, b, result)                       \
+  do {                                               \
+    (result)->tv_sec = (a)->tv_sec - (b)->tv_sec;    \
+    (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \
+    if ((result)->tv_usec < 0) {                     \
+      --(result)->tv_sec;                            \
+      (result)->tv_usec += 1000000;                  \
+    }                                                \
+  } while (0)
+#endif
+#endif
+
+struct aom_usec_timer {
+#if defined(_WIN32)
+  LARGE_INTEGER begin, end;
+#else
+  struct timeval begin, end;
+#endif
+};
+
+static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) {
+#if defined(_WIN32)
+  QueryPerformanceCounter(&t->begin);
+#else
+  gettimeofday(&t->begin, NULL);
+#endif
+}
+
+static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) {
+#if defined(_WIN32)
+  QueryPerformanceCounter(&t->end);
+#else
+  gettimeofday(&t->end, NULL);
+#endif
+}
+
+static INLINE int64_t aom_usec_timer_elapsed(struct aom_usec_timer *t) {
+#if defined(_WIN32)
+  LARGE_INTEGER freq, diff;
+
+  diff.QuadPart = t->end.QuadPart - t->begin.QuadPart;
+
+  QueryPerformanceFrequency(&freq);
+  return diff.QuadPart * 1000000 / freq.QuadPart;
+#else
+  struct timeval diff;
+
+  timersub(&t->end, &t->begin, &diff);
+  return ((int64_t)diff.tv_sec) * 1000000 + diff.tv_usec;
+#endif
+}
+
+#else /* CONFIG_OS_SUPPORT = 0 */
+
+/* Empty timer functions if CONFIG_OS_SUPPORT = 0 */
+#ifndef timersub
+#define timersub(a, b, result)
+#endif
+
+struct aom_usec_timer {
+  void *dummy;
+};
+
+static INLINE void aom_usec_timer_start(struct aom_usec_timer *t) { (void)t; }
+
+static INLINE void aom_usec_timer_mark(struct aom_usec_timer *t) { (void)t; }
+
+static INLINE int aom_usec_timer_elapsed(struct aom_usec_timer *t) {
+  (void)t;
+  return 0;
+}
+
+#endif /* CONFIG_OS_SUPPORT */
+
+#endif  // AOM_AOM_PORTS_AOM_TIMER_H_
diff --git a/libs/libaom/src/aom_ports/arm.h b/libs/libaom/src/aom_ports/arm.h
new file mode 100644
index 000000000..cb1fb9bec
--- /dev/null
+++ b/libs/libaom/src/aom_ports/arm.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_ARM_H_
+#define AOM_AOM_PORTS_ARM_H_
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*ARMv5TE "Enhanced DSP" instructions.*/
+#define HAS_EDSP 0x01
+/*ARMv6 "Parallel" or "Media" instructions.*/
+#define HAS_MEDIA 0x02
+/*ARMv7 optional NEON instructions.*/
+#define HAS_NEON 0x04
+
+int aom_arm_cpu_caps(void);
+
+// Earlier gcc compilers have issues with some neon intrinsics
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 4 && \
+    __GNUC_MINOR__ <= 6
+#define AOM_INCOMPATIBLE_GCC
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_PORTS_ARM_H_
diff --git a/libs/libaom/src/aom_ports/arm_cpudetect.c b/libs/libaom/src/aom_ports/arm_cpudetect.c
new file mode 100644
index 000000000..5a75bb348
--- /dev/null
+++ b/libs/libaom/src/aom_ports/arm_cpudetect.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "aom_ports/arm.h"
+#include "config/aom_config.h"
+
+#ifdef WINAPI_FAMILY
+#include <winapifamily.h>
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define getenv(x) NULL
+#endif
+#endif
+
+static int arm_cpu_env_flags(int *flags) {
+  char *env;
+  env = getenv("AOM_SIMD_CAPS");
+  if (env && *env) {
+    *flags = (int)strtol(env, NULL, 0);
+    return 0;
+  }
+  *flags = 0;
+  return -1;
+}
+
+static int arm_cpu_env_mask(void) {
+  char *env;
+  env = getenv("AOM_SIMD_CAPS_MASK");
+  return env && *env ? (int)strtol(env, NULL, 0) : ~0;
+}
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+int aom_arm_cpu_caps(void) {
+  /* This function should actually be a no-op. There is no way to adjust any of
+   * these because the RTCD tables do not exist: the functions are called
+   * statically */
+  int flags;
+  int mask;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+#if HAVE_NEON
+  flags |= HAS_NEON;
+#endif /* HAVE_NEON */
+  return flags & mask;
+}
+
+#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+#define WIN32_LEAN_AND_MEAN
+#define WIN32_EXTRA_LEAN
+#include <windows.h>
+
+int aom_arm_cpu_caps(void) {
+  int flags;
+  int mask;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+/* MSVC has no inline __asm support for ARM, but it does let you __emit
+ * instructions via their assembled hex code.
+ * All of these instructions should be essentially nops.
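+ * If the CPU does not support an emitted instruction, executing it raises
+ * EXCEPTION_ILLEGAL_INSTRUCTION, which the __except filter below swallows,
+ * leaving the corresponding capability flag unset.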
+ */
+#if HAVE_NEON
+  if (mask & HAS_NEON) {
+    __try {
+      /*VORR q0,q0,q0*/
+      __emit(0xF2200150);
+      flags |= HAS_NEON;
+    } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
+      /*Ignore exception.*/
+    }
+  }
+#endif /* HAVE_NEON */
+  return flags & mask;
+}
+
+#elif defined(__ANDROID__) /* end _MSC_VER */
+#include <cpu-features.h>
+
+int aom_arm_cpu_caps(void) {
+  int flags;
+  int mask;
+  uint64_t features;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+  features = android_getCpuFeatures();
+
+#if HAVE_NEON
+  if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON;
+#endif /* HAVE_NEON */
+  return flags & mask;
+}
+
+#elif defined(__linux__) /* end __ANDROID__ */
+
+#include <stdio.h>
+
+int aom_arm_cpu_caps(void) {
+  FILE *fin;
+  int flags;
+  int mask;
+  if (!arm_cpu_env_flags(&flags)) {
+    return flags;
+  }
+  mask = arm_cpu_env_mask();
+  /* Reading /proc/self/auxv would be easier, but that doesn't work reliably
+   * on Android.
+   * This also means that detection will fail in Scratchbox.
+   */
+  fin = fopen("/proc/cpuinfo", "r");
+  if (fin != NULL) {
+    /* 512 should be enough for anybody (it's even enough for all the flags
+     * that x86 has accumulated... so far).
+     */
+    char buf[512];
+    while (fgets(buf, 511, fin) != NULL) {
+#if HAVE_NEON
+      if (memcmp(buf, "Features", 8) == 0) {
+        char *p;
+        p = strstr(buf, " neon");
+        if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
+          flags |= HAS_NEON;
+        }
+      }
+#endif /* HAVE_NEON */
+    }
+    fclose(fin);
+  }
+  return flags & mask;
+}
+#else /* end __linux__ */
+#error \
+    "--enable-runtime-cpu-detect selected, but no CPU detection method " \
+"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
+#endif
diff --git a/libs/libaom/src/aom_ports/bitops.h b/libs/libaom/src/aom_ports/bitops.h
new file mode 100644
index 000000000..44df17307
--- /dev/null
+++ b/libs/libaom/src/aom_ports/bitops.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_BITOPS_H_
+#define AOM_AOM_PORTS_BITOPS_H_
+
+#include <assert.h>
+
+#include "aom_ports/msvc.h"
+#include "config/aom_config.h"
+
+#ifdef _MSC_VER
+#if defined(_M_X64) || defined(_M_IX86)
+#include <intrin.h>
+#define USE_MSC_INTRINSICS
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// get_msb:
+// Returns (int)floor(log2(n)). n must be > 0.
+// These versions of get_msb() are only valid when n != 0 because all
+// of the optimized versions are undefined when n == 0:
+// https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+
+// Use GNU builtins where available.
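+// For example, get_msb(1) == 0, get_msb(32) == 5 and get_msb(35) == 5.
+// In the GNU version below, 31 ^ __builtin_clz(n) equals 31 - clz(n),
+// since clz(n) always lies in [0, 31] for a nonzero 32-bit argument.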
+#if defined(__GNUC__) && \ + ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) +static INLINE int get_msb(unsigned int n) { + assert(n != 0); + return 31 ^ __builtin_clz(n); +} +#elif defined(USE_MSC_INTRINSICS) +#pragma intrinsic(_BitScanReverse) + +static INLINE int get_msb(unsigned int n) { + unsigned long first_set_bit; + assert(n != 0); + _BitScanReverse(&first_set_bit, n); + return first_set_bit; +} +#undef USE_MSC_INTRINSICS +#else +static INLINE int get_msb(unsigned int n) { + int log = 0; + unsigned int value = n; + int i; + + assert(n != 0); + + for (i = 4; i >= 0; --i) { + const int shift = (1 << i); + const unsigned int x = value >> shift; + if (x != 0) { + value = x; + log += shift; + } + } + return log; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_PORTS_BITOPS_H_ diff --git a/libs/libaom/src/aom_ports/emmintrin_compat.h b/libs/libaom/src/aom_ports/emmintrin_compat.h new file mode 100644 index 000000000..85d218a3d --- /dev/null +++ b/libs/libaom/src/aom_ports/emmintrin_compat.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_ +#define AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_ + +#if defined(__GNUC__) && __GNUC__ < 4 +/* From emmintrin.h (gcc 4.5.3) */ +/* Casts between various SP, DP, INT vector types. Note that these do no + conversion of values, they just change the type. */ +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castpd_ps(__m128d __A) { + return (__m128)__A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castpd_si128(__m128d __A) { + return (__m128i)__A; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castps_pd(__m128 __A) { + return (__m128d)__A; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castps_si128(__m128 __A) { + return (__m128i)__A; +} + +extern __inline __m128 + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castsi128_ps(__m128i __A) { + return (__m128)__A; +} + +extern __inline __m128d + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_castsi128_pd(__m128i __A) { + return (__m128d)__A; +} +#endif + +#endif // AOM_AOM_PORTS_EMMINTRIN_COMPAT_H_ diff --git a/libs/libaom/src/aom_ports/emms.asm b/libs/libaom/src/aom_ports/emms.asm new file mode 100644 index 000000000..90776bacb --- /dev/null +++ b/libs/libaom/src/aom_ports/emms.asm @@ -0,0 +1,41 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +section .text +global sym(aom_reset_mmx_state) PRIVATE +sym(aom_reset_mmx_state): + emms + ret + + +%if LIBAOM_YASM_WIN64 +global sym(aom_winx64_fldcw) PRIVATE +sym(aom_winx64_fldcw): + sub rsp, 8 + mov [rsp], rcx ; win x64 specific + fldcw [rsp] + add rsp, 8 + ret + + +global sym(aom_winx64_fstcw) PRIVATE +sym(aom_winx64_fstcw): + sub rsp, 8 + fstcw [rsp] + mov rax, [rsp] + add rsp, 8 + ret +%endif diff --git a/libs/libaom/src/aom_ports/mem.h b/libs/libaom/src/aom_ports/mem.h new file mode 100644 index 000000000..9e3d42403 --- /dev/null +++ b/libs/libaom/src/aom_ports/mem.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_MEM_H_ +#define AOM_AOM_PORTS_MEM_H_ + +#include "aom/aom_integer.h" +#include "config/aom_config.h" + +#if (defined(__GNUC__) && __GNUC__) || defined(__SUNPRO_C) +#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n))) +#elif defined(_MSC_VER) +#define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val +#else +#warning No alignment directives known for this compiler. +#define DECLARE_ALIGNED(n, typ, val) typ val +#endif + +/* Indicates that the usage of the specified variable has been audited to assure + * that it's safe to use uninitialized. Silences 'may be used uninitialized' + * warnings on gcc. + */ +#if defined(__GNUC__) && __GNUC__ +#define UNINITIALIZED_IS_SAFE(x) x = x +#else +#define UNINITIALIZED_IS_SAFE(x) x +#endif + +#if HAVE_NEON && defined(_MSC_VER) +#define __builtin_prefetch(x) +#endif + +/* Shift down with rounding for use when n >= 0, value >= 0 */ +#define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n)) + +/* Shift down with rounding for signed integers, for use when n >= 0 */ +#define ROUND_POWER_OF_TWO_SIGNED(value, n) \ + (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \ + : ROUND_POWER_OF_TWO((value), (n))) + +/* Shift down with rounding for use when n >= 0, value >= 0 for (64 bit) */ +#define ROUND_POWER_OF_TWO_64(value, n) \ + (((value) + ((((int64_t)1 << (n)) >> 1))) >> (n)) +/* Shift down with rounding for signed integers, for use when n >= 0 (64 bit) */ +#define ROUND_POWER_OF_TWO_SIGNED_64(value, n) \ + (((value) < 0) ? -ROUND_POWER_OF_TWO_64(-(value), (n)) \ + : ROUND_POWER_OF_TWO_64((value), (n))) + +/* shift right or left depending on sign of n */ +#define RIGHT_SIGNED_SHIFT(value, n) \ + ((n) < 0 ? 
((value) << (-(n))) : ((value) >> (n))) + +#define ALIGN_POWER_OF_TWO(value, n) \ + (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) + +#define DIVIDE_AND_ROUND(x, y) (((x) + ((y) >> 1)) / (y)) + +#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1)) +#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1)) + +/*!\brief force enum to be unsigned 1 byte*/ +#define UENUM1BYTE(enumvar) \ + ; \ + typedef uint8_t enumvar + +/*!\brief force enum to be signed 1 byte*/ +#define SENUM1BYTE(enumvar) \ + ; \ + typedef int8_t enumvar + +/*!\brief force enum to be unsigned 2 byte*/ +#define UENUM2BYTE(enumvar) \ + ; \ + typedef uint16_t enumvar + +/*!\brief force enum to be signed 2 byte*/ +#define SENUM2BYTE(enumvar) \ + ; \ + typedef int16_t enumvar + +/*!\brief force enum to be unsigned 4 byte*/ +#define UENUM4BYTE(enumvar) \ + ; \ + typedef uint32_t enumvar + +/*!\brief force enum to be unsigned 4 byte*/ +#define SENUM4BYTE(enumvar) \ + ; \ + typedef int32_t enumvar + +#endif // AOM_AOM_PORTS_MEM_H_ diff --git a/libs/libaom/src/aom_ports/mem_ops.h b/libs/libaom/src/aom_ports/mem_ops.h new file mode 100644 index 000000000..2b5bc0f0f --- /dev/null +++ b/libs/libaom/src/aom_ports/mem_ops.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_MEM_OPS_H_ +#define AOM_AOM_PORTS_MEM_OPS_H_ + +/* \file + * \brief Provides portable memory access primitives + * + * This function provides portable primitives for getting and setting of + * signed and unsigned integers in 16, 24, and 32 bit sizes. The operations + * can be performed on unaligned data regardless of hardware support for + * unaligned accesses. + * + * The type used to pass the integral values may be changed by defining + * MEM_VALUE_T with the appropriate type. The type given must be an integral + * numeric type. + * + * The actual functions instantiated have the MEM_VALUE_T type name pasted + * on to the symbol name. This allows the developer to instantiate these + * operations for multiple types within the same translation unit. This is + * of somewhat questionable utility, but the capability exists nonetheless. + * Users not making use of this functionality should call the functions + * without the type name appended, and the preprocessor will take care of + * it. + * + * NOTE: This code is not supported on platforms where char > 1 octet ATM. 
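+ *
+ * Example (with the default MEM_VALUE_T of int): given unsigned char b[4],
+ * mem_put_be32(b, 0x01020304) stores the bytes 01 02 03 04 in that order,
+ * and mem_get_be32(b) returns 0x01020304 on any host, while mem_get_le32(b)
+ * on the same buffer returns 0x04030201.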
+ */ + +#ifndef MAU_T +/* Minimum Access Unit for this target */ +#define MAU_T unsigned char +#endif + +#ifndef MEM_VALUE_T +#define MEM_VALUE_T int +#endif + +#undef MEM_VALUE_T_SZ_BITS +#define MEM_VALUE_T_SZ_BITS (sizeof(MEM_VALUE_T) << 3) + +#undef mem_ops_wrap_symbol +#define mem_ops_wrap_symbol(fn) mem_ops_wrap_symbol2(fn, MEM_VALUE_T) +#undef mem_ops_wrap_symbol2 +#define mem_ops_wrap_symbol2(fn, typ) mem_ops_wrap_symbol3(fn, typ) +#undef mem_ops_wrap_symbol3 +#define mem_ops_wrap_symbol3(fn, typ) fn##_as_##typ + +/* + * Include aligned access routines + */ +#define INCLUDED_BY_MEM_OPS_H +#include "mem_ops_aligned.h" +#undef INCLUDED_BY_MEM_OPS_H + +#undef mem_get_be16 +#define mem_get_be16 mem_ops_wrap_symbol(mem_get_be16) +static unsigned MEM_VALUE_T mem_get_be16(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[0] << 8; + val |= mem[1]; + return val; +} + +#undef mem_get_be24 +#define mem_get_be24 mem_ops_wrap_symbol(mem_get_be24) +static unsigned MEM_VALUE_T mem_get_be24(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[0] << 16; + val |= mem[1] << 8; + val |= mem[2]; + return val; +} + +#undef mem_get_be32 +#define mem_get_be32 mem_ops_wrap_symbol(mem_get_be32) +static unsigned MEM_VALUE_T mem_get_be32(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = ((unsigned MEM_VALUE_T)mem[0]) << 24; + val |= mem[1] << 16; + val |= mem[2] << 8; + val |= mem[3]; + return val; +} + +#undef mem_get_le16 +#define mem_get_le16 mem_ops_wrap_symbol(mem_get_le16) +static unsigned MEM_VALUE_T mem_get_le16(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[1] << 8; + val |= mem[0]; + return val; +} + +#undef mem_get_le24 +#define mem_get_le24 mem_ops_wrap_symbol(mem_get_le24) +static unsigned MEM_VALUE_T mem_get_le24(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = mem[2] << 16; + val |= mem[1] << 8; + val |= mem[0]; + return val; +} + +#undef mem_get_le32 +#define mem_get_le32 mem_ops_wrap_symbol(mem_get_le32) +static unsigned MEM_VALUE_T mem_get_le32(const void *vmem) { + unsigned MEM_VALUE_T val; + const MAU_T *mem = (const MAU_T *)vmem; + + val = ((unsigned MEM_VALUE_T)mem[3]) << 24; + val |= mem[2] << 16; + val |= mem[1] << 8; + val |= mem[0]; + return val; +} + +#define mem_get_s_generic(end, sz) \ + static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz(const void *vmem) { \ + const MAU_T *mem = (const MAU_T *)vmem; \ + signed MEM_VALUE_T val = mem_get_##end##sz(mem); \ + return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> (MEM_VALUE_T_SZ_BITS - sz); \ + } + +/* clang-format off */ +#undef mem_get_sbe16 +#define mem_get_sbe16 mem_ops_wrap_symbol(mem_get_sbe16) +mem_get_s_generic(be, 16) + +#undef mem_get_sbe24 +#define mem_get_sbe24 mem_ops_wrap_symbol(mem_get_sbe24) +mem_get_s_generic(be, 24) + +#undef mem_get_sbe32 +#define mem_get_sbe32 mem_ops_wrap_symbol(mem_get_sbe32) +mem_get_s_generic(be, 32) + +#undef mem_get_sle16 +#define mem_get_sle16 mem_ops_wrap_symbol(mem_get_sle16) +mem_get_s_generic(le, 16) + +#undef mem_get_sle24 +#define mem_get_sle24 mem_ops_wrap_symbol(mem_get_sle24) +mem_get_s_generic(le, 24) + +#undef mem_get_sle32 +#define mem_get_sle32 mem_ops_wrap_symbol(mem_get_sle32) +mem_get_s_generic(le, 32) + +#undef mem_put_be16 +#define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16) +static AOM_INLINE void mem_put_be16(void 
*vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 8) & 0xff); + mem[1] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_be24 +#define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24) +static AOM_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 16) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_be32 +#define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32) +static AOM_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 24) & 0xff); + mem[1] = (MAU_T)((val >> 16) & 0xff); + mem[2] = (MAU_T)((val >> 8) & 0xff); + mem[3] = (MAU_T)((val >> 0) & 0xff); +} + +#undef mem_put_le16 +#define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16) +static AOM_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); +} + +#undef mem_put_le24 +#define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24) +static AOM_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 16) & 0xff); +} + +#undef mem_put_le32 +#define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32) +static AOM_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { + MAU_T *mem = (MAU_T *)vmem; + + mem[0] = (MAU_T)((val >> 0) & 0xff); + mem[1] = (MAU_T)((val >> 8) & 0xff); + mem[2] = (MAU_T)((val >> 16) & 0xff); + mem[3] = (MAU_T)((val >> 24) & 0xff); +} +/* clang-format on */ +#endif // AOM_AOM_PORTS_MEM_OPS_H_ diff --git a/libs/libaom/src/aom_ports/mem_ops_aligned.h b/libs/libaom/src/aom_ports/mem_ops_aligned.h new file mode 100644 index 000000000..37c367531 --- /dev/null +++ b/libs/libaom/src/aom_ports/mem_ops_aligned.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_ +#define AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_ + +#include "aom/aom_integer.h" + +/* \file + * \brief Provides portable memory access primitives for operating on aligned + * data + * + * This file is split from mem_ops.h for easier maintenance. See mem_ops.h + * for a more detailed description of these primitives. + */ +#ifndef INCLUDED_BY_MEM_OPS_H +#error Include mem_ops.h, not mem_ops_aligned.h directly. +#endif + +/* Architectures that provide instructions for doing this byte swapping + * could redefine these macros. 
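+ *
+ * For example, swap_endian_16(val, 0x1234) leaves val == 0x3412 and
+ * swap_endian_32(val, 0x01020304) leaves val == 0x04030201; a port could
+ * implement the same contract with a single byte-swap instruction.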
+ */ +#define swap_endian_16(val, raw) \ + do { \ + val = (uint16_t)(((raw >> 8) & 0x00ff) | ((raw << 8) & 0xff00)); \ + } while (0) +#define swap_endian_32(val, raw) \ + do { \ + val = ((raw >> 24) & 0x000000ff) | ((raw >> 8) & 0x0000ff00) | \ + ((raw << 8) & 0x00ff0000) | ((raw << 24) & 0xff000000); \ + } while (0) +#define swap_endian_16_se(val, raw) \ + do { \ + swap_endian_16(val, raw); \ + val = ((val << 16) >> 16); \ + } while (0) +#define swap_endian_32_se(val, raw) swap_endian_32(val, raw) + +#define mem_get_ne_aligned_generic(end, sz) \ + static AOM_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ + const void *vmem) { \ + const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ + return *mem; \ + } + +#define mem_get_sne_aligned_generic(end, sz) \ + static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ + const void *vmem) { \ + const int##sz##_t *mem = (const int##sz##_t *)vmem; \ + return *mem; \ + } + +#define mem_get_se_aligned_generic(end, sz) \ + static AOM_INLINE unsigned MEM_VALUE_T mem_get_##end##sz##_aligned( \ + const void *vmem) { \ + const uint##sz##_t *mem = (const uint##sz##_t *)vmem; \ + unsigned MEM_VALUE_T val, raw = *mem; \ + swap_endian_##sz(val, raw); \ + return val; \ + } + +#define mem_get_sse_aligned_generic(end, sz) \ + static AOM_INLINE signed MEM_VALUE_T mem_get_s##end##sz##_aligned( \ + const void *vmem) { \ + const int##sz##_t *mem = (const int##sz##_t *)vmem; \ + unsigned MEM_VALUE_T val, raw = *mem; \ + swap_endian_##sz##_se(val, raw); \ + return val; \ + } + +#define mem_put_ne_aligned_generic(end, sz) \ + static AOM_INLINE void mem_put_##end##sz##_aligned(void *vmem, \ + MEM_VALUE_T val) { \ + uint##sz##_t *mem = (uint##sz##_t *)vmem; \ + *mem = (uint##sz##_t)val; \ + } + +#define mem_put_se_aligned_generic(end, sz) \ + static AOM_INLINE void mem_put_##end##sz##_aligned(void *vmem, \ + MEM_VALUE_T val) { \ + uint##sz##_t *mem = (uint##sz##_t *)vmem, raw; \ + swap_endian_##sz(raw, val); \ + *mem = (uint##sz##_t)raw; \ + } + +#include "config/aom_config.h" + +#if CONFIG_BIG_ENDIAN +#define mem_get_be_aligned_generic(sz) mem_get_ne_aligned_generic(be, sz) +#define mem_get_sbe_aligned_generic(sz) mem_get_sne_aligned_generic(be, sz) +#define mem_get_le_aligned_generic(sz) mem_get_se_aligned_generic(le, sz) +#define mem_get_sle_aligned_generic(sz) mem_get_sse_aligned_generic(le, sz) +#define mem_put_be_aligned_generic(sz) mem_put_ne_aligned_generic(be, sz) +#define mem_put_le_aligned_generic(sz) mem_put_se_aligned_generic(le, sz) +#else +#define mem_get_be_aligned_generic(sz) mem_get_se_aligned_generic(be, sz) +#define mem_get_sbe_aligned_generic(sz) mem_get_sse_aligned_generic(be, sz) +#define mem_get_le_aligned_generic(sz) mem_get_ne_aligned_generic(le, sz) +#define mem_get_sle_aligned_generic(sz) mem_get_sne_aligned_generic(le, sz) +#define mem_put_be_aligned_generic(sz) mem_put_se_aligned_generic(be, sz) +#define mem_put_le_aligned_generic(sz) mem_put_ne_aligned_generic(le, sz) +#endif + +/* clang-format off */ +#undef mem_get_be16_aligned +#define mem_get_be16_aligned mem_ops_wrap_symbol(mem_get_be16_aligned) +mem_get_be_aligned_generic(16) + +#undef mem_get_be32_aligned +#define mem_get_be32_aligned mem_ops_wrap_symbol(mem_get_be32_aligned) +mem_get_be_aligned_generic(32) + +#undef mem_get_le16_aligned +#define mem_get_le16_aligned mem_ops_wrap_symbol(mem_get_le16_aligned) +mem_get_le_aligned_generic(16) + +#undef mem_get_le32_aligned +#define mem_get_le32_aligned mem_ops_wrap_symbol(mem_get_le32_aligned) 
+mem_get_le_aligned_generic(32)
+
+#undef mem_get_sbe16_aligned
+#define mem_get_sbe16_aligned mem_ops_wrap_symbol(mem_get_sbe16_aligned)
+mem_get_sbe_aligned_generic(16)
+
+#undef mem_get_sbe32_aligned
+#define mem_get_sbe32_aligned mem_ops_wrap_symbol(mem_get_sbe32_aligned)
+mem_get_sbe_aligned_generic(32)
+
+#undef mem_get_sle16_aligned
+#define mem_get_sle16_aligned mem_ops_wrap_symbol(mem_get_sle16_aligned)
+mem_get_sle_aligned_generic(16)
+
+#undef mem_get_sle32_aligned
+#define mem_get_sle32_aligned mem_ops_wrap_symbol(mem_get_sle32_aligned)
+mem_get_sle_aligned_generic(32)
+
+#undef mem_put_be16_aligned
+#define mem_put_be16_aligned mem_ops_wrap_symbol(mem_put_be16_aligned)
+mem_put_be_aligned_generic(16)
+
+#undef mem_put_be32_aligned
+#define mem_put_be32_aligned mem_ops_wrap_symbol(mem_put_be32_aligned)
+mem_put_be_aligned_generic(32)
+
+#undef mem_put_le16_aligned
+#define mem_put_le16_aligned mem_ops_wrap_symbol(mem_put_le16_aligned)
+mem_put_le_aligned_generic(16)
+
+#undef mem_put_le32_aligned
+#define mem_put_le32_aligned mem_ops_wrap_symbol(mem_put_le32_aligned)
+mem_put_le_aligned_generic(32)
+
+#undef mem_get_ne_aligned_generic
+#undef mem_get_se_aligned_generic
+#undef mem_get_sne_aligned_generic
+#undef mem_get_sse_aligned_generic
+#undef mem_put_ne_aligned_generic
+#undef mem_put_se_aligned_generic
+#undef swap_endian_16
+#undef swap_endian_32
+#undef swap_endian_16_se
+#undef swap_endian_32_se
+/* clang-format on */
+
+#endif  // AOM_AOM_PORTS_MEM_OPS_ALIGNED_H_
diff --git a/libs/libaom/src/aom_ports/msvc.h b/libs/libaom/src/aom_ports/msvc.h
new file mode 100644
index 000000000..e78e605f2
--- /dev/null
+++ b/libs/libaom/src/aom_ports/msvc.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_MSVC_H_
+#define AOM_AOM_PORTS_MSVC_H_
+#ifdef _MSC_VER
+
+#include "config/aom_config.h"
+
+#if _MSC_VER < 1900  // VS2015 provides snprintf
+#define snprintf _snprintf
+#endif  // _MSC_VER < 1900
+
+#if _MSC_VER < 1800  // VS2013 provides round
+#include <math.h>
+static INLINE double round(double x) {
+  if (x < 0)
+    return ceil(x - 0.5);
+  else
+    return floor(x + 0.5);
+}
+
+static INLINE float roundf(float x) {
+  if (x < 0)
+    return (float)ceil(x - 0.5f);
+  else
+    return (float)floor(x + 0.5f);
+}
+
+static INLINE long lroundf(float x) {
+  if (x < 0)
+    return (long)(x - 0.5f);
+  else
+    return (long)(x + 0.5f);
+}
+#endif  // _MSC_VER < 1800
+
+#if HAVE_AVX
+#include <immintrin.h>
+// Note:
+// The _mm256_insert_epi16 intrinsic is available from vs2017.
+// We define this macro for vs2015 and earlier.
The +// intrinsics used here are in vs2015 document: +// https://msdn.microsoft.com/en-us/library/hh977022.aspx +// Input parameters: +// a: __m256i, +// d: int16_t, +// indx: imm8 (0 - 15) +#if _MSC_VER <= 1900 +#define _mm256_insert_epi16(a, d, indx) \ + _mm256_insertf128_si256( \ + a, \ + _mm_insert_epi16(_mm256_extractf128_si256(a, indx >> 3), d, indx % 8), \ + indx >> 3) + +static INLINE int _mm256_extract_epi32(__m256i a, const int i) { + return a.m256i_i32[i & 7]; +} +static INLINE __m256i _mm256_insert_epi32(__m256i a, int b, const int i) { + __m256i c = a; + c.m256i_i32[i & 7] = b; + return c; +} +#endif // _MSC_VER <= 1900 +#endif // HAVE_AVX +#endif // _MSC_VER +#endif // AOM_AOM_PORTS_MSVC_H_ diff --git a/libs/libaom/src/aom_ports/ppc.h b/libs/libaom/src/aom_ports/ppc.h new file mode 100644 index 000000000..3159bda68 --- /dev/null +++ b/libs/libaom/src/aom_ports/ppc.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_PORTS_PPC_H_ +#define AOM_AOM_PORTS_PPC_H_ +#include + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define HAS_VSX 0x01 + +int ppc_simd_caps(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_PORTS_PPC_H_ diff --git a/libs/libaom/src/aom_ports/ppc_cpudetect.c b/libs/libaom/src/aom_ports/ppc_cpudetect.c new file mode 100644 index 000000000..ce4d5ae23 --- /dev/null +++ b/libs/libaom/src/aom_ports/ppc_cpudetect.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include + +#include "config/aom_config.h" + +#include "aom_ports/ppc.h" + +#if CONFIG_RUNTIME_CPU_DETECT +static int cpu_env_flags(int *flags) { + char *env; + env = getenv("AOM_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 0; + } + *flags = 0; + return -1; +} + +static int cpu_env_mask(void) { + char *env; + env = getenv("AOM_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} + +int ppc_simd_caps(void) { + int flags; + int mask; + int fd; + ssize_t count; + unsigned int i; + uint64_t buf[64]; + + // If AOM_SIMD_CAPS_MASK is set then allow only those capabilities. 
+  if (!cpu_env_flags(&flags)) {
+    return flags;
+  }
+
+  mask = cpu_env_mask();
+
+  fd = open("/proc/self/auxv", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+
+  while ((count = read(fd, buf, sizeof(buf))) > 0) {
+    for (i = 0; i < (count / sizeof(*buf)); i += 2) {
+      if (buf[i] == AT_HWCAP) {
+#if HAVE_VSX
+        if (buf[i + 1] & PPC_FEATURE_HAS_VSX) {
+          flags |= HAS_VSX;
+        }
+#endif  // HAVE_VSX
+        goto out_close;
+      } else if (buf[i] == AT_NULL) {
+        goto out_close;
+      }
+    }
+  }
+out_close:
+  close(fd);
+  return flags & mask;
+}
+#else
+// If there is no RTCD the function pointers are not used and cannot be
+// changed.
+int ppc_simd_caps(void) { return 0; }
+#endif  // CONFIG_RUNTIME_CPU_DETECT
diff --git a/libs/libaom/src/aom_ports/sanitizer.h b/libs/libaom/src/aom_ports/sanitizer.h
new file mode 100644
index 000000000..1dd8eb4cf
--- /dev/null
+++ b/libs/libaom/src/aom_ports/sanitizer.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_SANITIZER_H_
+#define AOM_AOM_PORTS_SANITIZER_H_
+
+// AddressSanitizer support.
+
+// Define AOM_ADDRESS_SANITIZER if AddressSanitizer is used.
+// Clang.
+#if defined(__has_feature)
+#if __has_feature(address_sanitizer)
+#define AOM_ADDRESS_SANITIZER 1
+#endif
+#endif  // defined(__has_feature)
+// GCC.
+#if defined(__SANITIZE_ADDRESS__)
+#define AOM_ADDRESS_SANITIZER 1
+#endif  // defined(__SANITIZE_ADDRESS__)
+
+// Define the macros for AddressSanitizer manual memory poisoning. See
+// https://github.com/google/sanitizers/wiki/AddressSanitizerManualPoisoning.
+#if defined(AOM_ADDRESS_SANITIZER)
+#include <sanitizer/asan_interface.h>
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#endif
+
+#endif  // AOM_AOM_PORTS_SANITIZER_H_
diff --git a/libs/libaom/src/aom_ports/system_state.h b/libs/libaom/src/aom_ports/system_state.h
new file mode 100644
index 000000000..6640839d8
--- /dev/null
+++ b/libs/libaom/src/aom_ports/system_state.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_SYSTEM_STATE_H_
+#define AOM_AOM_PORTS_SYSTEM_STATE_H_
+
+#include "config/aom_config.h"
+
+#if ARCH_X86 || ARCH_X86_64
+void aom_reset_mmx_state(void);
+#define aom_clear_system_state() aom_reset_mmx_state()
+#else
+#define aom_clear_system_state()
+#endif  // ARCH_X86 || ARCH_X86_64
+#endif  // AOM_AOM_PORTS_SYSTEM_STATE_H_
diff --git a/libs/libaom/src/aom_ports/x86.h b/libs/libaom/src/aom_ports/x86.h
new file mode 100644
index 000000000..8c1844871
--- /dev/null
+++ b/libs/libaom/src/aom_ports/x86.h
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_PORTS_X86_H_
+#define AOM_AOM_PORTS_X86_H_
+#include <stdlib.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h> /* For __cpuidex, __rdtsc */
+#endif
+
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  AOM_CPU_UNKNOWN = -1,
+  AOM_CPU_AMD,
+  AOM_CPU_AMD_OLD,
+  AOM_CPU_CENTAUR,
+  AOM_CPU_CYRIX,
+  AOM_CPU_INTEL,
+  AOM_CPU_NEXGEN,
+  AOM_CPU_NSC,
+  AOM_CPU_RISE,
+  AOM_CPU_SIS,
+  AOM_CPU_TRANSMETA,
+  AOM_CPU_TRANSMETA_OLD,
+  AOM_CPU_UMC,
+  AOM_CPU_VIA,
+
+  AOM_CPU_LAST
+} aom_cpu_t;
+
+#if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__)
+#if ARCH_X86_64
+#define cpuid(func, func2, ax, bx, cx, dx)                      \
+  __asm__ __volatile__("cpuid           \n\t"                   \
+                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
+                       : "a"(func), "c"(func2));
+#else
+#define cpuid(func, func2, ax, bx, cx, dx)     \
+  __asm__ __volatile__(                        \
+      "mov %%ebx, %%edi   \n\t"                \
+      "cpuid              \n\t"                \
+      "xchg %%edi, %%ebx  \n\t"                \
+      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
+      : "a"(func), "c"(func2));
+#endif
+#elif defined(__SUNPRO_C) || \
+    defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
+#if ARCH_X86_64
+#define cpuid(func, func2, ax, bx, cx, dx)     \
+  asm volatile(                                \
+      "xchg %rsi, %rbx \n\t"                   \
+      "cpuid \n\t"                             \
+      "movl %ebx, %edi \n\t"                   \
+      "xchg %rsi, %rbx \n\t"                   \
+      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
+      : "a"(func), "c"(func2));
+#else
+#define cpuid(func, func2, ax, bx, cx, dx)     \
+  asm volatile(                                \
+      "pushl %ebx       \n\t"                  \
+      "cpuid            \n\t"                  \
+      "movl %ebx, %edi  \n\t"                  \
+      "popl %ebx        \n\t"                  \
+      : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
+      : "a"(func), "c"(func2));
+#endif
+#else /* end __SUNPRO__ */
+#if ARCH_X86_64
+#if defined(_MSC_VER) && _MSC_VER > 1500
+#define cpuid(func, func2, a, b, c, d) \
+  do {                                 \
+    int regs[4];                       \
+    __cpuidex(regs, func, func2);      \
+    a = regs[0];                       \
+    b = regs[1];                       \
+    c = regs[2];                       \
+    d = regs[3];                       \
+  } while (0)
+#else
+#define cpuid(func, func2, a, b, c, d) \
+  do {                                 \
+    int regs[4];                       \
+    __cpuid(regs, func);               \
+    a = regs[0];                       \
+    b = regs[1];                       \
+    c = regs[2];                       \
+    d = regs[3];                       \
+  } while (0)
+#endif
+#else
+/* clang-format off */
+#define cpuid(func, func2, a, b, c, d) \
+  __asm mov eax, func                  \
+  __asm mov ecx, func2                 \
+  __asm cpuid                          \
+  __asm mov a, eax                     \
+  __asm mov b, ebx                     \
+  __asm mov c, ecx                     \
+  __asm mov d, edx
+#endif
+/* clang-format on */
+#endif /* end others */
+
+// NaCl has no support for xgetbv or the raw opcode.
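+// xgetbv(0) reads XCR0: bit 1 means the OS saves SSE (XMM) state and bit 2
+// means it saves AVX (YMM) state, which is why x86_simd_caps() below requires
+// (xgetbv() & 0x6) == 0x6 before trusting the AVX bits reported by CPUID.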
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static INLINE uint64_t xgetbv(void) {
+  const uint32_t ecx = 0;
+  uint32_t eax, edx;
+  // Use the raw opcode for xgetbv for compatibility with older toolchains.
+  __asm__ volatile(".byte 0x0f, 0x01, 0xd0\n"
+                   : "=a"(eax), "=d"(edx)
+                   : "c"(ecx));
+  return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && defined(_MSC_FULL_VER) && \
+    _MSC_FULL_VER >= 160040219  // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static INLINE uint64_t xgetbv(void) {
+  uint32_t eax_, edx_;
+  __asm {
+    xor ecx, ecx  // ecx = 0
+    // Use the raw opcode for xgetbv for compatibility with older toolchains.
+    __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+    mov eax_, eax
+    mov edx_, edx
+  }
+  return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
+#endif
+
+#if defined(_MSC_VER) && _MSC_VER >= 1700
+#include <winapifamily.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP)
+#define getenv(x) NULL
+#endif
+#endif
+
+#define HAS_MMX 0x01
+#define HAS_SSE 0x02
+#define HAS_SSE2 0x04
+#define HAS_SSE3 0x08
+#define HAS_SSSE3 0x10
+#define HAS_SSE4_1 0x20
+#define HAS_AVX 0x40
+#define HAS_AVX2 0x80
+#define HAS_SSE4_2 0x100
+#ifndef BIT
+#define BIT(n) (1 << n)
+#endif
+
+static INLINE int x86_simd_caps(void) {
+  unsigned int flags = 0;
+  unsigned int mask = ~0;
+  unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
+  char *env;
+  (void)reg_ebx;
+
+  /* See if the CPU capabilities are being overridden by the environment */
+  env = getenv("AOM_SIMD_CAPS");
+
+  if (env && *env) return (int)strtol(env, NULL, 0);
+
+  env = getenv("AOM_SIMD_CAPS_MASK");
+
+  if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
+
+  /* Ensure that the CPUID instruction supports extended features */
+  cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
+
+  if (max_cpuid_val < 1) return 0;
+
+  /* Get the standard feature flags */
+  cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
+  if (reg_edx & BIT(23)) flags |= HAS_MMX;
+
+  if (reg_edx & BIT(25)) flags |= HAS_SSE; /* aka xmm */
+
+  if (reg_edx & BIT(26)) flags |= HAS_SSE2; /* aka wmt */
+
+  if (reg_ecx & BIT(0)) flags |= HAS_SSE3;
+
+  if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
+
+  if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
+
+  if (reg_ecx & BIT(20)) flags |= HAS_SSE4_2;
+
+  // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+  if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+    if ((xgetbv() & 0x6) == 0x6) {
+      flags |= HAS_AVX;
+
+      if (max_cpuid_val >= 7) {
+        /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
+        cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
+        if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+      }
+    }
+  }
+
+  return flags & mask;
+}
+
+// Fine-Grain Measurement Functions
+//
+// If you are timing a small region of code, access the timestamp counter
+// (TSC) via:
+//
+// unsigned int start = x86_tsc_start();
+// ...
+// unsigned int end = x86_tsc_end();
+// unsigned int diff = end - start;
+//
+// The start/end functions introduce a few more instructions than using
+// x86_readtsc directly, but prevent the CPU's out-of-order execution from
+// affecting the measurement (by having earlier/later instructions be evaluated
+// in the time interval). See the white paper, "How to Benchmark Code
+// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by
+// Gabriele Paoloni for more information.
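+//
+// A minimal measurement sketch built on these helpers (function_under_test
+// is a placeholder for the code being timed):
+//
+//   unsigned int best = ~0u;
+//   for (int i = 0; i < 1000; ++i) {
+//     const unsigned int start = x86_tsc_start();
+//     function_under_test();
+//     const unsigned int cycles = x86_tsc_end() - start;
+//     if (cycles < best) best = cycles;
+//   }
+//
+// Keeping the minimum over many iterations filters out interrupts and other
+// scheduling noise, and the unsigned subtraction is wraparound-safe.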
+// +// If you are timing a large function (CPU time > a couple of seconds), use +// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The +// out-of-order leakage that can occur is minimal compared to total runtime. +static INLINE unsigned int x86_readtsc(void) { +#if defined(__GNUC__) && __GNUC__ + unsigned int tsc; + __asm__ __volatile__("rdtsc\n\t" : "=a"(tsc) :); + return tsc; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tsc; + asm volatile("rdtsc\n\t" : "=a"(tsc) :); + return tsc; +#else +#if ARCH_X86_64 + return (unsigned int)__rdtsc(); +#else + __asm rdtsc; +#endif +#endif +} +// 64-bit CPU cycle counter +static INLINE uint64_t x86_readtsc64(void) { +#if defined(__GNUC__) && __GNUC__ + uint32_t hi, lo; + __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + uint_t hi, lo; + asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +#else +#if ARCH_X86_64 + return (uint64_t)__rdtsc(); +#else + __asm rdtsc; +#endif +#endif +} + +// 32-bit CPU cycle counter with a partial fence against out-of-order execution. +static INLINE unsigned int x86_readtscp(void) { +#if defined(__GNUC__) && __GNUC__ + unsigned int tscp; + __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tscp; + asm volatile("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(_MSC_VER) + unsigned int ui; + return (unsigned int)__rdtscp(&ui); +#else +#if ARCH_X86_64 + return (unsigned int)__rdtscp(); +#else + __asm rdtscp; +#endif +#endif +} + +static INLINE unsigned int x86_tsc_start(void) { + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + return x86_readtsc(); +} + +static INLINE unsigned int x86_tsc_end(void) { + uint32_t v = x86_readtscp(); + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + return v; +} + +#if defined(__GNUC__) && __GNUC__ +#define x86_pause_hint() __asm__ __volatile__("pause \n\t") +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +#define x86_pause_hint() asm volatile("pause \n\t") +#else +#if ARCH_X86_64 +#define x86_pause_hint() _mm_pause(); +#else +#define x86_pause_hint() __asm pause +#endif +#endif + +#if defined(__GNUC__) && __GNUC__ +static void x87_set_control_word(unsigned short mode) { + __asm__ __volatile__("fldcw %0" : : "m"(*&mode)); +} +static unsigned short x87_get_control_word(void) { + unsigned short mode; + __asm__ __volatile__("fstcw %0\n\t" : "=m"(*&mode) :); + return mode; +} +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) +static void x87_set_control_word(unsigned short mode) { + asm volatile("fldcw %0" : : "m"(*&mode)); +} +static unsigned short x87_get_control_word(void) { + unsigned short mode; + asm volatile("fstcw %0\n\t" : "=m"(*&mode) :); + return mode; +} +#elif ARCH_X86_64 +/* No fldcw intrinsics on Windows x64, punt to external asm */ +extern void aom_winx64_fldcw(unsigned short mode); +extern unsigned short aom_winx64_fstcw(void); +#define x87_set_control_word aom_winx64_fldcw +#define x87_get_control_word aom_winx64_fstcw +#else +static void x87_set_control_word(unsigned short mode) { + __asm { fldcw mode } +} +static unsigned short x87_get_control_word(void) { + unsigned short mode; + __asm { fstcw mode } + return mode; +} +#endif + +static INLINE unsigned int x87_set_double_precision(void) { + unsigned int mode = 
x87_get_control_word(); + x87_set_control_word((mode & ~0x300) | 0x200); + return mode; +} + +extern void aom_reset_mmx_state(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_PORTS_X86_H_ diff --git a/libs/libaom/src/aom_ports/x86_abi_support.asm b/libs/libaom/src/aom_ports/x86_abi_support.asm new file mode 100644 index 000000000..64489908f --- /dev/null +++ b/libs/libaom/src/aom_ports/x86_abi_support.asm @@ -0,0 +1,402 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "config/aom_config.asm" + +; 32/64 bit compatibility macros +; +; In general, we make the source use 64 bit syntax, then twiddle with it using +; the preprocessor to get the 32 bit syntax on 32 bit platforms. +; +%ifidn __OUTPUT_FORMAT__,elf32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,macho32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,win32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,aout +%define ABI_IS_32BIT 1 +%else +%define ABI_IS_32BIT 0 +%endif + +%if ABI_IS_32BIT +%define rax eax +%define rbx ebx +%define rcx ecx +%define rdx edx +%define rsi esi +%define rdi edi +%define rsp esp +%define rbp ebp +%define movsxd mov +%macro movq 2 + %ifidn %1,eax + movd %1,%2 + %elifidn %2,eax + movd %1,%2 + %elifidn %1,ebx + movd %1,%2 + %elifidn %2,ebx + movd %1,%2 + %elifidn %1,ecx + movd %1,%2 + %elifidn %2,ecx + movd %1,%2 + %elifidn %1,edx + movd %1,%2 + %elifidn %2,edx + movd %1,%2 + %elifidn %1,esi + movd %1,%2 + %elifidn %2,esi + movd %1,%2 + %elifidn %1,edi + movd %1,%2 + %elifidn %2,edi + movd %1,%2 + %elifidn %1,esp + movd %1,%2 + %elifidn %2,esp + movd %1,%2 + %elifidn %1,ebp + movd %1,%2 + %elifidn %2,ebp + movd %1,%2 + %else + movq %1,%2 + %endif +%endmacro +%endif + + +; LIBAOM_YASM_WIN64 +; Set LIBAOM_YASM_WIN64 if output is Windows 64bit so the code will work if x64 +; or win64 is defined on the Yasm command line. +%ifidn __OUTPUT_FORMAT__,win64 +%define LIBAOM_YASM_WIN64 1 +%elifidn __OUTPUT_FORMAT__,x64 +%define LIBAOM_YASM_WIN64 1 +%else +%define LIBAOM_YASM_WIN64 0 +%endif + +; sym() +; Return the proper symbol name for the target ABI. +; +; Certain ABIs, notably MS COFF and Darwin MACH-O, require that symbols +; with C linkage be prefixed with an underscore. +; +%ifidn __OUTPUT_FORMAT__,elf32 +%define sym(x) x +%elifidn __OUTPUT_FORMAT__,elf64 +%define sym(x) x +%elifidn __OUTPUT_FORMAT__,elfx32 +%define sym(x) x +%elif LIBAOM_YASM_WIN64 +%define sym(x) x +%else +%define sym(x) _ %+ x +%endif + +; PRIVATE +; Macro for the attribute to hide a global symbol for the target ABI. +; This is only active if CHROMIUM is defined. +; +; Chromium doesn't like exported global symbols due to symbol clashing with +; plugins among other things. 
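+;
+; For example, with CHROMIUM defined an ELF build expands
+;   global sym(aom_reset_mmx_state) PRIVATE
+; to "global aom_reset_mmx_state:hidden", giving the symbol hidden
+; visibility so it stays out of the shared library's dynamic symbol table.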
+; +; Requires Chromium's patched copy of yasm: +; http://src.chromium.org/viewvc/chrome?view=rev&revision=73761 +; http://www.tortall.net/projects/yasm/ticket/236 +; +%ifdef CHROMIUM + %ifdef __NASM_VER__ + %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 + ; nasm < 2.14 does not support :private_extern directive + %fatal Must use nasm 2.14 or newer + %endif + %endif + + %ifidn __OUTPUT_FORMAT__,elf32 + %define PRIVATE :hidden + %elifidn __OUTPUT_FORMAT__,elf64 + %define PRIVATE :hidden + %elifidn __OUTPUT_FORMAT__,elfx32 + %define PRIVATE :hidden + %elif LIBAOM_YASM_WIN64 + %define PRIVATE + %else + %define PRIVATE :private_extern + %endif +%else + %define PRIVATE +%endif + +; arg() +; Return the address specification of the given argument +; +%if ABI_IS_32BIT + %define arg(x) [ebp+8+4*x] +%else + ; 64 bit ABI passes arguments in registers. This is a workaround to get up + ; and running quickly. Relies on SHADOW_ARGS_TO_STACK + %if LIBAOM_YASM_WIN64 + %define arg(x) [rbp+16+8*x] + %else + %define arg(x) [rbp-8-8*x] + %endif +%endif + +; REG_SZ_BYTES, REG_SZ_BITS +; Size of a register +%if ABI_IS_32BIT +%define REG_SZ_BYTES 4 +%define REG_SZ_BITS 32 +%else +%define REG_SZ_BYTES 8 +%define REG_SZ_BITS 64 +%endif + + +; ALIGN_STACK +; This macro aligns the stack to the given alignment (in bytes). The stack +; is left such that the previous value of the stack pointer is the first +; argument on the stack (ie, the inverse of this macro is 'pop rsp.') +; This macro uses one temporary register, which is not preserved, and thus +; must be specified as an argument. +%macro ALIGN_STACK 2 + mov %2, rsp + and rsp, -%1 + lea rsp, [rsp - (%1 - REG_SZ_BYTES)] + push %2 +%endmacro + + +; +; The Microsoft assembler tries to impose a certain amount of type safety in +; its register usage. YASM doesn't recognize these directives, so we just +; %define them away to maintain as much compatibility as possible with the +; original inline assembler we're porting from. 
+; +%idefine PTR +%idefine XMMWORD +%idefine MMWORD + +; PIC macros +; +%if ABI_IS_32BIT + %if CONFIG_PIC=1 + %ifidn __OUTPUT_FORMAT__,elf32 + %define WRT_PLT wrt ..plt + %macro GET_GOT 1 + extern _GLOBAL_OFFSET_TABLE_ + push %1 + call %%get_got + %%sub_offset: + jmp %%exitGG + %%get_got: + mov %1, [esp] + add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc + ret + %%exitGG: + %undef GLOBAL + %define GLOBAL(x) x + %1 wrt ..gotoff + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %elifidn __OUTPUT_FORMAT__,macho32 + %macro GET_GOT 1 + push %1 + call %%get_got + %%get_got: + pop %1 + %undef GLOBAL + %define GLOBAL(x) x + %1 - %%get_got + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %endif + %endif + + %ifdef CHROMIUM + %ifidn __OUTPUT_FORMAT__,macho32 + %define HIDDEN_DATA(x) x:private_extern + %else + %define HIDDEN_DATA(x) x + %endif + %else + %define HIDDEN_DATA(x) x + %endif +%else + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) rel x + %ifidn __OUTPUT_FORMAT__,elf64 + %define WRT_PLT wrt ..plt + %define HIDDEN_DATA(x) x:data hidden + %elifidn __OUTPUT_FORMAT__,elfx32 + %define WRT_PLT wrt ..plt + %define HIDDEN_DATA(x) x:data hidden + %elifidn __OUTPUT_FORMAT__,macho64 + %ifdef CHROMIUM + %define HIDDEN_DATA(x) x:private_extern + %else + %define HIDDEN_DATA(x) x + %endif + %else + %define HIDDEN_DATA(x) x + %endif +%endif +%ifnmacro GET_GOT + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) x +%endif +%ifndef RESTORE_GOT +%define RESTORE_GOT +%endif +%ifndef WRT_PLT +%define WRT_PLT +%endif + +%if ABI_IS_32BIT + %macro SHADOW_ARGS_TO_STACK 1 + %endm + %define UNSHADOW_ARGS +%else +%if LIBAOM_YASM_WIN64 + %macro SHADOW_ARGS_TO_STACK 1 ; argc + %if %1 > 0 + mov arg(0),rcx + %endif + %if %1 > 1 + mov arg(1),rdx + %endif + %if %1 > 2 + mov arg(2),r8 + %endif + %if %1 > 3 + mov arg(3),r9 + %endif + %endm +%else + %macro SHADOW_ARGS_TO_STACK 1 ; argc + %if %1 > 0 + push rdi + %endif + %if %1 > 1 + push rsi + %endif + %if %1 > 2 + push rdx + %endif + %if %1 > 3 + push rcx + %endif + %if %1 > 4 + push r8 + %endif + %if %1 > 5 + push r9 + %endif + %if %1 > 6 + %assign i %1-6 + %assign off 16 + %rep i + mov rax,[rbp+off] + push rax + %assign off off+8 + %endrep + %endif + %endm +%endif + %define UNSHADOW_ARGS mov rsp, rbp +%endif + +; Win64 ABI requires that XMM6:XMM15 are callee saved +; SAVE_XMM n, [u] +; store registers 6-n on the stack +; if u is specified, use unaligned movs. +; Win64 ABI requires 16 byte stack alignment, but then pushes an 8 byte return +; value. Typically we follow this up with 'push rbp' - re-aligning the stack - +; but in some cases this is not done and unaligned movs must be used. +%if LIBAOM_YASM_WIN64 +%macro SAVE_XMM 1-2 a + %if %1 < 6 + %error Only xmm registers 6-15 must be preserved + %else + %assign last_xmm %1 + %define movxmm movdq %+ %2 + %assign xmm_stack_space ((last_xmm - 5) * 16) + sub rsp, xmm_stack_space + %assign i 6 + %rep (last_xmm - 5) + movxmm [rsp + ((i - 6) * 16)], xmm %+ i + %assign i i+1 + %endrep + %endif +%endmacro +%macro RESTORE_XMM 0 + %ifndef last_xmm + %error RESTORE_XMM must be paired with SAVE_XMM n + %else + %assign i last_xmm + %rep (last_xmm - 5) + movxmm xmm %+ i, [rsp +((i - 6) * 16)] + %assign i i-1 + %endrep + add rsp, xmm_stack_space + ; there are a couple functions which return from multiple places. 
+    ; otherwise, we could uncomment these:
+    ; %undef last_xmm
+    ; %undef xmm_stack_space
+    ; %undef movxmm
+  %endif
+%endmacro
+%else
+%macro SAVE_XMM 1-2
+%endmacro
+%macro RESTORE_XMM 0
+%endmacro
+%endif
+
+; Name of the rodata section
+;
+; .rodata seems to be an elf-ism, as it doesn't work on OSX.
+;
+%ifidn __OUTPUT_FORMAT__,macho64
+%define SECTION_RODATA section .text
+%elifidn __OUTPUT_FORMAT__,macho32
+%macro SECTION_RODATA 0
+section .text
+%endmacro
+%elifidn __OUTPUT_FORMAT__,aout
+%define SECTION_RODATA section .data
+%else
+%define SECTION_RODATA section .rodata
+%endif
+
+
+; Tell GNU ld that we don't require an executable stack.
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%elifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%elifidn __OUTPUT_FORMAT__,elfx32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
diff --git a/libs/libaom/src/aom_scale/aom_scale.cmake b/libs/libaom/src/aom_scale/aom_scale.cmake
new file mode 100644
index 000000000..e83299320
--- /dev/null
+++ b/libs/libaom/src/aom_scale/aom_scale.cmake
@@ -0,0 +1,45 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_SCALE_AOM_SCALE_CMAKE_)
+  return()
+endif() # AOM_AOM_SCALE_AOM_SCALE_CMAKE_
+set(AOM_AOM_SCALE_AOM_SCALE_CMAKE_ 1)
+
+list(APPEND AOM_SCALE_SOURCES "${AOM_ROOT}/aom_scale/aom_scale.h"
+            "${AOM_ROOT}/aom_scale/generic/aom_scale.c"
+            "${AOM_ROOT}/aom_scale/generic/gen_scalers.c"
+            "${AOM_ROOT}/aom_scale/generic/yv12config.c"
+            "${AOM_ROOT}/aom_scale/generic/yv12extend.c"
+            "${AOM_ROOT}/aom_scale/yv12config.h")
+
+list(APPEND AOM_SCALE_INTRIN_DSPR2
+            "${AOM_ROOT}/aom_scale/mips/dspr2/yv12extend_dspr2.c")
+
+# Creates the aom_scale build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_aom_scale_targets)
+  add_library(aom_scale OBJECT ${AOM_SCALE_SOURCES})
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
+
+  if(HAVE_DSPR2)
+    add_intrinsics_object_library("" "dspr2" "aom_scale"
+                                  "AOM_SCALE_INTRIN_DSPR2")
+  endif()
+
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_scale>)
+  endif()
+
+  # Pass the new lib targets up to the parent scope instance of
+  # $AOM_LIB_TARGETS.
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_scale PARENT_SCOPE)
+endfunction()
diff --git a/libs/libaom/src/aom_scale/aom_scale.h b/libs/libaom/src/aom_scale/aom_scale.h
new file mode 100644
index 000000000..11812a145
--- /dev/null
+++ b/libs/libaom/src/aom_scale/aom_scale.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_SCALE_AOM_SCALE_H_ +#define AOM_AOM_SCALE_AOM_SCALE_H_ + +#include "aom_scale/yv12config.h" + +extern void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + unsigned char *temp_area, unsigned char temp_height, + unsigned int hscale, unsigned int hratio, + unsigned int vscale, unsigned int vratio, + unsigned int interlaced, const int num_planes); + +#endif // AOM_AOM_SCALE_AOM_SCALE_H_ diff --git a/libs/libaom/src/aom_scale/aom_scale_rtcd.c b/libs/libaom/src/aom_scale/aom_scale_rtcd.c new file mode 100644 index 000000000..a04e053b0 --- /dev/null +++ b/libs/libaom/src/aom_scale/aom_scale_rtcd.c @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "config/aom_config.h" + +#define RTCD_C +#include "config/aom_scale_rtcd.h" + +#include "aom_ports/aom_once.h" + +void aom_scale_rtcd() { aom_once(setup_rtcd_internal); } diff --git a/libs/libaom/src/aom_scale/aom_scale_rtcd.pl b/libs/libaom/src/aom_scale/aom_scale_rtcd.pl new file mode 100644 index 000000000..eef6f16a7 --- /dev/null +++ b/libs/libaom/src/aom_scale/aom_scale_rtcd.pl @@ -0,0 +1,55 @@ +## +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +sub aom_scale_forward_decls() { +print <> 4); + source += source_step; + dest += dest_step; + } +} + +/**************************************************************************** + * + * ROUTINE : scale1d_2t1_ps + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_step : Number of pixels to step on + * in source. + * unsigned int source_scale : Scale for source (UNUSED). + * unsigned int source_length : Length of source (UNUSED). + * unsigned char *dest : Pointer to output data array. + * int dest_step : Number of pixels to step on + * in destination. + * unsigned int dest_scale : Scale for destination + * (UNUSED). + * unsigned int dest_length : Length of destination. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-to-1 point subsampled scaling. + * + * SPECIAL NOTES : None. 
+ * + ****************************************************************************/ +static void scale1d_2t1_ps(const unsigned char *source, int source_step, + unsigned int source_scale, + unsigned int source_length, unsigned char *dest, + int dest_step, unsigned int dest_scale, + unsigned int dest_length) { + const unsigned char *const dest_end = dest + dest_length * dest_step; + (void)source_length; + (void)source_scale; + (void)dest_scale; + + source_step *= 2; // Every other row. + + while (dest < dest_end) { + *dest = *source; + source += source_step; + dest += dest_step; + } +} +/**************************************************************************** + * + * ROUTINE : scale1d_c + * + * INPUTS : const unsigned char *source : Pointer to data to be scaled. + * int source_step : Number of pixels to step on + * in source. + * unsigned int source_scale : Scale for source. + * unsigned int source_length : Length of source (UNUSED). + * unsigned char *dest : Pointer to output data array. + * int dest_step : Number of pixels to step on + * in destination. + * unsigned int dest_scale : Scale for destination. + * unsigned int dest_length : Length of destination. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs linear interpolation in one dimension. + * + * SPECIAL NOTES : None. + * + ****************************************************************************/ +static void scale1d_c(const unsigned char *source, int source_step, + unsigned int source_scale, unsigned int source_length, + unsigned char *dest, int dest_step, + unsigned int dest_scale, unsigned int dest_length) { + const unsigned char *const dest_end = dest + dest_length * dest_step; + const unsigned int round_value = dest_scale / 2; + unsigned int left_modifier = dest_scale; + unsigned int right_modifier = 0; + unsigned char left_pixel = source[0]; + unsigned char right_pixel = source[source_step]; + + (void)source_length; + + /* These asserts are needed if there are boundary issues... */ + /* assert ( dest_scale > source_scale );*/ + /* assert ( (source_length - 1) * dest_scale >= (dest_length - 1) * + * source_scale);*/ + + while (dest < dest_end) { + *dest = (unsigned char)((left_modifier * left_pixel + + right_modifier * right_pixel + round_value) / + dest_scale); + + right_modifier += source_scale; + + while (right_modifier > dest_scale) { + right_modifier -= dest_scale; + source += source_step; + left_pixel = source[0]; + right_pixel = source[source_step]; + } + + left_modifier = dest_scale - right_modifier; + } +} + +/**************************************************************************** + * + * ROUTINE : Scale2D + * + * INPUTS : const unsigned char *source : Pointer to data to be + * scaled. + * int source_pitch : Stride of source image. + * unsigned int source_width : Width of input image. + * unsigned int source_height : Height of input image. + * unsigned char *dest : Pointer to output data + * array. + * int dest_pitch : Stride of destination + * image. + * unsigned int dest_width : Width of destination image. + * unsigned int dest_height : Height of destination + * image. + * unsigned char *temp_area : Pointer to temp work area. + * unsigned char temp_area_height : Height of temp work area. + * unsigned int hscale : Horizontal scale factor + * numerator. + * unsigned int hratio : Horizontal scale factor + * denominator. + * unsigned int vscale : Vertical scale factor + * numerator. + * unsigned int vratio : Vertical scale factor + * denominator. 
+ * unsigned int interlaced : Interlace flag. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs 2-tap linear interpolation in two dimensions. + * + * SPECIAL NOTES : Expansion is performed one band at a time to help with + * caching. + * + ****************************************************************************/ +static void Scale2D( + /*const*/ + unsigned char *source, int source_pitch, unsigned int source_width, + unsigned int source_height, unsigned char *dest, int dest_pitch, + unsigned int dest_width, unsigned int dest_height, unsigned char *temp_area, + unsigned char temp_area_height, unsigned int hscale, unsigned int hratio, + unsigned int vscale, unsigned int vratio, unsigned int interlaced) { + unsigned int i, j, k; + unsigned int bands; + unsigned int dest_band_height; + unsigned int source_band_height; + + typedef void (*Scale1D)(const unsigned char *source, int source_step, + unsigned int source_scale, unsigned int source_length, + unsigned char *dest, int dest_step, + unsigned int dest_scale, unsigned int dest_length); + + Scale1D Scale1Dv = scale1d_c; + Scale1D Scale1Dh = scale1d_c; + + void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, + unsigned int) = NULL; + void (*vert_band_scale)(unsigned char *, int, unsigned char *, int, + unsigned int) = NULL; + + int ratio_scalable = 1; + int interpolation = 0; + + unsigned char *source_base; + unsigned char *line_src; + + source_base = (unsigned char *)source; + + if (source_pitch < 0) { + int offset; + + offset = (source_height - 1); + offset *= source_pitch; + + source_base += offset; + } + + /* find out the ratio for each direction */ + switch (hratio * 10 / hscale) { + case 8: + /* 4-5 Scale in Width direction */ + horiz_line_scale = aom_horizontal_line_5_4_scale; + break; + case 6: + /* 3-5 Scale in Width direction */ + horiz_line_scale = aom_horizontal_line_5_3_scale; + break; + case 5: + /* 1-2 Scale in Width direction */ + horiz_line_scale = aom_horizontal_line_2_1_scale; + break; + default: + /* The ratio is not acceptable now */ + /* throw("The ratio is not acceptable for now!"); */ + ratio_scalable = 0; + break; + } + + switch (vratio * 10 / vscale) { + case 8: + /* 4-5 Scale in vertical direction */ + vert_band_scale = aom_vertical_band_5_4_scale; + source_band_height = 5; + dest_band_height = 4; + break; + case 6: + /* 3-5 Scale in vertical direction */ + vert_band_scale = aom_vertical_band_5_3_scale; + source_band_height = 5; + dest_band_height = 3; + break; + case 5: + /* 1-2 Scale in vertical direction */ + + if (interlaced) { + /* if the content is interlaced, point sampling is used */ + vert_band_scale = aom_vertical_band_2_1_scale; + } else { + interpolation = 1; + /* if the content is progressive, interplo */ + vert_band_scale = aom_vertical_band_2_1_scale_i; + } + + source_band_height = 2; + dest_band_height = 1; + break; + default: + /* The ratio is not acceptable now */ + /* throw("The ratio is not acceptable for now!"); */ + ratio_scalable = 0; + break; + } + + if (ratio_scalable) { + if (source_height == dest_height) { + /* for each band of the image */ + for (k = 0; k < dest_height; ++k) { + horiz_line_scale(source, source_width, dest, dest_width); + source += source_pitch; + dest += dest_pitch; + } + + return; + } + + if (interpolation) { + if (source < source_base) source = source_base; + + horiz_line_scale(source, source_width, temp_area, dest_width); + } + + for (k = 0; k < (dest_height + dest_band_height - 1) / dest_band_height; + ++k) { + /* 
+      for (i = 0; i < source_band_height; ++i) {
+        /* Trap case where we could read off the base of the source buffer */
+
+        line_src = source + i * source_pitch;
+
+        if (line_src < source_base) line_src = source_base;
+
+        horiz_line_scale(line_src, source_width,
+                         temp_area + (i + 1) * dest_pitch, dest_width);
+      }
+
+      /* Vertical scaling is in place */
+      vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch,
+                      dest_width);
+
+      if (interpolation)
+        memcpy(temp_area, temp_area + source_band_height * dest_pitch,
+               dest_width);
+
+      /* Next band... */
+      source += (unsigned long)source_band_height * source_pitch;
+      dest += (unsigned long)dest_band_height * dest_pitch;
+    }
+
+    return;
+  }
+
+  if (hscale == 2 && hratio == 1) Scale1Dh = scale1d_2t1_ps;
+
+  if (vscale == 2 && vratio == 1) {
+    if (interlaced)
+      Scale1Dv = scale1d_2t1_ps;
+    else
+      Scale1Dv = scale1d_2t1_i;
+  }
+
+  if (source_height == dest_height) {
+    /* for each band of the image */
+    for (k = 0; k < dest_height; ++k) {
+      Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio,
+               dest_width);
+      source += source_pitch;
+      dest += dest_pitch;
+    }
+
+    return;
+  }
+
+  if (dest_height > source_height) {
+    dest_band_height = temp_area_height - 1;
+    source_band_height = dest_band_height * source_height / dest_height;
+  } else {
+    source_band_height = temp_area_height - 1;
+    dest_band_height = source_band_height * vratio / vscale;
+  }
+
+  /* first row needs to be done so that we can stay one row ahead for vertical
+   * zoom */
+  Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio,
+           dest_width);
+
+  /* for each band of the image */
+  bands = (dest_height + dest_band_height - 1) / dest_band_height;
+
+  for (k = 0; k < bands; ++k) {
+    /* scale one band horizontally */
+    for (i = 1; i < source_band_height + 1; ++i) {
+      if (k * source_band_height + i < source_height) {
+        Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
+                 temp_area + i * dest_pitch, 1, hratio, dest_width);
+      } else { /* Duplicate the last row */
+        /* out of source rows, so copy the previous temp_area row over this
+         * one */
+        memcpy(temp_area + i * dest_pitch, temp_area + (i - 1) * dest_pitch,
+               dest_pitch);
+      }
+    }
+
+    /* scale one band vertically */
+    for (j = 0; j < dest_width; ++j) {
+      Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
+               &dest[j], dest_pitch, vratio, dest_band_height);
+    }
+
+    /* copy temp_area row 0 over from last row in the past */
+    memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+
+    /* move to the next band */
+    source += source_band_height * source_pitch;
+    dest += dest_band_height * dest_pitch;
+  }
+}
+
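+/* Editor's note: a worked example of the ratio dispatch in Scale2D above.
+ * Both switches key on (ratio * 10 / scale), so a 5-to-4 downscale passes
+ * hscale = 5, hratio = 4, and 4 * 10 / 5 == 8 selects
+ * aom_horizontal_line_5_4_scale; likewise 3 * 10 / 5 == 6 picks the 5-to-3
+ * path and 1 * 10 / 2 == 5 picks the 2-to-1 path.
+ */
+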
+/****************************************************************************
+ *
+ *  ROUTINE       : aom_scale_frame
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src        : Pointer to frame to be
+ *                                                   scaled.
+ *                  YV12_BUFFER_CONFIG *dst        : Pointer to buffer to hold
+ *                                                   scaled frame.
+ *                  unsigned char *temp_area       : Pointer to temp work area.
+ *                  unsigned char temp_area_height : Height of temp work area.
+ *                  unsigned int hscale            : Horizontal scale factor
+ *                                                   numerator.
+ *                  unsigned int hratio            : Horizontal scale factor
+ *                                                   denominator.
+ *                  unsigned int vscale            : Vertical scale factor
+ *                                                   numerator.
+ *                  unsigned int vratio            : Vertical scale factor
+ *                                                   denominator.
+ *                  unsigned int interlaced        : Interlace flag.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
+ *
+ *  SPECIAL NOTES : Expansion is performed one band at a time to help with
+ *                  caching.
+ *
+ ****************************************************************************/
+void aom_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
+                     unsigned char *temp_area, unsigned char temp_height,
+                     unsigned int hscale, unsigned int hratio,
+                     unsigned int vscale, unsigned int vratio,
+                     unsigned int interlaced, const int num_planes) {
+  const int dw = (hscale - 1 + src->y_width * hratio) / hscale;
+  const int dh = (vscale - 1 + src->y_height * vratio) / vscale;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int is_uv = plane > 0;
+    const int plane_dw = dw >> is_uv;
+    const int plane_dh = dh >> is_uv;
+
+    Scale2D((unsigned char *)src->buffers[plane], src->strides[is_uv],
+            src->widths[is_uv], src->heights[is_uv],
+            (unsigned char *)dst->buffers[plane], dst->strides[is_uv], plane_dw,
+            plane_dh, temp_area, temp_height, hscale, hratio, vscale, vratio,
+            interlaced);
+
+    if (plane_dw < dst->widths[is_uv])
+      for (int i = 0; i < plane_dh; ++i)
+        memset(dst->buffers[plane] + i * dst->strides[is_uv] + plane_dw - 1,
+               dst->buffers[plane][i * dst->strides[is_uv] + plane_dw - 2],
+               dst->widths[is_uv] - plane_dw + 1);
+
+    if (plane_dh < dst->heights[is_uv])
+      for (int i = plane_dh - 1; i < dst->heights[is_uv]; ++i)
+        memcpy(dst->buffers[plane] + i * dst->strides[is_uv],
+               dst->buffers[plane] + (plane_dh - 2) * dst->strides[is_uv],
+               dst->widths[is_uv] + 1);
+  }
+}
diff --git a/libs/libaom/src/aom_scale/generic/gen_scalers.c b/libs/libaom/src/aom_scale/generic/gen_scalers.c
new file mode 100644
index 000000000..549e2aa69
--- /dev/null
+++ b/libs/libaom/src/aom_scale/generic/gen_scalers.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_scale/aom_scale.h"
+#include "aom_mem/aom_mem.h"
+/****************************************************************************
+ *  Imports
+ ****************************************************************************/
+
+/****************************************************************************
+ *
+ *  ROUTINE       : aom_horizontal_line_5_4_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination
+ *                                                (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination, scaling down from 5 pixels to 4.
+ *
+ *  SPECIAL NOTES : None.
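+ *
+ *  EXAMPLE       : (Editor's sketch, read from the code below.) Each group
+ *                  of five source pixels a,b,c,d,e yields four outputs via
+ *                  8-bit fixed-point weights that sum to 256, with +128 for
+ *                  round-to-nearest:
+ *                    dest[0] = a
+ *                    dest[1] = (192*b +  64*c + 128) >> 8
+ *                    dest[2] = (128*c + 128*d + 128) >> 8
+ *                    dest[3] = ( 64*d + 192*e + 128) >> 8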
+ *
+ ****************************************************************************/
+void aom_horizontal_line_5_4_scale_c(const unsigned char *source,
+                                     unsigned int source_width,
+                                     unsigned char *dest,
+                                     unsigned int dest_width) {
+  const unsigned char *const source_end = source + source_width;
+  (void)dest_width;
+
+  while (source < source_end) {
+    const unsigned int a = source[0];
+    const unsigned int b = source[1];
+    const unsigned int c = source[2];
+    const unsigned int d = source[3];
+    const unsigned int e = source[4];
+
+    dest[0] = (unsigned char)a;
+    dest[1] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
+    dest[2] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
+    dest[3] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
+
+    source += 5;
+    dest += 4;
+  }
+}
+
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch,
+                                   unsigned char *dest, int dest_pitch,
+                                   unsigned int dest_width) {
+  const unsigned char *const dest_end = dest + dest_width;
+  while (dest < dest_end) {
+    const unsigned int a = source[0 * src_pitch];
+    const unsigned int b = source[1 * src_pitch];
+    const unsigned int c = source[2 * src_pitch];
+    const unsigned int d = source[3 * src_pitch];
+    const unsigned int e = source[4 * src_pitch];
+
+    dest[0 * dest_pitch] = (unsigned char)a;
+    dest[1 * dest_pitch] = (unsigned char)((b * 192 + c * 64 + 128) >> 8);
+    dest[2 * dest_pitch] = (unsigned char)((c * 128 + d * 128 + 128) >> 8);
+    dest[3 * dest_pitch] = (unsigned char)((d * 64 + e * 192 + 128) >> 8);
+
+    ++source;
+    ++dest;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : aom_horizontal_line_5_3_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination
+ *                                                (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination, scaling down from 5 pixels to 3.
+ *
+ *  SPECIAL NOTES : None.
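+ *
+ *  EXAMPLE       : (Editor's sketch, read from the code below.) Each group
+ *                  of five source pixels a,b,c,d,e yields three outputs;
+ *                  85/256 and 171/256 approximate 1/3 and 2/3:
+ *                    dest[0] = a
+ *                    dest[1] = ( 85*b + 171*c + 128) >> 8
+ *                    dest[2] = (171*d +  85*e + 128) >> 8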
+ *
+ *
+ ****************************************************************************/
+void aom_horizontal_line_5_3_scale_c(const unsigned char *source,
+                                     unsigned int source_width,
+                                     unsigned char *dest,
+                                     unsigned int dest_width) {
+  const unsigned char *const source_end = source + source_width;
+  (void)dest_width;
+  while (source < source_end) {
+    const unsigned int a = source[0];
+    const unsigned int b = source[1];
+    const unsigned int c = source[2];
+    const unsigned int d = source[3];
+    const unsigned int e = source[4];
+
+    dest[0] = (unsigned char)a;
+    dest[1] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
+    dest[2] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
+
+    source += 5;
+    dest += 3;
+  }
+}
+
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch,
+                                   unsigned char *dest, int dest_pitch,
+                                   unsigned int dest_width) {
+  const unsigned char *const dest_end = dest + dest_width;
+  while (dest < dest_end) {
+    const unsigned int a = source[0 * src_pitch];
+    const unsigned int b = source[1 * src_pitch];
+    const unsigned int c = source[2 * src_pitch];
+    const unsigned int d = source[3 * src_pitch];
+    const unsigned int e = source[4 * src_pitch];
+
+    dest[0 * dest_pitch] = (unsigned char)a;
+    dest[1 * dest_pitch] = (unsigned char)((b * 85 + c * 171 + 128) >> 8);
+    dest[2 * dest_pitch] = (unsigned char)((d * 171 + e * 85 + 128) >> 8);
+
+    ++source;
+    ++dest;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : aom_horizontal_line_2_1_scale_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to source data.
+ *                  unsigned int source_width   : Stride of source.
+ *                  unsigned char *dest         : Pointer to destination data.
+ *                  unsigned int dest_width     : Stride of destination
+ *                                                (NOT USED).
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Copies horizontal line of pixels from source to
+ *                  destination, scaling down from 2 pixels to 1.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void aom_horizontal_line_2_1_scale_c(const unsigned char *source,
+                                     unsigned int source_width,
+                                     unsigned char *dest,
+                                     unsigned int dest_width) {
+  const unsigned char *const source_end = source + source_width;
+  (void)dest_width;
+  while (source < source_end) {
+    dest[0] = source[0];
+    source += 2;
+    ++dest;
+  }
+}
+
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch,
+                                   unsigned char *dest, int dest_pitch,
+                                   unsigned int dest_width) {
+  (void)dest_pitch;
+  (void)src_pitch;
+  memcpy(dest, source, dest_width);
+}
+
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch,
+                                     unsigned char *dest, int dest_pitch,
+                                     unsigned int dest_width) {
+  const unsigned char *const dest_end = dest + dest_width;
+  (void)dest_pitch;
+  while (dest < dest_end) {
+    const unsigned int a = source[-src_pitch] * 3;
+    const unsigned int b = source[0] * 10;
+    const unsigned int c = source[src_pitch] * 3;
+    dest[0] = (unsigned char)((8 + a + b + c) >> 4);
+    ++source;
+    ++dest;
+  }
+}
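+
+/* Editor's note: in aom_vertical_band_2_1_scale_i_c above, each output row is
+ * a (3, 10, 3) / 16 blend of the previous, current, and next source rows:
+ * dest = (8 + 3*above + 10*cur + 3*below) >> 4, with the 8 providing
+ * round-to-nearest.
+ */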
diff --git a/libs/libaom/src/aom_scale/generic/yv12config.c b/libs/libaom/src/aom_scale/generic/yv12config.c
new file mode 100644
index 000000000..1f80d7ba7
--- /dev/null
+++ b/libs/libaom/src/aom_scale/generic/yv12config.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/internal/aom_image_internal.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+
+/****************************************************************************
+ *  Exports
+ ****************************************************************************/
+
+/****************************************************************************
+ *
+ ****************************************************************************/
+
+// TODO(jkoleszar): Maybe replace this with struct aom_image
+int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
+  if (ybf) {
+    if (ybf->buffer_alloc_sz > 0) {
+      aom_free(ybf->buffer_alloc);
+    }
+    if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit);
+    aom_remove_metadata_from_frame_buffer(ybf);
+    /* buffer_alloc isn't accessed by most functions. Rather y_buffer,
+       u_buffer and v_buffer point to buffer_alloc and are used. Clear out
+       all of this so that a freed pointer isn't inadvertently used */
+    memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+    return 0;
+  }
+
+  return AOM_CODEC_MEM_ERROR;
+}
+
+static int realloc_frame_buffer_aligned(
+    YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y,
+    int use_highbitdepth, int border, int byte_alignment,
+    aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb,
+    void *cb_priv, const int y_stride, const uint64_t yplane_size,
+    const uint64_t uvplane_size, const int aligned_width,
+    const int aligned_height, const int uv_width, const int uv_height,
+    const int uv_stride, const int uv_border_w, const int uv_border_h) {
+  if (ybf) {
+    const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
+    const uint64_t frame_size =
+        (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
+
+    uint8_t *buf = NULL;
+
+#if defined AOM_MAX_ALLOCABLE_MEMORY
+    // The size of ybf->buffer_alloc.
+    uint64_t alloc_size = frame_size;
+    // The size of ybf->y_buffer_8bit.
+    if (use_highbitdepth) alloc_size += yplane_size;
+    // The decoder may allocate REF_FRAMES frame buffers in the frame buffer
+    // pool. Bound the total amount of allocated memory as if these REF_FRAMES
+    // frame buffers were allocated in a single allocation.
+    if (alloc_size > AOM_MAX_ALLOCABLE_MEMORY / REF_FRAMES)
+      return AOM_CODEC_MEM_ERROR;
+#endif
+
+    if (cb != NULL) {
+      const int align_addr_extra_size = 31;
+      const uint64_t external_frame_size = frame_size + align_addr_extra_size;
+
+      assert(fb != NULL);
+
+      if (external_frame_size != (size_t)external_frame_size)
+        return AOM_CODEC_MEM_ERROR;
+
+      // Allocation to hold larger frame, or first allocation.
+      if (cb(cb_priv, (size_t)external_frame_size, fb) < 0)
+        return AOM_CODEC_MEM_ERROR;
+
+      if (fb->data == NULL || fb->size < external_frame_size)
+        return AOM_CODEC_MEM_ERROR;
+
+      ybf->buffer_alloc = (uint8_t *)aom_align_addr(fb->data, 32);
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+      // This memset is needed for fixing the issue of using uninitialized
+      // value in msan test. It will cause a perf loss, so only do this for
+      // msan test.
+ memset(ybf->buffer_alloc, 0, (size_t)frame_size); +#endif +#endif + } else if (frame_size > ybf->buffer_alloc_sz) { + // Allocation to hold larger frame, or first allocation. + aom_free(ybf->buffer_alloc); + ybf->buffer_alloc = NULL; + ybf->buffer_alloc_sz = 0; + + if (frame_size != (size_t)frame_size) return AOM_CODEC_MEM_ERROR; + + ybf->buffer_alloc = (uint8_t *)aom_memalign(32, (size_t)frame_size); + if (!ybf->buffer_alloc) return AOM_CODEC_MEM_ERROR; + + ybf->buffer_alloc_sz = (size_t)frame_size; + + // This memset is needed for fixing valgrind error from C loop filter + // due to access uninitialized memory in frame border. It could be + // removed if border is totally removed. + memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz); + } + + ybf->y_crop_width = width; + ybf->y_crop_height = height; + ybf->y_width = aligned_width; + ybf->y_height = aligned_height; + ybf->y_stride = y_stride; + + ybf->uv_crop_width = (width + ss_x) >> ss_x; + ybf->uv_crop_height = (height + ss_y) >> ss_y; + ybf->uv_width = uv_width; + ybf->uv_height = uv_height; + ybf->uv_stride = uv_stride; + + ybf->border = border; + ybf->frame_size = (size_t)frame_size; + ybf->subsampling_x = ss_x; + ybf->subsampling_y = ss_y; + + buf = ybf->buffer_alloc; + if (use_highbitdepth) { + // Store uint16 addresses when using 16bit framebuffers + buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc); + ybf->flags = YV12_FLAG_HIGHBITDEPTH; + } else { + ybf->flags = 0; + } + + ybf->y_buffer = (uint8_t *)aom_align_addr( + buf + (border * y_stride) + border, aom_byte_align); + ybf->u_buffer = (uint8_t *)aom_align_addr( + buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w, + aom_byte_align); + ybf->v_buffer = + (uint8_t *)aom_align_addr(buf + yplane_size + uvplane_size + + (uv_border_h * uv_stride) + uv_border_w, + aom_byte_align); + + ybf->use_external_reference_buffers = 0; + + if (use_highbitdepth) { + if (ybf->y_buffer_8bit) aom_free(ybf->y_buffer_8bit); + ybf->y_buffer_8bit = (uint8_t *)aom_memalign(32, (size_t)yplane_size); + if (!ybf->y_buffer_8bit) return AOM_CODEC_MEM_ERROR; + } else { + if (ybf->y_buffer_8bit) { + aom_free(ybf->y_buffer_8bit); + ybf->y_buffer_8bit = NULL; + ybf->buf_8bit_valid = 0; + } + } + + ybf->corrupted = 0; /* assume not corrupted by errors */ + return 0; + } + return AOM_CODEC_MEM_ERROR; +} + +static int calc_stride_and_planesize(const int ss_x, const int ss_y, + const int aligned_width, + const int aligned_height, const int border, + const int byte_alignment, int *y_stride, + int *uv_stride, uint64_t *yplane_size, + uint64_t *uvplane_size, + const int uv_height) { + /* Only support allocating buffers that have a border that's a multiple + * of 32. The border restriction is required to get 16-byte alignment of + * the start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * aom_img_set_rect(). 
*/ + if (border & 0x1f) return AOM_CODEC_MEM_ERROR; + *y_stride = ((aligned_width + 2 * border) + 31) & ~31; + *yplane_size = + (aligned_height + 2 * border) * (uint64_t)(*y_stride) + byte_alignment; + + *uv_stride = *y_stride >> ss_x; + *uvplane_size = (uv_height + 2 * (border >> ss_y)) * (uint64_t)(*uv_stride) + + byte_alignment; + return 0; +} + +int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, int use_highbitdepth, + int border, int byte_alignment, + aom_codec_frame_buffer_t *fb, + aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) + return AOM_CODEC_MEM_ERROR; +#endif + + if (ybf) { + int y_stride = 0; + int uv_stride = 0; + uint64_t yplane_size = 0; + uint64_t uvplane_size = 0; + const int aligned_width = (width + 7) & ~7; + const int aligned_height = (height + 7) & ~7; + const int uv_width = aligned_width >> ss_x; + const int uv_height = aligned_height >> ss_y; + const int uv_border_w = border >> ss_x; + const int uv_border_h = border >> ss_y; + + int error = calc_stride_and_planesize( + ss_x, ss_y, aligned_width, aligned_height, border, byte_alignment, + &y_stride, &uv_stride, &yplane_size, &uvplane_size, uv_height); + if (error) return error; + return realloc_frame_buffer_aligned( + ybf, width, height, ss_x, ss_y, use_highbitdepth, border, + byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size, + aligned_width, aligned_height, uv_width, uv_height, uv_stride, + uv_border_w, uv_border_h); + } + return AOM_CODEC_MEM_ERROR; +} + +int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, int use_highbitdepth, int border, + int byte_alignment) { + if (ybf) { + aom_free_frame_buffer(ybf); + return aom_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, + use_highbitdepth, border, byte_alignment, + NULL, NULL, NULL); + } + return AOM_CODEC_MEM_ERROR; +} + +void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf) { + if (ybf && ybf->metadata) { + aom_img_metadata_array_free(ybf->metadata); + ybf->metadata = NULL; + } +} + +int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf, + const aom_metadata_array_t *arr) { + if (!ybf || !arr || !arr->metadata_array) return -1; + aom_remove_metadata_from_frame_buffer(ybf); + ybf->metadata = aom_img_metadata_array_alloc(arr->sz); + if (!ybf->metadata) return -1; + for (size_t i = 0; i < ybf->metadata->sz; i++) { + ybf->metadata->metadata_array[i] = aom_img_metadata_alloc( + arr->metadata_array[i]->type, arr->metadata_array[i]->payload, + arr->metadata_array[i]->sz, arr->metadata_array[i]->insert_flag); + if (ybf->metadata->metadata_array[i] == NULL) { + aom_img_metadata_array_free(ybf->metadata); + ybf->metadata = NULL; + return -1; + } + } + ybf->metadata->sz = arr->sz; + return 0; +} diff --git a/libs/libaom/src/aom_scale/generic/yv12extend.c b/libs/libaom/src/aom_scale/generic/yv12extend.c new file mode 100644 index 000000000..834a59dbf --- /dev/null +++ b/libs/libaom/src/aom_scale/generic/yv12extend.c @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+static void extend_plane(uint8_t *const src, int src_stride, int width,
+                         int height, int extend_top, int extend_left,
+                         int extend_bottom, int extend_right) {
+  int i;
+  const int linesize = extend_left + extend_right + width;
+
+  /* copy the left and right most columns out */
+  uint8_t *src_ptr1 = src;
+  uint8_t *src_ptr2 = src + width - 1;
+  uint8_t *dst_ptr1 = src - extend_left;
+  uint8_t *dst_ptr2 = src + width;
+
+  for (i = 0; i < height; ++i) {
+    memset(dst_ptr1, src_ptr1[0], extend_left);
+    memset(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_stride;
+    src_ptr2 += src_stride;
+    dst_ptr1 += src_stride;
+    dst_ptr2 += src_stride;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  src_ptr1 = src - extend_left;
+  src_ptr2 = src + src_stride * (height - 1) - extend_left;
+  dst_ptr1 = src + src_stride * -extend_top - extend_left;
+  dst_ptr2 = src + src_stride * height - extend_left;
+
+  for (i = 0; i < extend_top; ++i) {
+    memcpy(dst_ptr1, src_ptr1, linesize);
+    dst_ptr1 += src_stride;
+  }
+
+  for (i = 0; i < extend_bottom; ++i) {
+    memcpy(dst_ptr2, src_ptr2, linesize);
+    dst_ptr2 += src_stride;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void extend_plane_high(uint8_t *const src8, int src_stride, int width,
+                              int height, int extend_top, int extend_left,
+                              int extend_bottom, int extend_right) {
+  int i;
+  const int linesize = extend_left + extend_right + width;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+
+  /* copy the left and right most columns out */
+  uint16_t *src_ptr1 = src;
+  uint16_t *src_ptr2 = src + width - 1;
+  uint16_t *dst_ptr1 = src - extend_left;
+  uint16_t *dst_ptr2 = src + width;
+
+  for (i = 0; i < height; ++i) {
+    aom_memset16(dst_ptr1, src_ptr1[0], extend_left);
+    aom_memset16(dst_ptr2, src_ptr2[0], extend_right);
+    src_ptr1 += src_stride;
+    src_ptr2 += src_stride;
+    dst_ptr1 += src_stride;
+    dst_ptr2 += src_stride;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  src_ptr1 = src - extend_left;
+  src_ptr2 = src + src_stride * (height - 1) - extend_left;
+  dst_ptr1 = src + src_stride * -extend_top - extend_left;
+  dst_ptr2 = src + src_stride * height - extend_left;
+
+  for (i = 0; i < extend_top; ++i) {
+    memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+    dst_ptr1 += src_stride;
+  }
+
+  for (i = 0; i < extend_bottom; ++i) {
+    memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+    dst_ptr2 += src_stride;
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf,
+                                     const int num_planes) {
+  assert(ybf->border % 2 == 0);
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    for (int plane = 0; plane < num_planes; ++plane) {
+      const int is_uv = plane > 0;
+      const int plane_border = ybf->border >> is_uv;
+      extend_plane_high(
+          ybf->buffers[plane], ybf->strides[is_uv], ybf->crop_widths[is_uv],
ybf->crop_heights[is_uv], plane_border, plane_border, + plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv], + plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv]); + } + return; + } +#endif + + for (int plane = 0; plane < num_planes; ++plane) { + const int is_uv = plane > 0; + const int plane_border = ybf->border >> is_uv; + extend_plane(ybf->buffers[plane], ybf->strides[is_uv], + ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], + plane_border, plane_border, + plane_border + ybf->heights[is_uv] - ybf->crop_heights[is_uv], + plane_border + ybf->widths[is_uv] - ybf->crop_widths[is_uv]); + } +} + +static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size, + const int num_planes) { + const int ss_x = ybf->uv_width < ybf->y_width; + const int ss_y = ybf->uv_height < ybf->y_height; + + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); + +#if CONFIG_AV1_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + for (int plane = 0; plane < num_planes; ++plane) { + const int is_uv = plane > 0; + const int top = ext_size >> (is_uv ? ss_y : 0); + const int left = ext_size >> (is_uv ? ss_x : 0); + const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv]; + const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv]; + extend_plane_high(ybf->buffers[plane], ybf->strides[is_uv], + ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, + left, bottom, right); + } + return; + } +#endif + + for (int plane = 0; plane < num_planes; ++plane) { + const int is_uv = plane > 0; + const int top = ext_size >> (is_uv ? ss_y : 0); + const int left = ext_size >> (is_uv ? ss_x : 0); + const int bottom = top + ybf->heights[is_uv] - ybf->crop_heights[is_uv]; + const int right = left + ybf->widths[is_uv] - ybf->crop_widths[is_uv]; + extend_plane(ybf->buffers[plane], ybf->strides[is_uv], + ybf->crop_widths[is_uv], ybf->crop_heights[is_uv], top, left, + bottom, right); + } +} + +void aom_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf, const int num_planes) { + extend_frame(ybf, ybf->border, num_planes); +} + +void aom_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf, + const int num_planes) { + const int inner_bw = (ybf->border > AOMINNERBORDERINPIXELS) + ? 
AOMINNERBORDERINPIXELS + : ybf->border; + extend_frame(ybf, inner_bw, num_planes); +} + +void aom_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) { + int ext_size = ybf->border; + assert(ybf->y_height - ybf->y_crop_height < 16); + assert(ybf->y_width - ybf->y_crop_width < 16); + assert(ybf->y_height - ybf->y_crop_height >= 0); + assert(ybf->y_width - ybf->y_crop_width >= 0); +#if CONFIG_AV1_HIGHBITDEPTH + if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { + extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, + ybf->y_crop_height, ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); + return; + } +#endif + extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, + ybf->y_crop_height, ext_size, ext_size, + ext_size + ybf->y_height - ybf->y_crop_height, + ext_size + ybf->y_width - ybf->y_crop_width); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + memcpy(dst, src, num * sizeof(uint16_t)); +} +#endif + +// Copies the source image into the destination image and updates the +// destination's UMV borders. +// Note: The frames are assumed to be identical in size. +void aom_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc, const int num_planes) { +#if 0 + /* These assertions are valid in the codec, but the libaom-tester uses + * this code slightly differently. + */ + assert(src_bc->y_width == dst_bc->y_width); + assert(src_bc->y_height == dst_bc->y_height); +#endif + +#if CONFIG_AV1_HIGHBITDEPTH + assert((src_bc->flags & YV12_FLAG_HIGHBITDEPTH) == + (dst_bc->flags & YV12_FLAG_HIGHBITDEPTH)); + + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + for (int plane = 0; plane < num_planes; ++plane) { + const uint8_t *plane_src = src_bc->buffers[plane]; + uint8_t *plane_dst = dst_bc->buffers[plane]; + const int is_uv = plane > 0; + + for (int row = 0; row < src_bc->heights[is_uv]; ++row) { + memcpy_short_addr(plane_dst, plane_src, src_bc->widths[is_uv]); + plane_src += src_bc->strides[is_uv]; + plane_dst += dst_bc->strides[is_uv]; + } + } + aom_yv12_extend_frame_borders_c(dst_bc, num_planes); + return; + } +#endif + for (int plane = 0; plane < num_planes; ++plane) { + const uint8_t *plane_src = src_bc->buffers[plane]; + uint8_t *plane_dst = dst_bc->buffers[plane]; + const int is_uv = plane > 0; + + for (int row = 0; row < src_bc->heights[is_uv]; ++row) { + memcpy(plane_dst, plane_src, src_bc->widths[is_uv]); + plane_src += src_bc->strides[is_uv]; + plane_dst += dst_bc->strides[is_uv]; + } + } + aom_yv12_extend_frame_borders_c(dst_bc, num_planes); +} + +void aom_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + +#if CONFIG_AV1_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t)); + src16 += src_ybc->y_stride; + dst16 += dst_ybc->y_stride; + } + return; + } +#endif + + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } +} + +void aom_yv12_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc) { + int 
row; + const uint8_t *src = src_bc->u_buffer; + uint8_t *dst = dst_bc->u_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row = 0; row < src_bc->uv_height; ++row) { + memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t)); + src16 += src_bc->uv_stride; + dst16 += dst_bc->uv_stride; + } + return; + } +#endif + for (row = 0; row < src_bc->uv_height; ++row) { + memcpy(dst, src, src_bc->uv_width); + src += src_bc->uv_stride; + dst += dst_bc->uv_stride; + } +} + +void aom_yv12_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc) { + int row; + const uint8_t *src = src_bc->v_buffer; + uint8_t *dst = dst_bc->v_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (row = 0; row < src_bc->uv_height; ++row) { + memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t)); + src16 += src_bc->uv_stride; + dst16 += dst_bc->uv_stride; + } + return; + } +#endif + for (row = 0; row < src_bc->uv_height; ++row) { + memcpy(dst, src, src_bc->uv_width); + src += src_bc->uv_stride; + dst += dst_bc->uv_stride; + } +} + +void aom_yv12_partial_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, int hstart1, + int hend1, int vstart1, int vend1, + YV12_BUFFER_CONFIG *dst_ybc, int hstart2, + int vstart2) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = + CONVERT_TO_SHORTPTR(src + vstart1 * src_ybc->y_stride + hstart1); + uint16_t *dst16 = + CONVERT_TO_SHORTPTR(dst + vstart2 * dst_ybc->y_stride + hstart2); + + for (row = vstart1; row < vend1; ++row) { + memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t)); + src16 += src_ybc->y_stride; + dst16 += dst_ybc->y_stride; + } + return; + } +#endif + src = (src + vstart1 * src_ybc->y_stride + hstart1); + dst = (dst + vstart2 * dst_ybc->y_stride + hstart2); + + for (row = vstart1; row < vend1; ++row) { + memcpy(dst, src, (hend1 - hstart1)); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } +} + +void aom_yv12_partial_coloc_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, + int hend, int vstart, int vend) { + aom_yv12_partial_copy_y_c(src_ybc, hstart, hend, vstart, vend, dst_ybc, + hstart, vstart); +} + +void aom_yv12_partial_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, int hstart1, + int hend1, int vstart1, int vend1, + YV12_BUFFER_CONFIG *dst_bc, int hstart2, + int vstart2) { + int row; + const uint8_t *src = src_bc->u_buffer; + uint8_t *dst = dst_bc->u_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = + CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1); + uint16_t *dst16 = + CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2); + for (row = vstart1; row < vend1; ++row) { + memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t)); + src16 += src_bc->uv_stride; + dst16 += dst_bc->uv_stride; + } + return; + } +#endif + src = (src + vstart1 * src_bc->uv_stride + hstart1); + dst = (dst + vstart2 * dst_bc->uv_stride + hstart2); + + for (row = vstart1; row < vend1; ++row) { + memcpy(dst, src, (hend1 - hstart1)); + src += src_bc->uv_stride; + dst += dst_bc->uv_stride; + } +} + +void 
aom_yv12_partial_coloc_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc, int hstart, + int hend, int vstart, int vend) { + aom_yv12_partial_copy_u_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart, + vstart); +} + +void aom_yv12_partial_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, int hstart1, + int hend1, int vstart1, int vend1, + YV12_BUFFER_CONFIG *dst_bc, int hstart2, + int vstart2) { + int row; + const uint8_t *src = src_bc->v_buffer; + uint8_t *dst = dst_bc->v_buffer; +#if CONFIG_AV1_HIGHBITDEPTH + if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *src16 = + CONVERT_TO_SHORTPTR(src + vstart1 * src_bc->uv_stride + hstart1); + uint16_t *dst16 = + CONVERT_TO_SHORTPTR(dst + vstart2 * dst_bc->uv_stride + hstart2); + for (row = vstart1; row < vend1; ++row) { + memcpy(dst16, src16, (hend1 - hstart1) * sizeof(uint16_t)); + src16 += src_bc->uv_stride; + dst16 += dst_bc->uv_stride; + } + return; + } +#endif + src = (src + vstart1 * src_bc->uv_stride + hstart1); + dst = (dst + vstart2 * dst_bc->uv_stride + hstart2); + + for (row = vstart1; row < vend1; ++row) { + memcpy(dst, src, (hend1 - hstart1)); + src += src_bc->uv_stride; + dst += dst_bc->uv_stride; + } +} + +void aom_yv12_partial_coloc_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, + YV12_BUFFER_CONFIG *dst_bc, int hstart, + int hend, int vstart, int vend) { + aom_yv12_partial_copy_v_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart, + vstart); +} + +int aom_yv12_realloc_with_new_border_c(YV12_BUFFER_CONFIG *ybf, int new_border, + int byte_alignment, int num_planes) { + if (ybf) { + if (new_border == ybf->border) return 0; + YV12_BUFFER_CONFIG new_buf; + memset(&new_buf, 0, sizeof(new_buf)); + const int error = aom_alloc_frame_buffer( + &new_buf, ybf->y_crop_width, ybf->y_crop_height, ybf->subsampling_x, + ybf->subsampling_y, ybf->flags & YV12_FLAG_HIGHBITDEPTH, new_border, + byte_alignment); + if (error) return error; + // Copy image buffer + aom_yv12_copy_frame(ybf, &new_buf, num_planes); + + // Extend up to new border + aom_extend_frame_borders(&new_buf, num_planes); + + // Now free the old buffer and replace with the new + aom_free_frame_buffer(ybf); + memcpy(ybf, &new_buf, sizeof(new_buf)); + return 0; + } + return -2; +} diff --git a/libs/libaom/src/aom_scale/mips/dspr2/yv12extend_dspr2.c b/libs/libaom/src/aom_scale/mips/dspr2/yv12extend_dspr2.c new file mode 100644 index 000000000..869e594d7 --- /dev/null +++ b/libs/libaom/src/aom_scale/mips/dspr2/yv12extend_dspr2.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_scale/aom_scale.h"
+
+#if HAVE_DSPR2
+static void extend_plane(uint8_t *const src, int src_stride, int width,
+                         int height, int extend_top, int extend_left,
+                         int extend_bottom, int extend_right) {
+  int i, j;
+  uint8_t *left_src, *right_src;
+  uint8_t *left_dst_start, *right_dst_start;
+  uint8_t *left_dst, *right_dst;
+  uint8_t *top_src, *bot_src;
+  uint8_t *top_dst, *bot_dst;
+  uint32_t left_pix;
+  uint32_t right_pix;
+  uint32_t linesize;
+
+  /* copy the left and right most columns out */
+  left_src = src;
+  right_src = src + width - 1;
+  left_dst_start = src - extend_left;
+  right_dst_start = src + width;
+
+  for (i = height; i--;) {
+    left_dst = left_dst_start;
+    right_dst = right_dst_start;
+
+    __asm__ __volatile__(
+        "lb        %[left_pix],     0(%[left_src])      \n\t"
+        "lb        %[right_pix],    0(%[right_src])     \n\t"
+        "replv.qb  %[left_pix],     %[left_pix]         \n\t"
+        "replv.qb  %[right_pix],    %[right_pix]        \n\t"
+
+        : [left_pix] "=&r"(left_pix), [right_pix] "=&r"(right_pix)
+        : [left_src] "r"(left_src), [right_src] "r"(right_src));
+
+    for (j = extend_left / 4; j--;) {
+      __asm__ __volatile__(
+          "sw     %[left_pix],    0(%[left_dst])     \n\t"
+          "sw     %[right_pix],   0(%[right_dst])    \n\t"
+
+          :
+          : [left_dst] "r"(left_dst), [left_pix] "r"(left_pix),
+            [right_dst] "r"(right_dst), [right_pix] "r"(right_pix));
+
+      left_dst += 4;
+      right_dst += 4;
+    }
+
+    for (j = extend_left % 4; j--;) {
+      __asm__ __volatile__(
+          "sb     %[left_pix],    0(%[left_dst])     \n\t"
+          "sb     %[right_pix],   0(%[right_dst])    \n\t"
+
+          :
+          : [left_dst] "r"(left_dst), [left_pix] "r"(left_pix),
+            [right_dst] "r"(right_dst), [right_pix] "r"(right_pix));
+
+      left_dst += 1;
+      right_dst += 1;
+    }
+
+    left_src += src_stride;
+    right_src += src_stride;
+    left_dst_start += src_stride;
+    right_dst_start += src_stride;
+  }
+
+  /* Now copy the top and bottom lines into each line of the respective
+   * borders
+   */
+  top_src = src - extend_left;
+  bot_src = src + src_stride * (height - 1) - extend_left;
+  top_dst = src + src_stride * (-extend_top) - extend_left;
+  bot_dst = src + src_stride * height - extend_left;
+  linesize = extend_left + extend_right + width;
+
+  for (i = 0; i < extend_top; i++) {
+    memcpy(top_dst, top_src, linesize);
+    top_dst += src_stride;
+  }
+
+  for (i = 0; i < extend_bottom; i++) {
+    memcpy(bot_dst, bot_src, linesize);
+    bot_dst += src_stride;
+  }
+}
+
+static void extend_frame(YV12_BUFFER_CONFIG *const ybf, int ext_size) {
+  const int c_w = ybf->uv_crop_width;
+  const int c_h = ybf->uv_crop_height;
+  const int ss_x = ybf->uv_width < ybf->y_width;
+  const int ss_y = ybf->uv_height < ybf->y_height;
+  const int c_et = ext_size >> ss_y;
+  const int c_el = ext_size >> ss_x;
+  const int c_eb = c_et + ybf->uv_height - ybf->uv_crop_height;
+  const int c_er = c_el + ybf->uv_width - ybf->uv_crop_width;
+
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+  extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
+               ybf->y_crop_height, ext_size, ext_size,
+               ext_size + ybf->y_height - ybf->y_crop_height,
+               ext_size + ybf->y_width - ybf->y_crop_width);
+
+  extend_plane(ybf->u_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er);
+
+  extend_plane(ybf->v_buffer, ybf->uv_stride, c_w, c_h, c_et, c_el, c_eb, c_er);
+}
+
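+/* Editor's note: in the inline assembly above, replv.qb broadcasts the edge
+ * byte into all four bytes of a word, so each sw stores four border pixels at
+ * once; the remaining (extend_left % 4) pixels are stored one at a time with
+ * sb.
+ */
+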
+void aom_extend_frame_borders_dspr2(YV12_BUFFER_CONFIG *ybf,
+                                    const int num_planes) {
+  (void)num_planes;
+  extend_frame(ybf, ybf->border);
+}
+
+void aom_extend_frame_inner_borders_dspr2(YV12_BUFFER_CONFIG *ybf,
+                                          const int num_planes) {
+  const int inner_bw = (ybf->border > AOMINNERBORDERINPIXELS)
+                           ? AOMINNERBORDERINPIXELS
+                           : ybf->border;
+  (void)num_planes;
+  extend_frame(ybf, inner_bw);
+}
+#endif
diff --git a/libs/libaom/src/aom_scale/yv12config.h b/libs/libaom/src/aom_scale/yv12config.h
new file mode 100644
index 000000000..3642bb7f3
--- /dev/null
+++ b/libs/libaom/src/aom_scale/yv12config.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_SCALE_YV12CONFIG_H_
+#define AOM_AOM_SCALE_YV12CONFIG_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_frame_buffer.h"
+#include "aom/aom_integer.h"
+#include "aom/internal/aom_image_internal.h"
+
+#define AOMINNERBORDERINPIXELS 160
+#define AOM_INTERP_EXTEND 4
+#define AOM_BORDER_IN_PIXELS 288
+#define AOM_ENC_NO_SCALE_BORDER 160
+#define AOM_DEC_BORDER_IN_PIXELS 64
+
+typedef struct yv12_buffer_config {
+  union {
+    struct {
+      int y_width;
+      int uv_width;
+    };
+    int widths[2];
+  };
+  union {
+    struct {
+      int y_height;
+      int uv_height;
+    };
+    int heights[2];
+  };
+  union {
+    struct {
+      int y_crop_width;
+      int uv_crop_width;
+    };
+    int crop_widths[2];
+  };
+  union {
+    struct {
+      int y_crop_height;
+      int uv_crop_height;
+    };
+    int crop_heights[2];
+  };
+  union {
+    struct {
+      int y_stride;
+      int uv_stride;
+    };
+    int strides[2];
+  };
+  union {
+    struct {
+      uint8_t *y_buffer;
+      uint8_t *u_buffer;
+      uint8_t *v_buffer;
+    };
+    uint8_t *buffers[3];
+  };
+
+  // Indicates whether y_buffer, u_buffer, and v_buffer point to the internally
+  // allocated memory or external buffers.
+  int use_external_reference_buffers;
+  // This is needed to store y_buffer, u_buffer, and v_buffer when set reference
+  // uses an external reference, and to restore those buffer pointers after the
+  // external reference frame is no longer used.
+  uint8_t *store_buf_adr[3];
+
+  // If the frame is stored in a 16-bit buffer, this stores an 8-bit version
+  // for use in global motion detection. It is allocated on-demand.
+  uint8_t *y_buffer_8bit;
+  int buf_8bit_valid;
+
+  uint8_t *buffer_alloc;
+  size_t buffer_alloc_sz;
+  int border;
+  size_t frame_size;
+  int subsampling_x;
+  int subsampling_y;
+  unsigned int bit_depth;
+  aom_color_primaries_t color_primaries;
+  aom_transfer_characteristics_t transfer_characteristics;
+  aom_matrix_coefficients_t matrix_coefficients;
+  uint8_t monochrome;
+  aom_chroma_sample_position_t chroma_sample_position;
+  aom_color_range_t color_range;
+  int render_width;
+  int render_height;
+
+  int corrupted;
+  int flags;
+  aom_metadata_array_t *metadata;
+} YV12_BUFFER_CONFIG;
+
+#define YV12_FLAG_HIGHBITDEPTH 8
+
+int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+                           int ss_x, int ss_y, int use_highbitdepth, int border,
+                           int byte_alignment);
+
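+// A minimal allocation sketch (editor's addition; the sizes are illustrative,
+// everything else is declared in this header):
+//
+//   YV12_BUFFER_CONFIG buf = { 0 };
+//   // 640x480, 4:2:0 (ss_x = ss_y = 1), 8-bit, default border, legacy
+//   // alignment. Returns 0 on success.
+//   if (aom_alloc_frame_buffer(&buf, 640, 480, 1, 1, 0,
+//                              AOM_BORDER_IN_PIXELS, 0) == 0) {
+//     // ... use buf.y_buffer / buf.u_buffer / buf.v_buffer ...
+//     aom_free_frame_buffer(&buf);
+//   }
+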
+// Updates the yv12 buffer config with the frame buffer. |byte_alignment| must
+// be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not
+// NULL, then libaom is using the frame buffer callbacks to handle memory.
+// If cb is not NULL, libaom will call cb with minimum size in bytes needed
+// to decode the current frame. If cb is NULL, libaom will allocate memory
+// internally to decode the current frame. Returns 0 on success. Returns < 0
+// on failure.
+int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+                             int ss_x, int ss_y, int use_highbitdepth,
+                             int border, int byte_alignment,
+                             aom_codec_frame_buffer_t *fb,
+                             aom_get_frame_buffer_cb_fn_t cb, void *cb_priv);
+
+int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+/*!\brief Removes metadata from YUV_BUFFER_CONFIG struct.
+ *
+ * Frees metadata in frame buffer.
+ * Frame buffer metadata pointer will be set to NULL.
+ *
+ * \param[in]    ybf       Frame buffer struct pointer
+ */
+void aom_remove_metadata_from_frame_buffer(YV12_BUFFER_CONFIG *ybf);
+
+/*!\brief Copy metadata to YUV_BUFFER_CONFIG struct.
+ *
+ * Copies metadata in frame buffer.
+ * Frame buffer will clear any previous metadata and will reallocate the
+ * metadata array to the new metadata size. Then, it will copy the new metadata
+ * array into it.
+ * Returns 0 on success or -1 on failure.
+ *
+ * \param[in]    ybf       Frame buffer struct pointer
+ * \param[in]    arr       Metadata array struct pointer
+ */
+int aom_copy_metadata_to_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                      const aom_metadata_array_t *arr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AOM_AOM_SCALE_YV12CONFIG_H_
diff --git a/libs/libaom/src/aom_util/aom_thread.c b/libs/libaom/src/aom_util/aom_thread.c
new file mode 100644
index 000000000..a749a2240
--- /dev/null
+++ b/libs/libaom/src/aom_util/aom_thread.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+//
+// Multi-threaded worker
+//
+// Original source:
+//  https://chromium.googlesource.com/webm/libwebp
+
+// Enable GNU extensions in glibc so that we can call pthread_setname_np().
+// This must be before any #include statements.
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <assert.h>
+#include <string.h>  // for memset()
+
+#include "aom_mem/aom_mem.h"
+#include "aom_util/aom_thread.h"
+
+#if CONFIG_MULTITHREAD
+
+struct AVxWorkerImpl {
+  pthread_mutex_t mutex_;
+  pthread_cond_t condition_;
+  pthread_t thread_;
+};
+
+//------------------------------------------------------------------------------
+
+static void execute(AVxWorker *const worker);  // Forward declaration.
+
+static THREADFN thread_loop(void *ptr) {
+  AVxWorker *const worker = (AVxWorker *)ptr;
+#ifdef __APPLE__
+  if (worker->thread_name != NULL) {
+    // Apple's version of pthread_setname_np takes one argument and operates on
+    // the current thread only. The maximum size of the thread_name buffer was
+    // noted in the Chromium source code and was confirmed by experiments. If
+    // thread_name is too long, pthread_setname_np returns -1 with errno
+    // ENAMETOOLONG (63).
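+    // Editor's note: the copy below truncates the caller's name into a fixed
+    // 64-byte local buffer and always NUL-terminates it, so the call cannot
+    // fail with ENAMETOOLONG.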
+ char thread_name[64]; + strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); + thread_name[sizeof(thread_name) - 1] = '\0'; + pthread_setname_np(thread_name); + } +#elif defined(__GLIBC__) || defined(__BIONIC__) + if (worker->thread_name != NULL) { + // Linux and Android require names (with nul) fit in 16 chars, otherwise + // pthread_setname_np() returns ERANGE (34). + char thread_name[16]; + strncpy(thread_name, worker->thread_name, sizeof(thread_name) - 1); + thread_name[sizeof(thread_name) - 1] = '\0'; + pthread_setname_np(pthread_self(), thread_name); + } +#endif + int done = 0; + while (!done) { + pthread_mutex_lock(&worker->impl_->mutex_); + while (worker->status_ == OK) { // wait in idling mode + pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + } + if (worker->status_ == WORK) { + execute(worker); + worker->status_ = OK; + } else if (worker->status_ == NOT_OK) { // finish the worker + done = 1; + } + // signal to the main thread that we're done (for sync()) + pthread_cond_signal(&worker->impl_->condition_); + pthread_mutex_unlock(&worker->impl_->mutex_); + } + return THREAD_RETURN(NULL); // Thread is finished +} + +// main thread state control +static void change_state(AVxWorker *const worker, AVxWorkerStatus new_status) { + // No-op when attempting to change state on a thread that didn't come up. + // Checking status_ without acquiring the lock first would result in a data + // race. + if (worker->impl_ == NULL) return; + + pthread_mutex_lock(&worker->impl_->mutex_); + if (worker->status_ >= OK) { + // wait for the worker to finish + while (worker->status_ != OK) { + pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); + } + // assign new status and release the working thread if needed + if (new_status != OK) { + worker->status_ = new_status; + pthread_cond_signal(&worker->impl_->condition_); + } + } + pthread_mutex_unlock(&worker->impl_->mutex_); +} + +#endif // CONFIG_MULTITHREAD + +//------------------------------------------------------------------------------ + +static void init(AVxWorker *const worker) { + memset(worker, 0, sizeof(*worker)); + worker->status_ = NOT_OK; +} + +static int sync(AVxWorker *const worker) { +#if CONFIG_MULTITHREAD + change_state(worker, OK); +#endif + assert(worker->status_ <= OK); + return !worker->had_error; +} + +static int reset(AVxWorker *const worker) { + int ok = 1; + worker->had_error = 0; + if (worker->status_ < OK) { +#if CONFIG_MULTITHREAD + worker->impl_ = (AVxWorkerImpl *)aom_calloc(1, sizeof(*worker->impl_)); + if (worker->impl_ == NULL) { + return 0; + } + if (pthread_mutex_init(&worker->impl_->mutex_, NULL)) { + goto Error; + } + if (pthread_cond_init(&worker->impl_->condition_, NULL)) { + pthread_mutex_destroy(&worker->impl_->mutex_); + goto Error; + } + pthread_mutex_lock(&worker->impl_->mutex_); + ok = !pthread_create(&worker->impl_->thread_, NULL, thread_loop, worker); + if (ok) worker->status_ = OK; + pthread_mutex_unlock(&worker->impl_->mutex_); + if (!ok) { + pthread_mutex_destroy(&worker->impl_->mutex_); + pthread_cond_destroy(&worker->impl_->condition_); + Error: + aom_free(worker->impl_); + worker->impl_ = NULL; + return 0; + } +#else + worker->status_ = OK; +#endif + } else if (worker->status_ > OK) { + ok = sync(worker); + } + assert(!ok || (worker->status_ == OK)); + return ok; +} + +static void execute(AVxWorker *const worker) { + if (worker->hook != NULL) { + worker->had_error |= !worker->hook(worker->data1, worker->data2); + } +} + +static void 
launch(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+  change_state(worker, WORK);
+#else
+  execute(worker);
+#endif
+}
+
+static void end(AVxWorker *const worker) {
+#if CONFIG_MULTITHREAD
+  if (worker->impl_ != NULL) {
+    change_state(worker, NOT_OK);
+    pthread_join(worker->impl_->thread_, NULL);
+    pthread_mutex_destroy(&worker->impl_->mutex_);
+    pthread_cond_destroy(&worker->impl_->condition_);
+    aom_free(worker->impl_);
+    worker->impl_ = NULL;
+  }
+#else
+  worker->status_ = NOT_OK;
+  assert(worker->impl_ == NULL);
+#endif
+  assert(worker->status_ == NOT_OK);
+}
+
+//------------------------------------------------------------------------------
+
+static AVxWorkerInterface g_worker_interface = { init,   reset,   sync,
+                                                 launch, execute, end };
+
+int aom_set_worker_interface(const AVxWorkerInterface *const winterface) {
+  if (winterface == NULL || winterface->init == NULL ||
+      winterface->reset == NULL || winterface->sync == NULL ||
+      winterface->launch == NULL || winterface->execute == NULL ||
+      winterface->end == NULL) {
+    return 0;
+  }
+  g_worker_interface = *winterface;
+  return 1;
+}
+
+const AVxWorkerInterface *aom_get_worker_interface(void) {
+  return &g_worker_interface;
+}
+
+//------------------------------------------------------------------------------
diff --git a/libs/libaom/src/aom_util/aom_thread.h b/libs/libaom/src/aom_util/aom_thread.h
new file mode 100644
index 000000000..8d0431258
--- /dev/null
+++ b/libs/libaom/src/aom_util/aom_thread.h
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+//
+// Multi-threaded worker
+//
+// Original source:
+//  https://chromium.googlesource.com/webm/libwebp
+
+#ifndef AOM_AOM_UTIL_AOM_THREAD_H_
+#define AOM_AOM_UTIL_AOM_THREAD_H_
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NUM_THREADS 64
+
+#if CONFIG_MULTITHREAD
+
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+#include <errno.h>    // NOLINT
+#include <process.h>  // NOLINT
+#include <windows.h>  // NOLINT
+typedef HANDLE pthread_t;
+typedef CRITICAL_SECTION pthread_mutex_t;
+
+#if _WIN32_WINNT < 0x0600
+#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer.
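+// Editor's note: the Vista floor exists because this emulation relies on
+// CONDITION_VARIABLE and InitializeCriticalSectionEx(), both first available
+// in Windows Vista / Server 2008.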
+#endif
+typedef CONDITION_VARIABLE pthread_cond_t;
+
+#ifndef WINAPI_FAMILY_PARTITION
+#define WINAPI_PARTITION_DESKTOP 1
+#define WINAPI_FAMILY_PARTITION(x) x
+#endif
+
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define USE_CREATE_THREAD
+#endif
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+// _beginthreadex requires __stdcall
+#define THREADFN unsigned int __stdcall
+#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val)
+
+static INLINE int pthread_create(pthread_t *const thread, const void *attr,
+                                 unsigned int(__stdcall *start)(void *),
+                                 void *arg) {
+  (void)attr;
+#ifdef USE_CREATE_THREAD
+  *thread = CreateThread(NULL,          /* lpThreadAttributes */
+                         0,             /* dwStackSize */
+                         start, arg, 0, /* dwCreationFlags */
+                         NULL);         /* lpThreadId */
+#else
+  *thread = (pthread_t)_beginthreadex(NULL,          /* void *security */
+                                      0,             /* unsigned stack_size */
+                                      start, arg, 0, /* unsigned initflag */
+                                      NULL);         /* unsigned *thrdaddr */
+#endif
+  if (*thread == NULL) return 1;
+  SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL);
+  return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+  (void)value_ptr;
+  return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) !=
+              WAIT_OBJECT_0 ||
+          CloseHandle(thread) == 0);
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void *mutexattr) {
+  (void)mutexattr;
+  InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/);
+  return 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+  return TryEnterCriticalSection(mutex) ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+  EnterCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+  LeaveCriticalSection(mutex);
+  return 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+  DeleteCriticalSection(mutex);
+  return 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+  (void)condition;
+  return 0;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+                                    void *cond_attr) {
+  (void)cond_attr;
+  InitializeConditionVariable(condition);
+  return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+  WakeConditionVariable(condition);
+  return 0;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+  WakeAllConditionVariable(condition);
+  return 0;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  int ok;
+  ok = SleepConditionVariableCS(condition, mutex, INFINITE);
+  return !ok;
+}
+#elif defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>  // NOLINT
+
+#include <errno.h>        // NOLINT
+#include <stdlib.h>       // NOLINT
+#include <sys/builtin.h>  // NOLINT
+
+#define pthread_t TID
+#define pthread_mutex_t HMTX
+
+typedef struct {
+  HEV event_sem_;
+  HEV ack_sem_;
+  volatile unsigned wait_count_;
+} pthread_cond_t;
+
+//------------------------------------------------------------------------------
+// simplistic pthread emulation layer
+
+#define THREADFN void *
+#define THREAD_RETURN(val) (val)
+
+typedef struct {
+  void *(*start_)(void *);
+  void *arg_;
+} thread_arg;
+
+static void thread_start(void *arg) {
+  thread_arg targ = *(thread_arg *)arg;
+  free(arg);
+
+  targ.start_(targ.arg_);
+}
+
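+// Editor's note: OS/2's _beginthread() expects a void-returning entry point,
+// so the thread_start() trampoline above unpacks (and frees) the heap-
+// allocated thread_arg before invoking the pthread-style start routine.
+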
*(*start)(void *), void *arg) {
+  int tid;
+  thread_arg *targ = (thread_arg *)malloc(sizeof(*targ));
+  if (targ == NULL) return 1;
+
+  (void)attr;
+
+  targ->start_ = start;
+  targ->arg_ = arg;
+  tid = (pthread_t)_beginthread(thread_start, NULL, 1024 * 1024, targ);
+  if (tid == -1) {
+    free(targ);
+    return 1;
+  }
+
+  *thread = tid;
+  return 0;
+}
+
+static INLINE int pthread_join(pthread_t thread, void **value_ptr) {
+  (void)value_ptr;
+  return DosWaitThread(&thread, DCWW_WAIT) != 0;
+}
+
+// Mutex
+static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex,
+                                     void *mutexattr) {
+  (void)mutexattr;
+  return DosCreateMutexSem(NULL, mutex, 0, FALSE) != 0;
+}
+
+static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) {
+  return DosRequestMutexSem(*mutex, SEM_IMMEDIATE_RETURN) == 0 ? 0 : EBUSY;
+}
+
+static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) {
+  return DosRequestMutexSem(*mutex, SEM_INDEFINITE_WAIT) != 0;
+}
+
+static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) {
+  return DosReleaseMutexSem(*mutex) != 0;
+}
+
+static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) {
+  return DosCloseMutexSem(*mutex) != 0;
+}
+
+// Condition
+static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
+  int ok = 1;
+  ok &= DosCloseEventSem(condition->event_sem_) == 0;
+  ok &= DosCloseEventSem(condition->ack_sem_) == 0;
+  return !ok;
+}
+
+static INLINE int pthread_cond_init(pthread_cond_t *const condition,
+                                    void *cond_attr) {
+  int ok = 1;
+  (void)cond_attr;
+
+  ok &=
+      DosCreateEventSem(NULL, &condition->event_sem_, DCE_POSTONE, FALSE) == 0;
+  ok &= DosCreateEventSem(NULL, &condition->ack_sem_, DCE_POSTONE, FALSE) == 0;
+  if (!ok) {
+    pthread_cond_destroy(condition);
+    return 1;
+  }
+  condition->wait_count_ = 0;
+  return 0;
+}
+
+static INLINE int pthread_cond_signal(pthread_cond_t *const condition) {
+  int ok = 1;
+
+  if (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0)) {
+    ok &= DosPostEventSem(condition->event_sem_) == 0;
+    ok &= DosWaitEventSem(condition->ack_sem_, SEM_INDEFINITE_WAIT) == 0;
+  }
+
+  return !ok;
+}
+
+static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) {
+  int ok = 1;
+
+  while (!__atomic_cmpxchg32(&condition->wait_count_, 0, 0))
+    ok &= pthread_cond_signal(condition) == 0;
+
+  return !ok;
+}
+
+static INLINE int pthread_cond_wait(pthread_cond_t *const condition,
+                                    pthread_mutex_t *const mutex) {
+  int ok = 1;
+
+  __atomic_increment(&condition->wait_count_);
+
+  ok &= pthread_mutex_unlock(mutex) == 0;
+
+  ok &= DosWaitEventSem(condition->event_sem_, SEM_INDEFINITE_WAIT) == 0;
+
+  __atomic_decrement(&condition->wait_count_);
+
+  ok &= DosPostEventSem(condition->ack_sem_) == 0;
+
+  pthread_mutex_lock(mutex);
+
+  return !ok;
+}
+#else                 // _WIN32
+#include <pthread.h>  // NOLINT
+#define THREADFN void *
+#define THREAD_RETURN(val) val
+#endif
+
+#endif  // CONFIG_MULTITHREAD
+
+// State of the worker thread object
+typedef enum {
+  NOT_OK = 0,  // object is unusable
+  OK,          // ready to work
+  WORK         // busy finishing the current task
+} AVxWorkerStatus;
+
+// Function to be called by the worker thread. Takes two opaque pointers as
+// arguments (data1 and data2). Should return true on success and return false
+// in case of error.
+typedef int (*AVxWorkerHook)(void *, void *);
+
+// Platform-dependent implementation details for the worker.
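
The emulation layers above reduce every platform to the same pthread-style primitives; the AVxWorker declarations that follow wrap them in a launch/sync job model. For orientation, a minimal caller would drive a worker roughly as sketched below (an illustrative sketch only; double_it and worker_example are hypothetical names):

    static int double_it(void *data1, void *data2) {  // hypothetical hook
      *(int *)data2 = 2 * *(int *)data1;
      return 1;  // non-zero means success; 0 would set worker.had_error
    }

    static void worker_example(void) {
      const AVxWorkerInterface *iface = aom_get_worker_interface();
      AVxWorker worker;
      int in = 21, out = 0;
      iface->init(&worker);
      worker.hook = double_it;
      worker.data1 = &in;
      worker.data2 = &out;
      if (iface->reset(&worker)) {  // spawns the thread; returns false on error
        iface->launch(&worker);     // double_it() now runs on the worker thread
        if (!iface->sync(&worker)) {
          // the hook returned 0; worker.had_error is set
        }
      }
      iface->end(&worker);  // joins the thread and frees worker.impl_
    }
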
+typedef struct AVxWorkerImpl AVxWorkerImpl; + +// Synchronization object used to launch job in the worker thread +typedef struct { + AVxWorkerImpl *impl_; + AVxWorkerStatus status_; + // Thread name for the debugger. If not NULL, must point to a string that + // outlives the worker thread. For portability, use a name <= 15 characters + // long (not including the terminating NUL character). + const char *thread_name; + AVxWorkerHook hook; // hook to call + void *data1; // first argument passed to 'hook' + void *data2; // second argument passed to 'hook' + int had_error; // true if a call to 'hook' returned false +} AVxWorker; + +// The interface for all thread-worker related functions. All these functions +// must be implemented. +typedef struct { + // Must be called first, before any other method. + void (*init)(AVxWorker *const worker); + // Must be called to initialize the object and spawn the thread. Re-entrant. + // Will potentially launch the thread. Returns false in case of error. + int (*reset)(AVxWorker *const worker); + // Makes sure the previous work is finished. Returns true if worker->had_error + // was not set and no error condition was triggered by the working thread. + int (*sync)(AVxWorker *const worker); + // Triggers the thread to call hook() with data1 and data2 arguments. These + // hook/data1/data2 values can be changed at any time before calling this + // function, but not be changed afterward until the next call to Sync(). + void (*launch)(AVxWorker *const worker); + // This function is similar to launch() except that it calls the + // hook directly instead of using a thread. Convenient to bypass the thread + // mechanism while still using the AVxWorker structs. sync() must + // still be called afterward (for error reporting). + void (*execute)(AVxWorker *const worker); + // Kill the thread and terminate the object. To use the object again, one + // must call reset() again. + void (*end)(AVxWorker *const worker); +} AVxWorkerInterface; + +// Install a new set of threading functions, overriding the defaults. This +// should be done before any workers are started, i.e., before any encoding or +// decoding takes place. The contents of the interface struct are copied, it +// is safe to free the corresponding memory after this call. This function is +// not thread-safe. Return false in case of invalid pointer or methods. +int aom_set_worker_interface(const AVxWorkerInterface *const winterface); + +// Retrieve the currently set thread worker interface. +const AVxWorkerInterface *aom_get_worker_interface(void); + +//------------------------------------------------------------------------------ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_UTIL_AOM_THREAD_H_ diff --git a/libs/libaom/src/aom_util/aom_util.cmake b/libs/libaom/src/aom_util/aom_util.cmake new file mode 100644 index 000000000..1a1bfe1e6 --- /dev/null +++ b/libs/libaom/src/aom_util/aom_util.cmake @@ -0,0 +1,31 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
+#
+if(AOM_AOM_UTIL_AOM_UTIL_CMAKE_)
+  return()
+endif() # AOM_AOM_UTIL_AOM_UTIL_CMAKE_
+set(AOM_AOM_UTIL_AOM_UTIL_CMAKE_ 1)
+
+list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/aom_thread.c"
+            "${AOM_ROOT}/aom_util/aom_thread.h"
+            "${AOM_ROOT}/aom_util/endian_inl.h"
+            "${AOM_ROOT}/aom_util/debug_util.c"
+            "${AOM_ROOT}/aom_util/debug_util.h")
+
+# Creates the aom_util build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_aom_util_targets)
+  add_library(aom_util OBJECT ${AOM_UTIL_SOURCES})
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_util PARENT_SCOPE)
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_util>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_util>)
+  endif()
+endfunction()
diff --git a/libs/libaom/src/aom_util/debug_util.c b/libs/libaom/src/aom_util/debug_util.c
new file mode 100644
index 000000000..5762e693b
--- /dev/null
+++ b/libs/libaom/src/aom_util/debug_util.c
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include "aom_util/debug_util.h"
+
+static int frame_idx_w = 0;
+
+static int frame_idx_r = 0;
+
+void aom_bitstream_queue_set_frame_write(int frame_idx) {
+  frame_idx_w = frame_idx;
+}
+
+int aom_bitstream_queue_get_frame_writee(void) { return frame_idx_w; }
+
+void aom_bitstream_queue_set_frame_read(int frame_idx) {
+  frame_idx_r = frame_idx;
+}
+
+int aom_bitstream_queue_get_frame_read(void) { return frame_idx_r; }
+
+#if CONFIG_BITSTREAM_DEBUG
+#define QUEUE_MAX_SIZE 2000000
+static int result_queue[QUEUE_MAX_SIZE];
+static int nsymbs_queue[QUEUE_MAX_SIZE];
+static aom_cdf_prob cdf_queue[QUEUE_MAX_SIZE][16];
+
+static int queue_r = 0;
+static int queue_w = 0;
+static int queue_prev_w = -1;
+static int skip_r = 0;
+static int skip_w = 0;
+
+void bitstream_queue_set_skip_write(int skip) { skip_w = skip; }
+
+void bitstream_queue_set_skip_read(int skip) { skip_r = skip; }
+
+void bitstream_queue_record_write(void) { queue_prev_w = queue_w; }
+
+void bitstream_queue_reset_write(void) { queue_w = queue_prev_w; }
+
+int bitstream_queue_get_write(void) { return queue_w; }
+
+int bitstream_queue_get_read(void) { return queue_r; }
+
+void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs) {
+  if (!skip_r) {
+    if (queue_w == queue_r) {
+      printf("buffer underflow queue_w %d queue_r %d\n", queue_w, queue_r);
+      assert(0);
+    }
+    *result = result_queue[queue_r];
+    *nsymbs = nsymbs_queue[queue_r];
+    memcpy(cdf, cdf_queue[queue_r], *nsymbs * sizeof(*cdf));
+    queue_r = (queue_r + 1) % QUEUE_MAX_SIZE;
+  }
+}
+
+void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs) {
+  if (!skip_w) {
+    result_queue[queue_w] = result;
+    nsymbs_queue[queue_w] = nsymbs;
+    memcpy(cdf_queue[queue_w], cdf, nsymbs * sizeof(*cdf));
+    queue_w = (queue_w + 1) % QUEUE_MAX_SIZE;
+    if (queue_w == queue_r) {
+      printf("buffer overflow queue_w %d queue_r %d\n", queue_w, queue_r);
+      assert(0);
+    }
+  }
+}
+#endif  // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_MISMATCH_DEBUG
+static int frame_buf_idx_r = 0;
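
With CONFIG_BITSTREAM_DEBUG enabled, the queue above turns encoder/decoder divergence into an immediate assertion failure: the encoder pushes every coded symbol together with the CDF it used, and the in-process decoder pops each pair back and compares. A minimal sketch of that pairing (hypothetical locals symb, decoded, cdf, and nsymbs stand in for the bitwriter/bitreader state):

    // Encoder side, just before the symbol goes to the arithmetic coder:
    bitstream_queue_push(symb, cdf, nsymbs);

    // Decoder side, just after the arithmetic decoder returns a symbol:
    int ref_symb, ref_nsymbs;
    aom_cdf_prob ref_cdf[16];
    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
    assert(ref_nsymbs == nsymbs);
    assert(memcmp(ref_cdf, cdf, nsymbs * sizeof(*cdf)) == 0);
    assert(ref_symb == decoded);  // first mismatch pinpoints the faulty symbol
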
+static int frame_buf_idx_w = 0; +static int max_frame_buf_num = 5; +#define MAX_FRAME_STRIDE 1280 +#define MAX_FRAME_HEIGHT 720 +static uint16_t + frame_pre[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction only +static uint16_t + frame_tx[5][3][MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction + txfm +static int frame_stride = MAX_FRAME_STRIDE; +static int frame_height = MAX_FRAME_HEIGHT; +static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT; +void mismatch_move_frame_idx_w() { + frame_buf_idx_w = (frame_buf_idx_w + 1) % max_frame_buf_num; + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf overflow\n"); + assert(0); + } +} + +void mismatch_reset_frame(int num_planes) { + for (int plane = 0; plane < num_planes; ++plane) { + memset(frame_pre[frame_buf_idx_w][plane], 0, + sizeof(frame_pre[frame_buf_idx_w][plane][0]) * frame_size); + memset(frame_tx[frame_buf_idx_w][plane], 0, + sizeof(frame_tx[frame_buf_idx_w][plane][0]) * frame_size); + } +} + +void mismatch_move_frame_idx_r() { + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf underflow\n"); + assert(0); + } + frame_buf_idx_r = (frame_buf_idx_r + 1) % max_frame_buf_num; +} + +void mismatch_record_block_pre(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd) { + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + for (int r = 0; r < blk_h; ++r) { + for (int c = 0; c < blk_w; ++c) { + frame_pre[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + int ref_frame_idx = 3; + int ref_frame_offset = 4; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && + frame_offset == ref_frame_offset && ref_pixel_c >= pixel_c && + ref_pixel_c < pixel_c + blk_w && ref_pixel_r >= pixel_r && + ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_pre frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w " + "%d blk_h %d\n", + frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); + } +#endif +} +void mismatch_record_block_tx(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd) { + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + for (int r = 0; r < blk_h; ++r) { + for (int c = 0; c < blk_w; ++c) { + frame_tx[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? 
src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + int ref_frame_idx = 3; + int ref_frame_offset = 4; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && frame_offset == ref_frame_offset && + ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && + ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_tx frame_idx %d frame_offset %d plane %d pixel_c %d pixel_r %d blk_w " + "%d blk_h %d\n", + frame_idx_w, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); + } +#endif +} +void mismatch_check_block_pre(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd) { + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + for (int r = 0; r < blk_h; ++r) { + for (int c = 0; c < blk_w; ++c) { + if (frame_pre[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + printf( + "\ncheck_block_pre failed frame_idx %d frame_offset %d plane %d " + "pixel_c %d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (int rr = 0; rr < blk_h; ++rr) { + for (int cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_pre[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (int rr = 0; rr < blk_h; ++rr) { + for (int cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +void mismatch_check_block_tx(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd) { + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + for (int r = 0; r < blk_h; ++r) { + for (int c = 0; c < blk_w; ++c) { + if (frame_tx[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + printf( + "\ncheck_block_tx failed frame_idx %d frame_offset %d plane %d pixel_c " + "%d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, frame_offset, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (int rr = 0; rr < blk_h; ++rr) { + for (int cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_tx[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (int rr = 0; rr < blk_h; ++rr) { + for (int cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +#endif // CONFIG_MISMATCH_DEBUG diff --git a/libs/libaom/src/aom_util/debug_util.h b/libs/libaom/src/aom_util/debug_util.h new file mode 100644 index 000000000..23cad2a5b --- /dev/null +++ b/libs/libaom/src/aom_util/debug_util.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_UTIL_DEBUG_UTIL_H_ +#define AOM_AOM_UTIL_DEBUG_UTIL_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/prob.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void aom_bitstream_queue_set_frame_write(int frame_idx); +int aom_bitstream_queue_get_frame_writee(void); +void aom_bitstream_queue_set_frame_read(int frame_idx); +int aom_bitstream_queue_get_frame_read(void); + +#if CONFIG_BITSTREAM_DEBUG +/* This is a debug tool used to detect bitstream error. On encoder side, it + * pushes each bit and probability into a queue before the bit is written into + * the Arithmetic coder. On decoder side, whenever a bit is read out from the + * Arithmetic coder, it pops out the reference bit and probability from the + * queue as well. If the two results do not match, this debug tool will report + * an error. This tool can be used to pin down the bitstream error precisely. + * By combining gdb's backtrace method, we can detect which module causes the + * bitstream error. */ +int bitstream_queue_get_write(void); +int bitstream_queue_get_read(void); +void bitstream_queue_record_write(void); +void bitstream_queue_reset_write(void); +void bitstream_queue_pop(int *result, aom_cdf_prob *cdf, int *nsymbs); +void bitstream_queue_push(int result, const aom_cdf_prob *cdf, int nsymbs); +void bitstream_queue_set_skip_write(int skip); +void bitstream_queue_set_skip_read(int skip); +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_MISMATCH_DEBUG +void mismatch_move_frame_idx_w(); +void mismatch_move_frame_idx_r(); +void mismatch_reset_frame(int num_planes); +void mismatch_record_block_pre(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd); +void mismatch_record_block_tx(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd); +void mismatch_check_block_pre(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd); +void mismatch_check_block_tx(const uint8_t *src, int src_stride, + int frame_offset, int plane, int pixel_c, + int pixel_r, int blk_w, int blk_h, int highbd); +#endif // CONFIG_MISMATCH_DEBUG + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_UTIL_DEBUG_UTIL_H_ diff --git a/libs/libaom/src/aom_util/endian_inl.h b/libs/libaom/src/aom_util/endian_inl.h new file mode 100644 index 000000000..f536ec5b8 --- /dev/null +++ b/libs/libaom/src/aom_util/endian_inl.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+//
+// Endian related functions.
+
+#ifndef AOM_AOM_UTIL_ENDIAN_INL_H_
+#define AOM_AOM_UTIL_ENDIAN_INL_H_
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#if defined(__GNUC__)
+#define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+#define LOCAL_GCC_PREREQ(maj, min) (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+#define LOCAL_GCC_VERSION 0
+#define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+// handle clang compatibility
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) &&                   \
+    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#define HToBE64(x) (x)
+#define HToBE32(x) (x)
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#define HToBE64(X) BSwap64(X)
+#define HToBE32(X) BSwap32(X)
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+#define HAVE_BUILTIN_BSWAP32
+#endif
+
+#if LOCAL_GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+#define HAVE_BUILTIN_BSWAP64
+#endif
+
+#if HAVE_MIPS32 && defined(__mips__) && !defined(__mips64) && \
+    defined(__mips_isa_rev) && (__mips_isa_rev >= 2) && (__mips_isa_rev < 6)
+#define AOM_USE_MIPS32_R2
+#endif
+
+static INLINE uint16_t BSwap16(uint16_t x) {
+#if defined(HAVE_BUILTIN_BSWAP16)
+  return __builtin_bswap16(x);
+#elif defined(_MSC_VER)
+  return _byteswap_ushort(x);
+#else
+  // gcc will recognize a 'rorw $8, ...'
here: + return (x >> 8) | ((x & 0xff) << 8); +#endif // HAVE_BUILTIN_BSWAP16 +} + +static INLINE uint32_t BSwap32(uint32_t x) { +#if defined(AOM_USE_MIPS32_R2) + uint32_t ret; + __asm__ volatile( + "wsbh %[ret], %[x] \n\t" + "rotr %[ret], %[ret], 16 \n\t" + : [ret] "=r"(ret) + : [x] "r"(x)); + return ret; +#elif defined(HAVE_BUILTIN_BSWAP32) + return __builtin_bswap32(x); +#elif defined(__i386__) || defined(__x86_64__) + uint32_t swapped_bytes; + __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x)); + return swapped_bytes; +#elif defined(_MSC_VER) + return (uint32_t)_byteswap_ulong(x); +#else + return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24); +#endif // HAVE_BUILTIN_BSWAP32 +} + +static INLINE uint64_t BSwap64(uint64_t x) { +#if defined(HAVE_BUILTIN_BSWAP64) + return __builtin_bswap64(x); +#elif defined(__x86_64__) + uint64_t swapped_bytes; + __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x)); + return swapped_bytes; +#elif defined(_MSC_VER) + return (uint64_t)_byteswap_uint64(x); +#else // generic code for swapping 64-bit values (suggested by bdb@) + x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32); + x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16); + x = ((x & 0xff00ff00ff00ff00ull) >> 8) | ((x & 0x00ff00ff00ff00ffull) << 8); + return x; +#endif // HAVE_BUILTIN_BSWAP64 +} + +#endif // AOM_AOM_UTIL_ENDIAN_INL_H_ diff --git a/libs/libaom/src/apps/aomdec.c b/libs/libaom/src/apps/aomdec.c new file mode 100644 index 000000000..2591d41a6 --- /dev/null +++ b/libs/libaom/src/apps/aomdec.c @@ -0,0 +1,1024 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#if CONFIG_OS_SUPPORT
+#if HAVE_UNISTD_H
+#include <unistd.h>  // NOLINT
+#elif !defined(STDOUT_FILENO)
+#define STDOUT_FILENO 1
+#endif
+#endif
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem_ops.h"
+#include "common/args.h"
+#include "common/ivfdec.h"
+#include "common/md5_utils.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+
+#if CONFIG_WEBM_IO
+#include "common/webmdec.h"
+#endif
+
+#include "common/rawenc.h"
+#include "common/y4menc.h"
+
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
+static const char *exec_name;
+
+struct AvxDecInputContext {
+  struct AvxInputContext *aom_input_ctx;
+  struct ObuDecInputContext *obu_ctx;
+  struct WebmInputContext *webm_ctx;
+};
+
+static const arg_def_t help =
+    ARG_DEF(NULL, "help", 0, "Show usage options and exit");
+static const arg_def_t looparg =
+    ARG_DEF(NULL, "loops", 1, "Number of times to decode the file");
+static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use");
+static const arg_def_t use_yv12 =
+    ARG_DEF(NULL, "yv12", 0, "Output raw YV12 frames");
+static const arg_def_t use_i420 =
+    ARG_DEF(NULL, "i420", 0, "Output raw I420 frames");
+static const arg_def_t flipuvarg =
+    ARG_DEF(NULL, "flipuv", 0, "Flip the chroma planes in the output");
+static const arg_def_t rawvideo =
+    ARG_DEF(NULL, "rawvideo", 0, "Output raw YUV frames");
+static const arg_def_t noblitarg =
+    ARG_DEF(NULL, "noblit", 0, "Don't process the decoded frames");
+static const arg_def_t progressarg =
+    ARG_DEF(NULL, "progress", 0, "Show progress after each frame decodes");
+static const arg_def_t limitarg =
+    ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames");
+static const arg_def_t skiparg =
+    ARG_DEF(NULL, "skip", 1, "Skip the first n input frames");
+static const arg_def_t summaryarg =
+    ARG_DEF(NULL, "summary", 0, "Show timing summary");
+static const arg_def_t outputfile =
+    ARG_DEF("o", "output", 1, "Output file name pattern (see below)");
+static const arg_def_t threadsarg =
+    ARG_DEF("t", "threads", 1, "Max threads to use");
+static const arg_def_t verbosearg =
+    ARG_DEF("v", "verbose", 0, "Show version string");
+static const arg_def_t scalearg =
+    ARG_DEF("S", "scale", 0, "Scale output frames uniformly");
+static const arg_def_t continuearg =
+    ARG_DEF("k", "keep-going", 0, "(debug) Continue decoding after error");
+static const arg_def_t fb_arg =
+    ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use");
+static const arg_def_t md5arg =
+    ARG_DEF(NULL, "md5", 0, "Compute the MD5 sum of the decoded frame");
+static const arg_def_t framestatsarg =
+    ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)");
+static const arg_def_t outbitdeptharg =
+    ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
+static const arg_def_t isannexb =
+    ARG_DEF(NULL, "annexb", 0, "Bitstream is in Annex-B format");
+static const arg_def_t oppointarg = ARG_DEF(
+    NULL, "oppoint", 1, "Select an operating point of a scalable bitstream");
+static const arg_def_t outallarg = ARG_DEF(
+    NULL, "all-layers", 0, "Output all decoded frames of a scalable bitstream");
+static const arg_def_t skipfilmgrain =
+    ARG_DEF(NULL, "skip-film-grain", 0, "Skip film grain application");
+
+static const arg_def_t *all_args[] = {
+  &help,       &codecarg,   &use_yv12,    &use_i420,      &flipuvarg,
+  &rawvideo,   &noblitarg,
&progressarg, &limitarg,      &skiparg,
+  &summaryarg, &outputfile, &threadsarg,  &verbosearg,    &scalearg,
+  &fb_arg,     &md5arg,     &framestatsarg, &continuearg, &outbitdeptharg,
+  &isannexb,   &oppointarg, &outallarg,   &skipfilmgrain, NULL
+};
+
+#if CONFIG_LIBYUV
+static INLINE int libyuv_scale(aom_image_t *src, aom_image_t *dst,
+                               FilterModeEnum mode) {
+  if (src->fmt == AOM_IMG_FMT_I42016) {
+    assert(dst->fmt == AOM_IMG_FMT_I42016);
+    return I420Scale_16(
+        (uint16_t *)src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y] / 2,
+        (uint16_t *)src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U] / 2,
+        (uint16_t *)src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V] / 2,
+        src->d_w, src->d_h, (uint16_t *)dst->planes[AOM_PLANE_Y],
+        dst->stride[AOM_PLANE_Y] / 2, (uint16_t *)dst->planes[AOM_PLANE_U],
+        dst->stride[AOM_PLANE_U] / 2, (uint16_t *)dst->planes[AOM_PLANE_V],
+        dst->stride[AOM_PLANE_V] / 2, dst->d_w, dst->d_h, mode);
+  }
+  assert(src->fmt == AOM_IMG_FMT_I420);
+  assert(dst->fmt == AOM_IMG_FMT_I420);
+  return I420Scale(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
+                   src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U],
+                   src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V], src->d_w,
+                   src->d_h, dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y],
+                   dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U],
+                   dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V], dst->d_w,
+                   dst->d_h, mode);
+}
+#endif
+
+static void show_help(FILE *fout, int shorthelp) {
+  fprintf(fout, "Usage: %s <options> filename\n\n", exec_name);
+
+  if (shorthelp) {
+    fprintf(fout, "Use --help to see the full list of options.\n");
+    return;
+  }
+
+  fprintf(fout, "Options:\n");
+  arg_show_usage(fout, all_args);
+  fprintf(fout,
+          "\nOutput File Patterns:\n\n"
+          "  The -o argument specifies the name of the file(s) to "
+          "write to. If the\n  argument does not include any escape "
+          "characters, the output will be\n  written to a single file. "
+          "Otherwise, the filename will be calculated by\n  expanding "
+          "the following escape characters:\n");
+  fprintf(fout,
+          "\n\t%%w   - Frame width"
+          "\n\t%%h   - Frame height"
+          "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)"
+          "\n\n  Pattern arguments are only supported in conjunction "
+          "with the --yv12 and\n  --i420 options.
If the -o option is " + "not specified, the output will be\n directed to stdout.\n"); + fprintf(fout, "\nIncluded decoders:\n\n"); + + for (int i = 0; i < get_aom_decoder_count(); ++i) { + const AvxInterface *const decoder = get_aom_decoder_by_index(i); + fprintf(fout, " %-6s - %s\n", decoder->name, + aom_codec_iface_name(decoder->codec_interface())); + } +} + +void usage_exit(void) { + show_help(stderr, 1); + exit(EXIT_FAILURE); +} + +static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size) { + char raw_hdr[RAW_FRAME_HDR_SZ]; + size_t frame_size = 0; + + if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) { + if (!feof(infile)) warn("Failed to read RAW frame size\n"); + } else { + const size_t kCorruptFrameThreshold = 256 * 1024 * 1024; + const size_t kFrameTooSmallThreshold = 256 * 1024; + frame_size = mem_get_le32(raw_hdr); + + if (frame_size > kCorruptFrameThreshold) { + warn("Read invalid frame size (%u)\n", (unsigned int)frame_size); + frame_size = 0; + } + + if (frame_size < kFrameTooSmallThreshold) { + warn("Warning: Read invalid frame size (%u) - not a raw file?\n", + (unsigned int)frame_size); + } + + if (frame_size > *buffer_size) { + uint8_t *new_buf = realloc(*buffer, 2 * frame_size); + if (new_buf) { + *buffer = new_buf; + *buffer_size = 2 * frame_size; + } else { + warn("Failed to allocate compressed data buffer\n"); + frame_size = 0; + } + } + } + + if (!feof(infile)) { + if (fread(*buffer, 1, frame_size, infile) != frame_size) { + warn("Failed to read full frame\n"); + return 1; + } + *bytes_read = frame_size; + } + + return 0; +} + +static int read_frame(struct AvxDecInputContext *input, uint8_t **buf, + size_t *bytes_in_buffer, size_t *buffer_size) { + switch (input->aom_input_ctx->file_type) { +#if CONFIG_WEBM_IO + case FILE_TYPE_WEBM: + return webm_read_frame(input->webm_ctx, buf, bytes_in_buffer, + buffer_size); +#endif + case FILE_TYPE_RAW: + return raw_read_frame(input->aom_input_ctx->file, buf, bytes_in_buffer, + buffer_size); + case FILE_TYPE_IVF: + return ivf_read_frame(input->aom_input_ctx->file, buf, bytes_in_buffer, + buffer_size, NULL); + case FILE_TYPE_OBU: + return obudec_read_temporal_unit(input->obu_ctx, buf, bytes_in_buffer, + buffer_size); + default: return 1; + } +} + +static int file_is_raw(struct AvxInputContext *input) { + uint8_t buf[32]; + int is_raw = 0; + aom_codec_stream_info_t si; + memset(&si, 0, sizeof(si)); + + if (fread(buf, 1, 32, input->file) == 32) { + int i; + + if (mem_get_le32(buf) < 256 * 1024 * 1024) { + for (i = 0; i < get_aom_decoder_count(); ++i) { + const AvxInterface *const decoder = get_aom_decoder_by_index(i); + if (!aom_codec_peek_stream_info(decoder->codec_interface(), buf + 4, + 32 - 4, &si)) { + is_raw = 1; + input->fourcc = decoder->fourcc; + input->width = si.w; + input->height = si.h; + input->framerate.numerator = 30; + input->framerate.denominator = 1; + break; + } + } + } + } + + rewind(input->file); + return is_raw; +} + +static void show_progress(int frame_in, int frame_out, uint64_t dx_time) { + fprintf(stderr, + "%d decoded frames/%d showed frames in %" PRId64 " us (%.2f fps)\r", + frame_in, frame_out, dx_time, + (double)frame_out * 1000000.0 / (double)dx_time); +} + +struct ExternalFrameBuffer { + uint8_t *data; + size_t size; + int in_use; +}; + +struct ExternalFrameBufferList { + int num_external_frame_buffers; + struct ExternalFrameBuffer *ext_fb; +}; + +// Callback used by libaom to request an external frame buffer. 
|cb_priv| +// Application private data passed into the set function. |min_size| is the +// minimum size in bytes needed to decode the next frame. |fb| pointer to the +// frame buffer. +static int get_av1_frame_buffer(void *cb_priv, size_t min_size, + aom_codec_frame_buffer_t *fb) { + int i; + struct ExternalFrameBufferList *const ext_fb_list = + (struct ExternalFrameBufferList *)cb_priv; + if (ext_fb_list == NULL) return -1; + + // Find a free frame buffer. + for (i = 0; i < ext_fb_list->num_external_frame_buffers; ++i) { + if (!ext_fb_list->ext_fb[i].in_use) break; + } + + if (i == ext_fb_list->num_external_frame_buffers) return -1; + + if (ext_fb_list->ext_fb[i].size < min_size) { + free(ext_fb_list->ext_fb[i].data); + ext_fb_list->ext_fb[i].data = (uint8_t *)calloc(min_size, sizeof(uint8_t)); + if (!ext_fb_list->ext_fb[i].data) return -1; + + ext_fb_list->ext_fb[i].size = min_size; + } + + fb->data = ext_fb_list->ext_fb[i].data; + fb->size = ext_fb_list->ext_fb[i].size; + ext_fb_list->ext_fb[i].in_use = 1; + + // Set the frame buffer's private data to point at the external frame buffer. + fb->priv = &ext_fb_list->ext_fb[i]; + return 0; +} + +// Callback used by libaom when there are no references to the frame buffer. +// |cb_priv| user private data passed into the set function. |fb| pointer +// to the frame buffer. +static int release_av1_frame_buffer(void *cb_priv, + aom_codec_frame_buffer_t *fb) { + struct ExternalFrameBuffer *const ext_fb = + (struct ExternalFrameBuffer *)fb->priv; + (void)cb_priv; + ext_fb->in_use = 0; + return 0; +} + +static void generate_filename(const char *pattern, char *out, size_t q_len, + unsigned int d_w, unsigned int d_h, + unsigned int frame_in) { + const char *p = pattern; + char *q = out; + + do { + char *next_pat = strchr(p, '%'); + + if (p == next_pat) { + size_t pat_len; + + /* parse the pattern */ + q[q_len - 1] = '\0'; + switch (p[1]) { + case 'w': snprintf(q, q_len - 1, "%d", d_w); break; + case 'h': snprintf(q, q_len - 1, "%d", d_h); break; + case '1': snprintf(q, q_len - 1, "%d", frame_in); break; + case '2': snprintf(q, q_len - 1, "%02d", frame_in); break; + case '3': snprintf(q, q_len - 1, "%03d", frame_in); break; + case '4': snprintf(q, q_len - 1, "%04d", frame_in); break; + case '5': snprintf(q, q_len - 1, "%05d", frame_in); break; + case '6': snprintf(q, q_len - 1, "%06d", frame_in); break; + case '7': snprintf(q, q_len - 1, "%07d", frame_in); break; + case '8': snprintf(q, q_len - 1, "%08d", frame_in); break; + case '9': snprintf(q, q_len - 1, "%09d", frame_in); break; + default: die("Unrecognized pattern %%%c\n", p[1]); break; + } + + pat_len = strlen(q); + if (pat_len >= q_len - 1) die("Output filename too long.\n"); + q += pat_len; + p += 2; + q_len -= pat_len; + } else { + size_t copy_len; + + /* copy the next segment */ + if (!next_pat) + copy_len = strlen(p); + else + copy_len = next_pat - p; + + if (copy_len >= q_len - 1) die("Output filename too long.\n"); + + memcpy(q, p, copy_len); + q[copy_len] = '\0'; + q += copy_len; + p += copy_len; + q_len -= copy_len; + } + } while (*p); +} + +static int is_single_file(const char *outfile_pattern) { + const char *p = outfile_pattern; + + do { + p = strchr(p, '%'); + if (p && p[1] >= '1' && p[1] <= '9') + return 0; // pattern contains sequence number, so it's not unique + if (p) p++; + } while (p); + + return 1; +} + +static void print_md5(unsigned char digest[16], const char *filename) { + int i; + + for (i = 0; i < 16; ++i) printf("%02x", digest[i]); + printf(" %s\n", filename); +} + 
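
get_av1_frame_buffer() and release_av1_frame_buffer() above implement the external frame-buffer callback protocol from aom/aom_frame_buffer.h. A condensed sketch of how they get wired up, mirroring what main_loop() below does when --frame-buffers=N is given (the count of 10 is an arbitrary example, and the list must outlive all decode calls):

    static struct ExternalFrameBufferList fb_list = { 0, NULL };

    static int use_external_buffers(aom_codec_ctx_t *decoder) {
      fb_list.num_external_frame_buffers = 10;
      fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc(
          10, sizeof(*fb_list.ext_fb));
      if (!fb_list.ext_fb) return -1;
      // From here on, the decoder requests frame memory via
      // get_av1_frame_buffer() and returns it via release_av1_frame_buffer().
      return aom_codec_set_frame_buffer_functions(
                 decoder, get_av1_frame_buffer, release_av1_frame_buffer,
                 &fb_list) != AOM_CODEC_OK;
    }
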
+static FILE *open_outfile(const char *name) { + if (strcmp("-", name) == 0) { + set_binary_mode(stdout); + return stdout; + } else { + FILE *file = fopen(name, "wb"); + if (!file) fatal("Failed to open output file '%s'", name); + return file; + } +} + +static int main_loop(int argc, const char **argv_) { + aom_codec_ctx_t decoder; + char *fn = NULL; + int i; + int ret = EXIT_FAILURE; + uint8_t *buf = NULL; + size_t bytes_in_buffer = 0, buffer_size = 0; + FILE *infile; + int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0; + int do_md5 = 0, progress = 0; + int stop_after = 0, summary = 0, quiet = 1; + int arg_skip = 0; + int keep_going = 0; + const AvxInterface *interface = NULL; + const AvxInterface *fourcc_interface = NULL; + uint64_t dx_time = 0; + struct arg arg; + char **argv, **argi, **argj; + + int single_file; + int use_y4m = 1; + int opt_yv12 = 0; + int opt_i420 = 0; + int opt_raw = 0; + aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING }; + unsigned int fixed_output_bit_depth = 0; + unsigned int is_annexb = 0; + int frames_corrupted = 0; + int dec_flags = 0; + int do_scale = 0; + int operating_point = 0; + int output_all_layers = 0; + int skip_film_grain = 0; + aom_image_t *scaled_img = NULL; + aom_image_t *img_shifted = NULL; + int frame_avail, got_data, flush_decoder = 0; + int num_external_frame_buffers = 0; + struct ExternalFrameBufferList ext_fb_list = { 0, NULL }; + + const char *outfile_pattern = NULL; + char outfile_name[PATH_MAX] = { 0 }; + FILE *outfile = NULL; + + FILE *framestats_file = NULL; + + MD5Context md5_ctx; + unsigned char md5_digest[16]; + + struct AvxDecInputContext input = { NULL, NULL, NULL }; + struct AvxInputContext aom_input_ctx; + memset(&aom_input_ctx, 0, sizeof(aom_input_ctx)); +#if CONFIG_WEBM_IO + struct WebmInputContext webm_ctx; + memset(&webm_ctx, 0, sizeof(webm_ctx)); + input.webm_ctx = &webm_ctx; +#endif + struct ObuDecInputContext obu_ctx = { NULL, NULL, 0, 0, 0 }; + int is_ivf = 0; + + obu_ctx.avx_ctx = &aom_input_ctx; + input.obu_ctx = &obu_ctx; + input.aom_input_ctx = &aom_input_ctx; + + /* Parse command line */ + exec_name = argv_[0]; + argv = argv_dup(argc - 1, argv_ + 1); + + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + memset(&arg, 0, sizeof(arg)); + arg.argv_step = 1; + + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { + interface = get_aom_decoder_by_name(arg.val); + if (!interface) + die("Error: Unrecognized argument (%s) to --codec\n", arg.val); + } else if (arg_match(&arg, &looparg, argi)) { + // no-op + } else if (arg_match(&arg, &outputfile, argi)) { + outfile_pattern = arg.val; + } else if (arg_match(&arg, &use_yv12, argi)) { + use_y4m = 0; + flipuv = 1; + opt_yv12 = 1; + opt_i420 = 0; + opt_raw = 0; + } else if (arg_match(&arg, &use_i420, argi)) { + use_y4m = 0; + flipuv = 0; + opt_yv12 = 0; + opt_i420 = 1; + opt_raw = 0; + } else if (arg_match(&arg, &rawvideo, argi)) { + use_y4m = 0; + opt_yv12 = 0; + opt_i420 = 0; + opt_raw = 1; + } else if (arg_match(&arg, &flipuvarg, argi)) { + flipuv = 1; + } else if (arg_match(&arg, &noblitarg, argi)) { + noblit = 1; + } else if (arg_match(&arg, &progressarg, argi)) { + progress = 1; + } else if (arg_match(&arg, &limitarg, argi)) { + stop_after = arg_parse_uint(&arg); + } else if (arg_match(&arg, &skiparg, argi)) { + arg_skip = arg_parse_uint(&arg); + } else if (arg_match(&arg, &md5arg, argi)) { + do_md5 = 1; + } else if (arg_match(&arg, &framestatsarg, argi)) { 
+ framestats_file = fopen(arg.val, "w"); + if (!framestats_file) { + die("Error: Could not open --framestats file (%s) for writing.\n", + arg.val); + } + } else if (arg_match(&arg, &summaryarg, argi)) { + summary = 1; + } else if (arg_match(&arg, &threadsarg, argi)) { + cfg.threads = arg_parse_uint(&arg); +#if !CONFIG_MULTITHREAD + if (cfg.threads > 1) { + die("Error: --threads=%d is not supported when CONFIG_MULTITHREAD = " + "0.\n", + cfg.threads); + } +#endif + } else if (arg_match(&arg, &verbosearg, argi)) { + quiet = 0; + } else if (arg_match(&arg, &scalearg, argi)) { + do_scale = 1; + } else if (arg_match(&arg, &fb_arg, argi)) { + num_external_frame_buffers = arg_parse_uint(&arg); + } else if (arg_match(&arg, &continuearg, argi)) { + keep_going = 1; + } else if (arg_match(&arg, &outbitdeptharg, argi)) { + fixed_output_bit_depth = arg_parse_uint(&arg); + } else if (arg_match(&arg, &isannexb, argi)) { + is_annexb = 1; + input.obu_ctx->is_annexb = 1; + } else if (arg_match(&arg, &oppointarg, argi)) { + operating_point = arg_parse_int(&arg); + } else if (arg_match(&arg, &outallarg, argi)) { + output_all_layers = 1; + } else if (arg_match(&arg, &skipfilmgrain, argi)) { + skip_film_grain = 1; + } else { + argj++; + } + } + + /* Check for unrecognized options */ + for (argi = argv; *argi; argi++) + if (argi[0][0] == '-' && strlen(argi[0]) > 1) + die("Error: Unrecognized option %s\n", *argi); + + /* Handle non-option arguments */ + fn = argv[0]; + + if (!fn) { + free(argv); + fprintf(stderr, "No input file specified!\n"); + usage_exit(); + } + /* Open file */ + infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin); + + if (!infile) { + fatal("Failed to open input file '%s'", strcmp(fn, "-") ? fn : "stdin"); + } +#if CONFIG_OS_SUPPORT + /* Make sure we don't dump to the terminal, unless forced to with -o - */ + if (!outfile_pattern && isatty(STDOUT_FILENO) && !do_md5 && !noblit) { + fprintf(stderr, + "Not dumping raw video to your terminal. Use '-o -' to " + "override.\n"); + return EXIT_FAILURE; + } +#endif + input.aom_input_ctx->filename = fn; + input.aom_input_ctx->file = infile; + if (file_is_ivf(input.aom_input_ctx)) { + input.aom_input_ctx->file_type = FILE_TYPE_IVF; + is_ivf = 1; + } +#if CONFIG_WEBM_IO + else if (file_is_webm(input.webm_ctx, input.aom_input_ctx)) + input.aom_input_ctx->file_type = FILE_TYPE_WEBM; +#endif + else if (file_is_obu(&obu_ctx)) + input.aom_input_ctx->file_type = FILE_TYPE_OBU; + else if (file_is_raw(input.aom_input_ctx)) + input.aom_input_ctx->file_type = FILE_TYPE_RAW; + else { + fprintf(stderr, "Unrecognized input file type.\n"); +#if !CONFIG_WEBM_IO + fprintf(stderr, "aomdec was built without WebM container support.\n"); +#endif + free(argv); + return EXIT_FAILURE; + } + + outfile_pattern = outfile_pattern ? 
outfile_pattern : "-"; + single_file = is_single_file(outfile_pattern); + + if (!noblit && single_file) { + generate_filename(outfile_pattern, outfile_name, PATH_MAX, + aom_input_ctx.width, aom_input_ctx.height, 0); + if (do_md5) + MD5Init(&md5_ctx); + else + outfile = open_outfile(outfile_name); + } + + if (use_y4m && !noblit) { + if (!single_file) { + fprintf(stderr, + "YUV4MPEG2 not supported with output patterns," + " try --i420 or --yv12 or --rawvideo.\n"); + return EXIT_FAILURE; + } + +#if CONFIG_WEBM_IO + if (aom_input_ctx.file_type == FILE_TYPE_WEBM) { + if (webm_guess_framerate(input.webm_ctx, input.aom_input_ctx)) { + fprintf(stderr, + "Failed to guess framerate -- error parsing " + "webm file?\n"); + return EXIT_FAILURE; + } + } +#endif + } + + fourcc_interface = get_aom_decoder_by_fourcc(aom_input_ctx.fourcc); + + if (is_ivf && !fourcc_interface) + fatal("Unsupported fourcc: %x\n", aom_input_ctx.fourcc); + + if (interface && fourcc_interface && interface != fourcc_interface) + warn("Header indicates codec: %s\n", fourcc_interface->name); + else + interface = fourcc_interface; + + if (!interface) interface = get_aom_decoder_by_index(0); + + dec_flags = 0; + if (aom_codec_dec_init(&decoder, interface->codec_interface(), &cfg, + dec_flags)) { + fprintf(stderr, "Failed to initialize decoder: %s\n", + aom_codec_error(&decoder)); + goto fail2; + } + + if (!quiet) fprintf(stderr, "%s\n", decoder.name); + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_IS_ANNEXB, is_annexb)) { + fprintf(stderr, "Failed to set is_annexb: %s\n", aom_codec_error(&decoder)); + goto fail; + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_OPERATING_POINT, + operating_point)) { + fprintf(stderr, "Failed to set operating_point: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_OUTPUT_ALL_LAYERS, + output_all_layers)) { + fprintf(stderr, "Failed to set output_all_layers: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_SET_SKIP_FILM_GRAIN, + skip_film_grain)) { + fprintf(stderr, "Failed to set skip_film_grain: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + + if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip); + while (arg_skip) { + if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; + arg_skip--; + } + + if (num_external_frame_buffers > 0) { + ext_fb_list.num_external_frame_buffers = num_external_frame_buffers; + ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc( + num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb)); + if (aom_codec_set_frame_buffer_functions(&decoder, get_av1_frame_buffer, + release_av1_frame_buffer, + &ext_fb_list)) { + fprintf(stderr, "Failed to configure external frame buffers: %s\n", + aom_codec_error(&decoder)); + goto fail; + } + } + + frame_avail = 1; + got_data = 0; + + if (framestats_file) fprintf(framestats_file, "bytes,qp\r\n"); + + /* Decode file */ + while (frame_avail || got_data) { + aom_codec_iter_t iter = NULL; + aom_image_t *img; + struct aom_usec_timer timer; + int corrupted = 0; + + frame_avail = 0; + if (!stop_after || frame_in < stop_after) { + if (!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) { + frame_avail = 1; + frame_in++; + + aom_usec_timer_start(&timer); + + if (aom_codec_decode(&decoder, buf, bytes_in_buffer, NULL)) { + const char *detail = aom_codec_error_detail(&decoder); + warn("Failed to decode frame %d: %s", frame_in, + aom_codec_error(&decoder)); + + if 
(detail) warn("Additional information: %s", detail); + if (!keep_going) goto fail; + } + + if (framestats_file) { + int qp; + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_LAST_QUANTIZER, + &qp)) { + warn("Failed AOMD_GET_LAST_QUANTIZER: %s", + aom_codec_error(&decoder)); + if (!keep_going) goto fail; + } + fprintf(framestats_file, "%d,%d\r\n", (int)bytes_in_buffer, qp); + } + + aom_usec_timer_mark(&timer); + dx_time += aom_usec_timer_elapsed(&timer); + } else { + flush_decoder = 1; + } + } else { + flush_decoder = 1; + } + + aom_usec_timer_start(&timer); + + if (flush_decoder) { + // Flush the decoder. + if (aom_codec_decode(&decoder, NULL, 0, NULL)) { + warn("Failed to flush decoder: %s", aom_codec_error(&decoder)); + } + } + + aom_usec_timer_mark(&timer); + dx_time += aom_usec_timer_elapsed(&timer); + + got_data = 0; + while ((img = aom_codec_get_frame(&decoder, &iter))) { + ++frame_out; + got_data = 1; + + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_FRAME_CORRUPTED, + &corrupted)) { + warn("Failed AOM_GET_FRAME_CORRUPTED: %s", aom_codec_error(&decoder)); + if (!keep_going) goto fail; + } + frames_corrupted += corrupted; + + if (progress) show_progress(frame_in, frame_out, dx_time); + + if (!noblit) { + const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V }; + const int PLANES_YVU[] = { AOM_PLANE_Y, AOM_PLANE_V, AOM_PLANE_U }; + const int *planes = flipuv ? PLANES_YVU : PLANES_YUV; + + if (do_scale) { + if (frame_out == 1) { + // If the output frames are to be scaled to a fixed display size + // then use the width and height specified in the container. If + // either of these is set to 0, use the display size set in the + // first frame header. If that is unavailable, use the raw decoded + // size of the first decoded frame. + int render_width = aom_input_ctx.width; + int render_height = aom_input_ctx.height; + if (!render_width || !render_height) { + int render_size[2]; + if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AV1D_GET_DISPLAY_SIZE, + render_size)) { + // As last resort use size of first frame as display size. + render_width = img->d_w; + render_height = img->d_h; + } else { + render_width = render_size[0]; + render_height = render_size[1]; + } + } + scaled_img = + aom_img_alloc(NULL, img->fmt, render_width, render_height, 16); + scaled_img->bit_depth = img->bit_depth; + scaled_img->monochrome = img->monochrome; + scaled_img->csp = img->csp; + } + + if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) { +#if CONFIG_LIBYUV + libyuv_scale(img, scaled_img, kFilterBox); + img = scaled_img; +#else + fprintf( + stderr, + "Failed to scale output frame: %s.\n" + "libyuv is required for scaling but is currently disabled.\n" + "Be sure to specify -DCONFIG_LIBYUV=1 when running cmake.\n", + aom_codec_error(&decoder)); + goto fail; +#endif + } + } + // Default to codec bit depth if output bit depth not set + unsigned int output_bit_depth; + if (!fixed_output_bit_depth && single_file) { + output_bit_depth = img->bit_depth; + } else { + output_bit_depth = fixed_output_bit_depth; + } + // Shift up or down if necessary + if (output_bit_depth != 0) + aom_shift_img(output_bit_depth, &img, &img_shifted); + + aom_input_ctx.width = img->d_w; + aom_input_ctx.height = img->d_h; + + int num_planes = (opt_raw && img->monochrome) ? 
1 : 3; + if (single_file) { + if (use_y4m) { + char y4m_buf[Y4M_BUFFER_SIZE] = { 0 }; + size_t len = 0; + if (frame_out == 1) { + // Y4M file header + len = y4m_write_file_header( + y4m_buf, sizeof(y4m_buf), aom_input_ctx.width, + aom_input_ctx.height, &aom_input_ctx.framerate, + img->monochrome, img->csp, img->fmt, img->bit_depth); + if (img->csp == AOM_CSP_COLOCATED) { + fprintf(stderr, + "Warning: Y4M lacks a colorspace for colocated " + "chroma. Using a placeholder.\n"); + } + if (do_md5) { + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); + } else { + fputs(y4m_buf, outfile); + } + } + + // Y4M frame header + len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf)); + if (do_md5) { + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); + y4m_update_image_md5(img, planes, &md5_ctx); + } else { + fputs(y4m_buf, outfile); + y4m_write_image_file(img, planes, outfile); + } + } else { + if (frame_out == 1) { + // Check if --yv12 or --i420 options are consistent with the + // bit-stream decoded + if (opt_i420) { + if (img->fmt != AOM_IMG_FMT_I420 && + img->fmt != AOM_IMG_FMT_I42016) { + fprintf(stderr, + "Cannot produce i420 output for bit-stream.\n"); + goto fail; + } + } + if (opt_yv12) { + if ((img->fmt != AOM_IMG_FMT_I420 && + img->fmt != AOM_IMG_FMT_YV12) || + img->bit_depth != 8) { + fprintf(stderr, + "Cannot produce yv12 output for bit-stream.\n"); + goto fail; + } + } + } + if (do_md5) { + raw_update_image_md5(img, planes, num_planes, &md5_ctx); + } else { + raw_write_image_file(img, planes, num_planes, outfile); + } + } + } else { + generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w, + img->d_h, frame_in); + if (do_md5) { + MD5Init(&md5_ctx); + if (use_y4m) { + y4m_update_image_md5(img, planes, &md5_ctx); + } else { + raw_update_image_md5(img, planes, num_planes, &md5_ctx); + } + MD5Final(md5_digest, &md5_ctx); + print_md5(md5_digest, outfile_name); + } else { + outfile = open_outfile(outfile_name); + if (use_y4m) { + y4m_write_image_file(img, planes, outfile); + } else { + raw_write_image_file(img, planes, num_planes, outfile); + } + fclose(outfile); + } + } + } + } + } + + if (summary || progress) { + show_progress(frame_in, frame_out, dx_time); + fprintf(stderr, "\n"); + } + + if (frames_corrupted) { + fprintf(stderr, "WARNING: %d frames corrupted.\n", frames_corrupted); + } else { + ret = EXIT_SUCCESS; + } + +fail: + + if (aom_codec_destroy(&decoder)) { + fprintf(stderr, "Failed to destroy decoder: %s\n", + aom_codec_error(&decoder)); + } + +fail2: + + if (!noblit && single_file) { + if (do_md5) { + MD5Final(md5_digest, &md5_ctx); + print_md5(md5_digest, outfile_name); + } else { + fclose(outfile); + } + } + +#if CONFIG_WEBM_IO + if (input.aom_input_ctx->file_type == FILE_TYPE_WEBM) + webm_free(input.webm_ctx); +#endif + if (input.aom_input_ctx->file_type == FILE_TYPE_OBU) + obudec_free(input.obu_ctx); + + if (input.aom_input_ctx->file_type != FILE_TYPE_WEBM) free(buf); + + if (scaled_img) aom_img_free(scaled_img); + if (img_shifted) aom_img_free(img_shifted); + + for (i = 0; i < ext_fb_list.num_external_frame_buffers; ++i) { + free(ext_fb_list.ext_fb[i].data); + } + free(ext_fb_list.ext_fb); + + fclose(infile); + if (framestats_file) fclose(framestats_file); + + free(argv); + + return ret; +} + +int main(int argc, const char **argv_) { + unsigned int loops = 1, i; + char **argv, **argi, **argj; + struct arg arg; + int error = 0; + + argv = argv_dup(argc - 1, argv_ + 1); + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + 
memset(&arg, 0, sizeof(arg));
+    arg.argv_step = 1;
+
+    if (arg_match(&arg, &looparg, argi)) {
+      loops = arg_parse_uint(&arg);
+      break;
+    }
+  }
+  free(argv);
+  for (i = 0; !error && i < loops; i++) error = main_loop(argc, argv_);
+  return error;
+}
diff --git a/libs/libaom/src/apps/aomenc.c b/libs/libaom/src/apps/aomenc.c
new file mode 100644
index 000000000..bb57726b4
--- /dev/null
+++ b/libs/libaom/src/apps/aomenc.c
@@ -0,0 +1,2752 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "apps/aomenc.h"
+
+#include "config/aom_config.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if CONFIG_AV1_DECODER
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#endif
+
+#include "aom/aom_encoder.h"
+#include "aom/aom_integer.h"
+#include "aom/aomcx.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem_ops.h"
+#include "common/args.h"
+#include "common/ivfenc.h"
+#include "common/tools_common.h"
+#include "common/warnings.h"
+
+#if CONFIG_WEBM_IO
+#include "common/webmenc.h"
+#endif
+
+#include "common/y4minput.h"
+#include "examples/encoder_util.h"
+#include "stats/aomstats.h"
+#include "stats/rate_hist.h"
+
+#if CONFIG_LIBYUV
+#include "third_party/libyuv/include/libyuv/scale.h"
+#endif
+
+/* Swallow warnings about unused results of fread/fwrite */
+static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) {
+  return fread(ptr, size, nmemb, stream);
+}
+#define fread wrap_fread
+
+static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
+                          FILE *stream) {
+  return fwrite(ptr, size, nmemb, stream);
+}
+#define fwrite wrap_fwrite
+
+static const char *exec_name;
+
+static void warn_or_exit_on_errorv(aom_codec_ctx_t *ctx, int fatal,
+                                   const char *s, va_list ap) {
+  if (ctx->err) {
+    const char *detail = aom_codec_error_detail(ctx);
+
+    vfprintf(stderr, s, ap);
+    fprintf(stderr, ": %s\n", aom_codec_error(ctx));
+
+    if (detail) fprintf(stderr, "    %s\n", detail);
+
+    if (fatal) exit(EXIT_FAILURE);
+  }
+}
+
+static void ctx_exit_on_error(aom_codec_ctx_t *ctx, const char *s, ...) {
+  va_list ap;
+
+  va_start(ap, s);
+  warn_or_exit_on_errorv(ctx, 1, s, ap);
+  va_end(ap);
+}
+
+static void warn_or_exit_on_error(aom_codec_ctx_t *ctx, int fatal,
+                                  const char *s, ...)
{ + va_list ap; + + va_start(ap, s); + warn_or_exit_on_errorv(ctx, fatal, s, ap); + va_end(ap); +} + +static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) { + FILE *f = input_ctx->file; + y4m_input *y4m = &input_ctx->y4m; + int shortread = 0; + + if (input_ctx->file_type == FILE_TYPE_Y4M) { + if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; + } else { + shortread = read_yuv_frame(input_ctx, img); + } + + return !shortread; +} + +static int file_is_y4m(const char detect[4]) { + if (memcmp(detect, "YUV4", 4) == 0) { + return 1; + } + return 0; +} + +static int fourcc_is_ivf(const char detect[4]) { + if (memcmp(detect, "DKIF", 4) == 0) { + return 1; + } + return 0; +} + +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); +static const arg_def_t debugmode = + ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"); +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output filename"); +static const arg_def_t use_yv12 = + ARG_DEF(NULL, "yv12", 0, "Input file is YV12 "); +static const arg_def_t use_i420 = + ARG_DEF(NULL, "i420", 0, "Input file is I420 (default)"); +static const arg_def_t use_i422 = + ARG_DEF(NULL, "i422", 0, "Input file is I422"); +static const arg_def_t use_i444 = + ARG_DEF(NULL, "i444", 0, "Input file is I444"); +static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); +static const arg_def_t passes = + ARG_DEF("p", "passes", 1, "Number of passes (1/2)"); +static const arg_def_t pass_arg = + ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); +static const arg_def_t fpf_name = + ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); +static const arg_def_t limit = + ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"); +static const arg_def_t skip = + ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"); +static const arg_def_t good_dl = + ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline"); +static const arg_def_t rt_dl = + ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline"); +static const arg_def_t quietarg = + ARG_DEF("q", "quiet", 0, "Do not print encode progress"); +static const arg_def_t verbosearg = + ARG_DEF("v", "verbose", 0, "Show encoder parameters"); +static const arg_def_t psnrarg = + ARG_DEF(NULL, "psnr", 0, "Show PSNR in status line"); +static const arg_def_t use_cfg = ARG_DEF("c", "cfg", 1, "Config file to use"); + +static const struct arg_enum_list test_decode_enum[] = { + { "off", TEST_DECODE_OFF }, + { "fatal", TEST_DECODE_FATAL }, + { "warn", TEST_DECODE_WARN }, + { NULL, 0 } +}; +static const arg_def_t recontest = ARG_DEF_ENUM( + NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum); +static const arg_def_t framerate = + ARG_DEF(NULL, "fps", 1, "Stream frame rate (rate/scale)"); +static const arg_def_t use_webm = + ARG_DEF(NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)"); +static const arg_def_t use_ivf = ARG_DEF(NULL, "ivf", 0, "Output IVF"); +static const arg_def_t use_obu = ARG_DEF(NULL, "obu", 0, "Output OBU"); +static const arg_def_t q_hist_n = + ARG_DEF(NULL, "q-hist", 1, "Show quantizer histogram (n-buckets)"); +static const arg_def_t rate_hist_n = + ARG_DEF(NULL, "rate-hist", 1, "Show rate histogram (n-buckets)"); +static const arg_def_t disable_warnings = + ARG_DEF(NULL, "disable-warnings", 0, + "Disable warnings about potentially incorrect encode settings."); +static const arg_def_t disable_warning_prompt = + ARG_DEF("y", "disable-warning-prompt", 0, + "Display warnings, but 
do not prompt user to continue."); +static const struct arg_enum_list bitdepth_enum[] = { + { "8", AOM_BITS_8 }, { "10", AOM_BITS_10 }, { "12", AOM_BITS_12 }, { NULL, 0 } +}; + +static const arg_def_t bitdeptharg = ARG_DEF_ENUM( + "b", "bit-depth", 1, + "Bit depth for codec (8 for version <=1, 10 or 12 for version 2)", + bitdepth_enum); +static const arg_def_t inbitdeptharg = + ARG_DEF(NULL, "input-bit-depth", 1, "Bit depth of input"); + +static const arg_def_t input_chroma_subsampling_x = ARG_DEF( + NULL, "input-chroma-subsampling-x", 1, "chroma subsampling x value."); +static const arg_def_t input_chroma_subsampling_y = ARG_DEF( + NULL, "input-chroma-subsampling-y", 1, "chroma subsampling y value."); + +static const arg_def_t *main_args[] = { &help, + &use_cfg, + &debugmode, + &outputfile, + &codecarg, + &passes, + &pass_arg, + &fpf_name, + &limit, + &skip, + &good_dl, + &rt_dl, + &quietarg, + &verbosearg, + &psnrarg, + &use_webm, + &use_ivf, + &use_obu, + &q_hist_n, + &rate_hist_n, + &disable_warnings, + &disable_warning_prompt, + &recontest, + NULL }; + +static const arg_def_t usage = + ARG_DEF("u", "usage", 1, "Usage profile number to use"); +static const arg_def_t threads = + ARG_DEF("t", "threads", 1, "Max number of threads to use"); +static const arg_def_t profile = + ARG_DEF(NULL, "profile", 1, "Bitstream profile number to use"); +static const arg_def_t width = ARG_DEF("w", "width", 1, "Frame width"); +static const arg_def_t height = ARG_DEF("h", "height", 1, "Frame height"); +static const arg_def_t forced_max_frame_width = ARG_DEF( + NULL, "forced_max_frame_width", 1, "Maximum frame width value to force"); +static const arg_def_t forced_max_frame_height = ARG_DEF( + NULL, "forced_max_frame_height", 1, "Maximum frame height value to force"); +#if CONFIG_WEBM_IO +static const struct arg_enum_list stereo_mode_enum[] = { + { "mono", STEREO_FORMAT_MONO }, + { "left-right", STEREO_FORMAT_LEFT_RIGHT }, + { "bottom-top", STEREO_FORMAT_BOTTOM_TOP }, + { "top-bottom", STEREO_FORMAT_TOP_BOTTOM }, + { "right-left", STEREO_FORMAT_RIGHT_LEFT }, + { NULL, 0 } +}; +static const arg_def_t stereo_mode = ARG_DEF_ENUM( + NULL, "stereo-mode", 1, "Stereo 3D video format", stereo_mode_enum); +#endif +static const arg_def_t timebase = ARG_DEF( + NULL, "timebase", 1, "Output timestamp precision (fractional seconds)"); +static const arg_def_t global_error_resilient = + ARG_DEF(NULL, "global-error-resilient", 1, + "Enable global error resiliency features"); +static const arg_def_t lag_in_frames = + ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag"); +static const arg_def_t large_scale_tile = ARG_DEF( + NULL, "large-scale-tile", 1, + "Large scale tile coding (0: off (default), 1: on (ivf output only))"); +static const arg_def_t monochrome = + ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)"); +static const arg_def_t full_still_picture_hdr = ARG_DEF( + NULL, "full-still-picture-hdr", 0, "Use full header for still picture"); + +static const arg_def_t *global_args[] = { &use_yv12, + &use_i420, + &use_i422, + &use_i444, + &usage, + &threads, + &profile, + &width, + &height, + &forced_max_frame_width, + &forced_max_frame_height, +#if CONFIG_WEBM_IO + &stereo_mode, +#endif + &timebase, + &framerate, + &global_error_resilient, + &bitdeptharg, + &lag_in_frames, + &large_scale_tile, + &monochrome, + &full_still_picture_hdr, + NULL }; + +static const arg_def_t dropframe_thresh = + ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"); +static const arg_def_t 
resize_mode = + ARG_DEF(NULL, "resize-mode", 1, "Frame resize mode"); +static const arg_def_t resize_denominator = + ARG_DEF(NULL, "resize-denominator", 1, "Frame resize denominator"); +static const arg_def_t resize_kf_denominator = ARG_DEF( + NULL, "resize-kf-denominator", 1, "Frame resize keyframe denominator"); +static const arg_def_t superres_mode = + ARG_DEF(NULL, "superres-mode", 1, "Frame super-resolution mode"); +static const arg_def_t superres_denominator = ARG_DEF( + NULL, "superres-denominator", 1, "Frame super-resolution denominator"); +static const arg_def_t superres_kf_denominator = + ARG_DEF(NULL, "superres-kf-denominator", 1, + "Frame super-resolution keyframe denominator"); +static const arg_def_t superres_qthresh = ARG_DEF( + NULL, "superres-qthresh", 1, "Frame super-resolution qindex threshold"); +static const arg_def_t superres_kf_qthresh = + ARG_DEF(NULL, "superres-kf-qthresh", 1, + "Frame super-resolution keyframe qindex threshold"); +static const struct arg_enum_list end_usage_enum[] = { { "vbr", AOM_VBR }, + { "cbr", AOM_CBR }, + { "cq", AOM_CQ }, + { "q", AOM_Q }, + { NULL, 0 } }; +static const arg_def_t end_usage = + ARG_DEF_ENUM(NULL, "end-usage", 1, "Rate control mode", end_usage_enum); +static const arg_def_t target_bitrate = + ARG_DEF(NULL, "target-bitrate", 1, "Bitrate (kbps)"); +static const arg_def_t min_quantizer = + ARG_DEF(NULL, "min-q", 1, "Minimum (best) quantizer"); +static const arg_def_t max_quantizer = + ARG_DEF(NULL, "max-q", 1, "Maximum (worst) quantizer"); +static const arg_def_t undershoot_pct = + ARG_DEF(NULL, "undershoot-pct", 1, "Datarate undershoot (min) target (%)"); +static const arg_def_t overshoot_pct = + ARG_DEF(NULL, "overshoot-pct", 1, "Datarate overshoot (max) target (%)"); +static const arg_def_t buf_sz = + ARG_DEF(NULL, "buf-sz", 1, "Client buffer size (ms)"); +static const arg_def_t buf_initial_sz = + ARG_DEF(NULL, "buf-initial-sz", 1, "Client initial buffer size (ms)"); +static const arg_def_t buf_optimal_sz = + ARG_DEF(NULL, "buf-optimal-sz", 1, "Client optimal buffer size (ms)"); +static const arg_def_t *rc_args[] = { &dropframe_thresh, + &resize_mode, + &resize_denominator, + &resize_kf_denominator, + &superres_mode, + &superres_denominator, + &superres_kf_denominator, + &superres_qthresh, + &superres_kf_qthresh, + &end_usage, + &target_bitrate, + &min_quantizer, + &max_quantizer, + &undershoot_pct, + &overshoot_pct, + &buf_sz, + &buf_initial_sz, + &buf_optimal_sz, + NULL }; + +static const arg_def_t bias_pct = + ARG_DEF(NULL, "bias-pct", 1, "CBR/VBR bias (0=CBR, 100=VBR)"); +static const arg_def_t minsection_pct = + ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"); +static const arg_def_t maxsection_pct = + ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); +static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct, + &maxsection_pct, NULL }; +static const arg_def_t fwd_kf_enabled = + ARG_DEF(NULL, "enable-fwd-kf", 1, "Enable forward reference keyframes"); +static const arg_def_t kf_min_dist = + ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); +static const arg_def_t kf_max_dist = + ARG_DEF(NULL, "kf-max-dist", 1, "Maximum keyframe interval (frames)"); +static const arg_def_t kf_disabled = + ARG_DEF(NULL, "disable-kf", 0, "Disable keyframe placement"); +static const arg_def_t *kf_args[] = { &fwd_kf_enabled, &kf_min_dist, + &kf_max_dist, &kf_disabled, NULL }; +static const arg_def_t sframe_dist = + ARG_DEF(NULL, "sframe-dist", 1, "S-Frame interval 
(frames)"); +static const arg_def_t sframe_mode = + ARG_DEF(NULL, "sframe-mode", 1, "S-Frame insertion mode (1..2)"); +static const arg_def_t save_as_annexb = + ARG_DEF(NULL, "annexb", 1, "Save as Annex-B"); +static const arg_def_t noise_sens = + ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"); +static const arg_def_t sharpness = + ARG_DEF(NULL, "sharpness", 1, "Loop filter sharpness (0..7)"); +static const arg_def_t static_thresh = + ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"); +static const arg_def_t auto_altref = + ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"); +static const arg_def_t arnr_maxframes = + ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"); +static const arg_def_t arnr_strength = + ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"); +static const struct arg_enum_list tuning_enum[] = { + { "psnr", AOM_TUNE_PSNR }, + { "ssim", AOM_TUNE_SSIM }, + { "vmaf_with_preprocessing", AOM_TUNE_VMAF_WITH_PREPROCESSING }, + { "vmaf_without_preprocessing", AOM_TUNE_VMAF_WITHOUT_PREPROCESSING }, + { "vmaf", AOM_TUNE_VMAF_MAX_GAIN }, + { NULL, 0 } +}; +static const arg_def_t tune_metric = + ARG_DEF_ENUM(NULL, "tune", 1, "Distortion metric tuned with", tuning_enum); +static const arg_def_t cq_level = + ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"); +static const arg_def_t max_intra_rate_pct = + ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); + +#if CONFIG_AV1_ENCODER +static const arg_def_t cpu_used_av1 = + ARG_DEF(NULL, "cpu-used", 1, + "Speed setting (0..6 in good mode, 6..8 in realtime mode)"); +static const arg_def_t rowmtarg = + ARG_DEF(NULL, "row-mt", 1, + "Enable row based multi-threading (0: off, 1: on (default))"); +static const arg_def_t tile_cols = + ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"); +static const arg_def_t tile_rows = + ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2"); +static const arg_def_t enable_tpl_model = + ARG_DEF(NULL, "enable-tpl-model", 1, + "RDO based on frame temporal dependency " + "(0: off, 1: backward source based). 
" + "This is required for deltaq mode."); +static const arg_def_t enable_keyframe_filtering = + ARG_DEF(NULL, "enable-keyframe-filtering", 1, + "Apply temporal filtering on key frame " + "(0: false, 1: true (default)"); +static const arg_def_t tile_width = + ARG_DEF(NULL, "tile-width", 1, "Tile widths (comma separated)"); +static const arg_def_t tile_height = + ARG_DEF(NULL, "tile-height", 1, "Tile heights (command separated)"); +static const arg_def_t lossless = + ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)"); +static const arg_def_t enable_cdef = + ARG_DEF(NULL, "enable-cdef", 1, + "Enable the constrained directional enhancement filter (0: false, " + "1: true (default))"); +static const arg_def_t enable_restoration = ARG_DEF( + NULL, "enable-restoration", 1, + "Enable the loop restoration filter (0: false (default in Realtime mode), " + "1: true (default in Non-realtime mode))"); +static const arg_def_t enable_rect_partitions = + ARG_DEF(NULL, "enable-rect-partitions", 1, + "Enable rectangular partitions " + "(0: false, 1: true (default))"); +static const arg_def_t enable_ab_partitions = + ARG_DEF(NULL, "enable-ab-partitions", 1, + "Enable ab partitions (0: false, 1: true (default))"); +static const arg_def_t enable_1to4_partitions = + ARG_DEF(NULL, "enable-1to4-partitions", 1, + "Enable 1:4 and 4:1 partitions " + "(0: false, 1: true (default))"); +static const arg_def_t min_partition_size = + ARG_DEF(NULL, "min-partition-size", 4, + "Set min partition size " + "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128). " + "On frame with 4k+ resolutions or higher speed settings, the min " + "partition size will have a minimum of 8."); +static const arg_def_t max_partition_size = + ARG_DEF(NULL, "max-partition-size", 128, + "Set max partition size " + "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)"); +static const arg_def_t enable_dual_filter = + ARG_DEF(NULL, "enable-dual-filter", 1, + "Enable dual filter " + "(0: false, 1: true (default))"); +static const arg_def_t enable_chroma_deltaq = + ARG_DEF(NULL, "enable-chroma-deltaq", 1, + "Enable chroma delta quant " + "(0: false (default), 1: true)"); +static const arg_def_t enable_intra_edge_filter = + ARG_DEF(NULL, "enable-intra-edge-filter", 1, + "Enable intra edge filtering " + "(0: false, 1: true (default))"); +static const arg_def_t enable_order_hint = + ARG_DEF(NULL, "enable-order-hint", 1, + "Enable order hint " + "(0: false, 1: true (default))"); +static const arg_def_t enable_tx64 = + ARG_DEF(NULL, "enable-tx64", 1, + "Enable 64-pt transform (0: false, 1: true (default))"); +static const arg_def_t enable_flip_idtx = + ARG_DEF(NULL, "enable-flip-idtx", 1, + "Enable extended transform type (0: false, 1: true (default)) " + "including FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, " + "ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, " + "H_ADST, V_FLIPADST, H_FLIPADST"); +static const arg_def_t enable_dist_wtd_comp = + ARG_DEF(NULL, "enable-dist-wtd-comp", 1, + "Enable distance-weighted compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_masked_comp = + ARG_DEF(NULL, "enable-masked-comp", 1, + "Enable masked (wedge/diff-wtd) compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_onesided_comp = + ARG_DEF(NULL, "enable-onesided-comp", 1, + "Enable one sided compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_interintra_comp = + ARG_DEF(NULL, "enable-interintra-comp", 1, + "Enable interintra compound " + "(0: 
false, 1: true (default))"); +static const arg_def_t enable_smooth_interintra = + ARG_DEF(NULL, "enable-smooth-interintra", 1, + "Enable smooth interintra mode " + "(0: false, 1: true (default))"); +static const arg_def_t enable_diff_wtd_comp = + ARG_DEF(NULL, "enable-diff-wtd-comp", 1, + "Enable difference-weighted compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_interinter_wedge = + ARG_DEF(NULL, "enable-interinter-wedge", 1, + "Enable interinter wedge compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_interintra_wedge = + ARG_DEF(NULL, "enable-interintra-wedge", 1, + "Enable interintra wedge compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_global_motion = + ARG_DEF(NULL, "enable-global-motion", 1, + "Enable global motion " + "(0: false, 1: true (default))"); +static const arg_def_t enable_warped_motion = + ARG_DEF(NULL, "enable-warped-motion", 1, + "Enable local warped motion " + "(0: false, 1: true (default))"); +static const arg_def_t enable_filter_intra = + ARG_DEF(NULL, "enable-filter-intra", 1, + "Enable filter intra prediction mode " + "(0: false, 1: true (default))"); +static const arg_def_t enable_smooth_intra = + ARG_DEF(NULL, "enable-smooth-intra", 1, + "Enable smooth intra prediction modes " + "(0: false, 1: true (default))"); +static const arg_def_t enable_paeth_intra = + ARG_DEF(NULL, "enable-paeth-intra", 1, + "Enable Paeth intra prediction mode (0: false, 1: true (default))"); +static const arg_def_t enable_cfl_intra = + ARG_DEF(NULL, "enable-cfl-intra", 1, + "Enable chroma from luma intra prediction mode " + "(0: false, 1: true (default))"); +static const arg_def_t force_video_mode = + ARG_DEF(NULL, "force-video-mode", 1, + "Force video mode (0: false, 1: true (default))"); +static const arg_def_t enable_obmc = ARG_DEF( + NULL, "enable-obmc", 1, "Enable OBMC (0: false, 1: true (default))"); +static const arg_def_t enable_overlay = + ARG_DEF(NULL, "enable-overlay", 1, + "Enable coding overlay frames (0: false, 1: true (default))"); +static const arg_def_t enable_palette = + ARG_DEF(NULL, "enable-palette", 1, + "Enable palette prediction mode (0: false, 1: true (default))"); +static const arg_def_t enable_intrabc = + ARG_DEF(NULL, "enable-intrabc", 1, + "Enable intra block copy prediction mode " + "(0: false, 1: true (default))"); +static const arg_def_t enable_angle_delta = + ARG_DEF(NULL, "enable-angle-delta", 1, + "Enable intra angle delta (0: false, 1: true (default))"); +static const arg_def_t disable_trellis_quant = + ARG_DEF(NULL, "disable-trellis-quant", 1, + "Disable trellis optimization of quantized coefficients (0: false " + "1: true 2: true for rd search 3: true for estimate yrd serch " + "(default))"); +static const arg_def_t enable_qm = + ARG_DEF(NULL, "enable-qm", 1, + "Enable quantisation matrices (0: false (default), 1: true)"); +static const arg_def_t qm_min = ARG_DEF( + NULL, "qm-min", 1, "Min quant matrix flatness (0..15), default is 8"); +static const arg_def_t qm_max = ARG_DEF( + NULL, "qm-max", 1, "Max quant matrix flatness (0..15), default is 15"); +static const arg_def_t reduced_tx_type_set = ARG_DEF( + NULL, "reduced-tx-type-set", 1, "Use reduced set of transform types"); +static const arg_def_t use_intra_dct_only = + ARG_DEF(NULL, "use-intra-dct-only", 1, "Use DCT only for INTRA modes"); +static const arg_def_t use_inter_dct_only = + ARG_DEF(NULL, "use-inter-dct-only", 1, "Use DCT only for INTER modes"); +static const arg_def_t use_intra_default_tx_only = + 
ARG_DEF(NULL, "use-intra-default-tx-only", 1, + "Use Default-transform only for INTRA modes"); +static const arg_def_t quant_b_adapt = + ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive quantize_b"); +static const arg_def_t coeff_cost_upd_freq = + ARG_DEF(NULL, "coeff-cost-upd-freq", 1, + "Update freq for coeff costs" + "0: SB, 1: SB Row per Tile, 2: Tile"); +static const arg_def_t mode_cost_upd_freq = + ARG_DEF(NULL, "mode-cost-upd-freq", 1, + "Update freq for mode costs" + "0: SB, 1: SB Row per Tile, 2: Tile"); +static const arg_def_t mv_cost_upd_freq = + ARG_DEF(NULL, "mv-cost-upd-freq", 1, + "Update freq for mv costs" + "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"); +static const arg_def_t num_tg = ARG_DEF( + NULL, "num-tile-groups", 1, "Maximum number of tile groups, default is 1"); +static const arg_def_t mtu_size = + ARG_DEF(NULL, "mtu-size", 1, + "MTU size for a tile group, default is 0 (no MTU targeting), " + "overrides maximum number of tile groups"); +static const struct arg_enum_list timing_info_enum[] = { + { "unspecified", AOM_TIMING_UNSPECIFIED }, + { "constant", AOM_TIMING_EQUAL }, + { "model", AOM_TIMING_DEC_MODEL }, + { NULL, 0 } +}; +static const arg_def_t timing_info = + ARG_DEF_ENUM(NULL, "timing-info", 1, + "Signal timing info in the bitstream (model unly works for no " + "hidden frames, no super-res yet):", + timing_info_enum); +#if CONFIG_TUNE_VMAF +static const arg_def_t vmaf_model_path = + ARG_DEF(NULL, "vmaf-model-path", 1, "Path to the VMAF model file"); +#endif +static const arg_def_t film_grain_test = + ARG_DEF(NULL, "film-grain-test", 1, + "Film grain test vectors (0: none (default), 1: test-1 2: test-2, " + "... 16: test-16)"); +static const arg_def_t film_grain_table = + ARG_DEF(NULL, "film-grain-table", 1, + "Path to file containing film grain parameters"); +#if CONFIG_DENOISE +static const arg_def_t denoise_noise_level = + ARG_DEF(NULL, "denoise-noise-level", 1, + "Amount of noise (from 0 = don't denoise, to 50)"); +static const arg_def_t denoise_block_size = + ARG_DEF(NULL, "denoise-block-size", 1, "Denoise block size (default = 32)"); +#endif +static const arg_def_t enable_ref_frame_mvs = + ARG_DEF(NULL, "enable-ref-frame-mvs", 1, + "Enable temporal mv prediction (default is 1)"); +static const arg_def_t frame_parallel_decoding = + ARG_DEF(NULL, "frame-parallel", 1, + "Enable frame parallel decodability features " + "(0: false (default), 1: true)"); +static const arg_def_t error_resilient_mode = + ARG_DEF(NULL, "error-resilient", 1, + "Enable error resilient features " + "(0: false (default), 1: true)"); +static const arg_def_t aq_mode = ARG_DEF( + NULL, "aq-mode", 1, + "Adaptive quantization mode (0: off (default), 1: variance 2: complexity, " + "3: cyclic refresh)"); +static const arg_def_t deltaq_mode = + ARG_DEF(NULL, "deltaq-mode", 1, + "Delta qindex mode (0: off, 1: deltaq objective (default), " + "2: deltaq perceptual). 
" + "Currently this requires enable-tpl-model as a prerequisite."); +static const arg_def_t deltalf_mode = ARG_DEF( + NULL, "delta-lf-mode", 1, "Enable delta-lf-mode (0: off (default), 1: on)"); +static const arg_def_t frame_periodic_boost = + ARG_DEF(NULL, "frame-boost", 1, + "Enable frame periodic boost (0: off (default), 1: on)"); +static const arg_def_t gf_cbr_boost_pct = ARG_DEF( + NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)"); +static const arg_def_t max_inter_rate_pct = + ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"); +static const arg_def_t min_gf_interval = ARG_DEF( + NULL, "min-gf-interval", 1, + "min gf/arf frame interval (default 0, indicating in-built behavior)"); +static const arg_def_t max_gf_interval = ARG_DEF( + NULL, "max-gf-interval", 1, + "max gf/arf frame interval (default 0, indicating in-built behavior)"); +static const arg_def_t gf_min_pyr_height = + ARG_DEF(NULL, "gf-min-pyr-height", 1, + "Min height for GF group pyramid structure (0 (default) to 5)"); +static const arg_def_t gf_max_pyr_height = + ARG_DEF(NULL, "gf-max-pyr-height", 1, + "maximum height for GF group pyramid structure (0 to 5 (default))"); +static const arg_def_t max_reference_frames = ARG_DEF( + NULL, "max-reference-frames", 1, + "maximum number of reference frames allowed per frame (3 to 7 (default))"); +static const arg_def_t reduced_reference_set = + ARG_DEF(NULL, "reduced-reference-set", 1, + "Use reduced set of single and compound references (0: off " + "(default), 1: on)"); +static const arg_def_t target_seq_level_idx = + ARG_DEF(NULL, "target-seq-level-idx", 1, + "Target sequence level index. " + "Possible values are in the form of \"ABxy\"(pad leading zeros if " + "less than 4 digits). " + "AB: Operating point(OP) index; " + "xy: Target level index for the OP. " + "E.g. \"0\" means target level index 0 for the 0th OP; " + "\"1021\" means target level index 21 for the 10th OP."); +static const arg_def_t set_min_cr = + ARG_DEF(NULL, "min-cr", 1, + "Set minimum compression ratio. Take integer values. Default is 0. 
" + "If non-zero, encoder will try to keep the compression ratio of " + "each frame to be higher than the given value divided by 100."); + +static const struct arg_enum_list color_primaries_enum[] = { + { "bt709", AOM_CICP_CP_BT_709 }, + { "unspecified", AOM_CICP_CP_UNSPECIFIED }, + { "bt601", AOM_CICP_CP_BT_601 }, + { "bt470m", AOM_CICP_CP_BT_470_M }, + { "bt470bg", AOM_CICP_CP_BT_470_B_G }, + { "smpte240", AOM_CICP_CP_SMPTE_240 }, + { "film", AOM_CICP_CP_GENERIC_FILM }, + { "bt2020", AOM_CICP_CP_BT_2020 }, + { "xyz", AOM_CICP_CP_XYZ }, + { "smpte431", AOM_CICP_CP_SMPTE_431 }, + { "smpte432", AOM_CICP_CP_SMPTE_432 }, + { "ebu3213", AOM_CICP_CP_EBU_3213 }, + { NULL, 0 } +}; + +static const arg_def_t input_color_primaries = ARG_DEF_ENUM( + NULL, "color-primaries", 1, + "Color primaries (CICP) of input content:", color_primaries_enum); + +static const struct arg_enum_list transfer_characteristics_enum[] = { + { "unspecified", AOM_CICP_CP_UNSPECIFIED }, + { "bt709", AOM_CICP_TC_BT_709 }, + { "bt470m", AOM_CICP_TC_BT_470_M }, + { "bt470bg", AOM_CICP_TC_BT_470_B_G }, + { "bt601", AOM_CICP_TC_BT_601 }, + { "smpte240", AOM_CICP_TC_SMPTE_240 }, + { "lin", AOM_CICP_TC_LINEAR }, + { "log100", AOM_CICP_TC_LOG_100 }, + { "log100sq10", AOM_CICP_TC_LOG_100_SQRT10 }, + { "iec61966", AOM_CICP_TC_IEC_61966 }, + { "bt1361", AOM_CICP_TC_BT_1361 }, + { "srgb", AOM_CICP_TC_SRGB }, + { "bt2020-10bit", AOM_CICP_TC_BT_2020_10_BIT }, + { "bt2020-12bit", AOM_CICP_TC_BT_2020_12_BIT }, + { "smpte2084", AOM_CICP_TC_SMPTE_2084 }, + { "hlg", AOM_CICP_TC_HLG }, + { "smpte428", AOM_CICP_TC_SMPTE_428 }, + { NULL, 0 } +}; + +static const arg_def_t input_transfer_characteristics = + ARG_DEF_ENUM(NULL, "transfer-characteristics", 1, + "Transfer characteristics (CICP) of input content:", + transfer_characteristics_enum); + +static const struct arg_enum_list matrix_coefficients_enum[] = { + { "identity", AOM_CICP_MC_IDENTITY }, + { "bt709", AOM_CICP_MC_BT_709 }, + { "unspecified", AOM_CICP_MC_UNSPECIFIED }, + { "fcc73", AOM_CICP_MC_FCC }, + { "bt470bg", AOM_CICP_MC_BT_470_B_G }, + { "bt601", AOM_CICP_MC_BT_601 }, + { "smpte240", AOM_CICP_CP_SMPTE_240 }, + { "ycgco", AOM_CICP_MC_SMPTE_YCGCO }, + { "bt2020ncl", AOM_CICP_MC_BT_2020_NCL }, + { "bt2020cl", AOM_CICP_MC_BT_2020_CL }, + { "smpte2085", AOM_CICP_MC_SMPTE_2085 }, + { "chromncl", AOM_CICP_MC_CHROMAT_NCL }, + { "chromcl", AOM_CICP_MC_CHROMAT_CL }, + { "ictcp", AOM_CICP_MC_ICTCP }, + { NULL, 0 } +}; + +static const arg_def_t input_matrix_coefficients = ARG_DEF_ENUM( + NULL, "matrix-coefficients", 1, + "Matrix coefficients (CICP) of input content:", matrix_coefficients_enum); + +static const struct arg_enum_list chroma_sample_position_enum[] = { + { "unknown", AOM_CSP_UNKNOWN }, + { "vertical", AOM_CSP_VERTICAL }, + { "colocated", AOM_CSP_COLOCATED }, + { NULL, 0 } +}; + +static const arg_def_t input_chroma_sample_position = + ARG_DEF_ENUM(NULL, "chroma-sample-position", 1, + "The chroma sample position when chroma 4:2:0 is signaled:", + chroma_sample_position_enum); + +static const struct arg_enum_list tune_content_enum[] = { + { "default", AOM_CONTENT_DEFAULT }, + { "screen", AOM_CONTENT_SCREEN }, + { NULL, 0 } +}; + +static const arg_def_t tune_content = ARG_DEF_ENUM( + NULL, "tune-content", 1, "Tune content type", tune_content_enum); + +static const arg_def_t cdf_update_mode = + ARG_DEF(NULL, "cdf-update-mode", 1, + "CDF update mode for entropy coding " + "(0: no CDF update; 1: update CDF on all frames(default); " + "2: selectively update CDF on some frames"); + +static 
const struct arg_enum_list superblock_size_enum[] = { + { "dynamic", AOM_SUPERBLOCK_SIZE_DYNAMIC }, + { "64", AOM_SUPERBLOCK_SIZE_64X64 }, + { "128", AOM_SUPERBLOCK_SIZE_128X128 }, + { NULL, 0 } +}; +static const arg_def_t superblock_size = ARG_DEF_ENUM( + NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum); + +static const arg_def_t set_tier_mask = + ARG_DEF(NULL, "set-tier-mask", 1, + "Set bit mask to specify which tier each of the 32 possible " + "operating points conforms to. " + "Bit value 0 (default): Main Tier; 1: High Tier."); + +static const arg_def_t use_fixed_qp_offsets = + ARG_DEF(NULL, "use-fixed-qp-offsets", 1, + "Enable fixed QP offsets for frames at different levels of the " + "pyramid. Selected automatically from --cq-level if " + "--fixed-qp-offsets is not provided. If this option is not " + "specified (default), offsets are adaptively chosen by the " + "encoder."); + +static const arg_def_t fixed_qp_offsets = + ARG_DEF(NULL, "fixed-qp-offsets", 1, + "Set fixed QP offsets for frames at different levels of the " + "pyramid. Comma-separated list of 5 offsets for keyframe, ALTREF, " + "and 3 levels of internal alt-refs. If this option is not " + "specified (default), offsets are adaptively chosen by the " + "encoder."); + +static const arg_def_t *av1_args[] = { &cpu_used_av1, + &auto_altref, + &sharpness, + &static_thresh, + &rowmtarg, + &tile_cols, + &tile_rows, + &enable_tpl_model, + &enable_keyframe_filtering, + &arnr_maxframes, + &arnr_strength, + &tune_metric, + &cq_level, + &max_intra_rate_pct, + &max_inter_rate_pct, + &gf_cbr_boost_pct, + &lossless, + &enable_cdef, + &enable_restoration, + &enable_rect_partitions, + &enable_ab_partitions, + &enable_1to4_partitions, + &min_partition_size, + &max_partition_size, + &enable_dual_filter, + &enable_chroma_deltaq, + &enable_intra_edge_filter, + &enable_order_hint, + &enable_tx64, + &enable_flip_idtx, + &enable_dist_wtd_comp, + &enable_masked_comp, + &enable_onesided_comp, + &enable_interintra_comp, + &enable_smooth_interintra, + &enable_diff_wtd_comp, + &enable_interinter_wedge, + &enable_interintra_wedge, + &enable_global_motion, + &enable_warped_motion, + &enable_filter_intra, + &enable_smooth_intra, + &enable_paeth_intra, + &enable_cfl_intra, + &force_video_mode, + &enable_obmc, + &enable_overlay, + &enable_palette, + &enable_intrabc, + &enable_angle_delta, + &disable_trellis_quant, + &enable_qm, + &qm_min, + &qm_max, + &reduced_tx_type_set, + &use_intra_dct_only, + &use_inter_dct_only, + &use_intra_default_tx_only, + &quant_b_adapt, + &coeff_cost_upd_freq, + &mode_cost_upd_freq, + &mv_cost_upd_freq, + &frame_parallel_decoding, + &error_resilient_mode, + &aq_mode, + &deltaq_mode, + &deltalf_mode, + &frame_periodic_boost, + &noise_sens, + &tune_content, + &cdf_update_mode, + &input_color_primaries, + &input_transfer_characteristics, + &input_matrix_coefficients, + &input_chroma_sample_position, + &min_gf_interval, + &max_gf_interval, + &gf_min_pyr_height, + &gf_max_pyr_height, + &superblock_size, + &num_tg, + &mtu_size, + &timing_info, + &film_grain_test, + &film_grain_table, +#if CONFIG_DENOISE + &denoise_noise_level, + &denoise_block_size, +#endif // CONFIG_DENOISE + &max_reference_frames, + &reduced_reference_set, + &enable_ref_frame_mvs, + &target_seq_level_idx, + &set_tier_mask, + &set_min_cr, + &bitdeptharg, + &inbitdeptharg, + &input_chroma_subsampling_x, + &input_chroma_subsampling_y, + &sframe_dist, + &sframe_mode, + &save_as_annexb, +#if CONFIG_TUNE_VMAF + &vmaf_model_path, +#endif + NULL };
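+/* Note: av1_arg_ctrl_map below must stay index-aligned with av1_args above: + * parse_stream_params() applies the option matched at av1_args[i] via the + * codec control ID at av1_arg_ctrl_map[i] (e.g. av1_args[0], --cpu-used, + * pairs with AOME_SET_CPUUSED). + */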
+static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED, + AOME_SET_ENABLEAUTOALTREF, + AOME_SET_SHARPNESS, + AOME_SET_STATIC_THRESHOLD, + AV1E_SET_ROW_MT, + AV1E_SET_TILE_COLUMNS, + AV1E_SET_TILE_ROWS, + AV1E_SET_ENABLE_TPL_MODEL, + AV1E_SET_ENABLE_KEYFRAME_FILTERING, + AOME_SET_ARNR_MAXFRAMES, + AOME_SET_ARNR_STRENGTH, + AOME_SET_TUNING, + AOME_SET_CQ_LEVEL, + AOME_SET_MAX_INTRA_BITRATE_PCT, + AV1E_SET_MAX_INTER_BITRATE_PCT, + AV1E_SET_GF_CBR_BOOST_PCT, + AV1E_SET_LOSSLESS, + AV1E_SET_ENABLE_CDEF, + AV1E_SET_ENABLE_RESTORATION, + AV1E_SET_ENABLE_RECT_PARTITIONS, + AV1E_SET_ENABLE_AB_PARTITIONS, + AV1E_SET_ENABLE_1TO4_PARTITIONS, + AV1E_SET_MIN_PARTITION_SIZE, + AV1E_SET_MAX_PARTITION_SIZE, + AV1E_SET_ENABLE_DUAL_FILTER, + AV1E_SET_ENABLE_CHROMA_DELTAQ, + AV1E_SET_ENABLE_INTRA_EDGE_FILTER, + AV1E_SET_ENABLE_ORDER_HINT, + AV1E_SET_ENABLE_TX64, + AV1E_SET_ENABLE_FLIP_IDTX, + AV1E_SET_ENABLE_DIST_WTD_COMP, + AV1E_SET_ENABLE_MASKED_COMP, + AV1E_SET_ENABLE_ONESIDED_COMP, + AV1E_SET_ENABLE_INTERINTRA_COMP, + AV1E_SET_ENABLE_SMOOTH_INTERINTRA, + AV1E_SET_ENABLE_DIFF_WTD_COMP, + AV1E_SET_ENABLE_INTERINTER_WEDGE, + AV1E_SET_ENABLE_INTERINTRA_WEDGE, + AV1E_SET_ENABLE_GLOBAL_MOTION, + AV1E_SET_ENABLE_WARPED_MOTION, + AV1E_SET_ENABLE_FILTER_INTRA, + AV1E_SET_ENABLE_SMOOTH_INTRA, + AV1E_SET_ENABLE_PAETH_INTRA, + AV1E_SET_ENABLE_CFL_INTRA, + AV1E_SET_FORCE_VIDEO_MODE, + AV1E_SET_ENABLE_OBMC, + AV1E_SET_ENABLE_OVERLAY, + AV1E_SET_ENABLE_PALETTE, + AV1E_SET_ENABLE_INTRABC, + AV1E_SET_ENABLE_ANGLE_DELTA, + AV1E_SET_DISABLE_TRELLIS_QUANT, + AV1E_SET_ENABLE_QM, + AV1E_SET_QM_MIN, + AV1E_SET_QM_MAX, + AV1E_SET_REDUCED_TX_TYPE_SET, + AV1E_SET_INTRA_DCT_ONLY, + AV1E_SET_INTER_DCT_ONLY, + AV1E_SET_INTRA_DEFAULT_TX_ONLY, + AV1E_SET_QUANT_B_ADAPT, + AV1E_SET_COEFF_COST_UPD_FREQ, + AV1E_SET_MODE_COST_UPD_FREQ, + AV1E_SET_MV_COST_UPD_FREQ, + AV1E_SET_FRAME_PARALLEL_DECODING, + AV1E_SET_ERROR_RESILIENT_MODE, + AV1E_SET_AQ_MODE, + AV1E_SET_DELTAQ_MODE, + AV1E_SET_DELTALF_MODE, + AV1E_SET_FRAME_PERIODIC_BOOST, + AV1E_SET_NOISE_SENSITIVITY, + AV1E_SET_TUNE_CONTENT, + AV1E_SET_CDF_UPDATE_MODE, + AV1E_SET_COLOR_PRIMARIES, + AV1E_SET_TRANSFER_CHARACTERISTICS, + AV1E_SET_MATRIX_COEFFICIENTS, + AV1E_SET_CHROMA_SAMPLE_POSITION, + AV1E_SET_MIN_GF_INTERVAL, + AV1E_SET_MAX_GF_INTERVAL, + AV1E_SET_GF_MIN_PYRAMID_HEIGHT, + AV1E_SET_GF_MAX_PYRAMID_HEIGHT, + AV1E_SET_SUPERBLOCK_SIZE, + AV1E_SET_NUM_TG, + AV1E_SET_MTU, + AV1E_SET_TIMING_INFO_TYPE, + AV1E_SET_FILM_GRAIN_TEST_VECTOR, + AV1E_SET_FILM_GRAIN_TABLE, +#if CONFIG_DENOISE + AV1E_SET_DENOISE_NOISE_LEVEL, + AV1E_SET_DENOISE_BLOCK_SIZE, +#endif // CONFIG_DENOISE + AV1E_SET_MAX_REFERENCE_FRAMES, + AV1E_SET_REDUCED_REFERENCE_SET, + AV1E_SET_ENABLE_REF_FRAME_MVS, + AV1E_SET_TARGET_SEQ_LEVEL_IDX, + AV1E_SET_TIER_MASK, + AV1E_SET_MIN_CR, +#if CONFIG_TUNE_VMAF + AV1E_SET_VMAF_MODEL_PATH, +#endif + 0 }; +#endif // CONFIG_AV1_ENCODER + +static const arg_def_t *no_args[] = { NULL }; + +static void show_help(FILE *fout, int shorthelp) { + fprintf(fout, "Usage: %s <options> -o dst_filename src_filename \n", + exec_name); + + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "\nOptions:\n"); + arg_show_usage(fout, main_args); + fprintf(fout, "\nEncoder Global Options:\n"); + arg_show_usage(fout, global_args); + fprintf(fout, "\nRate Control Options:\n"); + arg_show_usage(fout, rc_args); + fprintf(fout, "\nTwopass Rate Control Options:\n"); + arg_show_usage(fout, rc_twopass_args); + fprintf(fout, "\nKeyframe Placement Options:\n"); +
arg_show_usage(fout, kf_args); +#if CONFIG_AV1_ENCODER + fprintf(fout, "\nAV1 Specific Options:\n"); + arg_show_usage(fout, av1_args); +#endif + fprintf(fout, + "\nStream timebase (--timebase):\n" + " The desired precision of timestamps in the output, expressed\n" + " in fractional seconds. Default is 1/1000.\n"); + fprintf(fout, "\nIncluded encoders:\n\n"); + + const int num_encoder = get_aom_encoder_count(); + for (int i = 0; i < num_encoder; ++i) { + const AvxInterface *const encoder = get_aom_encoder_by_index(i); + const char *defstr = (i == (num_encoder - 1)) ? "(default)" : ""; + fprintf(fout, " %-6s - %s %s\n", encoder->name, + aom_codec_iface_name(encoder->codec_interface()), defstr); + } + fprintf(fout, "\n "); + fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n"); +} + +void usage_exit(void) { + show_help(stderr, 1); + exit(EXIT_FAILURE); +} + +#if CONFIG_AV1_ENCODER +#define ARG_CTRL_CNT_MAX NELEMENTS(av1_arg_ctrl_map) +#endif + +#if !CONFIG_WEBM_IO +typedef int stereo_format_t; +struct WebmOutputContext { + int debug; +}; +#endif + +/* Per-stream configuration */ +struct stream_config { + struct aom_codec_enc_cfg cfg; + const char *out_fn; + const char *stats_fn; + stereo_format_t stereo_fmt; + int arg_ctrls[ARG_CTRL_CNT_MAX][2]; + int arg_ctrl_cnt; + int write_webm; + const char *film_grain_filename; + int write_ivf; + // whether to use 16bit internal buffers + int use_16bit_internal; +#if CONFIG_TUNE_VMAF + const char *vmaf_model_path; +#endif +}; + +struct stream_state { + int index; + struct stream_state *next; + struct stream_config config; + FILE *file; + struct rate_hist *rate_hist; + struct WebmOutputContext webm_ctx; + uint64_t psnr_sse_total; + uint64_t psnr_samples_total; + double psnr_totals[4]; + int psnr_count; + int counts[64]; + aom_codec_ctx_t encoder; + unsigned int frames_out; + uint64_t cx_time; + size_t nbytes; + stats_io_t stats; + struct aom_image *img; + aom_codec_ctx_t decoder; + int mismatch_seen; + unsigned int chroma_subsampling_x; + unsigned int chroma_subsampling_y; +}; + +static void validate_positive_rational(const char *msg, + struct aom_rational *rat) { + if (rat->den < 0) { + rat->num *= -1; + rat->den *= -1; + } + + if (rat->num < 0) die("Error: %s must be positive\n", msg); + + if (!rat->den) die("Error: %s has zero denominator\n", msg); +} + +static void init_config(cfg_options_t *config) { + memset(config, 0, sizeof(cfg_options_t)); + config->super_block_size = 0; // Dynamic + config->max_partition_size = 128; + config->min_partition_size = 4; + config->disable_trellis_quant = 3; +} + +/* Parses global config arguments into the AvxEncoderConfig. Note that + * argv is modified and overwrites all parsed arguments. 
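+ * Options consumed here are filtered out of argv, so per-stream parsing + * later only sees the arguments left unmatched. For a hypothetical + * invocation such as "aomenc --passes=2 -w 640 -h 360 -o out.ivf in.y4m", + * --passes is consumed here while -w/-h/-o remain for parse_stream_params().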
+ */ +static void parse_global_config(struct AvxEncoderConfig *global, char ***argv) { + char **argi, **argj; + struct arg arg; + const int num_encoder = get_aom_encoder_count(); + char **argv_local = (char **)*argv; + if (num_encoder < 1) die("Error: no valid encoder available\n"); + + /* Initialize default parameters */ + memset(global, 0, sizeof(*global)); + global->codec = get_aom_encoder_by_index(num_encoder - 1); + global->passes = 0; + global->color_type = I420; + global->csp = AOM_CSP_UNKNOWN; + + int cfg_included = 0; + init_config(&global->encoder_config); + + for (argi = argj = argv_local; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + + if (arg_match(&arg, &use_cfg, argi)) { + if (cfg_included) continue; + parse_cfg(arg.val, &global->encoder_config); + cfg_included = 1; + continue; + } + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { + global->codec = get_aom_encoder_by_name(arg.val); + if (!global->codec) + die("Error: Unrecognized argument (%s) to --codec\n", arg.val); + } else if (arg_match(&arg, &passes, argi)) { + global->passes = arg_parse_uint(&arg); + + if (global->passes < 1 || global->passes > 2) + die("Error: Invalid number of passes (%d)\n", global->passes); + } else if (arg_match(&arg, &pass_arg, argi)) { + global->pass = arg_parse_uint(&arg); + + if (global->pass < 1 || global->pass > 2) + die("Error: Invalid pass selected (%d)\n", global->pass); + } else if (arg_match(&arg, &input_chroma_sample_position, argi)) { + global->csp = arg_parse_enum(&arg); + /* Flag is used by later code as well, preserve it. */ + argj++; + } else if (arg_match(&arg, &usage, argi)) + global->usage = arg_parse_uint(&arg); + else if (arg_match(&arg, &good_dl, argi)) + global->usage = AOM_USAGE_GOOD_QUALITY; // Good quality usage + else if (arg_match(&arg, &rt_dl, argi)) + global->usage = AOM_USAGE_REALTIME; // Real-time usage + else if (arg_match(&arg, &use_yv12, argi)) + global->color_type = YV12; + else if (arg_match(&arg, &use_i420, argi)) + global->color_type = I420; + else if (arg_match(&arg, &use_i422, argi)) + global->color_type = I422; + else if (arg_match(&arg, &use_i444, argi)) + global->color_type = I444; + else if (arg_match(&arg, &quietarg, argi)) + global->quiet = 1; + else if (arg_match(&arg, &verbosearg, argi)) + global->verbose = 1; + else if (arg_match(&arg, &limit, argi)) + global->limit = arg_parse_uint(&arg); + else if (arg_match(&arg, &skip, argi)) + global->skip_frames = arg_parse_uint(&arg); + else if (arg_match(&arg, &psnrarg, argi)) + global->show_psnr = 1; + else if (arg_match(&arg, &recontest, argi)) + global->test_decode = arg_parse_enum_or_int(&arg); + else if (arg_match(&arg, &framerate, argi)) { + global->framerate = arg_parse_rational(&arg); + validate_positive_rational(arg.name, &global->framerate); + global->have_framerate = 1; + } else if (arg_match(&arg, &debugmode, argi)) + global->debug = 1; + else if (arg_match(&arg, &q_hist_n, argi)) + global->show_q_hist_buckets = arg_parse_uint(&arg); + else if (arg_match(&arg, &rate_hist_n, argi)) + global->show_rate_hist_buckets = arg_parse_uint(&arg); + else if (arg_match(&arg, &disable_warnings, argi)) + global->disable_warnings = 1; + else if (arg_match(&arg, &disable_warning_prompt, argi)) + global->disable_warning_prompt = 1; + else + argj++; + } + + if (global->pass) { + /* DWIM: Assume the user meant passes=2 if pass=2 is specified */ + if (global->pass > global->passes) { + warn("Assuming --pass=%d 
implies --passes=%d\n", global->pass, + global->pass); + global->passes = global->pass; + } + } + /* Validate global config */ + if (global->passes == 0) { +#if CONFIG_AV1_ENCODER + // Make default AV1 passes = 2 until there is a better quality 1-pass + // encoder + if (global->codec != NULL && global->codec->name != NULL) + global->passes = (strcmp(global->codec->name, "av1") == 0 && + global->usage != AOM_USAGE_REALTIME) + ? 2 + : 1; +#else + global->passes = 1; +#endif + } + + if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) { + warn("Enforcing one-pass encoding in realtime mode\n"); + global->passes = 1; + } +} + +static void open_input_file(struct AvxInputContext *input, + aom_chroma_sample_position_t csp) { + /* Parse certain options from the input file, if possible */ + input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") + : set_binary_mode(stdin); + + if (!input->file) fatal("Failed to open input file"); + + if (!fseeko(input->file, 0, SEEK_END)) { + /* Input file is seekable. Figure out how long it is, so we can get + * progress info. + */ + input->length = ftello(input->file); + rewind(input->file); + } + + /* Default to 1:1 pixel aspect ratio. */ + input->pixel_aspect_ratio.numerator = 1; + input->pixel_aspect_ratio.denominator = 1; + + /* For RAW input sources, these bytes will applied on the first frame + * in read_frame(). + */ + input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); + input->detect.position = 0; + + if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { + if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, csp, + input->only_i420) >= 0) { + input->file_type = FILE_TYPE_Y4M; + input->width = input->y4m.pic_w; + input->height = input->y4m.pic_h; + input->pixel_aspect_ratio.numerator = input->y4m.par_n; + input->pixel_aspect_ratio.denominator = input->y4m.par_d; + input->framerate.numerator = input->y4m.fps_n; + input->framerate.denominator = input->y4m.fps_d; + input->fmt = input->y4m.aom_fmt; + input->bit_depth = input->y4m.bit_depth; + } else + fatal("Unsupported Y4M stream."); + } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { + fatal("IVF is not supported as input."); + } else { + input->file_type = FILE_TYPE_RAW; + } +} + +static void close_input_file(struct AvxInputContext *input) { + fclose(input->file); + if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); +} + +static struct stream_state *new_stream(struct AvxEncoderConfig *global, + struct stream_state *prev) { + struct stream_state *stream; + + stream = calloc(1, sizeof(*stream)); + if (stream == NULL) { + fatal("Failed to allocate new stream."); + } + + if (prev) { + memcpy(stream, prev, sizeof(*stream)); + stream->index++; + prev->next = stream; + } else { + aom_codec_err_t res; + + /* Populate encoder configuration */ + res = aom_codec_enc_config_default(global->codec->codec_interface(), + &stream->config.cfg, global->usage); + if (res) fatal("Failed to get config: %s\n", aom_codec_err_to_string(res)); + + /* Change the default timebase to a high enough value so that the + * encoder will always create strictly increasing timestamps. + */ + stream->config.cfg.g_timebase.den = 1000; + + /* Never use the library's default resolution, require it be parsed + * from the file or set on the command line. 
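+ * (set_stream_dimensions() fills these from the input file's header when + * one is available, e.g. Y4M; RAW input needs explicit --width/--height, + * and validate_stream_config() fails hard if either is still zero.)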
+ */ + stream->config.cfg.g_w = 0; + stream->config.cfg.g_h = 0; + + /* Initialize remaining stream parameters */ + stream->config.write_webm = 1; + stream->config.write_ivf = 0; + +#if CONFIG_WEBM_IO + stream->config.stereo_fmt = STEREO_FORMAT_MONO; + stream->webm_ctx.last_pts_ns = -1; + stream->webm_ctx.writer = NULL; + stream->webm_ctx.segment = NULL; +#endif + + /* Allows removal of the application version from the EBML tags */ + stream->webm_ctx.debug = global->debug; + memcpy(&stream->config.cfg.encoder_cfg, &global->encoder_config, + sizeof(stream->config.cfg.encoder_cfg)); + } + + /* Output files must be specified for each stream */ + stream->config.out_fn = NULL; + + stream->next = NULL; + return stream; +} + +static void set_config_arg_ctrls(struct stream_config *config, int key, + const struct arg *arg) { + int j; + if (key == AV1E_SET_FILM_GRAIN_TABLE) { + config->film_grain_filename = arg->val; + return; + } + + // For target level, the settings should accumulate rather than overwrite, + // so we simply append it. + if (key == AV1E_SET_TARGET_SEQ_LEVEL_IDX) { + j = config->arg_ctrl_cnt; + assert(j < (int)ARG_CTRL_CNT_MAX); + config->arg_ctrls[j][0] = key; + config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg); + ++config->arg_ctrl_cnt; + return; + } + + /* Point either to the next free element or the first instance of this + * control. + */ + for (j = 0; j < config->arg_ctrl_cnt; j++) + if (config->arg_ctrls[j][0] == key) break; + + /* Update/insert */ + assert(j < (int)ARG_CTRL_CNT_MAX); + config->arg_ctrls[j][0] = key; + config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg); + + if (key == AOME_SET_ENABLEAUTOALTREF && config->arg_ctrls[j][1] > 1) { + warn("auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n"); + config->arg_ctrls[j][1] = 1; + } + if (j == config->arg_ctrl_cnt) config->arg_ctrl_cnt++; +} + +static int parse_stream_params(struct AvxEncoderConfig *global, + struct stream_state *stream, char **argv) { + char **argi, **argj; + struct arg arg; + static const arg_def_t **ctrl_args = no_args; + static const int *ctrl_args_map = NULL; + struct stream_config *config = &stream->config; + int eos_mark_found = 0; + int webm_forced = 0; + + // Handle codec specific options + if (0) { +#if CONFIG_AV1_ENCODER + } else if (strcmp(global->codec->name, "av1") == 0) { + // TODO(jingning): Reuse AV1 specific encoder configuration parameters. + // Consider to expand this set for AV1 encoder control. + ctrl_args = av1_args; + ctrl_args_map = av1_arg_ctrl_map; +#endif + } + + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + + /* Once we've found an end-of-stream marker (--) we want to continue + * shifting arguments but not consuming them. 
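+ * They are left in place so the remainder can be handed to the next + * stream's parse_stream_params() call.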
+ */ + if (eos_mark_found) { + argj++; + continue; + } else if (!strcmp(*argj, "--")) { + eos_mark_found = 1; + continue; + } + + if (arg_match(&arg, &outputfile, argi)) { + config->out_fn = arg.val; + if (!webm_forced) { + const size_t out_fn_len = strlen(config->out_fn); + if (out_fn_len >= 4 && + !strcmp(config->out_fn + out_fn_len - 4, ".ivf")) { + config->write_webm = 0; + config->write_ivf = 1; + } else if (out_fn_len >= 4 && + !strcmp(config->out_fn + out_fn_len - 4, ".obu")) { + config->write_webm = 0; + config->write_ivf = 0; + } + } + } else if (arg_match(&arg, &fpf_name, argi)) { + config->stats_fn = arg.val; + } else if (arg_match(&arg, &use_webm, argi)) { +#if CONFIG_WEBM_IO + config->write_webm = 1; + webm_forced = 1; +#else + die("Error: --webm specified but webm is disabled."); +#endif + } else if (arg_match(&arg, &use_ivf, argi)) { + config->write_webm = 0; + config->write_ivf = 1; + } else if (arg_match(&arg, &use_obu, argi)) { + config->write_webm = 0; + config->write_ivf = 0; + } else if (arg_match(&arg, &threads, argi)) { + config->cfg.g_threads = arg_parse_uint(&arg); + } else if (arg_match(&arg, &profile, argi)) { + config->cfg.g_profile = arg_parse_uint(&arg); + } else if (arg_match(&arg, &width, argi)) { + config->cfg.g_w = arg_parse_uint(&arg); + } else if (arg_match(&arg, &height, argi)) { + config->cfg.g_h = arg_parse_uint(&arg); + } else if (arg_match(&arg, &forced_max_frame_width, argi)) { + config->cfg.g_forced_max_frame_width = arg_parse_uint(&arg); + } else if (arg_match(&arg, &forced_max_frame_height, argi)) { + config->cfg.g_forced_max_frame_height = arg_parse_uint(&arg); + } else if (arg_match(&arg, &bitdeptharg, argi)) { + config->cfg.g_bit_depth = arg_parse_enum_or_int(&arg); + } else if (arg_match(&arg, &inbitdeptharg, argi)) { + config->cfg.g_input_bit_depth = arg_parse_uint(&arg); + } else if (arg_match(&arg, &input_chroma_subsampling_x, argi)) { + stream->chroma_subsampling_x = arg_parse_uint(&arg); + } else if (arg_match(&arg, &input_chroma_subsampling_y, argi)) { + stream->chroma_subsampling_y = arg_parse_uint(&arg); +#if CONFIG_WEBM_IO + } else if (arg_match(&arg, &stereo_mode, argi)) { + config->stereo_fmt = arg_parse_enum_or_int(&arg); +#endif + } else if (arg_match(&arg, &timebase, argi)) { + config->cfg.g_timebase = arg_parse_rational(&arg); + validate_positive_rational(arg.name, &config->cfg.g_timebase); + } else if (arg_match(&arg, &global_error_resilient, argi)) { + config->cfg.g_error_resilient = arg_parse_uint(&arg); + } else if (arg_match(&arg, &lag_in_frames, argi)) { + config->cfg.g_lag_in_frames = arg_parse_uint(&arg); + if (global->usage == AOM_USAGE_REALTIME && + config->cfg.rc_end_usage == AOM_CBR && + config->cfg.g_lag_in_frames != 0) { + warn("non-zero %s option ignored in realtime CBR mode.\n", arg.name); + config->cfg.g_lag_in_frames = 0; + } + } else if (arg_match(&arg, &large_scale_tile, argi)) { + config->cfg.large_scale_tile = arg_parse_uint(&arg); + if (config->cfg.large_scale_tile) global->codec = get_aom_lst_encoder(); + } else if (arg_match(&arg, &monochrome, argi)) { + config->cfg.monochrome = 1; + } else if (arg_match(&arg, &full_still_picture_hdr, argi)) { + config->cfg.full_still_picture_hdr = 1; + } else if (arg_match(&arg, &dropframe_thresh, argi)) { + config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &resize_mode, argi)) { + config->cfg.rc_resize_mode = arg_parse_uint(&arg); + } else if (arg_match(&arg, &resize_denominator, argi)) { + config->cfg.rc_resize_denominator = 
arg_parse_uint(&arg); + } else if (arg_match(&arg, &resize_kf_denominator, argi)) { + config->cfg.rc_resize_kf_denominator = arg_parse_uint(&arg); + } else if (arg_match(&arg, &superres_mode, argi)) { + config->cfg.rc_superres_mode = arg_parse_uint(&arg); + } else if (arg_match(&arg, &superres_denominator, argi)) { + config->cfg.rc_superres_denominator = arg_parse_uint(&arg); + } else if (arg_match(&arg, &superres_kf_denominator, argi)) { + config->cfg.rc_superres_kf_denominator = arg_parse_uint(&arg); + } else if (arg_match(&arg, &superres_qthresh, argi)) { + config->cfg.rc_superres_qthresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &superres_kf_qthresh, argi)) { + config->cfg.rc_superres_kf_qthresh = arg_parse_uint(&arg); + } else if (arg_match(&arg, &end_usage, argi)) { + config->cfg.rc_end_usage = arg_parse_enum_or_int(&arg); + } else if (arg_match(&arg, &target_bitrate, argi)) { + config->cfg.rc_target_bitrate = arg_parse_uint(&arg); + } else if (arg_match(&arg, &min_quantizer, argi)) { + config->cfg.rc_min_quantizer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &max_quantizer, argi)) { + config->cfg.rc_max_quantizer = arg_parse_uint(&arg); + } else if (arg_match(&arg, &undershoot_pct, argi)) { + config->cfg.rc_undershoot_pct = arg_parse_uint(&arg); + } else if (arg_match(&arg, &overshoot_pct, argi)) { + config->cfg.rc_overshoot_pct = arg_parse_uint(&arg); + } else if (arg_match(&arg, &buf_sz, argi)) { + config->cfg.rc_buf_sz = arg_parse_uint(&arg); + } else if (arg_match(&arg, &buf_initial_sz, argi)) { + config->cfg.rc_buf_initial_sz = arg_parse_uint(&arg); + } else if (arg_match(&arg, &buf_optimal_sz, argi)) { + config->cfg.rc_buf_optimal_sz = arg_parse_uint(&arg); + } else if (arg_match(&arg, &bias_pct, argi)) { + config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg); + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &minsection_pct, argi)) { + config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg); + + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &maxsection_pct, argi)) { + config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg); + + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &fwd_kf_enabled, argi)) { + config->cfg.fwd_kf_enabled = arg_parse_uint(&arg); + } else if (arg_match(&arg, &kf_min_dist, argi)) { + config->cfg.kf_min_dist = arg_parse_uint(&arg); + } else if (arg_match(&arg, &kf_max_dist, argi)) { + config->cfg.kf_max_dist = arg_parse_uint(&arg); + } else if (arg_match(&arg, &kf_disabled, argi)) { + config->cfg.kf_mode = AOM_KF_DISABLED; + } else if (arg_match(&arg, &sframe_dist, argi)) { + config->cfg.sframe_dist = arg_parse_uint(&arg); + } else if (arg_match(&arg, &sframe_mode, argi)) { + config->cfg.sframe_mode = arg_parse_uint(&arg); + } else if (arg_match(&arg, &save_as_annexb, argi)) { + config->cfg.save_as_annexb = arg_parse_uint(&arg); + } else if (arg_match(&arg, &tile_width, argi)) { + config->cfg.tile_width_count = + arg_parse_list(&arg, config->cfg.tile_widths, MAX_TILE_WIDTHS); + } else if (arg_match(&arg, &tile_height, argi)) { + config->cfg.tile_height_count = + arg_parse_list(&arg, config->cfg.tile_heights, MAX_TILE_HEIGHTS); +#if CONFIG_TUNE_VMAF + } else if (arg_match(&arg, &vmaf_model_path, argi)) { + config->vmaf_model_path = arg.val; +#endif + } else if (arg_match(&arg, &use_fixed_qp_offsets, argi)) { + 
config->cfg.use_fixed_qp_offsets = arg_parse_uint(&arg); + } else if (arg_match(&arg, &fixed_qp_offsets, argi)) { + const int fixed_qp_offset_count = arg_parse_list( + &arg, config->cfg.fixed_qp_offsets, FIXED_QP_OFFSET_COUNT); + if (fixed_qp_offset_count < FIXED_QP_OFFSET_COUNT) { + die("Option --fixed-qp-offsets requires %d comma-separated values, but " + "only %d values were provided.\n", + FIXED_QP_OFFSET_COUNT, fixed_qp_offset_count); + } + config->cfg.use_fixed_qp_offsets = 1; + } else if (global->usage == AOM_USAGE_REALTIME && + arg_match(&arg, &enable_restoration, argi)) { + if (arg_parse_uint(&arg) == 1) { + warn("non-zero %s option ignored in realtime mode.\n", arg.name); + } + } else { + int i, match = 0; + for (i = 0; ctrl_args[i]; i++) { + if (arg_match(&arg, ctrl_args[i], argi)) { + match = 1; + if (ctrl_args_map) { + set_config_arg_ctrls(config, ctrl_args_map[i], &arg); + } + } + } + if (!match) argj++; + } + } + config->use_16bit_internal = + config->cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING; + return eos_mark_found; +} + +#define FOREACH_STREAM(iterator, list) \ + for (struct stream_state *iterator = list; iterator; \ + iterator = iterator->next) + +static void validate_stream_config(const struct stream_state *stream, + const struct AvxEncoderConfig *global) { + const struct stream_state *streami; + (void)global; + + if (!stream->config.cfg.g_w || !stream->config.cfg.g_h) + fatal( + "Stream %d: Specify stream dimensions with --width (-w) " + "and --height (-h)", + stream->index); + + /* Even if the bit depth set on the command line is lower, it is + * upgraded to at least match the input bit depth. + */ + assert(stream->config.cfg.g_input_bit_depth <= + (unsigned int)stream->config.cfg.g_bit_depth); + + for (streami = stream; streami; streami = streami->next) { + /* All streams require output files */ + if (!streami->config.out_fn) + fatal("Stream %d: Output file is required (specify with -o)", + streami->index); + + /* Check for two streams outputting to the same file */ + if (streami != stream) { + const char *a = stream->config.out_fn; + const char *b = streami->config.out_fn; + if (!strcmp(a, b) && strcmp(a, "/dev/null") && strcmp(a, ":nul")) + fatal("Stream %d: duplicate output file (from stream %d)", + streami->index, stream->index); + } + + /* Check for two streams sharing a stats file.
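+ * Each stream's first pass writes to its own stats file (see + * setup_pass()), so two streams sharing one file would clobber each + * other's data.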
*/ + if (streami != stream) { + const char *a = stream->config.stats_fn; + const char *b = streami->config.stats_fn; + if (a && b && !strcmp(a, b)) + fatal("Stream %d: duplicate stats file (from stream %d)", + streami->index, stream->index); + } + } +} + +static void set_stream_dimensions(struct stream_state *stream, unsigned int w, + unsigned int h) { + if (!stream->config.cfg.g_w) { + if (!stream->config.cfg.g_h) + stream->config.cfg.g_w = w; + else + stream->config.cfg.g_w = w * stream->config.cfg.g_h / h; + } + if (!stream->config.cfg.g_h) { + stream->config.cfg.g_h = h * stream->config.cfg.g_w / w; + } +} + +static const char *file_type_to_string(enum VideoFileType t) { + switch (t) { + case FILE_TYPE_RAW: return "RAW"; + case FILE_TYPE_Y4M: return "Y4M"; + default: return "Other"; + } +} + +static const char *image_format_to_string(aom_img_fmt_t f) { + switch (f) { + case AOM_IMG_FMT_I420: return "I420"; + case AOM_IMG_FMT_I422: return "I422"; + case AOM_IMG_FMT_I444: return "I444"; + case AOM_IMG_FMT_YV12: return "YV12"; + case AOM_IMG_FMT_YV1216: return "YV1216"; + case AOM_IMG_FMT_I42016: return "I42016"; + case AOM_IMG_FMT_I42216: return "I42216"; + case AOM_IMG_FMT_I44416: return "I44416"; + default: return "Other"; + } +} + +static void show_stream_config(struct stream_state *stream, + struct AvxEncoderConfig *global, + struct AvxInputContext *input) { +#define SHOW(field) \ + fprintf(stderr, " %-28s = %d\n", #field, stream->config.cfg.field) + + if (stream->index == 0) { + fprintf(stderr, "Codec: %s\n", + aom_codec_iface_name(global->codec->codec_interface())); + fprintf(stderr, "Source file: %s File Type: %s Format: %s\n", + input->filename, file_type_to_string(input->file_type), + image_format_to_string(input->fmt)); + } + if (stream->next || stream->index) + fprintf(stderr, "\nStream Index: %d\n", stream->index); + fprintf(stderr, "Destination file: %s\n", stream->config.out_fn); + fprintf(stderr, "Coding path: %s\n", + stream->config.use_16bit_internal ? 
"HBD" : "LBD"); + fprintf(stderr, "Encoder parameters:\n"); + + SHOW(g_usage); + SHOW(g_threads); + SHOW(g_profile); + SHOW(g_w); + SHOW(g_h); + SHOW(g_bit_depth); + SHOW(g_input_bit_depth); + SHOW(g_timebase.num); + SHOW(g_timebase.den); + SHOW(g_error_resilient); + SHOW(g_pass); + SHOW(g_lag_in_frames); + SHOW(large_scale_tile); + SHOW(rc_dropframe_thresh); + SHOW(rc_resize_mode); + SHOW(rc_resize_denominator); + SHOW(rc_resize_kf_denominator); + SHOW(rc_superres_mode); + SHOW(rc_superres_denominator); + SHOW(rc_superres_kf_denominator); + SHOW(rc_superres_qthresh); + SHOW(rc_superres_kf_qthresh); + SHOW(rc_end_usage); + SHOW(rc_target_bitrate); + SHOW(rc_min_quantizer); + SHOW(rc_max_quantizer); + SHOW(rc_undershoot_pct); + SHOW(rc_overshoot_pct); + SHOW(rc_buf_sz); + SHOW(rc_buf_initial_sz); + SHOW(rc_buf_optimal_sz); + SHOW(rc_2pass_vbr_bias_pct); + SHOW(rc_2pass_vbr_minsection_pct); + SHOW(rc_2pass_vbr_maxsection_pct); + SHOW(fwd_kf_enabled); + SHOW(kf_mode); + SHOW(kf_min_dist); + SHOW(kf_max_dist); + +#define SHOW_PARAMS(field) \ + fprintf(stderr, " %-28s = %d\n", #field, \ + stream->config.cfg.encoder_cfg.field) + SHOW_PARAMS(super_block_size); + SHOW_PARAMS(max_partition_size); + SHOW_PARAMS(min_partition_size); + SHOW_PARAMS(disable_ab_partition_type); + SHOW_PARAMS(disable_rect_partition_type); + SHOW_PARAMS(disable_1to4_partition_type); + SHOW_PARAMS(disable_flip_idtx); + SHOW_PARAMS(disable_cdef); + SHOW_PARAMS(disable_lr); + SHOW_PARAMS(disable_obmc); + SHOW_PARAMS(disable_warp_motion); + SHOW_PARAMS(disable_global_motion); + SHOW_PARAMS(disable_dist_wtd_comp); + SHOW_PARAMS(disable_diff_wtd_comp); + SHOW_PARAMS(disable_inter_intra_comp); + SHOW_PARAMS(disable_masked_comp); + SHOW_PARAMS(disable_one_sided_comp); + SHOW_PARAMS(disable_palette); + SHOW_PARAMS(disable_intrabc); + SHOW_PARAMS(disable_cfl); + SHOW_PARAMS(disable_smooth_intra); + SHOW_PARAMS(disable_filter_intra); + SHOW_PARAMS(disable_dual_filter); + SHOW_PARAMS(disable_intra_angle_delta); + SHOW_PARAMS(disable_intra_edge_filter); + SHOW_PARAMS(disable_tx_64x64); + SHOW_PARAMS(disable_smooth_inter_intra); + SHOW_PARAMS(disable_inter_inter_wedge); + SHOW_PARAMS(disable_inter_intra_wedge); + SHOW_PARAMS(disable_paeth_intra); + SHOW_PARAMS(disable_trellis_quant); + SHOW_PARAMS(disable_ref_frame_mv); + SHOW_PARAMS(reduced_reference_set); + SHOW_PARAMS(reduced_tx_type_set); +} + +static void open_output_file(struct stream_state *stream, + struct AvxEncoderConfig *global, + const struct AvxRational *pixel_aspect_ratio) { + const char *fn = stream->config.out_fn; + const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg; + + if (cfg->g_pass == AOM_RC_FIRST_PASS) return; + + stream->file = strcmp(fn, "-") ? 
fopen(fn, "wb") : set_binary_mode(stdout); + + if (!stream->file) fatal("Failed to open output file"); + + if (stream->config.write_webm && fseek(stream->file, 0, SEEK_CUR)) + fatal("WebM output to pipes not supported."); + +#if CONFIG_WEBM_IO + if (stream->config.write_webm) { + stream->webm_ctx.stream = stream->file; + if (write_webm_file_header(&stream->webm_ctx, &stream->encoder, cfg, + stream->config.stereo_fmt, global->codec->fourcc, + pixel_aspect_ratio) != 0) { + fatal("WebM writer initialization failed."); + } + } +#else + (void)pixel_aspect_ratio; +#endif + + if (!stream->config.write_webm && stream->config.write_ivf) { + ivf_write_file_header(stream->file, cfg, global->codec->fourcc, 0); + } +} + +static void close_output_file(struct stream_state *stream, + unsigned int fourcc) { + const struct aom_codec_enc_cfg *const cfg = &stream->config.cfg; + + if (cfg->g_pass == AOM_RC_FIRST_PASS) return; + +#if CONFIG_WEBM_IO + if (stream->config.write_webm) { + if (write_webm_file_footer(&stream->webm_ctx) != 0) { + fatal("WebM writer finalization failed."); + } + } +#endif + + if (!stream->config.write_webm && stream->config.write_ivf) { + if (!fseek(stream->file, 0, SEEK_SET)) + ivf_write_file_header(stream->file, &stream->config.cfg, fourcc, + stream->frames_out); + } + + fclose(stream->file); +} + +static void setup_pass(struct stream_state *stream, + struct AvxEncoderConfig *global, int pass) { + if (stream->config.stats_fn) { + if (!stats_open_file(&stream->stats, stream->config.stats_fn, pass)) + fatal("Failed to open statistics store"); + } else { + if (!stats_open_mem(&stream->stats, pass)) + fatal("Failed to open statistics store"); + } + + stream->config.cfg.g_pass = global->passes == 2 + ? pass ? AOM_RC_LAST_PASS : AOM_RC_FIRST_PASS + : AOM_RC_ONE_PASS; + if (pass) { + stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats); + } + + stream->cx_time = 0; + stream->nbytes = 0; + stream->frames_out = 0; +} + +static void initialize_encoder(struct stream_state *stream, + struct AvxEncoderConfig *global) { + int i; + int flags = 0; + + flags |= global->show_psnr ? AOM_CODEC_USE_PSNR : 0; + flags |= stream->config.use_16bit_internal ? 
AOM_CODEC_USE_HIGHBITDEPTH : 0; + + /* Construct Encoder Context */ + aom_codec_enc_init(&stream->encoder, global->codec->codec_interface(), + &stream->config.cfg, flags); + ctx_exit_on_error(&stream->encoder, "Failed to initialize encoder"); + + for (i = 0; i < stream->config.arg_ctrl_cnt; i++) { + int ctrl = stream->config.arg_ctrls[i][0]; + int value = stream->config.arg_ctrls[i][1]; + if (aom_codec_control(&stream->encoder, ctrl, value)) + fprintf(stderr, "Error: Tried to set control %d = %d\n", ctrl, value); + + ctx_exit_on_error(&stream->encoder, "Failed to control codec"); + } + +#if CONFIG_TUNE_VMAF + if (stream->config.vmaf_model_path) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_VMAF_MODEL_PATH, + stream->config.vmaf_model_path); + } +#endif + + if (stream->config.film_grain_filename) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE, + stream->config.film_grain_filename); + } + +#if CONFIG_AV1_DECODER + if (global->test_decode != TEST_DECODE_OFF) { + const AvxInterface *decoder = get_aom_decoder_by_name(global->codec->name); + aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING }; + aom_codec_dec_init(&stream->decoder, decoder->codec_interface(), &cfg, 0); + + if (strcmp(global->codec->name, "av1") == 0) { + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_TILE_MODE, + stream->config.cfg.large_scale_tile); + ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_mode"); + + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1D_SET_IS_ANNEXB, + stream->config.cfg.save_as_annexb); + ctx_exit_on_error(&stream->decoder, "Failed to set is_annexb"); + + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_DECODE_TILE_ROW, + -1); + ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_row"); + + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_SET_DECODE_TILE_COL, + -1); + ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_col"); + } + } +#endif +} + +static void encode_frame(struct stream_state *stream, + struct AvxEncoderConfig *global, struct aom_image *img, + unsigned int frames_in) { + aom_codec_pts_t frame_start, next_frame_start; + struct aom_codec_enc_cfg *cfg = &stream->config.cfg; + struct aom_usec_timer timer; + + frame_start = + (cfg->g_timebase.den * (int64_t)(frames_in - 1) * global->framerate.den) / + cfg->g_timebase.num / global->framerate.num; + next_frame_start = + (cfg->g_timebase.den * (int64_t)(frames_in)*global->framerate.den) / + cfg->g_timebase.num / global->framerate.num; + + /* Scale if necessary */ + if (img) { + if ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) && + (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) { + if (img->fmt != AOM_IMG_FMT_I42016) { + fprintf(stderr, "%s can only scale 4:2:0 inputs\n", exec_name); + exit(EXIT_FAILURE); + } +#if CONFIG_LIBYUV + if (!stream->img) { + stream->img = + aom_img_alloc(NULL, AOM_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16); + } + I420Scale_16( + (uint16_t *)img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y] / 2, + (uint16_t *)img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U] / 2, + (uint16_t *)img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V] / 2, + img->d_w, img->d_h, (uint16_t *)stream->img->planes[AOM_PLANE_Y], + stream->img->stride[AOM_PLANE_Y] / 2, + (uint16_t *)stream->img->planes[AOM_PLANE_U], + stream->img->stride[AOM_PLANE_U] / 2, + (uint16_t *)stream->img->planes[AOM_PLANE_V], + stream->img->stride[AOM_PLANE_V] / 2, stream->img->d_w, + stream->img->d_h, kFilterBox); + img = stream->img; +#else + stream->encoder.err 
= 1;
+      ctx_exit_on_error(&stream->encoder,
+                        "Stream %d: Failed to encode frame.\n"
+                        "libyuv is required for scaling but is currently "
+                        "disabled.\n"
+                        "Be sure to specify -DCONFIG_LIBYUV=1 when running "
+                        "cmake.\n",
+                        stream->index);
+#endif
+    }
+  }
+  if (img && (img->d_w != cfg->g_w || img->d_h != cfg->g_h)) {
+    if (img->fmt != AOM_IMG_FMT_I420 && img->fmt != AOM_IMG_FMT_YV12) {
+      fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
+      exit(EXIT_FAILURE);
+    }
+#if CONFIG_LIBYUV
+    if (!stream->img)
+      stream->img =
+          aom_img_alloc(NULL, AOM_IMG_FMT_I420, cfg->g_w, cfg->g_h, 16);
+    I420Scale(
+        img->planes[AOM_PLANE_Y], img->stride[AOM_PLANE_Y],
+        img->planes[AOM_PLANE_U], img->stride[AOM_PLANE_U],
+        img->planes[AOM_PLANE_V], img->stride[AOM_PLANE_V], img->d_w, img->d_h,
+        stream->img->planes[AOM_PLANE_Y], stream->img->stride[AOM_PLANE_Y],
+        stream->img->planes[AOM_PLANE_U], stream->img->stride[AOM_PLANE_U],
+        stream->img->planes[AOM_PLANE_V], stream->img->stride[AOM_PLANE_V],
+        stream->img->d_w, stream->img->d_h, kFilterBox);
+    img = stream->img;
+#else
+    stream->encoder.err = 1;
+    ctx_exit_on_error(&stream->encoder,
+                      "Stream %d: Failed to encode frame.\n"
+                      "libyuv is required for scaling but is currently "
+                      "disabled.\n"
+                      "Be sure to specify -DCONFIG_LIBYUV=1 when running "
+                      "cmake.\n",
+                      stream->index);
+#endif
+  }
+
+  aom_usec_timer_start(&timer);
+  aom_codec_encode(&stream->encoder, img, frame_start,
+                   (uint32_t)(next_frame_start - frame_start), 0);
+  aom_usec_timer_mark(&timer);
+  stream->cx_time += aom_usec_timer_elapsed(&timer);
+  ctx_exit_on_error(&stream->encoder, "Stream %d: Failed to encode frame",
+                    stream->index);
+}
+
+static void update_quantizer_histogram(struct stream_state *stream) {
+  if (stream->config.cfg.g_pass != AOM_RC_FIRST_PASS) {
+    int q;
+
+    AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AOME_GET_LAST_QUANTIZER_64,
+                                  &q);
+    ctx_exit_on_error(&stream->encoder, "Failed to read quantizer");
+    stream->counts[q]++;
+  }
+}
+
+static void get_cx_data(struct stream_state *stream,
+                        struct AvxEncoderConfig *global, int *got_data) {
+  const aom_codec_cx_pkt_t *pkt;
+  const struct aom_codec_enc_cfg *cfg = &stream->config.cfg;
+  aom_codec_iter_t iter = NULL;
+
+  *got_data = 0;
+  while ((pkt = aom_codec_get_cx_data(&stream->encoder, &iter))) {
+    static size_t fsize = 0;
+    static FileOffset ivf_header_pos = 0;
+
+    switch (pkt->kind) {
+      case AOM_CODEC_CX_FRAME_PKT:
+        ++stream->frames_out;
+        if (!global->quiet)
+          fprintf(stderr, " %6luF", (unsigned long)pkt->data.frame.sz);
+
+        update_rate_histogram(stream->rate_hist, cfg, pkt);
+#if CONFIG_WEBM_IO
+        if (stream->config.write_webm) {
+          if (write_webm_block(&stream->webm_ctx, cfg, pkt) != 0) {
+            fatal("WebM writer failed.");
+          }
+        }
+#endif
+        if (!stream->config.write_webm) {
+          if (stream->config.write_ivf) {
+            if (pkt->data.frame.partition_id <= 0) {
+              ivf_header_pos = ftello(stream->file);
+              fsize = pkt->data.frame.sz;
+
+              ivf_write_frame_header(stream->file, pkt->data.frame.pts, fsize);
+            } else {
+              fsize += pkt->data.frame.sz;
+
+              const FileOffset currpos = ftello(stream->file);
+              fseeko(stream->file, ivf_header_pos, SEEK_SET);
+              ivf_write_frame_size(stream->file, fsize);
+              fseeko(stream->file, currpos, SEEK_SET);
+            }
+          }
+
+          (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
+                       stream->file);
+        }
+        stream->nbytes += pkt->data.raw.sz;
+
+        *got_data = 1;
+#if CONFIG_AV1_DECODER
+        if (global->test_decode != TEST_DECODE_OFF && !stream->mismatch_seen) {
+          aom_codec_decode(&stream->decoder, pkt->data.frame.buf,
+                           pkt->data.frame.sz, NULL);
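+          // A decode failure here sets stream->decoder.err; it is checked just
+          // below, and the frame index is recorded in mismatch_seen so that
+          // test decoding is skipped for the rest of this stream.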
+ if (stream->decoder.err) { + warn_or_exit_on_error(&stream->decoder, + global->test_decode == TEST_DECODE_FATAL, + "Failed to decode frame %d in stream %d", + stream->frames_out + 1, stream->index); + stream->mismatch_seen = stream->frames_out + 1; + } + } +#endif + break; + case AOM_CODEC_STATS_PKT: + stream->frames_out++; + stats_write(&stream->stats, pkt->data.twopass_stats.buf, + pkt->data.twopass_stats.sz); + stream->nbytes += pkt->data.raw.sz; + break; + case AOM_CODEC_PSNR_PKT: + + if (global->show_psnr) { + int i; + + stream->psnr_sse_total += pkt->data.psnr.sse[0]; + stream->psnr_samples_total += pkt->data.psnr.samples[0]; + for (i = 0; i < 4; i++) { + if (!global->quiet) + fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]); + stream->psnr_totals[i] += pkt->data.psnr.psnr[i]; + } + stream->psnr_count++; + } + + break; + default: break; + } + } +} + +static void show_psnr(struct stream_state *stream, double peak, int64_t bps) { + int i; + double ovpsnr; + + if (!stream->psnr_count) return; + + fprintf(stderr, "Stream %d PSNR (Overall/Avg/Y/U/V)", stream->index); + ovpsnr = sse_to_psnr((double)stream->psnr_samples_total, peak, + (double)stream->psnr_sse_total); + fprintf(stderr, " %.3f", ovpsnr); + + for (i = 0; i < 4; i++) { + fprintf(stderr, " %.3f", stream->psnr_totals[i] / stream->psnr_count); + } + if (bps > 0) { + fprintf(stderr, " %7" PRId64 " bps", bps); + } + fprintf(stderr, " %7" PRId64 " ms", stream->cx_time / 1000); + fprintf(stderr, "\n"); +} + +static float usec_to_fps(uint64_t usec, unsigned int frames) { + return (float)(usec > 0 ? frames * 1000000.0 / (float)usec : 0); +} + +static void test_decode(struct stream_state *stream, + enum TestDecodeFatality fatal) { + aom_image_t enc_img, dec_img; + + if (stream->mismatch_seen) return; + + /* Get the internal reference frame */ + AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1_GET_NEW_FRAME_IMAGE, + &enc_img); + AOM_CODEC_CONTROL_TYPECHECKED(&stream->decoder, AV1_GET_NEW_FRAME_IMAGE, + &dec_img); + + if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t enc_hbd_img; + aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img); + enc_img = enc_hbd_img; + } + if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t dec_hbd_img; + aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, dec_img.d_h, 16); + aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img); + dec_img = dec_hbd_img; + } + } + + ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame"); + ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame"); + + if (!aom_compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + aom_find_mismatch(&enc_img, &dec_img, y, u, v); + } + stream->decoder.err = 1; + warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL, + "Stream %d: Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}", + stream->index, stream->frames_out, y[0], y[1], y[2], + y[3], u[0], u[1], u[2], u[3], v[0], v[1], v[2], v[3]); + stream->mismatch_seen = stream->frames_out; + } + + aom_img_free(&enc_img); + aom_img_free(&dec_img); +} + +static void print_time(const char *label, int64_t etl) { + int64_t hours; + int64_t 
mins;
+  int64_t secs;
+
+  if (etl >= 0) {
+    hours = etl / 3600;
+    etl -= hours * 3600;
+    mins = etl / 60;
+    etl -= mins * 60;
+    secs = etl;
+
+    fprintf(stderr, "[%3s %2" PRId64 ":%02" PRId64 ":%02" PRId64 "] ", label,
+            hours, mins, secs);
+  } else {
+    fprintf(stderr, "[%3s unknown] ", label);
+  }
+}
+
+int main(int argc, const char **argv_) {
+  int pass;
+  aom_image_t raw;
+  aom_image_t raw_shift;
+  int allocated_raw_shift = 0;
+  int use_16bit_internal = 0;
+  int input_shift = 0;
+  int frame_avail, got_data;
+
+  struct AvxInputContext input;
+  struct AvxEncoderConfig global;
+  struct stream_state *streams = NULL;
+  char **argv, **argi;
+  uint64_t cx_time = 0;
+  int stream_cnt = 0;
+  int res = 0;
+  int profile_updated = 0;
+
+  memset(&input, 0, sizeof(input));
+  exec_name = argv_[0];
+
+  /* Setup default input stream settings */
+  input.framerate.numerator = 30;
+  input.framerate.denominator = 1;
+  input.only_i420 = 1;
+  input.bit_depth = 0;
+
+  /* First parse the global configuration values, because we want to apply
+   * other parameters on top of the default configuration provided by the
+   * codec.
+   */
+  argv = argv_dup(argc - 1, argv_ + 1);
+  parse_global_config(&global, &argv);
+
+  if (argc < 2) usage_exit();
+
+  switch (global.color_type) {
+    case I420: input.fmt = AOM_IMG_FMT_I420; break;
+    case I422: input.fmt = AOM_IMG_FMT_I422; break;
+    case I444: input.fmt = AOM_IMG_FMT_I444; break;
+    case YV12: input.fmt = AOM_IMG_FMT_YV12; break;
+  }
+
+  {
+    /* Now parse each stream's parameters. Using a local scope here
+     * due to the use of 'stream' as loop variable in FOREACH_STREAM
+     * loops
+     */
+    struct stream_state *stream = NULL;
+
+    do {
+      stream = new_stream(&global, stream);
+      stream_cnt++;
+      if (!streams) streams = stream;
+    } while (parse_stream_params(&global, stream, argv));
+  }
+
+  /* Check for unrecognized options */
+  for (argi = argv; *argi; argi++)
+    if (argi[0][0] == '-' && argi[0][1])
+      die("Error: Unrecognized option %s\n", *argi);
+
+  FOREACH_STREAM(stream, streams) {
+    check_encoder_config(global.disable_warning_prompt, &global,
+                         &stream->config.cfg);
+
+    // If large_scale_tile = 1, only IVF output is supported.
+    if (stream->config.cfg.large_scale_tile && !stream->config.write_ivf)
+      die("Only the IVF output format is supported when large-scale-tile=1\n");
+  }
+
+  /* Handle non-option arguments */
+  input.filename = argv[0];
+
+  if (!input.filename) {
+    fprintf(stderr, "No input file specified!\n");
+    usage_exit();
+  }
+
+  /* Decide if chroma subsamplings other than 4:2:0 are supported */
+  if (global.codec->fourcc == AV1_FOURCC) input.only_i420 = 0;
+
+  for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
+    int frames_in = 0, seen_frames = 0;
+    int64_t estimated_time_left = -1;
+    int64_t average_rate = -1;
+    int64_t lagged_count = 0;
+
+    open_input_file(&input, global.csp);
+
+    /* If the input file doesn't specify its w/h (raw files), try to get
+     * the data from the first stream's configuration.
+     */
+    if (!input.width || !input.height) {
+      FOREACH_STREAM(stream, streams) {
+        if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
+          input.width = stream->config.cfg.g_w;
+          input.height = stream->config.cfg.g_h;
+          break;
+        }
+      };
+    }
+
+    /* Update stream configurations from the input file's parameters */
+    if (!input.width || !input.height)
+      fatal(
+          "Specify stream dimensions with --width (-w) "
+          "and --height (-h)");
+
+    /* If the input file does not specify bit-depth but the input-bit-depth
+     * parameter exists, assume that to be the input bit-depth. However, if the
+     * input-bit-depth parameter does not exist, assume the input bit-depth
+     * to be the same as the codec bit-depth.
+     */
+    if (!input.bit_depth) {
+      FOREACH_STREAM(stream, streams) {
+        if (stream->config.cfg.g_input_bit_depth)
+          input.bit_depth = stream->config.cfg.g_input_bit_depth;
+        else
+          input.bit_depth = stream->config.cfg.g_input_bit_depth =
+              (int)stream->config.cfg.g_bit_depth;
+      }
+      if (input.bit_depth > 8) input.fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+    } else {
+      FOREACH_STREAM(stream, streams) {
+        stream->config.cfg.g_input_bit_depth = input.bit_depth;
+      }
+    }
+
+    FOREACH_STREAM(stream, streams) {
+      if (input.fmt != AOM_IMG_FMT_I420 && input.fmt != AOM_IMG_FMT_I42016) {
+        /* Automatically upgrade if input is non-4:2:0 but a 4:2:0 profile
+           was selected. */
+        switch (stream->config.cfg.g_profile) {
+          case 0:
+            if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 ||
+                                         input.fmt == AOM_IMG_FMT_I44416)) {
+              if (!stream->config.cfg.monochrome) {
+                stream->config.cfg.g_profile = 1;
+                profile_updated = 1;
+              }
+            } else if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 ||
+                       input.fmt == AOM_IMG_FMT_I42216) {
+              stream->config.cfg.g_profile = 2;
+              profile_updated = 1;
+            }
+            break;
+          case 1:
+            if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 ||
+                input.fmt == AOM_IMG_FMT_I42216) {
+              stream->config.cfg.g_profile = 2;
+              profile_updated = 1;
+            } else if (input.bit_depth < 12 &&
+                       (input.fmt == AOM_IMG_FMT_I420 ||
+                        input.fmt == AOM_IMG_FMT_I42016)) {
+              stream->config.cfg.g_profile = 0;
+              profile_updated = 1;
+            }
+            break;
+          case 2:
+            if (input.bit_depth < 12 && (input.fmt == AOM_IMG_FMT_I444 ||
+                                         input.fmt == AOM_IMG_FMT_I44416)) {
+              stream->config.cfg.g_profile = 1;
+              profile_updated = 1;
+            } else if (input.bit_depth < 12 &&
+                       (input.fmt == AOM_IMG_FMT_I420 ||
+                        input.fmt == AOM_IMG_FMT_I42016)) {
+              stream->config.cfg.g_profile = 0;
+              profile_updated = 1;
+            } else if (input.bit_depth == 12 &&
+                       input.file_type == FILE_TYPE_Y4M) {
+              // Note that here the input file values for chroma subsampling
+              // are used instead of those from the command line.
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_X,
+                                            input.y4m.dst_c_dec_h >> 1);
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_Y,
+                                            input.y4m.dst_c_dec_v >> 1);
+            } else if (input.bit_depth == 12 &&
+                       input.file_type == FILE_TYPE_RAW) {
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_X,
+                                            stream->chroma_subsampling_x);
+              AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                            AV1E_SET_CHROMA_SUBSAMPLING_Y,
+                                            stream->chroma_subsampling_y);
+            }
+            break;
+          default: break;
+        }
+      }
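+      // In short (a summary of the switch above, for illustration): 4:4:4
+      // input below 12 bit selects profile 1, 4:2:2 input or any 12-bit input
+      // selects profile 2, and 4:2:0 input below 12 bit falls back to
+      // profile 0.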
+      /* Automatically set the codec bit depth to match the input bit depth.
+       * Upgrade the profile if required.
+       */
+      if (stream->config.cfg.g_input_bit_depth >
+          (unsigned int)stream->config.cfg.g_bit_depth) {
+        stream->config.cfg.g_bit_depth = stream->config.cfg.g_input_bit_depth;
+        if (!global.quiet) {
+          fprintf(stderr,
+                  "Warning: automatically updating bit depth to %d to "
+                  "match input format.\n",
+                  stream->config.cfg.g_input_bit_depth);
+        }
+      }
+      if (stream->config.cfg.g_bit_depth > 10) {
+        switch (stream->config.cfg.g_profile) {
+          case 0:
+          case 1:
+            stream->config.cfg.g_profile = 2;
+            profile_updated = 1;
+            break;
+          default: break;
+        }
+      }
+      if (stream->config.cfg.g_bit_depth > 8) {
+        stream->config.use_16bit_internal = 1;
+      }
+      if (profile_updated && !global.quiet) {
+        fprintf(stderr,
+                "Warning: automatically updating to profile %d to "
+                "match input format.\n",
+                stream->config.cfg.g_profile);
+      }
+      /* Set limit */
+      stream->config.cfg.g_limit = global.limit;
+    }
+
+    FOREACH_STREAM(stream, streams) {
+      set_stream_dimensions(stream, input.width, input.height);
+    }
+    FOREACH_STREAM(stream, streams) { validate_stream_config(stream, &global); }
+
+    /* Ensure that --passes and --pass are consistent. If --pass is set and
+     * --passes=2, ensure --fpf was set.
+     */
+    if (global.pass && global.passes == 2) {
+      FOREACH_STREAM(stream, streams) {
+        if (!stream->config.stats_fn)
+          die("Stream %d: Must specify --fpf when --pass=%d"
+              " and --passes=2\n",
+              stream->index, global.pass);
+      }
+    }
+
+#if !CONFIG_WEBM_IO
+    FOREACH_STREAM(stream, streams) {
+      if (stream->config.write_webm) {
+        stream->config.write_webm = 0;
+        stream->config.write_ivf = 0;
+        warn("aomenc compiled w/o WebM support. Writing OBU stream.");
+      }
+    }
+#endif
+
+    /* Use the frame rate from the file only if none was specified
+     * on the command-line.
+     */
+    if (!global.have_framerate) {
+      global.framerate.num = input.framerate.numerator;
+      global.framerate.den = input.framerate.denominator;
+    }
+    FOREACH_STREAM(stream, streams) {
+      stream->config.cfg.g_timebase.den = global.framerate.num;
+      stream->config.cfg.g_timebase.num = global.framerate.den;
+    }
+    /* Show configuration */
+    if (global.verbose && pass == 0) {
+      FOREACH_STREAM(stream, streams) {
+        show_stream_config(stream, &global, &input);
+      }
+    }
+
+    if (pass == (global.pass ? global.pass - 1 : 0)) {
+      if (input.file_type == FILE_TYPE_Y4M)
+        /* The Y4M reader does its own allocation.
+           Just initialize this here to avoid problems if we never read any
+           frames. */
+        memset(&raw, 0, sizeof(raw));
+      else
+        aom_img_alloc(&raw, input.fmt, input.width, input.height, 32);
+
+      FOREACH_STREAM(stream, streams) {
+        stream->rate_hist =
+            init_rate_histogram(&stream->config.cfg, &global.framerate);
+      }
+    }
+
+    FOREACH_STREAM(stream, streams) { setup_pass(stream, &global, pass); }
+    FOREACH_STREAM(stream, streams) { initialize_encoder(stream, &global); }
+    FOREACH_STREAM(stream, streams) {
+      open_output_file(stream, &global, &input.pixel_aspect_ratio);
+    }
+
+    if (strcmp(global.codec->name, "av1") == 0) {
+      // Check to see if at least one stream uses 16 bit internal.
+      // Currently assume that the bit_depths for all streams using
+      // highbitdepth are the same.
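+      // For example (values for illustration only): g_bit_depth = 10 with
+      // g_input_bit_depth = 8 gives input_shift = 2 below, so each 8-bit
+      // input sample is up-shifted by two bits before encoding.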
+ FOREACH_STREAM(stream, streams) { + if (stream->config.use_16bit_internal) { + use_16bit_internal = 1; + } + input_shift = (int)stream->config.cfg.g_bit_depth - + stream->config.cfg.g_input_bit_depth; + }; + } + + frame_avail = 1; + got_data = 0; + + while (frame_avail || got_data) { + struct aom_usec_timer timer; + + if (!global.limit || frames_in < global.limit) { + frame_avail = read_frame(&input, &raw); + + if (frame_avail) frames_in++; + seen_frames = + frames_in > global.skip_frames ? frames_in - global.skip_frames : 0; + + if (!global.quiet) { + float fps = usec_to_fps(cx_time, seen_frames); + fprintf(stderr, "\rPass %d/%d ", pass + 1, global.passes); + + if (stream_cnt == 1) + fprintf(stderr, "frame %4d/%-4d %7" PRId64 "B ", frames_in, + streams->frames_out, (int64_t)streams->nbytes); + else + fprintf(stderr, "frame %4d ", frames_in); + + fprintf(stderr, "%7" PRId64 " %s %.2f %s ", + cx_time > 9999999 ? cx_time / 1000 : cx_time, + cx_time > 9999999 ? "ms" : "us", fps >= 1.0 ? fps : fps * 60, + fps >= 1.0 ? "fps" : "fpm"); + print_time("ETA", estimated_time_left); + } + + } else { + frame_avail = 0; + } + + if (frames_in > global.skip_frames) { + aom_image_t *frame_to_encode; + if (input_shift || (use_16bit_internal && input.bit_depth == 8)) { + assert(use_16bit_internal); + // Input bit depth and stream bit depth do not match, so up + // shift frame to stream bit depth + if (!allocated_raw_shift) { + aom_img_alloc(&raw_shift, raw.fmt | AOM_IMG_FMT_HIGHBITDEPTH, + input.width, input.height, 32); + allocated_raw_shift = 1; + } + aom_img_upshift(&raw_shift, &raw, input_shift); + frame_to_encode = &raw_shift; + } else { + frame_to_encode = &raw; + } + aom_usec_timer_start(&timer); + if (use_16bit_internal) { + assert(frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH); + FOREACH_STREAM(stream, streams) { + if (stream->config.use_16bit_internal) + encode_frame(stream, &global, + frame_avail ? frame_to_encode : NULL, frames_in); + else + assert(0); + }; + } else { + assert((frame_to_encode->fmt & AOM_IMG_FMT_HIGHBITDEPTH) == 0); + FOREACH_STREAM(stream, streams) { + encode_frame(stream, &global, frame_avail ? frame_to_encode : NULL, + frames_in); + } + } + aom_usec_timer_mark(&timer); + cx_time += aom_usec_timer_elapsed(&timer); + + FOREACH_STREAM(stream, streams) { update_quantizer_histogram(stream); } + + got_data = 0; + FOREACH_STREAM(stream, streams) { + get_cx_data(stream, &global, &got_data); + } + + if (!got_data && input.length && streams != NULL && + !streams->frames_out) { + lagged_count = global.limit ? seen_frames : ftello(input.file); + } else if (input.length) { + int64_t remaining; + int64_t rate; + + if (global.limit) { + const int64_t frame_in_lagged = (seen_frames - lagged_count) * 1000; + + rate = cx_time ? frame_in_lagged * (int64_t)1000000 / cx_time : 0; + remaining = 1000 * (global.limit - global.skip_frames - + seen_frames + lagged_count); + } else { + const int64_t input_pos = ftello(input.file); + const int64_t input_pos_lagged = input_pos - lagged_count; + const int64_t input_limit = input.length; + + rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0; + remaining = input_limit - input_pos + lagged_count; + } + + average_rate = + (average_rate <= 0) ? rate : (average_rate * 7 + rate) / 8; + estimated_time_left = average_rate ? 
remaining / average_rate : -1; + } + + if (got_data && global.test_decode != TEST_DECODE_OFF) { + FOREACH_STREAM(stream, streams) { + test_decode(stream, global.test_decode); + } + } + } + + fflush(stdout); + if (!global.quiet) fprintf(stderr, "\033[K"); + } + + if (stream_cnt > 1) fprintf(stderr, "\n"); + + if (!global.quiet) { + FOREACH_STREAM(stream, streams) { + const int64_t bpf = + seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0; + const int64_t bps = bpf * global.framerate.num / global.framerate.den; + fprintf(stderr, + "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 + "b/f %7" PRId64 + "b/s" + " %7" PRId64 " %s (%.2f fps)\033[K\n", + pass + 1, global.passes, frames_in, stream->frames_out, + (int64_t)stream->nbytes, bpf, bps, + stream->cx_time > 9999999 ? stream->cx_time / 1000 + : stream->cx_time, + stream->cx_time > 9999999 ? "ms" : "us", + usec_to_fps(stream->cx_time, seen_frames)); + } + } + + if (global.show_psnr) { + if (global.codec->fourcc == AV1_FOURCC) { + FOREACH_STREAM(stream, streams) { + int64_t bps = 0; + if (stream->psnr_count && seen_frames && global.framerate.den) { + bps = (int64_t)stream->nbytes * 8 * (int64_t)global.framerate.num / + global.framerate.den / seen_frames; + } + show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1, + bps); + } + } else { + FOREACH_STREAM(stream, streams) { show_psnr(stream, 255.0, 0); } + } + } + + FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->encoder); } + + if (global.test_decode != TEST_DECODE_OFF) { + FOREACH_STREAM(stream, streams) { aom_codec_destroy(&stream->decoder); } + } + + close_input_file(&input); + + if (global.test_decode == TEST_DECODE_FATAL) { + FOREACH_STREAM(stream, streams) { res |= stream->mismatch_seen; } + } + FOREACH_STREAM(stream, streams) { + close_output_file(stream, global.codec->fourcc); + } + + FOREACH_STREAM(stream, streams) { + stats_close(&stream->stats, global.passes - 1); + } + + if (global.pass) break; + } + + if (global.show_q_hist_buckets) { + FOREACH_STREAM(stream, streams) { + show_q_histogram(stream->counts, global.show_q_hist_buckets); + } + } + + if (global.show_rate_hist_buckets) { + FOREACH_STREAM(stream, streams) { + show_rate_histogram(stream->rate_hist, &stream->config.cfg, + global.show_rate_hist_buckets); + } + } + FOREACH_STREAM(stream, streams) { destroy_rate_histogram(stream->rate_hist); } + +#if CONFIG_INTERNAL_STATS + /* TODO(jkoleszar): This doesn't belong in this executable. Do it for now, + * to match some existing utilities. + */ + if (!(global.pass == 1 && global.passes == 2)) { + FOREACH_STREAM(stream, streams) { + FILE *f = fopen("opsnr.stt", "a"); + if (stream->mismatch_seen) { + fprintf(f, "First mismatch occurred in frame %d\n", + stream->mismatch_seen); + } else { + fprintf(f, "No mismatch detected in recon buffers\n"); + } + fclose(f); + } + } +#endif + + if (allocated_raw_shift) aom_img_free(&raw_shift); + aom_img_free(&raw); + free(argv); + free(streams); + return res ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/libs/libaom/src/apps/aomenc.h b/libs/libaom/src/apps/aomenc.h new file mode 100644 index 000000000..a38258b87 --- /dev/null +++ b/libs/libaom/src/apps/aomenc.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_APPS_AOMENC_H_ +#define AOM_APPS_AOMENC_H_ + +#include "aom/aom_codec.h" +#include "aom/aom_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum TestDecodeFatality { + TEST_DECODE_OFF, + TEST_DECODE_FATAL, + TEST_DECODE_WARN, +}; + +typedef enum { + I420, // 4:2:0 8+ bit-depth + I422, // 4:2:2 8+ bit-depth + I444, // 4:4:4 8+ bit-depth + YV12, // 4:2:0 with uv flipped, only 8-bit depth +} ColorInputType; + +struct AvxInterface; + +/* Configuration elements common to all streams. */ +struct AvxEncoderConfig { + const struct AvxInterface *codec; + int passes; + int pass; + unsigned int usage; + ColorInputType color_type; + int quiet; + int verbose; + int limit; + int skip_frames; + int show_psnr; + enum TestDecodeFatality test_decode; + int have_framerate; + struct aom_rational framerate; + int debug; + int show_q_hist_buckets; + int show_rate_hist_buckets; + int disable_warnings; + int disable_warning_prompt; + int experimental_bitstream; + aom_chroma_sample_position_t csp; + cfg_options_t encoder_config; +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_APPS_AOMENC_H_ diff --git a/libs/libaom/src/av1/av1.cmake b/libs/libaom/src/av1/av1.cmake new file mode 100644 index 000000000..2ab349630 --- /dev/null +++ b/libs/libaom/src/av1/av1.cmake @@ -0,0 +1,580 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
+# +if(AOM_AV1_AV1_CMAKE_) + return() +endif() # AOM_AV1_AV1_CMAKE_ +set(AOM_AV1_AV1_CMAKE_ 1) + +list(APPEND AOM_AV1_COMMON_SOURCES + "${AOM_ROOT}/av1/av1_iface_common.h" + "${AOM_ROOT}/av1/common/alloccommon.c" + "${AOM_ROOT}/av1/common/alloccommon.h" + "${AOM_ROOT}/av1/common/av1_common_int.h" + "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c" + "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h" + "${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h" + "${AOM_ROOT}/av1/common/av1_inv_txfm2d.c" + "${AOM_ROOT}/av1/common/av1_loopfilter.c" + "${AOM_ROOT}/av1/common/av1_loopfilter.h" + "${AOM_ROOT}/av1/common/av1_txfm.c" + "${AOM_ROOT}/av1/common/av1_txfm.h" + "${AOM_ROOT}/av1/common/blockd.c" + "${AOM_ROOT}/av1/common/blockd.h" + "${AOM_ROOT}/av1/common/cdef.c" + "${AOM_ROOT}/av1/common/cdef.h" + "${AOM_ROOT}/av1/common/cdef_block.c" + "${AOM_ROOT}/av1/common/cdef_block.h" + "${AOM_ROOT}/av1/common/cfl.c" + "${AOM_ROOT}/av1/common/cfl.h" + "${AOM_ROOT}/av1/common/common.h" + "${AOM_ROOT}/av1/common/common_data.h" + "${AOM_ROOT}/av1/common/convolve.c" + "${AOM_ROOT}/av1/common/convolve.h" + "${AOM_ROOT}/av1/common/debugmodes.c" + "${AOM_ROOT}/av1/common/entropy.c" + "${AOM_ROOT}/av1/common/entropy.h" + "${AOM_ROOT}/av1/common/entropymode.c" + "${AOM_ROOT}/av1/common/entropymode.h" + "${AOM_ROOT}/av1/common/entropymv.c" + "${AOM_ROOT}/av1/common/entropymv.h" + "${AOM_ROOT}/av1/common/enums.h" + "${AOM_ROOT}/av1/common/filter.h" + "${AOM_ROOT}/av1/common/frame_buffers.c" + "${AOM_ROOT}/av1/common/frame_buffers.h" + "${AOM_ROOT}/av1/common/idct.c" + "${AOM_ROOT}/av1/common/idct.h" + "${AOM_ROOT}/av1/common/mv.h" + "${AOM_ROOT}/av1/common/mvref_common.c" + "${AOM_ROOT}/av1/common/mvref_common.h" + "${AOM_ROOT}/av1/common/obu_util.c" + "${AOM_ROOT}/av1/common/obu_util.h" + "${AOM_ROOT}/av1/common/odintrin.c" + "${AOM_ROOT}/av1/common/odintrin.h" + "${AOM_ROOT}/av1/common/pred_common.c" + "${AOM_ROOT}/av1/common/pred_common.h" + "${AOM_ROOT}/av1/common/quant_common.c" + "${AOM_ROOT}/av1/common/quant_common.h" + "${AOM_ROOT}/av1/common/reconinter.c" + "${AOM_ROOT}/av1/common/reconinter.h" + "${AOM_ROOT}/av1/common/reconintra.c" + "${AOM_ROOT}/av1/common/reconintra.h" + "${AOM_ROOT}/av1/common/resize.c" + "${AOM_ROOT}/av1/common/resize.h" + "${AOM_ROOT}/av1/common/restoration.c" + "${AOM_ROOT}/av1/common/restoration.h" + "${AOM_ROOT}/av1/common/scale.c" + "${AOM_ROOT}/av1/common/scale.h" + "${AOM_ROOT}/av1/common/scan.c" + "${AOM_ROOT}/av1/common/scan.h" + "${AOM_ROOT}/av1/common/seg_common.c" + "${AOM_ROOT}/av1/common/seg_common.h" + "${AOM_ROOT}/av1/common/thread_common.c" + "${AOM_ROOT}/av1/common/thread_common.h" + "${AOM_ROOT}/av1/common/tile_common.c" + "${AOM_ROOT}/av1/common/tile_common.h" + "${AOM_ROOT}/av1/common/timing.c" + "${AOM_ROOT}/av1/common/timing.h" + "${AOM_ROOT}/av1/common/token_cdfs.h" + "${AOM_ROOT}/av1/common/txb_common.c" + "${AOM_ROOT}/av1/common/txb_common.h" + "${AOM_ROOT}/av1/common/warped_motion.c" + "${AOM_ROOT}/av1/common/warped_motion.h") + +if(CONFIG_LPF_MASK) + list(APPEND AOM_AV1_COMMON_SOURCES "${AOM_ROOT}/av1/common/loopfiltermask.c") +endif() + +list(APPEND AOM_AV1_DECODER_SOURCES + "${AOM_ROOT}/av1/av1_dx_iface.c" + "${AOM_ROOT}/av1/decoder/decodeframe.c" + "${AOM_ROOT}/av1/decoder/decodeframe.h" + "${AOM_ROOT}/av1/decoder/decodemv.c" + "${AOM_ROOT}/av1/decoder/decodemv.h" + "${AOM_ROOT}/av1/decoder/decoder.c" + "${AOM_ROOT}/av1/decoder/decoder.h" + "${AOM_ROOT}/av1/decoder/decodetxb.c" + "${AOM_ROOT}/av1/decoder/decodetxb.h" + "${AOM_ROOT}/av1/decoder/detokenize.c" + 
"${AOM_ROOT}/av1/decoder/detokenize.h" + "${AOM_ROOT}/av1/decoder/dthread.h" + "${AOM_ROOT}/av1/decoder/obu.h" + "${AOM_ROOT}/av1/decoder/obu.c") + +list(APPEND AOM_AV1_ENCODER_SOURCES + "${AOM_ROOT}/av1/av1_cx_iface.c" + "${AOM_ROOT}/av1/encoder/aq_complexity.c" + "${AOM_ROOT}/av1/encoder/aq_complexity.h" + "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c" + "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h" + "${AOM_ROOT}/av1/encoder/aq_variance.c" + "${AOM_ROOT}/av1/encoder/aq_variance.h" + "${AOM_ROOT}/av1/encoder/enc_enums.h" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.c" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d_cfg.h" + "${AOM_ROOT}/av1/encoder/av1_fwd_txfm2d.c" + "${AOM_ROOT}/av1/encoder/av1_multi_thread.c" + "${AOM_ROOT}/av1/encoder/av1_multi_thread.h" + "${AOM_ROOT}/av1/encoder/av1_quantize.c" + "${AOM_ROOT}/av1/encoder/av1_quantize.h" + "${AOM_ROOT}/av1/encoder/bitstream.c" + "${AOM_ROOT}/av1/encoder/bitstream.h" + "${AOM_ROOT}/av1/encoder/block.h" + "${AOM_ROOT}/av1/encoder/cnn.c" + "${AOM_ROOT}/av1/encoder/cnn.h" + "${AOM_ROOT}/av1/encoder/compound_type.c" + "${AOM_ROOT}/av1/encoder/compound_type.h" + "${AOM_ROOT}/av1/encoder/context_tree.c" + "${AOM_ROOT}/av1/encoder/context_tree.h" + "${AOM_ROOT}/av1/encoder/corner_detect.c" + "${AOM_ROOT}/av1/encoder/corner_detect.h" + "${AOM_ROOT}/av1/encoder/corner_match.c" + "${AOM_ROOT}/av1/encoder/corner_match.h" + "${AOM_ROOT}/av1/encoder/cost.c" + "${AOM_ROOT}/av1/encoder/cost.h" + "${AOM_ROOT}/av1/encoder/encodeframe.c" + "${AOM_ROOT}/av1/encoder/encodeframe.h" + "${AOM_ROOT}/av1/encoder/encodemb.c" + "${AOM_ROOT}/av1/encoder/encodemb.h" + "${AOM_ROOT}/av1/encoder/encodemv.c" + "${AOM_ROOT}/av1/encoder/encodemv.h" + "${AOM_ROOT}/av1/encoder/encode_strategy.c" + "${AOM_ROOT}/av1/encoder/encode_strategy.h" + "${AOM_ROOT}/av1/encoder/encoder.c" + "${AOM_ROOT}/av1/encoder/encoder.h" + "${AOM_ROOT}/av1/encoder/encodetxb.c" + "${AOM_ROOT}/av1/encoder/encodetxb.h" + "${AOM_ROOT}/av1/encoder/ethread.c" + "${AOM_ROOT}/av1/encoder/ethread.h" + "${AOM_ROOT}/av1/encoder/extend.c" + "${AOM_ROOT}/av1/encoder/extend.h" + "${AOM_ROOT}/av1/encoder/firstpass.c" + "${AOM_ROOT}/av1/encoder/firstpass.h" + "${AOM_ROOT}/av1/encoder/global_motion.c" + "${AOM_ROOT}/av1/encoder/global_motion.h" + "${AOM_ROOT}/av1/encoder/gop_structure.c" + "${AOM_ROOT}/av1/encoder/gop_structure.h" + "${AOM_ROOT}/av1/encoder/grain_test_vectors.h" + "${AOM_ROOT}/av1/encoder/hash.c" + "${AOM_ROOT}/av1/encoder/hash.h" + "${AOM_ROOT}/av1/encoder/hash_motion.c" + "${AOM_ROOT}/av1/encoder/hash_motion.h" + "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c" + "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h" + "${AOM_ROOT}/av1/encoder/interp_search.c" + "${AOM_ROOT}/av1/encoder/interp_search.h" + "${AOM_ROOT}/av1/encoder/level.c" + "${AOM_ROOT}/av1/encoder/level.h" + "${AOM_ROOT}/av1/encoder/lookahead.c" + "${AOM_ROOT}/av1/encoder/lookahead.h" + "${AOM_ROOT}/av1/encoder/mcomp.c" + "${AOM_ROOT}/av1/encoder/mcomp.h" + "${AOM_ROOT}/av1/encoder/ml.c" + "${AOM_ROOT}/av1/encoder/ml.h" + "${AOM_ROOT}/av1/encoder/model_rd.h" + "${AOM_ROOT}/av1/encoder/motion_search_facade.c" + "${AOM_ROOT}/av1/encoder/motion_search_facade.h" + "${AOM_ROOT}/av1/encoder/mv_prec.c" + "${AOM_ROOT}/av1/encoder/mv_prec.h" + "${AOM_ROOT}/av1/encoder/palette.c" + "${AOM_ROOT}/av1/encoder/palette.h" + "${AOM_ROOT}/av1/encoder/partition_strategy.h" + "${AOM_ROOT}/av1/encoder/partition_strategy.c" + "${AOM_ROOT}/av1/encoder/pass2_strategy.h" + "${AOM_ROOT}/av1/encoder/pass2_strategy.c" + 
"${AOM_ROOT}/av1/encoder/pickcdef.c" + "${AOM_ROOT}/av1/encoder/picklpf.c" + "${AOM_ROOT}/av1/encoder/picklpf.h" + "${AOM_ROOT}/av1/encoder/pickrst.c" + "${AOM_ROOT}/av1/encoder/pickrst.h" + "${AOM_ROOT}/av1/encoder/ransac.c" + "${AOM_ROOT}/av1/encoder/ransac.h" + "${AOM_ROOT}/av1/encoder/ratectrl.c" + "${AOM_ROOT}/av1/encoder/ratectrl.h" + "${AOM_ROOT}/av1/encoder/rd.c" + "${AOM_ROOT}/av1/encoder/rd.h" + "${AOM_ROOT}/av1/encoder/rdopt.c" + "${AOM_ROOT}/av1/encoder/nonrd_pickmode.c" + "${AOM_ROOT}/av1/encoder/rdopt.h" + "${AOM_ROOT}/av1/encoder/rdopt_data_defs.h" + "${AOM_ROOT}/av1/encoder/rdopt_utils.h" + "${AOM_ROOT}/av1/encoder/reconinter_enc.c" + "${AOM_ROOT}/av1/encoder/reconinter_enc.h" + "${AOM_ROOT}/av1/encoder/segmentation.c" + "${AOM_ROOT}/av1/encoder/segmentation.h" + "${AOM_ROOT}/av1/encoder/speed_features.c" + "${AOM_ROOT}/av1/encoder/speed_features.h" + "${AOM_ROOT}/av1/encoder/svc_layercontext.c" + "${AOM_ROOT}/av1/encoder/svc_layercontext.h" + "${AOM_ROOT}/av1/encoder/temporal_filter.c" + "${AOM_ROOT}/av1/encoder/temporal_filter.h" + "${AOM_ROOT}/av1/encoder/tokenize.c" + "${AOM_ROOT}/av1/encoder/tokenize.h" + "${AOM_ROOT}/av1/encoder/tpl_model.c" + "${AOM_ROOT}/av1/encoder/tpl_model.h" + "${AOM_ROOT}/av1/encoder/tx_search.c" + "${AOM_ROOT}/av1/encoder/tx_search.h" + "${AOM_ROOT}/av1/encoder/intra_mode_search.c" + "${AOM_ROOT}/av1/encoder/intra_mode_search.h" + "${AOM_ROOT}/av1/encoder/wedge_utils.c" + "${AOM_ROOT}/av1/encoder/var_based_part.c" + "${AOM_ROOT}/av1/encoder/var_based_part.h" + "${AOM_ROOT}/third_party/fastfeat/fast.c" + "${AOM_ROOT}/third_party/fastfeat/fast.h" + "${AOM_ROOT}/third_party/fastfeat/fast_9.c" + "${AOM_ROOT}/third_party/fastfeat/nonmax.c" + "${AOM_ROOT}/third_party/vector/vector.c" + "${AOM_ROOT}/third_party/vector/vector.h" + "${AOM_ROOT}/av1/encoder/dwt.c" + "${AOM_ROOT}/av1/encoder/dwt.h") + +if(CONFIG_TUNE_VMAF) + list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/tune_vmaf.c" + "${AOM_ROOT}/av1/encoder/tune_vmaf.h") +endif() + +list(APPEND AOM_AV1_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/av1/common/cdef_block_sse2.c" + "${AOM_ROOT}/av1/common/x86/cfl_sse2.c" + "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c" + "${AOM_ROOT}/av1/common/x86/convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c" + "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c" + "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h" + "${AOM_ROOT}/av1/common/x86/warp_plane_sse2.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c") +endif() + +list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/av1/common/cdef_block_ssse3.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h" + "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c" + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c" + "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c" + "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c" + "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c") +endif() + +list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/common/cdef_block_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c" + "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c" + 
"${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h" + "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c" + "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c" + "${AOM_ROOT}/av1/common/x86/reconinter_sse4.c" + "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c" + "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c" + "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c") +endif() + +list(APPEND AOM_AV1_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/av1/common/cdef_block_avx2.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c" + "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.h" + "${AOM_ROOT}/av1/common/x86/cfl_avx2.c" + "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c" + "${AOM_ROOT}/av1/common/x86/convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c" + "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c" + "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c" + "${AOM_ROOT}/av1/common/x86/warp_plane_avx2.c" + "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_AV1_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c") +endif() + +list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm" + "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.h" + "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c" + "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list( + REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c") +endif() + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c") + +list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64 + "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm1d_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_constants.h" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/corner_match_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h" + 
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c") + +if(NOT CONFIG_AV1_HIGHBITDEPTH) + list( + REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c") +endif() + +list(APPEND AOM_AV1_ENCODER_INTRIN_NEON + "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_MSA + "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c" + "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c" + "${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_NEON + "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c" + "${AOM_ROOT}/av1/common/arm/cfl_neon.c" + "${AOM_ROOT}/av1/common/arm/convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/convolve_neon.h" + "${AOM_ROOT}/av1/common/arm/jnt_convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/mem_neon.h" + "${AOM_ROOT}/av1/common/arm/transpose_neon.h" + "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c" + "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c" + "${AOM_ROOT}/av1/common/arm/reconinter_neon.c" + "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c" + "${AOM_ROOT}/av1/common/arm/selfguided_neon.c" + "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c" + "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h" + "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c" + "${AOM_ROOT}/av1/common/cdef_block_neon.c") + +list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 + "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c") + +list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c") + +if(CONFIG_ACCOUNTING) + list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/accounting.c" + "${AOM_ROOT}/av1/decoder/accounting.h") +endif() + +if(CONFIG_INSPECTION) + list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/inspection.c" + "${AOM_ROOT}/av1/decoder/inspection.h") +endif() + +if(CONFIG_INTERNAL_STATS) + list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/blockiness.c") +endif() + +if(CONFIG_REALTIME_ONLY) + list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES + "${AOM_ROOT}/av1/encoder/cnn.c" + "${AOM_ROOT}/av1/encoder/cnn.h" + "${AOM_ROOT}/av1/encoder/firstpass.c" + "${AOM_ROOT}/av1/encoder/firstpass.h" + "${AOM_ROOT}/av1/encoder/gop_structure.c" + "${AOM_ROOT}/av1/encoder/gop_structure.h" + "${AOM_ROOT}/av1/encoder/misc_model_weights.h" + "${AOM_ROOT}/av1/encoder/partition_cnn_weights.h" + "${AOM_ROOT}/av1/encoder/partition_model_weights.h" + "${AOM_ROOT}/av1/encoder/pass2_strategy.c" + "${AOM_ROOT}/av1/encoder/temporal_filter.c" + "${AOM_ROOT}/av1/encoder/temporal_filter.h" + "${AOM_ROOT}/av1/encoder/temporal_filter_constants.h" + "${AOM_ROOT}/av1/encoder/tpl_model.c" + "${AOM_ROOT}/av1/encoder/tpl_model.h" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c") +endif() + +# Setup AV1 common/decoder/encoder targets. The libaom target must exist before +# this function is called. 
+function(setup_av1_targets)
+  add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES})
+  list(APPEND AOM_LIB_TARGETS aom_av1_common)
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_common>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_av1_common>)
+  endif()
+
+  if(CONFIG_AV1_DECODER)
+    add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES})
+    set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_decoder)
+    target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_decoder>)
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_av1_decoder>)
+    endif()
+  endif()
+
+  if(CONFIG_AV1_ENCODER)
+    add_library(aom_av1_encoder OBJECT ${AOM_AV1_ENCODER_SOURCES})
+    set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_encoder)
+    target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_encoder>)
+    if(BUILD_SHARED_LIBS)
+      target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_av1_encoder>)
+    endif()
+  endif()
+
+  if(HAVE_SSE2)
+    require_compiler_flag_nomsvc("-msse2" NO)
+    add_intrinsics_object_library("-msse2" "sse2" "aom_av1_common"
+                                  "AOM_AV1_COMMON_INTRIN_SSE2")
+    if(CONFIG_AV1_DECODER)
+      if(AOM_AV1_DECODER_ASM_SSE2)
+        add_asm_library("aom_av1_decoder_sse2" "AOM_AV1_DECODER_ASM_SSE2")
+      endif()
+
+      if(AOM_AV1_DECODER_INTRIN_SSE2)
+        add_intrinsics_object_library("-msse2" "sse2" "aom_av1_decoder"
+                                      "AOM_AV1_DECODER_INTRIN_SSE2")
+      endif()
+    endif()
+
+    if(CONFIG_AV1_ENCODER)
+      add_asm_library("aom_av1_encoder_sse2" "AOM_AV1_ENCODER_ASM_SSE2")
+      add_intrinsics_object_library("-msse2" "sse2" "aom_av1_encoder"
+                                    "AOM_AV1_ENCODER_INTRIN_SSE2")
+    endif()
+  endif()
+
+  if(HAVE_SSE3)
+    require_compiler_flag_nomsvc("-msse3" NO)
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("-msse3" "sse3" "aom_av1_encoder"
+                                    "AOM_AV1_ENCODER_INTRIN_SSE3")
+    endif()
+  endif()
+
+  if(HAVE_SSSE3)
+    require_compiler_flag_nomsvc("-mssse3" NO)
+    add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common"
+                                  "AOM_AV1_COMMON_INTRIN_SSSE3")
+
+    if(CONFIG_AV1_DECODER)
+      if(AOM_AV1_DECODER_INTRIN_SSSE3)
+        add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_decoder"
+                                      "AOM_AV1_DECODER_INTRIN_SSSE3")
+      endif()
+    endif()
+  endif()
+
+  if(HAVE_SSE4_1)
+    require_compiler_flag_nomsvc("-msse4.1" NO)
+    add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_common"
+                                  "AOM_AV1_COMMON_INTRIN_SSE4_1")
+
+    if(CONFIG_AV1_ENCODER)
+      if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+        add_asm_library("aom_av1_encoder_ssse3"
+                        "AOM_AV1_ENCODER_ASM_SSSE3_X86_64")
+      endif()
+
+      if(AOM_AV1_ENCODER_INTRIN_SSE4_1)
+        add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_encoder"
+                                      "AOM_AV1_ENCODER_INTRIN_SSE4_1")
+      endif()
+    endif()
+  endif()
+
+  if(HAVE_SSE4_2)
+    require_compiler_flag_nomsvc("-msse4.2" NO)
+    if(CONFIG_AV1_ENCODER)
+      if(AOM_AV1_ENCODER_INTRIN_SSE4_2)
+        add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder"
+                                      "AOM_AV1_ENCODER_INTRIN_SSE4_2")
+      endif()
+    endif()
+  endif()
+
+  if(HAVE_AVX2)
+    require_compiler_flag_nomsvc("-mavx2" NO)
+    add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
+                                  "AOM_AV1_COMMON_INTRIN_AVX2")
+
+    if(CONFIG_AV1_ENCODER)
+      add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_encoder"
+                                    "AOM_AV1_ENCODER_INTRIN_AVX2")
+    endif()
+  endif()
+
+  if(HAVE_NEON)
+    if(AOM_AV1_COMMON_INTRIN_NEON)
+      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+                                    "aom_av1_common"
+                                    "AOM_AV1_COMMON_INTRIN_NEON")
+    endif()
+
+    if(AOM_AV1_ENCODER_INTRIN_NEON)
+      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+                                    "aom_av1_encoder"
+                                    "AOM_AV1_ENCODER_INTRIN_NEON")
+    endif()
+  endif()
+
+  if(HAVE_VSX)
+    if(AOM_AV1_COMMON_INTRIN_VSX)
+      add_intrinsics_object_library("-mvsx -maltivec" "vsx" "aom_av1_common"
"AOM_AV1_COMMON_INTRIN_VSX") + endif() + endif() + + if(HAVE_MSA) + add_intrinsics_object_library("" "msa" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_MSA") + endif() + + # Pass the new lib targets up to the parent scope instance of + # $AOM_LIB_TARGETS. + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction() diff --git a/libs/libaom/src/av1/av1_cx_iface.c b/libs/libaom/src/av1/av1_cx_iface.c new file mode 100644 index 000000000..676eaa0ad --- /dev/null +++ b/libs/libaom/src/av1/av1_cx_iface.c @@ -0,0 +1,2936 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "config/aom_config.h" +#include "config/aom_version.h" + +#include "aom_ports/aom_once.h" +#include "aom_ports/mem_ops.h" +#include "aom_ports/system_state.h" + +#include "aom/aom_encoder.h" +#include "aom/internal/aom_codec_internal.h" + +#include "av1/av1_iface_common.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/firstpass.h" + +#define MAG_SIZE (4) + +struct av1_extracfg { + int cpu_used; + unsigned int enable_auto_alt_ref; + unsigned int enable_auto_bwd_ref; + unsigned int noise_sensitivity; + unsigned int sharpness; + unsigned int static_thresh; + unsigned int row_mt; + unsigned int tile_columns; // log2 number of tile columns + unsigned int tile_rows; // log2 number of tile rows + unsigned int enable_tpl_model; + unsigned int enable_keyframe_filtering; + unsigned int arnr_max_frames; + unsigned int arnr_strength; + unsigned int min_gf_interval; + unsigned int max_gf_interval; + unsigned int gf_min_pyr_height; + unsigned int gf_max_pyr_height; + aom_tune_metric tuning; + const char *vmaf_model_path; + unsigned int cq_level; // constrained quality level + unsigned int rc_max_intra_bitrate_pct; + unsigned int rc_max_inter_bitrate_pct; + unsigned int gf_cbr_boost_pct; + unsigned int lossless; + unsigned int enable_cdef; + unsigned int enable_restoration; + unsigned int force_video_mode; + unsigned int enable_obmc; + unsigned int disable_trellis_quant; + unsigned int enable_qm; + unsigned int qm_y; + unsigned int qm_u; + unsigned int qm_v; + unsigned int qm_min; + unsigned int qm_max; + unsigned int num_tg; + unsigned int mtu_size; + + aom_timing_info_type_t timing_info_type; + unsigned int frame_parallel_decoding_mode; + int enable_dual_filter; + unsigned int enable_chroma_deltaq; + AQ_MODE aq_mode; + DELTAQ_MODE deltaq_mode; + int deltalf_mode; + unsigned int frame_periodic_boost; + aom_bit_depth_t bit_depth; + aom_tune_content content; + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; + aom_chroma_sample_position_t chroma_sample_position; + int color_range; + int render_width; + int render_height; + aom_superblock_size_t superblock_size; + unsigned int single_tile_decoding; + int error_resilient_mode; + int s_frame_mode; + + int film_grain_test_vector; + const char *film_grain_table_filename; + unsigned int motion_vector_unit_test; + unsigned int 
cdf_update_mode; + int enable_rect_partitions; // enable rectangular partitions for sequence + int enable_ab_partitions; // enable AB partitions for sequence + int enable_1to4_partitions; // enable 1:4 and 4:1 partitions for sequence + int min_partition_size; // min partition size [4,8,16,32,64,128] + int max_partition_size; // max partition size [4,8,16,32,64,128] + int enable_intra_edge_filter; // enable intra-edge filter for sequence + int enable_order_hint; // enable order hint for sequence + int enable_tx64; // enable 64-pt transform usage for sequence + int enable_flip_idtx; // enable flip and identity transform types + int enable_dist_wtd_comp; // enable dist wtd compound for sequence + int max_reference_frames; // maximum number of references per frame + int enable_reduced_reference_set; // enable reduced set of references + int enable_ref_frame_mvs; // sequence level + int allow_ref_frame_mvs; // frame level + int enable_masked_comp; // enable masked compound for sequence + int enable_onesided_comp; // enable one sided compound for sequence + int enable_interintra_comp; // enable interintra compound for sequence + int enable_smooth_interintra; // enable smooth interintra mode usage + int enable_diff_wtd_comp; // enable diff-wtd compound usage + int enable_interinter_wedge; // enable interinter-wedge compound usage + int enable_interintra_wedge; // enable interintra-wedge compound usage + int enable_global_motion; // enable global motion usage for sequence + int enable_warped_motion; // sequence level + int allow_warped_motion; // frame level + int enable_filter_intra; // enable filter intra for sequence + int enable_smooth_intra; // enable smooth intra modes for sequence + int enable_paeth_intra; // enable Paeth intra mode for sequence + int enable_cfl_intra; // enable CFL uv intra mode for sequence + int enable_superres; + int enable_overlay; // enable overlay for filtered arf frames + int enable_palette; + int enable_intrabc; + int enable_angle_delta; +#if CONFIG_DENOISE + float noise_level; + int noise_block_size; +#endif + + unsigned int chroma_subsampling_x; + unsigned int chroma_subsampling_y; + int reduced_tx_type_set; + int use_intra_dct_only; + int use_inter_dct_only; + int use_intra_default_tx_only; + int quant_b_adapt; + AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; + // Bit mask to specify which tier each of the 32 possible operating points + // conforms to. + unsigned int tier_mask; + // min_cr / 100 is the target minimum compression ratio for each frame. 
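+  // For example (hypothetical value, for illustration only): min_cr = 200
+  // requests a minimum compression ratio of 2, i.e. each coded frame should
+  // be at most half its raw size.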
+ unsigned int min_cr; + COST_UPDATE_TYPE coeff_cost_upd_freq; + COST_UPDATE_TYPE mode_cost_upd_freq; + COST_UPDATE_TYPE mv_cost_upd_freq; + unsigned int ext_tile_debug; + unsigned int sb_multipass_unit_test; +}; + +static struct av1_extracfg default_extra_cfg = { + 0, // cpu_used + 1, // enable_auto_alt_ref + 0, // enable_auto_bwd_ref + 0, // noise_sensitivity + 0, // sharpness + 0, // static_thresh + 1, // row_mt + 0, // tile_columns + 0, // tile_rows + 1, // enable_tpl_model + 1, // enable_keyframe_filtering + 7, // arnr_max_frames + 5, // arnr_strength + 0, // min_gf_interval; 0 -> default decision + 0, // max_gf_interval; 0 -> default decision + 0, // gf_min_pyr_height + 5, // gf_max_pyr_height + AOM_TUNE_PSNR, // tuning + "/usr/local/share/model/vmaf_v0.6.1.pkl", // VMAF model path + 10, // cq_level + 0, // rc_max_intra_bitrate_pct + 0, // rc_max_inter_bitrate_pct + 0, // gf_cbr_boost_pct + 0, // lossless + 1, // enable_cdef + 1, // enable_restoration + 0, // force_video_mode + 1, // enable_obmc + 3, // disable_trellis_quant + 0, // enable_qm + DEFAULT_QM_Y, // qm_y + DEFAULT_QM_U, // qm_u + DEFAULT_QM_V, // qm_v + DEFAULT_QM_FIRST, // qm_min + DEFAULT_QM_LAST, // qm_max + 1, // max number of tile groups + 0, // mtu_size + AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream + 0, // frame_parallel_decoding_mode + 1, // enable dual filter + 0, // enable delta quant in chroma planes + NO_AQ, // aq_mode + DELTA_Q_OBJECTIVE, // deltaq_mode + 0, // delta lf mode + 0, // frame_periodic_delta_q + AOM_BITS_8, // Bit depth + AOM_CONTENT_DEFAULT, // content + AOM_CICP_CP_UNSPECIFIED, // CICP color space + AOM_CICP_TC_UNSPECIFIED, // CICP transfer characteristics + AOM_CICP_MC_UNSPECIFIED, // CICP matrix coefficients + AOM_CSP_UNKNOWN, // chroma sample position + 0, // color range + 0, // render width + 0, // render height + AOM_SUPERBLOCK_SIZE_DYNAMIC, // superblock_size + 1, // this depends on large_scale_tile. + 0, // error_resilient_mode off by default. + 0, // s_frame_mode off by default. 
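+ // (Valid film_grain_test_vector values are 0..16; non-zero values select + // a built-in test grain table -- see the range check in validate_config() + // below.)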
+ 0, // film_grain_test_vector + 0, // film_grain_table_filename + 0, // motion_vector_unit_test + 1, // CDF update mode + 1, // enable rectangular partitions + 1, // enable ab shape partitions + 1, // enable 1:4 and 4:1 partitions + 4, // min_partition_size + 128, // max_partition_size + 1, // enable intra edge filter + 1, // frame order hint + 1, // enable 64-pt transform usage + 1, // enable flip and identity transform + 1, // dist-wtd compound + 7, // max_reference_frames + 0, // enable_reduced_reference_set + 1, // enable_ref_frame_mvs sequence level + 1, // allow ref_frame_mvs frame level + 1, // enable masked compound at sequence level + 1, // enable one sided compound at sequence level + 1, // enable interintra compound at sequence level + 1, // enable smooth interintra mode + 1, // enable difference-weighted compound + 1, // enable interinter wedge compound + 1, // enable interintra wedge compound + 1, // enable_global_motion usage + 1, // enable_warped_motion at sequence level + 1, // allow_warped_motion at frame level + 1, // enable filter intra at sequence level + 1, // enable smooth intra modes usage for sequence + 1, // enable Paeth intra mode usage for sequence + 1, // enable CFL uv intra mode usage for sequence + 1, // superres + 1, // enable overlay + 1, // enable palette + !CONFIG_SHARP_SETTINGS, // enable intrabc + 1, // enable angle delta +#if CONFIG_DENOISE + 0, // noise_level + 32, // noise_block_size +#endif + 0, // chroma_subsampling_x + 0, // chroma_subsampling_y + 0, // reduced_tx_type_set + 0, // use_intra_dct_only + 0, // use_inter_dct_only + 0, // use_intra_default_tx_only + 0, // quant_b_adapt + { + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, + }, // target_seq_level_idx + 0, // tier_mask + 0, // min_cr + COST_UPD_SB, // coeff_cost_upd_freq + COST_UPD_SB, // mode_cost_upd_freq + COST_UPD_SB, // mv_cost_upd_freq + 0, // ext_tile_debug + 0, // sb_multipass_unit_test +}; + +struct aom_codec_alg_priv { + aom_codec_priv_t base; + aom_codec_enc_cfg_t cfg; + struct av1_extracfg extra_cfg; + aom_rational64_t timestamp_ratio; + aom_codec_pts_t pts_offset; + unsigned char pts_offset_initialized; + AV1EncoderConfig oxcf; + AV1_COMP *cpi; + unsigned char *cx_data; + size_t cx_data_sz; + unsigned char *pending_cx_data; + size_t pending_cx_data_sz; + int pending_frame_count; + size_t pending_frame_sizes[8]; + aom_image_t preview_img; + aom_enc_frame_flags_t next_frame_flags; + aom_codec_pkt_list_decl(256) pkt_list; + unsigned int fixed_kf_cntr; + // BufferPool that holds all reference frames. 
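+ // (Allocated in create_context_and_bufferpool() below.)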
+ BufferPool *buffer_pool; + + // lookahead instance variables + BufferPool *buffer_pool_lap; + AV1_COMP *cpi_lap; + FIRSTPASS_STATS *frame_stats_buffer; + // Number of stats buffers required for look ahead + int num_lap_buffers; + STATS_BUFFER_CTX stats_buf_context; +}; + +static INLINE int gcd(int64_t a, int b) { + int remainder; // remainder + while (b > 0) { + remainder = (int)(a % b); + a = b; + b = remainder; + } + + return (int)a; +} + +static INLINE void reduce_ratio(aom_rational64_t *ratio) { + const int denom = gcd(ratio->num, ratio->den); + ratio->num /= denom; + ratio->den /= denom; +} + +static aom_codec_err_t update_error_state( + aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) { + const aom_codec_err_t res = error->error_code; + + if (res != AOM_CODEC_OK) + ctx->base.err_detail = error->has_detail ? error->detail : NULL; + + return res; +} + +#undef ERROR +#define ERROR(str) \ + do { \ + ctx->base.err_detail = str; \ + return AOM_CODEC_INVALID_PARAM; \ + } while (0) + +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!((p)->memb >= (lo) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ + } while (0) + +#define RANGE_CHECK_HI(p, memb, hi) \ + do { \ + if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." #hi "]"); \ + } while (0) + +#define RANGE_CHECK_BOOL(p, memb) \ + do { \ + if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \ + } while (0) + +static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, + const aom_codec_enc_cfg_t *cfg, + const struct av1_extracfg *extra_cfg) { + RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available + RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available + RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); + RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den); + RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1); + + RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); + RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); + RANGE_CHECK_BOOL(extra_cfg, lossless); + RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1); + RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTA_Q_MODE_COUNT - 1); + RANGE_CHECK_HI(extra_cfg, deltalf_mode, 1); + RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1); + RANGE_CHECK_HI(cfg, g_usage, 1); + RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS); + RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q); + RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); + RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO); + RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100); + RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_LAST_PASS); + if (cfg->g_pass == AOM_RC_ONE_PASS) { + RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_TOTAL_BUFFERS); + } else { + RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); + } + RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1); + RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1); + if (extra_cfg->max_gf_interval > 0) { + RANGE_CHECK(extra_cfg, max_gf_interval, + AOMMAX(2, extra_cfg->min_gf_interval), (MAX_LAG_BUFFERS - 1)); + } + RANGE_CHECK_HI(extra_cfg, gf_min_pyr_height, 5); + RANGE_CHECK_HI(extra_cfg, gf_max_pyr_height, 5); + if (extra_cfg->gf_min_pyr_height > extra_cfg->gf_max_pyr_height) { + ERROR( + "gf_min_pyr_height must be less than or equal to " + "gf_max_pyramid_height"); + } + + RANGE_CHECK_HI(cfg, rc_resize_mode, RESIZE_MODES - 1); + RANGE_CHECK(cfg, rc_resize_denominator, SCALE_NUMERATOR, + 
SCALE_NUMERATOR << 1); + RANGE_CHECK(cfg, rc_resize_kf_denominator, SCALE_NUMERATOR, + SCALE_NUMERATOR << 1); + RANGE_CHECK_HI(cfg, rc_superres_mode, SUPERRES_MODES - 1); + RANGE_CHECK(cfg, rc_superres_denominator, SCALE_NUMERATOR, + SCALE_NUMERATOR << 1); + RANGE_CHECK(cfg, rc_superres_kf_denominator, SCALE_NUMERATOR, + SCALE_NUMERATOR << 1); + RANGE_CHECK(cfg, rc_superres_qthresh, 1, 63); + RANGE_CHECK(cfg, rc_superres_kf_qthresh, 1, 63); + RANGE_CHECK_HI(extra_cfg, cdf_update_mode, 2); + + // AV1 does not support a lower bound on the keyframe interval in + // automatic keyframe placement mode. + if (cfg->kf_mode != AOM_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist && + cfg->kf_min_dist > 0) + ERROR( + "kf_min_dist not supported in auto mode, use 0 " + "or kf_max_dist instead."); + + RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2); + RANGE_CHECK_HI(extra_cfg, sb_multipass_unit_test, 1); + RANGE_CHECK_HI(extra_cfg, ext_tile_debug, 1); + RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 1); + RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2); + RANGE_CHECK(extra_cfg, cpu_used, 0, 8); + RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); + RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64, + AOM_SUPERBLOCK_SIZE_DYNAMIC); + RANGE_CHECK_HI(cfg, large_scale_tile, 1); + RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1); + + RANGE_CHECK_HI(extra_cfg, row_mt, 1); + + RANGE_CHECK_HI(extra_cfg, tile_columns, 6); + RANGE_CHECK_HI(extra_cfg, tile_rows, 6); + + RANGE_CHECK_HI(cfg, monochrome, 1); + + if (cfg->large_scale_tile && extra_cfg->aq_mode) + ERROR( + "Adaptive quantization is not supported in large scale tile " + "coding."); + + RANGE_CHECK_HI(extra_cfg, sharpness, 7); + RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15); + RANGE_CHECK_HI(extra_cfg, arnr_strength, 6); + RANGE_CHECK_HI(extra_cfg, cq_level, 63); + RANGE_CHECK(cfg, g_bit_depth, AOM_BITS_8, AOM_BITS_12); + RANGE_CHECK(cfg, g_input_bit_depth, 8, 12); + RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1); + + if (cfg->g_pass == AOM_RC_LAST_PASS) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); + const FIRSTPASS_STATS *stats; + + if (cfg->rc_twopass_stats_in.buf == NULL) + ERROR("rc_twopass_stats_in.buf not set."); + + if (cfg->rc_twopass_stats_in.sz % packet_sz) + ERROR("rc_twopass_stats_in.sz indicates truncated packet."); + + if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz) + ERROR("rc_twopass_stats_in requires at least two packets."); + + stats = + (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + n_packets - 1; + + if ((int)(stats->count + 0.5) != n_packets - 1) + ERROR("rc_twopass_stats_in missing EOS stats packet"); + } + + if (cfg->g_profile <= (unsigned int)PROFILE_1 && + cfg->g_bit_depth > AOM_BITS_10) { + ERROR("Codec bit-depth 12 not supported in profile < 2"); + } + if (cfg->g_profile <= (unsigned int)PROFILE_1 && + cfg->g_input_bit_depth > 10) { + ERROR("Source bit-depth 12 not supported in profile < 2"); + } + + if (cfg->rc_end_usage == AOM_Q) { + RANGE_CHECK_HI(cfg, use_fixed_qp_offsets, 1); + for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) { + RANGE_CHECK_HI(cfg, fixed_qp_offsets[i], 63); + } + } else { + if (cfg->use_fixed_qp_offsets > 0) { + ERROR("--use_fixed_qp_offsets can only be used with --end-usage=q"); + } + for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) { + if (cfg->fixed_qp_offsets[i] >= 0) { + ERROR("--fixed_qp_offsets can only be used with --end-usage=q"); + } + } + } + + 
RANGE_CHECK(extra_cfg, color_primaries, AOM_CICP_CP_BT_709, + AOM_CICP_CP_EBU_3213); // Need to check range more precisely to + // check for reserved values? + RANGE_CHECK(extra_cfg, transfer_characteristics, AOM_CICP_TC_BT_709, + AOM_CICP_TC_HLG); + RANGE_CHECK(extra_cfg, matrix_coefficients, AOM_CICP_MC_IDENTITY, + AOM_CICP_MC_ICTCP); + RANGE_CHECK(extra_cfg, color_range, 0, 1); + +#if !CONFIG_TUNE_VMAF + if (extra_cfg->tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING || + extra_cfg->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + extra_cfg->tuning == AOM_TUNE_VMAF_MAX_GAIN) { + ERROR( + "This error may be related to the wrong configuration options: try to " + "set -DCONFIG_TUNE_VMAF=1 at the time CMake is run."); + } +#endif + +#if CONFIG_TUNE_VMAF + RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_VMAF_MAX_GAIN); +#else + RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_SSIM); +#endif + + RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED, + AOM_TIMING_DEC_MODEL); + + RANGE_CHECK(extra_cfg, film_grain_test_vector, 0, 16); + + if (extra_cfg->lossless) { + if (extra_cfg->aq_mode != 0) + ERROR("Only --aq_mode=0 can be used with --lossless=1."); + if (extra_cfg->enable_chroma_deltaq) + ERROR("Only --enable_chroma_deltaq=0 can be used with --lossless=1."); + } + + if (cfg->rc_resize_mode != RESIZE_NONE && + extra_cfg->aq_mode == CYCLIC_REFRESH_AQ) { + ERROR("--aq_mode=3 is only supported for --resize-mode=0."); + } + + RANGE_CHECK(extra_cfg, max_reference_frames, 3, 7); + RANGE_CHECK(extra_cfg, enable_reduced_reference_set, 0, 1); + RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1); + RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1); + + RANGE_CHECK_HI(extra_cfg, disable_trellis_quant, 3); + RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 2); + RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 2); + RANGE_CHECK(extra_cfg, mv_cost_upd_freq, 0, 3); + + RANGE_CHECK(extra_cfg, min_partition_size, 4, 128); + RANGE_CHECK(extra_cfg, max_partition_size, 4, 128); + RANGE_CHECK_HI(extra_cfg, min_partition_size, extra_cfg->max_partition_size); + + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + const int level_idx = extra_cfg->target_seq_level_idx[i]; + if (!is_valid_seq_level_idx(level_idx) && level_idx != SEQ_LEVELS) { + ERROR("Target sequence level index is invalid"); + } + } + + return AOM_CODEC_OK; +} + +static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx, + const aom_image_t *img) { + switch (img->fmt) { + case AOM_IMG_FMT_YV12: + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_YV1216: + case AOM_IMG_FMT_I42016: break; + case AOM_IMG_FMT_I444: + case AOM_IMG_FMT_I44416: + if (ctx->cfg.g_profile == (unsigned int)PROFILE_0 && + !ctx->cfg.monochrome) { + ERROR("Invalid image format. I444 images not supported in profile."); + } + break; + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I42216: + if (ctx->cfg.g_profile != (unsigned int)PROFILE_2) { + ERROR("Invalid image format. I422 images not supported in profile."); + } + break; + default: + ERROR( + "Invalid image format. 
Only YV12, I420, I422, I444 images are " + "supported."); + break; + } + + if (img->d_w != ctx->cfg.g_w || img->d_h != ctx->cfg.g_h) + ERROR("Image size must match encoder init configuration size"); + + if (img->fmt != AOM_IMG_FMT_I420 && !ctx->extra_cfg.enable_tx64) { + ERROR("TX64 can only be disabled on I420 images."); + } + + return AOM_CODEC_OK; +} + +static int get_image_bps(const aom_image_t *img) { + switch (img->fmt) { + case AOM_IMG_FMT_YV12: + case AOM_IMG_FMT_I420: return 12; + case AOM_IMG_FMT_I422: return 16; + case AOM_IMG_FMT_I444: return 24; + case AOM_IMG_FMT_YV1216: + case AOM_IMG_FMT_I42016: return 24; + case AOM_IMG_FMT_I42216: return 32; + case AOM_IMG_FMT_I44416: return 48; + default: assert(0 && "Invalid image format"); break; + } + return 0; +} + +// Set appropriate options to disable frame super-resolution. +static void disable_superres(AV1EncoderConfig *const oxcf) { + oxcf->superres_mode = SUPERRES_NONE; + oxcf->superres_scale_denominator = SCALE_NUMERATOR; + oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR; + oxcf->superres_qthresh = 255; + oxcf->superres_kf_qthresh = 255; +} + +static void update_default_encoder_config(const cfg_options_t *cfg, + struct av1_extracfg *extra_cfg) { + extra_cfg->enable_cdef = (cfg->disable_cdef == 0); + extra_cfg->enable_restoration = (cfg->disable_lr == 0); + extra_cfg->superblock_size = (cfg->super_block_size == 64) + ? AOM_SUPERBLOCK_SIZE_64X64 + : (cfg->super_block_size == 128) + ? AOM_SUPERBLOCK_SIZE_128X128 + : AOM_SUPERBLOCK_SIZE_DYNAMIC; + extra_cfg->enable_warped_motion = (cfg->disable_warp_motion == 0); + extra_cfg->enable_dist_wtd_comp = (cfg->disable_dist_wtd_comp == 0); + extra_cfg->enable_diff_wtd_comp = (cfg->disable_diff_wtd_comp == 0); + extra_cfg->enable_dual_filter = (cfg->disable_dual_filter == 0); + extra_cfg->enable_angle_delta = (cfg->disable_intra_angle_delta == 0); + extra_cfg->enable_rect_partitions = (cfg->disable_rect_partition_type == 0); + extra_cfg->enable_ab_partitions = (cfg->disable_ab_partition_type == 0); + extra_cfg->enable_1to4_partitions = (cfg->disable_1to4_partition_type == 0); + extra_cfg->max_partition_size = cfg->max_partition_size; + extra_cfg->min_partition_size = cfg->min_partition_size; + extra_cfg->enable_intra_edge_filter = (cfg->disable_intra_edge_filter == 0); + extra_cfg->enable_tx64 = (cfg->disable_tx_64x64 == 0); + extra_cfg->enable_flip_idtx = (cfg->disable_flip_idtx == 0); + extra_cfg->enable_masked_comp = (cfg->disable_masked_comp == 0); + extra_cfg->enable_interintra_comp = (cfg->disable_inter_intra_comp == 0); + extra_cfg->enable_smooth_interintra = (cfg->disable_smooth_inter_intra == 0); + extra_cfg->enable_interinter_wedge = (cfg->disable_inter_inter_wedge == 0); + extra_cfg->enable_interintra_wedge = (cfg->disable_inter_intra_wedge == 0); + extra_cfg->enable_global_motion = (cfg->disable_global_motion == 0); + extra_cfg->enable_filter_intra = (cfg->disable_filter_intra == 0); + extra_cfg->enable_smooth_intra = (cfg->disable_smooth_intra == 0); + extra_cfg->enable_paeth_intra = (cfg->disable_paeth_intra == 0); + extra_cfg->enable_cfl_intra = (cfg->disable_cfl == 0); + extra_cfg->enable_obmc = (cfg->disable_obmc == 0); + extra_cfg->enable_palette = (cfg->disable_palette == 0); + extra_cfg->enable_intrabc = (cfg->disable_intrabc == 0); + extra_cfg->disable_trellis_quant = cfg->disable_trellis_quant; + extra_cfg->allow_ref_frame_mvs = (cfg->disable_ref_frame_mv == 0); + extra_cfg->enable_ref_frame_mvs = (cfg->disable_ref_frame_mv == 0); + 
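// Each "disable_*" switch from the config file is inverted into the + // corresponding "enable_*"/"allow_*" field of the extra config. +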
extra_cfg->enable_onesided_comp = (cfg->disable_one_sided_comp == 0); + extra_cfg->enable_reduced_reference_set = cfg->reduced_reference_set; + extra_cfg->reduced_tx_type_set = cfg->reduced_tx_type_set; +} + +static double convert_qp_offset(int cq_level, int q_offset, int bit_depth) { + const double base_q_val = av1_convert_qindex_to_q(cq_level, bit_depth); + const int new_q_index_offset = av1_quantizer_to_qindex(q_offset); + const int new_q_index = AOMMAX(cq_level - new_q_index_offset, 0); + const double new_q_val = av1_convert_qindex_to_q(new_q_index, bit_depth); + return (base_q_val - new_q_val); +} + +static double get_modeled_qp_offset(int cq_level, int level, int bit_depth) { + // 80% for keyframe was derived empirically. + // 40% similar to rc_pick_q_and_bounds_one_pass_vbr() for Q mode ARF. + // Rest derived similar to rc_pick_q_and_bounds_two_pass() + static const int percents[FIXED_QP_OFFSET_COUNT] = { 76, 60, 30, 15, 8 }; + const double q_val = av1_convert_qindex_to_q(cq_level, bit_depth); + return q_val * percents[level] / 100; +} + +static aom_codec_err_t set_encoder_config(AV1EncoderConfig *oxcf, + const aom_codec_enc_cfg_t *cfg, + struct av1_extracfg *extra_cfg) { + if (cfg->encoder_cfg.init_by_cfg_file) { + update_default_encoder_config(&cfg->encoder_cfg, extra_cfg); + } + + const int is_vbr = cfg->rc_end_usage == AOM_VBR; + oxcf->profile = cfg->g_profile; + oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled; + oxcf->max_threads = (int)cfg->g_threads; + oxcf->mode = (cfg->g_usage == AOM_USAGE_REALTIME) ? REALTIME : GOOD; + oxcf->width = cfg->g_w; + oxcf->height = cfg->g_h; + oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width; + oxcf->forced_max_frame_height = cfg->g_forced_max_frame_height; + oxcf->bit_depth = cfg->g_bit_depth; + oxcf->input_bit_depth = cfg->g_input_bit_depth; + // guess a frame rate if out of whack, use 30 + oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num; + if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL || + extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) { + oxcf->timing_info_present = 1; + oxcf->timing_info.num_units_in_display_tick = cfg->g_timebase.num; + oxcf->timing_info.time_scale = cfg->g_timebase.den; + oxcf->timing_info.num_ticks_per_picture = 1; + } else { + oxcf->timing_info_present = 0; + } + if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL) { + oxcf->timing_info.equal_picture_interval = 1; + oxcf->decoder_model_info_present_flag = 0; + oxcf->display_model_info_present_flag = 1; + } else if (extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) { + // if( extra_cfg->arnr_strength > 0 ) + // { + // printf("Only --arnr-strength=0 can currently be used with + // --timing-info=model."); return AOM_CODEC_INVALID_PARAM; + // } + // if( extra_cfg->enable_superres) + // { + // printf("Only --superres-mode=0 can currently be used with + // --timing-info=model."); return AOM_CODEC_INVALID_PARAM; + // } + oxcf->buffer_model.num_units_in_decoding_tick = cfg->g_timebase.num; + oxcf->timing_info.equal_picture_interval = 0; + oxcf->decoder_model_info_present_flag = 1; + oxcf->buffer_removal_time_present = 1; + oxcf->display_model_info_present_flag = 1; + } + if (oxcf->init_framerate > 180) { + oxcf->init_framerate = 30; + oxcf->timing_info_present = 0; + } + oxcf->encoder_cfg = &cfg->encoder_cfg; + + switch (cfg->g_pass) { + case AOM_RC_ONE_PASS: oxcf->pass = 0; break; + case AOM_RC_FIRST_PASS: oxcf->pass = 1; break; + case AOM_RC_LAST_PASS: oxcf->pass = 2; break; + } + + oxcf->lag_in_frames = 
clamp(cfg->g_lag_in_frames, 0, MAX_LAG_BUFFERS); + oxcf->rc_mode = cfg->rc_end_usage; + + // Convert target bandwidth from Kbit/s to Bit/s + oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate; + oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct; + oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct; + oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct; + + oxcf->best_allowed_q = + extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_min_quantizer); + oxcf->worst_allowed_q = + extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_max_quantizer); + oxcf->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level); + oxcf->fixed_q = -1; + + oxcf->enable_cdef = extra_cfg->enable_cdef; + oxcf->enable_restoration = + (cfg->g_usage == AOM_USAGE_REALTIME) ? 0 : extra_cfg->enable_restoration; + oxcf->force_video_mode = extra_cfg->force_video_mode; + oxcf->enable_obmc = extra_cfg->enable_obmc; + oxcf->enable_overlay = extra_cfg->enable_overlay; + oxcf->enable_palette = extra_cfg->enable_palette; + oxcf->enable_intrabc = extra_cfg->enable_intrabc; + oxcf->enable_angle_delta = extra_cfg->enable_angle_delta; + oxcf->disable_trellis_quant = extra_cfg->disable_trellis_quant; + oxcf->allow_ref_frame_mvs = extra_cfg->enable_ref_frame_mvs; + oxcf->using_qm = extra_cfg->enable_qm; + oxcf->qm_y = extra_cfg->qm_y; + oxcf->qm_u = extra_cfg->qm_u; + oxcf->qm_v = extra_cfg->qm_v; + oxcf->qm_minlevel = extra_cfg->qm_min; + oxcf->qm_maxlevel = extra_cfg->qm_max; + oxcf->reduced_tx_type_set = extra_cfg->reduced_tx_type_set; + oxcf->use_intra_dct_only = extra_cfg->use_intra_dct_only; + oxcf->use_inter_dct_only = extra_cfg->use_inter_dct_only; + oxcf->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only; + oxcf->quant_b_adapt = extra_cfg->quant_b_adapt; + oxcf->coeff_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq; + oxcf->mode_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq; + oxcf->mv_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq; + oxcf->num_tile_groups = extra_cfg->num_tg; + // In large-scale tile encoding mode, num_tile_groups is always 1. + if (cfg->large_scale_tile) oxcf->num_tile_groups = 1; + oxcf->mtu = extra_cfg->mtu_size; + + // FIXME(debargha): Should this be: + // oxcf->allow_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs & + // extra_cfg->enable_order_hint ? + // Disallow using temporal MVs while large_scale_tile = 1. 
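+ // (Note: oxcf->enable_ref_frame_mvs is separately masked by + // enable_order_hint further below.)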
+ oxcf->allow_ref_frame_mvs = + extra_cfg->allow_ref_frame_mvs && !cfg->large_scale_tile; + oxcf->under_shoot_pct = cfg->rc_undershoot_pct; + oxcf->over_shoot_pct = cfg->rc_overshoot_pct; + + oxcf->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode; + oxcf->resize_scale_denominator = (uint8_t)cfg->rc_resize_denominator; + oxcf->resize_kf_scale_denominator = (uint8_t)cfg->rc_resize_kf_denominator; + if (oxcf->resize_mode == RESIZE_FIXED && + oxcf->resize_scale_denominator == SCALE_NUMERATOR && + oxcf->resize_kf_scale_denominator == SCALE_NUMERATOR) + oxcf->resize_mode = RESIZE_NONE; + + if (extra_cfg->lossless || cfg->large_scale_tile) { + disable_superres(oxcf); + } else { + oxcf->superres_mode = (SUPERRES_MODE)cfg->rc_superres_mode; + oxcf->superres_scale_denominator = (uint8_t)cfg->rc_superres_denominator; + oxcf->superres_kf_scale_denominator = + (uint8_t)cfg->rc_superres_kf_denominator; + oxcf->superres_qthresh = av1_quantizer_to_qindex(cfg->rc_superres_qthresh); + oxcf->superres_kf_qthresh = + av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh); + if (oxcf->superres_mode == SUPERRES_FIXED && + oxcf->superres_scale_denominator == SCALE_NUMERATOR && + oxcf->superres_kf_scale_denominator == SCALE_NUMERATOR) { + disable_superres(oxcf); + } + if (oxcf->superres_mode == SUPERRES_QTHRESH && + oxcf->superres_qthresh == 255 && oxcf->superres_kf_qthresh == 255) { + disable_superres(oxcf); + } + } + + oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz; + oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz; + oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz; + + oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh; + + oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct; + oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct; + oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct; + + oxcf->auto_key = + cfg->kf_mode == AOM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist; + + oxcf->key_freq = cfg->kf_max_dist; + oxcf->sframe_dist = cfg->sframe_dist; + oxcf->sframe_mode = cfg->sframe_mode; + oxcf->sframe_enabled = cfg->sframe_dist != 0; + oxcf->speed = extra_cfg->cpu_used; + oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref; + oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref; + oxcf->noise_sensitivity = extra_cfg->noise_sensitivity; + oxcf->sharpness = extra_cfg->sharpness; + + oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in; + + oxcf->color_primaries = extra_cfg->color_primaries; + oxcf->transfer_characteristics = extra_cfg->transfer_characteristics; + oxcf->matrix_coefficients = extra_cfg->matrix_coefficients; + oxcf->chroma_sample_position = extra_cfg->chroma_sample_position; + + oxcf->color_range = extra_cfg->color_range; + oxcf->render_width = extra_cfg->render_width; + oxcf->render_height = extra_cfg->render_height; + oxcf->arnr_max_frames = extra_cfg->arnr_max_frames; + oxcf->arnr_strength = extra_cfg->arnr_strength; + oxcf->min_gf_interval = extra_cfg->min_gf_interval; + oxcf->max_gf_interval = extra_cfg->max_gf_interval; + oxcf->gf_min_pyr_height = extra_cfg->gf_min_pyr_height; + oxcf->gf_max_pyr_height = extra_cfg->gf_max_pyr_height; + + oxcf->tuning = extra_cfg->tuning; + oxcf->vmaf_model_path = extra_cfg->vmaf_model_path; + oxcf->content = extra_cfg->content; + oxcf->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode; + oxcf->superblock_size = extra_cfg->superblock_size; + if (cfg->large_scale_tile) { + oxcf->film_grain_test_vector = 0; + oxcf->film_grain_table_filename = NULL; + } else { + 
oxcf->film_grain_test_vector = extra_cfg->film_grain_test_vector; + oxcf->film_grain_table_filename = extra_cfg->film_grain_table_filename; + } +#if CONFIG_DENOISE + oxcf->noise_level = extra_cfg->noise_level; + oxcf->noise_block_size = extra_cfg->noise_block_size; +#endif + oxcf->large_scale_tile = cfg->large_scale_tile; + oxcf->single_tile_decoding = + (oxcf->large_scale_tile) ? extra_cfg->single_tile_decoding : 0; + if (oxcf->large_scale_tile) { + // The superblock_size can only be AOM_SUPERBLOCK_SIZE_64X64 or + // AOM_SUPERBLOCK_SIZE_128X128 while oxcf->large_scale_tile = 1. If + // superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to + // AOM_SUPERBLOCK_SIZE_64X64(default value in large_scale_tile). + if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64 && + extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_128X128) + oxcf->superblock_size = AOM_SUPERBLOCK_SIZE_64X64; + } + + oxcf->row_mt = extra_cfg->row_mt; + + oxcf->tile_columns = extra_cfg->tile_columns; + oxcf->tile_rows = extra_cfg->tile_rows; + + oxcf->monochrome = cfg->monochrome; + oxcf->full_still_picture_hdr = cfg->full_still_picture_hdr; + oxcf->enable_dual_filter = extra_cfg->enable_dual_filter; + oxcf->enable_rect_partitions = extra_cfg->enable_rect_partitions; + oxcf->enable_ab_partitions = extra_cfg->enable_ab_partitions; + oxcf->enable_1to4_partitions = extra_cfg->enable_1to4_partitions; + oxcf->min_partition_size = extra_cfg->min_partition_size; + oxcf->max_partition_size = extra_cfg->max_partition_size; + oxcf->enable_intra_edge_filter = extra_cfg->enable_intra_edge_filter; + oxcf->enable_tx64 = extra_cfg->enable_tx64; + oxcf->enable_flip_idtx = extra_cfg->enable_flip_idtx; + oxcf->enable_order_hint = extra_cfg->enable_order_hint; + oxcf->enable_dist_wtd_comp = + extra_cfg->enable_dist_wtd_comp & extra_cfg->enable_order_hint; + oxcf->max_reference_frames = extra_cfg->max_reference_frames; + oxcf->enable_reduced_reference_set = extra_cfg->enable_reduced_reference_set; + oxcf->enable_masked_comp = extra_cfg->enable_masked_comp; + oxcf->enable_onesided_comp = extra_cfg->enable_onesided_comp; + oxcf->enable_diff_wtd_comp = + extra_cfg->enable_masked_comp & extra_cfg->enable_diff_wtd_comp; + oxcf->enable_interinter_wedge = + extra_cfg->enable_masked_comp & extra_cfg->enable_interinter_wedge; + oxcf->enable_interintra_comp = extra_cfg->enable_interintra_comp; + oxcf->enable_smooth_interintra = + extra_cfg->enable_interintra_comp && extra_cfg->enable_smooth_interintra; + oxcf->enable_interintra_wedge = + extra_cfg->enable_interintra_comp & extra_cfg->enable_interintra_wedge; + oxcf->enable_ref_frame_mvs = + extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint; + + oxcf->enable_global_motion = extra_cfg->enable_global_motion; + oxcf->enable_warped_motion = extra_cfg->enable_warped_motion; + oxcf->allow_warped_motion = + (cfg->g_usage == AOM_USAGE_REALTIME) + ? 
0 + : (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion); + oxcf->enable_filter_intra = extra_cfg->enable_filter_intra; + oxcf->enable_smooth_intra = extra_cfg->enable_smooth_intra; + oxcf->enable_paeth_intra = extra_cfg->enable_paeth_intra; + oxcf->enable_cfl_intra = extra_cfg->enable_cfl_intra; + + oxcf->enable_superres = + (oxcf->superres_mode != SUPERRES_NONE) && extra_cfg->enable_superres; + if (!oxcf->enable_superres) { + disable_superres(oxcf); + } + + oxcf->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS); + oxcf->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS); + for (int i = 0; i < oxcf->tile_width_count; i++) { + oxcf->tile_widths[i] = AOMMAX(cfg->tile_widths[i], 1); + } + for (int i = 0; i < oxcf->tile_height_count; i++) { + oxcf->tile_heights[i] = AOMMAX(cfg->tile_heights[i], 1); + } + oxcf->error_resilient_mode = + cfg->g_error_resilient | extra_cfg->error_resilient_mode; + oxcf->s_frame_mode = extra_cfg->s_frame_mode; + oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode; + if (cfg->g_pass == AOM_RC_LAST_PASS) { + const size_t packet_sz = sizeof(FIRSTPASS_STATS); + const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); + oxcf->limit = n_packets - 1; + } else { + oxcf->limit = cfg->g_limit; + } + + if (oxcf->limit == 1) { + // still picture mode, display model and timing is meaningless + oxcf->display_model_info_present_flag = 0; + oxcf->timing_info_present = 0; + } + + oxcf->enable_tpl_model = extra_cfg->enable_tpl_model; + oxcf->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering; + + oxcf->enable_chroma_deltaq = extra_cfg->enable_chroma_deltaq; + oxcf->aq_mode = extra_cfg->aq_mode; + oxcf->deltaq_mode = extra_cfg->deltaq_mode; + + oxcf->deltalf_mode = + (oxcf->deltaq_mode != NO_DELTA_Q) && extra_cfg->deltalf_mode; + + oxcf->save_as_annexb = cfg->save_as_annexb; + + oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost; + oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; + oxcf->sb_multipass_unit_test = extra_cfg->sb_multipass_unit_test; + oxcf->ext_tile_debug = extra_cfg->ext_tile_debug; + + oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x; + oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y; + oxcf->border_in_pixels = (oxcf->resize_mode || oxcf->superres_mode) + ? 
AOM_BORDER_IN_PIXELS + : AOM_ENC_NO_SCALE_BORDER; + memcpy(oxcf->target_seq_level_idx, extra_cfg->target_seq_level_idx, + sizeof(oxcf->target_seq_level_idx)); + oxcf->tier_mask = extra_cfg->tier_mask; + + oxcf->use_fixed_qp_offsets = + cfg->use_fixed_qp_offsets && (oxcf->rc_mode == AOM_Q); + for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) { + if (oxcf->use_fixed_qp_offsets) { + if (cfg->fixed_qp_offsets[i] >= 0) { // user-provided qp offset + oxcf->fixed_qp_offsets[i] = convert_qp_offset( + oxcf->cq_level, cfg->fixed_qp_offsets[i], oxcf->bit_depth); + } else { // auto-selected qp offset + oxcf->fixed_qp_offsets[i] = + get_modeled_qp_offset(oxcf->cq_level, i, oxcf->bit_depth); + } + } else { + oxcf->fixed_qp_offsets[i] = -1.0; + } + } + + oxcf->min_cr = extra_cfg->min_cr; + return AOM_CODEC_OK; +} + +static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx, + const aom_codec_enc_cfg_t *cfg) { + aom_codec_err_t res; + int force_key = 0; + + if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { + if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS) + ERROR("Cannot change width or height after initialization"); + if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) || + (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) || + (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height)) + force_key = 1; + } + + // Prevent increasing lag_in_frames. This check is stricter than it needs + // to be -- the limit is not increasing past the first lag_in_frames + // value, but we don't track the initial config, only the last successful + // config. + if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames) + ERROR("Cannot increase lag_in_frames"); + // Prevent changing lag_in_frames if Lookahead Processing is enabled + if (cfg->g_lag_in_frames != ctx->cfg.g_lag_in_frames && + ctx->num_lap_buffers > 0) + ERROR("Cannot change lag_in_frames if LAP is enabled"); + + res = validate_config(ctx, cfg, &ctx->extra_cfg); + + if (res == AOM_CODEC_OK) { + ctx->cfg = *cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + // On profile change, request a key frame + force_key |= ctx->cpi->common.seq_params.profile != ctx->oxcf.profile; + av1_change_config(ctx->cpi, &ctx->oxcf); + } + + if (force_key) ctx->next_frame_flags |= AOM_EFLAG_FORCE_KF; + + return res; +} + +static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) { + return av1_get_global_headers(ctx->cpi); +} + +static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + *arg = av1_get_quantizer(ctx->cpi); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_quantizer64(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + *arg = av1_qindex_to_quantizer(av1_get_quantizer(ctx->cpi)); + return AOM_CODEC_OK; +} + +static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx, + struct av1_extracfg *extra_cfg) { + const aom_codec_err_t res = validate_config(ctx, &ctx->cfg, extra_cfg); + if (res == AOM_CODEC_OK) { + ctx->extra_cfg = *extra_cfg; + set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); + av1_change_config(ctx->cpi, &ctx->oxcf); + } + return res; +} + +static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.cpu_used = 
CAST(AOME_SET_CPUUSED, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_auto_alt_ref = CAST(AOME_SET_ENABLEAUTOALTREF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_auto_bwd_ref(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_auto_bwd_ref = CAST(AOME_SET_ENABLEAUTOBWDREF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_noise_sensitivity(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.noise_sensitivity = CAST(AV1E_SET_NOISE_SENSITIVITY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_sharpness(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.sharpness = CAST(AOME_SET_SHARPNESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_static_thresh(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.static_thresh = CAST(AOME_SET_STATIC_THRESHOLD, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.row_mt = CAST(AV1E_SET_ROW_MT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_tile_columns(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_columns = CAST(AV1E_SET_TILE_COLUMNS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tile_rows = CAST(AV1E_SET_TILE_ROWS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_tpl_model(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_tpl_model = CAST(AV1E_SET_ENABLE_TPL_MODEL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_keyframe_filtering( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_keyframe_filtering = + CAST(AV1E_SET_ENABLE_KEYFRAME_FILTERING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_arnr_max_frames(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_max_frames = CAST(AOME_SET_ARNR_MAXFRAMES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_arnr_strength(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.arnr_strength = CAST(AOME_SET_ARNR_STRENGTH, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_tuning(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tuning = CAST(AOME_SET_TUNING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_cq_level(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = 
ctx->extra_cfg; + extra_cfg.cq_level = CAST(AOME_SET_CQ_LEVEL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_rc_max_intra_bitrate_pct( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.rc_max_intra_bitrate_pct = + CAST(AOME_SET_MAX_INTRA_BITRATE_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_rc_max_inter_bitrate_pct( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.rc_max_inter_bitrate_pct = + CAST(AOME_SET_MAX_INTER_BITRATE_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.gf_cbr_boost_pct = CAST(AV1E_SET_GF_CBR_BOOST_PCT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_lossless(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.lossless = CAST(AV1E_SET_LOSSLESS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_cdef(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_cdef = CAST(AV1E_SET_ENABLE_CDEF, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_restoration = CAST(AV1E_SET_ENABLE_RESTORATION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_force_video_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.force_video_mode = CAST(AV1E_SET_FORCE_VIDEO_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_obmc(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_obmc = CAST(AV1E_SET_ENABLE_OBMC, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_disable_trellis_quant(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.disable_trellis_quant = CAST(AV1E_SET_DISABLE_TRELLIS_QUANT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_qm(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_qm = CAST(AV1E_SET_ENABLE_QM, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_y(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_y = CAST(AV1E_SET_QM_Y, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_u(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_u = CAST(AV1E_SET_QM_U, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_v(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_v = CAST(AV1E_SET_QM_V, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_qm_min(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = 
ctx->extra_cfg; + extra_cfg.qm_min = CAST(AV1E_SET_QM_MIN, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_qm_max(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.qm_max = CAST(AV1E_SET_QM_MAX, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_num_tg(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.num_tg = CAST(AV1E_SET_NUM_TG, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_mtu(aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.mtu_size = CAST(AV1E_SET_MTU, args); + return update_extra_cfg(ctx, &extra_cfg); +} +static aom_codec_err_t ctrl_set_timing_info_type(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.timing_info_type = CAST(AV1E_SET_TIMING_INFO_TYPE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_dual_filter(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_dual_filter = CAST(AV1E_SET_ENABLE_DUAL_FILTER, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_chroma_deltaq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_chroma_deltaq = CAST(AV1E_SET_ENABLE_CHROMA_DELTAQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_rect_partitions( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_rect_partitions = + CAST(AV1E_SET_ENABLE_RECT_PARTITIONS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_ab_partitions(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_ab_partitions = CAST(AV1E_SET_ENABLE_AB_PARTITIONS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_1to4_partitions( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_1to4_partitions = + CAST(AV1E_SET_ENABLE_1TO4_PARTITIONS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_min_partition_size(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.min_partition_size = CAST(AV1E_SET_MIN_PARTITION_SIZE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_max_partition_size(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.max_partition_size = CAST(AV1E_SET_MAX_PARTITION_SIZE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_intra_edge_filter( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_intra_edge_filter = + CAST(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_order_hint(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_order_hint = CAST(AV1E_SET_ENABLE_ORDER_HINT, args); + return update_extra_cfg(ctx, 
&extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_tx64(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_tx64 = CAST(AV1E_SET_ENABLE_TX64, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_flip_idtx(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_flip_idtx = CAST(AV1E_SET_ENABLE_FLIP_IDTX, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_dist_wtd_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_dist_wtd_comp = CAST(AV1E_SET_ENABLE_DIST_WTD_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_max_reference_frames(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.max_reference_frames = CAST(AV1E_SET_MAX_REFERENCE_FRAMES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_reduced_reference_set( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_reduced_reference_set = + CAST(AV1E_SET_REDUCED_REFERENCE_SET, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_ref_frame_mvs(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_ref_frame_mvs = CAST(AV1E_SET_ENABLE_REF_FRAME_MVS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_allow_ref_frame_mvs(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.allow_ref_frame_mvs = CAST(AV1E_SET_ALLOW_REF_FRAME_MVS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_masked_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_masked_comp = CAST(AV1E_SET_ENABLE_MASKED_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_onesided_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_onesided_comp = CAST(AV1E_SET_ENABLE_ONESIDED_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_interintra_comp( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_interintra_comp = + CAST(AV1E_SET_ENABLE_INTERINTRA_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_smooth_interintra( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_smooth_interintra = + CAST(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_diff_wtd_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_diff_wtd_comp = CAST(AV1E_SET_ENABLE_DIFF_WTD_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_interinter_wedge( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_interinter_wedge = + 
CAST(AV1E_SET_ENABLE_INTERINTER_WEDGE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_interintra_wedge( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_interintra_wedge = + CAST(AV1E_SET_ENABLE_INTERINTRA_WEDGE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_global_motion(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_global_motion = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_warped_motion = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_allow_warped_motion(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.allow_warped_motion = CAST(AV1E_SET_ALLOW_WARPED_MOTION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_filter_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_filter_intra = CAST(AV1E_SET_ENABLE_FILTER_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_smooth_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_smooth_intra = CAST(AV1E_SET_ENABLE_SMOOTH_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_paeth_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_paeth_intra = CAST(AV1E_SET_ENABLE_PAETH_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_cfl_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_cfl_intra = CAST(AV1E_SET_ENABLE_CFL_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_superres = CAST(AV1E_SET_ENABLE_SUPERRES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_overlay(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_overlay = CAST(AV1E_SET_ENABLE_OVERLAY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_palette(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_palette = CAST(AV1E_SET_ENABLE_PALETTE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_intrabc(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_intrabc = CAST(AV1E_SET_ENABLE_INTRABC, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_angle_delta(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_angle_delta = CAST(AV1E_SET_ENABLE_ANGLE_DELTA, args); + return 
update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_error_resilient_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.error_resilient_mode = CAST(AV1E_SET_ERROR_RESILIENT_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_s_frame_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.s_frame_mode = CAST(AV1E_SET_S_FRAME_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_frame_parallel_decoding_mode( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.frame_parallel_decoding_mode = + CAST(AV1E_SET_FRAME_PARALLEL_DECODING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_single_tile_decoding(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.single_tile_decoding = CAST(AV1E_SET_SINGLE_TILE_DECODING, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.aq_mode = CAST(AV1E_SET_AQ_MODE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_reduced_tx_type_set(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.reduced_tx_type_set = CAST(AV1E_SET_REDUCED_TX_TYPE_SET, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_intra_dct_only(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_intra_dct_only = CAST(AV1E_SET_INTRA_DCT_ONLY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_inter_dct_only(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_inter_dct_only = CAST(AV1E_SET_INTER_DCT_ONLY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_intra_default_tx_only(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_intra_default_tx_only = + CAST(AV1E_SET_INTRA_DEFAULT_TX_ONLY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_quant_b_adapt(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.quant_b_adapt = CAST(AV1E_SET_QUANT_B_ADAPT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_coeff_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.coeff_cost_upd_freq = CAST(AV1E_SET_COEFF_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_mode_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.mode_cost_upd_freq = CAST(AV1E_SET_MODE_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_mv_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.mv_cost_upd_freq = CAST(AV1E_SET_MV_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t 
+static aom_codec_err_t ctrl_set_vmaf_model_path(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.vmaf_model_path = CAST(AV1E_SET_VMAF_MODEL_PATH, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_film_grain_test_vector(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.film_grain_test_vector =
+      CAST(AV1E_SET_FILM_GRAIN_TEST_VECTOR, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.film_grain_table_filename = CAST(AV1E_SET_FILM_GRAIN_TABLE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_denoise_noise_level(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+#if !CONFIG_DENOISE
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#else
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.noise_level =
+      ((float)CAST(AV1E_SET_DENOISE_NOISE_LEVEL, args)) / 10.0f;
+  return update_extra_cfg(ctx, &extra_cfg);
+#endif
+}
+
+static aom_codec_err_t ctrl_set_denoise_block_size(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+#if !CONFIG_DENOISE
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#else
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.noise_block_size = CAST(AV1E_SET_DENOISE_BLOCK_SIZE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+#endif
+}
+
+static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.deltaq_mode = CAST(AV1E_SET_DELTAQ_MODE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_deltalf_mode(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.deltalf_mode = CAST(AV1E_SET_DELTALF_MODE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_min_gf_interval(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.min_gf_interval = CAST(AV1E_SET_MIN_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_max_gf_interval(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.max_gf_interval = CAST(AV1E_SET_MAX_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_gf_min_pyr_height(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.gf_min_pyr_height = CAST(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_gf_max_pyr_height(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.gf_max_pyr_height = CAST(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_frame_periodic_boost(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.frame_periodic_boost = CAST(AV1E_SET_FRAME_PERIODIC_BOOST, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
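Note the scaling in ctrl_set_denoise_noise_level above: the integer control value is divided by 10 before being stored as a float. An illustrative call (editorial sketch, not part of the patch):

    /* With CONFIG_DENOISE built in, a control value of 25 yields
     * extra_cfg.noise_level == 2.5f (25 / 10.0f) inside the handler. */
    aom_codec_control(&codec, AV1E_SET_DENOISE_NOISE_LEVEL, 25);
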
+static aom_codec_err_t ctrl_enable_motion_vector_unit_test(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.motion_vector_unit_test =
+      CAST(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_enable_ext_tile_debug(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.ext_tile_debug = CAST(AV1E_ENABLE_EXT_TILE_DEBUG, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_target_seq_level_idx(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  const int val = CAST(AV1E_SET_TARGET_SEQ_LEVEL_IDX, args);
+  const int level = val % 100;
+  const int operating_point_idx = val / 100;
+  if (operating_point_idx >= 0 &&
+      operating_point_idx < MAX_NUM_OPERATING_POINTS) {
+    extra_cfg.target_seq_level_idx[operating_point_idx] = (AV1_LEVEL)level;
+  }
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_tier_mask(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.tier_mask = CAST(AV1E_SET_TIER_MASK, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_min_cr(aom_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.min_cr = CAST(AV1E_SET_MIN_CR, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_enable_sb_multipass_unit_test(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.sb_multipass_unit_test =
+      CAST(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
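ctrl_set_target_seq_level_idx decodes its argument as operating_point_idx * 100 + level, as the val % 100 / val / 100 split above shows. An illustrative call (editorial sketch, not part of the patch; the 2 + (idx >> 2) "." (idx & 3) level numbering is the usual AV1 convention):

    /* 213 -> operating point 2, seq level index 13 (nominally AV1 level 5.1). */
    aom_codec_control(&codec, AV1E_SET_TARGET_SEQ_LEVEL_IDX, 213);
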
+#if !CONFIG_REALTIME_ONLY
+static aom_codec_err_t create_stats_buffer(
+    FIRSTPASS_STATS **frame_stats_buffer, STATS_BUFFER_CTX *stats_buf_context,
+    int num_lap_buffers) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+
+  int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS);
+  *frame_stats_buffer =
+      (FIRSTPASS_STATS *)aom_calloc(size, sizeof(FIRSTPASS_STATS));
+  if (*frame_stats_buffer == NULL) return AOM_CODEC_MEM_ERROR;
+
+  stats_buf_context->stats_in_start = *frame_stats_buffer;
+  stats_buf_context->stats_in_end = stats_buf_context->stats_in_start;
+  stats_buf_context->stats_in_buf_end =
+      stats_buf_context->stats_in_start + size;
+
+  stats_buf_context->total_left_stats = aom_calloc(1, sizeof(FIRSTPASS_STATS));
+  if (stats_buf_context->total_left_stats == NULL) return AOM_CODEC_MEM_ERROR;
+  av1_twopass_zero_stats(stats_buf_context->total_left_stats);
+  stats_buf_context->total_stats = aom_calloc(1, sizeof(FIRSTPASS_STATS));
+  if (stats_buf_context->total_stats == NULL) return AOM_CODEC_MEM_ERROR;
+  av1_twopass_zero_stats(stats_buf_context->total_stats);
+  return res;
+}
+#endif
+
+static aom_codec_err_t create_context_and_bufferpool(
+    AV1_COMP **p_cpi, BufferPool **p_buffer_pool, AV1EncoderConfig *oxcf,
+    struct aom_codec_pkt_list *pkt_list_head, FIRSTPASS_STATS *frame_stats_buf,
+    COMPRESSOR_STAGE stage, int num_lap_buffers, int lap_lag_in_frames,
+    STATS_BUFFER_CTX *stats_buf_context) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+
+  *p_buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+  if (*p_buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+  if (pthread_mutex_init(&((*p_buffer_pool)->pool_mutex), NULL)) {
+    return AOM_CODEC_MEM_ERROR;
+  }
+#endif
+  *p_cpi = av1_create_compressor(oxcf, *p_buffer_pool, frame_stats_buf, stage,
+                                 num_lap_buffers, lap_lag_in_frames,
+                                 stats_buf_context);
+  if (*p_cpi == NULL)
+    res = AOM_CODEC_MEM_ERROR;
+  else
+    (*p_cpi)->output_pkt_list = pkt_list_head;
+
+  return res;
+}
+
+static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+
+  if (ctx->priv == NULL) {
+    aom_codec_alg_priv_t *const priv = aom_calloc(1, sizeof(*priv));
+    if (priv == NULL) return AOM_CODEC_MEM_ERROR;
+
+    ctx->priv = (aom_codec_priv_t *)priv;
+    ctx->priv->init_flags = ctx->init_flags;
+
+    if (ctx->config.enc) {
+      // Update the reference to the config structure to an internal copy.
+      priv->cfg = *ctx->config.enc;
+      ctx->config.enc = &priv->cfg;
+    }
+
+    priv->extra_cfg = default_extra_cfg;
+    aom_once(av1_initialize_enc);
+
+    res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
+
+    if (res == AOM_CODEC_OK) {
+      int *num_lap_buffers = &priv->num_lap_buffers;
+      int lap_lag_in_frames = 0;
+      *num_lap_buffers = 0;
+      priv->timestamp_ratio.den = priv->cfg.g_timebase.den;
+      priv->timestamp_ratio.num =
+          (int64_t)priv->cfg.g_timebase.num * TICKS_PER_SEC;
+      reduce_ratio(&priv->timestamp_ratio);
+
+      set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+      if (priv->oxcf.rc_mode == AOM_Q && priv->oxcf.pass == 0 &&
+          priv->oxcf.mode == GOOD) {
+        // Enable look ahead
+        *num_lap_buffers = priv->cfg.g_lag_in_frames;
+        *num_lap_buffers =
+            clamp(*num_lap_buffers, 1,
+                  AOMMIN(MAX_LAP_BUFFERS,
+                         priv->oxcf.key_freq + SCENE_CUT_KEY_TEST_INTERVAL));
+        if ((int)priv->cfg.g_lag_in_frames - (*num_lap_buffers) >=
+            LAP_LAG_IN_FRAMES) {
+          lap_lag_in_frames = LAP_LAG_IN_FRAMES;
+        }
+      }
+      priv->oxcf.use_highbitdepth =
+          (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+
+#if !CONFIG_REALTIME_ONLY
+      res = create_stats_buffer(&priv->frame_stats_buffer,
+                                &priv->stats_buf_context, *num_lap_buffers);
+      if (res != AOM_CODEC_OK) return AOM_CODEC_MEM_ERROR;
+#endif
+
+      res = create_context_and_bufferpool(
+          &priv->cpi, &priv->buffer_pool, &priv->oxcf, &priv->pkt_list.head,
+          priv->frame_stats_buffer, ENCODE_STAGE, *num_lap_buffers, -1,
+          &priv->stats_buf_context);
+
+      // Create another compressor if look ahead is enabled
+      if (res == AOM_CODEC_OK && *num_lap_buffers) {
+        res = create_context_and_bufferpool(
+            &priv->cpi_lap, &priv->buffer_pool_lap, &priv->oxcf, NULL,
+            priv->frame_stats_buffer, LAP_STAGE, *num_lap_buffers,
+            clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS),
+            &priv->stats_buf_context);
+      }
+    }
+  }
+
+  return res;
+}
+
+static void destroy_context_and_bufferpool(AV1_COMP *cpi,
+                                           BufferPool *buffer_pool) {
+  av1_remove_compressor(cpi);
+#if CONFIG_MULTITHREAD
+  if (buffer_pool) pthread_mutex_destroy(&buffer_pool->pool_mutex);
+#endif
+  aom_free(buffer_pool);
+}
+
+static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context,
+                                 FIRSTPASS_STATS *frame_stats_buffer) {
+  aom_free(stats_buf_context->total_left_stats);
+  aom_free(stats_buf_context->total_stats);
+  aom_free(frame_stats_buffer);
+}
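Two details in encoder_init above are worth spelling out. First, the timestamp ratio: with g_timebase = 1/30 and TICKS_PER_SEC as defined elsewhere in this tree (10000000, assuming the usual definition), reduce_ratio() turns 10000000/30 into 1000000/3, so one timebase unit maps to 1000000/3 internal ticks. Second, create_context_and_bufferpool is invoked up to twice -- once for the main ENCODE_STAGE compressor and once for the LAP ("look-ahead processing") compressor -- and both share the first-pass stats buffers. A sketch of that pairing (editorial; variable names here are hypothetical shorthand for the priv-> fields above):

    create_context_and_bufferpool(&cpi, &pool, &oxcf, pkts, stats,
                                  ENCODE_STAGE, num_lap, -1, &stats_ctx);
    if (num_lap)  /* look-ahead enabled: second compressor, no packet list */
      create_context_and_bufferpool(&cpi_lap, &pool_lap, &oxcf, NULL, stats,
                                    LAP_STAGE, num_lap, lag, &stats_ctx);
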
+static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) {
+  free(ctx->cx_data);
+  destroy_context_and_bufferpool(ctx->cpi, ctx->buffer_pool);
+  if (ctx->cpi_lap) {
+    // As both cpi and cpi_lap share the same lookahead_ctx, it is already
+    // freed when destroy is called on cpi. Set lookahead_ctx to NULL here so
+    // that it is not freed a second time.
+    ctx->cpi_lap->lookahead = NULL;
+    destroy_context_and_bufferpool(ctx->cpi_lap, ctx->buffer_pool_lap);
+  }
+  destroy_stats_buffer(&ctx->stats_buf_context, ctx->frame_stats_buffer);
+  aom_free(ctx);
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi,
+                                                   unsigned int lib_flags) {
+  aom_codec_frame_flags_t flags = lib_flags << 16;
+
+  if (lib_flags & FRAMEFLAGS_KEY) flags |= AOM_FRAME_IS_KEY;
+  if (lib_flags & FRAMEFLAGS_INTRAONLY) flags |= AOM_FRAME_IS_INTRAONLY;
+  if (lib_flags & FRAMEFLAGS_SWITCH) flags |= AOM_FRAME_IS_SWITCH;
+  if (lib_flags & FRAMEFLAGS_ERROR_RESILIENT)
+    flags |= AOM_FRAME_IS_ERROR_RESILIENT;
+  if (cpi->droppable) flags |= AOM_FRAME_IS_DROPPABLE;
+
+  return flags;
+}
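get_frame_pkt_flags maps the encoder's internal FRAMEFLAGS_* bits into the public AOM_FRAME_IS_* flags, keeping the raw lib_flags in the upper 16 bits. On the consuming side the public bits are all an application needs (editorial sketch, not part of the patch; pkt is an AOM_CODEC_CX_FRAME_PKT obtained from aom_codec_get_cx_data()):

    if (pkt->data.frame.flags & AOM_FRAME_IS_KEY) {
      /* safe random-access point; e.g. start a new segment here */
    }
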
+// TODO(Mufaddal): Check feasibility of abstracting functions related to LAP
+// into a separate function.
+static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
+                                      const aom_image_t *img,
+                                      aom_codec_pts_t pts,
+                                      unsigned long duration,
+                                      aom_enc_frame_flags_t enc_flags) {
+  const size_t kMinCompressedSize = 8192;
+  volatile aom_codec_err_t res = AOM_CODEC_OK;
+  AV1_COMP *const cpi = ctx->cpi;
+  const aom_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio;
+  volatile aom_codec_pts_t ptsvol = pts;
+  // LAP context
+  AV1_COMP *cpi_lap = ctx->cpi_lap;
+
+  if (cpi == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  if (cpi->lap_enabled && cpi_lap == NULL && cpi->oxcf.pass == 0)
+    return AOM_CODEC_INVALID_PARAM;
+
+  if (img != NULL) {
+    res = validate_img(ctx, img);
+    // TODO(jzern) the checks related to cpi's validity should be treated as a
+    // failure condition, encoder setup is done fully in init() currently.
+    if (res == AOM_CODEC_OK) {
+      size_t data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
+                       ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) *
+                       get_image_bps(img);
+      if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize;
+      if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
+        ctx->cx_data_sz = data_sz;
+        free(ctx->cx_data);
+        ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
+        if (ctx->cx_data == NULL) {
+          return AOM_CODEC_MEM_ERROR;
+        }
+      }
+    }
+  }
+  if (ctx->oxcf.mode != GOOD && ctx->oxcf.mode != REALTIME) {
+    ctx->oxcf.mode = GOOD;
+    av1_change_config(ctx->cpi, &ctx->oxcf);
+  }
+
+  if (!ctx->pts_offset_initialized) {
+    ctx->pts_offset = ptsvol;
+    ctx->pts_offset_initialized = 1;
+  }
+  ptsvol -= ctx->pts_offset;
+
+  aom_codec_pkt_list_init(&ctx->pkt_list);
+
+  volatile aom_enc_frame_flags_t flags = enc_flags;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(cpi->common.error.jmp)) {
+    cpi->common.error.setjmp = 0;
+    res = update_error_state(ctx, &cpi->common.error);
+    aom_clear_system_state();
+    return res;
+  }
+  cpi->common.error.setjmp = 1;
+  if (cpi_lap != NULL) {
+    if (setjmp(cpi_lap->common.error.jmp)) {
+      cpi_lap->common.error.setjmp = 0;
+      res = update_error_state(ctx, &cpi_lap->common.error);
+      aom_clear_system_state();
+      return res;
+    }
+    cpi_lap->common.error.setjmp = 1;
+  }
+
+  // Note(yunqing): While applying encoding flags, always start by enabling
+  // all, then modify according to the flags. The previous frame's flags are
+  // overwritten.
+  av1_apply_encoding_flags(cpi, flags);
+  if (cpi_lap != NULL) {
+    av1_apply_encoding_flags(cpi_lap, flags);
+  }
+
+  // Handle fixed keyframe intervals
+  if (is_stat_generation_stage(cpi)) {
+    if (ctx->cfg.kf_mode == AOM_KF_AUTO &&
+        ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
+      if (cpi->common.spatial_layer_id == 0 &&
+          ++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
+        flags |= AOM_EFLAG_FORCE_KF;
+        ctx->fixed_kf_cntr = 1;
+      }
+    }
+  }
+
+  if (res == AOM_CODEC_OK) {
+    int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, ptsvol);
+    int64_t dst_end_time_stamp =
+        timebase_units_to_ticks(timestamp_ratio, ptsvol + duration);
+
+    // Set up internal flags
+    if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
+
+    if (img != NULL) {
+      YV12_BUFFER_CONFIG sd;
+      int use_highbitdepth, subsampling_x, subsampling_y;
+      res = image2yuvconfig(img, &sd);
+      use_highbitdepth = (sd.flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+      subsampling_x = sd.subsampling_x;
+      subsampling_y = sd.subsampling_y;
+
+      if (!cpi->lookahead) {
+        int lag_in_frames = cpi_lap != NULL ? cpi_lap->oxcf.lag_in_frames
+                                            : cpi->oxcf.lag_in_frames;
+
+        cpi->lookahead = av1_lookahead_init(
+            cpi->oxcf.width, cpi->oxcf.height, subsampling_x, subsampling_y,
+            use_highbitdepth, lag_in_frames, cpi->oxcf.border_in_pixels,
+            cpi->common.features.byte_alignment, ctx->num_lap_buffers);
+      }
+      if (!cpi->lookahead)
+        aom_internal_error(&cpi->common.error, AOM_CODEC_MEM_ERROR,
+                           "Failed to allocate lag buffers");
+
+      av1_check_initial_width(cpi, use_highbitdepth, subsampling_x,
+                              subsampling_y);
+      if (cpi_lap != NULL) {
+        cpi_lap->lookahead = cpi->lookahead;
+        av1_check_initial_width(cpi_lap, use_highbitdepth, subsampling_x,
+                                subsampling_y);
+      }
+
+      // Store the original flags in to the frame buffer. Will extract the
+      // key frame flag when we actually encode this frame.
+      if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
+                                dst_time_stamp, dst_end_time_stamp)) {
+        res = update_error_state(ctx, &cpi->common.error);
+      }
+      ctx->next_frame_flags = 0;
+    }
+
+    unsigned char *cx_data = ctx->cx_data;
+    size_t cx_data_sz = ctx->cx_data_sz;
+
+    assert(!(cx_data == NULL && cx_data_sz != 0));
+
+    /* Any pending invisible frames? */
+    if (ctx->pending_cx_data) {
+      memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz);
+      ctx->pending_cx_data = cx_data;
+      cx_data += ctx->pending_cx_data_sz;
+      cx_data_sz -= ctx->pending_cx_data_sz;
+
+      /* TODO: this is a minimal check, the underlying codec doesn't respect
+       * the buffer size anyway.
+       */
+      if (cx_data_sz < ctx->cx_data_sz / 2) {
+        aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+                           "Compressed data buffer too small");
+      }
+    }
+
+    size_t frame_size = 0;
+    unsigned int lib_flags = 0;
+    int is_frame_visible = 0;
+    int index_size = 0;
+    int has_fwd_keyframe = 0;
+
+    // Call for LAP stage
+    if (cpi_lap != NULL) {
+      int status;
+      aom_rational64_t timestamp_ratio_la = *timestamp_ratio;
+      int64_t dst_time_stamp_la = dst_time_stamp;
+      int64_t dst_end_time_stamp_la = dst_end_time_stamp;
+      status = av1_get_compressed_data(
+          cpi_lap, &lib_flags, &frame_size, NULL, &dst_time_stamp_la,
+          &dst_end_time_stamp_la, !img, &timestamp_ratio_la);
+      if (status != -1) {
+        if (status != AOM_CODEC_OK) {
+          aom_internal_error(&cpi_lap->common.error, AOM_CODEC_ERROR, NULL);
+        }
+        cpi_lap->seq_params_locked = 1;
+      }
+      lib_flags = 0;
+      frame_size = 0;
+    }
+
+    // invisible frames get packed with the next visible frame
+    while (cx_data_sz - index_size >= ctx->cx_data_sz / 2 &&
+           !is_frame_visible) {
+      const int status = av1_get_compressed_data(
+          cpi, &lib_flags, &frame_size, cx_data, &dst_time_stamp,
+          &dst_end_time_stamp, !img, timestamp_ratio);
+      if (status == -1) break;
+      if (status != AOM_CODEC_OK) {
+        aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+      }
+
+      cpi->seq_params_locked = 1;
+      if (frame_size) {
+        if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data;
+
+        const int write_temporal_delimiter =
+            !cpi->common.spatial_layer_id && !ctx->pending_frame_count;
+
+        if (write_temporal_delimiter) {
+          uint32_t obu_header_size = 1;
+          const uint32_t obu_payload_size = 0;
+          const size_t length_field_size =
+              aom_uleb_size_in_bytes(obu_payload_size);
+
+          if (ctx->pending_cx_data) {
+            const size_t move_offset = length_field_size + 1;
+            memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
+                    frame_size);
+          }
+          const uint32_t obu_header_offset = 0;
+          obu_header_size = av1_write_obu_header(
+              &cpi->level_params, OBU_TEMPORAL_DELIMITER, 0,
+              (uint8_t *)(ctx->pending_cx_data + obu_header_offset));
+
+          // OBUs are preceded/succeeded by an unsigned leb128 coded integer.
+          if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size,
+                                      ctx->pending_cx_data) != AOM_CODEC_OK) {
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+          }
+
+          frame_size +=
+              obu_header_size + obu_payload_size + length_field_size;
+        }
+
+        if (ctx->oxcf.save_as_annexb) {
+          size_t curr_frame_size = frame_size;
+          if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) !=
+              AOM_CODEC_OK) {
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+          }
+          frame_size = curr_frame_size;
+
+          // B_PRIME (add frame size)
+          const size_t length_field_size = aom_uleb_size_in_bytes(frame_size);
+          if (ctx->pending_cx_data) {
+            const size_t move_offset = length_field_size;
+            memmove(cx_data + move_offset, cx_data, frame_size);
+          }
+          if (av1_write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) !=
+              AOM_CODEC_OK) {
+            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+          }
+          frame_size += length_field_size;
+        }
+
+        ctx->pending_frame_sizes[ctx->pending_frame_count++] = frame_size;
+        ctx->pending_cx_data_sz += frame_size;
+
+        cx_data += frame_size;
+        cx_data_sz -= frame_size;
+
+        index_size = MAG_SIZE * (ctx->pending_frame_count - 1) + 2;
+
+        is_frame_visible = cpi->common.show_frame;
+
+        has_fwd_keyframe |=
+            (!is_frame_visible &&
+             cpi->common.current_frame.frame_type == KEY_FRAME);
+      }
+    }
+    if (is_frame_visible) {
+      // Add the frame packet to the list of returned packets.
+      aom_codec_cx_pkt_t pkt;
+
+      if (ctx->oxcf.save_as_annexb) {
+        // B_PRIME (add TU size)
+        size_t tu_size = ctx->pending_cx_data_sz;
+        const size_t length_field_size = aom_uleb_size_in_bytes(tu_size);
+        if (ctx->pending_cx_data) {
+          const size_t move_offset = length_field_size;
+          memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
+                  tu_size);
+        }
+        if (av1_write_uleb_obu_size(0, (uint32_t)tu_size,
+                                    ctx->pending_cx_data) != AOM_CODEC_OK) {
+          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+        }
+        ctx->pending_cx_data_sz += length_field_size;
+      }
+
+      pkt.kind = AOM_CODEC_CX_FRAME_PKT;
+
+      pkt.data.frame.buf = ctx->pending_cx_data;
+      pkt.data.frame.sz = ctx->pending_cx_data_sz;
+      pkt.data.frame.partition_id = -1;
+      pkt.data.frame.vis_frame_size = frame_size;
+
+      pkt.data.frame.pts =
+          ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
+          ctx->pts_offset;
+      pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+      if (has_fwd_keyframe) {
+        // If one of the invisible frames in the packet is a keyframe, set
+        // the delayed random access point flag.
+        pkt.data.frame.flags |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT;
+      }
+      pkt.data.frame.duration = (uint32_t)ticks_to_timebase_units(
+          timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
+
+      aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+      ctx->pending_cx_data = NULL;
+      ctx->pending_cx_data_sz = 0;
+      ctx->pending_frame_count = 0;
+    }
+  }
+
+  cpi->common.error.setjmp = 0;
+  return res;
+}
+
+static const aom_codec_cx_pkt_t *encoder_get_cxdata(aom_codec_alg_priv_t *ctx,
+                                                    aom_codec_iter_t *iter) {
+  return aom_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+}
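encoder_encode queues each finished temporal unit on ctx->pkt_list, and encoder_get_cxdata drains it through the public iterator API. The usual application loop looks like this (editorial sketch, not part of the patch; assumes `codec` was opened with aom_codec_enc_init() and `out` is an open FILE *):

    aom_codec_iter_t iter = NULL;
    const aom_codec_cx_pkt_t *pkt;
    while ((pkt = aom_codec_get_cx_data(&codec, &iter)) != NULL) {
      if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
        /* one TU: any invisible frames plus the visible frame they precede */
        fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, out);
      }
    }
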
+static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+
+  if (frame != NULL) {
+    YV12_BUFFER_CONFIG sd;
+
+    image2yuvconfig(&frame->img, &sd);
+    av1_set_reference_enc(ctx->cpi, frame->idx, &sd);
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+
+  if (frame != NULL) {
+    YV12_BUFFER_CONFIG sd;
+
+    image2yuvconfig(&frame->img, &sd);
+    av1_copy_reference_enc(ctx->cpi, frame->idx, &sd);
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+
+  if (frame != NULL) {
+    YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx);
+    if (fb == NULL) return AOM_CODEC_ERROR;
+
+    yuvconfig2image(&frame->img, fb, NULL);
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  aom_image_t *const new_img = va_arg(args, aom_image_t *);
+
+  if (new_img != NULL) {
+    YV12_BUFFER_CONFIG new_frame;
+
+    if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+      yuvconfig2image(new_img, &new_frame, NULL);
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  aom_image_t *const new_img = va_arg(args, aom_image_t *);
+
+  if (new_img != NULL) {
+    YV12_BUFFER_CONFIG new_frame;
+
+    if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+      YV12_BUFFER_CONFIG sd;
+      image2yuvconfig(new_img, &sd);
+      return av1_copy_new_frame_enc(&ctx->cpi->common, &new_frame, &sd);
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_image_t *encoder_get_preview(aom_codec_alg_priv_t *ctx) {
+  YV12_BUFFER_CONFIG sd;
+
+  if (av1_get_preview_raw_frame(ctx->cpi, &sd) == 0) {
+    yuvconfig2image(&ctx->preview_img, &sd, NULL);
+    return &ctx->preview_img;
+  } else {
+    return NULL;
+  }
+}
+
+static aom_codec_err_t ctrl_use_reference(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  const int reference_flag = va_arg(args, int);
+
+  av1_use_as_reference(&ctx->cpi->ext_flags.ref_frame_flags, reference_flag);
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_roi_map(aom_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  (void)ctx;
+  (void)args;
+
+  // TODO(yaowu): Need to re-implement and test for AV1.
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+static aom_codec_err_t ctrl_set_active_map(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_active_map_t *const map = va_arg(args, aom_active_map_t *);
+
+  if (map) {
+    if (!av1_set_active_map(ctx->cpi, map->active_map, (int)map->rows,
+                            (int)map->cols))
+      return AOM_CODEC_OK;
+    else
+      return AOM_CODEC_INVALID_PARAM;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_codec_err_t ctrl_get_active_map(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_active_map_t *const map = va_arg(args, aom_active_map_t *);
+
+  if (map) {
+    if (!av1_get_active_map(ctx->cpi, map->active_map, (int)map->rows,
+                            (int)map->cols))
+      return AOM_CODEC_OK;
+    else
+      return AOM_CODEC_INVALID_PARAM;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+
+static aom_codec_err_t ctrl_set_scale_mode(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *);
+
+  if (mode) {
+    const int res = av1_set_internal_size(
+        &ctx->cpi->oxcf, &ctx->cpi->resize_pending_params,
+        (AOM_SCALING)mode->h_scaling_mode, (AOM_SCALING)mode->v_scaling_mode);
+    return (res == 0) ? AOM_CODEC_OK : AOM_CODEC_INVALID_PARAM;
+  } else {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+}
+static aom_codec_err_t ctrl_set_spatial_layer_id(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  const int spatial_layer_id = va_arg(args, int);
+  if (spatial_layer_id >= MAX_NUM_SPATIAL_LAYERS)
+    return AOM_CODEC_INVALID_PARAM;
+  ctx->cpi->common.spatial_layer_id = spatial_layer_id;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_number_spatial_layers(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  const int number_spatial_layers = va_arg(args, int);
+  if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS)
+    return AOM_CODEC_INVALID_PARAM;
+  ctx->cpi->common.number_spatial_layers = number_spatial_layers;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_layer_id(aom_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  aom_svc_layer_id_t *const data = va_arg(args, aom_svc_layer_id_t *);
+  ctx->cpi->common.spatial_layer_id = data->spatial_layer_id;
+  ctx->cpi->common.temporal_layer_id = data->temporal_layer_id;
+  ctx->cpi->svc.spatial_layer_id = data->spatial_layer_id;
+  ctx->cpi->svc.temporal_layer_id = data->temporal_layer_id;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  AV1_COMP *const cpi = ctx->cpi;
+  aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *);
+  cpi->common.number_spatial_layers = params->number_spatial_layers;
+  cpi->common.number_temporal_layers = params->number_temporal_layers;
+  cpi->svc.number_spatial_layers = params->number_spatial_layers;
+  cpi->svc.number_temporal_layers = params->number_temporal_layers;
+  if (cpi->common.number_spatial_layers > 1 ||
+      cpi->common.number_temporal_layers > 1) {
+    unsigned int sl, tl;
+    cpi->use_svc = 1;
+    for (sl = 0; sl < cpi->common.number_spatial_layers; ++sl) {
+      for (tl = 0; tl < cpi->common.number_temporal_layers; ++tl) {
+        const int layer =
+            LAYER_IDS_TO_IDX(sl, tl, cpi->common.number_temporal_layers);
+        LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+        lc->max_q = params->max_quantizers[layer];
+        lc->min_q = params->min_quantizers[layer];
+        lc->scaling_factor_num = params->scaling_factor_num[sl];
+        lc->scaling_factor_den = params->scaling_factor_den[sl];
+        lc->layer_target_bitrate = 1000 * params->layer_target_bitrate[layer];
+        lc->framerate_factor = params->framerate_factor[tl];
+      }
+    }
+    if (cpi->common.current_frame.frame_number == 0)
+      av1_init_layer_context(cpi);
+    else
+      av1_update_layer_context_change_config(cpi, cpi->oxcf.target_bandwidth);
+  }
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_svc_ref_frame_config(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  AV1_COMP *const cpi = ctx->cpi;
+  aom_svc_ref_frame_config_t *const data =
+      va_arg(args, aom_svc_ref_frame_config_t *);
+  cpi->svc.external_ref_frame_config = 1;
+  for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    cpi->svc.reference[i] = data->reference[i];
+    cpi->svc.ref_idx[i] = data->ref_idx[i];
+  }
+  for (unsigned int i = 0; i < REF_FRAMES; ++i)
+    cpi->svc.refresh[i] = data->refresh[i];
+  return AOM_CODEC_OK;
+}
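ctrl_set_svc_params copies per-layer rate parameters indexed by LAYER_IDS_TO_IDX, with bitrates given in kbps (the handler multiplies by 1000). A minimal one-spatial, two-temporal-layer sketch from the caller's side (editorial, not part of the patch; values illustrative):

    aom_svc_params_t svc = { 0 };
    svc.number_spatial_layers = 1;
    svc.number_temporal_layers = 2;
    svc.scaling_factor_num[0] = 1;
    svc.scaling_factor_den[0] = 1;
    for (int tl = 0; tl < 2; ++tl) {
      svc.layer_target_bitrate[tl] = (tl == 0) ? 400 : 600;  /* kbps */
      svc.min_quantizers[tl] = 0;
      svc.max_quantizers[tl] = 63;
      svc.framerate_factor[tl] = (tl == 0) ? 2 : 1;
    }
    aom_codec_control(&codec, AV1E_SET_SVC_PARAMS, &svc);
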
+static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.content = CAST(AV1E_SET_TUNE_CONTENT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_cdf_update_mode(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.cdf_update_mode = CAST(AV1E_SET_CDF_UPDATE_MODE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_color_primaries(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.color_primaries = CAST(AV1E_SET_COLOR_PRIMARIES, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_transfer_characteristics(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.transfer_characteristics =
+      CAST(AV1E_SET_TRANSFER_CHARACTERISTICS, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_matrix_coefficients(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.matrix_coefficients = CAST(AV1E_SET_MATRIX_COEFFICIENTS, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_chroma_sample_position(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.chroma_sample_position =
+      CAST(AV1E_SET_CHROMA_SAMPLE_POSITION, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_color_range(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.color_range = CAST(AV1E_SET_COLOR_RANGE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_render_size(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  int *const render_size = va_arg(args, int *);
+  extra_cfg.render_width = render_size[0];
+  extra_cfg.render_height = render_size[1];
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.superblock_size = CAST(AV1E_SET_SUPERBLOCK_SIZE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_chroma_subsampling_x(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.chroma_subsampling_x = CAST(AV1E_SET_CHROMA_SUBSAMPLING_X, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.chroma_subsampling_y = CAST(AV1E_SET_CHROMA_SUBSAMPLING_Y, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  int *const arg = va_arg(args, int *);
+  const AV1_COMP *const cpi = ctx->cpi;
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  return av1_get_seq_level_idx(&cpi->common.seq_params, &cpi->level_params,
+                               arg);
+}
+static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
+  { AV1_COPY_REFERENCE, ctrl_copy_reference },
+  { AOME_USE_REFERENCE, ctrl_use_reference },
+
+  // Setters
+  { AV1_SET_REFERENCE, ctrl_set_reference },
+  { AOME_SET_ROI_MAP, ctrl_set_roi_map },
+  { AOME_SET_ACTIVEMAP, ctrl_set_active_map },
+  { AOME_SET_SCALEMODE, ctrl_set_scale_mode },
+  { AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id },
+  { AOME_SET_CPUUSED, ctrl_set_cpuused },
+  { AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref },
+  { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref },
+  { AOME_SET_SHARPNESS, ctrl_set_sharpness },
+  { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
+  { AV1E_SET_ROW_MT, ctrl_set_row_mt },
+  { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
+  { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows },
+  { AV1E_SET_ENABLE_TPL_MODEL, ctrl_set_enable_tpl_model },
+  { AV1E_SET_ENABLE_KEYFRAME_FILTERING, ctrl_set_enable_keyframe_filtering },
+  { AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
+  { AOME_SET_ARNR_STRENGTH, ctrl_set_arnr_strength },
+  { AOME_SET_TUNING, ctrl_set_tuning },
+  { AOME_SET_CQ_LEVEL, ctrl_set_cq_level },
+  { AOME_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct },
+  { AOME_SET_NUMBER_SPATIAL_LAYERS, ctrl_set_number_spatial_layers },
+  { AV1E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct },
+  { AV1E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
+  { AV1E_SET_LOSSLESS, ctrl_set_lossless },
+  { AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef },
+  { AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration },
+  { AV1E_SET_FORCE_VIDEO_MODE, ctrl_set_force_video_mode },
+  { AV1E_SET_ENABLE_OBMC, ctrl_set_enable_obmc },
+  { AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant },
+  { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm },
+  { AV1E_SET_QM_Y, ctrl_set_qm_y },
+  { AV1E_SET_QM_U, ctrl_set_qm_u },
+  { AV1E_SET_QM_V, ctrl_set_qm_v },
+  { AV1E_SET_QM_MIN, ctrl_set_qm_min },
+  { AV1E_SET_QM_MAX, ctrl_set_qm_max },
+  { AV1E_SET_NUM_TG, ctrl_set_num_tg },
+  { AV1E_SET_MTU, ctrl_set_mtu },
+  { AV1E_SET_TIMING_INFO_TYPE, ctrl_set_timing_info_type },
+  { AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode },
+  { AV1E_SET_ERROR_RESILIENT_MODE, ctrl_set_error_resilient_mode },
+  { AV1E_SET_S_FRAME_MODE, ctrl_set_s_frame_mode },
+  { AV1E_SET_ENABLE_RECT_PARTITIONS, ctrl_set_enable_rect_partitions },
+  { AV1E_SET_ENABLE_AB_PARTITIONS, ctrl_set_enable_ab_partitions },
+  { AV1E_SET_ENABLE_1TO4_PARTITIONS, ctrl_set_enable_1to4_partitions },
+  { AV1E_SET_MIN_PARTITION_SIZE, ctrl_set_min_partition_size },
+  { AV1E_SET_MAX_PARTITION_SIZE, ctrl_set_max_partition_size },
+  { AV1E_SET_ENABLE_DUAL_FILTER, ctrl_set_enable_dual_filter },
+  { AV1E_SET_ENABLE_CHROMA_DELTAQ, ctrl_set_enable_chroma_deltaq },
+  { AV1E_SET_ENABLE_INTRA_EDGE_FILTER, ctrl_set_enable_intra_edge_filter },
+  { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint },
+  { AV1E_SET_ENABLE_TX64, ctrl_set_enable_tx64 },
+  { AV1E_SET_ENABLE_FLIP_IDTX, ctrl_set_enable_flip_idtx },
+  { AV1E_SET_ENABLE_DIST_WTD_COMP, ctrl_set_enable_dist_wtd_comp },
+  { AV1E_SET_MAX_REFERENCE_FRAMES, ctrl_set_max_reference_frames },
+  { AV1E_SET_REDUCED_REFERENCE_SET, ctrl_set_enable_reduced_reference_set },
+  { AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs },
+  { AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs },
+  { AV1E_SET_ENABLE_MASKED_COMP, ctrl_set_enable_masked_comp },
+  { AV1E_SET_ENABLE_ONESIDED_COMP, ctrl_set_enable_onesided_comp },
+  { AV1E_SET_ENABLE_INTERINTRA_COMP, ctrl_set_enable_interintra_comp },
+  { AV1E_SET_ENABLE_SMOOTH_INTERINTRA, ctrl_set_enable_smooth_interintra },
+  { AV1E_SET_ENABLE_DIFF_WTD_COMP, ctrl_set_enable_diff_wtd_comp },
+  { AV1E_SET_ENABLE_INTERINTER_WEDGE, ctrl_set_enable_interinter_wedge },
+  { AV1E_SET_ENABLE_INTERINTRA_WEDGE, ctrl_set_enable_interintra_wedge },
+  { AV1E_SET_ENABLE_GLOBAL_MOTION, ctrl_set_enable_global_motion },
+  { AV1E_SET_ENABLE_WARPED_MOTION, ctrl_set_enable_warped_motion },
+  { AV1E_SET_ALLOW_WARPED_MOTION, ctrl_set_allow_warped_motion },
+  { AV1E_SET_ENABLE_FILTER_INTRA, ctrl_set_enable_filter_intra },
+  { AV1E_SET_ENABLE_SMOOTH_INTRA, ctrl_set_enable_smooth_intra },
+  { AV1E_SET_ENABLE_PAETH_INTRA, ctrl_set_enable_paeth_intra },
+  { AV1E_SET_ENABLE_CFL_INTRA, ctrl_set_enable_cfl_intra },
+  { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres },
+  { AV1E_SET_ENABLE_OVERLAY, ctrl_set_enable_overlay },
+  { AV1E_SET_ENABLE_PALETTE, ctrl_set_enable_palette },
+  { AV1E_SET_ENABLE_INTRABC, ctrl_set_enable_intrabc },
+  { AV1E_SET_ENABLE_ANGLE_DELTA, ctrl_set_enable_angle_delta },
+  { AV1E_SET_AQ_MODE, ctrl_set_aq_mode },
+  { AV1E_SET_REDUCED_TX_TYPE_SET, ctrl_set_reduced_tx_type_set },
+  { AV1E_SET_INTRA_DCT_ONLY, ctrl_set_intra_dct_only },
+  { AV1E_SET_INTER_DCT_ONLY, ctrl_set_inter_dct_only },
+  { AV1E_SET_INTRA_DEFAULT_TX_ONLY, ctrl_set_intra_default_tx_only },
+  { AV1E_SET_QUANT_B_ADAPT, ctrl_set_quant_b_adapt },
+  { AV1E_SET_COEFF_COST_UPD_FREQ, ctrl_set_coeff_cost_upd_freq },
+  { AV1E_SET_MODE_COST_UPD_FREQ, ctrl_set_mode_cost_upd_freq },
+  { AV1E_SET_MV_COST_UPD_FREQ, ctrl_set_mv_cost_upd_freq },
+  { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
+  { AV1E_SET_DELTALF_MODE, ctrl_set_deltalf_mode },
+  { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
+  { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
+  { AV1E_SET_CDF_UPDATE_MODE, ctrl_set_cdf_update_mode },
+  { AV1E_SET_COLOR_PRIMARIES, ctrl_set_color_primaries },
+  { AV1E_SET_TRANSFER_CHARACTERISTICS, ctrl_set_transfer_characteristics },
+  { AV1E_SET_MATRIX_COEFFICIENTS, ctrl_set_matrix_coefficients },
+  { AV1E_SET_CHROMA_SAMPLE_POSITION, ctrl_set_chroma_sample_position },
+  { AV1E_SET_COLOR_RANGE, ctrl_set_color_range },
+  { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity },
+  { AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval },
+  { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval },
+  { AV1E_SET_GF_MIN_PYRAMID_HEIGHT, ctrl_set_gf_min_pyr_height },
+  { AV1E_SET_GF_MAX_PYRAMID_HEIGHT, ctrl_set_gf_max_pyr_height },
+  { AV1E_SET_RENDER_SIZE, ctrl_set_render_size },
+  { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
+  { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
+  { AV1E_SET_VMAF_MODEL_PATH, ctrl_set_vmaf_model_path },
+  { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector },
+  { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table },
+  { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level },
+  { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size },
+  { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
+  { AV1E_ENABLE_EXT_TILE_DEBUG, ctrl_enable_ext_tile_debug },
+  { AV1E_SET_TARGET_SEQ_LEVEL_IDX, ctrl_set_target_seq_level_idx },
+  { AV1E_SET_TIER_MASK, ctrl_set_tier_mask },
+  { AV1E_SET_MIN_CR, ctrl_set_min_cr },
+  { AV1E_SET_SVC_LAYER_ID, ctrl_set_layer_id },
+  { AV1E_SET_SVC_PARAMS, ctrl_set_svc_params },
+  { AV1E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config },
+  { AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, ctrl_enable_sb_multipass_unit_test },
+
+  // Getters
+  { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
+  { AOME_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 },
+  { AV1_GET_REFERENCE, ctrl_get_reference },
+  { AV1E_GET_ACTIVEMAP, ctrl_get_active_map },
+  { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
+  { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
+  { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x },
+  { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y },
+  { AV1E_GET_SEQ_LEVEL_IDX, ctrl_get_seq_level_idx },
+  { -1, NULL },
+};
+
+static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {
+  {
+      // NOLINT
+      AOM_USAGE_GOOD_QUALITY,  // g_usage - non-realtime usage
+      0,                       // g_threads
+      0,                       // g_profile
+
+      320,         // g_width
+      240,         // g_height
+      0,           // g_limit
+      0,           // g_forced_max_frame_width
+      0,           // g_forced_max_frame_height
+      AOM_BITS_8,  // g_bit_depth
+      8,           // g_input_bit_depth
+
+      { 1, 30 },  // g_timebase
+
+      0,  // g_error_resilient
+
+      AOM_RC_ONE_PASS,  // g_pass
+
+      19,  // g_lag_in_frames
+
+      0,                // rc_dropframe_thresh
+      RESIZE_NONE,      // rc_resize_mode
+      SCALE_NUMERATOR,  // rc_resize_denominator
+      SCALE_NUMERATOR,  // rc_resize_kf_denominator
+
+      SUPERRES_NONE,    // rc_superres_mode
+      SCALE_NUMERATOR,  // rc_superres_denominator
+      SCALE_NUMERATOR,  // rc_superres_kf_denominator
+      63,               // rc_superres_qthresh
+      32,               // rc_superres_kf_qthresh
+
+      AOM_VBR,      // rc_end_usage
+      { NULL, 0 },  // rc_twopass_stats_in
+      { NULL, 0 },  // rc_firstpass_mb_stats_in
+      256,          // rc_target_bandwidth
+      0,            // rc_min_quantizer
+      63,           // rc_max_quantizer
+      25,           // rc_undershoot_pct
+      25,           // rc_overshoot_pct
+
+      6000,  // rc_max_buffer_size
+      4000,  // rc_buffer_initial_size
+      5000,  // rc_buffer_optimal_size
+
+      50,    // rc_two_pass_vbrbias
+      0,     // rc_two_pass_vbrmin_section
+      2000,  // rc_two_pass_vbrmax_section
+
+      // keyframing settings (kf)
+      0,            // fwd_kf_enabled
+      AOM_KF_AUTO,  // g_kfmode
+      0,            // kf_min_dist
+      9999,         // kf_max_dist
+      0,            // sframe_dist
+      1,            // sframe_mode
+      0,            // large_scale_tile
+      0,            // monochrome
+      0,            // full_still_picture_hdr
+      0,            // save_as_annexb
+      0,            // tile_width_count
+      0,            // tile_height_count
+      { 0 },        // tile_widths
+      { 0 },        // tile_heights
+      0,            // use_fixed_qp_offsets
+      { -1, -1, -1, -1, -1 },  // fixed_qp_offsets
+      { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // cfg
+  },
+  {
+      // NOLINT
+      AOM_USAGE_REALTIME,  // g_usage - real-time usage
+      0,                   // g_threads
+      0,                   // g_profile
+
+      320,         // g_width
+      240,         // g_height
+      0,           // g_limit
+      0,           // g_forced_max_frame_width
+      0,           // g_forced_max_frame_height
+      AOM_BITS_8,  // g_bit_depth
+      8,           // g_input_bit_depth
+
+      { 1, 30 },  // g_timebase
+
+      0,  // g_error_resilient
+
+      AOM_RC_ONE_PASS,  // g_pass
+
+      1,  // g_lag_in_frames
+
+      0,                // rc_dropframe_thresh
+      RESIZE_NONE,      // rc_resize_mode
+      SCALE_NUMERATOR,  // rc_resize_denominator
+      SCALE_NUMERATOR,  // rc_resize_kf_denominator
+
+      0,                // rc_superres_mode
+      SCALE_NUMERATOR,  // rc_superres_denominator
+      SCALE_NUMERATOR,  // rc_superres_kf_denominator
+      63,               // rc_superres_qthresh
+      32,               // rc_superres_kf_qthresh
+
+      AOM_CBR,      // rc_end_usage
+      { NULL, 0 },  // rc_twopass_stats_in
+      { NULL, 0 },  // rc_firstpass_mb_stats_in
+      256,          // rc_target_bandwidth
+      0,            // rc_min_quantizer
+      63,           // rc_max_quantizer
+      25,           // rc_undershoot_pct
+      25,           // rc_overshoot_pct
+
+      6000,  // rc_max_buffer_size
+      4000,  // rc_buffer_initial_size
+      5000,  // rc_buffer_optimal_size
+
+      50,    // rc_two_pass_vbrbias
+      0,     // rc_two_pass_vbrmin_section
+      2000,  // rc_two_pass_vbrmax_section
+
+      // keyframing settings (kf)
+      0,            // fwd_kf_enabled
+      AOM_KF_AUTO,  // g_kfmode
+      0,            // kf_min_dist
+      9999,         // kf_max_dist
+      0,            // sframe_dist
+      1,            // sframe_mode
+      0,            // large_scale_tile
+      0,            // monochrome
+      0,            // full_still_picture_hdr
+      0,            // save_as_annexb
+      0,            // tile_width_count
+      0,            // tile_height_count
+      { 0 },        // tile_widths
+      { 0 },        // tile_heights
+      0,            // use_fixed_qp_offsets
+      { -1, -1, -1, -1, -1 },  // fixed_qp_offsets
+      { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // cfg
+  },
+};
+
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
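The two templates above correspond to AOM_USAGE_GOOD_QUALITY (VBR, 19-frame lag) and AOM_USAGE_REALTIME (CBR, 1-frame lag). Applications select one when requesting default configuration (editorial sketch, not part of the patch):

    aom_codec_enc_cfg_t cfg;
    /* Copies the second (realtime) template above into cfg. */
    aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, AOM_USAGE_REALTIME);
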
+CODEC_INTERFACE(aom_codec_av1_cx) = {
+  "AOMedia Project AV1 Encoder" VERSION_STRING,
+  AOM_CODEC_INTERNAL_ABI_VERSION,
+  AOM_CODEC_CAP_HIGHBITDEPTH | AOM_CODEC_CAP_ENCODER |
+      AOM_CODEC_CAP_PSNR,  // aom_codec_caps_t
+  encoder_init,            // aom_codec_init_fn_t
+  encoder_destroy,         // aom_codec_destroy_fn_t
+  encoder_ctrl_maps,       // aom_codec_ctrl_fn_map_t
+  {
+      // NOLINT
+      NULL,  // aom_codec_peek_si_fn_t
+      NULL,  // aom_codec_get_si_fn_t
+      NULL,  // aom_codec_decode_fn_t
+      NULL,  // aom_codec_get_frame_fn_t
+      NULL   // aom_codec_set_fb_fn_t
+  },
+  {
+      // NOLINT
+      2,                           // 2 cfg
+      encoder_usage_cfg,           // aom_codec_enc_cfg_t
+      encoder_encode,              // aom_codec_encode_fn_t
+      encoder_get_cxdata,          // aom_codec_get_cx_data_fn_t
+      encoder_set_config,          // aom_codec_enc_config_set_fn_t
+      encoder_get_global_headers,  // aom_codec_get_global_headers_fn_t
+      encoder_get_preview          // aom_codec_get_preview_frame_fn_t
+  }
+};
diff --git a/libs/libaom/src/av1/av1_dx_iface.c b/libs/libaom/src/av1/av1_dx_iface.c
new file mode 100644
index 000000000..d821a52f6
--- /dev/null
+++ b/libs/libaom/src/av1/av1_dx_iface.c
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom/internal/aom_image_internal.h"
+#include "aom/aomdx.h"
+#include "aom/aom_decoder.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/enums.h"
+#include "av1/common/obu_util.h"
+
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/obu.h"
+
+#include "av1/av1_iface_common.h"
+
+struct aom_codec_alg_priv {
+  aom_codec_priv_t base;
+  aom_codec_dec_cfg_t cfg;
+  aom_codec_stream_info_t si;
+  aom_image_t img;
+  int img_avail;
+  int flushed;
+  int invert_tile_order;
+  RefCntBuffer *last_show_frame;  // Last output frame buffer
+  int byte_alignment;
+  int skip_loop_filter;
+  int skip_film_grain;
+  int decode_tile_row;
+  int decode_tile_col;
+  unsigned int tile_mode;
+  unsigned int ext_tile_debug;
+  unsigned int row_mt;
+  EXTERNAL_REFERENCES ext_refs;
+  unsigned int is_annexb;
+  int operating_point;
+  int output_all_layers;
+
+  AVxWorker *frame_worker;
+
+  aom_image_t image_with_grain;
+  aom_codec_frame_buffer_t grain_image_frame_buffers[MAX_NUM_SPATIAL_LAYERS];
+  size_t num_grain_image_frame_buffers;
+  int need_resync;  // wait for key/intra-only frame
+  // BufferPool that holds all reference frames. Shared by all the
+  // FrameWorkers.
+  BufferPool *buffer_pool;
+
+  // External frame buffer info to save for AV1 common.
+  void *ext_priv;  // Private data associated with the external frame buffers.
+  aom_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+  aom_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+
+#if CONFIG_INSPECTION
+  aom_inspect_cb inspect_cb;
+  void *inspect_ctx;
+#endif
+};
+
+static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx) {
+  // This function only allocates space for the aom_codec_alg_priv_t
+  // structure. More memory may be required at the time the stream
+  // information becomes known.
+  if (!ctx->priv) {
+    aom_codec_alg_priv_t *const priv =
+        (aom_codec_alg_priv_t *)aom_calloc(1, sizeof(*priv));
+    if (priv == NULL) return AOM_CODEC_MEM_ERROR;
+
+    ctx->priv = (aom_codec_priv_t *)priv;
+    ctx->priv->init_flags = ctx->init_flags;
+    priv->flushed = 0;
+
+    // TODO(tdaede): this should not be exposed to the API
+    priv->cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+    if (ctx->config.dec) {
+      priv->cfg = *ctx->config.dec;
+      ctx->config.dec = &priv->cfg;
+    }
+    priv->num_grain_image_frame_buffers = 0;
+    // Turn row_mt on by default.
+    priv->row_mt = 1;
+
+    // Turn on normal tile coding mode by default.
+    // 0 is for normal tile coding mode, and 1 is for large scale tile coding
+    // mode (refer to the lightfield example).
+    priv->tile_mode = 0;
+    priv->decode_tile_row = -1;
+    priv->decode_tile_col = -1;
+  }
+
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
+  if (ctx->frame_worker != NULL) {
+    AVxWorker *const worker = ctx->frame_worker;
+    FrameWorkerData *const frame_worker_data =
+        (FrameWorkerData *)worker->data1;
+    aom_get_worker_interface()->end(worker);
+    aom_free(frame_worker_data->pbi->common.tpl_mvs);
+    frame_worker_data->pbi->common.tpl_mvs = NULL;
+    av1_remove_common(&frame_worker_data->pbi->common);
+    av1_free_restoration_buffers(&frame_worker_data->pbi->common);
+    av1_decoder_remove(frame_worker_data->pbi);
+    aom_free(frame_worker_data);
+#if CONFIG_MULTITHREAD
+    pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+  }
+
+  if (ctx->buffer_pool) {
+    for (size_t i = 0; i < ctx->num_grain_image_frame_buffers; i++) {
+      ctx->buffer_pool->release_fb_cb(ctx->buffer_pool->cb_priv,
+                                      &ctx->grain_image_frame_buffers[i]);
+    }
+    av1_free_ref_frame_buffers(ctx->buffer_pool);
+    av1_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
+  }
+
+  aom_free(ctx->frame_worker);
+  aom_free(ctx->buffer_pool);
+  aom_img_free(&ctx->img);
+  aom_free(ctx);
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_timing_info(struct aom_read_bit_buffer *rb) {
+  const uint32_t num_units_in_display_tick =
+      aom_rb_read_unsigned_literal(rb, 32);
+  const uint32_t time_scale = aom_rb_read_unsigned_literal(rb, 32);
+  if (num_units_in_display_tick == 0 || time_scale == 0)
+    return AOM_CODEC_UNSUP_BITSTREAM;
+  const uint8_t equal_picture_interval = aom_rb_read_bit(rb);
+  if (equal_picture_interval) {
+    const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb);
+    if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
+      // num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1.
+      return AOM_CODEC_UNSUP_BITSTREAM;
+    }
+  }
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_decoder_model_info(
+    struct aom_read_bit_buffer *rb, int *buffer_delay_length_minus_1) {
+  *buffer_delay_length_minus_1 = aom_rb_read_literal(rb, 5);
+  const uint32_t num_units_in_decoding_tick =
+      aom_rb_read_unsigned_literal(rb, 32);
+  const uint8_t buffer_removal_time_length_minus_1 =
+      aom_rb_read_literal(rb, 5);
+  const uint8_t frame_presentation_time_length_minus_1 =
+      aom_rb_read_literal(rb, 5);
+  (void)num_units_in_decoding_tick;
+  (void)buffer_removal_time_length_minus_1;
+  (void)frame_presentation_time_length_minus_1;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_op_parameters_info(
+    struct aom_read_bit_buffer *rb, int buffer_delay_length_minus_1) {
+  const int n = buffer_delay_length_minus_1 + 1;
+  const uint32_t decoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n);
+  const uint32_t encoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n);
+  const uint8_t low_delay_mode_flag = aom_rb_read_bit(rb);
+  (void)decoder_buffer_delay;
+  (void)encoder_buffer_delay;
+  (void)low_delay_mode_flag;
+  return AOM_CODEC_OK;
+}
+
+// Parses the operating points (including operating_point_idc, seq_level_idx,
+// and seq_tier) and then sets si->number_spatial_layers and
+// si->number_temporal_layers based on operating_point_idc[0].
+static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb,
+                                              int is_reduced_header,
+                                              aom_codec_stream_info_t *si) {
+  int operating_point_idc0 = 0;
+  if (is_reduced_header) {
+    aom_rb_read_literal(rb, LEVEL_BITS);  // level
+  } else {
+    uint8_t decoder_model_info_present_flag = 0;
+    int buffer_delay_length_minus_1 = 0;
+    aom_codec_err_t status;
+    const uint8_t timing_info_present_flag = aom_rb_read_bit(rb);
+    if (timing_info_present_flag) {
+      if ((status = parse_timing_info(rb)) != AOM_CODEC_OK) return status;
+      decoder_model_info_present_flag = aom_rb_read_bit(rb);
+      if (decoder_model_info_present_flag) {
+        if ((status = parse_decoder_model_info(
+                 rb, &buffer_delay_length_minus_1)) != AOM_CODEC_OK)
+          return status;
+      }
+    }
+    const uint8_t initial_display_delay_present_flag = aom_rb_read_bit(rb);
+    const uint8_t operating_points_cnt_minus_1 =
+        aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
+    for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) {
+      int operating_point_idc;
+      operating_point_idc = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
+      if (i == 0) operating_point_idc0 = operating_point_idc;
+      int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);  // level
+      if (seq_level_idx > 7) aom_rb_read_bit(rb);               // tier
+      if (decoder_model_info_present_flag) {
+        const uint8_t decoder_model_present_for_this_op = aom_rb_read_bit(rb);
+        if (decoder_model_present_for_this_op) {
+          if ((status = parse_op_parameters_info(
+                   rb, buffer_delay_length_minus_1)) != AOM_CODEC_OK)
+            return status;
+        }
+      }
+      if (initial_display_delay_present_flag) {
+        const uint8_t initial_display_delay_present_for_this_op =
+            aom_rb_read_bit(rb);
+        if (initial_display_delay_present_for_this_op)
+          aom_rb_read_literal(rb, 4);  // initial_display_delay_minus_1
+      }
+    }
+  }
+
+  if (aom_get_num_layers_from_operating_point_idc(
+          operating_point_idc0, &si->number_spatial_layers,
+          &si->number_temporal_layers) != AOM_CODEC_OK) {
+    return AOM_CODEC_ERROR;
+  }
+
+  return AOM_CODEC_OK;
+}
+static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
+                                                size_t data_sz,
+                                                aom_codec_stream_info_t *si,
+                                                int *is_intra_only) {
+  int intra_only_flag = 0;
+  int got_sequence_header = 0;
+  int found_keyframe = 0;
+
+  if (data + data_sz <= data || data_sz < 1) return AOM_CODEC_INVALID_PARAM;
+
+  si->w = 0;
+  si->h = 0;
+  si->is_kf = 0;  // is_kf indicates whether the current packet contains a RAP
+
+  ObuHeader obu_header;
+  memset(&obu_header, 0, sizeof(obu_header));
+  size_t payload_size = 0;
+  size_t bytes_read = 0;
+  uint8_t reduced_still_picture_hdr = 0;
+  aom_codec_err_t status = aom_read_obu_header_and_size(
+      data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+  if (status != AOM_CODEC_OK) return status;
+
+  // If the first OBU is a temporal delimiter, skip over it and look at the
+  // next OBU in the bitstream
+  if (obu_header.type == OBU_TEMPORAL_DELIMITER) {
+    // Skip any associated payload (there shouldn't be one, but just in case)
+    if (data_sz < bytes_read + payload_size) return AOM_CODEC_CORRUPT_FRAME;
+    data += bytes_read + payload_size;
+    data_sz -= bytes_read + payload_size;
+
+    status = aom_read_obu_header_and_size(
+        data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+    if (status != AOM_CODEC_OK) return status;
+  }
+  while (1) {
+    data += bytes_read;
+    data_sz -= bytes_read;
+    if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME;
+    // Check that the selected OBU is a sequence header
+    if (obu_header.type == OBU_SEQUENCE_HEADER) {
+      // Sanity check on sequence header size
+      if (data_sz < 2) return AOM_CODEC_CORRUPT_FRAME;
+      // Read a few values from the sequence header payload
+      struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+
+      av1_read_profile(&rb);  // profile
+      const uint8_t still_picture = aom_rb_read_bit(&rb);
+      reduced_still_picture_hdr = aom_rb_read_bit(&rb);
+
+      if (!still_picture && reduced_still_picture_hdr) {
+        return AOM_CODEC_UNSUP_BITSTREAM;
+      }
+
+      if (parse_operating_points(&rb, reduced_still_picture_hdr, si) !=
+          AOM_CODEC_OK) {
+        return AOM_CODEC_ERROR;
+      }
+
+      int num_bits_width = aom_rb_read_literal(&rb, 4) + 1;
+      int num_bits_height = aom_rb_read_literal(&rb, 4) + 1;
+      int max_frame_width = aom_rb_read_literal(&rb, num_bits_width) + 1;
+      int max_frame_height = aom_rb_read_literal(&rb, num_bits_height) + 1;
+      si->w = max_frame_width;
+      si->h = max_frame_height;
+      got_sequence_header = 1;
+    } else if (obu_header.type == OBU_FRAME_HEADER ||
+               obu_header.type == OBU_FRAME) {
+      if (got_sequence_header && reduced_still_picture_hdr) {
+        found_keyframe = 1;
+        break;
+      } else {
+        // make sure we have enough bits to get the frame type out
+        if (data_sz < 1) return AOM_CODEC_CORRUPT_FRAME;
+        struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL,
+                                          NULL };
+        const int show_existing_frame = aom_rb_read_bit(&rb);
+        if (!show_existing_frame) {
+          const FRAME_TYPE frame_type =
+              (FRAME_TYPE)aom_rb_read_literal(&rb, 2);
+          if (frame_type == KEY_FRAME) {
+            found_keyframe = 1;
+            break;  // Stop here as no further OBUs will change the outcome.
+          } else if (frame_type == INTRA_ONLY_FRAME) {
+            intra_only_flag = 1;
+          }
+        }
+      }
+    }
+    // skip past any unread OBU header data
+    data += payload_size;
+    data_sz -= payload_size;
+    if (data_sz == 0) break;  // exit if we're out of OBUs
+    status = aom_read_obu_header_and_size(
+        data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+    if (status != AOM_CODEC_OK) return status;
+  }
+  if (got_sequence_header && found_keyframe) si->is_kf = 1;
+  if (is_intra_only != NULL) *is_intra_only = intra_only_flag;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t decoder_peek_si(const uint8_t *data, size_t data_sz,
+                                       aom_codec_stream_info_t *si) {
+  return decoder_peek_si_internal(data, data_sz, si, NULL);
+}
+
+static aom_codec_err_t decoder_get_si(aom_codec_alg_priv_t *ctx,
+                                      aom_codec_stream_info_t *si) {
+  memcpy(si, &ctx->si, sizeof(*si));
+
+  return AOM_CODEC_OK;
+}
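decoder_peek_si is exposed to applications through aom_codec_peek_stream_info(), which lets a caller learn the maximum frame size and random-access status of a buffer before initializing a decoder (editorial sketch, not part of the patch; `data`/`data_sz` name a compressed buffer the caller already holds):

    aom_codec_stream_info_t si;
    memset(&si, 0, sizeof(si));
    if (aom_codec_peek_stream_info(aom_codec_av1_dx(), data, data_sz, &si) ==
        AOM_CODEC_OK) {
      /* si.w / si.h hold max_frame_width/height from the sequence header;
       * si.is_kf signals that the buffer starts at a random access point. */
    }
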
+ frame_worker_data->pbi->need_resync = 1; + } + return !result; +} + +static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + ctx->last_show_frame = NULL; + ctx->need_resync = 1; + ctx->flushed = 0; + + ctx->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool)); + if (ctx->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR; + +#if CONFIG_MULTITHREAD + if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) { + set_error_detail(ctx, "Failed to allocate buffer pool mutex"); + return AOM_CODEC_MEM_ERROR; + } +#endif + + ctx->frame_worker = (AVxWorker *)aom_malloc(sizeof(*ctx->frame_worker)); + if (ctx->frame_worker == NULL) { + set_error_detail(ctx, "Failed to allocate frame_worker"); + return AOM_CODEC_MEM_ERROR; + } + + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *frame_worker_data = NULL; + winterface->init(worker); + worker->thread_name = "aom frameworker"; + worker->data1 = aom_memalign(32, sizeof(FrameWorkerData)); + if (worker->data1 == NULL) { + set_error_detail(ctx, "Failed to allocate frame_worker_data"); + return AOM_CODEC_MEM_ERROR; + } + frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi = av1_decoder_create(ctx->buffer_pool); + if (frame_worker_data->pbi == NULL) { + set_error_detail(ctx, "Failed to allocate frame_worker_data"); + return AOM_CODEC_MEM_ERROR; + } + frame_worker_data->frame_context_ready = 0; + frame_worker_data->received_frame = 0; + frame_worker_data->pbi->allow_lowbitdepth = ctx->cfg.allow_lowbitdepth; + + // If decoding in serial mode, FrameWorker thread could create tile worker + // thread or loopfilter thread. + frame_worker_data->pbi->max_threads = ctx->cfg.threads; + frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order; + frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode; + frame_worker_data->pbi->is_annexb = ctx->is_annexb; + frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row; + frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col; + frame_worker_data->pbi->operating_point = ctx->operating_point; + frame_worker_data->pbi->output_all_layers = ctx->output_all_layers; + frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; + frame_worker_data->pbi->row_mt = ctx->row_mt; + + worker->hook = frame_worker_hook; + + init_buffer_callbacks(ctx); + + return AOM_CODEC_OK; +} + +static INLINE void check_resync(aom_codec_alg_priv_t *const ctx, + const AV1Decoder *const pbi) { + // Clear resync flag if worker got a key frame or intra only frame. + if (ctx->need_resync == 1 && pbi->need_resync == 0 && + frame_is_intra_only(&pbi->common)) + ctx->need_resync = 0; +} + +static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx, + const uint8_t **data, size_t data_sz, + void *user_priv) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + // Determine the stream parameters. Note that we rely on peek_si to + // validate that we have a buffer that does not wrap around the top + // of the heap. 
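+  // ctx->si is only filled in once. Until a keyframe or intra-only frame
+  // has been seen there is nothing the decoder could reconstruct, so the
+  // buffer is rejected unless peeking finds one of the two.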
+ if (!ctx->si.h) { + int is_intra_only = 0; + ctx->si.is_annexb = ctx->is_annexb; + const aom_codec_err_t res = + decoder_peek_si_internal(*data, data_sz, &ctx->si, &is_intra_only); + if (res != AOM_CODEC_OK) return res; + + if (!ctx->si.is_kf && !is_intra_only) return AOM_CODEC_ERROR; + } + + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->data = *data; + frame_worker_data->data_size = data_sz; + frame_worker_data->user_priv = user_priv; + frame_worker_data->received_frame = 1; + + frame_worker_data->pbi->common.tiles.large_scale = ctx->tile_mode; + frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row; + frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col; + frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug; + frame_worker_data->pbi->row_mt = ctx->row_mt; + frame_worker_data->pbi->ext_refs = ctx->ext_refs; + + frame_worker_data->pbi->is_annexb = ctx->is_annexb; + + worker->had_error = 0; + winterface->execute(worker); + + // Update data pointer after decode. + *data = frame_worker_data->data_end; + + if (worker->had_error) + return update_error_state(ctx, &frame_worker_data->pbi->common.error); + + check_resync(ctx, frame_worker_data->pbi); + + return AOM_CODEC_OK; +} + +#if CONFIG_INSPECTION +// This function enables the inspector to inspect non visible frames. +static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx, + const uint8_t *data, size_t data_sz, + void *user_priv) { + aom_codec_err_t res = AOM_CODEC_OK; + + const uint8_t *const data_end = data + data_sz; + Av1DecodeReturn *data2 = (Av1DecodeReturn *)user_priv; + + if (ctx->frame_worker == NULL) { + res = init_decoder(ctx); + if (res != AOM_CODEC_OK) return res; + } + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)ctx->frame_worker->data1; + AV1Decoder *const pbi = frame_worker_data->pbi; + AV1_COMMON *const cm = &pbi->common; + frame_worker_data->pbi->inspect_cb = ctx->inspect_cb; + frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx; + res = av1_receive_compressed_data(frame_worker_data->pbi, data_sz, &data); + check_resync(ctx, frame_worker_data->pbi); + + if (ctx->frame_worker->had_error) + return update_error_state(ctx, &frame_worker_data->pbi->common.error); + + // Allow extra zero bytes after the frame end + while (data < data_end) { + const uint8_t marker = data[0]; + if (marker) break; + ++data; + } + + data2->idx = -1; + for (int i = 0; i < REF_FRAMES; ++i) + if (cm->ref_frame_map[i] == cm->cur_frame) data2->idx = i; + data2->buf = data; + data2->show_existing = cm->show_existing_frame; + return res; +} +#endif + +static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx, + const uint8_t *data, size_t data_sz, + void *user_priv) { + aom_codec_err_t res = AOM_CODEC_OK; + +#if CONFIG_INSPECTION + if (user_priv != 0) { + return decoder_inspect(ctx, data, data_sz, user_priv); + } +#endif + // Release any pending output frames from the previous decoder_decode call. + // We need to do this even if the decoder is being flushed or the input + // arguments are invalid. 
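+  // Frames handed out by the previous call are ref-counted in the buffer
+  // pool; dropping those references (plus any film-grain copies) here keeps
+  // the pool from leaking when the caller never collected them with
+  // decoder_get_frame().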
+ if (ctx->frame_worker) { + BufferPool *const pool = ctx->buffer_pool; + lock_buffer_pool(pool); + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + struct AV1Decoder *pbi = frame_worker_data->pbi; + for (size_t j = 0; j < pbi->num_output_frames; j++) { + decrease_ref_count(pbi->output_frames[j], pool); + } + pbi->num_output_frames = 0; + unlock_buffer_pool(pool); + for (size_t j = 0; j < ctx->num_grain_image_frame_buffers; j++) { + pool->release_fb_cb(pool->cb_priv, &ctx->grain_image_frame_buffers[j]); + ctx->grain_image_frame_buffers[j].data = NULL; + ctx->grain_image_frame_buffers[j].size = 0; + ctx->grain_image_frame_buffers[j].priv = NULL; + } + ctx->num_grain_image_frame_buffers = 0; + } + + /* Sanity checks */ + /* NULL data ptr allowed if data_sz is 0 too */ + if (data == NULL && data_sz == 0) { + ctx->flushed = 1; + return AOM_CODEC_OK; + } + if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM; + + // Reset flushed when receiving a valid frame. + ctx->flushed = 0; + + // Initialize the decoder worker on the first frame. + if (ctx->frame_worker == NULL) { + res = init_decoder(ctx); + if (res != AOM_CODEC_OK) return res; + } + + const uint8_t *data_start = data; + const uint8_t *data_end = data + data_sz; + + if (ctx->is_annexb) { + // read the size of this temporal unit + size_t length_of_size; + uint64_t temporal_unit_size; + if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size, + &length_of_size) != 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + data_start += length_of_size; + if (temporal_unit_size > (size_t)(data_end - data_start)) + return AOM_CODEC_CORRUPT_FRAME; + data_end = data_start + temporal_unit_size; + } + + // Decode in serial mode. + while (data_start < data_end) { + uint64_t frame_size; + if (ctx->is_annexb) { + // read the size of this frame unit + size_t length_of_size; + if (aom_uleb_decode(data_start, (size_t)(data_end - data_start), + &frame_size, &length_of_size) != 0) { + return AOM_CODEC_CORRUPT_FRAME; + } + data_start += length_of_size; + if (frame_size > (size_t)(data_end - data_start)) + return AOM_CODEC_CORRUPT_FRAME; + } else { + frame_size = (uint64_t)(data_end - data_start); + } + + res = decode_one(ctx, &data_start, (size_t)frame_size, user_priv); + if (res != AOM_CODEC_OK) return res; + + // Allow extra zero bytes after the frame end + while (data_start < data_end) { + const uint8_t marker = data_start[0]; + if (marker) break; + ++data_start; + } + } + + return res; +} + +typedef struct { + BufferPool *pool; + aom_codec_frame_buffer_t *fb; +} AllocCbParam; + +static void *AllocWithGetFrameBufferCb(void *priv, size_t size) { + AllocCbParam *param = (AllocCbParam *)priv; + if (param->pool->get_fb_cb(param->pool->cb_priv, size, param->fb) < 0) + return NULL; + if (param->fb->data == NULL || param->fb->size < size) return NULL; + return param->fb->data; +} + +// If grain_params->apply_grain is false, returns img. Otherwise, adds film +// grain to img, saves the result in grain_img, and returns grain_img. 
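+// The grain image is allocated through the pool's frame-buffer callback and
+// recorded in ctx->grain_image_frame_buffers so the next decoder_decode call
+// can release it. A sketch of the only intended call site (see
+// decoder_get_frame below):
+//   aom_image_t *res =
+//       add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params);
+//   if (!res) { /* allocation or grain synthesis failed */ }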
+static aom_image_t *add_grain_if_needed(aom_codec_alg_priv_t *ctx,
+                                        aom_image_t *img,
+                                        aom_image_t *grain_img,
+                                        aom_film_grain_t *grain_params) {
+  if (!grain_params->apply_grain) return img;
+
+  const int w_even = ALIGN_POWER_OF_TWO(img->d_w, 1);
+  const int h_even = ALIGN_POWER_OF_TWO(img->d_h, 1);
+
+  BufferPool *const pool = ctx->buffer_pool;
+  aom_codec_frame_buffer_t *fb =
+      &ctx->grain_image_frame_buffers[ctx->num_grain_image_frame_buffers];
+  AllocCbParam param;
+  param.pool = pool;
+  param.fb = fb;
+  if (!aom_img_alloc_with_cb(grain_img, img->fmt, w_even, h_even, 16,
+                             AllocWithGetFrameBufferCb, &param)) {
+    return NULL;
+  }
+
+  grain_img->user_priv = img->user_priv;
+  grain_img->fb_priv = fb->priv;
+  if (av1_add_film_grain(grain_params, img, grain_img)) {
+    pool->release_fb_cb(pool->cb_priv, fb);
+    return NULL;
+  }
+
+  ctx->num_grain_image_frame_buffers++;
+  return grain_img;
+}
+
+// Copies and clears the metadata from AV1Decoder.
+static void move_decoder_metadata_to_img(AV1Decoder *pbi, aom_image_t *img) {
+  if (pbi->metadata && img) {
+    assert(!img->metadata);
+    img->metadata = pbi->metadata;
+    pbi->metadata = NULL;
+  }
+}
+
+static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
+                                      aom_codec_iter_t *iter) {
+  aom_image_t *img = NULL;
+
+  if (!iter) {
+    return NULL;
+  }
+
+  // To avoid having to allocate any extra storage, treat 'iter' as
+  // simply a pointer to an integer index
+  uintptr_t *index = (uintptr_t *)iter;
+
+  if (ctx->frame_worker != NULL) {
+    const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+    AVxWorker *const worker = ctx->frame_worker;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    AV1Decoder *const pbi = frame_worker_data->pbi;
+    AV1_COMMON *const cm = &pbi->common;
+    CommonTileParams *const tiles = &cm->tiles;
+    // Wait for the frame from worker thread.
+    if (winterface->sync(worker)) {
+      // Check if worker has received any frames.
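+      // (received_frame was set by decode_one before the worker ran;
+      // clearing it here acknowledges that the output has been picked up.)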
+      if (frame_worker_data->received_frame == 1) {
+        frame_worker_data->received_frame = 0;
+        check_resync(ctx, frame_worker_data->pbi);
+      }
+      YV12_BUFFER_CONFIG *sd;
+      aom_film_grain_t *grain_params;
+      if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd,
+                            &grain_params) == 0) {
+        RefCntBuffer *const output_frame_buf = pbi->output_frames[*index];
+        ctx->last_show_frame = output_frame_buf;
+        if (ctx->need_resync) return NULL;
+        aom_img_remove_metadata(&ctx->img);
+        yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
+        move_decoder_metadata_to_img(pbi, &ctx->img);
+
+        if (!pbi->ext_tile_debug && tiles->large_scale) {
+          *index += 1;  // Advance the iterator to point to the next image
+          aom_img_remove_metadata(&ctx->img);
+          yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL);
+          move_decoder_metadata_to_img(pbi, &ctx->img);
+          img = &ctx->img;
+          return img;
+        }
+
+        const int num_planes = av1_num_planes(cm);
+        if (pbi->ext_tile_debug && tiles->single_tile_decoding &&
+            pbi->dec_tile_row >= 0) {
+          int tile_width, tile_height;
+          av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+          const int tile_row = AOMMIN(pbi->dec_tile_row, tiles->rows - 1);
+          const int mi_row = tile_row * tile_height;
+          const int ssy = ctx->img.y_chroma_shift;
+          int plane;
+          ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
+          if (num_planes > 1) {
+            for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+              ctx->img.planes[plane] +=
+                  mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
+            }
+          }
+          ctx->img.d_h =
+              AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE;
+        }
+
+        if (pbi->ext_tile_debug && tiles->single_tile_decoding &&
+            pbi->dec_tile_col >= 0) {
+          int tile_width, tile_height;
+          av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+          const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1);
+          const int mi_col = tile_col * tile_width;
+          const int ssx = ctx->img.x_chroma_shift;
+          const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
+          int plane;
+          ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd);
+          if (num_planes > 1) {
+            for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+              ctx->img.planes[plane] +=
+                  mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
+            }
+          }
+          ctx->img.d_w =
+              AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE;
+        }
+
+        ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv;
+        img = &ctx->img;
+        img->temporal_id = cm->temporal_layer_id;
+        img->spatial_id = cm->spatial_layer_id;
+        if (pbi->skip_film_grain) grain_params->apply_grain = 0;
+        aom_image_t *res =
+            add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params);
+        if (!res) {
+          aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME,
+                             "Grain synthesis failed\n");
+        }
+        *index += 1;  // Advance the iterator to point to the next image
+        return res;
+      }
+    } else {
+      // Decoding failed. Release the worker thread.
+      frame_worker_data->received_frame = 0;
+      ctx->need_resync = 1;
+      if (ctx->flushed != 1) return NULL;
+    }
+  }
+  return NULL;
+}
+
+static aom_codec_err_t decoder_set_fb_fn(
+    aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  if (cb_get == NULL || cb_release == NULL) {
+    return AOM_CODEC_INVALID_PARAM;
+  } else if (ctx->frame_worker == NULL) {
+    // If the decoder has already been initialized, do not accept changes to
+    // the frame buffer functions.
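+    // (The worker has not been created yet at this point, so installing
+    // the external callbacks is safe.)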
+ ctx->get_ext_fb_cb = cb_get; + ctx->release_ext_fb_cb = cb_release; + ctx->ext_priv = cb_priv; + return AOM_CODEC_OK; + } + + return AOM_CODEC_ERROR; +} + +static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + av1_ref_frame_t *const data = va_arg(args, av1_ref_frame_t *); + + if (data) { + av1_ref_frame_t *const frame = data; + YV12_BUFFER_CONFIG sd; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + image2yuvconfig(&frame->img, &sd); + return av1_set_reference_dec(&frame_worker_data->pbi->common, frame->idx, + frame->use_external_ref, &sd); + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + const av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *); + if (frame) { + YV12_BUFFER_CONFIG sd; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + image2yuvconfig(&frame->img, &sd); + return av1_copy_reference_dec(frame_worker_data->pbi, frame->idx, &sd); + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx, + va_list args) { + av1_ref_frame_t *data = va_arg(args, av1_ref_frame_t *); + if (data) { + YV12_BUFFER_CONFIG *fb; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx); + if (fb == NULL) return AOM_CODEC_ERROR; + yuvconfig2image(&data->img, fb, NULL); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_image_t *new_img = va_arg(args, aom_image_t *); + if (new_img) { + YV12_BUFFER_CONFIG new_frame; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + + if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) { + yuvconfig2image(new_img, &new_frame, NULL); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_image_t *img = va_arg(args, aom_image_t *); + if (img) { + YV12_BUFFER_CONFIG new_frame; + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + + if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) { + YV12_BUFFER_CONFIG sd; + image2yuvconfig(img, &sd); + return av1_copy_new_frame_dec(&frame_worker_data->pbi->common, &new_frame, + &sd); + } else { + return AOM_CODEC_ERROR; + } + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const update_info = va_arg(args, int *); + + if (update_info) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + *update_info = + frame_worker_data->pbi->common.current_frame.refresh_frame_flags; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_last_quantizer(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const 
arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + *arg = ((FrameWorkerData *)ctx->frame_worker->data1) + ->pbi->common.quant_params.base_qindex; + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx, + va_list args) { + int *corrupted = va_arg(args, int *); + + if (corrupted) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + AV1Decoder *const pbi = frame_worker_data->pbi; + if (pbi->seen_frame_header && pbi->num_output_frames == 0) + return AOM_CODEC_ERROR; + if (ctx->last_show_frame != NULL) + *corrupted = ctx->last_show_frame->buf.corrupted; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_frame_size(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const frame_size = va_arg(args, int *); + + if (frame_size) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + frame_size[0] = cm->width; + frame_size[1] = cm->height; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_frame_header_info(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_tile_data *const frame_header_info = va_arg(args, aom_tile_data *); + + if (frame_header_info) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + frame_header_info->coded_tile_data_size = pbi->obu_size_hdr.size; + frame_header_info->coded_tile_data = pbi->obu_size_hdr.data; + frame_header_info->extra_size = pbi->frame_header_size; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_tile_data(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_tile_data *const tile_data = va_arg(args, aom_tile_data *); + + if (tile_data) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1Decoder *pbi = frame_worker_data->pbi; + tile_data->coded_tile_data_size = + pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size; + tile_data->coded_tile_data = + pbi->tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_set_ext_ref_ptr(aom_codec_alg_priv_t *ctx, + va_list args) { + av1_ext_ref_frame_t *const data = va_arg(args, av1_ext_ref_frame_t *); + + if (data) { + av1_ext_ref_frame_t *const ext_frames = data; + ctx->ext_refs.num = ext_frames->num; + for (int i = 0; i < ctx->ext_refs.num; i++) { + image2yuvconfig(ext_frames->img++, &ctx->ext_refs.refs[i]); + } + return AOM_CODEC_OK; + } else { + return AOM_CODEC_INVALID_PARAM; + } +} + +static aom_codec_err_t ctrl_get_render_size(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const render_size = va_arg(args, int *); + + if (render_size) { + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const 
cm = &frame_worker_data->pbi->common; + render_size[0] = cm->render_width; + render_size[1] = cm->render_height; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const bit_depth = va_arg(args, unsigned int *); + AVxWorker *const worker = ctx->frame_worker; + + if (bit_depth) { + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + *bit_depth = cm->seq_params.bit_depth; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y, + int use_highbitdepth) { + aom_img_fmt_t fmt = 0; + + if (subsampling_x == 0 && subsampling_y == 0) + fmt = AOM_IMG_FMT_I444; + else if (subsampling_x == 1 && subsampling_y == 0) + fmt = AOM_IMG_FMT_I422; + else if (subsampling_x == 1 && subsampling_y == 1) + fmt = AOM_IMG_FMT_I420; + + if (use_highbitdepth) fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + return fmt; +} + +static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx, + va_list args) { + aom_img_fmt_t *const img_fmt = va_arg(args, aom_img_fmt_t *); + AVxWorker *const worker = ctx->frame_worker; + + if (img_fmt) { + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + + *img_fmt = get_img_format(cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, + cm->seq_params.use_highbitdepth); + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const tile_size = va_arg(args, unsigned int *); + AVxWorker *const worker = ctx->frame_worker; + + if (tile_size) { + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + const AV1_COMMON *const cm = &frame_worker_data->pbi->common; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + *tile_size = ((tile_width * MI_SIZE) << 16) + tile_height * MI_SIZE; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_get_tile_count(aom_codec_alg_priv_t *ctx, + va_list args) { + unsigned int *const tile_count = va_arg(args, unsigned int *); + + if (tile_count) { + AVxWorker *const worker = ctx->frame_worker; + if (worker) { + FrameWorkerData *const frame_worker_data = + (FrameWorkerData *)worker->data1; + *tile_count = frame_worker_data->pbi->tile_count_minus_1 + 1; + return AOM_CODEC_OK; + } else { + return AOM_CODEC_ERROR; + } + } + return AOM_CODEC_INVALID_PARAM; +} + +static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->invert_tile_order = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_byte_alignment(aom_codec_alg_priv_t *ctx, + va_list args) { + const int legacy_byte_alignment = 0; + const int min_byte_alignment = 32; + const int max_byte_alignment = 1024; + const int byte_alignment = va_arg(args, int); + + if (byte_alignment != legacy_byte_alignment && + (byte_alignment < min_byte_alignment || + byte_alignment > max_byte_alignment || + (byte_alignment & (byte_alignment - 1)) != 
0)) + return AOM_CODEC_INVALID_PARAM; + + ctx->byte_alignment = byte_alignment; + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->common.features.byte_alignment = byte_alignment; + } + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_skip_loop_filter(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->skip_loop_filter = va_arg(args, int); + + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->skip_loop_filter = ctx->skip_loop_filter; + } + + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_skip_film_grain(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->skip_film_grain = va_arg(args, int); + + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + frame_worker_data->pbi->skip_film_grain = ctx->skip_film_grain; + } + + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx, + va_list args) { +#if !CONFIG_ACCOUNTING + (void)ctx; + (void)args; + return AOM_CODEC_INCAPABLE; +#else + if (ctx->frame_worker) { + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + AV1Decoder *pbi = frame_worker_data->pbi; + Accounting **acct = va_arg(args, Accounting **); + *acct = &pbi->accounting; + return AOM_CODEC_OK; + } + return AOM_CODEC_ERROR; +#endif +} +static aom_codec_err_t ctrl_set_decode_tile_row(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->decode_tile_row = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_decode_tile_col(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->decode_tile_col = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_tile_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->tile_mode = va_arg(args, unsigned int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_is_annexb(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->is_annexb = va_arg(args, unsigned int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_operating_point(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->operating_point = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_output_all_layers(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->output_all_layers = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_inspection_callback(aom_codec_alg_priv_t *ctx, + va_list args) { +#if !CONFIG_INSPECTION + (void)ctx; + (void)args; + return AOM_CODEC_INCAPABLE; +#else + aom_inspect_init *init = va_arg(args, aom_inspect_init *); + ctx->inspect_cb = init->inspect_cb; + ctx->inspect_ctx = init->inspect_ctx; + return AOM_CODEC_OK; +#endif +} + +static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->ext_tile_debug = va_arg(args, int); + return AOM_CODEC_OK; +} + +static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx, + va_list args) { + ctx->row_mt = va_arg(args, unsigned int); + return AOM_CODEC_OK; +} + +static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { + { AV1_COPY_REFERENCE, ctrl_copy_reference }, + + // Setters + { AV1_SET_REFERENCE, ctrl_set_reference }, + { AV1_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order }, + { 
AV1_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, + { AV1_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, + { AV1_SET_DECODE_TILE_ROW, ctrl_set_decode_tile_row }, + { AV1_SET_DECODE_TILE_COL, ctrl_set_decode_tile_col }, + { AV1_SET_TILE_MODE, ctrl_set_tile_mode }, + { AV1D_SET_IS_ANNEXB, ctrl_set_is_annexb }, + { AV1D_SET_OPERATING_POINT, ctrl_set_operating_point }, + { AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers }, + { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback }, + { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug }, + { AV1D_SET_ROW_MT, ctrl_set_row_mt }, + { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr }, + { AV1D_SET_SKIP_FILM_GRAIN, ctrl_set_skip_film_grain }, + + // Getters + { AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted }, + { AOMD_GET_LAST_QUANTIZER, ctrl_get_last_quantizer }, + { AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates }, + { AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth }, + { AV1D_GET_IMG_FORMAT, ctrl_get_img_format }, + { AV1D_GET_TILE_SIZE, ctrl_get_tile_size }, + { AV1D_GET_TILE_COUNT, ctrl_get_tile_count }, + { AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size }, + { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size }, + { AV1_GET_ACCOUNTING, ctrl_get_accounting }, + { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image }, + { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image }, + { AV1_GET_REFERENCE, ctrl_get_reference }, + { AV1D_GET_FRAME_HEADER_INFO, ctrl_get_frame_header_info }, + { AV1D_GET_TILE_DATA, ctrl_get_tile_data }, + + { -1, NULL }, +}; + +#ifndef VERSION_STRING +#define VERSION_STRING +#endif +CODEC_INTERFACE(aom_codec_av1_dx) = { + "AOMedia Project AV1 Decoder" VERSION_STRING, + AOM_CODEC_INTERNAL_ABI_VERSION, + AOM_CODEC_CAP_DECODER | + AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // aom_codec_caps_t + decoder_init, // aom_codec_init_fn_t + decoder_destroy, // aom_codec_destroy_fn_t + decoder_ctrl_maps, // aom_codec_ctrl_fn_map_t + { + // NOLINT + decoder_peek_si, // aom_codec_peek_si_fn_t + decoder_get_si, // aom_codec_get_si_fn_t + decoder_decode, // aom_codec_decode_fn_t + decoder_get_frame, // aom_codec_get_frame_fn_t + decoder_set_fb_fn, // aom_codec_set_fb_fn_t + }, + { + // NOLINT + 0, + NULL, // aom_codec_enc_cfg_t + NULL, // aom_codec_encode_fn_t + NULL, // aom_codec_get_cx_data_fn_t + NULL, // aom_codec_enc_config_set_fn_t + NULL, // aom_codec_get_global_headers_fn_t + NULL // aom_codec_get_preview_frame_fn_t + } +}; diff --git a/libs/libaom/src/av1/av1_iface_common.h b/libs/libaom/src/av1/av1_iface_common.h new file mode 100644 index 000000000..9b5ffcba4 --- /dev/null +++ b/libs/libaom/src/av1/av1_iface_common.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_AV1_AV1_IFACE_COMMON_H_
+#define AOM_AV1_AV1_IFACE_COMMON_H_
+
+#include <assert.h>
+
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
+                            void *user_priv) {
+  /* aom_img_wrap() doesn't allow specifying independent strides for
+   * the Y, U, and V planes, nor other alignment adjustments that
+   * might be representable by a YV12_BUFFER_CONFIG, so we just
+   * initialize all the fields.
+   */
+  int bps;
+  if (!yv12->subsampling_y) {
+    if (!yv12->subsampling_x) {
+      img->fmt = AOM_IMG_FMT_I444;
+      bps = 24;
+    } else {
+      img->fmt = AOM_IMG_FMT_I422;
+      bps = 16;
+    }
+  } else {
+    img->fmt = AOM_IMG_FMT_I420;
+    bps = 12;
+  }
+  img->cp = yv12->color_primaries;
+  img->tc = yv12->transfer_characteristics;
+  img->mc = yv12->matrix_coefficients;
+  img->monochrome = yv12->monochrome;
+  img->csp = yv12->chroma_sample_position;
+  img->range = yv12->color_range;
+  img->bit_depth = 8;
+  img->w = yv12->y_width;
+  img->h = yv12->y_height;
+  img->d_w = yv12->y_crop_width;
+  img->d_h = yv12->y_crop_height;
+  img->r_w = yv12->render_width;
+  img->r_h = yv12->render_height;
+  img->x_chroma_shift = yv12->subsampling_x;
+  img->y_chroma_shift = yv12->subsampling_y;
+  img->planes[AOM_PLANE_Y] = yv12->y_buffer;
+  img->planes[AOM_PLANE_U] = yv12->u_buffer;
+  img->planes[AOM_PLANE_V] = yv12->v_buffer;
+  img->stride[AOM_PLANE_Y] = yv12->y_stride;
+  img->stride[AOM_PLANE_U] = yv12->uv_stride;
+  img->stride[AOM_PLANE_V] = yv12->uv_stride;
+  if (yv12->flags & YV12_FLAG_HIGHBITDEPTH) {
+    bps *= 2;
+    // aom_image_t uses byte strides and a pointer to the first byte
+    // of the image.
+    img->fmt = (aom_img_fmt_t)(img->fmt | AOM_IMG_FMT_HIGHBITDEPTH);
+    img->bit_depth = yv12->bit_depth;
+    img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+    img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+    img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+    img->stride[AOM_PLANE_Y] = 2 * yv12->y_stride;
+    img->stride[AOM_PLANE_U] = 2 * yv12->uv_stride;
+    img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride;
+  }
+  img->bps = bps;
+  img->user_priv = user_priv;
+  img->img_data = yv12->buffer_alloc;
+  img->img_data_owner = 0;
+  img->self_allocd = 0;
+  img->sz = yv12->frame_size;
+  assert(!yv12->metadata);
+  img->metadata = NULL;
+}
+
+static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
+                                       YV12_BUFFER_CONFIG *yv12) {
+  yv12->y_buffer = img->planes[AOM_PLANE_Y];
+  yv12->u_buffer = img->planes[AOM_PLANE_U];
+  yv12->v_buffer = img->planes[AOM_PLANE_V];
+
+  yv12->y_crop_width = img->d_w;
+  yv12->y_crop_height = img->d_h;
+  yv12->render_width = img->r_w;
+  yv12->render_height = img->r_h;
+  yv12->y_width = img->w;
+  yv12->y_height = img->h;
+
+  yv12->uv_width =
+      img->x_chroma_shift == 1 ? (1 + yv12->y_width) / 2 : yv12->y_width;
+  yv12->uv_height =
+      img->y_chroma_shift == 1 ? (1 + yv12->y_height) / 2 : yv12->y_height;
+  yv12->uv_crop_width = yv12->uv_width;
+  yv12->uv_crop_height = yv12->uv_height;
+
+  yv12->y_stride = img->stride[AOM_PLANE_Y];
+  yv12->uv_stride = img->stride[AOM_PLANE_U];
+  yv12->color_primaries = img->cp;
+  yv12->transfer_characteristics = img->tc;
+  yv12->matrix_coefficients = img->mc;
+  yv12->monochrome = img->monochrome;
+  yv12->chroma_sample_position = img->csp;
+  yv12->color_range = img->range;
+
+  if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+    // In aom_image_t
+    //     planes point to uint8 address of start of data
+    //     stride counts uint8s to reach next row
+    // In YV12_BUFFER_CONFIG
+    //     y_buffer, u_buffer, v_buffer point to uint16 address of data
+    //     stride and border counts in uint16s
+    // This means that all the address calculations in the main body of code
+    // should work correctly.
+    // However, before we do any pixel operations we need to cast the address
+    // to a uint16 pointer and double its value.
+    yv12->y_buffer = CONVERT_TO_BYTEPTR(yv12->y_buffer);
+    yv12->u_buffer = CONVERT_TO_BYTEPTR(yv12->u_buffer);
+    yv12->v_buffer = CONVERT_TO_BYTEPTR(yv12->v_buffer);
+    yv12->y_stride >>= 1;
+    yv12->uv_stride >>= 1;
+    yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+  } else {
+    yv12->flags = 0;
+  }
+
+  // Note(yunqing): if img is allocated the same as the frame buffer, y_stride
+  // is 32-byte aligned. Also, handle the cases while allocating img without a
+  // border or stride_align is less than 32.
+  int border = (yv12->y_stride - (int)((img->w + 31) & ~31)) / 2;
+  yv12->border = (border < 0) ? 0 : border;
+  yv12->subsampling_x = img->x_chroma_shift;
+  yv12->subsampling_y = img->y_chroma_shift;
+  yv12->metadata = img->metadata;
+  return AOM_CODEC_OK;
+}
+
+#endif  // AOM_AV1_AV1_IFACE_COMMON_H_
diff --git a/libs/libaom/src/av1/common/alloccommon.c b/libs/libaom/src/av1/common/alloccommon.c
new file mode 100644
index 000000000..badee3df9
--- /dev/null
+++ b/libs/libaom/src/av1/common/alloccommon.c
@@ -0,0 +1,309 @@
+/*
+ *
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+
+int av1_get_MBs(int width, int height) {
+  const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
+  const int aligned_height = ALIGN_POWER_OF_TWO(height, 3);
+  const int mi_cols = aligned_width >> MI_SIZE_LOG2;
+  const int mi_rows = aligned_height >> MI_SIZE_LOG2;
+
+  const int mb_cols = (mi_cols + 2) >> 2;
+  const int mb_rows = (mi_rows + 2) >> 2;
+  return mb_rows * mb_cols;
+}
+
+void av1_free_ref_frame_buffers(BufferPool *pool) {
+  int i;
+
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    if (pool->frame_bufs[i].ref_count > 0 &&
+        pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
+      pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
+      pool->frame_bufs[i].raw_frame_buffer.data = NULL;
+      pool->frame_bufs[i].raw_frame_buffer.size = 0;
+      pool->frame_bufs[i].raw_frame_buffer.priv = NULL;
+      pool->frame_bufs[i].ref_count = 0;
+    }
+    aom_free(pool->frame_bufs[i].mvs);
+    pool->frame_bufs[i].mvs = NULL;
+    aom_free(pool->frame_bufs[i].seg_map);
+    pool->frame_bufs[i].seg_map = NULL;
+    aom_free_frame_buffer(&pool->frame_bufs[i].buf);
+  }
+}
+
+// Assumes cm->rst_info[p].restoration_unit_size is already initialized
+void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
+  const int num_planes = av1_num_planes(cm);
+  for (int p = 0; p < num_planes; ++p)
+    av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
+
+  if (cm->rst_tmpbuf == NULL) {
+    CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
+                    (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
+  }
+
+  if (cm->rlbs == NULL) {
+    CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers)));
+  }
+
+  // For striped loop restoration, we divide each row of tiles into "stripes",
+  // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET
+  // luma pixels to match the output from CDEF. We will need to store 2 *
+  // RESTORATION_CTX_VERT lines of data for each stripe, and also need to be
+  // able to quickly answer the question "Where is the n'th stripe for tile
+  // row m?" To make that efficient, we generate the rst_last_stripe array.
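+  // For example, with RESTORATION_UNIT_OFFSET == 8 and MI_SIZE == 4, a tile
+  // spanning 64 mi rows covers 256 luma rows, so ext_h = 264 and the tile
+  // contributes (264 + 63) / 64 = 5 stripes.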
+ int num_stripes = 0; + for (int i = 0; i < cm->tiles.rows; ++i) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, i); + const int mi_h = tile_info.mi_row_end - tile_info.mi_row_start; + const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2); + const int tile_stripes = (ext_h + 63) / 64; + num_stripes += tile_stripes; + } + + // Now we need to allocate enough space to store the line buffers for the + // stripes + const int frame_w = cm->superres_upscaled_width; + const int use_highbd = cm->seq_params.use_highbitdepth; + + for (int p = 0; p < num_planes; ++p) { + const int is_uv = p > 0; + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ; + const int stride = ALIGN_POWER_OF_TWO(plane_w, 5); + const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT + << use_highbd; + RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries; + + if (buf_size != boundaries->stripe_boundary_size || + boundaries->stripe_boundary_above == NULL || + boundaries->stripe_boundary_below == NULL) { + aom_free(boundaries->stripe_boundary_above); + aom_free(boundaries->stripe_boundary_below); + + CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_above, + (uint8_t *)aom_memalign(32, buf_size)); + CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_below, + (uint8_t *)aom_memalign(32, buf_size)); + + boundaries->stripe_boundary_size = buf_size; + } + boundaries->stripe_boundary_stride = stride; + } +} + +void av1_free_restoration_buffers(AV1_COMMON *cm) { + int p; + for (p = 0; p < MAX_MB_PLANE; ++p) + av1_free_restoration_struct(&cm->rst_info[p]); + aom_free(cm->rst_tmpbuf); + cm->rst_tmpbuf = NULL; + aom_free(cm->rlbs); + cm->rlbs = NULL; + for (p = 0; p < MAX_MB_PLANE; ++p) { + RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries; + aom_free(boundaries->stripe_boundary_above); + aom_free(boundaries->stripe_boundary_below); + boundaries->stripe_boundary_above = NULL; + boundaries->stripe_boundary_below = NULL; + } + + aom_free_frame_buffer(&cm->rst_frame); +} + +void av1_free_above_context_buffers(CommonContexts *above_contexts) { + int i; + const int num_planes = above_contexts->num_planes; + + for (int tile_row = 0; tile_row < above_contexts->num_tile_rows; tile_row++) { + for (i = 0; i < num_planes; i++) { + aom_free(above_contexts->entropy[i][tile_row]); + above_contexts->entropy[i][tile_row] = NULL; + } + aom_free(above_contexts->partition[tile_row]); + above_contexts->partition[tile_row] = NULL; + + aom_free(above_contexts->txfm[tile_row]); + above_contexts->txfm[tile_row] = NULL; + } + for (i = 0; i < num_planes; i++) { + aom_free(above_contexts->entropy[i]); + above_contexts->entropy[i] = NULL; + } + aom_free(above_contexts->partition); + above_contexts->partition = NULL; + + aom_free(above_contexts->txfm); + above_contexts->txfm = NULL; + + above_contexts->num_tile_rows = 0; + above_contexts->num_mi_cols = 0; + above_contexts->num_planes = 0; +} + +void av1_free_context_buffers(AV1_COMMON *cm) { + cm->mi_params.free_mi(&cm->mi_params); + + av1_free_above_context_buffers(&cm->above_contexts); + +#if CONFIG_LPF_MASK + av1_free_loop_filter_mask(cm); +#endif +} + +int av1_alloc_above_context_buffers(CommonContexts *above_contexts, + int num_tile_rows, int num_mi_cols, + int num_planes) { + const int aligned_mi_cols = + ALIGN_POWER_OF_TWO(num_mi_cols, MAX_MIB_SIZE_LOG2); + + // Allocate above context buffers + above_contexts->num_tile_rows = num_tile_rows; + 
above_contexts->num_mi_cols = aligned_mi_cols; + above_contexts->num_planes = num_planes; + for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { + above_contexts->entropy[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc( + num_tile_rows, sizeof(above_contexts->entropy[0])); + if (!above_contexts->entropy[plane_idx]) return 1; + } + + above_contexts->partition = (PARTITION_CONTEXT **)aom_calloc( + num_tile_rows, sizeof(above_contexts->partition)); + if (!above_contexts->partition) return 1; + + above_contexts->txfm = + (TXFM_CONTEXT **)aom_calloc(num_tile_rows, sizeof(above_contexts->txfm)); + if (!above_contexts->txfm) return 1; + + for (int tile_row = 0; tile_row < num_tile_rows; tile_row++) { + for (int plane_idx = 0; plane_idx < num_planes; plane_idx++) { + above_contexts->entropy[plane_idx][tile_row] = + (ENTROPY_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->entropy[0][tile_row])); + if (!above_contexts->entropy[plane_idx][tile_row]) return 1; + } + + above_contexts->partition[tile_row] = (PARTITION_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->partition[tile_row])); + if (!above_contexts->partition[tile_row]) return 1; + + above_contexts->txfm[tile_row] = (TXFM_CONTEXT *)aom_calloc( + aligned_mi_cols, sizeof(*above_contexts->txfm[tile_row])); + if (!above_contexts->txfm[tile_row]) return 1; + } + + return 0; +} + +// Allocate the dynamically allocated arrays in 'mi_params' assuming +// 'mi_params->set_mb_mi()' was already called earlier to initialize the rest of +// the struct members. +static int alloc_mi(CommonModeInfoParams *mi_params) { + const int aligned_mi_rows = calc_mi_size(mi_params->mi_rows); + const int mi_grid_size = mi_params->mi_stride * aligned_mi_rows; + const int alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int alloc_mi_size = + mi_params->mi_alloc_stride * (aligned_mi_rows / alloc_size_1d); + + if (mi_params->mi_alloc_size < alloc_mi_size || + mi_params->mi_grid_size < mi_grid_size) { + mi_params->free_mi(mi_params); + + mi_params->mi_alloc = + aom_calloc(alloc_mi_size, sizeof(*mi_params->mi_alloc)); + if (!mi_params->mi_alloc) return 1; + mi_params->mi_alloc_size = alloc_mi_size; + + mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc( + mi_grid_size, sizeof(*mi_params->mi_grid_base)); + if (!mi_params->mi_grid_base) return 1; + mi_params->mi_grid_size = mi_grid_size; + + mi_params->tx_type_map = + aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map)); + if (!mi_params->tx_type_map) return 1; + } + + return 0; +} + +int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) { + CommonModeInfoParams *const mi_params = &cm->mi_params; + mi_params->set_mb_mi(mi_params, width, height); + if (alloc_mi(mi_params)) goto fail; + return 0; + +fail: + // clear the mi_* values to force a realloc on resync + mi_params->set_mb_mi(mi_params, 0, 0); + av1_free_context_buffers(cm); + return 1; +} + +void av1_remove_common(AV1_COMMON *cm) { + av1_free_context_buffers(cm); + + aom_free(cm->fc); + cm->fc = NULL; + aom_free(cm->default_frame_context); + cm->default_frame_context = NULL; +} + +void av1_init_mi_buffers(CommonModeInfoParams *mi_params) { + mi_params->setup_mi(mi_params); +} + +#if CONFIG_LPF_MASK +int av1_alloc_loop_filter_mask(AV1_COMMON *cm) { + aom_free(cm->lf.lfm); + cm->lf.lfm = NULL; + + // Each lfm holds bit masks for all the 4x4 blocks in a max + // 64x64 (128x128 for ext_partitions) region. 
The stride + // and rows are rounded up / truncated to a multiple of 16 + // (32 for ext_partition). + cm->lf.lfm_stride = + (cm->mi_params.mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2; + cm->lf.lfm_num = + ((cm->mi_params.mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) * + cm->lf.lfm_stride; + cm->lf.lfm = + (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm)); + if (!cm->lf.lfm) return 1; + + unsigned int i; + for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]); + + return 0; +} + +void av1_free_loop_filter_mask(AV1_COMMON *cm) { + if (cm->lf.lfm == NULL) return; + + aom_free(cm->lf.lfm); + cm->lf.lfm = NULL; + cm->lf.lfm_num = 0; + cm->lf.lfm_stride = 0; +} +#endif diff --git a/libs/libaom/src/av1/common/alloccommon.h b/libs/libaom/src/av1/common/alloccommon.h new file mode 100644 index 000000000..fe8e0c530 --- /dev/null +++ b/libs/libaom/src/av1/common/alloccommon.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_ +#define AOM_AV1_COMMON_ALLOCCOMMON_H_ + +#define INVALID_IDX -1 // Invalid buffer index. + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; +struct BufferPool; +struct CommonContexts; +struct CommonModeInfoParams; + +void av1_remove_common(struct AV1Common *cm); + +int av1_alloc_above_context_buffers(struct CommonContexts *above_contexts, + int num_tile_rows, int num_mi_cols, + int num_planes); +void av1_free_above_context_buffers(struct CommonContexts *above_contexts); +int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height); +void av1_init_mi_buffers(struct CommonModeInfoParams *mi_params); +void av1_free_context_buffers(struct AV1Common *cm); + +void av1_free_ref_frame_buffers(struct BufferPool *pool); +void av1_alloc_restoration_buffers(struct AV1Common *cm); +void av1_free_restoration_buffers(struct AV1Common *cm); + +int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height); +void av1_free_state_buffers(struct AV1Common *cm); + +int av1_get_MBs(int width, int height); + +#if CONFIG_LPF_MASK +int av1_alloc_loop_filter_mask(struct AV1Common *cm); +void av1_free_loop_filter_mask(struct AV1Common *cm); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ALLOCCOMMON_H_ diff --git a/libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.c b/libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.c new file mode 100644 index 000000000..2f3567aea --- /dev/null +++ b/libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.c @@ -0,0 +1,4271 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/arm/av1_inv_txfm_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// 1D itx types
+typedef enum ATTRIBUTE_PACKED {
+  IDCT_1D,
+  IADST_1D,
+  IFLIPADST_1D = IADST_1D,
+  IIDENTITY_1D,
+  ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
+  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
+  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
+  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
+  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// 1D functions
+static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
+  { av1_idct4, av1_iadst4, av1_iidentity4_c },
+  { av1_idct8, av1_iadst8, av1_iidentity8_c },
+  { av1_idct16, av1_iadst16, av1_iidentity16_c },
+  { av1_idct32, NULL, NULL },
+  { av1_idct64, NULL, NULL },
+};
+
+static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
+                                                  uint8_t *output, int stride,
+                                                  int flipud,
+                                                  const int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  int16x8_t temp_output;
+  for (int i = 0; i < height; ++i, j += step) {
+    temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
+    temp_output = vaddq_s16(temp_output, in[j]);
+    vst1_u8(output, vqmovun_s16(temp_output));
+    output += stride;
+  }
+}
+
+static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
+                                                    int16x8_t res0,
+                                                    int16x8_t res1) {
+  int16x8_t temp_output[2];
+  uint8x16_t temp_output_8q;
+  temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
+  temp_output[0] = vaddq_s16(temp_output[0], res0);
+  temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
+  temp_output[1] = vaddq_s16(temp_output[1], res1);
+  temp_output_8q =
+      vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1]));
+  return temp_output_8q;
+}
+
+static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
+                                                   uint8_t *output, int stride,
+                                                   int flipud, int height) {
+  uint8x16_t temp_output_8q;
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ?
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + temp_output_8q = vld1q_u8(output + i * stride); + temp_output_8q = + lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]); + vst1q_u8((output + i * stride), temp_output_8q); + } +} + +static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size, + int value) { + for (int i = 0; i < size; i++) { + a[i] = vdupq_n_s16((int16_t)value); + } +} + +static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1, + int16_t coef2, int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0_l, s0_h, s1_l, s1_h; + int16x4_t v0[2], v1[2]; + + s0_l = vmull_n_s16(vget_low_s16(in0), coef1); + s0_h = vmull_n_s16(vget_high_s16(in0), coef1); + s1_l = vmull_n_s16(vget_low_s16(in0), coef2); + s1_h = vmull_n_s16(vget_high_s16(in0), coef2); + + v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0_h, 
INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, const int16x4_t c, + int16x8_t *t0, int16x8_t *t1) { + int32x4_t s0[2], s1[2]; + int16x4_t v0[2], v1[2]; + + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); + + v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); + + *t0 = vcombine_s16(v0[0], v0[1]); + *t1 = vcombine_s16(v1[0], v1[1]); +} + +static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { + int32x4_t t0[2], t1[2]; + int16x4_t v0[2], v1[2]; + + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + + v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT); + v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT); + v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT); + v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT); + + x[0] = vcombine_s16(v0[0], v0[1]); + x[1] = vcombine_s16(v1[0], v1[1]); +} + +static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1, + const int16_t c2, const int16_t c3) { + int16x4_t val = vdup_n_s16((int16_t)0); + val = vset_lane_s16(c0, val, 0); + val = vset_lane_s16(c1, val, 1); + val = vset_lane_s16(c2, val, 2); + val = vset_lane_s16(c3, val, 3); + return val; +} + +static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[20], (int16_t)cospi[44]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[8]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + + // Stage 1 + x[0] = in[7]; + x[1] = in[0]; + x[2] = in[5]; + x[3] = in[2]; + x[4] = in[3]; + x[5] = in[4]; + x[6] = in[1]; + x[7] = in[6]; + + // Stage 2 + btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); + btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); + btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); + + // Stage 3 + x[0] = vqaddq_s16(s0, s4); + x[1] = vqaddq_s16(s1, s5); + x[2] = vqaddq_s16(s2, s6); + x[3] = vqaddq_s16(s3, s7); + x[4] = vqsubq_s16(s0, s4); + x[5] = vqsubq_s16(s1, s5); + x[6] = vqsubq_s16(s2, s6); + x[7] = vqsubq_s16(s3, s7); + + // Stage 4 + s0 = x[0]; + s1 = x[1]; + s2 = x[2]; + s3 = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], 
c2, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6); + + // Stage 5 + x[0] = vqaddq_s16(s0, s2); + x[1] = vqaddq_s16(s1, s3); + x[2] = vqsubq_s16(s0, s2); + x[3] = vqsubq_s16(s1, s3); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + + // stage 6 + btf_16_half_neon(x + 2, c2); + btf_16_half_neon(x + 6, c2); + + // Stage 7 + out[0] = x[0]; + out[1] = vqnegq_s16(x[4]); + out[2] = x[6]; + out[3] = vqnegq_s16(x[2]); + out[4] = x[3]; + out[5] = vqnegq_s16(x[7]); + out[6] = x[5]; + out[7] = vqnegq_s16(x[1]); +} + +static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[8]; + int16x8_t s0, s1, s4, s5; + + // Stage 1 + x[1] = in[0]; + + // Stage 2 + + btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1); + + // Stage 3 + x[0] = s0; + x[1] = s1; + x[4] = s0; + x[5] = s1; + + // Stage 4 + s0 = x[0]; + s1 = x[1]; + btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); + + // Stage 5 + x[0] = s0; + x[1] = s1; + x[2] = s0; + x[3] = s1; + x[4] = s4; + x[5] = s5; + x[6] = s4; + x[7] = s5; + + // stage 6 + btf_16_half_neon(x + 2, c2); + btf_16_half_neon(x + 6, c2); + + // Stage 7 + out[0] = x[0]; + out[1] = vqnegq_s16(x[4]); + out[2] = x[6]; + out[3] = vqnegq_s16(x[2]); + out[4] = x[3]; + out[5] = vqnegq_s16(x[7]); + out[6] = x[5]; + out[7] = vqnegq_s16(x[1]); +} + +static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[8], step2[8]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + // stage 2 + btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]); + btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]); + + // stage 3 + btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]); + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + + // stage 4 + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]); + + // stage 5 + out[0] = vqaddq_s16(step1[0], step2[7]); + out[1] = vqaddq_s16(step1[1], step1[6]); + out[2] = vqaddq_s16(step1[2], step1[5]); + out[3] = vqaddq_s16(step1[3], step2[4]); + out[4] = vqsubq_s16(step1[3], step2[4]); + out[5] = vqsubq_s16(step1[2], step1[5]); + out[6] = vqsubq_s16(step1[1], step1[6]); + out[7] = vqsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]); + + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], 
INV_COS_BIT)); + + // stage 4 + // stage 5 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; +} + +void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { + assert(!(size % 4)); + if (!bit) return; + const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit)); + for (int i = 0; i < size; i++) { + arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8); + } +} + +static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) { + int16x8_t temp[8]; + for (int i = 0; i < size; ++i) { + temp[i] = input[size - 1 - i]; + } + for (int i = 0; i < size; ++i) { + input[i] = temp[i]; + } +} + +static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input, + int16x8_t *const a, + int out_size) { + for (int i = 0; i < 8; ++i) { + a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)), + vmovn_s32(vld1q_s32(input + 4))); + input += out_size; + } +} + +static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output, + int txw_idx, int8_t size, int bit) { + const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); + int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]); + int16x4_t low_i16, high_i16; + int32x4_t low_i32, high_i32; + for (int i = 0; i < size; i++) { + int32x4_t temp_out_low = vmull_s16(vget_low_s16(input[i]), scale); + int32x4_t temp_out_high = vmull_s16(vget_high_s16(input[i]), scale); + low_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_low, 12), dup_bits_n_32x4); + high_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_high, 12), dup_bits_n_32x4); + low_i16 = vqmovn_s32(low_i32); + high_i16 = vqmovn_s32(high_i32); + output[i] = vcombine_s16(low_i16, high_i16); + } +} + +static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output, + int size) { + int32x4_t out_low, out_high; + int16x4_t low, high; + + for (int z = 0; z < size; ++z) { + out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2); + out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2); + + low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits); + high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits); + + output[z] = vcombine_s16(low, high); + } +} + +static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 4 + + t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 6 + // stage 7 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; +} + +static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[16], step2[16]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + 
(int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + // stage 2 + + btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]); + btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]); + btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]); + btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]); + + step2[0] = in[0]; + step2[1] = in[8]; + step2[2] = in[4]; + step2[3] = in[12]; + step2[4] = in[2]; + step2[5] = in[10]; + step2[6] = in[6]; + step2[7] = in[14]; + + // stage 3 + + btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]); + btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + + // stage 4 + + btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + 
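+ // Stage 7 is the closing butterfly of the 16-point IDCT: out[k] takes the
+ // saturating sum step2[k] + step2[15 - k] and out[15 - k] the matching
+ // saturating difference, keeping this lowbd path within int16 range.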
out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +} + +static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[16], step2[16]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c1 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[2] = in[4]; + step2[4] = in[2]; + step2[6] = in[6]; + + btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]); + btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]); + btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]); + + // stage 3 + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[8] = vqaddq_s16(step2[8], step2[9]); + step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + + // stage 4 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); + btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]); + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + + // stage 6 + btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], 
step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +} + +static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], + (int16_t)cospi[10], (int16_t)cospi[54]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], + (int16_t)cospi[26], (int16_t)cospi[38]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30], + (int16_t)cospi[42], (int16_t)cospi[22]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14], + (int16_t)cospi[58], (int16_t)cospi[6]); + const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[16]; + int16x8_t t[14]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; + + // Stage 1 + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // Stage 2 + btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); + btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); + btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); + btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11); + btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13); + btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15); + + // Stage 3 + x[0] = vqaddq_s16(s0, s8); + x[1] = vqaddq_s16(s1, s9); + x[2] = vqaddq_s16(s2, s10); + x[3] = vqaddq_s16(s3, s11); + x[4] = vqaddq_s16(s4, s12); + x[5] = vqaddq_s16(s5, s13); + x[6] = vqaddq_s16(s6, s14); + x[7] = vqaddq_s16(s7, s15); + x[8] = vqsubq_s16(s0, s8); + x[9] = vqsubq_s16(s1, s9); + x[10] = vqsubq_s16(s2, s10); + x[11] = vqsubq_s16(s3, s11); + x[12] = vqsubq_s16(s4, s12); + x[13] = vqsubq_s16(s5, s13); + x[14] = vqsubq_s16(s6, s14); + x[15] = vqsubq_s16(s7, s15); + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); + + // Stage 5 + x[0] = vqaddq_s16(t[0], t[4]); + x[1] = 
vqaddq_s16(t[1], t[5]); + x[2] = vqaddq_s16(t[2], t[6]); + x[3] = vqaddq_s16(t[3], t[7]); + x[4] = vqsubq_s16(t[0], t[4]); + x[5] = vqsubq_s16(t[1], t[5]); + x[6] = vqsubq_s16(t[2], t[6]); + x[7] = vqsubq_s16(t[3], t[7]); + x[8] = vqaddq_s16(s8, s12); + x[9] = vqaddq_s16(s9, s13); + x[10] = vqaddq_s16(s10, s14); + x[11] = vqaddq_s16(s11, s15); + x[12] = vqsubq_s16(s8, s12); + x[13] = vqsubq_s16(s9, s13); + x[14] = vqsubq_s16(s10, s14); + x[15] = vqsubq_s16(s11, s15); + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14); + + // Stage 7 + x[0] = vqaddq_s16(t[0], t[2]); + x[1] = vqaddq_s16(t[1], t[3]); + x[2] = vqsubq_s16(t[0], t[2]); + x[3] = vqsubq_s16(t[1], t[3]); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + x[8] = vqaddq_s16(t[8], t[10]); + x[9] = vqaddq_s16(t[9], t[11]); + x[10] = vqsubq_s16(t[8], t[10]); + x[11] = vqsubq_s16(t[9], t[11]); + x[12] = vqaddq_s16(s12, s14); + x[13] = vqaddq_s16(s13, s15); + x[14] = vqsubq_s16(s12, s14); + x[15] = vqsubq_s16(s13, s15); + + // Stage 8 + btf_16_half_neon(x + 2, c5); + btf_16_half_neon(x + 6, c5); + btf_16_half_neon(x + 10, c5); + btf_16_half_neon(x + 14, c5); + + // Stage 9 + out[0] = x[0]; + out[1] = vqnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vqnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vqnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vqnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vqnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vqnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vqnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vqnegq_s16(x[1]); +} + +static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[16]; + int16x8_t t[10]; + int16x8_t s0, s1, s4, s5; + int16x8_t s8, s9, s12, s13; + + // Stage 1 + x[1] = in[0]; + + // Stage 2 + btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); + + // Stage 3 + x[0] = s0; + x[1] = s1; + x[8] = s0; + x[9] = s1; + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); + + // Stage 5 + x[0] = t[0]; + x[1] = t[1]; + x[4] = t[0]; + x[5] = t[1]; + x[8] = s8; + x[9] = s9; + x[12] = s8; + x[13] = s9; + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); + t[8] = x[8]; + t[9] = x[9]; + btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); + + // Stage 7 + x[0] = t[0]; + x[1] = t[1]; + x[2] = t[0]; + x[3] = t[1]; + x[4] = s4; + x[5] = s5; + x[6] = s4; + x[7] = s5; + x[8] = t[8]; + x[9] = t[9]; + x[10] = t[8]; + x[11] = t[9]; + x[12] = s12; + x[13] = s13; + x[14] = s12; + x[15] = s13; + + // Stage 8 + btf_16_half_neon(x + 2, c1); + btf_16_half_neon(x + 6, c1); + btf_16_half_neon(x + 10, c1); + btf_16_half_neon(x + 14, c1); + + // Stage 9 + out[0] = x[0]; + out[1] = vqnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vqnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vqnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vqnegq_s16(x[2]); + 
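+ // Stage 9 applies the same output permutation as the full iadst16_neon:
+ // even-numbered outputs copy a lane of x, odd-numbered outputs negate one
+ // with saturation via vqnegq_s16.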
out[8] = x[3]; + out[9] = vqnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vqnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vqnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vqnegq_s16(x[1]); +} + +static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + int16x8_t x[16]; + int16x8_t t[14]; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; + + // Stage 1 + x[1] = in[0]; + x[3] = in[2]; + x[5] = in[4]; + x[7] = in[6]; + x[8] = in[7]; + x[10] = in[5]; + x[12] = in[3]; + x[14] = in[1]; + + // Stage 2 + btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); + btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3); + btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5); + btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7); + + btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9); + btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11); + btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13); + btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15); + + // Stage 3 + x[0] = vqaddq_s16(s0, s8); + x[1] = vqaddq_s16(s1, s9); + x[2] = vqaddq_s16(s2, s10); + x[3] = vqaddq_s16(s3, s11); + x[4] = vqaddq_s16(s4, s12); + x[5] = vqaddq_s16(s5, s13); + x[6] = vqaddq_s16(s6, s14); + x[7] = vqaddq_s16(s7, s15); + x[8] = vqsubq_s16(s0, s8); + x[9] = vqsubq_s16(s1, s9); + x[10] = vqsubq_s16(s2, s10); + x[11] = vqsubq_s16(s3, s11); + x[12] = vqsubq_s16(s4, s12); + x[13] = vqsubq_s16(s5, s13); + x[14] = vqsubq_s16(s6, s14); + x[15] = vqsubq_s16(s7, s15); + + // Stage 4 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); + btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11); + btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12); + btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14); + + // Stage 5 + x[0] = vqaddq_s16(t[0], t[4]); + x[1] = vqaddq_s16(t[1], t[5]); + x[2] = vqaddq_s16(t[2], t[6]); + x[3] = vqaddq_s16(t[3], t[7]); + x[4] = vqsubq_s16(t[0], t[4]); + x[5] = vqsubq_s16(t[1], t[5]); + x[6] = vqsubq_s16(t[2], t[6]); + x[7] = vqsubq_s16(t[3], t[7]); + x[8] = vqaddq_s16(s8, s12); + x[9] = vqaddq_s16(s9, s13); + x[10] = vqaddq_s16(s10, s14); + x[11] = vqaddq_s16(s11, s15); + x[12] = vqsubq_s16(s8, s12); + x[13] = vqsubq_s16(s9, s13); + x[14] = vqsubq_s16(s10, s14); + x[15] = vqsubq_s16(s11, s15); + + // stage 6 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); + btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); + btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14); + + // Stage 7 + x[0] = vqaddq_s16(t[0], t[2]); + x[1] = vqaddq_s16(t[1], t[3]); + x[2] = vqsubq_s16(t[0], t[2]); + x[3] = vqsubq_s16(t[1], t[3]); + x[4] = vqaddq_s16(s4, s6); + x[5] = vqaddq_s16(s5, s7); + x[6] = vqsubq_s16(s4, s6); + x[7] = vqsubq_s16(s5, s7); + x[8] = vqaddq_s16(t[8], t[10]); + x[9] = vqaddq_s16(t[9], t[11]); + x[10] = vqsubq_s16(t[8], t[10]); + x[11] = vqsubq_s16(t[9], t[11]); + x[12] = vqaddq_s16(s12, s14); + x[13] = vqaddq_s16(s13, s15); + x[14] = vqsubq_s16(s12, s14); + 
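+ // Stages 3-9 match the full iadst16_neon above: after stage 3 every lane
+ // of x is populated, so only stage 2 can exploit the zero-filled inputs
+ // (via the single-input btf_16_neon).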
x[15] = vqsubq_s16(s13, s15); + + // Stage 8 + btf_16_half_neon(x + 2, c1); + btf_16_half_neon(x + 6, c1); + btf_16_half_neon(x + 10, c1); + btf_16_half_neon(x + 14, c1); + + // Stage 9 + out[0] = x[0]; + out[1] = vqnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vqnegq_s16(x[4]); + out[4] = x[6]; + out[5] = vqnegq_s16(x[14]); + out[6] = x[10]; + out[7] = vqnegq_s16(x[2]); + out[8] = x[3]; + out[9] = vqnegq_s16(x[11]); + out[10] = x[15]; + out[11] = vqnegq_s16(x[7]); + out[12] = x[5]; + out[13] = vqnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vqnegq_s16(x[1]); +} + +static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit, + int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], + (int16_t)cospi[34], (int16_t)cospi[30]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], + (int16_t)cospi[50], (int16_t)cospi[14]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54], + (int16_t)cospi[42], (int16_t)cospi[22]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38], + (int16_t)cospi[58], (int16_t)cospi[6]); + const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c8 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c9 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 2 + + btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]); + btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]); + btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]); + btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]); + btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]); + btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]); + + step2[0] = in[0]; + step2[1] = in[16]; + step2[2] = in[8]; + step2[3] = in[24]; + step2[4] = in[4]; + step2[5] = in[20]; + step2[6] = in[12]; + step2[7] = in[28]; + step2[8] = in[2]; + step2[9] = in[18]; + step2[10] = in[10]; + step2[11] = in[26]; + step2[12] = in[6]; + step2[13] = in[22]; + step2[14] = in[14]; + step2[15] = in[30]; + + // stage 3 + + btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]); + btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]); + btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]); + btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + step1[4] = step2[4]; + step1[5] = step2[5]; + step1[6] = step2[6]; + step1[7] = step2[7]; + + step1[16] = vqaddq_s16(step2[16], step2[17]); + step1[17] = vqsubq_s16(step2[16], step2[17]); + step1[18] = vqsubq_s16(step2[19], step2[18]); + step1[19] = 
vqaddq_s16(step2[19], step2[18]); + step1[20] = vqaddq_s16(step2[20], step2[21]); + step1[21] = vqsubq_s16(step2[20], step2[21]); + step1[22] = vqsubq_s16(step2[23], step2[22]); + step1[23] = vqaddq_s16(step2[23], step2[22]); + step1[24] = vqaddq_s16(step2[24], step2[25]); + step1[25] = vqsubq_s16(step2[24], step2[25]); + step1[26] = vqsubq_s16(step2[27], step2[26]); + step1[27] = vqaddq_s16(step2[27], step2[26]); + step1[28] = vqaddq_s16(step2[28], step2[29]); + step1[29] = vqsubq_s16(step2[28], step2[29]); + step1[30] = vqsubq_s16(step2[31], step2[30]); + step1[31] = vqaddq_s16(step2[31], step2[30]); + + // stage 4 + + btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]); + btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]); + btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[8] = vqaddq_s16(step1[8], step1[9]); + step2[9] = vqsubq_s16(step1[8], step1[9]); + step2[10] = vqsubq_s16(step1[11], step1[10]); + step2[11] = vqaddq_s16(step1[11], step1[10]); + step2[12] = vqaddq_s16(step1[12], step1[13]); + step2[13] = vqsubq_s16(step1[12], step1[13]); + step2[14] = vqsubq_s16(step1[15], step1[14]); + step2[15] = vqaddq_s16(step1[15], step1[14]); + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]); + btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]); + btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]); + + step1[4] = vqaddq_s16(step2[4], step2[5]); + step1[5] = vqsubq_s16(step2[4], step2[5]); + step1[6] = vqsubq_s16(step2[7], step2[6]); + step1[7] = vqaddq_s16(step2[7], step2[6]); + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]); + + step2[0] = vqaddq_s16(step1[0], step1[3]); + step2[1] = vqaddq_s16(step1[1], 
step1[2]); + step2[2] = vqsubq_s16(step1[1], step1[2]); + step2[3] = vqsubq_s16(step1[0], step1[3]); + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], 
step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + +static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; + out[16] = step1; + out[17] = step1; + out[18] = step1; + out[19] = step1; + out[20] = step1; + out[21] = step1; + out[22] = step1; + out[23] = step1; + out[24] = step1; + out[25] = step1; + out[26] = step1; + out[27] = step1; + out[28] = step1; + out[29] = step1; + out[30] = step1; + out[31] = step1; +} + +static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + int32x4_t t32[16]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c2 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c3 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[4] = in[4]; + step2[8] = in[2]; + step2[12] = in[6]; + + btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); + btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); + btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); + btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); + + // stage 3 + step1[0] = step2[0]; + 
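+ // Only in[0]..in[7] can be nonzero here, so half of the stage-2 terms
+ // vanish and the stage-3 butterflies collapse to the plain copies below
+ // (e.g. step1[17] = step2[16] instead of step2[16] - step2[17]).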
step1[4] = step2[4]; + + btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); + btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); + + step1[16] = step2[16]; + step1[17] = step2[16]; + step1[18] = step2[19]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[21] = step2[20]; + step1[22] = step2[23]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[24]; + step1[26] = step2[27]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[29] = step2[28]; + step1[30] = step2[31]; + step1[31] = step2[31]; + + // stage 4 + + btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); + btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[8] = step1[8]; + step2[9] = step1[8]; + step2[10] = step1[11]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[13] = step1[12]; + step2[14] = step1[15]; + step2[15] = step1[15]; + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); + step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); + + step1[4] = step2[4]; + step1[5] = step2[4]; + step1[6] = step2[7]; + step1[7] = step2[7]; + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); + + step2[0] = step1[0]; + step2[1] = step1[0]; + step2[2] = step1[0]; + step2[3] = step1[0]; + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], 
step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] 
= vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} + +static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1[32], step2[32]; + int32x4_t t32[16]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c2 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c3 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); + btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]); + btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]); + btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); + btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); + btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]); + btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]); + btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); + + step2[0] = in[0]; + step2[2] = in[8]; + step2[4] = in[4]; + step2[6] = in[12]; + step2[8] = in[2]; + step2[10] = in[10]; + step2[12] = in[6]; + step2[14] = in[14]; + + // stage 3 + + btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); + btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]); + btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]); + btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = step2[4]; + step1[6] = step2[6]; + step1[16] = vqaddq_s16(step2[16], step2[17]); + step1[17] = vqsubq_s16(step2[16], step2[17]); + step1[18] = vqsubq_s16(step2[19], step2[18]); + step1[19] = vqaddq_s16(step2[19], step2[18]); + step1[20] = vqaddq_s16(step2[20], step2[21]); + step1[21] = vqsubq_s16(step2[20], step2[21]); + step1[22] = vqsubq_s16(step2[23], step2[22]); + step1[23] = vqaddq_s16(step2[23], step2[22]); + step1[24] = vqaddq_s16(step2[24], step2[25]); + step1[25] = vqsubq_s16(step2[24], step2[25]); + step1[26] = vqsubq_s16(step2[27], step2[26]); + step1[27] = vqaddq_s16(step2[27], step2[26]); + step1[28] = vqaddq_s16(step2[28], step2[29]); + step1[29] = vqsubq_s16(step2[28], step2[29]); + step1[30] = vqsubq_s16(step2[31], step2[30]); + step1[31] = vqaddq_s16(step2[31], step2[30]); + + // stage 4 + + btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); + btf_16_neon(step1[6], 
-cospi[40], cospi[24], &step2[5], &step2[6]); + btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); + btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); + btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); + btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); + + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[8] = vqaddq_s16(step1[8], step1[9]); + step2[9] = vqsubq_s16(step1[8], step1[9]); + step2[10] = vqsubq_s16(step1[11], step1[10]); + step2[11] = vqaddq_s16(step1[11], step1[10]); + step2[12] = vqaddq_s16(step1[12], step1[13]); + step2[13] = vqsubq_s16(step1[12], step1[13]); + step2[14] = vqsubq_s16(step1[15], step1[14]); + step2[15] = vqaddq_s16(step1[15], step1[14]); + step2[16] = step1[16]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[31] = step1[31]; + + // stage 5 + + t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); + + step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + + btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]); + btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); + btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); + + step1[4] = vqaddq_s16(step2[4], step2[5]); + step1[5] = vqsubq_s16(step2[4], step2[5]); + step1[6] = vqsubq_s16(step2[7], step2[6]); + step1[7] = vqaddq_s16(step2[7], step2[6]); + step1[8] = step2[8]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[19]); + step1[17] = vqaddq_s16(step2[17], step2[18]); + step1[18] = vqsubq_s16(step2[17], step2[18]); + step1[19] = vqsubq_s16(step2[16], step2[19]); + step1[20] = vqsubq_s16(step2[23], step2[20]); + step1[21] = vqsubq_s16(step2[22], step2[21]); + step1[22] = vqaddq_s16(step2[22], step2[21]); + step1[23] = vqaddq_s16(step2[23], step2[20]); + step1[24] = vqaddq_s16(step2[24], step2[27]); + step1[25] = vqaddq_s16(step2[25], step2[26]); + step1[26] = vqsubq_s16(step2[25], step2[26]); + step1[27] = vqsubq_s16(step2[24], step2[27]); + step1[28] = vqsubq_s16(step2[31], step2[28]); + step1[29] = vqsubq_s16(step2[30], step2[29]); + step1[30] = vqaddq_s16(step2[30], step2[29]); + step1[31] = vqaddq_s16(step2[31], step2[28]); + + // stage 6 + + btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); + btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); + btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); + btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); + btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); + + step2[0] = vqaddq_s16(step1[0], step1[3]); + step2[1] = vqaddq_s16(step1[0], step1[2]); + step2[2] = vqsubq_s16(step1[0], step1[2]); + step2[3] = vqsubq_s16(step1[0], step1[3]); + step2[4] = step1[4]; + step2[7] = step1[7]; + step2[8] = vqaddq_s16(step1[8], step1[11]); + step2[9] = vqaddq_s16(step1[9], step1[10]); + step2[10] = vqsubq_s16(step1[9], step1[10]); + step2[11] = vqsubq_s16(step1[8], step1[11]); + step2[12] = vqsubq_s16(step1[15], step1[12]); + step2[13] = vqsubq_s16(step1[14], step1[13]); + step2[14] = vqaddq_s16(step1[14], step1[13]); + step2[15] = vqaddq_s16(step1[15], step1[12]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[22] = step1[22]; + step2[23] = 
step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); + btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); + + step1[0] = vqaddq_s16(step2[0], step2[7]); + step1[1] = vqaddq_s16(step2[1], step2[6]); + step1[2] = vqaddq_s16(step2[2], step2[5]); + step1[3] = vqaddq_s16(step2[3], step2[4]); + step1[4] = vqsubq_s16(step2[3], step2[4]); + step1[5] = vqsubq_s16(step2[2], step2[5]); + step1[6] = vqsubq_s16(step2[1], step2[6]); + step1[7] = vqsubq_s16(step2[0], step2[7]); + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = vqaddq_s16(step2[16], step2[23]); + step1[17] = vqaddq_s16(step2[17], step2[22]); + step1[18] = vqaddq_s16(step2[18], step2[21]); + step1[19] = vqaddq_s16(step2[19], step2[20]); + step1[20] = vqsubq_s16(step2[19], step2[20]); + step1[21] = vqsubq_s16(step2[18], step2[21]); + step1[22] = vqsubq_s16(step2[17], step2[22]); + step1[23] = vqsubq_s16(step2[16], step2[23]); + step1[24] = vqsubq_s16(step2[31], step2[24]); + step1[25] = vqsubq_s16(step2[30], step2[25]); + step1[26] = vqsubq_s16(step2[29], step2[26]); + step1[27] = vqsubq_s16(step2[28], step2[27]); + step1[28] = vqaddq_s16(step2[27], step2[28]); + step1[29] = vqaddq_s16(step2[26], step2[29]); + step1[30] = vqaddq_s16(step2[25], step2[30]); + step1[31] = vqaddq_s16(step2[24], step2[31]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); + btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); + btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); + btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); + + step2[0] = vqaddq_s16(step1[0], step1[15]); + step2[1] = vqaddq_s16(step1[1], step1[14]); + step2[2] = vqaddq_s16(step1[2], step1[13]); + step2[3] = vqaddq_s16(step1[3], step1[12]); + step2[4] = vqaddq_s16(step1[4], step1[11]); + step2[5] = vqaddq_s16(step1[5], step1[10]); + step2[6] = vqaddq_s16(step1[6], step1[9]); + step2[7] = vqaddq_s16(step1[7], step1[8]); + step2[8] = vqsubq_s16(step1[7], step1[8]); + step2[9] = vqsubq_s16(step1[6], step1[9]); + step2[10] = vqsubq_s16(step1[5], step1[10]); + step2[11] = vqsubq_s16(step1[4], step1[11]); + step2[12] = vqsubq_s16(step1[3], step1[12]); + step2[13] = vqsubq_s16(step1[2], step1[13]); + step2[14] = vqsubq_s16(step1[1], step1[14]); + step2[15] = vqsubq_s16(step1[0], step1[15]); + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[18]; + step2[19] = step1[19]; + step2[28] = step1[28]; + step2[29] = step1[29]; + step2[30] = step1[30]; + step2[31] = step1[31]; + + // stage 9 + + out[0] = vqaddq_s16(step2[0], step2[31]); + out[1] = vqaddq_s16(step2[1], step2[30]); + out[2] = vqaddq_s16(step2[2], step2[29]); + out[3] = vqaddq_s16(step2[3], step2[28]); + out[4] = vqaddq_s16(step2[4], step2[27]); + out[5] = vqaddq_s16(step2[5], step2[26]); + out[6] = vqaddq_s16(step2[6], step2[25]); + out[7] = vqaddq_s16(step2[7], step2[24]); + out[8] = vqaddq_s16(step2[8], step2[23]); + out[9] = vqaddq_s16(step2[9], step2[22]); + out[10] = vqaddq_s16(step2[10], step2[21]); + out[11] = vqaddq_s16(step2[11], step2[20]); + out[12] = vqaddq_s16(step2[12], step2[19]); + out[13] = vqaddq_s16(step2[13], step2[18]); + out[14] = vqaddq_s16(step2[14], step2[17]); + out[15] = vqaddq_s16(step2[15], step2[16]); + out[16] = vqsubq_s16(step2[15], step2[16]); + out[17] = 
vqsubq_s16(step2[14], step2[17]); + out[18] = vqsubq_s16(step2[13], step2[18]); + out[19] = vqsubq_s16(step2[12], step2[19]); + out[20] = vqsubq_s16(step2[11], step2[20]); + out[21] = vqsubq_s16(step2[10], step2[21]); + out[22] = vqsubq_s16(step2[9], step2[22]); + out[23] = vqsubq_s16(step2[8], step2[23]); + out[24] = vqsubq_s16(step2[7], step2[24]); + out[25] = vqsubq_s16(step2[6], step2[25]); + out[26] = vqsubq_s16(step2[5], step2[26]); + out[27] = vqsubq_s16(step2[4], step2[27]); + out[28] = vqsubq_s16(step2[3], step2[28]); + out[29] = vqsubq_s16(step2[2], step2[29]); + out[30] = vqsubq_s16(step2[1], step2[30]); + out[31] = vqsubq_s16(step2[0], step2[31]); +} +static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]); + btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]); + btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]); + btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]); + + step1[0] = vqaddq_s16(step2[0], step2[15]); + step1[1] = vqaddq_s16(step2[1], step2[14]); + step1[2] = vqaddq_s16(step2[2], step2[13]); + step1[3] = vqaddq_s16(step2[3], step2[12]); + step1[4] = vqaddq_s16(step2[4], step2[11]); + step1[5] = vqaddq_s16(step2[5], step2[10]); + step1[6] = vqaddq_s16(step2[6], step2[9]); + step1[7] = vqaddq_s16(step2[7], step2[8]); + step1[8] = vqsubq_s16(step2[7], step2[8]); + step1[9] = vqsubq_s16(step2[6], step2[9]); + step1[10] = vqsubq_s16(step2[5], step2[10]); + step1[11] = vqsubq_s16(step2[4], step2[11]); + step1[12] = vqsubq_s16(step2[3], step2[12]); + step1[13] = vqsubq_s16(step2[2], step2[13]); + step1[14] = vqsubq_s16(step2[1], step2[14]); + step1[15] = vqsubq_s16(step2[0], step2[15]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[47]); + step1[33] = vqaddq_s16(step2[33], step2[46]); + step1[34] = vqaddq_s16(step2[34], step2[45]); + step1[35] = vqaddq_s16(step2[35], step2[44]); + step1[36] = vqaddq_s16(step2[36], step2[43]); + step1[37] = vqaddq_s16(step2[37], step2[42]); + step1[38] = vqaddq_s16(step2[38], step2[41]); + step1[39] = vqaddq_s16(step2[39], step2[40]); + step1[40] = vqsubq_s16(step2[39], step2[40]); + step1[41] = vqsubq_s16(step2[38], step2[41]); + step1[42] = vqsubq_s16(step2[37], step2[42]); + step1[43] = vqsubq_s16(step2[36], step2[43]); + step1[44] = vqsubq_s16(step2[35], step2[44]); + step1[45] = vqsubq_s16(step2[34], step2[45]); + step1[46] = vqsubq_s16(step2[33], step2[46]); + step1[47] = vqsubq_s16(step2[32], step2[47]); + step1[48] = vqsubq_s16(step2[63], step2[48]); + step1[49] = vqsubq_s16(step2[62], step2[49]); + step1[50] = vqsubq_s16(step2[61], step2[50]); + step1[51] = vqsubq_s16(step2[60], step2[51]); + step1[52] = vqsubq_s16(step2[59], step2[52]); + step1[53] = vqsubq_s16(step2[58], step2[53]); + step1[54] = vqsubq_s16(step2[57], step2[54]); + step1[55] = vqsubq_s16(step2[56], step2[55]); + step1[56] = vqaddq_s16(step2[56], step2[55]); + step1[57] = vqaddq_s16(step2[57], step2[54]); + step1[58] = vqaddq_s16(step2[58], step2[53]); + step1[59] = vqaddq_s16(step2[59], step2[52]); + step1[60] = vqaddq_s16(step2[60], 
step2[51]); + step1[61] = vqaddq_s16(step2[61], step2[50]); + step1[62] = vqaddq_s16(step2[62], step2[49]); + step1[63] = vqaddq_s16(step2[63], step2[48]); +} + +static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + + btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]); + btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]); + btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]); + btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]); + btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]); + btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]); + btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]); + btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]); + + step2[0] = vqaddq_s16(step1[0], step1[31]); + step2[1] = vqaddq_s16(step1[1], step1[30]); + step2[2] = vqaddq_s16(step1[2], step1[29]); + step2[3] = vqaddq_s16(step1[3], step1[28]); + step2[4] = vqaddq_s16(step1[4], step1[27]); + step2[5] = vqaddq_s16(step1[5], step1[26]); + step2[6] = vqaddq_s16(step1[6], step1[25]); + step2[7] = vqaddq_s16(step1[7], step1[24]); + step2[8] = vqaddq_s16(step1[8], step1[23]); + step2[9] = vqaddq_s16(step1[9], step1[22]); + step2[10] = vqaddq_s16(step1[10], step1[21]); + step2[11] = vqaddq_s16(step1[11], step1[20]); + step2[12] = vqaddq_s16(step1[12], step1[19]); + step2[13] = vqaddq_s16(step1[13], step1[18]); + step2[14] = vqaddq_s16(step1[14], step1[17]); + step2[15] = vqaddq_s16(step1[15], step1[16]); + step2[16] = vqsubq_s16(step1[15], step1[16]); + step2[17] = vqsubq_s16(step1[14], step1[17]); + step2[18] = vqsubq_s16(step1[13], step1[18]); + step2[19] = vqsubq_s16(step1[12], step1[19]); + step2[20] = vqsubq_s16(step1[11], step1[20]); + step2[21] = vqsubq_s16(step1[10], step1[21]); + step2[22] = vqsubq_s16(step1[9], step1[22]); + step2[23] = vqsubq_s16(step1[8], step1[23]); + step2[24] = vqsubq_s16(step1[7], step1[24]); + step2[25] = vqsubq_s16(step1[6], step1[25]); + step2[26] = vqsubq_s16(step1[5], step1[26]); + step2[27] = vqsubq_s16(step1[4], step1[27]); + step2[28] = vqsubq_s16(step1[3], step1[28]); + step2[29] = vqsubq_s16(step1[2], step1[29]); + step2[30] = vqsubq_s16(step1[1], step1[30]); + step2[31] = vqsubq_s16(step1[0], step1[31]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[37] = step1[37]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[58] = step1[58]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; +} + +static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], 
(int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), + (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c7 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[2] = in[16]; + step2[4] = in[8]; + step2[6] = in[24]; + step2[8] = in[4]; + step2[10] = in[20]; + step2[12] = in[12]; + step2[14] = in[28]; + step2[16] = in[2]; + step2[18] = in[18]; + step2[20] = in[10]; + step2[22] = in[26]; + step2[24] = in[6]; + step2[26] = in[22]; + step2[28] = in[14]; + step2[30] = in[30]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]); + btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]); + btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); + btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); + btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]); + btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]); + btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]); + btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); + btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); + btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]); + btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = step2[4]; + step1[6] = step2[6]; + step1[8] = step2[8]; + step1[10] = step2[10]; + step1[12] = step2[12]; + step1[14] = step2[14]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]); + btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]); + btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); + btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); + btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]); + btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + + step1[32] = vqaddq_s16(step2[32], step2[33]); + step1[33] = vqsubq_s16(step2[32], step2[33]); + step1[34] = vqsubq_s16(step2[35], step2[34]); + step1[35] = vqaddq_s16(step2[35], step2[34]); + step1[36] = vqaddq_s16(step2[36], step2[37]); + step1[37] = vqsubq_s16(step2[36], step2[37]); + step1[38] = vqsubq_s16(step2[39], step2[38]); + step1[39] = vqaddq_s16(step2[39], step2[38]); + step1[40] = vqaddq_s16(step2[40], step2[41]); + step1[41] = vqsubq_s16(step2[40], step2[41]); + step1[42] = vqsubq_s16(step2[43], step2[42]); + step1[43] = vqaddq_s16(step2[43], step2[42]); + step1[44] = vqaddq_s16(step2[44], step2[45]); + step1[45] = vqsubq_s16(step2[44], step2[45]); + step1[46] = 
vqsubq_s16(step2[47], step2[46]); + step1[47] = vqaddq_s16(step2[47], step2[46]); + step1[48] = vqaddq_s16(step2[48], step2[49]); + step1[49] = vqsubq_s16(step2[48], step2[49]); + step1[50] = vqsubq_s16(step2[51], step2[50]); + step1[51] = vqaddq_s16(step2[51], step2[50]); + step1[52] = vqaddq_s16(step2[52], step2[53]); + step1[53] = vqsubq_s16(step2[52], step2[53]); + step1[54] = vqsubq_s16(step2[55], step2[54]); + step1[55] = vqaddq_s16(step2[55], step2[54]); + step1[56] = vqaddq_s16(step2[56], step2[57]); + step1[57] = vqsubq_s16(step2[56], step2[57]); + step1[58] = vqsubq_s16(step2[59], step2[58]); + step1[59] = vqaddq_s16(step2[59], step2[58]); + step1[60] = vqaddq_s16(step2[60], step2[61]); + step1[61] = vqsubq_s16(step2[60], step2[61]); + step1[62] = vqsubq_s16(step2[63], step2[62]); + step1[63] = vqaddq_s16(step2[63], step2[62]); + + // stage 4 + + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]); + btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]); + btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); + btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); + btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); + btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); + btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); + + step2[16] = vqaddq_s16(step1[16], step1[17]); + step2[17] = vqsubq_s16(step1[16], step1[17]); + step2[18] = vqsubq_s16(step1[19], step1[18]); + step2[19] = vqaddq_s16(step1[19], step1[18]); + step2[20] = vqaddq_s16(step1[20], step1[21]); + step2[21] = vqsubq_s16(step1[20], step1[21]); + step2[22] = vqsubq_s16(step1[23], step1[22]); + step2[23] = vqaddq_s16(step1[23], step1[22]); + step2[24] = vqaddq_s16(step1[24], step1[25]); + step2[25] = vqsubq_s16(step1[24], step1[25]); + step2[26] = vqsubq_s16(step1[27], step1[26]); + step2[27] = vqaddq_s16(step1[27], step1[26]); + step2[28] = vqaddq_s16(step1[28], step1[29]); + step2[29] = vqsubq_s16(step1[28], step1[29]); + step2[30] = vqsubq_s16(step1[31], step1[30]); + step2[31] = vqaddq_s16(step1[31], step1[30]); + step2[32] = step1[32]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[43] = step1[43]; + step2[44] = step1[44]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[51] = step1[51]; + step2[52] = step1[52]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + step1[2] = step2[2]; + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); + btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); + btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); + + step1[8] = vqaddq_s16(step2[8], step2[9]); 
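+  /* Note: each stage ping-pongs between the step1[] and step2[] scratch
+   * arrays.  vqaddq_s16/vqsubq_s16 are saturating adds/subs, so 16-bit
+   * intermediates clamp rather than wrap on overflow.  The
+   * btf_16_lane_x_y_neon helpers apply a butterfly rotation, roughly
+   *   out0 = (in0 * c[x] + in1 * c[y] + rnd) >> INV_COS_BIT
+   *   out1 = (in0 * c[y] - in1 * c[x] + rnd) >> INV_COS_BIT
+   * where c is an int16x4_t of preloaded cospi constants,
+   * cospi[i] ~= cos(i * PI / 128) * (1 << cos_bit). */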
+ step1[9] = vqsubq_s16(step2[8], step2[9]); + step1[10] = vqsubq_s16(step2[11], step2[10]); + step1[11] = vqaddq_s16(step2[11], step2[10]); + step1[12] = vqaddq_s16(step2[12], step2[13]); + step1[13] = vqsubq_s16(step2[12], step2[13]); + step1[14] = vqsubq_s16(step2[15], step2[14]); + step1[15] = vqaddq_s16(step2[15], step2[14]); + step1[16] = step2[16]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[35]); + step1[33] = vqaddq_s16(step2[33], step2[34]); + step1[34] = vqsubq_s16(step2[33], step2[34]); + step1[35] = vqsubq_s16(step2[32], step2[35]); + step1[36] = vqsubq_s16(step2[39], step2[36]); + step1[37] = vqsubq_s16(step2[38], step2[37]); + step1[38] = vqaddq_s16(step2[38], step2[37]); + step1[39] = vqaddq_s16(step2[39], step2[36]); + step1[40] = vqaddq_s16(step2[40], step2[43]); + step1[41] = vqaddq_s16(step2[41], step2[42]); + step1[42] = vqsubq_s16(step2[41], step2[42]); + step1[43] = vqsubq_s16(step2[40], step2[43]); + step1[44] = vqsubq_s16(step2[47], step2[44]); + step1[45] = vqsubq_s16(step2[46], step2[45]); + step1[46] = vqaddq_s16(step2[46], step2[45]); + step1[47] = vqaddq_s16(step2[47], step2[44]); + step1[48] = vqaddq_s16(step2[48], step2[51]); + step1[49] = vqaddq_s16(step2[49], step2[50]); + step1[50] = vqsubq_s16(step2[49], step2[50]); + step1[51] = vqsubq_s16(step2[48], step2[51]); + step1[52] = vqsubq_s16(step2[55], step2[52]); + step1[53] = vqsubq_s16(step2[54], step2[53]); + step1[54] = vqaddq_s16(step2[54], step2[53]); + step1[55] = vqaddq_s16(step2[55], step2[52]); + step1[56] = vqaddq_s16(step2[56], step2[59]); + step1[57] = vqaddq_s16(step2[57], step2[58]); + step1[58] = vqsubq_s16(step2[57], step2[58]); + step1[59] = vqsubq_s16(step2[56], step2[59]); + step1[60] = vqsubq_s16(step2[63], step2[60]); + step1[61] = vqsubq_s16(step2[62], step2[61]); + step1[62] = vqaddq_s16(step2[62], step2[61]); + step1[63] = vqaddq_s16(step2[63], step2[60]); + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); + + step2[4] = vqaddq_s16(step1[4], step1[5]); + step2[5] = vqsubq_s16(step1[4], step1[5]); + step2[6] = vqsubq_s16(step1[7], step1[6]); + step2[7] = vqaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[19]); + step2[17] = vqaddq_s16(step1[17], step1[18]); + step2[18] = vqsubq_s16(step1[17], step1[18]); + step2[19] = vqsubq_s16(step1[16], step1[19]); + step2[20] = vqsubq_s16(step1[23], step1[20]); + step2[21] = vqsubq_s16(step1[22], step1[21]); + step2[22] = 
vqaddq_s16(step1[22], step1[21]); + step2[23] = vqaddq_s16(step1[23], step1[20]); + step2[24] = vqaddq_s16(step1[24], step1[27]); + step2[25] = vqaddq_s16(step1[25], step1[26]); + step2[26] = vqsubq_s16(step1[25], step1[26]); + step2[27] = vqsubq_s16(step1[24], step1[27]); + step2[28] = vqsubq_s16(step1[31], step1[28]); + step2[29] = vqsubq_s16(step1[30], step1[29]); + step2[30] = vqaddq_s16(step1[30], step1[29]); + step2[31] = vqaddq_s16(step1[31], step1[28]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); + + step1[0] = vqaddq_s16(step2[0], step2[3]); + step1[1] = vqaddq_s16(step2[1], step2[2]); + step1[2] = vqsubq_s16(step2[1], step2[2]); + step1[3] = vqsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] 
= vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + 
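/* Stage 11 mirrors the two halves: out[i] = step2[i] + step2[63 - i] and out[63 - i] = step2[i] - step2[63 - i], both saturating. */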
out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step1; + int32x4_t t32[2]; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + + t32[0] = vmull_n_s16(vget_low_s16(input[0]), cospi[32]); + t32[1] = vmull_n_s16(vget_high_s16(input[0]), cospi[32]); + + step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), + vrshrn_n_s32(t32[1], INV_COS_BIT)); + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + out[0] = step1; + out[1] = step1; + out[2] = step1; + out[3] = step1; + out[4] = step1; + out[5] = step1; + out[6] = step1; + out[7] = step1; + out[8] = step1; + out[9] = step1; + out[10] = step1; + out[11] = step1; + out[12] = step1; + out[13] = step1; + out[14] = step1; + out[15] = step1; + out[16] = step1; + out[17] = step1; + out[18] = step1; + out[19] = step1; + out[20] = step1; + out[21] = step1; + out[22] = step1; + out[23] = step1; + out[24] = step1; + out[25] = step1; + out[26] = step1; + out[27] = step1; + out[28] = step1; + out[29] = step1; + out[30] = step1; + out[31] = step1; + out[32] = step1; + out[33] = step1; + out[34] = step1; + out[35] = step1; + out[36] = step1; + out[37] = step1; + out[38] = step1; + out[39] = step1; + out[40] = step1; + out[41] = step1; + out[42] = step1; + out[43] = step1; + out[44] = step1; + out[45] = step1; + out[46] = step1; + out[47] = step1; + out[48] = step1; + out[49] = step1; + out[50] = step1; + out[51] = step1; + out[52] = step1; + out[53] = step1; + out[54] = step1; + out[55] = step1; + out[56] = 
step1; + out[57] = step1; + out[58] = step1; + out[59] = step1; + out[60] = step1; + out[61] = step1; + out[62] = step1; + out[63] = step1; +} + +static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[8] = in[4]; + step2[16] = in[2]; + step2[24] = in[6]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[8] = step2[8]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + + step1[32] = step2[32]; + step1[33] = step2[32]; + step1[38] = step2[39]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[40]; + step1[46] = step2[47]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[48]; + step1[54] = step2[55]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[56]; + step1[62] = step2[63]; + step1[63] = step2[63]; + + // stage 4 + + step2[0] = step1[0]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]); + + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[22] = step1[23]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[30] = step1[31]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]); + + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + step1[16] = step2[16]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[31] = step2[31]; + step1[32] = step2[32]; + step1[33] = step2[33]; + step1[34] = step2[33]; + step1[35] = step2[32]; + step1[36] = step2[39]; + step1[37] = 
step2[38]; + step1[38] = step2[38]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[41]; + step1[42] = step2[41]; + step1[43] = step2[40]; + step1[44] = step2[47]; + step1[45] = step2[46]; + step1[46] = step2[46]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[49]; + step1[50] = step2[49]; + step1[51] = step2[48]; + step1[52] = step2[55]; + step1[53] = step2[54]; + step1[54] = step2[54]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[57]; + step1[58] = step2[57]; + step1[59] = step2[56]; + step1[60] = step2[63]; + step1[61] = step2[62]; + step1[62] = step2[62]; + step1[63] = step2[63]; + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + step2[16] = step1[16]; + step2[17] = step1[17]; + step2[18] = step1[17]; + step2[19] = step1[16]; + step2[20] = step1[23]; + step2[21] = step1[22]; + step2[22] = step1[22]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[25]; + step2[26] = step1[25]; + step2[27] = step1[24]; + step2[28] = step1[31]; + step2[29] = step1[30]; + step2[30] = step1[30]; + step2[31] = step1[31]; + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[8] = step2[8]; + step1[9] = step2[9]; + step1[10] = step2[9]; + step1[11] = step2[8]; + step1[12] = step2[15]; + step1[13] = step2[14]; + step1[14] = step2[14]; + step1[15] = step2[15]; + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] 
= vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]); + + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[3]; + step2[5] = step1[2]; + step2[6] = step1[1]; + step2[7] = step1[0]; + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], 
step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out, + int8_t cos_bit, int bit) { + (void)bit; + const int32_t *cospi = cospi_arr(cos_bit); + int16x8_t step2[64], step1[64]; + + const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], + (int16_t)cospi[36], (int16_t)cospi[28]); + const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], + (int16_t)cospi[52], (int16_t)cospi[12]); + const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], + (int16_t)cospi[40], (int16_t)cospi[24]); + const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], + (int16_t)cospi[16], (int16_t)cospi[48]); + const int16x4_t c4 = + set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), + (int16_t)(-cospi[36]), 
(int16_t)(-cospi[28])); + const int16x4_t c5 = + set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), + (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); + const int16x4_t c6 = + set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), + (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); + const int16x4_t c7 = + set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), + (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); + + // stage 1 + // stage 2 + + step2[0] = in[0]; + step2[4] = in[8]; + step2[8] = in[4]; + step2[12] = in[12]; + step2[16] = in[2]; + step2[20] = in[10]; + step2[24] = in[6]; + step2[28] = in[14]; + + btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); + btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); + btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); + btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); + btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); + btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); + btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); + btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); + + // stage 3 + + step1[0] = step2[0]; + step1[4] = step2[4]; + step1[8] = step2[8]; + step1[12] = step2[12]; + + btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); + btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); + btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); + btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); + + step1[32] = step2[32]; + step1[33] = step2[32]; + step1[34] = step2[35]; + step1[35] = step2[35]; + step1[36] = step2[36]; + step1[37] = step2[36]; + step1[38] = step2[39]; + step1[39] = step2[39]; + step1[40] = step2[40]; + step1[41] = step2[40]; + step1[42] = step2[43]; + step1[43] = step2[43]; + step1[44] = step2[44]; + step1[45] = step2[44]; + step1[46] = step2[47]; + step1[47] = step2[47]; + step1[48] = step2[48]; + step1[49] = step2[48]; + step1[50] = step2[51]; + step1[51] = step2[51]; + step1[52] = step2[52]; + step1[53] = step2[52]; + step1[54] = step2[55]; + step1[55] = step2[55]; + step1[56] = step2[56]; + step1[57] = step2[56]; + step1[58] = step2[59]; + step1[59] = step2[59]; + step1[60] = step2[60]; + step1[61] = step2[60]; + step1[62] = step2[63]; + step1[63] = step2[63]; + + // stage 4 + + step2[0] = step1[0]; + step2[4] = step1[4]; + + btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); + btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); + btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); + btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); + btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); + btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); + btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); + btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); + btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); + btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); + + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + 
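/* In this eob-limited path one operand of each stage-3/4 add-sub pair derives from coefficients known to be zero, so the pair collapses to plain copies (e.g. step2[30] = step1[31]). */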
step2[31] = step1[31]; + step2[32] = step1[32]; + step2[35] = step1[35]; + step2[36] = step1[36]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[43] = step1[43]; + step2[44] = step1[44]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[51] = step1[51]; + step2[52] = step1[52]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[59] = step1[59]; + step2[60] = step1[60]; + step2[63] = step1[63]; + + // stage 5 + + step1[0] = step2[0]; + + btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); + btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); + btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); + btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); + btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); + + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[16] = step2[16]; + step1[19] = step2[19]; + step1[20] = step2[20]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[35]); + step1[33] = vqaddq_s16(step2[33], step2[34]); + step1[34] = vqsubq_s16(step2[33], step2[34]); + step1[35] = vqsubq_s16(step2[32], step2[35]); + step1[36] = vqsubq_s16(step2[39], step2[36]); + step1[37] = vqsubq_s16(step2[38], step2[37]); + step1[38] = vqaddq_s16(step2[38], step2[37]); + step1[39] = vqaddq_s16(step2[39], step2[36]); + step1[40] = vqaddq_s16(step2[40], step2[43]); + step1[41] = vqaddq_s16(step2[41], step2[42]); + step1[42] = vqsubq_s16(step2[41], step2[42]); + step1[43] = vqsubq_s16(step2[40], step2[43]); + step1[44] = vqsubq_s16(step2[47], step2[44]); + step1[45] = vqsubq_s16(step2[46], step2[45]); + step1[46] = vqaddq_s16(step2[46], step2[45]); + step1[47] = vqaddq_s16(step2[47], step2[44]); + step1[48] = vqaddq_s16(step2[48], step2[51]); + step1[49] = vqaddq_s16(step2[49], step2[50]); + step1[50] = vqsubq_s16(step2[49], step2[50]); + step1[51] = vqsubq_s16(step2[48], step2[51]); + step1[52] = vqsubq_s16(step2[55], step2[52]); + step1[53] = vqsubq_s16(step2[54], step2[53]); + step1[54] = vqaddq_s16(step2[54], step2[53]); + step1[55] = vqaddq_s16(step2[55], step2[52]); + step1[56] = vqaddq_s16(step2[56], step2[59]); + step1[57] = vqaddq_s16(step2[57], step2[58]); + step1[58] = vqsubq_s16(step2[57], step2[58]); + step1[59] = vqsubq_s16(step2[56], step2[59]); + step1[60] = vqsubq_s16(step2[63], step2[60]); + step1[61] = vqsubq_s16(step2[62], step2[61]); + step1[62] = vqaddq_s16(step2[62], step2[61]); + step1[63] = vqaddq_s16(step2[63], step2[60]); + + // stage 6 + + btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); + btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); + btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); + btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); + btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); + btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); + btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); + btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); + btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); + btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); + btf_16_lane_3_2_neon(step1[45], 
step1[50], c6, &step2[45], &step2[50]); + + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[19]); + step2[17] = vqaddq_s16(step1[17], step1[18]); + step2[18] = vqsubq_s16(step1[17], step1[18]); + step2[19] = vqsubq_s16(step1[16], step1[19]); + step2[20] = vqsubq_s16(step1[23], step1[20]); + step2[21] = vqsubq_s16(step1[22], step1[21]); + step2[22] = vqaddq_s16(step1[22], step1[21]); + step2[23] = vqaddq_s16(step1[23], step1[20]); + step2[24] = vqaddq_s16(step1[24], step1[27]); + step2[25] = vqaddq_s16(step1[25], step1[26]); + step2[26] = vqsubq_s16(step1[25], step1[26]); + step2[27] = vqsubq_s16(step1[24], step1[27]); + step2[28] = vqsubq_s16(step1[31], step1[28]); + step2[29] = vqsubq_s16(step1[30], step1[29]); + step2[30] = vqaddq_s16(step1[30], step1[29]); + step2[31] = vqaddq_s16(step1[31], step1[28]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[38] = step1[38]; + step2[39] = step1[39]; + step2[40] = step1[40]; + step2[41] = step1[41]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[54] = step1[54]; + step2[55] = step1[55]; + step2[56] = step1[56]; + step2[57] = step1[57]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 7 + + btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); + btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); + btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); + btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); + btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); + + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + step1[7] = step2[7]; + step1[8] = vqaddq_s16(step2[8], step2[11]); + step1[9] = vqaddq_s16(step2[9], step2[10]); + step1[10] = vqsubq_s16(step2[9], step2[10]); + step1[11] = vqsubq_s16(step2[8], step2[11]); + step1[12] = vqsubq_s16(step2[15], step2[12]); + step1[13] = vqsubq_s16(step2[14], step2[13]); + step1[14] = vqaddq_s16(step2[14], step2[13]); + step1[15] = vqaddq_s16(step2[15], step2[12]); + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + step1[32] = vqaddq_s16(step2[32], step2[39]); + step1[33] = vqaddq_s16(step2[33], step2[38]); + step1[34] = vqaddq_s16(step2[34], step2[37]); + step1[35] = vqaddq_s16(step2[35], step2[36]); + step1[36] = vqsubq_s16(step2[35], step2[36]); + step1[37] = vqsubq_s16(step2[34], step2[37]); + step1[38] = vqsubq_s16(step2[33], step2[38]); + step1[39] = vqsubq_s16(step2[32], step2[39]); + step1[40] = vqsubq_s16(step2[47], step2[40]); + step1[41] = vqsubq_s16(step2[46], step2[41]); + step1[42] = vqsubq_s16(step2[45], step2[42]); + step1[43] = vqsubq_s16(step2[44], step2[43]); + step1[44] = vqaddq_s16(step2[43], step2[44]); + step1[45] = vqaddq_s16(step2[42], step2[45]); + step1[46] = vqaddq_s16(step2[41], step2[46]); + step1[47] = vqaddq_s16(step2[40], step2[47]); + step1[48] = vqaddq_s16(step2[48], step2[55]); + step1[49] = vqaddq_s16(step2[49], step2[54]); + step1[50] = vqaddq_s16(step2[50], step2[53]); + step1[51] = vqaddq_s16(step2[51], step2[52]); + step1[52] = vqsubq_s16(step2[51], step2[52]); + step1[53] = vqsubq_s16(step2[50], 
step2[53]); + step1[54] = vqsubq_s16(step2[49], step2[54]); + step1[55] = vqsubq_s16(step2[48], step2[55]); + step1[56] = vqsubq_s16(step2[63], step2[56]); + step1[57] = vqsubq_s16(step2[62], step2[57]); + step1[58] = vqsubq_s16(step2[61], step2[58]); + step1[59] = vqsubq_s16(step2[60], step2[59]); + step1[60] = vqaddq_s16(step2[59], step2[60]); + step1[61] = vqaddq_s16(step2[58], step2[61]); + step1[62] = vqaddq_s16(step2[57], step2[62]); + step1[63] = vqaddq_s16(step2[56], step2[63]); + + // stage 8 + + btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); + btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); + btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); + btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); + btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); + btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); + btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); + btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); + btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); + btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); + + step2[0] = vqaddq_s16(step1[0], step1[7]); + step2[1] = vqaddq_s16(step1[1], step1[6]); + step2[2] = vqaddq_s16(step1[2], step1[5]); + step2[3] = vqaddq_s16(step1[3], step1[4]); + step2[4] = vqsubq_s16(step1[3], step1[4]); + step2[5] = vqsubq_s16(step1[2], step1[5]); + step2[6] = vqsubq_s16(step1[1], step1[6]); + step2[7] = vqsubq_s16(step1[0], step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + step2[16] = vqaddq_s16(step1[16], step1[23]); + step2[17] = vqaddq_s16(step1[17], step1[22]); + step2[18] = vqaddq_s16(step1[18], step1[21]); + step2[19] = vqaddq_s16(step1[19], step1[20]); + step2[20] = vqsubq_s16(step1[19], step1[20]); + step2[21] = vqsubq_s16(step1[18], step1[21]); + step2[22] = vqsubq_s16(step1[17], step1[22]); + step2[23] = vqsubq_s16(step1[16], step1[23]); + step2[24] = vqsubq_s16(step1[31], step1[24]); + step2[25] = vqsubq_s16(step1[30], step1[25]); + step2[26] = vqsubq_s16(step1[29], step1[26]); + step2[27] = vqsubq_s16(step1[28], step1[27]); + step2[28] = vqaddq_s16(step1[28], step1[27]); + step2[29] = vqaddq_s16(step1[29], step1[26]); + step2[30] = vqaddq_s16(step1[30], step1[25]); + step2[31] = vqaddq_s16(step1[31], step1[24]); + step2[32] = step1[32]; + step2[33] = step1[33]; + step2[34] = step1[34]; + step2[35] = step1[35]; + step2[44] = step1[44]; + step2[45] = step1[45]; + step2[46] = step1[46]; + step2[47] = step1[47]; + step2[48] = step1[48]; + step2[49] = step1[49]; + step2[50] = step1[50]; + step2[51] = step1[51]; + step2[60] = step1[60]; + step2[61] = step1[61]; + step2[62] = step1[62]; + step2[63] = step1[63]; + + // stage 9 + idct64_stage9_neon(step2, step1, cos_bit); + + // stage 10 + idct64_stage10_neon(step1, step2, cos_bit); + + // stage 11 + + out[0] = vqaddq_s16(step2[0], step2[63]); + out[1] = vqaddq_s16(step2[1], step2[62]); + out[2] = vqaddq_s16(step2[2], step2[61]); + out[3] = vqaddq_s16(step2[3], step2[60]); + out[4] = vqaddq_s16(step2[4], step2[59]); + out[5] = vqaddq_s16(step2[5], step2[58]); + out[6] = vqaddq_s16(step2[6], step2[57]); + out[7] = vqaddq_s16(step2[7], step2[56]); + out[8] = vqaddq_s16(step2[8], step2[55]); + out[9] = vqaddq_s16(step2[9], step2[54]); + out[10] = vqaddq_s16(step2[10], step2[53]); + out[11] = vqaddq_s16(step2[11], step2[52]); + 
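/* From stage 9 onward the low8/low16/low32 paths run identical code, factored into idct64_stage9_neon/idct64_stage10_neon plus this shared stage-11 mirror; each int16x8_t processes 8 coefficients in parallel. */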
out[12] = vqaddq_s16(step2[12], step2[51]); + out[13] = vqaddq_s16(step2[13], step2[50]); + out[14] = vqaddq_s16(step2[14], step2[49]); + out[15] = vqaddq_s16(step2[15], step2[48]); + out[16] = vqaddq_s16(step2[16], step2[47]); + out[17] = vqaddq_s16(step2[17], step2[46]); + out[18] = vqaddq_s16(step2[18], step2[45]); + out[19] = vqaddq_s16(step2[19], step2[44]); + out[20] = vqaddq_s16(step2[20], step2[43]); + out[21] = vqaddq_s16(step2[21], step2[42]); + out[22] = vqaddq_s16(step2[22], step2[41]); + out[23] = vqaddq_s16(step2[23], step2[40]); + out[24] = vqaddq_s16(step2[24], step2[39]); + out[25] = vqaddq_s16(step2[25], step2[38]); + out[26] = vqaddq_s16(step2[26], step2[37]); + out[27] = vqaddq_s16(step2[27], step2[36]); + out[28] = vqaddq_s16(step2[28], step2[35]); + out[29] = vqaddq_s16(step2[29], step2[34]); + out[30] = vqaddq_s16(step2[30], step2[33]); + out[31] = vqaddq_s16(step2[31], step2[32]); + out[32] = vqsubq_s16(step2[31], step2[32]); + out[33] = vqsubq_s16(step2[30], step2[33]); + out[34] = vqsubq_s16(step2[29], step2[34]); + out[35] = vqsubq_s16(step2[28], step2[35]); + out[36] = vqsubq_s16(step2[27], step2[36]); + out[37] = vqsubq_s16(step2[26], step2[37]); + out[38] = vqsubq_s16(step2[25], step2[38]); + out[39] = vqsubq_s16(step2[24], step2[39]); + out[40] = vqsubq_s16(step2[23], step2[40]); + out[41] = vqsubq_s16(step2[22], step2[41]); + out[42] = vqsubq_s16(step2[21], step2[42]); + out[43] = vqsubq_s16(step2[20], step2[43]); + out[44] = vqsubq_s16(step2[19], step2[44]); + out[45] = vqsubq_s16(step2[18], step2[45]); + out[46] = vqsubq_s16(step2[17], step2[46]); + out[47] = vqsubq_s16(step2[16], step2[47]); + out[48] = vqsubq_s16(step2[15], step2[48]); + out[49] = vqsubq_s16(step2[14], step2[49]); + out[50] = vqsubq_s16(step2[13], step2[50]); + out[51] = vqsubq_s16(step2[12], step2[51]); + out[52] = vqsubq_s16(step2[11], step2[52]); + out[53] = vqsubq_s16(step2[10], step2[53]); + out[54] = vqsubq_s16(step2[9], step2[54]); + out[55] = vqsubq_s16(step2[8], step2[55]); + out[56] = vqsubq_s16(step2[7], step2[56]); + out[57] = vqsubq_s16(step2[6], step2[57]); + out[58] = vqsubq_s16(step2[5], step2[58]); + out[59] = vqsubq_s16(step2[4], step2[59]); + out[60] = vqsubq_s16(step2[3], step2[60]); + out[61] = vqsubq_s16(step2[2], step2[61]); + out[62] = vqsubq_s16(step2[1], step2[62]); + out[63] = vqsubq_s16(step2[0], step2[63]); +} + +// Functions for blocks with eob at DC and within +// topleft 8x8, 16x16, 32x32 corner +static const transform_neon + lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct8_low1_neon, idct8_neon, NULL, NULL }, + { iadst8_low1_neon, iadst8_neon, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL }, + { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon, + idct64_low32_neon }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)tx_type; + int16x8_t a[32 * 4]; + int16x8_t b[32 * 4]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t 
*shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int32_t *input_1; + int temp_b = 0; + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col], + txw_idx, txfm_size_col, -shift[0]); + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + for (int j = 0; j < buf_size_w_div8; ++j) { + identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], + txh_idx, txfm_size_row, -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon( + &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[16 * 2]; + int16x8_t b[16 * 2]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int32_t *input_1; + int temp_b = 0; + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + row_txfm(&a[i * txfm_size_col], 
&a[i * txfm_size_col], cos_bit_row, 0); + av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, + -shift[0]); + if (lr_flip == 1) { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + flip_buf_ud_neon(&a[k], 8); + transpose_s16_8x8q( + &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); + } + temp_b += 8; + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + } + for (int j = 0; j < buf_size_w_div8; ++j) { + identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], + txh_idx, txfm_size_row, -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon( + &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[16 * 2]; + int16x8_t b[16 * 2]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), + 0); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const int32_t *input_1; + int temp_b = 0; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (txfm_size_col * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], txfm_size_col); + } + identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col], + txw_idx, txfm_size_col, -shift[0]); + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, + 
uint8_t *output, int stride, + TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_4X4; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, bd + 8); + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_4X8; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, + 16, 16, 16, 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int j = 0; j < txfm_size_col; j++) + temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); + + row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if 
(lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, bd + 8); + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_8X4; + DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, + 16, 16, 16, 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + for (int j = 0; j < txfm_size_col; j++) + temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits); + + row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, bd + 8); + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_4X16; + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int 
txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, bd + 8); + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, int eob) { + (void)eob; + TX_SIZE tx_size = TX_16X4; + DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]); + int32_t *temp_in = txfm_buf; + + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16 }; + int r, bd = 8; + const transform_1d_neon row_txfm = + lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_neon col_txfm = + lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < txfm_size_row; i++) { + row_txfm(input, buf_ptr, cos_bit_row, stage_range); + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + for (int c = 0; c < txfm_size_col; ++c) { + if (lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, bd + 8); + col_txfm(temp_in, temp_out, cos_bit_col, stage_range); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + + if (ud_flip == 0) { + for (r = 0; r < 
txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_no_identity_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int16x8_t a[64 * 8]; + int16x8_t b[64 * 8]; + int eobx, eoby, ud_flip, lr_flip; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const int32_t *input_1; + int temp_b = 0; + + const transform_neon row_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_neon col_txfm = + lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + input_1 = input; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + load_buffer_32bit_to_16bit_neon(input_1, &a[k], input_stride); + transpose_s16_8x8q(&a[k], &a[k]); + input_1 += 8; + } + input += (input_stride * 8); + if (abs(rect_type) == 1) { + int y = i * txfm_size_col; + round_shift_for_rect(&a[y], &a[y], input_stride); + } + row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0); + av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col, + -shift[0]); + if (lr_flip == 1) { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + flip_buf_ud_neon(&a[k], 8); + transpose_s16_8x8q( + &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); + } + temp_b += 8; + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + int k = j * 8 + i * txfm_size_col; + transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]); + } + temp_b += 8; + } + } + for (int j = 0; j < buf_size_w_div8; ++j) { + col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0); + av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, + -shift[1]); + } + + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_universe_neon( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case IDTX: + lowbd_inv_txfm2d_add_idtx_neon(input, 
output, stride, tx_type, tx_size, + eob); + break; + + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + + default: + lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + switch (tx_size) { + case TX_4X4: + lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob); + break; + + case TX_4X8: + lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob); + break; + + case TX_8X4: + lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob); + break; + + case TX_4X16: + lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob); + break; + + case TX_16X4: + lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob); + break; + + default: + lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, + tx_size, eob); + break; + } +} +void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_TYPE tx_type = txfm_param->tx_type; + if (!txfm_param->lossless) { + av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.h b/libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.h new file mode 100644 index 000000000..9ec658291 --- /dev/null +++ b/libs/libaom/src/av1/common/arm/av1_inv_txfm_neon.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ +#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "av1/common/enums.h" +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/av1_txfm.h" + +typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output, + const int8_t cos_bit, + const int8_t *stage_ptr); +typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output, + int8_t cos_bit, int bit); + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x16_default[16]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x32_default[32]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, + 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = { + 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_16x32_default[32]) = { + 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, + 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x16_default[16]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, + 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { + 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, +}; + +DECLARE_ALIGNED(16, static const int16_t *, + av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { + NULL, + av1_eob_to_eobxy_8x8_default, + av1_eob_to_eobxy_16x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x16_default, + av1_eob_to_eobxy_16x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x32_default, + av1_eob_to_eobxy_32x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, +}; + +static const int lowbd_txfm_all_1d_zeros_idx[32] = { + 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +// Transform block width 
in log2 for eob (sizes of 64 map to 32) +static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { + 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, +}; + +static int eob_fill[32] = { + 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, +}; + +static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + if (eob == 1) { + *eobx = 0; + *eoby = 0; + return; + } + + const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; + const int eob_row = (eob - 1) >> tx_w_log2; + // Each table entry packs eoby into the high byte and eobx into the low byte. + const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; + *eobx = eobxy & 0xFF; + *eoby = eobxy >> 8; +} + +static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_row = tx_size_high[tx_size]; + const int eoby_max = AOMMIN(32, txfm_size_row) - 1; + *eobx = eob / (eoby_max + 1); + *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob]; +} + +static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_col = tx_size_wide[tx_size]; + const int eobx_max = AOMMIN(32, txfm_size_col) - 1; + *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob]; + const int temp_eoby = eob / (eobx_max + 1); + assert(temp_eoby < 32); + *eoby = eob_fill[temp_eoby]; +} + +#endif // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_ diff --git a/libs/libaom/src/av1/common/arm/av1_txfm_neon.c b/libs/libaom/src/av1/common/arm/av1_txfm_neon.c new file mode 100644 index 000000000..7e3a05ab7 --- /dev/null +++ b/libs/libaom/src/av1/common/arm/av1_txfm_neon.c @@ -0,0 +1,30 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <arm_neon.h> +#include <assert.h> + +#include "config/av1_rtcd.h" + +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" + +void av1_round_shift_array_neon(int32_t *arr, int size, int bit) { + assert(!(size % 4)); + if (!bit) return; + const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); + for (int i = 0; i < size; i += 4) { + int32x4_t tmp_q_s32 = vld1q_s32(arr); + tmp_q_s32 = vrshlq_s32(tmp_q_s32, dup_bits_n_32x4); + vst1q_s32(arr, tmp_q_s32); + arr += 4; + } +}
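+// Reference (a scalar sketch, not part of libaom): for bit > 0, vrshlq_s32
+// with a negated shift count performs a rounding shift right, so the loop
+// above is equivalent to
+//   for (int i = 0; i < size; ++i)
+//     arr[i] = (arr[i] + (1 << (bit - 1))) >> bit;
+// e.g. bit == 4 maps 23 to (23 + 8) >> 4 == 1. A negative bit instead
+// shifts left, where no rounding is needed.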
diff --git a/libs/libaom/src/av1/common/arm/blend_a64_hmask_neon.c b/libs/libaom/src/av1/common/arm/blend_a64_hmask_neon.c new file mode 100644 index 000000000..7134f183e --- /dev/null +++ b/libs/libaom/src/av1/common/arm/blend_a64_hmask_neon.c @@ -0,0 +1,134 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" +#include "aom_dsp/aom_dsp_common.h" +#include "config/aom_dsp_rtcd.h"
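+// A sketch of the arithmetic shared by this kernel and the one in
+// blend_a64_vmask_neon.c (assuming AOM_BLEND_A64_ROUND_BITS == 6 and a
+// maximum alpha of 64, per aom_dsp/blend.h): each output pixel is
+//   dst = (m * src0 + (64 - m) * src1 + 32) >> 6,
+// where vmull_u8/vmlal_u8 form the two products and vrshrn_n_u16 applies
+// the rounded shift. e.g. m == 40, src0 == 200, src1 == 100 gives
+// (8000 + 2400 + 32) >> 6 == 163.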
+ +void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 2); + assert(w >= 2); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + uint8x8_t tmp0, tmp1; + uint8x16_t res_q; + uint16x8_t res, res_low, res_high; + uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0); + uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0); + const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64); + + if (w >= 16) { + const uint8x16_t vdup_64_q = vdupq_n_u8((uint8_t)64); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __builtin_prefetch(src0); + __builtin_prefetch(src1); + const uint8x16_t tmp0_q = vld1q_u8(src0); + const uint8x16_t tmp1_q = vld1q_u8(src1); + const uint8x16_t m_q = vld1q_u8(mask); + const uint8x16_t max_minus_m_q = vsubq_u8(vdup_64_q, m_q); + res_low = vmull_u8(vget_low_u8(m_q), vget_low_u8(tmp0_q)); + res_low = + vmlal_u8(res_low, vget_low_u8(max_minus_m_q), vget_low_u8(tmp1_q)); + res_high = vmull_u8(vget_high_u8(m_q), vget_high_u8(tmp0_q)); + res_high = vmlal_u8(res_high, vget_high_u8(max_minus_m_q), + vget_high_u8(tmp1_q)); + res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS), + vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS)); + vst1q_u8(dst, res_q); + src0 += 16; + src1 += 16; + dst += 16; + mask += 16; + } + src0 += src0_stride - w; + src1 += src1_stride - w; + dst += dst_stride - w; + mask -= w; + } + } else if (w == 8) { + const uint8x8_t m = vld1_u8(mask); + const uint8x8_t max_minus_m = vsub_u8(vdup_64, m); + for (int i = 0; i < h; ++i) { + __builtin_prefetch(src0); + __builtin_prefetch(src1); + tmp0 = vld1_u8(src0); + tmp1 = vld1_u8(src1); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)); + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (w == 4) { + const uint8x8_t m = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)mask)); + const uint8x8_t max_minus_m = vsub_u8(vdup_64, m); + for (int i = 0; i < h; i += 2) { + __builtin_prefetch(src0 + 0 * src0_stride); + __builtin_prefetch(src0 + 1 * src0_stride); + __builtin_prefetch(src1 + 0 * src1_stride); + __builtin_prefetch(src1 + 1 * src1_stride); + load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32); + tmp0 = vreinterpret_u8_u32(tmp0_32); + load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32); + tmp1 = vreinterpret_u8_u32(tmp1_32); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_lane_u32( + (uint32_t *)(dst + (0 * dst_stride)), + vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0); + vst1_lane_u32( + (uint32_t *)(dst + (1 * dst_stride)), + vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1); + src0 += (2 * src0_stride); + src1 += (2 * src1_stride); + dst += (2 * dst_stride); + } + } else if (w == 2) { + const uint8x8_t m = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask)); + const uint8x8_t max_minus_m = vsub_u8(vdup_64, m); + for (int i = 0; i < h; i += 2) { + __builtin_prefetch(src0 + 0 * src0_stride); + __builtin_prefetch(src0 + 1 * src0_stride); + __builtin_prefetch(src1 + 0 * src1_stride); + __builtin_prefetch(src1 + 1 * src1_stride); + load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16); + tmp0 = vreinterpret_u8_u16(tmp0_16); + load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16); + tmp1 = vreinterpret_u8_u16(tmp1_16); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_lane_u16( + (uint16_t *)(dst + (0 * dst_stride)), + vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0); + vst1_lane_u16( + (uint16_t *)(dst + (1 * dst_stride)), + vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1); + src0 += (2 * src0_stride); + src1 += (2 * src1_stride); + dst += (2 * dst_stride); + } + } +} diff --git a/libs/libaom/src/av1/common/arm/blend_a64_vmask_neon.c b/libs/libaom/src/av1/common/arm/blend_a64_vmask_neon.c new file mode 100644 index 000000000..194e94c8c --- /dev/null +++ b/libs/libaom/src/av1/common/arm/blend_a64_vmask_neon.c @@ -0,0 +1,141 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" +#include "aom_dsp/aom_dsp_common.h" +#include "config/aom_dsp_rtcd.h" + +void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + uint8x8_t tmp0, tmp1; + uint8x16_t tmp0_q, tmp1_q, res_q; + uint16x8_t res, res_low, res_high; + uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0); + uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0); + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 2); + assert(w >= 2); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (w >= 16) { + for (int i = 0; i < h; ++i) { + const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]); + const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]); + for (int j = 0; j < w; j += 16) { + __builtin_prefetch(src0); + __builtin_prefetch(src1); + tmp0_q = vld1q_u8(src0); + tmp1_q = vld1q_u8(src1); + res_low = vmull_u8(m, vget_low_u8(tmp0_q)); + res_low = vmlal_u8(res_low, max_minus_m, vget_low_u8(tmp1_q)); + res_high = vmull_u8(m, vget_high_u8(tmp0_q)); + res_high = vmlal_u8(res_high, max_minus_m, vget_high_u8(tmp1_q)); + res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS), + vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS)); + vst1q_u8(dst, res_q); + src0 += 16; + src1 += 16; + dst += 16; + } + src0 += src0_stride - w; + src1 += src1_stride - w; + dst += dst_stride - w; + } + } else if (w == 8) { + for (int i = 0; i < h; ++i) { + __builtin_prefetch(src0); + __builtin_prefetch(src1); + const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]); + const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]); + tmp0 = vld1_u8(src0); + tmp1 = vld1_u8(src1); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, 
max_minus_m, tmp1); + vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)); + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (w == 4) { + for (int i = 0; i < h; i += 2) { + __builtin_prefetch(src0 + 0 * src0_stride); + __builtin_prefetch(src0 + 1 * src0_stride); + __builtin_prefetch(src1 + 0 * src1_stride); + __builtin_prefetch(src1 + 1 * src1_stride); + const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[i]); + const uint16x4_t m2 = vdup_n_u16((uint16_t)mask[i + 1]); + const uint8x8_t m = vmovn_u16(vcombine_u16(m1, m2)); + const uint16x4_t max_minus_m1 = vdup_n_u16(64 - (uint16_t)mask[i]); + const uint16x4_t max_minus_m2 = vdup_n_u16(64 - (uint16_t)mask[i + 1]); + const uint8x8_t max_minus_m = + vmovn_u16(vcombine_u16(max_minus_m1, max_minus_m2)); + load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32); + tmp0 = vreinterpret_u8_u32(tmp0_32); + load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32); + tmp1 = vreinterpret_u8_u32(tmp1_32); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_lane_u32( + (uint32_t *)(dst + (0 * dst_stride)), + vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0); + vst1_lane_u32( + (uint32_t *)(dst + (1 * dst_stride)), + vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1); + src0 += (2 * src0_stride); + src1 += (2 * src1_stride); + dst += (2 * dst_stride); + } + } else if (w == 2) { + for (int i = 0; i < h; i += 2) { + __builtin_prefetch(src0 + 0 * src0_stride); + __builtin_prefetch(src0 + 1 * src0_stride); + __builtin_prefetch(src1 + 0 * src1_stride); + __builtin_prefetch(src1 + 1 * src1_stride); + const uint8x8_t m1 = vdup_n_u8(mask[i]); + const uint8x8_t m2 = vdup_n_u8(mask[i + 1]); + const uint16x4x2_t m_trn = + vtrn_u16(vreinterpret_u16_u8(m1), vreinterpret_u16_u8(m2)); + const uint8x8_t m = vreinterpret_u8_u16(m_trn.val[0]); + const uint8x8_t max_minus_m1 = vdup_n_u8(64 - mask[i]); + const uint8x8_t max_minus_m2 = vdup_n_u8(64 - mask[i + 1]); + const uint16x4x2_t max_minus_m_trn = vtrn_u16( + vreinterpret_u16_u8(max_minus_m1), vreinterpret_u16_u8(max_minus_m2)); + const uint8x8_t max_minus_m = vreinterpret_u8_u16(max_minus_m_trn.val[0]); + load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16); + tmp0 = vreinterpret_u8_u16(tmp0_16); + load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16); + tmp1 = vreinterpret_u8_u16(tmp1_16); + res = vmull_u8(m, tmp0); + res = vmlal_u8(res, max_minus_m, tmp1); + vst1_lane_u16( + (uint16_t *)(dst + (0 * dst_stride)), + vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0); + vst1_lane_u16( + (uint16_t *)(dst + (1 * dst_stride)), + vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1); + src0 += (2 * src0_stride); + src1 += (2 * src1_stride); + dst += (2 * dst_stride); + } + } +} diff --git a/libs/libaom/src/av1/common/arm/cfl_neon.c b/libs/libaom/src/av1/common/arm/cfl_neon.c new file mode 100644 index 000000000..371be5f0e --- /dev/null +++ b/libs/libaom/src/av1/common/arm/cfl_neon.c @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <arm_neon.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +static INLINE void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset, + int16x8_t sub) { + vst1q_s16(dst + offset, + vsubq_s16(vreinterpretq_s16_u16(vld1q_u16(src + offset)), sub)); +} + +static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) { + return vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset)); +} + +// Load half of a vector and duplicate it in the other half. +static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) { + return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr)); +} + +// Store half of a vector. +static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) { + *((uint32_t *)ptr) = vreinterpret_u32_u16(val)[0]; +} + +// Store half of a vector. +static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) { + *((uint32_t *)ptr) = vreinterpret_u32_u8(val)[0]; +} + +static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); + const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride)); + vsth_u16(pred_buf_q3, vshl_n_u16(sum, 1)); + } else if (width == 8) { + const uint16x4_t top = vpaddl_u8(vld1_u8(input)); + const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride)); + vst1_u16(pred_buf_q3, vshl_n_u16(sum, 1)); + } else if (width == 16) { + const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); + const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride)); + vst1q_u16(pred_buf_q3, vshlq_n_u16(sum, 1)); + } else { + const uint8x8x4_t top = vld4_u8(input); + const uint8x8x4_t bot = vld4_u8(input + input_stride); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t top_0 = vaddl_u8(top.val[0], top.val[1]); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t bot_0 = vaddl_u8(bot.val[0], bot.val[1]); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t top_1 = vaddl_u8(top.val[2], top.val[3]); + // equivalent to a vpaddlq_u8 (because vld4q interleaves) + const uint16x8_t bot_1 = vaddl_u8(bot.val[2], bot.val[3]); + uint16x8x2_t sum; + sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1); + sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1); + vst2q_u16(pred_buf_q3, sum); + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input)); + vsth_u16(pred_buf_q3, vshl_n_u16(top, 2)); + } else if (width == 8) { + const uint16x4_t top = vpaddl_u8(vld1_u8(input)); + vst1_u16(pred_buf_q3, vshl_n_u16(top, 2)); + } else if (width == 16) { + const uint16x8_t top = vpaddlq_u8(vld1q_u8(input)); + vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 2)); + } else { + const uint8x8x4_t top = vld4_u8(input); + uint16x8x2_t sum; + // vaddl_u8 is equivalent to a vpaddlq_u8 (because vld4q interleaves) + sum.val[0] = vshlq_n_u16(vaddl_u8(top.val[0], 
top.val[1]), 2); + sum.val[1] = vshlq_n_u16(vaddl_u8(top.val[2], top.val[3]), 2); + vst2q_u16(pred_buf_q3, sum); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x8_t top = vshll_n_u8(vldh_dup_u8(input), 3); + vst1_u16(pred_buf_q3, vget_low_u16(top)); + } else if (width == 8) { + const uint16x8_t top = vshll_n_u8(vld1_u8(input), 3); + vst1q_u16(pred_buf_q3, top); + } else { + const uint8x16_t top = vld1q_u8(input); + vst1q_u16(pred_buf_q3, vshll_n_u8(vget_low_u8(top), 3)); + vst1q_u16(pred_buf_q3 + 8, vshll_n_u8(vget_high_u8(top), 3)); + if (width == 32) { + const uint8x16_t next_top = vld1q_u8(input + 16); + vst1q_u16(pred_buf_q3 + 16, vshll_n_u8(vget_low_u8(next_top), 3)); + vst1q_u16(pred_buf_q3 + 24, vshll_n_u8(vget_high_u8(next_top), 3)); + } + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +#if CONFIG_AV1_HIGHBITDEPTH +#ifndef __aarch64__ +uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) { + return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)), + vpadd_u16(vget_low_u16(b), vget_high_u16(b))); +} +#endif + +static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + const uint16x4_t bot = vld1_u16(input + input_stride); + const uint16x4_t sum = vadd_u16(top, bot); + const uint16x4_t hsum = vpadd_u16(sum, sum); + vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 1)); + } else if (width < 32) { + const uint16x8_t top = vld1q_u16(input); + const uint16x8_t bot = vld1q_u16(input + input_stride); + const uint16x8_t sum = vaddq_u16(top, bot); + if (width == 8) { + const uint16x4_t hsum = vget_low_u16(vpaddq_u16(sum, sum)); + vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 1)); + } else { + const uint16x8_t top_1 = vld1q_u16(input + 8); + const uint16x8_t bot_1 = vld1q_u16(input + 8 + input_stride); + const uint16x8_t sum_1 = vaddq_u16(top_1, bot_1); + const uint16x8_t hsum = vpaddq_u16(sum, sum_1); + vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 1)); + } + } else { + const uint16x8x4_t top = vld4q_u16(input); + const uint16x8x4_t bot = vld4q_u16(input + input_stride); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t top_0 = vaddq_u16(top.val[0], top.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t bot_0 = vaddq_u16(bot.val[0], bot.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t top_1 = vaddq_u16(top.val[2], top.val[3]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t bot_1 = vaddq_u16(bot.val[2], bot.val[3]); + uint16x8x2_t sum; + sum.val[0] = vshlq_n_u16(vaddq_u16(top_0, bot_0), 1); + sum.val[1] = vshlq_n_u16(vaddq_u16(top_1, bot_1), 1); + vst2q_u16(pred_buf_q3, sum); + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_422_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = 
vld1_u16(input); + const uint16x4_t hsum = vpadd_u16(top, top); + vsth_u16(pred_buf_q3, vshl_n_u16(hsum, 2)); + } else if (width == 8) { + const uint16x4x2_t top = vld2_u16(input); + // equivalent to a vpadd_u16 (because vld2 interleaves) + const uint16x4_t hsum = vadd_u16(top.val[0], top.val[1]); + vst1_u16(pred_buf_q3, vshl_n_u16(hsum, 2)); + } else if (width == 16) { + const uint16x8x2_t top = vld2q_u16(input); + // equivalent to a vpaddq_u16 (because vld2q interleaves) + const uint16x8_t hsum = vaddq_u16(top.val[0], top.val[1]); + vst1q_u16(pred_buf_q3, vshlq_n_u16(hsum, 2)); + } else { + const uint16x8x4_t top = vld4q_u16(input); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t hsum_0 = vaddq_u16(top.val[0], top.val[1]); + // equivalent to a vpaddq_u16 (because vld4q interleaves) + const uint16x8_t hsum_1 = vaddq_u16(top.val[2], top.val[3]); + uint16x8x2_t result = { { vshlq_n_u16(hsum_0, 2), + vshlq_n_u16(hsum_1, 2) } }; + vst2q_u16(pred_buf_q3, result); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const uint16x4_t top = vld1_u16(input); + vst1_u16(pred_buf_q3, vshl_n_u16(top, 3)); + } else if (width == 8) { + const uint16x8_t top = vld1q_u16(input); + vst1q_u16(pred_buf_q3, vshlq_n_u16(top, 3)); + } else if (width == 16) { + uint16x8x2_t top = vld2q_u16(input); + top.val[0] = vshlq_n_u16(top.val[0], 3); + top.val[1] = vshlq_n_u16(top.val[1], 3); + vst2q_u16(pred_buf_q3, top); + } else { + uint16x8x4_t top = vld4q_u16(input); + top.val[0] = vshlq_n_u16(top.val[0], 3); + top.val[1] = vshlq_n_u16(top.val[1], 3); + top.val[2] = vshlq_n_u16(top.val[2], 3); + top.val[3] = vshlq_n_u16(top.val[3], 3); + vst4q_u16(pred_buf_q3, top); + } + input += input_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +CFL_GET_SUBSAMPLE_FUNCTION(neon) + +static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst, + int width, int height, + int round_offset, + const int num_pel_log2) { + const uint16_t *const end = src + height * CFL_BUF_LINE; + + // Round offset is not needed, because NEON will handle the rounding. + (void)round_offset; + + // To optimize the use of the CPU pipeline, we process 4 rows per iteration. + const int step = 4 * CFL_BUF_LINE; + + // At this stage, the prediction buffer contains scaled reconstructed luma + // pixels, which are positive integers and only require 15 bits. By using + // an unsigned integer for the sum, we can do one addition operation inside + // 16 bits (8 lanes) before having to convert to 32 bits (4 lanes). + const uint16_t *sum_buf = src; + uint32x4_t sum_32x4 = { 0, 0, 0, 0 }; + do { + // For all widths, we load, add and combine the data so it fits in 4 lanes.
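+    // A plain-C reference for this summation (a sketch, not the actual
+    // implementation):
+    //   uint32_t sum = 0;
+    //   for (int r = 0; r < height; ++r)
+    //     for (int c = 0; c < width; ++c) sum += src[r * CFL_BUF_LINE + c];
+    // The branches below compute the same total 4, 8, 16 or 32 pixels at a
+    // time, widening into the four 32-bit lanes of sum_32x4.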
+ if (width == 4) { + const uint16x4_t a0 = + vadd_u16(vld1_u16(sum_buf), vld1_u16(sum_buf + CFL_BUF_LINE)); + const uint16x4_t a1 = vadd_u16(vld1_u16(sum_buf + 2 * CFL_BUF_LINE), + vld1_u16(sum_buf + 3 * CFL_BUF_LINE)); + sum_32x4 = vaddq_u32(sum_32x4, vaddl_u16(a0, a1)); + } else if (width == 8) { + const uint16x8_t a0 = vldaddq_u16(sum_buf, CFL_BUF_LINE); + const uint16x8_t a1 = + vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, CFL_BUF_LINE); + sum_32x4 = vpadalq_u16(sum_32x4, a0); + sum_32x4 = vpadalq_u16(sum_32x4, a1); + } else { + const uint16x8_t row0 = vldaddq_u16(sum_buf, 8); + const uint16x8_t row1 = vldaddq_u16(sum_buf + CFL_BUF_LINE, 8); + const uint16x8_t row2 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, 8); + const uint16x8_t row3 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE, 8); + sum_32x4 = vpadalq_u16(sum_32x4, row0); + sum_32x4 = vpadalq_u16(sum_32x4, row1); + sum_32x4 = vpadalq_u16(sum_32x4, row2); + sum_32x4 = vpadalq_u16(sum_32x4, row3); + + if (width == 32) { + const uint16x8_t row0_1 = vldaddq_u16(sum_buf + 16, 8); + const uint16x8_t row1_1 = vldaddq_u16(sum_buf + CFL_BUF_LINE + 16, 8); + const uint16x8_t row2_1 = + vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE + 16, 8); + const uint16x8_t row3_1 = + vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE + 16, 8); + + sum_32x4 = vpadalq_u16(sum_32x4, row0_1); + sum_32x4 = vpadalq_u16(sum_32x4, row1_1); + sum_32x4 = vpadalq_u16(sum_32x4, row2_1); + sum_32x4 = vpadalq_u16(sum_32x4, row3_1); + } + } + sum_buf += step; + } while (sum_buf < end); + + // Permute and add in such a way that each lane contains the block sum. + // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A] +#ifdef __aarch64__ + sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); + sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4); +#else + uint32x4_t flip = + vcombine_u32(vget_high_u32(sum_32x4), vget_low_u32(sum_32x4)); + sum_32x4 = vaddq_u32(sum_32x4, flip); + sum_32x4 = vaddq_u32(sum_32x4, vrev64q_u32(sum_32x4)); +#endif + + // Computing the average could be done using scalars, but getting off the NEON + // engine introduces latency, so we use vqrshrn. + int16x4_t avg_16x4; + // Constant propagation makes for some ugly code. 
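+  // The switch exists because vqrshrn_n_u32 needs an immediate shift count.
+  // Each case computes (sum + (1 << (num_pel_log2 - 1))) >> num_pel_log2,
+  // and num_pel_log2 == log2(width * height), so every lane ends up holding
+  // the rounded block average. e.g. a 4x4 block (num_pel_log2 == 4) with
+  // sum == 1000 gives (1000 + 8) >> 4 == 63.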
+ switch (num_pel_log2) { + case 4: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 4)); break; + case 5: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 5)); break; + case 6: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 6)); break; + case 7: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 7)); break; + case 8: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 8)); break; + case 9: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 9)); break; + case 10: + avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 10)); + break; + default: assert(0); + } + + if (width == 4) { + do { + vst1_s16(dst, vsub_s16(vreinterpret_s16_u16(vld1_u16(src)), avg_16x4)); + src += CFL_BUF_LINE; + dst += CFL_BUF_LINE; + } while (src < end); + } else { + const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4); + do { + vldsubstq_s16(dst, src, 0, avg_16x8); + vldsubstq_s16(dst, src, CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 3 * CFL_BUF_LINE, avg_16x8); + + if (width > 8) { + vldsubstq_s16(dst, src, 8, avg_16x8); + vldsubstq_s16(dst, src, 8 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 8 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 8 + 3 * CFL_BUF_LINE, avg_16x8); + } + if (width == 32) { + vldsubstq_s16(dst, src, 16, avg_16x8); + vldsubstq_s16(dst, src, 16 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 16 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 16 + 3 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24, avg_16x8); + vldsubstq_s16(dst, src, 24 + CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24 + 2 * CFL_BUF_LINE, avg_16x8); + vldsubstq_s16(dst, src, 24 + 3 * CFL_BUF_LINE, avg_16x8); + } + src += step; + dst += step; + } while (src < end); + } +} + +CFL_SUB_AVG_FN(neon) + +// Saturating negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +// Notes: +// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in +// practice, as scaled_luma is the multiplication of two absolute values. +// * In the Intel equivalent, elements in a are zeroed out when the +// corresponding elements in b are zero. Because vsign is used twice in a +// row, with b in the first call becoming a in the second call, there's no +// impact from not zeroing out. +static int16x4_t vsign_s16(int16x4_t a, int16x4_t b) { + const int16x4_t mask = vshr_n_s16(b, 15); + return veor_s16(vadd_s16(a, mask), mask); +} + +// Saturating negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +// Notes: +// * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in +// practice, as scaled_luma is the multiplication of two absolute values. +// * In the Intel equivalent, elements in a are zeroed out when the +// corresponding elements in b are zero. Because vsignq is used twice in a +// row, with b in the first call becoming a in the second call, there's no +// impact from not zeroing out. 
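+// A scalar sketch of the lane operation used by vsign_s16/vsignq_s16:
+//   int16_t mask = b >> 15;            // arithmetic shift: 0 or -1
+//   int16_t out = (a + mask) ^ mask;   // b >= 0 keeps a; b < 0 yields -a
+// e.g. a == 7, b == -3: mask == -1 and (7 - 1) ^ 0xFFFF == -7.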
+static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) { + const int16x8_t mask = vshrq_n_s16(b, 15); + return veorq_s16(vaddq_s16(a, mask), mask); +} + +static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3, + int16x4_t alpha_sign, int abs_alpha_q12, + int16x4_t dc) { + const int16x4_t ac_q3 = vld1_s16(pred_buf_q3); + const int16x4_t ac_sign = veor_s16(alpha_sign, ac_q3); + int16x4_t scaled_luma = vqrdmulh_n_s16(vabs_s16(ac_q3), abs_alpha_q12); + return vadd_s16(vsign_s16(scaled_luma, ac_sign), dc); +} + +static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3, + int16x8_t alpha_sign, int abs_alpha_q12, + int16x8_t dc) { + const int16x8_t ac_q3 = vld1q_s16(pred_buf_q3); + const int16x8_t ac_sign = veorq_s16(alpha_sign, ac_q3); + int16x8_t scaled_luma = vqrdmulhq_n_s16(vabsq_s16(ac_q3), abs_alpha_q12); + return vaddq_s16(vsignq_s16(scaled_luma, ac_sign), dc); +} + +static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3, + int16x8_t alpha_sign, int abs_alpha_q12, + int16x8_t dc) { + // vld2q_s16 interleaves, which is not useful for prediction. vst1q_s16_x2 + // does not interleave, but is not currently available in the compiler used + // by the AOM build system. + const int16x8x2_t ac_q3 = vld2q_s16(pred_buf_q3); + const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); + const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); + const int16x8_t scaled_luma_0 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12); + const int16x8_t scaled_luma_1 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12); + int16x8x2_t result; + result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc); + result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc); + return result; +} + +static INLINE int16x8x4_t predict_w32(const int16_t *pred_buf_q3, + int16x8_t alpha_sign, int abs_alpha_q12, + int16x8_t dc) { + // vld4q_s16 interleaves, which is not useful for prediction. vst1q_s16_x4 + // does not interleave, but is not currently available in the compiler used + // by the AOM build system.
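+  // The de-interleaved lane order is harmless here: every step below is
+  // lane-wise, and the callers store the result with vst4_u8/vst4q_u16,
+  // which interleave the four sub-vectors back into memory order.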
+ const int16x8x4_t ac_q3 = vld4q_s16(pred_buf_q3); + const int16x8_t ac_sign_0 = veorq_s16(alpha_sign, ac_q3.val[0]); + const int16x8_t ac_sign_1 = veorq_s16(alpha_sign, ac_q3.val[1]); + const int16x8_t ac_sign_2 = veorq_s16(alpha_sign, ac_q3.val[2]); + const int16x8_t ac_sign_3 = veorq_s16(alpha_sign, ac_q3.val[3]); + const int16x8_t scaled_luma_0 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[0]), abs_alpha_q12); + const int16x8_t scaled_luma_1 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[1]), abs_alpha_q12); + const int16x8_t scaled_luma_2 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[2]), abs_alpha_q12); + const int16x8_t scaled_luma_3 = + vqrdmulhq_n_s16(vabsq_s16(ac_q3.val[3]), abs_alpha_q12); + int16x8x4_t result; + result.val[0] = vaddq_s16(vsignq_s16(scaled_luma_0, ac_sign_0), dc); + result.val[1] = vaddq_s16(vsignq_s16(scaled_luma_1, ac_sign_1), dc); + result.val[2] = vaddq_s16(vsignq_s16(scaled_luma_2, ac_sign_2), dc); + result.val[3] = vaddq_s16(vsignq_s16(scaled_luma_3, ac_sign_3), dc); + return result; +} + +static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + const int16_t abs_alpha_q12 = abs(alpha_q3) << 9; + const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE; + if (width == 4) { + const int16x4_t alpha_sign = vdup_n_s16(alpha_q3); + const int16x4_t dc = vdup_n_s16(*dst); + do { + const int16x4_t pred = + predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vsth_u8(dst, vqmovun_s16(vcombine_s16(pred, pred))); + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } else { + const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3); + const int16x8_t dc = vdupq_n_s16(*dst); + do { + if (width == 8) { + vst1_u8(dst, vqmovun_s16(predict_w8(pred_buf_q3, alpha_sign, + abs_alpha_q12, dc))); + } else if (width == 16) { + const int16x8x2_t pred = + predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + const uint8x8x2_t predun = { { vqmovun_s16(pred.val[0]), + vqmovun_s16(pred.val[1]) } }; + vst2_u8(dst, predun); + } else { + const int16x8x4_t pred = + predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + const uint8x8x4_t predun = { + { vqmovun_s16(pred.val[0]), vqmovun_s16(pred.val[1]), + vqmovun_s16(pred.val[2]), vqmovun_s16(pred.val[3]) } + }; + vst4_u8(dst, predun); + } + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } +} + +CFL_PREDICT_FN(neon, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) { + return vreinterpret_u16_s16(vmax_s16(vmin_s16(a, max), vdup_n_s16(0))); +} + +static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) { + return vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(a, max), vdupq_n_s16(0))); +} + +static INLINE uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) { + uint16x8x2_t result; + result.val[0] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0))); + result.val[1] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0))); + return result; +} + +static INLINE uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) { + uint16x8x4_t result; + result.val[0] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[0], max), vdupq_n_s16(0))); + result.val[1] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[1], max), vdupq_n_s16(0))); + result.val[2] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[2], max), vdupq_n_s16(0))); + result.val[3] = vreinterpretq_u16_s16( + vmaxq_s16(vminq_s16(a.val[3], max), 
vdupq_n_s16(0))); + return result; +} + +static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + const int max = (1 << bd) - 1; + const int16_t abs_alpha_q12 = abs(alpha_q3) << 9; + const int16_t *const end = pred_buf_q3 + height * CFL_BUF_LINE; + if (width == 4) { + const int16x4_t alpha_sign = vdup_n_s16(alpha_q3); + const int16x4_t dc = vdup_n_s16(*dst); + const int16x4_t max_16x4 = vdup_n_s16(max); + do { + const int16x4_t scaled_luma = + predict_w4(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst1_u16(dst, clamp_s16(scaled_luma, max_16x4)); + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } else { + const int16x8_t alpha_sign = vdupq_n_s16(alpha_q3); + const int16x8_t dc = vdupq_n_s16(*dst); + const int16x8_t max_16x8 = vdupq_n_s16(max); + do { + if (width == 8) { + const int16x8_t pred = + predict_w8(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst1q_u16(dst, clampq_s16(pred, max_16x8)); + } else if (width == 16) { + const int16x8x2_t pred = + predict_w16(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst2q_u16(dst, clamp2q_s16(pred, max_16x8)); + } else { + const int16x8x4_t pred = + predict_w32(pred_buf_q3, alpha_sign, abs_alpha_q12, dc); + vst4q_u16(dst, clamp4q_s16(pred, max_16x8)); + } + dst += dst_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); + } +} + +CFL_PREDICT_FN(neon, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/av1/common/arm/convolve_neon.c b/libs/libaom/src/av1/common/arm/convolve_neon.c new file mode 100644 index 000000000..51c96961c --- /dev/null +++ b/libs/libaom/src/av1/common/arm/convolve_neon.c @@ -0,0 +1,1593 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <arm_neon.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/arm/convolve_neon.h" +#include "av1/common/arm/mem_neon.h" +#include "av1/common/arm/transpose_neon.h" + +static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16_t *filter) { + int16x4_t sum; + + sum = vmul_n_s16(s0, filter[0]); + sum = vmla_n_s16(sum, s1, filter[1]); + sum = vmla_n_s16(sum, s2, filter[2]); + sum = vmla_n_s16(sum, s5, filter[5]); + sum = vmla_n_s16(sum, s6, filter[6]); + sum = vmla_n_s16(sum, s7, filter[7]); + /* filter[3] can take a max value of 128.
So the max value of the result : + * 128*255 + sum > 16 bits + */ + sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3])); + sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4])); + + return sum; +} + +static INLINE uint8x8_t convolve8_horiz_8x8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16_t *filter, + const int16x8_t shift_round_0, const int16x8_t shift_by_bits) { + int16x8_t sum; + + sum = vmulq_n_s16(s0, filter[0]); + sum = vmlaq_n_s16(sum, s1, filter[1]); + sum = vmlaq_n_s16(sum, s2, filter[2]); + sum = vmlaq_n_s16(sum, s5, filter[5]); + sum = vmlaq_n_s16(sum, s6, filter[6]); + sum = vmlaq_n_s16(sum, s7, filter[7]); + /* filter[3] can take a max value of 128. So the max value of the result : + * 128*255 + sum > 16 bits + */ + sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3])); + sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4])); + + sum = vqrshlq_s16(sum, shift_round_0); + sum = vqrshlq_s16(sum, shift_by_bits); + + return vqmovun_s16(sum); +} + +#if !defined(__aarch64__) +static INLINE uint8x8_t convolve8_horiz_4x1( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16_t *filter, + const int16x4_t shift_round_0, const int16x4_t shift_by_bits) { + int16x4_t sum; + + sum = vmul_n_s16(s0, filter[0]); + sum = vmla_n_s16(sum, s1, filter[1]); + sum = vmla_n_s16(sum, s2, filter[2]); + sum = vmla_n_s16(sum, s5, filter[5]); + sum = vmla_n_s16(sum, s6, filter[6]); + sum = vmla_n_s16(sum, s7, filter[7]); + /* filter[3] can take a max value of 128. So the max value of the result : + * 128*255 + sum > 16 bits + */ + sum = vqadd_s16(sum, vmul_n_s16(s3, filter[3])); + sum = vqadd_s16(sum, vmul_n_s16(s4, filter[4])); + + sum = vqrshl_s16(sum, shift_round_0); + sum = vqrshl_s16(sum, shift_by_bits); + + return vqmovun_s16(vcombine_s16(sum, sum)); +} +#endif // !defined(__aarch64__) + +static INLINE uint8x8_t convolve8_vert_8x4( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16_t *filter) { + int16x8_t sum; + + sum = vmulq_n_s16(s0, filter[0]); + sum = vmlaq_n_s16(sum, s1, filter[1]); + sum = vmlaq_n_s16(sum, s2, filter[2]); + sum = vmlaq_n_s16(sum, s5, filter[5]); + sum = vmlaq_n_s16(sum, s6, filter[6]); + sum = vmlaq_n_s16(sum, s7, filter[7]); + /* filter[3] can take a max value of 128.
So the max value of the result : + * 128*255 + sum > 16 bits + */ + sum = vqaddq_s16(sum, vmulq_n_s16(s3, filter[3])); + sum = vqaddq_s16(sum, vmulq_n_s16(s4, filter[4])); + + return vqrshrun_n_s16(sum, FILTER_BITS); +} + +static INLINE uint16x4_t convolve8_vert_4x4_s32( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter, + const int32x4_t round_shift_vec, const int32x4_t offset_const, + const int32x4_t sub_const_vec) { + int32x4_t sum0; + uint16x4_t res; + const int32x4_t zero = vdupq_n_s32(0); + + sum0 = vmull_n_s16(s0, y_filter[0]); + sum0 = vmlal_n_s16(sum0, s1, y_filter[1]); + sum0 = vmlal_n_s16(sum0, s2, y_filter[2]); + sum0 = vmlal_n_s16(sum0, s3, y_filter[3]); + sum0 = vmlal_n_s16(sum0, s4, y_filter[4]); + sum0 = vmlal_n_s16(sum0, s5, y_filter[5]); + sum0 = vmlal_n_s16(sum0, s6, y_filter[6]); + sum0 = vmlal_n_s16(sum0, s7, y_filter[7]); + + sum0 = vaddq_s32(sum0, offset_const); + sum0 = vqrshlq_s32(sum0, round_shift_vec); + sum0 = vsubq_s32(sum0, sub_const_vec); + sum0 = vmaxq_s32(sum0, zero); + + res = vmovn_u32(vreinterpretq_u32_s32(sum0)); + + return res; +} + +static INLINE uint8x8_t convolve8_vert_8x4_s32( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter, + const int32x4_t round_shift_vec, const int32x4_t offset_const, + const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) { + int32x4_t sum0, sum1; + uint16x8_t res; + const int32x4_t zero = vdupq_n_s32(0); + + sum0 = vmull_n_s16(vget_low_s16(s0), y_filter[0]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s1), y_filter[1]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s2), y_filter[2]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), y_filter[3]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s4), y_filter[4]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s5), y_filter[5]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s6), y_filter[6]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s7), y_filter[7]); + + sum1 = vmull_n_s16(vget_high_s16(s0), y_filter[0]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s1), y_filter[1]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s2), y_filter[2]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), y_filter[3]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s4), y_filter[4]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s5), y_filter[5]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s6), y_filter[6]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s7), y_filter[7]); + + sum0 = vaddq_s32(sum0, offset_const); + sum1 = vaddq_s32(sum1, offset_const); + sum0 = vqrshlq_s32(sum0, round_shift_vec); + sum1 = vqrshlq_s32(sum1, round_shift_vec); + sum0 = vsubq_s32(sum0, sub_const_vec); + sum1 = vsubq_s32(sum1, sub_const_vec); + sum0 = vmaxq_s32(sum0, zero); + sum1 = vmaxq_s32(sum1, zero); + res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(sum0)), + vqmovn_u32(vreinterpretq_u32_s32(sum1))); + + res = vqrshlq_u16(res, vec_round_bits); + + return vqmovn_u16(res); +} + +void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const uint8_t horiz_offset = filter_params_x->taps / 2 - 1; + const int8_t bits = FILTER_BITS - conv_params->round_0; + + (void)subpel_y_qn; + 
(void)conv_params; + (void)filter_params_y; + + uint8x8_t t0; +#if defined(__aarch64__) + uint8x8_t t1, t2, t3; +#endif + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0); + const int16x8_t shift_by_bits = vdupq_n_s16(-bits); + + src -= horiz_offset; +#if defined(__aarch64__) + if (h == 4) { + uint8x8_t d01, d23; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + int16x8_t d01_temp, d23_temp; + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter); + + d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter); + + d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter); + + d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter); + + d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0); + d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0); + + d01_temp = vqrshlq_s16(d01_temp, shift_by_bits); + d23_temp = vqrshlq_s16(d23_temp, shift_by_bits); + + d01 = vqmovun_s16(d01_temp); + d23 = vqmovun_s16(d23_temp); + + transpose_u8_4x4(&d01, &d23); + + if (w != 2) { + vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), // 00 01 02 03 + vreinterpret_u32_u8(d01), 0); + vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), // 10 11 12 13 + vreinterpret_u32_u8(d23), 0); + vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), // 20 21 22 23 + vreinterpret_u32_u8(d01), 1); + vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), // 30 31 32 33 + vreinterpret_u32_u8(d23), 1); + } else { + vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride), // 00 01 + vreinterpret_u16_u8(d01), 0); + vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride), // 10 11 + vreinterpret_u16_u8(d23), 0); + vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride), // 20 21 + vreinterpret_u16_u8(d01), 2); + vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride), // 30 31 + vreinterpret_u16_u8(d23), 2); + } + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { +#endif + int width; + const uint8_t *s; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + +#if 
defined(__aarch64__) + int16x8_t s8, s9, s10; + uint8x8_t t4, t5, t6, t7; +#endif + + if (w <= 4) { +#if defined(__aarch64__) + do { + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); + + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + shift_round_0, shift_by_bits); + t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + shift_round_0, shift_by_bits); + t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, + shift_round_0, shift_by_bits); + t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + shift_round_0, shift_by_bits); + + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + if ((w == 4) && (h > 4)) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), + 0); // 10 11 12 13 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), + 0); // 20 21 22 23 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), + 0); // 30 31 32 33 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), + 1); // 40 41 42 43 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), + 1); // 50 51 52 53 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), + 1); // 60 61 62 63 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), + 1); // 70 71 72 73 + dst += dst_stride; + } else if ((w == 4) && (h == 2)) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), + 0); // 10 11 12 13 + dst += dst_stride; + } else if ((w == 2) && (h > 4)) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0); // 10 11 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0); // 20 21 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0); // 30 31 + dst += dst_stride; + 
vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2); // 40 41 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2); // 50 51 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2); // 60 61 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2); // 70 71 + dst += dst_stride; + } else if ((w == 2) && (h == 2)) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0); // 10 11 + dst += dst_stride; + } + h -= 8; + } while (h > 0); +#else + int16x8_t tt0; + int16x4_t x0, x1, x2, x3, x4, x5, x6, x7; + const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0); + const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits); + do { + t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + x0 = vget_low_s16(tt0); // a0 a1 a2 a3 + x4 = vget_high_s16(tt0); // a4 a5 a6 a7 + + t0 = vld1_u8(src + 8); // a8 a9 a10 a11 a12 a13 a14 a15 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + x7 = vget_low_s16(tt0); // a8 a9 a10 a11 + + x1 = vext_s16(x0, x4, 1); // a1 a2 a3 a4 + x2 = vext_s16(x0, x4, 2); // a2 a3 a4 a5 + x3 = vext_s16(x0, x4, 3); // a3 a4 a5 a6 + x5 = vext_s16(x4, x7, 1); // a5 a6 a7 a8 + x6 = vext_s16(x4, x7, 2); // a6 a7 a8 a9 + x7 = vext_s16(x4, x7, 3); // a7 a8 a9 a10 + + src += src_stride; + + t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter, + shift_round_0_low, shift_by_bits_low); + + if (w == 4) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + dst += dst_stride; + } else if (w == 2) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01 + dst += dst_stride; + } + h -= 1; + } while (h > 0); +#endif + } else { + uint8_t *d; + int16x8_t s11; +#if defined(__aarch64__) + int16x8_t s12, s13, s14; + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = 
vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + shift_round_0, shift_by_bits); + + t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter, + shift_round_0, shift_by_bits); + + t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter, + shift_round_0, shift_by_bits); + + t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter, + shift_round_0, shift_by_bits); + + t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter, + shift_round_0, shift_by_bits); + + t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter, + shift_round_0, shift_by_bits); + + t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter, + shift_round_0, shift_by_bits); + + t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter, shift_round_0, shift_by_bits); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + if (h != 2) { + store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); + } else { + store_row2_u8_8x8(d, dst_stride, t0, t1); + } + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); +#else + do { + t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7 + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + + width = w; + s = src + 8; + d = dst; + __builtin_prefetch(dst); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s11 = s0; + s0 = s7; + + s1 = vextq_s16(s11, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(s11, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(s11, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(s11, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(s11, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(s11, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(s11, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter, + shift_round_0, shift_by_bits); + vst1_u8(d, t0); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += src_stride; + dst += dst_stride; + h -= 1; + } while (h > 0); +#endif + } +#if defined(__aarch64__) + } +#endif +} + +void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int vert_offset = filter_params_y->taps / 2 - 1; + + src -= vert_offset * src_stride; + + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + if (w <= 4) { + uint8x8_t d01; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; +#if defined(__aarch64__) + uint8x8_t d23; + int16x4_t s8, s9, s10, d1, d2, d3; +#endif + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src 
+= src_stride; + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; +#if defined(__aarch64__) + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + if ((w == 4) && (h != 2)) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), + 1); // 10 11 12 13 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), + 0); // 20 21 22 23 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), + 1); // 30 31 32 33 + dst += dst_stride; + } else if ((w == 4) && (h == 2)) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), + 1); // 10 11 12 13 + dst += dst_stride; + } else if ((w == 2) && (h != 2)) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); // 00 01 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2); // 10 11 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0); // 20 21 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2); // 30 31 + dst += dst_stride; + } else if ((w == 2) && (h == 2)) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); // 00 01 + dst += dst_stride; + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2); // 10 11 + dst += dst_stride; + } + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; +#else + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + + d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + + d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS); + + if (w == 4) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); + dst += dst_stride; + } else if (w == 2) { + vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); + dst += dst_stride; + } + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + h -= 1; +#endif + } while (h > 0); + } else { + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t t0; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; 
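+ // Wide blocks are processed in 8-pixel-wide column strips; within a strip + // the inner loop produces four output rows per iteration on AArch64 and one + // row on 32-bit Arm.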
+#if defined(__aarch64__) + uint8x8_t t1, t2, t3; + int16x8_t s8, s9, s10; +#endif + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + d = dst; + height = h; + + do { + s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; +#if defined(__aarch64__) + s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + s += src_stride; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter); + t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter); + t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter); + if (h != 2) { + vst1_u8(d, t0); + d += dst_stride; + vst1_u8(d, t1); + d += dst_stride; + vst1_u8(d, t2); + d += dst_stride; + vst1_u8(d, t3); + d += dst_stride; + } else { + vst1_u8(d, t0); + d += dst_stride; + vst1_u8(d, t1); + d += dst_stride; + } + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; +#else + __builtin_prefetch(d); + __builtin_prefetch(s); + + t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter); + + vst1_u8(d, t0); + d += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; +#endif + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } +} + +// Horizontal filtering for convolve_2d_sr for width multiple of 8 +// Processes one row at a time +static INLINE void horiz_filter_w8_single_row( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int width, int height, const int16_t *x_filter, + const int16x8_t horiz_const, const int16x8_t shift_round_0) { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + do { + uint8x8_t t0 = vld1_u8(src_ptr); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + int width_tmp = width; + const uint8_t *s = src_ptr + 8; + int16_t *dst_tmp = dst_ptr; + + __builtin_prefetch(dst_ptr); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + int16x8_t sum = s0; + s0 = s7; + + s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 
a10 a11 + s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + int16x8_t res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, + x_filter, horiz_const, shift_round_0); + + vst1q_s16(dst_tmp, res0); + + s += 8; + dst_tmp += 8; + width_tmp -= 8; + } while (width_tmp > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + height--; + } while (height > 0); +} + +// Horizontal filtering for convolve_2d_sr for width <= 4 +// Processes one row at a time +static INLINE void horiz_filter_w4_single_row( + const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr, + const int dst_stride, int width, int height, const int16_t *x_filter, + const int16x4_t horiz_const, const int16x4_t shift_round_0) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + do { + const uint8_t *s = src_ptr; + + __builtin_prefetch(s); + + uint8x8_t t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s0 = vget_low_s16(tt0); + s4 = vget_high_s16(tt0); + + __builtin_prefetch(dst_ptr); + s += 8; + + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + int16x4_t d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter, + horiz_const, shift_round_0); + + if (width == 4) { + vst1_s16(dst_ptr, d0); + dst_ptr += dst_stride; + } else if (width == 2) { + vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0); + dst_ptr += dst_stride; + } + + src_ptr += src_stride; + height--; + } while (height > 0); +} + +void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + int im_dst_stride; + int width, height; +#if defined(__aarch64__) + uint8x8_t t0; + uint8x8_t t1, t2, t3, t4, t5, t6, t7; + const uint8_t *s; +#endif + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]); + + const int bd = 8; + const int im_h = h + filter_params_y->taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = filter_params_y->taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + int16_t *dst_ptr; + + dst_ptr = im_block; + im_dst_stride = im_stride; + height = im_h; + width = w; + + const int16_t round_bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + int16_t x_filter_tmp[8]; + int16x8_t filter_x_coef = vld1q_s16(x_filter); + + // filter coeffs are even, so downshifting by 1 to reduce intermediate + // precision requirements. 
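+ // Halving the taps makes the kernel sum 64 instead of 128, which keeps the + // biased horizontal sums within int16 range for 8-bit input.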
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1); + vst1q_s16(&x_filter_tmp[0], filter_x_coef); + + assert(conv_params->round_0 > 0); + + if (w <= 4) { + const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2))); + const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1)); + +#if defined(__aarch64__) + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + do { + assert(height >= 4); + s = src_ptr; + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + __builtin_prefetch(dst_ptr + 0 * im_dst_stride); + __builtin_prefetch(dst_ptr + 1 * im_dst_stride); + __builtin_prefetch(dst_ptr + 2 * im_dst_stride); + __builtin_prefetch(dst_ptr + 3 * im_dst_stride); + s += 7; + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + horiz_const, shift_round_0); + d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + horiz_const, shift_round_0); + d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + horiz_const, shift_round_0); + + transpose_s16_4x4d(&d0, &d1, &d2, &d3); + if (w == 4) { + vst1_s16((dst_ptr + 0 * im_dst_stride), d0); + vst1_s16((dst_ptr + 1 * im_dst_stride), d1); + vst1_s16((dst_ptr + 2 * im_dst_stride), d2); + vst1_s16((dst_ptr + 3 * im_dst_stride), d3); + } else if (w == 2) { + vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride), + vreinterpret_u32_s16(d0), 0); + vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride), + vreinterpret_u32_s16(d1), 0); + vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride), + vreinterpret_u32_s16(d2), 0); + vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride), + vreinterpret_u32_s16(d3), 0); + } + src_ptr += 4 * src_stride; + dst_ptr += 4 * im_dst_stride; + height -= 4; + } while (height >= 4); + + if (height) { + assert(height < 4); + horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); + } +#else + horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); +#endif + + } else { + const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2))); + const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1)); + +#if defined(__aarch64__) + int16_t *d_tmp; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14; + int16x8_t res0, res1, res2, res3, res4, res5, res6, res7; + do { + assert(height >= 8); + __builtin_prefetch(src_ptr + 0 * 
src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src_ptr + 7; + d_tmp = dst_ptr; + + __builtin_prefetch(dst_ptr + 0 * im_dst_stride); + __builtin_prefetch(dst_ptr + 1 * im_dst_stride); + __builtin_prefetch(dst_ptr + 2 * im_dst_stride); + __builtin_prefetch(dst_ptr + 3 * im_dst_stride); + __builtin_prefetch(dst_ptr + 4 * im_dst_stride); + __builtin_prefetch(dst_ptr + 5 * im_dst_stride); + __builtin_prefetch(dst_ptr + 6 * im_dst_stride); + __builtin_prefetch(dst_ptr + 7 * im_dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + horiz_const, shift_round_0); + res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + horiz_const, shift_round_0); + res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + horiz_const, shift_round_0); + res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp, + horiz_const, shift_round_0); + res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter_tmp, horiz_const, shift_round_0); + res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter_tmp, horiz_const, shift_round_0); + res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_tmp, horiz_const, shift_round_0); + + transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6, + &res7); + + store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5, + res6, res7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * im_dst_stride; + height -= 8; + } while (height >= 8); + + if (height >= 4) { + assert(height < 8); + int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, + reg10, reg11, reg12, reg13, reg14; + int16x4_t d0, d1, d2, d3, d4, d5, d6, d7; + int16x8_t out0, out1, out2, out3; + + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + + load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); 
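+ // After the 8x4 transpose, each 4-lane half extracted below holds one input + // column spanning the four rows of this strip.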
+ + reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + s = src_ptr + 7; + d_tmp = dst_ptr; + width = w; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + + d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, + x_filter_tmp); + + d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, + x_filter_tmp); + + d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9, + x_filter_tmp); + + d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10, + x_filter_tmp); + + d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11, + x_filter_tmp); + + d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12, + x_filter_tmp); + + d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13, + x_filter_tmp); + + d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14, + x_filter_tmp); + + transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1, + &out2, &out3); + + out0 = vaddq_s16(out0, horiz_const); + out0 = vqrshlq_s16(out0, shift_round_0); + + out1 = vaddq_s16(out1, horiz_const); + out1 = vqrshlq_s16(out1, shift_round_0); + + out2 = vaddq_s16(out2, horiz_const); + out2 = vqrshlq_s16(out2, shift_round_0); + + out3 = vaddq_s16(out3, horiz_const); + out3 = vqrshlq_s16(out3, shift_round_0); + + store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3); + + reg0 = reg8; + reg1 = reg9; + reg2 = reg10; + reg3 = reg11; + reg4 = reg12; + reg5 = reg13; + reg6 = reg14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += 4 * src_stride; + dst_ptr += 4 * im_dst_stride; + height -= 4; + } + + if (height) { + assert(height < 4); + horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); + } +#else + + horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w, + height, x_filter_tmp, horiz_const, + shift_round_0); +#endif + } + + // vertical + { + uint8_t *dst_u8_ptr, *d_u8; + int16_t *v_src_ptr, *v_s; + + const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1)); + const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits); + const int32x4_t sub_const_vec = 
vdupq_n_s32(sub_const); + + src_stride = im_stride; + v_src_ptr = im_block; + dst_u8_ptr = dst; + + height = h; + width = w; + + if (width <= 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + uint16x4_t d0; + uint16x8_t dd0; + uint8x8_t d01; + +#if defined(__aarch64__) + int16x4_t s8, s9, s10; + uint16x4_t d1, d2, d3; + uint16x8_t dd1; + uint8x8_t d23; +#endif + + d_u8 = dst_u8_ptr; + v_s = v_src_ptr; + + __builtin_prefetch(v_s + 0 * im_stride); + __builtin_prefetch(v_s + 1 * im_stride); + __builtin_prefetch(v_s + 2 * im_stride); + __builtin_prefetch(v_s + 3 * im_stride); + __builtin_prefetch(v_s + 4 * im_stride); + __builtin_prefetch(v_s + 5 * im_stride); + __builtin_prefetch(v_s + 6 * im_stride); + __builtin_prefetch(v_s + 7 * im_stride); + + load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + v_s += (7 * im_stride); + + do { +#if defined(__aarch64__) + load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10); + v_s += (im_stride << 2); + + __builtin_prefetch(d_u8 + 0 * dst_stride); + __builtin_prefetch(d_u8 + 1 * dst_stride); + __builtin_prefetch(d_u8 + 2 * dst_stride); + __builtin_prefetch(d_u8 + 3 * dst_stride); + + d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + + dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits); + dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits); + + d01 = vqmovn_u16(dd0); + d23 = vqmovn_u16(dd1); + + if ((w == 4) && (h != 2)) { + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + d_u8 += dst_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 1); // 10 11 12 13 + d_u8 += dst_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23), + 0); // 20 21 22 23 + d_u8 += dst_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23), + 1); // 30 31 32 33 + d_u8 += dst_stride; + } else if ((w == 2) && (h != 2)) { + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 0); // 00 01 + d_u8 += dst_stride; + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 2); // 10 11 + d_u8 += dst_stride; + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23), + 0); // 20 21 + d_u8 += dst_stride; + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23), + 2); // 30 31 + d_u8 += dst_stride; + } else if ((w == 4) && (h == 2)) { + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + d_u8 += dst_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 1); // 10 11 12 13 + d_u8 += dst_stride; + } else if ((w == 2) && (h == 2)) { + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 0); // 00 01 + d_u8 += dst_stride; + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 2); // 10 11 + d_u8 += dst_stride; + } + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; +#else + s7 = vld1_s16(v_s); + v_s += im_stride; + + __builtin_prefetch(d_u8 + 0 * dst_stride); + + d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const, + sub_const_vec); + + dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits); + d01 = 
vqmovn_u16(dd0); + + if (w == 4) { + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01), + 0); // 00 01 02 03 + d_u8 += dst_stride; + + } else if (w == 2) { + vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01), + 0); // 00 01 + d_u8 += dst_stride; + } + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; +#endif + } while (height > 0); + } else { + // if width is a multiple of 8 & height is a multiple of 4 + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + uint8x8_t res0; +#if defined(__aarch64__) + int16x8_t s8, s9, s10; + uint8x8_t res1, res2, res3; +#endif + + do { + __builtin_prefetch(v_src_ptr + 0 * im_stride); + __builtin_prefetch(v_src_ptr + 1 * im_stride); + __builtin_prefetch(v_src_ptr + 2 * im_stride); + __builtin_prefetch(v_src_ptr + 3 * im_stride); + __builtin_prefetch(v_src_ptr + 4 * im_stride); + __builtin_prefetch(v_src_ptr + 5 * im_stride); + __builtin_prefetch(v_src_ptr + 6 * im_stride); + __builtin_prefetch(v_src_ptr + 7 * im_stride); + + v_s = v_src_ptr; + load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + v_s += (7 * im_stride); + + d_u8 = dst_u8_ptr; + height = h; + + do { +#if defined(__aarch64__) + load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10); + v_s += (im_stride << 2); + + __builtin_prefetch(d_u8 + 4 * dst_stride); + __builtin_prefetch(d_u8 + 5 * dst_stride); + __builtin_prefetch(d_u8 + 6 * dst_stride); + __builtin_prefetch(d_u8 + 7 * dst_stride); + + res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + + if (h != 2) { + vst1_u8(d_u8, res0); + d_u8 += dst_stride; + vst1_u8(d_u8, res1); + d_u8 += dst_stride; + vst1_u8(d_u8, res2); + d_u8 += dst_stride; + vst1_u8(d_u8, res3); + d_u8 += dst_stride; + } else { + vst1_u8(d_u8, res0); + d_u8 += dst_stride; + vst1_u8(d_u8, res1); + d_u8 += dst_stride; + } + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; +#else + s7 = vld1q_s16(v_s); + v_s += im_stride; + + __builtin_prefetch(d_u8 + 0 * dst_stride); + + res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, + y_filter, round_shift_vec, offset_const, + sub_const_vec, vec_round_bits); + + vst1_u8(d_u8, res0); + d_u8 += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; +#endif + } while (height > 0); + v_src_ptr += 8; + dst_u8_ptr += 8; + w -= 8; + } while (w > 0); + } + } +} +void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + + const uint8_t *src1; + uint8_t *dst1; + int y; + + if (!(w & 0x0F)) { + for (y = 0; y < h; ++y) { + src1 = src; + dst1 = dst; + for (int x = 0; x < (w >> 4); ++x) { + vst1q_u8(dst1, vld1q_u8(src1)); + src1 += 16; + dst1 += 16; + 
} + src += src_stride; + dst += dst_stride; + } + } else if (!(w & 0x07)) { + for (y = 0; y < h; ++y) { + vst1_u8(dst, vld1_u8(src)); + src += src_stride; + dst += dst_stride; + } + } else if (!(w & 0x03)) { + for (y = 0; y < h; ++y) { + vst1_lane_u32((uint32_t *)(dst), vreinterpret_u32_u8(vld1_u8(src)), 0); + src += src_stride; + dst += dst_stride; + } + } else if (!(w & 0x01)) { + for (y = 0; y < h; ++y) { + vst1_lane_u16((uint16_t *)(dst), vreinterpret_u16_u8(vld1_u8(src)), 0); + src += src_stride; + dst += dst_stride; + } + } +} diff --git a/libs/libaom/src/av1/common/arm/convolve_neon.h b/libs/libaom/src/av1/common/arm/convolve_neon.h new file mode 100644 index 000000000..dbcfab631 --- /dev/null +++ b/libs/libaom/src/av1/common/arm/convolve_neon.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ +#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ + +#include <arm_neon.h> + +#define HORIZ_EXTRA_ROWS ((SUBPEL_TAPS + 7) & ~0x07) + +static INLINE uint8x8_t wiener_convolve8_vert_4x8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, int16_t *filter_y, const int bd, + const int round1_bits) { + int16x8_t ss0, ss1, ss2; + int32x4_t sum0, sum1; + uint16x4_t tmp0, tmp1; + uint16x8_t tmp; + uint8x8_t res; + + const int32_t round_const = (1 << (bd + round1_bits - 1)); + const int32x4_t round_bits = vdupq_n_s32(-round1_bits); + const int32x4_t zero = vdupq_n_s32(0); + const int32x4_t round_vec = vdupq_n_s32(round_const); + + ss0 = vaddq_s16(s0, s6); + ss1 = vaddq_s16(s1, s5); + ss2 = vaddq_s16(s2, s4); + + sum0 = vmull_n_s16(vget_low_s16(ss0), filter_y[0]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(ss1), filter_y[1]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(ss2), filter_y[2]); + sum0 = vmlal_n_s16(sum0, vget_low_s16(s3), filter_y[3]); + + sum1 = vmull_n_s16(vget_high_s16(ss0), filter_y[0]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(ss1), filter_y[1]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(ss2), filter_y[2]); + sum1 = vmlal_n_s16(sum1, vget_high_s16(s3), filter_y[3]); + + sum0 = vsubq_s32(sum0, round_vec); + sum1 = vsubq_s32(sum1, round_vec); + + /* right shift & rounding */ + sum0 = vrshlq_s32(sum0, round_bits); + sum1 = vrshlq_s32(sum1, round_bits); + + sum0 = vmaxq_s32(sum0, zero); + sum1 = vmaxq_s32(sum1, zero); + + /* from int32x4_t to uint8x8_t */ + tmp0 = vqmovn_u32(vreinterpretq_u32_s32(sum0)); + tmp1 = vqmovn_u32(vreinterpretq_u32_s32(sum1)); + tmp = vcombine_u16(tmp0, tmp1); + res = vqmovn_u16(tmp); + + return res; +} + +static INLINE uint16x8_t wiener_convolve8_horiz_8x8( + const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, int16_t *filter_x, const int bd, + const int round0_bits) { + int16x8_t sum; + uint16x8_t res; + int32x4_t sum_0, sum_1; + int32x4_t s3_0, s3_1; + const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1)); + const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1; + + /* for the purpose of right shift by { conv_params->round_0 } */ + const int32x4_t round_bits = vdupq_n_s32(-round0_bits); + + const int32x4_t
round_vec_0 = vdupq_n_s32(round_const_0); + const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1); + + sum = vmulq_n_s16(s0, filter_x[0]); + sum = vmlaq_n_s16(sum, s1, filter_x[1]); + sum = vmlaq_n_s16(sum, s2, filter_x[2]); + + /* sum from 16x8 to 2 32x4 registers */ + sum_0 = vmovl_s16(vget_low_s16(sum)); + sum_1 = vmovl_s16(vget_high_s16(sum)); + + /* s[3]*128 -- and filter coeff max can be 128 + * then max value possible = 128*128*255 exceeding 16 bits + */ + + s3_0 = vmull_n_s16(vget_low_s16(s3), filter_x[3]); + s3_1 = vmull_n_s16(vget_high_s16(s3), filter_x[3]); + sum_0 = vaddq_s32(sum_0, s3_0); + sum_1 = vaddq_s32(sum_1, s3_1); + + /* Add the constant value */ + sum_0 = vaddq_s32(sum_0, round_vec_0); + sum_1 = vaddq_s32(sum_1, round_vec_0); + + /* right shift & rounding & saturating */ + sum_0 = vqrshlq_s32(sum_0, round_bits); + sum_1 = vqrshlq_s32(sum_1, round_bits); + + /* Clipping to max value */ + sum_0 = vminq_s32(sum_0, round_vec_1); + sum_1 = vminq_s32(sum_1, round_vec_1); + + res = vcombine_u16(vqmovun_s32(sum_0), vqmovun_s32(sum_1)); + return res; +} + +static INLINE uint16x4_t wiener_convolve8_horiz_4x8( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, int16_t *filter_x, const int bd, + const int round0_bits) { + uint16x4_t res; + int32x4_t sum_0, s3_0; + int16x4_t sum, temp0, temp1, temp2; + + const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1)); + const int32_t round_const_1 = (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1; + const int32x4_t round_bits = vdupq_n_s32(-round0_bits); + const int32x4_t zero = vdupq_n_s32(0); + const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0); + const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1); + + temp0 = vadd_s16(s0, s6); + temp1 = vadd_s16(s1, s5); + temp2 = vadd_s16(s2, s4); + + sum = vmul_n_s16(temp0, filter_x[0]); + sum = vmla_n_s16(sum, temp1, filter_x[1]); + sum = vmla_n_s16(sum, temp2, filter_x[2]); + sum_0 = vmovl_s16(sum); + + /* s[3]*128 -- and filter coeff max can be 128. + * then max value possible = 128*128*255. Therefore, 32 bits are required to + * hold the result.
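+ * (Hence the s3 product below is formed with the widening vmull_n_s16.)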
+ */ + s3_0 = vmull_n_s16(s3, filter_x[3]); + sum_0 = vaddq_s32(sum_0, s3_0); + + sum_0 = vaddq_s32(sum_0, round_vec_0); + sum_0 = vrshlq_s32(sum_0, round_bits); + + sum_0 = vmaxq_s32(sum_0, zero); + sum_0 = vminq_s32(sum_0, round_vec_1); + res = vqmovun_s32(sum_0); + return res; +} + +static INLINE int16x8_t +convolve8_8x8_s16(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, const int16_t *filter, + const int16x8_t horiz_const, const int16x8_t shift_round_0) { + int16x8_t sum; + int16x8_t res; + + sum = horiz_const; + sum = vmlaq_n_s16(sum, s0, filter[0]); + sum = vmlaq_n_s16(sum, s1, filter[1]); + sum = vmlaq_n_s16(sum, s2, filter[2]); + sum = vmlaq_n_s16(sum, s3, filter[3]); + sum = vmlaq_n_s16(sum, s4, filter[4]); + sum = vmlaq_n_s16(sum, s5, filter[5]); + sum = vmlaq_n_s16(sum, s6, filter[6]); + sum = vmlaq_n_s16(sum, s7, filter[7]); + + res = vqrshlq_s16(sum, shift_round_0); + + return res; +} + +static INLINE int16x4_t +convolve8_4x4_s16(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16_t *filter, + const int16x4_t horiz_const, const int16x4_t shift_round_0) { + int16x4_t sum; + sum = horiz_const; + sum = vmla_n_s16(sum, s0, filter[0]); + sum = vmla_n_s16(sum, s1, filter[1]); + sum = vmla_n_s16(sum, s2, filter[2]); + sum = vmla_n_s16(sum, s3, filter[3]); + sum = vmla_n_s16(sum, s4, filter[4]); + sum = vmla_n_s16(sum, s5, filter[5]); + sum = vmla_n_s16(sum, s6, filter[6]); + sum = vmla_n_s16(sum, s7, filter[7]); + + sum = vqrshl_s16(sum, shift_round_0); + + return sum; +} + +static INLINE uint16x4_t convolve8_4x4_s32( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter, + const int32x4_t round_shift_vec, const int32x4_t offset_const) { + int32x4_t sum0; + uint16x4_t res; + const int32x4_t zero = vdupq_n_s32(0); + + sum0 = vmull_n_s16(s0, y_filter[0]); + sum0 = vmlal_n_s16(sum0, s1, y_filter[1]); + sum0 = vmlal_n_s16(sum0, s2, y_filter[2]); + sum0 = vmlal_n_s16(sum0, s3, y_filter[3]); + sum0 = vmlal_n_s16(sum0, s4, y_filter[4]); + sum0 = vmlal_n_s16(sum0, s5, y_filter[5]); + sum0 = vmlal_n_s16(sum0, s6, y_filter[6]); + sum0 = vmlal_n_s16(sum0, s7, y_filter[7]); + + sum0 = vaddq_s32(sum0, offset_const); + sum0 = vqrshlq_s32(sum0, round_shift_vec); + sum0 = vmaxq_s32(sum0, zero); + res = vmovn_u32(vreinterpretq_u32_s32(sum0)); + + return res; +} + +#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_ diff --git a/libs/libaom/src/av1/common/arm/jnt_convolve_neon.c b/libs/libaom/src/av1/common/arm/jnt_convolve_neon.c new file mode 100644 index 000000000..92112fb85 --- /dev/null +++ b/libs/libaom/src/av1/common/arm/jnt_convolve_neon.c @@ -0,0 +1,1739 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+#if !defined(__aarch64__)
+static INLINE void compute_avg_4x1(
+    uint16x4_t res0, uint16x4_t d0, const uint16_t fwd_offset,
+    const uint16_t bck_offset, const int16x4_t sub_const_vec,
+    const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) {
+  int16x4_t tmp0;
+  uint16x4_t tmp_u0;
+  uint32x4_t sum0;
+  int32x4_t dst0;
+  int16x8_t tmp4;
+
+  if (use_dist_wtd_comp_avg) {
+    const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
+
+    sum0 = vmull_n_u16(res0, fwd_offset);
+    sum0 = vmlal_n_u16(sum0, d0, bck_offset);
+
+    sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+
+    dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), vmovl_s16(sub_const_vec));
+
+    dst0 = vqrshlq_s32(dst0, round_bits_vec);
+
+    tmp0 = vqmovn_s32(dst0);
+    tmp4 = vcombine_s16(tmp0, tmp0);
+
+    *t0 = vqmovun_s16(tmp4);
+  } else {
+    const int16x4_t round_bits_vec = vdup_n_s16(-round_bits);
+    tmp_u0 = vhadd_u16(res0, d0);
+
+    tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec);
+
+    tmp0 = vqrshl_s16(tmp0, round_bits_vec);
+
+    tmp4 = vcombine_s16(tmp0, tmp0);
+
+    *t0 = vqmovun_s16(tmp4);
+  }
+}
+
+static INLINE void compute_avg_8x1(
+    uint16x8_t res0, uint16x8_t d0, const uint16_t fwd_offset,
+    const uint16_t bck_offset, const int16x4_t sub_const,
+    const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) {
+  int16x4_t tmp0, tmp2;
+  int16x8_t f0;
+  uint32x4_t sum0, sum2;
+  int32x4_t dst0, dst2;
+
+  uint16x8_t tmp_u0;
+
+  if (use_dist_wtd_comp_avg) {
+    const int32x4_t sub_const_vec = vmovl_s16(sub_const);
+    const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
+
+    sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset);
+    sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset);
+    sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+
+    sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset);
+    sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset);
+    sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS);
+
+    dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec);
+    dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec);
+
+    dst0 = vqrshlq_s32(dst0, round_bits_vec);
+    dst2 = vqrshlq_s32(dst2, round_bits_vec);
+
+    tmp0 = vqmovn_s32(dst0);
+    tmp2 = vqmovn_s32(dst2);
+
+    f0 = vcombine_s16(tmp0, tmp2);
+
+    *t0 = vqmovun_s16(f0);
+
+  } else {
+    const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const);
+    const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits);
+
+    tmp_u0 = vhaddq_u16(res0, d0);
+
+    f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec);
+
+    f0 = vqrshlq_s16(f0, round_bits_vec);
+
+    *t0 = vqmovun_s16(f0);
+  }
+}
+#endif  // !defined(__aarch64__)
+
+static INLINE void compute_avg_4x4(
+    uint16x4_t res0, uint16x4_t res1, uint16x4_t res2, uint16x4_t res3,
+    uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
+    const uint16_t fwd_offset, const uint16_t bck_offset,
+    const int16x4_t sub_const_vec, const int16_t round_bits,
+    const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
+  int16x4_t tmp0, tmp1, tmp2, tmp3;
+  uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
+  uint32x4_t sum0, sum1, sum2, sum3;
+
+  int32x4_t dst0, dst1, dst2, dst3;
+  int16x8_t tmp4, tmp5;
+  const int16x8_t zero = vdupq_n_s16(0);
+
+  if (use_dist_wtd_comp_avg) {
+    const 
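+    // Distance-weighted compound average: blend the two intermediate
+    // predictions as (res * fwd_offset + d * bck_offset) >>
+    // DIST_PRECISION_BITS, then subtract the compound round offset and
+    // shift the result down into the output range.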
int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits)); + const int32x4_t const_vec = vmovl_s16(sub_const_vec); + + sum0 = vmull_n_u16(res0, fwd_offset); + sum0 = vmlal_n_u16(sum0, d0, bck_offset); + sum1 = vmull_n_u16(res1, fwd_offset); + sum1 = vmlal_n_u16(sum1, d1, bck_offset); + sum2 = vmull_n_u16(res2, fwd_offset); + sum2 = vmlal_n_u16(sum2, d2, bck_offset); + sum3 = vmull_n_u16(res3, fwd_offset); + sum3 = vmlal_n_u16(sum3, d3, bck_offset); + + sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS); + sum1 = vshrq_n_u32(sum1, DIST_PRECISION_BITS); + sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS); + sum3 = vshrq_n_u32(sum3, DIST_PRECISION_BITS); + + dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), const_vec); + dst1 = vsubq_s32(vreinterpretq_s32_u32(sum1), const_vec); + dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), const_vec); + dst3 = vsubq_s32(vreinterpretq_s32_u32(sum3), const_vec); + + dst0 = vqrshlq_s32(dst0, round_bits_vec); + dst1 = vqrshlq_s32(dst1, round_bits_vec); + dst2 = vqrshlq_s32(dst2, round_bits_vec); + dst3 = vqrshlq_s32(dst3, round_bits_vec); + + tmp0 = vqmovn_s32(dst0); + tmp1 = vqmovn_s32(dst1); + tmp2 = vqmovn_s32(dst2); + tmp3 = vqmovn_s32(dst3); + tmp4 = vcombine_s16(tmp0, tmp1); + tmp5 = vcombine_s16(tmp2, tmp3); + tmp4 = vmaxq_s16(tmp4, zero); + tmp5 = vmaxq_s16(tmp5, zero); + + *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4)); + *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5)); + } else { + const int16x4_t round_bits_vec = vdup_n_s16(-round_bits); + tmp_u0 = vhadd_u16(res0, d0); + tmp_u1 = vhadd_u16(res1, d1); + tmp_u2 = vhadd_u16(res2, d2); + tmp_u3 = vhadd_u16(res3, d3); + + tmp0 = vsub_s16(vreinterpret_s16_u16(tmp_u0), sub_const_vec); + tmp1 = vsub_s16(vreinterpret_s16_u16(tmp_u1), sub_const_vec); + tmp2 = vsub_s16(vreinterpret_s16_u16(tmp_u2), sub_const_vec); + tmp3 = vsub_s16(vreinterpret_s16_u16(tmp_u3), sub_const_vec); + + tmp0 = vqrshl_s16(tmp0, round_bits_vec); + tmp1 = vqrshl_s16(tmp1, round_bits_vec); + tmp2 = vqrshl_s16(tmp2, round_bits_vec); + tmp3 = vqrshl_s16(tmp3, round_bits_vec); + + tmp4 = vcombine_s16(tmp0, tmp1); + tmp5 = vcombine_s16(tmp2, tmp3); + tmp4 = vmaxq_s16(tmp4, zero); + tmp5 = vmaxq_s16(tmp5, zero); + + *t0 = vqmovn_u16(vreinterpretq_u16_s16(tmp4)); + *t1 = vqmovn_u16(vreinterpretq_u16_s16(tmp5)); + } +} + +static INLINE void compute_avg_8x4( + uint16x8_t res0, uint16x8_t res1, uint16x8_t res2, uint16x8_t res3, + uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, + const uint16_t fwd_offset, const uint16_t bck_offset, + const int16x4_t sub_const, const int16_t round_bits, + const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1, + uint8x8_t *t2, uint8x8_t *t3) { + int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int16x8_t f0, f1, f2, f3; + uint32x4_t sum0, sum1, sum2, sum3; + uint32x4_t sum4, sum5, sum6, sum7; + int32x4_t dst0, dst1, dst2, dst3; + int32x4_t dst4, dst5, dst6, dst7; + uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3; + const int16x8_t zero = vdupq_n_s16(0); + + if (use_dist_wtd_comp_avg) { + const int32x4_t sub_const_vec = vmovl_s16(sub_const); + const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits); + + sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset); + sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset); + sum1 = vmull_n_u16(vget_low_u16(res1), fwd_offset); + sum1 = vmlal_n_u16(sum1, vget_low_u16(d1), bck_offset); + sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS); + sum1 = vshrq_n_u32(sum1, DIST_PRECISION_BITS); + + sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset); + sum2 = 
vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset); + sum3 = vmull_n_u16(vget_high_u16(res1), fwd_offset); + sum3 = vmlal_n_u16(sum3, vget_high_u16(d1), bck_offset); + sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS); + sum3 = vshrq_n_u32(sum3, DIST_PRECISION_BITS); + + sum4 = vmull_n_u16(vget_low_u16(res2), fwd_offset); + sum4 = vmlal_n_u16(sum4, vget_low_u16(d2), bck_offset); + sum5 = vmull_n_u16(vget_low_u16(res3), fwd_offset); + sum5 = vmlal_n_u16(sum5, vget_low_u16(d3), bck_offset); + sum4 = vshrq_n_u32(sum4, DIST_PRECISION_BITS); + sum5 = vshrq_n_u32(sum5, DIST_PRECISION_BITS); + + sum6 = vmull_n_u16(vget_high_u16(res2), fwd_offset); + sum6 = vmlal_n_u16(sum6, vget_high_u16(d2), bck_offset); + sum7 = vmull_n_u16(vget_high_u16(res3), fwd_offset); + sum7 = vmlal_n_u16(sum7, vget_high_u16(d3), bck_offset); + sum6 = vshrq_n_u32(sum6, DIST_PRECISION_BITS); + sum7 = vshrq_n_u32(sum7, DIST_PRECISION_BITS); + + dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec); + dst1 = vsubq_s32(vreinterpretq_s32_u32(sum1), sub_const_vec); + dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec); + dst3 = vsubq_s32(vreinterpretq_s32_u32(sum3), sub_const_vec); + dst4 = vsubq_s32(vreinterpretq_s32_u32(sum4), sub_const_vec); + dst5 = vsubq_s32(vreinterpretq_s32_u32(sum5), sub_const_vec); + dst6 = vsubq_s32(vreinterpretq_s32_u32(sum6), sub_const_vec); + dst7 = vsubq_s32(vreinterpretq_s32_u32(sum7), sub_const_vec); + + dst0 = vqrshlq_s32(dst0, round_bits_vec); + dst1 = vqrshlq_s32(dst1, round_bits_vec); + dst2 = vqrshlq_s32(dst2, round_bits_vec); + dst3 = vqrshlq_s32(dst3, round_bits_vec); + dst4 = vqrshlq_s32(dst4, round_bits_vec); + dst5 = vqrshlq_s32(dst5, round_bits_vec); + dst6 = vqrshlq_s32(dst6, round_bits_vec); + dst7 = vqrshlq_s32(dst7, round_bits_vec); + + tmp0 = vqmovn_s32(dst0); + tmp1 = vqmovn_s32(dst1); + tmp2 = vqmovn_s32(dst2); + tmp3 = vqmovn_s32(dst3); + tmp4 = vqmovn_s32(dst4); + tmp5 = vqmovn_s32(dst5); + tmp6 = vqmovn_s32(dst6); + tmp7 = vqmovn_s32(dst7); + + f0 = vcombine_s16(tmp0, tmp2); + f1 = vcombine_s16(tmp1, tmp3); + f2 = vcombine_s16(tmp4, tmp6); + f3 = vcombine_s16(tmp5, tmp7); + + f0 = vmaxq_s16(f0, zero); + f1 = vmaxq_s16(f1, zero); + f2 = vmaxq_s16(f2, zero); + f3 = vmaxq_s16(f3, zero); + + *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0)); + *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1)); + *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2)); + *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3)); + + } else { + const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const); + const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits); + + tmp_u0 = vhaddq_u16(res0, d0); + tmp_u1 = vhaddq_u16(res1, d1); + tmp_u2 = vhaddq_u16(res2, d2); + tmp_u3 = vhaddq_u16(res3, d3); + + f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec); + f1 = vsubq_s16(vreinterpretq_s16_u16(tmp_u1), sub_const_vec); + f2 = vsubq_s16(vreinterpretq_s16_u16(tmp_u2), sub_const_vec); + f3 = vsubq_s16(vreinterpretq_s16_u16(tmp_u3), sub_const_vec); + + f0 = vqrshlq_s16(f0, round_bits_vec); + f1 = vqrshlq_s16(f1, round_bits_vec); + f2 = vqrshlq_s16(f2, round_bits_vec); + f3 = vqrshlq_s16(f3, round_bits_vec); + + f0 = vmaxq_s16(f0, zero); + f1 = vmaxq_s16(f1, zero); + f2 = vmaxq_s16(f2, zero); + f3 = vmaxq_s16(f3, zero); + + *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0)); + *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1)); + *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2)); + *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3)); + } +} + +static INLINE void dist_wtd_convolve_2d_horiz_neon( + const uint8_t *src, int src_stride, int16_t 
*im_block, const int im_stride, + int16_t *x_filter_tmp, const int im_h, int w, const int round_0) { + const int bd = 8; + const uint8_t *s; + int16_t *dst_ptr; + int dst_stride; + int width, height; + + dst_ptr = im_block; + dst_stride = im_stride; + height = im_h; + width = w; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; + int16x8_t tt0; + uint8x8_t t0; + + const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2))); + const int16x4_t shift_round_0 = vdup_n_s16(-(round_0)); + +#if defined(__aarch64__) + int16x4_t s8, s9, s10, d1, d2, d3; + int16x8_t tt1, tt2, tt3; + uint8x8_t t1, t2, t3; +#endif + do { + s = src; + __builtin_prefetch(s + 0 * src_stride); +#if defined(__aarch64__) + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s0 = vget_low_s16(tt0); + s1 = vget_low_s16(tt1); + s2 = vget_low_s16(tt2); + s3 = vget_low_s16(tt3); + s4 = vget_high_s16(tt0); + s5 = vget_high_s16(tt1); + s6 = vget_high_s16(tt2); + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + s += 7; + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s7 = vget_low_s16(tt0); + s8 = vget_low_s16(tt1); + s9 = vget_low_s16(tt2); + s10 = vget_low_s16(tt3); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + horiz_const, shift_round_0); + d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + horiz_const, shift_round_0); + d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + horiz_const, shift_round_0); + + transpose_s16_4x4d(&d0, &d1, &d2, &d3); + + vst1_s16((dst_ptr + 0 * dst_stride), d0); + vst1_s16((dst_ptr + 1 * dst_stride), d1); + vst1_s16((dst_ptr + 2 * dst_stride), d2); + vst1_s16((dst_ptr + 3 * dst_stride), d3); + + src += 4 * src_stride; + dst_ptr += 4 * dst_stride; + height -= 4; +#else + t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + s0 = vget_low_s16(tt0); // a0 a1 a2 a3 + s4 = vget_high_s16(tt0); // a4 a5 a6 a7 + __builtin_prefetch(dst_ptr); + s += 8; + t0 = vld1_u8(s); // a8 a9 a10 a11 + + // a8 a9 a10 a11 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + + vst1_s16(dst_ptr, d0); + + src += src_stride; + dst_ptr += dst_stride; + height -= 1; +#endif + } while (height > 0); + } else { + int16_t *d_tmp; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t res0; + uint8x8_t t0; + + 
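+    // Wide blocks: produce eight output columns per pass. horiz_const
+    // pre-loads the accumulator with the first-pass rounding offset; the
+    // exponent is FILTER_BITS - 2 rather than FILTER_BITS - 1 because the
+    // caller halved the filter taps.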
const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2))); + const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0)); + do { +#if defined(__aarch64__) + uint8x8_t t1, t2, t3, t4, t5, t6, t7; + int16x8_t s8, s9, s10, s11, s12, s13, s14; + int16x8_t res1, res2, res3, res4, res5, res6, res7; + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src + 7; + d_tmp = dst_ptr; + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + __builtin_prefetch(dst_ptr + 4 * dst_stride); + __builtin_prefetch(dst_ptr + 5 * dst_stride); + __builtin_prefetch(dst_ptr + 6 * dst_stride); + __builtin_prefetch(dst_ptr + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + horiz_const, shift_round_0); + res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + horiz_const, shift_round_0); + res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + horiz_const, shift_round_0); + res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + horiz_const, shift_round_0); + res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp, + horiz_const, shift_round_0); + res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter_tmp, horiz_const, shift_round_0); + res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter_tmp, horiz_const, shift_round_0); + res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_tmp, horiz_const, shift_round_0); + + transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6, + &res7); + + store_s16_8x8(d_tmp, dst_stride, res0, res1, res2, res3, res4, res5, + res6, res7); + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst_ptr += 8 * dst_stride; + height -= 8; +#else + int16x8_t temp_0; + t0 = vld1_u8(src); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + width = w; + s = src + 8; + d_tmp = dst_ptr; + __builtin_prefetch(dst_ptr); + + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = 
vreinterpretq_s16_u16(vmovl_u8(t0)); + temp_0 = s0; + s0 = s7; + + s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, + x_filter_tmp, horiz_const, shift_round_0); + vst1q_s16(d_tmp, res0); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src += src_stride; + dst_ptr += dst_stride; + height -= 1; +#endif + } while (height > 0); + } +} + +static INLINE void dist_wtd_convolve_2d_vert_neon( + int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride, + ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) { + uint8_t *dst_u8_ptr, *d_u8; + CONV_BUF_TYPE *dst_ptr, *dst; + int16_t *src_ptr, *s; + uint16_t *d; + + const int bd = 8; + int height; + int dst_stride = conv_params->dst_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int16_t sub_const = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + + const int16_t round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = bd + 2 * FILTER_BITS - conv_params->round_0; + const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1)); + const int32x4_t offset_const = vdupq_n_s32(1 << offset); + const int16x4_t sub_const_vec = vdup_n_s16(sub_const); + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; + uint16x4_t res4, d0; + uint8x8_t t0; + +#if defined(__aarch64__) + int16x4_t s8, s9, s10; + uint16x4_t res5, res6, res7, d1, d2, d3; + uint8x8_t t1; +#endif + + dst = conv_params->dst; + src_ptr = im_block; + dst_u8_ptr = dst8; + dst_ptr = dst; + height = h; + + do { + d = dst_ptr; + d_u8 = dst_u8_ptr; + s = src_ptr; + height = h; + + __builtin_prefetch(s + 0 * im_stride); + __builtin_prefetch(s + 1 * im_stride); + __builtin_prefetch(s + 2 * im_stride); + __builtin_prefetch(s + 3 * im_stride); + __builtin_prefetch(s + 4 * im_stride); + __builtin_prefetch(s + 5 * im_stride); + __builtin_prefetch(s + 6 * im_stride); + __builtin_prefetch(s + 7 * im_stride); + + load_s16_4x8(s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + s += (7 * im_stride); + + do { +#if defined(__aarch64__) + load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10); + s += (im_stride << 2); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + __builtin_prefetch(d_u8 + 4 * dst8_stride); + __builtin_prefetch(d_u8 + 5 * dst8_stride); + __builtin_prefetch(d_u8 + 6 * dst8_stride); + __builtin_prefetch(d_u8 + 7 * dst8_stride); + + d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const); + d1 = convolve8_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter, + round_shift_vec, offset_const); + d2 = convolve8_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter, + round_shift_vec, offset_const); + d3 = 
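+        // d0..d3 are four rows of second-pass (vertical) output kept in
+        // the unsigned 16-bit compound domain: offset_const keeps the sums
+        // non-negative and round_shift_vec applies round_1.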
convolve8_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter, + round_shift_vec, offset_const); + + if (do_average) { + load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7); + d += (dst_stride << 2); + + compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset, + bck_offset, sub_const_vec, round_bits, + use_dist_wtd_comp_avg, &t0, &t1); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1); + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0); + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1); + d_u8 += dst8_stride; + + } else { + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + d += (dst_stride << 2); + } + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; +#else + s7 = vld1_s16(s); + s += (im_stride); + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d_u8 + 0 * dst8_stride); + + d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, + round_shift_vec, offset_const); + + if (do_average) { + res4 = vld1_u16(d); + d += (dst_stride); + + compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec, + round_bits, use_dist_wtd_comp_avg, &t0); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); + d_u8 += dst8_stride; + + } else { + vst1_u16(d, d0); + d += (dst_stride); + } + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height--; +#endif + } while (height > 0); + src_ptr += 4; + dst_ptr += 4; + dst_u8_ptr += 4; + w -= 4; + } while (w > 0); +} + +void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(!(w % 4)); + assert(!(h % 4)); + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]); + + const int im_h = h + filter_params_y->taps - 1; + const int im_stride = MAX_SB_SIZE; + const int vert_offset = filter_params_y->taps / 2 - 1; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const int round_0 = conv_params->round_0 - 1; + const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + int16_t x_filter_tmp[8]; + int16x8_t filter_x_coef = vld1q_s16(x_filter); + + // filter coeffs are even, so downshifting by 1 to reduce intermediate + // precision requirements. 
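+  // The 8-tap kernels sum to 128 (1 << FILTER_BITS) and every tap is even,
+  // so halving them loses no precision while freeing one bit of headroom in
+  // the 16-bit intermediates; round_0 above is reduced by one to match.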
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1); + vst1q_s16(&x_filter_tmp[0], filter_x_coef); + + dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, + x_filter_tmp, im_h, w, round_0); + + dist_wtd_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, + conv_params, y_filter, h, w); +} + +void av1_dist_wtd_convolve_2d_copy_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2, + tmp_shift3; + uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3; + uint16x4_t tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7; + const uint8_t *src1, *src2; + uint8_t *dst8_1; + CONV_BUF_TYPE *dst = conv_params->dst, *dst_1, *dst_2; + const int dst_stride = conv_params->dst_stride; + int x, y; + const int16_t bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int16x4_t sub_const_vec = vdup_n_s16((int16_t)round_offset); + const uint16x8_t dup_round_offset16x8 = vdupq_n_u16((uint16_t)round_offset); + const int16x4_t dup_bits16x4 = vdup_n_s16(bits); + const int16x8_t dup_bits16x8 = vdupq_n_s16(bits); + + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + + if (!(w & 0x07)) { + for (y = 0; y < (h >> 2); ++y) { + src1 = src; + dst8_1 = dst8; + dst_1 = dst; + for (x = 0; x < (w >> 3); ++x) { + src2 = src1; + load_u8_8x4(src2, src_stride, &res0_8, &res1_8, &res2_8, &res3_8); + + res_q0 = vaddq_u16(vshlq_u16(vmovl_u8(res0_8), dup_bits16x8), + dup_round_offset16x8); + res_q1 = vaddq_u16(vshlq_u16(vmovl_u8(res1_8), dup_bits16x8), + dup_round_offset16x8); + res_q2 = vaddq_u16(vshlq_u16(vmovl_u8(res2_8), dup_bits16x8), + dup_round_offset16x8); + res_q3 = vaddq_u16(vshlq_u16(vmovl_u8(res3_8), dup_bits16x8), + dup_round_offset16x8); + + if (conv_params->do_average) { + dst_2 = dst_1; + load_u16_8x4(dst_2, dst_stride, &tmp_q0, &tmp_q1, &tmp_q2, &tmp_q3); + + compute_avg_8x4(tmp_q0, tmp_q1, tmp_q2, tmp_q3, res_q0, res_q1, + res_q2, res_q3, conv_params->fwd_offset, + conv_params->bck_offset, sub_const_vec, bits, + conv_params->use_dist_wtd_comp_avg, &tmp_shift0, + &tmp_shift1, &tmp_shift2, &tmp_shift3); + + vst1_u8(dst8_1 + (0 * dst8_stride), tmp_shift0); + vst1_u8(dst8_1 + (1 * dst8_stride), tmp_shift1); + vst1_u8(dst8_1 + (2 * dst8_stride), tmp_shift2); + vst1_u8(dst8_1 + (3 * dst8_stride), tmp_shift3); + + } else { + vst1q_u16(dst_1 + (0 * dst_stride), res_q0); + vst1q_u16(dst_1 + (1 * dst_stride), res_q1); + vst1q_u16(dst_1 + (2 * dst_stride), res_q2); + vst1q_u16(dst_1 + (3 * dst_stride), res_q3); + } + src1 = src1 + 8; + dst_1 = dst_1 + 8; + dst8_1 = dst8_1 + 8; + } + src += src_stride * 4; + dst8 += dst8_stride * 4; + dst += dst_stride * 4; + } + } else if (!(w & 0x03)) { + for (y = 0; y < (h >> 2); ++y) { + src1 = src; + dst8_1 = dst8; + dst_1 = dst; + + load_u8_8x4(src1, src_stride, &res0_8, &res1_8, &res2_8, &res3_8); + + res4 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res0_8)), dup_bits16x4), + vreinterpret_u16_s16(sub_const_vec)); + res5 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res1_8)), dup_bits16x4), + 
vreinterpret_u16_s16(sub_const_vec)); + res6 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res2_8)), dup_bits16x4), + vreinterpret_u16_s16(sub_const_vec)); + res7 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(res3_8)), dup_bits16x4), + vreinterpret_u16_s16(sub_const_vec)); + if (conv_params->do_average) { + load_u16_4x4(dst_1, dst_stride, &tmp4, &tmp5, &tmp6, &tmp7); + + compute_avg_4x4(tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7, + conv_params->fwd_offset, conv_params->bck_offset, + sub_const_vec, bits, conv_params->use_dist_wtd_comp_avg, + &tmp_shift0, &tmp_shift1); + + vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 0); + dst8_1 += dst8_stride; + vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 1); + dst8_1 += dst8_stride; + vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift1), 0); + dst8_1 += dst8_stride; + vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift1), 1); + + } else { + vst1_u16(dst_1, res4); + dst_1 += dst_stride; + vst1_u16(dst_1, res5); + dst_1 += dst_stride; + vst1_u16(dst_1, res6); + dst_1 += dst_stride; + vst1_u16(dst_1, res7); + } + src += src_stride * 4; + dst += dst_stride * 4; + dst8 += dst8_stride * 4; + } + } +} + +void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(!(w % 4)); + assert(!(h % 4)); + + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int horiz_offset = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + (void)filter_params_y; + (void)subpel_y_qn; + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + const uint8_t *src_ptr = src - horiz_offset; + + int16_t x_filter_tmp[8]; + int16x8_t filter_x_coef = vld1q_s16(x_filter); + + // filter coeffs are even, so downshifting by 1 to reduce intermediate + // precision requirements. 
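+  // Same trick as in the 2-D path: the taps are even, so the >> 1 is exact,
+  // and shift_round_0 below shifts by one bit less to compensate.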
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1); + vst1q_s16(&x_filter_tmp[0], filter_x_coef); + + const uint8_t *s; + uint8_t *d_u8; + uint8_t *dst_u8_ptr; + CONV_BUF_TYPE *d, *dst_ptr; + int width, height; + uint8x8_t t0; +#if defined(__aarch64__) + uint8x8_t t1, t2, t3, t4, t5, t6, t7; +#endif + s = src_ptr; + dst_ptr = dst; + dst_u8_ptr = dst8; + width = w; + height = h; + + if ((w == 4) || (h == 4)) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0; + int16x8_t tt0; + uint16x4_t res4; +#if defined(__aarch64__) + int16x4_t s8, s9, s10, d1, d2, d3; + int16x8_t tt1, tt2, tt3; + uint16x4_t res5, res6, res7; + uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0); + int16x8_t u0, u1; +#else + int16x4_t temp_0; +#endif + const int16x4_t zero = vdup_n_s16(0); + const int16x4_t round_offset_vec = vdup_n_s16(round_offset); + const int16x4_t shift_round_0 = vdup_n_s16(-conv_params->round_0 + 1); + const int16x4_t horiz_const = vdup_n_s16(bits); + do { + s = src_ptr; + d = dst_ptr; + d_u8 = dst_u8_ptr; + width = w; + __builtin_prefetch(s + 0 * src_stride); +#if defined(__aarch64__) + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + transpose_u8_8x4(&t0, &t1, &t2, &t3); + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s0 = vget_low_s16(tt0); + s1 = vget_low_s16(tt1); + s2 = vget_low_s16(tt2); + s3 = vget_low_s16(tt3); + s4 = vget_high_s16(tt0); + s5 = vget_high_s16(tt1); + s6 = vget_high_s16(tt2); + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + s += 7; + do { + load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1); + t0 = vreinterpret_u8_u32(tu0); + t1 = vreinterpret_u8_u32(tu1); + + transpose_u8_4x4(&t0, &t1); + u0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + u1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + + s7 = vget_low_s16(u0); + s8 = vget_low_s16(u1); + s9 = vget_high_s16(u0); + s10 = vget_high_s16(u1); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + zero, shift_round_0); + d0 = vrshl_s16(d0, horiz_const); + d0 = vadd_s16(d0, round_offset_vec); + d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + zero, shift_round_0); + d1 = vrshl_s16(d1, horiz_const); + d1 = vadd_s16(d1, round_offset_vec); + d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + zero, shift_round_0); + d2 = vrshl_s16(d2, horiz_const); + d2 = vadd_s16(d2, round_offset_vec); + d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + zero, shift_round_0); + d3 = vrshl_s16(d3, horiz_const); + d3 = vadd_s16(d3, round_offset_vec); + + transpose_s16_4x4d(&d0, &d1, &d2, &d3); + + if (conv_params->do_average) { + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + + __builtin_prefetch(d_u8 + 0 * dst8_stride); + __builtin_prefetch(d_u8 + 1 * dst8_stride); + __builtin_prefetch(d_u8 + 2 * dst8_stride); + __builtin_prefetch(d_u8 + 3 * dst8_stride); + + load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7); + + compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0), + vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), + vreinterpret_u16_s16(d3), fwd_offset, bck_offset, + 
round_offset_vec, round_bits, use_dist_wtd_comp_avg, + &t0, &t1); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + vst1_lane_u32((uint32_t *)(d_u8 + dst8_stride), + vreinterpret_u32_u8(t0), + 1); // 10 11 12 13 + vst1_lane_u32((uint32_t *)(d_u8 + 2 * dst8_stride), + vreinterpret_u32_u8(t1), + 0); // 20 21 22 23 + vst1_lane_u32((uint32_t *)(d_u8 + 3 * dst8_stride), + vreinterpret_u32_u8(t1), + 1); // 30 31 32 33 + } else { + store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0), + vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), + vreinterpret_u16_s16(d3)); + } + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + + s += 4; + width -= 4; + d += 4; + d_u8 += 4; + } while (width > 0); + src_ptr += (src_stride << 2); + dst_ptr += (dst_stride << 2); + dst_u8_ptr += (dst8_stride << 2); + height -= 4; +#else + t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7 + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + s0 = vget_low_s16(tt0); // a0 a1 a2 a3 + s4 = vget_high_s16(tt0); // a4 a5 a6 a7 + __builtin_prefetch(d); + + s += 8; + do { + t0 = vld1_u8(s); // a8 a9 a10 a11 + + // a8 a9 a10 a11 + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + temp_0 = s7; + s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4 + s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5 + s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6 + s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8 + s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9 + s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10 + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + zero, shift_round_0); + d0 = vrshl_s16(d0, horiz_const); + d0 = vadd_s16(d0, round_offset_vec); + s0 = s4; + s4 = temp_0; + if (conv_params->do_average) { + __builtin_prefetch(d); + __builtin_prefetch(d_u8); + + res4 = vld1_u16(d); + + compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, + bck_offset, round_offset_vec, round_bits, + use_dist_wtd_comp_avg, &t0); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), + 0); // 00 01 02 03 + } else { + vst1_u16(d, vreinterpret_u16_s16(d0)); + } + + s += 4; + width -= 4; + d += 4; + d_u8 += 4; + } while (width > 0); + src_ptr += (src_stride); + dst_ptr += (dst_stride); + dst_u8_ptr += (dst8_stride); + height--; +#endif + } while (height > 0); + } else { + CONV_BUF_TYPE *d_tmp; + uint8_t *d_u8_tmp; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t res0; + uint16x8_t res8; + const int16x8_t round_offset128 = vdupq_n_s16(round_offset); + const int16x4_t round_offset64 = vdup_n_s16(round_offset); + const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1); + const int16x8_t horiz_const = vdupq_n_s16(bits); + const int16x8_t zero = vdupq_n_s16(0); + + d = dst_ptr = dst; + d_u8 = dst_u8_ptr = dst8; + do { +#if defined(__aarch64__) + int16x8_t s11, s12, s13, s14; + int16x8_t s8, s9, s10; + int16x8_t res1, res2, res3, res4, res5, res6, res7; + uint16x8_t res9, res10, res11; + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = 
vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + width = w; + s = src_ptr + 7; + d = dst_ptr; + d_u8_tmp = dst_u8_ptr; + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + __builtin_prefetch(dst_ptr + 4 * dst_stride); + __builtin_prefetch(dst_ptr + 5 * dst_stride); + __builtin_prefetch(dst_ptr + 6 * dst_stride); + __builtin_prefetch(dst_ptr + 7 * dst_stride); + + do { + d_u8 = d_u8_tmp; + d_tmp = d; + + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp, + zero, shift_round_0); + + res0 = vrshlq_s16(res0, horiz_const); + res0 = vaddq_s16(res0, round_offset128); + + res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp, + zero, shift_round_0); + res1 = vrshlq_s16(res1, horiz_const); + res1 = vaddq_s16(res1, round_offset128); + res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp, + zero, shift_round_0); + res2 = vrshlq_s16(res2, horiz_const); + res2 = vaddq_s16(res2, round_offset128); + res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp, + zero, shift_round_0); + res3 = vrshlq_s16(res3, horiz_const); + res3 = vaddq_s16(res3, round_offset128); + res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp, + zero, shift_round_0); + res4 = vrshlq_s16(res4, horiz_const); + res4 = vaddq_s16(res4, round_offset128); + res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, + x_filter_tmp, zero, shift_round_0); + res5 = vrshlq_s16(res5, horiz_const); + res5 = vaddq_s16(res5, round_offset128); + res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, + x_filter_tmp, zero, shift_round_0); + res6 = vrshlq_s16(res6, horiz_const); + res6 = vaddq_s16(res6, round_offset128); + res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, + x_filter_tmp, zero, shift_round_0); + res7 = vrshlq_s16(res7, horiz_const); + res7 = vaddq_s16(res7, round_offset128); + + transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6, + &res7); + + if (conv_params->do_average) { + load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); + d_tmp += (dst_stride << 2); + + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), + vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); + + store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); + d_u8 += (dst8_stride << 2); + + load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); + d_tmp += (dst_stride << 2); + + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), + vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7), fwd_offset, 
bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); + + store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); + d_u8 += (dst8_stride << 2); + } else { + store_u16_8x8( + d_tmp, dst_stride, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7)); + d_tmp += (dst_stride << 3); + } + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + d_u8_tmp += 8; + } while (width > 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * dst_stride; + dst_u8_ptr += 8 * dst8_stride; + height -= 8; +#else + int16x8_t temp_0; + __builtin_prefetch(src_ptr); + t0 = vld1_u8(src_ptr); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7 + + width = w; + s = src_ptr + 8; + d = dst_ptr; + d_u8_tmp = dst_u8_ptr; + + __builtin_prefetch(dst_ptr); + + do { + d_u8 = d_u8_tmp; + d_tmp = d; + + t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + temp_0 = s0; + s0 = s7; + + s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7, + x_filter_tmp, zero, shift_round_0); + + res0 = vrshlq_s16(res0, horiz_const); + res0 = vaddq_s16(res0, round_offset128); + + if (conv_params->do_average) { + res8 = vld1q_u16(d_tmp); + d_tmp += (dst_stride); + + compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, + bck_offset, round_offset64, round_bits, + use_dist_wtd_comp_avg, &t0); + + vst1_u8(d_u8, t0); + d_u8 += (dst8_stride); + } else { + vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0)); + d_tmp += (dst_stride); + } + + s += 8; + d += 8; + width -= 8; + d_u8_tmp += 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + dst_u8_ptr += dst8_stride; + height--; +#endif + } while (height > 0); + } +} + +void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + assert(!(w % 4)); + assert(!(h % 4)); + + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int vert_offset = filter_params_y->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const uint16_t fwd_offset = conv_params->fwd_offset; + const uint16_t bck_offset = conv_params->bck_offset; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int shift_value = (conv_params->round_1 - 1 - bits); + + (void)filter_params_x; + (void)subpel_x_qn; + + // vertical filter + const int16_t 
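+  // Select the 8-tap vertical kernel for the requested subpel offset.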
*y_filter = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_qn & SUBPEL_MASK);
+
+  const uint8_t *src_ptr = src - (vert_offset * src_stride);
+
+  int16_t y_filter_tmp[8];
+  int16x8_t filter_y_coef = vld1q_s16(y_filter);
+
+  // filter coeffs are even, so downshifting by 1 to reduce intermediate
+  // precision requirements.
+  filter_y_coef = vshrq_n_s16(filter_y_coef, 1);
+  vst1q_s16(&y_filter_tmp[0], filter_y_coef);
+
+  const uint8_t *s;
+  uint8_t *d_u8;
+  uint8_t *dst_u8_ptr;
+  CONV_BUF_TYPE *d, *dst_ptr;
+  int width, height;
+
+  s = src_ptr;
+  dst_ptr = dst;
+  dst_u8_ptr = dst8;
+  width = w;
+  height = h;
+
+  // used to get rid of multiplication = (vertical filter output sum) *
+  // (1 << bits).
+  assert((conv_params->round_1 - 2) >= bits);
+
+  if ((w == 4) || (h == 4)) {
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+    uint16x4_t res4;
+    uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
+               tu3 = vdup_n_u32(0);
+    int16x8_t u0, u1, u2, u3;
+    uint8x8_t t0;
+
+#if defined(__aarch64__)
+    int16x4_t s8, s9, s10, d1, d2, d3;
+    uint16x4_t res5, res6, res7;
+    uint8x8_t t1;
+#endif
+    const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+    const int16x4_t shift_vec = vdup_n_s16(-shift_value);
+    const int16x4_t zero = vdup_n_s16(0);
+
+    do {
+      s = src_ptr;
+      d = dst_ptr;
+      d_u8 = dst_u8_ptr;
+      height = h;
+      __builtin_prefetch(s + 0 * src_stride);
+      __builtin_prefetch(s + 1 * src_stride);
+      __builtin_prefetch(s + 2 * src_stride);
+      __builtin_prefetch(s + 3 * src_stride);
+
+      load_unaligned_u8_4x8(s, src_stride, &tu0, &tu1, &tu2, &tu3);
+
+      u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+      u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+      u2 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu2)));
+      u3 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu3)));
+
+      s0 = vget_low_s16(u0);
+      s1 = vget_high_s16(u0);
+      s2 = vget_low_s16(u1);
+      s3 = vget_high_s16(u1);
+      s4 = vget_low_s16(u2);
+      s5 = vget_high_s16(u2);
+      s6 = vget_low_s16(u3);
+
+      __builtin_prefetch(d + 0 * dst_stride);
+      __builtin_prefetch(d + 1 * dst_stride);
+      __builtin_prefetch(d + 2 * dst_stride);
+      __builtin_prefetch(d + 3 * dst_stride);
+
+      s += (7 * src_stride);
+      do {
+#if defined(__aarch64__)
+        load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
+
+        u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+        u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+
+        s7 = vget_low_s16(u0);
+        s8 = vget_high_s16(u0);
+        s9 = vget_low_s16(u1);
+        s10 = vget_high_s16(u1);
+
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+                               zero, shift_vec);
+        d0 = vadd_s16(d0, round_offset64);
+        d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
+                               zero, shift_vec);
+        d1 = vadd_s16(d1, round_offset64);
+        d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
+                               zero, shift_vec);
+        d2 = vadd_s16(d2, round_offset64);
+        d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
+                               zero, shift_vec);
+        d3 = vadd_s16(d3, round_offset64);
+
+        if (conv_params->do_average) {
+          __builtin_prefetch(d + 0 * dst_stride);
+          __builtin_prefetch(d + 1 * dst_stride);
+          __builtin_prefetch(d + 2 * dst_stride);
+          __builtin_prefetch(d + 3 * dst_stride);
+
+          __builtin_prefetch(d_u8 + 0 * dst8_stride);
+          __builtin_prefetch(d_u8 + 1 * dst8_stride);
+          __builtin_prefetch(d_u8 + 2 * dst8_stride);
+          __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+          load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7);
+          d += (dst_stride << 2);
+
+          compute_avg_4x4(res4, res5, 
res6, res7, vreinterpret_u16_s16(d0), + vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), + vreinterpret_u16_s16(d3), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1); + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0); + d_u8 += dst8_stride; + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1); + d_u8 += dst8_stride; + } else { + store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0), + vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), + vreinterpret_u16_s16(d3)); + d += (dst_stride << 2); + } + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + + s += (src_stride << 2); + height -= 4; +#else + load_unaligned_u8_4x1(s, src_stride, &tu0); + u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); + s7 = vget_low_s16(u0); + + d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp, + zero, shift_vec); + + d0 = vadd_s16(d0, round_offset64); + + if (conv_params->do_average) { + __builtin_prefetch(d); + + res4 = vld1_u16(d); + d += (dst_stride); + + compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, + bck_offset, round_offset64, round_bits, + use_dist_wtd_comp_avg, &t0); + + vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); + d_u8 += dst8_stride; + } else { + vst1_u16(d, vreinterpret_u16_s16(d0)); + d += (dst_stride); + } + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + s += (src_stride); + height--; +#endif + } while (height > 0); + src_ptr += 4; + dst_ptr += 4; + dst_u8_ptr += 4; + width -= 4; + } while (width > 0); + } else { + CONV_BUF_TYPE *d_tmp; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + int16x8_t res0; + uint16x8_t res8; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; + const int16x8_t round_offset128 = vdupq_n_s16(round_offset); + const int16x8_t shift_vec = vdupq_n_s16(-shift_value); + const int16x4_t round_offset64 = vdup_n_s16(round_offset); + const int16x8_t zero = vdupq_n_s16(0); +#if defined(__aarch64__) + int16x8_t s8, s9, s10, s11, s12, s13, s14; + int16x8_t res1, res2, res3, res4, res5, res6, res7; + uint16x8_t res10, res11, res9; +#endif + dst_ptr = dst; + dst_u8_ptr = dst8; + do { + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + height = h; + s = src_ptr + (7 * src_stride); + d_tmp = dst_ptr; + d_u8 = dst_u8_ptr; + + do { +#if defined(__aarch64__) + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s11 = 
vreinterpretq_s16_u16(vmovl_u8(t4)); + s12 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); + s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp, + zero, shift_vec); + res0 = vaddq_s16(res0, round_offset128); + res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp, + zero, shift_vec); + res1 = vaddq_s16(res1, round_offset128); + res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp, + zero, shift_vec); + res2 = vaddq_s16(res2, round_offset128); + res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp, + zero, shift_vec); + res3 = vaddq_s16(res3, round_offset128); + res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter_tmp, + zero, shift_vec); + res4 = vaddq_s16(res4, round_offset128); + res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12, + y_filter_tmp, zero, shift_vec); + res5 = vaddq_s16(res5, round_offset128); + res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13, + y_filter_tmp, zero, shift_vec); + res6 = vaddq_s16(res6, round_offset128); + res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14, + y_filter_tmp, zero, shift_vec); + res7 = vaddq_s16(res7, round_offset128); + + if (conv_params->do_average) { + __builtin_prefetch(d_tmp + 0 * dst8_stride); + __builtin_prefetch(d_tmp + 1 * dst8_stride); + __builtin_prefetch(d_tmp + 2 * dst8_stride); + __builtin_prefetch(d_tmp + 3 * dst8_stride); + + load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); + d_tmp += (dst_stride << 2); + + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), + vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); + + store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); + d_u8 += (dst8_stride << 2); + + load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); + d_tmp += (dst_stride << 2); + + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), + vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); + + store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); + d_u8 += (dst8_stride << 2); + } else { + store_u16_8x8( + d_tmp, dst_stride, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7)); + d_tmp += (dst_stride << 3); + } + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += (8 * src_stride); + height -= 8; +#else + s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); + + __builtin_prefetch(dst_ptr); + + res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp, + zero, shift_vec); + res0 = vaddq_s16(res0, round_offset128); + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + + if (conv_params->do_average) { + __builtin_prefetch(d_tmp); + + res8 = vld1q_u16(d_tmp); + d_tmp += (dst_stride); + + compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, + bck_offset, 
round_offset64, round_bits,
+                        use_dist_wtd_comp_avg, &t0);
+
+        vst1_u8(d_u8, t0);
+        d_u8 += (dst8_stride);
+      } else {
+        vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+        d_tmp += dst_stride;
+      }
+
+      s += (src_stride);
+      height--;
+#endif
+    } while (height > 0);
+    src_ptr += 8;
+    dst_ptr += 8;
+    dst_u8_ptr += 8;
+    width -= 8;
+    } while (width > 0);
+  }
+}
diff --git a/libs/libaom/src/av1/common/arm/mem_neon.h b/libs/libaom/src/av1/common/arm/mem_neon.h
new file mode 100644
index 000000000..171055fe1
--- /dev/null
+++ b/libs/libaom/src/av1/common/arm/mem_neon.h
@@ -0,0 +1,539 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
+#define AOM_AV1_COMMON_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <string.h>
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
+                                     const uint8x8_t s1) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+   to enforce that. */
+#define load_u8_4x1(s, s0, lane)                                           \
+  do {                                                                     \
+    *(s0) = vreinterpret_u8_u32(                                           \
+        vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
+  } while (0)
+
+static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6, uint8x8_t *const s7) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+  s += p;
+  *s4 = vld1_u8(s);
+  s += p;
+  *s5 = vld1_u8(s);
+  s += p;
+  *s6 = vld1_u8(s);
+  s += p;
+  *s7 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3) {
+  *s0 = vld1q_u8(s);
+  s += p;
+  *s1 = vld1q_u8(s);
+  s += p;
+  *s2 = vld1q_u8(s);
+  s += p;
+  *s3 = vld1q_u8(s);
+}
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3) {
+  *s0 = vld1_u8(s);
+  s += p;
+  *s1 = vld1_u8(s);
+  s += p;
+  *s2 = vld1_u8(s);
+  s += p;
+  *s3 = vld1_u8(s);
+}
+
+static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
+                                uint16x4_t *const s0, uint16x4_t *const s1,
+                                uint16x4_t *const s2, uint16x4_t *const s3) {
+  *s0 = vld1_u16(s);
+  s += p;
+  *s1 = vld1_u16(s);
+  s += p;
+  *s2 = vld1_u16(s);
+  s += p;
+  *s3 = vld1_u16(s);
+  s += p;
+}
+
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *const s0, uint16x8_t *const s1,
+                                uint16x8_t *const s2, uint16x8_t *const s3) {
+  *s0 = vld1q_u16(s);
+  s += p;
+  *s1 = vld1q_u16(s);
+  s += p;
+  *s2 = vld1q_u16(s);
+  s += p;
+  *s3 = vld1q_u16(s);
+  s += p;
+}
+
+static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3,
+                                int16x4_t *const s4, int16x4_t *const s5,
+                                int16x4_t *const s6, int16x4_t *const s7) {
+  *s0 = vld1_s16(s);
+  s += p;
+  *s1 = vld1_s16(s);
+  s += p;
+  *s2 = vld1_s16(s);
+  s += p;
+  *s3 = vld1_s16(s);
+  s += p;
+  *s4 =
vld1_s16(s); + s += p; + *s5 = vld1_s16(s); + s += p; + *s6 = vld1_s16(s); + s += p; + *s7 = vld1_s16(s); +} + +static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define store_u8_4x1(s, s0, lane) \ + do { \ + vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \ + } while (0) + +static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, + const uint8x8_t s1, const uint8x8_t s2, + const uint8x8_t s3, const uint8x8_t s4, + const uint8x8_t s5, const uint8x8_t s6, + const uint8x8_t s7) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); + s += p; + vst1_u8(s, s4); + s += p; + vst1_u8(s, s5); + s += p; + vst1_u8(s, s6); + s += p; + vst1_u8(s, s7); +} + +static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, + const uint8x8_t s1, const uint8x8_t s2, + const uint8x8_t s3) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); + s += p; + vst1_u8(s, s3); +} + +static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0, + const uint8x16_t s1, const uint8x16_t s2, + const uint8x16_t s3) { + vst1q_u8(s, s0); + s += p; + vst1q_u8(s, s1); + s += p; + vst1q_u8(s, s2); + s += p; + vst1q_u8(s, s3); +} + +static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3, + const uint16x8_t s4, const uint16x8_t s5, + const uint16x8_t s6, const uint16x8_t s7) { + vst1q_u16(s, s0); + s += dst_stride; + vst1q_u16(s, s1); + s += dst_stride; + vst1q_u16(s, s2); + s += dst_stride; + vst1q_u16(s, s3); + s += dst_stride; + vst1q_u16(s, s4); + s += dst_stride; + vst1q_u16(s, s5); + s += dst_stride; + vst1q_u16(s, s6); + s += dst_stride; + vst1q_u16(s, s7); +} + +static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride, + const uint16x4_t s0, const uint16x4_t s1, + const uint16x4_t s2, const uint16x4_t s3) { + vst1_u16(s, s0); + s += dst_stride; + vst1_u16(s, s1); + s += dst_stride; + vst1_u16(s, s2); + s += dst_stride; + vst1_u16(s, s3); +} + +static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3) { + vst1q_u16(s, s0); + s += dst_stride; + vst1q_u16(s, s1); + s += dst_stride; + vst1q_u16(s, s2); + s += dst_stride; + vst1q_u16(s, s3); +} + +static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride, + const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7) { + vst1q_s16(s, s0); + s += dst_stride; + vst1q_s16(s, s1); + s += dst_stride; + vst1q_s16(s, s2); + s += dst_stride; + vst1q_s16(s, s3); + s += dst_stride; + vst1q_s16(s, s4); + s += dst_stride; + vst1q_s16(s, s5); + s += dst_stride; + vst1q_s16(s, s6); + s += dst_stride; + vst1q_s16(s, s7); +} + +static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride, + const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3) { + vst1_s16(s, s0); + s += dst_stride; + vst1_s16(s, s1); + s += dst_stride; + vst1_s16(s, s2); + s += dst_stride; + vst1_s16(s, s3); +} + +static INLINE void 
store_s16_8x4(int16_t *s, ptrdiff_t dst_stride, + const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3) { + vst1q_s16(s, s0); + s += dst_stride; + vst1q_s16(s, s1); + s += dst_stride; + vst1q_s16(s, s2); + s += dst_stride; + vst1q_s16(s, s3); +} + +static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6, int16x8_t *const s7) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); +} + +static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); +} + +// Load 4 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { + uint32_t a; + uint32x4_t a_u32 = vdupq_n_u32(0); + if (stride == 4) return vld1q_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 0); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 1); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 2); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vsetq_lane_u32(a, a_u32, 3); + return vreinterpretq_u8_u32(a_u32); +} + +static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride, + uint32x2_t *tu0, uint32x2_t *tu1, + uint32x2_t *tu2, uint32x2_t *tu3) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 1); + memcpy(&a, buf, 4); + buf += stride; + *tu1 = vset_lane_u32(a, *tu1, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu1 = vset_lane_u32(a, *tu1, 1); + memcpy(&a, buf, 4); + buf += stride; + *tu2 = vset_lane_u32(a, *tu2, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu2 = vset_lane_u32(a, *tu2, 1); + memcpy(&a, buf, 4); + buf += stride; + *tu3 = vset_lane_u32(a, *tu3, 0); + memcpy(&a, buf, 4); + *tu3 = vset_lane_u32(a, *tu3, 1); +} + +static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride, + uint32x2_t *tu0, uint32x2_t *tu1) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 1); + memcpy(&a, buf, 4); + buf += stride; + *tu1 = vset_lane_u32(a, *tu1, 0); + memcpy(&a, buf, 4); + *tu1 = vset_lane_u32(a, *tu1, 1); +} + +static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride, + uint32x2_t *tu0) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); +} + +static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride, + uint32x2_t *tu0) { + uint32_t a; + + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 0); + memcpy(&a, buf, 4); + buf += stride; + *tu0 = vset_lane_u32(a, *tu0, 1); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. 
*/ +#define store_unaligned_u8_4x1(dst, src, lane) \ + do { \ + uint32_t a; \ + a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \ + memcpy(dst, &a, 4); \ + } while (0) + +static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride, + uint16x4_t *tu0) { + uint16_t a; + + memcpy(&a, buf, 2); + buf += stride; + *tu0 = vset_lane_u16(a, *tu0, 0); + memcpy(&a, buf, 2); + buf += stride; + *tu0 = vset_lane_u16(a, *tu0, 1); +} + +static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); +} + +static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride, + uint64x2_t *tu0, uint64x2_t *tu1) { + uint64_t a; + + memcpy(&a, buf, 8); + buf += stride; + *tu0 = vsetq_lane_u64(a, *tu0, 0); + memcpy(&a, buf, 8); + buf += stride; + *tu0 = vsetq_lane_u64(a, *tu0, 1); + memcpy(&a, buf, 8); + buf += stride; + *tu1 = vsetq_lane_u64(a, *tu1, 0); + memcpy(&a, buf, 8); + *tu1 = vsetq_lane_u64(a, *tu1, 1); +} + +static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1, + int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) { + *s1 = vld1q_s32(s); + s += p; + *s2 = vld1q_s32(s); + s += p; + *s3 = vld1q_s32(s); + s += p; + *s4 = vld1q_s32(s); +} + +static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1, + int32x4_t s2, int32x4_t s3, int32x4_t s4) { + vst1q_s32(s, s1); + s += p; + vst1q_s32(s, s2); + s += p; + vst1q_s32(s, s3); + s += p; + vst1q_s32(s, s4); +} + +static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1, + uint32x4_t *s2, uint32x4_t *s3, + uint32x4_t *s4) { + *s1 = vld1q_u32(s); + s += p; + *s2 = vld1q_u32(s); + s += p; + *s3 = vld1q_u32(s); + s += p; + *s4 = vld1q_u32(s); +} + +static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1, + uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) { + vst1q_u32(s, s1); + s += p; + vst1q_u32(s, s2); + s += p; + vst1q_u32(s, s3); + s += p; + vst1q_u32(s, s4); +} + +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +} + +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +} + +#endif // AOM_AV1_COMMON_ARM_MEM_NEON_H_ diff --git a/libs/libaom/src/av1/common/arm/reconinter_neon.c b/libs/libaom/src/av1/common/arm/reconinter_neon.c new file mode 100644 index 000000000..44e064195 --- /dev/null +++ b/libs/libaom/src/av1/common/arm/reconinter_neon.c @@ -0,0 +1,86 @@ +/* + * + * Copyright (c) 2018, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/blockd.h"
+#include "config/av1_rtcd.h"
+
+void av1_build_compound_diffwtd_mask_d16_neon(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  assert(h >= 4);
+  assert(w >= 4);
+  assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));
+  const int round =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+  uint16x8_t diff_q, tmp0, tmp1;
+  uint8x8_t diff_d, diff_select;
+  const CONV_BUF_TYPE *src0_1, *src1_1;
+  const int16x8_t dup_round = vdupq_n_s16((int16_t)(-round));
+  const uint8x8_t dup_38 = vdup_n_u8(38);
+  const uint8x8_t dup_64 = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
+  if (mask_type == DIFFWTD_38) {
+    diff_select = vdup_n_u8(255);
+  } else {
+    diff_select = vdup_n_u8(0);
+  }
+  // Per lane: m = AOMMIN(38 + (rounded(|src0 - src1| >> round) >>
+  // DIFF_FACTOR_LOG2), 64); DIFFWTD_38 stores m, DIFFWTD_38_INV stores 64 - m.
+  if (w >= 8) {
+    for (int i = 0; i < h; ++i) {
+      src0_1 = src0;
+      src1_1 = src1;
+      for (int j = 0; j < w; j += 8) {
+        __builtin_prefetch(src0_1);
+        __builtin_prefetch(src1_1);
+        diff_q = vabdq_u16(vld1q_u16(src0_1), vld1q_u16(src1_1));
+        diff_q = vrshlq_u16(diff_q, dup_round);
+        diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
+        diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
+        diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
+        vst1_u8(mask, diff_d);
+        src0_1 += 8;
+        src1_1 += 8;
+        mask += 8;
+      }
+      src0 += src0_stride;
+      src1 += src1_stride;
+    }
+  } else if (w == 4) {
+    for (int i = 0; i < h; i += 2) {
+      src0_1 = src0;
+      src1_1 = src1;
+      __builtin_prefetch(src0_1 + 0 * src0_stride);
+      __builtin_prefetch(src0_1 + 1 * src0_stride);
+      __builtin_prefetch(src1_1 + 0 * src1_stride);
+      __builtin_prefetch(src1_1 + 1 * src1_stride);
+      tmp0 = vcombine_u16(vld1_u16(src0_1 + (0 * src0_stride)),
+                          vld1_u16(src0_1 + (1 * src0_stride)));
+      tmp1 = vcombine_u16(vld1_u16(src1_1 + (0 * src1_stride)),
+                          vld1_u16(src1_1 + (1 * src1_stride)));
+      diff_q = vabdq_u16(tmp0, tmp1);
+      diff_q = vrshlq_u16(diff_q, dup_round);
+      diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
+      diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
+      diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
+      vst1_u8(mask, diff_d);
+      src0 += src0_stride * 2;
+      src1 += src1_stride * 2;
+      mask += w * 2;
+    }
+  }
+}
diff --git a/libs/libaom/src/av1/common/arm/selfguided_neon.c b/libs/libaom/src/av1/common/arm/selfguided_neon.c
new file mode 100644
index 000000000..fc404a64a
--- /dev/null
+++ b/libs/libaom/src/av1/common/arm/selfguided_neon.c
@@ -0,0 +1,1590 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/common.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// Constants used for right shift in final_filter calculation.
+#define NB_EVEN 5
+#define NB_ODD 4
+
+static INLINE void calc_ab_fast_internal_common(
+    uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+    uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5,
+    int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec,
+    uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec,
+    uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2,
+    const int buf_stride) {
+  uint32x4_t q0, q1, q2, q3;
+  uint32x4_t p0, p1, p2, p3;
+  uint16x4_t d0, d1, d2, d3;
+
+  s0 = vmulq_u32(s0, const_n_val);
+  s1 = vmulq_u32(s1, const_n_val);
+  s2 = vmulq_u32(s2, const_n_val);
+  s3 = vmulq_u32(s3, const_n_val);
+
+  q0 = vmulq_u32(s4, s4);
+  q1 = vmulq_u32(s5, s5);
+  q2 = vmulq_u32(s6, s6);
+  q3 = vmulq_u32(s7, s7);
+
+  p0 = vcleq_u32(q0, s0);
+  p1 = vcleq_u32(q1, s1);
+  p2 = vcleq_u32(q2, s2);
+  p3 = vcleq_u32(q3, s3);
+
+  q0 = vsubq_u32(s0, q0);
+  q1 = vsubq_u32(s1, q1);
+  q2 = vsubq_u32(s2, q2);
+  q3 = vsubq_u32(s3, q3);
+
+  p0 = vandq_u32(p0, q0);
+  p1 = vandq_u32(p1, q1);
+  p2 = vandq_u32(p2, q2);
+  p3 = vandq_u32(p3, q3);
+
+  p0 = vmulq_u32(p0, s_vec);
+  p1 = vmulq_u32(p1, s_vec);
+  p2 = vmulq_u32(p2, s_vec);
+  p3 = vmulq_u32(p3, s_vec);
+
+  p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
+  p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+  p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+  p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+
+  p0 = vminq_u32(p0, const_val);
+  p1 = vminq_u32(p1, const_val);
+  p2 = vminq_u32(p2, const_val);
+  p3 = vminq_u32(p3, const_val);
+
+  {
+    store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
+
+    for (int x = 0; x < 4; x++) {
+      for (int y = 0; y < 4; y++) {
+        dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]];
+      }
+    }
+    load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3);
+  }
+  p0 = vsubl_u16(sgrproj_sgr, d0);
+  p1 = vsubl_u16(sgrproj_sgr, d1);
+  p2 = vsubl_u16(sgrproj_sgr, d2);
+  p3 = vsubl_u16(sgrproj_sgr, d3);
+
+  s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec);
+  s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec);
+  s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec);
+  s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec);
+
+  s4 = vmulq_u32(s4, p0);
+  s5 = vmulq_u32(s5, p1);
+  s6 = vmulq_u32(s6, p2);
+  s7 = vmulq_u32(s7, p3);
+
+  p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+  p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+  p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+  p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+
+  store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0),
+                vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+                vreinterpretq_s32_u32(p3));
+}
+static INLINE void calc_ab_internal_common(
+    uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+    uint32x4_t s5, uint32x4_t s6, uint32x4_t s7,
uint16x8_t s16_0, + uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4, + uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7, + uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val, + uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1, + uint16_t *dst_A16, int32_t *dst2, const int buf_stride) { + uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7; + uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7; + uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7; + + s0 = vmulq_u32(s0, const_n_val); + s1 = vmulq_u32(s1, const_n_val); + s2 = vmulq_u32(s2, const_n_val); + s3 = vmulq_u32(s3, const_n_val); + s4 = vmulq_u32(s4, const_n_val); + s5 = vmulq_u32(s5, const_n_val); + s6 = vmulq_u32(s6, const_n_val); + s7 = vmulq_u32(s7, const_n_val); + + d0 = vget_low_u16(s16_4); + d1 = vget_low_u16(s16_5); + d2 = vget_low_u16(s16_6); + d3 = vget_low_u16(s16_7); + d4 = vget_high_u16(s16_4); + d5 = vget_high_u16(s16_5); + d6 = vget_high_u16(s16_6); + d7 = vget_high_u16(s16_7); + + q0 = vmull_u16(d0, d0); + q1 = vmull_u16(d1, d1); + q2 = vmull_u16(d2, d2); + q3 = vmull_u16(d3, d3); + q4 = vmull_u16(d4, d4); + q5 = vmull_u16(d5, d5); + q6 = vmull_u16(d6, d6); + q7 = vmull_u16(d7, d7); + + p0 = vcleq_u32(q0, s0); + p1 = vcleq_u32(q1, s1); + p2 = vcleq_u32(q2, s2); + p3 = vcleq_u32(q3, s3); + p4 = vcleq_u32(q4, s4); + p5 = vcleq_u32(q5, s5); + p6 = vcleq_u32(q6, s6); + p7 = vcleq_u32(q7, s7); + + q0 = vsubq_u32(s0, q0); + q1 = vsubq_u32(s1, q1); + q2 = vsubq_u32(s2, q2); + q3 = vsubq_u32(s3, q3); + q4 = vsubq_u32(s4, q4); + q5 = vsubq_u32(s5, q5); + q6 = vsubq_u32(s6, q6); + q7 = vsubq_u32(s7, q7); + + p0 = vandq_u32(p0, q0); + p1 = vandq_u32(p1, q1); + p2 = vandq_u32(p2, q2); + p3 = vandq_u32(p3, q3); + p4 = vandq_u32(p4, q4); + p5 = vandq_u32(p5, q5); + p6 = vandq_u32(p6, q6); + p7 = vandq_u32(p7, q7); + + p0 = vmulq_u32(p0, s_vec); + p1 = vmulq_u32(p1, s_vec); + p2 = vmulq_u32(p2, s_vec); + p3 = vmulq_u32(p3, s_vec); + p4 = vmulq_u32(p4, s_vec); + p5 = vmulq_u32(p5, s_vec); + p6 = vmulq_u32(p6, s_vec); + p7 = vmulq_u32(p7, s_vec); + + p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS); + p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS); + p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS); + p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS); + p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS); + p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS); + p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS); + p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS); + + p0 = vminq_u32(p0, const_val); + p1 = vminq_u32(p1, const_val); + p2 = vminq_u32(p2, const_val); + p3 = vminq_u32(p3, const_val); + p4 = vminq_u32(p4, const_val); + p5 = vminq_u32(p5, const_val); + p6 = vminq_u32(p6, const_val); + p7 = vminq_u32(p7, const_val); + + { + store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3); + store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7); + + for (int x = 0; x < 4; x++) { + for (int y = 0; y < 8; y++) { + dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]]; + } + } + load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7); + } + + s16_4 = vsubq_u16(sgrproj_sgr, s16_4); + s16_5 = vsubq_u16(sgrproj_sgr, s16_5); + s16_6 = vsubq_u16(sgrproj_sgr, s16_6); + s16_7 = vsubq_u16(sgrproj_sgr, s16_7); + + s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec); + s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec); + s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec); + s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec); + s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec); + s5 = 
vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec); + s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec); + s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec); + + s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4))); + s1 = vmulq_u32(s1, vmovl_u16(vget_low_u16(s16_5))); + s2 = vmulq_u32(s2, vmovl_u16(vget_low_u16(s16_6))); + s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7))); + s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4))); + s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5))); + s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6))); + s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7))); + + p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS); + p1 = vrshrq_n_u32(s1, SGRPROJ_RECIP_BITS); + p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS); + p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS); + p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS); + p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS); + p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS); + p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS); + + store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0), + vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2), + vreinterpretq_s32_u32(p3)); + store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(p4), + vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6), + vreinterpretq_s32_u32(p7)); +} + +static INLINE void boxsum2_square_sum_calc( + int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5, + int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10, + int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) { + int32x4_t d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11; + int32x4_t r12, r34, r67, r89, r1011; + int32x4_t r345, r6789, r789; + + d1 = vmull_s16(t1, t1); + d2 = vmull_s16(t2, t2); + d3 = vmull_s16(t3, t3); + d4 = vmull_s16(t4, t4); + d5 = vmull_s16(t5, t5); + d6 = vmull_s16(t6, t6); + d7 = vmull_s16(t7, t7); + d8 = vmull_s16(t8, t8); + d9 = vmull_s16(t9, t9); + d10 = vmull_s16(t10, t10); + d11 = vmull_s16(t11, t11); + + r12 = vaddq_s32(d1, d2); + r34 = vaddq_s32(d3, d4); + r67 = vaddq_s32(d6, d7); + r89 = vaddq_s32(d8, d9); + r1011 = vaddq_s32(d10, d11); + r345 = vaddq_s32(r34, d5); + r6789 = vaddq_s32(r67, r89); + r789 = vsubq_s32(r6789, d6); + *r0 = vaddq_s32(r12, r345); + *r1 = vaddq_s32(r67, r345); + *r2 = vaddq_s32(d5, r6789); + *r3 = vaddq_s32(r789, r1011); +} + +static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16, + int32_t *dst32, int32_t *dst2, const int dst_stride, + const int width, const int height) { + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + int16_t *dst1_16_ptr, *src_ptr; + int32_t *dst2_ptr; + int h, w, count = 0; + const int dst_stride_2 = (dst_stride << 1); + const int dst_stride_8 = (dst_stride << 3); + + dst1_16_ptr = dst16; + dst2_ptr = dst2; + src_ptr = src; + w = width; + { + int16x8_t t1, t2, t3, t4, t5, t6, t7; + int16x8_t t8, t9, t10, t11, t12; + + int16x8_t q12345, q56789, q34567, q7891011; + int16x8_t q12, q34, q67, q89, q1011; + int16x8_t q345, q6789, q789; + + int32x4_t r12345, r56789, r34567, r7891011; + + do { + h = height; + dst1_16_ptr = dst16 + (count << 3); + dst2_ptr = dst2 + (count << 3); + src_ptr = src + (count << 3); + + dst1_16_ptr += dst_stride_2; + dst2_ptr += dst_stride_2; + do { + load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4); + src_ptr += 4 * src_stride; + load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8); + src_ptr += 4 * src_stride; + load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12); + + q12 = vaddq_s16(t1, t2); + q34 = 
vaddq_s16(t3, t4); + q67 = vaddq_s16(t6, t7); + q89 = vaddq_s16(t8, t9); + q1011 = vaddq_s16(t10, t11); + q345 = vaddq_s16(q34, t5); + q6789 = vaddq_s16(q67, q89); + q789 = vaddq_s16(q89, t7); + q12345 = vaddq_s16(q12, q345); + q34567 = vaddq_s16(q67, q345); + q56789 = vaddq_s16(t5, q6789); + q7891011 = vaddq_s16(q789, q1011); + + store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789, + q7891011); + dst1_16_ptr += dst_stride_8; + + boxsum2_square_sum_calc( + vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3), + vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6), + vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9), + vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789, + &r7891011); + + store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011); + + boxsum2_square_sum_calc( + vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3), + vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6), + vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9), + vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789, + &r7891011); + + store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789, + r7891011); + dst2_ptr += (dst_stride_8); + h -= 8; + } while (h > 0); + w -= 8; + count++; + } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst16, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst16, dst2 buffer which is not filled in first stage. + for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } + } + + { + int16x4_t s1, s2, s3, s4, s5, s6, s7, s8; + int32x4_t d1, d2, d3, d4, d5, d6, d7, d8; + int32x4_t q12345, q34567, q23456, q45678; + int32x4_t q23, q45, q67; + int32x4_t q2345, q4567; + + int32x4_t r12345, r34567, r23456, r45678; + int32x4_t r23, r45, r67; + int32x4_t r2345, r4567; + + int32_t *src2_ptr, *dst1_32_ptr; + int16_t *src1_ptr; + count = 0; + h = height; + do { + dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2); + dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2); + src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2); + src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2); + w = width; + + dst1_32_ptr += 2; + dst2_ptr += 2; + load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4); + transpose_s16_4x4d(&s1, &s2, &s3, &s4); + load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4); + transpose_s32_4x4(&d1, &d2, &d3, &d4); + do { + src1_ptr += 4; + src2_ptr += 4; + load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8); + transpose_s16_4x4d(&s5, &s6, &s7, &s8); + load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8); + transpose_s32_4x4(&d5, &d6, &d7, &d8); + q23 = vaddl_s16(s2, s3); + q45 = vaddl_s16(s4, s5); + q67 = vaddl_s16(s6, s7); + q2345 = vaddq_s32(q23, q45); + q4567 = vaddq_s32(q45, q67); + q12345 = vaddq_s32(vmovl_s16(s1), q2345); + q23456 = vaddq_s32(q2345, vmovl_s16(s6)); + q34567 = vaddq_s32(q4567, vmovl_s16(s3)); + q45678 = vaddq_s32(q4567, vmovl_s16(s8)); + + transpose_s32_4x4(&q12345, &q23456, &q34567, &q45678); + store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567, + q45678); + dst1_32_ptr += 4; + s1 = s5; + s2 = s6; 
+ s3 = s7; + s4 = s8; + + r23 = vaddq_s32(d2, d3); + r45 = vaddq_s32(d4, d5); + r67 = vaddq_s32(d6, d7); + r2345 = vaddq_s32(r23, r45); + r4567 = vaddq_s32(r45, r67); + r12345 = vaddq_s32(d1, r2345); + r23456 = vaddq_s32(r2345, d6); + r34567 = vaddq_s32(r4567, d3); + r45678 = vaddq_s32(r4567, d8); + + transpose_s32_4x4(&r12345, &r23456, &r34567, &r45678); + store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678); + dst2_ptr += 4; + d1 = d5; + d2 = d6; + d3 = d7; + d4 = d8; + w -= 4; + } while (w > 0); + h -= 8; + count++; + } while (h > 0); + } +} + +static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16, + uint16_t *B16, int32_t *B, + const int buf_stride, const int width, + const int height, const int r, + const int s, const int ht_inc) { + int32_t *src1, *dst2, count = 0; + uint16_t *dst_A16, *src2; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7; + + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + dst_A16 = A16 + (count << 2) * buf_stride; + src1 = A + (count << 2) * buf_stride; + src2 = B16 + (count << 2) * buf_stride; + dst2 = B + (count << 2) * buf_stride; + w = width; + do { + load_u32_4x4((uint32_t *)src1, buf_stride, &s0, &s1, &s2, &s3); + load_u32_4x4((uint32_t *)src1 + 4, buf_stride, &s4, &s5, &s6, &s7); + load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3); + + s16_4 = s16_0; + s16_5 = s16_1; + s16_6 = s16_2; + s16_7 = s16_3; + + calc_ab_internal_common( + s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4, + s16_5, s16_6, s16_7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride); + + w -= 8; + dst2 += 8; + src1 += 8; + src2 += 8; + dst_A16 += 8; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16, + uint16_t *B16, int32_t *B, + const int buf_stride, const int width, + const int height, const int bit_depth, + const int r, const int s, + const int ht_inc) { + int32_t *src1, *dst2, count = 0; + uint16_t *dst_A16, *src2; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const int16x8_t bd_min_2_vec = vdupq_n_s16(-(bit_depth - 8)); + const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR); + const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint16x8_t s16_0, s16_1, s16_2, s16_3; + uint16x8_t s16_4, s16_5, s16_6, s16_7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B16 + (count << 2) * buf_stride; + dst2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src1 + 4, buf_stride, &sr4, &sr5, &sr6, &sr7); + load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3); + + s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), 
bd_min_1_vec); + s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec); + s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec); + s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec); + s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_1_vec); + s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_1_vec); + s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_1_vec); + s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_1_vec); + + s16_4 = vrshlq_u16(s16_0, bd_min_2_vec); + s16_5 = vrshlq_u16(s16_1, bd_min_2_vec); + s16_6 = vrshlq_u16(s16_2, bd_min_2_vec); + s16_7 = vrshlq_u16(s16_3, bd_min_2_vec); + + calc_ab_internal_common( + s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4, + s16_5, s16_6, s16_7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride); + + w -= 8; + dst2 += 8; + src1 += 8; + src2 += 8; + dst_A16 += 8; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16, + int32_t *B, const int buf_stride, + const int width, const int height, + const int r, const int s, + const int ht_inc) { + int32_t *src1, *src2, count = 0; + uint16_t *dst_A16; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B + (count << 2) * buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7); + + s0 = vreinterpretq_u32_s32(sr0); + s1 = vreinterpretq_u32_s32(sr1); + s2 = vreinterpretq_u32_s32(sr2); + s3 = vreinterpretq_u32_s32(sr3); + s4 = vreinterpretq_u32_s32(sr4); + s5 = vreinterpretq_u32_s32(sr5); + s6 = vreinterpretq_u32_s32(sr6); + s7 = vreinterpretq_u32_s32(sr7); + + calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5, + sr6, sr7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, + dst_A16, src2, buf_stride); + + w -= 4; + src1 += 4; + src2 += 4; + dst_A16 += 4; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16, + int32_t *B, const int buf_stride, + const int width, const int height, + const int bit_depth, const int r, + const int s, const int ht_inc) { + int32_t *src1, *src2, count = 0; + uint16_t *dst_A16; + const uint32_t n = (2 * r + 1) * (2 * r + 1); + const int32x4_t bd_min_2_vec = vdupq_n_s32(-(bit_depth - 8)); + const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1)); + const uint32x4_t const_n_val = vdupq_n_u32(n); + const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR); + const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]); + const uint32x4_t const_val = vdupq_n_u32(255); + + int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7; + uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7; + + const uint32x4_t s_vec = vdupq_n_u32(s); + int w, h = height; + + do { + src1 = A + (count << 2) * buf_stride; + src2 = B + (count << 2) * 
buf_stride; + dst_A16 = A16 + (count << 2) * buf_stride; + w = width; + do { + load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3); + load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7); + + s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec); + s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec); + s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec); + s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec); + s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_2_vec); + s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_2_vec); + s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_2_vec); + s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_2_vec); + + calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5, + sr6, sr7, const_n_val, s_vec, const_val, + one_by_n_minus_1_vec, sgrproj_sgr, src1, + dst_A16, src2, buf_stride); + + w -= 4; + src1 += 4; + src2 += 4; + dst_A16 += 4; + } while (w > 0); + count++; + h -= (ht_inc * 4); + } while (h > 0); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1, + int32_t *dst2, const int dst_stride, const int width, + const int height) { + assert(width > 2 * SGRPROJ_BORDER_HORZ); + assert(height > 2 * SGRPROJ_BORDER_VERT); + + int16_t *src_ptr; + int32_t *dst2_ptr; + uint16_t *dst1_ptr; + int h, w, count = 0; + + w = width; + { + int16x8_t s1, s2, s3, s4, s5, s6, s7, s8; + int16x8_t q23, q34, q56, q234, q345, q456, q567; + int32x4_t r23, r56, r345, r456, r567, r78, r678; + int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high; + int32x4_t r2, r3, r5, r6, r7, r8; + int16x8_t q678, q78; + + do { + dst1_ptr = dst1 + (count << 3); + dst2_ptr = dst2 + (count << 3); + src_ptr = src + (count << 3); + h = height; + + load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); + src_ptr += 4 * src_stride; + + q23 = vaddq_s16(s2, s3); + q234 = vaddq_s16(q23, s4); + q34 = vaddq_s16(s3, s4); + dst1_ptr += (dst_stride << 1); + + r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2)); + r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3)); + r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4)); + r23 = vaddq_s32(r2, r3); + r234_low = vaddq_s32(r23, r4_low); + r34_low = vaddq_s32(r3, r4_low); + + r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2)); + r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3)); + r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4)); + r23 = vaddq_s32(r2, r3); + r234_high = vaddq_s32(r23, r4_high); + r34_high = vaddq_s32(r3, r4_high); + + dst2_ptr += (dst_stride << 1); + + do { + load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8); + src_ptr += 4 * src_stride; + + q345 = vaddq_s16(s5, q34); + q56 = vaddq_s16(s5, s6); + q456 = vaddq_s16(s4, q56); + q567 = vaddq_s16(s7, q56); + q78 = vaddq_s16(s7, s8); + q678 = vaddq_s16(s6, q78); + + store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567); + dst1_ptr += (dst_stride << 2); + + s4 = s8; + q34 = q78; + q234 = q678; + + r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5)); + r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6)); + r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7)); + r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8)); + + r345 = vaddq_s32(r5, r34_low); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4_low, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567); + + r4_low = r8; + r34_low = r78; + r234_low = r678; + + r5 = vmull_s16(vget_high_s16(s5), 
vget_high_s16(s5)); + r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6)); + r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7)); + r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8)); + + r345 = vaddq_s32(r5, r34_high); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4_high, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567); + dst2_ptr += (dst_stride << 2); + + r4_high = r8; + r34_high = r78; + r234_high = r678; + + h -= 4; + } while (h > 0); + w -= 8; + count++; + } while (w > 0); + + // memset needed for row pixels as 2nd stage of boxsum filter uses + // first 2 rows of dst1, dst2 buffer which is not filled in first stage. + for (int x = 0; x < 2; x++) { + memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1)); + memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2)); + } + + // memset needed for extra columns as 2nd stage of boxsum filter uses + // last 2 columns of dst1, dst2 buffer which is not filled in first stage. + for (int x = 2; x < height + 2; x++) { + int dst_offset = x * dst_stride + width + 2; + memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1)); + memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2)); + } + } + + { + int16x4_t d1, d2, d3, d4, d5, d6, d7, d8; + int16x4_t q23, q34, q56, q234, q345, q456, q567; + int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678; + int32x4_t r1, r2, r3, r4, r5, r6, r7, r8; + int16x4_t q678, q78; + + int32_t *src2_ptr; + uint16_t *src1_ptr; + count = 0; + h = height; + w = width; + do { + dst1_ptr = dst1 + (count << 2) * dst_stride; + dst2_ptr = dst2 + (count << 2) * dst_stride; + src1_ptr = dst1 + (count << 2) * dst_stride; + src2_ptr = dst2 + (count << 2) * dst_stride; + w = width; + + load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4); + transpose_s16_4x4d(&d1, &d2, &d3, &d4); + load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4); + transpose_s32_4x4(&r1, &r2, &r3, &r4); + src1_ptr += 4; + src2_ptr += 4; + + q23 = vadd_s16(d2, d3); + q234 = vadd_s16(q23, d4); + q34 = vadd_s16(d3, d4); + dst1_ptr += 2; + r23 = vaddq_s32(r2, r3); + r234 = vaddq_s32(r23, r4); + r34 = vaddq_s32(r3, r4); + dst2_ptr += 2; + + do { + load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8); + transpose_s16_4x4d(&d5, &d6, &d7, &d8); + load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8); + transpose_s32_4x4(&r5, &r6, &r7, &r8); + src1_ptr += 4; + src2_ptr += 4; + + q345 = vadd_s16(d5, q34); + q56 = vadd_s16(d5, d6); + q456 = vadd_s16(d4, q56); + q567 = vadd_s16(d7, q56); + q78 = vadd_s16(d7, d8); + q678 = vadd_s16(d6, q78); + transpose_s16_4x4d(&q234, &q345, &q456, &q567); + store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567); + dst1_ptr += 4; + + d4 = d8; + q34 = q78; + q234 = q678; + + r345 = vaddq_s32(r5, r34); + r56 = vaddq_s32(r5, r6); + r456 = vaddq_s32(r4, r56); + r567 = vaddq_s32(r7, r56); + r78 = vaddq_s32(r7, r8); + r678 = vaddq_s32(r6, r78); + transpose_s32_4x4(&r234, &r345, &r456, &r567); + store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567); + dst2_ptr += 4; + + r4 = r8; + r34 = r78; + r234 = r678; + w -= 4; + } while (w > 0); + h -= 4; + count++; + } while (h > 0); + } +} + +static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) { + int32x4_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; + int32x4_t fours, threes, res; + + xtl = vld1q_s32(buf - buf_stride - 1); + xt = vld1q_s32(buf - buf_stride); + xtr = vld1q_s32(buf - buf_stride + 1); + xl = vld1q_s32(buf - 
1); + x = vld1q_s32(buf); + xr = vld1q_s32(buf + 1); + xbl = vld1q_s32(buf + buf_stride - 1); + xb = vld1q_s32(buf + buf_stride); + xbr = vld1q_s32(buf + buf_stride + 1); + + fours = vaddq_s32(xl, vaddq_s32(xt, vaddq_s32(xr, vaddq_s32(xb, x)))); + threes = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl))); + res = vsubq_s32(vshlq_n_s32(vaddq_s32(fours, threes), 2), threes); + return res; +} + +static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride, + int32x4_t *a0, int32x4_t *a1) { + uint16x8_t xtr, xt, xtl, xl, x, xr, xbr, xb, xbl; + uint16x8_t r0, r1; + + xtl = vld1q_u16(buf - buf_stride - 1); + xt = vld1q_u16(buf - buf_stride); + xtr = vld1q_u16(buf - buf_stride + 1); + xl = vld1q_u16(buf - 1); + x = vld1q_u16(buf); + xr = vld1q_u16(buf + 1); + xbl = vld1q_u16(buf + buf_stride - 1); + xb = vld1q_u16(buf + buf_stride); + xbr = vld1q_u16(buf + buf_stride + 1); + + xb = vaddq_u16(xb, x); + xt = vaddq_u16(xt, xr); + xl = vaddq_u16(xl, xb); + xl = vaddq_u16(xl, xt); + + r0 = vshlq_n_u16(xl, 2); + + xbl = vaddq_u16(xbl, xbr); + xtl = vaddq_u16(xtl, xtr); + xtl = vaddq_u16(xtl, xbl); + + r1 = vshlq_n_u16(xtl, 2); + r1 = vsubq_u16(r1, xtl); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(r0)), vmovl_u16(vget_low_u16(r1)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(r0)), vmovl_u16(vget_high_u16(r1)))); +} + +static INLINE int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) { + int32x4_t xtr, xt, xtl, xbr, xb, xbl; + int32x4_t fives, sixes, fives_plus_sixes; + + xtl = vld1q_s32(buf - buf_stride - 1); + xt = vld1q_s32(buf - buf_stride); + xtr = vld1q_s32(buf - buf_stride + 1); + xbl = vld1q_s32(buf + buf_stride - 1); + xb = vld1q_s32(buf + buf_stride); + xbr = vld1q_s32(buf + buf_stride + 1); + + fives = vaddq_s32(xtl, vaddq_s32(xtr, vaddq_s32(xbr, xbl))); + sixes = vaddq_s32(xt, xb); + fives_plus_sixes = vaddq_s32(fives, sixes); + + return vaddq_s32( + vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); +} + +static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride, + int32x4_t *a0, int32x4_t *a1) { + uint16x8_t xtr, xt, xtl, xbr, xb, xbl, xb0; + + xtl = vld1q_u16(buf - buf_stride - 1); + xt = vld1q_u16(buf - buf_stride); + xtr = vld1q_u16(buf - buf_stride + 1); + xbl = vld1q_u16(buf + buf_stride - 1); + xb = vld1q_u16(buf + buf_stride); + xbr = vld1q_u16(buf + buf_stride + 1); + + xbr = vaddq_u16(xbr, xbl); + xtr = vaddq_u16(xtr, xtl); + xbr = vaddq_u16(xbr, xtr); + xtl = vshlq_n_u16(xbr, 2); + xbr = vaddq_u16(xtl, xbr); + + xb = vaddq_u16(xb, xt); + xb0 = vshlq_n_u16(xb, 1); + xb = vshlq_n_u16(xb, 2); + xb = vaddq_u16(xb, xb0); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(xbr)), vmovl_u16(vget_low_u16(xb)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(xbr)), vmovl_u16(vget_high_u16(xb)))); +} + +static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) { + int32x4_t xl, x, xr; + int32x4_t fives, sixes, fives_plus_sixes; + + xl = vld1q_s32(buf - 1); + x = vld1q_s32(buf); + xr = vld1q_s32(buf + 1); + fives = vaddq_s32(xl, xr); + sixes = x; + fives_plus_sixes = vaddq_s32(fives, sixes); + + return vaddq_s32( + vaddq_s32(vshlq_n_s32(fives_plus_sixes, 2), fives_plus_sixes), sixes); +} + +static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0, + int32x4_t *a1) { + uint16x8_t xl, x, xr; + uint16x8_t x0; + + xl = vld1q_u16(buf - 1); + x = vld1q_u16(buf); + xr = vld1q_u16(buf + 1); + xl = vaddq_u16(xl, xr); + x0 = 
vshlq_n_u16(xl, 2); + xl = vaddq_u16(xl, x0); + + x0 = vshlq_n_u16(x, 1); + x = vshlq_n_u16(x, 2); + x = vaddq_u16(x, x0); + + *a0 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_low_u16(xl)), vmovl_u16(vget_low_u16(x)))); + *a1 = vreinterpretq_s32_u32( + vaddq_u32(vmovl_u16(vget_high_u16(xl)), vmovl_u16(vget_high_u16(x)))); +} + +static void final_filter_fast_internal(uint16_t *A, int32_t *B, + const int buf_stride, int16_t *src, + const int src_stride, int32_t *dst, + const int dst_stride, const int width, + const int height) { + int16x8_t s0; + int32_t *B_tmp, *dst_ptr; + uint16_t *A_tmp; + int16_t *src_ptr; + int32x4_t a_res0, a_res1, b_res0, b_res1; + int w, h, count = 0; + assert(SGRPROJ_SGR_BITS == 8); + assert(SGRPROJ_RST_BITS == 4); + + A_tmp = A; + B_tmp = B; + src_ptr = src; + dst_ptr = dst; + h = height; + do { + A_tmp = (A + count * buf_stride); + B_tmp = (B + count * buf_stride); + src_ptr = (src + count * src_stride); + dst_ptr = (dst + count * dst_stride); + w = width; + if (!(count & 1)) { + do { + s0 = vld1q_s16(src_ptr); + cross_sum_fast_even_row_inp16(A_tmp, buf_stride, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_fast_even_row(B_tmp, buf_stride); + b_res1 = cross_sum_fast_even_row(B_tmp + 4, buf_stride); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } else { + do { + s0 = vld1q_s16(src_ptr); + cross_sum_fast_odd_row_inp16(A_tmp, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_fast_odd_row(B_tmp); + b_res1 = cross_sum_fast_odd_row(B_tmp + 4); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS); + + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + } + count++; + h -= 1; + } while (h > 0); +} + +void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride, + int16_t *src, const int src_stride, int32_t *dst, + const int dst_stride, const int width, + const int height) { + int16x8_t s0; + int32_t *B_tmp, *dst_ptr; + uint16_t *A_tmp; + int16_t *src_ptr; + int32x4_t a_res0, a_res1, b_res0, b_res1; + int w, h, count = 0; + + assert(SGRPROJ_SGR_BITS == 8); + assert(SGRPROJ_RST_BITS == 4); + h = height; + + do { + A_tmp = (A + count * buf_stride); + B_tmp = (B + count * buf_stride); + src_ptr = (src + count * src_stride); + dst_ptr = (dst + count * dst_stride); + w = width; + do { + s0 = vld1q_s16(src_ptr); + cross_sum_inp_u16(A_tmp, buf_stride, &a_res0, &a_res1); + a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0); + a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1); + + b_res0 = cross_sum_inp_s32(B_tmp, buf_stride); + b_res1 = cross_sum_inp_s32(B_tmp + 4, buf_stride); + a_res0 = vaddq_s32(a_res0, b_res0); + a_res1 = vaddq_s32(a_res1, b_res1); + + a_res0 = + vrshrq_n_s32(a_res0, 
SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + a_res1 = + vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS); + vst1q_s32(dst_ptr, a_res0); + vst1q_s32(dst_ptr + 4, a_res1); + + A_tmp += 8; + B_tmp += 8; + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 0); + count++; + h -= 1; + } while (h > 0); +} + +static INLINE void restoration_fast_internal(uint16_t *dgd16, int width, + int height, int dgd_stride, + int32_t *dst, int dst_stride, + int bit_depth, int sgr_params_idx, + int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + const int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + uint16_t A16_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *square_sum_buf = A_; + int32_t *sum_buf = B_; + uint16_t *tmp16_buf = A16_; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + assert(radius_idx == 0); + assert(r == 2); + + // input(dgd16) is 16bit. + // sum of pixels 1st stage output will be in 16bit(tmp16_buf). End output is + // kept in 32bit [sum_buf]. sum of squares output is kept in 32bit + // buffer(square_sum_buf). + boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT - + SGRPROJ_BORDER_HORZ), + dgd_stride, (int16_t *)tmp16_buf, sum_buf, square_sum_buf, buf_stride, + width_ext, height_ext); + + square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + tmp16_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Calculation of a, b. a output is in 16bit tmp_buf which is in range of + // [1, 256] for all bit depths. b output is kept in 32bit buffer. 
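+  //
+  // A rough scalar sketch of what the calc_ab_fast_* kernels vectorize
+  // (variable names below are illustrative, not part of this file): for each
+  // pixel, with n = (2 * r + 1) * (2 * r + 1) window samples,
+  //
+  //   p = (n * sum_sq >= sum * sum) ? n * sum_sq - sum * sum : 0;
+  //   a = AOMMIN(ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS), 255);
+  //   a = av1_x_by_xplus1[a];  // in [1, 256] afterwards
+  //   b = ROUND_POWER_OF_TWO((SGRPROJ_SGR - a) * sum * av1_one_by_x[n - 1],
+  //                          SGRPROJ_RECIP_BITS);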
+ +#if CONFIG_AV1_HIGHBITDEPTH + if (bit_depth > 8) { + calc_ab_fast_internal_hbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, + bit_depth, r, params->s[radius_idx], 2); + } else { + calc_ab_fast_internal_lbd( + (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r, + params->s[radius_idx], 2); + } +#else + (void)bit_depth; + calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1), + (tmp16_buf - buf_stride - 1), + (sum_buf - buf_stride - 1), buf_stride * 2, + width + 2, height + 2, r, params->s[radius_idx], 2); +#endif + final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16, + dgd_stride, dst, dst_stride, width, height); +} + +static INLINE void restoration_internal(uint16_t *dgd16, int width, int height, + int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + uint16_t A16_[RESTORATION_PROC_UNIT_PELS]; + uint16_t B16_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *square_sum_buf = A_; + uint16_t *sum_buf = B16_; + uint16_t *A16 = A16_; + int32_t *B = B_; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + assert(radius_idx == 1); + assert(r == 1); + + // input(dgd16) is 16bit. + // sum of pixels output will be in 16bit(sum_buf). + // sum of squares output is kept in 32bit buffer(square_sum_buf). + boxsum1((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT - + SGRPROJ_BORDER_HORZ), + dgd_stride, sum_buf, square_sum_buf, buf_stride, width_ext, + height_ext); + + square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + +#if CONFIG_AV1_HIGHBITDEPTH + // Calculation of a, b. a output is in 16bit tmp_buf which is in range of + // [1, 256] for all bit depths. b output is kept in 32bit buffer. 
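+  // The arithmetic is the same scalar recurrence sketched in
+  // restoration_fast_internal() above; this r == 1 path only differs in
+  // using n = 9 and keeping the first-stage pixel sums in a 16-bit buffer.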
+ if (bit_depth > 8) { + calc_ab_internal_hbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, + height + 2, bit_depth, r, params->s[radius_idx], 1); + } else { + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, + height + 2, r, params->s[radius_idx], 1); + } +#else + (void)bit_depth; + calc_ab_internal_lbd((square_sum_buf - buf_stride - 1), + (A16 - buf_stride - 1), (sum_buf - buf_stride - 1), + (B - buf_stride - 1), buf_stride, width + 2, height + 2, + r, params->s[radius_idx], 1); +#endif + final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst, + dst_stride, width, height); +} + +static INLINE void src_convert_u8_to_u16(const uint8_t *src, + const int src_stride, uint16_t *dst, + const int dst_stride, const int width, + const int height) { + const uint8_t *src_ptr; + uint16_t *dst_ptr; + int h, w, count = 0; + + uint8x8_t t1, t2, t3, t4; + uint16x8_t s1, s2, s3, s4; + h = height; + do { + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + w = width; + if (w >= 7) { + do { + load_u8_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4); + s1 = vmovl_u8(t1); + s2 = vmovl_u8(t2); + s3 = vmovl_u8(t3); + s4 = vmovl_u8(t4); + store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4); + + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 7); + } + + for (int y = 0; y < w; y++) { + dst_ptr[y] = src_ptr[y]; + dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride]; + dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride]; + dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride]; + } + count++; + h -= 4; + } while (h > 3); + + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + for (int x = 0; x < h; x++) { + for (int y = 0; y < width; y++) { + dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride]; + } + } + + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. + for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride, + uint16_t *dst, const int dst_stride, + int width, int height) { + const uint16_t *src_ptr; + uint16_t *dst_ptr; + int h, w, count = 0; + uint16x8_t s1, s2, s3, s4; + + h = height; + do { + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + w = width; + do { + load_u16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4); + store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4); + src_ptr += 8; + dst_ptr += 8; + w -= 8; + } while (w > 7); + + for (int y = 0; y < w; y++) { + dst_ptr[y] = src_ptr[y]; + dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride]; + dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride]; + dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride]; + } + count++; + h -= 4; + } while (h > 3); + + src_ptr = src + (count << 2) * src_stride; + dst_ptr = dst + (count << 2) * dst_stride; + + for (int x = 0; x < h; x++) { + memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride), + sizeof(uint16_t) * width); + } + // memset uninitialized rows of src buffer as they are needed for the + // boxsum filter calculation. 
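+  // A plausible reading of the five-row clear below: boxsum accumulates over
+  // a (2 * r + 1)-row window and the 8-lane vector loads can read a full
+  // vector past the last valid row, so rows [height, height + 5) are zeroed
+  // to keep those trailing reads deterministic.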
+ for (int x = height; x < height + 5; x++) + memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst)); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height, + int stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + assert(!(params->r[0] == 0 && params->r[1] == 0)); + + uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; + const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ; + uint16_t *dgd16 = + dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + const int dgd_stride = stride; + +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); + src_convert_hbd_copy( + dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } else { + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); +#endif + + if (params->r[0] > 0) + restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, + flt_stride, bit_depth, sgr_params_idx, 0); + if (params->r[1] > 0) + restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride, + bit_depth, sgr_params_idx, 1); + return 0; +} + +void av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS]; + const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ; + uint16_t *dgd16 = + dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + const int dgd_stride = stride; + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + + assert(!(params->r[0] == 0 && params->r[1] == 0)); + +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8); + src_convert_hbd_copy( + dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } else { + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, + dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); + } +#else + (void)highbd; + src_convert_u8_to_u16( + dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride, + dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ, + dgd16_stride, width_ext, height_ext); +#endif + if (params->r[0] > 0) + 
restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width,
+                              bit_depth, eps, 0);
+  if (params->r[1] > 0)
+    restoration_internal(dgd16, width, height, dgd16_stride, flt1, width,
+                         bit_depth, eps, 1);
+
+  av1_decode_xq(xqd, xq, params);
+
+  {
+    int16_t *src_ptr;
+    uint8_t *dst_ptr;
+    uint16_t *dst16_ptr;
+    int16x4_t d0, d4;
+    int16x8_t r0, s0;
+    uint16x8_t r4;
+    int32x4_t u0, u4, v0, v4, f00, f10;
+    uint8x8_t t0;
+    int count = 0, w = width, h = height, rc = 0;
+
+    const int32x4_t xq0_vec = vdupq_n_s32(xq[0]);
+    const int32x4_t xq1_vec = vdupq_n_s32(xq[1]);
+    const int16x8_t zero = vdupq_n_s16(0);
+    const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1);
+    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
+    dst_ptr = dst8;
+    src_ptr = (int16_t *)dgd16;
+    do {
+      w = width;
+      count = 0;
+      dst_ptr = dst8 + rc * dst_stride;
+      dst16_ptr = dst16 + rc * dst_stride;
+      do {
+        s0 = vld1q_s16(src_ptr + count);
+
+        u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS);
+        u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS);
+
+        v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS);
+        v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS);
+
+        if (params->r[0] > 0) {
+          f00 = vld1q_s32(flt0 + count);
+          f10 = vld1q_s32(flt0 + count + 4);
+
+          f00 = vsubq_s32(f00, u0);
+          f10 = vsubq_s32(f10, u4);
+
+          v0 = vmlaq_s32(v0, xq0_vec, f00);
+          v4 = vmlaq_s32(v4, xq0_vec, f10);
+        }
+
+        if (params->r[1] > 0) {
+          f00 = vld1q_s32(flt1 + count);
+          f10 = vld1q_s32(flt1 + count + 4);
+
+          f00 = vsubq_s32(f00, u0);
+          f10 = vsubq_s32(f10, u4);
+
+          v0 = vmlaq_s32(v0, xq1_vec, f00);
+          v4 = vmlaq_s32(v4, xq1_vec, f10);
+        }
+
+        d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+        d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+        r0 = vcombine_s16(d0, d4);
+
+        r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+        if (highbd) {
+          r4 = vminq_u16(r4, max);
+          vst1q_u16(dst16_ptr, r4);
+        } else {
+          t0 = vqmovn_u16(r4);
+          vst1_u8(dst_ptr, t0);
+        }
+#else
+        (void)max;
+        t0 = vqmovn_u16(r4);
+        vst1_u8(dst_ptr, t0);
+#endif
+        w -= 8;
+        count += 8;
+        dst_ptr += 8;
+        dst16_ptr += 8;
+      } while (w > 0);
+
+      src_ptr += dgd16_stride;
+      flt1 += width;
+      flt0 += width;
+      rc++;
+      h--;
+    } while (h > 0);
+  }
+}
diff --git a/libs/libaom/src/av1/common/arm/transpose_neon.h b/libs/libaom/src/av1/common/arm/transpose_neon.h
new file mode 100644
index 000000000..91d89b43f
--- /dev/null
+++ b/libs/libaom/src/av1/common/arm/transpose_neon.h
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
+                                    uint8x8_t *a6, uint8x8_t *a7) {
+  // Swap 8 bit elements.
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56 + // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57 + // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76 + // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77 + + const uint8x16x2_t b0 = + vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5)); + const uint8x16x2_t b1 = + vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74 + // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76 + // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75 + // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77 + + const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]), + vreinterpretq_u16_u8(b1.val[0])); + const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]), + vreinterpretq_u16_u8(b1.val[1])); + + // Unzip 32 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]), + vreinterpretq_u32_u16(c1.val[0])); + const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), + vreinterpretq_u32_u16(c1.val[1])); + + *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); + *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); + *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); + *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); + *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); + *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); + *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); +} + +static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3) { + // Swap 8 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const uint8x8x2_t b0 = vtrn_u8(*a0, *a1); + const uint8x8x2_t b1 = vtrn_u8(*a2, *a3); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const uint16x4x2_t c0 = + vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0])); + const uint16x4x2_t c1 = + vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1])); + + *a0 = vreinterpret_u8_u16(c0.val[0]); + *a1 = vreinterpret_u8_u16(c1.val[0]); + *a2 = vreinterpret_u8_u16(c0.val[1]); + *a3 = vreinterpret_u8_u16(c1.val[1]); +} + +static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const uint16x4x2_t b0 = + vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1)); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), + vreinterpret_u32_u16(b0.val[1])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + +static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, const uint8x8_t a4, + const uint8x8_t a5, const uint8x8_t a6, + const uint8x8_t a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 XX XX XX XX + // a1: 10 11 12 13 XX XX XX XX + // a2: 20 21 22 23 XX XX XX XX + // a3; 30 31 32 33 XX XX XX XX + // a4: 40 41 42 43 XX XX XX XX + // a5: 50 51 52 53 XX XX XX XX + // a6: 60 61 62 63 XX XX XX XX + // a7: 70 71 72 73 XX XX XX XX + // to: + // b0.val[0]: 00 01 02 03 40 41 42 43 + // b1.val[0]: 10 11 12 13 50 51 52 53 + // b2.val[0]: 20 21 22 23 60 61 62 63 + // b3.val[0]: 30 31 32 33 70 71 72 73 + + const uint32x2x2_t b0 = + vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4)); + const uint32x2x2_t b1 = + vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5)); + const uint32x2x2_t b2 = + vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6)); + const uint32x2x2_t b3 = + vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 01 20 21 40 41 60 61 + // c0.val[1]: 02 03 22 23 42 43 62 63 + // c1.val[0]: 10 11 30 31 50 51 70 71 + // c1.val[1]: 12 13 32 33 52 53 72 73 + + const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]), + vreinterpret_u16_u32(b2.val[0])); + const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]), + vreinterpret_u16_u32(b3.val[0])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 01 11 21 31 41 51 61 71 + // d1.val[0]: 02 12 22 32 42 52 62 72 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0])); + const uint8x8x2_t d1 = + vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; + *a2 = d1.val[0]; + *a3 = d1.val[1]; +} + +static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1, + uint16x4_t *a2, uint16x4_t *a3, + uint16x4_t *a4, uint16x4_t *a5, + uint16x4_t *a6, uint16x4_t *a7, + uint16x8_t *o0, uint16x8_t *o1, + uint16x8_t *o2, uint16x8_t *o3) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + uint16x4x2_t b0 = vtrn_u16(*a0, *a1); + uint16x4x2_t b1 = vtrn_u16(*a2, *a3); + uint16x4x2_t b2 = vtrn_u16(*a4, *a5); + uint16x4x2_t b3 = vtrn_u16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]), + vreinterpret_u32_u16(b1.val[0])); + uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]), + vreinterpret_u32_u16(b1.val[1])); + uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]), + vreinterpret_u32_u16(b3.val[0])); + uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]), + vreinterpret_u32_u16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // o0: 00 10 20 30 40 50 60 70 + // o1: 01 11 21 31 41 51 61 71 + // o2: 02 12 22 32 42 52 62 72 + // o3: 03 13 23 33 43 53 63 73 + + *o0 = vcombine_u16(vreinterpret_u16_u32(c0.val[0]), + vreinterpret_u16_u32(c2.val[0])); + *o1 = vcombine_u16(vreinterpret_u16_u32(c1.val[0]), + vreinterpret_u16_u32(c3.val[0])); + *o2 = vcombine_u16(vreinterpret_u16_u32(c0.val[1]), + vreinterpret_u16_u32(c2.val[1])); + *o3 = vcombine_u16(vreinterpret_u16_u32(c1.val[1]), + vreinterpret_u16_u32(c3.val[1])); +} + +static INLINE void transpose_s16_4x8(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, int16x4_t *a3, + int16x4_t *a4, int16x4_t *a5, + int16x4_t *a6, int16x4_t *a7, + int16x8_t *o0, int16x8_t *o1, + int16x8_t *o2, int16x8_t *o3) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + int16x4x2_t b0 = vtrn_s16(*a0, *a1); + int16x4x2_t b1 = vtrn_s16(*a2, *a3); + int16x4x2_t b2 = vtrn_s16(*a4, *a5); + int16x4x2_t b3 = vtrn_s16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), + vreinterpret_s32_s16(b3.val[0])); + int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), + vreinterpret_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // o0: 00 10 20 30 40 50 60 70 + // o1: 01 11 21 31 41 51 61 71 + // o2: 02 12 22 32 42 52 62 72 + // o3: 03 13 23 33 43 53 63 73 + + *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), + vreinterpret_s16_s32(c2.val[0])); + *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), + vreinterpret_s16_s32(c3.val[0])); + *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), + vreinterpret_s16_s32(c2.val[1])); + *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), + vreinterpret_s16_s32(c3.val[1])); +} + +static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, + uint16x8_t *a2, uint16x8_t *a3, + uint16x8_t *a4, uint16x8_t *a5, + uint16x8_t *a6, uint16x8_t *a7) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); + const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); + const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5); + const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), + vreinterpretq_u32_u16(b3.val[0])); + const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), + vreinterpretq_u32_u16(b3.val[1])); + + *a0 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[0])), + vget_low_u16(vreinterpretq_u16_u32(c2.val[0]))); + *a4 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[0])), + vget_high_u16(vreinterpretq_u16_u32(c2.val[0]))); + + *a2 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[1])), + vget_low_u16(vreinterpretq_u16_u32(c2.val[1]))); + *a6 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[1])), + vget_high_u16(vreinterpretq_u16_u32(c2.val[1]))); + + *a1 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[0])), + vget_low_u16(vreinterpretq_u16_u32(c3.val[0]))); + *a5 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[0])), + vget_high_u16(vreinterpretq_u16_u32(c3.val[0]))); + + *a3 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[1])), + vget_low_u16(vreinterpretq_u16_u32(c3.val[1]))); + *a7 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[1])), + vget_high_u16(vreinterpretq_u16_u32(c3.val[1]))); +} + +static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); + const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); + const int16x8x2_t b2 = vtrnq_s16(*a4, *a5); + const int16x8x2_t b3 = vtrnq_s16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + *a0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[0])), + vget_low_s16(vreinterpretq_s16_s32(c2.val[0]))); + *a4 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[0])), + vget_high_s16(vreinterpretq_s16_s32(c2.val[0]))); + + *a2 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[1])), + vget_low_s16(vreinterpretq_s16_s32(c2.val[1]))); + *a6 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[1])), + vget_high_s16(vreinterpretq_s16_s32(c2.val[1]))); + + *a1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[0])), + vget_low_s16(vreinterpretq_s16_s32(c3.val[0]))); + *a5 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[0])), + vget_high_s16(vreinterpretq_s16_s32(c3.val[0]))); + + *a3 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[1])), + vget_low_s16(vreinterpretq_s16_s32(c3.val[1]))); + *a7 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[1])), + vget_high_s16(vreinterpretq_s16_s32(c3.val[1]))); +} + +static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { + int16x8x2_t b0; + b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), + vreinterpret_s16_s32(vget_low_s32(a1))); + b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), + vreinterpret_s16_s32(vget_high_s32(a1))); + return b0; +} + +static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1)); + const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3)); + const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5)); + const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7)); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + *out = d0.val[0]; + *(out + 1) = d1.val[0]; + *(out + 2) = d2.val[0]; + *(out + 3) = d3.val[0]; + *(out + 4) = d0.val[1]; + *(out + 5) = d1.val[1]; + *(out + 6) = d2.val[1]; + *(out + 7) = d3.val[1]; +} + +static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, int16x4_t *a3) { + // Swap 16 bit elements. 
Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // to:
+  // b0.val[0]: 00 10 02 12
+  // b0.val[1]: 01 11 03 13
+  // b1.val[0]: 20 30 22 32
+  // b1.val[1]: 21 31 23 33
+
+  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
+  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 02 12 22 32
+  // c1.val[0]: 01 11 21 31
+  // c1.val[1]: 03 13 23 33
+
+  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+                                  vreinterpret_s32_s16(b1.val[0]));
+  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+                                  vreinterpret_s32_s16(b1.val[1]));
+
+  *a0 = vreinterpret_s16_s32(c0.val[0]);
+  *a1 = vreinterpret_s16_s32(c1.val[0]);
+  *a2 = vreinterpret_s16_s32(c0.val[1]);
+  *a3 = vreinterpret_s16_s32(c1.val[1]);
+}
+
+static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+  int32x4x2_t b0;
+  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+  return b0;
+}
+
+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+                                     int32x4_t *a2, int32x4_t *a3) {
+  // Swap 32 bit elements. Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // to:
+  // b0.val[0]: 00 10 02 12
+  // b0.val[1]: 01 11 03 13
+  // b1.val[0]: 20 30 22 32
+  // b1.val[1]: 21 31 23 33
+
+  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+
+  // Swap 64 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30
+  // c0.val[1]: 02 12 22 32
+  // c1.val[0]: 01 11 21 31
+  // c1.val[1]: 03 13 23 33
+
+  const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+  const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+
+  *a0 = c0.val[0];
+  *a1 = c1.val[0];
+  *a2 = c0.val[1];
+  *a3 = c1.val[1];
+}
+
+#endif  // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
diff --git a/libs/libaom/src/av1/common/arm/warp_plane_neon.c b/libs/libaom/src/av1/common/arm/warp_plane_neon.c
new file mode 100644
index 000000000..c10a34fcd
--- /dev/null
+++ b/libs/libaom/src/av1/common/arm/warp_plane_neon.c
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <math.h>
+#include <memory.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
+   * Each coefficient is stored in 8 bits instead of 16 bits
+   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
+
+     This is done in order to avoid overflow: Since the tap with the largest
+     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+     convolve functions.
+
+     Instead, we use the summation order
+     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+ The rearrangement of coefficients in this table is so that we can get the + coefficients into the correct order more quickly. +*/ +/* clang-format off */ +DECLARE_ALIGNED(8, static const int8_t, + filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { +#if WARPEDPIXEL_PREC_BITS == 6 + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, + { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, + { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, + { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, + { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, + { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, + { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, + { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, + { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, + { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, + { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, + { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, + { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, + { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0}, + { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, + { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, + { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, + { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, + {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, + {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, + {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, + {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, + {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, + {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, + {-2, -21, 74, 8, 8, 84, -21, 
-2}, {-2, -22, 77, 8, 8, 82, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, + {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, + {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, + {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, + {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, + {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, + {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, + {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, + { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, + { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, + { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, + { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, + { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, + { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, + { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, + { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, + { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, + { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, + { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4}, + { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, + { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, + { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, + { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, + { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, + { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, + // dummy (replicate row index 191) + { 0, 0, 2, -1, 0, 0, 127, 0}, + +#else + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0}, + { 
2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1}, + // dummy (replicate row index 95) + { 0, 0, 4, -3, 0, -1, 127, 1}, +#endif // WARPEDPIXEL_PREC_BITS == 6 +}; +/* clang-format on */ + +static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0, + uint8x8_t src_1, int16x4_t *res) { + int16x8_t coeff_0, coeff_1; + int16x8_t pix_0, pix_1; + + coeff_0 = vcombine_s16(vreinterpret_s16_s32(x0.val[0]), + 
vreinterpret_s16_s32(x1.val[0])); + coeff_1 = vcombine_s16(vreinterpret_s16_s32(x0.val[1]), + vreinterpret_s16_s32(x1.val[1])); + + pix_0 = vreinterpretq_s16_u16(vmovl_u8(src_0)); + pix_0 = vmulq_s16(coeff_0, pix_0); + + pix_1 = vreinterpretq_s16_u16(vmovl_u8(src_1)); + pix_0 = vmlaq_s16(pix_0, coeff_1, pix_1); + + *res = vpadd_s16(vget_low_s16(pix_0), vget_high_s16(pix_0)); +} + +static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2, + uint8x16_t src_3, uint8x16_t src_4, + int16x8_t *tmp_dst, int sx, int alpha, + int k, const int offset_bits_horiz, + const int reduce_bits_horiz) { + const uint8x16_t mask = { 255, 0, 255, 0, 255, 0, 255, 0, + 255, 0, 255, 0, 255, 0, 255, 0 }; + const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz)); + const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz); + + int16x8_t f0, f1, f2, f3, f4, f5, f6, f7; + int32x2x2_t b0, b1; + uint8x8_t src_1_low, src_2_low, src_3_low, src_4_low, src_5_low, src_6_low; + int32x4_t tmp_res_low, tmp_res_high; + uint16x8_t res; + int16x4_t res_0246_even, res_0246_odd, res_1357_even, res_1357_odd; + + uint8x16_t tmp_0 = vandq_u8(src_1, mask); + uint8x16_t tmp_1 = vandq_u8(src_2, mask); + uint8x16_t tmp_2 = vandq_u8(src_3, mask); + uint8x16_t tmp_3 = vandq_u8(src_4, mask); + + tmp_2 = vextq_u8(tmp_0, tmp_0, 1); + tmp_3 = vextq_u8(tmp_1, tmp_1, 1); + + src_1 = vaddq_u8(tmp_0, tmp_2); + src_2 = vaddq_u8(tmp_1, tmp_3); + + src_1_low = vget_low_u8(src_1); + src_2_low = vget_low_u8(src_2); + src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4)); + src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4)); + src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2)); + src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6)); + + // Loading the 8 filter taps + f0 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS])); + f1 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS])); + f2 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS])); + f3 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS])); + f4 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS])); + f5 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS])); + f6 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS])); + f7 = vmovl_s8( + vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS])); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)), + vreinterpret_s32_s16(vget_low_s16(f2))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)), + vreinterpret_s32_s16(vget_low_s16(f6))); + convolve(b0, b1, src_1_low, src_3_low, &res_0246_even); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)), + vreinterpret_s32_s16(vget_low_s16(f3))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)), + vreinterpret_s32_s16(vget_low_s16(f7))); + convolve(b0, b1, src_2_low, src_4_low, &res_0246_odd); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)), + vreinterpret_s32_s16(vget_high_s16(f2))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)), + vreinterpret_s32_s16(vget_high_s16(f6))); + convolve(b0, b1, src_2_low, src_4_low, &res_1357_even); + + b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)), + vreinterpret_s32_s16(vget_high_s16(f3))); + b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)), + vreinterpret_s32_s16(vget_high_s16(f7))); + convolve(b0, b1, src_5_low, src_6_low, &res_1357_odd); + + tmp_res_low = 
vaddl_s16(res_0246_even, res_1357_even); + tmp_res_high = vaddl_s16(res_0246_odd, res_1357_odd); + + tmp_res_low = vaddq_s32(tmp_res_low, add_const); + tmp_res_high = vaddq_s32(tmp_res_high, add_const); + + res = vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high)); + res = vqrshlq_u16(res, shift); + + tmp_dst[k + 7] = vreinterpretq_s16_u16(res); +} + +static INLINE void vertical_filter_neon(const int16x8_t *src, + int32x4_t *res_low, int32x4_t *res_high, + int sy, int gamma) { + int16x4_t src_0, src_1, fltr_0, fltr_1; + int32x4_t res_0, res_1; + int32x2_t res_0_im, res_1_im; + int32x4_t res_even, res_odd, im_res_0, im_res_1; + + int16x8_t f0, f1, f2, f3, f4, f5, f6, f7; + int16x8x2_t b0, b1, b2, b3; + int32x4x2_t c0, c1, c2, c3; + int32x4x2_t d0, d1, d2, d3; + + b0 = vtrnq_s16(src[0], src[1]); + b1 = vtrnq_s16(src[2], src[3]); + b2 = vtrnq_s16(src[4], src[5]); + b3 = vtrnq_s16(src[6], src[7]); + + c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b0.val[1])); + c1 = vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]), + vreinterpretq_s32_s16(b1.val[1])); + c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b2.val[1])); + c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]), + vreinterpretq_s32_s16(b3.val[1])); + + f0 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + f1 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + f2 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + f3 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + f4 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + f5 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + f6 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + f7 = vld1q_s16((int16_t *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2)); + d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6)); + d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3)); + d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7)); + + // row:0,1 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:0,1,2,3 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:0,1 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:0,1,2,3 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c1.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:0,1,2,3 even_col:0,2,4,6 + im_res_0 = vcombine_s32(res_0_im, res_1_im); + + // row:4,5 even_col:0,2 + src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:4,5,6,7 even_col:0,2 + src_0 = 
vget_low_s16(vreinterpretq_s16_s32(c3.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:4,5 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:4,5,6,7 even_col:4,6 + src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:4,5,6,7 even_col:0,2,4,6 + im_res_1 = vcombine_s32(res_0_im, res_1_im); + + // row:0-7 even_col:0,2,4,6 + res_even = vaddq_s32(im_res_0, im_res_1); + + // row:0,1 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:0,1,2,3 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0])); + fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:0,1 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:0,1,2,3 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1])); + fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:0,1,2,3 odd_col:1,3,5,7 + im_res_0 = vcombine_s32(res_0_im, res_1_im); + + // row:4,5 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0])); + res_0 = vmull_s16(src_0, fltr_0); + + // row:4,5,6,7 odd_col:1,3 + src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0])); + fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1])); + res_0 = vmlal_s16(res_0, src_0, fltr_0); + res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); + + // row:4,5 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0])); + res_1 = vmull_s16(src_1, fltr_1); + + // row:4,5,6,7 odd_col:5,7 + src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1])); + fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1])); + res_1 = vmlal_s16(res_1, src_1, fltr_1); + res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1)); + + // row:4,5,6,7 odd_col:1,3,5,7 + im_res_1 = vcombine_s32(res_0_im, res_1_im); + + // row:0-7 odd_col:1,3,5,7 + res_odd = vaddq_s32(im_res_0, im_res_1); + + // reordering as 0 1 2 3 | 4 5 6 7 + c0 = vtrnq_s32(res_even, res_odd); + + // Final store + *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1])); + *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1])); +} + +void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int16x8_t tmp[15]; + const int bd = 8; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const int32x4_t fwd = 
vdupq_n_s32((int32_t)w0); + const int32x4_t bwd = vdupq_n_s32((int32_t)w1); + const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd)); + + int limit = 0; + uint8x16_t vec_dup, mask_val; + int32x4_t res_lo, res_hi; + int16x8_t result_final; + uint8x16_t src_1, src_2, src_3, src_4; + uint8x16_t indx_vec = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + }; + uint8x16_t cmp_vec; + + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert); + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int16x4_t res_sub_const = + vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)))); + int k; + + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + for (int i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + // horizontal + if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int16_t dup_val = + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)); + + tmp[k + 7] = vdupq_n_s16(dup_val); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz)); + tmp[k + 7] = vdupq_n_s16(dup_val); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + const uint8_t *src = ref + iy * stride + ix4 - 7; + src_1 = vld1q_u8(src); + + if (out_of_boundary_left >= 0) { + 
limit = out_of_boundary_left + 1; + cmp_vec = vdupq_n_u8(out_of_boundary_left); + vec_dup = vdupq_n_u8(*(src + limit)); + mask_val = vcleq_u8(indx_vec, cmp_vec); + src_1 = vbslq_u8(mask_val, vec_dup, src_1); + } + if (out_of_boundary_right >= 0) { + limit = 15 - (out_of_boundary_right + 1); + cmp_vec = vdupq_n_u8(15 - out_of_boundary_right); + vec_dup = vdupq_n_u8(*(src + limit)); + mask_val = vcgeq_u8(indx_vec, cmp_vec); + src_1 = vbslq_u8(mask_val, vec_dup, src_1); + } + src_2 = vextq_u8(src_1, src_1, 1); + src_3 = vextq_u8(src_2, src_2, 1); + src_4 = vextq_u8(src_3, src_3, 1); + + horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } else { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + const uint8_t *src = ref + iy * stride + ix4 - 7; + src_1 = vld1q_u8(src); + src_2 = vextq_u8(src_1, src_1, 1); + src_3 = vextq_u8(src_2, src_2, 1); + src_4 = vextq_u8(src_3, src_3, 1); + + horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } + + // vertical + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + const int16x8_t *v_src = tmp + (k + 4); + + vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma); + + res_lo = vaddq_s32(res_lo, add_const_vert); + res_hi = vaddq_s32(res_hi, add_const_vert); + + if (conv_params->is_compound) { + uint16_t *const p = + (uint16_t *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + + res_lo = vrshlq_s32(res_lo, shift_vert); + if (conv_params->do_average) { + uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j]; + uint16x4_t tmp16_lo = vld1_u16(p); + int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo)); + int16x4_t tmp16_low; + if (conv_params->use_dist_wtd_comp_avg) { + res_lo = vmulq_s32(res_lo, bwd); + tmp32_lo = vmulq_s32(tmp32_lo, fwd); + tmp32_lo = vaddq_s32(tmp32_lo, res_lo); + tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS); + } else { + tmp32_lo = vaddq_s32(tmp32_lo, res_lo); + tmp16_low = vshrn_n_s32(tmp32_lo, 1); + } + int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const); + res_low = vqrshl_s16(res_low, round_bits_vec); + int16x8_t final_res_low = vcombine_s16(res_low, res_low); + uint8x8_t res_8_low = vqmovun_s16(final_res_low); + + vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0); + } else { + uint16x4_t res_u16_low = vqmovun_s32(res_lo); + vst1_u16(p, res_u16_low); + } + if (p_width > 4) { + uint16_t *const p4 = + (uint16_t *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + + res_hi = vrshlq_s32(res_hi, shift_vert); + if (conv_params->do_average) { + uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4]; + + uint16x4_t tmp16_hi = vld1_u16(p4); + int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi)); + int16x4_t tmp16_high; + if (conv_params->use_dist_wtd_comp_avg) { + res_hi = vmulq_s32(res_hi, bwd); + tmp32_hi = vmulq_s32(tmp32_hi, fwd); + tmp32_hi = vaddq_s32(tmp32_hi, res_hi); + tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS); + } else { + tmp32_hi = vaddq_s32(tmp32_hi, res_hi); + tmp16_high = vshrn_n_s32(tmp32_hi, 1); + } + int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const); + res_high = vqrshl_s16(res_high, round_bits_vec); + int16x8_t final_res_high = vcombine_s16(res_high, res_high); + uint8x8_t res_8_high = vqmovun_s16(final_res_high); 
+
+              vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high),
+                            0);
+            } else {
+              uint16x4_t res_u16_high = vqmovun_s32(res_hi);
+              vst1_u16(p4, res_u16_high);
+            }
+          }
+        } else {
+          res_lo = vrshlq_s32(res_lo, shift_vert);
+          res_hi = vrshlq_s32(res_hi, shift_vert);
+
+          result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi));
+          result_final = vsubq_s16(result_final, sub_constant);
+
+          uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+          uint8x8_t val = vqmovun_s16(result_final);
+
+          if (p_width == 4) {
+            vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
+          } else {
+            vst1_u8(p, val);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/libs/libaom/src/av1/common/arm/wiener_convolve_neon.c b/libs/libaom/src/av1/common/arm/wiener_convolve_neon.c
new file mode 100644
index 000000000..a9bb5bcf0
--- /dev/null
+++ b/libs/libaom/src/av1/common/arm/wiener_convolve_neon.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+/* Wiener filter 2D
+   Apply the horizontal filter and store the result in a temporary buffer.
+   When applying the vertical filter, overwrite the original pixel values.
+ */
+void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h,
+                                      const ConvolveParams *conv_params) {
+  uint16_t *d_tmp;
+  uint8_t *d;
+  const uint8_t *src_ptr, *s_tmp;
+  uint16_t *dst_ptr;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  int width, height;
+  const int bd = 8;
+  const int intermediate_height = h + SUBPEL_TAPS - 1;
+  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+  int16_t filter_x_tmp[7], filter_y_tmp[7];
+
+  DECLARE_ALIGNED(16, uint16_t,
+                  temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(!(w % 8));
+
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+
+  assert(filter_x[7] == 0);
+  assert(filter_y[7] == 0);
+
+  /* Assumption: the output of the horizontal filtering fits in 15 bits, i.e.
+ ((bd) + 1 + FILTER_BITS - conv_params->round_0) <= 15 + 16 - conv_params->round_0 <= 15 -- (conv_params->round_0) >= 1 + */ + assert((conv_params->round_0) >= 1); + + memcpy(&filter_x_tmp[0], filter_x, sizeof(*filter_x) * FILTER_BITS); + memcpy(&filter_y_tmp[0], filter_y, sizeof(*filter_y) * FILTER_BITS); + + filter_x_tmp[3] += (1 << FILTER_BITS); + filter_y_tmp[3] += (1 << FILTER_BITS); + + s_tmp = src - center_tap * src_stride - center_tap; + dst_ptr = temp; + src_ptr = s_tmp; + height = intermediate_height; + + /* if height is a multiple of 8 */ + if (!(h & 7)) { + int16x8_t res0, res1, res2, res3; + uint16x8_t res4; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; +#if defined(__aarch64__) + uint16x8_t res5, res6, res7, res8, res9, res10, res11; + uint8x8_t t8, t9, t10, t11, t12, t13, t14; + + do { + const uint8_t *s; + + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + __builtin_prefetch(src_ptr + 4 * src_stride); + __builtin_prefetch(src_ptr + 5 * src_stride); + __builtin_prefetch(src_ptr + 6 * src_stride); + __builtin_prefetch(src_ptr + 7 * src_stride); + + load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + + s = src_ptr + 7; + d_tmp = dst_ptr; + width = w; + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + __builtin_prefetch(dst_ptr + 4 * dst_stride); + __builtin_prefetch(dst_ptr + 5 * dst_stride); + __builtin_prefetch(dst_ptr + 6 * dst_stride); + __builtin_prefetch(dst_ptr + 7 * dst_stride); + + do { + load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); + transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t6)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t1, t7)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t6)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t3, t5)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t4)); + res5 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t2, t8)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t3, t7)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t6)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t5)); + res6 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t3, t9)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t4, t8)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t5, t7)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t6)); + res7 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t4, t10)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t5, t9)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t6, t8)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t7)); + res8 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t5, t11)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t6, t10)); + 
res2 = vreinterpretq_s16_u16(vaddl_u8(t7, t9)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t8)); + res9 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t6, t12)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t7, t11)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t8, t10)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t9)); + res10 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + res0 = vreinterpretq_s16_u16(vaddl_u8(t7, t13)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t8, t12)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t9, t11)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t10)); + res11 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10, + &res11); + store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9, + res10, res11); + + t0 = t8; + t1 = t9; + t2 = t10; + t3 = t11; + t4 = t12; + t5 = t13; + t6 = t14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += 8 * src_stride; + dst_ptr += 8 * MAX_SB_SIZE; + height -= 8; + } while (height > 0); +#else + uint8x8_t temp_0; + + do { + const uint8_t *s; + + __builtin_prefetch(src_ptr); + + t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + s = src_ptr + 8; + d_tmp = dst_ptr; + width = w; + + __builtin_prefetch(dst_ptr); + + do { + t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + temp_0 = t0; + t0 = t7; + + t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6)); + res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); + res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + res3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, + bd, conv_params->round_0); + + vst1q_u16(d_tmp, res4); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + src_ptr += src_stride; + dst_ptr += MAX_SB_SIZE; + height--; + } while (height > 0); +#endif + } else { + /*if height is a multiple of 4*/ + const uint8_t *s; + int16x8_t tt0, tt1, tt2, tt3; + uint16x8_t d0; + uint8x8_t t0, t1, t2, t3; + +#if defined(__aarch64__) + uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7; + uint16x8_t d1, d2, d3; + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int16x4_t s11, s12, s13, s14; + do { + __builtin_prefetch(src_ptr + 0 * src_stride); + __builtin_prefetch(src_ptr + 1 * src_stride); + __builtin_prefetch(src_ptr + 2 * src_stride); + __builtin_prefetch(src_ptr + 3 * src_stride); + + load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); /*8x4*/ + transpose_u8_8x4(&t0, &t1, &t2, + &t3); /*first 8 pixels of 4 rows transposed-- 4x8*/ + + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + s0 = vget_low_s16(tt0); /*pa0 pb0 pc0 pd0 -- pixel_a0*/ + s1 = vget_low_s16(tt1); /*pa1 pb1 pc1 pd1 */ + s2 = vget_low_s16(tt2); /*pa2 pb2 pc2 pd2 */ + s3 = vget_low_s16(tt3); /*pa3 pb3 pc3 pd3 */ + s4 = 
vget_high_s16(tt0); /*pa4 pb4 pc4 pd4 */ + s5 = vget_high_s16(tt1); /*pa5 pb5 pc5 pd5 */ + s6 = vget_high_s16(tt2); /*pa6 pb6 pc6 pd6 */ + + __builtin_prefetch(dst_ptr + 0 * dst_stride); + __builtin_prefetch(dst_ptr + 1 * dst_stride); + __builtin_prefetch(dst_ptr + 2 * dst_stride); + __builtin_prefetch(dst_ptr + 3 * dst_stride); + + s = src_ptr + 7; + d_tmp = dst_ptr; + width = w; + + do { + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); /*8x4*/ + transpose_u8_8x4(&t0, &t1, &t2, &t3); + + tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + + s7 = vget_low_s16(tt0); /*pa7 pb7 pc7 pd7 */ /*4x8*/ + s8 = vget_low_s16(tt1); /*pa8 pb8 pc8 pd8 */ + s9 = vget_low_s16(tt2); /*pa9 pb9 pc9 pd9 */ + s10 = vget_low_s16(tt3); /*pa10 pb10 pc10 pd10 */ + s11 = vget_high_s16(tt0); /*pa11 pb11 pc11 pd11 */ + s12 = vget_high_s16(tt1); /*pa12 pb12 pc12 pd12 */ + s13 = vget_high_s16(tt2); /*pa13 pb13 pc13 pd13 */ + s14 = vget_high_s16(tt3); /*pa14 pb14 pc14 pd14 */ + + res0 = wiener_convolve8_horiz_4x8( + s0, s1, s2, s3, s4, s5, s6, filter_x_tmp, bd, conv_params->round_0); + res1 = wiener_convolve8_horiz_4x8( + s1, s2, s3, s4, s5, s6, s7, filter_x_tmp, bd, conv_params->round_0); + res2 = wiener_convolve8_horiz_4x8( + s2, s3, s4, s5, s6, s7, s8, filter_x_tmp, bd, conv_params->round_0); + res3 = wiener_convolve8_horiz_4x8( + s3, s4, s5, s6, s7, s8, s9, filter_x_tmp, bd, conv_params->round_0); + res4 = + wiener_convolve8_horiz_4x8(s4, s5, s6, s7, s8, s9, s10, + filter_x_tmp, bd, conv_params->round_0); + res5 = + wiener_convolve8_horiz_4x8(s5, s6, s7, s8, s9, s10, s11, + filter_x_tmp, bd, conv_params->round_0); + res6 = + wiener_convolve8_horiz_4x8(s6, s7, s8, s9, s10, s11, s12, + filter_x_tmp, bd, conv_params->round_0); + res7 = + wiener_convolve8_horiz_4x8(s7, s8, s9, s10, s11, s12, s13, + filter_x_tmp, bd, conv_params->round_0); + + transpose_u16_4x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6, + &res7, &d0, &d1, &d2, &d3); + + store_u16_8x4(d_tmp, MAX_SB_SIZE, d0, d1, d2, d3); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + + src_ptr += 4 * src_stride; + dst_ptr += 4 * MAX_SB_SIZE; + height -= 4; + } while (height > 0); +#else + uint8x8_t temp_0, t4, t5, t6, t7; + + do { + __builtin_prefetch(src_ptr); + + t0 = vld1_u8(src_ptr); // a0 a1 a2 a3 a4 a5 a6 a7 + + __builtin_prefetch(dst_ptr); + + s = src_ptr + 8; + d_tmp = dst_ptr; + width = w; + + do { + t7 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15 + temp_0 = t0; + t0 = t7; + + t1 = vext_u8(temp_0, t7, 1); // a1 a2 a3 a4 a5 a6 a7 a8 + t2 = vext_u8(temp_0, t7, 2); // a2 a3 a4 a5 a6 a7 a8 a9 + t3 = vext_u8(temp_0, t7, 3); // a3 a4 a5 a6 a7 a8 a9 a10 + t4 = vext_u8(temp_0, t7, 4); // a4 a5 a6 a7 a8 a9 a10 a11 + t5 = vext_u8(temp_0, t7, 5); // a5 a6 a7 a8 a9 a10 a11 a12 + t6 = vext_u8(temp_0, t7, 6); // a6 a7 a8 a9 a10 a11 a12 a13 + t7 = vext_u8(temp_0, t7, 7); // a7 a8 a9 a10 a11 a12 a13 a14 + + tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6)); + tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5)); + tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4)); + tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd, + conv_params->round_0); + + vst1q_u16(d_tmp, d0); + + s += 8; + d_tmp += 8; + width -= 8; + } while (width > 0); + + src_ptr += src_stride; + dst_ptr += MAX_SB_SIZE; + height -= 1; 
+ } while (height > 0); +#endif + } + + { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + uint8x8_t t0; +#if defined(__aarch64__) + int16x8_t s8, s9, s10; + uint8x8_t t1, t2, t3; +#endif + int16_t *src_tmp_ptr, *s; + uint8_t *dst_tmp_ptr; + height = h; + width = w; + src_tmp_ptr = (int16_t *)temp; + dst_tmp_ptr = dst; + src_stride = MAX_SB_SIZE; + + do { + s = src_tmp_ptr; + s0 = vld1q_s16(s); + s += src_stride; + s1 = vld1q_s16(s); + s += src_stride; + s2 = vld1q_s16(s); + s += src_stride; + s3 = vld1q_s16(s); + s += src_stride; + s4 = vld1q_s16(s); + s += src_stride; + s5 = vld1q_s16(s); + s += src_stride; + s6 = vld1q_s16(s); + s += src_stride; + d = dst_tmp_ptr; + height = h; + +#if defined(__aarch64__) + do { + __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); + __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride); + __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride); + __builtin_prefetch(dst_tmp_ptr + 3 * dst_stride); + + s7 = vld1q_s16(s); + s += src_stride; + s8 = vld1q_s16(s); + s += src_stride; + s9 = vld1q_s16(s); + s += src_stride; + s10 = vld1q_s16(s); + s += src_stride; + + t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, + bd, conv_params->round_1); + t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp, + bd, conv_params->round_1); + t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp, + bd, conv_params->round_1); + t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp, + bd, conv_params->round_1); + + vst1_u8(d, t0); + d += dst_stride; + vst1_u8(d, t1); + d += dst_stride; + vst1_u8(d, t2); + d += dst_stride; + vst1_u8(d, t3); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 3); + + if (height != 0) { + __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); + __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride); + + do { + s7 = vld1q_s16(s); + s += src_stride; + + t0 = + wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, + filter_y_tmp, bd, conv_params->round_1); + vst1_u8(d, t0); + d += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; + } while (height > 0); + } + + src_tmp_ptr += 8; + dst_tmp_ptr += 8; + + w -= 8; + } while (w > 0); +#else + do { + __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride); + + s7 = vld1q_s16(s); + s += src_stride; + + t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, + bd, conv_params->round_1); + + vst1_u8(d, t0); + d += dst_stride; + + s0 = s1; + s1 = s2; + s2 = s3; + s3 = s4; + s4 = s5; + s5 = s6; + s6 = s7; + height -= 1; + } while (height > 0); + + src_tmp_ptr += 8; + dst_tmp_ptr += 8; + + w -= 8; + } while (w > 0); +#endif + } +} diff --git a/libs/libaom/src/av1/common/av1_common_int.h b/libs/libaom/src/av1/common/av1_common_int.h new file mode 100644 index 000000000..0403405e9 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_common_int.h @@ -0,0 +1,1557 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_AV1_COMMON_INT_H_ +#define AOM_AV1_COMMON_AV1_COMMON_INT_H_ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/internal/aom_codec_internal.h" +#include "aom_util/aom_thread.h" +#include "av1/common/alloccommon.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/enums.h" +#include "av1/common/frame_buffers.h" +#include "av1/common/mv.h" +#include "av1/common/quant_common.h" +#include "av1/common/restoration.h" +#include "av1/common/tile_common.h" +#include "av1/common/timing.h" +#include "av1/common/odintrin.h" +#include "av1/encoder/hash_motion.h" +#include "aom_dsp/grain_synthesis.h" +#include "aom_dsp/grain_table.h" +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__clang__) && defined(__has_warning) +#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough") +#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT +#endif +#elif defined(__GNUC__) && __GNUC__ >= 7 +#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT +#endif + +#ifndef AOM_FALLTHROUGH_INTENDED +#define AOM_FALLTHROUGH_INTENDED \ + do { \ + } while (0) +#endif + +#define CDEF_MAX_STRENGTHS 16 + +/* Constant values while waiting for the sequence header */ +#define FRAME_ID_LENGTH 15 +#define DELTA_FRAME_ID_LENGTH 14 + +#define FRAME_CONTEXTS (FRAME_BUFFERS + 1) +// Extra frame context which is always kept at default values +#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1) +#define PRIMARY_REF_BITS 3 +#define PRIMARY_REF_NONE 7 + +#define NUM_PING_PONG_BUFFERS 2 + +#define MAX_NUM_TEMPORAL_LAYERS 8 +#define MAX_NUM_SPATIAL_LAYERS 4 +/* clang-format off */ +// clang-format seems to think this is a pointer dereference and not a +// multiplication. +#define MAX_NUM_OPERATING_POINTS \ + (MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS) +/* clang-format on */ + +// TODO(jingning): Turning this on to set up transform coefficient +// processing timer. +#define TXCOEFF_TIMER 0 +#define TXCOEFF_COST_TIMER 0 + +enum { + SINGLE_REFERENCE = 0, + COMPOUND_REFERENCE = 1, + REFERENCE_MODE_SELECT = 2, + REFERENCE_MODES = 3, +} UENUM1BYTE(REFERENCE_MODE); + +enum { + /** + * Frame context updates are disabled + */ + REFRESH_FRAME_CONTEXT_DISABLED, + /** + * Update frame context to values resulting from backward probability + * updates based on entropy/counts in the decoded frame + */ + REFRESH_FRAME_CONTEXT_BACKWARD, +} UENUM1BYTE(REFRESH_FRAME_CONTEXT_MODE); + +#define MFMV_STACK_SIZE 3 +typedef struct { + int_mv mfmv0; + uint8_t ref_frame_offset; +} TPL_MV_REF; + +typedef struct { + int_mv mv; + MV_REFERENCE_FRAME ref_frame; +} MV_REF; + +typedef struct RefCntBuffer { + // For a RefCntBuffer, the following are reference-holding variables: + // - cm->ref_frame_map[] + // - cm->cur_frame + // - cm->scaled_ref_buf[] (encoder only) + // - pbi->output_frame_index[] (decoder only) + // With that definition, 'ref_count' is the number of reference-holding + // variables that are currently referencing this buffer. + // For example: + // - suppose this buffer is at index 'k' in the buffer pool, and + // - Total 'n' of the variables / array elements above have value 'k' (that + // is, they are pointing to buffer at index 'k'). + // Then, pool->frame_bufs[k].ref_count = n. 
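+  // A worked example (illustrative only): if cm->cur_frame and two entries
+  // of cm->ref_frame_map[] all point to frame_bufs[k], then
+  // pool->frame_bufs[k].ref_count == 3, and get_free_fb() further below will
+  // not hand buffer 'k' out again until that count drops back to 0.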
+  int ref_count;
+
+  unsigned int order_hint;
+  unsigned int ref_order_hints[INTER_REFS_PER_FRAME];
+
+  // These variables are used only in the encoder. They compare absolute
+  // display order hints to compute relative distances, overcoming the
+  // limitation of get_relative_dist(), which returns an incorrect distance
+  // when a very old frame is used as a reference.
+  unsigned int display_order_hint;
+  unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME];
+
+  MV_REF *mvs;
+  uint8_t *seg_map;
+  struct segmentation seg;
+  int mi_rows;
+  int mi_cols;
+  // Width and height give the size of the buffer (before any upscaling, unlike
+  // the sizes that can be derived from the buf structure)
+  int width;
+  int height;
+  WarpedMotionParams global_motion[REF_FRAMES];
+  int showable_frame;  // frame can be used as a show-existing frame in future
+  uint8_t film_grain_params_present;
+  aom_film_grain_t film_grain_params;
+  aom_codec_frame_buffer_t raw_frame_buffer;
+  YV12_BUFFER_CONFIG buf;
+  FRAME_TYPE frame_type;
+
+  // This is only used in the encoder but needs to be indexed per ref frame
+  // so it's extremely convenient to keep it here.
+  int interp_filter_selected[SWITCHABLE];
+
+  // Inter-frame reference frame delta for loop filter
+  int8_t ref_deltas[REF_FRAMES];
+
+  // 0 = ZERO_MV, MV
+  int8_t mode_deltas[MAX_MODE_LF_DELTAS];
+
+  FRAME_CONTEXT frame_context;
+} RefCntBuffer;
+
+typedef struct BufferPool {
+// Protect BufferPool from being accessed by several FrameWorkers at
+// the same time during frame parallel decode.
+// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+// TODO(wtc): Remove this. See
+// https://chromium-review.googlesource.com/c/webm/libvpx/+/560630.
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t pool_mutex;
+#endif
+
+  // Private data associated with the frame buffer callbacks.
+  void *cb_priv;
+
+  aom_get_frame_buffer_cb_fn_t get_fb_cb;
+  aom_release_frame_buffer_cb_fn_t release_fb_cb;
+
+  RefCntBuffer frame_bufs[FRAME_BUFFERS];
+
+  // Frame buffers allocated internally by the codec.
+  InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+
+typedef struct {
+  int cdef_damping;
+  int nb_cdef_strengths;
+  int cdef_strengths[CDEF_MAX_STRENGTHS];
+  int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
+  int cdef_bits;
+} CdefInfo;
+
+typedef struct {
+  int delta_q_present_flag;
+  // Resolution of delta quant
+  int delta_q_res;
+  int delta_lf_present_flag;
+  // Resolution of delta lf level
+  int delta_lf_res;
+  // This flag selects the number of loop filter level deltas:
+  // 0: use a single delta for y_vertical, y_horizontal, u, and v
+  // 1: use separate deltas for each filter level
+  int delta_lf_multi;
+} DeltaQInfo;
+
+typedef struct {
+  int enable_order_hint;        // 0 - disable order hint, and related tools
+  int order_hint_bits_minus_1;  // dist_wtd_comp, ref_frame_mvs,
+                                // frame_sign_bias
+                                // if 0, enable_dist_wtd_comp and
+                                // enable_ref_frame_mvs must be set to 0.
+  int enable_dist_wtd_comp;     // 0 - disable dist-wtd compound modes
+                                // 1 - enable it
+  int enable_ref_frame_mvs;     // 0 - disable ref frame mvs
+                                // 1 - enable it
+} OrderHintInfo;
+
+// Sequence header structure.
+// Note: All syntax elements of sequence_header_obu that need to be
+// bit-identical across multiple sequence headers must be part of this struct,
+// so that consistency can be checked by the are_seq_headers_consistent()
+// function. The one exception is the last member 'op_params', which is
+// ignored by are_seq_headers_consistent().
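+// (Illustrative sketch only, assuming the member layout described above: the
+// consistency check can then be a single memcmp() over everything that
+// precedes 'op_params', e.g.
+//   !memcmp(a, b, offsetof(SequenceHeader, op_params))
+// which is why keeping 'op_params' as the last member matters.)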
+typedef struct SequenceHeader {
+  int num_bits_width;
+  int num_bits_height;
+  int max_frame_width;
+  int max_frame_height;
+  uint8_t frame_id_numbers_present_flag;
+  int frame_id_length;
+  int delta_frame_id_length;
+  BLOCK_SIZE sb_size;  // Size of the superblock used for this frame
+  int mib_size;        // Size of the superblock in units of MI blocks
+  int mib_size_log2;   // Log 2 of above.
+
+  OrderHintInfo order_hint_info;
+
+  uint8_t force_screen_content_tools;  // 0 - force off
+                                       // 1 - force on
+                                       // 2 - adaptive
+  uint8_t still_picture;               // Video is a single frame still picture
+  uint8_t reduced_still_picture_hdr;   // Use reduced header for still picture
+  uint8_t force_integer_mv;            // 0 - Don't force. MV can use subpel
+                                       // 1 - force to integer
+                                       // 2 - adaptive
+  uint8_t enable_filter_intra;         // enables/disables filterintra
+  uint8_t enable_intra_edge_filter;    // enables/disables edge upsampling
+  uint8_t enable_interintra_compound;  // enables/disables interintra_compound
+  uint8_t enable_masked_compound;      // enables/disables masked compound
+  uint8_t enable_dual_filter;          // 0 - disable dual interpolation filter
+                                       // 1 - enable vert/horz filter selection
+  uint8_t enable_warped_motion;        // 0 - disable warp for the sequence
+                                       // 1 - enable warp for the sequence
+  uint8_t enable_superres;             // 0 - Disable superres for the sequence
+                                       //     and no frame level superres flag
+                                       // 1 - Enable superres for the sequence
+                                       //     enable per-frame superres flag
+  uint8_t enable_cdef;                 // To turn on/off CDEF
+  uint8_t enable_restoration;          // To turn on/off loop restoration
+  BITSTREAM_PROFILE profile;
+
+  // Color config.
+  aom_bit_depth_t bit_depth;  // AOM_BITS_8 in profile 0 or 1,
+                              // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
+  uint8_t use_highbitdepth;   // If true, we need to use 16bit frame buffers.
+  uint8_t monochrome;         // Monochrome video
+  aom_color_primaries_t color_primaries;
+  aom_transfer_characteristics_t transfer_characteristics;
+  aom_matrix_coefficients_t matrix_coefficients;
+  int color_range;
+  int subsampling_x;  // Chroma subsampling for x
+  int subsampling_y;  // Chroma subsampling for y
+  aom_chroma_sample_position_t chroma_sample_position;
+  uint8_t separate_uv_delta_q;
+  uint8_t film_grain_params_present;
+
+  // Operating point info.
+  int operating_points_cnt_minus_1;
+  int operating_point_idc[MAX_NUM_OPERATING_POINTS];
+  int timing_info_present;
+  aom_timing_info_t timing_info;
+  uint8_t decoder_model_info_present_flag;
+  aom_dec_model_info_t decoder_model_info;
+  uint8_t display_model_info_present_flag;
+  AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS];
+  uint8_t tier[MAX_NUM_OPERATING_POINTS];  // seq_tier in spec. One bit: 0 or 1.
+
+  // IMPORTANT: the op_params member must be at the end of the struct so that
+  // are_seq_headers_consistent() can be implemented with a memcmp() call.
+  // TODO(urvang): We probably don't need the +1 here.
+  aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
+} SequenceHeader;
+
+typedef struct {
+  int skip_mode_allowed;
+  int skip_mode_flag;
+  int ref_frame_idx_0;
+  int ref_frame_idx_1;
+} SkipModeInfo;
+
+typedef struct {
+  FRAME_TYPE frame_type;
+  REFERENCE_MODE reference_mode;
+
+  unsigned int order_hint;
+  unsigned int display_order_hint;
+  unsigned int frame_number;
+  SkipModeInfo skip_mode_info;
+  int refresh_frame_flags;  // Which ref frames are overwritten by this frame
+  int frame_refs_short_signaling;
+} CurrentFrame;
+
+// Struct containing some frame level features.
+typedef struct {
+  bool disable_cdf_update;
+  bool allow_high_precision_mv;
+  bool cur_frame_force_integer_mv;  // 0: the default in AOM; 1: only integer
+  bool allow_screen_content_tools;
+  bool allow_intrabc;
+  bool allow_warped_motion;
+  // Whether to use previous frames' motion vectors for prediction.
+  bool allow_ref_frame_mvs;
+  bool coded_lossless;  // frame is fully lossless at the coded resolution.
+  bool all_lossless;    // frame is fully lossless at the upscaled resolution.
+  bool reduced_tx_set_used;
+  bool error_resilient_mode;
+  bool switchable_motion_mode;
+  TX_MODE tx_mode;
+  InterpFilter interp_filter;
+  int primary_ref_frame;
+  int byte_alignment;
+  // Flag signaling how frame contexts should be updated at the end of
+  // a frame decode
+  REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
+} FeatureFlags;
+
+// Struct containing params related to tiles.
+typedef struct CommonTileParams {
+  int cols;           // number of tile columns that frame is divided into
+  int rows;           // number of tile rows that frame is divided into
+  int max_width_sb;   // maximum tile width in superblock units.
+  int max_height_sb;  // maximum tile height in superblock units.
+  // Min width of non-rightmost tile in MI units. Only valid if cols > 1.
+  int min_inner_width;
+
+  // If true, tiles are uniformly spaced with power-of-two number of rows and
+  // columns.
+  // If false, tiles have explicitly configured widths and heights.
+  int uniform_spacing;
+
+  // Following members are only valid when uniform_spacing == 1
+  int log2_cols;  // log2 of 'cols'.
+  int log2_rows;  // log2 of 'rows'.
+  int width;      // tile width in MI units
+  int height;     // tile height in MI units
+  // End of members that are only valid when uniform_spacing == 1
+
+  // Min num of tile columns possible based on 'max_width_sb' and frame width.
+  int min_log2_cols;
+  // Min num of tile rows possible based on 'max_height_sb' and frame height.
+  int min_log2_rows;
+  // Max num of tile columns possible based on frame width.
+  int max_log2_cols;
+  // Max num of tile rows possible based on frame height.
+  int max_log2_rows;
+  // log2 of min number of tiles (same as min_log2_cols + min_log2_rows).
+  int min_log2;
+  // col_start_sb[i] is the start position of tile column i in superblock
+  // units. Valid for 0 <= i <= cols.
+  int col_start_sb[MAX_TILE_COLS + 1];
+  // row_start_sb[i] is the start position of tile row i in superblock units.
+  // Valid for 0 <= i <= rows.
+  int row_start_sb[MAX_TILE_ROWS + 1];
+  // If true, we are using large scale tile mode.
+  unsigned int large_scale;
+  // Only relevant when large_scale == 1.
+  // If true, the independent decoding of a single tile or a section of a
+  // frame is allowed.
+  unsigned int single_tile_decoding;
+} CommonTileParams;
+
+// Struct containing params related to MB_MODE_INFO arrays and related info.
+typedef struct CommonModeInfoParams CommonModeInfoParams;
+struct CommonModeInfoParams {
+  // Number of rows/cols in the frame in 16 pixel units.
+  // This is computed from frame width and height aligned to a multiple of 8.
+  int mb_rows;
+  int mb_cols;
+  // Total MBs = mb_rows * mb_cols.
+  int MBs;
+
+  // Number of rows/cols in the frame in 4 pixel (MB_MODE_INFO) units.
+  // This is computed from frame width and height aligned to a multiple of 8.
+  int mi_rows;
+  int mi_cols;
+
+  // An array of MB_MODE_INFO structs for every 'mi_alloc_bsize' sized block
+  // in the frame.
+  // Note: This array should be treated like scratch memory, and should NOT
+  // be accessed directly, in most cases. Please use the 'mi_grid_base' array
+  // instead.
+  MB_MODE_INFO *mi_alloc;
+  // Number of allocated elements in 'mi_alloc'.
+  int mi_alloc_size;
+  // Stride for 'mi_alloc' array.
+  int mi_alloc_stride;
+  // The minimum block size that each element in 'mi_alloc' can correspond to.
+  // For decoder, this is always BLOCK_4X4.
+  // For encoder, this is currently set to BLOCK_4X4 for resolution < 4k,
+  // and BLOCK_8X8 for resolution >= 4k.
+  BLOCK_SIZE mi_alloc_bsize;
+
+  // Grid of pointers to 4x4 MB_MODE_INFO structs allocated in 'mi_alloc'.
+  // It's possible that:
+  // - Multiple pointers in the grid point to the same element in 'mi_alloc'
+  //   (for example, for all 4x4 blocks that belong to the same partition
+  //   block).
+  // - Some pointers can be NULL (for example, for blocks outside the visible
+  //   area).
+  MB_MODE_INFO **mi_grid_base;
+  // Number of allocated elements in 'mi_grid_base' (and 'tx_type_map' also).
+  int mi_grid_size;
+  // Stride for 'mi_grid_base' (and 'tx_type_map' also).
+  int mi_stride;
+
+  // An array of tx types for each 4x4 block in the frame.
+  // Number of allocated elements is same as 'mi_grid_size', and stride is
+  // same as 'mi_stride'. So, indexing into 'tx_type_map' is same as that of
+  // 'mi_grid_base'.
+  TX_TYPE *tx_type_map;
+
+  // Function pointers to allow separate logic for encoder and decoder.
+  void (*free_mi)(struct CommonModeInfoParams *mi_params);
+  void (*setup_mi)(struct CommonModeInfoParams *mi_params);
+  void (*set_mb_mi)(struct CommonModeInfoParams *mi_params, int width,
+                    int height);
+};
+
+// Parameters related to quantization at the frame level.
+typedef struct CommonQuantParams CommonQuantParams;
+struct CommonQuantParams {
+  // Base qindex of the frame in the range 0 to 255.
+  int base_qindex;
+
+  // Delta of qindex (from base_qindex) for Y plane DC coefficient.
+  // Note: y_ac_delta_q is implicitly 0.
+  int y_dc_delta_q;
+
+  // Deltas of qindex (from base_qindex) for U and V plane DC coefficients.
+  int u_dc_delta_q;
+  int v_dc_delta_q;
+
+  // Deltas of qindex (from base_qindex) for U and V plane AC coefficients.
+  // The V plane values are the same as the U plane values if
+  // cm->seq_params.separate_uv_delta_q == 0.
+  int u_ac_delta_q;
+  int v_ac_delta_q;
+
+  // Note: The qindex per superblock may have a delta from the qindex obtained
+  // at frame level from parameters above, based on 'cm->delta_q_info'.
+
+  // The dequantizers below are true dequantizers used only in the
+  // dequantization process. They have the same coefficient
+  // shift/scale as TX.
+  int16_t y_dequant_QTX[MAX_SEGMENTS][2];
+  int16_t u_dequant_QTX[MAX_SEGMENTS][2];
+  int16_t v_dequant_QTX[MAX_SEGMENTS][2];
+
+  // Global quant matrix tables
+  const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+  const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+
+  // Local quant matrix tables for each frame
+  const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+
+  // Flag indicating whether quantization matrices are being used:
+  // - If true, qmatrix_level_y, qmatrix_level_u and qmatrix_level_v indicate
+  //   the level indices to be used to access appropriate global quant matrix
+  //   tables.
+  // - If false, we implicitly use level index 'NUM_QM_LEVELS - 1'.
+  bool using_qmatrix;
+  int qmatrix_level_y;
+  int qmatrix_level_u;
+  int qmatrix_level_v;
+};
+
+// Context used for transmitting various symbols in the bitstream.
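+// (For example, given the per-member notes below, above_contexts.partition[1][31]
+// would hold the partition context for mi column 31 of tile row 1.)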
+typedef struct CommonContexts CommonContexts;
+struct CommonContexts {
+  // Context used by 'FRAME_CONTEXT.partition_cdf' to transmit partition type.
+  // partition[i][j] is the context for ith tile row, jth mi_col.
+  PARTITION_CONTEXT **partition;
+
+  // Context used to derive context for multiple symbols:
+  // - 'TXB_CTX.txb_skip_ctx' used by 'FRAME_CONTEXT.txb_skip_cdf' to transmit
+  //   the skip_txfm flag.
+  // - 'TXB_CTX.dc_sign_ctx' used by 'FRAME_CONTEXT.dc_sign_cdf' to transmit
+  //   sign.
+  // entropy[i][j][k] is the context for ith plane, jth tile row, kth mi_col.
+  ENTROPY_CONTEXT **entropy[MAX_MB_PLANE];
+
+  // Context used to derive context for 'FRAME_CONTEXT.txfm_partition_cdf' to
+  // transmit 'is_split' flag to indicate if this transform block should be
+  // split into smaller sub-blocks.
+  // txfm[i][j] is the context for ith tile row, jth mi_col.
+  TXFM_CONTEXT **txfm;
+
+  // Dimensions that were used to allocate the arrays above.
+  // If these dimensions change, the arrays may have to be re-allocated.
+  int num_planes;     // Corresponds to av1_num_planes(cm)
+  int num_tile_rows;  // Corresponds to cm->tiles.rows
+  int num_mi_cols;    // Corresponds to cm->mi_params.mi_cols
+};
+
+typedef struct AV1Common {
+  // Information about the current frame that is being coded.
+  CurrentFrame current_frame;
+  // Code and details about current error status.
+  struct aom_internal_error_info error;
+
+  // AV1 allows two types of frame scaling operations:
+  // (1) Frame super-resolution: allows coding a frame at a lower resolution,
+  // which is then normatively upscaled and restored after decoding -- inside
+  // the coding loop.
+  // (2) Frame resize: allows coding a frame at a lower/higher resolution,
+  // which is then non-normatively scaled at rendering time -- outside the
+  // coding loop.
+  // Hence, the need for 3 types of dimensions.
+
+  // Coded frame dimensions.
+  int width;
+  int height;
+
+  // Rendered frame dimensions, after applying both super-resolution and
+  // resize to the coded frame.
+  // Different from coded dimensions if super-resolution and/or resize are
+  // being used for this frame.
+  int render_width;
+  int render_height;
+
+  // Frame dimensions after applying super-resolution to the coded frame (if
+  // present), but before applying resize.
+  // Larger than the coded dimensions if super-resolution is being used for
+  // this frame.
+  // Different from rendered dimensions if resize is being used for this
+  // frame.
+  int superres_upscaled_width;
+  int superres_upscaled_height;
+
+  // The denominator of the superres scale used by this frame.
+  // Note: The numerator is fixed to be SCALE_NUMERATOR.
+  uint8_t superres_scale_denominator;
+
+  // If true, buffer removal times are present.
+  bool buffer_removal_time_present;
+  // buffer_removal_times[op_num] specifies the frame removal time in units of
+  // DecCT clock ticks counted from the removal time of the last random access
+  // point for operating point op_num.
+  // TODO(urvang): We probably don't need the +1 here.
+  uint32_t buffer_removal_times[MAX_NUM_OPERATING_POINTS + 1];
+  // Presentation time of the frame in clock ticks DispCT counted from the
+  // removal time of the last random access point for the operating point that
+  // is being decoded.
+  uint32_t frame_presentation_time;
+
+  // Buffer where previous frame is stored.
+  RefCntBuffer *prev_frame;
+
+  // Buffer into which the current frame will be stored and other related info.
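+  // (Per the RefCntBuffer notes above, cm->cur_frame itself holds a
+  // reference, so it contributes to the buffer's ref_count; see
+  // assign_cur_frame_new_fb() later in this header.)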
+ // TODO(hkuang): Combine this with cur_buf in macroblockd. + RefCntBuffer *cur_frame; + + // For encoder, we have a two-level mapping from reference frame type to the + // corresponding buffer in the buffer pool: + // * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ... + // EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1) + // * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to + // the reference counted buffer structure RefCntBuffer, taken from the buffer + // pool cm->buffer_pool->frame_bufs. + // + // LAST_FRAME, ..., EXTREF_FRAME + // | | + // v v + // remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1] + // | | + // v v + // ref_frame_map[], ..., ref_frame_map[] + // + // Note: INTRA_FRAME always refers to the current frame, so there's no need to + // have a remapped index for the same. + int remapped_ref_idx[REF_FRAMES]; + + // Scale of the current frame with respect to itself. + // This is currently used for intra block copy, which behaves like an inter + // prediction mode, where the reference frame is the current frame itself. + struct scale_factors sf_identity; + + // Scale factors of the reference frame with respect to the current frame. + // This is required for generating inter prediction and will be non-identity + // for a reference frame, if it has different dimensions than the coded + // dimensions of the current frame. + struct scale_factors ref_scale_factors[REF_FRAMES]; + + // For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to + // the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + // For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps + // remapped reference index 'j' (that is, original reference type 'i') to + // a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + RefCntBuffer *ref_frame_map[REF_FRAMES]; + + // If true, this frame is actually shown after decoding. + // If false, this frame is coded in the bitstream, but not shown. It is only + // used as a reference for other frames coded later. + int show_frame; + + // If true, this frame can be used as a show-existing frame for other frames + // coded later. + // When 'show_frame' is true, this is always true for all non-keyframes. + // When 'show_frame' is false, this value is transmitted in the bitstream. + int showable_frame; + + // If true, show an existing frame coded before, instead of actually coding a + // frame. The existing frame comes from one of the existing reference buffers, + // as signaled in the bitstream. + int show_existing_frame; + + // Whether some features are allowed or not. + FeatureFlags features; + + // Params related to MB_MODE_INFO arrays and related info. + CommonModeInfoParams mi_params; + +#if CONFIG_ENTROPY_STATS + int coef_cdf_category; +#endif + // Quantization params. + CommonQuantParams quant_params; + + // Segmentation info for current frame. + struct segmentation seg; + + // Segmentation map for previous frame. + uint8_t *last_frame_seg_map; + + // Deblocking filter parameters. + loop_filter_info_n lf_info; + struct loopfilter lf; + + // Loop Restoration filter parameters. + RestorationInfo rst_info[MAX_MB_PLANE]; // Loop Restoration filter info. + int32_t *rst_tmpbuf; // Scratch buffer for self-guided restoration filter. + RestorationLineBuffers *rlbs; // Line buffers required by loop restoration. + YV12_BUFFER_CONFIG rst_frame; // Stores the output of loop restoration. 
+
+  // CDEF (Constrained Directional Enhancement Filter) parameters.
+  CdefInfo cdef_info;
+
+  // Parameters for film grain synthesis.
+  aom_film_grain_t film_grain_params;
+
+  // Parameters for delta quantization and delta loop filter level.
+  DeltaQInfo delta_q_info;
+
+  // Global motion parameters for each reference frame.
+  WarpedMotionParams global_motion[REF_FRAMES];
+
+  // Elements of the sequence header that are applicable to all the frames in
+  // the video.
+  SequenceHeader seq_params;
+
+  // Current CDFs of all the symbols for the current frame.
+  FRAME_CONTEXT *fc;
+  // Default CDFs used when features.primary_ref_frame = PRIMARY_REF_NONE
+  // (e.g. for a keyframe). These default CDFs are defined by the bitstream
+  // and copied from default CDF tables for each symbol.
+  FRAME_CONTEXT *default_frame_context;
+
+  // Parameters related to tiling.
+  CommonTileParams tiles;
+
+  // External BufferPool passed from outside.
+  BufferPool *buffer_pool;
+
+  // Above context buffers and their sizes.
+  // Note: above contexts are allocated in this struct, as their size is
+  // dependent on frame width, while left contexts are declared and allocated
+  // in the MACROBLOCKD struct, as they have a fixed size.
+  CommonContexts above_contexts;
+
+  // When cm->seq_params.frame_id_numbers_present_flag == 1, current and
+  // reference frame IDs are signaled in the bitstream.
+  int current_frame_id;
+  int ref_frame_id[REF_FRAMES];
+
+  // Motion vectors provided by motion field estimation.
+  // tpl_mvs[row * stride + col] stores MV for block at [mi_row, mi_col] where:
+  // mi_row = 2 * row,
+  // mi_col = 2 * col, and
+  // stride = cm->mi_params.mi_stride / 2
+  TPL_MV_REF *tpl_mvs;
+  // Allocated size of 'tpl_mvs' array. Refer to 'ensure_mv_buffer()' function.
+  int tpl_mvs_mem_size;
+  // ref_frame_sign_bias[k] is 1 if relative distance between reference 'k'
+  // and current frame is positive; and 0 otherwise.
+  int ref_frame_sign_bias[REF_FRAMES];
+  // ref_frame_side[k] is 1 if relative distance between reference 'k' and
+  // current frame is positive, -1 if relative distance is 0; and 0 otherwise.
+  // TODO(jingning): This can be combined with sign_bias later.
+  int8_t ref_frame_side[REF_FRAMES];
+
+  // Number of temporal layers: may be > 1 for SVC (scalable video coding).
+  unsigned int number_temporal_layers;
+  // Temporal layer ID of this frame
+  // (in the range 0 ... (number_temporal_layers - 1)).
+  int temporal_layer_id;
+
+  // Number of spatial layers: may be > 1 for SVC (scalable video coding).
+  unsigned int number_spatial_layers;
+  // Spatial layer ID of this frame
+  // (in the range 0 ... (number_spatial_layers - 1)).
+  int spatial_layer_id;
+
+#if TXCOEFF_TIMER
+  int64_t cum_txcoeff_timer;
+  int64_t txcoeff_timer;
+  int txb_count;
+#endif  // TXCOEFF_TIMER
+
+#if TXCOEFF_COST_TIMER
+  int64_t cum_txcoeff_cost_timer;
+  int64_t txcoeff_cost_timer;
+  int64_t txcoeff_cost_count;
+#endif  // TXCOEFF_COST_TIMER
+
+#if CONFIG_LPF_MASK
+  int is_decoding;
+#endif  // CONFIG_LPF_MASK
+} AV1_COMMON;
+
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
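+// A usage sketch (illustrative only, not enforced by the API): updates of
+// frame_bufs[].ref_count are bracketed by the pair below, e.g.
+//   lock_buffer_pool(pool);
+//   ++pool->frame_bufs[idx].ref_count;
+//   unlock_buffer_pool(pool);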
+static void lock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(&pool->pool_mutex);
+#else
+  (void)pool;
+#endif
+}
+
+static void unlock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(&pool->pool_mutex);
+#else
+  (void)pool;
+#endif
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) {
+  if (index < 0 || index >= REF_FRAMES) return NULL;
+  if (cm->ref_frame_map[index] == NULL) return NULL;
+  return &cm->ref_frame_map[index]->buf;
+}
+
+static INLINE int get_free_fb(AV1_COMMON *cm) {
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+  int i;
+
+  lock_buffer_pool(cm->buffer_pool);
+  for (i = 0; i < FRAME_BUFFERS; ++i)
+    if (frame_bufs[i].ref_count == 0) break;
+
+  if (i != FRAME_BUFFERS) {
+    if (frame_bufs[i].buf.use_external_reference_buffers) {
+      // If this frame buffer's y_buffer, u_buffer, and v_buffer point to
+      // external reference buffers, restore the buffer pointers to point to
+      // the internally allocated memory.
+      YV12_BUFFER_CONFIG *ybf = &frame_bufs[i].buf;
+      ybf->y_buffer = ybf->store_buf_adr[0];
+      ybf->u_buffer = ybf->store_buf_adr[1];
+      ybf->v_buffer = ybf->store_buf_adr[2];
+      ybf->use_external_reference_buffers = 0;
+    }
+
+    frame_bufs[i].ref_count = 1;
+  } else {
+    // We should never run out of free buffers. If this assertion fails, there
+    // is a reference leak.
+    assert(0 && "Ran out of free frame buffers. Likely a reference leak.");
+    // Reset i to be INVALID_IDX to indicate no free buffer found.
+    i = INVALID_IDX;
+  }
+
+  unlock_buffer_pool(cm->buffer_pool);
+  return i;
+}
+
+static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) {
+  // Release the previously-used frame buffer.
+  if (cm->cur_frame != NULL) {
+    --cm->cur_frame->ref_count;
+    cm->cur_frame = NULL;
+  }
+
+  // Assign a new frame buffer.
+  const int new_fb_idx = get_free_fb(cm);
+  if (new_fb_idx == INVALID_IDX) return NULL;
+
+  cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx];
+  cm->cur_frame->buf.buf_8bit_valid = 0;
+  av1_zero(cm->cur_frame->interp_filter_selected);
+  return cm->cur_frame;
+}
+
+// Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref
+// counts accordingly.
+static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr,
+                                         RefCntBuffer *rhs_ptr) {
+  RefCntBuffer *const old_ptr = *lhs_ptr;
+  if (old_ptr != NULL) {
+    assert(old_ptr->ref_count > 0);
+    // One less reference to the buffer at 'old_ptr', so decrease ref count.
+    --old_ptr->ref_count;
+  }
+
+  *lhs_ptr = rhs_ptr;
+  // One more reference to the buffer at 'rhs_ptr', so increase ref count.
+  ++rhs_ptr->ref_count;
+}
+
+static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
+  return cm->current_frame.frame_type == KEY_FRAME ||
+         cm->current_frame.frame_type == INTRA_ONLY_FRAME;
+}
+
+static INLINE int frame_is_sframe(const AV1_COMMON *cm) {
+  return cm->current_frame.frame_type == S_FRAME;
+}
+
+// These functions take a reference frame label between LAST_FRAME and
+// EXTREF_FRAME inclusive. Note that this is different from the indexing
+// previously used by the frame_refs[] array.
+static INLINE int get_ref_frame_map_idx(const AV1_COMMON *const cm,
+                                        const MV_REFERENCE_FRAME ref_frame) {
+  return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
+             ?
cm->remapped_ref_idx[ref_frame - LAST_FRAME] + : INVALID_IDX; +} + +static INLINE RefCntBuffer *get_ref_frame_buf( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Both const and non-const versions of this function are provided so that it +// can be used with a const AV1_COMMON if needed. +static INLINE const struct scale_factors *get_ref_scale_factors_const( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE struct scale_factors *get_ref_scale_factors( + AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE RefCntBuffer *get_primary_ref_frame_buf( + const AV1_COMMON *const cm) { + const int primary_ref_frame = cm->features.primary_ref_frame; + if (primary_ref_frame == PRIMARY_REF_NONE) return NULL; + const int map_idx = get_ref_frame_map_idx(cm, primary_ref_frame + 1); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Returns 1 if this frame might allow mvs from some reference frame. +static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) { + return !cm->features.error_resilient_mode && + cm->seq_params.order_hint_info.enable_ref_frame_mvs && + cm->seq_params.order_hint_info.enable_order_hint && + !frame_is_intra_only(cm); +} + +// Returns 1 if this frame might use warped_motion +static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) { + return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) && + cm->seq_params.enable_warped_motion; +} + +static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) { + const int buf_rows = buf->mi_rows; + const int buf_cols = buf->mi_cols; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + if (buf->mvs == NULL || buf_rows != mi_params->mi_rows || + buf_cols != mi_params->mi_cols) { + aom_free(buf->mvs); + buf->mi_rows = mi_params->mi_rows; + buf->mi_cols = mi_params->mi_cols; + CHECK_MEM_ERROR(cm, buf->mvs, + (MV_REF *)aom_calloc(((mi_params->mi_rows + 1) >> 1) * + ((mi_params->mi_cols + 1) >> 1), + sizeof(*buf->mvs))); + aom_free(buf->seg_map); + CHECK_MEM_ERROR( + cm, buf->seg_map, + (uint8_t *)aom_calloc(mi_params->mi_rows * mi_params->mi_cols, + sizeof(*buf->seg_map))); + } + + const int mem_size = + ((mi_params->mi_rows + MAX_MIB_SIZE) >> 1) * (mi_params->mi_stride >> 1); + int realloc = cm->tpl_mvs == NULL; + if (cm->tpl_mvs) realloc |= cm->tpl_mvs_mem_size < mem_size; + + if (realloc) { + aom_free(cm->tpl_mvs); + CHECK_MEM_ERROR(cm, cm->tpl_mvs, + (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs))); + cm->tpl_mvs_mem_size = mem_size; + } +} + +void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params); + +static INLINE int av1_num_planes(const AV1_COMMON *cm) { + return cm->seq_params.monochrome ? 
1 : MAX_MB_PLANE; +} + +static INLINE void av1_init_above_context(CommonContexts *above_contexts, + int num_planes, int tile_row, + MACROBLOCKD *xd) { + for (int i = 0; i < num_planes; ++i) { + xd->above_entropy_context[i] = above_contexts->entropy[i][tile_row]; + } + xd->above_partition_context = above_contexts->partition[tile_row]; + xd->above_txfm_context = above_contexts->txfm[tile_row]; +} + +static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd, + tran_low_t *dqcoeff) { + const int num_planes = av1_num_planes(cm); + const CommonQuantParams *const quant_params = &cm->quant_params; + + for (int i = 0; i < num_planes; ++i) { + xd->plane[i].dqcoeff = dqcoeff; + + if (xd->plane[i].plane_type == PLANE_TYPE_Y) { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->y_dequant_QTX, + sizeof(quant_params->y_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->y_iqmatrix, + sizeof(quant_params->y_iqmatrix)); + + } else { + if (i == AOM_PLANE_U) { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->u_dequant_QTX, + sizeof(quant_params->u_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->u_iqmatrix, + sizeof(quant_params->u_iqmatrix)); + } else { + memcpy(xd->plane[i].seg_dequant_QTX, quant_params->v_dequant_QTX, + sizeof(quant_params->v_dequant_QTX)); + memcpy(xd->plane[i].seg_iqmatrix, quant_params->v_iqmatrix, + sizeof(quant_params->v_iqmatrix)); + } + } + } + xd->mi_stride = cm->mi_params.mi_stride; + xd->error_info = &cm->error; + cfl_init(&xd->cfl, &cm->seq_params); +} + +static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col, + const int num_planes) { + int i; + int row_offset = mi_row; + int col_offset = mi_col; + for (i = 0; i < num_planes; ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + // Offset the buffer pointer + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) + row_offset = mi_row - 1; + if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) + col_offset = mi_col - 1; + int above_idx = col_offset; + int left_idx = row_offset & MAX_MIB_MASK; + pd->above_entropy_context = + &xd->above_entropy_context[i][above_idx >> pd->subsampling_x]; + pd->left_entropy_context = + &xd->left_entropy_context[i][left_idx >> pd->subsampling_y]; + } +} + +static INLINE int calc_mi_size(int len) { + // len is in mi units. Align to a multiple of SBs. + return ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2); +} + +static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, + const int num_planes) { + int i; + for (i = 0; i < num_planes; i++) { + xd->plane[i].width = (bw * MI_SIZE) >> xd->plane[i].subsampling_x; + xd->plane[i].height = (bh * MI_SIZE) >> xd->plane[i].subsampling_y; + + xd->plane[i].width = AOMMAX(xd->plane[i].width, 4); + xd->plane[i].height = AOMMAX(xd->plane[i].height, 4); + } +} + +static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile, + int mi_row, int bh, int mi_col, int bw, + int mi_rows, int mi_cols) { + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = GET_MV_SUBPEL((mi_rows - bh - mi_row) * MI_SIZE); + xd->mb_to_left_edge = -GET_MV_SUBPEL((mi_col * MI_SIZE)); + xd->mb_to_right_edge = GET_MV_SUBPEL((mi_cols - bw - mi_col) * MI_SIZE); + + xd->mi_row = mi_row; + xd->mi_col = mi_col; + + // Are edges available for intra prediction? 
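+  // (Above pixels exist only if this block is not in the first mi row of its
+  // tile; left pixels only if it is not in the first mi column. The chroma
+  // variants below additionally account for chroma subsampling.)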
+  xd->up_available = (mi_row > tile->mi_row_start);
+
+  const int ss_x = xd->plane[1].subsampling_x;
+  const int ss_y = xd->plane[1].subsampling_y;
+
+  xd->left_available = (mi_col > tile->mi_col_start);
+  xd->chroma_up_available = xd->up_available;
+  xd->chroma_left_available = xd->left_available;
+  if (ss_x && bw < mi_size_wide[BLOCK_8X8])
+    xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start;
+  if (ss_y && bh < mi_size_high[BLOCK_8X8])
+    xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start;
+  if (xd->up_available) {
+    xd->above_mbmi = xd->mi[-xd->mi_stride];
+  } else {
+    xd->above_mbmi = NULL;
+  }
+
+  if (xd->left_available) {
+    xd->left_mbmi = xd->mi[-1];
+  } else {
+    xd->left_mbmi = NULL;
+  }
+
+  const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) &&
+                         ((mi_col & 0x01) || !(bw & 0x01) || !ss_x);
+  xd->is_chroma_ref = chroma_ref;
+  if (chroma_ref) {
+    // To help calculate the "above" and "left" chroma blocks, note that the
+    // current block may cover multiple luma blocks (e.g., if partitioned into
+    // 4x4 luma blocks).
+    // First, find the top-left-most luma block covered by this chroma block.
+    MB_MODE_INFO **base_mi =
+        &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)];
+
+    // Then, we consider the luma region covered by the left or above 4x4
+    // chroma prediction. We want to point to the chroma reference block in
+    // that region, which is the bottom-right-most mi unit.
+    // This leads to the following offsets:
+    MB_MODE_INFO *chroma_above_mi =
+        xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL;
+    xd->chroma_above_mbmi = chroma_above_mi;
+
+    MB_MODE_INFO *chroma_left_mi =
+        xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL;
+    xd->chroma_left_mbmi = chroma_left_mi;
+  }
+
+  xd->height = bh;
+  xd->width = bw;
+  xd->is_sec_rect = 0;
+  if (xd->width < xd->height) {
+    // Only mark is_sec_rect as 1 for the last block.
+    // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
+    // For other partitions, it would be (0, 1).
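+    // Editor's note (worked example, assuming an 8x8 luma block split by
+    // PARTITION_VERT into two 4x8 halves, i.e. width 1 and height 2 in mi
+    // units): for an even mi_col c, the left half gives (c + 1) & 1 == 1 and
+    // is_sec_rect stays 0; the right half gives (c + 2) & 1 == 0 and
+    // is_sec_rect becomes 1 -- the (0, 1) pattern described above.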
+    if (!((mi_col + xd->width) & (xd->height - 1))) xd->is_sec_rect = 1;
+  }
+
+  if (xd->width > xd->height)
+    if (mi_row & (xd->width - 1)) xd->is_sec_rect = 1;
+}
+
+static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
+                                           const MB_MODE_INFO *above_mi,
+                                           const MB_MODE_INFO *left_mi) {
+  const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+  const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+  const int above_ctx = intra_mode_context[above];
+  const int left_ctx = intra_mode_context[left];
+  return tile_ctx->kf_y_cdf[above_ctx][left_ctx];
+}
+
+static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row,
+                                            int mi_col, BLOCK_SIZE subsize,
+                                            BLOCK_SIZE bsize) {
+  PARTITION_CONTEXT *const above_ctx = xd->above_partition_context + mi_col;
+  PARTITION_CONTEXT *const left_ctx =
+      xd->left_partition_context + (mi_row & MAX_MIB_MASK);
+
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  memset(above_ctx, partition_context_lookup[subsize].above, bw);
+  memset(left_ctx, partition_context_lookup[subsize].left, bh);
+}
+
+static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                      int subsampling_x, int subsampling_y) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  int ref_pos = ((mi_row & 0x01) || !(bh & 0x01) || !subsampling_y) &&
+                ((mi_col & 0x01) || !(bw & 0x01) || !subsampling_x);
+  return ref_pos;
+}
+
+static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf,
+                                            size_t element) {
+  assert(cdf != NULL);
+  return (element > 0 ? cdf[element - 1] : CDF_PROB_TOP) - cdf[element];
+}
+
+static INLINE void partition_gather_horz_alike(aom_cdf_prob *out,
+                                               const aom_cdf_prob *const in,
+                                               BLOCK_SIZE bsize) {
+  out[0] = CDF_PROB_TOP;
+  out[0] -= cdf_element_prob(in, PARTITION_HORZ);
+  out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
+  out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
+  out[0] -= cdf_element_prob(in, PARTITION_HORZ_B);
+  out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
+  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_HORZ_4);
+  out[0] = AOM_ICDF(out[0]);
+  out[1] = AOM_ICDF(CDF_PROB_TOP);
+}
+
+static INLINE void partition_gather_vert_alike(aom_cdf_prob *out,
+                                               const aom_cdf_prob *const in,
+                                               BLOCK_SIZE bsize) {
+  out[0] = CDF_PROB_TOP;
+  out[0] -= cdf_element_prob(in, PARTITION_VERT);
+  out[0] -= cdf_element_prob(in, PARTITION_SPLIT);
+  out[0] -= cdf_element_prob(in, PARTITION_HORZ_A);
+  out[0] -= cdf_element_prob(in, PARTITION_VERT_A);
+  out[0] -= cdf_element_prob(in, PARTITION_VERT_B);
+  if (bsize != BLOCK_128X128) out[0] -= cdf_element_prob(in, PARTITION_VERT_4);
+  out[0] = AOM_ICDF(out[0]);
+  out[1] = AOM_ICDF(CDF_PROB_TOP);
+}
+
+static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row,
+                                                int mi_col, BLOCK_SIZE subsize,
+                                                BLOCK_SIZE bsize,
+                                                PARTITION_TYPE partition) {
+  if (bsize >= BLOCK_8X8) {
+    const int hbs = mi_size_wide[bsize] / 2;
+    BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+    switch (partition) {
+      case PARTITION_SPLIT:
+        if (bsize != BLOCK_8X8) break;
+        AOM_FALLTHROUGH_INTENDED;
+      case PARTITION_NONE:
+      case PARTITION_HORZ:
+      case PARTITION_VERT:
+      case PARTITION_HORZ_4:
+      case PARTITION_VERT_4:
+        update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+        break;
+      case PARTITION_HORZ_A:
+        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+        update_partition_context(xd, mi_row + hbs, mi_col, subsize,
subsize); + break; + case PARTITION_HORZ_B: + update_partition_context(xd, mi_row, mi_col, subsize, subsize); + update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize); + break; + case PARTITION_VERT_A: + update_partition_context(xd, mi_row, mi_col, bsize2, subsize); + update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize); + break; + case PARTITION_VERT_B: + update_partition_context(xd, mi_row, mi_col, subsize, subsize); + update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize); + break; + default: assert(0 && "Invalid partition type"); + } + } +} + +static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const PARTITION_CONTEXT *above_ctx = xd->above_partition_context + mi_col; + const PARTITION_CONTEXT *left_ctx = + xd->left_partition_context + (mi_row & MAX_MIB_MASK); + // Minimum partition point is 8x8. Offset the bsl accordingly. + const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8]; + int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1; + + assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]); + assert(bsl >= 0); + + return (left * 2 + above) + bsl * PARTITION_PLOFFSET; +} + +// Return the number of elements in the partition CDF when +// partitioning the (square) block with luma block size of bsize. +static INLINE int partition_cdf_length(BLOCK_SIZE bsize) { + if (bsize <= BLOCK_8X8) + return PARTITION_TYPES; + else if (bsize == BLOCK_128X128) + return EXT_PARTITION_TYPES - 2; + else + return EXT_PARTITION_TYPES; +} + +static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + assert(bsize < BLOCK_SIZES_ALL); + int max_blocks_wide = block_size_wide[bsize]; + + if (xd->mb_to_right_edge < 0) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x); + } + + // Scale the width in the transform block unit. + return max_blocks_wide >> MI_SIZE_LOG2; +} + +static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane) { + int max_blocks_high = block_size_high[bsize]; + + if (xd->mb_to_bottom_edge < 0) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y); + } + + // Scale the height in the transform block unit. 
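+  // Editor's note (illustrative): mb_to_bottom_edge is stored in 1/8-pel
+  // units (via GET_MV_SUBPEL), so >> 3 converts it back to pixels and the
+  // extra subsampling shift maps luma pixels to this plane's pixels; e.g. a
+  // block extending 4 luma pixels past the bottom edge contributes
+  // -32 >> 3 == -4 pixels for plane 0.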
+  return max_blocks_high >> MI_SIZE_LOG2;
+}
+
+static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
+                                          const MACROBLOCKD *xd,
+                                          int mi_col_start, int mi_col_end,
+                                          const int tile_row) {
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const int num_planes = av1_num_planes(cm);
+  const int width = mi_col_end - mi_col_start;
+  const int aligned_width =
+      ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
+  const int offset_y = mi_col_start;
+  const int width_y = aligned_width;
+  const int offset_uv = offset_y >> seq_params->subsampling_x;
+  const int width_uv = width_y >> seq_params->subsampling_x;
+  CommonContexts *const above_contexts = &cm->above_contexts;
+
+  av1_zero_array(above_contexts->entropy[0][tile_row] + offset_y, width_y);
+  if (num_planes > 1) {
+    if (above_contexts->entropy[1][tile_row] &&
+        above_contexts->entropy[2][tile_row]) {
+      av1_zero_array(above_contexts->entropy[1][tile_row] + offset_uv,
+                     width_uv);
+      av1_zero_array(above_contexts->entropy[2][tile_row] + offset_uv,
+                     width_uv);
+    } else {
+      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                         "Invalid value of planes");
+    }
+  }
+
+  av1_zero_array(above_contexts->partition[tile_row] + mi_col_start,
+                 aligned_width);
+
+  memset(above_contexts->txfm[tile_row] + mi_col_start,
+         tx_size_wide[TX_SIZES_LARGEST], aligned_width * sizeof(TXFM_CONTEXT));
+}
+
+static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) {
+  av1_zero(xd->left_entropy_context);
+  av1_zero(xd->left_partition_context);
+
+  memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST],
+         sizeof(xd->left_txfm_context_buffer));
+}
+
+// Disable array-bounds checks, as the TX_SIZE enum contains values larger than
+// TX_SIZES_ALL (TX_INVALID), which makes extending the array as a workaround
+// infeasible. The assert is enough for static analysis, and tools such as
+// asan or valgrind would catch out-of-bounds access at runtime.
+#if defined(__GNUC__) && __GNUC__ >= 4
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+
+#if defined(__GNUC__) && __GNUC__ >= 4
+#pragma GCC diagnostic warning "-Warray-bounds"
+#endif
+
+static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
+  int i;
+  for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
+}
+
+static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip,
+                                 const MACROBLOCKD *xd) {
+  uint8_t bw = tx_size_wide[tx_size];
+  uint8_t bh = tx_size_high[tx_size];
+
+  if (skip) {
+    bw = n4_w * MI_SIZE;
+    bh = n4_h * MI_SIZE;
+  }
+
+  set_txfm_ctx(xd->above_txfm_context, bw, n4_w);
+  set_txfm_ctx(xd->left_txfm_context, bh, n4_h);
+}
+
+static INLINE int get_mi_grid_idx(const CommonModeInfoParams *const mi_params,
+                                  int mi_row, int mi_col) {
+  return mi_row * mi_params->mi_stride + mi_col;
+}
+
+static INLINE int get_alloc_mi_idx(const CommonModeInfoParams *const mi_params,
+                                   int mi_row, int mi_col) {
+  const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize];
+  const int mi_alloc_row = mi_row / mi_alloc_size_1d;
+  const int mi_alloc_col = mi_col / mi_alloc_size_1d;
+
+  return mi_alloc_row * mi_params->mi_alloc_stride + mi_alloc_col;
+}
+
+// For this partition block, set pointers in mi_params->mi_grid_base and xd->mi.
+static INLINE void set_mi_offsets(const CommonModeInfoParams *const mi_params,
+                                  MACROBLOCKD *const xd, int mi_row,
+                                  int mi_col) {
+  // 'mi_grid_base' should point to appropriate memory in 'mi'.
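+  // Editor's note (hypothetical example): with mi_alloc_bsize == BLOCK_16X16,
+  // i.e. 4 mi units per side, the mi position (row 6, col 10) falls in alloc
+  // cell (6 / 4, 10 / 4) == (1, 2), while the grid index below is computed at
+  // full mi resolution.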
+  const int mi_grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col);
+  const int mi_alloc_idx = get_alloc_mi_idx(mi_params, mi_row, mi_col);
+  mi_params->mi_grid_base[mi_grid_idx] = &mi_params->mi_alloc[mi_alloc_idx];
+  // 'xd->mi' should point to an offset in 'mi_grid_base'.
+  xd->mi = mi_params->mi_grid_base + mi_grid_idx;
+  // 'xd->tx_type_map' should point to an offset in 'mi_params->tx_type_map'.
+  xd->tx_type_map = mi_params->tx_type_map + mi_grid_idx;
+  xd->tx_type_map_stride = mi_params->mi_stride;
+}
+
+static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
+                                         TXFM_CONTEXT *left_ctx,
+                                         TX_SIZE tx_size, TX_SIZE txb_size) {
+  BLOCK_SIZE bsize = txsize_to_bsize[txb_size];
+  int bh = mi_size_high[bsize];
+  int bw = mi_size_wide[bsize];
+  uint8_t txw = tx_size_wide[tx_size];
+  uint8_t txh = tx_size_high[tx_size];
+  int i;
+  for (i = 0; i < bh; ++i) left_ctx[i] = txh;
+  for (i = 0; i < bw; ++i) above_ctx[i] = txw;
+}
+
+static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) {
+  switch (tx_dim) {
+    case 128:
+    case 64: return TX_64X64;
+    case 32: return TX_32X32;
+    case 16: return TX_16X16;
+    case 8: return TX_8X8;
+    default: return TX_4X4;
+  }
+}
+
+static INLINE TX_SIZE get_tx_size(int width, int height) {
+  if (width == height) {
+    return get_sqr_tx_size(width);
+  }
+  if (width < height) {
+    if (width + width == height) {
+      switch (width) {
+        case 4: return TX_4X8;
+        case 8: return TX_8X16;
+        case 16: return TX_16X32;
+        case 32: return TX_32X64;
+      }
+    } else {
+      switch (width) {
+        case 4: return TX_4X16;
+        case 8: return TX_8X32;
+        case 16: return TX_16X64;
+      }
+    }
+  } else {
+    if (height + height == width) {
+      switch (height) {
+        case 4: return TX_8X4;
+        case 8: return TX_16X8;
+        case 16: return TX_32X16;
+        case 32: return TX_64X32;
+      }
+    } else {
+      switch (height) {
+        case 4: return TX_16X4;
+        case 8: return TX_32X8;
+        case 16: return TX_64X16;
+      }
+    }
+  }
+  assert(0);
+  return TX_4X4;
+}
+
+static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx,
+                                         const TXFM_CONTEXT *const left_ctx,
+                                         BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  const uint8_t txw = tx_size_wide[tx_size];
+  const uint8_t txh = tx_size_high[tx_size];
+  const int above = *above_ctx < txw;
+  const int left = *left_ctx < txh;
+  int category = TXFM_PARTITION_CONTEXTS;
+
+  // Dummy return; the context is not used by callers for TX_4X4.
+  if (tx_size <= TX_4X4) return 0;
+
+  TX_SIZE max_tx_size =
+      get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize]));
+
+  if (max_tx_size >= TX_8X8) {
+    category =
+        (txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8) +
+        (TX_SIZES - 1 - max_tx_size) * 2;
+  }
+  assert(category != TXFM_PARTITION_CONTEXTS);
+  return category * 3 + above + left;
+}
+
+// Compute the next partition in the direction of the sb_type stored in the mi
+// array, starting with bsize.
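+// Editor's note (illustrative): e.g. if bsize is BLOCK_64X64 and the mi entry
+// holds a BLOCK_32X64 sub-block, the partition must be PARTITION_VERT or
+// PARTITION_VERT_B; the body below disambiguates them by checking whether the
+// right half was split further.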
+static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
+                                           int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols)
+    return PARTITION_INVALID;
+
+  const int offset = mi_row * mi_params->mi_stride + mi_col;
+  MB_MODE_INFO **mi = mi_params->mi_grid_base + offset;
+  const BLOCK_SIZE subsize = mi[0]->sb_type;
+
+  if (subsize == bsize) return PARTITION_NONE;
+
+  const int bhigh = mi_size_high[bsize];
+  const int bwide = mi_size_wide[bsize];
+  const int sshigh = mi_size_high[subsize];
+  const int sswide = mi_size_wide[subsize];
+
+  if (bsize > BLOCK_8X8 && mi_row + bhigh / 2 < mi_params->mi_rows &&
+      mi_col + bwide / 2 < mi_params->mi_cols) {
+    // In this case, the block might be using an extended partition
+    // type.
+    const MB_MODE_INFO *const mbmi_right = mi[bwide / 2];
+    const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * mi_params->mi_stride];
+
+    if (sswide == bwide) {
+      // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or
+      // PARTITION_HORZ_B. To distinguish the latter two, check if the lower
+      // half was split.
+      if (sshigh * 4 == bhigh) return PARTITION_HORZ_4;
+      assert(sshigh * 2 == bhigh);
+
+      if (mbmi_below->sb_type == subsize)
+        return PARTITION_HORZ;
+      else
+        return PARTITION_HORZ_B;
+    } else if (sshigh == bhigh) {
+      // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or
+      // PARTITION_VERT_B. To distinguish the latter two, check if the right
+      // half was split.
+      if (sswide * 4 == bwide) return PARTITION_VERT_4;
+      assert(sswide * 2 == bwide);
+
+      if (mbmi_right->sb_type == subsize)
+        return PARTITION_VERT;
+      else
+        return PARTITION_VERT_B;
+    } else {
+      // Smaller width and smaller height. Might be PARTITION_SPLIT or could be
+      // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both
+      // dimensions, we immediately know this is a split (which will recurse to
+      // get to subsize). Otherwise look down and to the right. With
+      // PARTITION_VERT_A, the right block will have height bhigh; with
+      // PARTITION_HORZ_A, the lower block will have width bwide. Otherwise
+      // it's PARTITION_SPLIT.
+      if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT;
+
+      if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A;
+      if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A;
+
+      return PARTITION_SPLIT;
+    }
+  }
+  const int vert_split = sswide < bwide;
+  const int horz_split = sshigh < bhigh;
+  const int split_idx = (vert_split << 1) | horz_split;
+  assert(split_idx != 0);
+
+  static const PARTITION_TYPE base_partitions[4] = {
+    PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT
+  };
+
+  return base_partitions[split_idx];
+}
+
+static INLINE void set_sb_size(SequenceHeader *const seq_params,
+                               BLOCK_SIZE sb_size) {
+  seq_params->sb_size = sb_size;
+  seq_params->mib_size = mi_size_wide[seq_params->sb_size];
+  seq_params->mib_size_log2 = mi_size_wide_log2[seq_params->sb_size];
+}
+
+// Returns true if the frame is fully lossless at the coded resolution.
+// Note: If super-resolution is used, such a frame will still NOT be lossless
+// at the upscaled resolution.
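+// Editor's note: xd->lossless[i] is assumed to be precomputed elsewhere from
+// segment i's effective qindex and delta-Q settings; the helper below only
+// aggregates those per-segment flags.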
+static INLINE int is_coded_lossless(const AV1_COMMON *cm,
+                                    const MACROBLOCKD *xd) {
+  int coded_lossless = 1;
+  if (cm->seg.enabled) {
+    for (int i = 0; i < MAX_SEGMENTS; ++i) {
+      if (!xd->lossless[i]) {
+        coded_lossless = 0;
+        break;
+      }
+    }
+  } else {
+    coded_lossless = xd->lossless[0];
+  }
+  return coded_lossless;
+}
+
+static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) {
+  return seq_level_idx == SEQ_LEVEL_MAX ||
+         (seq_level_idx < SEQ_LEVELS &&
+          // The following levels are currently undefined.
+          seq_level_idx != SEQ_LEVEL_2_2 && seq_level_idx != SEQ_LEVEL_2_3 &&
+          seq_level_idx != SEQ_LEVEL_3_2 && seq_level_idx != SEQ_LEVEL_3_3 &&
+          seq_level_idx != SEQ_LEVEL_4_2 && seq_level_idx != SEQ_LEVEL_4_3 &&
+          seq_level_idx != SEQ_LEVEL_7_0 && seq_level_idx != SEQ_LEVEL_7_1 &&
+          seq_level_idx != SEQ_LEVEL_7_2 && seq_level_idx != SEQ_LEVEL_7_3);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_AV1_COMMON_INT_H_
diff --git a/libs/libaom/src/av1/common/av1_inv_txfm1d.c b/libs/libaom/src/av1/common/av1_inv_txfm1d.c
new file mode 100644
index 000000000..8d69efcd2
--- /dev/null
+++ b/libs/libaom/src/av1/common/av1_inv_txfm1d.c
@@ -0,0 +1,1841 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+
+void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
+  assert(output != input);
+  const int32_t size = 4;
+  const int32_t *cospi = cospi_arr(cos_bit);
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[4];
+
+  // stage 0;
+
+  // stage 1;
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[2];
+  bf1[2] = input[1];
+  bf1[3] = input[3];
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+  bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+  bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+  bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+}
+
+void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
+  assert(output != input);
+  const int32_t size = 8;
+  const int32_t *cospi = cospi_arr(cos_bit);
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[8];
+
+  // stage 0;
+
+  // stage 1;
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[4];
+  bf1[2] = input[2];
+  bf1[3] = input[6];
+  bf1[4] = input[1];
+  bf1[5] = input[5];
+  bf1[6] = input[3];
+  bf1[7] = input[7];
+  av1_range_check_buf(stage, input, bf1, size,
stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); +} + +void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 16; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[8]; + bf1[2] = input[4]; + bf1[3] = input[12]; + bf1[4] = input[2]; + bf1[5] = input[10]; + bf1[6] = input[6]; + bf1[7] = input[14]; + bf1[8] = input[1]; + bf1[9] = input[9]; + bf1[10] = input[5]; + bf1[11] = input[13]; + bf1[12] = input[3]; + bf1[13] = input[11]; + bf1[14] = input[7]; + bf1[15] = input[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = 
half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] 
= clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); +} + +void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 32; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[32]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[16]; + bf1[2] = input[8]; + bf1[3] = input[24]; + bf1[4] = input[4]; + bf1[5] = input[20]; + bf1[6] = input[12]; + bf1[7] = input[28]; + bf1[8] = input[2]; + bf1[9] = input[18]; + bf1[10] = input[10]; + bf1[11] = input[26]; + bf1[12] = input[6]; + bf1[13] = input[22]; + bf1[14] = input[14]; + bf1[15] = input[30]; + bf1[16] = input[1]; + bf1[17] = input[17]; + bf1[18] = input[9]; + bf1[19] = input[25]; + bf1[20] = input[5]; + bf1[21] = input[21]; + bf1[22] = input[13]; + bf1[23] = input[29]; + bf1[24] = input[3]; + bf1[25] = input[19]; + bf1[26] = input[11]; + bf1[27] = input[27]; + bf1[28] = input[7]; + bf1[29] = input[23]; + bf1[30] = input[15]; + bf1[31] = input[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit); 
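+  // Editor's note: half_btf(w0, in0, w1, in1, bit) computes
+  // round_shift(w0 * in0 + w1 * in1, bit); the mirrored index pairs in this
+  // stage (16/31, 17/30, ...) thus form 2-point butterfly rotations.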
+ bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit); + bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]); + bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]); + bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]); + bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]); + bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]); + bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]); + bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]); + bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); + bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], 
stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]); + bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]); + bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]); + bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]); + bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]); + bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); + bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = 
clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit); + bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]); + bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]); + bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]); + bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]); + bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], 
stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]); + bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]); + bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]); + bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[1] 
- bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); +} + +void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + int bit = cos_bit; + const int32_t *sinpi = sinpi_arr(bit); + int32_t s0, s1, s2, s3, s4, s5, s6, s7; + + int32_t x0 = input[0]; + int32_t x1 = input[1]; + int32_t x2 = input[2]; + int32_t x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + assert(sinpi[1] + sinpi[2] == sinpi[4]); + + // stage 1 + s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit); + s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit); + s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit); + s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit); + s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit); + s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit); + s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit); + + // stage 2 + // NOTICE: (x0 - x2) here may use one extra bit compared to the + // opt_range_row/col specified in av1_gen_inv_stage_range() + s7 = range_check_value((x0 - x2) + x3, stage_range[2]); + + // stage 3 + s0 = range_check_value(s0 + s3, stage_range[3] + bit); + s1 = range_check_value(s1 - s4, stage_range[3] + bit); + s3 = range_check_value(s2, stage_range[3] + bit); + s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit); + + // stage 4 + s0 = range_check_value(s0 + s5, stage_range[4] + bit); + s1 = range_check_value(s1 - s6, stage_range[4] + bit); + + // stage 5 + x0 = range_check_value(s0 + s3, stage_range[5] + bit); + x1 = range_check_value(s1 + s3, stage_range[5] + bit); + x2 = range_check_value(s2, stage_range[5] + bit); + x3 = range_check_value(s0 + s1, stage_range[5] + bit); + + // stage 6 + x3 = range_check_value(x3 - s3, stage_range[6] + bit); + + output[0] = round_shift(x0, bit); + output[1] = round_shift(x1, bit); + output[2] = round_shift(x2, bit); + output[3] = round_shift(x3, bit); +} + +void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 8; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[7]; + bf1[1] = input[0]; + bf1[2] = input[5]; + bf1[3] = input[2]; + bf1[4] = input[3]; + bf1[5] = input[4]; + bf1[6] = input[1]; + bf1[7] = input[6]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[7], 
stage_range[stage]); + bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]); + bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]); + bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = -bf0[4]; + bf1[2] = bf0[6]; + bf1[3] = -bf0[2]; + bf1[4] = bf0[3]; + bf1[5] = -bf0[7]; + bf1[6] = bf0[5]; + bf1[7] = -bf0[1]; +} + +void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 16; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[15]; + bf1[1] = input[0]; + bf1[2] = input[13]; + bf1[3] = input[2]; + bf1[4] = input[11]; + bf1[5] = input[4]; + bf1[6] = input[9]; + bf1[7] = input[6]; + bf1[8] = input[7]; + bf1[9] = input[8]; + bf1[10] = input[5]; + bf1[11] = input[10]; + bf1[12] = input[3]; + bf1[13] = input[12]; + bf1[14] = input[1]; + bf1[15] = input[14]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); + bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[30], bf0[8], 
-cospi[34], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); + bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]); + bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); + bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]); + bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]); + bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]); + 
av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]); + bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]); + bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]); + bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]); + bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = -bf0[8]; + bf1[2] = bf0[12]; + bf1[3] = -bf0[4]; + bf1[4] = bf0[6]; + bf1[5] = -bf0[14]; + bf1[6] = bf0[10]; + bf1[7] = -bf0[2]; + bf1[8] = bf0[3]; + bf1[9] = -bf0[11]; + bf1[10] = bf0[15]; + bf1[11] = -bf0[7]; + bf1[12] = bf0[5]; + bf1[13] = -bf0[13]; + bf1[14] = bf0[9]; + bf1[15] = -bf0[1]; +} + +void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + (void)stage_range; + for (int i = 0; i < 4; ++i) { + output[i] = round_shift((int64_t)NewSqrt2 * input[i], NewSqrt2Bits); + } + assert(stage_range[0] + NewSqrt2Bits <= 32); +} + +void 
av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + (void)stage_range; + for (int i = 0; i < 8; ++i) output[i] = (int32_t)((int64_t)input[i] * 2); +} + +void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + (void)stage_range; + for (int i = 0; i < 16; ++i) + output[i] = round_shift((int64_t)NewSqrt2 * 2 * input[i], NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 32); +} + +void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + (void)stage_range; + for (int i = 0; i < 32; ++i) output[i] = (int32_t)((int64_t)input[i] * 4); +} + +void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + assert(output != input); + const int32_t size = 64; + const int32_t *cospi = cospi_arr(cos_bit); + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[64]; + + // stage 0; + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0]; + bf1[1] = input[32]; + bf1[2] = input[16]; + bf1[3] = input[48]; + bf1[4] = input[8]; + bf1[5] = input[40]; + bf1[6] = input[24]; + bf1[7] = input[56]; + bf1[8] = input[4]; + bf1[9] = input[36]; + bf1[10] = input[20]; + bf1[11] = input[52]; + bf1[12] = input[12]; + bf1[13] = input[44]; + bf1[14] = input[28]; + bf1[15] = input[60]; + bf1[16] = input[2]; + bf1[17] = input[34]; + bf1[18] = input[18]; + bf1[19] = input[50]; + bf1[20] = input[10]; + bf1[21] = input[42]; + bf1[22] = input[26]; + bf1[23] = input[58]; + bf1[24] = input[6]; + bf1[25] = input[38]; + bf1[26] = input[22]; + bf1[27] = input[54]; + bf1[28] = input[14]; + bf1[29] = input[46]; + bf1[30] = input[30]; + bf1[31] = input[62]; + bf1[32] = input[1]; + bf1[33] = input[33]; + bf1[34] = input[17]; + bf1[35] = input[49]; + bf1[36] = input[9]; + bf1[37] = input[41]; + bf1[38] = input[25]; + bf1[39] = input[57]; + bf1[40] = input[5]; + bf1[41] = input[37]; + bf1[42] = input[21]; + bf1[43] = input[53]; + bf1[44] = input[13]; + bf1[45] = input[45]; + bf1[46] = input[29]; + bf1[47] = input[61]; + bf1[48] = input[3]; + bf1[49] = input[35]; + bf1[50] = input[19]; + bf1[51] = input[51]; + bf1[52] = input[11]; + bf1[53] = input[43]; + bf1[54] = input[27]; + bf1[55] = input[59]; + bf1[56] = input[7]; + bf1[57] = input[39]; + bf1[58] = input[23]; + bf1[59] = input[55]; + bf1[60] = input[15]; + bf1[61] = input[47]; + bf1[62] = input[31]; + bf1[63] = input[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = bf0[21]; + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = bf0[26]; + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit); + bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit); + bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit); + bf1[35] = 
half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit); + bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit); + bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit); + bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit); + bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit); + bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit); + bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit); + bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit); + bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit); + bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit); + bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit); + bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit); + bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit); + bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit); + bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit); + bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit); + bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit); + bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit); + bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit); + bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit); + bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit); + bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit); + bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit); + bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], 
cos_bit); + bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit); + bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]); + bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]); + bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]); + bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]); + bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]); + bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]); + bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]); + bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]); + bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]); + bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]); + bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]); + bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]); + bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]); + bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]); + bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]); + bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]); + bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]); + bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]); + bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]); + bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]); + bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]); + bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]); + bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit); + bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit); + bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]); + bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]); + bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]); + bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]); + bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]); 
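+ // A note on the two primitives used throughout these stages: cospi[j] is
+ // the fixed-point constant round(2^cos_bit * cos(j * PI / 128)), and
+ // half_btf(w0, a, w1, b, bit) returns round_shift(w0 * a + w1 * b, bit)
+ // (both live in av1/common/av1_txfm.h). Each half_btf pair is therefore one
+ // planar rotation, while the clamp_value() pairs are the saturating
+ // add/sub halves of a butterfly, clamped to the per-stage bit range.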
+ bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]); + bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]); + bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]); + bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); + bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); + bf1[43] = bf0[43]; + bf1[44] = bf0[44]; + bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit); + bf1[51] = bf0[51]; + bf1[52] = bf0[52]; + bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit); + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit); + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit); + bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit); + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit); + bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit); + bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]); + bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]); + bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]); + bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]); + bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], 
cos_bit); + bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit); + bf1[31] = bf0[31]; + bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]); + bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]); + bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]); + bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]); + bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]); + bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]); + bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]); + bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]); + bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]); + bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]); + bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]); + bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]); + bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]); + bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]); + bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]); + bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]); + bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]); + bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]); + bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]); + bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]); + bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit); + bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]); + bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]); + bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]); + bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]); + bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]); + 
bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]); + bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]); + bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]); + bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); + bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); + bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = bf0[41]; + bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit); + bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit); + bf1[54] = bf0[54]; + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit); + bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit); + bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit); + bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit); + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]); + bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]); + bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]); + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]); + bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]); + bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]); + bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] 
= bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit); + bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit); + bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit); + bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]); + bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]); + bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]); + bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]); + bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]); + bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]); + bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]); + bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]); + bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]); + bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]); + bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]); + bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]); + bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]); + bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]); + bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]); + bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]); + bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]); + bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]); + bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]); + bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]); + bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]); + bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]); + bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]); + bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] 
+ bf0[21], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]); + bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]); + bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]); + bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]); + bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]); + bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); + bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); + bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); + bf1[44] = bf0[44]; + bf1[45] = bf0[45]; + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = bf0[50]; + bf1[51] = bf0[51]; + bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit); + bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit); + bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit); + bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit); + bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit); + bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit); + bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit); + bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit); + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]); + bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]); + bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]); + bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]); + bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]); + bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]); + bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]); + bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]); + bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]); + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = 
bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]); + bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]); + bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]); + bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]); + bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]); + bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]); + bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]); + bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]); + bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]); + bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]); + bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]); + bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]); + bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]); + bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]); + bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]); + bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]); + bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]); + bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]); + bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]); + bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]); + bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 10 + stage++; + bf0 = output; + bf1 = step; + bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]); + bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]); + bf1[11] = 
clamp_value(bf0[11] + bf0[20], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]); + bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]); + bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]); + bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]); + bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]); + bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]); + bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]); + bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]); + bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]); + bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]); + bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]); + bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]); + bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]); + bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]); + bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]); + bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]); + bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]); + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = bf0[37]; + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = bf0[58]; + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 11 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]); + bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]); + bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]); + bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]); + bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]); + bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]); + bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]); + bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]); + bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]); + bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]); + 
bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]); + bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]); + bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]); + bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]); + bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]); + bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]); + bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]); + bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]); + bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]); + bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]); + bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]); + bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]); + bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]); + bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]); + bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]); + bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]); + bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]); + bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]); + bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]); + bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]); + bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]); + bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]); + bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]); + bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]); + bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]); + bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]); + bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]); + bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]); + bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]); + bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]); + bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]); + bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]); + bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]); + bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]); + bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]); + bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]); + bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]); + bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]); + bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]); + bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]); + bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]); + bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]); + bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]); + bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]); + bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]); + bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]); + bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]); + bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]); + bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]); + bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]); + bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]); + bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]); + bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]); + bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]); +} diff --git a/libs/libaom/src/av1/common/av1_inv_txfm1d.h b/libs/libaom/src/av1/common/av1_inv_txfm1d.h new 
file mode 100644 index 000000000..e1d5d98d1 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_inv_txfm1d.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ +#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ + +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE int32_t clamp_value(int32_t value, int8_t bit) { + if (bit <= 0) return value; // Do nothing for invalid clamp bit. + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + return (int32_t)clamp64(value, min_value, max_value); +} + +static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) { + for (int i = 0; i < size; ++i) buf[i] = clamp_value(buf[i], bit); +} + +void av1_idct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_idct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_ diff --git a/libs/libaom/src/av1/common/av1_inv_txfm1d_cfg.h b/libs/libaom/src/av1/common/av1_inv_txfm1d_cfg.h new file mode 100644 index 000000000..47fedbd2a --- /dev/null +++ b/libs/libaom/src/av1/common/av1_inv_txfm1d_cfg.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ +#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ +#include "av1/common/av1_inv_txfm1d.h" + +// sum of fwd_shift_## +static const int8_t inv_start_range[TX_SIZES_ALL] = { + 5, // 4x4 transform + 6, // 8x8 transform + 7, // 16x16 transform + 7, // 32x32 transform + 7, // 64x64 transform + 5, // 4x8 transform + 5, // 8x4 transform + 6, // 8x16 transform + 6, // 16x8 transform + 6, // 16x32 transform + 6, // 32x16 transform + 6, // 32x64 transform + 6, // 64x32 transform + 6, // 4x16 transform + 6, // 16x4 transform + 7, // 8x32 transform + 7, // 32x8 transform + 7, // 16x64 transform + 7, // 64x16 transform +}; + +extern const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL]; + +// Values in both av1_inv_cos_bit_col and av1_inv_cos_bit_row are always 12 +// for each valid row and col combination +#define INV_COS_BIT 12 +extern const int8_t av1_inv_cos_bit_col[5 /*row*/][5 /*col*/]; +extern const int8_t av1_inv_cos_bit_row[5 /*row*/][5 /*col*/]; + +#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_ diff --git a/libs/libaom/src/av1/common/av1_inv_txfm2d.c b/libs/libaom/src/av1/common/av1_inv_txfm2d.c new file mode 100644 index 000000000..559d12129 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_inv_txfm2d.c @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/av1_inv_txfm1d.h" +#include "av1/common/av1_inv_txfm1d_cfg.h" + +void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. 
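+      Each 4-sample pass below costs 7 additions and one shift (the lifting
+      step e1 = (a1 - d1) >> 1); run over rows and then columns, that is the
+      3.5 adds and 0.5 shifts per pixel quoted above, and the lifting
+      structure is what keeps the integer transform exactly reversible.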
*/ + int i; + tran_low_t output[16]; + tran_low_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + + op[0] = a1; + op[1] = b1; + op[2] = c1; + op[3] = d1; + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + + range_check_value(a1, bd + 1); + range_check_value(b1, bd + 1); + range_check_value(c1, bd + 1); + range_check_value(d1, bd + 1); + + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd); + + ip++; + dest++; + } +} + +void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_low_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + (void)bd; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = a1; + op[1] = op[2] = op[3] = e1; + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = + highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = + highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = + highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = + highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); + ip++; + dest++; + } +} + +static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT4: return av1_idct4; + case TXFM_TYPE_DCT8: return av1_idct8; + case TXFM_TYPE_DCT16: return av1_idct16; + case TXFM_TYPE_DCT32: return av1_idct32; + case TXFM_TYPE_DCT64: return av1_idct64; + case TXFM_TYPE_ADST4: return av1_iadst4; + case TXFM_TYPE_ADST8: return av1_iadst8; + case TXFM_TYPE_ADST16: return av1_iadst16; + case TXFM_TYPE_IDENTITY4: return av1_iidentity4_c; + case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c; + case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c; + case TXFM_TYPE_IDENTITY32: return av1_iidentity32_c; + default: assert(0); return NULL; + } +} + +static const int8_t inv_shift_4x4[2] = { 0, -4 }; +static const int8_t inv_shift_8x8[2] = { -1, -4 }; +static const int8_t inv_shift_16x16[2] = { -2, -4 }; +static const int8_t inv_shift_32x32[2] = { -2, -4 }; +static const int8_t inv_shift_64x64[2] = { -2, -4 }; +static const int8_t inv_shift_4x8[2] = { 0, -4 }; +static const int8_t inv_shift_8x4[2] = { 0, -4 }; +static const int8_t inv_shift_8x16[2] = { -1, -4 }; +static const int8_t inv_shift_16x8[2] = { -1, -4 }; +static const int8_t inv_shift_16x32[2] = { -1, -4 }; +static const int8_t inv_shift_32x16[2] = { -1, -4 }; +static const int8_t inv_shift_32x64[2] = { -1, -4 }; +static const int8_t inv_shift_64x32[2] = { -1, -4 }; +static const int8_t inv_shift_4x16[2] = { -1, -4 }; +static const int8_t inv_shift_16x4[2] = { -1, -4 }; +static const int8_t inv_shift_8x32[2] = { -2, -4 }; +static 
const int8_t inv_shift_32x8[2] = { -2, -4 }; +static const int8_t inv_shift_16x64[2] = { -2, -4 }; +static const int8_t inv_shift_64x16[2] = { -2, -4 }; + +const int8_t *av1_inv_txfm_shift_ls[TX_SIZES_ALL] = { + inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32, + inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16, + inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64, + inv_shift_64x32, inv_shift_4x16, inv_shift_16x4, inv_shift_8x32, + inv_shift_32x8, inv_shift_16x64, inv_shift_64x16, +}; + +/* clang-format off */ +const int8_t av1_inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx + [MAX_TXWH_IDX] = { // txh_idx + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 }, + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 }, + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }, + { 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }, + { 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT } + }; + +const int8_t av1_inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx + [MAX_TXWH_IDX] = { // txh_idx + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 }, + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 }, + { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }, + { 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }, + { 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT } + }; +/* clang-format on */ + +static const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 }; + +void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg) { + assert(cfg != NULL); + cfg->tx_size = tx_size; + av1_zero(cfg->stage_range_col); + av1_zero(cfg->stage_range_row); + set_flip_cfg(tx_type, cfg); + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + cfg->shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + cfg->cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + cfg->cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; + if (cfg->txfm_type_col == TXFM_TYPE_ADST4) { + memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range)); + } + cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; + if (cfg->txfm_type_row == TXFM_TYPE_ADST4) { + memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range)); + } + cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; + cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; +} + +void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size, + int bd) { + const int fwd_shift = inv_start_range[tx_size]; + const int8_t *shift = cfg->shift; + int8_t opt_range_row, opt_range_col; + if (bd == 8) { + opt_range_row = 16; + opt_range_col = 16; + } else if (bd == 10) { + opt_range_row = 18; + opt_range_col = 16; + } else { + assert(bd == 12); + opt_range_row = 20; + opt_range_col = 18; + } + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { + int real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1; + (void)real_range_row; + if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) { + // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 + // so opt_range_row >= real_range_row will not hold + stage_range_row[i] = opt_range_row; + } else { + assert(opt_range_row >= real_range_row); + stage_range_row[i] = 
opt_range_row; + } + } + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { + int real_range_col = + cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1; + (void)real_range_col; + if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) { + // the adst4 may use 1 extra bit on top of opt_range_col at stage 1 + // so opt_range_col >= real_range_col will not hold + stage_range_col[i] = opt_range_col; + } else { + assert(opt_range_col >= real_range_col); + stage_range_col[i] = opt_range_col; + } + } +} + +static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output, + int stride, TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf, TX_SIZE tx_size, + int bd) { + // Note when assigning txfm_size_col, we use the txfm_size from the + // row configuration and vice versa. This is intentionally done to + // accurately perform rectangular transforms. When the transform is + // rectangular, the number of columns will be the same as the + // txfm_size stored in the row cfg struct. It will make no difference + // for square transforms. + const int txfm_size_col = tx_size_wide[cfg->tx_size]; + const int txfm_size_row = tx_size_high[cfg->tx_size]; + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); + assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); + av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd); + + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row); + + // txfm_buf's length is txfm_size_row * txfm_size_col + 2 * + // AOMMAX(txfm_size_row, txfm_size_col) + // it is used for intermediate data buffering + const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); + int32_t *temp_in = txfm_buf; + int32_t *temp_out = temp_in + buf_offset; + int32_t *buf = temp_out + buf_offset; + int32_t *buf_ptr = buf; + int c, r; + + // Rows + for (r = 0; r < txfm_size_row; ++r) { + if (abs(rect_type) == 1) { + for (c = 0; c < txfm_size_col; ++c) { + temp_in[c] = round_shift((int64_t)input[c] * NewInvSqrt2, NewSqrt2Bits); + } + clamp_buf(temp_in, txfm_size_col, bd + 8); + txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); + } else { + for (c = 0; c < txfm_size_col; ++c) { + temp_in[c] = input[c]; + } + clamp_buf(temp_in, txfm_size_col, bd + 8); + txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row); + } + av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); + input += txfm_size_col; + buf_ptr += txfm_size_col; + } + + // Columns + for (c = 0; c < txfm_size_col; ++c) { + if (cfg->lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + c]; + } else { + // flip left right + for (r = 0; r < txfm_size_row; ++r) + temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; + } + clamp_buf(temp_in, txfm_size_row, AOMMAX(bd + 6, 16)); + txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + if (cfg->ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = + highbd_clip_pixel_add(output[r 
* stride + c], temp_out[r], bd); + } + } else { + // flip upside down + for (r = 0; r < txfm_size_row; ++r) { + output[r * stride + c] = highbd_clip_pixel_add( + output[r * stride + c], temp_out[txfm_size_row - r - 1], bd); + } + } + } +} + +static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output, + int stride, int32_t *txfm_buf, + TX_TYPE tx_type, TX_SIZE tx_size, + int bd) { + TXFM_2D_FLIP_CFG cfg; + av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg); + // Forward shift sum uses larger square size, to be consistent with what + // av1_gen_inv_stage_range() does for inverse shifts. + inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, tx_size, bd); +} + +void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X8, bd); +} + +void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd); +} + +void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X16, bd); +} + +void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd); +} + +void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[16 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X32, bd); +} + +void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd); +} + +void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 4 + 4]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X4, bd); +} + +void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 8 + 8 + 8]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X8, bd); +} + +void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[16 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X16, bd); +} + +void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X32, bd); +} + +void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // TODO(urvang): Can the same array be reused, instead of using a new array? + // Remap 32x32 input into a modified 64x64 by: + // - Copying over these values in top-left 32x32 locations. 
+ // - Setting the rest of the locations to 0. + int32_t mod_input[64 * 64]; + for (int row = 0; row < 32; ++row) { + memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); + memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + memset(mod_input + 32 * 64, 0, 32 * 64 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X64, + bd); +} + +void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x32 input into a modified 64x32 by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[64 * 32]; + for (int row = 0; row < 32; ++row) { + memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); + memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32, + bd); +} + +void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x32 input into a modified 32x64 input by: + // - Copying over these values in top-left 32x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[32 * 64]; + memcpy(mod_input, input, 32 * 32 * sizeof(*mod_input)); + memset(mod_input + 32 * 32, 0, 32 * 32 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_32X64, + bd); +} + +void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 16x32 input into a modified 16x64 input by: + // - Copying over these values in top-left 16x32 locations. + // - Setting the rest of the locations to 0. + int32_t mod_input[16 * 64]; + memcpy(mod_input, input, 16 * 32 * sizeof(*mod_input)); + memset(mod_input + 16 * 32, 0, 16 * 32 * sizeof(*mod_input)); + DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_16X64, + bd); +} + +void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + // Remap 32x16 input into a modified 64x16 by: + // - Copying over these values in top-left 32x16 locations. + // - Setting the rest of the locations to 0. 
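As the comments above describe, each 64-point wrapper feeds the facade a zero-extended copy of the coded coefficients, since at most the top-left 32x32 block of a 64-wide or 64-tall transform carries nonzero values. A minimal standalone sketch of that remap step, assuming the same int32_t coefficient layout (the helper name is illustrative, not part of the patch):

#include <stdint.h>
#include <string.h>

// Copy `rows` coded rows of 32 coefficients into the left half of a
// 64-wide buffer and zero the right half of each row; fully-zero rows
// below the coded region (as in the 64x64 case) are cleared by a
// separate memset.
static void remap_32_wide_to_64_wide(const int32_t *input,
                                     int32_t *mod_input, int rows) {
  for (int r = 0; r < rows; ++r) {
    memcpy(mod_input + r * 64, input + r * 32, 32 * sizeof(*mod_input));
    memset(mod_input + r * 64 + 32, 0, 32 * sizeof(*mod_input));
  }
}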
+ int32_t mod_input[64 * 16]; + for (int row = 0; row < 16; ++row) { + memcpy(mod_input + row * 64, input + row * 32, 32 * sizeof(*mod_input)); + memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input)); + } + DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]); + inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16, + bd); +} + +void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_4X16, bd); +} + +void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd); +} + +void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X32, bd); +} + +void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]); + inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd); +} diff --git a/libs/libaom/src/av1/common/av1_loopfilter.c b/libs/libaom/src/av1/common/av1_loopfilter.c new file mode 100644 index 000000000..c756760de --- /dev/null +++ b/libs/libaom/src/av1/common/av1_loopfilter.c @@ -0,0 +1,790 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/reconinter.h" +#include "av1/common/seg_common.h" + +static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = { + { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H }, + { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U }, + { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V } +}; + +static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 }, + { 2, 2 }, + { 3, 3 } }; + +static const int mode_lf_lut[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES + 1, 1, 0, 1, // INTER_MODES (GLOBALMV == 0) + 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0) +}; + +static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) { + int lvl; + + // For each possible value for the loop filter fill out limits + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) { + // Set loop filter parameters that control sharpness.
+ int block_inside_limit = lvl >> ((sharpness_lvl > 0) + (sharpness_lvl > 4)); + + if (sharpness_lvl > 0) { + if (block_inside_limit > (9 - sharpness_lvl)) + block_inside_limit = (9 - sharpness_lvl); + } + + if (block_inside_limit < 1) block_inside_limit = 1; + + memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH); + memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit), + SIMD_WIDTH); + } +} + +uint8_t av1_get_filter_level(const AV1_COMMON *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi) { + const int segment_id = mbmi->segment_id; + if (cm->delta_q_info.delta_lf_present_flag) { + int8_t delta_lf; + if (cm->delta_q_info.delta_lf_multi) { + const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx]; + delta_lf = mbmi->delta_lf[delta_lf_idx]; + } else { + delta_lf = mbmi->delta_lf_from_base; + } + int base_level; + if (plane == 0) + base_level = cm->lf.filter_level[dir_idx]; + else if (plane == 1) + base_level = cm->lf.filter_level_u; + else + base_level = cm->lf.filter_level_v; + int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER); + assert(plane >= 0 && plane <= 2); + const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx]; + if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) { + const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id); + lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); + } + + if (cm->lf.mode_ref_delta_enabled) { + const int scale = 1 << (lvl_seg >> 5); + lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale; + if (mbmi->ref_frame[0] > INTRA_FRAME) + lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale; + lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER); + } + return lvl_seg; + } else { + return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]] + [mode_lf_lut[mbmi->mode]]; + } +} + +void av1_loop_filter_init(AV1_COMMON *cm) { + assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut)); + loop_filter_info_n *lfi = &cm->lf_info; + struct loopfilter *lf = &cm->lf; + int lvl; + + lf->combine_vert_horz_lf = 1; + + // init limits for given sharpness + update_sharpness(lfi, lf->sharpness_level); + + // init hev threshold const vectors + for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) + memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH); +} + +// Update the loop filter for the current frame. +// This should be called before loop_filter_rows(); +// av1_loop_filter_frame() calls this function directly.
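For reference, the thresholds filled in by update_sharpness() (used both above and in av1_loop_filter_frame_init() below) reduce to three scalar formulas per filter level. A hedged sketch of the same derivation for a single level (the function name is illustrative, not part of the patch):

// Inner-edge limit for one level/sharpness pair, mirroring
// update_sharpness() above.
static int example_inside_limit(int lvl, int sharpness) {
  int limit = lvl >> ((sharpness > 0) + (sharpness > 4));
  if (sharpness > 0 && limit > 9 - sharpness) limit = 9 - sharpness;
  return limit < 1 ? 1 : limit;
}
// The remaining two thresholds follow directly:
//   mblim   = 2 * (lvl + 2) + inside_limit  (outer-edge limit)
//   hev_thr = lvl >> 4                      (high-edge-variance threshold)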
+void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start, + int plane_end) { + int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE]; + int plane; + int seg_id; + // n_shift is the multiplier for lf_deltas: + // the multiplier is 1 when filter_lvl is between 0 and 31, + // and 2 when filter_lvl is between 32 and 63 + loop_filter_info_n *const lfi = &cm->lf_info; + struct loopfilter *const lf = &cm->lf; + const struct segmentation *const seg = &cm->seg; + + // update sharpness limits + update_sharpness(lfi, lf->sharpness_level); + + filt_lvl[0] = cm->lf.filter_level[0]; + filt_lvl[1] = cm->lf.filter_level_u; + filt_lvl[2] = cm->lf.filter_level_v; + + filt_lvl_r[0] = cm->lf.filter_level[1]; + filt_lvl_r[1] = cm->lf.filter_level_u; + filt_lvl_r[2] = cm->lf.filter_level_v; + + assert(plane_start >= AOM_PLANE_Y); + assert(plane_end <= MAX_MB_PLANE); + + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0]) + break; + else if (plane == 1 && !filt_lvl[1]) + continue; + else if (plane == 2 && !filt_lvl[2]) + continue; + + for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { + for (int dir = 0; dir < 2; ++dir) { + int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane]; + const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir]; + if (segfeature_active(seg, seg_id, seg_lf_feature_id)) { + const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id); + lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER); + } + + if (!lf->mode_ref_delta_enabled) { + // we could get rid of this if we assume that deltas are set to + // zero when not in use; encoder always uses deltas + memset(lfi->lvl[plane][seg_id][dir], lvl_seg, + sizeof(lfi->lvl[plane][seg_id][dir])); + } else { + int ref, mode; + const int scale = 1 << (lvl_seg >> 5); + const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale; + lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] = + clamp(intra_lvl, 0, MAX_LOOP_FILTER); + + for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) { + for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { + const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale + + lf->mode_deltas[mode] * scale; + lfi->lvl[plane][seg_id][dir][ref][mode] = + clamp(inter_lvl, 0, MAX_LOOP_FILTER); + } + } + } + } + } + } +} + +static TX_SIZE get_transform_size(const MACROBLOCKD *const xd, + const MB_MODE_INFO *const mbmi, + const EDGE_DIR edge_dir, const int mi_row, + const int mi_col, const int plane, + const struct macroblockd_plane *plane_ptr) { + assert(mbmi != NULL); + if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4; + + TX_SIZE tx_size = + (plane == AOM_PLANE_Y) + ? mbmi->tx_size + : av1_get_max_uv_txsize(mbmi->sb_type, plane_ptr->subsampling_x, + plane_ptr->subsampling_y); + assert(tx_size < TX_SIZES_ALL); + if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip) { + const BLOCK_SIZE sb_type = mbmi->sb_type; + const int blk_row = mi_row & (mi_size_high[sb_type] - 1); + const int blk_col = mi_col & (mi_size_wide[sb_type] - 1); + const TX_SIZE mb_tx_size = + mbmi->inter_tx_size[av1_get_txb_size_index(sb_type, blk_row, blk_col)]; + assert(mb_tx_size < TX_SIZES_ALL); + tx_size = mb_tx_size; + } + + // For chroma planes and non-square transforms, the transform size must be + // converted into the transform size in a particular direction: + // for a vertical edge the filter direction is horizontal; for a + // horizontal edge the filter direction is vertical. + tx_size = (VERT_EDGE == edge_dir) ?
txsize_horz_map[tx_size] + : txsize_vert_map[tx_size]; + return tx_size; +} + +typedef struct AV1_DEBLOCKING_PARAMETERS { + // length of the filter applied to the outer edge + uint32_t filter_length; + // deblocking limits + const uint8_t *lim; + const uint8_t *mblim; + const uint8_t *hev_thr; +} AV1_DEBLOCKING_PARAMETERS; + +// Return TX_SIZE from get_transform_size(), so it is plane and direction +// aware +static TX_SIZE set_lpf_parameters( + AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step, + const AV1_COMMON *const cm, const MACROBLOCKD *const xd, + const EDGE_DIR edge_dir, const uint32_t x, const uint32_t y, + const int plane, const struct macroblockd_plane *const plane_ptr) { + // reset to initial values + params->filter_length = 0; + + // no deblocking is required + const uint32_t width = plane_ptr->dst.width; + const uint32_t height = plane_ptr->dst.height; + if ((width <= x) || (height <= y)) { + // just return the smallest transform unit size + return TX_4X4; + } + + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + // for a sub8x8 block, the chroma prediction mode is obtained from the + // bottom/right mi structure of the co-located 8x8 luma block. So for the + // chroma plane, mi_row and mi_col should map to the bottom/right mi + // structure, i.e., both mi_row and mi_col should be odd numbers. + const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2); + const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2); + MB_MODE_INFO **mi = + cm->mi_params.mi_grid_base + mi_row * cm->mi_params.mi_stride + mi_col; + const MB_MODE_INFO *mbmi = mi[0]; + // If the current mbmi is not correctly set up, return an invalid value to + // stop filtering. One example is that if this tile is not coded, then its + // mbmi is not set up. + if (mbmi == NULL) return TX_INVALID; + + const TX_SIZE ts = + get_transform_size(xd, mi[0], edge_dir, mi_row, mi_col, plane, plane_ptr); + + { + const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y); + const uint32_t transform_masks = + edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1; + const int32_t tu_edge = (coord & transform_masks) ? (0) : (1); + + if (!tu_edge) return ts; + + // prepare outer edge parameters. deblock the edge if it's an edge of a TU + { + const uint32_t curr_level = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi); + const int curr_skipped = mbmi->skip && is_inter_block(mbmi); + uint32_t level = curr_level; + if (coord) { + { + const MB_MODE_INFO *const mi_prev = *(mi - mode_step); + if (mi_prev == NULL) return TX_INVALID; + const int pv_row = + (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert)); + const int pv_col = + (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col); + const TX_SIZE pv_ts = get_transform_size( + xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr); + + const uint32_t pv_lvl = + av1_get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev); + + const int pv_skip = mi_prev->skip && is_inter_block(mi_prev); + const BLOCK_SIZE bsize = + get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x, + plane_ptr->subsampling_y); + assert(bsize < BLOCK_SIZES_ALL); + const int prediction_masks = edge_dir == VERT_EDGE + ? block_size_wide[bsize] - 1 + : block_size_high[bsize] - 1; + const int32_t pu_edge = !(coord & prediction_masks); + // if both the current and the previous blocks are skipped, + // deblock the edge only if it is also a PU edge.
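// For orientation, the branch below reduces to this tap-length table (a
// restatement of the code that follows, not a change to it):
//   min(ts, pv_ts) == TX_4X4:          filter_length = 4
//   min(ts, pv_ts) == TX_8X8, luma:    filter_length = 8
//   min(ts, pv_ts) == TX_8X8, chroma:  filter_length = 6
//   anything larger, luma:             filter_length = 14
//   anything larger, chroma:           filter_length = 6 (wide filtering
//                                      is luma-only)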
+ if ((curr_level || pv_lvl) && + (!pv_skip || !curr_skipped || pu_edge)) { + const TX_SIZE min_ts = AOMMIN(ts, pv_ts); + if (TX_4X4 >= min_ts) { + params->filter_length = 4; + } else if (TX_8X8 == min_ts) { + if (plane != 0) + params->filter_length = 6; + else + params->filter_length = 8; + } else { + params->filter_length = 14; + // No wide filtering for chroma plane + if (plane != 0) { + params->filter_length = 6; + } + } + + // update the level if the current block is skipped, + // but the previous one is not + level = (curr_level) ? (curr_level) : (pv_lvl); + } + } + } + // prepare common parameters + if (params->filter_length) { + const loop_filter_thresh *const limits = cm->lf_info.lfthr + level; + params->lim = limits->lim; + params->mblim = limits->mblim; + params->hev_thr = limits->hev_thr; + } + } + } + + return ts; +} + +void av1_filter_block_plane_vert(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int y_range = (MAX_MIB_SIZE >> scale_vert); + const int x_range = (MAX_MIB_SIZE >> scale_horz); + for (int y = 0; y < y_range; y++) { + uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; + for (int x = 0; x < x_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. + // If 4x4 transform is used, it will then filter the internal edge + // aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(&params, 0, sizeof(params)); + + tx_size = + set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd, + VERT_EDGE, curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = cm->seq_params.use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + if (use_highbitdepth) + aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, params.hev_thr, + bit_depth); + else + aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + assert(plane != 0); + if (use_highbitdepth) + aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, params.hev_thr, + bit_depth); + else + aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + if (use_highbitdepth) + aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, params.hev_thr, + bit_depth); + else + aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + if (use_highbitdepth) + aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, params.hev_thr, + bit_depth); + else + aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break;
+ // no filtering + default: break; + } +#else + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + case 6: // apply 6-tap filter for chroma plane only + assert(plane != 0); + aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // no filtering + default: break; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + // advance the destination pointer + advance_units = tx_size_wide_unit[tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; + } + } +} + +void av1_filter_block_plane_horz(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int y_range = (MAX_MIB_SIZE >> scale_vert); + const int x_range = (MAX_MIB_SIZE >> scale_horz); + for (int x = 0; x < x_range; x++) { + uint8_t *p = dst_ptr + x * MI_SIZE; + for (int y = 0; y < y_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will first filter the vertical edge aligned with a 8x8 + // block. If 4x4 transform is used, it will then filter the internal + // edge aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(&params, 0, sizeof(params)); + + tx_size = set_lpf_parameters( + &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, + curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + +#if CONFIG_AV1_HIGHBITDEPTH + const int use_highbitdepth = cm->seq_params.use_highbitdepth; + const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth; + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + if (use_highbitdepth) + aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, + params.hev_thr, bit_depth); + else + aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 6-tap filtering + case 6: + assert(plane != 0); + if (use_highbitdepth) + aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, + params.hev_thr, bit_depth); + else + aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + if (use_highbitdepth) + aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, + params.hev_thr, bit_depth); + else + aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + if (use_highbitdepth) + aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride, + params.mblim, params.lim, + params.hev_thr, bit_depth); + else + aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim, +
params.hev_thr); + break; + // no filtering + default: break; + } +#else + switch (params.filter_length) { + // apply 4-tap filtering + case 4: + aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 6-tap filtering + case 6: + assert(plane != 0); + aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 8-tap filtering + case 8: + aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // apply 14-tap filtering + case 14: + aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim, + params.hev_thr); + break; + // no filtering + default: break; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + + // advance the destination pointer + advance_units = tx_size_high_unit[tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; + } + } +} + +void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int y_range = cm->mi_params.mi_rows >> scale_vert; + const int x_range = cm->mi_params.mi_cols >> scale_horz; + for (int y = 0; y < y_range; y++) { + uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride; + for (int x = 0; x < x_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. + // If 4x4 transform is used, it will then filter the internal edge + // aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(&params, 0, sizeof(params)); + + tx_size = + set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd, + VERT_EDGE, curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + // advance the destination pointer + advance_units = tx_size_wide_unit[tx_size]; + x += advance_units; + p += advance_units * MI_SIZE; + } + } +} + +void av1_filter_block_plane_horz_test(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, + const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, + const uint32_t mi_col) { + const uint32_t scale_horz = plane_ptr->subsampling_x; + const uint32_t scale_vert = plane_ptr->subsampling_y; + uint8_t *const dst_ptr = plane_ptr->dst.buf; + const int dst_stride = plane_ptr->dst.stride; + const int y_range = cm->mi_params.mi_rows >> scale_vert; + const int x_range = cm->mi_params.mi_cols >> scale_horz; + for (int x = 0; x < x_range; x++) { + uint8_t *p = dst_ptr + x * MI_SIZE; + for (int y = 0; y < y_range;) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will first filter the vertical edge aligned with a 8x8 + // block.
If 4x4 transform is used, it will then filter the internal + // edge aligned with a 4x4 block + const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE; + const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE; + uint32_t advance_units; + TX_SIZE tx_size; + AV1_DEBLOCKING_PARAMETERS params; + memset(&params, 0, sizeof(params)); + + tx_size = set_lpf_parameters( + &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE, + curr_x, curr_y, plane, plane_ptr); + if (tx_size == TX_INVALID) { + params.filter_length = 0; + tx_size = TX_4X4; + } + + // advance the destination pointer + advance_units = tx_size_high_unit[tx_size]; + y += advance_units; + p += advance_units * dst_stride * MI_SIZE; + } + } +} + +static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, + MACROBLOCKD *xd, int start, int stop, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + int plane_start, int plane_end) { + struct macroblockd_plane *pd = xd->plane; + const int col_start = 0; + const int col_end = cm->mi_params.mi_cols; + int mi_row, mi_col; + int plane; + +#if CONFIG_LPF_MASK + if (is_decoding) { + cm->is_decoding = is_decoding; + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; + + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0, + plane, plane + 1); + + av1_build_bitmask_vert_info(cm, &pd[plane], plane); + av1_build_bitmask_horz_info(cm, &pd[plane], plane); + + // apply loop filtering which only goes through buffer once + for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) { + for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) { + av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, mi_col, + plane, plane + 1); + av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row, + mi_col); + if (mi_col - MI_SIZE_64X64 >= 0) { + av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, + mi_col - MI_SIZE_64X64, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row, + mi_col - MI_SIZE_64X64); + } + } + av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, + mi_col - MI_SIZE_64X64, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row, + mi_col - MI_SIZE_64X64); + } + } + return; + } +#endif + + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; + + if (cm->lf.combine_vert_horz_lf) { + // filter all vertical and horizontal edges in every 128x128 super block + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { + // filter vertical edges + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, + mi_col); + // filter horizontal edges + if (mi_col - MAX_MIB_SIZE >= 0) { + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, + mi_row, mi_col - MAX_MIB_SIZE, plane, + plane + 1); + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col - MAX_MIB_SIZE); + } + } + // filter horizontal edges +
av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col - MAX_MIB_SIZE, plane, plane + 1); + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col - MAX_MIB_SIZE); + } + } else { + // filter all vertical edges in every 128x128 super block + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } + } + + // filter all horizontal edges in every 128x128 super block + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) { + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, + mi_col); + } + } + } + } +} + +void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + int plane_start, int plane_end, int partial_frame) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + + start_mi_row = 0; + mi_rows_to_filter = cm->mi_params.mi_rows; + if (partial_frame && cm->mi_params.mi_rows > 8) { + start_mi_row = cm->mi_params.mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + av1_loop_filter_frame_init(cm, plane_start, plane_end); + loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, +#if CONFIG_LPF_MASK + is_decoding, +#endif + plane_start, plane_end); +} diff --git a/libs/libaom/src/av1/common/av1_loopfilter.h b/libs/libaom/src/av1/common/av1_loopfilter.h new file mode 100644 index 000000000..ce26d1647 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_loopfilter.h @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_ +#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_ + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "av1/common/seg_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_LOOP_FILTER 63 +#define MAX_SHARPNESS 7 + +#define SIMD_WIDTH 16 + +enum lf_path { + LF_PATH_420, + LF_PATH_444, + LF_PATH_SLOW, +}; + +enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR); +typedef struct { + uint64_t bits[4]; +} FilterMask; + +#if CONFIG_LPF_MASK +// This structure holds bit masks for all 4x4 blocks in a 64x64 region. +// Each 1 bit represents a position in which we want to apply the loop filter. +// For Y plane, 4x4 in 64x64 requires 16x16 = 256 bit, therefore we use 4 +// uint64_t; For U, V plane, for 420 format, plane size is 32x32, thus we use +// a uint64_t to represent bitmask. +// Left_ entries refer to whether we apply a filter on the border to the +// left of the block. 
Above_ entries refer to whether or not to apply a +// filter on the above border. +// Since each transform is accompanied by a potentially different type of +// loop filter there is a different entry in the array for each transform size. +typedef struct { + FilterMask left_y[TX_SIZES]; + FilterMask above_y[TX_SIZES]; + FilterMask left_u[TX_SIZES]; + FilterMask above_u[TX_SIZES]; + FilterMask left_v[TX_SIZES]; + FilterMask above_v[TX_SIZES]; + + // Y plane vertical edge and horizontal edge filter level + uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + + // U plane filter level + uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64]; + + // V plane filter level + uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64]; + uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64]; + + // other info + FilterMask skip; + FilterMask is_vert_border; + FilterMask is_horz_border; + // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64 + FilterMask tx_size_ver[2][5]; + FilterMask tx_size_hor[2][5]; +} LoopFilterMask; +#endif // CONFIG_LPF_MASK + +struct loopfilter { + int filter_level[2]; + int filter_level_u; + int filter_level_v; + + int sharpness_level; + + uint8_t mode_ref_delta_enabled; + uint8_t mode_ref_delta_update; + + // 0 = Intra, Last, Last2+Last3, + // GF, BRF, ARF2, ARF + int8_t ref_deltas[REF_FRAMES]; + + // 0 = ZERO_MV, MV + int8_t mode_deltas[MAX_MODE_LF_DELTAS]; + + int combine_vert_horz_lf; + +#if CONFIG_LPF_MASK + LoopFilterMask *lfm; + size_t lfm_num; + int lfm_stride; +#endif // CONFIG_LPF_MASK +}; + +// Need to align this structure so when it is declared and +// passed it can be loaded into vector registers. +typedef struct { + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]); +} loop_filter_thresh; + +typedef struct { + loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1]; + uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][REF_FRAMES][MAX_MODE_LF_DELTAS]; +} loop_filter_info_n; + +/* assorted loopfilter functions which get used elsewhere */ +struct AV1Common; +struct macroblockd; +struct AV1LfSyncData; + +void av1_loop_filter_init(struct AV1Common *cm); + +void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start, + int plane_end); + +#if CONFIG_LPF_MASK +void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, + struct macroblockd *xd, int is_decoding, + int plane_start, int plane_end, int partial_frame); +#else +void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, + struct macroblockd *xd, int plane_start, + int plane_end, int partial_frame); +#endif + +void av1_filter_block_plane_vert(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col); + +void av1_filter_block_plane_horz(const struct AV1Common *const cm, + const MACROBLOCKD *const xd, const int plane, + const MACROBLOCKD_PLANE *const plane_ptr, + const uint32_t mi_row, const uint32_t mi_col); + +typedef struct LoopFilterWorkerData { + YV12_BUFFER_CONFIG *frame_buffer; + struct AV1Common *cm; + struct macroblockd_plane planes[MAX_MB_PLANE]; + // TODO(Ranjit): When the filter functions are modified to use xd->lossless + // add lossless as a member here. 
+ MACROBLOCKD *xd; +} LFWorkerData; + +uint8_t av1_get_filter_level(const struct AV1Common *cm, + const loop_filter_info_n *lfi_n, const int dir_idx, + int plane, const MB_MODE_INFO *mbmi); +#if CONFIG_LPF_MASK +void av1_filter_block_plane_ver(struct AV1Common *const cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); + +void av1_filter_block_plane_hor(struct AV1Common *const cm, + struct macroblockd_plane *const plane, int pl, + int mi_row, int mi_col); + +int get_index_shift(int mi_col, int mi_row, int *index); + +void av1_build_bitmask_vert_info( + struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr, + int plane); + +void av1_build_bitmask_horz_info( + struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr, + int plane); + +void av1_filter_block_plane_bitmask_vert( + struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); + +void av1_filter_block_plane_bitmask_horz( + struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col); + +void av1_store_bitmask_univariant_tx(struct AV1Common *cm, int mi_row, + int mi_col, BLOCK_SIZE bsize, + MB_MODE_INFO *mbmi); + +void av1_store_bitmask_other_info(struct AV1Common *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + int is_horz_coding_block_border, + int is_vert_coding_block_border); + +void av1_store_bitmask_vartx(struct AV1Common *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, + MB_MODE_INFO *mbmi); +#endif // CONFIG_LPF_MASK + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_AV1_LOOPFILTER_H_ diff --git a/libs/libaom/src/av1/common/av1_rtcd.c b/libs/libaom/src/av1/common/av1_rtcd.c new file mode 100644 index 000000000..a77a4d254 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_rtcd.c @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "config/aom_config.h" + +#define RTCD_C +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_once.h" + +void av1_rtcd() { + // TODO(JBB): Remove this aom_once, by ensuring that both the encoder and + // decoder setup functions are protected by aom_once(); + aom_once(setup_rtcd_internal); +} diff --git a/libs/libaom/src/av1/common/av1_rtcd_defs.pl b/libs/libaom/src/av1/common/av1_rtcd_defs.pl new file mode 100644 index 000000000..296c6c572 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_rtcd_defs.pl @@ -0,0 +1,496 @@ +## +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+## +sub av1_common_forward_decls() { +print <<EOF [...] +void av1_round_shift_array_c(int32_t *arr, int size, int bit) { + int i; + if (bit == 0) { + return; + } else { + if (bit > 0) { + for (i = 0; i < size; i++) { + arr[i] = round_shift(arr[i], bit); + } + } else { + for (i = 0; i < size; i++) { + arr[i] = (int32_t)clamp64(((int64_t)1 << (-bit)) * arr[i], INT32_MIN, + INT32_MAX); + } + } + } +} + +const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D] = { + { TXFM_TYPE_DCT4, TXFM_TYPE_ADST4, TXFM_TYPE_ADST4, TXFM_TYPE_IDENTITY4 }, + { TXFM_TYPE_DCT8, TXFM_TYPE_ADST8, TXFM_TYPE_ADST8, TXFM_TYPE_IDENTITY8 }, + { TXFM_TYPE_DCT16, TXFM_TYPE_ADST16, TXFM_TYPE_ADST16, TXFM_TYPE_IDENTITY16 }, + { TXFM_TYPE_DCT32, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, + TXFM_TYPE_IDENTITY32 }, + { TXFM_TYPE_DCT64, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID } +}; + +const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = { + 4, // TXFM_TYPE_DCT4 + 6, // TXFM_TYPE_DCT8 + 8, // TXFM_TYPE_DCT16 + 10, // TXFM_TYPE_DCT32 + 12, // TXFM_TYPE_DCT64 + 7, // TXFM_TYPE_ADST4 + 8, // TXFM_TYPE_ADST8 + 10, // TXFM_TYPE_ADST16 + 1, // TXFM_TYPE_IDENTITY4 + 1, // TXFM_TYPE_IDENTITY8 + 1, // TXFM_TYPE_IDENTITY16 + 1, // TXFM_TYPE_IDENTITY32 +}; + +void av1_range_check_buf(int32_t stage, const int32_t *input, + const int32_t *buf, int32_t size, int8_t bit) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + + int in_range = 1; + + for (int i = 0; i < size; ++i) { + if (buf[i] < min_value || buf[i] > max_value) { + in_range = 0; + } + } + + if (!in_range) { + fprintf(stderr, "Error: coeffs contain out-of-range values\n"); + fprintf(stderr, "size: %d\n", size); + fprintf(stderr, "stage: %d\n", stage); + fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value, + max_value); + + fprintf(stderr, "coeffs: "); + + fprintf(stderr, "["); + for (int j = 0; j < size; j++) { + if (j > 0) fprintf(stderr, ", "); + fprintf(stderr, "%d", input[j]); + } + fprintf(stderr, "]\n"); + + fprintf(stderr, " buf: "); + + fprintf(stderr, "["); + for (int j = 0; j < size; j++) { + if (j > 0) fprintf(stderr, ", "); + fprintf(stderr, "%d", buf[j]); + } + fprintf(stderr, "]\n\n"); + } + + assert(in_range); +#else + (void)stage; + (void)input; + (void)buf; + (void)size; + (void)bit; +#endif +} diff --git a/libs/libaom/src/av1/common/av1_txfm.h b/libs/libaom/src/av1/common/av1_txfm.h new file mode 100644 index 000000000..20049b680 --- /dev/null +++ b/libs/libaom/src/av1/common/av1_txfm.h @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_COMMON_AV1_TXFM_H_ +#define AOM_AV1_COMMON_AV1_TXFM_H_ + +#include <assert.h> +#include <math.h> +#include <stdio.h> + +#include "config/aom_config.h" + +#include "av1/common/enums.h" +#include "av1/common/blockd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if !defined(DO_RANGE_CHECK_CLAMP) +#define DO_RANGE_CHECK_CLAMP 0 +#endif + +extern const int32_t av1_cospi_arr_data[7][64]; +extern const int32_t av1_sinpi_arr_data[7][5]; + +#define MAX_TXFM_STAGE_NUM 12 + +static const int cos_bit_min = 10; +static const int cos_bit_max = 16; + +#define NewSqrt2Bits ((int32_t)12) +// 2^12 * sqrt(2) +static const int32_t NewSqrt2 = 5793; +// 2^12 / sqrt(2) +static const int32_t NewInvSqrt2 = 2896; + +static INLINE const int32_t *cospi_arr(int n) { + return av1_cospi_arr_data[n - cos_bit_min]; +} + +static INLINE const int32_t *sinpi_arr(int n) { + return av1_sinpi_arr_data[n - cos_bit_min]; +} + +static INLINE int32_t range_check_value(int32_t value, int8_t bit) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + const int64_t max_value = (1LL << (bit - 1)) - 1; + const int64_t min_value = -(1LL << (bit - 1)); + if (value < min_value || value > max_value) { + fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit); +#if !CONFIG_AV1_ENCODER + assert(0); +#endif + } +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING +#if DO_RANGE_CHECK_CLAMP + bit = AOMMIN(bit, 31); + return clamp(value, -(1 << (bit - 1)), (1 << (bit - 1)) - 1); +#endif // DO_RANGE_CHECK_CLAMP + (void)bit; + return value; +} + +static INLINE int32_t round_shift(int64_t value, int bit) { + assert(bit >= 1); + return (int32_t)((value + (1ll << (bit - 1))) >> bit); +} + +static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1, + int bit) { + int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1); + int64_t intermediate = result_64 + (1LL << (bit - 1)); + // NOTE(david.barker): The value 'result_64' may not necessarily fit + // into 32 bits. However, the result of this function is nominally + // ROUND_POWER_OF_TWO_64(result_64, bit) + // and that is required to fit into stage_range[stage] many bits + // (checked by range_check_buf()). + // + // Here we've unpacked that rounding operation, and it can be shown + // that the value of 'intermediate' here *does* fit into 32 bits + // for any conformant bitstream. + // The upshot is that, if you do all this calculation using + // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic, + // then you'll still get the correct result. + // To provide a check on this logic, we assert that 'intermediate' + // would fit into an int32 if range checking is enabled.
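// Worked example (illustrative, not part of the patch): with 12-bit
// weights, e.g. w0 == w1 == 2896 ~= 2^12 / sqrt(2),
//   half_btf(2896, 100, 2896, 50, 12)
//     == (2896 * 100 + 2896 * 50 + (1 << 11)) >> 12
//     == (434400 + 2048) >> 12 == 106,
// comfortably inside int32 range, as the assertion below checks in
// debug builds.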
+#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX); +#endif + return (int32_t)(intermediate >> bit); +} + +static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, + int bd) { + return clip_pixel_highbd(dest + (int)trans, bd); +} + +typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); + +typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd); + +enum { + TXFM_TYPE_DCT4, + TXFM_TYPE_DCT8, + TXFM_TYPE_DCT16, + TXFM_TYPE_DCT32, + TXFM_TYPE_DCT64, + TXFM_TYPE_ADST4, + TXFM_TYPE_ADST8, + TXFM_TYPE_ADST16, + TXFM_TYPE_IDENTITY4, + TXFM_TYPE_IDENTITY8, + TXFM_TYPE_IDENTITY16, + TXFM_TYPE_IDENTITY32, + TXFM_TYPES, + TXFM_TYPE_INVALID, +} UENUM1BYTE(TXFM_TYPE); + +typedef struct TXFM_2D_FLIP_CFG { + TX_SIZE tx_size; + int ud_flip; // flip upside down + int lr_flip; // flip left to right + const int8_t *shift; + int8_t cos_bit_col; + int8_t cos_bit_row; + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + TXFM_TYPE txfm_type_col; + TXFM_TYPE txfm_type_row; + int stage_num_col; + int stage_num_row; +} TXFM_2D_FLIP_CFG; + +static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + *ud_flip = 0; + *lr_flip = 0; + break; + case IDTX: + case V_DCT: + case H_DCT: + case V_ADST: + case H_ADST: + *ud_flip = 0; + *lr_flip = 0; + break; + case FLIPADST_DCT: + case FLIPADST_ADST: + case V_FLIPADST: + *ud_flip = 1; + *lr_flip = 0; + break; + case DCT_FLIPADST: + case ADST_FLIPADST: + case H_FLIPADST: + *ud_flip = 0; + *lr_flip = 1; + break; + case FLIPADST_FLIPADST: + *ud_flip = 1; + *lr_flip = 1; + break; + default: + *ud_flip = 0; + *lr_flip = 0; + assert(0); + } +} + +static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) { + get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip); +} + +// Utility function that returns the log of the ratio of the col and row +// sizes. 
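// Hedged usage note (the values follow from the definition just below):
//   get_rect_tx_log_ratio(32, 8)  == 2    // log2(32 / 8)
//   get_rect_tx_log_ratio(16, 32) == -1   // log2(16 / 32)
// inv_txfm2d_add_c() above applies the NewInvSqrt2 rescale to its row
// input only when the absolute value of this ratio is 1.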
+static INLINE int get_rect_tx_log_ratio(int col, int row) { + if (col == row) return 0; + if (col > row) { + if (col == row * 2) return 1; + if (col == row * 4) return 2; + assert(0 && "Unsupported transform size"); + } else { + if (row == col * 2) return -1; + if (row == col * 4) return -2; + assert(0 && "Unsupported transform size"); + } + return 0; // Invalid +} + +void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, int bd); + +void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size, + int bd); + +void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg); +void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg); +extern const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D]; +extern const int8_t av1_txfm_stage_num_list[TXFM_TYPES]; +static INLINE int get_txw_idx(TX_SIZE tx_size) { + return tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]; +} +static INLINE int get_txh_idx(TX_SIZE tx_size) { + return tx_size_high_log2[tx_size] - tx_size_high_log2[0]; +} + +void av1_range_check_buf(int32_t stage, const int32_t *input, + const int32_t *buf, int32_t size, int8_t bit); +#define MAX_TXWH_IDX 5 +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // AOM_AV1_COMMON_AV1_TXFM_H_ diff --git a/libs/libaom/src/av1/common/blockd.c b/libs/libaom/src/av1/common/blockd.c new file mode 100644 index 000000000..00725ea2d --- /dev/null +++ b/libs/libaom/src/av1/common/blockd.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <math.h> + +#include "aom_ports/system_state.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) { + if (!left_mi) return DC_PRED; + assert(!is_inter_block(left_mi) || is_intrabc_block(left_mi)); + return left_mi->mode; +} + +PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) { + if (!above_mi) return DC_PRED; + assert(!is_inter_block(above_mi) || is_intrabc_block(above_mi)); + return above_mi->mode; +} + +void av1_set_entropy_contexts(const MACROBLOCKD *xd, + struct macroblockd_plane *pd, int plane, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + int has_eob, int aoff, int loff) { + ENTROPY_CONTEXT *const a = pd->above_entropy_context + aoff; + ENTROPY_CONTEXT *const l = pd->left_entropy_context + loff; + const int txs_wide = tx_size_wide_unit[tx_size]; + const int txs_high = tx_size_high_unit[tx_size]; + + // above + if (has_eob && xd->mb_to_right_edge < 0) { + const int blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int above_contexts = AOMMIN(txs_wide, blocks_wide - aoff); + memset(a, has_eob, sizeof(*a) * above_contexts); + memset(a + above_contexts, 0, sizeof(*a) * (txs_wide - above_contexts)); + } else { + memset(a, has_eob, sizeof(*a) * txs_wide); + } + + // left + if (has_eob && xd->mb_to_bottom_edge < 0) { + const int blocks_high = max_block_high(xd, plane_bsize, plane); + const int left_contexts = AOMMIN(txs_high, blocks_high - loff); + memset(l, has_eob, sizeof(*l) * left_contexts); + memset(l + left_contexts, 0, sizeof(*l) * (txs_high - left_contexts)); + } else { + memset(l, has_eob, sizeof(*l) * txs_high); + } +} +void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize, + const int num_planes) { + assert(bsize < BLOCK_SIZES_ALL); + const int nplanes = 1 + (num_planes - 1) * xd->is_chroma_ref; + for (int i = 0; i < nplanes; i++) { + struct macroblockd_plane *const pd = &xd->plane[i]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int txs_wide = mi_size_wide[plane_bsize]; + const int txs_high = mi_size_high[plane_bsize]; + memset(pd->above_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_wide); + memset(pd->left_entropy_context, 0, sizeof(ENTROPY_CONTEXT) * txs_high); + } +} + +void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes) { + xd->delta_lf_from_base = 0; + const int frame_lf_count = + num_planes > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) xd->delta_lf[lf_id] = 0; +} + +void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes) { + for (int p = 0; p < num_planes; ++p) { + set_default_wiener(xd->wiener_info + p); + set_default_sgrproj(xd->sgrproj_info + p); + } +} + +void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, + const int num_planes) { + int i; + + for (i = 0; i < num_planes; i++) { + xd->plane[i].plane_type = get_plane_type(i); + xd->plane[i].subsampling_x = i ? ss_x : 0; + xd->plane[i].subsampling_y = i ? ss_y : 0; + } + for (i = num_planes; i < MAX_MB_PLANE; i++) { + xd->plane[i].subsampling_x = 1; + xd->plane[i].subsampling_y = 1; + } +} diff --git a/libs/libaom/src/av1/common/blockd.h b/libs/libaom/src/av1/common/blockd.h new file mode 100644 index 000000000..47597bc83 --- /dev/null +++ b/libs/libaom/src/av1/common/blockd.h @@ -0,0 +1,1296 @@ +/* + * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_BLOCKD_H_ +#define AOM_AV1_COMMON_BLOCKD_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" + +#include "av1/common/common_data.h" +#include "av1/common/quant_common.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/mv.h" +#include "av1/common/scale.h" +#include "av1/common/seg_common.h" +#include "av1/common/tile_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define USE_B_QUANT_NO_TRELLIS 1 + +#define MAX_MB_PLANE 3 + +#define MAX_DIFFWTD_MASK_BITS 1 + +#define INTERINTRA_WEDGE_SIGN 0 + +// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS +enum { + DIFFWTD_38 = 0, + DIFFWTD_38_INV, + DIFFWTD_MASK_TYPES, +} UENUM1BYTE(DIFFWTD_MASK_TYPE); + +enum { + KEY_FRAME = 0, + INTER_FRAME = 1, + INTRA_ONLY_FRAME = 2, // replaces intra-only + S_FRAME = 3, + FRAME_TYPES, +} UENUM1BYTE(FRAME_TYPE); + +static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) { + return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; +} + +static INLINE int is_inter_mode(PREDICTION_MODE mode) { + return mode >= INTER_MODE_START && mode < INTER_MODE_END; +} + +typedef struct { + uint8_t *plane[MAX_MB_PLANE]; + int stride[MAX_MB_PLANE]; +} BUFFER_SET; + +static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) { + return mode >= SINGLE_INTER_MODE_START && mode < SINGLE_INTER_MODE_END; +} +static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) { + return mode >= COMP_INTER_MODE_START && mode < COMP_INTER_MODE_END; +} + +static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) { + static const PREDICTION_MODE lut[] = { + DC_PRED, // DC_PRED + V_PRED, // V_PRED + H_PRED, // H_PRED + D45_PRED, // D45_PRED + D135_PRED, // D135_PRED + D113_PRED, // D113_PRED + D157_PRED, // D157_PRED + D203_PRED, // D203_PRED + D67_PRED, // D67_PRED + SMOOTH_PRED, // SMOOTH_PRED + SMOOTH_V_PRED, // SMOOTH_V_PRED + SMOOTH_H_PRED, // SMOOTH_H_PRED + PAETH_PRED, // PAETH_PRED + NEARESTMV, // NEARESTMV + NEARMV, // NEARMV + GLOBALMV, // GLOBALMV + NEWMV, // NEWMV + NEARESTMV, // NEAREST_NEARESTMV + NEARMV, // NEAR_NEARMV + NEARESTMV, // NEAREST_NEWMV + NEWMV, // NEW_NEARESTMV + NEARMV, // NEAR_NEWMV + NEWMV, // NEW_NEARMV + GLOBALMV, // GLOBAL_GLOBALMV + NEWMV, // NEW_NEWMV + }; + assert(NELEMENTS(lut) == MB_MODE_COUNT); + assert(is_inter_compound_mode(mode) || is_inter_singleref_mode(mode)); + return lut[mode]; +} + +static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) { + static const PREDICTION_MODE lut[] = { + MB_MODE_COUNT, // DC_PRED + MB_MODE_COUNT, // V_PRED + MB_MODE_COUNT, // H_PRED + MB_MODE_COUNT, // D45_PRED + MB_MODE_COUNT, // D135_PRED + MB_MODE_COUNT, // D113_PRED + MB_MODE_COUNT, // D157_PRED + MB_MODE_COUNT, // D203_PRED + MB_MODE_COUNT, // D67_PRED + MB_MODE_COUNT, // SMOOTH_PRED + MB_MODE_COUNT, // SMOOTH_V_PRED + MB_MODE_COUNT, // SMOOTH_H_PRED + MB_MODE_COUNT, // PAETH_PRED + MB_MODE_COUNT, // NEARESTMV + 
MB_MODE_COUNT, // NEARMV + MB_MODE_COUNT, // GLOBALMV + MB_MODE_COUNT, // NEWMV + NEARESTMV, // NEAREST_NEARESTMV + NEARMV, // NEAR_NEARMV + NEWMV, // NEAREST_NEWMV + NEARESTMV, // NEW_NEARESTMV + NEWMV, // NEAR_NEWMV + NEARMV, // NEW_NEARMV + GLOBALMV, // GLOBAL_GLOBALMV + NEWMV, // NEW_NEWMV + }; + assert(NELEMENTS(lut) == MB_MODE_COUNT); + assert(is_inter_compound_mode(mode)); + return lut[mode]; +} + +static INLINE int have_nearmv_in_inter_mode(PREDICTION_MODE mode) { + return (mode == NEARMV || mode == NEAR_NEARMV || mode == NEAR_NEWMV || + mode == NEW_NEARMV); +} + +static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) { + return (mode == NEWMV || mode == NEW_NEWMV || mode == NEAREST_NEWMV || + mode == NEW_NEARESTMV || mode == NEAR_NEWMV || mode == NEW_NEARMV); +} + +static INLINE int is_masked_compound_type(COMPOUND_TYPE type) { + return (type == COMPOUND_WEDGE || type == COMPOUND_DIFFWTD); +} + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. */ + +typedef struct { + // Value of base colors for Y, U, and V + uint16_t palette_colors[3 * PALETTE_MAX_SIZE]; + // Number of base colors for Y (0) and UV (1) + uint8_t palette_size[2]; +} PALETTE_MODE_INFO; + +typedef struct { + FILTER_INTRA_MODE filter_intra_mode; + uint8_t use_filter_intra; +} FILTER_INTRA_MODE_INFO; + +static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = { + DC_PRED, V_PRED, H_PRED, D157_PRED, DC_PRED +}; + +#if CONFIG_RD_DEBUG +#define TXB_COEFF_COST_MAP_SIZE (MAX_MIB_SIZE) +#endif + +typedef struct RD_STATS { + int rate; + int64_t dist; + // Please be careful of using rdcost, it's not guaranteed to be set all the + // time. + // TODO(angiebird): Create a set of functions to manipulate the RD_STATS. In + // these functions, make sure rdcost is always up-to-date according to + // rate/dist. + int64_t rdcost; + int64_t sse; + int skip; // sse should equal to dist when skip == 1 + int zero_rate; +#if CONFIG_RD_DEBUG + int txb_coeff_cost[MAX_MB_PLANE]; + // TODO(jingning): Temporary solution to silence stack over-size warning + // in handle_inter_mode. This should be fixed after rate-distortion + // optimization refactoring. + int16_t txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE] + [TXB_COEFF_COST_MAP_SIZE]; +#endif // CONFIG_RD_DEBUG +} RD_STATS; + +// This struct is used to group function args that are commonly +// sent together in functions related to interinter compound modes +typedef struct { + uint8_t *seg_mask; + int8_t wedge_index; + int8_t wedge_sign; + DIFFWTD_MASK_TYPE mask_type; + COMPOUND_TYPE type; +} INTERINTER_COMPOUND_DATA; + +#define INTER_TX_SIZE_BUF_LEN 16 +#define TXK_TYPE_BUF_LEN 64 +// This structure now relates to 4x4 block regions. 
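The two LUTs above decompose every compound inter mode into its per-reference component modes. A hedged usage sketch (the helper name is illustrative, not part of the patch):

// e.g. NEAR_NEWMV splits into NEARMV for ref_frame[0] and NEWMV for
// ref_frame[1].
static INLINE void split_compound_mode(PREDICTION_MODE mode,
                                       PREDICTION_MODE *mode0,
                                       PREDICTION_MODE *mode1) {
  *mode0 = compound_ref0_mode(mode);
  *mode1 = compound_ref1_mode(mode);
}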
+typedef struct MB_MODE_INFO { + // interinter members + INTERINTER_COMPOUND_DATA interinter_comp; + WarpedMotionParams wm_params; + int_mv mv[2]; + int current_qindex; + // Only for INTER blocks + int_interpfilters interp_filters; + // TODO(debargha): Consolidate these flags +#if CONFIG_RD_DEBUG + RD_STATS rd_stats; + int mi_row; + int mi_col; +#endif +#if CONFIG_INSPECTION + int16_t tx_skip[TXK_TYPE_BUF_LEN]; +#endif + PALETTE_MODE_INFO palette_mode_info; + // Common for both INTER and INTRA blocks + BLOCK_SIZE sb_type; + PREDICTION_MODE mode; + // Only for INTRA blocks + UV_PREDICTION_MODE uv_mode; + // interintra members + INTERINTRA_MODE interintra_mode; + MOTION_MODE motion_mode; + PARTITION_TYPE partition; + MV_REFERENCE_FRAME ref_frame[2]; + FILTER_INTRA_MODE_INFO filter_intra_mode_info; + int8_t skip; + uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN]; + TX_SIZE tx_size; + int8_t delta_lf_from_base; + int8_t delta_lf[FRAME_LF_COUNT]; + int8_t interintra_wedge_index; + // The actual prediction angle is the base angle + (angle_delta * step). + int8_t angle_delta[PLANE_TYPES]; + /* deringing gain *per-superblock* */ + // Joint sign of alpha Cb and alpha Cr + int8_t cfl_alpha_signs; + // Index of the alpha Cb and alpha Cr combination + uint8_t cfl_alpha_idx; + uint8_t num_proj_ref; + uint8_t overlappable_neighbors[2]; + // If comp_group_idx=0, indicate if dist_wtd_comp(0) or avg_comp(1) is used. + uint8_t compound_idx; + uint8_t use_wedge_interintra : 1; + uint8_t segment_id : 3; + uint8_t seg_id_predicted : 1; // valid only when temporal_update is enabled + uint8_t skip_mode : 1; + uint8_t use_intrabc : 1; + uint8_t ref_mv_idx : 2; + // Indicate if masked compound is used(1) or not(0). + uint8_t comp_group_idx : 1; + int8_t cdef_strength : 4; +} MB_MODE_INFO; + +static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) { + return mbmi->use_intrabc; +} + +static INLINE PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) { + assert(mode < UV_INTRA_MODES); + static const PREDICTION_MODE uv2y[] = { + DC_PRED, // UV_DC_PRED + V_PRED, // UV_V_PRED + H_PRED, // UV_H_PRED + D45_PRED, // UV_D45_PRED + D135_PRED, // UV_D135_PRED + D113_PRED, // UV_D113_PRED + D157_PRED, // UV_D157_PRED + D203_PRED, // UV_D203_PRED + D67_PRED, // UV_D67_PRED + SMOOTH_PRED, // UV_SMOOTH_PRED + SMOOTH_V_PRED, // UV_SMOOTH_V_PRED + SMOOTH_H_PRED, // UV_SMOOTH_H_PRED + PAETH_PRED, // UV_PAETH_PRED + DC_PRED, // UV_CFL_PRED + INTRA_INVALID, // UV_INTRA_MODES + INTRA_INVALID, // UV_MODE_INVALID + }; + return uv2y[mode]; +} + +static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) { + return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME; +} + +static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) { + return mbmi->ref_frame[1] > INTRA_FRAME; +} + +static INLINE int has_uni_comp_refs(const MB_MODE_INFO *mbmi) { + return has_second_ref(mbmi) && (!((mbmi->ref_frame[0] >= BWDREF_FRAME) ^ + (mbmi->ref_frame[1] >= BWDREF_FRAME))); +} + +static INLINE MV_REFERENCE_FRAME comp_ref0(int ref_idx) { + static const MV_REFERENCE_FRAME lut[] = { + LAST_FRAME, // LAST_LAST2_FRAMES, + LAST_FRAME, // LAST_LAST3_FRAMES, + LAST_FRAME, // LAST_GOLDEN_FRAMES, + BWDREF_FRAME, // BWDREF_ALTREF_FRAMES, + LAST2_FRAME, // LAST2_LAST3_FRAMES + LAST2_FRAME, // LAST2_GOLDEN_FRAMES, + LAST3_FRAME, // LAST3_GOLDEN_FRAMES, + BWDREF_FRAME, // BWDREF_ALTREF2_FRAMES, + ALTREF2_FRAME, // ALTREF2_ALTREF_FRAMES, + }; + assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS); + return lut[ref_idx]; +} + +static INLINE MV_REFERENCE_FRAME 
comp_ref1(int ref_idx) { + static const MV_REFERENCE_FRAME lut[] = { + LAST2_FRAME, // LAST_LAST2_FRAMES, + LAST3_FRAME, // LAST_LAST3_FRAMES, + GOLDEN_FRAME, // LAST_GOLDEN_FRAMES, + ALTREF_FRAME, // BWDREF_ALTREF_FRAMES, + LAST3_FRAME, // LAST2_LAST3_FRAMES + GOLDEN_FRAME, // LAST2_GOLDEN_FRAMES, + GOLDEN_FRAME, // LAST3_GOLDEN_FRAMES, + ALTREF2_FRAME, // BWDREF_ALTREF2_FRAMES, + ALTREF_FRAME, // ALTREF2_ALTREF_FRAMES, + }; + assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS); + return lut[ref_idx]; +} + +PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi); + +PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi); + +static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi, + TransformationType type) { + const PREDICTION_MODE mode = mbmi->mode; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int block_size_allowed = + AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; + return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION && + block_size_allowed; +} + +#if CONFIG_MISMATCH_DEBUG +static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, + int mi_row, int tx_blk_col, int tx_blk_row, + int subsampling_x, int subsampling_y) { + *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) + + (tx_blk_col << MI_SIZE_LOG2); + *pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) + + (tx_blk_row << MI_SIZE_LOG2); +} +#endif + +enum { MV_PRECISION_Q3, MV_PRECISION_Q4 } UENUM1BYTE(mv_precision); + +struct buf_2d { + uint8_t *buf; + uint8_t *buf0; + int width; + int height; + int stride; +}; + +typedef struct eob_info { + uint16_t eob; + uint16_t max_scan_line; +} eob_info; + +typedef struct { + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]); + eob_info eob_data[MAX_MB_PLANE] + [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; + DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]); +} CB_BUFFER; + +typedef struct macroblockd_plane { + tran_low_t *dqcoeff; + tran_low_t *dqcoeff_block; + eob_info *eob_data; + PLANE_TYPE plane_type; + int subsampling_x; + int subsampling_y; + struct buf_2d dst; + struct buf_2d pre[2]; + ENTROPY_CONTEXT *above_entropy_context; + ENTROPY_CONTEXT *left_entropy_context; + + // The dequantizers below are true dequantizers used only in the + // dequantization process. They have the same coefficient + // shift/scale as TX. 
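+  // A sketch of typical use (assumed; it ignores the TX-dependent shift and
+  // any quantization matrix): for each segment, index 0 holds the DC dequant
+  // step and index 1 the AC step, so roughly
+  //   dqcoeff = qcoeff * seg_dequant_QTX[segment_id][coeff_idx != 0];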
+  int16_t seg_dequant_QTX[MAX_SEGMENTS][2];
+  uint8_t *color_index_map;
+
+  // block size in pixels
+  uint8_t width, height;
+
+  qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+  qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+} MACROBLOCKD_PLANE;
+
+#define BLOCK_OFFSET(i) ((i) << 4)
+
+typedef struct {
+  DECLARE_ALIGNED(16, InterpKernel, vfilter);
+  DECLARE_ALIGNED(16, InterpKernel, hfilter);
+} WienerInfo;
+
+typedef struct {
+  int ep;
+  int xqd[2];
+} SgrprojInfo;
+
+#if CONFIG_DEBUG
+#define CFL_SUB8X8_VAL_MI_SIZE (4)
+#define CFL_SUB8X8_VAL_MI_SQUARE \
+  (CFL_SUB8X8_VAL_MI_SIZE * CFL_SUB8X8_VAL_MI_SIZE)
+#endif  // CONFIG_DEBUG
+#define CFL_MAX_BLOCK_SIZE (BLOCK_32X32)
+#define CFL_BUF_LINE (32)
+#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
+#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
+#define CFL_BUF_SQUARE (CFL_BUF_LINE * CFL_BUF_LINE)
+typedef struct cfl_ctx {
+  // Q3 reconstructed luma pixels (only Q2 is required, but Q3 is used to
+  // avoid shifts)
+  uint16_t recon_buf_q3[CFL_BUF_SQUARE];
+  // Q3 AC contributions (reconstructed luma pixels - tx block avg)
+  int16_t ac_buf_q3[CFL_BUF_SQUARE];
+
+  // Cache the DC_PRED when performing RDO, so it does not have to be
+  // recomputed for every scaling parameter
+  int dc_pred_is_cached[CFL_PRED_PLANES];
+  // The DC_PRED cache is disabled when decoding
+  int use_dc_pred_cache;
+  // Only cache the first row of the DC_PRED
+  int16_t dc_pred_cache[CFL_PRED_PLANES][CFL_BUF_LINE];
+
+  // Height and width currently used in the CfL prediction buffer.
+  int buf_height, buf_width;
+
+  int are_parameters_computed;
+
+  // Chroma subsampling
+  int subsampling_x, subsampling_y;
+
+  // Whether the reconstructed luma pixels need to be stored
+  int store_y;
+
+#if CONFIG_DEBUG
+  int rate;
+#endif  // CONFIG_DEBUG
+} CFL_CTX;
+
+typedef struct dist_wtd_comp_params {
+  int use_dist_wtd_comp_avg;
+  int fwd_offset;
+  int bck_offset;
+} DIST_WTD_COMP_PARAMS;
+
+struct scale_factors;
+
+// Most/all of the members here are mere pointers; the actual arrays are
+// allocated elsewhere. This is mostly for coding convenience.
+typedef struct macroblockd {
+  // Row and column position of current macroblock in mi units.
+  int mi_row;
+  int mi_col;
+  // Same as cm->mi_params.mi_stride, copied here for convenience.
+  int mi_stride;
+
+  // True if current block transmits chroma information.
+  // More detail:
+  // The smallest supported block size for both luma and chroma planes is
+  // 4x4. Hence, in case of a subsampled chroma plane (YUV 4:2:0 or
+  // YUV 4:2:2), multiple luma blocks smaller than 8x8 may be combined into
+  // one chroma block.
+  // For example, for YUV 4:2:0, say an 8x8 area is split into four 4x4 luma
+  // blocks. Then a single chroma block of size 4x4 will cover the area of
+  // these four luma blocks. This is implemented in the bitstream as follows:
+  // - There are four MB_MODE_INFO structs for the four luma blocks.
+  // - The first 3 MB_MODE_INFO structs have is_chroma_ref = false, and so do
+  // not transmit any information for the chroma planes.
+  // - The last block has is_chroma_ref = true and transmits chroma
+  // information for the 4x4 chroma block that covers the whole 8x8 area
+  // covered by the four luma blocks.
+  // Similar logic applies for chroma blocks that cover 2 or 3 luma blocks.
+  bool is_chroma_ref;
+
+  struct macroblockd_plane plane[MAX_MB_PLANE];
+
+  TileInfo tile;
+
+  // Appropriate offset inside cm->mi_params.mi_grid_base based on current
+  // mi_row and mi_col.
+  MB_MODE_INFO **mi;
+
+  // True if 4x4 block above the current block is available.
+  bool up_available;
+  // True if 4x4 block to the left of the current block is available.
+  bool left_available;
+  // True if the above chroma reference block is available.
+  bool chroma_up_available;
+  // True if the left chroma reference block is available.
+  bool chroma_left_available;
+
+  // MB_MODE_INFO for 4x4 block to the left of the current block, if
+  // left_available == true; otherwise NULL.
+  MB_MODE_INFO *left_mbmi;
+  // MB_MODE_INFO for 4x4 block above the current block, if
+  // up_available == true; otherwise NULL.
+  MB_MODE_INFO *above_mbmi;
+  // Left chroma reference block if is_chroma_ref == true for the current
+  // block and chroma_left_available == true; otherwise NULL.
+  // See also: the special case logic when the current chroma block covers
+  // more than one luma block, in set_mi_row_col().
+  MB_MODE_INFO *chroma_left_mbmi;
+  // Above chroma reference block if is_chroma_ref == true for the current
+  // block and chroma_up_available == true; otherwise NULL.
+  // See also: the special case logic when the current chroma block covers
+  // more than one luma block, in set_mi_row_col().
+  MB_MODE_INFO *chroma_above_mbmi;
+
+  // Appropriate offset based on current 'mi_row' and 'mi_col', inside
+  // 'tx_type_map' in one of 'CommonModeInfoParams', 'PICK_MODE_CONTEXT' or
+  // 'MACROBLOCK' structs.
+  uint8_t *tx_type_map;
+  // Stride for 'tx_type_map'. Note that this may or may not be the same as
+  // 'mi_stride', depending on which actual array 'tx_type_map' points to.
+  int tx_type_map_stride;
+
+  // Distance of this macroblock from frame edges in 1/8th pixel units.
+  int mb_to_left_edge;
+  int mb_to_right_edge;
+  int mb_to_top_edge;
+  int mb_to_bottom_edge;
+
+  // Scale factors for reference frames of the current block.
+  // These are pointers into 'cm->ref_scale_factors'.
+  const struct scale_factors *block_ref_scale_factors[2];
+
+  const YV12_BUFFER_CONFIG *cur_buf;
+
+  // Entropy contexts for the above blocks.
+  // above_entropy_context[i][j] corresponds to above entropy context for ith
+  // plane and jth mi column of this *frame*, wrt current 'mi_row'.
+  // These are pointers into 'cm->above_contexts.entropy'.
+  ENTROPY_CONTEXT *above_entropy_context[MAX_MB_PLANE];
+  // Entropy contexts for the left blocks.
+  // left_entropy_context[i][j] corresponds to left entropy context for ith
+  // plane and jth mi row of this *superblock*, wrt current 'mi_col'.
+  // Note: These contain actual data, NOT pointers.
+  ENTROPY_CONTEXT left_entropy_context[MAX_MB_PLANE][MAX_MIB_SIZE];
+
+  // Partition contexts for the above blocks.
+  // above_partition_context[i] corresponds to above partition context for ith
+  // mi column of this *frame*, wrt current 'mi_row'.
+  // These are pointers into 'cm->above_contexts.partition'.
+  PARTITION_CONTEXT *above_partition_context;
+  // Partition contexts for the left blocks.
+  // left_partition_context[i] corresponds to left partition context for ith
+  // mi row of this *superblock*, wrt current 'mi_col'.
+  // Note: These contain actual data, NOT pointers.
+  PARTITION_CONTEXT left_partition_context[MAX_MIB_SIZE];
+
+  // Transform contexts for the above blocks.
+  // TODO(urvang): Indexed two different ways from cm->above_contexts.txfm in
+  // code currently. Need to make it consistent / document why.
+  TXFM_CONTEXT *above_txfm_context;
+  // Transform contexts for the left blocks.
+  TXFM_CONTEXT *left_txfm_context;
+  // TODO(urvang): 'left_txfm_context' points to 'left_txfm_context_buffer'.
+  // Can we remove this indirection?
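+  // Backing storage for 'left_txfm_context' above (MAX_MIB_SIZE entries,
+  // one per mi unit along the superblock's left edge).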
+  TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
+
+  // Default values for the two restoration filters for each plane.
+  // These values are used as reference values when writing the bitstream.
+  // That is, we transmit the delta between the actual values in
+  // cm->rst_info[plane].unit_info[unit_idx] and these reference values.
+  WienerInfo wiener_info[MAX_MB_PLANE];
+  SgrprojInfo sgrproj_info[MAX_MB_PLANE];
+
+  // Block dimensions in MB_MODE_INFO units.
+  uint8_t width;
+  uint8_t height;
+
+  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+  CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+  uint16_t weight[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+  uint8_t is_sec_rect;
+
+  // Counts of each reference frame in the above and left neighboring blocks.
+  // NOTE: These take into account both single and compound references.
+  uint8_t neighbors_ref_counts[REF_FRAMES];
+
+  FRAME_CONTEXT *tile_ctx;
+  // Bit depth: copied from cm->seq_params.bit_depth for convenience.
+  int bd;
+
+  int qindex[MAX_SEGMENTS];
+  int lossless[MAX_SEGMENTS];
+  // TODO(urvang): Move to decoder.
+  int corrupted;
+  // Same as cm->features.cur_frame_force_integer_mv.
+  int cur_frame_force_integer_mv;
+  // Pointer to cm->error.
+  struct aom_internal_error_info *error_info;
+  // Same as cm->global_motion.
+  const WarpedMotionParams *global_motion;
+  int delta_qindex;
+  int current_qindex;
+  // The actual frame-level loop filter level is not available at the
+  // beginning of a tile on the encoder side (it only becomes available
+  // during actual filtering), so we record delta_lf (relative to the frame
+  // level) and code the delta between the previous superblock's delta lf and
+  // the current delta lf. This is equivalent to the delta between the
+  // previous superblock's actual lf and the current lf.
+  int8_t delta_lf_from_base;
+  // We have four frame filter levels for the different planes and
+  // directions, so to support the per-superblock update we need a few more
+  // params, as below:
+  // 0: delta loop filter level for y plane vertical
+  // 1: delta loop filter level for y plane horizontal
+  // 2: delta loop filter level for u plane
+  // 3: delta loop filter level for v plane
+  // To make this consistent with how each filter level is referenced in the
+  // segment features, we need to subtract 1, since
+  // SEG_LVL_ALT_LF_Y_V = 1;
+  // SEG_LVL_ALT_LF_Y_H = 2;
+  // SEG_LVL_ALT_LF_U = 3;
+  // SEG_LVL_ALT_LF_V = 4;
+  int8_t delta_lf[FRAME_LF_COUNT];
+  // cdef_transmitted[i] is true if CDEF strength for ith CDEF unit in the
+  // current superblock has already been read from (decoder) / written to
+  // (encoder) the bitstream; and false otherwise.
+  // More detail:
+  // (1) CDEF strength is transmitted only once per CDEF unit, in the 1st
+  // non-skip coding block. So, we need this array to keep track of whether
+  // CDEF strengths for the given CDEF units have been transmitted yet or
+  // not.
+  // (2) Superblock size can be either 128x128 or 64x64, but CDEF unit size
+  // is fixed to be 64x64. So, there may be 4 CDEF units within a superblock
+  // (if the superblock size is 128x128). Hence the array size is 4.
+  // (3) In the current implementation, CDEF strength for this CDEF unit is
+  // stored in the MB_MODE_INFO of the 1st block in this CDEF unit (inside
+  // cm->mi_params.mi_grid_base).
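+  // For example (illustrative; see the read/write sites for the exact
+  // formula): in a 128x128 superblock the four 64x64 CDEF units are indexed
+  // in raster order, i.e. 0 = top-left, 1 = top-right, 2 = bottom-left,
+  // 3 = bottom-right; with a 64x64 superblock only index 0 is used.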
+  bool cdef_transmitted[4];
+
+  DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+  uint8_t *mc_buf[2];
+  CFL_CTX cfl;
+
+  DIST_WTD_COMP_PARAMS jcp_param;
+
+  uint16_t cb_offset[MAX_MB_PLANE];
+  uint16_t txb_offset[MAX_MB_PLANE];
+  uint16_t color_index_map_offset[2];
+
+  CONV_BUF_TYPE *tmp_conv_dst;
+  uint8_t *tmp_obmc_bufs[2];
+} MACROBLOCKD;
+
+static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) {
+  return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
+}
+
+static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
+  return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+             ? CONVERT_TO_BYTEPTR(buf16)
+             : buf16;
+}
+
+static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
+  switch (bsize) {
+    case BLOCK_4X4: return 0;
+    case BLOCK_8X8: return 1;
+    case BLOCK_16X16: return 2;
+    case BLOCK_32X32: return 3;
+    case BLOCK_64X64: return 4;
+    case BLOCK_128X128: return 5;
+    default: return SQR_BLOCK_SIZES;
+  }
+}
+
+// For a square block size 'bsize', returns the size of the sub-blocks used by
+// the given partition type. If the partition produces sub-blocks of different
+// sizes, then the function returns the largest sub-block size.
+// Implements the Partition_Subsize lookup table in the spec (Section 9.3.
+// Conversion tables).
+// Note: the input block size should be square; otherwise it is considered
+// invalid.
+static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
+                                               PARTITION_TYPE partition) {
+  if (partition == PARTITION_INVALID) {
+    return BLOCK_INVALID;
+  } else {
+    const int sqr_bsize_idx = get_sqr_bsize_idx(bsize);
+    return sqr_bsize_idx >= SQR_BLOCK_SIZES
+               ? BLOCK_INVALID
+               : subsize_lookup[partition][sqr_bsize_idx];
+  }
+}
+
+static INLINE TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi,
+                                            PLANE_TYPE plane_type) {
+  static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = {
+    DCT_DCT,    // DC_PRED
+    ADST_DCT,   // V_PRED
+    DCT_ADST,   // H_PRED
+    DCT_DCT,    // D45_PRED
+    ADST_ADST,  // D135_PRED
+    ADST_DCT,   // D113_PRED
+    DCT_ADST,   // D157_PRED
+    DCT_ADST,   // D203_PRED
+    ADST_DCT,   // D67_PRED
+    ADST_ADST,  // SMOOTH_PRED
+    ADST_DCT,   // SMOOTH_V_PRED
+    DCT_ADST,   // SMOOTH_H_PRED
+    ADST_ADST,  // PAETH_PRED
+  };
+  const PREDICTION_MODE mode =
+      (plane_type == PLANE_TYPE_Y) ?
mbmi->mode : get_uv_mode(mbmi->uv_mode); + assert(mode < INTRA_MODES); + return _intra_mode_to_tx_type[mode]; +} + +static INLINE int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; } + +static INLINE int block_signals_txsize(BLOCK_SIZE bsize) { + return bsize > BLOCK_4X4; +} + +// Number of transform types in each set type +static const int av1_num_ext_tx_set[EXT_TX_SET_TYPES] = { + 1, 2, 5, 7, 12, 16, +}; + +static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = { + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, +}; + +static const uint16_t av1_reduced_intra_tx_used_flag[INTRA_MODES] = { + 0x080F, // DC_PRED: 0000 1000 0000 1111 + 0x040F, // V_PRED: 0000 0100 0000 1111 + 0x080F, // H_PRED: 0000 1000 0000 1111 + 0x020F, // D45_PRED: 0000 0010 0000 1111 + 0x080F, // D135_PRED: 0000 1000 0000 1111 + 0x040F, // D113_PRED: 0000 0100 0000 1111 + 0x080F, // D157_PRED: 0000 1000 0000 1111 + 0x080F, // D203_PRED: 0000 1000 0000 1111 + 0x040F, // D67_PRED: 0000 0100 0000 1111 + 0x080F, // SMOOTH_PRED: 0000 1000 0000 1111 + 0x040F, // SMOOTH_V_PRED: 0000 0100 0000 1111 + 0x080F, // SMOOTH_H_PRED: 0000 1000 0000 1111 + 0x0C0E, // PAETH_PRED: 0000 1100 0000 1110 +}; + +static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = { + 0x0001, // 0000 0000 0000 0001 + 0x0201, // 0000 0010 0000 0001 + 0x020F, // 0000 0010 0000 1111 + 0x0E0F, // 0000 1110 0000 1111 + 0x0FFF, // 0000 1111 1111 1111 + 0xFFFF, // 1111 1111 1111 1111 +}; + +static const TxSetType av1_ext_tx_set_lookup[2][2] = { + { EXT_TX_SET_DTT4_IDTX_1DDCT, EXT_TX_SET_DTT4_IDTX }, + { EXT_TX_SET_ALL16, EXT_TX_SET_DTT9_IDTX_1DDCT }, +}; + +static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter, + int use_reduced_set) { + const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size]; + if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY; + if (tx_size_sqr_up == TX_32X32) + return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY; + if (use_reduced_set) + return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX; + const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size]; + return av1_ext_tx_set_lookup[is_inter][tx_size_sqr == TX_16X16]; +} + +// Maps tx set types to the indices. +static const int ext_tx_set_index[2][EXT_TX_SET_TYPES] = { + { // Intra + 0, -1, 2, 1, -1, -1 }, + { // Inter + 0, 3, -1, -1, 2, 1 }, +}; + +static INLINE int get_ext_tx_set(TX_SIZE tx_size, int is_inter, + int use_reduced_set) { + const TxSetType set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set); + return ext_tx_set_index[is_inter][set_type]; +} + +static INLINE int get_ext_tx_types(TX_SIZE tx_size, int is_inter, + int use_reduced_set) { + const int set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set); + return av1_num_ext_tx_set[set_type]; +} + +#define TXSIZEMAX(t1, t2) (tx_size_2d[(t1)] >= tx_size_2d[(t2)] ? (t1) : (t2)) +#define TXSIZEMIN(t1, t2) (tx_size_2d[(t1)] <= tx_size_2d[(t2)] ? 
(t1) : (t2)) + +static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) { + const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; + const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize]; + if (bsize == BLOCK_4X4) + return AOMMIN(max_txsize_lookup[bsize], largest_tx_size); + if (txsize_sqr_map[max_rect_tx_size] <= largest_tx_size) + return max_rect_tx_size; + else + return largest_tx_size; +} + +static const uint8_t mode_to_angle_map[] = { + 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0, +}; + +// Converts block_index for given transform size to index of the block in raster +// order. +static INLINE int av1_block_index_to_raster_order(TX_SIZE tx_size, + int block_idx) { + // For transform size 4x8, the possible block_idx values are 0 & 2, because + // block_idx values are incremented in steps of size 'tx_width_unit x + // tx_height_unit'. But, for this transform size, block_idx = 2 corresponds to + // block number 1 in raster order, inside an 8x8 MI block. + // For any other transform size, the two indices are equivalent. + return (tx_size == TX_4X8 && block_idx == 2) ? 1 : block_idx; +} + +// Inverse of above function. +// Note: only implemented for transform sizes 4x4, 4x8 and 8x4 right now. +static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size, + int raster_order) { + assert(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4); + // We ensure that block indices are 0 & 2 if tx size is 4x8 or 8x4. + return (tx_size == TX_4X4) ? raster_order : (raster_order > 0) ? 2 : 0; +} + +static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, + const MACROBLOCKD *xd, + TX_SIZE tx_size, + int is_screen_content_type) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + + if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y || + xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 || + is_screen_content_type) + return DCT_DCT; + + return intra_mode_to_tx_type(mbmi, plane_type); +} + +// Implements the get_plane_residual_size() function in the spec (Section +// 5.11.38. Get plane residual size function). 
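+// For example: with 4:2:0 subsampling (subsampling_x == subsampling_y == 1),
+// BLOCK_8X8 maps to BLOCK_4X4 for a chroma plane, while subsampling (0, 0)
+// returns 'bsize' unchanged.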
+static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize, + int subsampling_x, + int subsampling_y) { + assert(bsize < BLOCK_SIZES_ALL); + assert(subsampling_x >= 0 && subsampling_x < 2); + assert(subsampling_y >= 0 && subsampling_y < 2); + return ss_size_lookup[bsize][subsampling_x][subsampling_y]; +} + +/* + * Logic to generate the lookup tables: + * + * TX_SIZE txs = max_txsize_rect_lookup[bsize]; + * for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) + * txs = sub_tx_size_map[txs]; + * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + * const int bw_uint_log2 = mi_size_wide_log2[bsize]; + * const int stride_log2 = bw_uint_log2 - tx_w_log2; + */ +static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row, + int blk_col) { + static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 1, 1, 2, 2, 3, + }; + static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 0, 2, 1, 3, 2, + }; + static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 2, 2, 0, 1, 0, 1, 0, 1, + }; + const int index = + ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + + (blk_col >> tw_w_log2_table[bsize]); + assert(index < INTER_TX_SIZE_BUF_LEN); + return index; +} + +#if CONFIG_INSPECTION +/* + * Here is the logic to generate the lookup tables: + * + * TX_SIZE txs = max_txsize_rect_lookup[bsize]; + * for (int level = 0; level < MAX_VARTX_DEPTH; ++level) + * txs = sub_tx_size_map[txs]; + * const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + * const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + * const int bw_uint_log2 = mi_size_wide_log2[bsize]; + * const int stride_log2 = bw_uint_log2 - tx_w_log2; + */ +static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row, + int blk_col) { + static const uint8_t tw_w_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, + }; + static const uint8_t tw_h_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, + }; + static const uint8_t stride_log2_table[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 3, 3, 0, 2, 0, 2, 0, 2, + }; + const int index = + ((blk_row >> tw_h_log2_table[bsize]) << stride_log2_table[bsize]) + + (blk_col >> tw_w_log2_table[bsize]); + assert(index < TXK_TYPE_BUF_LEN); + return index; +} +#endif // CONFIG_INSPECTION + +static INLINE void update_txk_array(MACROBLOCKD *const xd, int blk_row, + int blk_col, TX_SIZE tx_size, + TX_TYPE tx_type) { + const int stride = xd->tx_type_map_stride; + xd->tx_type_map[blk_row * stride + blk_col] = tx_type; + + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + // The 16x16 unit is due to the constraint from tx_64x64 which sets the + // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block + // size, the constraint takes effect in 32x16 / 16x32 size too. To solve + // the intricacy, cover all the 16x16 units inside a 64 level transform. 
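+  // Concretely (a sketch): TX_64X64 spans 16 mi units per side while the
+  // stamping unit below is TX_16X16 (4 mi units), so the loop writes tx_type
+  // at every 16x16 unit origin inside the 64-level transform, and later
+  // lookups at scaled chroma positions always hit an initialized entry.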
+ if (txw == tx_size_wide_unit[TX_64X64] || + txh == tx_size_high_unit[TX_64X64]) { + const int tx_unit = tx_size_wide_unit[TX_16X16]; + for (int idy = 0; idy < txh; idy += tx_unit) { + for (int idx = 0; idx < txw; idx += tx_unit) { + xd->tx_type_map[(blk_row + idy) * stride + blk_col + idx] = tx_type; + } + } + } +} + +static INLINE TX_TYPE av1_get_tx_type(const MACROBLOCKD *xd, + PLANE_TYPE plane_type, int blk_row, + int blk_col, TX_SIZE tx_size, + int reduced_tx_set) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) { + return DCT_DCT; + } + + TX_TYPE tx_type; + if (plane_type == PLANE_TYPE_Y) { + tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; + } else { + if (is_inter_block(mbmi)) { + // scale back to y plane's coordinate + const struct macroblockd_plane *const pd = &xd->plane[plane_type]; + blk_row <<= pd->subsampling_y; + blk_col <<= pd->subsampling_x; + tx_type = xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; + } else { + // In intra mode, uv planes don't share the same prediction mode as y + // plane, so the tx_type should not be shared + tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV); + } + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set); + if (!av1_ext_tx_used[tx_set_type][tx_type]) tx_type = DCT_DCT; + } + assert(tx_type < TX_TYPES); + assert(av1_ext_tx_used[av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), + reduced_tx_set)][tx_type]); + return tx_type; +} + +void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y, + const int num_planes); + +/* + * Logic to generate the lookup table: + * + * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + * int depth = 0; + * while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) { + * depth++; + * tx_size = sub_tx_size_map[tx_size]; + * } + */ +static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) { + static const uint8_t bsize_to_max_depth_table[BLOCK_SIZES_ALL] = { + 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + }; + return bsize_to_max_depth_table[bsize]; +} + +/* + * Logic to generate the lookup table: + * + * TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + * assert(tx_size != TX_4X4); + * int depth = 0; + * while (tx_size != TX_4X4) { + * depth++; + * tx_size = sub_tx_size_map[tx_size]; + * } + * assert(depth < 10); + */ +static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + static const uint8_t bsize_to_tx_size_depth_table[BLOCK_SIZES_ALL] = { + 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 2, 2, 3, 3, 4, 4, + }; + const int depth = bsize_to_tx_size_depth_table[bsize]; + assert(depth <= MAX_TX_CATS); + return depth - 1; +} + +static INLINE TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) { + TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; + TX_SIZE tx_size = max_tx_size; + for (int d = 0; d < depth; ++d) tx_size = sub_tx_size_map[tx_size]; + return tx_size; +} + +static INLINE TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) { + switch (tx_size) { + case TX_64X64: + case TX_64X32: + case TX_32X64: return TX_32X32; + case TX_64X16: return TX_32X16; + case TX_16X64: return TX_16X32; + default: return tx_size; + } +} + +static INLINE TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x, + int subsampling_y) { + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const TX_SIZE uv_tx 
= max_txsize_rect_lookup[plane_bsize];
+  return av1_get_adjusted_tx_size(uv_tx);
+}
+
+static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  if (xd->lossless[mbmi->segment_id]) return TX_4X4;
+  if (plane == 0) return mbmi->tx_size;
+  const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+  return av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                               pd->subsampling_y);
+}
+
+void av1_reset_entropy_context(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                               const int num_planes);
+
+void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes);
+
+void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes);
+
+typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+                                                  int blk_row, int blk_col,
+                                                  BLOCK_SIZE plane_bsize,
+                                                  TX_SIZE tx_size, void *arg);
+
+void av1_set_entropy_contexts(const MACROBLOCKD *xd,
+                              struct macroblockd_plane *pd, int plane,
+                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                              int has_eob, int aoff, int loff);
+
+#define MAX_INTERINTRA_SB_SQUARE (32 * 32)
+static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) {
+  return (mbmi->ref_frame[0] > INTRA_FRAME &&
+          mbmi->ref_frame[1] == INTRA_FRAME);
+}
+
+static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
+  return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32);
+}
+
+static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
+  return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END);
+}
+
+static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
+  return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME);
+}
+
+static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
+  return is_interintra_allowed_bsize(mbmi->sb_type) &&
+         is_interintra_allowed_mode(mbmi->mode) &&
+         is_interintra_allowed_ref(mbmi->ref_frame);
+}
+
+static INLINE int is_interintra_allowed_bsize_group(int group) {
+  int i;
+  for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+    if (size_group_lookup[i] == group &&
+        is_interintra_allowed_bsize((BLOCK_SIZE)i)) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) {
+  return mbmi->ref_frame[0] > INTRA_FRAME &&
+         mbmi->ref_frame[1] == INTRA_FRAME && is_interintra_allowed(mbmi);
+}
+
+static INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                       int plane) {
+  if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
+  const TX_SIZE max_txsize = max_txsize_rect_lookup[bsize];
+  if (plane == 0) return max_txsize;            // luma
+  return av1_get_adjusted_tx_size(max_txsize);  // chroma
+}
+
+static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
+}
+
+static INLINE int is_motion_variation_allowed_compound(
+    const MB_MODE_INFO *mbmi) {
+  return !has_second_ref(mbmi);
+}
+
+// input: log2 of length, 0(4), 1(8), ...
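+// e.g. a 4-pixel edge considers 0 overlappable neighbors, an 8-pixel edge 1,
+// growing by one per doubling of the edge length and capped at 4 for 64- and
+// 128-pixel edges.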
+static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 }; + +static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) { + return !(mbmi->overlappable_neighbors[0] == 0 && + mbmi->overlappable_neighbors[1] == 0); +} + +static INLINE MOTION_MODE +motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, int allow_warped_motion) { + if (xd->cur_frame_force_integer_mv == 0) { + const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype; + if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION; + } + if (is_motion_variation_allowed_bsize(mbmi->sb_type) && + is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME && + is_motion_variation_allowed_compound(mbmi)) { + if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION; + assert(!has_second_ref(mbmi)); + if (mbmi->num_proj_ref >= 1 && + (allow_warped_motion && + !av1_is_scaled(xd->block_ref_scale_factors[0]))) { + if (xd->cur_frame_force_integer_mv) { + return OBMC_CAUSAL; + } + return WARPED_CAUSAL; + } + return OBMC_CAUSAL; + } else { + return SIMPLE_TRANSLATION; + } +} + +static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) { + return (is_inter_block(mbmi)); +} + +static INLINE int av1_allow_palette(int allow_screen_content_tools, + BLOCK_SIZE sb_type) { + assert(sb_type < BLOCK_SIZES_ALL); + return allow_screen_content_tools && block_size_wide[sb_type] <= 64 && + block_size_high[sb_type] <= 64 && sb_type >= BLOCK_8X8; +} + +// Returns sub-sampled dimensions of the given block. +// The output values for 'rows_within_bounds' and 'cols_within_bounds' will +// differ from 'height' and 'width' when part of the block is outside the +// right +// and/or bottom image boundary. +static INLINE void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane, + const MACROBLOCKD *xd, int *width, + int *height, + int *rows_within_bounds, + int *cols_within_bounds) { + const int block_height = block_size_high[bsize]; + const int block_width = block_size_wide[bsize]; + const int block_rows = (xd->mb_to_bottom_edge >= 0) + ? block_height + : (xd->mb_to_bottom_edge >> 3) + block_height; + const int block_cols = (xd->mb_to_right_edge >= 0) + ? block_width + : (xd->mb_to_right_edge >> 3) + block_width; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0)); + assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0)); + assert(block_width >= block_cols); + assert(block_height >= block_rows); + const int plane_block_width = block_width >> pd->subsampling_x; + const int plane_block_height = block_height >> pd->subsampling_y; + // Special handling for chroma sub8x8. 
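+  // (A sketch of the intent: in 4:2:0, a sub-8x8 luma block maps to a chroma
+  // block only 2 pixels wide and/or high; the "+ 2 * is_chroma_sub8_*" terms
+  // below pad the reported chroma dimensions up to the 4-pixel minimum.)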
+ const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4; + const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4; + if (width) *width = plane_block_width + 2 * is_chroma_sub8_x; + if (height) *height = plane_block_height + 2 * is_chroma_sub8_y; + if (rows_within_bounds) { + *rows_within_bounds = + (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y; + } + if (cols_within_bounds) { + *cols_within_bounds = + (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x; + } +} + +/* clang-format off */ +typedef aom_cdf_prob (*MapCdf)[PALETTE_COLOR_INDEX_CONTEXTS] + [CDF_SIZE(PALETTE_COLORS)]; +typedef const int (*ColorCost)[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; +/* clang-format on */ + +typedef struct { + int rows; + int cols; + int n_colors; + int plane_width; + int plane_height; + uint8_t *color_map; + MapCdf map_cdf; + ColorCost color_cost; +} Av1ColorMapParam; + +static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + int ref; + + // First check if all modes are GLOBALMV + if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0; + + if (AOMMIN(mi_size_wide[mbmi->sb_type], mi_size_high[mbmi->sb_type]) < 2) + return 0; + + // Now check if all global motion is non translational + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + if (xd->global_motion[mbmi->ref_frame[ref]].wmtype == TRANSLATION) return 0; + } + return 1; +} + +static INLINE PLANE_TYPE get_plane_type(int plane) { + return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV; +} + +static INLINE int av1_get_max_eob(TX_SIZE tx_size) { + if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) { + return 1024; + } + if (tx_size == TX_16X64 || tx_size == TX_64X16) { + return 512; + } + return tx_size_2d[tx_size]; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_BLOCKD_H_ diff --git a/libs/libaom/src/av1/common/cdef.c b/libs/libaom/src/av1/common/cdef.c new file mode 100644 index 000000000..ef7b866b5 --- /dev/null +++ b/libs/libaom/src/av1/common/cdef.c @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/cdef.h" +#include "av1/common/cdef_block.h" +#include "av1/common/reconinter.h" + +static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col, + int mi_stride) { + MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col; + for (int r = 0; r < mi_size_high[BLOCK_8X8]; ++r, mbmi += mi_stride) { + for (int c = 0; c < mi_size_wide[BLOCK_8X8]; ++c) { + if (!mbmi[c]->skip) return 0; + } + } + + return 1; +} + +int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params, + int mi_row, int mi_col, cdef_list *dlist, + BLOCK_SIZE bs) { + MB_MODE_INFO **grid = mi_params->mi_grid_base; + int maxc = mi_params->mi_cols - mi_col; + int maxr = mi_params->mi_rows - mi_row; + + if (bs == BLOCK_128X128 || bs == BLOCK_128X64) + maxc = AOMMIN(maxc, MI_SIZE_128X128); + else + maxc = AOMMIN(maxc, MI_SIZE_64X64); + if (bs == BLOCK_128X128 || bs == BLOCK_64X128) + maxr = AOMMIN(maxr, MI_SIZE_128X128); + else + maxr = AOMMIN(maxr, MI_SIZE_64X64); + + const int r_step = 2; // mi_size_high[BLOCK_8X8] + const int c_step = 2; // mi_size_wide[BLOCK_8X8] + const int r_shift = 1; + const int c_shift = 1; + int count = 0; + for (int r = 0; r < maxr; r += r_step) { + for (int c = 0; c < maxc; c += c_step) { + if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, + mi_params->mi_stride)) { + dlist[count].by = r >> r_shift; + dlist[count].bx = c >> c_shift; + count++; + } + } + } + return count; +} + +void cdef_copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, int v, + int h) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +void cdef_copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, int v, + int h) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +static void copy_sb8_16(AV1_COMMON *cm, uint16_t *dst, int dstride, + const uint8_t *src, int src_voffset, int src_hoffset, + int sstride, int vsize, int hsize) { + if (cm->seq_params.use_highbitdepth) { + const uint16_t *base = + &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset]; + cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); + } else { + const uint8_t *base = &src[src_voffset * sstride + src_hoffset]; + cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize); + } +} + +static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h, + uint16_t x) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { + dst[i * dstride + j] = x; + } + } +} + +static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src, + int sstride, int v, int h) { + for (int i = 0; i < v; i++) { + for (int j = 0; j < h; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd) { + const CdefInfo *const cdef_info = &cm->cdef_info; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]); + uint16_t *linebuf[3]; + uint16_t *colbuf[3]; + cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; + unsigned char *row_cdef, *prev_row_cdef, *curr_row_cdef; + int cdef_count; + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 
+  int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+  int mi_wide_l2[3];
+  int mi_high_l2[3];
+  int xdec[3];
+  int ydec[3];
+  int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
+  const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
+                       num_planes);
+  row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2);
+  memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2);
+  prev_row_cdef = row_cdef + 1;
+  curr_row_cdef = prev_row_cdef + nhfb + 2;
+  for (int pli = 0; pli < num_planes; pli++) {
+    xdec[pli] = xd->plane[pli].subsampling_x;
+    ydec[pli] = xd->plane[pli].subsampling_y;
+    mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+    mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+  }
+  const int stride = (mi_params->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
+  for (int pli = 0; pli < num_planes; pli++) {
+    linebuf[pli] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride);
+    colbuf[pli] =
+        aom_malloc(sizeof(*colbuf) *
+                   ((CDEF_BLOCKSIZE << mi_high_l2[pli]) + 2 * CDEF_VBORDER) *
+                   CDEF_HBORDER);
+  }
+  for (int fbr = 0; fbr < nvfb; fbr++) {
+    for (int pli = 0; pli < num_planes; pli++) {
+      const int block_height =
+          (MI_SIZE_64X64 << mi_high_l2[pli]) + 2 * CDEF_VBORDER;
+      fill_rect(colbuf[pli], CDEF_HBORDER, block_height, CDEF_HBORDER,
+                CDEF_VERY_LARGE);
+    }
+    int cdef_left = 1;
+    for (int fbc = 0; fbc < nhfb; fbc++) {
+      int level, sec_strength;
+      int uv_level, uv_sec_strength;
+      int nhb, nvb;
+      int cstart = 0;
+      curr_row_cdef[fbc] = 0;
+      if (mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                                  MI_SIZE_64X64 * fbc] == NULL ||
+          mi_params
+                  ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride +
+                                 MI_SIZE_64X64 * fbc]
+                  ->cdef_strength == -1) {
+        cdef_left = 0;
+        continue;
+      }
+      if (!cdef_left) cstart = -CDEF_HBORDER;
+      nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+      nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+      int frame_top, frame_left, frame_bottom, frame_right;
+
+      int mi_row = MI_SIZE_64X64 * fbr;
+      int mi_col = MI_SIZE_64X64 * fbc;
+      // For the current filter block, its top-left corner mi structure
+      // (mi_tl) is first accessed to check whether the top and left
+      // boundaries are frame boundaries. Then bottom-left and top-right mi
+      // structures are accessed to check whether the bottom and right
+      // boundaries (respectively) are frame boundaries.
+      //
+      // Note that we can't just check the bottom-right mi structure - e.g.
+      // if we're at the right-hand edge of the frame but not the bottom,
+      // then the bottom-right mi is NULL but the bottom-left is not.
+      frame_top = (mi_row == 0) ? 1 : 0;
+      frame_left = (mi_col == 0) ? 1 : 0;
+
+      if (fbr != nvfb - 1)
+        frame_bottom = (mi_row + MI_SIZE_64X64 == mi_params->mi_rows) ? 1 : 0;
+      else
+        frame_bottom = 1;
+
+      if (fbc != nhfb - 1)
+        frame_right = (mi_col + MI_SIZE_64X64 == mi_params->mi_cols) ?
1 : 0; + else + frame_right = 1; + + const int mbmi_cdef_strength = + mi_params + ->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc] + ->cdef_strength; + level = + cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; + sec_strength = + cdef_info->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; + sec_strength += sec_strength == 3; + uv_level = + cdef_info->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS; + uv_sec_strength = + cdef_info->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS; + uv_sec_strength += uv_sec_strength == 3; + if ((level == 0 && sec_strength == 0 && uv_level == 0 && + uv_sec_strength == 0) || + (cdef_count = av1_cdef_compute_sb_list(mi_params, fbr * MI_SIZE_64X64, + fbc * MI_SIZE_64X64, dlist, + BLOCK_64X64)) == 0) { + cdef_left = 0; + continue; + } + + curr_row_cdef[fbc] = 1; + for (int pli = 0; pli < num_planes; pli++) { + int coffset; + int rend, cend; + int damping = cdef_info->cdef_damping; + int hsize = nhb << mi_wide_l2[pli]; + int vsize = nvb << mi_high_l2[pli]; + + if (pli) { + level = uv_level; + sec_strength = uv_sec_strength; + } + + if (fbc == nhfb - 1) + cend = hsize; + else + cend = hsize + CDEF_HBORDER; + + if (fbr == nvfb - 1) + rend = vsize; + else + rend = vsize + CDEF_VBORDER; + + coffset = fbc * MI_SIZE_64X64 << mi_wide_l2[pli]; + if (fbc == nhfb - 1) { + /* On the last superblock column, fill in the right border with + CDEF_VERY_LARGE to avoid filtering with the outside. */ + fill_rect(&src[cend + CDEF_HBORDER], CDEF_BSTRIDE, + rend + CDEF_VBORDER, hsize + CDEF_HBORDER - cend, + CDEF_VERY_LARGE); + } + if (fbr == nvfb - 1) { + /* On the last superblock row, fill in the bottom border with + CDEF_VERY_LARGE to avoid filtering with the outside. 
*/ + fill_rect(&src[(rend + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE, + CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE); + } + /* Copy in the pixels we need from the current superblock for + deringing.*/ + copy_sb8_16(cm, + &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart], + CDEF_BSTRIDE, xd->plane[pli].dst.buf, + (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr, coffset + cstart, + xd->plane[pli].dst.stride, rend, cend - cstart); + if (!prev_row_cdef[fbc]) { + copy_sb8_16(cm, &src[CDEF_HBORDER], CDEF_BSTRIDE, + xd->plane[pli].dst.buf, + (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER, + coffset, xd->plane[pli].dst.stride, CDEF_VBORDER, hsize); + } else if (fbr > 0) { + copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &linebuf[pli][coffset], + stride, CDEF_VBORDER, hsize); + } else { + fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize, + CDEF_VERY_LARGE); + } + if (!prev_row_cdef[fbc - 1]) { + copy_sb8_16(cm, src, CDEF_BSTRIDE, xd->plane[pli].dst.buf, + (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER, + coffset - CDEF_HBORDER, xd->plane[pli].dst.stride, + CDEF_VBORDER, CDEF_HBORDER); + } else if (fbr > 0 && fbc > 0) { + copy_rect(src, CDEF_BSTRIDE, &linebuf[pli][coffset - CDEF_HBORDER], + stride, CDEF_VBORDER, CDEF_HBORDER); + } else { + fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (!prev_row_cdef[fbc + 1]) { + copy_sb8_16(cm, &src[CDEF_HBORDER + (nhb << mi_wide_l2[pli])], + CDEF_BSTRIDE, xd->plane[pli].dst.buf, + (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER, + coffset + hsize, xd->plane[pli].dst.stride, CDEF_VBORDER, + CDEF_HBORDER); + } else if (fbr > 0 && fbc < nhfb - 1) { + copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, + &linebuf[pli][coffset + hsize], stride, CDEF_VBORDER, + CDEF_HBORDER); + } else { + fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, + CDEF_HBORDER, CDEF_VERY_LARGE); + } + if (cdef_left) { + /* If we deringed the superblock on the left then we need to copy in + saved pixels. */ + copy_rect(src, CDEF_BSTRIDE, colbuf[pli], CDEF_HBORDER, + rend + CDEF_VBORDER, CDEF_HBORDER); + } + /* Saving pixels in case we need to dering the superblock on the + right. 
*/ + copy_rect(colbuf[pli], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE, + rend + CDEF_VBORDER, CDEF_HBORDER); + copy_sb8_16( + cm, &linebuf[pli][coffset], stride, xd->plane[pli].dst.buf, + (MI_SIZE_64X64 << mi_high_l2[pli]) * (fbr + 1) - CDEF_VBORDER, + coffset, xd->plane[pli].dst.stride, CDEF_VBORDER, hsize); + + if (frame_top) { + fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (frame_left) { + fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, + CDEF_VERY_LARGE); + } + if (frame_bottom) { + fill_rect(&src[(vsize + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE, + CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE); + } + if (frame_right) { + fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, + vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE); + } + + if (cm->seq_params.use_highbitdepth) { + av1_cdef_filter_fb( + NULL, + &CONVERT_TO_SHORTPTR( + xd->plane[pli] + .dst.buf)[xd->plane[pli].dst.stride * + (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) + + (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])], + xd->plane[pli].dst.stride, + &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli], + ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level, + sec_strength, damping, coeff_shift); + } else { + av1_cdef_filter_fb( + &xd->plane[pli] + .dst.buf[xd->plane[pli].dst.stride * + (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) + + (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])], + NULL, xd->plane[pli].dst.stride, + &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli], + ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level, + sec_strength, damping, coeff_shift); + } + } + cdef_left = 1; + } + { + unsigned char *tmp = prev_row_cdef; + prev_row_cdef = curr_row_cdef; + curr_row_cdef = tmp; + } + } + aom_free(row_cdef); + for (int pli = 0; pli < num_planes; pli++) { + aom_free(linebuf[pli]); + aom_free(colbuf[pli]); + } +} diff --git a/libs/libaom/src/av1/common/cdef.h b/libs/libaom/src/av1/common/cdef.h new file mode 100644 index 000000000..c36fd135a --- /dev/null +++ b/libs/libaom/src/av1/common/cdef.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_CDEF_H_ +#define AOM_AV1_COMMON_CDEF_H_ + +#define CDEF_STRENGTH_BITS 6 + +#define CDEF_PRI_STRENGTHS 16 +#define CDEF_SEC_STRENGTHS 4 + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/cdef_block.h" + +static INLINE int sign(int i) { return i < 0 ? 
-1 : 1; }
+
+// Clamps the tap difference 'diff' toward zero: differences below
+// 'threshold' pass (nearly) unchanged, while larger differences are
+// progressively attenuated by the 'damping' shift.
+static INLINE int constrain(int diff, int threshold, int damping) {
+  if (!threshold) return 0;
+
+  const int shift = AOMMAX(0, damping - get_msb(threshold));
+  return sign(diff) *
+         AOMMIN(abs(diff), AOMMAX(0, threshold - (abs(diff) >> shift)));
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int av1_cdef_compute_sb_list(const CommonModeInfoParams *const mi_params,
+                             int mi_row, int mi_col, cdef_list *dlist,
+                             BLOCK_SIZE bsize);
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+                    MACROBLOCKD *xd);
+
+void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+                     AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method,
+                     int rdmult);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // AOM_AV1_COMMON_CDEF_H_
diff --git a/libs/libaom/src/av1/common/cdef_block.c b/libs/libaom/src/av1/common/cdef_block.c
new file mode 100644
index 000000000..7120705d3
--- /dev/null
+++ b/libs/libaom/src/av1/common/cdef_block.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cdef.h"
+
+/* Generated from gen_filter_tables.c. */
+DECLARE_ALIGNED(16, const int, cdef_directions[8][2]) = {
+  { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
+  { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
+  { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 },
+  { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2 },
+  { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 },
+  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 },
+  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
+  { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }
+};
+
+/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
+   The search minimizes the weighted variance along all the lines in a
+   particular direction, i.e. the squared error between the input and a
+   "predicted" block where each pixel is replaced by the average along a line
+   in a particular direction. Since each direction has the same sum(x^2)
+   term, that term is never computed. See Section 2, step 2, of:
+   http://jmvalin.ca/notes/intra_paint.pdf */
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var,
+                    int coeff_shift) {
+  int i;
+  int32_t cost[8] = { 0 };
+  int partial[8][15] = { { 0 } };
+  int32_t best_cost = 0;
+  int best_dir = 0;
+  /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
+     The output is then 840 times larger, but we don't care for finding
+     the max. */
+  static const int div_table[] = { 0, 840, 420, 280, 210, 168, 140, 120, 105 };
+  for (i = 0; i < 8; i++) {
+    int j;
+    for (j = 0; j < 8; j++) {
+      int x;
+      /* We subtract 128 here to reduce the maximum range of the squared
+         partial sums.
*/ + x = (img[i * stride + j] >> coeff_shift) - 128; + partial[0][i + j] += x; + partial[1][i + j / 2] += x; + partial[2][i] += x; + partial[3][3 + i - j / 2] += x; + partial[4][7 + i - j] += x; + partial[5][3 - i / 2 + j] += x; + partial[6][j] += x; + partial[7][i / 2 + j] += x; + } + } + for (i = 0; i < 8; i++) { + cost[2] += partial[2][i] * partial[2][i]; + cost[6] += partial[6][i] * partial[6][i]; + } + cost[2] *= div_table[8]; + cost[6] *= div_table[8]; + for (i = 0; i < 7; i++) { + cost[0] += (partial[0][i] * partial[0][i] + + partial[0][14 - i] * partial[0][14 - i]) * + div_table[i + 1]; + cost[4] += (partial[4][i] * partial[4][i] + + partial[4][14 - i] * partial[4][14 - i]) * + div_table[i + 1]; + } + cost[0] += partial[0][7] * partial[0][7] * div_table[8]; + cost[4] += partial[4][7] * partial[4][7] * div_table[8]; + for (i = 1; i < 8; i += 2) { + int j; + for (j = 0; j < 4 + 1; j++) { + cost[i] += partial[i][3 + j] * partial[i][3 + j]; + } + cost[i] *= div_table[8]; + for (j = 0; j < 4 - 1; j++) { + cost[i] += (partial[i][j] * partial[i][j] + + partial[i][10 - j] * partial[i][10 - j]) * + div_table[2 * j + 2]; + } + } + for (i = 0; i < 8; i++) { + if (cost[i] > best_cost) { + best_cost = cost[i]; + best_dir = i; + } + } + /* Difference between the optimal variance and the variance along the + orthogonal direction. Again, the sum(x^2) terms cancel out. */ + *var = best_cost - cost[(best_dir + 4) & 7]; + /* We'd normally divide by 840, but dividing by 1024 is close enough + for what we're going to do with this. */ + *var >>= 10; + return best_dir; +} + +const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } }; +const int cdef_sec_taps[2] = { 2, 1 }; + +/* Smooth in the direction detected. */ +void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride, + const uint16_t *in, int pri_strength, int sec_strength, + int dir, int pri_damping, int sec_damping, int bsize, + int coeff_shift) { + int i, j, k; + const int s = CDEF_BSTRIDE; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + for (i = 0; i < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_4X8); i++) { + for (j = 0; j < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_8X4); j++) { + int16_t sum = 0; + int16_t y; + int16_t x = in[i * s + j]; + int max = x; + int min = x; + for (k = 0; k < 2; k++) { + int16_t p0 = in[i * s + j + cdef_directions[dir][k]]; + int16_t p1 = in[i * s + j - cdef_directions[dir][k]]; + sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping); + sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping); + if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max); + if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max); + min = AOMMIN(p0, min); + min = AOMMIN(p1, min); + int16_t s0 = in[i * s + j + cdef_directions[(dir + 2) & 7][k]]; + int16_t s1 = in[i * s + j - cdef_directions[(dir + 2) & 7][k]]; + int16_t s2 = in[i * s + j + cdef_directions[(dir + 6) & 7][k]]; + int16_t s3 = in[i * s + j - cdef_directions[(dir + 6) & 7][k]]; + if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max); + if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max); + if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max); + if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max); + min = AOMMIN(s0, min); + min = AOMMIN(s1, min); + min = AOMMIN(s2, min); + min = AOMMIN(s3, min); + sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping); + sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping); + sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping); + sum += 
sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping); + } + y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max); + if (dst8) + dst8[i * dstride + j] = (uint8_t)y; + else + dst16[i * dstride + j] = (uint16_t)y; + } + } +} + +/* Compute the primary filter strength for an 8x8 block based on the + directional variance difference. A high variance difference means + that we have a highly directional pattern (e.g. a high contrast + edge), so we can apply more deringing. A low variance means that we + either have a low contrast edge, or a non-directional texture, so + we want to be careful not to blur. */ +static INLINE int adjust_strength(int strength, int32_t var) { + const int i = var >> 6 ? AOMMIN(get_msb(var >> 6), 12) : 0; + /* We use the variance of 8x8 blocks to adjust the strength. */ + return var ? (strength * (4 + i) + 8) >> 4 : 0; +} + +void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, + uint16_t *in, int xdec, int ydec, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, + cdef_list *dlist, int cdef_count, int level, + int sec_strength, int damping, int coeff_shift) { + int bi; + int bx; + int by; + const int pri_strength = level << coeff_shift; + sec_strength <<= coeff_shift; + damping += coeff_shift - (pli != AOM_PLANE_Y); + const int bw_log2 = 3 - xdec; + const int bh_log2 = 3 - ydec; + if (dirinit && pri_strength == 0 && sec_strength == 0) { + // If we're here, both primary and secondary strengths are 0, and + // we still haven't written anything to dst16[] yet, so we just copy + // the input to dst16[]. This is necessary only for av1_cdef_search() + // and only av1_cdef_search() sets dirinit. + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + // TODO(stemidts/jmvalin): SIMD optimisations + for (int iy = 0; iy < 1 << bh_log2; iy++) { + memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)], + &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)], + ((size_t)1 << bw_log2) * sizeof(*dst16)); + } + } + return; + } + + if (pli == 0) { + if (!dirinit || !*dirinit) { + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx], + CDEF_BSTRIDE, &var[by][bx], coeff_shift); + } + if (dirinit) *dirinit = 1; + } + } + if (pli == 1 && xdec != ydec) { + for (bi = 0; bi < cdef_count; bi++) { + static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 }; + static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 }; + by = dlist[bi].by; + bx = dlist[bi].bx; + dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]]; + } + } + + const int bsize = + ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8); + const int t = pri_strength; + const int s = sec_strength; + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + if (dst8) { + cdef_filter_block( + &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], NULL, dstride, + &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], + (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0, + damping, damping, bsize, coeff_shift); + } else { + cdef_filter_block( + NULL, + &dst16[dirinit ? bi << (bw_log2 + bh_log2) + : (by << bh_log2) * dstride + (bx << bw_log2)], + dirinit ? 1 << bw_log2 : dstride, + &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], + (pli ? t : adjust_strength(t, var[by][bx])), s, t ?
dir[by][bx] : 0, + damping, damping, bsize, coeff_shift); + } + } +} diff --git a/libs/libaom/src/av1/common/cdef_block.h b/libs/libaom/src/av1/common/cdef_block.h new file mode 100644 index 000000000..6b0ae0a9d --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_ +#define AOM_AV1_COMMON_CDEF_BLOCK_H_ + +#include "av1/common/odintrin.h" + +#define CDEF_BLOCKSIZE 64 +#define CDEF_BLOCKSIZE_LOG2 6 +#define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8) +#define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2) + +/* We need to buffer three vertical lines. */ +#define CDEF_VBORDER (3) +/* We only need to buffer three horizontal pixels too, but let's align to + 16 bytes (8 x 16 bits) to make vectorization easier. */ +#define CDEF_HBORDER (8) +#define CDEF_BSTRIDE \ + ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3) + +#define CDEF_VERY_LARGE (30000) +#define CDEF_INBUF_SIZE \ + (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER)) + +extern const int cdef_pri_taps[2][2]; +extern const int cdef_sec_taps[2]; +DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]); + +typedef struct { + uint8_t by; + uint8_t bx; +} cdef_list; + +typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16, + int dstride, const uint16_t *in, + int pri_strength, int sec_strength, + int dir, int pri_damping, + int sec_damping, int bsize, + int coeff_shift); +void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src, + cdef_list *dlist, int cdef_count, int bsize); + +void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, + uint16_t *in, int xdec, int ydec, + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, + int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, + cdef_list *dlist, int cdef_count, int level, + int sec_strength, int damping, int coeff_shift); +#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_ diff --git a/libs/libaom/src/av1/common/cdef_block_avx2.c b/libs/libaom/src/av1/common/cdef_block_avx2.c new file mode 100644 index 000000000..e2b85b3e2 --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block_avx2.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_avx2 +#include "av1/common/cdef_block_simd.h" diff --git a/libs/libaom/src/av1/common/cdef_block_neon.c b/libs/libaom/src/av1/common/cdef_block_neon.c new file mode 100644 index 000000000..2d6bc65e3 --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block_neon.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_neon +#include "av1/common/cdef_block_simd.h" diff --git a/libs/libaom/src/av1/common/cdef_block_simd.h b/libs/libaom/src/av1/common/cdef_block_simd.h new file mode 100644 index 000000000..5a52bc1e4 --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block_simd.h @@ -0,0 +1,915 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ +#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ + +#include "config/av1_rtcd.h" + +#include "av1/common/cdef_block.h" + +/* partial A is a 16-bit vector of the form: + [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form: + [0 y1 y2 y3 y4 y5 y6 y7]. + This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... + (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1 + and const2. */ +static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1, + v128 const2) { + v128 tmp; + /* Reverse partial B. */ + partialb = v128_shuffle_8( + partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c)); + /* Interleave the x and y values of identical indices and pair x8 with 0. */ + tmp = partiala; + partiala = v128_ziplo_16(partialb, partiala); + partialb = v128_ziphi_16(partialb, tmp); + /* Square and add the corresponding x and y values. */ + partiala = v128_madd_s16(partiala, partiala); + partialb = v128_madd_s16(partialb, partialb); + /* Multiply by constant. */ + partiala = v128_mullo_s32(partiala, const1); + partialb = v128_mullo_s32(partialb, const2); + /* Sum all results. */ + partiala = v128_add_32(partiala, partialb); + return partiala; +} + +static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) { + v128 t0, t1, t2, t3; + t0 = v128_ziplo_32(x1, x0); + t1 = v128_ziplo_32(x3, x2); + t2 = v128_ziphi_32(x1, x0); + t3 = v128_ziphi_32(x3, x2); + x0 = v128_ziplo_64(t1, t0); + x1 = v128_ziphi_64(t1, t0); + x2 = v128_ziplo_64(t3, t2); + x3 = v128_ziphi_64(t3, t2); + return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3)); +} + +/* Computes cost for directions 4, 5, 6 and 7. We can call this function again + to compute the remaining directions.
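+   (The caller transposes the pixels first when the other four directions
+   are needed, and the v128_from_32() constants used below play the role
+   of div_table[] in the C version.)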
*/ +static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) { + v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b; + v128 partial6; + v128 tmp; + /* Partial sums for lines 0 and 1. */ + partial4a = v128_shl_n_byte(lines[0], 14); + partial4b = v128_shr_n_byte(lines[0], 2); + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4)); + tmp = v128_add_16(lines[0], lines[1]); + partial5a = v128_shl_n_byte(tmp, 10); + partial5b = v128_shr_n_byte(tmp, 6); + partial7a = v128_shl_n_byte(tmp, 4); + partial7b = v128_shr_n_byte(tmp, 12); + partial6 = tmp; + + /* Partial sums for lines 2 and 3. */ + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6)); + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8)); + tmp = v128_add_16(lines[2], lines[3]); + partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8)); + partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8)); + partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6)); + partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10)); + partial6 = v128_add_16(partial6, tmp); + + /* Partial sums for lines 4 and 5. */ + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10)); + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12)); + tmp = v128_add_16(lines[4], lines[5]); + partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6)); + partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10)); + partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8)); + partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8)); + partial6 = v128_add_16(partial6, tmp); + + /* Partial sums for lines 6 and 7. */ + partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2)); + partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14)); + partial4a = v128_add_16(partial4a, lines[7]); + tmp = v128_add_16(lines[6], lines[7]); + partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4)); + partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12)); + partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10)); + partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6)); + partial6 = v128_add_16(partial6, tmp); + + /* Compute costs in terms of partial sums. */ + partial4a = + fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840), + v128_from_32(105, 120, 140, 168)); + partial7a = + fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0), + v128_from_32(105, 105, 105, 140)); + partial5a = + fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0), + v128_from_32(105, 105, 105, 140)); + partial6 = v128_madd_s16(partial6, partial6); + partial6 = v128_mullo_s32(partial6, v128_dup_32(105)); + + partial4a = hsum4(partial4a, partial5a, partial6, partial7a); + v128_store_unaligned(tmp_cost1, partial4a); + return partial4a; +} + +/* transpose and reverse the order of the lines -- equivalent to a 90-degree + counter-clockwise rotation of the pixels. 
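+   Rotating the block this way maps the remaining "mostly horizontal"
+   directions onto the ones compute_directions() already measures, so the
+   same routine is reused after the transpose.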
*/ +static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) { + const v128 tr0_0 = v128_ziplo_16(in[1], in[0]); + const v128 tr0_1 = v128_ziplo_16(in[3], in[2]); + const v128 tr0_2 = v128_ziphi_16(in[1], in[0]); + const v128 tr0_3 = v128_ziphi_16(in[3], in[2]); + const v128 tr0_4 = v128_ziplo_16(in[5], in[4]); + const v128 tr0_5 = v128_ziplo_16(in[7], in[6]); + const v128 tr0_6 = v128_ziphi_16(in[5], in[4]); + const v128 tr0_7 = v128_ziphi_16(in[7], in[6]); + + const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0); + const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4); + const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0); + const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4); + const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2); + const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6); + const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2); + const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6); + + res[7] = v128_ziplo_64(tr1_1, tr1_0); + res[6] = v128_ziphi_64(tr1_1, tr1_0); + res[5] = v128_ziplo_64(tr1_3, tr1_2); + res[4] = v128_ziphi_64(tr1_3, tr1_2); + res[3] = v128_ziplo_64(tr1_5, tr1_4); + res[2] = v128_ziphi_64(tr1_5, tr1_4); + res[1] = v128_ziplo_64(tr1_7, tr1_6); + res[0] = v128_ziphi_64(tr1_7, tr1_6); +} + +int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, + int coeff_shift) { + int i; + int32_t cost[8]; + int32_t best_cost = 0; + int best_dir = 0; + v128 lines[8]; + for (i = 0; i < 8; i++) { + lines[i] = v128_load_unaligned(&img[i * stride]); + lines[i] = + v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128)); + } + + /* Compute "mostly vertical" directions. */ + v128 dir47 = compute_directions(lines, cost + 4); + + array_reverse_transpose_8x8(lines, lines); + + /* Compute "mostly horizontal" directions. */ + v128 dir03 = compute_directions(lines, cost); + + v128 max = v128_max_s32(dir03, dir47); + max = v128_max_s32(max, v128_align(max, max, 8)); + max = v128_max_s32(max, v128_align(max, max, 4)); + best_cost = v128_low_u32(max); + v128 t = + v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03)); + best_dir = v128_movemask_8(v128_pack_s16_s8(t, t)); + best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros + + /* Difference between the optimal variance and the variance along the + orthogonal direction. Again, the sum(x^2) terms cancel out. */ + *var = best_cost - cost[(best_dir + 4) & 7]; + /* We'd normally divide by 840, but dividing by 1024 is close enough + for what we're going to do with this. 
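+   (The result only feeds adjust_strength(), which looks at the magnitude
+   via get_msb(), so the roughly 18% underestimate from dividing by 1024
+   instead of 840 is harmless.)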
*/ + *var >>= 10; + return best_dir; +} + +// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) +SIMD_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold, + unsigned int adjdamp) { + v256 diff = v256_sub_16(a, b); + const v256 sign = v256_shr_n_s16(diff, 15); + diff = v256_abs_s16(diff); + const v256 s = + v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp)); + return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign); +} + +// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp))) +SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength, + unsigned int adjdamp) { + const v256 diff16 = v256_sub_16(a, b); + v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16)); + const v128 sign = v128_cmplt_s8(diff, v128_zero()); + diff = v128_abs_s8(diff); + return v128_xor( + v128_add_8(sign, + v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength), + v128_shr_u8(diff, adjdamp)))), + sign); +} + +void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift) { + v128 p0, p1, p2, p3; + v256 sum, row, tap, res; + v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE); + int po1 = cdef_directions[dir][0]; + int po2 = cdef_directions[dir][1]; + int s1o1 = cdef_directions[(dir + 2) & 7][0]; + int s1o2 = cdef_directions[(dir + 2) & 7][1]; + int s2o1 = cdef_directions[(dir + 6) & 7][0]; + int s2o2 = cdef_directions[(dir + 6) & 7][1]; + + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + if (sec_strength) + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + + sum = v256_zero(); + row = v256_from_v64(v64_load_aligned(&in[0 * CDEF_BSTRIDE]), + v64_load_aligned(&in[1 * CDEF_BSTRIDE]), + v64_load_aligned(&in[2 * CDEF_BSTRIDE]), + v64_load_aligned(&in[3 * CDEF_BSTRIDE])); + max = min = row; + + if (pri_strength) { + // Primary near taps + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, pri_strength, pri_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, pri_strength, pri_damping); + + // sum += pri_taps[0] * (p0 + p1) + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]), + v256_from_v128(v128_ziphi_8(p0, p1), + v128_ziplo_8(p0, p1)))); + + // Primary far taps + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, pri_strength, pri_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]), + 
v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, pri_strength, pri_damping); + + // sum += pri_taps[1] * (p0 + p1) + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]), + v256_from_v128(v128_ziphi_8(p0, p1), + v128_ziplo_8(p0, p1)))); + } + + if (sec_strength) { + // Secondary near taps + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, sec_strength, sec_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, sec_strength, sec_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p2 = constrain(tap, row, sec_strength, sec_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p3 = constrain(tap, row, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + p0 = v128_add_8(p0, p1); + p2 = v128_add_8(p2, p3); + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]), + v256_from_v128(v128_ziphi_8(p0, p2), + v128_ziplo_8(p0, p2)))); + + // Secondary far taps + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, sec_strength, sec_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, sec_strength, sec_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p2 = constrain(tap, row, sec_strength, sec_damping); + tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - 
s2o2]), + v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p3 = constrain(tap, row, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + p0 = v128_add_8(p0, p1); + p2 = v128_add_8(p2, p3); + + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]), + v256_from_v128(v128_ziphi_8(p0, p2), + v128_ziplo_8(p0, p2)))); + } + + // res = row + ((sum - (sum < 0) + 8) >> 4) + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + res = v256_min_s16(v256_max_s16(res, min), max); + res = v256_pack_s16_u8(res, res); + + p0 = v256_low_v128(res); + u32_store_aligned(&dst[0 * dstride], v64_high_u32(v128_high_v64(p0))); + u32_store_aligned(&dst[1 * dstride], v64_low_u32(v128_high_v64(p0))); + u32_store_aligned(&dst[2 * dstride], v64_high_u32(v128_low_v64(p0))); + u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0))); +} + +void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift) { + int i; + v128 p0, p1, p2, p3; + v256 sum, row, res, tap; + v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE); + int po1 = cdef_directions[dir][0]; + int po2 = cdef_directions[dir][1]; + int s1o1 = cdef_directions[(dir + 2) & 7][0]; + int s1o2 = cdef_directions[(dir + 2) & 7][1]; + int s2o1 = cdef_directions[(dir + 6) & 7][0]; + int s2o2 = cdef_directions[(dir + 6) & 7][1]; + + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + if (sec_strength) + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + for (i = 0; i < 8; i += 2) { + sum = v256_zero(); + row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]), + v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); + + max = min = row; + // Primary near taps + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, pri_strength, pri_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, pri_strength, pri_damping); + + // sum += pri_taps[0] * (p0 + p1) + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]), + v256_from_v128(v128_ziphi_8(p0, p1), + v128_ziplo_8(p0, p1)))); + + // Primary far taps + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, pri_strength, pri_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); 
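+    // Illustrative numbers: with pri_strength = 16 and an input
+    // pri_damping of 6, the adjustment above yields 6 - get_msb(16) = 2,
+    // so a tap differing from the centre sample by 20 contributes
+    // sign * min(20, 16 - (20 >> 2)) = 11 before the tap weight is applied.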
+ p1 = constrain(tap, row, pri_strength, pri_damping); + + // sum += pri_taps[1] * (p0 + p1) + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]), + v256_from_v128(v128_ziphi_8(p0, p1), + v128_ziplo_8(p0, p1)))); + + // Secondary near taps + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, sec_strength, sec_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, sec_strength, sec_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p2 = constrain(tap, row, sec_strength, sec_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p3 = constrain(tap, row, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + p0 = v128_add_8(p0, p1); + p2 = v128_add_8(p2, p3); + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]), + v256_from_v128(v128_ziphi_8(p0, p2), + v128_ziplo_8(p0, p2)))); + + // Secondary far taps + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p0 = constrain(tap, row, sec_strength, sec_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p1 = constrain(tap, row, sec_strength, sec_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p2 = constrain(tap, row, sec_strength, sec_damping); + tap = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); + max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large))); + min = v256_min_s16(min, tap); + p3 = constrain(tap, row, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + p0 = v128_add_8(p0, p1); + p2 = v128_add_8(p2, p3); + sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]), + v256_from_v128(v128_ziphi_8(p0, p2), + v128_ziplo_8(p0, p2)))); + + // res = row + ((sum - (sum < 0) + 8) >> 4) + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + res = v256_min_s16(v256_max_s16(res, min), max); + res = v256_pack_s16_u8(res, res); + + p0 = v256_low_v128(res); + v64_store_aligned(&dst[i * dstride], v128_high_v64(p0)); + v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(p0)); + } +} + +void 
SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift) { + int i; + v256 p0, p1, p2, p3, sum, row, res; + v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE); + int po1 = cdef_directions[dir][0]; + int po2 = cdef_directions[dir][1]; + int s1o1 = cdef_directions[(dir + 2) & 7][0]; + int s1o2 = cdef_directions[(dir + 2) & 7][1]; + int s2o1 = cdef_directions[(dir + 6) & 7][0]; + int s2o2 = cdef_directions[(dir + 6) & 7][1]; + + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + if (sec_strength) + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + for (i = 0; i < 4; i += 4) { + sum = v256_zero(); + row = v256_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]), + v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE])); + min = max = row; + + // Primary near taps + p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1])); + p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + min = v256_min_s16(v256_min_s16(min, p0), p1); + p0 = constrain16(p0, row, pri_strength, pri_damping); + p1 = constrain16(p1, row, pri_strength, pri_damping); + + // sum += pri_taps[0] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); + + // Primary far taps + p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2])); + p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + min = v256_min_s16(v256_min_s16(min, p0), p1); + p0 = constrain16(p0, row, pri_strength, pri_damping); + p1 = constrain16(p1, row, pri_strength, pri_damping); + + // sum += pri_taps[1] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); + + // Secondary near taps + p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1])); + p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1])); + p2 = v256_from_v64(v64_load_unaligned(&in[i * 
CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1])); + p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))), + v256_andn(p3, v256_cmpeq_16(p3, large))); + min = v256_min_s16( + v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3); + p0 = constrain16(p0, row, sec_strength, sec_damping); + p1 = constrain16(p1, row, sec_strength, sec_damping); + p2 = constrain16(p2, row, sec_strength, sec_damping); + p3 = constrain16(p3, row, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + // Secondary far taps + p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2])); + p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2])); + p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2])); + p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]), + v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))), + v256_andn(p3, v256_cmpeq_16(p3, large))); + min = v256_min_s16( + v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3); + p0 = constrain16(p0, row, sec_strength, sec_damping); + p1 = constrain16(p1, row, sec_strength, sec_damping); + p2 = constrain16(p2, row, sec_strength, sec_damping); + p3 = constrain16(p3, row, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + // res = row + ((sum - (sum < 0) + 8) >> 4) + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + res = v256_min_s16(v256_max_s16(res, min), max); + + v64_store_aligned(&dst[i * dstride], v128_high_v64(v256_high_v128(res))); + v64_store_aligned(&dst[(i + 1) * dstride], + v128_low_v64(v256_high_v128(res))); + v64_store_aligned(&dst[(i + 2) * dstride], + v128_high_v64(v256_low_v128(res))); + v64_store_aligned(&dst[(i + 3) * dstride], + v128_low_v64(v256_low_v128(res))); + } +} + +void 
SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, + int pri_damping, int sec_damping, + int coeff_shift) { + int i; + v256 sum, p0, p1, p2, p3, row, res; + v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE); + int po1 = cdef_directions[dir][0]; + int po2 = cdef_directions[dir][1]; + int s1o1 = cdef_directions[(dir + 2) & 7][0]; + int s1o2 = cdef_directions[(dir + 2) & 7][1]; + int s2o1 = cdef_directions[(dir + 6) & 7][0]; + int s2o2 = cdef_directions[(dir + 6) & 7][1]; + + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + if (sec_strength) + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + + for (i = 0; i < 8; i += 2) { + sum = v256_zero(); + row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]), + v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE])); + + min = max = row; + // Primary near taps + p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1])); + p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + min = v256_min_s16(v256_min_s16(min, p0), p1); + p0 = constrain16(p0, row, pri_strength, pri_damping); + p1 = constrain16(p1, row, pri_strength, pri_damping); + + // sum += pri_taps[0] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1))); + + // Primary far taps + p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2])); + p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + min = v256_min_s16(v256_min_s16(min, p0), p1); + p0 = constrain16(p0, row, pri_strength, pri_damping); + p1 = constrain16(p1, row, pri_strength, pri_damping); + + // sum += pri_taps[1] * (p0 + p1) + sum = v256_add_16( + sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1))); + + // Secondary near taps + p0 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1])); + p1 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1])); + p2 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1])); + p3 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))), + v256_andn(p3, v256_cmpeq_16(p3, large))); + min = v256_min_s16( + v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3); + p0 = constrain16(p0, row, sec_strength, sec_damping); + p1 = constrain16(p1, row, sec_strength, sec_damping); + p2 = constrain16(p2, row, sec_strength, sec_damping); + p3 = constrain16(p3, row, 
sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + // Secondary far taps + p0 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2])); + p1 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2])); + p2 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2])); + p3 = + v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]), + v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2])); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))), + v256_andn(p1, v256_cmpeq_16(p1, large))); + max = + v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))), + v256_andn(p3, v256_cmpeq_16(p3, large))); + min = v256_min_s16( + v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3); + p0 = constrain16(p0, row, sec_strength, sec_damping); + p1 = constrain16(p1, row, sec_strength, sec_damping); + p2 = constrain16(p2, row, sec_strength, sec_damping); + p3 = constrain16(p3, row, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]), + v256_add_16(v256_add_16(p0, p1), + v256_add_16(p2, p3)))); + + // res = row + ((sum - (sum < 0) + 8) >> 4) + sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero())); + res = v256_add_16(sum, v256_dup_16(8)); + res = v256_shr_n_s16(res, 4); + res = v256_add_16(row, res); + res = v256_min_s16(v256_max_s16(res, min), max); + v128_store_unaligned(&dst[i * dstride], v256_high_v128(res)); + v128_store_unaligned(&dst[(i + 1) * dstride], v256_low_v128(res)); + } +} + +void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride, + const uint16_t *in, int pri_strength, + int sec_strength, int dir, int pri_damping, + int sec_damping, int bsize, int coeff_shift) { + if (dst8) { + if (bsize == BLOCK_8X8) { + SIMD_FUNC(cdef_filter_block_8x8_8) + (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + } else if (bsize == BLOCK_4X8) { + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift); + } else if (bsize == BLOCK_8X4) { + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + } else { + SIMD_FUNC(cdef_filter_block_4x4_8) + (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + } + } else { + if (bsize == BLOCK_8X8) { + SIMD_FUNC(cdef_filter_block_8x8_16) + (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + } else if (bsize == BLOCK_4X8) { + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, 
pri_strength, + sec_strength, dir, pri_damping, sec_damping, coeff_shift); + } else if (bsize == BLOCK_8X4) { + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + } else { + assert(bsize == BLOCK_4X4); + SIMD_FUNC(cdef_filter_block_4x4_16) + (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping, + sec_damping, coeff_shift); + } + } +} + +void SIMD_FUNC(cdef_copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride, + const uint8_t *src, int sstride, + int v, int h) { + int i, j; + for (i = 0; i < v; i++) { + for (j = 0; j < (h & ~0x7); j += 8) { + v64 row = v64_load_unaligned(&src[i * sstride + j]); + v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); + } + for (; j < h; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int v, int h) { + int i, j; + for (i = 0; i < v; i++) { + for (j = 0; j < (h & ~0x7); j += 8) { + v128 row = v128_load_unaligned(&src[i * sstride + j]); + v128_store_unaligned(&dst[i * dstride + j], row); + } + for (; j < h; j++) { + dst[i * dstride + j] = src[i * sstride + j]; + } + } +} + +#endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_ diff --git a/libs/libaom/src/av1/common/cdef_block_sse2.c b/libs/libaom/src/av1/common/cdef_block_sse2.c new file mode 100644 index 000000000..73f115d17 --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block_sse2.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_sse2 +#include "av1/common/cdef_block_simd.h" diff --git a/libs/libaom/src/av1/common/cdef_block_sse4.c b/libs/libaom/src/av1/common/cdef_block_sse4.c new file mode 100644 index 000000000..349329af6 --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block_sse4.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_sse4_1 +#include "av1/common/cdef_block_simd.h" diff --git a/libs/libaom/src/av1/common/cdef_block_ssse3.c b/libs/libaom/src/av1/common/cdef_block_ssse3.c new file mode 100644 index 000000000..3a93b150f --- /dev/null +++ b/libs/libaom/src/av1/common/cdef_block_ssse3.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_simd.h" +#define SIMD_FUNC(name) name##_ssse3 +#include "av1/common/cdef_block_simd.h" diff --git a/libs/libaom/src/av1/common/cfl.c b/libs/libaom/src/av1/common/cfl.c new file mode 100644 index 000000000..98199cb95 --- /dev/null +++ b/libs/libaom/src/av1/common/cfl.c @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/cfl.h" +#include "av1/common/common_data.h" + +#include "config/av1_rtcd.h" + +void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) { + assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); + assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE); + + memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3)); + memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3)); + cfl->subsampling_x = seq_params->subsampling_x; + cfl->subsampling_y = seq_params->subsampling_y; + cfl->are_parameters_computed = 0; + cfl->store_y = 0; + // The DC_PRED cache is disabled by default and is only enabled in + // cfl_rd_pick_alpha + cfl->use_dc_pred_cache = 0; + cfl->dc_pred_is_cached[CFL_PRED_U] = 0; + cfl->dc_pred_is_cached[CFL_PRED_V] = 0; +} + +void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, + CFL_PRED_TYPE pred_plane, int width) { + assert(pred_plane < CFL_PRED_PLANES); + assert(width <= CFL_BUF_LINE); + + if (is_cur_buf_hbd(xd)) { + uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input); + memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1); + return; + } + + memcpy(xd->cfl.dc_pred_cache[pred_plane], input, width); +} + +static void cfl_load_dc_pred_lbd(const int16_t *dc_pred_cache, uint8_t *dst, + int dst_stride, int width, int height) { + for (int j = 0; j < height; j++) { + memcpy(dst, dc_pred_cache, width); + dst += dst_stride; + } +} + +static void cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst, + int dst_stride, int width, int height) { + const size_t num_bytes = width << 1; + for (int j = 0; j < height; j++) { + memcpy(dst, dc_pred_cache, num_bytes); + dst += dst_stride; + } +} +void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, CFL_PRED_TYPE pred_plane) { + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + assert(pred_plane < CFL_PRED_PLANES); + assert(width <= CFL_BUF_LINE); + assert(height <= CFL_BUF_LINE); + if (is_cur_buf_hbd(xd)) { + uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); + cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride, + width, height); + 
return; + } + cfl_load_dc_pred_lbd(xd->cfl.dc_pred_cache[pred_plane], dst, dst_stride, + width, height); +} + +// Due to frame boundary issues, it is possible that the total area covered by +// chroma exceeds that of luma. When this happens, we fill the missing pixels by +// repeating the last columns and/or rows. +static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) { + const int diff_width = width - cfl->buf_width; + const int diff_height = height - cfl->buf_height; + + if (diff_width > 0) { + const int min_height = height - diff_height; + uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width); + for (int j = 0; j < min_height; j++) { + const uint16_t last_pixel = recon_buf_q3[-1]; + assert(recon_buf_q3 + diff_width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE); + for (int i = 0; i < diff_width; i++) { + recon_buf_q3[i] = last_pixel; + } + recon_buf_q3 += CFL_BUF_LINE; + } + cfl->buf_width = width; + } + if (diff_height > 0) { + uint16_t *recon_buf_q3 = + cfl->recon_buf_q3 + ((height - diff_height) * CFL_BUF_LINE); + for (int j = 0; j < diff_height; j++) { + const uint16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE; + assert(recon_buf_q3 + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE); + for (int i = 0; i < width; i++) { + recon_buf_q3[i] = last_row_q3[i]; + } + recon_buf_q3 += CFL_BUF_LINE; + } + cfl->buf_height = height; + } +} + +static void subtract_average_c(const uint16_t *src, int16_t *dst, int width, + int height, int round_offset, int num_pel_log2) { + int sum = round_offset; + const uint16_t *recon = src; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + sum += recon[i]; + } + recon += CFL_BUF_LINE; + } + const int avg = sum >> num_pel_log2; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = src[i] - avg; + } + src += CFL_BUF_LINE; + dst += CFL_BUF_LINE; + } +} + +CFL_SUB_AVG_FN(c) + +static INLINE int cfl_idx_to_alpha(uint8_t alpha_idx, int8_t joint_sign, + CFL_PRED_TYPE pred_type) { + const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign) + : CFL_SIGN_V(joint_sign); + if (alpha_sign == CFL_SIGN_ZERO) return 0; + const int abs_alpha_q3 = + (pred_type == CFL_PRED_U) ? CFL_IDX_U(alpha_idx) : CFL_IDX_V(alpha_idx); + return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1; +} + +static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3, int width, + int height) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = clip_pixel(get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i]); + } + dst += dst_stride; + ac_buf_q3 += CFL_BUF_LINE; + } +} + +CFL_PREDICT_FN(c, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride, + int alpha_q3, int bit_depth, int width, int height) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + dst[i] = clip_pixel_highbd( + get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i], bit_depth); + } + dst += dst_stride; + ac_buf_q3 += CFL_BUF_LINE; + } +} + +CFL_PREDICT_FN(c, hbd) +#endif + +static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) { + CFL_CTX *const cfl = &xd->cfl; + // Do not call cfl_compute_parameters multiple times on the same values.
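+  // cfl_store() resets are_parameters_computed whenever new luma pixels
+  // arrive, so the assert below catches any redundant recomputation.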
+ assert(cfl->are_parameters_computed == 0); + + cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]); + cfl_get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3); + cfl->are_parameters_computed = 1; +} + +void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, + TX_SIZE tx_size, int plane) { + CFL_CTX *const cfl = &xd->cfl; + MB_MODE_INFO *mbmi = xd->mi[0]; + assert(is_cfl_allowed(xd)); + + if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size); + + const int alpha_q3 = + cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1); + assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <= + CFL_BUF_SQUARE); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); + cfl_get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, + alpha_q3, xd->bd); + return; + } +#endif + cfl_get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3); +} + +static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + for (int j = 0; j < height; j += 2) { + for (int i = 0; i < width; i += 2) { + const int bot = i + input_stride; + output_q3[i >> 1] = + (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1; + } + input += input_stride << 1; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_422_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i += 2) { + output_q3[i >> 1] = (input[i] + input[i + 1]) << 2; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + output_q3[i] = input[i] << 3; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + for (int j = 0; j < height; j += 2) { + for (int i = 0; i < width; i += 2) { + const int bot = i + input_stride; + output_q3[i >> 1] = + (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1; + } + input += input_stride << 1; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_422_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i += 2) { + output_q3[i >> 1] = (input[i] + input[i + 1]) << 2; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} + +static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input, + int input_stride, + uint16_t *output_q3, int width, + int height) { + assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + output_q3[i] = input[i] << 3; + } + input += input_stride; + output_q3 += CFL_BUF_LINE; + } +} +#endif + +CFL_GET_SUBSAMPLE_FUNCTION(c) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size, + int sub_x, 
int sub_y) {
+  if (sub_x == 1) {
+    if (sub_y == 1) {
+      return cfl_get_luma_subsampling_420_hbd(tx_size);
+    }
+    return cfl_get_luma_subsampling_422_hbd(tx_size);
+  }
+  return cfl_get_luma_subsampling_444_hbd(tx_size);
+}
+#endif
+
+static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size,
+                                                       int sub_x, int sub_y) {
+  if (sub_x == 1) {
+    if (sub_y == 1) {
+      return cfl_get_luma_subsampling_420_lbd(tx_size);
+    }
+    return cfl_get_luma_subsampling_422_lbd(tx_size);
+  }
+  return cfl_get_luma_subsampling_444_lbd(tx_size);
+}
+
+static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride,
+                      int row, int col, TX_SIZE tx_size, int use_hbd) {
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int tx_off_log2 = MI_SIZE_LOG2;
+  const int sub_x = cfl->subsampling_x;
+  const int sub_y = cfl->subsampling_y;
+  const int store_row = row << (tx_off_log2 - sub_y);
+  const int store_col = col << (tx_off_log2 - sub_x);
+  const int store_height = height >> sub_y;
+  const int store_width = width >> sub_x;
+
+  // Invalidate current parameters
+  cfl->are_parameters_computed = 0;
+
+  // Store the dimensions of the pixel-buffer surface that was written to;
+  // this way we can manage chroma overrun (e.g. when the chroma surface goes
+  // beyond the frame boundary).
+  if (col == 0 && row == 0) {
+    cfl->buf_width = store_width;
+    cfl->buf_height = store_height;
+  } else {
+    cfl->buf_width = OD_MAXI(store_col + store_width, cfl->buf_width);
+    cfl->buf_height = OD_MAXI(store_row + store_height, cfl->buf_height);
+  }
+
+  // Check that we will remain inside the pixel buffer.
+  assert(store_row + store_height <= CFL_BUF_LINE);
+  assert(store_col + store_width <= CFL_BUF_LINE);
+
+  // Store the input into the CfL pixel buffer
+  uint16_t *recon_buf_q3 =
+      cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd) {
+    cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input),
+                                               input_stride, recon_buf_q3);
+  } else {
+    cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride,
+                                               recon_buf_q3);
+  }
+#else
+  (void)use_hbd;
+  cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride, recon_buf_q3);
+#endif
+}
+
+// Adjust the row and column of blocks smaller than 8X8, as chroma-referenced
+// and non-chroma-referenced blocks are stored together in the CfL buffer.
+static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int mi_row,
+                                        int mi_col, int *row_out,
+                                        int *col_out) {
+  // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s.
+  if ((mi_row & 0x01) && cfl->subsampling_y) {
+    assert(*row_out == 0);
+    (*row_out)++;
+  }
+
+  // Increment col index for right: 4x8, 4x16 or both right 4x4s.
+  if ((mi_col & 0x01) && cfl->subsampling_x) {
+    assert(*col_out == 0);
+    (*col_out)++;
+  }
+}
+
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
+                  BLOCK_SIZE bsize) {
+  CFL_CTX *const cfl = &xd->cfl;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2];
+
+  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
+    // Only dimensions of size 4 can have an odd offset.
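+    // For example, with 4:2:0 subsampling a 4x4 luma block subsamples to a
+    // 2x2 area of the buffer, so the four 4x4 blocks covering an 8x8 luma
+    // region tile a 4x4 area; sub8x8_adjust_offset() bumps (row, col) so that
+    // the bottom/right blocks are stored at offset 2 (1 << (MI_SIZE_LOG2 - 1))
+    // rather than overwriting offset 0.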
assert(!((col & 1) && tx_size_wide[tx_size] != 4));
+    assert(!((row & 1) && tx_size_high[tx_size] != 4));
+    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
+  }
+  cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd));
+}
+
+static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
+                                        BLOCK_SIZE plane_bsize, int plane,
+                                        TX_SIZE tx_size) {
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane)
+                              << MI_SIZE_LOG2;
+  return ALIGN_POWER_OF_TWO(max_blocks_wide, tx_size_wide_log2[tx_size]);
+}
+
+static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
+                                         BLOCK_SIZE plane_bsize, int plane,
+                                         TX_SIZE tx_size) {
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane)
+                              << MI_SIZE_LOG2;
+  return ALIGN_POWER_OF_TWO(max_blocks_high, tx_size_high_log2[tx_size]);
+}
+
+void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  CFL_CTX *const cfl = &xd->cfl;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  int row = 0;
+  int col = 0;
+
+  if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {
+    sub8x8_adjust_offset(cfl, xd->mi_row, xd->mi_col, &row, &col);
+  }
+  const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
+  const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
+  tx_size = get_tx_size(width, height);
+  cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
+            is_cur_buf_hbd(xd));
+}
diff --git a/libs/libaom/src/av1/common/cfl.h b/libs/libaom/src/av1/common/cfl.h
new file mode 100644
index 000000000..a1d6dc2ea
--- /dev/null
+++ b/libs/libaom/src/av1/common/cfl.h
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CFL_H_
+#define AOM_AV1_COMMON_CFL_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+// Can we use CfL for the current block?
+static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  assert(bsize < BLOCK_SIZES_ALL);
+  if (xd->lossless[mbmi->segment_id]) {
+    // In lossless, CfL is available when the partition size is equal to the
+    // transform size.
+    const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
+    const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
+    const int plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+    return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4);
+  }
+  // Spec: CfL is available to luma partitions less than or equal to 32x32
+  return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 &&
+                            block_size_high[bsize] <= 32);
+}
+
+// Do we need to save the luma pixels from the current block,
+// for a possible future CfL prediction?
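+// Roughly: never for monochrome; always for non-chroma-reference blocks,
+// since a later chroma-reference block may still choose CfL; otherwise only
+// when this intra block actually signals UV_CFL_PRED.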
+static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm,
+                                                  const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+
+  if (cm->seq_params.monochrome) return CFL_DISALLOWED;
+
+  if (!xd->is_chroma_ref) {
+    // For non-chroma-reference blocks, we should always store the luma pixels,
+    // in case the corresponding chroma-reference block uses CfL.
+    // Note that this can only happen for block sizes which are <8 on
+    // their shortest side, as otherwise they would be chroma reference
+    // blocks.
+    return CFL_ALLOWED;
+  }
+
+  // If this block has chroma information, we know whether we're
+  // actually going to perform a CfL prediction
+  return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) &&
+                            mbmi->uv_mode == UV_CFL_PRED);
+}
+
+static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) {
+  int scaled_luma_q6 = alpha_q3 * pred_buf_q3;
+  return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6);
+}
+
+static INLINE CFL_PRED_TYPE get_cfl_pred_type(PLANE_TYPE plane) {
+  assert(plane > 0);
+  return (CFL_PRED_TYPE)(plane - 1);
+}
+
+void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+                       TX_SIZE tx_size, int plane);
+
+void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
+                  BLOCK_SIZE bsize);
+
+void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
+                       CFL_PRED_TYPE pred_plane, int width);
+
+void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+                      TX_SIZE tx_size, CFL_PRED_TYPE pred_plane);
+
+// Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth.
+#define CFL_lbd_TYPE uint8_t *cfl_type
+#define CFL_hbd_TYPE uint16_t *cfl_type
+
+// Declare a size-specific wrapper for the size-generic function. The compiler
+// will inline the size-generic function here; the advantage is that the size
+// is a compile-time constant, allowing for loop unrolling and other
+// constant-propagation goodness.
+#define CFL_SUBSAMPLE(arch, sub, bd, width, height)                       \
+  void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch(          \
+      const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) {     \
+    cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride,    \
+                                               output_q3, width, height); \
+  }
+
+// Declare size-specific wrappers for all valid CfL sizes.
+#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd)                            \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 4)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 8)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 16)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 32)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 8)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 4)                                      \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 16)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 8)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 32)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 16)                                    \
+  CFL_SUBSAMPLE(arch, sub, bd, 4, 16)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 16, 4)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 8, 32)                                     \
+  CFL_SUBSAMPLE(arch, sub, bd, 32, 8)                                     \
+  cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_##arch( \
+      TX_SIZE tx_size) {                                                  \
+    CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd)                           \
+    return subfn_##sub[tx_size];                                          \
+  }
+
+// Declare an architecture-specific array of function pointers for
+// size-specific wrappers.
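+// For instance, CFL_SUBSAMPLE_FUNCTIONS(c, 420, lbd) above defines the
+// wrappers cfl_subsample_lbd_420_4x4_c() .. cfl_subsample_lbd_420_32x8_c()
+// plus cfl_get_luma_subsampling_420_lbd_c(), which indexes the array declared
+// below by tx_size.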
+#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd)                           \
+  static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = {          \
+    cfl_subsample_##bd##_##sub##_4x4_##arch,   /* 4x4 */                      \
+    cfl_subsample_##bd##_##sub##_8x8_##arch,   /* 8x8 */                      \
+    cfl_subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */                    \
+    cfl_subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */                    \
+    NULL,                                      /* 64x64 (invalid CFL size) */ \
+    cfl_subsample_##bd##_##sub##_4x8_##arch,   /* 4x8 */                      \
+    cfl_subsample_##bd##_##sub##_8x4_##arch,   /* 8x4 */                      \
+    cfl_subsample_##bd##_##sub##_8x16_##arch,  /* 8x16 */                     \
+    cfl_subsample_##bd##_##sub##_16x8_##arch,  /* 16x8 */                     \
+    cfl_subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */                    \
+    cfl_subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */                    \
+    NULL,                                      /* 32x64 (invalid CFL size) */ \
+    NULL,                                      /* 64x32 (invalid CFL size) */ \
+    cfl_subsample_##bd##_##sub##_4x16_##arch,  /* 4x16 */                     \
+    cfl_subsample_##bd##_##sub##_16x4_##arch,  /* 16x4 */                     \
+    cfl_subsample_##bd##_##sub##_8x32_##arch,  /* 8x32 */                     \
+    cfl_subsample_##bd##_##sub##_32x8_##arch,  /* 32x8 */                     \
+    NULL,                                      /* 16x64 (invalid CFL size) */ \
+    NULL,                                      /* 64x16 (invalid CFL size) */ \
+  };
+
+// The RTCD script does not support passing in an array, so we wrap it in this
+// function.
+#if CONFIG_AV1_HIGHBITDEPTH
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch)  \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd)
+#else
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch)  \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd)
+#endif
+
+// Declare a size-specific wrapper for the size-generic function. The compiler
+// will inline the size-generic function here; the advantage is that the size
+// is a compile-time constant, allowing for loop unrolling and other
+// constant-propagation goodness.
+#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2)       \
+  void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \
+                                                        int16_t *dst) {      \
+    subtract_average_##arch(src, dst, width, height, round_offset,           \
+                            num_pel_log2);                                   \
+  }
+
+// Declare size-specific wrappers for all valid CfL sizes.
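+// In each CFL_SUB_AVG_X() instantiation below, round_offset is half the pixel
+// count, i.e. 1 << (num_pel_log2 - 1), so the average rounds to nearest:
+// e.g. for 4x4 (16 pixels), avg = (sum + 8) >> 4.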
+#define CFL_SUB_AVG_FN(arch)                                              \
+  CFL_SUB_AVG_X(arch, 4, 4, 8, 4)                                         \
+  CFL_SUB_AVG_X(arch, 4, 8, 16, 5)                                        \
+  CFL_SUB_AVG_X(arch, 4, 16, 32, 6)                                       \
+  CFL_SUB_AVG_X(arch, 8, 4, 16, 5)                                        \
+  CFL_SUB_AVG_X(arch, 8, 8, 32, 6)                                        \
+  CFL_SUB_AVG_X(arch, 8, 16, 64, 7)                                       \
+  CFL_SUB_AVG_X(arch, 8, 32, 128, 8)                                      \
+  CFL_SUB_AVG_X(arch, 16, 4, 32, 6)                                       \
+  CFL_SUB_AVG_X(arch, 16, 8, 64, 7)                                       \
+  CFL_SUB_AVG_X(arch, 16, 16, 128, 8)                                     \
+  CFL_SUB_AVG_X(arch, 16, 32, 256, 9)                                     \
+  CFL_SUB_AVG_X(arch, 32, 8, 128, 8)                                      \
+  CFL_SUB_AVG_X(arch, 32, 16, 256, 9)                                     \
+  CFL_SUB_AVG_X(arch, 32, 32, 512, 10)                                    \
+  cfl_subtract_average_fn cfl_get_subtract_average_fn_##arch(             \
+      TX_SIZE tx_size) {                                                  \
+    static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {        \
+      cfl_subtract_average_4x4_##arch,   /* 4x4 */                        \
+      cfl_subtract_average_8x8_##arch,   /* 8x8 */                        \
+      cfl_subtract_average_16x16_##arch, /* 16x16 */                      \
+      cfl_subtract_average_32x32_##arch, /* 32x32 */                      \
+      NULL,                              /* 64x64 (invalid CFL size) */   \
+      cfl_subtract_average_4x8_##arch,   /* 4x8 */                        \
+      cfl_subtract_average_8x4_##arch,   /* 8x4 */                        \
+      cfl_subtract_average_8x16_##arch,  /* 8x16 */                       \
+      cfl_subtract_average_16x8_##arch,  /* 16x8 */                       \
+      cfl_subtract_average_16x32_##arch, /* 16x32 */                      \
+      cfl_subtract_average_32x16_##arch, /* 32x16 */                      \
+      NULL,                              /* 32x64 (invalid CFL size) */   \
+      NULL,                              /* 64x32 (invalid CFL size) */   \
+      cfl_subtract_average_4x16_##arch,  /* 4x16 */                       \
+      cfl_subtract_average_16x4_##arch,  /* 16x4 */                       \
+      cfl_subtract_average_8x32_##arch,  /* 8x32 */                       \
+      cfl_subtract_average_32x8_##arch,  /* 32x8 */                       \
+      NULL,                              /* 16x64 (invalid CFL size) */   \
+      NULL,                              /* 64x16 (invalid CFL size) */   \
+    };                                                                    \
+    /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+    /* index the function pointer array out of bounds. */                 \
+    return sub_avg[tx_size % TX_SIZES_ALL];                               \
+  }
+
+// For VSX SIMD optimization, the C versions of width == 4 subtract are
+// faster than the VSX. As such, the VSX code calls the C versions.
+void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
+void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
+
+#define CFL_PREDICT_lbd(arch, width, height)                              \
+  void cfl_predict_lbd_##width##x##height##_##arch(                       \
+      const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride,           \
+      int alpha_q3) {                                                     \
+    cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
+                           height);                                       \
+  }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define CFL_PREDICT_hbd(arch, width, height)                                  \
+  void cfl_predict_hbd_##width##x##height##_##arch(                           \
+      const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride,              \
+      int alpha_q3, int bd) {                                                 \
+    cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \
+                           height);                                           \
+  }
+#endif
+
+// This wrapper exists because clang-format does not like calling macros with
+// lowercase letters.
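+// e.g. CFL_PREDICT_X(c, 4, 4, lbd) pastes to CFL_PREDICT_lbd(c, 4, 4), which
+// in turn defines cfl_predict_lbd_4x4_c().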
+#define CFL_PREDICT_X(arch, width, height, bd) \
+  CFL_PREDICT_##bd(arch, width, height)
+
+#define CFL_PREDICT_FN(arch, bd)                                            \
+  CFL_PREDICT_X(arch, 4, 4, bd)                                             \
+  CFL_PREDICT_X(arch, 4, 8, bd)                                             \
+  CFL_PREDICT_X(arch, 4, 16, bd)                                            \
+  CFL_PREDICT_X(arch, 8, 4, bd)                                             \
+  CFL_PREDICT_X(arch, 8, 8, bd)                                             \
+  CFL_PREDICT_X(arch, 8, 16, bd)                                            \
+  CFL_PREDICT_X(arch, 8, 32, bd)                                            \
+  CFL_PREDICT_X(arch, 16, 4, bd)                                            \
+  CFL_PREDICT_X(arch, 16, 8, bd)                                            \
+  CFL_PREDICT_X(arch, 16, 16, bd)                                           \
+  CFL_PREDICT_X(arch, 16, 32, bd)                                           \
+  CFL_PREDICT_X(arch, 32, 8, bd)                                            \
+  CFL_PREDICT_X(arch, 32, 16, bd)                                           \
+  CFL_PREDICT_X(arch, 32, 32, bd)                                           \
+  cfl_predict_##bd##_fn cfl_get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \
+    static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = {               \
+      cfl_predict_##bd##_4x4_##arch,   /* 4x4 */                            \
+      cfl_predict_##bd##_8x8_##arch,   /* 8x8 */                            \
+      cfl_predict_##bd##_16x16_##arch, /* 16x16 */                          \
+      cfl_predict_##bd##_32x32_##arch, /* 32x32 */                          \
+      NULL,                            /* 64x64 (invalid CFL size) */       \
+      cfl_predict_##bd##_4x8_##arch,   /* 4x8 */                            \
+      cfl_predict_##bd##_8x4_##arch,   /* 8x4 */                            \
+      cfl_predict_##bd##_8x16_##arch,  /* 8x16 */                           \
+      cfl_predict_##bd##_16x8_##arch,  /* 16x8 */                           \
+      cfl_predict_##bd##_16x32_##arch, /* 16x32 */                          \
+      cfl_predict_##bd##_32x16_##arch, /* 32x16 */                          \
+      NULL,                            /* 32x64 (invalid CFL size) */       \
+      NULL,                            /* 64x32 (invalid CFL size) */       \
+      cfl_predict_##bd##_4x16_##arch,  /* 4x16 */                           \
+      cfl_predict_##bd##_16x4_##arch,  /* 16x4 */                           \
+      cfl_predict_##bd##_8x32_##arch,  /* 8x32 */                           \
+      cfl_predict_##bd##_32x8_##arch,  /* 32x8 */                           \
+      NULL,                            /* 16x64 (invalid CFL size) */       \
+      NULL,                            /* 64x16 (invalid CFL size) */       \
+    };                                                                      \
+    /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */   \
+    /* index the function pointer array out of bounds. */                   \
+    return pred[tx_size % TX_SIZES_ALL];                                    \
+  }
+
+#endif  // AOM_AV1_COMMON_CFL_H_
diff --git a/libs/libaom/src/av1/common/common.h b/libs/libaom/src/av1/common/common.h
new file mode 100644
index 000000000..bed6083db
--- /dev/null
+++ b/libs/libaom/src/av1/common/common.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_COMMON_H_
+#define AOM_AV1_COMMON_COMMON_H_
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include <assert.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/bitops.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PI 3.141592653589793238462643383279502884
+
+// Only need this for fixed-size arrays, for structs just assign.
+#define av1_copy(dest, src)              \
+  {                                      \
+    assert(sizeof(dest) == sizeof(src)); \
+    memcpy(dest, src, sizeof(src));      \
+  }
+
+// Use this for variably-sized arrays.
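+// (The sizeof check in av1_copy catches accidentally passing a pointer where
+// an array was intended; av1_copy_array below can only check that the element
+// types match, since the length is supplied by the caller.)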
+#define av1_copy_array(dest, src, n) \ + { \ + assert(sizeof(*(dest)) == sizeof(*(src))); \ + memcpy(dest, src, n * sizeof(*(src))); \ + } + +#define av1_zero(dest) memset(&(dest), 0, sizeof(dest)) +#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest))) + +static INLINE int get_unsigned_bits(unsigned int num_values) { + return num_values > 0 ? get_msb(num_values) + 1 : 0; +} + +#define CHECK_MEM_ERROR(cm, lval, expr) \ + AOM_CHECK_MEM_ERROR(&cm->error, lval, expr) + +#define AOM_FRAME_MARKER 0x2 + +#define AV1_MIN_TILE_SIZE_BYTES 1 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_COMMON_H_ diff --git a/libs/libaom/src/av1/common/common_data.h b/libs/libaom/src/av1/common/common_data.h new file mode 100644 index 000000000..402845caf --- /dev/null +++ b/libs/libaom/src/av1/common/common_data.h @@ -0,0 +1,446 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_COMMON_DATA_H_ +#define AOM_AV1_COMMON_COMMON_DATA_H_ + +#include "av1/common/enums.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Log 2 conversion lookup tables in units of mode info (4x4). +// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4 +}; +// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = { + 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2 +}; + +// Width/height lookup tables in units of mode info (4x4). +// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = { + 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16 +}; + +// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables). +static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = { + 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4 +}; + +// Width/height lookup tables in units of samples. +// The Block_Width table in the spec (Section 9.3. Conversion tables). +static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = { + 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, + 64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64 +}; + +// The Block_Height table in the spec (Section 9.3. Conversion tables). +static const uint8_t block_size_high[BLOCK_SIZES_ALL] = { + 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, + 32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16 +}; + +// Maps a block size to a context. +// The Size_Group table in the spec (Section 9.3. Conversion tables). 
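+// i.e. the smaller of the two log2 dimensions, capped at 3; e.g. BLOCK_8X16
+// gives AOMMIN(3, AOMMIN(1, 2)) == 1, matching the table entry below.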
+// AOMMIN(3, AOMMIN(mi_size_wide_log2(bsize), mi_size_high_log2(bsize))) +static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2 +}; + +static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = { + 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10 +}; + +// A compressed version of the Partition_Subsize table in the spec (9.3. +// Conversion tables), for square block sizes only. +/* clang-format off */ +static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = { + { // PARTITION_NONE + BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, + BLOCK_32X32, BLOCK_64X64, BLOCK_128X128 + }, { // PARTITION_HORZ + BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 + }, { // PARTITION_VERT + BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 + }, { // PARTITION_SPLIT + BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8, + BLOCK_16X16, BLOCK_32X32, BLOCK_64X64 + }, { // PARTITION_HORZ_A + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 + }, { // PARTITION_HORZ_B + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_32X16, BLOCK_64X32, BLOCK_128X64 + }, { // PARTITION_VERT_A + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 + }, { // PARTITION_VERT_B + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_16X32, BLOCK_32X64, BLOCK_64X128 + }, { // PARTITION_HORZ_4 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4, + BLOCK_32X8, BLOCK_64X16, BLOCK_INVALID + }, { // PARTITION_VERT_4 + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16, + BLOCK_8X32, BLOCK_16X64, BLOCK_INVALID + } +}; + +static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = { + // 4X4 + TX_4X4, + // 4X8, 8X4, 8X8 + TX_4X4, TX_4X4, TX_8X8, + // 8X16, 16X8, 16X16 + TX_8X8, TX_8X8, TX_16X16, + // 16X32, 32X16, 32X32 + TX_16X16, TX_16X16, TX_32X32, + // 32X64, 64X32, + TX_32X32, TX_32X32, + // 64X64 + TX_64X64, + // 64x128, 128x64, 128x128 + TX_64X64, TX_64X64, TX_64X64, + // 4x16, 16x4, 8x32 + TX_4X4, TX_4X4, TX_8X8, + // 32x8, 16x64 64x16 + TX_8X8, TX_16X16, TX_16X16 +}; + +static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES_ALL] = { + // 4X4 + TX_4X4, + // 4X8, 8X4, 8X8 + TX_4X8, TX_8X4, TX_8X8, + // 8X16, 16X8, 16X16 + TX_8X16, TX_16X8, TX_16X16, + // 16X32, 32X16, 32X32 + TX_16X32, TX_32X16, TX_32X32, + // 32X64, 64X32, + TX_32X64, TX_64X32, + // 64X64 + TX_64X64, + // 64x128, 128x64, 128x128 + TX_64X64, TX_64X64, TX_64X64, + // 4x16, 16x4, + TX_4X16, TX_16X4, + // 8x32, 32x8 + TX_8X32, TX_32X8, + // 16x64, 64x16 + TX_16X64, TX_64X16 +}; + +static const TX_TYPE_1D vtx_tab[TX_TYPES] = { + DCT_1D, ADST_1D, DCT_1D, ADST_1D, + FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D, + DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D, +}; + +static const TX_TYPE_1D htx_tab[TX_TYPES] = { + DCT_1D, DCT_1D, ADST_1D, ADST_1D, + DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D, + IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, +}; + +#define TXSIZE_CAT_INVALID (-1) + +/* clang-format on */ + +static const TX_SIZE sub_tx_size_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_4X4, // TX_8X8 + TX_8X8, // TX_16X16 + TX_16X16, // TX_32X32 + TX_32X32, // TX_64X64 + TX_4X4, // TX_4X8 + TX_4X4, // TX_8X4 + TX_8X8, // TX_8X16 + TX_8X8, // TX_16X8 + TX_16X16, // TX_16X32 + TX_16X16, // TX_32X16 + TX_32X32, // TX_32X64 + TX_32X32, // TX_64X32 + TX_4X8, // TX_4X16 + TX_8X4, // TX_16X4 + TX_8X16, // TX_8X32 + TX_16X8, // TX_32X8 
+ TX_16X32, // TX_16X64 + TX_32X16, // TX_64X16 +}; + +static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_4X4, // TX_4X8 + TX_8X8, // TX_8X4 + TX_8X8, // TX_8X16 + TX_16X16, // TX_16X8 + TX_16X16, // TX_16X32 + TX_32X32, // TX_32X16 + TX_32X32, // TX_32X64 + TX_64X64, // TX_64X32 + TX_4X4, // TX_4X16 + TX_16X16, // TX_16X4 + TX_8X8, // TX_8X32 + TX_32X32, // TX_32X8 + TX_16X16, // TX_16X64 + TX_64X64, // TX_64X16 +}; + +static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_8X8, // TX_4X8 + TX_4X4, // TX_8X4 + TX_16X16, // TX_8X16 + TX_8X8, // TX_16X8 + TX_32X32, // TX_16X32 + TX_16X16, // TX_32X16 + TX_64X64, // TX_32X64 + TX_32X32, // TX_64X32 + TX_16X16, // TX_4X16 + TX_4X4, // TX_16X4 + TX_32X32, // TX_8X32 + TX_8X8, // TX_32X8 + TX_64X64, // TX_16X64 + TX_16X16, // TX_64X16 +}; + +#define TX_SIZE_W_MIN 4 + +// Transform block width in pixels +static const int tx_size_wide[TX_SIZES_ALL] = { + 4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64, +}; + +#define TX_SIZE_H_MIN 4 + +// Transform block height in pixels +static const int tx_size_high[TX_SIZES_ALL] = { + 4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16, +}; + +// Transform block width in unit +static const int tx_size_wide_unit[TX_SIZES_ALL] = { + 1, 2, 4, 8, 16, 1, 2, 2, 4, 4, 8, 8, 16, 1, 4, 2, 8, 4, 16, +}; + +// Transform block height in unit +static const int tx_size_high_unit[TX_SIZES_ALL] = { + 1, 2, 4, 8, 16, 2, 1, 4, 2, 8, 4, 16, 8, 4, 1, 8, 2, 16, 4, +}; + +// Transform block width in log2 +static const int tx_size_wide_log2[TX_SIZES_ALL] = { + 2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6, +}; + +// Transform block height in log2 +static const int tx_size_high_log2[TX_SIZES_ALL] = { + 2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4, +}; + +static const int tx_size_2d[TX_SIZES_ALL + 1] = { + 16, 64, 256, 1024, 4096, 32, 32, 128, 128, 512, + 512, 2048, 2048, 64, 64, 256, 256, 1024, 1024, +}; + +static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = { + BLOCK_4X4, // TX_4X4 + BLOCK_8X8, // TX_8X8 + BLOCK_16X16, // TX_16X16 + BLOCK_32X32, // TX_32X32 + BLOCK_64X64, // TX_64X64 + BLOCK_4X8, // TX_4X8 + BLOCK_8X4, // TX_8X4 + BLOCK_8X16, // TX_8X16 + BLOCK_16X8, // TX_16X8 + BLOCK_16X32, // TX_16X32 + BLOCK_32X16, // TX_32X16 + BLOCK_32X64, // TX_32X64 + BLOCK_64X32, // TX_64X32 + BLOCK_4X16, // TX_4X16 + BLOCK_16X4, // TX_16X4 + BLOCK_8X32, // TX_8X32 + BLOCK_32X8, // TX_32X8 + BLOCK_16X64, // TX_16X64 + BLOCK_64X16, // TX_64X16 +}; + +static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_4X4, // TX_4X8 + TX_4X4, // TX_8X4 + TX_8X8, // TX_8X16 + TX_8X8, // TX_16X8 + TX_16X16, // TX_16X32 + TX_16X16, // TX_32X16 + TX_32X32, // TX_32X64 + TX_32X32, // TX_64X32 + TX_4X4, // TX_4X16 + TX_4X4, // TX_16X4 + TX_8X8, // TX_8X32 + TX_8X8, // TX_32X8 + TX_16X16, // TX_16X64 + TX_16X16, // TX_64X16 +}; + +static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = { + TX_4X4, // TX_4X4 + TX_8X8, // TX_8X8 + TX_16X16, // TX_16X16 + TX_32X32, // TX_32X32 + TX_64X64, // TX_64X64 + TX_8X8, // TX_4X8 + TX_8X8, // TX_8X4 + TX_16X16, // TX_8X16 + TX_16X16, // TX_16X8 + TX_32X32, // TX_16X32 + TX_32X32, // TX_32X16 + TX_64X64, // TX_32X64 + 
TX_64X64, // TX_64X32 + TX_16X16, // TX_4X16 + TX_16X16, // TX_16X4 + TX_32X32, // TX_8X32 + TX_32X32, // TX_32X8 + TX_64X64, // TX_16X64 + TX_64X64, // TX_64X16 +}; + +static const int8_t txsize_log2_minus4[TX_SIZES_ALL] = { + 0, // TX_4X4 + 2, // TX_8X8 + 4, // TX_16X16 + 6, // TX_32X32 + 6, // TX_64X64 + 1, // TX_4X8 + 1, // TX_8X4 + 3, // TX_8X16 + 3, // TX_16X8 + 5, // TX_16X32 + 5, // TX_32X16 + 6, // TX_32X64 + 6, // TX_64X32 + 2, // TX_4X16 + 2, // TX_16X4 + 4, // TX_8X32 + 4, // TX_32X8 + 5, // TX_16X64 + 5, // TX_64X16 +}; + +/* clang-format off */ +static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = { + TX_4X4, // ONLY_4X4 + TX_64X64, // TX_MODE_LARGEST + TX_64X64, // TX_MODE_SELECT +}; + +// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual +// size function). +static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = { + // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1 + // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1 + { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } }, + { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } }, + { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } }, + { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } }, + { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } }, + { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } }, + { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } }, + { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } }, + { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } }, + { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } }, + { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } }, + { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } }, + { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } }, + { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } }, + { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } }, + { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } }, + { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } }, + { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } }, + { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } }, + { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } } +}; +/* clang-format on */ + +// Generates 5 bit field in which each bit set to 1 represents +// a blocksize partition 11111 means we split 128x128, 64x64, 32x32, 16x16 +// and 8x8. 
10000 means we just split the 128x128 into 64x64.
+/* clang-format off */
+static const struct {
+  PARTITION_CONTEXT above;
+  PARTITION_CONTEXT left;
+} partition_context_lookup[BLOCK_SIZES_ALL] = {
+  { 31, 31 },  // 4X4   - {0b11111, 0b11111}
+  { 31, 30 },  // 4X8   - {0b11111, 0b11110}
+  { 30, 31 },  // 8X4   - {0b11110, 0b11111}
+  { 30, 30 },  // 8X8   - {0b11110, 0b11110}
+  { 30, 28 },  // 8X16  - {0b11110, 0b11100}
+  { 28, 30 },  // 16X8  - {0b11100, 0b11110}
+  { 28, 28 },  // 16X16 - {0b11100, 0b11100}
+  { 28, 24 },  // 16X32 - {0b11100, 0b11000}
+  { 24, 28 },  // 32X16 - {0b11000, 0b11100}
+  { 24, 24 },  // 32X32 - {0b11000, 0b11000}
+  { 24, 16 },  // 32X64 - {0b11000, 0b10000}
+  { 16, 24 },  // 64X32 - {0b10000, 0b11000}
+  { 16, 16 },  // 64X64 - {0b10000, 0b10000}
+  { 16, 0 },   // 64X128- {0b10000, 0b00000}
+  { 0, 16 },   // 128X64- {0b00000, 0b10000}
+  { 0, 0 },    // 128X128-{0b00000, 0b00000}
+  { 31, 28 },  // 4X16  - {0b11111, 0b11100}
+  { 28, 31 },  // 16X4  - {0b11100, 0b11111}
+  { 30, 24 },  // 8X32  - {0b11110, 0b11000}
+  { 24, 30 },  // 32X8  - {0b11000, 0b11110}
+  { 28, 16 },  // 16X64 - {0b11100, 0b10000}
+  { 16, 28 },  // 64X16 - {0b10000, 0b11100}
+};
+/* clang-format on */
+
+static const int intra_mode_context[INTRA_MODES] = {
+  0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0,
+};
+
+// Note: this is also used in unit tests. So whenever one changes the table,
+// the unit tests need to be changed accordingly.
+static const int quant_dist_weight[4][2] = {
+  { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE }
+};
+static const int quant_dist_lookup_table[2][4][2] = {
+  { { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 } },
+  { { 7, 9 }, { 5, 11 }, { 4, 12 }, { 3, 13 } },
+};
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_COMMON_DATA_H_
diff --git a/libs/libaom/src/av1/common/convolve.c b/libs/libaom/src/av1/common/convolve.c
new file mode 100644
index 000000000..e177e3cad
--- /dev/null
+++ b/libs/libaom/src/av1/common/convolve.c
@@ -0,0 +1,1274 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             const int16_t *x_filters, int x0_qn,
+                             int x_step_qn) {
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+  for (int y = 0; y < h; ++y) {
+    int x_qn = x0_qn;
+    for (int x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
+      const int x_filter_idx =
+          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+      assert(x_filter_idx <= RS_SUBPEL_MASK);
+      const int16_t *const x_filter =
+          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
+      int sum = 0;
+      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      x_qn += x_step_qn;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    const int16_t *x_filters, int x0_qn,
+                                    int x_step_qn, int bd) {
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+  for (int y = 0; y < h; ++y) {
+    int x_qn = x0_qn;
+    for (int x = 0; x < w; ++x) {
+      const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
+      const int x_filter_idx =
+          (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+      assert(x_filter_idx <= RS_SUBPEL_MASK);
+      const int16_t *const x_filter =
+          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
+      int sum = 0;
+      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+      x_qn += x_step_qn;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
+                               int dst_stride, int w, int h, int dir,
+                               double norm) {
+  int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+  DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 };
+  DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 };
+  const int taps = 3;
+  int im_h = h + taps - 1;
+  int im_stride = w;
+  const int fo_vert = 1;
+  const int fo_horiz = 1;
+
+  // horizontal filter
+  const uint8_t *src_horiz = src - fo_vert * src_stride;
+  const int16_t *x_filter = dir ? sobel_a : sobel_b;
+  for (int y = 0; y < im_h; ++y) {
+    for (int x = 0; x < w; ++x) {
+      int16_t sum = 0;
+      for (int k = 0; k < taps; ++k) {
+        sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+      }
+      im_block[y * im_stride + x] = sum;
+    }
+  }
+
+  // vertical filter
+  int16_t *src_vert = im_block + fo_vert * im_stride;
+  const int16_t *y_filter = dir ?
sobel_b : sobel_a; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int16_t sum = 0; + for (int k = 0; k < taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + dst[y * dst_stride + x] = sum * norm; + } + } +} + +void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); + } + } +} + +void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int fo_vert = filter_params_y->taps / 2 - 1; + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS)); + } + } +} + +void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int 
subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + (void)filter_params_y; + (void)subpel_y_qn; + (void)conv_params; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = ROUND_POWER_OF_TWO(res, conv_params->round_0); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits)); + } + } +} + +void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + + for (int y = 0; y < h; ++y) { + memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); + } +} + +void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + 
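+        // Plain compound average: (dst16 + res) >> 1. This is the
+        // equal-weight case of the dist-wtd branch above, whose fwd/bck
+        // offset pairs sum to 16 (see quant_dist_lookup_table), i.e.
+        // 1 << DIST_PRECISION_BITS.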
tmp += res; + tmp = tmp >> 1; + } + tmp -= (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + (void)filter_params_x; + (void)subpel_x_qn; + + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + res *= (1 << bits); + res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + (void)filter_params_y; + (void)subpel_y_qn; + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= 
round_offset; + dst[y * dst_stride + x] = + clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + CONV_BUF_TYPE res = src[y * src_stride + x] << bits; + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bd = 8; + + // horizontal filter + const uint8_t *src_horiz = src - fo_vert * src_stride; + for (int y = 0; y < im_h; ++y) { + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; + const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(x_filter_idx < SUBPEL_SHIFTS); + const int16_t *x_filter = + av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_x[k - fo_horiz]; + } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + src_horiz += src_stride; + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int x = 0; x < w; ++x) { + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; + const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) 
>> SCALE_EXTRA_BITS;
+      assert(y_filter_idx < SUBPEL_SHIFTS);
+      const int16_t *y_filter =
+          av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < filter_params_y->taps; ++k) {
+        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
+      }
+      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          int32_t tmp = dst16[y * dst16_stride + x];
+          if (conv_params->use_dist_wtd_comp_avg) {
+            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+            tmp = tmp >> DIST_PRECISION_BITS;
+          } else {
+            tmp += res;
+            tmp = tmp >> 1;
+          }
+          /* Subtract round offset and convolve round */
+          tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+                       (1 << (offset_bits - conv_params->round_1 - 1)));
+          dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+        } else {
+          dst16[y * dst16_stride + x] = res;
+        }
+      } else {
+        /* Subtract round offset and convolve round */
+        int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+                             (1 << (offset_bits - conv_params->round_1 - 1)));
+        dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+      }
+    }
+    src_vert++;
+  }
+}
+
+static void convolve_2d_scale_wrapper(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+    ConvolveParams *conv_params) {
+  if (conv_params->is_compound) {
+    assert(conv_params->dst != NULL);
+  }
+  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                        filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
+                        y_step_qn, conv_params);
+}
+
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            const InterpFilterParams *interp_filters[2],
+                            const int subpel_x_qn, int x_step_q4,
+                            const int subpel_y_qn, int y_step_q4, int scaled,
+                            ConvolveParams *conv_params,
+                            const struct scale_factors *sf) {
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)dst;
+  (void)dst_stride;
+
+  const InterpFilterParams *filter_params_x = interp_filters[0];
+  const InterpFilterParams *filter_params_y = interp_filters[1];
+
+  // TODO(jingning, yunqing): Add SIMD support to 2-tap filter case.
+  // Do we have SIMD support for the 4-tap case?
+  // 2-tap filter indicates that it is for IntraBC.
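+  // (With both subpel components zero, the 2-tap case falls through to the
+  // sf->convolve[0][0][is_compound] copy dispatch below.)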
+ if (filter_params_x->taps == 2 || filter_params_y->taps == 2) { + assert(filter_params_x->taps == 2 && filter_params_y->taps == 2); + assert(!scaled); + if (subpel_x_qn && subpel_y_qn) { + av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } else if (subpel_x_qn) { + av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } else if (subpel_y_qn) { + av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params); + return; + } + } + + if (scaled) { + convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + x_step_q4, subpel_y_qn, y_step_q4, conv_params); + } else { + sf->convolve[subpel_x_qn != 0][subpel_y_qn != 0][conv_params->is_compound]( + src, src_stride, dst, dst_stride, w, h, filter_params_x, + filter_params_y, subpel_x_qn, subpel_y_qn, conv_params); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_convolve_2d_copy_sr_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + (void)bd; + + for (int y = 0; y < h; ++y) { + memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); + } +} + +void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + (void)filter_params_y; + (void)subpel_y_qn; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = ROUND_POWER_OF_TWO(res, conv_params->round_0); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); + } + } +} + +void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + const int fo_vert = filter_params_y->taps / 2 - 1; + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + 
for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd); + } + } +} + +void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE); + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); + + // horizontal filter + const uint16_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < im_h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + im_block[y * im_stride + x] = + ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - + ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd); + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + int x, y, k; + int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + int im_h = h + filter_params_y->taps - 1; + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + + // horizontal filter + const uint16_t *src_horiz = src - fo_vert * src_stride; + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (y = 0; y < im_h; ++y) { + for (x = 0; x < w; ++x) { + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k]; + } + assert(0 <= sum && 
sum < (1 << (bd + FILTER_BITS + 1))); + (void)bd; + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int32_t sum = 1 << offset_bits; + for (k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x]; + } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_x_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + (void)filter_params_y; + (void)subpel_y_qn; + assert(bits >= 0); + // horizontal filter + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_x->taps; ++k) { + res += x_filter[k] * src[y * src_stride + x - fo_horiz + k]; + } + res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0); + res += round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_y_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int bits = FILTER_BITS - conv_params->round_0; + const int 
offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + assert(round_bits >= 0); + (void)filter_params_x; + (void)subpel_x_qn; + assert(bits >= 0); + // vertical filter + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + int32_t res = 0; + for (int k = 0; k < filter_params_y->taps; ++k) { + res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x]; + } + res *= (1 << bits); + res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset; + + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_copy_c( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst16 = conv_params->dst; + int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + assert(bits >= 0); + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + + for (int y = 0; y < h; ++y) { + for (int x = 0; x < w; ++x) { + CONV_BUF_TYPE res = src[y * src_stride + x] << bits; + res += round_offset; + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + tmp -= round_offset; + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } + } +} + +void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params, int bd) { + int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + int im_stride = w; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + assert(bits >= 0); + // horizontal filter + const uint16_t *src_horiz = src - fo_vert * src_stride; + 
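  // Scaled path: for each output column, x_qn advances by x_step_qn in
+  // units of 1/(1 << SCALE_SUBPEL_BITS) pel; the high bits of x_qn select
+  // the source sample and the low bits select the subpel kernel phase. +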
for (int y = 0; y < im_h; ++y) { + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)]; + const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(x_filter_idx < SUBPEL_SHIFTS); + const int16_t *x_filter = + av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx); + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < filter_params_x->taps; ++k) { + sum += x_filter[k] * src_x[k - fo_horiz]; + } + assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1))); + im_block[y * im_stride + x] = + (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0); + } + src_horiz += src_stride; + } + + // vertical filter + int16_t *src_vert = im_block + fo_vert * im_stride; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + for (int x = 0; x < w; ++x) { + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride]; + const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(y_filter_idx < SUBPEL_SHIFTS); + const int16_t *y_filter = + av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx); + int32_t sum = 1 << offset_bits; + for (int k = 0; k < filter_params_y->taps; ++k) { + sum += y_filter[k] * src_y[(k - fo_vert) * im_stride]; + } + assert(0 <= sum && sum < (1 << (offset_bits + 2))); + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } + } + src_vert++; + } +} + +void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride, + uint8_t *dst8, int dst_stride, int w, int h, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, + int scaled, ConvolveParams *conv_params, + const struct scale_factors *sf, int bd) { + (void)x_step_q4; + (void)y_step_q4; + (void)dst_stride; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + + const int need_filter_params_x = (subpel_x_qn != 0) | scaled; + const int need_filter_params_y = (subpel_y_qn != 0) | scaled; + const InterpFilterParams *filter_params_x = + need_filter_params_x ? interp_filters[0] : NULL; + const InterpFilterParams *filter_params_y = + need_filter_params_y ? 
        interp_filters[1] : NULL;
+
+  if (scaled) {
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+    if (conv_params->is_compound) {
+      assert(conv_params->dst != NULL);
+    }
+    av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
+                                 filter_params_x, filter_params_y, subpel_x_qn,
+                                 x_step_q4, subpel_y_qn, y_step_q4, conv_params,
+                                 bd);
+  } else {
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+    sf->highbd_convolve[subpel_x_qn != 0][subpel_y_qn !=
+                                          0][conv_params->is_compound](
+        src, src_stride, dst, dst_stride, w, h, filter_params_x,
+        filter_params_y, subpel_x_qn, subpel_y_qn, conv_params, bd);
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+// Note: Fixed-size intermediate buffers place limits on the parameters of
+// some functions. 2d filtering proceeds in 2 steps:
+//   (1) Interpolate horizontally into an intermediate buffer, temp.
+//   (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (263):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 128x128 pixels.
+// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
+//   original frame (in 1/16th pixel units).
+// --Must round up because the block may be located at a sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
+#define WIENER_MAX_EXT_SIZE 263
+
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+  return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int highbd_horz_scalar_product(const uint16_t *a,
+                                             const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+  return sum;
+}
+#endif
+
+static INLINE int highbd_vert_scalar_product(const uint16_t *a,
+                                             ptrdiff_t a_stride,
+                                             const int16_t *b) {
+  int sum = 0;
+  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+  return sum;
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: This assumes that the filter table is 256-byte aligned.
+  // TODO(agrange) Modify to make independent of table alignment.
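+  // The 256-byte alignment holds because the table stores SUBPEL_SHIFTS (16)
+  // phases of SUBPEL_TAPS (8) int16_t coefficients = 256 bytes, so clearing
+  // the low 8 address bits recovers the base of the table holding 'filter'.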
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, + int round0_bits) { + const int bd = 8; + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (int x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + const int sum = horz_scalar_product(src_x, x_filter) + rounding; + dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, + WIENER_CLAMP_LIMIT(round0_bits, bd) - 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, + int round1_bits) { + const int bd = 8; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int rounding = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + round1_bits - 1)); + const int sum = + highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const ConvolveParams *conv_params) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4, + x_step_q4, w, intermediate_height, + conv_params->round_0); + convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, conv_params->round_1); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_convolve_add_src_horiz_hip( + const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int round0_bits, int bd) { + const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd); + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + src -= SUBPEL_TAPS / 2 - 1; + for (int y = 0; y < h; ++y) { + int x_q4 = 
x0_q4; + for (int x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding; + dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0, + extraprec_clamp_limit - 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_add_src_vert_hip( + const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int round1_bits, int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (int x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (int y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + const int rounding = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + round1_bits - 1)); + const int sum = + highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +void av1_highbd_wiener_convolve_add_src_c( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const ConvolveParams *conv_params, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + + highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, filters_x, + x0_q4, x_step_q4, w, intermediate_height, + conv_params->round_0, bd); + highbd_convolve_add_src_vert_hip( + temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride, + filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd); +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/av1/common/convolve.h b/libs/libaom/src/av1/common/convolve.h new file mode 100644 index 000000000..04df86c42 --- /dev/null +++ b/libs/libaom/src/av1/common/convolve.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_CONVOLVE_H_ +#define AOM_AV1_COMMON_CONVOLVE_H_ +#include "av1/common/filter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint16_t CONV_BUF_TYPE; +typedef struct ConvolveParams { + int do_average; + CONV_BUF_TYPE *dst; + int dst_stride; + int round_0; + int round_1; + int plane; + int is_compound; + int compound_index; // 0: the first single in compound mode, 1: the second. + int use_dist_wtd_comp_avg; + int fwd_offset; + int bck_offset; +} ConvolveParams; + +#define ROUND0_BITS 3 +#define COMPOUND_ROUND1_BITS 7 +#define WIENER_ROUND0_BITS 3 + +#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - r0)) + +typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params); + +typedef void (*aom_highbd_convolve_fn_t)( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd); + +struct AV1Common; +struct scale_factors; + +void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *interp_filters[2], + const int subpel_x_qn, int x_step_q4, + const int subpel_y_qn, int y_step_q4, int scaled, + ConvolveParams *conv_params, + const struct scale_factors *sf); + +static INLINE ConvolveParams get_conv_params_no_round(int cmp_index, int plane, + CONV_BUF_TYPE *dst, + int dst_stride, + int is_compound, int bd) { + ConvolveParams conv_params; + conv_params.compound_index = cmp_index; + assert(IMPLIES(cmp_index, is_compound)); + + conv_params.is_compound = is_compound; + conv_params.round_0 = ROUND0_BITS; + conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS + : 2 * FILTER_BITS - conv_params.round_0; + const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2; + assert(IMPLIES(bd < 12, intbufrange <= 16)); + if (intbufrange > 16) { + conv_params.round_0 += intbufrange - 16; + if (!is_compound) conv_params.round_1 -= intbufrange - 16; + } + // TODO(yunqing): The following dst should only be valid while + // is_compound = 1; + conv_params.dst = dst; + conv_params.dst_stride = dst_stride; + conv_params.plane = plane; + + // By default, set do average to 1 if this is the second single prediction + // in a compound mode. 
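+  // (cmp_index == 1 is the second prediction of a compound pair; it is
+  // averaged with the first one already accumulated in the CONV_BUF_TYPE
+  // dst buffer.)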
+  conv_params.do_average = cmp_index;
+  return conv_params;
+}
+
+static INLINE ConvolveParams get_conv_params(int do_average, int plane,
+                                             int bd) {
+  return get_conv_params_no_round(do_average, plane, NULL, 0, 0, bd);
+}
+
+static INLINE ConvolveParams get_conv_params_wiener(int bd) {
+  ConvolveParams conv_params;
+  (void)bd;
+  conv_params.do_average = 0;
+  conv_params.is_compound = 0;
+  conv_params.round_0 = WIENER_ROUND0_BITS;
+  conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0;
+  const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
+  assert(IMPLIES(bd < 12, intbufrange <= 16));
+  if (intbufrange > 16) {
+    conv_params.round_0 += intbufrange - 16;
+    conv_params.round_1 -= intbufrange - 16;
+  }
+  conv_params.dst = NULL;
+  conv_params.dst_stride = 0;
+  conv_params.plane = 0;
+  return conv_params;
+}
+
+void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
+                                   uint8_t *dst, int dst_stride, int w, int h,
+                                   const InterpFilterParams *interp_filters[2],
+                                   const int subpel_x_qn, int x_step_q4,
+                                   const int subpel_y_qn, int y_step_q4,
+                                   int scaled, ConvolveParams *conv_params,
+                                   const struct scale_factors *sf, int bd);
+
+// TODO(sarahparker) This will need to be integerized and optimized
+void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
+                               int dst_stride, int w, int h, int dir,
+                               double norm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_CONVOLVE_H_
diff --git a/libs/libaom/src/av1/common/debugmodes.c b/libs/libaom/src/av1/common/debugmodes.c
new file mode 100644
index 000000000..ff02ddde0
--- /dev/null
+++ b/libs/libaom/src/av1/common/debugmodes.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
+  fprintf(f, "%s", str);
+  fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_frame.frame_number,
+          cm->show_frame, cm->quant_params.base_qindex);
+}
+/* This function dereferences a pointer to the mbmi structure
+ * and uses the passed-in member offset to print out the value of an integer
+ * for each mbmi member value in the mi structure.
+ */
+static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor,
+                          size_t member_offset) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  MB_MODE_INFO **mi = mi_params->mi_grid_base;
+  int rows = mi_params->mi_rows;
+  int cols = mi_params->mi_cols;
+  char prefix = descriptor[0];
+
+  log_frame_info(cm, descriptor, file);
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(file, "%c ", prefix);
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(file, "%2d ", *((char *)((char *)(mi[0]) + member_offset)));
+      mi++;
+    }
+    fprintf(file, "\n");
+    mi += mi_params->mi_stride - cols;
+  }
+  fprintf(file, "\n");
+}
+
+void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
+  CommonModeInfoParams *mi_params = &cm->mi_params;
+  FILE *mvs = fopen(file, "a");
+  MB_MODE_INFO **mi = mi_params->mi_grid_base;
+  const int rows = mi_params->mi_rows;
+  const int cols = mi_params->mi_cols;
+
+  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
+  print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
+  print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
+  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
+  print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+
+  // output skip information.
+  log_frame_info(cm, "Skips:", mvs);
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "S ");
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[0]->skip);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    mi += mi_params->mi_stride - cols;
+  }
+  fprintf(mvs, "\n");
+
+  // output motion vectors.
+  log_frame_info(cm, "Vectors ", mvs);
+  mi = mi_params->mi_grid_base;
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "V ");
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    mi += mi_params->mi_stride - cols;
+  }
+  fprintf(mvs, "\n");
+
+  fclose(mvs);
+}
+
+void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
+                                         const char *filename) {
+  FILE *hdrFile = fopen(filename, "w");
+  fwrite(data, size, sizeof(uint8_t), hdrFile);
+
+  // Reset order hints (7 bits + a previous bit) to 0, so that all camera
+  // frame headers are identical in large-scale coding.
+  uint8_t zero = 0;
+  fseek(hdrFile, 1, SEEK_SET);
+  // Reset second byte.
+  fwrite(&zero, 1, sizeof(uint8_t), hdrFile);
+  fclose(hdrFile);
+}
+
+void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename) {
+  FILE *fcFile = fopen(filename, "w");
+  const uint16_t *fcp = (uint16_t *)fc;
+  const unsigned int n_contexts = sizeof(FRAME_CONTEXT) / sizeof(uint16_t);
+  unsigned int i;
+
+  for (i = 0; i < n_contexts; ++i) fprintf(fcFile, "%d ", *fcp++);
+  fclose(fcFile);
+}
diff --git a/libs/libaom/src/av1/common/entropy.c b/libs/libaom/src/av1/common/entropy.c
new file mode 100644
index 000000000..1f7a0efe0
--- /dev/null
+++ b/libs/libaom/src/av1/common/entropy.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/scan.h" +#include "av1/common/token_cdfs.h" +#include "av1/common/txb_common.h" + +static int get_q_ctx(int q) { + if (q <= 20) return 0; + if (q <= 60) return 1; + if (q <= 120) return 2; + return 3; +} + +void av1_default_coef_probs(AV1_COMMON *cm) { + const int index = get_q_ctx(cm->quant_params.base_qindex); +#if CONFIG_ENTROPY_STATS + cm->coef_cdf_category = index; +#endif + + av1_copy(cm->fc->txb_skip_cdf, av1_default_txb_skip_cdfs[index]); + av1_copy(cm->fc->eob_extra_cdf, av1_default_eob_extra_cdfs[index]); + av1_copy(cm->fc->dc_sign_cdf, av1_default_dc_sign_cdfs[index]); + av1_copy(cm->fc->coeff_br_cdf, av1_default_coeff_lps_multi_cdfs[index]); + av1_copy(cm->fc->coeff_base_cdf, av1_default_coeff_base_multi_cdfs[index]); + av1_copy(cm->fc->coeff_base_eob_cdf, + av1_default_coeff_base_eob_multi_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf16, av1_default_eob_multi16_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf32, av1_default_eob_multi32_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf64, av1_default_eob_multi64_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf128, av1_default_eob_multi128_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf256, av1_default_eob_multi256_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf512, av1_default_eob_multi512_cdfs[index]); + av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[index]); +} + +static AOM_INLINE void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, + int num_cdfs, int cdf_stride, + int nsymbs) { + for (int i = 0; i < num_cdfs; i++) { + cdf_ptr[i * cdf_stride + nsymbs] = 0; + } +} + +#define RESET_CDF_COUNTER(cname, nsymbs) \ + RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs)) + +#define RESET_CDF_COUNTER_STRIDE(cname, nsymbs, cdf_stride) \ + do { \ + aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)cname; \ + int array_size = (int)sizeof(cname) / sizeof(aom_cdf_prob); \ + int num_cdfs = array_size / cdf_stride; \ + reset_cdf_symbol_counter(cdf_ptr, num_cdfs, cdf_stride, nsymbs); \ + } while (0) + +static AOM_INLINE void reset_nmv_counter(nmv_context *nmv) { + RESET_CDF_COUNTER(nmv->joints_cdf, 4); + for (int i = 0; i < 2; i++) { + RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES); + RESET_CDF_COUNTER(nmv->comps[i].class0_fp_cdf, MV_FP_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].fp_cdf, MV_FP_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].sign_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].class0_hp_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].hp_cdf, 2); + RESET_CDF_COUNTER(nmv->comps[i].class0_cdf, CLASS0_SIZE); + RESET_CDF_COUNTER(nmv->comps[i].bits_cdf, 2); + } +} + +void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) { + RESET_CDF_COUNTER(fc->txb_skip_cdf, 2); + RESET_CDF_COUNTER(fc->eob_extra_cdf, 2); + RESET_CDF_COUNTER(fc->dc_sign_cdf, 2); + RESET_CDF_COUNTER(fc->eob_flag_cdf16, 5); + RESET_CDF_COUNTER(fc->eob_flag_cdf32, 6); + RESET_CDF_COUNTER(fc->eob_flag_cdf64, 7); + RESET_CDF_COUNTER(fc->eob_flag_cdf128, 8); + RESET_CDF_COUNTER(fc->eob_flag_cdf256, 9); + RESET_CDF_COUNTER(fc->eob_flag_cdf512, 10); + RESET_CDF_COUNTER(fc->eob_flag_cdf1024, 11); + RESET_CDF_COUNTER(fc->coeff_base_eob_cdf, 3); + 
RESET_CDF_COUNTER(fc->coeff_base_cdf, 4); + RESET_CDF_COUNTER(fc->coeff_br_cdf, BR_CDF_SIZE); + RESET_CDF_COUNTER(fc->newmv_cdf, 2); + RESET_CDF_COUNTER(fc->zeromv_cdf, 2); + RESET_CDF_COUNTER(fc->refmv_cdf, 2); + RESET_CDF_COUNTER(fc->drl_cdf, 2); + RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES); + RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES); + RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16); + RESET_CDF_COUNTER(fc->interintra_cdf, 2); + RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2); + RESET_CDF_COUNTER(fc->interintra_mode_cdf, INTERINTRA_MODES); + RESET_CDF_COUNTER(fc->motion_mode_cdf, MOTION_MODES); + RESET_CDF_COUNTER(fc->obmc_cdf, 2); + RESET_CDF_COUNTER(fc->palette_y_size_cdf, PALETTE_SIZES); + RESET_CDF_COUNTER(fc->palette_uv_size_cdf, PALETTE_SIZES); + for (int j = 0; j < PALETTE_SIZES; j++) { + int nsymbs = j + PALETTE_MIN_SIZE; + RESET_CDF_COUNTER_STRIDE(fc->palette_y_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + RESET_CDF_COUNTER_STRIDE(fc->palette_uv_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + } + RESET_CDF_COUNTER(fc->palette_y_mode_cdf, 2); + RESET_CDF_COUNTER(fc->palette_uv_mode_cdf, 2); + RESET_CDF_COUNTER(fc->comp_inter_cdf, 2); + RESET_CDF_COUNTER(fc->single_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_ref_type_cdf, 2); + RESET_CDF_COUNTER(fc->uni_comp_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_ref_cdf, 2); + RESET_CDF_COUNTER(fc->comp_bwdref_cdf, 2); + RESET_CDF_COUNTER(fc->txfm_partition_cdf, 2); + RESET_CDF_COUNTER(fc->compound_index_cdf, 2); + RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2); + RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2); + RESET_CDF_COUNTER(fc->skip_cdfs, 2); + RESET_CDF_COUNTER(fc->intra_inter_cdf, 2); + reset_nmv_counter(&fc->nmvc); + reset_nmv_counter(&fc->ndvc); + RESET_CDF_COUNTER(fc->intrabc_cdf, 2); + RESET_CDF_COUNTER(fc->seg.tree_cdf, MAX_SEGMENTS); + RESET_CDF_COUNTER(fc->seg.pred_cdf, 2); + RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS); + RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2); + RESET_CDF_COUNTER(fc->filter_intra_mode_cdf, FILTER_INTRA_MODES); + RESET_CDF_COUNTER(fc->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES); + RESET_CDF_COUNTER(fc->wiener_restore_cdf, 2); + RESET_CDF_COUNTER(fc->sgrproj_restore_cdf, 2); + RESET_CDF_COUNTER(fc->y_mode_cdf, INTRA_MODES); + RESET_CDF_COUNTER_STRIDE(fc->uv_mode_cdf[0], UV_INTRA_MODES - 1, + CDF_SIZE(UV_INTRA_MODES)); + RESET_CDF_COUNTER(fc->uv_mode_cdf[1], UV_INTRA_MODES); + for (int i = 0; i < PARTITION_CONTEXTS; i++) { + if (i < 4) { + RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 4, CDF_SIZE(10)); + } else if (i < 16) { + RESET_CDF_COUNTER(fc->partition_cdf[i], 10); + } else { + RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 8, CDF_SIZE(10)); + } + } + RESET_CDF_COUNTER(fc->switchable_interp_cdf, SWITCHABLE_FILTERS); + RESET_CDF_COUNTER(fc->kf_y_cdf, INTRA_MODES); + RESET_CDF_COUNTER(fc->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1); + RESET_CDF_COUNTER_STRIDE(fc->tx_size_cdf[0], MAX_TX_DEPTH, + CDF_SIZE(MAX_TX_DEPTH + 1)); + RESET_CDF_COUNTER(fc->tx_size_cdf[1], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->tx_size_cdf[2], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->tx_size_cdf[3], MAX_TX_DEPTH + 1); + RESET_CDF_COUNTER(fc->delta_q_cdf, DELTA_Q_PROBS + 1); + RESET_CDF_COUNTER(fc->delta_lf_cdf, DELTA_LF_PROBS + 1); + for (int i = 0; i < FRAME_LF_COUNT; i++) { + RESET_CDF_COUNTER(fc->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1); + } + RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES)); + 
RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES)); + RESET_CDF_COUNTER(fc->cfl_sign_cdf, CFL_JOINT_SIGNS); + RESET_CDF_COUNTER(fc->cfl_alpha_cdf, CFL_ALPHABET_SIZE); +} diff --git a/libs/libaom/src/av1/common/entropy.h b/libs/libaom/src/av1/common/entropy.h new file mode 100644 index 000000000..ee78f56a3 --- /dev/null +++ b/libs/libaom/src/av1/common/entropy.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ENTROPY_H_ +#define AOM_AV1_COMMON_ENTROPY_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/prob.h" + +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define TOKEN_CDF_Q_CTXS 4 + +#define TXB_SKIP_CONTEXTS 13 + +#define EOB_COEF_CONTEXTS 9 + +#define SIG_COEF_CONTEXTS_2D 26 +#define SIG_COEF_CONTEXTS_1D 16 +#define SIG_COEF_CONTEXTS_EOB 4 +#define SIG_COEF_CONTEXTS (SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D) + +#define COEFF_BASE_CONTEXTS (SIG_COEF_CONTEXTS) +#define DC_SIGN_CONTEXTS 3 + +#define BR_TMP_OFFSET 12 +#define BR_REF_CAT 4 +#define LEVEL_CONTEXTS 21 + +#define NUM_BASE_LEVELS 2 + +#define BR_CDF_SIZE (4) +#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1)) + +#define COEFF_CONTEXT_BITS 3 +#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1) +#define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1) + +#define BASE_CONTEXT_POSITION_NUM 12 + +enum { + TX_CLASS_2D = 0, + TX_CLASS_HORIZ = 1, + TX_CLASS_VERT = 2, + TX_CLASSES = 3, +} UENUM1BYTE(TX_CLASS); + +#define DCT_MAX_VALUE 16384 +#define DCT_MAX_VALUE_HIGH10 65536 +#define DCT_MAX_VALUE_HIGH12 262144 + +/* Coefficients are predicted via a 3-dimensional probability table indexed on + * REF_TYPES, COEF_BANDS and COEF_CONTEXTS. 
*/ +#define REF_TYPES 2 // intra=0, inter=1 + +struct AV1Common; +struct frame_contexts; +void av1_reset_cdf_symbol_counters(struct frame_contexts *fc); +void av1_default_coef_probs(struct AV1Common *cm); + +struct frame_contexts; + +typedef char ENTROPY_CONTEXT; + +static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, + ENTROPY_CONTEXT b) { + return (a != 0) + (b != 0); +} + +static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, + const ENTROPY_CONTEXT *l) { + ENTROPY_CONTEXT above_ec = 0, left_ec = 0; + + switch (tx_size) { + case TX_4X4: + above_ec = a[0] != 0; + left_ec = l[0] != 0; + break; + case TX_4X8: + above_ec = a[0] != 0; + left_ec = !!*(const uint16_t *)l; + break; + case TX_8X4: + above_ec = !!*(const uint16_t *)a; + left_ec = l[0] != 0; + break; + case TX_8X16: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_16X8: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X32: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + case TX_32X16: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_8X8: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X16: + above_ec = !!*(const uint32_t *)a; + left_ec = !!*(const uint32_t *)l; + break; + case TX_32X32: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + case TX_64X64: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_32X64: + above_ec = !!*(const uint64_t *)a; + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_64X32: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!*(const uint64_t *)l; + break; + case TX_4X16: + above_ec = a[0] != 0; + left_ec = !!*(const uint32_t *)l; + break; + case TX_16X4: + above_ec = !!*(const uint32_t *)a; + left_ec = l[0] != 0; + break; + case TX_8X32: + above_ec = !!*(const uint16_t *)a; + left_ec = !!*(const uint64_t *)l; + break; + case TX_32X8: + above_ec = !!*(const uint64_t *)a; + left_ec = !!*(const uint16_t *)l; + break; + case TX_16X64: + above_ec = !!*(const uint32_t *)a; + left_ec = !!(*(const uint64_t *)l | *(const uint64_t *)(l + 8)); + break; + case TX_64X16: + above_ec = !!(*(const uint64_t *)a | *(const uint64_t *)(a + 8)); + left_ec = !!*(const uint32_t *)l; + break; + default: assert(0 && "Invalid transform size."); break; + } + return combine_entropy_contexts(above_ec, left_ec); +} + +static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) { + return (TX_SIZE)((txsize_sqr_map[txsize] + txsize_sqr_up_map[txsize] + 1) >> + 1); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENTROPY_H_ diff --git a/libs/libaom/src/av1/common/entropymode.c b/libs/libaom/src/av1/common/entropymode.c new file mode 100644 index 000000000..5f061be35 --- /dev/null +++ b/libs/libaom/src/av1/common/entropymode.c @@ -0,0 +1,1103 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_mem/aom_mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/reconinter.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" +#include "av1/common/txb_common.h" + +static const aom_cdf_prob + default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][CDF_SIZE( + INTRA_MODES)] = { + { { AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244, + 24189, 28165, 29093, 30466) }, + { AOM_CDF13(12016, 18066, 19516, 20303, 20719, 21444, 21888, 23032, + 24434, 28658, 30172, 31409) }, + { AOM_CDF13(10052, 10771, 22296, 22788, 23055, 23239, 24133, 25620, + 26160, 29336, 29929, 31567) }, + { AOM_CDF13(14091, 15406, 16442, 18808, 19136, 19546, 19998, 22096, + 24746, 29585, 30958, 32462) }, + { AOM_CDF13(12122, 13265, 15603, 16501, 18609, 20033, 22391, 25583, + 26437, 30261, 31073, 32475) } }, + { { AOM_CDF13(10023, 19585, 20848, 21440, 21832, 22760, 23089, 24023, + 25381, 29014, 30482, 31436) }, + { AOM_CDF13(5983, 24099, 24560, 24886, 25066, 25795, 25913, 26423, + 27610, 29905, 31276, 31794) }, + { AOM_CDF13(7444, 12781, 20177, 20728, 21077, 21607, 22170, 23405, + 24469, 27915, 29090, 30492) }, + { AOM_CDF13(8537, 14689, 15432, 17087, 17408, 18172, 18408, 19825, + 24649, 29153, 31096, 32210) }, + { AOM_CDF13(7543, 14231, 15496, 16195, 17905, 20717, 21984, 24516, + 26001, 29675, 30981, 31994) } }, + { { AOM_CDF13(12613, 13591, 21383, 22004, 22312, 22577, 23401, 25055, + 25729, 29538, 30305, 32077) }, + { AOM_CDF13(9687, 13470, 18506, 19230, 19604, 20147, 20695, 22062, + 23219, 27743, 29211, 30907) }, + { AOM_CDF13(6183, 6505, 26024, 26252, 26366, 26434, 27082, 28354, 28555, + 30467, 30794, 32086) }, + { AOM_CDF13(10718, 11734, 14954, 17224, 17565, 17924, 18561, 21523, + 23878, 28975, 30287, 32252) }, + { AOM_CDF13(9194, 9858, 16501, 17263, 18424, 19171, 21563, 25961, 26561, + 30072, 30737, 32463) } }, + { { AOM_CDF13(12602, 14399, 15488, 18381, 18778, 19315, 19724, 21419, + 25060, 29696, 30917, 32409) }, + { AOM_CDF13(8203, 13821, 14524, 17105, 17439, 18131, 18404, 19468, + 25225, 29485, 31158, 32342) }, + { AOM_CDF13(8451, 9731, 15004, 17643, 18012, 18425, 19070, 21538, 24605, + 29118, 30078, 32018) }, + { AOM_CDF13(7714, 9048, 9516, 16667, 16817, 16994, 17153, 18767, 26743, + 30389, 31536, 32528) }, + { AOM_CDF13(8843, 10280, 11496, 15317, 16652, 17943, 19108, 22718, + 25769, 29953, 30983, 32485) } }, + { { AOM_CDF13(12578, 13671, 15979, 16834, 19075, 20913, 22989, 25449, + 26219, 30214, 31150, 32477) }, + { AOM_CDF13(9563, 13626, 15080, 15892, 17756, 20863, 22207, 24236, + 25380, 29653, 31143, 32277) }, + { AOM_CDF13(8356, 8901, 17616, 18256, 19350, 20106, 22598, 25947, 26466, + 29900, 30523, 32261) }, + { AOM_CDF13(10835, 11815, 13124, 16042, 17018, 18039, 18947, 22753, + 24615, 29489, 30883, 32482) }, + { AOM_CDF13(7618, 8288, 9859, 10509, 15386, 18657, 22903, 28776, 29180, + 31355, 31802, 32593) } } + }; + +static const aom_cdf_prob default_angle_delta_cdf[DIRECTIONAL_MODES][CDF_SIZE( + 2 * MAX_ANGLE_DELTA + 1)] = { + { AOM_CDF7(2180, 5032, 7567, 22776, 26989, 30217) }, + { AOM_CDF7(2301, 5608, 8801, 23487, 26974, 30330) }, + { AOM_CDF7(3780, 11018, 13699, 19354, 23083, 31286) }, + { AOM_CDF7(4581, 11226, 15147, 17138, 21834, 28397) }, + { AOM_CDF7(1737, 10927, 14509, 19588, 22745, 28823) }, + { AOM_CDF7(2664, 10176, 12485, 17650, 21600, 30495) }, + { 
AOM_CDF7(2240, 11096, 15453, 20341, 22561, 28917) }, + { AOM_CDF7(3605, 10428, 12459, 17676, 21244, 30655) } +}; + +static const aom_cdf_prob default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + INTRA_MODES)] = { { AOM_CDF13(22801, 23489, 24293, 24756, 25601, 26123, + 26606, 27418, 27945, 29228, 29685, 30349) }, + { AOM_CDF13(18673, 19845, 22631, 23318, 23950, 24649, + 25527, 27364, 28152, 29701, 29984, 30852) }, + { AOM_CDF13(19770, 20979, 23396, 23939, 24241, 24654, + 25136, 27073, 27830, 29360, 29730, 30659) }, + { AOM_CDF13(20155, 21301, 22838, 23178, 23261, 23533, + 23703, 24804, 25352, 26575, 27016, 28049) } }; + +static const aom_cdf_prob + default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES][CDF_SIZE( + UV_INTRA_MODES)] = { + { { AOM_CDF13(22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923, + 28244, 30059, 30941, 31961) }, + { AOM_CDF13(9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824, + 28359, 29505, 29800, 31796) }, + { AOM_CDF13(9845, 9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854, + 30764, 31777, 32029) }, + { AOM_CDF13(13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148, + 28577, 30612, 31355, 32493) }, + { AOM_CDF13(9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243, + 31101, 31744, 32363) }, + { AOM_CDF13(11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458, + 29711, 31161, 31441, 32550) }, + { AOM_CDF13(14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200, + 30245, 31837, 32342, 32667) }, + { AOM_CDF13(12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128, + 29267, 30643, 31961, 32461) }, + { AOM_CDF13(12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273, + 28443, 30388, 30767, 32416) }, + { AOM_CDF13(19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719, + 23174, 28861, 30379, 32175) }, + { AOM_CDF13(18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119, + 23527, 27053, 31397, 32148) }, + { AOM_CDF13(17026, 19004, 19997, 20339, 20586, 21103, 21349, 21907, + 22482, 25896, 26541, 31819) }, + { AOM_CDF13(12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166, + 15255, 15753, 16039, 16606) } }, + { { AOM_CDF14(10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656, + 15986, 20086, 20995, 22455, 24212) }, + { AOM_CDF14(4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451, + 22099, 24228, 24693, 27032, 29472) }, + { AOM_CDF14(5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774, + 23138, 24256, 24703, 26679) }, + { AOM_CDF14(6740, 7167, 7662, 14152, 14536, 14785, 15034, 16741, 18371, + 21520, 22206, 23389, 24182) }, + { AOM_CDF14(4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411, + 24911, 25380, 26027, 26376) }, + { AOM_CDF14(5370, 6889, 7247, 7393, 9498, 21114, 21402, 21753, 21981, + 24780, 25386, 26517, 27176) }, + { AOM_CDF14(4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803, + 23188, 23763, 24455, 24940) }, + { AOM_CDF14(6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059, + 22336, 23204, 23964, 24793) }, + { AOM_CDF14(5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898, + 22494, 23139, 24764, 25989) }, + { AOM_CDF14(10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004, + 15534, 20714, 21789, 23443, 24861) }, + { AOM_CDF14(10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235, + 15902, 20102, 22696, 23774, 25838) }, + { AOM_CDF14(10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163, + 15636, 19676, 20474, 23519, 25208) }, + { AOM_CDF14(3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248, + 9875, 10521, 29048) } } + }; + +static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE( + 
EXT_PARTITION_TYPES)] = { + { AOM_CDF4(19132, 25510, 30392) }, + { AOM_CDF4(13928, 19855, 28540) }, + { AOM_CDF4(12522, 23679, 28629) }, + { AOM_CDF4(9896, 18783, 25853) }, + { AOM_CDF10(15597, 20929, 24571, 26706, 27664, 28821, 29601, 30571, 31902) }, + { AOM_CDF10(7925, 11043, 16785, 22470, 23971, 25043, 26651, 28701, 29834) }, + { AOM_CDF10(5414, 13269, 15111, 20488, 22360, 24500, 25537, 26336, 32117) }, + { AOM_CDF10(2662, 6362, 8614, 20860, 23053, 24778, 26436, 27829, 31171) }, + { AOM_CDF10(18462, 20920, 23124, 27647, 28227, 29049, 29519, 30178, 31544) }, + { AOM_CDF10(7689, 9060, 12056, 24992, 25660, 26182, 26951, 28041, 29052) }, + { AOM_CDF10(6015, 9009, 10062, 24544, 25409, 26545, 27071, 27526, 32047) }, + { AOM_CDF10(1394, 2208, 2796, 28614, 29061, 29466, 29840, 30185, 31899) }, + { AOM_CDF10(20137, 21547, 23078, 29566, 29837, 30261, 30524, 30892, 31724) }, + { AOM_CDF10(6732, 7490, 9497, 27944, 28250, 28515, 28969, 29630, 30104) }, + { AOM_CDF10(5945, 7663, 8348, 28683, 29117, 29749, 30064, 30298, 32238) }, + { AOM_CDF10(870, 1212, 1487, 31198, 31394, 31574, 31743, 31881, 32332) }, + { AOM_CDF8(27899, 28219, 28529, 32484, 32539, 32619, 32639) }, + { AOM_CDF8(6607, 6990, 8268, 32060, 32219, 32338, 32371) }, + { AOM_CDF8(5429, 6676, 7122, 32027, 32227, 32531, 32582) }, + { AOM_CDF8(711, 966, 1172, 32448, 32538, 32617, 32664) }, +}; + +static const aom_cdf_prob default_intra_ext_tx_cdf + [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = { + { + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + }, + { + { + { AOM_CDF7(1535, 8035, 9461, 12751, 23467, 27825) }, + { AOM_CDF7(564, 3335, 9709, 10870, 18143, 28094) }, + { AOM_CDF7(672, 3247, 3676, 11982, 19415, 23127) }, + { AOM_CDF7(5279, 13885, 15487, 18044, 23527, 30252) }, + { AOM_CDF7(4423, 6074, 7985, 10416, 25693, 29298) }, + { AOM_CDF7(1486, 4241, 9460, 10662, 16456, 27694) }, + { AOM_CDF7(439, 2838, 3522, 6737, 18058, 23754) }, + { AOM_CDF7(1190, 4233, 4855, 11670, 20281, 24377) }, + { AOM_CDF7(1045, 4312, 8647, 10159, 18644, 29335) }, + { AOM_CDF7(202, 3734, 4747, 7298, 17127, 24016) }, + { AOM_CDF7(447, 4312, 6819, 8884, 16010, 23858) }, + { AOM_CDF7(277, 4369, 5255, 8905, 16465, 22271) }, + { AOM_CDF7(3409, 5436, 10599, 15599, 19687, 24040) }, + }, + { + { AOM_CDF7(1870, 13742, 14530, 16498, 23770, 27698) }, + { AOM_CDF7(326, 8796, 14632, 15079, 19272, 27486) }, + { AOM_CDF7(484, 7576, 7712, 14443, 19159, 22591) }, + { AOM_CDF7(1126, 15340, 15895, 17023, 20896, 30279) }, + { AOM_CDF7(655, 4854, 5249, 5913, 22099, 27138) }, + { AOM_CDF7(1299, 6458, 8885, 9290, 14851, 25497) }, + { AOM_CDF7(311, 5295, 5552, 6885, 16107, 22672) }, + { AOM_CDF7(883, 8059, 8270, 11258, 17289, 21549) }, + { AOM_CDF7(741, 7580, 9318, 10345, 16688, 29046) }, + { AOM_CDF7(110, 7406, 7915, 9195, 16041, 23329) }, + { AOM_CDF7(363, 7974, 9357, 10673, 15629, 24474) }, + { AOM_CDF7(153, 7647, 8112, 9936, 15307, 19996) }, + { AOM_CDF7(3511, 6332, 11165, 15335, 19323, 23594) }, + }, + { + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) 
}, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + }, + { + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) }, + }, + }, + { + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + { + { AOM_CDF5(1127, 12814, 22772, 27483) }, + { AOM_CDF5(145, 6761, 11980, 26667) }, + { AOM_CDF5(362, 5887, 11678, 16725) }, + { AOM_CDF5(385, 15213, 18587, 30693) }, + { AOM_CDF5(25, 2914, 23134, 27903) }, + { AOM_CDF5(60, 4470, 11749, 23991) }, + { AOM_CDF5(37, 3332, 14511, 21448) }, + { AOM_CDF5(157, 6320, 13036, 17439) }, + { AOM_CDF5(119, 6719, 12906, 29396) }, + { AOM_CDF5(47, 5537, 12576, 21499) }, + { AOM_CDF5(269, 6076, 11258, 23115) }, + { AOM_CDF5(83, 5615, 12001, 17228) }, + { AOM_CDF5(1968, 5556, 12023, 18547) }, + }, + { + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + { AOM_CDF5(6554, 13107, 
19661, 26214) }, + { AOM_CDF5(6554, 13107, 19661, 26214) }, + }, + }, + }; + +static const aom_cdf_prob + default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE( + TX_TYPES)] = { + { + { 0 }, + { 0 }, + { 0 }, + { 0 }, + }, + { + { AOM_CDF16(4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504, + 22848, 23934, 25474, 27727, 28915, 30631) }, + { AOM_CDF16(1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674, + 20408, 22517, 25010, 27116, 28856, 30749) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, + 20480, 22528, 24576, 26624, 28672, 30720) }, + }, + { + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + { AOM_CDF12(770, 2421, 5225, 12907, 15819, 18927, 21561, 24089, 26595, + 28526, 30529) }, + { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845, + 24576, 27307, 30037) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(4167) }, + { AOM_CDF2(1998) }, + { AOM_CDF2(748) }, + }, + }; + +static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = { + AOM_CDF8(1418, 2123, 13340, 18405, 26972, 28343, 32294) +}; + +static const aom_cdf_prob + default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = { + { AOM_CDF16(7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700, + 32704, 32708, 32712, 32716, 32720, 32724) }, + { AOM_CDF16(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, 32620, + 32647, 32668, 32672, 32676, 32680, 32684) }, + { AOM_CDF16(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, 32673, + 32677, 32681, 32685, 32689, 32693, 32697) }, + { AOM_CDF16(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, 32708, + 32712, 32716, 32720, 32724, 32728, 32732) }, + { AOM_CDF16(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, 32394, + 32464, 32516, 32560, 32576, 32593, 32622) }, + { AOM_CDF16(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, 32144, + 32413, 32520, 32594, 32622, 32656, 32660) } + }; + +static const aom_cdf_prob + default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE( + SWITCHABLE_FILTERS)] = { + { AOM_CDF3(31935, 32720) }, { AOM_CDF3(5568, 32719) }, + { AOM_CDF3(422, 2938) }, { AOM_CDF3(28244, 32608) }, + { AOM_CDF3(31206, 31953) }, { AOM_CDF3(4862, 32121) }, + { AOM_CDF3(770, 1152) }, { AOM_CDF3(20889, 25637) }, + { AOM_CDF3(31910, 32724) }, { AOM_CDF3(4120, 32712) }, + { AOM_CDF3(305, 2247) }, { AOM_CDF3(27403, 32636) }, + { AOM_CDF3(31022, 32009) }, { AOM_CDF3(2963, 32093) }, + { AOM_CDF3(601, 943) }, { AOM_CDF3(14969, 21398) } + }; + +static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) }, + { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } }; + +static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } }; + +static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) }, + { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } }; + +static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) } +}; + +static const aom_cdf_prob + 
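/* The all-zero rows in default_intra_ext_tx_cdf and default_inter_ext_tx_cdf
   above are placeholders that are never read: when a transform set contains
   a single transform type (presumably set 0, EXT_TX_SET_DCTONLY), no symbol
   is coded and no CDF is consulted, so zero-initializing simply keeps the
   array shape uniform across sets. */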
default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE( + INTER_COMPOUND_MODES)] = { + { AOM_CDF8(7760, 13823, 15808, 17641, 19156, 20666, 26891) }, + { AOM_CDF8(10730, 19452, 21145, 22749, 24039, 25131, 28724) }, + { AOM_CDF8(10664, 20221, 21588, 22906, 24295, 25387, 28436) }, + { AOM_CDF8(13298, 16984, 20471, 24182, 25067, 25736, 26422) }, + { AOM_CDF8(18904, 23325, 25242, 27432, 27898, 28258, 30758) }, + { AOM_CDF8(10725, 17454, 20124, 22820, 24195, 25168, 26046) }, + { AOM_CDF8(17125, 24273, 25814, 27492, 28214, 28704, 30592) }, + { AOM_CDF8(13046, 23214, 24505, 25942, 27435, 28442, 29330) } + }; + +static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + 2)] = { { AOM_CDF2(16384) }, + { AOM_CDF2(26887) }, + { AOM_CDF2(27597) }, + { AOM_CDF2(30237) } }; + +static const aom_cdf_prob + default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE( + INTERINTRA_MODES)] = { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(1875, 11082, 27332) }, + { AOM_CDF4(2473, 9996, 26388) }, + { AOM_CDF4(4238, 11537, 25926) } }; + +static const aom_cdf_prob + default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(20036) }, { AOM_CDF2(24957) }, { AOM_CDF2(26704) }, + { AOM_CDF2(27530) }, { AOM_CDF2(29564) }, { AOM_CDF2(29444) }, + { AOM_CDF2(26872) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } + }; + +static const aom_cdf_prob default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + MASKED_COMPOUND_TYPES)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) }, + { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) }, + { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } +}; + +static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + 16)] = { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, + 20359, 22362, 24127, 25702, 27752, 29450, 31171) }, + { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, + 17367, 18452, 19422, 22839, 26127, 29629) }, + { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, + 21332, 24520, 27470, 29456, 30529, 31656) }, + { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, + 19163, 20961, 22884, 24471, 26719, 28714, 30877) }, + { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, + 16730, 18114, 19313, 22521, 26012, 29550) }, + { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, + 17270, 20533, 23434, 25972, 27944, 29570, 31416) }, + { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, + 20638, 22038, 23963, 25311, 26988, 28766, 31012) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 
16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, + 24284, 24985, 25684, 27259, 28883, 30911) }, + { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, + 25057, 27251, 29173, 30089, 30960, 31933) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) }, + { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, + 18432, 20480, 22528, 24576, 26624, 28672, 30720) } }; + +static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, { AOM_CDF3(7651, 24760) }, + { AOM_CDF3(4738, 24765) }, { AOM_CDF3(5391, 25528) }, + { AOM_CDF3(19419, 26810) }, { AOM_CDF3(5123, 23606) }, + { AOM_CDF3(11606, 24308) }, { AOM_CDF3(26260, 29116) }, + { AOM_CDF3(20360, 28062) }, { AOM_CDF3(21679, 26830) }, + { AOM_CDF3(29516, 30701) }, { AOM_CDF3(28898, 30397) }, + { AOM_CDF3(30878, 31335) }, { AOM_CDF3(32507, 32558) }, + { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(28799, 31390) }, { AOM_CDF3(26431, 30774) }, + { AOM_CDF3(28973, 31594) }, { AOM_CDF3(29742, 31203) } }; + +static const aom_cdf_prob default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(10437) }, { AOM_CDF2(9371) }, { AOM_CDF2(9301) }, + { AOM_CDF2(17432) }, { AOM_CDF2(14423) }, { AOM_CDF2(15142) }, + { AOM_CDF2(25817) }, { AOM_CDF2(22823) }, { AOM_CDF2(22083) }, + { AOM_CDF2(30128) }, { AOM_CDF2(31014) }, { AOM_CDF2(31560) }, + { AOM_CDF2(32638) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(23664) }, { AOM_CDF2(20901) }, { AOM_CDF2(24008) }, + { AOM_CDF2(26879) } +}; + +static const aom_cdf_prob default_intra_inter_cdf[INTRA_INTER_CONTEXTS] + [CDF_SIZE(2)] = { + { AOM_CDF2(806) }, + { AOM_CDF2(16662) }, + { AOM_CDF2(20186) }, + { AOM_CDF2(26538) } + }; + +static const aom_cdf_prob default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(26828) }, + { AOM_CDF2(24035) }, + { AOM_CDF2(12031) }, + { AOM_CDF2(10640) }, + { AOM_CDF2(2901) } }; + +static const aom_cdf_prob default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS] + [CDF_SIZE(2)] = { + { AOM_CDF2(1198) }, + { AOM_CDF2(2070) }, + { AOM_CDF2(9166) }, + { AOM_CDF2(7499) }, + { AOM_CDF2(22475) } + }; + +static const aom_cdf_prob + default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - + 1][CDF_SIZE(2)] = { + { { AOM_CDF2(5284) }, { AOM_CDF2(3865) }, { AOM_CDF2(3128) } }, + { { AOM_CDF2(23152) }, { AOM_CDF2(14173) }, { AOM_CDF2(15270) } }, + { { 
AOM_CDF2(31774) }, { AOM_CDF2(25120) }, { AOM_CDF2(26710) } } + }; + +static const aom_cdf_prob default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1] + [CDF_SIZE(2)] = { + { { AOM_CDF2(4897) }, + { AOM_CDF2(1555) }, + { AOM_CDF2(4236) }, + { AOM_CDF2(8650) }, + { AOM_CDF2(904) }, + { AOM_CDF2(1444) } }, + { { AOM_CDF2(16973) }, + { AOM_CDF2(16751) }, + { AOM_CDF2(19647) }, + { AOM_CDF2(24773) }, + { AOM_CDF2(11014) }, + { AOM_CDF2(15087) } }, + { { AOM_CDF2(29744) }, + { AOM_CDF2(30279) }, + { AOM_CDF2(31194) }, + { AOM_CDF2(31895) }, + { AOM_CDF2(26875) }, + { AOM_CDF2(30304) } } + }; + +static const aom_cdf_prob + default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)] = { + { { AOM_CDF2(4946) }, { AOM_CDF2(9468) }, { AOM_CDF2(1503) } }, + { { AOM_CDF2(19891) }, { AOM_CDF2(22441) }, { AOM_CDF2(15160) } }, + { { AOM_CDF2(30731) }, { AOM_CDF2(31059) }, { AOM_CDF2(27544) } } + }; + +static const aom_cdf_prob + default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)] = { + { { AOM_CDF2(2235) }, { AOM_CDF2(1423) } }, + { { AOM_CDF2(17182) }, { AOM_CDF2(15175) } }, + { { AOM_CDF2(30606) }, { AOM_CDF2(30489) } } + }; + +static const aom_cdf_prob + default_palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = { + { AOM_CDF7(7952, 13000, 18149, 21478, 25527, 29241) }, + { AOM_CDF7(7139, 11421, 16195, 19544, 23666, 28073) }, + { AOM_CDF7(7788, 12741, 17325, 20500, 24315, 28530) }, + { AOM_CDF7(8271, 14064, 18246, 21564, 25071, 28533) }, + { AOM_CDF7(12725, 19180, 21863, 24839, 27535, 30120) }, + { AOM_CDF7(9711, 14888, 16923, 21052, 25661, 27875) }, + { AOM_CDF7(14940, 20797, 21678, 24186, 27033, 28999) } + }; + +static const aom_cdf_prob + default_palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = { + { AOM_CDF7(8713, 19979, 27128, 29609, 31331, 32272) }, + { AOM_CDF7(5839, 15573, 23581, 26947, 29848, 31700) }, + { AOM_CDF7(4426, 11260, 17999, 21483, 25863, 29430) }, + { AOM_CDF7(3228, 9464, 14993, 18089, 22523, 27420) }, + { AOM_CDF7(3768, 8886, 13091, 17852, 22495, 27207) }, + { AOM_CDF7(2464, 8451, 12861, 21632, 25525, 28555) }, + { AOM_CDF7(1269, 5435, 10433, 18963, 21700, 25865) } + }; + +static const aom_cdf_prob default_palette_y_mode_cdf + [PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][CDF_SIZE(2)] = { + { { AOM_CDF2(31676) }, { AOM_CDF2(3419) }, { AOM_CDF2(1261) } }, + { { AOM_CDF2(31912) }, { AOM_CDF2(2859) }, { AOM_CDF2(980) } }, + { { AOM_CDF2(31823) }, { AOM_CDF2(3400) }, { AOM_CDF2(781) } }, + { { AOM_CDF2(32030) }, { AOM_CDF2(3561) }, { AOM_CDF2(904) } }, + { { AOM_CDF2(32309) }, { AOM_CDF2(7337) }, { AOM_CDF2(1462) } }, + { { AOM_CDF2(32265) }, { AOM_CDF2(4015) }, { AOM_CDF2(1521) } }, + { { AOM_CDF2(32450) }, { AOM_CDF2(7946) }, { AOM_CDF2(129) } } + }; + +static const aom_cdf_prob + default_palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(32461) }, { AOM_CDF2(21488) } + }; + +static const aom_cdf_prob default_palette_y_color_index_cdf + [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { + { + { AOM_CDF2(28710) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(10553) }, + { AOM_CDF2(27036) }, + { AOM_CDF2(31603) }, + }, + { + { AOM_CDF3(27877, 30490) }, + { AOM_CDF3(11532, 25697) }, + { AOM_CDF3(6544, 30234) }, + { AOM_CDF3(23018, 28072) }, + { AOM_CDF3(31915, 32385) }, + }, + { + { AOM_CDF4(25572, 28046, 30045) }, + { AOM_CDF4(9478, 21590, 27256) }, + { AOM_CDF4(7248, 26837, 29824) }, + { AOM_CDF4(19167, 24486, 28349) }, + { AOM_CDF4(31400, 31825, 32250) }, + }, + { + { AOM_CDF5(24779, 26955, 
28576, 30282) }, + { AOM_CDF5(8669, 20364, 24073, 28093) }, + { AOM_CDF5(4255, 27565, 29377, 31067) }, + { AOM_CDF5(19864, 23674, 26716, 29530) }, + { AOM_CDF5(31646, 31893, 32147, 32426) }, + }, + { + { AOM_CDF6(23132, 25407, 26970, 28435, 30073) }, + { AOM_CDF6(7443, 17242, 20717, 24762, 27982) }, + { AOM_CDF6(6300, 24862, 26944, 28784, 30671) }, + { AOM_CDF6(18916, 22895, 25267, 27435, 29652) }, + { AOM_CDF6(31270, 31550, 31808, 32059, 32353) }, + }, + { + { AOM_CDF7(23105, 25199, 26464, 27684, 28931, 30318) }, + { AOM_CDF7(6950, 15447, 18952, 22681, 25567, 28563) }, + { AOM_CDF7(7560, 23474, 25490, 27203, 28921, 30708) }, + { AOM_CDF7(18544, 22373, 24457, 26195, 28119, 30045) }, + { AOM_CDF7(31198, 31451, 31670, 31882, 32123, 32391) }, + }, + { + { AOM_CDF8(21689, 23883, 25163, 26352, 27506, 28827, 30195) }, + { AOM_CDF8(6892, 15385, 17840, 21606, 24287, 26753, 29204) }, + { AOM_CDF8(5651, 23182, 25042, 26518, 27982, 29392, 30900) }, + { AOM_CDF8(19349, 22578, 24418, 25994, 27524, 29031, 30448) }, + { AOM_CDF8(31028, 31270, 31504, 31705, 31927, 32153, 32392) }, + }, + }; + +static const aom_cdf_prob default_palette_uv_color_index_cdf + [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = { + { + { AOM_CDF2(29089) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(8713) }, + { AOM_CDF2(29257) }, + { AOM_CDF2(31610) }, + }, + { + { AOM_CDF3(25257, 29145) }, + { AOM_CDF3(12287, 27293) }, + { AOM_CDF3(7033, 27960) }, + { AOM_CDF3(20145, 25405) }, + { AOM_CDF3(30608, 31639) }, + }, + { + { AOM_CDF4(24210, 27175, 29903) }, + { AOM_CDF4(9888, 22386, 27214) }, + { AOM_CDF4(5901, 26053, 29293) }, + { AOM_CDF4(18318, 22152, 28333) }, + { AOM_CDF4(30459, 31136, 31926) }, + }, + { + { AOM_CDF5(22980, 25479, 27781, 29986) }, + { AOM_CDF5(8413, 21408, 24859, 28874) }, + { AOM_CDF5(2257, 29449, 30594, 31598) }, + { AOM_CDF5(19189, 21202, 25915, 28620) }, + { AOM_CDF5(31844, 32044, 32281, 32518) }, + }, + { + { AOM_CDF6(22217, 24567, 26637, 28683, 30548) }, + { AOM_CDF6(7307, 16406, 19636, 24632, 28424) }, + { AOM_CDF6(4441, 25064, 26879, 28942, 30919) }, + { AOM_CDF6(17210, 20528, 23319, 26750, 29582) }, + { AOM_CDF6(30674, 30953, 31396, 31735, 32207) }, + }, + { + { AOM_CDF7(21239, 23168, 25044, 26962, 28705, 30506) }, + { AOM_CDF7(6545, 15012, 18004, 21817, 25503, 28701) }, + { AOM_CDF7(3448, 26295, 27437, 28704, 30126, 31442) }, + { AOM_CDF7(15889, 18323, 21704, 24698, 26976, 29690) }, + { AOM_CDF7(30988, 31204, 31479, 31734, 31983, 32325) }, + }, + { + { AOM_CDF8(21442, 23288, 24758, 26246, 27649, 28980, 30563) }, + { AOM_CDF8(5863, 14933, 17552, 20668, 23683, 26411, 29273) }, + { AOM_CDF8(3415, 25810, 26877, 27990, 29223, 30394, 31618) }, + { AOM_CDF8(17965, 20084, 22232, 23974, 26274, 28402, 30390) }, + { AOM_CDF8(31190, 31329, 31516, 31679, 31825, 32026, 32322) }, + }, + }; + +static const aom_cdf_prob + default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(28581) }, { AOM_CDF2(23846) }, { AOM_CDF2(20847) }, + { AOM_CDF2(24315) }, { AOM_CDF2(18196) }, { AOM_CDF2(12133) }, + { AOM_CDF2(18791) }, { AOM_CDF2(10887) }, { AOM_CDF2(11005) }, + { AOM_CDF2(27179) }, { AOM_CDF2(20004) }, { AOM_CDF2(11281) }, + { AOM_CDF2(26549) }, { AOM_CDF2(19308) }, { AOM_CDF2(14224) }, + { AOM_CDF2(28015) }, { AOM_CDF2(21546) }, { AOM_CDF2(14400) }, + { AOM_CDF2(28165) }, { AOM_CDF2(22401) }, { AOM_CDF2(16088) } + }; + +static const aom_cdf_prob default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(31671) }, { AOM_CDF2(16515) }, { AOM_CDF2(4576) } +}; + +static const 
aom_cdf_prob default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE( + 2)] = { { AOM_CDF2(32621) }, { AOM_CDF2(20708) }, { AOM_CDF2(8127) } }; + +static const aom_cdf_prob + default_compound_idx_cdfs[COMP_INDEX_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(18244) }, { AOM_CDF2(12865) }, { AOM_CDF2(7053) }, + { AOM_CDF2(13259) }, { AOM_CDF2(9334) }, { AOM_CDF2(4644) } + }; + +static const aom_cdf_prob + default_comp_group_idx_cdfs[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)] = { + { AOM_CDF2(26607) }, { AOM_CDF2(22891) }, { AOM_CDF2(18840) }, + { AOM_CDF2(24594) }, { AOM_CDF2(19934) }, { AOM_CDF2(22674) } + }; + +static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 30531) }; + +static const aom_cdf_prob default_filter_intra_mode_cdf[CDF_SIZE( + FILTER_INTRA_MODES)] = { AOM_CDF5(8949, 12776, 17211, 29558) }; + +static const aom_cdf_prob default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE( + 2)] = { { AOM_CDF2(4621) }, { AOM_CDF2(6743) }, { AOM_CDF2(5893) }, + { AOM_CDF2(7866) }, { AOM_CDF2(12551) }, { AOM_CDF2(9394) }, + { AOM_CDF2(12408) }, { AOM_CDF2(14301) }, { AOM_CDF2(12756) }, + { AOM_CDF2(22343) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(12770) }, { AOM_CDF2(10368) }, + { AOM_CDF2(20229) }, { AOM_CDF2(18101) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }; + +static const aom_cdf_prob default_switchable_restore_cdf[CDF_SIZE( + RESTORE_SWITCHABLE_TYPES)] = { AOM_CDF3(9413, 22581) }; + +static const aom_cdf_prob default_wiener_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 11570) }; + +static const aom_cdf_prob default_sgrproj_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2( + 16855) }; + +static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = { + AOM_CDF4(28160, 32120, 32677) +}; + +static const aom_cdf_prob default_delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE( + DELTA_LF_PROBS + 1)] = { { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) }, + { AOM_CDF4(28160, 32120, 32677) } }; +static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = { + AOM_CDF4(28160, 32120, 32677) +}; + +// FIXME(someone) need real defaults here +static const aom_cdf_prob default_seg_tree_cdf[CDF_SIZE(MAX_SEGMENTS)] = { + AOM_CDF8(4096, 8192, 12288, 16384, 20480, 24576, 28672) +}; + +static const aom_cdf_prob + default_segment_pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)] = { + { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) } + }; + +static const aom_cdf_prob + default_spatial_pred_seg_tree_cdf[SPATIAL_PREDICTION_PROBS][CDF_SIZE( + MAX_SEGMENTS)] = { + { + AOM_CDF8(5622, 7893, 16093, 18233, 27809, 28373, 32533), + }, + { + AOM_CDF8(14274, 18230, 22557, 24935, 29980, 30851, 32344), + }, + { + AOM_CDF8(27527, 28487, 28723, 28890, 32397, 32647, 32679), + }, + }; + +static const aom_cdf_prob default_tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS] + [CDF_SIZE(MAX_TX_DEPTH + 1)] = { + { { AOM_CDF2(19968) }, + { AOM_CDF2(19968) }, + { AOM_CDF2(24320) } }, + { { AOM_CDF3(12272, 30172) }, + { AOM_CDF3(12272, 30172) }, + { AOM_CDF3(18677, 30848) } }, + { { AOM_CDF3(12986, 15180) }, + { AOM_CDF3(12986, 15180) }, + { AOM_CDF3(24302, 25602) } }, + { { AOM_CDF3(5782, 11475) }, + { AOM_CDF3(5782, 11475) }, + { AOM_CDF3(16803, 22759) } }, + }; + +#define MAX_COLOR_CONTEXT_HASH 8 +// Negative values are invalid +static const int palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH + + 1] = { -1, -1, 0, -1, -1, + 4, 3, 2, 
1 }; + +#define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top. +int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, + int r, int c, int palette_size, + uint8_t *color_order, int *color_idx) { + assert(palette_size <= PALETTE_MAX_SIZE); + assert(r > 0 || c > 0); + + // Get color indices of neighbors. + int color_neighbors[NUM_PALETTE_NEIGHBORS]; + color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1; + color_neighbors[1] = + (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1; + color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1; + + // The +10 below should not be needed. But we get a warning "array subscript + // is above array bounds [-Werror=array-bounds]" without it, possibly due to + // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124 + int scores[PALETTE_MAX_SIZE + 10] = { 0 }; + int i; + static const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 }; + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + if (color_neighbors[i] >= 0) { + scores[color_neighbors[i]] += weights[i]; + } + } + + int inverse_color_order[PALETTE_MAX_SIZE]; + for (i = 0; i < PALETTE_MAX_SIZE; ++i) { + color_order[i] = i; + inverse_color_order[i] = i; + } + + // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small). + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + int max = scores[i]; + int max_idx = i; + for (int j = i + 1; j < palette_size; ++j) { + if (scores[j] > max) { + max = scores[j]; + max_idx = j; + } + } + if (max_idx != i) { + // Move the score at index 'max_idx' to index 'i', and shift the scores + // from 'i' to 'max_idx - 1' by 1. + const int max_score = scores[max_idx]; + const uint8_t max_color_order = color_order[max_idx]; + for (int k = max_idx; k > i; --k) { + scores[k] = scores[k - 1]; + color_order[k] = color_order[k - 1]; + inverse_color_order[color_order[k]] = k; + } + scores[i] = max_score; + color_order[i] = max_color_order; + inverse_color_order[color_order[i]] = i; + } + } + + if (color_idx != NULL) + *color_idx = inverse_color_order[color_map[r * stride + c]]; + + // Get hash value of context. + int color_index_ctx_hash = 0; + static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 }; + for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) { + color_index_ctx_hash += scores[i] * hash_multipliers[i]; + } + assert(color_index_ctx_hash > 0); + assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH); + + // Lookup context from hash. 
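/* For example, with neighbors left = 0, top-left = 0, top = 1 and
   palette_size = 2: the weighted scores are scores[0] = 2 + 1 = 3 and
   scores[1] = 2, so the sorted top three scores are (3, 2, 0) and the hash
   is 3 * 1 + 2 * 2 + 0 * 2 = 7, which the lookup table below maps to
   context 2. */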
+ const int color_index_ctx = + palette_color_index_context_lookup[color_index_ctx_hash]; + assert(color_index_ctx >= 0); + assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS); + return color_index_ctx; +} +#undef NUM_PALETTE_NEIGHBORS +#undef MAX_COLOR_CONTEXT_HASH + +static void init_mode_probs(FRAME_CONTEXT *fc) { + av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf); + av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf); + av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf); + av1_copy(fc->palette_uv_color_index_cdf, default_palette_uv_color_index_cdf); + av1_copy(fc->kf_y_cdf, default_kf_y_mode_cdf); + av1_copy(fc->angle_delta_cdf, default_angle_delta_cdf); + av1_copy(fc->comp_inter_cdf, default_comp_inter_cdf); + av1_copy(fc->comp_ref_type_cdf, default_comp_ref_type_cdf); + av1_copy(fc->uni_comp_ref_cdf, default_uni_comp_ref_cdf); + av1_copy(fc->palette_y_mode_cdf, default_palette_y_mode_cdf); + av1_copy(fc->palette_uv_mode_cdf, default_palette_uv_mode_cdf); + av1_copy(fc->comp_ref_cdf, default_comp_ref_cdf); + av1_copy(fc->comp_bwdref_cdf, default_comp_bwdref_cdf); + av1_copy(fc->single_ref_cdf, default_single_ref_cdf); + av1_copy(fc->txfm_partition_cdf, default_txfm_partition_cdf); + av1_copy(fc->compound_index_cdf, default_compound_idx_cdfs); + av1_copy(fc->comp_group_idx_cdf, default_comp_group_idx_cdfs); + av1_copy(fc->newmv_cdf, default_newmv_cdf); + av1_copy(fc->zeromv_cdf, default_zeromv_cdf); + av1_copy(fc->refmv_cdf, default_refmv_cdf); + av1_copy(fc->drl_cdf, default_drl_cdf); + av1_copy(fc->motion_mode_cdf, default_motion_mode_cdf); + av1_copy(fc->obmc_cdf, default_obmc_cdf); + av1_copy(fc->inter_compound_mode_cdf, default_inter_compound_mode_cdf); + av1_copy(fc->compound_type_cdf, default_compound_type_cdf); + av1_copy(fc->wedge_idx_cdf, default_wedge_idx_cdf); + av1_copy(fc->interintra_cdf, default_interintra_cdf); + av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf); + av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf); + av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf); + av1_copy(fc->seg.tree_cdf, default_seg_tree_cdf); + av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs); + av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf); + av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf); + av1_copy(fc->wiener_restore_cdf, default_wiener_restore_cdf); + av1_copy(fc->sgrproj_restore_cdf, default_sgrproj_restore_cdf); + av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf); + av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf); + av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf); + av1_copy(fc->partition_cdf, default_partition_cdf); + av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf); + av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf); + av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs); + av1_copy(fc->skip_cdfs, default_skip_cdfs); + av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf); + for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++) + av1_copy(fc->seg.spatial_pred_seg_cdf[i], + default_spatial_pred_seg_tree_cdf[i]); + av1_copy(fc->tx_size_cdf, default_tx_size_cdf); + av1_copy(fc->delta_q_cdf, default_delta_q_cdf); + av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf); + av1_copy(fc->delta_lf_multi_cdf, default_delta_lf_multi_cdf); + av1_copy(fc->cfl_sign_cdf, default_cfl_sign_cdf); + av1_copy(fc->cfl_alpha_cdf, default_cfl_alpha_cdf); + av1_copy(fc->intrabc_cdf, default_intrabc_cdf); +} + +void av1_set_default_ref_deltas(int8_t 
*ref_deltas) { + assert(ref_deltas != NULL); + + ref_deltas[INTRA_FRAME] = 1; + ref_deltas[LAST_FRAME] = 0; + ref_deltas[LAST2_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[LAST3_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[BWDREF_FRAME] = ref_deltas[LAST_FRAME]; + ref_deltas[GOLDEN_FRAME] = -1; + ref_deltas[ALTREF2_FRAME] = -1; + ref_deltas[ALTREF_FRAME] = -1; +} + +void av1_set_default_mode_deltas(int8_t *mode_deltas) { + assert(mode_deltas != NULL); + + mode_deltas[0] = 0; + mode_deltas[1] = 0; +} + +static void set_default_lf_deltas(struct loopfilter *lf) { + lf->mode_ref_delta_enabled = 1; + lf->mode_ref_delta_update = 1; + + av1_set_default_ref_deltas(lf->ref_deltas); + av1_set_default_mode_deltas(lf->mode_deltas); +} + +void av1_setup_frame_contexts(AV1_COMMON *cm) { + // Store the frame context into a special slot (not associated with any + // reference buffer), so that we can set up cm->pre_fc correctly later + // This function must ONLY be called when cm->fc has been initialized with + // default probs, either by av1_setup_past_independence or after manually + // initializing them + *cm->default_frame_context = *cm->fc; + // TODO(jack.haughton@argondesign.com): don't think this should be necessary, + // but could do with fuller testing + if (cm->tiles.large_scale) { + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + RefCntBuffer *const buf = get_ref_frame_buf(cm, i); + if (buf != NULL) buf->frame_context = *cm->fc; + } + for (int i = 0; i < FRAME_BUFFERS; ++i) + cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc; + } +} + +void av1_setup_past_independence(AV1_COMMON *cm) { + // Reset the segment feature data to the default stats: + // Features disabled, 0, with delta coding (Default state). + av1_clearall_segfeatures(&cm->seg); + + if (cm->cur_frame->seg_map) + memset(cm->cur_frame->seg_map, 0, + (cm->mi_params.mi_rows * cm->mi_params.mi_cols)); + + // reset mode ref deltas + av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); + av1_set_default_mode_deltas(cm->cur_frame->mode_deltas); + set_default_lf_deltas(&cm->lf); + + av1_default_coef_probs(cm); + init_mode_probs(cm->fc); + av1_init_mv_probs(cm); + cm->fc->initialized = 1; + av1_setup_frame_contexts(cm); +} diff --git a/libs/libaom/src/av1/common/entropymode.h b/libs/libaom/src/av1/common/entropymode.h new file mode 100644 index 000000000..bbbf55dc8 --- /dev/null +++ b/libs/libaom/src/av1/common/entropymode.h @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_ +#define AOM_AV1_COMMON_ENTROPYMODE_H_ + +#include "av1/common/entropy.h" +#include "av1/common/entropymv.h" +#include "av1/common/filter.h" +#include "av1/common/seg_common.h" +#include "aom_dsp/aom_filter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLOCK_SIZE_GROUPS 4 + +#define TX_SIZE_CONTEXTS 3 + +#define INTER_OFFSET(mode) ((mode)-NEARESTMV) +#define INTER_COMPOUND_OFFSET(mode) (uint8_t)((mode)-NEAREST_NEARESTMV) + +// Number of possible contexts for a color index. 
+// As can be seen from av1_get_palette_color_index_context(), the possible +// contexts are (2,0,0), (2,2,1), (3,2,0), (4,1,0), (5,0,0). These are mapped to +// a value from 0 to 4 using 'palette_color_index_context_lookup' table. +#define PALETTE_COLOR_INDEX_CONTEXTS 5 + +// Palette Y mode context for a block is determined by number of neighboring +// blocks (top and/or left) using a palette for Y plane. So, possible Y mode' +// context values are: +// 0 if neither left nor top block uses palette for Y plane, +// 1 if exactly one of left or top block uses palette for Y plane, and +// 2 if both left and top blocks use palette for Y plane. +#define PALETTE_Y_MODE_CONTEXTS 3 + +// Palette UV mode context for a block is determined by whether this block uses +// palette for the Y plane. So, possible values are: +// 0 if this block doesn't use palette for Y plane. +// 1 if this block uses palette for Y plane (i.e. Y palette size > 0). +#define PALETTE_UV_MODE_CONTEXTS 2 + +// Map the number of pixels in a block size to a context +// 64(BLOCK_8X8, BLOCK_4x16, BLOCK_16X4) -> 0 +// 128(BLOCK_8X16, BLOCK_16x8) -> 1 +// ... +// 4096(BLOCK_64X64) -> 6 +#define PALATTE_BSIZE_CTXS 7 + +#define KF_MODE_CONTEXTS 5 + +struct AV1Common; + +typedef struct { + const int16_t *scan; + const int16_t *iscan; +} SCAN_ORDER; + +typedef struct frame_contexts { + aom_cdf_prob txb_skip_cdf[TX_SIZES][TXB_SKIP_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob eob_extra_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS] + [CDF_SIZE(2)]; + aom_cdf_prob dc_sign_cdf[PLANE_TYPES][DC_SIGN_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob eob_flag_cdf16[PLANE_TYPES][2][CDF_SIZE(5)]; + aom_cdf_prob eob_flag_cdf32[PLANE_TYPES][2][CDF_SIZE(6)]; + aom_cdf_prob eob_flag_cdf64[PLANE_TYPES][2][CDF_SIZE(7)]; + aom_cdf_prob eob_flag_cdf128[PLANE_TYPES][2][CDF_SIZE(8)]; + aom_cdf_prob eob_flag_cdf256[PLANE_TYPES][2][CDF_SIZE(9)]; + aom_cdf_prob eob_flag_cdf512[PLANE_TYPES][2][CDF_SIZE(10)]; + aom_cdf_prob eob_flag_cdf1024[PLANE_TYPES][2][CDF_SIZE(11)]; + aom_cdf_prob coeff_base_eob_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB] + [CDF_SIZE(3)]; + aom_cdf_prob coeff_base_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] + [CDF_SIZE(4)]; + aom_cdf_prob coeff_br_cdf[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] + [CDF_SIZE(BR_CDF_SIZE)]; + + aom_cdf_prob newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)]; + + aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS] + [CDF_SIZE(INTER_COMPOUND_MODES)]; + aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL] + [CDF_SIZE(MASKED_COMPOUND_TYPES)]; + aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]; + aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]; + aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob interintra_mode_cdf[BLOCK_SIZE_GROUPS] + [CDF_SIZE(INTERINTRA_MODES)]; + aom_cdf_prob motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)]; + aom_cdf_prob obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]; + aom_cdf_prob palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]; + aom_cdf_prob palette_y_color_index_cdf[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [CDF_SIZE(PALETTE_COLORS)]; + aom_cdf_prob palette_uv_color_index_cdf[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [CDF_SIZE(PALETTE_COLORS)]; + aom_cdf_prob 
palette_y_mode_cdf[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS] + [CDF_SIZE(2)]; + aom_cdf_prob palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)]; + aom_cdf_prob comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] + [CDF_SIZE(2)]; + aom_cdf_prob comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)]; + aom_cdf_prob comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)]; + aom_cdf_prob txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob compound_index_cdf[COMP_INDEX_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob comp_group_idx_cdf[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob skip_mode_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]; + aom_cdf_prob intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)]; + nmv_context nmvc; + nmv_context ndvc; + aom_cdf_prob intrabc_cdf[CDF_SIZE(2)]; + struct segmentation_probs seg; + aom_cdf_prob filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)]; + aom_cdf_prob filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)]; + aom_cdf_prob switchable_restore_cdf[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)]; + aom_cdf_prob wiener_restore_cdf[CDF_SIZE(2)]; + aom_cdf_prob sgrproj_restore_cdf[CDF_SIZE(2)]; + aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)]; + aom_cdf_prob uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES] + [CDF_SIZE(UV_INTRA_MODES)]; + aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)]; + aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS] + [CDF_SIZE(SWITCHABLE_FILTERS)]; + /* kf_y_cdf is discarded after use, so does not require persistent storage. + However, we keep it with the other CDFs in this struct since it needs to + be copied to each tile to support parallelism just like the others. 
+ */ + aom_cdf_prob kf_y_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS] + [CDF_SIZE(INTRA_MODES)]; + + aom_cdf_prob angle_delta_cdf[DIRECTIONAL_MODES] + [CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)]; + + aom_cdf_prob tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS] + [CDF_SIZE(MAX_TX_DEPTH + 1)]; + aom_cdf_prob delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)]; + aom_cdf_prob delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(DELTA_LF_PROBS + 1)]; + aom_cdf_prob delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)]; + aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] + [CDF_SIZE(TX_TYPES)]; + aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES] + [CDF_SIZE(TX_TYPES)]; + aom_cdf_prob cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)]; + aom_cdf_prob cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)]; + int initialized; +} FRAME_CONTEXT; + +static const int av1_ext_tx_ind[EXT_TX_SET_TYPES][TX_TYPES] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0 }, + { 3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0 }, + { 7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6 }, +}; + +static const int av1_ext_tx_inv[EXT_TX_SET_TYPES][TX_TYPES] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 0, 10, 11, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 9, 10, 11, 0, 1, 2, 4, 5, 3, 6, 7, 8, 0, 0, 0, 0 }, + { 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 4, 5, 3, 6, 7, 8 }, +}; + +void av1_set_default_ref_deltas(int8_t *ref_deltas); +void av1_set_default_mode_deltas(int8_t *mode_deltas); +void av1_setup_frame_contexts(struct AV1Common *cm); +void av1_setup_past_independence(struct AV1Common *cm); + +// Returns (int)ceil(log2(n)). +// NOTE: This implementation only works for n <= 2^30. +static INLINE int av1_ceil_log2(int n) { + if (n < 2) return 0; + int i = 1, p = 2; + while (p < n) { + i++; + p = p << 1; + } + return i; +} + +// Returns the context for palette color index at row 'r' and column 'c', +// along with the 'color_order' of neighbors and the 'color_idx'. +// The 'color_map' is a 2D array with the given 'stride'. +int av1_get_palette_color_index_context(const uint8_t *color_map, int stride, + int r, int c, int palette_size, + uint8_t *color_order, int *color_idx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENTROPYMODE_H_ diff --git a/libs/libaom/src/av1/common/entropymv.c b/libs/libaom/src/av1/common/entropymv.c new file mode 100644 index 000000000..e1e42f2f1 --- /dev/null +++ b/libs/libaom/src/av1/common/entropymv.c @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
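The av1_ceil_log2() helper defined a few lines up is small enough to check by hand; a minimal standalone sketch (the mirror function name is hypothetical, only the logic is taken from the header above) exercises its edge cases:

#include <assert.h>

// Same logic as av1_ceil_log2() above: smallest i with 2^i >= n for n >= 2,
// and 0 for n < 2.
static int ceil_log2(int n) {
  if (n < 2) return 0;
  int i = 1, p = 2;
  while (p < n) {
    i++;
    p <<= 1;
  }
  return i;
}

int main(void) {
  assert(ceil_log2(1) == 0);
  assert(ceil_log2(2) == 1);
  assert(ceil_log2(3) == 2);  // ceil(log2(3)) = 2
  assert(ceil_log2(8) == 3);
  assert(ceil_log2(9) == 4);
  return 0;
}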
+ */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/entropymv.h" + +static const nmv_context default_nmv_context = { + { AOM_CDF4(4096, 11264, 19328) }, // joints_cdf + { { + // Vertical component + { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, + 32762, 32767) }, // class_cdf // fp + { { AOM_CDF4(16384, 24576, 26624) }, + { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf + { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf + { AOM_CDF2(128 * 128) }, // sign_cdf + { AOM_CDF2(160 * 128) }, // class0_hp_cdf + { AOM_CDF2(128 * 128) }, // hp_cdf + { AOM_CDF2(216 * 128) }, // class0_cdf + { { AOM_CDF2(128 * 136) }, + { AOM_CDF2(128 * 140) }, + { AOM_CDF2(128 * 148) }, + { AOM_CDF2(128 * 160) }, + { AOM_CDF2(128 * 176) }, + { AOM_CDF2(128 * 192) }, + { AOM_CDF2(128 * 224) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 240) } }, // bits_cdf + }, + { + // Horizontal component + { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757, + 32762, 32767) }, // class_cdf // fp + { { AOM_CDF4(16384, 24576, 26624) }, + { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf + { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf + { AOM_CDF2(128 * 128) }, // sign_cdf + { AOM_CDF2(160 * 128) }, // class0_hp_cdf + { AOM_CDF2(128 * 128) }, // hp_cdf + { AOM_CDF2(216 * 128) }, // class0_cdf + { { AOM_CDF2(128 * 136) }, + { AOM_CDF2(128 * 140) }, + { AOM_CDF2(128 * 148) }, + { AOM_CDF2(128 * 160) }, + { AOM_CDF2(128 * 176) }, + { AOM_CDF2(128 * 192) }, + { AOM_CDF2(128 * 224) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 234) }, + { AOM_CDF2(128 * 240) } }, // bits_cdf + } }, +}; + +void av1_init_mv_probs(AV1_COMMON *cm) { + // NB: this sets CDFs too + cm->fc->nmvc = default_nmv_context; + cm->fc->ndvc = default_nmv_context; +} diff --git a/libs/libaom/src/av1/common/entropymv.h b/libs/libaom/src/av1/common/entropymv.h new file mode 100644 index 000000000..cddc80768 --- /dev/null +++ b/libs/libaom/src/av1/common/entropymv.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
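Several of the binary CDFs in default_nmv_context above are written as 8-bit probabilities scaled by 128 into the 15-bit CDF space (256 * 128 == 32768). A short sketch (standalone, values copied from the table above) makes the intent readable:

#include <stdio.h>

int main(void) {
  // AOM_CDF2(p8 * 128): p8 / 256 is the mass placed on the first symbol,
  // so 128 is an even split, 160 and 216 are increasingly skewed.
  const int p8[] = { 128, 160, 216 };  // sign_cdf, class0_hp_cdf, class0_cdf
  for (int i = 0; i < 3; ++i)
    printf("AOM_CDF2(%d * 128) -> P(first symbol) = %.1f%%\n", p8[i],
           p8[i] / 256.0 * 100.0);
  return 0;
}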
+ */ + +#ifndef AOM_AV1_COMMON_ENTROPYMV_H_ +#define AOM_AV1_COMMON_ENTROPYMV_H_ + +#include "config/aom_config.h" + +#include "aom_dsp/prob.h" + +#include "av1/common/mv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; + +void av1_init_mv_probs(struct AV1Common *cm); + +#define MV_UPDATE_PROB 252 + +/* Symbols for coding which components are zero jointly */ +#define MV_JOINTS 4 +enum { + MV_JOINT_ZERO = 0, /* Zero vector */ + MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ + MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ + MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ +} UENUM1BYTE(MV_JOINT_TYPE); + +static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) { + return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ; +} + +static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) { + return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ; +} + +/* Symbols for coding magnitude class of nonzero components */ +#define MV_CLASSES 11 +enum { + MV_CLASS_0 = 0, /* (0, 2] integer pel */ + MV_CLASS_1 = 1, /* (2, 4] integer pel */ + MV_CLASS_2 = 2, /* (4, 8] integer pel */ + MV_CLASS_3 = 3, /* (8, 16] integer pel */ + MV_CLASS_4 = 4, /* (16, 32] integer pel */ + MV_CLASS_5 = 5, /* (32, 64] integer pel */ + MV_CLASS_6 = 6, /* (64, 128] integer pel */ + MV_CLASS_7 = 7, /* (128, 256] integer pel */ + MV_CLASS_8 = 8, /* (256, 512] integer pel */ + MV_CLASS_9 = 9, /* (512, 1024] integer pel */ + MV_CLASS_10 = 10, /* (1024,2048] integer pel */ +} UENUM1BYTE(MV_CLASS_TYPE); + +#define CLASS0_BITS 1 /* bits at integer precision for class 0 */ +#define CLASS0_SIZE (1 << CLASS0_BITS) +#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) +#define MV_BITS_CONTEXTS 6 +#define MV_FP_SIZE 4 + +#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) +#define MV_MAX ((1 << MV_MAX_BITS) - 1) +#define MV_VALS ((MV_MAX << 1) + 1) + +#define MV_IN_USE_BITS 14 +#define MV_UPP (1 << MV_IN_USE_BITS) +#define MV_LOW (-(1 << MV_IN_USE_BITS)) + +typedef struct { + aom_cdf_prob classes_cdf[CDF_SIZE(MV_CLASSES)]; + aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][CDF_SIZE(MV_FP_SIZE)]; + aom_cdf_prob fp_cdf[CDF_SIZE(MV_FP_SIZE)]; + aom_cdf_prob sign_cdf[CDF_SIZE(2)]; + aom_cdf_prob class0_hp_cdf[CDF_SIZE(2)]; + aom_cdf_prob hp_cdf[CDF_SIZE(2)]; + aom_cdf_prob class0_cdf[CDF_SIZE(CLASS0_SIZE)]; + aom_cdf_prob bits_cdf[MV_OFFSET_BITS][CDF_SIZE(2)]; +} nmv_component; + +typedef struct { + aom_cdf_prob joints_cdf[CDF_SIZE(MV_JOINTS)]; + nmv_component comps[2]; +} nmv_context; + +enum { + MV_SUBPEL_NONE = -1, + MV_SUBPEL_LOW_PRECISION = 0, + MV_SUBPEL_HIGH_PRECISION, +} SENUM1BYTE(MvSubpelPrecision); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_ENTROPYMV_H_ diff --git a/libs/libaom/src/av1/common/enums.h b/libs/libaom/src/av1/common/enums.h new file mode 100644 index 000000000..0c09a1bc7 --- /dev/null +++ b/libs/libaom/src/av1/common/enums.h @@ -0,0 +1,678 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
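The MV_CLASS_* ranges documented above follow a simple doubling rule: class 0 covers (0, 2] integer pel and class c > 0 covers (2^c, 2^(c+1)]. A small sketch (the helper name is hypothetical, only the bucket boundaries come from the header) reproduces the mapping:

#include <assert.h>

// Maps an integer-pel MV magnitude to the MV_CLASS_* buckets above:
// class 0 covers (0, 2], class c > 0 covers (2^c, 2^(c+1)].
static int mv_class_of(int z) {
  int c = 0;
  while (c + 1 < 11 && z > (2 << c)) ++c;
  return c;
}

int main(void) {
  assert(mv_class_of(2) == 0);     // (0, 2]
  assert(mv_class_of(3) == 1);     // (2, 4]
  assert(mv_class_of(100) == 6);   // (64, 128]
  assert(mv_class_of(2048) == 10); // (1024, 2048]
  return 0;
}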
+ */ + +#ifndef AOM_AV1_COMMON_ENUMS_H_ +#define AOM_AV1_COMMON_ENUMS_H_ + +#include "config/aom_config.h" + +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#undef MAX_SB_SIZE + +// Max superblock size +#define MAX_SB_SIZE_LOG2 7 +#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2) +#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE) + +// Min superblock size +#define MIN_SB_SIZE_LOG2 6 + +// Pixels per Mode Info (MI) unit +#define MI_SIZE_LOG2 2 +#define MI_SIZE (1 << MI_SIZE_LOG2) + +// MI-units per max superblock (MI Block - MIB) +#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2) +#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2) + +// MI-units per min superblock +#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2) + +// Mask to extract MI offset within max MIB +#define MAX_MIB_MASK (MAX_MIB_SIZE - 1) + +// Maximum number of tile rows and tile columns +#define MAX_TILE_ROWS 64 +#define MAX_TILE_COLS 64 + +#define MAX_VARTX_DEPTH 2 + +#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2) +#define MI_SIZE_128X128 (128 >> MI_SIZE_LOG2) + +#define MAX_PALETTE_SQUARE (64 * 64) +// Maximum number of colors in a palette. +#define PALETTE_MAX_SIZE 8 +// Minimum number of colors in a palette. +#define PALETTE_MIN_SIZE 2 + +#define FRAME_OFFSET_BITS 5 +#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1) + +// 4 frame filter levels: y plane vertical, y plane horizontal, +// u plane, and v plane +#define FRAME_LF_COUNT 4 +#define DEFAULT_DELTA_LF_MULTI 0 +#define MAX_MODE_LF_DELTAS 2 + +#define DIST_PRECISION_BITS 4 +#define DIST_PRECISION (1 << DIST_PRECISION_BITS) // 16 + +#define PROFILE_BITS 3 +// The following three profiles are currently defined. +// Profile 0. 8-bit and 10-bit 4:2:0 and 4:0:0 only. +// Profile 1. 8-bit and 10-bit 4:4:4 +// Profile 2. 8-bit and 10-bit 4:2:2 +// 12-bit 4:0:0, 4:2:2 and 4:4:4 +// Since we have three bits for the profiles, it can be extended later. +enum { + PROFILE_0, + PROFILE_1, + PROFILE_2, + MAX_PROFILES, +} SENUM1BYTE(BITSTREAM_PROFILE); + +#define OP_POINTS_CNT_MINUS_1_BITS 5 +#define OP_POINTS_IDC_BITS 12 + +// Note: Some enums use the attribute 'packed' to use smallest possible integer +// type, so that we can save memory when they are used in structs/arrays. + +typedef enum ATTRIBUTE_PACKED { + BLOCK_4X4, + BLOCK_4X8, + BLOCK_8X4, + BLOCK_8X8, + BLOCK_8X16, + BLOCK_16X8, + BLOCK_16X16, + BLOCK_16X32, + BLOCK_32X16, + BLOCK_32X32, + BLOCK_32X64, + BLOCK_64X32, + BLOCK_64X64, + BLOCK_64X128, + BLOCK_128X64, + BLOCK_128X128, + BLOCK_4X16, + BLOCK_16X4, + BLOCK_8X32, + BLOCK_32X8, + BLOCK_16X64, + BLOCK_64X16, + BLOCK_SIZES_ALL, + BLOCK_SIZES = BLOCK_4X16, + BLOCK_INVALID = 255, + BLOCK_LARGEST = (BLOCK_SIZES - 1) +} BLOCK_SIZE; + +// 4X4, 8X8, 16X16, 32X32, 64X64, 128X128 +#define SQR_BLOCK_SIZES 6 + +// Partition types. 
R: Recursive +// +// NONE HORZ VERT SPLIT +// +-------+ +-------+ +---+---+ +---+---+ +// | | | | | | | | R | R | +// | | +-------+ | | | +---+---+ +// | | | | | | | | R | R | +// +-------+ +-------+ +---+---+ +---+---+ +// +// HORZ_A HORZ_B VERT_A VERT_B +// +---+---+ +-------+ +---+---+ +---+---+ +// | | | | | | | | | | | +// +---+---+ +---+---+ +---+ | | +---+ +// | | | | | | | | | | | +// +-------+ +---+---+ +---+---+ +---+---+ +// +// HORZ_4 VERT_4 +// +-----+ +-+-+-+ +// +-----+ | | | | +// +-----+ | | | | +// +-----+ +-+-+-+ +enum { + PARTITION_NONE, + PARTITION_HORZ, + PARTITION_VERT, + PARTITION_SPLIT, + PARTITION_HORZ_A, // HORZ split and the top partition is split again + PARTITION_HORZ_B, // HORZ split and the bottom partition is split again + PARTITION_VERT_A, // VERT split and the left partition is split again + PARTITION_VERT_B, // VERT split and the right partition is split again + PARTITION_HORZ_4, // 4:1 horizontal partition + PARTITION_VERT_4, // 4:1 vertical partition + EXT_PARTITION_TYPES, + PARTITION_TYPES = PARTITION_SPLIT + 1, + PARTITION_INVALID = 255 +} UENUM1BYTE(PARTITION_TYPE); + +typedef char PARTITION_CONTEXT; +#define PARTITION_PLOFFSET 4 // number of probability models per block size +#define PARTITION_BLOCK_SIZES 5 +#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET) + +// block transform size +enum { + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform + TX_64X64, // 64x64 transform + TX_4X8, // 4x8 transform + TX_8X4, // 8x4 transform + TX_8X16, // 8x16 transform + TX_16X8, // 16x8 transform + TX_16X32, // 16x32 transform + TX_32X16, // 32x16 transform + TX_32X64, // 32x64 transform + TX_64X32, // 64x32 transform + TX_4X16, // 4x16 transform + TX_16X4, // 16x4 transform + TX_8X32, // 8x32 transform + TX_32X8, // 32x8 transform + TX_16X64, // 16x64 transform + TX_64X16, // 64x16 transform + TX_SIZES_ALL, // Includes rectangular transforms + TX_SIZES = TX_4X8, // Does NOT include rectangular transforms + TX_SIZES_LARGEST = TX_64X64, + TX_INVALID = 255 // Invalid transform size +} UENUM1BYTE(TX_SIZE); + +#define TX_SIZE_LUMA_MIN (TX_4X4) +/* We don't need to code a transform size unless the allowed size is at least + one more than the minimum. */ +#define TX_SIZE_CTX_MIN (TX_SIZE_LUMA_MIN + 1) + +// Maximum tx_size categories +#define MAX_TX_CATS (TX_SIZES - TX_SIZE_CTX_MIN) +#define MAX_TX_DEPTH 2 + +#define MAX_TX_SIZE_LOG2 (6) +#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2) +#define MIN_TX_SIZE_LOG2 2 +#define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2) +#define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE) + +// Pad 4 extra columns to remove horizontal availability check. +#define TX_PAD_HOR_LOG2 2 +#define TX_PAD_HOR 4 +// Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability +// check. +#define TX_PAD_TOP 0 +#define TX_PAD_BOTTOM 4 +#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM) +// Pad 16 extra bytes to avoid reading overflow in SIMD optimization. 
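/* Checking the arithmetic above: TX_SIZES == TX_4X8 == 5 (the five square
   sizes), TX_SIZE_CTX_MIN == TX_4X4 + 1 == 1, so MAX_TX_CATS == 5 - 1 == 4,
   which matches the four rows of default_tx_size_cdf defined in
   entropymode.c earlier in this patch. */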
+#define TX_PAD_END 16 +#define TX_PAD_2D ((32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END) + +// Number of maxium size transform blocks in the maximum size superblock +#define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2) +#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2) + +// frame transform mode +enum { + ONLY_4X4, // use only 4x4 transform + TX_MODE_LARGEST, // transform size is the largest possible for pu size + TX_MODE_SELECT, // transform specified for each block + TX_MODES, +} UENUM1BYTE(TX_MODE); + +// 1D tx types +enum { + DCT_1D, + ADST_1D, + FLIPADST_1D, + IDTX_1D, + TX_TYPES_1D, +} UENUM1BYTE(TX_TYPE_1D); + +enum { + DCT_DCT, // DCT in both horizontal and vertical + ADST_DCT, // ADST in vertical, DCT in horizontal + DCT_ADST, // DCT in vertical, ADST in horizontal + ADST_ADST, // ADST in both directions + FLIPADST_DCT, // FLIPADST in vertical, DCT in horizontal + DCT_FLIPADST, // DCT in vertical, FLIPADST in horizontal + FLIPADST_FLIPADST, // FLIPADST in both directions + ADST_FLIPADST, // ADST in vertical, FLIPADST in horizontal + FLIPADST_ADST, // FLIPADST in vertical, ADST in horizontal + IDTX, // Identity in both directions + V_DCT, // DCT in vertical, identity in horizontal + H_DCT, // Identity in vertical, DCT in horizontal + V_ADST, // ADST in vertical, identity in horizontal + H_ADST, // Identity in vertical, ADST in horizontal + V_FLIPADST, // FLIPADST in vertical, identity in horizontal + H_FLIPADST, // Identity in vertical, FLIPADST in horizontal + TX_TYPES, + DCT_ADST_TX_MASK = 0x000F, // Either DCT or ADST in each direction +} UENUM1BYTE(TX_TYPE); + +enum { + REG_REG, + REG_SMOOTH, + REG_SHARP, + SMOOTH_REG, + SMOOTH_SMOOTH, + SMOOTH_SHARP, + SHARP_REG, + SHARP_SMOOTH, + SHARP_SHARP, +} UENUM1BYTE(DUAL_FILTER_TYPE); + +enum { + // DCT only + EXT_TX_SET_DCTONLY, + // DCT + Identity only + EXT_TX_SET_DCT_IDTX, + // Discrete Trig transforms w/o flip (4) + Identity (1) + EXT_TX_SET_DTT4_IDTX, + // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2) + EXT_TX_SET_DTT4_IDTX_1DDCT, + // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2) + EXT_TX_SET_DTT9_IDTX_1DDCT, + // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6) + EXT_TX_SET_ALL16, + EXT_TX_SET_TYPES +} UENUM1BYTE(TxSetType); + +#define EXT_TX_SIZES 4 // number of sizes that use extended transforms +#define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER +#define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA + +enum { + AOM_LAST_FLAG = 1 << 0, + AOM_LAST2_FLAG = 1 << 1, + AOM_LAST3_FLAG = 1 << 2, + AOM_GOLD_FLAG = 1 << 3, + AOM_BWD_FLAG = 1 << 4, + AOM_ALT2_FLAG = 1 << 5, + AOM_ALT_FLAG = 1 << 6, + AOM_REFFRAME_ALL = (1 << 7) - 1 +} UENUM1BYTE(AOM_REFFRAME); + +enum { + UNIDIR_COMP_REFERENCE, + BIDIR_COMP_REFERENCE, + COMP_REFERENCE_TYPES, +} UENUM1BYTE(COMP_REFERENCE_TYPE); + +enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE); + +#define CFL_ALPHABET_SIZE_LOG2 4 +#define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2) +#define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1) +#define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2) +#define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1)) + +enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } UENUM1BYTE(CFL_PRED_TYPE); + +enum { + CFL_SIGN_ZERO, + CFL_SIGN_NEG, + CFL_SIGN_POS, + CFL_SIGNS +} UENUM1BYTE(CFL_SIGN_TYPE); + +enum { + CFL_DISALLOWED, + CFL_ALLOWED, + CFL_ALLOWED_TYPES +} 
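/* CFL_IDX_U and CFL_IDX_V above unpack a joint U/V alpha index: with
   CFL_ALPHABET_SIZE_LOG2 == 4, an index of 0x35 splits into U == 3
   (high nibble) and V == 5 (low nibble). */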
UENUM1BYTE(CFL_ALLOWED_TYPE); + +// CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid +#define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1) +// CFL_SIGN_U is equivalent to (js + 1) / 3 for js in 0 to 8 +#define CFL_SIGN_U(js) (((js + 1) * 11) >> 5) +// CFL_SIGN_V is equivalent to (js + 1) % 3 for js in 0 to 8 +#define CFL_SIGN_V(js) ((js + 1) - CFL_SIGNS * CFL_SIGN_U(js)) + +// There is no context when the alpha for a given plane is zero. +// So there are 2 fewer contexts than joint signs. +#define CFL_ALPHA_CONTEXTS (CFL_JOINT_SIGNS + 1 - CFL_SIGNS) +#define CFL_CONTEXT_U(js) (js + 1 - CFL_SIGNS) +// Also, the contexts are symmetric under swapping the planes. +#define CFL_CONTEXT_V(js) \ + (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS) + +enum { + PALETTE_MAP, + COLOR_MAP_TYPES, +} UENUM1BYTE(COLOR_MAP_TYPE); + +enum { + TWO_COLORS, + THREE_COLORS, + FOUR_COLORS, + FIVE_COLORS, + SIX_COLORS, + SEVEN_COLORS, + EIGHT_COLORS, + PALETTE_SIZES +} UENUM1BYTE(PALETTE_SIZE); + +enum { + PALETTE_COLOR_ONE, + PALETTE_COLOR_TWO, + PALETTE_COLOR_THREE, + PALETTE_COLOR_FOUR, + PALETTE_COLOR_FIVE, + PALETTE_COLOR_SIX, + PALETTE_COLOR_SEVEN, + PALETTE_COLOR_EIGHT, + PALETTE_COLORS +} UENUM1BYTE(PALETTE_COLOR); + +// Note: All directional predictors must be between V_PRED and D67_PRED (both +// inclusive). +enum { + DC_PRED, // Average of above and left pixels + V_PRED, // Vertical + H_PRED, // Horizontal + D45_PRED, // Directional 45 degree + D135_PRED, // Directional 135 degree + D113_PRED, // Directional 113 degree + D157_PRED, // Directional 157 degree + D203_PRED, // Directional 203 degree + D67_PRED, // Directional 67 degree + SMOOTH_PRED, // Combination of horizontal and vertical interpolation + SMOOTH_V_PRED, // Vertical interpolation + SMOOTH_H_PRED, // Horizontal interpolation + PAETH_PRED, // Predict from the direction of smallest gradient + NEARESTMV, + NEARMV, + GLOBALMV, + NEWMV, + // Compound ref compound modes + NEAREST_NEARESTMV, + NEAR_NEARMV, + NEAREST_NEWMV, + NEW_NEARESTMV, + NEAR_NEWMV, + NEW_NEARMV, + GLOBAL_GLOBALMV, + NEW_NEWMV, + MB_MODE_COUNT, + INTRA_MODE_START = DC_PRED, + INTRA_MODE_END = NEARESTMV, + DIR_MODE_START = V_PRED, + DIR_MODE_END = D67_PRED + 1, + INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START, + SINGLE_INTER_MODE_START = NEARESTMV, + SINGLE_INTER_MODE_END = NEAREST_NEARESTMV, + SINGLE_INTER_MODE_NUM = SINGLE_INTER_MODE_END - SINGLE_INTER_MODE_START, + COMP_INTER_MODE_START = NEAREST_NEARESTMV, + COMP_INTER_MODE_END = MB_MODE_COUNT, + COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START, + INTER_MODE_START = NEARESTMV, + INTER_MODE_END = MB_MODE_COUNT, + INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode. + INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks +} UENUM1BYTE(PREDICTION_MODE); + +// TODO(ltrudeau) Do we really want to pack this? +// TODO(ltrudeau) Do we match with PREDICTION_MODE? 
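/* Sanity check for the shift tricks in CFL_SIGN_U/CFL_SIGN_V above: for js
   in 0..8, ((js + 1) * 11) >> 5 really equals (js + 1) / 3 -- e.g. js == 7
   gives (8 * 11) >> 5 == 88 / 32 == 2 == 8 / 3 -- so both macros stay free
   of integer division. */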
+enum { + UV_DC_PRED, // Average of above and left pixels + UV_V_PRED, // Vertical + UV_H_PRED, // Horizontal + UV_D45_PRED, // Directional 45 degree + UV_D135_PRED, // Directional 135 degree + UV_D113_PRED, // Directional 113 degree + UV_D157_PRED, // Directional 157 degree + UV_D203_PRED, // Directional 203 degree + UV_D67_PRED, // Directional 67 degree + UV_SMOOTH_PRED, // Combination of horizontal and vertical interpolation + UV_SMOOTH_V_PRED, // Vertical interpolation + UV_SMOOTH_H_PRED, // Horizontal interpolation + UV_PAETH_PRED, // Predict from the direction of smallest gradient + UV_CFL_PRED, // Chroma-from-Luma + UV_INTRA_MODES, + UV_MODE_INVALID, // For uv_mode in inter blocks +} UENUM1BYTE(UV_PREDICTION_MODE); + +enum { + SIMPLE_TRANSLATION, + OBMC_CAUSAL, // 2-sided OBMC + WARPED_CAUSAL, // 2-sided WARPED + MOTION_MODES +} UENUM1BYTE(MOTION_MODE); + +enum { + II_DC_PRED, + II_V_PRED, + II_H_PRED, + II_SMOOTH_PRED, + INTERINTRA_MODES +} UENUM1BYTE(INTERINTRA_MODE); + +enum { + COMPOUND_AVERAGE, + COMPOUND_DISTWTD, + COMPOUND_WEDGE, + COMPOUND_DIFFWTD, + COMPOUND_TYPES, + MASKED_COMPOUND_TYPES = 2, +} UENUM1BYTE(COMPOUND_TYPE); + +enum { + FILTER_DC_PRED, + FILTER_V_PRED, + FILTER_H_PRED, + FILTER_D157_PRED, + FILTER_PAETH_PRED, + FILTER_INTRA_MODES, +} UENUM1BYTE(FILTER_INTRA_MODE); + +enum { + SEQ_LEVEL_2_0, + SEQ_LEVEL_2_1, + SEQ_LEVEL_2_2, + SEQ_LEVEL_2_3, + SEQ_LEVEL_3_0, + SEQ_LEVEL_3_1, + SEQ_LEVEL_3_2, + SEQ_LEVEL_3_3, + SEQ_LEVEL_4_0, + SEQ_LEVEL_4_1, + SEQ_LEVEL_4_2, + SEQ_LEVEL_4_3, + SEQ_LEVEL_5_0, + SEQ_LEVEL_5_1, + SEQ_LEVEL_5_2, + SEQ_LEVEL_5_3, + SEQ_LEVEL_6_0, + SEQ_LEVEL_6_1, + SEQ_LEVEL_6_2, + SEQ_LEVEL_6_3, + SEQ_LEVEL_7_0, + SEQ_LEVEL_7_1, + SEQ_LEVEL_7_2, + SEQ_LEVEL_7_3, + SEQ_LEVELS, + SEQ_LEVEL_MAX = 31 +} UENUM1BYTE(AV1_LEVEL); + +#define LEVEL_BITS 5 + +#define DIRECTIONAL_MODES 8 +#define MAX_ANGLE_DELTA 3 +#define ANGLE_STEP 3 + +#define INTER_MODES (1 + NEWMV - NEARESTMV) + +#define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV) + +#define SKIP_CONTEXTS 3 +#define SKIP_MODE_CONTEXTS 3 + +#define COMP_INDEX_CONTEXTS 6 +#define COMP_GROUP_IDX_CONTEXTS 6 + +#define NMV_CONTEXTS 3 + +#define NEWMV_MODE_CONTEXTS 6 +#define GLOBALMV_MODE_CONTEXTS 2 +#define REFMV_MODE_CONTEXTS 6 +#define DRL_MODE_CONTEXTS 3 + +#define GLOBALMV_OFFSET 3 +#define REFMV_OFFSET 4 + +#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1) +#define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1) +#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1) + +#define COMP_NEWMV_CTXS 5 +#define INTER_MODE_CONTEXTS 8 + +#define DELTA_Q_SMALL 3 +#define DELTA_Q_PROBS (DELTA_Q_SMALL) +#define DEFAULT_DELTA_Q_RES_PERCEPTUAL 4 +#define DEFAULT_DELTA_Q_RES_OBJECTIVE 4 + +#define DELTA_LF_SMALL 3 +#define DELTA_LF_PROBS (DELTA_LF_SMALL) +#define DEFAULT_DELTA_LF_RES 2 + +/* Segment Feature Masks */ +#define MAX_MV_REF_CANDIDATES 2 + +#define MAX_REF_MV_STACK_SIZE 8 +#define USABLE_REF_MV_STACK_SIZE 4 +#define REF_CAT_LEVEL 640 + +#define INTRA_INTER_CONTEXTS 4 +#define COMP_INTER_CONTEXTS 5 +#define REF_CONTEXTS 3 + +#define COMP_REF_TYPE_CONTEXTS 5 +#define UNI_COMP_REF_CONTEXTS 3 + +#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3) +typedef uint8_t TXFM_CONTEXT; + +// An enum for single reference types (and some derived values). +enum { + NONE_FRAME = -1, + INTRA_FRAME, + LAST_FRAME, + LAST2_FRAME, + LAST3_FRAME, + GOLDEN_FRAME, + BWDREF_FRAME, + ALTREF2_FRAME, + ALTREF_FRAME, + REF_FRAMES, + + // Extra/scratch reference frame. 
It may be:
+  // - used to update the ALTREF2_FRAME ref (see lshift_bwd_ref_frames()), or
+  // - updated from ALTREF2_FRAME ref (see rshift_bwd_ref_frames()).
+  EXTREF_FRAME = REF_FRAMES,
+
+  // Number of inter (non-intra) reference types.
+  INTER_REFS_PER_FRAME = ALTREF_FRAME - LAST_FRAME + 1,
+
+  // Number of forward (aka past) reference types.
+  FWD_REFS = GOLDEN_FRAME - LAST_FRAME + 1,
+
+  // Number of backward (aka future) reference types.
+  BWD_REFS = ALTREF_FRAME - BWDREF_FRAME + 1,
+
+  SINGLE_REFS = FWD_REFS + BWD_REFS,
+};
+
+#define REF_FRAMES_LOG2 3
+
+// REF_FRAMES for the cm->ref_frame_map array, 1 scratch frame for the new
+// frame in cm->cur_frame, INTER_REFS_PER_FRAME for scaled references on the
+// encoder in the cpi->scaled_ref_buf array.
+#define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME)
+
+#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
+#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
+
+enum {
+  LAST_LAST2_FRAMES,      // { LAST_FRAME, LAST2_FRAME }
+  LAST_LAST3_FRAMES,      // { LAST_FRAME, LAST3_FRAME }
+  LAST_GOLDEN_FRAMES,     // { LAST_FRAME, GOLDEN_FRAME }
+  BWDREF_ALTREF_FRAMES,   // { BWDREF_FRAME, ALTREF_FRAME }
+  LAST2_LAST3_FRAMES,     // { LAST2_FRAME, LAST3_FRAME }
+  LAST2_GOLDEN_FRAMES,    // { LAST2_FRAME, GOLDEN_FRAME }
+  LAST3_GOLDEN_FRAMES,    // { LAST3_FRAME, GOLDEN_FRAME }
+  BWDREF_ALTREF2_FRAMES,  // { BWDREF_FRAME, ALTREF2_FRAME }
+  ALTREF2_ALTREF_FRAMES,  // { ALTREF2_FRAME, ALTREF_FRAME }
+  TOTAL_UNIDIR_COMP_REFS,
+  // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs
+  // that are explicitly signaled.
+  UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1,
+} UENUM1BYTE(UNIDIR_COMP_REF);
+
+#define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS)
+
+#define COMP_REFS (FWD_REFS * BWD_REFS + UNIDIR_COMP_REFS)
+
+// NOTE: A limited number of unidirectional reference pairs can be signaled for
+// compound prediction. The use of skip mode, on the other hand, makes it
+// possible to have a reference pair not listed for explicit signaling.
+#define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS)
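+
+// Editorial note (not part of the patch): plugging the single-reference enum
+// above into these macros gives FWD_REFS = GOLDEN_FRAME - LAST_FRAME + 1 = 4,
+// BWD_REFS = ALTREF_FRAME - BWDREF_FRAME + 1 = 3, SINGLE_REFS = 7,
+// TOTAL_COMP_REFS = 4 * 3 + 9 = 21, and MODE_CTX_REF_FRAMES = 8 + 21 = 29.
+// For example, FWD_RF_OFFSET(GOLDEN_FRAME) == 3 and
+// BWD_RF_OFFSET(ALTREF_FRAME) == 2.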
+
+// Note: This includes both single and compound references, so it can take
+// values from NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not
+// defined as an enum.
+typedef int8_t MV_REFERENCE_FRAME;
+
+enum {
+  RESTORE_NONE,
+  RESTORE_WIENER,
+  RESTORE_SGRPROJ,
+  RESTORE_SWITCHABLE,
+  RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
+  RESTORE_TYPES = 4,
+} UENUM1BYTE(RestorationType);
+
+// Picture prediction structures (0-13 are predefined) in scalability metadata.
+enum {
+  SCALABILITY_L1T2 = 0,
+  SCALABILITY_L1T3 = 1,
+  SCALABILITY_L2T1 = 2,
+  SCALABILITY_L2T2 = 3,
+  SCALABILITY_L2T3 = 4,
+  SCALABILITY_S2T1 = 5,
+  SCALABILITY_S2T2 = 6,
+  SCALABILITY_S2T3 = 7,
+  SCALABILITY_L2T1h = 8,
+  SCALABILITY_L2T2h = 9,
+  SCALABILITY_L2T3h = 10,
+  SCALABILITY_S2T1h = 11,
+  SCALABILITY_S2T2h = 12,
+  SCALABILITY_S2T3h = 13,
+  SCALABILITY_SS = 14
+} UENUM1BYTE(SCALABILITY_STRUCTURES);
+
+#define SUPERRES_SCALE_BITS 3
+#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1)
+
+// In large_scale_tile coding, external references are used.
+#define MAX_EXTERNAL_REFERENCES 128
+#define MAX_TILES 512
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_ENUMS_H_
diff --git a/libs/libaom/src/av1/common/filter.h b/libs/libaom/src/av1/common/filter.h
new file mode 100644
index 000000000..91791d3dc
--- /dev/null
+++ b/libs/libaom/src/av1/common/filter.h
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_FILTER_H_
+#define AOM_AV1_COMMON_FILTER_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_FILTER_TAP 8
+
+typedef enum ATTRIBUTE_PACKED {
+  EIGHTTAP_REGULAR,
+  EIGHTTAP_SMOOTH,
+  MULTITAP_SHARP,
+  BILINEAR,
+  INTERP_FILTERS_ALL,
+  SWITCHABLE_FILTERS = BILINEAR,
+  SWITCHABLE = SWITCHABLE_FILTERS + 1, /* the last switchable one */
+  EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS,
+  INTERP_INVALID = 0xff,
+} InterpFilter;
+
+enum {
+  USE_2_TAPS_ORIG = 0,  // This is used in temporal filtering.
+  USE_2_TAPS,
+  USE_4_TAPS,
+  USE_8_TAPS,
+} UENUM1BYTE(SUBPEL_SEARCH_TYPE);
+
+enum {
+  INTERP_EVAL_LUMA_EVAL_CHROMA = 0,
+  INTERP_SKIP_LUMA_EVAL_CHROMA,
+  INTERP_EVAL_INVALID,
+  INTERP_SKIP_LUMA_SKIP_CHROMA,
+} UENUM1BYTE(INTERP_EVAL_PLANE);
+
+enum {
+  INTERP_HORZ_NEQ_VERT_NEQ = 0,
+  INTERP_HORZ_EQ_VERT_NEQ,
+  INTERP_HORZ_NEQ_VERT_EQ,
+  INTERP_HORZ_EQ_VERT_EQ,
+  INTERP_PRED_TYPE_ALL,
+} UENUM1BYTE(INTERP_PRED_TYPE);
+// Pack two InterpFilter values into a uint32_t: since there are at most 10
+// filters, we can use 16 bits for each and have more than enough space. This
+// reduces argument passing and unifies the operation of setting a (pair of)
+// filters.
+typedef struct InterpFilters {
+  uint16_t y_filter;
+  uint16_t x_filter;
+} InterpFilters;
+
+typedef union int_interpfilters {
+  uint32_t as_int;
+  InterpFilters as_filters;
+} int_interpfilters;
+
+static INLINE InterpFilter av1_extract_interp_filter(int_interpfilters filters,
+                                                     int dir) {
+  return (InterpFilter)((dir) ? filters.as_filters.x_filter
+                              : filters.as_filters.y_filter);
+}
+
+static INLINE int_interpfilters
+av1_broadcast_interp_filter(InterpFilter filter) {
+  int_interpfilters filters;
+  filters.as_filters.x_filter = filter;
+  filters.as_filters.y_filter = filter;
+  return filters;
+}
+
+static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
+  return filter == SWITCHABLE ?
EIGHTTAP_REGULAR : filter; +} + +/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */ +#define LOG_SWITCHABLE_FILTERS 2 + +#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4) +#define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1) +#define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2) +#define ALLOW_ALL_INTERP_FILT_MASK (0x01ff) + +typedef struct InterpFilterParams { + const int16_t *filter_ptr; + uint16_t taps; + uint16_t subpel_shifts; + InterpFilter interp_filter; +} InterpFilterParams; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_bilinear_filters[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, + { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 }, + { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 }, + { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 }, + { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 }, + { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 }, + { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 }, + { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 }, + { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 }, + { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 }, + { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 }, + { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 }, + { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 }, + { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 }, + { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 }, + { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 }, + { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 }, + { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 }, + { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 }, + { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 }, + { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 }, + { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 } +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 }, + { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, + { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, + { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 }, + { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 }, + { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, + { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, + { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 } +}; + +static const InterpFilterParams + av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = { + { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_REGULAR }, + { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS, + SUBPEL_SHIFTS, EIGHTTAP_SMOOTH }, + { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS, + MULTITAP_SHARP }, + { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, 
SUBPEL_SHIFTS, + BILINEAR } + }; + +// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel +// MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV. +DECLARE_ALIGNED(256, static const int16_t, + av1_intrabc_bilinear_filter[2 * SUBPEL_SHIFTS]) = { + 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const InterpFilterParams av1_intrabc_filter_params = { + av1_intrabc_bilinear_filter, 2, 0, BILINEAR +}; + +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, + { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 }, + { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 }, + { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, + { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, + { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 }, + { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 }, + { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } +}; +DECLARE_ALIGNED(256, static const InterpKernel, + av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 }, + { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 }, + { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 }, + { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 }, + { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 }, + { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 }, + { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 }, + { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 } +}; + +static const uint16_t + av1_interp_dual_filt_mask[INTERP_PRED_TYPE_ALL - 2][SWITCHABLE_FILTERS] = { + { (1 << REG_REG) | (1 << SMOOTH_REG) | (1 << SHARP_REG), + (1 << REG_SMOOTH) | (1 << SMOOTH_SMOOTH) | (1 << SHARP_SMOOTH), + (1 << REG_SHARP) | (1 << SMOOTH_SHARP) | (1 << SHARP_SHARP) }, + { (1 << REG_REG) | (1 << REG_SMOOTH) | (1 << REG_SHARP), + (1 << SMOOTH_REG) | (1 << SMOOTH_SMOOTH) | (1 << SMOOTH_SHARP), + (1 << SHARP_REG) | (1 << SHARP_SMOOTH) | (1 << SHARP_SHARP) } + }; + +// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR +static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = { + { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_REGULAR }, + { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_SMOOTH }, + { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, + EIGHTTAP_REGULAR }, + { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS, + BILINEAR }, +}; + +static INLINE const InterpFilterParams * +av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter, + const int w) { + if (w <= 4) return &av1_interp_4tap[interp_filter]; + return &av1_interp_filter_params_list[interp_filter]; +} + +static INLINE const int16_t *av1_get_interp_filter_kernel( + const InterpFilter interp_filter, int subpel_search) { + assert(subpel_search >= USE_2_TAPS); + return (subpel_search == USE_2_TAPS) + ? av1_interp_4tap[BILINEAR].filter_ptr + : ((subpel_search == USE_4_TAPS) + ? 
av1_interp_4tap[interp_filter].filter_ptr
+                : av1_interp_filter_params_list[interp_filter].filter_ptr);
+}
+
+static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
+    const InterpFilterParams *const filter_params, const int subpel) {
+  return filter_params->filter_ptr + filter_params->taps * subpel;
+}
+
+static INLINE const InterpFilterParams *av1_get_filter(int subpel_search) {
+  assert(subpel_search >= USE_2_TAPS);
+
+  switch (subpel_search) {
+    case USE_2_TAPS: return &av1_interp_4tap[BILINEAR];
+    case USE_4_TAPS: return &av1_interp_4tap[EIGHTTAP_REGULAR];
+    case USE_8_TAPS: return &av1_interp_filter_params_list[EIGHTTAP_REGULAR];
+    default: assert(0); return NULL;
+  }
+}
+
+static INLINE void reset_interp_filter_allowed_mask(
+    uint16_t *allow_interp_mask, DUAL_FILTER_TYPE filt_type) {
+  uint16_t tmp = (~(1 << filt_type)) & 0xffff;
+  *allow_interp_mask &= (tmp & ALLOW_ALL_INTERP_FILT_MASK);
+}
+
+static INLINE void set_interp_filter_allowed_mask(uint16_t *allow_interp_mask,
+                                                  DUAL_FILTER_TYPE filt_type) {
+  *allow_interp_mask |= (1 << filt_type);
+}
+
+static INLINE uint8_t get_interp_filter_allowed_mask(
+    uint16_t allow_interp_mask, DUAL_FILTER_TYPE filt_type) {
+  return (allow_interp_mask >> filt_type) & 1;
+}
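+
+// Editorial note (not part of this header): a minimal sketch of how the
+// allowed-mask helpers above compose. ALLOW_ALL_INTERP_FILT_MASK (0x01ff)
+// has one bit per DUAL_FILTER_TYPE pair:
+//
+//   uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK;            // all 9 pairs
+//   reset_interp_filter_allowed_mask(&mask, SHARP_SHARP);  // drop one pair
+//   assert(get_interp_filter_allowed_mask(mask, SHARP_SHARP) == 0);
+//   set_interp_filter_allowed_mask(&mask, SHARP_SHARP);    // re-allow it
+//   assert(get_interp_filter_allowed_mask(mask, REG_REG) == 1);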
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_FILTER_H_
diff --git a/libs/libaom/src/av1/common/frame_buffers.c b/libs/libaom/src/av1/common/frame_buffers.c
new file mode 100644
index 000000000..f10ccd594
--- /dev/null
+++ b/libs/libaom/src/av1/common/frame_buffers.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/frame_buffers.h"
+#include "aom_mem/aom_mem.h"
+
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+  assert(list != NULL);
+  av1_free_internal_frame_buffers(list);
+
+  list->num_internal_frame_buffers =
+      AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+  list->int_fb = (InternalFrameBuffer *)aom_calloc(
+      list->num_internal_frame_buffers, sizeof(*list->int_fb));
+  if (list->int_fb == NULL) {
+    list->num_internal_frame_buffers = 0;
+    return 1;
+  }
+  return 0;
+}
+
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
+  int i;
+
+  assert(list != NULL);
+
+  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+    aom_free(list->int_fb[i].data);
+    list->int_fb[i].data = NULL;
+  }
+  aom_free(list->int_fb);
+  list->int_fb = NULL;
+  list->num_internal_frame_buffers = 0;
+}
+
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
+  int i;
+
+  assert(list != NULL);
+
+  for (i = 0; i < list->num_internal_frame_buffers; ++i) {
+    if (list->int_fb[i].data && !list->int_fb[i].in_use)
+      memset(list->int_fb[i].data, 0, list->int_fb[i].size);
+  }
+}
+
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+                         aom_codec_frame_buffer_t *fb) {
+  int i;
+  InternalFrameBufferList *const int_fb_list =
+      (InternalFrameBufferList *)cb_priv;
+  if (int_fb_list == NULL) return -1;
+
+  // Find a free frame buffer.
+  for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) {
+    if (!int_fb_list->int_fb[i].in_use) break;
+  }
+
+  if (i == int_fb_list->num_internal_frame_buffers) return -1;
+
+  if (int_fb_list->int_fb[i].size < min_size) {
+    aom_free(int_fb_list->int_fb[i].data);
+    // The data must be zeroed to fix a valgrind error from the C loop filter
+    // due to accessing uninitialized memory in the frame border. This could
+    // be skipped if the border were totally removed.
+    int_fb_list->int_fb[i].data = (uint8_t *)aom_calloc(1, min_size);
+    if (!int_fb_list->int_fb[i].data) {
+      int_fb_list->int_fb[i].size = 0;
+      return -1;
+    }
+    int_fb_list->int_fb[i].size = min_size;
+  }
+
+  fb->data = int_fb_list->int_fb[i].data;
+  fb->size = int_fb_list->int_fb[i].size;
+  int_fb_list->int_fb[i].in_use = 1;
+
+  // Set the frame buffer's private data to point at the internal frame buffer.
+  fb->priv = &int_fb_list->int_fb[i];
+  return 0;
+}
+
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) {
+  InternalFrameBuffer *const int_fb = (InternalFrameBuffer *)fb->priv;
+  (void)cb_priv;
+  if (int_fb) int_fb->in_use = 0;
+  return 0;
+}
diff --git a/libs/libaom/src/av1/common/frame_buffers.h b/libs/libaom/src/av1/common/frame_buffers.h
new file mode 100644
index 000000000..16188e51c
--- /dev/null
+++ b/libs/libaom/src/av1/common/frame_buffers.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_
+#define AOM_AV1_COMMON_FRAME_BUFFERS_H_
+
+#include "aom/aom_frame_buffer.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct InternalFrameBuffer {
+  uint8_t *data;
+  size_t size;
+  int in_use;
+} InternalFrameBuffer;
+
+typedef struct InternalFrameBufferList {
+  int num_internal_frame_buffers;
+  InternalFrameBuffer *int_fb;
+} InternalFrameBufferList;
+
+// Initializes |list|. Returns 0 on success.
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Frees any data allocated to the frame buffers.
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Zeros all unused internal frame buffers. In particular, this zeros the
+// frame borders. Call this function after a sequence header change to
+// re-initialize the frame borders for the different width, height, or bit
+// depth.
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Callback used by libaom to request an external frame buffer. |cb_priv| is
+// the callback's private data, which points to an InternalFrameBufferList.
+// |min_size| is the minimum size in bytes needed to decode the next frame.
+// |fb| is a pointer to the frame buffer.
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+                         aom_codec_frame_buffer_t *fb);
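+
+// Editorial note (not part of this header): a minimal usage sketch. These two
+// callbacks (av1_release_frame_buffer() is declared just below) are meant to
+// be registered with a decoder through the public
+// aom_codec_set_frame_buffer_functions() API, before the first frame is
+// decoded. |ctx| is assumed to be an initialized aom_codec_ctx_t:
+//
+//   InternalFrameBufferList list;
+//   memset(&list, 0, sizeof(list));
+//   if (av1_alloc_internal_frame_buffers(&list)) { /* allocation failed */ }
+//   aom_codec_set_frame_buffer_functions(&ctx, av1_get_frame_buffer,
+//                                        av1_release_frame_buffer, &list);
+//   // ... decode frames ...
+//   av1_free_internal_frame_buffers(&list);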
+
+// Callback used by libaom when there are no more references to the frame
+// buffer. |cb_priv| is not used. |fb| is a pointer to the frame buffer.
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_FRAME_BUFFERS_H_
diff --git a/libs/libaom/src/av1/common/idct.c b/libs/libaom/src/av1/common/idct.c
new file mode 100644
index 000000000..bff438f3c
--- /dev/null
+++ b/libs/libaom/src/av1/common/idct.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+
+int av1_get_tx_scale(const TX_SIZE tx_size) {
+  const int pels = tx_size_2d[tx_size];
+  // Largest possible pels is 4096 (64x64).
+  // E.g. 16x16 (256 pels) gives scale 0, 32x32 (1024 pels) gives 1,
+  // 64x64 (4096 pels) gives 2.
+  return (pels > 256) + (pels > 1024);
+}
+
+// NOTE: The implementation of all inverses needs to be aware of the fact
+// that input and output could be the same buffer.
+
+// idct
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  if (eob > 1)
+    av1_highbd_iwht4x4_16_add(input, dest, stride, bd);
+  else
+    av1_highbd_iwht4x4_1_add(input, dest, stride, bd);
+}
+
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  int eob = txfm_param->eob;
+  int bd = txfm_param->bd;
+  int lossless = txfm_param->lossless;
+  const int32_t *src = cast_to_int32(input);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  if (lossless) {
+    assert(tx_type == DCT_DCT);
+    av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+    return;
+  }
+
+  av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                           bd);
+}
+
+void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                           txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                             txfm_param->tx_type, txfm_param->bd);
+}
+
+void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_32x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_32x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_64x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_64x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_64x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_64x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + + av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, bd); +} + +void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + + av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int32_t *src = cast_to_int32(input); + av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, + txfm_param->tx_type, txfm_param->bd); +} + +void 
av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + + av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + const int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + assert(tx_type == DCT_DCT); + av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} + +static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, + TX_TYPE tx_type, int eob, int reduced_tx_set, + TxfmParam *txfm_param) { + (void)plane; + txfm_param->tx_type = tx_type; + txfm_param->tx_size = tx_size; + txfm_param->eob = eob; + txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id]; + txfm_param->bd = xd->bd; + txfm_param->is_hbd = is_cur_buf_hbd(xd); + txfm_param->tx_set_type = av1_get_ext_tx_set_type( + txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); +} + +void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_32X32: + av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param); + break; + case TX_16X16: + av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param); + break; + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_highbd_inv_txfm_add_4x8_c(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4_c(input, dest, stride, txfm_param); + break; + case TX_8X16: + av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param); + break; + case TX_16X8: + av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param); + break; + case TX_16X32: + av1_highbd_inv_txfm_add_16x32_c(input, dest, stride, txfm_param); + break; + case TX_32X16: + av1_highbd_inv_txfm_add_32x16_c(input, dest, stride, txfm_param); + break; + case TX_64X64: + av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param); + break; + case TX_32X64: + av1_highbd_inv_txfm_add_32x64_c(input, dest, stride, txfm_param); + break; + case TX_64X32: + av1_highbd_inv_txfm_add_64x32_c(input, dest, stride, txfm_param); + break; + case TX_16X64: + av1_highbd_inv_txfm_add_16x64_c(input, dest, stride, txfm_param); + break; + case TX_64X16: + av1_highbd_inv_txfm_add_64x16_c(input, dest, stride, txfm_param); + break; + case TX_4X4: + // this is like av1_short_idct4x4 but has a special case around eob<=1 + // which is significant (not just an optimization) for the lossless + // case. 
+      av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param);
+      break;
+    case TX_16X4:
+      av1_highbd_inv_txfm_add_16x4_c(input, dest, stride, txfm_param);
+      break;
+    case TX_4X16:
+      av1_highbd_inv_txfm_add_4x16_c(input, dest, stride, txfm_param);
+      break;
+    case TX_8X32:
+      av1_highbd_inv_txfm_add_8x32_c(input, dest, stride, txfm_param);
+      break;
+    case TX_32X8:
+      av1_highbd_inv_txfm_add_32x8_c(input, dest, stride, txfm_param);
+      break;
+    default: assert(0 && "Invalid transform size"); break;
+  }
+}
+
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                        const TxfmParam *txfm_param) {
+  const TX_SIZE tx_size = txfm_param->tx_size;
+  DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]);
+  int tmp_stride = MAX_TX_SIZE;
+  int w = tx_size_wide[tx_size];
+  int h = tx_size_high[tx_size];
+  // Widen the 8-bit destination into a 16-bit scratch block so the shared
+  // high-bitdepth inverse transform path can be reused for the 8-bit case.
+  for (int r = 0; r < h; ++r) {
+    for (int c = 0; c < w; ++c) {
+      tmp[r * tmp_stride + c] = dst[r * stride + c];
+    }
+  }
+
+  av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                          txfm_param);
+
+  // Narrow the result back to 8 bits. With 8-bit parameters the highbd path
+  // clamps its output to [0, 255], so the cast is lossless.
+  for (int r = 0; r < h; ++r) {
+    for (int c = 0; c < w; ++c) {
+      dst[r * stride + c] = (uint8_t)tmp[r * tmp_stride + c];
+    }
+  }
+}
+
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+                                 const tran_low_t *dqcoeff, int plane,
+                                 TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
+                                 int stride, int eob, int reduced_tx_set) {
+  if (!eob) return;
+
+  assert(eob <= av1_get_max_eob(tx_size));
+
+  TxfmParam txfm_param;
+  init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set,
+                  &txfm_param);
+  assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);
+
+  if (txfm_param.is_hbd) {
+    av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+  } else {
+    av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+  }
+}
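+
+// Editorial note (not part of the patch): the TX_SIZE switch in
+// av1_highbd_inv_txfm_add_c() above is a straightforward dispatch. A
+// hypothetical table-driven equivalent, assuming the
+// av1_highbd_inv_txfm_add_*_c handlers defined in this file, could look like:
+//
+//   typedef void (*InvTxfmAddFn)(const tran_low_t *input, uint8_t *dest,
+//                                int stride, const TxfmParam *txfm_param);
+//   static const InvTxfmAddFn inv_txfm_add_fns[TX_SIZES_ALL] = {
+//     [TX_4X4] = av1_highbd_inv_txfm_add_4x4_c,
+//     [TX_8X8] = av1_highbd_inv_txfm_add_8x8_c,
+//     [TX_16X16] = av1_highbd_inv_txfm_add_16x16_c,
+//     // ... one entry per TX_SIZE, as in the switch above ...
+//   };
+//   // inv_txfm_add_fns[txfm_param->tx_size](input, dest, stride, txfm_param);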
diff --git a/libs/libaom/src/av1/common/idct.h b/libs/libaom/src/av1/common/idct.h
new file mode 100644
index 000000000..004d25d49
--- /dev/null
+++ b/libs/libaom/src/av1/common/idct.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_IDCT_H_
+#define AOM_AV1_COMMON_IDCT_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "aom_dsp/txfm_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*transform_1d)(const tran_low_t *, tran_low_t *);
+
+typedef struct {
+  transform_1d cols, rows;  // vertical and horizontal
+} transform_2d;
+
+#define MAX_TX_SCALE 1
+int av1_get_tx_scale(const TX_SIZE tx_size);
+
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+                                 const tran_low_t *dqcoeff, int plane,
+                                 TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
+                                 int stride, int eob, int reduced_tx_set);
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+
+static INLINE const int32_t *cast_to_int32(const tran_low_t *input) {
+  assert(sizeof(int32_t) == sizeof(tran_low_t));
+  return (const int32_t *)input;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_IDCT_H_
diff --git a/libs/libaom/src/av1/common/loopfiltermask.c b/libs/libaom/src/av1/common/loopfiltermask.c
new file mode 100644
index 000000000..157310f2d
--- /dev/null
+++ b/libs/libaom/src/av1/common/loopfiltermask.c
@@ -0,0 +1,1458 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+
+// 256 bit masks (64x64 / 4x4) for left transform size for Y plane.
+// We use 4 uint64_t to represent the 256 bit.
+// Each 1 represents a position where we should apply a loop filter
+// across the left border of a 4x4 block boundary.
+//
+// In the case of TX_8X8, reading the bytes low-order first, we end up
+// with a mask that looks like this (-- and | are used for better view):
+//
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// -----------------
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+//
+// A loopfilter should be applied to every other 4x4 horizontally.
+
+// 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
+// We use 4 uint64_t to represent the 256 bit.
+// Each 1 represents a position where we should apply a loop filter
+// across the top border of a 4x4 block boundary.
+// +// In the case of TX_8x8-> ( in low order byte first we end up with +// a mask that looks like this +// +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// ----------------- +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// 11111111|11111111 +// 00000000|00000000 +// +// A loopfilter should be applied to every other 4x4 horizontally. +#if CONFIG_LPF_MASK +static const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18 +}; + +static const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = { + -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13 +}; + +static const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = { + -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8 +}; + +static const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, + 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; +static const int mask_id_table_vert_border[BLOCK_SIZES_ALL] = { + 0, 47, 49, 19, 51, 53, 33, 55, 57, 42, 59, + 60, 46, -1, -1, -1, 61, 62, 63, 64, 65, 66 +}; + +static const FilterMask left_mask_univariant_reordered[67] = { + // TX_4X4 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 + { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 + { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 + { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, + 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 + { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 + { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 
0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, + 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 + { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 + // TX_8X8 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 + { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 + { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 + { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_8X8 + { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 + { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 + { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL, + 0x0055005500550055ULL } }, // block size 32X64, TX_8X8 + { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 + { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL, + 0x5555555555555555ULL } }, // block size 64X64, TX_8X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 + { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 + { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL, + 0x0005000500050005ULL } }, // block size 16X64, TX_8X8 + { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 + // TX_16X16 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 + { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 + { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 + { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL, + 0x0011001100110011ULL } }, // block size 32X64, TX_16X16 + { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 + { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL, + 0x1111111111111111ULL } }, // block size 64X64, TX_16X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X16 + { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 + // TX_32X32 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 
32X32, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL } }, // block size 32X64, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 + { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL, + 0x0101010101010101ULL } }, // block size 64X64, TX_32X32 + // TX_64X64 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 64X64, TX_64X64 + // 2:1, 1:2 transform sizes. + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 + { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 + { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X32 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 + { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 32X64, TX_32X64 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 + // 4:1, 1:4 transform sizes. 
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X16 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_16X4 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X32 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_32X8 + { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL, + 0x0001000100010001ULL } }, // block size 16X64, TX_16X64 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_64X16 +}; + +static const FilterMask above_mask_univariant_reordered[67] = { + // TX_4X4 + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X4, TX_4X4 + { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_4X4 + { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_4X4 + { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_4X4 + { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_4X4 + { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, + 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_4X4 + { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, + 0xffffffffffffffffULL } }, // block size 64X64, TX_4x4 + { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_4X4 + { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_4X4 + { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_4X4 + { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL, + 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4 + { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_4X4 + // TX_8X8 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X8, TX_8X8 + { { 0x0000000300000003ULL, 
0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_8X8 + { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_8X8 + { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_8X8 + { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL, + 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, + 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8 + { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X8 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_8X8 + { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL, + 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8 + { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_8X8 + // TX_16X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X16, TX_16X16 + { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X16 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_16X16 + { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_16X16 + { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL, + 0x00000000000000ffULL } }, // block size 32X64, TX_16X16 + { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_16X16 + { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL, + 0x000000000000ffffULL } }, // block size 64X64, TX_16X16 + { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL, + 0x000000000000000fULL } }, // block size 16X64, TX_16X16 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_16X16 + // TX_32X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X32, TX_32X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL, + 0x0000000000000000ULL } }, // block size 32X64, TX_32X32 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_32X32 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL, + 0x0000000000000000ULL } }, // block size 64X64, TX_32X32 + // TX_64X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 
0x0000000000000000ULL } }, // block size 64X64, TX_64X64 + // 2:1, 1:2 transform sizes. + { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X8, TX_4X8 + { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 4X16, TX_4X8 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X4, TX_8X4 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X4, TX_8X4 + { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X16, TX_8X16 + { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 8X32, TX_8X16 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X8, TX_16X8 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X8, TX_16X8 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 16X32, TX_16X32 + { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL, + 0x0000000000000000ULL } }, // block size 16X64, TX_16X32 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X16, TX_32X16 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X16, TX_32X16 + { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 32X64, TX_32X64 + { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL, + 0x0000000000000000ULL } }, // block size 64X32, TX_64X32 + // 4:1, 1:4 transform sizes. 
+  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
+  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
+  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
+  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 16X64, TX_16X64
+  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
+};
+
+static LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm,
+                                            int mi_row, int mi_col) {
+  assert(cm->lf.lfm != NULL);
+  const int row = mi_row >> MIN_MIB_SIZE_LOG2;  // 64x64
+  const int col = mi_col >> MIN_MIB_SIZE_LOG2;
+  return &cm->lf.lfm[row * cm->lf.lfm_stride + col];
+}
+
+typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit,
+                        const uint8_t *limit, const uint8_t *thresh);
+
+typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0,
+                            const uint8_t *limit0, const uint8_t *thresh0,
+                            const uint8_t *blimit1, const uint8_t *limit1,
+                            const uint8_t *thresh1);
+
+typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit,
+                           const uint8_t *limit, const uint8_t *thresh,
+                           int bd);
+
+typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0,
+                               const uint8_t *limit0, const uint8_t *thresh0,
+                               const uint8_t *blimit1, const uint8_t *limit1,
+                               const uint8_t *thresh1, int bd);
+// A 64x64 block contains 256 4x4 tx blocks, so 256 bits are needed to
+// represent them. Each group of 4 rows is represented by one uint64_t mask;
+// hence four uint64_t values (bitmask[4]) cover the whole 64x64 block.
+//
+// Given a location (mi_col, mi_row), this function returns the index
+// (0, 1, 2, or 3) selecting which bitmask[] to use, and the shift value.
+//
+// mi_row is the row offset in mi units (4 pixels each), so
+// (mi_row / 4) selects the uint64_t. Within that uint64_t, mi_row % 4 is
+// the row offset, and each row holds 16 = 1 << stride_log2 4x4 units.
+// Therefore, shift = (row << stride_log2) + mi_col.
+int get_index_shift(int mi_col, int mi_row, int *index) {
+  // *index = mi_row >> 2;
+  // rows = mi_row % 4;
+  // stride_log2 = 4;
+  // shift = (rows << stride_log2) + mi_col;
+  *index = mi_row >> 2;
+  return ((mi_row & 3) << 4) | mi_col;
+}
+
+static void filter_selectively_vert_row2(
+    int subsampling_factor, uint8_t *s, int pitch, int plane,
+    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
+    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
+    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
+  uint64_t mask;
+  const int step = 1 << subsampling_factor;
+
+  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
+              mask_8x8_1 | mask_4x4_1;
+       mask; mask >>= step) {
+    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
+
+    if (mask & 1) {
+      if ((mask_16x16_0 | mask_16x16_1) & 1) {
+        // the chroma plane filters fewer pixels (introduced in the
+        // deblock_13tap experiment)
+        LpfFunc lpf_vertical = plane ?
aom_lpf_vertical_6 : aom_lpf_vertical_14; + + if ((mask_16x16_0 & mask_16x16_1) & 1) { + if (plane) { + aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else { + aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } else if (mask_16x16_0 & 1) { + lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + } else { + lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } + + if ((mask_8x8_0 | mask_8x8_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8; + + if ((mask_8x8_0 & mask_8x8_1) & 1) { + if (plane) { + aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else { + aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } else if (mask_8x8_0 & 1) { + lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + } else { + lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } + + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } else if (mask_4x4_0 & 1) { + aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr); + } else { + aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr); + } + } + } + + s += 4; + lfl += step; + lfl2 += step; + mask_16x16_0 >>= step; + mask_8x8_0 >>= step; + mask_4x4_0 >>= step; + mask_16x16_1 >>= step; + mask_8x8_1 >>= step; + mask_4x4_1 >>= step; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_filter_selectively_vert_row2( + int subsampling_factor, uint16_t *s, int pitch, int plane, + uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0, + uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1, + const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) { + uint64_t mask; + const int step = 1 << subsampling_factor; + + for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 | + mask_8x8_1 | mask_4x4_1; + mask; mask >>= step) { + const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; + const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; + + if (mask & 1) { + if ((mask_16x16_0 | mask_16x16_1) & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + HbdLpfFunc highbd_lpf_vertical = + plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14; + + if ((mask_16x16_0 & mask_16x16_1) & 1) { + if (plane) { + aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } + } else if (mask_16x16_0 & 1) { + highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + bd); + } else { + highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } + } + + if ((mask_8x8_0 | mask_8x8_1) & 1) { + HbdLpfFunc highbd_lpf_vertical = + plane ? 
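filter_selectively_vert_row2() and its high-bit-depth twin below walk the OR of all six masks one 4x4 column at a time; per column they choose a dual kernel when both rows need the same filter strength and a single-row kernel otherwise. A toy model of that dispatch, with printf standing in for the aom_lpf_* kernels:

#include <stdio.h>
#include <stdint.h>

/* Toy version of the row-pair walk: bit i set in row0/row1 means the
 * vertical edge in 4x4 column i of that row needs filtering. */
static void demo_vert_row2(uint64_t row0, uint64_t row1, int step) {
  int col = 0;
  for (uint64_t mask = row0 | row1; mask; mask >>= step, ++col) {
    if (mask & 1) {
      if ((row0 & row1) & 1)
        printf("col %d: dual kernel, both rows at once\n", col);
      else if (row0 & 1)
        printf("col %d: single kernel, first row\n", col);
      else
        printf("col %d: single kernel, second row (s + 4 * pitch)\n", col);
    }
    row0 >>= step;
    row1 >>= step;
  }
}

int main(void) {
  demo_vert_row2(0xB /* 1011 */, 0x9 /* 1001 */, 1); /* step 1: luma plane */
  return 0;
}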
aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8; + + if ((mask_8x8_0 & mask_8x8_1) & 1) { + if (plane) { + aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } + } else if (mask_8x8_0 & 1) { + highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr, + bd); + } else { + highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } + } + + if ((mask_4x4_0 | mask_4x4_1) & 1) { + if ((mask_4x4_0 & mask_4x4_1) & 1) { + aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, lfi1->mblim, lfi1->lim, + lfi1->hev_thr, bd); + } else if (mask_4x4_0 & 1) { + aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, + lfi0->hev_thr, bd); + } else { + aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, + lfi1->lim, lfi1->hev_thr, bd); + } + } + } + + s += 4; + lfl += step; + lfl2 += step; + mask_16x16_0 >>= step; + mask_8x8_0 >>= step; + mask_4x4_0 >>= step; + mask_16x16_1 >>= step; + mask_8x8_1 >>= step; + mask_4x4_1 >>= step; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static void filter_selectively_horiz(uint8_t *s, int pitch, int plane, + int subsampling, uint64_t mask_16x16, + uint64_t mask_8x8, uint64_t mask_4x4, + const loop_filter_info_n *lfi_n, + const uint8_t *lfl) { + uint64_t mask; + int count; + const int step = 1 << subsampling; + const unsigned int two_block_mask = subsampling ? 5 : 3; + int offset = 0; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + // Next block's thresholds, when it is within current 64x64 block. + // If it is out of bound, its mask is zero, and it points to current edge's + // filter parameters, instead of next edge's. + int next_edge = step; + if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0; + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge); + + count = 1; + if (mask & 1) { + if (mask_16x16 & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_horizontal = + plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14; + + if ((mask_16x16 & two_block_mask) == two_block_mask) { + if (plane) { + aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } else { + aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } + count = 2; + } else { + lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } else if (mask_8x8 & 1) { + // chroma plane filters less pixels introduced in deblock_13tap + // experiment + LpfFunc lpf_horizontal = + plane ? 
aom_lpf_horizontal_6 : aom_lpf_horizontal_8; + + if ((mask_8x8 & two_block_mask) == two_block_mask) { + if (plane) { + aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } else { + aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + } + count = 2; + } else { + lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & two_block_mask) == two_block_mask) { + aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, lfin->lim, + lfin->hev_thr); + count = 2; + } else { + aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr); + } + } + } + + s += 4 * count; + lfl += step * count; + mask_16x16 >>= step * count; + mask_8x8 >>= step * count; + mask_4x4 >>= step * count; + offset += step * count; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_filter_selectively_horiz( + uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16, + uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n, + uint8_t *lfl, int bd) { + uint64_t mask; + int count; + const int step = 1 << subsampling; + const unsigned int two_block_mask = subsampling ? 5 : 3; + int offset = 0; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) { + const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl; + // Next block's thresholds, when it is within current 64x64 block. + // If it is out of bound, its mask is zero, and it points to current edge's + // filter parameters, instead of next edge's. + int next_edge = step; + if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0; + const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge); + + count = 1; + if (mask & 1) { + if (mask_16x16 & 1) { + HbdLpfFunc highbd_lpf_horizontal = + plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14; + + if ((mask_16x16 & two_block_mask) == two_block_mask) { + if (plane) { + aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } else { + aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } + count = 2; + } else { + highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } + } else if (mask_8x8 & 1) { + HbdLpfFunc highbd_lpf_horizontal = + plane ? 
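In the horizontal passes, two_block_mask (binary 11, or 101 when the plane is subsampled) tests whether the next edge along the row needs the same-strength filter; if so, a _dual kernel handles both and the walk advances two blocks (count = 2). A standalone sketch of that pairing logic:

#include <stdio.h>
#include <stdint.h>

int main(void) {
  for (int subsampling = 0; subsampling <= 1; subsampling++) {
    const int step = 1 << subsampling;
    const unsigned two_block_mask = subsampling ? 5 : 3; /* 0b101 : 0b011 */
    uint64_t mask = subsampling ? 0x15 : 0x7; /* three edges in a row */
    int col = 0;
    printf("subsampling=%d (step %d, pair mask 0x%x):\n", subsampling, step,
           two_block_mask);
    while (mask) {
      int count = 1;
      if ((mask & two_block_mask) == two_block_mask) {
        printf("  col %d..%d: dual kernel\n", col, col + 1);
        count = 2;
      } else if (mask & 1) {
        printf("  col %d: single kernel\n", col);
      }
      mask >>= (unsigned)(step * count);
      col += count;
    }
  }
  return 0;
}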
aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8; + + if ((mask_8x8 & two_block_mask) == two_block_mask) { + if (plane) { + aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } else { + aom_highbd_lpf_horizontal_8_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + } + count = 2; + } else { + highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr, + bd); + } + } else if (mask_4x4 & 1) { + if ((mask_4x4 & two_block_mask) == two_block_mask) { + aom_highbd_lpf_horizontal_4_dual_c(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); + count = 2; + } else { + aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, bd); + } + } + } + + s += 4 * count; + lfl += step * count; + mask_16x16 >>= step * count; + mask_8x8 >>= step * count; + mask_4x4 >>= step * count; + offset += step * count; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_build_bitmask_vert_info( + AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, + int plane) { + const int subsampling_x = plane_ptr->subsampling_x; + const int subsampling_y = plane_ptr->subsampling_y; + const int is_uv = plane > 0; + TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; + uint8_t level, prev_level = 1; + uint64_t skip, prev_skip = 0; + uint64_t is_coding_block_border; + + for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r++) { + const int mi_row = r << subsampling_y; + const int row = mi_row % MI_SIZE_64X64; + const int row_uv = row | subsampling_y; + int index = 0; + const int shift = get_index_shift(0, row, &index); + + for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; + c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) { + const int mi_col = c << subsampling_x; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + + for (int col_in_unit = 0; + col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) { + const int x = (c + col_in_unit) << MI_SIZE_LOG2; + if (x >= plane_ptr->dst.width) break; + const int col = col_in_unit << subsampling_x; + const int col_uv = col | subsampling_x; + const uint64_t mask = ((uint64_t)1 << (shift | col)); + skip = lfm->skip.bits[index] & mask; + is_coding_block_border = lfm->is_vert_border.bits[index] & mask; + switch (plane) { + case 0: level = lfm->lfl_y_ver[row_uv][col_uv]; break; + case 1: level = lfm->lfl_u_ver[row_uv][col_uv]; break; + case 2: level = lfm->lfl_v_ver[row_uv][col_uv]; break; + default: assert(plane >= 0 && plane <= 2); return; + } + for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { + if (is_uv && ts == TX_64X64) continue; + if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) { + tx_size = ts; + break; + } + } + if ((c + col_in_unit > 0) && (level || prev_level) && + (!prev_skip || !skip || is_coding_block_border)) { + const TX_SIZE min_tx_size = + AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); + const int shift_1 = get_index_shift(col_uv, row_uv, &index); + const uint64_t mask_1 = ((uint64_t)1 << shift_1); + switch (plane) { + case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break; + case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break; + case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break; + default: assert(plane >= 0 && plane <= 2); return; + } + if (level == 0 && prev_level != 0) { + switch (plane) { + case 0: lfm->lfl_y_ver[row_uv][col_uv] = prev_level; break; + case 1: 
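av1_build_bitmask_vert_info() advances across each row in steps of the current transform's width in 4x4 units and records a left-edge bit wherever two transforms meet and at least one side is filtered. The stepping pattern in isolation (the transform widths below are assumed for illustration, not the library's tables):

#include <stdio.h>

/* 4x4 units per transform width for a toy TX ordering:
 * index 0..4 -> 4, 8, 16, 32, 64 pixel wide transforms (assumption). */
static const int tx_wide_units[] = { 1, 2, 4, 8, 16 };

int main(void) {
  /* a 64-pixel row tiled as 16 | 8 | 8 | 32 pixels */
  const int tiles[] = { 2, 1, 1, 3 };
  int col = 0;
  for (int t = 0; t < 4; t++) {
    const int w = tx_wide_units[tiles[t]];
    if (col > 0)
      printf("vertical edge at 4x4 column %2d (left of a %d-unit tx)\n",
             col, w);
    col += w;  /* mirrors col_in_unit += tx_size_wide_unit[tx_size] */
  }
  return 0;
}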
lfm->lfl_u_ver[row_uv][col_uv] = prev_level; break; + case 2: lfm->lfl_v_ver[row_uv][col_uv] = prev_level; break; + default: assert(plane >= 0 && plane <= 2); return; + } + } + } + + // update prev info + prev_level = level; + prev_skip = skip; + prev_tx_size = tx_size; + // advance + col_in_unit += tx_size_wide_unit[tx_size]; + } + } + } +} + +void av1_build_bitmask_horz_info( + AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr, + int plane) { + const int subsampling_x = plane_ptr->subsampling_x; + const int subsampling_y = plane_ptr->subsampling_y; + const int is_uv = plane > 0; + TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16; + uint8_t level, prev_level = 1; + uint64_t skip, prev_skip = 0; + uint64_t is_coding_block_border; + + for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c++) { + const int mi_col = c << subsampling_x; + const int col = mi_col % MI_SIZE_64X64; + const int col_uv = col | subsampling_x; + + for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; + r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) { + const int mi_row = r << subsampling_y; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + + for (int r_in_unit = 0; + r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) { + const int y = (r + r_in_unit) << MI_SIZE_LOG2; + if (y >= plane_ptr->dst.height) break; + const int row = r_in_unit << subsampling_y; + const int row_uv = row | subsampling_y; + int index = 0; + const int shift = get_index_shift(col, row, &index); + const uint64_t mask = ((uint64_t)1 << shift); + skip = lfm->skip.bits[index] & mask; + is_coding_block_border = lfm->is_horz_border.bits[index] & mask; + switch (plane) { + case 0: level = lfm->lfl_y_hor[row_uv][col_uv]; break; + case 1: level = lfm->lfl_u_hor[row_uv][col_uv]; break; + case 2: level = lfm->lfl_v_hor[row_uv][col_uv]; break; + default: assert(plane >= 0 && plane <= 2); return; + } + for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) { + if (is_uv && ts == TX_64X64) continue; + if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) { + tx_size = ts; + break; + } + } + if ((r + r_in_unit > 0) && (level || prev_level) && + (!prev_skip || !skip || is_coding_block_border)) { + const TX_SIZE min_tx_size = + AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size)); + const int shift_1 = get_index_shift(col_uv, row_uv, &index); + const uint64_t mask_1 = ((uint64_t)1 << shift_1); + + switch (plane) { + case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break; + case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break; + case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break; + default: assert(plane >= 0 && plane <= 2); return; + } + if (level == 0 && prev_level != 0) { + switch (plane) { + case 0: lfm->lfl_y_hor[row_uv][col_uv] = prev_level; break; + case 1: lfm->lfl_u_hor[row_uv][col_uv] = prev_level; break; + case 2: lfm->lfl_v_hor[row_uv][col_uv] = prev_level; break; + default: assert(plane >= 0 && plane <= 2); return; + } + } + } + + // update prev info + prev_level = level; + prev_skip = skip; + prev_tx_size = tx_size; + // advance + r_in_unit += tx_size_high_unit[tx_size]; + } + } + } +} + +void av1_filter_block_plane_bitmask_vert( + AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, + int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + uint8_t *const buf0 = dst->buf; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int row_step = 1 << ssy; + 
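Both builders index the per-unit filter levels with row | subsampling and col | subsampling. With a subsampled plane this snaps the even member of each two-unit pair onto the odd slot, so luma and chroma read their levels from consistent positions. The snap in isolation:

#include <stdio.h>

int main(void) {
  for (int ss = 0; ss <= 1; ss++) {
    printf("subsampling=%d:", ss);
    for (int mi = 0; mi < 6; mi++)
      printf("  %d->%d", mi, mi | ss);
    printf("\n");
  }
  /* With ss=1 both members of each pair {0,1},{2,3},... map to the odd
   * index, matching how lfl_u/lfl_v levels are indexed per chroma unit. */
  return 0;
}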
const int two_row_step = 2 << ssy; + const int row_stride = dst->stride << MI_SIZE_LOG2; + const int two_row_stride = row_stride << 1; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + uint8_t *lfl2; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + assert(lfm); + + // 1. vertical filtering. filter two rows at a time + for (int r = 0; + ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; + r += two_row_step) { + const int row = r | ssy; + const int row_next = row + row_step; + const int col = ssx; + int index = 0; + const int shift = get_index_shift(col, row, &index); + int index_next = 0; + const int shift_next = get_index_shift(col, row_next, &index_next); + const int has_next_row = row_next < cm->mi_params.mi_rows; + switch (pl) { + case 0: + mask_16x16 = lfm->left_y[TX_16X16].bits[index]; + mask_8x8 = lfm->left_y[TX_8X8].bits[index]; + mask_4x4 = lfm->left_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_ver[row][col]; + lfl2 = &lfm->lfl_y_ver[row_next][col]; + break; + case 1: + mask_16x16 = lfm->left_u[TX_16X16].bits[index]; + mask_8x8 = lfm->left_u[TX_8X8].bits[index]; + mask_4x4 = lfm->left_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_ver[row][col]; + lfl2 = &lfm->lfl_u_ver[row_next][col]; + break; + case 2: + mask_16x16 = lfm->left_v[TX_16X16].bits[index]; + mask_8x8 = lfm->left_v[TX_8X8].bits[index]; + mask_4x4 = lfm->left_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_ver[row][col]; + lfl2 = &lfm->lfl_v_ver[row_next][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; + uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; + uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; + uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; + uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; + uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; + if (!has_next_row) { + mask_16x16_1 = 0; + mask_8x8_1 = 0; + mask_4x4_1 = 0; + } + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_vert_row2( + ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, + mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + else + filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); +#else + filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); +#endif + dst->buf += two_row_stride; + } + // reset buf pointer for horizontal filtering + dst->buf = buf0; +} + +void av1_filter_block_plane_bitmask_horz( + AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl, + int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + uint8_t *const buf0 = dst->buf; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int row_step = 1 << ssy; + const int row_stride = dst->stride << MI_SIZE_LOG2; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + assert(lfm); + for (int r = 0; + ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64; + r += row_step) { + if (mi_row + r == 0) { + 
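The vertical pass slices each 64-bit word with mask_cutoff = 0xffff: a word holds four rows of 16 edge bits, so (mask >> shift) & 0xffff isolates one row, and the _0/_1 pairs hand two adjacent rows to the dual-row filter. For example:

#include <stdio.h>
#include <stdint.h>

int main(void) {
  /* one bits[index] word holding rows 0..3; row r occupies bits 16r..16r+15 */
  const uint64_t word = 0x00ff00000f0f0003ULL;
  for (int row = 0; row < 4; row++) {
    const int shift = row << 4;  /* what get_index_shift() returns */
    const unsigned row_bits = (unsigned)((word >> shift) & 0xffff);
    printf("row %d: 0x%04x\n", row, row_bits);
  }
  return 0;
}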
dst->buf += row_stride; + continue; + } + const int row = r | ssy; + const int col = ssx; + int index = 0; + const int shift = get_index_shift(col, row, &index); + switch (pl) { + case 0: + mask_16x16 = lfm->above_y[TX_16X16].bits[index]; + mask_8x8 = lfm->above_y[TX_8X8].bits[index]; + mask_4x4 = lfm->above_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_hor[row][col]; + break; + case 1: + mask_16x16 = lfm->above_u[TX_16X16].bits[index]; + mask_8x8 = lfm->above_u[TX_8X8].bits[index]; + mask_4x4 = lfm->above_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_hor[row][col]; + break; + case 2: + mask_16x16 = lfm->above_v[TX_16X16].bits[index]; + mask_8x8 = lfm->above_v[TX_8X8].bits[index]; + mask_4x4 = lfm->above_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_hor[row][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; + mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; + mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_horiz( + CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth); + else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#endif + dst->buf += row_stride; + } + // reset buf pointer for next block + dst->buf = buf0; +} + +void av1_filter_block_plane_ver(AV1_COMMON *const cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + int r, c; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int single_step = 1 << ssy; + const int r_step = 2 << ssy; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + uint8_t *lfl2; + + // filter two rows at a time + for (r = 0; r < cm->seq_params.mib_size && + ((mi_row + r) << MI_SIZE_LOG2 < cm->height); + r += r_step) { + for (c = 0; c < cm->seq_params.mib_size && + ((mi_col + c) << MI_SIZE_LOG2 < cm->width); + c += MI_SIZE_64X64) { + dst->buf += ((c << MI_SIZE_LOG2) >> ssx); + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); + assert(lfm); + const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; + const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(col, row, &index); + // current and next row should belong to the same mask_idx and index + // next row's shift + const int row_next = row + single_step; + int index_next = 0; + const int shift_next = get_index_shift(col, row_next, &index_next); + switch (pl) { + case 0: + mask_16x16 = lfm->left_y[TX_16X16].bits[index]; + mask_8x8 = lfm->left_y[TX_8X8].bits[index]; + mask_4x4 = lfm->left_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_ver[row][col]; + lfl2 = &lfm->lfl_y_ver[row_next][col]; + break; + case 1: + mask_16x16 = lfm->left_u[TX_16X16].bits[index]; + mask_8x8 = lfm->left_u[TX_8X8].bits[index]; + mask_4x4 = lfm->left_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_ver[row][col]; + lfl2 = &lfm->lfl_u_ver[row_next][col]; + break; + case 2: + mask_16x16 = lfm->left_v[TX_16X16].bits[index]; + mask_8x8 = lfm->left_v[TX_8X8].bits[index]; + mask_4x4 = lfm->left_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_ver[row][col]; + lfl2 = 
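On the high-bit-depth paths, dst->buf is not really a byte buffer: libaom stores uint16_t samples and passes the buffer around as a disguised uint8_t *, which CONVERT_TO_SHORTPTR undoes at the call site. A simplified restatement of that convention (the real macros live in aom_dsp; the DEMO_ names are stand-ins, and the disguised pointer is never dereferenced in that form):

#include <stdint.h>
#include <stdio.h>

#define DEMO_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
#define DEMO_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))

int main(void) {
  static uint16_t pixels[4] = { 100, 200, 300, 400 };
  uint8_t *disguised = DEMO_TO_BYTEPTR(pixels);       /* as held in buf_2d */
  uint16_t *recovered = DEMO_TO_SHORTPTR(disguised);  /* at the filter call */
  printf("round trip ok: %d\n", recovered[2] == pixels[2]);
  return 0;
}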
&lfm->lfl_v_ver[row_next][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff; + uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff; + uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff; + uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff; + uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff; + uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_vert_row2( + ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0, + mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth); + else + filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl, + mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, + &cm->lf_info, lfl, lfl2); +#else + filter_selectively_vert_row2( + ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0, + mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2); +#endif + dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); + } + dst->buf += 2 * MI_SIZE * dst->stride; + } +} + +void av1_filter_block_plane_hor(AV1_COMMON *const cm, + struct macroblockd_plane *const plane_ptr, + int pl, int mi_row, int mi_col) { + struct buf_2d *const dst = &plane_ptr->dst; + int r, c; + const int ssx = plane_ptr->subsampling_x; + const int ssy = plane_ptr->subsampling_y; + const int mask_cutoff = 0xffff; + const int r_step = 1 << ssy; + uint64_t mask_16x16 = 0; + uint64_t mask_8x8 = 0; + uint64_t mask_4x4 = 0; + uint8_t *lfl; + + for (r = 0; r < cm->seq_params.mib_size && + ((mi_row + r) << MI_SIZE_LOG2 < cm->height); + r += r_step) { + for (c = 0; c < cm->seq_params.mib_size && + ((mi_col + c) << MI_SIZE_LOG2 < cm->width); + c += MI_SIZE_64X64) { + if (mi_row + r == 0) continue; + + dst->buf += ((c << MI_SIZE_LOG2) >> ssx); + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c); + assert(lfm); + const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64; + const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64; + int index = 0; + const int shift = get_index_shift(col, row, &index); + switch (pl) { + case 0: + mask_16x16 = lfm->above_y[TX_16X16].bits[index]; + mask_8x8 = lfm->above_y[TX_8X8].bits[index]; + mask_4x4 = lfm->above_y[TX_4X4].bits[index]; + lfl = &lfm->lfl_y_hor[row][col]; + break; + case 1: + mask_16x16 = lfm->above_u[TX_16X16].bits[index]; + mask_8x8 = lfm->above_u[TX_8X8].bits[index]; + mask_4x4 = lfm->above_u[TX_4X4].bits[index]; + lfl = &lfm->lfl_u_hor[row][col]; + break; + case 2: + mask_16x16 = lfm->above_v[TX_16X16].bits[index]; + mask_8x8 = lfm->above_v[TX_8X8].bits[index]; + mask_4x4 = lfm->above_v[TX_4X4].bits[index]; + lfl = &lfm->lfl_v_hor[row][col]; + break; + default: assert(pl >= 0 && pl <= 2); return; + } + mask_16x16 = (mask_16x16 >> shift) & mask_cutoff; + mask_8x8 = (mask_8x8 >> shift) & mask_cutoff; + mask_4x4 = (mask_4x4 >> shift) & mask_cutoff; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf), + dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl, + (int)cm->seq_params.bit_depth); + else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#else + filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16, + mask_8x8, mask_4x4, &cm->lf_info, lfl); +#endif 
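av1_filter_block_plane_ver() and _hor() walk a superblock in 64x64-mi chunks; each chunk's pixels are reached by bumping the destination pointer by (c << MI_SIZE_LOG2) >> ssx and restoring it afterwards. The offset arithmetic on its own (MI_SIZE_LOG2 = 2, i.e. 4-pixel mi units):

#include <stdio.h>

#define MI_SIZE_LOG2 2 /* one mi unit = 4 luma pixels */

int main(void) {
  const int MI_SIZE_64X64 = 16; /* 64 / 4 */
  for (int ssx = 0; ssx <= 1; ssx++) {
    printf("ssx=%d:", ssx);
    for (int c = 0; c < 48; c += MI_SIZE_64X64)
      printf("  c=%2d -> +%3d px", c, (c << MI_SIZE_LOG2) >> ssx);
    printf("\n");
  }
  return 0;
}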
+ dst->buf -= ((c << MI_SIZE_LOG2) >> ssx); + } + dst->buf += MI_SIZE * dst->stride; + } +} + +void av1_store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, TX_SIZE tx_size, + MB_MODE_INFO *mbmi) { + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size]; + const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size]; + const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const int is_square_transform_size = tx_size <= TX_64X64; + int mask_id = 0; + int offset = 0; + const int half_ratio_tx_size_max32 = + (tx_size > TX_64X64) & (tx_size <= TX_32X16); + if (is_square_transform_size) { + switch (tx_size) { + case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break; + case TX_8X8: + mask_id = mask_id_table_tx_8x8[bsize]; + offset = 19; + break; + case TX_16X16: + mask_id = mask_id_table_tx_16x16[bsize]; + offset = 33; + break; + case TX_32X32: + mask_id = mask_id_table_tx_32x32[bsize]; + offset = 42; + break; + case TX_64X64: mask_id = 46; break; + default: assert(!is_square_transform_size); return; + } + mask_id += offset; + } else if (half_ratio_tx_size_max32) { + int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size]; + mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1); + } else if (tx_size == TX_32X64) { + mask_id = 59; + } else if (tx_size == TX_64X32) { + mask_id = 60; + } else { // quarter ratio tx size + mask_id = 61 + (tx_size - TX_4X16); + } + int index = 0; + const int row = mi_row % MI_SIZE_64X64; + const int col = mi_col % MI_SIZE_64X64; + const int shift = get_index_shift(col, row, &index); + const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col; + for (int i = 0; i + index < 4; ++i) { + // y vertical. + lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // y horizontal. + lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + // u/v vertical. + lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // u/v horizontal. + lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + } +} + +void av1_store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) { + // Use a lookup table that provides one bitmask for a given block size and + // a univariant transform size. 
+ int index; + int shift; + int row; + int col; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size]; + const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size]; + const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize( + mbmi->sb_type, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y)]; + const int is_square_transform_size = mbmi->tx_size <= TX_64X64; + int mask_id = 0; + int offset = 0; + const int half_ratio_tx_size_max32 = + (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16); + if (is_square_transform_size) { + switch (mbmi->tx_size) { + case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break; + case TX_8X8: + mask_id = mask_id_table_tx_8x8[bsize]; + offset = 19; + break; + case TX_16X16: + mask_id = mask_id_table_tx_16x16[bsize]; + offset = 33; + break; + case TX_32X32: + mask_id = mask_id_table_tx_32x32[bsize]; + offset = 42; + break; + case TX_64X64: mask_id = 46; break; + default: assert(!is_square_transform_size); return; + } + mask_id += offset; + } else if (half_ratio_tx_size_max32) { + int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size]; + mask_id = + 47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1); + } else if (mbmi->tx_size == TX_32X64) { + mask_id = 59; + } else if (mbmi->tx_size == TX_64X32) { + mask_id = 60; + } else { // quarter ratio tx size + mask_id = 61 + (mbmi->tx_size - TX_4X16); + } + row = mi_row % MI_SIZE_64X64; + col = mi_col % MI_SIZE_64X64; + shift = get_index_shift(col, row, &index); + const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col; + for (int i = 0; i + index < 4; ++i) { + // y vertical. + lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // y horizontal. + lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + // u/v vertical. + lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |= + (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift); + // u/v horizontal. + lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |= + (above_mask_univariant_reordered[mask_id].bits[i] << shift); + } +} + +void av1_store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + int is_horz_coding_block_border, + int is_vert_coding_block_border) { + int index; + int shift; + int row; + LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col); + const int row_start = mi_row % MI_SIZE_64X64; + const int col_start = mi_col % MI_SIZE_64X64; + shift = get_index_shift(col_start, row_start, &index); + if (is_horz_coding_block_border) { + const int block_shift = shift + mi_size_wide[bsize]; + assert(block_shift <= 64); + const uint64_t right_edge_shift = + (block_shift == 64) ? 0xffffffffffffffff : ((uint64_t)1 << block_shift); + const uint64_t left_edge_shift = (block_shift == 64) + ? (((uint64_t)1 << shift) - 1) + : ((uint64_t)1 << shift); + assert(right_edge_shift > left_edge_shift); + const uint64_t top_edge_mask = right_edge_shift - left_edge_shift; + lfm->is_horz_border.bits[index] |= top_edge_mask; + } + if (is_vert_coding_block_border) { + const int is_vert_border = mask_id_table_vert_border[bsize]; + const int vert_shift = block_size_high[bsize] <= 8 ? 
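av1_store_bitmask_other_info() marks a horizontal coding-block border by synthesizing a run of mi_size_wide[bsize] consecutive bits starting at shift. The subtraction trick, including the block_shift == 64 special case that sidesteps an undefined 1 << 64, can be checked standalone:

#include <stdio.h>
#include <stdint.h>

static uint64_t bit_run(int shift, int width) {
  const int end = shift + width;
  const uint64_t hi =
      (end == 64) ? 0xffffffffffffffffULL : ((uint64_t)1 << end);
  const uint64_t lo =
      (end == 64) ? (((uint64_t)1 << shift) - 1) : ((uint64_t)1 << shift);
  return hi - lo; /* width consecutive 1-bits starting at `shift` */
}

int main(void) {
  printf("0x%016llx\n", (unsigned long long)bit_run(4, 8));   /* 0xff0 */
  printf("0x%016llx\n", (unsigned long long)bit_run(48, 16)); /* top 16 bits */
  return 0;
}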
shift : col_start; + for (int i = 0; i + index < 4; ++i) { + lfm->is_vert_border.bits[i + index] |= + (left_mask_univariant_reordered[is_vert_border].bits[i] + << vert_shift); + } + } + const int is_skip = mbmi->skip && is_inter_block(mbmi); + if (is_skip) { + const int is_skip_mask = mask_id_table_tx_4x4[bsize]; + for (int i = 0; i + index < 4; ++i) { + lfm->skip.bits[i + index] |= + (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift); + } + } + const uint8_t level_vert_y = + av1_get_filter_level(cm, &cm->lf_info, 0, 0, mbmi); + const uint8_t level_horz_y = + av1_get_filter_level(cm, &cm->lf_info, 1, 0, mbmi); + const uint8_t level_u = av1_get_filter_level(cm, &cm->lf_info, 0, 1, mbmi); + const uint8_t level_v = av1_get_filter_level(cm, &cm->lf_info, 0, 2, mbmi); + for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) { + index = 0; + row = r % MI_SIZE_64X64; + memset(&lfm->lfl_y_ver[row][col_start], level_vert_y, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_y_hor[row][col_start], level_horz_y, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_u_ver[row][col_start], level_u, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_u_hor[row][col_start], level_u, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_v_ver[row][col_start], level_v, + sizeof(uint8_t) * mi_size_wide[bsize]); + memset(&lfm->lfl_v_hor[row][col_start], level_v, + sizeof(uint8_t) * mi_size_wide[bsize]); + } +} +#endif // CONFIG_LPF_MASK diff --git a/libs/libaom/src/av1/common/mv.h b/libs/libaom/src/av1/common/mv.h new file mode 100644 index 000000000..be539e820 --- /dev/null +++ b/libs/libaom/src/av1/common/mv.h @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_MV_H_ +#define AOM_AV1_COMMON_MV_H_ + +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "aom_dsp/aom_filter.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define INVALID_MV 0x80008000 +#define GET_MV_RAWPEL(x) (((x) + 3 + ((x) >= 0)) >> 3) +#define GET_MV_SUBPEL(x) ((x)*8) + +#define MARK_MV_INVALID(mv) \ + do { \ + ((int_mv *)(mv))->as_int = INVALID_MV; \ + } while (0); +#define CHECK_MV_EQUAL(x, y) (((x).row == (y).row) && ((x).col == (y).col)) + +// The motion vector in units of full pixel +typedef struct fullpel_mv { + int16_t row; + int16_t col; +} FULLPEL_MV; + +// The motion vector in units of 1/8-pel +typedef struct mv { + int16_t row; + int16_t col; +} MV; + +static const MV kZeroMv = { 0, 0 }; +static const FULLPEL_MV kZeroFullMv = { 0, 0 }; + +typedef union int_mv { + uint32_t as_int; + MV as_mv; + FULLPEL_MV as_fullmv; +} int_mv; /* facilitates faster equality tests and copies */ + +typedef struct mv32 { + int32_t row; + int32_t col; +} MV32; + +// The mv limit for fullpel mvs +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} FullMvLimits; + +// The mv limit for subpel mvs +typedef struct { + int col_min; + int col_max; + int row_min; + int row_max; +} SubpelMvLimits; + +static AOM_INLINE FULLPEL_MV get_fullmv_from_mv(const MV *subpel_mv) { + const FULLPEL_MV full_mv = { (int16_t)GET_MV_RAWPEL(subpel_mv->row), + (int16_t)GET_MV_RAWPEL(subpel_mv->col) }; + return full_mv; +} + +static AOM_INLINE MV get_mv_from_fullmv(const FULLPEL_MV *full_mv) { + const MV subpel_mv = { (int16_t)GET_MV_SUBPEL(full_mv->row), + (int16_t)GET_MV_SUBPEL(full_mv->col) }; + return subpel_mv; +} + +static AOM_INLINE void convert_fullmv_to_mv(int_mv *mv) { + mv->as_mv = get_mv_from_fullmv(&mv->as_fullmv); +} + +// Bits of precision used for the model +#define WARPEDMODEL_PREC_BITS 16 +#define WARPEDMODEL_ROW3HOMO_PREC_BITS 16 + +#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS) +#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3)) +#define WARPEDMODEL_ROW3HOMO_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 2)) + +// Bits of subpel precision for warped interpolation +#define WARPEDPIXEL_PREC_BITS 6 +#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS) + +#define WARP_PARAM_REDUCE_BITS 6 + +#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS) + +/* clang-format off */ +enum { + IDENTITY = 0, // identity transformation, 0-parameter + TRANSLATION = 1, // translational motion 2-parameter + ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter + AFFINE = 3, // affine, 6-parameter + TRANS_TYPES, +} UENUM1BYTE(TransformationType); +/* clang-format on */ + +// Number of types used for global motion (must be >= 3 and <= TRANS_TYPES) +// The following can be useful: +// GLOBAL_TRANS_TYPES 3 - up to rotation-zoom +// GLOBAL_TRANS_TYPES 4 - up to affine +// GLOBAL_TRANS_TYPES 6 - up to hor/ver trapezoids +// GLOBAL_TRANS_TYPES 7 - up to full homography +#define GLOBAL_TRANS_TYPES 4 + +typedef struct { + int global_warp_allowed; + int local_warp_allowed; +} WarpTypesAllowed; + +// number of parameters used by each transformation in TransformationTypes +static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; + +// The order of values in the wmmat matrix below is best described +// by the homography: +// [x' (m2 m3 m0 [x +// z . 
y' = m4 m5 m1 * y +// 1] m6 m7 1) 1] +typedef struct { + int32_t wmmat[8]; + int16_t alpha, beta, gamma, delta; + TransformationType wmtype; + int8_t invalid; +} WarpedMotionParams; + +/* clang-format off */ +static const WarpedMotionParams default_warp_params = { + { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, + 0 }, + 0, 0, 0, 0, + IDENTITY, + 0, +}; +/* clang-format on */ + +// The following constants describe the various precisions +// of different parameters in the global motion experiment. +// +// Given the general homography: +// [x' (a b c [x +// z . y' = d e f * y +// 1] g h i) 1] +// +// Constants using the name ALPHA here are related to parameters +// a, b, d, e. Constants using the name TRANS are related +// to parameters c and f. +// +// Anything ending in PREC_BITS is the number of bits of precision +// to maintain when converting from double to integer. +// +// The ABS parameters are used to create an upper and lower bound +// for each parameter. In other words, after a parameter is integerized +// it is clamped between -(1 << ABS_XXX_BITS) and (1 << ABS_XXX_BITS). +// +// XXX_PREC_DIFF and XXX_DECODE_FACTOR +// are computed once here to prevent repetitive +// computation on the decoder side. These are +// to allow the global motion parameters to be encoded in a lower +// precision than the warped model precision. This means that they +// need to be changed to warped precision when they are decoded. +// +// XX_MIN, XX_MAX are also computed to avoid repeated computation + +#define SUBEXPFIN_K 3 +#define GM_TRANS_PREC_BITS 6 +#define GM_ABS_TRANS_BITS 12 +#define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3) +#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS) +#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3) +#define GM_TRANS_DECODE_FACTOR (1 << GM_TRANS_PREC_DIFF) +#define GM_TRANS_ONLY_DECODE_FACTOR (1 << GM_TRANS_ONLY_PREC_DIFF) + +#define GM_ALPHA_PREC_BITS 15 +#define GM_ABS_ALPHA_BITS 12 +#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS) +#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF) + +#define GM_ROW3HOMO_PREC_BITS 16 +#define GM_ABS_ROW3HOMO_BITS 11 +#define GM_ROW3HOMO_PREC_DIFF \ + (WARPEDMODEL_ROW3HOMO_PREC_BITS - GM_ROW3HOMO_PREC_BITS) +#define GM_ROW3HOMO_DECODE_FACTOR (1 << GM_ROW3HOMO_PREC_DIFF) + +#define GM_TRANS_MAX (1 << GM_ABS_TRANS_BITS) +#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS) +#define GM_ROW3HOMO_MAX (1 << GM_ABS_ROW3HOMO_BITS) + +#define GM_TRANS_MIN -GM_TRANS_MAX +#define GM_ALPHA_MIN -GM_ALPHA_MAX +#define GM_ROW3HOMO_MIN -GM_ROW3HOMO_MAX + +static INLINE int block_center_x(int mi_col, BLOCK_SIZE bs) { + const int bw = block_size_wide[bs]; + return mi_col * MI_SIZE + bw / 2 - 1; +} + +static INLINE int block_center_y(int mi_row, BLOCK_SIZE bs) { + const int bh = block_size_high[bs]; + return mi_row * MI_SIZE + bh / 2 - 1; +} + +static INLINE int convert_to_trans_prec(int allow_hp, int coor) { + if (allow_hp) + return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3); + else + return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2; +} +static INLINE void integer_mv_precision(MV *mv) { + int mod = (mv->row % 8); + if (mod != 0) { + mv->row -= mod; + if (abs(mod) > 4) { + if (mod > 0) { + mv->row += 8; + } else { + mv->row -= 8; + } + } + } + + mod = (mv->col % 8); + if (mod != 0) { + mv->col -= mod; + if (abs(mod) > 4) { + if (mod > 0) { + mv->col += 8; + } else { + mv->col -= 8; + } + } + } +} +// Convert a 
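integer_mv_precision() above rounds a 1/8-pel component to the nearest full pel, with exact halves rounding toward zero. A self-contained mirror of that rule (demo name, not the library function):

#include <stdio.h>

static int round_to_fullpel(int v) {
  int mod = v % 8;
  if (mod != 0) {
    v -= mod;
    if (mod > 4) v += 8;   /* abs(mod) > 4: round away */
    if (mod < -4) v -= 8;
  }
  return v;
}

int main(void) {
  const int samples[] = { 11, 12, 13, -11, -12, -13 };
  for (int i = 0; i < 6; i++)
    printf("%3d/8 pel -> %3d/8 pel (%d full pels)\n", samples[i],
           round_to_fullpel(samples[i]), round_to_fullpel(samples[i]) / 8);
  return 0;
}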
global motion vector into a motion vector at the centre of the +// given block. +// +// The resulting motion vector will have three fractional bits of precision. If +// allow_hp is zero, the bottom bit will always be zero. If CONFIG_AMVR and +// is_integer is true, the bottom three bits will be zero (so the motion vector +// represents an integer) +static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, + int allow_hp, BLOCK_SIZE bsize, + int mi_col, int mi_row, + int is_integer) { + int_mv res; + + if (gm->wmtype == IDENTITY) { + res.as_int = 0; + return res; + } + + const int32_t *mat = gm->wmmat; + int x, y, tx, ty; + + if (gm->wmtype == TRANSLATION) { + // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16) + // bits of fractional precision. The offset for a translation is stored in + // entries 0 and 1. For translations, all but the top three (two if + // cm->features.allow_high_precision_mv is false) fractional bits are always + // zero. + // + // After the right shifts, there are 3 fractional bits of precision. If + // allow_hp is false, the bottom bit is always zero (so we don't need a + // call to convert_to_trans_prec here) + res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF; + res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF; + assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp)); + if (is_integer) { + integer_mv_precision(&res.as_mv); + } + return res; + } + + x = block_center_x(mi_col, bsize); + y = block_center_y(mi_row, bsize); + + if (gm->wmtype == ROTZOOM) { + assert(gm->wmmat[5] == gm->wmmat[2]); + assert(gm->wmmat[4] == -gm->wmmat[3]); + } + + const int xc = + (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0]; + const int yc = + mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1]; + tx = convert_to_trans_prec(allow_hp, xc); + ty = convert_to_trans_prec(allow_hp, yc); + + res.as_mv.row = ty; + res.as_mv.col = tx; + + if (is_integer) { + integer_mv_precision(&res.as_mv); + } + return res; +} + +static INLINE TransformationType get_wmtype(const WarpedMotionParams *gm) { + if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] && + gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) { + return ((!gm->wmmat[1] && !gm->wmmat[0]) ? IDENTITY : TRANSLATION); + } + if (gm->wmmat[2] == gm->wmmat[5] && gm->wmmat[3] == -gm->wmmat[4]) + return ROTZOOM; + else + return AFFINE; +} + +typedef struct candidate_mv { + int_mv this_mv; + int_mv comp_mv; +} CANDIDATE_MV; + +static INLINE int is_zero_mv(const MV *mv) { + return *((const uint32_t *)mv) == 0; +} + +static INLINE int is_equal_mv(const MV *a, const MV *b) { + return *((const uint32_t *)a) == *((const uint32_t *)b); +} + +static INLINE void clamp_mv(MV *mv, const SubpelMvLimits *mv_limits) { + mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); + mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); +} + +static INLINE void clamp_fullmv(FULLPEL_MV *mv, const FullMvLimits *mv_limits) { + mv->col = clamp(mv->col, mv_limits->col_min, mv_limits->col_max); + mv->row = clamp(mv->row, mv_limits->row_min, mv_limits->row_max); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_MV_H_ diff --git a/libs/libaom/src/av1/common/mvref_common.c b/libs/libaom/src/av1/common/mvref_common.c new file mode 100644 index 000000000..db3098cc0 --- /dev/null +++ b/libs/libaom/src/av1/common/mvref_common.c @@ -0,0 +1,1511 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/mvref_common.h" +#include "av1/common/warped_motion.h" + +// Although we assign 32 bit integers, all the values are strictly under 14 +// bits. +static int div_mult[32] = { 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340, + 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092, + 1024, 963, 910, 862, 819, 780, 744, 712, + 682, 655, 630, 606, 585, 564, 546, 528 }; + +// TODO(jingning): Consider the use of lookup table for (num / den) +// altogether. +static AOM_INLINE void get_mv_projection(MV *output, MV ref, int num, int den) { + den = AOMMIN(den, MAX_FRAME_DISTANCE); + num = num > 0 ? AOMMIN(num, MAX_FRAME_DISTANCE) + : AOMMAX(num, -MAX_FRAME_DISTANCE); + const int mv_row = + ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14); + const int mv_col = + ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14); + const int clamp_max = MV_UPP - 1; + const int clamp_min = MV_LOW + 1; + output->row = (int16_t)clamp(mv_row, clamp_min, clamp_max); + output->col = (int16_t)clamp(mv_col, clamp_min, clamp_max); +} + +void av1_copy_frame_mvs(const AV1_COMMON *const cm, + const MB_MODE_INFO *const mi, int mi_row, int mi_col, + int x_mis, int y_mis) { + const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1); + MV_REF *frame_mvs = + cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); + x_mis = ROUND_POWER_OF_TWO(x_mis, 1); + y_mis = ROUND_POWER_OF_TWO(y_mis, 1); + int w, h; + + for (h = 0; h < y_mis; h++) { + MV_REF *mv = frame_mvs; + for (w = 0; w < x_mis; w++) { + mv->ref_frame = NONE_FRAME; + mv->mv.as_int = 0; + + for (int idx = 0; idx < 2; ++idx) { + MV_REFERENCE_FRAME ref_frame = mi->ref_frame[idx]; + if (ref_frame > INTRA_FRAME) { + int8_t ref_idx = cm->ref_frame_side[ref_frame]; + if (ref_idx) continue; + if ((abs(mi->mv[idx].as_mv.row) > REFMVS_LIMIT) || + (abs(mi->mv[idx].as_mv.col) > REFMVS_LIMIT)) + continue; + mv->ref_frame = ref_frame; + mv->mv.as_int = mi->mv[idx].as_int; + } + } + mv++; + } + frame_mvs += frame_mvs_stride; + } +} + +static AOM_INLINE void add_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2], + uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count, + CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, + int_mv *gm_mv_candidates, const WarpedMotionParams *gm_params, + uint16_t weight) { + if (!is_inter_block(candidate)) return; + assert(weight % 2 == 0); + int index, ref; + + if (rf[1] == NONE_FRAME) { + // single reference frame + for (ref = 0; ref < 2; ++ref) { + if (candidate->ref_frame[ref] == rf[0]) { + const int is_gm_block = + is_global_mv_block(candidate, gm_params[rf[0]].wmtype); + const int_mv this_refmv = + is_gm_block ? gm_mv_candidates[0] : get_block_mv(candidate, ref); + for (index = 0; index < *refmv_count; ++index) { + if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) { + ref_mv_weight[index] += weight; + break; + } + } + + // Add a new item to the list. 
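div_mult[den] is round(16384 / den), so multiplying by it and shifting right by 14 approximates ref * num / den entirely in integer arithmetic; ROUND_POWER_OF_TWO_SIGNED supplies sign-aware rounding. A standalone check (the rounding helper below restates the macro's behavior, it is not the library definition):

#include <stdio.h>

static int round_power_of_two_signed(long long v, int n) {
  const long long add = 1LL << (n - 1);
  return (int)(v >= 0 ? (v + add) >> n : -((-v + add) >> n));
}

int main(void) {
  static const int div_mult7 = 2340; /* round(16384 / 7) */
  const int ref = 37, num = 3, den = 7;
  const int scaled =
      round_power_of_two_signed((long long)ref * num * div_mult7, 14);
  printf("den=%d fixed point: %d, exact: %.3f\n", den, scaled,
         ref * (double)num / den);
  return 0;
}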
+ if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[index].this_mv = this_refmv; + ref_mv_weight[index] = weight; + ++(*refmv_count); + } + if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; + ++*ref_match_count; + } + } + } else { + // compound reference frame + if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) { + int_mv this_refmv[2]; + + for (ref = 0; ref < 2; ++ref) { + if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype)) + this_refmv[ref] = gm_mv_candidates[ref]; + else + this_refmv[ref] = get_block_mv(candidate, ref); + } + + for (index = 0; index < *refmv_count; ++index) { + if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) && + (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)) { + ref_mv_weight[index] += weight; + break; + } + } + + // Add a new item to the list. + if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[index].this_mv = this_refmv[0]; + ref_mv_stack[index].comp_mv = this_refmv[1]; + ref_mv_weight[index] = weight; + ++(*refmv_count); + } + if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count; + ++*ref_match_count; + } + } +} + +static AOM_INLINE void scan_row_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_col, + const MV_REFERENCE_FRAME rf[2], int row_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, + uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_row_offset, + int *processed_rows) { + int end_mi = AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col); + end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]); + const int width_8x8 = mi_size_wide[BLOCK_8X8]; + const int width_16x16 = mi_size_wide[BLOCK_16X16]; + int col_offset = 0; + // TODO(jingning): Revisit this part after cb4x4 is stable. + if (abs(row_offset) > 1) { + col_offset = 1; + if ((mi_col & 0x01) && xd->width < width_8x8) --col_offset; + } + const int use_step_16 = (xd->width >= 16); + MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride; + + for (int i = 0; i < end_mi;) { + const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i]; + const int candidate_bsize = candidate->sb_type; + const int n4_w = mi_size_wide[candidate_bsize]; + int len = AOMMIN(xd->width, n4_w); + if (use_step_16) + len = AOMMAX(width_16x16, len); + else if (abs(row_offset) > 1) + len = AOMMAX(len, width_8x8); + + uint16_t weight = 2; + if (xd->width >= width_8x8 && xd->width <= n4_w) { + uint16_t inc = AOMMIN(-max_row_offset + row_offset + 1, + mi_size_high[candidate_bsize]); + // Obtain range used in weight calculation. + weight = AOMMAX(weight, inc); + // Update processed rows. 
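add_ref_mv_candidate() implements a small weighted de-duplicating stack: an incoming MV either bumps the weight of a matching entry or is appended while room remains. Reduced to its essentials:

#include <stdio.h>
#include <stdint.h>

#define DEMO_STACK_SIZE 8

typedef struct { uint32_t mv; uint16_t weight; } DemoEntry;

static void push_candidate(DemoEntry *stack, int *count, uint32_t mv,
                           uint16_t weight) {
  for (int i = 0; i < *count; i++) {
    if (stack[i].mv == mv) {  /* duplicate MV: accumulate its weight */
      stack[i].weight += weight;
      return;
    }
  }
  if (*count < DEMO_STACK_SIZE) {  /* new candidate: append */
    stack[*count].mv = mv;
    stack[*count].weight = weight;
    ++*count;
  }
}

int main(void) {
  DemoEntry stack[DEMO_STACK_SIZE];
  int count = 0;
  push_candidate(stack, &count, 0x00010002, 4);
  push_candidate(stack, &count, 0x00010002, 2); /* same MV: weight 4 + 2 */
  push_candidate(stack, &count, 0xfffe0003, 2);
  for (int i = 0; i < count; i++)
    printf("entry %d: mv=0x%08x weight=%u\n", i, stack[i].mv,
           stack[i].weight);
  return 0;
}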
+ *processed_rows = inc - row_offset - 1; + } + + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, len * weight); + + i += len; + } +} + +static AOM_INLINE void scan_col_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int mi_row, + const MV_REFERENCE_FRAME rf[2], int col_offset, CANDIDATE_MV *ref_mv_stack, + uint16_t *ref_mv_weight, uint8_t *refmv_count, uint8_t *ref_match_count, + uint8_t *newmv_count, int_mv *gm_mv_candidates, int max_col_offset, + int *processed_cols) { + int end_mi = AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); + end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]); + const int n8_h_8 = mi_size_high[BLOCK_8X8]; + const int n8_h_16 = mi_size_high[BLOCK_16X16]; + int i; + int row_offset = 0; + if (abs(col_offset) > 1) { + row_offset = 1; + if ((mi_row & 0x01) && xd->height < n8_h_8) --row_offset; + } + const int use_step_16 = (xd->height >= 16); + + for (i = 0; i < end_mi;) { + const MB_MODE_INFO *const candidate = + xd->mi[(row_offset + i) * xd->mi_stride + col_offset]; + const int candidate_bsize = candidate->sb_type; + const int n4_h = mi_size_high[candidate_bsize]; + int len = AOMMIN(xd->height, n4_h); + if (use_step_16) + len = AOMMAX(n8_h_16, len); + else if (abs(col_offset) > 1) + len = AOMMAX(len, n8_h_8); + + int weight = 2; + if (xd->height >= n8_h_8 && xd->height <= n4_h) { + int inc = AOMMIN(-max_col_offset + col_offset + 1, + mi_size_wide[candidate_bsize]); + // Obtain range used in weight calculation. + weight = AOMMAX(weight, inc); + // Update processed cols. + *processed_cols = inc - col_offset - 1; + } + + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, len * weight); + + i += len; + } +} + +static AOM_INLINE void scan_blk_mbmi( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const int mi_row, + const int mi_col, const MV_REFERENCE_FRAME rf[2], int row_offset, + int col_offset, CANDIDATE_MV *ref_mv_stack, uint16_t *ref_mv_weight, + uint8_t *ref_match_count, uint8_t *newmv_count, int_mv *gm_mv_candidates, + uint8_t *refmv_count) { + const TileInfo *const tile = &xd->tile; + POSITION mi_pos; + + mi_pos.row = row_offset; + mi_pos.col = col_offset; + + if (is_inside(tile, mi_col, mi_row, &mi_pos)) { + const MB_MODE_INFO *const candidate = + xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col]; + const int len = mi_size_wide[BLOCK_8X8]; + + add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count, + newmv_count, ref_mv_stack, ref_mv_weight, + gm_mv_candidates, cm->global_motion, 2 * len); + } // Analyze a single 8x8 block motion information. 
+}
+
+static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                         int mi_row, int mi_col, int bs) {
+  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+  const int mask_row = mi_row & (sb_mi_size - 1);
+  const int mask_col = mi_col & (sb_mi_size - 1);
+
+  if (bs > mi_size_wide[BLOCK_64X64]) return 0;
+
+  // In a split partition, all blocks apart from the bottom-right one have a
+  // top right.
+  int has_tr = !((mask_row & bs) && (mask_col & bs));
+
+  // bs > 0 and bs is a power of 2
+  assert(bs > 0 && !(bs & (bs - 1)));
+
+  // For each 4x4 group of blocks, when the bottom right is decoded the blocks
+  // to its right have not yet been decoded, so the bottom right does not have
+  // a top right.
+  while (bs < sb_mi_size) {
+    if (mask_col & bs) {
+      if ((mask_col & (2 * bs)) && (mask_row & (2 * bs))) {
+        has_tr = 0;
+        break;
+      }
+    } else {
+      break;
+    }
+    bs <<= 1;
+  }
+
+  // The left-hand rectangle of two vertical rectangles always has a top right
+  // (as the block above will have been decoded).
+  if (xd->width < xd->height)
+    if (!xd->is_sec_rect) has_tr = 1;
+
+  // The bottom rectangle of two horizontal rectangles never has a top right
+  // (as the block to its right won't have been decoded).
+  if (xd->width > xd->height)
+    if (xd->is_sec_rect) has_tr = 0;
+
+  // The bottom-left square of a Vertical A (in the old format) does not have
+  // a top right, as it is decoded before the right-hand rectangle of the
+  // partition.
+  if (xd->mi[0]->partition == PARTITION_VERT_A) {
+    if (xd->width == xd->height)
+      if (mask_row & bs) has_tr = 0;
+  }
+
+  return has_tr;
+}
+
+static int check_sb_border(const int mi_row, const int mi_col,
+                           const int row_offset, const int col_offset) {
+  const int sb_mi_size = mi_size_wide[BLOCK_64X64];
+  const int row = mi_row & (sb_mi_size - 1);
+  const int col = mi_col & (sb_mi_size - 1);
+
+  if (row + row_offset < 0 || row + row_offset >= sb_mi_size ||
+      col + col_offset < 0 || col + col_offset >= sb_mi_size)
+    return 0;
+
+  return 1;
+}
+
+static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                          int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame,
+                          int blk_row, int blk_col, int_mv *gm_mv_candidates,
+                          uint8_t *const refmv_count,
+                          CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE],
+                          uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE],
+                          int16_t *mode_context) {
+  POSITION mi_pos;
+  mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
+  mi_pos.col = (mi_col & 0x01) ?
blk_col : blk_col + 1; + + if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0; + + const TPL_MV_REF *prev_frame_mvs = + cm->tpl_mvs + + ((mi_row + mi_pos.row) >> 1) * (cm->mi_params.mi_stride >> 1) + + ((mi_col + mi_pos.col) >> 1); + if (prev_frame_mvs->mfmv0.as_int == INVALID_MV) return 0; + + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + + const uint16_t weight_unit = 1; // mi_size_wide[BLOCK_8X8]; + const int cur_frame_index = cm->cur_frame->order_hint; + const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]); + const int frame0_index = buf_0->order_hint; + const int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, frame0_index); + int idx; + const int allow_high_precision_mv = cm->features.allow_high_precision_mv; + const int force_integer_mv = cm->features.cur_frame_force_integer_mv; + + int_mv this_refmv; + get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_0, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&this_refmv.as_mv, allow_high_precision_mv, + force_integer_mv); + + if (rf[1] == NONE_FRAME) { + if (blk_row == 0 && blk_col == 0) { + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + } + + for (idx = 0; idx < *refmv_count; ++idx) + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break; + + if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; + + if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_weight[idx] = 2 * weight_unit; + ++(*refmv_count); + } + } else { + // Process compound inter mode + const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]); + const int frame1_index = buf_1->order_hint; + const int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, frame1_index); + int_mv comp_refmv; + get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv, + cur_offset_1, prev_frame_mvs->ref_frame_offset); + lower_mv_precision(&comp_refmv.as_mv, allow_high_precision_mv, + force_integer_mv); + + if (blk_row == 0 && blk_col == 0) { + if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 || + abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 || + abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 || + abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16) + mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + } + + for (idx = 0; idx < *refmv_count; ++idx) { + if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int && + comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int) + break; + } + + if (idx < *refmv_count) ref_mv_weight[idx] += 2 * weight_unit; + + if (idx == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) { + ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int; + ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int; + ref_mv_weight[idx] = 2 * weight_unit; + ++(*refmv_count); + } + } + + return 1; +} + +static AOM_INLINE void process_compound_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, + const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2], + int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) { + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx]; + + for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) { + if (can_rf == 
rf[cmp_idx] && ref_id_count[cmp_idx] < 2) { + ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx]; + ++ref_id_count[cmp_idx]; + } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[can_rf] != + cm->ref_frame_sign_bias[rf[cmp_idx]]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv; + ++ref_diff_count[cmp_idx]; + } + } + } +} + +static AOM_INLINE void process_single_ref_mv_candidate( + const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref_frame, uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE]) { + for (int rf_idx = 0; rf_idx < 2; ++rf_idx) { + if (candidate->ref_frame[rf_idx] > INTRA_FRAME) { + int_mv this_mv = candidate->mv[rf_idx]; + if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] != + cm->ref_frame_sign_bias[ref_frame]) { + this_mv.as_mv.row = -this_mv.as_mv.row; + this_mv.as_mv.col = -this_mv.as_mv.col; + } + int stack_idx; + for (stack_idx = 0; stack_idx < *refmv_count; ++stack_idx) { + const int_mv stack_mv = ref_mv_stack[stack_idx].this_mv; + if (this_mv.as_int == stack_mv.as_int) break; + } + + if (stack_idx == *refmv_count) { + ref_mv_stack[stack_idx].this_mv = this_mv; + + // TODO(jingning): Set an arbitrary small number here. The weight + // doesn't matter as long as it is properly initialized. + ref_mv_weight[stack_idx] = 2; + ++(*refmv_count); + } + } + } +} + +static AOM_INLINE void setup_ref_mv_list( + const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame, + uint8_t *const refmv_count, + CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[MAX_REF_MV_STACK_SIZE], + int_mv mv_ref_list[MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates, + int mi_row, int mi_col, int16_t *mode_context) { + const int bs = AOMMAX(xd->width, xd->height); + const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs); + MV_REFERENCE_FRAME rf[2]; + + const TileInfo *const tile = &xd->tile; + int max_row_offset = 0, max_col_offset = 0; + const int row_adj = (xd->height < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01); + const int col_adj = (xd->width < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01); + int processed_rows = 0; + int processed_cols = 0; + + av1_set_ref_frame(rf, ref_frame); + mode_context[ref_frame] = 0; + *refmv_count = 0; + + // Find valid maximum row/col offset. + if (xd->up_available) { + max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj; + + if (xd->height < mi_size_high[BLOCK_8X8]) + max_row_offset = -(2 << 1) + row_adj; + + max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset); + } + + if (xd->left_available) { + max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj; + + if (xd->width < mi_size_wide[BLOCK_8X8]) + max_col_offset = -(2 << 1) + col_adj; + + max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset); + } + + uint8_t col_match_count = 0; + uint8_t row_match_count = 0; + uint8_t newmv_count = 0; + + // Scan the first above row mode info. row_offset = -1; + if (abs(max_row_offset) >= 1) + scan_row_mbmi(cm, xd, mi_col, rf, -1, ref_mv_stack, ref_mv_weight, + refmv_count, &row_match_count, &newmv_count, gm_mv_candidates, + max_row_offset, &processed_rows); + // Scan the first left column mode info. 
col_offset = -1; + if (abs(max_col_offset) >= 1) + scan_col_mbmi(cm, xd, mi_row, rf, -1, ref_mv_stack, ref_mv_weight, + refmv_count, &col_match_count, &newmv_count, gm_mv_candidates, + max_col_offset, &processed_cols); + // Check top-right boundary + if (has_tr) + scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->width, ref_mv_stack, + ref_mv_weight, &row_match_count, &newmv_count, + gm_mv_candidates, refmv_count); + + const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0); + const uint8_t nearest_refmv_count = *refmv_count; + + // TODO(yunqing): for comp_search, do it for all 3 cases. + for (int idx = 0; idx < nearest_refmv_count; ++idx) + ref_mv_weight[idx] += REF_CAT_LEVEL; + + if (cm->features.allow_ref_frame_mvs) { + int is_available = 0; + const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->height); + const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->width); + const int blk_row_end = AOMMIN(xd->height, mi_size_high[BLOCK_64X64]); + const int blk_col_end = AOMMIN(xd->width, mi_size_wide[BLOCK_64X64]); + + const int tpl_sample_pos[3][2] = { + { voffset, -2 }, + { voffset, hoffset }, + { voffset - 2, hoffset }, + }; + const int allow_extension = (xd->height >= mi_size_high[BLOCK_8X8]) && + (xd->height < mi_size_high[BLOCK_64X64]) && + (xd->width >= mi_size_wide[BLOCK_8X8]) && + (xd->width < mi_size_wide[BLOCK_64X64]); + + const int step_h = (xd->height >= mi_size_high[BLOCK_64X64]) + ? mi_size_high[BLOCK_16X16] + : mi_size_high[BLOCK_8X8]; + const int step_w = (xd->width >= mi_size_wide[BLOCK_64X64]) + ? mi_size_wide[BLOCK_16X16] + : mi_size_wide[BLOCK_8X8]; + + for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) { + for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) { + int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, + blk_col, gm_mv_candidates, refmv_count, + ref_mv_stack, ref_mv_weight, mode_context); + if (blk_row == 0 && blk_col == 0) is_available = ret; + } + } + + if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET); + + for (int i = 0; i < 3 && allow_extension; ++i) { + const int blk_row = tpl_sample_pos[i][0]; + const int blk_col = tpl_sample_pos[i][1]; + + if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue; + add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col, + gm_mv_candidates, refmv_count, ref_mv_stack, ref_mv_weight, + mode_context); + } + } + + uint8_t dummy_newmv_count = 0; + + // Scan the second outer area. 
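+  // Note: the top-left neighbor (row -1, col -1) is checked first; the loop
+  // that follows then walks the outer rows/columns at offsets nominally -3
+  // and -5 (idx = 2 .. MVREF_ROW_COLS, adjusted for sub-8x8 blocks), skipping
+  // any ranges already credited through processed_rows/processed_cols.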
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack, ref_mv_weight, + &row_match_count, &dummy_newmv_count, gm_mv_candidates, + refmv_count); + + for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) { + const int row_offset = -(idx << 1) + 1 + row_adj; + const int col_offset = -(idx << 1) + 1 + col_adj; + + if (abs(row_offset) <= abs(max_row_offset) && + abs(row_offset) > processed_rows) + scan_row_mbmi(cm, xd, mi_col, rf, row_offset, ref_mv_stack, ref_mv_weight, + refmv_count, &row_match_count, &dummy_newmv_count, + gm_mv_candidates, max_row_offset, &processed_rows); + + if (abs(col_offset) <= abs(max_col_offset) && + abs(col_offset) > processed_cols) + scan_col_mbmi(cm, xd, mi_row, rf, col_offset, ref_mv_stack, ref_mv_weight, + refmv_count, &col_match_count, &dummy_newmv_count, + gm_mv_candidates, max_col_offset, &processed_cols); + } + + const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0); + + switch (nearest_match) { + case 0: + if (ref_match_count >= 1) mode_context[ref_frame] |= 1; + if (ref_match_count == 1) + mode_context[ref_frame] |= (1 << REFMV_OFFSET); + else if (ref_match_count >= 2) + mode_context[ref_frame] |= (2 << REFMV_OFFSET); + break; + case 1: + mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3; + if (ref_match_count == 1) + mode_context[ref_frame] |= (3 << REFMV_OFFSET); + else if (ref_match_count >= 2) + mode_context[ref_frame] |= (4 << REFMV_OFFSET); + break; + case 2: + default: + if (newmv_count >= 1) + mode_context[ref_frame] |= 4; + else + mode_context[ref_frame] |= 5; + + mode_context[ref_frame] |= (5 << REFMV_OFFSET); + break; + } + + // Rank the likelihood and assign nearest and near mvs. + int len = nearest_refmv_count; + while (len > 0) { + int nr_len = 0; + for (int idx = 1; idx < len; ++idx) { + if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { + const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; + const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; + ref_mv_stack[idx - 1] = ref_mv_stack[idx]; + ref_mv_stack[idx] = tmp_mv; + ref_mv_weight[idx - 1] = ref_mv_weight[idx]; + ref_mv_weight[idx] = tmp_ref_mv_weight; + nr_len = idx; + } + } + len = nr_len; + } + + len = *refmv_count; + while (len > nearest_refmv_count) { + int nr_len = nearest_refmv_count; + for (int idx = nearest_refmv_count + 1; idx < len; ++idx) { + if (ref_mv_weight[idx - 1] < ref_mv_weight[idx]) { + const CANDIDATE_MV tmp_mv = ref_mv_stack[idx - 1]; + const uint16_t tmp_ref_mv_weight = ref_mv_weight[idx - 1]; + ref_mv_stack[idx - 1] = ref_mv_stack[idx]; + ref_mv_stack[idx] = tmp_mv; + ref_mv_weight[idx - 1] = ref_mv_weight[idx]; + ref_mv_weight[idx] = tmp_ref_mv_weight; + nr_len = idx; + } + } + len = nr_len; + } + + int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->width); + mi_width = AOMMIN(mi_width, cm->mi_params.mi_cols - mi_col); + int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->height); + mi_height = AOMMIN(mi_height, cm->mi_params.mi_rows - mi_row); + const int mi_size = AOMMIN(mi_width, mi_height); + if (rf[1] > NONE_FRAME) { + // TODO(jingning, yunqing): Refactor and consolidate the compound and + // single reference frame modes. Reduce unnecessary redundancy. 
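+    // When fewer than MAX_MV_REF_CANDIDATES (2) candidates were found, the
+    // stack below is padded in preference order: MVs whose references match
+    // rf[] exactly (ref_id), then sign-adjusted MVs from other inter
+    // references (ref_diff), and finally the global-motion candidates.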
+ if (*refmv_count < MAX_MV_REF_CANDIDATES) { + int_mv ref_id[2][2], ref_diff[2][2]; + int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 }; + + for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) { + const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; + process_compound_ref_mv_candidate( + candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); + idx += mi_size_wide[candidate->sb_type]; + } + + for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) { + const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; + process_compound_ref_mv_candidate( + candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count); + idx += mi_size_high[candidate->sb_type]; + } + + // Build up the compound mv predictor + int_mv comp_list[MAX_MV_REF_CANDIDATES][2]; + + for (int idx = 0; idx < 2; ++idx) { + int comp_idx = 0; + for (int list_idx = 0; + list_idx < ref_id_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; + ++list_idx, ++comp_idx) + comp_list[comp_idx][idx] = ref_id[idx][list_idx]; + for (int list_idx = 0; + list_idx < ref_diff_count[idx] && comp_idx < MAX_MV_REF_CANDIDATES; + ++list_idx, ++comp_idx) + comp_list[comp_idx][idx] = ref_diff[idx][list_idx]; + for (; comp_idx < MAX_MV_REF_CANDIDATES; ++comp_idx) + comp_list[comp_idx][idx] = gm_mv_candidates[idx]; + } + + if (*refmv_count) { + assert(*refmv_count == 1); + if (comp_list[0][0].as_int == ref_mv_stack[0].this_mv.as_int && + comp_list[0][1].as_int == ref_mv_stack[0].comp_mv.as_int) { + ref_mv_stack[*refmv_count].this_mv = comp_list[1][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[1][1]; + } else { + ref_mv_stack[*refmv_count].this_mv = comp_list[0][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[0][1]; + } + ref_mv_weight[*refmv_count] = 2; + ++*refmv_count; + } else { + for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) { + ref_mv_stack[*refmv_count].this_mv = comp_list[idx][0]; + ref_mv_stack[*refmv_count].comp_mv = comp_list[idx][1]; + ref_mv_weight[*refmv_count] = 2; + ++*refmv_count; + } + } + } + + assert(*refmv_count >= 2); + + for (int idx = 0; idx < *refmv_count; ++idx) { + clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + } + } else { + // Handle single reference frame extension + for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size && + *refmv_count < MAX_MV_REF_CANDIDATES;) { + const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx]; + process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, + ref_mv_stack, ref_mv_weight); + idx += mi_size_wide[candidate->sb_type]; + } + + for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size && + *refmv_count < MAX_MV_REF_CANDIDATES;) { + const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1]; + process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count, + ref_mv_stack, ref_mv_weight); + idx += mi_size_high[candidate->sb_type]; + } + + for (int idx = 0; idx < *refmv_count; ++idx) { + clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->width << MI_SIZE_LOG2, + xd->height << MI_SIZE_LOG2, xd); + } + + if (mv_ref_list != NULL) { + for (int idx = *refmv_count; idx < MAX_MV_REF_CANDIDATES; ++idx) + mv_ref_list[idx].as_int = gm_mv_candidates[0].as_int; + + for (int idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count); + ++idx) { + mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int; + 
} + } + } +} + +void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, + MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, + uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], + CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE], + uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE], + int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], + int_mv *global_mvs, int16_t *mode_context) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int_mv gm_mv[2]; + + if (ref_frame == INTRA_FRAME) { + gm_mv[0].as_int = gm_mv[1].as_int = 0; + if (global_mvs != NULL) { + global_mvs[ref_frame].as_int = INVALID_MV; + } + } else { + const BLOCK_SIZE bsize = mi->sb_type; + const int allow_high_precision_mv = cm->features.allow_high_precision_mv; + const int force_integer_mv = cm->features.cur_frame_force_integer_mv; + if (ref_frame < REF_FRAMES) { + gm_mv[0] = gm_get_motion_vector(&cm->global_motion[ref_frame], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + gm_mv[1].as_int = 0; + if (global_mvs != NULL) global_mvs[ref_frame] = gm_mv[0]; + } else { + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_frame); + gm_mv[0] = gm_get_motion_vector(&cm->global_motion[rf[0]], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + gm_mv[1] = gm_get_motion_vector(&cm->global_motion[rf[1]], + allow_high_precision_mv, bsize, mi_col, + mi_row, force_integer_mv); + } + } + + setup_ref_mv_list(cm, xd, ref_frame, &ref_mv_count[ref_frame], + ref_mv_stack[ref_frame], ref_mv_weight[ref_frame], + mv_ref_list ? mv_ref_list[ref_frame] : NULL, gm_mv, mi_row, + mi_col, mode_context); +} + +void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv, + int_mv *near_mv, int is_integer) { + int i; + // Make sure all the candidates are properly clamped etc + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + lower_mv_precision(&mvlist[i].as_mv, allow_hp, is_integer); + } + *nearest_mv = mvlist[0]; + *near_mv = mvlist[1]; +} + +void av1_setup_frame_buf_refs(AV1_COMMON *cm) { + cm->cur_frame->order_hint = cm->current_frame.order_hint; + cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint; + + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint; + cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME] = + buf->display_order_hint; + } + } +} + +void av1_setup_frame_sign_bias(AV1_COMMON *cm) { + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (cm->seq_params.order_hint_info.enable_order_hint && buf != NULL) { + const int ref_order_hint = buf->order_hint; + cm->ref_frame_sign_bias[ref_frame] = + (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint, + (int)cm->current_frame.order_hint) <= 0) + ? 0 + : 1; + } else { + cm->ref_frame_sign_bias[ref_frame] = 0; + } + } +} + +#define MAX_OFFSET_WIDTH 64 +#define MAX_OFFSET_HEIGHT 0 + +static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row, + int blk_col, MV mv, int sign_bias) { + const int base_blk_row = (blk_row >> 3) << 3; + const int base_blk_col = (blk_col >> 3) << 3; + + const int row_offset = (mv.row >= 0) ? (mv.row >> (4 + MI_SIZE_LOG2)) + : -((-mv.row) >> (4 + MI_SIZE_LOG2)); + + const int col_offset = (mv.col >= 0) ? 
(mv.col >> (4 + MI_SIZE_LOG2))
+                         : -((-mv.col) >> (4 + MI_SIZE_LOG2));
+
+  const int row =
+      (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
+  const int col =
+      (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
+
+  if (row < 0 || row >= (cm->mi_params.mi_rows >> 1) || col < 0 ||
+      col >= (cm->mi_params.mi_cols >> 1))
+    return 0;
+
+  if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) ||
+      row >= base_blk_row + 8 + (MAX_OFFSET_HEIGHT >> 3) ||
+      col < base_blk_col - (MAX_OFFSET_WIDTH >> 3) ||
+      col >= base_blk_col + 8 + (MAX_OFFSET_WIDTH >> 3))
+    return 0;
+
+  *mi_r = row;
+  *mi_c = col;
+
+  return 1;
+}
+
+// Note: motion_field_projection finds the motion vectors of the current
+// frame's reference frame and projects them onto the current frame. To make
+// this clear, call the current frame's reference frame the start frame, call
+// the start frame's own reference frames simply reference frames, and call
+// ref_offset the frame distance between the start frame and its reference
+// frames.
+static int motion_field_projection(AV1_COMMON *cm,
+                                   MV_REFERENCE_FRAME start_frame, int dir) {
+  TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
+  int ref_offset[REF_FRAMES] = { 0 };
+
+  const RefCntBuffer *const start_frame_buf =
+      get_ref_frame_buf(cm, start_frame);
+  if (start_frame_buf == NULL) return 0;
+
+  if (start_frame_buf->frame_type == KEY_FRAME ||
+      start_frame_buf->frame_type == INTRA_ONLY_FRAME)
+    return 0;
+
+  if (start_frame_buf->mi_rows != cm->mi_params.mi_rows ||
+      start_frame_buf->mi_cols != cm->mi_params.mi_cols)
+    return 0;
+
+  const int start_frame_order_hint = start_frame_buf->order_hint;
+  const unsigned int *const ref_order_hints =
+      &start_frame_buf->ref_order_hints[0];
+  const int cur_order_hint = cm->cur_frame->order_hint;
+  int start_to_current_frame_offset = get_relative_dist(
+      &cm->seq_params.order_hint_info, start_frame_order_hint, cur_order_hint);
+
+  for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
+    ref_offset[rf] = get_relative_dist(&cm->seq_params.order_hint_info,
+                                       start_frame_order_hint,
+                                       ref_order_hints[rf - LAST_FRAME]);
+  }
+
+  if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
+
+  MV_REF *mv_ref_base = start_frame_buf->mvs;
+  const int mvs_rows = (cm->mi_params.mi_rows + 1) >> 1;
+  const int mvs_cols = (cm->mi_params.mi_cols + 1) >> 1;
+
+  for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) {
+    for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) {
+      MV_REF *mv_ref = &mv_ref_base[blk_row * mvs_cols + blk_col];
+      MV fwd_mv = mv_ref->mv.as_mv;
+
+      if (mv_ref->ref_frame > INTRA_FRAME) {
+        int_mv this_mv;
+        int mi_r, mi_c;
+        const int ref_frame_offset = ref_offset[mv_ref->ref_frame];
+
+        int pos_valid =
+            abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
+            ref_frame_offset > 0 &&
+            abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE;
+
+        if (pos_valid) {
+          get_mv_projection(&this_mv.as_mv, fwd_mv,
+                            start_to_current_frame_offset, ref_frame_offset);
+          pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
+                                         this_mv.as_mv, dir >> 1);
+        }
+
+        if (pos_valid) {
+          const int mi_offset = mi_r * (cm->mi_params.mi_stride >> 1) + mi_c;
+
+          tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
+          tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
+          tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset;
+        }
+      }
+    }
+  }
+
+  return 1;
+}
+
+void av1_setup_motion_field(AV1_COMMON *cm) {
+  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+
+  memset(cm->ref_frame_side, 0,
sizeof(cm->ref_frame_side)); + if (!order_hint_info->enable_order_hint) return; + + TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; + int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) * + (cm->mi_params.mi_stride >> 1); + for (int idx = 0; idx < size; ++idx) { + tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV; + tpl_mvs_base[idx].ref_frame_offset = 0; + } + + const int cur_order_hint = cm->cur_frame->order_hint; + + const RefCntBuffer *ref_buf[INTER_REFS_PER_FRAME]; + int ref_order_hint[INTER_REFS_PER_FRAME]; + + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + const int ref_idx = ref_frame - LAST_FRAME; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + int order_hint = 0; + + if (buf != NULL) order_hint = buf->order_hint; + + ref_buf[ref_idx] = buf; + ref_order_hint[ref_idx] = order_hint; + + if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0) + cm->ref_frame_side[ref_frame] = 1; + else if (order_hint == cur_order_hint) + cm->ref_frame_side[ref_frame] = -1; + } + + int ref_stamp = MFMV_STACK_SIZE - 1; + + if (ref_buf[LAST_FRAME - LAST_FRAME] != NULL) { + const int alt_of_lst_order_hint = + ref_buf[LAST_FRAME - LAST_FRAME] + ->ref_order_hints[ALTREF_FRAME - LAST_FRAME]; + + const int is_lst_overlay = + (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]); + if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2); + --ref_stamp; + } + + if (get_relative_dist(order_hint_info, + ref_order_hint[BWDREF_FRAME - LAST_FRAME], + cur_order_hint) > 0) { + if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp; + } + + if (get_relative_dist(order_hint_info, + ref_order_hint[ALTREF2_FRAME - LAST_FRAME], + cur_order_hint) > 0) { + if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp; + } + + if (get_relative_dist(order_hint_info, + ref_order_hint[ALTREF_FRAME - LAST_FRAME], + cur_order_hint) > 0 && + ref_stamp >= 0) + if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp; + + if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2); +} + +static INLINE void record_samples(const MB_MODE_INFO *mbmi, int *pts, + int *pts_inref, int row_offset, int sign_r, + int col_offset, int sign_c) { + int bw = block_size_wide[mbmi->sb_type]; + int bh = block_size_high[mbmi->sb_type]; + int x = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1; + int y = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1; + + pts[0] = GET_MV_SUBPEL(x); + pts[1] = GET_MV_SUBPEL(y); + pts_inref[0] = GET_MV_SUBPEL(x) + mbmi->mv[0].as_mv.col; + pts_inref[1] = GET_MV_SUBPEL(y) + mbmi->mv[0].as_mv.row; +} + +// Select samples according to the motion vector difference. +uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len, + BLOCK_SIZE bsize) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int thresh = clamp(AOMMAX(bw, bh), 16, 112); + int pts_mvd[SAMPLES_ARRAY_SIZE] = { 0 }; + int i, j, k, l = len; + uint8_t ret = 0; + assert(len <= LEAST_SQUARES_SAMPLES_MAX); + + // Obtain the motion vector difference. + for (i = 0; i < len; ++i) { + pts_mvd[i] = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) + + abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row); + + if (pts_mvd[i] > thresh) + pts_mvd[i] = -1; + else + ret++; + } + + // Keep at least 1 sample. 
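+  // (If every sample exceeded the threshold, fall back to keeping just the
+  // first sample; the sample arrays are left untouched in that case.)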
+  if (!ret) return 1;
+
+  i = 0;
+  j = l - 1;
+  for (k = 0; k < l - ret; k++) {
+    while (pts_mvd[i] != -1) i++;
+    while (pts_mvd[j] == -1) j--;
+    assert(i != j);
+    if (i > j) break;
+
+    // Replace the discarded samples.
+    pts_mvd[i] = pts_mvd[j];
+    pts[2 * i] = pts[2 * j];
+    pts[2 * i + 1] = pts[2 * j + 1];
+    pts_inref[2 * i] = pts_inref[2 * j];
+    pts_inref[2 * i + 1] = pts_inref[2 * j + 1];
+    i++;
+    j--;
+  }
+
+  return ret;
+}
+
+// Note: samples returned are at 1/8-pel precision.
+// Samples are the neighbor blocks' center-point coordinates relative to the
+// top-left pixel of the current block.
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+                        int *pts_inref) {
+  const MB_MODE_INFO *const mbmi0 = xd->mi[0];
+  const int ref_frame = mbmi0->ref_frame[0];
+  const int up_available = xd->up_available;
+  const int left_available = xd->left_available;
+  int i, mi_step;
+  uint8_t np = 0;
+  int do_tl = 1;
+  int do_tr = 1;
+  const int mi_stride = xd->mi_stride;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  // scan the nearest above rows
+  if (up_available) {
+    const int mi_row_offset = -1;
+    const MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * mi_stride];
+    uint8_t superblock_width = mi_size_wide[mbmi->sb_type];
+
+    if (xd->width <= superblock_width) {
+      // Handle "current block width <= above block width" case.
+      const int col_offset = -mi_col % superblock_width;
+
+      if (col_offset < 0) do_tl = 0;
+      if (col_offset + superblock_width > xd->width) do_tr = 0;
+
+      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+        record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
+        pts += 2;
+        pts_inref += 2;
+        np++;
+        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+      }
+    } else {
+      // Handle "current block width > above block width" case.
+      for (i = 0; i < AOMMIN(xd->width, cm->mi_params.mi_cols - mi_col);
+           i += mi_step) {
+        mbmi = xd->mi[i + mi_row_offset * mi_stride];
+        superblock_width = mi_size_wide[mbmi->sb_type];
+        mi_step = AOMMIN(xd->width, superblock_width);
+
+        if (mbmi->ref_frame[0] == ref_frame &&
+            mbmi->ref_frame[1] == NONE_FRAME) {
+          record_samples(mbmi, pts, pts_inref, 0, -1, i, 1);
+          pts += 2;
+          pts_inref += 2;
+          np++;
+          if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+        }
+      }
+    }
+  }
+  assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+  // scan the nearest left columns
+  if (left_available) {
+    const int mi_col_offset = -1;
+    const MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
+    uint8_t superblock_height = mi_size_high[mbmi->sb_type];
+
+    if (xd->height <= superblock_height) {
+      // Handle "current block height <= left block height" case.
+      const int row_offset = -mi_row % superblock_height;
+
+      if (row_offset < 0) do_tl = 0;
+
+      if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+        record_samples(mbmi, pts, pts_inref, row_offset, 1, 0, -1);
+        pts += 2;
+        pts_inref += 2;
+        np++;
+        if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+      }
+    } else {
+      // Handle "current block height > left block height" case.
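+      // Several left neighbors may span this block; visit each one in turn,
+      // stepping by that neighbor's height (this mirrors the above-row scan).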
+ for (i = 0; i < AOMMIN(xd->height, cm->mi_params.mi_rows - mi_row); + i += mi_step) { + mbmi = xd->mi[mi_col_offset + i * mi_stride]; + superblock_height = mi_size_high[mbmi->sb_type]; + mi_step = AOMMIN(xd->height, superblock_height); + + if (mbmi->ref_frame[0] == ref_frame && + mbmi->ref_frame[1] == NONE_FRAME) { + record_samples(mbmi, pts, pts_inref, i, 1, 0, -1); + pts += 2; + pts_inref += 2; + np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; + } + } + } + } + assert(np <= LEAST_SQUARES_SAMPLES_MAX); + + // Top-left block + if (do_tl && left_available && up_available) { + const int mi_row_offset = -1; + const int mi_col_offset = -1; + MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * mi_stride]; + + if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { + record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1); + pts += 2; + pts_inref += 2; + np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; + } + } + assert(np <= LEAST_SQUARES_SAMPLES_MAX); + + // Top-right block + if (do_tr && + has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->width, xd->height))) { + const POSITION trb_pos = { -1, xd->width }; + const TileInfo *const tile = &xd->tile; + if (is_inside(tile, mi_col, mi_row, &trb_pos)) { + const int mi_row_offset = -1; + const int mi_col_offset = xd->width; + const MB_MODE_INFO *mbmi = + xd->mi[mi_col_offset + mi_row_offset * mi_stride]; + + if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) { + record_samples(mbmi, pts, pts_inref, 0, -1, xd->width, 1); + np++; + if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX; + } + } + } + assert(np <= LEAST_SQUARES_SAMPLES_MAX); + + return np; +} + +void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { + const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; + SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; + + skip_mode_info->skip_mode_allowed = 0; + skip_mode_info->ref_frame_idx_0 = INVALID_IDX; + skip_mode_info->ref_frame_idx_1 = INVALID_IDX; + + if (!order_hint_info->enable_order_hint || frame_is_intra_only(cm) || + cm->current_frame.reference_mode == SINGLE_REFERENCE) + return; + + const int cur_order_hint = cm->current_frame.order_hint; + int ref_order_hints[2] = { -1, INT_MAX }; + int ref_idx[2] = { INVALID_IDX, INVALID_IDX }; + + // Identify the nearest forward and backward references. 
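+  // A reference whose order hint precedes the current frame's is a forward
+  // reference; one that follows it is a backward reference. For example, with
+  // cur_order_hint == 10 and references at order hints 8, 9 and 12, the pair
+  // selected below is (9, 12): the nearest reference on each side.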
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); + if (buf == NULL) continue; + + const int ref_order_hint = buf->order_hint; + if (get_relative_dist(order_hint_info, ref_order_hint, cur_order_hint) < + 0) { + // Forward reference + if (ref_order_hints[0] == -1 || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[0]) > 0) { + ref_order_hints[0] = ref_order_hint; + ref_idx[0] = i; + } + } else if (get_relative_dist(order_hint_info, ref_order_hint, + cur_order_hint) > 0) { + // Backward reference + if (ref_order_hints[1] == INT_MAX || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[1]) < 0) { + ref_order_hints[1] = ref_order_hint; + ref_idx[1] = i; + } + } + } + + if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) { + // == Bi-directional prediction == + skip_mode_info->skip_mode_allowed = 1; + skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); + skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); + } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) { + // == Forward prediction only == + // Identify the second nearest forward reference. + ref_order_hints[1] = -1; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); + if (buf == NULL) continue; + + const int ref_order_hint = buf->order_hint; + if ((ref_order_hints[0] != -1 && + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[0]) < 0) && + (ref_order_hints[1] == -1 || + get_relative_dist(order_hint_info, ref_order_hint, + ref_order_hints[1]) > 0)) { + // Second closest forward reference + ref_order_hints[1] = ref_order_hint; + ref_idx[1] = i; + } + } + if (ref_order_hints[1] != -1) { + skip_mode_info->skip_mode_allowed = 1; + skip_mode_info->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]); + skip_mode_info->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]); + } + } +} + +typedef struct { + int map_idx; // frame map index + RefCntBuffer *buf; // frame buffer + int sort_idx; // index based on the offset to be used for sorting +} REF_FRAME_INFO; + +// Compares the sort_idx fields. If they are equal, then compares the map_idx +// fields to break the tie. This ensures a stable sort. 
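+// (qsort() itself makes no ordering guarantee for equal keys, so without the
+// map_idx tie-break the final reference assignment could vary across libc
+// implementations.)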
+static int compare_ref_frame_info(const void *arg_a, const void *arg_b) { + const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a; + const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b; + + const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx; + if (sort_idx_diff != 0) return sort_idx_diff; + return info_a->map_idx - info_b->map_idx; +} + +static AOM_INLINE void set_ref_frame_info(int *remapped_ref_idx, int frame_idx, + REF_FRAME_INFO *ref_info) { + assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME); + + remapped_ref_idx[frame_idx] = ref_info->map_idx; +} + +void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, + int lst_map_idx, int gld_map_idx) { + int lst_frame_sort_idx = -1; + int gld_frame_sort_idx = -1; + + assert(cm->seq_params.order_hint_info.enable_order_hint); + assert(cm->seq_params.order_hint_info.order_hint_bits_minus_1 >= 0); + const int cur_order_hint = (int)cm->current_frame.order_hint; + const int cur_frame_sort_idx = + 1 << cm->seq_params.order_hint_info.order_hint_bits_minus_1; + + REF_FRAME_INFO ref_frame_info[REF_FRAMES]; + int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 }; + + for (int i = 0; i < REF_FRAMES; ++i) { + const int map_idx = i; + + ref_frame_info[i].map_idx = map_idx; + ref_frame_info[i].sort_idx = -1; + + RefCntBuffer *const buf = cm->ref_frame_map[map_idx]; + ref_frame_info[i].buf = buf; + + if (buf == NULL) continue; + // If this assertion fails, there is a reference leak. + assert(buf->ref_count > 0); + + const int offset = (int)buf->order_hint; + ref_frame_info[i].sort_idx = + (offset == -1) ? -1 + : cur_frame_sort_idx + + get_relative_dist(&cm->seq_params.order_hint_info, + offset, cur_order_hint); + assert(ref_frame_info[i].sort_idx >= -1); + + if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx; + if (map_idx == gld_map_idx) gld_frame_sort_idx = ref_frame_info[i].sort_idx; + } + + // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference + // frames. + if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests a look-ahead frame as LAST"); + } + if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests a look-ahead frame as GOLDEN"); + } + + // Sort ref frames based on their frame_offset values. + qsort(ref_frame_info, REF_FRAMES, sizeof(REF_FRAME_INFO), + compare_ref_frame_info); + + // Identify forward and backward reference frames. 
+ // Forward reference: offset < order_hint + // Backward reference: offset >= order_hint + int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1; + + for (int i = 0; i < REF_FRAMES; i++) { + if (ref_frame_info[i].sort_idx == -1) { + fwd_start_idx++; + continue; + } + + if (ref_frame_info[i].sort_idx >= cur_frame_sort_idx) { + fwd_end_idx = i - 1; + break; + } + } + + int bwd_start_idx = fwd_end_idx + 1; + int bwd_end_idx = REF_FRAMES - 1; + + // === Backward Reference Frames === + + // == ALTREF_FRAME == + if (bwd_start_idx <= bwd_end_idx) { + set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME, + &ref_frame_info[bwd_end_idx]); + ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1; + bwd_end_idx--; + } + + // == BWDREF_FRAME == + if (bwd_start_idx <= bwd_end_idx) { + set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME, + &ref_frame_info[bwd_start_idx]); + ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1; + bwd_start_idx++; + } + + // == ALTREF2_FRAME == + if (bwd_start_idx <= bwd_end_idx) { + set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME, + &ref_frame_info[bwd_start_idx]); + ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1; + } + + // === Forward Reference Frames === + + for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) { + // == LAST_FRAME == + if (ref_frame_info[i].map_idx == lst_map_idx) { + set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME, + &ref_frame_info[i]); + ref_flag_list[LAST_FRAME - LAST_FRAME] = 1; + } + + // == GOLDEN_FRAME == + if (ref_frame_info[i].map_idx == gld_map_idx) { + set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME, + &ref_frame_info[i]); + ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1; + } + } + + assert(ref_flag_list[LAST_FRAME - LAST_FRAME] == 1 && + ref_flag_list[GOLDEN_FRAME - LAST_FRAME] == 1); + + // == LAST2_FRAME == + // == LAST3_FRAME == + // == BWDREF_FRAME == + // == ALTREF2_FRAME == + // == ALTREF_FRAME == + + // Set up the reference frames in the anti-chronological order. + static const MV_REFERENCE_FRAME ref_frame_list[INTER_REFS_PER_FRAME - 2] = { + LAST2_FRAME, LAST3_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME + }; + + int ref_idx; + for (ref_idx = 0; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { + const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; + + if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; + + while (fwd_start_idx <= fwd_end_idx && + (ref_frame_info[fwd_end_idx].map_idx == lst_map_idx || + ref_frame_info[fwd_end_idx].map_idx == gld_map_idx)) { + fwd_end_idx--; + } + if (fwd_start_idx > fwd_end_idx) break; + + set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, + &ref_frame_info[fwd_end_idx]); + ref_flag_list[ref_frame - LAST_FRAME] = 1; + + fwd_end_idx--; + } + + // Assign all the remaining frame(s), if any, to the earliest reference + // frame. + for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { + const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; + if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; + set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, + &ref_frame_info[fwd_start_idx]); + ref_flag_list[ref_frame - LAST_FRAME] = 1; + } + + for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { + assert(ref_flag_list[i] == 1); + } +} diff --git a/libs/libaom/src/av1/common/mvref_common.h b/libs/libaom/src/av1/common/mvref_common.h new file mode 100644 index 000000000..05a0dbc04 --- /dev/null +++ b/libs/libaom/src/av1/common/mvref_common.h @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_ +#define AOM_AV1_COMMON_MVREF_COMMON_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MVREF_ROW_COLS 3 + +// Set the upper limit of the motion vector component magnitude. +// This would make a motion vector fit in 26 bits. Plus 3 bits for the +// reference frame index. A tuple of motion vector can hence be stored within +// 32 bit range for efficient load/store operations. +#define REFMVS_LIMIT ((1 << 12) - 1) + +typedef struct position { + int row; + int col; +} POSITION; + +// clamp_mv_ref +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units + +static INLINE int get_relative_dist(const OrderHintInfo *oh, int a, int b) { + if (!oh->enable_order_hint) return 0; + + const int bits = oh->order_hint_bits_minus_1 + 1; + + assert(bits >= 1); + assert(a >= 0 && a < (1 << bits)); + assert(b >= 0 && b < (1 << bits)); + + int diff = a - b; + const int m = 1 << (bits - 1); + diff = (diff & (m - 1)) - (diff & m); + return diff; +} + +static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) { + const SubpelMvLimits mv_limits = { + xd->mb_to_left_edge - GET_MV_SUBPEL(bw) - MV_BORDER, + xd->mb_to_right_edge + GET_MV_SUBPEL(bw) + MV_BORDER, + xd->mb_to_top_edge - GET_MV_SUBPEL(bh) - MV_BORDER, + xd->mb_to_bottom_edge + GET_MV_SUBPEL(bh) + MV_BORDER + }; + clamp_mv(mv, &mv_limits); +} + +static INLINE int_mv get_block_mv(const MB_MODE_INFO *candidate, int which_mv) { + return candidate->mv[which_mv]; +} + +// Checks that the given mi_row, mi_col and search point +// are inside the borders of the tile. +static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, + const POSITION *mi_pos) { + return !(mi_row + mi_pos->row < tile->mi_row_start || + mi_col + mi_pos->col < tile->mi_col_start || + mi_row + mi_pos->row >= tile->mi_row_end || + mi_col + mi_pos->col >= tile->mi_col_end); +} + +static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row, + int row_offset) { + return clamp(row_offset, tile->mi_row_start - mi_row, + tile->mi_row_end - mi_row - 1); +} + +static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col, + int col_offset) { + return clamp(col_offset, tile->mi_col_start - mi_col, + tile->mi_col_end - mi_col - 1); +} + +static INLINE void lower_mv_precision(MV *mv, int allow_hp, int is_integer) { + if (is_integer) { + integer_mv_precision(mv); + } else { + if (!allow_hp) { + if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1); + if (mv->col & 1) mv->col += (mv->col > 0 ? 
-1 : 1); + } + } +} + +static INLINE int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) { + // Single ref pred + if (rf[1] <= INTRA_FRAME) return -1; + + // Bi-directional comp ref pred + if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1; + + for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) { + if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx)) + return ref_idx; + } + return -1; +} + +static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) { + if (rf[1] > INTRA_FRAME) { + const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf); + if (uni_comp_ref_idx >= 0) { + assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx) < + MODE_CTX_REF_FRAMES); + return REF_FRAMES + FWD_REFS * BWD_REFS + uni_comp_ref_idx; + } else { + return REF_FRAMES + FWD_RF_OFFSET(rf[0]) + + BWD_RF_OFFSET(rf[1]) * FWD_REFS; + } + } + + return rf[0]; +} + +// clang-format off +static MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = { + { LAST_FRAME, BWDREF_FRAME }, { LAST2_FRAME, BWDREF_FRAME }, + { LAST3_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME }, + + { LAST_FRAME, ALTREF2_FRAME }, { LAST2_FRAME, ALTREF2_FRAME }, + { LAST3_FRAME, ALTREF2_FRAME }, { GOLDEN_FRAME, ALTREF2_FRAME }, + + { LAST_FRAME, ALTREF_FRAME }, { LAST2_FRAME, ALTREF_FRAME }, + { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }, + + { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME }, + { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME }, + + // NOTE: Following reference frame pairs are not supported to be explicitly + // signalled, but they are possibly chosen by the use of skip_mode, + // which may use the most recent one-sided reference frame pair. + { LAST2_FRAME, LAST3_FRAME }, { LAST2_FRAME, GOLDEN_FRAME }, + { LAST3_FRAME, GOLDEN_FRAME }, {BWDREF_FRAME, ALTREF2_FRAME}, + { ALTREF2_FRAME, ALTREF_FRAME } +}; +// clang-format on + +static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf, + MV_REFERENCE_FRAME ref_frame_type) { + if (ref_frame_type >= REF_FRAMES) { + rf[0] = ref_frame_map[ref_frame_type - REF_FRAMES][0]; + rf[1] = ref_frame_map[ref_frame_type - REF_FRAMES][1]; + } else { + assert(ref_frame_type > NONE_FRAME); + rf[0] = ref_frame_type; + rf[1] = NONE_FRAME; + } +} + +static uint16_t compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = { + { 0, 1, 1, 1, 1 }, + { 1, 2, 3, 4, 4 }, + { 4, 4, 5, 6, 7 }, +}; + +static INLINE int16_t av1_mode_context_analyzer( + const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) { + const int8_t ref_frame = av1_ref_frame_type(rf); + + if (rf[1] <= INTRA_FRAME) return mode_context[ref_frame]; + + const int16_t newmv_ctx = mode_context[ref_frame] & NEWMV_CTX_MASK; + const int16_t refmv_ctx = + (mode_context[ref_frame] >> REFMV_OFFSET) & REFMV_CTX_MASK; + + const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN( + newmv_ctx, COMP_NEWMV_CTXS - 1)]; + return comp_ctx; +} + +static INLINE uint8_t av1_drl_ctx(const uint16_t *ref_mv_weight, int ref_idx) { + if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] >= REF_CAT_LEVEL) + return 0; + + if (ref_mv_weight[ref_idx] >= REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL) + return 1; + + if (ref_mv_weight[ref_idx] < REF_CAT_LEVEL && + ref_mv_weight[ref_idx + 1] < REF_CAT_LEVEL) + return 2; + + return 0; +} + +void av1_setup_frame_buf_refs(AV1_COMMON *cm); +void av1_setup_frame_sign_bias(AV1_COMMON *cm); +void av1_setup_skip_mode_allowed(AV1_COMMON *cm); +void 
av1_setup_motion_field(AV1_COMMON *cm);
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+                        int lst_map_idx, int gld_map_idx);
+
+static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
+  av1_zero(xd->neighbors_ref_counts);
+
+  uint8_t *const ref_counts = xd->neighbors_ref_counts;
+
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Above neighbor
+  if (above_in_image && is_inter_block(above_mbmi)) {
+    ref_counts[above_mbmi->ref_frame[0]]++;
+    if (has_second_ref(above_mbmi)) {
+      ref_counts[above_mbmi->ref_frame[1]]++;
+    }
+  }
+
+  // Left neighbor
+  if (left_in_image && is_inter_block(left_mbmi)) {
+    ref_counts[left_mbmi->ref_frame[0]]++;
+    if (has_second_ref(left_mbmi)) {
+      ref_counts[left_mbmi->ref_frame[1]]++;
+    }
+  }
+}
+
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+                        const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+                        int x_mis, int y_mis);
+
+// The global_mvs output parameter points to an array of REF_FRAMES elements.
+// The caller may pass a null global_mvs if it does not need the global_mvs
+// output.
+void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+                      MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                      uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
+                      CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+                      uint16_t ref_mv_weight[][MAX_REF_MV_STACK_SIZE],
+                      int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
+                      int_mv *global_mvs, int16_t *mode_context);
+
+// Checks a list of motion vectors by SAD score, using a number of rows of
+// pixels above and a number of columns of pixels to the left, to select the
+// one with the best score to use as the reference motion vector.
+void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
+                           int_mv *near_mv, int is_integer);
+
+uint8_t av1_selectSamples(MV *mv, int *pts, int *pts_inref, int len,
+                          BLOCK_SIZE bsize);
+uint8_t av1_findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int *pts,
+                        int *pts_inref);
+
+#define INTRABC_DELAY_PIXELS 256  // Delay of 256 pixels
+#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
+
+static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
+                                   int mib_size, int mi_row) {
+  if (mi_row - mib_size < tile->mi_row_start) {
+    ref_dv->as_fullmv.row = 0;
+    ref_dv->as_fullmv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
+  } else {
+    ref_dv->as_fullmv.row = -MI_SIZE * mib_size;
+    ref_dv->as_fullmv.col = 0;
+  }
+  convert_fullmv_to_mv(ref_dv);
+}
+
+static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,
+                                  const MACROBLOCKD *xd, int mi_row, int mi_col,
+                                  BLOCK_SIZE bsize, int mib_size_log2) {
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int SCALE_PX_TO_MV = 8;
+  // Disallow subpixel for now
+  // SUBPEL_MASK is not the correct scale
+  if (((dv.row & (SCALE_PX_TO_MV - 1)) || (dv.col & (SCALE_PX_TO_MV - 1))))
+    return 0;
+
+  const TileInfo *const tile = &xd->tile;
+  // Is the source top-left inside the current tile?
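+  // (All edge positions below are in 1/8-pel units: mi_row * MI_SIZE converts
+  // MI units to pixels, and SCALE_PX_TO_MV converts pixels to 1/8-pel.)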
+ const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row; + const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV; + if (src_top_edge < tile_top_edge) return 0; + const int src_left_edge = mi_col * MI_SIZE * SCALE_PX_TO_MV + dv.col; + const int tile_left_edge = tile->mi_col_start * MI_SIZE * SCALE_PX_TO_MV; + if (src_left_edge < tile_left_edge) return 0; + // Is the bottom right inside the current tile? + const int src_bottom_edge = (mi_row * MI_SIZE + bh) * SCALE_PX_TO_MV + dv.row; + const int tile_bottom_edge = tile->mi_row_end * MI_SIZE * SCALE_PX_TO_MV; + if (src_bottom_edge > tile_bottom_edge) return 0; + const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col; + const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV; + if (src_right_edge > tile_right_edge) return 0; + + // Special case for sub 8x8 chroma cases, to prevent referring to chroma + // pixels outside current tile. + if (xd->is_chroma_ref && av1_num_planes(cm) > 1) { + const struct macroblockd_plane *const pd = &xd->plane[1]; + if (bw < 8 && pd->subsampling_x) + if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0; + if (bh < 8 && pd->subsampling_y) + if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0; + } + + // Is the bottom right within an already coded SB? Also consider additional + // constraints to facilitate HW decoder. + const int max_mib_size = 1 << mib_size_log2; + const int active_sb_row = mi_row >> mib_size_log2; + const int active_sb64_col = (mi_col * MI_SIZE) >> 6; + const int sb_size = max_mib_size * MI_SIZE; + const int src_sb_row = ((src_bottom_edge >> 3) - 1) / sb_size; + const int src_sb64_col = ((src_right_edge >> 3) - 1) >> 6; + const int total_sb64_per_row = + ((tile->mi_col_end - tile->mi_col_start - 1) >> 4) + 1; + const int active_sb64 = active_sb_row * total_sb64_per_row + active_sb64_col; + const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col; + if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0; + + // Wavefront constraint: use only top left area of frame for reference. + const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64); + const int wf_offset = gradient * (active_sb_row - src_sb_row); + if (src_sb_row > active_sb_row || + src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset) + return 0; + + return 1; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_MVREF_COMMON_H_ diff --git a/libs/libaom/src/av1/common/obmc.h b/libs/libaom/src/av1/common/obmc.h new file mode 100644 index 000000000..cc97b6bb1 --- /dev/null +++ b/libs/libaom/src/av1/common/obmc.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_OBMC_H_ +#define AOM_AV1_COMMON_OBMC_H_ + +typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *nb_mi, + void *fun_ctxt, const int num_planes); + +static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm, + MACROBLOCKD *xd, int nb_max, + overlappable_nb_visitor_t fun, + void *fun_ctxt) { + if (!xd->up_available) return; + + const int num_planes = av1_num_planes(cm); + int nb_count = 0; + const int mi_col = xd->mi_col; + // prev_row_mi points into the mi array, starting at the beginning of the + // previous row. + MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; + const int end_col = AOMMIN(mi_col + xd->width, cm->mi_params.mi_cols); + uint8_t mi_step; + for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max; + above_mi_col += mi_step) { + MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col; + mi_step = + AOMMIN(mi_size_wide[above_mi[0]->sb_type], mi_size_wide[BLOCK_64X64]); + // If we're considering a block with width 4, it should be treated as + // half of a pair of blocks with chroma information in the second. Move + // above_mi_col back to the start of the pair if needed, set above_mbmi + // to point at the block with chroma information, and set mi_step to 2 to + // step over the entire pair at the end of the iteration. + if (mi_step == 1) { + above_mi_col &= ~1; + above_mi = prev_row_mi + above_mi_col + 1; + mi_step = 2; + } + if (is_neighbor_overlappable(*above_mi)) { + ++nb_count; + fun(xd, 0, above_mi_col - mi_col, AOMMIN(xd->width, mi_step), 0, + *above_mi, fun_ctxt, num_planes); + } + } +} + +static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm, + MACROBLOCKD *xd, int nb_max, + overlappable_nb_visitor_t fun, + void *fun_ctxt) { + if (!xd->left_available) return; + + const int num_planes = av1_num_planes(cm); + int nb_count = 0; + // prev_col_mi points into the mi array, starting at the top of the + // previous column + const int mi_row = xd->mi_row; + MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; + const int end_row = AOMMIN(mi_row + xd->height, cm->mi_params.mi_rows); + uint8_t mi_step; + for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max; + left_mi_row += mi_step) { + MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride; + mi_step = + AOMMIN(mi_size_high[left_mi[0]->sb_type], mi_size_high[BLOCK_64X64]); + if (mi_step == 1) { + left_mi_row &= ~1; + left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride; + mi_step = 2; + } + if (is_neighbor_overlappable(*left_mi)) { + ++nb_count; + fun(xd, left_mi_row - mi_row, 0, AOMMIN(xd->height, mi_step), 1, *left_mi, + fun_ctxt, num_planes); + } + } +} + +#endif // AOM_AV1_COMMON_OBMC_H_ diff --git a/libs/libaom/src/av1/common/obu_util.c b/libs/libaom/src/av1/common/obu_util.c new file mode 100644 index 000000000..7d2694b89 --- /dev/null +++ b/libs/libaom/src/av1/common/obu_util.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/common/obu_util.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+
+// Returns 1 when OBU type is valid, and 0 otherwise.
+static int valid_obu_type(int obu_type) {
+  int valid_type = 0;
+  switch (obu_type) {
+    case OBU_SEQUENCE_HEADER:
+    case OBU_TEMPORAL_DELIMITER:
+    case OBU_FRAME_HEADER:
+    case OBU_TILE_GROUP:
+    case OBU_METADATA:
+    case OBU_FRAME:
+    case OBU_REDUNDANT_FRAME_HEADER:
+    case OBU_TILE_LIST:
+    case OBU_PADDING: valid_type = 1; break;
+    default: break;
+  }
+  return valid_type;
+}
+
+static aom_codec_err_t read_obu_size(const uint8_t *data,
+                                     size_t bytes_available,
+                                     size_t *const obu_size,
+                                     size_t *const length_field_size) {
+  uint64_t u_obu_size = 0;
+  if (aom_uleb_decode(data, bytes_available, &u_obu_size, length_field_size) !=
+      0) {
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  if (u_obu_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
+  *obu_size = (size_t)u_obu_size;
+  return AOM_CODEC_OK;
+}
+
+// Parses OBU header and stores values in 'header'.
+static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
+                                       int is_annexb, ObuHeader *header) {
+  if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
+
+  const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
+  if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;
+
+  header->size = 1;
+
+  if (aom_rb_read_bit(rb) != 0) {
+    // Forbidden bit. Must not be set.
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);
+
+  if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
+
+  header->has_extension = aom_rb_read_bit(rb);
+  header->has_size_field = aom_rb_read_bit(rb);
+
+  if (!header->has_size_field && !is_annexb) {
+    // Section 5 OBU streams must have the obu_size field set.
+    return AOM_CODEC_UNSUP_BITSTREAM;
+  }
+
+  if (aom_rb_read_bit(rb) != 0) {
+    // obu_reserved_1bit must be set to 0.
+    return AOM_CODEC_CORRUPT_FRAME;
+  }
+
+  if (header->has_extension) {
+    if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;
+
+    header->size += 1;
+    header->temporal_layer_id = aom_rb_read_literal(rb, 3);
+    header->spatial_layer_id = aom_rb_read_literal(rb, 2);
+    if (aom_rb_read_literal(rb, 3) != 0) {
+      // extension_header_reserved_3bits must be set to 0.
+      return AOM_CODEC_CORRUPT_FRAME;
+    }
+  }
+
+  return AOM_CODEC_OK;
+}
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+                                    size_t *consumed, ObuHeader *header,
+                                    int is_annexb) {
+  if (buffer_length < 1 || !consumed || !header) return AOM_CODEC_INVALID_PARAM;
+
+  // TODO(tomfinegan): Set the error handler here and throughout this file, and
+  // confirm parsing work done via aom_read_bit_buffer is successful.
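+  // For reference, the wire layout consumed by read_obu_header() above is
+  // (restating the parsing code, not adding to it):
+  //   obu_forbidden_bit   f(1)  -- must be 0
+  //   obu_type            f(4)
+  //   obu_extension_flag  f(1)
+  //   obu_has_size_field  f(1)
+  //   obu_reserved_1bit   f(1)  -- must be 0
+  // followed, when obu_extension_flag is set, by one extension byte:
+  //   temporal_id f(3), spatial_id f(2), reserved f(3) -- reserved must be 0.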
+ struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL, + NULL }; + aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header); + if (parse_result == AOM_CODEC_OK) *consumed = header->size; + return parse_result; +} + +aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, + size_t bytes_available, + int is_annexb, + ObuHeader *obu_header, + size_t *const payload_size, + size_t *const bytes_read) { + size_t length_field_size_obu = 0; + size_t length_field_size_payload = 0; + size_t obu_size = 0; + aom_codec_err_t status; + + if (is_annexb) { + // Size field comes before the OBU header, and includes the OBU header + status = + read_obu_size(data, bytes_available, &obu_size, &length_field_size_obu); + + if (status != AOM_CODEC_OK) return status; + } + + struct aom_read_bit_buffer rb = { data + length_field_size_obu, + data + bytes_available, 0, NULL, NULL }; + + status = read_obu_header(&rb, is_annexb, obu_header); + if (status != AOM_CODEC_OK) return status; + + if (!obu_header->has_size_field) { + assert(is_annexb); + // Derive the payload size from the data we've already read + if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME; + + *payload_size = obu_size - obu_header->size; + } else { + // Size field comes after the OBU header, and is just the payload size + status = read_obu_size( + data + length_field_size_obu + obu_header->size, + bytes_available - length_field_size_obu - obu_header->size, + payload_size, &length_field_size_payload); + if (status != AOM_CODEC_OK) return status; + } + + *bytes_read = + length_field_size_obu + obu_header->size + length_field_size_payload; + return AOM_CODEC_OK; +} diff --git a/libs/libaom/src/av1/common/obu_util.h b/libs/libaom/src/av1/common/obu_util.h new file mode 100644 index 000000000..7c56904c8 --- /dev/null +++ b/libs/libaom/src/av1/common/obu_util.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_OBU_UTIL_H_ +#define AOM_AV1_COMMON_OBU_UTIL_H_ + +#include "aom/aom_codec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + size_t size; // Size (1 or 2 bytes) of the OBU header (including the + // optional OBU extension header) in the bitstream. + OBU_TYPE type; + int has_size_field; + int has_extension; + // The following fields come from the OBU extension header and therefore are + // only used if has_extension is true. 
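+  // (read_obu_header() fills these from 3-bit and 2-bit fields, so the
+  // valid ranges are 0..7 and 0..3 respectively.)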
+ int temporal_layer_id; + int spatial_layer_id; +} ObuHeader; + +aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length, + size_t *consumed, ObuHeader *header, + int is_annexb); + +aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data, + size_t bytes_available, + int is_annexb, + ObuHeader *obu_header, + size_t *const payload_size, + size_t *const bytes_read); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_OBU_UTIL_H_ diff --git a/libs/libaom/src/av1/common/odintrin.c b/libs/libaom/src/av1/common/odintrin.c new file mode 100644 index 000000000..7584b2e52 --- /dev/null +++ b/libs/libaom/src/av1/common/odintrin.c @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* clang-format off */ + +#include "av1/common/odintrin.h" + +/*Constants for use with OD_DIVU_SMALL(). + See \cite{Rob05} for details on computing these constants. + @INPROCEEDINGS{Rob05, + author="Arch D. Robison", + title="{N}-bit Unsigned Division via {N}-bit Multiply-Add", + booktitle="Proc. of the 17th IEEE Symposium on Computer Arithmetic + (ARITH'05)", + pages="131--139", + address="Cape Cod, MA", + month=Jun, + year=2005 + }*/ +uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2] = { + { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xAAAAAAAB, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xCCCCCCCD, 0 }, { 0xAAAAAAAB, 0 }, + { 0x92492492, 0x92492492 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xE38E38E4, 0 }, { 0xCCCCCCCD, 0 }, + { 0xBA2E8BA3, 0 }, { 0xAAAAAAAB, 0 }, + { 0x9D89D89E, 0 }, { 0x92492492, 0x92492492 }, + { 0x88888889, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xF0F0F0F1, 0 }, { 0xE38E38E4, 0 }, + { 0xD79435E5, 0xD79435E5 }, { 0xCCCCCCCD, 0 }, + { 0xC30C30C3, 0xC30C30C3 }, { 0xBA2E8BA3, 0 }, + { 0xB21642C9, 0 }, { 0xAAAAAAAB, 0 }, + { 0xA3D70A3E, 0 }, { 0x9D89D89E, 0 }, + { 0x97B425ED, 0x97B425ED }, { 0x92492492, 0x92492492 }, + { 0x8D3DCB09, 0 }, { 0x88888889, 0 }, + { 0x84210842, 0x84210842 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xF83E0F84, 0 }, { 0xF0F0F0F1, 0 }, + { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE38E38E4, 0 }, + { 0xDD67C8A6, 0xDD67C8A6 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD20D20D2, 0xD20D20D2 }, { 0xCCCCCCCD, 0 }, + { 0xC7CE0C7D, 0 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xBE82FA0C, 0 }, { 0xBA2E8BA3, 0 }, + { 0xB60B60B6, 0xB60B60B6 }, { 0xB21642C9, 0 }, + { 0xAE4C415D, 0 }, { 0xAAAAAAAB, 0 }, + { 0xA72F053A, 0 }, { 0xA3D70A3E, 0 }, + { 0xA0A0A0A1, 0 }, { 0x9D89D89E, 0 }, + { 0x9A90E7D9, 0x9A90E7D9 }, { 0x97B425ED, 0x97B425ED }, + { 0x94F2094F, 0x94F2094F }, { 0x92492492, 0x92492492 }, + { 0x8FB823EE, 0x8FB823EE }, { 0x8D3DCB09, 0 }, + { 0x8AD8F2FC, 0 }, { 0x88888889, 0 }, + { 0x864B8A7E, 0 }, { 0x84210842, 0x84210842 }, + { 0x82082082, 0x82082082 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFC0FC0FD, 0 }, { 0xF83E0F84, 0 }, + { 0xF4898D60, 0 }, { 0xF0F0F0F1, 0 }, + { 0xED7303B6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, + { 0xE6C2B449, 0 }, { 0xE38E38E4, 0 }, + { 0xE070381C, 0xE070381C }, { 0xDD67C8A6, 0xDD67C8A6 }, + { 0xDA740DA8, 0 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD4C77B04, 0 }, 
{ 0xD20D20D2, 0xD20D20D2 }, + { 0xCF6474A9, 0 }, { 0xCCCCCCCD, 0 }, + { 0xCA4587E7, 0 }, { 0xC7CE0C7D, 0 }, + { 0xC565C87C, 0 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xC0C0C0C1, 0 }, { 0xBE82FA0C, 0 }, + { 0xBC52640C, 0 }, { 0xBA2E8BA3, 0 }, + { 0xB81702E1, 0 }, { 0xB60B60B6, 0xB60B60B6 }, + { 0xB40B40B4, 0xB40B40B4 }, { 0xB21642C9, 0 }, + { 0xB02C0B03, 0 }, { 0xAE4C415D, 0 }, + { 0xAC769184, 0xAC769184 }, { 0xAAAAAAAB, 0 }, + { 0xA8E83F57, 0xA8E83F57 }, { 0xA72F053A, 0 }, + { 0xA57EB503, 0 }, { 0xA3D70A3E, 0 }, + { 0xA237C32B, 0xA237C32B }, { 0xA0A0A0A1, 0 }, + { 0x9F1165E7, 0x9F1165E7 }, { 0x9D89D89E, 0 }, + { 0x9C09C09C, 0x9C09C09C }, { 0x9A90E7D9, 0x9A90E7D9 }, + { 0x991F1A51, 0x991F1A51 }, { 0x97B425ED, 0x97B425ED }, + { 0x964FDA6C, 0x964FDA6C }, { 0x94F2094F, 0x94F2094F }, + { 0x939A85C4, 0x939A85C4 }, { 0x92492492, 0x92492492 }, + { 0x90FDBC09, 0x90FDBC09 }, { 0x8FB823EE, 0x8FB823EE }, + { 0x8E78356D, 0x8E78356D }, { 0x8D3DCB09, 0 }, + { 0x8C08C08C, 0x8C08C08C }, { 0x8AD8F2FC, 0 }, + { 0x89AE408A, 0 }, { 0x88888889, 0 }, + { 0x8767AB5F, 0x8767AB5F }, { 0x864B8A7E, 0 }, + { 0x85340853, 0x85340853 }, { 0x84210842, 0x84210842 }, + { 0x83126E98, 0 }, { 0x82082082, 0x82082082 }, + { 0x81020408, 0x81020408 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFE03F810, 0 }, { 0xFC0FC0FD, 0 }, + { 0xFA232CF3, 0 }, { 0xF83E0F84, 0 }, + { 0xF6603D99, 0 }, { 0xF4898D60, 0 }, + { 0xF2B9D649, 0 }, { 0xF0F0F0F1, 0 }, + { 0xEF2EB720, 0 }, { 0xED7303B6, 0 }, + { 0xEBBDB2A6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, + { 0xE865AC7C, 0 }, { 0xE6C2B449, 0 }, + { 0xE525982B, 0 }, { 0xE38E38E4, 0 }, + { 0xE1FC780F, 0 }, { 0xE070381C, 0xE070381C }, + { 0xDEE95C4D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 }, + { 0xDBEB61EF, 0 }, { 0xDA740DA8, 0 }, + { 0xD901B204, 0 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD62B80D7, 0 }, { 0xD4C77B04, 0 }, + { 0xD3680D37, 0 }, { 0xD20D20D2, 0xD20D20D2 }, + { 0xD0B69FCC, 0 }, { 0xCF6474A9, 0 }, + { 0xCE168A77, 0xCE168A77 }, { 0xCCCCCCCD, 0 }, + { 0xCB8727C1, 0 }, { 0xCA4587E7, 0 }, + { 0xC907DA4F, 0 }, { 0xC7CE0C7D, 0 }, + { 0xC6980C6A, 0 }, { 0xC565C87C, 0 }, + { 0xC4372F86, 0 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xC1E4BBD6, 0 }, { 0xC0C0C0C1, 0 }, + { 0xBFA02FE8, 0xBFA02FE8 }, { 0xBE82FA0C, 0 }, + { 0xBD691047, 0xBD691047 }, { 0xBC52640C, 0 }, + { 0xBB3EE722, 0 }, { 0xBA2E8BA3, 0 }, + { 0xB92143FA, 0xB92143FA }, { 0xB81702E1, 0 }, + { 0xB70FBB5A, 0xB70FBB5A }, { 0xB60B60B6, 0xB60B60B6 }, + { 0xB509E68B, 0 }, { 0xB40B40B4, 0xB40B40B4 }, + { 0xB30F6353, 0 }, { 0xB21642C9, 0 }, + { 0xB11FD3B8, 0xB11FD3B8 }, { 0xB02C0B03, 0 }, + { 0xAF3ADDC7, 0 }, { 0xAE4C415D, 0 }, + { 0xAD602B58, 0xAD602B58 }, { 0xAC769184, 0xAC769184 }, + { 0xAB8F69E3, 0 }, { 0xAAAAAAAB, 0 }, + { 0xA9C84A48, 0 }, { 0xA8E83F57, 0xA8E83F57 }, + { 0xA80A80A8, 0xA80A80A8 }, { 0xA72F053A, 0 }, + { 0xA655C439, 0xA655C439 }, { 0xA57EB503, 0 }, + { 0xA4A9CF1E, 0 }, { 0xA3D70A3E, 0 }, + { 0xA3065E40, 0 }, { 0xA237C32B, 0xA237C32B }, + { 0xA16B312F, 0 }, { 0xA0A0A0A1, 0 }, + { 0x9FD809FE, 0 }, { 0x9F1165E7, 0x9F1165E7 }, + { 0x9E4CAD24, 0 }, { 0x9D89D89E, 0 }, + { 0x9CC8E161, 0 }, { 0x9C09C09C, 0x9C09C09C }, + { 0x9B4C6F9F, 0 }, { 0x9A90E7D9, 0x9A90E7D9 }, + { 0x99D722DB, 0 }, { 0x991F1A51, 0x991F1A51 }, + { 0x9868C80A, 0 }, { 0x97B425ED, 0x97B425ED }, + { 0x97012E02, 0x97012E02 }, { 0x964FDA6C, 0x964FDA6C }, + { 0x95A02568, 0x95A02568 }, { 0x94F2094F, 0x94F2094F }, + { 0x94458094, 0x94458094 }, { 0x939A85C4, 0x939A85C4 }, + { 0x92F11384, 0x92F11384 }, { 0x92492492, 0x92492492 }, + { 0x91A2B3C5, 0 }, { 0x90FDBC09, 0x90FDBC09 }, + { 0x905A3863, 0x905A3863 }, { 
0x8FB823EE, 0x8FB823EE }, + { 0x8F1779DA, 0 }, { 0x8E78356D, 0x8E78356D }, + { 0x8DDA5202, 0x8DDA5202 }, { 0x8D3DCB09, 0 }, + { 0x8CA29C04, 0x8CA29C04 }, { 0x8C08C08C, 0x8C08C08C }, + { 0x8B70344A, 0x8B70344A }, { 0x8AD8F2FC, 0 }, + { 0x8A42F870, 0x8A42F870 }, { 0x89AE408A, 0 }, + { 0x891AC73B, 0 }, { 0x88888889, 0 }, + { 0x87F78088, 0 }, { 0x8767AB5F, 0x8767AB5F }, + { 0x86D90545, 0 }, { 0x864B8A7E, 0 }, + { 0x85BF3761, 0x85BF3761 }, { 0x85340853, 0x85340853 }, + { 0x84A9F9C8, 0x84A9F9C8 }, { 0x84210842, 0x84210842 }, + { 0x83993052, 0x83993052 }, { 0x83126E98, 0 }, + { 0x828CBFBF, 0 }, { 0x82082082, 0x82082082 }, + { 0x81848DA9, 0 }, { 0x81020408, 0x81020408 }, + { 0x80808081, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFF00FF01, 0 }, { 0xFE03F810, 0 }, + { 0xFD08E551, 0 }, { 0xFC0FC0FD, 0 }, + { 0xFB188566, 0 }, { 0xFA232CF3, 0 }, + { 0xF92FB222, 0 }, { 0xF83E0F84, 0 }, + { 0xF74E3FC3, 0 }, { 0xF6603D99, 0 }, + { 0xF57403D6, 0 }, { 0xF4898D60, 0 }, + { 0xF3A0D52D, 0 }, { 0xF2B9D649, 0 }, + { 0xF1D48BCF, 0 }, { 0xF0F0F0F1, 0 }, + { 0xF00F00F0, 0xF00F00F0 }, { 0xEF2EB720, 0 }, + { 0xEE500EE5, 0xEE500EE5 }, { 0xED7303B6, 0 }, + { 0xEC979119, 0 }, { 0xEBBDB2A6, 0 }, + { 0xEAE56404, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, + { 0xE9396520, 0 }, { 0xE865AC7C, 0 }, + { 0xE79372E3, 0 }, { 0xE6C2B449, 0 }, + { 0xE5F36CB0, 0xE5F36CB0 }, { 0xE525982B, 0 }, + { 0xE45932D8, 0 }, { 0xE38E38E4, 0 }, + { 0xE2C4A689, 0 }, { 0xE1FC780F, 0 }, + { 0xE135A9CA, 0 }, { 0xE070381C, 0xE070381C }, + { 0xDFAC1F75, 0 }, { 0xDEE95C4D, 0 }, + { 0xDE27EB2D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 }, + { 0xDCA8F159, 0 }, { 0xDBEB61EF, 0 }, + { 0xDB2F171E, 0 }, { 0xDA740DA8, 0 }, + { 0xD9BA4257, 0 }, { 0xD901B204, 0 }, + { 0xD84A598F, 0 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD6DF43FD, 0 }, { 0xD62B80D7, 0 }, + { 0xD578E97D, 0 }, { 0xD4C77B04, 0 }, + { 0xD417328A, 0 }, { 0xD3680D37, 0 }, + { 0xD2BA083C, 0 }, { 0xD20D20D2, 0xD20D20D2 }, + { 0xD161543E, 0xD161543E }, { 0xD0B69FCC, 0 }, + { 0xD00D00D0, 0xD00D00D0 }, { 0xCF6474A9, 0 }, + { 0xCEBCF8BC, 0 }, { 0xCE168A77, 0xCE168A77 }, + { 0xCD712753, 0 }, { 0xCCCCCCCD, 0 }, + { 0xCC29786D, 0 }, { 0xCB8727C1, 0 }, + { 0xCAE5D85F, 0xCAE5D85F }, { 0xCA4587E7, 0 }, + { 0xC9A633FD, 0 }, { 0xC907DA4F, 0 }, + { 0xC86A7890, 0xC86A7890 }, { 0xC7CE0C7D, 0 }, + { 0xC73293D8, 0 }, { 0xC6980C6A, 0 }, + { 0xC5FE7403, 0xC5FE7403 }, { 0xC565C87C, 0 }, + { 0xC4CE07B0, 0xC4CE07B0 }, { 0xC4372F86, 0 }, + { 0xC3A13DE6, 0xC3A13DE6 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xC2780614, 0 }, { 0xC1E4BBD6, 0 }, + { 0xC152500C, 0xC152500C }, { 0xC0C0C0C1, 0 }, + { 0xC0300C03, 0xC0300C03 }, { 0xBFA02FE8, 0xBFA02FE8 }, + { 0xBF112A8B, 0 }, { 0xBE82FA0C, 0 }, + { 0xBDF59C92, 0 }, { 0xBD691047, 0xBD691047 }, + { 0xBCDD535E, 0 }, { 0xBC52640C, 0 }, + { 0xBBC8408D, 0 }, { 0xBB3EE722, 0 }, + { 0xBAB65610, 0xBAB65610 }, { 0xBA2E8BA3, 0 }, + { 0xB9A7862A, 0xB9A7862A }, { 0xB92143FA, 0xB92143FA }, + { 0xB89BC36D, 0 }, { 0xB81702E1, 0 }, + { 0xB79300B8, 0 }, { 0xB70FBB5A, 0xB70FBB5A }, + { 0xB68D3134, 0xB68D3134 }, { 0xB60B60B6, 0xB60B60B6 }, + { 0xB58A4855, 0xB58A4855 }, { 0xB509E68B, 0 }, + { 0xB48A39D4, 0xB48A39D4 }, { 0xB40B40B4, 0xB40B40B4 }, + { 0xB38CF9B0, 0xB38CF9B0 }, { 0xB30F6353, 0 }, + { 0xB2927C2A, 0 }, { 0xB21642C9, 0 }, + { 0xB19AB5C5, 0 }, { 0xB11FD3B8, 0xB11FD3B8 }, + { 0xB0A59B42, 0 }, { 0xB02C0B03, 0 }, + { 0xAFB321A1, 0xAFB321A1 }, { 0xAF3ADDC7, 0 }, + { 0xAEC33E20, 0 }, { 0xAE4C415D, 0 }, + { 0xADD5E632, 0xADD5E632 }, { 0xAD602B58, 0xAD602B58 }, + { 0xACEB0F89, 0xACEB0F89 }, { 0xAC769184, 0xAC769184 }, + { 0xAC02B00B, 0 }, 
{ 0xAB8F69E3, 0 }, + { 0xAB1CBDD4, 0 }, { 0xAAAAAAAB, 0 }, + { 0xAA392F36, 0 }, { 0xA9C84A48, 0 }, + { 0xA957FAB5, 0xA957FAB5 }, { 0xA8E83F57, 0xA8E83F57 }, + { 0xA8791709, 0 }, { 0xA80A80A8, 0xA80A80A8 }, + { 0xA79C7B17, 0 }, { 0xA72F053A, 0 }, + { 0xA6C21DF7, 0 }, { 0xA655C439, 0xA655C439 }, + { 0xA5E9F6ED, 0xA5E9F6ED }, { 0xA57EB503, 0 }, + { 0xA513FD6C, 0 }, { 0xA4A9CF1E, 0 }, + { 0xA4402910, 0xA4402910 }, { 0xA3D70A3E, 0 }, + { 0xA36E71A3, 0 }, { 0xA3065E40, 0 }, + { 0xA29ECF16, 0xA29ECF16 }, { 0xA237C32B, 0xA237C32B }, + { 0xA1D13986, 0 }, { 0xA16B312F, 0 }, + { 0xA105A933, 0 }, { 0xA0A0A0A1, 0 }, + { 0xA03C1689, 0 }, { 0x9FD809FE, 0 }, + { 0x9F747A15, 0x9F747A15 }, { 0x9F1165E7, 0x9F1165E7 }, + { 0x9EAECC8D, 0x9EAECC8D }, { 0x9E4CAD24, 0 }, + { 0x9DEB06C9, 0x9DEB06C9 }, { 0x9D89D89E, 0 }, + { 0x9D2921C4, 0 }, { 0x9CC8E161, 0 }, + { 0x9C69169B, 0x9C69169B }, { 0x9C09C09C, 0x9C09C09C }, + { 0x9BAADE8E, 0x9BAADE8E }, { 0x9B4C6F9F, 0 }, + { 0x9AEE72FD, 0 }, { 0x9A90E7D9, 0x9A90E7D9 }, + { 0x9A33CD67, 0x9A33CD67 }, { 0x99D722DB, 0 }, + { 0x997AE76B, 0x997AE76B }, { 0x991F1A51, 0x991F1A51 }, + { 0x98C3BAC7, 0x98C3BAC7 }, { 0x9868C80A, 0 }, + { 0x980E4156, 0x980E4156 }, { 0x97B425ED, 0x97B425ED }, + { 0x975A7510, 0 }, { 0x97012E02, 0x97012E02 }, + { 0x96A8500A, 0 }, { 0x964FDA6C, 0x964FDA6C }, + { 0x95F7CC73, 0 }, { 0x95A02568, 0x95A02568 }, + { 0x9548E498, 0 }, { 0x94F2094F, 0x94F2094F }, + { 0x949B92DE, 0 }, { 0x94458094, 0x94458094 }, + { 0x93EFD1C5, 0x93EFD1C5 }, { 0x939A85C4, 0x939A85C4 }, + { 0x93459BE7, 0 }, { 0x92F11384, 0x92F11384 }, + { 0x929CEBF5, 0 }, { 0x92492492, 0x92492492 }, + { 0x91F5BCB9, 0 }, { 0x91A2B3C5, 0 }, + { 0x91500915, 0x91500915 }, { 0x90FDBC09, 0x90FDBC09 }, + { 0x90ABCC02, 0x90ABCC02 }, { 0x905A3863, 0x905A3863 }, + { 0x90090090, 0x90090090 }, { 0x8FB823EE, 0x8FB823EE }, + { 0x8F67A1E4, 0 }, { 0x8F1779DA, 0 }, + { 0x8EC7AB3A, 0 }, { 0x8E78356D, 0x8E78356D }, + { 0x8E2917E1, 0 }, { 0x8DDA5202, 0x8DDA5202 }, + { 0x8D8BE340, 0 }, { 0x8D3DCB09, 0 }, + { 0x8CF008CF, 0x8CF008CF }, { 0x8CA29C04, 0x8CA29C04 }, + { 0x8C55841D, 0 }, { 0x8C08C08C, 0x8C08C08C }, + { 0x8BBC50C9, 0 }, { 0x8B70344A, 0x8B70344A }, + { 0x8B246A88, 0 }, { 0x8AD8F2FC, 0 }, + { 0x8A8DCD20, 0 }, { 0x8A42F870, 0x8A42F870 }, + { 0x89F8746A, 0 }, { 0x89AE408A, 0 }, + { 0x89645C4F, 0x89645C4F }, { 0x891AC73B, 0 }, + { 0x88D180CD, 0x88D180CD }, { 0x88888889, 0 }, + { 0x883FDDF0, 0x883FDDF0 }, { 0x87F78088, 0 }, + { 0x87AF6FD6, 0 }, { 0x8767AB5F, 0x8767AB5F }, + { 0x872032AC, 0x872032AC }, { 0x86D90545, 0 }, + { 0x869222B2, 0 }, { 0x864B8A7E, 0 }, + { 0x86053C34, 0x86053C34 }, { 0x85BF3761, 0x85BF3761 }, + { 0x85797B91, 0x85797B91 }, { 0x85340853, 0x85340853 }, + { 0x84EEDD36, 0 }, { 0x84A9F9C8, 0x84A9F9C8 }, + { 0x84655D9C, 0 }, { 0x84210842, 0x84210842 }, + { 0x83DCF94E, 0 }, { 0x83993052, 0x83993052 }, + { 0x8355ACE4, 0 }, { 0x83126E98, 0 }, + { 0x82CF7504, 0 }, { 0x828CBFBF, 0 }, + { 0x824A4E61, 0 }, { 0x82082082, 0x82082082 }, + { 0x81C635BC, 0x81C635BC }, { 0x81848DA9, 0 }, + { 0x814327E4, 0 }, { 0x81020408, 0x81020408 }, + { 0x80C121B3, 0 }, { 0x80808081, 0 }, + { 0x80402010, 0x80402010 }, { 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFF803FE1, 0 }, { 0xFF00FF01, 0 }, + { 0xFE823CA6, 0 }, { 0xFE03F810, 0 }, + { 0xFD863087, 0 }, { 0xFD08E551, 0 }, + { 0xFC8C15B5, 0 }, { 0xFC0FC0FD, 0 }, + { 0xFB93E673, 0 }, { 0xFB188566, 0 }, + { 0xFA9D9D20, 0 }, { 0xFA232CF3, 0 }, + { 0xF9A9342D, 0 }, { 0xF92FB222, 0 }, + { 0xF8B6A622, 0xF8B6A622 }, { 0xF83E0F84, 0 }, + { 0xF7C5ED9D, 0 }, { 0xF74E3FC3, 0 }, + { 0xF6D7054E, 0 
}, { 0xF6603D99, 0 }, + { 0xF5E9E7FD, 0 }, { 0xF57403D6, 0 }, + { 0xF4FE9083, 0 }, { 0xF4898D60, 0 }, + { 0xF414F9CE, 0 }, { 0xF3A0D52D, 0 }, + { 0xF32D1EE0, 0 }, { 0xF2B9D649, 0 }, + { 0xF246FACC, 0 }, { 0xF1D48BCF, 0 }, + { 0xF16288B9, 0 }, { 0xF0F0F0F1, 0 }, + { 0xF07FC3E0, 0xF07FC3E0 }, { 0xF00F00F0, 0xF00F00F0 }, + { 0xEF9EA78C, 0 }, { 0xEF2EB720, 0 }, + { 0xEEBF2F19, 0 }, { 0xEE500EE5, 0xEE500EE5 }, + { 0xEDE155F4, 0 }, { 0xED7303B6, 0 }, + { 0xED05179C, 0xED05179C }, { 0xEC979119, 0 }, + { 0xEC2A6FA0, 0xEC2A6FA0 }, { 0xEBBDB2A6, 0 }, + { 0xEB5159A0, 0 }, { 0xEAE56404, 0 }, + { 0xEA79D14A, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA }, + { 0xE9A3D25E, 0xE9A3D25E }, { 0xE9396520, 0 }, + { 0xE8CF58AB, 0 }, { 0xE865AC7C, 0 }, + { 0xE7FC600F, 0 }, { 0xE79372E3, 0 }, + { 0xE72AE476, 0 }, { 0xE6C2B449, 0 }, + { 0xE65AE1DC, 0 }, { 0xE5F36CB0, 0xE5F36CB0 }, + { 0xE58C544A, 0 }, { 0xE525982B, 0 }, + { 0xE4BF37D9, 0 }, { 0xE45932D8, 0 }, + { 0xE3F388AF, 0 }, { 0xE38E38E4, 0 }, + { 0xE32942FF, 0 }, { 0xE2C4A689, 0 }, + { 0xE260630B, 0 }, { 0xE1FC780F, 0 }, + { 0xE198E520, 0 }, { 0xE135A9CA, 0 }, + { 0xE0D2C59A, 0 }, { 0xE070381C, 0xE070381C }, + { 0xE00E00E0, 0xE00E00E0 }, { 0xDFAC1F75, 0 }, + { 0xDF4A9369, 0 }, { 0xDEE95C4D, 0 }, + { 0xDE8879B3, 0 }, { 0xDE27EB2D, 0 }, + { 0xDDC7B04D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 }, + { 0xDD0833CE, 0 }, { 0xDCA8F159, 0 }, + { 0xDC4A00DD, 0 }, { 0xDBEB61EF, 0 }, + { 0xDB8D1428, 0 }, { 0xDB2F171E, 0 }, + { 0xDAD16A6B, 0 }, { 0xDA740DA8, 0 }, + { 0xDA17006D, 0xDA17006D }, { 0xD9BA4257, 0 }, + { 0xD95DD300, 0 }, { 0xD901B204, 0 }, + { 0xD8A5DEFF, 0 }, { 0xD84A598F, 0 }, + { 0xD7EF2152, 0 }, { 0xD79435E5, 0xD79435E5 }, + { 0xD73996E9, 0 }, { 0xD6DF43FD, 0 }, + { 0xD6853CC1, 0 }, { 0xD62B80D7, 0 }, + { 0xD5D20FDF, 0 }, { 0xD578E97D, 0 }, + { 0xD5200D52, 0xD5200D52 }, { 0xD4C77B04, 0 }, + { 0xD46F3235, 0 }, { 0xD417328A, 0 }, + { 0xD3BF7BA9, 0 }, { 0xD3680D37, 0 }, + { 0xD310E6DB, 0 }, { 0xD2BA083C, 0 }, + { 0xD2637101, 0 }, { 0xD20D20D2, 0xD20D20D2 }, + { 0xD1B71759, 0 }, { 0xD161543E, 0xD161543E }, + { 0xD10BD72C, 0 }, { 0xD0B69FCC, 0 }, + { 0xD061ADCA, 0 }, { 0xD00D00D0, 0xD00D00D0 }, + { 0xCFB8988C, 0 }, { 0xCF6474A9, 0 }, + { 0xCF1094D4, 0 }, { 0xCEBCF8BC, 0 }, + { 0xCE69A00D, 0 }, { 0xCE168A77, 0xCE168A77 }, + { 0xCDC3B7A9, 0xCDC3B7A9 }, { 0xCD712753, 0 }, + { 0xCD1ED924, 0 }, { 0xCCCCCCCD, 0 }, + { 0xCC7B0200, 0 }, { 0xCC29786D, 0 }, + { 0xCBD82FC7, 0 }, { 0xCB8727C1, 0 }, + { 0xCB36600D, 0 }, { 0xCAE5D85F, 0xCAE5D85F }, + { 0xCA95906C, 0 }, { 0xCA4587E7, 0 }, + { 0xC9F5BE86, 0 }, { 0xC9A633FD, 0 }, + { 0xC956E803, 0xC956E803 }, { 0xC907DA4F, 0 }, + { 0xC8B90A96, 0 }, { 0xC86A7890, 0xC86A7890 }, + { 0xC81C23F5, 0xC81C23F5 }, { 0xC7CE0C7D, 0 }, + { 0xC78031E0, 0xC78031E0 }, { 0xC73293D8, 0 }, + { 0xC6E5321D, 0 }, { 0xC6980C6A, 0 }, + { 0xC64B2278, 0xC64B2278 }, { 0xC5FE7403, 0xC5FE7403 }, + { 0xC5B200C6, 0 }, { 0xC565C87C, 0 }, + { 0xC519CAE0, 0xC519CAE0 }, { 0xC4CE07B0, 0xC4CE07B0 }, + { 0xC4827EA8, 0xC4827EA8 }, { 0xC4372F86, 0 }, + { 0xC3EC1A06, 0 }, { 0xC3A13DE6, 0xC3A13DE6 }, + { 0xC3569AE6, 0 }, { 0xC30C30C3, 0xC30C30C3 }, + { 0xC2C1FF3E, 0 }, { 0xC2780614, 0 }, + { 0xC22E4507, 0 }, { 0xC1E4BBD6, 0 }, + { 0xC19B6A42, 0 }, { 0xC152500C, 0xC152500C }, + { 0xC1096CF6, 0 }, { 0xC0C0C0C1, 0 }, + { 0xC0784B2F, 0 }, { 0xC0300C03, 0xC0300C03 }, + { 0xBFE80300, 0 }, { 0xBFA02FE8, 0xBFA02FE8 }, + { 0xBF589280, 0 }, { 0xBF112A8B, 0 }, + { 0xBEC9F7CE, 0 }, { 0xBE82FA0C, 0 }, + { 0xBE3C310C, 0 }, { 0xBDF59C92, 0 }, + { 0xBDAF3C64, 0 }, { 0xBD691047, 0xBD691047 }, + { 0xBD231803, 0 
}, { 0xBCDD535E, 0 }, + { 0xBC97C21E, 0xBC97C21E }, { 0xBC52640C, 0 }, + { 0xBC0D38EE, 0xBC0D38EE }, { 0xBBC8408D, 0 }, + { 0xBB837AB1, 0 }, { 0xBB3EE722, 0 }, + { 0xBAFA85A9, 0xBAFA85A9 }, { 0xBAB65610, 0xBAB65610 }, + { 0xBA725820, 0xBA725820 }, { 0xBA2E8BA3, 0 }, + { 0xB9EAF063, 0 }, { 0xB9A7862A, 0xB9A7862A }, + { 0xB9644CC4, 0 }, { 0xB92143FA, 0xB92143FA }, + { 0xB8DE6B9A, 0 }, { 0xB89BC36D, 0 }, + { 0xB8594B41, 0 }, { 0xB81702E1, 0 }, + { 0xB7D4EA19, 0xB7D4EA19 }, { 0xB79300B8, 0 }, + { 0xB7514689, 0 }, { 0xB70FBB5A, 0xB70FBB5A }, + { 0xB6CE5EF9, 0xB6CE5EF9 }, { 0xB68D3134, 0xB68D3134 }, + { 0xB64C31D9, 0 }, { 0xB60B60B6, 0xB60B60B6 }, + { 0xB5CABD9B, 0 }, { 0xB58A4855, 0xB58A4855 }, + { 0xB54A00B5, 0xB54A00B5 }, { 0xB509E68B, 0 }, + { 0xB4C9F9A5, 0 }, { 0xB48A39D4, 0xB48A39D4 }, + { 0xB44AA6E9, 0xB44AA6E9 }, { 0xB40B40B4, 0xB40B40B4 }, + { 0xB3CC0706, 0 }, { 0xB38CF9B0, 0xB38CF9B0 }, + { 0xB34E1884, 0 }, { 0xB30F6353, 0 }, + { 0xB2D0D9EF, 0 }, { 0xB2927C2A, 0 }, + { 0xB25449D7, 0 }, { 0xB21642C9, 0 }, + { 0xB1D866D1, 0xB1D866D1 }, { 0xB19AB5C5, 0 }, + { 0xB15D2F76, 0 }, { 0xB11FD3B8, 0xB11FD3B8 }, + { 0xB0E2A260, 0xB0E2A260 }, { 0xB0A59B42, 0 }, + { 0xB068BE31, 0 }, { 0xB02C0B03, 0 }, + { 0xAFEF818C, 0 }, { 0xAFB321A1, 0xAFB321A1 }, + { 0xAF76EB19, 0 }, { 0xAF3ADDC7, 0 }, + { 0xAEFEF982, 0 }, { 0xAEC33E20, 0 }, + { 0xAE87AB76, 0xAE87AB76 }, { 0xAE4C415D, 0 }, + { 0xAE10FFA9, 0 }, { 0xADD5E632, 0xADD5E632 }, + { 0xAD9AF4D0, 0 }, { 0xAD602B58, 0xAD602B58 }, + { 0xAD2589A4, 0 }, { 0xACEB0F89, 0xACEB0F89 }, + { 0xACB0BCE1, 0xACB0BCE1 }, { 0xAC769184, 0xAC769184 }, + { 0xAC3C8D4A, 0 }, { 0xAC02B00B, 0 }, + { 0xABC8F9A0, 0xABC8F9A0 }, { 0xAB8F69E3, 0 }, + { 0xAB5600AC, 0 }, { 0xAB1CBDD4, 0 }, + { 0xAAE3A136, 0 }, { 0xAAAAAAAB, 0 }, + { 0xAA71DA0D, 0 }, { 0xAA392F36, 0 }, + { 0xAA00AA01, 0 }, { 0xA9C84A48, 0 }, + { 0xA9900FE6, 0 }, { 0xA957FAB5, 0xA957FAB5 }, + { 0xA9200A92, 0xA9200A92 }, { 0xA8E83F57, 0xA8E83F57 }, + { 0xA8B098E0, 0xA8B098E0 }, { 0xA8791709, 0 }, + { 0xA841B9AD, 0 }, { 0xA80A80A8, 0xA80A80A8 }, + { 0xA7D36BD8, 0 }, { 0xA79C7B17, 0 }, + { 0xA765AE44, 0 }, { 0xA72F053A, 0 }, + { 0xA6F87FD6, 0xA6F87FD6 }, { 0xA6C21DF7, 0 }, + { 0xA68BDF79, 0 }, { 0xA655C439, 0xA655C439 }, + { 0xA61FCC16, 0xA61FCC16 }, { 0xA5E9F6ED, 0xA5E9F6ED }, + { 0xA5B4449D, 0 }, { 0xA57EB503, 0 }, + { 0xA54947FE, 0 }, { 0xA513FD6C, 0 }, + { 0xA4DED52C, 0xA4DED52C }, { 0xA4A9CF1E, 0 }, + { 0xA474EB1F, 0xA474EB1F }, { 0xA4402910, 0xA4402910 }, + { 0xA40B88D0, 0 }, { 0xA3D70A3E, 0 }, + { 0xA3A2AD39, 0xA3A2AD39 }, { 0xA36E71A3, 0 }, + { 0xA33A575A, 0xA33A575A }, { 0xA3065E40, 0 }, + { 0xA2D28634, 0 }, { 0xA29ECF16, 0xA29ECF16 }, + { 0xA26B38C9, 0 }, { 0xA237C32B, 0xA237C32B }, + { 0xA2046E1F, 0xA2046E1F }, { 0xA1D13986, 0 }, + { 0xA19E2540, 0 }, { 0xA16B312F, 0 }, + { 0xA1385D35, 0 }, { 0xA105A933, 0 }, + { 0xA0D3150C, 0 }, { 0xA0A0A0A1, 0 }, + { 0xA06E4BD4, 0xA06E4BD4 }, { 0xA03C1689, 0 }, + { 0xA00A00A0, 0xA00A00A0 }, { 0x9FD809FE, 0 }, + { 0x9FA63284, 0 }, { 0x9F747A15, 0x9F747A15 }, + { 0x9F42E095, 0x9F42E095 }, { 0x9F1165E7, 0x9F1165E7 }, + { 0x9EE009EE, 0x9EE009EE }, { 0x9EAECC8D, 0x9EAECC8D }, + { 0x9E7DADA9, 0 }, { 0x9E4CAD24, 0 }, + { 0x9E1BCAE3, 0 }, { 0x9DEB06C9, 0x9DEB06C9 }, + { 0x9DBA60BB, 0x9DBA60BB }, { 0x9D89D89E, 0 }, + { 0x9D596E54, 0x9D596E54 }, { 0x9D2921C4, 0 }, + { 0x9CF8F2D1, 0x9CF8F2D1 }, { 0x9CC8E161, 0 }, + { 0x9C98ED58, 0 }, { 0x9C69169B, 0x9C69169B }, + { 0x9C395D10, 0x9C395D10 }, { 0x9C09C09C, 0x9C09C09C }, + { 0x9BDA4124, 0x9BDA4124 }, { 0x9BAADE8E, 0x9BAADE8E }, + { 
0x9B7B98C0, 0 }, { 0x9B4C6F9F, 0 }, + { 0x9B1D6311, 0x9B1D6311 }, { 0x9AEE72FD, 0 }, + { 0x9ABF9F48, 0x9ABF9F48 }, { 0x9A90E7D9, 0x9A90E7D9 }, + { 0x9A624C97, 0 }, { 0x9A33CD67, 0x9A33CD67 }, + { 0x9A056A31, 0 }, { 0x99D722DB, 0 }, + { 0x99A8F74C, 0 }, { 0x997AE76B, 0x997AE76B }, + { 0x994CF320, 0x994CF320 }, { 0x991F1A51, 0x991F1A51 }, + { 0x98F15CE7, 0 }, { 0x98C3BAC7, 0x98C3BAC7 }, + { 0x989633DB, 0x989633DB }, { 0x9868C80A, 0 }, + { 0x983B773B, 0 }, { 0x980E4156, 0x980E4156 }, + { 0x97E12644, 0x97E12644 }, { 0x97B425ED, 0x97B425ED }, + { 0x97874039, 0 }, { 0x975A7510, 0 }, + { 0x972DC45B, 0 }, { 0x97012E02, 0x97012E02 }, + { 0x96D4B1EF, 0 }, { 0x96A8500A, 0 }, + { 0x967C083B, 0 }, { 0x964FDA6C, 0x964FDA6C }, + { 0x9623C686, 0x9623C686 }, { 0x95F7CC73, 0 }, + { 0x95CBEC1B, 0 }, { 0x95A02568, 0x95A02568 }, + { 0x95747844, 0 }, { 0x9548E498, 0 }, + { 0x951D6A4E, 0 }, { 0x94F2094F, 0x94F2094F }, + { 0x94C6C187, 0 }, { 0x949B92DE, 0 }, + { 0x94707D3F, 0 }, { 0x94458094, 0x94458094 }, + { 0x941A9CC8, 0x941A9CC8 }, { 0x93EFD1C5, 0x93EFD1C5 }, + { 0x93C51F76, 0 }, { 0x939A85C4, 0x939A85C4 }, + { 0x9370049C, 0 }, { 0x93459BE7, 0 }, + { 0x931B4B91, 0 }, { 0x92F11384, 0x92F11384 }, + { 0x92C6F3AC, 0x92C6F3AC }, { 0x929CEBF5, 0 }, + { 0x9272FC48, 0x9272FC48 }, { 0x92492492, 0x92492492 }, + { 0x921F64BF, 0 }, { 0x91F5BCB9, 0 }, + { 0x91CC2C6C, 0x91CC2C6C }, { 0x91A2B3C5, 0 }, + { 0x917952AF, 0 }, { 0x91500915, 0x91500915 }, + { 0x9126D6E5, 0 }, { 0x90FDBC09, 0x90FDBC09 }, + { 0x90D4B86F, 0 }, { 0x90ABCC02, 0x90ABCC02 }, + { 0x9082F6B0, 0 }, { 0x905A3863, 0x905A3863 }, + { 0x9031910A, 0 }, { 0x90090090, 0x90090090 }, + { 0x8FE086E3, 0 }, { 0x8FB823EE, 0x8FB823EE }, + { 0x8F8FD7A0, 0 }, { 0x8F67A1E4, 0 }, + { 0x8F3F82A8, 0x8F3F82A8 }, { 0x8F1779DA, 0 }, + { 0x8EEF8766, 0 }, { 0x8EC7AB3A, 0 }, + { 0x8E9FE542, 0x8E9FE542 }, { 0x8E78356D, 0x8E78356D }, + { 0x8E509BA8, 0x8E509BA8 }, { 0x8E2917E1, 0 }, + { 0x8E01AA05, 0 }, { 0x8DDA5202, 0x8DDA5202 }, + { 0x8DB30FC6, 0x8DB30FC6 }, { 0x8D8BE340, 0 }, + { 0x8D64CC5C, 0 }, { 0x8D3DCB09, 0 }, + { 0x8D16DF35, 0x8D16DF35 }, { 0x8CF008CF, 0x8CF008CF }, + { 0x8CC947C5, 0 }, { 0x8CA29C04, 0x8CA29C04 }, + { 0x8C7C057D, 0 }, { 0x8C55841D, 0 }, + { 0x8C2F17D2, 0x8C2F17D2 }, { 0x8C08C08C, 0x8C08C08C }, + { 0x8BE27E39, 0x8BE27E39 }, { 0x8BBC50C9, 0 }, + { 0x8B963829, 0x8B963829 }, { 0x8B70344A, 0x8B70344A }, + { 0x8B4A451A, 0 }, { 0x8B246A88, 0 }, + { 0x8AFEA483, 0x8AFEA483 }, { 0x8AD8F2FC, 0 }, + { 0x8AB355E0, 0x8AB355E0 }, { 0x8A8DCD20, 0 }, + { 0x8A6858AB, 0 }, { 0x8A42F870, 0x8A42F870 }, + { 0x8A1DAC60, 0x8A1DAC60 }, { 0x89F8746A, 0 }, + { 0x89D3507D, 0 }, { 0x89AE408A, 0 }, + { 0x89894480, 0 }, { 0x89645C4F, 0x89645C4F }, + { 0x893F87E8, 0x893F87E8 }, { 0x891AC73B, 0 }, + { 0x88F61A37, 0x88F61A37 }, { 0x88D180CD, 0x88D180CD }, + { 0x88ACFAEE, 0 }, { 0x88888889, 0 }, + { 0x8864298F, 0 }, { 0x883FDDF0, 0x883FDDF0 }, + { 0x881BA59E, 0 }, { 0x87F78088, 0 }, + { 0x87D36EA0, 0 }, { 0x87AF6FD6, 0 }, + { 0x878B841B, 0 }, { 0x8767AB5F, 0x8767AB5F }, + { 0x8743E595, 0 }, { 0x872032AC, 0x872032AC }, + { 0x86FC9296, 0x86FC9296 }, { 0x86D90545, 0 }, + { 0x86B58AA8, 0 }, { 0x869222B2, 0 }, + { 0x866ECD53, 0x866ECD53 }, { 0x864B8A7E, 0 }, + { 0x86285A23, 0x86285A23 }, { 0x86053C34, 0x86053C34 }, + { 0x85E230A3, 0x85E230A3 }, { 0x85BF3761, 0x85BF3761 }, + { 0x859C5060, 0x859C5060 }, { 0x85797B91, 0x85797B91 }, + { 0x8556B8E7, 0x8556B8E7 }, { 0x85340853, 0x85340853 }, + { 0x851169C7, 0x851169C7 }, { 0x84EEDD36, 0 }, + { 0x84CC6290, 0 }, { 0x84A9F9C8, 0x84A9F9C8 }, + { 0x8487A2D1, 0 
}, { 0x84655D9C, 0 },
+  { 0x84432A1B, 0x84432A1B }, { 0x84210842, 0x84210842 },
+  { 0x83FEF802, 0x83FEF802 }, { 0x83DCF94E, 0 },
+  { 0x83BB0C18, 0 }, { 0x83993052, 0x83993052 },
+  { 0x837765F0, 0x837765F0 }, { 0x8355ACE4, 0 },
+  { 0x83340520, 0x83340520 }, { 0x83126E98, 0 },
+  { 0x82F0E93D, 0x82F0E93D }, { 0x82CF7504, 0 },
+  { 0x82AE11DE, 0 }, { 0x828CBFBF, 0 },
+  { 0x826B7E99, 0x826B7E99 }, { 0x824A4E61, 0 },
+  { 0x82292F08, 0 }, { 0x82082082, 0x82082082 },
+  { 0x81E722C2, 0x81E722C2 }, { 0x81C635BC, 0x81C635BC },
+  { 0x81A55963, 0 }, { 0x81848DA9, 0 },
+  { 0x8163D283, 0 }, { 0x814327E4, 0 },
+  { 0x81228DBF, 0 }, { 0x81020408, 0x81020408 },
+  { 0x80E18AB3, 0 }, { 0x80C121B3, 0 },
+  { 0x80A0C8FB, 0x80A0C8FB }, { 0x80808081, 0 },
+  { 0x80604836, 0x80604836 }, { 0x80402010, 0x80402010 },
+  { 0x80200802, 0x80200802 }, { 0xFFFFFFFF, 0xFFFFFFFF }
+};
diff --git a/libs/libaom/src/av1/common/odintrin.h b/libs/libaom/src/av1/common/odintrin.h
new file mode 100644
index 000000000..e1db0f44d
--- /dev/null
+++ b/libs/libaom/src/av1/common/odintrin.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifndef AOM_AV1_COMMON_ODINTRIN_H_
+#define AOM_AV1_COMMON_ODINTRIN_H_
+
+#include <math.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int od_coeff;
+
+#define OD_DIVU_DMAX (1024)
+
+extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
+
+/*Translate unsigned division by small divisors into multiplications.*/
+#define OD_DIVU_SMALL(_x, _d)                                     \
+  ((uint32_t)((OD_DIVU_SMALL_CONSTS[(_d)-1][0] * (uint64_t)(_x) + \
+               OD_DIVU_SMALL_CONSTS[(_d)-1][1]) >>                \
+              32) >>                                              \
+   (OD_ILOG_NZ(_d) - 1))
+
+#define OD_DIVU(_x, _d) \
+  (((_d) < OD_DIVU_DMAX) ? (OD_DIVU_SMALL((_x), (_d))) : ((_x) / (_d)))
+
+#define OD_MINI AOMMIN
+#define OD_MAXI AOMMAX
+#define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max)))
+
+/*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer.
+  OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/
+#define OD_ILOG_NZ(x) (1 + get_msb(x))
+
+/*Enable special features for gcc and compatible compilers.*/
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#define OD_GNUC_PREREQ(maj, min, pat)                                \
+  ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ >= \
+   ((maj) << 16) + ((min) << 8) + pat)  // NOLINT
+#else
+#define OD_GNUC_PREREQ(maj, min, pat) (0)
+#endif
+
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
+#else
+#define OD_WARN_UNUSED_RESULT
+#endif
+
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_ARG_NONNULL(x) __attribute__((__nonnull__(x)))
+#else
+#define OD_ARG_NONNULL(x)
+#endif
+
+/** Copy n elements of memory from src to dst. The 0* term provides
+    compile-time type checking */
+#if !defined(OVERRIDE_OD_COPY)
+#define OD_COPY(dst, src, n) \
+  (memcpy((dst), (src), sizeof(*(dst)) * (n) + 0 * ((dst) - (src))))
+#endif
+
+/** Copy n elements of memory from src to dst, allowing overlapping regions.
+    The 0* term provides compile-time type checking */
+#if !defined(OVERRIDE_OD_MOVE)
+# define OD_MOVE(dst, src, n) \
+  (memmove((dst), (src), sizeof(*(dst))*(n) + 0*((dst) - (src)) ))
+#endif
+
+/*All of these macros should expect floats as arguments.*/
+# define OD_SIGNMASK(a) (-((a) < 0))
+# define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_ODINTRIN_H_
diff --git a/libs/libaom/src/av1/common/ppc/cfl_ppc.c b/libs/libaom/src/av1/common/ppc/cfl_ppc.c
new file mode 100644
index 000000000..6f88768f2
--- /dev/null
+++ b/libs/libaom/src/av1/common/ppc/cfl_ppc.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <altivec.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#define OFF_0 0
+#define OFF_1 16
+#define OFF_2 32
+#define OFF_3 48
+#define CFL_BUF_LINE_BYTES 64
+#define CFL_LINE_1 64
+#define CFL_LINE_2 128
+#define CFL_LINE_3 192
+
+typedef vector signed char int8x16_t;          // NOLINT(runtime/int)
+typedef vector unsigned char uint8x16_t;       // NOLINT(runtime/int)
+typedef vector signed short int16x8_t;         // NOLINT(runtime/int)
+typedef vector unsigned short uint16x8_t;      // NOLINT(runtime/int)
+typedef vector signed int int32x4_t;           // NOLINT(runtime/int)
+typedef vector unsigned int uint32x4_t;        // NOLINT(runtime/int)
+typedef vector unsigned long long uint64x2_t;  // NOLINT(runtime/int)
+
+static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
+                                        int width, int height, int round_offset,
+                                        int num_pel_log2) {
+  //  int16_t *dst = dst_ptr;
+  const int16_t *dst_end = dst + height * CFL_BUF_LINE;
+  const int16_t *sum_buf = (const int16_t *)src_ptr;
+  const int16_t *end = sum_buf + height * CFL_BUF_LINE;
+  const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
+  const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+                               0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
+  const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+                               0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B };
+
+  int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset };
+  int32x4_t sum_32x4_1 = { 0, 0, 0, 0 };
+  do {
+    sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0);
+    sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    if (width >= 16) {
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    }
+    if (width == 32) {
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1);
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1); + } + } while ((sum_buf += (CFL_BUF_LINE * 2)) < end); + int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1); + + const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64); + sum_32x4 = vec_add(sum_32x4, perm_64); + const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32); + sum_32x4 = vec_add(sum_32x4, perm_32); + const int32x4_t avg = vec_sr(sum_32x4, div_shift); + const int16x8_t vec_avg = vec_pack(avg, avg); + do { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg), + OFF_0 + CFL_BUF_LINE_BYTES, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg), + OFF_0 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg), + OFF_0 + CFL_LINE_3, dst); + if (width >= 16) { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg), + OFF_1 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg), + OFF_1 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg), + OFF_1 + CFL_LINE_3, dst); + } + if (width == 32) { + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg), + OFF_2 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg), + OFF_2 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg), + OFF_2 + CFL_LINE_3, dst); + + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg), + OFF_3 + CFL_LINE_1, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg), + OFF_3 + CFL_LINE_2, dst); + vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg), + OFF_3 + CFL_LINE_3, dst); + } + } while ((dst += CFL_BUF_LINE * 4) < dst_end); +} + +// Declare wrappers for VSX sizes +CFL_SUB_AVG_X(vsx, 8, 4, 16, 5) +CFL_SUB_AVG_X(vsx, 8, 8, 32, 6) +CFL_SUB_AVG_X(vsx, 8, 16, 64, 7) +CFL_SUB_AVG_X(vsx, 8, 32, 128, 8) +CFL_SUB_AVG_X(vsx, 16, 4, 32, 6) +CFL_SUB_AVG_X(vsx, 16, 8, 64, 7) +CFL_SUB_AVG_X(vsx, 16, 16, 128, 8) +CFL_SUB_AVG_X(vsx, 16, 32, 256, 9) +CFL_SUB_AVG_X(vsx, 32, 8, 128, 8) +CFL_SUB_AVG_X(vsx, 32, 16, 256, 9) +CFL_SUB_AVG_X(vsx, 32, 32, 512, 10) + +// Based on observation, for small blocks VSX does not outperform C (no 64bit +// load and store intrinsics). So we call the C code for block widths 4. 
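+// A hedged usage sketch (the call-site shape is assumed, not taken from this
+// file): callers fetch the per-transform-size implementation once, e.g.
+//   cfl_subtract_average_fn fn = cfl_get_subtract_average_fn_vsx(TX_8X8);
+// and then invoke fn on the CfL buffers; entries left NULL in the table
+// below correspond to block sizes CfL never uses.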
+cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) { + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { + cfl_subtract_average_4x4_c, /* 4x4 */ + cfl_subtract_average_8x8_vsx, /* 8x8 */ + cfl_subtract_average_16x16_vsx, /* 16x16 */ + cfl_subtract_average_32x32_vsx, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_c, /* 4x8 */ + cfl_subtract_average_8x4_vsx, /* 8x4 */ + cfl_subtract_average_8x16_vsx, /* 8x16 */ + cfl_subtract_average_16x8_vsx, /* 16x8 */ + cfl_subtract_average_16x32_vsx, /* 16x32 */ + cfl_subtract_average_32x16_vsx, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_c, /* 4x16 */ + cfl_subtract_average_16x4_vsx, /* 16x4 */ + cfl_subtract_average_8x32_vsx, /* 8x32 */ + cfl_subtract_average_32x8_vsx, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to + // index the function pointer array out of bounds. + return sub_avg[tx_size % TX_SIZES_ALL]; +} diff --git a/libs/libaom/src/av1/common/pred_common.c b/libs/libaom/src/av1/common/pred_common.c new file mode 100644 index 000000000..5952441d1 --- /dev/null +++ b/libs/libaom/src/av1/common/pred_common.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/common.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/seg_common.h" + +// Returns a context number for the given MB prediction signal +static InterpFilter get_ref_filter_type(const MB_MODE_INFO *ref_mbmi, + const MACROBLOCKD *xd, int dir, + MV_REFERENCE_FRAME ref_frame) { + (void)xd; + + return ((ref_mbmi->ref_frame[0] == ref_frame || + ref_mbmi->ref_frame[1] == ref_frame) + ? av1_extract_interp_filter(ref_mbmi->interp_filters, dir & 0x01) + : SWITCHABLE_FILTERS); +} + +int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int ctx_offset = + (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET; + assert(dir == 0 || dir == 1); + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. 
+ int filter_type_ctx = ctx_offset + (dir & 0x01) * INTER_FILTER_DIR_OFFSET; + int left_type = SWITCHABLE_FILTERS; + int above_type = SWITCHABLE_FILTERS; + + if (xd->left_available) + left_type = get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame); + + if (xd->up_available) + above_type = + get_ref_filter_type(xd->mi[-xd->mi_stride], xd, dir, ref_frame); + + if (left_type == above_type) { + filter_type_ctx += left_type; + } else if (left_type == SWITCHABLE_FILTERS) { + assert(above_type != SWITCHABLE_FILTERS); + filter_type_ctx += above_type; + } else if (above_type == SWITCHABLE_FILTERS) { + assert(left_type != SWITCHABLE_FILTERS); + filter_type_ctx += left_type; + } else { + filter_type_ctx += SWITCHABLE_FILTERS; + } + + return filter_type_ctx; +} + +static void palette_add_to_cache(uint16_t *cache, int *n, uint16_t val) { + // Do not add an already existing value + if (*n > 0 && val == cache[*n - 1]) return; + + cache[(*n)++] = val; +} + +int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, + uint16_t *cache) { + const int row = -xd->mb_to_top_edge >> 3; + // Do not refer to above SB row when on SB boundary. + const MB_MODE_INFO *const above_mi = + (row % (1 << MIN_SB_SIZE_LOG2)) ? xd->above_mbmi : NULL; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + int above_n = 0, left_n = 0; + if (above_mi) above_n = above_mi->palette_mode_info.palette_size[plane != 0]; + if (left_mi) left_n = left_mi->palette_mode_info.palette_size[plane != 0]; + if (above_n == 0 && left_n == 0) return 0; + int above_idx = plane * PALETTE_MAX_SIZE; + int left_idx = plane * PALETTE_MAX_SIZE; + int n = 0; + const uint16_t *above_colors = + above_mi ? above_mi->palette_mode_info.palette_colors : NULL; + const uint16_t *left_colors = + left_mi ? left_mi->palette_mode_info.palette_colors : NULL; + // Merge the sorted lists of base colors from above and left to get + // combined sorted color cache. + while (above_n > 0 && left_n > 0) { + uint16_t v_above = above_colors[above_idx]; + uint16_t v_left = left_colors[left_idx]; + if (v_left < v_above) { + palette_add_to_cache(cache, &n, v_left); + ++left_idx, --left_n; + } else { + palette_add_to_cache(cache, &n, v_above); + ++above_idx, --above_n; + if (v_left == v_above) ++left_idx, --left_n; + } + } + while (above_n-- > 0) { + uint16_t val = above_colors[above_idx++]; + palette_add_to_cache(cache, &n, val); + } + while (left_n-- > 0) { + uint16_t val = left_colors[left_idx++]; + palette_add_to_cache(cache, &n, val); + } + assert(n <= 2 * PALETTE_MAX_SIZE); + return n; +} + +// The mode info data structure has a one element border above and to the +// left of the entries corresponding to real macroblocks. +// The prediction flags in these dummy entries are initialized to 0. +// 0 - inter/inter, inter/--, --/inter, --/-- +// 1 - intra/inter, inter/intra +// 2 - intra/--, --/intra +// 3 - intra/intra +int av1_get_intra_inter_context(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; + + if (has_above && has_left) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + return left_intra && above_intra ? 3 : left_intra || above_intra; + } else if (has_above || has_left) { // one edge available + return 2 * !is_inter_block(has_above ? 
above_mbmi : left_mbmi); + } else { + return 0; + } +} + +#define CHECK_BACKWARD_REFS(ref_frame) \ + (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME)) +#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame) + +int av1_get_reference_mode_context(const MACROBLOCKD *xd) { + int ctx; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int has_above = xd->up_available; + const int has_left = xd->left_available; + + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + if (has_above && has_left) { // both edges available + if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi)) + // neither edge uses comp pred (0/1) + ctx = IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ^ + IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]); + else if (!has_second_ref(above_mbmi)) + // one of two edges uses comp pred (2/3) + ctx = 2 + (IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) || + !is_inter_block(above_mbmi)); + else if (!has_second_ref(left_mbmi)) + // one of two edges uses comp pred (2/3) + ctx = 2 + (IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]) || + !is_inter_block(left_mbmi)); + else // both edges use comp pred (4) + ctx = 4; + } else if (has_above || has_left) { // one edge available + const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi; + + if (!has_second_ref(edge_mbmi)) + // edge does not use comp pred (0/1) + ctx = IS_BACKWARD_REF_FRAME(edge_mbmi->ref_frame[0]); + else + // edge uses comp pred (3) + ctx = 3; + } else { // no edges available (1) + ctx = 1; + } + assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS); + return ctx; +} + +int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) { + int pred_context; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int above_in_image = xd->up_available; + const int left_in_image = xd->left_available; + + if (above_in_image && left_in_image) { // both edges available + const int above_intra = !is_inter_block(above_mbmi); + const int left_intra = !is_inter_block(left_mbmi); + + if (above_intra && left_intra) { // intra/intra + pred_context = 2; + } else if (above_intra || left_intra) { // intra/inter + const MB_MODE_INFO *inter_mbmi = above_intra ? left_mbmi : above_mbmi; + + if (!has_second_ref(inter_mbmi)) // single pred + pred_context = 2; + else // comp pred + pred_context = 1 + 2 * has_uni_comp_refs(inter_mbmi); + } else { // inter/inter + const int a_sg = !has_second_ref(above_mbmi); + const int l_sg = !has_second_ref(left_mbmi); + const MV_REFERENCE_FRAME frfa = above_mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME frfl = left_mbmi->ref_frame[0]; + + if (a_sg && l_sg) { // single/single + pred_context = 1 + 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^ + IS_BACKWARD_REF_FRAME(frfl))); + } else if (l_sg || a_sg) { // single/comp + const int uni_rfc = + a_sg ? 
has_uni_comp_refs(left_mbmi) : has_uni_comp_refs(above_mbmi); + + if (!uni_rfc) // comp bidir + pred_context = 1; + else // comp unidir + pred_context = 3 + (!(IS_BACKWARD_REF_FRAME(frfa) ^ + IS_BACKWARD_REF_FRAME(frfl))); + } else { // comp/comp + const int a_uni_rfc = has_uni_comp_refs(above_mbmi); + const int l_uni_rfc = has_uni_comp_refs(left_mbmi); + + if (!a_uni_rfc && !l_uni_rfc) // bidir/bidir + pred_context = 0; + else if (!a_uni_rfc || !l_uni_rfc) // unidir/bidir + pred_context = 2; + else // unidir/unidir + pred_context = + 3 + (!((frfa == BWDREF_FRAME) ^ (frfl == BWDREF_FRAME))); + } + } + } else if (above_in_image || left_in_image) { // one edge available + const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + + if (!is_inter_block(edge_mbmi)) { // intra + pred_context = 2; + } else { // inter + if (!has_second_ref(edge_mbmi)) // single pred + pred_context = 2; + else // comp pred + pred_context = 4 * has_uni_comp_refs(edge_mbmi); + } + } else { // no edges available + pred_context = 2; + } + + assert(pred_context >= 0 && pred_context < COMP_REF_TYPE_CONTEXTS); + return pred_context; +} + +// Returns a context number for the given MB prediction signal +// +// Signal the uni-directional compound reference frame pair as either +// (BWDREF, ALTREF), or (LAST, LAST2) / (LAST, LAST3) / (LAST, GOLDEN), +// conditioning on the pair is known as uni-directional. +// +// 3 contexts: Voting is used to compare the count of forward references with +// that of backward references from the spatial neighbors. +int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of forward references (L, L2, L3, or G) + const int frf_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] + + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + // Count of backward references (B or A) + const int brf_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] + + ref_counts[ALTREF_FRAME]; + + const int pred_context = + (frf_count == brf_count) ? 1 : ((frf_count < brf_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); + return pred_context; +} + +// Returns a context number for the given MB prediction signal +// +// Signal the uni-directional compound reference frame pair as +// either (LAST, LAST2), or (LAST, LAST3) / (LAST, GOLDEN), +// conditioning on the pair is known as one of the above three. +// +// 3 contexts: Voting is used to compare the count of LAST2_FRAME with the +// total count of LAST3/GOLDEN from the spatial neighbors. +int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST2 + const int last2_count = ref_counts[LAST2_FRAME]; + // Count of LAST3 or GOLDEN + const int last3_or_gld_count = + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + + const int pred_context = (last2_count == last3_or_gld_count) + ? 1 + : ((last2_count < last3_or_gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); + return pred_context; +} + +// Returns a context number for the given MB prediction signal +// +// Signal the uni-directional compound reference frame pair as +// either (LAST, LAST3) or (LAST, GOLDEN), +// conditioning on the pair is known as one of the above two. +// +// 3 contexts: Voting is used to compare the count of LAST3_FRAME with the +// total count of GOLDEN_FRAME from the spatial neighbors. 
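+// (As in the other voting contexts in this file, the mapping is: 0 when the
+// first option is the rarer one among the neighbors, 1 on a tie, and 2 when
+// it is the more common one.)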
+int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST3 + const int last3_count = ref_counts[LAST3_FRAME]; + // Count of GOLDEN + const int gld_count = ref_counts[GOLDEN_FRAME]; + + const int pred_context = + (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS); + return pred_context; +} + +// == Common context functions for both comp and single ref == +// +// Obtain contexts to signal a reference frame to be either LAST/LAST2 or +// LAST3/GOLDEN. +static int get_pred_context_ll2_or_l3gld(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST + LAST2 + const int last_last2_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME]; + // Count of LAST3 + GOLDEN + const int last3_gld_count = + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + + const int pred_context = (last_last2_count == last3_gld_count) + ? 1 + : ((last_last2_count < last3_gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame to be either LAST or LAST2. +static int get_pred_context_last_or_last2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST + const int last_count = ref_counts[LAST_FRAME]; + // Count of LAST2 + const int last2_count = ref_counts[LAST2_FRAME]; + + const int pred_context = + (last_count == last2_count) ? 1 : ((last_count < last2_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame to be either LAST3 or GOLDEN. +static int get_pred_context_last3_or_gld(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of LAST3 + const int last3_count = ref_counts[LAST3_FRAME]; + // Count of GOLDEN + const int gld_count = ref_counts[GOLDEN_FRAME]; + + const int pred_context = + (last3_count == gld_count) ? 1 : ((last3_count < gld_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame be either BWDREF/ALTREF2, or +// ALTREF. +static int get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Counts of BWDREF, ALTREF2, or ALTREF frames (B, A2, or A) + const int brfarf2_count = + ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME]; + const int arf_count = ref_counts[ALTREF_FRAME]; + + const int pred_context = + (brfarf2_count == arf_count) ? 1 : ((brfarf2_count < arf_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// Obtain contexts to signal a reference frame be either BWDREF or ALTREF2. +static int get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of BWDREF frames (B) + const int brf_count = ref_counts[BWDREF_FRAME]; + // Count of ALTREF2 frames (A2) + const int arf2_count = ref_counts[ALTREF2_FRAME]; + + const int pred_context = + (brf_count == arf2_count) ? 1 : ((brf_count < arf2_count) ? 
0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// == Context functions for comp ref == +// +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be either +// GOLDEN/LAST3, or LAST/LAST2. +int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd) { + return get_pred_context_ll2_or_l3gld(xd); +} + +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be LAST, +// conditioning on that it is known either LAST/LAST2. +int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd) { + return get_pred_context_last_or_last2(xd); +} + +// Returns a context number for the given MB prediction signal +// Signal the first reference frame for a compound mode be GOLDEN, +// conditioning on that it is known either GOLDEN or LAST3. +int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd) { + return get_pred_context_last3_or_gld(xd); +} + +// Signal the 2nd reference frame for a compound mode be either +// ALTREF, or ALTREF2/BWDREF. +int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd) { + return get_pred_context_brfarf2_or_arf(xd); +} + +// Signal the 2nd reference frame for a compound mode be either +// ALTREF2 or BWDREF. +int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd) { + return get_pred_context_brf_or_arf2(xd); +} + +// == Context functions for single ref == +// +// For the bit to signal whether the single reference is a forward reference +// frame or a backward reference frame. +int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) { + const uint8_t *const ref_counts = &xd->neighbors_ref_counts[0]; + + // Count of forward reference frames + const int fwd_count = ref_counts[LAST_FRAME] + ref_counts[LAST2_FRAME] + + ref_counts[LAST3_FRAME] + ref_counts[GOLDEN_FRAME]; + // Count of backward reference frames + const int bwd_count = ref_counts[BWDREF_FRAME] + ref_counts[ALTREF2_FRAME] + + ref_counts[ALTREF_FRAME]; + + const int pred_context = + (fwd_count == bwd_count) ? 1 : ((fwd_count < bwd_count) ? 0 : 2); + + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + return pred_context; +} + +// For the bit to signal whether the single reference is ALTREF_FRAME or +// non-ALTREF backward reference frame, knowing that it shall be either of +// these 2 choices. +int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { + return get_pred_context_brfarf2_or_arf(xd); +} + +// For the bit to signal whether the single reference is LAST3/GOLDEN or +// LAST2/LAST, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) { + return get_pred_context_ll2_or_l3gld(xd); +} + +// For the bit to signal whether the single reference is LAST2_FRAME or +// LAST_FRAME, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) { + return get_pred_context_last_or_last2(xd); +} + +// For the bit to signal whether the single reference is GOLDEN_FRAME or +// LAST3_FRAME, knowing that it shall be either of these 2 choices. +int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) { + return get_pred_context_last3_or_gld(xd); +} + +// For the bit to signal whether the single reference is ALTREF2_FRAME or +// BWDREF_FRAME, knowing that it shall be either of these 2 choices. 
+int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd) {
+  return get_pred_context_brf_or_arf2(xd);
+}
diff --git a/libs/libaom/src/av1/common/pred_common.h b/libs/libaom/src/av1/common/pred_common.h
new file mode 100644
index 000000000..d1dab97e7
--- /dev/null
+++ b/libs/libaom/src/av1/common/pred_common.h
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_PRED_COMMON_H_
+#define AOM_AV1_COMMON_PRED_COMMON_H_
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE int get_segment_id(const CommonModeInfoParams *const mi_params,
+                                 const uint8_t *segment_ids, BLOCK_SIZE bsize,
+                                 int mi_row, int mi_col) {
+  const int mi_offset = mi_row * mi_params->mi_cols + mi_col;
+  const int bw = mi_size_wide[bsize];
+  const int bh = mi_size_high[bsize];
+  const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw);
+  const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh);
+  int segment_id = MAX_SEGMENTS;
+
+  for (int y = 0; y < ymis; ++y) {
+    for (int x = 0; x < xmis; ++x) {
+      segment_id = AOMMIN(segment_id,
+                          segment_ids[mi_offset + y * mi_params->mi_cols + x]);
+    }
+  }
+
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+  return segment_id;
+}
+
+static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
+                                           const MACROBLOCKD *const xd,
+                                           int *cdf_index) {
+  int prev_ul = -1;  // top left segment_id
+  int prev_l = -1;   // left segment_id
+  int prev_u = -1;   // top segment_id
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const uint8_t *seg_map = cm->cur_frame->seg_map;
+  if ((xd->up_available) && (xd->left_available)) {
+    prev_ul =
+        get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 1, mi_col - 1);
+  }
+  if (xd->up_available) {
+    prev_u =
+        get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 1, mi_col - 0);
+  }
+  if (xd->left_available) {
+    prev_l =
+        get_segment_id(mi_params, seg_map, BLOCK_4X4, mi_row - 0, mi_col - 1);
+  }
+  // This property follows from the fact that get_segment_id() returns a
+  // nonnegative value. This allows us to test for all edge cases with a
+  // simple prev_ul < 0 check.
+  assert(IMPLIES(prev_ul >= 0, prev_u >= 0 && prev_l >= 0));
+
+  // Pick CDF index based on number of matching/out-of-bounds segment IDs.
+  if (prev_ul < 0) /* Edge cases */
+    *cdf_index = 0;
+  else if ((prev_ul == prev_u) && (prev_ul == prev_l))
+    *cdf_index = 2;
+  else if ((prev_ul == prev_u) || (prev_ul == prev_l) || (prev_u == prev_l))
+    *cdf_index = 1;
+  else
+    *cdf_index = 0;
+
+  // If two or more of the neighboring segment IDs are identical, return that
+  // value as the predictor; otherwise return prev_l.
+  if (prev_u == -1)  // edge case
+    return prev_l == -1 ? 0 : prev_l;
+  if (prev_l == -1)  // edge case
+    return prev_u;
+  return (prev_ul == prev_u) ? prev_u : prev_l;
+}
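+
+// Hedged worked example of av1_get_spatial_seg_pred() (segment IDs invented
+// for illustration): if the top-left, top, and left neighbors carry segment
+// IDs 3, 3, and 1, exactly one pair matches, so *cdf_index is 1 and the
+// prediction is 3 (prev_ul == prev_u); with IDs 3, 2, and 1 no pair matches,
+// so *cdf_index is 0 and prev_l (= 1) is returned.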
+
+static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  const int above_sip = (above_mi != NULL) ? above_mi->seg_id_predicted : 0;
+  const int left_sip = (left_mi != NULL) ? left_mi->seg_id_predicted : 0;
+
+  return above_sip + left_sip;
+}
+
+static INLINE int get_comp_index_context(const AV1_COMMON *cm,
+                                         const MACROBLOCKD *xd) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
+  const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
+  int bck_frame_index = 0, fwd_frame_index = 0;
+  int cur_frame_index = cm->cur_frame->order_hint;
+
+  if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
+  if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
+
+  int fwd = abs(get_relative_dist(&cm->seq_params.order_hint_info,
+                                  fwd_frame_index, cur_frame_index));
+  int bck = abs(get_relative_dist(&cm->seq_params.order_hint_info,
+                                  cur_frame_index, bck_frame_index));
+
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+
+  int above_ctx = 0, left_ctx = 0;
+  const int offset = (fwd == bck);
+
+  if (above_mi != NULL) {
+    if (has_second_ref(above_mi))
+      above_ctx = above_mi->compound_idx;
+    else if (above_mi->ref_frame[0] == ALTREF_FRAME)
+      above_ctx = 1;
+  }
+
+  if (left_mi != NULL) {
+    if (has_second_ref(left_mi))
+      left_ctx = left_mi->compound_idx;
+    else if (left_mi->ref_frame[0] == ALTREF_FRAME)
+      left_ctx = 1;
+  }
+
+  return above_ctx + left_ctx + 3 * offset;
+}
+
+static INLINE int get_comp_group_idx_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  int above_ctx = 0, left_ctx = 0;
+
+  if (above_mi) {
+    if (has_second_ref(above_mi))
+      above_ctx = above_mi->comp_group_idx;
+    else if (above_mi->ref_frame[0] == ALTREF_FRAME)
+      above_ctx = 3;
+  }
+  if (left_mi) {
+    if (has_second_ref(left_mi))
+      left_ctx = left_mi->comp_group_idx;
+    else if (left_mi->ref_frame[0] == ALTREF_FRAME)
+      left_ctx = 3;
+  }
+
+  return AOMMIN(5, above_ctx + left_ctx);
+}
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_seg_id(
+    struct segmentation_probs *segp, const MACROBLOCKD *xd) {
+  return segp->pred_cdf[av1_get_pred_context_seg_id(xd)];
+}
+
+static INLINE int av1_get_skip_mode_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  const int above_skip_mode = above_mi ? above_mi->skip_mode : 0;
+  const int left_skip_mode = left_mi ? left_mi->skip_mode : 0;
+  return above_skip_mode + left_skip_mode;
+}
+
+static INLINE int av1_get_skip_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  const int above_skip = above_mi ? above_mi->skip : 0;
+  const int left_skip = left_mi ? left_mi->skip : 0;
+  return above_skip + left_skip;
+}
+
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir);
+
+// Get a list of palette base colors that are used in the above and left
+// blocks, referred to as "color cache". The return value is the number of
+// colors in the cache (<= 2 * PALETTE_MAX_SIZE). The color values are stored
+// in "cache" in ascending order.
+int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane, + uint16_t *cache); + +static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + return num_pels_log2_lookup[bsize] - num_pels_log2_lookup[BLOCK_8X8]; +} + +static INLINE int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + int ctx = 0; + if (above_mi) ctx += (above_mi->palette_mode_info.palette_size[0] > 0); + if (left_mi) ctx += (left_mi->palette_mode_info.palette_size[0] > 0); + return ctx; +} + +int av1_get_intra_inter_context(const MACROBLOCKD *xd); + +int av1_get_reference_mode_context(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) { + return xd->tile_ctx->comp_inter_cdf[av1_get_reference_mode_context(xd)]; +} + +static INLINE aom_cdf_prob *av1_get_skip_cdf(const MACROBLOCKD *xd) { + return xd->tile_ctx->skip_cdfs[av1_get_skip_context(xd)]; +} + +int av1_get_comp_reference_type_context(const MACROBLOCKD *xd); + +// == Uni-directional contexts == + +int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd); + +int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd); + +int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_comp_reference_type_cdf( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_comp_reference_type_context(xd); + return xd->tile_ctx->comp_ref_type_cdf[pred_context]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_uni_comp_ref_p(xd); + return xd->tile_ctx->uni_comp_ref_cdf[pred_context][0]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p1( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_uni_comp_ref_p1(xd); + return xd->tile_ctx->uni_comp_ref_cdf[pred_context][1]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p2( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_uni_comp_ref_p2(xd); + return xd->tile_ctx->uni_comp_ref_cdf[pred_context][2]; +} + +// == Bi-directional contexts == + +int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd); + +int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd); + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][0]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p1(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][1]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p2( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_ref_p2(xd); + return xd->tile_ctx->comp_ref_cdf[pred_context][2]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p( + const MACROBLOCKD *xd) { + const int pred_context = av1_get_pred_context_comp_bwdref_p(xd); + return xd->tile_ctx->comp_bwdref_cdf[pred_context][0]; +} + +static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p1( + const MACROBLOCKD *xd) { + const int pred_context = 
av1_get_pred_context_comp_bwdref_p1(xd);
+  return xd->tile_ctx->comp_bwdref_cdf[pred_context][1];
+}
+
+// == Single contexts ==
+
+int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd);
+
+int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd);
+
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p1(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->single_ref_cdf[av1_get_pred_context_single_ref_p1(xd)][0];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p2(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->single_ref_cdf[av1_get_pred_context_single_ref_p2(xd)][1];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p3(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->single_ref_cdf[av1_get_pred_context_single_ref_p3(xd)][2];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p4(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->single_ref_cdf[av1_get_pred_context_single_ref_p4(xd)][3];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p5(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->single_ref_cdf[av1_get_pred_context_single_ref_p5(xd)][4];
+}
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p6(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->single_ref_cdf[av1_get_pred_context_single_ref_p6(xd)][5];
+}
+
+// Returns a context number for the given MB prediction signal.
+// The mode info data structure has a one-element border above and to the
+// left of the entries corresponding to real blocks.
+// The prediction flags in these dummy entries are initialized to 0.
+static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const TX_SIZE max_tx_size = max_txsize_rect_lookup[mbmi->sb_type];
+  const int max_tx_wide = tx_size_wide[max_tx_size];
+  const int max_tx_high = tx_size_high[max_tx_size];
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  int above = xd->above_txfm_context[0] >= max_tx_wide;
+  int left = xd->left_txfm_context[0] >= max_tx_high;
+
+  if (has_above)
+    if (is_inter_block(above_mbmi))
+      above = block_size_wide[above_mbmi->sb_type] >= max_tx_wide;
+
+  if (has_left)
+    if (is_inter_block(left_mbmi))
+      left = block_size_high[left_mbmi->sb_type] >= max_tx_high;
+
+  if (has_above && has_left)
+    return (above + left);
+  else if (has_above)
+    return above;
+  else if (has_left)
+    return left;
+  else
+    return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_PRED_COMMON_H_
diff --git a/libs/libaom/src/av1/common/quant_common.c b/libs/libaom/src/av1/common/quant_common.c
new file mode 100644
index 000000000..e96d71a3b
--- /dev/null
+++ b/libs/libaom/src/av1/common/quant_common.c
@@ -0,0 +1,12875 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/quant_common.h" +#include "av1/common/seg_common.h" + +static const int16_t dc_qlookup_QTX[QINDEX_RANGE] = { + 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, + 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, + 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, + 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53, + 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, + 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76, + 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88, + 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110, + 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134, + 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164, + 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, + 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300, + 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, 349, 354, 359, 364, + 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441, + 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549, + 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, + 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, + 1184, 1232, 1282, 1336, +}; + +static const int16_t dc_qlookup_10_QTX[QINDEX_RANGE] = { + 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, + 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, + 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, + 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182, + 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230, + 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 276, + 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321, + 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387, + 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466, + 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, + 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, + 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831, + 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001, + 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202, + 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436, + 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717, + 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088, + 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675, + 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823, + 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, +}; + +static const int16_t dc_qlookup_12_QTX[QINDEX_RANGE] = { + 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, + 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, + 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405, + 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580, + 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752, + 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919, + 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 
1065, 1080, + 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, + 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419, + 1444, 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692, + 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957, + 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334, + 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746, + 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226, + 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788, + 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, + 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153, + 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984, + 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966, + 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214, + 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031, + 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118, + 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949, + 19718, 20521, 21387, +}; + +static const int16_t ac_qlookup_QTX[QINDEX_RANGE] = { + 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, + 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, + 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, + 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, + 146, 148, 150, 152, 155, 158, 161, 164, 167, 170, 173, 176, 179, + 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, + 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280, + 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353, + 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448, + 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571, + 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729, + 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933, + 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196, + 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537, + 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, +}; + +static const int16_t ac_qlookup_10_QTX[QINDEX_RANGE] = { + 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, + 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92, + 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, + 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208, + 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267, + 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324, + 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379, + 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466, + 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571, + 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, + 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, + 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118, + 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411, + 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791, + 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283, + 
2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915, + 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731, + 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784, + 4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148, + 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, +}; + +static const int16_t ac_qlookup_12_QTX[QINDEX_RANGE] = { + 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, + 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, + 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456, + 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660, + 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865, + 884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067, + 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264, + 1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457, + 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693, + 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052, + 2085, 2118, 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411, + 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943, + 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555, + 3619, 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, 4230, 4310, + 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256, + 5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, + 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867, + 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660, + 9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885, + 12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637, + 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062, + 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334, + 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599, + 28143, 28687, 29247, +}; + +// Coefficient scaling and quantization with AV1 TX are tailored to +// the AV1 TX transforms. Regardless of the bit-depth of the input, +// the transform stages scale the coefficient values up by a factor of +// 8 (3 bits) over the scale of the pixel values. Thus, for 8-bit +// input, the coefficients have effectively 11 bits of scale depth +// (8+3), 10-bit input pixels result in 13-bit coefficient depth +// (10+3) and 12-bit pixels yield 15-bit (12+3) coefficient depth. +// All quantizers are built using this invariant of x8, 3-bit scaling, +// thus the Q3 suffix. + +// A partial exception to this rule is large transforms; to avoid +// overflow, TX blocks with > 256 pels (>16x16) are scaled only +// 4-times unity (2 bits) over the pixel depth, and TX blocks with +// over 1024 pixels (>32x32) are scaled up only 2x unity (1 bit). +// This descaling is found via av1_tx_get_scale(). Thus, 16x32, 32x16 +// and 32x32 transforms actually return Q2 coefficients, and 32x64, +// 64x32 and 64x64 transforms return Q1 coefficients. However, the +// quantizers are de-scaled down on-the-fly by the same amount +// (av1_tx_get_scale()) during quantization, and as such the +// dequantized/decoded coefficients, even for large TX blocks, are always +// effectively Q3. Meanwhile, quantized/coded coefficients are Q0 +// because Qn quantizers are applied to Qn tx coefficients. 
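+
+// A minimal sketch of the descaling rule described above. Assumption: this
+// stand-alone helper is illustrative only and is not part of libaom (the
+// comment above names av1_tx_get_scale(), which lives elsewhere in the
+// codebase). Given the pixel count of a TX block, it returns the number of
+// bits by which coefficients fall short of the Q3 invariant:
+//   pels <= 256  -> 0 (Q3: 16x16 and smaller)
+//   pels <= 1024 -> 1 (Q2: e.g. 16x32, 32x16, 32x32)
+//   pels >  1024 -> 2 (Q1: e.g. 32x64, 64x32, 64x64)
+static int illustrative_tx_scale(int pels) {
+  return (pels > 256) + (pels > 1024);
+}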
+
+// Note that encoder decision making (which uses the quantizer to
+// generate several bespoke lambdas for RDO and other heuristics)
+// expects quantizers to be larger for higher-bitdepth input. In
+// addition, the minimum allowable quantizer is 4; smaller values will
+// underflow to 0 in the actual quantization routines.
+
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  const int q_clamped = clamp(qindex + delta, 0, MAXQ);
+  switch (bit_depth) {
+    case AOM_BITS_8: return dc_qlookup_QTX[q_clamped];
+    case AOM_BITS_10: return dc_qlookup_10_QTX[q_clamped];
+    case AOM_BITS_12: return dc_qlookup_12_QTX[q_clamped];
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
+}
+
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  const int q_clamped = clamp(qindex + delta, 0, MAXQ);
+  switch (bit_depth) {
+    case AOM_BITS_8: return ac_qlookup_QTX[q_clamped];
+    case AOM_BITS_10: return ac_qlookup_10_QTX[q_clamped];
+    case AOM_BITS_12: return ac_qlookup_12_QTX[q_clamped];
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
+}
+
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex) {
+  if (segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) {
+    const int data = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+    const int seg_qindex = base_qindex + data;
+    return clamp(seg_qindex, 0, MAXQ);
+  } else {
+    return base_qindex;
+  }
+}
+
+bool av1_use_qmatrix(const CommonQuantParams *quant_params,
+                     const struct macroblockd *xd, int segment_id) {
+  // True if explicit Q matrix levels are in use and this is not a lossless
+  // segment.
+  return quant_params->using_qmatrix && !xd->lossless[segment_id];
+}
+
+const qm_val_t *av1_iqmatrix(const CommonQuantParams *quant_params, int qmlevel,
+                             int plane, TX_SIZE tx_size) {
+  assert(quant_params->giqmatrix[qmlevel][plane][tx_size] != NULL ||
+         qmlevel == NUM_QM_LEVELS - 1);
+  return quant_params->giqmatrix[qmlevel][plane][tx_size];
+}
+const qm_val_t *av1_qmatrix(const CommonQuantParams *quant_params, int qmlevel,
+                            int plane, TX_SIZE tx_size) {
+  assert(quant_params->gqmatrix[qmlevel][plane][tx_size] != NULL ||
+         qmlevel == NUM_QM_LEVELS - 1);
+  return quant_params->gqmatrix[qmlevel][plane][tx_size];
+}
+
+// Returns true if the tx_type corresponds to a non-identity transform in both
+// horizontal and vertical directions.
+static INLINE bool is_2d_transform(TX_TYPE tx_type) { return (tx_type < IDTX); }
+
+const qm_val_t *av1_get_iqmatrix(const CommonQuantParams *quant_params,
+                                 const MACROBLOCKD *xd, int plane,
+                                 TX_SIZE tx_size, TX_TYPE tx_type) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int seg_id = mbmi->segment_id;
+  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+  // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
+  return is_2d_transform(tx_type)
+             ? pd->seg_iqmatrix[seg_id][qm_tx_size]
+             : quant_params->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+}
+
+const qm_val_t *av1_get_qmatrix(const CommonQuantParams *quant_params,
+                                const MACROBLOCKD *xd, int plane,
+                                TX_SIZE tx_size, TX_TYPE tx_type) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int seg_id = mbmi->segment_id;
+  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+  // Use a flat matrix (i.e.
no weighting) for 1D and Identity transforms + return is_2d_transform(tx_type) + ? pd->seg_qmatrix[seg_id][qm_tx_size] + : quant_params->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; +} + +#define QM_TOTAL_SIZE 3344 +// We only use wt_matrix_ref[q] and iwt_matrix_ref[q] +// for q = 0, ..., NUM_QM_LEVELS - 2. +static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; + +void av1_qm_init(CommonQuantParams *quant_params, int num_planes) { + for (int q = 0; q < NUM_QM_LEVELS; ++q) { + for (int c = 0; c < num_planes; ++c) { + int current = 0; + for (int t = 0; t < TX_SIZES_ALL; ++t) { + const int size = tx_size_2d[t]; + const int qm_tx_size = av1_get_adjusted_tx_size(t); + if (q == NUM_QM_LEVELS - 1) { + quant_params->gqmatrix[q][c][t] = NULL; + quant_params->giqmatrix[q][c][t] = NULL; + } else if (t != qm_tx_size) { // Reuse matrices for 'qm_tx_size' + assert(t > qm_tx_size); + quant_params->gqmatrix[q][c][t] = + quant_params->gqmatrix[q][c][qm_tx_size]; + quant_params->giqmatrix[q][c][t] = + quant_params->giqmatrix[q][c][qm_tx_size]; + } else { + assert(current + size <= QM_TOTAL_SIZE); + quant_params->gqmatrix[q][c][t] = &wt_matrix_ref[q][c >= 1][current]; + quant_params->giqmatrix[q][c][t] = + &iwt_matrix_ref[q][c >= 1][current]; + current += size; + } + } + } + } +} + +/* Provide 15 sets of quantization matrices for chroma and luma + and each TX size. Matrices for different TX sizes are in fact + sub-sampled from the 32x32 and 16x16 sizes, but explicitly + defined here for convenience. Intra and inter matrix sets are the + same but changing DEFAULT_QM_INTER_OFFSET from zero allows + for different matrices for inter and intra blocks in the same + frame. + Matrices for different QM levels have been rescaled in the + frequency domain according to different nominal viewing + distances. Matrices for QM level 15 are omitted because they are + not used. 
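+
+   As a hedged usage sketch (which qmlevel to use is the caller's concern;
+   av1_use_qmatrix() and av1_qmatrix() are the helpers defined earlier in
+   this file), the tables below are typically reached as:
+
+     if (av1_use_qmatrix(quant_params, xd, segment_id)) {
+       const qm_val_t *wt =
+           av1_qmatrix(quant_params, qmlevel, plane, tx_size);
+       // wt points at the per-coefficient weights stored below for this
+       // plane and (adjusted) transform size.
+     }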
+ */ +static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { + { + { /* Luma */ + /* Size 4x4 */ + 32, 43, 73, 97, 43, 67, 94, 110, 73, 94, 137, 150, 97, 110, 150, 200, + /* Size 8x8 */ + 32, 32, 38, 51, 68, 84, 95, 109, 32, 35, 40, 49, 63, 76, 89, 102, 38, + 40, 54, 65, 78, 91, 98, 106, 51, 49, 65, 82, 97, 111, 113, 121, 68, 63, + 78, 97, 117, 134, 138, 142, 84, 76, 91, 111, 134, 152, 159, 168, 95, 89, + 98, 113, 138, 159, 183, 199, 109, 102, 106, 121, 142, 168, 199, 220, + /* Size 16x16 */ + 32, 31, 31, 34, 36, 44, 48, 59, 65, 80, 83, 91, 97, 104, 111, 119, 31, + 32, 32, 33, 34, 41, 44, 54, 59, 72, 75, 83, 90, 97, 104, 112, 31, 32, + 33, 35, 36, 42, 45, 54, 59, 71, 74, 81, 86, 93, 100, 107, 34, 33, 35, + 39, 42, 47, 51, 58, 63, 74, 76, 81, 84, 90, 97, 105, 36, 34, 36, 42, 48, + 54, 57, 64, 68, 79, 81, 88, 91, 96, 102, 105, 44, 41, 42, 47, 54, 63, + 67, 75, 79, 90, 92, 95, 100, 102, 109, 112, 48, 44, 45, 51, 57, 67, 71, + 80, 85, 96, 99, 107, 108, 111, 117, 120, 59, 54, 54, 58, 64, 75, 80, 92, + 98, 110, 113, 115, 116, 122, 125, 130, 65, 59, 59, 63, 68, 79, 85, 98, + 105, 118, 121, 127, 130, 134, 135, 140, 80, 72, 71, 74, 79, 90, 96, 110, + 118, 134, 137, 140, 143, 144, 146, 152, 83, 75, 74, 76, 81, 92, 99, 113, + 121, 137, 140, 151, 152, 155, 158, 165, 91, 83, 81, 81, 88, 95, 107, + 115, 127, 140, 151, 159, 166, 169, 173, 179, 97, 90, 86, 84, 91, 100, + 108, 116, 130, 143, 152, 166, 174, 182, 189, 193, 104, 97, 93, 90, 96, + 102, 111, 122, 134, 144, 155, 169, 182, 191, 200, 210, 111, 104, 100, + 97, 102, 109, 117, 125, 135, 146, 158, 173, 189, 200, 210, 220, 119, + 112, 107, 105, 105, 112, 120, 130, 140, 152, 165, 179, 193, 210, 220, + 231, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 32, 34, 35, 36, 39, 44, 46, 48, 54, 59, 62, 65, 71, + 80, 81, 83, 88, 91, 94, 97, 101, 104, 107, 111, 115, 119, 123, 31, 32, + 32, 32, 32, 32, 34, 34, 35, 38, 42, 44, 46, 51, 56, 59, 62, 68, 76, 77, + 78, 84, 86, 89, 92, 95, 99, 102, 105, 109, 113, 116, 31, 32, 32, 32, 32, + 32, 33, 34, 34, 37, 41, 42, 44, 49, 54, 56, 59, 65, 72, 73, 75, 80, 83, + 86, 90, 93, 97, 101, 104, 108, 112, 116, 31, 32, 32, 32, 33, 33, 34, 35, + 35, 38, 41, 43, 45, 49, 54, 56, 59, 64, 72, 73, 74, 79, 82, 85, 88, 91, + 94, 97, 101, 104, 107, 111, 31, 32, 32, 33, 33, 34, 35, 36, 36, 39, 42, + 44, 45, 50, 54, 56, 59, 64, 71, 72, 74, 78, 81, 84, 86, 89, 93, 96, 100, + 104, 107, 111, 32, 32, 32, 33, 34, 35, 37, 37, 38, 40, 42, 44, 46, 49, + 53, 55, 58, 63, 69, 70, 72, 76, 79, 82, 85, 89, 93, 96, 99, 102, 106, + 109, 34, 34, 33, 34, 35, 37, 39, 41, 42, 45, 47, 49, 51, 54, 58, 60, 63, + 68, 74, 75, 76, 80, 81, 82, 84, 87, 90, 93, 97, 101, 105, 110, 35, 34, + 34, 35, 36, 37, 41, 43, 45, 47, 50, 52, 53, 57, 61, 63, 65, 70, 76, 77, + 79, 82, 84, 86, 89, 91, 92, 93, 96, 100, 103, 107, 36, 35, 34, 35, 36, + 38, 42, 45, 48, 50, 54, 55, 57, 60, 64, 66, 68, 73, 79, 80, 81, 85, 88, + 90, 91, 93, 96, 99, 102, 103, 105, 107, 39, 38, 37, 38, 39, 40, 45, 47, + 50, 54, 58, 59, 61, 65, 69, 71, 73, 78, 84, 85, 86, 91, 92, 92, 95, 98, + 100, 101, 103, 106, 110, 114, 44, 42, 41, 41, 42, 42, 47, 50, 54, 58, + 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 95, 95, 97, 100, 101, 102, + 105, 109, 111, 112, 114, 46, 44, 42, 43, 44, 44, 49, 52, 55, 59, 65, 67, + 69, 74, 78, 80, 82, 87, 93, 94, 95, 98, 100, 103, 102, 105, 108, 110, + 111, 113, 117, 121, 48, 46, 44, 45, 45, 46, 51, 53, 57, 61, 67, 69, 71, + 76, 80, 83, 85, 90, 96, 97, 99, 103, 107, 105, 108, 111, 111, 113, 117, + 119, 120, 122, 54, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 
82, + 87, 89, 92, 97, 104, 105, 106, 111, 110, 111, 114, 113, 116, 120, 120, + 121, 125, 130, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, + 92, 95, 98, 103, 110, 111, 113, 115, 115, 119, 116, 120, 122, 122, 125, + 129, 130, 130, 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89, + 95, 98, 101, 107, 114, 115, 117, 119, 123, 121, 125, 126, 125, 129, 131, + 131, 135, 140, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, + 98, 101, 105, 111, 118, 119, 121, 126, 127, 128, 130, 130, 134, 133, + 135, 140, 140, 140, 71, 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, + 97, 103, 107, 111, 117, 125, 126, 128, 134, 132, 136, 133, 138, 137, + 140, 143, 142, 145, 150, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, 93, + 96, 104, 110, 114, 118, 125, 134, 135, 137, 139, 140, 139, 143, 142, + 144, 146, 146, 151, 152, 151, 81, 77, 73, 73, 72, 70, 75, 77, 80, 85, + 91, 94, 97, 105, 111, 115, 119, 126, 135, 137, 138, 144, 147, 146, 148, + 149, 151, 150, 156, 155, 157, 163, 83, 78, 75, 74, 74, 72, 76, 79, 81, + 86, 92, 95, 99, 106, 113, 117, 121, 128, 137, 138, 140, 147, 151, 156, + 152, 157, 155, 161, 158, 162, 165, 164, 88, 84, 80, 79, 78, 76, 80, 82, + 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152, 154, + 158, 163, 159, 165, 163, 168, 168, 169, 176, 91, 86, 83, 82, 81, 79, 81, + 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154, + 159, 161, 166, 171, 169, 173, 173, 176, 179, 177, 94, 89, 86, 85, 84, + 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, 139, 146, + 156, 158, 161, 166, 168, 174, 179, 178, 180, 183, 183, 190, 97, 92, 90, + 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, + 148, 152, 163, 166, 168, 174, 176, 182, 187, 189, 188, 193, 191, 101, + 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, 105, 111, 113, 120, 126, 130, + 138, 142, 149, 157, 159, 171, 174, 176, 183, 184, 191, 195, 199, 197, + 204, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, 116, 122, + 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191, 193, 200, + 204, 210, 206, 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, + 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, + 193, 200, 202, 210, 214, 222, 111, 105, 104, 101, 100, 99, 97, 96, 102, + 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173, + 180, 189, 195, 200, 202, 210, 212, 220, 224, 115, 109, 108, 104, 104, + 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131, 140, 142, 151, + 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220, 222, 230, 119, + 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130, + 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220, + 222, 231, 232, 123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114, + 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, 177, 190, 191, + 204, 206, 222, 224, 230, 232, 242, + /* Size 4x8 */ + 32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84, 91, 49, 71, 103, 110, 65, + 84, 125, 128, 80, 97, 142, 152, 91, 100, 145, 178, 104, 112, 146, 190, + /* Size 8x4 */ + 32, 33, 37, 49, 65, 80, 91, 104, 42, 42, 58, 71, 84, 97, 100, 112, 75, + 69, 84, 103, 125, 142, 145, 146, 91, 86, 91, 110, 128, 152, 178, 190, + /* Size 8x16 */ + 32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59, 78, 86, 93, 32, 34, + 36, 50, 59, 77, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, 36, 38, 48, 60, + 68, 84, 86, 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76, 85, 102, + 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65, 58, 68, 92, 105, 124, + 122, 124, 79, 70, 79, 104, 118, 
141, 135, 135, 82, 72, 81, 106, 121, + 144, 149, 146, 91, 80, 88, 106, 130, 148, 162, 159, 97, 86, 94, 107, + 128, 157, 167, 171, 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101, + 117, 138, 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203, + /* Size 16x8 */ + 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118, 32, + 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107, 36, 34, + 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105, 53, 49, 50, + 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117, 118, 65, 59, 59, + 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131, 138, 136, 87, 78, 77, + 79, 84, 95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157, 93, 86, + 82, 80, 86, 94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182, 99, + 93, 89, 88, 90, 97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203, + /* Size 16x32 */ + 32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96, 99, 102, 31, 32, + 32, 34, 35, 42, 51, 56, 62, 75, 82, 85, 88, 91, 94, 97, 31, 32, 33, 33, + 34, 41, 49, 54, 59, 72, 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41, + 49, 54, 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42, 50, 54, + 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37, 38, 42, 49, 53, 58, 69, + 75, 78, 82, 86, 89, 92, 34, 34, 37, 39, 42, 48, 54, 58, 63, 73, 79, 78, + 80, 83, 88, 92, 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84, + 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87, 86, 89, 90, 90, + 39, 37, 40, 45, 50, 58, 65, 69, 73, 84, 89, 89, 91, 91, 93, 96, 44, 41, + 43, 48, 53, 63, 71, 75, 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, + 55, 65, 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51, 56, + 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49, 50, 54, 60, + 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111, 58, 54, 54, 58, 63, + 75, 87, 92, 98, 110, 116, 115, 112, 111, 115, 112, 61, 57, 56, 60, 66, + 77, 89, 95, 101, 114, 120, 118, 119, 118, 116, 120, 65, 60, 58, 63, 68, + 79, 92, 98, 105, 118, 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73, + 84, 97, 103, 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79, + 90, 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75, + 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75, 72, + 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143, 88, 80, + 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152, 152, 154, 91, + 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155, 162, 160, 159, 155, + 94, 85, 83, 82, 91, 100, 105, 118, 131, 137, 153, 160, 165, 167, 166, + 168, 97, 88, 86, 85, 94, 100, 107, 123, 128, 140, 157, 161, 167, 173, + 171, 169, 100, 91, 89, 87, 97, 100, 111, 121, 127, 145, 152, 164, 173, + 178, 182, 181, 103, 94, 93, 90, 98, 101, 114, 120, 131, 144, 150, 170, + 174, 180, 186, 183, 107, 97, 96, 93, 100, 104, 117, 119, 136, 142, 155, + 168, 177, 187, 191, 198, 110, 101, 100, 97, 101, 108, 117, 123, 138, + 141, 161, 165, 183, 188, 193, 200, 114, 104, 104, 100, 103, 112, 117, + 127, 137, 146, 159, 167, 185, 190, 201, 206, 118, 108, 107, 103, 105, + 115, 118, 131, 136, 151, 157, 172, 182, 197, 203, 208, 122, 111, 111, + 107, 107, 119, 119, 136, 136, 156, 156, 178, 179, 203, 204, 217, + /* Size 32x16 */ + 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, + 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 31, 32, + 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60, 65, 72, 74, + 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111, 32, 32, 33, 33, 34, + 35, 37, 37, 38, 40, 43, 44, 46, 50, 
54, 56, 58, 63, 70, 71, 72, 77, 80, + 83, 86, 89, 93, 96, 100, 104, 107, 111, 34, 34, 33, 34, 35, 37, 39, 41, + 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 85, 87, + 90, 93, 97, 100, 103, 107, 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, + 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, + 101, 103, 105, 107, 44, 42, 41, 41, 42, 42, 48, 50, 54, 58, 63, 65, 67, + 71, 75, 77, 79, 84, 90, 91, 92, 97, 100, 100, 100, 100, 101, 104, 108, + 112, 115, 119, 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, + 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, + 117, 118, 119, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, + 92, 95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123, + 127, 131, 136, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, + 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136, + 138, 137, 136, 136, 79, 75, 72, 71, 71, 69, 73, 76, 78, 84, 90, 93, 96, + 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144, + 142, 141, 146, 151, 156, 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98, + 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152, + 150, 155, 161, 159, 157, 156, 90, 85, 82, 81, 80, 78, 78, 83, 87, 89, + 93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160, + 161, 164, 170, 168, 165, 167, 172, 178, 93, 88, 86, 84, 82, 82, 80, 84, + 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162, + 165, 167, 173, 174, 177, 183, 185, 182, 179, 96, 91, 90, 87, 86, 86, 83, + 84, 89, 91, 95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152, + 160, 167, 173, 178, 180, 187, 188, 190, 197, 203, 99, 94, 93, 90, 89, + 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, + 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, 102, 97, 97, 93, + 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130, 131, 142, + 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217, + /* Size 4x16 */ + 31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34, 48, 73, 83, 34, 54, + 78, 89, 41, 63, 90, 95, 45, 67, 96, 102, 54, 75, 110, 111, 60, 79, 118, + 123, 72, 90, 133, 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100, + 140, 173, 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197, + /* Size 16x4 */ + 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108, 44, + 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108, 115, 79, 72, + 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144, 141, 151, 96, 90, + 86, 83, 89, 95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197, + /* Size 8x32 */ + 32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82, 88, 94, 31, 33, + 34, 49, 59, 78, 86, 93, 31, 33, 35, 49, 59, 78, 84, 90, 32, 34, 36, 50, + 59, 77, 82, 89, 32, 35, 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79, + 80, 88, 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84, 86, 90, + 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71, 79, 95, 94, 97, 46, 44, + 55, 73, 82, 98, 98, 99, 48, 46, 56, 76, 85, 102, 105, 105, 53, 50, 60, + 82, 92, 109, 107, 107, 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66, + 89, 101, 120, 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73, + 97, 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71, 80, + 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146, 88, 77, + 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148, 162, 159, 94, + 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107, 128, 157, 167, 171, + 100, 89, 97, 111, 127, 152, 173, 182, 
103, 93, 98, 114, 131, 150, 174, + 186, 107, 96, 100, 117, 136, 155, 177, 191, 110, 100, 101, 117, 138, + 161, 183, 193, 114, 104, 103, 117, 137, 159, 185, 201, 118, 107, 105, + 118, 136, 157, 182, 203, 122, 111, 107, 119, 136, 156, 179, 204, + /* Size 32x8 */ + 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71, + 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 32, 32, + 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, + 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 36, 35, 34, 35, 36, + 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, + 91, 94, 97, 98, 100, 101, 103, 105, 107, 53, 51, 49, 49, 50, 49, 54, 57, + 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, + 107, 111, 114, 117, 117, 117, 118, 119, 65, 62, 59, 59, 59, 58, 63, 65, + 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, + 128, 127, 131, 136, 138, 137, 136, 136, 87, 82, 78, 78, 77, 75, 79, 82, + 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, + 153, 157, 152, 150, 155, 161, 159, 157, 156, 93, 88, 86, 84, 82, 82, 80, + 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, + 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, 99, 94, 93, 90, 89, + 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146, + 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204 }, + { /* Chroma */ + /* Size 4x4 */ + 35, 46, 57, 66, 46, 60, 69, 71, 57, 69, 90, 90, 66, 71, 90, 109, + /* Size 8x8 */ + 31, 38, 47, 50, 57, 63, 67, 71, 38, 47, 46, 47, 52, 57, 62, 67, 47, 46, + 54, 57, 61, 66, 67, 68, 50, 47, 57, 66, 72, 77, 75, 75, 57, 52, 61, 72, + 82, 88, 86, 84, 63, 57, 66, 77, 88, 96, 95, 95, 67, 62, 67, 75, 86, 95, + 104, 107, 71, 67, 68, 75, 84, 95, 107, 113, + /* Size 16x16 */ + 32, 30, 33, 41, 49, 49, 50, 54, 57, 63, 65, 68, 70, 72, 74, 76, 30, 32, + 35, 42, 46, 45, 46, 49, 52, 57, 58, 62, 64, 67, 70, 72, 33, 35, 39, 45, + 47, 45, 46, 49, 51, 56, 57, 60, 62, 64, 66, 69, 41, 42, 45, 48, 50, 49, + 50, 52, 53, 57, 58, 59, 60, 61, 64, 67, 49, 46, 47, 50, 53, 53, 54, 55, + 56, 60, 61, 64, 64, 65, 66, 66, 49, 45, 45, 49, 53, 58, 60, 62, 63, 67, + 68, 67, 69, 68, 70, 70, 50, 46, 46, 50, 54, 60, 61, 65, 67, 71, 71, 74, + 73, 73, 74, 74, 54, 49, 49, 52, 55, 62, 65, 71, 73, 78, 79, 78, 77, 78, + 78, 78, 57, 52, 51, 53, 56, 63, 67, 73, 76, 82, 83, 84, 84, 84, 82, 83, + 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 90, 89, 88, 87, 88, 65, 58, + 57, 58, 61, 68, 71, 79, 83, 90, 91, 94, 93, 93, 92, 93, 68, 62, 60, 59, + 64, 67, 74, 78, 84, 90, 94, 98, 99, 98, 98, 98, 70, 64, 62, 60, 64, 69, + 73, 77, 84, 89, 93, 99, 102, 103, 104, 104, 72, 67, 64, 61, 65, 68, 73, + 78, 84, 88, 93, 98, 103, 106, 108, 109, 74, 70, 66, 64, 66, 70, 74, 78, + 82, 87, 92, 98, 104, 108, 111, 112, 76, 72, 69, 67, 66, 70, 74, 78, 83, + 88, 93, 98, 104, 109, 112, 116, + /* Size 32x32 */ + 32, 31, 30, 32, 33, 36, 41, 45, 49, 48, 49, 50, 50, 52, 54, 56, 57, 60, + 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 31, 31, 31, 33, + 34, 38, 42, 45, 47, 47, 47, 47, 48, 50, 52, 53, 54, 57, 60, 61, 61, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 30, 31, 32, 33, 35, 40, 42, 44, + 46, 45, 45, 45, 46, 47, 49, 51, 52, 54, 57, 58, 58, 61, 62, 63, 64, 66, + 67, 68, 70, 71, 72, 74, 32, 33, 33, 35, 37, 41, 43, 45, 47, 46, 45, 46, + 46, 47, 49, 50, 51, 54, 57, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, + 69, 70, 33, 34, 35, 37, 39, 43, 45, 46, 47, 46, 45, 46, 46, 47, 49, 50, + 51, 53, 56, 57, 57, 59, 60, 61, 62, 63, 
64, 65, 66, 68, 69, 70, 36, 38, + 40, 41, 43, 47, 47, 47, 48, 46, 45, 46, 46, 47, 48, 49, 50, 52, 54, 55, + 55, 57, 58, 59, 61, 62, 64, 65, 66, 67, 68, 69, 41, 42, 42, 43, 45, 47, + 48, 49, 50, 49, 49, 49, 50, 50, 52, 52, 53, 55, 57, 58, 58, 60, 59, 59, + 60, 61, 61, 63, 64, 66, 67, 69, 45, 45, 44, 45, 46, 47, 49, 50, 51, 51, + 51, 51, 52, 52, 53, 54, 55, 57, 59, 59, 60, 61, 61, 62, 63, 63, 63, 63, + 63, 64, 65, 66, 49, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 54, 54, 54, + 55, 56, 56, 58, 60, 61, 61, 63, 64, 64, 64, 64, 65, 66, 66, 66, 66, 66, + 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, + 63, 64, 64, 66, 66, 65, 66, 67, 67, 67, 67, 68, 69, 70, 49, 47, 45, 45, + 45, 45, 49, 51, 53, 55, 58, 59, 60, 61, 62, 63, 63, 65, 67, 67, 68, 69, + 67, 68, 69, 68, 68, 69, 70, 70, 70, 70, 50, 47, 45, 46, 46, 46, 49, 51, + 54, 56, 59, 60, 60, 62, 64, 64, 65, 67, 69, 69, 70, 70, 71, 71, 70, 70, + 71, 71, 71, 71, 72, 74, 50, 48, 46, 46, 46, 46, 50, 52, 54, 56, 60, 60, + 61, 63, 65, 66, 67, 68, 71, 71, 71, 73, 74, 72, 73, 74, 73, 73, 74, 74, + 74, 74, 52, 50, 47, 47, 47, 47, 50, 52, 54, 57, 61, 62, 63, 66, 68, 69, + 70, 72, 75, 75, 75, 77, 75, 75, 76, 75, 75, 76, 75, 75, 76, 77, 54, 52, + 49, 49, 49, 48, 52, 53, 55, 58, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, + 79, 79, 78, 79, 77, 78, 78, 77, 78, 79, 78, 78, 56, 53, 51, 50, 50, 49, + 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 75, 77, 80, 80, 81, 81, 82, 80, + 81, 81, 79, 81, 80, 79, 81, 82, 57, 54, 52, 51, 51, 50, 53, 55, 56, 60, + 63, 65, 67, 70, 73, 75, 76, 79, 82, 82, 83, 85, 84, 83, 84, 83, 84, 82, + 82, 84, 83, 82, 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, + 75, 77, 79, 82, 85, 85, 86, 88, 86, 87, 85, 86, 85, 85, 86, 84, 85, 86, + 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85, + 89, 89, 90, 90, 90, 89, 89, 88, 88, 88, 87, 88, 88, 87, 64, 61, 58, 57, + 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, 91, 92, + 93, 92, 92, 91, 91, 90, 91, 90, 90, 92, 65, 61, 58, 58, 57, 55, 58, 60, + 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91, 94, 94, 96, 93, 94, + 93, 94, 92, 93, 93, 92, 67, 63, 61, 60, 59, 57, 60, 61, 63, 66, 69, 70, + 73, 77, 79, 81, 85, 88, 90, 92, 94, 96, 96, 97, 98, 95, 97, 95, 96, 95, + 95, 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, 75, 78, 82, + 84, 86, 90, 93, 94, 96, 98, 98, 99, 100, 98, 99, 98, 98, 98, 97, 69, 65, + 63, 62, 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, 89, 92, + 96, 97, 98, 100, 100, 101, 102, 101, 101, 101, 100, 102, 70, 66, 64, 63, + 62, 61, 60, 63, 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, + 99, 100, 102, 102, 103, 104, 104, 103, 104, 102, 71, 67, 66, 64, 63, 62, + 61, 63, 64, 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, + 101, 102, 104, 104, 105, 106, 107, 105, 107, 72, 68, 67, 65, 64, 64, 61, + 63, 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, + 103, 104, 106, 106, 108, 108, 109, 107, 73, 69, 68, 66, 65, 65, 63, 63, + 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, + 105, 106, 109, 108, 110, 111, 112, 74, 70, 70, 67, 66, 66, 64, 63, 66, + 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, + 108, 108, 111, 111, 112, 113, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, + 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108, + 110, 111, 113, 113, 115, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, 72, + 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, 105, 109, 111, + 112, 113, 116, 115, 78, 74, 74, 70, 70, 69, 69, 66, 
66, 70, 70, 74, 74, + 77, 78, 82, 82, 86, 87, 92, 92, 96, 97, 102, 102, 107, 107, 112, 113, + 115, 115, 118, + /* Size 4x8 */ + 31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73, 54, 65, + 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105, + /* Size 8x4 */ + 31, 40, 46, 48, 54, 61, 64, 68, 47, 45, 56, 61, 65, 69, 68, 71, 60, 54, + 64, 75, 85, 92, 90, 87, 66, 61, 64, 73, 82, 92, 102, 105, + /* Size 8x16 */ + 32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66, 33, 43, + 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62, 49, 48, 53, 54, + 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66, 50, 46, 54, 64, 67, 73, + 72, 70, 54, 49, 55, 68, 73, 80, 76, 75, 57, 50, 56, 70, 76, 84, 80, 79, + 63, 55, 60, 75, 82, 92, 87, 84, 64, 56, 61, 75, 83, 93, 93, 89, 68, 59, + 64, 74, 86, 94, 98, 94, 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75, + 83, 92, 101, 104, 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, + 91, 101, 109, + /* Size 16x8 */ + 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76, 37, 40, + 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69, 48, 46, 47, 50, + 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67, 52, 48, 47, 50, 54, 61, + 64, 68, 70, 75, 75, 74, 73, 75, 74, 73, 57, 52, 51, 53, 57, 64, 67, 73, + 76, 82, 83, 86, 83, 83, 84, 82, 66, 60, 59, 60, 62, 69, 73, 80, 84, 92, + 93, 94, 96, 92, 94, 91, 68, 63, 60, 59, 62, 66, 72, 76, 80, 87, 93, 98, + 99, 101, 103, 101, 71, 66, 63, 62, 62, 66, 70, 75, 79, 84, 89, 94, 98, + 104, 106, 109, + /* Size 16x32 */ + 32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72, 31, 31, + 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68, 30, 32, 40, 42, + 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68, 32, 34, 41, 44, 46, 45, + 48, 49, 51, 57, 59, 61, 62, 63, 64, 65, 33, 36, 43, 45, 47, 46, 47, 49, + 51, 56, 59, 60, 60, 62, 63, 65, 37, 40, 47, 47, 47, 45, 47, 48, 50, 54, + 57, 58, 60, 61, 62, 63, 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58, + 59, 60, 62, 63, 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60, + 61, 61, 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62, + 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65, 49, 45, + 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65, 49, 46, 46, 49, + 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68, 50, 46, 46, 50, 54, 59, + 64, 65, 67, 71, 73, 72, 72, 70, 70, 69, 52, 48, 47, 50, 54, 61, 66, 68, + 71, 75, 77, 74, 73, 73, 71, 72, 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, + 80, 78, 76, 74, 75, 73, 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, + 79, 78, 76, 77, 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, + 79, 77, 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81, + 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81, 64, 58, + 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86, 64, 59, 56, 58, + 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87, 67, 61, 58, 60, 63, 69, + 76, 79, 85, 92, 95, 96, 94, 92, 91, 91, 68, 62, 59, 60, 64, 71, 74, 78, + 86, 91, 94, 96, 98, 96, 94, 91, 69, 62, 60, 60, 65, 70, 72, 79, 85, 88, + 95, 98, 99, 98, 97, 96, 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97, + 99, 101, 98, 97, 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102, + 102, 102, 101, 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, + 103, 104, 102, 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105, + 106, 107, 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106, + 107, 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109, + 76, 69, 69, 66, 67, 
72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110, 77, + 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113, + /* Size 32x16 */ + 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, + 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 31, 31, 32, 34, + 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57, 58, 59, 61, + 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 37, 38, 40, 41, 43, 47, 47, 47, + 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63, + 64, 65, 67, 68, 69, 70, 42, 42, 42, 44, 45, 47, 48, 49, 50, 50, 49, 49, + 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60, 60, 60, 61, 62, 63, 64, 65, + 66, 67, 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, + 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, 49, 47, + 45, 45, 46, 45, 49, 51, 53, 56, 58, 59, 59, 61, 62, 63, 64, 65, 67, 68, + 68, 69, 71, 70, 69, 68, 68, 69, 70, 71, 72, 73, 52, 50, 48, 48, 47, 47, + 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, + 73, 74, 75, 75, 74, 74, 73, 73, 54, 52, 50, 49, 49, 48, 52, 54, 55, 59, + 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76, + 77, 78, 80, 81, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, + 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, + 63, 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85, + 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90, 66, 63, 60, 59, + 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, + 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 67, 64, 62, 61, 60, 58, 58, 61, + 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96, 96, 98, 97, 98, + 100, 98, 96, 96, 97, 99, 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68, + 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103, + 103, 101, 99, 69, 66, 65, 63, 62, 61, 60, 60, 63, 64, 66, 68, 70, 73, + 74, 78, 80, 82, 85, 87, 91, 92, 96, 98, 101, 102, 103, 105, 105, 105, + 107, 108, 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75, + 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109, + 108, 72, 68, 68, 65, 65, 63, 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77, + 81, 81, 86, 87, 91, 91, 96, 97, 101, 102, 107, 107, 109, 110, 113, + /* Size 4x16 */ + 31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60, 46, 53, + 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74, 52, 64, 82, 80, + 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96, 63, 69, 89, 101, 65, 68, + 89, 103, 67, 70, 86, 105, 69, 72, 88, 107, + /* Size 16x4 */ + 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69, 49, 45, + 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72, 63, 57, 56, 57, + 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88, 69, 65, 62, 60, 63, 66, + 70, 74, 80, 85, 91, 96, 101, 103, 105, 107, + /* Size 8x32 */ + 32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67, 30, 40, + 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64, 33, 43, 47, 47, + 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62, 42, 47, 50, 50, 53, 60, + 59, 62, 45, 47, 51, 52, 55, 61, 61, 61, 49, 48, 53, 54, 57, 62, 62, 62, + 48, 47, 53, 57, 60, 66, 65, 64, 49, 46, 53, 61, 64, 69, 66, 66, 49, 46, + 53, 62, 65, 71, 68, 67, 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66, + 71, 77, 73, 71, 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82, + 79, 76, 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81, + 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86, 64, 56, + 61, 75, 83, 93, 93, 89, 
67, 58, 63, 76, 85, 95, 94, 91, 68, 59, 64, 74, + 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97, 70, 62, 66, 73, 83, 96, + 99, 98, 71, 63, 67, 74, 82, 93, 102, 102, 72, 64, 66, 75, 83, 92, 101, + 104, 73, 65, 66, 75, 84, 93, 102, 106, 74, 67, 66, 74, 84, 94, 103, 106, + 75, 68, 66, 74, 83, 93, 103, 109, 76, 69, 67, 73, 82, 91, 101, 109, 77, + 70, 67, 73, 81, 90, 99, 108, + /* Size 32x8 */ + 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60, + 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 37, 38, 40, 41, + 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, + 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 48, 47, 46, 46, 47, 47, 50, 51, + 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, + 66, 66, 66, 66, 67, 67, 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62, + 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74, + 73, 73, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75, + 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, 66, 63, + 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, + 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 68, 65, 63, 62, 60, 60, + 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, + 99, 102, 101, 102, 103, 103, 101, 99, 71, 67, 66, 64, 63, 62, 62, 61, + 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, + 104, 106, 106, 109, 109, 108 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 41, 69, 92, 41, 63, 88, 103, 69, 88, 127, 140, 92, 103, 140, 184, + /* Size 8x8 */ + 32, 32, 37, 47, 62, 78, 90, 102, 32, 35, 39, 46, 58, 72, 84, 96, 37, 39, + 51, 60, 71, 84, 93, 100, 47, 46, 60, 73, 87, 100, 106, 113, 62, 58, 71, + 87, 105, 121, 129, 132, 78, 72, 84, 100, 121, 140, 148, 155, 90, 84, 93, + 106, 129, 148, 169, 183, 102, 96, 100, 113, 132, 155, 183, 201, + /* Size 16x16 */ + 32, 31, 31, 32, 36, 39, 47, 54, 61, 71, 80, 86, 92, 98, 104, 111, 31, + 32, 32, 33, 34, 37, 44, 50, 56, 65, 73, 79, 85, 91, 98, 105, 31, 32, 33, + 34, 36, 39, 45, 50, 56, 64, 71, 77, 82, 88, 94, 100, 32, 33, 34, 36, 40, + 42, 47, 51, 57, 65, 71, 76, 80, 85, 91, 98, 36, 34, 36, 40, 48, 50, 56, + 60, 65, 73, 79, 84, 86, 90, 95, 98, 39, 37, 39, 42, 50, 54, 60, 65, 70, + 78, 84, 89, 95, 96, 102, 105, 47, 44, 45, 47, 56, 60, 69, 75, 81, 89, + 95, 100, 102, 104, 109, 112, 54, 50, 50, 51, 60, 65, 75, 82, 89, 97, + 104, 109, 110, 114, 117, 121, 61, 56, 56, 57, 65, 70, 81, 89, 97, 106, + 113, 119, 122, 126, 125, 130, 71, 65, 64, 65, 73, 78, 89, 97, 106, 117, + 125, 131, 134, 134, 136, 141, 80, 73, 71, 71, 79, 84, 95, 104, 113, 125, + 134, 140, 142, 145, 146, 152, 86, 79, 77, 76, 84, 89, 100, 109, 119, + 131, 140, 147, 154, 157, 160, 165, 92, 85, 82, 80, 86, 95, 102, 110, + 122, 134, 142, 154, 162, 168, 174, 178, 98, 91, 88, 85, 90, 96, 104, + 114, 126, 134, 145, 157, 168, 176, 184, 193, 104, 98, 94, 91, 95, 102, + 109, 117, 125, 136, 146, 160, 174, 184, 193, 201, 111, 105, 100, 98, 98, + 105, 112, 121, 130, 141, 152, 165, 178, 193, 201, 210, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 61, 65, + 71, 76, 80, 83, 86, 89, 92, 95, 98, 101, 104, 108, 111, 114, 31, 32, 32, + 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 58, 62, 68, 72, 76, + 78, 82, 85, 88, 90, 93, 96, 99, 102, 105, 109, 31, 32, 32, 32, 32, 32, + 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 56, 60, 65, 70, 73, 76, 79, 82, + 85, 88, 91, 95, 98, 101, 105, 109, 31, 32, 32, 32, 32, 33, 33, 34, 35, + 36, 38, 41, 44, 45, 49, 54, 56, 59, 
65, 69, 72, 75, 78, 81, 84, 86, 89, + 92, 95, 98, 101, 104, 31, 32, 32, 32, 33, 34, 34, 35, 36, 38, 39, 42, + 45, 46, 50, 54, 56, 59, 64, 68, 71, 74, 77, 79, 82, 85, 88, 91, 94, 97, + 100, 104, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 45, 46, 49, + 53, 55, 58, 63, 66, 69, 72, 74, 78, 81, 84, 87, 90, 93, 96, 99, 102, 32, + 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 57, 60, 65, + 68, 71, 73, 76, 78, 80, 82, 85, 88, 91, 95, 98, 102, 34, 34, 33, 34, 35, + 37, 38, 39, 42, 44, 45, 47, 50, 51, 54, 58, 60, 63, 68, 71, 74, 76, 79, + 82, 85, 86, 87, 88, 90, 93, 96, 99, 36, 35, 34, 35, 36, 38, 40, 42, 48, + 50, 50, 54, 56, 57, 60, 64, 65, 68, 73, 76, 79, 81, 84, 86, 86, 88, 90, + 93, 95, 97, 98, 100, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, + 60, 63, 67, 68, 71, 76, 79, 82, 84, 87, 87, 90, 93, 94, 95, 96, 100, + 103, 106, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, + 69, 70, 73, 78, 81, 84, 86, 89, 92, 95, 95, 96, 99, 102, 104, 105, 106, + 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 77, 79, + 84, 88, 90, 92, 95, 97, 97, 99, 102, 103, 103, 106, 109, 113, 47, 45, + 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, + 95, 97, 100, 100, 102, 105, 104, 106, 109, 111, 112, 113, 49, 47, 46, + 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73, 77, 81, 83, 87, 92, 95, 98, + 100, 103, 105, 107, 106, 109, 112, 112, 113, 117, 120, 54, 51, 50, 49, + 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, + 106, 109, 112, 110, 113, 114, 114, 117, 121, 121, 121, 59, 56, 54, 54, + 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 94, 98, 103, 107, 110, + 113, 116, 114, 117, 118, 117, 121, 122, 122, 125, 129, 61, 58, 56, 56, + 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 101, 106, 110, 113, + 116, 119, 120, 122, 121, 126, 124, 125, 130, 130, 130, 65, 62, 60, 59, + 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98, 101, 105, 111, 115, 118, + 121, 124, 128, 125, 129, 128, 131, 133, 132, 135, 139, 71, 68, 65, 65, + 64, 63, 65, 68, 73, 76, 78, 84, 89, 92, 97, 103, 106, 111, 117, 122, + 125, 128, 131, 131, 134, 132, 134, 136, 136, 140, 141, 140, 76, 72, 70, + 69, 68, 66, 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, + 127, 130, 133, 136, 136, 138, 139, 141, 140, 145, 143, 146, 151, 80, 76, + 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125, + 130, 134, 137, 140, 146, 142, 146, 145, 149, 146, 150, 152, 151, 83, 78, + 76, 75, 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, + 128, 133, 137, 140, 144, 147, 152, 148, 154, 151, 156, 155, 156, 162, + 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116, 119, + 124, 131, 136, 140, 144, 147, 150, 154, 159, 157, 160, 160, 162, 165, + 162, 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, + 120, 128, 131, 136, 146, 147, 150, 155, 156, 161, 166, 165, 167, 169, + 169, 175, 92, 88, 85, 84, 82, 81, 80, 85, 86, 90, 95, 97, 102, 107, 110, + 117, 122, 125, 134, 138, 142, 152, 154, 156, 162, 163, 168, 173, 174, + 174, 178, 176, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, 99, 105, 106, + 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169, 170, 176, + 180, 183, 181, 187, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, + 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, + 176, 178, 184, 188, 193, 188, 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, + 99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, + 173, 176, 178, 184, 186, 192, 196, 203, 104, 99, 98, 95, 94, 93, 91, 90, + 
95, 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160, + 167, 174, 180, 184, 186, 193, 194, 201, 204, 108, 102, 101, 98, 97, 96, + 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150, + 155, 162, 169, 174, 183, 188, 192, 194, 201, 202, 210, 111, 105, 105, + 101, 100, 99, 98, 96, 98, 103, 105, 109, 112, 117, 121, 125, 130, 135, + 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210, 211, + 114, 109, 109, 104, 104, 102, 102, 99, 100, 106, 106, 113, 113, 120, + 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203, + 204, 210, 211, 219, + /* Size 4x8 */ + 32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77, 88, 46, 67, 93, 105, 60, 79, + 112, 122, 75, 92, 130, 144, 86, 95, 136, 167, 98, 105, 136, 177, + /* Size 8x4 */ + 32, 33, 36, 46, 60, 75, 86, 98, 42, 42, 56, 67, 79, 92, 95, 105, 69, 64, + 77, 93, 112, 130, 136, 136, 88, 83, 88, 105, 122, 144, 167, 177, + /* Size 8x16 */ + 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60, 72, 84, 90, 32, 34, + 36, 45, 59, 71, 80, 87, 32, 35, 40, 47, 60, 71, 78, 85, 36, 37, 48, 56, + 68, 78, 83, 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69, 84, 95, + 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61, 56, 65, 81, 100, 113, + 116, 118, 71, 64, 73, 89, 111, 125, 129, 129, 79, 70, 79, 95, 118, 133, + 142, 138, 86, 76, 84, 100, 124, 140, 153, 150, 92, 82, 89, 101, 121, + 148, 157, 161, 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110, + 129, 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188, + /* Size 16x8 */ + 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110, 32, + 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100, 36, 35, 36, + 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98, 47, 44, 45, 47, 56, + 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111, 65, 60, 59, 60, 68, 73, + 84, 92, 100, 111, 118, 124, 121, 124, 129, 127, 79, 72, 71, 71, 78, 84, + 95, 103, 113, 125, 133, 140, 148, 141, 151, 147, 90, 84, 80, 78, 83, 91, + 101, 108, 116, 129, 142, 153, 157, 163, 171, 169, 96, 90, 87, 85, 87, + 94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188, + /* Size 16x32 */ + 32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93, 96, 99, 31, 32, + 32, 33, 35, 42, 45, 51, 62, 69, 75, 83, 86, 88, 91, 94, 31, 32, 32, 33, + 35, 41, 44, 49, 60, 67, 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41, + 44, 49, 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42, 45, 50, + 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36, 38, 42, 45, 49, 58, 64, + 69, 76, 80, 83, 86, 88, 32, 33, 35, 36, 40, 44, 47, 51, 60, 66, 71, 76, + 78, 81, 85, 89, 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81, + 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84, 83, 86, 87, 87, + 38, 36, 39, 41, 49, 56, 58, 63, 71, 77, 81, 86, 88, 88, 90, 93, 39, 37, + 40, 42, 50, 58, 60, 65, 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45, + 53, 63, 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47, 56, 66, + 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46, 47, 48, 57, 67, 71, + 77, 86, 93, 97, 103, 103, 105, 102, 106, 53, 49, 50, 51, 60, 71, 75, 82, + 92, 99, 103, 111, 108, 107, 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, + 98, 105, 110, 114, 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, + 100, 107, 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92, + 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84, 89, 97, + 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69, 76, 88, 92, 101, + 115, 123, 130, 134, 134, 131, 132, 135, 79, 72, 70, 71, 79, 90, 95, 104, + 118, 127, 133, 143, 142, 
141, 138, 136, 82, 75, 73, 74, 81, 92, 97, 106, + 121, 130, 136, 146, 145, 144, 144, 145, 86, 78, 76, 77, 84, 95, 100, + 109, 124, 133, 140, 147, 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99, + 112, 124, 130, 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95, + 101, 116, 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92, + 95, 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85, + 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92, 91, + 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185, 104, 95, + 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175, 181, 186, 107, + 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156, 173, 177, 188, 192, + 110, 101, 100, 97, 98, 108, 111, 123, 127, 141, 147, 161, 169, 183, 188, + 193, 114, 104, 104, 100, 100, 111, 111, 126, 127, 145, 145, 166, 166, + 189, 190, 201, + /* Size 32x16 */ + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, + 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 31, 32, 32, + 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56, 60, 65, 69, 72, + 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 32, 32, 32, 33, 34, 35, + 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 104, 32, 33, 33, 33, 34, 36, 36, 38, 40, + 41, 42, 45, 47, 48, 51, 55, 57, 60, 65, 69, 71, 74, 77, 78, 80, 83, 85, + 88, 91, 94, 97, 100, 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, + 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, + 100, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 77, + 79, 84, 88, 90, 92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111, 47, 45, + 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, + 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 53, 51, 49, 49, + 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, + 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126, 65, 62, 60, 59, + 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118, + 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, 73, 69, 67, 66, + 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99, 105, 107, 112, 119, 123, + 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145, 79, 75, 72, + 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125, + 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, 87, 83, + 80, 79, 78, 76, 76, 80, 84, 86, 90, 96, 99, 103, 111, 114, 118, 126, + 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166, + 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, + 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, + 166, 93, 88, 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113, + 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177, + 183, 189, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110, + 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181, + 188, 188, 190, 99, 94, 94, 90, 90, 88, 89, 86, 87, 93, 93, 99, 99, 106, + 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185, + 186, 192, 193, 201, + /* Size 4x16 */ + 31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33, 44, 66, 81, 34, 54, + 74, 86, 37, 58, 79, 92, 44, 66, 90, 98, 49, 71, 99, 107, 56, 77, 107, + 117, 65, 84, 119, 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132, + 163, 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183, + /* Size 16x4 */ + 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 
84, 89, 95, 101, 44, 41, + 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108, 73, 67, 65, + 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132, 141, 93, 87, 83, + 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169, 175, 183, + /* Size 8x32 */ + 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75, 86, 91, 31, 32, + 35, 44, 60, 72, 84, 90, 31, 33, 35, 44, 59, 71, 82, 87, 32, 34, 36, 45, + 59, 71, 80, 87, 32, 35, 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71, + 78, 85, 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78, 83, 87, + 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60, 73, 84, 91, 94, 44, 42, + 53, 66, 79, 90, 94, 96, 47, 45, 56, 69, 84, 95, 101, 101, 49, 47, 57, + 71, 86, 97, 103, 102, 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79, + 98, 110, 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84, + 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68, 76, 92, + 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138, 82, 73, 81, 97, + 121, 136, 145, 144, 86, 76, 84, 100, 124, 140, 153, 150, 89, 79, 87, 99, + 124, 145, 156, 156, 92, 82, 89, 101, 121, 148, 157, 161, 95, 85, 92, + 105, 120, 143, 163, 171, 98, 88, 93, 108, 124, 141, 163, 174, 101, 91, + 94, 110, 128, 146, 166, 179, 104, 94, 95, 110, 129, 151, 171, 181, 107, + 97, 96, 110, 128, 149, 173, 188, 110, 100, 98, 111, 127, 147, 169, 188, + 114, 104, 100, 111, 127, 145, 166, 190, + /* Size 32x8 */ + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65, + 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 32, 32, 32, + 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, + 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 36, 35, 35, 35, 36, 38, + 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, + 89, 92, 93, 94, 95, 96, 98, 100, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, + 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108, + 110, 110, 110, 111, 111, 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, + 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, + 128, 129, 128, 127, 127, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, + 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141, + 146, 151, 149, 147, 145, 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, + 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163, + 163, 166, 171, 173, 169, 166, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, + 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, + 171, 174, 179, 181, 188, 188, 190 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 45, 56, 64, 45, 58, 66, 69, 56, 66, 86, 87, 64, 69, 87, 105, + /* Size 8x8 */ + 31, 38, 47, 48, 54, 61, 66, 69, 38, 47, 47, 46, 50, 55, 61, 65, 47, 47, + 53, 55, 58, 63, 65, 66, 48, 46, 55, 62, 67, 72, 73, 73, 54, 50, 58, 67, + 76, 83, 84, 82, 61, 55, 63, 72, 83, 91, 92, 92, 66, 61, 65, 73, 84, 92, + 101, 103, 69, 65, 66, 73, 82, 92, 103, 109, + /* Size 16x16 */ + 32, 30, 33, 38, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 30, 31, + 35, 41, 46, 46, 46, 48, 51, 55, 58, 60, 63, 65, 68, 70, 33, 35, 39, 44, + 47, 46, 46, 47, 50, 53, 56, 58, 60, 62, 65, 67, 38, 41, 44, 47, 49, 48, + 47, 48, 50, 53, 55, 58, 58, 60, 62, 65, 49, 46, 47, 49, 53, 53, 54, 54, + 56, 58, 60, 62, 62, 63, 64, 64, 48, 46, 46, 48, 53, 54, 56, 57, 59, 61, + 63, 65, 67, 66, 68, 68, 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72, + 71, 71, 72, 72, 52, 48, 47, 48, 54, 57, 63, 66, 69, 72, 75, 76, 75, 76, + 76, 76, 55, 51, 50, 50, 56, 59, 65, 69, 73, 77, 79, 81, 81, 81, 
80, 80, + 60, 55, 53, 53, 58, 61, 68, 72, 77, 82, 85, 87, 87, 85, 84, 85, 63, 58, + 56, 55, 60, 63, 70, 75, 79, 85, 89, 91, 91, 90, 89, 90, 66, 60, 58, 58, + 62, 65, 72, 76, 81, 87, 91, 94, 96, 95, 95, 95, 68, 63, 60, 58, 62, 67, + 71, 75, 81, 87, 91, 96, 99, 100, 100, 100, 70, 65, 62, 60, 63, 66, 71, + 76, 81, 85, 90, 95, 100, 103, 104, 105, 72, 68, 65, 62, 64, 68, 72, 76, + 80, 84, 89, 95, 100, 104, 107, 108, 74, 70, 67, 65, 64, 68, 72, 76, 80, + 85, 90, 95, 100, 105, 108, 111, + /* Size 32x32 */ + 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 55, 57, + 60, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 31, 32, + 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 53, 54, 57, 59, 60, 61, + 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 30, 31, 31, 32, 35, 39, 41, 42, + 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 55, 57, 58, 59, 60, 62, 63, 64, + 65, 67, 68, 69, 70, 71, 31, 32, 32, 33, 36, 40, 41, 43, 46, 46, 45, 45, + 46, 46, 47, 49, 50, 51, 54, 56, 57, 58, 59, 61, 62, 63, 63, 64, 65, 66, + 67, 68, 33, 34, 35, 36, 39, 43, 44, 45, 47, 46, 46, 45, 46, 47, 47, 49, + 50, 51, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 36, 38, + 39, 40, 43, 47, 47, 47, 48, 47, 46, 45, 46, 46, 47, 48, 49, 50, 52, 53, + 54, 55, 56, 58, 59, 61, 62, 63, 64, 65, 66, 66, 38, 40, 41, 41, 44, 47, + 47, 48, 49, 48, 48, 47, 47, 47, 48, 49, 50, 51, 53, 54, 55, 56, 58, 58, + 58, 59, 60, 61, 62, 64, 65, 66, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, + 49, 49, 50, 50, 50, 52, 52, 53, 55, 56, 57, 58, 59, 60, 61, 61, 61, 61, + 62, 63, 63, 64, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, + 54, 55, 56, 56, 58, 59, 60, 61, 62, 63, 62, 62, 63, 64, 64, 64, 64, 64, + 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 58, 58, + 60, 61, 62, 63, 64, 64, 64, 65, 65, 65, 65, 66, 67, 68, 48, 47, 46, 45, + 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 63, 63, 64, + 65, 66, 67, 66, 66, 67, 68, 68, 68, 68, 49, 47, 45, 45, 45, 45, 47, 49, + 53, 55, 55, 58, 59, 60, 61, 62, 63, 63, 65, 66, 67, 68, 69, 69, 68, 68, + 69, 69, 69, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 61, 63, 64, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 71, 71, 72, 72, + 72, 71, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, 64, 66, + 66, 67, 69, 70, 71, 72, 73, 73, 74, 73, 73, 74, 73, 73, 74, 75, 52, 50, + 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, + 75, 75, 76, 77, 75, 76, 76, 75, 76, 77, 76, 75, 54, 52, 50, 49, 49, 48, + 49, 52, 55, 57, 58, 62, 64, 66, 68, 71, 72, 73, 75, 77, 78, 79, 80, 78, + 79, 78, 77, 78, 78, 77, 78, 79, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, + 59, 63, 65, 66, 69, 72, 73, 74, 77, 78, 79, 80, 81, 81, 81, 80, 81, 80, + 80, 81, 80, 79, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67, + 70, 73, 74, 76, 79, 80, 82, 83, 84, 85, 83, 84, 83, 83, 83, 82, 82, 83, + 60, 57, 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79, + 82, 84, 85, 86, 87, 86, 87, 85, 85, 85, 84, 86, 85, 84, 62, 59, 57, 56, + 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, 87, 88, + 90, 89, 89, 88, 88, 87, 88, 87, 87, 88, 63, 60, 58, 57, 56, 54, 55, 57, + 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 91, 91, + 90, 91, 89, 90, 90, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61, 63, 64, 68, + 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 93, 94, 95, 92, 94, 92, 93, 92, + 91, 93, 66, 63, 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, + 81, 84, 87, 90, 91, 93, 94, 95, 96, 97, 95, 95, 95, 95, 95, 93, 67, 64, + 62, 61, 59, 
58, 58, 60, 63, 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, + 93, 94, 95, 97, 97, 98, 99, 97, 97, 97, 96, 98, 68, 65, 63, 62, 60, 59, + 58, 61, 62, 64, 67, 68, 71, 74, 75, 79, 81, 83, 87, 89, 91, 95, 96, 97, + 99, 98, 100, 100, 100, 99, 100, 98, 69, 66, 64, 63, 61, 61, 59, 61, 62, + 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101, + 100, 102, 102, 103, 101, 102, 70, 67, 65, 63, 62, 62, 60, 61, 63, 65, + 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103, + 102, 104, 104, 105, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69, + 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105, 104, + 106, 106, 108, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73, + 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107, 106, + 108, 108, 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77, + 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109, 108, + 110, 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, 78, 80, + 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, 111, 110, 75, + 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 84, + 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, 110, 110, 113, + /* Size 4x8 */ + 31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72, 52, 64, + 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102, + /* Size 8x4 */ + 31, 40, 46, 47, 52, 59, 63, 66, 47, 45, 55, 60, 64, 68, 66, 69, 57, 52, + 61, 70, 79, 87, 88, 85, 65, 61, 63, 72, 81, 90, 99, 102, + /* Size 8x16 */ + 32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65, 33, 41, + 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61, 49, 48, 53, 54, + 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65, 50, 46, 54, 61, 66, 70, + 71, 69, 52, 47, 54, 63, 71, 75, 75, 74, 55, 49, 56, 65, 74, 79, 79, 78, + 60, 53, 58, 68, 79, 85, 85, 82, 63, 55, 60, 70, 82, 89, 91, 87, 66, 58, + 62, 72, 84, 91, 95, 91, 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73, + 81, 89, 98, 101, 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, + 89, 98, 105, + /* Size 16x8 */ + 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 35, 38, + 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67, 48, 46, 47, 48, + 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65, 50, 46, 46, 47, 54, 56, + 61, 63, 65, 68, 70, 72, 71, 73, 72, 71, 57, 52, 51, 51, 57, 60, 66, 71, + 74, 79, 82, 84, 81, 81, 82, 79, 63, 58, 56, 55, 60, 64, 70, 75, 79, 85, + 89, 91, 94, 89, 92, 89, 68, 63, 60, 58, 61, 65, 71, 75, 79, 85, 91, 95, + 97, 98, 100, 98, 70, 65, 63, 61, 61, 65, 69, 74, 78, 82, 87, 91, 96, + 101, 103, 105, + /* Size 16x32 */ + 32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71, 31, 31, + 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67, 30, 32, 38, 40, + 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67, 31, 33, 38, 41, 46, 45, + 46, 48, 52, 55, 57, 60, 61, 62, 63, 64, 33, 36, 41, 44, 47, 46, 46, 47, + 51, 54, 56, 59, 60, 61, 63, 64, 37, 40, 45, 47, 47, 45, 46, 47, 50, 52, + 54, 57, 59, 61, 62, 62, 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, + 58, 59, 61, 62, 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59, + 60, 60, 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61, + 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64, 48, 46, + 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64, 49, 45, 45, 47, + 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67, 50, 46, 46, 48, 54, 59, + 61, 63, 66, 68, 70, 71, 71, 68, 69, 67, 51, 47, 47, 48, 54, 60, 61, 64, + 68, 70, 71, 73, 72, 72, 
70, 71, 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, + 75, 77, 75, 73, 74, 71, 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, + 78, 76, 74, 75, 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, + 78, 75, 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79, + 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80, 62, 56, + 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84, 63, 57, 55, 56, + 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84, 64, 59, 56, 57, 61, 68, + 71, 75, 83, 87, 90, 93, 92, 90, 89, 89, 66, 60, 58, 58, 62, 69, 72, 76, + 84, 88, 91, 94, 95, 93, 91, 89, 67, 61, 59, 58, 63, 68, 71, 78, 83, 86, + 93, 96, 96, 96, 94, 94, 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, + 97, 98, 96, 94, 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99, + 100, 98, 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101, + 99, 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104, + 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104, 73, + 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106, 74, 67, + 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106, 75, 68, 68, + 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109, + /* Size 32x16 */ + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, + 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 32, 33, + 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54, 56, 57, 59, + 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 35, 37, 38, 38, 41, 45, 46, 46, + 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, + 62, 64, 65, 66, 67, 68, 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47, + 48, 48, 48, 50, 50, 51, 53, 55, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63, + 64, 65, 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, + 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, 49, 47, + 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, + 67, 68, 69, 68, 67, 66, 66, 67, 68, 69, 70, 71, 50, 48, 46, 46, 46, 46, + 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, + 71, 72, 73, 73, 72, 72, 71, 71, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, + 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74, + 75, 76, 77, 78, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, + 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, + 61, 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79, + 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87, 63, 60, 58, 57, + 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, + 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 67, 63, 61, 60, 59, 57, 57, 60, + 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93, 94, 96, 95, 96, + 97, 95, 93, 93, 94, 96, 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67, + 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100, + 100, 98, 96, 69, 65, 64, 62, 61, 61, 59, 59, 62, 63, 65, 67, 68, 72, 73, + 76, 78, 81, 84, 85, 89, 90, 93, 96, 98, 99, 100, 102, 102, 102, 103, + 105, 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78, + 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, 71, + 67, 67, 64, 64, 62, 62, 60, 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80, + 84, 84, 89, 89, 94, 94, 98, 99, 104, 104, 106, 106, 109, + /* Size 4x16 */ + 31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59, 46, 53, + 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73, 51, 63, 77, 78, + 54, 65, 82, 84, 57, 67, 86, 
89, 60, 69, 88, 93, 62, 67, 86, 98, 64, 66, + 87, 100, 65, 68, 83, 102, 67, 70, 86, 103, + /* Size 16x4 */ + 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67, 49, 45, + 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70, 61, 55, 54, 54, + 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86, 69, 64, 61, 59, 62, 65, + 68, 73, 78, 84, 89, 93, 98, 100, 102, 103, + /* Size 8x32 */ + 32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66, 30, 38, + 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63, 33, 41, 47, 46, + 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62, 39, 46, 48, 47, 51, 55, + 58, 61, 42, 46, 50, 50, 53, 57, 60, 60, 49, 48, 53, 54, 57, 60, 61, 61, + 48, 47, 53, 55, 58, 62, 64, 63, 48, 46, 53, 56, 60, 64, 65, 65, 49, 45, + 53, 59, 64, 67, 67, 66, 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61, + 68, 71, 72, 70, 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78, + 78, 74, 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79, + 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84, 63, 55, + 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89, 66, 58, 62, 72, + 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94, 68, 60, 64, 71, 81, 94, + 97, 96, 69, 61, 65, 72, 80, 91, 99, 100, 70, 62, 65, 73, 81, 89, 98, + 101, 71, 64, 65, 73, 82, 90, 99, 103, 72, 65, 65, 72, 82, 92, 100, 103, + 73, 66, 65, 72, 81, 90, 100, 105, 74, 67, 65, 71, 79, 89, 98, 105, 75, + 68, 65, 71, 78, 87, 96, 105, + /* Size 32x8 */ + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57, + 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 35, 37, 38, 38, + 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, + 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 48, 47, 46, 46, 47, 47, 48, 50, + 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, + 65, 65, 65, 65, 65, 65, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72, + 71, 71, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73, + 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, 63, 60, + 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, + 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 68, 64, 63, 61, 60, 59, + 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, + 97, 99, 98, 99, 100, 100, 98, 96, 70, 66, 65, 63, 63, 62, 61, 60, 61, + 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, + 101, 103, 103, 105, 105, 105 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 38, 63, 86, 38, 56, 78, 97, 63, 78, 113, 130, 86, 97, 130, 169, + /* Size 8x8 */ + 32, 32, 35, 46, 57, 76, 85, 96, 32, 34, 37, 45, 54, 70, 79, 90, 35, 37, + 48, 56, 64, 79, 87, 93, 46, 45, 56, 70, 80, 96, 100, 105, 57, 54, 64, + 80, 93, 111, 121, 122, 76, 70, 79, 96, 111, 134, 138, 144, 85, 79, 87, + 100, 121, 138, 156, 168, 96, 90, 93, 105, 122, 144, 168, 184, + /* Size 16x16 */ + 32, 31, 31, 32, 34, 39, 44, 49, 58, 65, 71, 81, 87, 93, 98, 104, 31, 32, + 32, 32, 34, 38, 41, 46, 54, 60, 66, 75, 81, 86, 92, 98, 31, 32, 33, 34, + 36, 39, 42, 46, 53, 59, 64, 73, 78, 83, 88, 94, 32, 32, 34, 35, 37, 40, + 42, 46, 52, 58, 63, 71, 75, 80, 86, 92, 34, 34, 36, 37, 42, 47, 50, 53, + 59, 65, 70, 77, 82, 85, 89, 92, 39, 38, 39, 40, 47, 54, 58, 62, 68, 73, + 78, 85, 90, 90, 96, 98, 44, 41, 42, 42, 50, 58, 63, 68, 74, 79, 84, 91, + 96, 98, 102, 104, 49, 46, 46, 46, 53, 62, 68, 73, 81, 87, 92, 99, 103, + 107, 109, 112, 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 
114, + 118, 117, 121, 65, 60, 59, 58, 65, 73, 79, 87, 97, 105, 111, 120, 125, + 125, 126, 130, 71, 66, 64, 63, 70, 78, 84, 92, 102, 111, 117, 127, 133, + 134, 136, 141, 81, 75, 73, 71, 77, 85, 91, 99, 110, 120, 127, 137, 143, + 145, 148, 152, 87, 81, 78, 75, 82, 90, 96, 103, 114, 125, 133, 143, 150, + 156, 160, 163, 93, 86, 83, 80, 85, 90, 98, 107, 118, 125, 134, 145, 156, + 163, 169, 177, 98, 92, 88, 86, 89, 96, 102, 109, 117, 126, 136, 148, + 160, 169, 176, 184, 104, 98, 94, 92, 92, 98, 104, 112, 121, 130, 141, + 152, 163, 177, 184, 191, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 32, 32, 34, 34, 36, 39, 41, 44, 48, 49, 54, 58, 59, + 65, 69, 71, 80, 81, 83, 87, 90, 93, 95, 98, 101, 104, 107, 31, 32, 32, + 32, 32, 32, 32, 34, 34, 35, 38, 39, 42, 46, 47, 51, 55, 57, 62, 66, 68, + 76, 77, 78, 83, 85, 88, 90, 93, 96, 99, 101, 31, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 38, 39, 41, 45, 46, 50, 54, 55, 60, 64, 66, 73, 75, 76, 81, + 83, 86, 89, 92, 95, 98, 101, 31, 32, 32, 32, 32, 32, 32, 33, 34, 34, 37, + 38, 41, 44, 45, 49, 53, 54, 59, 63, 65, 72, 74, 75, 79, 81, 84, 86, 89, + 91, 94, 97, 31, 32, 32, 32, 33, 33, 34, 35, 36, 36, 39, 40, 42, 45, 46, + 50, 53, 54, 59, 63, 64, 71, 73, 74, 78, 80, 83, 85, 88, 91, 94, 97, 32, + 32, 32, 32, 33, 34, 34, 36, 36, 37, 40, 40, 42, 45, 46, 49, 53, 54, 58, + 62, 63, 70, 72, 73, 77, 79, 82, 85, 87, 90, 92, 95, 32, 32, 32, 32, 34, + 34, 35, 37, 37, 38, 40, 41, 42, 45, 46, 49, 52, 54, 58, 61, 63, 69, 71, + 72, 75, 78, 80, 83, 86, 89, 92, 95, 34, 34, 33, 33, 35, 36, 37, 39, 41, + 42, 45, 46, 47, 50, 51, 54, 57, 59, 63, 66, 68, 74, 75, 76, 80, 81, 82, + 83, 85, 87, 90, 93, 34, 34, 34, 34, 36, 36, 37, 41, 42, 45, 47, 48, 50, + 53, 53, 56, 59, 61, 65, 68, 70, 76, 77, 78, 82, 83, 85, 88, 89, 90, 92, + 93, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 50, 51, 54, 56, 57, 60, 63, + 64, 68, 71, 73, 79, 80, 81, 85, 87, 89, 89, 90, 93, 96, 99, 39, 38, 38, + 37, 39, 40, 40, 45, 47, 50, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78, + 84, 85, 86, 90, 89, 90, 93, 96, 97, 98, 99, 41, 39, 39, 38, 40, 40, 41, + 46, 48, 51, 55, 56, 59, 62, 63, 67, 70, 71, 75, 78, 80, 86, 87, 88, 91, + 93, 96, 97, 97, 99, 102, 105, 44, 42, 41, 41, 42, 42, 42, 47, 50, 54, + 58, 59, 63, 66, 68, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 98, 98, 99, + 102, 104, 104, 105, 48, 46, 45, 44, 45, 45, 45, 50, 53, 56, 61, 62, 66, + 70, 71, 76, 79, 80, 85, 88, 90, 96, 97, 98, 101, 100, 102, 105, 105, + 105, 109, 112, 49, 47, 46, 45, 46, 46, 46, 51, 53, 57, 62, 63, 68, 71, + 73, 77, 81, 82, 87, 90, 92, 98, 99, 100, 103, 106, 107, 106, 109, 112, + 112, 112, 54, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77, + 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 110, 109, 113, 114, 113, + 116, 120, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, + 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 114, 118, 116, 117, 121, + 121, 120, 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, + 87, 91, 93, 99, 102, 104, 111, 112, 113, 117, 121, 120, 122, 124, 122, + 125, 129, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87, + 92, 97, 99, 105, 109, 111, 118, 120, 121, 125, 124, 125, 127, 126, 130, + 130, 129, 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83, 88, 90, + 96, 100, 102, 109, 113, 115, 123, 125, 126, 129, 130, 131, 130, 134, + 133, 135, 139, 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90, + 92, 97, 102, 104, 111, 115, 117, 125, 127, 128, 133, 136, 134, 139, 136, + 139, 141, 140, 80, 76, 73, 72, 71, 70, 69, 74, 76, 79, 84, 86, 90, 96, + 98, 104, 109, 111, 118, 123, 125, 134, 136, 137, 
142, 138, 143, 140, + 144, 144, 144, 149, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91, + 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 139, 143, 148, 145, 148, + 148, 150, 152, 149, 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92, + 98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140, 145, 149, 153, + 153, 154, 155, 155, 161, 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91, + 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150, 151, + 156, 159, 160, 160, 163, 161, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, + 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151, + 156, 157, 162, 166, 168, 166, 172, 93, 88, 86, 84, 83, 82, 80, 82, 85, + 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, + 156, 157, 163, 164, 169, 172, 177, 172, 95, 90, 89, 86, 85, 85, 83, 83, + 88, 89, 93, 97, 99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148, + 153, 159, 162, 164, 169, 170, 176, 179, 185, 98, 93, 92, 89, 88, 87, 86, + 85, 89, 90, 96, 97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144, + 148, 154, 160, 166, 169, 170, 176, 177, 184, 186, 101, 96, 95, 91, 91, + 90, 89, 87, 90, 93, 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139, + 144, 150, 155, 160, 168, 172, 176, 177, 184, 185, 191, 104, 99, 98, 94, + 94, 92, 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135, + 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191, 192, 107, 101, + 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, + 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199, + /* Size 4x8 */ + 32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68, 85, 44, 61, 85, 101, 54, 69, + 98, 117, 72, 84, 118, 136, 82, 89, 129, 157, 92, 98, 127, 165, + /* Size 8x4 */ + 32, 32, 34, 44, 54, 72, 82, 92, 38, 40, 51, 61, 69, 84, 89, 98, 62, 58, + 68, 85, 98, 118, 129, 127, 86, 80, 85, 101, 117, 136, 157, 165, + /* Size 8x16 */ + 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54, 73, 81, 88, 32, 33, + 36, 42, 53, 71, 78, 84, 32, 34, 38, 42, 52, 69, 76, 82, 34, 36, 44, 50, + 59, 75, 81, 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63, 74, 90, + 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57, 53, 63, 74, 90, 108, 111, + 113, 65, 59, 68, 79, 97, 118, 123, 122, 71, 64, 73, 84, 102, 125, 135, + 131, 81, 72, 80, 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148, + 151, 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121, 141, + 160, 169, 103, 94, 92, 103, 119, 137, 158, 175, + /* Size 16x8 */ + 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103, 32, 32, + 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94, 36, 35, 36, 38, + 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92, 44, 41, 42, 42, 50, 58, + 63, 67, 74, 79, 84, 91, 96, 102, 103, 103, 58, 54, 53, 52, 59, 68, 74, + 81, 90, 97, 102, 110, 114, 117, 121, 119, 79, 73, 71, 69, 75, 84, 90, + 97, 108, 118, 125, 135, 140, 133, 141, 137, 88, 81, 78, 76, 81, 88, 97, + 104, 111, 123, 135, 145, 148, 153, 160, 158, 93, 88, 84, 82, 84, 90, 97, + 105, 113, 122, 131, 141, 151, 163, 169, 175, + /* Size 16x32 */ + 32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90, 93, 96, 31, 32, + 32, 32, 35, 38, 42, 51, 55, 62, 75, 77, 83, 86, 88, 91, 31, 32, 32, 32, + 35, 38, 41, 50, 54, 60, 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37, + 41, 49, 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39, 42, 50, + 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34, 37, 40, 42, 49, 53, 58, + 70, 71, 77, 80, 83, 85, 32, 33, 34, 35, 38, 40, 42, 49, 52, 58, 69, 70, + 76, 78, 82, 86, 34, 34, 35, 37, 42, 45, 48, 54, 57, 
63, 73, 75, 79, 79, + 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77, 81, 83, 84, 84, + 36, 34, 37, 38, 48, 51, 54, 60, 63, 68, 78, 80, 85, 85, 86, 89, 39, 37, + 39, 40, 50, 54, 58, 65, 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41, + 51, 55, 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43, 53, 58, + 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44, 45, 46, 56, 61, 66, 75, + 79, 85, 95, 97, 99, 101, 98, 102, 49, 46, 46, 47, 57, 62, 67, 77, 81, + 86, 97, 99, 104, 102, 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, + 103, 105, 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97, + 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87, 91, 98, + 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73, 79, 92, 97, 105, + 118, 119, 123, 123, 122, 119, 69, 63, 62, 62, 71, 76, 83, 96, 100, 109, + 122, 124, 127, 125, 125, 128, 71, 65, 64, 63, 73, 78, 84, 97, 102, 111, + 125, 127, 135, 134, 131, 129, 79, 72, 71, 70, 79, 84, 90, 104, 109, 118, + 133, 135, 137, 136, 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120, + 135, 137, 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121, + 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110, 114, 125, + 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89, 99, 108, 113, 129, + 135, 146, 153, 157, 160, 159, 92, 84, 83, 81, 88, 90, 102, 106, 117, + 128, 133, 150, 153, 158, 163, 160, 95, 87, 85, 83, 88, 92, 103, 105, + 120, 125, 137, 148, 155, 164, 168, 173, 98, 89, 88, 85, 89, 95, 103, + 108, 121, 124, 141, 144, 160, 164, 169, 174, 100, 92, 91, 88, 90, 98, + 103, 111, 120, 127, 139, 146, 161, 165, 175, 179, 103, 94, 94, 90, 92, + 101, 103, 114, 119, 131, 137, 150, 158, 170, 175, 180, 106, 97, 97, 93, + 93, 104, 104, 118, 118, 135, 135, 154, 155, 175, 176, 187, + /* Size 32x16 */ + 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, + 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 31, 32, 32, + 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54, 60, 63, 65, + 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97, 32, 32, 32, 32, 33, 34, 34, + 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77, + 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 40, + 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70, 71, 72, 76, 78, 81, 83, 85, + 88, 90, 93, 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, + 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, 39, + 38, 38, 37, 39, 40, 40, 45, 47, 51, 54, 55, 58, 61, 62, 65, 68, 69, 73, + 76, 78, 84, 85, 86, 90, 89, 90, 92, 95, 98, 101, 104, 44, 42, 41, 41, + 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, + 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, 53, 51, 50, 49, 50, 49, + 49, 54, 56, 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105, + 106, 110, 108, 106, 105, 108, 111, 114, 118, 58, 55, 54, 53, 53, 53, 52, + 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, + 114, 113, 117, 120, 121, 120, 119, 118, 65, 62, 60, 59, 59, 58, 58, 63, + 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111, 118, 120, 121, + 125, 129, 128, 125, 124, 127, 131, 135, 79, 75, 73, 72, 71, 70, 69, 73, + 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, + 140, 135, 133, 137, 141, 139, 137, 135, 81, 77, 75, 74, 72, 71, 70, 75, + 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 119, 124, 127, 135, 137, 139, + 143, 146, 150, 148, 144, 146, 150, 154, 88, 83, 81, 79, 78, 77, 76, 79, + 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 
145, + 147, 148, 153, 153, 155, 160, 161, 158, 155, 90, 86, 84, 82, 81, 80, 78, + 79, 83, 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136, + 143, 148, 154, 157, 158, 164, 164, 165, 170, 175, 93, 88, 88, 84, 84, + 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131, + 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, 96, 91, 91, 87, + 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110, 118, 119, 128, + 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187, + /* Size 4x16 */ + 31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33, 40, 58, 78, 34, 47, + 65, 83, 37, 54, 73, 89, 41, 58, 79, 94, 46, 62, 86, 102, 53, 68, 97, + 112, 60, 73, 105, 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125, + 154, 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170, + /* Size 16x4 */ + 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94, 39, 38, + 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101, 65, 60, 59, 58, + 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131, 90, 84, 81, 78, + 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164, 170, + /* Size 8x32 */ + 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75, 83, 88, 31, 32, + 35, 41, 54, 73, 81, 88, 31, 32, 34, 41, 53, 72, 79, 84, 32, 33, 36, 42, + 53, 71, 78, 84, 32, 34, 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69, + 76, 82, 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75, 81, 84, + 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58, 68, 84, 88, 90, 40, 40, + 51, 59, 70, 85, 91, 92, 44, 42, 53, 63, 74, 90, 97, 97, 47, 45, 56, 66, + 79, 95, 99, 98, 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86, + 103, 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75, 91, + 111, 119, 115, 65, 59, 68, 79, 97, 118, 123, 122, 69, 62, 71, 83, 100, + 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131, 79, 71, 79, 90, 109, + 133, 137, 136, 81, 72, 80, 91, 110, 135, 145, 141, 82, 73, 81, 92, 111, + 136, 147, 147, 87, 77, 85, 96, 114, 140, 148, 151, 90, 80, 87, 99, 113, + 135, 153, 160, 92, 83, 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, + 120, 137, 155, 168, 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, + 103, 120, 139, 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97, + 93, 104, 118, 135, 155, 176, + /* Size 32x8 */ + 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59, + 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 32, 32, 32, + 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, + 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 36, 35, 35, 34, 36, 37, 38, + 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, + 87, 88, 88, 89, 90, 92, 93, 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58, + 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103, + 103, 103, 103, 104, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, + 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120, + 121, 120, 119, 118, 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, + 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137, + 141, 139, 137, 135, 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, + 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, + 155, 160, 161, 158, 155, 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, + 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163, + 168, 169, 175, 175, 176 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 45, 53, 63, 45, 55, 62, 67, 53, 62, 80, 84, 63, 67, 84, 101, + /* Size 8x8 */ + 31, 36, 47, 48, 52, 60, 64, 67, 
36, 43, 47, 46, 49, 55, 59, 63, 47, 47, + 53, 54, 55, 60, 63, 64, 48, 46, 54, 61, 65, 70, 71, 71, 52, 49, 55, 65, + 71, 78, 81, 79, 60, 55, 60, 70, 78, 89, 89, 89, 64, 59, 63, 71, 81, 89, + 97, 99, 67, 63, 64, 71, 79, 89, 99, 104, + /* Size 16x16 */ + 32, 30, 33, 36, 44, 48, 49, 51, 54, 57, 60, 64, 67, 68, 70, 72, 30, 31, + 35, 39, 44, 46, 46, 47, 50, 53, 55, 59, 61, 64, 66, 68, 33, 35, 39, 43, + 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 36, 39, 43, 47, 47, 46, + 45, 46, 48, 50, 52, 55, 57, 58, 61, 63, 44, 44, 46, 47, 50, 51, 51, 51, + 53, 54, 56, 59, 61, 61, 63, 62, 48, 46, 46, 46, 51, 54, 55, 56, 58, 60, + 61, 64, 65, 64, 66, 66, 49, 46, 45, 45, 51, 55, 58, 60, 62, 63, 65, 68, + 69, 69, 69, 69, 51, 47, 47, 46, 51, 56, 60, 62, 65, 67, 69, 72, 73, 74, + 73, 73, 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 77, 77, + 57, 53, 51, 50, 54, 60, 63, 67, 73, 76, 79, 82, 84, 83, 82, 82, 60, 55, + 53, 52, 56, 61, 65, 69, 75, 79, 82, 86, 88, 87, 86, 87, 64, 59, 57, 55, + 59, 64, 68, 72, 78, 82, 86, 90, 93, 92, 91, 92, 67, 61, 59, 57, 61, 65, + 69, 73, 79, 84, 88, 93, 95, 96, 96, 96, 68, 64, 61, 58, 61, 64, 69, 74, + 79, 83, 87, 92, 96, 99, 100, 101, 70, 66, 63, 61, 63, 66, 69, 73, 77, + 82, 86, 91, 96, 100, 103, 104, 72, 68, 65, 63, 62, 66, 69, 73, 77, 82, + 87, 92, 96, 101, 104, 106, + /* Size 32x32 */ + 32, 31, 30, 30, 33, 35, 36, 41, 44, 49, 48, 48, 49, 50, 51, 52, 54, 55, + 57, 59, 60, 63, 64, 65, 67, 68, 68, 69, 70, 71, 72, 73, 31, 31, 31, 31, + 34, 36, 38, 42, 44, 47, 47, 47, 47, 48, 48, 50, 51, 52, 54, 56, 57, 60, + 61, 61, 63, 64, 65, 66, 67, 67, 68, 69, 30, 31, 31, 31, 35, 37, 39, 42, + 44, 47, 46, 46, 46, 47, 47, 48, 50, 51, 53, 54, 55, 58, 59, 60, 61, 63, + 64, 65, 66, 67, 68, 69, 30, 31, 31, 32, 35, 37, 40, 42, 44, 46, 45, 45, + 45, 46, 46, 47, 49, 50, 52, 53, 54, 57, 58, 58, 60, 61, 62, 63, 63, 64, + 65, 66, 33, 34, 35, 35, 39, 41, 43, 45, 46, 47, 46, 46, 45, 46, 47, 47, + 49, 49, 51, 53, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 35, 36, + 37, 37, 41, 43, 45, 46, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, + 53, 55, 56, 56, 58, 59, 60, 61, 62, 63, 64, 64, 36, 38, 39, 40, 43, 45, + 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 48, 48, 50, 51, 52, 54, 55, 55, + 57, 58, 58, 59, 61, 62, 63, 64, 41, 42, 42, 42, 45, 46, 47, 48, 49, 50, + 49, 49, 49, 50, 50, 50, 51, 52, 53, 54, 55, 57, 58, 58, 60, 60, 59, 59, + 60, 61, 61, 62, 44, 44, 44, 44, 46, 46, 47, 49, 50, 51, 51, 51, 51, 51, + 51, 52, 53, 53, 54, 56, 56, 59, 59, 59, 61, 61, 61, 62, 63, 62, 62, 62, + 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, + 56, 58, 58, 60, 61, 61, 63, 63, 64, 63, 63, 64, 65, 66, 48, 47, 46, 45, + 46, 46, 46, 49, 51, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 61, 63, + 64, 64, 65, 65, 64, 65, 66, 66, 66, 66, 48, 47, 46, 45, 46, 46, 46, 49, + 51, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 63, 65, 65, 65, 66, 67, + 68, 67, 67, 67, 68, 69, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56, + 58, 59, 60, 61, 62, 62, 63, 65, 65, 67, 68, 68, 69, 70, 69, 69, 69, 70, + 69, 69, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61, 62, 63, + 64, 65, 66, 68, 68, 70, 71, 71, 72, 71, 71, 72, 71, 71, 71, 72, 51, 48, + 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62, 64, 65, 66, 67, 69, + 69, 71, 72, 72, 73, 74, 74, 72, 73, 74, 73, 73, 52, 50, 48, 47, 47, 47, + 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, + 77, 76, 75, 76, 76, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, + 58, 59, 62, 64, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 78, + 
77, 78, 77, 77, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65, + 66, 68, 70, 71, 73, 75, 76, 78, 79, 79, 80, 81, 80, 80, 81, 79, 79, 81, + 57, 54, 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73, + 76, 78, 79, 82, 82, 83, 84, 83, 83, 83, 82, 83, 82, 81, 59, 56, 54, 53, + 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, 81, 84, + 85, 85, 86, 86, 86, 84, 85, 84, 84, 85, 60, 57, 55, 54, 53, 53, 52, 55, + 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82, 85, 86, 86, 88, 88, + 87, 88, 86, 87, 87, 85, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 63, 65, + 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 92, 89, 91, 89, 90, 89, + 88, 89, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, + 78, 79, 82, 85, 86, 89, 90, 91, 93, 94, 92, 92, 91, 91, 92, 90, 65, 61, + 60, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, + 86, 90, 91, 91, 93, 94, 95, 94, 94, 94, 93, 94, 67, 63, 61, 60, 59, 58, + 57, 60, 61, 63, 65, 66, 69, 72, 73, 77, 79, 80, 84, 86, 88, 92, 93, 93, + 95, 95, 96, 97, 96, 95, 96, 94, 68, 64, 63, 61, 60, 59, 58, 60, 61, 63, + 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, 88, 89, 94, 94, 95, 97, 97, 98, + 99, 99, 97, 99, 68, 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71, + 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, 96, 97, 99, 99, 100, 100, 101, + 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78, + 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, 101, 100, 102, 102, 104, 70, 67, + 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, + 86, 90, 91, 94, 96, 99, 100, 100, 103, 102, 104, 104, 71, 67, 67, 64, + 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83, 84, 87, 89, + 91, 94, 95, 99, 100, 102, 102, 104, 104, 106, 72, 68, 68, 65, 65, 64, + 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82, 84, 87, 88, 92, 93, + 96, 97, 101, 102, 104, 104, 106, 106, 73, 69, 69, 66, 66, 64, 64, 62, + 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, + 99, 104, 104, 106, 106, 108, + /* Size 4x8 */ + 31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71, 50, 59, + 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99, + /* Size 8x4 */ + 31, 38, 46, 46, 50, 57, 61, 65, 47, 46, 53, 56, 59, 64, 65, 67, 54, 50, + 57, 66, 74, 82, 85, 82, 64, 60, 62, 71, 79, 88, 97, 99, + /* Size 8x16 */ + 32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65, 33, 40, + 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60, 44, 46, 51, 51, + 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64, 49, 45, 53, 58, 62, 67, + 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 54, 49, 55, 62, 70, 77, 77, 76, + 57, 51, 56, 64, 73, 82, 83, 81, 60, 53, 58, 65, 75, 85, 89, 85, 64, 57, + 61, 68, 78, 89, 93, 89, 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71, + 79, 87, 96, 98, 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, + 95, 102, + /* Size 16x8 */ + 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72, 34, 36, + 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 48, 46, 47, 47, + 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63, 49, 46, 46, 45, 51, 56, + 58, 60, 62, 64, 65, 68, 69, 71, 70, 69, 54, 50, 49, 48, 53, 58, 62, 65, + 70, 73, 75, 78, 79, 79, 80, 77, 63, 58, 56, 54, 59, 64, 67, 71, 77, 82, + 85, 89, 91, 87, 89, 86, 67, 62, 59, 57, 60, 64, 70, 73, 77, 83, 89, 93, + 94, 96, 97, 95, 69, 65, 62, 60, 61, 64, 68, 72, 76, 81, 85, 89, 93, 98, + 100, 102, + /* Size 16x32 */ + 32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69, 31, 31, + 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 
64, 65, 66, 31, 32, 36, 39, + 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66, 30, 32, 36, 40, 46, 45, + 45, 48, 49, 52, 57, 58, 60, 61, 62, 63, 33, 36, 40, 43, 47, 46, 46, 47, + 49, 51, 56, 57, 59, 60, 62, 63, 35, 38, 42, 45, 47, 46, 45, 47, 48, 50, + 55, 56, 58, 60, 61, 61, 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, + 57, 58, 60, 61, 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58, + 59, 59, 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60, + 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63, 48, 46, + 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63, 48, 45, 46, 46, + 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66, 49, 45, 45, 46, 53, 56, + 58, 61, 62, 64, 67, 68, 70, 67, 68, 66, 50, 46, 46, 46, 54, 56, 59, 63, + 65, 66, 70, 71, 70, 71, 68, 70, 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, + 71, 72, 73, 71, 72, 70, 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, + 76, 75, 73, 73, 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, + 76, 74, 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78, + 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78, 59, 54, + 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82, 60, 54, 53, 52, + 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82, 63, 57, 56, 55, 60, 64, + 67, 75, 77, 82, 89, 90, 90, 88, 87, 86, 64, 58, 57, 55, 61, 64, 68, 75, + 78, 82, 89, 90, 93, 91, 89, 87, 64, 59, 57, 56, 61, 65, 68, 75, 78, 83, + 90, 91, 94, 93, 92, 91, 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, + 94, 95, 93, 91, 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97, + 97, 95, 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96, + 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101, 70, + 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101, 71, 65, + 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103, 72, 65, 65, + 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103, 73, 66, 66, 63, + 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105, + /* Size 32x16 */ + 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, + 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 31, 31, 32, 32, + 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52, 54, 54, 57, + 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 34, 35, 36, 36, 40, 42, 44, 45, + 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60, + 61, 62, 63, 64, 65, 66, 37, 38, 39, 40, 43, 45, 47, 47, 47, 48, 47, 46, + 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55, 56, 57, 58, 59, 60, 60, 61, + 62, 63, 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, + 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, 48, 47, + 46, 45, 46, 46, 46, 50, 51, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61, + 62, 64, 64, 65, 66, 65, 64, 65, 66, 67, 68, 69, 49, 47, 46, 45, 46, 45, + 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, + 69, 70, 71, 71, 70, 70, 69, 69, 52, 50, 48, 48, 47, 47, 47, 50, 52, 54, + 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72, + 73, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, + 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, + 57, 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74, + 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84, 63, 60, 58, 57, + 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, + 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 64, 61, 59, 58, 57, 56, 55, 58, + 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90, 
90, 91, 93, 93, + 94, 93, 90, 90, 92, 93, 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66, + 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97, + 95, 93, 68, 64, 63, 61, 60, 60, 58, 58, 61, 62, 64, 66, 67, 71, 71, 75, + 77, 79, 82, 83, 87, 88, 91, 93, 95, 97, 97, 99, 99, 99, 100, 101, 69, + 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81, + 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, 69, 66, 66, 63, + 63, 61, 61, 59, 60, 63, 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86, + 87, 91, 91, 95, 96, 101, 101, 103, 103, 105, + /* Size 4x16 */ + 31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58, 44, 51, + 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71, 49, 58, 73, 77, + 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91, 60, 66, 84, 95, 62, 64, + 84, 97, 64, 66, 81, 99, 65, 68, 83, 100, + /* Size 16x4 */ + 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65, 48, 46, + 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68, 57, 53, 51, 50, + 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83, 68, 63, 60, 58, 61, 64, + 67, 71, 77, 82, 87, 91, 95, 97, 99, 100, + /* Size 8x32 */ + 32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65, 31, 36, + 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62, 33, 40, 47, 46, + 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61, 37, 44, 47, 45, 48, 54, + 57, 60, 42, 45, 50, 49, 51, 57, 59, 59, 44, 46, 51, 51, 53, 59, 60, 61, + 49, 47, 53, 53, 55, 60, 63, 62, 48, 46, 53, 56, 58, 64, 64, 64, 48, 46, + 53, 56, 59, 65, 66, 65, 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59, + 65, 70, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75, + 76, 73, 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77, + 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82, 60, 53, + 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87, 64, 57, 61, 68, + 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92, 66, 59, 63, 69, 79, 91, + 94, 93, 67, 60, 63, 70, 78, 88, 96, 97, 68, 61, 63, 71, 79, 87, 96, 98, + 69, 62, 63, 71, 80, 88, 96, 100, 70, 63, 63, 70, 80, 89, 97, 100, 71, + 64, 63, 70, 78, 88, 97, 102, 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, + 63, 69, 76, 84, 93, 101, + /* Size 32x8 */ + 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54, + 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 34, 35, 36, 36, + 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, + 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 48, 47, 46, 46, 47, 47, 47, 50, + 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, + 63, 63, 63, 63, 63, 63, 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56, + 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70, + 69, 69, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68, + 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, 63, 60, + 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, + 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 67, 63, 62, 60, 59, 58, + 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, + 94, 96, 96, 96, 97, 97, 95, 93, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, + 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, + 100, 102, 102, 101 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 37, 58, 81, 37, 54, 72, 91, 58, 72, 102, 121, 81, 91, 121, 156, + /* Size 8x8 */ + 32, 32, 35, 42, 53, 68, 78, 90, 32, 33, 36, 42, 51, 64, 74, 84, 35, 36, + 46, 52, 60, 72, 80, 87, 42, 42, 52, 63, 73, 84, 
92, 98, 53, 51, 60, 73, + 86, 100, 109, 114, 68, 64, 72, 84, 100, 117, 128, 133, 78, 74, 80, 92, + 109, 128, 140, 155, 90, 84, 87, 98, 114, 133, 155, 168, + /* Size 16x16 */ + 32, 31, 31, 32, 34, 36, 41, 47, 54, 59, 65, 74, 82, 87, 92, 97, 31, 32, + 32, 32, 34, 35, 39, 45, 50, 55, 61, 69, 76, 81, 87, 92, 31, 32, 33, 33, + 35, 36, 40, 44, 49, 54, 59, 67, 73, 78, 83, 88, 32, 32, 33, 35, 37, 38, + 41, 45, 49, 53, 58, 65, 71, 75, 80, 86, 34, 34, 35, 37, 39, 42, 46, 50, + 54, 58, 63, 70, 76, 80, 84, 85, 36, 35, 36, 38, 42, 48, 52, 56, 60, 64, + 68, 75, 80, 85, 90, 91, 41, 39, 40, 41, 46, 52, 57, 62, 67, 71, 75, 83, + 88, 92, 95, 97, 47, 45, 44, 45, 50, 56, 62, 69, 75, 79, 84, 91, 97, 100, + 102, 104, 54, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 106, 110, + 109, 112, 59, 55, 54, 53, 58, 64, 71, 79, 87, 92, 98, 106, 112, 117, + 117, 121, 65, 61, 59, 58, 63, 68, 75, 84, 92, 98, 105, 114, 120, 125, + 126, 130, 74, 69, 67, 65, 70, 75, 83, 91, 100, 106, 114, 123, 131, 135, + 137, 140, 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, + 148, 150, 87, 81, 78, 75, 80, 85, 92, 100, 110, 117, 125, 135, 144, 150, + 155, 162, 92, 87, 83, 80, 84, 90, 95, 102, 109, 117, 126, 137, 148, 155, + 162, 168, 97, 92, 88, 86, 85, 91, 97, 104, 112, 121, 130, 140, 150, 162, + 168, 174, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 54, 56, + 59, 64, 65, 71, 74, 80, 82, 83, 87, 90, 92, 95, 97, 100, 31, 32, 32, 32, + 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 51, 53, 56, 61, 62, 68, + 71, 76, 78, 78, 83, 85, 88, 90, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 35, 38, 39, 42, 45, 45, 50, 52, 55, 60, 61, 67, 69, 74, 76, 77, + 81, 84, 87, 89, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, + 38, 41, 44, 44, 49, 51, 54, 58, 59, 65, 68, 72, 74, 75, 79, 81, 84, 86, + 88, 90, 31, 32, 32, 32, 33, 33, 33, 34, 35, 36, 36, 39, 40, 42, 44, 45, + 49, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 80, 83, 85, 88, 90, 31, 32, + 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 45, 45, 50, 51, 54, 58, + 59, 64, 67, 71, 73, 74, 78, 80, 82, 84, 86, 89, 32, 32, 32, 32, 33, 34, + 35, 36, 37, 38, 38, 40, 41, 42, 45, 46, 49, 51, 53, 57, 58, 63, 65, 69, + 71, 72, 75, 78, 80, 83, 86, 89, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, + 40, 42, 43, 44, 47, 47, 51, 53, 55, 59, 60, 65, 67, 71, 73, 73, 77, 78, + 80, 82, 84, 86, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, + 50, 51, 54, 56, 58, 62, 63, 68, 70, 74, 76, 76, 80, 82, 84, 85, 85, 86, + 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 59, 60, + 62, 66, 67, 72, 74, 78, 79, 80, 83, 84, 85, 87, 90, 92, 36, 35, 35, 34, + 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 60, 61, 64, 67, 68, 73, + 75, 79, 80, 81, 85, 87, 90, 91, 91, 92, 39, 38, 38, 37, 39, 39, 40, 42, + 45, 49, 50, 54, 55, 58, 60, 61, 65, 66, 69, 72, 73, 78, 80, 84, 86, 86, + 90, 91, 91, 92, 95, 97, 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55, + 57, 60, 62, 63, 67, 69, 71, 75, 75, 80, 83, 86, 88, 89, 92, 93, 95, 97, + 97, 98, 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63, 66, 67, + 71, 73, 75, 79, 79, 84, 86, 90, 92, 92, 96, 98, 98, 98, 101, 104, 47, + 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79, + 83, 84, 89, 91, 95, 97, 97, 100, 99, 102, 105, 104, 104, 48, 46, 45, 44, + 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 76, 78, 80, 84, 85, 90, + 93, 96, 98, 99, 102, 106, 106, 105, 108, 111, 54, 51, 50, 49, 49, 50, + 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, + 104, 106, 106, 110, 108, 
109, 112, 112, 111, 56, 53, 52, 51, 51, 51, 51, + 53, 56, 60, 61, 66, 69, 73, 77, 78, 84, 86, 89, 93, 94, 100, 102, 106, + 108, 109, 112, 113, 115, 114, 116, 119, 59, 56, 55, 54, 54, 54, 53, 55, + 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92, 97, 98, 103, 106, 110, 112, + 113, 117, 118, 117, 121, 121, 119, 64, 61, 60, 58, 58, 58, 57, 59, 62, + 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, 103, 109, 112, 116, 118, + 119, 122, 121, 125, 123, 125, 128, 65, 62, 61, 59, 59, 59, 58, 60, 63, + 67, 68, 73, 75, 79, 84, 85, 92, 94, 98, 103, 105, 111, 114, 118, 120, + 121, 125, 129, 126, 129, 130, 129, 71, 68, 67, 65, 64, 64, 63, 65, 68, + 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, 117, 120, 125, 127, + 128, 133, 130, 134, 133, 133, 137, 74, 71, 69, 68, 67, 67, 65, 67, 70, + 74, 75, 80, 83, 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 128, 131, + 131, 135, 137, 137, 138, 140, 137, 80, 76, 74, 72, 71, 71, 69, 71, 74, + 78, 79, 84, 86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134, 136, + 137, 142, 141, 142, 143, 143, 147, 82, 78, 76, 74, 73, 73, 71, 73, 76, + 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, + 139, 144, 147, 148, 147, 150, 148, 83, 78, 77, 75, 74, 74, 72, 73, 76, + 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139, + 140, 145, 150, 152, 155, 152, 157, 87, 83, 81, 79, 78, 78, 75, 77, 80, + 83, 85, 90, 92, 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, + 144, 145, 150, 151, 155, 158, 162, 158, 90, 85, 84, 81, 80, 80, 78, 78, + 82, 84, 87, 91, 93, 98, 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, + 147, 150, 151, 156, 156, 161, 164, 169, 92, 88, 87, 84, 83, 82, 80, 80, + 84, 85, 90, 91, 95, 98, 102, 106, 109, 115, 117, 125, 126, 134, 137, + 142, 148, 152, 155, 156, 162, 162, 168, 170, 95, 90, 89, 86, 85, 84, 83, + 82, 85, 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, + 143, 147, 155, 158, 161, 162, 168, 168, 174, 97, 92, 92, 88, 88, 86, 86, + 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, + 140, 143, 150, 152, 162, 164, 168, 168, 174, 175, 100, 95, 95, 90, 90, + 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, + 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181, + /* Size 4x8 */ + 32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65, 82, 41, 53, 78, 97, 51, 61, + 92, 111, 65, 73, 108, 129, 75, 81, 117, 148, 86, 92, 119, 154, + /* Size 8x4 */ + 32, 32, 34, 41, 51, 65, 75, 86, 35, 36, 47, 53, 61, 73, 81, 92, 59, 57, + 65, 78, 92, 108, 117, 119, 83, 78, 82, 97, 111, 129, 148, 154, + /* Size 8x16 */ + 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76, 85, 31, 33, + 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58, 71, 79, 34, 35, 41, 48, + 54, 63, 76, 81, 36, 36, 46, 54, 60, 68, 80, 87, 41, 40, 49, 60, 67, 76, + 88, 93, 47, 44, 53, 66, 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106, + 108, 58, 54, 61, 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120, + 124, 74, 67, 73, 86, 100, 113, 131, 134, 82, 73, 79, 92, 105, 120, 139, + 142, 87, 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150, + 157, 97, 88, 86, 97, 111, 128, 147, 163, + /* Size 16x8 */ + 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97, 31, 32, + 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88, 35, 34, 35, 37, + 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86, 44, 41, 42, 42, 48, 54, + 60, 66, 71, 75, 79, 86, 92, 96, 97, 97, 53, 50, 49, 49, 54, 60, 67, 75, + 82, 87, 92, 100, 105, 110, 114, 111, 65, 61, 59, 58, 63, 68, 76, 84, 92, + 98, 105, 113, 120, 125, 132, 128, 82, 
76, 73, 71, 76, 80, 88, 97, 106, + 112, 120, 131, 139, 144, 150, 147, 90, 85, 81, 79, 81, 87, 93, 101, 108, + 116, 124, 134, 142, 153, 157, 163, + /* Size 16x32 */ + 32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88, 90, 93, 31, 32, + 32, 32, 35, 35, 42, 45, 51, 59, 62, 75, 78, 83, 86, 88, 31, 32, 32, 32, + 34, 35, 41, 45, 50, 58, 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34, + 41, 44, 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36, 42, 44, + 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34, 36, 36, 42, 45, 50, 57, + 59, 71, 73, 78, 80, 82, 32, 33, 34, 35, 37, 38, 42, 45, 49, 56, 58, 69, + 71, 76, 79, 83, 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76, + 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73, 76, 81, 81, 80, + 35, 34, 36, 38, 45, 47, 52, 55, 59, 65, 67, 77, 79, 82, 83, 86, 36, 34, + 36, 38, 46, 48, 54, 56, 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40, + 48, 50, 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41, 49, 51, + 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41, 42, 43, 51, 53, 63, 66, + 71, 78, 79, 90, 92, 97, 94, 97, 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, + 84, 95, 97, 98, 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96, + 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90, 92, 103, 106, + 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77, 84, 92, 94, 106, 108, + 111, 110, 112, 58, 54, 54, 54, 61, 63, 75, 79, 87, 95, 98, 110, 112, + 117, 116, 113, 63, 58, 58, 57, 65, 67, 78, 83, 91, 100, 103, 116, 118, + 119, 119, 121, 65, 60, 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120, + 127, 124, 122, 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127, + 129, 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128, 131, + 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115, 118, 133, 136, + 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97, 105, 117, 120, 136, 139, + 145, 142, 140, 82, 75, 74, 72, 79, 81, 92, 97, 106, 117, 121, 136, 139, + 148, 150, 149, 87, 79, 78, 76, 83, 85, 96, 100, 110, 120, 125, 141, 144, + 148, 153, 150, 89, 82, 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, + 153, 157, 161, 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, + 153, 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136, 151, + 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123, 128, 140, 147, + 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110, 110, 126, 126, 144, 144, + 163, 163, 173, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, + 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32, + 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54, 58, 60, 65, + 68, 72, 75, 75, 79, 82, 84, 86, 88, 91, 31, 32, 32, 32, 33, 33, 34, 34, + 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74, + 78, 81, 83, 85, 88, 91, 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, + 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66, 70, 72, 72, 76, 78, 80, 82, + 85, 87, 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, + 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, 36, 35, + 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 60, 61, 63, 67, + 68, 73, 75, 79, 81, 81, 85, 87, 89, 92, 94, 97, 44, 42, 41, 41, 42, 42, + 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, + 92, 92, 96, 97, 97, 97, 97, 97, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, + 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99, + 101, 104, 107, 110, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, + 71, 75, 76, 82, 84, 87, 
91, 92, 97, 100, 104, 105, 106, 110, 113, 114, + 112, 111, 110, 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74, 78, + 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116, + 119, 123, 126, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, + 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, + 130, 128, 126, 79, 75, 74, 72, 71, 71, 69, 71, 73, 77, 78, 84, 86, 90, + 95, 96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135, + 136, 140, 144, 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, + 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, + 151, 147, 144, 88, 83, 82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97, + 98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153, + 153, 154, 159, 163, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, + 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, + 157, 157, 163, 163, 163, 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91, + 91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150, + 161, 162, 166, 167, 173, + /* Size 4x16 */ + 31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33, 38, 56, 76, 34, 42, + 61, 81, 34, 48, 66, 85, 39, 51, 74, 91, 44, 56, 82, 98, 49, 60, 90, 107, + 54, 63, 95, 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145, + 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159, + /* Size 16x4 */ + 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88, 36, 35, + 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94, 62, 58, 57, 56, + 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123, 88, 82, 79, 76, + 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159, + /* Size 8x32 */ + 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62, 78, 86, 31, 32, + 34, 41, 50, 61, 76, 85, 31, 32, 34, 41, 49, 59, 74, 82, 31, 33, 35, 42, + 49, 59, 73, 81, 32, 33, 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58, + 71, 79, 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63, 76, 81, + 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54, 60, 68, 80, 87, 39, 39, + 48, 58, 65, 73, 86, 88, 41, 40, 49, 60, 67, 76, 88, 93, 44, 42, 51, 63, + 71, 79, 92, 94, 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85, + 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72, 84, 94, 108, + 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58, 65, 78, 91, 103, 118, + 119, 65, 59, 66, 79, 92, 105, 120, 124, 71, 64, 71, 84, 97, 111, 127, + 129, 74, 67, 73, 86, 100, 113, 131, 134, 79, 71, 77, 90, 104, 118, 136, + 139, 82, 73, 79, 92, 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, + 150, 87, 78, 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, + 157, 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130, 151, + 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97, 110, 126, 144, + 163, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55, + 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32, + 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, + 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 35, 35, 34, 34, 35, 36, 37, 39, + 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, + 83, 83, 84, 85, 86, 87, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, + 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97, + 97, 97, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, + 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111, + 110, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 
73, 76, 79, 84, 85, 92, + 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126, + 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, + 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, + 144, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, + 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, + 163, 163 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 45, 51, 61, 45, 54, 59, 65, 51, 59, 75, 81, 61, 65, 81, 97, + /* Size 8x8 */ + 31, 34, 46, 47, 50, 57, 61, 65, 34, 39, 47, 45, 48, 53, 57, 61, 46, 47, + 52, 52, 54, 58, 61, 62, 47, 45, 52, 58, 62, 65, 68, 68, 50, 48, 54, 62, + 68, 73, 77, 76, 57, 53, 58, 65, 73, 82, 86, 86, 61, 57, 61, 68, 77, 86, + 91, 95, 65, 61, 62, 68, 76, 86, 95, 100, + /* Size 16x16 */ + 32, 31, 33, 36, 41, 49, 49, 50, 52, 54, 57, 61, 64, 67, 68, 70, 31, 31, + 34, 39, 42, 47, 46, 47, 49, 51, 53, 57, 60, 62, 64, 66, 33, 34, 37, 42, + 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 36, 39, 42, 47, 47, 48, + 46, 46, 47, 48, 50, 53, 55, 57, 59, 61, 41, 42, 44, 47, 48, 50, 49, 50, + 50, 52, 53, 56, 58, 60, 61, 60, 49, 47, 47, 48, 50, 53, 53, 54, 54, 55, + 56, 59, 61, 63, 64, 64, 49, 46, 46, 46, 49, 53, 55, 57, 59, 60, 61, 64, + 66, 67, 67, 67, 50, 47, 46, 46, 50, 54, 57, 61, 63, 64, 66, 69, 70, 72, + 71, 71, 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 75, 75, + 54, 51, 49, 48, 52, 55, 60, 64, 68, 71, 73, 76, 79, 80, 79, 79, 57, 53, + 51, 50, 53, 56, 61, 66, 70, 73, 76, 80, 82, 84, 83, 84, 61, 57, 55, 53, + 56, 59, 64, 69, 73, 76, 80, 84, 87, 89, 88, 88, 64, 60, 57, 55, 58, 61, + 66, 70, 75, 79, 82, 87, 91, 93, 93, 93, 67, 62, 59, 57, 60, 63, 67, 72, + 77, 80, 84, 89, 93, 95, 96, 97, 68, 64, 61, 59, 61, 64, 67, 71, 75, 79, + 83, 88, 93, 96, 99, 100, 70, 66, 63, 61, 60, 64, 67, 71, 75, 79, 84, 88, + 93, 97, 100, 102, + /* Size 32x32 */ + 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 52, 53, + 54, 56, 57, 60, 61, 63, 64, 65, 67, 67, 68, 69, 70, 71, 31, 31, 31, 31, + 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 50, 50, 52, 54, 54, 57, + 58, 60, 61, 61, 63, 64, 65, 65, 66, 67, 31, 31, 31, 31, 34, 35, 39, 40, + 42, 46, 47, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 56, 57, 59, 60, 60, + 62, 63, 64, 65, 66, 67, 30, 31, 31, 32, 34, 35, 40, 41, 42, 45, 46, 45, + 45, 45, 46, 46, 47, 48, 49, 51, 52, 54, 55, 57, 58, 58, 60, 61, 62, 62, + 63, 64, 33, 34, 34, 34, 37, 38, 42, 43, 44, 46, 47, 46, 46, 45, 46, 46, + 47, 48, 49, 51, 51, 53, 55, 56, 57, 57, 59, 60, 61, 62, 63, 64, 33, 34, + 35, 35, 38, 39, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, + 51, 53, 54, 56, 57, 57, 59, 60, 60, 61, 62, 62, 36, 38, 39, 40, 42, 43, + 47, 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 47, 48, 49, 50, 52, 53, 54, + 55, 55, 57, 58, 59, 60, 61, 62, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, + 49, 48, 47, 47, 47, 47, 48, 49, 49, 51, 51, 53, 54, 55, 56, 56, 58, 58, + 58, 59, 60, 60, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, + 50, 50, 50, 51, 52, 53, 53, 55, 56, 57, 58, 58, 60, 61, 61, 61, 60, 60, + 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 53, 54, + 55, 55, 56, 58, 58, 60, 60, 61, 62, 61, 61, 62, 63, 64, 49, 47, 47, 46, + 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58, + 59, 60, 61, 61, 63, 63, 64, 64, 64, 64, 48, 47, 46, 45, 46, 46, 46, 48, + 49, 52, 53, 54, 55, 55, 56, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64, 64, + 66, 65, 65, 65, 66, 67, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55, + 55, 57, 57, 58, 59, 59, 60, 61, 61, 63, 64, 
65, 66, 66, 67, 67, 67, 68, + 67, 67, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, 59, 60, + 61, 62, 62, 63, 63, 65, 66, 67, 68, 68, 69, 70, 69, 68, 69, 70, 50, 48, + 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 64, 66, + 66, 68, 69, 70, 70, 71, 72, 70, 71, 72, 71, 70, 50, 48, 47, 46, 46, 46, + 46, 47, 50, 53, 54, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 68, 69, 71, + 71, 71, 73, 74, 73, 72, 73, 74, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, + 54, 57, 59, 61, 63, 63, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 75, + 75, 76, 75, 74, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62, + 64, 64, 67, 68, 69, 71, 71, 73, 74, 76, 77, 77, 78, 78, 78, 76, 77, 78, + 54, 52, 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69, + 71, 73, 73, 75, 76, 78, 79, 79, 80, 80, 79, 80, 79, 78, 56, 54, 53, 51, + 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, 76, 78, + 79, 81, 82, 82, 83, 81, 83, 81, 81, 82, 57, 54, 53, 52, 51, 51, 50, 51, + 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76, 79, 80, 82, 82, 83, + 84, 85, 83, 84, 84, 82, 60, 57, 56, 54, 53, 53, 52, 53, 55, 58, 58, 61, + 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 83, 85, 86, 86, 88, 86, 87, 86, + 85, 86, 61, 58, 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, + 73, 74, 76, 79, 80, 83, 84, 86, 87, 88, 89, 89, 88, 88, 88, 86, 63, 60, + 59, 57, 56, 56, 54, 55, 57, 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, + 82, 85, 86, 89, 90, 90, 92, 91, 91, 90, 89, 91, 64, 61, 60, 58, 57, 57, + 55, 56, 58, 60, 61, 64, 66, 68, 70, 71, 75, 77, 79, 82, 82, 86, 87, 90, + 91, 91, 93, 93, 93, 92, 93, 91, 65, 61, 60, 58, 57, 57, 55, 56, 58, 61, + 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 88, 90, 91, 91, 93, 94, + 95, 95, 93, 95, 67, 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69, + 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, 93, 93, 95, 95, 96, 96, 97, 95, + 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78, + 80, 81, 85, 86, 89, 91, 93, 94, 95, 97, 97, 98, 98, 100, 68, 65, 64, 62, + 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83, 87, + 88, 91, 93, 95, 96, 97, 99, 98, 100, 100, 69, 65, 65, 62, 62, 61, 60, + 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, + 95, 96, 98, 98, 100, 100, 101, 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, + 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, + 100, 100, 102, 101, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, + 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, 101, + 101, 104, + /* Size 4x8 */ + 31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70, 49, 55, + 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95, + /* Size 8x4 */ + 31, 36, 46, 45, 49, 54, 59, 63, 47, 47, 52, 53, 55, 58, 61, 65, 53, 50, + 55, 63, 71, 77, 81, 80, 63, 59, 61, 70, 77, 86, 94, 95, + /* Size 8x16 */ + 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64, 33, 37, + 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59, 42, 44, 49, 49, + 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 57, 59, 61, + 66, 67, 50, 46, 52, 59, 63, 66, 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, + 54, 49, 54, 62, 68, 73, 79, 79, 57, 51, 55, 64, 70, 76, 83, 83, 61, 55, + 58, 66, 73, 80, 87, 87, 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, + 77, 84, 93, 95, 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, + 92, 98, + /* Size 16x8 */ + 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70, 33, 34, + 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 45, 45, 
46, 47, + 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61, 49, 46, 45, 45, 49, 53, + 57, 59, 61, 62, 64, 66, 68, 69, 68, 67, 52, 49, 47, 47, 50, 54, 59, 63, + 66, 68, 70, 73, 75, 77, 77, 75, 57, 53, 51, 50, 53, 57, 61, 66, 71, 73, + 76, 80, 83, 84, 86, 83, 64, 60, 57, 55, 58, 61, 66, 71, 75, 79, 83, 87, + 91, 93, 94, 92, 68, 64, 61, 59, 60, 63, 67, 71, 74, 79, 83, 87, 91, 95, + 97, 98, + /* Size 16x32 */ + 32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31, 31, + 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32, 34, 39, + 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35, 40, 44, 46, + 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42, 46, 47, 45, 46, + 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46, 47, 46, 46, 47, 50, + 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47, 45, 46, 47, 49, 50, 54, + 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47, 47, 48, 50, 51, 55, 56, 57, + 58, 59, 42, 43, 44, 47, 49, 50, 49, 50, 50, 53, 53, 57, 58, 60, 60, 59, + 47, 46, 46, 48, 51, 52, 53, 53, 53, 55, 56, 60, 61, 61, 61, 62, 49, 46, + 47, 48, 52, 53, 53, 54, 54, 56, 57, 60, 61, 63, 63, 62, 48, 46, 46, 47, + 51, 53, 56, 56, 57, 59, 60, 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, + 57, 57, 59, 61, 61, 65, 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, + 61, 63, 64, 67, 68, 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, + 66, 70, 71, 70, 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, + 71, 73, 71, 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, + 74, 72, 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, + 54, 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51, + 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51, 50, + 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52, 57, 58, + 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58, 59, 66, 69, + 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60, 67, 70, 75, 80, + 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68, 71, 75, 81, 83, 90, + 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71, 75, 81, 83, 90, 91, 94, + 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77, 82, 84, 92, 93, 94, 95, 93, + 67, 61, 60, 58, 61, 63, 69, 70, 78, 80, 85, 90, 93, 96, 97, 97, 68, 62, + 61, 59, 61, 64, 68, 71, 77, 79, 86, 88, 94, 96, 97, 98, 69, 63, 62, 59, + 61, 65, 68, 72, 76, 80, 85, 88, 94, 95, 99, 99, 70, 63, 63, 60, 61, 66, + 67, 73, 75, 81, 83, 89, 92, 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, + 74, 82, 82, 90, 90, 98, 98, 102, + /* Size 32x16 */ + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, + 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 31, 31, 32, 32, + 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50, 51, 52, 54, + 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, 33, 34, 34, 35, 37, 38, 43, 43, + 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, + 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53, 55, 56, 56, 57, 58, 59, 59, + 60, 61, 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, + 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 48, 47, + 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, + 56, 58, 59, 60, 61, 61, 63, 63, 64, 65, 66, 67, 49, 47, 46, 45, 45, 46, + 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, + 68, 68, 69, 69, 68, 68, 67, 67, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, + 54, 56, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 
71, 71, 72, 70, + 71, 72, 73, 74, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, + 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, + 56, 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71, + 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82, 57, 54, 53, 52, + 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, + 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 63, 60, 59, 57, 56, 56, 54, 55, + 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90, + 92, 90, 88, 88, 89, 90, 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, + 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94, + 92, 90, 67, 63, 62, 60, 60, 59, 57, 57, 60, 61, 63, 65, 66, 70, 70, 73, + 75, 77, 80, 81, 85, 86, 89, 91, 93, 94, 94, 96, 96, 95, 97, 98, 68, 64, + 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, + 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98, 68, 65, 65, 62, 62, 60, + 61, 59, 59, 62, 62, 65, 65, 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89, + 89, 93, 93, 97, 98, 99, 99, 102, + /* Size 4x16 */ + 31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57, 43, 50, + 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70, 48, 54, 70, 75, + 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89, 58, 61, 81, 93, 60, 63, + 82, 94, 62, 64, 79, 96, 63, 66, 81, 97, + /* Size 16x4 */ + 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63, 48, 46, + 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66, 56, 52, 50, 49, + 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81, 67, 62, 60, 57, 60, 63, + 66, 70, 75, 80, 85, 89, 93, 94, 96, 97, + /* Size 8x32 */ + 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64, 31, 34, + 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61, 33, 37, 46, 45, + 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60, 37, 43, 47, 45, 47, 50, + 55, 59, 39, 43, 48, 47, 48, 51, 56, 58, 42, 44, 49, 49, 50, 53, 58, 60, + 47, 46, 51, 53, 53, 56, 61, 61, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, + 51, 56, 57, 60, 64, 64, 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, + 61, 64, 68, 67, 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, + 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75, + 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80, 57, 51, + 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85, 61, 55, 58, 66, + 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89, 64, 57, 60, 68, 75, 83, + 91, 91, 64, 58, 60, 68, 75, 83, 91, 94, 66, 59, 61, 69, 77, 84, 93, 95, + 67, 60, 61, 69, 78, 85, 93, 97, 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, + 61, 68, 76, 85, 94, 99, 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, + 74, 82, 90, 98, + /* Size 32x8 */ + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53, + 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 33, 34, 34, 35, + 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 45, 45, 45, 44, 46, 46, 47, 48, + 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, + 61, 61, 61, 61, 61, 61, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, + 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68, + 67, 67, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64, + 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, 57, 54, + 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, + 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 64, 61, 60, 58, 57, 57, 
+ 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, + 91, 91, 93, 93, 94, 94, 92, 90, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, + 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, + 97, 99, 98, 98 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 34, 53, 75, 34, 49, 64, 81, 53, 64, 91, 112, 75, 81, 112, 140, + /* Size 8x8 */ + 32, 32, 34, 39, 50, 62, 76, 84, 32, 33, 35, 40, 48, 59, 71, 79, 34, 35, + 39, 46, 53, 63, 74, 81, 39, 40, 46, 56, 65, 75, 86, 92, 50, 48, 53, 65, + 78, 90, 101, 106, 62, 59, 63, 75, 90, 105, 118, 123, 76, 71, 74, 86, + 101, 118, 134, 142, 84, 79, 81, 92, 106, 123, 142, 153, + /* Size 16x16 */ + 32, 31, 31, 32, 33, 36, 39, 44, 48, 54, 59, 66, 74, 81, 86, 91, 31, 32, + 32, 32, 33, 35, 38, 42, 46, 51, 56, 63, 70, 77, 81, 86, 31, 32, 32, 33, + 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 77, 82, 32, 32, 33, 34, 36, 37, + 40, 42, 45, 49, 53, 59, 66, 71, 75, 80, 33, 33, 34, 36, 38, 42, 44, 46, + 50, 53, 57, 63, 69, 74, 78, 80, 36, 35, 35, 37, 42, 48, 50, 54, 57, 60, + 64, 69, 75, 80, 84, 85, 39, 38, 38, 40, 44, 50, 54, 58, 61, 65, 69, 74, + 80, 85, 89, 91, 44, 42, 41, 42, 46, 54, 58, 63, 67, 71, 75, 80, 86, 91, + 95, 97, 48, 46, 45, 45, 50, 57, 61, 67, 71, 76, 80, 86, 93, 98, 101, + 104, 54, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 112, + 59, 56, 54, 53, 57, 64, 69, 75, 80, 87, 92, 99, 106, 112, 116, 120, 66, + 63, 60, 59, 63, 69, 74, 80, 86, 93, 99, 107, 115, 121, 125, 129, 74, 70, + 67, 66, 69, 75, 80, 86, 93, 100, 106, 115, 123, 130, 135, 138, 81, 77, + 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, 86, 81, + 77, 75, 78, 84, 89, 95, 101, 109, 116, 125, 135, 142, 147, 153, 91, 86, + 82, 80, 80, 85, 91, 97, 104, 112, 120, 129, 138, 148, 153, 159, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 32, 32, 33, 34, 36, 36, 39, 41, 44, 46, 48, 52, + 54, 58, 59, 65, 66, 71, 74, 80, 81, 83, 86, 89, 91, 93, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 50, 51, 56, 56, 62, + 63, 68, 71, 76, 77, 78, 82, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 35, 35, 38, 39, 42, 44, 46, 49, 51, 55, 56, 61, 63, 67, 70, 75, + 77, 78, 81, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 37, 38, 41, 42, 44, 48, 49, 53, 54, 59, 60, 65, 68, 72, 74, 75, 78, 80, + 82, 84, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, + 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 77, 80, 82, 84, 31, 32, + 32, 32, 33, 33, 33, 34, 35, 35, 36, 36, 39, 40, 42, 44, 45, 48, 50, 53, + 54, 59, 60, 64, 67, 71, 73, 74, 77, 79, 81, 83, 32, 32, 32, 32, 33, 33, + 34, 35, 36, 36, 37, 38, 40, 40, 42, 44, 45, 48, 49, 53, 53, 58, 59, 63, + 66, 70, 71, 72, 75, 78, 80, 83, 32, 32, 32, 32, 33, 34, 35, 35, 36, 37, + 38, 38, 40, 41, 42, 44, 46, 48, 49, 53, 53, 58, 59, 63, 65, 69, 71, 72, + 74, 77, 79, 80, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, + 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 74, 75, 78, 79, 80, 81, + 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 42, 43, 45, 46, 47, 49, 51, 53, + 54, 58, 58, 63, 64, 68, 70, 74, 75, 76, 79, 81, 84, 86, 36, 35, 35, 34, + 35, 36, 37, 38, 42, 42, 48, 48, 50, 51, 54, 55, 57, 59, 60, 63, 64, 68, + 69, 73, 75, 79, 80, 81, 84, 85, 85, 86, 36, 35, 35, 34, 35, 36, 38, 38, + 42, 43, 48, 49, 51, 52, 54, 55, 57, 59, 60, 64, 64, 68, 69, 73, 75, 79, + 80, 81, 84, 86, 88, 91, 39, 38, 38, 37, 38, 39, 40, 40, 44, 45, 50, 51, + 54, 55, 58, 59, 61, 64, 65, 68, 69, 73, 74, 78, 80, 84, 85, 86, 89, 91, + 91, 91, 41, 39, 39, 38, 39, 40, 40, 41, 45, 46, 51, 52, 55, 56, 59, 
61, + 63, 65, 67, 70, 70, 75, 76, 80, 82, 86, 87, 88, 91, 92, 94, 96, 44, 42, + 42, 41, 41, 42, 42, 42, 46, 47, 54, 54, 58, 59, 63, 65, 67, 70, 71, 75, + 75, 79, 80, 84, 86, 90, 91, 92, 95, 97, 97, 97, 46, 44, 44, 42, 43, 44, + 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 69, 72, 74, 77, 78, 82, 83, 87, + 89, 93, 94, 95, 98, 98, 100, 103, 48, 46, 46, 44, 45, 45, 45, 46, 50, + 51, 57, 57, 61, 63, 67, 69, 71, 74, 76, 80, 80, 85, 86, 90, 93, 96, 98, + 99, 101, 104, 104, 103, 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, + 64, 65, 70, 72, 74, 78, 80, 84, 85, 90, 91, 95, 97, 101, 103, 104, 106, + 106, 107, 110, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, + 71, 74, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, + 112, 110, 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64, 68, 70, 75, + 77, 80, 84, 86, 91, 91, 97, 98, 103, 105, 110, 111, 112, 115, 114, 115, + 118, 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, 75, 78, 80, + 85, 87, 91, 92, 98, 99, 103, 106, 110, 112, 113, 116, 119, 120, 119, 65, + 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, + 97, 98, 105, 106, 111, 114, 118, 120, 121, 124, 123, 123, 126, 66, 63, + 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, + 99, 106, 107, 112, 115, 119, 121, 122, 125, 128, 129, 126, 71, 68, 67, + 65, 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103, + 103, 111, 112, 117, 120, 125, 127, 128, 131, 132, 132, 135, 74, 71, 70, + 68, 67, 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, + 106, 114, 115, 120, 123, 128, 130, 131, 135, 135, 138, 136, 80, 76, 75, + 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104, 110, + 110, 118, 119, 125, 128, 134, 136, 137, 140, 142, 140, 144, 81, 77, 77, + 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, + 112, 120, 121, 127, 130, 136, 137, 139, 142, 145, 148, 144, 83, 78, 78, + 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, 92, 95, 99, 104, 106, 112, + 113, 121, 122, 128, 131, 137, 139, 140, 144, 148, 150, 155, 86, 82, 81, + 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, 95, 98, 101, 106, 109, 115, + 116, 124, 125, 131, 135, 140, 142, 144, 147, 149, 153, 155, 89, 84, 84, + 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114, + 119, 123, 128, 132, 135, 142, 145, 148, 149, 153, 154, 159, 91, 86, 86, + 82, 82, 81, 80, 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115, + 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159, 159, 93, 88, 88, + 84, 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, + 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164, + /* Size 4x8 */ + 32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87, 48, 59, + 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144, + /* Size 8x4 */ + 32, 32, 34, 38, 48, 60, 72, 81, 35, 36, 42, 51, 59, 68, 79, 86, 51, 50, + 54, 67, 80, 92, 104, 112, 77, 72, 75, 87, 103, 119, 135, 144, + /* Size 8x16 */ + 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32, + 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58, 69, 77, 33, 34, 38, 44, + 52, 62, 72, 78, 36, 35, 42, 51, 58, 68, 78, 84, 39, 38, 44, 54, 63, 73, + 84, 89, 44, 41, 46, 59, 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103, + 53, 49, 53, 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66, + 60, 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81, + 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147, 91, + 82, 80, 90, 103, 119, 137, 151, + /* Size 16x8 */ + 32, 31, 31, 
32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91, 31, 32, + 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82, 33, 33, 34, 36, + 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80, 40, 39, 38, 40, 44, 51, + 54, 59, 62, 66, 70, 75, 81, 86, 90, 90, 51, 49, 47, 48, 52, 58, 63, 69, + 74, 79, 84, 90, 97, 102, 106, 103, 65, 61, 59, 58, 62, 68, 73, 79, 85, + 92, 98, 106, 113, 120, 124, 119, 79, 74, 71, 69, 72, 78, 84, 90, 96, + 103, 110, 119, 128, 135, 140, 137, 87, 82, 79, 77, 78, 84, 89, 96, 103, + 111, 118, 126, 134, 143, 147, 151, + /* Size 16x32 */ + 32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81, 87, 90, 31, 32, + 32, 32, 33, 35, 39, 42, 49, 51, 62, 63, 75, 77, 83, 85, 31, 32, 32, 32, + 33, 35, 39, 42, 49, 51, 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34, + 38, 41, 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35, 38, 41, + 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34, 35, 36, 39, 42, 48, 50, + 59, 60, 71, 72, 78, 80, 32, 32, 33, 35, 36, 37, 40, 42, 48, 49, 58, 59, + 69, 71, 77, 80, 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70, + 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63, 72, 74, 78, 78, + 34, 34, 34, 37, 39, 42, 45, 48, 53, 54, 63, 64, 73, 75, 80, 83, 36, 34, + 35, 38, 42, 48, 51, 54, 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38, + 42, 48, 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40, 44, 50, + 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38, 39, 41, 45, 51, 56, 59, + 65, 67, 75, 76, 85, 87, 90, 93, 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, + 79, 80, 90, 91, 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83, + 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76, 85, 86, 96, 98, + 103, 100, 52, 48, 48, 49, 52, 59, 65, 70, 78, 80, 90, 91, 101, 103, 105, + 107, 53, 49, 49, 50, 53, 60, 66, 71, 79, 82, 92, 93, 103, 105, 111, 107, + 58, 53, 53, 53, 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58, + 54, 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115, 65, 60, + 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119, 122, 123, 66, 61, + 60, 59, 63, 69, 75, 80, 90, 93, 106, 107, 119, 121, 126, 123, 71, 65, + 65, 63, 67, 73, 79, 84, 94, 97, 111, 112, 125, 127, 131, 132, 74, 68, + 67, 66, 69, 75, 81, 86, 97, 100, 113, 115, 128, 130, 134, 132, 79, 72, + 72, 70, 73, 79, 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74, + 73, 71, 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75, + 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151, 86, 78, + 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142, 147, 151, 88, 81, + 80, 77, 80, 86, 90, 98, 105, 112, 122, 127, 140, 144, 152, 155, 91, 83, + 82, 79, 80, 88, 90, 100, 103, 114, 119, 130, 137, 148, 151, 155, 93, 85, + 85, 81, 81, 90, 90, 102, 103, 117, 117, 134, 134, 151, 152, 160, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, + 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49, 53, 54, 60, + 61, 65, 68, 72, 74, 75, 78, 81, 83, 85, 31, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, + 73, 74, 78, 80, 82, 85, 32, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, + 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59, 63, 66, 70, 71, 72, 75, 77, + 79, 81, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, + 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, 36, 35, + 35, 34, 35, 36, 37, 38, 41, 42, 48, 48, 50, 51, 53, 55, 56, 59, 60, 63, + 63, 68, 69, 73, 75, 79, 80, 
81, 84, 86, 88, 90, 40, 39, 39, 38, 38, 39, + 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, + 81, 85, 86, 87, 90, 90, 90, 90, 44, 42, 42, 41, 41, 42, 42, 42, 46, 48, + 54, 54, 58, 59, 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92, + 95, 98, 100, 102, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, + 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, + 103, 103, 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71, + 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 114, + 117, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, + 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, + 66, 63, 62, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, + 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134, 79, + 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, + 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, + 81, 77, 76, 74, 73, 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, + 105, 111, 112, 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151, + 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, + 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, + 152, 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100, + 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155, + 155, 160, + /* Size 4x16 */ + 31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32, 37, 49, 71, 33, 41, + 53, 74, 34, 48, 60, 80, 37, 50, 65, 85, 41, 53, 71, 91, 45, 56, 76, 98, + 49, 60, 82, 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130, 74, + 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148, + /* Size 16x4 */ + 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83, 36, 35, + 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88, 53, 51, 49, 49, + 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114, 81, 76, 73, 71, 74, + 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, + /* Size 8x32 */ + 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62, 75, 83, 31, 32, + 33, 39, 49, 61, 74, 82, 31, 32, 33, 38, 47, 59, 72, 79, 31, 32, 34, 38, + 47, 59, 71, 79, 32, 33, 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58, + 69, 77, 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62, 72, 78, + 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51, 58, 68, 78, 84, 36, 35, + 42, 51, 59, 68, 79, 85, 39, 38, 44, 54, 63, 73, 84, 89, 40, 39, 45, 56, + 65, 75, 85, 90, 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82, + 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65, 78, 90, 101, + 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53, 57, 69, 83, 97, 109, 113, + 58, 54, 57, 70, 84, 98, 110, 118, 65, 59, 62, 74, 89, 105, 118, 122, 66, + 60, 63, 75, 90, 106, 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74, + 67, 69, 81, 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, + 73, 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147, 86, + 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122, 140, 152, 91, + 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90, 103, 117, 134, 152, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52, + 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, + 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 33, 33, 33, 33, 34, 35, 36, 36, + 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, + 75, 75, 78, 80, 
80, 81, 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51, + 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90, + 90, 90, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72, + 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103, + 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, + 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, 79, + 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, + 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, + 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, + 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, + 152 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 46, 49, 58, 46, 53, 55, 62, 49, 55, 70, 78, 58, 62, 78, 91, + /* Size 8x8 */ + 31, 34, 42, 47, 49, 54, 60, 64, 34, 39, 45, 46, 47, 51, 56, 59, 42, 45, + 48, 49, 50, 53, 57, 60, 47, 46, 49, 55, 58, 61, 65, 66, 49, 47, 50, 58, + 65, 69, 73, 74, 54, 51, 53, 61, 69, 76, 82, 83, 60, 56, 57, 65, 73, 82, + 89, 92, 64, 59, 60, 66, 74, 83, 92, 96, + /* Size 16x16 */ + 32, 31, 31, 35, 40, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 31, 31, + 32, 37, 41, 47, 47, 46, 48, 49, 51, 54, 57, 60, 62, 64, 31, 32, 34, 39, + 43, 46, 46, 45, 46, 47, 49, 52, 55, 57, 59, 61, 35, 37, 39, 44, 46, 47, + 46, 45, 46, 47, 48, 51, 53, 56, 57, 59, 40, 41, 43, 46, 48, 50, 49, 48, + 49, 49, 51, 53, 55, 57, 59, 59, 49, 47, 46, 47, 50, 53, 53, 53, 54, 54, + 55, 57, 59, 61, 62, 62, 48, 47, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, + 62, 64, 65, 65, 49, 46, 45, 45, 48, 53, 55, 58, 60, 61, 62, 64, 66, 68, + 69, 69, 50, 48, 46, 46, 49, 54, 56, 60, 61, 63, 65, 67, 69, 71, 72, 72, + 52, 49, 47, 47, 49, 54, 57, 61, 63, 66, 68, 71, 73, 75, 76, 77, 54, 51, + 49, 48, 51, 55, 58, 62, 65, 68, 71, 74, 76, 78, 80, 81, 57, 54, 52, 51, + 53, 57, 60, 64, 67, 71, 74, 77, 80, 83, 84, 85, 61, 57, 55, 53, 55, 59, + 62, 66, 69, 73, 76, 80, 84, 87, 89, 89, 64, 60, 57, 56, 57, 61, 64, 68, + 71, 75, 78, 83, 87, 90, 92, 94, 66, 62, 59, 57, 59, 62, 65, 69, 72, 76, + 80, 84, 89, 92, 94, 96, 68, 64, 61, 59, 59, 62, 65, 69, 72, 77, 81, 85, + 89, 94, 96, 98, + /* Size 32x32 */ + 32, 31, 31, 30, 31, 33, 35, 36, 40, 41, 49, 49, 48, 48, 49, 50, 50, 52, + 52, 54, 54, 57, 57, 60, 61, 63, 64, 65, 66, 67, 68, 69, 31, 31, 31, 31, + 32, 34, 37, 38, 41, 42, 47, 47, 47, 47, 47, 47, 48, 49, 50, 52, 52, 54, + 55, 57, 58, 60, 61, 61, 63, 64, 64, 65, 31, 31, 31, 31, 32, 35, 37, 39, + 41, 42, 47, 47, 47, 46, 46, 47, 48, 49, 49, 51, 51, 54, 54, 56, 57, 59, + 60, 61, 62, 63, 64, 65, 30, 31, 31, 32, 33, 35, 38, 40, 42, 42, 46, 46, + 45, 45, 45, 45, 46, 47, 47, 49, 49, 52, 52, 54, 55, 57, 58, 58, 60, 61, + 61, 62, 31, 32, 32, 33, 34, 37, 39, 41, 43, 43, 46, 46, 46, 45, 45, 46, + 46, 47, 47, 49, 49, 51, 52, 54, 55, 57, 57, 58, 59, 60, 61, 62, 33, 34, + 35, 35, 37, 39, 41, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 47, 49, + 49, 51, 51, 53, 54, 56, 57, 57, 58, 59, 60, 61, 35, 37, 37, 38, 39, 41, + 44, 46, 46, 46, 47, 47, 46, 46, 45, 46, 46, 47, 47, 48, 48, 50, 51, 52, + 53, 55, 56, 56, 57, 58, 59, 61, 36, 38, 39, 40, 41, 43, 46, 47, 47, 47, + 48, 47, 46, 46, 45, 46, 46, 46, 47, 48, 48, 50, 50, 52, 53, 54, 55, 55, + 56, 57, 58, 58, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 49, 49, 49, + 48, 49, 49, 49, 49, 51, 51, 52, 53, 54, 55, 57, 57, 58, 59, 59, 59, 59, + 41, 42, 42, 42, 43, 45, 46, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, + 50, 52, 52, 53, 53, 55, 56, 57, 58, 58, 59, 60, 61, 62, 49, 47, 47, 46, + 46, 47, 47, 48, 50, 50, 53, 
53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, + 57, 58, 59, 60, 61, 61, 62, 62, 62, 62, 49, 47, 47, 46, 46, 47, 47, 47, + 49, 50, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 56, 57, 57, 59, 59, 61, + 61, 62, 63, 63, 64, 65, 48, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, + 54, 54, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 62, 63, 64, 64, 65, 66, + 65, 65, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, + 57, 58, 58, 59, 60, 61, 61, 63, 63, 65, 65, 65, 66, 66, 67, 68, 49, 47, + 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, 58, 59, 60, 61, 61, 62, + 62, 63, 64, 65, 66, 67, 68, 68, 69, 70, 69, 68, 50, 47, 47, 45, 46, 46, + 46, 46, 49, 49, 54, 54, 56, 57, 59, 60, 60, 62, 62, 63, 64, 65, 65, 67, + 68, 69, 69, 70, 70, 70, 71, 71, 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, + 54, 54, 56, 57, 60, 60, 61, 63, 63, 65, 65, 67, 67, 68, 69, 71, 71, 71, + 72, 73, 72, 71, 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, + 61, 62, 63, 65, 65, 67, 67, 69, 70, 71, 72, 73, 74, 74, 75, 74, 74, 75, + 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, + 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 77, 75, 54, 52, 51, 49, + 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70, 70, 73, + 73, 75, 76, 78, 78, 78, 79, 78, 78, 79, 54, 52, 51, 49, 49, 49, 48, 48, + 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 73, 74, 75, 76, 78, + 78, 79, 80, 81, 81, 79, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 56, 57, + 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 83, + 82, 83, 57, 55, 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, + 67, 70, 71, 73, 74, 77, 77, 79, 80, 82, 83, 83, 84, 85, 85, 83, 60, 57, + 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, 63, 65, 67, 68, 71, 72, 75, + 75, 79, 79, 82, 83, 85, 86, 86, 87, 87, 86, 87, 61, 58, 57, 55, 55, 54, + 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, 83, + 84, 86, 87, 88, 89, 89, 89, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, + 60, 61, 63, 65, 67, 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 89, 90, + 91, 92, 90, 91, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, + 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 89, 90, 91, 92, 93, 94, 91, + 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74, + 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 93, 94, 94, 96, 66, 63, 62, 60, + 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84, + 84, 87, 89, 91, 92, 93, 94, 94, 96, 96, 67, 64, 63, 61, 60, 59, 58, 57, + 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, 89, 92, + 93, 94, 94, 96, 96, 97, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, 64, + 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, 96, 96, + 98, 97, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, 68, 71, + 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, 99, + /* Size 4x8 */ + 31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65, 47, 54, + 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93, + /* Size 8x4 */ + 31, 36, 43, 45, 47, 52, 57, 61, 47, 47, 50, 53, 54, 56, 60, 63, 50, 47, + 50, 58, 66, 70, 75, 77, 61, 57, 58, 65, 74, 82, 90, 93, + /* Size 8x16 */ + 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63, 31, 35, + 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58, 41, 43, 48, 49, + 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62, 48, 46, 49, 54, 57, 60, + 64, 65, 49, 45, 48, 56, 61, 64, 67, 69, 50, 46, 49, 57, 63, 67, 71, 73, + 52, 48, 50, 58, 65, 71, 75, 77, 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, + 53, 61, 69, 77, 
82, 85, 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, + 73, 82, 89, 92, 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, + 89, 95, + /* Size 16x8 */ + 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 32, 33, + 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61, 40, 41, 43, 46, + 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59, 49, 47, 46, 46, 49, 53, + 54, 56, 57, 58, 59, 61, 63, 65, 66, 65, 51, 49, 47, 47, 49, 54, 57, 61, + 63, 65, 67, 69, 72, 73, 75, 72, 57, 54, 51, 50, 52, 57, 60, 64, 67, 71, + 73, 77, 80, 82, 84, 81, 63, 59, 57, 55, 57, 60, 64, 67, 71, 75, 78, 82, + 86, 89, 91, 89, 67, 63, 60, 58, 59, 62, 65, 69, 73, 77, 81, 85, 88, 92, + 94, 95, + /* Size 16x32 */ + 32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31, 31, + 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31, 33, 38, + 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33, 40, 42, 46, + 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41, 43, 46, 46, 45, + 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44, 47, 46, 46, 47, 47, + 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47, 46, 45, 47, 47, 50, 51, + 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46, 45, 46, 47, 50, 50, 54, 55, + 57, 58, 41, 42, 43, 47, 48, 49, 49, 48, 49, 50, 52, 53, 57, 57, 59, 58, + 42, 43, 43, 47, 48, 50, 49, 49, 50, 50, 53, 54, 57, 58, 60, 61, 49, 46, + 47, 48, 50, 53, 53, 53, 54, 54, 57, 57, 60, 61, 62, 61, 49, 46, 47, 48, + 50, 53, 53, 54, 54, 55, 57, 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, + 54, 56, 57, 57, 60, 60, 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, + 58, 58, 61, 61, 65, 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, + 64, 64, 67, 68, 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, + 69, 69, 70, 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, + 73, 71, 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, + 52, 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50, + 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50, 49, + 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50, 52, 56, + 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53, 57, 61, 64, + 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58, 62, 65, 71, 72, + 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63, 66, 72, 73, 80, 81, + 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67, 73, 75, 82, 82, 89, 90, + 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73, 75, 82, 83, 89, 90, 92, 90, + 64, 59, 58, 56, 58, 61, 65, 68, 74, 75, 83, 83, 90, 91, 94, 95, 66, 60, + 59, 57, 59, 62, 66, 69, 75, 76, 84, 85, 91, 92, 94, 95, 67, 61, 60, 58, + 59, 63, 66, 70, 74, 77, 82, 85, 91, 93, 96, 96, 68, 62, 61, 58, 59, 64, + 65, 71, 72, 78, 81, 86, 89, 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, + 71, 79, 79, 87, 87, 95, 95, 98, + /* Size 32x16 */ + 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, + 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 31, 31, 31, 32, + 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48, 50, 50, 52, + 52, 54, 56, 57, 58, 59, 60, 61, 62, 62, 32, 33, 33, 33, 35, 37, 39, 41, + 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57, + 58, 58, 59, 60, 61, 62, 37, 38, 38, 40, 41, 43, 45, 47, 47, 47, 48, 48, + 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 55, 56, 57, 58, + 58, 59, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, + 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, 48, 47, + 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 53, 53, 
54, 54, 54, 55, + 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 49, 47, 47, 45, 46, 46, + 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, + 63, 64, 65, 65, 66, 66, 65, 65, 49, 47, 47, 45, 45, 46, 45, 45, 48, 49, + 53, 54, 56, 56, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68, + 69, 70, 71, 71, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, + 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, + 52, 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66, + 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79, 57, 54, 54, 52, + 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, + 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 58, 55, 54, 52, 52, 52, 51, 50, + 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77, 77, 79, 81, 82, + 83, 83, 85, 85, 86, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, + 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91, + 89, 87, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, + 71, 74, 75, 78, 78, 82, 83, 86, 87, 90, 90, 91, 92, 93, 94, 95, 67, 63, + 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, + 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, 67, 64, 64, 61, 61, 60, + 60, 58, 58, 61, 61, 64, 64, 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86, + 86, 90, 90, 95, 95, 96, 96, 98, + /* Size 4x16 */ + 31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56, 42, 49, + 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68, 46, 54, 64, 71, + 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83, 56, 59, 73, 87, 58, 61, + 75, 90, 60, 62, 76, 92, 62, 64, 78, 94, + /* Size 16x4 */ + 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 48, 47, + 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64, 52, 49, 48, 47, + 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78, 64, 60, 57, 56, 57, 61, + 64, 68, 71, 75, 78, 83, 87, 90, 92, 94, + /* Size 8x32 */ + 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63, 31, 33, + 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60, 31, 35, 43, 46, + 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59, 35, 39, 46, 46, 47, 50, + 55, 58, 37, 41, 47, 46, 46, 50, 54, 57, 41, 43, 48, 49, 49, 52, 57, 59, + 42, 43, 48, 49, 50, 53, 57, 60, 49, 47, 50, 53, 54, 57, 60, 62, 49, 47, + 50, 53, 54, 57, 61, 63, 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, + 58, 61, 65, 66, 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65, + 69, 70, 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74, + 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78, 54, 50, + 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83, 57, 52, 53, 61, + 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87, 61, 55, 56, 63, 72, 80, + 86, 88, 63, 57, 57, 64, 73, 82, 89, 92, 64, 58, 58, 65, 73, 82, 89, 92, + 64, 58, 58, 65, 74, 83, 90, 94, 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, + 59, 66, 74, 82, 91, 96, 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, + 71, 79, 87, 95, + /* Size 32x8 */ + 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51, + 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 32, 33, 33, 33, + 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, + 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 40, 41, 41, 42, 43, 44, 46, 47, + 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, + 58, 58, 59, 59, 59, 59, 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53, + 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66, 
+ 65, 65, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, + 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, 57, 54, + 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, + 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 63, 60, 59, 57, 57, 56, + 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, + 86, 89, 89, 90, 91, 91, 89, 87, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, + 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, + 94, 96, 95, 95 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 34, 49, 72, 34, 48, 60, 79, 49, 60, 82, 104, 72, 79, 104, 134, + /* Size 8x8 */ + 32, 32, 34, 38, 46, 56, 68, 78, 32, 33, 35, 39, 45, 54, 64, 74, 34, 35, + 39, 45, 51, 58, 68, 76, 38, 39, 45, 54, 61, 69, 78, 86, 46, 45, 51, 61, + 71, 80, 90, 99, 56, 54, 58, 69, 80, 92, 103, 113, 68, 64, 68, 78, 90, + 103, 117, 128, 78, 74, 76, 86, 99, 113, 128, 140, + /* Size 16x16 */ + 32, 31, 31, 31, 32, 34, 36, 39, 44, 48, 54, 59, 65, 71, 80, 83, 31, 32, + 32, 32, 32, 34, 35, 38, 42, 46, 51, 56, 62, 68, 76, 78, 31, 32, 32, 32, + 32, 33, 34, 37, 41, 44, 49, 54, 59, 65, 72, 75, 31, 32, 32, 33, 34, 35, + 36, 39, 42, 45, 50, 54, 59, 64, 71, 74, 32, 32, 32, 34, 35, 37, 38, 40, + 42, 46, 49, 53, 58, 63, 69, 72, 34, 34, 33, 35, 37, 39, 42, 45, 47, 51, + 54, 58, 63, 68, 74, 76, 36, 35, 34, 36, 38, 42, 48, 50, 54, 57, 60, 64, + 68, 73, 79, 81, 39, 38, 37, 39, 40, 45, 50, 54, 58, 61, 65, 69, 73, 78, + 84, 86, 44, 42, 41, 42, 42, 47, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92, + 48, 46, 44, 45, 46, 51, 57, 61, 67, 71, 76, 80, 85, 90, 96, 99, 54, 51, + 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 59, 56, 54, + 54, 53, 58, 64, 69, 75, 80, 87, 92, 98, 103, 110, 113, 65, 62, 59, 59, + 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, 71, 68, 65, 64, 63, + 68, 73, 78, 84, 90, 97, 103, 111, 117, 125, 128, 80, 76, 72, 71, 69, 74, + 79, 84, 90, 96, 104, 110, 118, 125, 134, 137, 83, 78, 75, 74, 72, 76, + 81, 86, 92, 99, 106, 113, 121, 128, 137, 140, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, + 48, 54, 54, 59, 59, 65, 65, 71, 71, 80, 80, 83, 83, 87, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, + 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, 56, 62, 62, 68, + 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, + 75, 79, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, + 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, 75, 79, 31, 32, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, + 50, 54, 54, 59, 59, 64, 64, 71, 71, 74, 74, 77, 31, 32, 32, 32, 32, 33, + 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59, + 59, 64, 64, 71, 71, 74, 74, 77, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, + 37, 38, 38, 40, 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, + 69, 72, 72, 75, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, + 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72, 75, + 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, + 51, 54, 54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 34, 34, 34, 33, + 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, 51, 54, 54, 58, + 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 36, 35, 35, 34, 34, 36, 36, 38, + 38, 42, 42, 48, 
48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, + 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, 73, 79, 79, 81, + 81, 84, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, + 58, 61, 61, 65, 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 39, 38, + 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, 58, 61, 61, 65, + 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 44, 42, 42, 41, 41, 42, + 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, + 79, 84, 84, 90, 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, + 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, + 90, 92, 92, 96, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, + 61, 67, 67, 71, 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, + 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71, + 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 54, 51, 51, 49, + 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, + 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 54, 51, 51, 49, 49, 50, 50, + 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, + 97, 97, 104, 104, 106, 106, 109, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, + 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, + 110, 110, 113, 113, 116, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, + 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, 110, 110, + 113, 113, 116, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, + 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, + 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, + 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, + 124, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, + 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, + 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90, + 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, 80, + 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, + 104, 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 80, 76, + 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, + 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 83, 78, 78, + 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, + 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 83, 78, 78, + 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, + 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 87, 83, 83, + 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, + 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149, + /* Size 4x8 */ + 32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84, 45, 56, + 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136, + /* Size 8x4 */ + 32, 32, 34, 37, 45, 54, 65, 75, 35, 36, 42, 50, 56, 63, 73, 81, 51, 50, + 54, 65, 76, 87, 97, 106, 75, 71, 73, 84, 96, 110, 125, 136, + /* Size 8x16 */ + 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, + 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38, + 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60, + 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90, + 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, + 54, 63, 
75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, + 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, + 92, 106, 121, 136, + /* Size 16x8 */ + 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82, 31, 32, + 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 32, 32, 33, 34, + 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72, 36, 35, 34, 36, 38, 42, + 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 44, 42, 41, 42, 42, 48, 54, 58, + 63, 67, 71, 75, 79, 84, 90, 92, 53, 51, 49, 50, 49, 54, 60, 65, 71, 76, + 82, 87, 92, 97, 104, 106, 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92, + 98, 105, 111, 118, 121, 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103, + 110, 118, 125, 133, 136, + /* Size 16x32 */ + 32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79, 79, 87, 31, 32, + 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 32, + 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34, + 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34, 34, 41, + 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, + 50, 59, 59, 71, 71, 77, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, 50, 59, + 59, 71, 71, 77, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, + 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, 69, 75, + 34, 34, 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 34, 34, + 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38, + 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38, 38, 48, + 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37, 37, 40, 40, 50, 50, 58, + 58, 65, 65, 73, 73, 84, 84, 89, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, + 65, 73, 73, 84, 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, + 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90, + 90, 95, 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, + 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49, + 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49, 49, + 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 58, 54, 54, 54, + 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 58, 54, 54, 54, 54, + 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 65, 60, 60, 58, 58, 68, + 68, 79, 79, 92, 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68, + 79, 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73, 73, 84, + 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63, 63, 73, 73, 84, 84, + 97, 97, 111, 111, 125, 125, 132, 79, 72, 72, 70, 70, 79, 79, 90, 90, + 104, 104, 118, 118, 133, 133, 141, 79, 72, 72, 70, 70, 79, 79, 90, 90, + 104, 104, 118, 118, 133, 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92, + 106, 106, 121, 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92, + 106, 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96, 96, + 109, 109, 124, 124, 141, 141, 149, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, + 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, + 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65, + 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, + 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, + 72, 76, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, + 43, 46, 46, 50, 50, 54, 54, 
58, 58, 63, 63, 70, 70, 72, 72, 76, 36, 35, + 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, + 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, + 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, + 68, 73, 73, 79, 79, 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, + 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, + 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, + 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, + 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, + 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 53, 51, 51, + 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, + 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, 59, + 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, + 105, 111, 111, 118, 118, 121, 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, + 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, + 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, 69, 69, + 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, + 125, 125, 133, 133, 136, 136, 141, 79, 75, 75, 72, 72, 71, 71, 69, 69, + 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, + 125, 125, 133, 133, 136, 136, 141, 87, 82, 82, 78, 78, 77, 77, 75, 75, + 79, 79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124, + 132, 132, 141, 141, 144, 144, 149, + /* Size 4x16 */ + 31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71, 33, 38, + 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84, 41, 53, 71, 90, + 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110, 60, 68, 92, 118, 65, + 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136, + /* Size 16x4 */ + 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 36, 35, + 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 53, 51, 49, 50, + 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 79, 75, 72, 71, 69, + 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136, + /* Size 8x32 */ + 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, + 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, 31, 32, 33, 34, + 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 32, 34, 36, 42, 50, + 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, 32, 33, 35, 38, 42, 49, 58, 69, + 34, 34, 37, 42, 48, 54, 63, 73, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, + 38, 48, 54, 60, 68, 78, 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, + 58, 65, 73, 84, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, + 79, 90, 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96, + 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 53, 49, + 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, 58, 54, 54, + 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 65, 60, 58, 68, + 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, 71, 65, 63, 73, 84, + 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 79, 72, 70, 79, 90, + 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136, 82, 75, 72, 81, 92, + 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48, + 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, + 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 
37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, + 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, + 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63, + 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 53, 51, + 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, + 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, + 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, + 105, 105, 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, + 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, + 118, 125, 125, 133, 133, 136, 136, 141 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 46, 47, 57, 46, 53, 54, 60, 47, 54, 66, 75, 57, 60, 75, 89, + /* Size 8x8 */ + 31, 34, 42, 47, 48, 52, 57, 61, 34, 39, 45, 46, 46, 49, 53, 57, 42, 45, + 48, 49, 50, 52, 55, 58, 47, 46, 49, 54, 56, 58, 61, 64, 48, 46, 50, 56, + 61, 65, 68, 71, 52, 49, 52, 58, 65, 71, 75, 79, 57, 53, 55, 61, 68, 75, + 82, 86, 61, 57, 58, 64, 71, 79, 86, 91, + /* Size 16x16 */ + 32, 31, 30, 33, 36, 41, 49, 48, 49, 50, 52, 54, 57, 60, 63, 65, 31, 31, + 31, 34, 38, 42, 47, 47, 47, 48, 50, 52, 54, 57, 60, 61, 30, 31, 32, 35, + 40, 42, 46, 45, 45, 46, 47, 49, 52, 54, 57, 58, 33, 34, 35, 39, 43, 45, + 47, 46, 45, 46, 47, 49, 51, 53, 56, 57, 36, 38, 40, 43, 47, 47, 48, 46, + 45, 46, 47, 48, 50, 52, 54, 55, 41, 42, 42, 45, 47, 48, 50, 49, 49, 50, + 50, 52, 53, 55, 57, 58, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 55, + 56, 58, 60, 61, 48, 47, 45, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, 61, + 63, 64, 49, 47, 45, 45, 45, 49, 53, 55, 58, 60, 61, 62, 63, 65, 67, 68, + 50, 48, 46, 46, 46, 50, 54, 56, 60, 61, 63, 65, 67, 68, 71, 71, 52, 50, + 47, 47, 47, 50, 54, 57, 61, 63, 66, 68, 70, 72, 75, 75, 54, 52, 49, 49, + 48, 52, 55, 58, 62, 65, 68, 71, 73, 75, 78, 79, 57, 54, 52, 51, 50, 53, + 56, 60, 63, 67, 70, 73, 76, 79, 82, 83, 60, 57, 54, 53, 52, 55, 58, 61, + 65, 68, 72, 75, 79, 82, 85, 86, 63, 60, 57, 56, 54, 57, 60, 63, 67, 71, + 75, 78, 82, 85, 89, 90, 65, 61, 58, 57, 55, 58, 61, 64, 68, 71, 75, 79, + 83, 86, 90, 91, + /* Size 32x32 */ + 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 49, 50, + 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 65, 65, 67, 31, 31, 31, 31, + 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, + 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 31, 31, 31, 31, 31, 34, 34, 38, + 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 57, + 57, 60, 60, 61, 61, 63, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, + 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, + 58, 60, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, + 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, 58, 60, 33, 34, + 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, + 47, 49, 49, 51, 51, 53, 53, 56, 56, 57, 57, 59, 33, 34, 34, 35, 35, 39, + 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, + 51, 53, 53, 56, 56, 57, 57, 59, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, + 47, 48, 48, 46, 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, + 54, 55, 55, 57, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, + 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55, 57, + 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, + 50, 50, 50, 52, 52, 53, 53, 55, 55, 57, 
57, 58, 58, 60, 41, 42, 42, 42, + 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 52, + 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 49, 47, 47, 46, 46, 47, 47, 48, + 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, + 58, 60, 60, 61, 61, 62, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, + 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, + 61, 62, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, + 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 48, 47, + 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, 55, 56, 56, 57, + 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 49, 47, 47, 45, 45, 45, + 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, + 63, 65, 65, 67, 67, 68, 68, 69, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, + 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67, + 67, 68, 68, 69, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, + 56, 60, 60, 61, 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, + 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, + 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 52, 50, 50, 47, + 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, + 68, 70, 70, 72, 72, 75, 75, 75, 75, 76, 52, 50, 50, 47, 47, 47, 47, 47, + 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, 68, 70, 70, 72, + 72, 75, 75, 75, 75, 76, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, + 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, + 79, 80, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, + 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, 79, 80, 57, 54, + 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, + 70, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, + 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, + 76, 79, 79, 82, 82, 83, 83, 84, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, + 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, + 85, 86, 86, 88, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, + 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88, + 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, + 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, 60, 57, + 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78, + 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 65, 61, 61, 58, 58, 57, 57, 55, + 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, + 86, 90, 90, 91, 91, 93, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61, + 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91, + 91, 93, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, + 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95, + /* Size 4x8 */ + 31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64, 46, 54, + 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90, + /* Size 8x4 */ + 31, 36, 43, 46, 46, 50, 54, 59, 47, 47, 50, 53, 54, 55, 58, 61, 50, 47, + 50, 57, 64, 68, 72, 75, 60, 56, 57, 64, 71, 78, 85, 90, + /* Size 8x16 */ + 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 30, 32, + 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, + 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, + 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67, + 50, 46, 46, 54, 59, 64, 67, 
71, 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, + 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, + 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, + 83, 90, + /* Size 16x8 */ + 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64, 31, 31, + 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 37, 38, 40, 43, + 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56, 48, 47, 46, 47, 47, 50, + 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 49, 47, 45, 46, 45, 49, 53, 56, + 58, 59, 61, 62, 64, 65, 67, 68, 52, 50, 48, 47, 47, 50, 54, 57, 61, 64, + 66, 68, 70, 72, 75, 75, 57, 54, 52, 51, 50, 53, 57, 60, 64, 67, 71, 73, + 76, 79, 82, 83, 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, + 89, 90, + /* Size 16x32 */ + 32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31, 31, + 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31, 31, 38, + 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32, 40, 40, 46, + 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40, 40, 46, 46, 45, + 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47, + 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47, 47, 51, + 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, + 54, 57, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, 54, 57, + 42, 43, 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 42, 43, + 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 49, 46, 46, 48, + 48, 53, 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, + 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, + 56, 57, 57, 60, 60, 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, + 57, 60, 60, 64, 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, + 64, 67, 67, 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, + 67, 69, 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, + 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48, + 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48, 47, + 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49, 49, 55, + 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49, 55, 55, 62, + 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, + 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, 70, 76, + 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, + 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, 85, 88, + 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 63, 57, + 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 64, 59, 59, 56, + 56, 61, 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 64, 59, 59, 56, 56, 61, + 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, + 69, 77, 77, 84, 84, 92, 92, 95, + /* Size 32x16 */ + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, + 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32, + 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, + 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 31, 31, 31, 32, 32, 36, 36, 40, + 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54, + 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, + 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, + 56, 57, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, + 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 48, 
47, + 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, + 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, 48, 47, 47, 46, 46, 47, + 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, + 56, 58, 58, 60, 60, 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, + 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, + 67, 68, 68, 69, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, + 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, + 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, + 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 52, 50, 50, 48, + 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, + 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, 51, 50, + 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, + 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57, + 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83, + 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, + 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, + 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, + 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 66, 63, 63, 60, 60, 59, + 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84, + 84, 88, 88, 92, 92, 93, 93, 95, + /* Size 4x16 */ + 31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56, 40, 47, + 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64, 45, 53, 61, 67, + 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 56, 70, 82, 54, 58, + 72, 85, 57, 60, 75, 89, 59, 61, 75, 90, + /* Size 16x4 */ + 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 48, 47, + 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 52, 50, 48, 47, + 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 63, 60, 57, 56, 54, 57, + 60, 64, 67, 71, 75, 78, 82, 85, 89, 90, + /* Size 8x32 */ + 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 31, 31, + 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, 30, 32, 40, 46, + 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 33, 36, 43, 47, 46, 47, + 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, 37, 40, 47, 47, 45, 47, 50, 54, + 42, 43, 47, 50, 49, 50, 53, 57, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, + 48, 53, 53, 54, 57, 60, 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, + 56, 57, 60, 64, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, + 64, 67, 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71, + 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 52, 48, + 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, 54, 50, 49, 55, + 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 57, 52, 50, 56, 64, 70, + 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, 60, 54, 52, 58, 65, 72, 79, 85, + 63, 57, 55, 60, 67, 75, 82, 89, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, + 56, 61, 68, 75, 83, 90, 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, + 69, 77, 84, 92, + /* Size 32x8 */ + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50, + 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32, + 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, + 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, + 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, + 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, + 53, 53, 
53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, + 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58, + 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 52, 50, + 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, + 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, + 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, + 76, 79, 79, 82, 82, 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, + 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, + 89, 90, 90, 92 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 33, 45, 62, 33, 39, 51, 64, 45, 51, 71, 87, 62, 64, 87, 108, + /* Size 8x8 */ + 31, 32, 32, 35, 42, 51, 59, 69, 32, 32, 33, 35, 41, 49, 56, 65, 32, 33, + 35, 38, 43, 49, 56, 64, 35, 35, 38, 48, 54, 59, 66, 73, 42, 41, 43, 54, + 63, 71, 77, 85, 51, 49, 49, 59, 71, 81, 89, 97, 59, 56, 56, 66, 77, 89, + 98, 108, 69, 65, 64, 73, 85, 97, 108, 119, + /* Size 16x16 */ + 32, 31, 31, 31, 32, 34, 35, 38, 41, 45, 48, 54, 59, 65, 71, 80, 31, 32, + 32, 32, 32, 34, 35, 37, 40, 43, 46, 51, 56, 62, 68, 76, 31, 32, 32, 32, + 32, 33, 34, 36, 38, 41, 44, 49, 54, 59, 65, 72, 31, 32, 32, 33, 34, 35, + 36, 38, 40, 42, 45, 50, 54, 59, 64, 71, 32, 32, 32, 34, 35, 37, 38, 39, + 41, 43, 46, 49, 53, 58, 63, 69, 34, 34, 33, 35, 37, 39, 42, 44, 46, 48, + 51, 54, 58, 63, 68, 74, 35, 35, 34, 36, 38, 42, 46, 48, 50, 53, 55, 59, + 62, 67, 72, 78, 38, 37, 36, 38, 39, 44, 48, 51, 54, 57, 59, 63, 67, 71, + 76, 82, 41, 40, 38, 40, 41, 46, 50, 54, 57, 60, 63, 67, 71, 75, 80, 86, + 45, 43, 41, 42, 43, 48, 53, 57, 60, 65, 68, 72, 76, 81, 85, 91, 48, 46, + 44, 45, 46, 51, 55, 59, 63, 68, 71, 76, 80, 85, 90, 96, 54, 51, 49, 50, + 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, 59, 56, 54, 54, 53, 58, + 62, 67, 71, 76, 80, 87, 92, 98, 103, 110, 65, 62, 59, 59, 58, 63, 67, + 71, 75, 81, 85, 92, 98, 105, 111, 118, 71, 68, 65, 64, 63, 68, 72, 76, + 80, 85, 90, 97, 103, 111, 117, 125, 80, 76, 72, 71, 69, 74, 78, 82, 86, + 91, 96, 104, 110, 118, 125, 134, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, + 45, 48, 48, 53, 54, 57, 59, 62, 65, 67, 71, 72, 80, 80, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, + 52, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, 51, 55, 56, 59, + 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 34, 34, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66, + 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 36, 37, + 38, 41, 41, 44, 44, 49, 49, 52, 54, 56, 59, 61, 65, 65, 72, 72, 31, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 37, 38, 39, 41, 42, 45, + 45, 49, 49, 52, 54, 56, 59, 61, 64, 65, 72, 72, 31, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 45, 45, 49, 50, 52, + 54, 56, 59, 60, 64, 65, 71, 71, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 35, 35, 36, 37, 38, 39, 40, 42, 43, 45, 45, 49, 49, 52, 54, 56, 59, 60, + 64, 64, 70, 70, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 37, 38, 38, + 39, 40, 41, 42, 43, 46, 46, 49, 49, 52, 53, 55, 58, 59, 63, 63, 69, 69, + 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 41, 41, 43, + 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, 34, 33, + 33, 34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54, + 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 34, 
34, 34, 33, 33, 34, 35, 35, + 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54, 54, 57, 58, 60, + 63, 64, 68, 68, 74, 74, 35, 35, 35, 34, 34, 35, 36, 36, 38, 38, 42, 42, + 46, 47, 48, 49, 50, 52, 53, 55, 55, 58, 59, 61, 62, 64, 67, 68, 72, 72, + 78, 78, 36, 35, 35, 34, 34, 35, 36, 37, 38, 38, 42, 42, 47, 48, 50, 50, + 52, 54, 54, 57, 57, 59, 60, 62, 64, 66, 68, 69, 73, 73, 79, 79, 38, 37, + 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 50, 51, 52, 54, 56, 57, 59, + 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82, 39, 38, 38, 38, 37, 38, + 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 55, 58, 58, 61, 61, 64, 65, 67, + 69, 71, 73, 74, 78, 78, 84, 84, 41, 40, 40, 39, 38, 39, 40, 40, 41, 41, + 46, 46, 50, 52, 54, 55, 57, 60, 60, 63, 63, 67, 67, 70, 71, 73, 75, 77, + 80, 81, 86, 86, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, + 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, + 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64, + 65, 68, 68, 72, 72, 75, 76, 78, 81, 82, 85, 86, 91, 91, 48, 46, 46, 45, + 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, + 76, 79, 80, 83, 85, 87, 90, 91, 96, 96, 48, 46, 46, 45, 44, 45, 45, 45, + 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, 76, 79, 80, 83, + 85, 87, 90, 91, 96, 96, 53, 51, 51, 49, 49, 49, 49, 49, 49, 49, 54, 54, + 58, 59, 62, 64, 67, 71, 72, 75, 75, 81, 81, 85, 86, 89, 91, 93, 97, 97, + 103, 103, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, + 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, + 57, 55, 55, 53, 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74, + 75, 79, 79, 85, 85, 89, 90, 93, 96, 98, 102, 102, 108, 108, 59, 56, 56, + 54, 54, 54, 54, 54, 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, + 86, 87, 90, 92, 95, 98, 99, 103, 104, 110, 110, 62, 59, 59, 57, 56, 56, + 56, 56, 55, 56, 60, 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, + 95, 98, 101, 103, 107, 108, 114, 114, 65, 62, 62, 60, 59, 59, 59, 59, + 58, 58, 63, 63, 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, + 105, 106, 111, 111, 118, 118, 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, + 64, 64, 68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, + 108, 113, 113, 120, 120, 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, + 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113, + 117, 118, 125, 125, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72, + 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, 113, 118, + 119, 126, 126, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, + 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, + 134, 134, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82, + 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126, + 134, 134, + /* Size 4x8 */ + 32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68, 41, 48, + 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111, + /* Size 8x4 */ + 32, 32, 33, 35, 41, 49, 57, 66, 34, 34, 37, 43, 48, 54, 60, 68, 43, 42, + 44, 54, 64, 71, 78, 86, 62, 59, 58, 68, 79, 91, 101, 111, + /* Size 8x16 */ + 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69, 31, 32, + 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, 32, 33, 35, 38, + 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59, + 65, 73, 38, 36, 40, 49, 56, 63, 69, 77, 41, 39, 41, 51, 60, 67, 74, 81, + 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, + 50, 60, 71, 82, 90, 99, 58, 
54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68, + 79, 92, 102, 112, 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, + 104, 115, 127, + /* Size 16x8 */ + 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79, 31, 32, + 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72, 32, 32, 33, 34, + 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70, 36, 35, 34, 36, 38, 42, + 47, 49, 51, 54, 56, 60, 63, 68, 73, 79, 44, 42, 41, 42, 42, 48, 52, 56, + 60, 64, 67, 71, 75, 79, 84, 90, 53, 51, 49, 50, 49, 54, 59, 63, 67, 72, + 76, 82, 87, 92, 97, 104, 62, 59, 57, 57, 56, 61, 65, 69, 74, 79, 83, 90, + 95, 102, 108, 115, 73, 69, 66, 65, 64, 69, 73, 77, 81, 86, 91, 99, 105, + 112, 119, 127, + /* Size 16x32 */ + 32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79, 31, 32, + 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75, 31, 32, 32, 32, + 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75, 31, 32, 32, 32, 32, 33, + 35, 36, 41, 42, 50, 50, 58, 60, 67, 73, 31, 32, 32, 32, 33, 33, 34, 36, + 41, 41, 49, 49, 57, 59, 66, 72, 31, 32, 32, 33, 33, 34, 35, 37, 41, 42, + 49, 49, 57, 59, 66, 71, 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50, + 57, 59, 65, 71, 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59, + 65, 70, 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69, + 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69, 34, 34, + 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 34, 34, 34, 36, + 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 35, 34, 34, 37, 38, 42, + 47, 48, 52, 53, 59, 59, 65, 67, 73, 77, 36, 35, 34, 37, 38, 43, 48, 49, + 54, 54, 60, 60, 66, 68, 74, 78, 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, + 63, 63, 69, 71, 77, 81, 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, + 71, 73, 79, 84, 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, + 81, 86, 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90, + 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91, 48, 45, + 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 48, 45, 45, 46, + 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 53, 49, 49, 49, 49, 54, + 59, 62, 71, 71, 81, 81, 89, 91, 98, 103, 53, 50, 49, 50, 50, 54, 60, 63, + 71, 72, 82, 82, 90, 92, 99, 103, 57, 53, 52, 52, 52, 57, 62, 65, 74, 75, + 85, 85, 94, 96, 103, 108, 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87, + 87, 95, 98, 105, 110, 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89, + 98, 101, 108, 114, 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102, + 105, 112, 118, 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106, + 114, 120, 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, + 125, 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125, + 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, + 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, + 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42, 45, 45, 49, + 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57, + 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 36, 36, + 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50, 52, 54, 56, 59, 60, 64, 64, + 71, 71, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, + 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, + 34, 33, 33, 34, 
35, 35, 37, 37, 39, 39, 42, 43, 44, 45, 46, 48, 48, 51, + 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 36, 35, 35, 35, 34, 35, + 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, + 63, 66, 68, 69, 73, 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, + 44, 44, 48, 49, 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72, + 76, 76, 82, 82, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, + 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, + 44, 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64, + 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91, 53, 51, 51, 50, + 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, + 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 53, 51, 51, 50, 49, 49, 50, + 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, + 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, + 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102, + 103, 108, 108, 115, 115, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, + 67, 68, 71, 73, 76, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111, + 111, 118, 118, 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74, + 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119, + 127, 127, 79, 75, 75, 73, 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81, + 84, 86, 90, 91, 96, 96, 103, 103, 108, 110, 114, 118, 120, 125, 125, + 133, 133, + /* Size 4x16 */ + 31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59, 32, 37, + 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71, 39, 46, 60, 76, + 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92, 54, 58, 76, 98, 60, 63, + 80, 105, 66, 68, 85, 111, 73, 74, 91, 118, + /* Size 16x4 */ + 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73, 34, 34, + 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 44, 43, 41, 43, + 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91, 65, 62, 59, 59, 58, 63, + 67, 71, 76, 81, 85, 92, 98, 105, 111, 118, + /* Size 8x32 */ + 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70, 31, 32, + 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67, 31, 32, 33, 34, + 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, + 57, 65, 32, 32, 34, 37, 42, 49, 56, 65, 32, 33, 35, 38, 42, 49, 56, 64, + 32, 33, 35, 39, 43, 50, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 34, 34, + 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48, + 54, 60, 66, 74, 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65, + 71, 79, 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85, + 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 48, 45, + 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98, 53, 49, 50, 60, + 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103, 58, 54, 54, 63, 75, 87, + 95, 105, 61, 57, 56, 66, 77, 89, 98, 108, 65, 60, 58, 68, 79, 92, 102, + 112, 67, 61, 60, 69, 81, 94, 103, 114, 71, 65, 63, 73, 84, 97, 108, 119, + 72, 66, 64, 73, 85, 98, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127, 79, + 72, 70, 79, 90, 104, 115, 127, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44, + 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, + 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 33, 33, 34, 34, + 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, + 58, 60, 63, 64, 70, 70, 36, 35, 35, 35, 34, 
35, 36, 37, 38, 39, 42, 42, + 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73, + 79, 79, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58, + 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 53, 51, + 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, + 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, + 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, + 94, 95, 98, 102, 103, 108, 108, 115, 115, 73, 70, 69, 67, 66, 66, 65, + 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, + 105, 108, 112, 114, 119, 119, 127, 127 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 42, 47, 53, 42, 48, 50, 54, 47, 50, 61, 67, 53, 54, 67, 78, + /* Size 8x8 */ + 31, 32, 38, 48, 47, 50, 53, 57, 32, 35, 42, 47, 45, 47, 50, 54, 38, 42, + 47, 48, 45, 47, 49, 52, 48, 47, 48, 53, 53, 54, 56, 58, 47, 45, 45, 53, + 58, 61, 63, 65, 50, 47, 47, 54, 61, 66, 69, 72, 53, 50, 49, 56, 63, 69, + 73, 77, 57, 54, 52, 58, 65, 72, 77, 82, + /* Size 16x16 */ + 32, 31, 30, 33, 36, 41, 47, 49, 49, 49, 50, 52, 54, 57, 60, 63, 31, 31, + 31, 34, 38, 42, 46, 47, 47, 47, 48, 50, 52, 54, 57, 60, 30, 31, 32, 35, + 40, 42, 45, 46, 45, 45, 46, 47, 49, 52, 54, 57, 33, 34, 35, 39, 43, 45, + 47, 46, 46, 45, 46, 47, 49, 51, 53, 56, 36, 38, 40, 43, 47, 47, 47, 47, + 46, 45, 46, 47, 48, 50, 52, 54, 41, 42, 42, 45, 47, 48, 50, 50, 49, 49, + 50, 50, 52, 53, 55, 57, 47, 46, 45, 47, 47, 50, 52, 52, 52, 52, 53, 53, + 55, 56, 58, 60, 49, 47, 46, 46, 47, 50, 52, 53, 54, 55, 55, 56, 57, 58, + 60, 62, 49, 47, 45, 46, 46, 49, 52, 54, 55, 57, 58, 59, 60, 61, 63, 65, + 49, 47, 45, 45, 45, 49, 52, 55, 57, 59, 60, 61, 63, 64, 66, 68, 50, 48, + 46, 46, 46, 50, 53, 55, 58, 60, 61, 63, 65, 67, 68, 71, 52, 50, 47, 47, + 47, 50, 53, 56, 59, 61, 63, 66, 68, 70, 72, 75, 54, 52, 49, 49, 48, 52, + 55, 57, 60, 63, 65, 68, 71, 73, 75, 78, 57, 54, 52, 51, 50, 53, 56, 58, + 61, 64, 67, 70, 73, 76, 79, 82, 60, 57, 54, 53, 52, 55, 58, 60, 63, 66, + 68, 72, 75, 79, 82, 85, 63, 60, 57, 56, 54, 57, 60, 62, 65, 68, 71, 75, + 78, 82, 85, 89, + /* Size 32x32 */ + 32, 31, 31, 30, 30, 32, 33, 34, 36, 37, 41, 41, 47, 49, 49, 48, 49, 49, + 49, 50, 50, 52, 52, 54, 54, 56, 57, 58, 60, 60, 63, 63, 31, 31, 31, 31, + 31, 32, 34, 35, 38, 38, 42, 42, 46, 48, 47, 47, 47, 47, 47, 48, 48, 50, + 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 31, 31, 31, 31, 31, 33, 34, 35, + 38, 39, 42, 42, 46, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 51, 52, 53, + 54, 55, 57, 57, 60, 60, 30, 31, 31, 31, 31, 33, 35, 36, 39, 40, 42, 42, + 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55, + 58, 58, 30, 31, 31, 31, 32, 33, 35, 36, 40, 40, 42, 42, 45, 46, 46, 45, + 45, 45, 45, 46, 46, 47, 47, 49, 49, 51, 52, 52, 54, 54, 57, 57, 32, 32, + 33, 33, 33, 35, 37, 38, 41, 42, 43, 43, 46, 47, 46, 46, 45, 45, 45, 46, + 46, 47, 47, 49, 49, 50, 51, 52, 54, 54, 57, 57, 33, 34, 34, 35, 35, 37, + 39, 40, 43, 43, 45, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49, + 49, 50, 51, 52, 53, 54, 56, 56, 34, 35, 35, 36, 36, 38, 40, 41, 44, 44, + 45, 45, 47, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 48, 49, 50, 51, 51, + 53, 53, 55, 55, 36, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 47, 48, + 47, 46, 46, 45, 45, 46, 46, 46, 47, 48, 48, 49, 50, 50, 52, 52, 54, 54, + 37, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 45, + 46, 46, 46, 47, 47, 48, 48, 49, 50, 51, 52, 52, 55, 55, 41, 42, 42, 42, + 42, 43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, + 50, 51, 52, 52, 
53, 54, 55, 55, 57, 57, 41, 42, 42, 42, 42, 43, 45, 45, + 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, + 53, 54, 55, 55, 57, 57, 47, 46, 46, 46, 45, 46, 47, 47, 47, 48, 50, 50, + 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 54, 55, 55, 56, 56, 58, 58, + 60, 60, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, 50, 52, 53, 53, 53, + 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 58, 60, 60, 49, 47, + 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, + 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62, 48, 47, 47, 46, 45, 46, + 46, 46, 46, 47, 49, 49, 52, 53, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, + 58, 59, 60, 60, 61, 62, 63, 63, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, + 49, 49, 52, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, + 63, 63, 65, 65, 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, + 55, 55, 57, 58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 67, 67, + 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59, + 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 50, 48, 48, 47, + 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, + 63, 65, 65, 66, 67, 67, 68, 69, 71, 71, 50, 48, 48, 47, 46, 46, 46, 46, + 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, 63, 65, 65, 66, + 67, 67, 68, 69, 71, 71, 52, 50, 49, 48, 47, 47, 47, 47, 46, 47, 50, 50, + 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 67, 68, 69, 70, 71, 72, 72, + 74, 74, 52, 50, 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, + 59, 61, 61, 63, 63, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 54, 51, + 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, 55, 57, 58, 60, 62, 62, 65, + 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 77, 54, 52, 52, 50, 49, 49, + 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, 70, + 71, 72, 73, 74, 75, 76, 78, 78, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, + 52, 52, 55, 56, 58, 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 75, 75, + 77, 77, 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56, + 58, 60, 61, 63, 64, 67, 67, 70, 70, 72, 73, 75, 76, 77, 79, 79, 82, 82, + 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64, + 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 80, 80, 83, 83, 60, 57, 57, 55, + 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72, + 72, 74, 75, 77, 79, 80, 82, 82, 85, 85, 60, 57, 57, 55, 54, 54, 54, 53, + 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, 76, 77, + 79, 80, 82, 82, 85, 85, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, + 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, + 89, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63, + 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89, + /* Size 4x8 */ + 31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57, 45, 49, + 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79, + /* Size 8x4 */ + 31, 33, 40, 47, 45, 48, 51, 55, 42, 44, 47, 50, 49, 50, 52, 55, 47, 45, + 46, 54, 59, 61, 63, 66, 54, 51, 50, 57, 64, 70, 75, 79, + /* Size 8x16 */ + 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57, 30, 32, + 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54, 37, 40, 47, 47, + 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, + 55, 58, 48, 46, 47, 53, 55, 56, 58, 61, 48, 45, 46, 53, 57, 59, 61, 63, + 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, + 47, 54, 61, 66, 70, 73, 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, + 64, 
70, 75, 79, 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, + 80, 86, + /* Size 16x8 */ + 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63, 31, 31, + 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57, 37, 38, 40, 43, + 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55, 48, 47, 46, 47, 47, 50, + 52, 53, 53, 53, 54, 54, 55, 56, 58, 60, 49, 47, 45, 46, 45, 49, 53, 55, + 57, 58, 59, 61, 62, 64, 65, 67, 52, 50, 48, 47, 47, 50, 53, 56, 59, 62, + 64, 66, 68, 70, 72, 75, 56, 53, 51, 50, 49, 53, 55, 58, 61, 64, 66, 70, + 72, 75, 77, 80, 61, 57, 55, 54, 52, 56, 58, 61, 63, 66, 69, 73, 76, 79, + 82, 86, + /* Size 16x32 */ + 32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31, 31, + 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31, 31, 36, + 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32, 37, 39, 42, + 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37, 40, 42, 46, 46, + 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41, 44, 46, 46, 45, 45, + 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45, 47, 46, 46, 46, 47, 47, + 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47, 47, 45, 46, 47, 47, 50, 51, + 53, 55, 37, 40, 40, 45, 47, 47, 47, 47, 45, 46, 47, 47, 49, 50, 52, 54, + 37, 40, 40, 45, 47, 47, 48, 47, 46, 46, 47, 47, 49, 50, 53, 55, 42, 43, + 43, 46, 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 42, 43, 43, 46, + 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, + 52, 52, 53, 53, 53, 53, 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, + 53, 54, 54, 54, 56, 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, + 56, 56, 58, 58, 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, + 59, 60, 62, 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, + 63, 65, 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, + 49, 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47, + 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46, 46, + 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47, 47, 50, + 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47, 50, 54, 56, + 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51, 55, 57, 62, 62, + 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55, 57, 62, 63, 68, 68, + 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58, 63, 63, 69, 69, 74, 75, + 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64, 64, 70, 70, 75, 76, 79, 82, + 58, 53, 53, 51, 51, 54, 57, 59, 64, 65, 71, 71, 76, 77, 80, 83, 60, 55, + 54, 53, 52, 55, 58, 60, 65, 66, 72, 72, 77, 79, 82, 85, 60, 55, 55, 53, + 53, 55, 59, 60, 65, 66, 73, 73, 78, 79, 83, 85, 63, 58, 57, 56, 55, 58, + 60, 62, 67, 68, 75, 75, 80, 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, + 67, 68, 75, 75, 80, 82, 86, 89, + /* Size 32x16 */ + 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, + 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32, + 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, + 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, 31, 31, 31, 32, 32, 34, 36, 37, + 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51, + 52, 53, 54, 55, 57, 57, 35, 36, 36, 37, 37, 39, 40, 42, 45, 45, 46, 46, + 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53, + 56, 56, 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, + 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, 42, 42, + 42, 42, 42, 44, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 49, 49, 50, + 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 58, 
58, 48, 47, 47, 46, 46, 46, + 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, + 55, 56, 56, 57, 58, 59, 60, 60, 48, 47, 47, 46, 46, 46, 46, 47, 47, 47, + 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, + 60, 60, 62, 62, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, + 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59, + 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 52, 50, 50, 48, + 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, + 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 52, 50, 50, 48, 48, 48, 47, 47, + 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, + 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53, + 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78, + 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 57, 58, 60, + 61, 64, 64, 67, 67, 70, 71, 72, 73, 75, 76, 77, 79, 79, 82, 82, 61, 58, + 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, + 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, 63, 60, 60, 58, 57, 57, + 56, 55, 54, 55, 57, 57, 60, 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77, + 78, 80, 82, 83, 85, 85, 89, 89, + /* Size 4x16 */ + 31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51, 40, 47, + 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58, 46, 49, 57, 61, + 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71, 50, 52, 63, 73, 52, 53, + 64, 76, 55, 55, 66, 79, 58, 58, 68, 82, + /* Size 16x4 */ + 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58, 42, 42, + 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58, 49, 47, 45, 46, + 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68, 57, 54, 52, 51, 50, 53, + 56, 58, 61, 64, 67, 71, 73, 76, 79, 82, + /* Size 8x32 */ + 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58, 31, 31, + 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56, 30, 32, 40, 46, + 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54, 33, 36, 43, 47, 46, 47, + 50, 54, 34, 37, 44, 47, 45, 47, 50, 53, 37, 40, 47, 47, 45, 47, 49, 52, + 37, 40, 47, 48, 46, 47, 49, 53, 42, 43, 47, 50, 49, 50, 53, 56, 42, 43, + 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, + 53, 54, 56, 59, 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, + 59, 62, 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66, + 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 50, 46, + 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72, 52, 48, 47, 54, + 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75, 54, 50, 49, 55, 62, 68, + 72, 76, 55, 51, 49, 56, 63, 69, 74, 78, 57, 52, 50, 56, 64, 70, 75, 79, + 58, 53, 51, 57, 64, 71, 76, 80, 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, + 53, 59, 65, 73, 78, 83, 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, + 67, 75, 80, 86, + /* Size 32x8 */ + 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49, + 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32, + 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, + 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 37, 38, 38, 39, 40, 41, 43, 44, + 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, + 50, 51, 52, 53, 55, 55, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50, + 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59, + 60, 60, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 
53, 55, 56, + 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, 52, 50, + 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, + 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, + 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, + 72, 74, 75, 76, 77, 78, 80, 80, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, + 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, + 82, 83, 86, 86 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 33, 42, 55, 33, 38, 46, 57, 42, 46, 63, 75, 55, 57, 75, 92, + /* Size 8x8 */ + 31, 32, 32, 34, 38, 46, 52, 63, 32, 32, 32, 34, 37, 44, 49, 59, 32, 32, + 35, 37, 40, 45, 49, 58, 34, 34, 37, 42, 47, 52, 56, 65, 38, 37, 40, 47, + 54, 60, 65, 73, 46, 44, 45, 52, 60, 69, 75, 84, 52, 49, 49, 56, 65, 75, + 82, 92, 63, 59, 58, 65, 73, 84, 92, 105, + /* Size 16x16 */ + 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 54, 58, 61, 65, 31, 32, + 32, 32, 32, 32, 34, 35, 38, 40, 42, 46, 51, 55, 58, 62, 31, 32, 32, 32, + 32, 32, 33, 34, 37, 38, 41, 44, 49, 53, 56, 59, 31, 32, 32, 33, 33, 33, + 35, 36, 38, 40, 42, 45, 49, 53, 56, 59, 32, 32, 32, 33, 34, 34, 36, 37, + 39, 40, 42, 45, 49, 53, 55, 59, 32, 32, 32, 33, 34, 35, 37, 38, 40, 41, + 42, 46, 49, 52, 55, 58, 34, 34, 33, 35, 36, 37, 39, 42, 44, 46, 47, 51, + 54, 57, 60, 63, 36, 35, 34, 36, 37, 38, 42, 48, 50, 52, 54, 57, 60, 63, + 65, 68, 38, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72, + 41, 40, 38, 40, 40, 41, 46, 52, 54, 57, 60, 63, 67, 70, 73, 75, 44, 42, + 41, 42, 42, 42, 47, 54, 57, 60, 63, 67, 71, 74, 77, 79, 48, 46, 44, 45, + 45, 46, 51, 57, 60, 63, 67, 71, 76, 79, 82, 85, 54, 51, 49, 49, 49, 49, + 54, 60, 64, 67, 71, 76, 82, 86, 89, 92, 58, 55, 53, 53, 53, 52, 57, 63, + 67, 70, 74, 79, 86, 90, 93, 97, 61, 58, 56, 56, 55, 55, 60, 65, 69, 73, + 77, 82, 89, 93, 97, 101, 65, 62, 59, 59, 59, 58, 63, 68, 72, 75, 79, 85, + 92, 97, 101, 105, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, + 41, 44, 44, 47, 48, 50, 54, 54, 58, 59, 61, 65, 65, 70, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 46, + 47, 49, 52, 52, 56, 57, 59, 63, 63, 67, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 45, 46, 48, 51, 51, + 55, 56, 58, 62, 62, 67, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 35, 35, 37, 38, 39, 42, 42, 45, 45, 47, 50, 50, 54, 55, 57, 61, + 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 37, 37, 38, 41, 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 37, 38, 41, + 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, + 49, 49, 53, 54, 56, 59, 59, 63, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, + 34, 35, 35, 36, 36, 36, 38, 39, 40, 42, 42, 45, 45, 47, 50, 50, 53, 54, + 56, 59, 59, 63, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, + 37, 37, 39, 39, 40, 42, 42, 45, 45, 47, 49, 49, 53, 54, 55, 59, 59, 63, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, + 41, 42, 42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 32, 32, 32, 32, + 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 42, 42, 45, + 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 33, 33, 33, 33, 33, 33, 34, 35, + 35, 36, 36, 38, 39, 40, 42, 42, 43, 44, 45, 46, 46, 49, 50, 51, 53, 53, + 56, 57, 59, 62, 62, 66, 34, 34, 34, 
34, 33, 33, 35, 35, 36, 37, 37, 39, + 39, 41, 42, 42, 44, 45, 46, 47, 47, 50, 51, 52, 54, 54, 57, 58, 60, 63, + 63, 67, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, 45, 45, + 46, 47, 48, 50, 50, 52, 53, 54, 56, 56, 59, 60, 62, 65, 65, 69, 36, 35, + 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, + 54, 56, 57, 58, 60, 60, 63, 64, 65, 68, 68, 72, 36, 35, 35, 35, 34, 34, + 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58, + 60, 60, 63, 64, 65, 68, 68, 72, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, + 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, + 69, 72, 72, 76, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47, + 50, 50, 53, 54, 55, 58, 58, 60, 61, 62, 65, 65, 68, 69, 70, 73, 73, 77, + 41, 40, 40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55, + 57, 60, 60, 62, 63, 65, 67, 67, 70, 71, 73, 75, 75, 79, 44, 42, 42, 42, + 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, + 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 42, 41, 41, 42, 42, + 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, + 74, 75, 77, 79, 79, 83, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 49, + 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 70, 72, 75, 75, 78, 79, 81, 84, + 84, 88, 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, + 60, 61, 63, 67, 67, 70, 71, 73, 76, 76, 79, 80, 82, 85, 85, 89, 50, 49, + 48, 47, 46, 46, 47, 47, 47, 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, + 68, 72, 73, 75, 78, 78, 82, 83, 85, 88, 88, 92, 54, 52, 51, 50, 49, 49, + 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, + 82, 82, 86, 87, 89, 92, 92, 96, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, + 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, + 89, 92, 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, + 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, + 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69, + 71, 75, 75, 79, 80, 83, 87, 87, 91, 92, 94, 98, 98, 102, 61, 59, 58, 57, + 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77, 77, 81, + 82, 85, 89, 89, 93, 94, 97, 101, 101, 105, 65, 63, 62, 61, 59, 59, 59, + 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, + 92, 97, 98, 101, 105, 105, 109, 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, + 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, + 101, 105, 105, 109, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, + 69, 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, 109, + 109, 114, + /* Size 4x8 */ + 32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59, 38, 40, + 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97, + /* Size 8x4 */ + 32, 32, 32, 34, 38, 44, 50, 61, 32, 33, 35, 37, 40, 45, 50, 58, 42, 41, + 42, 50, 58, 66, 71, 79, 56, 53, 52, 59, 68, 78, 86, 97, + /* Size 8x16 */ + 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62, 31, 32, + 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36, + 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 34, 34, 37, 41, 44, 48, + 54, 63, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, + 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 48, 45, + 46, 54, 60, 67, 76, 85, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, + 67, 74, 86, 97, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, + 92, 105, + /* Size 16x8 */ + 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 
53, 57, 61, 65, 31, 32, + 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60, 32, 32, 33, 34, + 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 35, 35, 34, 35, 36, 37, + 41, 46, 47, 49, 51, 54, 57, 60, 63, 66, 39, 38, 37, 38, 39, 40, 44, 50, + 52, 54, 57, 60, 64, 67, 69, 72, 44, 42, 41, 42, 42, 42, 48, 54, 57, 60, + 63, 67, 71, 74, 77, 79, 53, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76, + 82, 86, 89, 92, 65, 62, 59, 59, 58, 58, 63, 68, 72, 76, 79, 85, 92, 97, + 100, 105, + /* Size 16x32 */ + 32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65, 31, 32, + 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63, 31, 32, 32, 32, + 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62, 31, 32, 32, 32, 32, 32, + 34, 35, 37, 41, 41, 48, 50, 54, 61, 61, 31, 32, 32, 32, 33, 33, 34, 34, + 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, + 41, 47, 49, 53, 59, 59, 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, + 49, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53, + 59, 59, 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58, + 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 32, 32, + 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 33, 33, 33, 35, + 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62, 34, 34, 34, 35, 37, 37, + 41, 42, 44, 48, 48, 53, 54, 57, 63, 63, 34, 34, 34, 35, 37, 37, 43, 44, + 46, 50, 50, 55, 56, 59, 65, 65, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, + 54, 58, 60, 63, 68, 68, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, + 60, 63, 68, 68, 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, + 72, 72, 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73, + 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76, 44, 41, + 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 44, 41, 41, 42, + 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 47, 44, 44, 44, 45, 45, + 53, 56, 59, 66, 66, 73, 75, 78, 84, 84, 48, 45, 45, 45, 46, 46, 54, 56, + 60, 67, 67, 74, 76, 79, 85, 85, 50, 47, 46, 47, 47, 47, 55, 58, 61, 68, + 68, 76, 78, 82, 88, 88, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, + 82, 86, 92, 92, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, + 92, 92, 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97, + 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98, 61, 57, + 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100, 65, 61, 60, + 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 65, 61, 60, 59, + 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 70, 65, 64, 63, 62, + 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, + 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 41, 41, 44, + 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49, + 53, 54, 56, 60, 60, 64, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, + 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 50, 50, 53, 54, 56, 59, + 59, 63, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, + 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 32, 32, + 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, + 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, + 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, + 57, 57, 60, 61, 63, 66, 66, 70, 
36, 35, 35, 35, 34, 34, 36, 36, 37, 38, + 38, 41, 42, 44, 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63, + 65, 68, 68, 72, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, + 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, + 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, + 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 41, + 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, + 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 51, 49, 49, 48, 47, 47, 48, 48, + 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73, 74, 76, 79, 79, + 83, 84, 86, 89, 89, 93, 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, + 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, + 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, + 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, 65, 63, + 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, + 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, 65, 63, 62, 61, 59, + 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, + 88, 92, 92, 97, 98, 100, 105, 105, 109, + /* Size 4x16 */ + 31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53, 32, 34, + 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63, 37, 40, 57, 67, + 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79, 50, 50, 71, 86, 54, 53, + 74, 90, 57, 56, 77, 93, 61, 58, 79, 97, + /* Size 16x4 */ + 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61, 32, 32, + 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 44, 42, 41, 42, + 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 58, 55, 53, 53, 53, 52, + 57, 63, 67, 70, 74, 79, 86, 90, 93, 97, + /* Size 8x32 */ + 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63, 31, 32, + 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61, 31, 32, 33, 34, + 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, + 49, 59, 32, 32, 34, 36, 38, 42, 50, 59, 32, 32, 34, 36, 39, 42, 49, 58, + 32, 33, 35, 37, 40, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 33, 33, + 36, 40, 43, 46, 53, 62, 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43, + 46, 50, 56, 65, 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54, + 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73, + 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 44, 41, + 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84, 48, 45, 46, 54, + 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88, 53, 49, 50, 57, 64, 71, + 82, 92, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97, + 58, 54, 54, 61, 68, 75, 87, 98, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, + 58, 66, 72, 79, 92, 105, 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, + 70, 76, 83, 96, 109, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39, + 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, + 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 32, 32, 32, 32, 33, 33, 34, 34, + 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, + 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, + 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66, + 66, 70, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50, + 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 44, 42, + 42, 41, 41, 41, 42, 42, 42, 42, 42, 
46, 48, 50, 54, 54, 57, 58, 60, 63, + 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 53, 52, 51, 50, 49, 49, + 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, + 82, 82, 86, 87, 89, 92, 92, 96, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, + 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, + 100, 105, 105, 109 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 41, 46, 51, 41, 48, 48, 51, 46, 48, 58, 62, 51, 51, 62, 71, + /* Size 8x8 */ + 31, 31, 38, 44, 47, 48, 50, 55, 31, 32, 40, 44, 45, 46, 47, 52, 38, 40, + 47, 47, 46, 46, 47, 50, 44, 44, 47, 50, 51, 51, 52, 54, 47, 45, 46, 51, + 54, 56, 57, 60, 48, 46, 46, 51, 56, 61, 63, 66, 50, 47, 47, 52, 57, 63, + 66, 70, 55, 52, 50, 54, 60, 66, 70, 76, + /* Size 16x16 */ + 32, 31, 30, 33, 34, 36, 41, 49, 48, 49, 49, 50, 52, 54, 55, 57, 31, 31, + 31, 34, 36, 38, 42, 47, 47, 47, 47, 48, 50, 51, 53, 54, 30, 31, 32, 34, + 37, 40, 42, 46, 45, 45, 45, 46, 47, 49, 50, 52, 33, 34, 34, 37, 40, 42, + 44, 47, 46, 46, 45, 46, 47, 49, 50, 51, 34, 36, 37, 40, 42, 45, 46, 47, + 46, 46, 45, 46, 47, 48, 49, 50, 36, 38, 40, 42, 45, 47, 47, 48, 47, 46, + 45, 46, 47, 48, 49, 50, 41, 42, 42, 44, 46, 47, 48, 50, 50, 49, 49, 50, + 50, 51, 52, 53, 49, 47, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 55, + 56, 56, 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59, + 49, 47, 45, 46, 46, 46, 49, 53, 54, 55, 57, 58, 59, 60, 60, 61, 49, 47, + 45, 45, 45, 45, 49, 53, 55, 57, 58, 60, 61, 62, 63, 63, 50, 48, 46, 46, + 46, 46, 50, 54, 56, 58, 60, 61, 63, 65, 66, 67, 52, 50, 47, 47, 47, 47, + 50, 54, 57, 59, 61, 63, 66, 68, 69, 70, 54, 51, 49, 49, 48, 48, 51, 55, + 58, 60, 62, 65, 68, 70, 71, 73, 55, 53, 50, 50, 49, 49, 52, 56, 58, 60, + 63, 66, 69, 71, 73, 74, 57, 54, 52, 51, 50, 50, 53, 56, 59, 61, 63, 67, + 70, 73, 74, 76, + /* Size 32x32 */ + 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 48, 48, + 49, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31, + 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 47, 47, 47, 47, 47, 48, + 49, 49, 50, 50, 52, 52, 53, 55, 55, 57, 31, 31, 31, 31, 31, 31, 34, 34, + 36, 38, 38, 41, 42, 44, 47, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 50, + 51, 52, 53, 54, 54, 56, 31, 31, 31, 31, 31, 31, 34, 35, 36, 39, 39, 41, + 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, + 53, 55, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, + 45, 45, 45, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 30, 31, + 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 45, 45, 45, 45, + 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 33, 33, 34, 34, 34, 34, + 37, 38, 40, 42, 42, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, + 47, 47, 49, 49, 50, 51, 51, 53, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, + 43, 44, 45, 46, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, + 50, 51, 51, 53, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, + 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, + 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, + 46, 45, 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 36, 38, 38, 39, + 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 46, + 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 40, 41, 41, 41, 42, 42, 44, 44, + 45, 47, 47, 48, 48, 49, 50, 50, 49, 49, 49, 48, 48, 49, 49, 49, 49, 49, + 51, 51, 51, 52, 52, 54, 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48, + 48, 49, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 53, + 53, 55, 44, 44, 44, 
44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 53, 53, 54, 54, 54, 56, 49, 48, + 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, + 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58, 49, 48, 47, 47, 46, 46, + 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, + 54, 54, 55, 55, 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, + 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, + 58, 59, 59, 60, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, + 53, 53, 54, 54, 55, 55, 55, 56, 56, 57, 57, 57, 58, 58, 59, 60, 60, 61, + 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55, + 55, 57, 57, 57, 58, 58, 59, 59, 60, 60, 60, 61, 61, 63, 49, 47, 47, 46, + 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, + 60, 60, 61, 61, 62, 62, 63, 63, 63, 65, 49, 47, 47, 46, 45, 45, 45, 45, + 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, + 62, 62, 63, 63, 63, 65, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, + 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 61, 62, 63, 63, 64, 64, 65, 66, + 66, 67, 50, 49, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, + 56, 56, 58, 60, 60, 61, 61, 62, 63, 63, 65, 65, 66, 67, 67, 68, 51, 49, + 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 51, 54, 54, 56, 57, 58, 60, + 60, 62, 62, 63, 65, 65, 66, 66, 67, 68, 68, 70, 52, 50, 50, 49, 47, 47, + 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, + 66, 66, 68, 68, 69, 70, 70, 72, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, + 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68, + 69, 70, 70, 72, 54, 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, + 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, + 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58, + 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 72, 73, 73, 75, 55, 53, 53, 52, + 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65, + 66, 67, 69, 69, 71, 72, 73, 74, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, + 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, + 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52, + 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76, + 76, 78, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, 58, 58, + 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, 80, + /* Size 4x8 */ + 31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53, 46, 47, + 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73, + /* Size 8x4 */ + 31, 32, 39, 44, 46, 47, 48, 53, 38, 40, 47, 47, 47, 46, 47, 50, 47, 45, + 45, 51, 56, 59, 61, 64, 52, 49, 48, 53, 58, 64, 68, 73, + /* Size 8x16 */ + 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54, 30, 32, + 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, 35, 37, 44, 46, + 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 42, 43, 47, 49, 50, 49, + 50, 53, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, + 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, + 46, 52, 56, 59, 64, 67, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, + 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, + 70, 76, + /* Size 16x8 */ + 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57, 31, 31, + 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52, 37, 38, 40, 42, + 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 45, 45, 
44, 46, 46, 47, + 49, 52, 51, 51, 51, 52, 53, 54, 54, 55, 48, 47, 45, 46, 46, 47, 50, 53, + 54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 45, 45, 45, 49, 53, 55, 57, + 58, 59, 61, 62, 63, 64, 52, 50, 48, 47, 47, 47, 50, 54, 57, 59, 61, 64, + 66, 68, 69, 70, 57, 54, 52, 51, 51, 50, 53, 57, 59, 61, 64, 67, 71, 73, + 74, 76, + /* Size 16x32 */ + 32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31, 31, + 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31, 31, 34, + 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32, 34, 39, 39, + 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35, 40, 40, 44, 46, + 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40, 40, 44, 46, 45, 45, + 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42, 46, 47, 46, 45, 45, 47, + 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46, 47, 46, 46, 46, 47, 47, 49, + 51, 51, 35, 37, 37, 40, 44, 44, 46, 47, 46, 45, 45, 47, 47, 48, 51, 51, + 37, 39, 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 37, 39, + 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 41, 42, 42, 44, + 47, 47, 49, 49, 49, 48, 48, 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, + 49, 50, 50, 49, 49, 50, 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, + 51, 51, 51, 52, 52, 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, + 53, 54, 54, 55, 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, + 54, 55, 57, 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, + 59, 59, 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, + 48, 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46, + 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45, 45, + 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46, 46, 46, + 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46, 46, 52, 54, + 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47, 52, 54, 56, 60, + 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, + 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, 66, 68, + 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, + 54, 51, 50, 49, 49, 49, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, 55, 51, + 51, 50, 49, 49, 54, 56, 58, 63, 63, 68, 69, 71, 74, 74, 57, 53, 52, 51, + 50, 50, 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 57, 53, 52, 51, 50, 50, + 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, + 61, 65, 65, 70, 72, 74, 78, 78, + /* Size 32x16 */ + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, + 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31, + 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, + 47, 48, 48, 48, 50, 51, 51, 53, 53, 55, 31, 31, 31, 32, 32, 32, 35, 36, + 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, + 49, 50, 51, 52, 52, 54, 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44, + 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, + 51, 53, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, + 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 37, 38, + 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, + 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, + 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, + 53, 53, 54, 54, 54, 55, 55, 57, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, + 47, 49, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, + 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 
46, 47, 47, 49, 50, 51, + 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, + 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, + 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 49, 47, 47, 46, + 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, + 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 51, 50, 49, 48, 47, 47, 47, 47, + 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62, 63, 64, 65, 65, + 67, 67, 68, 69, 69, 70, 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50, + 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70, + 70, 72, 54, 52, 51, 51, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, + 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, 57, 55, + 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, + 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, + 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, + 71, 71, 73, 73, 74, 76, 76, 78, + /* Size 4x16 */ + 31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49, 37, 44, + 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55, 46, 47, 55, 58, + 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65, 48, 47, 61, 68, 50, 48, + 62, 70, 51, 49, 63, 71, 53, 50, 64, 73, + /* Size 16x4 */ + 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53, 37, 38, + 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 49, 47, 45, 45, + 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 54, 51, 49, 49, 48, 48, + 51, 55, 58, 60, 62, 65, 68, 70, 71, 73, + /* Size 8x32 */ + 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55, 31, 31, + 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53, 30, 32, 40, 44, + 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, + 47, 51, 33, 36, 43, 46, 46, 46, 47, 51, 35, 37, 44, 46, 46, 45, 47, 51, + 37, 40, 47, 47, 47, 45, 47, 50, 37, 40, 47, 47, 47, 45, 47, 50, 41, 42, + 47, 49, 49, 48, 50, 52, 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, + 51, 51, 52, 54, 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, + 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60, + 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 49, 45, + 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66, 50, 46, 46, 52, + 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68, 52, 48, 47, 53, 57, 61, + 66, 71, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73, + 54, 50, 49, 54, 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, + 50, 55, 59, 64, 70, 76, 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, + 61, 65, 72, 78, + /* Size 32x8 */ + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48, + 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 32, + 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, + 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 37, 38, 38, 39, 40, 40, 42, 43, + 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, + 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, + 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55, + 55, 57, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, + 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, 49, 47, + 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, + 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 52, 50, 50, 49, 48, 48, + 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 
64, 65, + 66, 66, 68, 68, 69, 70, 70, 72, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, + 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, + 74, 76, 76, 78 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 38, 51, 32, 35, 40, 49, 38, 40, 54, 64, 51, 49, 64, 81, + /* Size 8x8 */ + 31, 32, 32, 34, 35, 41, 47, 53, 32, 32, 32, 33, 34, 40, 44, 50, 32, 32, + 34, 35, 37, 41, 45, 51, 34, 33, 35, 39, 42, 47, 51, 55, 35, 34, 37, 42, + 48, 53, 57, 61, 41, 40, 41, 47, 53, 60, 65, 70, 47, 44, 45, 51, 57, 65, + 71, 77, 53, 50, 51, 55, 61, 70, 77, 85, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 31, 32, + 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 31, 32, 32, 32, + 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 31, 32, 32, 32, 32, 33, + 33, 34, 35, 36, 38, 41, 44, 45, 49, 54, 31, 32, 32, 32, 33, 34, 34, 35, + 36, 38, 39, 42, 45, 46, 50, 54, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, + 40, 42, 45, 46, 49, 53, 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, + 47, 48, 51, 55, 34, 34, 33, 34, 35, 37, 38, 39, 42, 44, 45, 47, 50, 51, + 54, 58, 36, 35, 34, 35, 36, 38, 40, 42, 48, 50, 50, 54, 56, 57, 60, 64, + 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, 60, 63, 67, 39, 38, + 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, 69, 44, 42, 41, 41, + 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 47, 45, 44, 44, 45, 45, + 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 49, 47, 46, 45, 46, 46, 48, 51, + 57, 60, 62, 68, 71, 73, 77, 81, 54, 51, 50, 49, 50, 49, 51, 54, 60, 63, + 65, 71, 75, 77, 82, 87, 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75, + 79, 81, 87, 92, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 36, + 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 54, 55, 59, 59, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 39, 39, 41, + 43, 43, 46, 47, 48, 51, 52, 53, 57, 57, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 43, 45, 46, + 47, 51, 51, 53, 56, 56, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 42, 45, 46, 47, 51, 51, 52, + 56, 56, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, + 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 50, 51, 54, 54, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 36, 37, + 37, 40, 41, 41, 44, 44, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 36, 38, 38, 40, 41, 41, + 44, 45, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 34, 35, 35, 35, 36, 36, 38, 39, 39, 41, 42, 42, 44, 45, 46, 49, + 50, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, + 35, 36, 36, 36, 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 50, 51, 54, 54, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, + 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, + 32, 32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, + 42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, 42, 43, 45, 46, + 46, 49, 49, 50, 53, 53, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, + 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 44, 45, 47, 47, 48, 51, 51, 52, + 55, 55, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41, + 42, 42, 44, 45, 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 34, 34, + 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 
39, 39, 41, 42, 42, 44, 45, + 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 35, 34, 34, 34, 34, 34, + 34, 35, 36, 36, 37, 37, 39, 41, 41, 43, 45, 45, 47, 47, 47, 49, 50, 51, + 53, 53, 54, 57, 57, 58, 61, 61, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, + 38, 38, 40, 42, 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, + 60, 61, 64, 64, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, + 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64, + 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50, + 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 39, 39, 38, 38, + 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, + 58, 58, 60, 61, 62, 64, 65, 66, 69, 69, 39, 39, 38, 38, 37, 37, 38, 39, + 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, 58, 58, 60, 61, + 62, 64, 65, 66, 69, 69, 42, 41, 41, 41, 40, 40, 40, 41, 41, 41, 42, 42, + 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 61, 62, 64, 65, 66, 69, 69, 70, + 73, 73, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, + 54, 54, 56, 58, 58, 61, 63, 64, 66, 67, 68, 71, 71, 72, 75, 75, 44, 43, + 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, 48, 48, 51, 54, 54, 56, 58, + 58, 62, 64, 64, 66, 67, 68, 71, 72, 73, 76, 76, 47, 46, 45, 45, 44, 44, + 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, + 69, 70, 71, 74, 75, 76, 79, 79, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, + 46, 46, 47, 51, 51, 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 72, 75, + 76, 77, 80, 80, 49, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51, + 51, 54, 57, 57, 60, 62, 62, 66, 68, 68, 71, 72, 73, 77, 77, 78, 81, 81, + 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59, + 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 81, 83, 86, 86, 54, 52, 51, 51, + 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, + 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 55, 53, 53, 52, 51, 50, 50, 51, + 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, 76, 77, + 78, 83, 83, 85, 88, 88, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, + 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, + 92, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, + 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92, + /* Size 4x8 */ + 32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54, 35, 38, + 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83, + /* Size 8x4 */ + 32, 32, 32, 34, 35, 40, 46, 52, 32, 33, 34, 37, 38, 42, 46, 51, 37, 36, + 38, 44, 49, 55, 59, 64, 52, 49, 49, 54, 60, 69, 76, 83, + /* Size 8x16 */ + 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, + 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 34, 34, + 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, + 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 36, 34, 37, 40, 48, 54, 56, 60, + 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 44, 41, + 42, 45, 53, 63, 66, 71, 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, + 57, 67, 71, 77, 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, + 79, 87, + /* Size 16x8 */ + 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 31, 32, + 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 32, 32, 32, 33, + 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 32, 33, 33, 33, 34, 36, + 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 36, 35, 35, 35, 36, 38, 40, 42, + 48, 49, 50, 53, 56, 57, 60, 63, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, + 58, 
63, 66, 67, 71, 75, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, + 69, 71, 75, 79, 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, + 82, 87, + /* Size 16x32 */ + 32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31, 32, + 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32, 32, 32, + 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, + 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, 33, 34, + 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, + 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33, 33, 35, 35, 36, 41, 41, + 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 45, 49, + 49, 54, 32, 32, 32, 33, 34, 34, 34, 36, 36, 38, 42, 42, 45, 50, 50, 54, + 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 42, 42, 45, 49, 49, 54, 32, 32, + 33, 33, 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 32, 33, 33, + 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, + 36, 39, 40, 41, 44, 44, 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, + 42, 44, 48, 48, 50, 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, + 48, 48, 50, 54, 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, + 53, 57, 57, 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, + 60, 64, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, + 38, 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38, + 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37, 38, + 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40, 42, 42, + 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42, 43, 45, 52, + 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43, 45, 52, 54, 56, + 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47, 54, 56, 58, 66, 66, + 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55, 56, 59, 67, 67, 70, 76, + 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57, 60, 67, 67, 71, 77, 77, 81, + 53, 50, 49, 49, 49, 49, 51, 58, 59, 62, 71, 71, 74, 81, 81, 86, 53, 51, + 49, 49, 50, 50, 51, 59, 60, 63, 71, 71, 75, 82, 82, 87, 55, 52, 51, 51, + 51, 51, 53, 60, 61, 64, 72, 72, 76, 83, 83, 88, 58, 55, 54, 54, 54, 54, + 55, 62, 63, 67, 75, 75, 79, 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, + 63, 67, 75, 75, 79, 87, 87, 92, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, + 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, + 42, 42, 45, 46, 47, 50, 51, 52, 55, 55, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, + 46, 49, 49, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41, 41, 44, 45, 46, 49, 49, 51, + 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, + 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 40, 40, + 40, 42, 43, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, + 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, + 47, 48, 48, 51, 51, 53, 55, 55, 35, 35, 35, 35, 34, 34, 35, 36, 36, 37, + 38, 38, 39, 42, 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58, + 59, 60, 62, 62, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, + 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, + 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49, 
+ 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 44, 43, 42, 42, + 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, + 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 44, 43, 42, 42, 41, 41, 41, 42, + 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, + 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45, + 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, + 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, + 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 53, 52, + 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, + 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 59, 57, 56, 56, 54, 54, + 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, + 79, 80, 81, 86, 87, 88, 92, 92, + /* Size 4x16 */ + 31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49, 32, 34, + 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54, 35, 38, 49, 60, + 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71, 45, 45, 58, 75, 47, 47, + 60, 77, 51, 50, 63, 82, 55, 54, 67, 87, + /* Size 16x4 */ + 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55, 32, 32, + 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54, 38, 37, 36, 36, + 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67, 53, 51, 49, 49, 50, 49, + 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, + /* Size 8x32 */ + 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52, 31, 32, + 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, + 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49, 31, 32, 33, 33, 35, 41, + 44, 49, 32, 32, 33, 34, 36, 42, 45, 49, 32, 32, 34, 34, 36, 42, 45, 50, + 32, 32, 34, 35, 37, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, + 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, + 42, 48, 50, 54, 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, + 53, 57, 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60, + 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 39, 37, + 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69, 44, 41, 42, 45, + 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72, 47, 44, 45, 47, 56, 66, + 69, 75, 48, 45, 46, 48, 56, 67, 70, 76, 49, 46, 47, 48, 57, 67, 71, 77, + 53, 49, 49, 51, 59, 71, 74, 81, 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, + 51, 53, 61, 72, 76, 83, 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, + 63, 75, 79, 87, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36, + 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, + 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, + 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, + 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36, + 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53, + 55, 55, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, + 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, 44, 43, + 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, + 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, + 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, + 69, 70, 71, 74, 75, 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, + 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, + 82, 83, 87, 87 
}, + { /* Chroma */ + /* Size 4x4 */ + 31, 38, 47, 49, 38, 47, 46, 46, 47, 46, 54, 57, 49, 46, 57, 66, + /* Size 8x8 */ + 31, 31, 35, 42, 48, 47, 49, 51, 31, 32, 36, 42, 46, 45, 46, 48, 35, 36, + 41, 45, 47, 45, 46, 48, 42, 42, 45, 48, 50, 49, 50, 51, 48, 46, 47, 50, + 53, 53, 54, 54, 47, 45, 45, 49, 53, 57, 59, 60, 49, 46, 46, 50, 54, 59, + 61, 64, 51, 48, 48, 51, 54, 60, 64, 68, + /* Size 16x16 */ + 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 31, 31, + 31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 30, 31, 31, 32, + 35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 31, 32, 32, 33, 36, 40, + 41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 33, 34, 35, 36, 39, 43, 44, 45, + 47, 46, 46, 45, 46, 47, 47, 49, 36, 38, 39, 40, 43, 47, 47, 47, 48, 47, + 46, 45, 46, 46, 47, 48, 38, 40, 41, 41, 44, 47, 47, 48, 49, 48, 48, 47, + 47, 47, 48, 49, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, 49, 49, 50, 50, + 50, 52, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, 54, 55, + 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 48, 47, + 46, 45, 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 49, 47, 45, 45, + 45, 45, 47, 49, 53, 55, 55, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, + 47, 50, 54, 55, 56, 59, 61, 61, 63, 64, 51, 48, 47, 46, 47, 46, 47, 50, + 54, 55, 56, 60, 61, 62, 64, 66, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, + 57, 61, 63, 64, 66, 68, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, + 64, 66, 68, 71, + /* Size 32x32 */ + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 36, 36, 38, 41, 41, 45, 49, 49, + 49, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, + 31, 31, 31, 34, 34, 35, 38, 38, 39, 42, 42, 45, 48, 48, 47, 47, 47, 47, + 47, 47, 49, 49, 49, 50, 50, 51, 53, 53, 31, 31, 31, 31, 31, 31, 32, 34, + 34, 35, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 49, 50, 50, 52, 52, 31, 31, 31, 31, 31, 31, 32, 34, 34, 36, 38, 38, + 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 46, 47, 48, 48, 48, 49, 49, 50, + 52, 52, 30, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 41, 42, 42, 44, + 46, 46, 46, 46, 46, 45, 45, 45, 46, 47, 47, 48, 48, 48, 50, 50, 30, 31, + 31, 31, 31, 32, 32, 35, 35, 36, 40, 40, 41, 42, 42, 44, 46, 46, 46, 45, + 45, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 31, 31, 32, 32, 32, 32, + 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 45, 45, 45, 45, 45, + 46, 46, 46, 47, 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 35, 38, 38, 40, + 43, 43, 43, 44, 44, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, + 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 36, 38, 39, 40, 43, 43, 44, 45, + 45, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, + 34, 35, 35, 36, 36, 36, 37, 40, 40, 41, 44, 44, 45, 45, 45, 46, 47, 47, + 47, 46, 46, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 36, 38, 38, 38, + 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, + 45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 36, 38, 38, 38, 39, 40, 40, 43, + 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 45, 46, 46, + 46, 46, 47, 47, 48, 48, 38, 39, 40, 40, 41, 41, 41, 43, 44, 45, 47, 47, + 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 48, 48, + 49, 49, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, + 50, 50, 50, 49, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 41, 42, + 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, 50, 50, 50, 49, + 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 45, 45, 45, 45, 44, 44, + 44, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, + 52, 
52, 52, 52, 52, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, + 48, 48, 49, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, + 54, 54, 55, 55, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, + 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, + 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 48, 47, 47, 47, + 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, + 55, 56, 56, 56, 56, 57, 57, 58, 58, 58, 48, 47, 47, 47, 46, 45, 45, 46, + 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, + 56, 57, 57, 58, 58, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, + 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 60, + 61, 61, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, + 53, 53, 55, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, 61, 62, 62, 49, 47, + 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, + 56, 58, 58, 59, 59, 60, 60, 61, 61, 62, 63, 63, 50, 49, 48, 48, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 59, + 61, 61, 61, 63, 63, 63, 64, 64, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, + 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 62, 63, + 63, 64, 65, 65, 51, 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50, + 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 62, 62, 64, 64, 64, 66, 66, + 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54, + 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 52, 50, 50, 49, + 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, + 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 53, 51, 50, 50, 48, 48, 48, 48, + 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, 63, 64, + 64, 67, 67, 68, 69, 69, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, + 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, + 71, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53, + 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 71, + /* Size 4x8 */ + 31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50, 47, 48, + 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67, + /* Size 8x4 */ + 31, 31, 36, 42, 47, 46, 48, 50, 38, 40, 44, 47, 48, 46, 46, 48, 47, 46, + 47, 50, 53, 54, 55, 56, 50, 48, 47, 50, 54, 60, 64, 67, + /* Size 8x16 */ + 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32, + 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, 33, 36, 41, 44, + 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, + 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 49, 46, 48, 49, 53, 53, 54, 54, + 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, + 45, 47, 53, 58, 59, 61, 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, + 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, + 65, 68, + /* Size 16x8 */ + 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 31, 31, + 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 35, 37, 38, 38, + 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 38, 40, 40, 41, 44, 47, + 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 48, 47, 46, 46, 47, 47, 48, 50, + 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 45, 45, 46, 45, 47, 49, 53, 55, + 56, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59, + 61, 61, 63, 65, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, + 66, 68, + /* Size 16x32 */ + 32, 31, 31, 
31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31, 31, + 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31, 31, 32, + 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31, 32, 37, 38, + 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32, 38, 39, 40, 45, + 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38, 40, 41, 45, 46, 46, + 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40, 41, 45, 46, 46, 45, 45, + 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43, 46, 47, 46, 45, 45, 46, 47, + 47, 49, 33, 35, 36, 36, 41, 43, 44, 46, 47, 46, 46, 46, 46, 47, 47, 49, + 34, 36, 37, 37, 42, 44, 45, 47, 47, 47, 45, 45, 46, 47, 47, 49, 37, 39, + 40, 41, 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 37, 39, 40, 41, + 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, + 47, 48, 48, 48, 47, 47, 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, + 50, 50, 49, 49, 50, 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, + 49, 49, 50, 50, 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, + 52, 52, 52, 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, + 54, 55, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, + 48, 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46, + 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46, 46, + 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45, 46, 46, + 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45, 46, 47, 52, + 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46, 47, 52, 53, 55, + 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, + 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, 61, 64, + 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54, 55, 60, 60, 61, 64, 64, 66, + 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 52, 49, + 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 53, 50, 48, 48, + 48, 48, 49, 54, 54, 56, 61, 61, 63, 67, 67, 69, 54, 51, 50, 50, 49, 49, + 50, 55, 55, 57, 62, 62, 65, 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, + 55, 57, 62, 62, 65, 68, 68, 71, + /* Size 32x16 */ + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, + 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, + 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47, 46, 46, 46, + 46, 46, 47, 48, 48, 49, 49, 50, 51, 51, 31, 31, 31, 31, 32, 32, 33, 35, + 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, + 47, 48, 48, 48, 50, 50, 31, 32, 32, 32, 32, 33, 33, 36, 36, 37, 41, 41, + 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, + 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, + 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 37, 38, + 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 47, + 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, + 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, + 48, 48, 48, 48, 48, 49, 50, 50, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, + 47, 47, 48, 50, 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, + 53, 54, 55, 55, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, + 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, + 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 49, 48, 47, 47, + 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, + 58, 58, 59, 59, 60, 61, 61, 61, 62, 
62, 49, 48, 47, 47, 45, 45, 45, 45, + 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, + 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, + 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63, + 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, + 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 52, 50, + 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, + 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 54, 53, 52, 52, 50, 50, + 50, 49, 49, 49, 48, 48, 50, 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63, + 65, 65, 66, 68, 68, 69, 71, 71, + /* Size 4x16 */ + 31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48, 35, 43, + 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50, 47, 48, 53, 54, + 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61, 47, 46, 55, 63, 48, 47, + 55, 64, 49, 47, 56, 66, 51, 49, 57, 68, + /* Size 16x4 */ + 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51, 37, 38, + 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49, 48, 47, 46, 46, + 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 52, 50, 48, 48, 47, 47, + 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, + /* Size 8x32 */ + 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50, 31, 31, + 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32, 38, 40, + 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, + 46, 48, 33, 35, 41, 43, 47, 45, 46, 47, 33, 36, 41, 44, 47, 46, 46, 47, + 34, 37, 42, 45, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 37, 40, + 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, + 50, 49, 50, 50, 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, + 52, 52, 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54, + 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 48, 46, + 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60, 49, 45, 45, 47, + 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61, 50, 46, 46, 48, 54, 59, + 61, 63, 50, 46, 46, 48, 54, 59, 61, 64, 51, 47, 47, 48, 54, 60, 61, 64, + 52, 48, 47, 48, 54, 61, 63, 66, 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, + 48, 49, 54, 61, 63, 67, 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, + 55, 62, 65, 68, + /* Size 32x8 */ + 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49, + 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31, + 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, + 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, + 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, + 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47, + 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49, + 50, 50, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51, + 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 49, 48, + 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, + 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, + 61, 61, 61, 63, 63, 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, + 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, + 66, 67, 68, 68 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 35, 43, 32, 34, 37, 43, 35, 37, 48, 54, 43, 43, 54, 65, + /* Size 8x8 */ + 31, 31, 32, 32, 34, 37, 43, 47, 31, 32, 32, 32, 34, 36, 
41, 44, 32, 32, + 33, 34, 35, 38, 42, 45, 32, 32, 34, 35, 37, 39, 42, 46, 34, 34, 35, 37, + 41, 45, 49, 52, 37, 36, 38, 39, 45, 51, 56, 59, 43, 41, 42, 42, 49, 56, + 63, 67, 47, 44, 45, 46, 52, 59, 67, 71, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32, + 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 31, 32, 32, 32, + 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 31, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 31, 32, 32, 32, 33, 33, 33, 34, + 35, 36, 36, 39, 40, 42, 44, 45, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36, + 36, 39, 40, 42, 45, 45, 32, 32, 32, 32, 33, 34, 35, 36, 37, 38, 38, 40, + 41, 42, 45, 46, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, 40, 42, 43, 44, + 47, 47, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, 50, 51, + 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 36, 35, + 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 39, 38, 38, 37, + 39, 39, 40, 42, 45, 49, 50, 54, 55, 58, 60, 61, 41, 40, 39, 38, 40, 40, + 41, 43, 46, 50, 52, 55, 57, 60, 62, 63, 44, 42, 42, 41, 42, 42, 42, 44, + 47, 52, 54, 58, 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, + 56, 60, 62, 66, 69, 70, 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61, + 63, 67, 70, 71, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 34, + 35, 36, 36, 38, 39, 39, 41, 44, 44, 45, 47, 48, 48, 51, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, + 39, 39, 40, 43, 43, 44, 46, 47, 47, 50, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, + 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, + 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 34, 34, 35, 35, 36, 38, 38, 39, 42, 42, 42, 45, 45, 45, 48, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, + 34, 36, 37, 37, 38, 41, 41, 41, 44, 44, 44, 47, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, + 38, 41, 41, 41, 44, 44, 44, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39, 41, 41, 42, + 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, + 36, 36, 36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 38, + 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 39, 40, 40, 41, 42, + 42, 43, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, + 46, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 37, + 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, 46, 48, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, + 40, 41, 42, 42, 43, 44, 44, 45, 47, 47, 47, 50, 34, 34, 34, 34, 34, 33, + 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, + 46, 47, 47, 48, 50, 51, 51, 53, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, + 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48, + 50, 51, 51, 53, 34, 34, 34, 34, 
34, 34, 34, 34, 35, 35, 35, 36, 37, 37, + 38, 40, 40, 41, 43, 44, 44, 45, 46, 46, 47, 49, 49, 49, 51, 52, 52, 54, + 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43, + 46, 47, 47, 48, 49, 49, 50, 52, 52, 53, 55, 55, 55, 57, 36, 35, 35, 35, + 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, + 50, 50, 52, 54, 54, 54, 56, 57, 57, 58, 36, 35, 35, 35, 35, 34, 34, 35, + 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, 50, 50, 52, 54, + 54, 54, 56, 57, 57, 58, 38, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 39, + 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 52, 52, 54, 56, 56, 57, 58, 59, + 59, 61, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, + 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 39, 39, + 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, + 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 41, 40, 40, 40, 39, 38, + 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, 55, + 57, 60, 60, 60, 62, 63, 63, 66, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, + 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, + 66, 67, 67, 69, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, + 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, + 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49, + 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 67, 68, 68, 70, 47, 46, 45, 45, + 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, + 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 48, 47, 46, 46, 45, 44, 44, 45, + 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, + 67, 68, 70, 71, 71, 74, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45, + 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71, + 71, 74, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, 50, 53, + 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, 77, + /* Size 4x8 */ + 31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42, 34, 37, + 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67, + /* Size 8x4 */ + 31, 32, 32, 32, 34, 37, 42, 46, 32, 33, 34, 35, 37, 40, 43, 46, 35, 34, + 36, 38, 43, 49, 53, 56, 43, 41, 42, 42, 49, 56, 63, 67, + /* Size 8x16 */ + 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, + 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 33, 34, + 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 33, 34, 35, 37, 38, + 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50, + 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 39, 37, + 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, + 51, 53, 63, 66, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, + 67, 70, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32, + 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 31, 32, 32, 32, + 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 32, 32, 32, 33, 34, 34, + 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 35, 35, 34, 34, 35, 36, 37, 39, + 41, 45, 46, 48, 49, 51, 53, 54, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47, + 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58, + 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, + 69, 70, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31, 31, + 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32, 32, 32, + 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 
45, 51, 31, 32, 32, 32, 32, 32, + 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32, 32, 33, + 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 38, + 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 35, 35, 38, 41, 41, + 44, 49, 31, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 39, 42, 42, 44, 49, + 32, 32, 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32, + 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, + 33, 35, 35, 35, 37, 37, 37, 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, + 35, 36, 37, 38, 38, 41, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, + 37, 38, 38, 41, 42, 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, + 40, 42, 44, 44, 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, + 48, 48, 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, + 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, + 35, 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35, + 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34, 34, + 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36, 37, 40, + 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39, 40, 40, 42, + 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40, 40, 42, 48, 50, + 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41, 43, 49, 51, 51, 56, + 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, + 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, 66, 71, + 44, 43, 42, 42, 42, 43, 43, 45, 51, 54, 54, 59, 64, 64, 67, 72, 47, 45, + 44, 44, 44, 45, 45, 47, 53, 56, 56, 61, 66, 66, 69, 75, 48, 46, 45, 45, + 45, 46, 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 48, 46, 45, 45, 45, 46, + 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, + 56, 58, 58, 64, 69, 69, 73, 79, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, + 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, + 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, + 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, + 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, + 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, + 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, + 41, 43, 43, 43, 45, 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45, + 47, 48, 48, 50, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, + 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, + 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, + 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 36, 35, 35, 35, + 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, + 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 40, 39, 39, 39, 39, 38, 38, 38, + 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53, 54, 54, 56, 59, + 59, 59, 61, 62, 62, 64, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 
42, 42, + 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, + 67, 69, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, + 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, + 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, + 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 53, 52, 51, 51, 50, 49, + 49, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65, + 67, 71, 71, 72, 75, 76, 76, 79, + /* Size 4x16 */ + 31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41, 32, 34, + 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44, 34, 37, 42, 48, + 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58, 40, 41, 51, 60, 42, 43, + 53, 63, 45, 45, 56, 66, 46, 46, 56, 67, + /* Size 16x4 */ + 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 32, 32, + 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 36, 35, 35, 34, + 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, + 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, + /* Size 8x32 */ + 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46, 31, 32, + 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, + 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 34, + 41, 44, 31, 32, 32, 33, 34, 35, 41, 44, 31, 32, 33, 34, 35, 36, 42, 44, + 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, + 33, 35, 37, 37, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, + 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, + 48, 50, 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51, + 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 36, 34, + 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58, 39, 37, 39, 40, + 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, + 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, 44, 41, 42, 43, 51, 53, 63, 66, + 44, 42, 42, 43, 51, 54, 64, 67, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, + 45, 46, 54, 56, 67, 70, 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, + 56, 58, 69, 73, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34, + 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, + 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, + 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, + 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, + 46, 48, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41, + 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, 36, 35, + 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, + 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 44, 43, 42, 42, 41, 41, + 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, + 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, + 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, + 69, 70, 70, 73 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 37, 47, 47, 37, 44, 47, 45, 47, 47, 53, 53, 47, 45, 53, 59, + /* Size 8x8 */ + 31, 31, 34, 37, 43, 48, 47, 49, 31, 32, 35, 40, 43, 46, 45, 46, 34, 35, + 39, 43, 45, 46, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 43, 43, 45, 47, + 49, 50, 50, 50, 48, 46, 46, 47, 50, 53, 55, 55, 47, 45, 45, 45, 50, 55, + 58, 60, 49, 46, 
46, 46, 50, 55, 60, 61, + /* Size 16x16 */ + 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 31, 31, + 31, 31, 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 31, 31, 31, 31, + 34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 30, 31, 31, 32, 34, 35, + 40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 33, 34, 34, 34, 37, 38, 42, 43, + 44, 46, 47, 46, 46, 45, 46, 46, 33, 34, 35, 35, 38, 39, 43, 44, 45, 47, + 47, 46, 46, 45, 46, 46, 36, 38, 39, 40, 42, 43, 47, 47, 47, 47, 48, 46, + 46, 45, 46, 46, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, 49, 48, 47, 47, + 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, 50, 50, + 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 49, 47, + 47, 46, 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 48, 47, 46, 45, + 46, 46, 46, 48, 49, 52, 53, 54, 55, 55, 56, 56, 49, 47, 46, 45, 46, 46, + 46, 47, 49, 52, 53, 55, 55, 57, 57, 58, 49, 47, 46, 45, 45, 45, 45, 47, + 49, 52, 53, 55, 57, 58, 59, 60, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, + 54, 56, 57, 59, 61, 61, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, + 58, 60, 61, 61, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 41, 43, + 47, 49, 49, 49, 48, 48, 49, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, + 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 42, 43, 47, 48, 48, 48, + 47, 47, 47, 47, 47, 48, 49, 49, 49, 50, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 37, + 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, + 42, 43, 46, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 30, 31, + 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, + 46, 46, 45, 45, 45, 45, 45, 45, 46, 46, 46, 47, 30, 31, 31, 31, 31, 32, + 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45, + 45, 45, 45, 45, 46, 46, 46, 47, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, + 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, + 46, 46, 46, 47, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, + 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, + 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, + 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33, 34, 34, 34, + 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, 47, 47, 47, 46, + 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 35, 36, 37, 37, 37, 38, 38, 38, + 41, 41, 41, 44, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 46, 46, 46, 45, + 45, 45, 46, 46, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, + 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, + 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, + 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, 46, 46, 38, 39, + 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, + 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, + 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, + 49, 49, 49, 49, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, + 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, + 50, 50, 50, 50, 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, + 48, 49, 49, 49, 50, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, + 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 
50, + 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 49, 48, 47, 47, + 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 47, 46, 46, 46, + 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, + 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55, + 55, 56, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, + 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 48, 47, + 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, + 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 49, 47, 47, 47, 46, 45, + 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, 55, + 55, 57, 57, 57, 57, 58, 58, 58, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, + 59, 60, 60, 60, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60, + 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, + 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 60, 60, 60, 61, 50, 49, 48, 48, + 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, + 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 50, 49, 48, 48, 47, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, + 60, 60, 61, 61, 61, 63, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61, + 61, 63, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 48, 50, + 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, 64, + /* Size 4x8 */ + 31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45, 43, 47, + 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59, + /* Size 8x4 */ + 31, 31, 35, 39, 43, 47, 46, 48, 38, 40, 43, 47, 47, 47, 46, 46, 47, 46, + 47, 47, 50, 53, 53, 54, 48, 45, 46, 45, 50, 55, 58, 59, + /* Size 8x16 */ + 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, + 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 33, 35, 37, 42, + 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 37, 40, 43, 47, 47, 47, + 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50, + 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, + 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, + 51, 53, 58, 59, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, + 59, 61, + /* Size 16x8 */ + 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 31, 31, + 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 33, 34, 34, 35, + 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 37, 38, 39, 40, 42, 43, + 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 45, 45, 45, 44, 46, 46, 47, 48, + 49, 51, 52, 51, 51, 51, 52, 52, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52, + 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56, + 57, 58, 59, 59, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, + 61, 61, + /* Size 16x32 */ + 32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31, 31, + 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31, 31, 31, + 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31, 31, 34, 38, + 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32, 34, 39, 39, 40, + 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, + 
46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, 46, 45, + 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40, 41, 45, 46, 46, 45, 45, 45, + 46, 48, 33, 34, 35, 35, 37, 42, 42, 43, 46, 47, 47, 46, 45, 45, 46, 47, + 33, 35, 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 33, 35, + 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 35, 37, 38, 38, + 41, 45, 45, 46, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, + 47, 47, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 47, 47, 47, 46, 45, 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, + 48, 47, 47, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, + 49, 49, 50, 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, + 50, 50, 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, + 47, 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47, + 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46, 46, + 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46, 46, 47, + 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46, 47, 47, 48, + 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47, 47, 48, 51, 53, + 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46, 47, 51, 53, 53, 55, + 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, + 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, 59, 61, + 49, 47, 45, 45, 45, 46, 46, 47, 52, 53, 53, 56, 58, 58, 60, 62, 50, 48, + 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 63, 50, 48, 46, 46, + 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 50, 48, 46, 46, 46, 46, + 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, + 52, 54, 54, 58, 60, 60, 62, 65, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, + 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 47, + 47, 47, 47, 46, 46, 47, 48, 48, 48, 49, 31, 31, 31, 31, 32, 32, 32, 33, + 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, + 45, 45, 46, 46, 46, 47, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, + 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, + 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, + 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 37, 38, + 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, + 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, + 46, 46, 46, 46, 46, 46, 46, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, + 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, + 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, + 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, + 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, + 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 48, 47, 47, 47, + 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, 45, 45, + 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 56, + 56, 56, 57, 57, 57, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, + 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, + 59, 60, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, + 49, 50, 53, 53, 53, 55, 
56, 56, 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, + 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, + 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 52, 51, 50, 50, 49, 48, + 48, 48, 47, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, + 59, 61, 61, 62, 63, 64, 64, 65, + /* Size 4x16 */ + 31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45, 34, 42, + 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47, 42, 47, 50, 49, + 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56, 47, 46, 53, 57, 46, 46, + 53, 58, 48, 46, 54, 59, 48, 46, 54, 59, + /* Size 16x4 */ + 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48, 37, 38, + 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 48, 47, 46, 46, + 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, + 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, + /* Size 8x32 */ + 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49, 31, 31, + 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, 34, 39, + 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 30, 32, 35, 40, 44, 46, + 45, 46, 31, 33, 35, 40, 45, 46, 45, 46, 33, 35, 37, 42, 46, 47, 45, 46, + 33, 36, 38, 43, 46, 47, 46, 46, 33, 36, 38, 43, 46, 47, 46, 46, 35, 38, + 41, 45, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, + 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, + 49, 50, 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50, + 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 49, 46, + 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55, 48, 46, 46, 47, + 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, + 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 51, 53, 58, 59, + 49, 45, 45, 46, 52, 53, 58, 60, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, + 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, + 52, 54, 60, 62, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43, + 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31, + 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, + 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, + 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, + 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, + 46, 47, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49, + 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 48, 47, + 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, + 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, + 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, + 61, 61, 61, 62 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 34, 38, 32, 33, 35, 39, 34, 35, 39, 45, 38, 39, 45, 54, + /* Size 8x8 */ + 31, 31, 32, 32, 33, 34, 37, 41, 31, 32, 32, 32, 33, 34, 36, 39, 32, 32, + 32, 33, 34, 35, 37, 40, 32, 32, 33, 34, 35, 36, 38, 41, 33, 33, 34, 35, + 37, 39, 41, 44, 34, 34, 35, 36, 39, 43, 46, 49, 37, 36, 37, 38, 41, 46, + 51, 54, 41, 39, 40, 41, 44, 49, 54, 58, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 
31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 36, 36, 39, 39, 42, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, + 36, 39, 39, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, + 40, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42, + 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 34, 34, + 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 36, 35, 35, 34, + 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 36, 35, 35, 34, 34, 36, + 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, 40, + 40, 45, 45, 50, 50, 54, 54, 58, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, + 45, 50, 50, 54, 54, 58, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, + 54, 58, 58, 63, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, + 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, + 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, + 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, + 34, 34, 35, 35, 35, 36, 38, 38, 38, 39, 41, 41, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, + 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, + 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 37, 38, 38, 38, 40, 41, 41, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, + 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, + 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, + 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, + 34, 35, 36, 36, 36, 36, 37, 37, 37, 38, 40, 40, 40, 41, 42, 42, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, + 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, + 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, + 40, 41, 42, 42, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, + 36, 36, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 42, 44, 45, 45, + 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, + 39, 39, 39, 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, + 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, + 42, 42, 42, 44, 45, 
45, 45, 46, 47, 47, 34, 34, 34, 34, 34, 34, 33, 33, + 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 42, 42, 42, 44, + 45, 45, 45, 46, 47, 47, 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, + 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, 45, 46, 47, 47, 47, 49, + 50, 50, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, + 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, + 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, + 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, 35, 35, 35, 35, + 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, + 48, 49, 50, 50, 50, 52, 54, 54, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, + 38, 38, 38, 38, 39, 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 52, 52, + 52, 54, 56, 56, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, + 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, + 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, + 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38, 38, + 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, + 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 41, 41, 40, 40, 40, 39, 39, 39, + 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, 52, 54, + 56, 56, 56, 58, 60, 60, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, + 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, + 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, + 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, + /* Size 4x8 */ + 31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40, 33, 34, + 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56, + /* Size 8x4 */ + 31, 32, 32, 32, 33, 34, 37, 40, 32, 32, 33, 33, 34, 36, 38, 40, 34, 34, + 34, 36, 38, 41, 44, 46, 39, 38, 38, 40, 42, 47, 52, 56, + /* Size 8x16 */ + 32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, + 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, + 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, + 36, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, + 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 36, 34, + 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, + 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, + 53, 63, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32, + 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 31, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 32, 32, 32, 33, 33, 34, + 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 32, 32, 32, 33, 33, 34, 34, 35, + 35, 37, 37, 38, 38, 40, 40, 43, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, + 42, 48, 48, 50, 50, 53, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, + 48, 50, 50, 53, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, + 58, 63, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, + 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, + 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37, + 41, 41, 31, 32, 32, 32, 32, 
32, 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, + 31, 32, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 38, 41, 41, 32, 32, + 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, + 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, + 34, 36, 37, 37, 37, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, + 38, 38, 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, + 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, + 42, 42, 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, + 34, 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, + 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34, 34, + 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34, 34, 36, + 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34, 36, 38, 38, + 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, + 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, 48, 48, + 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39, 39, 44, 49, 49, 49, 52, + 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, + 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 39, 38, + 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 41, 40, 39, 39, + 39, 40, 42, 42, 42, 46, 52, 52, 52, 56, 60, 60, 44, 42, 41, 41, 41, 42, + 43, 43, 43, 48, 53, 53, 53, 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, + 43, 48, 53, 53, 53, 58, 63, 63, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, + 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, + 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, + 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, + 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, + 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, + 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, + 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, + 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48, 36, 35, 35, 35, + 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, + 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, + 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, + 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, + 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, + 53, 53, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, + 40, 42, 45, 45, 45, 47, 51, 51, 51, 52, 54, 54, 54, 56, 58, 58, 44, 43, + 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, + 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, 44, 
43, 42, 42, 42, 41, + 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, + 54, 56, 58, 58, 58, 60, 63, 63, + /* Size 4x16 */ + 31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37, 32, 32, + 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40, 32, 34, 37, 40, + 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51, 35, 36, 43, 51, 38, 39, + 45, 54, 38, 39, 45, 54, 42, 42, 48, 58, + /* Size 16x4 */ + 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 34, 34, 34, 33, + 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48, 39, 38, 38, 37, 37, 39, + 39, 40, 40, 45, 45, 51, 51, 54, 54, 58, + /* Size 8x32 */ + 32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43, 31, 32, + 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, + 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41, 31, 32, 32, 33, 33, 34, + 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, + 31, 32, 32, 33, 33, 35, 35, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, + 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, + 34, 37, 37, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, + 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45, + 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, + 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50, 36, 34, 34, 38, + 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, + 48, 54, 37, 36, 36, 39, 39, 49, 49, 56, 39, 37, 37, 40, 40, 50, 50, 58, + 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, + 39, 42, 42, 52, 52, 60, 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, + 43, 53, 53, 63, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, + 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, + 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, + 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, + 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 36, 35, + 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, + 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, + 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, + 48, 49, 50, 50, 50, 52, 53, 53, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, + 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, + 58, 60, 63, 63 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 34, 42, 47, 34, 39, 45, 46, 42, 45, 48, 49, 47, 46, 49, 54, + /* Size 8x8 */ + 31, 31, 32, 35, 39, 45, 48, 48, 31, 31, 33, 37, 41, 44, 46, 46, 32, 33, + 35, 39, 42, 45, 46, 45, 35, 37, 39, 43, 45, 47, 47, 46, 39, 41, 42, 45, + 47, 48, 48, 47, 45, 44, 45, 47, 48, 50, 51, 51, 48, 46, 46, 47, 48, 51, + 53, 54, 48, 46, 45, 46, 47, 51, 54, 56, + /* Size 16x16 */ + 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 31, 31, + 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 31, 31, 31, 31, + 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 30, 31, 31, 32, 32, 35, + 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 30, 31, 31, 32, 32, 35, 35, 40, + 40, 42, 42, 46, 
46, 45, 45, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, + 45, 47, 47, 46, 46, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, + 47, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, + 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45, + 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 41, 42, + 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 47, 47, 46, + 46, 47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 49, 47, 47, 46, 46, 47, + 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, + 46, 49, 49, 53, 53, 54, 54, 55, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, + 49, 53, 53, 54, 54, 55, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, + 53, 55, 55, 58, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 36, 36, 36, 39, + 41, 41, 41, 45, 49, 49, 49, 49, 48, 48, 48, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, + 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, + 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 33, 35, 35, 35, 37, 39, 39, 39, 41, 42, 42, + 42, 44, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 30, 31, 31, 31, 31, 31, + 32, 32, 32, 33, 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, + 46, 46, 45, 45, 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, + 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, + 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37, + 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 35, 37, 37, 37, 39, 41, 41, 41, 42, + 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 33, 34, 34, 34, + 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, + 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, + 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, + 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, + 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, + 45, 45, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, + 45, 45, 46, 46, 46, 47, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 36, 37, + 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, + 47, 47, 48, 48, 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, + 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, + 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46, + 46, 46, 45, 45, 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, + 47, 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, + 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, + 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, + 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, + 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, 42, 42, 42, 42, + 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50, + 49, 49, 49, 49, 49, 49, 45, 45, 45, 45, 45, 44, 44, 44, 
44, 45, 46, 46, + 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, + 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, + 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47, + 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, + 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, + 53, 54, 54, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, + 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, + 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, + 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47, 47, + 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, + 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 49, 48, 47, 47, 47, 46, 45, 45, + 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, 53, 54, + 55, 55, 55, 56, 57, 57, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, + 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58, + /* Size 4x8 */ + 31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46, 40, 44, + 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55, + /* Size 8x4 */ + 31, 31, 33, 36, 40, 45, 47, 47, 34, 35, 37, 41, 44, 46, 47, 46, 42, 42, + 44, 46, 48, 49, 50, 49, 48, 46, 46, 46, 48, 51, 54, 55, + /* Size 8x16 */ + 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, + 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, + 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, + 47, 46, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, + 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 49, 46, + 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, + 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, + 53, 58, + /* Size 16x8 */ + 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 31, 31, + 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 31, 31, 31, 32, + 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 37, 38, 38, 40, 40, 43, + 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 37, 38, 38, 40, 40, 43, 43, 47, + 47, 47, 47, 48, 48, 47, 47, 46, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, + 50, 53, 53, 53, 53, 53, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, + 53, 53, 53, 53, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, + 56, 58, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31, 31, + 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31, 31, 31, + 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, + 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, 38, 38, + 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32, 35, 39, 39, 39, 42, + 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, + 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, + 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, + 32, 33, 34, 34, 34, 37, 41, 41, 41, 44, 46, 46, 46, 46, 45, 45, 33, 34, + 36, 36, 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 
36, 36, + 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, + 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, + 45, 46, 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, + 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, + 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, + 45, 45, 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, + 42, 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, + 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43, 43, + 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44, 44, 46, + 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46, 47, 48, 48, + 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, + 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, 53, 53, + 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47, 47, 50, 53, 53, 53, 54, + 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, + 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, + 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, 45, 45, + 45, 46, 46, 46, 46, 49, 53, 53, 53, 55, 57, 57, 49, 47, 45, 45, 45, 45, + 46, 46, 46, 49, 53, 53, 53, 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, + 46, 49, 53, 53, 53, 56, 58, 58, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, + 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, + 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, + 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, + 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, + 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, + 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, + 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 37, 37, 38, 38, 38, 39, + 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, + 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, + 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, + 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, + 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48, + 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 48, 47, 47, 47, + 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, + 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, + 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, + 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, + 46, 48, 50, 50, 50, 51, 53, 53, 53, 54, 54, 54, 54, 55, 56, 56, 49, 48, + 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, + 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, 49, 48, 47, 47, 47, 46, + 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, + 53, 54, 56, 56, 56, 57, 58, 58, + /* Size 4x16 */ + 31, 33, 42, 48, 31, 34, 42, 47, 31, 
34, 42, 47, 31, 35, 42, 45, 31, 35, + 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46, 38, 43, 47, 46, + 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53, 48, 47, 50, 53, 47, 46, + 50, 54, 47, 46, 50, 54, 47, 45, 49, 56, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47, 33, 34, + 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 42, 42, 42, 42, + 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 48, 47, 47, 45, 45, 46, + 46, 46, 46, 50, 50, 53, 53, 54, 54, 56, + /* Size 8x32 */ + 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48, 31, 31, + 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, + 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46, 30, 32, 32, 40, 40, 46, + 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, + 32, 34, 34, 41, 41, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, + 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, + 45, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, + 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47, + 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, + 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51, 49, 46, 46, 48, + 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, + 53, 53, 48, 46, 46, 47, 47, 53, 53, 54, 48, 46, 46, 47, 47, 53, 53, 56, + 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, + 45, 46, 46, 53, 53, 57, 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, + 46, 53, 53, 58, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39, + 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, + 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, + 46, 46, 46, 45, 45, 45, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, + 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, + 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 48, 47, + 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, + 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, + 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, + 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, + 56, 57, 58, 58 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 32, 35, 32, 32, 33, 35, 32, 33, 35, 38, 35, 35, 38, 46, + /* Size 8x8 */ + 31, 31, 31, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, + 32, 32, 32, 33, 33, 34, 32, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 34, + 35, 35, 36, 38, 32, 33, 33, 34, 35, 36, 38, 40, 34, 34, 33, 35, 36, 38, + 39, 42, 35, 35, 34, 36, 38, 40, 42, 48, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, + 35, 35, 36, 36, 31, 32, 32, 32, 32, 32, 33, 33, 
33, 34, 34, 35, 35, 36, + 36, 36, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, 37, 37, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 33, 33, 33, 33, + 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 34, 34, 34, 34, 33, 33, + 35, 35, 36, 37, 37, 39, 39, 41, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, + 36, 37, 37, 40, 41, 42, 45, 45, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, + 38, 42, 42, 45, 48, 48, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, + 42, 45, 48, 48, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, + 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 35, 35, 35, 35, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, + 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 36, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, + 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, + 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, + 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, + 37, 38, 38, 38, 38, 39, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 40, 40, + 40, 41, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, + 35, 36, 36, 
36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42, 34, 34, + 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, + 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, + 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, + 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, + 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, + 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, + 36, 36, 36, 37, 37, 37, 37, 38, 40, 41, 41, 41, 42, 44, 45, 45, 45, 45, + 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, + 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 47, 47, 47, 48, 36, 35, 35, 35, + 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, + 42, 42, 42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, + 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, + 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, + 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48, + 48, 49, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, 37, 37, + 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, 50, + /* Size 4x8 */ + 31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36, 32, 33, + 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48, + /* Size 8x4 */ + 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, 32, 32, + 33, 34, 35, 36, 37, 38, 35, 35, 34, 36, 38, 40, 42, 48, + /* Size 8x16 */ + 32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, + 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, + 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 34, 34, + 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, + 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 33, 33, + 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, + 37, 37, 43, 44, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, + 46, 48, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 34, 34, + 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, + 35, 36, 37, 37, 38, 38, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40, + 41, 43, 46, 46, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, + 48, 48, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, + 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 34, 34, 35, 36, 
36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, + 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, + 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, + 37, 37, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, + 32, 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, + 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32, 33, + 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33, 33, 33, + 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33, 33, 35, 36, + 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, + 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, + 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, 41, 42, + 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37, 37, 37, 40, 43, 44, 44, 44, + 35, 35, 34, 34, 34, 34, 36, 37, 38, 38, 38, 41, 45, 47, 47, 47, 36, 35, + 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, + 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, + 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, + 39, 39, 39, 42, 46, 49, 49, 49, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, + 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, + 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, + 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, + 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, + 40, 41, 42, 42, 42, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, + 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46, + 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, + 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, + 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, + 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, + 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, + 42, 42, 44, 47, 48, 48, 48, 49, + /* Size 4x16 */ + 31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35, 32, 32, + 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36, 32, 32, 34, 37, + 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41, 34, 34, 37, 42, 
34, 34, + 37, 44, 35, 34, 38, 48, 35, 34, 38, 48, + /* Size 16x4 */ + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 32, 32, 32, 32, + 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 36, 35, 35, 35, 34, 34, + 36, 36, 37, 38, 38, 41, 42, 44, 48, 48, + /* Size 8x32 */ + 32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35, 31, 32, + 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, + 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, + 34, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, + 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, + 32, 33, 33, 33, 35, 35, 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, + 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, + 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38, + 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, + 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40, 33, 33, 33, 35, + 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, + 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44, + 35, 34, 34, 36, 38, 38, 45, 47, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, + 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, + 39, 39, 46, 49, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, + 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, + 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 35, 35, 35, 35, 35, 35, + 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, + 41, 41, 43, 45, 46, 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, + 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, + 48, 48, 48, 49 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 32, 38, 46, 32, 34, 41, 46, 38, 41, 47, 47, 46, 46, 47, 52, + /* Size 8x8 */ + 31, 31, 30, 34, 36, 39, 42, 48, 31, 31, 31, 34, 37, 40, 42, 47, 30, 31, + 32, 35, 39, 41, 42, 46, 34, 34, 35, 39, 42, 44, 45, 47, 36, 37, 39, 42, + 46, 47, 47, 47, 39, 40, 41, 44, 47, 47, 48, 49, 42, 42, 42, 45, 47, 48, + 48, 50, 48, 47, 46, 47, 47, 49, 50, 53, + /* Size 16x16 */ + 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 31, 31, + 31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, 31, 31, + 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 31, 31, 31, 31, 31, 31, + 34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 30, 31, 31, 31, 32, 32, 34, 35, + 37, 40, 40, 42, 42, 44, 46, 46, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, + 40, 42, 42, 44, 46, 46, 33, 33, 34, 34, 34, 34, 37, 38, 40, 42, 42, 44, + 44, 45, 47, 47, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, 43, 44, 45, 46, + 47, 47, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, 47, 47, + 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 36, 38, + 38, 39, 
40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 40, 41, 41, 41, + 42, 42, 44, 44, 45, 47, 47, 48, 48, 49, 50, 50, 41, 42, 42, 42, 42, 42, + 44, 45, 46, 47, 47, 48, 48, 49, 50, 50, 44, 44, 44, 44, 44, 44, 45, 46, + 46, 47, 47, 49, 49, 50, 51, 51, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, + 48, 50, 50, 51, 53, 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, + 50, 51, 53, 53, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 34, 36, + 36, 36, 36, 38, 40, 41, 41, 41, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 35, 36, 37, 37, 37, 39, + 41, 42, 42, 42, 44, 47, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 33, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42, + 44, 46, 48, 48, 48, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, + 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, + 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, + 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 33, 34, 35, 35, 35, 36, 38, 39, 39, 39, 40, 41, 42, + 42, 42, 44, 46, 47, 47, 47, 47, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 41, 42, 42, 42, 42, 44, 46, + 46, 46, 46, 46, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, + 35, 35, 37, 39, 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, + 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, + 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30, 30, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, 40, 40, 40, 41, + 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 31, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 34, 36, 37, 37, 37, 38, 40, 41, 41, 41, 42, 43, 43, 43, 43, + 44, 46, 46, 46, 46, 46, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, + 37, 38, 38, 38, 40, 41, 42, 42, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, + 47, 46, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, + 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, + 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, + 45, 45, 46, 47, 47, 47, 47, 47, 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, + 37, 38, 40, 40, 40, 40, 42, 44, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, + 47, 47, 47, 47, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, + 42, 42, 44, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, + 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, 38, 38, 39, 39, + 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 48, 48, 48, 47, 38, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, + 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 49, 49, + 49, 48, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, + 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49, 41, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, + 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 
41, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, + 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, + 50, 50, 50, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, + 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 51, 51, 51, 51, + 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47, + 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 52, 52, 52, 52, 49, 48, 48, 47, + 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, + 50, 50, 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, + 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, + 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, + 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53, + 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, + 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, + /* Size 4x8 */ + 31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47, 37, 39, + 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53, + /* Size 8x4 */ + 31, 31, 31, 34, 37, 39, 42, 48, 31, 31, 32, 36, 39, 41, 43, 46, 37, 38, + 40, 43, 46, 47, 47, 48, 48, 47, 46, 47, 47, 48, 50, 53, + /* Size 8x16 */ + 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, + 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, 30, 32, 32, 35, + 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 33, 34, 35, 37, 42, 42, + 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 41, 42, + 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, + 47, 47, 50, 51, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, + 52, 53, + /* Size 16x8 */ + 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 31, 31, + 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 31, 31, 31, 32, + 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 33, 34, 34, 34, 35, 35, + 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 37, 38, 38, 39, 40, 40, 42, 43, + 44, 47, 47, 47, 47, 47, 48, 48, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, + 47, 47, 47, 47, 48, 48, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49, + 49, 50, 52, 52, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, + 53, 53, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31, 31, + 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31, 31, 31, + 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, + 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, + 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, 38, 38, + 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32, 34, 37, 39, 39, 39, 41, + 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34, 38, 39, 39, 39, 42, 44, 46, + 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, + 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, + 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 31, 32, 33, 33, + 33, 33, 36, 39, 41, 41, 41, 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, + 37, 40, 42, 42, 42, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, + 43, 43, 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, + 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, + 46, 47, 47, 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 
44, 45, 46, 47, + 47, 47, 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, + 37, 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, + 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, + 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, + 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42, 42, 44, 46, + 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, + 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, + 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, 49, 50, + 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47, 47, 47, 49, 50, 51, 51, 51, + 47, 46, 46, 46, 46, 46, 46, 47, 48, 48, 48, 49, 51, 52, 52, 52, 49, 48, + 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, + 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, + 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, + 47, 47, 47, 49, 52, 53, 53, 53, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, + 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 39, + 41, 42, 42, 42, 44, 46, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42, + 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, + 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, + 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, + 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, + 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, + 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, + 44, 44, 45, 46, 47, 47, 47, 47, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, + 38, 39, 40, 41, 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47, + 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, + 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, + 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, + 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 40, 41, 41, 41, 41, 41, 41, 42, + 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, + 49, 49, 50, 50, 50, 49, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, + 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52, + 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, + 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, + 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, + 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47, + 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, + 50, 50, 51, 52, 53, 53, 53, 53, + /* Size 4x16 */ + 31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46, 31, 32, + 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47, 36, 37, 44, 47, + 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49, 42, 43, 47, 50, 44, 44, + 47, 51, 48, 46, 48, 53, 48, 46, 48, 53, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, + 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 
43, 44, 46, 46, 37, 38, 38, 39, + 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 46, 46, 46, + 47, 47, 47, 47, 47, 49, 50, 51, 53, 53, + /* Size 8x32 */ + 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48, 31, 31, + 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, + 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, + 45, 46, 30, 31, 32, 34, 39, 39, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, + 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 31, 33, + 33, 36, 41, 41, 45, 46, 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, + 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, + 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47, + 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, + 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48, 41, 42, 42, 44, + 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, + 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51, + 47, 46, 46, 46, 48, 48, 51, 52, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, + 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, + 47, 47, 52, 53, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36, + 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, + 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, + 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, + 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47, + 47, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, + 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, + 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, + 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 45, 45, 45, 45, 45, 45, + 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, + 49, 49, 50, 51, 52, 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, + 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, + 53, 53, 53, 53 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 33, 34, 35, + /* Size 8x8 */ + 31, 31, 31, 31, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, + 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, + 33, 33, 34, 35, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 34, 34, + 35, 36, 33, 33, 33, 33, 35, 35, 36, 38, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 35, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 31, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 35, 35, 35, 36, 
37, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 34, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 35, 36, 36, 36, 38, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, + 37, 37, 38, 39, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 34, 34, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, + 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 
32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, + 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, + 35, 35, 36, 36, 36, 36, 36, 37, 38, 38, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, + 36, 36, 37, 38, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, + 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, + 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, + 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, + /* Size 4x8 */ + 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, + 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36, + /* Size 8x4 */ + 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 33, 34, 34, 35, 32, 32, 32, 33, 34, 34, 35, 36, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, + 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, + 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, + 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, + 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, + 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, + 37, 38, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, + 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, + 36, 38, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, + 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, + 32, 32, 
32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, + 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, + 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, + 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, + 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 36, 38, 33, 33, 33, 33, + 33, 33, 33, 34, 34, 35, 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, + 34, 34, 35, 36, 37, 37, 37, 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, + 35, 36, 37, 37, 37, 37, 38, 39, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, + 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, + 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, + 37, 37, 37, 37, 38, 38, 39, 39, + /* Size 4x16 */ + 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33, 32, 32, 33, 34, + 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35, 32, 33, 34, 35, 32, 33, + 34, 35, 33, 33, 35, 36, 34, 34, 36, 37, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36, 32, 32, 32, 32, 32, 33, + 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, + /* Size 8x32 */ + 32, 31, 
31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33, 31, 31, + 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, + 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, + 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, + 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, + 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, + 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33, + 33, 34, 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, + 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, + 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, + 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36, 32, 32, 33, 33, 34, 35, + 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, + 32, 32, 33, 33, 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, + 33, 33, 34, 36, 36, 37, 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, + 35, 37, 37, 38, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, + 36, 37, 38, 38 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 31, 34, 38, 31, 32, 35, 40, 34, 35, 39, 43, 38, 40, 43, 47, + /* Size 8x8 */ + 31, 31, 31, 30, 34, 35, 37, 40, 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, + 31, 31, 35, 36, 39, 41, 30, 31, 31, 32, 35, 36, 40, 42, 34, 34, 35, 35, + 39, 40, 43, 44, 35, 35, 36, 36, 40, 41, 44, 45, 37, 38, 39, 40, 43, 44, + 47, 47, 40, 41, 41, 42, 44, 45, 47, 48, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 35, 35, 37, 39, 39, 40, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, + 35, 38, 40, 40, 41, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, + 40, 40, 41, 42, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, + 41, 43, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, 43, 44, + 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 33, 34, + 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 35, 36, 37, 37, + 37, 38, 38, 38, 41, 41, 41, 44, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40, + 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, 36, 37, 38, 38, 39, 40, 40, 40, + 42, 43, 43, 46, 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, + 44, 46, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 
+ 47, 47, 48, 48, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, + 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 41, 41, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 32, 33, 34, 34, 34, 34, 35, + 36, 37, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37, + 37, 38, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 34, 34, 34, 34, 34, 35, 36, 38, 38, 38, 38, 38, 40, 41, + 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, + 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, + 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, + 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 38, 39, + 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, + 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, + 35, 35, 35, 36, 37, 39, 39, 39, 39, 40, 40, 41, 42, 42, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, + 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, + 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, + 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, + 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, + 36, 37, 38, 40, 40, 40, 40, 41, 41, 42, 43, 43, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 38, 39, 41, + 41, 41, 41, 42, 42, 43, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 42, 42, 43, + 43, 44, 44, 44, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, + 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, + 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, + 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, + 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, + 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, 34, 34, 34, 34, + 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, + 43, 43, 44, 44, 45, 45, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, + 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 44, 44, 44, 44, 44, 45, 45, + 45, 45, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, + 41, 41, 41, 41, 41, 42, 44, 45, 46, 46, 46, 46, 46, 46, 46, 46, 36, 37, + 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 41, 42, 43, 43, 43, + 43, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, + 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, + 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, + 40, 41, 42, 43, 43, 43, 43, 44, 46, 
47, 47, 47, 47, 47, 47, 47, 47, 47, + 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, + 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, + 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, + 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 40, 40, 40, 41, 41, 41, 41, 41, + 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 47, 47, + 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, + 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, + 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48, + /* Size 4x8 */ + 31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40, 34, 36, + 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47, + /* Size 8x4 */ + 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, 32, 32, 36, 37, 40, 42, 35, 36, + 37, 37, 40, 42, 45, 45, 37, 38, 39, 40, 43, 44, 47, 47, + /* Size 8x16 */ + 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39, 31, 31, + 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, + 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, + 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 33, 34, 35, 35, 37, 42, 42, 43, + 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 35, 37, + 38, 38, 41, 45, 45, 46, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, + 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, + 47, 48, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 31, 31, + 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 31, 31, 31, 31, + 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 31, 31, 31, 31, 32, 32, + 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34, 34, 35, 35, 35, + 37, 38, 38, 41, 43, 43, 43, 44, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, + 43, 45, 47, 47, 47, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, + 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, + 47, 48, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31, 31, 31, + 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, + 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, + 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, + 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, + 40, 42, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 39, 39, 39, 39, 40, 42, + 30, 31, 31, 32, 32, 32, 32, 32, 34, 37, 39, 39, 39, 39, 40, 42, 30, 31, + 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, + 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, + 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, + 35, 37, 40, 40, 40, 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, + 40, 40, 40, 40, 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, + 41, 41, 42, 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, + 43, 44, 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, + 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, + 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35, 35, + 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37, 37, 37, + 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 
36, 37, 38, 38, 38, 38, 39, + 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39, 39, 40, 42, 44, + 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, + 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, + 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, + 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, 39, 39, + 40, 41, 41, 41, 41, 42, 43, 45, 47, 47, 47, 47, 47, 48, 40, 41, 41, 42, + 42, 42, 42, 42, 44, 45, 47, 47, 47, 47, 47, 48, 42, 42, 42, 43, 43, 43, + 43, 43, 44, 46, 47, 47, 47, 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, + 44, 46, 47, 47, 47, 47, 48, 48, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, + 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, + 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, + 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38, 39, 40, 40, 40, 40, 41, 42, + 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, + 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, + 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, + 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41, + 42, 42, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, + 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40, + 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46, 37, 37, 38, 38, + 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, + 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, + 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, + 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, + 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, + 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, + 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, + 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, + 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47, + 47, 47, 47, 47, 48, 48, 48, 48, + /* Size 4x16 */ + 31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38, 31, 32, + 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40, 33, 35, 40, 42, + 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45, 38, 40, 45, 47, 38, 40, + 45, 47, 39, 41, 45, 47, 42, 43, 46, 47, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42, 31, 31, + 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 35, 35, 36, 36, + 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46, 37, 38, 38, 38, 39, 40, + 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, + /* Size 8x32 */ + 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39, 31, 31, + 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, + 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 
40, 31, 31, 31, 31, 34, 38, + 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, 34, 39, 39, 40, + 30, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, + 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, + 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, + 41, 42, 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44, + 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, + 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45, 35, 37, 38, 38, + 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47, 37, 39, 40, 40, 43, 47, + 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, + 37, 39, 40, 40, 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, + 42, 42, 44, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, + 44, 47, 47, 48, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33, + 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, + 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, + 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, + 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, + 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, 37, 37, + 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, + 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, + 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, + 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, + 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, + 47, 47, 48, 48 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, + /* Size 8x8 */ + 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 
31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 
32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + /* Size 4x8 */ + 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, + /* Size 8x4 */ + 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, + 33, 34, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 34, 34, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 
+ 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, + 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 33, 34, 34, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 34, 34, 34, 34, + /* Size 4x16 */ + 31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, + 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, + 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, + 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, + 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, + 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, + 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, + 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, + 32, 32, 33, 34, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 34 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 32, 35, 34, 35, 35, 39, + /* Size 8x8 */ + 31, 31, 31, 31, 30, 31, 33, 33, 31, 31, 31, 31, 31, 32, 34, 34, 31, 31, + 31, 31, 31, 32, 34, 34, 31, 31, 31, 31, 31, 32, 35, 35, 30, 31, 31, 31, + 32, 32, 35, 35, 31, 32, 32, 32, 32, 33, 36, 36, 33, 34, 34, 35, 35, 36, + 39, 39, 33, 34, 34, 35, 35, 36, 39, 39, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, + 34, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, + 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, + 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 30, 30, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 34, 36, 37, 37, 37, 33, 33, 33, 34, 34, 34, + 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 33, 34, 34, 34, 34, 34, 35, 35, + 35, 35, 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, + 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, + 38, 39, 39, 39, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31, + 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 
31, 30, 30, 30, 30, 30, 30, 31, 32, 32, 33, 34, + 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34, + 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34, 34, 35, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, + 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, + 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 35, + 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, + 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, + 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, + 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, + 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 36, 36, 36, 36, 36, + 36, 37, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 32, 32, + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, + 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 33, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, + 37, 38, 38, 38, 38, 38, 38, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, + 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, + 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, + 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 
35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, + 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, + 39, 40, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, + 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, + /* Size 4x8 */ + 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, + 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40, + /* Size 8x4 */ + 31, 31, 31, 31, 31, 31, 34, 34, 31, 31, 31, 32, 32, 33, 36, 36, 31, 31, + 31, 32, 32, 33, 36, 36, 34, 35, 35, 36, 36, 37, 40, 40, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, + 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, + 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, + 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 31, 32, + 33, 33, 33, 33, 36, 39, 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, + 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, + 38, 41, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 33, 35, 36, 36, 36, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, + 37, 38, 38, 38, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41, + 41, 41, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, + 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, + 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 33, 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 33, 34, 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 33, 34, 36, 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, + 35, 36, 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, + 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, + 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32, 32, 33, + 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33, 33, 33, 33, + 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34, 34, 34, 34, 34, + 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, + 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, + 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, + 33, 34, 34, 35, 35, 36, 36, 36, 
36, 36, 36, 37, 38, 40, 41, 43, 33, 34, + 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, + 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, + 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, + 36, 36, 36, 38, 39, 40, 42, 44, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, + 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, + 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, + 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, + 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, + 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 32, 32, 32, 32, 32, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 37, + 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38, + 38, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, 35, 35, + 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, + 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, 37, 37, 37, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, + 42, 43, 43, 43, 43, 43, 43, 44, + /* Size 4x16 */ + 31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 31, + 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, + 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37, 33, 35, 35, 39, 34, 36, + 36, 40, 34, 36, 36, 40, 34, 36, 36, 40, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 34, 34, 35, 35, 35, 35, + 36, 36, 36, 36, 36, 37, 39, 40, 40, 40, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35, 31, 31, + 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, + 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, + 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, + 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, + 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, + 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, + 35, 38, 30, 31, 32, 32, 32, 32, 
35, 38, 30, 31, 32, 32, 32, 32, 35, 38, + 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, + 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38, 31, 32, 33, 33, + 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40, 33, 34, 34, 35, 35, 35, + 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, + 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, + 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, + 36, 36, 39, 42, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, + 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, + 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 33, 33, 33, 33, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, + 37, 38, 38, 38, 38, 38, 38, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, + 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, + 41, 41, 41, 42 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + /* Size 8x8 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x8 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + /* Size 8x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 
32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, + 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 
31, 32, 32, + 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + /* Size 8x8 */ + 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, + /* Size 16x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, + /* Size 32x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, + 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + /* Size 4x8 */ + 31, 
31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32, + /* Size 8x4 */ + 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, + /* Size 8x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, + 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 
31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, + 32, 32, 31, 31, 32, 32, 30, 31, 32, 32, + /* Size 16x4 */ + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, + /* Size 8x32 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, + 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, + 30, 31, 31, 31, 
31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, + 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, + 32, 32, 32, 32 }, + }, +}; + +static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { + { + { /* Luma */ + /* Size 4x4 */ + 32, 24, 14, 11, 24, 15, 11, 9, 14, 11, 7, 7, 11, 9, 7, 5, + /* Size 8x8 */ + 32, 32, 27, 20, 15, 12, 11, 9, 32, 29, 26, 21, 16, 13, 12, 10, 27, 26, + 19, 16, 13, 11, 10, 10, 20, 21, 16, 12, 11, 9, 9, 8, 15, 16, 13, 11, 9, + 8, 7, 7, 12, 13, 11, 9, 8, 7, 6, 6, 11, 12, 10, 9, 7, 6, 6, 5, 9, 10, + 10, 8, 7, 6, 5, 5, + /* Size 16x16 */ + 32, 33, 33, 30, 28, 23, 21, 17, 16, 13, 12, 11, 11, 10, 9, 9, 33, 32, + 32, 31, 30, 25, 23, 19, 17, 14, 14, 12, 11, 11, 10, 9, 33, 32, 31, 29, + 28, 24, 23, 19, 17, 14, 14, 13, 12, 11, 10, 10, 30, 31, 29, 26, 24, 22, + 20, 18, 16, 14, 13, 13, 12, 11, 11, 10, 28, 30, 28, 24, 21, 19, 18, 16, + 15, 13, 13, 12, 11, 11, 10, 10, 23, 25, 24, 22, 19, 16, 15, 14, 13, 11, + 11, 11, 10, 10, 9, 9, 21, 23, 23, 20, 18, 15, 14, 13, 12, 11, 10, 10, 9, + 9, 9, 9, 17, 19, 19, 18, 16, 14, 13, 11, 10, 9, 9, 9, 9, 8, 8, 8, 16, + 17, 17, 16, 15, 13, 12, 10, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, + 11, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 7, + 7, 7, 7, 7, 6, 6, 11, 12, 13, 13, 12, 11, 10, 9, 8, 7, 7, 6, 6, 6, 6, 6, + 11, 11, 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 10, 11, 11, 11, + 11, 10, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 8, 7, + 6, 6, 5, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, 5, 5, 4, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 32, 30, 29, 28, 26, 23, 22, 21, 19, 17, 17, 16, 14, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32, + 32, 30, 30, 29, 27, 24, 23, 22, 20, 18, 17, 17, 15, 13, 13, 13, 12, 12, + 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, 32, 31, 30, 30, 28, + 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, 12, 11, 11, 11, 10, + 10, 9, 9, 9, 33, 32, 32, 32, 31, 31, 30, 29, 29, 27, 25, 24, 23, 21, 19, + 18, 17, 16, 14, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, + 32, 32, 31, 31, 30, 29, 28, 28, 26, 24, 23, 23, 20, 19, 18, 17, 16, 14, + 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 32, 32, 32, 31, 30, + 29, 28, 28, 27, 26, 24, 23, 22, 21, 19, 19, 18, 16, 15, 15, 14, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, + 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, + 11, 11, 
10, 10, 9, 29, 30, 30, 29, 28, 28, 25, 24, 23, 22, 20, 20, 19, + 18, 17, 16, 16, 15, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 26, 27, 28, + 27, 26, 26, 23, 22, 20, 19, 18, 17, 17, 16, 15, 14, 14, 13, 12, 12, 12, + 11, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 23, 24, 25, 25, 24, 24, 22, + 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 10, + 10, 10, 10, 9, 9, 9, 9, 22, 23, 24, 24, 23, 23, 21, 20, 19, 17, 16, 15, + 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 8, 21, 22, 23, 23, 23, 22, 20, 19, 18, 17, 15, 15, 14, 13, 13, 12, 12, + 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 19, 20, 21, 21, 20, + 21, 19, 18, 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, + 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, + 13, 12, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 17, + 17, 18, 18, 18, 19, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 9, + 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 16, 17, 17, 17, 17, 18, 16, 16, + 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 7, 7, 7, 14, 15, 16, 16, 16, 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, + 9, 9, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, + 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 13, 13, 12, 11, 11, 11, 10, + 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 13, 14, 14, + 14, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 6, 6, 6, 6, 6, 12, 12, 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, + 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, + 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, + 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, + 12, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, + 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 11, 11, 10, 10, 10, 9, + 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 10, 11, 11, + 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, + 5, 5, 5, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 9, + 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, + 10, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, + 5, 5, 5, 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, + 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 4, 9, 9, 9, 10, 10, 10, 10, + 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, + 4, 4, 8, 9, 9, 9, 9, 9, 9, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, + 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4, + /* Size 4x8 */ + 32, 24, 14, 11, 31, 24, 15, 12, 28, 18, 12, 11, 21, 14, 10, 9, 16, 12, + 8, 8, 13, 11, 7, 7, 11, 10, 7, 6, 10, 9, 7, 5, + /* Size 8x4 */ + 32, 31, 28, 21, 16, 13, 11, 10, 24, 24, 18, 14, 12, 11, 10, 9, 14, 15, + 12, 10, 8, 7, 7, 7, 11, 12, 11, 9, 8, 7, 6, 5, + /* Size 8x16 */ + 32, 32, 28, 19, 16, 12, 11, 10, 33, 31, 30, 21, 17, 13, 12, 11, 32, 30, + 28, 20, 17, 13, 12, 12, 30, 28, 24, 19, 16, 13, 13, 12, 28, 27, 21, 17, + 15, 12, 12, 11, 23, 24, 19, 14, 13, 11, 11, 11, 21, 22, 18, 13, 12, 10, + 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 16, 18, 15, 11, 10, 8, 8, 8, 13, + 15, 13, 10, 9, 7, 8, 8, 12, 14, 13, 10, 8, 7, 7, 7, 11, 13, 
12, 10, 8, + 7, 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 6, 9, 10, + 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, + /* Size 16x8 */ + 32, 33, 32, 30, 28, 23, 21, 18, 16, 13, 12, 11, 11, 10, 9, 9, 32, 31, + 30, 28, 27, 24, 22, 19, 18, 15, 14, 13, 12, 11, 10, 10, 28, 30, 28, 24, + 21, 19, 18, 16, 15, 13, 13, 12, 11, 10, 10, 10, 19, 21, 20, 19, 17, 14, + 13, 12, 11, 10, 10, 10, 10, 9, 9, 9, 16, 17, 17, 16, 15, 13, 12, 10, 10, + 9, 8, 8, 8, 8, 7, 8, 12, 13, 13, 13, 12, 11, 10, 9, 8, 7, 7, 7, 7, 7, 6, + 7, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 12, + 12, 11, 11, 10, 9, 8, 8, 7, 6, 6, 6, 5, 5, + /* Size 16x32 */ + 32, 33, 32, 30, 28, 23, 19, 17, 16, 13, 12, 11, 11, 11, 10, 10, 33, 32, + 32, 30, 29, 24, 20, 18, 17, 14, 12, 12, 12, 11, 11, 11, 33, 32, 31, 31, + 30, 25, 21, 19, 17, 14, 13, 12, 12, 11, 11, 11, 33, 32, 31, 30, 29, 25, + 21, 19, 17, 14, 13, 13, 12, 12, 11, 11, 32, 32, 30, 29, 28, 24, 20, 19, + 17, 14, 13, 13, 12, 12, 12, 11, 32, 31, 29, 28, 27, 24, 21, 19, 18, 15, + 14, 13, 12, 12, 12, 11, 30, 30, 28, 26, 24, 21, 19, 18, 16, 14, 13, 13, + 13, 12, 12, 11, 29, 30, 28, 25, 23, 20, 18, 17, 16, 13, 12, 12, 12, 12, + 12, 11, 28, 30, 27, 24, 21, 19, 17, 16, 15, 13, 12, 12, 12, 12, 11, 11, + 26, 28, 26, 23, 20, 18, 16, 15, 14, 12, 12, 12, 11, 11, 11, 11, 23, 25, + 24, 21, 19, 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 11, 22, 24, 23, 21, + 19, 16, 14, 13, 12, 11, 10, 10, 10, 10, 10, 10, 21, 23, 22, 20, 18, 15, + 13, 13, 12, 11, 10, 10, 10, 10, 10, 10, 19, 21, 20, 19, 17, 14, 12, 12, + 11, 10, 9, 10, 10, 9, 10, 9, 18, 19, 19, 18, 16, 14, 12, 11, 10, 9, 9, + 9, 9, 9, 9, 9, 17, 18, 18, 17, 16, 13, 12, 11, 10, 9, 9, 9, 9, 9, 9, 9, + 16, 17, 18, 16, 15, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 15, + 14, 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 10, 9, + 9, 8, 7, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 11, 10, 9, 9, 8, 7, 7, 7, 7, + 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, + 13, 12, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 10, 10, 9, + 8, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 6, 6, 6, + 6, 6, 11, 12, 12, 12, 11, 10, 10, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 12, + 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, + 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, + 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 10, 10, 10, + 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 8, 7, 7, + 6, 6, 5, 5, 5, 8, 9, 9, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, + /* Size 32x16 */ + 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32, + 31, 30, 30, 30, 28, 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, + 12, 12, 11, 11, 11, 10, 10, 9, 9, 32, 32, 31, 31, 30, 29, 28, 28, 27, + 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, 23, 21, 21, 20, + 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, + 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 23, 24, 25, + 25, 24, 24, 21, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 19, 18, + 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, + 9, 9, 9, 9, 9, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, 13, 
12, + 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 9, 8, 8, 8, 8, 16, 17, 17, + 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, + 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 13, 14, 14, 14, 14, 15, 14, 13, 13, 12, + 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 12, 12, 13, 13, 13, 14, 13, 12, 12, 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 11, 12, 12, 13, 13, 13, 13, 12, 12, + 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, + 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, + 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, + 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, + 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, + 5, 5, 5, 5, + /* Size 4x16 */ + 33, 23, 13, 11, 32, 25, 14, 11, 32, 24, 14, 12, 30, 21, 14, 12, 30, 19, + 13, 12, 25, 16, 11, 11, 23, 15, 11, 10, 19, 14, 9, 9, 17, 13, 9, 8, 14, + 11, 8, 8, 14, 11, 8, 7, 12, 10, 7, 6, 12, 10, 7, 6, 11, 10, 7, 6, 10, 9, + 7, 5, 9, 9, 7, 5, + /* Size 16x4 */ + 33, 32, 32, 30, 30, 25, 23, 19, 17, 14, 14, 12, 12, 11, 10, 9, 23, 25, + 24, 21, 19, 16, 15, 14, 13, 11, 11, 10, 10, 10, 9, 9, 13, 14, 14, 14, + 13, 11, 11, 9, 9, 8, 8, 7, 7, 7, 7, 7, 11, 11, 12, 12, 12, 11, 10, 9, 8, + 8, 7, 6, 6, 6, 5, 5, + /* Size 8x32 */ + 32, 32, 28, 19, 16, 12, 11, 10, 33, 32, 29, 20, 17, 12, 12, 11, 33, 31, + 30, 21, 17, 13, 12, 11, 33, 31, 29, 21, 17, 13, 12, 11, 32, 30, 28, 20, + 17, 13, 12, 12, 32, 29, 27, 21, 18, 14, 12, 12, 30, 28, 24, 19, 16, 13, + 13, 12, 29, 28, 23, 18, 16, 12, 12, 12, 28, 27, 21, 17, 15, 12, 12, 11, + 26, 26, 20, 16, 14, 12, 11, 11, 23, 24, 19, 14, 13, 11, 11, 11, 22, 23, + 19, 14, 12, 10, 10, 10, 21, 22, 18, 13, 12, 10, 10, 10, 19, 20, 17, 12, + 11, 9, 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 17, 18, 16, 12, 10, 9, 9, 9, + 16, 18, 15, 11, 10, 8, 8, 8, 14, 16, 14, 11, 9, 8, 8, 8, 13, 15, 13, 10, + 9, 7, 8, 8, 13, 14, 13, 10, 9, 7, 7, 7, 12, 14, 13, 10, 8, 7, 7, 7, 12, + 13, 12, 9, 8, 7, 7, 7, 11, 13, 12, 10, 8, 7, 6, 6, 11, 12, 11, 10, 8, 7, + 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, 10, + 9, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, 7, 6, 6, 5, 9, + 10, 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 8, 9, 10, 9, 8, 7, 6, + 5, + /* Size 32x8 */ + 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 32, 32, 31, 31, 30, + 29, 28, 28, 27, 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 28, 29, 30, 29, 28, 27, 24, 23, 21, + 20, 19, 19, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, + 10, 10, 10, 10, 10, 19, 20, 21, 21, 20, 21, 19, 18, 17, 16, 14, 14, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 16, + 17, 17, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, + 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 12, 12, 13, 13, 13, 14, 13, 12, 12, + 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, + 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, + 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 12, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, + 5, 5 }, + { /* Chroma */ + /* Size 4x4 */ + 29, 22, 18, 16, 22, 17, 15, 14, 18, 15, 11, 11, 
16, 14, 11, 9, + /* Size 8x8 */ + 33, 27, 22, 20, 18, 16, 15, 14, 27, 22, 22, 22, 20, 18, 17, 15, 22, 22, + 19, 18, 17, 16, 15, 15, 20, 22, 18, 16, 14, 13, 14, 14, 18, 20, 17, 14, + 12, 12, 12, 12, 16, 18, 16, 13, 12, 11, 11, 11, 15, 17, 15, 14, 12, 11, + 10, 10, 14, 15, 15, 14, 12, 11, 10, 9, + /* Size 16x16 */ + 32, 34, 31, 25, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 34, 32, + 29, 24, 22, 23, 22, 21, 20, 18, 18, 17, 16, 15, 15, 14, 31, 29, 26, 23, + 22, 23, 22, 21, 20, 18, 18, 17, 17, 16, 16, 15, 25, 24, 23, 21, 20, 21, + 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 21, 22, 22, 20, 19, 19, 19, 19, + 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 23, 21, 19, 18, 17, 17, 16, 15, + 15, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, 17, 16, 15, 14, 14, 14, + 14, 14, 14, 14, 19, 21, 21, 20, 19, 17, 16, 14, 14, 13, 13, 13, 13, 13, + 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, + 16, 18, 18, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 18, + 18, 18, 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, + 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, 10, 15, 16, 17, 17, 16, 15, + 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, + 12, 12, 11, 10, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 12, 12, + 11, 10, 10, 9, 9, 9, 13, 14, 15, 15, 16, 15, 14, 13, 12, 12, 11, 10, 10, + 9, 9, 9, + /* Size 32x32 */ + 32, 33, 34, 32, 31, 28, 25, 23, 21, 21, 21, 20, 20, 20, 19, 18, 18, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 33, 31, + 30, 27, 24, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 34, 33, 32, 31, 29, 26, 24, 23, + 22, 23, 23, 23, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 15, 14, 14, 14, 32, 31, 31, 29, 28, 25, 24, 23, 22, 22, 23, 22, + 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, + 15, 15, 31, 30, 29, 28, 26, 24, 23, 22, 22, 22, 23, 22, 22, 22, 21, 20, + 20, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 28, 27, + 26, 25, 24, 22, 22, 22, 21, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, + 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 25, 24, 24, 24, 23, 22, + 21, 21, 20, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, + 17, 17, 17, 16, 16, 16, 15, 15, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, + 20, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, + 16, 16, 16, 16, 21, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, + 21, 22, 23, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, + 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 21, 22, 23, 23, + 23, 23, 21, 20, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 22, 23, 22, 22, 22, 21, 20, + 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15, + 14, 14, 14, 14, 14, 14, 20, 21, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, + 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 20, 20, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, + 15, 14, 14, 14, 14, 13, 14, 14, 13, 14, 14, 13, 14, 14, 13, 13, 19, 20, + 21, 21, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 20, 20, 20, 21, + 20, 19, 18, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, + 13, 13, 13, 13, 13, 13, 13, 12, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, + 16, 16, 15, 15, 14, 14, 13, 
13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 17, 18, 19, 19, 19, 20, 19, 18, 18, 17, 16, 15, 15, 14, + 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, + 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 18, 18, 18, 19, 18, 17, + 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11, + 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 12, + 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 15, 16, + 16, 17, 17, 17, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, + 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 16, 16, 15, + 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 16, 16, 15, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 10, 14, + 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, + 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 9, 9, 9, 9, 9, 9, 14, 14, 14, 15, 15, 15, 16, 16, 16, 15, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 9, 9, 13, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 13, 14, 14, + 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, + 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + /* Size 4x8 */ + 33, 22, 17, 16, 26, 23, 19, 17, 22, 18, 16, 16, 21, 17, 14, 14, 19, 16, + 12, 12, 17, 15, 11, 11, 16, 15, 11, 10, 15, 14, 12, 10, + /* Size 8x4 */ + 33, 26, 22, 21, 19, 17, 16, 15, 22, 23, 18, 17, 16, 15, 15, 14, 17, 19, + 16, 14, 12, 11, 11, 12, 16, 17, 16, 14, 12, 11, 10, 10, + /* Size 8x16 */ + 32, 28, 21, 20, 18, 16, 15, 14, 34, 26, 22, 21, 20, 17, 16, 16, 31, 24, + 22, 22, 20, 17, 17, 16, 24, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, + 18, 17, 17, 17, 21, 22, 19, 17, 16, 15, 16, 16, 20, 22, 19, 16, 15, 14, + 14, 15, 19, 21, 19, 15, 14, 13, 13, 14, 18, 20, 18, 15, 13, 12, 13, 13, + 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, 17, 14, 12, 11, 11, 12, 15, 17, + 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, + 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 13, 15, 15, 14, 12, 11, + 10, 9, + /* Size 16x8 */ + 32, 34, 31, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 28, 26, + 24, 22, 21, 22, 22, 21, 20, 19, 18, 17, 17, 16, 15, 15, 21, 22, 22, 20, + 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 16, 15, 20, 21, 22, 20, 19, 17, + 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 18, 20, 20, 19, 18, 16, 15, 14, + 13, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 17, 17, 15, 14, 13, 12, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 14, 13, 13, 12, 11, 10, + 10, 10, 10, 10, 14, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, + 10, 9, + /* Size 16x32 */ + 32, 33, 28, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 14, 14, 33, 33, + 27, 24, 22, 22, 20, 20, 19, 17, 16, 16, 16, 16, 15, 15, 34, 32, 26, 
24, + 22, 23, 21, 20, 20, 18, 17, 17, 16, 16, 16, 15, 32, 30, 25, 23, 22, 23, + 21, 21, 20, 18, 17, 17, 17, 16, 16, 16, 31, 28, 24, 23, 22, 22, 22, 21, + 20, 18, 17, 17, 17, 17, 16, 16, 28, 26, 22, 22, 22, 23, 22, 21, 20, 19, + 18, 18, 17, 17, 17, 16, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 17, 18, + 17, 17, 17, 16, 23, 23, 22, 21, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, + 17, 17, 21, 22, 21, 20, 19, 19, 19, 19, 18, 17, 17, 16, 17, 16, 17, 17, + 21, 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 16, 21, 23, + 22, 21, 19, 18, 17, 17, 16, 15, 15, 15, 16, 16, 16, 16, 21, 22, 22, 21, + 19, 17, 17, 16, 16, 15, 14, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, + 16, 16, 15, 14, 14, 14, 14, 15, 15, 15, 20, 21, 22, 20, 19, 17, 16, 15, + 14, 14, 13, 14, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 15, 14, 14, 13, + 13, 13, 13, 14, 14, 14, 19, 20, 21, 20, 18, 16, 15, 14, 14, 13, 12, 13, + 13, 13, 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 13, 13, + 13, 13, 17, 19, 20, 19, 18, 16, 14, 14, 13, 12, 12, 12, 12, 12, 13, 13, + 16, 18, 19, 18, 17, 15, 14, 13, 12, 12, 11, 12, 12, 12, 12, 13, 16, 18, + 19, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 17, 18, 18, + 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 12, 12, 15, 17, 18, 17, 16, 15, + 13, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 14, 14, 13, + 12, 11, 11, 11, 10, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 12, 12, + 11, 10, 10, 10, 11, 11, 15, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, + 10, 10, 10, 11, 14, 16, 16, 17, 15, 15, 14, 13, 12, 11, 11, 10, 10, 10, + 10, 10, 14, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, + 14, 16, 16, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, + 15, 16, 16, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, + 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 16, 15, 14, + 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 15, 15, 14, 14, 13, + 13, 11, 11, 10, 10, 9, 9, 9, + /* Size 32x16 */ + 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 32, 30, + 28, 26, 24, 23, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 28, 27, 26, 25, 24, 22, 22, 22, + 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 15, 15, 15, 15, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, + 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, + 16, 15, 21, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 16, 16, 16, 16, 15, 15, 21, 22, + 23, 23, 22, 23, 21, 20, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, + 15, 15, 14, 15, 15, 15, 15, 15, 15, 14, 14, 14, 20, 20, 21, 21, 22, 22, + 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 19, 20, 20, 21, 21, 21, 20, 19, 19, 17, + 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, + 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, + 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 12, 12, 11, 12, 12, 12, 12, 12, 11, 16, 16, 17, 17, + 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 11, 10, + 10, 10, 11, 11, 11, 10, 
15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, + 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, + 16, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, + 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16, + 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, + 10, 10, 10, 10, 9, 9, 9, + /* Size 4x16 */ + 33, 21, 16, 15, 32, 23, 18, 16, 28, 22, 18, 17, 24, 21, 18, 17, 22, 19, + 17, 16, 23, 18, 15, 16, 22, 17, 14, 15, 20, 17, 13, 14, 20, 16, 12, 13, + 18, 15, 12, 12, 17, 15, 11, 11, 17, 14, 11, 11, 16, 15, 12, 10, 16, 15, + 12, 10, 15, 15, 12, 10, 15, 14, 12, 10, + /* Size 16x4 */ + 33, 32, 28, 24, 22, 23, 22, 20, 20, 18, 17, 17, 16, 16, 15, 15, 21, 23, + 22, 21, 19, 18, 17, 17, 16, 15, 15, 14, 15, 15, 15, 14, 16, 18, 18, 18, + 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 15, 16, 17, 17, 16, 16, + 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, + /* Size 8x32 */ + 32, 28, 21, 20, 18, 16, 15, 14, 33, 27, 22, 20, 19, 16, 16, 15, 34, 26, + 22, 21, 20, 17, 16, 16, 32, 25, 22, 21, 20, 17, 17, 16, 31, 24, 22, 22, + 20, 17, 17, 16, 28, 22, 22, 22, 20, 18, 17, 17, 24, 22, 20, 20, 19, 17, + 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17, + 21, 22, 19, 18, 17, 16, 16, 16, 21, 22, 19, 17, 16, 15, 16, 16, 21, 22, + 19, 17, 16, 14, 15, 15, 20, 22, 19, 16, 15, 14, 14, 15, 20, 22, 19, 16, + 14, 13, 14, 14, 19, 21, 19, 15, 14, 13, 13, 14, 19, 21, 18, 15, 14, 12, + 13, 13, 18, 20, 18, 15, 13, 12, 13, 13, 17, 20, 18, 14, 13, 12, 12, 13, + 16, 19, 17, 14, 12, 11, 12, 12, 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, + 17, 14, 12, 11, 11, 12, 15, 18, 16, 13, 12, 11, 11, 11, 15, 17, 16, 14, + 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, + 10, 10, 14, 16, 15, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, + 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 14, 15, + 16, 14, 12, 11, 10, 9, 13, 15, 15, 14, 12, 11, 10, 9, 13, 15, 15, 14, + 13, 11, 10, 9, + /* Size 32x8 */ + 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 28, 27, 26, 25, + 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 22, 22, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, + 16, 16, 16, 16, 15, 15, 20, 20, 21, 21, 22, 22, 20, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 16, 16, + 17, 17, 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, + 17, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, + 10, 9, 9, 9 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 25, 15, 11, 25, 16, 12, 10, 15, 12, 8, 7, 11, 10, 7, 6, + /* Size 8x8 */ + 32, 32, 28, 22, 17, 13, 11, 10, 32, 29, 26, 22, 18, 14, 12, 11, 28, 26, + 20, 17, 14, 12, 11, 10, 22, 22, 17, 14, 12, 10, 10, 9, 17, 18, 14, 12, + 10, 8, 8, 8, 13, 14, 12, 10, 8, 7, 7, 7, 11, 12, 11, 10, 8, 7, 
6, 6, 10, + 11, 10, 9, 8, 7, 6, 5, + /* Size 16x16 */ + 32, 33, 33, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 33, 32, + 32, 31, 30, 28, 23, 20, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, 31, 30, + 28, 26, 23, 20, 18, 16, 14, 13, 12, 12, 11, 10, 32, 31, 30, 28, 26, 24, + 22, 20, 18, 16, 14, 13, 13, 12, 11, 10, 28, 30, 28, 26, 21, 20, 18, 17, + 16, 14, 13, 12, 12, 11, 11, 10, 26, 28, 26, 24, 20, 19, 17, 16, 15, 13, + 12, 12, 11, 11, 10, 10, 22, 23, 23, 22, 18, 17, 15, 14, 13, 12, 11, 10, + 10, 10, 9, 9, 19, 20, 20, 20, 17, 16, 14, 12, 12, 11, 10, 9, 9, 9, 9, 8, + 17, 18, 18, 18, 16, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, + 16, 14, 13, 12, 11, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, + 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 10, 9, 9, 8, 7, 7, + 7, 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, + 11, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, 10, 10, 11, 11, 11, + 10, 9, 9, 8, 8, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 10, 9, 8, 8, 7, 7, + 6, 6, 5, 5, 5, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 17, 16, + 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, + 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 18, 17, 15, 14, 13, 13, 12, + 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 32, 31, 31, 30, + 28, 28, 25, 23, 22, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, + 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 31, 31, 30, 29, 28, 27, 25, 23, + 23, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, + 10, 33, 32, 32, 32, 31, 30, 30, 29, 28, 27, 26, 24, 23, 22, 20, 19, 18, + 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 32, 32, 32, + 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 21, 19, 19, 18, 16, 16, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, + 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, + 22, 20, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, + 11, 11, 10, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, + 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 27, + 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 15, 14, 13, + 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 26, 27, 28, 27, 26, + 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, + 11, 11, 11, 11, 10, 10, 10, 10, 10, 23, 24, 25, 25, 24, 24, 23, 22, 19, + 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, + 21, 22, 22, 23, 22, 22, 21, 20, 18, 17, 17, 15, 14, 14, 13, 13, 12, 12, + 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 20, 21, 20, + 21, 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, + 9, 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, + 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, + 17, 18, 18, 18, 18, 19, 18, 17, 16, 15, 15, 13, 13, 12, 12, 11, 11, 10, + 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, 16, 16, 15, 14, 13, 13, 12, 12, 11, + 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 13, 14, 15, 15, + 15, 16, 15, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 
+ 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 14, 13, 12, 12, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, + 13, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 12, 13, 13, 13, 14, 13, 13, 12, 12, + 12, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 7, 6, 6, 6, 6, 6, + 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 13, 13, 12, + 12, 11, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, + 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 10, 11, 11, 12, 12, 12, 12, + 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + 5, 5, 5, 10, 11, 11, 11, 11, 11, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, + 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, + 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, + 5, 5, 5, 9, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, + 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, 10, 10, + 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, + 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, + 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, + /* Size 4x8 */ + 32, 24, 15, 12, 31, 24, 16, 12, 28, 18, 13, 12, 22, 15, 11, 10, 17, 13, + 9, 8, 14, 11, 8, 7, 12, 11, 8, 6, 10, 10, 8, 6, + /* Size 8x4 */ + 32, 31, 28, 22, 17, 14, 12, 10, 24, 24, 18, 15, 13, 11, 11, 10, 15, 16, + 13, 11, 9, 8, 8, 8, 12, 12, 12, 10, 8, 7, 6, 6, + /* Size 8x16 */ + 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 32, 30, + 28, 23, 17, 14, 13, 12, 32, 29, 26, 22, 17, 14, 13, 12, 28, 28, 21, 18, + 15, 13, 12, 12, 26, 26, 20, 17, 14, 12, 11, 11, 22, 23, 18, 15, 12, 11, + 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 17, 18, 16, 13, 10, 9, 9, 9, 14, + 16, 14, 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, + 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, + 11, 9, 8, 7, 6, 6, 9, 10, 10, 9, 8, 7, 6, 5, + /* Size 16x8 */ + 32, 33, 32, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 32, 32, + 30, 29, 28, 26, 23, 20, 18, 16, 15, 13, 12, 12, 11, 10, 28, 29, 28, 26, + 21, 20, 18, 17, 16, 14, 13, 12, 12, 11, 11, 10, 22, 23, 23, 22, 18, 17, + 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 17, 15, 14, 12, 11, 10, + 9, 9, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, + 7, 7, 11, 12, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 6, 11, 11, 12, + 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, + /* Size 16x32 */ + 32, 33, 32, 32, 28, 23, 22, 19, 16, 14, 13, 12, 11, 11, 11, 10, 33, 32, + 32, 31, 29, 24, 23, 20, 17, 15, 14, 12, 12, 12, 11, 11, 33, 32, 32, 31, + 29, 25, 23, 21, 17, 15, 14, 13, 12, 12, 11, 11, 33, 32, 31, 31, 29, 25, + 23, 21, 17, 16, 14, 13, 12, 12, 12, 11, 32, 32, 30, 30, 28, 24, 23, 20, + 17, 16, 14, 13, 13, 12, 12, 11, 32, 31, 29, 28, 27, 24, 23, 21, 18, 16, + 15, 13, 13, 12, 12, 12, 32, 31, 29, 28, 26, 23, 22, 20, 17, 16, 14, 13, + 13, 13, 12, 12, 30, 30, 28, 27, 24, 21, 20, 19, 16, 15, 14, 13, 12, 13, + 12, 12, 28, 30, 28, 26, 21, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, + 27, 28, 26, 25, 21, 18, 18, 16, 14, 13, 13, 12, 12, 12, 11, 11, 26, 28, + 26, 24, 20, 18, 17, 16, 14, 13, 12, 11, 11, 11, 11, 11, 23, 25, 24, 23, + 19, 16, 16, 14, 13, 12, 11, 11, 11, 11, 11, 10, 22, 23, 23, 22, 18, 16, + 15, 14, 12, 11, 11, 10, 10, 10, 10, 10, 21, 22, 22, 21, 
18, 15, 14, 13, + 12, 11, 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 14, 14, 12, 11, 10, + 10, 9, 9, 10, 9, 10, 18, 19, 19, 19, 16, 14, 13, 12, 10, 10, 9, 9, 9, 9, + 9, 9, 17, 18, 18, 18, 16, 13, 13, 12, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, + 17, 17, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 12, + 12, 11, 9, 9, 8, 8, 8, 8, 8, 8, 13, 15, 15, 15, 13, 12, 11, 10, 9, 8, 8, + 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 11, 10, 9, 8, 8, 7, 7, 7, 7, 8, + 12, 14, 14, 14, 13, 11, 11, 10, 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, + 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 10, 9, 8, + 8, 7, 7, 7, 7, 7, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, + 6, 11, 12, 12, 12, 11, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, 10, 12, 12, + 12, 11, 11, 9, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 9, 9, + 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, + 6, 6, 10, 10, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 9, 10, 10, + 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 9, 9, 8, 8, + 7, 7, 6, 6, 5, 5, 5, + /* Size 32x16 */ + 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16, + 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 33, 32, 32, 32, + 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 18, 17, 16, 15, 14, 14, + 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 32, 32, 32, 31, 30, 29, 29, 28, + 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, + 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23, + 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 28, 29, 29, 29, 28, 27, 26, 24, 21, 21, 20, 19, 18, 18, 17, 16, + 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 23, 24, + 25, 25, 24, 24, 23, 21, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, + 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, + 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, + 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, + 8, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, + 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, + 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 7, 7, 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 11, 11, + 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, + 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, + 7, 7, 7, 6, 6, 7, 7, 6, 6, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, + 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, + 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, 12, 12, 12, 12, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, + 5, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, + 8, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, + /* Size 4x16 */ + 33, 23, 14, 11, 32, 25, 15, 12, 32, 24, 16, 12, 31, 23, 16, 13, 30, 19, + 14, 12, 28, 18, 13, 11, 23, 16, 11, 10, 21, 14, 10, 10, 18, 13, 10, 9, + 16, 12, 9, 8, 14, 11, 8, 7, 13, 11, 8, 7, 12, 11, 8, 6, 12, 11, 8, 6, + 11, 10, 8, 6, 10, 9, 7, 6, + /* Size 16x4 */ + 33, 32, 32, 31, 30, 28, 23, 21, 18, 16, 14, 13, 12, 12, 11, 10, 23, 25, + 24, 23, 19, 18, 16, 14, 13, 12, 11, 11, 11, 11, 10, 9, 14, 15, 16, 16, + 14, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 7, 11, 12, 12, 13, 12, 11, 10, 10, + 9, 
8, 7, 7, 6, 6, 6, 6, + /* Size 8x32 */ + 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 33, 32, + 29, 23, 17, 14, 12, 11, 33, 31, 29, 23, 17, 14, 12, 12, 32, 30, 28, 23, + 17, 14, 13, 12, 32, 29, 27, 23, 18, 15, 13, 12, 32, 29, 26, 22, 17, 14, + 13, 12, 30, 28, 24, 20, 16, 14, 12, 12, 28, 28, 21, 18, 15, 13, 12, 12, + 27, 26, 21, 18, 14, 13, 12, 11, 26, 26, 20, 17, 14, 12, 11, 11, 23, 24, + 19, 16, 13, 11, 11, 11, 22, 23, 18, 15, 12, 11, 10, 10, 21, 22, 18, 14, + 12, 11, 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 18, 19, 16, 13, 10, 9, 9, + 9, 17, 18, 16, 13, 10, 9, 9, 9, 16, 17, 15, 12, 10, 9, 8, 8, 14, 16, 14, + 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, + 12, 14, 13, 11, 8, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 7, 12, 13, 12, 10, + 8, 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 11, 12, 11, 10, 9, 7, 6, 6, 10, + 12, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, + 6, 6, 10, 11, 11, 9, 8, 7, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, + 8, 7, 6, 5, + /* Size 32x8 */ + 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16, + 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 32, 32, 32, 31, + 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 28, 29, 29, 29, 28, 27, 26, 24, + 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 11, 10, 10, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, + 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 9, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, + 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, + 14, 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, + 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, + 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, + 6, 6, 6, 6, 5, 5, 5 }, + { /* Chroma */ + /* Size 4x4 */ + 31, 23, 18, 16, 23, 18, 16, 15, 18, 16, 12, 12, 16, 15, 12, 10, + /* Size 8x8 */ + 33, 27, 22, 21, 19, 17, 16, 15, 27, 22, 22, 22, 20, 19, 17, 16, 22, 22, + 19, 19, 18, 16, 16, 16, 21, 22, 19, 17, 15, 14, 14, 14, 19, 20, 18, 15, + 13, 12, 12, 12, 17, 19, 16, 14, 12, 11, 11, 11, 16, 17, 16, 14, 12, 11, + 10, 10, 15, 16, 16, 14, 12, 11, 10, 9, + /* Size 16x16 */ + 32, 34, 31, 27, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 34, 33, + 29, 25, 22, 22, 22, 21, 20, 19, 18, 17, 16, 16, 15, 15, 31, 29, 26, 23, + 22, 22, 22, 22, 20, 19, 18, 18, 17, 17, 16, 15, 27, 25, 23, 22, 21, 21, + 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 21, 22, 22, 21, 19, 19, 19, 19, + 18, 18, 17, 17, 17, 16, 16, 16, 21, 22, 22, 21, 19, 19, 18, 18, 17, 17, + 16, 16, 15, 16, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, 15, 14, + 14, 14, 14, 14, 20, 21, 22, 21, 19, 18, 16, 16, 15, 14, 14, 13, 14, 13, + 13, 13, 19, 20, 20, 20, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, + 17, 19, 19, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, + 18, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 12, 11, 16, 17, 18, 18, + 17, 16, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 15, + 14, 14, 13, 12, 11, 11, 10, 10, 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, + 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, 13, 12, + 12, 11, 10, 10, 10, 9, 14, 15, 15, 16, 16, 15, 14, 13, 13, 12, 11, 11, + 10, 10, 9, 9, + /* Size 32x32 */ + 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 19, 
18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 33, 32, + 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 17, 17, 17, + 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 34, 33, 33, 32, 29, 26, 25, 24, + 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, + 16, 15, 15, 15, 15, 14, 33, 32, 32, 31, 28, 26, 25, 24, 22, 22, 23, 23, + 22, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, + 15, 15, 31, 30, 29, 28, 26, 24, 23, 23, 22, 22, 22, 23, 22, 22, 22, 21, + 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 28, 27, + 26, 26, 24, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 27, 26, 25, 25, 23, 22, + 22, 21, 21, 21, 21, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, + 21, 21, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, + 17, 16, 16, 16, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 17, 17, 16, 16, 16, 16, 16, 16, + 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, + 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, 22, 23, + 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 15, 16, 16, 15, 15, 15, 15, 15, 21, 22, 23, 23, 23, 23, 22, 21, + 19, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, + 21, 22, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, + 14, 14, 13, 13, 14, 13, 13, 14, 13, 13, 13, 14, 19, 20, 20, 21, 21, 21, + 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 19, 19, 20, 20, 20, 21, 20, 20, 18, 18, + 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 17, 18, 19, 19, 19, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 17, 18, 18, + 19, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, + 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, + 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + 11, 11, 12, 11, 11, 12, 16, 17, 17, 18, 18, 19, 18, 18, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 16, 16, 17, 17, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, + 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, + 17, 17, 17, 18, 18, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 11, 11, 11, 10, 10, 11, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17, + 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, + 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 14, 14, + 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 14, 15, 15, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 
12, + 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, 15, 16, + 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 14, 15, 15, 16, 16, 16, 16, 16, + 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 9, 9, 9, 14, 15, 15, 15, 15, 16, 16, 16, 16, 15, 15, 15, 14, + 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 14, 14, 14, 15, 15, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, + 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + /* Size 4x8 */ + 33, 22, 18, 16, 26, 23, 20, 17, 22, 19, 17, 16, 22, 17, 15, 14, 20, 16, + 13, 13, 17, 15, 12, 11, 16, 16, 12, 10, 16, 15, 12, 10, + /* Size 8x4 */ + 33, 26, 22, 22, 20, 17, 16, 16, 22, 23, 19, 17, 16, 15, 16, 15, 18, 20, + 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 11, 10, 10, + /* Size 8x16 */ + 32, 29, 21, 20, 18, 16, 15, 15, 34, 27, 22, 22, 20, 18, 16, 16, 31, 25, + 22, 22, 20, 18, 17, 16, 26, 22, 21, 22, 20, 19, 18, 17, 21, 21, 19, 19, + 18, 17, 17, 17, 21, 22, 19, 18, 17, 16, 16, 16, 20, 22, 19, 17, 16, 15, + 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 18, 16, 14, 13, 13, 13, + 17, 19, 18, 15, 13, 12, 12, 12, 16, 19, 17, 15, 12, 12, 11, 12, 16, 18, + 17, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, 11, 11, 15, 17, 16, 14, + 13, 12, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 13, 12, + 10, 10, + /* Size 16x8 */ + 32, 34, 31, 26, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 29, 27, + 25, 22, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 16, 15, 21, 22, 22, 21, + 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 19, 18, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 18, 20, 20, 20, 18, 17, 16, 14, + 14, 13, 12, 12, 13, 13, 12, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, + 12, 11, 11, 12, 11, 12, 15, 16, 17, 18, 17, 16, 14, 14, 13, 12, 11, 11, + 11, 10, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, + 10, 10, + /* Size 16x32 */ + 32, 33, 29, 27, 21, 21, 20, 20, 18, 17, 16, 15, 15, 15, 15, 14, 33, 33, + 28, 26, 22, 22, 21, 20, 19, 18, 17, 16, 16, 16, 16, 15, 34, 32, 27, 26, + 22, 23, 22, 21, 20, 19, 18, 17, 16, 16, 16, 15, 33, 31, 27, 25, 22, 23, + 22, 21, 20, 19, 18, 17, 17, 17, 16, 16, 31, 28, 25, 23, 22, 22, 22, 22, + 20, 19, 18, 17, 17, 17, 16, 16, 28, 26, 23, 22, 22, 23, 22, 22, 20, 20, + 19, 18, 17, 17, 17, 17, 26, 25, 22, 22, 21, 22, 22, 21, 20, 19, 19, 18, + 18, 17, 17, 17, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 18, 17, 17, 17, + 17, 17, 21, 22, 21, 21, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 17, 17, + 21, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 22, + 22, 21, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, + 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 16, 15, 20, 22, 22, 21, 19, 17, + 17, 16, 16, 15, 15, 14, 14, 15, 15, 15, 20, 22, 22, 21, 19, 17, 17, 16, + 15, 15, 14, 14, 14, 14, 15, 14, 20, 21, 22, 21, 19, 17, 16, 16, 14, 14, + 14, 13, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 16, 15, 14, 13, 13, 13, + 13, 13, 14, 14, 19, 20, 21, 20, 18, 16, 16, 15, 14, 13, 13, 13, 13, 13, + 13, 14, 18, 20, 20, 20, 18, 16, 16, 15, 13, 13, 12, 12, 12, 13, 13, 13, + 17, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 13, 17, 18, + 19, 19, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, 19, 18, + 17, 15, 15, 14, 12, 12, 12, 11, 11, 12, 12, 12, 16, 17, 18, 18, 17, 15, + 14, 14, 12, 12, 11, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 15, 14, 13, + 12, 12, 11, 11, 11, 11, 11, 12, 15, 17, 17, 18, 16, 15, 14, 13, 12, 12, + 11, 11, 11, 11, 
11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 13, 12, 11, 11, + 11, 10, 11, 11, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 11, 11, 10, 10, + 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, + 14, 16, 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, + 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, 16, 16, + 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 15, + 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 14, 14, 13, + 13, 12, 12, 11, 11, 10, 10, 9, + /* Size 32x16 */ + 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 32, 31, + 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 29, 28, 27, 27, 25, 23, 22, 22, + 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 15, 15, 27, 26, 26, 25, 23, 22, 22, 21, 21, 21, 21, 22, + 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, + 16, 16, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, + 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 15, 16, 16, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22, + 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, + 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 14, + 14, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, + 17, 18, 19, 19, 19, 20, 19, 18, 17, 17, 17, 16, 15, 15, 14, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, + 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 17, 17, 17, 18, 18, 17, + 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, + 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 17, + 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 10, 10, 10, 10, 10, 10, 9, + /* Size 4x16 */ + 33, 21, 17, 15, 32, 23, 19, 16, 28, 22, 19, 17, 25, 22, 19, 17, 22, 19, + 17, 17, 22, 18, 17, 16, 22, 17, 15, 15, 21, 17, 14, 14, 20, 16, 13, 13, + 19, 16, 12, 12, 18, 15, 12, 12, 17, 15, 12, 11, 17, 15, 12, 10, 16, 16, + 12, 10, 16, 15, 12, 10, 15, 15, 12, 10, + /* Size 16x4 */ + 33, 32, 28, 25, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 15, 21, 23, + 22, 22, 19, 18, 17, 17, 16, 16, 15, 15, 15, 16, 15, 15, 17, 19, 19, 19, + 17, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 16, + 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, + /* Size 8x32 */ + 32, 29, 21, 20, 18, 16, 15, 15, 33, 28, 22, 21, 19, 17, 16, 16, 34, 27, + 22, 22, 20, 18, 16, 16, 33, 27, 22, 22, 20, 18, 17, 16, 31, 25, 22, 22, + 20, 18, 17, 16, 28, 23, 22, 22, 20, 19, 17, 17, 26, 22, 21, 22, 20, 19, + 18, 17, 24, 22, 20, 20, 19, 18, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17, + 21, 22, 19, 19, 
18, 17, 16, 16, 21, 22, 19, 18, 17, 16, 16, 16, 21, 23, + 19, 17, 16, 15, 15, 16, 20, 22, 19, 17, 16, 15, 14, 15, 20, 22, 19, 17, + 15, 14, 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 19, 16, 14, 13, + 13, 14, 19, 21, 18, 16, 14, 13, 13, 13, 18, 20, 18, 16, 13, 12, 12, 13, + 17, 19, 18, 15, 13, 12, 12, 12, 17, 19, 17, 15, 13, 12, 12, 12, 16, 19, + 17, 15, 12, 12, 11, 12, 16, 18, 17, 14, 12, 11, 11, 12, 16, 18, 17, 14, + 12, 11, 11, 11, 15, 17, 16, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, + 11, 11, 15, 17, 16, 14, 13, 11, 10, 10, 15, 17, 16, 14, 13, 12, 10, 10, + 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, + 16, 14, 13, 11, 10, 10, 14, 15, 16, 14, 13, 12, 10, 10, 14, 15, 16, 14, + 13, 12, 11, 10, + /* Size 32x8 */ + 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 29, 28, 27, 27, + 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 21, 22, 22, 22, 22, 22, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, + 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, 16, 17, + 18, 18, 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, + 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 10, 11, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, + 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, + 10, 10, 10, 10 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 27, 16, 12, 27, 18, 13, 11, 16, 13, 9, 8, 12, 11, 8, 6, + /* Size 8x8 */ + 32, 32, 29, 22, 18, 13, 12, 11, 32, 30, 28, 23, 19, 15, 13, 11, 29, 28, + 21, 18, 16, 13, 12, 11, 22, 23, 18, 15, 13, 11, 10, 10, 18, 19, 16, 13, + 11, 9, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 6, 11, + 11, 11, 10, 8, 7, 6, 6, + /* Size 16x16 */ + 32, 33, 33, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, + 32, 32, 30, 27, 25, 22, 19, 17, 16, 14, 13, 12, 11, 10, 33, 32, 31, 30, + 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 32, 32, 30, 29, 28, 26, + 24, 22, 20, 18, 16, 14, 14, 13, 12, 11, 30, 30, 28, 28, 24, 22, 20, 19, + 17, 16, 15, 13, 12, 12, 12, 11, 26, 27, 26, 26, 22, 19, 18, 17, 15, 14, + 13, 12, 11, 11, 11, 10, 23, 25, 24, 24, 20, 18, 16, 15, 14, 13, 12, 11, + 11, 10, 10, 10, 21, 22, 22, 22, 19, 17, 15, 14, 13, 12, 11, 10, 10, 10, + 9, 9, 18, 19, 19, 20, 17, 15, 14, 13, 11, 11, 10, 9, 9, 9, 9, 8, 16, 17, + 17, 18, 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, 16, 15, + 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, + 9, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 12, 11, 11, 10, 9, 8, 8, 7, 7, 7, + 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 10, 11, + 12, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 6, 6, 6, 10, 10, 11, 11, 11, 10, + 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 32, 32, 30, 30, 28, 26, 25, 23, 21, 21, 19, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32, + 32, 32, 32, 30, 30, 29, 27, 26, 24, 22, 22, 20, 19, 18, 17, 16, 15, 13, + 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 27, 26, 25, 23, 22, 20, 19, 19, 17, 
16, 16, 14, 14, 13, 13, 12, + 12, 12, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, 30, 30, 28, 27, + 25, 23, 23, 21, 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 11, 33, 32, 32, 32, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, + 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, + 32, 32, 31, 30, 30, 28, 28, 28, 26, 26, 24, 23, 22, 21, 19, 19, 18, 17, + 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 30, 30, + 29, 28, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, + 14, 13, 13, 12, 12, 12, 11, 11, 30, 30, 31, 31, 29, 28, 28, 26, 25, 24, + 23, 22, 22, 20, 20, 19, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, + 12, 12, 11, 11, 30, 30, 30, 30, 28, 28, 28, 25, 24, 23, 22, 21, 20, 19, + 19, 18, 17, 17, 16, 15, 15, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, + 28, 29, 30, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 26, 27, 27, 28, + 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, + 12, 12, 11, 12, 11, 11, 11, 11, 10, 10, 25, 26, 26, 27, 26, 26, 25, 22, + 21, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, 24, 22, 20, 19, 18, 17, + 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 10, 21, 22, 23, 23, 23, 23, 23, 20, 19, 18, 17, 17, 16, 15, 14, 13, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 21, 22, + 22, 23, 22, 22, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, + 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 20, 21, 20, 21, 21, + 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, + 9, 9, 9, 9, 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, + 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 9, 17, 18, + 19, 19, 19, 19, 19, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, + 10, 9, 9, 9, 9, 8, 9, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 18, 16, 16, + 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 15, 16, 16, 16, 16, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, + 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, + 16, 16, 15, 15, 14, 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, + 8, 8, 7, 8, 7, 7, 7, 13, 13, 14, 14, 14, 15, 15, 14, 13, 13, 12, 12, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, + 14, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 13, 13, 13, 12, + 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, + 12, 13, 13, 13, 13, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, + 12, 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, + 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, + 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, + 6, 6, 6, 6, 10, 11, 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, + 11, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, + 6, 6, 6, 6, 6, 5, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 10, 10, 10, 11, + 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 
7, 7, 7, 7, 6, 6, + 6, 6, 6, 6, 5, 5, 5, + /* Size 4x8 */ + 32, 27, 17, 12, 32, 26, 18, 13, 30, 20, 15, 12, 23, 17, 12, 10, 19, 15, + 10, 9, 14, 12, 9, 8, 12, 12, 8, 7, 11, 10, 8, 6, + /* Size 8x4 */ + 32, 32, 30, 23, 19, 14, 12, 11, 27, 26, 20, 17, 15, 12, 12, 10, 17, 18, + 15, 12, 10, 9, 8, 8, 12, 13, 12, 10, 9, 8, 7, 6, + /* Size 8x16 */ + 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 25, 19, 14, 13, 12, 32, 31, + 28, 24, 19, 14, 13, 12, 32, 30, 27, 24, 20, 15, 13, 12, 30, 28, 23, 20, + 17, 14, 13, 12, 26, 26, 20, 18, 15, 12, 12, 11, 23, 24, 19, 16, 14, 11, + 11, 11, 21, 22, 18, 15, 13, 11, 10, 10, 18, 19, 16, 14, 11, 9, 9, 9, 16, + 17, 15, 13, 11, 9, 8, 8, 14, 16, 14, 12, 10, 8, 8, 8, 13, 14, 13, 11, 9, + 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 12, 12, 10, 9, 8, 7, 6, 10, 12, + 12, 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, + /* Size 16x8 */ + 32, 33, 32, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 32, 32, + 31, 30, 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 28, 29, 28, 27, + 23, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 11, 23, 25, 24, 24, 20, 18, + 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 18, 19, 19, 20, 17, 15, 14, 13, + 11, 11, 10, 9, 9, 9, 8, 9, 13, 14, 14, 15, 14, 12, 11, 11, 9, 9, 8, 8, + 7, 8, 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, 11, + 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 16x32 */ + 32, 33, 32, 32, 28, 26, 23, 19, 18, 16, 13, 13, 12, 11, 11, 11, 33, 32, + 32, 32, 29, 27, 24, 20, 19, 17, 14, 13, 12, 12, 12, 11, 33, 32, 32, 32, + 29, 27, 25, 20, 19, 17, 14, 14, 13, 12, 12, 11, 33, 32, 32, 31, 30, 28, + 25, 21, 19, 17, 14, 14, 13, 12, 12, 12, 32, 32, 31, 30, 28, 26, 24, 20, + 19, 17, 14, 14, 13, 13, 12, 12, 32, 32, 30, 30, 28, 26, 24, 21, 19, 18, + 15, 14, 13, 13, 12, 12, 32, 31, 30, 29, 27, 26, 24, 21, 20, 18, 15, 15, + 13, 13, 12, 12, 30, 30, 29, 28, 24, 23, 21, 19, 18, 16, 14, 14, 13, 13, + 13, 12, 30, 30, 28, 28, 23, 22, 20, 18, 17, 16, 14, 13, 13, 12, 12, 12, + 28, 30, 28, 27, 21, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 12, 26, 28, + 26, 26, 20, 19, 18, 16, 15, 14, 12, 12, 12, 12, 11, 12, 26, 27, 26, 25, + 20, 19, 17, 15, 15, 14, 12, 12, 11, 11, 11, 11, 23, 25, 24, 24, 19, 18, + 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 22, 23, 23, 22, 18, 17, 16, 14, + 13, 12, 11, 11, 10, 10, 10, 10, 21, 22, 22, 22, 18, 17, 15, 13, 13, 12, + 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 16, 14, 12, 12, 11, 10, 10, + 9, 9, 10, 9, 18, 19, 19, 19, 16, 15, 14, 12, 11, 11, 9, 9, 9, 9, 9, 9, + 17, 19, 19, 19, 16, 15, 14, 12, 11, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17, + 18, 15, 14, 13, 11, 11, 10, 9, 9, 8, 8, 8, 9, 15, 16, 17, 17, 14, 13, + 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 13, 12, 11, 10, 9, + 8, 8, 8, 8, 8, 8, 13, 14, 14, 15, 13, 12, 11, 10, 9, 9, 8, 8, 7, 8, 8, + 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 7, 12, 14, 14, + 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 11, + 9, 9, 8, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 12, 10, 9, 9, 8, 8, 7, 7, + 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 11, 12, + 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 7, 6, 6, 6, 10, 12, 12, 12, 12, 11, + 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 10, 9, 9, 8, 7, + 7, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, + 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, + /* Size 32x16 */ + 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32, + 32, 32, 31, 30, 30, 30, 28, 27, 25, 
23, 22, 21, 19, 19, 17, 16, 16, 14, + 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 31, 30, 30, 29, + 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 26, 25, + 24, 22, 22, 20, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, + 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 26, 27, + 27, 28, 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13, + 13, 12, 12, 12, 11, 12, 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, + 24, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 10, 10, 19, 20, 20, 21, 20, 21, 21, 19, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 10, 10, 9, + 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 16, 17, 17, 17, 17, + 18, 18, 16, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 12, 12, + 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 8, 8, 7, 7, 7, 7, 8, 13, 13, + 14, 14, 14, 14, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, + 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, + 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, + 11, 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, + 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, + 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 9, + 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, + /* Size 4x16 */ + 33, 26, 16, 11, 32, 27, 17, 12, 32, 26, 17, 13, 31, 26, 18, 13, 30, 22, + 16, 12, 28, 19, 14, 12, 25, 18, 13, 11, 22, 17, 12, 10, 19, 15, 11, 9, + 17, 14, 10, 8, 16, 13, 9, 8, 14, 12, 9, 7, 13, 11, 8, 7, 12, 11, 8, 6, + 12, 11, 8, 6, 11, 10, 8, 6, + /* Size 16x4 */ + 33, 32, 32, 31, 30, 28, 25, 22, 19, 17, 16, 14, 13, 12, 12, 11, 26, 27, + 26, 26, 22, 19, 18, 17, 15, 14, 13, 12, 11, 11, 11, 10, 16, 17, 17, 18, + 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 11, 12, 13, 13, 12, 12, 11, + 10, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 8x32 */ + 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 24, 19, 14, 12, 12, 33, 32, + 29, 25, 19, 14, 13, 12, 33, 32, 30, 25, 19, 14, 13, 12, 32, 31, 28, 24, + 19, 14, 13, 12, 32, 30, 28, 24, 19, 15, 13, 12, 32, 30, 27, 24, 20, 15, + 13, 12, 30, 29, 24, 21, 18, 14, 13, 13, 30, 28, 23, 20, 17, 14, 13, 12, + 28, 28, 21, 19, 16, 13, 12, 12, 26, 26, 20, 18, 15, 12, 12, 11, 26, 26, + 20, 17, 15, 12, 11, 11, 23, 24, 19, 16, 14, 11, 11, 11, 22, 23, 18, 16, + 13, 11, 10, 10, 21, 22, 18, 15, 13, 11, 10, 10, 19, 20, 17, 14, 12, 10, + 9, 10, 18, 19, 16, 14, 11, 9, 9, 9, 17, 19, 16, 14, 11, 9, 9, 9, 16, 17, + 15, 13, 11, 9, 8, 8, 15, 17, 14, 12, 10, 8, 8, 8, 14, 16, 14, 12, 10, 8, + 8, 8, 13, 14, 13, 11, 9, 8, 7, 8, 13, 14, 13, 11, 9, 8, 7, 7, 12, 14, + 13, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 13, 12, 10, 9, 8, 7, + 6, 11, 12, 12, 10, 9, 8, 7, 6, 11, 12, 12, 10, 9, 7, 7, 6, 10, 12, 12, + 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, + 10, 11, 11, 10, 9, 8, 7, 6, + /* Size 32x8 */ + 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17, + 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 32, 32, 32, 32, + 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 
19, 17, 17, 16, 14, + 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, + 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, + 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, 24, 21, 20, 19, 18, 17, + 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, + 10, 10, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 13, 14, 14, 14, 14, + 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, + 8, 8, 7, 7, 7, 7, 8, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, + 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, + 12, 12, 12, 12, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, + 7, 7, 6, 6, 6, 6, 6, 6, 6 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 23, 19, 16, 23, 19, 17, 15, 19, 17, 13, 12, 16, 15, 12, 10, + /* Size 8x8 */ + 33, 28, 22, 21, 20, 17, 16, 15, 28, 24, 22, 22, 21, 19, 17, 16, 22, 22, + 19, 19, 19, 17, 16, 16, 21, 22, 19, 17, 16, 15, 14, 14, 20, 21, 19, 16, + 14, 13, 13, 13, 17, 19, 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 12, + 11, 10, 15, 16, 16, 14, 13, 12, 10, 10, + /* Size 16x16 */ + 32, 34, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 15, 15, 15, 14, 34, 33, + 29, 26, 23, 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 15, 31, 29, 26, 24, + 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 28, 26, 24, 22, 22, 22, + 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 23, 23, 22, 22, 20, 20, 20, 20, + 19, 19, 18, 17, 17, 17, 16, 17, 21, 22, 22, 22, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 16, 16, 16, 21, 22, 23, 23, 20, 19, 18, 17, 17, 16, 16, 15, + 15, 15, 15, 15, 20, 22, 22, 22, 20, 18, 17, 17, 16, 15, 15, 14, 14, 14, + 14, 14, 19, 20, 21, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, + 18, 19, 20, 20, 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 19, + 19, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 16, 17, 18, 19, + 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, + 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, 16, 17, 16, 16, 15, 14, 13, 12, + 12, 11, 11, 10, 10, 10, 14, 15, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, + 11, 10, 10, 10, + /* Size 32x32 */ + 32, 33, 34, 34, 31, 29, 28, 25, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, + 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 14, 33, 33, 33, 33, + 30, 28, 27, 24, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 18, 18, 17, + 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 34, 33, 33, 33, 29, 28, 26, 24, + 23, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 34, 33, 33, 32, 29, 28, 26, 24, 23, 22, 23, 23, + 23, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 31, 30, 29, 29, 26, 25, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, + 21, 21, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 29, 28, + 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 28, 27, 26, 26, 24, 23, + 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 25, 24, 24, 24, 23, 22, 22, 21, 21, 20, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 17, 17, 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, + 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 17, 17, 17, + 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 16, 
16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, + 22, 22, 22, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21, + 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 21, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 20, 21, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 21, + 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, + 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, + 22, 20, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, + 13, 13, 14, 13, 13, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, + 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 20, 19, 19, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, + 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 17, 18, 19, 19, + 19, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 19, 19, 19, 19, 20, 19, + 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, + 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 12, 11, 12, 11, 12, + 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, + 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, + 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, + 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, + 10, 10, 11, 10, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, + 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, + 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 15, 15, 16, 16, + 16, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, + 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, + 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, + 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, + /* Size 4x8 */ + 33, 22, 19, 16, 27, 22, 20, 17, 22, 19, 18, 17, 22, 18, 16, 14, 20, 17, + 14, 13, 18, 16, 12, 12, 17, 16, 12, 11, 16, 15, 12, 10, + /* Size 8x4 */ + 33, 27, 22, 22, 20, 18, 17, 16, 22, 22, 19, 18, 17, 16, 16, 15, 19, 20, + 18, 16, 14, 12, 12, 12, 16, 17, 17, 14, 13, 12, 11, 10, + /* Size 8x16 */ + 32, 30, 21, 21, 19, 16, 15, 15, 33, 28, 22, 22, 20, 18, 17, 16, 31, 26, + 22, 22, 21, 18, 17, 17, 28, 23, 22, 23, 21, 19, 18, 17, 23, 22, 20, 20, + 19, 17, 17, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 23, 19, 18, 17, 15, + 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 13, 13, 13, + 18, 20, 18, 16, 14, 
12, 12, 13, 17, 19, 18, 16, 14, 12, 12, 12, 16, 18, + 17, 15, 13, 12, 11, 12, 16, 17, 16, 15, 13, 11, 11, 11, 15, 17, 16, 14, + 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, + 11, 10, + /* Size 16x8 */ + 32, 33, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 16, 15, 15, 14, 30, 28, + 26, 23, 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, 22, 22, + 20, 19, 19, 19, 19, 18, 18, 17, 16, 16, 16, 16, 21, 22, 22, 23, 20, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 15, 15, 19, 20, 21, 21, 19, 18, 17, 16, + 15, 14, 14, 13, 13, 13, 13, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12, + 12, 12, 11, 12, 12, 12, 15, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, + 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 10, + 10, 10, + /* Size 16x32 */ + 32, 33, 30, 28, 21, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 15, 33, 33, + 29, 27, 22, 22, 22, 20, 20, 19, 17, 17, 16, 16, 16, 16, 33, 32, 28, 26, + 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 16, 34, 32, 28, 26, 22, 23, + 23, 21, 21, 20, 18, 18, 17, 17, 17, 16, 31, 28, 26, 24, 22, 22, 22, 22, + 21, 20, 18, 18, 17, 17, 17, 16, 29, 27, 24, 23, 22, 22, 23, 22, 21, 20, + 19, 18, 18, 17, 17, 17, 28, 26, 23, 22, 22, 22, 23, 22, 21, 20, 19, 19, + 18, 18, 17, 17, 24, 24, 23, 22, 20, 20, 21, 20, 20, 19, 18, 18, 17, 18, + 17, 17, 23, 23, 22, 22, 20, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, 17, + 21, 22, 22, 21, 19, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 16, 21, 22, + 22, 22, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 16, 16, 21, 23, 22, 22, + 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, 19, 18, + 18, 17, 17, 16, 15, 15, 15, 15, 15, 16, 20, 22, 22, 22, 19, 18, 17, 16, + 16, 16, 15, 14, 15, 14, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, + 14, 14, 14, 14, 14, 15, 20, 21, 22, 22, 19, 18, 17, 16, 15, 14, 14, 14, + 13, 14, 14, 14, 19, 21, 21, 21, 19, 18, 17, 15, 15, 14, 13, 13, 13, 13, + 13, 14, 19, 20, 21, 21, 19, 17, 17, 15, 15, 14, 13, 13, 13, 13, 13, 13, + 18, 20, 20, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 13, 13, 17, 19, + 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 17, 19, 19, 20, + 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 16, + 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 16, 18, 18, 19, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 16, 15, 14, 13, 12, + 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 16, 16, 15, 13, 13, 12, 11, 11, + 11, 11, 11, 11, 15, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, + 11, 11, 15, 17, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, 10, 11, + 15, 16, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, + 16, 17, 16, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, + 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, 16, 15, + 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 16, 16, 15, 15, 13, + 13, 12, 12, 11, 11, 10, 10, 10, + /* Size 32x16 */ + 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, + 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 33, 33, 32, 32, + 28, 27, 26, 24, 23, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 30, 29, 28, 28, 26, 24, 23, 23, + 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, + 17, 17, 16, 16, 16, 16, 28, 27, 26, 26, 24, 23, 22, 22, 22, 21, 22, 22, + 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 17, 16, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 
16, 21, 22, + 22, 23, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 21, 22, 22, 23, 22, 23, + 23, 21, 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, + 15, 15, 14, 14, 15, 15, 15, 15, 20, 20, 21, 21, 22, 22, 22, 20, 20, 19, + 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, 14, 14, + 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, + 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 16, 17, 18, 18, + 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, + 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, + 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 15, 16, + 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, + 17, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 10, 10, 10, + /* Size 4x16 */ + 33, 21, 18, 15, 32, 22, 19, 16, 28, 22, 20, 17, 26, 22, 20, 18, 23, 20, + 19, 17, 22, 19, 17, 16, 23, 18, 16, 15, 22, 18, 15, 14, 21, 18, 14, 13, + 20, 17, 13, 12, 19, 17, 13, 12, 18, 16, 12, 11, 17, 16, 12, 11, 17, 16, + 12, 11, 16, 16, 13, 10, 16, 15, 12, 10, + /* Size 16x4 */ + 33, 32, 28, 26, 23, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, + 22, 22, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 18, 19, 20, 20, + 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 13, 12, 15, 16, 17, 18, 17, 16, + 15, 14, 13, 12, 12, 11, 11, 11, 10, 10, + /* Size 8x32 */ + 32, 30, 21, 21, 19, 16, 15, 15, 33, 29, 22, 22, 20, 17, 16, 16, 33, 28, + 22, 22, 20, 18, 17, 16, 34, 28, 22, 23, 21, 18, 17, 17, 31, 26, 22, 22, + 21, 18, 17, 17, 29, 24, 22, 23, 21, 19, 18, 17, 28, 23, 22, 23, 21, 19, + 18, 17, 24, 23, 20, 21, 20, 18, 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, + 21, 22, 19, 19, 19, 17, 16, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 22, + 19, 18, 17, 16, 16, 16, 21, 23, 19, 18, 17, 15, 15, 15, 20, 22, 19, 17, + 16, 15, 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 20, 22, 19, 17, 15, 14, + 13, 14, 19, 21, 19, 17, 15, 13, 13, 13, 19, 21, 19, 17, 15, 13, 13, 13, + 18, 20, 18, 16, 14, 12, 12, 13, 17, 20, 18, 16, 14, 12, 12, 12, 17, 19, + 18, 16, 14, 12, 12, 12, 16, 18, 17, 15, 13, 12, 11, 12, 16, 18, 17, 15, + 13, 12, 11, 12, 16, 18, 17, 15, 13, 11, 11, 11, 16, 17, 16, 15, 13, 11, + 11, 11, 15, 17, 16, 15, 13, 12, 11, 11, 15, 17, 16, 14, 13, 12, 11, 10, + 15, 17, 16, 14, 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, + 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, + 13, 12, 11, 10, + /* Size 32x8 */ + 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19, + 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 30, 29, 28, 28, + 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, + 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 22, 22, 22, 20, + 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 23, 23, 21, 20, 19, 18, 18, + 
18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15, + 15, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, + 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 16, 17, + 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 12, 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 18, + 18, 17, 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 10, 10 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 28, 18, 13, 28, 19, 14, 11, 18, 14, 10, 8, 13, 11, 8, 7, + /* Size 8x8 */ + 32, 32, 29, 24, 19, 15, 13, 11, 32, 31, 28, 24, 20, 16, 14, 12, 29, 28, + 22, 20, 17, 14, 13, 12, 24, 24, 20, 16, 14, 12, 11, 10, 19, 20, 17, 14, + 12, 10, 9, 9, 15, 16, 14, 12, 10, 9, 8, 8, 13, 14, 13, 11, 9, 8, 7, 7, + 11, 12, 12, 10, 9, 8, 7, 6, + /* Size 16x16 */ + 32, 33, 33, 32, 30, 28, 25, 22, 19, 17, 16, 14, 12, 12, 11, 11, 33, 32, + 32, 32, 30, 29, 26, 23, 20, 19, 17, 15, 13, 13, 12, 11, 33, 32, 31, 31, + 29, 28, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 32, 32, 31, 29, 28, 27, + 25, 23, 21, 19, 18, 16, 14, 14, 13, 12, 30, 30, 29, 28, 26, 24, 22, 20, + 19, 18, 16, 15, 13, 13, 12, 12, 28, 29, 28, 27, 24, 21, 20, 18, 17, 16, + 15, 14, 13, 12, 11, 11, 25, 26, 26, 25, 22, 20, 18, 17, 15, 14, 14, 12, + 12, 11, 11, 11, 22, 23, 23, 23, 20, 18, 17, 15, 14, 13, 12, 11, 11, 10, + 10, 10, 19, 20, 21, 21, 19, 17, 15, 14, 12, 12, 11, 10, 10, 9, 9, 9, 17, + 19, 19, 19, 18, 16, 14, 13, 12, 11, 10, 10, 9, 9, 9, 8, 16, 17, 17, 18, + 16, 15, 14, 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 12, + 11, 10, 10, 9, 8, 8, 8, 7, 7, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, + 8, 7, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, + 11, 12, 12, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 11, 11, 12, 12, + 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 18, + 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 11, 11, 10, 33, 32, 32, 32, + 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 19, 18, 17, 17, 15, + 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 29, 27, 26, 24, 23, 23, 20, 20, 19, 17, 17, 15, 15, 14, 13, 13, + 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, + 27, 25, 23, 23, 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, + 12, 11, 33, 32, 32, 32, 31, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 23, + 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 33, 32, + 32, 32, 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, + 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 32, 32, 32, 32, 31, 30, + 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 18, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, + 26, 24, 24, 23, 22, 22, 20, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, + 13, 12, 12, 12, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, + 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, 12, + 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 17, 17, + 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 28, 29, 29, 30, + 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, + 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 26, 27, 27, 28, 26, 26, 26, 24, + 23, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 
14, 13, 13, 12, 12, 12, + 11, 11, 11, 11, 11, 11, 25, 26, 26, 27, 26, 26, 25, 24, 22, 20, 20, 19, + 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 11, + 11, 10, 23, 24, 24, 25, 24, 24, 24, 23, 22, 20, 19, 18, 17, 16, 16, 15, + 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 22, 23, + 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, + 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 21, 22, 23, 23, 23, 23, + 22, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, + 10, 10, 10, 10, 10, 10, 9, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, + 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, + 9, 9, 9, 18, 19, 20, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 13, 13, + 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 17, 18, 19, 19, + 19, 19, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, + 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 16, 17, 17, 18, 18, 18, 18, 17, 17, 16, + 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, + 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, + 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, + 16, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 8, 8, 8, 8, 8, 7, 14, 14, 15, 15, 15, 15, 16, 15, 15, 14, 14, 13, + 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 13, + 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, + 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, + 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, + 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, + 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, + 7, 7, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, + 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, + 12, 12, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, + 7, 7, 7, 7, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, + 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, + 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, + 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, + 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, + /* Size 4x8 */ + 32, 29, 17, 12, 32, 28, 18, 13, 30, 22, 16, 12, 25, 19, 13, 11, 20, 17, + 11, 9, 16, 14, 9, 8, 14, 13, 9, 7, 12, 11, 9, 7, + /* Size 8x4 */ + 32, 32, 30, 25, 20, 16, 14, 12, 29, 28, 22, 19, 17, 14, 13, 11, 17, 18, + 16, 13, 11, 9, 9, 9, 12, 13, 12, 11, 9, 8, 7, 7, + /* Size 8x16 */ + 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 30, 25, 20, 17, 13, 12, 33, 31, + 29, 24, 21, 17, 14, 13, 32, 30, 28, 24, 21, 18, 14, 13, 30, 29, 25, 21, + 19, 16, 13, 13, 28, 28, 22, 19, 17, 15, 13, 12, 25, 26, 21, 17, 15, 13, + 12, 11, 22, 23, 19, 16, 14, 12, 11, 10, 19, 20, 18, 14, 12, 11, 10, 9, + 18, 19, 17, 14, 12, 10, 9, 9, 16, 17, 16, 13, 11, 10, 9, 8, 14, 15, 14, + 12, 10, 9, 8, 8, 12, 14, 13, 11, 10, 9, 7, 7, 12, 13, 12, 11, 9, 8, 7, + 7, 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, + /* Size 16x8 */ + 32, 33, 33, 32, 30, 28, 25, 22, 19, 18, 16, 14, 12, 12, 11, 11, 33, 32, + 31, 30, 29, 28, 26, 23, 20, 19, 17, 15, 14, 13, 12, 12, 29, 30, 29, 28, + 25, 22, 21, 19, 18, 17, 16, 14, 13, 12, 12, 12, 23, 25, 24, 24, 21, 19, + 17, 16, 14, 14, 13, 12, 11, 11, 11, 11, 19, 20, 
21, 21, 19, 17, 15, 14, + 12, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 18, 16, 15, 13, 12, 11, 10, 10, + 9, 9, 8, 8, 8, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, + 11, 12, 13, 13, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, + /* Size 16x32 */ + 32, 33, 33, 32, 29, 28, 23, 22, 19, 17, 16, 13, 12, 12, 11, 11, 33, 32, + 32, 32, 29, 29, 24, 23, 20, 17, 17, 14, 13, 12, 12, 12, 33, 32, 32, 32, + 30, 29, 25, 23, 20, 18, 17, 14, 13, 12, 12, 12, 33, 32, 32, 31, 30, 30, + 25, 23, 21, 18, 17, 14, 14, 13, 12, 12, 33, 32, 31, 30, 29, 28, 24, 23, + 21, 18, 17, 14, 14, 13, 13, 12, 32, 32, 31, 30, 28, 28, 24, 23, 20, 18, + 17, 14, 14, 13, 13, 12, 32, 31, 30, 29, 28, 27, 24, 23, 21, 18, 18, 15, + 14, 13, 13, 12, 32, 31, 30, 28, 26, 26, 23, 22, 20, 18, 17, 14, 14, 13, + 13, 13, 30, 30, 29, 28, 25, 24, 21, 20, 19, 17, 16, 14, 13, 13, 13, 13, + 29, 30, 28, 27, 23, 22, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 28, 30, + 28, 27, 22, 21, 19, 18, 17, 16, 15, 13, 13, 12, 12, 12, 26, 28, 26, 26, + 21, 20, 18, 17, 16, 14, 14, 12, 12, 12, 12, 11, 25, 26, 26, 25, 21, 20, + 17, 17, 15, 14, 13, 12, 12, 11, 11, 11, 23, 25, 24, 24, 20, 19, 16, 16, + 14, 13, 13, 11, 11, 11, 11, 11, 22, 23, 23, 23, 19, 18, 16, 15, 14, 12, + 12, 11, 11, 10, 10, 10, 21, 23, 23, 22, 19, 18, 15, 15, 13, 12, 12, 11, + 10, 10, 10, 10, 19, 21, 20, 20, 18, 17, 14, 14, 12, 11, 11, 10, 10, 10, + 9, 10, 19, 20, 20, 20, 17, 17, 14, 13, 12, 11, 11, 10, 9, 9, 9, 9, 18, + 19, 19, 19, 17, 16, 14, 13, 12, 11, 10, 9, 9, 9, 9, 9, 16, 18, 18, 18, + 16, 15, 13, 12, 11, 10, 10, 9, 9, 9, 9, 8, 16, 17, 17, 18, 16, 15, 13, + 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 16, 16, 16, 14, 14, 12, 12, 11, 9, 9, + 8, 8, 8, 8, 8, 14, 15, 15, 16, 14, 14, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, + 13, 14, 14, 15, 13, 13, 11, 11, 10, 9, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, + 13, 13, 11, 11, 10, 9, 9, 8, 7, 7, 7, 7, 12, 14, 14, 14, 13, 13, 11, 11, + 10, 9, 8, 8, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, + 7, 7, 7, 12, 12, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 6, 11, 12, + 12, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, 11, 12, 12, 12, 12, 11, + 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 11, 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, + 7, 7, 6, 6, 6, 10, 11, 11, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 6, 6, 6, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19, + 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32, + 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 21, 20, 19, 18, 17, 16, + 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 33, 32, 32, 32, 31, 31, 30, 30, + 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, 15, 14, 14, 14, + 13, 13, 12, 12, 12, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, + 25, 24, 23, 22, 20, 20, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 13, 12, + 12, 12, 29, 29, 30, 30, 29, 28, 28, 26, 25, 23, 22, 21, 21, 20, 19, 19, + 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 28, 29, + 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, + 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, + 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, + 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 17, + 17, 18, 18, 18, 18, 18, 18, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, + 10, 10, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, + 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, + 8, 8, 8, 8, 13, 14, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 8, 8, 7, 7, 12, 13, 13, 14, + 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, + 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, + 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 11, 12, + 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12, + 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, + /* Size 4x16 */ + 33, 28, 17, 12, 32, 29, 18, 12, 32, 28, 18, 13, 31, 27, 18, 13, 30, 24, + 17, 13, 30, 21, 16, 12, 26, 20, 14, 11, 23, 18, 12, 10, 21, 17, 11, 10, + 19, 16, 11, 9, 17, 15, 10, 8, 15, 14, 9, 8, 14, 13, 9, 7, 13, 12, 9, 7, + 12, 12, 9, 7, 12, 11, 8, 6, + /* Size 16x4 */ + 33, 32, 32, 31, 30, 30, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 28, 29, + 28, 27, 24, 21, 20, 18, 17, 16, 15, 14, 13, 12, 12, 11, 17, 18, 18, 18, + 17, 16, 14, 12, 11, 11, 10, 9, 9, 9, 9, 8, 12, 12, 13, 13, 13, 12, 11, + 10, 10, 9, 8, 8, 7, 7, 7, 6, + /* Size 8x32 */ + 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 29, 24, 20, 17, 13, 12, 33, 32, + 30, 25, 20, 17, 13, 12, 33, 32, 30, 25, 21, 17, 14, 12, 33, 31, 29, 24, + 21, 17, 14, 13, 32, 31, 28, 24, 20, 17, 14, 13, 32, 30, 28, 24, 21, 18, + 14, 13, 32, 30, 26, 23, 20, 17, 14, 13, 30, 29, 25, 21, 19, 16, 13, 13, + 29, 28, 23, 20, 17, 15, 13, 12, 28, 28, 22, 19, 17, 15, 13, 12, 26, 26, + 21, 18, 16, 14, 12, 12, 25, 26, 21, 17, 15, 13, 12, 11, 23, 24, 20, 16, + 14, 13, 11, 11, 22, 23, 19, 16, 14, 12, 11, 10, 21, 23, 19, 15, 13, 12, + 10, 10, 19, 20, 18, 14, 12, 11, 10, 9, 19, 20, 17, 14, 12, 11, 9, 9, 18, + 19, 17, 14, 12, 10, 9, 9, 16, 18, 16, 13, 11, 10, 9, 9, 16, 17, 16, 13, + 11, 10, 9, 8, 14, 16, 14, 12, 11, 9, 8, 8, 14, 15, 14, 12, 10, 9, 8, 8, + 13, 14, 13, 11, 10, 9, 8, 7, 12, 14, 13, 11, 10, 9, 7, 7, 12, 14, 13, + 11, 10, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, + 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, 11, 12, 12, 11, + 9, 8, 7, 6, 10, 11, 12, 11, 9, 8, 7, 6, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19, + 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32, + 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, + 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 29, 29, 30, 30, 29, 28, 28, 26, + 25, 23, 22, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18, + 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, 14, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17, + 17, 17, 17, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, + 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, + 11, 12, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, + 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 23, 20, 17, 23, 19, 17, 16, 20, 17, 14, 13, 17, 16, 13, 11, + /* Size 8x8 */ + 33, 30, 22, 22, 20, 18, 17, 16, 30, 26, 22, 23, 21, 19, 18, 17, 22, 22, + 20, 20, 19, 18, 17, 17, 22, 23, 20, 18, 17, 16, 15, 15, 20, 21, 
19, 17, + 15, 14, 13, 13, 18, 19, 18, 16, 14, 12, 12, 12, 17, 18, 17, 15, 13, 12, + 11, 11, 16, 17, 17, 15, 13, 12, 11, 10, + /* Size 16x16 */ + 32, 33, 31, 28, 25, 21, 21, 20, 20, 19, 18, 17, 16, 15, 15, 15, 33, 33, + 30, 26, 24, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 31, 30, 28, 24, + 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 28, 26, 24, 22, 22, 21, + 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 25, 24, 23, 22, 21, 20, 21, 20, + 20, 20, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 20, 19, 19, 19, 19, 19, + 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, + 16, 15, 15, 15, 20, 22, 22, 22, 20, 19, 18, 17, 16, 16, 16, 15, 15, 14, + 14, 14, 20, 21, 22, 22, 20, 19, 17, 16, 16, 15, 15, 14, 14, 13, 14, 14, + 19, 20, 21, 21, 20, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 19, + 20, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, + 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, + 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, 15, 14, + 13, 13, 12, 12, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, + 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 14, 13, 12, 12, + 11, 11, 10, 10, + /* Size 32x32 */ + 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 33, 33, 33, 33, + 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 33, 33, 33, 33, 30, 29, 26, 26, + 24, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 16, 15, 34, 33, 33, 32, 30, 29, 26, 25, 24, 23, 22, 23, + 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 16, 16, 31, 30, 30, 30, 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, + 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 31, 30, + 29, 29, 27, 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, + 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 28, 27, 26, 26, 24, 24, + 22, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, + 19, 19, 18, 18, 17, 17, 17, 17, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, + 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 18, + 18, 17, 17, 17, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, + 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, + 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, + 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21, + 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, + 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 22, 22, 22, 22, 21, 20, 19, 19, + 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, + 15, 15, 21, 22, 22, 23, 23, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 17, + 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, + 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 15, 14, 14, 14, 15, 20, 21, 22, 22, 22, 22, + 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, 22, 21, 20, 19, + 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, + 14, 13, 14, 14, 19, 20, 20, 21, 21, 21, 22, 21, 20, 19, 19, 18, 17, 17, + 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 
13, 13, 13, 13, 13, 13, 13, 13, + 19, 20, 20, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, + 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, + 13, 13, 12, 12, 12, 13, 12, 13, 13, 12, 18, 19, 19, 20, 20, 20, 20, 20, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 20, 19, 19, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, + 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 12, 11, 16, 17, 17, 18, 18, 18, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, + 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, + 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 15, 16, 16, 17, 17, 17, 17, 17, + 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, + 10, 10, 14, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, + 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, + /* Size 4x8 */ + 33, 22, 19, 16, 28, 22, 20, 17, 22, 20, 19, 17, 23, 19, 16, 15, 21, 19, + 14, 13, 19, 18, 13, 12, 17, 17, 13, 11, 16, 16, 13, 11, + /* Size 8x4 */ + 33, 28, 22, 23, 21, 19, 17, 16, 22, 22, 20, 19, 19, 18, 17, 16, 19, 20, + 19, 16, 14, 13, 13, 13, 16, 17, 17, 15, 13, 12, 11, 11, + /* Size 8x16 */ + 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 21, 19, 17, 16, 31, 28, + 22, 23, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, 19, 17, 24, 23, 21, 21, + 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, 20, 18, 17, 17, + 16, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, + 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 12, 17, 19, + 18, 16, 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, + 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 16, 17, 15, 14, 12, + 11, 10, + /* Size 16x8 */ + 32, 33, 31, 28, 24, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 15, 31, 30, + 28, 24, 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 23, 23, 22, 22, + 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 21, 22, 23, 23, 21, 19, + 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 20, 21, 22, 22, 20, 19, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 14, 18, 19, 20, 20, 19, 18, 17, 16, 14, 14, + 13, 13, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, 16, 14, 14, 13, 12, 12, + 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, + 11, 10, + /* Size 16x32 */ + 32, 33, 31, 28, 23, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 15, 33, 33, + 30, 27, 23, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 16, 33, 32, 30, 26, + 23, 22, 22, 22, 21, 20, 19, 17, 17, 17, 16, 16, 34, 32, 29, 26, 23, 22, + 23, 22, 21, 20, 20, 18, 18, 17, 17, 17, 31, 29, 
28, 24, 22, 22, 23, 22, + 22, 20, 20, 18, 18, 17, 17, 17, 31, 28, 27, 24, 22, 22, 22, 22, 22, 20, + 20, 18, 18, 17, 17, 17, 28, 26, 24, 22, 22, 22, 23, 22, 22, 21, 20, 19, + 19, 18, 17, 17, 26, 25, 24, 22, 21, 21, 22, 22, 21, 20, 20, 19, 18, 18, + 18, 17, 24, 24, 23, 22, 21, 20, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 18, 17, 17, 17, 17, 17, 21, 22, + 22, 21, 20, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 17, 21, 22, 22, 22, + 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 22, 22, 20, 19, + 18, 18, 17, 17, 17, 16, 16, 16, 15, 16, 21, 23, 23, 22, 20, 19, 18, 17, + 17, 16, 16, 15, 15, 15, 15, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, + 16, 15, 14, 15, 14, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, 15, 14, + 14, 14, 14, 14, 20, 21, 22, 22, 19, 19, 17, 16, 16, 15, 14, 14, 14, 14, + 14, 14, 19, 21, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 14, 13, + 19, 20, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 20, + 20, 20, 19, 18, 16, 16, 15, 14, 13, 13, 12, 13, 13, 13, 18, 20, 20, 20, + 19, 18, 16, 16, 15, 14, 13, 12, 12, 12, 12, 13, 17, 19, 19, 20, 18, 18, + 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 15, + 14, 13, 13, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 17, 15, 15, 14, 13, + 12, 12, 11, 11, 12, 12, 16, 18, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11, + 11, 11, 11, 12, 16, 17, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11, 11, 11, + 11, 11, 16, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, + 15, 17, 17, 18, 17, 16, 15, 15, 13, 13, 12, 11, 11, 11, 11, 11, 15, 17, + 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 15, 16, 17, 17, + 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, + 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 15, 15, 14, + 14, 12, 12, 11, 11, 10, 10, 10, + /* Size 32x16 */ + 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 33, 33, 32, 32, + 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 19, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 31, 30, 30, 29, 28, 27, 24, 24, + 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, + 17, 17, 17, 17, 16, 16, 28, 27, 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, + 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 23, 22, + 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, + 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 15, + 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, + 18, 19, 20, 20, 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, + 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, 13, 13, 13, 12, 18, 19, 19, 20, + 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13, + 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, + 11, 11, 12, 12, 12, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, 17, 16, + 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 
+ 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, + 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10, 15, 16, + 16, 17, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, 17, 17, + 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 10, 10, 10, 10, + /* Size 4x16 */ + 33, 21, 18, 15, 32, 22, 20, 17, 29, 22, 20, 17, 26, 22, 21, 18, 24, 20, + 19, 17, 22, 19, 18, 16, 23, 19, 17, 16, 22, 19, 16, 15, 21, 19, 15, 14, + 20, 19, 14, 13, 20, 18, 14, 12, 18, 17, 13, 12, 18, 17, 13, 11, 17, 16, + 12, 11, 17, 16, 13, 11, 16, 16, 13, 11, + /* Size 16x4 */ + 33, 32, 29, 26, 24, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 16, 21, 22, + 22, 22, 20, 19, 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 18, 20, 20, 21, + 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 13, 13, 15, 17, 17, 18, 17, 16, + 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, + /* Size 8x32 */ + 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 20, 19, 17, 16, 33, 30, + 23, 22, 21, 19, 17, 16, 34, 29, 23, 23, 21, 20, 18, 17, 31, 28, 22, 23, + 22, 20, 18, 17, 31, 27, 22, 22, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, + 19, 17, 26, 24, 21, 22, 21, 20, 18, 18, 24, 23, 21, 21, 20, 19, 18, 17, + 22, 22, 20, 19, 19, 18, 17, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, + 20, 18, 18, 17, 16, 16, 21, 22, 20, 18, 17, 17, 16, 15, 21, 23, 20, 18, + 17, 16, 15, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 20, 17, 16, 15, + 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 14, 13, 14, + 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 13, 18, 20, + 19, 16, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 17, 19, 18, 16, + 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 12, 16, 18, 17, 15, 14, 12, + 11, 11, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, 13, 12, 11, 11, + 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, + 17, 15, 13, 12, 11, 10, 15, 16, 17, 15, 14, 12, 11, 10, 15, 16, 17, 15, + 14, 12, 11, 10, + /* Size 32x8 */ + 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 31, 30, 30, 29, + 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, + 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 23, 23, 23, 23, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, 18, 19, + 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, + 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, + 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, + 11, 10, 10, 10 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 30, 19, 14, 30, 21, 16, 13, 19, 16, 11, 9, 14, 13, 9, 7, + /* Size 8x8 */ + 32, 32, 30, 26, 20, 17, 13, 12, 32, 31, 29, 26, 21, 17, 14, 13, 30, 29, + 26, 22, 19, 16, 14, 13, 26, 26, 22, 18, 16, 14, 12, 11, 20, 21, 19, 16, + 13, 11, 10, 10, 17, 17, 16, 14, 11, 10, 9, 8, 13, 14, 14, 12, 10, 9, 8, + 7, 12, 13, 13, 11, 10, 8, 7, 7, + /* Size 16x16 */ + 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 
17, 16, 14, 13, 12, 11, 33, 32, + 32, 32, 31, 29, 27, 24, 22, 20, 18, 16, 15, 13, 13, 12, 33, 32, 32, 31, + 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 32, 32, 31, 30, 28, 28, + 26, 24, 23, 21, 19, 17, 16, 14, 14, 13, 31, 31, 30, 28, 27, 24, 23, 22, + 20, 19, 18, 16, 15, 14, 13, 13, 28, 29, 29, 28, 24, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 12, 26, 27, 27, 26, 23, 20, 19, 18, 17, 16, 15, 14, + 13, 12, 12, 11, 23, 24, 25, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, + 11, 11, 21, 22, 23, 23, 20, 18, 17, 15, 14, 13, 13, 12, 11, 10, 10, 10, + 19, 20, 21, 21, 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 17, 18, + 19, 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 16, 17, 17, 16, + 15, 14, 13, 12, 11, 10, 10, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 13, 12, + 11, 10, 10, 9, 8, 8, 8, 7, 13, 13, 14, 14, 14, 13, 12, 11, 10, 10, 9, 8, + 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 11, + 12, 12, 13, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 32, 32, 31, 30, 28, 28, 26, 25, 23, 22, 21, 20, + 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 20, 18, 18, 17, + 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 29, 29, 27, 26, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 15, 14, + 13, 13, 13, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, + 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, + 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, + 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, + 32, 32, 31, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 23, 23, 21, 20, 19, + 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 13, 12, 32, 32, 32, 32, 31, 31, + 30, 29, 28, 28, 28, 27, 26, 26, 24, 23, 23, 21, 21, 19, 19, 18, 17, 16, + 16, 15, 14, 14, 14, 13, 13, 12, 32, 32, 32, 32, 31, 30, 29, 29, 28, 28, + 27, 27, 26, 25, 24, 23, 22, 21, 21, 19, 19, 18, 17, 16, 16, 15, 14, 14, + 14, 13, 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, + 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, + 30, 30, 30, 31, 30, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 21, 20, 19, + 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 28, 29, 29, 30, + 29, 28, 28, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 28, 29, 29, 30, 29, 28, 27, 27, + 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 11, 26, 27, 27, 28, 27, 26, 26, 26, 23, 23, 20, 20, + 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 25, 26, 26, 27, 26, 26, 26, 25, 23, 22, 20, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 15, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 23, 24, + 24, 25, 25, 24, 24, 24, 22, 22, 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, + 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 24, 24, 23, + 23, 23, 21, 21, 19, 19, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 10, 10, 10, 10, 21, 22, 22, 23, 23, 23, 23, 22, 20, 20, + 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, + 10, 10, 10, 10, 20, 20, 21, 21, 21, 21, 21, 21, 20, 19, 17, 17, 16, 16, + 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, + 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 18, 18, 19, 19, 19, + 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 
13, 13, 12, 12, 11, 11, 11, 10, + 10, 10, 9, 9, 9, 9, 9, 9, 9, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 9, 9, 16, 17, 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, + 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 16, 16, 16, 17, + 17, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 15, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, + 8, 8, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 12, 12, 12, + 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 8, 13, 13, 14, 14, 14, + 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 7, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 13, 13, + 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 12, + 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 14, 14, + 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, + 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, 11, 10, + 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 11, 12, 12, 12, 12, + 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, + 7, 7, 7, 7, 7, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, + 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, + /* Size 4x8 */ + 32, 29, 20, 13, 32, 28, 20, 14, 30, 24, 19, 14, 27, 20, 15, 12, 21, 17, + 13, 10, 17, 15, 11, 9, 14, 13, 10, 8, 13, 12, 9, 7, + /* Size 8x4 */ + 32, 32, 30, 27, 21, 17, 14, 13, 29, 28, 24, 20, 17, 15, 13, 12, 20, 20, + 19, 15, 13, 11, 10, 9, 13, 14, 14, 12, 10, 9, 8, 7, + /* Size 8x16 */ + 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32, + 30, 27, 22, 17, 14, 13, 32, 31, 28, 26, 21, 18, 15, 13, 31, 30, 27, 23, + 20, 17, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 26, 27, 23, 19, 16, 14, + 12, 12, 23, 25, 22, 17, 15, 13, 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, + 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 10, 9, 9, 16, 17, 16, + 14, 11, 10, 9, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, 14, 12, 10, 9, 8, + 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, 13, 11, 10, 9, 7, 7, + /* Size 16x8 */ + 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 11, 33, 32, + 32, 31, 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 31, 31, 30, 28, + 27, 24, 23, 22, 20, 19, 18, 16, 15, 14, 13, 13, 26, 26, 27, 26, 23, 20, + 19, 17, 17, 16, 15, 14, 13, 12, 11, 11, 20, 21, 22, 21, 20, 18, 16, 15, + 14, 13, 12, 11, 11, 10, 10, 10, 16, 17, 17, 18, 17, 15, 14, 13, 12, 11, + 10, 10, 9, 9, 8, 9, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, + 7, 7, 12, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, + /* Size 16x32 */ + 32, 33, 33, 32, 31, 28, 26, 23, 20, 19, 16, 16, 13, 13, 12, 11, 33, 32, + 32, 32, 31, 29, 26, 24, 21, 20, 17, 16, 14, 13, 12, 12, 33, 32, 32, 32, + 31, 29, 26, 24, 21, 20, 17, 17, 14, 13, 12, 12, 33, 32, 32, 31, 31, 30, + 27, 25, 22, 21, 17, 17, 14, 14, 13, 13, 33, 32, 32, 31, 30, 29, 27, 25, + 22, 21, 17, 17, 14, 14, 13, 13, 32, 32, 31, 30, 29, 28, 26, 24, 21, 20, + 17, 17, 14, 14, 13, 13, 32, 32, 31, 29, 28, 28, 26, 24, 21, 21, 18, 17, + 15, 14, 13, 13, 32, 31, 31, 29, 28, 27, 25, 24, 21, 21, 18, 17, 15, 15, + 14, 13, 31, 31, 30, 28, 27, 25, 23, 22, 20, 19, 17, 16, 14, 14, 13, 13, + 30, 30, 30, 28, 26, 24, 23, 21, 19, 19, 16, 16, 14, 14, 13, 12, 28, 30, + 29, 27, 24, 21, 20, 19, 18, 
17, 15, 15, 13, 13, 12, 12, 28, 29, 29, 27, + 24, 21, 20, 19, 17, 17, 15, 15, 13, 13, 12, 12, 26, 28, 27, 26, 23, 20, + 19, 18, 16, 16, 14, 14, 12, 12, 12, 12, 26, 27, 26, 25, 23, 20, 18, 17, + 16, 15, 14, 13, 12, 12, 11, 11, 23, 25, 25, 24, 22, 19, 17, 16, 15, 14, + 13, 13, 11, 11, 11, 11, 22, 24, 24, 23, 21, 19, 17, 16, 14, 14, 12, 12, + 11, 11, 11, 10, 21, 23, 23, 22, 20, 18, 17, 15, 14, 13, 12, 12, 11, 10, + 10, 10, 20, 21, 21, 21, 20, 17, 16, 15, 13, 13, 11, 11, 10, 10, 10, 10, + 19, 21, 21, 20, 19, 17, 16, 14, 13, 12, 11, 11, 10, 10, 9, 10, 18, 19, + 19, 19, 18, 16, 15, 14, 12, 12, 11, 10, 9, 9, 9, 9, 18, 19, 19, 19, 18, + 16, 15, 14, 12, 12, 10, 10, 9, 9, 9, 9, 16, 17, 17, 18, 17, 15, 14, 13, + 12, 11, 10, 10, 9, 9, 8, 8, 16, 17, 17, 17, 16, 15, 14, 13, 11, 11, 10, + 10, 9, 8, 8, 8, 14, 16, 16, 16, 15, 14, 13, 12, 11, 11, 9, 9, 8, 8, 8, + 8, 14, 15, 15, 16, 15, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 13, 14, 14, + 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 13, 14, 14, 14, 14, 13, + 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, 14, 13, 12, 11, 10, + 10, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 7, + 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 7, 11, 12, + 12, 13, 13, 12, 11, 10, 10, 9, 9, 8, 7, 7, 7, 7, 11, 12, 12, 13, 13, 11, + 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20, + 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 29, 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, + 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, + 14, 14, 13, 13, 12, 12, 32, 32, 32, 31, 31, 30, 29, 29, 28, 28, 27, 27, + 26, 25, 24, 23, 22, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, + 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, 22, 21, + 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 28, 29, + 29, 30, 29, 28, 28, 27, 25, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 26, 26, 26, 27, 27, 26, + 26, 25, 23, 23, 20, 20, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 12, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 22, 21, + 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, + 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, + 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13, + 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 16, 17, 17, 17, 17, + 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 10, + 9, 9, 9, 9, 8, 8, 8, 9, 9, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, + 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, + 8, 13, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, + 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 8, 13, 13, 13, 14, 14, 14, + 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, + 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, + 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, + 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, + 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, + /* Size 4x16 */ + 33, 28, 19, 13, 32, 29, 20, 13, 32, 29, 21, 14, 32, 28, 21, 14, 31, 25, + 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, 23, 18, 13, 10, + 21, 
17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 8, 15, 14, 10, 8, 14, 13, 10, + 7, 13, 12, 9, 7, 12, 12, 9, 7, + /* Size 16x4 */ + 33, 32, 32, 32, 31, 30, 28, 25, 23, 21, 19, 17, 15, 14, 13, 12, 28, 29, + 29, 28, 25, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 19, 20, 21, 21, + 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 13, 13, 14, 14, 14, 13, + 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, + /* Size 8x32 */ + 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32, + 31, 26, 21, 17, 14, 12, 33, 32, 31, 27, 22, 17, 14, 13, 33, 32, 30, 27, + 22, 17, 14, 13, 32, 31, 29, 26, 21, 17, 14, 13, 32, 31, 28, 26, 21, 18, + 15, 13, 32, 31, 28, 25, 21, 18, 15, 14, 31, 30, 27, 23, 20, 17, 14, 13, + 30, 30, 26, 23, 19, 16, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 28, 29, + 24, 20, 17, 15, 13, 12, 26, 27, 23, 19, 16, 14, 12, 12, 26, 26, 23, 18, + 16, 14, 12, 11, 23, 25, 22, 17, 15, 13, 11, 11, 22, 24, 21, 17, 14, 12, + 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, 20, 21, 20, 16, 13, 11, 10, 10, + 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 11, 9, 9, 18, 19, 18, + 15, 12, 10, 9, 9, 16, 17, 17, 14, 12, 10, 9, 8, 16, 17, 16, 14, 11, 10, + 9, 8, 14, 16, 15, 13, 11, 9, 8, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, + 14, 12, 10, 9, 8, 7, 13, 14, 14, 12, 10, 9, 8, 7, 12, 14, 14, 12, 10, 8, + 8, 7, 12, 13, 13, 11, 10, 8, 7, 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, + 13, 11, 10, 9, 7, 7, 11, 12, 13, 11, 10, 9, 8, 7, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20, + 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, + 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, + 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 31, 31, 31, 31, 30, 29, 28, 28, + 27, 26, 24, 24, 23, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, + 14, 14, 13, 13, 13, 13, 26, 26, 26, 27, 27, 26, 26, 25, 23, 23, 20, 20, + 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 11, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, 15, 14, + 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 16, 17, + 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, + 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 9, 9, 13, 14, 14, 14, 14, 14, 15, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, + 7, 7, 7, 8, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 22, 21, 18, 22, 19, 19, 17, 21, 19, 15, 13, 18, 17, 13, 11, + /* Size 8x8 */ + 33, 30, 24, 22, 21, 19, 17, 16, 30, 26, 23, 22, 22, 20, 18, 17, 24, 23, + 21, 21, 20, 19, 18, 17, 22, 22, 21, 19, 18, 17, 16, 16, 21, 22, 20, 18, + 16, 15, 14, 14, 19, 20, 19, 17, 15, 13, 12, 12, 17, 18, 18, 16, 14, 12, + 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, + /* Size 16x16 */ + 32, 33, 33, 29, 26, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 33, 33, + 32, 28, 25, 22, 22, 22, 21, 21, 20, 19, 18, 17, 17, 16, 33, 32, 30, 26, + 24, 22, 22, 23, 22, 22, 21, 20, 19, 18, 17, 17, 29, 28, 26, 23, 22, 22, + 22, 23, 22, 22, 21, 20, 19, 18, 18, 17, 26, 25, 24, 22, 21, 20, 21, 21, + 21, 21, 20, 19, 19, 18, 17, 17, 21, 22, 22, 22, 20, 19, 19, 19, 19, 19, + 19, 18, 17, 17, 17, 17, 21, 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, + 17, 16, 16, 16, 21, 22, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, + 15, 15, 20, 21, 22, 22, 21, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 20, 21, 22, 22, 21, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 19, 20, + 21, 21, 20, 19, 18, 17, 16, 15, 14, 14, 
13, 13, 13, 13, 18, 19, 20, 20, + 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 17, 18, 19, 19, 19, 17, + 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 16, 17, 18, 18, 18, 17, 16, 15, + 14, 14, 13, 12, 12, 11, 11, 11, 16, 17, 17, 18, 17, 17, 16, 15, 14, 13, + 13, 12, 12, 11, 11, 11, 15, 16, 17, 17, 17, 17, 16, 15, 14, 13, 13, 12, + 12, 11, 11, 10, + /* Size 32x32 */ + 32, 33, 33, 34, 33, 31, 29, 28, 26, 25, 21, 21, 21, 21, 21, 20, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 33, + 32, 30, 28, 27, 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 33, 33, 33, 33, 32, 29, 28, 26, + 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, + 17, 17, 17, 16, 16, 16, 34, 33, 33, 32, 31, 29, 27, 26, 24, 24, 22, 22, + 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, + 17, 17, 33, 32, 32, 31, 30, 28, 26, 25, 24, 24, 22, 22, 22, 23, 23, 22, + 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 31, 30, + 29, 29, 28, 26, 25, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, + 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 29, 28, 28, 27, 26, 25, + 23, 22, 22, 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 19, 19, 18, 18, 18, 18, 17, 17, 28, 27, 26, 26, 25, 24, 22, 22, 22, 22, + 21, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, + 18, 18, 18, 18, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 21, 21, 21, + 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, + 25, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, + 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, + 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, + 17, 17, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19, + 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, + 16, 16, 21, 22, 22, 23, 23, 22, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, + 22, 23, 23, 23, 23, 23, 21, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, + 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 20, 22, 22, 23, 22, 22, + 22, 22, 21, 21, 19, 19, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, + 14, 14, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 20, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 14, 19, 20, 20, 21, + 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, + 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, + 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, + 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, + 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, + 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 17, 18, + 18, 19, 19, 19, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, + 19, 
19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, + 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, + 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, + 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 16, 16, 17, 17, + 17, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, + 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, + 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + 10, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, + /* Size 4x8 */ + 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 23, 19, 18, 16, 22, 19, + 16, 14, 20, 18, 15, 12, 18, 17, 14, 11, 17, 16, 13, 11, + /* Size 8x4 */ + 33, 28, 24, 23, 22, 20, 18, 17, 22, 22, 20, 19, 19, 18, 17, 16, 20, 22, + 20, 18, 16, 15, 14, 13, 17, 18, 18, 16, 14, 12, 11, 11, + /* Size 8x16 */ + 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 29, + 24, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, + 21, 20, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, 21, 19, 18, 17, + 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, + 20, 21, 20, 18, 16, 14, 14, 13, 19, 20, 20, 17, 15, 14, 13, 13, 18, 20, + 19, 17, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 16, 18, 18, 16, + 14, 12, 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, + 12, 11, + /* Size 16x8 */ + 32, 33, 33, 29, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 32, 31, + 29, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 17, 26, 25, 24, 22, + 21, 20, 21, 21, 21, 20, 20, 19, 18, 18, 17, 17, 21, 22, 22, 22, 21, 19, + 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 20, 21, 22, 22, 21, 19, 18, 17, + 16, 16, 15, 15, 14, 14, 14, 14, 18, 19, 20, 20, 20, 18, 17, 16, 15, 14, + 14, 13, 13, 12, 12, 13, 16, 17, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, + 12, 12, 11, 12, 15, 16, 17, 18, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, + 11, 11, + /* Size 16x32 */ + 32, 33, 32, 28, 26, 21, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 33, 33, + 31, 27, 25, 22, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 33, 33, 31, 27, + 25, 22, 22, 22, 21, 21, 19, 19, 17, 17, 16, 16, 34, 32, 31, 26, 24, 22, + 23, 23, 22, 21, 20, 20, 18, 18, 17, 17, 33, 31, 29, 25, 24, 22, 22, 23, + 22, 21, 20, 20, 18, 18, 17, 17, 31, 28, 28, 24, 23, 22, 22, 22, 22, 22, + 20, 20, 18, 18, 17, 17, 29, 27, 26, 23, 22, 22, 22, 23, 22, 22, 20, 20, + 19, 18, 18, 17, 28, 26, 25, 22, 22, 22, 22, 23, 22, 22, 20, 20, 19, 19, + 18, 18, 25, 24, 24, 22, 21, 21, 21, 21, 21, 20, 20, 19, 18, 18, 17, 18, + 24, 24, 24, 22, 21, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 21, 22, + 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 22, 21, + 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 21, 22, 22, 22, 21, 19, + 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 21, 23, 22, 22, 21, 19, 19, 18, + 18, 18, 17, 17, 16, 16, 16, 15, 21, 23, 23, 22, 21, 19, 18, 18, 17, 17, + 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 21, 19, 18, 17, 17, 17, 16, 16, + 15, 15, 15, 15, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, + 14, 14, 20, 
22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, + 20, 21, 21, 22, 20, 19, 18, 17, 16, 16, 14, 14, 14, 14, 13, 14, 19, 20, + 21, 21, 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 19, 20, 20, 21, + 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 18, 20, 20, 20, 20, 18, + 17, 16, 15, 15, 13, 13, 12, 12, 12, 12, 18, 20, 20, 20, 19, 18, 17, 16, + 15, 14, 13, 13, 12, 12, 12, 12, 17, 19, 19, 20, 19, 18, 17, 16, 14, 14, + 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 16, 14, 14, 13, 13, + 12, 12, 12, 12, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, + 11, 11, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, 11, 11, + 16, 17, 18, 18, 18, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, 11, 16, 17, + 17, 18, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, + 17, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, + 16, 14, 14, 13, 13, 12, 12, 11, 11, 11, 15, 17, 17, 17, 17, 16, 16, 14, + 14, 13, 13, 12, 12, 11, 11, 10, + /* Size 32x16 */ + 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 32, + 31, 28, 27, 26, 24, 24, 22, 22, 22, 23, 23, 22, 22, 22, 21, 20, 20, 20, + 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 32, 31, 31, 31, 29, 28, 26, 25, + 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, + 18, 18, 17, 17, 17, 17, 28, 27, 27, 26, 25, 24, 23, 22, 22, 22, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, + 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 21, 22, 22, 23, 22, 22, + 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, + 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 22, 23, 23, 21, 21, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, + 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 20, 20, 21, 21, 21, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 18, 19, 19, 20, + 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, + 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, + 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, 17, 17, + 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, + 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, + 16, 17, 17, 17, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, + 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, + 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 11, 10, + /* Size 4x16 */ + 33, 21, 20, 16, 33, 22, 21, 17, 31, 22, 21, 18, 27, 22, 22, 18, 24, 21, + 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, 22, 19, 16, 14, + 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 14, 12, 18, 17, 14, 12, 18, 17, + 14, 11, 17, 17, 13, 11, 17, 16, 13, 11, + /* Size 16x4 */ + 33, 33, 31, 27, 24, 22, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 21, 22, + 22, 22, 21, 19, 19, 19, 19, 19, 19, 18, 17, 17, 17, 16, 20, 21, 21, 22, 
+ 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 16, 17, 18, 18, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 11, 11, 11, + /* Size 8x32 */ + 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 31, + 25, 22, 21, 19, 17, 16, 34, 31, 24, 23, 22, 20, 18, 17, 33, 29, 24, 22, + 22, 20, 18, 17, 31, 28, 23, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, + 19, 18, 28, 25, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, 21, 20, 18, 17, + 24, 24, 21, 21, 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, + 20, 19, 19, 18, 17, 16, 21, 22, 21, 19, 18, 17, 16, 16, 21, 22, 21, 19, + 18, 17, 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 21, 22, 21, 18, 17, 16, + 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, 20, 22, 21, 18, 16, 15, 14, 14, + 20, 21, 20, 18, 16, 14, 14, 13, 19, 21, 20, 17, 15, 14, 13, 13, 19, 20, + 20, 17, 15, 14, 13, 13, 18, 20, 20, 17, 15, 13, 12, 12, 18, 20, 19, 17, + 15, 13, 12, 12, 17, 19, 19, 17, 14, 13, 12, 12, 17, 19, 18, 16, 14, 13, + 12, 12, 16, 18, 18, 16, 14, 12, 12, 11, 16, 18, 18, 16, 14, 12, 12, 11, + 16, 18, 18, 16, 14, 12, 11, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, + 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, 12, 11, 15, 17, 17, 16, + 14, 13, 12, 11, + /* Size 32x8 */ + 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 32, 31, 31, 31, + 29, 28, 26, 25, 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, + 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 26, 25, 25, 24, 24, 23, 22, 22, + 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, + 18, 18, 17, 17, 17, 17, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19, + 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, + 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 18, 19, + 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, + 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 16, 17, 17, 18, 18, 18, + 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 12, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, + 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, + 11, 11, 11, 11 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 30, 21, 14, 30, 21, 17, 13, 21, 17, 12, 10, 14, 13, 10, 8, + /* Size 8x8 */ + 32, 32, 30, 27, 22, 18, 15, 13, 32, 31, 29, 26, 23, 19, 16, 14, 30, 29, + 26, 23, 20, 18, 15, 13, 27, 26, 23, 19, 17, 15, 13, 12, 22, 23, 20, 17, + 14, 13, 11, 10, 18, 19, 18, 15, 13, 11, 10, 9, 15, 16, 15, 13, 11, 10, + 9, 8, 13, 14, 13, 12, 10, 9, 8, 7, + /* Size 16x16 */ + 32, 33, 33, 33, 32, 30, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 33, 32, + 32, 32, 32, 30, 29, 27, 24, 22, 20, 18, 17, 15, 13, 13, 33, 32, 32, 32, + 32, 31, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 33, 32, 32, 31, 30, 29, + 28, 26, 24, 23, 20, 19, 17, 16, 14, 14, 32, 32, 32, 30, 29, 28, 27, 26, + 24, 22, 21, 19, 18, 16, 15, 14, 30, 30, 31, 29, 28, 26, 24, 23, 22, 20, + 19, 18, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 19, 18, 17, 16, 15, 14, 13, + 12, 12, 23, 24, 25, 24, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, 11, + 21, 22, 23, 23, 22, 20, 18, 17, 15, 14, 13, 13, 12, 11, 11, 10, 19, 20, + 21, 20, 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 17, 18, 19, 19, + 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 16, 17, 17, 17, 18, 16, + 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 14, 15, 16, 16, 16, 15, 14, 13, 12, + 11, 
11, 10, 9, 9, 8, 8, 13, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, + 9, 8, 8, 7, 12, 13, 14, 14, 14, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, + 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18, + 18, 17, 17, 15, 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18, 18, 17, 17, 15, + 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, + 14, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, + 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, + 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, + 20, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 31, + 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, 20, 19, 19, 17, + 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, + 28, 27, 27, 26, 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, + 15, 14, 14, 14, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, + 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, + 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, + 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 30, 30, 30, 31, + 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, 20, 19, 19, 18, + 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 28, 29, 29, 30, 30, 28, 28, 27, + 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, + 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, + 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 13, 12, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 26, 27, + 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, 18, 17, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 23, 24, 24, 25, 25, 24, + 24, 24, 24, 22, 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 12, 12, 11, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, + 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, + 17, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, + 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, 17, 15, 15, 14, + 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 19, 20, 20, 21, + 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 20, 21, + 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, + 11, 10, 10, 10, 10, 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, + 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 16, 17, 17, 17, + 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, + 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, + 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, + 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 14, 15, 
15, 16, + 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, + 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, + 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, + 7, 7, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 14, + 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, + 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 12, + 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, + 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, + /* Size 4x8 */ + 32, 29, 20, 14, 32, 28, 20, 14, 30, 24, 19, 14, 28, 20, 16, 12, 23, 18, + 13, 11, 19, 16, 12, 9, 16, 14, 11, 8, 14, 13, 10, 8, + /* Size 8x4 */ + 32, 32, 30, 28, 23, 19, 16, 14, 29, 28, 24, 20, 18, 16, 14, 13, 20, 20, + 19, 16, 13, 12, 11, 10, 14, 14, 14, 12, 11, 9, 8, 8, + /* Size 8x16 */ + 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32, + 31, 30, 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 31, 29, 27, + 24, 21, 18, 15, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, 27, 21, 19, 17, + 15, 13, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, 13, 11, + 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 18, 19, + 19, 16, 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 14, 16, 16, 14, + 12, 11, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, 11, 10, 8, + 8, + /* Size 16x8 */ + 32, 33, 33, 32, 32, 30, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 33, 32, + 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 32, 32, 31, 30, + 29, 28, 27, 26, 24, 22, 20, 19, 18, 16, 15, 14, 28, 29, 30, 28, 27, 24, + 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 23, 24, 25, 24, 24, 21, 19, 18, + 16, 15, 14, 14, 13, 12, 11, 11, 19, 20, 21, 20, 21, 19, 17, 16, 14, 13, + 12, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 16, 15, 14, 13, 12, 11, 10, + 10, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, 8, + /* Size 16x32 */ + 32, 33, 33, 32, 32, 28, 28, 23, 23, 19, 19, 16, 16, 13, 13, 12, 33, 32, + 32, 32, 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 32, + 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 31, 31, 30, + 30, 25, 25, 21, 21, 17, 17, 14, 14, 13, 33, 32, 32, 31, 31, 30, 30, 25, + 25, 21, 21, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20, + 20, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20, 20, 17, + 17, 14, 14, 13, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, + 15, 14, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, 15, 14, + 30, 30, 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 30, 30, + 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 28, 30, 30, 27, + 27, 21, 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 28, 30, 30, 27, 27, 21, + 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 26, 28, 28, 26, 26, 20, 20, 18, + 18, 16, 16, 14, 14, 12, 12, 12, 26, 28, 28, 26, 26, 20, 20, 18, 18, 16, + 16, 14, 14, 12, 12, 12, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, + 13, 11, 11, 11, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, 13, 11, + 11, 11, 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, + 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, 19, 21, + 21, 20, 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 19, 21, 21, 20, + 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 18, 19, 19, 19, 19, 16, + 16, 14, 14, 12, 12, 10, 10, 9, 9, 9, 18, 19, 19, 
19, 19, 16, 16, 14, 14, + 12, 12, 10, 10, 9, 9, 9, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, + 10, 9, 9, 8, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, 10, 9, 9, + 8, 14, 16, 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 14, 16, + 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 13, 14, 14, 15, 15, + 13, 13, 11, 11, 10, 10, 9, 9, 8, 8, 7, 13, 14, 14, 15, 15, 13, 13, 11, + 11, 10, 10, 9, 9, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, + 8, 8, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, 8, 8, 8, 8, + 7, 12, 13, 13, 13, 13, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 7, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, + 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, + 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, + 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, + 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, + 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, + 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 28, 29, + 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, + 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, + 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, + 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, + 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, + 19, 20, 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, + 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, + 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17, 17, 18, + 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, + 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, + 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 13, + 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, + 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 14, 14, 14, 15, + 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, + 8, 8, 8, 7, 12, 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 12, 12, + 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, + /* Size 4x16 */ + 33, 28, 19, 13, 32, 29, 20, 14, 32, 30, 21, 14, 32, 28, 20, 14, 31, 27, + 21, 15, 30, 24, 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, + 23, 18, 13, 11, 21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 9, 16, 14, + 11, 8, 14, 13, 10, 8, 14, 13, 10, 8, + /* Size 16x4 */ + 33, 32, 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 28, 29, + 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 19, 20, 21, 20, + 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 13, 14, 14, 14, 15, 14, + 13, 12, 11, 11, 10, 9, 9, 8, 8, 8, + /* Size 8x32 */ + 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32, + 32, 29, 24, 20, 17, 14, 33, 32, 31, 30, 25, 21, 17, 14, 33, 32, 31, 30, + 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 32, 30, 28, 24, 20, + 17, 14, 32, 31, 29, 27, 24, 21, 18, 15, 32, 31, 29, 27, 24, 21, 18, 15, + 30, 30, 28, 
24, 21, 19, 16, 14, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, + 27, 21, 19, 17, 15, 13, 28, 30, 27, 21, 19, 17, 15, 13, 26, 28, 26, 20, + 18, 16, 14, 12, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, + 13, 11, 23, 25, 24, 19, 16, 14, 13, 11, 21, 23, 22, 18, 15, 13, 12, 11, + 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 19, 21, + 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 10, 9, 18, 19, 19, 16, + 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 16, 17, 18, 15, 13, 11, + 10, 9, 14, 16, 16, 14, 12, 11, 9, 8, 14, 16, 16, 14, 12, 11, 9, 8, 13, + 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, + 11, 10, 8, 8, 12, 14, 14, 13, 11, 10, 8, 8, 12, 13, 13, 12, 11, 9, 8, 7, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21, + 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, + 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 28, 28, 27, 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 14, 13, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, + 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 16, + 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 19, 20, + 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17, + 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, + 10, 9, 9, 9, 9, 8, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, + 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7 }, + { /* Chroma */ + /* Size 4x4 */ + 32, 22, 22, 18, 22, 19, 19, 17, 22, 19, 16, 14, 18, 17, 14, 12, + /* Size 8x8 */ + 33, 30, 24, 22, 21, 20, 18, 17, 30, 26, 23, 22, 22, 21, 19, 18, 24, 23, + 21, 21, 20, 20, 19, 18, 22, 22, 21, 19, 18, 18, 17, 16, 21, 22, 20, 18, + 17, 16, 15, 14, 20, 21, 20, 18, 16, 14, 14, 13, 18, 19, 19, 17, 15, 14, + 12, 12, 17, 18, 18, 16, 14, 13, 12, 11, + /* Size 16x16 */ + 32, 33, 34, 31, 28, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33, + 33, 30, 27, 24, 22, 22, 22, 21, 20, 20, 19, 18, 17, 17, 34, 33, 32, 29, + 26, 24, 22, 23, 23, 22, 22, 21, 20, 19, 18, 18, 31, 30, 29, 26, 24, 23, + 22, 22, 23, 22, 22, 21, 20, 19, 18, 18, 28, 27, 26, 24, 22, 22, 21, 22, + 23, 22, 22, 21, 20, 20, 19, 19, 25, 24, 24, 23, 22, 21, 20, 21, 21, 20, + 20, 20, 19, 19, 18, 18, 21, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, + 18, 18, 17, 17, 21, 22, 23, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, 17, + 16, 16, 21, 22, 23, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, + 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 20, + 22, 22, 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 19, 20, 21, 21, + 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 18, 19, 20, 20, 20, 19, + 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 17, 18, 19, 19, 20, 19, 18, 17, + 16, 15, 14, 14, 13, 12, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, + 14, 13, 12, 12, 12, 11, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, + 12, 12, 11, 11, + /* Size 32x32 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 33, 33, 33, 33, + 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 33, 33, 33, 33, 33, 30, 30, 27, + 27, 24, 
24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, + 22, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, + 18, 17, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, + 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, + 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, + 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, 30, 29, 29, 26, + 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, + 20, 19, 19, 18, 18, 18, 18, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, + 22, 21, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, + 19, 19, 19, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, + 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 18, + 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, + 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 25, 24, 24, 24, + 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, + 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, + 17, 17, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, + 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, + 23, 23, 23, 21, 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, + 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, + 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, + 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 22, + 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 22, 22, 22, 22, 22, + 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, + 14, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, + 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 18, 19, + 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, + 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, + 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, + 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, + 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, + 19, 18, 
18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, + 12, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, + 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, + /* Size 4x8 */ + 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 22, 19, 18, 16, 22, 19, + 16, 14, 20, 19, 15, 13, 19, 18, 14, 12, 17, 17, 14, 11, + /* Size 8x4 */ + 33, 28, 24, 22, 22, 20, 19, 17, 22, 22, 20, 19, 19, 19, 18, 17, 20, 22, + 20, 18, 16, 15, 14, 14, 17, 18, 18, 16, 14, 13, 12, 11, + /* Size 8x16 */ + 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 34, 32, + 26, 22, 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 28, 26, 22, 22, + 23, 22, 20, 19, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, 21, 19, 19, 19, + 18, 17, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, 16, 15, + 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 19, 20, + 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 17, 19, 20, 18, + 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, 18, 17, 15, 14, + 12, 11, + /* Size 16x8 */ + 32, 33, 34, 31, 28, 24, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33, + 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 28, 27, 26, 24, + 22, 22, 21, 22, 22, 22, 22, 21, 20, 20, 19, 18, 21, 22, 22, 22, 22, 20, + 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 23, 22, 23, 21, 19, 18, + 18, 17, 17, 17, 16, 16, 15, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 16, + 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, 20, 19, 18, 17, 16, 15, 14, 14, + 13, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12, + 12, 11, + /* Size 16x32 */ + 32, 33, 33, 28, 28, 21, 21, 21, 21, 20, 20, 18, 18, 16, 16, 16, 33, 33, + 33, 27, 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 33, 33, 33, 27, + 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 34, 32, 32, 26, 26, 22, + 22, 23, 23, 21, 21, 20, 20, 18, 18, 17, 34, 32, 32, 26, 26, 22, 22, 23, + 23, 21, 21, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, + 22, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, 22, 20, + 20, 18, 18, 17, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, + 19, 18, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, 19, 18, + 24, 24, 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 24, 24, + 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 21, 22, 22, 21, + 21, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 21, 19, + 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 19, 19, 18, + 18, 18, 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 22, 19, 19, 18, 18, 18, + 18, 17, 17, 16, 16, 16, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, + 16, 15, 15, 15, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 15, 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, 20, 21, + 21, 22, 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 20, 21, 21, 22, + 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 19, + 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 19, 20, 20, 21, 21, 19, 19, 17, + 17, 15, 15, 14, 14, 13, 13, 13, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, + 15, 13, 13, 12, 12, 12, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, 15, 13, + 13, 12, 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, + 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, + 16, 18, 18, 19, 
19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 18, + 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 17, 17, 18, + 18, 17, 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 17, + 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 16, 16, 15, + 15, 13, 13, 12, 12, 11, 11, 11, + /* Size 32x16 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32, + 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 33, 33, 33, 32, 32, 28, 28, 26, + 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 19, + 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, + 18, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, + 21, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 15, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, + 20, 20, 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 21, + 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, + 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, + 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, + 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, + 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 16, 17, 17, 17, + 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, + 12, 12, 12, 11, 11, 11, 11, 11, + /* Size 4x16 */ + 33, 21, 20, 16, 33, 22, 20, 17, 32, 22, 21, 18, 28, 22, 22, 18, 26, 22, + 22, 19, 24, 20, 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, + 22, 19, 16, 14, 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 15, 12, 19, 18, + 14, 12, 18, 17, 14, 12, 17, 17, 14, 11, + /* Size 16x4 */ + 33, 33, 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 21, 22, + 22, 22, 22, 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 20, 20, 21, 22, + 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 16, 17, 18, 18, 19, 18, + 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, + /* Size 8x32 */ + 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 33, 33, + 27, 22, 22, 20, 19, 17, 34, 32, 26, 22, 23, 21, 20, 18, 34, 32, 26, 22, + 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 31, 28, 24, 22, 22, 22, + 20, 18, 28, 26, 22, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 20, 19, + 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, + 21, 19, 19, 19, 18, 17, 21, 22, 21, 19, 19, 19, 18, 17, 21, 22, 22, 19, + 18, 18, 17, 16, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, + 16, 15, 21, 23, 
22, 19, 18, 17, 16, 15, 20, 22, 22, 19, 17, 16, 15, 14, + 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 20, 21, + 22, 19, 17, 16, 14, 14, 19, 20, 21, 19, 17, 15, 14, 13, 19, 20, 21, 19, + 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 18, 20, 20, 18, 16, 15, + 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, + 16, 18, 19, 17, 15, 14, 12, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, + 18, 17, 15, 14, 12, 11, 16, 17, 18, 17, 15, 14, 12, 11, 16, 17, 18, 16, + 15, 13, 12, 11, + /* Size 32x8 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32, + 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, + 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 20, 19, 19, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, + 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 20, 20, + 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, + 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, + 12, 11, 11, 11 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 31, 23, 17, 31, 26, 20, 16, 23, 20, 14, 12, 17, 16, 12, 9, + /* Size 8x8 */ + 33, 32, 32, 29, 24, 20, 17, 15, 32, 32, 31, 29, 25, 21, 18, 16, 32, 31, + 29, 27, 24, 21, 18, 16, 29, 29, 27, 21, 19, 17, 16, 14, 24, 25, 24, 19, + 16, 14, 13, 12, 20, 21, 21, 17, 14, 13, 12, 11, 17, 18, 18, 16, 13, 12, + 10, 9, 15, 16, 16, 14, 12, 11, 9, 9, + /* Size 16x16 */ + 32, 33, 33, 33, 32, 30, 29, 27, 25, 23, 21, 19, 17, 16, 14, 13, 33, 32, + 32, 32, 32, 30, 29, 28, 26, 24, 22, 20, 18, 17, 15, 13, 33, 32, 32, 32, + 32, 31, 30, 28, 27, 25, 23, 21, 19, 17, 16, 14, 33, 32, 32, 31, 30, 29, + 28, 27, 26, 24, 23, 20, 19, 17, 16, 14, 32, 32, 32, 30, 29, 28, 27, 26, + 25, 24, 22, 21, 19, 18, 16, 15, 30, 30, 31, 29, 28, 26, 24, 23, 22, 21, + 20, 19, 18, 16, 15, 14, 29, 29, 30, 28, 27, 24, 22, 21, 20, 19, 19, 17, + 17, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 19, 18, 17, 16, 15, 14, + 13, 12, 25, 26, 27, 26, 25, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 12, + 23, 24, 25, 24, 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 21, 22, + 23, 23, 22, 20, 19, 17, 16, 15, 14, 13, 13, 12, 11, 11, 19, 20, 21, 20, + 21, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 17, 18, 19, 19, 19, 18, + 17, 15, 14, 13, 13, 12, 11, 10, 10, 9, 16, 17, 17, 17, 18, 16, 15, 14, + 14, 13, 12, 11, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, + 11, 11, 10, 9, 9, 8, 13, 13, 14, 14, 15, 14, 13, 12, 12, 11, 11, 10, 9, + 9, 8, 8, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, + 23, 21, 21, 19, 19, 18, 17, 17, 16, 15, 14, 14, 13, 13, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, + 20, 19, 18, 17, 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, 20, 19, 18, 17, + 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 30, 30, 28, 27, 26, 25, 24, 23, 23, 21, 20, 19, 
19, 18, 17, 17, 16, 16, + 14, 14, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 28, 28, + 27, 25, 25, 23, 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 28, 27, 26, 25, 24, 23, + 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 31, + 31, 31, 30, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 20, 20, + 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 21, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 27, 27, + 26, 26, 25, 24, 24, 22, 22, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 15, + 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 25, 25, 24, + 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, 30, 31, + 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, + 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 30, 30, 30, 31, 31, 30, 29, 29, + 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 29, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, + 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, 15, 15, 14, 14, + 13, 13, 28, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, 22, 21, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, + 28, 28, 28, 28, 27, 27, 26, 26, 23, 23, 21, 20, 20, 20, 19, 18, 18, 17, + 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 26, 27, 27, 27, 28, 27, + 26, 26, 26, 25, 23, 23, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, + 15, 14, 14, 14, 13, 13, 12, 12, 25, 26, 26, 26, 27, 26, 26, 26, 25, 25, + 22, 22, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, + 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 22, 22, 20, 19, + 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, + 23, 24, 24, 24, 25, 24, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 21, 22, 22, 23, + 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, + 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 23, + 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, + 12, 12, 11, 11, 11, 11, 19, 20, 20, 21, 21, 21, 21, 21, 21, 21, 19, 19, + 18, 17, 17, 16, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, + 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, + 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 18, 19, + 19, 19, 20, 20, 20, 20, 20, 20, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13, + 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 17, 18, 18, 19, 19, 19, + 19, 19, 19, 19, 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, + 11, 11, 10, 10, 10, 10, 9, 9, 17, 17, 17, 18, 18, 18, 18, 18, 19, 18, + 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, + 10, 9, 9, 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, + 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 15, 16, + 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, + 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16, + 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, + 10, 9, 9, 9, 9, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 16, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, + 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, + 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 13, 13, 
13, 14, 14, 14, + 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, + 9, 9, 9, 9, 8, 8, 8, 8, + /* Size 4x8 */ + 32, 30, 24, 17, 32, 30, 24, 17, 31, 28, 23, 18, 29, 24, 19, 15, 25, 21, + 16, 13, 21, 19, 14, 11, 18, 17, 13, 10, 16, 15, 12, 9, + /* Size 8x4 */ + 32, 32, 31, 29, 25, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 24, 24, + 23, 19, 16, 14, 13, 12, 17, 17, 18, 15, 13, 11, 10, 9, + /* Size 8x16 */ + 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32, + 31, 30, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, 18, 16, 32, 31, 29, 27, + 24, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, + 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 25, 26, 25, 20, 17, 15, 14, 13, + 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, + 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 11, 10, 16, 17, 18, 15, + 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 13, 14, 15, 13, 11, 10, 9, + 8, + /* Size 16x8 */ + 32, 33, 33, 32, 32, 30, 29, 27, 25, 23, 21, 19, 18, 16, 14, 13, 33, 32, + 32, 32, 31, 30, 30, 28, 26, 24, 23, 21, 19, 17, 16, 14, 32, 32, 31, 30, + 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 16, 15, 28, 29, 30, 28, 27, 24, + 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 23, 24, 25, 24, 24, 21, 20, 18, + 17, 16, 15, 14, 14, 13, 12, 11, 19, 20, 21, 20, 21, 19, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 10, 17, 17, 18, 18, 18, 17, 16, 15, 14, 13, 12, 11, + 11, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, 11, 10, 10, 9, 9, + 8, + /* Size 16x32 */ + 32, 33, 33, 32, 32, 30, 28, 27, 23, 23, 19, 19, 17, 16, 14, 13, 33, 32, + 32, 32, 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, + 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, 32, 31, + 29, 28, 25, 24, 20, 20, 18, 17, 15, 14, 33, 32, 32, 32, 31, 31, 30, 28, + 25, 25, 21, 21, 18, 17, 16, 14, 33, 32, 32, 31, 31, 30, 29, 28, 25, 24, + 21, 21, 18, 17, 16, 14, 32, 32, 32, 31, 30, 29, 28, 27, 24, 24, 20, 20, + 18, 17, 16, 14, 32, 32, 32, 30, 30, 29, 28, 27, 24, 24, 21, 21, 18, 17, + 16, 15, 32, 32, 31, 30, 29, 28, 27, 26, 24, 24, 21, 21, 18, 18, 16, 15, + 32, 31, 31, 30, 29, 28, 26, 26, 24, 23, 20, 20, 18, 18, 16, 15, 30, 30, + 30, 28, 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 30, 30, 30, 28, + 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 29, 30, 30, 28, 27, 24, + 22, 21, 20, 19, 17, 17, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 21, + 19, 19, 17, 17, 16, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 18, 18, + 16, 16, 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 20, 18, 18, 16, 16, + 14, 14, 13, 12, 25, 26, 26, 25, 25, 22, 20, 19, 17, 17, 15, 15, 14, 13, + 13, 12, 23, 25, 25, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, + 23, 24, 24, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, 21, 23, + 23, 22, 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 21, 23, 23, 22, + 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 19, 21, 21, 21, 21, 19, + 17, 17, 14, 14, 13, 13, 12, 11, 10, 10, 19, 20, 21, 20, 20, 19, 17, 16, + 14, 14, 12, 12, 11, 11, 10, 10, 18, 19, 20, 20, 20, 18, 17, 16, 14, 14, + 12, 12, 11, 11, 10, 9, 18, 19, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12, + 11, 10, 10, 9, 17, 18, 18, 18, 18, 17, 16, 15, 13, 13, 12, 12, 10, 10, + 9, 9, 16, 17, 17, 17, 18, 16, 15, 14, 13, 13, 11, 11, 10, 10, 9, 9, 15, + 17, 17, 17, 17, 16, 15, 14, 13, 12, 11, 11, 10, 10, 9, 9, 14, 16, 16, + 16, 16, 15, 14, 13, 12, 12, 11, 11, 9, 9, 9, 8, 14, 16, 16, 16, 16, 15, + 14, 13, 12, 12, 10, 10, 9, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, + 11, 10, 10, 9, 9, 8, 8, 13, 14, 14, 14, 15, 
14, 13, 12, 11, 11, 10, 10, + 9, 9, 8, 8, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, + 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 27, 26, 25, 24, 23, 23, 21, + 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21, 21, 20, 19, 18, + 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, + 28, 28, 27, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 17, 17, 16, 16, + 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, + 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, + 30, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 21, 21, 20, + 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 28, 29, 29, 29, 30, 29, + 28, 28, 27, 26, 24, 24, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, + 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, 28, 28, 28, 28, 27, 27, 26, 26, + 23, 23, 21, 21, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, + 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, + 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, + 23, 24, 24, 24, 25, 24, 24, 24, 24, 23, 21, 21, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 19, 20, 20, 20, + 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, + 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 17, 17, + 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, + 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, + 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, + 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, + 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15, + 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, + 8, 8, + /* Size 4x16 */ + 33, 30, 23, 16, 32, 30, 24, 17, 32, 31, 25, 17, 32, 29, 24, 17, 32, 28, + 24, 18, 30, 26, 21, 16, 30, 24, 19, 15, 28, 23, 18, 14, 26, 22, 17, 13, + 24, 21, 16, 13, 23, 20, 15, 12, 20, 19, 14, 11, 19, 18, 13, 10, 17, 16, + 13, 10, 16, 15, 12, 9, 14, 14, 11, 9, + /* Size 16x4 */ + 33, 32, 32, 32, 32, 30, 30, 28, 26, 24, 23, 20, 19, 17, 16, 14, 30, 30, + 31, 29, 28, 26, 24, 23, 22, 21, 20, 19, 18, 16, 15, 14, 23, 24, 25, 24, + 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 16, 17, 17, 17, 18, 16, + 15, 14, 13, 13, 12, 11, 10, 10, 9, 9, + /* Size 8x32 */ + 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32, + 32, 29, 24, 20, 17, 15, 33, 32, 32, 29, 25, 20, 18, 15, 33, 32, 31, 30, + 25, 21, 18, 16, 33, 32, 31, 29, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, + 18, 16, 32, 32, 30, 28, 24, 21, 18, 16, 32, 31, 29, 27, 24, 21, 18, 16, + 32, 31, 29, 26, 24, 20, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 30, 30, + 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, 16, 14, 28, 30, 27, 21, + 19, 17, 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 26, 28, 26, 20, 18, 16, + 14, 13, 25, 26, 25, 20, 17, 15, 14, 13, 23, 25, 24, 19, 16, 14, 13, 12, + 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 21, 23, + 22, 18, 15, 13, 12, 11, 19, 21, 21, 17, 14, 13, 12, 10, 19, 21, 20, 17, + 14, 12, 11, 10, 18, 20, 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, + 11, 10, 17, 18, 
18, 16, 13, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, + 15, 17, 17, 15, 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 14, 16, 16, + 14, 12, 10, 9, 9, 13, 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, + 9, 8, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23, + 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21, + 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, + 29, 29, 28, 28, 27, 27, 26, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, + 18, 17, 16, 16, 15, 15, 28, 29, 29, 29, 30, 29, 28, 28, 27, 26, 24, 24, + 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, + 13, 13, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, 18, 18, + 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 19, 20, + 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, + 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, + 18, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, 16, 16, 16, 16, 15, + 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, + 8, 8 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 24, 22, 19, 24, 21, 20, 19, 22, 20, 17, 15, 19, 19, 15, 13, + /* Size 8x8 */ + 33, 32, 27, 21, 22, 20, 19, 18, 32, 29, 24, 22, 23, 22, 20, 19, 27, 24, + 22, 21, 23, 22, 21, 20, 21, 22, 21, 19, 19, 19, 18, 18, 22, 23, 23, 19, + 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 15, 14, 19, 20, 21, 18, 16, 15, + 14, 13, 18, 19, 20, 18, 16, 14, 13, 12, + /* Size 16x16 */ + 32, 33, 34, 31, 28, 25, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33, + 33, 30, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 18, 17, 34, 33, 32, 29, + 26, 24, 23, 22, 23, 23, 22, 22, 21, 20, 19, 18, 31, 30, 29, 26, 24, 23, + 22, 22, 22, 23, 22, 22, 21, 20, 19, 18, 28, 27, 26, 24, 22, 22, 22, 22, + 22, 23, 22, 22, 21, 20, 20, 19, 25, 24, 24, 23, 22, 21, 20, 20, 21, 21, + 20, 20, 20, 19, 19, 18, 22, 22, 23, 22, 22, 20, 20, 20, 20, 20, 19, 19, + 19, 18, 18, 17, 21, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, 18, 18, + 17, 17, 21, 22, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, + 21, 22, 23, 23, 23, 21, 20, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 21, + 22, 22, 22, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 20, 20, 22, 22, + 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 19, 20, 21, 21, 21, 20, + 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 18, 19, 20, 20, 20, 19, 18, 18, + 17, 16, 15, 15, 14, 13, 13, 12, 17, 18, 19, 19, 20, 19, 18, 17, 16, 16, + 15, 14, 14, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 17, 16, 15, 14, 14, + 13, 12, 12, 12, + /* Size 32x32 */ + 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 25, 25, 22, 21, 21, 21, 21, 21, + 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 33, 33, 33, 33, + 33, 32, 30, 29, 27, 27, 24, 24, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 33, 33, 33, 33, 33, 31, 30, 29, + 27, 26, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, + 19, 19, 18, 18, 17, 17, 34, 33, 33, 33, 33, 31, 29, 28, 26, 26, 24, 24, + 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, + 18, 18, 34, 33, 33, 33, 32, 31, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, + 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 32, 32, + 31, 31, 31, 29, 28, 27, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 31, 30, 30, 29, 
29, 28, + 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 19, 19, 18, 18, 30, 29, 29, 28, 28, 27, 26, 25, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 19, 19, 19, 19, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 21, + 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, + 28, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, + 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 25, 24, 24, 24, + 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, + 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 25, 24, 24, 24, 24, 24, 23, 23, + 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 19, 19, 18, 18, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, + 17, 17, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 23, 22, + 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, + 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, + 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, + 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, + 21, 22, 22, 22, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 21, 21, 22, + 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, + 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, + 15, 15, 15, 15, 14, 14, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, + 14, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 19, 20, + 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 19, 20, 20, 20, 21, 21, + 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, + 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, + 20, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, + 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, + 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, + 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, 18, 19, + 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, + 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, + 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, + 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, + /* Size 4x8 */ + 33, 24, 22, 19, 31, 23, 23, 20, 26, 22, 22, 20, 22, 20, 19, 18, 23, 21, + 17, 16, 21, 20, 17, 15, 20, 20, 16, 14, 19, 19, 16, 13, + 
/* Size 8x4 */ + 33, 31, 26, 22, 23, 21, 20, 19, 24, 23, 22, 20, 21, 20, 20, 19, 22, 23, + 22, 19, 17, 17, 16, 16, 19, 20, 20, 18, 16, 15, 14, 13, + /* Size 8x16 */ + 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 34, 32, + 26, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, 20, 19, 28, 26, 22, 22, + 23, 22, 21, 20, 24, 24, 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, + 19, 18, 21, 22, 22, 19, 19, 18, 18, 17, 21, 23, 22, 19, 18, 17, 17, 16, + 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 21, + 22, 19, 17, 16, 15, 14, 19, 20, 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, + 16, 15, 14, 13, 17, 19, 20, 18, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, + 13, 12, + /* Size 16x8 */ + 32, 33, 34, 31, 28, 24, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33, + 32, 28, 26, 24, 22, 22, 23, 23, 22, 21, 20, 20, 19, 18, 28, 27, 26, 24, + 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 21, 22, 22, 22, 22, 20, + 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 23, 22, 23, 21, 19, 19, + 18, 18, 17, 17, 17, 16, 16, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 17, + 16, 16, 15, 15, 14, 14, 18, 19, 20, 20, 21, 19, 19, 18, 17, 16, 16, 15, + 14, 14, 13, 13, 17, 18, 19, 19, 20, 18, 18, 17, 16, 16, 15, 14, 13, 13, + 12, 12, + /* Size 16x32 */ + 32, 33, 33, 29, 28, 24, 21, 21, 21, 21, 20, 20, 18, 18, 17, 16, 33, 33, + 33, 28, 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 33, 33, 33, 28, + 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 34, 32, 32, 28, 26, 24, + 22, 22, 22, 22, 21, 21, 20, 20, 18, 18, 34, 32, 32, 28, 26, 24, 22, 22, + 23, 23, 21, 21, 20, 20, 19, 18, 32, 31, 30, 26, 25, 23, 22, 22, 23, 23, + 21, 21, 20, 20, 19, 18, 31, 29, 28, 26, 24, 23, 22, 22, 22, 22, 22, 22, + 20, 20, 19, 18, 30, 28, 28, 24, 23, 23, 22, 22, 23, 22, 22, 22, 20, 20, + 19, 19, 28, 26, 26, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 19, + 28, 26, 26, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 24, 24, + 24, 22, 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 24, 24, 24, 22, + 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 22, 22, 22, 22, 21, 20, + 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 22, 22, 21, 20, 19, 19, + 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 19, 19, + 18, 18, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, + 17, 17, 17, 16, 21, 22, 23, 22, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, + 16, 16, 21, 23, 23, 23, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, + 21, 22, 23, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 22, + 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 22, 22, 22, + 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 21, 21, 22, 22, 20, + 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 21, 21, 22, 22, 20, 19, 18, + 17, 17, 16, 16, 15, 14, 14, 14, 19, 20, 21, 21, 21, 20, 19, 18, 17, 17, + 15, 15, 14, 14, 14, 13, 19, 20, 20, 21, 21, 20, 19, 18, 17, 16, 15, 15, + 14, 14, 13, 13, 19, 20, 20, 20, 21, 20, 18, 18, 16, 16, 15, 15, 14, 14, + 13, 13, 18, 20, 20, 20, 20, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12, + 18, 19, 19, 20, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 13, 12, 17, 19, + 19, 19, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 17, 19, 19, 19, + 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 16, 18, 18, 18, 19, 18, + 17, 17, 15, 15, 14, 14, 13, 12, 12, 12, 16, 18, 18, 18, 19, 18, 17, 17, + 15, 15, 14, 14, 13, 12, 12, 12, + /* Size 32x16 */ + 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21, + 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32, + 32, 31, 29, 28, 26, 
26, 24, 24, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 33, 33, 33, 32, 32, 30, 28, 28, + 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, + 20, 19, 19, 19, 18, 18, 29, 28, 28, 28, 28, 26, 26, 24, 23, 23, 22, 22, + 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, + 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 24, 24, + 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 21, 20, + 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, + 17, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, + 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, + 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 20, 20, 21, + 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 21, 21, 21, 22, 22, + 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, + 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, 19, 19, + 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, + 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, + 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, + 18, 18, 19, 19, 19, 19, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, + 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, + 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, + 13, 13, 12, 12, 12, 12, 12, 12, + /* Size 4x16 */ + 33, 24, 21, 18, 33, 24, 22, 19, 32, 24, 23, 20, 29, 23, 22, 20, 26, 22, + 22, 20, 24, 21, 21, 19, 22, 20, 19, 18, 22, 20, 19, 18, 22, 21, 18, 17, + 22, 21, 17, 16, 22, 20, 17, 15, 21, 20, 17, 14, 20, 20, 16, 14, 20, 19, + 16, 13, 19, 19, 16, 13, 18, 18, 15, 12, + /* Size 16x4 */ + 33, 33, 32, 29, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 18, 24, 24, + 24, 23, 22, 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 18, 21, 22, 23, 22, + 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 18, 19, 20, 20, 20, 19, + 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, + /* Size 8x32 */ + 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 33, 33, + 27, 22, 22, 20, 19, 18, 34, 32, 26, 22, 22, 21, 20, 18, 34, 32, 26, 22, + 23, 21, 20, 19, 32, 30, 25, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, + 20, 19, 30, 28, 23, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 21, 20, + 28, 26, 22, 21, 22, 22, 21, 19, 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, + 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 19, + 19, 19, 18, 17, 21, 22, 22, 19, 19, 18, 18, 17, 21, 22, 22, 19, 18, 18, + 17, 17, 21, 23, 22, 19, 18, 17, 17, 16, 21, 23, 22, 19, 18, 17, 16, 16, + 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 22, + 22, 19, 17, 16, 16, 15, 20, 21, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, + 17, 16, 15, 14, 19, 21, 21, 19, 17, 15, 14, 14, 19, 20, 21, 19, 17, 15, + 14, 13, 19, 20, 21, 18, 16, 15, 14, 13, 18, 20, 20, 18, 16, 15, 14, 13, + 18, 19, 20, 18, 16, 14, 13, 13, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, + 19, 17, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 13, 12, 16, 18, 19, 17, 
+ 15, 14, 13, 12, + /* Size 32x8 */ + 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21, + 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32, + 32, 30, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, + 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, + 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, + 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, + 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 20, 20, + 20, 21, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, + 20, 20, 21, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, + 14, 14, 14, 13, 13, 13, 13, 13, 17, 18, 18, 18, 19, 19, 19, 19, 20, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, + 12, 12, 12, 12 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 31, 24, 19, 31, 27, 22, 18, 24, 22, 16, 14, 19, 18, 14, 11, + /* Size 8x8 */ + 33, 32, 32, 30, 27, 22, 20, 16, 32, 32, 32, 30, 28, 23, 21, 17, 32, 32, + 29, 28, 26, 23, 21, 18, 30, 30, 28, 24, 22, 20, 18, 16, 27, 28, 26, 22, + 19, 17, 16, 14, 22, 23, 23, 20, 17, 15, 14, 12, 20, 21, 21, 18, 16, 14, + 12, 11, 16, 17, 18, 16, 14, 12, 11, 10, + /* Size 16x16 */ + 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32, + 32, 32, 32, 32, 30, 29, 27, 26, 24, 22, 20, 19, 18, 17, 33, 32, 32, 32, + 32, 32, 31, 30, 28, 27, 25, 23, 21, 19, 18, 17, 33, 32, 32, 31, 31, 31, + 29, 28, 27, 26, 24, 23, 21, 19, 18, 17, 32, 32, 32, 31, 30, 30, 28, 28, + 26, 26, 24, 23, 21, 19, 19, 17, 32, 32, 32, 31, 30, 29, 28, 27, 26, 25, + 24, 22, 21, 20, 19, 18, 30, 30, 31, 29, 28, 28, 26, 24, 23, 22, 22, 20, + 19, 18, 17, 16, 28, 29, 30, 28, 28, 27, 24, 21, 20, 20, 19, 18, 17, 16, + 16, 15, 27, 27, 28, 27, 26, 26, 23, 20, 20, 19, 18, 17, 16, 15, 15, 14, + 25, 26, 27, 26, 26, 25, 22, 20, 19, 18, 17, 16, 15, 15, 14, 14, 23, 24, + 25, 24, 24, 24, 22, 19, 18, 17, 16, 15, 14, 14, 13, 13, 21, 22, 23, 23, + 23, 22, 20, 18, 17, 16, 15, 14, 13, 13, 12, 12, 19, 20, 21, 21, 21, 21, + 19, 17, 16, 15, 14, 13, 12, 12, 12, 11, 18, 19, 19, 19, 19, 20, 18, 16, + 15, 15, 14, 13, 12, 11, 11, 11, 17, 18, 18, 18, 19, 19, 17, 16, 15, 14, + 13, 12, 12, 11, 11, 10, 16, 17, 17, 17, 17, 18, 16, 15, 14, 14, 13, 12, + 11, 11, 10, 10, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, + 25, 23, 23, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 22, + 22, 21, 20, 20, 18, 18, 17, 16, 16, 15, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 23, 22, 21, 20, 20, + 19, 18, 18, 17, 17, 15, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17, + 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 28, 28, 27, 25, 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, 27, 25, + 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, + 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, + 30, 29, 29, 28, 
28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, + 18, 17, 17, 16, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, + 28, 28, 26, 26, 26, 24, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 16, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, + 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 32, 32, 32, 32, + 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, + 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 31, 31, 31, 31, 31, 31, 30, 29, + 29, 28, 28, 27, 26, 26, 24, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, + 18, 18, 17, 17, 17, 16, 30, 30, 30, 30, 31, 31, 29, 29, 28, 28, 28, 26, + 26, 25, 24, 24, 23, 23, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 16, + 16, 15, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 23, 23, + 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 28, 29, + 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, + 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 28, 29, 29, 29, 30, 30, + 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 14, 27, 27, 27, 28, 28, 28, 27, 27, 26, 26, + 26, 24, 23, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, + 15, 14, 14, 13, 26, 27, 27, 27, 28, 28, 26, 26, 26, 26, 26, 23, 23, 22, + 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 25, 26, 26, 26, 27, 27, 26, 26, 26, 25, 25, 23, 22, 21, 20, 20, 19, 19, + 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 23, 24, 24, 24, + 25, 25, 24, 24, 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 24, 25, 25, 24, 24, + 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, + 14, 14, 13, 13, 13, 12, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 21, + 20, 20, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, + 12, 12, 21, 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 18, + 17, 17, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 20, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 16, 15, + 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21, + 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, + 12, 12, 12, 12, 12, 11, 11, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, + 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, + 12, 11, 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, + 16, 16, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, + 17, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, + 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 18, 18, + 18, 18, 18, 18, 19, 19, 19, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, + 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 16, 16, 17, 17, 17, 17, 17, 17, + 17, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, + 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 17, + 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, + 10, 9, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 16, 15, 15, 14, 14, + 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, + /* Size 4x8 */ + 32, 32, 24, 18, 32, 31, 25, 19, 32, 29, 24, 20, 30, 28, 20, 17, 27, 26, + 18, 15, 23, 23, 16, 13, 20, 20, 14, 12, 17, 18, 13, 11, + /* Size 8x4 */ + 32, 32, 32, 30, 27, 23, 20, 17, 32, 31, 29, 28, 26, 23, 20, 18, 24, 25, + 24, 20, 18, 16, 14, 13, 18, 19, 20, 17, 15, 13, 12, 11, + /* Size 8x16 */ + 32, 33, 32, 
29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 17, 33, 32, + 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, 21, 17, 32, 32, 30, 28, + 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 30, 30, 28, 25, 23, 21, + 19, 16, 28, 30, 27, 22, 20, 19, 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, + 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 21, 23, + 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, + 15, 14, 12, 11, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, 18, 16, 14, 13, + 11, 10, + /* Size 16x8 */ + 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32, + 32, 32, 32, 31, 30, 30, 28, 26, 25, 23, 21, 19, 18, 17, 32, 32, 31, 30, + 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 29, 29, 30, 29, 28, 28, + 25, 22, 22, 21, 20, 19, 18, 17, 16, 16, 26, 27, 28, 27, 26, 26, 23, 20, + 20, 19, 18, 17, 16, 15, 15, 14, 23, 24, 25, 24, 24, 24, 21, 19, 18, 17, + 16, 15, 14, 14, 13, 13, 19, 20, 21, 21, 21, 21, 19, 17, 16, 15, 14, 13, + 12, 12, 12, 11, 16, 17, 17, 17, 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, + 10, 10, + /* Size 16x32 */ + 32, 33, 33, 33, 32, 32, 29, 28, 26, 23, 23, 20, 19, 18, 16, 16, 33, 32, + 32, 32, 32, 32, 29, 29, 27, 24, 24, 21, 20, 18, 16, 16, 33, 32, 32, 32, + 32, 32, 29, 29, 27, 24, 24, 21, 20, 19, 17, 17, 33, 32, 32, 32, 32, 32, + 30, 29, 28, 25, 25, 21, 20, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30, + 28, 25, 25, 22, 21, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30, 28, 25, + 25, 22, 21, 19, 17, 17, 33, 32, 32, 31, 30, 30, 29, 28, 27, 24, 24, 21, + 21, 19, 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 21, 20, 19, + 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 26, 24, 24, 21, 21, 19, 18, 18, + 32, 32, 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 32, 32, + 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 31, 31, 31, 29, + 28, 28, 26, 25, 24, 22, 22, 20, 19, 18, 17, 17, 30, 30, 30, 29, 28, 28, + 25, 24, 23, 21, 21, 19, 19, 18, 16, 16, 30, 30, 30, 29, 28, 28, 24, 23, + 22, 20, 20, 19, 18, 17, 16, 16, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, + 19, 18, 17, 16, 15, 15, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, 19, 18, + 17, 16, 15, 15, 27, 28, 28, 27, 26, 26, 22, 20, 20, 18, 18, 17, 16, 15, + 14, 14, 26, 27, 28, 26, 26, 26, 21, 20, 19, 18, 18, 16, 16, 15, 14, 14, + 25, 26, 26, 26, 25, 25, 21, 20, 19, 17, 17, 16, 15, 15, 13, 13, 23, 25, + 25, 24, 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 23, 25, 25, 24, + 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 22, 23, 23, 23, 23, 23, + 19, 18, 17, 16, 16, 14, 14, 13, 12, 12, 21, 23, 23, 23, 22, 22, 19, 18, + 17, 15, 15, 14, 13, 13, 12, 12, 20, 22, 22, 22, 22, 22, 19, 18, 17, 15, + 15, 13, 13, 12, 12, 12, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, + 12, 12, 11, 11, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, + 11, 11, 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, + 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 10, 10, 17, 18, + 18, 18, 18, 18, 16, 16, 15, 13, 13, 12, 12, 11, 10, 10, 16, 17, 17, 17, + 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 18, + 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 15, 16, 16, 16, 17, 17, 15, 14, + 13, 12, 12, 11, 11, 10, 9, 9, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, + 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 27, 26, 25, 25, 23, + 23, 22, 20, 20, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 
23, 23, 22, 21, 21, + 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, + 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17, + 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, + 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 32, 32, + 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, + 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, + 29, 28, 28, 28, 28, 26, 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, + 18, 18, 17, 17, 16, 16, 16, 15, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, + 27, 25, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, + 16, 15, 15, 14, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, + 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, + 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 25, + 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 20, 21, 21, 21, 22, 22, 21, 21, + 21, 21, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, + 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, 21, 19, + 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, + 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, 16, 16, + 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 16, 16, + 17, 17, 17, 17, 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, + 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, + 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, + 11, 11, 11, 10, 10, 10, 10, 9, + /* Size 4x16 */ + 33, 32, 23, 18, 32, 32, 24, 19, 32, 31, 25, 19, 32, 30, 24, 19, 32, 30, + 24, 19, 32, 29, 24, 20, 30, 28, 21, 18, 29, 27, 19, 16, 28, 26, 18, 15, + 26, 25, 17, 15, 25, 24, 16, 14, 23, 22, 15, 13, 20, 20, 14, 12, 19, 19, + 14, 11, 18, 18, 13, 11, 17, 18, 13, 11, + /* Size 16x4 */ + 33, 32, 32, 32, 32, 32, 30, 29, 28, 26, 25, 23, 20, 19, 18, 17, 32, 32, + 31, 30, 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 23, 24, 25, 24, + 24, 24, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 18, 19, 19, 19, 19, 20, + 18, 16, 15, 15, 14, 13, 12, 11, 11, 11, + /* Size 8x32 */ + 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 16, 33, 32, + 32, 29, 27, 24, 20, 17, 33, 32, 32, 30, 28, 25, 20, 17, 33, 32, 31, 30, + 28, 25, 21, 17, 33, 32, 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, + 21, 17, 32, 32, 30, 28, 27, 24, 20, 17, 32, 32, 30, 28, 26, 24, 21, 18, + 32, 31, 29, 28, 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 31, 31, + 28, 26, 24, 22, 19, 17, 30, 30, 28, 25, 23, 21, 19, 16, 30, 30, 28, 24, + 22, 20, 18, 16, 28, 30, 27, 22, 20, 19, 17, 15, 28, 30, 27, 22, 20, 19, + 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, 26, 28, 26, 21, 19, 18, 16, 14, + 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 23, 25, + 24, 20, 18, 16, 14, 13, 22, 23, 23, 19, 17, 16, 14, 12, 21, 23, 22, 19, + 17, 15, 13, 12, 20, 22, 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, + 12, 11, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, 15, 14, 12, 11, + 18, 19, 19, 17, 15, 14, 12, 10, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, + 18, 16, 14, 13, 11, 10, 16, 17, 18, 16, 14, 13, 11, 10, 15, 16, 17, 15, + 13, 12, 11, 9, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26, + 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32, 
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23, + 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, + 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, + 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, + 16, 15, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, 20, 20, + 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 23, 24, + 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, + 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 19, 20, 20, 20, 21, 21, + 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, + 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18, + 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, + 10, 10, 10, 9 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 25, 22, 20, 25, 21, 21, 20, 22, 21, 18, 17, 20, 20, 17, 14, + /* Size 8x8 */ + 33, 33, 27, 23, 22, 21, 20, 19, 33, 32, 26, 23, 23, 22, 22, 20, 27, 26, + 22, 22, 22, 22, 22, 20, 23, 23, 22, 20, 20, 20, 20, 19, 22, 23, 22, 20, + 19, 18, 18, 17, 21, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, 18, 16, + 16, 15, 19, 20, 20, 19, 17, 16, 15, 13, + /* Size 16x16 */ + 32, 33, 34, 31, 30, 28, 25, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33, + 33, 30, 28, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 19, 34, 33, 32, 30, + 28, 26, 24, 22, 23, 23, 23, 22, 22, 21, 20, 20, 31, 30, 30, 28, 26, 24, + 23, 22, 22, 22, 23, 22, 22, 21, 20, 20, 30, 28, 28, 26, 24, 23, 22, 22, + 22, 22, 23, 22, 22, 21, 21, 20, 28, 27, 26, 24, 23, 22, 22, 21, 22, 22, + 23, 22, 22, 21, 21, 20, 25, 24, 24, 23, 22, 22, 21, 20, 20, 21, 21, 20, + 20, 20, 20, 19, 21, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 21, 22, 23, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, + 21, 22, 23, 22, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, + 23, 23, 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, + 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 15, 20, 20, 22, 22, 22, 22, + 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 19, 20, 21, 21, 21, 21, 20, 19, + 18, 17, 17, 16, 15, 15, 14, 14, 19, 19, 20, 20, 21, 21, 20, 18, 18, 17, + 16, 16, 15, 14, 14, 14, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, + 15, 14, 14, 13, + /* Size 32x32 */ + 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33, + 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 22, 22, 22, 22, 22, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 30, + 28, 27, 27, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, + 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 29, 28, 26, 26, 25, + 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, + 19, 19, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, + 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 34, 33, + 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, 23, 23, 23, + 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 31, 31, 30, 30, 30, 30, + 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, + 22, 22, 21, 21, 20, 20, 20, 19, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, + 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, + 20, 20, 20, 19, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 
20, 20, 20, + 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, + 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, 27, 26, + 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 22, + 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 26, 25, 25, 25, 24, 24, 23, 23, + 23, 22, 22, 21, 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 20, 20, 20, 20, 20, 19, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 21, + 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 19, + 19, 19, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, + 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, + 18, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, + 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 21, 22, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, + 17, 17, 16, 16, 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, + 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, + 16, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, + 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 20, 20, 20, 21, 22, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 15, 15, 15, 15, 15, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, + 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, + 15, 15, 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, + 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, + 17, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 19, 19, 19, 20, + 20, 20, 20, 20, 21, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 18, 19, 19, 19, 20, 20, 20, 20, + 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, + 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, + 13, 13, 17, 18, 18, 19, 19, 19, 19, 19, 20, 20, 20, 19, 19, 18, 18, 18, + 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, + /* Size 4x8 */ + 33, 27, 22, 20, 32, 26, 23, 21, 26, 22, 23, 21, 23, 22, 20, 19, 22, 22, + 18, 18, 22, 22, 17, 16, 21, 22, 17, 15, 19, 20, 16, 14, + /* Size 8x4 */ + 33, 32, 26, 23, 22, 22, 21, 19, 27, 26, 22, 22, 22, 22, 22, 20, 22, 23, + 23, 20, 18, 17, 17, 16, 20, 21, 21, 19, 18, 16, 15, 14, + /* Size 8x16 */ + 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 34, 32, + 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, 22, 20, 29, 28, 23, 22, + 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 
24, 24, 22, 21, 20, 21, + 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, + 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 20, 22, + 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, + 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, 20, 19, 17, 16, + 15, 13, + /* Size 16x8 */ + 32, 33, 34, 31, 29, 28, 24, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33, + 32, 29, 28, 26, 24, 22, 22, 23, 23, 22, 21, 21, 20, 20, 28, 27, 26, 24, + 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 23, 23, 23, 22, 22, 22, + 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22, 23, 22, 22, 22, 20, 19, + 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 23, 23, 23, 23, 21, 19, 19, 18, + 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, + 16, 15, 15, 15, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 14, + 14, 13, + /* Size 16x32 */ + 32, 33, 33, 31, 28, 28, 23, 21, 21, 21, 21, 20, 20, 19, 18, 18, 33, 33, + 33, 30, 27, 27, 23, 22, 22, 22, 22, 20, 20, 20, 19, 19, 33, 33, 33, 30, + 27, 27, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 33, 33, 32, 30, 26, 26, + 23, 22, 22, 22, 22, 21, 21, 20, 19, 19, 34, 32, 32, 29, 26, 26, 23, 22, + 23, 23, 23, 22, 21, 21, 20, 20, 34, 32, 32, 29, 26, 26, 23, 22, 23, 23, + 23, 22, 21, 21, 20, 20, 31, 30, 29, 28, 24, 24, 22, 22, 22, 23, 23, 22, + 22, 21, 20, 20, 31, 29, 28, 27, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, + 20, 20, 29, 28, 28, 26, 23, 23, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, + 28, 26, 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 28, 26, + 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 25, 24, 24, 23, + 22, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 24, 24, 24, 23, 22, 22, + 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 23, 23, 23, 23, 22, 22, 20, 20, + 20, 20, 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, + 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, + 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, + 17, 17, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, 18, 17, 17, + 21, 22, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 21, 22, + 23, 23, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 21, 22, 23, 23, + 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 22, 22, 22, 22, 22, + 20, 19, 18, 17, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 22, 22, 20, 19, + 18, 17, 17, 16, 16, 16, 15, 15, 20, 21, 22, 22, 22, 22, 20, 19, 18, 17, + 17, 16, 16, 16, 15, 15, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, + 16, 15, 14, 14, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, 16, 15, + 14, 14, 19, 20, 21, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, + 19, 20, 20, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, 19, 20, + 20, 20, 21, 21, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, + 20, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 18, 19, 20, 20, 20, 20, + 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 17, 19, 19, 19, 20, 20, 18, 18, + 17, 16, 16, 15, 14, 14, 13, 13, + /* Size 32x16 */ + 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33, + 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 33, 33, 33, 32, 32, 32, 29, 28, + 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, + 21, 20, 20, 20, 20, 19, 31, 30, 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, + 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, + 20, 19, 28, 
27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, + 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, + 21, 22, 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, + 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, + 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 15, 15, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 20, + 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, + 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, + 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 18, 19, + 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, + 16, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, + 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, + 14, 14, 14, 14, 14, 13, 13, 13, + /* Size 4x16 */ + 33, 28, 21, 19, 33, 27, 22, 20, 32, 26, 23, 21, 30, 24, 23, 21, 28, 23, + 23, 21, 26, 22, 23, 21, 24, 22, 21, 20, 22, 21, 19, 19, 22, 22, 19, 18, + 22, 22, 18, 17, 22, 22, 18, 17, 22, 22, 17, 16, 21, 22, 17, 15, 20, 21, + 17, 15, 20, 21, 16, 14, 19, 20, 16, 14, + /* Size 16x4 */ + 33, 33, 32, 30, 28, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 28, 27, + 26, 24, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 21, 22, 23, 23, + 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 19, 20, 21, 21, 21, 21, + 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, + /* Size 8x32 */ + 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 33, 33, + 27, 23, 22, 22, 20, 19, 33, 32, 26, 23, 22, 22, 21, 19, 34, 32, 26, 23, + 23, 23, 21, 20, 34, 32, 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, + 22, 20, 31, 28, 24, 22, 22, 22, 22, 20, 29, 28, 23, 22, 22, 23, 22, 20, + 28, 26, 22, 22, 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 25, 24, + 22, 21, 21, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 19, 23, 23, 22, 20, + 20, 20, 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 20, 19, 19, + 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, 21, 22, 22, 20, 19, 18, 18, 17, + 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 21, 23, + 22, 20, 19, 18, 17, 16, 20, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, + 18, 17, 16, 15, 20, 22, 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, + 16, 14, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, 18, 17, 15, 14, + 19, 20, 21, 19, 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, + 20, 19, 17, 16, 15, 13, 18, 20, 20, 19, 17, 16, 15, 13, 17, 19, 20, 18, + 17, 16, 14, 13, + /* Size 32x8 */ + 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 32, + 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, + 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 28, 27, 27, 26, 26, 26, 24, 24, + 23, 22, 22, 22, 22, 22, 21, 
21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 21, 22, + 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 20, 21, 21, 21, + 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, + 16, 16, 15, 15, 15, 15, 15, 14, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, + 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, + 14, 13, 13, 13 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 27, 20, 32, 29, 26, 21, 27, 26, 19, 16, 20, 21, 16, 13, + /* Size 8x8 */ + 33, 32, 32, 30, 29, 25, 22, 19, 32, 32, 32, 31, 30, 26, 23, 20, 32, 32, + 30, 29, 28, 25, 23, 20, 30, 31, 29, 26, 24, 22, 20, 19, 29, 30, 28, 24, + 21, 19, 18, 17, 25, 26, 25, 22, 19, 17, 16, 15, 22, 23, 23, 20, 18, 16, + 14, 13, 19, 20, 20, 19, 17, 15, 13, 12, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 33, 32, + 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 33, 32, 32, 32, + 32, 32, 31, 31, 30, 28, 28, 25, 23, 22, 20, 19, 33, 32, 32, 32, 32, 31, + 31, 30, 29, 28, 27, 25, 23, 23, 21, 19, 33, 32, 32, 32, 31, 30, 30, 29, + 28, 27, 26, 24, 23, 22, 20, 19, 32, 32, 32, 31, 30, 29, 28, 28, 27, 26, + 26, 24, 23, 22, 21, 19, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23, + 22, 21, 20, 19, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, 22, 20, 20, + 19, 18, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, 16, + 27, 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 26, 27, + 28, 27, 26, 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 23, 24, 25, 25, + 24, 24, 23, 22, 19, 18, 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, + 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 21, 22, 22, 23, 22, 22, 21, 20, + 18, 17, 17, 15, 14, 14, 13, 13, 19, 20, 20, 21, 20, 21, 20, 19, 17, 16, + 16, 14, 14, 13, 12, 12, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, 14, + 13, 13, 12, 11, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 28, + 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 26, 26, 25, + 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22, + 22, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22, 22, 20, 20, 20, + 18, 18, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, + 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, 22, 21, 20, 20, 19, 19, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 28, 28, + 28, 26, 25, 25, 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, + 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, + 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, + 29, 28, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 20, 20, 19, 19, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, + 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, + 32, 32, 
31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, + 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, 24, 24, 23, 22, + 22, 21, 21, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, + 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, + 19, 19, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, + 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 30, 30, + 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, + 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 29, 30, 30, 30, 30, 30, + 30, 29, 28, 28, 28, 28, 26, 25, 25, 24, 23, 23, 22, 22, 22, 21, 20, 20, + 19, 19, 19, 18, 18, 18, 17, 17, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, + 27, 27, 26, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 16, 16, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, 27, 27, 26, 24, + 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 20, 20, + 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 26, 26, 27, 27, + 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, + 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 26, 26, 27, 27, 28, 28, 27, 26, + 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, + 17, 16, 16, 16, 15, 15, 24, 25, 25, 25, 26, 26, 26, 25, 25, 25, 24, 24, + 23, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, + 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 22, 22, 20, + 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, + 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, + 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 14, 14, 14, + 13, 13, 13, 13, 21, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, + 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, + 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, + 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 19, 20, 20, 20, + 20, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, + 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 19, 19, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, + 13, 12, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, + 11, 11, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, + 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, + /* Size 4x8 */ + 32, 32, 28, 20, 32, 31, 28, 21, 32, 30, 27, 21, 30, 28, 23, 19, 29, 27, + 21, 17, 26, 24, 19, 15, 22, 22, 17, 13, 20, 20, 16, 12, + /* Size 8x4 */ + 32, 32, 32, 30, 29, 26, 22, 20, 32, 31, 30, 28, 27, 24, 22, 20, 28, 28, + 27, 23, 21, 19, 17, 16, 20, 21, 21, 19, 17, 15, 13, 12, + /* Size 8x16 */ + 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, + 32, 31, 29, 25, 23, 21, 33, 32, 31, 31, 29, 25, 23, 21, 32, 32, 30, 30, + 28, 24, 23, 20, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, + 22, 20, 30, 30, 28, 27, 24, 21, 20, 19, 28, 30, 28, 26, 21, 19, 18, 17, + 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 23, 25, 
+ 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, 15, 14, 21, 22, 22, 21, + 18, 15, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 18, 19, 19, 19, 16, 14, + 13, 12, + /* Size 16x8 */ + 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 33, 32, + 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 32, 32, 32, 31, + 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 32, 31, 31, 31, 30, 28, + 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 28, 29, 29, 29, 28, 27, 26, 24, + 21, 21, 20, 19, 18, 18, 17, 16, 23, 24, 25, 25, 24, 24, 23, 21, 19, 18, + 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, + 15, 14, 14, 13, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, 14, 14, 13, + 12, 12, + /* Size 16x32 */ + 32, 33, 33, 33, 32, 32, 32, 29, 28, 27, 23, 23, 22, 19, 19, 17, 33, 32, + 32, 32, 32, 32, 31, 29, 29, 28, 24, 24, 22, 20, 20, 18, 33, 32, 32, 32, + 32, 32, 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, + 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, 31, 30, + 29, 28, 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 32, 31, 31, 30, 30, 28, + 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 31, 31, 31, 29, 29, 28, 25, 25, + 23, 21, 21, 19, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 23, 21, + 21, 19, 32, 32, 32, 31, 30, 30, 30, 28, 28, 27, 24, 24, 23, 20, 20, 19, + 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 24, 24, 23, 21, 21, 19, 32, 32, + 31, 31, 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 32, 31, 31, + 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 31, 31, 31, 29, 28, + 28, 26, 26, 25, 23, 23, 22, 20, 20, 19, 30, 30, 30, 30, 28, 28, 27, 24, + 24, 23, 21, 21, 20, 19, 19, 18, 30, 30, 30, 30, 28, 28, 27, 24, 24, 23, + 21, 21, 20, 19, 19, 18, 29, 30, 30, 30, 28, 28, 26, 23, 23, 22, 20, 20, + 19, 18, 18, 17, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, + 17, 16, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, 17, 16, + 27, 28, 28, 28, 26, 26, 25, 21, 21, 20, 18, 18, 18, 16, 16, 15, 26, 27, + 28, 27, 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 26, 27, 28, 27, + 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 24, 26, 26, 26, 24, 24, + 23, 20, 20, 19, 17, 17, 16, 15, 15, 14, 23, 24, 25, 25, 24, 24, 23, 20, + 19, 18, 16, 16, 16, 14, 14, 14, 23, 24, 25, 25, 24, 24, 23, 20, 19, 18, + 16, 16, 16, 14, 14, 13, 22, 23, 23, 23, 23, 23, 22, 19, 18, 18, 16, 16, + 15, 14, 14, 13, 21, 22, 23, 23, 22, 22, 21, 19, 18, 17, 15, 15, 15, 13, + 13, 13, 21, 22, 22, 22, 22, 22, 21, 18, 18, 17, 15, 15, 14, 13, 13, 13, + 19, 20, 21, 21, 21, 21, 20, 18, 17, 17, 14, 14, 14, 13, 13, 12, 19, 20, + 21, 21, 20, 20, 20, 17, 17, 16, 14, 14, 14, 12, 12, 12, 19, 20, 20, 20, + 20, 20, 19, 17, 17, 16, 14, 14, 13, 12, 12, 12, 18, 19, 19, 19, 19, 19, + 19, 17, 16, 15, 14, 14, 13, 12, 12, 11, 18, 19, 19, 19, 19, 19, 19, 17, + 16, 15, 14, 14, 13, 12, 12, 11, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28, + 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, + 24, 24, 23, 22, 22, 20, 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, + 22, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 23, 23, 22, 21, 21, 20, + 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28, + 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 29, 
29, 28, 28, 28, 28, 27, 27, 26, 26, + 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, + 31, 30, 30, 29, 28, 28, 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, + 22, 21, 21, 20, 20, 19, 19, 19, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, + 27, 27, 26, 24, 24, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, + 17, 17, 17, 17, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, + 24, 23, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 21, 21, + 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 23, 24, 24, 24, + 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, + 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, + 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, + 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, + 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, + 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 20, + 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, + 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, + 13, 13, 13, 12, 12, 12, 11, 11, + /* Size 4x16 */ + 33, 32, 27, 19, 32, 32, 28, 20, 32, 32, 28, 21, 32, 31, 28, 21, 32, 30, + 27, 20, 32, 29, 26, 21, 31, 28, 25, 20, 30, 28, 23, 19, 29, 27, 21, 17, + 28, 26, 20, 16, 27, 26, 20, 16, 24, 24, 18, 14, 23, 23, 18, 14, 22, 22, + 17, 13, 20, 20, 16, 12, 19, 19, 15, 12, + /* Size 16x4 */ + 33, 32, 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 19, 32, 32, + 32, 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 20, 19, 27, 28, 28, 28, + 27, 26, 25, 23, 21, 20, 20, 18, 18, 17, 16, 15, 19, 20, 21, 21, 20, 21, + 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, + /* Size 8x32 */ + 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 22, 20, 33, 32, + 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, + 29, 25, 23, 21, 33, 32, 32, 31, 30, 25, 23, 21, 33, 32, 31, 31, 29, 25, + 23, 21, 32, 32, 31, 30, 28, 24, 23, 21, 32, 32, 30, 30, 28, 24, 23, 20, + 32, 32, 30, 29, 28, 24, 23, 21, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, + 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, 22, 20, 30, 30, 28, 27, + 24, 21, 20, 19, 30, 30, 28, 27, 24, 21, 20, 19, 29, 30, 28, 26, 23, 20, + 19, 18, 28, 30, 28, 26, 21, 19, 18, 17, 28, 30, 28, 26, 21, 19, 18, 17, + 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 26, 28, + 26, 24, 20, 18, 17, 16, 24, 26, 24, 23, 20, 17, 16, 15, 23, 25, 24, 23, + 19, 16, 16, 14, 23, 25, 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, + 15, 14, 21, 23, 22, 21, 18, 15, 15, 13, 21, 22, 22, 21, 18, 15, 14, 13, + 19, 21, 21, 20, 17, 14, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 19, 20, + 20, 19, 17, 14, 13, 12, 18, 19, 19, 19, 16, 14, 13, 12, 18, 19, 19, 19, + 16, 14, 13, 12, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28, + 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, + 25, 25, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, + 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, + 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, + 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 
21, 21, 20, 20, 19, + 19, 19, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, 24, 23, + 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 23, 24, + 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, + 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, + 15, 15, 14, 14, 14, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, + 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, + 12, 12, 12, 12 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 27, 22, 21, 27, 22, 22, 22, 22, 22, 19, 18, 21, 22, 18, 16, + /* Size 8x8 */ + 33, 33, 29, 24, 21, 22, 21, 20, 33, 32, 28, 24, 22, 23, 22, 21, 29, 28, + 25, 23, 22, 23, 22, 21, 24, 24, 23, 21, 20, 21, 20, 20, 21, 22, 22, 20, + 19, 19, 19, 19, 22, 23, 23, 21, 19, 18, 17, 17, 21, 22, 22, 20, 19, 17, + 17, 16, 20, 21, 21, 20, 19, 17, 16, 15, + /* Size 16x16 */ + 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, + 33, 32, 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 34, 33, 33, 32, + 29, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 33, 32, 32, 31, 28, 26, + 25, 24, 22, 22, 23, 23, 22, 22, 22, 21, 31, 30, 29, 28, 26, 24, 23, 23, + 22, 22, 22, 23, 22, 22, 22, 21, 28, 27, 26, 26, 24, 22, 22, 22, 21, 22, + 22, 23, 22, 22, 22, 21, 27, 26, 25, 25, 23, 22, 22, 21, 21, 21, 21, 22, + 22, 22, 21, 21, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, 21, 21, 20, 20, + 20, 20, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 21, 22, + 22, 23, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, + 23, 23, 22, 21, 19, 19, 19, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, + 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, + 19, 19, 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 21, 20, 19, 18, + 18, 17, 16, 16, 16, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 18, 18, 17, + 16, 16, 15, 14, + /* Size 32x32 */ + 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 27, 25, 25, 23, 21, 21, + 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, + 33, 33, 33, 30, 30, 29, 27, 27, 26, 24, 24, 23, 21, 21, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 33, 33, 33, 33, 33, 33, 32, 30, + 30, 29, 27, 27, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, 33, 33, 32, 30, 30, 28, 27, 27, + 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, + 20, 20, 34, 33, 33, 33, 33, 33, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, + 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 34, 33, + 33, 33, 33, 32, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, + 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 33, 33, 32, 32, 32, 32, + 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 29, 27, 27, 26, + 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, + 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 28, 27, 26, 26, 24, 24, 23, 23, + 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, + 30, 29, 29, 28, 28, 28, 28, 26, 26, 25, 23, 23, 23, 23, 23, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, 27, 27, + 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 27, 26, 26, 26, 24, + 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 
22, 22, 22, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 21, 21, 27, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 21, + 21, 21, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, + 20, 20, 20, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 25, 24, + 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, 20, 20, 20, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, + 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, + 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 22, + 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, + 17, 17, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, + 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 20, 21, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, + 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, + 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, + 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, + 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, 20, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, + 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, 21, 21, + 21, 21, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, + 16, 15, 15, 15, 15, 15, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, + 21, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, + 14, 14, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, + 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, + /* Size 4x8 */ + 33, 27, 22, 20, 33, 26, 22, 21, 28, 23, 22, 22, 24, 22, 20, 20, 22, 21, + 19, 19, 22, 22, 19, 17, 21, 22, 19, 16, 20, 21, 18, 15, + /* Size 8x4 */ + 33, 33, 28, 24, 22, 22, 21, 20, 27, 26, 23, 22, 21, 22, 22, 21, 22, 22, + 22, 20, 19, 19, 19, 18, 20, 21, 22, 20, 19, 17, 16, 15, + /* Size 8x16 */ + 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, + 27, 26, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, 22, 21, 31, 28, 25, 23, + 22, 22, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, + 22, 21, 24, 24, 22, 21, 20, 21, 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, + 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 23, + 23, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, + 19, 17, 17, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 20, 21, 20, 19, 17, + 16, 15, + /* Size 16x8 */ + 
32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, + 32, 31, 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 29, 28, 27, 27, + 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 27, 26, 26, 25, 23, 22, + 22, 21, 21, 21, 21, 22, 21, 21, 21, 20, 21, 22, 22, 22, 22, 22, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 23, 23, 22, 23, 22, 21, 19, 19, + 18, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, + 17, 17, 16, 16, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, + 16, 15, + /* Size 16x32 */ + 32, 33, 33, 33, 29, 28, 27, 22, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33, + 33, 32, 28, 27, 26, 22, 22, 22, 21, 21, 21, 20, 20, 19, 33, 33, 33, 32, + 28, 27, 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 33, 33, 33, 32, 28, 27, + 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 34, 33, 32, 32, 27, 26, 26, 23, + 22, 22, 23, 23, 22, 21, 21, 20, 34, 33, 32, 31, 27, 26, 25, 23, 22, 22, + 23, 23, 22, 21, 21, 20, 33, 32, 31, 31, 27, 26, 25, 23, 22, 22, 23, 23, + 22, 21, 21, 20, 31, 29, 29, 28, 25, 24, 24, 22, 22, 22, 23, 23, 22, 22, + 22, 21, 31, 29, 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, + 30, 28, 28, 28, 24, 23, 23, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, + 26, 25, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, 26, 25, + 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 26, 26, 25, 24, 22, 22, + 22, 21, 21, 21, 22, 22, 22, 21, 21, 20, 24, 24, 24, 24, 22, 22, 21, 20, + 20, 20, 21, 21, 20, 20, 20, 20, 24, 24, 24, 24, 22, 22, 21, 20, 20, 20, + 21, 21, 20, 20, 20, 20, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, 20, 20, + 20, 20, 20, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 21, 22, + 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 22, 22, + 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 23, 23, 22, 22, + 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 21, 22, 23, 23, 23, 22, 22, 20, + 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 23, 23, 22, 22, 22, 20, 19, 19, + 18, 18, 17, 17, 17, 16, 20, 22, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, + 17, 16, 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, + 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, 16, 16, + 20, 21, 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 20, 21, + 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 19, 20, 21, 21, + 21, 21, 21, 19, 19, 18, 17, 17, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, + 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 19, 20, 20, 20, 21, 21, 20, 19, + 19, 18, 17, 17, 16, 15, 15, 14, + /* Size 32x16 */ + 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21, + 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, + 33, 33, 32, 29, 29, 28, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 33, 33, 33, 33, 32, 32, 31, 29, + 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, + 22, 21, 21, 21, 20, 20, 33, 32, 32, 32, 32, 31, 31, 28, 28, 28, 25, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, + 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, 25, 24, 23, 23, 22, 22, 22, 22, + 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, + 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25, + 25, 24, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 
22, + 21, 21, 21, 21, 21, 21, 20, 20, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, + 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, + 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 21, 22, 22, + 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, + 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 21, 22, 22, 23, 23, 23, 23, + 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, + 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, + 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, + 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, + 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 19, 20, 20, 20, 20, + 20, 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 16, + 16, 16, 16, 15, 15, 15, 14, 14, + /* Size 4x16 */ + 33, 28, 21, 20, 33, 27, 22, 20, 33, 26, 22, 21, 32, 26, 22, 21, 29, 24, + 22, 22, 26, 22, 22, 22, 26, 22, 21, 21, 24, 22, 20, 20, 22, 21, 19, 19, + 22, 22, 19, 18, 22, 22, 19, 18, 22, 22, 19, 17, 22, 22, 19, 16, 21, 22, + 19, 16, 21, 22, 18, 16, 20, 21, 18, 15, + /* Size 16x4 */ + 33, 33, 33, 32, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 28, 27, + 26, 26, 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, + 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 20, 20, 21, 21, 22, 22, + 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, + /* Size 8x32 */ + 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 21, 21, 20, 33, 33, + 28, 26, 22, 22, 21, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, 27, 26, + 22, 23, 22, 21, 34, 32, 27, 25, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, + 22, 21, 31, 29, 25, 24, 22, 23, 22, 22, 31, 28, 25, 23, 22, 22, 22, 22, + 30, 28, 24, 23, 22, 23, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 28, 26, + 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, 22, 21, 24, 24, 22, 21, + 20, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 20, 23, 23, 22, 21, 20, 20, + 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, 21, 22, 21, 21, 19, 19, 19, 19, + 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 22, + 22, 21, 19, 18, 18, 18, 21, 23, 22, 22, 19, 18, 18, 17, 21, 23, 23, 22, + 19, 18, 17, 17, 21, 23, 22, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, + 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, + 20, 21, 22, 21, 19, 17, 16, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 21, + 21, 21, 19, 17, 16, 15, 19, 20, 21, 20, 19, 17, 16, 15, 19, 20, 21, 20, + 19, 17, 16, 15, + /* Size 32x8 */ + 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21, + 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33, + 32, 32, 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, + 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, + 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 22, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, + 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, + 22, 22, 
23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, + 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, + 17, 17, 17, 16, 16, 16, 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, + 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, + 16, 15, 15, 15 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 29, 24, 32, 30, 28, 24, 29, 28, 21, 19, 24, 24, 19, 16, + /* Size 8x8 */ + 33, 33, 32, 32, 30, 28, 24, 22, 33, 32, 32, 32, 30, 28, 25, 23, 32, 32, + 31, 30, 29, 27, 24, 23, 32, 32, 30, 29, 28, 26, 24, 22, 30, 30, 29, 28, + 25, 23, 21, 20, 28, 28, 27, 26, 23, 20, 18, 17, 24, 25, 24, 24, 21, 18, + 16, 15, 22, 23, 23, 22, 20, 17, 15, 14, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32, + 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 33, 32, 32, 32, + 32, 32, 32, 31, 30, 30, 29, 27, 26, 24, 23, 23, 33, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 28, 27, 25, 23, 23, 33, 32, 32, 32, 31, 31, 31, 30, + 29, 28, 28, 26, 26, 24, 23, 23, 33, 32, 32, 32, 31, 31, 30, 30, 29, 28, + 28, 26, 26, 24, 23, 23, 32, 32, 32, 32, 31, 30, 29, 28, 28, 27, 27, 26, + 25, 24, 23, 22, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, 26, 24, 24, 23, + 22, 22, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, 20, 20, + 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 28, 29, + 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 26, 27, 27, 28, + 26, 26, 26, 24, 23, 21, 20, 19, 19, 18, 17, 17, 25, 26, 26, 27, 26, 26, + 25, 24, 22, 20, 20, 19, 18, 17, 17, 16, 23, 24, 24, 25, 24, 24, 24, 23, + 22, 20, 19, 18, 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, + 18, 17, 17, 16, 15, 15, 21, 22, 23, 23, 23, 23, 22, 22, 20, 19, 18, 17, + 16, 15, 15, 14, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 30, + 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, + 26, 26, 26, 24, 24, 23, 22, 22, 22, 20, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, + 24, 24, 23, 22, 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 22, + 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 30, 30, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 23, 23, 21, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, + 30, 28, 28, 28, 27, 25, 25, 25, 23, 23, 23, 22, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, + 27, 25, 25, 25, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 24, + 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, + 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, + 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 26, 26, 26, 25, 24, + 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 
23, 22, + 22, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, + 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, + 26, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 20, 30, 30, 30, 30, 30, 31, + 31, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, + 22, 22, 22, 21, 20, 20, 20, 19, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, + 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21, + 20, 20, 20, 19, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, + 27, 26, 26, 25, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, + 29, 29, 29, 29, 30, 30, 30, 30, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, + 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 18, 28, 29, 29, 29, + 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, + 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, 29, 30, 30, 29, + 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, 20, 20, 20, 19, + 19, 19, 18, 18, 18, 18, 27, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 26, + 26, 26, 25, 23, 23, 23, 21, 20, 20, 20, 20, 20, 19, 18, 18, 18, 18, 17, + 17, 17, 26, 26, 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, + 23, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 26, 26, + 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, 23, 22, 21, 20, + 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 25, 26, 26, 26, 26, 27, + 27, 26, 26, 26, 26, 25, 25, 25, 24, 22, 22, 22, 20, 20, 20, 19, 19, 19, + 18, 17, 17, 17, 17, 16, 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, + 24, 24, 24, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, + 16, 15, 15, 15, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 24, + 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, + 23, 23, 24, 24, 24, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, + 19, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 15, 22, 22, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, + 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 21, 22, 22, 22, 23, 23, 23, 23, + 23, 23, 23, 23, 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, + 15, 15, 15, 14, 14, 14, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 15, 14, + 14, 14, 20, 20, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 19, + 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13, + /* Size 4x8 */ + 33, 32, 29, 24, 32, 31, 30, 25, 32, 30, 28, 24, 32, 29, 27, 24, 30, 28, + 24, 21, 28, 26, 21, 18, 24, 24, 19, 16, 22, 22, 18, 15, + /* Size 8x4 */ + 33, 32, 32, 32, 30, 28, 24, 22, 32, 31, 30, 29, 28, 26, 24, 22, 29, 30, + 28, 27, 24, 21, 19, 18, 24, 25, 24, 24, 21, 18, 16, 15, + /* Size 8x16 */ + 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, + 32, 32, 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 31, 30, + 29, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, + 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, 21, 20, + 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 26, 28, + 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, 17, 17, 23, 25, 24, 24, + 20, 19, 16, 16, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, 23, 22, 19, 18, + 15, 15, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32, + 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 33, 32, 32, 32, + 31, 31, 30, 30, 29, 28, 
28, 26, 26, 24, 23, 23, 32, 32, 32, 31, 30, 30, + 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 29, 29, 30, 30, 29, 28, 28, 26, + 25, 23, 22, 21, 21, 20, 19, 19, 28, 29, 29, 30, 28, 28, 27, 26, 24, 22, + 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18, + 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, + 15, 15, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 29, 28, 28, 26, 23, 23, 22, 19, 33, 33, + 32, 32, 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 22, 20, 33, 32, 32, 32, + 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, + 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, 32, 31, + 30, 29, 29, 26, 25, 25, 23, 20, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, + 30, 27, 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 27, + 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 29, 29, 27, 25, 25, + 23, 21, 33, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 26, 24, 24, 23, 21, + 32, 32, 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32, + 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32, 32, 32, + 31, 29, 29, 29, 28, 28, 28, 26, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, + 29, 28, 28, 27, 27, 25, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, 29, 28, + 28, 27, 27, 25, 24, 24, 23, 21, 32, 31, 31, 31, 30, 28, 28, 28, 26, 26, + 26, 24, 23, 23, 22, 20, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, + 21, 21, 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, 21, 21, + 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 24, 24, 24, 22, 21, 21, 20, 19, + 29, 29, 30, 30, 28, 27, 27, 26, 23, 22, 22, 20, 20, 20, 19, 17, 28, 29, + 30, 30, 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 28, 29, 30, 30, + 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 27, 28, 28, 28, 28, 26, + 26, 25, 22, 21, 21, 19, 18, 18, 18, 16, 26, 27, 28, 28, 26, 26, 26, 24, + 21, 20, 20, 19, 18, 18, 17, 16, 26, 27, 28, 28, 26, 26, 26, 24, 21, 20, + 20, 19, 18, 18, 17, 16, 25, 26, 26, 26, 26, 25, 25, 24, 21, 20, 20, 18, + 17, 17, 17, 15, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, + 16, 14, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 16, 14, + 23, 24, 24, 24, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 15, 14, 22, 23, + 23, 23, 23, 23, 23, 22, 19, 18, 18, 17, 16, 16, 15, 14, 21, 22, 23, 23, + 23, 22, 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 21, 22, 23, 23, 23, 22, + 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 20, 21, 22, 22, 21, 21, 21, 20, + 18, 18, 18, 16, 15, 15, 14, 13, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30, + 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, + 27, 27, 26, 24, 24, 24, 23, 22, 22, 21, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, + 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 24, 23, 23, + 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, + 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, + 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 32, 32, 32, 32, 31, + 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, + 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, + 22, 21, 21, 20, 
29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, + 26, 25, 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, + 28, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, + 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, + 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, 21, 21, + 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 26, 26, 26, 26, 26, 27, 27, 27, + 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 20, 20, 20, 19, 19, 19, 18, 17, + 17, 17, 17, 17, 17, 16, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, + 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, + 15, 15, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 21, + 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, + 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 19, 20, 20, 20, 20, 21, + 21, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 19, 17, 17, 17, 16, 16, 16, + 15, 14, 14, 14, 14, 13, 13, 13, + /* Size 4x16 */ + 33, 32, 28, 23, 32, 32, 29, 24, 32, 32, 29, 25, 32, 31, 30, 25, 32, 30, + 28, 24, 32, 30, 28, 24, 32, 29, 27, 24, 31, 28, 26, 23, 30, 28, 24, 21, + 29, 27, 22, 20, 29, 27, 21, 19, 27, 26, 20, 18, 26, 25, 20, 17, 24, 24, + 19, 16, 23, 23, 18, 16, 22, 22, 18, 15, + /* Size 16x4 */ + 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 32, 32, + 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 28, 29, 29, 30, + 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, + 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, + /* Size 8x32 */ + 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 22, 33, 32, + 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, + 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 32, 31, 30, 30, + 25, 23, 33, 32, 32, 31, 30, 29, 25, 23, 33, 32, 31, 30, 29, 28, 24, 23, + 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, + 31, 29, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, 24, 23, 32, 31, 30, 29, + 28, 27, 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, + 21, 20, 30, 30, 29, 28, 25, 24, 21, 20, 30, 30, 29, 28, 24, 24, 21, 20, + 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 28, 30, + 28, 27, 22, 21, 19, 18, 27, 28, 28, 26, 22, 21, 18, 18, 26, 28, 26, 26, + 21, 20, 18, 17, 26, 28, 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, + 17, 17, 23, 25, 24, 24, 20, 19, 16, 16, 23, 25, 24, 24, 20, 19, 16, 16, + 23, 24, 24, 24, 20, 19, 16, 15, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, + 23, 22, 19, 18, 15, 15, 21, 23, 23, 22, 19, 18, 15, 15, 20, 22, 21, 21, + 18, 18, 15, 14, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30, + 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, + 28, 28, 26, 25, 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, + 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, + 22, 21, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 26, 25, + 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 28, 29, + 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, + 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 23, 24, 24, 24, 25, 25, + 25, 25, 24, 24, 24, 24, 24, 24, 
23, 21, 21, 21, 20, 19, 19, 18, 18, 18, + 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, + 15, 15, 15, 14 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 28, 22, 22, 28, 23, 22, 23, 22, 22, 19, 19, 22, 23, 19, 17, + /* Size 8x8 */ + 33, 33, 30, 28, 24, 21, 22, 21, 33, 32, 29, 26, 24, 22, 23, 22, 30, 29, + 26, 24, 23, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 24, 24, 23, 22, + 21, 20, 20, 20, 21, 22, 22, 22, 20, 19, 19, 19, 22, 23, 23, 23, 20, 19, + 18, 17, 21, 22, 22, 22, 20, 19, 17, 17, + /* Size 16x16 */ + 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 33, 33, + 33, 33, 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 33, 33, 33, 33, + 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 32, 30, 29, + 26, 25, 24, 23, 22, 23, 23, 23, 22, 22, 31, 30, 30, 30, 28, 27, 24, 24, + 23, 22, 22, 22, 22, 23, 22, 22, 31, 30, 29, 29, 27, 26, 24, 23, 23, 22, + 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 22, + 22, 23, 22, 22, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, 21, 21, 22, 22, + 22, 22, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, 20, 20, + 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 21, 22, + 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, + 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 23, 22, 22, + 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 21, 22, 22, 23, 23, 23, 23, 22, + 21, 20, 19, 19, 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, + 19, 18, 18, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, + 18, 17, 17, 17, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 25, 24, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, + 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 24, 24, 22, 21, 21, 21, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 30, 30, 28, 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 28, + 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, + 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 34, 33, + 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 34, 33, 33, 33, 33, 32, + 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 23, 23, + 23, 23, 23, 23, 22, 22, 22, 22, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, + 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, + 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, + 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 31, 30, 30, 30, + 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 29, 28, 28, 28, 28, 27, 27, 27, + 25, 25, 25, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, + 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 27, 26, + 26, 26, 26, 25, 25, 
25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, + 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, + 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, + 20, 20, 20, 20, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, + 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 18, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, + 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, + 19, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 17, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, + 21, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, + 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, + 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, + 17, 17, 17, 17, 17, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, + 17, 16, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, + 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, + /* Size 4x8 */ + 33, 27, 22, 21, 33, 26, 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 24, 22, + 20, 20, 22, 22, 19, 19, 22, 22, 19, 18, 21, 22, 19, 17, + /* Size 8x4 */ + 33, 33, 29, 26, 24, 22, 22, 21, 27, 26, 24, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 20, 19, 19, 19, 21, 23, 22, 23, 20, 19, 18, 17, + /* Size 8x16 */ + 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, + 30, 26, 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, + 22, 22, 23, 22, 31, 28, 27, 24, 22, 22, 22, 22, 28, 26, 24, 22, 22, 22, + 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, 21, 20, + 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22, + 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, 18, 18, 21, 23, 23, 22, + 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, + 17, 17, + /* Size 16x8 */ + 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 33, 33, + 32, 32, 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 31, 30, 30, 29, + 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, + 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 21, 22, 22, 22, 22, 22, 
22, 21, 20, 20, + 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18, + 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, + 17, 17, + /* Size 16x32 */ + 32, 33, 33, 33, 31, 28, 28, 27, 23, 21, 21, 21, 21, 21, 20, 20, 33, 33, + 33, 33, 31, 27, 27, 26, 23, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, + 30, 27, 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 33, 33, 30, 27, + 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 32, 32, 30, 26, 26, 26, + 23, 22, 22, 22, 22, 22, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, + 22, 23, 23, 23, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, 22, 23, + 23, 23, 22, 21, 33, 32, 31, 31, 29, 26, 26, 25, 23, 22, 22, 23, 23, 23, + 22, 21, 31, 30, 29, 29, 28, 24, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, + 31, 29, 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 31, 29, + 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, + 25, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, + 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 21, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, + 21, 21, 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, + 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22, + 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, + 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, + 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 21, + 20, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 21, 20, 19, + 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, 22, 22, 22, 22, 20, 19, 19, 19, + 18, 18, 18, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, + 17, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, + 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 20, 21, + 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, + 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, + 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, 22, 21, + 20, 19, 19, 18, 17, 17, 17, 16, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 32, 32, 32, 31, + 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, + 23, 23, 22, 22, 22, 22, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, + 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, + 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 25, 24, 24, 24, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 28, 27, + 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, + 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, + 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, + 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 
22, 22, 21, 20, 20, 20, + 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, + 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, + 17, 17, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, + 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, + 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 21, 21, + 21, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, + 17, 17, 17, 17, 16, 16, 16, 16, + /* Size 4x16 */ + 33, 28, 21, 21, 33, 27, 22, 22, 33, 26, 22, 22, 33, 26, 22, 23, 30, 24, + 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 26, 22, 21, 22, 24, 22, 20, 21, + 22, 21, 20, 19, 22, 21, 19, 19, 22, 22, 19, 18, 22, 22, 19, 18, 22, 22, + 19, 18, 21, 22, 19, 17, 21, 22, 19, 17, + /* Size 16x4 */ + 33, 33, 33, 33, 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 28, 27, + 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 21, 22, 22, 22, + 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, + 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, + /* Size 8x32 */ + 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 31, 27, 23, 22, 21, 21, 33, 33, + 30, 27, 23, 22, 22, 21, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, 30, 26, + 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 34, 32, 29, 26, 23, 22, + 23, 22, 33, 31, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, 22, 22, 23, 22, + 31, 28, 27, 24, 22, 22, 22, 22, 31, 28, 27, 24, 22, 22, 22, 22, 29, 27, + 25, 23, 22, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 28, 26, 24, 22, + 22, 22, 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, + 21, 20, 24, 24, 23, 22, 21, 20, 21, 20, 24, 24, 23, 22, 20, 20, 20, 20, + 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22, + 22, 21, 20, 19, 19, 19, 21, 22, 22, 22, 20, 19, 19, 19, 21, 22, 22, 22, + 20, 19, 18, 18, 21, 22, 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, + 18, 18, 21, 23, 23, 22, 20, 19, 18, 17, 21, 23, 23, 22, 20, 19, 18, 17, + 21, 23, 23, 22, 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, + 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, + 20, 19, 17, 17, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24, + 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, + 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, + 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, + 28, 27, 27, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, + 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, + 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 
17, 17, + 17, 17, 17, 17 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 30, 27, 32, 31, 29, 26, 30, 29, 26, 23, 27, 26, 23, 19, + /* Size 8x8 */ + 33, 33, 32, 32, 31, 30, 28, 25, 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, + 32, 31, 30, 29, 28, 26, 32, 32, 31, 30, 29, 28, 27, 25, 31, 31, 30, 29, + 28, 26, 25, 23, 30, 30, 29, 28, 26, 24, 22, 21, 28, 28, 28, 27, 25, 22, + 20, 19, 25, 26, 26, 25, 23, 21, 19, 18, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 28, 28, 26, 26, 24, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, + 28, 26, 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, + 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, + 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 30, 30, + 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 28, 29, 29, 30, + 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, + 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 26, 27, 27, 28, 28, 26, 26, 26, + 26, 23, 23, 20, 20, 19, 19, 18, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, + 23, 20, 20, 19, 19, 18, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, 22, 19, + 19, 18, 18, 16, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, + 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, + 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, + 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, + 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, + 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, + 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, + 30, 30, 30, 29, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, + 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, + 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, + 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, + 30, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, + 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 
28, 28, 28, 28, 28, 27, 27, + 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, + 26, 25, 24, 24, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, + 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, + 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, + 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, + 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, 30, 30, 31, 31, + 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, + 23, 23, 23, 22, 22, 22, 29, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, + 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, 23, 22, 22, 22, 22, 21, + 20, 20, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, + 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, + 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, + 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, + 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 27, 27, 27, 27, 26, 26, 26, 25, 23, 23, 23, 22, 21, 21, 21, 20, 20, 20, + 20, 19, 18, 18, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, + 26, 26, 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, + 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, + 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 26, 26, 27, 27, + 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, 23, 23, 23, 22, + 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 25, 25, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 25, 25, 25, 25, 23, 22, 22, 22, 21, 20, 20, 20, 19, + 18, 18, 18, 18, 17, 17, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, + 24, 24, 24, 24, 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, + 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, + 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, + /* Size 4x8 */ + 33, 32, 30, 26, 32, 32, 30, 27, 32, 31, 30, 27, 32, 31, 28, 26, 31, 30, + 27, 24, 30, 28, 25, 22, 28, 27, 23, 20, 26, 26, 22, 18, + /* Size 8x4 */ + 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, 31, 31, 30, 28, 27, 26, 30, 30, + 30, 28, 27, 25, 23, 22, 26, 27, 27, 26, 24, 22, 20, 18, + /* Size 8x16 */ + 32, 33, 33, 32, 32, 28, 28, 23, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, + 32, 32, 32, 29, 29, 24, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, + 31, 30, 30, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, + 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, + 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 28, 30, + 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 26, 28, 28, 26, + 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 23, 25, 25, 24, 24, 19, + 19, 16, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32, + 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 33, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 32, 32, 32, 31, 31, 30, + 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 32, 32, 32, 31, 31, 30, 30, 29, + 29, 28, 28, 27, 27, 26, 26, 24, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, + 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, + 21, 20, 20, 19, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, + 18, 
16, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 26, 23, 23, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 30, 29, 29, 29, 26, 24, 24, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, + 29, 29, 29, 27, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, + 30, 28, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, + 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, 25, 25, + 33, 32, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 27, 25, 25, 32, 32, + 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, + 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, 30, 30, + 30, 28, 28, 28, 28, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, + 27, 27, 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, + 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, 27, 26, + 24, 24, 31, 31, 31, 31, 31, 30, 28, 28, 28, 27, 26, 26, 26, 24, 23, 23, + 30, 30, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, + 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, 30, 30, + 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 29, 30, 30, 30, 30, 28, + 28, 28, 28, 25, 23, 23, 23, 22, 20, 20, 28, 29, 30, 30, 30, 28, 27, 27, + 27, 24, 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, + 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, 21, 21, + 21, 20, 19, 19, 28, 28, 28, 28, 28, 27, 26, 26, 26, 23, 21, 21, 21, 20, + 18, 18, 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, + 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 26, 27, + 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 25, 26, 26, 26, + 26, 26, 24, 24, 24, 22, 20, 20, 20, 18, 17, 17, 23, 24, 25, 25, 25, 24, + 24, 24, 24, 21, 19, 19, 19, 18, 16, 16, 23, 24, 25, 25, 25, 24, 24, 24, + 24, 21, 19, 19, 19, 18, 16, 16, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, + 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, + 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, + 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, + 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, + 27, 26, 26, 26, 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, + 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, + 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, + 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 21, 21, 28, 29, 29, 29, + 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 
+ 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, + 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, + 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, + 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, + 19, 19, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, + 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 23, 24, + 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, + 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, 23, 24, 24, 24, 24, 25, + 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, + 19, 18, 18, 18, 18, 17, 16, 16, + /* Size 4x16 */ + 33, 32, 30, 26, 32, 32, 30, 27, 32, 32, 30, 27, 32, 32, 31, 28, 32, 32, + 31, 28, 32, 31, 29, 26, 32, 31, 29, 26, 32, 30, 28, 26, 32, 30, 28, 26, + 30, 29, 26, 23, 30, 29, 26, 23, 29, 28, 24, 20, 29, 28, 24, 20, 27, 26, + 23, 19, 27, 26, 23, 19, 24, 24, 21, 18, + /* Size 16x4 */ + 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 30, 30, 30, 31, + 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 21, 26, 27, 27, 28, 28, 26, + 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, + /* Size 8x32 */ + 32, 33, 33, 32, 32, 28, 28, 23, 33, 33, 33, 32, 32, 29, 29, 24, 33, 32, + 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, + 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 25, 33, 32, 32, 31, 31, 30, + 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, + 33, 32, 32, 31, 31, 29, 29, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, + 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, + 30, 28, 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, + 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, 31, 31, 31, 28, 28, 26, 26, 23, + 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, + 30, 28, 28, 24, 24, 21, 29, 30, 30, 28, 28, 23, 23, 20, 28, 30, 30, 27, + 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, + 21, 19, 28, 28, 28, 26, 26, 21, 21, 18, 26, 28, 28, 26, 26, 20, 20, 18, + 26, 28, 28, 26, 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 25, 26, + 26, 24, 24, 20, 20, 17, 23, 25, 25, 24, 24, 19, 19, 16, 23, 25, 25, 24, + 24, 19, 19, 16, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, + 28, 28, 28, 26, 25, 25, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, + 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, + 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, + 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 28, 29, + 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, + 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, + 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, + 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 19, 18, 18, 18, + 18, 17, 16, 16 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 30, 24, 22, 30, 26, 23, 22, 24, 23, 21, 21, 22, 22, 21, 19, + /* Size 8x8 */ + 33, 33, 32, 29, 26, 23, 
21, 21, 33, 33, 31, 28, 25, 23, 22, 22, 32, 31, + 29, 26, 24, 23, 22, 23, 29, 28, 26, 24, 23, 22, 22, 22, 26, 25, 24, 23, + 22, 21, 21, 22, 23, 23, 23, 22, 21, 20, 20, 20, 21, 22, 22, 22, 21, 20, + 19, 19, 21, 22, 23, 22, 22, 20, 19, 18, + /* Size 16x16 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 33, 33, + 33, 33, 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 33, 33, 33, 33, + 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 34, 33, 33, 32, 32, 29, + 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, 34, 33, 33, 32, 32, 29, 29, 26, + 26, 24, 24, 22, 22, 23, 23, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, + 23, 22, 22, 22, 22, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, + 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, + 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 23, + 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 25, 24, + 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, + 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, + 22, 21, 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, + 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, 21, 19, + 19, 19, 19, 18, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, + 25, 25, 25, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, + 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, + 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, + 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, + 32, 32, 32, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, + 22, 22, 23, 23, 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, + 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, + 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 29, 29, 29, 28, + 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 29, 28, 28, 28, 26, 25, 25, 25, 24, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, + 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, + 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 29, 29, 28, 28, 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, + 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, + 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, + 
22, 22, 23, 23, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24, 23, 23, 23, 23, + 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, + 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, + 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, + 21, 21, 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, + 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, + 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, + 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, + 19, 19, 19, 18, 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, + 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, + /* Size 4x8 */ + 33, 30, 24, 21, 33, 29, 24, 22, 31, 28, 23, 22, 28, 25, 22, 22, 26, 23, + 21, 21, 23, 22, 21, 20, 22, 22, 20, 19, 22, 22, 21, 19, + /* Size 8x4 */ + 33, 33, 31, 28, 26, 23, 22, 22, 30, 29, 28, 25, 23, 22, 22, 22, 24, 24, + 23, 22, 21, 21, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, + /* Size 8x16 */ + 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, + 33, 27, 27, 22, 22, 22, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, + 26, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, + 22, 22, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, + 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 21, 22, + 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 22, + 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, + 19, 18, + /* Size 16x8 */ + 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 33, 33, + 33, 32, 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 33, 33, 33, 32, + 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, + 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, 28, 27, 27, 26, 26, 24, 24, 22, + 22, 22, 22, 21, 21, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, + 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, + 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, + 18, 18, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 31, 28, 28, 28, 24, 21, 21, 21, 21, 21, 21, 33, 33, + 33, 33, 33, 30, 28, 28, 28, 24, 22, 22, 22, 21, 21, 21, 33, 33, 33, 33, + 33, 30, 
27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, + 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, 27, 27, + 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 32, 32, 32, 29, 26, 26, 26, 24, + 22, 22, 22, 22, 22, 22, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, + 22, 23, 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, + 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, 23, 23, + 32, 31, 30, 30, 30, 28, 25, 25, 25, 23, 22, 22, 22, 22, 23, 23, 31, 30, + 28, 28, 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, + 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, 28, 26, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, 27, 25, 23, 23, + 23, 22, 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, + 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, + 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 26, 26, 25, 25, 25, 23, 22, 22, 22, 21, 21, 21, 21, 21, 22, 22, + 24, 24, 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, + 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, 24, 24, + 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 23, 23, 23, 23, 23, 22, + 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 21, 21, + 21, 20, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, + 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, + 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, + 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, + 23, 22, 22, 22, 22, 21, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, 23, 23, + 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 21, 22, 23, 23, 23, 23, 22, 22, + 22, 21, 19, 19, 19, 18, 18, 18, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, + 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, + 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, + 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, + 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30, + 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, + 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, + 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 21, + 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 21, 22, 
22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 21, 21, + 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, + 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22, + 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, + 19, 19, 18, 18, 18, 18, 18, 18, + /* Size 4x16 */ + 33, 31, 24, 21, 33, 30, 24, 22, 33, 30, 24, 22, 33, 29, 24, 23, 33, 29, + 24, 23, 30, 26, 23, 22, 30, 26, 23, 22, 27, 24, 22, 22, 27, 24, 22, 22, + 24, 23, 21, 20, 24, 23, 21, 20, 21, 22, 20, 19, 21, 22, 20, 19, 22, 22, + 20, 19, 22, 22, 20, 19, 22, 23, 21, 18, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 30, 30, 27, 27, 24, 24, 21, 21, 22, 22, 22, 31, 30, + 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 24, 24, 24, 24, + 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 22, 22, 23, 23, 22, + 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, + /* Size 8x32 */ + 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 28, 28, 22, 22, 21, 33, 33, + 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, + 27, 22, 22, 22, 33, 32, 32, 26, 26, 22, 22, 22, 34, 32, 32, 26, 26, 22, + 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, + 32, 30, 30, 25, 25, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, + 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, 22, 22, 29, 27, 27, 23, + 23, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, + 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 26, 25, 25, 22, 22, 21, 21, 22, + 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, + 24, 22, 22, 20, 20, 21, 23, 23, 23, 22, 22, 20, 20, 20, 21, 22, 22, 21, + 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, + 19, 19, 21, 22, 22, 22, 22, 19, 19, 19, 21, 22, 22, 22, 22, 19, 19, 18, + 21, 22, 22, 22, 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, + 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, + 22, 19, 19, 18, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26, + 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, + 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, + 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, + 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 18, 18, + 18, 18, 18, 18 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 32, 32, 32, 29, 32, 32, 31, 29, 32, 31, 29, 27, 29, 29, 27, 22, + /* Size 8x8 */ + 33, 33, 33, 32, 32, 32, 30, 29, 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, + 32, 32, 32, 31, 31, 30, 32, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 30, + 29, 29, 28, 27, 32, 31, 31, 30, 29, 28, 27, 26, 30, 
30, 31, 29, 28, 27, + 26, 24, 29, 29, 30, 28, 27, 26, 24, 21, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, + 29, 29, 28, 28, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 28, + 28, 28, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, 28, 28, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 31, 31, 31, 31, + 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 30, 30, 30, 30, 31, 31, + 29, 29, 28, 28, 28, 26, 26, 25, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, + 28, 28, 28, 26, 25, 24, 23, 23, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, + 27, 24, 24, 23, 21, 21, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, + 24, 23, 21, 21, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, + 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, + 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, + 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, + 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, + 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, + 28, 27, 27, 27, 27, 26, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 26, + 26, 25, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, + 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 30, 30, + 30, 30, 30, 30, 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, + 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, + 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, + 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, + 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, + 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, + 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 23, 23, 23, 23, 23, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28, + 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 21, 28, 29, 29, 29, + 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, + 24, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, + 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, + 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, 23, 22, 21, 21, + 21, 21, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 28, 28, 28, 28, 28, + 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 21, 21, 21, 21, 20, + /* Size 4x8 */ + 33, 33, 32, 29, 32, 32, 32, 29, 32, 32, 31, 30, 32, 32, 30, 28, 32, 31, + 29, 27, 31, 31, 28, 26, 30, 30, 28, 24, 29, 30, 27, 21, + /* Size 8x4 */ + 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, 32, 32, + 31, 30, 29, 28, 28, 27, 29, 29, 30, 28, 27, 26, 24, 21, + /* Size 8x16 */ + 32, 33, 33, 33, 32, 32, 29, 28, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, + 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, + 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 31, 30, 30, + 29, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, + 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 31, 31, + 31, 29, 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, + 28, 28, 24, 23, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, + 22, 21, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, + 31, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 30, 30, + 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, + 29, 28, 28, 28, 27, 27, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 22, 22, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 25, 24, 23, + 21, 21, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 29, 29, 29, 29, 33, 
32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, + 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 30, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, + 31, 31, 30, 30, 30, 30, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 30, 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, + 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, + 28, 28, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 28, 27, 27, 27, + 32, 32, 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, + 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, 32, 31, + 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 31, 31, 31, 31, 31, + 30, 29, 28, 28, 28, 28, 26, 26, 26, 26, 31, 31, 31, 31, 31, 31, 29, 28, + 28, 28, 28, 27, 26, 25, 25, 25, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, + 28, 26, 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, + 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 24, 23, 23, 23, + 29, 29, 30, 30, 30, 30, 28, 28, 27, 27, 27, 25, 23, 22, 22, 22, 28, 29, + 29, 30, 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, + 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, 30, 30, + 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 28, 28, 28, 28, 28, 28, 27, + 26, 26, 26, 24, 22, 21, 21, 21, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, + 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, + 30, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, + 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, + 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, + 26, 25, 24, 24, 24, 24, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, + 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 24, 23, 22, 22, + 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 
28, 28, 28, + 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, + 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, + 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, + 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, + 24, 24, 23, 22, 21, 21, 21, 21, + /* Size 4x16 */ + 33, 33, 32, 28, 33, 32, 32, 29, 32, 32, 32, 29, 32, 32, 32, 29, 32, 32, + 31, 30, 32, 32, 31, 30, 32, 32, 30, 28, 32, 32, 30, 28, 32, 32, 30, 28, + 32, 31, 29, 27, 32, 31, 29, 27, 31, 31, 28, 25, 30, 30, 28, 24, 30, 30, + 28, 23, 29, 30, 27, 21, 29, 30, 27, 21, + /* Size 16x4 */ + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 32, 32, 32, 32, + 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 28, 29, 29, 29, 30, 30, + 28, 28, 28, 27, 27, 25, 24, 23, 21, 21, + /* Size 8x32 */ + 32, 33, 33, 33, 32, 32, 29, 28, 33, 33, 33, 32, 32, 32, 29, 29, 33, 32, + 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, + 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, + 30, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, + 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, + 32, 31, 31, 31, 29, 29, 33, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 31, + 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, + 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 31, 31, 29, 29, 28, 27, + 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, + 31, 30, 29, 29, 28, 27, 32, 31, 31, 30, 28, 28, 26, 26, 31, 31, 31, 29, + 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, + 25, 24, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, 24, 23, + 29, 30, 30, 28, 27, 27, 23, 22, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, + 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, 22, 21, 28, 28, 28, 28, + 26, 26, 22, 21, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, + 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 29, 29, 29, 29, 29, 29, + 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, + 25, 25, 24, 23, 22, 22, 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, + 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, + 21, 21, 21, 21 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 32, 27, 22, 32, 30, 25, 22, 27, 25, 22, 22, 22, 22, 22, 20, + /* Size 8x8 */ + 33, 33, 34, 30, 28, 26, 24, 21, 33, 33, 33, 30, 28, 26, 24, 22, 34, 33, + 32, 29, 26, 25, 24, 22, 30, 30, 29, 26, 24, 23, 23, 22, 28, 28, 26, 24, + 22, 22, 22, 22, 26, 26, 25, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 21, + 21, 20, 21, 22, 22, 22, 22, 21, 20, 19, + /* Size 16x16 */ + 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 33, 33, + 33, 33, 33, 33, 31, 30, 
28, 27, 27, 25, 24, 23, 21, 21, 33, 33, 33, 33, + 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 22, 22, 33, 33, 33, 33, 33, 33, + 30, 29, 28, 26, 26, 25, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, + 28, 26, 26, 24, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, + 26, 24, 24, 23, 22, 22, 31, 31, 30, 30, 30, 30, 28, 27, 26, 24, 24, 23, + 23, 23, 22, 22, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, 24, 23, 23, 22, + 22, 22, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, 22, 22, + 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, + 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 26, 25, 25, 25, + 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, 25, 24, 24, 24, 24, 24, + 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 19, 19, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 30, 28, + 28, 28, 28, 27, 26, 25, 25, 25, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 29, 28, 28, 28, 28, 26, + 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 31, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, + 23, 22, 21, 21, 21, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, + 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, + 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, + 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 25, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 22, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, + 29, 29, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, + 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, + 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 34, 34, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, 26, 26, 26, 25, + 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 33, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 30, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, + 28, 27, 27, 27, 26, 25, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, + 22, 22, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, + 26, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, + 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, + 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, 30, 30, 30, 30, + 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, 23, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 27, 26, 26, 26, 26, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, 26, 26, 25, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 
22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, + 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 22, 27, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24, + 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 26, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, + 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 21, 25, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, + 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, + 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, + 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, + 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, + 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 19, 19, + 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, + /* Size 4x8 */ + 33, 33, 28, 21, 33, 33, 27, 22, 33, 32, 26, 22, 30, 28, 24, 22, 28, 26, + 22, 22, 26, 25, 22, 21, 24, 24, 22, 20, 21, 22, 21, 19, + /* Size 8x4 */ + 33, 33, 33, 30, 28, 26, 24, 21, 33, 33, 32, 28, 26, 25, 24, 22, 28, 27, + 26, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 21, 20, 19, + /* Size 8x16 */ + 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, + 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, + 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 31, 30, 29, 28, 24, 24, + 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, + 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 25, 24, + 24, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, + 22, 22, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, + 20, 19, + /* Size 16x8 */ + 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 33, 33, + 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 33, 33, 33, 32, + 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 31, 30, 30, 30, 29, 29, + 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 28, 27, 27, 26, 26, 26, 24, 24, + 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, + 22, 22, 22, 22, 21, 21, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, + 19, 19, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 26, 23, 21, 21, 21, 33, 33, + 33, 33, 33, 33, 31, 28, 28, 28, 28, 25, 23, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, + 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, + 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, 27, 27, + 27, 25, 23, 22, 22, 22, 33, 33, 33, 32, 32, 32, 30, 28, 26, 26, 26, 25, + 23, 22, 22, 22, 34, 33, 33, 32, 32, 32, 30, 27, 26, 26, 26, 24, 23, 22, 
+ 22, 22, 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, + 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 34, 33, + 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 33, 32, 31, 31, + 31, 31, 28, 26, 25, 25, 25, 24, 23, 22, 22, 22, 31, 30, 30, 29, 29, 29, + 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, + 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, + 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, 24, 23, + 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 26, 24, 23, 23, 23, 23, 22, 22, + 22, 22, 28, 28, 27, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, + 28, 27, 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, + 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, + 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 26, 26, 26, 25, 25, 25, + 24, 22, 22, 22, 22, 21, 21, 21, 21, 21, 25, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, + 22, 21, 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, + 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 20, 20, 20, 20, + 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, + 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, + 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 21, 20, 19, 19, 19, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28, + 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, + 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, + 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, + 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, + 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, + 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, + 30, 30, 29, 29, 29, 28, 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, + 23, 23, 23, 22, 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, + 27, 26, 26, 25, 25, 25, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, + 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 26, 25, 25, 25, 25, 25, 25, 24, + 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, + 21, 21, 20, 20, 20, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, + 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 21, 21, 20, 20, 
20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, + 20, 20, 20, 20, 19, 19, 19, 19, + /* Size 4x16 */ + 33, 33, 28, 21, 33, 33, 27, 22, 33, 33, 27, 22, 33, 32, 26, 22, 33, 32, + 26, 22, 33, 32, 26, 22, 30, 29, 24, 22, 30, 28, 24, 22, 28, 28, 23, 22, + 27, 26, 22, 22, 27, 26, 22, 22, 25, 24, 22, 21, 24, 24, 22, 20, 23, 23, + 22, 20, 21, 22, 21, 19, 21, 22, 21, 19, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33, + 33, 32, 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 28, 27, 27, 26, + 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, + /* Size 8x32 */ + 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 31, 28, 28, 23, 21, 33, 33, + 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, + 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, + 23, 22, 34, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, + 34, 32, 32, 29, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 33, 31, + 31, 28, 25, 25, 23, 22, 31, 30, 29, 28, 24, 24, 22, 22, 31, 29, 28, 27, + 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, + 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, 28, 27, 26, 24, 22, 22, 22, 22, + 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, + 26, 24, 22, 22, 22, 22, 26, 26, 25, 24, 22, 22, 21, 21, 25, 24, 24, 23, + 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 24, 24, 24, 23, 22, 22, + 21, 20, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, 22, 22, 20, 20, + 22, 22, 22, 22, 21, 21, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, + 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, + 22, 22, 20, 19, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28, + 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, + 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, + 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, + 22, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, + 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, + 21, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, + 19, 19, 19, 19 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 31, 30, 32, 31, 30, 29, + /* Size 8x8 */ + 33, 33, 33, 33, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, + 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, + 31, 31, 30, 29, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, 30, 30, + 29, 28, 31, 31, 31, 31, 29, 29, 28, 27, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 
32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 29, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 33, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 30, 29, 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 29, 28, 28, 28, 27, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, + 28, 28, 27, 26, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 30, 30, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, + 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 29, 
29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, + 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, + 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 28, 27, 27, 27, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, + 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, + 26, 26, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, + 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, + /* Size 4x8 */ + 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, + 31, 30, 32, 32, 30, 30, 32, 31, 30, 29, 31, 31, 29, 28, + /* Size 8x4 */ + 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, + 32, 32, 31, 30, 30, 29, 32, 32, 32, 31, 30, 30, 29, 28, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, + 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, + 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, + 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 30, 30, 30, + 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, + 32, 32, 31, 29, 29, 29, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, + 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, + 28, 27, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, + 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, + 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, + 28, 27, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 33, 32, + 32, 32, 32, 32, 32, 32, 
32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, + 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 29, 29, 29, 29, 29, 28, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, + 29, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, + 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, + 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, + 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 27, 30, 30, 30, 30, 30, 30, + 30, 30, 29, 28, 28, 28, 28, 28, 27, 26, 30, 30, 30, 30, 30, 30, 30, 30, + 29, 28, 28, 28, 28, 28, 27, 26, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, + 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, + 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, + 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, + 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 27, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, + 28, 28, 28, 28, 27, 27, 26, 26, + /* Size 4x16 
*/ + 33, 33, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 31, 31, 32, 32, 31, 30, + 32, 32, 31, 30, 32, 32, 31, 30, 32, 32, 30, 29, 32, 31, 30, 29, 32, 31, + 30, 29, 31, 31, 29, 28, 30, 30, 28, 28, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 28, 32, 32, 32, 32, 32, 31, + 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 31, 33, 33, + 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, + 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, + 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, + 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, + 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, + 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 31, + 31, 30, 33, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, + 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, + 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, + 31, 29, 29, 29, 32, 32, 31, 31, 31, 29, 29, 28, 32, 32, 31, 31, 30, 29, + 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, + 32, 32, 31, 31, 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 31, 31, + 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, 28, 27, 30, 30, 30, 30, + 29, 28, 28, 27, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, + 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, + 28, 28, 27, 27 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 33, 30, 27, 33, 32, 29, 26, 30, 29, 26, 24, 27, 26, 24, 22, + /* Size 8x8 */ + 33, 33, 33, 34, 30, 29, 28, 26, 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, + 33, 33, 29, 28, 26, 25, 34, 33, 33, 32, 29, 28, 26, 24, 30, 30, 29, 29, + 26, 26, 24, 23, 29, 29, 28, 28, 26, 25, 23, 23, 28, 27, 26, 26, 24, 23, + 22, 22, 26, 25, 25, 24, 23, 23, 22, 21, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, + 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 29, 29, 28, 26, 26, 26, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, + 29, 27, 26, 26, 25, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, + 26, 26, 25, 
24, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, + 25, 24, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, 24, 23, + 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 31, 30, + 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 29, 28, 28, 28, + 28, 27, 27, 27, 25, 25, 25, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, + 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, + 24, 24, 24, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, + 23, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, + 22, 22, 21, 21, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, + 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25, 25, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 32, 31, 30, 30, 30, 30, 29, + 28, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, + 28, 27, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 27, 26, 25, + 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, + 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, + 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 29, 28, 27, + 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 27, 26, + 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, + 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, + 29, 29, 29, 28, 28, 26, 26, 26, 26, 26, 26, 25, 24, 24, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, + 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, + 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, + 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, + 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, + 28, 28, 27, 26, 26, 26, 26, 25, 25, 24, 24, 24, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25, + 25, 25, 25, 24, 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, + 24, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, + 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, + 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, + 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, + 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, + 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, + 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, + 24, 24, 23, 23, 23, 23, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, + 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 
27, 27, 27, 27, 26, + 25, 25, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, + 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, + 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, + 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 28, 28, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, + 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, + 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 26, 26, 26, 25, 25, 25, 25, 25, + 25, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, + 22, 22, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, + 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, + /* Size 4x8 */ + 33, 33, 29, 28, 33, 33, 28, 27, 33, 32, 28, 26, 33, 32, 28, 26, 30, 28, + 26, 24, 29, 28, 24, 23, 27, 26, 23, 22, 25, 24, 23, 22, + /* Size 8x4 */ + 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, 32, 32, 28, 28, 26, 24, 29, 28, + 28, 28, 26, 24, 23, 23, 28, 27, 26, 26, 24, 23, 22, 22, + /* Size 8x16 */ + 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 27, 27, 26, 33, 33, + 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, + 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, + 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 31, 30, 29, 29, 28, 24, 24, 24, + 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 29, 28, + 27, 27, 25, 23, 23, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, + 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 24, 24, 24, 24, 23, 22, + 22, 21, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 33, 33, + 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 33, 33, 33, 33, + 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 33, 33, 33, 33, 32, 32, + 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 31, 31, 30, 30, 30, 29, 29, 29, + 28, 27, 27, 25, 24, 24, 24, 23, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, + 24, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, + 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, + 22, 21, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 27, 24, 33, 33, + 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 26, 24, 33, 33, 33, 33, + 33, 33, 33, 32, 31, 29, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, + 33, 32, 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, + 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, + 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, + 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, 27, 27, + 26, 24, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, + 34, 33, 33, 32, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, 34, 33, + 33, 32, 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, + 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32, + 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 
32, 32, 31, + 29, 28, 26, 26, 26, 26, 25, 24, 33, 33, 32, 32, 31, 31, 31, 31, 29, 27, + 26, 26, 26, 26, 25, 24, 32, 32, 31, 31, 30, 30, 30, 30, 28, 26, 25, 25, + 25, 25, 24, 23, 31, 31, 30, 29, 29, 29, 29, 29, 28, 26, 24, 24, 24, 24, + 24, 23, 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, + 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, + 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, 29, 29, + 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 30, 29, 28, 28, 28, 28, + 28, 28, 26, 24, 23, 23, 23, 23, 23, 23, 29, 28, 28, 27, 27, 27, 27, 26, + 25, 24, 23, 23, 23, 23, 22, 22, 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, + 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, + 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, + 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, + 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, 26, 26, + 26, 25, 25, 25, 25, 24, 24, 23, 22, 22, 22, 22, 22, 21, 26, 25, 25, 24, + 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 21, 24, 24, 24, 24, 24, 24, + 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, 24, 24, + 23, 22, 22, 22, 22, 22, 21, 21, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, + 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, + 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, + 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 31, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, + 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, + 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, + 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, + 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 25, + 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, + 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, + 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, + 26, 26, 26, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, 28, 28, 27, 27, + 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, + 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, + 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, + 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, + 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, + 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, + 22, 22, 22, 22, 21, 21, 21, 21, + /* Size 4x16 */ + 33, 33, 29, 28, 33, 33, 29, 27, 33, 33, 28, 27, 33, 33, 28, 27, 33, 32, + 28, 26, 33, 32, 28, 26, 33, 32, 28, 26, 33, 31, 27, 26, 31, 29, 26, 24, + 30, 28, 26, 24, 30, 28, 26, 24, 
28, 27, 24, 23, 27, 26, 23, 22, 27, 26, + 23, 22, 26, 25, 23, 22, 24, 24, 22, 22, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 27, 27, 26, 24, 33, 33, + 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 29, 29, 28, 28, + 28, 28, 28, 27, 26, 26, 26, 24, 23, 23, 23, 22, 28, 27, 27, 27, 26, 26, + 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, + /* Size 8x32 */ + 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 28, 28, 26, 33, 33, + 33, 33, 31, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, + 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, + 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, 30, 26, 26, 26, + 34, 33, 32, 32, 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, + 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, + 29, 26, 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 32, 31, 30, 30, 28, 25, + 25, 24, 31, 30, 29, 29, 28, 24, 24, 24, 31, 29, 28, 28, 27, 24, 24, 23, + 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, + 28, 28, 27, 24, 24, 23, 30, 28, 28, 28, 26, 23, 23, 23, 29, 28, 27, 27, + 25, 23, 23, 22, 28, 27, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, + 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, + 28, 26, 26, 26, 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 26, 25, + 24, 24, 23, 22, 22, 22, 24, 24, 24, 24, 23, 22, 22, 21, 24, 24, 24, 24, + 23, 22, 22, 21, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31, + 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, + 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, + 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, + 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, + 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 28, 28, + 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, + 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, + 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, + 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, + 22, 22, 21, 21 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, + /* Size 8x8 */ + 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 
32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 
32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + /* Size 4x8 */ + 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, + /* Size 8x4 */ + 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, + 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, + 31, 30, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 30, 30, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 
33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 31, 31, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, + 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 31, 30, 30, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 30, 30, 30, 30, 30, 30, 30, 30, + /* Size 4x16 */ + 33, 33, 33, 32, 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, + 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, + 
32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33, + 33, 33, 33, 33, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, + 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, + 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, + 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, + 31, 31, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, + 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, + 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, + 32, 32, 31, 30, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, + 30, 30, 30, 30 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 32, 29, 30, 29, 29, 26, + /* Size 8x8 */ + 33, 33, 33, 33, 34, 33, 31, 31, 33, 33, 33, 33, 33, 32, 30, 30, 33, 33, + 33, 33, 33, 32, 30, 30, 33, 33, 33, 33, 33, 32, 29, 29, 34, 33, 33, 33, + 32, 32, 29, 29, 33, 32, 32, 32, 32, 31, 28, 28, 31, 30, 30, 29, 29, 28, + 26, 26, 31, 30, 30, 29, 29, 28, 26, 26, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, + 30, 29, 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, + 29, 29, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, + 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 34, 34, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 32, 32, 32, + 32, 32, 31, 31, 31, 31, 31, 30, 28, 28, 28, 28, 31, 31, 31, 30, 30, 30, + 30, 
30, 30, 30, 30, 28, 28, 27, 27, 27, 31, 30, 30, 30, 30, 30, 29, 29, + 29, 29, 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, + 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, + 27, 26, 26, 26, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 33, + 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 32, 32, 31, 30, + 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, + 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, 30, 29, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, + 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, + 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 29, + 29, 29, 29, 29, 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, + 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, + 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, + 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, + 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, + 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 28, 28, 28, 28, 28, + 28, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 32, 32, + 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, + 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 31, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28, + 28, 27, 27, 27, 27, 27, 27, 26, 31, 31, 30, 
30, 30, 30, 30, 30, 30, 30, + 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, + 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, + 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, + 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, + 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, + 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, + 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, + 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, + 26, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26, + /* Size 4x8 */ + 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, + 32, 28, 33, 31, 31, 28, 30, 28, 28, 26, 30, 28, 28, 26, + /* Size 8x4 */ + 33, 33, 33, 33, 33, 33, 30, 30, 33, 33, 33, 32, 32, 31, 28, 28, 33, 33, + 33, 32, 32, 31, 28, 28, 30, 29, 29, 28, 28, 28, 26, 26, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, + 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, + 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, + 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, 29, 27, + 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 33, 32, + 31, 31, 31, 31, 28, 26, 31, 30, 30, 29, 29, 29, 28, 26, 31, 30, 29, 28, + 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, + 27, 25, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 31, 29, 28, 28, 28, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, + 28, 27, 27, 27, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 25, + 25, 25, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 28, 28, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 31, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, + 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, + 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 31, 30, 28, 28, 26, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 31, 30, 28, 28, 26, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 31, 30, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, + 29, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, + 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, + 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 
28, 27, 26, 34, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 33, 33, 33, 32, 32, 31, + 31, 31, 31, 31, 31, 30, 29, 28, 27, 26, 33, 32, 32, 31, 31, 31, 31, 31, + 31, 31, 31, 29, 28, 28, 26, 25, 32, 32, 31, 31, 30, 30, 30, 30, 30, 30, + 30, 29, 28, 27, 26, 25, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, + 28, 26, 26, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, + 25, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, + 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, + 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, + 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, 29, 28, + 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 30, 30, 29, 29, 28, 28, 28, 28, + 28, 28, 28, 27, 26, 26, 24, 23, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, + 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, + 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, + 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, + 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, + 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, + 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27, + 27, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26, 29, 29, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, + 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 25, 24, 28, 28, 28, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, + 24, 24, 24, 24, 24, 24, 24, 23, + /* Size 4x16 */ + 33, 33, 33, 30, 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 33, + 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, 32, 28, 33, 32, 32, 28, + 33, 32, 32, 28, 33, 32, 32, 28, 32, 31, 31, 28, 31, 29, 29, 26, 30, 28, + 28, 26, 30, 28, 28, 26, 30, 28, 28, 26, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 30, 30, 29, 29, 29, 29, + 28, 28, 28, 28, 28, 28, 26, 26, 26, 
26, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 29, 33, 33, + 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, + 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, + 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, + 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, + 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, 30, 28, 33, 33, 33, 32, + 32, 32, 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, + 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, + 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, + 32, 32, 32, 32, 29, 27, 33, 33, 32, 31, 31, 31, 29, 27, 33, 32, 31, 31, + 31, 31, 28, 26, 32, 31, 30, 30, 30, 30, 28, 26, 31, 30, 30, 29, 29, 29, + 28, 26, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, + 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, + 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 30, 29, 28, 28, + 28, 28, 26, 24, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, + 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, + 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 31, 31, 31, 31, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, + 28, 27, 27, 27, 27, 27, 27, 26, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 25, 25, 25, + 25, 25, 25, 24 }, + }, + { + { /* Luma */ + /* Size 4x4 */ + 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + /* Size 8x8 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 
33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x8 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + /* Size 8x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, + 32, 
32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, + 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 
33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, + 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, + 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32 }, + { /* Chroma */ + /* Size 4x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + /* Size 8x8 */ + 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, + /* Size 16x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, + /* Size 32x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 
34, 34, 34, 34, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, + 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 34, 34, 34, 34, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + /* Size 4x8 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 34, 33, 32, 32, + /* Size 8x4 */ + 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, + /* Size 8x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, + 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 32, 32, + 32, 32, + /* Size 16x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, + /* Size 16x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 32, 32, + 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, + 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, + 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, 33, 33, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 32x16 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + /* Size 4x16 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, + 32, 32, 33, 33, 32, 32, 34, 33, 32, 32, + /* Size 16x4 */ + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, + /* Size 8x32 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 
33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, + 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, + 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, + 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, + 32, 32, 32, 32, + /* Size 32x8 */ + 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, + 32, 32, 32, 32 }, + }, +}; diff --git a/libs/libaom/src/av1/common/quant_common.h b/libs/libaom/src/av1/common/quant_common.h new file mode 100644 index 000000000..9c30204ff --- /dev/null +++ b/libs/libaom/src/av1/common/quant_common.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_
+#define AOM_AV1_COMMON_QUANT_COMMON_H_
+
+#include <stdbool.h>
+#include "aom/aom_codec.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/entropy.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0
+#define MAXQ 255
+#define QINDEX_RANGE (MAXQ - MINQ + 1)
+#define QINDEX_BITS 8
+// Total number of QM sets stored
+#define QM_LEVEL_BITS 4
+#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
+/* The range of QM sets runs from the first to the last value, with an offset
+ * applied to inter blocks */
+#define DEFAULT_QM_Y 10
+#define DEFAULT_QM_U 11
+#define DEFAULT_QM_V 12
+#define DEFAULT_QM_FIRST 5
+#define DEFAULT_QM_LAST 9
+
+struct AV1Common;
+struct CommonQuantParams;
+struct macroblockd;
+
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex);
+
+// Returns true if we are using a quantization matrix.
+bool av1_use_qmatrix(const struct CommonQuantParams *quant_params,
+                     const struct macroblockd *xd, int segment_id);
+
+// Reduce the large number of quantizers to a smaller number of levels for
+// which different matrices may be defined.
+static INLINE int aom_get_qmlevel(int qindex, int first, int last) {
+  return first + (qindex * (last + 1 - first)) / QINDEX_RANGE;
+}
+
+// Initialize all global quant/dequant matrices.
+void av1_qm_init(struct CommonQuantParams *quant_params, int num_planes);
+
+// Get global dequant matrix.
+const qm_val_t *av1_iqmatrix(const struct CommonQuantParams *quant_params,
+                             int qmlevel, int plane, TX_SIZE tx_size);
+// Get global quant matrix.
+const qm_val_t *av1_qmatrix(const struct CommonQuantParams *quant_params,
+                            int qmlevel, int plane, TX_SIZE tx_size);
+
+// Get either the local or the global dequant matrix, as appropriate.
+const qm_val_t *av1_get_iqmatrix(const struct CommonQuantParams *quant_params,
+                                 const struct macroblockd *xd, int plane,
+                                 TX_SIZE tx_size, TX_TYPE tx_type);
+// Get either the local or the global quant matrix, as appropriate.
+const qm_val_t *av1_get_qmatrix(const struct CommonQuantParams *quant_params,
+                                const struct macroblockd *xd, int plane,
+                                TX_SIZE tx_size, TX_TYPE tx_type);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_QUANT_COMMON_H_
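To see what the QM-level mapping in this header works out to in practice, here is a small standalone sketch (illustration only, not part of the patch; it restates aom_get_qmlevel() with the header's default constants hard-coded):

#include <stdio.h>

/* Constants copied from quant_common.h above; get_qmlevel() mirrors the
 * body of aom_get_qmlevel(). */
#define QINDEX_RANGE 256
#define DEFAULT_QM_FIRST 5
#define DEFAULT_QM_LAST 9

static int get_qmlevel(int qindex, int first, int last) {
  /* Linear map of qindex in [0, 255] onto qmlevel in [first, last]. */
  return first + (qindex * (last + 1 - first)) / QINDEX_RANGE;
}

int main(void) {
  /* The default range yields five bands of 51-52 qindex values each:
   * 0..51 -> 5, 52..102 -> 6, 103..153 -> 7, 154..204 -> 8, 205..255 -> 9. */
  for (int qindex = 0; qindex <= 255; qindex += 51) {
    printf("qindex %3d -> qmlevel %d\n", qindex,
           get_qmlevel(qindex, DEFAULT_QM_FIRST, DEFAULT_QM_LAST));
  }
  return 0;
}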
diff --git a/libs/libaom/src/av1/common/reconinter.c b/libs/libaom/src/av1/common/reconinter.c
new file mode 100644
index 000000000..287adddcc
--- /dev/null
+++ b/libs/libaom/src/av1/common/reconinter.c
@@ -0,0 +1,1426 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+// This function will determine whether or not to create a warped
+// prediction.
+int av1_allow_warp(const MB_MODE_INFO *const mbmi,
+                   const WarpTypesAllowed *const warp_types,
+                   const WarpedMotionParams *const gm_params,
+                   int build_for_obmc, const struct scale_factors *const sf,
+                   WarpedMotionParams *final_warp_params) {
+  // Note: As per the spec, we must test the fixed point scales here, which are
+  // at a higher precision (1 << 14) than the xs and ys in subpel_params (that
+  // have 1 << 10 precision).
+  if (av1_is_scaled(sf)) return 0;
+
+  if (final_warp_params != NULL) *final_warp_params = default_warp_params;
+
+  if (build_for_obmc) return 0;
+
+  if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) {
+    if (final_warp_params != NULL)
+      memcpy(final_warp_params, &mbmi->wm_params, sizeof(*final_warp_params));
+    return 1;
+  } else if (warp_types->global_warp_allowed && !gm_params->invalid) {
+    if (final_warp_params != NULL)
+      memcpy(final_warp_params, gm_params, sizeof(*final_warp_params));
+    return 1;
+  }
+
+  return 0;
+}
+
+void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width,
+                           int block_height, int pix_row, int pix_col,
+                           int subsampling_x, int subsampling_y, int bit_depth,
+                           int use_hbd_buf, int is_intrabc,
+                           const struct scale_factors *sf,
+                           const struct buf_2d *ref_buf,
+                           int_interpfilters interp_filters) {
+  inter_pred_params->block_width = block_width;
+  inter_pred_params->block_height = block_height;
+  inter_pred_params->pix_row = pix_row;
+  inter_pred_params->pix_col = pix_col;
+  inter_pred_params->subsampling_x = subsampling_x;
+  inter_pred_params->subsampling_y = subsampling_y;
+  inter_pred_params->bit_depth = bit_depth;
+  inter_pred_params->use_hbd_buf = use_hbd_buf;
+  inter_pred_params->is_intrabc = is_intrabc;
+  inter_pred_params->scale_factors = sf;
+  inter_pred_params->ref_frame_buf = *ref_buf;
+  inter_pred_params->mode = TRANSLATION_PRED;
+  inter_pred_params->comp_mode = UNIFORM_SINGLE;
+
+  if (is_intrabc) {
+    inter_pred_params->interp_filter_params[0] = &av1_intrabc_filter_params;
+    inter_pred_params->interp_filter_params[1] = &av1_intrabc_filter_params;
+  } else {
+    inter_pred_params->interp_filter_params[0] =
+        av1_get_interp_filter_params_with_block_size(
+            interp_filters.as_filters.x_filter, block_width);
+    inter_pred_params->interp_filter_params[1] =
+        av1_get_interp_filter_params_with_block_size(
+            interp_filters.as_filters.y_filter, block_height);
+  }
+}
+
+void av1_init_comp_mode(InterPredParams *inter_pred_params) {
+  inter_pred_params->comp_mode = UNIFORM_COMP;
+}
+
+void av1_init_warp_params(InterPredParams *inter_pred_params,
+                          const WarpTypesAllowed *warp_types, int ref,
+                          const MACROBLOCKD *xd, const MB_MODE_INFO *mi) {
+  if (inter_pred_params->block_height < 8 ||
+      inter_pred_params->block_width < 8)
+    return;
+
+  if (xd->cur_frame_force_integer_mv) return;
+
+  if (av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0,
+                     inter_pred_params->scale_factors,
+                     &inter_pred_params->warp_params))
+    inter_pred_params->mode = WARP_PRED;
+}
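Note the ordering above: av1_init_inter_params() always starts a block in TRANSLATION_PRED mode, and av1_init_warp_params() upgrades it to WARP_PRED only when av1_allow_warp() accepts. The acceptance order itself is fixed: a scaled reference or an OBMC build never warps, and a valid local (per-block) model takes precedence over a valid global (per-frame) model. A minimal standalone sketch of that precedence, using hypothetical stand-in types rather than libaom's (illustration only, not part of the patch):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for a warped-motion parameter set. */
struct warp_model { bool invalid; const char *name; };

/* Mirrors the decision order in av1_allow_warp(): no warp for scaled
 * references or OBMC builds; otherwise local beats global. */
static const struct warp_model *choose_warp(
    bool is_scaled, bool build_for_obmc,
    bool local_allowed, const struct warp_model *local,
    bool global_allowed, const struct warp_model *global) {
  if (is_scaled || build_for_obmc) return NULL;
  if (local_allowed && !local->invalid) return local;
  if (global_allowed && !global->invalid) return global;
  return NULL;  /* fall back to translational prediction */
}

int main(void) {
  const struct warp_model local = { false, "local" };
  const struct warp_model global = { false, "global" };
  const struct warp_model *m =
      choose_warp(false, false, true, &local, true, &global);
  printf("%s\n", m ? m->name : "translation");  /* prints "local" */
  return 0;
}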
+void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize,
+                        const INTERINTER_COMPOUND_DATA *mask_comp) {
+  inter_pred_params->sb_type = bsize;
+  inter_pred_params->mask_comp = *mask_comp;
+
+  if (inter_pred_params->conv_params.compound_index == 1) {
+    inter_pred_params->conv_params.do_average = 0;
+    inter_pred_params->comp_mode = MASK_COMP;
+  }
+}
+
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+                              int dst_stride,
+                              InterPredParams *inter_pred_params,
+                              const SubpelParams *subpel_params) {
+  assert(IMPLIES(inter_pred_params->conv_params.is_compound,
+                 inter_pred_params->conv_params.dst != NULL));
+
+  // TODO(jingning): av1_warp_plane() can be further cleaned up.
+  if (inter_pred_params->mode == WARP_PRED) {
+    av1_warp_plane(
+        &inter_pred_params->warp_params, inter_pred_params->use_hbd_buf,
+        inter_pred_params->bit_depth, inter_pred_params->ref_frame_buf.buf0,
+        inter_pred_params->ref_frame_buf.width,
+        inter_pred_params->ref_frame_buf.height,
+        inter_pred_params->ref_frame_buf.stride, dst,
+        inter_pred_params->pix_col, inter_pred_params->pix_row,
+        inter_pred_params->block_width, inter_pred_params->block_height,
+        dst_stride, inter_pred_params->subsampling_x,
+        inter_pred_params->subsampling_y, &inter_pred_params->conv_params);
+  } else if (inter_pred_params->mode == TRANSLATION_PRED) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (inter_pred_params->use_hbd_buf) {
+      highbd_inter_predictor(
+          src, src_stride, dst, dst_stride, subpel_params,
+          inter_pred_params->scale_factors, inter_pred_params->block_width,
+          inter_pred_params->block_height, &inter_pred_params->conv_params,
+          inter_pred_params->interp_filter_params,
+          inter_pred_params->bit_depth);
+    } else {
+      inter_predictor(
+          src, src_stride, dst, dst_stride, subpel_params,
+          inter_pred_params->scale_factors, inter_pred_params->block_width,
+          inter_pred_params->block_height, &inter_pred_params->conv_params,
+          inter_pred_params->interp_filter_params);
+    }
+#else
+    inter_predictor(
+        src, src_stride, dst, dst_stride, subpel_params,
+        inter_pred_params->scale_factors, inter_pred_params->block_width,
+        inter_pred_params->block_height, &inter_pred_params->conv_params,
+        inter_pred_params->interp_filter_params);
+#endif
+  }
+}
+
+static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  6,  18,
+  37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+static const uint8_t wedge_master_oblique_even[MASK_MASTER_SIZE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  4,  11, 27,
+  46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  7,  21,
+  43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+
+static AOM_INLINE void shift_copy(const uint8_t *src, uint8_t *dst, int shift,
+                                  int width) {
+  if (shift >= 0) {
+    memcpy(dst + shift, src, width - shift);
+    memset(dst, src[0], shift);
+  } else {
+    shift = -shift;
+    memcpy(dst, src + shift, width - shift);
+    memset(dst + width - shift, src[width - 1], shift);
+  }
+}
+
+/* clang-format off */
+DECLARE_ALIGNED(16, static uint8_t,
                wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = {
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, },
+  { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, },
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
+};
+/* clang-format on */
+
+// [negative][direction]
+DECLARE_ALIGNED(
+    16, static uint8_t,
+    wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
+
+// 4 * MAX_WEDGE_SQUARE is an easy-to-compute and fairly tight upper bound
+// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
+DECLARE_ALIGNED(16, static uint8_t,
+                wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]);
+
+DECLARE_ALIGNED(16, static uint8_t,
+                smooth_interintra_mask_buf[INTERINTRA_MODES][BLOCK_SIZES_ALL]
+                                          [MAX_WEDGE_SQUARE]);
+
+static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2];
+
+static const wedge_code_type wedge_codebook_16_hgtw[16] = {
+  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
+  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+  { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
+  { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
+  { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
+  { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+  { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
+  { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_hltw[16] = {
+  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
+  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+  { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 4, 4 },
+  { WEDGE_VERTICAL, 6, 4 },   { WEDGE_HORIZONTAL, 4, 4 },
+  { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
+  { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+  { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
+  { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_heqw[16] = {
+  { WEDGE_OBLIQUE27, 4, 4 },  { WEDGE_OBLIQUE63, 4, 4 },
+  { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+  { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+  { WEDGE_VERTICAL, 2, 4 },   { WEDGE_VERTICAL, 6, 4 },
+  { WEDGE_OBLIQUE27, 4, 2 },  { WEDGE_OBLIQUE27, 4, 6 },
+  { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+  { WEDGE_OBLIQUE63, 2, 4 },  { WEDGE_OBLIQUE63, 6, 4 },
+  {
WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, +}; + +const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL] = { + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8], + wedge_masks[BLOCK_8X8] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16], + wedge_masks[BLOCK_8X16] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8], + wedge_masks[BLOCK_16X8] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16], + wedge_masks[BLOCK_16X16] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32], + wedge_masks[BLOCK_16X32] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16], + wedge_masks[BLOCK_32X16] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32], + wedge_masks[BLOCK_32X32] }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32], + wedge_masks[BLOCK_8X32] }, + { MAX_WEDGE_TYPES, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8], + wedge_masks[BLOCK_32X8] }, + { 0, NULL, NULL, NULL }, + { 0, NULL, NULL, NULL }, +}; + +static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg, + BLOCK_SIZE sb_type) { + const uint8_t *master; + const int bh = block_size_high[sb_type]; + const int bw = block_size_wide[sb_type]; + const wedge_code_type *a = + av1_wedge_params_lookup[sb_type].codebook + wedge_index; + int woff, hoff; + const uint8_t wsignflip = + av1_wedge_params_lookup[sb_type].signflip[wedge_index]; + + assert(wedge_index >= 0 && wedge_index < get_wedge_types_lookup(sb_type)); + woff = (a->x_offset * bw) >> 3; + hoff = (a->y_offset * bh) >> 3; + master = wedge_mask_obl[neg ^ wsignflip][a->direction] + + MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) + + MASK_MASTER_SIZE / 2 - woff; + return master; +} + +const uint8_t *av1_get_compound_type_mask( + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) { + assert(is_masked_compound_type(comp_data->type)); + (void)sb_type; + switch (comp_data->type) { + case COMPOUND_WEDGE: + return av1_get_contiguous_soft_mask(comp_data->wedge_index, + comp_data->wedge_sign, sb_type); + case COMPOUND_DIFFWTD: return comp_data->seg_mask; + default: assert(0); return NULL; + } +} + +static AOM_INLINE void diffwtd_mask_d16( + uint8_t *mask, int which_inverse, int mask_base, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + int round = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + int i, j, m, diff; + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + diff = abs(src0[i * src0_stride + j] - src1[i * src1_stride + j]); + diff = ROUND_POWER_OF_TWO(diff, round); + m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); + mask[i * w + j] = which_inverse ? 
AOM_BLEND_A64_MAX_ALPHA - m : m; + } + } +} + +void av1_build_compound_diffwtd_mask_d16_c( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + switch (mask_type) { + case DIFFWTD_38: + diffwtd_mask_d16(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w, + conv_params, bd); + break; + case DIFFWTD_38_INV: + diffwtd_mask_d16(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w, + conv_params, bd); + break; + default: assert(0); + } +} + +static AOM_INLINE void diffwtd_mask(uint8_t *mask, int which_inverse, + int mask_base, const uint8_t *src0, + int src0_stride, const uint8_t *src1, + int src1_stride, int h, int w) { + int i, j, m, diff; + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + diff = + abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]); + m = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA); + mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m; + } + } +} + +void av1_build_compound_diffwtd_mask_c(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w) { + switch (mask_type) { + case DIFFWTD_38: + diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w); + break; + case DIFFWTD_38_INV: + diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w); + break; + default: assert(0); + } +} + +static AOM_FORCE_INLINE void diffwtd_mask_highbd( + uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0, + int src0_stride, const uint16_t *src1, int src1_stride, int h, int w, + const unsigned int bd) { + assert(bd >= 8); + if (bd == 8) { + if (which_inverse) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = AOM_BLEND_A64_MAX_ALPHA - m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } + } else { + const unsigned int bd_shift = bd - 8; + if (which_inverse) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = + (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = AOM_BLEND_A64_MAX_ALPHA - m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int diff = + (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR; + unsigned int m = negative_to_zero(mask_base + diff); + m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA); + mask[j] = m; + } + src0 += src0_stride; + src1 += src1_stride; + mask += w; + } + } + } +} + +void av1_build_compound_diffwtd_mask_highbd_c( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + switch (mask_type) { + case DIFFWTD_38: + diffwtd_mask_highbd(mask, 0, 38, CONVERT_TO_SHORTPTR(src0), src0_stride, + 
CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); + break; + case DIFFWTD_38_INV: + diffwtd_mask_highbd(mask, 1, 38, CONVERT_TO_SHORTPTR(src0), src0_stride, + CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd); + break; + default: assert(0); + } +} + +static AOM_INLINE void init_wedge_master_masks() { + int i, j; + const int w = MASK_MASTER_SIZE; + const int h = MASK_MASTER_SIZE; + const int stride = MASK_MASTER_STRIDE; + // Note: index [0] stores the masters, and [1] its complement. + // Generate prototype by shifting the masters + int shift = h / 4; + for (i = 0; i < h; i += 2) { + shift_copy(wedge_master_oblique_even, + &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift, + MASK_MASTER_SIZE); + shift--; + shift_copy(wedge_master_oblique_odd, + &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift, + MASK_MASTER_SIZE); + memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride], + wedge_master_vertical, + MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); + memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride], + wedge_master_vertical, + MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0])); + } + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j]; + wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk; + wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = + wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - msk; + wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] = + wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - msk; + wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] = + wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk; + const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j]; + wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx; + wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] = + wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] = + (1 << WEDGE_WEIGHT_BITS) - mskx; + } + } +} + +static AOM_INLINE void init_wedge_masks() { + uint8_t *dst = wedge_mask_buf; + BLOCK_SIZE bsize; + memset(wedge_masks, 0, sizeof(wedge_masks)); + for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) { + const wedge_params_type *wedge_params = &av1_wedge_params_lookup[bsize]; + const int wtypes = wedge_params->wedge_types; + if (wtypes == 0) continue; + const uint8_t *mask; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + int w; + for (w = 0; w < wtypes; ++w) { + mask = get_wedge_mask_inplace(w, 0, bsize); + aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw, + bh); + wedge_params->masks[0][w] = dst; + dst += bw * bh; + + mask = get_wedge_mask_inplace(w, 1, bsize); + aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw, + bh); + wedge_params->masks[1][w] = dst; + dst += bw * bh; + } + assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf)); + } +} + +/* clang-format off */ +static const uint8_t ii_weights1d[MAX_SB_SIZE] = { + 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32, + 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16, + 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, + 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4, + 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +}; +static uint8_t 
ii_size_scales[BLOCK_SIZES_ALL] = { + 32, 16, 16, 16, 8, 8, 8, 4, + 4, 4, 2, 2, 2, 1, 1, 1, + 8, 8, 4, 4, 2, 2 +}; +/* clang-format on */ + +static AOM_INLINE void build_smooth_interintra_mask(uint8_t *mask, int stride, + BLOCK_SIZE plane_bsize, + INTERINTRA_MODE mode) { + int i, j; + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const int size_scale = ii_size_scales[plane_bsize]; + + switch (mode) { + case II_V_PRED: + for (i = 0; i < bh; ++i) { + memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0])); + mask += stride; + } + break; + + case II_H_PRED: + for (i = 0; i < bh; ++i) { + for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale]; + mask += stride; + } + break; + + case II_SMOOTH_PRED: + for (i = 0; i < bh; ++i) { + for (j = 0; j < bw; ++j) + mask[j] = ii_weights1d[(i < j ? i : j) * size_scale]; + mask += stride; + } + break; + + case II_DC_PRED: + default: + for (i = 0; i < bh; ++i) { + memset(mask, 32, bw * sizeof(mask[0])); + mask += stride; + } + break; + } +} + +static AOM_INLINE void init_smooth_interintra_masks() { + for (int m = 0; m < INTERINTRA_MODES; ++m) { + for (int bs = 0; bs < BLOCK_SIZES_ALL; ++bs) { + const int bw = block_size_wide[bs]; + const int bh = block_size_high[bs]; + if (bw > MAX_WEDGE_SIZE || bh > MAX_WEDGE_SIZE) continue; + build_smooth_interintra_mask(smooth_interintra_mask_buf[m][bs], bw, bs, + m); + } + } +} + +// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0 +void av1_init_wedge_masks() { + init_wedge_master_masks(); + init_wedge_masks(); + init_smooth_interintra_masks(); +} + +static AOM_INLINE void build_masked_compound_no_round( + uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w, InterPredParams *inter_pred_params) { + const int ssy = inter_pred_params->subsampling_y; + const int ssx = inter_pred_params->subsampling_x; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + const int mask_stride = block_size_wide[sb_type]; +#if CONFIG_AV1_HIGHBITDEPTH + if (inter_pred_params->use_hbd_buf) { + aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, + ssy, &inter_pred_params->conv_params, + inter_pred_params->bit_depth); + } else { + aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, ssy, + &inter_pred_params->conv_params); + } +#else + aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, ssx, ssy, + &inter_pred_params->conv_params); +#endif +} + +void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, + uint8_t *dst, int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params) { + const INTERINTER_COMPOUND_DATA *comp_data = &inter_pred_params->mask_comp; + BLOCK_SIZE sb_type = inter_pred_params->sb_type; + + // We're going to call av1_make_inter_predictor to generate a prediction into + // a temporary buffer, then will blend that temporary buffer with that from + // the other reference. + DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]); + uint8_t *tmp_dst = + inter_pred_params->use_hbd_buf ? 
CONVERT_TO_BYTEPTR(tmp_buf) : tmp_buf; + + const int tmp_buf_stride = MAX_SB_SIZE; + CONV_BUF_TYPE *org_dst = inter_pred_params->conv_params.dst; + int org_dst_stride = inter_pred_params->conv_params.dst_stride; + CONV_BUF_TYPE *tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf; + inter_pred_params->conv_params.dst = tmp_buf16; + inter_pred_params->conv_params.dst_stride = tmp_buf_stride; + assert(inter_pred_params->conv_params.do_average == 0); + + // This will generate a prediction in tmp_buf for the second reference + av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, + inter_pred_params, subpel_params); + + if (!inter_pred_params->conv_params.plane && + comp_data->type == COMPOUND_DIFFWTD) { + av1_build_compound_diffwtd_mask_d16( + comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride, + tmp_buf16, tmp_buf_stride, inter_pred_params->block_height, + inter_pred_params->block_width, &inter_pred_params->conv_params, + inter_pred_params->bit_depth); + } + build_masked_compound_no_round( + dst, dst_stride, org_dst, org_dst_stride, tmp_buf16, tmp_buf_stride, + comp_data, sb_type, inter_pred_params->block_height, + inter_pred_params->block_width, inter_pred_params); +} + +void av1_build_one_inter_predictor( + uint8_t *dst, int dst_stride, const MV *const src_mv, + InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, CalcSubpelParamsFunc calc_subpel_params_func) { + SubpelParams subpel_params; + uint8_t *src; + int src_stride; + calc_subpel_params_func(src_mv, inter_pred_params, xd, mi_x, mi_y, ref, &src, + &subpel_params, &src_stride); + + if (inter_pred_params->comp_mode == UNIFORM_SINGLE || + inter_pred_params->comp_mode == UNIFORM_COMP) { + av1_make_inter_predictor(src, src_stride, dst, dst_stride, + inter_pred_params, &subpel_params); + } else { + av1_make_masked_inter_predictor(src, src_stride, dst, dst_stride, + inter_pred_params, &subpel_params); + } +} + +// True if all of the following hold: +// 1. Not intrabc and not build_for_obmc +// 2. A U or V plane (the luma plane is never sub-sampled, so it never takes +// this path) +// 3. The plane is sub-sampled in a dimension where the block is only 4 +// pixels wide or high +// 4. All of the neighboring blocks covered by the sub-sample are inter +// blocks, and none of them uses intrabc +static bool is_sub8x8_inter(const MACROBLOCKD *xd, int plane, BLOCK_SIZE bsize, + int is_intrabc, int build_for_obmc) { + if (is_intrabc || build_for_obmc) { + return false; + } + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + if ((block_size_wide[bsize] >= 8 || !ss_x) && + (block_size_high[bsize] >= 8 || !ss_y)) { + return false; + } + + // For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0; + const int col_start = (block_size_wide[bsize] == 4) && ss_x ? 
-1 : 0; + + for (int row = row_start; row <= 0; ++row) { + for (int col = col_start; col <= 0; ++col) { + const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + if (!is_inter_block(this_mbmi)) return false; + if (is_intrabc_block(this_mbmi)) return false; + } + } + return true; +} + +static void build_inter_predictors_sub8x8( + const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, + int bw, int bh, int mi_x, int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func) { + const BLOCK_SIZE bsize = mi->sb_type; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const bool ss_x = pd->subsampling_x; + const bool ss_y = pd->subsampling_y; + const int b4_w = block_size_wide[bsize] >> ss_x; + const int b4_h = block_size_high[bsize] >> ss_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + const int b8_w = block_size_wide[plane_bsize]; + const int b8_h = block_size_high[plane_bsize]; + const int is_compound = has_second_ref(mi); + assert(!is_compound); + assert(!is_intrabc_block(mi)); + + // For sub8x8 chroma blocks, we may be covering more than one luma block's + // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for + // the top-left corner of the prediction source - the correct top-left corner + // is at (pre_x, pre_y). + const int row_start = (block_size_high[bsize] == 4) && ss_y ? -1 : 0; + const int col_start = (block_size_wide[bsize] == 4) && ss_x ? -1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + int row = row_start; + for (int y = 0; y < b8_h; y += b4_h) { + int col = col_start; + for (int x = 0; x < b8_w; x += b4_w) { + MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col]; + int tmp_dst_stride = 8; + assert(bw < 8 || bh < 8); + (void)bw; + (void)bh; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; + int ref = 0; + const RefCntBuffer *ref_buf = + get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]); + const struct scale_factors *ref_scale_factors = + get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]); + const struct scale_factors *const sf = ref_scale_factors; + const struct buf_2d pre_buf = { + NULL, + (plane == 1) ? 
ref_buf->buf.u_buffer : ref_buf->buf.v_buffer, + ref_buf->buf.uv_crop_width, + ref_buf->buf.uv_crop_height, + ref_buf->buf.uv_stride, + }; + + const MV mv = this_mbmi->mv[ref].as_mv; + + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, b4_w, b4_h, pre_y + y, + pre_x + x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), mi->use_intrabc, sf, + &pre_buf, this_mbmi->interp_filters); + inter_pred_params.conv_params = get_conv_params_no_round( + ref, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd); + inter_pred_params.conv_params.use_dist_wtd_comp_avg = 0; + + av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, + &inter_pred_params, xd, mi_x + x, mi_y + y, + ref, calc_subpel_params_func); + + ++col; + } + ++row; + } +} + +static void build_inter_predictors_8x8_and_bigger( + const AV1_COMMON *cm, MACROBLOCKD *xd, int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func) { + const int is_compound = has_second_ref(mi); + const int is_intrabc = is_intrabc_block(mi); + assert(IMPLIES(is_intrabc, !is_compound)); + struct macroblockd_plane *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf; + + int is_global[2] = { 0, 0 }; + for (int ref = 0; ref < 1 + is_compound; ++ref) { + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + is_global[ref] = is_global_mv_block(mi, wm->wmtype); + } + + const BLOCK_SIZE bsize = mi->sb_type; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int row_start = + (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0; + const int col_start = + (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0; + const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y; + + for (int ref = 0; ref < 1 + is_compound; ++ref) { + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref]; + struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; + const MV mv = mi->mv[ref].as_mv; + const WarpTypesAllowed warp_types = { is_global[ref], + mi->motion_mode == WARPED_CAUSAL }; + + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, bw, bh, pre_y, pre_x, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), mi->use_intrabc, sf, pre_buf, + mi->interp_filters); + if (is_compound) av1_init_comp_mode(&inter_pred_params); + inter_pred_params.conv_params = get_conv_params_no_round( + ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); + + av1_dist_wtd_comp_weight_assign( + cm, mi, 0, &inter_pred_params.conv_params.fwd_offset, + &inter_pred_params.conv_params.bck_offset, + &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound); + + if (!build_for_obmc) + av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, mi); + + if (is_masked_compound_type(mi->interinter_comp.type)) { + av1_init_mask_comp(&inter_pred_params, mi->sb_type, &mi->interinter_comp); + // Assign physical buffer. 
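+      // (Note, inferred from av1_make_masked_inter_predictor above:
+      // xd->seg_mask is the scratch buffer that
+      // av1_build_compound_diffwtd_mask_d16 later fills when the compound
+      // type is COMPOUND_DIFFWTD; mask_comp itself only carries the pointer.)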
+ inter_pred_params.mask_comp.seg_mask = xd->seg_mask; + } + + av1_build_one_inter_predictor(dst, dst_buf->stride, &mv, &inter_pred_params, + xd, mi_x, mi_y, ref, calc_subpel_params_func); + } +} + +void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, + int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func) { + if (is_sub8x8_inter(xd, plane, mi->sb_type, is_intrabc_block(mi), + build_for_obmc)) { + build_inter_predictors_sub8x8(cm, xd, plane, mi, bw, bh, mi_x, mi_y, + calc_subpel_params_func); + } else { + build_inter_predictors_8x8_and_bigger(cm, xd, plane, mi, build_for_obmc, bw, + bh, mi_x, mi_y, + calc_subpel_params_func); + } +} + +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int order_idx, + int *fwd_offset, int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound) { + assert(fwd_offset != NULL && bck_offset != NULL); + if (!is_compound || mbmi->compound_idx) { + *use_dist_wtd_comp_avg = 0; + return; + } + + *use_dist_wtd_comp_avg = 1; + const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); + const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); + const int cur_frame_index = cm->cur_frame->order_hint; + int bck_frame_index = 0, fwd_frame_index = 0; + + if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint; + if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint; + + int d0 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info, + fwd_frame_index, cur_frame_index)), + 0, MAX_FRAME_DISTANCE); + int d1 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info, + cur_frame_index, bck_frame_index)), + 0, MAX_FRAME_DISTANCE); + + const int order = d0 <= d1; + + if (d0 == 0 || d1 == 0) { + *fwd_offset = quant_dist_lookup_table[order_idx][3][order]; + *bck_offset = quant_dist_lookup_table[order_idx][3][1 - order]; + return; + } + + int i; + for (i = 0; i < 3; ++i) { + int c0 = quant_dist_weight[i][order]; + int c1 = quant_dist_weight[i][!order]; + int d0_c0 = d0 * c0; + int d1_c1 = d1 * c1; + if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break; + } + + *fwd_offset = quant_dist_lookup_table[order_idx][i][order]; + *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order]; +} + +void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const int plane_start, const int plane_end) { + // We use AOMMIN(plane_end, MAX_MB_PLANE) instead of plane_end to quiet + // the static analysis warnings. + for (int i = plane_start; i < AOMMIN(plane_end, MAX_MB_PLANE); ++i) { + struct macroblockd_plane *const pd = &planes[i]; + const int is_uv = i > 0; + setup_pred_plane(&pd->dst, bsize, src->buffers[i], src->crop_widths[is_uv], + src->crop_heights[is_uv], src->strides[is_uv], mi_row, + mi_col, NULL, pd->subsampling_x, pd->subsampling_y); + } +} + +void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf, + const int num_planes) { + if (src != NULL) { + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. 
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + struct macroblockd_plane *const pd = &xd->plane[i]; + const int is_uv = i > 0; + setup_pred_plane(&pd->pre[idx], xd->mi[0]->sb_type, src->buffers[i], + src->crop_widths[is_uv], src->crop_heights[is_uv], + src->strides[is_uv], mi_row, mi_col, sf, + pd->subsampling_x, pd->subsampling_y); + } + } +} + +// obmc_mask_N[overlap_position] +static const uint8_t obmc_mask_1[1] = { 64 }; +DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = { 45, 64 }; + +DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = { 39, 50, 59, 64 }; + +static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 }; + +static const uint8_t obmc_mask_16[16] = { 34, 37, 40, 43, 46, 49, 52, 54, + 56, 58, 60, 61, 64, 64, 64, 64 }; + +static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44, + 45, 47, 48, 50, 51, 52, 53, 55, + 56, 57, 58, 59, 60, 60, 61, 62, + 64, 64, 64, 64, 64, 64, 64, 64 }; + +static const uint8_t obmc_mask_64[64] = { + 33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44, + 45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56, + 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62, + 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, +}; + +const uint8_t *av1_get_obmc_mask(int length) { + switch (length) { + case 1: return obmc_mask_1; + case 2: return obmc_mask_2; + case 4: return obmc_mask_4; + case 8: return obmc_mask_8; + case 16: return obmc_mask_16; + case 32: return obmc_mask_32; + case 64: return obmc_mask_64; + default: assert(0); return NULL; + } +} + +static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *mi, void *fun_ctxt, + const int num_planes) { + (void)xd; + (void)rel_mi_row; + (void)rel_mi_col; + (void)op_mi_size; + (void)dir; + (void)mi; + ++*(int *)fun_ctxt; + (void)num_planes; +} + +void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd) { + MB_MODE_INFO *mbmi = xd->mi[0]; + + mbmi->overlappable_neighbors[0] = 0; + mbmi->overlappable_neighbors[1] = 0; + + if (!is_motion_variation_allowed_bsize(mbmi->sb_type)) return; + + foreach_overlappable_nb_above(cm, xd, INT_MAX, increment_int_ptr, + &mbmi->overlappable_neighbors[0]); + foreach_overlappable_nb_left(cm, xd, INT_MAX, increment_int_ptr, + &mbmi->overlappable_neighbors[1]); +} + +// HW does not support < 4x4 prediction. To limit the bandwidth requirement, if +// block-size of current plane is smaller than 8x8, always only blend with the +// left neighbor(s) (skip blending with the above side). 
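+// E.g. with 4:2:0 subsampling, an 8x8 luma block has a 4x4 chroma block:
+// with DISABLE_CHROMA_U8X8_OBMC set to 0 below, that chroma block is blended
+// only with its left neighbor(s); set to 1, it is not blended at all.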
+#define DISABLE_CHROMA_U8X8_OBMC 0 // 0: one-sided obmc; 1: disable + +int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd, int dir) { + assert(is_motion_variation_allowed_bsize(bsize)); + + const BLOCK_SIZE bsize_plane = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + switch (bsize_plane) { +#if DISABLE_CHROMA_U8X8_OBMC + case BLOCK_4X4: + case BLOCK_8X4: + case BLOCK_4X8: return 1; break; +#else + case BLOCK_4X4: + case BLOCK_8X4: + case BLOCK_4X8: return dir == 0; break; +#endif + default: return 0; + } +} + +void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) { + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + + return; +} + +struct obmc_inter_pred_ctxt { + uint8_t **adjacent; + int *adjacent_stride; +}; + +static INLINE void build_obmc_inter_pred_above( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mi, void *fun_ctxt, const int num_planes) { + (void)above_mi; + (void)rel_mi_row; + (void)dir; + struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + const int overlap = + AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1; + + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; + const int bh = overlap >> pd->subsampling_y; + const int plane_col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; + + const int dst_stride = pd->dst.stride; + uint8_t *const dst = &pd->dst.buf[plane_col]; + const int tmp_stride = ctxt->adjacent_stride[plane]; + const uint8_t *const tmp = &ctxt->adjacent[plane][plane_col]; + const uint8_t *const mask = av1_get_obmc_mask(bh); +#if CONFIG_AV1_HIGHBITDEPTH + const int is_hbd = is_cur_buf_hbd(xd); + if (is_hbd) + aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, + tmp_stride, mask, bw, bh, xd->bd); + else + aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, + mask, bw, bh); +#else + aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, + bw, bh); +#endif + } +} + +static INLINE void build_obmc_inter_pred_left( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *left_mi, void *fun_ctxt, const int num_planes) { + (void)left_mi; + (void)rel_mi_col; + (void)dir; + struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + const int overlap = + AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1; + + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const int bw = overlap >> pd->subsampling_x; + const int bh = (op_mi_size * MI_SIZE) >> pd->subsampling_y; + const int plane_row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; + + const int dst_stride = pd->dst.stride; + uint8_t *const dst = &pd->dst.buf[plane_row * dst_stride]; + const int tmp_stride = ctxt->adjacent_stride[plane]; + const uint8_t *const tmp = &ctxt->adjacent[plane][plane_row * tmp_stride]; + const uint8_t *const mask = av1_get_obmc_mask(bw); + +#if CONFIG_AV1_HIGHBITDEPTH + const int is_hbd = is_cur_buf_hbd(xd); + if (is_hbd) + aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, 
tmp, + tmp_stride, mask, bw, bh, xd->bd); + else + aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, + mask, bw, bh); +#else + aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride, mask, + bw, bh); +#endif + } +} + +// This function combines motion compensated predictions that are generated by +// top/left neighboring blocks' inter predictors with the regular inter +// prediction. We assume the original prediction (bmc) is stored in +// xd->plane[].dst.buf +void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *above[MAX_MB_PLANE], + int above_stride[MAX_MB_PLANE], + uint8_t *left[MAX_MB_PLANE], + int left_stride[MAX_MB_PLANE]) { + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + // handle above row + struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride }; + foreach_overlappable_nb_above(cm, xd, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + build_obmc_inter_pred_above, &ctxt_above); + + // handle left column + struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride }; + foreach_overlappable_nb_left(cm, xd, + max_neighbor_obmc[mi_size_high_log2[bsize]], + build_obmc_inter_pred_left, &ctxt_left); +} + +void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset, + int mi_col_offset, MB_MODE_INFO *ref_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE ref_bsize = AOMMAX(BLOCK_8X8, ref_mbmi->sb_type); + const int ref_mi_row = xd->mi_row + mi_row_offset; + const int ref_mi_col = xd->mi_col + mi_col_offset; + + for (int plane = 0; plane < num_planes; ++plane) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + setup_pred_plane(&pd->dst, ref_bsize, ctxt->tmp_buf[plane], + ctxt->tmp_width[plane], ctxt->tmp_height[plane], + ctxt->tmp_stride[plane], mi_row_offset, mi_col_offset, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const MV_REFERENCE_FRAME frame = ref_mbmi->ref_frame[0]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + + xd->block_ref_scale_factors[0] = sf; + if ((!av1_is_valid_scale(sf))) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + + av1_setup_pre_planes(xd, 0, &ref_buf->buf, ref_mi_row, ref_mi_col, sf, + num_planes); +} + +void av1_setup_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type); + const int above_mi_col = xd->mi_col + rel_mi_col; + + av1_modify_neighbor_predictor_for_obmc(above_mbmi); + + for (int j = 0; j < num_planes; ++j) { + struct macroblockd_plane *const pd = &xd->plane[j]; + setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j], + ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const int num_refs = 1 + has_second_ref(above_mbmi); + + for (int ref = 0; ref < num_refs; ++ref) { + const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + xd->block_ref_scale_factors[ref] = sf; + if ((!av1_is_valid_scale(sf))) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid 
dimensions"); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, xd->mi_row, above_mi_col, sf, + num_planes); + } + + xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col); + xd->mb_to_right_edge = + ctxt->mb_to_far_edge + + (xd->width - rel_mi_col - above_mi_width) * MI_SIZE * 8; +} + +void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, + uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes) { + const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type); + const int left_mi_row = xd->mi_row + rel_mi_row; + + av1_modify_neighbor_predictor_for_obmc(left_mbmi); + + for (int j = 0; j < num_planes; ++j) { + struct macroblockd_plane *const pd = &xd->plane[j]; + setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j], + ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0, + NULL, pd->subsampling_x, pd->subsampling_y); + } + + const int num_refs = 1 + has_second_ref(left_mbmi); + + for (int ref = 0; ref < num_refs; ++ref) { + const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref]; + + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const ref_scale_factors = + get_ref_scale_factors_const(ctxt->cm, frame); + + xd->block_ref_scale_factors[ref] = ref_scale_factors; + if ((!av1_is_valid_scale(ref_scale_factors))) + aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, xd->mi_col, + ref_scale_factors, num_planes); + } + + xd->mb_to_top_edge = GET_MV_SUBPEL(MI_SIZE * (-left_mi_row)); + xd->mb_to_bottom_edge = + ctxt->mb_to_far_edge + + GET_MV_SUBPEL((xd->height - rel_mi_row - left_mi_height) * MI_SIZE); +} + +static AOM_INLINE void combine_interintra( + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, + uint8_t *comppred, int compstride, const uint8_t *interpred, + int interstride, const uint8_t *intrapred, int intrastride) { + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + if (use_wedge_interintra) { + if (av1_is_wedge_used(bsize)) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + const int subw = 2 * mi_size_wide[bsize] == bw; + const int subh = 2 * mi_size_high[bsize] == bh; + aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, + interpred, interstride, mask, block_size_wide[bsize], + bw, bh, subw, subh); + } + return; + } + + const uint8_t *mask = smooth_interintra_mask_buf[mode][plane_bsize]; + aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred, + interstride, mask, bw, bw, bh, 0, 0); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void combine_interintra_highbd( + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int8_t wedge_index, + int8_t wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, + uint8_t *comppred8, int compstride, const uint8_t *interpred8, + int interstride, const uint8_t *intrapred8, int intrastride, int bd) { + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + if (use_wedge_interintra) { + if (av1_is_wedge_used(bsize)) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + const int subh = 2 * mi_size_high[bsize] == bh; + const int subw = 2 * mi_size_wide[bsize] == bw; + aom_highbd_blend_a64_mask(comppred8, 
compstride, intrapred8, intrastride, + interpred8, interstride, mask, + block_size_wide[bsize], bw, bh, subw, subh, bd); + } + return; + } + + uint8_t mask[MAX_SB_SQUARE]; + build_smooth_interintra_mask(mask, bw, plane_bsize, mode); + aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride, + interpred8, interstride, mask, bw, bw, bh, 0, 0, + bd); +} +#endif + +void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, + MACROBLOCKD *xd, + BLOCK_SIZE bsize, int plane, + const BUFFER_SET *ctx, + uint8_t *dst, int dst_stride) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ssx = xd->plane[plane].subsampling_x; + const int ssy = xd->plane[plane].subsampling_y; + BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); + PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode]; + assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0); + assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0); + assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0); + assert(xd->mi[0]->use_intrabc == 0); + + av1_predict_intra_block(cm, xd, pd->width, pd->height, + max_txsize_rect_lookup[plane_bsize], mode, 0, 0, + FILTER_INTRA_MODES, ctx->plane[plane], + ctx->stride[plane], dst, dst_stride, 0, 0, plane); +} + +void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, + const uint8_t *inter_pred, int inter_stride, + const uint8_t *intra_pred, int intra_stride) { + const int ssx = xd->plane[plane].subsampling_x; + const int ssy = xd->plane[plane].subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + combine_interintra_highbd( + xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, + xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, + plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + inter_pred, inter_stride, intra_pred, intra_stride, xd->bd); + return; + } +#endif + combine_interintra( + xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, + xd->mi[0]->interintra_wedge_index, INTERINTRA_WEDGE_SIGN, bsize, + plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, + inter_pred, inter_stride, intra_pred, intra_stride); +} + +// build interintra_predictors for one plane +void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, + const BUFFER_SET *ctx, int plane, + BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + if (is_cur_buf_hbd(xd)) { + DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]); + av1_build_intra_predictors_for_interintra( + cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor), + MAX_SB_SIZE); + av1_combine_interintra(xd, bsize, plane, pred, stride, + CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE); + } else { + DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]); + av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx, + intrapredictor, MAX_SB_SIZE); + av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor, + MAX_SB_SIZE); + } +} diff --git a/libs/libaom/src/av1/common/reconinter.h b/libs/libaom/src/av1/common/reconinter.h new file mode 100644 index 000000000..fe3c6a621 --- /dev/null +++ b/libs/libaom/src/av1/common/reconinter.h @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_RECONINTER_H_ +#define AOM_AV1_COMMON_RECONINTER_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/warped_motion.h" +#include "aom/aom_integer.h" + +// Work out how many pixels off the edge of a reference frame we're allowed +// to go when forming an inter prediction. +// The outermost row/col of each reference frame is extended by +// (AOM_BORDER_IN_PIXELS >> subsampling) pixels, but we need to keep +// at least AOM_INTERP_EXTEND pixels within that to account for filtering. +// +// We have to break this up into two macros to keep both clang-format and +// tools/lint-hunks.py happy. +#define AOM_LEFT_TOP_MARGIN_PX(subsampling) \ + ((AOM_BORDER_IN_PIXELS >> subsampling) - AOM_INTERP_EXTEND) +#define AOM_LEFT_TOP_MARGIN_SCALED(subsampling) \ + (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS) + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_WEDGE_TYPES 16 + +#define MAX_WEDGE_SIZE_LOG2 5 // 32x32 +#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2) +#define MAX_WEDGE_SQUARE (MAX_WEDGE_SIZE * MAX_WEDGE_SIZE) + +#define WEDGE_WEIGHT_BITS 6 + +#define WEDGE_NONE -1 + +// Angles are with respect to horizontal anti-clockwise +enum { + WEDGE_HORIZONTAL = 0, + WEDGE_VERTICAL = 1, + WEDGE_OBLIQUE27 = 2, + WEDGE_OBLIQUE63 = 3, + WEDGE_OBLIQUE117 = 4, + WEDGE_OBLIQUE153 = 5, + WEDGE_DIRECTIONS +} UENUM1BYTE(WedgeDirectionType); + +// 3-tuple: {direction, x_offset, y_offset} +typedef struct { + WedgeDirectionType direction; + int x_offset; + int y_offset; +} wedge_code_type; + +typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES]; + +typedef struct { + int wedge_types; + const wedge_code_type *codebook; + uint8_t *signflip; + wedge_masks_type *masks; +} wedge_params_type; + +extern const wedge_params_type av1_wedge_params_lookup[BLOCK_SIZES_ALL]; + +typedef struct SubpelParams { + int xs; + int ys; + int subpel_x; + int subpel_y; +} SubpelParams; + +struct build_prediction_ctxt { + const AV1_COMMON *cm; + uint8_t **tmp_buf; + int *tmp_width; + int *tmp_height; + int *tmp_stride; + int mb_to_far_edge; +}; + +typedef enum InterPredMode { + TRANSLATION_PRED, + WARP_PRED, +} InterPredMode; + +typedef enum InterCompMode { + UNIFORM_SINGLE, + UNIFORM_COMP, + MASK_COMP, +} InterCompMode; + +typedef struct InterPredParams { + InterPredMode mode; + InterCompMode comp_mode; + WarpedMotionParams warp_params; + ConvolveParams conv_params; + const InterpFilterParams *interp_filter_params[2]; + int block_width; + int block_height; + int pix_row; + int pix_col; + struct buf_2d ref_frame_buf; + int subsampling_x; + int subsampling_y; + const struct scale_factors *scale_factors; + int bit_depth; + int use_hbd_buf; + INTERINTER_COMPOUND_DATA mask_comp; + BLOCK_SIZE sb_type; + int is_intrabc; +} InterPredParams; + +void av1_init_inter_params(InterPredParams *inter_pred_params, int block_width, + int block_height, int pix_row, int pix_col, + int subsampling_x, int subsampling_y, int bit_depth, + int use_hbd_buf, int is_intrabc, + const struct scale_factors *sf, + const struct buf_2d *ref_buf, + int_interpfilters interp_filters); + +void av1_init_comp_mode(InterPredParams 
*inter_pred_params); + +void av1_init_warp_params(InterPredParams *inter_pred_params, + const WarpTypesAllowed *warp_types, int ref, + const MACROBLOCKD *xd, const MB_MODE_INFO *mi); + +void av1_init_mask_comp(InterPredParams *inter_pred_params, BLOCK_SIZE bsize, + const INTERINTER_COMPOUND_DATA *mask_comp); + +static INLINE int has_scale(int xs, int ys) { + return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS; +} + +static INLINE void revert_scale_extra_bits(SubpelParams *sp) { + sp->subpel_x >>= SCALE_EXTRA_BITS; + sp->subpel_y >>= SCALE_EXTRA_BITS; + sp->xs >>= SCALE_EXTRA_BITS; + sp->ys >>= SCALE_EXTRA_BITS; + assert(sp->subpel_x < SUBPEL_SHIFTS); + assert(sp->subpel_y < SUBPEL_SHIFTS); + assert(sp->xs <= SUBPEL_SHIFTS); + assert(sp->ys <= SUBPEL_SHIFTS); +} + +static INLINE void inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, + const InterpFilterParams *interp_filters[2]) { + assert(conv_params->do_average == 0 || conv_params->do_average == 1); + assert(sf); + const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); + if (is_scaled) { + av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, subpel_params->subpel_x, + subpel_params->xs, subpel_params->subpel_y, + subpel_params->ys, 1, conv_params, sf); + } else { + SubpelParams sp = *subpel_params; + revert_scale_extra_bits(&sp); + av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, sp.subpel_x, sp.xs, sp.subpel_y, + sp.ys, 0, conv_params, sf); + } +} + +static INLINE void highbd_inter_predictor( + const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, + const SubpelParams *subpel_params, const struct scale_factors *sf, int w, + int h, ConvolveParams *conv_params, + const InterpFilterParams *interp_filters[2], int bd) { + assert(conv_params->do_average == 0 || conv_params->do_average == 1); + assert(sf); + const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys); + if (is_scaled) { + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, subpel_params->subpel_x, + subpel_params->xs, subpel_params->subpel_y, + subpel_params->ys, 1, conv_params, sf, bd); + } else { + SubpelParams sp = *subpel_params; + revert_scale_extra_bits(&sp); + av1_highbd_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h, + interp_filters, sp.subpel_x, sp.xs, + sp.subpel_y, sp.ys, 0, conv_params, sf, bd); + } +} + +void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi); +int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, + const struct macroblockd_plane *pd, int dir); + +static INLINE int is_interinter_compound_used(COMPOUND_TYPE type, + BLOCK_SIZE sb_type) { + const int comp_allowed = is_comp_ref_allowed(sb_type); + switch (type) { + case COMPOUND_AVERAGE: + case COMPOUND_DISTWTD: + case COMPOUND_DIFFWTD: return comp_allowed; + case COMPOUND_WEDGE: + return comp_allowed && av1_wedge_params_lookup[sb_type].wedge_types > 0; + default: assert(0); return 0; + } +} + +static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) { + COMPOUND_TYPE comp_type; + int i; + if (!is_comp_ref_allowed(sb_type)) return 0; + for (i = 0; i < COMPOUND_TYPES; i++) { + comp_type = (COMPOUND_TYPE)i; + if (is_masked_compound_type(comp_type) && + is_interinter_compound_used(comp_type, sb_type)) + return 1; + } + return 0; +} + +static INLINE int get_wedge_types_lookup(BLOCK_SIZE sb_type) 
{ + return av1_wedge_params_lookup[sb_type].wedge_types; +} + +static INLINE int av1_is_wedge_used(BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].wedge_types > 0; +} + +void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); + +void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride, + uint8_t *dst, int dst_stride, + InterPredParams *inter_pred_params, + const SubpelParams *subpel_params); + +typedef void (*CalcSubpelParamsFunc)(const MV *const src_mv, + InterPredParams *const inter_pred_params, + MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, uint8_t **pre, + SubpelParams *subpel_params, + int *src_stride); + +void av1_build_one_inter_predictor( + uint8_t *dst, int dst_stride, const MV *const src_mv, + InterPredParams *inter_pred_params, MACROBLOCKD *xd, int mi_x, int mi_y, + int ref, CalcSubpelParamsFunc calc_subpel_params_func); + +void av1_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, int mi_x, + int mi_y, + CalcSubpelParamsFunc calc_subpel_params_func); + +// TODO(jkoleszar): yet another mv clamping function :-( +static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, + const MV *src_mv, int bw, int bh, + int ss_x, int ss_y) { + // If the MV points so far into the UMV border that no visible pixels + // are used for reconstruction, the subpel part of the MV can be + // discarded and the MV limited to 16 pixels with equivalent results. + const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS; + const int spel_right = spel_left - SUBPEL_SHIFTS; + const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS; + const int spel_bottom = spel_top - SUBPEL_SHIFTS; + MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))), + (int16_t)(src_mv->col * (1 << (1 - ss_x))) }; + assert(ss_x <= 1); + assert(ss_y <= 1); + const SubpelMvLimits mv_limits = { + xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left, + xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right, + xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top, + xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom + }; + + clamp_mv(&clamped_mv, &mv_limits); + + return clamped_mv; +} + +static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, + int stride, + const struct scale_factors *sf) { + const int x = + sf ? sf->scale_value_x(x_offset, sf) >> SCALE_EXTRA_BITS : x_offset; + const int y = + sf ? 
sf->scale_value_y(y_offset, sf) >> SCALE_EXTRA_BITS : y_offset; + return (int64_t)y * stride + x; +} + +static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, + uint8_t *src, int width, int height, + int stride, int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { + // Offset the buffer pointer + if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) + mi_row -= 1; + if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) + mi_col -= 1; + + const int x = (MI_SIZE * mi_col) >> subsampling_x; + const int y = (MI_SIZE * mi_row) >> subsampling_y; + dst->buf = src + scaled_buffer_offset(x, y, stride, scale); + dst->buf0 = src; + dst->width = width; + dst->height = height; + dst->stride = stride; +} + +void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const int plane_start, const int plane_end); + +void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, + const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, + const struct scale_factors *sf, const int num_planes); + +static INLINE void set_default_interp_filters( + MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) { + mbmi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(frame_interp_filter)); +} + +static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + if (mbmi->skip_mode) return 0; + if (mbmi->motion_mode == WARPED_CAUSAL) return 0; + if (is_nontrans_global_motion(xd, xd->mi[0])) return 0; + return 1; +} + +void av1_setup_address_for_obmc(MACROBLOCKD *xd, int mi_row_offset, + int mi_col_offset, MB_MODE_INFO *ref_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes); + +void av1_setup_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width, + MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt, + const int num_planes); +void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, + uint8_t left_mi_height, + MB_MODE_INFO *left_mbmi, + struct build_prediction_ctxt *ctxt, + const int num_planes); +void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *above[MAX_MB_PLANE], + int above_stride[MAX_MB_PLANE], + uint8_t *left[MAX_MB_PLANE], + int left_stride[MAX_MB_PLANE]); + +const uint8_t *av1_get_obmc_mask(int length); +void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd); + +#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1) +#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE) + +void av1_init_wedge_masks(); + +static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, + int8_t wedge_sign, + BLOCK_SIZE sb_type) { + return av1_wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index]; +} + +const uint8_t *av1_get_compound_type_mask( + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type); + +// build interintra_predictors for one plane +void av1_build_interintra_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *pred, int stride, + const BUFFER_SET *ctx, int plane, + BLOCK_SIZE bsize); + +void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, + MACROBLOCKD *xd, + BLOCK_SIZE bsize, int plane, + const BUFFER_SET *ctx, + uint8_t *dst, int dst_stride); + +void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, + const uint8_t *inter_pred, int inter_stride, + const uint8_t *intra_pred, int 
intra_stride); + +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int order_idx, + int *fwd_offset, int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound); +int av1_allow_warp(const MB_MODE_INFO *const mbmi, + const WarpTypesAllowed *const warp_types, + const WarpedMotionParams *const gm_params, + int build_for_obmc, const struct scale_factors *const sf, + WarpedMotionParams *final_warp_params); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_RECONINTER_H_ diff --git a/libs/libaom/src/av1/common/reconintra.c b/libs/libaom/src/av1/common/reconintra.c new file mode 100644 index 000000000..1307a0313 --- /dev/null +++ b/libs/libaom/src/av1/common/reconintra.c @@ -0,0 +1,1704 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_once.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/cfl.h" +#include "av1/common/reconintra.h" + +enum { + NEED_LEFT = 1 << 1, + NEED_ABOVE = 1 << 2, + NEED_ABOVERIGHT = 1 << 3, + NEED_ABOVELEFT = 1 << 4, + NEED_BOTTOMLEFT = 1 << 5, +}; + +#define INTRA_EDGE_FILT 3 +#define INTRA_EDGE_TAPS 5 +#define MAX_UPSAMPLE_SZ 16 + +static const uint8_t extend_modes[INTRA_MODES] = { + NEED_ABOVE | NEED_LEFT, // DC + NEED_ABOVE, // V + NEED_LEFT, // H + NEED_ABOVE | NEED_ABOVERIGHT, // D45 + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D135 + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D113 + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D157 + NEED_LEFT | NEED_BOTTOMLEFT, // D203 + NEED_ABOVE | NEED_ABOVERIGHT, // D67 + NEED_LEFT | NEED_ABOVE, // SMOOTH + NEED_LEFT | NEED_ABOVE, // SMOOTH_V + NEED_LEFT | NEED_ABOVE, // SMOOTH_H + NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // PAETH +}; + +// Tables to store if the top-right reference pixels are available. The flags +// are represented with bits, packed into 8-bit integers. E.g., for the 32x32 +// blocks in a 128x128 superblock, the index of the "o" block is 10 (in raster +// order), so its flag is stored at the 3rd bit of the 2nd entry in the table, +// i.e. (table[10 / 8] >> (10 % 8)) & 1. +// . . . . +// . . . . +// . . o . +// . . . . 
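+// Worked example: the "o" block above has raster index 10, so its flag is
+// (has_tr_32x32[10 / 8] >> (10 % 8)) & 1 = (87 >> 2) & 1 = 1, i.e. its
+// top-right reference pixels are available (see has_tr_32x32 below).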
+static uint8_t has_tr_4x4[128] = { + 255, 255, 255, 255, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 255, 255, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, + 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85, +}; +static uint8_t has_tr_4x8[64] = { + 255, 255, 255, 255, 119, 119, 119, 119, 127, 127, 127, 127, 119, + 119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127, + 127, 127, 119, 119, 119, 119, 255, 255, 255, 127, 119, 119, 119, + 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127, + 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119, +}; +static uint8_t has_tr_8x4[64] = { + 255, 255, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, + 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, + 255, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, + 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0, +}; +static uint8_t has_tr_8x8[32] = { + 255, 255, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85, + 255, 127, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85, +}; +static uint8_t has_tr_8x16[16] = { + 255, 255, 119, 119, 127, 127, 119, 119, + 255, 127, 119, 119, 127, 127, 119, 119, +}; +static uint8_t has_tr_16x8[16] = { + 255, 0, 85, 0, 119, 0, 85, 0, 127, 0, 85, 0, 119, 0, 85, 0, +}; +static uint8_t has_tr_16x16[8] = { + 255, 85, 119, 85, 127, 85, 119, 85, +}; +static uint8_t has_tr_16x32[4] = { 255, 119, 127, 119 }; +static uint8_t has_tr_32x16[4] = { 15, 5, 7, 5 }; +static uint8_t has_tr_32x32[2] = { 95, 87 }; +static uint8_t has_tr_32x64[1] = { 127 }; +static uint8_t has_tr_64x32[1] = { 19 }; +static uint8_t has_tr_64x64[1] = { 7 }; +static uint8_t has_tr_64x128[1] = { 3 }; +static uint8_t has_tr_128x64[1] = { 1 }; +static uint8_t has_tr_128x128[1] = { 1 }; +static uint8_t has_tr_4x16[32] = { + 255, 255, 255, 255, 127, 127, 127, 127, 255, 127, 255, + 127, 127, 127, 127, 127, 255, 255, 255, 127, 127, 127, + 127, 127, 255, 127, 255, 127, 127, 127, 127, 127, +}; +static uint8_t has_tr_16x4[32] = { + 255, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0, + 127, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0, +}; +static uint8_t has_tr_8x32[8] = { + 255, 255, 127, 127, 255, 127, 127, 127, +}; +static uint8_t has_tr_32x8[8] = { + 15, 0, 5, 0, 7, 0, 5, 0, +}; +static uint8_t has_tr_16x64[2] = { 255, 127 }; +static uint8_t has_tr_64x16[2] = { 3, 1 }; + +static const uint8_t *const has_tr_tables[BLOCK_SIZES_ALL] = { + // 4X4 + has_tr_4x4, + // 4X8, 8X4, 8X8 + has_tr_4x8, has_tr_8x4, has_tr_8x8, + // 8X16, 16X8, 16X16 + has_tr_8x16, has_tr_16x8, has_tr_16x16, + // 16X32, 32X16, 32X32 + has_tr_16x32, has_tr_32x16, has_tr_32x32, + // 32X64, 64X32, 64X64 + has_tr_32x64, has_tr_64x32, has_tr_64x64, + // 64x128, 128x64, 128x128 + has_tr_64x128, has_tr_128x64, has_tr_128x128, + // 4x16, 16x4, 8x32 + has_tr_4x16, has_tr_16x4, has_tr_8x32, + // 32x8, 16x64, 64x16 + has_tr_32x8, has_tr_16x64, has_tr_64x16 +}; + +static uint8_t has_tr_vert_8x8[32] = { + 255, 255, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0, + 255, 127, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0, +}; +static uint8_t 
has_tr_vert_16x16[8] = { + 255, 0, 119, 0, 127, 0, 119, 0, +}; +static uint8_t has_tr_vert_32x32[2] = { 15, 7 }; +static uint8_t has_tr_vert_64x64[1] = { 3 }; + +// The _vert_* tables are like the ordinary tables above, but describe the +// order we visit square blocks when doing a PARTITION_VERT_A or +// PARTITION_VERT_B. This is the same order as normal except for on the last +// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block +// as a pair of squares, which means that these tables work correctly for both +// mixed vertical partition types. +// +// There are tables for each of the square sizes. Vertical rectangles (like +// BLOCK_16X32) use their respective "non-vert" table +static const uint8_t *const has_tr_vert_tables[BLOCK_SIZES] = { + // 4X4 + NULL, + // 4X8, 8X4, 8X8 + has_tr_4x8, NULL, has_tr_vert_8x8, + // 8X16, 16X8, 16X16 + has_tr_8x16, NULL, has_tr_vert_16x16, + // 16X32, 32X16, 32X32 + has_tr_16x32, NULL, has_tr_vert_32x32, + // 32X64, 64X32, 64X64 + has_tr_32x64, NULL, has_tr_vert_64x64, + // 64x128, 128x64, 128x128 + has_tr_64x128, NULL, has_tr_128x128 +}; + +static const uint8_t *get_has_tr_table(PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + const uint8_t *ret = NULL; + // If this is a mixed vertical partition, look up bsize in orders_vert. + if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { + assert(bsize < BLOCK_SIZES); + ret = has_tr_vert_tables[bsize]; + } else { + ret = has_tr_tables[bsize]; + } + assert(ret); + return ret; +} + +static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, + int mi_col, int top_available, int right_available, + PARTITION_TYPE partition, TX_SIZE txsz, int row_off, + int col_off, int ss_x, int ss_y) { + if (!top_available || !right_available) return 0; + + const int bw_unit = mi_size_wide[bsize]; + const int plane_bw_unit = AOMMAX(bw_unit >> ss_x, 1); + const int top_right_count_unit = tx_size_wide_unit[txsz]; + + if (row_off > 0) { // Just need to check if enough pixels on the right. + if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) { + // Special case: For 128x128 blocks, the transform unit whose + // top-right corner is at the center of the block does in fact have + // pixels available at its top-right corner. + if (row_off == mi_size_high[BLOCK_64X64] >> ss_y && + col_off + top_right_count_unit == mi_size_wide[BLOCK_64X64] >> ss_x) { + return 1; + } + const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x; + const int col_off_64 = col_off % plane_bw_unit_64; + return col_off_64 + top_right_count_unit < plane_bw_unit_64; + } + return col_off + top_right_count_unit < plane_bw_unit; + } else { + // All top-right pixels are in the block above, which is already available. + if (col_off + top_right_count_unit < plane_bw_unit) return 1; + + const int bw_in_mi_log2 = mi_size_wide_log2[bsize]; + const int bh_in_mi_log2 = mi_size_high_log2[bsize]; + const int sb_mi_size = mi_size_high[cm->seq_params.sb_size]; + const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2; + const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2; + + // Top row of superblock: so top-right pixels are in the top and/or + // top-right superblocks, both of which are already available. + if (blk_row_in_sb == 0) return 1; + + // Rightmost column of superblock (and not the top row): so top-right pixels + // fall in the right superblock, which is not available yet. 
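+    // (E.g. for 32x32 blocks in a 128x128 superblock, sb_mi_size is 32 and
+    // bw_in_mi_log2 is 3, so blk_col_in_sb == 3 gives (3 + 1) << 3 == 32:
+    // the rightmost column.)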
+ if (((blk_col_in_sb + 1) << bw_in_mi_log2) >= sb_mi_size) { + return 0; + } + + // General case (neither top row nor rightmost column): check if the + // top-right block is coded before the current block. + const int this_blk_index = + ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) + + blk_col_in_sb + 0; + const int idx1 = this_blk_index / 8; + const int idx2 = this_blk_index % 8; + const uint8_t *has_tr_table = get_has_tr_table(partition, bsize); + return (has_tr_table[idx1] >> idx2) & 1; + } +} + +// Similar to the has_tr_* tables, but store if the bottom-left reference +// pixels are available. +static uint8_t has_bl_4x4[128] = { + 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, + 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, 0, 84, 85, 85, 85, 16, 17, + 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, + 85, 85, 85, 0, 0, 0, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, + 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, + 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, + 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 0, 0, +}; +static uint8_t has_bl_4x8[64] = { + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0, + 16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0, +}; +static uint8_t has_bl_8x4[64] = { + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1, + 254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0, +}; +static uint8_t has_bl_8x8[32] = { + 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, + 84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0, +}; +static uint8_t has_bl_8x16[16] = { + 16, 17, 0, 1, 16, 17, 0, 0, 16, 17, 0, 1, 16, 17, 0, 0, +}; +static uint8_t has_bl_16x8[16] = { + 254, 84, 254, 16, 254, 84, 254, 0, 254, 84, 254, 16, 254, 84, 254, 0, +}; +static uint8_t has_bl_16x16[8] = { + 84, 16, 84, 0, 84, 16, 84, 0, +}; +static uint8_t has_bl_16x32[4] = { 16, 0, 16, 0 }; +static uint8_t has_bl_32x16[4] = { 78, 14, 78, 14 }; +static uint8_t has_bl_32x32[2] = { 4, 4 }; +static uint8_t has_bl_32x64[1] = { 0 }; +static uint8_t has_bl_64x32[1] = { 34 }; +static uint8_t has_bl_64x64[1] = { 0 }; +static uint8_t has_bl_64x128[1] = { 0 }; +static uint8_t has_bl_128x64[1] = { 0 }; +static uint8_t has_bl_128x128[1] = { 0 }; +static uint8_t has_bl_4x16[32] = { + 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, + 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, +}; +static uint8_t has_bl_16x4[32] = { + 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, + 254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0, +}; +static uint8_t has_bl_8x32[8] = { + 0, 1, 0, 0, 0, 1, 0, 0, +}; +static uint8_t has_bl_32x8[8] = { + 238, 78, 238, 14, 238, 78, 238, 14, +}; +static uint8_t has_bl_16x64[2] = { 0, 0 }; +static uint8_t has_bl_64x16[2] = { 42, 42 }; + +static const uint8_t *const has_bl_tables[BLOCK_SIZES_ALL] = { + // 4X4 + has_bl_4x4, + // 4X8, 8X4, 8X8 + has_bl_4x8, has_bl_8x4, has_bl_8x8, + // 8X16, 16X8, 16X16 + has_bl_8x16, has_bl_16x8, has_bl_16x16, + // 16X32, 32X16, 32X32 + has_bl_16x32, has_bl_32x16, has_bl_32x32, + // 32X64, 64X32, 64X64 + has_bl_32x64, has_bl_64x32, has_bl_64x64, + // 64x128, 128x64, 128x128 + 
has_bl_64x128, has_bl_128x64, has_bl_128x128, + // 4x16, 16x4, 8x32 + has_bl_4x16, has_bl_16x4, has_bl_8x32, + // 32x8, 16x64, 64x16 + has_bl_32x8, has_bl_16x64, has_bl_64x16 +}; + +static uint8_t has_bl_vert_8x8[32] = { + 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, + 254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0, +}; +static uint8_t has_bl_vert_16x16[8] = { + 254, 16, 254, 0, 254, 16, 254, 0, +}; +static uint8_t has_bl_vert_32x32[2] = { 14, 14 }; +static uint8_t has_bl_vert_64x64[1] = { 2 }; + +// The _vert_* tables are like the ordinary tables above, but describe the +// order we visit square blocks when doing a PARTITION_VERT_A or +// PARTITION_VERT_B. This is the same order as normal except for on the last +// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block +// as a pair of squares, which means that these tables work correctly for both +// mixed vertical partition types. +// +// There are tables for each of the square sizes. Vertical rectangles (like +// BLOCK_16X32) use their respective "non-vert" table +static const uint8_t *const has_bl_vert_tables[BLOCK_SIZES] = { + // 4X4 + NULL, + // 4X8, 8X4, 8X8 + has_bl_4x8, NULL, has_bl_vert_8x8, + // 8X16, 16X8, 16X16 + has_bl_8x16, NULL, has_bl_vert_16x16, + // 16X32, 32X16, 32X32 + has_bl_16x32, NULL, has_bl_vert_32x32, + // 32X64, 64X32, 64X64 + has_bl_32x64, NULL, has_bl_vert_64x64, + // 64x128, 128x64, 128x128 + has_bl_64x128, NULL, has_bl_128x128 +}; + +static const uint8_t *get_has_bl_table(PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + const uint8_t *ret = NULL; + // If this is a mixed vertical partition, look up bsize in orders_vert. + if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) { + assert(bsize < BLOCK_SIZES); + ret = has_bl_vert_tables[bsize]; + } else { + ret = has_bl_tables[bsize]; + } + assert(ret); + return ret; +} + +static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row, + int mi_col, int bottom_available, int left_available, + PARTITION_TYPE partition, TX_SIZE txsz, int row_off, + int col_off, int ss_x, int ss_y) { + if (!bottom_available || !left_available) return 0; + + // Special case for 128x* blocks, when col_off is half the block width. + // This is needed because 128x* superblocks are divided into 64x* blocks in + // raster order + if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64] && col_off > 0) { + const int plane_bw_unit_64 = mi_size_wide[BLOCK_64X64] >> ss_x; + const int col_off_64 = col_off % plane_bw_unit_64; + if (col_off_64 == 0) { + // We are at the left edge of top-right or bottom-right 64x* block. + const int plane_bh_unit_64 = mi_size_high[BLOCK_64X64] >> ss_y; + const int row_off_64 = row_off % plane_bh_unit_64; + const int plane_bh_unit = + AOMMIN(mi_size_high[bsize] >> ss_y, plane_bh_unit_64); + // Check if all bottom-left pixels are in the left 64x* block (which is + // already coded). + return row_off_64 + tx_size_high_unit[txsz] < plane_bh_unit; + } + } + + if (col_off > 0) { + // Bottom-left pixels are in the bottom-left block, which is not available. + return 0; + } else { + const int bh_unit = mi_size_high[bsize]; + const int plane_bh_unit = AOMMAX(bh_unit >> ss_y, 1); + const int bottom_left_count_unit = tx_size_high_unit[txsz]; + + // All bottom-left pixels are in the left block, which is already available. 
+    if (row_off + bottom_left_count_unit < plane_bh_unit) return 1;
+
+    const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
+    const int bh_in_mi_log2 = mi_size_high_log2[bsize];
+    const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
+    const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
+    const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
+
+    // Leftmost column of superblock: bottom-left pixels may be in the left
+    // and/or bottom-left superblocks. But only the left superblock is
+    // available, so check if all required pixels fall in that superblock.
+    if (blk_col_in_sb == 0) {
+      const int blk_start_row_off =
+          blk_row_in_sb << (bh_in_mi_log2 + MI_SIZE_LOG2 - MI_SIZE_LOG2) >>
+          ss_y;
+      const int row_off_in_sb = blk_start_row_off + row_off;
+      const int sb_height_unit = sb_mi_size >> ss_y;
+      return row_off_in_sb + bottom_left_count_unit < sb_height_unit;
+    }
+
+    // Bottom row of superblock (and not the leftmost column): bottom-left
+    // pixels fall in the bottom superblock, which is not available yet.
+    if (((blk_row_in_sb + 1) << bh_in_mi_log2) >= sb_mi_size) return 0;
+
+    // General case (neither leftmost column nor bottom row): check if the
+    // bottom-left block is coded before the current block.
+    const int this_blk_index =
+        ((blk_row_in_sb + 0) << (MAX_MIB_SIZE_LOG2 - bw_in_mi_log2)) +
+        blk_col_in_sb + 0;
+    const int idx1 = this_blk_index / 8;
+    const int idx2 = this_blk_index % 8;
+    const uint8_t *has_bl_table = get_has_bl_table(partition, bsize);
+    return (has_bl_table[idx1] >> idx2) & 1;
+  }
+}
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
+                              const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL];
+static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL];
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
+                                   const uint16_t *above, const uint16_t *left,
+                                   int bd);
+static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL];
+static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL];
+#endif
+
+static void init_intra_predictors_internal(void) {
+  assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES);
+
+#define INIT_RECTANGULAR(p, type)             \
+  p[TX_4X8] = aom_##type##_predictor_4x8;     \
+  p[TX_8X4] = aom_##type##_predictor_8x4;     \
+  p[TX_8X16] = aom_##type##_predictor_8x16;   \
+  p[TX_16X8] = aom_##type##_predictor_16x8;   \
+  p[TX_16X32] = aom_##type##_predictor_16x32; \
+  p[TX_32X16] = aom_##type##_predictor_32x16; \
+  p[TX_32X64] = aom_##type##_predictor_32x64; \
+  p[TX_64X32] = aom_##type##_predictor_64x32; \
+  p[TX_4X16] = aom_##type##_predictor_4x16;   \
+  p[TX_16X4] = aom_##type##_predictor_16x4;   \
+  p[TX_8X32] = aom_##type##_predictor_8x32;   \
+  p[TX_32X8] = aom_##type##_predictor_32x8;   \
+  p[TX_16X64] = aom_##type##_predictor_16x64; \
+  p[TX_64X16] = aom_##type##_predictor_64x16;
+
+#define INIT_NO_4X4(p, type)                  \
+  p[TX_8X8] = aom_##type##_predictor_8x8;     \
+  p[TX_16X16] = aom_##type##_predictor_16x16; \
+  p[TX_32X32] = aom_##type##_predictor_32x32; \
+  p[TX_64X64] = aom_##type##_predictor_64x64; \
+  INIT_RECTANGULAR(p, type)
+
+#define INIT_ALL_SIZES(p, type)           \
+  p[TX_4X4] = aom_##type##_predictor_4x4; \
+  INIT_NO_4X4(p, type)
+
+  INIT_ALL_SIZES(pred[V_PRED], v);
+  INIT_ALL_SIZES(pred[H_PRED], h);
+  INIT_ALL_SIZES(pred[PAETH_PRED], paeth);
+  INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth);
+  INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v);
+  INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h);
+  INIT_ALL_SIZES(dc_pred[0][0], dc_128);
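+  // dc_pred is indexed as [have_left][have_above] (see the DC_PRED dispatch
+  // in build_intra_predictors below), so dc_128 serves blocks with neither
+  // neighbor available.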
INIT_ALL_SIZES(dc_pred[0][1], dc_top); + INIT_ALL_SIZES(dc_pred[1][0], dc_left); + INIT_ALL_SIZES(dc_pred[1][1], dc); +#if CONFIG_AV1_HIGHBITDEPTH + INIT_ALL_SIZES(pred_high[V_PRED], highbd_v); + INIT_ALL_SIZES(pred_high[H_PRED], highbd_h); + INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth); + INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth); + INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v); + INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h); + INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128); + INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top); + INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left); + INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc); +#endif +#undef intra_pred_allsizes +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy) { + int r, c, x, base, shift, val; + + (void)left; + (void)dy; + assert(dy == 1); + assert(dx > 0); + + const int max_base_x = ((bw + bh) - 1) << upsample_above; + const int frac_bits = 6 - upsample_above; + const int base_inc = 1 << upsample_above; + x = dx; + for (r = 0; r < bh; ++r, dst += stride, x += dx) { + base = x >> frac_bits; + shift = ((x << upsample_above) & 0x3F) >> 1; + + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + memset(dst, above[max_base_x], bw * sizeof(dst[0])); + dst += stride; + } + return; + } + + for (c = 0; c < bw; ++c, base += base_inc) { + if (base < max_base_x) { + val = above[base] * (32 - shift) + above[base + 1] * shift; + dst[c] = ROUND_POWER_OF_TWO(val, 5); + } else { + dst[c] = above[max_base_x]; + } + } + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) { + assert(dx > 0); + assert(dy > 0); + + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + (void)min_base_y; + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + for (int r = 0; r < bh; ++r) { + for (int c = 0; c < bw; ++c) { + int val; + int y = r + 1; + int x = (c << 6) - y * dx; + const int base_x = x >> frac_bits_x; + if (base_x >= min_base_x) { + const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } else { + x = c + 1; + y = (r << 6) - x * dy; + const int base_y = y >> frac_bits_y; + assert(base_y >= min_base_y); + const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } + dst[c] = val; + } + dst += stride; + } +} + +// Directional prediction, zone 3: 180 < angle < 270 +void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { + int r, c, y, base, shift, val; + + (void)above; + (void)dx; + + assert(dx == 1); + assert(dy > 0); + + const int max_base_y = (bw + bh - 1) << upsample_left; + const int frac_bits = 6 - upsample_left; + const int base_inc = 1 << upsample_left; + y = dy; + for (c = 0; c < bw; ++c, y += dy) { + base = y >> frac_bits; + shift = ((y << upsample_left) & 0x3F) >> 1; + + for (r = 0; r < bh; ++r, base += base_inc) { + if (base < max_base_y) { + val = 
left[base] * (32 - shift) + left[base + 1] * shift; + dst[r * stride + c] = val = ROUND_POWER_OF_TWO(val, 5); + } else { + for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y]; + break; + } + } + } +} + +static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int angle) { + const int dx = av1_get_dx(angle); + const int dy = av1_get_dy(angle); + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + assert(angle > 0 && angle < 270); + + if (angle > 0 && angle < 90) { + av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx, + dy); + } else if (angle > 90 && angle < 180) { + av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above, + upsample_left, dx, dy); + } else if (angle > 180 && angle < 270) { + av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx, + dy); + } else if (angle == 90) { + pred[V_PRED][tx_size](dst, stride, above, left); + } else if (angle == 180) { + pred[H_PRED][tx_size](dst, stride, above, left); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +// Directional prediction, zone 1: 0 < angle < 90 +void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int dx, int dy, int bd) { + int r, c, x, base, shift, val; + + (void)left; + (void)dy; + (void)bd; + assert(dy == 1); + assert(dx > 0); + + const int max_base_x = ((bw + bh) - 1) << upsample_above; + const int frac_bits = 6 - upsample_above; + const int base_inc = 1 << upsample_above; + x = dx; + for (r = 0; r < bh; ++r, dst += stride, x += dx) { + base = x >> frac_bits; + shift = ((x << upsample_above) & 0x3F) >> 1; + + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + aom_memset16(dst, above[max_base_x], bw); + dst += stride; + } + return; + } + + for (c = 0; c < bw; ++c, base += base_inc) { + if (base < max_base_x) { + val = above[base] * (32 - shift) + above[base + 1] * shift; + dst[c] = ROUND_POWER_OF_TWO(val, 5); + } else { + dst[c] = above[max_base_x]; + } + } + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + (void)min_base_y; + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + for (int r = 0; r < bh; ++r) { + for (int c = 0; c < bw; ++c) { + int val; + int y = r + 1; + int x = (c << 6) - y * dx; + const int base_x = x >> frac_bits_x; + if (base_x >= min_base_x) { + const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1; + val = above[base_x] * (32 - shift) + above[base_x + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } else { + x = c + 1; + y = (r << 6) - x * dy; + const int base_y = y >> frac_bits_y; + assert(base_y >= min_base_y); + const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1; + val = left[base_y] * (32 - shift) + left[base_y + 1] * shift; + val = ROUND_POWER_OF_TWO(val, 5); + } + dst[c] = val; + } + dst += stride; + } +} + +// Directional prediction, zone 3: 180 < angle < 270 +void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int 
upsample_left, + int dx, int dy, int bd) { + int r, c, y, base, shift, val; + + (void)above; + (void)dx; + (void)bd; + assert(dx == 1); + assert(dy > 0); + + const int max_base_y = (bw + bh - 1) << upsample_left; + const int frac_bits = 6 - upsample_left; + const int base_inc = 1 << upsample_left; + y = dy; + for (c = 0; c < bw; ++c, y += dy) { + base = y >> frac_bits; + shift = ((y << upsample_left) & 0x3F) >> 1; + + for (r = 0; r < bh; ++r, base += base_inc) { + if (base < max_base_y) { + val = left[base] * (32 - shift) + left[base + 1] * shift; + dst[r * stride + c] = ROUND_POWER_OF_TWO(val, 5); + } else { + for (; r < bh; ++r) dst[r * stride + c] = left[max_base_y]; + break; + } + } + } +} + +static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int angle, int bd) { + const int dx = av1_get_dx(angle); + const int dy = av1_get_dy(angle); + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + assert(angle > 0 && angle < 270); + + if (angle > 0 && angle < 90) { + av1_highbd_dr_prediction_z1(dst, stride, bw, bh, above, left, + upsample_above, dx, dy, bd); + } else if (angle > 90 && angle < 180) { + av1_highbd_dr_prediction_z2(dst, stride, bw, bh, above, left, + upsample_above, upsample_left, dx, dy, bd); + } else if (angle > 180 && angle < 270) { + av1_highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, + dx, dy, bd); + } else if (angle == 90) { + pred_high[V_PRED][tx_size](dst, stride, above, left, bd); + } else if (angle == 180) { + pred_high[H_PRED][tx_size](dst, stride, above, left, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +DECLARE_ALIGNED(16, const int8_t, + av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = { + { + { -6, 10, 0, 0, 0, 12, 0, 0 }, + { -5, 2, 10, 0, 0, 9, 0, 0 }, + { -3, 1, 1, 10, 0, 7, 0, 0 }, + { -3, 1, 1, 2, 10, 5, 0, 0 }, + { -4, 6, 0, 0, 0, 2, 12, 0 }, + { -3, 2, 6, 0, 0, 2, 9, 0 }, + { -3, 2, 2, 6, 0, 2, 7, 0 }, + { -3, 1, 2, 2, 6, 3, 5, 0 }, + }, + { + { -10, 16, 0, 0, 0, 10, 0, 0 }, + { -6, 0, 16, 0, 0, 6, 0, 0 }, + { -4, 0, 0, 16, 0, 4, 0, 0 }, + { -2, 0, 0, 0, 16, 2, 0, 0 }, + { -10, 16, 0, 0, 0, 0, 10, 0 }, + { -6, 0, 16, 0, 0, 0, 6, 0 }, + { -4, 0, 0, 16, 0, 0, 4, 0 }, + { -2, 0, 0, 0, 16, 0, 2, 0 }, + }, + { + { -8, 8, 0, 0, 0, 16, 0, 0 }, + { -8, 0, 8, 0, 0, 16, 0, 0 }, + { -8, 0, 0, 8, 0, 16, 0, 0 }, + { -8, 0, 0, 0, 8, 16, 0, 0 }, + { -4, 4, 0, 0, 0, 0, 16, 0 }, + { -4, 0, 4, 0, 0, 0, 16, 0 }, + { -4, 0, 0, 4, 0, 0, 16, 0 }, + { -4, 0, 0, 0, 4, 0, 16, 0 }, + }, + { + { -2, 8, 0, 0, 0, 10, 0, 0 }, + { -1, 3, 8, 0, 0, 6, 0, 0 }, + { -1, 2, 3, 8, 0, 4, 0, 0 }, + { 0, 1, 2, 3, 8, 2, 0, 0 }, + { -1, 4, 0, 0, 0, 3, 10, 0 }, + { -1, 3, 4, 0, 0, 4, 6, 0 }, + { -1, 2, 3, 4, 0, 4, 4, 0 }, + { -1, 2, 2, 3, 4, 3, 3, 0 }, + }, + { + { -12, 14, 0, 0, 0, 14, 0, 0 }, + { -10, 0, 14, 0, 0, 12, 0, 0 }, + { -9, 0, 0, 14, 0, 11, 0, 0 }, + { -8, 0, 0, 0, 14, 10, 0, 0 }, + { -10, 12, 0, 0, 0, 0, 14, 0 }, + { -9, 1, 12, 0, 0, 0, 12, 0 }, + { -8, 0, 0, 12, 0, 1, 11, 0 }, + { -7, 0, 0, 1, 12, 1, 9, 0 }, + }, +}; + +void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint8_t *above, + const uint8_t *left, int mode) { + int r, c; + uint8_t buffer[33][33]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + assert(bw <= 32 && bh <= 32); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * 
sizeof(uint8_t)); + + for (r = 1; r < bh + 1; r += 2) + for (c = 1; c < bw + 1; c += 4) { + const uint8_t p0 = buffer[r - 1][c - 1]; + const uint8_t p1 = buffer[r - 1][c]; + const uint8_t p2 = buffer[r - 1][c + 1]; + const uint8_t p3 = buffer[r - 1][c + 2]; + const uint8_t p4 = buffer[r - 1][c + 3]; + const uint8_t p5 = buffer[r][c - 1]; + const uint8_t p6 = buffer[r + 1][c - 1]; + for (int k = 0; k < 8; ++k) { + int r_offset = k >> 2; + int c_offset = k & 0x03; + buffer[r + r_offset][c + c_offset] = + clip_pixel(ROUND_POWER_OF_TWO_SIGNED( + av1_filter_intra_taps[mode][k][0] * p0 + + av1_filter_intra_taps[mode][k][1] * p1 + + av1_filter_intra_taps[mode][k][2] * p2 + + av1_filter_intra_taps[mode][k][3] * p3 + + av1_filter_intra_taps[mode][k][4] * p4 + + av1_filter_intra_taps[mode][k][5] * p5 + + av1_filter_intra_taps[mode][k][6] * p6, + FILTER_INTRA_SCALE_BITS)); + } + } + + for (r = 0; r < bh; ++r) { + memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); + dst += stride; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, + const uint16_t *above, + const uint16_t *left, int mode, + int bd) { + int r, c; + uint16_t buffer[33][33]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + assert(bw <= 32 && bh <= 32); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0])); + + for (r = 1; r < bh + 1; r += 2) + for (c = 1; c < bw + 1; c += 4) { + const uint16_t p0 = buffer[r - 1][c - 1]; + const uint16_t p1 = buffer[r - 1][c]; + const uint16_t p2 = buffer[r - 1][c + 1]; + const uint16_t p3 = buffer[r - 1][c + 2]; + const uint16_t p4 = buffer[r - 1][c + 3]; + const uint16_t p5 = buffer[r][c - 1]; + const uint16_t p6 = buffer[r + 1][c - 1]; + for (int k = 0; k < 8; ++k) { + int r_offset = k >> 2; + int c_offset = k & 0x03; + buffer[r + r_offset][c + c_offset] = + clip_pixel_highbd(ROUND_POWER_OF_TWO_SIGNED( + av1_filter_intra_taps[mode][k][0] * p0 + + av1_filter_intra_taps[mode][k][1] * p1 + + av1_filter_intra_taps[mode][k][2] * p2 + + av1_filter_intra_taps[mode][k][3] * p3 + + av1_filter_intra_taps[mode][k][4] * p4 + + av1_filter_intra_taps[mode][k][5] * p5 + + av1_filter_intra_taps[mode][k][6] * p6, + FILTER_INTRA_SCALE_BITS), + bd); + } + } + + for (r = 0; r < bh; ++r) { + memcpy(dst, &buffer[r + 1][1], bw * sizeof(dst[0])); + dst += stride; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static int is_smooth(const MB_MODE_INFO *mbmi, int plane) { + if (plane == 0) { + const PREDICTION_MODE mode = mbmi->mode; + return (mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || + mode == SMOOTH_H_PRED); + } else { + // uv_mode is not set for inter blocks, so need to explicitly + // detect that case. + if (is_inter_block(mbmi)) return 0; + + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + return (uv_mode == UV_SMOOTH_PRED || uv_mode == UV_SMOOTH_V_PRED || + uv_mode == UV_SMOOTH_H_PRED); + } +} + +static int get_filt_type(const MACROBLOCKD *xd, int plane) { + int ab_sm, le_sm; + + if (plane == 0) { + const MB_MODE_INFO *ab = xd->above_mbmi; + const MB_MODE_INFO *le = xd->left_mbmi; + ab_sm = ab ? is_smooth(ab, plane) : 0; + le_sm = le ? is_smooth(le, plane) : 0; + } else { + const MB_MODE_INFO *ab = xd->chroma_above_mbmi; + const MB_MODE_INFO *le = xd->chroma_left_mbmi; + ab_sm = ab ? is_smooth(ab, plane) : 0; + le_sm = le ? is_smooth(le, plane) : 0; + } + + return (ab_sm || le_sm) ? 
1 : 0; +} + +static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) { + const int d = abs(delta); + int strength = 0; + + const int blk_wh = bs0 + bs1; + if (type == 0) { + if (blk_wh <= 8) { + if (d >= 56) strength = 1; + } else if (blk_wh <= 12) { + if (d >= 40) strength = 1; + } else if (blk_wh <= 16) { + if (d >= 40) strength = 1; + } else if (blk_wh <= 24) { + if (d >= 8) strength = 1; + if (d >= 16) strength = 2; + if (d >= 32) strength = 3; + } else if (blk_wh <= 32) { + if (d >= 1) strength = 1; + if (d >= 4) strength = 2; + if (d >= 32) strength = 3; + } else { + if (d >= 1) strength = 3; + } + } else { + if (blk_wh <= 8) { + if (d >= 40) strength = 1; + if (d >= 64) strength = 2; + } else if (blk_wh <= 16) { + if (d >= 20) strength = 1; + if (d >= 48) strength = 2; + } else if (blk_wh <= 24) { + if (d >= 4) strength = 3; + } else { + if (d >= 1) strength = 3; + } + } + return strength; +} + +void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) { + if (!strength) return; + + const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } }; + const int filt = strength - 1; + uint8_t edge[129]; + + memcpy(edge, p, sz * sizeof(*p)); + for (int i = 1; i < sz; i++) { + int s = 0; + for (int j = 0; j < INTRA_EDGE_TAPS; j++) { + int k = i - 2 + j; + k = (k < 0) ? 0 : k; + k = (k > sz - 1) ? sz - 1 : k; + s += edge[k] * kernel[filt][j]; + } + s = (s + 8) >> 4; + p[i] = s; + } +} + +static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) { + const int kernel[3] = { 5, 6, 5 }; + + int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) + + (p_above[0] * kernel[2]); + s = (s + 8) >> 4; + p_above[-1] = s; + p_left[-1] = s; +} + +void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) { + if (!strength) return; + + const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = { { 0, 4, 8, 4, 0 }, + { 0, 5, 6, 5, 0 }, + { 2, 4, 4, 4, 2 } }; + const int filt = strength - 1; + uint16_t edge[129]; + + memcpy(edge, p, sz * sizeof(*p)); + for (int i = 1; i < sz; i++) { + int s = 0; + for (int j = 0; j < INTRA_EDGE_TAPS; j++) { + int k = i - 2 + j; + k = (k < 0) ? 0 : k; + k = (k > sz - 1) ? 
sz - 1 : k; + s += edge[k] * kernel[filt][j]; + } + s = (s + 8) >> 4; + p[i] = s; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) { + const int kernel[3] = { 5, 6, 5 }; + + int s = (p_left[0] * kernel[0]) + (p_above[-1] * kernel[1]) + + (p_above[0] * kernel[2]); + s = (s + 8) >> 4; + p_above[-1] = s; + p_left[-1] = s; +} +#endif + +void av1_upsample_intra_edge_c(uint8_t *p, int sz) { + // interpolate half-sample positions + assert(sz <= MAX_UPSAMPLE_SZ); + + uint8_t in[MAX_UPSAMPLE_SZ + 3]; + // copy p[-1..(sz-1)] and extend first and last samples + in[0] = p[-1]; + in[1] = p[-1]; + for (int i = 0; i < sz; i++) { + in[i + 2] = p[i]; + } + in[sz + 2] = p[sz - 1]; + + // interpolate half-sample edge positions + p[-2] = in[0]; + for (int i = 0; i < sz; i++) { + int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3]; + s = clip_pixel((s + 8) >> 4); + p[2 * i - 1] = s; + p[2 * i] = in[i + 2]; + } +} + +void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) { + // interpolate half-sample positions + assert(sz <= MAX_UPSAMPLE_SZ); + + uint16_t in[MAX_UPSAMPLE_SZ + 3]; + // copy p[-1..(sz-1)] and extend first and last samples + in[0] = p[-1]; + in[1] = p[-1]; + for (int i = 0; i < sz; i++) { + in[i + 2] = p[i]; + } + in[sz + 2] = p[sz - 1]; + + // interpolate half-sample edge positions + p[-2] = in[0]; + for (int i = 0; i < sz; i++) { + int s = -in[i] + (9 * in[i + 1]) + (9 * in[i + 2]) - in[i + 3]; + s = (s + 8) >> 4; + s = clip_pixel_highbd(s, bd); + p[2 * i - 1] = s; + p[2 * i] = in[i + 2]; + } +} +#if CONFIG_AV1_HIGHBITDEPTH +static void build_intra_predictors_high( + const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8, + int dst_stride, PREDICTION_MODE mode, int angle_delta, + FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size, + int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px, + int n_bottomleft_px, int plane) { + int i; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + DECLARE_ALIGNED(16, uint16_t, left_data[MAX_TX_SIZE * 2 + 32]); + DECLARE_ALIGNED(16, uint16_t, above_data[MAX_TX_SIZE * 2 + 32]); + uint16_t *const above_row = above_data + 16; + uint16_t *const left_col = left_data + 16; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + int need_left = extend_modes[mode] & NEED_LEFT; + int need_above = extend_modes[mode] & NEED_ABOVE; + int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + const uint16_t *above_ref = ref - ref_stride; + const uint16_t *left_ref = ref - 1; + int p_angle = 0; + const int is_dr_mode = av1_is_directional_mode(mode); + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + int base = 128 << (xd->bd - 8); + + // The default values if ref pixels are not available: + // base base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1 + // base+1 A B .. Y Z + // base+1 C D .. W X + // base+1 E F .. U V + // base+1 G H .. 
S T T T T T + + if (is_dr_mode) { + p_angle = mode_to_angle_map[mode] + angle_delta; + if (p_angle <= 90) + need_above = 1, need_left = 0, need_above_left = 1; + else if (p_angle < 180) + need_above = 1, need_left = 1, need_above_left = 1; + else + need_above = 0, need_left = 1, need_above_left = 1; + } + if (use_filter_intra) need_left = need_above = need_above_left = 1; + + assert(n_top_px >= 0); + assert(n_topright_px >= 0); + assert(n_left_px >= 0); + assert(n_bottomleft_px >= 0); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : base + 1; + } else { + val = (n_left_px > 0) ? left_ref[0] : base - 1; + } + for (i = 0; i < txhpx; ++i) { + aom_memset16(dst, val, txwpx); + dst += dst_stride; + } + return; + } + + // NEED_LEFT + if (need_left) { + int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT; + if (use_filter_intra) need_bottom = 0; + if (is_dr_mode) need_bottom = p_angle > 180; + const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0); + i = 0; + if (n_left_px > 0) { + for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (need_bottom && n_bottomleft_px > 0) { + assert(i == txhpx); + for (; i < txhpx + n_bottomleft_px; i++) + left_col[i] = left_ref[i * ref_stride]; + } + if (i < num_left_pixels_needed) + aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i); + } else { + if (n_top_px > 0) { + aom_memset16(left_col, above_ref[0], num_left_pixels_needed); + } else { + aom_memset16(left_col, base + 1, num_left_pixels_needed); + } + } + } + + // NEED_ABOVE + if (need_above) { + int need_right = extend_modes[mode] & NEED_ABOVERIGHT; + if (use_filter_intra) need_right = 0; + if (is_dr_mode) need_right = p_angle < 90; + const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0])); + i = n_top_px; + if (need_right && n_topright_px > 0) { + assert(n_top_px == txwpx); + memcpy(above_row + txwpx, above_ref + txwpx, + n_topright_px * sizeof(above_ref[0])); + i += n_topright_px; + } + if (i < num_top_pixels_needed) + aom_memset16(&above_row[i], above_row[i - 1], + num_top_pixels_needed - i); + } else { + if (n_left_px > 0) { + aom_memset16(above_row, left_ref[0], num_top_pixels_needed); + } else { + aom_memset16(above_row, base - 1, num_top_pixels_needed); + } + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = base; + } + left_col[-1] = above_row[-1]; + } + + if (use_filter_intra) { + highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col, + filter_intra_mode, xd->bd); + return; + } + + if (is_dr_mode) { + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + const int filt_type = get_filt_type(xd, plane); + if (p_angle != 90 && p_angle != 180) { + const int ab_le = need_above_left ? 1 : 0; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + filter_intra_edge_corner_high(above_row, left_col); + } + if (need_above && n_top_px > 0) { + const int strength = + intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type); + const int n_px = n_top_px + ab_le + (need_right ? 
txhpx : 0); + av1_filter_intra_edge_high(above_row - ab_le, n_px, strength); + } + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, filt_type); + const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); + av1_filter_intra_edge_high(left_col - ab_le, n_px, strength); + } + } + upsample_above = + av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_upsample_intra_edge_high(above_row, n_px, xd->bd); + } + upsample_left = + av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_upsample_intra_edge_high(left_col, n_px, xd->bd); + } + } + highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, + upsample_above, upsample_left, p_angle, xd->bd); + return; + } + + // predict + if (mode == DC_PRED) { + dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size]( + dst, dst_stride, above_row, left_col, xd->bd); + } else { + pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, + int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, int angle_delta, + FILTER_INTRA_MODE filter_intra_mode, + TX_SIZE tx_size, int disable_edge_filter, + int n_top_px, int n_topright_px, + int n_left_px, int n_bottomleft_px, + int plane) { + int i; + const uint8_t *above_ref = ref - ref_stride; + const uint8_t *left_ref = ref - 1; + DECLARE_ALIGNED(16, uint8_t, left_data[MAX_TX_SIZE * 2 + 32]); + DECLARE_ALIGNED(16, uint8_t, above_data[MAX_TX_SIZE * 2 + 32]); + uint8_t *const above_row = above_data + 16; + uint8_t *const left_col = left_data + 16; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + int need_left = extend_modes[mode] & NEED_LEFT; + int need_above = extend_modes[mode] & NEED_ABOVE; + int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + int p_angle = 0; + const int is_dr_mode = av1_is_directional_mode(mode); + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + + // The default values if ref pixels are not available: + // 128 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + // .. + + if (is_dr_mode) { + p_angle = mode_to_angle_map[mode] + angle_delta; + if (p_angle <= 90) + need_above = 1, need_left = 0, need_above_left = 1; + else if (p_angle < 180) + need_above = 1, need_left = 1, need_above_left = 1; + else + need_above = 0, need_left = 1, need_above_left = 1; + } + if (use_filter_intra) need_left = need_above = need_above_left = 1; + + assert(n_top_px >= 0); + assert(n_topright_px >= 0); + assert(n_left_px >= 0); + assert(n_bottomleft_px >= 0); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : 129; + } else { + val = (n_left_px > 0) ? 
left_ref[0] : 127;
+    }
+    for (i = 0; i < txhpx; ++i) {
+      memset(dst, val, txwpx);
+      dst += dst_stride;
+    }
+    return;
+  }
+
+  // NEED_LEFT
+  if (need_left) {
+    int need_bottom = extend_modes[mode] & NEED_BOTTOMLEFT;
+    if (use_filter_intra) need_bottom = 0;
+    if (is_dr_mode) need_bottom = p_angle > 180;
+    // The AVX2 dr_prediction_z2 may read at most 3 extra bytes, because the
+    // AVX2 masked load works at dword granularity. Initialize 3 extra bytes
+    // here to keep valgrind from complaining.
+    const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 3);
+    i = 0;
+    if (n_left_px > 0) {
+      for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
+      if (need_bottom && n_bottomleft_px > 0) {
+        assert(i == txhpx);
+        for (; i < txhpx + n_bottomleft_px; i++)
+          left_col[i] = left_ref[i * ref_stride];
+      }
+      if (i < num_left_pixels_needed)
+        memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
+    } else {
+      if (n_top_px > 0) {
+        memset(left_col, above_ref[0], num_left_pixels_needed);
+      } else {
+        memset(left_col, 129, num_left_pixels_needed);
+      }
+    }
+  }
+
+  // NEED_ABOVE
+  if (need_above) {
+    int need_right = extend_modes[mode] & NEED_ABOVERIGHT;
+    if (use_filter_intra) need_right = 0;
+    if (is_dr_mode) need_right = p_angle < 90;
+    const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
+    if (n_top_px > 0) {
+      memcpy(above_row, above_ref, n_top_px);
+      i = n_top_px;
+      if (need_right && n_topright_px > 0) {
+        assert(n_top_px == txwpx);
+        memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px);
+        i += n_topright_px;
+      }
+      if (i < num_top_pixels_needed)
+        memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
+    } else {
+      if (n_left_px > 0) {
+        memset(above_row, left_ref[0], num_top_pixels_needed);
+      } else {
+        memset(above_row, 127, num_top_pixels_needed);
+      }
+    }
+  }
+
+  if (need_above_left) {
+    if (n_top_px > 0 && n_left_px > 0) {
+      above_row[-1] = above_ref[-1];
+    } else if (n_top_px > 0) {
+      above_row[-1] = above_ref[0];
+    } else if (n_left_px > 0) {
+      above_row[-1] = left_ref[0];
+    } else {
+      above_row[-1] = 128;
+    }
+    left_col[-1] = above_row[-1];
+  }
+
+  if (use_filter_intra) {
+    av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+                               filter_intra_mode);
+    return;
+  }
+
+  if (is_dr_mode) {
+    int upsample_above = 0;
+    int upsample_left = 0;
+    if (!disable_edge_filter) {
+      const int need_right = p_angle < 90;
+      const int need_bottom = p_angle > 180;
+      const int filt_type = get_filt_type(xd, plane);
+      if (p_angle != 90 && p_angle != 180) {
+        const int ab_le = need_above_left ? 1 : 0;
+        if (need_above && need_left && (txwpx + txhpx >= 24)) {
+          filter_intra_edge_corner(above_row, left_col);
+        }
+        if (need_above && n_top_px > 0) {
+          const int strength =
+              intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
+          const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+          av1_filter_intra_edge(above_row - ab_le, n_px, strength);
+        }
+        if (need_left && n_left_px > 0) {
+          const int strength = intra_edge_filter_strength(
+              txhpx, txwpx, p_angle - 180, filt_type);
+          const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+          av1_filter_intra_edge(left_col - ab_le, n_px, strength);
+        }
+      }
+      upsample_above =
+          av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+      if (need_above && upsample_above) {
+        const int n_px = txwpx + (need_right ?
txhpx : 0); + av1_upsample_intra_edge(above_row, n_px); + } + upsample_left = + av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_upsample_intra_edge(left_col, n_px); + } + } + dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above, + upsample_left, p_angle); + return; + } + + // predict + if (mode == DC_PRED) { + dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row, + left_col); + } else { + pred[mode][tx_size](dst, dst_stride, above_row, left_col); + } +} + +static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x, + int subsampling_y) { + assert(subsampling_x >= 0 && subsampling_x < 2); + assert(subsampling_y >= 0 && subsampling_y < 2); + BLOCK_SIZE bs = bsize; + switch (bsize) { + case BLOCK_4X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_4X8: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X8; + else if (subsampling_y == 1) + bs = BLOCK_4X8; + break; + case BLOCK_8X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X8; + else if (subsampling_x == 1) + bs = BLOCK_8X4; + else if (subsampling_y == 1) + bs = BLOCK_8X8; + break; + case BLOCK_4X16: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_8X16; + else if (subsampling_x == 1) + bs = BLOCK_8X16; + else if (subsampling_y == 1) + bs = BLOCK_4X16; + break; + case BLOCK_16X4: + if (subsampling_x == 1 && subsampling_y == 1) + bs = BLOCK_16X8; + else if (subsampling_x == 1) + bs = BLOCK_16X4; + else if (subsampling_y == 1) + bs = BLOCK_16X8; + break; + default: break; + } + return bs; +} + +void av1_predict_intra_block( + const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx, + TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette, + FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride, + uint8_t *dst, int dst_stride, int col_off, int row_off, int plane) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + const int x = col_off << MI_SIZE_LOG2; + const int y = row_off << MI_SIZE_LOG2; + + if (use_palette) { + int r, c; + const uint8_t *const map = xd->plane[plane != 0].color_index_map + + xd->color_index_map_offset[plane != 0]; + const uint16_t *const palette = + mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE; + if (is_cur_buf_hbd(xd)) { + uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (r = 0; r < txhpx; ++r) { + for (c = 0; c < txwpx; ++c) { + dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]]; + } + } + } else { + for (r = 0; r < txhpx; ++r) { + for (c = 0; c < txwpx; ++c) { + dst[r * dst_stride + c] = + (uint8_t)palette[map[(r + y) * wpx + c + x]]; + } + } + } + return; + } + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const int have_top = + row_off || (ss_y ? xd->chroma_up_available : xd->up_available); + const int have_left = + col_off || (ss_x ? 
xd->chroma_left_available : xd->left_available); + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + const int xr_chr_offset = 0; + const int yd_chr_offset = 0; + + // Distance between the right edge of this prediction block to + // the frame right edge + const int xr = + (xd->mb_to_right_edge >> (3 + ss_x)) + (wpx - x - txwpx) - xr_chr_offset; + // Distance between the bottom edge of this prediction block to + // the frame bottom edge + const int yd = + (xd->mb_to_bottom_edge >> (3 + ss_y)) + (hpx - y - txhpx) - yd_chr_offset; + const int right_available = + mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end; + const int bottom_available = + (yd > 0) && (mi_row + ((row_off + txh) << ss_y) < xd->tile.mi_row_end); + + const PARTITION_TYPE partition = mbmi->partition; + + BLOCK_SIZE bsize = mbmi->sb_type; + // force 4x4 chroma component block size. + if (ss_x || ss_y) { + bsize = scale_chroma_bsize(bsize, ss_x, ss_y); + } + + const int have_top_right = + has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available, + partition, tx_size, row_off, col_off, ss_x, ss_y); + const int have_bottom_left = + has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left, + partition, tx_size, row_off, col_off, ss_x, ss_y); + + const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + build_intra_predictors_high( + xd, ref, ref_stride, dst, dst_stride, mode, angle_delta, + filter_intra_mode, tx_size, disable_edge_filter, + have_top ? AOMMIN(txwpx, xr + txwpx) : 0, + have_top_right ? AOMMIN(txwpx, xr) : 0, + have_left ? AOMMIN(txhpx, yd + txhpx) : 0, + have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane); + return; + } +#endif + build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, + angle_delta, filter_intra_mode, tx_size, + disable_edge_filter, + have_top ? AOMMIN(txwpx, xr + txwpx) : 0, + have_top_right ? AOMMIN(txwpx, xr) : 0, + have_left ? AOMMIN(txhpx, yd + txhpx) : 0, + have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane); +} + +void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, int blk_col, int blk_row, + TX_SIZE tx_size) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + const PREDICTION_MODE mode = + (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode); + const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0; + const FILTER_INTRA_MODE filter_intra_mode = + (plane == AOM_PLANE_Y && mbmi->filter_intra_mode_info.use_filter_intra) + ? 
mbmi->filter_intra_mode_info.filter_intra_mode + : FILTER_INTRA_MODES; + const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP; + + if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) { +#if CONFIG_DEBUG + assert(is_cfl_allowed(xd)); + const BLOCK_SIZE plane_bsize = get_plane_block_size( + mbmi->sb_type, pd->subsampling_x, pd->subsampling_y); + (void)plane_bsize; + assert(plane_bsize < BLOCK_SIZES_ALL); + if (!xd->lossless[mbmi->segment_id]) { + assert(blk_col == 0); + assert(blk_row == 0); + assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]); + assert(block_size_high[plane_bsize] == tx_size_high[tx_size]); + } +#endif + CFL_CTX *const cfl = &xd->cfl; + CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane); + if (cfl->dc_pred_is_cached[pred_plane] == 0) { + av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode, + angle_delta, use_palette, filter_intra_mode, dst, + dst_stride, dst, dst_stride, blk_col, blk_row, + plane); + if (cfl->use_dc_pred_cache) { + cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]); + cfl->dc_pred_is_cached[pred_plane] = 1; + } + } else { + cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane); + } + cfl_predict_block(xd, dst, dst_stride, tx_size, plane); + return; + } + av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode, + angle_delta, use_palette, filter_intra_mode, dst, + dst_stride, dst, dst_stride, blk_col, blk_row, plane); +} + +void av1_init_intra_predictors(void) { + aom_once(init_intra_predictors_internal); +} diff --git a/libs/libaom/src/av1/common/reconintra.h b/libs/libaom/src/av1/common/reconintra.h new file mode 100644 index 000000000..9d203569c --- /dev/null +++ b/libs/libaom/src/av1/common/reconintra.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_COMMON_RECONINTRA_H_
+#define AOM_AV1_COMMON_RECONINTRA_H_
+
+#include <stdlib.h>
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_init_intra_predictors(void);
+void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    int plane, int blk_col, int blk_row,
+                                    TX_SIZE tx_size);
+void av1_predict_intra_block(
+    const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
+    TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
+    FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride,
+    uint8_t *dst, int dst_stride, int col_off, int row_off, int plane);
+
+// Mapping of interintra to intra mode for use in the intra component
+static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = {
+  DC_PRED, V_PRED, H_PRED, SMOOTH_PRED
+};
+
+// Mapping of intra mode to the interintra mode
+static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = {
+  II_DC_PRED, II_V_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_V_PRED,
+  II_H_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_SMOOTH_PRED
+};
+
+#define FILTER_INTRA_SCALE_BITS 4
+
+static INLINE int av1_is_directional_mode(PREDICTION_MODE mode) {
+  return mode >= V_PRED && mode <= D67_PRED;
+}
+
+static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) {
+  return bsize >= BLOCK_8X8;
+}
+
+static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) {
+  return frame_is_intra_only(cm) && cm->features.allow_screen_content_tools &&
+         cm->features.allow_intrabc;
+}
+
+static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm,
+                                                 BLOCK_SIZE bs) {
+  if (!cm->seq_params.enable_filter_intra || bs == BLOCK_INVALID) return 0;
+
+  return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32;
+}
+
+static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm,
+                                           const MB_MODE_INFO *mbmi) {
+  return mbmi->mode == DC_PRED &&
+         mbmi->palette_mode_info.palette_size[0] == 0 &&
+         av1_filter_intra_allowed_bsize(cm, mbmi->sb_type);
+}
+
+extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8];
+
+static const int16_t dr_intra_derivative[90] = {
+  // More evenly spread out angles and limited to 10-bit
+  // Values that are 0 will never be used
+  //                    Approx angle
+  0,    0, 0,        //
+  1023, 0, 0,        // 3, ...
+  547,  0, 0,        // 6, ...
+  372,  0, 0, 0, 0,  // 9, ...
+  273,  0, 0,        // 14, ...
+  215,  0, 0,        // 17, ...
+  178,  0, 0,        // 20, ...
+  151,  0, 0,        // 23, ... (113 & 203 are base angles)
+  132,  0, 0,        // 26, ...
+  116,  0, 0,        // 29, ...
+  102,  0, 0, 0,     // 32, ...
+  90,   0, 0,        // 36, ...
+  80,   0, 0,        // 39, ...
+  71,   0, 0,        // 42, ...
+  64,   0, 0,        // 45, ... (45 & 135 are base angles)
+  57,   0, 0,        // 48, ...
+  51,   0, 0,        // 51, ...
+  45,   0, 0, 0,     // 54, ...
+  40,   0, 0,        // 58, ...
+  35,   0, 0,        // 61, ...
+  31,   0, 0,        // 64, ...
+  27,   0, 0,        // 67, ... (67 & 157 are base angles)
+  23,   0, 0,        // 70, ...
+  19,   0, 0,        // 73, ...
+  15,   0, 0, 0, 0,  // 76, ...
+  11,   0, 0,        // 81, ...
+  7,    0, 0,        // 84, ...
+  3,    0, 0,        // 87, ...
+};
+
+// Get the shift (up-scaled by 256) in X w.r.t. a unit change in Y.
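+// (Here t denotes the tangent of the prediction angle; the quantized values
+// come from the dr_intra_derivative table above.)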
+// If angle > 0 && angle < 90, dx = -((int)(256 / t));
+// If angle > 90 && angle < 180, dx = (int)(256 / t);
+// If angle > 180 && angle < 270, dx = 1;
+static INLINE int av1_get_dx(int angle) {
+  if (angle > 0 && angle < 90) {
+    return dr_intra_derivative[angle];
+  } else if (angle > 90 && angle < 180) {
+    return dr_intra_derivative[180 - angle];
+  } else {
+    // In this case, we are not really going to use dx. We may return any value.
+    return 1;
+  }
+}
+
+// Get the shift (up-scaled by 256) in Y w.r.t. a unit change in X.
+// If angle > 0 && angle < 90, dy = 1;
+// If angle > 90 && angle < 180, dy = (int)(256 * t);
+// If angle > 180 && angle < 270, dy = -((int)(256 * t));
+static INLINE int av1_get_dy(int angle) {
+  if (angle > 90 && angle < 180) {
+    return dr_intra_derivative[angle - 90];
+  } else if (angle > 180 && angle < 270) {
+    return dr_intra_derivative[270 - angle];
+  } else {
+    // In this case, we are not really going to use dy. We may return any value.
+    return 1;
+  }
+}
+
+static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
+                                              int type) {
+  const int d = abs(delta);
+  const int blk_wh = bs0 + bs1;
+  if (d == 0 || d >= 40) return 0;
+  return type ? (blk_wh <= 8) : (blk_wh <= 16);
+}
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // AOM_AV1_COMMON_RECONINTRA_H_
diff --git a/libs/libaom/src/av1/common/resize.c b/libs/libaom/src/av1/common/resize.c
new file mode 100644
index 000000000..98f28f7b5
--- /dev/null
+++ b/libs/libaom/src/av1/common/resize.c
@@ -0,0 +1,1455 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "av1/common/common.h"
+#include "av1/common/resize.h"
+
+#include "config/aom_scale_rtcd.h"
+
+// Filters for interpolation (0.5-band) - note this also filters integer pels.
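+// (choose_interp_filter() below selects this kernel set when the resize ratio
+// out_length / in_length is below 9/16.)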
+static const InterpKernel filteredinterp_filters500[(1 << RS_SUBPEL_BITS)] = { + { -3, 0, 35, 64, 35, 0, -3, 0 }, { -3, 0, 34, 64, 36, 0, -3, 0 }, + { -3, -1, 34, 64, 36, 1, -3, 0 }, { -3, -1, 33, 64, 37, 1, -3, 0 }, + { -3, -1, 32, 64, 38, 1, -3, 0 }, { -3, -1, 31, 64, 39, 1, -3, 0 }, + { -3, -1, 31, 63, 39, 2, -3, 0 }, { -2, -2, 30, 63, 40, 2, -3, 0 }, + { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 29, 63, 41, 3, -4, 0 }, + { -2, -2, 28, 63, 42, 3, -4, 0 }, { -2, -2, 27, 63, 43, 3, -4, 0 }, + { -2, -3, 27, 63, 43, 4, -4, 0 }, { -2, -3, 26, 62, 44, 5, -4, 0 }, + { -2, -3, 25, 62, 45, 5, -4, 0 }, { -2, -3, 25, 62, 45, 5, -4, 0 }, + { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 23, 61, 47, 6, -4, 0 }, + { -2, -3, 23, 61, 47, 6, -4, 0 }, { -2, -3, 22, 61, 48, 7, -4, -1 }, + { -2, -3, 21, 60, 49, 7, -4, 0 }, { -1, -4, 20, 60, 49, 8, -4, 0 }, + { -1, -4, 20, 60, 50, 8, -4, -1 }, { -1, -4, 19, 59, 51, 9, -4, -1 }, + { -1, -4, 19, 59, 51, 9, -4, -1 }, { -1, -4, 18, 58, 52, 10, -4, -1 }, + { -1, -4, 17, 58, 52, 11, -4, -1 }, { -1, -4, 16, 58, 53, 11, -4, -1 }, + { -1, -4, 16, 57, 53, 12, -4, -1 }, { -1, -4, 15, 57, 54, 12, -4, -1 }, + { -1, -4, 15, 56, 54, 13, -4, -1 }, { -1, -4, 14, 56, 55, 13, -4, -1 }, + { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 13, 55, 56, 14, -4, -1 }, + { -1, -4, 13, 54, 56, 15, -4, -1 }, { -1, -4, 12, 54, 57, 15, -4, -1 }, + { -1, -4, 12, 53, 57, 16, -4, -1 }, { -1, -4, 11, 53, 58, 16, -4, -1 }, + { -1, -4, 11, 52, 58, 17, -4, -1 }, { -1, -4, 10, 52, 58, 18, -4, -1 }, + { -1, -4, 9, 51, 59, 19, -4, -1 }, { -1, -4, 9, 51, 59, 19, -4, -1 }, + { -1, -4, 8, 50, 60, 20, -4, -1 }, { 0, -4, 8, 49, 60, 20, -4, -1 }, + { 0, -4, 7, 49, 60, 21, -3, -2 }, { -1, -4, 7, 48, 61, 22, -3, -2 }, + { 0, -4, 6, 47, 61, 23, -3, -2 }, { 0, -4, 6, 47, 61, 23, -3, -2 }, + { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 5, 45, 62, 25, -3, -2 }, + { 0, -4, 5, 45, 62, 25, -3, -2 }, { 0, -4, 5, 44, 62, 26, -3, -2 }, + { 0, -4, 4, 43, 63, 27, -3, -2 }, { 0, -4, 3, 43, 63, 27, -2, -2 }, + { 0, -4, 3, 42, 63, 28, -2, -2 }, { 0, -4, 3, 41, 63, 29, -2, -2 }, + { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 2, 40, 63, 30, -2, -2 }, + { 0, -3, 2, 39, 63, 31, -1, -3 }, { 0, -3, 1, 39, 64, 31, -1, -3 }, + { 0, -3, 1, 38, 64, 32, -1, -3 }, { 0, -3, 1, 37, 64, 33, -1, -3 }, + { 0, -3, 1, 36, 64, 34, -1, -3 }, { 0, -3, 0, 36, 64, 34, 0, -3 }, +}; + +// Filters for interpolation (0.625-band) - note this also filters integer pels. 
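+// (Selected by choose_interp_filter() when
+// 9/16 <= out_length / in_length < 11/16.)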
+static const InterpKernel filteredinterp_filters625[(1 << RS_SUBPEL_BITS)] = { + { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 31, 80, 34, -8, -1, 1 }, + { -1, -8, 30, 80, 35, -8, -1, 1 }, { -1, -8, 29, 80, 36, -7, -2, 1 }, + { -1, -8, 28, 80, 37, -7, -2, 1 }, { -1, -8, 27, 80, 38, -7, -2, 1 }, + { 0, -8, 26, 79, 39, -7, -2, 1 }, { 0, -8, 25, 79, 40, -7, -2, 1 }, + { 0, -8, 24, 79, 41, -7, -2, 1 }, { 0, -8, 23, 78, 42, -6, -2, 1 }, + { 0, -8, 22, 78, 43, -6, -2, 1 }, { 0, -8, 21, 78, 44, -6, -2, 1 }, + { 0, -8, 20, 78, 45, -5, -3, 1 }, { 0, -8, 19, 77, 47, -5, -3, 1 }, + { 0, -8, 18, 77, 48, -5, -3, 1 }, { 0, -8, 17, 77, 49, -5, -3, 1 }, + { 0, -8, 16, 76, 50, -4, -3, 1 }, { 0, -8, 15, 76, 51, -4, -3, 1 }, + { 0, -8, 15, 75, 52, -3, -4, 1 }, { 0, -7, 14, 74, 53, -3, -4, 1 }, + { 0, -7, 13, 74, 54, -3, -4, 1 }, { 0, -7, 12, 73, 55, -2, -4, 1 }, + { 0, -7, 11, 73, 56, -2, -4, 1 }, { 0, -7, 10, 72, 57, -1, -4, 1 }, + { 1, -7, 10, 71, 58, -1, -5, 1 }, { 0, -7, 9, 71, 59, 0, -5, 1 }, + { 1, -7, 8, 70, 60, 0, -5, 1 }, { 1, -7, 7, 69, 61, 1, -5, 1 }, + { 1, -6, 6, 68, 62, 1, -5, 1 }, { 0, -6, 6, 68, 62, 2, -5, 1 }, + { 1, -6, 5, 67, 63, 2, -5, 1 }, { 1, -6, 5, 66, 64, 3, -6, 1 }, + { 1, -6, 4, 65, 65, 4, -6, 1 }, { 1, -6, 3, 64, 66, 5, -6, 1 }, + { 1, -5, 2, 63, 67, 5, -6, 1 }, { 1, -5, 2, 62, 68, 6, -6, 0 }, + { 1, -5, 1, 62, 68, 6, -6, 1 }, { 1, -5, 1, 61, 69, 7, -7, 1 }, + { 1, -5, 0, 60, 70, 8, -7, 1 }, { 1, -5, 0, 59, 71, 9, -7, 0 }, + { 1, -5, -1, 58, 71, 10, -7, 1 }, { 1, -4, -1, 57, 72, 10, -7, 0 }, + { 1, -4, -2, 56, 73, 11, -7, 0 }, { 1, -4, -2, 55, 73, 12, -7, 0 }, + { 1, -4, -3, 54, 74, 13, -7, 0 }, { 1, -4, -3, 53, 74, 14, -7, 0 }, + { 1, -4, -3, 52, 75, 15, -8, 0 }, { 1, -3, -4, 51, 76, 15, -8, 0 }, + { 1, -3, -4, 50, 76, 16, -8, 0 }, { 1, -3, -5, 49, 77, 17, -8, 0 }, + { 1, -3, -5, 48, 77, 18, -8, 0 }, { 1, -3, -5, 47, 77, 19, -8, 0 }, + { 1, -3, -5, 45, 78, 20, -8, 0 }, { 1, -2, -6, 44, 78, 21, -8, 0 }, + { 1, -2, -6, 43, 78, 22, -8, 0 }, { 1, -2, -6, 42, 78, 23, -8, 0 }, + { 1, -2, -7, 41, 79, 24, -8, 0 }, { 1, -2, -7, 40, 79, 25, -8, 0 }, + { 1, -2, -7, 39, 79, 26, -8, 0 }, { 1, -2, -7, 38, 80, 27, -8, -1 }, + { 1, -2, -7, 37, 80, 28, -8, -1 }, { 1, -2, -7, 36, 80, 29, -8, -1 }, + { 1, -1, -8, 35, 80, 30, -8, -1 }, { 1, -1, -8, 34, 80, 31, -8, -1 }, +}; + +// Filters for interpolation (0.75-band) - note this also filters integer pels. 
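+// (Selected by choose_interp_filter() when
+// 11/16 <= out_length / in_length < 13/16.)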
+static const InterpKernel filteredinterp_filters750[(1 << RS_SUBPEL_BITS)] = { + { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 24, 96, 26, -11, 2, 0 }, + { 2, -11, 22, 96, 28, -11, 2, 0 }, { 2, -10, 21, 96, 29, -12, 2, 0 }, + { 2, -10, 19, 96, 31, -12, 2, 0 }, { 2, -10, 18, 95, 32, -11, 2, 0 }, + { 2, -10, 17, 95, 34, -12, 2, 0 }, { 2, -9, 15, 95, 35, -12, 2, 0 }, + { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -9, 13, 94, 38, -12, 2, 0 }, + { 2, -8, 12, 93, 40, -12, 1, 0 }, { 2, -8, 11, 93, 41, -12, 1, 0 }, + { 2, -8, 9, 92, 43, -12, 1, 1 }, { 2, -8, 8, 92, 44, -12, 1, 1 }, + { 2, -7, 7, 91, 46, -12, 1, 0 }, { 2, -7, 6, 90, 47, -12, 1, 1 }, + { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -6, 4, 89, 50, -12, 1, 0 }, + { 2, -6, 3, 88, 52, -12, 0, 1 }, { 2, -6, 2, 87, 54, -12, 0, 1 }, + { 2, -5, 1, 86, 55, -12, 0, 1 }, { 2, -5, 0, 85, 57, -12, 0, 1 }, + { 2, -5, -1, 84, 58, -11, 0, 1 }, { 2, -5, -2, 83, 60, -11, 0, 1 }, + { 2, -4, -2, 82, 61, -11, -1, 1 }, { 1, -4, -3, 81, 63, -10, -1, 1 }, + { 2, -4, -4, 80, 64, -10, -1, 1 }, { 1, -4, -4, 79, 66, -10, -1, 1 }, + { 1, -3, -5, 77, 67, -9, -1, 1 }, { 1, -3, -6, 76, 69, -9, -1, 1 }, + { 1, -3, -6, 75, 70, -8, -2, 1 }, { 1, -2, -7, 74, 71, -8, -2, 1 }, + { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -2, -8, 71, 74, -7, -2, 1 }, + { 1, -2, -8, 70, 75, -6, -3, 1 }, { 1, -1, -9, 69, 76, -6, -3, 1 }, + { 1, -1, -9, 67, 77, -5, -3, 1 }, { 1, -1, -10, 66, 79, -4, -4, 1 }, + { 1, -1, -10, 64, 80, -4, -4, 2 }, { 1, -1, -10, 63, 81, -3, -4, 1 }, + { 1, -1, -11, 61, 82, -2, -4, 2 }, { 1, 0, -11, 60, 83, -2, -5, 2 }, + { 1, 0, -11, 58, 84, -1, -5, 2 }, { 1, 0, -12, 57, 85, 0, -5, 2 }, + { 1, 0, -12, 55, 86, 1, -5, 2 }, { 1, 0, -12, 54, 87, 2, -6, 2 }, + { 1, 0, -12, 52, 88, 3, -6, 2 }, { 0, 1, -12, 50, 89, 4, -6, 2 }, + { 0, 1, -12, 49, 90, 5, -7, 2 }, { 1, 1, -12, 47, 90, 6, -7, 2 }, + { 0, 1, -12, 46, 91, 7, -7, 2 }, { 1, 1, -12, 44, 92, 8, -8, 2 }, + { 1, 1, -12, 43, 92, 9, -8, 2 }, { 0, 1, -12, 41, 93, 11, -8, 2 }, + { 0, 1, -12, 40, 93, 12, -8, 2 }, { 0, 2, -12, 38, 94, 13, -9, 2 }, + { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -12, 35, 95, 15, -9, 2 }, + { 0, 2, -12, 34, 95, 17, -10, 2 }, { 0, 2, -11, 32, 95, 18, -10, 2 }, + { 0, 2, -12, 31, 96, 19, -10, 2 }, { 0, 2, -12, 29, 96, 21, -10, 2 }, + { 0, 2, -11, 28, 96, 22, -11, 2 }, { 0, 2, -11, 26, 96, 24, -11, 2 }, +}; + +// Filters for interpolation (0.875-band) - note this also filters integer pels. 
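+// (Selected by choose_interp_filter() when 13/16 <= out_length / in_length
+// < 1; ratios of 1 or more use the full-band normative filter.)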
+static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = { + { 3, -8, 13, 112, 13, -8, 3, 0 }, { 2, -7, 12, 112, 15, -8, 3, -1 }, + { 3, -7, 10, 112, 17, -9, 3, -1 }, { 2, -6, 8, 112, 19, -9, 3, -1 }, + { 2, -6, 7, 112, 21, -10, 3, -1 }, { 2, -5, 6, 111, 22, -10, 3, -1 }, + { 2, -5, 4, 111, 24, -10, 3, -1 }, { 2, -4, 3, 110, 26, -11, 3, -1 }, + { 2, -4, 1, 110, 28, -11, 3, -1 }, { 2, -4, 0, 109, 30, -12, 4, -1 }, + { 1, -3, -1, 108, 32, -12, 4, -1 }, { 1, -3, -2, 108, 34, -13, 4, -1 }, + { 1, -2, -4, 107, 36, -13, 4, -1 }, { 1, -2, -5, 106, 38, -13, 4, -1 }, + { 1, -1, -6, 105, 40, -14, 4, -1 }, { 1, -1, -7, 104, 42, -14, 4, -1 }, + { 1, -1, -7, 103, 44, -15, 4, -1 }, { 1, 0, -8, 101, 46, -15, 4, -1 }, + { 1, 0, -9, 100, 48, -15, 4, -1 }, { 1, 0, -10, 99, 50, -15, 4, -1 }, + { 1, 1, -11, 97, 53, -16, 4, -1 }, { 0, 1, -11, 96, 55, -16, 4, -1 }, + { 0, 1, -12, 95, 57, -16, 4, -1 }, { 0, 2, -13, 93, 59, -16, 4, -1 }, + { 0, 2, -13, 91, 61, -16, 4, -1 }, { 0, 2, -14, 90, 63, -16, 4, -1 }, + { 0, 2, -14, 88, 65, -16, 4, -1 }, { 0, 2, -15, 86, 67, -16, 4, 0 }, + { 0, 3, -15, 84, 69, -17, 4, 0 }, { 0, 3, -16, 83, 71, -17, 4, 0 }, + { 0, 3, -16, 81, 73, -16, 3, 0 }, { 0, 3, -16, 79, 75, -16, 3, 0 }, + { 0, 3, -16, 77, 77, -16, 3, 0 }, { 0, 3, -16, 75, 79, -16, 3, 0 }, + { 0, 3, -16, 73, 81, -16, 3, 0 }, { 0, 4, -17, 71, 83, -16, 3, 0 }, + { 0, 4, -17, 69, 84, -15, 3, 0 }, { 0, 4, -16, 67, 86, -15, 2, 0 }, + { -1, 4, -16, 65, 88, -14, 2, 0 }, { -1, 4, -16, 63, 90, -14, 2, 0 }, + { -1, 4, -16, 61, 91, -13, 2, 0 }, { -1, 4, -16, 59, 93, -13, 2, 0 }, + { -1, 4, -16, 57, 95, -12, 1, 0 }, { -1, 4, -16, 55, 96, -11, 1, 0 }, + { -1, 4, -16, 53, 97, -11, 1, 1 }, { -1, 4, -15, 50, 99, -10, 0, 1 }, + { -1, 4, -15, 48, 100, -9, 0, 1 }, { -1, 4, -15, 46, 101, -8, 0, 1 }, + { -1, 4, -15, 44, 103, -7, -1, 1 }, { -1, 4, -14, 42, 104, -7, -1, 1 }, + { -1, 4, -14, 40, 105, -6, -1, 1 }, { -1, 4, -13, 38, 106, -5, -2, 1 }, + { -1, 4, -13, 36, 107, -4, -2, 1 }, { -1, 4, -13, 34, 108, -2, -3, 1 }, + { -1, 4, -12, 32, 108, -1, -3, 1 }, { -1, 4, -12, 30, 109, 0, -4, 2 }, + { -1, 3, -11, 28, 110, 1, -4, 2 }, { -1, 3, -11, 26, 110, 3, -4, 2 }, + { -1, 3, -10, 24, 111, 4, -5, 2 }, { -1, 3, -10, 22, 111, 6, -5, 2 }, + { -1, 3, -10, 21, 112, 7, -6, 2 }, { -1, 3, -9, 19, 112, 8, -6, 2 }, + { -1, 3, -9, 17, 112, 10, -7, 3 }, { -1, 3, -8, 15, 112, 12, -7, 2 }, +}; + +const int16_t av1_resize_filter_normative[( + 1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = { +#if UPSCALE_NORMATIVE_TAPS == 8 + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 }, + { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 }, + { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 }, + { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 }, + { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 }, + { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 }, + { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 }, + { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 }, + { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 }, + { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 }, + { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 }, + { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 }, + { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 }, + { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 }, + { -2, 7, -20, 
88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 }, + { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 }, + { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 }, + { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 }, + { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 }, + { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 }, + { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 }, + { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 }, + { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 }, + { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 }, + { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 }, + { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 }, + { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 }, + { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 }, + { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 }, + { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 }, + { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 }, + { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 }, +#else +#error "Invalid value of UPSCALE_NORMATIVE_TAPS" +#endif // UPSCALE_NORMATIVE_TAPS == 8 +}; + +// Filters for interpolation (full-band) - no filtering for integer pixels +#define filteredinterp_filters1000 av1_resize_filter_normative + +// Filters for factor of 2 downsampling. +static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 }; +static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 }; + +static const InterpKernel *choose_interp_filter(int in_length, int out_length) { + int out_length16 = out_length * 16; + if (out_length16 >= in_length * 16) + return filteredinterp_filters1000; + else if (out_length16 >= in_length * 13) + return filteredinterp_filters875; + else if (out_length16 >= in_length * 11) + return filteredinterp_filters750; + else if (out_length16 >= in_length * 9) + return filteredinterp_filters625; + else + return filteredinterp_filters500; +} + +static void interpolate_core(const uint8_t *const input, int in_length, + uint8_t *output, int out_length, + const int16_t *interp_filters, int interp_taps) { + const int32_t delta = + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; + const int32_t offset = + in_length > out_length + ? 
(((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; + uint8_t *optr = output; + int x, x1, x2, sum, k, int_pel, sub_pel; + int32_t y; + + x = 0; + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; + ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) { + const int pk = int_pel - interp_taps / 2 + 1 + k; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; + } + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } else { + // Initial part. + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // End part. + for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } +} + +static void interpolate_core_double_prec(const double *const input, + int in_length, double *output, + int out_length, + const int16_t *interp_filters, + int interp_taps) { + const int32_t delta = + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; + const int32_t offset = + in_length > out_length + ? 
(((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; + double *optr = output; + int x, x1, x2, k, int_pel, sub_pel; + double sum; + int32_t y; + + x = 0; + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; + ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) { + const int pk = int_pel - interp_taps / 2 + 1 + k; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; + } + *optr++ = sum / (1 << FILTER_BITS); + } + } else { + // Initial part. + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; + *optr++ = sum / (1 << FILTER_BITS); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; + *optr++ = sum / (1 << FILTER_BITS); + } + // End part. 
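+    // (AOMMIN below repeats the last input sample for taps that would read
+    // past in_length - 1, mirroring the AOMMAX clamp used in the initial
+    // part.)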
+ for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; + *optr++ = sum / (1 << FILTER_BITS); + } + } +} + +static void interpolate(const uint8_t *const input, int in_length, + uint8_t *output, int out_length) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + interpolate_core(input, in_length, output, out_length, &interp_filters[0][0], + SUBPEL_TAPS); +} + +static void interpolate_double_prec(const double *const input, int in_length, + double *output, int out_length) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + interpolate_core_double_prec(input, in_length, output, out_length, + &interp_filters[0][0], SUBPEL_TAPS); +} + +int32_t av1_get_upscale_convolve_step(int in_length, int out_length) { + return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length; +} + +static int32_t get_upscale_convolve_x0(int in_length, int out_length, + int32_t x_step_qn) { + const int err = out_length * x_step_qn - (in_length << RS_SCALE_SUBPEL_BITS); + const int32_t x0 = + (-((out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + + RS_SCALE_EXTRA_OFF - err / 2; + return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK); +} + +static void down2_symeven(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half. + const int16_t *filter = av1_down2_symeven_half_filter; + const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half; + int l2 = (length - filter_len_half); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + 1 + j, length - 1)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[AOMMAX(i - j, 0)] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } +} + +static void down2_symodd(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half - 1. + const int16_t *filter = av1_down2_symodd_half_filter; + const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half - 1; + int l2 = (length - filter_len_half + 1); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. 
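+    // (The input is too short to have an unclamped middle section, so both
+    // edge clamps are applied to every output sample.)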
+    for (i = 0; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] +
+                input[(i + j >= length ? length - 1 : i + j)]) *
+               filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  } else {
+    // Initial part.
+    for (i = 0; i < l1; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // Middle part.
+    for (; i < l2; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[i + j]) * filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+    // End part.
+    for (; i < length; i += 2) {
+      int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+      for (j = 1; j < filter_len_half; ++j) {
+        sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+               filter[j];
+      }
+      sum >>= FILTER_BITS;
+      *optr++ = clip_pixel(sum);
+    }
+  }
+}
+
+static int get_down2_length(int length, int steps) {
+  for (int s = 0; s < steps; ++s) length = (length + 1) >> 1;
+  return length;
+}
+
+static int get_down2_steps(int in_length, int out_length) {
+  int steps = 0;
+  int proj_in_length;
+  while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) {
+    ++steps;
+    in_length = proj_in_length;
+    if (in_length == 1) {
+      // Special case: we break because any further calls to
+      // get_down2_length() will be with length == 1, which returns 1,
+      // resulting in an infinite loop.
+      break;
+    }
+  }
+  return steps;
+}
+
+static void resize_multistep(const uint8_t *const input, int length,
+                             uint8_t *output, int olength, uint8_t *otmp) {
+  if (length == olength) {
+    memcpy(output, input, sizeof(output[0]) * length);
+    return;
+  }
+  const int steps = get_down2_steps(length, olength);
+
+  if (steps > 0) {
+    uint8_t *out = NULL;
+    int filteredlength = length;
+
+    assert(otmp != NULL);
+    uint8_t *otmp2 = otmp + get_down2_length(length, 1);
+    for (int s = 0; s < steps; ++s) {
+      const int proj_filteredlength = get_down2_length(filteredlength, 1);
+      const uint8_t *const in = (s == 0 ? input : out);
+      if (s == steps - 1 && proj_filteredlength == olength)
+        out = output;
+      else
+        out = (s & 1 ?
otmp2 : otmp); + if (filteredlength & 1) + down2_symodd(in, filteredlength, out); + else + down2_symeven(in, filteredlength, out); + filteredlength = proj_filteredlength; + } + if (filteredlength != olength) { + interpolate(out, filteredlength, output, olength); + } + } else { + interpolate(input, length, output, olength); + } +} + +static void upscale_multistep_double_prec(const double *const input, int length, + double *output, int olength) { + assert(length < olength); + interpolate_double_prec(input, length, output, olength); +} + +static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +static void fill_col_to_arr_double_prec(double *img, int stride, int len, + double *arr) { + int i; + double *iptr = img; + double *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void fill_arr_to_col_double_prec(double *img, int stride, int len, + double *arr) { + int i; + double *iptr = img; + double *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +void av1_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, int width2, + int out_stride) { + int i; + uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height); + uint8_t *tmpbuf = + (uint8_t *)aom_malloc(sizeof(uint8_t) * AOMMAX(width, height)); + uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * height); + uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(uint8_t) * height2); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) + goto Error; + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + for (i = 0; i < height; ++i) + resize_multistep(input + in_stride * i, width, intbuf + width2 * i, width2, + tmpbuf); + for (i = 0; i < width2; ++i) { + fill_col_to_arr(intbuf + i, width2, height, arrbuf); + resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf); + fill_arr_to_col(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(intbuf); + aom_free(tmpbuf); + aom_free(arrbuf); + aom_free(arrbuf2); +} + +void av1_upscale_plane_double_prec(const double *const input, int height, + int width, int in_stride, double *output, + int height2, int width2, int out_stride) { + int i; + double *intbuf = (double *)aom_malloc(sizeof(double) * width2 * height); + double *arrbuf = (double *)aom_malloc(sizeof(double) * height); + double *arrbuf2 = (double *)aom_malloc(sizeof(double) * height2); + if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + for (i = 0; i < height; ++i) + upscale_multistep_double_prec(input + in_stride * i, width, + intbuf + width2 * i, width2); + for (i = 0; i < width2; ++i) { + fill_col_to_arr_double_prec(intbuf + i, width2, height, arrbuf); + upscale_multistep_double_prec(arrbuf, height, arrbuf2, height2); + fill_arr_to_col_double_prec(output + i, out_stride, height2, arrbuf2); + } + +Error: + aom_free(intbuf); + aom_free(arrbuf); + aom_free(arrbuf2); +} + +static void upscale_normative_rect(const uint8_t *const input, int height, + int width, int 
in_stride, uint8_t *output, + int height2, int width2, int out_stride, + int x_step_qn, int x0_qn, int pad_left, + int pad_right) { + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + assert(height2 == height); + + // Extend the left/right pixels of the tile column if needed + // (either because we can't sample from other tiles, or because we're at + // a frame edge). + // Save the overwritten pixels into tmp_left and tmp_right. + // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra + // column of border pixels compared to what we'd naively think. + const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1; + uint8_t *tmp_left = + NULL; // Silence spurious "may be used uninitialized" warnings + uint8_t *tmp_right = NULL; + uint8_t *const in_tl = (uint8_t *)(input - border_cols); // Cast off 'const' + uint8_t *const in_tr = (uint8_t *)(input + width); + if (pad_left) { + tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height); + for (int i = 0; i < height; i++) { + memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols); + memset(in_tl + i * in_stride, input[i * in_stride], border_cols); + } + } + if (pad_right) { + tmp_right = + (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height); + for (int i = 0; i < height; i++) { + memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols); + memset(in_tr + i * in_stride, input[i * in_stride + width - 1], + border_cols); + } + } + + av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2, + height2, &av1_resize_filter_normative[0][0], x0_qn, + x_step_qn); + + // Restore the left/right border pixels + if (pad_left) { + for (int i = 0; i < height; i++) { + memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols); + } + aom_free(tmp_left); + } + if (pad_right) { + for (int i = 0; i < height; i++) { + memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols); + } + aom_free(tmp_right); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_interpolate_core(const uint16_t *const input, int in_length, + uint16_t *output, int out_length, int bd, + const int16_t *interp_filters, + int interp_taps) { + const int32_t delta = + (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / + out_length; + const int32_t offset = + in_length > out_length + ? 
(((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length + : -(((int32_t)(out_length - in_length) + << (RS_SCALE_SUBPEL_BITS - 1)) + + out_length / 2) / + out_length; + uint16_t *optr = output; + int x, x1, x2, sum, k, int_pel, sub_pel; + int32_t y; + + x = 0; + y = offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = out_length - 1; + y = delta * x + offset + RS_SCALE_EXTRA_OFF; + while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= + in_length) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; + ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) { + const int pk = int_pel - interp_taps / 2 + 1 + k; + sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; + } + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + } else { + // Initial part. + for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + // End part. + for (; x < out_length; ++x, y += delta) { + int_pel = y >> RS_SCALE_SUBPEL_BITS; + sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; + const int16_t *filter = &interp_filters[sub_pel * interp_taps]; + sum = 0; + for (k = 0; k < interp_taps; ++k) + sum += filter[k] * + input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; + *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + } + } +} + +static void highbd_interpolate(const uint16_t *const input, int in_length, + uint16_t *output, int out_length, int bd) { + const InterpKernel *interp_filters = + choose_interp_filter(in_length, out_length); + + highbd_interpolate_core(input, in_length, output, out_length, bd, + &interp_filters[0][0], SUBPEL_TAPS); +} + +static void highbd_down2_symeven(const uint16_t *const input, int length, + uint16_t *output, int bd) { + // Actual filter len = 2 * filter_len_half. + static const int16_t *filter = av1_down2_symeven_half_filter; + const int filter_len_half = sizeof(av1_down2_symeven_half_filter) / 2; + int i, j; + uint16_t *optr = output; + int l1 = filter_len_half; + int l2 = (length - filter_len_half); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[AOMMAX(0, i - j)] + input[AOMMIN(i + 1 + j, length - 1)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } else { + // Initial part. 
+ for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[AOMMAX(0, i - j)] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += + (input[i - j] + input[AOMMIN(i + 1 + j, length - 1)]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } +} + +static void highbd_down2_symodd(const uint16_t *const input, int length, + uint16_t *output, int bd) { + // Actual filter len = 2 * filter_len_half - 1. + static const int16_t *filter = av1_down2_symodd_half_filter; + const int filter_len_half = sizeof(av1_down2_symodd_half_filter) / 2; + int i, j; + uint16_t *optr = output; + int l1 = filter_len_half - 1; + int l2 = (length - filter_len_half + 1); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[AOMMAX(i - j, 0)] + input[AOMMIN(i + j, length - 1)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[AOMMAX(i - j, 0)] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[AOMMIN(i + j, length - 1)]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel_highbd(sum, bd); + } + } +} + +static void highbd_resize_multistep(const uint16_t *const input, int length, + uint16_t *output, int olength, + uint16_t *otmp, int bd) { + if (length == olength) { + memcpy(output, input, sizeof(output[0]) * length); + return; + } + const int steps = get_down2_steps(length, olength); + + if (steps > 0) { + uint16_t *out = NULL; + int filteredlength = length; + + assert(otmp != NULL); + uint16_t *otmp2 = otmp + get_down2_length(length, 1); + for (int s = 0; s < steps; ++s) { + const int proj_filteredlength = get_down2_length(filteredlength, 1); + const uint16_t *const in = (s == 0 ? input : out); + if (s == steps - 1 && proj_filteredlength == olength) + out = output; + else + out = (s & 1 ? 
otmp2 : otmp); + if (filteredlength & 1) + highbd_down2_symodd(in, filteredlength, out, bd); + else + highbd_down2_symeven(in, filteredlength, out, bd); + filteredlength = proj_filteredlength; + } + if (filteredlength != olength) { + highbd_interpolate(out, filteredlength, output, olength, bd); + } + } else { + highbd_interpolate(input, length, output, olength, bd); + } +} + +static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len, + uint16_t *arr) { + int i; + uint16_t *iptr = img; + uint16_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len, + uint16_t *arr) { + int i; + uint16_t *iptr = img; + uint16_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +void av1_highbd_resize_plane(const uint8_t *const input, int height, int width, + int in_stride, uint8_t *output, int height2, + int width2, int out_stride, int bd) { + int i; + uint16_t *intbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height); + uint16_t *tmpbuf = + (uint16_t *)aom_malloc(sizeof(uint16_t) * AOMMAX(width, height)); + uint16_t *arrbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * height); + uint16_t *arrbuf2 = (uint16_t *)aom_malloc(sizeof(uint16_t) * height2); + if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) + goto Error; + for (i = 0; i < height; ++i) { + highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width, + intbuf + width2 * i, width2, tmpbuf, bd); + } + for (i = 0; i < width2; ++i) { + highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf); + highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf, bd); + highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2, + arrbuf2); + } + +Error: + aom_free(intbuf); + aom_free(tmpbuf); + aom_free(arrbuf); + aom_free(arrbuf2); +} + +static void highbd_upscale_normative_rect(const uint8_t *const input, + int height, int width, int in_stride, + uint8_t *output, int height2, + int width2, int out_stride, + int x_step_qn, int x0_qn, + int pad_left, int pad_right, int bd) { + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); + assert(height2 == height); + + // Extend the left/right pixels of the tile column if needed + // (either because we can't sample from other tiles, or because we're at + // a frame edge). + // Save the overwritten pixels into tmp_left and tmp_right. + // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra + // column of border pixels compared to what we'd naively think. 
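+  // (With UPSCALE_NORMATIVE_TAPS == 8, this works out to 8 / 2 + 1 = 5
+  // border columns on each side.)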
+ const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1; + const int border_size = border_cols * sizeof(uint16_t); + uint16_t *tmp_left = + NULL; // Silence spurious "may be used uninitialized" warnings + uint16_t *tmp_right = NULL; + uint16_t *const input16 = CONVERT_TO_SHORTPTR(input); + uint16_t *const in_tl = input16 - border_cols; + uint16_t *const in_tr = input16 + width; + if (pad_left) { + tmp_left = (uint16_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height); + for (int i = 0; i < height; i++) { + memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_size); + aom_memset16(in_tl + i * in_stride, input16[i * in_stride], border_cols); + } + } + if (pad_right) { + tmp_right = + (uint16_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height); + for (int i = 0; i < height; i++) { + memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_size); + aom_memset16(in_tr + i * in_stride, input16[i * in_stride + width - 1], + border_cols); + } + } + + av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride, + CONVERT_TO_SHORTPTR(output), out_stride, width2, + height2, &av1_resize_filter_normative[0][0], + x0_qn, x_step_qn, bd); + + // Restore the left/right border pixels + if (pad_left) { + for (int i = 0; i < height; i++) { + memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_size); + } + aom_free(tmp_left); + } + if (pad_right) { + for (int i = 0; i < height; i++) { + memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_size); + } + aom_free(tmp_right); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + av1_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride); + av1_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride); +} + +void av1_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + av1_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2, + ouv_stride); + av1_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2, + ouv_stride); +} + +void av1_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, uint8_t *oy, + int oy_stride, uint8_t *ou, uint8_t *ov, + int ouv_stride, int oheight, int owidth) { + av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride); + av1_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride); + av1_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, 
bd); + av1_highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2, + owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2, + owidth / 2, ouv_stride, bd); +} + +void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight, + owidth / 2, ouv_stride, bd); + av1_highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight, + owidth / 2, ouv_stride, bd); +} + +void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, + uint8_t *ov, int ouv_stride, int oheight, + int owidth, int bd) { + av1_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth, + oy_stride, bd); + av1_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth, + ouv_stride, bd); + av1_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth, + ouv_stride, bd); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd, + const int num_planes) { + // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t + + // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet + // the static analysis warnings. + for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) { + const int is_uv = i > 0; +#if CONFIG_AV1_HIGHBITDEPTH + if (src->flags & YV12_FLAG_HIGHBITDEPTH) + av1_highbd_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv], bd); + else + av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv]); +#else + (void)bd; + av1_resize_plane(src->buffers[i], src->crop_heights[is_uv], + src->crop_widths[is_uv], src->strides[is_uv], + dst->buffers[i], dst->crop_heights[is_uv], + dst->crop_widths[is_uv], dst->strides[is_uv]); +#endif + } + aom_extend_frame_borders(dst, num_planes); +} + +void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int plane, int rows) { + const int is_uv = (plane > 0); + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x); + const int upscaled_plane_width = + ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x); + const int superres_denom = cm->superres_scale_denominator; + + TileInfo tile_col; + const int32_t x_step_qn = av1_get_upscale_convolve_step( + downscaled_plane_width, upscaled_plane_width); + int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width, + upscaled_plane_width, x_step_qn); + + for (int j = 0; j < cm->tiles.cols; j++) { + av1_tile_set_col(&tile_col, cm, j); + // Determine the limits of this tile column in both the source + // and destination images. 
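+    // For example, with SCALE_NUMERATOR == 8 and superres_denom == 16 (a 2x
+    // upscale), x_step_qn is close to (1 << RS_SCALE_SUBPEL_BITS) / 2, i.e.
+    // the source position advances by about half a pixel per output pixel.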
+ // Note: The actual location which we start sampling from is + // (downscaled_x0 - 1 + (x0_qn/2^14)), and this quantity increases + // by exactly dst_width * (x_step_qn/2^14) pixels each iteration. + const int downscaled_x0 = tile_col.mi_col_start << (MI_SIZE_LOG2 - ss_x); + const int downscaled_x1 = tile_col.mi_col_end << (MI_SIZE_LOG2 - ss_x); + const int src_width = downscaled_x1 - downscaled_x0; + + const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR; + int upscaled_x1; + if (j == cm->tiles.cols - 1) { + // Note that we can't just use AOMMIN here - due to rounding, + // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than + // upscaled_plane_width. + upscaled_x1 = upscaled_plane_width; + } else { + upscaled_x1 = (downscaled_x1 * superres_denom) / SCALE_NUMERATOR; + } + + const uint8_t *const src_ptr = src + downscaled_x0; + uint8_t *const dst_ptr = dst + upscaled_x0; + const int dst_width = upscaled_x1 - upscaled_x0; + + const int pad_left = (j == 0); + const int pad_right = (j == cm->tiles.cols - 1); + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) + highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride, + dst_ptr, rows, dst_width, dst_stride, + x_step_qn, x0_qn, pad_left, pad_right, + cm->seq_params.bit_depth); + else + upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, + rows, dst_width, dst_stride, x_step_qn, x0_qn, + pad_left, pad_right); +#else + upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr, rows, + dst_width, dst_stride, x_step_qn, x0_qn, pad_left, + pad_right); +#endif + // Update the fractional pixel offset to prepare for the next tile column. + x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS); + } +} + +void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + const int num_planes = av1_num_planes(cm); + for (int i = 0; i < num_planes; ++i) { + const int is_uv = (i > 0); + av1_upscale_normative_rows(cm, src->buffers[i], src->strides[is_uv], + dst->buffers[i], dst->strides[is_uv], i, + src->crop_heights[is_uv]); + } + + aom_extend_frame_borders(dst, num_planes); +} + +YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *unscaled, + YV12_BUFFER_CONFIG *scaled) { + const int num_planes = av1_num_planes(cm); + if (cm->width != unscaled->y_crop_width || + cm->height != unscaled->y_crop_height) { + av1_resize_and_extend_frame(unscaled, scaled, (int)cm->seq_params.bit_depth, + num_planes); + return scaled; + } else { + return unscaled; + } +} + +// Calculates the scaled dimension given the original dimension and the scale +// denominator. +static void calculate_scaled_size_helper(int *dim, int denom) { + if (denom != SCALE_NUMERATOR) { + // We need to ensure the constraint in "Appendix A" of the spec: + // * FrameWidth is greater than or equal to 16 + // * FrameHeight is greater than or equal to 16 + // For this, we clamp the downscaled dimension to at least 16. One + // exception: if original dimension itself was < 16, then we keep the + // downscaled dimension to be same as the original, to ensure that resizing + // is valid. 
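+    // For example, *dim == 1920 with denom == 12 becomes
+    // (1920 * 8 + 6) / 12 = 1280.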
+    const int min_dim = AOMMIN(16, *dim);
+    // Use this version if we need *dim to be even
+    // *width = (*width * SCALE_NUMERATOR + denom) / (2 * denom);
+    // *width <<= 1;
+    *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom);
+    *dim = AOMMAX(*dim, min_dim);
+  }
+}
+
+void av1_calculate_scaled_size(int *width, int *height, int resize_denom) {
+  calculate_scaled_size_helper(width, resize_denom);
+  calculate_scaled_size_helper(height, resize_denom);
+}
+
+void av1_calculate_scaled_superres_size(int *width, int *height,
+                                        int superres_denom) {
+  (void)height;
+  calculate_scaled_size_helper(width, superres_denom);
+}
+
+void av1_calculate_unscaled_superres_size(int *width, int *height, int denom) {
+  if (denom != SCALE_NUMERATOR) {
+    // Note: av1_calculate_scaled_superres_size() rounds *up* after division
+    // when the resulting dimensions are odd. So here, we round *down*.
+    *width = *width * denom / SCALE_NUMERATOR;
+    (void)height;
+  }
+}
+
+// Copy only the config data from 'src' to 'dst'.
+static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src,
+                               YV12_BUFFER_CONFIG *const dst) {
+  dst->bit_depth = src->bit_depth;
+  dst->color_primaries = src->color_primaries;
+  dst->transfer_characteristics = src->transfer_characteristics;
+  dst->matrix_coefficients = src->matrix_coefficients;
+  dst->monochrome = src->monochrome;
+  dst->chroma_sample_position = src->chroma_sample_position;
+  dst->color_range = src->color_range;
+}
+
+// TODO(afergs): Look for in-place upscaling
+// TODO(afergs): aom_ vs av1_ functions? Which can I use?
+// Upscale decoded image.
+void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
+  const int num_planes = av1_num_planes(cm);
+  if (!av1_superres_scaled(cm)) return;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const int byte_alignment = cm->features.byte_alignment;
+
+  YV12_BUFFER_CONFIG copy_buffer;
+  memset(&copy_buffer, 0, sizeof(copy_buffer));
+
+  YV12_BUFFER_CONFIG *const frame_to_show = &cm->cur_frame->buf;
+
+  const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
+  if (aom_alloc_frame_buffer(
+          &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          AOM_BORDER_IN_PIXELS, byte_alignment))
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate copy buffer for superres upscaling");
+
+  // Copy function assumes the frames are the same size.
+  // Note that it does not copy YV12_BUFFER_CONFIG config data.
+  aom_yv12_copy_frame(frame_to_show, &copy_buffer, num_planes);
+
+  assert(copy_buffer.y_crop_width == aligned_width);
+  assert(copy_buffer.y_crop_height == cm->height);
+
+  // Realloc the current frame buffer at a higher resolution in place.
+  if (pool != NULL) {
+    // Use callbacks if on the decoder.
+    aom_codec_frame_buffer_t *fb = &cm->cur_frame->raw_frame_buffer;
+    aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb;
+    aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb;
+    void *cb_priv = pool->cb_priv;
+
+    lock_buffer_pool(pool);
+    // Realloc with callback does not release the frame buffer - release first.
+    if (release_fb_cb(cb_priv, fb)) {
+      unlock_buffer_pool(pool);
+      aom_internal_error(
+          &cm->error, AOM_CODEC_MEM_ERROR,
+          "Failed to free current frame buffer before superres upscaling");
+    }
+    // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
+    if (aom_realloc_frame_buffer(
+            frame_to_show, cm->superres_upscaled_width,
+            cm->superres_upscaled_height, seq_params->subsampling_x,
+            seq_params->subsampling_y, seq_params->use_highbitdepth,
+            AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv)) {
+      unlock_buffer_pool(pool);
+      aom_internal_error(
+          &cm->error, AOM_CODEC_MEM_ERROR,
+          "Failed to allocate current frame buffer for superres upscaling");
+    }
+    unlock_buffer_pool(pool);
+  } else {
+    // Make a copy of the config data for frame_to_show in copy_buffer
+    copy_buffer_config(frame_to_show, &copy_buffer);
+
+    // Don't use callbacks on the encoder.
+    // aom_alloc_frame_buffer() clears the config data for frame_to_show
+    if (aom_alloc_frame_buffer(
+            frame_to_show, cm->superres_upscaled_width,
+            cm->superres_upscaled_height, seq_params->subsampling_x,
+            seq_params->subsampling_y, seq_params->use_highbitdepth,
+            AOM_BORDER_IN_PIXELS, byte_alignment))
+      aom_internal_error(
+          &cm->error, AOM_CODEC_MEM_ERROR,
+          "Failed to reallocate current frame buffer for superres upscaling");
+
+    // Restore config data back to frame_to_show
+    copy_buffer_config(&copy_buffer, frame_to_show);
+  }
+  // TODO(afergs): verify frame_to_show is correct after realloc
+  //               encoder:
+  //               decoder:
+
+  assert(frame_to_show->y_crop_width == cm->superres_upscaled_width);
+  assert(frame_to_show->y_crop_height == cm->superres_upscaled_height);
+
+  // Scale up and back into frame_to_show.
+  assert(frame_to_show->y_crop_width != cm->width);
+  av1_upscale_normative_and_extend_frame(cm, &copy_buffer, frame_to_show);
+
+  // Free the copy buffer
+  aom_free_frame_buffer(&copy_buffer);
+}
diff --git a/libs/libaom/src/av1/common/resize.h b/libs/libaom/src/av1/common/resize.h
new file mode 100644
index 000000000..8ee859e5c
--- /dev/null
+++ b/libs/libaom/src/av1/common/resize.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RESIZE_H_
+#define AOM_AV1_COMMON_RESIZE_H_
+
+#include <stdio.h>
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_resize_plane(const uint8_t *const input, int height, int width,
+                      int in_stride, uint8_t *output, int height2, int width2,
+                      int out_stride);
+void av1_upscale_plane_double_prec(const double *const input, int height,
+                                   int width, int in_stride, double *output,
+                                   int height2, int width2, int out_stride);
+void av1_resize_frame420(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth);
+void av1_resize_frame422(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth);
+void av1_resize_frame444(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth);
+
+void av1_highbd_resize_plane(const uint8_t *const input, int height, int width,
+                             int in_stride, uint8_t *output, int height2,
+                             int width2, int out_stride, int bd);
+void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd);
+void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd);
+void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd);
+void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                 YV12_BUFFER_CONFIG *dst, int bd,
+                                 const int num_planes);
+
+void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
+                                int src_stride, uint8_t *dst, int dst_stride,
+                                int plane, int rows);
+void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
+                                            const YV12_BUFFER_CONFIG *src,
+                                            YV12_BUFFER_CONFIG *dst);
+
+YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
+                                          YV12_BUFFER_CONFIG *unscaled,
+                                          YV12_BUFFER_CONFIG *scaled);
+
+// Calculates the scaled dimensions from the given original dimensions and the
+// resize scale denominator.
+void av1_calculate_scaled_size(int *width, int *height, int resize_denom);
+
+// Similar to above, but calculates scaled dimensions after superres from the
+// given original dimensions and superres scale denominator.
+void av1_calculate_scaled_superres_size(int *width, int *height,
+                                        int superres_denom);
+
+// Inverse of av1_calculate_scaled_superres_size() above: calculates the
+// original dimensions from the given scaled dimensions and the scale
+// denominator.
+void av1_calculate_unscaled_superres_size(int *width, int *height, int denom);
+
+void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool);
+
+// Returns 1 if a superres upscaled frame is scaled and 0 otherwise.
+static INLINE int av1_superres_scaled(const AV1_COMMON *cm) {
+  // Note: for some corner cases (e.g. cm->width of 1), there may be no
+  // scaling required even though
+  // cm->superres_scale_denominator != SCALE_NUMERATOR. So, the following
+  // check is more accurate.
+  return !(cm->width == cm->superres_upscaled_width);
+}
+
+#define UPSCALE_NORMATIVE_TAPS 8
+extern const int16_t av1_resize_filter_normative[1 << RS_SUBPEL_BITS]
+                                                [UPSCALE_NORMATIVE_TAPS];
+
+int32_t av1_get_upscale_convolve_step(int in_length, int out_length);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_RESIZE_H_
diff --git a/libs/libaom/src/av1/common/restoration.c b/libs/libaom/src/av1/common/restoration.c
new file mode 100644
index 000000000..a0f37ad63
--- /dev/null
+++ b/libs/libaom/src/av1/common/restoration.c
@@ -0,0 +1,1566 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+
+#include "aom_ports/mem.h"
+
+// The 's' values are calculated based on original 'r' and 'e' values in the
+// spec using GenSgrprojVtable().
+// Note: Setting r = 0 skips the filter; the corresponding s value is -1
+// (invalid).
+const sgr_params_type av1_sgr_params[SGRPROJ_PARAMS] = {
+  { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
+  { { 2, 1 }, { 93, 1618 } },  { { 2, 1 }, { 80, 1438 } },
+  { { 2, 1 }, { 70, 1295 } },  { { 2, 1 }, { 58, 1177 } },
+  { { 2, 1 }, { 47, 1079 } },  { { 2, 1 }, { 37, 996 } },
+  { { 2, 1 }, { 30, 925 } },   { { 2, 1 }, { 25, 863 } },
+  { { 0, 1 }, { -1, 2589 } },  { { 0, 1 }, { -1, 1618 } },
+  { { 0, 1 }, { -1, 1177 } },  { { 0, 1 }, { -1, 925 } },
+  { { 2, 0 }, { 56, -1 } },    { { 2, 0 }, { 22, -1 } },
+};
+
+AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
+  AV1PixelRect rect;
+
+  int ss_x = is_uv && cm->seq_params.subsampling_x;
+  int ss_y = is_uv && cm->seq_params.subsampling_y;
+
+  rect.top = 0;
+  rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
+  rect.left = 0;
+  rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
+  return rect;
+}
+
+// Count horizontal or vertical units per tile (use a width or height for
+// tile_size, respectively). We basically want to divide the tile size by the
+// size of a restoration unit. Rather than rounding up unconditionally as you
+// might expect, we round to nearest, which models the way a right or bottom
+// restoration unit can extend to up to 150% its normal width or height. The
+// max with 1 is to deal with tiles that are smaller than half of a
+// restoration unit.
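+// For example, unit_size = 256 and tile_size = 600 gives
+// (600 + 128) / 256 = 2 units, the second of which covers the remaining
+// 600 - 256 = 344 pixels (~134% of the nominal unit size).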
+int av1_lr_count_units_in_tile(int unit_size, int tile_size) { + return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1); +} + +void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi, + int is_uv) { + // We need to allocate enough space for restoration units to cover the + // largest tile. Without CONFIG_MAX_TILE, this is always the tile at the + // top-left and we can use av1_get_tile_rect(). With CONFIG_MAX_TILE, we have + // to do the computation ourselves, iterating over the tiles and keeping + // track of the largest width and height, then upscaling. + const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv); + const int max_tile_w = tile_rect.right - tile_rect.left; + const int max_tile_h = tile_rect.bottom - tile_rect.top; + + // To calculate hpertile and vpertile (horizontal and vertical units per + // tile), we basically want to divide the largest tile width or height by the + // size of a restoration unit. Rather than rounding up unconditionally as you + // might expect, we round to nearest, which models the way a right or bottom + // restoration unit can extend to up to 150% its normal width or height. The + // max with 1 is to deal with tiles that are smaller than half of a + // restoration unit. + const int unit_size = rsi->restoration_unit_size; + const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w); + const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h); + + rsi->units_per_tile = hpertile * vpertile; + rsi->horz_units_per_tile = hpertile; + rsi->vert_units_per_tile = vpertile; + + const int ntiles = 1; + const int nunits = ntiles * rsi->units_per_tile; + + aom_free(rsi->unit_info); + CHECK_MEM_ERROR(cm, rsi->unit_info, + (RestorationUnitInfo *)aom_memalign( + 16, sizeof(*rsi->unit_info) * nunits)); +} + +void av1_free_restoration_struct(RestorationInfo *rst_info) { + aom_free(rst_info->unit_info); + rst_info->unit_info = NULL; +} + +#if 0 +// Pair of values for each sgrproj parameter: +// Index 0 corresponds to r[0], e[0] +// Index 1 corresponds to r[1], e[1] +int sgrproj_mtable[SGRPROJ_PARAMS][2]; + +static void GenSgrprojVtable() { + for (int i = 0; i < SGRPROJ_PARAMS; ++i) { + const sgr_params_type *const params = &av1_sgr_params[i]; + for (int j = 0; j < 2; ++j) { + const int e = params->e[j]; + const int r = params->r[j]; + if (r == 0) { // filter is disabled + sgrproj_mtable[i][j] = -1; // mark invalid + } else { // filter is enabled + const int n = (2 * r + 1) * (2 * r + 1); + const int n2e = n * n * e; + assert(n2e != 0); + sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e); + } + } + } +} +#endif + +void av1_loop_restoration_precal() { +#if 0 + GenSgrprojVtable(); +#endif +} + +static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert) { + uint8_t *data_p; + int i; + for (i = 0; i < height; ++i) { + data_p = data + i * stride; + memset(data_p - border_horz, data_p[0], border_horz); + memset(data_p + width, data_p[width - 1], border_horz); + } + data_p = data - border_horz; + for (i = -border_vert; i < 0; ++i) { + memcpy(data_p + i * stride, data_p, width + 2 * border_horz); + } + for (i = height; i < height + border_vert; ++i) { + memcpy(data_p + i * stride, data_p + (height - 1) * stride, + width + 2 * border_horz); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void extend_frame_highbd(uint16_t *data, int width, int height, + int stride, int border_horz, int border_vert) { + uint16_t *data_p; + int i, j; + for (i = 0; i < 
height; ++i) { + data_p = data + i * stride; + for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0]; + for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1]; + } + data_p = data - border_horz; + for (i = -border_vert; i < 0; ++i) { + memcpy(data_p + i * stride, data_p, + (width + 2 * border_horz) * sizeof(uint16_t)); + } + for (i = height; i < height + border_vert; ++i) { + memcpy(data_p + i * stride, data_p + (height - 1) * stride, + (width + 2 * border_horz) * sizeof(uint16_t)); + } +} + +static void copy_tile_highbd(int width, int height, const uint16_t *src, + int src_stride, uint16_t *dst, int dst_stride) { + for (int i = 0; i < height; ++i) + memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst)); +} +#endif + +void av1_extend_frame(uint8_t *data, int width, int height, int stride, + int border_horz, int border_vert, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride, + border_horz, border_vert); + return; + } +#endif + (void)highbd; + extend_frame_lowbd(data, width, height, stride, border_horz, border_vert); +} + +static void copy_tile_lowbd(int width, int height, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride) { + for (int i = 0; i < height; ++i) + memcpy(dst + i * dst_stride, src + i * src_stride, width); +} + +static void copy_tile(int width, int height, const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int highbd) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd) { + copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride, + CONVERT_TO_SHORTPTR(dst), dst_stride); + return; + } +#endif + (void)highbd; + copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride); +} + +#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d)) + +// With striped loop restoration, the filtering for each 64-pixel stripe gets +// most of its input from the output of CDEF (stored in data8), but we need to +// fill out a border of 3 pixels above/below the stripe according to the +// following +// rules: +// +// * At a frame boundary, we copy the outermost row of CDEF pixels three times. +// This extension is done by a call to av1_extend_frame() at the start of the +// loop restoration process, so the value of copy_above/copy_below doesn't +// strictly matter. However, by setting *copy_above = *copy_below = 1 whenever +// loop filtering across tiles is disabled, we can allow +// {setup,restore}_processing_stripe_boundary to assume that the top/bottom +// data has always been copied, simplifying the behaviour at the left and +// right edges of tiles. +// +// * If we're at a tile boundary and loop filtering across tiles is enabled, +// then there is a logical stripe which is 64 pixels high, but which is split +// into an 8px high and a 56px high stripe so that the processing (and +// coefficient set usage) can be aligned to tiles. +// In this case, we use the 3 rows of CDEF output across the boundary for +// context; this corresponds to leaving the frame buffer as-is. +// +// * If we're at a tile boundary and loop filtering across tiles is disabled, +// then we take the outermost row of CDEF pixels *within the current tile* +// and copy it three times. Thus we behave exactly as if the tile were a full +// frame. +// +// * Otherwise, we're at a stripe boundary within a tile. In that case, we +// take 2 rows of deblocked pixels and extend them to 3 rows of context. 
+// +// The distinction between the latter two cases is handled by the +// av1_loop_restoration_save_boundary_lines() function, so here we just need +// to decide if we're overwriting the above/below boundary pixels or not. +static void get_stripe_boundary_info(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, int ss_y, + int *copy_above, int *copy_below) { + *copy_above = 1; + *copy_below = 1; + + const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y; + + const int first_stripe_in_tile = (limits->v_start == tile_rect->top); + const int this_stripe_height = + full_stripe_height - (first_stripe_in_tile ? runit_offset : 0); + const int last_stripe_in_tile = + (limits->v_start + this_stripe_height >= tile_rect->bottom); + + if (first_stripe_in_tile) *copy_above = 0; + if (last_stripe_in_tile) *copy_below = 0; +} + +// Overwrite the border pixels around a processing stripe so that the conditions +// listed above get_stripe_boundary_info() are preserved. +// We save the pixels which get overwritten into a temporary buffer, so that +// they can be restored by restore_processing_stripe_boundary() after we've +// processed the stripe. +// +// limits gives the rectangular limits of the remaining stripes for the current +// restoration unit. rsb is the stored stripe boundaries (taken from either +// deblock or CDEF output as necessary). +// +// tile_rect is the limits of the current tile and tile_stripe0 is the index of +// the first stripe in this tile (needed to convert the tile-relative stripe +// index we get from limits into something we can look up in rsb). +static void setup_processing_stripe_boundary( + const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb, + int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride, + RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) { + // Offsets within the line buffers. The buffer logically starts at column + // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ) + // has column x0 in the buffer. + const int buf_stride = rsb->stripe_boundary_stride; + const int buf_x0_off = limits->h_start; + const int line_width = + (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ; + const int line_size = line_width << use_highbd; + + const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ; + + // Replace RESTORATION_BORDER pixels above the top of the stripe + // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above + // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by + // duplicating the topmost of the 2 lines (see the AOMMAX call when + // calculating src_row, which gets the values 0, 0, 1 for i = -3, -2, -1). + // + // Special case: If we're at the top of a tile, which isn't on the topmost + // tile row, and we're allowed to loop filter across tiles, then we have a + // logical 64-pixel-high stripe which has been split into an 8-pixel high + // stripe and a 56-pixel high stripe (the current one). So, in this case, + // we want to leave the boundary alone! 
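+  // As a worked example of the sizes involved here (assuming a typical
+  // 64-pixel-wide stripe in high bit depth, use_highbd == 1):
+  // line_width = 64 + 2 * RESTORATION_EXTRA_HORZ = 72 pixels, so each saved
+  // or replaced border row below copies line_size = 72 << 1 = 144 bytes.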
+ if (!opt) { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + for (int i = -RESTORATION_BORDER; i < 0; ++i) { + const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0); + const int buf_off = buf_x0_off + buf_row * buf_stride; + const uint8_t *buf = + rsb->stripe_boundary_above + (buf_off << use_highbd); + uint8_t *dst8 = data8_tl + i * data_stride; + // Save old pixels, then replace with data from stripe_boundary_above + memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER], + REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), buf, line_size); + } + } + + // Replace RESTORATION_BORDER pixels below the bottom of the stripe. + // The second buffer row is repeated, so src_row gets the values 0, 1, 1 + // for i = 0, 1, 2. + if (copy_below) { + const int stripe_end = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; + + for (int i = 0; i < RESTORATION_BORDER; ++i) { + const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1); + const int buf_off = buf_x0_off + buf_row * buf_stride; + const uint8_t *src = + rsb->stripe_boundary_below + (buf_off << use_highbd); + + uint8_t *dst8 = data8_bl + i * data_stride; + // Save old pixels, then replace with data from stripe_boundary_below + memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), src, line_size); + } + } + } else { + if (copy_above) { + uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride; + + // Only save and overwrite i=-RESTORATION_BORDER line. + uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride; + // Save old pixels, then replace with data from stripe_boundary_above + memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), + REAL_PTR(use_highbd, + data8_tl + (-RESTORATION_BORDER + 1) * data_stride), + line_size); + } + + if (copy_below) { + const int stripe_end = limits->v_start + h; + uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride; + + // Only save and overwrite i=2 line. + uint8_t *dst8 = data8_bl + 2 * data_stride; + // Save old pixels, then replace with data from stripe_boundary_below + memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size); + memcpy(REAL_PTR(use_highbd, dst8), + REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size); + } + } +} + +// This function restores the boundary lines modified by +// setup_processing_stripe_boundary. +// +// Note: We need to be careful when handling the corners of the processing +// unit, because (eg.) the top-left corner is considered to be part of +// both the left and top borders. This means that, depending on the +// loop_filter_across_tiles_enabled flag, the corner pixels might get +// overwritten twice, once as part of the "top" border and once as part +// of the "left" border (or similar for other corners). +// +// Everything works out fine as long as we make sure to reverse the order +// when restoring, ie. we need to restore the left/right borders followed +// by the top/bottom borders. 
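+//
+// As an illustration of why this works: suppose the setup pass saves the top
+// border first and the left border second. The left-border save then captures
+// the value that the top-border setup already wrote at the shared corner
+// pixel. Restoring in the reverse order (left/right first, top/bottom last)
+// writes the original pixel value back last, so the corner ends up correct.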
+static void restore_processing_stripe_boundary(
+    const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
+    int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
+    int copy_below, int opt) {
+  const int line_width =
+      (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
+  const int line_size = line_width << use_highbd;
+
+  const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
+
+  if (!opt) {
+    if (copy_above) {
+      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
+        uint8_t *dst8 = data8_tl + i * data_stride;
+        memcpy(REAL_PTR(use_highbd, dst8),
+               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
+      }
+    }
+
+    if (copy_below) {
+      const int stripe_bottom = limits->v_start + h;
+      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+      for (int i = 0; i < RESTORATION_BORDER; ++i) {
+        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
+
+        uint8_t *dst8 = data8_bl + i * data_stride;
+        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
+      }
+    }
+  } else {
+    if (copy_above) {
+      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+      // Only restore i=-RESTORATION_BORDER line.
+      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
+      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
+    }
+
+    if (copy_below) {
+      const int stripe_bottom = limits->v_start + h;
+      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+      // Only restore i=2 line.
+      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
+        uint8_t *dst8 = data8_bl + 2 * data_stride;
+        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
+      }
+    }
+  }
+}
+
+static void wiener_filter_stripe(const RestorationUnitInfo *rui,
+                                 int stripe_width, int stripe_height,
+                                 int procunit_width, const uint8_t *src,
+                                 int src_stride, uint8_t *dst, int dst_stride,
+                                 int32_t *tmpbuf, int bit_depth) {
+  (void)tmpbuf;
+  (void)bit_depth;
+  assert(bit_depth == 8);
+  const ConvolveParams conv_params = get_conv_params_wiener(8);
+
+  for (int j = 0; j < stripe_width; j += procunit_width) {
+    int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
+    const uint8_t *src_p = src + j;
+    uint8_t *dst_p = dst + j;
+    av1_wiener_convolve_add_src(
+        src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
+        rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
+  }
+}
+
+/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
+   over the input. The window is of size (2r + 1)x(2r + 1), and we
+   specialize to r = 1 and r = 2; boxsum() asserts if any other radius
+   is requested.
+
+   Each loop follows the same format: We keep a window's worth of input
+   in individual variables and select data out of that as appropriate.
+*/
+static void boxsum1(int32_t *src, int width, int height, int src_stride,
+                    int sqr, int32_t *dst, int dst_stride) {
+  int i, j, a, b, c;
+  assert(width > 2 * SGRPROJ_BORDER_HORZ);
+  assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+  // Vertical sum over 3-pixel regions, from src into dst.
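+  // As an illustration of the windowing below: for a column with input rows
+  // s0, s1, s2, ..., the code writes dst[0] = s0 + s1 (the window is clipped
+  // at the top edge), dst[1] = s0 + s1 + s2, dst[2] = s1 + s2 + s3, and so
+  // on, with the final two rows clipped symmetrically at the bottom edge.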
+  if (!sqr) {
+    for (j = 0; j < width; ++j) {
+      a = src[j];
+      b = src[src_stride + j];
+      c = src[2 * src_stride + j];
+
+      dst[j] = a + b;
+      for (i = 1; i < height - 2; ++i) {
+        // Loop invariant: At the start of each iteration,
+        // a = src[(i - 1) * src_stride + j]
+        // b = src[(i    ) * src_stride + j]
+        // c = src[(i + 1) * src_stride + j]
+        dst[i * dst_stride + j] = a + b + c;
+        a = b;
+        b = c;
+        c = src[(i + 2) * src_stride + j];
+      }
+      dst[i * dst_stride + j] = a + b + c;
+      dst[(i + 1) * dst_stride + j] = b + c;
+    }
+  } else {
+    for (j = 0; j < width; ++j) {
+      a = src[j] * src[j];
+      b = src[src_stride + j] * src[src_stride + j];
+      c = src[2 * src_stride + j] * src[2 * src_stride + j];
+
+      dst[j] = a + b;
+      for (i = 1; i < height - 2; ++i) {
+        dst[i * dst_stride + j] = a + b + c;
+        a = b;
+        b = c;
+        c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
+      }
+      dst[i * dst_stride + j] = a + b + c;
+      dst[(i + 1) * dst_stride + j] = b + c;
+    }
+  }
+
+  // Horizontal sum over 3-pixel regions of dst
+  for (i = 0; i < height; ++i) {
+    a = dst[i * dst_stride];
+    b = dst[i * dst_stride + 1];
+    c = dst[i * dst_stride + 2];
+
+    dst[i * dst_stride] = a + b;
+    for (j = 1; j < width - 2; ++j) {
+      // Loop invariant: At the start of each iteration,
+      // a = dst[i * dst_stride + (j - 1)]
+      // b = dst[i * dst_stride + (j    )]
+      // c = dst[i * dst_stride + (j + 1)]
+      dst[i * dst_stride + j] = a + b + c;
+      a = b;
+      b = c;
+      c = dst[i * dst_stride + (j + 2)];
+    }
+    dst[i * dst_stride + j] = a + b + c;
+    dst[i * dst_stride + (j + 1)] = b + c;
+  }
+}
+
+static void boxsum2(int32_t *src, int width, int height, int src_stride,
+                    int sqr, int32_t *dst, int dst_stride) {
+  int i, j, a, b, c, d, e;
+  assert(width > 2 * SGRPROJ_BORDER_HORZ);
+  assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+  // Vertical sum over 5-pixel regions, from src into dst.
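+  // The edge handling mirrors boxsum1, but with a 5-tap window: for input
+  // rows s0, s1, ... the first two outputs are the clipped sums
+  // dst[0] = s0 + s1 + s2 and dst[1] = s0 + s1 + s2 + s3, and the last two
+  // rows are clipped symmetrically at the bottom edge.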
+  if (!sqr) {
+    for (j = 0; j < width; ++j) {
+      a = src[j];
+      b = src[src_stride + j];
+      c = src[2 * src_stride + j];
+      d = src[3 * src_stride + j];
+      e = src[4 * src_stride + j];
+
+      dst[j] = a + b + c;
+      dst[dst_stride + j] = a + b + c + d;
+      for (i = 2; i < height - 3; ++i) {
+        // Loop invariant: At the start of each iteration,
+        // a = src[(i - 2) * src_stride + j]
+        // b = src[(i - 1) * src_stride + j]
+        // c = src[(i    ) * src_stride + j]
+        // d = src[(i + 1) * src_stride + j]
+        // e = src[(i + 2) * src_stride + j]
+        dst[i * dst_stride + j] = a + b + c + d + e;
+        a = b;
+        b = c;
+        c = d;
+        d = e;
+        e = src[(i + 3) * src_stride + j];
+      }
+      dst[i * dst_stride + j] = a + b + c + d + e;
+      dst[(i + 1) * dst_stride + j] = b + c + d + e;
+      dst[(i + 2) * dst_stride + j] = c + d + e;
+    }
+  } else {
+    for (j = 0; j < width; ++j) {
+      a = src[j] * src[j];
+      b = src[src_stride + j] * src[src_stride + j];
+      c = src[2 * src_stride + j] * src[2 * src_stride + j];
+      d = src[3 * src_stride + j] * src[3 * src_stride + j];
+      e = src[4 * src_stride + j] * src[4 * src_stride + j];
+
+      dst[j] = a + b + c;
+      dst[dst_stride + j] = a + b + c + d;
+      for (i = 2; i < height - 3; ++i) {
+        dst[i * dst_stride + j] = a + b + c + d + e;
+        a = b;
+        b = c;
+        c = d;
+        d = e;
+        e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
+      }
+      dst[i * dst_stride + j] = a + b + c + d + e;
+      dst[(i + 1) * dst_stride + j] = b + c + d + e;
+      dst[(i + 2) * dst_stride + j] = c + d + e;
+    }
+  }
+
+  // Horizontal sum over 5-pixel regions of dst
+  for (i = 0; i < height; ++i) {
+    a = dst[i * dst_stride];
+    b = dst[i * dst_stride + 1];
+    c = dst[i * dst_stride + 2];
+    d = dst[i * dst_stride + 3];
+    e = dst[i * dst_stride + 4];
+
+    dst[i * dst_stride] = a + b + c;
+    dst[i * dst_stride + 1] = a + b + c + d;
+    for (j = 2; j < width - 3; ++j) {
+      // Loop invariant: At the start of each iteration,
+      // a = dst[i * dst_stride + (j - 2)]
+      // b = dst[i * dst_stride + (j - 1)]
+      // c = dst[i * dst_stride + (j    )]
+      // d = dst[i * dst_stride + (j + 1)]
+      // e = dst[i * dst_stride + (j + 2)]
+      dst[i * dst_stride + j] = a + b + c + d + e;
+      a = b;
+      b = c;
+      c = d;
+      d = e;
+      e = dst[i * dst_stride + (j + 3)];
+    }
+    dst[i * dst_stride + j] = a + b + c + d + e;
+    dst[i * dst_stride + (j + 1)] = b + c + d + e;
+    dst[i * dst_stride + (j + 2)] = c + d + e;
+  }
+}
+
+static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
+                   int sqr, int32_t *dst, int dst_stride) {
+  if (r == 1)
+    boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
+  else if (r == 2)
+    boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
+  else
+    assert(0 && "Invalid value of r in self-guided filter");
+}
+
+void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
+  if (params->r[0] == 0) {
+    xq[0] = 0;
+    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
+  } else if (params->r[1] == 0) {
+    xq[0] = xqd[0];
+    xq[1] = 0;
+  } else {
+    xq[0] = xqd[0];
+    xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
+  }
+}
+
+const int32_t av1_x_by_xplus1[256] = {
+  // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
+  // instead of 0. 
See comments in selfguided_restoration_internal() for why + 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239, + 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247, + 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250, + 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252, + 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, + 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, + 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 256, +}; + +const int32_t av1_one_by_x[MAX_NELEM] = { + 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315, + 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164, +}; + +static void calculate_intermediate_result(int32_t *dgd, int width, int height, + int dgd_stride, int bit_depth, + int sgr_params_idx, int radius_idx, + int pass, int32_t *A, int32_t *B) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. + int buf_stride = ((width_ext + 3) & ~3) + 16; + const int step = pass == 0 ? 1 : 2; + int i, j; + + assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r"); + assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 && + "Need SGRPROJ_BORDER_* >= r+1"); + + boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, + width_ext, height_ext, dgd_stride, r, 0, B, buf_stride); + boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ, + width_ext, height_ext, dgd_stride, r, 1, A, buf_stride); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie, + // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[]. + for (i = -1; i < height + 1; i += step) { + for (j = -1; j < width + 1; ++j) { + const int k = i * buf_stride + j; + const int n = (2 * r + 1) * (2 * r + 1); + + // a < 2^16 * n < 2^22 regardless of bit depth + uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8)); + // b < 2^8 * n < 2^14 regardless of bit depth + uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8); + + // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28, + // and p itself satisfies p < 2^14 * n^2 < 2^26. 
+      // This bound on p is due to:
+      // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+      //
+      // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
+      // This is an artefact of rounding, and can only happen if all pixels
+      // are (almost) identical, so in this case we saturate to p=0.
+      uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
+
+      const uint32_t s = params->s[radius_idx];
+
+      // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
+      // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
+      // (this holds even after accounting for the rounding in s)
+      const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
+
+      // Note: We have to be quite careful about the value of A[k].
+      // This is used as a blend factor between individual pixel values and the
+      // local mean. So it logically has a range of [0, 256], including both
+      // endpoints.
+      //
+      // This is a pain for hardware, as we'd like something which can be stored
+      // in exactly 8 bits.
+      // Further, in the calculation of B[k] below, if z == 0 and r == 2,
+      // then A[k] "should be" 0. But then we can end up setting B[k] to a value
+      // slightly above 2^(8 + bit depth), due to rounding in the value of
+      // av1_one_by_x[25-1].
+      //
+      // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
+      // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
+      // overflow), without significantly affecting the final result: z == 0
+      // implies that the image is essentially "flat", so the local mean and
+      // individual pixel values are very similar.
+      //
+      // Note that saturating on the other side, ie. requiring A[k] <= 255,
+      // would be a bad idea, as that corresponds to the case where the image
+      // is very variable, when we want to preserve the local pixel value as
+      // much as possible.
+      A[k] = av1_x_by_xplus1[AOMMIN(z, 255)];  // in range [1, 256]
+
+      // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
+      // av1_one_by_x[n - 1] = round(2^12 / n)
+      // => the product here is < 2^(20 + bit_depth) <= 2^32,
+      // and B[k] is set to a value < 2^(8 + bit depth)
+      // This holds even with the rounding in av1_one_by_x and in the overall
+      // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
+      B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
+                                             (uint32_t)B[k] *
+                                             (uint32_t)av1_one_by_x[n - 1],
+                                         SGRPROJ_RECIP_BITS);
+    }
+  }
+}
+
+static void selfguided_restoration_fast_internal(
+    int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
+    int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+  const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
+  const int r = params->r[radius_idx];
+  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+  // Adjusting the stride of A and B here appears to avoid bad cache effects,
+  // leading to a significant speed improvement.
+  // We also align the stride to a multiple of 16 bytes, for consistency
+  // with the SIMD version of this function.
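+  // For example, for a full 64-pixel-wide processing unit: width_ext =
+  // 64 + 2 * SGRPROJ_BORDER_HORZ = 70, which rounds up to 72 int32_t elements
+  // (a multiple of 16 bytes), and the extra 16 elements then give
+  // buf_stride = 88.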
+ int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int i, j; + calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, + sgr_params_idx, radius_idx, 1, A, B); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Use the A[] and B[] arrays to calculate the filtered image + (void)r; + assert(r == 2); + for (i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 + + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * + 5; + const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 + + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * + 5; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } else { // odd row + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 4; + const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5; + const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5; + const int32_t v = a * dgd[l] + b; + dst[m] = + ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } + } +} + +static void selfguided_restoration_internal(int32_t *dgd, int width, int height, + int dgd_stride, int32_t *dst, + int dst_stride, int bit_depth, + int sgr_params_idx, + int radius_idx) { + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes, for consistency + // with the SIMD version of this function. 
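+  // A note on the nb values used in this function and in the fast variant
+  // above: each kernel's weights sum to a power of two, 2^nb, which the
+  // final ROUND_POWER_OF_TWO accounts for. Below, the 3x3 weights are 4
+  // (center and edge taps) and 3 (corner taps), so 5 * 4 + 4 * 3 = 32 = 2^5
+  // and nb = 5. In the fast variant, the even-row kernel sums to
+  // 2 * 6 + 4 * 5 = 32 (nb = 5) and the odd-row kernel to 6 + 2 * 5 = 16 =
+  // 2^4 (nb = 4).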
+ int buf_stride = ((width_ext + 3) & ~3) + 16; + int32_t A_[RESTORATION_PROC_UNIT_PELS]; + int32_t B_[RESTORATION_PROC_UNIT_PELS]; + int32_t *A = A_; + int32_t *B = B_; + int i, j; + calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth, + sgr_params_idx, radius_idx, 0, A, B); + A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ; + + // Use the A[] and B[] arrays to calculate the filtered image + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int k = i * buf_stride + j; + const int l = i * dgd_stride + j; + const int m = i * dst_stride + j; + const int nb = 5; + const int32_t a = + (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) * + 4 + + (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] + + A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) * + 3; + const int32_t b = + (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) * + 4 + + (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] + + B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) * + 3; + const int32_t v = a * dgd[l] + b; + dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + } + } +} + +int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, int32_t *flt1, + int flt_stride, int sgr_params_idx, + int bit_depth, int highbd) { + int32_t dgd32_[RESTORATION_PROC_UNIT_PELS]; + const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ; + int32_t *dgd32 = + dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ; + + if (highbd) { + const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8); + for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { + for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { + dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j]; + } + } + } else { + for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) { + for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) { + dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j]; + } + } + } + + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. 
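+  // For example, a parameter set with r = {2, 1} runs both passes below (the
+  // fast r[0] pass writes flt0, the full r[1] pass writes flt1), while a set
+  // with r = {0, 1} produces only flt1.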
+ assert(!(params->r[0] == 0 && params->r[1] == 0)); + + if (params->r[0] > 0) + selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride, + flt0, flt_stride, bit_depth, + sgr_params_idx, 0); + if (params->r[1] > 0) + selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1, + flt_stride, bit_depth, sgr_params_idx, 1); + return 0; +} + +void av1_apply_selfguided_restoration_c(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + + const int ret = av1_selfguided_restoration_c( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + (void)ret; + assert(!ret); + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + av1_decode_xq(xqd, xq, params); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int k = i * width + j; + uint8_t *dst8ij = dst8 + i * dst_stride + j; + const uint8_t *dat8ij = dat8 + i * stride + j; + + const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij; + const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS; + int32_t v = u << SGRPROJ_PRJ_BITS; + // If params->r == 0 then we skipped the filtering in + // av1_selfguided_restoration_c, i.e. flt[k] == u + if (params->r[0] > 0) v += xq[0] * (flt0[k] - u); + if (params->r[1] > 0) v += xq[1] * (flt1[k] - u); + const int16_t w = + (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + const uint16_t out = clip_pixel_highbd(w, bit_depth); + if (highbd) + *CONVERT_TO_SHORTPTR(dst8ij) = out; + else + *dst8ij = (uint8_t)out; + } + } +} + +static void sgrproj_filter_stripe(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src, + int src_stride, uint8_t *dst, int dst_stride, + int32_t *tmpbuf, int bit_depth) { + (void)bit_depth; + assert(bit_depth == 8); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, stripe_width - j); + av1_apply_selfguided_restoration( + src + j, w, stripe_height, src_stride, rui->sgrproj_info.ep, + rui->sgrproj_info.xqd, dst + j, dst_stride, tmpbuf, bit_depth, 0); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, const uint8_t *src8, + int src_stride, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth) { + (void)tmpbuf; + const ConvolveParams conv_params = get_conv_params_wiener(bit_depth); + + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15); + const uint8_t *src8_p = src8 + j; + uint8_t *dst8_p = dst8 + j; + av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride, + rui->wiener_info.hfilter, 16, + rui->wiener_info.vfilter, 16, w, + stripe_height, &conv_params, bit_depth); + } +} + +static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui, + int stripe_width, int stripe_height, + int procunit_width, + const uint8_t *src8, int src_stride, + uint8_t *dst8, int dst_stride, + int32_t *tmpbuf, int bit_depth) { + for (int j = 0; j < stripe_width; j += procunit_width) { + int w = AOMMIN(procunit_width, stripe_width - j); + av1_apply_selfguided_restoration( + src8 + j, w, stripe_height, src_stride, 
rui->sgrproj_info.ep,
+        rui->sgrproj_info.xqd, dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
+                                  int stripe_width, int stripe_height,
+                                  int procunit_width, const uint8_t *src,
+                                  int src_stride, uint8_t *dst, int dst_stride,
+                                  int32_t *tmpbuf, int bit_depth);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#define NUM_STRIPE_FILTERS 4
+static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
+  wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
+  sgrproj_filter_stripe_highbd
+};
+#else
+#define NUM_STRIPE_FILTERS 2
+static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
+  wiener_filter_stripe, sgrproj_filter_stripe
+};
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+// Filter one restoration unit
+void av1_loop_restoration_filter_unit(
+    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
+    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
+    const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
+    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
+    int dst_stride, int32_t *tmpbuf, int optimized_lr) {
+  RestorationType unit_rtype = rui->restoration_type;
+
+  int unit_h = limits->v_end - limits->v_start;
+  int unit_w = limits->h_end - limits->h_start;
+  uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
+  uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
+
+  if (unit_rtype == RESTORE_NONE) {
+    copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
+    return;
+  }
+
+  const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
+  assert(filter_idx < NUM_STRIPE_FILTERS);
+  const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
+
+  const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
+
+  // Convolve the whole tile one stripe at a time
+  RestorationTileLimits remaining_stripes = *limits;
+  int i = 0;
+  while (i < unit_h) {
+    int copy_above, copy_below;
+    remaining_stripes.v_start = limits->v_start + i;
+
+    get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
+                             &copy_below);
+
+    const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+    const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
+
+    // Work out where this stripe's boundaries are within
+    // rsb->stripe_boundary_{above,below}
+    const int tile_stripe =
+        (remaining_stripes.v_start - tile_rect->top + runit_offset) /
+        full_stripe_height;
+    const int frame_stripe = tile_stripe0 + tile_stripe;
+    const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
+
+    // Calculate this stripe's height, based on two rules:
+    // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
+    // * We can't extend past the end of the current restoration unit
+    const int nominal_stripe_height =
+        full_stripe_height - ((tile_stripe == 0) ? 
runit_offset : 0); + const int h = AOMMIN(nominal_stripe_height, + remaining_stripes.v_end - remaining_stripes.v_start); + + setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd, + h, data8, stride, rlbs, copy_above, + copy_below, optimized_lr); + + stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride, + dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth); + + restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h, + data8, stride, copy_above, copy_below, + optimized_lr); + + i += h; + } +} + +static void filter_frame_on_unit(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + int rest_unit_idx, void *priv, int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv; + const RestorationInfo *rsi = ctxt->rsi; + + av1_loop_restoration_filter_unit( + limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect, + ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth, + ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf, + rsi->optimized_lr); +} + +void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, + YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + int num_planes) { + const SequenceHeader *const seq_params = &cm->seq_params; + const int bit_depth = seq_params->bit_depth; + const int highbd = seq_params->use_highbitdepth; + lr_ctxt->dst = &cm->rst_frame; + + const int frame_width = frame->crop_widths[0]; + const int frame_height = frame->crop_heights[0]; + if (aom_realloc_frame_buffer( + lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, + seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, + cm->features.byte_alignment, NULL, NULL, NULL) < 0) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate restoration dst buffer"); + + lr_ctxt->on_rest_unit = filter_frame_on_unit; + lr_ctxt->frame = frame; + for (int plane = 0; plane < num_planes; ++plane) { + RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationType rtype = rsi->frame_restoration_type; + rsi->optimized_lr = optimized_lr; + + if (rtype == RESTORE_NONE) { + continue; + } + + const int is_uv = plane > 0; + const int plane_width = frame->crop_widths[is_uv]; + const int plane_height = frame->crop_heights[is_uv]; + FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane]; + + av1_extend_frame(frame->buffers[plane], plane_width, plane_height, + frame->strides[is_uv], RESTORATION_BORDER, + RESTORATION_BORDER, highbd); + + lr_plane_ctxt->rsi = rsi; + lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x; + lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y; + lr_plane_ctxt->highbd = highbd; + lr_plane_ctxt->bit_depth = bit_depth; + lr_plane_ctxt->data8 = frame->buffers[plane]; + lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane]; + lr_plane_ctxt->data_stride = frame->strides[is_uv]; + lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv]; + lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv); + lr_plane_ctxt->tile_stripe0 = 0; + } +} + +void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, + AV1_COMMON *cm, int num_planes) { + typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, + int vstart, int vend); + static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y, + aom_yv12_partial_coloc_copy_u, + aom_yv12_partial_coloc_copy_v }; + assert(num_planes <= 3); + for (int plane = 0; plane < 
num_planes; ++plane) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+    AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
+    copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
+                     tile_rect.right, tile_rect.top, tile_rect.bottom);
+  }
+}
+
+static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
+                                        int num_planes) {
+  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
+      continue;
+    }
+
+    av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
+                                   &ctxt[plane], &ctxt[plane].tile_rect,
+                                   cm->rst_tmpbuf, cm->rlbs);
+  }
+}
+
+void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
+                                       AV1_COMMON *cm, int optimized_lr,
+                                       void *lr_ctxt) {
+  assert(!cm->features.all_lossless);
+  const int num_planes = av1_num_planes(cm);
+
+  AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
+
+  av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
+                                         optimized_lr, num_planes);
+
+  foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
+
+  av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
+}
+
+void av1_foreach_rest_unit_in_row(
+    RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
+    rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
+    int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
+    void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+    sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
+    struct AV1LrSyncData *const lr_sync) {
+  const int tile_w = tile_rect->right - tile_rect->left;
+  const int ext_size = unit_size * 3 / 2;
+  int x0 = 0, j = 0;
+  while (x0 < tile_w) {
+    int remaining_w = tile_w - x0;
+    int w = (remaining_w < ext_size) ? remaining_w : unit_size;
+
+    limits->h_start = tile_rect->left + x0;
+    limits->h_end = tile_rect->left + x0 + w;
+    assert(limits->h_end <= tile_rect->right);
+
+    const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
+
+    // No sync is needed for even numbered rows.
+    // For odd numbered rows, loop restoration of the current block requires
+    // the LR of the top-right and bottom-right blocks to be completed.
+
+    // top-right sync
+    on_sync_read(lr_sync, row_number, j, plane);
+    if ((row_number + 1) < vunits_per_tile)
+      // bottom-right sync
+      on_sync_read(lr_sync, row_number + 2, j, plane);
+
+    on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
+
+    on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane);
+
+    x0 += w;
+    ++j;
+  }
+}
+
+void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
+  (void)lr_sync;
+  (void)r;
+  (void)c;
+  (void)plane;
+}
+
+void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
+                             const int sb_cols, int plane) {
+  (void)lr_sync;
+  (void)r;
+  (void)c;
+  (void)sb_cols;
+  (void)plane;
+}
+
+static void foreach_rest_unit_in_tile(
+    const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
+    int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
+    int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
+    int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
+  const int tile_h = tile_rect->bottom - tile_rect->top;
+  const int ext_size = unit_size * 3 / 2;
+
+  const int tile_idx = tile_col + tile_row * tile_cols;
+  const int unit_idx0 = tile_idx * units_per_tile;
+
+  int y0 = 0, i = 0;
+  while (y0 < tile_h) {
+    int remaining_h = tile_h - y0;
+    int h = (remaining_h < ext_size) ? 
remaining_h : unit_size;
+
+    RestorationTileLimits limits;
+    limits.v_start = tile_rect->top + y0;
+    limits.v_end = tile_rect->top + y0 + h;
+    assert(limits.v_end <= tile_rect->bottom);
+    // Offset the tile upwards to align with the restoration processing stripe
+    const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+    limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
+    if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
+
+    av1_foreach_rest_unit_in_row(
+        &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
+        hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
+        av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
+
+    y0 += h;
+    ++i;
+  }
+}
+
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+                                    rest_unit_visitor_t on_rest_unit,
+                                    void *priv, AV1PixelRect *tile_rect,
+                                    int32_t *tmpbuf,
+                                    RestorationLineBuffers *rlbs) {
+  const int is_uv = plane > 0;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+
+  const RestorationInfo *rsi = &cm->rst_info[plane];
+
+  foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
+                            rsi->horz_units_per_tile, rsi->vert_units_per_tile,
+                            rsi->units_per_tile, rsi->restoration_unit_size,
+                            ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
+}
+
+int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
+                                       int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                       int *rcol0, int *rcol1, int *rrow0,
+                                       int *rrow1) {
+  assert(rcol0 && rcol1 && rrow0 && rrow1);
+
+  if (bsize != cm->seq_params.sb_size) return 0;
+  if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
+
+  assert(!cm->features.all_lossless);
+
+  const int is_uv = plane > 0;
+
+  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+  const int tile_w = tile_rect.right - tile_rect.left;
+  const int tile_h = tile_rect.bottom - tile_rect.top;
+
+  const int mi_top = 0;
+  const int mi_left = 0;
+
+  // Compute the mi-unit corners of the superblock relative to the top-left of
+  // the tile
+  const int mi_rel_row0 = mi_row - mi_top;
+  const int mi_rel_col0 = mi_col - mi_left;
+  const int mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize];
+  const int mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize];
+
+  const RestorationInfo *rsi = &cm->rst_info[plane];
+  const int size = rsi->restoration_unit_size;
+
+  // Calculate the number of restoration units in this tile (which might be
+  // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile)
+  const int horz_units = av1_lr_count_units_in_tile(size, tile_w);
+  const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
+
+  // The size of an MI-unit on this plane of the image
+  const int ss_x = is_uv && cm->seq_params.subsampling_x;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int mi_size_x = MI_SIZE >> ss_x;
+  const int mi_size_y = MI_SIZE >> ss_y;
+
+  // Write m for the relative mi column or row, D for the superres denominator
+  // and N for the superres numerator. If u is the upscaled pixel offset then
+  // we can write the downscaled pixel offset in two ways as:
+  //
+  //   MI_SIZE * m = (N / D) * u
+  //
+  // from which we get u = D * MI_SIZE * m / N
+  const int mi_to_num_x = av1_superres_scaled(cm)
+                              ? mi_size_x * cm->superres_scale_denominator
+                              : mi_size_x;
+  const int mi_to_num_y = mi_size_y;
+  const int denom_x = av1_superres_scaled(cm) ? 
size * SCALE_NUMERATOR : size;
+  const int denom_y = size;
+
+  const int rnd_x = denom_x - 1;
+  const int rnd_y = denom_y - 1;
+
+  // rcol0/rrow0 should be the first column/row of restoration units (relative
+  // to the top-left of the tile) that doesn't start to the left of/above
+  // mi_col/mi_row. For this calculation, we need to round up the division (if
+  // the sb starts at runit column 10.1, the first matching runit has column
+  // index 11)
+  *rcol0 = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x;
+  *rrow0 = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y;
+
+  // *rcol1/*rrow1 is the equivalent calculation, but for the superblock's
+  // bottom-right corner. If we're at the bottom or right of the tile, this
+  // restoration unit might not exist, in which case we'll clamp accordingly.
+  *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
+  *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
+
+  return *rcol0 < *rcol1 && *rrow0 < *rrow1;
+}
+
+// Extend to left and right
+static void extend_lines(uint8_t *buf, int width, int height, int stride,
+                         int extend, int use_highbitdepth) {
+  for (int i = 0; i < height; ++i) {
+    if (use_highbitdepth) {
+      uint16_t *buf16 = (uint16_t *)buf;
+      aom_memset16(buf16 - extend, buf16[0], extend);
+      aom_memset16(buf16 + width, buf16[width - 1], extend);
+    } else {
+      memset(buf - extend, buf[0], extend);
+      memset(buf + width, buf[width - 1], extend);
+    }
+    buf += stride;
+  }
+}
+
+static void save_deblock_boundary_lines(
+    const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
+    int stripe, int use_highbd, int is_above,
+    RestorationStripeBoundaries *boundaries) {
+  const int is_uv = plane > 0;
+  const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
+  const int src_stride = frame->strides[is_uv] << use_highbd;
+  const uint8_t *src_rows = src_buf + row * src_stride;
+
+  uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
+                               : boundaries->stripe_boundary_below;
+  uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);
+  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
+  uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;
+
+  // There is a rare case in which a processing stripe can end 1px above the
+  // crop border. In this case, we do want to use deblocked pixels from below
+  // the stripe (hence why we ended up in this function), but instead of
+  // fetching 2 "below" rows we need to fetch one and duplicate it. 
+ // This is equivalent to clamping the sample locations against the crop border + const int lines_to_save = + AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row); + assert(lines_to_save == 1 || lines_to_save == 2); + + int upscaled_width; + int line_bytes; + if (av1_superres_scaled(cm)) { + const int ss_x = is_uv && cm->seq_params.subsampling_x; + upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x; + line_bytes = upscaled_width << use_highbd; + if (use_highbd) + av1_upscale_normative_rows( + cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv], + CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride, + plane, lines_to_save); + else + av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows, + boundaries->stripe_boundary_stride, plane, + lines_to_save); + } else { + upscaled_width = frame->crop_widths[is_uv]; + line_bytes = upscaled_width << use_highbd; + for (int i = 0; i < lines_to_save; i++) { + memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride, + line_bytes); + } + } + // If we only saved one line, then copy it into the second line buffer + if (lines_to_save == 1) + memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes); + + extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride, + RESTORATION_EXTRA_HORZ, use_highbd); +} + +static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame, + const AV1_COMMON *cm, int plane, int row, + int stripe, int use_highbd, int is_above, + RestorationStripeBoundaries *boundaries) { + const int is_uv = plane > 0; + const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); + const int src_stride = frame->strides[is_uv] << use_highbd; + const uint8_t *src_rows = src_buf + row * src_stride; + + uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above + : boundaries->stripe_boundary_below; + uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd); + const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd; + uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride; + const int src_width = frame->crop_widths[is_uv]; + + // At the point where this function is called, we've already applied + // superres. So we don't need to extend the lines here, we can just + // pull directly from the topmost row of the upscaled frame. + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int upscaled_width = av1_superres_scaled(cm) + ? (cm->superres_upscaled_width + ss_x) >> ss_x + : src_width; + const int line_bytes = upscaled_width << use_highbd; + for (int i = 0; i < RESTORATION_CTX_VERT; i++) { + // Copy the line at 'row' into both context lines. This is because + // we want to (effectively) extend the outermost row of CDEF data + // from this tile to produce a border, rather than using deblocked + // pixels from the tile above/below. 
+    memcpy(bdry_rows + i * bdry_stride, src_rows, line_bytes);
+  }
+  extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
+               RESTORATION_EXTRA_HORZ, use_highbd);
+}
+
+static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+                                         int use_highbd, int plane,
+                                         AV1_COMMON *cm, int after_cdef) {
+  const int is_uv = plane > 0;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
+
+  // Get the tile rectangle, with height rounded up to the next multiple of 8
+  // luma pixels (only relevant for the bottom tile of the frame)
+  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+  const int stripe0 = 0;
+
+  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
+
+  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
+
+  int tile_stripe;
+  for (tile_stripe = 0;; ++tile_stripe) {
+    const int rel_y0 = AOMMAX(0, tile_stripe * stripe_height - stripe_off);
+    const int y0 = tile_rect.top + rel_y0;
+    if (y0 >= tile_rect.bottom) break;
+
+    const int rel_y1 = (tile_stripe + 1) * stripe_height - stripe_off;
+    const int y1 = AOMMIN(tile_rect.top + rel_y1, tile_rect.bottom);
+
+    const int frame_stripe = stripe0 + tile_stripe;
+
+    // We should only use CDEF pixels at the top and bottom of the frame as a
+    // whole; internal tile boundaries can use deblocked pixels from adjacent
+    // tiles for context.
+    const int use_deblock_above = (frame_stripe > 0);
+    const int use_deblock_below = (y1 < plane_height);
+
+    if (!after_cdef) {
+      // Save deblocked context where needed.
+      if (use_deblock_above) {
+        save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
+                                    frame_stripe, use_highbd, 1, boundaries);
+      }
+      if (use_deblock_below) {
+        save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe,
+                                    use_highbd, 0, boundaries);
+      }
+    } else {
+      // Save CDEF context where needed. Note that we need to save the CDEF
+      // context for a particular boundary iff we *didn't* save deblocked
+      // context for that boundary.
+      //
+      // In addition, we need to save copies of the outermost line within
+      // the tile, rather than using data from outside the tile.
+      if (!use_deblock_above) {
+        save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe, use_highbd,
+                                 1, boundaries);
+      }
+      if (!use_deblock_below) {
+        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe,
+                                 use_highbd, 0, boundaries);
+      }
+    }
+  }
+}
+
+// For each RESTORATION_PROC_UNIT_SIZE pixel high stripe, save 4 scan lines
+// (RESTORATION_CTX_VERT each above and below) to be used as boundary context
+// in the loop restoration process. The lines are saved in the
+// RestorationStripeBoundaries of the corresponding cm->rst_info[plane].
+void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+                                              AV1_COMMON *cm, int after_cdef) {
+  const int num_planes = av1_num_planes(cm);
+  const int use_highbd = cm->seq_params.use_highbitdepth;
+  for (int p = 0; p < num_planes; ++p) {
+    save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
+  }
+}
diff --git a/libs/libaom/src/av1/common/restoration.h b/libs/libaom/src/av1/common/restoration.h
new file mode 100644
index 000000000..3b80dd5a9
--- /dev/null
+++ b/libs/libaom/src/av1/common/restoration.h
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RESTORATION_H_
+#define AOM_AV1_COMMON_RESTORATION_H_
+
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Border for Loop restoration buffer
+#define AOM_RESTORATION_FRAME_BORDER 32
+#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
+#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
+
+#define RESTORATION_PROC_UNIT_SIZE 64
+
+// Filter tile grid offset upwards compared to the superblock grid
+#define RESTORATION_UNIT_OFFSET 8
+
+#define SGRPROJ_BORDER_VERT 3  // Vertical border used for Sgr
+#define SGRPROJ_BORDER_HORZ 3  // Horizontal border used for Sgr
+
+#define WIENER_BORDER_VERT 2  // Vertical border used for Wiener
+#define WIENER_HALFWIN 3
+#define WIENER_BORDER_HORZ (WIENER_HALFWIN)  // Horizontal border for Wiener
+
+// RESTORATION_BORDER_VERT determines line buffer requirement for LR.
+// Should be set at the max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT.
+// Note the line buffer needed is twice the value of this macro.
+#if SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
+#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT)
+#else
+#define RESTORATION_BORDER_VERT (WIENER_BORDER_VERT)
+#endif  // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
+
+#if SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
+#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ)
+#else
+#define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ)
+#endif  // SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
+
+// How many border pixels do we need for each processing unit?
+#define RESTORATION_BORDER 3
+
+// How many rows of deblocked pixels do we save above/below each processing
+// stripe?
+#define RESTORATION_CTX_VERT 2
+
+// Additional pixels to the left and right in above/below buffers
+// It is RESTORATION_BORDER_HORZ rounded up to get nicer buffer alignment
+#define RESTORATION_EXTRA_HORZ 4
+
+// Pad up to 20 more (much less may be needed)
+#define RESTORATION_PADDING 20
+#define RESTORATION_PROC_UNIT_PELS                             \
+  ((RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_HORZ * 2 + \
+    RESTORATION_PADDING) *                                     \
+   (RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \
+    RESTORATION_PADDING))
+
+#define RESTORATION_UNITSIZE_MAX 256
+#define RESTORATION_UNITPELS_HORZ_MAX \
+  (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)
+#define RESTORATION_UNITPELS_VERT_MAX                                 \
+  ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \
+    RESTORATION_UNIT_OFFSET))
+#define RESTORATION_UNITPELS_MAX \
+  (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX)
+
+// Two 32-bit buffers needed for the restored versions from two filters
+// TODO(debargha, rupert): Refactor to not need the large tilesize to be stored
+// on the decoder side. 
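+// Working through the macros above: RESTORATION_UNITPELS_HORZ_MAX =
+// 256 * 3 / 2 + 2 * 3 + 16 = 406 and RESTORATION_UNITPELS_VERT_MAX =
+// 256 * 3 / 2 + 2 * 3 + 8 = 398, so the two int32_t filter buffers sized
+// below come to 406 * 398 * 2 * 4 = 1292704 bytes (roughly 1.2 MiB).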
+#define SGRPROJ_TMPBUF_SIZE (RESTORATION_UNITPELS_MAX * 2 * sizeof(int32_t))
+
+#define SGRPROJ_EXTBUF_SIZE (0)
+#define SGRPROJ_PARAMS_BITS 4
+#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
+
+// Precision bits for projection
+#define SGRPROJ_PRJ_BITS 7
+// Restoration precision bits generated higher than source before projection
+#define SGRPROJ_RST_BITS 4
+// Internal precision bits for core selfguided_restoration
+#define SGRPROJ_SGR_BITS 8
+#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
+
+#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
+#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
+#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
+#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
+
+#define SGRPROJ_PRJ_SUBEXP_K 4
+
+#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS)
+
+#define MAX_RADIUS 2  // Only 1 or 2 allowed (see boxsum() in restoration.c)
+#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
+#define SGRPROJ_MTABLE_BITS 20
+#define SGRPROJ_RECIP_BITS 12
+
+#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
+#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
+#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
+#define WIENER_TMPBUF_SIZE (0)
+#define WIENER_EXTBUF_SIZE (0)
+
+// If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for
+// chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN.
+#define WIENER_WIN_CHROMA (WIENER_WIN - 2)
+#define WIENER_WIN_REDUCED (WIENER_WIN - 2)
+#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
+
+#define WIENER_FILT_PREC_BITS 7
+#define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
+
+// Central values for the taps
+#define WIENER_FILT_TAP0_MIDV (3)
+#define WIENER_FILT_TAP1_MIDV (-7)
+#define WIENER_FILT_TAP2_MIDV (15)
+#define WIENER_FILT_TAP3_MIDV                                              \
+  (WIENER_FILT_STEP - 2 * (WIENER_FILT_TAP0_MIDV + WIENER_FILT_TAP1_MIDV + \
+                           WIENER_FILT_TAP2_MIDV))
+
+#define WIENER_FILT_TAP0_BITS 4
+#define WIENER_FILT_TAP1_BITS 5
+#define WIENER_FILT_TAP2_BITS 6
+
+#define WIENER_FILT_BITS \
+  ((WIENER_FILT_TAP0_BITS + WIENER_FILT_TAP1_BITS + WIENER_FILT_TAP2_BITS) * 2)
+
+#define WIENER_FILT_TAP0_MINV \
+  (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
+#define WIENER_FILT_TAP1_MINV \
+  (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
+#define WIENER_FILT_TAP2_MINV \
+  (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
+
+#define WIENER_FILT_TAP0_MAXV \
+  (WIENER_FILT_TAP0_MIDV - 1 + (1 << WIENER_FILT_TAP0_BITS) / 2)
+#define WIENER_FILT_TAP1_MAXV \
+  (WIENER_FILT_TAP1_MIDV - 1 + (1 << WIENER_FILT_TAP1_BITS) / 2)
+#define WIENER_FILT_TAP2_MAXV \
+  (WIENER_FILT_TAP2_MIDV - 1 + (1 << WIENER_FILT_TAP2_BITS) / 2)
+
+#define WIENER_FILT_TAP0_SUBEXP_K 1
+#define WIENER_FILT_TAP1_SUBEXP_K 2
+#define WIENER_FILT_TAP2_SUBEXP_K 3
+
+// Max of SGRPROJ_TMPBUF_SIZE, WIENER_TMPBUF_SIZE
+#define RESTORATION_TMPBUF_SIZE (SGRPROJ_TMPBUF_SIZE)
+
+// Max of SGRPROJ_EXTBUF_SIZE, WIENER_EXTBUF_SIZE
+#define RESTORATION_EXTBUF_SIZE (WIENER_EXTBUF_SIZE)
+
+// Check the assumptions of the existing code
+#if SUBPEL_TAPS != WIENER_WIN + 1
+#error "Wiener filter currently only works if SUBPEL_TAPS == WIENER_WIN + 1"
+#endif
+#if WIENER_FILT_PREC_BITS != 7
+#error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7"
+#endif
+
+#define LR_TILE_ROW 0
+#define LR_TILE_COL 0
+#define LR_TILE_COLS 1
+
+typedef struct {
+  int r[2];  // radii
+  int s[2];  // sgr parameters for r[0] and r[1], based on 
GenSgrprojVtable() +} sgr_params_type; + +typedef struct { + RestorationType restoration_type; + WienerInfo wiener_info; + SgrprojInfo sgrproj_info; +} RestorationUnitInfo; + +// A restoration line buffer needs space for two lines plus a horizontal filter +// margin of RESTORATION_EXTRA_HORZ on each side. +#define RESTORATION_LINEBUFFER_WIDTH \ + (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_EXTRA_HORZ) + +// Similarly, the column buffers (used when we're at a vertical tile edge +// that we can't filter across) need space for one processing unit's worth +// of pixels, plus the top/bottom border width +#define RESTORATION_COLBUFFER_HEIGHT \ + (RESTORATION_PROC_UNIT_SIZE + 2 * RESTORATION_BORDER) + +typedef struct { + // Temporary buffers to save/restore 3 lines above/below the restoration + // stripe. + uint16_t tmp_save_above[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH]; + uint16_t tmp_save_below[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH]; +} RestorationLineBuffers; + +typedef struct { + uint8_t *stripe_boundary_above; + uint8_t *stripe_boundary_below; + int stripe_boundary_stride; + int stripe_boundary_size; +} RestorationStripeBoundaries; + +typedef struct { + RestorationType frame_restoration_type; + int restoration_unit_size; + + // Fields below here are allocated and initialised by + // av1_alloc_restoration_struct. (horz_)units_per_tile give the number of + // restoration units in (one row of) the largest tile in the frame. The data + // in unit_info is laid out with units_per_tile entries for each tile, which + // have stride horz_units_per_tile. + // + // Even if there are tiles of different sizes, the data in unit_info is laid + // out as if all tiles are of full size. + int units_per_tile; + int vert_units_per_tile, horz_units_per_tile; + RestorationUnitInfo *unit_info; + RestorationStripeBoundaries boundaries; + int optimized_lr; +} RestorationInfo; + +static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) { + sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2; + sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2; +} + +static INLINE void set_default_wiener(WienerInfo *wiener_info) { + wiener_info->vfilter[0] = wiener_info->hfilter[0] = WIENER_FILT_TAP0_MIDV; + wiener_info->vfilter[1] = wiener_info->hfilter[1] = WIENER_FILT_TAP1_MIDV; + wiener_info->vfilter[2] = wiener_info->hfilter[2] = WIENER_FILT_TAP2_MIDV; + wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] = + -2 * + (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV); + wiener_info->vfilter[4] = wiener_info->hfilter[4] = WIENER_FILT_TAP2_MIDV; + wiener_info->vfilter[5] = wiener_info->hfilter[5] = WIENER_FILT_TAP1_MIDV; + wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV; +} + +typedef struct { + int h_start, h_end, v_start, v_end; +} RestorationTileLimits; + +typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs); + +typedef struct FilterFrameCtxt { + const RestorationInfo *rsi; + int tile_stripe0; + int ss_x, ss_y; + int highbd, bit_depth; + uint8_t *data8, *dst8; + int data_stride, dst_stride; + AV1PixelRect tile_rect; +} FilterFrameCtxt; + +typedef struct AV1LrStruct { + rest_unit_visitor_t on_rest_unit; + FilterFrameCtxt ctxt[MAX_MB_PLANE]; + YV12_BUFFER_CONFIG *frame; + YV12_BUFFER_CONFIG *dst; +} AV1LrStruct; + +extern const sgr_params_type 
av1_sgr_params[SGRPROJ_PARAMS];
+extern int sgrproj_mtable[SGRPROJ_PARAMS][2];
+extern const int32_t av1_x_by_xplus1[256];
+extern const int32_t av1_one_by_x[MAX_NELEM];
+
+void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi,
+                                  int is_uv);
+void av1_free_restoration_struct(RestorationInfo *rst_info);
+
+void av1_extend_frame(uint8_t *data, int width, int height, int stride,
+                      int border_horz, int border_vert, int highbd);
+void av1_decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
+
+// Filter a single loop restoration unit.
+//
+// limits gives the bounds of the unit. rui gives the mode to use for this unit
+// and its coefficients. If striped loop restoration is enabled, rsb contains
+// deblocked pixels to use for stripe boundaries; rlbs is just some space to
+// use as a scratch buffer. tile_rect gives the limits of the tile containing
+// this unit. tile_stripe0 is the index of the first stripe in this tile.
+//
+// ss_x and ss_y are flags which should be 1 if this is a plane with
+// horizontal/vertical subsampling, respectively. highbd is a flag which should
+// be 1 in high bit depth mode, in which case bit_depth is the bit depth.
+//
+// data8 is the frame data (pointing at the top-left corner of the frame, not
+// the restoration unit) and stride is its stride. dst8 is the buffer where the
+// results will be written and has stride dst_stride. Like data8, dst8 should
+// point at the top-left corner of the frame.
+//
+// Finally, tmpbuf is a scratch buffer used by the sgrproj filter which should
+// be at least SGRPROJ_TMPBUF_SIZE bytes.
+void av1_loop_restoration_filter_unit(
+    const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
+    const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
+    const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
+    int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
+    int dst_stride, int32_t *tmpbuf, int optimized_lr);
+
+void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
+                                       struct AV1Common *cm, int optimized_lr,
+                                       void *lr_ctxt);
+void av1_loop_restoration_precal();
+
+typedef void (*rest_tile_start_visitor_t)(int tile_row, int tile_col,
+                                          void *priv);
+struct AV1LrSyncData;
+
+typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane);
+
+typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c,
+                                const int sb_cols, int plane);
+
+// Call on_rest_unit for each loop restoration unit in the plane.
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+                                    rest_unit_visitor_t on_rest_unit,
+                                    void *priv, AV1PixelRect *tile_rect,
+                                    int32_t *tmpbuf,
+                                    RestorationLineBuffers *rlbs);
+
+// Return 1 iff the block at mi_row, mi_col with size bsize is a
+// top-level superblock containing the top-left corner of at least one
+// loop restoration unit.
+//
+// If the block is a top-level superblock, the function writes to
+// *rcol0, *rcol1, *rrow0, *rrow1. The rectangle of restoration unit
+// indices given by [*rcol0, *rcol1) x [*rrow0, *rrow1) is relative
+// to the current tile.
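+//
+// A typical caller (a sketch only; rsi stands for the plane's
+// RestorationInfo and is illustrative, not part of this interface):
+//
+//   int rcol0, rcol1, rrow0, rrow1;
+//   if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+//                                          &rcol0, &rcol1, &rrow0, &rrow1)) {
+//     for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+//       for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+//         const int runit_idx = rcol + rrow * rsi->horz_units_per_tile;
+//         // ... read or update rsi->unit_info[runit_idx] ...
+//       }
+//     }
+//   }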
+int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int *rcol0, int *rcol1, int *rrow0, + int *rrow1); + +void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int after_cdef); +void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, + YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int optimized_lr, int num_planes); +void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt, + struct AV1Common *cm, int num_planes); +void av1_foreach_rest_unit_in_row( + RestorationTileLimits *limits, const AV1PixelRect *tile_rect, + rest_unit_visitor_t on_rest_unit, int row_number, int unit_size, + int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane, + void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs, + sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write, + struct AV1LrSyncData *const lr_sync); +AV1PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv); +int av1_lr_count_units_in_tile(int unit_size, int tile_size); +void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane); +void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c, + const int sb_cols, int plane); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_RESTORATION_H_ diff --git a/libs/libaom/src/av1/common/scale.c b/libs/libaom/src/av1/common/scale.c new file mode 100644 index 000000000..3b14c0a2c --- /dev/null +++ b/libs/libaom/src/av1/common/scale.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/common/filter.h" +#include "av1/common/scale.h" +#include "aom_dsp/aom_filter.h" + +// Note: Expect val to be in q4 precision +static INLINE int scaled_x(int val, const struct scale_factors *sf) { + const int off = + (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1)); + const int64_t tval = (int64_t)val * sf->x_scale_fp + off; + return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, + REF_SCALE_SHIFT - SCALE_EXTRA_BITS); +} + +// Note: Expect val to be in q4 precision +static INLINE int scaled_y(int val, const struct scale_factors *sf) { + const int off = + (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1)); + const int64_t tval = (int64_t)val * sf->y_scale_fp + off; + return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, + REF_SCALE_SHIFT - SCALE_EXTRA_BITS); +} + +// Note: Expect val to be in q4 precision +static int unscaled_value(int val, const struct scale_factors *sf) { + (void)sf; + return val * (1 << SCALE_EXTRA_BITS); +} + +static int get_fixed_point_scale_factor(int other_size, int this_size) { + // Calculate scaling factor once for each reference frame + // and use fixed point scaling factors in decoding and encoding routines. + // Hardware implementations can calculate scale factor in device driver + // and use multiplication and shifting on hardware instead of division. 
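+  // Illustrative numbers, assuming REF_SCALE_SHIFT == 14 (see scale.h):
+  // scaling a 1920-wide reference to a 960-wide frame gives
+  // ((1920 << 14) + 480) / 960 == 2 << 14, i.e. 2.0 in Q14 fixed point,
+  // while equal sizes give exactly 1 << 14 (REF_NO_SCALE).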
+ return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size; +} + +// Given the fixed point scale, calculate coarse point scale. +static int fixed_point_scale_to_coarse_point_scale(int scale_fp) { + return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS); +} + +// Note: x and y are integer precision, mvq4 is q4 precision. +MV32 av1_scale_mv(const MV *mvq4, int x, int y, + const struct scale_factors *sf) { + const int x_off_q4 = scaled_x(x << SUBPEL_BITS, sf); + const int y_off_q4 = scaled_y(y << SUBPEL_BITS, sf); + const MV32 res = { scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4, + scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4 }; + return res; +} + +void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, + int other_h, int this_w, int this_h) { + if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) { + sf->x_scale_fp = REF_INVALID_SCALE; + sf->y_scale_fp = REF_INVALID_SCALE; + return; + } + + sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); + + sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp); + sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp); + + if (av1_is_scaled(sf)) { + sf->scale_value_x = scaled_x; + sf->scale_value_y = scaled_y; + } else { + sf->scale_value_x = unscaled_value; + sf->scale_value_y = unscaled_value; + } + + // AV1 convolve functions + // Special case convolve functions should produce the same result as + // av1_convolve_2d. + // subpel_x_qn == 0 && subpel_y_qn == 0 + sf->convolve[0][0][0] = av1_convolve_2d_copy_sr; + // subpel_x_qn == 0 + sf->convolve[0][1][0] = av1_convolve_y_sr; + // subpel_y_qn == 0 + sf->convolve[1][0][0] = av1_convolve_x_sr; + // subpel_x_qn != 0 && subpel_y_qn != 0 + sf->convolve[1][1][0] = av1_convolve_2d_sr; + // subpel_x_qn == 0 && subpel_y_qn == 0 + sf->convolve[0][0][1] = av1_dist_wtd_convolve_2d_copy; + // subpel_x_qn == 0 + sf->convolve[0][1][1] = av1_dist_wtd_convolve_y; + // subpel_y_qn == 0 + sf->convolve[1][0][1] = av1_dist_wtd_convolve_x; + // subpel_x_qn != 0 && subpel_y_qn != 0 + sf->convolve[1][1][1] = av1_dist_wtd_convolve_2d; +#if CONFIG_AV1_HIGHBITDEPTH + // AV1 High BD convolve functions + // Special case convolve functions should produce the same result as + // av1_highbd_convolve_2d. + // subpel_x_qn == 0 && subpel_y_qn == 0 + sf->highbd_convolve[0][0][0] = av1_highbd_convolve_2d_copy_sr; + // subpel_x_qn == 0 + sf->highbd_convolve[0][1][0] = av1_highbd_convolve_y_sr; + // subpel_y_qn == 0 + sf->highbd_convolve[1][0][0] = av1_highbd_convolve_x_sr; + // subpel_x_qn != 0 && subpel_y_qn != 0 + sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr; + // subpel_x_qn == 0 && subpel_y_qn == 0 + sf->highbd_convolve[0][0][1] = av1_highbd_dist_wtd_convolve_2d_copy; + // subpel_x_qn == 0 + sf->highbd_convolve[0][1][1] = av1_highbd_dist_wtd_convolve_y; + // subpel_y_qn == 0 + sf->highbd_convolve[1][0][1] = av1_highbd_dist_wtd_convolve_x; + // subpel_x_qn != 0 && subpel_y_qn != 0 + sf->highbd_convolve[1][1][1] = av1_highbd_dist_wtd_convolve_2d; +#endif +} diff --git a/libs/libaom/src/av1/common/scale.h b/libs/libaom/src/av1/common/scale.h new file mode 100644 index 000000000..16b40bde8 --- /dev/null +++ b/libs/libaom/src/av1/common/scale.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_SCALE_H_ +#define AOM_AV1_COMMON_SCALE_H_ + +#include "av1/common/convolve.h" +#include "av1/common/mv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define SCALE_NUMERATOR 8 + +#define REF_SCALE_SHIFT 14 +#define REF_NO_SCALE (1 << REF_SCALE_SHIFT) +#define REF_INVALID_SCALE -1 + +struct scale_factors { + int x_scale_fp; // horizontal fixed point scale factor + int y_scale_fp; // vertical fixed point scale factor + int x_step_q4; + int y_step_q4; + + int (*scale_value_x)(int val, const struct scale_factors *sf); + int (*scale_value_y)(int val, const struct scale_factors *sf); + + // convolve_fn_ptr[subpel_x != 0][subpel_y != 0][is_compound] + aom_convolve_fn_t convolve[2][2][2]; + aom_highbd_convolve_fn_t highbd_convolve[2][2][2]; +}; + +MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); + +void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, + int other_h, int this_w, int this_h); + +static INLINE int av1_is_valid_scale(const struct scale_factors *sf) { + assert(sf != NULL); + return sf->x_scale_fp != REF_INVALID_SCALE && + sf->y_scale_fp != REF_INVALID_SCALE; +} + +static INLINE int av1_is_scaled(const struct scale_factors *sf) { + assert(sf != NULL); + return av1_is_valid_scale(sf) && + (sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE); +} + +static INLINE int valid_ref_frame_size(int ref_width, int ref_height, + int this_width, int this_height) { + return 2 * this_width >= ref_width && 2 * this_height >= ref_height && + this_width <= 16 * ref_width && this_height <= 16 * ref_height; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_SCALE_H_ diff --git a/libs/libaom/src/av1/common/scan.c b/libs/libaom/src/av1/common/scan.c new file mode 100644 index 000000000..c1d4f3581 --- /dev/null +++ b/libs/libaom/src/av1/common/scan.c @@ -0,0 +1,2048 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+
+#include "av1/common/common_data.h"
+#include "av1/common/scan.h"
+
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_4x4[16]) = { 0, 1, 4, 8, 5, 2, 3, 6,
+                                          9, 12, 13, 10, 7, 11, 14, 15 };
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
+  0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = {
+  0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+  17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = {
+  0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+  2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = {
+  0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19,
+  12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = {
+  0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x16[64]) = {
+  0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+  17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+  33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+  49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x4[64]) = {
+  0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35,
+  20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39,
+  24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43,
+  28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x16[64]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x4[64]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x16[64]) = {
+  0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+  1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+  2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+  3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x4[64]) = {
+  0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+  4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+  8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+  12, 28, 44,
60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, + 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, + 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, + 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, + 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, + 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, + 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, + 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116, + 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125, + 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134, + 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143, + 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200, + 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209, + 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, + 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227, + 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, + 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x8[256]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226, + 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10, + 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43, + 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76, + 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109, + 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142, + 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175, + 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208, + 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241, + 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25, + 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58, + 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91, + 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124, + 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126, + 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x32[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 
189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x8[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x32[256]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, + 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, + 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, + 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, + 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67, + 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, + 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52, + 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, + 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22, + 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, + 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7, + 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, + 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x8[256]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225, + 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227, + 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229, + 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231, + 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233, + 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235, + 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237, + 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 
143, 175, 207, 239, + 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241, + 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243, + 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245, + 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247, + 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249, + 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251, + 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253, + 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = { + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = { + 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, + 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, + 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, + 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, + 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, + 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, + 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107, + 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110, + 117, 124, 111, 118, 125, 119, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x8[128]) = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80, + 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67, + 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69, + 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71, + 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73, + 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75, + 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62, + 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, + 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, + 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, + 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, + 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, + 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, + 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, + 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x8[128]) = { + 0, 
16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, + 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, + 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, + 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, + 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, + 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123, + 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, + 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x16[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x8[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = { + 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, + 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, + 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, + 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, + 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226, + 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, + 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228, + 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, + 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230, + 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232, + 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233, + 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234, + 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235, + 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236, + 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237, + 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238, + 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239, + 254, 269, 284, 299, 314, 
329, 344, 359, 374, 389, 404, 419, 434, 449, 464, + 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465, + 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466, + 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467, + 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483, + 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335, + 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396, + 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, + 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444, + 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, + 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x16[512]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, + 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, + 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, + 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196, + 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104, + 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, + 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13, + 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14, + 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46, + 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78, + 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110, + 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142, + 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174, + 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206, + 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238, + 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270, + 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302, + 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334, + 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366, + 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398, + 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430, + 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462, + 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494, + 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29, + 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61, + 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, + 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, + 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126, + 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500, + 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408, + 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285, + 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411, + 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, + 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, + 479, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 
176, 192, 208, 224, + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, + 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, + 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, + 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, + 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, + 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131, + 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, + 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100, + 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, + 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69, + 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, + 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38, + 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, + 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7, + 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, + 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, + 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, + 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, + 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, + 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, + 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, + 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123, + 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, + 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92, + 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, + 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61, + 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, + 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30, + 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, + 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, + 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, + 495, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x16[512]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, + 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, + 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, + 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, + 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, + 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, + 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, + 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, + 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, + 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 
492, + 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, + 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, + 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, + 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, + 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, + 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, + 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, + 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, + 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, + 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, + 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, + 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, + 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, + 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x32[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 
405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x16[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = { + 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, + 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, + 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8, + 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, + 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131, + 146, 161, 176, 
192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, + 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208, + 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14, + 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225, + 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46, + 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242, + 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94, + 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185, + 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231, + 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203, + 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, + 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, + 255 +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, + 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, + 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, + 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, + 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, + 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, + 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, + 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, + 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, + 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, + 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, + 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, + 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 
320, 352, 384, 416, + 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, + 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, + 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737, + 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162, + 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610, + 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35, + 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931, + 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, + 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804, + 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229, + 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677, + 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102, + 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550, + 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, + 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871, + 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, + 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744, + 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169, + 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617, + 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42, + 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938, + 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, + 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811, + 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236, + 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684, + 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109, + 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557, + 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, + 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878, + 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, + 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751, + 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176, + 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624, + 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49, + 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945, + 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, + 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818, + 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243, + 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691, + 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116, + 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564, + 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, + 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885, + 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 
310, + 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758, + 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183, + 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631, + 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56, + 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952, + 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, + 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825, + 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250, + 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698, + 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123, + 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571, + 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, + 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892, + 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, + 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765, + 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190, + 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638, + 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63, + 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, + 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959, + 991, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, + 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, + 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, + 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, + 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, + 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, + 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, + 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, + 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, + 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, + 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, + 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, + 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, + 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, + 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, + 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, + 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 
389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, + 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, + 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, + 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, + 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, + 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, + 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, + 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, + 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, + 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, + 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, + 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, + 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, + 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, + 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, + 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, + 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, + 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, + 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, + 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, + 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, + 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, + 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, + 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, + 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, + 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, + 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, + 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, + 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, + 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, + 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, + 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, + 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, + 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, + 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, + 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, + 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, + 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, + 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, + 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, + 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, + 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, + 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, + 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, + 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, + 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, + 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = { + 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66, + 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68, + 37, 6, 7, 38, 69, 100, 131, 162, 193, 224, 256, 225, 194, + 163, 
132, 101, 70, 39, 8, 9, 40, 71, 102, 133, 164, 195, + 226, 257, 288, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41, + 10, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352, + 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12, + 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385, + 416, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, + 76, 45, 14, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294, + 325, 356, 387, 418, 449, 480, 512, 481, 450, 419, 388, 357, 326, + 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 17, 48, 79, + 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482, + 513, 544, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266, + 235, 204, 173, 142, 111, 80, 49, 18, 19, 50, 81, 112, 143, + 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546, + 577, 608, 640, 609, 578, 547, 516, 485, 454, 423, 392, 361, 330, + 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 21, 52, 83, + 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486, + 517, 548, 579, 610, 641, 672, 704, 673, 642, 611, 580, 549, 518, + 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115, + 84, 53, 22, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302, + 333, 364, 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705, + 736, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427, + 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, + 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397, + 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800, + 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460, + 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, + 26, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368, + 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771, + 802, 833, 864, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617, + 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214, + 183, 152, 121, 90, 59, 28, 29, 60, 91, 122, 153, 184, 215, + 246, 277, 308, 339, 370, 401, 432, 463, 494, 525, 556, 587, 618, + 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 960, 929, 898, + 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495, + 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, + 61, 30, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, + 372, 403, 434, 465, 496, 527, 558, 589, 620, 651, 682, 713, 744, + 775, 806, 837, 868, 899, 930, 961, 992, 993, 962, 931, 900, 869, + 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466, + 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63, + 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467, + 498, 529, 560, 591, 622, 653, 684, 715, 746, 777, 808, 839, 870, + 901, 932, 963, 994, 995, 964, 933, 902, 871, 840, 809, 778, 747, + 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, 375, 344, + 313, 282, 251, 220, 189, 158, 127, 159, 190, 221, 252, 283, 314, + 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717, + 748, 779, 810, 841, 872, 903, 934, 965, 996, 997, 966, 935, 904, + 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, 501, + 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 223, 254, 285, + 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688, + 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 999, 968, 937, + 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, 534, + 503, 472, 441, 410, 379, 348, 317, 286, 255, 287, 318, 349, 380, + 411, 442, 473, 504, 535, 566, 597, 628, 659, 690, 721, 752, 783, + 814, 845, 876, 
907, 938, 969, 1000, 1001, 970, 939, 908, 877, 846, + 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, 474, 443, + 412, 381, 350, 319, 351, 382, 413, 444, 475, 506, 537, 568, 599, + 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002, + 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631, + 600, 569, 538, 507, 476, 445, 414, 383, 415, 446, 477, 508, 539, + 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942, + 973, 1004, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695, + 664, 633, 602, 571, 540, 509, 478, 447, 479, 510, 541, 572, 603, + 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 975, 1006, + 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, 666, 635, + 604, 573, 542, 511, 543, 574, 605, 636, 667, 698, 729, 760, 791, + 822, 853, 884, 915, 946, 977, 1008, 1009, 978, 947, 916, 885, 854, + 823, 792, 761, 730, 699, 668, 637, 606, 575, 607, 638, 669, 700, + 731, 762, 793, 824, 855, 886, 917, 948, 979, 1010, 1011, 980, 949, + 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 671, 702, 733, + 764, 795, 826, 857, 888, 919, 950, 981, 1012, 1013, 982, 951, 920, + 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890, + 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767, + 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893, + 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895, + 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023 +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_default_iscan_4x4[16]) = { 0, 1, 5, 6, 2, 4, 7, 12, + 3, 8, 11, 13, 9, 10, 14, 15 }; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = { + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x4[16]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = { + 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18, + 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 29, 25, 28, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x8[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = { + 0, 2, 5, 9, 13, 17, 21, 25, 1, 4, 8, 12, 16, 20, 24, 28, + 3, 7, 11, 15, 19, 23, 27, 30, 6, 10, 14, 18, 22, 26, 29, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = { + 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29, + 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x4[32]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x16[64]) = { + 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18, + 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 30, 25, 28, 31, 34, + 29, 32, 35, 38, 33, 36, 39, 42, 37, 40, 43, 46, 41, 44, 47, 50, + 45, 48, 51, 54, 49, 52, 55, 58, 53, 56, 59, 61, 57, 60, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x4[64]) = { + 0, 2, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, + 1, 4, 
8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 62, + 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 61, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x16[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x4[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x16[64]) = { + 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, + 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, + 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, + 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x4[64]) = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, + 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, + 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = { + 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, + 36, 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, + 45, 52, 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, + 54, 61, 68, 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, + 63, 70, 77, 84, 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, + 72, 79, 86, 93, 100, 59, 66, 73, 80, 87, 94, 101, 108, 67, 74, + 81, 88, 95, 102, 109, 116, 75, 82, 89, 96, 103, 110, 117, 124, 83, + 90, 97, 104, 111, 118, 125, 132, 91, 98, 105, 112, 119, 126, 133, 140, + 99, 106, 113, 120, 127, 134, 141, 148, 107, 114, 121, 128, 135, 142, 149, + 156, 115, 122, 129, 136, 143, 150, 157, 164, 123, 130, 137, 144, 151, 158, + 165, 172, 131, 138, 145, 152, 159, 166, 173, 180, 139, 146, 153, 160, 167, + 174, 181, 188, 147, 154, 161, 168, 175, 182, 189, 196, 155, 162, 169, 176, + 183, 190, 197, 204, 163, 170, 177, 184, 191, 198, 205, 212, 171, 178, 185, + 192, 199, 206, 213, 220, 179, 186, 193, 200, 207, 214, 221, 228, 187, 194, + 201, 208, 215, 222, 229, 235, 195, 202, 209, 216, 223, 230, 236, 241, 203, + 210, 217, 224, 231, 237, 242, 246, 211, 218, 225, 232, 238, 243, 247, 250, + 219, 226, 233, 239, 244, 248, 251, 253, 227, 234, 240, 245, 249, 252, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x8[256]) = { + 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, + 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211, + 219, 227, 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, + 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, + 201, 209, 217, 225, 233, 240, 6, 11, 17, 24, 32, 40, 48, 56, 64, + 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 239, 245, 10, 16, 23, 31, 39, 47, 55, + 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 
151, 159, 167, 175, + 183, 191, 199, 207, 215, 223, 231, 238, 244, 249, 15, 22, 30, 38, 46, + 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166, + 174, 182, 190, 198, 206, 214, 222, 230, 237, 243, 248, 252, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 236, 242, 247, 251, 254, 28, + 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, + 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 235, 241, 246, 250, 253, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x32[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x8[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x32[256]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225, + 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227, + 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229, + 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231, + 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233, + 
10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235, + 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237, + 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239, + 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241, + 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243, + 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245, + 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247, + 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249, + 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251, + 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253, + 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x8[256]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, + 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, + 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, + 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217, + 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, + 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202, + 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67, + 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, + 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52, + 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172, + 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37, + 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157, + 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22, + 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, + 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7, + 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, + 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x8[64]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, + 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, + 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, + 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x8[64]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x8[64]) = { + 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42, + 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53, + 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60, + 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63 +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = { + 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, 36, + 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, 45, 52, + 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, 54, 61, 68, + 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, 63, 70, 77, 84, + 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, 72, 79, 86, 93, 100, + 59, 66, 73, 80, 87, 94, 101, 107, 
67, 74, 81, 88, 95, 102, 108, 113, + 75, 82, 89, 96, 103, 109, 114, 118, 83, 90, 97, 104, 110, 115, 119, 122, + 91, 98, 105, 111, 116, 120, 123, 125, 99, 106, 112, 117, 121, 124, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x8[128]) = { + 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, + 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, + 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 112, + 6, 11, 17, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 111, 117, + 10, 16, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 110, 116, 121, + 15, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 109, 115, 120, 124, + 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 108, 114, 119, 123, 126, + 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 107, 113, 118, 122, 125, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113, + 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115, + 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117, + 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119, + 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121, + 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123, + 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125, + 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x8[128]) = { + 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, + 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121, + 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122, + 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123, + 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, + 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, + 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, + 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x16[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x8[128]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = { + 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105, + 120, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92, 106, + 121, 136, 5, 8, 12, 17, 23, 
30, 38, 47, 57, 68, 80, 93, 107, + 122, 137, 152, 9, 13, 18, 24, 31, 39, 48, 58, 69, 81, 94, 108, + 123, 138, 153, 168, 14, 19, 25, 32, 40, 49, 59, 70, 82, 95, 109, + 124, 139, 154, 169, 184, 20, 26, 33, 41, 50, 60, 71, 83, 96, 110, + 125, 140, 155, 170, 185, 200, 27, 34, 42, 51, 61, 72, 84, 97, 111, + 126, 141, 156, 171, 186, 201, 216, 35, 43, 52, 62, 73, 85, 98, 112, + 127, 142, 157, 172, 187, 202, 217, 232, 44, 53, 63, 74, 86, 99, 113, + 128, 143, 158, 173, 188, 203, 218, 233, 248, 54, 64, 75, 87, 100, 114, + 129, 144, 159, 174, 189, 204, 219, 234, 249, 264, 65, 76, 88, 101, 115, + 130, 145, 160, 175, 190, 205, 220, 235, 250, 265, 280, 77, 89, 102, 116, + 131, 146, 161, 176, 191, 206, 221, 236, 251, 266, 281, 296, 90, 103, 117, + 132, 147, 162, 177, 192, 207, 222, 237, 252, 267, 282, 297, 312, 104, 118, + 133, 148, 163, 178, 193, 208, 223, 238, 253, 268, 283, 298, 313, 328, 119, + 134, 149, 164, 179, 194, 209, 224, 239, 254, 269, 284, 299, 314, 329, 344, + 135, 150, 165, 180, 195, 210, 225, 240, 255, 270, 285, 300, 315, 330, 345, + 360, 151, 166, 181, 196, 211, 226, 241, 256, 271, 286, 301, 316, 331, 346, + 361, 376, 167, 182, 197, 212, 227, 242, 257, 272, 287, 302, 317, 332, 347, + 362, 377, 392, 183, 198, 213, 228, 243, 258, 273, 288, 303, 318, 333, 348, + 363, 378, 393, 407, 199, 214, 229, 244, 259, 274, 289, 304, 319, 334, 349, + 364, 379, 394, 408, 421, 215, 230, 245, 260, 275, 290, 305, 320, 335, 350, + 365, 380, 395, 409, 422, 434, 231, 246, 261, 276, 291, 306, 321, 336, 351, + 366, 381, 396, 410, 423, 435, 446, 247, 262, 277, 292, 307, 322, 337, 352, + 367, 382, 397, 411, 424, 436, 447, 457, 263, 278, 293, 308, 323, 338, 353, + 368, 383, 398, 412, 425, 437, 448, 458, 467, 279, 294, 309, 324, 339, 354, + 369, 384, 399, 413, 426, 438, 449, 459, 468, 476, 295, 310, 325, 340, 355, + 370, 385, 400, 414, 427, 439, 450, 460, 469, 477, 484, 311, 326, 341, 356, + 371, 386, 401, 415, 428, 440, 451, 461, 470, 478, 485, 491, 327, 342, 357, + 372, 387, 402, 416, 429, 441, 452, 462, 471, 479, 486, 492, 497, 343, 358, + 373, 388, 403, 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 359, + 374, 389, 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506, + 375, 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507, + 509, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x16[512]) = { + 0, 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90, 104, 119, + 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359, + 375, 391, 1, 4, 8, 13, 19, 26, 34, 43, 53, 64, 76, 89, 103, + 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, + 358, 374, 390, 406, 3, 7, 12, 18, 25, 33, 42, 52, 63, 75, 88, + 102, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, + 341, 357, 373, 389, 405, 420, 6, 11, 17, 24, 32, 41, 51, 62, 74, + 87, 101, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, + 324, 340, 356, 372, 388, 404, 419, 433, 10, 16, 23, 31, 40, 50, 61, + 73, 86, 100, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, + 307, 323, 339, 355, 371, 387, 403, 418, 432, 445, 15, 22, 30, 39, 49, + 60, 72, 85, 99, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274, + 290, 306, 322, 338, 354, 370, 386, 402, 417, 431, 444, 456, 21, 29, 38, + 48, 59, 71, 84, 98, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257, + 273, 289, 305, 321, 337, 353, 369, 385, 401, 416, 430, 443, 455, 466, 28, + 37, 47, 58, 70, 83, 97, 
112, 128, 144, 160, 176, 192, 208, 224, 240, + 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 415, 429, 442, 454, 465, + 475, 36, 46, 57, 69, 82, 96, 111, 127, 143, 159, 175, 191, 207, 223, + 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 414, 428, 441, 453, + 464, 474, 483, 45, 56, 68, 81, 95, 110, 126, 142, 158, 174, 190, 206, + 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 413, 427, 440, + 452, 463, 473, 482, 490, 55, 67, 80, 94, 109, 125, 141, 157, 173, 189, + 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 412, 426, + 439, 451, 462, 472, 481, 489, 496, 66, 79, 93, 108, 124, 140, 156, 172, + 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 411, + 425, 438, 450, 461, 471, 480, 488, 495, 501, 78, 92, 107, 123, 139, 155, + 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395, + 410, 424, 437, 449, 460, 470, 479, 487, 494, 500, 505, 91, 106, 122, 138, + 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, + 394, 409, 423, 436, 448, 459, 469, 478, 486, 493, 499, 504, 508, 105, 121, + 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, + 377, 393, 408, 422, 435, 447, 458, 468, 477, 485, 492, 498, 503, 507, 510, + 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344, + 360, 376, 392, 407, 421, 434, 446, 457, 467, 476, 484, 491, 497, 502, 506, + 509, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, + 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481, + 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, + 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484, + 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485, + 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487, + 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488, + 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, + 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491, + 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492, + 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494, + 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495, + 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, + 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498, + 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499, + 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501, + 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502, + 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, + 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505, + 26, 58, 
90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506, + 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508, + 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509, + 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, + 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x16[512]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, + 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, + 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, + 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433, + 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, + 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402, + 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131, + 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, + 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100, + 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, + 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69, + 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, + 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38, + 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, + 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7, + 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487, + 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, + 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456, + 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, + 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425, + 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, + 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394, + 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123, + 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, + 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92, + 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, + 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61, + 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301, + 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30, + 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270, + 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, + 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479, + 495, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x32[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 
101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x16[512]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, + 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, + 285, 
286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, + 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, + 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, + 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, + 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, + 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, + 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, + 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, + 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, + 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, + 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, + 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, + 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, + 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, + 510, 511, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x16[256]) = { + 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, + 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241, + 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242, + 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243, + 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244, + 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, + 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, + 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247, + 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248, + 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249, + 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250, + 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251, + 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, + 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, + 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, + 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x16[256]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, + 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, + 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, + 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 
222, 223, 224, + 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x16[256]) = { + 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, 91, 119, + 120, 2, 4, 7, 13, 16, 26, 29, 43, 46, 64, 67, 89, 92, 118, + 121, 150, 3, 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117, + 122, 149, 151, 9, 11, 18, 24, 31, 41, 48, 62, 69, 87, 94, 116, + 123, 148, 152, 177, 10, 19, 23, 32, 40, 49, 61, 70, 86, 95, 115, + 124, 147, 153, 176, 178, 20, 22, 33, 39, 50, 60, 71, 85, 96, 114, + 125, 146, 154, 175, 179, 200, 21, 34, 38, 51, 59, 72, 84, 97, 113, + 126, 145, 155, 174, 180, 199, 201, 35, 37, 52, 58, 73, 83, 98, 112, + 127, 144, 156, 173, 181, 198, 202, 219, 36, 53, 57, 74, 82, 99, 111, + 128, 143, 157, 172, 182, 197, 203, 218, 220, 54, 56, 75, 81, 100, 110, + 129, 142, 158, 171, 183, 196, 204, 217, 221, 234, 55, 76, 80, 101, 109, + 130, 141, 159, 170, 184, 195, 205, 216, 222, 233, 235, 77, 79, 102, 108, + 131, 140, 160, 169, 185, 194, 206, 215, 223, 232, 236, 245, 78, 103, 107, + 132, 139, 161, 168, 186, 193, 207, 214, 224, 231, 237, 244, 246, 104, 106, + 133, 138, 162, 167, 187, 192, 208, 213, 225, 230, 238, 243, 247, 252, 105, + 134, 137, 163, 166, 188, 191, 209, 212, 226, 229, 239, 242, 248, 251, 253, + 135, 136, 164, 165, 189, 190, 210, 211, 227, 228, 240, 241, 249, 250, 254, + 255 +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x32[1024]) = { + 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, + 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, + 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, + 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737, + 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162, + 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610, + 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35, + 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483, + 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931, + 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, + 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804, + 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229, + 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677, + 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102, + 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550, + 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998, + 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, + 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871, + 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, + 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744, + 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169, + 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617, + 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42, + 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490, + 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938, + 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, + 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811, + 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236, + 268, 300, 332, 364, 396, 428, 460, 492, 
524, 556, 588, 620, 652, 684, + 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109, + 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557, + 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005, + 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, + 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878, + 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, + 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751, + 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176, + 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624, + 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49, + 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497, + 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945, + 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, + 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818, + 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243, + 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691, + 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116, + 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564, + 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012, + 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, + 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885, + 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, + 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758, + 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183, + 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631, + 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56, + 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504, + 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952, + 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, + 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825, + 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250, + 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698, + 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123, + 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571, + 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019, + 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, + 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892, + 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, + 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765, + 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190, + 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638, + 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63, + 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511, + 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959, + 991, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x32[1024]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, + 65, 66, 67, 68, 69, 70, 71, 72, 73, 
74, 75, 76, 77, + 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, + 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, + 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, + 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, + 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, + 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, + 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, + 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, + 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, + 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, + 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, + 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, + 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, + 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, + 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, + 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, + 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, + 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, + 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, + 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, + 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, + 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, + 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, + 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, + 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, + 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, + 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, + 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, + 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, + 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, + 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, + 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, + 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, + 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, + 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, + 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, + 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, + 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, + 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, + 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, + 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, + 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, + 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, + 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, + 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, + 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, + 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, + 767, 
768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, + 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, + 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, + 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, + 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, + 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, + 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, + 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, + 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, + 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, + 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, + 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, + 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, + 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, + 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, + 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, + 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, + 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, + 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, + 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = { + 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, + 91, 119, 120, 152, 153, 189, 190, 230, 231, 275, 276, 324, 325, + 377, 378, 434, 435, 495, 496, 2, 4, 7, 13, 16, 26, 29, + 43, 46, 64, 67, 89, 92, 118, 121, 151, 154, 188, 191, 229, + 232, 274, 277, 323, 326, 376, 379, 433, 436, 494, 497, 558, 3, + 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117, 122, + 150, 155, 187, 192, 228, 233, 273, 278, 322, 327, 375, 380, 432, + 437, 493, 498, 557, 559, 9, 11, 18, 24, 31, 41, 48, 62, + 69, 87, 94, 116, 123, 149, 156, 186, 193, 227, 234, 272, 279, + 321, 328, 374, 381, 431, 438, 492, 499, 556, 560, 617, 10, 19, + 23, 32, 40, 49, 61, 70, 86, 95, 115, 124, 148, 157, 185, + 194, 226, 235, 271, 280, 320, 329, 373, 382, 430, 439, 491, 500, + 555, 561, 616, 618, 20, 22, 33, 39, 50, 60, 71, 85, 96, + 114, 125, 147, 158, 184, 195, 225, 236, 270, 281, 319, 330, 372, + 383, 429, 440, 490, 501, 554, 562, 615, 619, 672, 21, 34, 38, + 51, 59, 72, 84, 97, 113, 126, 146, 159, 183, 196, 224, 237, + 269, 282, 318, 331, 371, 384, 428, 441, 489, 502, 553, 563, 614, + 620, 671, 673, 35, 37, 52, 58, 73, 83, 98, 112, 127, 145, + 160, 182, 197, 223, 238, 268, 283, 317, 332, 370, 385, 427, 442, + 488, 503, 552, 564, 613, 621, 670, 674, 723, 36, 53, 57, 74, + 82, 99, 111, 128, 144, 161, 181, 198, 222, 239, 267, 284, 316, + 333, 369, 386, 426, 443, 487, 504, 551, 565, 612, 622, 669, 675, + 722, 724, 54, 56, 75, 81, 100, 110, 129, 143, 162, 180, 199, + 221, 240, 266, 285, 315, 334, 368, 387, 425, 444, 486, 505, 550, + 566, 611, 623, 668, 676, 721, 725, 770, 55, 76, 80, 101, 109, + 130, 142, 163, 179, 200, 220, 241, 265, 286, 314, 335, 367, 388, + 424, 445, 485, 506, 549, 567, 610, 624, 667, 677, 720, 726, 769, + 771, 77, 79, 102, 108, 131, 141, 164, 178, 201, 219, 242, 264, + 287, 313, 336, 366, 389, 423, 446, 484, 507, 548, 568, 609, 625, + 666, 678, 719, 727, 768, 772, 813, 78, 103, 107, 132, 140, 165, + 177, 202, 218, 243, 263, 288, 312, 337, 365, 390, 422, 447, 483, + 508, 547, 569, 608, 626, 665, 679, 718, 728, 767, 773, 812, 814, + 104, 106, 133, 139, 166, 176, 203, 217, 244, 262, 289, 311, 338, + 364, 391, 421, 448, 482, 
509, 546, 570, 607, 627, 664, 680, 717, + 729, 766, 774, 811, 815, 852, 105, 134, 138, 167, 175, 204, 216, + 245, 261, 290, 310, 339, 363, 392, 420, 449, 481, 510, 545, 571, + 606, 628, 663, 681, 716, 730, 765, 775, 810, 816, 851, 853, 135, + 137, 168, 174, 205, 215, 246, 260, 291, 309, 340, 362, 393, 419, + 450, 480, 511, 544, 572, 605, 629, 662, 682, 715, 731, 764, 776, + 809, 817, 850, 854, 887, 136, 169, 173, 206, 214, 247, 259, 292, + 308, 341, 361, 394, 418, 451, 479, 512, 543, 573, 604, 630, 661, + 683, 714, 732, 763, 777, 808, 818, 849, 855, 886, 888, 170, 172, + 207, 213, 248, 258, 293, 307, 342, 360, 395, 417, 452, 478, 513, + 542, 574, 603, 631, 660, 684, 713, 733, 762, 778, 807, 819, 848, + 856, 885, 889, 918, 171, 208, 212, 249, 257, 294, 306, 343, 359, + 396, 416, 453, 477, 514, 541, 575, 602, 632, 659, 685, 712, 734, + 761, 779, 806, 820, 847, 857, 884, 890, 917, 919, 209, 211, 250, + 256, 295, 305, 344, 358, 397, 415, 454, 476, 515, 540, 576, 601, + 633, 658, 686, 711, 735, 760, 780, 805, 821, 846, 858, 883, 891, + 916, 920, 945, 210, 251, 255, 296, 304, 345, 357, 398, 414, 455, + 475, 516, 539, 577, 600, 634, 657, 687, 710, 736, 759, 781, 804, + 822, 845, 859, 882, 892, 915, 921, 944, 946, 252, 254, 297, 303, + 346, 356, 399, 413, 456, 474, 517, 538, 578, 599, 635, 656, 688, + 709, 737, 758, 782, 803, 823, 844, 860, 881, 893, 914, 922, 943, + 947, 968, 253, 298, 302, 347, 355, 400, 412, 457, 473, 518, 537, + 579, 598, 636, 655, 689, 708, 738, 757, 783, 802, 824, 843, 861, + 880, 894, 913, 923, 942, 948, 967, 969, 299, 301, 348, 354, 401, + 411, 458, 472, 519, 536, 580, 597, 637, 654, 690, 707, 739, 756, + 784, 801, 825, 842, 862, 879, 895, 912, 924, 941, 949, 966, 970, + 987, 300, 349, 353, 402, 410, 459, 471, 520, 535, 581, 596, 638, + 653, 691, 706, 740, 755, 785, 800, 826, 841, 863, 878, 896, 911, + 925, 940, 950, 965, 971, 986, 988, 350, 352, 403, 409, 460, 470, + 521, 534, 582, 595, 639, 652, 692, 705, 741, 754, 786, 799, 827, + 840, 864, 877, 897, 910, 926, 939, 951, 964, 972, 985, 989, 1002, + 351, 404, 408, 461, 469, 522, 533, 583, 594, 640, 651, 693, 704, + 742, 753, 787, 798, 828, 839, 865, 876, 898, 909, 927, 938, 952, + 963, 973, 984, 990, 1001, 1003, 405, 407, 462, 468, 523, 532, 584, + 593, 641, 650, 694, 703, 743, 752, 788, 797, 829, 838, 866, 875, + 899, 908, 928, 937, 953, 962, 974, 983, 991, 1000, 1004, 1013, 406, + 463, 467, 524, 531, 585, 592, 642, 649, 695, 702, 744, 751, 789, + 796, 830, 837, 867, 874, 900, 907, 929, 936, 954, 961, 975, 982, + 992, 999, 1005, 1012, 1014, 464, 466, 525, 530, 586, 591, 643, 648, + 696, 701, 745, 750, 790, 795, 831, 836, 868, 873, 901, 906, 930, + 935, 955, 960, 976, 981, 993, 998, 1006, 1011, 1015, 1020, 465, 526, + 529, 587, 590, 644, 647, 697, 700, 746, 749, 791, 794, 832, 835, + 869, 872, 902, 905, 931, 934, 956, 959, 977, 980, 994, 997, 1007, + 1010, 1016, 1019, 1021, 527, 528, 588, 589, 645, 646, 698, 699, 747, + 748, 792, 793, 833, 834, 870, 871, 903, 904, 932, 933, 957, 958, + 978, 979, 995, 996, 1008, 1009, 1017, 1018, 1022, 1023 +}; + +const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = { + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. 
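+ // More precisely, only the top-left 32x32 quadrant of a 64x64
+ // transform can hold nonzero coefficients, so the 1024-entry 32x32
+ // tables reach every coded position. In each SCAN_ORDER pair, scan[]
+ // maps a scan position to the coefficient's raster index within the
+ // transform block, and iscan[] is the inverse permutation, i.e.
+ // iscan[scan[i]] == i for every i. A minimal sketch of a coefficient
+ // pass in scan order (process_coeff() is a hypothetical helper, not
+ // part of this file):
+ //   const SCAN_ORDER *so = &av1_default_scan_orders[tx_size];
+ //   for (int i = 0; i < n_coeffs; ++i) process_coeff(qcoeff[so->scan[i]]);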
+ { default_scan_32x32, av1_default_iscan_32x32 }, +}; + +const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = { + { + // TX_4X4 + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { default_scan_4x4, av1_default_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + { mrow_scan_4x4, av1_mrow_iscan_4x4 }, + { mcol_scan_4x4, av1_mcol_iscan_4x4 }, + }, + { + // TX_8X8 + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { default_scan_8x8, av1_default_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + { mrow_scan_8x8, av1_mrow_iscan_8x8 }, + { mcol_scan_8x8, av1_mcol_iscan_8x8 }, + }, + { + // TX_16X16 + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { default_scan_16x16, av1_default_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + { mrow_scan_16x16, av1_mrow_iscan_16x16 }, + { mcol_scan_16x16, av1_mcol_iscan_16x16 }, + }, + { + // TX_32X32 + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_64X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. 
+ { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_4X8 + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { default_scan_4x8, av1_default_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + { mrow_scan_4x8, av1_mrow_iscan_4x8 }, + { mcol_scan_4x8, av1_mcol_iscan_4x8 }, + }, + { + // TX_8X4 + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { default_scan_8x4, av1_default_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + { mrow_scan_8x4, av1_mrow_iscan_8x4 }, + { mcol_scan_8x4, av1_mcol_iscan_8x4 }, + }, + { + // TX_8X16 + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { default_scan_8x16, av1_default_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + { mrow_scan_8x16, av1_mrow_iscan_8x16 }, + { mcol_scan_8x16, av1_mcol_iscan_8x16 }, + }, + { + // TX_16X8 + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { default_scan_16x8, av1_default_iscan_16x8 }, + { 
mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + { mrow_scan_16x8, av1_mrow_iscan_16x8 }, + { mcol_scan_16x8, av1_mcol_iscan_16x8 }, + }, + { + // TX_16X32 + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + }, + { + // TX_32X16 + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + }, + { + // TX_32X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_64X32 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. 
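+    // For the rectangular tx64 sizes, the 64-point dimension is truncated
+    // to 32, leaving a square 32x32 region of retained coefficients.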
+ { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { default_scan_32x32, av1_default_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + { mrow_scan_32x32, av1_mrow_iscan_32x32 }, + { mcol_scan_32x32, av1_mcol_iscan_32x32 }, + }, + { + // TX_4X16 + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { default_scan_4x16, av1_default_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + { mrow_scan_4x16, av1_mrow_iscan_4x16 }, + { mcol_scan_4x16, av1_mcol_iscan_4x16 }, + }, + { + // TX_16X4 + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { default_scan_16x4, av1_default_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + { mrow_scan_16x4, av1_mrow_iscan_16x4 }, + { mcol_scan_16x4, av1_mcol_iscan_16x4 }, + }, + { + // TX_8X32 + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { default_scan_8x32, av1_default_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + { mrow_scan_8x32, av1_mrow_iscan_8x32 }, + { mcol_scan_8x32, av1_mcol_iscan_8x32 }, + }, + { + // TX_32X8 + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + { default_scan_32x8, av1_default_iscan_32x8 }, + 
{ default_scan_32x8, av1_default_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + { mrow_scan_32x8, av1_mrow_iscan_32x8 }, + { mcol_scan_32x8, av1_mcol_iscan_32x8 }, + }, + { + // TX_16X64 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { default_scan_16x32, av1_default_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + { mrow_scan_16x32, av1_mrow_iscan_16x32 }, + { mcol_scan_16x32, av1_mcol_iscan_16x32 }, + }, + { + // TX_64X16 + // Half of the coefficients of tx64 at higher frequencies are set to + // zeros. So tx32's scan order is used. + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { default_scan_32x16, av1_default_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + { mrow_scan_32x16, av1_mrow_iscan_32x16 }, + { mcol_scan_32x16, av1_mcol_iscan_32x16 }, + }, +}; diff --git a/libs/libaom/src/av1/common/scan.h b/libs/libaom/src/av1/common/scan.h new file mode 100644 index 000000000..d9620e1c5 --- /dev/null +++ b/libs/libaom/src/av1/common/scan.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_COMMON_SCAN_H_
+#define AOM_AV1_COMMON_SCAN_H_
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NEIGHBORS 2
+
+enum {
+  SCAN_MODE_ZIG_ZAG,
+  SCAN_MODE_COL_DIAG,
+  SCAN_MODE_ROW_DIAG,
+  SCAN_MODE_COL_1D,
+  SCAN_MODE_ROW_1D,
+  SCAN_MODES
+} UENUM1BYTE(SCAN_MODE);
+
+extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
+extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
+
+void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
+
+static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size,
+                                                 TX_TYPE tx_type) {
+  return &av1_scan_orders[tx_size][tx_type];
+}
+
+static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
+  return get_default_scan(tx_size, tx_type);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_SCAN_H_
diff --git a/libs/libaom/src/av1/common/seg_common.c b/libs/libaom/src/av1/common/seg_common.c
new file mode 100644
index 000000000..60b185161
--- /dev/null
+++ b/libs/libaom/src/av1/common/seg_common.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/blockd.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/quant_common.h"
+
+static const int seg_feature_data_signed[SEG_LVL_MAX] = {
+  1, 1, 1, 1, 1, 0, 0, 0
+};
+
+static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ,
+                                                       MAX_LOOP_FILTER,
+                                                       MAX_LOOP_FILTER,
+                                                       MAX_LOOP_FILTER,
+                                                       MAX_LOOP_FILTER,
+                                                       7,
+                                                       0,
+                                                       0 };
+
+// These functions provide access to new segment level features.
+// Eventually these functions may be "optimized out" but for the moment,
+// the coding mechanism is still subject to change so these provide a
+// convenient single point of change.
+
+void av1_clearall_segfeatures(struct segmentation *seg) {
+  av1_zero(seg->feature_data);
+  av1_zero(seg->feature_mask);
+}
+
+void av1_calculate_segdata(struct segmentation *seg) {
+  seg->segid_preskip = 0;
+  seg->last_active_segid = 0;
+  for (int i = 0; i < MAX_SEGMENTS; i++) {
+    for (int j = 0; j < SEG_LVL_MAX; j++) {
+      if (seg->feature_mask[i] & (1 << j)) {
+        seg->segid_preskip |= (j >= SEG_LVL_REF_FRAME);
+        seg->last_active_segid = i;
+      }
+    }
+  }
+}
+
+void av1_enable_segfeature(struct segmentation *seg, int segment_id,
+                           SEG_LVL_FEATURES feature_id) {
+  seg->feature_mask[segment_id] |= 1 << feature_id;
+}
+
+int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
+  return seg_feature_data_max[feature_id];
+}
+
+int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
+  return seg_feature_data_signed[feature_id];
+}
+
+// The 'seg_data' given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF).
+// Valid range for delta values is (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF).
+//
+// abs_delta = SEGMENT_DELTADATA (the data are deltas);
+// abs_delta = SEGMENT_ABSDATA (use the absolute values given).
+
+void av1_set_segdata(struct segmentation *seg, int segment_id,
+                     SEG_LVL_FEATURES feature_id, int seg_data) {
+  if (seg_data < 0) {
+    assert(seg_feature_data_signed[feature_id]);
+    assert(-seg_data <= seg_feature_data_max[feature_id]);
+  } else {
+    assert(seg_data <= seg_feature_data_max[feature_id]);
+  }
+
+  seg->feature_data[segment_id][feature_id] = seg_data;
+}
+
+// TBD? Functions to read and write segment data with range / validity checking
diff --git a/libs/libaom/src/av1/common/seg_common.h b/libs/libaom/src/av1/common/seg_common.h
new file mode 100644
index 000000000..aeb9c1768
--- /dev/null
+++ b/libs/libaom/src/av1/common/seg_common.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SEG_COMMON_H_
+#define AOM_AV1_COMMON_SEG_COMMON_H_
+
+#include "aom_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_SEGMENTS 8
+#define SEG_TREE_PROBS (MAX_SEGMENTS - 1)
+
+#define SEG_TEMPORAL_PRED_CTXS 3
+#define SPATIAL_PREDICTION_PROBS 3
+
+enum {
+  SEG_LVL_ALT_Q,       // Use alternate quantizer
+  SEG_LVL_ALT_LF_Y_V,  // Use alternate loop filter value on y plane vertical
+  SEG_LVL_ALT_LF_Y_H,  // Use alternate loop filter value on y plane horizontal
+  SEG_LVL_ALT_LF_U,    // Use alternate loop filter value on u plane
+  SEG_LVL_ALT_LF_V,    // Use alternate loop filter value on v plane
+  SEG_LVL_REF_FRAME,   // Optional segment reference frame
+  SEG_LVL_SKIP,        // Optional segment (0,0) + skip mode
+  SEG_LVL_GLOBALMV,
+  SEG_LVL_MAX
+} UENUM1BYTE(SEG_LVL_FEATURES);
+
+struct segmentation {
+  uint8_t enabled;
+  uint8_t update_map;
+  uint8_t update_data;
+  uint8_t temporal_update;
+
+  int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
+  unsigned int feature_mask[MAX_SEGMENTS];
+  int last_active_segid;  // The highest numbered segment id that has some
+                          // enabled feature.
+  uint8_t segid_preskip;  // Whether the segment id will be read before the
+                          // skip syntax element.
+                          // 1: the segment id will be read first.
+                          // 0: the skip syntax element will be read first.
+}; + +struct segmentation_probs { + aom_cdf_prob tree_cdf[CDF_SIZE(MAX_SEGMENTS)]; + aom_cdf_prob pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)]; + aom_cdf_prob spatial_pred_seg_cdf[SPATIAL_PREDICTION_PROBS] + [CDF_SIZE(MAX_SEGMENTS)]; +}; + +static INLINE int segfeature_active(const struct segmentation *seg, + int segment_id, + SEG_LVL_FEATURES feature_id) { + return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id)); +} + +static INLINE void segfeatures_copy(struct segmentation *dst, + const struct segmentation *src) { + int i, j; + for (i = 0; i < MAX_SEGMENTS; i++) { + dst->feature_mask[i] = src->feature_mask[i]; + for (j = 0; j < SEG_LVL_MAX; j++) { + dst->feature_data[i][j] = src->feature_data[i][j]; + } + } + dst->segid_preskip = src->segid_preskip; + dst->last_active_segid = src->last_active_segid; +} + +void av1_clearall_segfeatures(struct segmentation *seg); + +void av1_enable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); + +void av1_calculate_segdata(struct segmentation *seg); + +int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id); + +int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id); + +void av1_set_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id, int seg_data); + +static INLINE int get_segdata(const struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id) { + return seg->feature_data[segment_id][feature_id]; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_SEG_COMMON_H_ diff --git a/libs/libaom/src/av1/common/thread_common.c b/libs/libaom/src/av1/common/thread_common.c new file mode 100644 index 000000000..f3c8795f8 --- /dev/null +++ b/libs/libaom/src/av1/common/thread_common.c @@ -0,0 +1,930 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/entropymode.h" +#include "av1/common/thread_common.h" +#include "av1/common/reconinter.h" + +// Set up nsync by width. +static INLINE int get_sync_range(int width) { + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +} + +static INLINE int get_lr_sync_range(int width) { +#if 0 + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. 
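+  // This width-based table mirrors get_sync_range() above but is compiled
+  // out; the live branch below returns 1, so loop-restoration workers
+  // appear to synchronize after every unit of the row above.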
+ if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +#else + (void)width; + return 1; +#endif +} + +// Allocate memory for lf row synchronization +static void loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows, + int width, int num_workers) { + lf_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i, j; + + for (j = 0; j < MAX_MB_PLANE; j++) { + CHECK_MEM_ERROR(cm, lf_sync->mutex_[j], + aom_malloc(sizeof(*(lf_sync->mutex_[j])) * rows)); + if (lf_sync->mutex_[j]) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->mutex_[j][i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lf_sync->cond_[j], + aom_malloc(sizeof(*(lf_sync->cond_[j])) * rows)); + if (lf_sync->cond_[j]) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->cond_[j][i], NULL); + } + } + } + + CHECK_MEM_ERROR(cm, lf_sync->job_mutex, + aom_malloc(sizeof(*(lf_sync->job_mutex)))); + if (lf_sync->job_mutex) { + pthread_mutex_init(lf_sync->job_mutex, NULL); + } + } +#endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, lf_sync->lfdata, + aom_malloc(num_workers * sizeof(*(lf_sync->lfdata)))); + lf_sync->num_workers = num_workers; + + for (int j = 0; j < MAX_MB_PLANE; j++) { + CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col[j], + aom_malloc(sizeof(*(lf_sync->cur_sb_col[j])) * rows)); + } + CHECK_MEM_ERROR( + cm, lf_sync->job_queue, + aom_malloc(sizeof(*(lf_sync->job_queue)) * rows * MAX_MB_PLANE * 2)); + // Set up nsync. + lf_sync->sync_range = get_sync_range(width); +} + +// Deallocate lf synchronization related mutex and data +void av1_loop_filter_dealloc(AV1LfSync *lf_sync) { + if (lf_sync != NULL) { + int j; +#if CONFIG_MULTITHREAD + int i; + for (j = 0; j < MAX_MB_PLANE; j++) { + if (lf_sync->mutex_[j] != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->mutex_[j][i]); + } + aom_free(lf_sync->mutex_[j]); + } + if (lf_sync->cond_[j] != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->cond_[j][i]); + } + aom_free(lf_sync->cond_[j]); + } + } + if (lf_sync->job_mutex != NULL) { + pthread_mutex_destroy(lf_sync->job_mutex); + aom_free(lf_sync->job_mutex); + } +#endif // CONFIG_MULTITHREAD + aom_free(lf_sync->lfdata); + for (j = 0; j < MAX_MB_PLANE; j++) { + aom_free(lf_sync->cur_sb_col[j]); + } + + aom_free(lf_sync->job_queue); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. 
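+    // Zeroing also drops any stale pointers, so a later dealloc cannot
+    // double-free buffers that were already released here.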
+ av1_zero(*lf_sync); + } +} + +static void loop_filter_data_reset(LFWorkerData *lf_data, + YV12_BUFFER_CONFIG *frame_buffer, + struct AV1Common *cm, MACROBLOCKD *xd) { + struct macroblockd_plane *pd = xd->plane; + lf_data->frame_buffer = frame_buffer; + lf_data->cm = cm; + lf_data->xd = xd; + for (int i = 0; i < MAX_MB_PLANE; i++) { + memcpy(&lf_data->planes[i].dst, &pd[i].dst, sizeof(lf_data->planes[i].dst)); + lf_data->planes[i].subsampling_x = pd[i].subsampling_x; + lf_data->planes[i].subsampling_y = pd[i].subsampling_y; + } +} + +static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c, + int plane) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &lf_sync->mutex_[plane][r - 1]; + pthread_mutex_lock(mutex); + + while (c > lf_sync->cur_sb_col[plane][r - 1] - nsync) { + pthread_cond_wait(&lf_sync->cond_[plane][r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)lf_sync; + (void)r; + (void)c; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c, + const int sb_cols, int plane) { +#if CONFIG_MULTITHREAD + const int nsync = lf_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. + int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&lf_sync->mutex_[plane][r]); + + lf_sync->cur_sb_col[plane][r] = cur; + + pthread_cond_broadcast(&lf_sync->cond_[plane][r]); + pthread_mutex_unlock(&lf_sync->mutex_[plane][r]); + } +#else + (void)lf_sync; + (void)r; + (void)c; + (void)sb_cols; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start, + int stop, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + int plane_start, int plane_end) { + int mi_row, plane, dir; + AV1LfMTInfo *lf_job_queue = lf_sync->job_queue; + lf_sync->jobs_enqueued = 0; + lf_sync->jobs_dequeued = 0; + + for (dir = 0; dir < 2; dir++) { + for (plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; +#if CONFIG_LPF_MASK + int step = MAX_MIB_SIZE; + if (is_decoding) { + step = MI_SIZE_64X64; + } + for (mi_row = start; mi_row < stop; mi_row += step) +#else + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) +#endif + { + lf_job_queue->mi_row = mi_row; + lf_job_queue->plane = plane; + lf_job_queue->dir = dir; + lf_job_queue++; + lf_sync->jobs_enqueued++; + } + } + } +} + +static AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) { + AV1LfMTInfo *cur_job_info = NULL; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(lf_sync->job_mutex); + + if (lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) { + cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued; + lf_sync->jobs_dequeued++; + } + + pthread_mutex_unlock(lf_sync->job_mutex); +#else + (void)lf_sync; +#endif + + return cur_job_info; +} + +// Implement row loopfiltering for each thread. 
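+// Each worker repeatedly dequeues a {mi_row, plane, dir} job. Vertical-edge
+// jobs (dir == 0) publish per-superblock-column progress via sync_write();
+// horizontal-edge jobs (dir == 1) block in sync_read() until the vertical
+// pass on this row and the row above has advanced past the needed column,
+// preserving the top and top-right filtering dependencies.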
+static INLINE void thread_loop_filter_rows( + const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, + struct macroblockd_plane *planes, MACROBLOCKD *xd, + AV1LfSync *const lf_sync) { + const int sb_cols = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2) >> + MAX_MIB_SIZE_LOG2; + int mi_row, mi_col, plane, dir; + int r, c; + + while (1) { + AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync); + + if (cur_job_info != NULL) { + mi_row = cur_job_info->mi_row; + plane = cur_job_info->plane; + dir = cur_job_info->dir; + r = mi_row >> MAX_MIB_SIZE_LOG2; + + if (dir == 0) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MAX_MIB_SIZE) { + c = mi_col >> MAX_MIB_SIZE_LOG2; + + av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, + mi_row, mi_col, plane, plane + 1); + + av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row, + mi_col); + sync_write(lf_sync, r, c, sb_cols, plane); + } + } else if (dir == 1) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MAX_MIB_SIZE) { + c = mi_col >> MAX_MIB_SIZE_LOG2; + + // Wait for vertical edge filtering of the top-right block to be + // completed + sync_read(lf_sync, r, c, plane); + + // Wait for vertical edge filtering of the right block to be + // completed + sync_read(lf_sync, r + 1, c, plane); + + av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, + mi_row, mi_col, plane, plane + 1); + av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row, + mi_col); + } + } + } else { + break; + } + } +} + +// Row-based multi-threaded loopfilter hook +static int loop_filter_row_worker(void *arg1, void *arg2) { + AV1LfSync *const lf_sync = (AV1LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->xd, lf_sync); + return 1; +} + +#if CONFIG_LPF_MASK +static INLINE void thread_loop_filter_bitmask_rows( + const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, + struct macroblockd_plane *planes, MACROBLOCKD *xd, + AV1LfSync *const lf_sync) { + const int sb_cols = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MIN_MIB_SIZE_LOG2) >> + MIN_MIB_SIZE_LOG2; + int mi_row, mi_col, plane, dir; + int r, c; + (void)xd; + + while (1) { + AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync); + + if (cur_job_info != NULL) { + mi_row = cur_job_info->mi_row; + plane = cur_job_info->plane; + dir = cur_job_info->dir; + r = mi_row >> MIN_MIB_SIZE_LOG2; + + if (dir == 0) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MI_SIZE_64X64) { + c = mi_col >> MIN_MIB_SIZE_LOG2; + + av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row, + mi_col, plane, plane + 1); + + av1_filter_block_plane_bitmask_vert(cm, &planes[plane], plane, mi_row, + mi_col); + sync_write(lf_sync, r, c, sb_cols, plane); + } + } else if (dir == 1) { + for (mi_col = 0; mi_col < cm->mi_params.mi_cols; + mi_col += MI_SIZE_64X64) { + c = mi_col >> MIN_MIB_SIZE_LOG2; + + // Wait for vertical edge filtering of the top-right block to be + // completed + sync_read(lf_sync, r, c, plane); + + // Wait for vertical edge filtering of the right block to be + // completed + sync_read(lf_sync, r + 1, c, plane); + + av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row, + mi_col, plane, plane + 1); + av1_filter_block_plane_bitmask_horz(cm, &planes[plane], plane, mi_row, + mi_col); + } + } + } else { + break; + } + } +} + +// Row-based multi-threaded loopfilter hook +static int 
loop_filter_bitmask_row_worker(void *arg1, void *arg2) { + AV1LfSync *const lf_sync = (AV1LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; + thread_loop_filter_bitmask_rows(lf_data->frame_buffer, lf_data->cm, + lf_data->planes, lf_data->xd, lf_sync); + return 1; +} +#endif // CONFIG_LPF_MASK + +static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd, int start, int stop, + int plane_start, int plane_end, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + AVxWorker *workers, int nworkers, + AV1LfSync *lf_sync) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); +#if CONFIG_LPF_MASK + int sb_rows; + if (is_decoding) { + sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MIN_MIB_SIZE_LOG2) >> + MIN_MIB_SIZE_LOG2; + } else { + sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >> + MAX_MIB_SIZE_LOG2; + } +#else + // Number of superblock rows and cols + const int sb_rows = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >> + MAX_MIB_SIZE_LOG2; +#endif + const int num_workers = nworkers; + int i; + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + av1_loop_filter_dealloc(lf_sync); + loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + + // Initialize cur_sb_col to -1 for all SB rows. + for (i = 0; i < MAX_MB_PLANE; i++) { + memset(lf_sync->cur_sb_col[i], -1, + sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows); + } + + enqueue_lf_jobs(lf_sync, cm, start, stop, +#if CONFIG_LPF_MASK + is_decoding, +#endif + plane_start, plane_end); + + // Set up loopfilter thread data. + for (i = 0; i < num_workers; ++i) { + AVxWorker *const worker = &workers[i]; + LFWorkerData *const lf_data = &lf_sync->lfdata[i]; + +#if CONFIG_LPF_MASK + if (is_decoding) { + worker->hook = loop_filter_bitmask_row_worker; + } else { + worker->hook = loop_filter_row_worker; + } +#else + worker->hook = loop_filter_row_worker; +#endif + worker->data1 = lf_sync; + worker->data2 = lf_data; + + // Loopfilter data + loop_filter_data_reset(lf_data, frame, cm, xd); + + // Start loopfiltering + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait till all rows are finished + for (i = 0; i < num_workers; ++i) { + winterface->sync(&workers[i]); + } +} + +void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, + MACROBLOCKD *xd, int plane_start, int plane_end, + int partial_frame, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + AVxWorker *workers, int num_workers, + AV1LfSync *lf_sync) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + + start_mi_row = 0; + mi_rows_to_filter = cm->mi_params.mi_rows; + if (partial_frame && cm->mi_params.mi_rows > 8) { + start_mi_row = cm->mi_params.mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + av1_loop_filter_frame_init(cm, plane_start, plane_end); + +#if CONFIG_LPF_MASK + if (is_decoding) { + cm->is_decoding = is_decoding; + // TODO(chengchen): currently use one thread to build bitmasks for the + // frame. Make it support multi-thread later. 
+ for (int plane = plane_start; plane < plane_end; plane++) { + if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) + break; + else if (plane == 1 && !(cm->lf.filter_level_u)) + continue; + else if (plane == 2 && !(cm->lf.filter_level_v)) + continue; + + // TODO(chengchen): can we remove this? + struct macroblockd_plane *pd = xd->plane; + av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame, 0, 0, plane, + plane + 1); + + av1_build_bitmask_vert_info(cm, &pd[plane], plane); + av1_build_bitmask_horz_info(cm, &pd[plane], plane); + } + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, 1, workers, num_workers, lf_sync); + } else { + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, 0, workers, num_workers, lf_sync); + } +#else + loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start, + plane_end, workers, num_workers, lf_sync); +#endif +} + +static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) { +#if CONFIG_MULTITHREAD + AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; + const int nsync = loop_res_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &loop_res_sync->mutex_[plane][r - 1]; + pthread_mutex_lock(mutex); + + while (c > loop_res_sync->cur_sb_col[plane][r - 1] - nsync) { + pthread_cond_wait(&loop_res_sync->cond_[plane][r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)lr_sync; + (void)r; + (void)c; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void lr_sync_write(void *const lr_sync, int r, int c, + const int sb_cols, int plane) { +#if CONFIG_MULTITHREAD + AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync; + const int nsync = loop_res_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. 
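+  // When the last column of a row completes, cur is pushed past sb_cols (to
+  // sb_cols + nsync) so that every reader still waiting on this row is
+  // released regardless of its column.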
+ int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]); + + loop_res_sync->cur_sb_col[plane][r] = cur; + + pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]); + pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]); + } +#else + (void)lr_sync; + (void)r; + (void)c; + (void)sb_cols; + (void)plane; +#endif // CONFIG_MULTITHREAD +} + +// Allocate memory for loop restoration row synchronization +static void loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm, + int num_workers, int num_rows_lr, + int num_planes, int width) { + lr_sync->rows = num_rows_lr; + lr_sync->num_planes = num_planes; +#if CONFIG_MULTITHREAD + { + int i, j; + + for (j = 0; j < num_planes; j++) { + CHECK_MEM_ERROR(cm, lr_sync->mutex_[j], + aom_malloc(sizeof(*(lr_sync->mutex_[j])) * num_rows_lr)); + if (lr_sync->mutex_[j]) { + for (i = 0; i < num_rows_lr; ++i) { + pthread_mutex_init(&lr_sync->mutex_[j][i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lr_sync->cond_[j], + aom_malloc(sizeof(*(lr_sync->cond_[j])) * num_rows_lr)); + if (lr_sync->cond_[j]) { + for (i = 0; i < num_rows_lr; ++i) { + pthread_cond_init(&lr_sync->cond_[j][i], NULL); + } + } + } + + CHECK_MEM_ERROR(cm, lr_sync->job_mutex, + aom_malloc(sizeof(*(lr_sync->job_mutex)))); + if (lr_sync->job_mutex) { + pthread_mutex_init(lr_sync->job_mutex, NULL); + } + } +#endif // CONFIG_MULTITHREAD + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata, + aom_malloc(num_workers * sizeof(*(lr_sync->lrworkerdata)))); + + for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + if (worker_idx < num_workers - 1) { + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rst_tmpbuf, + (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE)); + CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rlbs, + aom_malloc(sizeof(RestorationLineBuffers))); + + } else { + lr_sync->lrworkerdata[worker_idx].rst_tmpbuf = cm->rst_tmpbuf; + lr_sync->lrworkerdata[worker_idx].rlbs = cm->rlbs; + } + } + + lr_sync->num_workers = num_workers; + + for (int j = 0; j < num_planes; j++) { + CHECK_MEM_ERROR( + cm, lr_sync->cur_sb_col[j], + aom_malloc(sizeof(*(lr_sync->cur_sb_col[j])) * num_rows_lr)); + } + CHECK_MEM_ERROR( + cm, lr_sync->job_queue, + aom_malloc(sizeof(*(lr_sync->job_queue)) * num_rows_lr * num_planes)); + // Set up nsync. 
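+  // (With get_lr_sync_range() as written, sync_range is currently always 1
+  // for loop restoration.)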
+ lr_sync->sync_range = get_lr_sync_range(width); +} + +// Deallocate loop restoration synchronization related mutex and data +void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers) { + if (lr_sync != NULL) { + int j; +#if CONFIG_MULTITHREAD + int i; + for (j = 0; j < MAX_MB_PLANE; j++) { + if (lr_sync->mutex_[j] != NULL) { + for (i = 0; i < lr_sync->rows; ++i) { + pthread_mutex_destroy(&lr_sync->mutex_[j][i]); + } + aom_free(lr_sync->mutex_[j]); + } + if (lr_sync->cond_[j] != NULL) { + for (i = 0; i < lr_sync->rows; ++i) { + pthread_cond_destroy(&lr_sync->cond_[j][i]); + } + aom_free(lr_sync->cond_[j]); + } + } + if (lr_sync->job_mutex != NULL) { + pthread_mutex_destroy(lr_sync->job_mutex); + aom_free(lr_sync->job_mutex); + } +#endif // CONFIG_MULTITHREAD + for (j = 0; j < MAX_MB_PLANE; j++) { + aom_free(lr_sync->cur_sb_col[j]); + } + + aom_free(lr_sync->job_queue); + + if (lr_sync->lrworkerdata) { + for (int worker_idx = 0; worker_idx < num_workers - 1; worker_idx++) { + LRWorkerData *const workerdata_data = + lr_sync->lrworkerdata + worker_idx; + + aom_free(workerdata_data->rst_tmpbuf); + aom_free(workerdata_data->rlbs); + } + aom_free(lr_sync->lrworkerdata); + } + + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + av1_zero(*lr_sync); + } +} + +static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt, + AV1_COMMON *cm) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + + const int num_planes = av1_num_planes(cm); + AV1LrMTInfo *lr_job_queue = lr_sync->job_queue; + int32_t lr_job_counter[2], num_even_lr_jobs = 0; + lr_sync->jobs_enqueued = 0; + lr_sync->jobs_dequeued = 0; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + num_even_lr_jobs = + num_even_lr_jobs + ((ctxt[plane].rsi->vert_units_per_tile + 1) >> 1); + } + lr_job_counter[0] = 0; + lr_job_counter[1] = num_even_lr_jobs; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + const int is_uv = plane > 0; + const int ss_y = is_uv && cm->seq_params.subsampling_y; + + AV1PixelRect tile_rect = ctxt[plane].tile_rect; + const int unit_size = ctxt[plane].rsi->restoration_unit_size; + + const int tile_h = tile_rect.bottom - tile_rect.top; + const int ext_size = unit_size * 3 / 2; + + int y0 = 0, i = 0; + while (y0 < tile_h) { + int remaining_h = tile_h - y0; + int h = (remaining_h < ext_size) ? 
remaining_h : unit_size; + + RestorationTileLimits limits; + limits.v_start = tile_rect.top + y0; + limits.v_end = tile_rect.top + y0 + h; + assert(limits.v_end <= tile_rect.bottom); + // Offset the tile upwards to align with the restoration processing stripe + const int voffset = RESTORATION_UNIT_OFFSET >> ss_y; + limits.v_start = AOMMAX(tile_rect.top, limits.v_start - voffset); + if (limits.v_end < tile_rect.bottom) limits.v_end -= voffset; + + assert(lr_job_counter[0] <= num_even_lr_jobs); + + lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i; + lr_job_queue[lr_job_counter[i & 1]].plane = plane; + lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start; + lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end; + lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1; + if ((i & 1) == 0) { + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = + limits.v_start + RESTORATION_BORDER; + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = + limits.v_end - RESTORATION_BORDER; + if (i == 0) { + assert(limits.v_start == tile_rect.top); + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = tile_rect.top; + } + if (i == (ctxt[plane].rsi->vert_units_per_tile - 1)) { + assert(limits.v_end == tile_rect.bottom); + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = tile_rect.bottom; + } + } else { + lr_job_queue[lr_job_counter[i & 1]].v_copy_start = + AOMMAX(limits.v_start - RESTORATION_BORDER, tile_rect.top); + lr_job_queue[lr_job_counter[i & 1]].v_copy_end = + AOMMIN(limits.v_end + RESTORATION_BORDER, tile_rect.bottom); + } + lr_job_counter[i & 1]++; + lr_sync->jobs_enqueued++; + + y0 += h; + ++i; + } + } +} + +static AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) { + AV1LrMTInfo *cur_job_info = NULL; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(lr_sync->job_mutex); + + if (lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) { + cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued; + lr_sync->jobs_dequeued++; + } + + pthread_mutex_unlock(lr_sync->job_mutex); +#else + (void)lr_sync; +#endif + + return cur_job_info; +} + +// Implement row loop restoration for each thread. +static int loop_restoration_row_worker(void *arg1, void *arg2) { + AV1LrSync *const lr_sync = (AV1LrSync *)arg1; + LRWorkerData *lrworkerdata = (LRWorkerData *)arg2; + AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt; + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + int lr_unit_row; + int plane; + const int tile_row = LR_TILE_ROW; + const int tile_col = LR_TILE_COL; + const int tile_cols = LR_TILE_COLS; + const int tile_idx = tile_col + tile_row * tile_cols; + typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend, + int vstart, int vend); + static const copy_fun copy_funs[3] = { aom_yv12_partial_coloc_copy_y, + aom_yv12_partial_coloc_copy_u, + aom_yv12_partial_coloc_copy_v }; + + while (1) { + AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync); + if (cur_job_info != NULL) { + RestorationTileLimits limits; + sync_read_fn_t on_sync_read; + sync_write_fn_t on_sync_write; + limits.v_start = cur_job_info->v_start; + limits.v_end = cur_job_info->v_end; + lr_unit_row = cur_job_info->lr_unit_row; + plane = cur_job_info->plane; + const int unit_idx0 = tile_idx * ctxt[plane].rsi->units_per_tile; + + // sync_mode == 1 implies only sync read is required in LR Multi-threading + // sync_mode == 0 implies only sync write is required. + on_sync_read = + cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy; + on_sync_write = cur_job_info->sync_mode == 0 ? 
lr_sync_write + : av1_lr_sync_write_dummy; + + av1_foreach_rest_unit_in_row( + &limits, &(ctxt[plane].tile_rect), lr_ctxt->on_rest_unit, lr_unit_row, + ctxt[plane].rsi->restoration_unit_size, unit_idx0, + ctxt[plane].rsi->horz_units_per_tile, + ctxt[plane].rsi->vert_units_per_tile, plane, &ctxt[plane], + lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read, + on_sync_write, lr_sync); + + copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left, + ctxt[plane].tile_rect.right, cur_job_info->v_copy_start, + cur_job_info->v_copy_end); + } else { + break; + } + } + return 1; +} + +static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt, + AVxWorker *workers, int nworkers, + AV1LrSync *lr_sync, AV1_COMMON *cm) { + FilterFrameCtxt *ctxt = lr_ctxt->ctxt; + + const int num_planes = av1_num_planes(cm); + + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int num_rows_lr = 0; + + for (int plane = 0; plane < num_planes; plane++) { + if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue; + + const AV1PixelRect tile_rect = ctxt[plane].tile_rect; + const int max_tile_h = tile_rect.bottom - tile_rect.top; + + const int unit_size = cm->rst_info[plane].restoration_unit_size; + + num_rows_lr = + AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h)); + } + + const int num_workers = nworkers; + int i; + assert(MAX_MB_PLANE == 3); + + if (!lr_sync->sync_range || num_rows_lr != lr_sync->rows || + num_workers > lr_sync->num_workers || num_planes != lr_sync->num_planes) { + av1_loop_restoration_dealloc(lr_sync, num_workers); + loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, num_planes, + cm->width); + } + + // Initialize cur_sb_col to -1 for all SB rows. + for (i = 0; i < num_planes; i++) { + memset(lr_sync->cur_sb_col[i], -1, + sizeof(*(lr_sync->cur_sb_col[i])) * num_rows_lr); + } + + enqueue_lr_jobs(lr_sync, lr_ctxt, cm); + + // Set up looprestoration thread data. + for (i = 0; i < num_workers; ++i) { + AVxWorker *const worker = &workers[i]; + lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt; + worker->hook = loop_restoration_row_worker; + worker->data1 = lr_sync; + worker->data2 = &lr_sync->lrworkerdata[i]; + + // Start loopfiltering + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait till all rows are finished + for (i = 0; i < num_workers; ++i) { + winterface->sync(&workers[i]); + } +} + +void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + AV1_COMMON *cm, int optimized_lr, + AVxWorker *workers, int num_workers, + AV1LrSync *lr_sync, void *lr_ctxt) { + assert(!cm->features.all_lossless); + + const int num_planes = av1_num_planes(cm); + + AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt; + + av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm, + optimized_lr, num_planes); + + foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync, + cm); +} diff --git a/libs/libaom/src/av1/common/thread_common.h b/libs/libaom/src/av1/common/thread_common.h new file mode 100644 index 000000000..7397f1c54 --- /dev/null +++ b/libs/libaom/src/av1/common/thread_common.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_ +#define AOM_AV1_COMMON_THREAD_COMMON_H_ + +#include "config/aom_config.h" + +#include "av1/common/av1_loopfilter.h" +#include "aom_util/aom_thread.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; + +typedef struct AV1LfMTInfo { + int mi_row; + int plane; + int dir; +} AV1LfMTInfo; + +// Loopfilter row synchronization +typedef struct AV1LfSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_[MAX_MB_PLANE]; + pthread_cond_t *cond_[MAX_MB_PLANE]; +#endif + // Allocate memory to store the loop-filtered superblock index in each row. + int *cur_sb_col[MAX_MB_PLANE]; + // The optimal sync_range for different resolution and platform should be + // determined by testing. Currently, it is chosen to be a power-of-2 number. + int sync_range; + int rows; + + // Row-based parallel loopfilter data + LFWorkerData *lfdata; + int num_workers; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex; +#endif + AV1LfMTInfo *job_queue; + int jobs_enqueued; + int jobs_dequeued; +} AV1LfSync; + +typedef struct AV1LrMTInfo { + int v_start; + int v_end; + int lr_unit_row; + int plane; + int sync_mode; + int v_copy_start; + int v_copy_end; +} AV1LrMTInfo; + +typedef struct LoopRestorationWorkerData { + int32_t *rst_tmpbuf; + void *rlbs; + void *lr_ctxt; +} LRWorkerData; + +// Looprestoration row synchronization +typedef struct AV1LrSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_[MAX_MB_PLANE]; + pthread_cond_t *cond_[MAX_MB_PLANE]; +#endif + // Allocate memory to store the loop-restoration block index in each row. + int *cur_sb_col[MAX_MB_PLANE]; + // The optimal sync_range for different resolution and platform should be + // determined by testing. Currently, it is chosen to be a power-of-2 number. + int sync_range; + int rows; + int num_planes; + + int num_workers; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex; +#endif + // Row-based parallel loopfilter data + LRWorkerData *lrworkerdata; + + AV1LrMTInfo *job_queue; + int jobs_enqueued; + int jobs_dequeued; +} AV1LrSync; + +// Deallocate loopfilter synchronization related mutex and data. +void av1_loop_filter_dealloc(AV1LfSync *lf_sync); + +void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, + struct macroblockd *xd, int plane_start, + int plane_end, int partial_frame, +#if CONFIG_LPF_MASK + int is_decoding, +#endif + AVxWorker *workers, int num_workers, + AV1LfSync *lf_sync); +void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + struct AV1Common *cm, + int optimized_lr, AVxWorker *workers, + int num_workers, AV1LrSync *lr_sync, + void *lr_ctxt); +void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_COMMON_THREAD_COMMON_H_ diff --git a/libs/libaom/src/av1/common/tile_common.c b/libs/libaom/src/av1/common/tile_common.c new file mode 100644 index 000000000..1b11bd760 --- /dev/null +++ b/libs/libaom/src/av1/common/tile_common.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/av1_common_int.h" +#include "av1/common/resize.h" +#include "av1/common/tile_common.h" +#include "aom_dsp/aom_dsp_common.h" + +void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) { + av1_tile_set_row(tile, cm, row); + av1_tile_set_col(tile, cm, col); +} + +// Find smallest k>=0 such that (blk_size << k) >= target +static int tile_log2(int blk_size, int target) { + int k; + for (k = 0; (blk_size << k) < target; k++) { + } + return k; +} + +void av1_get_tile_limits(AV1_COMMON *const cm) { + const SequenceHeader *const seq_params = &cm->seq_params; + CommonTileParams *const tiles = &cm->tiles; + const int mi_cols = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); + const int mi_rows = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2); + const int sb_cols = mi_cols >> seq_params->mib_size_log2; + const int sb_rows = mi_rows >> seq_params->mib_size_log2; + + const int sb_size_log2 = seq_params->mib_size_log2 + MI_SIZE_LOG2; + tiles->max_width_sb = MAX_TILE_WIDTH >> sb_size_log2; + const int max_tile_area_sb = MAX_TILE_AREA >> (2 * sb_size_log2); + + tiles->min_log2_cols = tile_log2(tiles->max_width_sb, sb_cols); + tiles->max_log2_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS)); + tiles->max_log2_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS)); + tiles->min_log2 = tile_log2(max_tile_area_sb, sb_cols * sb_rows); + tiles->min_log2 = AOMMAX(tiles->min_log2, tiles->min_log2_cols); +} + +void av1_calculate_tile_cols(const SequenceHeader *const seq_params, + int cm_mi_rows, int cm_mi_cols, + CommonTileParams *const tiles) { + int mi_cols = ALIGN_POWER_OF_TWO(cm_mi_cols, seq_params->mib_size_log2); + int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2); + int sb_cols = mi_cols >> seq_params->mib_size_log2; + int sb_rows = mi_rows >> seq_params->mib_size_log2; + int i; + + // This will be overridden if there is at least two columns of tiles + // (otherwise there is no inner tile width) + tiles->min_inner_width = -1; + + if (tiles->uniform_spacing) { + int start_sb; + int size_sb = ALIGN_POWER_OF_TWO(sb_cols, tiles->log2_cols); + size_sb >>= tiles->log2_cols; + assert(size_sb > 0); + for (i = 0, start_sb = 0; start_sb < sb_cols; i++) { + tiles->col_start_sb[i] = start_sb; + start_sb += size_sb; + } + tiles->cols = i; + tiles->col_start_sb[i] = sb_cols; + tiles->min_log2_rows = AOMMAX(tiles->min_log2 - tiles->log2_cols, 0); + tiles->max_height_sb = sb_rows >> tiles->min_log2_rows; + + tiles->width = size_sb << seq_params->mib_size_log2; + tiles->width = AOMMIN(tiles->width, cm_mi_cols); + if (tiles->cols > 1) { + tiles->min_inner_width = tiles->width; + } + } else { + int max_tile_area_sb = (sb_rows * sb_cols); + int widest_tile_sb = 1; + int narrowest_inner_tile_sb = 65536; + tiles->log2_cols = tile_log2(1, tiles->cols); + for (i = 0; i < tiles->cols; i++) { + int size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; + widest_tile_sb = AOMMAX(widest_tile_sb, size_sb); + // ignore the rightmost tile in 
frame for determining the narrowest + if (i < tiles->cols - 1) + narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb); + } + if (tiles->min_log2) { + max_tile_area_sb >>= (tiles->min_log2 + 1); + } + tiles->max_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1); + if (tiles->cols > 1) { + tiles->min_inner_width = narrowest_inner_tile_sb + << seq_params->mib_size_log2; + } + } +} + +void av1_calculate_tile_rows(const SequenceHeader *const seq_params, + int cm_mi_rows, CommonTileParams *const tiles) { + int mi_rows = ALIGN_POWER_OF_TWO(cm_mi_rows, seq_params->mib_size_log2); + int sb_rows = mi_rows >> seq_params->mib_size_log2; + int start_sb, size_sb, i; + + if (tiles->uniform_spacing) { + size_sb = ALIGN_POWER_OF_TWO(sb_rows, tiles->log2_rows); + size_sb >>= tiles->log2_rows; + assert(size_sb > 0); + for (i = 0, start_sb = 0; start_sb < sb_rows; i++) { + tiles->row_start_sb[i] = start_sb; + start_sb += size_sb; + } + tiles->rows = i; + tiles->row_start_sb[i] = sb_rows; + + tiles->height = size_sb << seq_params->mib_size_log2; + tiles->height = AOMMIN(tiles->height, cm_mi_rows); + } else { + tiles->log2_rows = tile_log2(1, tiles->rows); + } +} + +void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) { + assert(row < cm->tiles.rows); + int mi_row_start = cm->tiles.row_start_sb[row] + << cm->seq_params.mib_size_log2; + int mi_row_end = cm->tiles.row_start_sb[row + 1] + << cm->seq_params.mib_size_log2; + tile->tile_row = row; + tile->mi_row_start = mi_row_start; + tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows); + assert(tile->mi_row_end > tile->mi_row_start); +} + +void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) { + assert(col < cm->tiles.cols); + int mi_col_start = cm->tiles.col_start_sb[col] + << cm->seq_params.mib_size_log2; + int mi_col_end = cm->tiles.col_start_sb[col + 1] + << cm->seq_params.mib_size_log2; + tile->tile_col = col; + tile->mi_col_start = mi_col_start; + tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols); + assert(tile->mi_col_end > tile->mi_col_start); +} + +int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) { + int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO( + tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + + return sb_rows; +} + +int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) { + int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO( + tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2); + int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2; + + return sb_cols; +} + +AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, + int is_uv) { + AV1PixelRect r; + + // Calculate position in the Y plane + r.left = tile_info->mi_col_start * MI_SIZE; + r.right = tile_info->mi_col_end * MI_SIZE; + r.top = tile_info->mi_row_start * MI_SIZE; + r.bottom = tile_info->mi_row_end * MI_SIZE; + + // If upscaling is enabled, the tile limits need scaling to match the + // upscaled frame where the restoration units live. To do this, scale up the + // top-left and bottom-right of the tile. 
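+  //
+  // For example (illustrative note, not in the upstream source; it assumes
+  // SCALE_NUMERATOR is 8, as defined in av1/common/resize.h): with
+  // superres_scale_denominator = 16 the coded frame is half the output
+  // width, so a tile spanning x = [0, 640) in coded coordinates maps to
+  // roughly x = [0, 1280) after this adjustment.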
+ if (av1_superres_scaled(cm)) { + av1_calculate_unscaled_superres_size(&r.left, &r.top, + cm->superres_scale_denominator); + av1_calculate_unscaled_superres_size(&r.right, &r.bottom, + cm->superres_scale_denominator); + } + + const int frame_w = cm->superres_upscaled_width; + const int frame_h = cm->superres_upscaled_height; + + // Make sure we don't fall off the bottom-right of the frame. + r.right = AOMMIN(r.right, frame_w); + r.bottom = AOMMIN(r.bottom, frame_h); + + // Convert to coordinates in the appropriate plane + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_y = is_uv && cm->seq_params.subsampling_y; + + r.left = ROUND_POWER_OF_TWO(r.left, ss_x); + r.right = ROUND_POWER_OF_TWO(r.right, ss_x); + r.top = ROUND_POWER_OF_TWO(r.top, ss_y); + r.bottom = ROUND_POWER_OF_TWO(r.bottom, ss_y); + + return r; +} + +void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { + const CommonTileParams *const tiles = &cm->tiles; + if (tiles->uniform_spacing) { + *w = tiles->width; + *h = tiles->height; + } else { + for (int i = 0; i < tiles->cols; ++i) { + const int tile_width_sb = + tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; + const int tile_w = tile_width_sb * cm->seq_params.mib_size; + assert(i == 0 || tile_w == *w); // ensure all tiles have same dimension + *w = tile_w; + } + + for (int i = 0; i < tiles->rows; ++i) { + const int tile_height_sb = + tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; + const int tile_h = tile_height_sb * cm->seq_params.mib_size; + assert(i == 0 || tile_h == *h); // ensure all tiles have same dimension + *h = tile_h; + } + } +} + +int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) { + // Disable check if there is a single tile col in the frame + if (cm->tiles.cols == 1) return 1; + + return ((cm->tiles.min_inner_width << MI_SIZE_LOG2) >= + (64 << av1_superres_scaled(cm))); +} diff --git a/libs/libaom/src/av1/common/tile_common.h b/libs/libaom/src/av1/common/tile_common.h new file mode 100644 index 000000000..ca7c5f496 --- /dev/null +++ b/libs/libaom/src/av1/common/tile_common.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_COMMON_TILE_COMMON_H_
+#define AOM_AV1_COMMON_TILE_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+struct AV1Common;
+struct SequenceHeader;
+struct CommonTileParams;
+
+#define DEFAULT_MAX_NUM_TG 1
+
+typedef struct TileInfo {
+  int mi_row_start, mi_row_end;
+  int mi_col_start, mi_col_end;
+  int tile_row;
+  int tile_col;
+} TileInfo;
+
+// initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on
+// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)'
+void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row,
+                   int col);
+
+void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
+void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
+
+int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile);
+int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile);
+
+typedef struct {
+  int left, top, right, bottom;
+} AV1PixelRect;
+
+// Return the pixel extents of the given tile
+AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info,
+                               const struct AV1Common *cm, int is_uv);
+
+// Define tile maximum width and area
+// There is no maximum height since height is limited by area and width limits
+// The minimum tile width or height is fixed at one superblock
+#define MAX_TILE_WIDTH (4096)        // Max Tile width in pixels
+#define MAX_TILE_AREA (4096 * 2304)  // Maximum tile area in pixels
+
+void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h);
+void av1_get_tile_limits(struct AV1Common *const cm);
+void av1_calculate_tile_cols(const struct SequenceHeader *const seq_params,
+                             int cm_mi_rows, int cm_mi_cols,
+                             struct CommonTileParams *const tiles);
+void av1_calculate_tile_rows(const struct SequenceHeader *const seq_params,
+                             int cm_mi_rows,
+                             struct CommonTileParams *const tiles);
+
+// Checks if the minimum tile_width requirement is satisfied
+int av1_is_min_tile_width_satisfied(const struct AV1Common *cm);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_TILE_COMMON_H_
diff --git a/libs/libaom/src/av1/common/timing.c b/libs/libaom/src/av1/common/timing.c
new file mode 100644
index 000000000..a959cdf76
--- /dev/null
+++ b/libs/libaom/src/av1/common/timing.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/timing.h"
+
+/* Tables for AV1 max bitrates for different levels of main and high tier.
+ * The tables are given in kbps here, whereas the specification expresses
+ * them in Mbps. Note that depending on the profile, a multiplier is needed.
+ */
+#define UNDEFINED_RATE \
+  (1 << 21)  // Placeholder rate for levels with undefined rate
+#define INVALID_RATE \
+  (0)  // For invalid profile-level configuration, set rate to 0
+
+/* Max Bitrates for levels of Main Tier in kbps. Bitrate in main_kbps[31] */
+/* is a dummy value. The decoder model is not applicable for level 31.
*/ +static int32_t main_kbps[1 << LEVEL_BITS] = { + 1500, 3000, UNDEFINED_RATE, UNDEFINED_RATE, + 6000, 10000, UNDEFINED_RATE, UNDEFINED_RATE, + 12000, 20000, UNDEFINED_RATE, UNDEFINED_RATE, + 30000, 40000, 60000, 60000, + 60000, 100000, 160000, 160000, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE +}; + +/* Max Bitrates for levels of High Tier in kbps. Bitrate in high_kbps [31] */ +/* is a dummy value. The decoder model is not applicable for level 31. */ +static int32_t high_kbps[1 << LEVEL_BITS] = { + INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, + INVALID_RATE, INVALID_RATE, INVALID_RATE, INVALID_RATE, + 30000, 50000, UNDEFINED_RATE, UNDEFINED_RATE, + 100000, 160000, 240000, 240000, + 240000, 480000, 800000, 800000, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, + UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE, UNDEFINED_RATE +}; + +/* BitrateProfileFactor */ +static int bitrate_profile_factor[1 << PROFILE_BITS] = { + 1, 2, 3, 0, 0, 0, 0, 0 +}; + +int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier) { + int64_t bitrate; + + if (seq_tier) { + bitrate = high_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile]; + } else { + bitrate = main_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile]; + } + + return bitrate * 1000; +} + +void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) { + decoder_model->encoder_decoder_buffer_delay_length = 16; + decoder_model->buffer_removal_time_length = 10; + decoder_model->frame_presentation_time_length = 10; +} + +void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) { + op_params->decoder_model_param_present_flag = 1; + op_params->decoder_buffer_delay = 90000 >> 1; // 0.5 s + op_params->encoder_buffer_delay = 90000 >> 1; // 0.5 s + op_params->low_delay_mode_flag = 0; + op_params->display_model_param_present_flag = 1; + op_params->initial_display_delay = 8; // 8 frames delay +} + +void av1_set_resource_availability_parameters( + aom_dec_model_op_parameters_t *op_params) { + op_params->decoder_model_param_present_flag = 0; + op_params->decoder_buffer_delay = + 70000; // Resource availability mode default + op_params->encoder_buffer_delay = + 20000; // Resource availability mode default + op_params->low_delay_mode_flag = 0; // Resource availability mode default + op_params->display_model_param_present_flag = 1; + op_params->initial_display_delay = 8; // 8 frames delay +} diff --git a/libs/libaom/src/av1/common/timing.h b/libs/libaom/src/av1/common/timing.h new file mode 100644 index 000000000..9192124f7 --- /dev/null +++ b/libs/libaom/src/av1/common/timing.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_TIMING_H_ +#define AOM_AV1_COMMON_TIMING_H_ + +#include "aom/aom_integer.h" +#include "av1/common/enums.h" + +#define MAX_NUM_OP_POINTS 32 + +typedef struct aom_timing { + uint32_t num_units_in_display_tick; + uint32_t time_scale; + int equal_picture_interval; + uint32_t num_ticks_per_picture; +} aom_timing_info_t; + +typedef struct aom_dec_model_info { + uint32_t num_units_in_decoding_tick; + int encoder_decoder_buffer_delay_length; + int buffer_removal_time_length; + int frame_presentation_time_length; +} aom_dec_model_info_t; + +typedef struct aom_dec_model_op_parameters { + int decoder_model_param_present_flag; + int64_t bitrate; + int64_t buffer_size; + uint32_t decoder_buffer_delay; + uint32_t encoder_buffer_delay; + int low_delay_mode_flag; + int display_model_param_present_flag; + int initial_display_delay; +} aom_dec_model_op_parameters_t; + +void av1_set_aom_dec_model_info(aom_dec_model_info_t *decoder_model); + +void av1_set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params); + +void av1_set_resource_availability_parameters( + aom_dec_model_op_parameters_t *op_params); + +int64_t av1_max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx, + int seq_tier); + +#endif // AOM_AV1_COMMON_TIMING_H_ diff --git a/libs/libaom/src/av1/common/token_cdfs.h b/libs/libaom/src/av1/common/token_cdfs.h new file mode 100644 index 000000000..f1edda58d --- /dev/null +++ b/libs/libaom/src/av1/common/token_cdfs.h @@ -0,0 +1,3555 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_ +#define AOM_AV1_COMMON_TOKEN_CDFS_H_ + +#include "config/aom_config.h" + +#include "av1/common/entropy.h" + +static const aom_cdf_prob + av1_default_dc_sign_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][DC_SIGN_CONTEXTS] + [CDF_SIZE(2)] = { + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + { { + { AOM_CDF2(128 * 125) }, + { AOM_CDF2(128 * 102) }, + { AOM_CDF2(128 * 147) }, + }, + { + { AOM_CDF2(128 * 119) }, + { AOM_CDF2(128 * 101) }, + { AOM_CDF2(128 * 135) }, + } }, + }; + +static const aom_cdf_prob + av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS] + [CDF_SIZE(2)] = { { { { AOM_CDF2(31849) }, + { AOM_CDF2(5892) }, + { AOM_CDF2(12112) }, + { AOM_CDF2(21935) }, + { AOM_CDF2(20289) }, + { AOM_CDF2(27473) }, + { AOM_CDF2(32487) }, + { AOM_CDF2(7654) }, + { AOM_CDF2(19473) }, + { AOM_CDF2(29984) }, + { AOM_CDF2(9961) }, + { AOM_CDF2(30242) }, + { AOM_CDF2(32117) } }, + { { AOM_CDF2(31548) }, + { AOM_CDF2(1549) }, + { AOM_CDF2(10130) }, + { AOM_CDF2(16656) }, + { AOM_CDF2(18591) }, + { AOM_CDF2(26308) }, + { AOM_CDF2(32537) }, + { AOM_CDF2(5403) }, + { AOM_CDF2(18096) }, + { AOM_CDF2(30003) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(29957) }, + { AOM_CDF2(5391) }, + { AOM_CDF2(18039) }, + { AOM_CDF2(23566) }, + { AOM_CDF2(22431) }, + { AOM_CDF2(25822) }, + { AOM_CDF2(32197) }, + { AOM_CDF2(3778) }, + { AOM_CDF2(15336) }, + { AOM_CDF2(28981) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(17920) }, + { AOM_CDF2(1818) }, + { AOM_CDF2(7282) }, + { AOM_CDF2(25273) }, + { AOM_CDF2(10923) }, + { AOM_CDF2(31554) }, + { AOM_CDF2(32624) }, + { AOM_CDF2(1366) }, + { AOM_CDF2(15628) }, + { AOM_CDF2(30462) }, + { AOM_CDF2(146) }, + { AOM_CDF2(5132) }, + { AOM_CDF2(31657) } }, + { { AOM_CDF2(6308) }, + { AOM_CDF2(117) }, + { AOM_CDF2(1638) }, + { AOM_CDF2(2161) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(10923) }, + { AOM_CDF2(30247) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(30371) }, + { AOM_CDF2(7570) }, + { AOM_CDF2(13155) }, + { AOM_CDF2(20751) }, + { AOM_CDF2(20969) }, + { AOM_CDF2(27067) }, + { AOM_CDF2(32013) }, + { AOM_CDF2(5495) }, + { AOM_CDF2(17942) }, + { AOM_CDF2(28280) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31782) }, + { AOM_CDF2(1836) }, + { AOM_CDF2(10689) }, + { AOM_CDF2(17604) }, + { AOM_CDF2(21622) }, + { AOM_CDF2(27518) }, + { AOM_CDF2(32399) }, + { AOM_CDF2(4419) }, + { AOM_CDF2(16294) }, + { AOM_CDF2(28345) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31901) }, + { AOM_CDF2(10311) }, + { AOM_CDF2(18047) }, + { AOM_CDF2(24806) }, + { AOM_CDF2(23288) }, + { AOM_CDF2(27914) }, + { AOM_CDF2(32296) }, + { AOM_CDF2(4215) }, + { AOM_CDF2(15756) }, + { AOM_CDF2(28341) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { 
AOM_CDF2(26726) }, + { AOM_CDF2(1045) }, + { AOM_CDF2(11703) }, + { AOM_CDF2(20590) }, + { AOM_CDF2(18554) }, + { AOM_CDF2(25970) }, + { AOM_CDF2(31938) }, + { AOM_CDF2(5583) }, + { AOM_CDF2(21313) }, + { AOM_CDF2(29390) }, + { AOM_CDF2(641) }, + { AOM_CDF2(22265) }, + { AOM_CDF2(31452) } }, + { { AOM_CDF2(26584) }, + { AOM_CDF2(188) }, + { AOM_CDF2(8847) }, + { AOM_CDF2(24519) }, + { AOM_CDF2(22938) }, + { AOM_CDF2(30583) }, + { AOM_CDF2(32608) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(29614) }, + { AOM_CDF2(9068) }, + { AOM_CDF2(12924) }, + { AOM_CDF2(19538) }, + { AOM_CDF2(17737) }, + { AOM_CDF2(24619) }, + { AOM_CDF2(30642) }, + { AOM_CDF2(4119) }, + { AOM_CDF2(16026) }, + { AOM_CDF2(25657) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31957) }, + { AOM_CDF2(3230) }, + { AOM_CDF2(11153) }, + { AOM_CDF2(18123) }, + { AOM_CDF2(20143) }, + { AOM_CDF2(26536) }, + { AOM_CDF2(31986) }, + { AOM_CDF2(3050) }, + { AOM_CDF2(14603) }, + { AOM_CDF2(25155) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(32363) }, + { AOM_CDF2(10692) }, + { AOM_CDF2(19090) }, + { AOM_CDF2(24357) }, + { AOM_CDF2(24442) }, + { AOM_CDF2(28312) }, + { AOM_CDF2(32169) }, + { AOM_CDF2(3648) }, + { AOM_CDF2(15690) }, + { AOM_CDF2(26815) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(30669) }, + { AOM_CDF2(3832) }, + { AOM_CDF2(11663) }, + { AOM_CDF2(18889) }, + { AOM_CDF2(19782) }, + { AOM_CDF2(23313) }, + { AOM_CDF2(31330) }, + { AOM_CDF2(5124) }, + { AOM_CDF2(18719) }, + { AOM_CDF2(28468) }, + { AOM_CDF2(3082) }, + { AOM_CDF2(20982) }, + { AOM_CDF2(29443) } }, + { { AOM_CDF2(28573) }, + { AOM_CDF2(3183) }, + { AOM_CDF2(17802) }, + { AOM_CDF2(25977) }, + { AOM_CDF2(26677) }, + { AOM_CDF2(27832) }, + { AOM_CDF2(32387) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } }, + { { { AOM_CDF2(26887) }, + { AOM_CDF2(6729) }, + { AOM_CDF2(10361) }, + { AOM_CDF2(17442) }, + { AOM_CDF2(15045) }, + { AOM_CDF2(22478) }, + { AOM_CDF2(29072) }, + { AOM_CDF2(2713) }, + { AOM_CDF2(11861) }, + { AOM_CDF2(20773) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31903) }, + { AOM_CDF2(2044) }, + { AOM_CDF2(7528) }, + { AOM_CDF2(14618) }, + { AOM_CDF2(16182) }, + { AOM_CDF2(24168) }, + { AOM_CDF2(31037) }, + { AOM_CDF2(2786) }, + { AOM_CDF2(11194) }, + { AOM_CDF2(20155) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(32510) }, + { AOM_CDF2(8430) }, + { AOM_CDF2(17318) }, + { AOM_CDF2(24154) }, + { AOM_CDF2(23674) }, + { AOM_CDF2(28789) }, + { AOM_CDF2(32139) }, + { AOM_CDF2(3440) }, + { AOM_CDF2(13117) }, + { AOM_CDF2(22702) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } }, + { { AOM_CDF2(31671) }, + { AOM_CDF2(2056) }, + { AOM_CDF2(11746) }, + { AOM_CDF2(16852) }, + { AOM_CDF2(18635) }, + { AOM_CDF2(24715) }, + { AOM_CDF2(31484) }, + { AOM_CDF2(4656) }, + { AOM_CDF2(16074) }, + { AOM_CDF2(24704) }, + { AOM_CDF2(1806) }, + { AOM_CDF2(14645) }, + { AOM_CDF2(25336) } }, + { { AOM_CDF2(31539) }, + { AOM_CDF2(8433) }, + { AOM_CDF2(20576) }, + { AOM_CDF2(27904) }, + { AOM_CDF2(27852) }, + { AOM_CDF2(30026) }, + { AOM_CDF2(32441) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { 
AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } } } }; + +static const aom_cdf_prob + av1_default_eob_extra_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [EOB_COEF_CONTEXTS][CDF_SIZE(2)] = { + { { { + { AOM_CDF2(16961) }, + { AOM_CDF2(17223) }, + { AOM_CDF2(7621) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(19069) }, + { AOM_CDF2(22525) }, + { AOM_CDF2(13377) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20401) }, + { AOM_CDF2(17025) }, + { AOM_CDF2(12845) }, + { AOM_CDF2(12873) }, + { AOM_CDF2(14094) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20681) }, + { AOM_CDF2(20701) }, + { AOM_CDF2(15250) }, + { AOM_CDF2(15017) }, + { AOM_CDF2(14928) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(23905) }, + { AOM_CDF2(17194) }, + { AOM_CDF2(16170) }, + { AOM_CDF2(17695) }, + { AOM_CDF2(13826) }, + { AOM_CDF2(15810) }, + { AOM_CDF2(12036) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(23959) }, + { AOM_CDF2(20799) }, + { AOM_CDF2(19021) }, + { AOM_CDF2(16203) }, + { AOM_CDF2(17886) }, + { AOM_CDF2(14144) }, + { AOM_CDF2(12010) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(27399) }, + { AOM_CDF2(16327) }, + { AOM_CDF2(18071) }, + { AOM_CDF2(19584) }, + { AOM_CDF2(20721) }, + { AOM_CDF2(18432) }, + { AOM_CDF2(19560) }, + { AOM_CDF2(10150) }, + { AOM_CDF2(8805) }, + }, + { + { AOM_CDF2(24932) }, + { AOM_CDF2(20833) }, + { AOM_CDF2(12027) }, + { AOM_CDF2(16670) }, + { AOM_CDF2(19914) }, + { AOM_CDF2(15106) }, + { AOM_CDF2(17662) }, + { AOM_CDF2(13783) }, + { AOM_CDF2(28756) }, + } }, + { { + { AOM_CDF2(23406) }, + { AOM_CDF2(21845) }, + { AOM_CDF2(18432) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(17096) }, + { AOM_CDF2(12561) }, + { AOM_CDF2(17320) }, + { AOM_CDF2(22395) }, + { AOM_CDF2(21370) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(17471) }, + { AOM_CDF2(20223) }, + { AOM_CDF2(11357) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20335) }, + { AOM_CDF2(21667) }, + { AOM_CDF2(14818) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20430) }, + { AOM_CDF2(20662) }, + { AOM_CDF2(15367) }, + { AOM_CDF2(16970) }, + { AOM_CDF2(14657) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22117) }, + { AOM_CDF2(22028) }, + { AOM_CDF2(18650) }, + { AOM_CDF2(16042) }, + { AOM_CDF2(15885) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(22409) }, + { AOM_CDF2(21012) }, + { AOM_CDF2(15650) }, + { AOM_CDF2(17395) }, + { AOM_CDF2(15469) }, + { AOM_CDF2(20205) }, + { AOM_CDF2(19511) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(24220) }, + { AOM_CDF2(22480) }, + { AOM_CDF2(17737) 
}, + { AOM_CDF2(18916) }, + { AOM_CDF2(19268) }, + { AOM_CDF2(18412) }, + { AOM_CDF2(18844) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(25991) }, + { AOM_CDF2(20314) }, + { AOM_CDF2(17731) }, + { AOM_CDF2(19678) }, + { AOM_CDF2(18649) }, + { AOM_CDF2(17307) }, + { AOM_CDF2(21798) }, + { AOM_CDF2(17549) }, + { AOM_CDF2(15630) }, + }, + { + { AOM_CDF2(26585) }, + { AOM_CDF2(21469) }, + { AOM_CDF2(20432) }, + { AOM_CDF2(17735) }, + { AOM_CDF2(19280) }, + { AOM_CDF2(15235) }, + { AOM_CDF2(20297) }, + { AOM_CDF2(22471) }, + { AOM_CDF2(28997) }, + } }, + { { + { AOM_CDF2(26605) }, + { AOM_CDF2(11304) }, + { AOM_CDF2(16726) }, + { AOM_CDF2(16560) }, + { AOM_CDF2(20866) }, + { AOM_CDF2(23524) }, + { AOM_CDF2(19878) }, + { AOM_CDF2(13469) }, + { AOM_CDF2(23084) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(18983) }, + { AOM_CDF2(20512) }, + { AOM_CDF2(14885) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20090) }, + { AOM_CDF2(19444) }, + { AOM_CDF2(17286) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19139) }, + { AOM_CDF2(21487) }, + { AOM_CDF2(18959) }, + { AOM_CDF2(20910) }, + { AOM_CDF2(19089) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20536) }, + { AOM_CDF2(20664) }, + { AOM_CDF2(20625) }, + { AOM_CDF2(19123) }, + { AOM_CDF2(14862) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19833) }, + { AOM_CDF2(21502) }, + { AOM_CDF2(17485) }, + { AOM_CDF2(20267) }, + { AOM_CDF2(18353) }, + { AOM_CDF2(23329) }, + { AOM_CDF2(21478) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22041) }, + { AOM_CDF2(23434) }, + { AOM_CDF2(20001) }, + { AOM_CDF2(20554) }, + { AOM_CDF2(20951) }, + { AOM_CDF2(20145) }, + { AOM_CDF2(15562) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(23312) }, + { AOM_CDF2(21607) }, + { AOM_CDF2(16526) }, + { AOM_CDF2(18957) }, + { AOM_CDF2(18034) }, + { AOM_CDF2(18934) }, + { AOM_CDF2(24247) }, + { AOM_CDF2(16921) }, + { AOM_CDF2(17080) }, + }, + { + { AOM_CDF2(26579) }, + { AOM_CDF2(24910) }, + { AOM_CDF2(18637) }, + { AOM_CDF2(19800) }, + { AOM_CDF2(20388) }, + { AOM_CDF2(9887) }, + { AOM_CDF2(15642) }, + { AOM_CDF2(30198) }, + { AOM_CDF2(24721) }, + } }, + { { + { AOM_CDF2(26998) }, + { AOM_CDF2(16737) }, + { AOM_CDF2(17838) }, + { AOM_CDF2(18922) }, + { AOM_CDF2(19515) }, + { AOM_CDF2(18636) }, + { AOM_CDF2(17333) }, + { AOM_CDF2(15776) }, + { AOM_CDF2(22658) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } }, + { { { + { AOM_CDF2(20177) }, + { AOM_CDF2(20789) }, + { AOM_CDF2(20262) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(21416) }, + { AOM_CDF2(20855) }, + { AOM_CDF2(23410) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { 
AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(20238) }, + { AOM_CDF2(21057) }, + { AOM_CDF2(19159) }, + { AOM_CDF2(22337) }, + { AOM_CDF2(20159) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(20125) }, + { AOM_CDF2(20559) }, + { AOM_CDF2(21707) }, + { AOM_CDF2(22296) }, + { AOM_CDF2(17333) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(19941) }, + { AOM_CDF2(20527) }, + { AOM_CDF2(21470) }, + { AOM_CDF2(22487) }, + { AOM_CDF2(19558) }, + { AOM_CDF2(22354) }, + { AOM_CDF2(20331) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + }, + { + { AOM_CDF2(22752) }, + { AOM_CDF2(25006) }, + { AOM_CDF2(22075) }, + { AOM_CDF2(21576) }, + { AOM_CDF2(17740) }, + { AOM_CDF2(21690) }, + { AOM_CDF2(19211) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } }, + { { + { AOM_CDF2(21442) }, + { AOM_CDF2(22358) }, + { AOM_CDF2(18503) }, + { AOM_CDF2(20291) }, + { AOM_CDF2(19945) }, + { AOM_CDF2(21294) }, + { AOM_CDF2(21178) }, + { AOM_CDF2(19400) }, + { AOM_CDF2(10556) }, + }, + { + { AOM_CDF2(24648) }, + { AOM_CDF2(24949) }, + { AOM_CDF2(20708) }, + { AOM_CDF2(23905) }, + { AOM_CDF2(20501) }, + { AOM_CDF2(9558) }, + { AOM_CDF2(9423) }, + { AOM_CDF2(30365) }, + { AOM_CDF2(19253) }, + } }, + { { + { AOM_CDF2(26064) }, + { AOM_CDF2(22098) }, + { AOM_CDF2(19613) }, + { AOM_CDF2(20525) }, + { AOM_CDF2(17595) }, + { AOM_CDF2(16618) }, + { AOM_CDF2(20497) }, + { AOM_CDF2(18989) }, + { AOM_CDF2(15513) }, + }, + { + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, + } } } + }; + +static const aom_cdf_prob + av1_default_eob_multi16_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 5)] = { { { { AOM_CDF5(840, 1039, 1980, 4895) }, + { AOM_CDF5(370, 671, 1883, 4471) } }, + { { AOM_CDF5(3247, 4950, 9688, 14563) }, + { AOM_CDF5(1904, 3354, 7763, 14647) } } }, + { { { AOM_CDF5(2125, 2551, 5165, 8946) }, + { AOM_CDF5(513, 765, 1859, 6339) } }, + { { AOM_CDF5(7637, 9498, 14259, 19108) }, + { AOM_CDF5(2497, 4096, 8866, 16993) } } }, + { { { AOM_CDF5(4016, 4897, 8881, 14968) }, + { AOM_CDF5(716, 1105, 2646, 10056) } }, + { { AOM_CDF5(11139, 13270, 18241, 23566) }, + { AOM_CDF5(3192, 5032, 10297, 19755) } } }, + { { { AOM_CDF5(6708, 8958, 14746, 22133) }, + { AOM_CDF5(1222, 2074, 4783, 15410) } }, + { { AOM_CDF5(19575, 21766, 26044, 29709) }, + { AOM_CDF5(7297, 10767, 19273, 28194) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi32_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 6)] = { { { { AOM_CDF6(400, 520, 977, 2102, 6542) }, + { AOM_CDF6(210, 405, 1315, 3326, 7537) } }, + { { AOM_CDF6(2636, 4273, 7588, 11794, 20401) }, + { AOM_CDF6(1786, 3179, 6902, 11357, 19054) } } }, + { { { AOM_CDF6(989, 1249, 2019, 4151, 10785) }, + { AOM_CDF6(313, 441, 1099, 2917, 8562) } }, + { { AOM_CDF6(8394, 10352, 13932, 18855, 26014) }, + { AOM_CDF6(2578, 4124, 8181, 13670, 24234) } } }, + { { { AOM_CDF6(2515, 3003, 4452, 8162, 16041) }, + { AOM_CDF6(574, 821, 1836, 5089, 13128) } }, + { { AOM_CDF6(13468, 16303, 20361, 25105, 29281) }, + { AOM_CDF6(3542, 5502, 10415, 16760, 25644) } } }, + { { { AOM_CDF6(4617, 5709, 8446, 13584, 23135) }, + { AOM_CDF6(1156, 1702, 3675, 9274, 20539) } }, + { { AOM_CDF6(22086, 24282, 27010, 29770, 31743) }, + { AOM_CDF6(7699, 10897, 
20891, 26926, 31628) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi64_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 7)] = { { { { AOM_CDF7(329, 498, 1101, 1784, 3265, 7758) }, + { AOM_CDF7(335, 730, 1459, 5494, 8755, 12997) } }, + { { AOM_CDF7(3505, 5304, 10086, 13814, 17684, 23370) }, + { AOM_CDF7(1563, 2700, 4876, 10911, 14706, 22480) } } }, + { { { AOM_CDF7(1260, 1446, 2253, 3712, 6652, 13369) }, + { AOM_CDF7(401, 605, 1029, 2563, 5845, 12626) } }, + { { AOM_CDF7(8609, 10612, 14624, 18714, 22614, 29024) }, + { AOM_CDF7(1923, 3127, 5867, 9703, 14277, 27100) } } }, + { { { AOM_CDF7(2374, 2772, 4583, 7276, 12288, 19706) }, + { AOM_CDF7(497, 810, 1315, 3000, 7004, 15641) } }, + { { AOM_CDF7(15050, 17126, 21410, 24886, 28156, 30726) }, + { AOM_CDF7(4034, 6290, 10235, 14982, 21214, 28491) } } }, + { { { AOM_CDF7(6307, 7541, 12060, 16358, 22553, 27865) }, + { AOM_CDF7(1289, 2320, 3971, 7926, 14153, 24291) } }, + { { AOM_CDF7(24212, 25708, 28268, 30035, 31307, 32049) }, + { AOM_CDF7(8726, 12378, 19409, 26450, 30038, 32462) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi128_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 8)] = { + { { { AOM_CDF8(219, 482, 1140, 2091, 3680, 6028, 12586) }, + { AOM_CDF8(371, 699, 1254, 4830, 9479, 12562, 17497) } }, + { { AOM_CDF8(5245, 7456, 12880, 15852, 20033, 23932, 27608) }, + { AOM_CDF8(2054, 3472, 5869, 14232, 18242, 20590, 26752) } } }, + { { { AOM_CDF8(685, 933, 1488, 2714, 4766, 8562, 19254) }, + { AOM_CDF8(217, 352, 618, 2303, 5261, 9969, 17472) } }, + { { AOM_CDF8(8045, 11200, 15497, 19595, 23948, 27408, 30938) }, + { AOM_CDF8(2310, 4160, 7471, 14997, 17931, 20768, 30240) } } }, + { { { AOM_CDF8(1366, 1738, 2527, 5016, 9355, 15797, 24643) }, + { AOM_CDF8(354, 558, 944, 2760, 7287, 14037, 21779) } }, + { { AOM_CDF8(13627, 16246, 20173, 24429, 27948, 30415, 31863) }, + { AOM_CDF8(6275, 9889, 14769, 23164, 27988, 30493, 32272) } } }, + { { { AOM_CDF8(3472, 4885, 7489, 12481, 18517, 24536, 29635) }, + { AOM_CDF8(886, 1731, 3271, 8469, 15569, 22126, 28383) } }, + { { AOM_CDF8(24313, 26062, 28385, 30107, 31217, 31898, 32345) }, + { AOM_CDF8(9165, 13282, 21150, 30286, 31894, 32571, 32712) } } } + }; + +static const aom_cdf_prob + av1_default_eob_multi256_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 9)] = { + { { { AOM_CDF9(310, 584, 1887, 3589, 6168, 8611, 11352, 15652) }, + { AOM_CDF9(998, 1850, 2998, 5604, 17341, 19888, 22899, 25583) } }, + { { AOM_CDF9(2520, 3240, 5952, 8870, 12577, 17558, 19954, 24168) }, + { AOM_CDF9(2203, 4130, 7435, 10739, 20652, 23681, 25609, 27261) } } }, + { { { AOM_CDF9(1448, 2109, 4151, 6263, 9329, 13260, 17944, 23300) }, + { AOM_CDF9(399, 1019, 1749, 3038, 10444, 15546, 22739, 27294) } }, + { { AOM_CDF9(6402, 8148, 12623, 15072, 18728, 22847, 26447, 29377) }, + { AOM_CDF9(1674, 3252, 5734, 10159, 22397, 23802, 24821, 30940) } } }, + { { { AOM_CDF9(3089, 3920, 6038, 9460, 14266, 19881, 25766, 29176) }, + { AOM_CDF9(1084, 2358, 3488, 5122, 11483, 18103, 26023, 29799) } }, + { { AOM_CDF9(11514, 13794, 17480, 20754, 24361, 27378, 29492, 31277) }, + { AOM_CDF9(6571, 9610, 15516, 21826, 29092, 30829, 31842, + 32708) } } }, + { { { AOM_CDF9(5348, 7113, 11820, 15924, 22106, 26777, 30334, 31757) }, + { AOM_CDF9(2453, 4474, 6307, 8777, 16474, 22975, 29000, 31547) } }, + { { AOM_CDF9(23110, 24597, 27140, 28894, 30167, 30927, 31392, 32094) }, + { AOM_CDF9(9998, 17661, 25178, 28097, 31308, 32038, 32403, + 32695) } } } + }; + +static const aom_cdf_prob + 
av1_default_eob_multi512_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 10)] = { { { { AOM_CDF10(641, 983, 3707, 5430, 10234, 14958, 18788, + 23412, 26061) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(5095, 6446, 9996, 13354, 16017, 17986, 20919, + 26129, 29140) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(1230, 2278, 5035, 7776, 11871, 15346, 19590, + 24584, 28749) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(7265, 9979, 15819, 19250, 21780, 23846, 26478, + 28396, 31811) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(2624, 3936, 6480, 9686, 13979, 17726, 23267, + 28410, 31078) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(12015, 14769, 19588, 22052, 24222, 25812, + 27300, 29219, 32114) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } }, + { { { AOM_CDF10(5927, 7809, 10923, 14597, 19439, 24135, 28456, + 31142, 32060) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } }, + { { AOM_CDF10(21093, 23043, 25742, 27658, 29097, 29716, + 30073, 30820, 31956) }, + { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938, + 26214, 29491) } } } }; + +static const aom_cdf_prob + av1_default_eob_multi1024_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE( + 11)] = { { { { AOM_CDF11(393, 421, 751, 1623, 3160, 6352, 13345, 18047, + 22571, 25830) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(1865, 1988, 2930, 4242, 10533, 16538, 21354, + 27255, 28546, 31784) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(696, 948, 3145, 5702, 9706, 13217, 17851, + 21856, 25692, 28034) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(2672, 3591, 9330, 17084, 22725, 24284, 26527, + 28027, 28377, 30876) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(2784, 3831, 7041, 10521, 14847, 18844, 23155, + 26682, 29229, 31045) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(9577, 12466, 17739, 20750, 22061, 23215, 24601, + 25483, 25843, 32056) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } }, + { { { AOM_CDF11(6698, 8334, 11961, 15762, 20186, 23862, 27434, + 29326, 31082, 32050) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } }, + { { AOM_CDF11(20569, 22426, 25569, 26859, 28053, 28913, + 29486, 29724, 29807, 32570) }, + { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852, + 23831, 26810, 29789) } } } }; + +static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS] + [CDF_SIZE(BR_CDF_SIZE)] = { + { { { { AOM_CDF4(14298, 20718, 24174) }, + { AOM_CDF4(12536, 19601, 23789) }, + { AOM_CDF4(8712, 15051, 19503) }, + { AOM_CDF4(6170, 11327, 15434) }, + { AOM_CDF4(4742, 8926, 12538) }, + { AOM_CDF4(3803, 7317, 10546) }, + { AOM_CDF4(1696, 3317, 4871) }, + { AOM_CDF4(14392, 19951, 22756) }, + { AOM_CDF4(15978, 23218, 26818) }, + { AOM_CDF4(12187, 19474, 23889) }, + { AOM_CDF4(9176, 15640, 20259) }, + { AOM_CDF4(7068, 12655, 17028) }, + { AOM_CDF4(5656, 10442, 
14472) }, + { AOM_CDF4(2580, 4992, 7244) }, + { AOM_CDF4(12136, 18049, 21426) }, + { AOM_CDF4(13784, 20721, 24481) }, + { AOM_CDF4(10836, 17621, 21900) }, + { AOM_CDF4(8372, 14444, 18847) }, + { AOM_CDF4(6523, 11779, 16000) }, + { AOM_CDF4(5337, 9898, 13760) }, + { AOM_CDF4(3034, 5860, 8462) } }, + { { AOM_CDF4(15967, 22905, 26286) }, + { AOM_CDF4(13534, 20654, 24579) }, + { AOM_CDF4(9504, 16092, 20535) }, + { AOM_CDF4(6975, 12568, 16903) }, + { AOM_CDF4(5364, 10091, 14020) }, + { AOM_CDF4(4357, 8370, 11857) }, + { AOM_CDF4(2506, 4934, 7218) }, + { AOM_CDF4(23032, 28815, 30936) }, + { AOM_CDF4(19540, 26704, 29719) }, + { AOM_CDF4(15158, 22969, 27097) }, + { AOM_CDF4(11408, 18865, 23650) }, + { AOM_CDF4(8885, 15448, 20250) }, + { AOM_CDF4(7108, 12853, 17416) }, + { AOM_CDF4(4231, 8041, 11480) }, + { AOM_CDF4(19823, 26490, 29156) }, + { AOM_CDF4(18890, 25929, 28932) }, + { AOM_CDF4(15660, 23491, 27433) }, + { AOM_CDF4(12147, 19776, 24488) }, + { AOM_CDF4(9728, 16774, 21649) }, + { AOM_CDF4(7919, 14277, 19066) }, + { AOM_CDF4(5440, 10170, 14185) } } }, + { { { AOM_CDF4(14406, 20862, 24414) }, + { AOM_CDF4(11824, 18907, 23109) }, + { AOM_CDF4(8257, 14393, 18803) }, + { AOM_CDF4(5860, 10747, 14778) }, + { AOM_CDF4(4475, 8486, 11984) }, + { AOM_CDF4(3606, 6954, 10043) }, + { AOM_CDF4(1736, 3410, 5048) }, + { AOM_CDF4(14430, 20046, 22882) }, + { AOM_CDF4(15593, 22899, 26709) }, + { AOM_CDF4(12102, 19368, 23811) }, + { AOM_CDF4(9059, 15584, 20262) }, + { AOM_CDF4(6999, 12603, 17048) }, + { AOM_CDF4(5684, 10497, 14553) }, + { AOM_CDF4(2822, 5438, 7862) }, + { AOM_CDF4(15785, 21585, 24359) }, + { AOM_CDF4(18347, 25229, 28266) }, + { AOM_CDF4(14974, 22487, 26389) }, + { AOM_CDF4(11423, 18681, 23271) }, + { AOM_CDF4(8863, 15350, 20008) }, + { AOM_CDF4(7153, 12852, 17278) }, + { AOM_CDF4(3707, 7036, 9982) } }, + { { AOM_CDF4(15460, 21696, 25469) }, + { AOM_CDF4(12170, 19249, 23191) }, + { AOM_CDF4(8723, 15027, 19332) }, + { AOM_CDF4(6428, 11704, 15874) }, + { AOM_CDF4(4922, 9292, 13052) }, + { AOM_CDF4(4139, 7695, 11010) }, + { AOM_CDF4(2291, 4508, 6598) }, + { AOM_CDF4(19856, 26920, 29828) }, + { AOM_CDF4(17923, 25289, 28792) }, + { AOM_CDF4(14278, 21968, 26297) }, + { AOM_CDF4(10910, 18136, 22950) }, + { AOM_CDF4(8423, 14815, 19627) }, + { AOM_CDF4(6771, 12283, 16774) }, + { AOM_CDF4(4074, 7750, 11081) }, + { AOM_CDF4(19852, 26074, 28672) }, + { AOM_CDF4(19371, 26110, 28989) }, + { AOM_CDF4(16265, 23873, 27663) }, + { AOM_CDF4(12758, 20378, 24952) }, + { AOM_CDF4(10095, 17098, 21961) }, + { AOM_CDF4(8250, 14628, 19451) }, + { AOM_CDF4(5205, 9745, 13622) } } }, + { { { AOM_CDF4(10563, 16233, 19763) }, + { AOM_CDF4(9794, 16022, 19804) }, + { AOM_CDF4(6750, 11945, 15759) }, + { AOM_CDF4(4963, 9186, 12752) }, + { AOM_CDF4(3845, 7435, 10627) }, + { AOM_CDF4(3051, 6085, 8834) }, + { AOM_CDF4(1311, 2596, 3830) }, + { AOM_CDF4(11246, 16404, 19689) }, + { AOM_CDF4(12315, 18911, 22731) }, + { AOM_CDF4(10557, 17095, 21289) }, + { AOM_CDF4(8136, 14006, 18249) }, + { AOM_CDF4(6348, 11474, 15565) }, + { AOM_CDF4(5196, 9655, 13400) }, + { AOM_CDF4(2349, 4526, 6587) }, + { AOM_CDF4(13337, 18730, 21569) }, + { AOM_CDF4(19306, 26071, 28882) }, + { AOM_CDF4(15952, 23540, 27254) }, + { AOM_CDF4(12409, 19934, 24430) }, + { AOM_CDF4(9760, 16706, 21389) }, + { AOM_CDF4(8004, 14220, 18818) }, + { AOM_CDF4(4138, 7794, 10961) } }, + { { AOM_CDF4(10870, 16684, 20949) }, + { AOM_CDF4(9664, 15230, 18680) }, + { AOM_CDF4(6886, 12109, 15408) }, + { AOM_CDF4(4825, 8900, 12305) }, + { AOM_CDF4(3630, 7162, 10314) }, + { 
AOM_CDF4(3036, 6429, 9387) }, + { AOM_CDF4(1671, 3296, 4940) }, + { AOM_CDF4(13819, 19159, 23026) }, + { AOM_CDF4(11984, 19108, 23120) }, + { AOM_CDF4(10690, 17210, 21663) }, + { AOM_CDF4(7984, 14154, 18333) }, + { AOM_CDF4(6868, 12294, 16124) }, + { AOM_CDF4(5274, 8994, 12868) }, + { AOM_CDF4(2988, 5771, 8424) }, + { AOM_CDF4(19736, 26647, 29141) }, + { AOM_CDF4(18933, 26070, 28984) }, + { AOM_CDF4(15779, 23048, 27200) }, + { AOM_CDF4(12638, 20061, 24532) }, + { AOM_CDF4(10692, 17545, 22220) }, + { AOM_CDF4(9217, 15251, 20054) }, + { AOM_CDF4(5078, 9284, 12594) } } }, + { { { AOM_CDF4(2331, 3662, 5244) }, + { AOM_CDF4(2891, 4771, 6145) }, + { AOM_CDF4(4598, 7623, 9729) }, + { AOM_CDF4(3520, 6845, 9199) }, + { AOM_CDF4(3417, 6119, 9324) }, + { AOM_CDF4(2601, 5412, 7385) }, + { AOM_CDF4(600, 1173, 1744) }, + { AOM_CDF4(7672, 13286, 17469) }, + { AOM_CDF4(4232, 7792, 10793) }, + { AOM_CDF4(2915, 5317, 7397) }, + { AOM_CDF4(2318, 4356, 6152) }, + { AOM_CDF4(2127, 4000, 5554) }, + { AOM_CDF4(1850, 3478, 5275) }, + { AOM_CDF4(977, 1933, 2843) }, + { AOM_CDF4(18280, 24387, 27989) }, + { AOM_CDF4(15852, 22671, 26185) }, + { AOM_CDF4(13845, 20951, 24789) }, + { AOM_CDF4(11055, 17966, 22129) }, + { AOM_CDF4(9138, 15422, 19801) }, + { AOM_CDF4(7454, 13145, 17456) }, + { AOM_CDF4(3370, 6393, 9013) } }, + { { AOM_CDF4(5842, 9229, 10838) }, + { AOM_CDF4(2313, 3491, 4276) }, + { AOM_CDF4(2998, 6104, 7496) }, + { AOM_CDF4(2420, 7447, 9868) }, + { AOM_CDF4(3034, 8495, 10923) }, + { AOM_CDF4(4076, 8937, 10975) }, + { AOM_CDF4(1086, 2370, 3299) }, + { AOM_CDF4(9714, 17254, 20444) }, + { AOM_CDF4(8543, 13698, 17123) }, + { AOM_CDF4(4918, 9007, 11910) }, + { AOM_CDF4(4129, 7532, 10553) }, + { AOM_CDF4(2364, 5533, 8058) }, + { AOM_CDF4(1834, 3546, 5563) }, + { AOM_CDF4(1473, 2908, 4133) }, + { AOM_CDF4(15405, 21193, 25619) }, + { AOM_CDF4(15691, 21952, 26561) }, + { AOM_CDF4(12962, 19194, 24165) }, + { AOM_CDF4(10272, 17855, 22129) }, + { AOM_CDF4(8588, 15270, 20718) }, + { AOM_CDF4(8682, 14669, 19500) }, + { AOM_CDF4(4870, 9636, 13205) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { 
AOM_CDF4(14995, 21341, 24749) }, + { AOM_CDF4(13158, 20289, 24601) }, + { AOM_CDF4(8941, 15326, 19876) }, + { AOM_CDF4(6297, 11541, 15807) }, + { AOM_CDF4(4817, 9029, 12776) }, + { AOM_CDF4(3731, 7273, 10627) }, + { AOM_CDF4(1847, 3617, 5354) }, + { AOM_CDF4(14472, 19659, 22343) }, + { AOM_CDF4(16806, 24162, 27533) }, + { AOM_CDF4(12900, 20404, 24713) }, + { AOM_CDF4(9411, 16112, 20797) }, + { AOM_CDF4(7056, 12697, 17148) }, + { AOM_CDF4(5544, 10339, 14460) }, + { AOM_CDF4(2954, 5704, 8319) }, + { AOM_CDF4(12464, 18071, 21354) }, + { AOM_CDF4(15482, 22528, 26034) }, + { AOM_CDF4(12070, 19269, 23624) }, + { AOM_CDF4(8953, 15406, 20106) }, + { AOM_CDF4(7027, 12730, 17220) }, + { AOM_CDF4(5887, 10913, 15140) }, + { AOM_CDF4(3793, 7278, 10447) } }, + { { AOM_CDF4(15571, 22232, 25749) }, + { AOM_CDF4(14506, 21575, 25374) }, + { AOM_CDF4(10189, 17089, 21569) }, + { AOM_CDF4(7316, 13301, 17915) }, + { AOM_CDF4(5783, 10912, 15190) }, + { AOM_CDF4(4760, 9155, 13088) }, + { AOM_CDF4(2993, 5966, 8774) }, + { AOM_CDF4(23424, 28903, 30778) }, + { AOM_CDF4(20775, 27666, 30290) }, + { AOM_CDF4(16474, 24410, 28299) }, + { AOM_CDF4(12471, 20180, 24987) }, + { AOM_CDF4(9410, 16487, 21439) }, + { AOM_CDF4(7536, 13614, 18529) }, + { AOM_CDF4(5048, 9586, 13549) }, + { AOM_CDF4(21090, 27290, 29756) }, + { AOM_CDF4(20796, 27402, 30026) }, + { AOM_CDF4(17819, 25485, 28969) }, + { AOM_CDF4(13860, 21909, 26462) }, + { AOM_CDF4(11002, 18494, 23529) }, + { AOM_CDF4(8953, 15929, 20897) }, + { AOM_CDF4(6448, 11918, 16454) } } }, + { { { AOM_CDF4(15999, 22208, 25449) }, + { AOM_CDF4(13050, 19988, 24122) }, + { AOM_CDF4(8594, 14864, 19378) }, + { AOM_CDF4(6033, 11079, 15238) }, + { AOM_CDF4(4554, 8683, 12347) }, + { AOM_CDF4(3672, 7139, 10337) }, + { AOM_CDF4(1900, 3771, 5576) }, + { AOM_CDF4(15788, 21340, 23949) }, + { AOM_CDF4(16825, 24235, 27758) }, + { AOM_CDF4(12873, 20402, 24810) }, + { AOM_CDF4(9590, 16363, 21094) }, + { AOM_CDF4(7352, 13209, 17733) }, + { AOM_CDF4(5960, 10989, 15184) }, + { AOM_CDF4(3232, 6234, 9007) }, + { AOM_CDF4(15761, 20716, 23224) }, + { AOM_CDF4(19318, 25989, 28759) }, + { AOM_CDF4(15529, 23094, 26929) }, + { AOM_CDF4(11662, 18989, 23641) }, + { AOM_CDF4(8955, 15568, 20366) }, + { AOM_CDF4(7281, 13106, 17708) }, + { AOM_CDF4(4248, 8059, 11440) } }, + { { AOM_CDF4(14899, 21217, 24503) }, + { AOM_CDF4(13519, 20283, 24047) }, + { AOM_CDF4(9429, 15966, 20365) }, + { AOM_CDF4(6700, 12355, 16652) }, + { AOM_CDF4(5088, 9704, 13716) }, + { AOM_CDF4(4243, 8154, 11731) }, + { AOM_CDF4(2702, 5364, 7861) }, + { AOM_CDF4(22745, 28388, 30454) }, + { AOM_CDF4(20235, 27146, 29922) }, + { AOM_CDF4(15896, 23715, 27637) }, + { AOM_CDF4(11840, 19350, 24131) }, + { AOM_CDF4(9122, 15932, 20880) }, + { AOM_CDF4(7488, 13581, 18362) }, + { AOM_CDF4(5114, 9568, 13370) }, + { AOM_CDF4(20845, 26553, 28932) }, + { AOM_CDF4(20981, 27372, 29884) }, + { AOM_CDF4(17781, 25335, 28785) }, + { AOM_CDF4(13760, 21708, 26297) }, + { AOM_CDF4(10975, 18415, 23365) }, + { AOM_CDF4(9045, 15789, 20686) }, + { AOM_CDF4(6130, 11199, 15423) } } }, + { { { AOM_CDF4(13549, 19724, 23158) }, + { AOM_CDF4(11844, 18382, 22246) }, + { AOM_CDF4(7919, 13619, 17773) }, + { AOM_CDF4(5486, 10143, 13946) }, + { AOM_CDF4(4166, 7983, 11324) }, + { AOM_CDF4(3364, 6506, 9427) }, + { AOM_CDF4(1598, 3160, 4674) }, + { AOM_CDF4(15281, 20979, 23781) }, + { AOM_CDF4(14939, 22119, 25952) }, + { AOM_CDF4(11363, 18407, 22812) }, + { AOM_CDF4(8609, 14857, 19370) }, + { AOM_CDF4(6737, 12184, 16480) }, + { AOM_CDF4(5506, 10263, 14262) }, + { AOM_CDF4(2990, 5786, 
8380) }, + { AOM_CDF4(20249, 25253, 27417) }, + { AOM_CDF4(21070, 27518, 30001) }, + { AOM_CDF4(16854, 24469, 28074) }, + { AOM_CDF4(12864, 20486, 25000) }, + { AOM_CDF4(9962, 16978, 21778) }, + { AOM_CDF4(8074, 14338, 19048) }, + { AOM_CDF4(4494, 8479, 11906) } }, + { { AOM_CDF4(13960, 19617, 22829) }, + { AOM_CDF4(11150, 17341, 21228) }, + { AOM_CDF4(7150, 12964, 17190) }, + { AOM_CDF4(5331, 10002, 13867) }, + { AOM_CDF4(4167, 7744, 11057) }, + { AOM_CDF4(3480, 6629, 9646) }, + { AOM_CDF4(1883, 3784, 5686) }, + { AOM_CDF4(18752, 25660, 28912) }, + { AOM_CDF4(16968, 24586, 28030) }, + { AOM_CDF4(13520, 21055, 25313) }, + { AOM_CDF4(10453, 17626, 22280) }, + { AOM_CDF4(8386, 14505, 19116) }, + { AOM_CDF4(6742, 12595, 17008) }, + { AOM_CDF4(4273, 8140, 11499) }, + { AOM_CDF4(22120, 27827, 30233) }, + { AOM_CDF4(20563, 27358, 29895) }, + { AOM_CDF4(17076, 24644, 28153) }, + { AOM_CDF4(13362, 20942, 25309) }, + { AOM_CDF4(10794, 17965, 22695) }, + { AOM_CDF4(9014, 15652, 20319) }, + { AOM_CDF4(5708, 10512, 14497) } } }, + { { { AOM_CDF4(5705, 10930, 15725) }, + { AOM_CDF4(7946, 12765, 16115) }, + { AOM_CDF4(6801, 12123, 16226) }, + { AOM_CDF4(5462, 10135, 14200) }, + { AOM_CDF4(4189, 8011, 11507) }, + { AOM_CDF4(3191, 6229, 9408) }, + { AOM_CDF4(1057, 2137, 3212) }, + { AOM_CDF4(10018, 17067, 21491) }, + { AOM_CDF4(7380, 12582, 16453) }, + { AOM_CDF4(6068, 10845, 14339) }, + { AOM_CDF4(5098, 9198, 12555) }, + { AOM_CDF4(4312, 8010, 11119) }, + { AOM_CDF4(3700, 6966, 9781) }, + { AOM_CDF4(1693, 3326, 4887) }, + { AOM_CDF4(18757, 24930, 27774) }, + { AOM_CDF4(17648, 24596, 27817) }, + { AOM_CDF4(14707, 22052, 26026) }, + { AOM_CDF4(11720, 18852, 23292) }, + { AOM_CDF4(9357, 15952, 20525) }, + { AOM_CDF4(7810, 13753, 18210) }, + { AOM_CDF4(3879, 7333, 10328) } }, + { { AOM_CDF4(8278, 13242, 15922) }, + { AOM_CDF4(10547, 15867, 18919) }, + { AOM_CDF4(9106, 15842, 20609) }, + { AOM_CDF4(6833, 13007, 17218) }, + { AOM_CDF4(4811, 9712, 13923) }, + { AOM_CDF4(3985, 7352, 11128) }, + { AOM_CDF4(1688, 3458, 5262) }, + { AOM_CDF4(12951, 21861, 26510) }, + { AOM_CDF4(9788, 16044, 20276) }, + { AOM_CDF4(6309, 11244, 14870) }, + { AOM_CDF4(5183, 9349, 12566) }, + { AOM_CDF4(4389, 8229, 11492) }, + { AOM_CDF4(3633, 6945, 10620) }, + { AOM_CDF4(3600, 6847, 9907) }, + { AOM_CDF4(21748, 28137, 30255) }, + { AOM_CDF4(19436, 26581, 29560) }, + { AOM_CDF4(16359, 24201, 27953) }, + { AOM_CDF4(13961, 21693, 25871) }, + { AOM_CDF4(11544, 18686, 23322) }, + { AOM_CDF4(9372, 16462, 20952) }, + { AOM_CDF4(6138, 11210, 15390) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(16138, 22223, 25509) }, + { AOM_CDF4(15347, 22430, 26332) }, + { AOM_CDF4(9614, 16736, 21332) }, + { AOM_CDF4(6600, 12275, 16907) }, + { AOM_CDF4(4811, 9424, 13547) }, + { AOM_CDF4(3748, 7809, 11420) }, + { AOM_CDF4(2254, 4587, 6890) }, + { AOM_CDF4(15196, 20284, 23177) }, + { AOM_CDF4(18317, 25469, 28451) }, + { AOM_CDF4(13918, 21651, 25842) }, + { AOM_CDF4(10052, 17150, 21995) }, + { AOM_CDF4(7499, 13630, 18587) }, + { AOM_CDF4(6158, 11417, 16003) }, + { AOM_CDF4(4014, 7785, 11252) }, + { AOM_CDF4(15048, 21067, 24384) }, + { AOM_CDF4(18202, 25346, 28553) }, + { AOM_CDF4(14302, 22019, 26356) }, + { AOM_CDF4(10839, 18139, 23166) }, + { AOM_CDF4(8715, 15744, 20806) }, + { AOM_CDF4(7536, 13576, 18544) }, + { AOM_CDF4(5413, 10335, 14498) } }, + { { AOM_CDF4(17394, 24501, 27895) }, + { AOM_CDF4(15889, 23420, 27185) }, + { AOM_CDF4(11561, 19133, 23870) }, + { AOM_CDF4(8285, 14812, 19844) }, + { AOM_CDF4(6496, 12043, 16550) }, + { AOM_CDF4(4771, 9574, 13677) }, + { AOM_CDF4(3603, 6830, 10144) }, + { AOM_CDF4(21656, 27704, 30200) }, + { AOM_CDF4(21324, 27915, 30511) }, + { AOM_CDF4(17327, 25336, 28997) }, + { AOM_CDF4(13417, 21381, 26033) }, + { AOM_CDF4(10132, 17425, 22338) }, + { AOM_CDF4(8580, 15016, 19633) }, + { AOM_CDF4(5694, 11477, 16411) }, + { AOM_CDF4(24116, 29780, 31450) }, + { AOM_CDF4(23853, 29695, 31591) }, + { AOM_CDF4(20085, 27614, 30428) }, + { AOM_CDF4(15326, 24335, 28575) }, + { AOM_CDF4(11814, 19472, 24810) }, + { AOM_CDF4(10221, 18611, 24767) }, + { AOM_CDF4(7689, 14558, 20321) } } }, + { { { AOM_CDF4(16214, 22380, 25770) }, + { AOM_CDF4(14213, 21304, 25295) }, + { AOM_CDF4(9213, 15823, 20455) }, + { AOM_CDF4(6395, 11758, 16139) }, + { AOM_CDF4(4779, 9187, 13066) }, + { AOM_CDF4(3821, 7501, 10953) }, + { AOM_CDF4(2293, 4567, 6795) }, + { AOM_CDF4(15859, 21283, 23820) }, + { AOM_CDF4(18404, 25602, 28726) }, + { AOM_CDF4(14325, 21980, 26206) }, + { AOM_CDF4(10669, 17937, 22720) }, + { AOM_CDF4(8297, 14642, 19447) }, + { AOM_CDF4(6746, 12389, 16893) }, + { AOM_CDF4(4324, 8251, 11770) }, + { AOM_CDF4(16532, 21631, 24475) }, + { AOM_CDF4(20667, 27150, 29668) }, + { AOM_CDF4(16728, 24510, 28175) }, + { AOM_CDF4(12861, 20645, 25332) }, + { AOM_CDF4(10076, 17361, 22417) }, + { AOM_CDF4(8395, 14940, 19963) }, + { AOM_CDF4(5731, 10683, 14912) } }, + { { AOM_CDF4(14433, 21155, 24938) }, + { AOM_CDF4(14658, 21716, 25545) }, + { AOM_CDF4(9923, 16824, 21557) }, + { AOM_CDF4(6982, 13052, 17721) }, + { AOM_CDF4(5419, 10503, 15050) }, + { AOM_CDF4(4852, 9162, 13014) }, + { AOM_CDF4(3271, 6395, 9630) }, + { AOM_CDF4(22210, 27833, 30109) }, + { AOM_CDF4(20750, 27368, 29821) }, + { AOM_CDF4(16894, 24828, 28573) }, + { AOM_CDF4(13247, 21276, 25757) }, + { AOM_CDF4(10038, 17265, 22563) }, + { AOM_CDF4(8587, 14947, 20327) }, + { AOM_CDF4(5645, 11371, 15252) }, + { AOM_CDF4(22027, 27526, 29714) }, + { AOM_CDF4(23098, 29146, 31221) }, + { AOM_CDF4(19886, 27341, 30272) }, + { AOM_CDF4(15609, 23747, 28046) }, + { AOM_CDF4(11993, 20065, 24939) }, + { AOM_CDF4(9637, 18267, 23671) }, + { AOM_CDF4(7625, 13801, 19144) 
} } }, + { { { AOM_CDF4(14438, 20798, 24089) }, + { AOM_CDF4(12621, 19203, 23097) }, + { AOM_CDF4(8177, 14125, 18402) }, + { AOM_CDF4(5674, 10501, 14456) }, + { AOM_CDF4(4236, 8239, 11733) }, + { AOM_CDF4(3447, 6750, 9806) }, + { AOM_CDF4(1986, 3950, 5864) }, + { AOM_CDF4(16208, 22099, 24930) }, + { AOM_CDF4(16537, 24025, 27585) }, + { AOM_CDF4(12780, 20381, 24867) }, + { AOM_CDF4(9767, 16612, 21416) }, + { AOM_CDF4(7686, 13738, 18398) }, + { AOM_CDF4(6333, 11614, 15964) }, + { AOM_CDF4(3941, 7571, 10836) }, + { AOM_CDF4(22819, 27422, 29202) }, + { AOM_CDF4(22224, 28514, 30721) }, + { AOM_CDF4(17660, 25433, 28913) }, + { AOM_CDF4(13574, 21482, 26002) }, + { AOM_CDF4(10629, 17977, 22938) }, + { AOM_CDF4(8612, 15298, 20265) }, + { AOM_CDF4(5607, 10491, 14596) } }, + { { AOM_CDF4(13569, 19800, 23206) }, + { AOM_CDF4(13128, 19924, 23869) }, + { AOM_CDF4(8329, 14841, 19403) }, + { AOM_CDF4(6130, 10976, 15057) }, + { AOM_CDF4(4682, 8839, 12518) }, + { AOM_CDF4(3656, 7409, 10588) }, + { AOM_CDF4(2577, 5099, 7412) }, + { AOM_CDF4(22427, 28684, 30585) }, + { AOM_CDF4(20913, 27750, 30139) }, + { AOM_CDF4(15840, 24109, 27834) }, + { AOM_CDF4(12308, 20029, 24569) }, + { AOM_CDF4(10216, 16785, 21458) }, + { AOM_CDF4(8309, 14203, 19113) }, + { AOM_CDF4(6043, 11168, 15307) }, + { AOM_CDF4(23166, 28901, 30998) }, + { AOM_CDF4(21899, 28405, 30751) }, + { AOM_CDF4(18413, 26091, 29443) }, + { AOM_CDF4(15233, 23114, 27352) }, + { AOM_CDF4(12683, 20472, 25288) }, + { AOM_CDF4(10702, 18259, 23409) }, + { AOM_CDF4(8125, 14464, 19226) } } }, + { { { AOM_CDF4(9040, 14786, 18360) }, + { AOM_CDF4(9979, 15718, 19415) }, + { AOM_CDF4(7913, 13918, 18311) }, + { AOM_CDF4(5859, 10889, 15184) }, + { AOM_CDF4(4593, 8677, 12510) }, + { AOM_CDF4(3820, 7396, 10791) }, + { AOM_CDF4(1730, 3471, 5192) }, + { AOM_CDF4(11803, 18365, 22709) }, + { AOM_CDF4(11419, 18058, 22225) }, + { AOM_CDF4(9418, 15774, 20243) }, + { AOM_CDF4(7539, 13325, 17657) }, + { AOM_CDF4(6233, 11317, 15384) }, + { AOM_CDF4(5137, 9656, 13545) }, + { AOM_CDF4(2977, 5774, 8349) }, + { AOM_CDF4(21207, 27246, 29640) }, + { AOM_CDF4(19547, 26578, 29497) }, + { AOM_CDF4(16169, 23871, 27690) }, + { AOM_CDF4(12820, 20458, 25018) }, + { AOM_CDF4(10224, 17332, 22214) }, + { AOM_CDF4(8526, 15048, 19884) }, + { AOM_CDF4(5037, 9410, 13118) } }, + { { AOM_CDF4(12339, 17329, 20140) }, + { AOM_CDF4(13505, 19895, 23225) }, + { AOM_CDF4(9847, 16944, 21564) }, + { AOM_CDF4(7280, 13256, 18348) }, + { AOM_CDF4(4712, 10009, 14454) }, + { AOM_CDF4(4361, 7914, 12477) }, + { AOM_CDF4(2870, 5628, 7995) }, + { AOM_CDF4(20061, 25504, 28526) }, + { AOM_CDF4(15235, 22878, 26145) }, + { AOM_CDF4(12985, 19958, 24155) }, + { AOM_CDF4(9782, 16641, 21403) }, + { AOM_CDF4(9456, 16360, 20760) }, + { AOM_CDF4(6855, 12940, 18557) }, + { AOM_CDF4(5661, 10564, 15002) }, + { AOM_CDF4(25656, 30602, 31894) }, + { AOM_CDF4(22570, 29107, 31092) }, + { AOM_CDF4(18917, 26423, 29541) }, + { AOM_CDF4(15940, 23649, 27754) }, + { AOM_CDF4(12803, 20581, 25219) }, + { AOM_CDF4(11082, 18695, 23376) }, + { AOM_CDF4(7939, 14373, 19005) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(18315, 24289, 27551) }, + { AOM_CDF4(16854, 24068, 27835) }, + { AOM_CDF4(10140, 17927, 23173) }, + { AOM_CDF4(6722, 12982, 18267) }, + { AOM_CDF4(4661, 9826, 14706) }, + { AOM_CDF4(3832, 8165, 12294) }, + { AOM_CDF4(2795, 6098, 9245) }, + { AOM_CDF4(17145, 23326, 26672) }, + { AOM_CDF4(20733, 27680, 30308) }, + { AOM_CDF4(16032, 24461, 28546) }, + { AOM_CDF4(11653, 20093, 25081) }, + { AOM_CDF4(9290, 16429, 22086) }, + { AOM_CDF4(7796, 14598, 19982) }, + { AOM_CDF4(6502, 12378, 17441) }, + { AOM_CDF4(21681, 27732, 30320) }, + { AOM_CDF4(22389, 29044, 31261) }, + { AOM_CDF4(19027, 26731, 30087) }, + { AOM_CDF4(14739, 23755, 28624) }, + { AOM_CDF4(11358, 20778, 25511) }, + { AOM_CDF4(10995, 18073, 24190) }, + { AOM_CDF4(9162, 14990, 20617) } }, + { { AOM_CDF4(21425, 27952, 30388) }, + { AOM_CDF4(18062, 25838, 29034) }, + { AOM_CDF4(11956, 19881, 24808) }, + { AOM_CDF4(7718, 15000, 20980) }, + { AOM_CDF4(5702, 11254, 16143) }, + { AOM_CDF4(4898, 9088, 16864) }, + { AOM_CDF4(3679, 6776, 11907) }, + { AOM_CDF4(23294, 30160, 31663) }, + { AOM_CDF4(24397, 29896, 31836) }, + { AOM_CDF4(19245, 27128, 30593) }, + { AOM_CDF4(13202, 19825, 26404) }, + { AOM_CDF4(11578, 19297, 23957) }, + { AOM_CDF4(8073, 13297, 21370) }, + { AOM_CDF4(5461, 10923, 19745) }, + { AOM_CDF4(27367, 30521, 31934) }, + { AOM_CDF4(24904, 30671, 31940) }, + { AOM_CDF4(23075, 28460, 31299) }, + { AOM_CDF4(14400, 23658, 30417) }, + { AOM_CDF4(13885, 23882, 28325) }, + { AOM_CDF4(14746, 22938, 27853) }, + { AOM_CDF4(5461, 16384, 27307) } } }, + { { { AOM_CDF4(18274, 24813, 27890) }, + { AOM_CDF4(15537, 23149, 27003) }, + { AOM_CDF4(9449, 16740, 21827) }, + { AOM_CDF4(6700, 12498, 17261) }, + { AOM_CDF4(4988, 9866, 14198) }, + { AOM_CDF4(4236, 8147, 11902) }, + { AOM_CDF4(2867, 5860, 8654) }, + { AOM_CDF4(17124, 23171, 26101) }, + { AOM_CDF4(20396, 27477, 30148) }, + { AOM_CDF4(16573, 24629, 28492) }, + { AOM_CDF4(12749, 20846, 25674) }, + { AOM_CDF4(10233, 17878, 22818) }, + { AOM_CDF4(8525, 15332, 20363) }, + { AOM_CDF4(6283, 11632, 16255) }, + { AOM_CDF4(20466, 26511, 29286) }, + { AOM_CDF4(23059, 29174, 31191) }, + { AOM_CDF4(19481, 27263, 30241) }, + { AOM_CDF4(15458, 23631, 28137) }, + { AOM_CDF4(12416, 20608, 25693) }, + { AOM_CDF4(10261, 18011, 23261) }, + { AOM_CDF4(8016, 14655, 19666) } }, + { { AOM_CDF4(17616, 24586, 28112) }, + { AOM_CDF4(15809, 23299, 27155) }, + { AOM_CDF4(10767, 18890, 23793) }, + { AOM_CDF4(7727, 14255, 18865) }, + { AOM_CDF4(6129, 11926, 16882) }, + { AOM_CDF4(4482, 9704, 14861) }, + { 
AOM_CDF4(3277, 7452, 11522) }, + { AOM_CDF4(22956, 28551, 30730) }, + { AOM_CDF4(22724, 28937, 30961) }, + { AOM_CDF4(18467, 26324, 29580) }, + { AOM_CDF4(13234, 20713, 25649) }, + { AOM_CDF4(11181, 17592, 22481) }, + { AOM_CDF4(8291, 18358, 24576) }, + { AOM_CDF4(7568, 11881, 14984) }, + { AOM_CDF4(24948, 29001, 31147) }, + { AOM_CDF4(25674, 30619, 32151) }, + { AOM_CDF4(20841, 26793, 29603) }, + { AOM_CDF4(14669, 24356, 28666) }, + { AOM_CDF4(11334, 23593, 28219) }, + { AOM_CDF4(8922, 14762, 22873) }, + { AOM_CDF4(8301, 13544, 20535) } } }, + { { { AOM_CDF4(17113, 23733, 27081) }, + { AOM_CDF4(14139, 21406, 25452) }, + { AOM_CDF4(8552, 15002, 19776) }, + { AOM_CDF4(5871, 11120, 15378) }, + { AOM_CDF4(4455, 8616, 12253) }, + { AOM_CDF4(3469, 6910, 10386) }, + { AOM_CDF4(2255, 4553, 6782) }, + { AOM_CDF4(18224, 24376, 27053) }, + { AOM_CDF4(19290, 26710, 29614) }, + { AOM_CDF4(14936, 22991, 27184) }, + { AOM_CDF4(11238, 18951, 23762) }, + { AOM_CDF4(8786, 15617, 20588) }, + { AOM_CDF4(7317, 13228, 18003) }, + { AOM_CDF4(5101, 9512, 13493) }, + { AOM_CDF4(22639, 28222, 30210) }, + { AOM_CDF4(23216, 29331, 31307) }, + { AOM_CDF4(19075, 26762, 29895) }, + { AOM_CDF4(15014, 23113, 27457) }, + { AOM_CDF4(11938, 19857, 24752) }, + { AOM_CDF4(9942, 17280, 22282) }, + { AOM_CDF4(7167, 13144, 17752) } }, + { { AOM_CDF4(15820, 22738, 26488) }, + { AOM_CDF4(13530, 20885, 25216) }, + { AOM_CDF4(8395, 15530, 20452) }, + { AOM_CDF4(6574, 12321, 16380) }, + { AOM_CDF4(5353, 10419, 14568) }, + { AOM_CDF4(4613, 8446, 12381) }, + { AOM_CDF4(3440, 7158, 9903) }, + { AOM_CDF4(24247, 29051, 31224) }, + { AOM_CDF4(22118, 28058, 30369) }, + { AOM_CDF4(16498, 24768, 28389) }, + { AOM_CDF4(12920, 21175, 26137) }, + { AOM_CDF4(10730, 18619, 25352) }, + { AOM_CDF4(10187, 16279, 22791) }, + { AOM_CDF4(9310, 14631, 22127) }, + { AOM_CDF4(24970, 30558, 32057) }, + { AOM_CDF4(24801, 29942, 31698) }, + { AOM_CDF4(22432, 28453, 30855) }, + { AOM_CDF4(19054, 25680, 29580) }, + { AOM_CDF4(14392, 23036, 28109) }, + { AOM_CDF4(12495, 20947, 26650) }, + { AOM_CDF4(12442, 20326, 26214) } } }, + { { { AOM_CDF4(12162, 18785, 22648) }, + { AOM_CDF4(12749, 19697, 23806) }, + { AOM_CDF4(8580, 15297, 20346) }, + { AOM_CDF4(6169, 11749, 16543) }, + { AOM_CDF4(4836, 9391, 13448) }, + { AOM_CDF4(3821, 7711, 11613) }, + { AOM_CDF4(2228, 4601, 7070) }, + { AOM_CDF4(16319, 24725, 28280) }, + { AOM_CDF4(15698, 23277, 27168) }, + { AOM_CDF4(12726, 20368, 25047) }, + { AOM_CDF4(9912, 17015, 21976) }, + { AOM_CDF4(7888, 14220, 19179) }, + { AOM_CDF4(6777, 12284, 17018) }, + { AOM_CDF4(4492, 8590, 12252) }, + { AOM_CDF4(23249, 28904, 30947) }, + { AOM_CDF4(21050, 27908, 30512) }, + { AOM_CDF4(17440, 25340, 28949) }, + { AOM_CDF4(14059, 22018, 26541) }, + { AOM_CDF4(11288, 18903, 23898) }, + { AOM_CDF4(9411, 16342, 21428) }, + { AOM_CDF4(6278, 11588, 15944) } }, + { { AOM_CDF4(13981, 20067, 23226) }, + { AOM_CDF4(16922, 23580, 26783) }, + { AOM_CDF4(11005, 19039, 24487) }, + { AOM_CDF4(7389, 14218, 19798) }, + { AOM_CDF4(5598, 11505, 17206) }, + { AOM_CDF4(6090, 11213, 15659) }, + { AOM_CDF4(3820, 7371, 10119) }, + { AOM_CDF4(21082, 26925, 29675) }, + { AOM_CDF4(21262, 28627, 31128) }, + { AOM_CDF4(18392, 26454, 30437) }, + { AOM_CDF4(14870, 22910, 27096) }, + { AOM_CDF4(12620, 19484, 24908) }, + { AOM_CDF4(9290, 16553, 22802) }, + { AOM_CDF4(6668, 14288, 20004) }, + { AOM_CDF4(27704, 31055, 31949) }, + { AOM_CDF4(24709, 29978, 31788) }, + { AOM_CDF4(21668, 29264, 31657) }, + { AOM_CDF4(18295, 26968, 30074) }, + { AOM_CDF4(16399, 24422, 29313) 
}, + { AOM_CDF4(14347, 23026, 28104) }, + { AOM_CDF4(12370, 19806, 24477) } } }, + { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } } + }; + +static const aom_cdf_prob av1_default_coeff_base_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS] + [CDF_SIZE(NUM_BASE_LEVELS + + 2)] = { { { { { AOM_CDF4(4034, 8930, 12727) }, + { AOM_CDF4(18082, 29741, 31877) }, + { AOM_CDF4(12596, 26124, 30493) }, + { AOM_CDF4(9446, 21118, 27005) }, + { AOM_CDF4(6308, 15141, 21279) }, + { AOM_CDF4(2463, 6357, 9783) }, + { AOM_CDF4(20667, 30546, 31929) }, + { AOM_CDF4(13043, 26123, 30134) }, + { AOM_CDF4(8151, 18757, 24778) }, + { AOM_CDF4(5255, 12839, 18632) }, + { AOM_CDF4(2820, 7206, 11161) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(15736, 27553, 30604) }, + { AOM_CDF4(11210, 23794, 28787) }, + { AOM_CDF4(5947, 13874, 19701) }, + { AOM_CDF4(4215, 9323, 13891) }, + { AOM_CDF4(2833, 6462, 10059) }, + { AOM_CDF4(19605, 30393, 31582) }, + { AOM_CDF4(13523, 26252, 30248) }, + { AOM_CDF4(8446, 18622, 24512) }, + { AOM_CDF4(3818, 10343, 15974) }, + { AOM_CDF4(1481, 4117, 6796) }, + { AOM_CDF4(22649, 31302, 32190) }, + { AOM_CDF4(14829, 27127, 30449) }, + { AOM_CDF4(8313, 17702, 23304) }, + { AOM_CDF4(3022, 8301, 12786) }, + { AOM_CDF4(1536, 4412, 7184) }, + { AOM_CDF4(22354, 29774, 31372) }, + { AOM_CDF4(14723, 25472, 29214) }, + { AOM_CDF4(6673, 13745, 18662) }, + { AOM_CDF4(2068, 5766, 9322) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6302, 16444, 21761) }, + { AOM_CDF4(23040, 31538, 32475) }, + { AOM_CDF4(15196, 28452, 31496) }, + { AOM_CDF4(10020, 22946, 28514) }, + { AOM_CDF4(6533, 16862, 23501) }, + { AOM_CDF4(3538, 9816, 15076) }, + { AOM_CDF4(24444, 31875, 32525) }, + { AOM_CDF4(15881, 28924, 31635) }, + { 
AOM_CDF4(9922, 22873, 28466) }, + { AOM_CDF4(6527, 16966, 23691) }, + { AOM_CDF4(4114, 11303, 17220) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20201, 30770, 32209) }, + { AOM_CDF4(14754, 28071, 31258) }, + { AOM_CDF4(8378, 20186, 26517) }, + { AOM_CDF4(5916, 15299, 21978) }, + { AOM_CDF4(4268, 11583, 17901) }, + { AOM_CDF4(24361, 32025, 32581) }, + { AOM_CDF4(18673, 30105, 31943) }, + { AOM_CDF4(10196, 22244, 27576) }, + { AOM_CDF4(5495, 14349, 20417) }, + { AOM_CDF4(2676, 7415, 11498) }, + { AOM_CDF4(24678, 31958, 32585) }, + { AOM_CDF4(18629, 29906, 31831) }, + { AOM_CDF4(9364, 20724, 26315) }, + { AOM_CDF4(4641, 12318, 18094) }, + { AOM_CDF4(2758, 7387, 11579) }, + { AOM_CDF4(25433, 31842, 32469) }, + { AOM_CDF4(18795, 29289, 31411) }, + { AOM_CDF4(7644, 17584, 23592) }, + { AOM_CDF4(3408, 9014, 15047) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4536, 10072, 14001) }, + { AOM_CDF4(25459, 31416, 32206) }, + { AOM_CDF4(16605, 28048, 30818) }, + { AOM_CDF4(11008, 22857, 27719) }, + { AOM_CDF4(6915, 16268, 22315) }, + { AOM_CDF4(2625, 6812, 10537) }, + { AOM_CDF4(24257, 31788, 32499) }, + { AOM_CDF4(16880, 29454, 31879) }, + { AOM_CDF4(11958, 25054, 29778) }, + { AOM_CDF4(7916, 18718, 25084) }, + { AOM_CDF4(3383, 8777, 13446) }, + { AOM_CDF4(22720, 31603, 32393) }, + { AOM_CDF4(14960, 28125, 31335) }, + { AOM_CDF4(9731, 22210, 27928) }, + { AOM_CDF4(6304, 15832, 22277) }, + { AOM_CDF4(2910, 7818, 12166) }, + { AOM_CDF4(20375, 30627, 32131) }, + { AOM_CDF4(13904, 27284, 30887) }, + { AOM_CDF4(9368, 21558, 27144) }, + { AOM_CDF4(5937, 14966, 21119) }, + { AOM_CDF4(2667, 7225, 11319) }, + { AOM_CDF4(23970, 31470, 32378) }, + { AOM_CDF4(17173, 29734, 32018) }, + { AOM_CDF4(12795, 25441, 29965) }, + { AOM_CDF4(8981, 19680, 25893) }, + { AOM_CDF4(4728, 11372, 16902) }, + { AOM_CDF4(24287, 31797, 32439) }, + { AOM_CDF4(16703, 29145, 31696) }, + { AOM_CDF4(10833, 23554, 28725) }, + { AOM_CDF4(6468, 16566, 23057) }, + { AOM_CDF4(2415, 6562, 10278) }, + { AOM_CDF4(26610, 32395, 32659) }, + { AOM_CDF4(18590, 30498, 32117) }, + { AOM_CDF4(12420, 25756, 29950) }, + { AOM_CDF4(7639, 18746, 24710) }, + { AOM_CDF4(3001, 8086, 12347) }, + { AOM_CDF4(25076, 32064, 32580) }, + { AOM_CDF4(17946, 30128, 32028) }, + { AOM_CDF4(12024, 24985, 29378) }, + { AOM_CDF4(7517, 18390, 24304) }, + { AOM_CDF4(3243, 8781, 13331) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6037, 16771, 21957) }, + { AOM_CDF4(24774, 31704, 32426) }, + { AOM_CDF4(16830, 28589, 31056) }, + { AOM_CDF4(10602, 22828, 27760) }, + { AOM_CDF4(6733, 16829, 23071) }, + { AOM_CDF4(3250, 8914, 13556) }, + { AOM_CDF4(25582, 32220, 32668) }, + { AOM_CDF4(18659, 30342, 32223) }, + { AOM_CDF4(12546, 26149, 30515) }, + { AOM_CDF4(8420, 20451, 26801) }, + { AOM_CDF4(4636, 12420, 18344) }, + { AOM_CDF4(27581, 32362, 32639) }, + { AOM_CDF4(18987, 30083, 31978) }, + { AOM_CDF4(11327, 24248, 29084) }, + { AOM_CDF4(7264, 17719, 24120) }, + { AOM_CDF4(3995, 10768, 16169) }, + { AOM_CDF4(25893, 31831, 32487) }, + { AOM_CDF4(16577, 28587, 31379) }, + { AOM_CDF4(10189, 22748, 28182) }, + { AOM_CDF4(6832, 17094, 23556) }, + { AOM_CDF4(3708, 10110, 15334) }, + { AOM_CDF4(25904, 
32282, 32656) }, + { AOM_CDF4(19721, 30792, 32276) }, + { AOM_CDF4(12819, 26243, 30411) }, + { AOM_CDF4(8572, 20614, 26891) }, + { AOM_CDF4(5364, 14059, 20467) }, + { AOM_CDF4(26580, 32438, 32677) }, + { AOM_CDF4(20852, 31225, 32340) }, + { AOM_CDF4(12435, 25700, 29967) }, + { AOM_CDF4(8691, 20825, 26976) }, + { AOM_CDF4(4446, 12209, 17269) }, + { AOM_CDF4(27350, 32429, 32696) }, + { AOM_CDF4(21372, 30977, 32272) }, + { AOM_CDF4(12673, 25270, 29853) }, + { AOM_CDF4(9208, 20925, 26640) }, + { AOM_CDF4(5018, 13351, 18732) }, + { AOM_CDF4(27351, 32479, 32713) }, + { AOM_CDF4(21398, 31209, 32387) }, + { AOM_CDF4(12162, 25047, 29842) }, + { AOM_CDF4(7896, 18691, 25319) }, + { AOM_CDF4(4670, 12882, 18881) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5487, 10460, 13708) }, + { AOM_CDF4(21597, 28303, 30674) }, + { AOM_CDF4(11037, 21953, 26476) }, + { AOM_CDF4(8147, 17962, 22952) }, + { AOM_CDF4(5242, 13061, 18532) }, + { AOM_CDF4(1889, 5208, 8182) }, + { AOM_CDF4(26774, 32133, 32590) }, + { AOM_CDF4(17844, 29564, 31767) }, + { AOM_CDF4(11690, 24438, 29171) }, + { AOM_CDF4(7542, 18215, 24459) }, + { AOM_CDF4(2993, 8050, 12319) }, + { AOM_CDF4(28023, 32328, 32591) }, + { AOM_CDF4(18651, 30126, 31954) }, + { AOM_CDF4(12164, 25146, 29589) }, + { AOM_CDF4(7762, 18530, 24771) }, + { AOM_CDF4(3492, 9183, 13920) }, + { AOM_CDF4(27591, 32008, 32491) }, + { AOM_CDF4(17149, 28853, 31510) }, + { AOM_CDF4(11485, 24003, 28860) }, + { AOM_CDF4(7697, 18086, 24210) }, + { AOM_CDF4(3075, 7999, 12218) }, + { AOM_CDF4(28268, 32482, 32654) }, + { AOM_CDF4(19631, 31051, 32404) }, + { AOM_CDF4(13860, 27260, 31020) }, + { AOM_CDF4(9605, 21613, 27594) }, + { AOM_CDF4(4876, 12162, 17908) }, + { AOM_CDF4(27248, 32316, 32576) }, + { AOM_CDF4(18955, 30457, 32075) }, + { AOM_CDF4(11824, 23997, 28795) }, + { AOM_CDF4(7346, 18196, 24647) }, + { AOM_CDF4(3403, 9247, 14111) }, + { AOM_CDF4(29711, 32655, 32735) }, + { AOM_CDF4(21169, 31394, 32417) }, + { AOM_CDF4(13487, 27198, 30957) }, + { AOM_CDF4(8828, 21683, 27614) }, + { AOM_CDF4(4270, 11451, 17038) }, + { AOM_CDF4(28708, 32578, 32731) }, + { AOM_CDF4(20120, 31241, 32482) }, + { AOM_CDF4(13692, 27550, 31321) }, + { AOM_CDF4(9418, 22514, 28439) }, + { AOM_CDF4(4999, 13283, 19462) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5673, 14302, 19711) }, + { AOM_CDF4(26251, 30701, 31834) }, + { AOM_CDF4(12782, 23783, 27803) }, + { AOM_CDF4(9127, 20657, 25808) }, + { AOM_CDF4(6368, 16208, 21462) }, + { AOM_CDF4(2465, 7177, 10822) }, + { AOM_CDF4(29961, 32563, 32719) }, + { AOM_CDF4(18318, 29891, 31949) }, + { AOM_CDF4(11361, 24514, 29357) }, + { AOM_CDF4(7900, 19603, 25607) }, + { AOM_CDF4(4002, 10590, 15546) }, + { AOM_CDF4(29637, 32310, 32595) }, + { AOM_CDF4(18296, 29913, 31809) }, + { AOM_CDF4(10144, 21515, 26871) }, + { AOM_CDF4(5358, 14322, 20394) }, + { AOM_CDF4(3067, 8362, 13346) }, + { AOM_CDF4(28652, 32470, 32676) }, + { AOM_CDF4(17538, 30771, 32209) }, + { AOM_CDF4(13924, 26882, 30494) }, + { AOM_CDF4(10496, 22837, 27869) }, + { AOM_CDF4(7236, 16396, 21621) }, + { AOM_CDF4(30743, 32687, 32746) }, + { AOM_CDF4(23006, 31676, 32489) }, + { AOM_CDF4(14494, 27828, 31120) }, + { AOM_CDF4(10174, 22801, 28352) }, + { AOM_CDF4(6242, 15281, 21043) }, + { AOM_CDF4(25817, 32243, 32720) }, + { AOM_CDF4(18618, 31367, 32325) }, + { AOM_CDF4(13997, 28318, 31878) }, + { AOM_CDF4(12255, 26534, 31383) }, + { AOM_CDF4(9561, 21588, 28450) }, + { AOM_CDF4(28188, 32635, 32724) }, + { AOM_CDF4(22060, 32365, 32728) }, + { AOM_CDF4(18102, 30690, 32528) }, + { AOM_CDF4(14196, 
28864, 31999) }, + { AOM_CDF4(12262, 25792, 30865) }, + { AOM_CDF4(24176, 32109, 32628) }, + { AOM_CDF4(18280, 29681, 31963) }, + { AOM_CDF4(10205, 23703, 29664) }, + { AOM_CDF4(7889, 20025, 27676) }, + { AOM_CDF4(6060, 16743, 23970) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(5141, 7096, 8260) }, + { AOM_CDF4(27186, 29022, 29789) }, + { AOM_CDF4(6668, 12568, 15682) }, + { AOM_CDF4(2172, 6181, 8638) }, + { AOM_CDF4(1126, 3379, 4531) }, + { AOM_CDF4(443, 1361, 2254) }, + { AOM_CDF4(26083, 31153, 32436) }, + { AOM_CDF4(13486, 24603, 28483) }, + { AOM_CDF4(6508, 14840, 19910) }, + { AOM_CDF4(3386, 8800, 13286) }, + { AOM_CDF4(1530, 4322, 7054) }, + { AOM_CDF4(29639, 32080, 32548) }, + { AOM_CDF4(15897, 27552, 30290) }, + { AOM_CDF4(8588, 20047, 25383) }, + { AOM_CDF4(4889, 13339, 19269) }, + { AOM_CDF4(2240, 6871, 10498) }, + { AOM_CDF4(28165, 32197, 32517) }, + { AOM_CDF4(20735, 30427, 31568) }, + { AOM_CDF4(14325, 24671, 27692) }, + { AOM_CDF4(5119, 12554, 17805) }, + { AOM_CDF4(1810, 5441, 8261) }, + { AOM_CDF4(31212, 32724, 32748) }, + { AOM_CDF4(23352, 31766, 32545) }, + { AOM_CDF4(14669, 27570, 31059) }, + { AOM_CDF4(8492, 20894, 27272) }, + { AOM_CDF4(3644, 10194, 15204) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(2461, 7013, 9371) }, + { AOM_CDF4(24749, 29600, 30986) }, + { AOM_CDF4(9466, 19037, 22417) }, + { AOM_CDF4(3584, 9280, 14400) }, + { AOM_CDF4(1505, 3929, 5433) }, + { AOM_CDF4(677, 1500, 2736) }, + { AOM_CDF4(23987, 30702, 32117) }, + { AOM_CDF4(13554, 24571, 29263) }, + { AOM_CDF4(6211, 14556, 21155) }, + { AOM_CDF4(3135, 10972, 15625) }, + { AOM_CDF4(2435, 7127, 11427) }, + { AOM_CDF4(31300, 32532, 32550) }, + { AOM_CDF4(14757, 30365, 31954) }, + { AOM_CDF4(4405, 11612, 18553) }, + { AOM_CDF4(580, 4132, 7322) }, + { AOM_CDF4(1695, 10169, 14124) }, + { AOM_CDF4(30008, 32282, 32591) }, + { AOM_CDF4(19244, 30108, 31748) }, + { AOM_CDF4(11180, 24158, 29555) }, + { AOM_CDF4(5650, 14972, 19209) }, + { AOM_CDF4(2114, 5109, 8456) }, + { AOM_CDF4(31856, 32716, 32748) }, + { AOM_CDF4(23012, 31664, 32572) }, + { AOM_CDF4(13694, 26656, 30636) }, + { AOM_CDF4(8142, 19508, 26093) }, + { AOM_CDF4(4253, 10955, 16724) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(601, 983, 1311) }, + { AOM_CDF4(18725, 23406, 28087) }, + { AOM_CDF4(5461, 8192, 10923) }, + { AOM_CDF4(3781, 15124, 21425) }, + { AOM_CDF4(2587, 7761, 12072) }, + { AOM_CDF4(106, 458, 810) }, + { AOM_CDF4(22282, 29710, 31894) }, + 
{ AOM_CDF4(8508, 20926, 25984) }, + { AOM_CDF4(3726, 12713, 18083) }, + { AOM_CDF4(1620, 7112, 10893) }, + { AOM_CDF4(729, 2236, 3495) }, + { AOM_CDF4(30163, 32474, 32684) }, + { AOM_CDF4(18304, 30464, 32000) }, + { AOM_CDF4(11443, 26526, 29647) }, + { AOM_CDF4(6007, 15292, 21299) }, + { AOM_CDF4(2234, 6703, 8937) }, + { AOM_CDF4(30954, 32177, 32571) }, + { AOM_CDF4(17363, 29562, 31076) }, + { AOM_CDF4(9686, 22464, 27410) }, + { AOM_CDF4(8192, 16384, 21390) }, + { AOM_CDF4(1755, 8046, 11264) }, + { AOM_CDF4(31168, 32734, 32748) }, + { AOM_CDF4(22486, 31441, 32471) }, + { AOM_CDF4(12833, 25627, 29738) }, + { AOM_CDF4(6980, 17379, 23122) }, + { AOM_CDF4(3111, 8887, 13479) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(6041, 11854, 15927) }, + { AOM_CDF4(20326, 30905, 32251) }, + { AOM_CDF4(14164, 26831, 30725) }, + { AOM_CDF4(9760, 20647, 26585) }, + { AOM_CDF4(6416, 14953, 21219) }, + { AOM_CDF4(2966, 7151, 10891) }, + { AOM_CDF4(23567, 31374, 32254) }, + { AOM_CDF4(14978, 27416, 30946) }, + { AOM_CDF4(9434, 20225, 26254) }, + { AOM_CDF4(6658, 14558, 20535) }, + { AOM_CDF4(3916, 8677, 12989) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(18088, 29545, 31587) }, + { AOM_CDF4(13062, 25843, 30073) }, + { AOM_CDF4(8940, 16827, 22251) }, + { AOM_CDF4(7654, 13220, 17973) }, + { AOM_CDF4(5733, 10316, 14456) }, + { AOM_CDF4(22879, 31388, 32114) }, + { AOM_CDF4(15215, 27993, 30955) }, + { AOM_CDF4(9397, 19445, 24978) }, + { AOM_CDF4(3442, 9813, 15344) }, + { AOM_CDF4(1368, 3936, 6532) }, + { AOM_CDF4(25494, 32033, 32406) }, + { AOM_CDF4(16772, 27963, 30718) }, + { AOM_CDF4(9419, 18165, 23260) }, + { AOM_CDF4(2677, 7501, 11797) }, + { AOM_CDF4(1516, 4344, 7170) }, + { AOM_CDF4(26556, 31454, 32101) }, + { AOM_CDF4(17128, 27035, 30108) }, + { AOM_CDF4(8324, 15344, 20249) }, + { AOM_CDF4(1903, 5696, 9469) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8455, 19003, 24368) }, + { AOM_CDF4(23563, 32021, 32604) }, + { AOM_CDF4(16237, 29446, 31935) }, + { AOM_CDF4(10724, 23999, 29358) }, + { AOM_CDF4(6725, 17528, 24416) }, + { AOM_CDF4(3927, 10927, 16825) }, + { AOM_CDF4(26313, 32288, 32634) }, + { AOM_CDF4(17430, 30095, 32095) }, + { AOM_CDF4(11116, 24606, 29679) }, + { AOM_CDF4(7195, 18384, 25269) }, + { AOM_CDF4(4726, 12852, 19315) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(22822, 31648, 32483) }, + { AOM_CDF4(16724, 29633, 31929) }, + { AOM_CDF4(10261, 23033, 28725) }, + { AOM_CDF4(7029, 17840, 24528) }, + { AOM_CDF4(4867, 13886, 21502) }, + { AOM_CDF4(25298, 31892, 32491) }, + { AOM_CDF4(17809, 29330, 31512) }, + { AOM_CDF4(9668, 21329, 26579) }, + { AOM_CDF4(4774, 12956, 18976) }, + { AOM_CDF4(2322, 7030, 11540) }, + { AOM_CDF4(25472, 31920, 32543) }, + { AOM_CDF4(17957, 29387, 31632) }, + { AOM_CDF4(9196, 20593, 26400) }, + { AOM_CDF4(4680, 12705, 19202) }, + { AOM_CDF4(2917, 8456, 13436) }, + { AOM_CDF4(26471, 32059, 32574) }, + { AOM_CDF4(18458, 29783, 31909) }, + { AOM_CDF4(8400, 19464, 25956) }, + { AOM_CDF4(3812, 10973, 17206) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(6779, 13743, 17678) }, + { AOM_CDF4(24806, 31797, 32457) }, + { AOM_CDF4(17616, 29047, 31372) }, + { AOM_CDF4(11063, 23175, 28003) }, + { AOM_CDF4(6521, 16110, 22324) }, + { AOM_CDF4(2764, 7504, 11654) }, + { AOM_CDF4(25266, 32367, 32637) }, + { AOM_CDF4(19054, 30553, 32175) }, + { AOM_CDF4(12139, 25212, 29807) }, + { AOM_CDF4(7311, 18162, 24704) }, + { AOM_CDF4(3397, 9164, 14074) }, + { AOM_CDF4(25988, 32208, 32522) }, + { AOM_CDF4(16253, 28912, 31526) }, + { AOM_CDF4(9151, 21387, 27372) }, + { AOM_CDF4(5688, 14915, 21496) }, + { AOM_CDF4(2717, 7627, 12004) }, + { AOM_CDF4(23144, 31855, 32443) }, + { AOM_CDF4(16070, 28491, 31325) }, + { AOM_CDF4(8702, 20467, 26517) }, + { AOM_CDF4(5243, 13956, 20367) }, + { AOM_CDF4(2621, 7335, 11567) }, + { AOM_CDF4(26636, 32340, 32630) }, + { AOM_CDF4(19990, 31050, 32341) }, + { AOM_CDF4(13243, 26105, 30315) }, + { AOM_CDF4(8588, 19521, 25918) }, + { AOM_CDF4(4717, 11585, 17304) }, + { AOM_CDF4(25844, 32292, 32582) }, + { AOM_CDF4(19090, 30635, 32097) }, + { AOM_CDF4(11963, 24546, 28939) }, + { AOM_CDF4(6218, 16087, 22354) }, + { AOM_CDF4(2340, 6608, 10426) }, + { AOM_CDF4(28046, 32576, 32694) }, + { AOM_CDF4(21178, 31313, 32296) }, + { AOM_CDF4(13486, 26184, 29870) }, + { AOM_CDF4(7149, 17871, 
23723) }, + { AOM_CDF4(2833, 7958, 12259) }, + { AOM_CDF4(27710, 32528, 32686) }, + { AOM_CDF4(20674, 31076, 32268) }, + { AOM_CDF4(12413, 24955, 29243) }, + { AOM_CDF4(6676, 16927, 23097) }, + { AOM_CDF4(2966, 8333, 12919) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8639, 19339, 24429) }, + { AOM_CDF4(24404, 31837, 32525) }, + { AOM_CDF4(16997, 29425, 31784) }, + { AOM_CDF4(11253, 24234, 29149) }, + { AOM_CDF4(6751, 17394, 24028) }, + { AOM_CDF4(3490, 9830, 15191) }, + { AOM_CDF4(26283, 32471, 32714) }, + { AOM_CDF4(19599, 31168, 32442) }, + { AOM_CDF4(13146, 26954, 30893) }, + { AOM_CDF4(8214, 20588, 26890) }, + { AOM_CDF4(4699, 13081, 19300) }, + { AOM_CDF4(28212, 32458, 32669) }, + { AOM_CDF4(18594, 30316, 32100) }, + { AOM_CDF4(11219, 24408, 29234) }, + { AOM_CDF4(6865, 17656, 24149) }, + { AOM_CDF4(3678, 10362, 16006) }, + { AOM_CDF4(25825, 32136, 32616) }, + { AOM_CDF4(17313, 29853, 32021) }, + { AOM_CDF4(11197, 24471, 29472) }, + { AOM_CDF4(6947, 17781, 24405) }, + { AOM_CDF4(3768, 10660, 16261) }, + { AOM_CDF4(27352, 32500, 32706) }, + { AOM_CDF4(20850, 31468, 32469) }, + { AOM_CDF4(14021, 27707, 31133) }, + { AOM_CDF4(8964, 21748, 27838) }, + { AOM_CDF4(5437, 14665, 21187) }, + { AOM_CDF4(26304, 32492, 32698) }, + { AOM_CDF4(20409, 31380, 32385) }, + { AOM_CDF4(13682, 27222, 30632) }, + { AOM_CDF4(8974, 21236, 26685) }, + { AOM_CDF4(4234, 11665, 16934) }, + { AOM_CDF4(26273, 32357, 32711) }, + { AOM_CDF4(20672, 31242, 32441) }, + { AOM_CDF4(14172, 27254, 30902) }, + { AOM_CDF4(9870, 21898, 27275) }, + { AOM_CDF4(5164, 13506, 19270) }, + { AOM_CDF4(26725, 32459, 32728) }, + { AOM_CDF4(20991, 31442, 32527) }, + { AOM_CDF4(13071, 26434, 30811) }, + { AOM_CDF4(8184, 20090, 26742) }, + { AOM_CDF4(4803, 13255, 19895) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7555, 14942, 18501) }, + { AOM_CDF4(24410, 31178, 32287) }, + { AOM_CDF4(14394, 26738, 30253) }, + { AOM_CDF4(8413, 19554, 25195) }, + { AOM_CDF4(4766, 12924, 18785) }, + { AOM_CDF4(2029, 5806, 9207) }, + { AOM_CDF4(26776, 32364, 32663) }, + { AOM_CDF4(18732, 29967, 31931) }, + { AOM_CDF4(11005, 23786, 28852) }, + { AOM_CDF4(6466, 16909, 23510) }, + { AOM_CDF4(3044, 8638, 13419) }, + { AOM_CDF4(29208, 32582, 32704) }, + { AOM_CDF4(20068, 30857, 32208) }, + { AOM_CDF4(12003, 25085, 29595) }, + { AOM_CDF4(6947, 17750, 24189) }, + { AOM_CDF4(3245, 9103, 14007) }, + { AOM_CDF4(27359, 32465, 32669) }, + { AOM_CDF4(19421, 30614, 32174) }, + { AOM_CDF4(11915, 25010, 29579) }, + { AOM_CDF4(6950, 17676, 24074) }, + { AOM_CDF4(3007, 8473, 13096) }, + { AOM_CDF4(29002, 32676, 32735) }, + { AOM_CDF4(22102, 31849, 32576) }, + { AOM_CDF4(14408, 28009, 31405) }, + { AOM_CDF4(9027, 21679, 27931) }, + { AOM_CDF4(4694, 12678, 18748) }, + { AOM_CDF4(28216, 32528, 32682) }, + { AOM_CDF4(20849, 31264, 32318) }, + { AOM_CDF4(12756, 25815, 29751) }, + { AOM_CDF4(7565, 18801, 24923) }, + { AOM_CDF4(3509, 9533, 14477) }, + { AOM_CDF4(30133, 32687, 32739) }, + { AOM_CDF4(23063, 31910, 32515) }, + { AOM_CDF4(14588, 28051, 31132) }, + { AOM_CDF4(9085, 21649, 27457) }, + { AOM_CDF4(4261, 11654, 17264) }, + { AOM_CDF4(29518, 32691, 32748) }, + { AOM_CDF4(22451, 31959, 32613) }, + { AOM_CDF4(14864, 28722, 31700) }, + { AOM_CDF4(9695, 22964, 28716) }, + { AOM_CDF4(4932, 13358, 19502) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(6465, 16958, 21688) }, + { AOM_CDF4(25199, 31514, 32360) }, + { AOM_CDF4(14774, 27149, 30607) }, + { AOM_CDF4(9257, 21438, 26972) }, + { AOM_CDF4(5723, 15183, 21882) }, + { AOM_CDF4(3150, 8879, 
13731) }, + { AOM_CDF4(26989, 32262, 32682) }, + { AOM_CDF4(17396, 29937, 32085) }, + { AOM_CDF4(11387, 24901, 29784) }, + { AOM_CDF4(7289, 18821, 25548) }, + { AOM_CDF4(3734, 10577, 16086) }, + { AOM_CDF4(29728, 32501, 32695) }, + { AOM_CDF4(17431, 29701, 31903) }, + { AOM_CDF4(9921, 22826, 28300) }, + { AOM_CDF4(5896, 15434, 22068) }, + { AOM_CDF4(3430, 9646, 14757) }, + { AOM_CDF4(28614, 32511, 32705) }, + { AOM_CDF4(19364, 30638, 32263) }, + { AOM_CDF4(13129, 26254, 30402) }, + { AOM_CDF4(8754, 20484, 26440) }, + { AOM_CDF4(4378, 11607, 17110) }, + { AOM_CDF4(30292, 32671, 32744) }, + { AOM_CDF4(21780, 31603, 32501) }, + { AOM_CDF4(14314, 27829, 31291) }, + { AOM_CDF4(9611, 22327, 28263) }, + { AOM_CDF4(4890, 13087, 19065) }, + { AOM_CDF4(25862, 32567, 32733) }, + { AOM_CDF4(20794, 32050, 32567) }, + { AOM_CDF4(17243, 30625, 32254) }, + { AOM_CDF4(13283, 27628, 31474) }, + { AOM_CDF4(9669, 22532, 28918) }, + { AOM_CDF4(27435, 32697, 32748) }, + { AOM_CDF4(24922, 32390, 32714) }, + { AOM_CDF4(21449, 31504, 32536) }, + { AOM_CDF4(16392, 29729, 31832) }, + { AOM_CDF4(11692, 24884, 29076) }, + { AOM_CDF4(24193, 32290, 32735) }, + { AOM_CDF4(18909, 31104, 32563) }, + { AOM_CDF4(12236, 26841, 31403) }, + { AOM_CDF4(8171, 21840, 29082) }, + { AOM_CDF4(7224, 17280, 25275) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(3078, 6839, 9890) }, + { AOM_CDF4(13837, 20450, 24479) }, + { AOM_CDF4(5914, 14222, 19328) }, + { AOM_CDF4(3866, 10267, 14762) }, + { AOM_CDF4(2612, 7208, 11042) }, + { AOM_CDF4(1067, 2991, 4776) }, + { AOM_CDF4(25817, 31646, 32529) }, + { AOM_CDF4(13708, 26338, 30385) }, + { AOM_CDF4(7328, 18585, 24870) }, + { AOM_CDF4(4691, 13080, 19276) }, + { AOM_CDF4(1825, 5253, 8352) }, + { AOM_CDF4(29386, 32315, 32624) }, + { AOM_CDF4(17160, 29001, 31360) }, + { AOM_CDF4(9602, 21862, 27396) }, + { AOM_CDF4(5915, 15772, 22148) }, + { AOM_CDF4(2786, 7779, 12047) }, + { AOM_CDF4(29246, 32450, 32663) }, + { AOM_CDF4(18696, 29929, 31818) }, + { AOM_CDF4(10510, 23369, 28560) }, + { AOM_CDF4(6229, 16499, 23125) }, + { AOM_CDF4(2608, 7448, 11705) }, + { AOM_CDF4(30753, 32710, 32748) }, + { AOM_CDF4(21638, 31487, 32503) }, + { AOM_CDF4(12937, 26854, 30870) }, + { AOM_CDF4(8182, 20596, 26970) }, + { AOM_CDF4(3637, 10269, 15497) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(5244, 12150, 16906) }, + { AOM_CDF4(20486, 26858, 29701) }, + { AOM_CDF4(7756, 18317, 23735) }, + { AOM_CDF4(3452, 9256, 13146) }, + { AOM_CDF4(2020, 5206, 8229) }, + { AOM_CDF4(1801, 4993, 7903) }, + { AOM_CDF4(27051, 31858, 32531) }, + { AOM_CDF4(15988, 27531, 30619) }, + { AOM_CDF4(9188, 21484, 26719) }, + { AOM_CDF4(6273, 17186, 23800) }, + { AOM_CDF4(3108, 9355, 14764) }, + { AOM_CDF4(31076, 32520, 32680) }, + { AOM_CDF4(18119, 30037, 31850) }, + { AOM_CDF4(10244, 22969, 27472) }, + { AOM_CDF4(4692, 14077, 19273) }, + { AOM_CDF4(3694, 11677, 17556) }, + { AOM_CDF4(30060, 32581, 32720) }, + { AOM_CDF4(21011, 30775, 32120) }, + { AOM_CDF4(11931, 24820, 29289) }, + { 
AOM_CDF4(7119, 17662, 24356) }, + { AOM_CDF4(3833, 10706, 16304) }, + { AOM_CDF4(31954, 32731, 32748) }, + { AOM_CDF4(23913, 31724, 32489) }, + { AOM_CDF4(15520, 28060, 31286) }, + { AOM_CDF4(11517, 23008, 28571) }, + { AOM_CDF4(6193, 14508, 20629) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(1035, 2807, 4156) }, + { AOM_CDF4(13162, 18138, 20939) }, + { AOM_CDF4(2696, 6633, 8755) }, + { AOM_CDF4(1373, 4161, 6853) }, + { AOM_CDF4(1099, 2746, 4716) }, + { AOM_CDF4(340, 1021, 1599) }, + { AOM_CDF4(22826, 30419, 32135) }, + { AOM_CDF4(10395, 21762, 26942) }, + { AOM_CDF4(4726, 12407, 17361) }, + { AOM_CDF4(2447, 7080, 10593) }, + { AOM_CDF4(1227, 3717, 6011) }, + { AOM_CDF4(28156, 31424, 31934) }, + { AOM_CDF4(16915, 27754, 30373) }, + { AOM_CDF4(9148, 20990, 26431) }, + { AOM_CDF4(5950, 15515, 21148) }, + { AOM_CDF4(2492, 7327, 11526) }, + { AOM_CDF4(30602, 32477, 32670) }, + { AOM_CDF4(20026, 29955, 31568) }, + { AOM_CDF4(11220, 23628, 28105) }, + { AOM_CDF4(6652, 17019, 22973) }, + { AOM_CDF4(3064, 8536, 13043) }, + { AOM_CDF4(31769, 32724, 32748) }, + { AOM_CDF4(22230, 30887, 32373) }, + { AOM_CDF4(12234, 25079, 29731) }, + { AOM_CDF4(7326, 18816, 25353) }, + { AOM_CDF4(3933, 10907, 16616) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(8896, 16227, 20630) }, + { AOM_CDF4(23629, 31782, 32527) }, + { AOM_CDF4(15173, 27755, 31321) }, + { AOM_CDF4(10158, 21233, 27382) }, + { AOM_CDF4(6420, 14857, 21558) }, + { AOM_CDF4(3269, 8155, 12646) }, + { AOM_CDF4(24835, 32009, 32496) }, + { AOM_CDF4(16509, 28421, 31579) }, + { AOM_CDF4(10957, 21514, 27418) }, + { AOM_CDF4(7881, 15930, 22096) }, + { AOM_CDF4(5388, 10960, 15918) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(20745, 30773, 32093) }, + { AOM_CDF4(15200, 27221, 30861) }, + { AOM_CDF4(13032, 20873, 25667) }, + { AOM_CDF4(12285, 18663, 23494) }, + { AOM_CDF4(11563, 17481, 21489) }, + { AOM_CDF4(26260, 31982, 32320) }, + { AOM_CDF4(15397, 28083, 31100) }, + { AOM_CDF4(9742, 19217, 24824) }, + { AOM_CDF4(3261, 9629, 15362) }, + { AOM_CDF4(1480, 4322, 7499) }, + { AOM_CDF4(27599, 32256, 32460) }, + { AOM_CDF4(16857, 27659, 30774) }, + { AOM_CDF4(9551, 18290, 23748) }, + { AOM_CDF4(3052, 8933, 14103) }, + { AOM_CDF4(2021, 5910, 9787) }, + { AOM_CDF4(29005, 32015, 32392) }, + { AOM_CDF4(17677, 27694, 30863) }, + { AOM_CDF4(9204, 17356, 23219) }, + { AOM_CDF4(2403, 7516, 12814) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(10808, 22056, 26896) }, + { AOM_CDF4(25739, 32313, 32676) }, + { AOM_CDF4(17288, 30203, 32221) }, + { AOM_CDF4(11359, 24878, 29896) }, + { AOM_CDF4(6949, 17767, 24893) }, + { AOM_CDF4(4287, 11796, 18071) }, + { AOM_CDF4(27880, 32521, 32705) }, + { AOM_CDF4(19038, 31004, 32414) }, + { AOM_CDF4(12564, 26345, 30768) }, + { AOM_CDF4(8269, 19947, 26779) }, + { AOM_CDF4(5674, 14657, 21674) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(25742, 32319, 32671) }, + { AOM_CDF4(19557, 31164, 32454) }, + { AOM_CDF4(13381, 26381, 30755) }, + { AOM_CDF4(10101, 21466, 26722) }, + { AOM_CDF4(9209, 19650, 26825) }, + { AOM_CDF4(27107, 31917, 32432) }, + { AOM_CDF4(18056, 28893, 31203) }, + { AOM_CDF4(10200, 21434, 26764) }, + { AOM_CDF4(4660, 12913, 19502) }, + { AOM_CDF4(2368, 6930, 12504) }, + { AOM_CDF4(26960, 32158, 32613) }, + { AOM_CDF4(18628, 30005, 32031) }, + { AOM_CDF4(10233, 22442, 28232) }, + { AOM_CDF4(5471, 14630, 21516) }, + { AOM_CDF4(3235, 10767, 17109) }, + { AOM_CDF4(27696, 32440, 32692) }, + { AOM_CDF4(20032, 31167, 32438) }, + { AOM_CDF4(8700, 21341, 28442) }, + { AOM_CDF4(5662, 14831, 21795) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9704, 17294, 21132) }, + { AOM_CDF4(26762, 32278, 32633) }, + { AOM_CDF4(18382, 29620, 31819) }, + { AOM_CDF4(10891, 23475, 28723) }, + { AOM_CDF4(6358, 16583, 
23309) }, + { AOM_CDF4(3248, 9118, 14141) }, + { AOM_CDF4(27204, 32573, 32699) }, + { AOM_CDF4(19818, 30824, 32329) }, + { AOM_CDF4(11772, 25120, 30041) }, + { AOM_CDF4(6995, 18033, 25039) }, + { AOM_CDF4(3752, 10442, 16098) }, + { AOM_CDF4(27222, 32256, 32559) }, + { AOM_CDF4(15356, 28399, 31475) }, + { AOM_CDF4(8821, 20635, 27057) }, + { AOM_CDF4(5511, 14404, 21239) }, + { AOM_CDF4(2935, 8222, 13051) }, + { AOM_CDF4(24875, 32120, 32529) }, + { AOM_CDF4(15233, 28265, 31445) }, + { AOM_CDF4(8605, 20570, 26932) }, + { AOM_CDF4(5431, 14413, 21196) }, + { AOM_CDF4(2994, 8341, 13223) }, + { AOM_CDF4(28201, 32604, 32700) }, + { AOM_CDF4(21041, 31446, 32456) }, + { AOM_CDF4(13221, 26213, 30475) }, + { AOM_CDF4(8255, 19385, 26037) }, + { AOM_CDF4(4930, 12585, 18830) }, + { AOM_CDF4(28768, 32448, 32627) }, + { AOM_CDF4(19705, 30561, 32021) }, + { AOM_CDF4(11572, 23589, 28220) }, + { AOM_CDF4(5532, 15034, 21446) }, + { AOM_CDF4(2460, 7150, 11456) }, + { AOM_CDF4(29874, 32619, 32699) }, + { AOM_CDF4(21621, 31071, 32201) }, + { AOM_CDF4(12511, 24747, 28992) }, + { AOM_CDF4(6281, 16395, 22748) }, + { AOM_CDF4(3246, 9278, 14497) }, + { AOM_CDF4(29715, 32625, 32712) }, + { AOM_CDF4(20958, 31011, 32283) }, + { AOM_CDF4(11233, 23671, 28806) }, + { AOM_CDF4(6012, 16128, 22868) }, + { AOM_CDF4(3427, 9851, 15414) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11016, 22111, 26794) }, + { AOM_CDF4(25946, 32357, 32677) }, + { AOM_CDF4(17890, 30452, 32252) }, + { AOM_CDF4(11678, 25142, 29816) }, + { AOM_CDF4(6720, 17534, 24584) }, + { AOM_CDF4(4230, 11665, 17820) }, + { AOM_CDF4(28400, 32623, 32747) }, + { AOM_CDF4(21164, 31668, 32575) }, + { AOM_CDF4(13572, 27388, 31182) }, + { AOM_CDF4(8234, 20750, 27358) }, + { AOM_CDF4(5065, 14055, 20897) }, + { AOM_CDF4(28981, 32547, 32705) }, + { AOM_CDF4(18681, 30543, 32239) }, + { AOM_CDF4(10919, 24075, 29286) }, + { AOM_CDF4(6431, 17199, 24077) }, + { AOM_CDF4(3819, 10464, 16618) }, + { AOM_CDF4(26870, 32467, 32693) }, + { AOM_CDF4(19041, 30831, 32347) }, + { AOM_CDF4(11794, 25211, 30016) }, + { AOM_CDF4(6888, 18019, 24970) }, + { AOM_CDF4(4370, 12363, 18992) }, + { AOM_CDF4(29578, 32670, 32744) }, + { AOM_CDF4(23159, 32007, 32613) }, + { AOM_CDF4(15315, 28669, 31676) }, + { AOM_CDF4(9298, 22607, 28782) }, + { AOM_CDF4(6144, 15913, 22968) }, + { AOM_CDF4(28110, 32499, 32669) }, + { AOM_CDF4(21574, 30937, 32015) }, + { AOM_CDF4(12759, 24818, 28727) }, + { AOM_CDF4(6545, 16761, 23042) }, + { AOM_CDF4(3649, 10597, 16833) }, + { AOM_CDF4(28163, 32552, 32728) }, + { AOM_CDF4(22101, 31469, 32464) }, + { AOM_CDF4(13160, 25472, 30143) }, + { AOM_CDF4(7303, 18684, 25468) }, + { AOM_CDF4(5241, 13975, 20955) }, + { AOM_CDF4(28400, 32631, 32744) }, + { AOM_CDF4(22104, 31793, 32603) }, + { AOM_CDF4(13557, 26571, 30846) }, + { AOM_CDF4(7749, 19861, 26675) }, + { AOM_CDF4(4873, 14030, 21234) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(9800, 17635, 21073) }, + { AOM_CDF4(26153, 31885, 32527) }, + { AOM_CDF4(15038, 27852, 31006) }, + { AOM_CDF4(8718, 20564, 26486) }, + { AOM_CDF4(5128, 14076, 20514) }, + { AOM_CDF4(2636, 7566, 11925) }, + { AOM_CDF4(27551, 32504, 32701) }, + { AOM_CDF4(18310, 30054, 32100) }, + { AOM_CDF4(10211, 23420, 29082) }, + { AOM_CDF4(6222, 16876, 23916) }, + { AOM_CDF4(3462, 9954, 15498) }, + { AOM_CDF4(29991, 32633, 32721) }, + { AOM_CDF4(19883, 30751, 32201) }, + { AOM_CDF4(11141, 24184, 29285) }, + { AOM_CDF4(6420, 16940, 23774) }, + { AOM_CDF4(3392, 9753, 15118) }, + { AOM_CDF4(28465, 32616, 32712) }, + { AOM_CDF4(19850, 30702, 
32244) }, + { AOM_CDF4(10983, 24024, 29223) }, + { AOM_CDF4(6294, 16770, 23582) }, + { AOM_CDF4(3244, 9283, 14509) }, + { AOM_CDF4(30023, 32717, 32748) }, + { AOM_CDF4(22940, 32032, 32626) }, + { AOM_CDF4(14282, 27928, 31473) }, + { AOM_CDF4(8562, 21327, 27914) }, + { AOM_CDF4(4846, 13393, 19919) }, + { AOM_CDF4(29981, 32590, 32695) }, + { AOM_CDF4(20465, 30963, 32166) }, + { AOM_CDF4(11479, 23579, 28195) }, + { AOM_CDF4(5916, 15648, 22073) }, + { AOM_CDF4(3031, 8605, 13398) }, + { AOM_CDF4(31146, 32691, 32739) }, + { AOM_CDF4(23106, 31724, 32444) }, + { AOM_CDF4(13783, 26738, 30439) }, + { AOM_CDF4(7852, 19468, 25807) }, + { AOM_CDF4(3860, 11124, 16853) }, + { AOM_CDF4(31014, 32724, 32748) }, + { AOM_CDF4(23629, 32109, 32628) }, + { AOM_CDF4(14747, 28115, 31403) }, + { AOM_CDF4(8545, 21242, 27478) }, + { AOM_CDF4(4574, 12781, 19067) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9185, 19694, 24688) }, + { AOM_CDF4(26081, 31985, 32621) }, + { AOM_CDF4(16015, 29000, 31787) }, + { AOM_CDF4(10542, 23690, 29206) }, + { AOM_CDF4(6732, 17945, 24677) }, + { AOM_CDF4(3916, 11039, 16722) }, + { AOM_CDF4(28224, 32566, 32744) }, + { AOM_CDF4(19100, 31138, 32485) }, + { AOM_CDF4(12528, 26620, 30879) }, + { AOM_CDF4(7741, 20277, 26885) }, + { AOM_CDF4(4566, 12845, 18990) }, + { AOM_CDF4(29933, 32593, 32718) }, + { AOM_CDF4(17670, 30333, 32155) }, + { AOM_CDF4(10385, 23600, 28909) }, + { AOM_CDF4(6243, 16236, 22407) }, + { AOM_CDF4(3976, 10389, 16017) }, + { AOM_CDF4(28377, 32561, 32738) }, + { AOM_CDF4(19366, 31175, 32482) }, + { AOM_CDF4(13327, 27175, 31094) }, + { AOM_CDF4(8258, 20769, 27143) }, + { AOM_CDF4(4703, 13198, 19527) }, + { AOM_CDF4(31086, 32706, 32748) }, + { AOM_CDF4(22853, 31902, 32583) }, + { AOM_CDF4(14759, 28186, 31419) }, + { AOM_CDF4(9284, 22382, 28348) }, + { AOM_CDF4(5585, 15192, 21868) }, + { AOM_CDF4(28291, 32652, 32746) }, + { AOM_CDF4(19849, 32107, 32571) }, + { AOM_CDF4(14834, 26818, 29214) }, + { AOM_CDF4(10306, 22594, 28672) }, + { AOM_CDF4(6615, 17384, 23384) }, + { AOM_CDF4(28947, 32604, 32745) }, + { AOM_CDF4(25625, 32289, 32646) }, + { AOM_CDF4(18758, 28672, 31403) }, + { AOM_CDF4(10017, 23430, 28523) }, + { AOM_CDF4(6862, 15269, 22131) }, + { AOM_CDF4(23933, 32509, 32739) }, + { AOM_CDF4(19927, 31495, 32631) }, + { AOM_CDF4(11903, 26023, 30621) }, + { AOM_CDF4(7026, 20094, 27252) }, + { AOM_CDF4(5998, 18106, 24437) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4456, 11274, 15533) }, + { AOM_CDF4(21219, 29079, 31616) }, + { AOM_CDF4(11173, 23774, 28567) }, + { AOM_CDF4(7282, 18293, 24263) }, + { AOM_CDF4(4890, 13286, 19115) }, + { AOM_CDF4(1890, 5508, 8659) }, + { AOM_CDF4(26651, 32136, 32647) }, + { AOM_CDF4(14630, 28254, 31455) }, + { AOM_CDF4(8716, 21287, 27395) }, + { AOM_CDF4(5615, 15331, 22008) }, + { AOM_CDF4(2675, 7700, 12150) }, + { AOM_CDF4(29954, 32526, 32690) }, + { AOM_CDF4(16126, 28982, 31633) }, + { AOM_CDF4(9030, 21361, 27352) }, + { AOM_CDF4(5411, 14793, 21271) }, + { AOM_CDF4(2943, 8422, 13163) }, + { AOM_CDF4(29539, 32601, 32730) }, + { AOM_CDF4(18125, 30385, 32201) }, + { AOM_CDF4(10422, 24090, 29468) }, + { AOM_CDF4(6468, 17487, 24438) }, + { AOM_CDF4(2970, 8653, 13531) }, + { AOM_CDF4(30912, 32715, 32748) }, + { AOM_CDF4(20666, 31373, 32497) }, + { AOM_CDF4(12509, 26640, 30917) }, + { AOM_CDF4(8058, 20629, 27290) }, + { AOM_CDF4(4231, 12006, 18052) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) 
}, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(10202, 20633, 25484) }, + { AOM_CDF4(27336, 31445, 32352) }, + { AOM_CDF4(12420, 24384, 28552) }, + { AOM_CDF4(7648, 18115, 23856) }, + { AOM_CDF4(5662, 14341, 19902) }, + { AOM_CDF4(3611, 10328, 15390) }, + { AOM_CDF4(30945, 32616, 32736) }, + { AOM_CDF4(18682, 30505, 32253) }, + { AOM_CDF4(11513, 25336, 30203) }, + { AOM_CDF4(7449, 19452, 26148) }, + { AOM_CDF4(4482, 13051, 18886) }, + { AOM_CDF4(32022, 32690, 32747) }, + { AOM_CDF4(18578, 30501, 32146) }, + { AOM_CDF4(11249, 23368, 28631) }, + { AOM_CDF4(5645, 16958, 22158) }, + { AOM_CDF4(5009, 11444, 16637) }, + { AOM_CDF4(31357, 32710, 32748) }, + { AOM_CDF4(21552, 31494, 32504) }, + { AOM_CDF4(13891, 27677, 31340) }, + { AOM_CDF4(9051, 22098, 28172) }, + { AOM_CDF4(5190, 13377, 19486) }, + { AOM_CDF4(32364, 32740, 32748) }, + { AOM_CDF4(24839, 31907, 32551) }, + { AOM_CDF4(17160, 28779, 31696) }, + { AOM_CDF4(12452, 24137, 29602) }, + { AOM_CDF4(6165, 15389, 22477) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(2575, 7281, 11077) }, + { AOM_CDF4(14002, 20866, 25402) }, + { AOM_CDF4(6343, 15056, 19658) }, + { AOM_CDF4(4474, 11858, 17041) }, + { AOM_CDF4(2865, 8299, 12534) }, + { AOM_CDF4(1344, 3949, 6391) }, + { AOM_CDF4(24720, 31239, 32459) }, + { AOM_CDF4(12585, 25356, 29968) }, + { AOM_CDF4(7181, 18246, 24444) }, + { AOM_CDF4(5025, 13667, 19885) }, + { AOM_CDF4(2521, 7304, 11605) }, + { AOM_CDF4(29908, 32252, 32584) }, + { AOM_CDF4(17421, 29156, 31575) }, + { AOM_CDF4(9889, 22188, 27782) }, + { AOM_CDF4(5878, 15647, 22123) }, + { AOM_CDF4(2814, 8665, 13323) }, + { AOM_CDF4(30183, 32568, 32713) }, + { AOM_CDF4(18528, 30195, 32049) }, + { AOM_CDF4(10982, 24606, 29657) }, + { AOM_CDF4(6957, 18165, 25231) }, + { AOM_CDF4(3508, 10118, 15468) }, + { AOM_CDF4(31761, 32736, 32748) }, + { AOM_CDF4(21041, 31328, 32546) }, + { AOM_CDF4(12568, 26732, 31166) }, + { AOM_CDF4(8052, 20720, 27733) }, + { AOM_CDF4(4336, 12192, 18396) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 
16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } }, + { { { { AOM_CDF4(7062, 16472, 22319) }, + { AOM_CDF4(24538, 32261, 32674) }, + { AOM_CDF4(13675, 28041, 31779) }, + { AOM_CDF4(8590, 20674, 27631) }, + { AOM_CDF4(5685, 14675, 22013) }, + { AOM_CDF4(3655, 9898, 15731) }, + { AOM_CDF4(26493, 32418, 32658) }, + { AOM_CDF4(16376, 29342, 32090) }, + { AOM_CDF4(10594, 22649, 28970) }, + { AOM_CDF4(8176, 17170, 24303) }, + { AOM_CDF4(5605, 12694, 19139) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(23888, 31902, 32542) }, + { AOM_CDF4(18612, 29687, 31987) }, + { AOM_CDF4(16245, 24852, 29249) }, + { AOM_CDF4(15765, 22608, 27559) }, + { AOM_CDF4(19895, 24699, 27510) }, + { AOM_CDF4(28401, 32212, 32457) }, + { AOM_CDF4(15274, 27825, 30980) }, + { AOM_CDF4(9364, 18128, 24332) }, + { AOM_CDF4(2283, 8193, 15082) }, + { AOM_CDF4(1228, 3972, 7881) }, + { AOM_CDF4(29455, 32469, 32620) }, + { AOM_CDF4(17981, 28245, 31388) }, + { AOM_CDF4(10921, 20098, 26240) }, + { AOM_CDF4(3743, 11829, 18657) }, + { AOM_CDF4(2374, 9593, 15715) }, + { AOM_CDF4(31068, 32466, 32635) }, + { AOM_CDF4(20321, 29572, 31971) }, + { AOM_CDF4(10771, 20255, 27119) }, + { AOM_CDF4(2795, 10410, 17361) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(9320, 22102, 27840) }, + { AOM_CDF4(27057, 32464, 32724) }, + { AOM_CDF4(16331, 30268, 32309) }, + { AOM_CDF4(10319, 23935, 29720) }, + { AOM_CDF4(6189, 16448, 24106) }, + { AOM_CDF4(3589, 10884, 18808) }, + { AOM_CDF4(29026, 32624, 32748) }, + { AOM_CDF4(19226, 31507, 32587) }, + { AOM_CDF4(12692, 26921, 31203) }, + { AOM_CDF4(7049, 19532, 27635) }, + { AOM_CDF4(7727, 15669, 23252) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { 
AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(28056, 32625, 32748) }, + { AOM_CDF4(22383, 32075, 32669) }, + { AOM_CDF4(15417, 27098, 31749) }, + { AOM_CDF4(18127, 26493, 27190) }, + { AOM_CDF4(5461, 16384, 21845) }, + { AOM_CDF4(27982, 32091, 32584) }, + { AOM_CDF4(19045, 29868, 31972) }, + { AOM_CDF4(10397, 22266, 27932) }, + { AOM_CDF4(5990, 13697, 21500) }, + { AOM_CDF4(1792, 6912, 15104) }, + { AOM_CDF4(28198, 32501, 32718) }, + { AOM_CDF4(21534, 31521, 32569) }, + { AOM_CDF4(11109, 25217, 30017) }, + { AOM_CDF4(5671, 15124, 26151) }, + { AOM_CDF4(4681, 14043, 18725) }, + { AOM_CDF4(28688, 32580, 32741) }, + { AOM_CDF4(22576, 32079, 32661) }, + { AOM_CDF4(10627, 22141, 28340) }, + { AOM_CDF4(9362, 14043, 28087) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7754, 16948, 22142) }, + { AOM_CDF4(25670, 32330, 32691) }, + { AOM_CDF4(15663, 29225, 31994) }, + { AOM_CDF4(9878, 23288, 29158) }, + { AOM_CDF4(6419, 17088, 24336) }, + { AOM_CDF4(3859, 11003, 17039) }, + { AOM_CDF4(27562, 32595, 32725) }, + { AOM_CDF4(17575, 30588, 32399) }, + { AOM_CDF4(10819, 24838, 30309) }, + { AOM_CDF4(7124, 18686, 25916) }, + { AOM_CDF4(4479, 12688, 19340) }, + { AOM_CDF4(28385, 32476, 32673) }, + { AOM_CDF4(15306, 29005, 31938) }, + { AOM_CDF4(8937, 21615, 28322) }, + { AOM_CDF4(5982, 15603, 22786) }, + { AOM_CDF4(3620, 10267, 16136) }, + { AOM_CDF4(27280, 32464, 32667) }, + { AOM_CDF4(15607, 29160, 32004) }, + { AOM_CDF4(9091, 22135, 28740) }, + { AOM_CDF4(6232, 16632, 24020) }, + { AOM_CDF4(4047, 11377, 17672) }, + { AOM_CDF4(29220, 32630, 32718) }, + { AOM_CDF4(19650, 31220, 32462) }, + { AOM_CDF4(13050, 26312, 30827) }, + { AOM_CDF4(9228, 20870, 27468) }, + { AOM_CDF4(6146, 15149, 21971) }, + { AOM_CDF4(30169, 32481, 32623) }, + { AOM_CDF4(17212, 29311, 31554) }, + { AOM_CDF4(9911, 21311, 26882) }, + { AOM_CDF4(4487, 13314, 20372) }, + { AOM_CDF4(2570, 7772, 12889) }, + { AOM_CDF4(30924, 32613, 32708) }, + { AOM_CDF4(19490, 30206, 32107) }, + { AOM_CDF4(11232, 23998, 29276) }, + { AOM_CDF4(6769, 17955, 25035) }, + { AOM_CDF4(4398, 12623, 19214) }, + { AOM_CDF4(30609, 32627, 32722) }, + { AOM_CDF4(19370, 30582, 32287) }, + { AOM_CDF4(10457, 23619, 29409) }, + { AOM_CDF4(6443, 17637, 24834) }, + { AOM_CDF4(4645, 13236, 20106) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8626, 20271, 26216) }, + { AOM_CDF4(26707, 32406, 32711) }, + { AOM_CDF4(16999, 30329, 32286) }, + { AOM_CDF4(11445, 25123, 30286) }, + { AOM_CDF4(6411, 18828, 25601) }, + { AOM_CDF4(6801, 12458, 20248) }, + { AOM_CDF4(29918, 32682, 32748) }, + { AOM_CDF4(20649, 31739, 32618) }, + { AOM_CDF4(12879, 27773, 31581) }, + { AOM_CDF4(7896, 21751, 28244) }, + { AOM_CDF4(5260, 14870, 23698) }, + { AOM_CDF4(29252, 32593, 32731) }, + { AOM_CDF4(17072, 30460, 32294) }, + { AOM_CDF4(10653, 24143, 29365) }, + { AOM_CDF4(6536, 17490, 23983) }, + { AOM_CDF4(4929, 13170, 20085) }, + { AOM_CDF4(28137, 32518, 32715) }, + { AOM_CDF4(18171, 30784, 32407) }, + { AOM_CDF4(11437, 25436, 30459) }, + { AOM_CDF4(7252, 18534, 26176) }, + { AOM_CDF4(4126, 13353, 20978) }, + { AOM_CDF4(31162, 32726, 32748) }, + { AOM_CDF4(23017, 32222, 32701) }, + { AOM_CDF4(15629, 29233, 32046) }, + { AOM_CDF4(9387, 22621, 29480) }, + { AOM_CDF4(6922, 17616, 25010) }, + { AOM_CDF4(28838, 32265, 32614) }, + { AOM_CDF4(19701, 30206, 31920) }, + { AOM_CDF4(11214, 22410, 27933) }, + { 
AOM_CDF4(5320, 14177, 23034) }, + { AOM_CDF4(5049, 12881, 17827) }, + { AOM_CDF4(27484, 32471, 32734) }, + { AOM_CDF4(21076, 31526, 32561) }, + { AOM_CDF4(12707, 26303, 31211) }, + { AOM_CDF4(8169, 21722, 28219) }, + { AOM_CDF4(6045, 19406, 27042) }, + { AOM_CDF4(27753, 32572, 32745) }, + { AOM_CDF4(20832, 31878, 32653) }, + { AOM_CDF4(13250, 27356, 31674) }, + { AOM_CDF4(7718, 21508, 29858) }, + { AOM_CDF4(7209, 18350, 25559) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(7876, 16901, 21741) }, + { AOM_CDF4(24001, 31898, 32625) }, + { AOM_CDF4(14529, 27959, 31451) }, + { AOM_CDF4(8273, 20818, 27258) }, + { AOM_CDF4(5278, 14673, 21510) }, + { AOM_CDF4(2983, 8843, 14039) }, + { AOM_CDF4(28016, 32574, 32732) }, + { AOM_CDF4(17471, 30306, 32301) }, + { AOM_CDF4(10224, 24063, 29728) }, + { AOM_CDF4(6602, 17954, 25052) }, + { AOM_CDF4(4002, 11585, 17759) }, + { AOM_CDF4(30190, 32634, 32739) }, + { AOM_CDF4(17497, 30282, 32270) }, + { AOM_CDF4(10229, 23729, 29538) }, + { AOM_CDF4(6344, 17211, 24440) }, + { AOM_CDF4(3849, 11189, 17108) }, + { AOM_CDF4(28570, 32583, 32726) }, + { AOM_CDF4(17521, 30161, 32238) }, + { AOM_CDF4(10153, 23565, 29378) }, + { AOM_CDF4(6455, 17341, 24443) }, + { AOM_CDF4(3907, 11042, 17024) }, + { AOM_CDF4(30689, 32715, 32748) }, + { AOM_CDF4(21546, 31840, 32610) }, + { AOM_CDF4(13547, 27581, 31459) }, + { AOM_CDF4(8912, 21757, 28309) }, + { AOM_CDF4(5548, 15080, 22046) }, + { AOM_CDF4(30783, 32540, 32685) }, + { AOM_CDF4(17540, 29528, 31668) }, + { AOM_CDF4(10160, 21468, 26783) }, + { AOM_CDF4(4724, 13393, 20054) }, + { AOM_CDF4(2702, 8174, 13102) }, + { AOM_CDF4(31648, 32686, 32742) }, + { AOM_CDF4(20954, 31094, 32337) }, + { AOM_CDF4(12420, 25698, 30179) }, + { AOM_CDF4(7304, 19320, 26248) }, + { AOM_CDF4(4366, 12261, 18864) }, + { AOM_CDF4(31581, 32723, 32748) }, + { AOM_CDF4(21373, 31586, 32525) }, + { AOM_CDF4(12744, 26625, 30885) }, + { AOM_CDF4(7431, 20322, 26950) }, + { AOM_CDF4(4692, 13323, 20111) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(7833, 18369, 24095) }, + { AOM_CDF4(26650, 32273, 32702) }, + { AOM_CDF4(16371, 29961, 32191) }, + { AOM_CDF4(11055, 24082, 29629) }, + { AOM_CDF4(6892, 18644, 25400) }, + { AOM_CDF4(5006, 13057, 19240) }, + { AOM_CDF4(29834, 32666, 32748) }, + { AOM_CDF4(19577, 31335, 32570) }, + { AOM_CDF4(12253, 26509, 31122) }, + { AOM_CDF4(7991, 20772, 27711) }, + { AOM_CDF4(5677, 15910, 23059) }, + { AOM_CDF4(30109, 32532, 32720) }, + { AOM_CDF4(16747, 30166, 32252) }, + { AOM_CDF4(10134, 23542, 29184) }, + { AOM_CDF4(5791, 16176, 23556) }, + { AOM_CDF4(4362, 10414, 17284) }, + { AOM_CDF4(29492, 32626, 32748) }, + { AOM_CDF4(19894, 31402, 32525) }, + { AOM_CDF4(12942, 27071, 30869) }, + { AOM_CDF4(8346, 21216, 27405) }, + { AOM_CDF4(6572, 17087, 23859) }, + { AOM_CDF4(32035, 32735, 32748) }, + { AOM_CDF4(22957, 31838, 32618) }, + { AOM_CDF4(14724, 28572, 31772) }, + { AOM_CDF4(10364, 23999, 29553) }, + { AOM_CDF4(7004, 18433, 25655) }, + { AOM_CDF4(27528, 32277, 32681) }, + { AOM_CDF4(16959, 31171, 32096) }, + { AOM_CDF4(10486, 23593, 27962) }, + { AOM_CDF4(8192, 16384, 23211) }, + { AOM_CDF4(8937, 17873, 20852) }, + { AOM_CDF4(27715, 32002, 32615) }, + { AOM_CDF4(15073, 29491, 31676) }, + { AOM_CDF4(11264, 24576, 28672) }, + { AOM_CDF4(2341, 18725, 23406) }, + { AOM_CDF4(7282, 18204, 25486) }, + { AOM_CDF4(28547, 32213, 32657) }, + { AOM_CDF4(20788, 29773, 32239) }, + { AOM_CDF4(6780, 21469, 30508) }, + { AOM_CDF4(5958, 14895, 23831) }, + { AOM_CDF4(16384, 21845, 27307) }, + { AOM_CDF4(8192, 16384, 24576) } } 
}, + { { { AOM_CDF4(5992, 14304, 19765) }, + { AOM_CDF4(22612, 31238, 32456) }, + { AOM_CDF4(13456, 27162, 31087) }, + { AOM_CDF4(8001, 20062, 26504) }, + { AOM_CDF4(5168, 14105, 20764) }, + { AOM_CDF4(2632, 7771, 12385) }, + { AOM_CDF4(27034, 32344, 32709) }, + { AOM_CDF4(15850, 29415, 31997) }, + { AOM_CDF4(9494, 22776, 28841) }, + { AOM_CDF4(6151, 16830, 23969) }, + { AOM_CDF4(3461, 10039, 15722) }, + { AOM_CDF4(30134, 32569, 32731) }, + { AOM_CDF4(15638, 29422, 31945) }, + { AOM_CDF4(9150, 21865, 28218) }, + { AOM_CDF4(5647, 15719, 22676) }, + { AOM_CDF4(3402, 9772, 15477) }, + { AOM_CDF4(28530, 32586, 32735) }, + { AOM_CDF4(17139, 30298, 32292) }, + { AOM_CDF4(10200, 24039, 29685) }, + { AOM_CDF4(6419, 17674, 24786) }, + { AOM_CDF4(3544, 10225, 15824) }, + { AOM_CDF4(31333, 32726, 32748) }, + { AOM_CDF4(20618, 31487, 32544) }, + { AOM_CDF4(12901, 27217, 31232) }, + { AOM_CDF4(8624, 21734, 28171) }, + { AOM_CDF4(5104, 14191, 20748) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(11206, 21090, 26561) }, + { AOM_CDF4(28759, 32279, 32671) }, + { AOM_CDF4(14171, 27952, 31569) }, + { AOM_CDF4(9743, 22907, 29141) }, + { AOM_CDF4(6871, 17886, 24868) }, + { AOM_CDF4(4960, 13152, 19315) }, + { AOM_CDF4(31077, 32661, 32748) }, + { AOM_CDF4(19400, 31195, 32515) }, + { AOM_CDF4(12752, 26858, 31040) }, + { AOM_CDF4(8370, 22098, 28591) }, + { AOM_CDF4(5457, 15373, 22298) }, + { AOM_CDF4(31697, 32706, 32748) }, + { AOM_CDF4(17860, 30657, 32333) }, + { AOM_CDF4(12510, 24812, 29261) }, + { AOM_CDF4(6180, 19124, 24722) }, + { AOM_CDF4(5041, 13548, 17959) }, + { AOM_CDF4(31552, 32716, 32748) }, + { AOM_CDF4(21908, 31769, 32623) }, + { AOM_CDF4(14470, 28201, 31565) }, + { AOM_CDF4(9493, 22982, 28608) }, + { AOM_CDF4(6858, 17240, 24137) }, + { AOM_CDF4(32543, 32752, 32756) }, + { AOM_CDF4(24286, 32097, 32666) }, + { AOM_CDF4(15958, 29217, 32024) }, + { AOM_CDF4(10207, 24234, 29958) }, + { AOM_CDF4(6929, 18305, 25652) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } }, + { { { AOM_CDF4(4137, 10847, 15682) }, + { AOM_CDF4(17824, 27001, 30058) }, + { AOM_CDF4(10204, 22796, 28291) }, + { AOM_CDF4(6076, 15935, 22125) }, + { AOM_CDF4(3852, 10937, 16816) }, + { AOM_CDF4(2252, 6324, 10131) }, + { AOM_CDF4(25840, 32016, 32662) }, + { AOM_CDF4(15109, 28268, 31531) }, + { AOM_CDF4(9385, 22231, 28340) }, + { AOM_CDF4(6082, 16672, 23479) }, + { AOM_CDF4(3318, 9427, 14681) }, + { AOM_CDF4(30594, 32574, 32718) }, + { AOM_CDF4(16836, 29552, 31859) }, + { 
AOM_CDF4(9556, 22542, 28356) }, + { AOM_CDF4(6305, 16725, 23540) }, + { AOM_CDF4(3376, 9895, 15184) }, + { AOM_CDF4(29383, 32617, 32745) }, + { AOM_CDF4(18891, 30809, 32401) }, + { AOM_CDF4(11688, 25942, 30687) }, + { AOM_CDF4(7468, 19469, 26651) }, + { AOM_CDF4(3909, 11358, 17012) }, + { AOM_CDF4(31564, 32736, 32748) }, + { AOM_CDF4(20906, 31611, 32600) }, + { AOM_CDF4(13191, 27621, 31537) }, + { AOM_CDF4(8768, 22029, 28676) }, + { AOM_CDF4(5079, 14109, 20906) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } }, + { { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) }, + { AOM_CDF4(8192, 16384, 24576) } } } } }; + +static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs + [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE( + NUM_BASE_LEVELS + 1)] = { { { { { AOM_CDF3(17837, 29055) }, + { AOM_CDF3(29600, 31446) }, + { AOM_CDF3(30844, 31878) }, + { AOM_CDF3(24926, 28948) } }, + { { AOM_CDF3(21365, 30026) }, + { AOM_CDF3(30512, 32423) }, + { AOM_CDF3(31658, 32621) }, + { AOM_CDF3(29630, 31881) } } }, + { { { AOM_CDF3(5717, 26477) }, + { AOM_CDF3(30491, 31703) }, + { AOM_CDF3(31550, 32158) }, + { AOM_CDF3(29648, 31491) } }, + { { AOM_CDF3(12608, 27820) }, + { AOM_CDF3(30680, 32225) }, + { AOM_CDF3(30809, 32335) }, + { AOM_CDF3(31299, 32423) } } }, + { { { AOM_CDF3(1786, 12612) }, + { AOM_CDF3(30663, 31625) }, + { AOM_CDF3(32339, 32468) }, + { AOM_CDF3(31148, 31833) } }, + { { AOM_CDF3(18857, 23865) }, + { AOM_CDF3(31428, 32428) }, + { AOM_CDF3(31744, 32373) }, + { AOM_CDF3(31775, 32526) } } }, + { { { AOM_CDF3(1787, 2532) }, + { AOM_CDF3(30832, 31662) 
}, + { AOM_CDF3(31824, 32682) }, + { AOM_CDF3(32133, 32569) } }, + { { AOM_CDF3(13751, 22235) }, + { AOM_CDF3(32089, 32409) }, + { AOM_CDF3(27084, 27920) }, + { AOM_CDF3(29291, 32594) } } }, + { { { AOM_CDF3(1725, 3449) }, + { AOM_CDF3(31102, 31935) }, + { AOM_CDF3(32457, 32613) }, + { AOM_CDF3(32412, 32649) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(17560, 29888) }, + { AOM_CDF3(29671, 31549) }, + { AOM_CDF3(31007, 32056) }, + { AOM_CDF3(27286, 30006) } }, + { { AOM_CDF3(26594, 31212) }, + { AOM_CDF3(31208, 32582) }, + { AOM_CDF3(31835, 32637) }, + { AOM_CDF3(30595, 32206) } } }, + { { { AOM_CDF3(15239, 29932) }, + { AOM_CDF3(31315, 32095) }, + { AOM_CDF3(32130, 32434) }, + { AOM_CDF3(30864, 31996) } }, + { { AOM_CDF3(26279, 30968) }, + { AOM_CDF3(31142, 32495) }, + { AOM_CDF3(31713, 32540) }, + { AOM_CDF3(31929, 32594) } } }, + { { { AOM_CDF3(2644, 25198) }, + { AOM_CDF3(32038, 32451) }, + { AOM_CDF3(32639, 32695) }, + { AOM_CDF3(32166, 32518) } }, + { { AOM_CDF3(17187, 27668) }, + { AOM_CDF3(31714, 32550) }, + { AOM_CDF3(32283, 32678) }, + { AOM_CDF3(31930, 32563) } } }, + { { { AOM_CDF3(1044, 2257) }, + { AOM_CDF3(30755, 31923) }, + { AOM_CDF3(32208, 32693) }, + { AOM_CDF3(32244, 32615) } }, + { { AOM_CDF3(21317, 26207) }, + { AOM_CDF3(29133, 30868) }, + { AOM_CDF3(29311, 31231) }, + { AOM_CDF3(29657, 31087) } } }, + { { { AOM_CDF3(478, 1834) }, + { AOM_CDF3(31005, 31987) }, + { AOM_CDF3(32317, 32724) }, + { AOM_CDF3(30865, 32648) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(20092, 30774) }, + { AOM_CDF3(30695, 32020) }, + { AOM_CDF3(31131, 32103) }, + { AOM_CDF3(28666, 30870) } }, + { { AOM_CDF3(27258, 31095) }, + { AOM_CDF3(31804, 32623) }, + { AOM_CDF3(31763, 32528) }, + { AOM_CDF3(31438, 32506) } } }, + { { { AOM_CDF3(18049, 30489) }, + { AOM_CDF3(31706, 32286) }, + { AOM_CDF3(32163, 32473) }, + { AOM_CDF3(31550, 32184) } }, + { { AOM_CDF3(27116, 30842) }, + { AOM_CDF3(31971, 32598) }, + { AOM_CDF3(32088, 32576) }, + { AOM_CDF3(32067, 32664) } } }, + { { { AOM_CDF3(12854, 29093) }, + { AOM_CDF3(32272, 32558) }, + { AOM_CDF3(32667, 32729) }, + { AOM_CDF3(32306, 32585) } }, + { { AOM_CDF3(25476, 30366) }, + { AOM_CDF3(32169, 32687) }, + { AOM_CDF3(32479, 32689) }, + { AOM_CDF3(31673, 32634) } } }, + { { { AOM_CDF3(2809, 19301) }, + { AOM_CDF3(32205, 32622) }, + { AOM_CDF3(32338, 32730) }, + { AOM_CDF3(31786, 32616) } }, + { { AOM_CDF3(22737, 29105) }, + { AOM_CDF3(30810, 32362) }, + { AOM_CDF3(30014, 32627) }, + { AOM_CDF3(30528, 32574) } } }, + { { { AOM_CDF3(935, 3382) }, + { AOM_CDF3(30789, 31909) }, + { AOM_CDF3(32466, 32756) }, + { AOM_CDF3(30860, 32513) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } }, + { { { { AOM_CDF3(22497, 31198) }, + { AOM_CDF3(31715, 32495) }, + { AOM_CDF3(31606, 32337) }, + { AOM_CDF3(30388, 31990) } }, + { { AOM_CDF3(27877, 31584) }, + { AOM_CDF3(32170, 32728) }, + { AOM_CDF3(32155, 32688) }, + { AOM_CDF3(32219, 32702) } } }, + { { { AOM_CDF3(21457, 31043) }, + { AOM_CDF3(31951, 32483) }, + { AOM_CDF3(32153, 32562) }, + { AOM_CDF3(31473, 32215) } }, + { { AOM_CDF3(27558, 31151) }, + { AOM_CDF3(32020, 32640) }, + { AOM_CDF3(32097, 32575) }, + { AOM_CDF3(32242, 32719) } } }, + { { { AOM_CDF3(19980, 30591) }, + { AOM_CDF3(32219, 32597) }, + { AOM_CDF3(32581, 
32706) }, + { AOM_CDF3(31803, 32287) } }, + { { AOM_CDF3(26473, 30507) }, + { AOM_CDF3(32431, 32723) }, + { AOM_CDF3(32196, 32611) }, + { AOM_CDF3(31588, 32528) } } }, + { { { AOM_CDF3(24647, 30463) }, + { AOM_CDF3(32412, 32695) }, + { AOM_CDF3(32468, 32720) }, + { AOM_CDF3(31269, 32523) } }, + { { AOM_CDF3(28482, 31505) }, + { AOM_CDF3(32152, 32701) }, + { AOM_CDF3(31732, 32598) }, + { AOM_CDF3(31767, 32712) } } }, + { { { AOM_CDF3(12358, 24977) }, + { AOM_CDF3(31331, 32385) }, + { AOM_CDF3(32634, 32756) }, + { AOM_CDF3(30411, 32548) } }, + { { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) }, + { AOM_CDF3(10923, 21845) } } } } }; + +#endif // AOM_AV1_COMMON_TOKEN_CDFS_H_ diff --git a/libs/libaom/src/av1/common/txb_common.c b/libs/libaom/src/av1/common/txb_common.c new file mode 100644 index 000000000..4eef319cd --- /dev/null +++ b/libs/libaom/src/av1/common/txb_common.c @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/txb_common.h" + +const int8_t av1_coeff_band_4x4[16] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; + +const int8_t av1_coeff_band_8x8[64] = { + 0, 1, 2, 2, 3, 3, 4, 4, 5, 6, 2, 2, 3, 3, 4, 4, + 7, 7, 8, 8, 9, 9, 10, 10, 7, 7, 8, 8, 9, 9, 10, 10, + 11, 11, 12, 12, 13, 13, 14, 14, 11, 11, 12, 12, 13, 13, 14, 14, + 15, 15, 16, 16, 17, 17, 18, 18, 15, 15, 16, 16, 17, 17, 18, 18, +}; + +const int8_t av1_coeff_band_16x16[256] = { + 0, 1, 4, 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 2, 3, 4, + 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 5, 5, 6, 6, 7, 7, + 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 5, 5, 6, 6, 7, 7, 7, 7, 8, + 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, + 13, 13, 13, 13, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, + 13, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 10, 10, + 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, + 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15, + 16, 16, 16, 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, + 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, + 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18, + 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18, 18, 18, 18, + 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18, 18, 18, 18, 19, 19, 19, + 19, 20, 20, 20, 20, 21, 21, 21, 21, +}; + +const int8_t av1_coeff_band_32x32[1024] = { + 0, 1, 4, 4, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, + 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 2, 3, 4, 4, 7, 7, + 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, + 12, 12, 12, 12, 12, 12, 12, 5, 5, 6, 6, 7, 7, 7, 7, 10, 10, 10, 10, + 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, + 12, 5, 5, 6, 6, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, + 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 
8, 8, 9, + 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, + 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, + 12, 12, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, + 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8, + 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, + 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, + 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, + 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13, + 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, + 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, + 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, + 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, + 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, + 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, + 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, + 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, + 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 17, + 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, + 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17, + 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, + 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, + 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, + 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, + 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, + 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, + 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, + 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, + 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, + 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, + 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, + 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, + 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, + 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, + 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21, + 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, + 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, + 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, + 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, + 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, + 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, + 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, + 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 
22, 22, 22, 22, + 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, +}; + +// The ctx offset table when TX is TX_CLASS_2D. +// TX col and row indices are clamped to 4 + +const int8_t av1_nz_map_ctx_offset_4x4[16] = { + 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x8[64] = { + 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21, + 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x16[256] = { + 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x32[1024] = { + 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x4[32] = { + 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21, + 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x16[128] = { + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, + 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x8[128] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 16, 16, 21, 21, 
21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x32[512] = { + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x16[512] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, + 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x64[1024] = { + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_64x32[1024] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 
21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, + 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_4x16[64] = { + 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_16x4[64] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_8x32[256] = { + 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, + 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 
21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t av1_nz_map_ctx_offset_32x8[256] = { + 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, 21, +}; + +const int8_t *av1_nz_map_ctx_offset[19] = { + av1_nz_map_ctx_offset_4x4, // TX_4x4 + av1_nz_map_ctx_offset_8x8, // TX_8x8 + av1_nz_map_ctx_offset_16x16, // TX_16x16 + av1_nz_map_ctx_offset_32x32, // TX_32x32 + av1_nz_map_ctx_offset_32x32, // TX_32x32 + av1_nz_map_ctx_offset_4x16, // TX_4x8 + av1_nz_map_ctx_offset_8x4, // TX_8x4 + av1_nz_map_ctx_offset_8x32, // TX_8x16 + av1_nz_map_ctx_offset_16x8, // TX_16x8 + av1_nz_map_ctx_offset_16x32, // TX_16x32 + av1_nz_map_ctx_offset_32x16, // TX_32x16 + av1_nz_map_ctx_offset_32x64, // TX_32x64 + av1_nz_map_ctx_offset_64x32, // TX_64x32 + av1_nz_map_ctx_offset_4x16, // TX_4x16 + av1_nz_map_ctx_offset_16x4, // TX_16x4 + av1_nz_map_ctx_offset_8x32, // TX_8x32 + av1_nz_map_ctx_offset_32x8, // TX_32x8 + av1_nz_map_ctx_offset_16x32, // TX_16x64 + av1_nz_map_ctx_offset_64x32, // TX_64x16 +}; + +const int16_t av1_eob_group_start[12] = { 0, 1, 2, 3, 5, 9, + 17, 33, 65, 129, 257, 513 }; +const int16_t av1_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; diff --git a/libs/libaom/src/av1/common/txb_common.h b/libs/libaom/src/av1/common/txb_common.h new file mode 100644 index 000000000..5a62fa89b --- /dev/null +++ b/libs/libaom/src/av1/common/txb_common.h @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_TXB_COMMON_H_ +#define AOM_AV1_COMMON_TXB_COMMON_H_ + +#include "av1/common/av1_common_int.h" + +extern const int16_t av1_eob_group_start[12]; +extern const int16_t av1_eob_offset_bits[12]; + +extern const int8_t av1_coeff_band_4x4[16]; + +extern const int8_t av1_coeff_band_8x8[64]; + +extern const int8_t av1_coeff_band_16x16[256]; + +extern const int8_t av1_coeff_band_32x32[1024]; + +extern const int8_t *av1_nz_map_ctx_offset[TX_SIZES_ALL]; + +typedef struct txb_ctx { + int txb_skip_ctx; + int dc_sign_ctx; +} TXB_CTX; + +static const int base_level_count_to_index[13] = { + 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, +}; + +static const TX_CLASS tx_type_to_class[TX_TYPES] = { + TX_CLASS_2D, // DCT_DCT + TX_CLASS_2D, // ADST_DCT + TX_CLASS_2D, // DCT_ADST + TX_CLASS_2D, // ADST_ADST + TX_CLASS_2D, // FLIPADST_DCT + TX_CLASS_2D, // DCT_FLIPADST + TX_CLASS_2D, // FLIPADST_FLIPADST + TX_CLASS_2D, // ADST_FLIPADST + TX_CLASS_2D, // FLIPADST_ADST + TX_CLASS_2D, // IDTX + TX_CLASS_VERT, // V_DCT + TX_CLASS_HORIZ, // H_DCT + TX_CLASS_VERT, // V_ADST + TX_CLASS_HORIZ, // H_ADST + TX_CLASS_VERT, // V_FLIPADST + TX_CLASS_HORIZ, // H_FLIPADST +}; + +static INLINE int get_txb_bwl(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_wide_log2[tx_size]; +} + +static INLINE int get_txb_wide(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_wide[tx_size]; +} + +static INLINE int get_txb_high(TX_SIZE tx_size) { + tx_size = av1_get_adjusted_tx_size(tx_size); + return tx_size_high[tx_size]; +} + +static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int width) { + return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR); +} + +static INLINE int get_padded_idx(const int idx, const int bwl) { + return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2); +} + +static INLINE int get_base_ctx_from_count_mag(int row, int col, int count, + int sig_mag) { + const int ctx = base_level_count_to_index[count]; + int ctx_idx = -1; + + if (row == 0 && col == 0) { + if (sig_mag >= 2) return ctx_idx = 0; + if (sig_mag == 1) { + if (count >= 2) + ctx_idx = 1; + else + ctx_idx = 2; + + return ctx_idx; + } + + ctx_idx = 3 + ctx; + assert(ctx_idx <= 6); + return ctx_idx; + } else if (row == 0) { + if (sig_mag >= 2) return ctx_idx = 6; + if (sig_mag == 1) { + if (count >= 2) + ctx_idx = 7; + else + ctx_idx = 8; + return ctx_idx; + } + + ctx_idx = 9 + ctx; + assert(ctx_idx <= 11); + return ctx_idx; + } else if (col == 0) { + if (sig_mag >= 2) return ctx_idx = 12; + if (sig_mag == 1) { + if (count >= 2) + ctx_idx = 13; + else + ctx_idx = 14; + + return ctx_idx; + } + + ctx_idx = 15 + ctx; + assert(ctx_idx <= 17); + // TODO(angiebird): turn this on once the optimization is finalized + // assert(ctx_idx < 28); + } else { + if (sig_mag >= 2) return ctx_idx = 18; + if (sig_mag == 1) { + if (count >= 2) + ctx_idx = 19; + else + ctx_idx = 20; + return ctx_idx; + } + + ctx_idx = 21 + ctx; + + assert(ctx_idx <= 24); + } + return ctx_idx; +} + +static INLINE int get_br_ctx_2d(const uint8_t *const levels, + const int c, // raster order + const int bwl) { + assert(c > 0); + const int row = c >> bwl; + const int col = c - (row << bwl); + const int stride = (1 << bwl) + TX_PAD_HOR; + const int pos = row * stride + col; + int mag = AOMMIN(levels[pos + 1], MAX_BASE_BR_RANGE) + + AOMMIN(levels[pos + stride], MAX_BASE_BR_RANGE) + + AOMMIN(levels[pos + 1 + stride], MAX_BASE_BR_RANGE); + mag = AOMMIN((mag + 1) >> 1, 6); + //((row | col) < 2) is equivalent to ((row 
< 2) && (col < 2)) + if ((row | col) < 2) return mag + 7; + return mag + 14; +} + +static AOM_FORCE_INLINE int get_br_ctx_eob(const int c, // raster order + const int bwl, + const TX_CLASS tx_class) { + const int row = c >> bwl; + const int col = c - (row << bwl); + if (c == 0) return 0; + if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) || + (tx_class == TX_CLASS_HORIZ && col == 0) || + (tx_class == TX_CLASS_VERT && row == 0)) + return 7; + return 14; +} + +static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels, + const int c, // raster order + const int bwl, const TX_CLASS tx_class) { + const int row = c >> bwl; + const int col = c - (row << bwl); + const int stride = (1 << bwl) + TX_PAD_HOR; + const int pos = row * stride + col; + int mag = levels[pos + 1]; + mag += levels[pos + stride]; + switch (tx_class) { + case TX_CLASS_2D: + mag += levels[pos + stride + 1]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if ((row < 2) && (col < 2)) return mag + 7; + break; + case TX_CLASS_HORIZ: + mag += levels[pos + 2]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if (col == 0) return mag + 7; + break; + case TX_CLASS_VERT: + mag += levels[pos + (stride << 1)]; + mag = AOMMIN((mag + 1) >> 1, 6); + if (c == 0) return mag; + if (row == 0) return mag + 7; + break; + default: break; + } + + return mag + 14; +} + +static const uint8_t clip_max3[256] = { + 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +}; + +static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels, + const int bwl, const TX_CLASS tx_class) { + int mag; + + // Note: AOMMIN(level, 3) is useless for decoder since level < 3. 
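+ // Each class below sums five clamped neighbor levels; their { row, col } + // offsets from the current coefficient are: + // TX_CLASS_2D: { 0, 1 } { 1, 0 } { 1, 1 } { 0, 2 } { 2, 0 } + // TX_CLASS_VERT: { 0, 1 } { 1, 0 } { 2, 0 } { 3, 0 } { 4, 0 } + // TX_CLASS_HORIZ: { 0, 1 } { 1, 0 } { 0, 2 } { 0, 3 } { 0, 4 }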
+ mag = clip_max3[levels[1]]; // { 0, 1 } + mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR]]; // { 1, 0 } + + if (tx_class == TX_CLASS_2D) { + mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR + 1]]; // { 1, 1 } + mag += clip_max3[levels[2]]; // { 0, 2 } + mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]]; // { 2, 0 } + } else if (tx_class == TX_CLASS_VERT) { + mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]]; // { 2, 0 } + mag += clip_max3[levels[(3 << bwl) + (3 << TX_PAD_HOR_LOG2)]]; // { 3, 0 } + mag += clip_max3[levels[(4 << bwl) + (4 << TX_PAD_HOR_LOG2)]]; // { 4, 0 } + } else { + mag += clip_max3[levels[2]]; // { 0, 2 } + mag += clip_max3[levels[3]]; // { 0, 3 } + mag += clip_max3[levels[4]]; // { 0, 4 } + } + + return mag; +} + +#define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D +#define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5) +#define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10) + +static const int nz_map_ctx_offset_1d[32] = { + NZ_MAP_CTX_0, NZ_MAP_CTX_5, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, + NZ_MAP_CTX_10, NZ_MAP_CTX_10, +}; + +static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats( + const int stats, + const int coeff_idx, // raster order + const int bwl, const TX_SIZE tx_size, const TX_CLASS tx_class) { + // tx_class == 0(TX_CLASS_2D) + if ((tx_class | coeff_idx) == 0) return 0; + int ctx = (stats + 1) >> 1; + ctx = AOMMIN(ctx, 4); + switch (tx_class) { + case TX_CLASS_2D: { + // This is the algorithm to generate av1_nz_map_ctx_offset[][] + // const int width = tx_size_wide[tx_size]; + // const int height = tx_size_high[tx_size]; + // if (width < height) { + // if (row < 2) return 11 + ctx; + // } else if (width > height) { + // if (col < 2) return 16 + ctx; + // } + // if (row + col < 2) return ctx + 1; + // if (row + col < 4) return 5 + ctx + 1; + // return 21 + ctx; + return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx]; + } + case TX_CLASS_HORIZ: { + const int row = coeff_idx >> bwl; + const int col = coeff_idx - (row << bwl); + return ctx + nz_map_ctx_offset_1d[col]; + } + case TX_CLASS_VERT: { + const int row = coeff_idx >> bwl; + return ctx + nz_map_ctx_offset_1d[row]; + } + default: break; + } + return 0; +} + +typedef aom_cdf_prob (*base_cdf_arr)[CDF_SIZE(4)]; +typedef aom_cdf_prob (*br_cdf_arr)[CDF_SIZE(BR_CDF_SIZE)]; + +static INLINE int get_lower_levels_ctx_eob(int bwl, int height, int scan_idx) { + if (scan_idx == 0) return 0; + if (scan_idx <= (height << bwl) / 8) return 1; + if (scan_idx <= (height << bwl) / 4) return 2; + return 3; +} + +static INLINE int get_lower_levels_ctx_2d(const uint8_t *levels, int coeff_idx, + int bwl, TX_SIZE tx_size) { + assert(coeff_idx > 0); + int mag; + // Note: AOMMIN(level, 3) is useless for decoder since level < 3. 
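+ // The five taps below form the same 2D template as get_nz_mag() uses for + // TX_CLASS_2D, with the current coefficient at x: + // x o o + // o o . + // o . .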
+ levels = levels + get_padded_idx(coeff_idx, bwl); + mag = AOMMIN(levels[1], 3); // { 0, 1 } + mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR], 3); // { 1, 0 } + mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR + 1], 3); // { 1, 1 } + mag += AOMMIN(levels[2], 3); // { 0, 2 } + mag += AOMMIN(levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)], 3); // { 2, 0 } + + const int ctx = AOMMIN((mag + 1) >> 1, 4); + return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx]; +} +static AOM_FORCE_INLINE int get_lower_levels_ctx(const uint8_t *levels, + int coeff_idx, int bwl, + TX_SIZE tx_size, + TX_CLASS tx_class) { + const int stats = + get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class); + return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class); +} + +static INLINE int get_lower_levels_ctx_general(int is_last, int scan_idx, + int bwl, int height, + const uint8_t *levels, + int coeff_idx, TX_SIZE tx_size, + TX_CLASS tx_class) { + if (is_last) { + if (scan_idx == 0) return 0; + if (scan_idx <= (height << bwl) >> 3) return 1; + if (scan_idx <= (height << bwl) >> 2) return 2; + return 3; + } + return get_lower_levels_ctx(levels, coeff_idx, bwl, tx_size, tx_class); +} + +static INLINE void set_dc_sign(int *cul_level, int dc_val) { + if (dc_val < 0) + *cul_level |= 1 << COEFF_CONTEXT_BITS; + else if (dc_val > 0) + *cul_level += 2 << COEFF_CONTEXT_BITS; +} + +static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, + const TX_SIZE tx_size, const int plane, + const ENTROPY_CONTEXT *const a, + const ENTROPY_CONTEXT *const l, + TXB_CTX *const txb_ctx) { +#define MAX_TX_SIZE_UNIT 16 + static const int8_t signs[3] = { 0, -1, 1 }; + static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + }; + const int txb_w_unit = tx_size_wide_unit[tx_size]; + const int txb_h_unit = tx_size_high_unit[tx_size]; + int dc_sign = 0; + int k = 0; + + do { + const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS; + assert(sign <= 2); + dc_sign += signs[sign]; + } while (++k < txb_w_unit); + + k = 0; + do { + const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS; + assert(sign <= 2); + dc_sign += signs[sign]; + } while (++k < txb_h_unit); + + txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; + + if (plane == 0) { + if (plane_bsize == txsize_to_bsize[tx_size]) { + txb_ctx->txb_skip_ctx = 0; + } else { + // This is the algorithm to generate table skip_contexts[top][left]. + // const int max = AOMMIN(top | left, 4); + // const int min = AOMMIN(AOMMIN(top, left), 4); + // if (!max) + // txb_skip_ctx = 1; + // else if (!min) + // txb_skip_ctx = 2 + (max > 3); + // else if (max <= 3) + // txb_skip_ctx = 4; + // else if (min <= 3) + // txb_skip_ctx = 5; + // else + // txb_skip_ctx = 6; + static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 }, + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 2, 4, 4, 4, 5 }, + { 3, 5, 5, 5, 6 } }; + // For top and left, we only care about which of the following three + // categories they belong to: { 0 }, { 1, 2, 3 }, or { 4, 5, ... }. The + // spec calculates top and left with the Max() function. We can calculate + // an approximate max with bitwise OR because the real max and the + // approximate max belong to the same category. 
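+      // For example, with per-unit values 1 and 2 the true max is 2 while
+      // the bitwise OR gives 3; both land in the { 1, 2, 3 } category, and
+      // rows 1..3 of skip_contexts[][] are identical, so the lookup result
+      // is unchanged.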
+      int top = 0;
+      int left = 0;
+
+      k = 0;
+      do {
+        top |= a[k];
+      } while (++k < txb_w_unit);
+      top &= COEFF_CONTEXT_MASK;
+      top = AOMMIN(top, 4);
+
+      k = 0;
+      do {
+        left |= l[k];
+      } while (++k < txb_h_unit);
+      left &= COEFF_CONTEXT_MASK;
+      left = AOMMIN(left, 4);
+
+      txb_ctx->txb_skip_ctx = skip_contexts[top][left];
+    }
+  } else {
+    const int ctx_base = get_entropy_context(tx_size, a, l);
+    const int ctx_offset = (num_pels_log2_lookup[plane_bsize] >
+                            num_pels_log2_lookup[txsize_to_bsize[tx_size]])
+                               ? 10
+                               : 7;
+    txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;
+  }
+#undef MAX_TX_SIZE_UNIT
+}
+
+#endif  // AOM_AV1_COMMON_TXB_COMMON_H_
diff --git a/libs/libaom/src/av1/common/warped_motion.c b/libs/libaom/src/av1/common/warped_motion.c
new file mode 100644
index 000000000..4e9fab9bd
--- /dev/null
+++ b/libs/libaom/src/av1/common/warped_motion.c
@@ -0,0 +1,1073 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
+// at a time. The zoom/rotation/shear in the model are applied to the
+// "fractional" position of each pixel, which therefore varies within
+// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
+// We need an extra 2 taps to fit this in, for a total of 8 taps.
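+// Each row of av1_warped_filter below holds the 8 taps for one fractional
+// position. av1_warp_affine_c looks up a row with
+//   offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+//          WARPEDPIXEL_PREC_SHIFTS;
+//   const int16_t *coeffs = av1_warped_filter[offs];
+// so row 0 corresponds to a fractional position of -1 pixel, row
+// WARPEDPIXEL_PREC_SHIFTS to 0, and row 2 * WARPEDPIXEL_PREC_SHIFTS to +1,
+// covering the [-1, 2) range described above (plus one dummy row).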
+/* clang-format off */ +const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = { +#if WARPEDPIXEL_PREC_BITS == 6 + // [-1, 0) + { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 }, + { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 }, + { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 }, + { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 }, + { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 }, + { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 }, + { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 }, + { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 }, + { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 }, + { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 }, + { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 }, + { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 }, + { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 }, + { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 }, + { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 }, + { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 }, + { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 }, + { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 }, + { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 }, + { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 }, + { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 }, + { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 }, + { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 }, + { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 }, + { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 }, + { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 }, + { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 }, + { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 }, + { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 }, + { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 }, + { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 }, + { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 }, + + // [0, 1) + { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0}, + { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0}, + { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1}, + {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1}, + {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1}, + {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1}, + {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1}, + {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2}, + {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2}, + {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2}, + {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2}, + {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2}, + {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2}, + {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2}, + {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2}, + {-2, 8, -21, 79, 
79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2}, + {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2}, + {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2}, + {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2}, + {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2}, + {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2}, + {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2}, + {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2}, + {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2}, + {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1}, + {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2}, + {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1}, + {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1}, + {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1}, + { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0}, + { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0}, + + // [1, 2) + { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 }, + { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 }, + { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 }, + { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 }, + { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 }, + { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 }, + { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 }, + { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 }, + { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 }, + { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 }, + { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 }, + { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 }, + { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 }, + { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 }, + { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 }, + { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 }, + { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 }, + { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 }, + { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 }, + { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 }, + { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 }, + { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 }, + { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 }, + { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 }, + { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 }, + { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 }, + { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 }, + { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 }, + { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 }, + { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 }, + { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 }, + { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 }, + // dummy (replicate row index 191) + { 0, 0, 0, 0, 2, 127, - 1, 0 }, + +#elif WARPEDPIXEL_PREC_BITS == 5 + // [-1, 0) + {0, 0, 127, 1, 0, 0, 0, 0}, {1, -3, 127, 4, -1, 0, 
0, 0}, + {1, -5, 126, 8, -3, 1, 0, 0}, {1, -7, 124, 13, -4, 1, 0, 0}, + {2, -9, 122, 18, -6, 1, 0, 0}, {2, -11, 120, 22, -7, 2, 0, 0}, + {3, -13, 117, 27, -8, 2, 0, 0}, {3, -14, 114, 32, -10, 3, 0, 0}, + {3, -15, 111, 37, -11, 3, 0, 0}, {3, -16, 108, 42, -12, 3, 0, 0}, + {4, -17, 104, 47, -13, 3, 0, 0}, {4, -17, 100, 52, -14, 3, 0, 0}, + {4, -18, 96, 58, -15, 3, 0, 0}, {4, -18, 91, 63, -16, 4, 0, 0}, + {4, -18, 87, 68, -17, 4, 0, 0}, {4, -18, 82, 73, -17, 4, 0, 0}, + {4, -18, 78, 78, -18, 4, 0, 0}, {4, -17, 73, 82, -18, 4, 0, 0}, + {4, -17, 68, 87, -18, 4, 0, 0}, {4, -16, 63, 91, -18, 4, 0, 0}, + {3, -15, 58, 96, -18, 4, 0, 0}, {3, -14, 52, 100, -17, 4, 0, 0}, + {3, -13, 47, 104, -17, 4, 0, 0}, {3, -12, 42, 108, -16, 3, 0, 0}, + {3, -11, 37, 111, -15, 3, 0, 0}, {3, -10, 32, 114, -14, 3, 0, 0}, + {2, -8, 27, 117, -13, 3, 0, 0}, {2, -7, 22, 120, -11, 2, 0, 0}, + {1, -6, 18, 122, -9, 2, 0, 0}, {1, -4, 13, 124, -7, 1, 0, 0}, + {1, -3, 8, 126, -5, 1, 0, 0}, {0, -1, 4, 127, -3, 1, 0, 0}, + // [0, 1) + { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 1, -3, 127, 4, -2, 1, 0}, + { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 3, -8, 125, 13, -5, 2, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -13, 121, 23, -8, 3, -1}, + {-1, 5, -15, 119, 27, -10, 4, -1}, {-2, 6, -17, 116, 33, -12, 5, -1}, + {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 110, 43, -15, 6, -2}, + {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 102, 54, -17, 7, -2}, + {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 94, 64, -19, 7, -2}, + {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -21, 84, 74, -21, 8, -2}, + {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 74, 84, -21, 8, -2}, + {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 7, -19, 64, 94, -22, 8, -2}, + {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -17, 54, 102, -21, 7, -2}, + {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 43, 110, -19, 7, -2}, + {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 33, 116, -17, 6, -2}, + {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 3, -8, 23, 121, -13, 4, -1}, + {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 2, -5, 13, 125, -8, 3, -1}, + { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 4, 127, -3, 1, 0}, + // [1, 2) + {0, 0, 0, 1, 127, 0, 0, 0}, {0, 0, 1, -3, 127, 4, -1, 0}, + {0, 0, 1, -5, 126, 8, -3, 1}, {0, 0, 1, -7, 124, 13, -4, 1}, + {0, 0, 2, -9, 122, 18, -6, 1}, {0, 0, 2, -11, 120, 22, -7, 2}, + {0, 0, 3, -13, 117, 27, -8, 2}, {0, 0, 3, -14, 114, 32, -10, 3}, + {0, 0, 3, -15, 111, 37, -11, 3}, {0, 0, 3, -16, 108, 42, -12, 3}, + {0, 0, 4, -17, 104, 47, -13, 3}, {0, 0, 4, -17, 100, 52, -14, 3}, + {0, 0, 4, -18, 96, 58, -15, 3}, {0, 0, 4, -18, 91, 63, -16, 4}, + {0, 0, 4, -18, 87, 68, -17, 4}, {0, 0, 4, -18, 82, 73, -17, 4}, + {0, 0, 4, -18, 78, 78, -18, 4}, {0, 0, 4, -17, 73, 82, -18, 4}, + {0, 0, 4, -17, 68, 87, -18, 4}, {0, 0, 4, -16, 63, 91, -18, 4}, + {0, 0, 3, -15, 58, 96, -18, 4}, {0, 0, 3, -14, 52, 100, -17, 4}, + {0, 0, 3, -13, 47, 104, -17, 4}, {0, 0, 3, -12, 42, 108, -16, 3}, + {0, 0, 3, -11, 37, 111, -15, 3}, {0, 0, 3, -10, 32, 114, -14, 3}, + {0, 0, 2, -8, 27, 117, -13, 3}, {0, 0, 2, -7, 22, 120, -11, 2}, + {0, 0, 1, -6, 18, 122, -9, 2}, {0, 0, 1, -4, 13, 124, -7, 1}, + {0, 0, 1, -3, 8, 126, -5, 1}, {0, 0, 0, -1, 4, 127, -3, 1}, + // dummy (replicate row index 95) + {0, 0, 0, -1, 4, 127, -3, 1}, + +#endif // WARPEDPIXEL_PREC_BITS == 6 +}; + +/* clang-format on */ + +#define DIV_LUT_PREC_BITS 14 +#define DIV_LUT_BITS 8 +#define DIV_LUT_NUM (1 << DIV_LUT_BITS) + +static const uint16_t div_lut[DIV_LUT_NUM + 1] = { + 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768, + 
15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142, + 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564, + 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028, + 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530, + 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066, + 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633, + 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228, + 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848, + 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491, + 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155, + 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838, + 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538, + 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255, + 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986, + 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732, + 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489, + 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259, + 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039, + 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830, + 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630, + 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439, + 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257, + 8240, 8224, 8208, 8192, +}; + +// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned +// at precision of DIV_LUT_PREC_BITS along with the shift. +static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) { + int64_t f; + *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32 + : get_msb((unsigned int)D)); + // e is obtained from D after resetting the most significant 1 bit. + const int64_t e = D - ((uint64_t)1 << *shift); + // Get the most significant DIV_LUT_BITS (8) bits of e into f + if (*shift > DIV_LUT_BITS) + f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS); + else + f = e << (DIV_LUT_BITS - *shift); + assert(f <= DIV_LUT_NUM); + *shift += DIV_LUT_PREC_BITS; + // Use f as lookup into the precomputed table of multipliers + return div_lut[f]; +} + +static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) { + int32_t f; + *shift = get_msb(D); + // e is obtained from D after resetting the most significant 1 bit. 
+ const int32_t e = D - ((uint32_t)1 << *shift); + // Get the most significant DIV_LUT_BITS (8) bits of e into f + if (*shift > DIV_LUT_BITS) + f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS); + else + f = e << (DIV_LUT_BITS - *shift); + assert(f <= DIV_LUT_NUM); + *shift += DIV_LUT_PREC_BITS; + // Use f as lookup into the precomputed table of multipliers + return div_lut[f]; +} + +static int is_affine_valid(const WarpedMotionParams *const wm) { + const int32_t *mat = wm->wmmat; + return (mat[2] > 0); +} + +static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) || + (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS))) + return 0; + else + return 1; +} + +// Returns 1 on success or 0 on an invalid affine set +int av1_get_shear_params(WarpedMotionParams *wm) { + const int32_t *mat = wm->wmmat; + if (!is_affine_valid(wm)) return 0; + wm->alpha = + clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX); + wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX); + int16_t shift; + int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1); + int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y; + wm->gamma = + clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX); + v = ((int64_t)mat[3] * mat[4]) * y; + wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) - + (1 << WARPEDMODEL_PREC_BITS), + INT16_MIN, INT16_MAX); + + wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + + if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta)) + return 0; + + return 1; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE int highbd_error_measure(int err, int bd) { + const int b = bd - 8; + const int bmask = (1 << b) - 1; + const int v = (1 << b); + err = abs(err); + const int e1 = err >> b; + const int e2 = err & bmask; + return error_measure_lut[255 + e1] * (v - e2) + + error_measure_lut[256 + e1] * e2; +} + +/* Note: For an explanation of the warp algorithm, and some notes on bit widths + for hardware implementations, see the comments above av1_warp_affine_c +*/ +void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, uint16_t *pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, + int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + int32_t tmp[15 * 8]; + const int reduce_bits_horiz = + conv_params->round_0 + + AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); + const int reduce_bits_vert = conv_params->is_compound + ? 
conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + (void)max_bits_horiz; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + for (int i = p_row; i < p_row + p_height; i += 8) { + for (int j = p_col; j < p_col + p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (j + 4) << subsampling_x; + const int32_t src_y = (i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + const int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + for (int k = -7; k < 8; ++k) { + const int iy = clamp(iy4 + k, 0, height - 1); + + int sx = sx4 + beta * (k + 4); + for (int l = -4; l < 4; ++l) { + int ix = ix4 + l - 3; + const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_horiz; + for (int m = 0; m < 8; ++m) { + const int sample_x = clamp(ix + m, 0, width - 1); + sum += ref[iy * stride + sample_x] * coeffs[m]; + } + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); + assert(0 <= sum && sum < (1 << max_bits_horiz)); + tmp[(k + 7) * 8 + (l + 4)] = sum; + sx += alpha; + } + } + + // Vertical filter + for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { + const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_vert; + for (int m = 0; m < 8; ++m) { + sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; + } + + if (conv_params->is_compound) { + CONV_BUF_TYPE *p = + &conv_params + ->dst[(i - p_row + k + 4) * conv_params->dst_stride + + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + if (conv_params->do_average) { + uint16_t *dst16 = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + int32_t tmp32 = *p; + if (conv_params->use_dist_wtd_comp_avg) { + tmp32 = tmp32 * conv_params->fwd_offset + + sum * conv_params->bck_offset; + tmp32 = tmp32 >> DIST_PRECISION_BITS; + } else { + tmp32 += sum; + tmp32 = tmp32 >> 1; + } + tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1)); + *dst16 = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, 
round_bits), bd); + } else { + *p = sum; + } + } else { + uint16_t *p = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + assert(0 <= sum && sum < (1 << (bd + 2))); + *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd); + } + sy += gamma; + } + } + } + } +} + +void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, + int width, int height, int stride, uint16_t *const pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + int bd, ConvolveParams *conv_params) { + assert(wm->wmtype <= AFFINE); + if (wm->wmtype == ROTZOOM) { + wm->wmmat[5] = wm->wmmat[2]; + wm->wmmat[4] = -wm->wmmat[3]; + } + const int32_t *const mat = wm->wmmat; + const int16_t alpha = wm->alpha; + const int16_t beta = wm->beta; + const int16_t gamma = wm->gamma; + const int16_t delta = wm->delta; + + av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); +} + +int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride, + const uint16_t *const dst, int p_width, + int p_height, int p_stride, int bd) { + int64_t sum_error = 0; + for (int i = 0; i < p_height; ++i) { + for (int j = 0; j < p_width; ++j) { + sum_error += + highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd); + } + } + return sum_error; +} + +static int64_t highbd_segmented_frame_error( + const uint16_t *const ref, int stride, const uint16_t *const dst, + int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map, + int segment_map_stride) { + int patch_w, patch_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + int64_t sum_error = 0; + for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { + for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + + // avoid computing error into the frame padding + patch_w = AOMMIN(error_bsize_w, p_width - j); + patch_h = AOMMIN(error_bsize_h, p_height - i); + sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride, + dst + j + i * p_stride, patch_w, + patch_h, p_stride, bd); + } + } + return sum_error; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +/* The warp filter for ROTZOOM and AFFINE models works as follows: + * Split the input into 8x8 blocks + * For each block, project the point (4, 4) within the block, to get the + overall block position. Split into integer and fractional coordinates, + maintaining full WARPEDMODEL precision + * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a + variable horizontal offset. This means that, while the rows of the + intermediate buffer align with the rows of the *reference* image, the + columns align with the columns of the *destination* image. + * Filter vertically: Generate the output block (up to 8x8 pixels, but if the + destination is too small we crop the output at this stage). Each pixel has + a variable vertical offset, so that the resulting rows are aligned with + the rows of the destination image. 
+
+  To accomplish these alignments, we factor the warp matrix as a
+  product of two shear / asymmetric zoom matrices:
+     / a b \  = /   1       0    \ * / 1+alpha  beta \
+     \ c d /    \ gamma  1+delta /   \    0      1   /
+  where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
+  The horizontal shear (with alpha and beta) is applied first,
+  then the vertical shear (with gamma and delta) is applied second.
+
+  The only limitation is that, to fit this in a fixed 8-tap filter size,
+  the fractional pixel offsets must be at most +-1. Since the horizontal
+  filter generates 15 rows of 8 columns, and the initial point we project
+  is at (4, 4) within the block, the parameters must satisfy
+     4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
+  for this filter to be applicable.
+
+  Note: This function assumes that the caller has done all of the relevant
+  checks, i.e. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
+  are set appropriately (if using a ROTZOOM model), and that alpha, beta,
+  gamma, delta are all in range.
+
+  TODO(david.barker): Maybe support scaled references?
+*/
+/* A note on hardware implementation:
+   The warp filter is intended to be implementable using the same hardware as
+   the high-precision convolve filters from the loop-restoration and
+   convolve-round experiments.
+
+   For a single filter stage, considering all of the coefficient sets for the
+   warp filter and the regular convolution filter, an input in the range
+   [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
+   before rounding.
+
+   Allowing for some changes to the filter coefficient sets, call the range
+   [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
+   we can replace this by the range [0, 256 * 2^k], which can be stored in an
+   unsigned value with 8 + k bits.
+
+   This allows the derivation of the appropriate bit widths and offsets for
+   the various intermediate values: If
+
+   F := FILTER_BITS = 7 (or else the above ranges need adjusting)
+     So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
+     intermediate value.
+   H := ROUND0_BITS
+   V := VERSHEAR_REDUCE_PREC_BITS
+     (and note that we must have H + V = 2*F for the output to have the same
+     scale as the input)
+
+   then we end up with the following offsets and ranges:
+   Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
+                      uint{bd + F + 1}
+   After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
+   Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
+                    uint{bd + 2*F + 2 - H}
+   After rounding: The final value, before undoing the offset, fits into a
+                   uint{bd + 2}.
+
+   Then we need to undo the offsets before clamping to a pixel. Note that,
+   if we do this at the end, the amount to subtract is actually independent
+   of H and V:
+
+   offset to subtract = (1 << ((bd + F - 1) - H + F - V))
+                      + (1 << ((bd + 2*F - H) - V))
+                     == (1 << (bd - 1)) + (1 << bd)
+
+   This allows us to entirely avoid clamping in both the warp filter and
+   the convolve-round experiment. As of the time of writing, the Wiener filter
+   from loop-restoration can encode a central coefficient up to 216, which
+   leads to a maximum value of about 282 * 2^k after applying the offset.
+   So in that case we still need to clamp.
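+
+   As a concrete check, take bd == 8 and F == 7: the horizontal stage adds
+   an offset of 1 << 14 and its sums fit in a uint16, and at the very end
+   the offset removed is (1 << (bd - 1)) + (1 << bd) == 384, which is
+   exactly the "sum - (1 << (bd - 1)) - (1 << bd)" expression in
+   av1_warp_affine_c below.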
+*/ +void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, int16_t beta, + int16_t gamma, int16_t delta) { + int32_t tmp[15 * 8]; + const int bd = 8; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + (void)max_bits_horiz; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + for (int i = p_row; i < p_row + p_height; i += 8) { + for (int j = p_col; j < p_col + p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (j + 4) << subsampling_x; + const int32_t src_y = (i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + for (int k = -7; k < 8; ++k) { + // Clamp to top/bottom edge of the frame + const int iy = clamp(iy4 + k, 0, height - 1); + + int sx = sx4 + beta * (k + 4); + + for (int l = -4; l < 4; ++l) { + int ix = ix4 + l - 3; + // At this point, sx = sx4 + alpha * l + beta * k + const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_horiz; + for (int m = 0; m < 8; ++m) { + // Clamp to left/right edge of the frame + const int sample_x = clamp(ix + m, 0, width - 1); + + sum += ref[iy * stride + sample_x] * coeffs[m]; + } + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz); + assert(0 <= sum && sum < (1 << max_bits_horiz)); + tmp[(k + 7) * 8 + (l + 4)] = sum; + sx += alpha; + } + } + + // Vertical filter + for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) { + // At this point, sy = sy4 + gamma * l + delta * k + const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) + + WARPEDPIXEL_PREC_SHIFTS; + assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3); + const int16_t *coeffs = av1_warped_filter[offs]; + + int32_t sum = 1 << offset_bits_vert; + for (int m = 0; m < 8; ++m) { + sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m]; + } + 
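+          // The non-compound branch below shifts out reduce_bits_vert and
+          // then subtracts (1 << (bd - 1)) + (1 << bd), i.e. the horizontal
+          // and vertical accumulator offsets; see the bit-width note above
+          // for why this total is independent of the rounding split.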
+ if (conv_params->is_compound) { + CONV_BUF_TYPE *p = + &conv_params + ->dst[(i - p_row + k + 4) * conv_params->dst_stride + + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + if (conv_params->do_average) { + uint8_t *dst8 = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + int32_t tmp32 = *p; + if (conv_params->use_dist_wtd_comp_avg) { + tmp32 = tmp32 * conv_params->fwd_offset + + sum * conv_params->bck_offset; + tmp32 = tmp32 >> DIST_PRECISION_BITS; + } else { + tmp32 += sum; + tmp32 = tmp32 >> 1; + } + tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1)); + *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits)); + } else { + *p = sum; + } + } else { + uint8_t *p = + &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; + sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert); + assert(0 <= sum && sum < (1 << (bd + 2))); + *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd)); + } + sy += gamma; + } + } + } + } +} + +void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, + int height, int stride, uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params) { + assert(wm->wmtype <= AFFINE); + if (wm->wmtype == ROTZOOM) { + wm->wmmat[5] = wm->wmmat[2]; + wm->wmmat[4] = -wm->wmmat[3]; + } + const int32_t *const mat = wm->wmmat; + const int16_t alpha = wm->alpha; + const int16_t beta = wm->beta; + const int16_t gamma = wm->gamma; + const int16_t delta = wm->delta; + av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, conv_params, + alpha, beta, gamma, delta); +} + +int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride, + const uint8_t *const dst, int p_width, + int p_height, int p_stride) { + int64_t sum_error = 0; + for (int i = 0; i < p_height; ++i) { + for (int j = 0; j < p_width; ++j) { + sum_error += + (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]); + } + } + return sum_error; +} + +static int64_t segmented_frame_error(const uint8_t *const ref, int stride, + const uint8_t *const dst, int p_width, + int p_height, int p_stride, + uint8_t *segment_map, + int segment_map_stride) { + int patch_w, patch_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + int64_t sum_error = 0; + for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) { + for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + + // avoid computing error into the frame padding + patch_w = AOMMIN(error_bsize_w, p_width - j); + patch_h = AOMMIN(error_bsize_h, p_height - i); + sum_error += av1_calc_frame_error(ref + j + i * stride, stride, + dst + j + i * p_stride, patch_w, + patch_h, p_stride); + } + } + return sum_error; +} + +int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride, + uint8_t *dst, int p_width, int p_height, int p_stride) { +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) { + return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride, + CONVERT_TO_SHORTPTR(dst), p_width, + p_height, p_stride, bd); + } +#endif + (void)use_hbd; + (void)bd; + return 
av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
+}
+
+int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
+                                  int stride, uint8_t *dst, int p_width,
+                                  int p_height, int p_stride,
+                                  uint8_t *segment_map,
+                                  int segment_map_stride) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd) {
+    return highbd_segmented_frame_error(
+        CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
+        p_height, p_stride, bd, segment_map, segment_map_stride);
+  }
+#endif
+  (void)use_hbd;
+  (void)bd;
+  return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride,
+                               segment_map, segment_map_stride);
+}
+
+void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
+                    const uint8_t *ref, int width, int height, int stride,
+                    uint8_t *pred, int p_col, int p_row, int p_width,
+                    int p_height, int p_stride, int subsampling_x,
+                    int subsampling_y, ConvolveParams *conv_params) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd)
+    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
+                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
+                      p_height, p_stride, subsampling_x, subsampling_y, bd,
+                      conv_params);
+  else
+    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
+               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
+#else
+  (void)use_hbd;
+  (void)bd;
+  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
+             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
+#endif
+}
+
+#define LS_MV_MAX 256  // max mv in 1/8-pel
+// Use LS_STEP = 8 so that 2 fewer bits are needed for A, Bx, By.
+#define LS_STEP 8
+
+// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
+// the precision needed is:
+//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
+//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
+//   1 [for sign] +
+//   LEAST_SQUARES_SAMPLES_MAX_BITS
+//      [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
+// With MAX_SB_SIZE_LOG2 == 7 this comes to 25.
+#define LS_MAT_RANGE_BITS \
+  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
+
+// Bit-depth reduction from the full range
+#define LS_MAT_DOWN_BITS 2
+
+// Bit range of A, Bx and By after downshifting
+#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
+#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
+#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
+
+// By setting LS_STEP = 8, the 2 least significant bits of every element in
+// A, Bx, By are 0, so we can safely shift by 2, plus LS_MAT_DOWN_BITS (2)
+// more to reduce the dynamic range.
+#define LS_SQUARE(a)                                          \
+  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+   (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT1(a, b)                                           \
+  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
+   (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT2(a, b)                                               \
+  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+   (2 + LS_MAT_DOWN_BITS))
+
+#define USE_LIMITED_PREC_MULT 0
+
+#if USE_LIMITED_PREC_MULT
+
+#define MUL_PREC_BITS 16
+static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
+  int msb = 0;
+  uint16_t mult = 0;
+  *shift = 0;
+  if (D != 0) {
+    msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
+                              : get_msb((unsigned int)D));
+    if (msb >= MUL_PREC_BITS) {
+      mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
+      *shift = msb + 1 - MUL_PREC_BITS;
+    } else {
+      mult = (uint16_t)D;
+      *shift = 0;
+    }
+  }
+  return mult;
+}
+
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+  int16_t mshift;
+  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
+  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
+  shift -= mshift;
+  if (shift > 0) {
+    return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
+                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  } else {
+    return (int32_t)clamp(v * (1 << (-shift)),
+                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  }
+}
+
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+  int16_t mshift;
+  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
+  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
+  shift -= mshift;
+  if (shift > 0) {
+    return (int32_t)clamp(
+        ROUND_POWER_OF_TWO_SIGNED(v, shift),
+        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  } else {
+    return (int32_t)clamp(
+        v * (1 << (-shift)),
+        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  }
+}
+
+#else
+
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+  int64_t v = Px * (int64_t)iDet;
+  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
+                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+  int64_t v = Px * (int64_t)iDet;
+  return (int32_t)clamp64(
+      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
+      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+#endif  // USE_LIMITED_PREC_MULT
+
+static int find_affine_int(int np, const int *pts1, const int *pts2,
+                           BLOCK_SIZE bsize, int mvy, int mvx,
+                           WarpedMotionParams *wm, int mi_row, int mi_col) {
+  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
+  int32_t Bx[2] = { 0, 0 };
+  int32_t By[2] = { 0, 0 };
+
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int rsuy = bh / 2 - 1;
+  const int rsux = bw / 2 - 1;
+  const int suy = rsuy * 8;
+  const int sux = rsux * 8;
+  const int duy = suy + mvy;
+  const int dux = sux + mvx;
+
+  // Assume the center pixel of the block has exactly the same motion vector
+  // as transmitted for the block. First shift the origin of the source
+  // points to the block center, and the origin of the destination points to
+  // the block center added to the motion vector transmitted.
+  // Let (xi, yi) denote the source points and (xi', yi') denote destination
+  // points after origin shifting, for i = 0, 1, 2, .... n-1.
+  // Then if    P = [x0, y0,
+  //                 x1, y1,
+  //                 x2, y2,
+  //                  ....
+  //                ]
+  //            q = [x0', x1', x2', ... ]'
+  //            r = [y0', y1', y2', ... ]'
+  // the least squares problems that need to be solved are:
+  //            [h1, h2]' = inv(P'P)P'q and
+  //            [h3, h4]' = inv(P'P)P'r
+  // where the affine transformation is given by:
+  //            x' = h1.x + h2.y
+  //            y' = h3.x + h4.y
+  //
+  // The loop below computes: A = P'P, Bx = P'q, By = P'r
+  // We just need to compute inv(A).Bx and inv(A).By for the solutions.
+  // Contribution from neighbor block
+  for (int i = 0; i < np; i++) {
+    const int dx = pts2[i * 2] - dux;
+    const int dy = pts2[i * 2 + 1] - duy;
+    const int sx = pts1[i * 2] - sux;
+    const int sy = pts1[i * 2 + 1] - suy;
+    // TODO(yunqing): This comparison wouldn't be necessary if the sample
+    // selection is done in find_samples(). Also, the global offset can be
+    // removed while collecting samples.
+    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+      A[0][0] += LS_SQUARE(sx);
+      A[0][1] += LS_PRODUCT1(sx, sy);
+      A[1][1] += LS_SQUARE(sy);
+      Bx[0] += LS_PRODUCT2(sx, dx);
+      Bx[1] += LS_PRODUCT1(sy, dx);
+      By[0] += LS_PRODUCT1(sx, dy);
+      By[1] += LS_PRODUCT2(sy, dy);
+    }
+  }
+
+  // Just for debugging, and can be removed later.
+  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
+  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
+  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
+  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
+  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
+  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
+  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
+
+  // Compute the determinant of A
+  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
+  if (Det == 0) return 1;
+
+  int16_t shift;
+  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
+  shift -= WARPEDMODEL_PREC_BITS;
+  if (shift < 0) {
+    iDet <<= (-shift);
+    shift = 0;
+  }
+
+  int64_t Px[2], Py[2];
+  // These, divided by Det, are the least squares solutions
+  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
+  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
+  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
+  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
+
+  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
+  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
+  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
+  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
+
+  const int isuy = (mi_row * MI_SIZE + rsuy);
+  const int isux = (mi_col * MI_SIZE + rsux);
+  // Note: In the vx, vy expressions below, the max value of each of the
+  // 2nd and 3rd terms is (2^16 - 1) * (2^13 - 1). That leaves enough room
+  // for the first term so that the overall sum in the worst case fits
+  // within 32 bits overall.
+  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
+                      isuy * wm->wmmat[3]);
+  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+                     (isux * wm->wmmat[4] +
+                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
+  wm->wmmat[0] =
+      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
+  wm->wmmat[1] =
+      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
+
+  wm->wmmat[6] = wm->wmmat[7] = 0;
+  return 0;
+}
+
+int av1_find_projection(int np, const int *pts1, const int *pts2,
+                        BLOCK_SIZE bsize, int mvy, int mvx,
+                        WarpedMotionParams *wm_params, int mi_row,
+                        int mi_col) {
+  assert(wm_params->wmtype == AFFINE);
+
+  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
+                      mi_col))
+    return 1;
+
+  // Check compatibility with the fast warp filter
+  if (!av1_get_shear_params(wm_params)) return 1;
+
+  return 0;
+}
diff --git a/libs/libaom/src/av1/common/warped_motion.h b/libs/libaom/src/av1/common/warped_motion.h
new file mode 100644
index 000000000..14dc0fe47
--- /dev/null
+++ b/libs/libaom/src/av1/common/warped_motion.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_
+#define AOM_AV1_COMMON_WARPED_MOTION_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/mv.h"
+#include "av1/common/convolve.h"
+
+#define MAX_PARAMDIM 9
+#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
+#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
+#define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
+#define WARPED_MOTION_DEBUG 0
+#define DEFAULT_WMTYPE AFFINE
+#define WARP_ERROR_BLOCK_LOG 5
+#define WARP_ERROR_BLOCK (1 << WARP_ERROR_BLOCK_LOG)
+
+extern const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+
+DECLARE_ALIGNED(8, extern const int8_t,
+                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]);
+
+/* clang-format off */
+static const int error_measure_lut[512] = {
+    // pow 0.7
+    16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
+    16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
+    15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
+    15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
+    14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
+    14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
+    14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
+    13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
+    13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
+    12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
+    12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
+    12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
+    11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
+    11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
+    10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
+    10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
+    10058,
10002, 9947, 9891, 9835, 9779, 9723, 9666, + 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211, + 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745, + 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269, + 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780, + 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278, + 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760, + 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225, + 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670, + 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090, + 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480, + 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832, + 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133, + 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359, + 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452, + 1323, 1187, 1045, 894, 731, 550, 339, 0, + 339, 550, 731, 894, 1045, 1187, 1323, 1452, + 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359, + 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133, + 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832, + 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480, + 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090, + 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670, + 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225, + 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760, + 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278, + 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780, + 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269, + 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745, + 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211, + 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666, + 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113, + 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552, + 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982, + 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406, + 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823, + 11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234, + 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639, + 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038, + 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432, + 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822, + 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206, + 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587, + 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963, + 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335, + 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703, + 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068, + 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384, +}; +/* clang-format on */ + +static const uint8_t warp_pad_left[14][16] = { + { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 7, 7, 7, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15 }, + { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15 }, + { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15 }, + { 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15 }, + { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 15 }, + { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15 }, +}; + +static const uint8_t warp_pad_right[14][16] = { + { 0, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 }, + { 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 }, + { 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 }, + { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }, + { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }, + { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, + { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 } +}; + +static INLINE int error_measure(int err) { + return error_measure_lut[255 + err]; +} + +// Returns the error between the frame described by 'ref' and the frame +// described by 'dst'. +int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride, + uint8_t *dst, int p_width, int p_height, int p_stride); + +int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, + int stride, uint8_t *dst, int p_width, + int p_height, int p_stride, + uint8_t *segment_map, int segment_map_stride); + +int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride, + const uint16_t *const dst, int p_width, + int p_height, int p_stride, int bd); + +void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref, + int width, int height, int stride, uint16_t *const pred, + int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, + int bd, ConvolveParams *conv_params); + +void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width, + int height, int stride, uint8_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params); + +void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd, + const uint8_t *ref, int width, int height, int stride, + uint8_t *pred, int p_col, int p_row, int p_width, + int p_height, int p_stride, int subsampling_x, + int subsampling_y, ConvolveParams *conv_params); + +int av1_find_projection(int np, const int *pts1, const int *pts2, + BLOCK_SIZE bsize, int mvy, int mvx, + WarpedMotionParams *wm_params, int mi_row, int mi_col); + +int av1_get_shear_params(WarpedMotionParams *wm); +#endif // AOM_AV1_COMMON_WARPED_MOTION_H_ diff --git a/libs/libaom/src/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/libs/libaom/src/av1/common/x86/av1_convolve_horiz_rs_sse4.c new file mode 100644 index 000000000..8aa14696f --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_convolve_horiz_rs_sse4.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const int16_t *x_filters, int x0_qn,
+ int x_step_qn) {
+ assert(UPSCALE_NORMATIVE_TAPS == 8);
+
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ const uint8_t *src_y;
+ uint8_t *dst_y;
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+ const int x_filter_idx0 =
+ ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx1 =
+ ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx2 =
+ ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx3 =
+ ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+ assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+ const int16_t *const x_filter0 =
+ &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter1 =
+ &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter2 =
+ &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter3 =
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+ const __m128i fil0_16 = xx_loadu_128(x_filter0);
+ const __m128i fil1_16 = xx_loadu_128(x_filter1);
+ const __m128i fil2_16 = xx_loadu_128(x_filter2);
+ const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+ src_y = src;
+ dst_y = dst;
+ for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+ const uint8_t *const src_x0 =
+ &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x1 =
+ &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 8-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_8 = xx_loadl_64(src_x0);
+ const __m128i src1_8 = xx_loadl_64(src_x1);
+ const __m128i src2_8 = xx_loadl_64(src_x2);
+ const __m128i src3_8 = xx_loadl_64(src_x3);
+
+ // Now zero-extend up to 16-bit precision, i.e.
+ // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
+ const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
+ const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
+ const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
+ const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ]) + // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ] + const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16); + const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16); + const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16); + const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16); + + // Reduce horizontally and add, i.e. + // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ] + const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32); + const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32); + + const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32); + + // Divide down by (1 << FILTER_BITS), rounding to nearest. + const __m128i shifted_32 = + _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS); + + // Pack 32-bit values into 16-bit values, i.e. + // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ] + const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero); + + // Pack 16-bit values into 8-bit values, i.e. + // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ]) + // -> [ 0 0 0 0 0 0 DC BA ] + const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero); + + // Write to the output + xx_storel_32(&dst_y[x], shifted_8); + } + } +} + +// Note: If the crop width is not a multiple of 4, then, unlike the C version, +// this function will overwrite some of the padding on the right hand side of +// the frame. This padding appears to be trashed anyway, so this should not +// affect the running of the decoder. +void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, + int h, const int16_t *x_filters, + int x0_qn, int x_step_qn, int bd) { + assert(UPSCALE_NORMATIVE_TAPS == 8); + assert(bd == 8 || bd == 10 || bd == 12); + + src -= UPSCALE_NORMATIVE_TAPS / 2 - 1; + + const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1); + + const uint16_t *src_y; + uint16_t *dst_y; + int x_qn = x0_qn; + for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) { + const int x_filter_idx0 = + ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx1 = + ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx2 = + ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + const int x_filter_idx3 = + ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS; + + assert(x_filter_idx0 <= RS_SUBPEL_MASK); + assert(x_filter_idx1 <= RS_SUBPEL_MASK); + assert(x_filter_idx2 <= RS_SUBPEL_MASK); + assert(x_filter_idx3 <= RS_SUBPEL_MASK); + + const int16_t *const x_filter0 = + &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter1 = + &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter2 = + &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS]; + const int16_t *const x_filter3 = + &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS]; + + const __m128i fil0_16 = xx_loadu_128(x_filter0); + const __m128i fil1_16 = xx_loadu_128(x_filter1); + const __m128i fil2_16 = xx_loadu_128(x_filter2); + const __m128i fil3_16 = xx_loadu_128(x_filter3); + + src_y = src; + dst_y = dst; + for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) { + const uint16_t *const src_x0 = + &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS]; + const uint16_t *const src_x1 = + 
&src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 16-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_16 = xx_loadu_128(src_x0);
+ const __m128i src1_16 = xx_loadu_128(src_x1);
+ const __m128i src2_16 = xx_loadu_128(src_x2);
+ const __m128i src3_16 = xx_loadu_128(src_x3);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+ // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+ const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+ const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+ const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+ const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+ // Reduce horizontally and add, i.e.
+ // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+ const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+ const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+ const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+ // Divide down by (1 << FILTER_BITS), rounding to nearest.
+ const __m128i shifted_32 =
+ _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+ // Pack 32-bit values into 16-bit values, i.e.
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+ const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+ // Clip the values at (1 << bd) - 1
+ const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);
+
+ // Write to the output
+ xx_storel_64(&dst_y[x], clipped_16);
+ }
+ }
+}
diff --git a/libs/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c b/libs/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c
new file mode 100644
index 000000000..196618176
--- /dev/null
+++ b/libs/libaom/src/av1/common/x86/av1_convolve_scale_sse4.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+// A specialised version of hfilter, the horizontal filter for
+// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
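+// The output of this pass is written transposed (dst[y + x * h] rather than
+// dst[x + y * w]) so that the vertical pass can read each column of the
+// intermediate as one contiguous row. Per output sample it computes, exactly
+// as in the scalar tail inside the function:
+//   sum = (1 << (bd + FILTER_BITS - 1)) + sum over k of filter[k] * src_row[k]
+//   dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round)
+// with the SIMD body handling four rows of output per loop iteration.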
+static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w, + int h, int subpel_x_qn, int x_step_qn, + const InterpFilterParams *filter_params, unsigned round) { + const int bd = 8; + const int ntaps = 8; + + src -= ntaps / 2 - 1; + + int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = _mm_cvtsi32_si128(round); + + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + // Load the filter coefficients + const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); + const __m128i zero = _mm_castps_si128(_mm_setzero_ps()); + + int y; + for (y = 0; y <= h - 4; y += 4) { + const uint8_t *const src0 = src_col + y * src_stride; + const uint8_t *const src1 = src0 + 1 * src_stride; + const uint8_t *const src2 = src0 + 2 * src_stride; + const uint8_t *const src3 = src0 + 3 * src_stride; + + // Load up source data. This is 8-bit input data; each load is just + // loading the lower half of the register and gets 8 pixels + const __m128i data08 = _mm_loadl_epi64((__m128i *)src0); + const __m128i data18 = _mm_loadl_epi64((__m128i *)src1); + const __m128i data28 = _mm_loadl_epi64((__m128i *)src2); + const __m128i data38 = _mm_loadl_epi64((__m128i *)src3); + + // Now zero-extend up to 16-bit precision by interleaving with + // zeros. Drop the upper half of each register (which just had zeros) + const __m128i data0lo = _mm_unpacklo_epi8(data08, zero); + const __m128i data1lo = _mm_unpacklo_epi8(data18, zero); + const __m128i data2lo = _mm_unpacklo_epi8(data28, zero); + const __m128i data3lo = _mm_unpacklo_epi8(data38, zero); + + // Multiply by coefficients + const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); + const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); + const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); + const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); + + // Reduce horizontally and add + const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); + const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); + const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); + + // Divide down by (1 << round), rounding to nearest. + __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + + shifted = _mm_packus_epi32(shifted, shifted); + // Write transposed to the output + _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); + } + for (; y < h; ++y) { + const uint8_t *const src_row = src_col + y * src_stride; + + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < ntaps; ++k) { + sum += filter[k] * src_row[k]; + } + + dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); + } + } +} + +static __m128i convolve_16_8(const int16_t *src, __m128i coeff) { + __m128i data = _mm_loadu_si128((__m128i *)src); + return _mm_madd_epi16(data, coeff); +} + +// A specialised version of vfilter, the vertical filter for +// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters. 
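+// The input here is the transposed intermediate produced by hfilter8, so
+// src_stride is the intermediate height and each 8-tap load inside
+// convolve_16_8 reads one column of the original image as a contiguous run
+// of 16-bit samples.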
+static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, int subpel_y_qn, + int y_step_qn, const InterpFilterParams *filter_params, + const ConvolveParams *conv_params, int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int ntaps = 8; + + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + const __m128i sub = _mm_set1_epi16(sub32); + + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i bits_shift = _mm_cvtsi32_si128(bits); + const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1)); + const __m128i round_shift_add = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16((short)w0); + const __m128i wt1 = _mm_set1_epi16((short)w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); + int x; + for (x = 0; x <= w - 4; x += 4) { + const int16_t *const src0 = src_y + x * src_stride; + const int16_t *const src1 = src0 + 1 * src_stride; + const int16_t *const src2 = src0 + 2 * src_stride; + const int16_t *const src3 = src0 + 3 * src_stride; + + // Load the source data for the three rows, adding the three registers of + // convolved products to one as we go (conv0..conv3) to avoid the + // register pressure getting too high. + const __m128i conv0 = convolve_16_8(src0, coeff0716); + const __m128i conv1 = convolve_16_8(src1, coeff0716); + const __m128i conv2 = convolve_16_8(src2, coeff0716); + const __m128i conv3 = convolve_16_8(src3, coeff0716); + + // Now reduce horizontally to get one lane for each result + const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); + const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); + __m128i conv = _mm_hadd_epi32(conv01, conv23); + + conv = _mm_add_epi32(conv, res_add_const); + // Divide down by (1 << round_1), rounding to nearest and subtract sub32. 
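+ // The sub32 offset itself is only removed further down, after any
+ // compound averaging, so that the average is taken in the offset domain.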
+ __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift); + + uint8_t *dst_x = dst + y * dst_stride + x; + CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x; + __m128i result; + __m128i shifted_16 = _mm_packus_epi32(shifted, shifted); + + if (conv_params->is_compound) { + if (conv_params->do_average) { + const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x); + if (conv_params->use_dist_wtd_comp_avg) { + const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16); + const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + shifted_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1); + } + const __m128i subbed = _mm_sub_epi16(shifted_16, sub); + result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift); + const __m128i result_8 = _mm_packus_epi16(result, result); + *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8); + } else { + _mm_storel_epi64((__m128i *)dst_16_x, shifted_16); + } + } else { + const __m128i subbed = _mm_sub_epi16(shifted_16, sub); + result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift); + const __m128i result_8 = _mm_packus_epi16(result, result); + *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8); + } + } + for (; x < w; ++x) { + const int16_t *src_x = src_y + x * src_stride; + int32_t sum = 1 << offset_bits; + for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - sub32; + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits)); + } + } + } +} +void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params) { + int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]; + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + + const int xtaps = filter_params_x->taps; + const int ytaps = filter_params_y->taps; + const int fo_vert = ytaps / 2 - 1; + assert((xtaps == 8) && (ytaps == 8)); + (void)xtaps; + + // horizontal filter + hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn, + x_step_qn, filter_params_x, conv_params->round_0); + + // vertical filter (input is transposed) + vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params, 8); +} + +// A specialised version of hfilter, the horizontal filter for +// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap +// filters. 
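+// Identical in structure to hfilter8 above, except that the source samples
+// are 16 bits wide, so each row's eight taps are loaded directly with
+// _mm_loadu_si128 instead of a load-and-zero-extend pair, and the rounding
+// bias (1 << (bd + FILTER_BITS - 1)) uses the real bit depth rather than 8.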
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst, + int w, int h, int subpel_x_qn, int x_step_qn, + const InterpFilterParams *filter_params, + unsigned round, int bd) { + const int ntaps = 8; + + src -= ntaps / 2 - 1; + + int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1)); + const __m128i round_add = _mm_set1_epi32(round_add32); + const __m128i round_shift = _mm_cvtsi32_si128(round); + + int x_qn = subpel_x_qn; + for (int x = 0; x < w; ++x, x_qn += x_step_qn) { + const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + // Load the filter coefficients + const __m128i coefflo = _mm_loadu_si128((__m128i *)filter); + + int y; + for (y = 0; y <= h - 4; y += 4) { + const uint16_t *const src0 = src_col + y * src_stride; + const uint16_t *const src1 = src0 + 1 * src_stride; + const uint16_t *const src2 = src0 + 2 * src_stride; + const uint16_t *const src3 = src0 + 3 * src_stride; + + // Load up source data. This is 16-bit input data, so each load gets the 8 + // pixels we need. + const __m128i data0lo = _mm_loadu_si128((__m128i *)src0); + const __m128i data1lo = _mm_loadu_si128((__m128i *)src1); + const __m128i data2lo = _mm_loadu_si128((__m128i *)src2); + const __m128i data3lo = _mm_loadu_si128((__m128i *)src3); + + // Multiply by coefficients + const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo); + const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo); + const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo); + const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo); + + // Reduce horizontally and add + const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo); + const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo); + const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo); + + // Divide down by (1 << round), rounding to nearest. + __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift); + + shifted = _mm_packus_epi32(shifted, shifted); + // Write transposed to the output + _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted); + } + for (; y < h; ++y) { + const uint16_t *const src_row = src_col + y * src_stride; + + int32_t sum = (1 << (bd + FILTER_BITS - 1)); + for (int k = 0; k < ntaps; ++k) { + sum += filter[k] * src_row[k]; + } + + dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round); + } + } +} +// A specialised version of vfilter, the vertical filter for +// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap +// filters. +static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, + int dst_stride, int w, int h, int subpel_y_qn, + int y_step_qn, + const InterpFilterParams *filter_params, + const ConvolveParams *conv_params, int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int ntaps = 8; + + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + const __m128i sub = _mm_set1_epi32(sub32); + + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int dst16_stride = conv_params->dst_stride; + const __m128i clip_pixel_ = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i bits_shift = _mm_cvtsi32_si128(bits); + const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1)); + const __m128i round_shift_add = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); + __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + int y_qn = subpel_y_qn; + for (int y = 0; y < h; ++y, y_qn += y_step_qn) { + const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS); + const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(filter_idx < SUBPEL_SHIFTS); + const int16_t *filter = + av1_get_interp_filter_subpel_kernel(filter_params, filter_idx); + + const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter); + int x; + for (x = 0; x <= w - 4; x += 4) { + const int16_t *const src0 = src_y + x * src_stride; + const int16_t *const src1 = src0 + 1 * src_stride; + const int16_t *const src2 = src0 + 2 * src_stride; + const int16_t *const src3 = src0 + 3 * src_stride; + + // Load the source data for the three rows, adding the three registers of + // convolved products to one as we go (conv0..conv3) to avoid the + // register pressure getting too high. + const __m128i conv0 = convolve_16_8(src0, coeff0716); + const __m128i conv1 = convolve_16_8(src1, coeff0716); + const __m128i conv2 = convolve_16_8(src2, coeff0716); + const __m128i conv3 = convolve_16_8(src3, coeff0716); + + // Now reduce horizontally to get one lane for each result + const __m128i conv01 = _mm_hadd_epi32(conv0, conv1); + const __m128i conv23 = _mm_hadd_epi32(conv2, conv3); + __m128i conv = _mm_hadd_epi32(conv01, conv23); + conv = _mm_add_epi32(conv, res_add_const); + + // Divide down by (1 << round_1), rounding to nearest and subtract sub32. 
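+ // sub32 (subtracted below, after any compound averaging) removes both the
+ // (1 << offset_bits) bias added via res_add_const and the rounding bias
+ // that highbd_hfilter8 folded into the intermediate values.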
+ __m128i shifted = + _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift); + + uint16_t *dst_x = dst + y * dst_stride + x; + CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x; + + __m128i result; + if (conv_params->is_compound) { + if (conv_params->do_average) { + __m128i p_32 = + _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x)); + + if (conv_params->use_dist_wtd_comp_avg) { + shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), + _mm_mullo_epi32(shifted, wt1)); + shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS); + } else { + shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1); + } + __m128i res32 = _mm_sub_epi32(shifted, sub); + res32 = _mm_sra_epi32(_mm_add_epi32(res32, round_bits_const), + round_bits_shift); + + __m128i res16 = _mm_packus_epi32(res32, res32); + res16 = _mm_min_epi16(res16, clip_pixel_); + _mm_storel_epi64((__m128i *)dst_x, res16); + } else { + __m128i shifted_16 = _mm_packus_epi32(shifted, shifted); + _mm_storel_epi64((__m128i *)dst_16_x, shifted_16); + } + } else { + const __m128i subbed = _mm_sub_epi32(shifted, sub); + result = _mm_sra_epi16(_mm_add_epi32(subbed, bits_const), bits_shift); + result = _mm_packus_epi32(result, result); + result = _mm_min_epi16(result, clip_pixel_); + _mm_storel_epi64((__m128i *)dst_x, result); + } + } + + for (; x < w; ++x) { + const int16_t *src_x = src_y + x * src_stride; + int32_t sum = 1 << offset_bits; + for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k]; + CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); + if (conv_params->is_compound) { + if (conv_params->do_average) { + int32_t tmp = dst16[y * dst16_stride + x]; + if (conv_params->use_dist_wtd_comp_avg) { + tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; + tmp = tmp >> DIST_PRECISION_BITS; + } else { + tmp += res; + tmp = tmp >> 1; + } + /* Subtract round offset and convolve round */ + tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } else { + dst16[y * dst16_stride + x] = res; + } + } else { + /* Subtract round offset and convolve round */ + int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1))); + dst[y * dst_stride + x] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd); + } + } + } +} + +void av1_highbd_convolve_2d_scale_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int x_step_qn, const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params, int bd) { + // TODO(yaowu): Move this out of stack + DECLARE_ALIGNED(16, int16_t, + tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + + filter_params_y->taps; + const int xtaps = filter_params_x->taps; + const int ytaps = filter_params_y->taps; + const int fo_vert = ytaps / 2 - 1; + + memset(tmp, 0, sizeof(tmp)); + assert((xtaps == 8) && (ytaps == 8)); + (void)xtaps; + + // horizontal filter + highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, + subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0, + bd); + + // vertical filter (input is transposed) + highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn, + filter_params_y, conv_params, 
bd); +} diff --git a/libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c b/libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c new file mode 100644 index 000000000..0fbd5eae4 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.c @@ -0,0 +1,1949 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/av1_inv_txfm_avx2.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" + +// TODO(venkatsanampudi@ittiam.com): move this to header file + +// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 +static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); +} + +static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); +} + +static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) { + btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]); + btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]); + btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]); + btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]); + btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]); + btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]); + btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]); + btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]); +} + +static void idct16_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + 
__m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[1] = input[8]; + x1[2] = input[4]; + x1[3] = input[12]; + x1[4] = input[2]; + x1[5] = input[10]; + x1[6] = input[6]; + x1[7] = input[14]; + x1[8] = input[1]; + x1[9] = input[9]; + x1[10] = input[5]; + x1[11] = input[13]; + x1[12] = input[3]; + x1[13] = input[11]; + x1[14] = input[7]; + x1[15] = input[15]; + + // stage 2 + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit); + + // stage 3 + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + + // stage 4 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + + idct16_stage5_avx2(x1, cospi, _r, cos_bit); + idct16_stage6_avx2(x1, cospi, _r, cos_bit); + idct16_stage7_avx2(output, x1); +} + +static void idct16_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[2] = input[4]; + x1[4] = input[2]; + x1[6] = input[6]; + x1[8] = input[1]; + x1[10] = input[5]; + x1[12] = input[3]; + x1[14] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]); + + // stage 3 + 
btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + + // stage 4 + btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + + idct16_stage5_avx2(x1, cospi, _r, cos_bit); + idct16_stage6_avx2(x1, cospi, _r, cos_bit); + idct16_stage7_avx2(output, x1); +} + +static void idct16_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x1[2]; + x1[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]); + + // stage 5 + // stage 6 + output[0] = x1[0]; + output[1] = x1[1]; + output[2] = x1[1]; + output[3] = x1[0]; + output[4] = x1[0]; + output[5] = x1[1]; + output[6] = x1[1]; + output[7] = x1[0]; + output[8] = x1[0]; + output[9] = x1[1]; + output[10] = x1[1]; + output[11] = x1[0]; + output[12] = x1[0]; + output[13] = x1[1]; + output[14] = x1[1]; + output[15] = x1[0]; +} + +static INLINE void iadst16_stage3_avx2(__m256i *x) { + btf_16_adds_subs_avx2(&x[0], &x[8]); + btf_16_adds_subs_avx2(&x[1], &x[9]); + btf_16_adds_subs_avx2(&x[2], &x[10]); + btf_16_adds_subs_avx2(&x[3], &x[11]); + btf_16_adds_subs_avx2(&x[4], &x[12]); + btf_16_adds_subs_avx2(&x[5], &x[13]); + btf_16_adds_subs_avx2(&x[6], &x[14]); + btf_16_adds_subs_avx2(&x[7], &x[15]); +} + +static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); + const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit); +} + +static INLINE void iadst16_stage5_avx2(__m256i *x) { + btf_16_adds_subs_avx2(&x[0], &x[4]); + btf_16_adds_subs_avx2(&x[1], &x[5]); + btf_16_adds_subs_avx2(&x[2], &x[6]); + btf_16_adds_subs_avx2(&x[3], &x[7]); + btf_16_adds_subs_avx2(&x[8], &x[12]); + btf_16_adds_subs_avx2(&x[9], &x[13]); + btf_16_adds_subs_avx2(&x[10], &x[14]); + btf_16_adds_subs_avx2(&x[11], &x[15]); +} + +static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit); + 
btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit); +} + +static INLINE void iadst16_stage7_avx2(__m256i *x) { + btf_16_adds_subs_avx2(&x[0], &x[2]); + btf_16_adds_subs_avx2(&x[1], &x[3]); + btf_16_adds_subs_avx2(&x[4], &x[6]); + btf_16_adds_subs_avx2(&x[5], &x[7]); + btf_16_adds_subs_avx2(&x[8], &x[10]); + btf_16_adds_subs_avx2(&x[9], &x[11]); + btf_16_adds_subs_avx2(&x[12], &x[14]); + btf_16_adds_subs_avx2(&x[13], &x[15]); +} + +static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); +} + +static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) { + const __m256i __zero = _mm256_setzero_si256(); + output[0] = x1[0]; + output[1] = _mm256_subs_epi16(__zero, x1[8]); + output[2] = x1[12]; + output[3] = _mm256_subs_epi16(__zero, x1[4]); + output[4] = x1[6]; + output[5] = _mm256_subs_epi16(__zero, x1[14]); + output[6] = x1[10]; + output[7] = _mm256_subs_epi16(__zero, x1[2]); + output[8] = x1[3]; + output[9] = _mm256_subs_epi16(__zero, x1[11]); + output[10] = x1[15]; + output[11] = _mm256_subs_epi16(__zero, x1[7]); + output[12] = x1[5]; + output[13] = _mm256_subs_epi16(__zero, x1[13]); + output[14] = x1[9]; + output[15] = _mm256_subs_epi16(__zero, x1[1]); +} + +static void iadst16_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[15]; + x1[1] = input[0]; + x1[2] = input[13]; + x1[3] = input[2]; + x1[4] = input[11]; + x1[5] = input[4]; + x1[6] = input[9]; + x1[7] = input[6]; + x1[8] = input[7]; + x1[9] = input[8]; + x1[10] = input[5]; + x1[11] = input[10]; + x1[12] = input[3]; + x1[13] = input[12]; + x1[14] = input[1]; + x1[15] = 
input[14]; + + // stage 2 + btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit); + + iadst16_stage3_avx2(x1); + iadst16_stage4_avx2(x1, cospi, _r, cos_bit); + iadst16_stage5_avx2(x1); + iadst16_stage6_avx2(x1, cospi, _r, cos_bit); + iadst16_stage7_avx2(x1); + iadst16_stage8_avx2(x1, cospi, _r, cos_bit); + iadst16_stage9_avx2(output, x1); +} + +static void iadst16_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x1[16]; + x1[1] = input[0]; + x1[3] = input[2]; + x1[5] = input[4]; + x1[7] = input[6]; + x1[8] = input[7]; + x1[10] = input[5]; + x1[12] = input[3]; + x1[14] = input[1]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); + btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]); + btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]); + btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]); + btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]); + btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]); + btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]); + btf_16_w16_0_avx2(cospi[58], cospi[06], x1[14], x1[14], x1[15]); + + iadst16_stage3_avx2(x1); + iadst16_stage4_avx2(x1, cospi, _r, cos_bit); + iadst16_stage5_avx2(x1); + iadst16_stage6_avx2(x1, cospi, _r, cos_bit); + iadst16_stage7_avx2(x1); + iadst16_stage8_avx2(x1, cospi, _r, cos_bit); + iadst16_stage9_avx2(output, x1); +} + +static void iadst16_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + + // stage 1 + __m256i x1[16]; + x1[1] = input[0]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]); + + // stage 3 + x1[8] = x1[0]; + x1[9] = x1[1]; + + // stage 4 + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit); + + // stage 5 + x1[4] = x1[0]; + x1[5] = x1[1]; + + x1[12] = x1[8]; + x1[13] = x1[9]; + + // stage 6 + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit); + + // stage 7 + x1[2] = x1[0]; + x1[3] = x1[1]; + x1[6] = x1[4]; + x1[7] = x1[5]; + x1[10] = x1[8]; + x1[11] = x1[9]; + x1[14] = x1[12]; + x1[15] = x1[13]; + + iadst16_stage8_avx2(x1, cospi, _r, cos_bit); + iadst16_stage9_avx2(output, x1); +} + +static INLINE void idct32_high16_stage3_avx2(__m256i *x) { + btf_16_adds_subs_avx2(&x[16], &x[17]); + 
btf_16_adds_subs_avx2(&x[19], &x[18]); + btf_16_adds_subs_avx2(&x[20], &x[21]); + btf_16_adds_subs_avx2(&x[23], &x[22]); + btf_16_adds_subs_avx2(&x[24], &x[25]); + btf_16_adds_subs_avx2(&x[27], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[29]); + btf_16_adds_subs_avx2(&x[31], &x[30]); +} + +static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); +} + +static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x[16], &x[19]); + btf_16_adds_subs_avx2(&x[17], &x[18]); + btf_16_adds_subs_avx2(&x[23], &x[20]); + btf_16_adds_subs_avx2(&x[22], &x[21]); + btf_16_adds_subs_avx2(&x[24], &x[27]); + btf_16_adds_subs_avx2(&x[25], &x[26]); + btf_16_adds_subs_avx2(&x[31], &x[28]); + btf_16_adds_subs_avx2(&x[30], &x[29]); +} + +static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); +} + +static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + 
btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x[16], &x[23]); + btf_16_adds_subs_avx2(&x[17], &x[22]); + btf_16_adds_subs_avx2(&x[18], &x[21]); + btf_16_adds_subs_avx2(&x[19], &x[20]); + btf_16_adds_subs_avx2(&x[31], &x[24]); + btf_16_adds_subs_avx2(&x[30], &x[25]); + btf_16_adds_subs_avx2(&x[29], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[27]); +} + +static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[15]); + btf_16_adds_subs_avx2(&x[1], &x[14]); + btf_16_adds_subs_avx2(&x[2], &x[13]); + btf_16_adds_subs_avx2(&x[3], &x[12]); + btf_16_adds_subs_avx2(&x[4], &x[11]); + btf_16_adds_subs_avx2(&x[5], &x[10]); + btf_16_adds_subs_avx2(&x[6], &x[9]); + btf_16_adds_subs_avx2(&x[7], &x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); +} + +static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) { + btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]); + btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]); + btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]); + btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]); + btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]); + btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]); + btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]); + btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]); + btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]); + btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]); + btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]); + btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]); + btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]); + btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]); + btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]); + btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]); +} + +static void idct32_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + output[0] = x[0]; + output[31] = x[0]; + output[1] = x[1]; + output[30] = x[1]; + output[2] = x[1]; + output[29] = x[1]; + output[3] = x[0]; + output[28] = x[0]; + output[4] = x[0]; + output[27] = x[0]; + output[5] = x[1]; + output[26] = x[1]; + output[6] = x[1]; + output[25] = x[1]; + output[7] = x[0]; + output[24] = x[0]; + output[8] = x[0]; + output[23] = x[0]; + output[9] = x[1]; + output[22] = x[1]; + output[10] = x[1]; + output[21] = x[1]; + output[11] = x[0]; + output[20] = x[0]; + output[12] = x[0]; + output[19] = x[0]; + output[13] = x[1]; + output[18] = x[1]; + output[14] = x[1]; + output[17] = x[1]; + output[15] = x[0]; + output[16] = x[0]; +} + +static void idct32_low8_avx2(const __m256i 
*input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + x[4] = input[4]; + x[8] = input[2]; + x[12] = input[6]; + x[16] = input[1]; + x[20] = input[5]; + x[24] = input[3]; + x[28] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + + // stage 4 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct32_high16_stage4_avx2(x, cospi, _r, cos_bit); + + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + idct32_high24_stage5_avx2(x, cospi, _r, cos_bit); + // stage 6 + x[3] = x[0]; + x[2] = x[1]; + idct32_high28_stage6_avx2(x, cospi, _r, cos_bit); + + idct32_stage7_avx2(x, cospi, _r, cos_bit); + idct32_stage8_avx2(x, cospi, _r, cos_bit); + idct32_stage9_avx2(output, x); +} + +static void idct32_low16_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + x[2] = input[8]; + x[4] = input[4]; + x[6] = input[12]; + x[8] = input[2]; + x[10] = input[10]; + x[12] = input[6]; + x[14] = input[14]; + x[16] = input[1]; + x[18] = input[9]; + x[20] = input[5]; + x[22] = input[13]; + x[24] = input[3]; + x[26] = input[11]; + x[28] = input[7]; + x[30] = input[15]; + + // stage 2 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + idct32_high16_stage3_avx2(x); + + // stage 4 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(&x[8], &x[9]); + btf_16_adds_subs_avx2(&x[11], &x[10]); + btf_16_adds_subs_avx2(&x[12], &x[13]); + btf_16_adds_subs_avx2(&x[15], &x[14]); + idct32_high16_stage4_avx2(x, cospi, _r, cos_bit); + + // stage 5 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_avx2(&x[4], &x[5]); + btf_16_adds_subs_avx2(&x[7], &x[6]); + idct32_high24_stage5_avx2(x, cospi, _r, cos_bit); + + btf_16_adds_subs_avx2(&x[0], &x[3]); + 
btf_16_adds_subs_avx2(&x[1], &x[2]); + idct32_high28_stage6_avx2(x, cospi, _r, cos_bit); + + idct32_stage7_avx2(x, cospi, _r, cos_bit); + idct32_stage8_avx2(x, cospi, _r, cos_bit); + idct32_stage9_avx2(output, x); +} + +static void idct32_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { + (void)(cos_bit); + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + + // stage 1 + __m256i x1[32]; + x1[0] = input[0]; + x1[1] = input[16]; + x1[2] = input[8]; + x1[3] = input[24]; + x1[4] = input[4]; + x1[5] = input[20]; + x1[6] = input[12]; + x1[7] = input[28]; + x1[8] = input[2]; + x1[9] = input[18]; + x1[10] = input[10]; + x1[11] = input[26]; + x1[12] = input[6]; + x1[13] = input[22]; + x1[14] = input[14]; + x1[15] = input[30]; + x1[16] = input[1]; + x1[17] = input[17]; + x1[18] = input[9]; + x1[19] = input[25]; + x1[20] = input[5]; + x1[21] = input[21]; + x1[22] = input[13]; + x1[23] = input[29]; + x1[24] = input[3]; + x1[25] = input[19]; + x1[26] = input[11]; + x1[27] = input[27]; + x1[28] = input[7]; + x1[29] = input[23]; + x1[30] = input[15]; + x1[31] = input[31]; + + // stage 2 + btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_m18, 
cospi_p18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r, cos_bit); + + // stage 3 + btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r, cos_bit); + idct32_high16_stage3_avx2(x1); + + // stage 4 + btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + idct32_high16_stage4_avx2(x1, cospi, _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + idct32_high24_stage5_avx2(x1, cospi, _r, cos_bit); + + // stage 6 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + idct32_high28_stage6_avx2(x1, cospi, _r, cos_bit); + + idct32_stage7_avx2(x1, cospi, _r, cos_bit); + idct32_stage8_avx2(x1, cospi, _r, cos_bit); + idct32_stage9_avx2(output, x1); +} + +static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); + const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); + const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); +} + +static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i 
_r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[35]); + btf_16_adds_subs_avx2(&x[33], &x[34]); + btf_16_adds_subs_avx2(&x[39], &x[36]); + btf_16_adds_subs_avx2(&x[38], &x[37]); + btf_16_adds_subs_avx2(&x[40], &x[43]); + btf_16_adds_subs_avx2(&x[41], &x[42]); + btf_16_adds_subs_avx2(&x[47], &x[44]); + btf_16_adds_subs_avx2(&x[46], &x[45]); + btf_16_adds_subs_avx2(&x[48], &x[51]); + btf_16_adds_subs_avx2(&x[49], &x[50]); + btf_16_adds_subs_avx2(&x[55], &x[52]); + btf_16_adds_subs_avx2(&x[54], &x[53]); + btf_16_adds_subs_avx2(&x[56], &x[59]); + btf_16_adds_subs_avx2(&x[57], &x[58]); + btf_16_adds_subs_avx2(&x[63], &x[60]); + btf_16_adds_subs_avx2(&x[62], &x[61]); +} + +static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit); +} + +static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + btf_16_adds_subs_avx2(&x[16], &x[19]); + btf_16_adds_subs_avx2(&x[17], &x[18]); + btf_16_adds_subs_avx2(&x[23], &x[20]); + btf_16_adds_subs_avx2(&x[22], &x[21]); + btf_16_adds_subs_avx2(&x[24], &x[27]); + btf_16_adds_subs_avx2(&x[25], &x[26]); + btf_16_adds_subs_avx2(&x[31], &x[28]); + btf_16_adds_subs_avx2(&x[30], &x[29]); + idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); +} + +static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + 
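+  // Rotate the four middle pairs (x[18..21] against x[26..29]), then run
+  // plain add/sub butterflies across the x[32..63] half.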
btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[39]); + btf_16_adds_subs_avx2(&x[33], &x[38]); + btf_16_adds_subs_avx2(&x[34], &x[37]); + btf_16_adds_subs_avx2(&x[35], &x[36]); + btf_16_adds_subs_avx2(&x[47], &x[40]); + btf_16_adds_subs_avx2(&x[46], &x[41]); + btf_16_adds_subs_avx2(&x[45], &x[42]); + btf_16_adds_subs_avx2(&x[44], &x[43]); + btf_16_adds_subs_avx2(&x[48], &x[55]); + btf_16_adds_subs_avx2(&x[49], &x[54]); + btf_16_adds_subs_avx2(&x[50], &x[53]); + btf_16_adds_subs_avx2(&x[51], &x[52]); + btf_16_adds_subs_avx2(&x[63], &x[56]); + btf_16_adds_subs_avx2(&x[62], &x[57]); + btf_16_adds_subs_avx2(&x[61], &x[58]); + btf_16_adds_subs_avx2(&x[60], &x[59]); +} + +static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + btf_16_adds_subs_avx2(&x[16], &x[23]); + btf_16_adds_subs_avx2(&x[17], &x[22]); + btf_16_adds_subs_avx2(&x[18], &x[21]); + btf_16_adds_subs_avx2(&x[19], &x[20]); + btf_16_adds_subs_avx2(&x[31], &x[24]); + btf_16_adds_subs_avx2(&x[30], &x[25]); + btf_16_adds_subs_avx2(&x[29], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit); +} + +static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[15]); + btf_16_adds_subs_avx2(&x[1], &x[14]); + btf_16_adds_subs_avx2(&x[2], &x[13]); + btf_16_adds_subs_avx2(&x[3], &x[12]); + btf_16_adds_subs_avx2(&x[4], &x[11]); + btf_16_adds_subs_avx2(&x[5], &x[10]); + btf_16_adds_subs_avx2(&x[6], &x[9]); + btf_16_adds_subs_avx2(&x[7], &x[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x[32], &x[47]); + btf_16_adds_subs_avx2(&x[33], &x[46]); + btf_16_adds_subs_avx2(&x[34], &x[45]); + btf_16_adds_subs_avx2(&x[35], &x[44]); + btf_16_adds_subs_avx2(&x[36], &x[43]); + btf_16_adds_subs_avx2(&x[37], &x[42]); + btf_16_adds_subs_avx2(&x[38], &x[41]); + btf_16_adds_subs_avx2(&x[39], &x[40]); + btf_16_adds_subs_avx2(&x[63], 
&x[48]); + btf_16_adds_subs_avx2(&x[62], &x[49]); + btf_16_adds_subs_avx2(&x[61], &x[50]); + btf_16_adds_subs_avx2(&x[60], &x[51]); + btf_16_adds_subs_avx2(&x[59], &x[52]); + btf_16_adds_subs_avx2(&x[58], &x[53]); + btf_16_adds_subs_avx2(&x[57], &x[54]); + btf_16_adds_subs_avx2(&x[56], &x[55]); +} + +static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi, + const __m256i _r, int8_t cos_bit) { + (void)cos_bit; + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_avx2(&x[0], &x[31]); + btf_16_adds_subs_avx2(&x[1], &x[30]); + btf_16_adds_subs_avx2(&x[2], &x[29]); + btf_16_adds_subs_avx2(&x[3], &x[28]); + btf_16_adds_subs_avx2(&x[4], &x[27]); + btf_16_adds_subs_avx2(&x[5], &x[26]); + btf_16_adds_subs_avx2(&x[6], &x[25]); + btf_16_adds_subs_avx2(&x[7], &x[24]); + btf_16_adds_subs_avx2(&x[8], &x[23]); + btf_16_adds_subs_avx2(&x[9], &x[22]); + btf_16_adds_subs_avx2(&x[10], &x[21]); + btf_16_adds_subs_avx2(&x[11], &x[20]); + btf_16_adds_subs_avx2(&x[12], &x[19]); + btf_16_adds_subs_avx2(&x[13], &x[18]); + btf_16_adds_subs_avx2(&x[14], &x[17]); + btf_16_adds_subs_avx2(&x[15], &x[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit); +} + +static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) { + btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]); + btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]); + btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]); + btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]); + btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]); + btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]); + btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]); + btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]); + btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]); + btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]); + btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]); + btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]); + btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]); + btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]); + btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]); + btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]); + btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]); + btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]); + btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]); + btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]); + btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]); + btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]); + btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]); + btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], 
x[40]); + btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]); + btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]); + btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]); + btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]); + btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]); + btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]); + btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]); + btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]); +} + +static void idct64_low1_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m256i x[32]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + output[0] = x[0]; + output[63] = x[0]; + output[1] = x[1]; + output[62] = x[1]; + output[2] = x[1]; + output[61] = x[1]; + output[3] = x[0]; + output[60] = x[0]; + output[4] = x[0]; + output[59] = x[0]; + output[5] = x[1]; + output[58] = x[1]; + output[6] = x[1]; + output[57] = x[1]; + output[7] = x[0]; + output[56] = x[0]; + output[8] = x[0]; + output[55] = x[0]; + output[9] = x[1]; + output[54] = x[1]; + output[10] = x[1]; + output[53] = x[1]; + output[11] = x[0]; + output[52] = x[0]; + output[12] = x[0]; + output[51] = x[0]; + output[13] = x[1]; + output[50] = x[1]; + output[14] = x[1]; + output[49] = x[1]; + output[15] = x[0]; + output[48] = x[0]; + output[16] = x[0]; + output[47] = x[0]; + output[17] = x[1]; + output[46] = x[1]; + output[18] = x[1]; + output[45] = x[1]; + output[19] = x[0]; + output[44] = x[0]; + output[20] = x[0]; + output[43] = x[0]; + output[21] = x[1]; + output[42] = x[1]; + output[22] = x[1]; + output[41] = x[1]; + output[23] = x[0]; + output[40] = x[0]; + output[24] = x[0]; + output[39] = x[0]; + output[25] = x[1]; + output[38] = x[1]; + output[26] = x[1]; + output[37] = x[1]; + output[27] = x[0]; + output[36] = x[0]; + output[28] = x[0]; + output[35] = x[0]; + output[29] = x[1]; + output[34] = x[1]; + output[30] = x[1]; + output[33] = x[1]; + output[31] = x[0]; + output[32] = x[0]; +} + +static void idct64_low8_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = 
pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; + x[0] = input[0]; + x[8] = input[4]; + x[16] = input[2]; + x[24] = input[6]; + x[32] = input[1]; + x[40] = input[5]; + x[48] = input[3]; + x[56] = input[7]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[38] = x[39]; + x[41] = x[40]; + x[46] = x[47]; + x[49] = x[48]; + x[54] = x[55]; + x[57] = x[56]; + x[62] = x[63]; + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + x[17] = x[16]; + x[22] = x[23]; + x[25] = x[24]; + x[30] = x[31]; + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit); + + // stage 5 + x[9] = x[8]; + x[14] = x[15]; + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit); + x[35] = x[32]; + x[34] = x[33]; + x[36] = x[39]; + x[37] = x[38]; + x[43] = x[40]; + x[42] = x[41]; + x[44] = x[47]; + x[45] = x[46]; + x[51] = x[48]; + x[50] = x[49]; + x[52] = x[55]; + x[53] = x[54]; + x[59] = x[56]; + x[58] = x[57]; + x[60] = x[63]; + x[61] = x[62]; + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + x[19] = x[16]; + x[18] = x[17]; + x[20] = x[23]; + x[21] = x[22]; + x[27] = x[24]; + x[26] = x[25]; + x[28] = x[31]; + x[29] = x[30]; + idct64_stage6_high32_avx2(x, cospi, _r, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + x[11] = x[8]; + x[10] = x[9]; + x[12] = x[15]; + x[13] = x[14]; + idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); + + // stage 8 + x[7] = x[0]; + x[6] = x[1]; + x[5] = x[2]; + x[4] = x[3]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); + + idct64_stage9_avx2(x, cospi, _r, cos_bit); + idct64_stage10_avx2(x, cospi, _r, cos_bit); + idct64_stage11_avx2(output, x); +} + +static void idct64_low16_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; + x[0] = input[0]; + x[4] = input[8]; + x[8] = input[4]; + x[12] = input[12]; + x[16] = input[2]; + x[20] = input[10]; + x[24] = input[6]; + x[28] 
= input[14]; + x[32] = input[1]; + x[36] = input[9]; + x[40] = input[5]; + x[44] = input[13]; + x[48] = input[3]; + x[52] = input[11]; + x[56] = input[7]; + x[60] = input[15]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[34] = x[35]; + x[37] = x[36]; + x[38] = x[39]; + x[41] = x[40]; + x[42] = x[43]; + x[45] = x[44]; + x[46] = x[47]; + x[49] = x[48]; + x[50] = x[51]; + x[53] = x[52]; + x[54] = x[55]; + x[57] = x[56]; + x[58] = x[59]; + x[61] = x[60]; + x[62] = x[63]; + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + idct64_stage4_high32_avx2(x, cospi, _r, cos_bit); + + // stage 5 + btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct64_stage5_high48_avx2(x, cospi, _r, cos_bit); + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + idct64_stage6_high48_avx2(x, cospi, _r, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); + + // stage 8 + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); + + idct64_stage9_avx2(x, cospi, _r, cos_bit); + idct64_stage10_avx2(x, cospi, _r, cos_bit); + idct64_stage11_avx2(output, x); +} + +static void idct64_low32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m256i x[64]; 
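+  // Only the even lanes of x[] are loaded in stage 1; the odd lanes are
+  // synthesized later by the half-zero butterflies (btf_16_w16_0_avx2).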
+ x[0] = input[0]; + x[2] = input[16]; + x[4] = input[8]; + x[6] = input[24]; + x[8] = input[4]; + x[10] = input[20]; + x[12] = input[12]; + x[14] = input[28]; + x[16] = input[2]; + x[18] = input[18]; + x[20] = input[10]; + x[22] = input[26]; + x[24] = input[6]; + x[26] = input[22]; + x[28] = input[14]; + x[30] = input[30]; + x[32] = input[1]; + x[34] = input[17]; + x[36] = input[9]; + x[38] = input[25]; + x[40] = input[5]; + x[42] = input[21]; + x[44] = input[13]; + x[46] = input[29]; + x[48] = input[3]; + x[50] = input[19]; + x[52] = input[11]; + x[54] = input[27]; + x[56] = input[7]; + x[58] = input[23]; + x[60] = input[15]; + x[62] = input[31]; + + // stage 2 + btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]); + btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]); + btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]); + btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]); + btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]); + btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]); + btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]); + btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]); + btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]); + btf_16_adds_subs_avx2(&x[32], &x[33]); + btf_16_adds_subs_avx2(&x[35], &x[34]); + btf_16_adds_subs_avx2(&x[36], &x[37]); + btf_16_adds_subs_avx2(&x[39], &x[38]); + btf_16_adds_subs_avx2(&x[40], &x[41]); + btf_16_adds_subs_avx2(&x[43], &x[42]); + btf_16_adds_subs_avx2(&x[44], &x[45]); + btf_16_adds_subs_avx2(&x[47], &x[46]); + btf_16_adds_subs_avx2(&x[48], &x[49]); + btf_16_adds_subs_avx2(&x[51], &x[50]); + btf_16_adds_subs_avx2(&x[52], &x[53]); + btf_16_adds_subs_avx2(&x[55], &x[54]); + btf_16_adds_subs_avx2(&x[56], &x[57]); + btf_16_adds_subs_avx2(&x[59], &x[58]); + btf_16_adds_subs_avx2(&x[60], &x[61]); + btf_16_adds_subs_avx2(&x[63], &x[62]); + + // stage 4 + btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]); + btf_16_adds_subs_avx2(&x[16], &x[17]); + btf_16_adds_subs_avx2(&x[19], &x[18]); + btf_16_adds_subs_avx2(&x[20], &x[21]); + btf_16_adds_subs_avx2(&x[23], &x[22]); + btf_16_adds_subs_avx2(&x[24], &x[25]); + btf_16_adds_subs_avx2(&x[27], &x[26]); + btf_16_adds_subs_avx2(&x[28], &x[29]); + btf_16_adds_subs_avx2(&x[31], &x[30]); + idct64_stage4_high32_avx2(x, cospi, _r, cos_bit); + + // stage 5 + 
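+  // From stage 5 onward the low-32 path follows the full 64-point schedule,
+  // delegating to the shared idct64_stage5..stage11 helpers.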
btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_avx2(&x[8], &x[9]); + btf_16_adds_subs_avx2(&x[11], &x[10]); + btf_16_adds_subs_avx2(&x[12], &x[13]); + btf_16_adds_subs_avx2(&x[15], &x[14]); + idct64_stage5_high48_avx2(x, cospi, _r, cos_bit); + + // stage 6 + btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_avx2(&x[4], &x[5]); + btf_16_adds_subs_avx2(&x[7], &x[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit); + idct64_stage6_high48_avx2(x, cospi, _r, cos_bit); + + // stage 7 + btf_16_adds_subs_avx2(&x[0], &x[3]); + btf_16_adds_subs_avx2(&x[1], &x[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x[8], &x[11]); + btf_16_adds_subs_avx2(&x[9], &x[10]); + btf_16_adds_subs_avx2(&x[15], &x[12]); + btf_16_adds_subs_avx2(&x[14], &x[13]); + idct64_stage7_high48_avx2(x, cospi, _r, cos_bit); + + // stage 8 + btf_16_adds_subs_avx2(&x[0], &x[7]); + btf_16_adds_subs_avx2(&x[1], &x[6]); + btf_16_adds_subs_avx2(&x[2], &x[5]); + btf_16_adds_subs_avx2(&x[3], &x[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit); + idct64_stage8_high48_avx2(x, cospi, _r, cos_bit); + + // stage 9~11 + idct64_stage9_avx2(x, cospi, _r, cos_bit); + idct64_stage10_avx2(x, cospi, _r, cos_bit); + idct64_stage11_avx2(output, x); +} + +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + +// 1D functions process 16 pixels at one time. 
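+// The table below is indexed as [tx_size][1-D transform type][eob class]; the
+// eob class (via lowbd_txfm_all_1d_zeros_idx) picks the cheapest kernel that
+// still covers every nonzero coefficient. Illustrative selection for a
+// 32-point pass:
+//   eobx == 0  -> idx 0 -> idct32_low1_avx2   (DC only)
+//   eobx <  8  -> idx 1 -> idct32_low8_avx2
+//   eobx < 16  -> idx 2 -> idct32_low16_avx2
+//   otherwise  -> idx 3 -> idct32_avx2        (full transform)
+// NULL entries are size/type combinations routed to the SSSE3 path instead.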
+static const transform_1d_avx2 + lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { + { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL }, + { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, + idct64_low32_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +// only process w >= 16 h >= 16 +static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m256i buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div16 = txfm_size_col >> 4; + const int buf_size_nonzero_w_div16 = (eobx + 16) >> 4; + const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 row_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_avx2 col_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0])); + for (int i = 0; i < buf_size_nonzero_h_div16; i++) { + __m256i buf0[64]; + const int32_t *input_row = input + (i << 4) * input_stride; + for (int j = 0; j < buf_size_nonzero_w_div16; ++j) { + __m256i *buf0_cur = buf0 + j * 16; + const int32_t *input_cur = input_row + j * 16; + load_buffer_32bit_to_16bit_w16_avx2(input_cur, input_stride, buf0_cur, + 16); + transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_avx2(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + for (int j = 0; j < txfm_size_col; ++j) { + buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0); + } + + __m256i *buf1_cur = buf1 + (i << 4); + if (lr_flip) { + for (int j = 0; j < buf_size_w_div16; ++j) { + __m256i temp[16]; + flip_buf_avx2(buf0 + 16 * j, temp, 16); + int offset = txfm_size_row * (buf_size_w_div16 - 1 - j); + transpose_16bit_16x16_avx2(temp, buf1_cur + offset); + } + } else { + for (int j = 0; j < buf_size_w_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j); + } + } + } + const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1])); + for (int i = 0; i < buf_size_w_div16; i++) { + __m256i *buf1_cur = buf1 + i * txfm_size_row; + col_txfm(buf1_cur, buf1_cur, cos_bit_col); + for 
(int j = 0; j < txfm_size_row; ++j) { + buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1); + } + } + for (int i = 0; i < buf_size_w_div16; i++) { + lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i, + stride, ud_flip, txfm_size_row); + } +} + +static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input, + int stride, int shift, int height, + int txw_idx, int rect_type) { + const int32_t *input_row = input; + const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]); + const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) + + (1 << (NewSqrt2Bits - shift - 1))); + const __m256i one = _mm256_set1_epi16(1); + const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r); + if (rect_type != 1 && rect_type != -1) { + for (int i = 0; i < height; ++i) { + const __m256i src = load_32bit_to_16bit_w16_avx2(input_row); + input_row += stride; + __m256i lo = _mm256_unpacklo_epi16(src, one); + __m256i hi = _mm256_unpackhi_epi16(src, one); + lo = _mm256_madd_epi16(lo, scale__r); + hi = _mm256_madd_epi16(hi, scale__r); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm256_packs_epi32(lo, hi); + } + } else { + const __m256i rect_scale = + _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); + for (int i = 0; i < height; ++i) { + __m256i src = load_32bit_to_16bit_w16_avx2(input_row); + src = _mm256_mulhrs_epi16(src, rect_scale); + input_row += stride; + __m256i lo = _mm256_unpacklo_epi16(src, one); + __m256i hi = _mm256_unpackhi_epi16(src, one); + lo = _mm256_madd_epi16(lo, scale__r); + hi = _mm256_madd_epi16(hi, scale__r); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm256_packs_epi32(lo, hi); + } + } +} + +static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride, + __m256i *buf, int shift, int height, + int txh_idx) { + const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]); + const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1)); + const __m256i one = _mm256_set1_epi16(1); + const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r); + for (int h = 0; h < height; ++h) { + __m256i lo = _mm256_unpacklo_epi16(buf[h], one); + __m256i hi = _mm256_unpackhi_epi16(buf[h], one); + lo = _mm256_madd_epi16(lo, scale_coeff); + hi = _mm256_madd_epi16(hi, scale_coeff); + lo = _mm256_srai_epi32(lo, NewSqrt2Bits); + hi = _mm256_srai_epi32(hi, NewSqrt2Bits); + lo = _mm256_add_epi32(lo, shift__r); + hi = _mm256_add_epi32(hi, shift__r); + lo = _mm256_srai_epi32(lo, -shift); + hi = _mm256_srai_epi32(hi, -shift); + const __m256i x = _mm256_packs_epi32(lo, hi); + write_recon_w16_avx2(x, output); + output += stride; + } +} + +static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input, + uint8_t *output, int stride, + TX_SIZE tx_size, + int32_t eob) { + (void)eob; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + __m256i buf[32]; + for (int i = 0; i < input_stride; i += 16) { + iidentity_row_16xn_avx2(buf, input + i, input_stride, shift[0], row_max, 
+ txw_idx, rect_type); + iidentity_col_16xn_avx2(output + i, stride, buf, shift[1], row_max, + txh_idx); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int txfm_size_col_notzero = AOMMIN(32, txfm_size_col); + const int input_stride = txfm_size_col_notzero; + const int buf_size_w_div16 = (eobx + 16) >> 4; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 col_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_w_div16; i++) { + __m256i buf0[64]; + iidentity_row_16xn_avx2(buf0, input + (i << 4), input_stride, shift[0], + eoby + 1, txw_idx, rect_type); + col_txfm(buf0, buf0, cos_bit_col); + __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1])); + int k = ud_flip ? (txfm_size_row - 1) : 0; + const int step = ud_flip ? -1 : 1; + for (int j = 0; j < txfm_size_row; ++j, k += step) { + __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift); + write_recon_w16_avx2(res, output + (i << 4) + j * stride); + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m256i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div16 = txfm_size_col >> 4; + const int buf_size_h_div16 = (eoby + 16) >> 4; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_avx2 row_txfm = + lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + + assert(row_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_h_div16; i++) { + __m256i buf0[64]; + const int32_t *input_row = input + i * input_stride * 16; + for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) { + __m256i *buf0_cur = buf0 + j * 16; + load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride, + buf0_cur, 16); + transpose_16bit_16x16_avx2(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_avx2(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]); + __m256i *_buf1 = buf1; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div16; ++j) { + __m256i temp[16]; + flip_buf_avx2(buf0 + 16 * j, temp, 16); + transpose_16bit_16x16_avx2(temp, + 
_buf1 + 16 * (buf_size_w_div16 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j); + } + } + for (int j = 0; j < buf_size_w_div16; ++j) { + iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride, + buf1 + j * 16, shift[1], 16, txh_idx); + } + } +} + +// for 32x32,32x64,64x32,64x64,16x32,32x16,64x16,16x64 +static INLINE void lowbd_inv_txfm2d_add_universe_avx2( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + (void)eob; + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: // ADST in vertical, DCT in horizontal + case DCT_ADST: // DCT in vertical, ADST in horizontal + case ADST_ADST: // ADST in both directions + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + case IDTX: + lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + default: + av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + } +} + +void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob) { + switch (tx_size) { + case TX_4X4: + case TX_8X8: + case TX_4X8: + case TX_8X4: + case TX_8X16: + case TX_16X8: + case TX_4X16: + case TX_16X4: + case TX_8X32: + case TX_32X8: + av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_16X16: + case TX_32X32: + case TX_64X64: + case TX_16X32: + case TX_32X16: + case TX_32X64: + case TX_64X32: + case TX_16X64: + case TX_64X16: + default: + lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + const TX_TYPE tx_type = txfm_param->tx_type; + if (!txfm_param->lossless) { + av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h b/libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h new file mode 100644 index 000000000..f74cbaeaa --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_inv_txfm_avx2.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ +#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ + +#include <immintrin.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/transpose_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// half input is zero +#define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \ + { \ + const __m256i _w0 = _mm256_set1_epi16(w0 * 8); \ + const __m256i _w1 = _mm256_set1_epi16(w1 * 8); \ + const __m256i _in = in; \ + out0 = _mm256_mulhrs_epi16(_in, _w0); \ + out1 = _mm256_mulhrs_epi16(_in, _w1); \ + } + +static INLINE void round_shift_avx2(const __m256i *input, __m256i *output, + int size) { + const __m256i scale = _mm256_set1_epi16(NewInvSqrt2 * 8); + for (int i = 0; i < size; ++i) { + output[i] = _mm256_mulhrs_epi16(input[i], scale); + } +} + +static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) { + __m128i pred = _mm_loadu_si128((__m128i const *)(output)); + __m256i u = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred), res); + __m128i y = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(u, u), 168)); + _mm_storeu_si128((__m128i *)(output), y); +} + +static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output, + int stride, int flipud, + int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + write_recon_w16_avx2(in[j], output + i * stride); + } +} + +void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, TX_SIZE tx_size, + int eob); +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_ diff --git a/libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c b/libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c new file mode 100644 index 000000000..46c051ff8 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.c @@ -0,0 +1,2956 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse2.h" + +// TODO(venkatsanampudi@ittiam.com): move this to header file + +// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 +static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096, + 4 * 5793 }; + +// TODO(binpengsmail@gmail.com): replace some for loop with do {} while + +static void idct4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[4]; + x[0] = input[0]; + x[1] = input[2]; + x[2] = input[1]; + x[3] = input[3]; + + // stage 2 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + + // stage 3 + btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); + btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); +} + +static void idct4_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[4]; + x[0] = input[0]; + x[1] = input[2]; + x[2] = input[1]; + x[3] = input[3]; + + // stage 2 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + + // stage 3 + btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]); + btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]); +} + +static void idct8_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 4 + // stage 5 + output[0] = x[0]; + output[7] = x[0]; + output[1] = x[1]; + output[6] = x[1]; + output[2] = x[1]; + output[5] = x[1]; + output[3] = x[0]; + output[4] = x[0]; +} + +static void idct8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = 
pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[0]; + x[1] = input[4]; + x[2] = input[2]; + x[3] = input[6]; + x[4] = input[1]; + x[5] = input[5]; + x[6] = input[3]; + x[7] = input[7]; + + // stage 2 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + + // stage 3 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + + // stage 4 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + + // stage 5 + btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); + btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); + btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); + btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); +} + +static void idct8_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[0]; + x[1] = input[4]; + x[2] = input[2]; + x[3] = input[6]; + x[4] = input[1]; + x[5] = input[5]; + x[6] = input[3]; + x[7] = input[7]; + + // stage 2 + btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + + // stage 3 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + + // stage 4 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + + // stage 5 + btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]); + btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]); + btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]); + btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]); +} + +static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); +} + +static INLINE void 
idct16_stage6_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); +} + +static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) { + btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]); + btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]); + btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]); + btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]); + btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]); + btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]); + btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]); + btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]); +} + +static void idct16_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 5 + // stage 6 + // stage 7 + output[0] = x[0]; + output[15] = x[0]; + output[1] = x[1]; + output[14] = x[1]; + output[2] = x[1]; + output[13] = x[1]; + output[3] = x[0]; + output[12] = x[0]; + output[4] = x[0]; + output[11] = x[0]; + output[5] = x[1]; + output[10] = x[1]; + output[6] = x[1]; + output[9] = x[1]; + output[7] = x[0]; + output[8] = x[0]; +} + +static void idct16_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[0] = input[0]; + x[2] = input[4]; + x[4] = input[2]; + x[6] = input[6]; + x[8] = input[1]; + x[10] = input[5]; + x[12] = input[3]; + x[14] = input[7]; + + // stage 2 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + + // stage 3 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + + // stage 4 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + + idct16_stage5_sse2(x, cospi, __rounding, cos_bit); + idct16_stage6_sse2(x, cospi, __rounding, cos_bit); + idct16_stage7_sse2(output, x); +} + +static void idct16_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + 
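+  // cos_bit is unused here: the low-bitdepth kernels always evaluate the
+  // cosine table at the fixed INV_COS_BIT precision.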
const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[0] = input[0]; + x[1] = input[8]; + x[2] = input[4]; + x[3] = input[12]; + x[4] = input[2]; + x[5] = input[10]; + x[6] = input[6]; + x[7] = input[14]; + x[8] = input[1]; + x[9] = input[9]; + x[10] = input[5]; + x[11] = input[13]; + x[12] = input[3]; + x[13] = input[11]; + x[14] = input[7]; + x[15] = input[15]; + + // stage 2 + btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); + btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); + + // stage 3 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + + // stage 4 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + + // stage 5~7 + idct16_stage5_sse2(x, cospi, __rounding, cos_bit); + idct16_stage6_sse2(x, cospi, __rounding, cos_bit); + idct16_stage7_sse2(output, x); +} + +static void idct16_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p20_p44 = 
pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[16]; + x[0] = input[0]; + x[1] = input[8]; + x[2] = input[4]; + x[3] = input[12]; + x[4] = input[2]; + x[5] = input[10]; + x[6] = input[6]; + x[7] = input[14]; + x[8] = input[1]; + x[9] = input[9]; + x[10] = input[5]; + x[11] = input[13]; + x[12] = input[3]; + x[13] = input[11]; + x[14] = input[7]; + x[15] = input[15]; + + // stage 2 + btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); + btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); + btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); + btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); + + // stage 3 + btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + + // stage 4 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + + // stage 6 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + + // stage 7 + idct16_stage7_sse2(output, x); +} + +static INLINE void idct32_high16_stage3_sse2(__m128i *x) { + btf_16_adds_subs_sse2(x[16], x[17]); + btf_16_subs_adds_sse2(x[19], x[18]); + btf_16_adds_subs_sse2(x[20], x[21]); + btf_16_subs_adds_sse2(x[23], x[22]); + btf_16_adds_subs_sse2(x[24], x[25]); + btf_16_subs_adds_sse2(x[27], x[26]); + btf_16_adds_subs_sse2(x[28], x[29]); + btf_16_subs_adds_sse2(x[31], x[30]); +} + +static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + 
int8_t cos_bit) { + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); +} + +static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + btf_16_adds_subs_sse2(x[16], x[19]); + btf_16_adds_subs_sse2(x[17], x[18]); + btf_16_subs_adds_sse2(x[23], x[20]); + btf_16_subs_adds_sse2(x[22], x[21]); + btf_16_adds_subs_sse2(x[24], x[27]); + btf_16_adds_subs_sse2(x[25], x[26]); + btf_16_subs_adds_sse2(x[31], x[28]); + btf_16_subs_adds_sse2(x[30], x[29]); +} + +static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); +} + +static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + btf_16_adds_subs_sse2(x[16], x[23]); + btf_16_adds_subs_sse2(x[17], x[22]); + btf_16_adds_subs_sse2(x[18], x[21]); + btf_16_adds_subs_sse2(x[19], x[20]); + btf_16_subs_adds_sse2(x[31], x[24]); + btf_16_subs_adds_sse2(x[30], x[25]); + btf_16_subs_adds_sse2(x[29], x[26]); + btf_16_subs_adds_sse2(x[28], x[27]); +} + +static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i 
cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[15]); + btf_16_adds_subs_sse2(x[1], x[14]); + btf_16_adds_subs_sse2(x[2], x[13]); + btf_16_adds_subs_sse2(x[3], x[12]); + btf_16_adds_subs_sse2(x[4], x[11]); + btf_16_adds_subs_sse2(x[5], x[10]); + btf_16_adds_subs_sse2(x[6], x[9]); + btf_16_adds_subs_sse2(x[7], x[8]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); +} + +static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) { + btf_16_adds_subs_out_sse2(output[0], output[31], x[0], x[31]); + btf_16_adds_subs_out_sse2(output[1], output[30], x[1], x[30]); + btf_16_adds_subs_out_sse2(output[2], output[29], x[2], x[29]); + btf_16_adds_subs_out_sse2(output[3], output[28], x[3], x[28]); + btf_16_adds_subs_out_sse2(output[4], output[27], x[4], x[27]); + btf_16_adds_subs_out_sse2(output[5], output[26], x[5], x[26]); + btf_16_adds_subs_out_sse2(output[6], output[25], x[6], x[25]); + btf_16_adds_subs_out_sse2(output[7], output[24], x[7], x[24]); + btf_16_adds_subs_out_sse2(output[8], output[23], x[8], x[23]); + btf_16_adds_subs_out_sse2(output[9], output[22], x[9], x[22]); + btf_16_adds_subs_out_sse2(output[10], output[21], x[10], x[21]); + btf_16_adds_subs_out_sse2(output[11], output[20], x[11], x[20]); + btf_16_adds_subs_out_sse2(output[12], output[19], x[12], x[19]); + btf_16_adds_subs_out_sse2(output[13], output[18], x[13], x[18]); + btf_16_adds_subs_out_sse2(output[14], output[17], x[14], x[17]); + btf_16_adds_subs_out_sse2(output[15], output[16], x[15], x[16]); +} + +static void idct32_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[2]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + output[0] = x[0]; + output[31] = x[0]; + output[1] = x[1]; + output[30] = x[1]; + output[2] = x[1]; + output[29] = x[1]; + output[3] = x[0]; + output[28] = x[0]; + output[4] = x[0]; + output[27] = x[0]; + output[5] = x[1]; + output[26] = x[1]; + output[6] = x[1]; + output[25] = x[1]; + output[7] = x[0]; + output[24] = x[0]; + output[8] = x[0]; + output[23] = x[0]; + output[9] = x[1]; + output[22] = x[1]; + output[10] = x[1]; + output[21] = x[1]; + output[11] = x[0]; + output[20] = x[0]; + output[12] = x[0]; + output[19] = x[0]; + output[13] = x[1]; + output[18] = x[1]; + output[14] = x[1]; + output[17] = x[1]; + output[15] = x[0]; + output[16] = x[0]; +} + +static void idct32_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + x[4] = input[4]; + x[8] = input[2]; + x[12] = input[6]; + x[16] = input[1]; + x[20] = input[5]; + x[24] = input[3]; + x[28] = input[7]; + + // stage 2 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + 
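// The low1/low8/low16 variants rely on the eob-based dispatch in their
+ // callers: only the first 1, 8, or 16 input coefficients can be nonzero
+ // here. Rotations whose partner lane is known to be zero reduce to the
+ // one-input btf_16_ssse3() form above, and add/sub butterflies against a
+ // zero operand reduce to plain copies such as x[17] = x[16] below.
+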
+ // stage 3 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + + // stage 4 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + x[3] = x[0]; + x[2] = x[1]; + idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); + + idct32_stage7_sse2(x, cospi, __rounding, cos_bit); + idct32_stage8_sse2(x, cospi, __rounding, cos_bit); + idct32_stage9_sse2(output, x); +} + +static void idct32_low16_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + x[2] = input[8]; + x[4] = input[4]; + x[6] = input[12]; + x[8] = input[2]; + x[10] = input[10]; + x[12] = input[6]; + x[14] = input[14]; + x[16] = input[1]; + x[18] = input[9]; + x[20] = input[5]; + x[22] = input[13]; + x[24] = input[3]; + x[26] = input[11]; + x[28] = input[7]; + x[30] = input[15]; + + // stage 2 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + + // stage 3 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + idct32_high16_stage3_sse2(x); + + // stage 4 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); + + idct32_stage7_sse2(x, cospi, __rounding, cos_bit); + idct32_stage8_sse2(x, cospi, __rounding, cos_bit); + idct32_stage9_sse2(output, x); +} + +static void idct32_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i
cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + x[1] = input[16]; + x[2] = input[8]; + x[3] = input[24]; + x[4] = input[4]; + x[5] = input[20]; + x[6] = input[12]; + x[7] = input[28]; + x[8] = input[2]; + x[9] = input[18]; + x[10] = input[10]; + x[11] = input[26]; + x[12] = input[6]; + x[13] = input[22]; + x[14] = input[14]; + x[15] = input[30]; + x[16] = input[1]; + x[17] = input[17]; + x[18] = input[9]; + x[19] = input[25]; + x[20] = input[5]; + x[21] = input[21]; + x[22] = input[13]; + x[23] = input[29]; + x[24] = input[3]; + x[25] = input[19]; + x[26] = input[11]; + x[27] = input[27]; + x[28] = input[7]; + x[29] = input[23]; + x[30] = input[15]; + x[31] = input[31]; + + // stage 2 + btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]); + btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]); + btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]); + btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]); + + // stage 3 + btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]); + btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]); + 
btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]); + idct32_high16_stage3_sse2(x); + + // stage 4 + btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]); + btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit); + + // stage 7~9 + idct32_stage7_sse2(x, cospi, __rounding, cos_bit); + idct32_stage8_sse2(x, cospi, __rounding, cos_bit); + idct32_stage9_sse2(output, x); +} + +static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); + const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); + const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); + btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]); + btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); +} + +static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + btf_16_adds_subs_sse2(x[32], x[35]); + btf_16_adds_subs_sse2(x[33], x[34]);
+ btf_16_subs_adds_sse2(x[39], x[36]); + btf_16_subs_adds_sse2(x[38], x[37]); + btf_16_adds_subs_sse2(x[40], x[43]); + btf_16_adds_subs_sse2(x[41], x[42]); + btf_16_subs_adds_sse2(x[47], x[44]); + btf_16_subs_adds_sse2(x[46], x[45]); + btf_16_adds_subs_sse2(x[48], x[51]); + btf_16_adds_subs_sse2(x[49], x[50]); + btf_16_subs_adds_sse2(x[55], x[52]); + btf_16_subs_adds_sse2(x[54], x[53]); + btf_16_adds_subs_sse2(x[56], x[59]); + btf_16_adds_subs_sse2(x[57], x[58]); + btf_16_subs_adds_sse2(x[63], x[60]); + btf_16_subs_adds_sse2(x[62], x[61]); +} + +static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]); +} + +static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + btf_16_adds_subs_sse2(x[16], x[19]); + btf_16_adds_subs_sse2(x[17], x[18]); + btf_16_subs_adds_sse2(x[23], x[20]); + btf_16_subs_adds_sse2(x[22], x[21]); + btf_16_adds_subs_sse2(x[24], x[27]); + btf_16_adds_subs_sse2(x[25], x[26]); + btf_16_subs_adds_sse2(x[31], x[28]); + btf_16_subs_adds_sse2(x[30], x[29]); + idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); +} + +static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]); + btf_16_adds_subs_sse2(x[32], x[39]); + btf_16_adds_subs_sse2(x[33], x[38]); + btf_16_adds_subs_sse2(x[34], x[37]); + btf_16_adds_subs_sse2(x[35], x[36]); + btf_16_subs_adds_sse2(x[47], x[40]); + btf_16_subs_adds_sse2(x[46], x[41]); + btf_16_subs_adds_sse2(x[45], x[42]); + btf_16_subs_adds_sse2(x[44], x[43]); + btf_16_adds_subs_sse2(x[48], x[55]); + btf_16_adds_subs_sse2(x[49], x[54]); + btf_16_adds_subs_sse2(x[50], x[53]); + btf_16_adds_subs_sse2(x[51], x[52]); + btf_16_subs_adds_sse2(x[63], x[56]); + btf_16_subs_adds_sse2(x[62], x[57]); + btf_16_subs_adds_sse2(x[61], x[58]); + btf_16_subs_adds_sse2(x[60], x[59]); +} + +static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi, + const __m128i 
__rounding, + int8_t cos_bit) { + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + btf_16_adds_subs_sse2(x[16], x[23]); + btf_16_adds_subs_sse2(x[17], x[22]); + btf_16_adds_subs_sse2(x[18], x[21]); + btf_16_adds_subs_sse2(x[19], x[20]); + btf_16_subs_adds_sse2(x[31], x[24]); + btf_16_subs_adds_sse2(x[30], x[25]); + btf_16_subs_adds_sse2(x[29], x[26]); + btf_16_subs_adds_sse2(x[28], x[27]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]); +} + +static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[15]); + btf_16_adds_subs_sse2(x[1], x[14]); + btf_16_adds_subs_sse2(x[2], x[13]); + btf_16_adds_subs_sse2(x[3], x[12]); + btf_16_adds_subs_sse2(x[4], x[11]); + btf_16_adds_subs_sse2(x[5], x[10]); + btf_16_adds_subs_sse2(x[6], x[9]); + btf_16_adds_subs_sse2(x[7], x[8]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]); + btf_16_adds_subs_sse2(x[32], x[47]); + btf_16_adds_subs_sse2(x[33], x[46]); + btf_16_adds_subs_sse2(x[34], x[45]); + btf_16_adds_subs_sse2(x[35], x[44]); + btf_16_adds_subs_sse2(x[36], x[43]); + btf_16_adds_subs_sse2(x[37], x[42]); + btf_16_adds_subs_sse2(x[38], x[41]); + btf_16_adds_subs_sse2(x[39], x[40]); + btf_16_subs_adds_sse2(x[63], x[48]); + btf_16_subs_adds_sse2(x[62], x[49]); + btf_16_subs_adds_sse2(x[61], x[50]); + btf_16_subs_adds_sse2(x[60], x[51]); + btf_16_subs_adds_sse2(x[59], x[52]); + btf_16_subs_adds_sse2(x[58], x[53]); + btf_16_subs_adds_sse2(x[57], x[54]); + btf_16_subs_adds_sse2(x[56], x[55]); +} + +static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + btf_16_adds_subs_sse2(x[0], x[31]); + btf_16_adds_subs_sse2(x[1], x[30]); + btf_16_adds_subs_sse2(x[2], x[29]); + btf_16_adds_subs_sse2(x[3], x[28]); + btf_16_adds_subs_sse2(x[4], x[27]); + btf_16_adds_subs_sse2(x[5], x[26]); + btf_16_adds_subs_sse2(x[6], x[25]); + btf_16_adds_subs_sse2(x[7], x[24]); + btf_16_adds_subs_sse2(x[8], x[23]); + btf_16_adds_subs_sse2(x[9], x[22]); + btf_16_adds_subs_sse2(x[10], x[21]); + btf_16_adds_subs_sse2(x[11], x[20]); + btf_16_adds_subs_sse2(x[12], x[19]); + btf_16_adds_subs_sse2(x[13], x[18]); + btf_16_adds_subs_sse2(x[14], x[17]); + btf_16_adds_subs_sse2(x[15], x[16]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, 
x[40], x[55], x[40], x[55]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]); +} + +static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) { + btf_16_adds_subs_out_sse2(output[0], output[63], x[0], x[63]); + btf_16_adds_subs_out_sse2(output[1], output[62], x[1], x[62]); + btf_16_adds_subs_out_sse2(output[2], output[61], x[2], x[61]); + btf_16_adds_subs_out_sse2(output[3], output[60], x[3], x[60]); + btf_16_adds_subs_out_sse2(output[4], output[59], x[4], x[59]); + btf_16_adds_subs_out_sse2(output[5], output[58], x[5], x[58]); + btf_16_adds_subs_out_sse2(output[6], output[57], x[6], x[57]); + btf_16_adds_subs_out_sse2(output[7], output[56], x[7], x[56]); + btf_16_adds_subs_out_sse2(output[8], output[55], x[8], x[55]); + btf_16_adds_subs_out_sse2(output[9], output[54], x[9], x[54]); + btf_16_adds_subs_out_sse2(output[10], output[53], x[10], x[53]); + btf_16_adds_subs_out_sse2(output[11], output[52], x[11], x[52]); + btf_16_adds_subs_out_sse2(output[12], output[51], x[12], x[51]); + btf_16_adds_subs_out_sse2(output[13], output[50], x[13], x[50]); + btf_16_adds_subs_out_sse2(output[14], output[49], x[14], x[49]); + btf_16_adds_subs_out_sse2(output[15], output[48], x[15], x[48]); + btf_16_adds_subs_out_sse2(output[16], output[47], x[16], x[47]); + btf_16_adds_subs_out_sse2(output[17], output[46], x[17], x[46]); + btf_16_adds_subs_out_sse2(output[18], output[45], x[18], x[45]); + btf_16_adds_subs_out_sse2(output[19], output[44], x[19], x[44]); + btf_16_adds_subs_out_sse2(output[20], output[43], x[20], x[43]); + btf_16_adds_subs_out_sse2(output[21], output[42], x[21], x[42]); + btf_16_adds_subs_out_sse2(output[22], output[41], x[22], x[41]); + btf_16_adds_subs_out_sse2(output[23], output[40], x[23], x[40]); + btf_16_adds_subs_out_sse2(output[24], output[39], x[24], x[39]); + btf_16_adds_subs_out_sse2(output[25], output[38], x[25], x[38]); + btf_16_adds_subs_out_sse2(output[26], output[37], x[26], x[37]); + btf_16_adds_subs_out_sse2(output[27], output[36], x[27], x[36]); + btf_16_adds_subs_out_sse2(output[28], output[35], x[28], x[35]); + btf_16_adds_subs_out_sse2(output[29], output[34], x[29], x[34]); + btf_16_adds_subs_out_sse2(output[30], output[33], x[30], x[33]); + btf_16_adds_subs_out_sse2(output[31], output[32], x[31], x[32]); +} + +static void idct64_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + + // stage 1 + __m128i x[32]; + x[0] = input[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + output[0] = x[0]; + output[63] = x[0]; + output[1] = x[1]; + output[62] = x[1]; + output[2] = x[1]; + output[61] = x[1]; + output[3] = x[0]; + output[60] = x[0]; + output[4] = x[0]; + output[59] = x[0]; + output[5] = x[1]; + output[58] = x[1]; + output[6] = x[1]; + output[57] = x[1]; + output[7] = x[0]; + output[56] = x[0]; + output[8] = x[0]; + output[55] = x[0]; + output[9] = x[1]; + output[54] = x[1]; 
+ output[10] = x[1]; + output[53] = x[1]; + output[11] = x[0]; + output[52] = x[0]; + output[12] = x[0]; + output[51] = x[0]; + output[13] = x[1]; + output[50] = x[1]; + output[14] = x[1]; + output[49] = x[1]; + output[15] = x[0]; + output[48] = x[0]; + output[16] = x[0]; + output[47] = x[0]; + output[17] = x[1]; + output[46] = x[1]; + output[18] = x[1]; + output[45] = x[1]; + output[19] = x[0]; + output[44] = x[0]; + output[20] = x[0]; + output[43] = x[0]; + output[21] = x[1]; + output[42] = x[1]; + output[22] = x[1]; + output[41] = x[1]; + output[23] = x[0]; + output[40] = x[0]; + output[24] = x[0]; + output[39] = x[0]; + output[25] = x[1]; + output[38] = x[1]; + output[26] = x[1]; + output[37] = x[1]; + output[27] = x[0]; + output[36] = x[0]; + output[28] = x[0]; + output[35] = x[0]; + output[29] = x[1]; + output[34] = x[1]; + output[30] = x[1]; + output[33] = x[1]; + output[31] = x[0]; + output[32] = x[0]; +} + +static void idct64_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[8] = input[4]; + x[16] = input[2]; + x[24] = input[6]; + x[32] = input[1]; + x[40] = input[5]; + x[48] = input[3]; + x[56] = input[7]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[38] = x[39]; + x[41] = x[40]; + x[46] = x[47]; + x[49] = x[48]; + x[54] = x[55]; + x[57] = x[56]; + x[62] = x[63]; + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + x[17] = x[16]; + x[22] = x[23]; + x[25] = x[24]; + x[30] = x[31]; + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]); + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]); + + // stage 5 + x[9] = x[8]; + x[14] = x[15]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]); + 
btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]); + x[35] = x[32]; + x[34] = x[33]; + x[36] = x[39]; + x[37] = x[38]; + x[43] = x[40]; + x[42] = x[41]; + x[44] = x[47]; + x[45] = x[46]; + x[51] = x[48]; + x[50] = x[49]; + x[52] = x[55]; + x[53] = x[54]; + x[59] = x[56]; + x[58] = x[57]; + x[60] = x[63]; + x[61] = x[62]; + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + x[19] = x[16]; + x[18] = x[17]; + x[20] = x[23]; + x[21] = x[22]; + x[27] = x[24]; + x[26] = x[25]; + x[28] = x[31]; + x[29] = x[30]; + idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + x[11] = x[8]; + x[10] = x[9]; + x[12] = x[15]; + x[13] = x[14]; + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + x[7] = x[0]; + x[6] = x[1]; + x[5] = x[2]; + x[4] = x[3]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void idct64_low16_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[4] = input[8]; + x[8] = input[4]; + x[12] = input[12]; + x[16] = input[2]; + x[20] = input[10]; + x[24] = input[6]; + x[28] = input[14]; + x[32] = input[1]; + x[36] = input[9]; + x[40] = input[5]; + x[44] = input[13]; + x[48] = input[3]; + x[52] = input[11]; + x[56] = input[7]; + x[60] = input[15]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + x[33] = x[32]; + x[34] = x[35]; + x[37] = x[36]; + x[38] = x[39]; + x[41] = x[40]; + x[42] = x[43]; + x[45] = x[44]; + x[46] = x[47]; + x[49] = x[48]; + x[50] = x[51]; + x[53] = x[52]; + x[54] = x[55]; + x[57] = x[56]; + x[58] = x[59]; + x[61] = x[60]; + x[62] = x[63]; + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + x[17] = x[16]; + x[18] = x[19]; + x[21] = x[20]; + x[22] = x[23]; + x[25] = x[24]; + x[26] = x[27]; + x[29] = x[28]; + x[30] = x[31]; + idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); 
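+
+ // The 64-point inverse DCT runs eleven stages; the idct64_stage*_high32
+ // and idct64_stage*_high48 helpers factor out the upper coefficient ranges
+ // that are shared across the low8/low16/low32 variants, so each variant
+ // open-codes only the lanes that differ.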
+ + // stage 5 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + x[9] = x[8]; + x[10] = x[11]; + x[13] = x[12]; + x[14] = x[15]; + idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + x[5] = x[4]; + x[6] = x[7]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + x[3] = x[0]; + x[2] = x[1]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void idct64_low32_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + + // stage 1 + __m128i x[64]; + x[0] = input[0]; + x[2] = input[16]; + x[4] = input[8]; + x[6] = input[24]; + x[8] = input[4]; + x[10] = input[20]; + x[12] = input[12]; + x[14] = input[28]; + x[16] = input[2]; + x[18] = input[18]; + x[20] = input[10]; + x[22] = input[26]; + x[24] = input[6]; + x[26] = input[22]; + x[28] = input[14]; + x[30] = input[30]; + x[32] = input[1]; + x[34] = input[17]; + x[36] = input[9]; + x[38] = input[25]; + x[40] = input[5]; + x[42] = input[21]; + x[44] = input[13]; + x[46] = input[29]; + x[48] = input[3]; + x[50] = input[19]; + x[52] = input[11]; + x[54] = input[27]; + x[56] = input[7]; + x[58] = input[23]; + x[60] = input[15]; + x[62] = input[31]; + + // stage 2 + btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]); + btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]); + btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]); + btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]); + btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]); + btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]); + btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]); + btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]); + btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]); + btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]); + btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]); + btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]); + btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]); + btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]); + btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]); + btf_16_ssse3(-cospi[61], 
cospi[3], x[48], x[47], x[48]); + + // stage 3 + btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]); + btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]); + btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]); + btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]); + btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]); + btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]); + btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]); + btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]); + btf_16_adds_subs_sse2(x[32], x[33]); + btf_16_subs_adds_sse2(x[35], x[34]); + btf_16_adds_subs_sse2(x[36], x[37]); + btf_16_subs_adds_sse2(x[39], x[38]); + btf_16_adds_subs_sse2(x[40], x[41]); + btf_16_subs_adds_sse2(x[43], x[42]); + btf_16_adds_subs_sse2(x[44], x[45]); + btf_16_subs_adds_sse2(x[47], x[46]); + btf_16_adds_subs_sse2(x[48], x[49]); + btf_16_subs_adds_sse2(x[51], x[50]); + btf_16_adds_subs_sse2(x[52], x[53]); + btf_16_subs_adds_sse2(x[55], x[54]); + btf_16_adds_subs_sse2(x[56], x[57]); + btf_16_subs_adds_sse2(x[59], x[58]); + btf_16_adds_subs_sse2(x[60], x[61]); + btf_16_subs_adds_sse2(x[63], x[62]); + + // stage 4 + btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]); + btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]); + btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]); + btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]); + btf_16_adds_subs_sse2(x[16], x[17]); + btf_16_subs_adds_sse2(x[19], x[18]); + btf_16_adds_subs_sse2(x[20], x[21]); + btf_16_subs_adds_sse2(x[23], x[22]); + btf_16_adds_subs_sse2(x[24], x[25]); + btf_16_subs_adds_sse2(x[27], x[26]); + btf_16_adds_subs_sse2(x[28], x[29]); + btf_16_subs_adds_sse2(x[31], x[30]); + idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit); + + // stage 5 + btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]); + btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[9]); + btf_16_subs_adds_sse2(x[11], x[10]); + btf_16_adds_subs_sse2(x[12], x[13]); + btf_16_subs_adds_sse2(x[15], x[14]); + idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 6 + btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]); + btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]); + btf_16_adds_subs_sse2(x[4], x[5]); + btf_16_subs_adds_sse2(x[7], x[6]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]); + idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 7 + btf_16_adds_subs_sse2(x[0], x[3]); + btf_16_adds_subs_sse2(x[1], x[2]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]); + btf_16_adds_subs_sse2(x[8], x[11]); + btf_16_adds_subs_sse2(x[9], x[10]); + btf_16_subs_adds_sse2(x[15], x[12]); + btf_16_subs_adds_sse2(x[14], x[13]); + idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 8 + btf_16_adds_subs_sse2(x[0], x[7]); + btf_16_adds_subs_sse2(x[1], x[6]); + btf_16_adds_subs_sse2(x[2], x[5]); + btf_16_adds_subs_sse2(x[3], x[4]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]); + idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit); + + // stage 9~11 + idct64_stage9_sse2(x, cospi, __rounding, cos_bit); + idct64_stage10_sse2(x, cospi, __rounding, cos_bit); + idct64_stage11_sse2(output, x); +} + +static void iadst4_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *sinpi = 
sinpi_arr(INV_COS_BIT); + const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); + const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); + const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); + const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); + const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); + const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); + const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); + const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); + __m128i x0[4]; + x0[0] = input[0]; + x0[1] = input[1]; + x0[2] = input[2]; + x0[3] = input[3]; + + __m128i u[4]; + u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); + u[1] = _mm_unpackhi_epi16(x0[0], x0[2]); + u[2] = _mm_unpacklo_epi16(x0[1], x0[3]); + u[3] = _mm_unpackhi_epi16(x0[1], x0[3]); + + __m128i x1[16]; + x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 + x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04); + x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 + x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01); + x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2 + x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02); + x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4 + x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04); + x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 + x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03); + x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03); // x3*sin3 + x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03); + x1[12] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 + x1[13] = _mm_madd_epi16(u[1], sinpi_p04_p02); + x1[14] = _mm_madd_epi16(u[2], sinpi_m03_m01); // -x1*sin3 - x3*sin1 + x1[15] = _mm_madd_epi16(u[3], sinpi_m03_m01); + + __m128i x2[8]; + x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 +x2*sin4 +x1*sin3 +x3*sin2 + x2[1] = _mm_add_epi32(x1[1], x1[5]); + x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 -x2*sin1 +x1*sin3 -x3*sin4 + x2[3] = _mm_add_epi32(x1[3], x1[7]); + x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 -x2*sin3 +x3*sin3 + x2[5] = _mm_add_epi32(x1[9], x1[11]); + x2[6] = _mm_add_epi32(x1[12], x1[14]); // x0*sin4 +x2*sin2 -x1*sin3 -x3*sin1 + x2[7] = _mm_add_epi32(x1[13], x1[15]); + + const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + for (int i = 0; i < 4; ++i) { + __m128i out0 = _mm_add_epi32(x2[2 * i], rounding); + __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding); + out0 = _mm_srai_epi32(out0, INV_COS_BIT); + out1 = _mm_srai_epi32(out1, INV_COS_BIT); + output[i] = _mm_packs_epi32(out0, out1); + } +} + +static void iadst4_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *sinpi = sinpi_arr(INV_COS_BIT); + const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]); + const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]); + const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]); + const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]); + const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]); + const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]); + const __m128i sinpi_p04_p02 = pair_set_epi16(sinpi[4], sinpi[2]); + const __m128i sinpi_m03_m01 = pair_set_epi16(-sinpi[3], -sinpi[1]); + __m128i x0[4]; + x0[0] = input[0]; + x0[1] = input[1]; + x0[2] = input[2]; + x0[3] = input[3]; + + __m128i u[2]; + u[0] = _mm_unpacklo_epi16(x0[0], x0[2]); + u[1] = _mm_unpacklo_epi16(x0[1], x0[3]); + + __m128i x1[8]; + x1[0] =
_mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4 + x1[1] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1 + x1[2] = _mm_madd_epi16(u[1], sinpi_p03_p02); // x1*sin3 + x3*sin2 + x1[3] = _mm_madd_epi16(u[1], sinpi_p03_m04); // x1*sin3 - x3*sin4 + x1[4] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3 + x1[5] = _mm_madd_epi16(u[1], sinpi_0_p03); // x3*sin3 + x1[6] = _mm_madd_epi16(u[0], sinpi_p04_p02); // x0*sin4 + x2*sin2 + x1[7] = _mm_madd_epi16(u[1], sinpi_m03_m01); // -x1*sin3 - x3*sin1 + + __m128i x2[4]; + x2[0] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2 + x2[1] = _mm_add_epi32(x1[1], x1[3]); // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4 + x2[2] = _mm_add_epi32(x1[4], x1[5]); // x0*sin3 - x2*sin3 + x3*sin3 + x2[3] = _mm_add_epi32(x1[6], x1[7]); // x0*sin4 + x2*sin2 - x1*sin3 - x3*sin1 + + const __m128i rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + for (int i = 0; i < 4; ++i) { + __m128i out0 = _mm_add_epi32(x2[i], rounding); + out0 = _mm_srai_epi32(out0, INV_COS_BIT); + output[i] = _mm_packs_epi32(out0, out0); + } +} + +static void iadst8_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[1] = input[0]; + + // stage 2 + btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]); + + // stage 3 + x[4] = x[0]; + x[5] = x[1]; + + // stage 4 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + + // stage 5 + x[2] = x[0]; + x[3] = x[1]; + x[6] = x[4]; + x[7] = x[5]; + + // stage 6 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +static void iadst8_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 =
pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[7]; + x[1] = input[0]; + x[2] = input[5]; + x[3] = input[2]; + x[4] = input[3]; + x[5] = input[4]; + x[6] = input[1]; + x[7] = input[6]; + + // stage 2 + btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); + + // stage 3 + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + + // stage 4 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + + // stage 6 + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +static void iadst8_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[8]; + x[0] = input[7]; + x[1] = input[0]; + x[2] = input[5]; + x[3] = input[2]; + x[4] = input[3]; + x[5] = input[4]; + x[6] = input[1]; + x[7] = input[6]; + + // stage 2 + btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]); + + // stage 3 + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + + // stage 4 + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + + // stage 5 + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + 
btf_16_adds_subs_sse2(x[5], x[7]); + + // stage 6 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + + // stage 7 + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[4]); + output[2] = x[6]; + output[3] = _mm_subs_epi16(__zero, x[2]); + output[4] = x[3]; + output[5] = _mm_subs_epi16(__zero, x[7]); + output[6] = x[5]; + output[7] = _mm_subs_epi16(__zero, x[1]); +} + +static INLINE void iadst16_stage3_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[8]); + btf_16_adds_subs_sse2(x[1], x[9]); + btf_16_adds_subs_sse2(x[2], x[10]); + btf_16_adds_subs_sse2(x[3], x[11]); + btf_16_adds_subs_sse2(x[4], x[12]); + btf_16_adds_subs_sse2(x[5], x[13]); + btf_16_adds_subs_sse2(x[6], x[14]); + btf_16_adds_subs_sse2(x[7], x[15]); +} + +static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage5_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[4]); + btf_16_adds_subs_sse2(x[1], x[5]); + btf_16_adds_subs_sse2(x[2], x[6]); + btf_16_adds_subs_sse2(x[3], x[7]); + btf_16_adds_subs_sse2(x[8], x[12]); + btf_16_adds_subs_sse2(x[9], x[13]); + btf_16_adds_subs_sse2(x[10], x[14]); + btf_16_adds_subs_sse2(x[11], x[15]); +} + +static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); +} + +static INLINE void iadst16_stage7_ssse3(__m128i *x) { + btf_16_adds_subs_sse2(x[0], x[2]); + btf_16_adds_subs_sse2(x[1], x[3]); + btf_16_adds_subs_sse2(x[4], x[6]); + btf_16_adds_subs_sse2(x[5], x[7]); + btf_16_adds_subs_sse2(x[8], x[10]); + btf_16_adds_subs_sse2(x[9], x[11]); + btf_16_adds_subs_sse2(x[12], x[14]); + btf_16_adds_subs_sse2(x[13], x[15]); +} + +static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi, + const __m128i __rounding, + int8_t cos_bit) { + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], 
x[14], x[15]); +} + +static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) { + const __m128i __zero = _mm_setzero_si128(); + output[0] = x[0]; + output[1] = _mm_subs_epi16(__zero, x[8]); + output[2] = x[12]; + output[3] = _mm_subs_epi16(__zero, x[4]); + output[4] = x[6]; + output[5] = _mm_subs_epi16(__zero, x[14]); + output[6] = x[10]; + output[7] = _mm_subs_epi16(__zero, x[2]); + output[8] = x[3]; + output[9] = _mm_subs_epi16(__zero, x[11]); + output[10] = x[15]; + output[11] = _mm_subs_epi16(__zero, x[7]); + output[12] = x[5]; + output[13] = _mm_subs_epi16(__zero, x[13]); + output[14] = x[9]; + output[15] = _mm_subs_epi16(__zero, x[1]); +} + +static void iadst16_low1_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + + // stage 1 + __m128i x[16]; + x[1] = input[0]; + + // stage 2 + btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); + + // stage 3 + x[8] = x[0]; + x[9] = x[1]; + + // stage 4 + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + + // stage 5 + x[4] = x[0]; + x[5] = x[1]; + x[12] = x[8]; + x[13] = x[9]; + + // stage 6 + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + + // stage 7 + x[2] = x[0]; + x[3] = x[1]; + x[6] = x[4]; + x[7] = x[5]; + x[10] = x[8]; + x[11] = x[9]; + x[14] = x[12]; + x[15] = x[13]; + + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} + +static void iadst16_low8_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + // stage 1 + __m128i x[16]; + x[1] = input[0]; + x[3] = input[2]; + x[5] = input[4]; + x[7] = input[6]; + x[8] = input[7]; + x[10] = input[5]; + x[12] = input[3]; + x[14] = input[1]; + + // stage 2 + btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]); + btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]); + btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]); + btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]); + btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]); + btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]); + btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]); + btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]); + + // stage 3 + iadst16_stage3_ssse3(x); + iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage5_ssse3(x); + iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage7_ssse3(x); + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} +static void iadst16_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p54_m10 = 
pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m128i x[16]; + x[0] = input[15]; + x[1] = input[0]; + x[2] = input[13]; + x[3] = input[2]; + x[4] = input[11]; + x[5] = input[4]; + x[6] = input[9]; + x[7] = input[6]; + x[8] = input[7]; + x[9] = input[8]; + x[10] = input[5]; + x[11] = input[10]; + x[12] = input[3]; + x[13] = input[12]; + x[14] = input[1]; + x[15] = input[14]; + + // stage 2 + btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); + btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); + btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); + btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); + btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); + btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); + btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); + btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); + + // stage 3~9 + iadst16_stage3_ssse3(x); + iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage5_ssse3(x); + iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage7_ssse3(x); + iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit); + iadst16_stage9_ssse3(output, x); +} + +static void iadst16_w4_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int32_t *cospi = cospi_arr(INV_COS_BIT); + const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); + + const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + const __m128i 
cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + + // stage 1 + __m128i x[16]; + x[0] = input[15]; + x[1] = input[0]; + x[2] = input[13]; + x[3] = input[2]; + x[4] = input[11]; + x[5] = input[4]; + x[6] = input[9]; + x[7] = input[6]; + x[8] = input[7]; + x[9] = input[8]; + x[10] = input[5]; + x[11] = input[10]; + x[12] = input[3]; + x[13] = input[12]; + x[14] = input[1]; + x[15] = input[14]; + + // stage 2 + btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]); + btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]); + btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]); + + // stage 3 + iadst16_stage3_ssse3(x); + + // stage 4 + btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]); + btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]); + + // stage 5 + iadst16_stage5_ssse3(x); + + // stage 6 + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]); + btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]); + + // stage 7 + iadst16_stage7_ssse3(x); + + // stage 8 + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]); + btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]); + + // stage 9 + iadst16_stage9_ssse3(output, x); +} + +static void iidentity4_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int16_t scale_fractional = (NewSqrt2 - (1 << NewSqrt2Bits)); + const __m128i scale = _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits)); + for (int i = 0; i < 4; ++i) { + __m128i x = _mm_mulhrs_epi16(input[i], scale); + output[i] = _mm_adds_epi16(x, input[i]); + } +} + +static void iidentity8_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 8; ++i) { + output[i] = _mm_adds_epi16(input[i], input[i]); + } +} + +static void iidentity16_ssse3(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const int16_t scale_fractional = 2 * (NewSqrt2 - (1 << NewSqrt2Bits)); + const __m128i scale = 
    _mm_set1_epi16(scale_fractional << (15 - NewSqrt2Bits));
+  for (int i = 0; i < 16; ++i) {
+    __m128i x = _mm_mulhrs_epi16(input[i], scale);
+    __m128i srcx2 = _mm_adds_epi16(input[i], input[i]);
+    output[i] = _mm_adds_epi16(x, srcx2);
+  }
+}
+
+static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
+                                               __m128i res) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i x0 = _mm_adds_epi16(res, _mm_unpacklo_epi8(pred, zero));
+  return _mm_packus_epi16(x0, x0);
+}
+
+static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
+                                               int stride, int flipud,
+                                               const int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  const __m128i zero = _mm_setzero_si128();
+  for (int i = 0; i < height; ++i, j += step) {
+    const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
+    __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
+    u = _mm_packus_epi16(u, zero);
+    *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u);
+  }
+}
+
+static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
+                                               int stride, int flipud,
+                                               const int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  for (int i = 0; i < height; ++i, j += step) {
+    const __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
+    const __m128i u = lowbd_get_recon_8x8_sse2(v, in[j]);
+    _mm_storel_epi64((__m128i *)(output + i * stride), u);
+  }
+}
+
+// 1D functions process 8 pixels at one time.
+static const transform_1d_ssse3
+    lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
+      { idct4_sse2, iadst4_sse2, iidentity4_ssse3 },
+      { idct8_sse2, iadst8_sse2, iidentity8_sse2 },
+      { idct16_sse2, iadst16_sse2, iidentity16_ssse3 },
+      { idct32_sse2, NULL, NULL },
+      { idct64_low32_ssse3, NULL, NULL },
+    };
+
+// Functions for blocks whose eob is at DC or within the
+// top-left 8x8, 16x16, or 32x32 corner.
+static const transform_1d_ssse3
+    lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      {
+          { idct4_sse2, idct4_sse2, NULL, NULL },
+          { iadst4_sse2, iadst4_sse2, NULL, NULL },
+          { iidentity4_ssse3, iidentity4_ssse3, NULL, NULL },
+      },
+      { { idct8_low1_ssse3, idct8_sse2, NULL, NULL },
+        { iadst8_low1_ssse3, iadst8_sse2, NULL, NULL },
+        { iidentity8_sse2, iidentity8_sse2, NULL, NULL } },
+      {
+          { idct16_low1_ssse3, idct16_low8_ssse3, idct16_sse2, NULL },
+          { iadst16_low1_ssse3, iadst16_low8_ssse3, iadst16_sse2, NULL },
+          { NULL, NULL, NULL, NULL },
+      },
+      { { idct32_low1_ssse3, idct32_low8_ssse3, idct32_low16_ssse3,
+          idct32_sse2 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      { { idct64_low1_ssse3, idct64_low8_ssse3, idct64_low16_ssse3,
+          idct64_low32_ssse3 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+
+// 1D functions process 4 pixels at one time.
+// used in 4x4, 4x8, 4x16, 8x4, 16x4 +static const transform_1d_ssse3 + lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = { + { idct4_w4_sse2, iadst4_w4_sse2, iidentity4_ssse3 }, + { idct8_w4_sse2, iadst8_w4_sse2, iidentity8_sse2 }, + { idct16_w4_sse2, iadst16_w4_sse2, iidentity16_ssse3 }, + { NULL, NULL, NULL }, + { NULL, NULL, NULL }, + }; + +static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input, + int stride, int shift, int height, + int txw_idx, int rect_type) { + const int32_t *input_row = input; + const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]); + const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) + + (1 << (NewSqrt2Bits - shift - 1))); + const __m128i one = _mm_set1_epi16(1); + const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding); + if (rect_type != 1 && rect_type != -1) { + for (int i = 0; i < height; ++i) { + const __m128i src = load_32bit_to_16bit(input_row); + input_row += stride; + __m128i lo = _mm_unpacklo_epi16(src, one); + __m128i hi = _mm_unpackhi_epi16(src, one); + lo = _mm_madd_epi16(lo, scale_rounding); + hi = _mm_madd_epi16(hi, scale_rounding); + lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm_packs_epi32(lo, hi); + } + } else { + const __m128i rect_scale = + _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits)); + for (int i = 0; i < height; ++i) { + __m128i src = load_32bit_to_16bit(input_row); + src = _mm_mulhrs_epi16(src, rect_scale); + input_row += stride; + __m128i lo = _mm_unpacklo_epi16(src, one); + __m128i hi = _mm_unpackhi_epi16(src, one); + lo = _mm_madd_epi16(lo, scale_rounding); + hi = _mm_madd_epi16(hi, scale_rounding); + lo = _mm_srai_epi32(lo, NewSqrt2Bits - shift); + hi = _mm_srai_epi32(hi, NewSqrt2Bits - shift); + out[i] = _mm_packs_epi32(lo, hi); + } + } +} + +static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride, + __m128i *buf, int shift, int height, + int txh_idx) { + const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]); + const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1)); + const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1)); + const __m128i one = _mm_set1_epi16(1); + const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding); + const __m128i zero = _mm_setzero_si128(); + for (int h = 0; h < height; ++h) { + __m128i lo = _mm_unpacklo_epi16(buf[h], one); + __m128i hi = _mm_unpackhi_epi16(buf[h], one); + lo = _mm_madd_epi16(lo, scale_coeff); + hi = _mm_madd_epi16(hi, scale_coeff); + lo = _mm_srai_epi32(lo, NewSqrt2Bits); + hi = _mm_srai_epi32(hi, NewSqrt2Bits); + lo = _mm_add_epi32(lo, shift_rounding); + hi = _mm_add_epi32(hi, shift_rounding); + lo = _mm_srai_epi32(lo, -shift); + hi = _mm_srai_epi32(hi, -shift); + __m128i x = _mm_packs_epi32(lo, hi); + + const __m128i pred = _mm_loadl_epi64((__m128i const *)(output)); + x = _mm_adds_epi16(x, _mm_unpacklo_epi8(pred, zero)); + const __m128i u = _mm_packus_epi16(x, x); + _mm_storel_epi64((__m128i *)(output), u); + output += stride; + } +} + +static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_SIZE tx_size) { + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + 
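  // Only 2:1 aspect ratios (rect_type == +/-1) take the extra 1/sqrt(2)
+  // scaling, which iidentity_row_8xn_ssse3 applies while loading each row.
+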
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + __m128i buf[32]; + + for (int i = 0; i < (input_stride >> 3); ++i) { + iidentity_row_8xn_ssse3(buf, input + 8 * i, input_stride, shift[0], row_max, + txw_idx, rect_type); + iidentity_col_8xn_ssse3(output + 8 * i, stride, buf, shift[1], row_max, + txh_idx); + } +} + +static void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[4]; + const TX_SIZE tx_size = TX_4X4; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row); + transpose_16bit_4x4(buf, buf); + row_txfm(buf, buf, cos_bit_row); + if (lr_flip) { + __m128i temp[4]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_4x4(temp, buf); + } else { + transpose_16bit_4x4(buf, buf); + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred, + __m128i res0, __m128i res1) { + const __m128i zero = _mm_setzero_si128(); + __m128i x0 = _mm_unpacklo_epi8(pred, zero); + __m128i x1 = _mm_unpackhi_epi8(pred, zero); + x0 = _mm_adds_epi16(res0, x0); + x1 = _mm_adds_epi16(res1, x1); + return _mm_packus_epi16(x0, x1); +} + +static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output, + int stride, int flipud, + int height) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? 
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m128i u = lowbd_get_recon_16x16_sse2(v, in[j], in[j + height]); + _mm_storeu_si128((__m128i *)(output + i * stride), u); + } +} + +static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output, + int size) { + const __m128i scale = _mm_set1_epi16(NewInvSqrt2 * 8); + for (int i = 0; i < size; ++i) { + output[i] = _mm_mulhrs_epi16(input[i], scale); + } +} + +static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m128i buf1[64 * 8]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + __m128i buf0[64]; + const int32_t *input_row = input + i * input_stride * 8; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + __m128i *buf0_cur = buf0 + j * 8; + load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8); + transpose_16bit_8x8(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_ssse3(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); + __m128i *_buf1 = buf1 + i * 8; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + __m128i temp[8]; + flip_buf_sse2(buf0 + 8 * j, temp, 8); + transpose_16bit_8x8(temp, + _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j); + } + } + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col); + round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]); + } + + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2, + output + 16 * i, stride, ud_flip, + txfm_size_row); + } + } else if (txfm_size_col == 8) { + lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row); + } +} + +static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + const int8_t *shift = 
av1_inv_txfm_shift_ls[tx_size]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = (eobx + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + assert(fun_idx < 5); + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + + assert(col_txfm != NULL); + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_w_div8; i++) { + __m128i buf0[64]; + iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0], + eoby + 1, txw_idx, rect_type); + col_txfm(buf0, buf0, cos_bit_col); + __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1])); + int k = ud_flip ? (txfm_size_row - 1) : 0; + const int step = ud_flip ? -1 : 1; + uint8_t *out = output + 8 * i; + for (int j = 0; j < txfm_size_row; ++j, k += step) { + const __m128i v = _mm_loadl_epi64((__m128i const *)(out)); + __m128i res = _mm_mulhrs_epi16(buf0[k], mshift); + const __m128i u = lowbd_get_recon_8x8_sse2(v, res); + _mm_storel_epi64((__m128i *)(out), u); + out += stride; + } + } +} + +static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < buf_size_h_div8; i++) { + __m128i buf0[64]; + const int32_t *input_row = input + i * input_stride * 8; + for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) { + __m128i *buf0_cur = buf0 + j * 8; + load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8); + transpose_16bit_8x8(buf0_cur, buf0_cur); + } + if (rect_type == 1 || rect_type == -1) { + round_shift_ssse3(buf0, buf0, input_stride); // rect special code + } + row_txfm(buf0, buf0, cos_bit_row); + round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]); + __m128i *_buf1 = buf1; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + __m128i temp[8]; + flip_buf_sse2(buf0 + 8 * j, temp, 8); + transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j)); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j); + } + } + + for (int j = 0; j < buf_size_w_div8; ++j) { + iidentity_col_8xn_ssse3(output + i * 8 * 
stride + j * 8, stride, + buf1 + j * 8, shift[1], 8, txh_idx); + } + } +} + +// for 32x32,32x64,64x32,64x64,32x8,8x32,16x32,32x16,64x16,16x64 +static INLINE void lowbd_inv_txfm2d_add_universe_ssse3( + const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_type) { + case DCT_DCT: + lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + case IDTX: + lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + default: + lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +static void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[8]; + const TX_SIZE tx_size = TX_4X8; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row); + transpose_16bit_4x8(buf, buf); + round_shift_ssse3(buf, buf, txfm_size_col); // rect special code + row_txfm(buf, buf, cos_bit_row); + // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0 + if (lr_flip) { + __m128i temp[4]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_8x4(temp, buf); + } else { + transpose_16bit_8x4(buf, buf); + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[8]; + const TX_SIZE tx_size = TX_8X4; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row); + transpose_16bit_8x4(buf, buf); + round_shift_ssse3(buf, buf, txfm_size_col); // rect special code + row_txfm(buf, buf, cos_bit_row); + // round_shift_16bit_ssse3(buf, 
txfm_size_col, shift[0]); // shift[0] is 0 + if (lr_flip) { + __m128i temp[8]; + flip_buf_sse2(buf, temp, txfm_size_col); + transpose_16bit_4x8(temp, buf); + } else { + transpose_16bit_4x8(buf, buf); + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[16]; + const TX_SIZE tx_size = TX_4X16; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + const int row_one_loop = 8; + for (int i = 0; i < 2; ++i) { + const int32_t *input_cur = input + i * txfm_size_col * row_one_loop; + __m128i *buf_cur = buf + i * row_one_loop; + load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur, + row_one_loop); + transpose_16bit_4x8(buf_cur, buf_cur); + if (row_txfm == iidentity4_ssse3) { + const __m128i scale = pair_set_epi16(NewSqrt2, 3 << (NewSqrt2Bits - 1)); + const __m128i ones = _mm_set1_epi16(1); + for (int j = 0; j < 4; ++j) { + const __m128i buf_lo = _mm_unpacklo_epi16(buf_cur[j], ones); + const __m128i buf_hi = _mm_unpackhi_epi16(buf_cur[j], ones); + const __m128i buf_32_lo = + _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); + const __m128i buf_32_hi = + _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); + buf_cur[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); + } + } else { + row_txfm(buf_cur, buf_cur, cos_bit_row); + round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]); + } + if (lr_flip) { + __m128i temp[8]; + flip_buf_sse2(buf_cur, temp, txfm_size_col); + transpose_16bit_8x4(temp, buf_cur); + } else { + transpose_16bit_8x4(buf_cur, buf_cur); + } + } + col_txfm(buf, buf, cos_bit_col); + round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]); + lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row); +} + +static void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size_, + int eob) { + (void)tx_size_; + (void)eob; + __m128i buf[16]; + const TX_SIZE tx_size = TX_16X4; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx]; + const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx]; + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + + const transform_1d_ssse3 row_txfm = + lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]]; + const transform_1d_ssse3 col_txfm = + lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int row_one_loop = 8; + 
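  // Load the 16-wide block as 8x4 tiles: narrow the 32-bit coefficients to
+  // 16-bit and transpose each tile ahead of the row transform.
+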
for (int i = 0; i < buf_size_w_div8; ++i) { + const int32_t *input_cur = input + i * row_one_loop; + __m128i *buf_cur = buf + i * row_one_loop; + load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur, + txfm_size_row); + transpose_16bit_8x4(buf_cur, buf_cur); + } + if (row_txfm == iidentity16_ssse3) { + const __m128i scale = pair_set_epi16(2 * NewSqrt2, 3 << (NewSqrt2Bits - 1)); + const __m128i ones = _mm_set1_epi16(1); + for (int j = 0; j < 16; ++j) { + const __m128i buf_lo = _mm_unpacklo_epi16(buf[j], ones); + const __m128i buf_hi = _mm_unpackhi_epi16(buf[j], ones); + const __m128i buf_32_lo = + _mm_srai_epi32(_mm_madd_epi16(buf_lo, scale), (NewSqrt2Bits + 1)); + const __m128i buf_32_hi = + _mm_srai_epi32(_mm_madd_epi16(buf_hi, scale), (NewSqrt2Bits + 1)); + buf[j] = _mm_packs_epi32(buf_32_lo, buf_32_hi); + } + } else { + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); + } + if (lr_flip) { + __m128i temp[16]; + flip_buf_sse2(buf, temp, 16); + transpose_16bit_4x8(temp, buf); + transpose_16bit_4x8(temp + 8, buf + 8); + } else { + transpose_16bit_4x8(buf, buf); + transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop); + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col); + round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]); + } + lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4); + lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4); +} + +void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob) { + switch (tx_size) { + case TX_4X4: + lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_4X8: + lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_8X4: + lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_4X16: + lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + case TX_16X4: + lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size, + eob); + break; + default: + lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type, + tx_size, eob); + break; + } +} + +void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, + const TxfmParam *txfm_param) { + if (!txfm_param->lossless) { + const TX_TYPE tx_type = txfm_param->tx_type; + av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + + } else { + av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + } +} diff --git a/libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h b/libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h new file mode 100644 index 000000000..7d5055deb --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_inv_txfm_ssse3.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+
+#include <emmintrin.h>  // SSE2
+#include <tmmintrin.h>  // SSSE3
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define btf_16_ssse3(w0, w1, in, out0, out1)    \
+  do {                                          \
+    const __m128i _w0 = _mm_set1_epi16(w0 * 8); \
+    const __m128i _w1 = _mm_set1_epi16(w1 * 8); \
+    const __m128i _in = in;                     \
+    out0 = _mm_mulhrs_epi16(_in, _w0);          \
+    out1 = _mm_mulhrs_epi16(_in, _w1);          \
+  } while (0)
+
+#define btf_16_adds_subs_sse2(in0, in1) \
+  do {                                  \
+    const __m128i _in0 = in0;           \
+    const __m128i _in1 = in1;           \
+    in0 = _mm_adds_epi16(_in0, _in1);   \
+    in1 = _mm_subs_epi16(_in0, _in1);   \
+  } while (0)
+
+#define btf_16_subs_adds_sse2(in0, in1) \
+  do {                                  \
+    const __m128i _in0 = in0;           \
+    const __m128i _in1 = in1;           \
+    in1 = _mm_subs_epi16(_in0, _in1);   \
+    in0 = _mm_adds_epi16(_in0, _in1);   \
+  } while (0)
+
+#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
+  do {                                                  \
+    const __m128i _in0 = in0;                           \
+    const __m128i _in1 = in1;                           \
+    out0 = _mm_adds_epi16(_in0, _in1);                  \
+    out1 = _mm_subs_epi16(_in0, _in1);                  \
+  } while (0)
+
+static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
+  if (bit < 0) {
+    const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm_mulhrs_epi16(in[i], scale);
+    }
+  } else if (bit > 0) {
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm_slli_epi16(in[i], bit);
+    }
+  }
+}
+
+// 1D itx types
+enum {
+  IDCT_1D,
+  IADST_1D,
+  IFLIPADST_1D = IADST_1D,
+  IIDENTITY_1D,
+  ITX_TYPES_1D,
+} UENUM1BYTE(ITX_TYPE_1D);
+
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
+  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
+  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
+  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
+  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x16_default[16]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_32x32_default[32]) = {
+  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+  0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                av1_eob_to_eobxy_16x32_default[32]) = {
+  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+  0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f,
0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, + 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, +}; + +DECLARE_ALIGNED(16, static const int16_t, + av1_eob_to_eobxy_32x16_default[16]) = { + 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, + 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = { + 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07, + 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, + 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, +}; + +DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = { + 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f, +}; + +DECLARE_ALIGNED(16, static const int16_t *, + av1_eob_to_eobxy_default[TX_SIZES_ALL]) = { + NULL, + av1_eob_to_eobxy_8x8_default, + av1_eob_to_eobxy_16x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x16_default, + av1_eob_to_eobxy_16x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, + av1_eob_to_eobxy_32x32_default, + av1_eob_to_eobxy_32x32_default, + NULL, + NULL, + av1_eob_to_eobxy_8x32_default, + av1_eob_to_eobxy_32x8_default, + av1_eob_to_eobxy_16x32_default, + av1_eob_to_eobxy_32x16_default, +}; + +static const int lowbd_txfm_all_1d_zeros_idx[32] = { + 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, +}; + +// Transform block width in log2 for eob (size of 64 map to 32) +static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = { + 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5, +}; + +static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + if (eob == 1) { + *eobx = 0; + *eoby = 0; + return; + } + + const int tx_w_log2 = tx_size_wide_log2_eob[tx_size]; + const int eob_row = (eob - 1) >> tx_w_log2; + const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row]; + *eobx = eobxy & 0xFF; + *eoby = eobxy >> 8; +} + +static int eob_fill[32] = { + 0, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, +}; + +static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_col = tx_size_wide[tx_size]; + const int eobx_max = AOMMIN(32, txfm_size_col) - 1; + *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob]; + const int temp_eoby = eob / (eobx_max + 1); + assert(temp_eoby < 32); + *eoby = eob_fill[temp_eoby]; +} + +static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby, + TX_SIZE tx_size, int eob) { + eob -= 1; + const int txfm_size_row = tx_size_high[tx_size]; + const int eoby_max = AOMMIN(32, txfm_size_row) - 1; + *eobx = eob / (eoby_max + 1); + *eoby = (eob >= eoby_max) ? 
      eoby_max : eob_fill[eob];
+}
+
+typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output,
+                                   int8_t cos_bit);
+
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size, int eob);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
diff --git a/libs/libaom/src/av1/common/x86/av1_txfm_sse2.h b/libs/libaom/src/av1/common/x86/av1_txfm_sse2.h
new file mode 100644
index 000000000..77aeb6eb1
--- /dev/null
+++ b/libs/libaom/src/av1/common/x86/av1_txfm_sse2.h
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE void btf_16_w4_sse2(
+    const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
+    const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
+    __m128i *const out0, __m128i *const out1) {
+  const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
+  const __m128i u0 = _mm_madd_epi16(t0, *w0);
+  const __m128i v0 = _mm_madd_epi16(t0, *w1);
+  const __m128i a0 = _mm_add_epi32(u0, __rounding);
+  const __m128i b0 = _mm_add_epi32(v0, __rounding);
+  const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
+  const __m128i d0 = _mm_srai_epi32(b0, cos_bit);
+
+  *out0 = _mm_packs_epi32(c0, c0);
+  *out1 = _mm_packs_epi32(d0, c0);
+}
+
+#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
+  {                                                  \
+    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
+    __m128i u0 = _mm_madd_epi16(t0, w0);             \
+    __m128i v0 = _mm_madd_epi16(t0, w1);             \
+                                                     \
+    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
+    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
+                                                     \
+    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
+    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
+                                                     \
+    out0 = _mm_packs_epi32(c0, c0);                  \
+    out1 = _mm_packs_epi32(d0, d0);                  \
+  }
+
+#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
+  {                                               \
+    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
+    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
+    __m128i u0 = _mm_madd_epi16(t0, w0);          \
+    __m128i u1 = _mm_madd_epi16(t1, w0);          \
+    __m128i v0 = _mm_madd_epi16(t0, w1);          \
+    __m128i v1 = _mm_madd_epi16(t1, w1);          \
+                                                  \
+    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
+    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
+    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
+    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
+                                                  \
+    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
+    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
+    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
+    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
+                                                  \
+    out0 = _mm_packs_epi32(c0, c1);               \
+    out1 = _mm_packs_epi32(d0, d1);               \
+  }
+
+static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
+  return _mm_load_si128((const __m128i *)a);
+}
+
+static INLINE
__m128i load_32bit_to_16bit(const int32_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); +} + +static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, a_low); +} + +// Store 4 16 bit values. Sign extend the values. +static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) { + const __m128i a_lo = _mm_unpacklo_epi16(a, a); + const __m128i a_1 = _mm_srai_epi32(a_lo, 16); + _mm_store_si128((__m128i *)b, a_1); +} + +// Store 8 16 bit values. Sign extend the values. +static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) { + const __m128i a_lo = _mm_unpacklo_epi16(a, a); + const __m128i a_hi = _mm_unpackhi_epi16(a, a); + const __m128i a_1 = _mm_srai_epi32(a_lo, 16); + const __m128i a_2 = _mm_srai_epi32(a_hi, 16); + _mm_store_si128((__m128i *)b, a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +} + +static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) { + const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m128i b = _mm_madd_epi16(a, scale_rounding); + return _mm_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a, + int32_t *const b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_lo = _mm_unpacklo_epi16(a, one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + _mm_store_si128((__m128i *)b, b_lo); +} + +static INLINE void store_rect_16bit_to_32bit(const __m128i a, + int32_t *const b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_lo = _mm_unpacklo_epi16(a, one); + const __m128i a_hi = _mm_unpackhi_epi16(a, one); + const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2); + const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2); + _mm_store_si128((__m128i *)b, b_lo); + _mm_store_si128((__m128i *)(b + 4), b_hi); +} + +static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in, + const int stride, + __m128i *const out, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in, + const int stride, + __m128i *const out, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_16bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in, + int stride, __m128i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit_w4(in + i * stride); + } +} + +static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in, + int stride, __m128i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = 
load_32bit_to_16bit(in + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_16bit_to_32bit_w4(in[i], out + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_16bit_to_32bit(in[i], out + i * stride); + } +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_w4(in[i], out + i * stride); + } +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in, + int32_t *const out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit(in[i], out + i * stride); + } +} + +static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in, + uint16_t *out, + const int stride) { + for (int i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)(out + i * stride), in[i]); + } +} + +static INLINE void round_shift_16bit(__m128i *in, int size, int bit) { + if (bit < 0) { + bit = -bit; + __m128i rounding = _mm_set1_epi16(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[i] = _mm_adds_epi16(in[i], rounding); + in[i] = _mm_srai_epi16(in[i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[i] = _mm_slli_epi16(in[i], bit); + } + } +} + +static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +void 
av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd); + +typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output, + int8_t cos_bit); + +typedef struct { + transform_1d_sse2 col, row; // vertical and horizontal +} transform_2d_sse2; + +#ifdef __cplusplus +} +#endif // __cplusplus +#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_ diff --git a/libs/libaom/src/av1/common/x86/av1_txfm_sse4.c b/libs/libaom/src/av1/common/x86/av1_txfm_sse4.c new file mode 100644 index 000000000..65ccd1952 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_txfm_sse4.c @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse4.h" + +void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) { + __m128i *const vec = (__m128i *)arr; + const int vec_size = size >> 2; + av1_round_shift_array_32_sse4_1(vec, vec, vec_size, bit); +} diff --git a/libs/libaom/src/av1/common/x86/av1_txfm_sse4.h b/libs/libaom/src/av1/common/x86/av1_txfm_sse4.h new file mode 100644 index 000000000..6cad821b1 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/av1_txfm_sse4.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ +#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ + +#include <smmintrin.h> + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) { + __m128i tmp, round; + round = _mm_set1_epi32(1 << (bit - 1)); + tmp = _mm_add_epi32(vec, round); + return _mm_srai_epi32(tmp, bit); +} + +static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input, + __m128i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_sse4_1(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm_slli_epi32(input[i], -bit); + } + } +} + +static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input, + __m128i *output, + const int size, + const int bit, + const int val) { + const __m128i sqrt2 = _mm_set1_epi32(val); + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + const __m128i r0 = av1_round_shift_32_sse4_1(input[i], bit); + const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); + } + } else { + int i; + for (i = 0; i < size; i++) { + const __m128i r0 = _mm_slli_epi32(input[i], -bit); + const __m128i r1 = _mm_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_sse4_1(r1, NewSqrt2Bits); + } + } +} + +#ifdef __cplusplus +} +#endif + +#endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_ diff --git a/libs/libaom/src/av1/common/x86/cfl_avx2.c b/libs/libaom/src/av1/common/x86/cfl_avx2.c new file mode 100644 index 000000000..d9c6f99d5 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/cfl_avx2.c @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +#include "av1/common/x86/cfl_simd.h" + +#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \ + cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \ + TX_SIZE tx_size) { \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + cfl_subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \ + cfl_subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \ + cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \ + cfl_subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \ + cfl_subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \ + cfl_subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \ + cfl_subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \ + cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \ + cfl_subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \ + cfl_subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \ + cfl_subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \ + cfl_subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + return subfn_##sub[tx_size]; \ + } + +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + * + * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. + */ +static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width;  // Forever 32 + const __m256i twos = _mm256_set1_epi8(2);  // Thirty two twos + const int luma_stride = input_stride << 1; + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); + + __m256i top_16x16 = _mm256_maddubs_epi16(top, twos); + __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos); + __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16); + + _mm256_storeu_si256(row, sum_16x16); + + input += luma_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd) + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
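+ * + * For example, two horizontally adjacent luma pixels a and b produce + * (a + b) * 4 == ((a + b) / 2) << 3, i.e. the average of the pair expressed + * in Q3 with no intermediate rounding. 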
+ */ +static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + const __m256i fours = _mm256_set1_epi8(4); // Thirty two fours + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i top_16x16 = _mm256_maddubs_epi16(top, fours); + _mm256_storeu_si256(row, top_16x16); + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd) + +/** + * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only + * performed on block of width 32. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + const __m256i zeros = _mm256_setzero_si256(); + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + top = _mm256_permute4x64_epi64(top, _MM_SHUFFLE(3, 1, 2, 0)); + + __m256i row_lo = _mm256_unpacklo_epi8(top, zeros); + row_lo = _mm256_slli_epi16(row_lo, 3); + __m256i row_hi = _mm256_unpackhi_epi8(top, zeros); + row_hi = _mm256_slli_epi16(row_hi, 3); + + _mm256_storeu_si256(row, row_lo); + _mm256_storeu_si256(row + 1, row_hi); + + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + * + * Note: For 4:2:0 luma subsampling, the width will never be greater than 16. + */ +static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + const int luma_stride = input_stride << 1; + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride)); + __m256i sum = _mm256_add_epi16(top, bot); + + __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); + __m256i bot_1 = _mm256_loadu_si256((__m256i *)(input + 16 + input_stride)); + __m256i sum_1 = _mm256_add_epi16(top_1, bot_1); + + __m256i hsum = _mm256_hadd_epi16(sum, sum_1); + hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); + hsum = _mm256_add_epi16(hsum, hsum); + + _mm256_storeu_si256(row, hsum); + + input += luma_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd) + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. 
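+ * + * For example, a 2x2 grid of luma pixels a, b, c, d produces + * (a + b + c + d) * 2 == ((a + b + c + d) / 4) << 3, i.e. the average of the + * grid expressed in Q3 with no intermediate rounding. 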
+ * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + * + */ +static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); + __m256i hsum = _mm256_hadd_epi16(top, top_1); + hsum = _mm256_permute4x64_epi64(hsum, _MM_SHUFFLE(3, 1, 2, 0)); + hsum = _mm256_slli_epi16(hsum, 2); + + _mm256_storeu_si256(row, hsum); + + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd) + +static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, int width, + int height) { + (void)width; // Forever 32 + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + __m256i top = _mm256_loadu_si256((__m256i *)input); + __m256i top_1 = _mm256_loadu_si256((__m256i *)(input + 16)); + _mm256_storeu_si256(row, _mm256_slli_epi16(top, 3)); + _mm256_storeu_si256(row + 1, _mm256_slli_epi16(top_1, 3)); + input += input_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12, + __m256i alpha_sign, __m256i dc_q0) { + __m256i ac_q3 = _mm256_loadu_si256(input); + __m256i ac_sign = _mm256_sign_epi16(alpha_sign, ac_q3); + __m256i scaled_luma_q0 = + _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm256_sign_epi16(scaled_luma_q0, ac_sign); + return _mm256_add_epi16(scaled_luma_q0, dc_q0); +} + +static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + (void)width; + const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); + const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); + const __m256i dc_q0 = _mm256_set1_epi16(*dst); + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + + do { + __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + res = _mm256_packus_epi16(res, next); + res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i *)dst, res); + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_PREDICT_X(avx2, 32, 8, lbd); +CFL_PREDICT_X(avx2, 32, 16, lbd); +CFL_PREDICT_X(avx2, 32, 32, lbd); + +cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) { + static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = { + cfl_predict_lbd_4x4_ssse3, /* 4x4 */ + cfl_predict_lbd_8x8_ssse3, /* 8x8 */ + cfl_predict_lbd_16x16_ssse3, /* 16x16 */ + cfl_predict_lbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_lbd_4x8_ssse3, /* 4x8 */ + cfl_predict_lbd_8x4_ssse3, /* 8x4 */ + cfl_predict_lbd_8x16_ssse3, /* 8x16 */ + cfl_predict_lbd_16x8_ssse3, /* 16x8 */ + cfl_predict_lbd_16x32_ssse3, /* 16x32 */ + 
cfl_predict_lbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_lbd_4x16_ssse3, /* 4x16 */ + cfl_predict_lbd_16x4_ssse3, /* 16x4 */ + cfl_predict_lbd_8x32_ssse3, /* 8x32 */ + cfl_predict_lbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the + // function pointer array out of bounds. + return pred[tx_size % TX_SIZES_ALL]; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static __m256i highbd_max_epi16(int bd) { + const __m256i neg_one = _mm256_set1_epi16(-1); + // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) + return _mm256_xor_si256(_mm256_slli_epi16(neg_one, bd), neg_one); +} + +static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) { + return _mm256_max_epi16(_mm256_min_epi16(u, max), zero); +} + +static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + // Use SSSE3 version for smaller widths + assert(width == 16 || width == 32); + const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3); + const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9); + const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst); + const __m256i max = highbd_max_epi16(bd); + + __m256i *row = (__m256i *)pred_buf_q3; + const __m256i *row_end = row + height * CFL_BUF_LINE_I256; + do { + const __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + _mm256_storeu_si256((__m256i *)dst, + highbd_clamp_epi16(res, _mm256_setzero_si256(), max)); + if (width == 32) { + const __m256i res_1 = + predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + _mm256_storeu_si256( + (__m256i *)(dst + 16), + highbd_clamp_epi16(res_1, _mm256_setzero_si256(), max)); + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I256) < row_end); +} + +CFL_PREDICT_X(avx2, 16, 4, hbd) +CFL_PREDICT_X(avx2, 16, 8, hbd) +CFL_PREDICT_X(avx2, 16, 16, hbd) +CFL_PREDICT_X(avx2, 16, 32, hbd) +CFL_PREDICT_X(avx2, 32, 8, hbd) +CFL_PREDICT_X(avx2, 32, 16, hbd) +CFL_PREDICT_X(avx2, 32, 32, hbd) + +cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) { + static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = { + cfl_predict_hbd_4x4_ssse3, /* 4x4 */ + cfl_predict_hbd_8x8_ssse3, /* 8x8 */ + cfl_predict_hbd_16x16_avx2, /* 16x16 */ + cfl_predict_hbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_hbd_4x8_ssse3, /* 4x8 */ + cfl_predict_hbd_8x4_ssse3, /* 8x4 */ + cfl_predict_hbd_8x16_ssse3, /* 8x16 */ + cfl_predict_hbd_16x8_avx2, /* 16x8 */ + cfl_predict_hbd_16x32_avx2, /* 16x32 */ + cfl_predict_hbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_hbd_4x16_ssse3, /* 4x16 */ + cfl_predict_hbd_16x4_avx2, /* 16x4 */ + cfl_predict_hbd_8x32_ssse3, /* 8x32 */ + cfl_predict_hbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the + // function pointer array out of bounds. + return pred[tx_size % TX_SIZES_ALL]; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Returns a vector where all the (32-bits) elements are the sum of all the +// lanes in a. 
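+// For reference, treating a as eight int32_t lanes a[0..7], the shuffle +// sequence below computes s = a[0] + a[1] + ... + a[7] and returns a vector +// with every lane equal to s; in scalar form: +//   int32_t s = a[0] + a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7]; +//   for (int i = 0; i < 8; ++i) out[i] = s; +// (a[] and out[] are hypothetical int32_t views of the input and result +// vectors, shown only to make the intent of the shuffles explicit.) 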
+static INLINE __m256i fill_sum_epi32(__m256i a) { + // Given that a == [A, B, C, D, E, F, G, H] + a = _mm256_hadd_epi32(a, a); + // Given that A' == A + B, C' == C + D, E' == E + F, G' == G + H + // a == [A', C', A', C', E', G', E', G'] + a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)); + // a == [A', C', E', G', A', C', E', G'] + a = _mm256_hadd_epi32(a, a); + // Given that A'' == A' + C' and E'' == E' + G' + // a == [A'', E'', A'', E'', A'', E'', A'', E''] + return _mm256_hadd_epi32(a, a); + // Given that A''' == A'' + E'' + // a == [A''', A''', A''', A''', A''', A''', A''', A'''] +} + +static INLINE __m256i _mm256_addl_epi16(__m256i a) { + return _mm256_add_epi32(_mm256_unpacklo_epi16(a, _mm256_setzero_si256()), + _mm256_unpackhi_epi16(a, _mm256_setzero_si256())); +} + +static INLINE void subtract_average_avx2(const uint16_t *src_ptr, + int16_t *dst_ptr, int width, + int height, int round_offset, + int num_pel_log2) { + // Use SSE2 version for smaller widths + assert(width == 16 || width == 32); + + const __m256i *src = (__m256i *)src_ptr; + const __m256i *const end = src + height * CFL_BUF_LINE_I256; + // To maximize usage of the AVX2 registers, we sum two rows per loop + // iteration + const int step = 2 * CFL_BUF_LINE_I256; + + __m256i sum = _mm256_setzero_si256(); + // For width 32, we use a second sum accumulator to reduce accumulator + // dependencies in the loop. + __m256i sum2; + if (width == 32) sum2 = _mm256_setzero_si256(); + + do { + // Add top row to the bottom row + __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src), + _mm256_loadu_si256(src + CFL_BUF_LINE_I256)); + sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0)); + if (width == 32) { /* Don't worry: this "if" gets optimized out. */ + // Add the second part of the top row to the second part of the bottom row + __m256i l1 = + _mm256_add_epi16(_mm256_loadu_si256(src + 1), + _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256)); + sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1)); + } + src += step; + } while (src < end); + // Combine both sum accumulators + if (width == 32) sum = _mm256_add_epi32(sum, sum2); + + __m256i fill = fill_sum_epi32(sum); + + __m256i avg_epi16 = _mm256_srli_epi32( + _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2); + avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16); + + // Store and subtract loop + src = (__m256i *)src_ptr; + __m256i *dst = (__m256i *)dst_ptr; + do { + _mm256_storeu_si256(dst, + _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16)); + if (width == 32) { + _mm256_storeu_si256( + dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16)); + } + src += CFL_BUF_LINE_I256; + dst += CFL_BUF_LINE_I256; + } while (src < end); +} + +// Declare wrappers for AVX2 sizes +CFL_SUB_AVG_X(avx2, 16, 4, 32, 6) +CFL_SUB_AVG_X(avx2, 16, 8, 64, 7) +CFL_SUB_AVG_X(avx2, 16, 16, 128, 8) +CFL_SUB_AVG_X(avx2, 16, 32, 256, 9) +CFL_SUB_AVG_X(avx2, 32, 8, 128, 8) +CFL_SUB_AVG_X(avx2, 32, 16, 256, 9) +CFL_SUB_AVG_X(avx2, 32, 32, 512, 10) + +// Based on the observation that for small blocks AVX2 does not outperform +// SSE2, we call the SSE2 code for block widths 4 and 8. 
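+// For reference, each wrapper above expands subtract_average_avx2 so that +// the block average is avg = (sum + round_offset) >> num_pel_log2, with +// round_offset equal to 1 << (num_pel_log2 - 1); e.g. a 16x4 block has 64 +// pixels, so num_pel_log2 == 6 and round_offset == 32, giving a rounded +// rather than truncated mean that is then subtracted from every sample. 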
+cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) { + static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { + cfl_subtract_average_4x4_sse2, /* 4x4 */ + cfl_subtract_average_8x8_sse2, /* 8x8 */ + cfl_subtract_average_16x16_avx2, /* 16x16 */ + cfl_subtract_average_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_sse2, /* 4x8 */ + cfl_subtract_average_8x4_sse2, /* 8x4 */ + cfl_subtract_average_8x16_sse2, /* 8x16 */ + cfl_subtract_average_16x8_avx2, /* 16x8 */ + cfl_subtract_average_16x32_avx2, /* 16x32 */ + cfl_subtract_average_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_sse2, /* 4x16 */ + cfl_subtract_average_16x4_avx2, /* 16x4 */ + cfl_subtract_average_8x32_sse2, /* 8x32 */ + cfl_subtract_average_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ + }; + // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to + // index the function pointer array out of bounds. + return sub_avg[tx_size % TX_SIZES_ALL]; +} diff --git a/libs/libaom/src/av1/common/x86/cfl_simd.h b/libs/libaom/src/av1/common/x86/cfl_simd.h new file mode 100644 index 000000000..03ae02a92 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/cfl_simd.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_ +#define AOM_AV1_COMMON_X86_CFL_SIMD_H_ + +#include "av1/common/blockd.h" + +// SSSE3 version is optimal for width == 4, we reuse them in AVX2 +void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for width == 8, we reuse it in AVX2 +void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for width == 16, we reuse it in AVX2 +void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for width == 4, we reuse them in AVX2 +void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for width == 8, we reuse it in AVX2 +void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for width == 16, we reuse it in AVX2 +void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for width == 4, we reuse them in AVX2 +void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for width == 8, we reuse it in AVX2 +void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); + +// SSSE3 version is optimal for width == 16, we reuse it in AVX2 +void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *cfl_type, + int input_stride, uint16_t *output_q3); + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for width == 8, we reuse it in AVX2 +void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is faster for width == 16, we reuse it in AVX2 +void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for width == 8, we reuse it in AVX2 +void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is faster for width == 16, we reuse it in AVX2 +void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is optimal for width == 8, we reuse it in AVX2 +void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *cfl_type, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); + +// SSSE3 version is faster for width == 16, we reuse it in AVX2 +void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *cfl_type, + int input_stride, uint16_t *output_q3); +#endif  // CONFIG_AV1_HIGHBITDEPTH + +// SSE2 version is optimal for width == 4, we reuse them in AVX2 +void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst); + +// SSE2 version is optimal for width == 8, we reuse them in AVX2 +void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst); + +void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); + +#if CONFIG_AV1_HIGHBITDEPTH +void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int 
dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); + +void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +#endif  // CONFIG_AV1_HIGHBITDEPTH +#endif  // AOM_AV1_COMMON_X86_CFL_SIMD_H_ diff --git a/libs/libaom/src/av1/common/x86/cfl_sse2.c b/libs/libaom/src/av1/common/x86/cfl_sse2.c new file mode 100644 index 000000000..4783fe098 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/cfl_sse2.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "av1/common/cfl.h" +#include "config/av1_rtcd.h" + +static INLINE __m128i fill_sum_epi32(__m128i l0) { + l0 = _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2))); + return _mm_add_epi32(l0, _mm_shuffle_epi32(l0, _MM_SHUFFLE(2, 3, 0, 1))); +} + +static INLINE void subtract_average_sse2(const uint16_t *src_ptr, + int16_t *dst_ptr, int width, + int height, int round_offset, + int num_pel_log2) { + const __m128i zeros = _mm_setzero_si128(); + const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset); + const __m128i *src = (__m128i *)src_ptr; + const __m128i *const end = src + height * CFL_BUF_LINE_I128; + const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4)); + + __m128i sum = zeros; + do { + __m128i l0; + if (width == 4) { + l0 = _mm_add_epi16(_mm_loadl_epi64(src), + _mm_loadl_epi64(src + CFL_BUF_LINE_I128)); + __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128), + _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128)); + sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), + _mm_unpacklo_epi16(l1, zeros))); + } else { + if (width == 8) { + l0 = _mm_add_epi16(_mm_loadu_si128(src), + _mm_loadu_si128(src + CFL_BUF_LINE_I128)); + } else { + l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1)); + } + sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), + _mm_unpackhi_epi16(l0, zeros))); + if (width == 32) { + l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3)); + sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros), + _mm_unpackhi_epi16(l0, zeros))); + } + } + src += step; + } while (src < end); + + sum = fill_sum_epi32(sum); + + __m128i avg_epi16 = + _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2); + avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16); + + src = (__m128i *)src_ptr; + __m128i *dst = (__m128i *)dst_ptr; + do { + if (width == 4) { + _mm_storel_epi64(dst, 
_mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16)); + } else { + _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16)); + if (width > 8) { + _mm_storeu_si128(dst + 1, + _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16)); + if (width == 32) { + _mm_storeu_si128(dst + 2, + _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16)); + _mm_storeu_si128(dst + 3, + _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16)); + } + } + } + src += CFL_BUF_LINE_I128; + dst += CFL_BUF_LINE_I128; + } while (src < end); +} + +CFL_SUB_AVG_FN(sse2) diff --git a/libs/libaom/src/av1/common/x86/cfl_ssse3.c b/libs/libaom/src/av1/common/x86/cfl_ssse3.c new file mode 100644 index 000000000..476b6609a --- /dev/null +++ b/libs/libaom/src/av1/common/x86/cfl_ssse3.c @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tmmintrin.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/cfl.h" + +#include "av1/common/x86/cfl_simd.h" + +// Load 32-bit integer from memory into the first element of dst. +static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) { + return _mm_cvtsi32_si128(*((int *)mem_addr)); +} + +// Store 32-bit integer from the first element of a into memory. +static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) { + *((int *)mem_addr) = _mm_cvtsi128_si32(a); +} + +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
+ */ +static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const __m128i twos = _mm_set1_epi8(2); + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + __m128i top = _mm_loadh_epi32((__m128i *)input); + top = _mm_maddubs_epi16(top, twos); + __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride)); + bot = _mm_maddubs_epi16(bot, twos); + const __m128i sum = _mm_add_epi16(top, bot); + _mm_storeh_epi32(pred_buf_m128i, sum); + } else if (width == 8) { + __m128i top = _mm_loadl_epi64((__m128i *)input); + top = _mm_maddubs_epi16(top, twos); + __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); + bot = _mm_maddubs_epi16(bot, twos); + const __m128i sum = _mm_add_epi16(top, bot); + _mm_storel_epi64(pred_buf_m128i, sum); + } else { + __m128i top = _mm_loadu_si128((__m128i *)input); + top = _mm_maddubs_epi16(top, twos); + __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); + bot = _mm_maddubs_epi16(bot, twos); + const __m128i sum = _mm_add_epi16(top, bot); + _mm_storeu_si128(pred_buf_m128i, sum); + if (width == 32) { + __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + __m128i bot_1 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); + top_1 = _mm_maddubs_epi16(top_1, twos); + bot_1 = _mm_maddubs_epi16(bot_1, twos); + __m128i sum_1 = _mm_add_epi16(top_1, bot_1); + _mm_storeu_si128(pred_buf_m128i + 1, sum_1); + } + } + input += luma_stride; + pred_buf_m128i += CFL_BUF_LINE_I128; + } while (pred_buf_m128i < end); +} + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. + */ +static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const __m128i fours = _mm_set1_epi8(4); + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + __m128i top = _mm_loadh_epi32((__m128i *)input); + top = _mm_maddubs_epi16(top, fours); + _mm_storeh_epi32(pred_buf_m128i, top); + } else if (width == 8) { + __m128i top = _mm_loadl_epi64((__m128i *)input); + top = _mm_maddubs_epi16(top, fours); + _mm_storel_epi64(pred_buf_m128i, top); + } else { + __m128i top = _mm_loadu_si128((__m128i *)input); + top = _mm_maddubs_epi16(top, fours); + _mm_storeu_si128(pred_buf_m128i, top); + if (width == 32) { + __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + top_1 = _mm_maddubs_epi16(top_1, fours); + _mm_storeu_si128(pred_buf_m128i + 1, top_1); + } + } + input += input_stride; + pred_buf_m128i += CFL_BUF_LINE_I128; + } while (pred_buf_m128i < end); +} + +/** + * Multiplies the pixels by 8 (scaling in Q3). + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
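+ * + * For example, a single luma pixel a is stored as a << 3 == 8 * a, i.e. the + * value of a represented exactly in Q3. 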
+ */ +static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const __m128i zeros = _mm_setzero_si128(); + const int luma_stride = input_stride; + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + __m128i row = _mm_loadh_epi32((__m128i *)input); + row = _mm_unpacklo_epi8(row, zeros); + _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3)); + } else if (width == 8) { + __m128i row = _mm_loadl_epi64((__m128i *)input); + row = _mm_unpacklo_epi8(row, zeros); + _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3)); + } else { + __m128i row = _mm_loadu_si128((__m128i *)input); + const __m128i row_lo = _mm_unpacklo_epi8(row, zeros); + const __m128i row_hi = _mm_unpackhi_epi8(row, zeros); + _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3)); + _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3)); + if (width == 32) { + __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros); + const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros); + _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3)); + _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3)); + } + } + input += luma_stride; + pred_buf_m128i += CFL_BUF_LINE_I128; + } while (pred_buf_m128i < end); +} + +#if CONFIG_AV1_HIGHBITDEPTH +/** + * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more + * precise version of a box filter 4:2:0 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
+ */ +static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE; + const int luma_stride = input_stride << 1; + do { + if (width == 4) { + const __m128i top = _mm_loadl_epi64((__m128i *)input); + const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride)); + __m128i sum = _mm_add_epi16(top, bot); + sum = _mm_hadd_epi16(sum, sum); + *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum)); + } else { + const __m128i top = _mm_loadu_si128((__m128i *)input); + const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride)); + __m128i sum = _mm_add_epi16(top, bot); + if (width == 8) { + sum = _mm_hadd_epi16(sum, sum); + _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); + } else { + const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i bot_1 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1); + sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1)); + _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum)); + if (width == 32) { + const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); + const __m128i bot_2 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2); + const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); + const __m128i bot_3 = + _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3); + const __m128i sum_2 = _mm_add_epi16(top_2, bot_2); + const __m128i sum_3 = _mm_add_epi16(top_3, bot_3); + __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, + _mm_add_epi16(next_sum, next_sum)); + } + } + } + input += luma_stride; + } while ((pred_buf_q3 += CFL_BUF_LINE) < end); +} + +/** + * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more + * precise version of a box filter 4:2:2 pixel subsampling in Q3. + * + * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the + * active area is specified using width and height. + * + * Note: We don't need to worry about going over the active area, as long as we + * stay inside the CfL prediction buffer. 
+ */ +static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3; + const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128; + do { + if (width == 4) { + const __m128i top = _mm_loadl_epi64((__m128i *)input); + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); + _mm_storeh_epi32(pred_buf_m128i, sum); + } else { + const __m128i top = _mm_loadu_si128((__m128i *)input); + if (width == 8) { + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2); + _mm_storel_epi64(pred_buf_m128i, sum); + } else { + const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1); + const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2); + _mm_storeu_si128(pred_buf_m128i, sum); + if (width == 32) { + const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2); + const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3); + const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2); + _mm_storeu_si128(pred_buf_m128i + 1, sum_1); + } + } + } + pred_buf_m128i += CFL_BUF_LINE_I128; + input += input_stride; + } while (pred_buf_m128i < end); +} + +static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input, + int input_stride, + uint16_t *pred_buf_q3, + int width, int height) { + const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE; + do { + if (width == 4) { + const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3); + _mm_storel_epi64((__m128i *)pred_buf_q3, row); + } else { + const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3); + _mm_storeu_si128((__m128i *)pred_buf_q3, row); + if (width >= 16) { + __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1); + row_1 = _mm_slli_epi16(row_1, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1); + if (width == 32) { + __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2); + row_2 = _mm_slli_epi16(row_2, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2); + __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3); + row_3 = _mm_slli_epi16(row_3, 3); + _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3); + } + } + } + input += input_stride; + pred_buf_q3 += CFL_BUF_LINE; + } while (pred_buf_q3 < end); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +CFL_GET_SUBSAMPLE_FUNCTION(ssse3) + +static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12, + __m128i alpha_sign, __m128i dc_q0) { + __m128i ac_q3 = _mm_loadu_si128(input); + __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3); + __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12); + scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign); + return _mm_add_epi16(scaled_luma_q0, dc_q0); +} + +static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3, + uint8_t *dst, int dst_stride, + int alpha_q3, int width, int height) { + const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + const __m128i dc_q0 = _mm_set1_epi16(*dst); + __m128i *row = (__m128i *)pred_buf_q3; + const __m128i *row_end = row + height * CFL_BUF_LINE_I128; + do { + __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + if (width < 16) { + res = _mm_packus_epi16(res, res); + if (width == 4) + _mm_storeh_epi32((__m128i *)dst, res); + else + _mm_storel_epi64((__m128i *)dst, res); + } else { + __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, 
dc_q0); + res = _mm_packus_epi16(res, next); + _mm_storeu_si128((__m128i *)dst, res); + if (width == 32) { + res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); + next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); + res = _mm_packus_epi16(res, next); + _mm_storeu_si128((__m128i *)(dst + 16), res); + } + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I128) < row_end); +} + +CFL_PREDICT_FN(ssse3, lbd) + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE __m128i highbd_max_epi16(int bd) { + const __m128i neg_one = _mm_set1_epi16(-1); + // (1 << bd) - 1 => -(-1 << bd) -1 => -1 - (-1 << bd) => -1 ^ (-1 << bd) + return _mm_xor_si128(_mm_slli_epi16(neg_one, bd), neg_one); +} + +static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) { + return _mm_max_epi16(_mm_min_epi16(u, max), zero); +} + +static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3, + uint16_t *dst, int dst_stride, + int alpha_q3, int bd, int width, + int height) { + const __m128i alpha_sign = _mm_set1_epi16(alpha_q3); + const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9); + const __m128i dc_q0 = _mm_set1_epi16(*dst); + const __m128i max = highbd_max_epi16(bd); + const __m128i zeros = _mm_setzero_si128(); + __m128i *row = (__m128i *)pred_buf_q3; + const __m128i *row_end = row + height * CFL_BUF_LINE_I128; + do { + __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0); + res = highbd_clamp_epi16(res, zeros, max); + if (width == 4) { + _mm_storel_epi64((__m128i *)dst, res); + } else { + _mm_storeu_si128((__m128i *)dst, res); + } + if (width >= 16) { + const __m128i res_1 = + predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128(((__m128i *)dst) + 1, + highbd_clamp_epi16(res_1, zeros, max)); + } + if (width == 32) { + const __m128i res_2 = + predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128((__m128i *)(dst + 16), + highbd_clamp_epi16(res_2, zeros, max)); + const __m128i res_3 = + predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0); + _mm_storeu_si128((__m128i *)(dst + 24), + highbd_clamp_epi16(res_3, zeros, max)); + } + dst += dst_stride; + } while ((row += CFL_BUF_LINE_I128) < row_end); +} + +CFL_PREDICT_FN(ssse3, hbd) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/av1/common/x86/convolve_2d_avx2.c b/libs/libaom/src/av1/common/x86/convolve_2d_avx2.c new file mode 100644 index 000000000..e19575d72 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/convolve_2d_avx2.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/convolve.h" + +void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bd = 8; + int im_stride = 8; + int i, is_horiz_4tap = 0, is_vert_4tap = 0; + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + assert(conv_params->round_0 > 0); + + const __m256i round_const_h = _mm256_set1_epi16( + ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + const __m256i sum_round_v = _mm256_set1_epi32( + (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + const __m256i round_const_v = _mm256_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift_v = _mm_cvtsi32_si128(bits); + + __m256i filt[4], coeffs_h[4], coeffs_v[4]; + + filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_h[0], coeffs_h[3]), 0))) + is_horiz_4tap = 1; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_v[0], coeffs_v[3]), 0))) + is_vert_4tap = 1; + + // horz_filt as 4 tap and vert_filt as 8 tap + if (is_horiz_4tap) { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // horz-filter + for (int j = 0; j < w; j += 8) { + for (i = 0; i < (im_h - 2); i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + // Load the next line + data = _mm256_inserti128_si256( + data, + _mm_loadu_si128( + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), + 1); + __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), + round_shift_h); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + + __m256i data_1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt); + res = + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + + // vert filter + CONVOLVE_SR_VERTICAL_FILTER_8TAP; + } + } else if (is_vert_4tap) { + int im_h = h + 3; + const int fo_vert = 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * 
src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + // horz_filter + CONVOLVE_SR_HORIZONTAL_FILTER_8TAP; + // vert_filter + __m256i s[6]; + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(src_0, src_1); + s[1] = _mm256_unpacklo_epi16(src_2, src_3); + s[3] = _mm256_unpackhi_epi16(src_0, src_1); + s[4] = _mm256_unpackhi_epi16(src_2, src_3); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s4 = + _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); + const __m256i s5 = + _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); + + s[2] = _mm256_unpacklo_epi16(s4, s5); + s[5] = _mm256_unpackhi_epi16(s4, s5); + + __m256i res_a = convolve_4tap(s, coeffs_v + 1); + __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1); + + // Combine V round and 2F-H-V round into a single rounding + res_a = + _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); + res_b = + _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + // 8 bit conversion and saturation to uint8 + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + + // Store values into the destination buffer + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; + if (w - j > 4) { + _mm_storel_epi64(p_0, res_0); + _mm_storel_epi64(p_1, res_1); + } else if (w == 4) { + xx_storel_32(p_0, res_0); + xx_storel_32(p_1, res_1); + } else { + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + + s[0] = s[1]; + s[1] = s[2]; + s[3] = s[4]; + s[4] = s[5]; + } + } + } else { + int j; + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (j = 0; j < w; j += 8) { + CONVOLVE_SR_HORIZONTAL_FILTER_8TAP; + + CONVOLVE_SR_VERTICAL_FILTER_8TAP; + } + } +} + +static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); + s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32)); + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); + _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]); + 
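For reference, every width-specialized branch of the copy kernels in this file reduces to the same scalar loop: two rows per iteration (h is always even here), each row moved verbatim. A minimal sketch under those assumptions -- the name convolve_2d_copy_ref is hypothetical, not a libaom symbol:

#include <stdint.h>
#include <string.h>

// Scalar behavior of the copy kernels: the "convolve" is a pure copy.
static void convolve_2d_copy_ref(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h) {
  do {
    memcpy(dst, src, w * sizeof(*src));  // first row of the pair
    src += src_stride;
    dst += dst_stride;
    memcpy(dst, src, w * sizeof(*src));  // second row of the pair
    src += src_stride;
    dst += dst_stride;
    h -= 2;
  } while (h);
}

The SIMD branches exist only to replace memcpy with the widest load/store pairs available for each block width.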
_mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]); +} + +void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + memmove(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m256i s[2]; + s[0] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + s[1] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, s[0]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); + src += src_stride; + s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32)); + src += src_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} diff --git a/libs/libaom/src/av1/common/x86/convolve_2d_sse2.c b/libs/libaom/src/av1/common/x86/convolve_2d_sse2.c new file mode 100644 index 000000000..5376ea79b --- /dev/null +++ b/libs/libaom/src/av1/common/x86/convolve_2d_sse2.c @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "av1/common/convolve.h" + +void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + + assert(conv_params->round_0 > 0); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, 
res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i sum_round = + _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); + const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - + ((1 << (offset_bits - conv_params->round_1)) >> 1)); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift); + __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift); + + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + // Accumulate values into the destination buffer + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + + if (w == 2) { + *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res); + } else if (w == 4) { + *(uint32_t *)p = _mm_cvtsi128_si32(res); + } else { + _mm_storel_epi64(p, res); + } + } + } + } +} + +static INLINE void copy_128(const uint8_t *src, uint8_t *dst) { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); + s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16)); + s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16)); + s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16)); + s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16)); + _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); + _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]); + _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]); + _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]); + _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]); +} + +void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + memmove(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 4 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m128i s[4]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 
16)); + src += src_stride; + s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); + src += src_stride; + s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16)); + s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16)); + s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16)); + s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]); + _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]); + _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]); + _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]); + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} + +void av1_dist_wtd_convolve_2d_copy_sse2( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const __m128i zero = _mm_setzero_si128(); + const __m128i left_shift = _mm_cvtsi32_si128(bits); + int i, j; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + assert((w % 4) == 0); + + if (!(w % 16)) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]); + + const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero); + const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero); + + const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift); + const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const); + + const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift); + const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, 
offset_const); + + if (do_average) { + const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j])); + const __m128i data_ref_0_hi = + _mm_loadu_si128((__m128i *)(&dst[j + 8])); + + const __m128i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + + const __m128i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result_hi = convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = + _mm_packus_epi16(round_result_lo, round_result_hi); + + _mm_store_si128((__m128i *)(&dst0[j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo); + _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi); + } + } + src += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]); + const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero); + + const __m128i res = _mm_sll_epi16(d16_0, left_shift); + const __m128i res_unsigned = _mm_add_epi16(res, offset_const); + + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[j]), res_8); + else + *(uint32_t *)(&dst0[j]) = _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[j]), res_unsigned); + } + } + src += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + } + } +} diff --git a/libs/libaom/src/av1/common/x86/convolve_avx2.c b/libs/libaom/src/av1/common/x86/convolve_avx2.c new file mode 100644 index 000000000..1d5bc6fbd --- /dev/null +++ b/libs/libaom/src/av1/common/x86/convolve_avx2.c @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" + +void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + int i, j, is_vert_4tap = 0; + // right shift is F-1 because we are already dividing + // filter coefficients by 2 + const int right_shift_bits = (FILTER_BITS - 1); + const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits); + const __m256i right_shift_const = + _mm256_set1_epi16((1 << right_shift_bits) >> 1); + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + __m256i coeffs[4], s[8]; + __m128i d[6]; + + prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_vert_4tap = 1; + + // vert_filt as 4 tap + if (is_vert_4tap) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + + // Load lines a and b. 
Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); + + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); + + s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); + + d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + const __m256i src_56a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20); + + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); + + const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + + s[3] = s[4]; + s[4] = s[5]; + } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadu_si128((__m128i 
*)(data + 4 * src_stride)); + d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + // Load lines a and b. Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); + + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + const __m256i src_56a = + _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20); + + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + + s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); + s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + const __m256i src_67a = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + + const __m256i res_lo = convolve_lowbd(s, coeffs); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd(s + 4, coeffs); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = 
s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bits = FILTER_BITS - conv_params->round_0; + + const __m256i round_0_const = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); + const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); + const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + int i, is_horiz_4tap = 0; + (void)filter_params_y; + (void)subpel_y_qn; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + assert(conv_params->round_0 > 0); + + __m256i coeffs[4], filt[4]; + filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_horiz_4tap = 1; + + // horz_filt as 4 tap + if (is_horiz_4tap) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination 
buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/convolve_sse2.c b/libs/libaom/src/av1/common/x86/convolve_sse2.c new file mode 100644 index 000000000..4323ac4d1 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/convolve_sse2.c @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "av1/common/convolve.h" + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m128i *const coeffs /* [4] */) { + const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel( + filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + coeffs[0] = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 0 1 0 1 0 1 0 1 + coeffs[1] = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + coeffs[2] = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + coeffs[3] = _mm_unpackhi_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 +} + +static INLINE __m128i convolve(const __m128i *const s, + const __m128i *const coeffs) { + const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]); + const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]); + const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]); + const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]); + const __m128i d = _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3)); + return d; +} + +static INLINE __m128i convolve_lo_x(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_lo_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_hi_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_vert * src_stride; + const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS); + __m128i coeffs[4]; + + (void)filter_params_x; + (void)subpel_x_qn; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); + + if (w <= 4) { + __m128i s[8], src6, res, res_round, res16; + uint32_t res_int; + src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * 
src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6); + + do { + s[6] = _mm_unpacklo_epi8( + src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride))); + src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6); + + res = convolve_lo_y(s + 0, coeffs); + res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); + res16 = _mm_packs_epi32(res_round, res_round); + res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); + + if (w == 2) + *(uint16_t *)dst = (uint16_t)res_int; + else + *(uint32_t *)dst = res_int; + + src_ptr += src_stride; + dst += dst_stride; + + res = convolve_lo_y(s + 1, coeffs); + res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift); + res16 = _mm_packs_epi32(res_round, res_round); + res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16)); + + if (w == 2) + *(uint16_t *)dst = (uint16_t)res_int; + else + *(uint32_t *)dst = res_int; + + src_ptr += src_stride; + dst += dst_stride; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + h -= 2; + } while (h); + } else { + assert(!(w % 8)); + int j = 0; + do { + __m128i s[8], src6, res_lo, res_hi; + __m128i res_lo_round, res_hi_round, res16, res; + const uint8_t *data = &src_ptr[j]; + + src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); + + int i = 0; + do { + data = &src_ptr[i * src_stride + j]; + s[6] = _mm_unpacklo_epi8( + src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); + src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); + + res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + 
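The two statements around this point implement the standard single-stage convolve rounding: add half of 1 << FILTER_BITS, arithmetic-shift right, then saturate while packing down to bytes. A scalar sketch of one output pixel of this vertical pass, assuming libaom's FILTER_BITS value of 7 and a hypothetical helper name:

#include <stdint.h>

#define REF_FILTER_BITS 7  // assumed: libaom's FILTER_BITS

static uint8_t convolve_y_ref_pixel(const uint8_t *src, int src_stride,
                                    const int16_t filter[8]) {
  int32_t sum = 0;
  // 8-tap dot product down a column, as convolve_lo_y/convolve_hi_y compute.
  for (int k = 0; k < 8; ++k) sum += filter[k] * src[k * src_stride];
  // Round by (1 << FILTER_BITS) >> 1, shift, then clamp to [0, 255] --
  // the job done here by _mm_packs_epi32 followed by _mm_packus_epi16.
  sum = (sum + ((1 << REF_FILTER_BITS) >> 1)) >> REF_FILTER_BITS;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}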
res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels + + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + i++; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + } while (i < h); + j += 8; + } while (j < w); + } +} + +void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_0_const = + _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1); + const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift = _mm_cvtsi32_si128(bits); + __m128i coeffs[4]; + + (void)filter_params_y; + (void)subpel_y_qn; + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); + + if (w <= 4) { + do { + const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); + __m128i s[4]; + + s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); + s[1] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); + s[2] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); + s[3] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); + const __m128i res_lo = convolve_lo_x(s, coeffs); + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); + res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + uint32_t r = _mm_cvtsi128_si32(res); + if (w == 2) + *(uint16_t *)dst = (uint16_t)r; + else + *(uint32_t *)dst = r; + + src_ptr += src_stride; + dst += dst_stride; + } while (--h); + } else { + assert(!(w % 8)); + int i = 0; + do { + int j = 0; + do { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + __m128i s[4]; + + // Filter even-index pixels + s[0] = data; + s[1] = _mm_srli_si128(data, 2); + s[2] = _mm_srli_si128(data, 4); + s[3] = _mm_srli_si128(data, 6); + const __m128i res_even = convolve_lo_x(s, coeffs); + + // Filter odd-index pixels + s[0] = _mm_srli_si128(data, 1); + s[1] = _mm_srli_si128(data, 3); + s[2] = _mm_srli_si128(data, 5); + s[3] = _mm_srli_si128(data, 7); + const __m128i res_odd = convolve_lo_x(s, coeffs); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift); + res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), + round_shift); + __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift); + res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const), + round_shift); + + const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res = _mm_packus_epi16(res16, res16); + + _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res); + j += 8; + } while (j < w); + } while (++i < h); + } +} diff --git a/libs/libaom/src/av1/common/x86/filterintra_sse4.c b/libs/libaom/src/av1/common/x86/filterintra_sse4.c new file mode 100644 index 000000000..99f4d9967 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/filterintra_sse4.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <smmintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/enums.h" +#include "av1/common/reconintra.h" + +void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, + TX_SIZE tx_size, const uint8_t *above, + const uint8_t *left, int mode) { + int r, c; + uint8_t buffer[33][33]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + + assert(bw <= 32 && bh <= 32); + + for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; + memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); + + const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]); + const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]); + const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]); + const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]); + const __m128i filter_intra_scale_bits = + _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS)); + + for (r = 1; r < bh + 1; r += 2) { + for (c = 1; c < bw + 1; c += 4) { + DECLARE_ALIGNED(16, uint8_t, p[8]); + memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t)); + p[5] = buffer[r][c - 1]; + p[6] = buffer[r + 1][c - 1]; + p[7] = 0; + const __m128i p_b = xx_loadl_64(p); + const __m128i in = _mm_unpacklo_epi64(p_b, p_b); + const __m128i out_01 = _mm_maddubs_epi16(in, f1f0); + const __m128i out_23 = _mm_maddubs_epi16(in, f3f2); + const __m128i out_45 = _mm_maddubs_epi16(in, f5f4); + const __m128i out_67 = _mm_maddubs_epi16(in, f7f6); + const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23); + const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67); + const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567); + // Rounding + const __m128i round_w = + _mm_mulhrs_epi16(out_01234567, filter_intra_scale_bits); + const __m128i out_r = _mm_packus_epi16(round_w, round_w); + const __m128i out_r1 = _mm_srli_si128(out_r, 4); + // Storing + xx_storel_32(&buffer[r][c], out_r); + xx_storel_32(&buffer[r + 1][c], out_r1); + } + } + + for (r = 0; r < bh; ++r) { + 
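For the patch computation in the loops above: each 4x2 output block is a signed 7-tap combination of its neighbors p[0..6] (top-left, four above, two to the left), rounded by FILTER_INTRA_SCALE_BITS and clamped to 8 bits. A scalar sketch under those assumptions -- the helper name is hypothetical, and taps stands in for one av1_filter_intra_taps[mode] table (8 outputs x 8 taps; the 8th tap is ignored because p[7] is zeroed above):

#include <stdint.h>

#define REF_FI_SCALE_BITS 4  // assumed: libaom's FILTER_INTRA_SCALE_BITS

static void filter_intra_patch_ref(const uint8_t p[7], const int8_t taps[8][8],
                                   uint8_t out[8] /* 4x2 patch, row-major */) {
  for (int k = 0; k < 8; ++k) {
    int32_t sum = 0;
    // Signed 7-tap combination of the neighbors, one row of taps per output.
    for (int i = 0; i < 7; ++i) sum += taps[k][i] * p[i];
    // Round-to-nearest shift, as _mm_mulhrs_epi16 does above, then clamp.
    const int32_t v =
        (sum + (1 << (REF_FI_SCALE_BITS - 1))) >> REF_FI_SCALE_BITS;
    out[k] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}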
memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); + dst += stride; + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c new file mode 100644 index 000000000..396aed01b --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_avx2.c @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[8], coeffs_y[4], coeffs_x[4]; + + const __m256i round_const_x = _mm256_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m256i round_const_y = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = _mm256_set1_epi16(0); + if (i + 1 < im_h) + row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] = _mm256_unpacklo_epi16(s2, s3); + s[2] = _mm256_unpacklo_epi16(s4, s5); + + s[4] = _mm256_unpackhi_epi16(s0, s1); + s[5] = _mm256_unpackhi_epi16(s2, s3); + s[6] = _mm256_unpackhi_epi16(s4, s5); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + const __m256i res_a = convolve(s, coeffs_y); + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_y), round_shift_y); + + res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_y), round_shift_y); + res_b_round = + _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits), + round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_16bit, 1)); + } else if (w == 4) { + res_a_round = _mm256_packs_epi32(res_a_round, 
res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } else { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +static INLINE void copy_64(const uint16_t *src, uint16_t *dst) { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); + _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); +} + +static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { + __m256i s[8]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16)); + s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16)); + s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16)); + s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16)); + s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16)); + + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); + _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]); + _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]); + _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]); + _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]); + _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]); +} + +void av1_highbd_convolve_2d_copy_sr_avx2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + (void)bd; + + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + memmove(dst, src, 2 * sizeof(*src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = 
_mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m256i s[2]; + s[0] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + s[1] = _mm256_loadu_si256((__m256i *)src); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, s[0]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m256i s[4]; + s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + src += src_stride; + s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16)); + s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]); + _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + copy_64(src, dst); + src += src_stride; + dst += dst_stride; + copy_64(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c new file mode 100644 index 000000000..f758775ee --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse2.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <emmintrin.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" + +static INLINE void copy_64(const uint16_t *src, uint16_t *dst) { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); + s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); + s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); + _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); + _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); + _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); + _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); +} + +static INLINE void copy_128(const uint16_t *src, uint16_t *dst) { + __m128i s[16]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8)); + s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8)); + s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8)); + s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8)); + s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8)); + s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8)); + s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8)); + s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8)); + s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8)); + s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8)); + s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8)); + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); + _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]); + _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]); + _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]); + _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]); + _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]); + _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]); + _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]); + _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]); + _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]); + _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]); + _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]); + _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]); +} + +void av1_highbd_convolve_2d_copy_sr_sse2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + (void)conv_params; + (void)bd; + if (w >= 16) { + assert(!((intptr_t)dst % 16)); + assert(!(dst_stride % 16)); + } + + if (w == 2) { + do { + __m128i s = _mm_loadl_epi64((__m128i *)src); + *(uint32_t *)dst = _mm_cvtsi128_si32(s); + src += src_stride; + dst += dst_stride; + s = _mm_loadl_epi64((__m128i *)src); + *(uint32_t *)dst = _mm_cvtsi128_si32(s); + src += src_stride; + dst +=
dst_stride; + h -= 2; + } while (h); + } else if (w == 4) { + do { + __m128i s[2]; + s[0] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + s[1] = _mm_loadl_epi64((__m128i *)src); + src += src_stride; + _mm_storel_epi64((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 8) { + do { + __m128i s[2]; + s[0] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + s[1] = _mm_loadu_si128((__m128i *)src); + src += src_stride; + _mm_store_si128((__m128i *)dst, s[0]); + dst += dst_stride; + _mm_store_si128((__m128i *)dst, s[1]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 16) { + do { + __m128i s[4]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + src += src_stride; + s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 32) { + do { + __m128i s[8]; + s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + src += src_stride; + s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8)); + s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8)); + s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8)); + s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8)); + src += src_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]); + dst += dst_stride; + _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]); + _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]); + _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]); + _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]); + dst += dst_stride; + h -= 2; + } while (h); + } else if (w == 64) { + do { + copy_64(src, dst); + src += src_stride; + dst += dst_stride; + copy_64(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } else { + do { + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + copy_128(src, dst); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h); + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c new file mode 100644 index 000000000..d2ff47c1f --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_sse4.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <emmintrin.h> +#include <smmintrin.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "aom_dsp/x86/convolve_sse4_1.h" +#include "av1/common/convolve.h" + +void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + (void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const __m128i zero = _mm_setzero_si128(); + int i, j; + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const __m128i offset_const_16b = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + assert(bits <= 4); + + if (!(w % 8)) { + for (i = 0; i < h; i += 1) { + for (j = 0; j < w; j += 8) { + const __m128i src_16bit = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); + const __m128i res = _mm_sll_epi16(src_16bit, left_shift); + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); + + const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero); + const __m128i res_unsigned_lo = + _mm_add_epi32(res_32b_lo, offset_const); + + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); + const __m128i res_unsigned_hi = + _mm_add_epi32(res_32b_hi, offset_const); + + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m128i res_unsigned_16b = + _mm_adds_epu16(res, offset_const_16b); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + } + } + } + } else if (!(w % 4)) { + for (i = 0; i < h; i += 2) { + for (j = 0; j
< w; j += 4) { + const __m128i src_row_0 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j])); + const __m128i src_row_1 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride])); + const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1); + + const __m128i res = _mm_sll_epi16(src_10, left_shift); + + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i res_32b = _mm_unpacklo_epi16(res, zero); + const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const); + + const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); + const __m128i res_unsigned_hi = + _mm_add_epi32(res_32b_hi, offset_const); + + const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( + &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_1 = _mm_srli_si128(res_clip, 8); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m128i res_unsigned_16b = + _mm_adds_epu16(res, offset_const_16b); + + const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 
<< rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << 
conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + + const __m128i res_unsigned_lo = + _mm_add_epi32(res_lo_round, offset_const); + + if (w < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0); + + const __m128i comp_avg_res = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result = highbd_convolve_rounding_sse2( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result, round_result); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m128i res_16b = + _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b); + } + } else { + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_unsigned_hi = + _mm_add_epi32(res_hi_round, offset_const); + + if (do_average) { + const __m128i data_lo = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_hi = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4])); + + const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo); + const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi); + + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = + highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi = + highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m128i res_16b = + _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b); + } + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c new file mode 100644 index 000000000..5318fcaa8 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_convolve_2d_ssse3.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" +#include "av1/common/convolve.h" + +void av1_highbd_convolve_2d_sr_ssse3( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + __m128i coeffs_x[4], coeffs_y[4], s[16]; + + const __m128i round_const_x = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m128i round_const_y = + _mm_set1_epi32(((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int bits = + FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ?
4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + _mm_store_si128((__m128i *)&im_block[i * im_stride], res); + } + } + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = + _mm_sra_epi32(_mm_add_epi32(res_a0, round_const_y), round_shift_y); + res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = + _mm_sra_epi32(_mm_add_epi32(res_a1, round_const_y), round_shift_y); + res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, round_const_y), round_shift_y); + res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b_round0, round_const_bits), round_shift_bits); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i 
res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_y), round_shift_y); + res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b_round1, round_const_bits), round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + *((uint32_t *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c b/libs/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c new file mode 100644 index 000000000..93e98e4b3 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_inv_txfm_avx2.c @@ -0,0 +1,4246 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <immintrin.h> +#include <assert.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +// Note: +// 32x4 = 128 registers in total represent the 32x32 block of coefficients. +// For high bit depth, each coefficient is 4 bytes. +// Each __m256i register holds 8 coefficients. +// So each "row" needs 4 registers, and there are 32 rows in total. +// Register layout: +// v0, v1, v2, v3, +// v4, v5, v6, v7, +// ... ...
+// v124, v125, v126, v127 + +static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) { + const __m256i zero = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one); + __m256i clamped, mask; + + mask = _mm256_cmpgt_epi16(u, max); + clamped = _mm256_andnot_si256(mask, u); + mask = _mm256_and_si256(mask, max); + clamped = _mm256_or_si256(mask, clamped); + mask = _mm256_cmpgt_epi16(clamped, zero); + clamped = _mm256_and_si256(clamped, mask); + + return clamped; +} + +static INLINE void round_shift_4x4_avx2(__m256i *in, int shift) { + if (shift != 0) { + __m256i rnding = _mm256_set1_epi32(1 << (shift - 1)); + in[0] = _mm256_add_epi32(in[0], rnding); + in[1] = _mm256_add_epi32(in[1], rnding); + in[2] = _mm256_add_epi32(in[2], rnding); + in[3] = _mm256_add_epi32(in[3], rnding); + + in[0] = _mm256_srai_epi32(in[0], shift); + in[1] = _mm256_srai_epi32(in[1], shift); + in[2] = _mm256_srai_epi32(in[2], shift); + in[3] = _mm256_srai_epi32(in[3], shift); + } +} + +static INLINE void round_shift_8x8_avx2(__m256i *in, int shift) { + round_shift_4x4_avx2(in, shift); + round_shift_4x4_avx2(in + 4, shift); + round_shift_4x4_avx2(in + 8, shift); + round_shift_4x4_avx2(in + 12, shift); +} + +static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out, + const __m256i *clamp_lo, + const __m256i *clamp_hi, int size) { + __m256i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm256_max_epi32(in[i], *clamp_lo); + out[i] = _mm256_min_epi32(a0, *clamp_hi); + + a1 = _mm256_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm256_min_epi32(a1, *clamp_hi); + + a0 = _mm256_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm256_min_epi32(a0, *clamp_hi); + + a1 = _mm256_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm256_min_epi32(a1, *clamp_hi); + } +} + +static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred, + __m256i res0, __m256i res1, + const int bd) { + __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred)); + __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1)); + + x0 = _mm256_add_epi32(res0, x0); + x1 = _mm256_add_epi32(res1, x1); + x0 = _mm256_packus_epi32(x0, x1); + x0 = _mm256_permute4x64_epi64(x0, 0xd8); + x0 = highbd_clamp_epi16_avx2(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride)); + __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd); + + _mm256_storeu_si256((__m256i *)(output + i * stride), u); + } +} +static INLINE __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res, + const int bd) { + __m256i x0 = pred; + x0 = _mm256_add_epi32(res, x0); + x0 = _mm256_packus_epi32(x0, x0); + x0 = _mm256_permute4x64_epi64(x0, 0xd8); + x0 = highbd_clamp_epi16_avx2(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + __m128i temp; + const int step = flipud ? 
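/* editor's note: with flipud set, j starts at height - 1 and step is -1, so the transform output is consumed bottom-up and vertically flipped transforms reuse the same writeback loop. */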
-1 : 1; + for (int i = 0; i < height; ++i, j += step) { + temp = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m256i v = _mm256_cvtepi16_epi32(temp); + __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd); + __m128i u1 = _mm256_castsi256_si128(u); + _mm_storeu_si128((__m128i *)(output + i * stride), u1); + } +} +static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0, + __m256i *out1, const __m256i *clamp_lo, + const __m256i *clamp_hi, int shift) { + __m256i offset = _mm256_set1_epi32((1 << shift) >> 1); + __m256i a0 = _mm256_add_epi32(offset, in0); + __m256i a1 = _mm256_sub_epi32(offset, in1); + + a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + a0 = _mm256_max_epi32(a0, *clamp_lo); + a0 = _mm256_min_epi32(a0, *clamp_hi); + a1 = _mm256_max_epi32(a1, *clamp_lo); + a1 = _mm256_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void transpose_8x8_avx2(const __m256i *in, __m256i *out) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[0], in[1]); + u1 = _mm256_unpackhi_epi32(in[0], in[1]); + + u2 = _mm256_unpacklo_epi32(in[2], in[3]); + u3 = _mm256_unpackhi_epi32(in[2], in[3]); + + u4 = _mm256_unpacklo_epi32(in[4], in[5]); + u5 = _mm256_unpackhi_epi32(in[4], in[5]); + + u6 = _mm256_unpacklo_epi32(in[6], in[7]); + u7 = _mm256_unpackhi_epi32(in[6], in[7]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); +} + +static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[7], in[6]); + u1 = _mm256_unpackhi_epi32(in[7], in[6]); + + u2 = _mm256_unpacklo_epi32(in[5], in[4]); + u3 = _mm256_unpackhi_epi32(in[5], in[4]); + + u4 = _mm256_unpacklo_epi32(in[3], in[2]); + u5 = _mm256_unpackhi_epi32(in[3], in[2]); + + u6 = _mm256_unpacklo_epi32(in[1], in[0]); + u7 = _mm256_unpackhi_epi32(in[1], in[0]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7] = _mm256_permute2f128_si256(x0, x1, 0x31); +} + +static void load_buffer_32x32(const int32_t *coeff, __m256i *in, + int input_stiride, int size) { + int i; + for (i = 0; i < size; ++i) { + in[i] = 
_mm256_loadu_si256((const __m256i *)(coeff + i * input_stiride)); + } +} + +static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *rounding, int bit) { + __m256i x; + x = _mm256_mullo_epi32(*w0, *n0); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} + +static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *w1, const __m256i *n1, + const __m256i *rounding, int bit) { + __m256i x, y; + + x = _mm256_mullo_epi32(*w0, *n0); + y = _mm256_mullo_epi32(*w1, *n1); + x = _mm256_add_epi32(x, y); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} + +static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0, + __m256i *out1, const __m256i *clamp_lo, + const __m256i *clamp_hi) { + __m256i a0 = _mm256_add_epi32(in0, in1); + __m256i a1 = _mm256_sub_epi32(in0, in1); + + a0 = _mm256_max_epi32(a0, *clamp_lo); + a0 = _mm256_min_epi32(a0, *clamp_hi); + a1 = _mm256_max_epi32(a1, *clamp_lo); + a1 = _mm256_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static INLINE void idct32_stage4_avx2( + __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56, + const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40, + const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); + bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); + bf1[17] = temp1; + + temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); + bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); + bf1[18] = temp2; + + temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); + bf1[21] = temp1; + + temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); + bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); + bf1[22] = temp2; +} + +static INLINE void idct32_stage5_avx2( + __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48, + const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo, + const __m256i *clamp_hi, const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); + bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); + bf1[9] = temp1; + + temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit); + bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); + bf1[10] = temp2; + + addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_avx2( + __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32, + const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, 
+ const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[5] = temp1; + + addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); + bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); + bf1[18] = temp1; + temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); + bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); + bf1[19] = temp2; + temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); + bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + + temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[10] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[11] = temp2; + + addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rounding, int bit) { + __m256i temp1, temp2; + addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + 
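/* editor's note: half_btf_avx2(w0, n0, w1, n1, r, bit) returns (w0 * n0 + w1 * n1 + r) >> bit per 32-bit lane, so each temp1/temp2 pair below is one butterfly rotation by cospi[32] over the pairs (bf1[20], bf1[27]) through (bf1[23], bf1[24]); the temporary keeps the first input live until its second multiply. */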
+ temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[21] = temp2; + temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[22] = temp1; + temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out, + const int do_cols, const int bd, + const int out_shift, + const __m256i *clamp_lo, + const __m256i *clamp_hi) { + addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); + addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
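/* editor's note: intermediate dynamic range for clamping: bd + 6 bits when do_cols is set (the final column pass), bd + 8 for the row pass, never below 16. */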
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i x; + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + x = _mm256_mullo_epi32(in[0], cospi32); + x = _mm256_add_epi32(x, rounding); + x = _mm256_srai_epi32(x, bit); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + x = _mm256_add_epi32(offset, x); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; + out[8] = x; + out[9] = x; + out[10] = x; + out[11] = x; + out[12] = x; + out[13] = x; + out[14] = x; + out[15] = x; + out[16] = x; + out[17] = x; + out[18] = x; + out[19] = x; + out[20] = x; + out[21] = x; + out[22] = x; + out[23] = x; + out[24] = x; + out[25] = x; + out[26] = x; + out[27] = x; + out[28] = x; + out[29] = x; + out[30] = x; + out[31] = x; +} + +static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; + + // stage 2 + bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); + bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); + bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); + + bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 + bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; + + idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + bf1[3] = bf1[0]; + bf1[2] = bf1[1]; + + idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = 
_mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit); + bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit); + bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit); + bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit); + bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit); + bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit); + bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit); + bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit); + bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit); + bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit); + bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit); + bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit); + bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit); + bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit); + bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit); + bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit); + + addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[20], 
bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit); + bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit); + bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit); + + addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit); + bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit); + + addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + + idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); + + idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim18 = 
_mm256_set1_epi32(-cospi[18]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i bf1[32], bf0[32]; + + { + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; + + // stage 2 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = bf1[4]; + bf0[5] = bf1[5]; + bf0[6] = bf1[6]; + bf0[7] = bf1[7]; + bf0[8] = bf1[8]; + bf0[9] = bf1[9]; + bf0[10] = bf1[10]; + bf0[11] = bf1[11]; + bf0[12] = bf1[12]; + bf0[13] = bf1[13]; + bf0[14] = bf1[14]; + bf0[15] = bf1[15]; + bf0[16] = + half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); + bf0[17] = + half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_avx2(&cospi6, &bf1[23], &cospim58, 
&bf1[24], &rounding, bit); + bf0[24] = + half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); + bf0[31] = + half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); + + // stage 3 + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = + half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); + bf1[9] = + half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); + bf1[15] = + half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); + + addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = + half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); + bf0[5] = + half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); + bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); + + addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + + bf0[16] = bf1[16]; + bf0[17] = + half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = + half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = + half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = + 
+        half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+    bf0[30] =
+        half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+    bf0[31] = bf1[31];
+
+    // stage 5
+    bf1[0] =
+        half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+    bf1[1] =
+        half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+    bf1[2] =
+        half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+    bf1[3] =
+        half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+    addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+    bf1[8] = bf0[8];
+    bf1[9] =
+        half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+    bf1[10] =
+        half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+    bf1[11] = bf0[11];
+    bf1[12] = bf0[12];
+    bf1[13] =
+        half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+    bf1[14] =
+        half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+    bf1[15] = bf0[15];
+    addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+    // stage 6
+    addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+    bf0[4] = bf1[4];
+    bf0[5] =
+        half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+    bf0[6] =
+        half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+    bf0[7] = bf1[7];
+    addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+    bf0[16] = bf1[16];
+    bf0[17] = bf1[17];
+    bf0[18] =
+        half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+    bf0[19] =
+        half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+    bf0[20] =
+        half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+    bf0[21] =
+        half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+    bf0[22] = bf1[22];
+    bf0[23] = bf1[23];
+    bf0[24] = bf1[24];
+    bf0[25] = bf1[25];
+    bf0[26] =
+        half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+    bf0[27] =
+        half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+    bf0[28] =
+        half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+    bf0[29] =
+        half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+    bf0[30] = bf1[30];
+    bf0[31] = bf1[31];
+
+    // stage 7
+    addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+    bf1[8] = bf0[8];
+    bf1[9] = bf0[9];
+    bf1[10] =
+        half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+    bf1[11] =
+        half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+    bf1[12] =
+        half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+    bf1[13] =
+        half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+    bf1[14] = bf0[14];
+    bf1[15] = bf0[15];
+    addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+    // stage 8
+    addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+    bf0[16] = bf1[16];
+    bf0[17] = bf1[17];
+    bf0[18] = bf1[18];
+    bf0[19] = bf1[19];
+    bf0[20] =
+        half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+    bf0[21] =
+        half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+    bf0[22] =
+        half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+    bf0[23] =
+        half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+    bf0[24] =
+        half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+    bf0[25] =
+        half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+    bf0[26] =
+        half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+    bf0[27] =
+        half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+    bf0[28] = bf1[28];
+    bf0[29] = bf1[29];
+    bf0[30] = bf1[30];
+    bf0[31] = bf1[31];
+
+    // stage 9
+    addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
+    addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+      round_shift_8x8_avx2(out, out_shift);
+      round_shift_8x8_avx2(out + 16, out_shift);
+      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
+    }
+  }
+}
+// Fast path: only the DC coefficient in[0] is nonzero, so the 16-point
+// IDCT reduces to a single scale by cospi[32] broadcast to all outputs.
+static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    // stage 3
+    // stage 4
+    in[0] = _mm256_mullo_epi32(in[0], cospi32);
+    in[0] = _mm256_add_epi32(in[0], rnding);
+    in[0] = _mm256_srai_epi32(in[0], bit);
+
+    // stage 5
+    // stage 6
+    // stage 7
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+      __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
+      in[0] = _mm256_add_epi32(in[0], offset);
+      in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
+    }
+    in[0] = _mm256_max_epi32(in[0], clamp_lo);
+    in[0] = _mm256_min_epi32(in[0], clamp_hi);
+    out[0] = in[0];
+    out[1] = in[0];
+    out[2] = in[0];
+    out[3] = in[0];
+    out[4] = in[0];
+    out[5] = in[0];
+    out[6] = in[0];
+    out[7] = in[0];
+    out[8] = in[0];
+    out[9] = in[0];
+    out[10] = in[0];
+    out[11] = in[0];
+    out[12] = in[0];
+    out[13] = in[0];
+    out[14] = in[0];
+    out[15] = in[0];
+  }
+}
+
+// Reduced case: only the first 8 input coefficients are nonzero.
+static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i u[16], x, y;
+
+  {
+    // stage 0
+    // stage 1
+    u[0] = in[0];
+    u[2] = in[4];
+    u[4] = in[2];
+    u[6] = in[6];
+    u[8] = in[1];
+    u[10] = in[5];
+    u[12] = in[3];
+    u[14] = in[7];
+
+    // stage 2
+    u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
+    u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
+
+    u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
+    u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
+
+    u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
+    u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
+
+    u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
+    u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
+
+    // stage 3
+    u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
+    u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
+    u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
+    u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
+
+    addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    x = _mm256_mullo_epi32(u[0], cospi32);
+    u[0] = _mm256_add_epi32(x, rnding);
+    u[0] = _mm256_srai_epi32(u[0], bit);
+    u[1] = u[0];
+
+    u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
+    u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
+
+    addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+    x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    u[9] = x;
+    y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    u[10] = y;
+
+    // stage 5
+    addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+    x = _mm256_mullo_epi32(u[5], cospi32);
+    y = _mm256_mullo_epi32(u[6], cospi32);
+    u[5] = _mm256_sub_epi32(y, x);
+    u[5] = _mm256_add_epi32(u[5], rnding);
+    u[5] = _mm256_srai_epi32(u[5], bit);
+
+    u[6] = _mm256_add_epi32(y, x);
+    u[6] = _mm256_add_epi32(u[6], rnding);
+    u[6] = _mm256_srai_epi32(u[6], bit);
+
+    addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+    x = _mm256_mullo_epi32(u[10], cospi32);
+    y = _mm256_mullo_epi32(u[13], cospi32);
+    u[10] = _mm256_sub_epi32(y, x);
+    u[10] = _mm256_add_epi32(u[10], rnding);
+    u[10] = _mm256_srai_epi32(u[10], bit);
+
+    u[13] = _mm256_add_epi32(x, y);
+    u[13] = _mm256_add_epi32(u[13], rnding);
+    u[13] = _mm256_srai_epi32(u[13], bit);
+
+    x = _mm256_mullo_epi32(u[11], cospi32);
+    y = _mm256_mullo_epi32(u[12], cospi32);
+    u[11] = _mm256_sub_epi32(y, x);
+    u[11] = _mm256_add_epi32(u[11], rnding);
+    u[11] = _mm256_srai_epi32(u[11], bit);
+
+    u[12] = _mm256_add_epi32(x, y);
+    u[12] = _mm256_add_epi32(u[12], rnding);
+    u[12] = _mm256_srai_epi32(u[12], bit);
+    // stage 7
+    addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+    addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+      round_shift_8x8_avx2(out, out_shift);
+      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+    }
+  }
+}
+
+// Full 16-point inverse DCT over all 16 input coefficients.
+static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+                        int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+  const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+  const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+  const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+  const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i u[16], v[16], x, y;
+
+  {
+    // stage 0
+    // stage 1
+    u[0] = in[0];
+    u[1] = in[8];
+    u[2] = in[4];
+    u[3] = in[12];
+    u[4] = in[2];
+    u[5] = in[10];
+    u[6] = in[6];
+    u[7] = in[14];
+    u[8] = in[1];
+    u[9] = in[9];
+    u[10] = in[5];
+    u[11] = in[13];
+    u[12] = in[3];
+    u[13] = in[11];
+    u[14] = in[7];
+    u[15] = in[15];
+
+    // stage 2
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
+    v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
+    v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
+    v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
+    v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
+    v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
+    v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
+    v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
+
+    // stage 3
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+    u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
+    u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
+    u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
+    u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
+    addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    x = _mm256_mullo_epi32(u[0], cospi32);
+    y = _mm256_mullo_epi32(u[1], cospi32);
+    v[0] = _mm256_add_epi32(x, y);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    v[0] = _mm256_srai_epi32(v[0], bit);
+
+    v[1] = _mm256_sub_epi32(x, y);
+    v[1] = _mm256_add_epi32(v[1], rnding);
+    v[1] = _mm256_srai_epi32(v[1], bit);
+
+    v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
+    v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
+    addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+    v[8] = u[8];
+    v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+    v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+    v[11] = u[11];
+    v[12] = u[12];
+    v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+    v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+    v[15] = u[15];
+
+    // stage 5
+    addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+    u[4] = v[4];
+
+    x = _mm256_mullo_epi32(v[5], cospi32);
+    y = _mm256_mullo_epi32(v[6], cospi32);
+    u[5] = _mm256_sub_epi32(y, x);
+    u[5] = _mm256_add_epi32(u[5], rnding);
+    u[5] = _mm256_srai_epi32(u[5], bit);
+
+    u[6] = _mm256_add_epi32(y, x);
+    u[6] = _mm256_add_epi32(u[6], rnding);
+    u[6] = _mm256_srai_epi32(u[6], bit);
+
+    u[7] = v[7];
+    addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = _mm256_mullo_epi32(u[10], cospi32);
+    y = _mm256_mullo_epi32(u[13], cospi32);
+    v[10] = _mm256_sub_epi32(y, x);
+    v[10] = _mm256_add_epi32(v[10], rnding);
+    v[10] = _mm256_srai_epi32(v[10], bit);
+
+    v[13] = _mm256_add_epi32(x, y);
+    v[13] = _mm256_add_epi32(v[13], rnding);
+    v[13] = _mm256_srai_epi32(v[13], bit);
+
+    x = _mm256_mullo_epi32(u[11], cospi32);
+    y = _mm256_mullo_epi32(u[12], cospi32);
+    v[11] = _mm256_sub_epi32(y, x);
+    v[11] = _mm256_add_epi32(v[11], rnding);
+    v[11] = _mm256_srai_epi32(v[11], bit);
+
+    v[12] = _mm256_add_epi32(x, y);
+    v[12] = _mm256_add_epi32(v[12], rnding);
+    v[12] = _mm256_srai_epi32(v[12], bit);
+
+    v[14] = u[14];
+    v[15] = u[15];
+
+    // stage 7
+    addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
+    addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
+
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+      round_shift_8x8_avx2(out, out_shift);
+      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
+    }
+  }
+}
+
+// Fast path: only in[0] is nonzero, so the 16-point IADST is computed
+// from a single input column.
+static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i v[16], x, y, temp1, temp2;
+
+  // Calculate the column 0, 1, 2, 3
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    x = _mm256_mullo_epi32(in[0], cospi62);
+    v[0] = _mm256_add_epi32(x, rnding);
+    v[0] = _mm256_srai_epi32(v[0], bit);
+
+    x = _mm256_mullo_epi32(in[0], cospi2);
+    v[1] = _mm256_sub_epi32(zero, x);
+    v[1] = _mm256_add_epi32(v[1], rnding);
+    v[1] = _mm256_srai_epi32(v[1], bit);
+
+    // stage 3
+    v[8] = v[0];
+    v[9] = v[1];
+
+    // stage 4
+    temp1 = _mm256_mullo_epi32(v[8], cospi8);
+    x = _mm256_mullo_epi32(v[9], cospi56);
+    temp1 = _mm256_add_epi32(temp1, x);
+    temp1 = _mm256_add_epi32(temp1, rnding);
+    temp1 = _mm256_srai_epi32(temp1, bit);
+
+    temp2 = _mm256_mullo_epi32(v[8], cospi56);
+    x = _mm256_mullo_epi32(v[9], cospi8);
+    temp2 = _mm256_sub_epi32(temp2, x);
+    temp2 = _mm256_add_epi32(temp2, rnding);
+    temp2 = _mm256_srai_epi32(temp2, bit);
+    v[8] = temp1;
+    v[9] = temp2;
+
+    // stage 5
+    v[4] = v[0];
+    v[5] = v[1];
+    v[12] = v[8];
+    v[13] = v[9];
+
+    // stage 6
+    temp1 = _mm256_mullo_epi32(v[4], cospi16);
+    x = _mm256_mullo_epi32(v[5], cospi48);
+    temp1 = _mm256_add_epi32(temp1, x);
+    temp1 = _mm256_add_epi32(temp1, rnding);
+    temp1 = _mm256_srai_epi32(temp1, bit);
+
+    temp2 = _mm256_mullo_epi32(v[4], cospi48);
+    x = _mm256_mullo_epi32(v[5], cospi16);
+    temp2 = _mm256_sub_epi32(temp2, x);
+    temp2 = _mm256_add_epi32(temp2, rnding);
+    temp2 = _mm256_srai_epi32(temp2, bit);
+    v[4] = temp1;
+    v[5] = temp2;
+
+    temp1 = _mm256_mullo_epi32(v[12], cospi16);
+    x = _mm256_mullo_epi32(v[13], cospi48);
+    temp1 = _mm256_add_epi32(temp1, x);
+    temp1 = _mm256_add_epi32(temp1, rnding);
+    temp1 = _mm256_srai_epi32(temp1, bit);
+
+    temp2 = _mm256_mullo_epi32(v[12], cospi48);
+    x = _mm256_mullo_epi32(v[13], cospi16);
+    temp2 = _mm256_sub_epi32(temp2, x);
+    temp2 = _mm256_add_epi32(temp2, rnding);
+    temp2 = _mm256_srai_epi32(temp2, bit);
+    v[12] = temp1;
+    v[13] = temp2;
+
+    // stage 7
+    v[2] = v[0];
+    v[3] = v[1];
+    v[6] = v[4];
+    v[7] = v[5];
+    v[10] = v[8];
+    v[11] = v[9];
+    v[14] = v[12];
+    v[15] = v[13];
+
+    // stage 8
+    y = _mm256_mullo_epi32(v[2], cospi32);
+    x = _mm256_mullo_epi32(v[3], cospi32);
+    v[2] = _mm256_add_epi32(y, x);
+    v[2] = _mm256_add_epi32(v[2], rnding);
+    v[2] = _mm256_srai_epi32(v[2], bit);
+
+    v[3] = _mm256_sub_epi32(y, x);
+    v[3] = _mm256_add_epi32(v[3], rnding);
+    v[3] = _mm256_srai_epi32(v[3], bit);
+
+    y = _mm256_mullo_epi32(v[6], cospi32);
+    x = _mm256_mullo_epi32(v[7], cospi32);
+    v[6] = _mm256_add_epi32(y, x);
+    v[6] = _mm256_add_epi32(v[6], rnding);
+    v[6] = _mm256_srai_epi32(v[6], bit);
+
+    v[7] = _mm256_sub_epi32(y, x);
+    v[7] = _mm256_add_epi32(v[7], rnding);
+    v[7] = _mm256_srai_epi32(v[7], bit);
+
+    y = _mm256_mullo_epi32(v[10], cospi32);
+    x = _mm256_mullo_epi32(v[11], cospi32);
+    v[10] = _mm256_add_epi32(y, x);
+    v[10] = _mm256_add_epi32(v[10], rnding);
+    v[10] = _mm256_srai_epi32(v[10], bit);
+
+    v[11] = _mm256_sub_epi32(y, x);
+    v[11] = _mm256_add_epi32(v[11], rnding);
+    v[11] = _mm256_srai_epi32(v[11], bit);
+
+    y = _mm256_mullo_epi32(v[14], cospi32);
+    x = _mm256_mullo_epi32(v[15], cospi32);
+    v[14] = _mm256_add_epi32(y, x);
+    v[14] = _mm256_add_epi32(v[14], rnding);
+    v[14] = _mm256_srai_epi32(v[14], bit);
+
+    v[15] = _mm256_sub_epi32(y, x);
+    v[15] = _mm256_add_epi32(v[15], rnding);
+    v[15] = _mm256_srai_epi32(v[15], bit);
+
+    // stage 9
+    if (do_cols) {
+      out[0] = v[0];
+      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
+      out[2] = v[12];
+      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
+      out[4] = v[6];
+      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
+      out[6] = v[10];
+      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
+      out[8] = v[3];
+      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
+      out[10] = v[15];
+      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
+      out[12] = v[5];
+      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
+      out[14] = v[9];
+      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+      neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+      neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    }
+  }
+}
+
+// Reduced case: only the first 8 input coefficients are nonzero.
+static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i u[16], x, y;
+
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    __m256i zero = _mm256_setzero_si256();
+    x = _mm256_mullo_epi32(in[0], cospi62);
+    u[0] = _mm256_add_epi32(x, rnding);
+    u[0] = _mm256_srai_epi32(u[0], bit);
+
+    x = _mm256_mullo_epi32(in[0], cospi2);
+    u[1] = _mm256_sub_epi32(zero, x);
+    u[1] = _mm256_add_epi32(u[1], rnding);
+    u[1] = _mm256_srai_epi32(u[1], bit);
+
+    x = _mm256_mullo_epi32(in[2], cospi54);
+    u[2] = _mm256_add_epi32(x, rnding);
+    u[2] = _mm256_srai_epi32(u[2], bit);
+
+    x = _mm256_mullo_epi32(in[2], cospi10);
+    u[3] = _mm256_sub_epi32(zero, x);
+    u[3] = _mm256_add_epi32(u[3], rnding);
+    u[3] = _mm256_srai_epi32(u[3], bit);
+
+    x = _mm256_mullo_epi32(in[4], cospi46);
+    u[4] = _mm256_add_epi32(x, rnding);
+    u[4] = _mm256_srai_epi32(u[4], bit);
+
+    x = _mm256_mullo_epi32(in[4], cospi18);
+    u[5] = _mm256_sub_epi32(zero, x);
+    u[5] = _mm256_add_epi32(u[5], rnding);
+    u[5] = _mm256_srai_epi32(u[5], bit);
+
+    x = _mm256_mullo_epi32(in[6], cospi38);
+    u[6] = _mm256_add_epi32(x, rnding);
+    u[6] = _mm256_srai_epi32(u[6], bit);
+
+    x = _mm256_mullo_epi32(in[6], cospi26);
+    u[7] = _mm256_sub_epi32(zero, x);
+    u[7] = _mm256_add_epi32(u[7], rnding);
+    u[7] = _mm256_srai_epi32(u[7], bit);
+
+    u[8] = _mm256_mullo_epi32(in[7], cospi34);
+    u[8] = _mm256_add_epi32(u[8], rnding);
+    u[8] = _mm256_srai_epi32(u[8], bit);
+
+    u[9] = _mm256_mullo_epi32(in[7], cospi30);
+    u[9] = _mm256_add_epi32(u[9], rnding);
+    u[9] = _mm256_srai_epi32(u[9], bit);
+
+    u[10] = _mm256_mullo_epi32(in[5], cospi42);
+    u[10] = _mm256_add_epi32(u[10], rnding);
+    u[10] = _mm256_srai_epi32(u[10], bit);
+
+    u[11] = _mm256_mullo_epi32(in[5], cospi22);
+    u[11] = _mm256_add_epi32(u[11], rnding);
+    u[11] = _mm256_srai_epi32(u[11], bit);
+
+    u[12] = _mm256_mullo_epi32(in[3], cospi50);
+    u[12] = _mm256_add_epi32(u[12], rnding);
+    u[12] = _mm256_srai_epi32(u[12], bit);
+
+    u[13] = _mm256_mullo_epi32(in[3], cospi14);
+    u[13] = _mm256_add_epi32(u[13], rnding);
+    u[13] = _mm256_srai_epi32(u[13], bit);
+
+    u[14] = _mm256_mullo_epi32(in[1], cospi58);
+    u[14] = _mm256_add_epi32(u[14], rnding);
+    u[14] = _mm256_srai_epi32(u[14], bit);
+
+    u[15] = _mm256_mullo_epi32(in[1], cospi6);
+    u[15] = _mm256_add_epi32(u[15], rnding);
+    u[15] = _mm256_srai_epi32(u[15], bit);
+
+    // stage 3
+    addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    y = _mm256_mullo_epi32(u[8], cospi56);
+    x = _mm256_mullo_epi32(u[9], cospi56);
+    u[8] = _mm256_mullo_epi32(u[8], cospi8);
+    u[8] = _mm256_add_epi32(u[8], x);
+    u[8] = _mm256_add_epi32(u[8], rnding);
+    u[8] = _mm256_srai_epi32(u[8], bit);
+
+    x = _mm256_mullo_epi32(u[9], cospi8);
+    u[9] = _mm256_sub_epi32(y, x);
+    u[9] = _mm256_add_epi32(u[9], rnding);
+    u[9] = _mm256_srai_epi32(u[9], bit);
+
+    x = _mm256_mullo_epi32(u[11], cospi24);
+    y = _mm256_mullo_epi32(u[10], cospi24);
+    u[10] = _mm256_mullo_epi32(u[10], cospi40);
+    u[10] = _mm256_add_epi32(u[10], x);
+    u[10] = _mm256_add_epi32(u[10], rnding);
+    u[10] = _mm256_srai_epi32(u[10], bit);
+
+    x = _mm256_mullo_epi32(u[11], cospi40);
+    u[11] = _mm256_sub_epi32(y, x);
+    u[11] = _mm256_add_epi32(u[11], rnding);
+    u[11] = _mm256_srai_epi32(u[11], bit);
+
+    x = _mm256_mullo_epi32(u[13], cospi8);
+    y = _mm256_mullo_epi32(u[12], cospi8);
+    u[12] = _mm256_mullo_epi32(u[12], cospim56);
+    u[12] = _mm256_add_epi32(u[12], x);
+    u[12] = _mm256_add_epi32(u[12], rnding);
+    u[12] = _mm256_srai_epi32(u[12], bit);
+
+    x = _mm256_mullo_epi32(u[13], cospim56);
+    u[13] = _mm256_sub_epi32(y, x);
+    u[13] = _mm256_add_epi32(u[13], rnding);
+    u[13] = _mm256_srai_epi32(u[13], bit);
+
+    x = _mm256_mullo_epi32(u[15], cospi40);
+    y = _mm256_mullo_epi32(u[14], cospi40);
+    u[14] = _mm256_mullo_epi32(u[14], cospim24);
+    u[14] = _mm256_add_epi32(u[14], x);
+    u[14] = _mm256_add_epi32(u[14], rnding);
+    u[14] = _mm256_srai_epi32(u[14], bit);
+
+    x = _mm256_mullo_epi32(u[15], cospim24);
+    u[15] = _mm256_sub_epi32(y, x);
+    u[15] = _mm256_add_epi32(u[15], rnding);
+    u[15] = _mm256_srai_epi32(u[15], bit);
+
+    // stage 5
+    addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    x = _mm256_mullo_epi32(u[5], cospi48);
+    y = _mm256_mullo_epi32(u[4], cospi48);
+    u[4] = _mm256_mullo_epi32(u[4], cospi16);
+    u[4] = _mm256_add_epi32(u[4], x);
+    u[4] = _mm256_add_epi32(u[4], rnding);
+    u[4] = _mm256_srai_epi32(u[4], bit);
+
+    x = _mm256_mullo_epi32(u[5], cospi16);
+    u[5] = _mm256_sub_epi32(y, x);
+    u[5] = _mm256_add_epi32(u[5], rnding);
+    u[5] = _mm256_srai_epi32(u[5], bit);
+
+    x = _mm256_mullo_epi32(u[7], cospi16);
+    y = _mm256_mullo_epi32(u[6], cospi16);
+    u[6] = _mm256_mullo_epi32(u[6], cospim48);
+    u[6] = _mm256_add_epi32(u[6], x);
+    u[6] = _mm256_add_epi32(u[6], rnding);
+    u[6] = _mm256_srai_epi32(u[6], bit);
+
+    x = _mm256_mullo_epi32(u[7], cospim48);
+    u[7] = _mm256_sub_epi32(y, x);
+    u[7] = _mm256_add_epi32(u[7], rnding);
+    u[7] = _mm256_srai_epi32(u[7], bit);
+
+    x = _mm256_mullo_epi32(u[13], cospi48);
+    y = _mm256_mullo_epi32(u[12], cospi48);
+    u[12] = _mm256_mullo_epi32(u[12], cospi16);
+    u[12] = _mm256_add_epi32(u[12], x);
+    u[12] = _mm256_add_epi32(u[12], rnding);
+    u[12] = _mm256_srai_epi32(u[12], bit);
+
+    x = _mm256_mullo_epi32(u[13], cospi16);
+    u[13] = _mm256_sub_epi32(y, x);
+    u[13] = _mm256_add_epi32(u[13], rnding);
+    u[13] = _mm256_srai_epi32(u[13], bit);
+
+    x = _mm256_mullo_epi32(u[15], cospi16);
+    y = _mm256_mullo_epi32(u[14], cospi16);
+    u[14] = _mm256_mullo_epi32(u[14], cospim48);
+    u[14] = _mm256_add_epi32(u[14], x);
+    u[14] = _mm256_add_epi32(u[14], rnding);
+    u[14] = _mm256_srai_epi32(u[14], bit);
+
+    x = _mm256_mullo_epi32(u[15], cospim48);
+    u[15] = _mm256_sub_epi32(y, x);
+    u[15] = _mm256_add_epi32(u[15], rnding);
+    u[15] = _mm256_srai_epi32(u[15], bit);
+
+    // stage 7
+    addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+    addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 8
+    y = _mm256_mullo_epi32(u[2], cospi32);
+    x = _mm256_mullo_epi32(u[3], cospi32);
+    u[2] = _mm256_add_epi32(y, x);
+    u[2] = _mm256_add_epi32(u[2], rnding);
+    u[2] = _mm256_srai_epi32(u[2], bit);
+
+    u[3] = _mm256_sub_epi32(y, x);
+    u[3] = _mm256_add_epi32(u[3], rnding);
+    u[3] = _mm256_srai_epi32(u[3], bit);
+    y = _mm256_mullo_epi32(u[6], cospi32);
+    x = _mm256_mullo_epi32(u[7], cospi32);
+    u[6] = _mm256_add_epi32(y, x);
+    u[6] = _mm256_add_epi32(u[6], rnding);
+    u[6] = _mm256_srai_epi32(u[6], bit);
+
+    u[7] = _mm256_sub_epi32(y, x);
+    u[7] = _mm256_add_epi32(u[7], rnding);
+    u[7] = _mm256_srai_epi32(u[7], bit);
+
+    y = _mm256_mullo_epi32(u[10], cospi32);
+    x = _mm256_mullo_epi32(u[11], cospi32);
+    u[10] = _mm256_add_epi32(y, x);
+    u[10] = _mm256_add_epi32(u[10], rnding);
+    u[10] = _mm256_srai_epi32(u[10], bit);
+
+    u[11] = _mm256_sub_epi32(y, x);
+    u[11] = _mm256_add_epi32(u[11], rnding);
+    u[11] = _mm256_srai_epi32(u[11], bit);
+
+    y = _mm256_mullo_epi32(u[14], cospi32);
+    x = _mm256_mullo_epi32(u[15], cospi32);
+    u[14] = _mm256_add_epi32(y, x);
+    u[14] = _mm256_add_epi32(u[14], rnding);
+    u[14] = _mm256_srai_epi32(u[14], bit);
+
+    u[15] = _mm256_sub_epi32(y, x);
+    u[15] = _mm256_add_epi32(u[15], rnding);
+    u[15] = _mm256_srai_epi32(u[15], bit);
+
+    // stage 9
+    if (do_cols) {
+      out[0] = u[0];
+      out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
+      out[2] = u[12];
+      out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
+      out[4] = u[6];
+      out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
+      out[6] = u[10];
+      out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
+      out[8] = u[3];
+      out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
+      out[10] = u[15];
+      out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
+      out[12] = u[5];
+      out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
+      out[14] = u[9];
+      out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m256i clamp_lo_out =
+          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m256i clamp_hi_out =
+          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+      neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+                     out_shift);
+      neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+      neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+    }
+  }
+}
+
+// Full 16-point inverse ADST over all 16 input coefficients.
+static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                         int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+  const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+  const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+  const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+  const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+  const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+  const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+  const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+  const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+  const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+  const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+  const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+  const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+  const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+  const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+  const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+  const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+  const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+  const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+  const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+  const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+  const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+  const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+  __m256i u[16], v[16], x, y;
+
+  {
+    // stage 0
+    // stage 1
+    // stage 2
+    v[0] = _mm256_mullo_epi32(in[15], cospi2);
+    x = _mm256_mullo_epi32(in[0], cospi62);
+    v[0] = _mm256_add_epi32(v[0], x);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    v[0] = _mm256_srai_epi32(v[0], bit);
+
+    v[1] = _mm256_mullo_epi32(in[15], cospi62);
+    x = _mm256_mullo_epi32(in[0], cospi2);
+    v[1] = _mm256_sub_epi32(v[1], x);
+    v[1] = _mm256_add_epi32(v[1], rnding);
+    v[1] = _mm256_srai_epi32(v[1], bit);
+
+    v[2] = _mm256_mullo_epi32(in[13], cospi10);
+    x = _mm256_mullo_epi32(in[2], cospi54);
+    v[2] = _mm256_add_epi32(v[2], x);
+    v[2] = _mm256_add_epi32(v[2], rnding);
+    v[2] = _mm256_srai_epi32(v[2], bit);
+
+    v[3] = _mm256_mullo_epi32(in[13], cospi54);
+    x = _mm256_mullo_epi32(in[2], cospi10);
+    v[3] = _mm256_sub_epi32(v[3], x);
+    v[3] = _mm256_add_epi32(v[3], rnding);
+    v[3] = _mm256_srai_epi32(v[3], bit);
+
+    v[4] = _mm256_mullo_epi32(in[11], cospi18);
+    x = _mm256_mullo_epi32(in[4], cospi46);
+    v[4] = _mm256_add_epi32(v[4], x);
+    v[4] = _mm256_add_epi32(v[4], rnding);
+    v[4] = _mm256_srai_epi32(v[4], bit);
+
+    v[5] = _mm256_mullo_epi32(in[11], cospi46);
+    x = _mm256_mullo_epi32(in[4], cospi18);
+    v[5] = _mm256_sub_epi32(v[5], x);
+    v[5] = _mm256_add_epi32(v[5], rnding);
+    v[5] = _mm256_srai_epi32(v[5], bit);
+
+    v[6] = _mm256_mullo_epi32(in[9], cospi26);
+    x = _mm256_mullo_epi32(in[6], cospi38);
+    v[6] = _mm256_add_epi32(v[6], x);
+    v[6] = _mm256_add_epi32(v[6], rnding);
+    v[6] = _mm256_srai_epi32(v[6], bit);
+
+    v[7] = _mm256_mullo_epi32(in[9], cospi38);
+    x = _mm256_mullo_epi32(in[6], cospi26);
+    v[7] = _mm256_sub_epi32(v[7], x);
+    v[7] = _mm256_add_epi32(v[7], rnding);
+    v[7] = _mm256_srai_epi32(v[7], bit);
+
+    v[8] = _mm256_mullo_epi32(in[7], cospi34);
+    x = _mm256_mullo_epi32(in[8], cospi30);
+    v[8] = _mm256_add_epi32(v[8], x);
+    v[8] = _mm256_add_epi32(v[8], rnding);
+    v[8] = _mm256_srai_epi32(v[8], bit);
+
+    v[9] = _mm256_mullo_epi32(in[7], cospi30);
+    x = _mm256_mullo_epi32(in[8], cospi34);
+    v[9] = _mm256_sub_epi32(v[9], x);
+    v[9] = _mm256_add_epi32(v[9], rnding);
+    v[9] = _mm256_srai_epi32(v[9], bit);
+
+    v[10] = _mm256_mullo_epi32(in[5], cospi42);
+    x = _mm256_mullo_epi32(in[10], cospi22);
+    v[10] = _mm256_add_epi32(v[10], x);
+    v[10] = _mm256_add_epi32(v[10], rnding);
+    v[10] = _mm256_srai_epi32(v[10], bit);
+
+    v[11] = _mm256_mullo_epi32(in[5], cospi22);
+    x = _mm256_mullo_epi32(in[10], cospi42);
+    v[11] = _mm256_sub_epi32(v[11], x);
+    v[11] = _mm256_add_epi32(v[11], rnding);
+    v[11] = _mm256_srai_epi32(v[11], bit);
+
+    v[12] = _mm256_mullo_epi32(in[3], cospi50);
+    x = _mm256_mullo_epi32(in[12], cospi14);
+    v[12] = _mm256_add_epi32(v[12], x);
+    v[12] = _mm256_add_epi32(v[12], rnding);
+    v[12] = _mm256_srai_epi32(v[12], bit);
+
+    v[13] = _mm256_mullo_epi32(in[3], cospi14);
+    x = _mm256_mullo_epi32(in[12], cospi50);
+    v[13] = _mm256_sub_epi32(v[13], x);
+    v[13] = _mm256_add_epi32(v[13], rnding);
+    v[13] = _mm256_srai_epi32(v[13], bit);
+
+    v[14] = _mm256_mullo_epi32(in[1], cospi58);
+    x = _mm256_mullo_epi32(in[14], cospi6);
+    v[14] = _mm256_add_epi32(v[14], x);
+    v[14] = _mm256_add_epi32(v[14], rnding);
+    v[14] = _mm256_srai_epi32(v[14], bit);
+
+    v[15] = _mm256_mullo_epi32(in[1], cospi6);
+    x = _mm256_mullo_epi32(in[14], cospi58);
+    v[15] = _mm256_sub_epi32(v[15], x);
+    v[15] = _mm256_add_epi32(v[15], rnding);
+    v[15] = _mm256_srai_epi32(v[15], bit);
+
+    // stage 3
+    addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+    addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = _mm256_mullo_epi32(u[8], cospi8);
+    x = _mm256_mullo_epi32(u[9], cospi56);
+    v[8] = _mm256_add_epi32(v[8], x);
+    v[8] = _mm256_add_epi32(v[8], rnding);
+    v[8] = _mm256_srai_epi32(v[8], bit);
+
+    v[9] = _mm256_mullo_epi32(u[8], cospi56);
+    x = _mm256_mullo_epi32(u[9], cospi8);
+    v[9] = _mm256_sub_epi32(v[9], x);
+    v[9] = _mm256_add_epi32(v[9], rnding);
+    v[9] = _mm256_srai_epi32(v[9], bit);
+
+    v[10] = _mm256_mullo_epi32(u[10], cospi40);
+    x = _mm256_mullo_epi32(u[11], cospi24);
+    v[10] = _mm256_add_epi32(v[10], x);
+    v[10] = _mm256_add_epi32(v[10], rnding);
+    v[10] = _mm256_srai_epi32(v[10], bit);
+
+    v[11] = _mm256_mullo_epi32(u[10], cospi24);
+    x = _mm256_mullo_epi32(u[11], cospi40);
+    v[11] = _mm256_sub_epi32(v[11], x);
+    v[11] = _mm256_add_epi32(v[11], rnding);
+    v[11] = _mm256_srai_epi32(v[11], bit);
+
+    v[12] = _mm256_mullo_epi32(u[12], cospim56);
+    x = _mm256_mullo_epi32(u[13], cospi8);
+    v[12] = _mm256_add_epi32(v[12], x);
+    v[12] = _mm256_add_epi32(v[12], rnding);
+    v[12] = _mm256_srai_epi32(v[12], bit);
+
+    v[13] = _mm256_mullo_epi32(u[12], cospi8);
+    x = _mm256_mullo_epi32(u[13], cospim56);
+    v[13] = _mm256_sub_epi32(v[13], x);
+    v[13] = _mm256_add_epi32(v[13], rnding);
+    v[13] = _mm256_srai_epi32(v[13], bit);
+
+    v[14] = _mm256_mullo_epi32(u[14], cospim24);
+    x = _mm256_mullo_epi32(u[15], cospi40);
+    v[14] = _mm256_add_epi32(v[14], x);
+    v[14] = _mm256_add_epi32(v[14], rnding);
+    v[14] = _mm256_srai_epi32(v[14], bit);
+
+    v[15] = _mm256_mullo_epi32(u[14], cospi40);
+    x = _mm256_mullo_epi32(u[15], cospim24);
+    v[15]
= _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 5 + addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm256_mullo_epi32(u[4], cospi16); + x = _mm256_mullo_epi32(u[5], cospi48); + v[4] = _mm256_add_epi32(v[4], x); + v[4] = _mm256_add_epi32(v[4], rnding); + v[4] = _mm256_srai_epi32(v[4], bit); + + v[5] = _mm256_mullo_epi32(u[4], cospi48); + x = _mm256_mullo_epi32(u[5], cospi16); + v[5] = _mm256_sub_epi32(v[5], x); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + v[6] = _mm256_mullo_epi32(u[6], cospim48); + x = _mm256_mullo_epi32(u[7], cospi16); + v[6] = _mm256_add_epi32(v[6], x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_mullo_epi32(u[6], cospi16); + x = _mm256_mullo_epi32(u[7], cospim48); + v[7] = _mm256_sub_epi32(v[7], x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm256_mullo_epi32(u[12], cospi16); + x = _mm256_mullo_epi32(u[13], cospi48); + v[12] = _mm256_add_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + v[13] = _mm256_mullo_epi32(u[12], cospi48); + x = _mm256_mullo_epi32(u[13], cospi16); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[14] = _mm256_mullo_epi32(u[14], cospim48); + x = _mm256_mullo_epi32(u[15], cospi16); + v[14] = _mm256_add_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_mullo_epi32(u[14], cospi16); + x = _mm256_mullo_epi32(u[15], cospim48); + v[15] = _mm256_sub_epi32(v[15], x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 7 + addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm256_mullo_epi32(u[2], cospi32); + x = _mm256_mullo_epi32(u[3], cospi32); + v[2] = _mm256_add_epi32(y, x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(y, x); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm256_mullo_epi32(u[6], cospi32); + x = _mm256_mullo_epi32(u[7], cospi32); + v[6] = _mm256_add_epi32(y, x); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = 
_mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(y, x); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[11], cospi32); + v[10] = _mm256_add_epi32(y, x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(y, x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm256_mullo_epi32(u[14], cospi32); + x = _mm256_mullo_epi32(u[15], cospi32); + v[14] = _mm256_add_epi32(y, x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(y, x); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]); + out[2] = v[12]; + out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]); + out[4] = v[6]; + out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]); + out[6] = v[10]; + out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]); + out[8] = v[3]; + out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]); + out[10] = v[15]; + out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]); + out[12] = v[5]; + out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]); + out[14] = v[9]; + out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + } +} +static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i x; + + // stage 0 + // stage 1 + // stage 2 + // stage 3 + x = _mm256_mullo_epi32(in[0], cospi32); + x = _mm256_add_epi32(x, rnding); + x = _mm256_srai_epi32(x, bit); + + // stage 4 + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + x = _mm256_add_epi32(x, offset); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; +} +static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + __m256i x, y; + + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = _mm256_mullo_epi32(in[1], cospi56); + y = _mm256_mullo_epi32(in[7], cospim8); + u4 = _mm256_add_epi32(x, y); + u4 = _mm256_add_epi32(u4, rnding); + u4 = _mm256_srai_epi32(u4, bit); + + x = _mm256_mullo_epi32(in[1], cospi8); + y = _mm256_mullo_epi32(in[7], cospi56); + u7 = _mm256_add_epi32(x, y); + u7 = _mm256_add_epi32(u7, rnding); + u7 = _mm256_srai_epi32(u7, bit); + + x = _mm256_mullo_epi32(in[5], cospi24); + y = _mm256_mullo_epi32(in[3], cospim40); + u5 = _mm256_add_epi32(x, y); + u5 = _mm256_add_epi32(u5, rnding); + u5 = _mm256_srai_epi32(u5, bit); + + x = _mm256_mullo_epi32(in[5], cospi40); + y = _mm256_mullo_epi32(in[3], cospi24); + u6 = _mm256_add_epi32(x, y); + u6 = _mm256_add_epi32(u6, rnding); + u6 = _mm256_srai_epi32(u6, bit); + + // stage 3 + x = _mm256_mullo_epi32(u0, cospi32); + y = _mm256_mullo_epi32(u1, cospi32); + v0 = _mm256_add_epi32(x, y); + v0 = _mm256_add_epi32(v0, rnding); + v0 = _mm256_srai_epi32(v0, bit); + + v1 = _mm256_sub_epi32(x, y); + v1 = _mm256_add_epi32(v1, rnding); + v1 = _mm256_srai_epi32(v1, bit); + + x = _mm256_mullo_epi32(u2, cospi48); + y = _mm256_mullo_epi32(u3, cospim16); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + x = _mm256_mullo_epi32(u2, cospi16); + y = _mm256_mullo_epi32(u3, cospi48); + v3 = _mm256_add_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_avx2(u7, u6, 
&v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; + + x = _mm256_mullo_epi32(v5, cospi32); + y = _mm256_mullo_epi32(v6, cospi32); + u6 = _mm256_add_epi32(y, x); + u6 = _mm256_add_epi32(u6, rnding); + u6 = _mm256_srai_epi32(u6, bit); + + u5 = _mm256_sub_epi32(y, x); + u5 = _mm256_add_epi32(u5, rnding); + u5 = _mm256_srai_epi32(u5, bit); + + addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_4x4_avx2(out, out_shift); + round_shift_4x4_avx2(out + 4, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8); + } +} +static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i kZero = _mm256_setzero_si256(); + __m256i u[8], x; + + // stage 0 + // stage 1 + // stage 2 + + x = _mm256_mullo_epi32(in[0], cospi60); + u[0] = _mm256_add_epi32(x, rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + x = _mm256_mullo_epi32(in[0], cospi4); + u[1] = _mm256_sub_epi32(kZero, x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + // stage 3 + // stage 4 + __m256i temp1, temp2; + temp1 = _mm256_mullo_epi32(u[0], cospi16); + x = _mm256_mullo_epi32(u[1], cospi48); + temp1 = _mm256_add_epi32(temp1, x); + temp1 = _mm256_add_epi32(temp1, rnding); + temp1 = _mm256_srai_epi32(temp1, bit); + u[4] = temp1; + + temp2 = _mm256_mullo_epi32(u[0], cospi48); + x = _mm256_mullo_epi32(u[1], cospi16); + u[5] = _mm256_sub_epi32(temp2, x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + // stage 5 + // stage 6 + temp1 = _mm256_mullo_epi32(u[0], cospi32); + x = _mm256_mullo_epi32(u[1], cospi32); + u[2] = _mm256_add_epi32(temp1, x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(temp1, x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + temp1 = _mm256_mullo_epi32(u[4], cospi32); + x = _mm256_mullo_epi32(u[5], cospi32); + u[6] = _mm256_add_epi32(temp1, x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(temp1, x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm256_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm256_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm256_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << 
(log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i kZero = _mm256_setzero_si256(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + __m256i u[8], v[8], x; + + // stage 0 + // stage 1 + // stage 2 + + u[0] = _mm256_mullo_epi32(in[7], cospi4); + x = _mm256_mullo_epi32(in[0], cospi60); + u[0] = _mm256_add_epi32(u[0], x); + u[0] = _mm256_add_epi32(u[0], rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + u[1] = _mm256_mullo_epi32(in[7], cospi60); + x = _mm256_mullo_epi32(in[0], cospi4); + u[1] = _mm256_sub_epi32(u[1], x); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + u[2] = _mm256_mullo_epi32(in[5], cospi20); + x = _mm256_mullo_epi32(in[2], cospi44); + u[2] = _mm256_add_epi32(u[2], x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_mullo_epi32(in[5], cospi44); + x = _mm256_mullo_epi32(in[2], cospi20); + u[3] = _mm256_sub_epi32(u[3], x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + u[4] = _mm256_mullo_epi32(in[3], cospi36); + x = _mm256_mullo_epi32(in[4], cospi28); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[5] = _mm256_mullo_epi32(in[3], cospi28); + x = _mm256_mullo_epi32(in[4], cospi36); + u[5] = _mm256_sub_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(in[1], cospi52); + x = _mm256_mullo_epi32(in[6], cospi12); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_mullo_epi32(in[1], cospi12); + x = _mm256_mullo_epi32(in[6], cospi52); + u[7] = _mm256_sub_epi32(u[7], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 3 + addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[3], u[7], &v[3], 
&v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm256_mullo_epi32(v[4], cospi16); + x = _mm256_mullo_epi32(v[5], cospi48); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[5] = _mm256_mullo_epi32(v[4], cospi48); + x = _mm256_mullo_epi32(v[5], cospi16); + u[5] = _mm256_sub_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[6], cospim48); + x = _mm256_mullo_epi32(v[7], cospi16); + u[6] = _mm256_add_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_mullo_epi32(v[6], cospi16); + x = _mm256_mullo_epi32(v[7], cospim48); + u[7] = _mm256_sub_epi32(u[7], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 5 + addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm256_mullo_epi32(v[2], cospi32); + x = _mm256_mullo_epi32(v[3], cospi32); + u[2] = _mm256_add_epi32(v[0], x); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + u[3] = _mm256_sub_epi32(v[0], x); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + v[0] = _mm256_mullo_epi32(v[6], cospi32); + x = _mm256_mullo_epi32(v[7], cospi32); + u[6] = _mm256_add_epi32(v[0], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = _mm256_sub_epi32(v[0], x); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm256_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm256_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm256_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm256_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} +static INLINE void idct64_stage8_avx2( + __m256i *u, const __m256i *cospim32, const __m256i *cospi32, + const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16, + const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + int i; + __m256i temp1, temp2, temp3, temp4; + temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit); + u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit); + u[10] = temp1; + temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit); + u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_avx2(u[i ^ 15], 
u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit); + temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit); + temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit); + temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit); + u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit); + u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit); + u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit); + u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit); + temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit); + temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit); + temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit); + u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit); + u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit); + u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit); + u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + int i; + __m256i temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit); + u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit); + u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit); + u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit); + u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { + addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} + +static INLINE void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32, + const __m256i *cospi32, + const __m256i *clamp_lo, + const __m256i *clamp_hi, + const __m256i *rnding, int bit) { + __m256i temp1, temp2, temp3, temp4; + for (int i = 0; i < 16; i++) { + addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit); + u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit); + u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit); + u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit); + u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + 
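+ // NOTE (illustrative, an editorial gloss rather than upstream code):
+ // half_btf_avx2(w0, a, w1, b, rnd, bit) evaluates the scalar butterfly
+ // (w0 * a + w1 * b + (1 << (bit - 1))) >> bit in each of the eight 32-bit
+ // lanes. The temp1..temp4 staging lets the u[52]..u[55] updates read the
+ // original u[40]..u[43] before those write-backs land.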
u[43] = temp4; + + temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit); + temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit); + temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit); + temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit); + u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit); + u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit); + u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit); + u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +static INLINE void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols, + int bd, int out_shift, + const __m256i *clamp_lo, + const __m256i *clamp_hi) { + for (int i = 0; i < 32; i++) { + addsub_avx2(u[i], u[63 - i], &out[i], &out[63 - i], clamp_lo, clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + round_shift_8x8_avx2(out + 32, out_shift); + round_shift_8x8_avx2(out + 48, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); + } +} + +static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + + { + __m256i x; + + // stage 1 + // stage 2 + // stage 3 + // stage 4 + // stage 5 + // stage 6 + x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit); + + // stage 7 + // stage 8 + // stage 9 + // stage 10 + // stage 11 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1); + x = _mm256_add_epi32(x, offset); + x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + } + x = _mm256_max_epi32(x, clamp_lo); + x = _mm256_min_epi32(x, clamp_hi); + for (int i = 0; i < 64; i++) { + out[i] = x; + } + } +} +static void idct64_low8_avx2(__m256i *in,
__m256i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + + { + __m256i u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + 
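+ // NOTE (illustrative, an editorial gloss rather than upstream code): only
+ // eight input coefficients are nonzero in this low8 path, so the stage-3
+ // add/sub butterflies collapse to the plain copies above (and the final
+ // u[62] = u[63] below) -- adding or subtracting a zero operand leaves the
+ // stage-2 value in both output slots.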
u[62] = u[63]; + + // stage 4 + __m256i temp1, temp2; + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = temp1; + + temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = temp2; + + temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = temp1; + + temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = temp1; + + temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + + // stage 6 + temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[0] = temp1; + + temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] = u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = temp1; + temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[35] = temp2; + temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[36] = temp1; + temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[37] = temp2; + temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = temp1; + temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[43] = temp2; + temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[44] = temp1; + temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], 
&rnding, bit); + u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = temp1; + temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[19] = temp2; + temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[20] = temp1; + temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[21] = temp2; + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + u[7] = u[0]; + u[6] = u[1]; + u[5] = u[2]; + u[4] = u[3]; + + idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} +static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const 
__m256i cospi63 = _mm256_set1_epi32(cospi[63]); + + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + + { + __m256i u[64]; + __m256i tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); + u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); + u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); + u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); + u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); + u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); + u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); + u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit); + u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit); + u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit); + u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit); + u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_avx2(&cospi12, 
&u[12], &rnding, bit); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); + tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit); + tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); + u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = tmp1; + tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &u[i + 
6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); + u[5] = tmp1; + addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} +static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd, + int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols 
? 6 : 8)); + const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1))); + const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1); + + const __m256i cospi1 = _mm256_set1_epi32(cospi[1]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi3 = _mm256_set1_epi32(cospi[3]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi5 = _mm256_set1_epi32(cospi[5]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospi7 = _mm256_set1_epi32(cospi[7]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi9 = _mm256_set1_epi32(cospi[9]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi11 = _mm256_set1_epi32(cospi[11]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi13 = _mm256_set1_epi32(cospi[13]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospi15 = _mm256_set1_epi32(cospi[15]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi17 = _mm256_set1_epi32(cospi[17]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi19 = _mm256_set1_epi32(cospi[19]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi21 = _mm256_set1_epi32(cospi[21]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospi23 = _mm256_set1_epi32(cospi[23]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi25 = _mm256_set1_epi32(cospi[25]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi27 = _mm256_set1_epi32(cospi[27]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi29 = _mm256_set1_epi32(cospi[29]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospi31 = _mm256_set1_epi32(cospi[31]); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi35 = _mm256_set1_epi32(cospi[35]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospi39 = _mm256_set1_epi32(cospi[39]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi43 = _mm256_set1_epi32(cospi[43]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospi47 = _mm256_set1_epi32(cospi[47]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi51 = _mm256_set1_epi32(cospi[51]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospi55 = _mm256_set1_epi32(cospi[55]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi59 = _mm256_set1_epi32(cospi[59]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospi63 = _mm256_set1_epi32(cospi[63]); + + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]); + const __m256i cospim34 = 
_mm256_set1_epi32(-cospi[34]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]); + const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]); + const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]); + const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]); + + { + __m256i u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit); + v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit); + v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit); + v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit); + + // stage 3 + 
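+ // Throughout these stages, half_btf_0_avx2(&w, &n, &rnding, bit) computes + // round_shift(w * n, bit), and half_btf_avx2(&w0, &n0, &w1, &n1, &rnding, bit) + // computes round_shift(w0 * n0 + w1 * n1, bit), i.e. one output of a + // butterfly rotation; rnding holds the 1 << (bit - 1) rounding offset.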
u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit); + u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit); + u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit); + u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit); + u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit); + u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit); + u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit); + u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit); + u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit); + u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit); + u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit); + u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit); + u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit); + u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit); + + for (i = 32; i < 64; i += 4) { + addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit); + v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit); + v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit); + v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit); + v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit); + v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit); + v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit); + v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit); + + for (i = 16; i < 32; i += 4) { + addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit); + u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, 
+ &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 3] = v[i + 3]; + } + + u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit); + + addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + + // stage 7 + 
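+ // stage 7 recombines the even half: a 4-point butterfly on v[0..3], a + // +/-cospi32 rotation of v[5]/v[6], and add/sub pairs over v[8..15]; the + // cospi16/cospi48 rotations update the 16-point half, and the j ^ 7 / + // j ^ 15 indexing below pairs mirrored taps in each group of 16 of v[32..63].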
addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); + v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit); + v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, 
bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i + 12] = v[i + 12]; + } + + u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); + u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); + v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); + v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); + v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); + v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); + v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); + v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); + v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); + v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); + v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); + v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); + v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); + v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); + v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); + + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + for (i = 0; i < 32; i++) { + addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + } + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m256i clamp_lo_out = + _mm256_set1_epi32(-(1 << (log_range_out - 1))); + const __m256i clamp_hi_out = + _mm256_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_8x8_avx2(out, out_shift); + round_shift_8x8_avx2(out + 16, out_shift); + round_shift_8x8_avx2(out + 32, out_shift); + round_shift_8x8_avx2(out + 48, out_shift); + highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64); + } + } +} +typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit, + int do_cols, int bd, int out_shift); + +static const transform_1d_avx2 + highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { + { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL }, + { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { + { idct16_low1_avx2, 
idct16_low8_avx2, idct16_avx2, NULL }, + { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL }, + { NULL, NULL, NULL, NULL }, + }, + { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } }, + + { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; + +static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m256i buf1[64 * 8]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 3; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_avx2 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_avx2 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: row transform + for (int i = 0; i < buf_size_nonzero_h_div8; i++) { + __m256i buf0[64]; + const int32_t *input_row = input + i * input_stride * 8; + for (int j = 0; j < buf_size_nonzero_w_div8; ++j) { + __m256i *buf0_cur = buf0 + j * 8; + load_buffer_32x32(input_row + j * 8, buf0_cur, input_stride, 8); + + transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_avx2( + buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m256i *_buf1 = buf1 + i * 8; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_8x8_flip_avx2( + &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]); + } + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + if (txfm_size_col >= 16) { + for (int i = 0; i < (txfm_size_col >> 4); i++) { + highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2, + output + 16 * i, stride, ud_flip, + txfm_size_row, bd); + } + } else if (txfm_size_col == 8) { + highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row, + bd); + } +} + +void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case 
ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; + case IDTX: + case H_DCT: + case H_ADST: + case H_FLIPADST: + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type, + tx_size, eob, bd); + break; + default: assert(0); break; + } +} +void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_4X8: + av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); + break; + default: + av1_highbd_inv_txfm2d_add_universe_avx2( + input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, + txfm_param->eob, txfm_param->bd); + break; + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c b/libs/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c new file mode 100644 index 000000000..03eaef832 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_inv_txfm_sse4.c @@ -0,0 +1,5821 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <assert.h> +#include <smmintrin.h> /* SSE4.1 */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/av1_inv_txfm1d_cfg.h" +#include "av1/common/idct.h" +#include "av1/common/x86/av1_inv_txfm_ssse3.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/av1_txfm_sse4.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" + +static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i clamped, mask; + + mask = _mm_cmpgt_epi16(u, max); + clamped = _mm_andnot_si128(mask, u); + mask = _mm_and_si128(mask, max); + clamped = _mm_or_si128(mask, clamped); + mask = _mm_cmpgt_epi16(clamped, zero); + clamped = _mm_and_si128(clamped, mask); + + return clamped; +} + +static INLINE void round_shift_4x4(__m128i *in, int shift) { + if (shift != 0) { + __m128i rnding = _mm_set1_epi32(1 << (shift - 1)); + in[0] = _mm_add_epi32(in[0], rnding); + in[1] = _mm_add_epi32(in[1], rnding); + in[2] = _mm_add_epi32(in[2], rnding); + in[3] = _mm_add_epi32(in[3], rnding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + } +} + +static void round_shift_8x8(__m128i *in, int shift) { + round_shift_4x4(&in[0], shift); + round_shift_4x4(&in[4], shift); + round_shift_4x4(&in[8], shift); + round_shift_4x4(&in[12], shift); +} + +static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int size) { + __m128i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm_max_epi32(in[i], *clamp_lo); + out[i] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm_min_epi32(a1, *clamp_hi); + + a0 = _mm_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm_min_epi32(a1, *clamp_hi); + } +} + +static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred, + __m128i res0, __m128i res1, + const int bd) { + __m128i x0 = _mm_cvtepi16_epi32(pred); + __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8)); + __m128i min_clip_val = _mm_setzero_si128(); + __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1); + x0 = _mm_add_epi32(res0, x0); + x1 = _mm_add_epi32(res1, x1); + x0 = _mm_max_epi32(x0, min_clip_val); + x0 = _mm_min_epi32(x0, max_clip_val); + x1 = _mm_max_epi32(x1, min_clip_val); + x1 = _mm_min_epi32(x1, max_clip_val); + x0 = _mm_packus_epi32(x0, x1); + return x0; +} + +static INLINE __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred, + __m128i res0, const int bd) { + __m128i x0 = _mm_cvtepi16_epi32(pred); + + x0 = _mm_add_epi32(res0, x0); + x0 = _mm_packus_epi32(x0, x0); + x0 = highbd_clamp_epi16(x0, bd); + return x0; +} + +static INLINE void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? (height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride)); + __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd); + + _mm_storel_epi64((__m128i *)(output + i * stride), u); + } +} + +static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output, + int stride, int flipud, + int height, const int bd) { + int j = flipud ? 
(height - 1) : 0; + const int step = flipud ? -1 : 1; + for (int i = 0; i < height; ++i, j += step) { + __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride)); + __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd); + + _mm_storeu_si128((__m128i *)(output + i * stride), u); + } +} + +static INLINE void load_buffer_32bit_input(const int32_t *in, int stride, + __m128i *out, int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride)); + } +} + +static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); + in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); + in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); + in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); +} + +static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0, + __m128i *out1, const __m128i *clamp_lo, + const __m128i *clamp_hi) { + __m128i a0 = _mm_add_epi32(in0, in1); + __m128i a1 = _mm_sub_epi32(in0, in1); + + a0 = _mm_max_epi32(a0, *clamp_lo); + a0 = _mm_min_epi32(a0, *clamp_hi); + a1 = _mm_max_epi32(a1, *clamp_lo); + a1 = _mm_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int shift) { + __m128i offset = _mm_set1_epi32((1 << shift) >> 1); + __m128i in0_w_offset = _mm_add_epi32(*in0, offset); + __m128i in1_w_offset = _mm_add_epi32(*in1, offset); + + in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift)); + in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift)); + + in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo); + in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi); + in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo); + in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi); + + *in0 = in0_w_offset; + *in1 = in1_w_offset; +} + +static INLINE void idct32_stage4_sse4_1( + __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56, + const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40, + const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit); + bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit); + bf1[17] = temp1; + + temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit); + bf1[29] = + half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit); + bf1[18] = temp2; + + temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit); + bf1[21] = temp1; + + temp2 = + half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit); + bf1[25] = + half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit); + bf1[22] = temp2; +} + +static INLINE void idct32_stage5_sse4_1( + __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48, + const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo, + const __m128i *clamp_hi, const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit); + bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit); + bf1[9] = temp1; + + temp2 = + half_btf_sse4_1(cospim48, &bf1[10], cospim16, 
&bf1[13], rounding, bit); + bf1[13] = + half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit); + bf1[10] = temp2; + + addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage6_sse4_1( + __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32, + const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, + const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit); + bf1[5] = temp1; + + addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit); + bf1[29] = + half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit); + bf1[18] = temp1; + temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit); + bf1[28] = + half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit); + bf1[19] = temp2; + temp1 = + half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit); + bf1[27] = + half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = + half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit); + bf1[21] = temp2; +} + +static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[13] = + half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit); + bf1[10] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[12] = + half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit); + bf1[11] = temp2; + + addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi); + 
addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi); +} + +static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rounding, int bit) { + __m128i temp1, temp2; + addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi); + + temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[27] = + half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit); + bf1[20] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[26] = + half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit); + bf1[21] = temp2; + temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[25] = + half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit); + bf1[22] = temp1; + temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[24] = + half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit); + bf1[23] = temp2; +} + +static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out, + const int do_cols, const int bd, + const int out_shift, + const __m128i *clamp_lo, + const __m128i *clamp_hi) { + addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi); + addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (int i = 0; i < 32; i += 8) { + round_shift_4x4(out + i, out_shift); + round_shift_4x4(out + i + 4, out_shift); + } + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); 
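+ // The row pass (!do_cols) computes in a bd + 8 bit intermediate range but + // hands results to the column pass in the max(16, bd + 6) bit range, hence + // the final round-shift followed by this narrower clamp.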
+ } +} + +static void neg_shift_sse4_1(const __m128i in0, const __m128i in1, + __m128i *out0, __m128i *out1, + const __m128i *clamp_lo, const __m128i *clamp_hi, + int shift) { + __m128i offset = _mm_set1_epi32((1 << shift) >> 1); + __m128i a0 = _mm_add_epi32(offset, in0); + __m128i a1 = _mm_sub_epi32(offset, in1); + + a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift)); + a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift)); + + a0 = _mm_max_epi32(a0, *clamp_lo); + a0 = _mm_min_epi32(a0, *clamp_hi); + a1 = _mm_max_epi32(a1, *clamp_lo); + a1 = _mm_min_epi32(a1, *clamp_hi); + + *out0 = a0; + *out1 = a1; +} + +static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3, x, y; + + // Stage 0 + // Stage 1 + // Stage 2 + v0 = _mm_unpacklo_epi32(in[0], in[1]); + v1 = _mm_unpackhi_epi32(in[0], in[1]); + v2 = _mm_unpacklo_epi32(in[2], in[3]); + v3 = _mm_unpackhi_epi32(in[2], in[3]); + + u0 = _mm_unpacklo_epi64(v0, v2); + u1 = _mm_unpackhi_epi64(v0, v2); + u2 = _mm_unpacklo_epi64(v1, v3); + u3 = _mm_unpackhi_epi64(v1, v3); + + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u2, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u1, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u1, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + // Stage 3 + addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi); + + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift); + shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift); + } +} + +static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *sinpi = sinpi_arr(bit); + const __m128i zero = _mm_set1_epi32(0); + __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1)); + rnding = _mm_unpacklo_epi32(rnding, zero); + const __m128i mul = _mm_set1_epi32(1 << 4); + const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]); + const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]); + const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]); + const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]); + __m128i t; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i x0, x1, x2, x3; + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + __m128i u0_low, u1_low, u2_low, u3_low; + __m128i u0_high, u1_high, u2_high, u3_high; + + v0 = _mm_unpacklo_epi32(in[0], in[1]); + v1 = _mm_unpackhi_epi32(in[0], in[1]); + v2 = 
_mm_unpacklo_epi32(in[2], in[3]); + v3 = _mm_unpackhi_epi32(in[2], in[3]); + + x0 = _mm_unpacklo_epi64(v0, v2); + x1 = _mm_unpackhi_epi64(v0, v2); + x2 = _mm_unpacklo_epi64(v1, v3); + x3 = _mm_unpackhi_epi64(v1, v3); + + s0 = _mm_mullo_epi32(x0, sinpi1); + s1 = _mm_mullo_epi32(x0, sinpi2); + s2 = _mm_mullo_epi32(x1, sinpi3); + s3 = _mm_mullo_epi32(x2, sinpi4); + s4 = _mm_mullo_epi32(x2, sinpi1); + s5 = _mm_mullo_epi32(x3, sinpi2); + s6 = _mm_mullo_epi32(x3, sinpi4); + t = _mm_sub_epi32(x0, x2); + s7 = _mm_add_epi32(t, x3); + + t = _mm_add_epi32(s0, s3); + s0 = _mm_add_epi32(t, s5); + t = _mm_sub_epi32(s1, s4); + s1 = _mm_sub_epi32(t, s6); + s3 = s2; + s2 = _mm_mullo_epi32(s7, sinpi3); + + u0 = _mm_add_epi32(s0, s3); + u1 = _mm_add_epi32(s1, s3); + u2 = s2; + t = _mm_add_epi32(s0, s1); + u3 = _mm_sub_epi32(t, s3); + + // u0 + u0_low = _mm_mul_epi32(u0, mul); + u0_low = _mm_add_epi64(u0_low, rnding); + + u0 = _mm_srli_si128(u0, 4); + u0_high = _mm_mul_epi32(u0, mul); + u0_high = _mm_add_epi64(u0_high, rnding); + + u0_low = _mm_srli_si128(u0_low, 2); + u0_high = _mm_srli_si128(u0_high, 2); + + u0 = _mm_unpacklo_epi32(u0_low, u0_high); + u0_high = _mm_unpackhi_epi32(u0_low, u0_high); + u0 = _mm_unpacklo_epi64(u0, u0_high); + + // u1 + u1_low = _mm_mul_epi32(u1, mul); + u1_low = _mm_add_epi64(u1_low, rnding); + + u1 = _mm_srli_si128(u1, 4); + u1_high = _mm_mul_epi32(u1, mul); + u1_high = _mm_add_epi64(u1_high, rnding); + + u1_low = _mm_srli_si128(u1_low, 2); + u1_high = _mm_srli_si128(u1_high, 2); + + u1 = _mm_unpacklo_epi32(u1_low, u1_high); + u1_high = _mm_unpackhi_epi32(u1_low, u1_high); + u1 = _mm_unpacklo_epi64(u1, u1_high); + + // u2 + u2_low = _mm_mul_epi32(u2, mul); + u2_low = _mm_add_epi64(u2_low, rnding); + + u2 = _mm_srli_si128(u2, 4); + u2_high = _mm_mul_epi32(u2, mul); + u2_high = _mm_add_epi64(u2_high, rnding); + + u2_low = _mm_srli_si128(u2_low, 2); + u2_high = _mm_srli_si128(u2_high, 2); + + u2 = _mm_unpacklo_epi32(u2_low, u2_high); + u2_high = _mm_unpackhi_epi32(u2_low, u2_high); + u2 = _mm_unpacklo_epi64(u2, u2_high); + + // u3 + u3_low = _mm_mul_epi32(u3, mul); + u3_low = _mm_add_epi64(u3_low, rnding); + + u3 = _mm_srli_si128(u3, 4); + u3_high = _mm_mul_epi32(u3, mul); + u3_high = _mm_add_epi64(u3_high, rnding); + + u3_low = _mm_srli_si128(u3_low, 2); + u3_high = _mm_srli_si128(u3_high, 2); + + u3 = _mm_unpacklo_epi32(u3_low, u3_high); + u3_high = _mm_unpackhi_epi32(u3_low, u3_high); + u3 = _mm_unpacklo_epi64(u3, u3_high); + + out[0] = u0; + out[1] = u1; + out[2] = u2; + out[3] = u3; + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); + } +} + +static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + + round_shift_4x4(in, shift); + + v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride)); + v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride)); + v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride)); + v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride)); + + v0 = _mm_unpacklo_epi16(v0, zero); + v1 = _mm_unpacklo_epi16(v1, zero); + v2 = _mm_unpacklo_epi16(v2, zero); + v3 = _mm_unpacklo_epi16(v3, zero); + + if (fliplr) { + in[0] = 
_mm_shuffle_epi32(in[0], 0x1B); + in[1] = _mm_shuffle_epi32(in[1], 0x1B); + in[2] = _mm_shuffle_epi32(in[2], 0x1B); + in[3] = _mm_shuffle_epi32(in[3], 0x1B); + } + + if (flipud) { + u0 = _mm_add_epi32(in[3], v0); + u1 = _mm_add_epi32(in[2], v1); + u2 = _mm_add_epi32(in[1], v2); + u3 = _mm_add_epi32(in[0], v3); + } else { + u0 = _mm_add_epi32(in[0], v0); + u1 = _mm_add_epi32(in[1], v1); + u2 = _mm_add_epi32(in[2], v2); + u3 = _mm_add_epi32(in[3], v3); + } + + v0 = _mm_packus_epi32(u0, u1); + v2 = _mm_packus_epi32(u2, u3); + + u0 = highbd_clamp_epi16(v0, bd); + u2 = highbd_clamp_epi16(v2, bd); + + v0 = _mm_unpacklo_epi64(u0, u0); + v1 = _mm_unpackhi_epi64(u0, u0); + v2 = _mm_unpacklo_epi64(u2, u2); + v3 = _mm_unpackhi_epi64(u2, u2); + + _mm_storel_epi64((__m128i *)(output + 0 * stride), v0); + _mm_storel_epi64((__m128i *)(output + 1 * stride), v1); + _mm_storel_epi64((__m128i *)(output + 2 * stride), v2); + _mm_storel_epi64((__m128i *)(output + 3 * stride), v3); +} + +static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + __m128i v[4]; + __m128i zero = _mm_set1_epi32(0); + __m128i fact = _mm_set1_epi32(NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0_low, a1_low; + __m128i a0_high, a1_high; + + offset = _mm_unpacklo_epi32(offset, zero); + + for (int i = 0; i < 4; i++) { + a0_low = _mm_mul_epi32(in[i], fact); + a0_low = _mm_add_epi32(a0_low, offset); + a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); + + a0_high = _mm_srli_si128(in[i], 4); + a0_high = _mm_mul_epi32(a0_high, fact); + a0_high = _mm_add_epi32(a0_high, offset); + a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); + + a1_low = _mm_unpacklo_epi32(a0_low, a0_high); + a1_high = _mm_unpackhi_epi32(a0_low, a0_high); + out[i] = _mm_unpacklo_epi64(a1_low, a1_high); + } + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); + } + + // Transpose for 4x4 + v[0] = _mm_unpacklo_epi32(out[0], out[1]); + v[1] = _mm_unpackhi_epi32(out[0], out[1]); + v[2] = _mm_unpacklo_epi32(out[2], out[3]); + v[3] = _mm_unpackhi_epi32(out[2], out[3]); + + out[0] = _mm_unpacklo_epi64(v[0], v[2]); + out[1] = _mm_unpackhi_epi64(v[0], v[2]); + out[2] = _mm_unpacklo_epi64(v[1], v[3]); + out[3] = _mm_unpackhi_epi64(v[1], v[3]); +} +void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[4]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); + + switch (tx_type) { + case DCT_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case DCT_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 
1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case FLIPADST_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case IDTX: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_DCT: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_DCT: + load_buffer_4x4(input, in); + idct4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_ADST: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_ADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_FLIPADST: + load_buffer_4x4(input, in); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + 0); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case H_FLIPADST: + load_buffer_4x4(input, in); + iadst4x4_sse4_1(in, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, + 0); 
+ write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; + default: assert(0); + } +} + +// 8x8 +static void load_buffer_8x8(const int32_t *coeff, __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(coeff + 0)); + in[1] = _mm_load_si128((const __m128i *)(coeff + 4)); + in[2] = _mm_load_si128((const __m128i *)(coeff + 8)); + in[3] = _mm_load_si128((const __m128i *)(coeff + 12)); + in[4] = _mm_load_si128((const __m128i *)(coeff + 16)); + in[5] = _mm_load_si128((const __m128i *)(coeff + 20)); + in[6] = _mm_load_si128((const __m128i *)(coeff + 24)); + in[7] = _mm_load_si128((const __m128i *)(coeff + 28)); + in[8] = _mm_load_si128((const __m128i *)(coeff + 32)); + in[9] = _mm_load_si128((const __m128i *)(coeff + 36)); + in[10] = _mm_load_si128((const __m128i *)(coeff + 40)); + in[11] = _mm_load_si128((const __m128i *)(coeff + 44)); + in[12] = _mm_load_si128((const __m128i *)(coeff + 48)); + in[13] = _mm_load_si128((const __m128i *)(coeff + 52)); + in[14] = _mm_load_si128((const __m128i *)(coeff + 56)); + in[15] = _mm_load_si128((const __m128i *)(coeff + 60)); +} + +static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; + + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). 
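+ // in[] holds the 8x8 block as 16 consecutive groups of four coefficients + // (see load_buffer_8x8), so row r sits in in[2 * r] (columns 0-3) and + // in[2 * r + 1] (columns 4-7); hence the in[k * 2 + col] indexing below.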
+ for (col = 0; col < 2; ++col) { + // stage 0 + // stage 1 + // stage 2 + u0 = in[0 * 2 + col]; + u1 = in[4 * 2 + col]; + u2 = in[2 * 2 + col]; + u3 = in[6 * 2 + col]; + + x = _mm_mullo_epi32(in[1 * 2 + col], cospi56); + y = _mm_mullo_epi32(in[7 * 2 + col], cospim8); + u4 = _mm_add_epi32(x, y); + u4 = _mm_add_epi32(u4, rnding); + u4 = _mm_srai_epi32(u4, bit); + + x = _mm_mullo_epi32(in[1 * 2 + col], cospi8); + y = _mm_mullo_epi32(in[7 * 2 + col], cospi56); + u7 = _mm_add_epi32(x, y); + u7 = _mm_add_epi32(u7, rnding); + u7 = _mm_srai_epi32(u7, bit); + + x = _mm_mullo_epi32(in[5 * 2 + col], cospi24); + y = _mm_mullo_epi32(in[3 * 2 + col], cospim40); + u5 = _mm_add_epi32(x, y); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + x = _mm_mullo_epi32(in[5 * 2 + col], cospi40); + y = _mm_mullo_epi32(in[3 * 2 + col], cospi24); + u6 = _mm_add_epi32(x, y); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + // stage 3 + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u1, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; + u7 = v7; + + x = _mm_mullo_epi32(v5, cospi32); + y = _mm_mullo_epi32(v6, cospi32); + u6 = _mm_add_epi32(y, x); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + u5 = _mm_sub_epi32(y, x); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + // stage 5 + addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo, + &clamp_hi); + addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } +} + +static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = 
_mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[8], v[8], x; + + // Even 8 points: 0, 2, ..., 14 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[14], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[14], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[10], cospi20); + x = _mm_mullo_epi32(in[4], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[10], cospi44); + x = _mm_mullo_epi32(in[4], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[6], cospi36); + x = _mm_mullo_epi32(in[8], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[6], cospi28); + x = _mm_mullo_epi32(in[8], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[2], cospi52); + x = _mm_mullo_epi32(in[12], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[2], cospi12); + x = _mm_mullo_epi32(in[12], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); 
+ u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[2] = _mm_sub_epi32(kZero, u[4]); + out[4] = u[6]; + out[6] = _mm_sub_epi32(kZero, u[2]); + out[8] = u[3]; + out[10] = _mm_sub_epi32(kZero, u[7]); + out[12] = u[5]; + out[14] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out, + &clamp_hi_out, out_shift); + } + + // Odd 8 points: 1, 3, ..., 15 + // stage 0 + // stage 1 + // stage 2 + // (1) + u[0] = _mm_mullo_epi32(in[15], cospi4); + x = _mm_mullo_epi32(in[1], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[15], cospi60); + x = _mm_mullo_epi32(in[1], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[11], cospi20); + x = _mm_mullo_epi32(in[5], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[11], cospi44); + x = _mm_mullo_epi32(in[5], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[7], cospi36); + x = _mm_mullo_epi32(in[9], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[7], cospi28); + x = _mm_mullo_epi32(in[9], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[3], cospi52); + x = _mm_mullo_epi32(in[13], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[3], cospi12); + x = _mm_mullo_epi32(in[13], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = 
_mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[1] = u[0]; + out[3] = _mm_sub_epi32(kZero, u[4]); + out[5] = u[6]; + out[7] = _mm_sub_epi32(kZero, u[2]); + out[9] = u[3]; + out[11] = _mm_sub_epi32(kZero, u[7]); + out[13] = u[5]; + out[15] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + out[0] = _mm_add_epi32(in[0], in[0]); + out[1] = _mm_add_epi32(in[1], in[1]); + out[2] = _mm_add_epi32(in[2], in[2]); + out[3] = _mm_add_epi32(in[3], in[3]); + out[4] = _mm_add_epi32(in[4], in[4]); + out[5] = _mm_add_epi32(in[5], in[5]); + out[6] = _mm_add_epi32(in[6], in[6]); + out[7] = _mm_add_epi32(in[7], in[7]); + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8); + } +} + +static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi, + int fliplr, int bd) { + __m128i x0, x1; + const __m128i zero = _mm_setzero_si128(); + + x0 = _mm_unpacklo_epi16(pred, zero); + x1 = _mm_unpackhi_epi16(pred, zero); + + if (fliplr) { + res_lo = _mm_shuffle_epi32(res_lo, 0x1B); + res_hi = _mm_shuffle_epi32(res_hi, 0x1B); + x0 = _mm_add_epi32(res_hi, x0); + x1 = _mm_add_epi32(res_lo, x1); + + } else { + x0 = 
_mm_add_epi32(res_lo, x0); + x1 = _mm_add_epi32(res_hi, x1); + } + + x0 = _mm_packus_epi32(x0, x1); + return highbd_clamp_epi16(x0, bd); +} + +static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride, + int fliplr, int flipud, int shift, int bd) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + + round_shift_8x8(in, shift); + + v0 = _mm_load_si128((__m128i const *)(output + 0 * stride)); + v1 = _mm_load_si128((__m128i const *)(output + 1 * stride)); + v2 = _mm_load_si128((__m128i const *)(output + 2 * stride)); + v3 = _mm_load_si128((__m128i const *)(output + 3 * stride)); + v4 = _mm_load_si128((__m128i const *)(output + 4 * stride)); + v5 = _mm_load_si128((__m128i const *)(output + 5 * stride)); + v6 = _mm_load_si128((__m128i const *)(output + 6 * stride)); + v7 = _mm_load_si128((__m128i const *)(output + 7 * stride)); + + if (flipud) { + u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd); + u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd); + u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd); + u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd); + u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd); + u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd); + u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd); + u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd); + } else { + u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd); + u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd); + u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd); + u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd); + u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd); + u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd); + u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd); + u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd); + } + + _mm_store_si128((__m128i *)(output + 0 * stride), u0); + _mm_store_si128((__m128i *)(output + 1 * stride), u1); + _mm_store_si128((__m128i *)(output + 2 * stride), u2); + _mm_store_si128((__m128i *)(output + 3 * stride), u3); + _mm_store_si128((__m128i *)(output + 4 * stride), u4); + _mm_store_si128((__m128i *)(output + 5 * stride), u5); + _mm_store_si128((__m128i *)(output + 6 * stride), u6); + _mm_store_si128((__m128i *)(output + 7 * stride), u7); +} + +void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[16], out[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); + break; + case DCT_ADST: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_DCT: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); + break; + case ADST_ADST: + 
load_buffer_8x8(input, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd); + break; + case FLIPADST_DCT: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd); + break; + case DCT_FLIPADST: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + idct8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, in); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + transpose_8x8(in, out); + iadst8x8_sse4_1(out, in, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd); + break; + default: assert(0); + } +} + +static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i x; + + // stage 0 + // stage 1 + // stage 2 + // stage 3 + x = _mm_mullo_epi32(in[0], cospi32); + x = _mm_add_epi32(x, rnding); + x = _mm_srai_epi32(x, bit); + + // stage 4 + // stage 5 + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + x = _mm_add_epi32(x, offset); + x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); + } + + x = _mm_max_epi32(x, clamp_lo); + x = _mm_min_epi32(x, clamp_hi); + out[0] = x; + out[1] = x; + out[2] = x; + out[3] = x; + out[4] = x; + out[5] = x; + out[6] = x; + out[7] = x; +} + +static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + + // stage 0 + // stage 1 + // stage 2 + u0 = in[0]; + u1 = in[4]; + u2 = in[2]; + u3 = in[6]; + + x = _mm_mullo_epi32(in[1], cospi56); + y = _mm_mullo_epi32(in[7], cospim8); + u4 = _mm_add_epi32(x, y); + u4 = _mm_add_epi32(u4, rnding); + u4 = _mm_srai_epi32(u4, bit); + + x = _mm_mullo_epi32(in[1], cospi8); + y = _mm_mullo_epi32(in[7], cospi56); + u7 = _mm_add_epi32(x, y); + u7 = _mm_add_epi32(u7, rnding); + u7 = _mm_srai_epi32(u7, bit); + + x = _mm_mullo_epi32(in[5], cospi24); + y = _mm_mullo_epi32(in[3], cospim40); + u5 = _mm_add_epi32(x, y); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + x = _mm_mullo_epi32(in[5], cospi40); + y = _mm_mullo_epi32(in[3], cospi24); + u6 = _mm_add_epi32(x, y); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + // stage 3 + x = _mm_mullo_epi32(u0, cospi32); + y = _mm_mullo_epi32(u1, cospi32); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + v1 = _mm_sub_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi48); + y = _mm_mullo_epi32(u3, cospim16); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi16); + y = _mm_mullo_epi32(u3, cospi48); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi); + + // stage 4 + addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi); + addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi); + u4 = v4; 
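+ // u4 and u7 pass through this stage unchanged; the x/y products below
+ // realize the stage-4 rotation by cospi[32], i.e. per 32-bit lane
+ //   u6 = ((v5 + v6) * cospi[32] + rnding) >> bit
+ //   u5 = ((v6 - v5) * cospi[32] + rnding) >> bit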
+ u7 = v7; + + x = _mm_mullo_epi32(v5, cospi32); + y = _mm_mullo_epi32(v6, cospi32); + u6 = _mm_add_epi32(y, x); + u6 = _mm_add_epi32(u6, rnding); + u6 = _mm_srai_epi32(u6, bit); + + u5 = _mm_sub_epi32(y, x); + u5 = _mm_add_epi32(u5, rnding); + u5 = _mm_srai_epi32(u5, bit); + + // stage 5 + addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi); + addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi); + addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + round_shift_4x4(out, out_shift); + round_shift_4x4(out + 4, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8); + } +} + +static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + __m128i u[8], x; + + // stage 0 + // stage 1 + // stage 2 + + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(kZero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // stage 3 + // stage 4 + __m128i temp1, temp2; + temp1 = _mm_mullo_epi32(u[0], cospi16); + x = _mm_mullo_epi32(u[1], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + u[4] = temp1; + + temp2 = _mm_mullo_epi32(u[0], cospi48); + x = _mm_mullo_epi32(u[1], cospi16); + u[5] = _mm_sub_epi32(temp2, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // stage 5 + // stage 6 + temp1 = _mm_mullo_epi32(u[0], cospi32); + x = _mm_mullo_epi32(u[1], cospi32); + u[2] = _mm_add_epi32(temp1, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(temp1, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + temp1 = _mm_mullo_epi32(u[4], cospi32); + x = _mm_mullo_epi32(u[5], cospi32); + u[6] = _mm_add_epi32(temp1, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(temp1, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, 
&clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i kZero = _mm_setzero_si128(); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[8], v[8], x; + + // stage 0 + // stage 1 + // stage 2 + + u[0] = _mm_mullo_epi32(in[7], cospi4); + x = _mm_mullo_epi32(in[0], cospi60); + u[0] = _mm_add_epi32(u[0], x); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_mullo_epi32(in[7], cospi60); + x = _mm_mullo_epi32(in[0], cospi4); + u[1] = _mm_sub_epi32(u[1], x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // (2) + u[2] = _mm_mullo_epi32(in[5], cospi20); + x = _mm_mullo_epi32(in[2], cospi44); + u[2] = _mm_add_epi32(u[2], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_mullo_epi32(in[5], cospi44); + x = _mm_mullo_epi32(in[2], cospi20); + u[3] = _mm_sub_epi32(u[3], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + // (3) + u[4] = _mm_mullo_epi32(in[3], cospi36); + x = _mm_mullo_epi32(in[4], cospi28); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(in[3], cospi28); + x = _mm_mullo_epi32(in[4], cospi36); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + // (4) + u[6] = _mm_mullo_epi32(in[1], cospi52); + x = _mm_mullo_epi32(in[6], cospi12); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(in[1], cospi12); + x = _mm_mullo_epi32(in[6], cospi52); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 3 + addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi); + + // stage 4 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[5] = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + u[5] = _mm_sub_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = 
_mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[6], cospim48); + x = _mm_mullo_epi32(v[7], cospi16); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_mullo_epi32(v[6], cospi16); + x = _mm_mullo_epi32(v[7], cospim48); + u[7] = _mm_sub_epi32(u[7], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 5 + addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi); + + // stage 6 + u[0] = v[0]; + u[1] = v[1]; + u[4] = v[4]; + u[5] = v[5]; + + v[0] = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + u[2] = _mm_add_epi32(v[0], x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(v[0], x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + v[0] = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + u[6] = _mm_add_epi32(v[0], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(v[0], x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + // stage 7 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(kZero, u[4]); + out[2] = u[6]; + out[3] = _mm_sub_epi32(kZero, u[2]); + out[4] = u[3]; + out[5] = _mm_sub_epi32(kZero, u[7]); + out[6] = u[5]; + out[7] = _mm_sub_epi32(kZero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out, + out_shift); + } +} + +static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + // stage 0 + // stage 1 + // stage 2 + // stage 3 + // stage 4 + in[0] = _mm_mullo_epi32(in[0], cospi32); + in[0] = _mm_add_epi32(in[0], rnding); + in[0] = _mm_srai_epi32(in[0], bit); + + // stage 5 + // stage 6 + // stage 7 + if (!do_cols) { + log_range = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + in[0] = _mm_add_epi32(in[0], offset); + in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift)); + } + } + + in[0] = _mm_max_epi32(in[0], clamp_lo); + in[0] = _mm_min_epi32(in[0], clamp_hi); + out[0] = in[0]; + out[1] = in[0]; + out[2] = in[0]; + out[3] = in[0]; + out[4] = in[0]; + out[5] = in[0]; + out[6] = in[0]; + out[7] = in[0]; + out[8] = in[0]; + out[9] = in[0]; + out[10] = in[0]; + out[11] = in[0]; + out[12] = in[0]; + out[13] = in[0]; + out[14] = in[0]; + out[15] = in[0]; +} + +static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], x, y; + // stage 0 + // stage 1 + u[0] = in[0]; + u[2] = in[4]; + u[4] = in[2]; + u[6] = in[6]; + u[8] = in[1]; + u[10] = in[5]; + u[12] = in[3]; + u[14] = in[7]; + + // stage 2 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + + u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); + + u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); + + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + + // stage 3 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit); + + addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + u[1] = u[0]; + + u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi); + + x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = x; + y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = y; + + // stage 5 + addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + x = _mm_mullo_epi32(u[5], cospi32); + y = _mm_mullo_epi32(u[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi); + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + u[10] = _mm_sub_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[13] = _mm_add_epi32(x, y); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + 
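+ // u[12] reuses the x and y products computed for u[11] just above: the
+ // pair is the usual cospi[32] rotation, (y - x) for u[11] and (x + y)
+ // for u[12], each rounded with rnding and arithmetically shifted by bit.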
u[12] = _mm_add_epi32(x, y); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + // stage 7 + addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } +} + +static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i v[16], x, y, temp1, temp2; + // stage 0 + // stage 1 + // stage 2 + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(x, rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(zero, x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + // stage 3 + v[8] = v[0]; + v[9] = v[1]; + + // stage 4 + temp1 = _mm_mullo_epi32(v[8], cospi8); + x = _mm_mullo_epi32(v[9], cospi56); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[8], cospi56); + x = _mm_mullo_epi32(v[9], cospi8); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[8] = temp1; + v[9] = temp2; + + // stage 5 + v[4] = v[0]; + v[5] = v[1]; + v[12] = v[8]; + v[13] = v[9]; + + // stage 6 + temp1 = _mm_mullo_epi32(v[4], cospi16); + x = _mm_mullo_epi32(v[5], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[4], cospi48); + x = _mm_mullo_epi32(v[5], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[4] = temp1; + v[5] = temp2; + + temp1 = _mm_mullo_epi32(v[12], cospi16); + x = _mm_mullo_epi32(v[13], cospi48); + temp1 = _mm_add_epi32(temp1, x); + temp1 = _mm_add_epi32(temp1, rnding); + temp1 = _mm_srai_epi32(temp1, bit); + + temp2 = _mm_mullo_epi32(v[12], cospi48); + x = _mm_mullo_epi32(v[13], cospi16); + temp2 = _mm_sub_epi32(temp2, x); + temp2 = _mm_add_epi32(temp2, rnding); + temp2 = _mm_srai_epi32(temp2, bit); + v[12] = temp1; + v[13] = temp2; + + // stage 7 + v[2] = v[0]; + v[3] = v[1]; + v[6] = v[4]; + v[7] = v[5]; + v[10] = v[8]; + v[11] = v[9]; + v[14] = v[12]; + v[15] = v[13]; + + // stage 8 + 
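+ // Stage 8 rotates each pair (v[2k], v[2k + 1]), k in {1, 3, 5, 7}, by
+ // cospi[32]: writing a = v[2k] and b = v[2k + 1], per 32-bit lane the
+ // even element becomes
+ //   ((a + b) * cospi[32] + rnding) >> bit
+ // and the odd element
+ //   ((a - b) * cospi[32] + rnding) >> bit.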
y = _mm_mullo_epi32(v[2], cospi32); + x = _mm_mullo_epi32(v[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + y = _mm_mullo_epi32(v[6], cospi32); + x = _mm_mullo_epi32(v[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + y = _mm_mullo_epi32(v[10], cospi32); + x = _mm_mullo_epi32(v[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + y = _mm_mullo_epi32(v[14], cospi32); + x = _mm_mullo_epi32(v[15], cospi32); + v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(zero, v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(zero, v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(zero, v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(zero, v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(zero, v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(zero, v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(zero, v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 
= _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i zero = _mm_setzero_si128(); + __m128i u[16], x, y; + + // stage 0 + // stage 1 + // stage 2 + x = _mm_mullo_epi32(in[0], cospi62); + u[0] = _mm_add_epi32(x, rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + x = _mm_mullo_epi32(in[0], cospi2); + u[1] = _mm_sub_epi32(zero, x); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + x = _mm_mullo_epi32(in[2], cospi54); + u[2] = _mm_add_epi32(x, rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + x = _mm_mullo_epi32(in[2], cospi10); + u[3] = _mm_sub_epi32(zero, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + x = _mm_mullo_epi32(in[4], cospi46); + u[4] = _mm_add_epi32(x, rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + x = _mm_mullo_epi32(in[4], cospi18); + u[5] = _mm_sub_epi32(zero, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + x = _mm_mullo_epi32(in[6], cospi38); + u[6] = _mm_add_epi32(x, rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + x = _mm_mullo_epi32(in[6], cospi26); + u[7] = _mm_sub_epi32(zero, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + u[8] = _mm_mullo_epi32(in[7], cospi34); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); + + u[9] = _mm_mullo_epi32(in[7], cospi30); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); + + u[10] = _mm_mullo_epi32(in[5], cospi42); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[11] = _mm_mullo_epi32(in[5], cospi22); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + u[12] = _mm_mullo_epi32(in[3], cospi50); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + u[13] = _mm_mullo_epi32(in[3], cospi14); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + u[14] = _mm_mullo_epi32(in[1], cospi58); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + u[15] = _mm_mullo_epi32(in[1], cospi6); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 3 + addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + y = 
_mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi56); + u[8] = _mm_mullo_epi32(u[8], cospi8); + u[8] = _mm_add_epi32(u[8], x); + u[8] = _mm_add_epi32(u[8], rnding); + u[8] = _mm_srai_epi32(u[8], bit); + + x = _mm_mullo_epi32(u[9], cospi8); + u[9] = _mm_sub_epi32(y, x); + u[9] = _mm_add_epi32(u[9], rnding); + u[9] = _mm_srai_epi32(u[9], bit); + + x = _mm_mullo_epi32(u[11], cospi24); + y = _mm_mullo_epi32(u[10], cospi24); + u[10] = _mm_mullo_epi32(u[10], cospi40); + u[10] = _mm_add_epi32(u[10], x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + x = _mm_mullo_epi32(u[11], cospi40); + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + x = _mm_mullo_epi32(u[13], cospi8); + y = _mm_mullo_epi32(u[12], cospi8); + u[12] = _mm_mullo_epi32(u[12], cospim56); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospim56); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi40); + y = _mm_mullo_epi32(u[14], cospi40); + u[14] = _mm_mullo_epi32(u[14], cospim24); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim24); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 5 + addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + x = _mm_mullo_epi32(u[5], cospi48); + y = _mm_mullo_epi32(u[4], cospi48); + u[4] = _mm_mullo_epi32(u[4], cospi16); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + x = _mm_mullo_epi32(u[5], cospi16); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + x = _mm_mullo_epi32(u[7], cospi16); + y = _mm_mullo_epi32(u[6], cospi16); + u[6] = _mm_mullo_epi32(u[6], cospim48); + u[6] = _mm_add_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + x = _mm_mullo_epi32(u[7], cospim48); + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + x = _mm_mullo_epi32(u[13], cospi48); + y = _mm_mullo_epi32(u[12], cospi48); + u[12] = _mm_mullo_epi32(u[12], cospi16); + u[12] = _mm_add_epi32(u[12], x); + u[12] = _mm_add_epi32(u[12], rnding); + u[12] = _mm_srai_epi32(u[12], bit); + + x = _mm_mullo_epi32(u[13], cospi16); + u[13] = _mm_sub_epi32(y, x); + u[13] = _mm_add_epi32(u[13], rnding); + u[13] = _mm_srai_epi32(u[13], bit); + + x = _mm_mullo_epi32(u[15], cospi16); + y = _mm_mullo_epi32(u[14], cospi16); + u[14] = _mm_mullo_epi32(u[14], cospim48); + u[14] = _mm_add_epi32(u[14], x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + x = _mm_mullo_epi32(u[15], cospim48); + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], 
rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 7 + addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + u[2] = _mm_add_epi32(y, x); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + u[3] = _mm_sub_epi32(y, x); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = _mm_sub_epi32(y, x); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + u[10] = _mm_add_epi32(y, x); + u[10] = _mm_add_epi32(u[10], rnding); + u[10] = _mm_srai_epi32(u[10], bit); + + u[11] = _mm_sub_epi32(y, x); + u[11] = _mm_add_epi32(u[11], rnding); + u[11] = _mm_srai_epi32(u[11], bit); + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + u[14] = _mm_add_epi32(y, x); + u[14] = _mm_add_epi32(u[14], rnding); + u[14] = _mm_srai_epi32(u[14], bit); + + u[15] = _mm_sub_epi32(y, x); + u[15] = _mm_add_epi32(u[15], rnding); + u[15] = _mm_srai_epi32(u[15], bit); + + // stage 9 + if (do_cols) { + out[0] = u[0]; + out[1] = _mm_sub_epi32(zero, u[8]); + out[2] = u[12]; + out[3] = _mm_sub_epi32(zero, u[4]); + out[4] = u[6]; + out[5] = _mm_sub_epi32(zero, u[14]); + out[6] = u[10]; + out[7] = _mm_sub_epi32(zero, u[2]); + out[8] = u[3]; + out[9] = _mm_sub_epi32(zero, u[11]); + out[10] = u[15]; + out[11] = _mm_sub_epi32(zero, u[7]); + out[12] = u[5]; + out[13] = _mm_sub_epi32(zero, u[13]); + out[14] = u[9]; + out[15] = _mm_sub_epi32(zero, u[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} + +static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + 
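+ // By the av1_txfm convention, cospi[i] holds (approximately)
+ // cos(i * PI / 128) scaled by 1 << bit, and the cospimNN constants are
+ // simply the negated entries, broadcast once so every butterfly below
+ // reduces to a mullo/add/round/shift sequence.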
const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i u[16], v[16], x, y; + + { + // stage 0 + // stage 1 + u[0] = in[0]; + u[1] = in[8]; + u[2] = in[4]; + u[3] = in[12]; + u[4] = in[2]; + u[5] = in[10]; + u[6] = in[6]; + u[7] = in[14]; + u[8] = in[1]; + u[9] = in[9]; + u[10] = in[5]; + u[11] = in[13]; + u[12] = in[3]; + u[13] = in[11]; + u[14] = in[7]; + u[15] = in[15]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit); + + // stage 3 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit); + u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit); + u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit); + addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi); + + // stage 4 + x = _mm_mullo_epi32(u[0], cospi32); + y = _mm_mullo_epi32(u[1], cospi32); + v[0] = _mm_add_epi32(x, y); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_sub_epi32(x, y); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit); + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, 
&clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[11] = u[11]; + v[12] = u[12]; + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + v[15] = u[15]; + + // stage 5 + addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + u[4] = v[4]; + + x = _mm_mullo_epi32(v[5], cospi32); + y = _mm_mullo_epi32(v[6], cospi32); + u[5] = _mm_sub_epi32(y, x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_add_epi32(y, x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = v[7]; + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi); + v[8] = u[8]; + v[9] = u[9]; + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[13], cospi32); + v[10] = _mm_sub_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_add_epi32(x, y); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + x = _mm_mullo_epi32(u[11], cospi32); + y = _mm_mullo_epi32(u[12], cospi32); + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_add_epi32(x, y); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[14] = u[14]; + v[15] = u[15]; + + // stage 7 + addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16); + } + } +} + +static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi26 = 
_mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + const __m128i zero = _mm_setzero_si128(); + __m128i u[16], v[16], x, y; + // Calculate the column 0, 1, 2, 3 + // stage 0 + // stage 1 + // stage 2 + v[0] = _mm_mullo_epi32(in[15], cospi2); + x = _mm_mullo_epi32(in[0], cospi62); + v[0] = _mm_add_epi32(v[0], x); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_mullo_epi32(in[15], cospi62); + x = _mm_mullo_epi32(in[0], cospi2); + v[1] = _mm_sub_epi32(v[1], x); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(in[13], cospi10); + x = _mm_mullo_epi32(in[2], cospi54); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(in[13], cospi54); + x = _mm_mullo_epi32(in[2], cospi10); + v[3] = _mm_sub_epi32(v[3], x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_mullo_epi32(in[11], cospi18); + x = _mm_mullo_epi32(in[4], cospi46); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(in[11], cospi46); + x = _mm_mullo_epi32(in[4], cospi18); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(in[9], cospi26); + x = _mm_mullo_epi32(in[6], cospi38); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(in[9], cospi38); + x = _mm_mullo_epi32(in[6], cospi26); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = _mm_mullo_epi32(in[7], cospi34); + x = _mm_mullo_epi32(in[8], cospi30); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(in[7], cospi30); + x = _mm_mullo_epi32(in[8], cospi34); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(in[5], cospi42); + x = _mm_mullo_epi32(in[10], cospi22); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = 
_mm_mullo_epi32(in[5], cospi22); + x = _mm_mullo_epi32(in[10], cospi42); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(in[3], cospi50); + x = _mm_mullo_epi32(in[12], cospi14); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(in[3], cospi14); + x = _mm_mullo_epi32(in[12], cospi50); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(in[1], cospi58); + x = _mm_mullo_epi32(in[14], cospi6); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(in[1], cospi6); + x = _mm_mullo_epi32(in[14], cospi58); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 3 + addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi8); + x = _mm_mullo_epi32(u[9], cospi56); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[9] = _mm_mullo_epi32(u[8], cospi56); + x = _mm_mullo_epi32(u[9], cospi8); + v[9] = _mm_sub_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi40); + x = _mm_mullo_epi32(u[11], cospi24); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_mullo_epi32(u[10], cospi24); + x = _mm_mullo_epi32(u[11], cospi40); + v[11] = _mm_sub_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[12], cospim56); + x = _mm_mullo_epi32(u[13], cospi8); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi8); + x = _mm_mullo_epi32(u[13], cospim56); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim24); + x = _mm_mullo_epi32(u[15], cospi40); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi40); + x = _mm_mullo_epi32(u[15], cospim24); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 5 + addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, 
&clamp_hi); + addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + + v[4] = _mm_mullo_epi32(u[4], cospi16); + x = _mm_mullo_epi32(u[5], cospi48); + v[4] = _mm_add_epi32(v[4], x); + v[4] = _mm_add_epi32(v[4], rnding); + v[4] = _mm_srai_epi32(v[4], bit); + + v[5] = _mm_mullo_epi32(u[4], cospi48); + x = _mm_mullo_epi32(u[5], cospi16); + v[5] = _mm_sub_epi32(v[5], x); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + v[6] = _mm_mullo_epi32(u[6], cospim48); + x = _mm_mullo_epi32(u[7], cospi16); + v[6] = _mm_add_epi32(v[6], x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_mullo_epi32(u[6], cospi16); + x = _mm_mullo_epi32(u[7], cospim48); + v[7] = _mm_sub_epi32(v[7], x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + + v[12] = _mm_mullo_epi32(u[12], cospi16); + x = _mm_mullo_epi32(u[13], cospi48); + v[12] = _mm_add_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + v[13] = _mm_mullo_epi32(u[12], cospi48); + x = _mm_mullo_epi32(u[13], cospi16); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[14] = _mm_mullo_epi32(u[14], cospim48); + x = _mm_mullo_epi32(u[15], cospi16); + v[14] = _mm_add_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_mullo_epi32(u[14], cospi16); + x = _mm_mullo_epi32(u[15], cospim48); + v[15] = _mm_sub_epi32(v[15], x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 7 + addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi); + + // stage 8 + v[0] = u[0]; + v[1] = u[1]; + + y = _mm_mullo_epi32(u[2], cospi32); + x = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(y, x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(y, x); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + y = _mm_mullo_epi32(u[6], cospi32); + x = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(y, x); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(y, x); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + y = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(y, x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(y, x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + y = _mm_mullo_epi32(u[14], cospi32); + x = _mm_mullo_epi32(u[15], cospi32); + 
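+  // Here y = cospi32 * u[14] and x = cospi32 * u[15], so the add/sub pair
+  // below evaluates round_shift(cospi32 * (u[14] +/- u[15]), bit): the
+  // final pi/4 rotation of the iadst16 flow, with the rnding add and
+  // arithmetic shift giving round-to-nearest.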
v[14] = _mm_add_epi32(y, x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(y, x); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 9 + if (do_cols) { + out[0] = v[0]; + out[1] = _mm_sub_epi32(zero, v[8]); + out[2] = v[12]; + out[3] = _mm_sub_epi32(zero, v[4]); + out[4] = v[6]; + out[5] = _mm_sub_epi32(zero, v[14]); + out[6] = v[10]; + out[7] = _mm_sub_epi32(zero, v[2]); + out[8] = v[3]; + out[9] = _mm_sub_epi32(zero, v[11]); + out[10] = v[15]; + out[11] = _mm_sub_epi32(zero, v[7]); + out[12] = v[5]; + out[13] = _mm_sub_epi32(zero, v[13]); + out[14] = v[9]; + out[15] = _mm_sub_epi32(zero, v[1]); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out, + out_shift); + neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out, + &clamp_hi_out, out_shift); + neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out, + &clamp_hi_out, out_shift); + } +} +static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + __m128i fact = _mm_set1_epi32(2 * NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0_low, a0_high, a1_low, a1_high; + __m128i zero = _mm_set1_epi32(0); + offset = _mm_unpacklo_epi32(offset, zero); + + for (int i = 0; i < 16; i++) { + a0_low = _mm_mul_epi32(in[i], fact); + a0_low = _mm_add_epi32(a0_low, offset); + a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits); + + a0_high = _mm_srli_si128(in[i], 4); + a0_high = _mm_mul_epi32(a0_high, fact); + a0_high = _mm_add_epi32(a0_high, offset); + a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits); + + a1_low = _mm_unpacklo_epi32(a0_low, a0_high); + a1_high = _mm_unpackhi_epi32(a0_low, a0_high); + out[i] = _mm_unpacklo_epi64(a1_low, a1_high); + } + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + round_shift_8x8(out, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16); + } +} +static INLINE void idct64_stage8_sse4_1( + __m128i *u, const __m128i *cospim32, const __m128i *cospi32, + const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, + const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + int i; + __m128i temp1, temp2, temp3, temp4; + temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit); + u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit); + u[10] = temp1; + temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit); + u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit); + u[11] = temp2; + + for (i = 16; i < 20; ++i) { + 
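+    // For i in [16, 20): i ^ 7 mirrors i within u[16..23], while i ^ 15 and
+    // i ^ 8 mirror it within u[24..31], so each iteration performs two
+    // add/sub butterflies across the 16..31 block without a second index.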
addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, + clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit); + temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit); + temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit); + temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit); + u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit); + u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit); + u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit); + u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit); + u[36] = temp1; + u[37] = temp2; + u[38] = temp3; + u[39] = temp4; + + temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit); + temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit); + temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit); + temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit); + u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit); + u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit); + u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit); + u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; +} + +static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + int i; + __m128i temp1, temp2, temp3, temp4; + for (i = 0; i < 8; ++i) { + addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit); + u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit); + u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit); + u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit); + u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit); + u[20] = temp1; + u[21] = temp2; + u[22] = temp3; + u[23] = temp4; + for (i = 32; i < 40; i++) { + addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi); + } +} + +static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32, + const __m128i *cospi32, + const __m128i *clamp_lo, + const __m128i *clamp_hi, + const __m128i *rnding, int bit) { + __m128i temp1, temp2, temp3, temp4; + for (int i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi); + } + + temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit); + u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit); + u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit); + u[54] = 
half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit); + u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit); + u[40] = temp1; + u[41] = temp2; + u[42] = temp3; + u[43] = temp4; + + temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit); + temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit); + temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit); + temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit); + u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit); + u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit); + u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit); + u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit); + u[44] = temp1; + u[45] = temp2; + u[46] = temp3; + u[47] = temp4; +} + +static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols, + int bd, int out_shift, + const __m128i *clamp_lo, + const __m128i *clamp_hi) { + for (int i = 0; i < 32; i++) { + addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + + for (int i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out, + 4); + } + } +} + +static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8));
+  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+
+  {
+    __m128i x;
+
+    // stage 1
+    // stage 2
+    // stage 3
+    // stage 4
+    // stage 5
+    // stage 6
+    x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
+
+    // stage 7
+    // stage 8
+    // stage 9
+    // stage 10
+    // stage 11
+    // With only the DC coefficient non-zero, every stage other than the
+    // single rotation above reduces to a pass-through.
+    if (!do_cols) {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+      if (out_shift != 0) {
+        __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+        x = _mm_add_epi32(x, offset);
+        x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
+      }
+    }
+    x = _mm_max_epi32(x, clamp_lo);
+    x = _mm_min_epi32(x, clamp_hi);
+    // Broadcast the clamped DC value to all 64 output rows.
+    for (int i = 0; i < 64; ++i) out[i] = x;
+  }
+}
+
+static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  int i, j;
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ?
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + + { + __m128i u[64]; + + // stage 1 + u[0] = in[0]; + u[8] = in[4]; + u[16] = in[2]; + u[24] = in[6]; + u[32] = in[1]; + u[40] = in[5]; + u[48] = in[3]; + u[56] = in[7]; + + // stage 2 + u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[38] = u[39]; + u[41] = u[40]; + u[46] = u[47]; + u[49] = u[48]; + u[54] = u[55]; + u[57] = u[56]; + u[62] = u[63]; + + // stage 4 + __m128i temp1, temp2; + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + u[17] = u[16]; + u[22] = u[23]; + u[25] = u[24]; + u[30] = u[31]; + + temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + u[62] 
= half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = temp1; + + temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = temp2; + + temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = temp1; + + temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[46] = temp2; + + // stage 5 + u[9] = u[8]; + u[14] = u[15]; + + temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = temp1; + + temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[22] = temp2; + + u[35] = u[32]; + u[34] = u[33]; + u[36] = u[39]; + u[37] = u[38]; + u[43] = u[40]; + u[42] = u[41]; + u[44] = u[47]; + u[45] = u[46]; + u[51] = u[48]; + u[50] = u[49]; + u[52] = u[55]; + u[53] = u[54]; + u[59] = u[56]; + u[58] = u[57]; + u[60] = u[63]; + u[61] = u[62]; + + // stage 6 + temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[0] = temp1; + + temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = temp2; + u[19] = u[16]; + u[18] = u[17]; + u[20] = u[23]; + u[21] = u[22]; + u[27] = u[24]; + u[26] = u[25]; + u[28] = u[31]; + u[29] = u[30]; + + temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = temp1; + temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[35] = temp2; + temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[36] = temp1; + temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[37] = temp2; + temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = temp1; + temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[43] = temp2; + temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[44] = temp1; + temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[45] = temp2; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + u[11] = u[8]; + u[10] = u[9]; + u[12] = u[15]; + u[13] = u[14]; + + temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = temp1; + temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); 
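+    // Each rotation pair reads both original u[] lanes before either is
+    // overwritten: the new low-lane value is parked in temp1/temp2 and
+    // committed only after the high lane has been recomputed.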
+ u[19] = temp2; + temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[20] = temp1; + temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[21] = temp2; + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + u[7] = u[0]; + u[6] = u[1]; + u[5] = u[2]; + u[4] = u[3]; + + idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi9 = _mm_set1_epi32(cospi[9]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi11 = _mm_set1_epi32(cospi[11]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi13 = _mm_set1_epi32(cospi[13]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi15 = _mm_set1_epi32(cospi[15]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i 
cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + + { + __m128i u[64]; + __m128i tmp1, tmp2, tmp3, tmp4; + // stage 1 + u[0] = in[0]; + u[32] = in[1]; + u[36] = in[9]; + u[40] = in[5]; + u[44] = in[13]; + u[48] = in[3]; + u[52] = in[11]; + u[56] = in[7]; + u[60] = in[15]; + u[16] = in[2]; + u[20] = in[10]; + u[24] = in[6]; + u[28] = in[14]; + u[4] = in[8]; + u[8] = in[4]; + u[12] = in[12]; + + // stage 2 + u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); + u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); + u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); + u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); + u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); + u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); + u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); + u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); + + // stage 3 + u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit); + u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit); + u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit); + u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit); + u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit); + u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit); + u[33] = u[32]; + u[34] = u[35]; + u[37] = u[36]; + u[38] = u[39]; + u[41] = u[40]; + u[42] = u[43]; + u[45] = u[44]; + u[46] = u[47]; + u[49] = u[48]; + u[50] = u[51]; + u[53] = u[52]; + u[54] = u[55]; + u[57] = u[56]; + u[58] = u[59]; + u[61] = u[60]; + u[62] = u[63]; + + // stage 4 + u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + + u[17] = u[16]; + u[18] = u[19]; + u[21] = u[20]; + u[22] = u[23]; + u[25] = u[24]; + u[26] = u[27]; + u[29] = u[28]; + u[30] = u[31]; + + tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + tmp3 = 
half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + u[33] = tmp1; + u[34] = tmp2; + u[37] = tmp3; + u[38] = tmp4; + + tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + u[41] = tmp1; + u[42] = tmp2; + u[45] = tmp3; + u[46] = tmp4; + + // stage 5 + u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit); + u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit); + + u[9] = u[8]; + u[10] = u[11]; + u[13] = u[12]; + u[14] = u[15]; + + tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit); + u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit); + u[17] = tmp1; + u[18] = tmp2; + u[21] = tmp3; + u[22] = tmp4; + + for (i = 32; i < 64; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + u[0] = tmp1; + u[5] = u[4]; + u[6] = u[7]; + + tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + u[9] = tmp1; + tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + u[10] = tmp2; + + for (i = 16; i < 32; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim56, &u[36], 
&cospim8, &u[59], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + u[34] = tmp1; + u[35] = tmp2; + u[36] = tmp3; + u[37] = tmp4; + + tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + u[42] = tmp1; + u[43] = tmp2; + u[44] = tmp3; + u[45] = tmp4; + + // stage 7 + u[3] = u[0]; + u[2] = u[1]; + tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit); + u[5] = tmp1; + addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit); + tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit); + tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit); + tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit); + u[18] = tmp1; + u[19] = tmp2; + u[20] = tmp3; + u[21] = tmp4; + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi); + } + + idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rnding, bit); + + // stage 9 + idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 10 + idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding, + bit); + + // stage 11 + idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); + } +} + +static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + int i, j; + const int32_t *cospi = cospi_arr(bit); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + const __m128i cospi1 = _mm_set1_epi32(cospi[1]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi3 = _mm_set1_epi32(cospi[3]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi5 = _mm_set1_epi32(cospi[5]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi7 = _mm_set1_epi32(cospi[7]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi9 = _mm_set1_epi32(cospi[9]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi11 = _mm_set1_epi32(cospi[11]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi13 = _mm_set1_epi32(cospi[13]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi15 = _mm_set1_epi32(cospi[15]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi17 = _mm_set1_epi32(cospi[17]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi19 = _mm_set1_epi32(cospi[19]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi21 = _mm_set1_epi32(cospi[21]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi23 = _mm_set1_epi32(cospi[23]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi25 = _mm_set1_epi32(cospi[25]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi27 = _mm_set1_epi32(cospi[27]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi29 = _mm_set1_epi32(cospi[29]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi31 = _mm_set1_epi32(cospi[31]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi35 = _mm_set1_epi32(cospi[35]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi39 = _mm_set1_epi32(cospi[39]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi43 = _mm_set1_epi32(cospi[43]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi47 = _mm_set1_epi32(cospi[47]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi51 = _mm_set1_epi32(cospi[51]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi55 = _mm_set1_epi32(cospi[55]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi59 = _mm_set1_epi32(cospi[59]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi63 = _mm_set1_epi32(cospi[63]); + + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim12 = _mm_set1_epi32(-cospi[12]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim28 = _mm_set1_epi32(-cospi[28]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospim33 = _mm_set1_epi32(-cospi[33]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim37 = _mm_set1_epi32(-cospi[37]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + 
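+  // Negated cosines are splatted up front because the SSE4.1 butterfly
+  // helper only multiplies and accumulates; folding the sign into the
+  // constant avoids a per-butterfly subtract. As a scalar sketch (not the
+  // exact library helper; it assumes the products are widened to 64 bits),
+  // each lane of half_btf_sse4_1() computes:
+  //   int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
+  //                    int bit) {
+  //     const int64_t t = (int64_t)w0 * in0 + (int64_t)w1 * in1;
+  //     return (int32_t)((t + ((int64_t)1 << (bit - 1))) >> bit);
+  //   }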
const __m128i cospim41 = _mm_set1_epi32(-cospi[41]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim44 = _mm_set1_epi32(-cospi[44]); + const __m128i cospim45 = _mm_set1_epi32(-cospi[45]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim49 = _mm_set1_epi32(-cospi[49]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim53 = _mm_set1_epi32(-cospi[53]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim57 = _mm_set1_epi32(-cospi[57]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim60 = _mm_set1_epi32(-cospi[60]); + const __m128i cospim61 = _mm_set1_epi32(-cospi[61]); + + { + __m128i u[64], v[64]; + + // stage 1 + u[32] = in[1]; + u[34] = in[17]; + u[36] = in[9]; + u[38] = in[25]; + u[40] = in[5]; + u[42] = in[21]; + u[44] = in[13]; + u[46] = in[29]; + u[48] = in[3]; + u[50] = in[19]; + u[52] = in[11]; + u[54] = in[27]; + u[56] = in[7]; + u[58] = in[23]; + u[60] = in[15]; + u[62] = in[31]; + + v[16] = in[2]; + v[18] = in[18]; + v[20] = in[10]; + v[22] = in[26]; + v[24] = in[6]; + v[26] = in[22]; + v[28] = in[14]; + v[30] = in[30]; + + u[8] = in[4]; + u[10] = in[20]; + u[12] = in[12]; + u[14] = in[28]; + + v[4] = in[8]; + v[6] = in[24]; + + u[0] = in[0]; + u[2] = in[16]; + + // stage 2 + v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit); + v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit); + v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit); + v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit); + v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit); + v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit); + v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit); + v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit); + v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit); + v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit); + v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit); + v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit); + v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit); + v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit); + v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit); + v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit); + v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit); + v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit); + v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit); + v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit); + v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit); + v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit); + v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit); + v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit); + v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit); + v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit); + v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit); + v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit); + v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit); + v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit); + v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit); + v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit); + + // stage 3 + u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit); + u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit); + u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit); + 
u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit); + u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit); + u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit); + u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit); + u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit); + u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit); + u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit); + u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit); + u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit); + u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit); + u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit); + u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit); + u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit); + + for (i = 32; i < 64; i += 4) { + addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + // stage 4 + v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit); + v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit); + v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit); + v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit); + v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit); + v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit); + v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit); + v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit); + + for (i = 16; i < 32; i += 4) { + addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit); + v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit); + v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit); + v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit); + v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit); + v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit); + v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit); + + // stage 5 + u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit); + u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit); + u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit); + u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit); + + for (i = 8; i < 16; i += 4) { + addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo, + &clamp_hi); + } + + for (i = 16; i < 32; i += 4) { + u[i + 0] = v[i + 0]; + u[i + 
3] = v[i + 3]; + } + + u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit); + u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit); + u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit); + u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit); + u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit); + + for (i = 32; i < 64; i += 8) { + addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo, + &clamp_hi); + } + + // stage 6 + v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit); + v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit); + v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit); + + addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi); + + for (i = 8; i < 16; i += 4) { + v[i + 0] = u[i + 0]; + v[i + 3] = u[i + 3]; + } + + v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit); + v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit); + + for (i = 16; i < 32; i += 8) { + addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo, + &clamp_hi); + + addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo, + &clamp_hi); + addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 64; i += 8) { + v[i + 0] = u[i + 0]; + v[i + 1] = u[i + 1]; + v[i + 6] = u[i + 6]; + v[i + 7] = u[i + 7]; + } + + v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit); + v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit); + v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit); + v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit); + v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit); + v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit); + v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit); + v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit); + + // stage 7 + 
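+    // Stage 7 finishes the 8-point core (butterflies on v[0..3], a pi/4
+    // rotation of v[5]/v[6]) and the 16-point half (butterflies on
+    // v[8..15]), rotates lanes 18..21 against 26..29, and applies the
+    // XOR-indexed butterflies to the 32..63 block below.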
addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi); + + u[4] = v[4]; + u[7] = v[7]; + u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit); + u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit); + + addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi); + + for (i = 16; i < 32; i += 8) { + u[i + 0] = v[i + 0]; + u[i + 1] = v[i + 1]; + u[i + 6] = v[i + 6]; + u[i + 7] = v[i + 7]; + } + + u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit); + u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit); + u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit); + u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit); + u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit); + u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit); + + for (i = 32; i < 64; i += 16) { + for (j = i; j < i + 4; j++) { + addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo, + &clamp_hi); + } + } + + // stage 8 + for (i = 0; i < 4; ++i) { + addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi); + } + + v[8] = u[8]; + v[9] = u[9]; + v[14] = u[14]; + v[15] = u[15]; + + v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit); + v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit); + + for (i = 16; i < 20; ++i) { + addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi); + addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo, + &clamp_hi); + } + + for (i = 32; i < 36; ++i) { + v[i] = u[i]; + v[i + 12] = u[i + 12]; + v[i + 16] = u[i + 16]; + v[i + 28] = u[i + 28]; + } + + v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit); + v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit); + v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit); + v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit); + v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit); + v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit); + v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit); + v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit); + v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], 
&rnding, bit); + v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit); + + // stage 9 + for (i = 0; i < 8; ++i) { + addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi); + } + + for (i = 16; i < 20; ++i) { + u[i] = v[i]; + u[i + 12] = v[i + 12]; + } + + u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit); + u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit); + u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit); + u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit); + u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit); + u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit); + u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit); + u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit); + + for (i = 32; i < 40; i++) { + addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi); + } + + for (i = 48; i < 56; i++) { + addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi); + } + + // stage 10 + for (i = 0; i < 16; i++) { + addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi); + } + + for (i = 32; i < 40; i++) v[i] = u[i]; + + v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit); + v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit); + v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit); + v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit); + v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit); + v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit); + v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit); + v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit); + v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit); + v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit); + v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit); + v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit); + v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit); + v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit); + v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit); + v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit); + + for (i = 56; i < 64; i++) v[i] = u[i]; + + // stage 11 + for (i = 0; i < 32; i++) { + addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo, + &clamp_hi); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = + _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + for (i = 0; i < 64; i += 4) { + round_shift_4x4(out + i, out_shift); + highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, + &clamp_hi_out, 4); + } + } + } +} + +static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1; + + // stage 0 + // stage 1 + bf1 = in[0]; + + // stage 2 + // stage 3 + // stage 4 + // stage 5 + bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit); + + // stage 6 + // stage 7 + // stage 8 + // stage 9 + if (do_cols) { + bf1 = _mm_max_epi32(bf1, clamp_lo); + bf1 = _mm_min_epi32(bf1, clamp_hi); + } else { + const int log_range_out = AOMMAX(16, bd + 6); + clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1))); + clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + if (out_shift != 0) { + __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1); + bf1 = _mm_add_epi32(bf1, offset); + bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift)); + } + } + + bf1 = _mm_max_epi32(bf1, clamp_lo); + bf1 = _mm_min_epi32(bf1, clamp_hi); + out[0] = bf1; + out[1] = bf1; + out[2] = bf1; + out[3] = bf1; + out[4] = bf1; + out[5] = bf1; + out[6] = bf1; + out[7] = bf1; + out[8] = bf1; + out[9] = bf1; + out[10] = bf1; + out[11] = bf1; + out[12] = bf1; + out[13] = bf1; + out[14] = bf1; + out[15] = bf1; + out[16] = bf1; + out[17] = bf1; + out[18] = bf1; + out[19] = bf1; + out[20] = bf1; + out[21] = bf1; + out[22] = bf1; + out[23] = bf1; + out[24] = bf1; + out[25] = bf1; + out[26] = bf1; + out[27] = bf1; + out[28] = bf1; + out[29] = bf1; + out[30] = bf1; + out[31] = bf1; +} + +static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32]; + + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[4] = in[4]; + bf1[8] = in[2]; + bf1[12] = in[6]; + bf1[16] = in[1]; + bf1[20] = in[5]; + bf1[24] = in[3]; + bf1[28] = in[7]; + + // stage 2 + bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); + bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); + bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); + + bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); + bf1[17] = bf1[16]; + bf1[18] = bf1[19]; + bf1[21] = bf1[20]; + bf1[22] = bf1[23]; + bf1[25] = bf1[24]; + bf1[26] = bf1[27]; + bf1[29] = bf1[28]; + bf1[30] = bf1[31]; + + // stage 4 : + bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); + + bf1[9] = bf1[8]; + bf1[10] = bf1[11]; + bf1[13] = bf1[12]; + bf1[14] = bf1[15]; + + idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[5] = bf1[4]; + bf1[6] = bf1[7]; + + idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + bf1[3] = bf1[0]; + bf1[2] = bf1[1]; + + idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 9 + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); +} + +static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = 
_mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32]; + + // stage 0 + // stage 1 + + bf1[0] = in[0]; + bf1[2] = in[8]; + bf1[4] = in[4]; + bf1[6] = in[12]; + bf1[8] = in[2]; + bf1[10] = in[10]; + bf1[12] = in[6]; + bf1[14] = in[14]; + bf1[16] = in[1]; + bf1[18] = in[9]; + bf1[20] = in[5]; + bf1[22] = in[13]; + bf1[24] = in[3]; + bf1[26] = in[11]; + bf1[28] = in[7]; + bf1[30] = in[15]; + + // stage 2 + bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit); + bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit); + bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit); + bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit); + bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit); + bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit); + bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit); + bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit); + bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit); + bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit); + bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit); + bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit); + bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit); + bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit); + bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit); + bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit); + + // stage 3 + bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit); + bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit); + bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit); + bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit); + bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit); + bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit); + bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit); + bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit); + + addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 
21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + // stage 4 + bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit); + bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit); + bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit); + bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit); + + addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi); + + idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40, + &cospi24, &cospi40, &cospim24, &rounding, bit); + + // stage 5 + bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit); + bf1[1] = bf1[0]; + bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit); + bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit); + + addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + + idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo, + &clamp_hi, &rounding, bit); + + // stage 6 + addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi); + + idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16, + &cospim48, &clamp_lo, &clamp_hi, &rounding, bit); + + // stage 7 + idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + + // stage 8 + idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi, + &rounding, bit); + // stage 9 + idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi); +} + +static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); + const __m128i 
cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i rounding = _mm_set1_epi32(1 << (bit - 1)); + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i bf1[32], bf0[32]; + + // stage 0 + // stage 1 + bf1[0] = in[0]; + bf1[1] = in[16]; + bf1[2] = in[8]; + bf1[3] = in[24]; + bf1[4] = in[4]; + bf1[5] = in[20]; + bf1[6] = in[12]; + bf1[7] = in[28]; + bf1[8] = in[2]; + bf1[9] = in[18]; + bf1[10] = in[10]; + bf1[11] = in[26]; + bf1[12] = in[6]; + bf1[13] = in[22]; + bf1[14] = in[14]; + bf1[15] = in[30]; + bf1[16] = in[1]; + bf1[17] = in[17]; + bf1[18] = in[9]; + bf1[19] = in[25]; + bf1[20] = in[5]; + bf1[21] = in[21]; + bf1[22] = in[13]; + bf1[23] = in[29]; + bf1[24] = in[3]; + bf1[25] = in[19]; + bf1[26] = in[11]; + bf1[27] = in[27]; + bf1[28] = in[7]; + bf1[29] = in[23]; + bf1[30] = in[15]; + bf1[31] = in[31]; + + // stage 2 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = bf1[4]; + bf0[5] = bf1[5]; + bf0[6] = bf1[6]; + bf0[7] = bf1[7]; + bf0[8] = bf1[8]; + bf0[9] = bf1[9]; + bf0[10] = bf1[10]; + bf0[11] = bf1[11]; + bf0[12] = bf1[12]; + bf0[13] = bf1[13]; + bf0[14] = bf1[14]; + bf0[15] = bf1[15]; + bf0[16] = + half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit); + bf0[17] = + half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit); + bf0[25] = + 
half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit); + bf0[30] = + half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit); + bf0[31] = + half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit); + + // stage 3 + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = + half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit); + bf1[9] = + half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit); + bf1[15] = + half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit); + + addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi); + + // stage 4 + bf0[0] = bf1[0]; + bf0[1] = bf1[1]; + bf0[2] = bf1[2]; + bf0[3] = bf1[3]; + bf0[4] = + half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit); + bf0[5] = + half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit); + bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit); + + addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi); + + bf0[16] = bf1[16]; + bf0[17] = + half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit); + bf0[18] = + half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit); + bf0[19] = bf1[19]; + bf0[20] = bf1[20]; + bf0[21] = + half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit); + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = + half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit); + bf0[27] = bf1[27]; + bf0[28] = bf1[28]; + bf0[29] = + half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], 
&rounding, bit); + bf0[30] = + half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit); + bf0[31] = bf1[31]; + + // stage 5 + bf1[0] = + half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit); + bf1[1] = + half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit); + bf1[2] = + half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit); + bf1[3] = + half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit); + addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = + half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit); + bf1[10] = + half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = + half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit); + bf1[14] = + half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit); + bf1[15] = bf0[15]; + addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi); + + // stage 6 + addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi); + bf0[4] = bf1[4]; + bf0[5] = + half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[6] = + half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit); + bf0[7] = bf1[7]; + addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = + half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit); + bf0[19] = + half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit); + bf0[20] = + half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit); + bf0[22] = bf1[22]; + bf0[23] = bf1[23]; + bf0[24] = bf1[24]; + bf0[25] = bf1[25]; + bf0[26] = + half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit); + bf0[28] = + half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit); + bf0[29] = + half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit); + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 7 + addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi); + bf1[8] = bf0[8]; + bf1[9] = 
bf0[9]; + bf1[10] = + half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[11] = + half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[12] = + half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit); + bf1[13] = + half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi); + + // stage 8 + addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi); + bf0[16] = bf1[16]; + bf0[17] = bf1[17]; + bf0[18] = bf1[18]; + bf0[19] = bf1[19]; + bf0[20] = + half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[21] = + half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[22] = + half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[23] = + half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[24] = + half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit); + bf0[25] = + half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit); + bf0[26] = + half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit); + bf0[27] = + half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit); + bf0[28] = bf1[28]; + bf0[29] = bf1[29]; + bf0[30] = bf1[30]; + bf0[31] = bf1[31]; + + // stage 9 + addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, 
&clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi); + addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} + +void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const int32_t *src = cast_to_int32(input); + switch (tx_type) { + case IDTX: + case H_DCT: + case H_ADST: + case H_FLIPADST: + case V_DCT: + case V_ADST: + case V_FLIPADST: + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, + txfm_param->tx_size, + txfm_param->eob, bd); + break; + default: + av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); + break; + } +} +void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + int eob = txfm_param->eob; + int bd = txfm_param->bd; + int lossless = txfm_param->lossless; + const int32_t *src = cast_to_int32(input); + const TX_TYPE tx_type = txfm_param->tx_type; + if (lossless) { + assert(tx_type == DCT_DCT); + av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); + return; + } + av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); +} +static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + for (int i = 0; i < 32; i += 16) { + out[i] = _mm_slli_epi32(in[i], 2); + out[i + 1] = _mm_slli_epi32(in[i + 1], 2); + out[i + 2] = _mm_slli_epi32(in[i + 2], 2); + out[i + 3] = _mm_slli_epi32(in[i + 3], 2); + out[i + 4] = _mm_slli_epi32(in[i + 4], 2); + out[i + 5] = _mm_slli_epi32(in[i + 5], 2); + out[i + 6] = _mm_slli_epi32(in[i + 6], 2); + out[i + 7] = _mm_slli_epi32(in[i + 7], 2); + out[i + 8] = _mm_slli_epi32(in[i + 8], 2); + out[i + 9] = _mm_slli_epi32(in[i + 9], 2); + out[i + 10] = _mm_slli_epi32(in[i + 10], 2); + out[i + 11] = _mm_slli_epi32(in[i + 11], 2); + out[i + 12] = _mm_slli_epi32(in[i + 12], 2); + out[i + 13] = _mm_slli_epi32(in[i + 13], 2); + out[i + 14] = _mm_slli_epi32(in[i + 14], 2); + out[i + 15] = _mm_slli_epi32(in[i + 15], 2); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1))); + const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1); + round_shift_8x8(out, out_shift); + round_shift_8x8(out + 16, out_shift); + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32); + } +} +static const transform_1d_sse4_1 + highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { + { + { idct4x4_sse4_1, NULL, NULL, NULL }, + { iadst4x4_sse4_1, NULL, NULL, NULL }, + { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL }, + }, + { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL }, + { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL }, + { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } }, + { + { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1, + 
NULL }, + { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1, + NULL }, + { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL }, + }, + { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1, + idct32x32_sse4_1 }, + { NULL, NULL, NULL, NULL }, + { iidentity32_sse4_1, NULL, NULL, NULL } }, + { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1, + idct64x64_sse4_1 }, + { NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL } } + }; +static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = input_stride >> 2; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (buf_size_h_div8 << 1); ++i) { + __m128i buf0[16]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < buf_size_w_div4; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + + for (int j = 0; j < buf_size_w_div4; ++j) { + _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0]; + _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1]; + _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2]; + _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } +} +static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int buf_size_w_div8 = input_stride >> 2; + const int row_max = AOMMIN(32, txfm_size_row); + const int 
buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[16]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1( + buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} +static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[64 * 4]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[32]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < (input_stride >> 2); ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0, + 
NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + for (int j = 0; j < (input_stride >> 2); ++j) { + _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0]; + _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1]; + _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2]; + _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3]; + } + } + for (int i = 0; i < (input_stride >> 2); i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, 0, txfm_size_row, + bd); + } + } +} +static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64 * 16]; + int eobx, eoby; + get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; + const int input_stride = AOMMIN(32, txfm_size_col); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + + const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; + const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) { + __m128i buf0[64]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1( + buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + 
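// Each transposed 4x4 tile lands in buf1 in column order; once this loop
+ // finishes, every run of txfm_size_row registers feeds col_txfm directly.
+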
} + } + } + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} + +static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[8]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1]; + const int input_stride = AOMMIN(32, txfm_size_col); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[8]; + const int32_t *input_row = input; + __m128i *buf0_cur = buf0; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row); + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0, + NewInvSqrt2); + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + row_txfm(buf0 + 4, buf0 + 4, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, + -shift[0]); + + if (lr_flip) { + TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6], + buf1[7]); + } else { + TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2], + buf1[3]); + + TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6], + buf1[7]); + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[8]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[8]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + TRANSPOSE_4X4(buf0[0], buf0[2], buf0[4], buf0[6], buf1[0], buf1[1], buf1[2], + buf1[3]); 
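+ // Transpose the right 4x4 tile (columns 4..7): the odd-indexed buf0
+ // registers hold the right half of each 8-wide coefficient row.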
+ TRANSPOSE_4X4(buf0[1], buf0[3], buf0[5], buf0[7], buf1[4], buf1[5], buf1[6], + buf1[7]); + + av1_round_shift_rect_array_32_sse4_1(buf1, buf0, txfm_size_col, 0, + NewInvSqrt2); + row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < 2; i++) { + col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + // write to buffer + highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip, + txfm_size_row, bd); +} + +static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_h_div8 = txfm_size_row >> 2; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2]; + const int input_stride = AOMMIN(32, txfm_size_col); + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[16]; + const int32_t *input_row = input; + __m128i *buf0_cur = buf0; + load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row); + for (int i = 0; i < (txfm_size_row >> 2); i++) { + row_txfm(buf0 + (i << 2), buf0 + (i << 2), + av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + } + + if (lr_flip) { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2], + buf1[4 * j + 3]); + } + } else { + for (int j = 0; j < buf_size_h_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2], + buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1], + buf1[4 * j + 2], buf1[4 * j + 3]); + } + } + + // 2nd stage: column transform + col_txfm(buf1, buf1, av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]); + + // write to buffer + highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row, + bd); +} + +static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i buf1[16]; + const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int buf_size_w_div8 = txfm_size_col >> 2; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + assert(col_txfm != NULL); + assert(row_txfm != NULL); + int ud_flip, lr_flip; + 
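// FLIPADST row/column variants mirror the residual; get_flip_cfg() reports
+ // which axes (ud_flip / lr_flip) need flipping at write-out.
+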
get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // 1st stage: column transform + __m128i buf0[16]; + const int32_t *input_row = input; + load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col); + + for (int j = 0; j < buf_size_w_div8; j++) { + TRANSPOSE_4X4(buf0[j], buf0[j + 4], buf0[j + 8], buf0[j + 12], buf1[4 * j], + buf1[4 * j + 1], buf1[4 * j + 2], buf1[4 * j + 3]); + } + row_txfm(buf1, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *buf1_ptr; + if (lr_flip) { + flip_buf_sse2(buf0, buf1, txfm_size_col); + buf1_ptr = buf1; + } else { + buf1_ptr = buf0; + } + + // 2nd stage: column transform + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1_ptr + i * txfm_size_row, buf1_ptr + i * txfm_size_row, + av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + } + av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]); + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } +} + +void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + switch (tx_type) { + case DCT_DCT: + case ADST_DCT: + case DCT_ADST: + case ADST_ADST: + case FLIPADST_DCT: + case DCT_FLIPADST: + case FLIPADST_FLIPADST: + case ADST_FLIPADST: + case FLIPADST_ADST: + highbd_inv_txfm2d_add_no_identity_sse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + highbd_inv_txfm2d_add_h_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + highbd_inv_txfm2d_add_v_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case IDTX: + highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; + default: assert(0); break; + } +} + +void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = txfm_param->eob; + highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, + const TxfmParam *txfm_param) { + int bd = txfm_param->bd; + const TX_TYPE tx_type = txfm_param->tx_type; + const TX_SIZE tx_size = txfm_param->tx_size; + int eob = 
txfm_param->eob; + highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); +} + +void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, + int stride, const TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_8X8: + av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X8: + av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); + break; + case TX_8X4: + av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X4: + av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_16X4: + av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); + break; + case TX_4X16: + av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); + break; + default: + av1_highbd_inv_txfm2d_add_universe_sse4_1( + input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob, + txfm_param->bd); + break; + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c b/libs/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c new file mode 100644 index 000000000..70f1ec709 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_jnt_convolve_avx2.c @@ -0,0 +1,859 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_dist_wtd_convolve_2d_copy_avx2(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_qn;
+  (void)subpel_y_qn;
+
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const __m128i left_shift = _mm_cvtsi32_si128(bits);
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi32(w0);
+  const __m256i wt1 = _mm256_set1_epi32(w1);
+  const __m256i zero = _mm256_setzero_si256();
+  int i, j;
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m256i offset_const = _mm256_set1_epi32(offset);
+  const __m256i offset_const_16b = _mm256_set1_epi16(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+  const __m256i clip_pixel_to_bd =
+      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ?
4095 : 255)); + + assert(bits <= 4); + + if (!(w % 16)) { + for (i = 0; i < h; i += 1) { + for (j = 0; j < w; j += 16) { + const __m256i src_16bit = + _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j])); + + const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); + + if (do_average) { + const __m256i data_0 = + _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero); + + const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b_lo, offset_const); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); + const __m256i res_unsigned_hi = + _mm256_add_epi32(res_32b_hi, offset_const); + + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = highbd_convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result_hi = highbd_convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, offset_const_16b); + + _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]), + res_unsigned_16b); + } + } + } + } else if (!(w % 4)) { + for (i = 0; i < h; i += 2) { + for (j = 0; j < w; j += 8) { + const __m128i src_row_0 = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j])); + const __m128i src_row_1 = + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride])); + // since not all compilers yet support _mm256_set_m128i() + const __m256i src_10 = _mm256_insertf128_si256( + _mm256_castsi128_si256(src_row_0), src_row_1, 1); + + const __m256i res = _mm256_sll_epi16(src_10, left_shift); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i res_32b = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b, offset_const); + + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, 
offset_const_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero); + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_32b_lo, offset_const); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); + const __m256i res_unsigned_hi = + _mm256_add_epi32(res_32b_hi, offset_const); + + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + const __m256i res_unsigned_16b = + _mm256_adds_epu16(res, offset_const_16b); + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } + } +} + +void av1_highbd_dist_wtd_convolve_2d_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. 
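+ // For example, with bd == 12, FILTER_BITS == 7 and round_0 == 5 (the
+ // usual 12-bit configuration), 12 + 7 + 2 - 5 == 16, so the horizontal
+ // stage just fits.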
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[8], coeffs_y[4], coeffs_x[4]; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + + const __m256i round_const_x = _mm256_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const __m256i round_const_y = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < im_h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = _mm256_set1_epi16(0); + if (i + 1 < im_h) + row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + } + + /* Vertical filter */ + { + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] = _mm256_unpacklo_epi16(s2, s3); + s[2] = _mm256_unpacklo_epi16(s4, s5); + + s[4] = _mm256_unpackhi_epi16(s0, s1); + s[5] = _mm256_unpackhi_epi16(s2, s3); + s[6] = _mm256_unpackhi_epi16(s4, s5); + + for (i = 
0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + const __m256i res_a = convolve(s, coeffs_y); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_y), round_shift_y); + + const __m256i res_unsigned_lo = + _mm256_add_epi32(res_a_round, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_b = convolve(s + 4, coeffs_y); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_y), round_shift_y); + + __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * 
dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_x_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + (void)filter_params_y; + (void)subpel_y_qn; + + int i, j; + __m256i s[4], coeffs_x[4]; + + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i zero = _mm256_setzero_si256(); + + const __m256i round_const_x = + _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + + assert(bits >= 0); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm256_sll_epi32(res_even, round_shift_bits); + res_odd = _mm256_sll_epi32(res_odd, round_shift_bits); + + __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd); + + __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = highbd_comp_avg( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd); + __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + 
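// highbd_comp_avg() blends the accumulated reference with the new result:
+ // roughly (ref * w0 + res * w1) >> DIST_PRECISION_BITS when distance
+ // weighting is enabled, otherwise a plain (ref + res) >> 1.
+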
const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = highbd_convolve_rounding( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result_hi = highbd_convolve_rounding( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_highbd_dist_wtd_convolve_y_avx2( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + const int bits = FILTER_BITS - conv_params->round_0; + (void)filter_params_x; + (void)subpel_x_qn; + + assert(bits >= 0); + int i, j; + __m256i s[8], coeffs_y[4]; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi32(w0); + const __m256i wt1 = _mm256_set1_epi32(w1); + const __m256i round_const_y = + _mm256_set1_epi32(((1 << conv_params->round_1) >> 1)); + const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1); + const __m256i clip_pixel_to_bd = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m256i src6; + __m256i s01 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + __m256i s12 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + __m256i s23 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + __m256i s34 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + __m256i s45 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + __m256i s56 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi16(s01, s12); + s[1] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpacklo_epi16(s45, s56); + + s[4] = _mm256_unpackhi_epi16(s01, s12); + s[5] = _mm256_unpackhi_epi16(s23, s34); + s[6] = _mm256_unpackhi_epi16(s45, s56); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + const __m256i s67 = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + + const __m256i s78 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi16(s67, s78); + s[7] = _mm256_unpackhi_epi16(s67, s78); + + const __m256i res_a = convolve(s, coeffs_y); + + __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits); + res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a_round, round_const_y), round_shift_y); + + __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); + + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result = highbd_convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result, round_result); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = 
_mm256_extracti128_si256(res_clip, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits); + res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b_round, round_const_y), round_shift_y); + + __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const); + + if (do_average) { + const __m256i data_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]))); + const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride]))); + const __m256i data_01 = + _mm256_permute2x128_si256(data_0, data_1, 0x20); + + const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); + const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); + + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + const __m256i round_result_hi = + highbd_convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_16b = + _mm256_packus_epi32(round_result_lo, round_result_hi); + const __m256i res_clip = + _mm256_min_epi16(res_16b, clip_pixel_to_bd); + + const __m128i res_0 = _mm256_castsi256_si128(res_clip); + const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1); + } else { + __m256i res_16b = + _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_16b); + const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1); + + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c b/libs/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c new file mode 100644 index 000000000..f033a6f94 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_jnt_convolve_sse4.c @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+
+void av1_highbd_dist_wtd_convolve_y_sse4_1(
+    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int subpel_y_qn, ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int fo_vert = filter_params_y->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_vert * src_stride;
+  const int bits = FILTER_BITS - conv_params->round_0;
+  (void)filter_params_x;
+  (void)subpel_x_qn;
+
+  assert(bits >= 0);
+  int i, j;
+  const int do_average = conv_params->do_average;
+  const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+  const __m128i round_const_y =
+      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+  const int offset_0 =
+      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+  const __m128i offset_const = _mm_set1_epi32(offset);
+  const int rounding_shift =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+  const __m128i clip_pixel_to_bd =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+  __m128i s[16], coeffs_y[4];
+
+  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {
+    const uint16_t *data = &src_ptr[j];
+    /* Vertical filter */
+    {
+      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+      s[0] = _mm_unpacklo_epi16(s0, s1);
+      s[1] = _mm_unpacklo_epi16(s2, s3);
+      s[2] = _mm_unpacklo_epi16(s4, s5);
+
+      s[4] = _mm_unpackhi_epi16(s0, s1);
+      s[5] = _mm_unpackhi_epi16(s2, s3);
+      s[6] = _mm_unpackhi_epi16(s4, s5);
+
+      s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+      s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+      s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+      s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+      s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+      s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+      for (i = 0; i < h; i += 2) {
+        data = &src_ptr[i * src_stride + j];
+
+        __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+        __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+        s[3] = _mm_unpacklo_epi16(s6, s7);
+        s[7] = _mm_unpackhi_epi16(s6, s7);
+
+        s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+        s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+        const __m128i res_a0 = convolve(s, coeffs_y);
+        __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits);
+        res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y),
+                                     round_shift_y);
+
+
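// s[0..7] holds the interleaved taps for output row i (unpacklo covers
+ // pixels 0..3, unpackhi pixels 4..7); s[8..15] is the same window
+ // advanced one source row, so each loop iteration emits two output rows.
+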
const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits); + res_a_round1 = _mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y), + round_shift_y); + + __m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const); + __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const); + + if (w - j < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadl_epi64( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i comp_avg_res_0 = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_1 = + highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const, + &rounding_const, rounding_shift); + + const __m128i res_16b_0 = + _mm_packus_epi32(round_result_0, round_result_0); + const __m128i res_clip_0 = + _mm_min_epi16(res_16b_0, clip_pixel_to_bd); + const __m128i res_16b_1 = + _mm_packus_epi32(round_result_1, round_result_1); + const __m128i res_clip_1 = + _mm_min_epi16(res_16b_1, clip_pixel_to_bd); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), + res_clip_0); + _mm_storel_epi64( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_clip_1); + + } else { + __m128i res_16b_0 = + _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0); + + __m128i res_16b_1 = + _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16b_1); + } + } else { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits); + res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b_round0, round_const_y), round_shift_y); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits); + res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b_round1, round_const_y), round_shift_y); + + __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const); + __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const); + + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_1 = _mm_loadu_si128( + (__m128i *)(&dst[i * dst_stride + j + dst_stride])); + const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero); + + const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero); + const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero); + + const __m128i comp_avg_res_lo_0 = + highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0, + &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_lo_1 = + highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1, + &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi_0 = + highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0, + &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i 
comp_avg_res_hi_1 = + highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1, + &wt0, &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_lo_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi_0 = + highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const, + &rounding_const, rounding_shift); + const __m128i round_result_hi_1 = + highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const, + &rounding_const, rounding_shift); + + const __m128i res_16b_0 = + _mm_packus_epi32(round_result_lo_0, round_result_hi_0); + const __m128i res_clip_0 = + _mm_min_epi16(res_16b_0, clip_pixel_to_bd); + + const __m128i res_16b_1 = + _mm_packus_epi32(round_result_lo_1, round_result_hi_1); + const __m128i res_clip_1 = + _mm_min_epi16(res_16b_1, clip_pixel_to_bd); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), + res_clip_0); + _mm_store_si128( + (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), + res_clip_1); + } else { + __m128i res_16bit0 = + _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0); + __m128i res_16bit1 = + _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_16bit1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } +} + +void av1_highbd_dist_wtd_convolve_x_sse4_1( + const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + (void)filter_params_y; + (void)subpel_y_qn; + + int i, j; + __m128i s[4], coeffs_x[4]; + + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + const __m128i zero = _mm_setzero_si128(); + + const __m128i round_const_x = + _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi32(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1); + const __m128i clip_pixel_to_bd = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + + assert(bits >= 0); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); + + res_even = _mm_sll_epi32(res_even, round_shift_bits); + res_odd = _mm_sll_epi32(res_odd, round_shift_bits); + + __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const); + if (w - j < 8) { + if (do_average) { + const __m128i data_0 = + _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); + + const __m128i comp_avg_res = highbd_comp_avg_sse4_1( + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); + const __m128i round_result = highbd_convolve_rounding_sse2( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = _mm_packus_epi32(round_result, round_result); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b); + } + } else { + __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd); + __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const); + if (do_average) { + const __m128i data_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); + const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); + + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); + + const __m128i round_result_lo = highbd_convolve_rounding_sse2( + &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + const __m128i round_result_hi = highbd_convolve_rounding_sse2( + &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_16b = + _mm_packus_epi32(round_result_lo, round_result_hi); + const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd); + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip); + } else { + __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b); + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h b/libs/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h new file mode 100644 index 000000000..5734810f5 --- /dev/null +++ 
b/libs/libaom/src/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+  do {                                                \
+    __m128i u0, u1, u2, u3;                           \
+    u0 = _mm_unpacklo_epi32(x0, x1);                  \
+    u1 = _mm_unpackhi_epi32(x0, x1);                  \
+    u2 = _mm_unpacklo_epi32(x2, x3);                  \
+    u3 = _mm_unpackhi_epi32(x2, x3);                  \
+    y0 = _mm_unpacklo_epi64(u0, u2);                  \
+    y1 = _mm_unpackhi_epi64(u0, u2);                  \
+    y2 = _mm_unpacklo_epi64(u1, u3);                  \
+    y3 = _mm_unpackhi_epi64(u1, u3);                  \
+  } while (0)
+
+static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
+  TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
+  TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
+  TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
+  TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
+                out[15]);
+}
+
+static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
+  // Upper left 8x8
+  TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
+  TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
+                out[28]);
+  TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
+                out[13]);
+  TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
+                out[29]);
+
+  // Upper right 8x8
+  TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
+                out[44]);
+  TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
+                out[60]);
+  TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
+                out[45]);
+  TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
+                out[61]);
+
+  // Lower left 8x8
+  TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
+                out[14]);
+  TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
+                out[30]);
+  TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
+                out[15]);
+  TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
+                out[31]);
+  // Lower right 8x8
+  TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
+                out[46]);
+  TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
+                out[62]);
+  TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
+                out[47]);
+  TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
+                out[63]);
+}
+
+static INLINE void transpose_8nx8n(const __m128i *input, __m128i *output,
+                                   const int width, const int height) {
+  const int numcol = height >> 2;
+  const int numrow = width >> 2;
+  for (int j = 0; j < numrow; j++) {
+    for (int i = 0; i < numcol; i++) {
+      TRANSPOSE_4X4(input[i * width + j + (numrow * 0)],
+                    input[i * width + j + (numrow * 1)],
+                    input[i * width + j + (numrow * 2)],
+                    input[i * width + j + (numrow * 3)],
+                    output[j * height + i + (numcol *
0)], + output[j * height + i + (numcol * 1)], + output[j * height + i + (numcol * 2)], + output[j * height + i + (numcol * 3)]); + } + } +} + +// Note: +// rounding = 1 << (bit - 1) +static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0, + const __m128i *w1, const __m128i *n1, + const __m128i *rounding, int bit) { + __m128i x, y; + + x = _mm_mullo_epi32(*w0, *n0); + y = _mm_mullo_epi32(*w1, *n1); + x = _mm_add_epi32(x, y); + x = _mm_add_epi32(x, *rounding); + x = _mm_srai_epi32(x, bit); + return x; +} + +static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0, + const __m128i *rounding, int bit) { + __m128i x; + + x = _mm_mullo_epi32(*w0, *n0); + x = _mm_add_epi32(x, *rounding); + x = _mm_srai_epi32(x, bit); + return x; +} + +typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, + int do_cols, int bd, int out_shift); + +typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit, + const int num_cols); + +void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, + uint8_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd); + +#endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_ diff --git a/libs/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c b/libs/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c new file mode 100644 index 000000000..60a819308 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_warp_plane_sse4.c @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2,  4,  6, 8, 10,
+                                                       12, 14, 1, 3, 5, 7,
+                                                       9, 11, 13, 15 };
+
+static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
+  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
+  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9,
+                                                         10, 11, 8, 9, 10, 11,
+                                                         8, 9, 10, 11 };
+static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13,
+                                                         14, 15, 12, 13, 14, 15,
+                                                         12, 13, 14, 15 };
+
+static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
+                                                          __m128i *coeff) {
+  // Filter even-index pixels
+  const __m128i tmp_0 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_2 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_4 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_6 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+  // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
+  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+  // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
+  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+  // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
+  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+  // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
+  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+  // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
+  coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
+  // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
+  coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
+  // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
+  coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+  // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
+  coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+  // Filter odd-index pixels
+  const __m128i tmp_1 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_3 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_5 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+  const __m128i tmp_7 =
+      _mm_loadu_si128((__m128i *)(av1_warped_filter +
+                                  ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+  coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
+  coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
+  coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
+  coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
+}
+
+static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
+    int sx, __m128i *coeff) {
+  // Filter coeff
+  const __m128i tmp_0 = _mm_loadu_si128(
+      (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+  coeff[0] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
+  coeff[2] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
+  coeff[4] = _mm_shuffle_epi8(
+      tmp_0, _mm_loadu_si128((__m128i
*)highbd_shuffle_alpha0_mask2)); + coeff[6] = _mm_shuffle_epi8( + tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3)); + + coeff[1] = coeff[0]; + coeff[3] = coeff[2]; + coeff[5] = coeff[4]; + coeff[7] = coeff[6]; +} + +static INLINE void highbd_filter_src_pixels( + const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff, + const int offset_bits_horiz, const int reduce_bits_horiz, int k) { + const __m128i src_1 = *src; + const __m128i src2_1 = *src2; + + const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + + const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]); + + __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); + + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]); + + __m128i res_odd = + _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), + _mm_cvtsi32_si128(reduce_bits_horiz)); + + // Combine results into one register. + // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7 + // as this order helps with the vertical filter. 
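+ // (res_even carries columns 0 2 4 6 and res_odd carries 1 3 5 7, so the
+ // pack below produces exactly that layout and the vertical pass needs no
+ // shuffle back.)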
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd); +} + +static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2, + __m128i *tmp, int sx, int alpha, int k, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff); + highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); +} + +static INLINE void highbd_warp_horizontal_filter_alpha0_beta0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + (void)alpha; + int k; + + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter_alpha0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)alpha; + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter_beta0( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + int k; + __m128i coeff[8]; + highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz, + reduce_bits_horiz, k); + } +} + +static INLINE void highbd_warp_horizontal_filter( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + 
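+      // src holds the eight pixels starting at ix4 - 7 and src2 the eight + // starting at ix4 + 1; highbd_horiz_filter() derives every shifted 8-pixel + // window from this single pair of loads via _mm_alignr_epi8(src2, src, + // 2 * n) instead of reloading for each tap offset.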
highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } +} + +static INLINE void highbd_prepare_warp_horizontal_filter( + const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + if (alpha == 0 && beta == 0) + highbd_warp_horizontal_filter_alpha0_beta0( + ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + + else if (alpha == 0 && beta != 0) + highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + + else if (alpha != 0 && beta == 0) + highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + else + highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); +} + +void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m128i tmp[15]; + int i, j, k; + const int reduce_bits_horiz = + conv_params->round_0 + + AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + assert(!(bd == 12 && reduce_bits_horiz < 5)); + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); + const __m128i reduce_bits_vert_const = + _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const __m128i res_sub_const = + _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits); + __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi32(w0); + const __m128i wt1 = _mm_set1_epi32(w1); + + /* Note: For this code to work, the left/right frame borders need to be + extended by at least 13 pixels each. By the time we get here, other + code will have set up this border, but we allow an explicit check + for debugging purposes. 
+ */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. + if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = + _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m128i src2 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1)); + + const __m128i src_01 = _mm_shuffle_epi8( + src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); + const __m128i src2_01 = _mm_shuffle_epi8( + src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes)); + + __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01); + __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01); + + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left); + src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left); + } + + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right); + src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right); + } + + const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi); + const 
__m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi); + + highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k, + offset_bits_horiz, reduce_bits_horiz); + } + } else { + highbd_prepare_warp_horizontal_filter( + ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + } + + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + // Filter even-index pixels + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10); + const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10); + const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14); + const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i tmp_1 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11); + const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11); + const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15); + const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3); + 
const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j]; + res_lo = _mm_add_epi32(res_lo, res_add_const); + res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const), + reduce_bits_vert_shift); + + if (conv_params->do_average) { + __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p)); + + if (conv_params->use_dist_wtd_comp_avg) { + res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), + _mm_mullo_epi32(res_lo, wt1)); + res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS); + } else { + res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1); + } + + __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const); + res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const), + round_bits_shift); + + __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo); + res16_lo = _mm_min_epi16(res16_lo, clip_pixel); + _mm_storel_epi64(dst16, res16_lo); + } else { + res_lo = _mm_packus_epi32(res_lo, res_lo); + _mm_storel_epi64(p, res_lo); + } + if (p_width > 4) { + __m128i *const p4 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + + res_hi = _mm_add_epi32(res_hi, res_add_const); + res_hi = + _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const), + reduce_bits_vert_shift); + if (conv_params->do_average) { + __m128i *const dst16_4 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4)); + + if (conv_params->use_dist_wtd_comp_avg) { + res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0), + _mm_mullo_epi32(res_hi, wt1)); + res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS); + } else { + res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1); + } + + __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const); + res32_hi = _mm_sra_epi32( + _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift); + __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi); + res16_hi = _mm_min_epi16(res16_hi, clip_pixel); + _mm_storel_epi64(dst16_4, res16_hi); + } else { + res_hi = _mm_packus_epi32(res_hi, res_hi); + _mm_storel_epi64(p4, res_hi); + } + } + } else { + // Round and pack into 16 bits + const __m128i round_const = + _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), reduce_bits_vert); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), reduce_bits_vert); + + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + // Clamp res_16bit to the range [0, 2^bd - 1] + const __m128i max_val = _mm_set1_epi16((1 << bd) - 1); + const __m128i zero = _mm_setzero_si128(); + res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. 
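+        // _mm_storel_epi64 writes only the low 64 bits of the register + // (exactly four uint16_t pixels), while _mm_storeu_si128 writes all eight.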
+ if (p_width == 4) { + _mm_storel_epi64(p, res_16bit); + } else { + _mm_storeu_si128(p, res_16bit); + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_wiener_convolve_avx2.c b/libs/libaom/src/av1/common/x86/highbd_wiener_convolve_avx2.c new file mode 100644 index 000000000..0c8a8505b --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_wiener_convolve_avx2.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// 128-bit xmmwords are written as [ ... ] with the MSB on the left. +// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB +// on the left. +// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be +// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ]. +void av1_highbd_wiener_convolve_add_src_avx2( + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const ConvolveParams *conv_params, int bd) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); + + DECLARE_ALIGNED(32, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 1; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero_128 = _mm_setzero_si128(); + const __m256i zero_256 = _mm256_setzero_si256(); + + // Add an offset to account for the "add_src" part of the convolve function. 
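+  // Placing 1 << FILTER_BITS in 16-bit lane 3 (the center tap of the 7-tap + // Wiener kernel) folds the source pixel, scaled to filter precision, into + // the same multiply-accumulate pass as the filter itself, so no separate + // addition of the source is needed.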
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + + const __m256i clamp_low = zero_256; + + /* Horizontal filter */ + { + const __m256i clamp_high_ep = + _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + + // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] + const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset); + + // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = _mm256_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (int i = 0; i < intermediate_height; ++i) { + for (int j = 0; j < w; j += 16) { + const uint16_t *src_ij = src_ptr + i * src_stride + j; + + // Load 16-bit src data + const __m256i src_0 = yy_loadu_256(src_ij + 0); + const __m256i src_1 = yy_loadu_256(src_ij + 1); + const __m256i src_2 = yy_loadu_256(src_ij + 2); + const __m256i src_3 = yy_loadu_256(src_ij + 3); + const __m256i src_4 = yy_loadu_256(src_ij + 4); + const __m256i src_5 = yy_loadu_256(src_ij + 5); + const __m256i src_6 = yy_loadu_256(src_ij + 6); + const __m256i src_7 = yy_loadu_256(src_ij + 7); + + // Multiply src data by filter coeffs and sum pairs + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + // Calculate scalar product for even- and odd-indices separately, + // increasing to 32-bit precision + const __m256i res_even_sum = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6)); + const __m256i res_even = _mm256_srai_epi32( + _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0); + + const __m256i res_odd_sum = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7)); + const __m256i res_odd = _mm256_srai_epi32( + _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0); + + // Reduce to 16-bit precision and pack even- and odd-index results + // back into one register. 
The _mm256_packs_epi32 intrinsic returns + // a register with the pixels ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i res = _mm256_packs_epi32(res_even, res_odd); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep); + + // Store in a temporary array + yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); + } + } + } + + /* Vertical filter */ + { + const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1); + + // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset); + + // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j; + + // Load 16-bit data from the output of the horizontal filter in + // which the pixels are ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE); + const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE); + const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE); + const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE); + const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE); + const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE); + const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE); + const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE); + + // Filter the even-indices, increasing to 32-bit precision + const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); + const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3); + const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5); + const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7); + + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + + const __m256i res_even = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); + + // Filter the odd-indices, increasing to 32-bit 
precision + const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1); + const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3); + const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5); + const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7); + + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + const __m256i res_odd = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); + + // Pixels are currently in the following order: + // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ] + // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ] + // + // Rearrange the pixels into the following order: + // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ] + // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ] + const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); + const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); + + const __m256i res_lo_round = _mm256_srai_epi32( + _mm256_add_epi32(res_lo, round_const), conv_params->round_1); + const __m256i res_hi_round = _mm256_srai_epi32( + _mm256_add_epi32(res_hi, round_const), conv_params->round_1); + + // Reduce to 16-bit precision and pack into the correct order: + // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ] + const __m256i res_16bit = + _mm256_packs_epi32(res_lo_round, res_hi_round); + const __m256i res_16bit_clamped = _mm256_min_epi16( + _mm256_max_epi16(res_16bit, clamp_low), clamp_high); + + // Store in the dst array + yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/highbd_wiener_convolve_ssse3.c b/libs/libaom/src/av1/common/x86/highbd_wiener_convolve_ssse3.c new file mode 100644 index 000000000..818b1099c --- /dev/null +++ b/libs/libaom/src/av1/common/x86/highbd_wiener_convolve_ssse3.c @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" + +void av1_highbd_wiener_convolve_add_src_ssse3( + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, + const ConvolveParams *conv_params, int bd) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); + + DECLARE_ALIGNED(16, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 1; + int i, j; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero = _mm_setzero_si128(); + // Add an offset to account for the "add_src" part of the convolve function. + const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); + + /* Horizontal filter */ + { + const __m128i coeffs_x = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + conv_params->round_0); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + conv_params->round_0); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + const __m128i maxval = + _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1); + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = 
_mm_min_epi16(_mm_max_epi16(res, zero), maxval); + _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); + } + } + } + + /* Vertical filter */ + { + const __m128i coeffs_y = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 
7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), conv_params->round_1); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), conv_params->round_1); + + const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval); + + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, res_16bit); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/intra_edge_sse4.c b/libs/libaom/src/av1/common/x86/intra_edge_sse4.c new file mode 100644 index 000000000..fc69f41d7 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/intra_edge_sse4.c @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) { + if (!strength) return; + + DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = { + { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4 + { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5 + { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2 + }; + + DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = { + { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, + { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + }; + + // Extend the first and last samples to simplify the loop for the 5-tap case + p[-1] = p[0]; + __m128i last = _mm_set1_epi8(p[sz - 1]); + _mm_storeu_si128((__m128i *)&p[sz], last); + + // Adjust input pointer for filter support area + uint8_t *in = (strength == 3) ? p - 1 : p; + + // Avoid modifying first sample + uint8_t *out = p + 1; + int len = sz - 1; + + const int use_3tap_filter = (strength < 3); + + if (use_3tap_filter) { + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]); + __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); + __m128i in0 = _mm_lddqu_si128((__m128i *)in); + while (len > 0) { + int n_out = (len < 8) ? 
len : 8; + __m128i d0 = _mm_shuffle_epi8(in0, shuf0); + __m128i d1 = _mm_shuffle_epi8(in0, shuf1); + d0 = _mm_maddubs_epi16(d0, coef0); + d1 = _mm_maddubs_epi16(d1, coef0); + d0 = _mm_hadd_epi16(d0, d1); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srai_epi16(d0, 4); + d0 = _mm_packus_epi16(d0, d0); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi8(n_out); + __m128i mask = _mm_cmpgt_epi8(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storel_epi64((__m128i *)out, out0); + __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); + in0 = _mm_alignr_epi8(in1, in0, 8); + in += 8; + out += 8; + len -= n_out; + } + } else { // 5-tap filter + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i two = _mm_set1_epi8(2); + __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]); + __m128i shuf_b = _mm_add_epi8(shuf_a, two); + __m128i shuf_c = _mm_add_epi8(shuf_b, two); + __m128i shuf_d = _mm_add_epi8(shuf_c, two); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); + __m128i in0 = _mm_lddqu_si128((__m128i *)in); + while (len > 0) { + int n_out = (len < 8) ? len : 8; + __m128i d0 = _mm_shuffle_epi8(in0, shuf_a); + __m128i d1 = _mm_shuffle_epi8(in0, shuf_b); + __m128i d2 = _mm_shuffle_epi8(in0, shuf_c); + __m128i d3 = _mm_shuffle_epi8(in0, shuf_d); + d0 = _mm_maddubs_epi16(d0, coef0); + d1 = _mm_maddubs_epi16(d1, coef0); + d2 = _mm_maddubs_epi16(d2, coef0); + d3 = _mm_maddubs_epi16(d3, coef0); + d0 = _mm_hadd_epi16(d0, d1); + d2 = _mm_hadd_epi16(d2, d3); + d0 = _mm_hadd_epi16(d0, d2); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srai_epi16(d0, 4); + d0 = _mm_packus_epi16(d0, d0); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi8(n_out); + __m128i mask = _mm_cmpgt_epi8(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storel_epi64((__m128i *)out, out0); + __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); + in0 = _mm_alignr_epi8(in1, in0, 8); + in += 8; + out += 8; + len -= n_out; + } + } +} + +void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) { + if (!strength) return; + + DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = { + { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4 + { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5 + { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2 + }; + + DECLARE_ALIGNED(16, static const int16_t, + v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } }; + + // Extend the first and last samples to simplify the loop for the 5-tap case + p[-1] = p[0]; + __m128i last = _mm_set1_epi16(p[sz - 1]); + _mm_storeu_si128((__m128i *)&p[sz], last); + + // Adjust input pointer for filter support area + uint16_t *in = (strength == 3) ? p - 1 : p; + + // Avoid modifying first sample + uint16_t *out = p + 1; + int len = sz - 1; + + const int use_3tap_filter = (strength < 3); + + if (use_3tap_filter) { + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); + while (len > 0) { + int n_out = (len < 8) ? 
len : 8; + __m128i in1 = _mm_alignr_epi8(in8, in0, 2); + __m128i in2 = _mm_alignr_epi8(in8, in0, 4); + __m128i in02 = _mm_add_epi16(in0, in2); + __m128i d0 = _mm_unpacklo_epi16(in02, in1); + __m128i d1 = _mm_unpackhi_epi16(in02, in1); + d0 = _mm_mullo_epi16(d0, coef0); + d1 = _mm_mullo_epi16(d1, coef0); + d0 = _mm_hadd_epi16(d0, d1); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srli_epi16(d0, 4); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi16(n_out); + __m128i mask = _mm_cmpgt_epi16(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storeu_si128((__m128i *)out, out0); + in += 8; + in0 = in8; + in8 = _mm_lddqu_si128((__m128i *)&in[8]); + out += 8; + len -= n_out; + } + } else { // 5-tap filter + __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); + __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); + while (len > 0) { + int n_out = (len < 8) ? len : 8; + __m128i in1 = _mm_alignr_epi8(in8, in0, 2); + __m128i in2 = _mm_alignr_epi8(in8, in0, 4); + __m128i in3 = _mm_alignr_epi8(in8, in0, 6); + __m128i in4 = _mm_alignr_epi8(in8, in0, 8); + __m128i in04 = _mm_add_epi16(in0, in4); + __m128i in123 = _mm_add_epi16(in1, in2); + in123 = _mm_add_epi16(in123, in3); + __m128i d0 = _mm_unpacklo_epi16(in04, in123); + __m128i d1 = _mm_unpackhi_epi16(in04, in123); + d0 = _mm_mullo_epi16(d0, coef0); + d1 = _mm_mullo_epi16(d1, coef0); + d0 = _mm_hadd_epi16(d0, d1); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d0 = _mm_srli_epi16(d0, 4); + __m128i out0 = _mm_lddqu_si128((__m128i *)out); + __m128i n0 = _mm_set1_epi16(n_out); + __m128i mask = _mm_cmpgt_epi16(n0, iden); + out0 = _mm_blendv_epi8(out0, d0, mask); + _mm_storeu_si128((__m128i *)out, out0); + in += 8; + in0 = in8; + in8 = _mm_lddqu_si128((__m128i *)&in[8]); + out += 8; + len -= n_out; + } + } +} + +void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) { + // interpolate half-sample positions + assert(sz <= 24); + + DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = { + { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 } + }; + + DECLARE_ALIGNED( + 16, static const int8_t, + v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, + { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } }; + + // Extend first/last samples (upper-left p[-1], last p[sz-1]) + // to support 4-tap filter + p[-2] = p[-1]; + p[sz] = p[sz - 1]; + + uint8_t *in = &p[-2]; + uint8_t *out = &p[-2]; + + int n = sz + 1; // Input length including upper-left sample + + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); + + __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); + __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]); + __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]); + + while (n > 0) { + __m128i in8 = _mm_alignr_epi8(in16, in0, 8); + __m128i d0 = _mm_shuffle_epi8(in0, shuf0); + __m128i d1 = _mm_shuffle_epi8(in0, shuf1); + __m128i d2 = _mm_shuffle_epi8(in8, shuf0); + __m128i d3 = _mm_shuffle_epi8(in8, shuf1); + d0 = _mm_maddubs_epi16(d0, coef0); + d1 = _mm_maddubs_epi16(d1, coef0); + d2 = _mm_maddubs_epi16(d2, coef0); + d3 = _mm_maddubs_epi16(d3, coef0); + d0 = _mm_hadd_epi16(d0, d1); + d2 = _mm_hadd_epi16(d2, d3); + __m128i eight = _mm_set1_epi16(8); + d0 = _mm_add_epi16(d0, eight); + d2 = _mm_add_epi16(d2, eight); + d0 = _mm_srai_epi16(d0, 4); + d2 = 
_mm_srai_epi16(d2, 4); + d0 = _mm_packus_epi16(d0, d2); + __m128i in1 = _mm_alignr_epi8(in16, in0, 1); + __m128i out0 = _mm_unpacklo_epi8(in1, d0); + __m128i out1 = _mm_unpackhi_epi8(in1, d0); + _mm_storeu_si128((__m128i *)&out[0], out0); + _mm_storeu_si128((__m128i *)&out[16], out1); + in0 = in16; + in16 = _mm_setzero_si128(); + out += 32; + n -= 16; + } +} + +void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) { + // interpolate half-sample positions + assert(sz <= 24); + + DECLARE_ALIGNED(16, static const int16_t, + kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } }; + + // Extend first/last samples (upper-left p[-1], last p[sz-1]) + // to support 4-tap filter + p[-2] = p[-1]; + p[sz] = p[sz - 1]; + + uint16_t *in = &p[-2]; + uint16_t *out = in; + int n = sz + 1; + + __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); + __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); + __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); + __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]); + + while (n > 0) { + __m128i in1 = _mm_alignr_epi8(in8, in0, 2); + __m128i in2 = _mm_alignr_epi8(in8, in0, 4); + __m128i in3 = _mm_alignr_epi8(in8, in0, 6); + __m128i sum0 = _mm_add_epi16(in0, in3); + __m128i sum1 = _mm_add_epi16(in1, in2); + __m128i d0 = _mm_unpacklo_epi16(sum0, sum1); + __m128i d1 = _mm_unpackhi_epi16(sum0, sum1); + __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); + d0 = _mm_madd_epi16(d0, coef0); + d1 = _mm_madd_epi16(d1, coef0); + __m128i eight = _mm_set1_epi32(8); + d0 = _mm_add_epi32(d0, eight); + d1 = _mm_add_epi32(d1, eight); + d0 = _mm_srai_epi32(d0, 4); + d1 = _mm_srai_epi32(d1, 4); + d0 = _mm_packus_epi32(d0, d1); + __m128i max0 = _mm_set1_epi16((1 << bd) - 1); + d0 = _mm_min_epi16(d0, max0); + __m128i out0 = _mm_unpacklo_epi16(in1, d0); + __m128i out1 = _mm_unpackhi_epi16(in1, d0); + _mm_storeu_si128((__m128i *)&out[0], out0); + _mm_storeu_si128((__m128i *)&out[8], out1); + in0 = in8; + in8 = in16; + in16 = in24; + in24 = _mm_setzero_si128(); + out += 16; + n -= 8; + } +} diff --git a/libs/libaom/src/av1/common/x86/jnt_convolve_avx2.c b/libs/libaom/src/av1/common/x86/jnt_convolve_avx2.c new file mode 100644 index 000000000..6de61573e --- /dev/null +++ b/libs/libaom/src/av1/common/x86/jnt_convolve_avx2.c @@ -0,0 +1,917 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/convolve_common_intrin.h" +#include "aom_dsp/x86/convolve_sse4_1.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "av1/common/convolve.h" + +static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) { + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16((int16_t)w0); + const __m256i wt1 = _mm256_set1_epi16((int16_t)w1); + const __m256i wt = _mm256_unpacklo_epi16(wt0, wt1); + return wt; +} + +static INLINE __m256i load_line2_avx2(const void *a, const void *b) { + return _mm256_permute2x128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)a)), + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); +} + +void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + int i, j, is_horiz_4tap = 0; + const int bits = FILTER_BITS - conv_params->round_1; + const __m256i wt = unpack_weights_avx2(conv_params); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + + assert(bits >= 0); + assert(conv_params->round_0 > 0); + + const __m256i round_const = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + (void)filter_params_y; + (void)subpel_y_qn; + + __m256i filt[4], coeffs[4]; + + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_horiz_4tap = 1; + + // horz_filt as 4 tap + if (is_horiz_4tap) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + + __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + res = _mm256_slli_epi16(res, bits); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, 
rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + + __m256i res = convolve_lowbd_x(data, coeffs, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + + res = _mm256_slli_epi16(res, bits); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } + } +} + +void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + int i, j, is_vert_4tap = 0; + // +1 to compensate for dividing the filter coeffs by 2 + const int left_shift = FILTER_BITS - conv_params->round_0 + 1; + const __m256i round_const = + _mm256_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + const 
__m256i wt = unpack_weights_avx2(conv_params); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int offset_1 = (1 << (bd + FILTER_BITS - 2)); + const __m256i offset_const_1 = _mm256_set1_epi16(offset_1); + const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0)); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + const __m256i zero = _mm256_setzero_si256(); + __m256i coeffs[4], s[8]; + + assert((FILTER_BITS - conv_params->round_0) >= 0); + + prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); + + (void)conv_params; + (void)filter_params_x; + (void)subpel_x_qn; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_vert_4tap = 1; + + if (is_vert_4tap) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src4; + // Load lines a and b. Line a to lower 128, line b to upper 128 + { + __m256i src_ab[4]; + __m256i src_a[5]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 4; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src4 = src_a[4]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + + s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + } + + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 5) * src_stride + j]; + const __m256i src5 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); + + src4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); + + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); + + __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); + + res_lo = _mm256_add_epi16(res_lo, offset_const_1); + + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); + + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); + + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); + + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); + + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); + + const 
__m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); + + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + _mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + + s[3] = s[4]; + s[4] = 
s[5]; + } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + // Load lines a and b. Line a to lower 128, line b to upper 128 + { + __m256i src_ab[7]; + __m256i src_a[7]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 6; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src6 = src_a[6]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); + s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); + } + + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 7) * src_stride + j]; + const __m256i src7 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); + + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + + __m256i res_lo = convolve_lowbd(s, coeffs); + + res_lo = _mm256_add_epi16(res_lo, offset_const_1); + + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); + + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); + + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); + + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); + + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + __m256i res_hi = convolve_lowbd(s + 4, coeffs); + + res_hi = 
_mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + _mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} + +void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); + + int im_stride = 8; + int i, is_horiz_4tap = 0, is_vert_4tap = 0; + const __m256i wt = unpack_weights_avx2(conv_params); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS 
- conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + + assert(conv_params->round_0 > 0); + + const __m256i round_const_h = _mm256_set1_epi16( + ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); + + const __m256i round_const_v = _mm256_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + __m256i filt[4], coeffs_x[4], coeffs_y[4]; + + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0))) + is_horiz_4tap = 1; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0))) + is_vert_4tap = 1; + + if (is_horiz_4tap) { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + for (i = 0; i < im_h; i += 2) { + __m256i data = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); + if (i + 1 < im_h) + data = _mm256_inserti128_si256( + data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); + src_h += (src_stride << 1); + __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), + round_shift_h); + + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; + } + } else if (is_vert_4tap) { + int im_h = h + 3; + const int fo_vert = 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; + + /* Vertical filter */ + __m256i s[6]; + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + + s[0] = _mm256_unpacklo_epi16(s0, s1); + s[1] = _mm256_unpacklo_epi16(s2, s3); + + s[3] = _mm256_unpackhi_epi16(s0, s1); + s[4] = _mm256_unpackhi_epi16(s2, s3); + + for (i = 0; i < h; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s4 = + _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); + const __m256i s5 = + _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); + + s[2] = _mm256_unpacklo_epi16(s4, s5); + s[5] = _mm256_unpackhi_epi16(s4, s5); + + const __m256i res_a = convolve_4tap(s, coeffs_y + 1); + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + + if (w - j > 
4) { + const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); + + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, + &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } else { + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); + + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, + &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[3] = s[4]; + s[4] = s[5]; + } + } + } else { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; + + DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; + } + } +} + +void av1_dist_wtd_convolve_2d_copy_avx2( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + 
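+  // Worked example of the shift/offset constants set up below, assuming a
+  // typical 8-bit compound configuration (FILTER_BITS == 7,
+  // conv_params->round_0 == 3, conv_params->round_1 == 7):
+  //   bits           = 14 - 7 - 3 = 4   (left shift applied to each pixel)
+  //   offset_0       = 8 + 14 - 3 - 7 = 12
+  //   offset         = (1 << 12) + (1 << 11) = 6144
+  //   rounding_shift = 14 - 3 - 7 = 4,  rounding_const = 8
+  // i.e. the copy kernel stores (pixel << 4) + 6144 into the 16-bit
+  // CONV_BUF_TYPE buffer, matching the intermediate range of the filtered
+  // convolve paths so either can be averaged against it uniformly.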
(void)filter_params_x; + (void)filter_params_y; + (void)subpel_x_qn; + (void)subpel_y_qn; + + const int bits = + FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const __m256i wt = unpack_weights_avx2(conv_params); + const __m256i zero = _mm256_setzero_si256(); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m256i offset_const = _mm256_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); + int i, j; + + if (!(w % 16)) { + for (i = 0; i < h; i += 1) { + for (j = 0; j < w; j += 16) { + const __m256i src_16bit = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]))); + + const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + if (do_average) { + const __m256i data_ref_0 = + _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); + + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m256i res_0 = _mm256_permute4x64_epi64(res_8, 0xD8); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), + _mm256_castsi256_si128(res_0)); + } else { + _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]), + res_unsigned); + } + } + } + } else if (!(w % 4)) { + for (i = 0; i < h; i += 2) { + for (j = 0; j < w; j += 8) { + const __m128i src_row_0 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j])); + const __m128i src_row_1 = + _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride])); + // since not all compilers yet support _mm256_set_m128i() + const __m256i src_10 = _mm256_insertf128_si256( + _mm256_castsi128_si256(src_row_0), src_row_1, 1); + + const __m256i src_16bit = _mm256_unpacklo_epi8(src_10, zero); + + const __m256i res = _mm256_sll_epi16(src_16bit, left_shift); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = load_line2_avx2( + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), 
res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/jnt_convolve_sse2.c b/libs/libaom/src/av1/common/x86/jnt_convolve_sse2.c new file mode 100644 index 000000000..f8f640a11 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/jnt_convolve_sse2.c @@ -0,0 +1,615 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_horiz; + const int bits = FILTER_BITS - conv_params->round_1; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_0) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + __m128i coeffs[4]; + + (void)filter_params_y; + (void)subpel_y_qn; + + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs); + + if (w == 4) { + do { + const __m128i data = _mm_loadu_si128((__m128i *)src_ptr); + __m128i s[4]; + + s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1)); + s[1] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3)); + s[2] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5)); + s[3] = + _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7)); + const __m128i res_lo = convolve_lo_x(s, coeffs); + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_lo_shift); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i 
data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[0]), res_unsigned); + } + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + } while (--h); + } else { + assert(!(w % 8)); + int i = 0; + do { + int j = 0; + do { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + __m128i s[4]; + + // Filter even-index pixels + s[0] = data; + s[1] = _mm_srli_si128(data, 2); + s[2] = _mm_srli_si128(data, 4); + s[3] = _mm_srli_si128(data, 6); + const __m128i res_even = convolve_lo_x(s, coeffs); + + // Filter odd-index pixels + s[0] = _mm_srli_si128(data, 1); + s[1] = _mm_srli_si128(data, 3); + s[2] = _mm_srli_si128(data, 5); + s[3] = _mm_srli_si128(data, 7); + const __m128i res_odd = convolve_lo_x(s, coeffs); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + const __m128i res_lo_shift = _mm_sll_epi32(res_lo_round, left_shift); + const __m128i res_hi_shift = _mm_sll_epi32(res_hi_round, left_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + j += 8; + } while (j < w); + } while (++i < h); + } +} + +void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + const int bd = 8; + CONV_BUF_TYPE *dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *src_ptr = src - fo_vert * src_stride; + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i left_shift = _mm_cvtsi32_si128(bits); + const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset); + const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i 
offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + __m128i coeffs[4]; + + (void)filter_params_x; + (void)subpel_x_qn; + + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs); + + if (w == 4) { + __m128i s[8], src6, res, res_shift; + src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)), + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6); + + do { + s[6] = _mm_unpacklo_epi8( + src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride))); + src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6); + + res = convolve_lo_y(s + 0, coeffs); + res_shift = _mm_sll_epi32(res, left_shift); + res_shift = + _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); + + __m128i res_16b = _mm_packs_epi32(res_shift, res_shift); + __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + + } else { + _mm_store_si128((__m128i *)dst, res_unsigned); + } + + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + + res = convolve_lo_y(s + 1, coeffs); + res_shift = _mm_sll_epi32(res, left_shift); + res_shift = + _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift); + + res_16b = _mm_packs_epi32(res_shift, res_shift); + res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8); + + } else { + _mm_store_si128((__m128i *)dst, res_unsigned); + } + + src_ptr += src_stride; + dst += dst_stride; + dst0 += dst_stride0; + + 
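+      // Slide the 8-tap window down two rows: discard the two oldest
+      // unpacked row pairs and shift the remaining six so the next
+      // iteration only has to load two fresh rows.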
s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + h -= 2; + } while (h); + } else { + assert(!(w % 8)); + int j = 0; + do { + __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift; + const uint8_t *data = &src_ptr[j]; + + src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); + s[0] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride))); + s[1] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride))); + s[2] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride))); + s[3] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride))); + s[4] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)), + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride))); + s[5] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6); + + int i = 0; + do { + data = &src_ptr[i * src_stride + j]; + s[6] = _mm_unpacklo_epi8( + src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride))); + src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); + s[7] = _mm_unpacklo_epi8( + _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6); + + res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels + res_lo_shift = _mm_sll_epi32(res_lo, left_shift); + res_hi_shift = _mm_sll_epi32(res_hi, left_shift); + res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), + round_shift); + res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), + round_shift); + + __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + i++; + + res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels + res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels + res_lo_shift = _mm_sll_epi32(res_lo, left_shift); + res_hi_shift = _mm_sll_epi32(res_hi, left_shift); + res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const), + round_shift); + res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const), + round_shift); + res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift); + res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + 
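+          // _mm_packus_epi16 saturates the averaged 16-bit results to
+          // uint8; only the low 64 bits hold the 8 distinct pixels, so the
+          // 8-byte store below suffices.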
_mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + i++; + + s[0] = s[2]; + s[1] = s[3]; + s[2] = s[4]; + s[3] = s[5]; + s[4] = s[6]; + s[5] = s[7]; + } while (i < h); + j += 8; + } while (j < w); + } +} + +void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + __m128i temp_lo, temp_hi; + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + const __m128i src_lo = _mm_unpacklo_epi8(data, zero); + const __m128i src_hi = _mm_unpackhi_epi8(data, zero); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); + temp_lo = _mm_srli_si128(src_lo, 4); + temp_hi = _mm_slli_si128(src_hi, 12); + const __m128i src_2 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + temp_lo = _mm_srli_si128(src_lo, 8); + temp_hi = _mm_slli_si128(src_hi, 8); + const __m128i src_4 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + temp_lo 
= _mm_srli_si128(src_lo, 12); + temp_hi = _mm_slli_si128(src_hi, 4); + const __m128i src_6 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + temp_lo = _mm_srli_si128(src_lo, 2); + temp_hi = _mm_slli_si128(src_hi, 14); + const __m128i src_1 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + temp_lo = _mm_srli_si128(src_lo, 6); + temp_hi = _mm_slli_si128(src_hi, 10); + const __m128i src_3 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + temp_lo = _mm_srli_si128(src_lo, 10); + temp_hi = _mm_slli_si128(src_hi, 6); + const __m128i src_5 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + temp_lo = _mm_srli_si128(src_lo, 14); + temp_hi = _mm_slli_si128(src_hi, 2); + const __m128i src_7 = _mm_or_si128(temp_hi, temp_lo); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * 
im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + else + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/jnt_convolve_ssse3.c b/libs/libaom/src/av1/common/x86/jnt_convolve_ssse3.c new file mode 100644 index 000000000..f45e3b267 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/jnt_convolve_ssse3.c @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_dist_wtd_convolve_2d_ssse3( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params) { + CONV_BUF_TYPE *dst = conv_params->dst; + int dst_stride = conv_params->dst_stride; + const int bd = 8; + + DECLARE_ALIGNED(16, int16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); + int im_h = h + filter_params_y->taps - 1; + int im_stride = MAX_SB_SIZE; + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const int do_average = conv_params->do_average; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + const __m128i zero = _mm_setzero_si128(); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16(w0); + const __m128i wt1 = _mm_set1_epi16(w1); + const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); + + const int offset_0 = + bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); + const __m128i offset_const = _mm_set1_epi16(offset); + const int rounding_shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); + + /* Horizontal filter */ + { + const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); + + for (i = 0; i < im_h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + const __m128i src_lo = _mm_unpacklo_epi8(data, zero); + const __m128i src_hi = _mm_unpackhi_epi8(data, zero); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); + const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = + _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); + + // Filter odd-index pixels + const __m128i src_1 = 
_mm_alignr_epi8(src_hi, src_lo, 2); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); + } + } + } + + /* Vertical filter */ + { + const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + ((1 << conv_params->round_1) >> 1) - + (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); + const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const int16_t *data = &im_block[i * im_stride + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), + *(__m128i *)(data + 1 * im_stride)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), + *(__m128i *)(data + 3 * im_stride)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), + *(__m128i *)(data + 5 * im_stride)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), + *(__m128i *)(data + 7 * im_stride)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + 
_mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = + _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); + const __m128i res_hi_round = + _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); + + const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); + const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m128i data_ref_0 = + _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); + + const __m128i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m128i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m128i res_8 = _mm_packus_epi16(round_result, round_result); + + if (w > 4) + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); + else + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_8); + } else { + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/reconinter_avx2.c b/libs/libaom/src/av1/common/x86/reconinter_avx2.c new file mode 100644 index 000000000..a38bd8317 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/reconinter_avx2.c @@ -0,0 +1,620 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "av1/common/blockd.h" + +static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0, + const __m256i s1) { + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)); + return _mm256_abs_epi16( + _mm256_add_epi16(mask_base, _mm256_srli_epi16(diff, 4))); + // clamp(diff, 0, 64) can be skipped because diff is always in the range (38, 54) +} +void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w) { + const int mb = (mask_type == DIFFWTD_38_INV) ? 
AOM_BLEND_A64_MAX_ALPHA : 0; + const __m256i y_mask_base = _mm256_set1_epi16(38 - mb); + int i = 0; + if (4 == w) { + do { + const __m128i s0A = xx_loadl_32(src0); + const __m128i s0B = xx_loadl_32(src0 + src0_stride); + const __m128i s0C = xx_loadl_32(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_32(src0 + src0_stride * 3); + const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); + const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D); + const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD); + const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD); + + const __m128i s1A = xx_loadl_32(src1); + const __m128i s1B = xx_loadl_32(src1 + src1_stride); + const __m128i s1C = xx_loadl_32(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_32(src1 + src1_stride * 3); + const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); + const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D); + const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD); + const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD); + const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + const __m128i x_m8 = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)); + xx_storeu_128(mask, x_m8); + src0 += (src0_stride << 2); + src1 += (src1_stride << 2); + mask += 16; + i += 4; + } while (i < h); + } else if (8 == w) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C)); + const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D)); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s1AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C)); + const __m256i s1BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D)); + const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AC_w); + const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1BD_w); + const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD); + yy_storeu_256(mask, m8); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (16 == w) { + do { + const __m128i s0A = xx_load_128(src0); + const __m128i s0B = xx_load_128(src0 + src0_stride); + const __m128i s1A = xx_load_128(src1); + const __m128i s1B = xx_load_128(src1 + src1_stride); + const __m256i s0AL = _mm256_cvtepu8_epi16(s0A); + const __m256i s0BL = _mm256_cvtepu8_epi16(s0B); + const __m256i s1AL = _mm256_cvtepu8_epi16(s1A); + const __m256i s1BL = _mm256_cvtepu8_epi16(s1B); + + const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL); + const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL); + + const __m256i m8 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8); + yy_storeu_256(mask, m8); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else { + do { + int j = 0; + do { + const __m256i s0 = yy_loadu_256(src0 + j); + const __m256i s1 = yy_loadu_256(src1 + j); + const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0)); + const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); + const __m256i s0H = + 
_mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1)); + const __m256i s1H = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); + const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L); + const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H); + const __m256i m8 = + _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8); + yy_storeu_256(mask + j, m8); + j += 32; + } while (j < w); + src0 += src0_stride; + src1 += src1_stride; + mask += w; + i += 1; + } while (i < h); + } +} + +static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0, + const __m256i *data_src1, + const __m256i *round_const, + const __m256i *mask_base_16, + const __m256i *clip_diff, int round) { + const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); + const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); + const __m256i diff = _mm256_max_epu16(diffa, diffb); + const __m256i diff_round = + _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); + const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); + const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); + return diff_clamp; +} + +static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0, + const __m256i *data_src1, + const __m256i *round_const, + const __m256i *mask_base_16, + const __m256i *clip_diff, + int round) { + const __m256i diffa = _mm256_subs_epu16(*data_src0, *data_src1); + const __m256i diffb = _mm256_subs_epu16(*data_src1, *data_src0); + const __m256i diff = _mm256_max_epu16(diffa, diffb); + const __m256i diff_round = + _mm256_srli_epi16(_mm256_adds_epu16(diff, *round_const), round); + const __m256i diff_factor = _mm256_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m256i diff_mask = _mm256_adds_epi16(diff_factor, *mask_base_16); + const __m256i diff_clamp = _mm256_min_epi16(diff_mask, *clip_diff); + const __m256i diff_const_16 = _mm256_sub_epi16(*clip_diff, diff_clamp); + return diff_const_16; +} + +static INLINE void build_compound_diffwtd_mask_d16_avx2( + uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { + const int mask_base = 38; + const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); + const __m256i y38 = _mm256_set1_epi16(mask_base); + const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + int i = 0; + if (w == 4) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), + _mm_unpacklo_epi64(s0A, s0B)); + const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), + _mm_unpacklo_epi64(s1A, s1B)); + const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + xx_storeu_128(mask, + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 16; + i += 4; + } while (i < h); + } else if (w == 8) { + do { + const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); + const __m256i s0CD 
= + yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); + const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); + const __m256i s1CD = + yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); + const __m256i m16AB = + calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); + const __m256i m16CD = + calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (w == 16) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + src0_stride); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + src1_stride); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else if (w == 32) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 32; + i += 1; + } while (i < h); + } else if (w == 64) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 64; + i += 1; + } while (i < h); + } else { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s0E = yy_loadu_256(src0 + 64); + const __m256i s0F = yy_loadu_256(src0 + 80); + const __m256i s0G = yy_loadu_256(src0 + 96); + const __m256i s0H = yy_loadu_256(src0 + 112); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i s1E = yy_loadu_256(src1 + 64); + const __m256i s1F = yy_loadu_256(src1 + 80); + const __m256i s1G = yy_loadu_256(src1 + 96); + const __m256i 
s1H = yy_loadu_256(src1 + 112); + const __m256i m16A = + calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m16E = + calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); + const __m256i m16F = + calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); + const __m256i m16G = + calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); + const __m256i m16H = + calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); + const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); + yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 128; + i += 1; + } while (i < h); + } +} + +static INLINE void build_compound_diffwtd_mask_d16_inv_avx2( + uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { + const int mask_base = 38; + const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); + const __m256i y38 = _mm256_set1_epi16(mask_base); + const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + int i = 0; + if (w == 4) { + do { + const __m128i s0A = xx_loadl_64(src0); + const __m128i s0B = xx_loadl_64(src0 + src0_stride); + const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); + const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); + const __m128i s1A = xx_loadl_64(src1); + const __m128i s1B = xx_loadl_64(src1 + src1_stride); + const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); + const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); + const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), + _mm_unpacklo_epi64(s0A, s0B)); + const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), + _mm_unpacklo_epi64(s1A, s1B)); + const __m256i m16 = + calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); + xx_storeu_128(mask, + _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 16; + i += 4; + } while (i < h); + } else if (w == 8) { + do { + const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); + const __m256i s0CD = + yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); + const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); + const __m256i s1CD = + yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); + const __m256i m16AB = + calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); + const __m256i m16CD = + calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 2; + src1 += src1_stride << 2; + mask += 32; + i += 4; + } while (i < h); + } else if (w == 16) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + src0_stride); + const __m256i s1A = yy_loadu_256(src1); + 
const __m256i s1B = yy_loadu_256(src1 + src1_stride); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride << 1; + src1 += src1_stride << 1; + mask += 32; + i += 2; + } while (i < h); + } else if (w == 32) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m8 = _mm256_packus_epi16(m16A, m16B); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 32; + i += 1; + } while (i < h); + } else if (w == 64) { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 64; + i += 1; + } while (i < h); + } else { + do { + const __m256i s0A = yy_loadu_256(src0); + const __m256i s0B = yy_loadu_256(src0 + 16); + const __m256i s0C = yy_loadu_256(src0 + 32); + const __m256i s0D = yy_loadu_256(src0 + 48); + const __m256i s0E = yy_loadu_256(src0 + 64); + const __m256i s0F = yy_loadu_256(src0 + 80); + const __m256i s0G = yy_loadu_256(src0 + 96); + const __m256i s0H = yy_loadu_256(src0 + 112); + const __m256i s1A = yy_loadu_256(src1); + const __m256i s1B = yy_loadu_256(src1 + 16); + const __m256i s1C = yy_loadu_256(src1 + 32); + const __m256i s1D = yy_loadu_256(src1 + 48); + const __m256i s1E = yy_loadu_256(src1 + 64); + const __m256i s1F = yy_loadu_256(src1 + 80); + const __m256i s1G = yy_loadu_256(src1 + 96); + const __m256i s1H = yy_loadu_256(src1 + 112); + const __m256i m16A = + calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); + const __m256i m16B = + calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); + const __m256i m16C = + calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); + const __m256i m16D = + calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); + const __m256i m16E = + calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); + const __m256i m16F = + calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); + const __m256i m16G = + calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); + const __m256i m16H = + calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); + const __m256i m8AB = 
_mm256_packus_epi16(m16A, m16B); + const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); + const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); + const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); + yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); + yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); + yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); + yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); + src0 += src0_stride; + src1 += src1_stride; + mask += 128; + i += 1; + } while (i < h); + } +} + +void av1_build_compound_diffwtd_mask_d16_avx2( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + const int shift = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + // When rounding constant is added, there is a possibility of overflow. + // However that much precision is not required. Code should very well work for + // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But + // there is a possibility of corner case bugs. + assert(DIFF_FACTOR_LOG2 == 4); + assert(AOM_BLEND_A64_MAX_ALPHA == 64); + + if (mask_type == DIFFWTD_38) { + build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1, + src1_stride, h, w, shift); + } else { + build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1, + src1_stride, h, w, shift); + } +} + +void av1_build_compound_diffwtd_mask_highbd_avx2( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + if (w < 16) { + av1_build_compound_diffwtd_mask_highbd_ssse3( + mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd); + } else { + assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); + assert(bd >= 8); + assert((w % 16) == 0); + const __m256i y0 = _mm256_setzero_si256(); + const __m256i yAOM_BLEND_A64_MAX_ALPHA = + _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const int mask_base = 38; + const __m256i ymask_base = _mm256_set1_epi16(mask_base); + const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); + const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); + if (bd == 8) { + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_srai_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_srai_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, 
_MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } else { + const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_sra_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]); + __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]); + __m256i diff = _mm256_sra_epi16( + _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift); + __m256i m = _mm256_min_epi16( + _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)), + yAOM_BLEND_A64_MAX_ALPHA); + m = _mm256_packus_epi16(m, m); + m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i m0 = _mm256_castsi256_si128(m); + _mm_storeu_si128((__m128i *)&mask[j], m0); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/reconinter_sse4.c b/libs/libaom/src/av1/common/x86/reconinter_sse4.c new file mode 100644 index 000000000..5171ca493 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/reconinter_sse4.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> // SSE2 +#include <smmintrin.h> /* SSE4.1 */ + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "av1/common/blockd.h" + +static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0, + const __m128i s1) { + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1)); + return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4))); + // clamp(diff, 0, 64) can be skipped since diff is always in the range (38, 54) +} + +void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int stride0, + const uint8_t *src1, int stride1, + int h, int w) { + const int mb = (mask_type == DIFFWTD_38_INV) ?
AOM_BLEND_A64_MAX_ALPHA : 0; + const __m128i mask_base = _mm_set1_epi16(38 - mb); + int i = 0; + if (4 == w) { + do { + const __m128i s0A = _mm_cvtsi32_si128(*(uint32_t *)src0); + const __m128i s0B = _mm_cvtsi32_si128(*(uint32_t *)(src0 + stride0)); + const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); + const __m128i s0 = _mm_cvtepu8_epi16(s0AB); + + const __m128i s1A = _mm_cvtsi32_si128(*(uint32_t *)src1); + const __m128i s1B = _mm_cvtsi32_si128(*(uint32_t *)(src1 + stride1)); + const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); + const __m128i s1 = _mm_cvtepu8_epi16(s1AB); + + const __m128i m16 = calc_mask(mask_base, s0, s1); + const __m128i m8 = _mm_packus_epi16(m16, m16); + + *(uint32_t *)mask = _mm_cvtsi128_si32(m8); + *(uint32_t *)(mask + w) = _mm_extract_epi32(m8, 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += 8; + i += 2; + } while (i < h); + } else if (8 == w) { + do { + __m128i s0 = _mm_loadl_epi64((__m128i const *)src0); + __m128i s1 = _mm_loadl_epi64((__m128i const *)src1); + s0 = _mm_cvtepu8_epi16(s0); + s1 = _mm_cvtepu8_epi16(s1); + const __m128i m16 = calc_mask(mask_base, s0, s1); + const __m128i m8 = _mm_packus_epi16(m16, m16); + _mm_storel_epi64((__m128i *)mask, m8); + src0 += stride0; + src1 += stride1; + mask += 8; + i += 1; + } while (i < h); + } else { + const __m128i zero = _mm_setzero_si128(); + do { + int j = 0; + do { + const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j)); + const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j)); + const __m128i s0L = _mm_cvtepu8_epi16(s0); + const __m128i s1L = _mm_cvtepu8_epi16(s1); + const __m128i s0H = _mm_unpackhi_epi8(s0, zero); + const __m128i s1H = _mm_unpackhi_epi8(s1, zero); + + const __m128i m16L = calc_mask(mask_base, s0L, s1L); + const __m128i m16H = calc_mask(mask_base, s0H, s1H); + + const __m128i m8 = _mm_packus_epi16(m16L, m16H); + _mm_store_si128((__m128i *)(mask + j), m8); + j += 16; + } while (j < w); + src0 += stride0; + src1 += stride1; + mask += w; + i += 1; + } while (i < h); + } +} + +void av1_build_compound_diffwtd_mask_d16_sse4_1( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd) { + const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1; + const int mask_base = 38; + int round = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8); + const __m128i round_const = _mm_set1_epi16((1 << round) >> 1); + const __m128i mask_base_16 = _mm_set1_epi16(mask_base); + const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i add_const = + _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0)); + const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1)); + + int i, j; + // When rounding constant is added, there is a possibility of overflow. + // However that much precision is not required. Code should very well work for + // other values of DIFF_FACTOR_LOG2 and AOM_BLEND_A64_MAX_ALPHA as well. But + // there is a possibility of corner case bugs. 
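+  // For reference, a scalar sketch of the per-pixel mask computed below
+  // (illustrative only; p0/p1 stand for one pair of CONV_BUF_TYPE samples):
+  //   int d = (abs(p0 - p1) + ((1 << round) >> 1)) >> round;
+  //   int m = AOMMIN(mask_base + (d >> DIFF_FACTOR_LOG2),
+  //                  AOM_BLEND_A64_MAX_ALPHA);
+  //   if (which_inverse) m = AOM_BLEND_A64_MAX_ALPHA - m;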
+ assert(DIFF_FACTOR_LOG2 == 4); + assert(AOM_BLEND_A64_MAX_ALPHA == 64); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data_src0 = + _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]); + const __m128i data_src1 = + _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]); + + const __m128i diffa = _mm_subs_epu16(data_src0, data_src1); + const __m128i diffb = _mm_subs_epu16(data_src1, data_src0); + const __m128i diff = _mm_max_epu16(diffa, diffb); + const __m128i diff_round = + _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round); + const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2); + const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16); + __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff); + // clamp to 0 can be skipped since we are using add and saturate + // instruction + + const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign); + const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const); + + // 8 bit conversion and saturation to uint8 + const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16); + + // Store values into the destination buffer + __m128i *const dst = (__m128i *)&mask[i * w + j]; + + if ((w - j) > 4) { + _mm_storel_epi64(dst, res_8); + } else { // w==4 + *(uint32_t *)dst = _mm_cvtsi128_si32(res_8); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/reconinter_ssse3.c b/libs/libaom/src/av1/common/x86/reconinter_ssse3.c new file mode 100644 index 000000000..cf684447c --- /dev/null +++ b/libs/libaom/src/av1/common/x86/reconinter_ssse3.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tmmintrin.h> + +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/blockd.h" + +void av1_build_compound_diffwtd_mask_highbd_ssse3( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, + int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, + int bd) { + if (w < 8) { + av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride, + src1, src1_stride, h, w, bd); + } else { + assert(bd >= 8); + assert((w % 8) == 0); + assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV); + const __m128i x0 = _mm_setzero_si128(); + const __m128i xAOM_BLEND_A64_MAX_ALPHA = + _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const int mask_base = 38; + const __m128i xmask_base = _mm_set1_epi16(mask_base); + const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0); + const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1); + if (bd == 8) { + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), + DIFF_FACTOR_LOG2); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), + DIFF_FACTOR_LOG2); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } else { + const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2); + if (mask_type == DIFFWTD_38_INV) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = + _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]); + __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]); + __m128i diff = + _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift); + __m128i m = _mm_min_epi16( + _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)), + xAOM_BLEND_A64_MAX_ALPHA); + m = _mm_packus_epi16(m, m); + _mm_storel_epi64((__m128i *)&mask[j], m); + } + ssrc0 += src0_stride; + ssrc1 += src1_stride; + mask += w; + } + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/selfguided_avx2.c b/libs/libaom/src/av1/common/x86/selfguided_avx2.c new file mode 100644 index 000000000..3c5558dda --- /dev/null +++
b/libs/libaom/src/av1/common/x86/selfguided_avx2.c @@ -0,0 +1,724 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/restoration.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to +// 32-bit precision and return them in an AVX2 register. +static __m256i yy256_load_extend_8_32(const void *p) { + return _mm256_cvtepu8_epi32(xx_loadl_64(p)); +} + +// Load 8 halfwords from the possibly-misaligned pointer p, extend each +// halfword to 32-bit precision and return them in an AVX2 register. +static __m256i yy256_load_extend_16_32(const void *p) { + return _mm256_cvtepu16_epi32(xx_loadu_128(p)); +} + +// Compute the scan of an AVX2 register holding 8 32-bit integers. If the +// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ..., +// x0+x1+...+x7 +// +// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers +// (assumed small enough to be able to add them without overflow). +// +// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a. +// +// x = [h g f e][d c b a] +// x01 = [g f e 0][c b a 0] +// x02 = [g+h f+g e+f e][c+d b+c a+b a] +// x03 = [e+f e 0 0][a+b a 0 0] +// x04 = [e->h e->g e->f e][a->d a->c a->b a] +// s = a->d +// s01 = [a->d a->d a->d a->d] +// s02 = [a->d a->d a->d a->d][0 0 0 0] +// ret = [a->h a->g a->f a->e][a->d a->c a->b a] +static __m256i scan_32(__m256i x) { + const __m256i x01 = _mm256_slli_si256(x, 4); + const __m256i x02 = _mm256_add_epi32(x, x01); + const __m256i x03 = _mm256_slli_si256(x02, 8); + const __m256i x04 = _mm256_add_epi32(x02, x03); + const int32_t s = _mm256_extract_epi32(x04, 3); + const __m128i s01 = _mm_set1_epi32(s); + const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1); + return _mm256_add_epi32(x04, s02); +} + +// Compute two integral images from src. B sums elements; A sums their +// squares. The images are offset by one pixel, so will have width and height +// equal to width + 1, height + 1 and the first row and column will be zero. +// +// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple +// of 8.
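+// Concretely, once integral_images() below has run, B[(i + 1) * buf_stride +
+// (j + 1)] holds the sum of src[y][x] over all y <= i, x <= j (and A the
+// matching sum of squares), so any box sum can be read back with four lookups.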
+ +static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) { + unsigned int i = 0; + for (i = 0; i < (count & 0xffffffe0); i += 32) { + _mm256_storeu_si256((__m256i *)(dest + i), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero); + _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero); + } + for (; i < (count & 0xfffffff8); i += 8) { + _mm256_storeu_si256((__m256i *)(dest + i), *zero); + } + for (; i < count; i++) { + dest[i] = 0; + } + return dest; +} + +static void integral_images(const uint8_t *src, int src_stride, int width, + int height, int32_t *A, int32_t *B, + int buf_stride) { + const __m256i zero = _mm256_setzero_si256(); + // Write out the zero top row + memset_zero_avx(A, &zero, (width + 8)); + memset_zero_avx(B, &zero, (width + 8)); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the eight lanes. + __m256i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 8) { + const int ABj = 1 + j; + + const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); + const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); + + const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride); + const __m256i x2 = _mm256_madd_epi16(x1, x1); + + const __m256i sc1 = scan_32(x1); + const __m256i sc2 = scan_32(x2); + + const __m256i row1 = + _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); + const __m256i row2 = + _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); + + yy_store_256(B + ABj + (i + 1) * buf_stride, row1); + yy_store_256(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); + ldiff2 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); + } + } +} + +// Compute two integral images from src. B sums elements; A sums their squares +// +// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8. +static void integral_images_highbd(const uint16_t *src, int src_stride, + int width, int height, int32_t *A, + int32_t *B, int buf_stride) { + const __m256i zero = _mm256_setzero_si256(); + // Write out the zero top row + memset_zero_avx(A, &zero, (width + 8)); + memset_zero_avx(B, &zero, (width + 8)); + + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the eight lanes. 
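+    // Each 8-lane block below is then computed as row = scan(x) + above +
+    // ldiff: the prefix sum within the current block, plus the entry one row
+    // up, plus the running total carried over from the blocks to its left.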
+ __m256i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 8) { + const int ABj = 1 + j; + + const __m256i above1 = yy_load_256(B + ABj + i * buf_stride); + const __m256i above2 = yy_load_256(A + ABj + i * buf_stride); + + const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride); + const __m256i x2 = _mm256_madd_epi16(x1, x1); + + const __m256i sc1 = scan_32(x1); + const __m256i sc2 = scan_32(x2); + + const __m256i row1 = + _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1); + const __m256i row2 = + _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2); + + yy_store_256(B + ABj + (i + 1) * buf_stride, row1); + yy_store_256(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7)); + ldiff2 = _mm256_set1_epi32( + _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7)); + } + } +} + +// Compute 8 values of boxsum from the given integral image. ii should point +// at the middle of the box (for the first value). r is the box radius. +static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) { + const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride); + const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride); + const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride); + const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride); + const __m256i u = _mm256_sub_epi32(tr, tl); + const __m256i v = _mm256_sub_epi32(br, bl); + return _mm256_sub_epi32(v, u); +} + +static __m256i round_for_shift(unsigned shift) { + return _mm256_set1_epi32((1 << shift) >> 1); +} + +static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) { + __m256i an, bb; + if (bit_depth > 8) { + const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8)); + const __m256i rounding_b = round_for_shift(bit_depth - 8); + const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); + const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); + const __m256i a = + _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a); + const __m256i b = + _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b); + // b < 2^14, so we can use a 16-bit madd rather than a 32-bit + // mullo to square it + bb = _mm256_madd_epi16(b, b); + an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb); + } else { + bb = _mm256_madd_epi16(sum1, sum1); + an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n)); + } + return _mm256_sub_epi32(an, bb); +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). 
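+// For each box, compute_p() above returns p = n * sum(x^2) - sum(x)^2, i.e.
+// n^2 times the sample variance of the n pixels in the box (for bit_depth > 8
+// both sums are downshifted first so the products fit in 32 bits).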
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, + int width, int height, int buf_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]); + + const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + __m256i mask[8]; + for (int idx = 0; idx < 8; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx)); + mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; ++i) { + for (int j = -1; j < width + 1; j += 8) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(8, width + 1 - j); + assert(idx >= 1); + + if (idx < 8) { + sum1 = _mm256_and_si256(mask[idx], sum1); + sum2 = _mm256_and_si256(mask[idx], sum2); + } + + const __m256i p = compute_p(sum1, sum2, bit_depth, n); + + const __m256i z = _mm256_min_epi32( + _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm256_set1_epi32(255)); + + const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4); + + yy_storeu_256(A + i * buf_stride + j, a_res); + + const __m256i a_complement = + _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n); + const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1); + const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res), + SGRPROJ_RECIP_BITS); + + yy_storeu_256(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter +// where the outer four corners have weight 3 and all other pixels have weight +// 4. 
+// +// Pixels are indexed as follows: +// xtl xt xtr +// xl x xr +// xbl xb xbr +// +// buf points to x +// +// fours = xl + xt + xr + xb + x +// threes = xtl + xtr + xbr + xbl +// cross_sum = 4 * fours + 3 * threes +// = 4 * (fours + threes) - threes +// = (fours + threes) << 2 - threes +static INLINE __m256i cross_sum(const int32_t *buf, int stride) { + const __m256i xtl = yy_loadu_256(buf - 1 - stride); + const __m256i xt = yy_loadu_256(buf - stride); + const __m256i xtr = yy_loadu_256(buf + 1 - stride); + const __m256i xl = yy_loadu_256(buf - 1); + const __m256i x = yy_loadu_256(buf); + const __m256i xr = yy_loadu_256(buf + 1); + const __m256i xbl = yy_loadu_256(buf - 1 + stride); + const __m256i xb = yy_loadu_256(buf + stride); + const __m256i xbr = yy_loadu_256(buf + 1 + stride); + + const __m256i fours = _mm256_add_epi32( + xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x)))); + const __m256i threes = + _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); + + return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2), + threes); +} + +// The final filter for self-guided restoration. Computes a weighted average +// across A, B with "cross sums" (see cross_sum implementation above). +static void final_filter(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, const void *dgd8, + int dgd_stride, int width, int height, int highbd) { + const int nb = 5; + const __m256i rounding = + round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride); + const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride); + + const __m128i raw = + xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m256i src = + highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); + + __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); + __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding), + SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + + yy_storeu_256(dst + i * dst_stride + j, w); + } + } +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). 
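+// Unlike calc_ab(), this variant evaluates A and B on every other row only
+// (note the i += 2 loop below); final_filter_fast() then reaches the computed
+// rows either directly (odd rows) or through the vertical neighbours (even
+// rows), roughly halving the work spent on calc_ab.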
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, + const int32_t *D, int width, int height, + int buf_stride, int bit_depth, int sgr_params_idx, + int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m256i s = _mm256_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m256i one_over_n = _mm256_set1_epi32(av1_one_by_x[n - 1]); + + const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + __m256i mask[8]; + for (int idx = 0; idx < 8; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx)); + mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; i += 2) { + for (int j = -1; j < width + 1; j += 8) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(8, width + 1 - j); + assert(idx >= 1); + + if (idx < 8) { + sum1 = _mm256_and_si256(mask[idx], sum1); + sum2 = _mm256_and_si256(mask[idx], sum2); + } + + const __m256i p = compute_p(sum1, sum2, bit_depth, n); + + const __m256i z = _mm256_min_epi32( + _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm256_set1_epi32(255)); + + const __m256i a_res = _mm256_i32gather_epi32(av1_x_by_xplus1, z, 4); + + yy_storeu_256(A + i * buf_stride + j, a_res); + + const __m256i a_complement = + _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n); + const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1); + const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res), + SGRPROJ_RECIP_BITS); + + yy_storeu_256(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 8 values of the "cross sum" starting at buf. 
+// +// Pixels are indexed like this: +// xtl xt xtr +// - buf - +// xbl xb xbr +// +// Pixels are weighted like this: +// 5 6 5 +// 0 0 0 +// 5 6 5 +// +// fives = xtl + xtr + xbl + xbr +// sixes = xt + xb +// cross_sum = 6 * sixes + 5 * fives +// = 5 * (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) { + const __m256i xtl = yy_loadu_256(buf - 1 - stride); + const __m256i xt = yy_loadu_256(buf - stride); + const __m256i xtr = yy_loadu_256(buf + 1 - stride); + const __m256i xbl = yy_loadu_256(buf - 1 + stride); + const __m256i xb = yy_loadu_256(buf + stride); + const __m256i xbr = yy_loadu_256(buf + 1 + stride); + + const __m256i fives = + _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl))); + const __m256i sixes = _mm256_add_epi32(xt, xb); + const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); + + return _mm256_add_epi32( + _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), + fives_plus_sixes), + sixes); +} + +// Calculate 8 values of the "cross sum" starting at buf. +// +// Pixels are indexed like this: +// xl x xr +// +// Pixels are weighted like this: +// 5 6 5 +// +// buf points to x +// +// fives = xl + xr +// sixes = x +// cross_sum = 5 * fives + 6 * sixes +// = 4 * (fives + sixes) + (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) { + const __m256i xl = yy_loadu_256(buf - 1); + const __m256i x = yy_loadu_256(buf); + const __m256i xr = yy_loadu_256(buf + 1); + + const __m256i fives = _mm256_add_epi32(xl, xr); + const __m256i sixes = x; + + const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes); + + return _mm256_add_epi32( + _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2), + fives_plus_sixes), + sixes); +} + +// The final filter for the self-guided restoration. Computes a +// weighted average across A, B with "cross sums" (see cross_sum_... +// implementations above). +static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, + const void *dgd8, int dgd_stride, int width, + int height, int highbd) { + const int nb0 = 5; + const int nb1 = 4; + + const __m256i rounding0 = + round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + const __m256i rounding1 = + round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (int j = 0; j < width; j += 8) { + const __m256i a = + cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride); + const __m256i b = + cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride); + + const __m128i raw = + xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m256i src = + highbd ?
_mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); + + __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); + __m256i w = + _mm256_srai_epi32(_mm256_add_epi32(v, rounding0), + SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + + yy_storeu_256(dst + i * dst_stride + j, w); + } + } else { // odd row + for (int j = 0; j < width; j += 8) { + const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j); + const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j); + + const __m128i raw = + xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m256i src = + highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw); + + __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b); + __m256i w = + _mm256_srai_epi32(_mm256_add_epi32(v, rounding1), + SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + yy_storeu_256(dst + i * dst_stride + j, w); + } + } + } +} + +int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height, + int dgd_stride, int32_t *flt0, + int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, + int highbd) { + // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl, + // Ctl and Dtl is 32-byte aligned. + const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3); + + int32_t *buf = aom_memalign( + 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)); + if (!buf) return -1; + + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 32 bytes for efficiency. + int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3); + + // The "tl" pointers point at the top-left of the initialised data for the + // array. + int32_t *Atl = buf + 0 * buf_elts + 7; + int32_t *Btl = buf + 1 * buf_elts + 7; + int32_t *Ctl = buf + 2 * buf_elts + 7; + int32_t *Dtl = buf + 3 * buf_elts + 7; + + // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note + // there's a zero row and column in A, B (integral images), so we move down + // and right one for them. + const int buf_diag_border = + SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT; + + int32_t *A0 = Atl + 1 + buf_stride; + int32_t *B0 = Btl + 1 + buf_stride; + int32_t *C0 = Ctl + 1 + buf_stride; + int32_t *D0 = Dtl + 1 + buf_stride; + + // Finally, A, B, C, D point at position (0, 0). + int32_t *A = A0 + buf_diag_border; + int32_t *B = B0 + buf_diag_border; + int32_t *C = C0 + buf_diag_border; + int32_t *D = D0 + buf_diag_border; + + const int dgd_diag_border = + SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT; + const uint8_t *dgd0 = dgd8 - dgd_diag_border; + + // Generate integral images from the input. C will contain sums of squares; D + // will contain just sums + if (highbd) + integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext, + height_ext, Ctl, Dtl, buf_stride); + else + integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, + buf_stride); + + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + // Write to flt0 and flt1 + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. 
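+  // In the code below, a non-zero r[0] is handled by the subsampled fast path
+  // (calc_ab_fast / final_filter_fast) writing flt0, while a non-zero r[1]
+  // goes through the full-resolution path (calc_ab / final_filter) into flt1.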
+ assert(!(params->r[0] == 0 && params->r[1] == 0)); + assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + + if (params->r[0] > 0) { + calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth, + sgr_params_idx, 0); + final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride, + width, height, highbd); + } + + if (params->r[1] > 0) { + calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, + 1); + final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, + height, highbd); + } + aom_free(buf); + return 0; +} + +void av1_apply_selfguided_restoration_avx2(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + const int ret = av1_selfguided_restoration_avx2( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + (void)ret; + assert(!ret); + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + av1_decode_xq(xqd, xq, params); + + __m256i xq0 = _mm256_set1_epi32(xq[0]); + __m256i xq1 = _mm256_set1_epi32(xq[1]); + + for (int i = 0; i < height; ++i) { + // Calculate output in batches of 16 pixels + for (int j = 0; j < width; j += 16) { + const int k = i * width + j; + const int m = i * dst_stride + j; + + const uint8_t *dat8ij = dat8 + i * stride + j; + __m256i ep_0, ep_1; + __m128i src_0, src_1; + if (highbd) { + src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij)); + src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8)); + ep_0 = _mm256_cvtepu16_epi32(src_0); + ep_1 = _mm256_cvtepu16_epi32(src_1); + } else { + src_0 = xx_loadu_128(dat8ij); + ep_0 = _mm256_cvtepu8_epi32(src_0); + ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8)); + } + + const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS); + const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS); + + __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS); + __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS); + + if (params->r[0] > 0) { + const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0); + v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0)); + + const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1); + v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1)); + } + + if (params->r[1] > 0) { + const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0); + v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0)); + + const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1); + v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1)); + } + + const __m256i rounding = + round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m256i w_0 = _mm256_srai_epi32( + _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m256i w_1 = _mm256_srai_epi32( + _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + if (highbd) { + // Pack into 16 bits and clamp to [0, 2^bit_depth) + // Note that packing into 16 bits messes up the order of the bits, + // so we use a permute function to correct this + const __m256i tmp = _mm256_packus_epi32(w_0, w_1); + const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8); + const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1); + const __m256i res = 
_mm256_min_epi16(tmp2, max); + yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res); + } else { + // Pack into 8 bits and clamp to [0, 256) + // Note that each pack messes up the order of the bits, + // so we use a permute function to correct this + const __m256i tmp = _mm256_packs_epi32(w_0, w_1); + const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8); + const __m256i res = + _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */); + const __m128i res2 = + _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8)); + xx_storeu_128(dst8 + m, res2); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/selfguided_sse4.c b/libs/libaom/src/av1/common/x86/selfguided_sse4.c new file mode 100644 index 000000000..72c7708f1 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/selfguided_sse4.c @@ -0,0 +1,662 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <smmintrin.h> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "av1/common/restoration.h" +#include "aom_dsp/x86/synonyms.h" + +// Load 4 bytes from the possibly-misaligned pointer p, extend each byte to +// 32-bit precision and return them in an SSE register. +static __m128i xx_load_extend_8_32(const void *p) { + return _mm_cvtepu8_epi32(xx_loadl_32(p)); +} + +// Load 4 halfwords from the possibly-misaligned pointer p, extend each +// halfword to 32-bit precision and return them in an SSE register. +static __m128i xx_load_extend_16_32(const void *p) { + return _mm_cvtepu16_epi32(xx_loadl_64(p)); +} + +// Compute the scan of an SSE register holding 4 32-bit integers. If the +// register holds x0..x3 then the scan will hold x0, x0+x1, x0+x1+x2, +// x0+x1+x2+x3 +static __m128i scan_32(__m128i x) { + const __m128i x01 = _mm_add_epi32(x, _mm_slli_si128(x, 4)); + return _mm_add_epi32(x01, _mm_slli_si128(x01, 8)); +} + +// Compute two integral images from src. B sums elements; A sums their +// squares. The images are offset by one pixel, so will have width and height +// equal to width + 1, height + 1 and the first row and column will be zero. +// +// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple +// of 4. +static void integral_images(const uint8_t *src, int src_stride, int width, + int height, int32_t *A, int32_t *B, + int buf_stride) { + // Write out the zero top row + memset(A, 0, sizeof(*A) * (width + 1)); + memset(B, 0, sizeof(*B) * (width + 1)); + + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the four lanes.
+ __m128i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 4) { + const int ABj = 1 + j; + + const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); + const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); + + const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride); + const __m128i x2 = _mm_madd_epi16(x1, x1); + + const __m128i sc1 = scan_32(x1); + const __m128i sc2 = scan_32(x2); + + const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); + const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); + + xx_store_128(B + ABj + (i + 1) * buf_stride, row1); + xx_store_128(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); + ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); + } + } +} + +// Compute two integral images from src. B sums elements; A sums their squares +// +// A and B should be aligned to 16 bytes. buf_stride should be a multiple of 4. +static void integral_images_highbd(const uint16_t *src, int src_stride, + int width, int height, int32_t *A, + int32_t *B, int buf_stride) { + // Write out the zero top row + memset(A, 0, sizeof(*A) * (width + 1)); + memset(B, 0, sizeof(*B) * (width + 1)); + + const __m128i zero = _mm_setzero_si128(); + for (int i = 0; i < height; ++i) { + // Zero the left column. + A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0; + + // ldiff is the difference H - D where H is the output sample immediately + // to the left and D is the output sample above it. These are scalars, + // replicated across the four lanes. + __m128i ldiff1 = zero, ldiff2 = zero; + for (int j = 0; j < width; j += 4) { + const int ABj = 1 + j; + + const __m128i above1 = xx_load_128(B + ABj + i * buf_stride); + const __m128i above2 = xx_load_128(A + ABj + i * buf_stride); + + const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride); + const __m128i x2 = _mm_madd_epi16(x1, x1); + + const __m128i sc1 = scan_32(x1); + const __m128i sc2 = scan_32(x2); + + const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1); + const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2); + + xx_store_128(B + ABj + (i + 1) * buf_stride, row1); + xx_store_128(A + ABj + (i + 1) * buf_stride, row2); + + // Calculate the new H - D. + ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff); + ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff); + } + } +} + +// Compute 4 values of boxsum from the given integral image. ii should point +// at the middle of the box (for the first value). r is the box radius. 
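+// By inclusion-exclusion on the integral image, the box sum reduces to
+// br - bl - tr + tl using the four corner samples loaded below; with r = 1,
+// for example, this gives a 3x3 sum at each of four consecutive positions.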
+static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) { + const __m128i tl = xx_loadu_128(ii - (r + 1) - (r + 1) * stride); + const __m128i tr = xx_loadu_128(ii + (r + 0) - (r + 1) * stride); + const __m128i bl = xx_loadu_128(ii - (r + 1) + r * stride); + const __m128i br = xx_loadu_128(ii + (r + 0) + r * stride); + const __m128i u = _mm_sub_epi32(tr, tl); + const __m128i v = _mm_sub_epi32(br, bl); + return _mm_sub_epi32(v, u); +} + +static __m128i round_for_shift(unsigned shift) { + return _mm_set1_epi32((1 << shift) >> 1); +} + +static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) { + __m128i an, bb; + if (bit_depth > 8) { + const __m128i rounding_a = round_for_shift(2 * (bit_depth - 8)); + const __m128i rounding_b = round_for_shift(bit_depth - 8); + const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8)); + const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8); + const __m128i a = _mm_srl_epi32(_mm_add_epi32(sum2, rounding_a), shift_a); + const __m128i b = _mm_srl_epi32(_mm_add_epi32(sum1, rounding_b), shift_b); + // b < 2^14, so we can use a 16-bit madd rather than a 32-bit + // mullo to square it + bb = _mm_madd_epi16(b, b); + an = _mm_max_epi32(_mm_mullo_epi32(a, _mm_set1_epi32(n)), bb); + } else { + bb = _mm_madd_epi16(sum1, sum1); + an = _mm_mullo_epi32(sum2, _mm_set1_epi32(n)); + } + return _mm_sub_epi32(an, bb); +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). +static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D, + int width, int height, int buf_stride, int bit_depth, + int sgr_params_idx, int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m128i s = _mm_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); + + const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + __m128i mask[4]; + for (int idx = 0; idx < 4; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); + mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; ++i) { + for (int j = -1; j < width + 1; j += 4) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(4, width + 1 - j); + assert(idx >= 1); + + if (idx < 4) { + sum1 = _mm_and_si128(mask[idx], sum1); + sum2 = _mm_and_si128(mask[idx], sum2); + } + + const __m128i p = compute_p(sum1, sum2, bit_depth, n); + + const __m128i z = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm_set1_epi32(255)); + + // 'Gather' type instructions are not available pre-AVX2, so synthesize a + // gather using scalar loads. 
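+ // Every lane of z was clamped to [0, 255] above, so four scalar extracts
+ // plus table reads are a safe stand-in for a hardware gather (a single
+ // _mm_i32gather_epi32 on AVX2).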
+ const __m128i a_res = + _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], + av1_x_by_xplus1[_mm_extract_epi32(z, 2)], + av1_x_by_xplus1[_mm_extract_epi32(z, 1)], + av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); + + xx_storeu_128(A + i * buf_stride + j, a_res); + + const __m128i a_complement = + _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); + const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); + const __m128i b_res = + _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); + + xx_storeu_128(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter +// where the outer four corners have weight 3 and all other pixels have weight +// 4. +// +// Pixels are indexed like this: +// xtl xt xtr +// xl x xr +// xbl xb xbr +// +// buf points to x +// +// fours = xl + xt + xr + xb + x +// threes = xtl + xtr + xbr + xbl +// cross_sum = 4 * fours + 3 * threes +// = 4 * (fours + threes) - threes +// = (fours + threes) << 2 - threes +static INLINE __m128i cross_sum(const int32_t *buf, int stride) { + const __m128i xtl = xx_loadu_128(buf - 1 - stride); + const __m128i xt = xx_loadu_128(buf - stride); + const __m128i xtr = xx_loadu_128(buf + 1 - stride); + const __m128i xl = xx_loadu_128(buf - 1); + const __m128i x = xx_loadu_128(buf); + const __m128i xr = xx_loadu_128(buf + 1); + const __m128i xbl = xx_loadu_128(buf - 1 + stride); + const __m128i xb = xx_loadu_128(buf + stride); + const __m128i xbr = xx_loadu_128(buf + 1 + stride); + + const __m128i fours = _mm_add_epi32( + xl, _mm_add_epi32(xt, _mm_add_epi32(xr, _mm_add_epi32(xb, x)))); + const __m128i threes = + _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl))); + + return _mm_sub_epi32(_mm_slli_epi32(_mm_add_epi32(fours, threes), 2), threes); +} + +// The final filter for self-guided restoration. Computes a weighted average +// across A, B with "cross sums" (see cross_sum implementation above). +static void final_filter(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, const void *dgd8, + int dgd_stride, int width, int height, int highbd) { + const int nb = 5; + const __m128i rounding = + round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 4) { + const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride); + const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride); + const __m128i raw = + xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m128i src = + highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); + + __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding), + SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS); + + xx_storeu_128(dst + i * dst_stride + j, w); + } + } +} + +// Assumes that C, D are integral images for the original buffer which has been +// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels +// on the sides. A, B, C, D point at logical position (0, 0). 
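+// Unlike calc_ab, this "fast" variant fills A and B on every second row only
+// (the loop steps i += 2); final_filter_fast compensates by applying separate
+// even-row and odd-row cross-sum kernels.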
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C, + const int32_t *D, int width, int height, + int buf_stride, int bit_depth, int sgr_params_idx, + int radius_idx) { + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + const int r = params->r[radius_idx]; + const int n = (2 * r + 1) * (2 * r + 1); + const __m128i s = _mm_set1_epi32(params->s[radius_idx]); + // one_over_n[n-1] is 2^12/n, so easily fits in an int16 + const __m128i one_over_n = _mm_set1_epi32(av1_one_by_x[n - 1]); + + const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS); + const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS); + + // Set up masks + const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff); + __m128i mask[4]; + for (int idx = 0; idx < 4; idx++) { + const __m128i shift = _mm_cvtsi32_si128(8 * (4 - idx)); + mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift)); + } + + for (int i = -1; i < height + 1; i += 2) { + for (int j = -1; j < width + 1; j += 4) { + const int32_t *Cij = C + i * buf_stride + j; + const int32_t *Dij = D + i * buf_stride + j; + + __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r); + __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r); + + // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain + // some uninitialised data in their upper words. We use a mask to + // ensure that these bits are set to 0. + int idx = AOMMIN(4, width + 1 - j); + assert(idx >= 1); + + if (idx < 4) { + sum1 = _mm_and_si128(mask[idx], sum1); + sum2 = _mm_and_si128(mask[idx], sum2); + } + + const __m128i p = compute_p(sum1, sum2, bit_depth, n); + + const __m128i z = _mm_min_epi32( + _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z), + SGRPROJ_MTABLE_BITS), + _mm_set1_epi32(255)); + + // 'Gather' type instructions are not available pre-AVX2, so synthesize a + // gather using scalar loads. + const __m128i a_res = + _mm_set_epi32(av1_x_by_xplus1[_mm_extract_epi32(z, 3)], + av1_x_by_xplus1[_mm_extract_epi32(z, 2)], + av1_x_by_xplus1[_mm_extract_epi32(z, 1)], + av1_x_by_xplus1[_mm_extract_epi32(z, 0)]); + + xx_storeu_128(A + i * buf_stride + j, a_res); + + const __m128i a_complement = + _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res); + + // sum1 might have lanes greater than 2^15, so we can't use madd to do + // multiplication involving sum1. However, a_complement and one_over_n + // are both less than 256, so we can multiply them first. + const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n); + const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1); + const __m128i b_res = + _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS); + + xx_storeu_128(B + i * buf_stride + j, b_res); + } + } +} + +// Calculate 4 values of the "cross sum" starting at buf. 
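+// This is the even-row kernel of the fast filter: the middle row is weighted
+// 0 because calc_ab_fast produced A/B values on every second row only.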
+// +// Pixels are indexed like this: +// xtl xt xtr +// - buf - +// xbl xb xbr +// +// Pixels are weighted like this: +// 5 6 5 +// 0 0 0 +// 5 6 5 +// +// fives = xtl + xtr + xbl + xbr +// sixes = xt + xb +// cross_sum = 6 * sixes + 5 * fives +// = 5 * (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) { + const __m128i xtl = xx_loadu_128(buf - 1 - stride); + const __m128i xt = xx_loadu_128(buf - stride); + const __m128i xtr = xx_loadu_128(buf + 1 - stride); + const __m128i xbl = xx_loadu_128(buf - 1 + stride); + const __m128i xb = xx_loadu_128(buf + stride); + const __m128i xbr = xx_loadu_128(buf + 1 + stride); + + const __m128i fives = + _mm_add_epi32(xtl, _mm_add_epi32(xtr, _mm_add_epi32(xbr, xbl))); + const __m128i sixes = _mm_add_epi32(xt, xb); + const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes); + + return _mm_add_epi32( + _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), + sixes); +} + +// Calculate 4 values of the "cross sum" starting at buf. +// +// Pixels are indexed like this: +// xl x xr +// +// Pixels are weighted like this: +// 5 6 5 +// +// buf points to x +// +// fives = xl + xr +// sixes = x +// cross_sum = 5 * fives + 6 * sixes +// = 4 * (fives + sixes) + (fives + sixes) + sixes +// = (fives + sixes) << 2 + (fives + sixes) + sixes +static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) { + const __m128i xl = xx_loadu_128(buf - 1); + const __m128i x = xx_loadu_128(buf); + const __m128i xr = xx_loadu_128(buf + 1); + + const __m128i fives = _mm_add_epi32(xl, xr); + const __m128i sixes = x; + + const __m128i fives_plus_sixes = _mm_add_epi32(fives, sixes); + + return _mm_add_epi32( + _mm_add_epi32(_mm_slli_epi32(fives_plus_sixes, 2), fives_plus_sixes), + sixes); +} + +// The final filter for the self-guided restoration. Computes a +// weighted average across A, B with "cross sums" (see cross_sum_... +// implementations above). +static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A, + const int32_t *B, int buf_stride, + const void *dgd8, int dgd_stride, int width, + int height, int highbd) { + const int nb0 = 5; + const int nb1 = 4; + + const __m128i rounding0 = + round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + const __m128i rounding1 = + round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + const uint8_t *dgd_real = + highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8; + + for (int i = 0; i < height; ++i) { + if (!(i & 1)) { // even row + for (int j = 0; j < width; j += 4) { + const __m128i a = + cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride); + const __m128i b = + cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride); + const __m128i raw = + xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m128i src = + highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); + + __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0), + SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS); + + xx_storeu_128(dst + i * dst_stride + j, w); + } + } else { // odd row + for (int j = 0; j < width; j += 4) { + const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j); + const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j); + const __m128i raw = + xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd)); + const __m128i src = + highbd ?
_mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw); + + __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b); + __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1), + SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS); + + xx_storeu_128(dst + i * dst_stride + j, w); + } + } + } +} + +int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width, + int height, int dgd_stride, int32_t *flt0, + int32_t *flt1, int flt_stride, + int sgr_params_idx, int bit_depth, + int highbd) { + int32_t *buf = (int32_t *)aom_memalign( + 16, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); + if (!buf) return -1; + memset(buf, 0, 4 * sizeof(*buf) * RESTORATION_PROC_UNIT_PELS); + + const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ; + const int height_ext = height + 2 * SGRPROJ_BORDER_VERT; + + // Adjusting the stride of A and B here appears to avoid bad cache effects, + // leading to a significant speed improvement. + // We also align the stride to a multiple of 16 bytes for efficiency. + int buf_stride = ((width_ext + 3) & ~3) + 16; + + // The "tl" pointers point at the top-left of the initialised data for the + // array. Adding 3 here ensures that column 1 is 16-byte aligned. + int32_t *Atl = buf + 0 * RESTORATION_PROC_UNIT_PELS + 3; + int32_t *Btl = buf + 1 * RESTORATION_PROC_UNIT_PELS + 3; + int32_t *Ctl = buf + 2 * RESTORATION_PROC_UNIT_PELS + 3; + int32_t *Dtl = buf + 3 * RESTORATION_PROC_UNIT_PELS + 3; + + // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note + // there's a zero row and column in A, B (integral images), so we move down + // and right one for them. + const int buf_diag_border = + SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT; + + int32_t *A0 = Atl + 1 + buf_stride; + int32_t *B0 = Btl + 1 + buf_stride; + int32_t *C0 = Ctl + 1 + buf_stride; + int32_t *D0 = Dtl + 1 + buf_stride; + + // Finally, A, B, C, D point at position (0, 0). + int32_t *A = A0 + buf_diag_border; + int32_t *B = B0 + buf_diag_border; + int32_t *C = C0 + buf_diag_border; + int32_t *D = D0 + buf_diag_border; + + const int dgd_diag_border = + SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT; + const uint8_t *dgd0 = dgd8 - dgd_diag_border; + + // Generate integral images from the input. C will contain sums of squares; D + // will contain just sums + if (highbd) + integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext, + height_ext, Ctl, Dtl, buf_stride); + else + integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl, + buf_stride); + + const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx]; + // Write to flt0 and flt1 + // If params->r == 0 we skip the corresponding filter. We only allow one of + // the radii to be 0, as having both equal to 0 would be equivalent to + // skipping SGR entirely. 
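+ // The radius asserts below additionally keep every boxsum_from_ii access
+ // within the SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ padding of the
+ // integral images.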
+ assert(!(params->r[0] == 0 && params->r[1] == 0)); + assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ)); + + if (params->r[0] > 0) { + calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth, + sgr_params_idx, 0); + final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride, + width, height, highbd); + } + + if (params->r[1] > 0) { + calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx, + 1); + final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width, + height, highbd); + } + aom_free(buf); + return 0; +} + +void av1_apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width, + int height, int stride, int eps, + const int *xqd, uint8_t *dst8, + int dst_stride, int32_t *tmpbuf, + int bit_depth, int highbd) { + int32_t *flt0 = tmpbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + assert(width * height <= RESTORATION_UNITPELS_MAX); + const int ret = av1_selfguided_restoration_sse4_1( + dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd); + (void)ret; + assert(!ret); + const sgr_params_type *const params = &av1_sgr_params[eps]; + int xq[2]; + av1_decode_xq(xqd, xq, params); + + __m128i xq0 = _mm_set1_epi32(xq[0]); + __m128i xq1 = _mm_set1_epi32(xq[1]); + + for (int i = 0; i < height; ++i) { + // Calculate output in batches of 8 pixels + for (int j = 0; j < width; j += 8) { + const int k = i * width + j; + const int m = i * dst_stride + j; + + const uint8_t *dat8ij = dat8 + i * stride + j; + __m128i src; + if (highbd) { + src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij)); + } else { + src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij)); + } + + const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS); + const __m128i u_0 = _mm_cvtepu16_epi32(u); + const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8)); + + __m128i v_0 = _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS); + __m128i v_1 = _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS); + + if (params->r[0] > 0) { + const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt0[k]), u_0); + v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq0, f1_0)); + + const __m128i f1_1 = _mm_sub_epi32(xx_loadu_128(&flt0[k + 4]), u_1); + v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq0, f1_1)); + } + + if (params->r[1] > 0) { + const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0); + v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq1, f2_0)); + + const __m128i f2_1 = _mm_sub_epi32(xx_loadu_128(&flt1[k + 4]), u_1); + v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq1, f2_1)); + } + + const __m128i rounding = + round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding), + SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding), + SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS); + + if (highbd) { + // Pack into 16 bits and clamp to [0, 2^bit_depth) + const __m128i tmp = _mm_packus_epi32(w_0, w_1); + const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1); + const __m128i res = _mm_min_epi16(tmp, max); + xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res); + } else { + // Pack into 8 bits and clamp to [0, 256) + const __m128i tmp = _mm_packs_epi32(w_0, w_1); + const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */); + xx_storel_64(dst8 + m, res); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/warp_plane_avx2.c b/libs/libaom/src/av1/common/x86/warp_plane_avx2.c new file mode 100644 index 
000000000..53a928d76 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/warp_plane_avx2.c @@ -0,0 +1,1318 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include "config/av1_rtcd.h" +#include "av1/common/warped_motion.h" +#include "aom_dsp/x86/synonyms.h" + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask01_avx2[32]) = { + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask23_avx2[32]) = { + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask45_avx2[32]) = { + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5, 4, 5 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_alpha0_mask67_avx2[32]) = { + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask0_avx2[32]) = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask1_avx2[32]) = { + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask2_avx2[32]) = { + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11 +}; + +DECLARE_ALIGNED(32, static const uint8_t, shuffle_gamma0_mask3_avx2[32]) = { + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15 +}; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src0[32]) = { 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, + 5, 5, 7, 7, 9, 0, 2, 2, 4, 4, 6, + 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src1[32]) = { 4, 6, 6, 8, 8, 10, 10, 12, 5, 7, 7, + 9, 9, 11, 11, 13, 4, 6, 6, 8, 8, 10, + 10, 12, 5, 7, 7, 9, 9, 11, 11, 13 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src2[32]) = { 1, 3, 3, 5, 5, 7, 7, 9, 2, 4, 4, + 6, 6, 8, 8, 10, 1, 3, 3, 5, 5, 7, + 7, 9, 2, 4, 4, 6, 6, 8, 8, 10 }; + +DECLARE_ALIGNED(32, static const uint8_t, + shuffle_src3[32]) = { 5, 7, 7, 9, 9, 11, 11, 13, 6, 8, 8, + 10, 10, 12, 12, 14, 5, 7, 7, 9, 9, 11, + 11, 13, 6, 8, 8, 10, 10, 12, 12, 14 }; + +static INLINE void filter_src_pixels_avx2(const __m256i src, __m256i *horz_out, + __m256i *coeff, + const __m256i *shuffle_src, + const __m256i *round_const, + const __m128i *shift, int row) { + const __m256i src_0 = _mm256_shuffle_epi8(src, shuffle_src[0]); + const __m256i src_1 = _mm256_shuffle_epi8(src, shuffle_src[1]); + const __m256i src_2 = _mm256_shuffle_epi8(src, shuffle_src[2]); + const __m256i src_3 = _mm256_shuffle_epi8(src, shuffle_src[3]); + + const __m256i res_02 = _mm256_maddubs_epi16(src_0, coeff[0]); + const __m256i res_46 =
_mm256_maddubs_epi16(src_1, coeff[1]); + const __m256i res_13 = _mm256_maddubs_epi16(src_2, coeff[2]); + const __m256i res_57 = _mm256_maddubs_epi16(src_3, coeff[3]); + + const __m256i res_even = _mm256_add_epi16(res_02, res_46); + const __m256i res_odd = _mm256_add_epi16(res_13, res_57); + const __m256i res = + _mm256_add_epi16(_mm256_add_epi16(res_even, res_odd), *round_const); + horz_out[row] = _mm256_srl_epi16(res, *shift); +} + +static INLINE void prepare_horizontal_filter_coeff_avx2(int alpha, int beta, + int sx, + __m256i *coeff) { + __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 0 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 1 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 2 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 3 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + + __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 4 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 5 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 6 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[((unsigned)(sx + 7 * alpha)) >> + WARPEDDIFF_PREC_BITS]); + + __m256i tmp0_256 = _mm256_castsi128_si256(tmp_0); + __m256i tmp2_256 = _mm256_castsi128_si256(tmp_2); + __m256i tmp1_256 = _mm256_castsi128_si256(tmp_1); + __m256i tmp3_256 = _mm256_castsi128_si256(tmp_3); + + __m256i tmp4_256 = _mm256_castsi128_si256(tmp_4); + __m256i tmp6_256 = _mm256_castsi128_si256(tmp_6); + __m256i tmp5_256 = _mm256_castsi128_si256(tmp_5); + __m256i tmp7_256 = _mm256_castsi128_si256(tmp_7); + + __m128i tmp_8 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 0 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp0_256 = _mm256_inserti128_si256(tmp0_256, tmp_8, 1); + + __m128i tmp_9 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 1 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp1_256 = _mm256_inserti128_si256(tmp1_256, tmp_9, 1); + + __m128i tmp_10 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 2 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp2_256 = _mm256_inserti128_si256(tmp2_256, tmp_10, 1); + + __m128i tmp_11 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 3 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp3_256 = _mm256_inserti128_si256(tmp3_256, tmp_11, 1); + + tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 4 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp4_256 = _mm256_inserti128_si256(tmp4_256, tmp_2, 1); + + tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 5 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp5_256 = _mm256_inserti128_si256(tmp5_256, tmp_3, 1); + + tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 6 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp6_256 = _mm256_inserti128_si256(tmp6_256, tmp_6, 1); + + tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(unsigned)((sx + beta) + 7 * alpha) >> + WARPEDDIFF_PREC_BITS]); + tmp7_256 = _mm256_inserti128_si256(tmp7_256, tmp_7, 1); + + const __m256i tmp_12 = _mm256_unpacklo_epi16(tmp0_256, tmp2_256); + const __m256i tmp_13 = 
_mm256_unpacklo_epi16(tmp1_256, tmp3_256); + const __m256i tmp_14 = _mm256_unpacklo_epi16(tmp4_256, tmp6_256); + const __m256i tmp_15 = _mm256_unpacklo_epi16(tmp5_256, tmp7_256); + + const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); + const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); + const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); + const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); + + coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); + coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); + coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); + coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); +} + +static INLINE void prepare_horizontal_filter_coeff_beta0_avx2(int alpha, int sx, + __m256i *coeff) { + __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + tmp_0 = _mm_unpacklo_epi16(tmp_0, tmp_2); + tmp_1 = _mm_unpacklo_epi16(tmp_1, tmp_3); + tmp_4 = _mm_unpacklo_epi16(tmp_4, tmp_6); + tmp_5 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + const __m256i tmp_12 = _mm256_broadcastsi128_si256(tmp_0); + const __m256i tmp_13 = _mm256_broadcastsi128_si256(tmp_1); + const __m256i tmp_14 = _mm256_broadcastsi128_si256(tmp_4); + const __m256i tmp_15 = _mm256_broadcastsi128_si256(tmp_5); + + const __m256i res_0 = _mm256_unpacklo_epi32(tmp_12, tmp_14); + const __m256i res_1 = _mm256_unpackhi_epi32(tmp_12, tmp_14); + const __m256i res_2 = _mm256_unpacklo_epi32(tmp_13, tmp_15); + const __m256i res_3 = _mm256_unpackhi_epi32(tmp_13, tmp_15); + + coeff[0] = _mm256_unpacklo_epi64(res_0, res_2); + coeff[1] = _mm256_unpackhi_epi64(res_0, res_2); + coeff[2] = _mm256_unpacklo_epi64(res_1, res_3); + coeff[3] = _mm256_unpackhi_epi64(res_1, res_3); +} + +static INLINE void prepare_horizontal_filter_coeff_alpha0_avx2(int beta, int sx, + __m256i *coeff) { + const __m128i tmp_0 = + _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + beta) >> WARPEDDIFF_PREC_BITS]); + + const __m256i res_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_0), tmp_1, 0x1); + + coeff[0] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask01_avx2)); + coeff[1] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask23_avx2)); + coeff[2] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask45_avx2)); + coeff[3] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_alpha0_mask67_avx2)); +} + +static INLINE void horizontal_filter_avx2(const __m256i src, __m256i *horz_out, + int sx, int alpha, int beta, int row, + const __m256i *shuffle_src, + const __m256i *round_const, + const __m128i *shift) { + __m256i coeff[4]; + 
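// Coefficients are built for two rows at once (sx in the low 128-bit lane,
+ // sx + beta in the high lane), so the single filtering pass below yields
+ // eight horizontal outputs for each of the two packed source rows.
+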
prepare_horizontal_filter_coeff_avx2(alpha, beta, sx, coeff); + filter_src_pixels_avx2(src, horz_out, coeff, shuffle_src, round_const, shift, + row); +} +static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, + __m256i *coeff) { + const __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); + const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); + const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); + const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); + const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); + + coeff[0] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_12, tmp_14)); + coeff[1] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_12, tmp_14)); + coeff[2] = _mm256_castsi128_si256(_mm_unpacklo_epi64(tmp_13, tmp_15)); + coeff[3] = _mm256_castsi128_si256(_mm_unpackhi_epi64(tmp_13, tmp_15)); +} + +static INLINE void warp_horizontal_filter_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + int k, iy, sx, row = 0; + __m256i coeff[4]; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m128i src_0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m128i src_1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); + sx = sx4 + beta * (k + 4); + horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, shuffle_src, + round_const, shift); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); + sx = sx4 + beta * (k + 4); + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); +} + +static INLINE void warp_horizontal_filter_alpha0_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + (void)alpha; + int k, iy, sx, row = 0; + __m256i coeff[4]; + for (k = -7; k <= 
(AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m128i src_0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m128i src_1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); + sx = sx4 + beta * (k + 4); + prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); + sx = sx4 + beta * (k + 4); + prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); +} + +static INLINE void warp_horizontal_filter_beta0_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + (void)beta; + int k, iy, row = 0; + __m256i coeff[4]; + prepare_horizontal_filter_coeff_beta0_avx2(alpha, sx4, coeff); + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m128i src_0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m128i src_1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src_0), src_1, 0x1); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); +} + +static INLINE void warp_horizontal_filter_alpha0_beta0_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + (void)alpha; + int k, iy, row = 0; + __m256i coeff[4]; + prepare_horizontal_filter_coeff_alpha0_avx2(beta, sx4, coeff); + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m128i src0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m128i src1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i src_01 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7))); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, round_const, + shift, row); +} + +static INLINE void unpack_weights_and_set_round_const_avx2( + ConvolveParams *conv_params, const int round_bits, const int offset_bits, + __m256i *res_sub_const, __m256i *round_bits_const, 
__m256i *wt) { + *res_sub_const = + _mm256_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + *round_bits_const = _mm256_set1_epi16(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m256i wt0 = _mm256_set1_epi16((short)w0); + const __m256i wt1 = _mm256_set1_epi16((short)w1); + *wt = _mm256_unpacklo_epi16(wt0, wt1); +} + +static INLINE void prepare_vertical_filter_coeffs_avx2(int gamma, int delta, + int sy, + __m256i *coeffs) { + __m128i filt_00 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_01 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_02 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_03 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + __m128i filt_10 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_11 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_12 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_13 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + __m256i filt_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1); + __m256i filt_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1); + __m256i filt_2 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1); + __m256i filt_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1); + + __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); + __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); + __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); + __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); + + coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1); + coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1); + coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3); + coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3); + + filt_00 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_01 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_02 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_03 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + filt_10 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_11 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_12 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_13 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + + (((sy + delta) + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + filt_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_00), filt_10, 0x1); + filt_1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_01), filt_11, 0x1); + filt_2 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_02), filt_12, 0x1); + 
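// The (sy + delta) row's filters are inserted into the upper 128-bit lanes
+ // here, so each 256-bit multiply-add later covers two output rows at once.
+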
filt_3 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_03), filt_13, 0x1); + + res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); + res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); + res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); + res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); + + coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1); + coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1); + coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3); + coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3); +} + +static INLINE void prepare_vertical_filter_coeffs_delta0_avx2(int gamma, int sy, + __m256i *coeffs) { + __m128i filt_00 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_01 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_02 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + __m128i filt_03 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + __m256i filt_0 = _mm256_broadcastsi128_si256(filt_00); + __m256i filt_1 = _mm256_broadcastsi128_si256(filt_01); + __m256i filt_2 = _mm256_broadcastsi128_si256(filt_02); + __m256i filt_3 = _mm256_broadcastsi128_si256(filt_03); + + __m256i res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); + __m256i res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); + __m256i res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); + __m256i res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); + + coeffs[0] = _mm256_unpacklo_epi64(res_0, res_1); + coeffs[1] = _mm256_unpackhi_epi64(res_0, res_1); + coeffs[2] = _mm256_unpacklo_epi64(res_2, res_3); + coeffs[3] = _mm256_unpackhi_epi64(res_2, res_3); + + filt_00 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_01 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_02 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + filt_03 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + filt_0 = _mm256_broadcastsi128_si256(filt_00); + filt_1 = _mm256_broadcastsi128_si256(filt_01); + filt_2 = _mm256_broadcastsi128_si256(filt_02); + filt_3 = _mm256_broadcastsi128_si256(filt_03); + + res_0 = _mm256_unpacklo_epi32(filt_0, filt_1); + res_1 = _mm256_unpacklo_epi32(filt_2, filt_3); + res_2 = _mm256_unpackhi_epi32(filt_0, filt_1); + res_3 = _mm256_unpackhi_epi32(filt_2, filt_3); + + coeffs[4] = _mm256_unpacklo_epi64(res_0, res_1); + coeffs[5] = _mm256_unpackhi_epi64(res_0, res_1); + coeffs[6] = _mm256_unpacklo_epi64(res_2, res_3); + coeffs[7] = _mm256_unpackhi_epi64(res_2, res_3); +} + +static INLINE void prepare_vertical_filter_coeffs_gamma0_avx2(int delta, int sy, + __m256i *coeffs) { + const __m128i filt_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + const __m128i filt_1 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + ((sy + delta) >> WARPEDDIFF_PREC_BITS))); + + __m256i res_0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(filt_0), filt_1, 0x1); + + coeffs[0] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask0_avx2)); + coeffs[1] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask1_avx2)); + coeffs[2] = _mm256_shuffle_epi8( + res_0, _mm256_load_si256((__m256i *)shuffle_gamma0_mask2_avx2)); + coeffs[3] = _mm256_shuffle_epi8( + res_0, 
_mm256_load_si256((__m256i *)shuffle_gamma0_mask3_avx2)); + + coeffs[4] = coeffs[0]; + coeffs[5] = coeffs[1]; + coeffs[6] = coeffs[2]; + coeffs[7] = coeffs[3]; +} + +static INLINE void filter_src_pixels_vertical_avx2(__m256i *horz_out, + __m256i *src, + __m256i *coeffs, + __m256i *res_lo, + __m256i *res_hi, int row) { + const __m256i src_6 = horz_out[row + 3]; + const __m256i src_7 = + _mm256_permute2x128_si256(horz_out[row + 3], horz_out[row + 4], 0x21); + + src[6] = _mm256_unpacklo_epi16(src_6, src_7); + + const __m256i res_0 = _mm256_madd_epi16(src[0], coeffs[0]); + const __m256i res_2 = _mm256_madd_epi16(src[2], coeffs[1]); + const __m256i res_4 = _mm256_madd_epi16(src[4], coeffs[2]); + const __m256i res_6 = _mm256_madd_epi16(src[6], coeffs[3]); + + const __m256i res_even = _mm256_add_epi32(_mm256_add_epi32(res_0, res_2), + _mm256_add_epi32(res_4, res_6)); + + src[7] = _mm256_unpackhi_epi16(src_6, src_7); + + const __m256i res_1 = _mm256_madd_epi16(src[1], coeffs[4]); + const __m256i res_3 = _mm256_madd_epi16(src[3], coeffs[5]); + const __m256i res_5 = _mm256_madd_epi16(src[5], coeffs[6]); + const __m256i res_7 = _mm256_madd_epi16(src[7], coeffs[7]); + + const __m256i res_odd = _mm256_add_epi32(_mm256_add_epi32(res_1, res_3), + _mm256_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + *res_lo = _mm256_unpacklo_epi32(res_even, res_odd); + *res_hi = _mm256_unpackhi_epi32(res_even, res_odd); +} + +static INLINE void store_vertical_filter_output_avx2( + const __m256i *res_lo, const __m256i *res_hi, const __m256i *res_add_const, + const __m256i *wt, const __m256i *res_sub_const, + const __m256i *round_bits_const, uint8_t *pred, ConvolveParams *conv_params, + int i, int j, int k, const int reduce_bits_vert, int p_stride, int p_width, + const int round_bits) { + __m256i res_lo_1 = *res_lo; + __m256i res_hi_1 = *res_hi; + + if (conv_params->is_compound) { + __m128i *const p_0 = + (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&conv_params + ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j]; + + res_lo_1 = _mm256_srai_epi32(_mm256_add_epi32(res_lo_1, *res_add_const), + reduce_bits_vert); + + const __m256i temp_lo_16 = _mm256_packus_epi32(res_lo_1, res_lo_1); + __m256i res_lo_16; + if (conv_params->do_average) { + __m128i *const dst8_0 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i *const dst8_1 = + (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j]; + const __m128i p_16_0 = _mm_loadl_epi64(p_0); + const __m128i p_16_1 = _mm_loadl_epi64(p_1); + const __m256i p_16 = + _mm256_inserti128_si256(_mm256_castsi128_si256(p_16_0), p_16_1, 1); + if (conv_params->use_dist_wtd_comp_avg) { + const __m256i p_16_lo = _mm256_unpacklo_epi16(p_16, temp_lo_16); + const __m256i wt_res_lo = _mm256_madd_epi16(p_16_lo, *wt); + const __m256i shifted_32 = + _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + res_lo_16 = _mm256_packus_epi32(shifted_32, shifted_32); + } else { + res_lo_16 = _mm256_srai_epi16(_mm256_add_epi16(p_16, temp_lo_16), 1); + } + res_lo_16 = _mm256_add_epi16(res_lo_16, *res_sub_const); + res_lo_16 = _mm256_srai_epi16( + _mm256_add_epi16(res_lo_16, *round_bits_const), round_bits); + const __m256i res_8_lo = _mm256_packus_epi16(res_lo_16, res_lo_16); + const __m128i res_8_lo_0 = _mm256_castsi256_si128(res_8_lo); + const __m128i res_8_lo_1 = _mm256_extracti128_si256(res_8_lo, 1); + *(uint32_t *)dst8_0 = _mm_cvtsi128_si32(res_8_lo_0); + *(uint32_t *)dst8_1 = _mm_cvtsi128_si32(res_8_lo_1); + } 
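+ // (when not averaging, the else branch keeps the rounded 16-bit partial
+ // result in conv_params->dst for the second compound pass)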
else { + const __m128i temp_lo_16_0 = _mm256_castsi256_si128(temp_lo_16); + const __m128i temp_lo_16_1 = _mm256_extracti128_si256(temp_lo_16, 1); + _mm_storel_epi64(p_0, temp_lo_16_0); + _mm_storel_epi64(p_1, temp_lo_16_1); + } + if (p_width > 4) { + __m128i *const p4_0 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + __m128i *const p4_1 = + (__m128i *)&conv_params + ->dst[(i + (k + 1) + 4) * conv_params->dst_stride + j + 4]; + res_hi_1 = _mm256_srai_epi32(_mm256_add_epi32(res_hi_1, *res_add_const), + reduce_bits_vert); + const __m256i temp_hi_16 = _mm256_packus_epi32(res_hi_1, res_hi_1); + __m256i res_hi_16; + if (conv_params->do_average) { + __m128i *const dst8_4_0 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + __m128i *const dst8_4_1 = + (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j + 4]; + const __m128i p4_16_0 = _mm_loadl_epi64(p4_0); + const __m128i p4_16_1 = _mm_loadl_epi64(p4_1); + const __m256i p4_16 = _mm256_inserti128_si256( + _mm256_castsi128_si256(p4_16_0), p4_16_1, 1); + if (conv_params->use_dist_wtd_comp_avg) { + const __m256i p_16_hi = _mm256_unpacklo_epi16(p4_16, temp_hi_16); + const __m256i wt_res_hi = _mm256_madd_epi16(p_16_hi, *wt); + const __m256i shifted_32 = + _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + res_hi_16 = _mm256_packus_epi32(shifted_32, shifted_32); + } else { + res_hi_16 = _mm256_srai_epi16(_mm256_add_epi16(p4_16, temp_hi_16), 1); + } + res_hi_16 = _mm256_add_epi16(res_hi_16, *res_sub_const); + res_hi_16 = _mm256_srai_epi16( + _mm256_add_epi16(res_hi_16, *round_bits_const), round_bits); + __m256i res_8_hi = _mm256_packus_epi16(res_hi_16, res_hi_16); + const __m128i res_8_hi_0 = _mm256_castsi256_si128(res_8_hi); + const __m128i res_8_hi_1 = _mm256_extracti128_si256(res_8_hi, 1); + *(uint32_t *)dst8_4_0 = _mm_cvtsi128_si32(res_8_hi_0); + *(uint32_t *)dst8_4_1 = _mm_cvtsi128_si32(res_8_hi_1); + } else { + const __m128i temp_hi_16_0 = _mm256_castsi256_si128(temp_hi_16); + const __m128i temp_hi_16_1 = _mm256_extracti128_si256(temp_hi_16, 1); + _mm_storel_epi64(p4_0, temp_hi_16_0); + _mm_storel_epi64(p4_1, temp_hi_16_1); + } + } + } else { + const __m256i res_lo_round = _mm256_srai_epi32( + _mm256_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); + const __m256i res_hi_round = _mm256_srai_epi32( + _mm256_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); + + const __m256i res_16bit = _mm256_packs_epi32(res_lo_round, res_hi_round); + const __m256i res_8bit = _mm256_packus_epi16(res_16bit, res_16bit); + const __m128i res_8bit0 = _mm256_castsi256_si128(res_8bit); + const __m128i res_8bit1 = _mm256_extracti128_si256(res_8bit, 1); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + __m128i *const p1 = (__m128i *)&pred[(i + (k + 1) + 4) * p_stride + j]; + + if (p_width == 4) { + *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit0); + *(uint32_t *)p1 = _mm_cvtsi128_si32(res_8bit1); + } else { + _mm_storel_epi64(p, res_8bit0); + _mm_storel_epi64(p1, res_8bit1); + } + } +} + +static INLINE void warp_vertical_filter_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + int k, row = 0; + __m256i src[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + 
_mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + int sy = sy4 + delta * (k + 4); + __m256i coeffs[8]; + prepare_vertical_filter_coeffs_avx2(gamma, delta, sy, coeffs); + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + + row += 1; + } +} + +static INLINE void warp_vertical_filter_gamma0_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + (void)gamma; + int k, row = 0; + __m256i src[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + int sy = sy4 + delta * (k + 4); + __m256i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy, coeffs); + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + row += 1; + } +} + +static INLINE void warp_vertical_filter_delta0_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + (void)delta; + int k, row = 0; + __m256i src[8], coeffs[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + 
_mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + prepare_vertical_filter_coeffs_delta0_avx2(gamma, sy4, coeffs); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + row += 1; + } +} + +static INLINE void warp_vertical_filter_gamma0_delta0_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + (void)gamma; + int k, row = 0; + __m256i src[8], coeffs[8]; + const __m256i src_0 = horz_out[0]; + const __m256i src_1 = + _mm256_permute2x128_si256(horz_out[0], horz_out[1], 0x21); + const __m256i src_2 = horz_out[1]; + const __m256i src_3 = + _mm256_permute2x128_si256(horz_out[1], horz_out[2], 0x21); + const __m256i src_4 = horz_out[2]; + const __m256i src_5 = + _mm256_permute2x128_si256(horz_out[2], horz_out[3], 0x21); + + src[0] = _mm256_unpacklo_epi16(src_0, src_1); + src[2] = _mm256_unpacklo_epi16(src_2, src_3); + src[4] = _mm256_unpacklo_epi16(src_4, src_5); + + src[1] = _mm256_unpackhi_epi16(src_0, src_1); + src[3] = _mm256_unpackhi_epi16(src_2, src_3); + src[5] = _mm256_unpackhi_epi16(src_4, src_5); + + prepare_vertical_filter_coeffs_gamma0_avx2(delta, sy4, coeffs); + + for (k = -4; k < AOMMIN(4, p_height - i - 4); k += 2) { + __m256i res_lo, res_hi; + filter_src_pixels_vertical_avx2(horz_out, src, coeffs, &res_lo, &res_hi, + row); + store_vertical_filter_output_avx2(&res_lo, &res_hi, res_add_const, wt, + res_sub_const, round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + src[0] = src[2]; + src[2] = src[4]; + src[4] = src[6]; + src[1] = src[3]; + src[3] = src[5]; + src[5] = src[7]; + row += 1; + } +} + +static INLINE void prepare_warp_vertical_filter_avx2( + uint8_t *pred, __m256i *horz_out, ConvolveParams *conv_params, + int16_t gamma, int16_t delta, int p_height, int p_stride, int p_width, + int i, int j, int sy4, const int reduce_bits_vert, + const __m256i *res_add_const, const int round_bits, + const __m256i *res_sub_const, const __m256i *round_bits_const, + const __m256i *wt) { + if (gamma == 0 && delta == 0) + warp_vertical_filter_gamma0_delta0_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, + i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, + round_bits_const, wt); + else if (gamma == 0 && delta != 0) + warp_vertical_filter_gamma0_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, + i, j, sy4, reduce_bits_vert, res_add_const, round_bits, 
res_sub_const, + round_bits_const, wt); + else if (gamma != 0 && delta == 0) + warp_vertical_filter_delta0_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, p_width, + i, j, sy4, reduce_bits_vert, res_add_const, round_bits, res_sub_const, + round_bits_const, wt); + else + warp_vertical_filter_avx2(pred, horz_out, conv_params, gamma, delta, + p_height, p_stride, p_width, i, j, sy4, + reduce_bits_vert, res_add_const, round_bits, + res_sub_const, round_bits_const, wt); +} + +static INLINE void prepare_warp_horizontal_filter_avx2( + const uint8_t *ref, __m256i *horz_out, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const __m256i *round_const, const __m128i *shift, + const __m256i *shuffle_src) { + if (alpha == 0 && beta == 0) + warp_horizontal_filter_alpha0_beta0_avx2( + ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i, + round_const, shift, shuffle_src); + else if (alpha == 0 && beta != 0) + warp_horizontal_filter_alpha0_avx2(ref, horz_out, stride, ix4, iy4, sx4, + alpha, beta, p_height, height, i, + round_const, shift, shuffle_src); + else if (alpha != 0 && beta == 0) + warp_horizontal_filter_beta0_avx2(ref, horz_out, stride, ix4, iy4, sx4, + alpha, beta, p_height, height, i, + round_const, shift, shuffle_src); + else + warp_horizontal_filter_avx2(ref, horz_out, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, round_const, shift, + shuffle_src); +} + +int64_t av1_calc_frame_error_avx2(const uint8_t *const ref, int ref_stride, + const uint8_t *const dst, int p_width, + int p_height, int dst_stride) { + int64_t sum_error = 0; + int i, j; + __m256i row_error, col_error; + __m256i zero = _mm256_set1_epi16(0); + __m256i dup_255 = _mm256_set1_epi16(255); + col_error = zero; + + for (i = 0; i < (p_height / 4); i++) { + row_error = _mm256_set1_epi16(0); + for (j = 0; j < (p_width / 16); j++) { + __m256i ref_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(ref + (j * 16) + (((i * 4) + 0) * ref_stride)))); + __m256i dst_1_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(dst + (j * 16) + (((i * 4) + 0) * dst_stride)))); + __m256i ref_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(ref + (j * 16) + (((i * 4) + 1) * ref_stride)))); + __m256i dst_2_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(dst + (j * 16) + (((i * 4) + 1) * dst_stride)))); + __m256i ref_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(ref + (j * 16) + (((i * 4) + 2) * ref_stride)))); + __m256i dst_3_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(dst + (j * 16) + (((i * 4) + 2) * dst_stride)))); + __m256i ref_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(ref + (j * 16) + (((i * 4) + 3) * ref_stride)))); + __m256i dst_4_16 = _mm256_cvtepu8_epi16(_mm_load_si128( + (__m128i *)(dst + (j * 16) + (((i * 4) + 3) * dst_stride)))); + + __m256i diff_1 = + _mm256_add_epi16(_mm256_sub_epi16(dst_1_16, ref_1_16), dup_255); + __m256i diff_2 = + _mm256_add_epi16(_mm256_sub_epi16(dst_2_16, ref_2_16), dup_255); + __m256i diff_3 = + _mm256_add_epi16(_mm256_sub_epi16(dst_3_16, ref_3_16), dup_255); + __m256i diff_4 = + _mm256_add_epi16(_mm256_sub_epi16(dst_4_16, ref_4_16), dup_255); + + __m256i diff_1_lo = _mm256_unpacklo_epi16(diff_1, zero); + __m256i diff_1_hi = _mm256_unpackhi_epi16(diff_1, zero); + __m256i diff_2_lo = _mm256_unpacklo_epi16(diff_2, zero); + __m256i diff_2_hi = _mm256_unpackhi_epi16(diff_2, zero); + __m256i diff_3_lo = _mm256_unpacklo_epi16(diff_3, zero); + 
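+      // The unpacklo/unpackhi with 'zero' above and below zero-extend each
+      // 16-bit diff to a 32-bit lane, as required by the 4-byte-scale LUT
+      // gathers that follow. Per pixel this conceptually computes
+      //   sum += error_measure_lut[255 + dst[x] - ref[x]];
+      // the lane reordering that the unpacks introduce is harmless because
+      // the per-pixel errors are simply summed afterwards.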
__m256i diff_3_hi = _mm256_unpackhi_epi16(diff_3, zero); + __m256i diff_4_lo = _mm256_unpacklo_epi16(diff_4, zero); + __m256i diff_4_hi = _mm256_unpackhi_epi16(diff_4, zero); + + __m256i error_1_lo = + _mm256_i32gather_epi32(error_measure_lut, diff_1_lo, 4); + __m256i error_1_hi = + _mm256_i32gather_epi32(error_measure_lut, diff_1_hi, 4); + __m256i error_2_lo = + _mm256_i32gather_epi32(error_measure_lut, diff_2_lo, 4); + __m256i error_2_hi = + _mm256_i32gather_epi32(error_measure_lut, diff_2_hi, 4); + __m256i error_3_lo = + _mm256_i32gather_epi32(error_measure_lut, diff_3_lo, 4); + __m256i error_3_hi = + _mm256_i32gather_epi32(error_measure_lut, diff_3_hi, 4); + __m256i error_4_lo = + _mm256_i32gather_epi32(error_measure_lut, diff_4_lo, 4); + __m256i error_4_hi = + _mm256_i32gather_epi32(error_measure_lut, diff_4_hi, 4); + + __m256i error_1 = _mm256_add_epi32(error_1_lo, error_1_hi); + __m256i error_2 = _mm256_add_epi32(error_2_lo, error_2_hi); + __m256i error_3 = _mm256_add_epi32(error_3_lo, error_3_hi); + __m256i error_4 = _mm256_add_epi32(error_4_lo, error_4_hi); + + __m256i error_1_2 = _mm256_add_epi32(error_1, error_2); + __m256i error_3_4 = _mm256_add_epi32(error_3, error_4); + + __m256i error_1_2_3_4 = _mm256_add_epi32(error_1_2, error_3_4); + row_error = _mm256_add_epi32(row_error, error_1_2_3_4); + } + __m256i col_error_lo = _mm256_unpacklo_epi32(row_error, zero); + __m256i col_error_hi = _mm256_unpackhi_epi32(row_error, zero); + __m256i col_error_temp = _mm256_add_epi64(col_error_lo, col_error_hi); + col_error = _mm256_add_epi64(col_error, col_error_temp); + // Error summation for remaining width, which is not multiple of 16 + if (p_width & 0xf) { + for (int k = 0; k < 4; ++k) { + for (int l = j * 16; l < p_width; ++l) { + sum_error += + (int64_t)error_measure(dst[l + ((i * 4) + k) * dst_stride] - + ref[l + ((i * 4) + k) * ref_stride]); + } + } + } + } + __m128i sum_error_q_0 = _mm256_castsi256_si128(col_error); + __m128i sum_error_q_1 = _mm256_extracti128_si256(col_error, 1); + sum_error_q_0 = _mm_add_epi64(sum_error_q_0, sum_error_q_1); + int64_t sum_error_d_0, sum_error_d_1; + xx_storel_64(&sum_error_d_0, sum_error_q_0); + xx_storel_64(&sum_error_d_1, _mm_srli_si128(sum_error_q_0, 8)); + sum_error = (sum_error + sum_error_d_0 + sum_error_d_1); + // Error summation for remaining height, which is not multiple of 4 + if (p_height & 0x3) { + for (int k = i * 4; k < p_height; ++k) { + for (int l = 0; l < p_width; ++l) { + sum_error += (int64_t)error_measure(dst[l + k * dst_stride] - + ref[l + k * ref_stride]); + } + } + } + return sum_error; +} + +void av1_warp_affine_avx2(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m256i horz_out[8]; + int i, j, k; + const int bd = 8; + const int reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? 
conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m256i reduce_bits_vert_const = + _mm256_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m256i res_add_const = _mm256_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + const __m256i round_const = _mm256_set1_epi16( + (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1)); + const __m128i shift = _mm_cvtsi32_si128(reduce_bits_horiz); + + __m256i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const_avx2(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, + &wt); + + __m256i res_add_const_1; + if (conv_params->is_compound == 1) { + res_add_const_1 = _mm256_add_epi32(reduce_bits_vert_const, res_add_const); + } else { + res_add_const_1 = _mm256_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + } + const int32_t const1 = alpha * (-4) + beta * (-4) + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + const int32_t const2 = gamma * (-4) + delta * (-4) + + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + const int32_t const3 = ((1 << WARP_PARAM_REDUCE_BITS) - 1); + const int16_t const4 = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)); + const int16_t const5 = (1 << (FILTER_BITS - reduce_bits_horiz)); + + __m256i shuffle_src[4]; + shuffle_src[0] = _mm256_load_si256((__m256i *)shuffle_src0); + shuffle_src[1] = _mm256_load_si256((__m256i *)shuffle_src1); + shuffle_src[2] = _mm256_load_si256((__m256i *)shuffle_src2); + shuffle_src[3] = _mm256_load_si256((__m256i *)shuffle_src3); + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += const1; + sy4 += const2; + + sx4 &= ~const3; + sy4 &= ~const3; + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
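+      // (Each output pixel reads source columns ix4 - 7 .. ix4 + 7 before
+      // clamping, so ix4 <= -7 pins every tap to column 0 and
+      // ix4 >= width + 6 pins every tap to column width - 1. Also note that
+      // horz_out[] packs two filtered rows per 256-bit register, one row per
+      // 128-bit half, which is why the row loops below step by k += 2 and
+      // handle the odd 15th row separately after the loop.)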
+ + if (ix4 <= -7) { + int iy, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i temp_0 = + _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m256i temp_1 = + _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + horz_out[row] = _mm256_set1_epi16(const4 + ref[iy * stride] * const5); + } else if (ix4 >= width + 6) { + int iy, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + const __m256i temp_0 = _mm256_set1_epi16( + const4 + ref[iy * stride + (width - 1)] * const5); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + const __m256i temp_1 = _mm256_set1_epi16( + const4 + ref[iy * stride + (width - 1)] * const5); + horz_out[row] = _mm256_blend_epi32(temp_0, temp_1, 0xf0); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + horz_out[row] = + _mm256_set1_epi16(const4 + ref[iy * stride + (width - 1)] * const5); + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + int iy, sx, row = 0; + for (k = -7; k <= (AOMMIN(8, p_height - i) - 2); k += 2) { + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + __m128i src0 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + iy = iy4 + k + 1; + iy = clamp(iy, 0, height - 1); + __m128i src1 = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src0 = _mm_shuffle_epi8(src0, shuffle_reg_left); + src1 = _mm_shuffle_epi8(src1, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src0 = _mm_shuffle_epi8(src0, shuffle_reg_right); + src1 = _mm_shuffle_epi8(src1, shuffle_reg_right); + } + sx = sx4 + beta * (k + 4); + const __m256i src_01 = + _mm256_inserti128_si256(_mm256_castsi128_si256(src0), src1, 0x1); + horizontal_filter_avx2(src_01, horz_out, sx, alpha, beta, row, + shuffle_src, &round_const, &shift); + row += 1; + } + iy = iy4 + k; + iy = clamp(iy, 0, height - 1); + __m128i src = _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src = _mm_shuffle_epi8(src, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = + _mm_loadu_si128((__m128i *)warp_pad_right[out_of_boundary_right]); + src = _mm_shuffle_epi8(src, shuffle_reg_right); + } + sx = sx4 + beta * (k + 4); + const __m256i src_01 = _mm256_castsi128_si256(src); + __m256i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels_avx2(src_01, horz_out, coeff, shuffle_src, + &round_const, &shift, row); + } else { + prepare_warp_horizontal_filter_avx2( + ref, horz_out, stride, ix4, iy4, sx4, alpha, beta, p_height, height, + i, &round_const, &shift, shuffle_src); + } + + // Vertical filter + prepare_warp_vertical_filter_avx2( + pred, horz_out, conv_params, gamma, delta, p_height, p_stride, + p_width, i, j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, + 
&res_sub_const, &round_bits_const, &wt); + } + } +} diff --git a/libs/libaom/src/av1/common/x86/warp_plane_sse2.c b/libs/libaom/src/av1/common/x86/warp_plane_sse2.c new file mode 100644 index 000000000..6ff666518 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/warp_plane_sse2.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "aom_dsp/x86/synonyms.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" + +int64_t av1_calc_frame_error_sse2(const uint8_t *const ref, int ref_stride, + const uint8_t *const dst, int p_width, + int p_height, int dst_stride) { + int64_t sum_error = 0; + int i, j; + __m128i row_error, col_error; + __m128i zero = _mm_set1_epi16(0); + __m128i dup_255 = _mm_set1_epi16(255); + col_error = zero; + for (i = 0; i < (p_height); i++) { + row_error = zero; + for (j = 0; j < (p_width / 16); j++) { + __m128i ref_8 = + _mm_load_si128((__m128i *)(ref + (j * 16) + (i * ref_stride))); + __m128i dst_8 = + _mm_load_si128((__m128i *)(dst + (j * 16) + (i * dst_stride))); + __m128i ref_16_lo = _mm_unpacklo_epi8(ref_8, zero); + __m128i ref_16_hi = _mm_unpackhi_epi8(ref_8, zero); + __m128i dst_16_lo = _mm_unpacklo_epi8(dst_8, zero); + __m128i dst_16_hi = _mm_unpackhi_epi8(dst_8, zero); + + __m128i diff_1 = + _mm_add_epi16(_mm_sub_epi16(dst_16_lo, ref_16_lo), dup_255); + __m128i diff_2 = + _mm_add_epi16(_mm_sub_epi16(dst_16_hi, ref_16_hi), dup_255); + + __m128i error_1_lo = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 3)], + error_measure_lut[_mm_extract_epi16(diff_1, 2)], + error_measure_lut[_mm_extract_epi16(diff_1, 1)], + error_measure_lut[_mm_extract_epi16(diff_1, 0)]); + __m128i error_1_hi = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_1, 7)], + error_measure_lut[_mm_extract_epi16(diff_1, 6)], + error_measure_lut[_mm_extract_epi16(diff_1, 5)], + error_measure_lut[_mm_extract_epi16(diff_1, 4)]); + __m128i error_2_lo = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 3)], + error_measure_lut[_mm_extract_epi16(diff_2, 2)], + error_measure_lut[_mm_extract_epi16(diff_2, 1)], + error_measure_lut[_mm_extract_epi16(diff_2, 0)]); + __m128i error_2_hi = + _mm_set_epi32(error_measure_lut[_mm_extract_epi16(diff_2, 7)], + error_measure_lut[_mm_extract_epi16(diff_2, 6)], + error_measure_lut[_mm_extract_epi16(diff_2, 5)], + error_measure_lut[_mm_extract_epi16(diff_2, 4)]); + + __m128i error_1 = _mm_add_epi32(error_1_lo, error_1_hi); + __m128i error_2 = _mm_add_epi32(error_2_lo, error_2_hi); + __m128i error_1_2 = _mm_add_epi32(error_1, error_2); + + row_error = _mm_add_epi32(row_error, error_1_2); + } + __m128i col_error_lo = _mm_unpacklo_epi32(row_error, zero); + __m128i col_error_hi = _mm_unpackhi_epi32(row_error, zero); + __m128i col_error_temp = _mm_add_epi64(col_error_lo, col_error_hi); + col_error = _mm_add_epi64(col_error, col_error_temp); + // Error summation for remaining width, which is not multiple of 16 + if (p_width & 0xf) { + for (int l = j * 16; l < p_width; ++l) { + sum_error += 
(int64_t)error_measure(dst[l + i * dst_stride] - + ref[l + i * ref_stride]); + } + } + } + int64_t sum_error_d_0, sum_error_d_1; + xx_storel_64(&sum_error_d_0, col_error); + xx_storel_64(&sum_error_d_1, _mm_srli_si128(col_error, 8)); + sum_error = (sum_error + sum_error_d_0 + sum_error_d_1); + return sum_error; +} diff --git a/libs/libaom/src/av1/common/x86/warp_plane_sse4.c b/libs/libaom/src/av1/common/x86/warp_plane_sse4.c new file mode 100644 index 000000000..10ddf92d0 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/warp_plane_sse4.c @@ -0,0 +1,963 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> +#include <smmintrin.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/warped_motion.h" + +/* This is a modified version of 'av1_warped_filter' from warped_motion.c: + * Each coefficient is stored in 8 bits instead of 16 bits + * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7 + + This is done in order to avoid overflow: Since the tap with the largest + coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation + order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular + convolve functions. + + Instead, we use the summation order + ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)). + The rearrangement of coefficients in this table is so that we can get the + coefficients into the correct order more quickly.
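+
+   As a concrete example, the table row { 0, -3, 4, 1, 1, 127, -2, 0} below
+   stores the taps in the order 0, 2, 4, 6, 1, 3, 5, 7, i.e. c2 = -3 and
+   c3 = 127. With the adjacent pairing (2 + 3), |c2| + |c3| = 130, so a
+   255-valued pixel could saturate the signed 16-bit result of
+   _mm_maddubs_epi16 (255 * 130 > 32767). With the (1 + 3) pairing used
+   here, |c1| + |c3| = 128 and the pair of products still fits.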
+*/ +/* clang-format off */ +DECLARE_ALIGNED(8, const int8_t, + av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = { +#if WARPEDPIXEL_PREC_BITS == 6 + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0}, + { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0}, + { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0}, + { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0}, + { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0}, + { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0}, + { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0}, + { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0}, + { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0}, + { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0}, + { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0}, + { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0}, + { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0}, + { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0}, + { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0}, + { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0}, + { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0}, + { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0}, + { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1}, + {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1}, + {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1}, + {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2}, + {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2}, + {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2}, + {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2}, + {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2}, + {-2, -21, 84, 8, 8, 74, -21, 
-2}, {-2, -21, 87, 8, 8, 72, -22, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2}, + {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2}, + {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2}, + {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2}, + {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2}, + {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1}, + {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0}, + { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0}, + { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1}, + { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1}, + { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2}, + { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3}, + { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3}, + { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4}, + { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4}, + { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4}, + { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4}, + { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4}, + { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4}, + { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3}, + { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2}, + { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2}, + { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1}, + { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0}, + // dummy (replicate row index 191) + { 0, 0, 2, -1, 0, 0, 127, 0}, + +#else + // [-1, 0) + { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0}, + { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0}, + { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0}, + { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0}, + { 3, 
111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0}, + { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0}, + { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0}, + { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0}, + { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0}, + { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0}, + { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0}, + { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0}, + { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0}, + { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0}, + { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0}, + { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0}, + // [0, 1) + { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0}, + { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1}, + {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1}, + {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1}, + {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2}, + {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2}, + {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2}, + {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2}, + {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2}, + {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2}, + {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2}, + {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2}, + {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2}, + {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1}, + {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1}, + { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0}, + // [1, 2) + { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0}, + { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1}, + { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2}, + { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3}, + { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3}, + { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3}, + { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4}, + { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4}, + { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4}, + { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4}, + { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4}, + { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3}, + { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3}, + { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2}, + { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1}, + { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1}, + // dummy (replicate row index 95) + { 0, 0, 4, -3, 0, -1, 127, 1}, +#endif // WARPEDPIXEL_PREC_BITS == 6 +}; +/* clang-format on */ + +// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15 +// in an SSE register into two sequences: +// 0, 2, 2, 4, ..., 12, 12, 14, +// 1, 3, 3, 5, ..., 13, 13, 15, +DECLARE_ALIGNED(16, static const uint8_t, + even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 0 }; + +DECLARE_ALIGNED(16, static const uint8_t, + odd_mask[16]) = { 1, 3, 3, 5, 5, 
7, 7, 9, + 9, 11, 11, 13, 13, 15, 15, 0 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3, + 2, 3, 2, 3, 2, 3, 2, 3 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5, + 4, 5, 4, 5, 4, 5, 4, 5 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7, + 6, 7, 6, 7, 6, 7, 6, 7 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7, + 4, 5, 6, 7, 4, 5, 6, 7 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11, + 8, 9, 10, 11, 8, 9, 10, 11 }; + +DECLARE_ALIGNED(16, static const uint8_t, + shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15, + 12, 13, 14, 15, 12, 13, 14, 15 }; + +static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff, + const int offset_bits_horiz, + const int reduce_bits_horiz, int k) { + const __m128i src_even = + _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask)); + const __m128i src_odd = + _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask)); + // The pixel order we need for 'src' is: + // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9 + const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd); + const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]); + // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13 + const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4), + _mm_srli_si128(src_odd, 4)); + const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]); + // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10 + const __m128i src_13 = + _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2)); + const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]); + // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14 + const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4), + _mm_srli_si128(src_even, 6)); + const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]); + + const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) + + ((1 << reduce_bits_horiz) >> 1)); + + // Note: The values res_02 + res_46 and res_13 + res_57 both + // fit into int16s at this point, but their sum may be too wide to fit + // into an int16. However, once we also add round_const, the sum of + // all of these fits into a uint16. + // + // The wrapping behaviour of _mm_add_* is used here to make sure we + // get the correct result despite converting between different + // (implicit) types. 
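+  // (With bd == 8, offset_bits_horiz == bd + FILTER_BITS - 1 == 14, so
+  // round_const >= 1 << 14, which is enough to bring the most negative
+  // possible partial sum back into the non-negative range.)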
+ const __m128i res_even = _mm_add_epi16(res_02, res_46); + const __m128i res_odd = _mm_add_epi16(res_13, res_57); + const __m128i res = + _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const); + tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz)); +} + +static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx, + __m128i *coeff) { + // Filter even-index pixels + const __m128i tmp_0 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_1 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_2 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_3 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_4 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_5 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_6 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]); + const __m128i tmp_7 = _mm_loadl_epi64( + (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]); + + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2 + const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3 + const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6 + const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6); + // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7 + const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7); + + // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6 + const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10); + // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6 + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10); + // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7 + const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11); + // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7 + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11); + + // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 + coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14); + // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 + coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14); + // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 + coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15); + // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 + coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx, + __m128i *coeff) { + // Filter even-index pixels + const __m128i tmp_0 = + _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]); + + // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7 + coeff[0] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01)); + // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7 + coeff[1] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23)); + // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7 + coeff[2] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45)); + // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7 + coeff[3] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67)); +} + +static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx, + int alpha, int k, + const int offset_bits_horiz, + const int 
reduce_bits_horiz) { + __m128i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx, coeff); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); +} + +static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp, + int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, + int p_height, int height, int i, + const int offset_bits_horiz, + const int reduce_bits_horiz) { + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } +} + +static INLINE void warp_horizontal_filter_alpha0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)alpha; + int k; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + + __m128i coeff[4]; + prepare_horizontal_filter_coeff_alpha0(sx, coeff); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void warp_horizontal_filter_beta0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + int k; + __m128i coeff[4]; + prepare_horizontal_filter_coeff(alpha, sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void warp_horizontal_filter_alpha0_beta0( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + (void)beta; + (void)alpha; + int k; + + __m128i coeff[4]; + prepare_horizontal_filter_coeff_alpha0(sx4, coeff); + + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + + // Load source pixels + const __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k); + } +} + +static INLINE void unpack_weights_and_set_round_const( + ConvolveParams *conv_params, const int round_bits, const int offset_bits, + __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) { + *res_sub_const = + _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) - + (1 << (offset_bits - conv_params->round_1 - 1))); + *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1)); + + const int w0 = conv_params->fwd_offset; + const int w1 = conv_params->bck_offset; + const __m128i wt0 = _mm_set1_epi16((int16_t)w0); + const __m128i wt1 = _mm_set1_epi16((int16_t)w1); + *wt = 
_mm_unpacklo_epi16(wt0, wt1); +} + +static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy, + __m128i *coeffs) { + const __m128i tmp_0 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_2 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_4 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_6 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2); + const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6); + const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2); + const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6); + + // even coeffs + coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10); + coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10); + coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14); + coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14); + + const __m128i tmp_1 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_3 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_5 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS))); + const __m128i tmp_7 = + _mm_loadu_si128((__m128i *)(av1_warped_filter + + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS))); + + const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3); + const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7); + const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3); + const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7); + + // odd coeffs + coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11); + coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11); + coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15); + coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15); +} + +static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy, + __m128i *coeffs) { + const __m128i tmp_0 = _mm_loadu_si128( + (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS))); + + // even coeffs + coeffs[0] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0)); + coeffs[1] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1)); + coeffs[2] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2)); + coeffs[3] = + _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3)); + + // odd coeffs + coeffs[4] = coeffs[0]; + coeffs[5] = coeffs[1]; + coeffs[6] = coeffs[2]; + coeffs[7] = coeffs[3]; +} + +static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs, + __m128i *res_lo, __m128i *res_hi, + int k) { + // Load from tmp and rearrange pairs of consecutive rows into the + // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7 + const __m128i *src = tmp + (k + 4); + const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]); + const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]); + const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]); + const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]); + const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]); + const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]); + const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]); + + const __m128i res_even = + _mm_add_epi32(_mm_add_epi32(res_0, res_2), 
_mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]); + const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]); + const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]); + const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]); + const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]); + const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]); + const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]); + + const __m128i res_odd = + _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + *res_lo = _mm_unpacklo_epi32(res_even, res_odd); + *res_hi = _mm_unpackhi_epi32(res_even, res_odd); +} + +static INLINE void store_vertical_filter_output( + __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const, + const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const, + uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k, + const int reduce_bits_vert, int p_stride, int p_width, + const int round_bits) { + __m128i res_lo_1 = *res_lo; + __m128i res_hi_1 = *res_hi; + + if (conv_params->is_compound) { + __m128i *const p = + (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j]; + res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const), + reduce_bits_vert); + const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1); + __m128i res_lo_16; + if (conv_params->do_average) { + __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + const __m128i p_16 = _mm_loadl_epi64(p); + + if (conv_params->use_dist_wtd_comp_avg) { + const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); + const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1); + } + + res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const); + + res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const), + round_bits); + __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16); + *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo); + } else { + _mm_storel_epi64(p, temp_lo_16); + } + if (p_width > 4) { + __m128i *const p4 = + (__m128i *)&conv_params + ->dst[(i + k + 4) * conv_params->dst_stride + j + 4]; + res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const), + reduce_bits_vert); + const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1); + __m128i res_hi_16; + + if (conv_params->do_average) { + __m128i *const dst8_4 = + (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; + const __m128i p4_16 = _mm_loadl_epi64(p4); + + if (conv_params->use_dist_wtd_comp_avg) { + const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); + const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt); + const __m128i shifted_32 = + _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32); + } else { + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1); + } + res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const); + + res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const), + round_bits); + __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16); + *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi); + + } else { + _mm_storel_epi64(p4, temp_hi_16); + } + } + } else { + const __m128i 
res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + // Store, blending with 'pred' if needed + __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j]; + + // Note: If we're outputting a 4x4 block, we need to be very careful + // to only output 4 pixels at this point, to avoid encode/decode + // mismatches when encoding with multiple threads. + if (p_width == 4) { + *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit); + } else { + _mm_storel_epi64(p, res_8bit); + } + } +} + +static INLINE void warp_vertical_filter( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs(gamma, sy, coeffs); + + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_gamma0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + int k; + (void)gamma; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + int sy = sy4 + delta * (k + 4); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0(sy, coeffs); + + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_delta0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + (void)delta; + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs(gamma, sy4, coeffs); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, 
+ &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void warp_vertical_filter_gamma0_delta0( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + (void)delta; + (void)gamma; + int k; + __m128i res_sub_const, round_bits_const, wt; + unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits, + &res_sub_const, &round_bits_const, &wt); + + __m128i coeffs[8]; + prepare_vertical_filter_coeffs_gamma0(sy4, coeffs); + // Vertical filter + for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) { + __m128i res_lo; + __m128i res_hi; + filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k); + + store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt, + &res_sub_const, &round_bits_const, pred, + conv_params, i, j, k, reduce_bits_vert, + p_stride, p_width, round_bits); + } +} + +static INLINE void prepare_warp_vertical_filter( + uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma, + int16_t delta, int p_height, int p_stride, int p_width, int i, int j, + int sy4, const int reduce_bits_vert, const __m128i *res_add_const, + const int round_bits, const int offset_bits) { + if (gamma == 0 && delta == 0) + warp_vertical_filter_gamma0_delta0( + pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j, + sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits); + else if (gamma == 0 && delta != 0) + warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); + else if (gamma != 0 && delta == 0) + warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); + else + warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height, + p_stride, p_width, i, j, sy4, reduce_bits_vert, + res_add_const, round_bits, offset_bits); +} + +static INLINE void prepare_warp_horizontal_filter( + const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4, + int32_t sx4, int alpha, int beta, int p_height, int height, int i, + const int offset_bits_horiz, const int reduce_bits_horiz) { + if (alpha == 0 && beta == 0) + warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + else if (alpha == 0 && beta != 0) + warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); + else if (alpha != 0 && beta == 0) + warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); + else + warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta, + p_height, height, i, offset_bits_horiz, + reduce_bits_horiz); +} + +void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width, + int height, int stride, uint8_t *pred, int p_col, + int p_row, int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + __m128i tmp[15]; + int i, j, k; + const int bd = 8; + const int 
reduce_bits_horiz = conv_params->round_0; + const int reduce_bits_vert = conv_params->is_compound + ? conv_params->round_1 + : 2 * FILTER_BITS - reduce_bits_horiz; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; + const __m128i reduce_bits_vert_const = + _mm_set1_epi32(((1 << reduce_bits_vert) >> 1)); + const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + + /* Note: For this code to work, the left/right frame borders need to be + extended by at least 13 pixels each. By the time we get here, other + code will have set up this border, but we allow an explicit check + for debugging purposes. + */ + /*for (i = 0; i < height; ++i) { + for (j = 0; j < 13; ++j) { + assert(ref[i * stride - 13 + j] == ref[i * stride]); + assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]); + } + }*/ + __m128i res_add_const_1; + if (conv_params->is_compound == 1) { + res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const); + } else { + res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) + + ((1 << reduce_bits_vert) >> 1)); + } + + for (i = 0; i < p_height; i += 8) { + for (j = 0; j < p_width; j += 8) { + const int32_t src_x = (p_col + j + 4) << subsampling_x; + const int32_t src_y = (p_row + i + 4) << subsampling_y; + const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0]; + const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1]; + const int32_t x4 = dst_x >> subsampling_x; + const int32_t y4 = dst_y >> subsampling_y; + + int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS; + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + // Add in all the constant terms, including rounding and offset + sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) + + (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Horizontal filter + // If the block is aligned such that, after clamping, every sample + // would be taken from the leftmost/rightmost column, then we can + // skip the expensive horizontal filter. 
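In that degenerate case every tap reads the same edge pixel, and since the warp taps sum to 1 << FILTER_BITS the whole intermediate row collapses to a single constant. A minimal scalar sketch of that constant (the helper and parameter names are illustrative; it mirrors the _mm_set1_epi16 expression used in the two boundary branches below):

#include <stdint.h>

// Constant produced by the horizontal stage when all taps clamp to one
// column: the rounding offset plus the edge pixel scaled by the taps'
// total weight, both pre-shifted down by reduce_bits_horiz.
static int16_t boundary_row_constant(int bd, int filter_bits,
                                     int reduce_bits_horiz,
                                     uint8_t edge_pixel) {
  const int offset = 1 << (bd + filter_bits - reduce_bits_horiz - 1);
  return (int16_t)(offset +
                   edge_pixel * (1 << (filter_bits - reduce_bits_horiz)));
}

The ix4 <= -7 and ix4 >= width + 6 branches below fill tmp[] with exactly this value instead of running the 8-tap filter.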
+ if (ix4 <= -7) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = _mm_set1_epi16( + (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (ix4 >= width + 6) { + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + tmp[k + 7] = + _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) + + ref[iy * stride + (width - 1)] * + (1 << (FILTER_BITS - reduce_bits_horiz))); + } + } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + for (k = -7; k < AOMMIN(8, p_height - i); ++k) { + int iy = iy4 + k; + if (iy < 0) + iy = 0; + else if (iy > height - 1) + iy = height - 1; + int sx = sx4 + beta * (k + 4); + + // Load source pixels + __m128i src = + _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7)); + if (out_of_boundary_left >= 0) { + const __m128i shuffle_reg_left = + _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]); + src = _mm_shuffle_epi8(src, shuffle_reg_left); + } + if (out_of_boundary_right >= 0) { + const __m128i shuffle_reg_right = _mm_loadu_si128( + (__m128i *)warp_pad_right[out_of_boundary_right]); + src = _mm_shuffle_epi8(src, shuffle_reg_right); + } + horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz, + reduce_bits_horiz); + } + } else { + prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, + beta, p_height, height, i, + offset_bits_horiz, reduce_bits_horiz); + } + + // Vertical filter + prepare_warp_vertical_filter( + pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, + j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits); + } + } +} diff --git a/libs/libaom/src/av1/common/x86/wiener_convolve_avx2.c b/libs/libaom/src/av1/common/x86/wiener_convolve_avx2.c new file mode 100644 index 000000000..b7ac68383 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/wiener_convolve_avx2.c @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <assert.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +// 128-bit xmmwords are written as [ ... ] with the MSB on the left. +// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB +// on the left. +// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be +// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ]. + +// Exploiting the range of wiener filter coefficients, +// horizontal filtering can be done in 16 bit intermediate precision.
+// The details are as follows : +// Consider the horizontal wiener filter coefficients of the following form : +// [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0] +// Subtracting 2^(FILTER_BITS) from the centre tap we get the following : +// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0] +// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3 +// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit +// precision. Finally, after rounding the above result by round_0, we multiply +// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the +// horizontal filter output. + +void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const ConvolveParams *conv_params) { + const int bd = 8; + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + (void)x_step_q4; + (void)y_step_q4; + + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]); + int im_h = h + SUBPEL_TAPS - 2; + int im_stride = 8; + memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE); + int i, j; + const int center_tap = (SUBPEL_TAPS - 1) / 2; + const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; + + __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center; + + assert(conv_params->round_0 > 0); + + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2); + + const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x); + const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs_h[0] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u)); + // coeffs 2 3 2 3 2 3 2 3 + coeffs_h[1] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u)); + // coeffs 4 5 4 5 4 5 4 5 + coeffs_h[2] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u)); + // coeffs 6 7 6 7 6 7 6 7 + coeffs_h[3] = + _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu)); + + const __m256i round_const_h = + _mm256_set1_epi16((1 << (conv_params->round_0 - 1))); + const __m256i round_const_horz = + _mm256_set1_epi16((1 << (bd + FILTER_BITS - conv_params->round_0 - 1))); + const __m256i clamp_low = _mm256_setzero_si256(); + const __m256i clamp_high = + _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0); + + // Add an offset to account for the "add_src" part of the convolve function. 
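The 16-bit headroom claimed at the top of this file can be spot-checked by brute force: the sum is linear in every tap and every pixel, so evaluating the corner values is sufficient. A standalone sketch follows (the tap ranges are assumptions taken from libaom's restoration.h, worth re-checking there; with these bounds the worst case comes out at ±32640, just inside int16):

#include <stdio.h>

// Brute-force the extremes of
//   C0*(p0+p6) + C1*(p1+p5) + C2*(p2+p4) - 2*(C0+C1+C2)*p3
// over the assumed wiener tap ranges and 8-bit pixel values.
int main(void) {
  const int c0s[2] = { -5, 10 };   // WIENER_FILT_TAP0_{MINV,MAXV} (assumed)
  const int c1s[2] = { -23, 8 };   // WIENER_FILT_TAP1_{MINV,MAXV} (assumed)
  const int c2s[2] = { -17, 46 };  // WIENER_FILT_TAP2_{MINV,MAXV} (assumed)
  int vmin = 0, vmax = 0;
  for (int a = 0; a < 2; ++a) {
    for (int b = 0; b < 2; ++b) {
      for (int c = 0; c < 2; ++c) {
        for (int p = 0; p < 128; ++p) {  // 7 pixels, each 0 or 255
          int px[7];
          for (int t = 0; t < 7; ++t) px[t] = ((p >> t) & 1) ? 255 : 0;
          const int c0 = c0s[a], c1 = c1s[b], c2 = c2s[c];
          const int v = c0 * (px[0] + px[6]) + c1 * (px[1] + px[5]) +
                        c2 * (px[2] + px[4]) - 2 * (c0 + c1 + c2) * px[3];
          if (v < vmin) vmin = v;
          if (v > vmax) vmax = v;
        }
      }
    }
  }
  printf("worst case [%d, %d]; int16_t holds [-32768, 32767]\n", vmin, vmax);
  return 0;
}

Returning to the kernel: the offset_0 vector built just below restores the 1 << FILTER_BITS bias that the stored centre tap omits, which is what the "add_src" comment above refers to.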
+ const __m128i zero_128 = _mm_setzero_si128(); + const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0); + + const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff); + + const __m256i round_const_v = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); + + for (j = 0; j < w; j += 8) { + for (i = 0; i < im_h; i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + // Load the next line + if (i + 1 < im_h) + data = _mm256_inserti128_si256( + data, + _mm_loadu_si128( + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), + 1); + + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); + + res = + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); + + __m256i data_0 = _mm256_shuffle_epi8(data, filt_center); + + // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to + // the result + data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0); + res = _mm256_add_epi16(res, data_0); + res = _mm256_add_epi16(res, round_const_horz); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res_clamped); + } + + /* Vertical filter */ + { + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); + + __m256i s[8]; + s[0] = _mm256_unpacklo_epi16(src_0, src_1); + s[1] = _mm256_unpacklo_epi16(src_2, src_3); + s[2] = _mm256_unpacklo_epi16(src_4, src_5); + + s[4] = _mm256_unpackhi_epi16(src_0, src_1); + s[5] = _mm256_unpackhi_epi16(src_2, src_3); + s[6] = _mm256_unpackhi_epi16(src_4, src_5); + + for (i = 0; i < h - 1; i += 2) { + const int16_t *data = &im_block[i * im_stride]; + + const __m256i s6 = + _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); + const __m256i s7 = + _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + + s[3] = _mm256_unpacklo_epi16(s6, s7); + s[7] = _mm256_unpackhi_epi16(s6, s7); + + __m256i res_a = convolve(s, coeffs_v); + __m256i res_b = convolve(s + 4, coeffs_v); + + const __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_v), round_shift_v); + const __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + // 8 bit conversion and saturation to uint8 + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = 
_mm256_extracti128_si256(res_8b, 1); + + // Store values into the destination buffer + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; + + _mm_storel_epi64(p_0, res_0); + _mm_storel_epi64(p_1, res_1); + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + if (h - i) { + s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20); + s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20); + s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20); + + const int16_t *data = &im_block[i * im_stride]; + const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride)); + const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); + + __m128i s3 = _mm_unpacklo_epi16(s6_, s7_); + __m128i s7 = _mm_unpackhi_epi16(s6_, s7_); + + s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1); + __m256i convolveres = convolve(s, coeffs_v); + + const __m256i res_round = _mm256_sra_epi32( + _mm256_add_epi32(convolveres, round_const_v), round_shift_v); + + /* rounding code */ + // 16 bit conversion + __m128i reslo = _mm256_castsi256_si128(res_round); + __m128i reshi = _mm256_extracti128_si256(res_round, 1); + const __m128i res_16bit = _mm_packus_epi32(reslo, reshi); + + // 8 bit conversion and saturation to uint8 + const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit); + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + _mm_storel_epi64(p_0, res_8b); + } + } + } +} diff --git a/libs/libaom/src/av1/common/x86/wiener_convolve_sse2.c b/libs/libaom/src/av1/common/x86/wiener_convolve_sse2.c new file mode 100644 index 000000000..f9d00b733 --- /dev/null +++ b/libs/libaom/src/av1/common/x86/wiener_convolve_sse2.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> +#include <assert.h> + +#include "config/av1_rtcd.h" + +#include "av1/common/convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" + +void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, + const ConvolveParams *conv_params) { + const int bd = 8; + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + (void)x_step_q4; + (void)y_step_q4; + + DECLARE_ALIGNED(16, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 2; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); + int i, j; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero = _mm_setzero_si128(); + // Add an offset to account for the "add_src" part of the convolve function.
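In this SSE2 version the offset is folded into both the horizontal and the vertical tap sets. A scalar view of the adjustment (the helper name is illustrative; FILTER_BITS is 7 in aom_dsp/aom_filter.h):

#include <stdint.h>

#define FILTER_BITS 7  // as defined in aom_dsp/aom_filter.h

// The stored Wiener taps carry the centre tap minus 1 << FILTER_BITS so
// that intermediate sums stay narrow; adding it back makes one convolution
// pass return the source pixel plus the filtered correction ("add_src").
static void restore_center_tap(int16_t taps[8]) {
  taps[3] += (int16_t)(1 << FILTER_BITS);  // lane 3 = centre of the 7-tap kernel
}

The _mm_insert_epi16 below builds exactly this bias as a vector, placing 1 << FILTER_BITS in lane 3 and zeros elsewhere.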
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); + + /* Horizontal filter */ + { + const __m128i coeffs_x = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = _mm_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + conv_params->round_0); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + conv_params->round_0); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = _mm_min_epi16( + _mm_max_epi16(res, zero), + _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1)); + _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); + } + } + } + + /* Vertical filter */ + { + const __m128i coeffs_y = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + 
conv_params->round_1 - 1))); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), conv_params->round_1); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), conv_params->round_1); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storel_epi64(p, res_8bit); + } + } + } +} diff --git a/libs/libaom/src/av1/decoder/accounting.c b/libs/libaom/src/av1/decoder/accounting.c new file mode 100644 index 000000000..2e58d09e0 --- /dev/null +++ b/libs/libaom/src/av1/decoder/accounting.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#include "aom/aom_integer.h" +#include "av1/decoder/accounting.h" + +static int accounting_hash(const char *str) { + uint32_t val; + const unsigned char *ustr; + val = 0; + ustr = (const unsigned char *)str; + /* This is about the worst hash one can design, but it should be good enough + here. */ + while (*ustr) val += *ustr++; + return val % AOM_ACCOUNTING_HASH_SIZE; +} + +/* Dictionary lookup based on an open-addressing hash table. */ +int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str) { + int hash; + size_t len; + AccountingDictionary *dictionary; + dictionary = &accounting->syms.dictionary; + hash = accounting_hash(str); + while (accounting->hash_dictionary[hash] != -1) { + if (strcmp(dictionary->strs[accounting->hash_dictionary[hash]], str) == 0) { + return accounting->hash_dictionary[hash]; + } + hash++; + if (hash == AOM_ACCOUNTING_HASH_SIZE) hash = 0; + } + /* No match found. */ + assert(dictionary->num_strs + 1 < MAX_SYMBOL_TYPES); + accounting->hash_dictionary[hash] = dictionary->num_strs; + len = strlen(str); + dictionary->strs[dictionary->num_strs] = malloc(len + 1); + snprintf(dictionary->strs[dictionary->num_strs], len + 1, "%s", str); + dictionary->num_strs++; + return dictionary->num_strs - 1; +} + +void aom_accounting_init(Accounting *accounting) { + int i; + accounting->num_syms_allocated = 1000; + accounting->syms.syms = + malloc(sizeof(AccountingSymbol) * accounting->num_syms_allocated); + accounting->syms.dictionary.num_strs = 0; + assert(AOM_ACCOUNTING_HASH_SIZE > 2 * MAX_SYMBOL_TYPES); + for (i = 0; i < AOM_ACCOUNTING_HASH_SIZE; i++) + accounting->hash_dictionary[i] = -1; + aom_accounting_reset(accounting); +} + +void aom_accounting_reset(Accounting *accounting) { + accounting->syms.num_syms = 0; + accounting->syms.num_binary_syms = 0; + accounting->syms.num_multi_syms = 0; + accounting->context.x = -1; + accounting->context.y = -1; + accounting->last_tell_frac = 0; +} + +void aom_accounting_clear(Accounting *accounting) { + int i; + AccountingDictionary *dictionary; + free(accounting->syms.syms); + dictionary = &accounting->syms.dictionary; + for (i = 0; i < dictionary->num_strs; i++) { + free(dictionary->strs[i]); + } +} + +void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y) { + accounting->context.x = x; + accounting->context.y = y; +} + +void aom_accounting_record(Accounting *accounting, const char *str, + uint32_t bits) { + AccountingSymbol sym; + // Reuse previous symbol if it has the same context and symbol id.
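Taken together, the functions in this file form a small bit-accounting API. A hypothetical usage sketch (the "read_mv" label is made up for illustration; the bits argument is in 1/8-bit units, matching AOM_ACCT_BITRES):

#include "av1/decoder/accounting.h"

void example_accounting_session(void) {
  Accounting acct;
  aom_accounting_init(&acct);
  aom_accounting_set_context(&acct, /*x=*/0, /*y=*/0);  // current block
  // Record 12.5 bits spent on a hypothetical "read_mv" symbol:
  // 12.5 * 8 == 100 eighth-bit units.
  aom_accounting_record(&acct, "read_mv", 100);
  aom_accounting_dump(&acct);   // prints per-symbol totals as bits / 8.0
  aom_accounting_clear(&acct);  // frees the symbol array and dictionary
}

The merging logic just below implements the reuse check named in the comment above: consecutive records with the same context and symbol id are accumulated into one entry.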
+ if (accounting->syms.num_syms) { + AccountingSymbol *last_sym; + last_sym = &accounting->syms.syms[accounting->syms.num_syms - 1]; + if (memcmp(&last_sym->context, &accounting->context, + sizeof(AccountingSymbolContext)) == 0) { + uint32_t id; + id = aom_accounting_dictionary_lookup(accounting, str); + if (id == last_sym->id) { + last_sym->bits += bits; + last_sym->samples++; + return; + } + } + } + sym.context = accounting->context; + sym.samples = 1; + sym.bits = bits; + sym.id = aom_accounting_dictionary_lookup(accounting, str); + assert(sym.id <= 255); + if (accounting->syms.num_syms == accounting->num_syms_allocated) { + accounting->num_syms_allocated *= 2; + accounting->syms.syms = + realloc(accounting->syms.syms, + sizeof(AccountingSymbol) * accounting->num_syms_allocated); + assert(accounting->syms.syms != NULL); + } + accounting->syms.syms[accounting->syms.num_syms++] = sym; +} + +void aom_accounting_dump(Accounting *accounting) { + int i; + AccountingSymbol *sym; + printf("\n----- Number of recorded syntax elements = %d -----\n", + accounting->syms.num_syms); + printf("----- Total number of symbol calls = %d (%d binary) -----\n", + accounting->syms.num_multi_syms + accounting->syms.num_binary_syms, + accounting->syms.num_binary_syms); + for (i = 0; i < accounting->syms.num_syms; i++) { + sym = &accounting->syms.syms[i]; + printf("%s x: %d, y: %d bits: %f samples: %d\n", + accounting->syms.dictionary.strs[sym->id], sym->context.x, + sym->context.y, (float)sym->bits / 8.0, sym->samples); + } +} diff --git a/libs/libaom/src/av1/decoder/accounting.h b/libs/libaom/src/av1/decoder/accounting.h new file mode 100644 index 000000000..ad2e8b6cf --- /dev/null +++ b/libs/libaom/src/av1/decoder/accounting.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_DECODER_ACCOUNTING_H_ +#define AOM_AV1_DECODER_ACCOUNTING_H_ +#include <stdlib.h> +#include "aom/aomdx.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#define AOM_ACCOUNTING_HASH_SIZE (1021) + +/* Max number of entries for symbol types in the dictionary (increase as + necessary). */ +#define MAX_SYMBOL_TYPES (256) + +/*The resolution of fractional-precision bit usage measurements, i.e., + 3 => 1/8th bits.*/ +#define AOM_ACCT_BITRES (3) + +typedef struct { + int16_t x; + int16_t y; +} AccountingSymbolContext; + +typedef struct { + AccountingSymbolContext context; + uint32_t id; + /** Number of bits in units of 1/8 bit. */ + uint32_t bits; + uint32_t samples; +} AccountingSymbol; + +/** Dictionary for translating strings into id. */ +typedef struct { + char *strs[MAX_SYMBOL_TYPES]; + int num_strs; +} AccountingDictionary; + +typedef struct { + /** All recorded symbols decoded. */ + AccountingSymbol *syms; + /** Number of syntax actually recorded. */ + int num_syms; + /** Raw symbol decoding calls for non-binary values. */ + int num_multi_syms; + /** Raw binary symbol decoding calls. */ + int num_binary_syms; + /** Dictionary for translating strings into id.
*/ + AccountingDictionary dictionary; +} AccountingSymbols; + +struct Accounting { + AccountingSymbols syms; + /** Size allocated for symbols (not all may be used). */ + int num_syms_allocated; + int16_t hash_dictionary[AOM_ACCOUNTING_HASH_SIZE]; + AccountingSymbolContext context; + uint32_t last_tell_frac; +}; + +void aom_accounting_init(Accounting *accounting); +void aom_accounting_reset(Accounting *accounting); +void aom_accounting_clear(Accounting *accounting); +void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y); +int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str); +void aom_accounting_record(Accounting *accounting, const char *str, + uint32_t bits); +void aom_accounting_dump(Accounting *accounting); +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_AV1_DECODER_ACCOUNTING_H_ diff --git a/libs/libaom/src/av1/decoder/decodeframe.c b/libs/libaom/src/av1/decoder/decodeframe.c new file mode 100644 index 000000000..7abfac4aa --- /dev/null +++ b/libs/libaom/src/av1/decoder/decodeframe.c @@ -0,0 +1,5326 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <stddef.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_reader.h" +#include "aom_dsp/bitreader.h" +#include "aom_dsp/bitreader_buffer.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" +#include "aom_ports/mem_ops.h" +#include "aom_scale/aom_scale.h" +#include "aom_util/aom_thread.h" + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + +#include "av1/common/alloccommon.h" +#include "av1/common/cdef.h" +#include "av1/common/cfl.h" +#if CONFIG_INSPECTION +#include "av1/decoder/inspection.h" +#endif +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/frame_buffers.h" +#include "av1/common/idct.h" +#include "av1/common/mvref_common.h" +#include "av1/common/pred_common.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/resize.h" +#include "av1/common/seg_common.h" +#include "av1/common/thread_common.h" +#include "av1/common/tile_common.h" +#include "av1/common/warped_motion.h" +#include "av1/common/obmc.h" +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/decodemv.h" +#include "av1/decoder/decoder.h" +#include "av1/decoder/decodetxb.h" +#include "av1/decoder/detokenize.h" + +#define ACCT_STR __func__ + +#define AOM_MIN_THREADS_PER_TILE 1 +#define AOM_MAX_THREADS_PER_TILE 2 + +// This is needed by ext_tile related unit tests.
+#define EXT_TILE_DEBUG 1 +#define MC_TEMP_BUF_PELS \ + (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \ + ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2)) + +// Checks that the remaining bits start with a 1 and ends with 0s. +// It consumes an additional byte, if already byte aligned before the check. +int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) { + AV1_COMMON *const cm = &pbi->common; + // bit_offset is set to 0 (mod 8) when the reader is already byte aligned + int bits_before_alignment = 8 - rb->bit_offset % 8; + int trailing = aom_rb_read_literal(rb, bits_before_alignment); + if (trailing != (1 << (bits_before_alignment - 1))) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + return 0; +} + +// Use only_chroma = 1 to only set the chroma planes +static AOM_INLINE void set_planes_to_neutral_grey( + const SequenceHeader *const seq_params, const YV12_BUFFER_CONFIG *const buf, + int only_chroma) { + if (seq_params->use_highbitdepth) { + const int val = 1 << (seq_params->bit_depth - 1); + for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { + const int is_uv = plane > 0; + uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]); + // Set the first row to neutral grey. Then copy the first row to all + // subsequent rows. + if (buf->crop_heights[is_uv] > 0) { + aom_memset16(base, val, buf->crop_widths[is_uv]); + for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) { + memcpy(&base[row_idx * buf->strides[is_uv]], base, + sizeof(*base) * buf->crop_widths[is_uv]); + } + } + } + } else { + for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) { + const int is_uv = plane > 0; + for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) { + memset(&buf->buffers[plane][row_idx * buf->uv_stride], 1 << 7, + buf->crop_widths[is_uv]); + } + } + } +} + +static AOM_INLINE void loop_restoration_read_sb_coeffs( + const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane, + int runit_idx); + +static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { + return len != 0 && len <= (size_t)(end - start); +} + +static TX_MODE read_tx_mode(struct aom_read_bit_buffer *rb, + int coded_lossless) { + if (coded_lossless) return ONLY_4X4; + return aom_rb_read_bit(rb) ? TX_MODE_SELECT : TX_MODE_LARGEST; +} + +static REFERENCE_MODE read_frame_reference_mode( + const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { + if (frame_is_intra_only(cm)) { + return SINGLE_REFERENCE; + } else { + return aom_rb_read_bit(rb) ? 
REFERENCE_MODE_SELECT : SINGLE_REFERENCE; + } +} + +static AOM_INLINE void inverse_transform_block(MACROBLOCKD *xd, int plane, + const TX_TYPE tx_type, + const TX_SIZE tx_size, + uint8_t *dst, int stride, + int reduced_tx_set) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = pd->dqcoeff_block + xd->cb_offset[plane]; + eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; + uint16_t scan_line = eob_data->max_scan_line; + uint16_t eob = eob_data->eob; + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, stride, + eob, reduced_tx_set); + memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0])); +} + +static AOM_INLINE void read_coeffs_tx_intra_block( + const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int row, const int col, const TX_SIZE tx_size) { + MB_MODE_INFO *mbmi = xd->mi[0]; + if (!mbmi->skip) { +#if TXCOEFF_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif + av1_read_coeffs_txb_facade(cm, xd, r, plane, row, col, tx_size); +#if TXCOEFF_TIMER + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + cm->txcoeff_timer += elapsed_time; + ++cm->txb_count; +#endif + } +} + +static AOM_INLINE void decode_block_void(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, + aom_reader *const r, const int plane, + const int row, const int col, + const TX_SIZE tx_size) { + (void)cm; + (void)xd; + (void)r; + (void)plane; + (void)row; + (void)col; + (void)tx_size; +} + +static AOM_INLINE void predict_inter_block_void(AV1_COMMON *const cm, + MACROBLOCKD *const xd, + BLOCK_SIZE bsize) { + (void)cm; + (void)xd; + (void)bsize; +} + +static AOM_INLINE void cfl_store_inter_block_void(AV1_COMMON *const cm, + MACROBLOCKD *const xd) { + (void)cm; + (void)xd; +} + +static AOM_INLINE void predict_and_reconstruct_intra_block( + const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int row, const int col, const TX_SIZE tx_size) { + (void)r; + MB_MODE_INFO *mbmi = xd->mi[0]; + PLANE_TYPE plane_type = get_plane_type(plane); + + av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); + + if (!mbmi->skip) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; + if (eob_data->eob) { + const bool reduced_tx_set_used = cm->features.reduced_tx_set_used; + // tx_type was read out in av1_read_coeffs_txb. + const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size, + reduced_tx_set_used); + uint8_t *dst = &pd->dst.buf[(row * pd->dst.stride + col) << MI_SIZE_LOG2]; + inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride, + reduced_tx_set_used); + } + } + if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) { + cfl_store_tx(xd, row, col, tx_size, mbmi->sb_type); + } +} + +static AOM_INLINE void inverse_transform_inter_block( + const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int blk_row, const int blk_col, + const TX_SIZE tx_size) { + (void)r; + PLANE_TYPE plane_type = get_plane_type(plane); + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const bool reduced_tx_set_used = cm->features.reduced_tx_set_used; + // tx_type was read out in av1_read_coeffs_txb. 
+ const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, + tx_size, reduced_tx_set_used); + + uint8_t *dst = + &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; + inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride, + reduced_tx_set_used); +#if CONFIG_MISMATCH_DEBUG + int pixel_c, pixel_r; + BLOCK_SIZE bsize = txsize_to_bsize[tx_size]; + int blk_w = block_size_wide[bsize]; + int blk_h = block_size_high[bsize]; + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint, + plane, pixel_c, pixel_r, blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); +#endif +} + +static AOM_INLINE void set_cb_buffer_offsets(MACROBLOCKD *const xd, + TX_SIZE tx_size, int plane) { + xd->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size]; + xd->txb_offset[plane] = + xd->cb_offset[plane] / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); +} + +static AOM_INLINE void decode_reconstruct_tx( + AV1_COMMON *cm, ThreadData *const td, aom_reader *r, + MB_MODE_INFO *const mbmi, int plane, BLOCK_SIZE plane_bsize, int blk_row, + int blk_col, int block, TX_SIZE tx_size, int *eob_total) { + MACROBLOCKD *const xd = &td->xd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE plane_tx_size = + plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + // Scale to match transform block unit. + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + if (tx_size == plane_tx_size || plane) { + td->read_coeffs_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col, + tx_size); + + td->inverse_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col, + tx_size); + eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; + *eob_total += eob_data->eob; + set_cb_buffer_offsets(xd, tx_size, plane); + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size)); + assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size)); + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int sub_step = bsw * bsh; + + assert(bsw > 0 && bsh > 0); + + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + const int offsetr = blk_row + row; + const int offsetc = blk_col + col; + + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr, + offsetc, block, sub_txs, eob_total); + block += sub_step; + } + } + } +} + +static AOM_INLINE void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int bw, int bh, int x_mis, int y_mis) { + const int num_planes = av1_num_planes(cm); + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const TileInfo *const tile = &xd->tile; + + set_mi_offsets(mi_params, xd, mi_row, mi_col); + xd->mi[0]->sb_type = bsize; +#if CONFIG_RD_DEBUG + xd->mi[0]->mi_row = mi_row; + xd->mi[0]->mi_col = 
mi_col; +#endif + + assert(x_mis && y_mis); + for (int x = 1; x < x_mis; ++x) xd->mi[x] = xd->mi[0]; + int idx = mi_params->mi_stride; + for (int y = 1; y < y_mis; ++y) { + memcpy(&xd->mi[idx], &xd->mi[0], x_mis * sizeof(xd->mi[0])); + idx += mi_params->mi_stride; + } + + set_plane_n4(xd, bw, bh, num_planes); + set_entropy_context(xd, mi_row, mi_col, num_planes); + + // Distance of Mb to the various image edges. These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, + mi_params->mi_cols); + + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); +} + +static AOM_INLINE void decode_mbmi_block(AV1Decoder *const pbi, + MACROBLOCKD *const xd, int mi_row, + int mi_col, aom_reader *r, + PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &pbi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col); + const int y_mis = AOMMIN(bh, cm->mi_params.mi_rows - mi_row); + +#if CONFIG_ACCOUNTING + aom_accounting_set_context(&pbi->accounting, mi_col, mi_row); +#endif + set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis); + xd->mi[0]->partition = partition; + av1_read_mode_info(pbi, xd, r, x_mis, y_mis); + if (bsize >= BLOCK_8X8 && + (seq_params->subsampling_x || seq_params->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][seq_params->subsampling_x] + [seq_params->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } +} + +typedef struct PadBlock { + int x0; + int x1; + int y0; + int y1; +} PadBlock; + +#if CONFIG_AV1_HIGHBITDEPTH +static AOM_INLINE void highbd_build_mc_border(const uint8_t *src8, + int src_stride, uint8_t *dst8, + int dst_stride, int x, int y, + int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? -x : 0; + + if (left > b_w) left = b_w; + + if (x + b_w > w) right = x + b_w - w; + + if (right > b_w) right = b_w; + + copy = b_w - left - right; + + if (left) aom_memset16(dst, ref_row[0], left); + + if (copy) memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t)); + + if (right) aom_memset16(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) ref_row += src_stride; + } while (--b_h); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static AOM_INLINE void build_mc_border(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int x, + int y, int b_w, int b_h, int w, int h) { + // Get a pointer to the start of the real data for this row. + const uint8_t *ref_row = src - x - y * src_stride; + + if (y >= h) + ref_row += (h - 1) * src_stride; + else if (y > 0) + ref_row += y * src_stride; + + do { + int right = 0, copy; + int left = x < 0 ? 
-x : 0; + + if (left > b_w) left = b_w; + + if (x + b_w > w) right = x + b_w - w; + + if (right > b_w) right = b_w; + + copy = b_w - left - right; + + if (left) memset(dst, ref_row[0], left); + + if (copy) memcpy(dst + left, ref_row + x + left, copy); + + if (right) memset(dst + left + copy, ref_row[w - 1], right); + + dst += dst_stride; + ++y; + + if (y > 0 && y < h) ref_row += src_stride; + } while (--b_h); +} + +static INLINE int update_extend_mc_border_params( + const struct scale_factors *const sf, struct buf_2d *const pre_buf, + MV32 scaled_mv, PadBlock *block, int subpel_x_mv, int subpel_y_mv, + int do_warp, int is_intrabc, int *x_pad, int *y_pad) { + const int is_scaled = av1_is_scaled(sf); + // Get reference width and height. + int frame_width = pre_buf->width; + int frame_height = pre_buf->height; + + // Do border extension if there is motion or + // width/height is not a multiple of 8 pixels. + if ((!is_intrabc) && (!do_warp) && + (is_scaled || scaled_mv.col || scaled_mv.row || (frame_width & 0x7) || + (frame_height & 0x7))) { + if (subpel_x_mv || (sf->x_step_q4 != SUBPEL_SHIFTS)) { + block->x0 -= AOM_INTERP_EXTEND - 1; + block->x1 += AOM_INTERP_EXTEND; + *x_pad = 1; + } + + if (subpel_y_mv || (sf->y_step_q4 != SUBPEL_SHIFTS)) { + block->y0 -= AOM_INTERP_EXTEND - 1; + block->y1 += AOM_INTERP_EXTEND; + *y_pad = 1; + } + + // Skip border extension if block is inside the frame. + if (block->x0 < 0 || block->x1 > frame_width - 1 || block->y0 < 0 || + block->y1 > frame_height - 1) { + return 1; + } + } + return 0; +} + +static INLINE void extend_mc_border(const struct scale_factors *const sf, + struct buf_2d *const pre_buf, + MV32 scaled_mv, PadBlock block, + int subpel_x_mv, int subpel_y_mv, + int do_warp, int is_intrabc, int highbd, + uint8_t *mc_buf, uint8_t **pre, + int *src_stride) { + int x_pad = 0, y_pad = 0; + if (update_extend_mc_border_params(sf, pre_buf, scaled_mv, &block, + subpel_x_mv, subpel_y_mv, do_warp, + is_intrabc, &x_pad, &y_pad)) { + // Get reference block pointer. + const uint8_t *const buf_ptr = + pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0; + int buf_stride = pre_buf->stride; + const int b_w = block.x1 - block.x0; + const int b_h = block.y1 - block.y0; + +#if CONFIG_AV1_HIGHBITDEPTH + // Extend the border. 
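A worked instance of the padding bounds computed in update_extend_mc_border_params above, assuming AOM_INTERP_EXTEND is 4 (its usual libaom value for the 8-tap subpel filters; the literal numbers here are illustrative only):

// An 8x8 reference window starting at x0 = -2 (so x1 = 6) with a nonzero
// subpel x component grows by the filter support: 3 taps left of the first
// sample and 4 right of the last.
PadBlock block = { /*x0=*/-2, /*x1=*/6, /*y0=*/0, /*y1=*/8 };
block.x0 -= AOM_INTERP_EXTEND - 1;  // -5: the window now leaves the frame
block.x1 += AOM_INTERP_EXTEND;      // 10
// Since block.x0 < 0, update_extend_mc_border_params() returns 1 and the
// branch below replicates frame edges into mc_buf.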
+ if (highbd) { + highbd_build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, + block.y0, b_w, b_h, pre_buf->width, + pre_buf->height); + } else { + build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w, + b_h, pre_buf->width, pre_buf->height); + } +#else + (void)highbd; + build_mc_border(buf_ptr, buf_stride, mc_buf, b_w, block.x0, block.y0, b_w, + b_h, pre_buf->width, pre_buf->height); +#endif + *src_stride = b_w; + *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * b_w + + x_pad * (AOM_INTERP_EXTEND - 1); + } +} + +static void dec_calc_subpel_params(const MV *const src_mv, + InterPredParams *const inter_pred_params, + const MACROBLOCKD *const xd, int mi_x, + int mi_y, uint8_t **pre, + SubpelParams *subpel_params, int *src_stride, + PadBlock *block, MV32 *scaled_mv, + int *subpel_x_mv, int *subpel_y_mv) { + const struct scale_factors *sf = inter_pred_params->scale_factors; + struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf; + const int bw = inter_pred_params->block_width; + const int bh = inter_pred_params->block_height; + const int is_scaled = av1_is_scaled(sf); + if (is_scaled) { + int ssx = inter_pred_params->subsampling_x; + int ssy = inter_pred_params->subsampling_y; + int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS; + orig_pos_y += src_mv->row * (1 << (1 - ssy)); + int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS; + orig_pos_x += src_mv->col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK; + subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK; + subpel_params->xs = sf->x_step_q4; + subpel_params->ys = sf->y_step_q4; + + // Get reference block top left coordinate. + block->x0 = pos_x >> SCALE_SUBPEL_BITS; + block->y0 = pos_y >> SCALE_SUBPEL_BITS; + + // Get reference block bottom right coordinate. + block->x1 = + ((pos_x + (bw - 1) * subpel_params->xs) >> SCALE_SUBPEL_BITS) + 1; + block->y1 = + ((pos_y + (bh - 1) * subpel_params->ys) >> SCALE_SUBPEL_BITS) + 1; + + MV temp_mv; + temp_mv = clamp_mv_to_umv_border_sb(xd, src_mv, bw, bh, + inter_pred_params->subsampling_x, + inter_pred_params->subsampling_y); + *scaled_mv = av1_scale_mv(&temp_mv, mi_x, mi_y, sf); + scaled_mv->row += SCALE_EXTRA_OFF; + scaled_mv->col += SCALE_EXTRA_OFF; + + *subpel_x_mv = scaled_mv->col & SCALE_SUBPEL_MASK; + *subpel_y_mv = scaled_mv->row & SCALE_SUBPEL_MASK; + } else { + // Get block position in current frame. + int pos_x = inter_pred_params->pix_col << SUBPEL_BITS; + int pos_y = inter_pred_params->pix_row << SUBPEL_BITS; + + const MV mv_q4 = clamp_mv_to_umv_border_sb( + xd, src_mv, bw, bh, inter_pred_params->subsampling_x, + inter_pred_params->subsampling_y); + subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS; + subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS; + subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS; + + // Get reference block top left coordinate. 
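For the unscaled path just above, a worked example of the q4 split (assuming SUBPEL_BITS == 4 and SUBPEL_MASK == 15, i.e. 1/16-pel positions, consistent with the x_step_q4 == 16 asserts elsewhere in this patch; the variable names are illustrative):

// mv_q4.col = 35 in 1/16-pel units:
const int col_q4 = 35;
const int full_pel = col_q4 >> 4;  // 2 whole pixels added to the block x0
const int phase = col_q4 & 15;     // 3 -> selects the 3/16-pel filter phase
// subpel_x stores the phase left-shifted by SCALE_EXTRA_BITS so the scaled
// and unscaled paths share one convolution interface.

The code below then derives the reference block's top-left corner from the integer part.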
+ pos_x += mv_q4.col; + pos_y += mv_q4.row; + block->x0 = pos_x >> SUBPEL_BITS; + block->y0 = pos_y >> SUBPEL_BITS; + + // Get reference block bottom right coordinate. + block->x1 = (pos_x >> SUBPEL_BITS) + (bw - 1) + 1; + block->y1 = (pos_y >> SUBPEL_BITS) + (bh - 1) + 1; + + scaled_mv->row = mv_q4.row; + scaled_mv->col = mv_q4.col; + *subpel_x_mv = scaled_mv->col & SUBPEL_MASK; + *subpel_y_mv = scaled_mv->row & SUBPEL_MASK; + } + *pre = pre_buf->buf0 + block->y0 * pre_buf->stride + block->x0; + *src_stride = pre_buf->stride; +} + +static void dec_calc_subpel_params_and_extend( + const MV *const src_mv, InterPredParams *const inter_pred_params, + MACROBLOCKD *xd, int mi_x, int mi_y, int ref, uint8_t **pre, + SubpelParams *subpel_params, int *src_stride) { + PadBlock block; + MV32 scaled_mv; + int subpel_x_mv, subpel_y_mv; + dec_calc_subpel_params(src_mv, inter_pred_params, xd, mi_x, mi_y, pre, + subpel_params, src_stride, &block, &scaled_mv, + &subpel_x_mv, &subpel_y_mv); + extend_mc_border( + inter_pred_params->scale_factors, &inter_pred_params->ref_frame_buf, + scaled_mv, block, subpel_x_mv, subpel_y_mv, + inter_pred_params->mode == WARP_PRED, inter_pred_params->is_intrabc, + inter_pred_params->use_hbd_buf, xd->mc_buf[ref], pre, src_stride); +} + +static void dec_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, + int plane, const MB_MODE_INFO *mi, + int build_for_obmc, int bw, int bh, + int mi_x, int mi_y) { + av1_build_inter_predictors(cm, xd, plane, mi, build_for_obmc, bw, bh, mi_x, + mi_y, dec_calc_subpel_params_and_extend); +} + +static AOM_INLINE void dec_build_inter_predictor(const AV1_COMMON *cm, + MACROBLOCKD *xd, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + dec_build_inter_predictors(cm, xd, plane, xd->mi[0], 0, + xd->plane[plane].width, xd->plane[plane].height, + mi_x, mi_y); + if (is_interintra_pred(xd->mi[0])) { + BUFFER_SET ctx = { { xd->plane[0].dst.buf, xd->plane[1].dst.buf, + xd->plane[2].dst.buf }, + { xd->plane[0].dst.stride, xd->plane[1].dst.stride, + xd->plane[2].dst.stride } }; + av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf, + xd->plane[plane].dst.stride, &ctx, plane, + bsize); + } + } +} + +static INLINE void dec_build_prediction_by_above_pred( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int above_mi_col = xd->mi_col + rel_mi_col; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *above_mbmi; + + (void)rel_mi_row; + (void)dir; + + av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, op_mi_size, + &backup_mbmi, ctxt, num_planes); + mi_x = above_mi_col << MI_SIZE_LOG2; + mi_y = xd->mi_row << MI_SIZE_LOG2; + + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; + int bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, + block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue; + dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x, + mi_y); + } +} + +static AOM_INLINE void 
dec_build_prediction_by_above_preds( + const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->up_available) return; + + // Adjust mb_to_bottom_edge to have the correct value for the OBMC + // prediction block. This is half the height of the original block, + // except for 128-wide blocks, where we only use a height of 32. + const int this_height = xd->height * MI_SIZE; + const int pred_height = AOMMIN(this_height / 2, 32); + xd->mb_to_bottom_edge += GET_MV_SUBPEL(this_height - pred_height); + struct build_prediction_ctxt ctxt = { cm, tmp_buf, + tmp_width, tmp_height, + tmp_stride, xd->mb_to_right_edge }; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_above(cm, xd, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + dec_build_prediction_by_above_pred, &ctxt); + + xd->mb_to_left_edge = -GET_MV_SUBPEL(xd->mi_col * MI_SIZE); + xd->mb_to_right_edge = ctxt.mb_to_far_edge; + xd->mb_to_bottom_edge -= GET_MV_SUBPEL(this_height - pred_height); +} + +static INLINE void dec_build_prediction_by_left_pred( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + const int left_mi_row = xd->mi_row + rel_mi_row; + int mi_x, mi_y; + MB_MODE_INFO backup_mbmi = *left_mbmi; + + (void)rel_mi_col; + (void)dir; + + av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, op_mi_size, + &backup_mbmi, ctxt, num_planes); + mi_x = xd->mi_col << MI_SIZE_LOG2; + mi_y = left_mi_row << MI_SIZE_LOG2; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, + block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); + int bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y; + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue; + dec_build_inter_predictors(ctxt->cm, xd, j, &backup_mbmi, 1, bw, bh, mi_x, + mi_y); + } +} + +static AOM_INLINE void dec_build_prediction_by_left_preds( + const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->left_available) return; + + // Adjust mb_to_right_edge to have the correct value for the OBMC + // prediction block. This is half the width of the original block, + // except for 128-wide blocks, where we only use a width of 32. 
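The left-neighbour pass below applies the same halving rule as the above-neighbour pass. A small sketch of the overlap size (an illustrative helper mirroring the AOMMIN expressions in both passes):

// OBMC blends over half the block dimension, capped at 32 samples:
// 8 -> 4, 16 -> 8, 32 -> 16, 64 -> 32, 128 -> 32 (the 128-wide exception
// noted in the comment above).
static int obmc_overlap_dim(int block_dim) {
  const int half = block_dim / 2;
  return half < 32 ? half : 32;
}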
+ const int this_width = xd->width * MI_SIZE; + const int pred_width = AOMMIN(this_width / 2, 32); + xd->mb_to_right_edge += GET_MV_SUBPEL(this_width - pred_width); + + struct build_prediction_ctxt ctxt = { cm, tmp_buf, + tmp_width, tmp_height, + tmp_stride, xd->mb_to_bottom_edge }; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_left(cm, xd, + max_neighbor_obmc[mi_size_high_log2[bsize]], + dec_build_prediction_by_left_pred, &ctxt); + + xd->mb_to_top_edge = -GET_MV_SUBPEL(xd->mi_row * MI_SIZE); + xd->mb_to_right_edge -= GET_MV_SUBPEL(this_width - pred_width); + xd->mb_to_bottom_edge = ctxt.mb_to_far_edge; +} + +static void set_dst_buf(MACROBLOCKD *xd, uint8_t **dst_buf1, + uint8_t **dst_buf2) { + dst_buf1[0] = xd->tmp_obmc_bufs[0]; + dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE; + dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2; + dst_buf2[0] = xd->tmp_obmc_bufs[1]; + dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE; + dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void set_dst_buf_highbd(MACROBLOCKD *xd, uint8_t **dst_buf1, + uint8_t **dst_buf2) { + int len = sizeof(uint16_t); + dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); + dst_buf1[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len); + dst_buf1[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len); + dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]); + dst_buf2[1] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len); + dst_buf2[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len); +} +#endif + +static AOM_INLINE void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, + MACROBLOCKD *xd) { + const int num_planes = av1_num_planes(cm); + uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; + int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + set_dst_buf_highbd(xd, dst_buf1, dst_buf2); + } else { + set_dst_buf(xd, dst_buf1, dst_buf2); + } +#else + set_dst_buf(xd, dst_buf1, dst_buf2); +#endif + + dec_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1, + dst_stride1); + dec_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2, + dst_stride2); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, &cm->cur_frame->buf, + mi_row, mi_col, 0, num_planes); + av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2, + dst_stride2); +} + +static AOM_INLINE void cfl_store_inter_block(AV1_COMMON *const cm, + MACROBLOCKD *const xd) { + MB_MODE_INFO *mbmi = xd->mi[0]; + if (store_cfl_required(cm, xd)) { + cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size); + } +} + +static AOM_INLINE void predict_inter_block(AV1_COMMON *const cm, + MACROBLOCKD *const xd, + BLOCK_SIZE bsize) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + const 
MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + if (frame < LAST_FRAME) { + assert(is_intrabc_block(mbmi)); + assert(frame == INTRA_FRAME); + assert(ref == 0); + } else { + const RefCntBuffer *ref_buf = get_ref_frame_buf(cm, frame); + const struct scale_factors *ref_scale_factors = + get_ref_scale_factors_const(cm, frame); + + xd->block_ref_scale_factors[ref] = ref_scale_factors; + av1_setup_pre_planes(xd, ref, &ref_buf->buf, mi_row, mi_col, + ref_scale_factors, num_planes); + } + } + + dec_build_inter_predictor(cm, xd, mi_row, mi_col, bsize); + if (mbmi->motion_mode == OBMC_CAUSAL) { + dec_build_obmc_inter_predictors_sb(cm, xd); + } +#if CONFIG_MISMATCH_DEBUG + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x, + pd->subsampling_y); + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, + cm->current_frame.order_hint, plane, pixel_c, + pixel_r, pd->width, pd->height, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#endif +} + +static AOM_INLINE void set_color_index_map_offset(MACROBLOCKD *const xd, + int plane, aom_reader *r) { + (void)r; + Av1ColorMapParam params; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + av1_get_block_dimensions(mbmi->sb_type, plane, xd, &params.plane_width, + &params.plane_height, NULL, NULL); + xd->color_index_map_offset[plane] += params.plane_width * params.plane_height; +} + +static AOM_INLINE void decode_token_recon_block(AV1Decoder *const pbi, + ThreadData *const td, + aom_reader *r, + BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &td->xd; + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *mbmi = xd->mi[0]; + + if (!is_inter_block(mbmi)) { + int row, col; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + for (row = 0; row < max_blocks_high; row += mu_blocks_high) { + for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x); + + for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height; + blk_row += stepr) { + for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width; + blk_col += stepc) { + td->read_coeffs_tx_intra_block_visit(cm, xd, r, plane, blk_row, + blk_col, tx_size); + td->predict_and_recon_intra_block_visit(cm, xd, r, plane, blk_row, + blk_col, tx_size); + set_cb_buffer_offsets(xd, tx_size, plane); + } + } + } + } + } + } else { + 
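+    // Inter path: build the motion-compensated prediction first; unless the
+    // block is coded as skip, the transform residue is then read and added
+    // below, walking the block in 64x64 processing units that are further
+    // split by the variable transform-size partitioning.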
td->predict_inter_block_visit(cm, xd, bsize); + // Reconstruction + if (!mbmi->skip) { + int eobtotal = 0; + + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + int row, col; + + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + assert(max_unit_bsize == + get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + for (row = 0; row < max_blocks_high; row += mu_blocks_high) { + for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, ss_x, ss_y); + const TX_SIZE max_tx_size = + get_vartx_max_txsize(xd, plane_bsize, plane); + const int bh_var_tx = tx_size_high_unit[max_tx_size]; + const int bw_var_tx = tx_size_wide_unit[max_tx_size]; + int block = 0; + int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + int blk_row, blk_col; + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), ss_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), ss_x); + + for (blk_row = row >> ss_y; blk_row < unit_height; + blk_row += bh_var_tx) { + for (blk_col = col >> ss_x; blk_col < unit_width; + blk_col += bw_var_tx) { + decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, + blk_row, blk_col, block, max_tx_size, + &eobtotal); + block += step; + } + } + } + } + } + } + td->cfl_store_inter_block_visit(cm, xd); + } + + av1_visit_palette(pbi, xd, r, set_color_index_map_offset); +} + +static AOM_INLINE void set_inter_tx_size(MB_MODE_INFO *mbmi, int stride_log2, + int tx_w_log2, int tx_h_log2, + int min_txs, int split_size, int txs, + int blk_row, int blk_col) { + for (int idy = 0; idy < tx_size_high_unit[split_size]; + idy += tx_size_high_unit[min_txs]) { + for (int idx = 0; idx < tx_size_wide_unit[split_size]; + idx += tx_size_wide_unit[min_txs]) { + const int index = (((blk_row + idy) >> tx_h_log2) << stride_log2) + + ((blk_col + idx) >> tx_w_log2); + mbmi->inter_tx_size[index] = txs; + } + } +} + +static AOM_INLINE void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi, + TX_SIZE tx_size, int depth, +#if CONFIG_LPF_MASK + AV1_COMMON *cm, int mi_row, + int mi_col, int store_bitmask, +#endif + int blk_row, int blk_col, + aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + int is_split = 0; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int max_blocks_high = max_block_high(xd, bsize, 0); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + assert(tx_size > TX_4X4); + TX_SIZE txs = max_txsize_rect_lookup[bsize]; + for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level) + txs = sub_tx_size_map[txs]; + const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2; + const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2; + const int bw_log2 = mi_size_wide_log2[bsize]; + const int stride_log2 = bw_log2 - tx_w_log2; + + if (depth == MAX_VARTX_DEPTH) { + set_inter_tx_size(mbmi, 
stride_log2, tx_w_log2, tx_h_log2, txs, tx_size, + tx_size, blk_row, blk_col); + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + return; + } + + const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, + mbmi->sb_type, tx_size); + is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR); + + if (is_split) { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + + if (sub_txs == TX_4X4) { + set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size, + sub_txs, blk_row, blk_col); + mbmi->tx_size = sub_txs; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, sub_txs, tx_size); +#if CONFIG_LPF_MASK + if (store_bitmask) { + av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, + txsize_to_bsize[tx_size], TX_4X4, mbmi); + } +#endif + return; + } +#if CONFIG_LPF_MASK + if (depth + 1 == MAX_VARTX_DEPTH && store_bitmask) { + av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, + txsize_to_bsize[tx_size], sub_txs, mbmi); + store_bitmask = 0; + } +#endif + + assert(bsw > 0 && bsh > 0); + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) { + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + int offsetr = blk_row + row; + int offsetc = blk_col + col; + read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, +#if CONFIG_LPF_MASK + cm, mi_row, mi_col, store_bitmask, +#endif + offsetr, offsetc, r); + } + } + } else { + set_inter_tx_size(mbmi, stride_log2, tx_w_log2, tx_h_log2, txs, tx_size, + tx_size, blk_row, blk_col); + mbmi->tx_size = tx_size; + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); +#if CONFIG_LPF_MASK + if (store_bitmask) { + av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, + txsize_to_bsize[tx_size], tx_size, mbmi); + } +#endif + } +} + +static TX_SIZE read_selected_tx_size(const MACROBLOCKD *const xd, + aom_reader *r) { + // TODO(debargha): Clean up the logic here. This function should only + // be called for intra. 
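+  // The coded symbol is a depth, not a size: depth 0 selects the largest
+  // transform allowed for the block, and each additional depth level selects
+  // the next smaller transform (e.g. for BLOCK_32X32, depths 0..2 map to
+  // 32x32, 16x16 and 8x8).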
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int max_depths = bsize_to_max_depth(bsize); + const int ctx = get_tx_size_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int depth = aom_read_symbol(r, ec_ctx->tx_size_cdf[tx_size_cat][ctx], + max_depths + 1, ACCT_STR); + assert(depth >= 0 && depth <= max_depths); + const TX_SIZE tx_size = depth_to_tx_size(depth, bsize); + return tx_size; +} + +static TX_SIZE read_tx_size(const MACROBLOCKD *const xd, TX_MODE tx_mode, + int is_inter, int allow_select_inter, + aom_reader *r) { + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4; + + if (block_signals_txsize(bsize)) { + if ((!is_inter || allow_select_inter) && tx_mode == TX_MODE_SELECT) { + const TX_SIZE coded_tx_size = read_selected_tx_size(xd, r); + return coded_tx_size; + } else { + return tx_size_from_tx_mode(bsize, tx_mode); + } + } else { + assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4)); + return max_txsize_rect_lookup[bsize]; + } +} + +static AOM_INLINE void parse_decode_block(AV1Decoder *const pbi, + ThreadData *const td, int mi_row, + int mi_col, aom_reader *r, + PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + MACROBLOCKD *const xd = &td->xd; + decode_mbmi_block(pbi, xd, mi_row, mi_col, r, partition, bsize); + + av1_visit_palette(pbi, xd, r, av1_decode_palette_tokens); + + AV1_COMMON *cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *mbmi = xd->mi[0]; + int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi); + if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && + !mbmi->skip && inter_block_tx && !xd->lossless[mbmi->segment_id]) { + const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + const int width = mi_size_wide[bsize]; + const int height = mi_size_high[bsize]; + + for (int idy = 0; idy < height; idy += bh) + for (int idx = 0; idx < width; idx += bw) + read_tx_size_vartx(xd, mbmi, max_tx_size, 0, +#if CONFIG_LPF_MASK + cm, mi_row, mi_col, 1, +#endif + idy, idx, r); + } else { + mbmi->tx_size = + read_tx_size(xd, cm->features.tx_mode, inter_block_tx, !mbmi->skip, r); + if (inter_block_tx) + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, + mbmi->skip && is_inter_block(mbmi), xd); +#if CONFIG_LPF_MASK + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) { + av1_store_bitmask_univariant_tx(cm, mi_row, mi_col, bsize, mbmi); + } else { + for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) { + for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) { + av1_store_bitmask_univariant_tx(cm, mi_row + row, mi_col + col, + BLOCK_64X64, mbmi); + } + } + } +#endif + } +#if CONFIG_LPF_MASK + const int w = mi_size_wide[bsize]; + const int h = mi_size_high[bsize]; + if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) { + av1_store_bitmask_other_info(cm, mi_row, mi_col, bsize, mbmi, 1, 1); + } else { + for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) { + for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) { + av1_store_bitmask_other_info(cm, mi_row + row, mi_col + col, + BLOCK_64X64, mbmi, row == 0, col == 0); + } + } + } +#endif + + if 
(cm->delta_q_info.delta_q_present_flag) { + for (int i = 0; i < MAX_SEGMENTS; i++) { + const int current_qindex = + av1_get_qindex(&cm->seg, i, xd->current_qindex); + const CommonQuantParams *const quant_params = &cm->quant_params; + for (int j = 0; j < num_planes; ++j) { + const int dc_delta_q = j == 0 ? quant_params->y_dc_delta_q + : (j == 1 ? quant_params->u_dc_delta_q + : quant_params->v_dc_delta_q); + const int ac_delta_q = j == 0 ? 0 + : (j == 1 ? quant_params->u_ac_delta_q + : quant_params->v_ac_delta_q); + xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX( + current_qindex, dc_delta_q, cm->seq_params.bit_depth); + xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX( + current_qindex, ac_delta_q, cm->seq_params.bit_depth); + } + } + } + if (mbmi->skip) av1_reset_entropy_context(xd, bsize, num_planes); + + decode_token_recon_block(pbi, td, r, bsize); +} + +static AOM_INLINE void set_offsets_for_pred_and_recon(AV1Decoder *const pbi, + ThreadData *const td, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &pbi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCKD *const xd = &td->xd; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int num_planes = av1_num_planes(cm); + + const int offset = mi_row * mi_params->mi_stride + mi_col; + const TileInfo *const tile = &xd->tile; + + xd->mi = mi_params->mi_grid_base + offset; + xd->tx_type_map = + &mi_params->tx_type_map[mi_row * mi_params->mi_stride + mi_col]; + xd->tx_type_map_stride = mi_params->mi_stride; + + set_plane_n4(xd, bw, bh, num_planes); + + // Distance of Mb to the various image edges. These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, + mi_params->mi_cols); + + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); +} + +static AOM_INLINE void decode_block(AV1Decoder *const pbi, ThreadData *const td, + int mi_row, int mi_col, aom_reader *r, + PARTITION_TYPE partition, + BLOCK_SIZE bsize) { + (void)partition; + set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize); + decode_token_recon_block(pbi, td, r, bsize); +} + +static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, + aom_reader *r, int has_rows, int has_cols, + BLOCK_SIZE bsize) { + const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!has_rows && !has_cols) return PARTITION_SPLIT; + + assert(ctx >= 0); + aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[ctx]; + if (has_rows && has_cols) { + return (PARTITION_TYPE)aom_read_symbol( + r, partition_cdf, partition_cdf_length(bsize), ACCT_STR); + } else if (!has_rows && has_cols) { + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_vert_alike(cdf, partition_cdf, bsize); + assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP)); + return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ; + } else { + assert(has_rows && !has_cols); + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_horz_alike(cdf, partition_cdf, bsize); + assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP)); + return aom_read_cdf(r, cdf, 2, ACCT_STR) ? 
PARTITION_SPLIT : PARTITION_VERT; + } +} + +// TODO(slavarnway): eliminate bsize and subsize in future commits +static AOM_INLINE void decode_partition(AV1Decoder *const pbi, + ThreadData *const td, int mi_row, + int mi_col, aom_reader *reader, + BLOCK_SIZE bsize, + int parse_decode_flag) { + assert(bsize < BLOCK_SIZES_ALL); + AV1_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &td->xd; + const int bw = mi_size_wide[bsize]; + const int hbs = bw >> 1; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + const int quarter_step = bw / 4; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols; + + if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) + return; + + // parse_decode_flag takes the following values : + // 01 - do parse only + // 10 - do decode only + // 11 - do parse and decode + static const block_visitor_fn_t block_visit[4] = { NULL, parse_decode_block, + decode_block, + parse_decode_block }; + + if (parse_decode_flag & 1) { + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + int rcol0, rcol1, rrow0, rrow1; + if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, + &rcol0, &rcol1, &rrow0, &rrow1)) { + const int rstride = cm->rst_info[plane].horz_units_per_tile; + for (int rrow = rrow0; rrow < rrow1; ++rrow) { + for (int rcol = rcol0; rcol < rcol1; ++rcol) { + const int runit_idx = rcol + rrow * rstride; + loop_restoration_read_sb_coeffs(cm, xd, reader, plane, runit_idx); + } + } + } + } + + partition = (bsize < BLOCK_8X8) ? PARTITION_NONE + : read_partition(xd, mi_row, mi_col, reader, + has_rows, has_cols, bsize); + } else { + partition = get_partition(cm, mi_row, mi_col, bsize); + } + subsize = get_partition_subsize(bsize, partition); + if (subsize == BLOCK_INVALID) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Partition is invalid for block size %dx%d", + block_size_wide[bsize], block_size_high[bsize]); + } + // Check the bitstream is conformant: if there is subsampling on the + // chroma planes, subsize must subsample to a valid block size. 
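+  // For example, a 4x8 luma subsize in a 4:2:2 stream would need a 2x8
+  // chroma block, which does not exist, so get_plane_block_size() returns
+  // BLOCK_INVALID and the frame is rejected as corrupt.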
+ const struct macroblockd_plane *const pd_u = &xd->plane[1]; + if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) == + BLOCK_INVALID) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Block size %dx%d invalid with this subsampling mode", + block_size_wide[subsize], block_size_high[subsize]); + } + +#define DEC_BLOCK_STX_ARG +#define DEC_BLOCK_EPT_ARG partition, +#define DEC_BLOCK(db_r, db_c, db_subsize) \ + block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), \ + reader, DEC_BLOCK_EPT_ARG(db_subsize)) +#define DEC_PARTITION(db_r, db_c, db_subsize) \ + decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), reader, \ + (db_subsize), parse_decode_flag) + + switch (partition) { + case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break; + case PARTITION_HORZ: + DEC_BLOCK(mi_row, mi_col, subsize); + if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize); + break; + case PARTITION_VERT: + DEC_BLOCK(mi_row, mi_col, subsize); + if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize); + break; + case PARTITION_SPLIT: + DEC_PARTITION(mi_row, mi_col, subsize); + DEC_PARTITION(mi_row, mi_col + hbs, subsize); + DEC_PARTITION(mi_row + hbs, mi_col, subsize); + DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize); + break; + case PARTITION_HORZ_A: + DEC_BLOCK(mi_row, mi_col, bsize2); + DEC_BLOCK(mi_row, mi_col + hbs, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col, subsize); + break; + case PARTITION_HORZ_B: + DEC_BLOCK(mi_row, mi_col, subsize); + DEC_BLOCK(mi_row + hbs, mi_col, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2); + break; + case PARTITION_VERT_A: + DEC_BLOCK(mi_row, mi_col, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col, bsize2); + DEC_BLOCK(mi_row, mi_col + hbs, subsize); + break; + case PARTITION_VERT_B: + DEC_BLOCK(mi_row, mi_col, subsize); + DEC_BLOCK(mi_row, mi_col + hbs, bsize2); + DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2); + break; + case PARTITION_HORZ_4: + for (int i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= cm->mi_params.mi_rows) break; + DEC_BLOCK(this_mi_row, mi_col, subsize); + } + break; + case PARTITION_VERT_4: + for (int i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= cm->mi_params.mi_cols) break; + DEC_BLOCK(mi_row, this_mi_col, subsize); + } + break; + default: assert(0 && "Invalid partition type"); + } + +#undef DEC_PARTITION +#undef DEC_BLOCK +#undef DEC_BLOCK_EPT_ARG +#undef DEC_BLOCK_STX_ARG + + if (parse_decode_flag & 1) + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +} + +static AOM_INLINE void setup_bool_decoder( + const uint8_t *data, const uint8_t *data_end, const size_t read_size, + struct aom_internal_error_info *error_info, aom_reader *r, + uint8_t allow_update_cdf) { + // Validate the calculated partition length. If the buffer + // described by the partition can't be fully read, then restrict + // it to the portion that can be (for EC mode) or throw an error. 
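+  // The bounds check below runs before aom_reader_init() so that a
+  // truncated tile is rejected without ever initializing the reader;
+  // allow_update_cdf is then latched on the reader so callers can disable
+  // per-tile CDF adaptation in one place.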
+ if (!read_is_valid(data, read_size, data_end)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + + if (aom_reader_init(r, data, read_size)) + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Failed to allocate bool decoder %d", 1); + + r->allow_update_cdf = allow_update_cdf; +} + +static AOM_INLINE void setup_segmentation(AV1_COMMON *const cm, + struct aom_read_bit_buffer *rb) { + struct segmentation *const seg = &cm->seg; + + seg->update_map = 0; + seg->update_data = 0; + seg->temporal_update = 0; + + seg->enabled = aom_rb_read_bit(rb); + if (!seg->enabled) { + if (cm->cur_frame->seg_map) + memset(cm->cur_frame->seg_map, 0, + (cm->mi_params.mi_rows * cm->mi_params.mi_cols)); + + memset(seg, 0, sizeof(*seg)); + segfeatures_copy(&cm->cur_frame->seg, seg); + return; + } + if (cm->seg.enabled && cm->prev_frame && + (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) && + (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) { + cm->last_frame_seg_map = cm->prev_frame->seg_map; + } else { + cm->last_frame_seg_map = NULL; + } + // Read update flags + if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { + // These frames can't use previous frames, so must signal map + features + seg->update_map = 1; + seg->temporal_update = 0; + seg->update_data = 1; + } else { + seg->update_map = aom_rb_read_bit(rb); + if (seg->update_map) { + seg->temporal_update = aom_rb_read_bit(rb); + } else { + seg->temporal_update = 0; + } + seg->update_data = aom_rb_read_bit(rb); + } + + // Segmentation data update + if (seg->update_data) { + av1_clearall_segfeatures(seg); + + for (int i = 0; i < MAX_SEGMENTS; i++) { + for (int j = 0; j < SEG_LVL_MAX; j++) { + int data = 0; + const int feature_enabled = aom_rb_read_bit(rb); + if (feature_enabled) { + av1_enable_segfeature(seg, i, j); + + const int data_max = av1_seg_feature_data_max(j); + const int data_min = -data_max; + const int ubits = get_unsigned_bits(data_max); + + if (av1_is_segfeature_signed(j)) { + data = aom_rb_read_inv_signed_literal(rb, ubits); + } else { + data = aom_rb_read_literal(rb, ubits); + } + + data = clamp(data, data_min, data_max); + } + av1_set_segdata(seg, i, j, data); + } + } + av1_calculate_segdata(seg); + } else if (cm->prev_frame) { + segfeatures_copy(seg, &cm->prev_frame->seg); + } + segfeatures_copy(&cm->cur_frame->seg, seg); +} + +static AOM_INLINE void decode_restoration_mode(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + assert(!cm->features.all_lossless); + const int num_planes = av1_num_planes(cm); + if (cm->features.allow_intrabc) return; + int all_none = 1, chroma_none = 1; + for (int p = 0; p < num_planes; ++p) { + RestorationInfo *rsi = &cm->rst_info[p]; + if (aom_rb_read_bit(rb)) { + rsi->frame_restoration_type = + aom_rb_read_bit(rb) ? RESTORE_SGRPROJ : RESTORE_WIENER; + } else { + rsi->frame_restoration_type = + aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE; + } + if (rsi->frame_restoration_type != RESTORE_NONE) { + all_none = 0; + chroma_none &= p == 0; + } + } + if (!all_none) { + assert(cm->seq_params.sb_size == BLOCK_64X64 || + cm->seq_params.sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 
128 : 64; + + for (int p = 0; p < num_planes; ++p) + cm->rst_info[p].restoration_unit_size = sb_size; + + RestorationInfo *rsi = &cm->rst_info[0]; + + if (sb_size == 64) { + rsi->restoration_unit_size <<= aom_rb_read_bit(rb); + } + if (rsi->restoration_unit_size > 64) { + rsi->restoration_unit_size <<= aom_rb_read_bit(rb); + } + } else { + const int size = RESTORATION_UNITSIZE_MAX; + for (int p = 0; p < num_planes; ++p) + cm->rst_info[p].restoration_unit_size = size; + } + + if (num_planes > 1) { + int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y); + if (s && !chroma_none) { + cm->rst_info[1].restoration_unit_size = + cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s); + } else { + cm->rst_info[1].restoration_unit_size = + cm->rst_info[0].restoration_unit_size; + } + cm->rst_info[2].restoration_unit_size = + cm->rst_info[1].restoration_unit_size; + } +} + +static AOM_INLINE void read_wiener_filter(int wiener_win, + WienerInfo *wiener_info, + WienerInfo *ref_wiener_info, + aom_reader *rb) { + memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter)); + memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter)); + + if (wiener_win == WIENER_WIN) + wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) + + WIENER_FILT_TAP0_MINV; + else + wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1] = 0; + wiener_info->vfilter[1] = wiener_info->vfilter[WIENER_WIN - 2] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) + + WIENER_FILT_TAP1_MINV; + wiener_info->vfilter[2] = wiener_info->vfilter[WIENER_WIN - 3] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) + + WIENER_FILT_TAP2_MINV; + // The central element has an implicit +WIENER_FILT_STEP + wiener_info->vfilter[WIENER_HALFWIN] = + -2 * (wiener_info->vfilter[0] + wiener_info->vfilter[1] + + wiener_info->vfilter[2]); + + if (wiener_win == WIENER_WIN) + wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, ACCT_STR) + + WIENER_FILT_TAP0_MINV; + else + wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1] = 0; + wiener_info->hfilter[1] = wiener_info->hfilter[WIENER_WIN - 2] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) + + WIENER_FILT_TAP1_MINV; + wiener_info->hfilter[2] = wiener_info->hfilter[WIENER_WIN - 3] = + aom_read_primitive_refsubexpfin( + rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) + + WIENER_FILT_TAP2_MINV; + // The central element has an implicit +WIENER_FILT_STEP + wiener_info->hfilter[WIENER_HALFWIN] = + -2 * (wiener_info->hfilter[0] + wiener_info->hfilter[1] + + wiener_info->hfilter[2]); + memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); +} + +static AOM_INLINE void 
read_sgrproj_filter(SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info, + aom_reader *rb) { + sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR); + const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; + + if (params->r[0] == 0) { + sgrproj_info->xqd[0] = 0; + sgrproj_info->xqd[1] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) + + SGRPROJ_PRJ_MIN1; + } else if (params->r[1] == 0) { + sgrproj_info->xqd[0] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) + + SGRPROJ_PRJ_MIN0; + sgrproj_info->xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - sgrproj_info->xqd[0], + SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1); + } else { + sgrproj_info->xqd[0] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) + + SGRPROJ_PRJ_MIN0; + sgrproj_info->xqd[1] = + aom_read_primitive_refsubexpfin( + rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) + + SGRPROJ_PRJ_MIN1; + } + + memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); +} + +static AOM_INLINE void loop_restoration_read_sb_coeffs( + const AV1_COMMON *const cm, MACROBLOCKD *xd, aom_reader *const r, int plane, + int runit_idx) { + const RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationUnitInfo *rui = &rsi->unit_info[runit_idx]; + if (rsi->frame_restoration_type == RESTORE_NONE) return; + + assert(!cm->features.all_lossless); + + const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN; + WienerInfo *wiener_info = xd->wiener_info + plane; + SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane; + + if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) { + rui->restoration_type = + aom_read_symbol(r, xd->tile_ctx->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES, ACCT_STR); + switch (rui->restoration_type) { + case RESTORE_WIENER: + read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r); + break; + case RESTORE_SGRPROJ: + read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r); + break; + default: assert(rui->restoration_type == RESTORE_NONE); break; + } + } else if (rsi->frame_restoration_type == RESTORE_WIENER) { + if (aom_read_symbol(r, xd->tile_ctx->wiener_restore_cdf, 2, ACCT_STR)) { + rui->restoration_type = RESTORE_WIENER; + read_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, r); + } else { + rui->restoration_type = RESTORE_NONE; + } + } else if (rsi->frame_restoration_type == RESTORE_SGRPROJ) { + if (aom_read_symbol(r, xd->tile_ctx->sgrproj_restore_cdf, 2, ACCT_STR)) { + rui->restoration_type = RESTORE_SGRPROJ; + read_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, r); + } else { + rui->restoration_type = RESTORE_NONE; + } + } +} + +static AOM_INLINE void setup_loopfilter(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + const int num_planes = av1_num_planes(cm); + struct loopfilter *lf = &cm->lf; + + if (cm->features.allow_intrabc || cm->features.coded_lossless) { + // write default deltas to frame buffer + av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); + av1_set_default_mode_deltas(cm->cur_frame->mode_deltas); + return; + } + assert(!cm->features.coded_lossless); + if (cm->prev_frame) { + // write deltas to frame buffer + memcpy(lf->ref_deltas, 
cm->prev_frame->ref_deltas, REF_FRAMES); + memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS); + } else { + av1_set_default_ref_deltas(lf->ref_deltas); + av1_set_default_mode_deltas(lf->mode_deltas); + } + lf->filter_level[0] = aom_rb_read_literal(rb, 6); + lf->filter_level[1] = aom_rb_read_literal(rb, 6); + if (num_planes > 1) { + if (lf->filter_level[0] || lf->filter_level[1]) { + lf->filter_level_u = aom_rb_read_literal(rb, 6); + lf->filter_level_v = aom_rb_read_literal(rb, 6); + } + } + lf->sharpness_level = aom_rb_read_literal(rb, 3); + + // Read in loop filter deltas applied at the MB level based on mode or ref + // frame. + lf->mode_ref_delta_update = 0; + + lf->mode_ref_delta_enabled = aom_rb_read_bit(rb); + if (lf->mode_ref_delta_enabled) { + lf->mode_ref_delta_update = aom_rb_read_bit(rb); + if (lf->mode_ref_delta_update) { + for (int i = 0; i < REF_FRAMES; i++) + if (aom_rb_read_bit(rb)) + lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6); + + for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) + if (aom_rb_read_bit(rb)) + lf->mode_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6); + } + } + + // write deltas to frame buffer + memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, REF_FRAMES); + memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS); +} + +static AOM_INLINE void setup_cdef(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + const int num_planes = av1_num_planes(cm); + CdefInfo *const cdef_info = &cm->cdef_info; + + if (cm->features.allow_intrabc) return; + cdef_info->cdef_damping = aom_rb_read_literal(rb, 2) + 3; + cdef_info->cdef_bits = aom_rb_read_literal(rb, 2); + cdef_info->nb_cdef_strengths = 1 << cdef_info->cdef_bits; + for (int i = 0; i < cdef_info->nb_cdef_strengths; i++) { + cdef_info->cdef_strengths[i] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS); + cdef_info->cdef_uv_strengths[i] = + num_planes > 1 ? aom_rb_read_literal(rb, CDEF_STRENGTH_BITS) : 0; + } +} + +static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) { + return aom_rb_read_bit(rb) ? 
aom_rb_read_inv_signed_literal(rb, 6) : 0; +} + +static AOM_INLINE void setup_quantization(CommonQuantParams *quant_params, + int num_planes, + bool separate_uv_delta_q, + struct aom_read_bit_buffer *rb) { + quant_params->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS); + quant_params->y_dc_delta_q = read_delta_q(rb); + if (num_planes > 1) { + int diff_uv_delta = 0; + if (separate_uv_delta_q) diff_uv_delta = aom_rb_read_bit(rb); + quant_params->u_dc_delta_q = read_delta_q(rb); + quant_params->u_ac_delta_q = read_delta_q(rb); + if (diff_uv_delta) { + quant_params->v_dc_delta_q = read_delta_q(rb); + quant_params->v_ac_delta_q = read_delta_q(rb); + } else { + quant_params->v_dc_delta_q = quant_params->u_dc_delta_q; + quant_params->v_ac_delta_q = quant_params->u_ac_delta_q; + } + } else { + quant_params->u_dc_delta_q = 0; + quant_params->u_ac_delta_q = 0; + quant_params->v_dc_delta_q = 0; + quant_params->v_ac_delta_q = 0; + } + quant_params->using_qmatrix = aom_rb_read_bit(rb); + if (quant_params->using_qmatrix) { + quant_params->qmatrix_level_y = aom_rb_read_literal(rb, QM_LEVEL_BITS); + quant_params->qmatrix_level_u = aom_rb_read_literal(rb, QM_LEVEL_BITS); + if (!separate_uv_delta_q) + quant_params->qmatrix_level_v = quant_params->qmatrix_level_u; + else + quant_params->qmatrix_level_v = aom_rb_read_literal(rb, QM_LEVEL_BITS); + } else { + quant_params->qmatrix_level_y = 0; + quant_params->qmatrix_level_u = 0; + quant_params->qmatrix_level_v = 0; + } +} + +// Build y/uv dequant values based on segmentation. +static AOM_INLINE void setup_segmentation_dequant(AV1_COMMON *const cm, + MACROBLOCKD *const xd) { + const int bit_depth = cm->seq_params.bit_depth; + // When segmentation is disabled, only the first value is used. The + // remaining are don't cares. + const int max_segments = cm->seg.enabled ? MAX_SEGMENTS : 1; + CommonQuantParams *const quant_params = &cm->quant_params; + for (int i = 0; i < max_segments; ++i) { + const int qindex = xd->qindex[i]; + quant_params->y_dequant_QTX[i][0] = + av1_dc_quant_QTX(qindex, quant_params->y_dc_delta_q, bit_depth); + quant_params->y_dequant_QTX[i][1] = av1_ac_quant_QTX(qindex, 0, bit_depth); + quant_params->u_dequant_QTX[i][0] = + av1_dc_quant_QTX(qindex, quant_params->u_dc_delta_q, bit_depth); + quant_params->u_dequant_QTX[i][1] = + av1_ac_quant_QTX(qindex, quant_params->u_ac_delta_q, bit_depth); + quant_params->v_dequant_QTX[i][0] = + av1_dc_quant_QTX(qindex, quant_params->v_dc_delta_q, bit_depth); + quant_params->v_dequant_QTX[i][1] = + av1_ac_quant_QTX(qindex, quant_params->v_ac_delta_q, bit_depth); + const int use_qmatrix = av1_use_qmatrix(quant_params, xd, i); + // NB: depends on base index so there is only 1 set per frame + // No quant weighting when lossless or signalled not using QM + const int qmlevel_y = + use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1; + for (int j = 0; j < TX_SIZES_ALL; ++j) { + quant_params->y_iqmatrix[i][j] = + av1_iqmatrix(quant_params, qmlevel_y, AOM_PLANE_Y, j); + } + const int qmlevel_u = + use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1; + for (int j = 0; j < TX_SIZES_ALL; ++j) { + quant_params->u_iqmatrix[i][j] = + av1_iqmatrix(quant_params, qmlevel_u, AOM_PLANE_U, j); + } + const int qmlevel_v = + use_qmatrix ? 
quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1; + for (int j = 0; j < TX_SIZES_ALL; ++j) { + quant_params->v_iqmatrix[i][j] = + av1_iqmatrix(quant_params, qmlevel_v, AOM_PLANE_V, j); + } + } +} + +static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) { + return aom_rb_read_bit(rb) ? SWITCHABLE + : aom_rb_read_literal(rb, LOG_SWITCHABLE_FILTERS); +} + +static AOM_INLINE void setup_render_size(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + cm->render_width = cm->superres_upscaled_width; + cm->render_height = cm->superres_upscaled_height; + if (aom_rb_read_bit(rb)) + av1_read_frame_size(rb, 16, 16, &cm->render_width, &cm->render_height); +} + +// TODO(afergs): make "struct aom_read_bit_buffer *const rb"? +static AOM_INLINE void setup_superres(AV1_COMMON *const cm, + struct aom_read_bit_buffer *rb, + int *width, int *height) { + cm->superres_upscaled_width = *width; + cm->superres_upscaled_height = *height; + + const SequenceHeader *const seq_params = &cm->seq_params; + if (!seq_params->enable_superres) return; + + if (aom_rb_read_bit(rb)) { + cm->superres_scale_denominator = + (uint8_t)aom_rb_read_literal(rb, SUPERRES_SCALE_BITS); + cm->superres_scale_denominator += SUPERRES_SCALE_DENOMINATOR_MIN; + // Don't edit cm->width or cm->height directly, or the buffers won't get + // resized correctly + av1_calculate_scaled_superres_size(width, height, + cm->superres_scale_denominator); + } else { + // 1:1 scaling - ie. no scaling, scale not provided + cm->superres_scale_denominator = SCALE_NUMERATOR; + } +} + +static AOM_INLINE void resize_context_buffers(AV1_COMMON *cm, int width, + int height) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Dimensions of %dx%d beyond allowed size of %dx%d.", + width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); +#endif + if (cm->width != width || cm->height != height) { + const int new_mi_rows = + ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2; + const int new_mi_cols = + ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2; + + // Allocations in av1_alloc_context_buffers() depend on individual + // dimensions as well as the overall size. + if (new_mi_cols > cm->mi_params.mi_cols || + new_mi_rows > cm->mi_params.mi_rows) { + if (av1_alloc_context_buffers(cm, width, height)) { + // The cm->mi_* values have been cleared and any existing context + // buffers have been freed. Clear cm->width and cm->height to be + // consistent and to force a realloc next time. 
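+        // Zeroing the dimensions makes the next resize_context_buffers()
+        // call see a size change and take the allocation path again rather
+        // than reuse the freed buffers.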
+ cm->width = 0; + cm->height = 0; + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + } else { + cm->mi_params.set_mb_mi(&cm->mi_params, width, height); + } + av1_init_mi_buffers(&cm->mi_params); + cm->width = width; + cm->height = height; + } + + ensure_mv_buffer(cm->cur_frame, cm); + cm->cur_frame->width = cm->width; + cm->cur_frame->height = cm->height; +} + +static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) { + BufferPool *const pool = cm->buffer_pool; + const SequenceHeader *const seq_params = &cm->seq_params; + + lock_buffer_pool(pool); + if (aom_realloc_frame_buffer( + &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment, + &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { + unlock_buffer_pool(pool); + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + unlock_buffer_pool(pool); + + cm->cur_frame->buf.bit_depth = (unsigned int)seq_params->bit_depth; + cm->cur_frame->buf.color_primaries = seq_params->color_primaries; + cm->cur_frame->buf.transfer_characteristics = + seq_params->transfer_characteristics; + cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; + cm->cur_frame->buf.monochrome = seq_params->monochrome; + cm->cur_frame->buf.chroma_sample_position = + seq_params->chroma_sample_position; + cm->cur_frame->buf.color_range = seq_params->color_range; + cm->cur_frame->buf.render_width = cm->render_width; + cm->cur_frame->buf.render_height = cm->render_height; +} + +static AOM_INLINE void setup_frame_size(AV1_COMMON *cm, + int frame_size_override_flag, + struct aom_read_bit_buffer *rb) { + const SequenceHeader *const seq_params = &cm->seq_params; + int width, height; + + if (frame_size_override_flag) { + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; + av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); + if (width > seq_params->max_frame_width || + height > seq_params->max_frame_height) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Frame dimensions are larger than the maximum values"); + } + } else { + width = seq_params->max_frame_width; + height = seq_params->max_frame_height; + } + + setup_superres(cm, rb, &width, &height); + resize_context_buffers(cm, width, height); + setup_render_size(cm, rb); + setup_buffer_pool(cm); +} + +static AOM_INLINE void setup_sb_size(SequenceHeader *seq_params, + struct aom_read_bit_buffer *rb) { + set_sb_size(seq_params, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64); +} + +static INLINE int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth, + int ref_xss, int ref_yss, + aom_bit_depth_t this_bit_depth, + int this_xss, int this_yss) { + return ref_bit_depth == this_bit_depth && ref_xss == this_xss && + ref_yss == this_yss; +} + +static AOM_INLINE void setup_frame_size_with_refs( + AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { + int width, height; + int found = 0; + int has_valid_ref_frame = 0; + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + if (aom_rb_read_bit(rb)) { + const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i); + // This will never be NULL in a normal stream, as streams are required to + // have a shown keyframe before any inter frames, which would refresh all + // the reference buffers. 
However, it might be null if we're starting in + // the middle of a stream, and static analysis will error if we don't do + // a null check here. + if (ref_buf == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid condition: invalid reference buffer"); + } else { + const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf; + width = buf->y_crop_width; + height = buf->y_crop_height; + cm->render_width = buf->render_width; + cm->render_height = buf->render_height; + setup_superres(cm, rb, &width, &height); + resize_context_buffers(cm, width, height); + found = 1; + break; + } + } + } + + const SequenceHeader *const seq_params = &cm->seq_params; + if (!found) { + int num_bits_width = seq_params->num_bits_width; + int num_bits_height = seq_params->num_bits_height; + + av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height); + setup_superres(cm, rb, &width, &height); + resize_context_buffers(cm, width, height); + setup_render_size(cm, rb); + } + + if (width <= 0 || height <= 0) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid frame size"); + + // Check to make sure at least one of the frames that this frame references + // has valid dimensions. + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i); + has_valid_ref_frame |= + valid_ref_frame_size(ref_frame->buf.y_crop_width, + ref_frame->buf.y_crop_height, width, height); + } + if (!has_valid_ref_frame) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Referenced frame has invalid size"); + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i); + if (!valid_ref_frame_img_fmt( + ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x, + ref_frame->buf.subsampling_y, seq_params->bit_depth, + seq_params->subsampling_x, seq_params->subsampling_y)) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Referenced frame has incompatible color format"); + } + setup_buffer_pool(cm); +} + +// Same function as av1_read_uniform but reading from the uncompressed header rb +static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + const int v = aom_rb_read_literal(rb, l - 1); + assert(l != 0); + if (v < m) + return v; + else + return (v << 1) - m + aom_rb_read_bit(rb); +} + +static AOM_INLINE void read_tile_info_max_tile( + AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) { + const SequenceHeader *const seq_params = &cm->seq_params; + CommonTileParams *const tiles = &cm->tiles; + int width_mi = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2); + int height_mi = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, seq_params->mib_size_log2); + int width_sb = width_mi >> seq_params->mib_size_log2; + int height_sb = height_mi >> seq_params->mib_size_log2; + + av1_get_tile_limits(cm); + tiles->uniform_spacing = aom_rb_read_bit(rb); + + // Read tile columns + if (tiles->uniform_spacing) { + tiles->log2_cols = tiles->min_log2_cols; + while (tiles->log2_cols < tiles->max_log2_cols) { + if (!aom_rb_read_bit(rb)) { + break; + } + tiles->log2_cols++; + } + } else { + int i; + int start_sb; + for (i = 0, start_sb = 0; width_sb > 0 && i < MAX_TILE_COLS; i++) { + const int size_sb = + 1 + rb_read_uniform(rb, AOMMIN(width_sb, tiles->max_width_sb)); + tiles->col_start_sb[i] = start_sb; + start_sb += size_sb; + width_sb -= size_sb; + } + tiles->cols = i; + 
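+    // Worked example: with width_sb = 10 and coded widths 4 and 6, the loop
+    // leaves col_start_sb = { 0, 4 } and cols = 2; the assignment below then
+    // stores the sentinel 10, marking the right frame edge in superblocks.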
tiles->col_start_sb[i] = start_sb + width_sb; + } + av1_calculate_tile_cols(seq_params, cm->mi_params.mi_rows, + cm->mi_params.mi_cols, tiles); + + // Read tile rows + if (tiles->uniform_spacing) { + tiles->log2_rows = tiles->min_log2_rows; + while (tiles->log2_rows < tiles->max_log2_rows) { + if (!aom_rb_read_bit(rb)) { + break; + } + tiles->log2_rows++; + } + } else { + int i; + int start_sb; + for (i = 0, start_sb = 0; height_sb > 0 && i < MAX_TILE_ROWS; i++) { + const int size_sb = + 1 + rb_read_uniform(rb, AOMMIN(height_sb, tiles->max_height_sb)); + tiles->row_start_sb[i] = start_sb; + start_sb += size_sb; + height_sb -= size_sb; + } + tiles->rows = i; + tiles->row_start_sb[i] = start_sb + height_sb; + } + av1_calculate_tile_rows(seq_params, cm->mi_params.mi_rows, tiles); +} + +void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm) { + cm->tiles.single_tile_decoding = 0; + if (cm->tiles.large_scale) { + struct loopfilter *lf = &cm->lf; + RestorationInfo *const rst_info = cm->rst_info; + const CdefInfo *const cdef_info = &cm->cdef_info; + + // Figure out single_tile_decoding by loopfilter_level. + const int no_loopfilter = !(lf->filter_level[0] || lf->filter_level[1]); + const int no_cdef = cdef_info->cdef_bits == 0 && + cdef_info->cdef_strengths[0] == 0 && + cdef_info->cdef_uv_strengths[0] == 0; + const int no_restoration = + rst_info[0].frame_restoration_type == RESTORE_NONE && + rst_info[1].frame_restoration_type == RESTORE_NONE && + rst_info[2].frame_restoration_type == RESTORE_NONE; + assert(IMPLIES(cm->features.coded_lossless, no_loopfilter && no_cdef)); + assert(IMPLIES(cm->features.all_lossless, no_restoration)); + cm->tiles.single_tile_decoding = no_loopfilter && no_cdef && no_restoration; + } +} + +static AOM_INLINE void read_tile_info(AV1Decoder *const pbi, + struct aom_read_bit_buffer *const rb) { + AV1_COMMON *const cm = &pbi->common; + + read_tile_info_max_tile(cm, rb); + + pbi->context_update_tile_id = 0; + if (cm->tiles.rows * cm->tiles.cols > 1) { + // tile to use for cdf update + pbi->context_update_tile_id = + aom_rb_read_literal(rb, cm->tiles.log2_rows + cm->tiles.log2_cols); + if (pbi->context_update_tile_id >= cm->tiles.rows * cm->tiles.cols) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid context_update_tile_id"); + } + // tile size magnitude + pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1; + } +} + +#if EXT_TILE_DEBUG +static AOM_INLINE void read_ext_tile_info( + AV1Decoder *const pbi, struct aom_read_bit_buffer *const rb) { + AV1_COMMON *const cm = &pbi->common; + + // This information is stored as a separate byte. + int mod = rb->bit_offset % CHAR_BIT; + if (mod > 0) aom_rb_read_literal(rb, CHAR_BIT - mod); + assert(rb->bit_offset % CHAR_BIT == 0); + + if (cm->tiles.cols * cm->tiles.rows > 1) { + // Read the number of bytes used to store tile size + pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1; + pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1; + } +} +#endif // EXT_TILE_DEBUG + +static size_t mem_get_varsize(const uint8_t *src, int sz) { + switch (sz) { + case 1: return src[0]; + case 2: return mem_get_le16(src); + case 3: return mem_get_le24(src); + case 4: return mem_get_le32(src); + default: assert(0 && "Invalid size"); return -1; + } +} + +#if EXT_TILE_DEBUG +// Reads the next tile returning its size and adjusting '*data' accordingly +// based on 'is_last'. On return, '*data' is updated to point to the end of the +// raw tile buffer in the bit stream. 
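+// When tile_copy_mode is set, a size word whose top bit is set is not a
+// length: the low 7 bits of its top byte give a row offset, and the tile
+// reuses the coded data of the tile that many rows above in the same column.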
+static AOM_INLINE void get_ls_tile_buffer( + const uint8_t *const data_end, struct aom_internal_error_info *error_info, + const uint8_t **data, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], + int tile_size_bytes, int col, int row, int tile_copy_mode) { + size_t size; + + size_t copy_size = 0; + const uint8_t *copy_data = NULL; + + if (!read_is_valid(*data, tile_size_bytes, data_end)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile length"); + size = mem_get_varsize(*data, tile_size_bytes); + + // If tile_copy_mode = 1, then the top bit of the tile header indicates copy + // mode. + if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) { + // The remaining bits in the top byte signal the row offset + int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f; + + // Currently, only use tiles in same column as reference tiles. + copy_data = tile_buffers[row - offset][col].data; + copy_size = tile_buffers[row - offset][col].size; + size = 0; + } else { + size += AV1_MIN_TILE_SIZE_BYTES; + } + + *data += tile_size_bytes; + + if (size > (size_t)(data_end - *data)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile size"); + + if (size > 0) { + tile_buffers[row][col].data = *data; + tile_buffers[row][col].size = size; + } else { + tile_buffers[row][col].data = copy_data; + tile_buffers[row][col].size = copy_size; + } + + *data += size; +} + +// Returns the end of the last tile buffer +// (tile_buffers[cm->tiles.rows - 1][cm->tiles.cols - 1]). +static const uint8_t *get_ls_tile_buffers( + AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, + TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) { + AV1_COMMON *const cm = &pbi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + const int have_tiles = tile_cols * tile_rows > 1; + const uint8_t *raw_data_end; // The end of the last tile buffer + + if (!have_tiles) { + const size_t tile_size = data_end - data; + tile_buffers[0][0].data = data; + tile_buffers[0][0].size = tile_size; + raw_data_end = NULL; + } else { + // We locate only the tile buffers that are required, which are the ones + // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always + // need the last (bottom right) tile buffer, as we need to know where the + // end of the compressed frame buffer is for proper superframe decoding. + + const uint8_t *tile_col_data_end[MAX_TILE_COLS] = { NULL }; + const uint8_t *const data_start = data; + + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int tile_rows_start = single_row ? dec_tile_row : 0; + const int tile_rows_end = single_row ? tile_rows_start + 1 : tile_rows; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + const int tile_cols_start = single_col ? dec_tile_col : 0; + const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; + + const int tile_col_size_bytes = pbi->tile_col_size_bytes; + const int tile_size_bytes = pbi->tile_size_bytes; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_copy_mode = + ((AOMMAX(tile_width, tile_height) << MI_SIZE_LOG2) <= 256) ? 
1 : 0; + // Read tile column sizes for all columns (we need the last tile buffer) + for (int c = 0; c < tile_cols; ++c) { + const int is_last = c == tile_cols - 1; + size_t tile_col_size; + + if (!is_last) { + tile_col_size = mem_get_varsize(data, tile_col_size_bytes); + data += tile_col_size_bytes; + tile_col_data_end[c] = data + tile_col_size; + } else { + tile_col_size = data_end - data; + tile_col_data_end[c] = data_end; + } + data += tile_col_size; + } + + data = data_start; + + // Read the required tile sizes. + for (int c = tile_cols_start; c < tile_cols_end; ++c) { + const int is_last = c == tile_cols - 1; + + if (c > 0) data = tile_col_data_end[c - 1]; + + if (!is_last) data += tile_col_size_bytes; + + // Get the whole of the last column, otherwise stop at the required tile. + for (int r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) { + get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data, + tile_buffers, tile_size_bytes, c, r, tile_copy_mode); + } + } + + // If we have not read the last column, then read it to get the last tile. + if (tile_cols_end != tile_cols) { + const int c = tile_cols - 1; + + data = tile_col_data_end[c - 1]; + + for (int r = 0; r < tile_rows; ++r) { + get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data, + tile_buffers, tile_size_bytes, c, r, tile_copy_mode); + } + } + raw_data_end = data; + } + return raw_data_end; +} +#endif // EXT_TILE_DEBUG + +static const uint8_t *get_ls_single_tile_buffer( + AV1Decoder *pbi, const uint8_t *data, + TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) { + assert(pbi->dec_tile_row >= 0 && pbi->dec_tile_col >= 0); + tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].data = data; + tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col].size = + (size_t)pbi->coded_tile_data_size; + return data + pbi->coded_tile_data_size; +} + +// Reads the next tile returning its size and adjusting '*data' accordingly +// based on 'is_last'.
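+// For illustration (hypothetical header bytes): with tile_size_bytes == 2,
+// the bytes { 0x2a, 0x01 } decode little-endian to 0x012a == 298, so the
+// tile payload occupies 298 + AV1_MIN_TILE_SIZE_BYTES bytes after the size
+// field. The last tile carries no size field; its payload simply runs to
+// data_end.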
+static AOM_INLINE void get_tile_buffer( + const uint8_t *const data_end, const int tile_size_bytes, int is_last, + struct aom_internal_error_info *error_info, const uint8_t **data, + TileBufferDec *const buf) { + size_t size; + + if (!is_last) { + if (!read_is_valid(*data, tile_size_bytes, data_end)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Not enough data to read tile size"); + + size = mem_get_varsize(*data, tile_size_bytes) + AV1_MIN_TILE_SIZE_BYTES; + *data += tile_size_bytes; + + if (size > (size_t)(data_end - *data)) + aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt tile size"); + } else { + size = data_end - *data; + } + + buf->data = *data; + buf->size = size; + + *data += size; +} + +static AOM_INLINE void get_tile_buffers( + AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end, + TileBufferDec (*const tile_buffers)[MAX_TILE_COLS], int start_tile, + int end_tile) { + AV1_COMMON *const cm = &pbi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tc = 0; + + for (int r = 0; r < tile_rows; ++r) { + for (int c = 0; c < tile_cols; ++c, ++tc) { + TileBufferDec *const buf = &tile_buffers[r][c]; + + const int is_last = (tc == end_tile); + const size_t hdr_offset = 0; + + if (tc < start_tile || tc > end_tile) continue; + + if (data + hdr_offset >= data_end) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Data ended before all tiles were read."); + data += hdr_offset; + get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, + &pbi->common.error, &data, buf); + } + } +} + +static AOM_INLINE void set_cb_buffer(AV1Decoder *pbi, MACROBLOCKD *const xd, + CB_BUFFER *cb_buffer_base, + const int num_planes, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &pbi->common; + int mib_size_log2 = cm->seq_params.mib_size_log2; + int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1; + int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); + CB_BUFFER *cb_buffer = cb_buffer_base + offset; + + for (int plane = 0; plane < num_planes; ++plane) { + xd->plane[plane].dqcoeff_block = cb_buffer->dqcoeff[plane]; + xd->plane[plane].eob_data = cb_buffer->eob_data[plane]; + xd->cb_offset[plane] = 0; + xd->txb_offset[plane] = 0; + } + xd->plane[0].color_index_map = cb_buffer->color_index_map[0]; + xd->plane[1].color_index_map = cb_buffer->color_index_map[1]; + xd->color_index_map_offset[0] = 0; + xd->color_index_map_offset[1] = 0; +} + +static AOM_INLINE void decoder_alloc_tile_data(AV1Decoder *pbi, + const int n_tiles) { + AV1_COMMON *const cm = &pbi->common; + aom_free(pbi->tile_data); + CHECK_MEM_ERROR(cm, pbi->tile_data, + aom_memalign(32, n_tiles * sizeof(*pbi->tile_data))); + pbi->allocated_tiles = n_tiles; + for (int i = 0; i < n_tiles; i++) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_zero(tile_data->dec_row_mt_sync); + } + pbi->allocated_row_mt_sync_rows = 0; +} + +// Set up nsync by width. +static INLINE int get_sync_range(int width) { +// nsync numbers are picked by testing. 
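+// For illustration: with the disabled table below, a 1920-pixel-wide frame
+// would use nsync == 4, i.e. a superblock row could only advance once the
+// row above it is at least four superblocks ahead (see sync_read() below);
+// the shipped code always returns 1.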
+#if 0 + if (width < 640) + return 1; + else if (width <= 1280) + return 2; + else if (width <= 4096) + return 4; + else + return 8; +#else + (void)width; +#endif + return 1; +} + +// Allocate memory for decoder row synchronization +static AOM_INLINE void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync, + AV1_COMMON *cm, int rows) { + dec_row_mt_sync->allocated_sb_rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(cm, dec_row_mt_sync->mutex_, + aom_malloc(sizeof(*(dec_row_mt_sync->mutex_)) * rows)); + if (dec_row_mt_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&dec_row_mt_sync->mutex_[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, dec_row_mt_sync->cond_, + aom_malloc(sizeof(*(dec_row_mt_sync->cond_)) * rows)); + if (dec_row_mt_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&dec_row_mt_sync->cond_[i], NULL); + } + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(cm, dec_row_mt_sync->cur_sb_col, + aom_malloc(sizeof(*(dec_row_mt_sync->cur_sb_col)) * rows)); + + // Set up nsync. + dec_row_mt_sync->sync_range = get_sync_range(cm->width); +} + +// Deallocate decoder row synchronization related mutex and data +void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) { + if (dec_row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + if (dec_row_mt_sync->mutex_ != NULL) { + for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) { + pthread_mutex_destroy(&dec_row_mt_sync->mutex_[i]); + } + aom_free(dec_row_mt_sync->mutex_); + } + if (dec_row_mt_sync->cond_ != NULL) { + for (i = 0; i < dec_row_mt_sync->allocated_sb_rows; ++i) { + pthread_cond_destroy(&dec_row_mt_sync->cond_[i]); + } + aom_free(dec_row_mt_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + aom_free(dec_row_mt_sync->cur_sb_col); + + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail.
+ av1_zero(*dec_row_mt_sync); + } +} + +static INLINE void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r, + int c) { +#if CONFIG_MULTITHREAD + const int nsync = dec_row_mt_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &dec_row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > dec_row_mt_sync->cur_sb_col[r - 1] - nsync) { + pthread_cond_wait(&dec_row_mt_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)dec_row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r, + int c, const int sb_cols) { +#if CONFIG_MULTITHREAD + const int nsync = dec_row_mt_sync->sync_range; + int cur; + int sig = 1; + + if (c < sb_cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = sb_cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&dec_row_mt_sync->mutex_[r]); + + dec_row_mt_sync->cur_sb_col[r] = cur; + + pthread_cond_signal(&dec_row_mt_sync->cond_[r]); + pthread_mutex_unlock(&dec_row_mt_sync->mutex_[r]); + } +#else + (void)dec_row_mt_sync; + (void)r; + (void)c; + (void)sb_cols; +#endif // CONFIG_MULTITHREAD +} + +static AOM_INLINE void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td, + TileInfo tile_info, + const int mi_row) { + AV1_COMMON *const cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + TileDataDec *const tile_data = + pbi->tile_data + tile_info.tile_row * cm->tiles.cols + tile_info.tile_col; + const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info); + const int sb_row_in_tile = + (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2; + int sb_col_in_tile = 0; + + for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->seq_params.mib_size, sb_col_in_tile++) { + set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row, + mi_col); + + sync_read(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile); + + // Decoding of the super-block + decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, + cm->seq_params.sb_size, 0x2); + + sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile, + sb_cols_in_tile); + } +} + +static int check_trailing_bits_after_symbol_coder(aom_reader *r) { + if (aom_reader_has_overflowed(r)) return -1; + + uint32_t nb_bits = aom_reader_tell(r); + uint32_t nb_bytes = (nb_bits + 7) >> 3; + const uint8_t *p = aom_reader_find_begin(r) + nb_bytes; + + // aom_reader_tell() returns 1 for a newly initialized decoder, and the + // return value only increases as values are decoded. So nb_bits > 0, and + // thus p > p_begin. Therefore accessing p[-1] is safe. + uint8_t last_byte = p[-1]; + uint8_t pattern = 128 >> ((nb_bits - 1) & 7); + if ((last_byte & (2 * pattern - 1)) != pattern) return -1; + + // Make sure that all padding bytes are zero as required by the spec. 
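+  // For illustration: if aom_reader_tell() reported 11 bits consumed,
+  // nb_bytes would be 2 and pattern 128 >> 2 == 0x20, so the check above
+  // requires the low six bits of the last byte to be exactly 0b100000 (the
+  // trailing one bit followed by zeros), and every byte from here up to
+  // aom_reader_find_end() must be 0x00.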
+ const uint8_t *p_end = aom_reader_find_end(r); + while (p < p_end) { + if (*p != 0) return -1; + p++; + } + return 0; +} + +static AOM_INLINE void set_decode_func_pointers(ThreadData *td, + int parse_decode_flag) { + td->read_coeffs_tx_intra_block_visit = decode_block_void; + td->predict_and_recon_intra_block_visit = decode_block_void; + td->read_coeffs_tx_inter_block_visit = decode_block_void; + td->inverse_tx_inter_block_visit = decode_block_void; + td->predict_inter_block_visit = predict_inter_block_void; + td->cfl_store_inter_block_visit = cfl_store_inter_block_void; + + if (parse_decode_flag & 0x1) { + td->read_coeffs_tx_intra_block_visit = read_coeffs_tx_intra_block; + td->read_coeffs_tx_inter_block_visit = av1_read_coeffs_txb_facade; + } + if (parse_decode_flag & 0x2) { + td->predict_and_recon_intra_block_visit = + predict_and_reconstruct_intra_block; + td->inverse_tx_inter_block_visit = inverse_transform_inter_block; + td->predict_inter_block_visit = predict_inter_block; + td->cfl_store_inter_block_visit = cfl_store_inter_block; + } +} + +static AOM_INLINE void decode_tile(AV1Decoder *pbi, ThreadData *const td, + int tile_row, int tile_col) { + TileInfo tile_info; + + AV1_COMMON *const cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + + av1_tile_set_row(&tile_info, cm, tile_row); + av1_tile_set_col(&tile_info, cm, tile_col); + av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start, + tile_info.mi_col_end, tile_row); + av1_reset_loop_filter_delta(&td->xd, num_planes); + av1_reset_loop_restoration(&td->xd, num_planes); + + for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; + mi_row += cm->seq_params.mib_size) { + av1_zero_left_context(&td->xd); + + for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->seq_params.mib_size) { + set_cb_buffer(pbi, &td->xd, &td->cb_buffer_base, num_planes, 0, 0); + + // Bit-stream parsing and decoding of the superblock + decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, + cm->seq_params.sb_size, 0x3); + + if (aom_reader_has_overflowed(td->bit_reader)) { + aom_merge_corrupted_flag(&td->xd.corrupted, 1); + return; + } + } + } + + int corrupted = + (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0; + aom_merge_corrupted_flag(&td->xd.corrupted, corrupted); +} + +static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, int start_tile, + int end_tile) { + AV1_COMMON *const cm = &pbi->common; + ThreadData *const td = &pbi->td; + CommonTileParams *const tiles = &cm->tiles; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + const int n_tiles = tile_cols * tile_rows; + TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int inv_col_order; + int inv_row_order; + int tile_row, tile_col; + uint8_t allow_update_cdf; + const uint8_t *raw_data_end = NULL; + + if (tiles->large_scale) { + tile_rows_start = single_row ? dec_tile_row : 0; + tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; + tile_cols_start = single_col ? dec_tile_col : 0; + tile_cols_end = single_col ? 
tile_cols_start + 1 : tile_cols; + inv_col_order = pbi->inv_tile_order && !single_col; + inv_row_order = pbi->inv_tile_order && !single_row; + allow_update_cdf = 0; + } else { + tile_rows_start = 0; + tile_rows_end = tile_rows; + tile_cols_start = 0; + tile_cols_end = tile_cols; + inv_col_order = pbi->inv_tile_order; + inv_row_order = pbi->inv_tile_order; + allow_update_cdf = 1; + } + + // No tiles to decode. + if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || + // First tile is larger than end_tile. + tile_rows_start * tiles->cols + tile_cols_start > end_tile || + // Last tile is smaller than start_tile. + (tile_rows_end - 1) * tiles->cols + tile_cols_end - 1 < start_tile) + return data; + + allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update; + + assert(tile_rows <= MAX_TILE_ROWS); + assert(tile_cols <= MAX_TILE_COLS); + +#if EXT_TILE_DEBUG + if (tiles->large_scale && !pbi->ext_tile_debug) + raw_data_end = get_ls_single_tile_buffer(pbi, data, tile_buffers); + else if (tiles->large_scale && pbi->ext_tile_debug) + raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); + else +#endif // EXT_TILE_DEBUG + get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); + + if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { + decoder_alloc_tile_data(pbi, n_tiles); + } +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + aom_accounting_reset(&pbi->accounting); + } +#endif + + set_decode_func_pointers(&pbi->td, 0x3); + + // Load all tile information into thread_data. + td->xd = pbi->mb; + td->xd.corrupted = 0; + td->xd.mc_buf[0] = td->mc_buf[0]; + td->xd.mc_buf[1] = td->mc_buf[1]; + td->xd.tmp_conv_dst = td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + td->xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j]; + } + + for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { + const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row; + + for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) { + const int col = inv_col_order ? 
tile_cols - 1 - tile_col : tile_col; + TileDataDec *const tile_data = pbi->tile_data + row * tiles->cols + col; + const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col]; + + if (row * tiles->cols + col < start_tile || + row * tiles->cols + col > end_tile) + continue; + + td->bit_reader = &tile_data->bit_reader; + av1_zero(td->cb_buffer_base.dqcoeff); + av1_tile_init(&td->xd.tile, cm, row, col); + td->xd.current_qindex = cm->quant_params.base_qindex; + setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size, + &cm->error, td->bit_reader, allow_update_cdf); +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + td->bit_reader->accounting = &pbi->accounting; + td->bit_reader->accounting->last_tell_frac = + aom_reader_tell_frac(td->bit_reader); + } else { + td->bit_reader->accounting = NULL; + } +#endif + av1_init_macroblockd(cm, &td->xd, NULL); + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), row, + &td->xd); + + // Initialise the tile context from the frame context + tile_data->tctx = *cm->fc; + td->xd.tile_ctx = &tile_data->tctx; + + // decode tile + decode_tile(pbi, td, row, col); + aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted); + if (pbi->mb.corrupted) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + } + } + + if (tiles->large_scale) { + if (n_tiles == 1) { + // Find the end of the single tile buffer + return aom_reader_find_end(&pbi->tile_data->bit_reader); + } + // Return the end of the last tile buffer + return raw_data_end; + } + TileDataDec *const tile_data = pbi->tile_data + end_tile; + + return aom_reader_find_end(&tile_data->bit_reader); +} + +static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) { + TileJobsDec *cur_job_info = NULL; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(tile_mt_info->job_mutex); + + if (tile_mt_info->jobs_dequeued < tile_mt_info->jobs_enqueued) { + cur_job_info = tile_mt_info->job_queue + tile_mt_info->jobs_dequeued; + tile_mt_info->jobs_dequeued++; + } + + pthread_mutex_unlock(tile_mt_info->job_mutex); +#else + (void)tile_mt_info; +#endif + return cur_job_info; +} + +static AOM_INLINE void tile_worker_hook_init( + AV1Decoder *const pbi, DecWorkerData *const thread_data, + const TileBufferDec *const tile_buffer, TileDataDec *const tile_data, + uint8_t allow_update_cdf) { + AV1_COMMON *cm = &pbi->common; + ThreadData *const td = thread_data->td; + int tile_row = tile_data->tile_info.tile_row; + int tile_col = tile_data->tile_info.tile_col; + + td->bit_reader = &tile_data->bit_reader; + av1_zero(td->cb_buffer_base.dqcoeff); + av1_tile_init(&td->xd.tile, cm, tile_row, tile_col); + td->xd.current_qindex = cm->quant_params.base_qindex; + setup_bool_decoder(tile_buffer->data, thread_data->data_end, + tile_buffer->size, &thread_data->error_info, + td->bit_reader, allow_update_cdf); +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + td->bit_reader->accounting = &pbi->accounting; + td->bit_reader->accounting->last_tell_frac = + aom_reader_tell_frac(td->bit_reader); + } else { + td->bit_reader->accounting = NULL; + } +#endif + av1_init_macroblockd(cm, &td->xd, NULL); + td->xd.error_info = &thread_data->error_info; + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, + &td->xd); + + // Initialise the tile context from the frame context + tile_data->tctx = *cm->fc; + td->xd.tile_ctx = &tile_data->tctx; +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + tile_data->bit_reader.accounting->last_tell_frac = + 
aom_reader_tell_frac(&tile_data->bit_reader); + } +#endif +} + +static int tile_worker_hook(void *arg1, void *arg2) { + DecWorkerData *const thread_data = (DecWorkerData *)arg1; + AV1Decoder *const pbi = (AV1Decoder *)arg2; + AV1_COMMON *cm = &pbi->common; + ThreadData *const td = thread_data->td; + uint8_t allow_update_cdf; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(thread_data->error_info.jmp)) { + thread_data->error_info.setjmp = 0; + thread_data->td->xd.corrupted = 1; + return 0; + } + thread_data->error_info.setjmp = 1; + + allow_update_cdf = cm->tiles.large_scale ? 0 : 1; + allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update; + + set_decode_func_pointers(td, 0x3); + + assert(cm->tiles.cols > 0); + while (!td->xd.corrupted) { + TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info); + + if (cur_job_info != NULL) { + const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer; + TileDataDec *const tile_data = cur_job_info->tile_data; + tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data, + allow_update_cdf); + // decode tile + int tile_row = tile_data->tile_info.tile_row; + int tile_col = tile_data->tile_info.tile_col; + decode_tile(pbi, td, tile_row, tile_col); + } else { + break; + } + } + thread_data->error_info.setjmp = 0; + return !td->xd.corrupted; +} + +static INLINE int get_max_row_mt_workers_per_tile(AV1_COMMON *cm, + TileInfo tile) { + // NOTE: Currently the value of max workers is calculated based + // on the parse and decode time. As per the theoretical estimate, + // when the percentage of parse time is equal to the percentage of decode + // time, the number of workers needed to parse + decode a tile cannot + // exceed 2. + // TODO(any): Modify this value if parsing is optimized in future. + int sb_rows = av1_get_sb_rows_in_tile(cm, tile); + int max_workers = + sb_rows == 1 ? AOM_MIN_THREADS_PER_TILE : AOM_MAX_THREADS_PER_TILE; + return max_workers; +} + +// The caller must hold pbi->row_mt_mutex_ when calling this function. +// Returns 1 if either the next job is stored in *next_job_info or 1 is stored +// in *end_of_frame. +// NOTE: The caller waits on pbi->row_mt_cond_ if this function returns 0. +// The return value of this function depends on the following variables: +// - frame_row_mt_info->mi_rows_parse_done +// - frame_row_mt_info->mi_rows_decode_started +// - frame_row_mt_info->row_mt_exit +// Therefore we may need to signal or broadcast pbi->row_mt_cond_ if any of +// these variables is modified.
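+// For illustration (hypothetical counters): if mi_rows_parse_done is 64
+// while mi_rows_decode_started is 32, one superblock row's worth of decode
+// work can be handed out and mi_rows_decode_started then advances by
+// sb_mi_size; once the two counters are equal, 0 is returned and the caller
+// blocks on pbi->row_mt_cond_ until signal_parse_sb_row_done() signals it.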
+static int get_next_job_info(AV1Decoder *const pbi, + AV1DecRowMTJobInfo *next_job_info, + int *end_of_frame) { + AV1_COMMON *cm = &pbi->common; + TileDataDec *tile_data; + AV1DecRowMTSync *dec_row_mt_sync; + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; + TileInfo tile_info; + const int tile_rows_start = frame_row_mt_info->tile_rows_start; + const int tile_rows_end = frame_row_mt_info->tile_rows_end; + const int tile_cols_start = frame_row_mt_info->tile_cols_start; + const int tile_cols_end = frame_row_mt_info->tile_cols_end; + const int start_tile = frame_row_mt_info->start_tile; + const int end_tile = frame_row_mt_info->end_tile; + const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; + int num_mis_to_decode, num_threads_working; + int num_mis_waiting_for_decode; + int min_threads_working = INT_MAX; + int max_mis_to_decode = 0; + int tile_row_idx, tile_col_idx; + int tile_row = -1; + int tile_col = -1; + + memset(next_job_info, 0, sizeof(*next_job_info)); + + // Frame decode is completed or error is encountered. + *end_of_frame = (frame_row_mt_info->mi_rows_decode_started == + frame_row_mt_info->mi_rows_to_decode) || + (frame_row_mt_info->row_mt_exit == 1); + if (*end_of_frame) { + return 1; + } + + // Decoding cannot start as bit-stream parsing is not complete. + assert(frame_row_mt_info->mi_rows_parse_done >= + frame_row_mt_info->mi_rows_decode_started); + if (frame_row_mt_info->mi_rows_parse_done == + frame_row_mt_info->mi_rows_decode_started) + return 0; + + // Choose the tile to decode. + for (tile_row_idx = tile_rows_start; tile_row_idx < tile_rows_end; + ++tile_row_idx) { + for (tile_col_idx = tile_cols_start; tile_col_idx < tile_cols_end; + ++tile_col_idx) { + if (tile_row_idx * cm->tiles.cols + tile_col_idx < start_tile || + tile_row_idx * cm->tiles.cols + tile_col_idx > end_tile) + continue; + + tile_data = pbi->tile_data + tile_row_idx * cm->tiles.cols + tile_col_idx; + dec_row_mt_sync = &tile_data->dec_row_mt_sync; + + num_threads_working = dec_row_mt_sync->num_threads_working; + num_mis_waiting_for_decode = (dec_row_mt_sync->mi_rows_parse_done - + dec_row_mt_sync->mi_rows_decode_started) * + dec_row_mt_sync->mi_cols; + num_mis_to_decode = + (dec_row_mt_sync->mi_rows - dec_row_mt_sync->mi_rows_decode_started) * + dec_row_mt_sync->mi_cols; + + assert(num_mis_to_decode >= num_mis_waiting_for_decode); + + // Pick the tile which has minimum number of threads working on it. 
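+      // For illustration (hypothetical tiles): if tile 0 already has one
+      // thread working while tile 1 has none, tile 1 is preferred; among
+      // tiles with equally few threads, the one with more MIs left to
+      // decode wins, and a tile is skipped once it already holds
+      // get_max_row_mt_workers_per_tile() threads.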
+ if (num_mis_waiting_for_decode > 0) { + if (num_threads_working < min_threads_working) { + min_threads_working = num_threads_working; + max_mis_to_decode = 0; + } + if (num_threads_working == min_threads_working && + num_mis_to_decode > max_mis_to_decode && + num_threads_working < + get_max_row_mt_workers_per_tile(cm, tile_data->tile_info)) { + max_mis_to_decode = num_mis_to_decode; + tile_row = tile_row_idx; + tile_col = tile_col_idx; + } + } + } + } + // No job found to process + if (tile_row == -1 || tile_col == -1) return 0; + + tile_data = pbi->tile_data + tile_row * cm->tiles.cols + tile_col; + tile_info = tile_data->tile_info; + dec_row_mt_sync = &tile_data->dec_row_mt_sync; + + next_job_info->tile_row = tile_row; + next_job_info->tile_col = tile_col; + next_job_info->mi_row = + dec_row_mt_sync->mi_rows_decode_started + tile_info.mi_row_start; + + dec_row_mt_sync->num_threads_working++; + dec_row_mt_sync->mi_rows_decode_started += sb_mi_size; + frame_row_mt_info->mi_rows_decode_started += sb_mi_size; + assert(frame_row_mt_info->mi_rows_parse_done >= + frame_row_mt_info->mi_rows_decode_started); +#if CONFIG_MULTITHREAD + if (frame_row_mt_info->mi_rows_decode_started == + frame_row_mt_info->mi_rows_to_decode) { + pthread_cond_broadcast(pbi->row_mt_cond_); + } +#endif + + return 1; +} + +static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi, + TileDataDec *const tile_data, + const int sb_mi_size) { + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + assert(frame_row_mt_info->mi_rows_parse_done >= + frame_row_mt_info->mi_rows_decode_started); + tile_data->dec_row_mt_sync.mi_rows_parse_done += sb_mi_size; + frame_row_mt_info->mi_rows_parse_done += sb_mi_size; +#if CONFIG_MULTITHREAD + // A new decode job is available. Wake up one worker thread to handle the + // new decode job. + // NOTE: This assumes we bump mi_rows_parse_done and mi_rows_decode_started + // by the same increment (sb_mi_size). + pthread_cond_signal(pbi->row_mt_cond_); + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif +} + +// This function is very similar to decode_tile(). It would be good to figure +// out how to share code. +static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td, + TileDataDec *const tile_data) { + AV1_COMMON *const cm = &pbi->common; + const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size]; + const int num_planes = av1_num_planes(cm); + TileInfo tile_info = tile_data->tile_info; + int tile_row = tile_info.tile_row; + + av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start, + tile_info.mi_col_end, tile_row); + av1_reset_loop_filter_delta(&td->xd, num_planes); + av1_reset_loop_restoration(&td->xd, num_planes); + + for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; + mi_row += cm->seq_params.mib_size) { + av1_zero_left_context(&td->xd); + + for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->seq_params.mib_size) { + set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row, + mi_col); + + // Bit-stream parsing of the superblock + decode_partition(pbi, td, mi_row, mi_col, td->bit_reader, + cm->seq_params.sb_size, 0x1); + + if (aom_reader_has_overflowed(td->bit_reader)) { + aom_merge_corrupted_flag(&td->xd.corrupted, 1); + return; + } + } + signal_parse_sb_row_done(pbi, tile_data, sb_mi_size); + } + + int corrupted = + (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 
1 : 0; + aom_merge_corrupted_flag(&td->xd.corrupted, corrupted); +} + +static int row_mt_worker_hook(void *arg1, void *arg2) { + DecWorkerData *const thread_data = (DecWorkerData *)arg1; + AV1Decoder *const pbi = (AV1Decoder *)arg2; + AV1_COMMON *cm = &pbi->common; + ThreadData *const td = thread_data->td; + uint8_t allow_update_cdf; + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; + td->xd.corrupted = 0; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(thread_data->error_info.jmp)) { + thread_data->error_info.setjmp = 0; + thread_data->td->xd.corrupted = 1; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + frame_row_mt_info->row_mt_exit = 1; +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(pbi->row_mt_cond_); + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + return 0; + } + thread_data->error_info.setjmp = 1; + + allow_update_cdf = cm->tiles.large_scale ? 0 : 1; + allow_update_cdf = allow_update_cdf && !cm->features.disable_cdf_update; + + set_decode_func_pointers(td, 0x1); + + assert(cm->tiles.cols > 0); + while (!td->xd.corrupted) { + TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info); + + if (cur_job_info != NULL) { + const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer; + TileDataDec *const tile_data = cur_job_info->tile_data; + tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data, + allow_update_cdf); +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + tile_data->dec_row_mt_sync.num_threads_working++; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + // decode tile + parse_tile_row_mt(pbi, td, tile_data); +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + tile_data->dec_row_mt_sync.num_threads_working--; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + } else { + break; + } + } + + if (td->xd.corrupted) { + thread_data->error_info.setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + frame_row_mt_info->row_mt_exit = 1; +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(pbi->row_mt_cond_); + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + return 0; + } + + set_decode_func_pointers(td, 0x2); + + while (1) { + AV1DecRowMTJobInfo next_job_info; + int end_of_frame = 0; + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + while (!get_next_job_info(pbi, &next_job_info, &end_of_frame)) { +#if CONFIG_MULTITHREAD + pthread_cond_wait(pbi->row_mt_cond_, pbi->row_mt_mutex_); +#endif + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + + if (end_of_frame) break; + + int tile_row = next_job_info.tile_row; + int tile_col = next_job_info.tile_col; + int mi_row = next_job_info.mi_row; + + TileDataDec *tile_data = + pbi->tile_data + tile_row * cm->tiles.cols + tile_col; + AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync; + TileInfo tile_info = tile_data->tile_info; + + av1_tile_init(&td->xd.tile, cm, tile_row, tile_col); + av1_init_macroblockd(cm, &td->xd, NULL); + td->xd.error_info = &thread_data->error_info; + + decode_tile_sb_row(pbi, td, tile_info, mi_row); + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + dec_row_mt_sync->num_threads_working--; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif + } + 
thread_data->error_info.setjmp = 0; + return !td->xd.corrupted; +} + +// sorts in descending order +static int compare_tile_buffers(const void *a, const void *b) { + const TileJobsDec *const buf1 = (const TileJobsDec *)a; + const TileJobsDec *const buf2 = (const TileJobsDec *)b; + return (((int)buf2->tile_buffer->size) - ((int)buf1->tile_buffer->size)); +} + +static AOM_INLINE void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm, + int tile_rows_start, int tile_rows_end, + int tile_cols_start, int tile_cols_end, + int start_tile, int end_tile) { + AV1DecTileMT *tile_mt_info = &pbi->tile_mt_info; + TileJobsDec *tile_job_queue = tile_mt_info->job_queue; + tile_mt_info->jobs_enqueued = 0; + tile_mt_info->jobs_dequeued = 0; + + for (int row = tile_rows_start; row < tile_rows_end; row++) { + for (int col = tile_cols_start; col < tile_cols_end; col++) { + if (row * cm->tiles.cols + col < start_tile || + row * cm->tiles.cols + col > end_tile) + continue; + tile_job_queue->tile_buffer = &pbi->tile_buffers[row][col]; + tile_job_queue->tile_data = pbi->tile_data + row * cm->tiles.cols + col; + tile_job_queue++; + tile_mt_info->jobs_enqueued++; + } + } +} + +static AOM_INLINE void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, + AV1_COMMON *cm, int tile_rows, + int tile_cols) { + tile_mt_info->alloc_tile_rows = tile_rows; + tile_mt_info->alloc_tile_cols = tile_cols; + int num_tiles = tile_rows * tile_cols; +#if CONFIG_MULTITHREAD + { + CHECK_MEM_ERROR(cm, tile_mt_info->job_mutex, + aom_malloc(sizeof(*tile_mt_info->job_mutex) * num_tiles)); + + for (int i = 0; i < num_tiles; i++) { + pthread_mutex_init(&tile_mt_info->job_mutex[i], NULL); + } + } +#endif + CHECK_MEM_ERROR(cm, tile_mt_info->job_queue, + aom_malloc(sizeof(*tile_mt_info->job_queue) * num_tiles)); +} + +void av1_free_mc_tmp_buf(ThreadData *thread_data) { + int ref; + for (ref = 0; ref < 2; ref++) { + if (thread_data->mc_buf_use_highbd) + aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[ref])); + else + aom_free(thread_data->mc_buf[ref]); + thread_data->mc_buf[ref] = NULL; + } + thread_data->mc_buf_size = 0; + thread_data->mc_buf_use_highbd = 0; + + aom_free(thread_data->tmp_conv_dst); + thread_data->tmp_conv_dst = NULL; + for (int i = 0; i < 2; ++i) { + aom_free(thread_data->tmp_obmc_bufs[i]); + thread_data->tmp_obmc_bufs[i] = NULL; + } +} + +static AOM_INLINE void allocate_mc_tmp_buf(AV1_COMMON *const cm, + ThreadData *thread_data, + int buf_size, int use_highbd) { + for (int ref = 0; ref < 2; ref++) { + if (use_highbd) { + uint16_t *hbd_mc_buf; + CHECK_MEM_ERROR(cm, hbd_mc_buf, (uint16_t *)aom_memalign(16, buf_size)); + thread_data->mc_buf[ref] = CONVERT_TO_BYTEPTR(hbd_mc_buf); + } else { + CHECK_MEM_ERROR(cm, thread_data->mc_buf[ref], + (uint8_t *)aom_memalign(16, buf_size)); + } + } + thread_data->mc_buf_size = buf_size; + thread_data->mc_buf_use_highbd = use_highbd; + + CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*thread_data->tmp_conv_dst))); + for (int i = 0; i < 2; ++i) { + CHECK_MEM_ERROR( + cm, thread_data->tmp_obmc_bufs[i], + aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->tmp_obmc_bufs[i]))); + } +} + +static AOM_INLINE void reset_dec_workers(AV1Decoder *pbi, + AVxWorkerHook worker_hook, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + // Reset tile decoding hook + for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + 
DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + thread_data->td->xd = pbi->mb; + thread_data->td->xd.corrupted = 0; + thread_data->td->xd.mc_buf[0] = thread_data->td->mc_buf[0]; + thread_data->td->xd.mc_buf[1] = thread_data->td->mc_buf[1]; + thread_data->td->xd.tmp_conv_dst = thread_data->td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->xd.tmp_obmc_bufs[j] = thread_data->td->tmp_obmc_bufs[j]; + } + winterface->sync(worker); + + worker->hook = worker_hook; + worker->data1 = thread_data; + worker->data2 = pbi; + } +#if CONFIG_ACCOUNTING + if (pbi->acct_enabled) { + aom_accounting_reset(&pbi->accounting); + } +#endif +} + +static AOM_INLINE void launch_dec_workers(AV1Decoder *pbi, + const uint8_t *data_end, + int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + + for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = (DecWorkerData *)worker->data1; + + thread_data->data_end = data_end; + + worker->had_error = 0; + if (worker_idx == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } +} + +static AOM_INLINE void sync_dec_workers(AV1Decoder *pbi, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int corrupted = 0; + + for (int worker_idx = num_workers; worker_idx > 0; --worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx - 1]; + aom_merge_corrupted_flag(&corrupted, !winterface->sync(worker)); + } + + pbi->mb.corrupted = corrupted; +} + +static AOM_INLINE void decode_mt_init(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int worker_idx; + + // Create workers and thread_data + if (pbi->num_workers == 0) { + const int num_threads = pbi->max_threads; + CHECK_MEM_ERROR(cm, pbi->tile_workers, + aom_malloc(num_threads * sizeof(*pbi->tile_workers))); + CHECK_MEM_ERROR(cm, pbi->thread_data, + aom_malloc(num_threads * sizeof(*pbi->thread_data))); + + for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) { + AVxWorker *const worker = &pbi->tile_workers[worker_idx]; + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + ++pbi->num_workers; + + winterface->init(worker); + worker->thread_name = "aom tile worker"; + if (worker_idx < num_threads - 1 && !winterface->reset(worker)) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Tile decoder thread creation failed"); + } + + if (worker_idx < num_threads - 1) { + // Allocate thread data. 
+ CHECK_MEM_ERROR(cm, thread_data->td, + aom_memalign(32, sizeof(*thread_data->td))); + av1_zero(*thread_data->td); + } else { + // Main thread acts as a worker and uses the thread data in pbi + thread_data->td = &pbi->td; + } + thread_data->error_info.error_code = AOM_CODEC_OK; + thread_data->error_info.setjmp = 0; + } + } + const int use_highbd = cm->seq_params.use_highbitdepth; + const int buf_size = MC_TEMP_BUF_PELS << use_highbd; + for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) { + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + if (thread_data->td->mc_buf_size != buf_size) { + av1_free_mc_tmp_buf(thread_data->td); + allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd); + } + } +} + +static AOM_INLINE void tile_mt_queue(AV1Decoder *pbi, int tile_cols, + int tile_rows, int tile_rows_start, + int tile_rows_end, int tile_cols_start, + int tile_cols_end, int start_tile, + int end_tile) { + AV1_COMMON *const cm = &pbi->common; + if (pbi->tile_mt_info.alloc_tile_cols != tile_cols || + pbi->tile_mt_info.alloc_tile_rows != tile_rows) { + av1_dealloc_dec_jobs(&pbi->tile_mt_info); + alloc_dec_jobs(&pbi->tile_mt_info, cm, tile_rows, tile_cols); + } + enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start, + tile_cols_end, start_tile, end_tile); + qsort(pbi->tile_mt_info.job_queue, pbi->tile_mt_info.jobs_enqueued, + sizeof(pbi->tile_mt_info.job_queue[0]), compare_tile_buffers); +} + +static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, int start_tile, + int end_tile) { + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + const int n_tiles = tile_cols * tile_rows; + TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int tile_count_tg; + int num_workers; + const uint8_t *raw_data_end = NULL; + + if (tiles->large_scale) { + tile_rows_start = single_row ? dec_tile_row : 0; + tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; + tile_cols_start = single_col ? dec_tile_col : 0; + tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; + } else { + tile_rows_start = 0; + tile_rows_end = tile_rows; + tile_cols_start = 0; + tile_cols_end = tile_cols; + } + tile_count_tg = end_tile - start_tile + 1; + num_workers = AOMMIN(pbi->max_threads, tile_count_tg); + + // No tiles to decode. + if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || + // First tile is larger than end_tile. + tile_rows_start * tile_cols + tile_cols_start > end_tile || + // Last tile is smaller than start_tile. 
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile) + return data; + + assert(tile_rows <= MAX_TILE_ROWS); + assert(tile_cols <= MAX_TILE_COLS); + assert(tile_count_tg > 0); + assert(num_workers > 0); + assert(start_tile <= end_tile); + assert(start_tile >= 0 && end_tile < n_tiles); + + decode_mt_init(pbi); + + // get tile size in tile group +#if EXT_TILE_DEBUG + if (tiles->large_scale) assert(pbi->ext_tile_debug == 1); + if (tiles->large_scale) + raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); + else +#endif // EXT_TILE_DEBUG + get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); + + if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { + decoder_alloc_tile_data(pbi, n_tiles); + } + + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col; + av1_tile_init(&tile_data->tile_info, cm, row, col); + } + } + + tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end, + tile_cols_start, tile_cols_end, start_tile, end_tile); + + reset_dec_workers(pbi, tile_worker_hook, num_workers); + launch_dec_workers(pbi, data_end, num_workers); + sync_dec_workers(pbi, num_workers); + + if (pbi->mb.corrupted) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (tiles->large_scale) { + if (n_tiles == 1) { + // Find the end of the single tile buffer + return aom_reader_find_end(&pbi->tile_data->bit_reader); + } + // Return the end of the last tile buffer + return raw_data_end; + } + TileDataDec *const tile_data = pbi->tile_data + end_tile; + + return aom_reader_find_end(&tile_data->bit_reader); +} + +static AOM_INLINE void dec_alloc_cb_buf(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) * + ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1); + + if (pbi->cb_buffer_alloc_size < size) { + av1_dec_free_cb_buf(pbi); + CHECK_MEM_ERROR(cm, pbi->cb_buffer_base, + aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size)); + memset(pbi->cb_buffer_base, 0, sizeof(*pbi->cb_buffer_base) * size); + pbi->cb_buffer_alloc_size = size; + } +} + +static AOM_INLINE void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start, + int tile_rows_end, int tile_cols_start, + int tile_cols_end, int start_tile, + int end_tile, int max_sb_rows) { + AV1_COMMON *const cm = &pbi->common; + AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info; + + frame_row_mt_info->tile_rows_start = tile_rows_start; + frame_row_mt_info->tile_rows_end = tile_rows_end; + frame_row_mt_info->tile_cols_start = tile_cols_start; + frame_row_mt_info->tile_cols_end = tile_cols_end; + frame_row_mt_info->start_tile = start_tile; + frame_row_mt_info->end_tile = end_tile; + frame_row_mt_info->mi_rows_to_decode = 0; + frame_row_mt_info->mi_rows_parse_done = 0; + frame_row_mt_info->mi_rows_decode_started = 0; + frame_row_mt_info->row_mt_exit = 0; + + for (int tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) { + for (int tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) { + if (tile_row * cm->tiles.cols + tile_col < start_tile || + tile_row * cm->tiles.cols + tile_col > end_tile) + continue; + + TileDataDec *const tile_data = + pbi->tile_data + tile_row * cm->tiles.cols + tile_col; + TileInfo tile_info = tile_data->tile_info; + + tile_data->dec_row_mt_sync.mi_rows_parse_done = 0; + 
tile_data->dec_row_mt_sync.mi_rows_decode_started = 0; + tile_data->dec_row_mt_sync.num_threads_working = 0; + tile_data->dec_row_mt_sync.mi_rows = + ALIGN_POWER_OF_TWO(tile_info.mi_row_end - tile_info.mi_row_start, + cm->seq_params.mib_size_log2); + tile_data->dec_row_mt_sync.mi_cols = + ALIGN_POWER_OF_TWO(tile_info.mi_col_end - tile_info.mi_col_start, + cm->seq_params.mib_size_log2); + + frame_row_mt_info->mi_rows_to_decode += + tile_data->dec_row_mt_sync.mi_rows; + + // Initialize cur_sb_col to -1 for all SB rows. + memset(tile_data->dec_row_mt_sync.cur_sb_col, -1, + sizeof(*tile_data->dec_row_mt_sync.cur_sb_col) * max_sb_rows); + } + } + +#if CONFIG_MULTITHREAD + if (pbi->row_mt_mutex_ == NULL) { + CHECK_MEM_ERROR(cm, pbi->row_mt_mutex_, + aom_malloc(sizeof(*(pbi->row_mt_mutex_)))); + if (pbi->row_mt_mutex_) { + pthread_mutex_init(pbi->row_mt_mutex_, NULL); + } + } + + if (pbi->row_mt_cond_ == NULL) { + CHECK_MEM_ERROR(cm, pbi->row_mt_cond_, + aom_malloc(sizeof(*(pbi->row_mt_cond_)))); + if (pbi->row_mt_cond_) { + pthread_cond_init(pbi->row_mt_cond_, NULL); + } + } +#endif +} + +static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + int start_tile, int end_tile) { + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + const int n_tiles = tile_cols * tile_rows; + TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers; + const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows); + const int single_row = pbi->dec_tile_row >= 0; + const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols); + const int single_col = pbi->dec_tile_col >= 0; + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int tile_count_tg; + int num_workers = 0; + int max_threads; + const uint8_t *raw_data_end = NULL; + int max_sb_rows = 0; + + if (tiles->large_scale) { + tile_rows_start = single_row ? dec_tile_row : 0; + tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows; + tile_cols_start = single_col ? dec_tile_col : 0; + tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols; + } else { + tile_rows_start = 0; + tile_rows_end = tile_rows; + tile_cols_start = 0; + tile_cols_end = tile_cols; + } + tile_count_tg = end_tile - start_tile + 1; + max_threads = pbi->max_threads; + + // No tiles to decode. + if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || + // First tile is larger than end_tile. + tile_rows_start * tile_cols + tile_cols_start > end_tile || + // Last tile is smaller than start_tile. 
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile) + return data; + + assert(tile_rows <= MAX_TILE_ROWS); + assert(tile_cols <= MAX_TILE_COLS); + assert(tile_count_tg > 0); + assert(max_threads > 0); + assert(start_tile <= end_tile); + assert(start_tile >= 0 && end_tile < n_tiles); + + (void)tile_count_tg; + + decode_mt_init(pbi); + + // get tile size in tile group +#if EXT_TILE_DEBUG + if (tiles->large_scale) assert(pbi->ext_tile_debug == 1); + if (tiles->large_scale) + raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers); + else +#endif // EXT_TILE_DEBUG + get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile); + + if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) { + if (pbi->tile_data != NULL) { + for (int i = 0; i < pbi->allocated_tiles; i++) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); + } + } + decoder_alloc_tile_data(pbi, n_tiles); + } + + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + TileDataDec *tile_data = pbi->tile_data + row * tiles->cols + col; + av1_tile_init(&tile_data->tile_info, cm, row, col); + + max_sb_rows = AOMMAX(max_sb_rows, + av1_get_sb_rows_in_tile(cm, tile_data->tile_info)); + num_workers += get_max_row_mt_workers_per_tile(cm, tile_data->tile_info); + } + } + num_workers = AOMMIN(num_workers, max_threads); + + if (pbi->allocated_row_mt_sync_rows != max_sb_rows) { + for (int i = 0; i < n_tiles; ++i) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); + dec_row_mt_alloc(&tile_data->dec_row_mt_sync, cm, max_sb_rows); + } + pbi->allocated_row_mt_sync_rows = max_sb_rows; + } + + tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end, + tile_cols_start, tile_cols_end, start_tile, end_tile); + + dec_alloc_cb_buf(pbi); + + row_mt_frame_init(pbi, tile_rows_start, tile_rows_end, tile_cols_start, + tile_cols_end, start_tile, end_tile, max_sb_rows); + + reset_dec_workers(pbi, row_mt_worker_hook, num_workers); + launch_dec_workers(pbi, data_end, num_workers); + sync_dec_workers(pbi, num_workers); + + if (pbi->mb.corrupted) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (tiles->large_scale) { + if (n_tiles == 1) { + // Find the end of the single tile buffer + return aom_reader_find_end(&pbi->tile_data->bit_reader); + } + // Return the end of the last tile buffer + return raw_data_end; + } + TileDataDec *const tile_data = pbi->tile_data + end_tile; + + return aom_reader_find_end(&tile_data->bit_reader); +} + +static AOM_INLINE void error_handler(void *data) { + AV1_COMMON *const cm = (AV1_COMMON *)data; + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet"); +} + +// Reads the high_bitdepth and twelve_bit fields in color_config() and sets +// seq_params->bit_depth based on the values of those fields and +// seq_params->profile. Reports errors by calling rb->error_handler() or +// aom_internal_error(). +static AOM_INLINE void read_bitdepth( + struct aom_read_bit_buffer *rb, SequenceHeader *seq_params, + struct aom_internal_error_info *error_info) { + const int high_bitdepth = aom_rb_read_bit(rb); + if (seq_params->profile == PROFILE_2 && high_bitdepth) { + const int twelve_bit = aom_rb_read_bit(rb); + seq_params->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10; + } else if (seq_params->profile <= PROFILE_2) { + seq_params->bit_depth = high_bitdepth ? 
AOM_BITS_10 : AOM_BITS_8; + } else { + aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Unsupported profile/bit-depth combination"); + } +#if !CONFIG_AV1_HIGHBITDEPTH + if (seq_params->bit_depth > AOM_BITS_8) { + aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Bit-depth %d not supported", seq_params->bit_depth); + } +#endif +} + +void av1_read_film_grain_params(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + aom_film_grain_t *pars = &cm->film_grain_params; + const SequenceHeader *const seq_params = &cm->seq_params; + + pars->apply_grain = aom_rb_read_bit(rb); + if (!pars->apply_grain) { + memset(pars, 0, sizeof(*pars)); + return; + } + + pars->random_seed = aom_rb_read_literal(rb, 16); + if (cm->current_frame.frame_type == INTER_FRAME) + pars->update_parameters = aom_rb_read_bit(rb); + else + pars->update_parameters = 1; + + pars->bit_depth = seq_params->bit_depth; + + if (!pars->update_parameters) { + // inherit parameters from a previous reference frame + int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3); + // Section 6.8.20: It is a requirement of bitstream conformance that + // film_grain_params_ref_idx is equal to ref_frame_idx[ j ] for some value + // of j in the range 0 to REFS_PER_FRAME - 1. + int found = 0; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + if (film_grain_params_ref_idx == cm->remapped_ref_idx[i]) { + found = 1; + break; + } + } + if (!found) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Invalid film grain reference idx %d. ref_frame_idx = " + "{%d, %d, %d, %d, %d, %d, %d}", + film_grain_params_ref_idx, cm->remapped_ref_idx[0], + cm->remapped_ref_idx[1], cm->remapped_ref_idx[2], + cm->remapped_ref_idx[3], cm->remapped_ref_idx[4], + cm->remapped_ref_idx[5], cm->remapped_ref_idx[6]); + } + RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx]; + if (buf == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Invalid Film grain reference idx"); + } + if (!buf->film_grain_params_present) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Film grain reference parameters not available"); + } + uint16_t random_seed = pars->random_seed; + *pars = buf->film_grain_params; // inherit parameters + pars->random_seed = random_seed; // with new random seed + return; + } + + // Scaling functions parameters + pars->num_y_points = aom_rb_read_literal(rb, 4); // max 14 + if (pars->num_y_points > 14) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Number of points for film grain luma scaling function " + "exceeds the maximum value."); + for (int i = 0; i < pars->num_y_points; i++) { + pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8); + if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0]) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "First coordinate of the scaling function points " + "shall be increasing."); + pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8); + } + + if (!seq_params->monochrome) + pars->chroma_scaling_from_luma = aom_rb_read_bit(rb); + else + pars->chroma_scaling_from_luma = 0; + + if (seq_params->monochrome || pars->chroma_scaling_from_luma || + ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && + (pars->num_y_points == 0))) { + pars->num_cb_points = 0; + pars->num_cr_points = 0; + } else { + pars->num_cb_points = aom_rb_read_literal(rb, 4); // max 10 + if (pars->num_cb_points > 10) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Number of points for film grain cb scaling function " + "exceeds the maximum value."); + for (int i = 0; i < pars->num_cb_points; i++) { + pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8); + if (i && + pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0]) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "First coordinate of the scaling function points " + "shall be increasing."); + pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8); + } + + pars->num_cr_points = aom_rb_read_literal(rb, 4); // max 10 + if (pars->num_cr_points > 10) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Number of points for film grain cr scaling function " + "exceeds the maximum value."); + for (int i = 0; i < pars->num_cr_points; i++) { + pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8); + if (i && + pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0]) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "First coordinate of the scaling function points " + "shall be increasing."); + pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8); + } + + if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) && + (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) || + ((pars->num_cb_points != 0) && (pars->num_cr_points == 0)))) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "In YCbCr 4:2:0, film grain shall be applied " + "to both chroma components or neither."); + } + + pars->scaling_shift = aom_rb_read_literal(rb, 2) + 8; // 8 + value + + // AR coefficients + // Only sent if the corresponding scaling function has + // more than 0 points + + pars->ar_coeff_lag = aom_rb_read_literal(rb, 2); + + int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (pars->num_y_points > 0) ++num_pos_chroma; + + if (pars->num_y_points) + for (int i = 0; i < num_pos_luma; i++) + pars->ar_coeffs_y[i] = aom_rb_read_literal(rb, 8) - 128; + + if (pars->num_cb_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + pars->ar_coeffs_cb[i] = aom_rb_read_literal(rb, 8) - 128; + + if (pars->num_cr_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + pars->ar_coeffs_cr[i] = aom_rb_read_literal(rb, 8) - 128; + + pars->ar_coeff_shift = aom_rb_read_literal(rb, 2) + 6; // 6 + value + + pars->grain_scale_shift = aom_rb_read_literal(rb, 2); + + if (pars->num_cb_points) { + pars->cb_mult = aom_rb_read_literal(rb, 8); + pars->cb_luma_mult = aom_rb_read_literal(rb, 8); + pars->cb_offset = aom_rb_read_literal(rb, 9); + } + + if (pars->num_cr_points) { + pars->cr_mult = aom_rb_read_literal(rb, 8); + pars->cr_luma_mult = aom_rb_read_literal(rb, 8); + pars->cr_offset = aom_rb_read_literal(rb, 9); + } + + pars->overlap_flag = aom_rb_read_bit(rb); + + pars->clip_to_restricted_range = aom_rb_read_bit(rb); +} + +static AOM_INLINE void read_film_grain(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + if (cm->seq_params.film_grain_params_present && + (cm->show_frame || cm->showable_frame)) { + av1_read_film_grain_params(cm, rb); + } else { + memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); + } + cm->film_grain_params.bit_depth = cm->seq_params.bit_depth; + memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params, + sizeof(aom_film_grain_t)); +} + +void av1_read_color_config(struct aom_read_bit_buffer *rb, + int allow_lowbitdepth, SequenceHeader *seq_params, + struct aom_internal_error_info *error_info) { +
read_bitdepth(rb, seq_params, error_info); + + seq_params->use_highbitdepth = + seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth; + // monochrome bit (not needed for PROFILE_1) + const int is_monochrome = + seq_params->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0; + seq_params->monochrome = is_monochrome; + int color_description_present_flag = aom_rb_read_bit(rb); + if (color_description_present_flag) { + seq_params->color_primaries = aom_rb_read_literal(rb, 8); + seq_params->transfer_characteristics = aom_rb_read_literal(rb, 8); + seq_params->matrix_coefficients = aom_rb_read_literal(rb, 8); + } else { + seq_params->color_primaries = AOM_CICP_CP_UNSPECIFIED; + seq_params->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED; + seq_params->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED; + } + if (is_monochrome) { + // [16,235] (including xvycc) vs [0,255] range + seq_params->color_range = aom_rb_read_bit(rb); + seq_params->subsampling_y = seq_params->subsampling_x = 1; + seq_params->chroma_sample_position = AOM_CSP_UNKNOWN; + seq_params->separate_uv_delta_q = 0; + return; + } + if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + seq_params->subsampling_y = seq_params->subsampling_x = 0; + seq_params->color_range = 1; // assume full color-range + if (!(seq_params->profile == PROFILE_1 || + (seq_params->profile == PROFILE_2 && + seq_params->bit_depth == AOM_BITS_12))) { + aom_internal_error( + error_info, AOM_CODEC_UNSUP_BITSTREAM, + "sRGB colorspace not compatible with specified profile"); + } + } else { + // [16,235] (including xvycc) vs [0,255] range + seq_params->color_range = aom_rb_read_bit(rb); + if (seq_params->profile == PROFILE_0) { + // 420 only + seq_params->subsampling_x = seq_params->subsampling_y = 1; + } else if (seq_params->profile == PROFILE_1) { + // 444 only + seq_params->subsampling_x = seq_params->subsampling_y = 0; + } else { + assert(seq_params->profile == PROFILE_2); + if (seq_params->bit_depth == AOM_BITS_12) { + seq_params->subsampling_x = aom_rb_read_bit(rb); + if (seq_params->subsampling_x) + seq_params->subsampling_y = aom_rb_read_bit(rb); // 422 or 420 + else + seq_params->subsampling_y = 0; // 444 + } else { + // 422 + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 0; + } + } + if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY && + (seq_params->subsampling_x || seq_params->subsampling_y)) { + aom_internal_error( + error_info, AOM_CODEC_UNSUP_BITSTREAM, + "Identity CICP Matrix incompatible with non 4:4:4 color sampling"); + } + if (seq_params->subsampling_x && seq_params->subsampling_y) { + seq_params->chroma_sample_position = aom_rb_read_literal(rb, 2); + } + } + seq_params->separate_uv_delta_q = aom_rb_read_bit(rb); +} + +void av1_read_timing_info_header(aom_timing_info_t *timing_info, + struct aom_internal_error_info *error, + struct aom_read_bit_buffer *rb) { + timing_info->num_units_in_display_tick = + aom_rb_read_unsigned_literal(rb, + 32); // Number of units in a display tick + timing_info->time_scale = aom_rb_read_unsigned_literal(rb, 32); // Time scale + if (timing_info->num_units_in_display_tick == 0 || + timing_info->time_scale == 0) { + aom_internal_error( + error, AOM_CODEC_UNSUP_BITSTREAM, + "num_units_in_display_tick and time_scale must be greater than 0."); + } + timing_info->equal_picture_interval = + aom_rb_read_bit(rb); // Equal picture interval bit + if 
(timing_info->equal_picture_interval) { + const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb); + if (num_ticks_per_picture_minus_1 == UINT32_MAX) { + aom_internal_error( + error, AOM_CODEC_UNSUP_BITSTREAM, + "num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1."); + } + timing_info->num_ticks_per_picture = num_ticks_per_picture_minus_1 + 1; + } +} + +void av1_read_decoder_model_info(aom_dec_model_info_t *decoder_model_info, + struct aom_read_bit_buffer *rb) { + decoder_model_info->encoder_decoder_buffer_delay_length = + aom_rb_read_literal(rb, 5) + 1; + decoder_model_info->num_units_in_decoding_tick = + aom_rb_read_unsigned_literal(rb, + 32); // Number of units in a decoding tick + decoder_model_info->buffer_removal_time_length = + aom_rb_read_literal(rb, 5) + 1; + decoder_model_info->frame_presentation_time_length = + aom_rb_read_literal(rb, 5) + 1; +} + +void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params, + int buffer_delay_length, + struct aom_read_bit_buffer *rb) { + op_params->decoder_buffer_delay = + aom_rb_read_unsigned_literal(rb, buffer_delay_length); + op_params->encoder_buffer_delay = + aom_rb_read_unsigned_literal(rb, buffer_delay_length); + op_params->low_delay_mode_flag = aom_rb_read_bit(rb); +} + +static AOM_INLINE void read_temporal_point_info( + AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) { + cm->frame_presentation_time = aom_rb_read_unsigned_literal( + rb, cm->seq_params.decoder_model_info.frame_presentation_time_length); +} + +void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, + SequenceHeader *seq_params) { + const int num_bits_width = aom_rb_read_literal(rb, 4) + 1; + const int num_bits_height = aom_rb_read_literal(rb, 4) + 1; + const int max_frame_width = aom_rb_read_literal(rb, num_bits_width) + 1; + const int max_frame_height = aom_rb_read_literal(rb, num_bits_height) + 1; + + seq_params->num_bits_width = num_bits_width; + seq_params->num_bits_height = num_bits_height; + seq_params->max_frame_width = max_frame_width; + seq_params->max_frame_height = max_frame_height; + + if (seq_params->reduced_still_picture_hdr) { + seq_params->frame_id_numbers_present_flag = 0; + } else { + seq_params->frame_id_numbers_present_flag = aom_rb_read_bit(rb); + } + if (seq_params->frame_id_numbers_present_flag) { + // We must always have delta_frame_id_length < frame_id_length, + // in order for a frame to be referenced with a unique delta. + // Avoid wasting bits by using a coding that enforces this restriction.
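+    // Worked example of the coding read below: delta_frame_id_length is a
+    // 4-bit value plus 2 (so in [2, 17]) and frame_id_length is a 3-bit
+    // value plus delta_frame_id_length plus 1, so frame_id_length always
+    // exceeds delta_frame_id_length. E.g. raw values 12 and 1 give
+    // delta_frame_id_length = 14 and frame_id_length = 16, the largest
+    // length accepted by the range check that follows.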
+ seq_params->delta_frame_id_length = aom_rb_read_literal(rb, 4) + 2; + seq_params->frame_id_length = + aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1; + if (seq_params->frame_id_length > 16) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid frame_id_length"); + } + + setup_sb_size(seq_params, rb); + + seq_params->enable_filter_intra = aom_rb_read_bit(rb); + seq_params->enable_intra_edge_filter = aom_rb_read_bit(rb); + + if (seq_params->reduced_still_picture_hdr) { + seq_params->enable_interintra_compound = 0; + seq_params->enable_masked_compound = 0; + seq_params->enable_warped_motion = 0; + seq_params->enable_dual_filter = 0; + seq_params->order_hint_info.enable_order_hint = 0; + seq_params->order_hint_info.enable_dist_wtd_comp = 0; + seq_params->order_hint_info.enable_ref_frame_mvs = 0; + seq_params->force_screen_content_tools = 2; // SELECT_SCREEN_CONTENT_TOOLS + seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV + seq_params->order_hint_info.order_hint_bits_minus_1 = -1; + } else { + seq_params->enable_interintra_compound = aom_rb_read_bit(rb); + seq_params->enable_masked_compound = aom_rb_read_bit(rb); + seq_params->enable_warped_motion = aom_rb_read_bit(rb); + seq_params->enable_dual_filter = aom_rb_read_bit(rb); + + seq_params->order_hint_info.enable_order_hint = aom_rb_read_bit(rb); + seq_params->order_hint_info.enable_dist_wtd_comp = + seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0; + seq_params->order_hint_info.enable_ref_frame_mvs = + seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0; + + if (aom_rb_read_bit(rb)) { + seq_params->force_screen_content_tools = + 2; // SELECT_SCREEN_CONTENT_TOOLS + } else { + seq_params->force_screen_content_tools = aom_rb_read_bit(rb); + } + + if (seq_params->force_screen_content_tools > 0) { + if (aom_rb_read_bit(rb)) { + seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV + } else { + seq_params->force_integer_mv = aom_rb_read_bit(rb); + } + } else { + seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV + } + seq_params->order_hint_info.order_hint_bits_minus_1 = + seq_params->order_hint_info.enable_order_hint + ? aom_rb_read_literal(rb, 3) + : -1; + } + + seq_params->enable_superres = aom_rb_read_bit(rb); + seq_params->enable_cdef = aom_rb_read_bit(rb); + seq_params->enable_restoration = aom_rb_read_bit(rb); +} + +static int read_global_motion_params(WarpedMotionParams *params, + const WarpedMotionParams *ref_params, + struct aom_read_bit_buffer *rb, + int allow_hp) { + TransformationType type = aom_rb_read_bit(rb); + if (type != IDENTITY) { + if (aom_rb_read_bit(rb)) + type = ROTZOOM; + else + type = aom_rb_read_bit(rb) ? 
TRANSLATION : AFFINE; + } + + *params = default_warp_params; + params->wmtype = type; + + if (type >= ROTZOOM) { + params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS)) * + GM_ALPHA_DECODE_FACTOR + + (1 << WARPEDMODEL_PREC_BITS); + params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) * + GM_ALPHA_DECODE_FACTOR; + } + + if (type >= AFFINE) { + params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) * + GM_ALPHA_DECODE_FACTOR; + params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin( + rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS)) * + GM_ALPHA_DECODE_FACTOR + + (1 << WARPEDMODEL_PREC_BITS); + } else { + params->wmmat[4] = -params->wmmat[3]; + params->wmmat[5] = params->wmmat[2]; + } + + if (type >= TRANSLATION) { + const int trans_bits = (type == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + const int trans_dec_factor = + (type == TRANSLATION) ? GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp) + : GM_TRANS_DECODE_FACTOR; + const int trans_prec_diff = (type == TRANSLATION) + ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin( + rb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[0] >> trans_prec_diff)) * + trans_dec_factor; + params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin( + rb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[1] >> trans_prec_diff)) * + trans_dec_factor; + } + + if (params->wmtype <= AFFINE) { + int good_shear_params = av1_get_shear_params(params); + if (!good_shear_params) return 0; + } + + return 1; +} + +static AOM_INLINE void read_global_motion(AV1_COMMON *cm, + struct aom_read_bit_buffer *rb) { + for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { + const WarpedMotionParams *ref_params = + cm->prev_frame ? &cm->prev_frame->global_motion[frame] + : &default_warp_params; + int good_params = + read_global_motion_params(&cm->global_motion[frame], ref_params, rb, + cm->features.allow_high_precision_mv); + if (!good_params) { +#if WARPED_MOTION_DEBUG + printf("Warning: unexpected global motion shear params from aomenc\n"); +#endif + cm->global_motion[frame].invalid = 1; + } + + // TODO(sarahparker, debargha): The logic in the commented out code below + // does not work currently and causes mismatches when resize is on. Fix it + // before turning the optimization back on. 
+ /* + YV12_BUFFER_CONFIG *ref_buf = get_ref_frame(cm, frame); + if (cm->width == ref_buf->y_crop_width && + cm->height == ref_buf->y_crop_height) { + read_global_motion_params(&cm->global_motion[frame], + &cm->prev_frame->global_motion[frame], rb, + cm->features.allow_high_precision_mv); + } else { + cm->global_motion[frame] = default_warp_params; + } + */ + /* + printf("Dec Ref %d [%d/%d]: %d %d %d %d\n", + frame, cm->current_frame.frame_number, cm->show_frame, + cm->global_motion[frame].wmmat[0], + cm->global_motion[frame].wmmat[1], + cm->global_motion[frame].wmmat[2], + cm->global_motion[frame].wmmat[3]); + */ + } + memcpy(cm->cur_frame->global_motion, cm->global_motion, + REF_FRAMES * sizeof(WarpedMotionParams)); +} + +// Release the references to the frame buffers in cm->ref_frame_map and reset +// all elements of cm->ref_frame_map to NULL. +static AOM_INLINE void reset_ref_frame_map(AV1_COMMON *const cm) { + BufferPool *const pool = cm->buffer_pool; + + for (int i = 0; i < REF_FRAMES; i++) { + decrease_ref_count(cm->ref_frame_map[i], pool); + cm->ref_frame_map[i] = NULL; + } +} + +// If the refresh_frame_flags bitmask is set, update reference frame id values +// and mark frames as valid for reference. +static AOM_INLINE void update_ref_frame_id(AV1Decoder *const pbi) { + AV1_COMMON *const cm = &pbi->common; + int refresh_frame_flags = cm->current_frame.refresh_frame_flags; + for (int i = 0; i < REF_FRAMES; i++) { + if ((refresh_frame_flags >> i) & 1) { + cm->ref_frame_id[i] = cm->current_frame_id; + pbi->valid_for_referencing[i] = 1; + } + } +} + +static AOM_INLINE void show_existing_frame_reset(AV1Decoder *const pbi, + int existing_frame_idx) { + AV1_COMMON *const cm = &pbi->common; + + assert(cm->show_existing_frame); + + cm->current_frame.frame_type = KEY_FRAME; + + cm->current_frame.refresh_frame_flags = (1 << REF_FRAMES) - 1; + + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + cm->remapped_ref_idx[i] = INVALID_IDX; + } + + if (pbi->need_resync) { + reset_ref_frame_map(cm); + pbi->need_resync = 0; + } + + // Note that the displayed frame must be valid for referencing in order to + // have been selected. + cm->current_frame_id = cm->ref_frame_id[existing_frame_idx]; + update_ref_frame_id(pbi); + + cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; +} + +static INLINE void reset_frame_buffers(AV1_COMMON *cm) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + lock_buffer_pool(cm->buffer_pool); + reset_ref_frame_map(cm); + assert(cm->cur_frame->ref_count == 1); + for (i = 0; i < FRAME_BUFFERS; ++i) { + // Reset all unreferenced frame buffers. We can also reset cm->cur_frame + // because we are the sole owner of cm->cur_frame. + if (frame_bufs[i].ref_count > 0 && &frame_bufs[i] != cm->cur_frame) { + continue; + } + frame_bufs[i].order_hint = 0; + av1_zero(frame_bufs[i].ref_order_hints); + } + av1_zero_unused_internal_frame_buffers(&cm->buffer_pool->int_frame_buffers); + unlock_buffer_pool(cm->buffer_pool); +} + +// On success, returns 0. On failure, calls aom_internal_error and does not +// return. 
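+// ("Does not return" because aom_internal_error() reports the error through
+// the decoder's error handler, a setjmp/longjmp unwind, rather than
+// returning control to this function.)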
+static int read_uncompressed_header(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb) { + AV1_COMMON *const cm = &pbi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + CurrentFrame *const current_frame = &cm->current_frame; + FeatureFlags *const features = &cm->features; + MACROBLOCKD *const xd = &pbi->mb; + BufferPool *const pool = cm->buffer_pool; + RefCntBuffer *const frame_bufs = pool->frame_bufs; + + if (!pbi->sequence_header_ready) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "No sequence header"); + } + + if (seq_params->reduced_still_picture_hdr) { + cm->show_existing_frame = 0; + cm->show_frame = 1; + current_frame->frame_type = KEY_FRAME; + if (pbi->sequence_header_changed) { + // This is the start of a new coded video sequence. + pbi->sequence_header_changed = 0; + pbi->decoding_first_frame = 1; + reset_frame_buffers(cm); + } + features->error_resilient_mode = 1; + } else { + cm->show_existing_frame = aom_rb_read_bit(rb); + pbi->reset_decoder_state = 0; + + if (cm->show_existing_frame) { + if (pbi->sequence_header_changed) { + aom_internal_error( + &cm->error, AOM_CODEC_CORRUPT_FRAME, + "New sequence header starts with a show_existing_frame."); + } + // Show an existing frame directly. + const int existing_frame_idx = aom_rb_read_literal(rb, 3); + RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx]; + if (frame_to_show == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer does not contain a decoded frame"); + } + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) { + read_temporal_point_info(cm, rb); + } + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_length = seq_params->frame_id_length; + int display_frame_id = aom_rb_read_literal(rb, frame_id_length); + /* Compare display_frame_id with ref_frame_id and check valid for + * referencing */ + if (display_frame_id != cm->ref_frame_id[existing_frame_idx] || + pbi->valid_for_referencing[existing_frame_idx] == 0) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Reference buffer frame ID mismatch"); + } + lock_buffer_pool(pool); + assert(frame_to_show->ref_count > 0); + // cm->cur_frame should be the buffer referenced by the return value + // of the get_free_fb() call in assign_cur_frame_new_fb() (called by + // av1_receive_compressed_data()), so the ref_count should be 1. + assert(cm->cur_frame->ref_count == 1); + // assign_frame_buffer_p() decrements ref_count directly rather than + // calling decrease_ref_count(). If cm->cur_frame->raw_frame_buffer has + // already been allocated, it will not be released by + // assign_frame_buffer_p()! + assert(!cm->cur_frame->raw_frame_buffer.data); + assign_frame_buffer_p(&cm->cur_frame, frame_to_show); + pbi->reset_decoder_state = frame_to_show->frame_type == KEY_FRAME; + unlock_buffer_pool(pool); + + cm->lf.filter_level[0] = 0; + cm->lf.filter_level[1] = 0; + cm->show_frame = 1; + + // Section 6.8.2: It is a requirement of bitstream conformance that when + // show_existing_frame is used to show a previous frame, the value + // of showable_frame for the previous frame was equal to 1.
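+      // For example, a frame originally coded as a shown keyframe has
+      // showable_frame equal to 0 (see the assignment to cm->showable_frame
+      // further below), so an attempt to re-show it via show_existing_frame
+      // is rejected here.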
+ if (!frame_to_show->showable_frame) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer does not contain a showable frame"); + } + // Section 6.8.2: It is a requirement of bitstream conformance that when + // show_existing_frame is used to show a previous frame with + // RefFrameType[ frame_to_show_map_idx ] equal to KEY_FRAME, the + // frame is output via the show_existing_frame mechanism at most once. + if (pbi->reset_decoder_state) frame_to_show->showable_frame = 0; + + cm->film_grain_params = frame_to_show->film_grain_params; + + if (pbi->reset_decoder_state) { + show_existing_frame_reset(pbi, existing_frame_idx); + } else { + current_frame->refresh_frame_flags = 0; + } + + return 0; + } + + current_frame->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); + if (pbi->sequence_header_changed) { + if (current_frame->frame_type == KEY_FRAME) { + // This is the start of a new coded video sequence. + pbi->sequence_header_changed = 0; + pbi->decoding_first_frame = 1; + reset_frame_buffers(cm); + } else { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Sequence header has changed without a keyframe."); + } + } + + cm->show_frame = aom_rb_read_bit(rb); + if (seq_params->still_picture && + (current_frame->frame_type != KEY_FRAME || !cm->show_frame)) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Still pictures must be coded as shown keyframes"); + } + cm->showable_frame = current_frame->frame_type != KEY_FRAME; + if (cm->show_frame) { + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) + read_temporal_point_info(cm, rb); + } else { + // See if this frame can be used as show_existing_frame in the future + cm->showable_frame = aom_rb_read_bit(rb); + } + cm->cur_frame->showable_frame = cm->showable_frame; + features->error_resilient_mode = + frame_is_sframe(cm) || + (current_frame->frame_type == KEY_FRAME && cm->show_frame) + ?
1 + : aom_rb_read_bit(rb); + } + + if (current_frame->frame_type == KEY_FRAME && cm->show_frame) { + /* All frames need to be marked as not valid for referencing */ + for (int i = 0; i < REF_FRAMES; i++) { + pbi->valid_for_referencing[i] = 0; + } + } + features->disable_cdf_update = aom_rb_read_bit(rb); + if (seq_params->force_screen_content_tools == 2) { + features->allow_screen_content_tools = aom_rb_read_bit(rb); + } else { + features->allow_screen_content_tools = + seq_params->force_screen_content_tools; + } + + if (features->allow_screen_content_tools) { + if (seq_params->force_integer_mv == 2) { + features->cur_frame_force_integer_mv = aom_rb_read_bit(rb); + } else { + features->cur_frame_force_integer_mv = seq_params->force_integer_mv; + } + } else { + features->cur_frame_force_integer_mv = 0; + } + + int frame_size_override_flag = 0; + features->allow_intrabc = 0; + features->primary_ref_frame = PRIMARY_REF_NONE; + + if (!seq_params->reduced_still_picture_hdr) { + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_length = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; + int prev_frame_id = 0; + int have_prev_frame_id = + !pbi->decoding_first_frame && + !(current_frame->frame_type == KEY_FRAME && cm->show_frame); + if (have_prev_frame_id) { + prev_frame_id = cm->current_frame_id; + } + cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length); + + if (have_prev_frame_id) { + int diff_frame_id; + if (cm->current_frame_id > prev_frame_id) { + diff_frame_id = cm->current_frame_id - prev_frame_id; + } else { + diff_frame_id = + (1 << frame_id_length) + cm->current_frame_id - prev_frame_id; + } + /* Check current_frame_id for conformance */ + if (prev_frame_id == cm->current_frame_id || + diff_frame_id >= (1 << (frame_id_length - 1))) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid value of current_frame_id"); + } + } + /* Check if some frames need to be marked as not valid for referencing */ + for (int i = 0; i < REF_FRAMES; i++) { + if (cm->current_frame_id - (1 << diff_len) > 0) { + if (cm->ref_frame_id[i] > cm->current_frame_id || + cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len)) + pbi->valid_for_referencing[i] = 0; + } else { + if (cm->ref_frame_id[i] > cm->current_frame_id && + cm->ref_frame_id[i] < (1 << frame_id_length) + + cm->current_frame_id - (1 << diff_len)) + pbi->valid_for_referencing[i] = 0; + } + } + } + + frame_size_override_flag = frame_is_sframe(cm) ? 
1 : aom_rb_read_bit(rb); + + current_frame->order_hint = aom_rb_read_literal( + rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + current_frame->frame_number = current_frame->order_hint; + + if (!features->error_resilient_mode && !frame_is_intra_only(cm)) { + features->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS); + } + } + + if (seq_params->decoder_model_info_present_flag) { + cm->buffer_removal_time_present = aom_rb_read_bit(rb); + if (cm->buffer_removal_time_present) { + for (int op_num = 0; + op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { + if (seq_params->op_params[op_num].decoder_model_param_present_flag) { + if ((((seq_params->operating_point_idc[op_num] >> + cm->temporal_layer_id) & + 0x1) && + ((seq_params->operating_point_idc[op_num] >> + (cm->spatial_layer_id + 8)) & + 0x1)) || + seq_params->operating_point_idc[op_num] == 0) { + cm->buffer_removal_times[op_num] = aom_rb_read_unsigned_literal( + rb, seq_params->decoder_model_info.buffer_removal_time_length); + } else { + cm->buffer_removal_times[op_num] = 0; + } + } else { + cm->buffer_removal_times[op_num] = 0; + } + } + } + } + if (current_frame->frame_type == KEY_FRAME) { + if (!cm->show_frame) { // unshown keyframe (forward keyframe) + current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); + } else { // shown keyframe + current_frame->refresh_frame_flags = (1 << REF_FRAMES) - 1; + } + + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + cm->remapped_ref_idx[i] = INVALID_IDX; + } + if (pbi->need_resync) { + reset_ref_frame_map(cm); + pbi->need_resync = 0; + } + } else { + if (current_frame->frame_type == INTRA_ONLY_FRAME) { + current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); + if (current_frame->refresh_frame_flags == 0xFF) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Intra only frames cannot have refresh flags 0xFF"); + } + if (pbi->need_resync) { + reset_ref_frame_map(cm); + pbi->need_resync = 0; + } + } else if (pbi->need_resync != 1) { /* Skip if need resync */ + current_frame->refresh_frame_flags = + frame_is_sframe(cm) ? 0xFF : aom_rb_read_literal(rb, REF_FRAMES); + } + } + + if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xFF) { + // Read all ref frame order hints if error_resilient_mode == 1 + if (features->error_resilient_mode && + seq_params->order_hint_info.enable_order_hint) { + for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { + // Read order hint from bit stream + unsigned int order_hint = aom_rb_read_literal( + rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + // Get buffer + RefCntBuffer *buf = cm->ref_frame_map[ref_idx]; + if (buf == NULL || order_hint != buf->order_hint) { + if (buf != NULL) { + lock_buffer_pool(pool); + decrease_ref_count(buf, pool); + unlock_buffer_pool(pool); + cm->ref_frame_map[ref_idx] = NULL; + } + // If no corresponding buffer exists, allocate a new buffer with all + // pixels set to neutral grey. 
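+        // ("Neutral grey" is the mid-range sample value, 1 << (bit_depth -
+        // 1) in each plane; see the note on set_planes_to_neutral_grey()
+        // below.)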
+ int buf_idx = get_free_fb(cm); + if (buf_idx == INVALID_IDX) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Unable to find free frame buffer"); + } + buf = &frame_bufs[buf_idx]; + lock_buffer_pool(pool); + if (aom_realloc_frame_buffer( + &buf->buf, seq_params->max_frame_width, + seq_params->max_frame_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_BORDER_IN_PIXELS, features->byte_alignment, + &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { + decrease_ref_count(buf, pool); + unlock_buffer_pool(pool); + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + unlock_buffer_pool(pool); + // According to the specification, valid bitstreams are required to + // never use missing reference frames so the filling process for + // missing frames is not normatively defined and RefValid for missing + // frames is set to 0. + + // To make libaom more robust when the bitstream has been corrupted + // by the loss of some frames of data, this code adds a neutral grey + // buffer in place of missing frames, i.e. + // + set_planes_to_neutral_grey(seq_params, &buf->buf, 0); + // + // and allows the frames to be used for referencing, i.e. + // + pbi->valid_for_referencing[ref_idx] = 1; + // + // Please note such behavior is not normative and other decoders may + // use a different approach. + cm->ref_frame_map[ref_idx] = buf; + buf->order_hint = order_hint; + } + } + } + } + + if (current_frame->frame_type == KEY_FRAME) { + setup_frame_size(cm, frame_size_override_flag, rb); + + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + features->allow_intrabc = aom_rb_read_bit(rb); + features->allow_ref_frame_mvs = 0; + cm->prev_frame = NULL; + } else { + features->allow_ref_frame_mvs = 0; + + if (current_frame->frame_type == INTRA_ONLY_FRAME) { + cm->cur_frame->film_grain_params_present = + seq_params->film_grain_params_present; + setup_frame_size(cm, frame_size_override_flag, rb); + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + features->allow_intrabc = aom_rb_read_bit(rb); + + } else if (pbi->need_resync != 1) { /* Skip if need resync */ + int frame_refs_short_signaling = 0; + // Frame refs short signaling is off when error resilient mode is on. + if (seq_params->order_hint_info.enable_order_hint) + frame_refs_short_signaling = aom_rb_read_bit(rb); + + if (frame_refs_short_signaling) { + // == LAST_FRAME == + const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); + const RefCntBuffer *const lst_buf = cm->ref_frame_map[lst_ref]; + + // == GOLDEN_FRAME == + const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); + const RefCntBuffer *const gld_buf = cm->ref_frame_map[gld_ref]; + + // Most of the time, streams start with a keyframe. In that case, + // ref_frame_map will have been filled in at that point and will not + // contain any NULLs. However, streams are explicitly allowed to start + // with an intra-only frame, so long as they don't then signal a + // reference to a slot that hasn't been set yet. That's what we are + // checking here. 
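+        // For example, a stream whose first coded frame is intra-only and
+        // whose second frame signals a reference through a slot that no
+        // frame has refreshed yet would be caught by the NULL checks below.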
+ if (lst_buf == NULL) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests nonexistent reference"); + if (gld_buf == NULL) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests nonexistent reference"); + + av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref); + } + + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + int ref = 0; + if (!frame_refs_short_signaling) { + ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); + + // Most of the time, streams start with a keyframe. In that case, + // ref_frame_map will have been filled in at that point and will not + // contain any NULLs. However, streams are explicitly allowed to start + // with an intra-only frame, so long as they don't then signal a + // reference to a slot that hasn't been set yet. That's what we are + // checking here. + if (cm->ref_frame_map[ref] == NULL) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Inter frame requests nonexistent reference"); + cm->remapped_ref_idx[i] = ref; + } else { + ref = cm->remapped_ref_idx[i]; + } + // Check valid for referencing + if (pbi->valid_for_referencing[ref] == 0) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Reference frame not valid for referencing"); + + cm->ref_frame_sign_bias[LAST_FRAME + i] = 0; + + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_length = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; + int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len); + int ref_frame_id = + ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) + + (1 << frame_id_length)) % + (1 << frame_id_length)); + // Compare values derived from delta_frame_id_minus_1 and + // refresh_frame_flags. + if (ref_frame_id != cm->ref_frame_id[ref]) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Reference buffer frame ID mismatch"); + } + } + + if (!features->error_resilient_mode && frame_size_override_flag) { + setup_frame_size_with_refs(cm, rb); + } else { + setup_frame_size(cm, frame_size_override_flag, rb); + } + + if (features->cur_frame_force_integer_mv) { + features->allow_high_precision_mv = 0; + } else { + features->allow_high_precision_mv = aom_rb_read_bit(rb); + } + features->interp_filter = read_frame_interp_filter(rb); + features->switchable_motion_mode = aom_rb_read_bit(rb); + } + + cm->prev_frame = get_primary_ref_frame_buf(cm); + if (features->primary_ref_frame != PRIMARY_REF_NONE && + get_primary_ref_frame_buf(cm) == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Reference frame containing this frame's initial " + "frame context is unavailable."); + } + + if (!(current_frame->frame_type == INTRA_ONLY_FRAME) && + pbi->need_resync != 1) { + if (frame_might_allow_ref_frame_mvs(cm)) + features->allow_ref_frame_mvs = aom_rb_read_bit(rb); + else + features->allow_ref_frame_mvs = 0; + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i); + struct scale_factors *const ref_scale_factors = + get_ref_scale_factors(cm, i); + av1_setup_scale_factors_for_frame( + ref_scale_factors, ref_buf->buf.y_crop_width, + ref_buf->buf.y_crop_height, cm->width, cm->height); + if ((!av1_is_valid_scale(ref_scale_factors))) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Reference frame has invalid dimensions"); + } + } + } + + av1_setup_frame_buf_refs(cm); + + av1_setup_frame_sign_bias(cm); + + cm->cur_frame->frame_type = current_frame->frame_type; + + 
update_ref_frame_id(pbi); + + const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) && + !(features->disable_cdf_update); + if (might_bwd_adapt) { + features->refresh_frame_context = aom_rb_read_bit(rb) + ? REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + } else { + features->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + } + + cm->cur_frame->buf.bit_depth = seq_params->bit_depth; + cm->cur_frame->buf.color_primaries = seq_params->color_primaries; + cm->cur_frame->buf.transfer_characteristics = + seq_params->transfer_characteristics; + cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; + cm->cur_frame->buf.monochrome = seq_params->monochrome; + cm->cur_frame->buf.chroma_sample_position = + seq_params->chroma_sample_position; + cm->cur_frame->buf.color_range = seq_params->color_range; + cm->cur_frame->buf.render_width = cm->render_width; + cm->cur_frame->buf.render_height = cm->render_height; + + if (pbi->need_resync) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Keyframe / intra-only frame required to reset decoder" + " state"); + } + + if (features->allow_intrabc) { + // Set parameters corresponding to no filtering. + struct loopfilter *lf = &cm->lf; + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + cm->cdef_info.cdef_bits = 0; + cm->cdef_info.cdef_strengths[0] = 0; + cm->cdef_info.nb_cdef_strengths = 1; + cm->cdef_info.cdef_uv_strengths[0] = 0; + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } + + read_tile_info(pbi, rb); + if (!av1_is_min_tile_width_satisfied(cm)) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Minimum tile width requirement not satisfied"); + } + + CommonQuantParams *const quant_params = &cm->quant_params; + setup_quantization(quant_params, av1_num_planes(cm), + cm->seq_params.separate_uv_delta_q, rb); + xd->bd = (int)seq_params->bit_depth; + + CommonContexts *const above_contexts = &cm->above_contexts; + if (above_contexts->num_planes < av1_num_planes(cm) || + above_contexts->num_mi_cols < cm->mi_params.mi_cols || + above_contexts->num_tile_rows < cm->tiles.rows) { + av1_free_above_context_buffers(above_contexts); + if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, + cm->mi_params.mi_cols, + av1_num_planes(cm))) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + } + + if (features->primary_ref_frame == PRIMARY_REF_NONE) { + av1_setup_past_independence(cm); + } + + setup_segmentation(cm, rb); + + cm->delta_q_info.delta_q_res = 1; + cm->delta_q_info.delta_lf_res = 1; + cm->delta_q_info.delta_lf_present_flag = 0; + cm->delta_q_info.delta_lf_multi = 0; + cm->delta_q_info.delta_q_present_flag = + quant_params->base_qindex > 0 ? 
aom_rb_read_bit(rb) : 0; + if (cm->delta_q_info.delta_q_present_flag) { + xd->current_qindex = quant_params->base_qindex; + cm->delta_q_info.delta_q_res = 1 << aom_rb_read_literal(rb, 2); + if (!features->allow_intrabc) + cm->delta_q_info.delta_lf_present_flag = aom_rb_read_bit(rb); + if (cm->delta_q_info.delta_lf_present_flag) { + cm->delta_q_info.delta_lf_res = 1 << aom_rb_read_literal(rb, 2); + cm->delta_q_info.delta_lf_multi = aom_rb_read_bit(rb); + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + } + } + + xd->cur_frame_force_integer_mv = features->cur_frame_force_integer_mv; + + for (int i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = av1_get_qindex(&cm->seg, i, quant_params->base_qindex); + xd->lossless[i] = + qindex == 0 && quant_params->y_dc_delta_q == 0 && + quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 && + quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0; + xd->qindex[i] = qindex; + } + features->coded_lossless = is_coded_lossless(cm, xd); + features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm); + setup_segmentation_dequant(cm, xd); + if (features->coded_lossless) { + cm->lf.filter_level[0] = 0; + cm->lf.filter_level[1] = 0; + } + if (features->coded_lossless || !seq_params->enable_cdef) { + cm->cdef_info.cdef_bits = 0; + cm->cdef_info.cdef_strengths[0] = 0; + cm->cdef_info.cdef_uv_strengths[0] = 0; + } + if (features->all_lossless || !seq_params->enable_restoration) { + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } + setup_loopfilter(cm, rb); + + if (!features->coded_lossless && seq_params->enable_cdef) { + setup_cdef(cm, rb); + } + if (!features->all_lossless && seq_params->enable_restoration) { + decode_restoration_mode(cm, rb); + } + + features->tx_mode = read_tx_mode(rb, features->coded_lossless); + current_frame->reference_mode = read_frame_reference_mode(cm, rb); + + av1_setup_skip_mode_allowed(cm); + current_frame->skip_mode_info.skip_mode_flag = + current_frame->skip_mode_info.skip_mode_allowed ? 
aom_rb_read_bit(rb) : 0; + + if (frame_might_allow_warped_motion(cm)) + features->allow_warped_motion = aom_rb_read_bit(rb); + else + features->allow_warped_motion = 0; + + features->reduced_tx_set_used = aom_rb_read_bit(rb); + + if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Frame wrongly requests reference frame MVs"); + } + + if (!frame_is_intra_only(cm)) read_global_motion(cm, rb); + + cm->cur_frame->film_grain_params_present = + seq_params->film_grain_params_present; + read_film_grain(cm, rb); + +#if EXT_TILE_DEBUG + if (pbi->ext_tile_debug && cm->tiles.large_scale) { + read_ext_tile_info(pbi, rb); + av1_set_single_tile_decoding_mode(cm); + } +#endif // EXT_TILE_DEBUG + return 0; +} + +struct aom_read_bit_buffer *av1_init_read_bit_buffer( + AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, + const uint8_t *data_end) { + rb->bit_offset = 0; + rb->error_handler = error_handler; + rb->error_handler_data = &pbi->common; + rb->bit_buffer = data; + rb->bit_buffer_end = data_end; + return rb; +} + +void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width, + int num_bits_height, int *width, int *height) { + *width = aom_rb_read_literal(rb, num_bits_width) + 1; + *height = aom_rb_read_literal(rb, num_bits_height) + 1; +} + +BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) { + int profile = aom_rb_read_literal(rb, PROFILE_BITS); + return (BITSTREAM_PROFILE)profile; +} + +static AOM_INLINE void superres_post_decode(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + + if (!av1_superres_scaled(cm)) return; + assert(!cm->features.all_lossless); + + av1_superres_upscale(cm, pool); +} + +uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t **p_data_end, + int trailing_bits_present) { + AV1_COMMON *const cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &pbi->mb; + +#if CONFIG_BITSTREAM_DEBUG + aom_bitstream_queue_set_frame_read(cm->current_frame.frame_number * 2 + + cm->show_frame); +#endif +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_r(); +#endif + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + cm->global_motion[i] = default_warp_params; + cm->cur_frame->global_motion[i] = default_warp_params; + } + xd->global_motion = cm->global_motion; + + read_uncompressed_header(pbi, rb); + + if (trailing_bits_present) av1_check_trailing_bits(pbi, rb); + + if (!cm->tiles.single_tile_decoding && + (pbi->dec_tile_row >= 0 || pbi->dec_tile_col >= 0)) { + pbi->dec_tile_row = -1; + pbi->dec_tile_col = -1; + } + + const uint32_t uncomp_hdr_size = + (uint32_t)aom_rb_bytes_read(rb); // Size of the uncompressed header + YV12_BUFFER_CONFIG *new_fb = &cm->cur_frame->buf; + xd->cur_buf = new_fb; + if (av1_allow_intrabc(cm)) { + av1_setup_scale_factors_for_frame( + &cm->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height, + xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height); + } + + if (cm->show_existing_frame) { + // showing a frame directly + *p_data_end = data + uncomp_hdr_size; + if (pbi->reset_decoder_state) { + // Use the default frame context values. 
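+      // (pbi->reset_decoder_state is set when the re-shown frame is a
+      // keyframe; this acts like a decoder reset, so the CDFs are reloaded
+      // from the default tables rather than from a stored frame context.)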
+ *cm->fc = *cm->default_frame_context; + if (!cm->fc->initialized) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Uninitialized entropy context."); + } + return uncomp_hdr_size; + } + + cm->mi_params.setup_mi(&cm->mi_params); + + av1_setup_motion_field(cm); + + av1_setup_block_planes(xd, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, num_planes); + if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { + // use the default frame context values + *cm->fc = *cm->default_frame_context; + } else { + *cm->fc = get_primary_ref_frame_buf(cm)->frame_context; + } + if (!cm->fc->initialized) + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Uninitialized entropy context."); + + xd->corrupted = 0; + return uncomp_hdr_size; +} + +// Once-per-frame initialization +static AOM_INLINE void setup_frame_info(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + + if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { + av1_alloc_restoration_buffers(cm); + } + const int use_highbd = cm->seq_params.use_highbitdepth; + const int buf_size = MC_TEMP_BUF_PELS << use_highbd; + if (pbi->td.mc_buf_size != buf_size) { + av1_free_mc_tmp_buf(&pbi->td); + allocate_mc_tmp_buf(cm, &pbi->td, buf_size, use_highbd); + } +} + +void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end, int start_tile, + int end_tile, int initialize_flag) { + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + MACROBLOCKD *const xd = &pbi->mb; + const int tile_count_tg = end_tile - start_tile + 1; + + if (initialize_flag) setup_frame_info(pbi); + const int num_planes = av1_num_planes(cm); +#if CONFIG_LPF_MASK + av1_loop_filter_frame_init(cm, 0, num_planes); +#endif + + if (pbi->max_threads > 1 && !(tiles->large_scale && !pbi->ext_tile_debug) && + pbi->row_mt) + *p_data_end = + decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile); + else if (pbi->max_threads > 1 && tile_count_tg > 1 && + !(tiles->large_scale && !pbi->ext_tile_debug)) + *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile); + else + *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile); + + // If the bit stream is monochrome, set the U and V buffers to a constant. 
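+  // (The constant is neutral grey; the final argument of
+  // set_planes_to_neutral_grey() distinguishes this chroma-only fill from
+  // the all-planes fill used above for missing reference buffers.)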
+ if (num_planes < 3) { + set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1); + } + + if (end_tile != tiles->rows * tiles->cols - 1) { + return; + } + + if (!cm->features.allow_intrabc && !tiles->single_tile_decoding) { + if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) { + if (pbi->num_workers > 1) { + av1_loop_filter_frame_mt( + &cm->cur_frame->buf, cm, &pbi->mb, 0, num_planes, 0, +#if CONFIG_LPF_MASK + 1, +#endif + pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync); + } else { + av1_loop_filter_frame(&cm->cur_frame->buf, cm, &pbi->mb, +#if CONFIG_LPF_MASK + 1, +#endif + 0, num_planes, 0); + } + } + + const int do_loop_restoration = + cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE; + const int do_cdef = + !pbi->skip_loop_filter && !cm->features.coded_lossless && + (cm->cdef_info.cdef_bits || cm->cdef_info.cdef_strengths[0] || + cm->cdef_info.cdef_uv_strengths[0]); + const int do_superres = av1_superres_scaled(cm); + const int optimized_loop_restoration = !do_cdef && !do_superres; + + if (!optimized_loop_restoration) { + if (do_loop_restoration) + av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf, + cm, 0); + + if (do_cdef) av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->mb); + + superres_post_decode(pbi); + + if (do_loop_restoration) { + av1_loop_restoration_save_boundary_lines(&pbi->common.cur_frame->buf, + cm, 1); + if (pbi->num_workers > 1) { + av1_loop_restoration_filter_frame_mt( + (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration, + pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync, + &pbi->lr_ctxt); + } else { + av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, + cm, optimized_loop_restoration, + &pbi->lr_ctxt); + } + } + } else { + // In the no-CDEF, no-superres case, provide an optimized version of + // the loop restoration filter. + if (do_loop_restoration) { + if (pbi->num_workers > 1) { + av1_loop_restoration_filter_frame_mt( + (YV12_BUFFER_CONFIG *)xd->cur_buf, cm, optimized_loop_restoration, + pbi->tile_workers, pbi->num_workers, &pbi->lr_row_sync, + &pbi->lr_ctxt); + } else { + av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, + cm, optimized_loop_restoration, + &pbi->lr_ctxt); + } + } + } + } +#if CONFIG_LPF_MASK + av1_zero_array(cm->lf.lfm, cm->lf.lfm_num); +#endif + + if (!xd->corrupted) { + if (cm->features.refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + assert(pbi->context_update_tile_id < pbi->allocated_tiles); + *cm->fc = pbi->tile_data[pbi->context_update_tile_id].tctx; + av1_reset_cdf_symbol_counters(cm->fc); + } + } else { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); + } + +#if CONFIG_INSPECTION + if (pbi->inspect_cb != NULL) { + (*pbi->inspect_cb)(pbi, pbi->inspect_ctx); + } +#endif + + // Non-frame-parallel case: update the frame context here. + if (!tiles->large_scale) { + cm->cur_frame->frame_context = *cm->fc; + } +} diff --git a/libs/libaom/src/av1/decoder/decodeframe.h b/libs/libaom/src/av1/decoder/decodeframe.h new file mode 100644 index 000000000..95b3c9f22 --- /dev/null +++ b/libs/libaom/src/av1/decoder/decodeframe.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DECODEFRAME_H_ +#define AOM_AV1_DECODER_DECODEFRAME_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Decoder; +struct aom_read_bit_buffer; +struct ThreadData; + +// Reads the middle part of the sequence header OBU (from +// frame_width_bits_minus_1 to enable_restoration) into seq_params. +// Reports errors by calling rb->error_handler() or aom_internal_error(). +void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, + SequenceHeader *seq_params); + +void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width, + int num_bits_height, int *width, int *height); +BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb); + +// Returns 0 on success. Sets pbi->common.error.error_code and returns -1 on +// failure. +int av1_check_trailing_bits(struct AV1Decoder *pbi, + struct aom_read_bit_buffer *rb); + +// On success, returns the frame header size. On failure, calls +// aom_internal_error and does not return. +// TODO(wtc): Figure out and document the p_data_end parameter. +uint32_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t **p_data_end, + int trailing_bits_present); + +void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end, int start_tile, + int end_tile, int initialize_flag); + +// Implements the color_config() function in the spec. Reports errors by +// calling rb->error_handler() or aom_internal_error(). +void av1_read_color_config(struct aom_read_bit_buffer *rb, + int allow_lowbitdepth, SequenceHeader *seq_params, + struct aom_internal_error_info *error_info); + +// Implements the timing_info() function in the spec. Reports errors by calling +// rb->error_handler() or aom_internal_error(). +void av1_read_timing_info_header(aom_timing_info_t *timing_info, + struct aom_internal_error_info *error, + struct aom_read_bit_buffer *rb); + +// Implements the decoder_model_info() function in the spec. Reports errors by +// calling rb->error_handler(). +void av1_read_decoder_model_info(aom_dec_model_info_t *decoder_model_info, + struct aom_read_bit_buffer *rb); + +// Implements the operating_parameters_info() function in the spec. Reports +// errors by calling rb->error_handler(). +void av1_read_op_parameters_info(aom_dec_model_op_parameters_t *op_params, + int buffer_delay_length, + struct aom_read_bit_buffer *rb); + +struct aom_read_bit_buffer *av1_init_read_bit_buffer( + struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data, + const uint8_t *data_end); + +void av1_free_mc_tmp_buf(struct ThreadData *thread_data); + +void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_DECODER_DECODEFRAME_H_ diff --git a/libs/libaom/src/av1/decoder/decodemv.c b/libs/libaom/src/av1/decoder/decodemv.c new file mode 100644 index 000000000..e97cec42c --- /dev/null +++ b/libs/libaom/src/av1/decoder/decodemv.c @@ -0,0 +1,1575 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> + +#include "av1/common/cfl.h" +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/entropymode.h" +#include "av1/common/entropymv.h" +#include "av1/common/mvref_common.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/seg_common.h" +#include "av1/common/warped_motion.h" + +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/decodemv.h" + +#include "aom_dsp/aom_dsp_common.h" + +#define ACCT_STR __func__ + +#define DEC_MISMATCH_DEBUG 0 + +static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) { + return (PREDICTION_MODE)aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR); +} + +static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd) { + const int skip = xd->mi[0]->skip; + if (cm->features.coded_lossless) return; + if (cm->features.allow_intrabc) { + assert(cm->cdef_info.cdef_bits == 0); + return; + } + + // At the start of a superblock, mark that we haven't yet read CDEF strengths + // for any of the CDEF units contained in this superblock. + const int sb_mask = (cm->seq_params.mib_size - 1); + const int mi_row_in_sb = (xd->mi_row & sb_mask); + const int mi_col_in_sb = (xd->mi_col & sb_mask); + if (mi_row_in_sb == 0 && mi_col_in_sb == 0) { + xd->cdef_transmitted[0] = xd->cdef_transmitted[1] = + xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false; + } + + // CDEF unit size is 64x64 irrespective of the superblock size. + const int cdef_size = 1 << (6 - MI_SIZE_LOG2); + + // Find index of this CDEF unit in this superblock. + const int index_mask = cdef_size; + const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0); + const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0); + const int index = (cm->seq_params.sb_size == BLOCK_128X128) + ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb + : 0; + + // Read CDEF strength from the first non-skip coding block in this CDEF unit. + if (!xd->cdef_transmitted[index] && !skip) { + // CDEF strength for this CDEF unit needs to be read into the MB_MODE_INFO + // of the 1st block in this CDEF unit. 
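+    // Worked example: with MI_SIZE_LOG2 == 2, cdef_size is 16 mi units
+    // (64 px), so a block at (mi_row, mi_col) = (21, 35) reads and stores
+    // the strength at the unit origin (21 & ~15, 35 & ~15) = (16, 32).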
+ const int first_block_mask = ~(cdef_size - 1); + CommonModeInfoParams *const mi_params = &cm->mi_params; + const int grid_idx = + get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask, + xd->mi_col & first_block_mask); + MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx]; + mbmi->cdef_strength = + aom_read_literal(r, cm->cdef_info.cdef_bits, ACCT_STR); + xd->cdef_transmitted[index] = true; + } +} + +static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd, + aom_reader *r, MB_MODE_INFO *const mbmi) { + int sign, abs, reduced_delta_qindex = 0; + BLOCK_SIZE bsize = mbmi->sb_type; + const int b_col = xd->mi_col & (cm->seq_params.mib_size - 1); + const int b_row = xd->mi_row & (cm->seq_params.mib_size - 1); + const int read_delta_q_flag = (b_col == 0 && b_row == 0); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) && + read_delta_q_flag) { + abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR); + const int smallval = (abs < DELTA_Q_SMALL); + + if (!smallval) { + const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; + const int thr = (1 << rem_bits) + 1; + abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr; + } + + if (abs) { + sign = aom_read_bit(r, ACCT_STR); + } else { + sign = 1; + } + + reduced_delta_qindex = sign ? -abs : abs; + } + return reduced_delta_qindex; +} +static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r, + aom_cdf_prob *const cdf, + const MB_MODE_INFO *const mbmi, int mi_col, + int mi_row) { + int reduced_delta_lflevel = 0; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int b_col = mi_col & (cm->seq_params.mib_size - 1); + const int b_row = mi_row & (cm->seq_params.mib_size - 1); + const int read_delta_lf_flag = (b_col == 0 && b_row == 0); + + if ((bsize != cm->seq_params.sb_size || mbmi->skip == 0) && + read_delta_lf_flag) { + int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR); + const int smallval = (abs < DELTA_LF_SMALL); + if (!smallval) { + const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1; + const int thr = (1 << rem_bits) + 1; + abs = aom_read_literal(r, rem_bits, ACCT_STR) + thr; + } + const int sign = abs ? aom_read_bit(r, ACCT_STR) : 1; + reduced_delta_lflevel = sign ? 
-abs : abs; + } + return reduced_delta_lflevel; +} + +static UV_PREDICTION_MODE read_intra_mode_uv(FRAME_CONTEXT *ec_ctx, + aom_reader *r, + CFL_ALLOWED_TYPE cfl_allowed, + PREDICTION_MODE y_mode) { + const UV_PREDICTION_MODE uv_mode = + aom_read_symbol(r, ec_ctx->uv_mode_cdf[cfl_allowed][y_mode], + UV_INTRA_MODES - !cfl_allowed, ACCT_STR); + return uv_mode; +} + +static uint8_t read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r, + int8_t *signs_out) { + const int8_t joint_sign = + aom_read_symbol(r, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS, "cfl:signs"); + uint8_t idx = 0; + // Magnitudes are only coded for nonzero values + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + idx = (uint8_t)aom_read_symbol(r, cdf_u, CFL_ALPHABET_SIZE, "cfl:alpha_u") + << CFL_ALPHABET_SIZE_LOG2; + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + idx += (uint8_t)aom_read_symbol(r, cdf_v, CFL_ALPHABET_SIZE, "cfl:alpha_v"); + } + *signs_out = joint_sign; + return idx; +} + +static INTERINTRA_MODE read_interintra_mode(MACROBLOCKD *xd, aom_reader *r, + int size_group) { + const INTERINTRA_MODE ii_mode = (INTERINTRA_MODE)aom_read_symbol( + r, xd->tile_ctx->interintra_mode_cdf[size_group], INTERINTRA_MODES, + ACCT_STR); + return ii_mode; +} + +static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, aom_reader *r, + int16_t ctx) { + int16_t mode_ctx = ctx & NEWMV_CTX_MASK; + int is_newmv, is_zeromv, is_refmv; + is_newmv = aom_read_symbol(r, ec_ctx->newmv_cdf[mode_ctx], 2, ACCT_STR) == 0; + if (is_newmv) return NEWMV; + + mode_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + is_zeromv = + aom_read_symbol(r, ec_ctx->zeromv_cdf[mode_ctx], 2, ACCT_STR) == 0; + if (is_zeromv) return GLOBALMV; + + mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + is_refmv = aom_read_symbol(r, ec_ctx->refmv_cdf[mode_ctx], 2, ACCT_STR) == 0; + if (is_refmv) + return NEARESTMV; + else + return NEARMV; +} + +static void read_drl_idx(FRAME_CONTEXT *ec_ctx, MACROBLOCKD *xd, + MB_MODE_INFO *mbmi, aom_reader *r) { + uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + mbmi->ref_mv_idx = 0; + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { + for (int idx = 0; idx < 2; ++idx) { + if (xd->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx); + int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR); + mbmi->ref_mv_idx = idx + drl_idx; + if (!drl_idx) return; + } + } + } + if (have_nearmv_in_inter_mode(mbmi->mode)) { + // Offset the NEARESTMV mode. + // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV + // mode is factored in. 
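+    // The "- 1" below keeps ref_mv_idx zero-based for the NEARMV-style
+    // modes: stack entry 0 belongs to NEARESTMV, so stack entry idx maps to
+    // ref_mv_idx = idx - 1 plus any extra steps signalled by the drl bits.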
+ for (int idx = 1; idx < 3; ++idx) { + if (xd->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(xd->weight[ref_frame_type], idx); + int drl_idx = aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR); + mbmi->ref_mv_idx = idx + drl_idx - 1; + if (!drl_idx) return; + } + } + } +} + +static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd, + MB_MODE_INFO *mbmi, aom_reader *r) { + if (cm->features.switchable_motion_mode == 0) return SIMPLE_TRANSLATION; + if (mbmi->skip_mode) return SIMPLE_TRANSLATION; + + const MOTION_MODE last_motion_mode_allowed = motion_mode_allowed( + xd->global_motion, xd, mbmi, cm->features.allow_warped_motion); + int motion_mode; + + if (last_motion_mode_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION; + + if (last_motion_mode_allowed == OBMC_CAUSAL) { + motion_mode = + aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2, ACCT_STR); + return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode); + } else { + motion_mode = + aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->sb_type], + MOTION_MODES, ACCT_STR); + return (MOTION_MODE)(SIMPLE_TRANSLATION + motion_mode); + } +} + +static PREDICTION_MODE read_inter_compound_mode(MACROBLOCKD *xd, aom_reader *r, + int16_t ctx) { + const int mode = + aom_read_symbol(r, xd->tile_ctx->inter_compound_mode_cdf[ctx], + INTER_COMPOUND_MODES, ACCT_STR); + assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode)); + return NEAREST_NEARESTMV + mode; +} + +int av1_neg_deinterleave(int diff, int ref, int max) { + if (!ref) return diff; + if (ref >= (max - 1)) return max - diff - 1; + if (2 * ref < max) { + if (diff <= 2 * ref) { + if (diff & 1) + return ref + ((diff + 1) >> 1); + else + return ref - (diff >> 1); + } + return diff; + } else { + if (diff <= 2 * (max - ref - 1)) { + if (diff & 1) + return ref + ((diff + 1) >> 1); + else + return ref - (diff >> 1); + } + return max - (diff + 1); + } +} + +static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd, + aom_reader *r, int skip) { + int cdf_num; + const int pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num); + if (skip) return pred; + + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; + const int coded_id = aom_read_symbol(r, pred_cdf, MAX_SEGMENTS, ACCT_STR); + const int segment_id = + av1_neg_deinterleave(coded_id, pred, seg->last_active_segid + 1); + + if (segment_id < 0 || segment_id > seg->last_active_segid) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Corrupted segment_ids"); + } + return segment_id; +} + +static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids, + int mi_offset, int x_mis, int y_mis) { + int segment_id = INT_MAX; + + for (int y = 0; y < y_mis; y++) + for (int x = 0; x < x_mis; x++) + segment_id = AOMMIN( + segment_id, segment_ids[mi_offset + y * cm->mi_params.mi_cols + x]); + + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); + return segment_id; +} + +static void set_segment_id(AV1_COMMON *cm, int mi_offset, int x_mis, int y_mis, + int segment_id) { + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); + + for (int y = 0; y < y_mis; y++) + for (int x = 0; x < x_mis; x++) + cm->cur_frame->seg_map[mi_offset + y * cm->mi_params.mi_cols + x] = + segment_id; +} + +static int read_intra_segment_id(AV1_COMMON *const cm, + const MACROBLOCKD *const xd, int bsize, + aom_reader *r, int skip) { 
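+  // Read one segment id for the whole intra block, then propagate it to + // every mi unit the block covers (clamped to the frame boundary below).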
+ struct segmentation *const seg = &cm->seg; + if (!seg->enabled) return 0; // Default for disabled segmentation + assert(seg->update_map && !seg->temporal_update); + + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int mi_offset = mi_row * mi_params->mi_cols + mi_col; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh); + const int segment_id = read_segment_id(cm, xd, r, skip); + set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); + return segment_id; +} + +static void copy_segment_id(const CommonModeInfoParams *const mi_params, + const uint8_t *last_segment_ids, + uint8_t *current_segment_ids, int mi_offset, + int x_mis, int y_mis) { + for (int y = 0; y < y_mis; y++) + for (int x = 0; x < x_mis; x++) + current_segment_ids[mi_offset + y * mi_params->mi_cols + x] = + last_segment_ids + ? last_segment_ids[mi_offset + y * mi_params->mi_cols + x] + : 0; +} + +static int get_predicted_segment_id(AV1_COMMON *const cm, int mi_offset, + int x_mis, int y_mis) { + return cm->last_frame_seg_map ? dec_get_segment_id(cm, cm->last_frame_seg_map, + mi_offset, x_mis, y_mis) + : 0; +} + +static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd, + int preskip, aom_reader *r) { + struct segmentation *const seg = &cm->seg; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int mi_offset = mi_row * mi_params->mi_cols + mi_col; + const int bw = mi_size_wide[mbmi->sb_type]; + const int bh = mi_size_high[mbmi->sb_type]; + + // TODO(slavarnway): move x_mis, y_mis into xd ????? 
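+  // Clamp the block extent to the frame boundary so that segment ids are + // only written for mi units that actually exist.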
+ const int x_mis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int y_mis = AOMMIN(mi_params->mi_rows - mi_row, bh); + + if (!seg->enabled) return 0; // Default for disabled segmentation + + if (!seg->update_map) { + copy_segment_id(mi_params, cm->last_frame_seg_map, cm->cur_frame->seg_map, + mi_offset, x_mis, y_mis); + return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis); + } + + int segment_id; + if (preskip) { + if (!seg->segid_preskip) return 0; + } else { + if (mbmi->skip) { + if (seg->temporal_update) { + mbmi->seg_id_predicted = 0; + } + segment_id = read_segment_id(cm, xd, r, 1); + set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); + return segment_id; + } + } + + if (seg->temporal_update) { + const int ctx = av1_get_pred_context_seg_id(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + struct segmentation_probs *const segp = &ec_ctx->seg; + aom_cdf_prob *pred_cdf = segp->pred_cdf[ctx]; + mbmi->seg_id_predicted = aom_read_symbol(r, pred_cdf, 2, ACCT_STR); + if (mbmi->seg_id_predicted) { + segment_id = get_predicted_segment_id(cm, mi_offset, x_mis, y_mis); + } else { + segment_id = read_segment_id(cm, xd, r, 0); + } + } else { + segment_id = read_segment_id(cm, xd, r, 0); + } + set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id); + return segment_id; +} + +static int read_skip_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, + aom_reader *r) { + if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0; + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 0; + } + + if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return 0; + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + // These features imply single-reference mode, while skip mode implies + // compound reference. Hence, the two are mutually exclusive. + // In other words, skip_mode is implicitly 0 here. + return 0; + } + + const int ctx = av1_get_skip_mode_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int skip_mode = + aom_read_symbol(r, ec_ctx->skip_mode_cdfs[ctx], 2, ACCT_STR); + return skip_mode; +} + +static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id, + aom_reader *r) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int ctx = av1_get_skip_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int skip = aom_read_symbol(r, ec_ctx->skip_cdfs[ctx], 2, ACCT_STR); + return skip; + } +} + +// Merge the sorted list of cached colors(cached_colors[0...n_cached_colors-1]) +// and the sorted list of transmitted colors(colors[n_cached_colors...n-1]) into +// one single sorted list(colors[...]). 
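+// Both inputs must already be sorted in increasing order; the merge is done +// in place, filling colors[0...n_colors-1] front to back, which is safe +// because the write index can never overtake the read index of the +// transmitted list.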
+static void merge_colors(uint16_t *colors, uint16_t *cached_colors, + int n_colors, int n_cached_colors) { + if (n_cached_colors == 0) return; + int cache_idx = 0, trans_idx = n_cached_colors; + for (int i = 0; i < n_colors; ++i) { + if (cache_idx < n_cached_colors && + (trans_idx >= n_colors || + cached_colors[cache_idx] <= colors[trans_idx])) { + colors[i] = cached_colors[cache_idx++]; + } else { + assert(trans_idx < n_colors); + colors[i] = colors[trans_idx++]; + } + } +} + +static void read_palette_colors_y(MACROBLOCKD *const xd, int bit_depth, + PALETTE_MODE_INFO *const pmi, aom_reader *r) { + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + uint16_t cached_colors[PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + const int n = pmi->palette_size[0]; + int idx = 0; + for (int i = 0; i < n_cache && idx < n; ++i) + if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i]; + if (idx < n) { + const int n_cached_colors = idx; + pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR); + if (idx < n) { + const int min_bits = bit_depth - 3; + int bits = min_bits + aom_read_literal(r, 2, ACCT_STR); + int range = (1 << bit_depth) - pmi->palette_colors[idx - 1] - 1; + for (; idx < n; ++idx) { + assert(range >= 0); + const int delta = aom_read_literal(r, bits, ACCT_STR) + 1; + pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta, + 0, (1 << bit_depth) - 1); + range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]); + bits = AOMMIN(bits, av1_ceil_log2(range)); + } + } + merge_colors(pmi->palette_colors, cached_colors, n, n_cached_colors); + } else { + memcpy(pmi->palette_colors, cached_colors, n * sizeof(cached_colors[0])); + } +} + +static void read_palette_colors_uv(MACROBLOCKD *const xd, int bit_depth, + PALETTE_MODE_INFO *const pmi, + aom_reader *r) { + const int n = pmi->palette_size[1]; + // U channel colors. + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + uint16_t cached_colors[PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + int idx = 0; + for (int i = 0; i < n_cache && idx < n; ++i) + if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i]; + if (idx < n) { + const int n_cached_colors = idx; + idx += PALETTE_MAX_SIZE; + pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR); + if (idx < PALETTE_MAX_SIZE + n) { + const int min_bits = bit_depth - 3; + int bits = min_bits + aom_read_literal(r, 2, ACCT_STR); + int range = (1 << bit_depth) - pmi->palette_colors[idx - 1]; + for (; idx < PALETTE_MAX_SIZE + n; ++idx) { + assert(range >= 0); + const int delta = aom_read_literal(r, bits, ACCT_STR); + pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta, + 0, (1 << bit_depth) - 1); + range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]); + bits = AOMMIN(bits, av1_ceil_log2(range)); + } + } + merge_colors(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, n, + n_cached_colors); + } else { + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, + n * sizeof(cached_colors[0])); + } + + // V channel colors. + if (aom_read_bit(r, ACCT_STR)) { // Delta encoding. 
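+    // V colors are coded as signed deltas from the previous V color, with + // wrap-around arithmetic modulo 2^bit_depth.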
+ const int min_bits_v = bit_depth - 4; + const int max_val = 1 << bit_depth; + int bits = min_bits_v + aom_read_literal(r, 2, ACCT_STR); + pmi->palette_colors[2 * PALETTE_MAX_SIZE] = + aom_read_literal(r, bit_depth, ACCT_STR); + for (int i = 1; i < n; ++i) { + int delta = aom_read_literal(r, bits, ACCT_STR); + if (delta && aom_read_bit(r, ACCT_STR)) delta = -delta; + int val = (int)pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1] + delta; + if (val < 0) val += max_val; + if (val >= max_val) val -= max_val; + pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = val; + } + } else { + for (int i = 0; i < n; ++i) { + pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = + aom_read_literal(r, bit_depth, ACCT_STR); + } + } +} + +static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *r) { + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize)); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + + if (mbmi->mode == DC_PRED) { + const int palette_mode_ctx = av1_get_palette_mode_ctx(xd); + const int modev = aom_read_symbol( + r, xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_mode_ctx], 2, + ACCT_STR); + if (modev) { + pmi->palette_size[0] = + aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx], + PALETTE_SIZES, ACCT_STR) + + 2; + read_palette_colors_y(xd, cm->seq_params.bit_depth, pmi, r); + } + } + if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref) { + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + const int modev = aom_read_symbol( + r, xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2, ACCT_STR); + if (modev) { + pmi->palette_size[1] = + aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], + PALETTE_SIZES, ACCT_STR) + + 2; + read_palette_colors_uv(xd, cm->seq_params.bit_depth, pmi, r); + } + } +} + +static int read_angle_delta(aom_reader *r, aom_cdf_prob *cdf) { + const int sym = aom_read_symbol(r, cdf, 2 * MAX_ANGLE_DELTA + 1, ACCT_STR); + return sym - MAX_ANGLE_DELTA; +} + +static void read_filter_intra_mode_info(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *r) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + FILTER_INTRA_MODE_INFO *filter_intra_mode_info = + &mbmi->filter_intra_mode_info; + + if (av1_filter_intra_allowed(cm, mbmi)) { + filter_intra_mode_info->use_filter_intra = aom_read_symbol( + r, xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2, ACCT_STR); + if (filter_intra_mode_info->use_filter_intra) { + filter_intra_mode_info->filter_intra_mode = aom_read_symbol( + r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR); + } + } else { + filter_intra_mode_info->use_filter_intra = 0; + } +} + +void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, + int blk_col, TX_SIZE tx_size, aom_reader *r) { + MB_MODE_INFO *mbmi = xd->mi[0]; + uint8_t *tx_type = + &xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; + *tx_type = DCT_DCT; + + // No need to read transform type if block is skipped. + if (mbmi->skip || segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) + return; + + // No need to read transform type for lossless mode(qindex==0). 
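+  // (xd->qindex[] already folds in any segment-level delta q, so zero here + // means the block is lossless and keeps the DCT_DCT default set above.)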
+ const int qindex = xd->qindex[mbmi->segment_id]; + if (qindex == 0) return; + + const int inter_block = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, inter_block, cm->features.reduced_tx_set_used) > + 1) { + const TxSetType tx_set_type = av1_get_ext_tx_set_type( + tx_size, inter_block, cm->features.reduced_tx_set_used); + const int eset = + get_ext_tx_set(tx_size, inter_block, cm->features.reduced_tx_set_used); + // eset == 0 should correspond to a set with only DCT_DCT and + // there is no need to read the tx_type + assert(eset != 0); + + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + if (inter_block) { + *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( + r, ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], + av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; + } else { + const PREDICTION_MODE intra_mode = + mbmi->filter_intra_mode_info.use_filter_intra + ? fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode] + : mbmi->mode; + *tx_type = av1_ext_tx_inv[tx_set_type][aom_read_symbol( + r, ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_mode], + av1_num_ext_tx_set[tx_set_type], ACCT_STR)]; + } + } +} + +static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref, + nmv_context *ctx, MvSubpelPrecision precision); + +static INLINE int is_mv_valid(const MV *mv); + +static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv, + const int_mv *ref_mv, int mi_row, int mi_col, + BLOCK_SIZE bsize, aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + read_mv(r, &mv->as_mv, &ref_mv->as_mv, &ec_ctx->ndvc, MV_SUBPEL_NONE); + // DV should not have sub-pel. + assert((mv->as_mv.col & 7) == 0); + assert((mv->as_mv.row & 7) == 0); + mv->as_mv.col = (mv->as_mv.col >> 3) * 8; + mv->as_mv.row = (mv->as_mv.row >> 3) * 8; + int valid = is_mv_valid(&mv->as_mv) && + av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize, + cm->seq_params.mib_size_log2); + return valid; +} + +static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *r) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR); + if (mbmi->use_intrabc) { + BLOCK_SIZE bsize = mbmi->sb_type; + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); + mbmi->motion_mode = SIMPLE_TRANSLATION; + + int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES]; + int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES]; + + av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, xd->ref_mv_count, + xd->ref_mv_stack, xd->weight, ref_mvs, /*global_mvs=*/NULL, + inter_mode_ctx); + + int_mv nearestmv, nearmv; + + av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0); + int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv; + if (dv_ref.as_int == 0) + av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params.mib_size, xd->mi_row); + // Ref DV should not have sub-pel. + int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0; + dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8; + dv_ref.as_mv.row = (dv_ref.as_mv.row >> 3) * 8; + valid_dv = valid_dv && assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, xd->mi_row, + xd->mi_col, bsize, r); + if (!valid_dv) { + // Intra bc motion vectors are not valid - signal corrupt frame + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid intrabc dv"); + } + } +} + +// If delta q is present, reads delta_q index. 
+// Also reads delta_q loop filter levels, if present. +static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *r) { + DeltaQInfo *const delta_q_info = &cm->delta_q_info; + + if (delta_q_info->delta_q_present_flag) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + xd->current_qindex += + read_delta_qindex(cm, xd, r, mbmi) * delta_q_info->delta_q_res; + /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */ + xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ); + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + if (delta_q_info->delta_lf_present_flag) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + const int tmp_lvl = + xd->delta_lf[lf_id] + + read_delta_lflevel(cm, r, ec_ctx->delta_lf_multi_cdf[lf_id], mbmi, + mi_col, mi_row) * + delta_q_info->delta_lf_res; + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id] = + clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + } + } else { + const int tmp_lvl = xd->delta_lf_from_base + + read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf, + mbmi, mi_col, mi_row) * + delta_q_info->delta_lf_res; + mbmi->delta_lf_from_base = xd->delta_lf_from_base = + clamp(tmp_lvl, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + } + } + } +} + +static void read_intra_frame_mode_info(AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *r) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO *above_mi = xd->above_mbmi; + const MB_MODE_INFO *left_mi = xd->left_mbmi; + const BLOCK_SIZE bsize = mbmi->sb_type; + struct segmentation *const seg = &cm->seg; + + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (seg->segid_preskip) + mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, 0); + + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); + + if (!seg->segid_preskip) + mbmi->segment_id = read_intra_segment_id(cm, xd, bsize, r, mbmi->skip); + + read_cdef(cm, r, xd); + + read_delta_q_params(cm, xd, r); + + mbmi->current_qindex = xd->current_qindex; + + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + xd->above_txfm_context = cm->above_contexts.txfm[xd->tile.tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + if (av1_allow_intrabc(cm)) { + read_intrabc_info(cm, xd, r); + if (is_intrabc_block(mbmi)) return; + } + + mbmi->mode = read_intra_mode(r, get_y_mode_cdf(ec_ctx, above_mi, left_mi)); + + const int use_angle_delta = av1_use_angle_delta(bsize); + mbmi->angle_delta[PLANE_TYPE_Y] = + (use_angle_delta && av1_is_directional_mode(mbmi->mode)) + ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) + : 0; + + if (!cm->seq_params.monochrome && xd->is_chroma_ref) { + mbmi->uv_mode = + read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); + if (mbmi->uv_mode == UV_CFL_PRED) { + mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs); + } + mbmi->angle_delta[PLANE_TYPE_UV] = + (use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode))) + ? 
read_angle_delta(r, + ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED]) + : 0; + } else { + // Avoid decoding angle_info if there is no chroma prediction + mbmi->uv_mode = UV_DC_PRED; + } + xd->cfl.store_y = store_cfl_required(cm, xd); + + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) + read_palette_mode_info(cm, xd, r); + + read_filter_intra_mode_info(cm, xd, r); +} + +static int read_mv_component(aom_reader *r, nmv_component *mvcomp, + int use_subpel, int usehp) { + int mag, d, fr, hp; + const int sign = aom_read_symbol(r, mvcomp->sign_cdf, 2, ACCT_STR); + const int mv_class = + aom_read_symbol(r, mvcomp->classes_cdf, MV_CLASSES, ACCT_STR); + const int class0 = mv_class == MV_CLASS_0; + + // Integer part + if (class0) { + d = aom_read_symbol(r, mvcomp->class0_cdf, CLASS0_SIZE, ACCT_STR); + mag = 0; + } else { + const int n = mv_class + CLASS0_BITS - 1; // number of bits + d = 0; + for (int i = 0; i < n; ++i) + d |= aom_read_symbol(r, mvcomp->bits_cdf[i], 2, ACCT_STR) << i; + mag = CLASS0_SIZE << (mv_class + 2); + } + + if (use_subpel) { + // Fractional part + fr = aom_read_symbol(r, class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, + MV_FP_SIZE, ACCT_STR); + + // High precision part (if hp is not used, the default value is 1) + hp = usehp ? aom_read_symbol( + r, class0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2, + ACCT_STR) + : 1; + } else { + fr = 3; + hp = 1; + } + + // Result + mag += ((d << 3) | (fr << 1) | hp) + 1; + return sign ? -mag : mag; +} + +static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref, + nmv_context *ctx, MvSubpelPrecision precision) { + MV diff = kZeroMv; + const MV_JOINT_TYPE joint_type = + (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joints_cdf, MV_JOINTS, ACCT_STR); + + if (mv_joint_vertical(joint_type)) + diff.row = read_mv_component(r, &ctx->comps[0], precision > MV_SUBPEL_NONE, + precision > MV_SUBPEL_LOW_PRECISION); + + if (mv_joint_horizontal(joint_type)) + diff.col = read_mv_component(r, &ctx->comps[1], precision > MV_SUBPEL_NONE, + precision > MV_SUBPEL_LOW_PRECISION); + + mv->row = ref->row + diff.row; + mv->col = ref->col + diff.col; +} + +static REFERENCE_MODE read_block_reference_mode(AV1_COMMON *cm, + const MACROBLOCKD *xd, + aom_reader *r) { + if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return SINGLE_REFERENCE; + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + const int ctx = av1_get_reference_mode_context(xd); + const REFERENCE_MODE mode = (REFERENCE_MODE)aom_read_symbol( + r, xd->tile_ctx->comp_inter_cdf[ctx], 2, ACCT_STR); + return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE + } else { + assert(cm->current_frame.reference_mode == SINGLE_REFERENCE); + return cm->current_frame.reference_mode; + } +} + +#define READ_REF_BIT(pname) \ + aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR) + +static COMP_REFERENCE_TYPE read_comp_reference_type(const MACROBLOCKD *xd, + aom_reader *r) { + const int ctx = av1_get_comp_reference_type_context(xd); + const COMP_REFERENCE_TYPE comp_ref_type = + (COMP_REFERENCE_TYPE)aom_read_symbol( + r, xd->tile_ctx->comp_ref_type_cdf[ctx], 2, ACCT_STR); + return comp_ref_type; // UNIDIR_COMP_REFERENCE or BIDIR_COMP_REFERENCE +} + +static void set_ref_frames_for_skip_mode(AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref_frame[2]) { + ref_frame[0] = LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_0; + ref_frame[1] = LAST_FRAME + cm->current_frame.skip_mode_info.ref_frame_idx_1; +} + +// Read the reference frame +static void
read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *r, int segment_id, + MV_REFERENCE_FRAME ref_frame[2]) { + if (xd->mi[0]->skip_mode) { + set_ref_frames_for_skip_mode(cm, ref_frame); + return; + } + + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, + SEG_LVL_REF_FRAME); + ref_frame[1] = NONE_FRAME; + } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = NONE_FRAME; + } else { + const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); + + if (mode == COMPOUND_REFERENCE) { + const COMP_REFERENCE_TYPE comp_ref_type = read_comp_reference_type(xd, r); + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = READ_REF_BIT(uni_comp_ref_p); + if (bit) { + ref_frame[0] = BWDREF_FRAME; + ref_frame[1] = ALTREF_FRAME; + } else { + const int bit1 = READ_REF_BIT(uni_comp_ref_p1); + if (bit1) { + const int bit2 = READ_REF_BIT(uni_comp_ref_p2); + if (bit2) { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = GOLDEN_FRAME; + } else { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = LAST3_FRAME; + } + } else { + ref_frame[0] = LAST_FRAME; + ref_frame[1] = LAST2_FRAME; + } + } + + return; + } + + assert(comp_ref_type == BIDIR_COMP_REFERENCE); + + const int idx = 1; + const int bit = READ_REF_BIT(comp_ref_p); + // Decode forward references. + if (!bit) { + const int bit1 = READ_REF_BIT(comp_ref_p1); + ref_frame[!idx] = bit1 ? LAST2_FRAME : LAST_FRAME; + } else { + const int bit2 = READ_REF_BIT(comp_ref_p2); + ref_frame[!idx] = bit2 ? GOLDEN_FRAME : LAST3_FRAME; + } + + // Decode backward references. + const int bit_bwd = READ_REF_BIT(comp_bwdref_p); + if (!bit_bwd) { + const int bit1_bwd = READ_REF_BIT(comp_bwdref_p1); + ref_frame[idx] = bit1_bwd ? ALTREF2_FRAME : BWDREF_FRAME; + } else { + ref_frame[idx] = ALTREF_FRAME; + } + } else if (mode == SINGLE_REFERENCE) { + const int bit0 = READ_REF_BIT(single_ref_p1); + if (bit0) { + const int bit1 = READ_REF_BIT(single_ref_p2); + if (!bit1) { + const int bit5 = READ_REF_BIT(single_ref_p6); + ref_frame[0] = bit5 ? ALTREF2_FRAME : BWDREF_FRAME; + } else { + ref_frame[0] = ALTREF_FRAME; + } + } else { + const int bit2 = READ_REF_BIT(single_ref_p3); + if (bit2) { + const int bit4 = READ_REF_BIT(single_ref_p5); + ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME; + } else { + const int bit3 = READ_REF_BIT(single_ref_p4); + ref_frame[0] = bit3 ? 
LAST2_FRAME : LAST_FRAME; + } + } + + ref_frame[1] = NONE_FRAME; + } else { + assert(0 && "Invalid prediction mode."); + } + } +} + +static INLINE void read_mb_interp_filter(const MACROBLOCKD *const xd, + InterpFilter interp_filter, + bool enable_dual_filter, + MB_MODE_INFO *const mbmi, + aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!av1_is_interp_needed(xd)) { + set_default_interp_filters(mbmi, interp_filter); + return; + } + + if (interp_filter != SWITCHABLE) { + mbmi->interp_filters = av1_broadcast_interp_filter(interp_filter); + } else { + InterpFilter ref0_filter[2] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }; + for (int dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + ref0_filter[dir] = (InterpFilter)aom_read_symbol( + r, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS, ACCT_STR); + if (!enable_dual_filter) { + ref0_filter[1] = ref0_filter[0]; + break; + } + } + // The index system works as: (0, 1) -> (vertical, horizontal) filter types + mbmi->interp_filters.as_filters.x_filter = ref0_filter[1]; + mbmi->interp_filters.as_filters.y_filter = ref0_filter[0]; + } +} + +static void read_intra_block_mode_info(AV1_COMMON *const cm, + MACROBLOCKD *const xd, + MB_MODE_INFO *const mbmi, + aom_reader *r) { + const BLOCK_SIZE bsize = mbmi->sb_type; + const int use_angle_delta = av1_use_angle_delta(bsize); + + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + mbmi->mode = read_intra_mode(r, ec_ctx->y_mode_cdf[size_group_lookup[bsize]]); + + mbmi->angle_delta[PLANE_TYPE_Y] = + use_angle_delta && av1_is_directional_mode(mbmi->mode) + ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]) + : 0; + if (!cm->seq_params.monochrome && xd->is_chroma_ref) { + mbmi->uv_mode = + read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode); + if (mbmi->uv_mode == UV_CFL_PRED) { + mbmi->cfl_alpha_idx = + read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs); + } + mbmi->angle_delta[PLANE_TYPE_UV] = + use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode)) + ? 
read_angle_delta(r, + ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED]) + : 0; + } else { + // Avoid decoding angle_info if there is no chroma prediction + mbmi->uv_mode = UV_DC_PRED; + } + xd->cfl.store_y = store_cfl_required(cm, xd); + + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) + read_palette_mode_info(cm, xd, r); + + read_filter_intra_mode_info(cm, xd, r); +} + +static INLINE int is_mv_valid(const MV *mv) { + return mv->row > MV_LOW && mv->row < MV_UPP && mv->col > MV_LOW && + mv->col < MV_UPP; +} + +static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd, + PREDICTION_MODE mode, + MV_REFERENCE_FRAME ref_frame[2], int_mv mv[2], + int_mv ref_mv[2], int_mv nearest_mv[2], + int_mv near_mv[2], int is_compound, int allow_hp, + aom_reader *r) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + MB_MODE_INFO *mbmi = xd->mi[0]; + BLOCK_SIZE bsize = mbmi->sb_type; + FeatureFlags *const features = &cm->features; + if (features->cur_frame_force_integer_mv) { + allow_hp = MV_SUBPEL_NONE; + } + switch (mode) { + case NEWMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); + break; + } + case NEARESTMV: { + mv[0].as_int = nearest_mv[0].as_int; + break; + } + case NEARMV: { + mv[0].as_int = near_mv[0].as_int; + break; + } + case GLOBALMV: { + mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]], + features->allow_high_precision_mv, + bsize, xd->mi_col, xd->mi_row, + features->cur_frame_force_integer_mv) + .as_int; + break; + } + case NEW_NEWMV: { + assert(is_compound); + for (int i = 0; i < 2; ++i) { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, nmvc, allow_hp); + } + break; + } + case NEAREST_NEARESTMV: { + assert(is_compound); + mv[0].as_int = nearest_mv[0].as_int; + mv[1].as_int = nearest_mv[1].as_int; + break; + } + case NEAR_NEARMV: { + assert(is_compound); + mv[0].as_int = near_mv[0].as_int; + mv[1].as_int = near_mv[1].as_int; + break; + } + case NEW_NEARESTMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); + assert(is_compound); + mv[1].as_int = nearest_mv[1].as_int; + break; + } + case NEAREST_NEWMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + mv[0].as_int = nearest_mv[0].as_int; + read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp); + assert(is_compound); + break; + } + case NEAR_NEWMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + mv[0].as_int = near_mv[0].as_int; + read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp); + assert(is_compound); + break; + } + case NEW_NEARMV: { + nmv_context *const nmvc = &ec_ctx->nmvc; + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp); + assert(is_compound); + mv[1].as_int = near_mv[1].as_int; + break; + } + case GLOBAL_GLOBALMV: { + assert(is_compound); + mv[0].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[0]], + features->allow_high_precision_mv, + bsize, xd->mi_col, xd->mi_row, + features->cur_frame_force_integer_mv) + .as_int; + mv[1].as_int = gm_get_motion_vector(&cm->global_motion[ref_frame[1]], + features->allow_high_precision_mv, + bsize, xd->mi_col, xd->mi_row, + features->cur_frame_force_integer_mv) + .as_int; + break; + } + default: { return 0; } + } + + int ret = is_mv_valid(&mv[0].as_mv); + if (is_compound) { + ret = ret && is_mv_valid(&mv[1].as_mv); + } + return ret; +} + +static int read_is_inter_block(AV1_COMMON
*const cm, MACROBLOCKD *const xd, + int segment_id, aom_reader *r) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + const int frame = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (frame < LAST_FRAME) return 0; + return frame != INTRA_FRAME; + } + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + return 1; + } + const int ctx = av1_get_intra_inter_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const int is_inter = + aom_read_symbol(r, ec_ctx->intra_inter_cdf[ctx], 2, ACCT_STR); + return is_inter; +} + +#if DEC_MISMATCH_DEBUG +static void dec_dump_logs(AV1_COMMON *cm, MB_MODE_INFO *const mbmi, int mi_row, + int mi_col, int16_t mode_ctx) { + int_mv mv[2] = { { 0 } }; + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) + mv[ref].as_mv = mbmi->mv[ref].as_mv; + + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; + int16_t zeromv_ctx = -1; + int16_t refmv_ctx = -1; + if (mbmi->mode != NEWMV) { + zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mbmi->mode != GLOBALMV) + refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + } + +#define FRAME_TO_CHECK 11 + if (cm->current_frame.frame_number == FRAME_TO_CHECK && cm->show_frame == 1) { + printf( + "=== DECODER ===: " + "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, " + "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " + "ref[1]=%d, motion_mode=%d, mode_ctx=%d, " + "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n", + cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode, + mbmi->mode, mbmi->sb_type, cm->show_frame, mv[0].as_mv.row, + mv[0].as_mv.col, mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0], + mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, zeromv_ctx, + refmv_ctx, mbmi->tx_size); + } +} +#endif // DEC_MISMATCH_DEBUG + +static void read_inter_block_mode_info(AV1Decoder *const pbi, + MACROBLOCKD *const xd, + MB_MODE_INFO *const mbmi, + aom_reader *r) { + AV1_COMMON *const cm = &pbi->common; + FeatureFlags *const features = &cm->features; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int allow_hp = features->allow_high_precision_mv; + int_mv nearestmv[2], nearmv[2]; + int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } }; + int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES]; + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + mbmi->uv_mode = UV_DC_PRED; + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + + av1_collect_neighbors_ref_counts(xd); + + read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame); + const int is_compound = has_second_ref(mbmi); + + const MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame); + av1_find_mv_refs(cm, xd, mbmi, ref_frame, xd->ref_mv_count, xd->ref_mv_stack, + xd->weight, ref_mvs, /*global_mvs=*/NULL, inter_mode_ctx); + + mbmi->ref_mv_idx = 0; + + if (mbmi->skip_mode) { + assert(is_compound); + mbmi->mode = NEAREST_NEARESTMV; + } else { + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_GLOBALMV)) { + mbmi->mode = GLOBALMV; + } else { + const int mode_ctx = + av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame); + if (is_compound) + mbmi->mode = read_inter_compound_mode(xd, r, mode_ctx); + else + mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx); + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV || + 
have_nearmv_in_inter_mode(mbmi->mode)) + read_drl_idx(ec_ctx, xd, mbmi, r); + } + } + + if (is_compound != is_inter_compound_mode(mbmi->mode)) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Prediction mode %d invalid with ref frame %d %d", + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } + + if (!is_compound && mbmi->mode != GLOBALMV) { + av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[0]], &nearestmv[0], + &nearmv[0], features->cur_frame_force_integer_mv); + } + + if (is_compound && mbmi->mode != GLOBAL_GLOBALMV) { + const int ref_mv_idx = mbmi->ref_mv_idx + 1; + nearestmv[0] = xd->ref_mv_stack[ref_frame][0].this_mv; + nearestmv[1] = xd->ref_mv_stack[ref_frame][0].comp_mv; + nearmv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv; + nearmv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv; + lower_mv_precision(&nearestmv[0].as_mv, allow_hp, + features->cur_frame_force_integer_mv); + lower_mv_precision(&nearestmv[1].as_mv, allow_hp, + features->cur_frame_force_integer_mv); + lower_mv_precision(&nearmv[0].as_mv, allow_hp, + features->cur_frame_force_integer_mv); + lower_mv_precision(&nearmv[1].as_mv, allow_hp, + features->cur_frame_force_integer_mv); + } else if (mbmi->ref_mv_idx > 0 && mbmi->mode == NEARMV) { + nearmv[0] = + xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv; + } + + int_mv ref_mv[2] = { nearestmv[0], nearestmv[1] }; + + if (is_compound) { + int ref_mv_idx = mbmi->ref_mv_idx; + // Special case: NEAR_NEWMV and NEW_NEARMV modes use + // 1 + mbmi->ref_mv_idx (like NEARMV) instead of + // mbmi->ref_mv_idx (like NEWMV) + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) + ref_mv_idx = 1 + mbmi->ref_mv_idx; + + // TODO(jingning, yunqing): Do we need a lower_mv_precision() call here? 
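+    // For each NEWMV component of a compound mode, replace the default + // (nearest) predictor with the DRL stack entry selected above.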
+ if (compound_ref0_mode(mbmi->mode) == NEWMV) + ref_mv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv; + + if (compound_ref1_mode(mbmi->mode) == NEWMV) + ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv; + } else { + if (mbmi->mode == NEWMV) { + if (xd->ref_mv_count[ref_frame] > 1) + ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv; + } + } + + if (mbmi->skip_mode) assert(mbmi->mode == NEAREST_NEARESTMV); + + const int mv_corrupted_flag = + !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv, + nearestmv, nearmv, is_compound, allow_hp, r); + aom_merge_corrupted_flag(&xd->corrupted, mv_corrupted_flag); + + mbmi->use_wedge_interintra = 0; + if (cm->seq_params.enable_interintra_compound && !mbmi->skip_mode && + is_interintra_allowed(mbmi)) { + const int bsize_group = size_group_lookup[bsize]; + const int interintra = + aom_read_symbol(r, ec_ctx->interintra_cdf[bsize_group], 2, ACCT_STR); + assert(mbmi->ref_frame[1] == NONE_FRAME); + if (interintra) { + const INTERINTRA_MODE interintra_mode = + read_interintra_mode(xd, r, bsize_group); + mbmi->ref_frame[1] = INTRA_FRAME; + mbmi->interintra_mode = interintra_mode; + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + if (av1_is_wedge_used(bsize)) { + mbmi->use_wedge_interintra = aom_read_symbol( + r, ec_ctx->wedge_interintra_cdf[bsize], 2, ACCT_STR); + if (mbmi->use_wedge_interintra) { + mbmi->interintra_wedge_index = (int8_t)aom_read_symbol( + r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR); + } + } + } + } + + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; + xd->block_ref_scale_factors[ref] = get_ref_scale_factors_const(cm, frame); + } + + mbmi->motion_mode = SIMPLE_TRANSLATION; + if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode && + !has_second_ref(mbmi)) { + mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref); + } + av1_count_overlappable_neighbors(cm, xd); + + if (mbmi->ref_frame[1] != INTRA_FRAME) + mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r); + + // init + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + + if (has_second_ref(mbmi) && !mbmi->skip_mode) { + // Read idx to indicate current compound inter prediction mode group + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + + if (masked_compound_used) { + const int ctx_comp_group_idx = get_comp_group_idx_context(xd); + mbmi->comp_group_idx = (uint8_t)aom_read_symbol( + r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_STR); + } + + if (mbmi->comp_group_idx == 0) { + if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) { + const int comp_index_ctx = get_comp_index_context(cm, xd); + mbmi->compound_idx = (uint8_t)aom_read_symbol( + r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR); + mbmi->interinter_comp.type = + mbmi->compound_idx ? 
COMPOUND_AVERAGE : COMPOUND_DISTWTD; + } else { + // Distance-weighted compound is disabled, so always use average + mbmi->compound_idx = 1; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + } + } else { + assert(cm->current_frame.reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + assert(masked_compound_used); + + // compound_diffwtd, wedge + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { + mbmi->interinter_comp.type = + COMPOUND_WEDGE + aom_read_symbol(r, + ec_ctx->compound_type_cdf[bsize], + MASKED_COMPOUND_TYPES, ACCT_STR); + } else { + mbmi->interinter_comp.type = COMPOUND_DIFFWTD; + } + + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + mbmi->interinter_comp.wedge_index = (int8_t)aom_read_symbol( + r, ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES, ACCT_STR); + mbmi->interinter_comp.wedge_sign = (int8_t)aom_read_bit(r, ACCT_STR); + } else { + assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + mbmi->interinter_comp.mask_type = + aom_read_literal(r, MAX_DIFFWTD_MASK_BITS, ACCT_STR); + } + } + } + + read_mb_interp_filter(xd, features->interp_filter, + cm->seq_params.enable_dual_filter, mbmi, r); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + if (mbmi->motion_mode == WARPED_CAUSAL) { + mbmi->wm_params.wmtype = DEFAULT_WMTYPE; + mbmi->wm_params.invalid = 0; + + if (mbmi->num_proj_ref > 1) { + mbmi->num_proj_ref = av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); + } + + if (av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, + mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, + &mbmi->wm_params, mi_row, mi_col)) { +#if WARPED_MOTION_DEBUG + printf("Warning: unexpected warped model from aomenc\n"); +#endif + mbmi->wm_params.invalid = 1; + } + } + + xd->cfl.store_y = store_cfl_required(cm, xd); + +#if DEC_MISMATCH_DEBUG + dec_dump_logs(cm, mi, mi_row, mi_col, mode_ctx); +#endif // DEC_MISMATCH_DEBUG +} + +static void read_inter_frame_mode_info(AV1Decoder *const pbi, + MACROBLOCKD *const xd, aom_reader *r) { + AV1_COMMON *const cm = &pbi->common; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int inter_block = 1; + + mbmi->mv[0].as_int = 0; + mbmi->mv[1].as_int = 0; + mbmi->segment_id = read_inter_segment_id(cm, xd, 1, r); + + mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r); + + if (mbmi->skip_mode) + mbmi->skip = 1; + else + mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); + + if (!cm->seg.segid_preskip) + mbmi->segment_id = read_inter_segment_id(cm, xd, 0, r); + + read_cdef(cm, r, xd); + + read_delta_q_params(cm, xd, r); + + if (!mbmi->skip_mode) + inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r); + + mbmi->current_qindex = xd->current_qindex; + + xd->above_txfm_context = + cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK); + + if (inter_block) + read_inter_block_mode_info(pbi, xd, mbmi, r); + else + read_intra_block_mode_info(cm, xd, mbmi, r); +} + +static void intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row, int mi_col, + int x_mis, int y_mis) { + const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, 1); + MV_REF *frame_mvs = + cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1); + x_mis = ROUND_POWER_OF_TWO(x_mis, 1); + y_mis = ROUND_POWER_OF_TWO(y_mis, 1); + + for (int h = 0; h < y_mis; h++) { + MV_REF *mv = 
frame_mvs; + for (int w = 0; w < x_mis; w++) { + mv->ref_frame = NONE_FRAME; + mv++; + } + frame_mvs += frame_mvs_stride; + } +} + +void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, aom_reader *r, + int x_mis, int y_mis) { + AV1_COMMON *const cm = &pbi->common; + MB_MODE_INFO *const mi = xd->mi[0]; + mi->use_intrabc = 0; + + if (frame_is_intra_only(cm)) { + read_intra_frame_mode_info(cm, xd, r); + if (pbi->common.seq_params.order_hint_info.enable_ref_frame_mvs) + intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis); + } else { + read_inter_frame_mode_info(pbi, xd, r); + if (pbi->common.seq_params.order_hint_info.enable_ref_frame_mvs) + av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis); + } +} diff --git a/libs/libaom/src/av1/decoder/decodemv.h b/libs/libaom/src/av1/decoder/decodemv.h new file mode 100644 index 000000000..289e66ae1 --- /dev/null +++ b/libs/libaom/src/av1/decoder/decodemv.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DECODEMV_H_ +#define AOM_AV1_DECODER_DECODEMV_H_ + +#include "aom_dsp/bitreader.h" + +#include "av1/decoder/decoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, aom_reader *r, + int x_mis, int y_mis); + +#ifdef __cplusplus +} // extern "C" +#endif + +void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row, + int blk_col, TX_SIZE tx_size, aom_reader *r); + +#endif // AOM_AV1_DECODER_DECODEMV_H_ diff --git a/libs/libaom/src/av1/decoder/decoder.c b/libs/libaom/src/av1/decoder/decoder.c new file mode 100644 index 000000000..fc5f2cd20 --- /dev/null +++ b/libs/libaom/src/av1/decoder/decoder.c @@ -0,0 +1,539 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/system_state.h" +#include "aom_ports/aom_once.h" +#include "aom_ports/aom_timer.h" +#include "aom_scale/aom_scale.h" +#include "aom_util/aom_thread.h" + +#include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/av1_loopfilter.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" + +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/decoder.h" +#include "av1/decoder/detokenize.h" +#include "av1/decoder/obu.h" + +static void initialize_dec(void) { + av1_rtcd(); + aom_dsp_rtcd(); + aom_scale_rtcd(); + av1_init_intra_predictors(); + av1_init_wedge_masks(); +} + +static void dec_set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height) { + // Ensure that the decoded width and height are both multiples of + // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if + // subsampling is used). + // This simplifies the implementation of various experiments, + // e.g. cdef, which operates on units of 8x8 luma pixels. + const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); + const int aligned_height = ALIGN_POWER_OF_TWO(height, 3); + + mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2; + mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2; + mi_params->mi_stride = calc_mi_size(mi_params->mi_cols); + + mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2; + mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2; + mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols; + + mi_params->mi_alloc_bsize = BLOCK_4X4; + mi_params->mi_alloc_stride = mi_params->mi_stride; + + assert(mi_size_wide[mi_params->mi_alloc_bsize] == + mi_size_high[mi_params->mi_alloc_bsize]); + +#if CONFIG_LPF_MASK + av1_alloc_loop_filter_mask(mi_params); +#endif +} + +static void dec_setup_mi(CommonModeInfoParams *mi_params) { + const int mi_grid_size = + mi_params->mi_stride * calc_mi_size(mi_params->mi_rows); + memset(mi_params->mi_grid_base, 0, + mi_grid_size * sizeof(*mi_params->mi_grid_base)); +} + +static void dec_free_mi(CommonModeInfoParams *mi_params) { + aom_free(mi_params->mi_alloc); + mi_params->mi_alloc = NULL; + aom_free(mi_params->mi_grid_base); + mi_params->mi_grid_base = NULL; + mi_params->mi_alloc_size = 0; + aom_free(mi_params->tx_type_map); + mi_params->tx_type_map = NULL; +} + +AV1Decoder *av1_decoder_create(BufferPool *const pool) { + AV1Decoder *volatile const pbi = aom_memalign(32, sizeof(*pbi)); + if (!pbi) return NULL; + av1_zero(*pbi); + + AV1_COMMON *volatile const cm = &pbi->common; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(cm->error.jmp)) { + cm->error.setjmp = 0; + av1_decoder_remove(pbi); + return NULL; + } + + cm->error.setjmp = 1; + + CHECK_MEM_ERROR(cm, cm->fc, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc))); + CHECK_MEM_ERROR( + cm, cm->default_frame_context, + (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context))); + memset(cm->fc, 0, sizeof(*cm->fc)); + memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context)); + + pbi->need_resync = 1; + aom_once(initialize_dec); + + // Initialize the references to not point to any frame buffers.
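+  // A NULL entry means the slot holds no buffer; slots are populated as + // decoded frames are assigned via refresh_frame_flags.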
+ for (int i = 0; i < REF_FRAMES; i++) { + cm->ref_frame_map[i] = NULL; + } + + cm->current_frame.frame_number = 0; + pbi->decoding_first_frame = 1; + pbi->common.buffer_pool = pool; + + cm->seq_params.bit_depth = AOM_BITS_8; + + cm->mi_params.free_mi = dec_free_mi; + cm->mi_params.setup_mi = dec_setup_mi; + cm->mi_params.set_mb_mi = dec_set_mb_mi; + + av1_loop_filter_init(cm); + + av1_qm_init(&cm->quant_params, av1_num_planes(cm)); + av1_loop_restoration_precal(); +#if CONFIG_ACCOUNTING + pbi->acct_enabled = 1; + aom_accounting_init(&pbi->accounting); +#endif + + cm->error.setjmp = 0; + + aom_get_worker_interface()->init(&pbi->lf_worker); + pbi->lf_worker.thread_name = "aom lf worker"; + + return pbi; +} + +void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) { + if (tile_mt_info != NULL) { +#if CONFIG_MULTITHREAD + if (tile_mt_info->job_mutex != NULL) { + pthread_mutex_destroy(tile_mt_info->job_mutex); + aom_free(tile_mt_info->job_mutex); + } +#endif + aom_free(tile_mt_info->job_queue); + // clear the structure as the source of this call may be a resize in which + // case this call will be followed by an _alloc() which may fail. + av1_zero(*tile_mt_info); + } +} + +void av1_dec_free_cb_buf(AV1Decoder *pbi) { + aom_free(pbi->cb_buffer_base); + pbi->cb_buffer_base = NULL; + pbi->cb_buffer_alloc_size = 0; +} + +void av1_decoder_remove(AV1Decoder *pbi) { + int i; + + if (!pbi) return; + + // Free the tile list output buffer. + aom_free_frame_buffer(&pbi->tile_list_outbuf); + + aom_get_worker_interface()->end(&pbi->lf_worker); + aom_free(pbi->lf_worker.data1); + + if (pbi->thread_data) { + for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) { + DecWorkerData *const thread_data = pbi->thread_data + worker_idx; + av1_free_mc_tmp_buf(thread_data->td); + aom_free(thread_data->td); + } + aom_free(pbi->thread_data); + } + + for (i = 0; i < pbi->num_workers; ++i) { + AVxWorker *const worker = &pbi->tile_workers[i]; + aom_get_worker_interface()->end(worker); + } +#if CONFIG_MULTITHREAD + if (pbi->row_mt_mutex_ != NULL) { + pthread_mutex_destroy(pbi->row_mt_mutex_); + aom_free(pbi->row_mt_mutex_); + } + if (pbi->row_mt_cond_ != NULL) { + pthread_cond_destroy(pbi->row_mt_cond_); + aom_free(pbi->row_mt_cond_); + } +#endif + for (i = 0; i < pbi->allocated_tiles; i++) { + TileDataDec *const tile_data = pbi->tile_data + i; + av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync); + } + aom_free(pbi->tile_data); + aom_free(pbi->tile_workers); + + if (pbi->num_workers > 0) { + av1_loop_filter_dealloc(&pbi->lf_row_sync); + av1_loop_restoration_dealloc(&pbi->lr_row_sync, pbi->num_workers); + av1_dealloc_dec_jobs(&pbi->tile_mt_info); + } + + av1_dec_free_cb_buf(pbi); +#if CONFIG_ACCOUNTING + aom_accounting_clear(&pbi->accounting); +#endif + av1_free_mc_tmp_buf(&pbi->td); + aom_img_metadata_array_free(pbi->metadata); + aom_free(pbi); +} + +void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, + aom_reader *r, palette_visitor_fn_t visit) { + if (!is_inter_block(xd->mi[0])) { + for (int plane = 0; plane < AOMMIN(2, av1_num_planes(&pbi->common)); + ++plane) { + if (plane == 0 || xd->is_chroma_ref) { + if (xd->mi[0]->palette_mode_info.palette_size[plane]) + visit(xd, plane, r); + } else { + assert(xd->mi[0]->palette_mode_info.palette_size[plane] == 0); + } + } + } +} + +static int equal_dimensions(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width 
== b->uv_width; +} + +aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx, + YV12_BUFFER_CONFIG *sd) { + AV1_COMMON *cm = &pbi->common; + const int num_planes = av1_num_planes(cm); + + const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, idx); + if (cfg == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame"); + return AOM_CODEC_ERROR; + } + if (!equal_dimensions(cfg, sd)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(cfg, sd, num_planes); + + return cm->error.error_code; +} + +static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width && + a->y_stride == b->y_stride && a->uv_stride == b->uv_stride && + a->border == b->border && + (a->flags & YV12_FLAG_HIGHBITDEPTH) == + (b->flags & YV12_FLAG_HIGHBITDEPTH); +} + +aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, + int use_external_ref, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + YV12_BUFFER_CONFIG *ref_buf = NULL; + + // Get the destination reference buffer. + ref_buf = get_ref_frame(cm, idx); + + if (ref_buf == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame"); + return AOM_CODEC_ERROR; + } + + if (!use_external_ref) { + if (!equal_dimensions(ref_buf, sd)) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else { + // Overwrite the reference frame buffer. + aom_yv12_copy_frame(sd, ref_buf, num_planes); + } + } else { + if (!equal_dimensions_and_border(ref_buf, sd)) { + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + } else { + // Overwrite the reference frame buffer pointers. + // Once we no longer need the external reference buffer, these pointers + // are restored. + ref_buf->store_buf_adr[0] = ref_buf->y_buffer; + ref_buf->store_buf_adr[1] = ref_buf->u_buffer; + ref_buf->store_buf_adr[2] = ref_buf->v_buffer; + ref_buf->y_buffer = sd->y_buffer; + ref_buf->u_buffer = sd->u_buffer; + ref_buf->v_buffer = sd->v_buffer; + ref_buf->use_external_reference_buffers = 1; + } + } + + return cm->error.error_code; +} + +aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + + if (!equal_dimensions_and_border(new_frame, sd)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(new_frame, sd, num_planes); + + return cm->error.error_code; +} + +static void release_current_frame(AV1Decoder *pbi) { + AV1_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + + cm->cur_frame->buf.corrupted = 1; + lock_buffer_pool(pool); + decrease_ref_count(cm->cur_frame, pool); + unlock_buffer_pool(pool); + cm->cur_frame = NULL; +} + +// If any buffer updating is signaled it should be done here. +// Consumes a reference to cm->cur_frame. +// +// This function returns void. It reports failure by setting +// cm->error.error_code. +static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) { + int ref_index = 0, mask; + AV1_COMMON *const cm = &pbi->common; + BufferPool *const pool = cm->buffer_pool; + + if (frame_decoded) { + lock_buffer_pool(pool); + + // In ext-tile decoding, the camera frame header is only decoded once.
So, + // we don't update the references here. + if (!pbi->camera_frame_header_ready) { + // The following for loop needs to release the reference stored in + // cm->ref_frame_map[ref_index] before storing a reference to + // cm->cur_frame in cm->ref_frame_map[ref_index]. + for (mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) { + if (mask & 1) { + decrease_ref_count(cm->ref_frame_map[ref_index], pool); + cm->ref_frame_map[ref_index] = cm->cur_frame; + ++cm->cur_frame->ref_count; + } + ++ref_index; + } + } + + if (cm->show_existing_frame || cm->show_frame) { + if (pbi->output_all_layers) { + // Append this frame to the output queue + if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) { + // We can't store the new frame anywhere, so drop it and return an + // error + cm->cur_frame->buf.corrupted = 1; + decrease_ref_count(cm->cur_frame, pool); + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + } else { + pbi->output_frames[pbi->num_output_frames] = cm->cur_frame; + pbi->num_output_frames++; + } + } else { + // Replace any existing output frame + assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1); + if (pbi->num_output_frames > 0) { + decrease_ref_count(pbi->output_frames[0], pool); + } + pbi->output_frames[0] = cm->cur_frame; + pbi->num_output_frames = 1; + } + } else { + decrease_ref_count(cm->cur_frame, pool); + } + + unlock_buffer_pool(pool); + } else { + // Nothing was decoded, so just drop this frame buffer + lock_buffer_pool(pool); + decrease_ref_count(cm->cur_frame, pool); + unlock_buffer_pool(pool); + } + cm->cur_frame = NULL; + + if (!pbi->camera_frame_header_ready) { + // Invalidate these references until the next frame starts. + for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) { + cm->remapped_ref_idx[ref_index] = INVALID_IDX; + } + } +} + +int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, + const uint8_t **psource) { + AV1_COMMON *volatile const cm = &pbi->common; + const uint8_t *source = *psource; + cm->error.error_code = AOM_CODEC_OK; + cm->error.has_detail = 0; + + if (size == 0) { + // This is used to signal that we are missing frames. + // We do not know if the missing frame(s) were supposed to update + // any of the reference buffers, but we act conservatively and + // mark only the last buffer as corrupted. + // + // TODO(jkoleszar): Error concealment is undefined and non-normative + // at this point, but if it becomes so, [0] may not always be the correct + // thing to do here. + RefCntBuffer *ref_buf = get_ref_frame_buf(cm, LAST_FRAME); + if (ref_buf != NULL) ref_buf->buf.corrupted = 1; + } + + if (assign_cur_frame_new_fb(cm) == NULL) { + cm->error.error_code = AOM_CODEC_MEM_ERROR; + return 1; + } + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(cm->error.jmp)) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int i; + + cm->error.setjmp = 0; + + // Synchronize all threads immediately as a subsequent decode call may + // cause a resize invalidating some allocations.
+ winterface->sync(&pbi->lf_worker); + for (i = 0; i < pbi->num_workers; ++i) { + winterface->sync(&pbi->tile_workers[i]); + } + + release_current_frame(pbi); + aom_clear_system_state(); + return -1; + } + + cm->error.setjmp = 1; + + int frame_decoded = + aom_decode_frame_from_obus(pbi, source, source + size, psource); + + if (frame_decoded < 0) { + assert(cm->error.error_code != AOM_CODEC_OK); + release_current_frame(pbi); + cm->error.setjmp = 0; + return 1; + } + +#if TXCOEFF_TIMER + cm->cum_txcoeff_timer += cm->txcoeff_timer; + fprintf(stderr, + "txb coeff block number: %d, frame time: %ld, cum time %ld in us\n", + cm->txb_count, cm->txcoeff_timer, cm->cum_txcoeff_timer); + cm->txcoeff_timer = 0; + cm->txb_count = 0; +#endif + + // Note: At this point, this function holds a reference to cm->cur_frame + // in the buffer pool. This reference is consumed by update_frame_buffers(). + update_frame_buffers(pbi, frame_decoded); + + if (frame_decoded) { + pbi->decoding_first_frame = 0; + } + + if (cm->error.error_code != AOM_CODEC_OK) { + cm->error.setjmp = 0; + return 1; + } + + aom_clear_system_state(); + + if (!cm->show_existing_frame) { + if (cm->seg.enabled) { + if (cm->prev_frame && + (cm->mi_params.mi_rows == cm->prev_frame->mi_rows) && + (cm->mi_params.mi_cols == cm->prev_frame->mi_cols)) { + cm->last_frame_seg_map = cm->prev_frame->seg_map; + } else { + cm->last_frame_seg_map = NULL; + } + } + } + + // Update progress in frame parallel decode. + cm->error.setjmp = 0; + + return 0; +} + +// Get the frame at a particular index in the output queue +int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd, + aom_film_grain_t **grain_params) { + if (index >= pbi->num_output_frames) return -1; + *sd = &pbi->output_frames[index]->buf; + *grain_params = &pbi->output_frames[index]->film_grain_params; + aom_clear_system_state(); + return 0; +} + +// Get the highest-spatial-layer output +// TODO(david.barker): What should this do? +int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) { + if (pbi->num_output_frames == 0) return -1; + + *frame = pbi->output_frames[pbi->num_output_frames - 1]->buf; + return 0; +} diff --git a/libs/libaom/src/av1/decoder/decoder.h b/libs/libaom/src/av1/decoder/decoder.h new file mode 100644 index 000000000..4580de2ac --- /dev/null +++ b/libs/libaom/src/av1/decoder/decoder.h @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_DECODER_DECODER_H_ +#define AOM_AV1_DECODER_DECODER_H_ + +#include "config/aom_config.h" + +#include "aom/aom_codec.h" +#include "aom_dsp/bitreader.h" +#include "aom_scale/yv12config.h" +#include "aom_util/aom_thread.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/thread_common.h" +#include "av1/decoder/dthread.h" +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif +#if CONFIG_INSPECTION +#include "av1/decoder/inspection.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*decode_block_visitor_fn_t)(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, + aom_reader *const r, const int plane, + const int row, const int col, + const TX_SIZE tx_size); + +typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm, + MACROBLOCKD *const xd, + BLOCK_SIZE bsize); + +typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm, + MACROBLOCKD *const xd); + +typedef struct ThreadData { + DECLARE_ALIGNED(32, MACROBLOCKD, xd); + CB_BUFFER cb_buffer_base; + aom_reader *bit_reader; + uint8_t *mc_buf[2]; + int32_t mc_buf_size; + int mc_buf_use_highbd; // Boolean: whether the byte pointers stored in + // mc_buf were converted from highbd pointers. + + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; + + decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit; + decode_block_visitor_fn_t predict_and_recon_intra_block_visit; + decode_block_visitor_fn_t read_coeffs_tx_inter_block_visit; + decode_block_visitor_fn_t inverse_tx_inter_block_visit; + predict_inter_block_visitor_fn_t predict_inter_block_visit; + cfl_store_inter_block_visitor_fn_t cfl_store_inter_block_visit; +} ThreadData; + +typedef struct AV1DecRowMTJobInfo { + int tile_row; + int tile_col; + int mi_row; +} AV1DecRowMTJobInfo; + +typedef struct AV1DecRowMTSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + int allocated_sb_rows; + int *cur_sb_col; + int sync_range; + int mi_rows; + int mi_cols; + int mi_rows_parse_done; + int mi_rows_decode_started; + int num_threads_working; +} AV1DecRowMTSync; + +typedef struct AV1DecRowMTInfo { + int tile_rows_start; + int tile_rows_end; + int tile_cols_start; + int tile_cols_end; + int start_tile; + int end_tile; + int mi_rows_to_decode; + + // Invariant: + // mi_rows_parse_done >= mi_rows_decode_started. + // mi_rows_parse_done and mi_rows_decode_started are both initialized to 0. + // mi_rows_parse_done is incremented freely. mi_rows_decode_started may only + // be incremented to catch up with mi_rows_parse_done but is not allowed to + // surpass mi_rows_parse_done. + // + // When mi_rows_decode_started reaches mi_rows_to_decode, there are no more + // decode jobs. + + // Indicates the progress of the bit-stream parsing of superblocks. + // Initialized to 0. Incremented by sb_mi_size when parse sb row is done. + int mi_rows_parse_done; + // Indicates the progress of the decoding of superblocks. + // Initialized to 0. Incremented by sb_mi_size when decode sb row is started. + int mi_rows_decode_started; + // Boolean: Initialized to 0 (false). Set to 1 (true) on error to abort + // decoding. 
+ int row_mt_exit; +} AV1DecRowMTInfo; + +typedef struct TileDataDec { + TileInfo tile_info; + aom_reader bit_reader; + DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); + AV1DecRowMTSync dec_row_mt_sync; +} TileDataDec; + +typedef struct TileBufferDec { + const uint8_t *data; + size_t size; +} TileBufferDec; + +typedef struct DataBuffer { + const uint8_t *data; + size_t size; +} DataBuffer; + +typedef struct EXTERNAL_REFERENCES { + YV12_BUFFER_CONFIG refs[MAX_EXTERNAL_REFERENCES]; + int num; +} EXTERNAL_REFERENCES; + +typedef struct TileJobsDec { + TileBufferDec *tile_buffer; + TileDataDec *tile_data; +} TileJobsDec; + +typedef struct AV1DecTileMTData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *job_mutex; +#endif + TileJobsDec *job_queue; + int jobs_enqueued; + int jobs_dequeued; + int alloc_tile_rows; + int alloc_tile_cols; +} AV1DecTileMT; + +typedef struct AV1Decoder { + DECLARE_ALIGNED(32, MACROBLOCKD, mb); + + DECLARE_ALIGNED(32, AV1_COMMON, common); + + AVxWorker lf_worker; + AV1LfSync lf_row_sync; + AV1LrSync lr_row_sync; + AV1LrStruct lr_ctxt; + AVxWorker *tile_workers; + int num_workers; + DecWorkerData *thread_data; + ThreadData td; + TileDataDec *tile_data; + int allocated_tiles; + + TileBufferDec tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; + AV1DecTileMT tile_mt_info; + + // Each time the decoder is called, we expect to receive a full temporal unit. + // This can contain up to one shown frame per spatial layer in the current + // operating point (note that some layers may be entirely omitted). + // If the 'output_all_layers' option is true, we save all of these shown + // frames so that they can be returned to the application. If the + // 'output_all_layers' option is false, then we only output one image per + // temporal unit. + // + // Note: The saved buffers are released at the start of the next time the + // application calls aom_codec_decode(). + int output_all_layers; + RefCntBuffer *output_frames[MAX_NUM_SPATIAL_LAYERS]; + size_t num_output_frames; // How many frames are queued up so far? + + // In order to properly support random-access decoding, we need + // to behave slightly differently for the very first frame we decode. + // So we track whether this is the first frame or not. + int decoding_first_frame; + + int allow_lowbitdepth; + int max_threads; + int inv_tile_order; + int need_resync; // wait for key/intra-only frame. + int reset_decoder_state; + + int tile_size_bytes; + int tile_col_size_bytes; + int dec_tile_row, dec_tile_col; // always -1 for non-VR tile encoding +#if CONFIG_ACCOUNTING + int acct_enabled; + Accounting accounting; +#endif + int sequence_header_ready; + int sequence_header_changed; +#if CONFIG_INSPECTION + aom_inspect_cb inspect_cb; + void *inspect_ctx; +#endif + int operating_point; + int current_operating_point; + int seen_frame_header; + // The expected start_tile (tg_start syntax element) of the next tile group. + int next_start_tile; + + // State if the camera frame header is already decoded while + // large_scale_tile = 1. 
+ int camera_frame_header_ready; + size_t frame_header_size; + DataBuffer obu_size_hdr; + int output_frame_width_in_tiles_minus_1; + int output_frame_height_in_tiles_minus_1; + int tile_count_minus_1; + uint32_t coded_tile_data_size; + unsigned int ext_tile_debug; // for ext-tile software debug & testing + unsigned int row_mt; + EXTERNAL_REFERENCES ext_refs; + YV12_BUFFER_CONFIG tile_list_outbuf; + + CB_BUFFER *cb_buffer_base; + int cb_buffer_alloc_size; + + int allocated_row_mt_sync_rows; + +#if CONFIG_MULTITHREAD + pthread_mutex_t *row_mt_mutex_; + pthread_cond_t *row_mt_cond_; +#endif + + AV1DecRowMTInfo frame_row_mt_info; + aom_metadata_array_t *metadata; + + int context_update_tile_id; + int skip_loop_filter; + int skip_film_grain; + int is_annexb; + int valid_for_referencing[REF_FRAMES]; +} AV1Decoder; + +// Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error +// code and returns a nonzero value on failure. +int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size, + const uint8_t **psource); + +// Get the frame at a particular index in the output queue +int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd, + aom_film_grain_t **grain_params); + +int av1_get_frame_to_show(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame); + +aom_codec_err_t av1_copy_reference_dec(struct AV1Decoder *pbi, int idx, + YV12_BUFFER_CONFIG *sd); + +aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx, + int use_external_ref, + YV12_BUFFER_CONFIG *sd); +aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd); + +struct AV1Decoder *av1_decoder_create(BufferPool *const pool); + +void av1_decoder_remove(struct AV1Decoder *pbi); +void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info); + +void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync); + +void av1_dec_free_cb_buf(AV1Decoder *pbi); + +static INLINE void decrease_ref_count(RefCntBuffer *const buf, + BufferPool *const pool) { + if (buf != NULL) { + --buf->ref_count; + // Reference counts should never become negative. If this assertion fails, + // there is a bug in our reference count management. + assert(buf->ref_count >= 0); + // A worker may only get a free framebuffer index when calling get_free_fb. + // But the raw frame buffer is not set up until we finish decoding header. + // So if any error happens during decoding header, frame_bufs[idx] will not + // have a valid raw frame buffer. 
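  // Informative sketch of the ownership convention implied by the callers
  // (an editorial reading, not part of the original change): every pointer
  // stored in cm->ref_frame_map[], pbi->output_frames[] or cm->cur_frame
  // owns exactly one count, taken and dropped in matched pairs, e.g.
  //   cm->ref_frame_map[i] = cm->cur_frame;
  //   ++cm->cur_frame->ref_count;                      // take
  //   ...
  //   decrease_ref_count(cm->ref_frame_map[i], pool);  // drop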
+ if (buf->ref_count == 0 && buf->raw_frame_buffer.data) { + pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer); + buf->raw_frame_buffer.data = NULL; + buf->raw_frame_buffer.size = 0; + buf->raw_frame_buffer.priv = NULL; + } + } +} + +#define ACCT_STR __func__ +static INLINE int av1_read_uniform(aom_reader *r, int n) { + const int l = get_unsigned_bits(n); + const int m = (1 << l) - n; + const int v = aom_read_literal(r, l - 1, ACCT_STR); + assert(l != 0); + if (v < m) + return v; + else + return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR); +} + +typedef void (*palette_visitor_fn_t)(MACROBLOCKD *const xd, int plane, + aom_reader *r); + +void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, + aom_reader *r, palette_visitor_fn_t visit); + +typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td, + int mi_row, int mi_col, aom_reader *r, + PARTITION_TYPE partition, BLOCK_SIZE bsize); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_DECODER_DECODER_H_ diff --git a/libs/libaom/src/av1/decoder/decodetxb.c b/libs/libaom/src/av1/decoder/decodetxb.c new file mode 100644 index 000000000..541f4c984 --- /dev/null +++ b/libs/libaom/src/av1/decoder/decodetxb.c @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/decoder/decodetxb.h" + +#include "aom_ports/mem.h" +#include "av1/common/idct.h" +#include "av1/common/scan.h" +#include "av1/common/txb_common.h" +#include "av1/decoder/decodemv.h" + +#define ACCT_STR __func__ + +static int read_golomb(MACROBLOCKD *xd, aom_reader *r) { + int x = 1; + int length = 0; + int i = 0; + + while (!i) { + i = aom_read_bit(r, ACCT_STR); + ++length; + if (length > 20) { + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid length in read_golomb"); + break; + } + } + + for (i = 0; i < length - 1; ++i) { + x <<= 1; + x += aom_read_bit(r, ACCT_STR); + } + + return x - 1; +} + +static INLINE int rec_eob_pos(const int eob_token, const int extra) { + int eob = av1_eob_group_start[eob_token]; + if (eob > 2) { + eob += extra; + } + return eob; +} + +static INLINE int get_dqv(const int16_t *dequant, int coeff_idx, + const qm_val_t *iqmatrix) { + int dqv = dequant[!!coeff_idx]; + if (iqmatrix != NULL) + dqv = + ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + return dqv; +} + +static INLINE void read_coeffs_reverse_2d(aom_reader *r, TX_SIZE tx_size, + int start_si, int end_si, + const int16_t *scan, int bwl, + uint8_t *levels, + base_cdf_arr base_cdf, + br_cdf_arr br_cdf) { + for (int c = end_si; c >= start_si; --c) { + const int pos = scan[c]; + const int coeff_ctx = get_lower_levels_ctx_2d(levels, pos, bwl, tx_size); + const int nsymbs = 4; + int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR); + if (level > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx_2d(levels, pos, bwl); + aom_cdf_prob *cdf = br_cdf[br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); + level += k; + if (k < BR_CDF_SIZE - 1) break; + } + } + levels[get_padded_idx(pos, bwl)] = level; + } +} + +static INLINE void read_coeffs_reverse(aom_reader *r, TX_SIZE tx_size, + TX_CLASS tx_class, int start_si, + int end_si, const int16_t *scan, int bwl, + uint8_t *levels, base_cdf_arr base_cdf, + br_cdf_arr br_cdf) { + for (int c = end_si; c >= start_si; --c) { + const int pos = scan[c]; + const int coeff_ctx = + get_lower_levels_ctx(levels, pos, bwl, tx_size, tx_class); + const int nsymbs = 4; + int level = aom_read_symbol(r, base_cdf[coeff_ctx], nsymbs, ACCT_STR); + if (level > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + aom_cdf_prob *cdf = br_cdf[br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); + level += k; + if (k < BR_CDF_SIZE - 1) break; + } + } + levels[get_padded_idx(pos, bwl)] = level; + } +} + +uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *const r, const int blk_row, + const int blk_col, const int plane, + const TXB_CTX *const txb_ctx, + const TX_SIZE tx_size) { + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + const int32_t max_value = (1 << (7 + xd->bd)) - 1; + const int32_t min_value = -(1 << (7 + xd->bd)); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id]; + tran_low_t *const tcoeffs = pd->dqcoeff_block + xd->cb_offset[plane]; + const int shift = av1_get_tx_scale(tx_size); + const int bwl = get_txb_bwl(tx_size); + const 
int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + int cul_level = 0; + int dc_val = 0; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + const int all_zero = aom_read_symbol( + r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR); + eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; + uint16_t *const eob = &(eob_data->eob); + uint16_t *const max_scan_line = &(eob_data->max_scan_line); + *max_scan_line = 0; + *eob = 0; + +#if CONFIG_INSPECTION + if (plane == 0) { + const int txk_type_idx = + av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col); + mbmi->tx_skip[txk_type_idx] = all_zero; + } +#endif + + if (all_zero) { + *max_scan_line = 0; + if (plane == 0) { + xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] = DCT_DCT; + } + return 0; + } + + if (plane == AOM_PLANE_Y) { + // only y plane's tx_type is transmitted + av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r); + } + const TX_TYPE tx_type = + av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const qm_val_t *iqmatrix = + av1_get_iqmatrix(&cm->quant_params, xd, plane, tx_size, tx_type); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + int eob_extra = 0; + int eob_pt = 1; + + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; + switch (eob_multi_size) { + case 0: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], + 5, ACCT_STR) + + 1; + break; + case 1: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], + 6, ACCT_STR) + + 1; + break; + case 2: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], + 7, ACCT_STR) + + 1; + break; + case 3: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], + 8, ACCT_STR) + + 1; + break; + case 4: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], + 9, ACCT_STR) + + 1; + break; + case 5: + eob_pt = + aom_read_symbol(r, ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], + 10, ACCT_STR) + + 1; + break; + case 6: + default: + eob_pt = aom_read_symbol( + r, ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11, + ACCT_STR) + + 1; + break; + } + + const int eob_offset_bits = av1_eob_offset_bits[eob_pt]; + if (eob_offset_bits > 0) { + const int eob_ctx = eob_pt - 3; + int bit = aom_read_symbol( + r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR); + if (bit) { + eob_extra += (1 << (eob_offset_bits - 1)); + } + + for (int i = 1; i < eob_offset_bits; i++) { + bit = aom_read_bit(r, ACCT_STR); + if (bit) { + eob_extra += (1 << (eob_offset_bits - 1 - i)); + } + } + } + *eob = rec_eob_pos(eob_pt, eob_extra); + + if (*eob > 1) { + memset(levels_buf, 0, + sizeof(*levels_buf) * + ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END)); + } + + { + // Read the non-zero coefficient with scan index eob-1 + // TODO(angiebird): Put this into a function + const int c = *eob - 1; + const int pos = scan[c]; + const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, c); + const int nsymbs = 3; + aom_cdf_prob *cdf = + ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx]; + int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1; + if (level > NUM_BASE_LEVELS) { + const int br_ctx = 
get_br_ctx_eob(pos, bwl, tx_class); + cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); + level += k; + if (k < BR_CDF_SIZE - 1) break; + } + } + levels[get_padded_idx(pos, bwl)] = level; + } + if (*eob > 1) { + base_cdf_arr base_cdf = ec_ctx->coeff_base_cdf[txs_ctx][plane_type]; + br_cdf_arr br_cdf = + ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type]; + if (tx_class == TX_CLASS_2D) { + read_coeffs_reverse_2d(r, tx_size, 1, *eob - 1 - 1, scan, bwl, levels, + base_cdf, br_cdf); + read_coeffs_reverse(r, tx_size, tx_class, 0, 0, scan, bwl, levels, + base_cdf, br_cdf); + } else { + read_coeffs_reverse(r, tx_size, tx_class, 0, *eob - 1 - 1, scan, bwl, + levels, base_cdf, br_cdf); + } + } + + for (int c = 0; c < *eob; ++c) { + const int pos = scan[c]; + uint8_t sign; + tran_low_t level = levels[get_padded_idx(pos, bwl)]; + if (level) { + *max_scan_line = AOMMAX(*max_scan_line, pos); + if (c == 0) { + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + sign = aom_read_symbol(r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], + 2, ACCT_STR); + } else { + sign = aom_read_bit(r, ACCT_STR); + } + if (level >= MAX_BASE_BR_RANGE) { + level += read_golomb(xd, r); + } + + if (c == 0) dc_val = sign ? -level : level; + + // Bitmasking to clamp level to valid range: + // The valid range for 8/10/12 bit video is at most 14/16/18 bit + level &= 0xfffff; + cul_level += level; + tran_low_t dq_coeff; + // Bitmasking to clamp dq_coeff to valid range: + // The valid range for 8/10/12 bit video is at most 17/19/21 bit + dq_coeff = (tran_low_t)( + (int64_t)level * get_dqv(dequant, scan[c], iqmatrix) & 0xffffff); + dq_coeff = dq_coeff >> shift; + if (sign) { + dq_coeff = -dq_coeff; + } + tcoeffs[pos] = clamp(dq_coeff, min_value, max_value); + } + } + + cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); + + // DC value + set_dc_sign(&cul_level, dc_val); + + return cul_level; +} + +void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int row, const int col, + const TX_SIZE tx_size) { +#if TXCOEFF_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + + const BLOCK_SIZE bsize = mbmi->sb_type; + assert(bsize < BLOCK_SIZES_ALL); + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, pd->above_entropy_context + col, + pd->left_entropy_context + row, &txb_ctx); + const uint8_t cul_level = + av1_read_coeffs_txb(cm, xd, r, row, col, plane, &txb_ctx, tx_size); + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col, + row); + + if (is_inter_block(mbmi)) { + const PLANE_TYPE plane_type = get_plane_type(plane); + // tx_type was already read inside av1_read_coeffs_txb above; + // av1_get_tx_type() just looks up the stored value here. + const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, row, col, tx_size, + cm->features.reduced_tx_set_used); + + if (plane == 0) { + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + // The 16x16 unit is due to the constraint from tx_64x64 which sets the + // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block + // size, the constraint takes effect in 32x16 / 16x32 size too.
To solve + // the intricacy, cover all the 16x16 units inside a 64 level transform. + if (txw == tx_size_wide_unit[TX_64X64] || + txh == tx_size_high_unit[TX_64X64]) { + const int tx_unit = tx_size_wide_unit[TX_16X16]; + const int stride = xd->tx_type_map_stride; + for (int idy = 0; idy < txh; idy += tx_unit) { + for (int idx = 0; idx < txw; idx += tx_unit) { + xd->tx_type_map[(row + idy) * stride + col + idx] = tx_type; + } + } + } + } + } + +#if TXCOEFF_TIMER + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + cm->txcoeff_timer += elapsed_time; + ++cm->txb_count; +#endif +} diff --git a/libs/libaom/src/av1/decoder/decodetxb.h b/libs/libaom/src/av1/decoder/decodetxb.h new file mode 100644 index 000000000..39bf0bf78 --- /dev/null +++ b/libs/libaom/src/av1/decoder/decodetxb.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DECODETXB_H_ +#define AOM_AV1_DECODER_DECODETXB_H_ + +#include "config/aom_config.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/txb_common.h" +#include "aom_dsp/bitreader.h" + +uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, + aom_reader *const r, const int blk_row, + const int blk_col, const int plane, + const TXB_CTX *const txb_ctx, + const TX_SIZE tx_size); + +void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm, + MACROBLOCKD *const xd, aom_reader *const r, + const int plane, const int row, const int col, + const TX_SIZE tx_size); +#endif // AOM_AV1_DECODER_DECODETXB_H_ diff --git a/libs/libaom/src/av1/decoder/detokenize.c b/libs/libaom/src/av1/decoder/detokenize.c new file mode 100644 index 000000000..9d54bd13d --- /dev/null +++ b/libs/libaom/src/av1/decoder/detokenize.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" + +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "av1/decoder/detokenize.h" + +#define ACCT_STR __func__ + +#include "av1/common/common.h" +#include "av1/common/entropy.h" +#include "av1/common/idct.h" + +static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) { + uint8_t color_order[PALETTE_MAX_SIZE]; + const int n = param->n_colors; + uint8_t *const color_map = param->color_map; + MapCdf color_map_cdf = param->map_cdf; + int plane_block_width = param->plane_width; + int plane_block_height = param->plane_height; + int rows = param->rows; + int cols = param->cols; + + // The first color index. 
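  // Informative sketch of the wavefront order used a few lines below:
  // entries are visited along anti-diagonals i = row + col, so the above and
  // left neighbours consumed by av1_get_palette_color_index_context() are
  // always decoded before the entry that needs them. Visit order for a 3x3
  // map, counting the first color index read next as 0:
  //   0 1 3
  //   2 4 6
  //   5 7 8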
+ color_map[0] = av1_read_uniform(r, n); + assert(color_map[0] < n); + + // Run wavefront on the palette map index decoding. + for (int i = 1; i < rows + cols - 1; ++i) { + for (int j = AOMMIN(i, cols - 1); j >= AOMMAX(0, i - rows + 1); --j) { + const int color_ctx = av1_get_palette_color_index_context( + color_map, plane_block_width, (i - j), j, n, color_order, NULL); + const int color_idx = aom_read_symbol( + r, color_map_cdf[n - PALETTE_MIN_SIZE][color_ctx], n, ACCT_STR); + assert(color_idx >= 0 && color_idx < n); + color_map[(i - j) * plane_block_width + j] = color_order[color_idx]; + } + } + // Copy last column to extra columns. + if (cols < plane_block_width) { + for (int i = 0; i < rows; ++i) { + memset(color_map + i * plane_block_width + cols, + color_map[i * plane_block_width + cols - 1], + (plane_block_width - cols)); + } + } + // Copy last row to extra rows. + for (int i = rows; i < plane_block_height; ++i) { + memcpy(color_map + i * plane_block_width, + color_map + (rows - 1) * plane_block_width, plane_block_width); + } +} + +void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, + aom_reader *r) { + assert(plane == 0 || plane == 1); + Av1ColorMapParam params; + params.color_map = + xd->plane[plane].color_index_map + xd->color_index_map_offset[plane]; + params.map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf + : xd->tile_ctx->palette_y_color_index_cdf; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + params.n_colors = mbmi->palette_mode_info.palette_size[plane]; + av1_get_block_dimensions(mbmi->sb_type, plane, xd, ¶ms.plane_width, + ¶ms.plane_height, ¶ms.rows, ¶ms.cols); + decode_color_map_tokens(¶ms, r); +} diff --git a/libs/libaom/src/av1/decoder/detokenize.h b/libs/libaom/src/av1/decoder/detokenize.h new file mode 100644 index 000000000..173b437a9 --- /dev/null +++ b/libs/libaom/src/av1/decoder/detokenize.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DETOKENIZE_H_ +#define AOM_AV1_DECODER_DETOKENIZE_H_ + +#include "config/aom_config.h" + +#include "av1/common/scan.h" +#include "av1/decoder/decoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_DECODER_DETOKENIZE_H_ diff --git a/libs/libaom/src/av1/decoder/dthread.h b/libs/libaom/src/av1/decoder/dthread.h new file mode 100644 index 000000000..f82b9d8cc --- /dev/null +++ b/libs/libaom/src/av1/decoder/dthread.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_DTHREAD_H_ +#define AOM_AV1_DECODER_DTHREAD_H_ + +#include "config/aom_config.h" + +#include "aom_util/aom_thread.h" +#include "aom/internal/aom_codec_internal.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1Common; +struct AV1Decoder; +struct ThreadData; + +typedef struct DecWorkerData { + struct ThreadData *td; + const uint8_t *data_end; + struct aom_internal_error_info error_info; +} DecWorkerData; + +// WorkerData for the FrameWorker thread. It contains all the information of +// the worker and decode structures for decoding a frame. +typedef struct FrameWorkerData { + struct AV1Decoder *pbi; + const uint8_t *data; + const uint8_t *data_end; + size_t data_size; + void *user_priv; + int received_frame; + int frame_context_ready; // Current frame's context is ready to read. + int frame_decoded; // Finished decoding current frame. +} FrameWorkerData; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_DECODER_DTHREAD_H_ diff --git a/libs/libaom/src/av1/decoder/inspection.c b/libs/libaom/src/av1/decoder/inspection.c new file mode 100644 index 000000000..d121a7034 --- /dev/null +++ b/libs/libaom/src/av1/decoder/inspection.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "av1/decoder/decoder.h" +#include "av1/decoder/inspection.h" +#include "av1/common/enums.h" +#include "av1/common/cdef.h" + +static void ifd_init_mi_rc(insp_frame_data *fd, int mi_cols, int mi_rows) { + fd->mi_cols = mi_cols; + fd->mi_rows = mi_rows; + fd->mi_grid = (insp_mi_data *)aom_malloc(sizeof(insp_mi_data) * fd->mi_rows * + fd->mi_cols); +} + +void ifd_init(insp_frame_data *fd, int frame_width, int frame_height) { + int mi_cols = ALIGN_POWER_OF_TWO(frame_width, 3) >> MI_SIZE_LOG2; + int mi_rows = ALIGN_POWER_OF_TWO(frame_height, 3) >> MI_SIZE_LOG2; + ifd_init_mi_rc(fd, mi_cols, mi_rows); +} + +void ifd_clear(insp_frame_data *fd) { + aom_free(fd->mi_grid); + fd->mi_grid = NULL; +} + +/* TODO(negge) This function may be called by more than one thread when using + a multi-threaded decoder and this may cause a data race. 
*/ +int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform) { + struct AV1Decoder *pbi = (struct AV1Decoder *)decoder; + AV1_COMMON *const cm = &pbi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const CommonQuantParams *quant_params = &cm->quant_params; + + if (fd->mi_rows != mi_params->mi_rows || fd->mi_cols != mi_params->mi_cols) { + ifd_clear(fd); + ifd_init_mi_rc(fd, mi_params->mi_rows, mi_params->mi_cols); + } + fd->show_existing_frame = cm->show_existing_frame; + fd->frame_number = cm->current_frame.frame_number; + fd->show_frame = cm->show_frame; + fd->frame_type = cm->current_frame.frame_type; + fd->base_qindex = quant_params->base_qindex; + // Set width and height of the first tile until generic support can be added + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, 0); + av1_tile_set_col(&tile_info, cm, 0); + fd->tile_mi_cols = tile_info.mi_col_end - tile_info.mi_col_start; + fd->tile_mi_rows = tile_info.mi_row_end - tile_info.mi_row_start; + fd->delta_q_present_flag = cm->delta_q_info.delta_q_present_flag; + fd->delta_q_res = cm->delta_q_info.delta_q_res; +#if CONFIG_ACCOUNTING + fd->accounting = &pbi->accounting; +#endif + // TODO(negge): copy per frame CDEF data + int i, j; + for (i = 0; i < MAX_SEGMENTS; i++) { + for (j = 0; j < 2; j++) { + fd->y_dequant[i][j] = quant_params->y_dequant_QTX[i][j]; + fd->u_dequant[i][j] = quant_params->u_dequant_QTX[i][j]; + fd->v_dequant[i][j] = quant_params->v_dequant_QTX[i][j]; + } + } + for (j = 0; j < mi_params->mi_rows; j++) { + for (i = 0; i < mi_params->mi_cols; i++) { + const MB_MODE_INFO *mbmi = + mi_params->mi_grid_base[j * mi_params->mi_stride + i]; + insp_mi_data *mi = &fd->mi_grid[j * mi_params->mi_cols + i]; + // Segment + mi->segment_id = mbmi->segment_id; + // Motion Vectors + mi->mv[0].row = mbmi->mv[0].as_mv.row; + mi->mv[0].col = mbmi->mv[0].as_mv.col; + mi->mv[1].row = mbmi->mv[1].as_mv.row; + mi->mv[1].col = mbmi->mv[1].as_mv.col; + // Reference Frames + mi->ref_frame[0] = mbmi->ref_frame[0]; + mi->ref_frame[1] = mbmi->ref_frame[1]; + // Prediction Mode + mi->mode = mbmi->mode; + mi->intrabc = (int16_t)mbmi->use_intrabc; + mi->palette = (int16_t)mbmi->palette_mode_info.palette_size[0]; + mi->uv_palette = (int16_t)mbmi->palette_mode_info.palette_size[1]; + // Prediction Mode for Chromatic planes + if (mi->mode < INTRA_MODES) { + mi->uv_mode = mbmi->uv_mode; + } else { + mi->uv_mode = UV_MODE_INVALID; + } + + mi->motion_mode = mbmi->motion_mode; + mi->compound_type = mbmi->interinter_comp.type; + + // Block Size + mi->sb_type = mbmi->sb_type; + // Skip Flag + mi->skip = mbmi->skip; + mi->filter[0] = av1_extract_interp_filter(mbmi->interp_filters, 0); + mi->filter[1] = av1_extract_interp_filter(mbmi->interp_filters, 1); + mi->dual_filter_type = mi->filter[0] * 3 + mi->filter[1]; + + // Transform + // TODO(anyone): extract tx type info from mbmi->txk_type[]. 
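      // Informative note: c and r below are this 4x4 mi unit's offsets
      // within its coding block, in mi units, so the tx_size / tx_type
      // lookups that follow resolve per transform unit rather than once
      // per block.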
+ + const BLOCK_SIZE bsize = mbmi->sb_type; + const int c = i % mi_size_wide[bsize]; + const int r = j % mi_size_high[bsize]; + if (is_inter_block(mbmi) || is_intrabc_block(mbmi)) + mi->tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(bsize, r, c)]; + else + mi->tx_size = mbmi->tx_size; + + if (skip_not_transform && mi->skip) mi->tx_size = -1; + + if (mi->skip) { + const int tx_type_row = j - j % tx_size_high_unit[mi->tx_size]; + const int tx_type_col = i - i % tx_size_wide_unit[mi->tx_size]; + const int tx_type_map_idx = + tx_type_row * mi_params->mi_stride + tx_type_col; + mi->tx_type = mi_params->tx_type_map[tx_type_map_idx]; + } else { + mi->tx_type = 0; + } + + if (skip_not_transform && + (mi->skip || mbmi->tx_skip[av1_get_txk_type_index(bsize, r, c)])) + mi->tx_type = -1; + + mi->cdef_level = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] / + CDEF_SEC_STRENGTHS; + mi->cdef_strength = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] % + CDEF_SEC_STRENGTHS; + + mi->cdef_strength += mi->cdef_strength == 3; + if (mbmi->uv_mode == UV_CFL_PRED) { + mi->cfl_alpha_idx = mbmi->cfl_alpha_idx; + mi->cfl_alpha_sign = mbmi->cfl_alpha_signs; + } else { + mi->cfl_alpha_idx = 0; + mi->cfl_alpha_sign = 0; + } + // delta_q + mi->current_qindex = mbmi->current_qindex; + } + } + return 1; +} diff --git a/libs/libaom/src/av1/decoder/inspection.h b/libs/libaom/src/av1/decoder/inspection.h new file mode 100644 index 000000000..b963f6ac6 --- /dev/null +++ b/libs/libaom/src/av1/decoder/inspection.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_AV1_DECODER_INSPECTION_H_ +#define AOM_AV1_DECODER_INSPECTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#include "av1/common/seg_common.h" +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif + +#ifndef AOM_AOM_AOMDX_H_ +typedef void (*aom_inspect_cb)(void *decoder, void *data); +#endif + +typedef struct insp_mv insp_mv; + +struct insp_mv { + int16_t row; + int16_t col; +}; + +typedef struct insp_mi_data insp_mi_data; + +struct insp_mi_data { + insp_mv mv[2]; + int16_t ref_frame[2]; + int16_t mode; + int16_t uv_mode; + int16_t sb_type; + int16_t skip; + int16_t segment_id; + int16_t dual_filter_type; + int16_t filter[2]; + int16_t tx_type; + int16_t tx_size; + int16_t cdef_level; + int16_t cdef_strength; + int16_t cfl_alpha_idx; + int16_t cfl_alpha_sign; + int16_t current_qindex; + int16_t compound_type; + int16_t motion_mode; + int16_t intrabc; + int16_t palette; + int16_t uv_palette; +}; + +typedef struct insp_frame_data insp_frame_data; + +struct insp_frame_data { +#if CONFIG_ACCOUNTING + Accounting *accounting; +#endif + insp_mi_data *mi_grid; + int16_t frame_number; + int show_frame; + int frame_type; + int base_qindex; + int mi_rows; + int mi_cols; + int tile_mi_rows; + int tile_mi_cols; + int16_t y_dequant[MAX_SEGMENTS][2]; + int16_t u_dequant[MAX_SEGMENTS][2]; + int16_t v_dequant[MAX_SEGMENTS][2]; + // TODO(negge): add per frame CDEF data + int delta_q_present_flag; + int delta_q_res; + int show_existing_frame; +}; + +void ifd_init(insp_frame_data *fd, int frame_width, int frame_height); +void ifd_clear(insp_frame_data *fd); +int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_AV1_DECODER_INSPECTION_H_ diff --git a/libs/libaom/src/av1/decoder/obu.c b/libs/libaom/src/av1/decoder/obu.c new file mode 100644 index 000000000..791e5965b --- /dev/null +++ b/libs/libaom/src/av1/decoder/obu.c @@ -0,0 +1,1085 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> + +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom_dsp/bitreader_buffer.h" +#include "aom_ports/mem_ops.h" + +#include "av1/common/common.h" +#include "av1/common/obu_util.h" +#include "av1/common/timing.h" +#include "av1/decoder/decoder.h" +#include "av1/decoder/decodeframe.h" +#include "av1/decoder/obu.h" + +aom_codec_err_t aom_get_num_layers_from_operating_point_idc( + int operating_point_idc, unsigned int *number_spatial_layers, + unsigned int *number_temporal_layers) { + // derive number of spatial/temporal layers from operating_point_idc + + if (!number_spatial_layers || !number_temporal_layers) + return AOM_CODEC_INVALID_PARAM; + + if (operating_point_idc == 0) { + *number_temporal_layers = 1; + *number_spatial_layers = 1; + } else { + *number_spatial_layers = 0; + *number_temporal_layers = 0; + for (int j = 0; j < MAX_NUM_SPATIAL_LAYERS; j++) { + *number_spatial_layers += + (operating_point_idc >> (j + MAX_NUM_TEMPORAL_LAYERS)) & 0x1; + } + for (int j = 0; j < MAX_NUM_TEMPORAL_LAYERS; j++) { + *number_temporal_layers += (operating_point_idc >> j) & 0x1; + } + } + + return AOM_CODEC_OK; +} + +static int is_obu_in_current_operating_point(AV1Decoder *pbi, + ObuHeader obu_header) { + if (!pbi->current_operating_point) { + return 1; + } + + if ((pbi->current_operating_point >> obu_header.temporal_layer_id) & 0x1 && + (pbi->current_operating_point >> (obu_header.spatial_layer_id + 8)) & + 0x1) { + return 1; + } + return 0; +} + +static int byte_alignment(AV1_COMMON *const cm, + struct aom_read_bit_buffer *const rb) { + while (rb->bit_offset & 7) { + if (aom_rb_read_bit(rb)) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } + return 0; +} + +static uint32_t read_temporal_delimiter_obu() { return 0; } + +// Returns a boolean that indicates success. +static int read_bitstream_level(AV1_LEVEL *seq_level_idx, + struct aom_read_bit_buffer *rb) { + *seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); + if (!is_valid_seq_level_idx(*seq_level_idx)) return 0; + return 1; +} + +// Returns whether two sequence headers are consistent with each other. +// Note that the 'op_params' field is not compared per Section 7.5 in the spec: +// Within a particular coded video sequence, the contents of +// sequence_header_obu must be bit-identical each time the sequence header +// appears except for the contents of operating_parameters_info. +static int are_seq_headers_consistent(const SequenceHeader *seq_params_old, + const SequenceHeader *seq_params_new) { + return !memcmp(seq_params_old, seq_params_new, + offsetof(SequenceHeader, op_params)); +} + +// On success, sets pbi->sequence_header_ready to 1 and returns the number of +// bytes read from 'rb'. +// On failure, sets pbi->common.error.error_code and returns 0. +static uint32_t read_sequence_header_obu(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb) { + AV1_COMMON *const cm = &pbi->common; + const uint32_t saved_bit_offset = rb->bit_offset; + + // Verify rb has been configured to report errors. + assert(rb->error_handler); + + // Use a local variable to store the information as we decode. At the end, + // if no errors have occurred, cm->seq_params is updated.
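  // The commit-on-success idiom used by this function, in miniature (an
  // informative sketch, not code from the patch):
  //   SequenceHeader tmp = cm->seq_params;  // start from the current state
  //   if (/* any parse step fails */) return 0;  // cm->seq_params untouched
  //   cm->seq_params = tmp;  // single commit once everything has succeeded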
+ SequenceHeader sh = cm->seq_params; + SequenceHeader *const seq_params = &sh; + + seq_params->profile = av1_read_profile(rb); + if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + + // Still picture or not + seq_params->still_picture = aom_rb_read_bit(rb); + seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb); + // Video must have reduced_still_picture_hdr = 0 + if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + + if (seq_params->reduced_still_picture_hdr) { + seq_params->timing_info_present = 0; + seq_params->decoder_model_info_present_flag = 0; + seq_params->display_model_info_present_flag = 0; + seq_params->operating_points_cnt_minus_1 = 0; + seq_params->operating_point_idc[0] = 0; + if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + seq_params->tier[0] = 0; + seq_params->op_params[0].decoder_model_param_present_flag = 0; + seq_params->op_params[0].display_model_param_present_flag = 0; + } else { + seq_params->timing_info_present = aom_rb_read_bit(rb); + if (seq_params->timing_info_present) { + av1_read_timing_info_header(&seq_params->timing_info, &cm->error, rb); + + seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb); + if (seq_params->decoder_model_info_present_flag) + av1_read_decoder_model_info(&seq_params->decoder_model_info, rb); + } else { + seq_params->decoder_model_info_present_flag = 0; + } + seq_params->display_model_info_present_flag = aom_rb_read_bit(rb); + seq_params->operating_points_cnt_minus_1 = + aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS); + for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { + seq_params->operating_point_idc[i] = + aom_rb_read_literal(rb, OP_POINTS_IDC_BITS); + if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return 0; + } + // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7 + // is equivalent to level 3.3. 
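      // Informative: the spec encodes seq_level_idx as
      // ((major_level - 2) << 2) | minor_level, so 3.3 maps to
      // (1 << 2) + 3 = 7 and 4.0 maps to (2 << 2) + 0 = 8; the
      // >= SEQ_LEVEL_4_0 test below is therefore exactly "seq_level_idx > 7".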
+ if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0) + seq_params->tier[i] = aom_rb_read_bit(rb); + else + seq_params->tier[i] = 0; + if (seq_params->decoder_model_info_present_flag) { + seq_params->op_params[i].decoder_model_param_present_flag = + aom_rb_read_bit(rb); + if (seq_params->op_params[i].decoder_model_param_present_flag) + av1_read_op_parameters_info(&seq_params->op_params[i], + seq_params->decoder_model_info + .encoder_decoder_buffer_delay_length, + rb); + } else { + seq_params->op_params[i].decoder_model_param_present_flag = 0; + } + if (seq_params->timing_info_present && + (seq_params->timing_info.equal_picture_interval || + seq_params->op_params[i].decoder_model_param_present_flag)) { + seq_params->op_params[i].bitrate = av1_max_level_bitrate( + seq_params->profile, seq_params->seq_level_idx[i], + seq_params->tier[i]); + // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass + // the check + if (seq_params->op_params[i].bitrate == 0) + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support this combination of " + "profile, level, and tier."); + // Buffer size in bits is bitrate in bits/s * 1 s + seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate; + } + if (seq_params->timing_info_present && + seq_params->timing_info.equal_picture_interval && + !seq_params->op_params[i].decoder_model_param_present_flag) { + // When the decoder_model_parameters are not sent for this op, set + // the default ones that can be used with the resource availability mode + seq_params->op_params[i].decoder_buffer_delay = 70000; + seq_params->op_params[i].encoder_buffer_delay = 20000; + seq_params->op_params[i].low_delay_mode_flag = 0; + } + + if (seq_params->display_model_info_present_flag) { + seq_params->op_params[i].display_model_param_present_flag = + aom_rb_read_bit(rb); + if (seq_params->op_params[i].display_model_param_present_flag) { + seq_params->op_params[i].initial_display_delay = + aom_rb_read_literal(rb, 4) + 1; + if (seq_params->op_params[i].initial_display_delay > 10) + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support more than 10 decoded frames delay"); + } else { + seq_params->op_params[i].initial_display_delay = 10; + } + } else { + seq_params->op_params[i].display_model_param_present_flag = 0; + seq_params->op_params[i].initial_display_delay = 10; + } + } + } + // This decoder supports all levels.
Choose operating point provided by + // external means + int operating_point = pbi->operating_point; + if (operating_point < 0 || + operating_point > seq_params->operating_points_cnt_minus_1) + operating_point = 0; + pbi->current_operating_point = + seq_params->operating_point_idc[operating_point]; + if (aom_get_num_layers_from_operating_point_idc( + pbi->current_operating_point, &cm->number_spatial_layers, + &cm->number_temporal_layers) != AOM_CODEC_OK) { + cm->error.error_code = AOM_CODEC_ERROR; + return 0; + } + + av1_read_sequence_header(cm, rb, seq_params); + + av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &cm->error); + if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) && + !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) && + !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, " + "%d %d subsampling is not supported.\n", + seq_params->subsampling_x, seq_params->subsampling_y); + } + + seq_params->film_grain_params_present = aom_rb_read_bit(rb); + + if (av1_check_trailing_bits(pbi, rb) != 0) { + // cm->error.error_code is already set. + return 0; + } + + // If a sequence header has been decoded before, we check if the new + // one is consistent with the old one. + if (pbi->sequence_header_ready) { + if (!are_seq_headers_consistent(&cm->seq_params, seq_params)) + pbi->sequence_header_changed = 1; + } + + cm->seq_params = *seq_params; + pbi->sequence_header_ready = 1; + + return ((rb->bit_offset - saved_bit_offset + 7) >> 3); +} + +// On success, returns the frame header size. On failure, calls +// aom_internal_error and does not return. +static uint32_t read_frame_header_obu(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t **p_data_end, + int trailing_bits_present) { + return av1_decode_frame_headers_and_setup(pbi, rb, data, p_data_end, + trailing_bits_present); +} + +// On success, returns the tile group header size. On failure, calls +// aom_internal_error() and returns -1. 
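// (Informative: the size these readers return is computed as
// ((rb->bit_offset - saved_bit_offset + 7) >> 3), i.e. ceil(bits_read / 8);
// for example, a header that consumed 17 bits is reported as 3 bytes.)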
+static int32_t read_tile_group_header(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + int *start_tile, int *end_tile, + int tile_start_implicit) { + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + uint32_t saved_bit_offset = rb->bit_offset; + int tile_start_and_end_present_flag = 0; + const int num_tiles = tiles->rows * tiles->cols; + + if (!tiles->large_scale && num_tiles > 1) { + tile_start_and_end_present_flag = aom_rb_read_bit(rb); + if (tile_start_implicit && tile_start_and_end_present_flag) { + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0"); + return -1; + } + } + if (tiles->large_scale || num_tiles == 1 || + !tile_start_and_end_present_flag) { + *start_tile = 0; + *end_tile = num_tiles - 1; + } else { + int tile_bits = tiles->log2_rows + tiles->log2_cols; + *start_tile = aom_rb_read_literal(rb, tile_bits); + *end_tile = aom_rb_read_literal(rb, tile_bits); + } + if (*start_tile != pbi->next_start_tile) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "tg_start (%d) must be equal to %d", *start_tile, + pbi->next_start_tile); + return -1; + } + if (*start_tile > *end_tile) { + aom_internal_error( + &cm->error, AOM_CODEC_CORRUPT_FRAME, + "tg_end (%d) must be greater than or equal to tg_start (%d)", *end_tile, + *start_tile); + return -1; + } + if (*end_tile >= num_tiles) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "tg_end (%d) must be less than NumTiles (%d)", *end_tile, + num_tiles); + return -1; + } + pbi->next_start_tile = (*end_tile == num_tiles - 1) ? 0 : *end_tile + 1; + + return ((rb->bit_offset - saved_bit_offset + 7) >> 3); +} + +// On success, returns the tile group OBU size. On failure, sets +// pbi->common.error.error_code and returns 0. +static uint32_t read_one_tile_group_obu( + AV1Decoder *pbi, struct aom_read_bit_buffer *rb, int is_first_tg, + const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end, + int *is_last_tg, int tile_start_implicit) { + AV1_COMMON *const cm = &pbi->common; + int start_tile, end_tile; + int32_t header_size, tg_payload_size; + + assert((rb->bit_offset & 7) == 0); + assert(rb->bit_buffer + aom_rb_bytes_read(rb) == data); + + header_size = read_tile_group_header(pbi, rb, &start_tile, &end_tile, + tile_start_implicit); + if (header_size == -1 || byte_alignment(cm, rb)) return 0; + data += header_size; + av1_decode_tg_tiles_and_wrapup(pbi, data, data_end, p_data_end, start_tile, + end_tile, is_first_tg); + + tg_payload_size = (uint32_t)(*p_data_end - data); + + *is_last_tg = end_tile == cm->tiles.rows * cm->tiles.cols - 1; + return header_size + tg_payload_size; +} + +static void alloc_tile_list_buffer(AV1Decoder *pbi) { + // The resolution of the output frame is read out from the bitstream. The data + // are stored in the order of Y plane, U plane and V plane. As an example, for + // image format 4:2:0, the output frame of U plane and V plane is 1/4 of the + // output frame. 
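  // Worked example (informative): with 4:2:0 subsampling and a 2x2 grid of
  // 64x64-pixel tiles, the output Y plane is 128x128 pixels while U and V
  // are each 64x64, i.e. one quarter of the Y-plane area.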
+ AV1_COMMON *const cm = &pbi->common; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_width_in_pixels = tile_width * MI_SIZE; + const int tile_height_in_pixels = tile_height * MI_SIZE; + const int output_frame_width = + (pbi->output_frame_width_in_tiles_minus_1 + 1) * tile_width_in_pixels; + const int output_frame_height = + (pbi->output_frame_height_in_tiles_minus_1 + 1) * tile_height_in_pixels; + // The output frame is used to store the decoded tile list. The decoded tile + // list has to fit into 1 output frame. + assert((pbi->tile_count_minus_1 + 1) <= + (pbi->output_frame_width_in_tiles_minus_1 + 1) * + (pbi->output_frame_height_in_tiles_minus_1 + 1)); + + // Allocate the tile list output buffer. + // Note: if cm->seq_params.use_highbitdepth is 1 and cm->seq_params.bit_depth + // is 8, we could allocate less memory, namely, 8 bits/pixel. + if (aom_alloc_frame_buffer(&pbi->tile_list_outbuf, output_frame_width, + output_frame_height, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, + (cm->seq_params.use_highbitdepth && + (cm->seq_params.bit_depth > AOM_BITS_8)), + 0, cm->features.byte_alignment)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate the tile list output buffer"); +} + +static void yv12_tile_copy(const YV12_BUFFER_CONFIG *src, int hstart1, + int hend1, int vstart1, int vend1, + YV12_BUFFER_CONFIG *dst, int hstart2, int vstart2, + int plane) { + const int src_stride = (plane > 0) ? src->strides[1] : src->strides[0]; + const int dst_stride = (plane > 0) ? dst->strides[1] : dst->strides[0]; + int row, col; + + assert(src->flags & YV12_FLAG_HIGHBITDEPTH); + assert(!(dst->flags & YV12_FLAG_HIGHBITDEPTH)); + + const uint16_t *src16 = + CONVERT_TO_SHORTPTR(src->buffers[plane] + vstart1 * src_stride + hstart1); + uint8_t *dst8 = dst->buffers[plane] + vstart2 * dst_stride + hstart2; + + for (row = vstart1; row < vend1; ++row) { + for (col = 0; col < (hend1 - hstart1); ++col) *dst8++ = (uint8_t)(*src16++); + src16 += src_stride - (hend1 - hstart1); + dst8 += dst_stride - (hend1 - hstart1); + } + return; +} + +static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, + int tile_idx) { + AV1_COMMON *const cm = &pbi->common; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_width_in_pixels = tile_width * MI_SIZE; + const int tile_height_in_pixels = tile_height * MI_SIZE; + const int ssy = cm->seq_params.subsampling_y; + const int ssx = cm->seq_params.subsampling_x; + const int num_planes = av1_num_planes(cm); + + YV12_BUFFER_CONFIG *cur_frame = &cm->cur_frame->buf; + const int tr = tile_idx / (pbi->output_frame_width_in_tiles_minus_1 + 1); + const int tc = tile_idx % (pbi->output_frame_width_in_tiles_minus_1 + 1); + int plane; + + // Copy decoded tile to the tile list output buffer. + for (plane = 0; plane < num_planes; ++plane) { + const int shift_x = plane > 0 ? ssx : 0; + const int shift_y = plane > 0 ? 
ssy : 0; + const int h = tile_height_in_pixels >> shift_y; + const int w = tile_width_in_pixels >> shift_x; + + // src offset + int vstart1 = pbi->dec_tile_row * h; + int vend1 = vstart1 + h; + int hstart1 = pbi->dec_tile_col * w; + int hend1 = hstart1 + w; + // dst offset + int vstart2 = tr * h; + int hstart2 = tc * w; + + if (cm->seq_params.use_highbitdepth && + cm->seq_params.bit_depth == AOM_BITS_8) { + yv12_tile_copy(cur_frame, hstart1, hend1, vstart1, vend1, + &pbi->tile_list_outbuf, hstart2, vstart2, plane); + } else { + switch (plane) { + case 0: + aom_yv12_partial_copy_y(cur_frame, hstart1, hend1, vstart1, vend1, + &pbi->tile_list_outbuf, hstart2, vstart2); + break; + case 1: + aom_yv12_partial_copy_u(cur_frame, hstart1, hend1, vstart1, vend1, + &pbi->tile_list_outbuf, hstart2, vstart2); + break; + case 2: + aom_yv12_partial_copy_v(cur_frame, hstart1, hend1, vstart1, vend1, + &pbi->tile_list_outbuf, hstart2, vstart2); + break; + default: assert(0); + } + } + } +} + +// Only called while large_scale_tile = 1. +// +// On success, returns the tile list OBU size. On failure, sets +// pbi->common.error.error_code and returns 0. +static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi, + struct aom_read_bit_buffer *rb, + const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end, + int *frame_decoding_finished) { + AV1_COMMON *const cm = &pbi->common; + uint32_t tile_list_payload_size = 0; + const int num_tiles = cm->tiles.cols * cm->tiles.rows; + const int start_tile = 0; + const int end_tile = num_tiles - 1; + int i = 0; + + // Process the tile list info. + pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); + pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); + pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16); + if (pbi->tile_count_minus_1 > MAX_TILES - 1) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + // Allocate output frame buffer for the tile list. + alloc_tile_list_buffer(pbi); + + uint32_t tile_list_info_bytes = 4; + tile_list_payload_size += tile_list_info_bytes; + data += tile_list_info_bytes; + + int tile_idx = 0; + for (i = 0; i <= pbi->tile_count_minus_1; i++) { + // Process 1 tile. + // Reset the bit reader. + rb->bit_offset = 0; + rb->bit_buffer = data; + + // Read out the tile info. + uint32_t tile_info_bytes = 5; + // Set reference for each tile. + int ref_idx = aom_rb_read_literal(rb, 8); + if (ref_idx >= MAX_EXTERNAL_REFERENCES) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + av1_set_reference_dec(cm, cm->remapped_ref_idx[0], 1, + &pbi->ext_refs.refs[ref_idx]); + + pbi->dec_tile_row = aom_rb_read_literal(rb, 8); + pbi->dec_tile_col = aom_rb_read_literal(rb, 8); + if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 || + pbi->dec_tile_row >= cm->tiles.rows || + pbi->dec_tile_col >= cm->tiles.cols) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1; + data += tile_info_bytes; + if ((size_t)(data_end - data) < pbi->coded_tile_data_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + + av1_decode_tg_tiles_and_wrapup(pbi, data, data + pbi->coded_tile_data_size, + p_data_end, start_tile, end_tile, 0); + uint32_t tile_payload_size = (uint32_t)(*p_data_end - data); + + tile_list_payload_size += tile_info_bytes + tile_payload_size; + + // Update data ptr for next tile decoding. 
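+ // (Each tile entry in the list is 5 bytes of tile info -- 8-bit ref_idx,
+ // 8-bit dec_tile_row, 8-bit dec_tile_col, and a 16-bit field coding
+ // coded_tile_data_size - 1 -- followed by coded_tile_data_size bytes of
+ // coded tile payload; the pointer advance below skips that payload.)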
+ data = *p_data_end; + assert(data <= data_end); + + // Copy the decoded tile to the tile list output buffer. + copy_decoded_tile_to_tile_list_buffer(pbi, tile_idx); + tile_idx++; + } + + *frame_decoding_finished = 1; + return tile_list_payload_size; +} + +// Returns the last nonzero byte index in 'data'. If there is no nonzero byte in +// 'data', returns -1. +static int get_last_nonzero_byte_index(const uint8_t *data, size_t sz) { + // Scan backward and return on the first nonzero byte. + int i = (int)sz - 1; + while (i >= 0 && data[i] == 0) { + --i; + } + return i; +} + +// Allocates metadata that was read and adds it to the decoder's metadata array. +static void alloc_read_metadata(AV1Decoder *const pbi, + OBU_METADATA_TYPE metadata_type, + const uint8_t *data, size_t sz, + aom_metadata_insert_flags_t insert_flag) { + AV1_COMMON *const cm = &pbi->common; + aom_metadata_t *metadata = + aom_img_metadata_alloc(metadata_type, data, sz, insert_flag); + if (!metadata) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating metadata"); + } + if (!pbi->metadata) { + pbi->metadata = aom_img_metadata_array_alloc(1); + if (!pbi->metadata) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate metadata array"); + } + } else { + aom_metadata_t **metadata_array = + (aom_metadata_t **)realloc(pbi->metadata->metadata_array, + (pbi->metadata->sz + 1) * sizeof(metadata)); + if (!metadata_array) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating metadata"); + } + pbi->metadata->metadata_array = metadata_array; + pbi->metadata->sz++; + } + pbi->metadata->metadata_array[pbi->metadata->sz - 1] = metadata; +} + +// On success, returns the number of bytes read from 'data'. On failure, calls +// aom_internal_error and does not return. +static size_t read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data, + size_t sz) { + const int kMinItuT35PayloadSize = 2; + AV1_COMMON *const cm = &pbi->common; + if (sz == 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "itu_t_t35_country_code is missing"); + } + int bytes_read = get_last_nonzero_byte_index(data, sz); + if (bytes_read < 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "No trailing bits found on metadata"); + } + if (*data == 0xFF && bytes_read < kMinItuT35PayloadSize) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "itu_t_t35_country_code_extension_byte is missing"); + } + alloc_read_metadata(pbi, OBU_METADATA_TYPE_ITUT_T35, data, (size_t)bytes_read, + AOM_MIF_ANY_FRAME); + return (size_t)bytes_read; +} + +// On success, returns the number of bytes read from 'data'. On failure, calls +// aom_internal_error() and does not return.
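+// (Per the AV1 spec, the HDR CLL payload is two 16-bit fields, max_cll and
+// max_fall, i.e. exactly 4 bytes; kHdrCllPayloadSize below enforces this.)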
+static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data, + size_t sz) { + const int kHdrCllPayloadSize = 4; + AV1_COMMON *const cm = &pbi->common; + if (sz == 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "HDR CLL metadata payload is missing"); + } + int bytes_read = get_last_nonzero_byte_index(data, sz); + if (bytes_read < 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "No trailing bits found on metadata"); + } + if (bytes_read != kHdrCllPayloadSize) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Incorrect HDR CLL metadata payload size"); + } + alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, (size_t)bytes_read, + AOM_MIF_ANY_FRAME); + return (size_t)bytes_read; +} + +// On success, returns the number of bytes read from 'data'. On failure, calls +// aom_internal_error() and does not return. +static size_t read_metadata_hdr_mdcv(AV1Decoder *const pbi, const uint8_t *data, + size_t sz) { + const int kMdcvPayloadSize = 24; + AV1_COMMON *const cm = &pbi->common; + if (sz == 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "HDR MDCV metadata payload is missing"); + } + int bytes_read = get_last_nonzero_byte_index(data, sz); + if (bytes_read < 0) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "No trailing bits found on HDR MDCV metadata"); + } + if (bytes_read != kMdcvPayloadSize) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Incorrect HDR MDCV metadata payload size"); + } + alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, (size_t)bytes_read, + AOM_MIF_ANY_FRAME); + return (size_t)bytes_read; +} + +static void scalability_structure(struct aom_read_bit_buffer *rb) { + const int spatial_layers_cnt_minus_1 = aom_rb_read_literal(rb, 2); + const int spatial_layer_dimensions_present_flag = aom_rb_read_bit(rb); + const int spatial_layer_description_present_flag = aom_rb_read_bit(rb); + const int temporal_group_description_present_flag = aom_rb_read_bit(rb); + aom_rb_read_literal(rb, 3); // reserved + + if (spatial_layer_dimensions_present_flag) { + for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) { + aom_rb_read_literal(rb, 16); + aom_rb_read_literal(rb, 16); + } + } + if (spatial_layer_description_present_flag) { + for (int i = 0; i <= spatial_layers_cnt_minus_1; i++) { + aom_rb_read_literal(rb, 8); + } + } + if (temporal_group_description_present_flag) { + const int temporal_group_size = aom_rb_read_literal(rb, 8); + for (int i = 0; i < temporal_group_size; i++) { + aom_rb_read_literal(rb, 3); + aom_rb_read_bit(rb); + aom_rb_read_bit(rb); + const int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3); + for (int j = 0; j < temporal_group_ref_cnt; j++) { + aom_rb_read_literal(rb, 8); + } + } + } +} + +static void read_metadata_scalability(struct aom_read_bit_buffer *rb) { + const int scalability_mode_idc = aom_rb_read_literal(rb, 8); + if (scalability_mode_idc == SCALABILITY_SS) { + scalability_structure(rb); + } +} + +static void read_metadata_timecode(struct aom_read_bit_buffer *rb) { + aom_rb_read_literal(rb, 5); // counting_type f(5) + const int full_timestamp_flag = + aom_rb_read_bit(rb); // full_timestamp_flag f(1) + aom_rb_read_bit(rb); // discontinuity_flag f(1) + aom_rb_read_bit(rb); // cnt_dropped_flag f(1) + aom_rb_read_literal(rb, 9); // n_frames f(9) + if (full_timestamp_flag) { + aom_rb_read_literal(rb, 6); // seconds_value f(6) + aom_rb_read_literal(rb, 6); // minutes_value f(6) + aom_rb_read_literal(rb, 5); // hours_value f(5) + } else {
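+ // When full_timestamp_flag is 0, each time component is gated by its own
+ // flag, and an inner value can be coded only if the outer one is present;
+ // e.g., seconds_flag = 1 with minutes_flag = 0 codes just the 6-bit
+ // seconds_value and nothing further.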
+ const int seconds_flag = aom_rb_read_bit(rb); // seconds_flag f(1) + if (seconds_flag) { + aom_rb_read_literal(rb, 6); // seconds_value f(6) + const int minutes_flag = aom_rb_read_bit(rb); // minutes_flag f(1) + if (minutes_flag) { + aom_rb_read_literal(rb, 6); // minutes_value f(6) + const int hours_flag = aom_rb_read_bit(rb); // hours_flag f(1) + if (hours_flag) { + aom_rb_read_literal(rb, 5); // hours_value f(5) + } + } + } + } + // time_offset_length f(5) + const int time_offset_length = aom_rb_read_literal(rb, 5); + if (time_offset_length) { + // time_offset_value f(time_offset_length) + aom_rb_read_literal(rb, time_offset_length); + } +} + +// Returns the last nonzero byte in 'data'. If there is no nonzero byte in +// 'data', returns 0. +// +// Call this function to check the following requirement in the spec: +// This implies that when any payload data is present for this OBU type, at +// least one byte of the payload data (including the trailing bit) shall not +// be equal to 0. +static uint8_t get_last_nonzero_byte(const uint8_t *data, size_t sz) { + // Scan backward and return on the first nonzero byte. + size_t i = sz; + while (i != 0) { + --i; + if (data[i] != 0) return data[i]; + } + return 0; +} + +// Checks the metadata for correct syntax but ignores the parsed metadata. +// +// On success, returns the number of bytes read from 'data'. On failure, sets +// pbi->common.error.error_code and returns 0, or calls aom_internal_error() +// and does not return. +static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) { + AV1_COMMON *const cm = &pbi->common; + size_t type_length; + uint64_t type_value; + if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value; + if (metadata_type == 0 || metadata_type >= 6) { + // If metadata_type is reserved for future use or a user private value, + // ignore the entire OBU and just check trailing bits. + if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) { + pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + return sz; + } + if (metadata_type == OBU_METADATA_TYPE_ITUT_T35) { + size_t bytes_read = + type_length + + read_metadata_itut_t35(pbi, data + type_length, sz - type_length); + // itu_t_t35_payload_bytes is byte aligned and the first + // trailing byte should be 0x80. 
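+ // (0x80 is a single 1 bit followed by seven 0 bits -- the byte-aligned
+ // trailing-bits pattern. Illustrative check: a payload ending
+ // { 0xAB, 0x80 } passes, while { 0xAB, 0x40 } fails because the last
+ // nonzero byte is not 0x80.)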
+ if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { + pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + return sz; + } else if (metadata_type == OBU_METADATA_TYPE_HDR_CLL) { + size_t bytes_read = + type_length + + read_metadata_hdr_cll(pbi, data + type_length, sz - type_length); + if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { + pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + return sz; + } else if (metadata_type == OBU_METADATA_TYPE_HDR_MDCV) { + size_t bytes_read = + type_length + + read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length); + if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) { + pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + return sz; + } + + struct aom_read_bit_buffer rb; + av1_init_read_bit_buffer(pbi, &rb, data + type_length, data + sz); + if (metadata_type == OBU_METADATA_TYPE_SCALABILITY) { + read_metadata_scalability(&rb); + } else { + assert(metadata_type == OBU_METADATA_TYPE_TIMECODE); + read_metadata_timecode(&rb); + } + if (av1_check_trailing_bits(pbi, &rb) != 0) { + // cm->error.error_code is already set. + return 0; + } + assert((rb.bit_offset & 7) == 0); + return type_length + (rb.bit_offset >> 3); +} + +// On success, returns 'sz'. On failure, sets pbi->common.error.error_code and +// returns 0. +static size_t read_padding(AV1_COMMON *const cm, const uint8_t *data, + size_t sz) { + // The spec allows a padding OBU to be header-only (i.e., obu_size = 0). So + // check trailing bits only if sz > 0. + if (sz > 0) { + // The payload of a padding OBU is byte aligned. Therefore the first + // trailing byte should be 0x80. See https://crbug.com/aomedia/2393. + const uint8_t last_nonzero_byte = get_last_nonzero_byte(data, sz); + if (last_nonzero_byte != 0x80) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + } + return sz; +} + +// On success, returns a boolean that indicates whether the decoding of the +// current frame is finished. On failure, sets cm->error.error_code and +// returns -1. +int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end) { + AV1_COMMON *const cm = &pbi->common; + int frame_decoding_finished = 0; + int is_first_tg_obu_received = 1; + uint32_t frame_header_size = 0; + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + pbi->seen_frame_header = 0; + pbi->next_start_tile = 0; + + if (data_end < data) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + // Reset pbi->camera_frame_header_ready to 0 if cm->tiles.large_scale = 0. + if (!cm->tiles.large_scale) pbi->camera_frame_header_ready = 0; + + // decode frame as a series of OBUs + while (!frame_decoding_finished && cm->error.error_code == AOM_CODEC_OK) { + struct aom_read_bit_buffer rb; + size_t payload_size = 0; + size_t decoded_payload_size = 0; + size_t obu_payload_offset = 0; + size_t bytes_read = 0; + const size_t bytes_available = data_end - data; + + if (bytes_available == 0 && !pbi->seen_frame_header) { + *p_data_end = data; + cm->error.error_code = AOM_CODEC_OK; + break; + } + + aom_codec_err_t status = + aom_read_obu_header_and_size(data, bytes_available, pbi->is_annexb, + &obu_header, &payload_size, &bytes_read); + + if (status != AOM_CODEC_OK) { + cm->error.error_code = status; + return -1; + } + + // Record obu size header information. 
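+ // (obu_header.size is the length of the OBU header itself -- 1 byte, or 2
+ // with the extension byte -- so the span recorded here is exactly the
+ // leb128 obu_size field; e.g., a 1-byte header with a 2-byte leb128 size
+ // gives bytes_read = 3 and a 2-byte obu_size_hdr.)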
+ pbi->obu_size_hdr.data = data + obu_header.size; + pbi->obu_size_hdr.size = bytes_read - obu_header.size; + + // Note: aom_read_obu_header_and_size() takes care of checking that this + // doesn't cause 'data' to advance past 'data_end'. + data += bytes_read; + + if ((size_t)(data_end - data) < payload_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + cm->temporal_layer_id = obu_header.temporal_layer_id; + cm->spatial_layer_id = obu_header.spatial_layer_id; + + if (obu_header.type != OBU_TEMPORAL_DELIMITER && + obu_header.type != OBU_SEQUENCE_HEADER && + obu_header.type != OBU_PADDING) { + // don't decode obu if it's not in current operating mode + if (!is_obu_in_current_operating_point(pbi, obu_header)) { + data += payload_size; + continue; + } + } + + av1_init_read_bit_buffer(pbi, &rb, data, data + payload_size); + + switch (obu_header.type) { + case OBU_TEMPORAL_DELIMITER: + decoded_payload_size = read_temporal_delimiter_obu(); + pbi->seen_frame_header = 0; + pbi->next_start_tile = 0; + break; + case OBU_SEQUENCE_HEADER: + decoded_payload_size = read_sequence_header_obu(pbi, &rb); + if (cm->error.error_code != AOM_CODEC_OK) return -1; + // The sequence header should not change in the middle of a frame. + if (pbi->sequence_header_changed && pbi->seen_frame_header) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + break; + case OBU_FRAME_HEADER: + case OBU_REDUNDANT_FRAME_HEADER: + case OBU_FRAME: + if (obu_header.type == OBU_REDUNDANT_FRAME_HEADER) { + if (!pbi->seen_frame_header) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } else { + // OBU_FRAME_HEADER or OBU_FRAME. + if (pbi->seen_frame_header) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } + // Only decode first frame header received + if (!pbi->seen_frame_header || + (cm->tiles.large_scale && !pbi->camera_frame_header_ready)) { + frame_header_size = read_frame_header_obu( + pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME); + pbi->seen_frame_header = 1; + if (!pbi->ext_tile_debug && cm->tiles.large_scale) + pbi->camera_frame_header_ready = 1; + } else { + // TODO(wtc): Verify that the frame_header_obu is identical to the + // original frame_header_obu. For now just skip frame_header_size + // bytes in the bit buffer. + if (frame_header_size > payload_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + assert(rb.bit_offset == 0); + rb.bit_offset = 8 * frame_header_size; + } + + decoded_payload_size = frame_header_size; + pbi->frame_header_size = frame_header_size; + + if (cm->show_existing_frame) { + if (obu_header.type == OBU_FRAME) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return -1; + } + frame_decoding_finished = 1; + pbi->seen_frame_header = 0; + break; + } + + // In large scale tile coding, decode the common camera frame header + // before any tile list OBU. + if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) { + frame_decoding_finished = 1; + // Skip the rest of the frame data. + decoded_payload_size = payload_size; + // Update data_end. + *p_data_end = data_end; + break; + } + + if (obu_header.type != OBU_FRAME) break; + obu_payload_offset = frame_header_size; + // Byte align the reader before reading the tile group. + // byte_alignment() has set cm->error.error_code if it returns -1. + if (byte_alignment(cm, &rb)) return -1; + AOM_FALLTHROUGH_INTENDED; // fall through to read tile group. 
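+ // (For OBU_FRAME the frame header and tile group share one OBU payload:
+ // obu_payload_offset was set to frame_header_size above, so the
+ // tile-group parse below starts right after the byte-aligned frame
+ // header.)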
+ case OBU_TILE_GROUP: + if (!pbi->seen_frame_header) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + if (obu_payload_offset > payload_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + decoded_payload_size += read_one_tile_group_obu( + pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset, + data + payload_size, p_data_end, &frame_decoding_finished, + obu_header.type == OBU_FRAME); + if (cm->error.error_code != AOM_CODEC_OK) return -1; + is_first_tg_obu_received = 0; + if (frame_decoding_finished) pbi->seen_frame_header = 0; + break; + case OBU_METADATA: + decoded_payload_size = read_metadata(pbi, data, payload_size); + if (cm->error.error_code != AOM_CODEC_OK) return -1; + break; + case OBU_TILE_LIST: + if (CONFIG_NORMAL_TILE_MODE) { + cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; + return -1; + } + + // This OBU type is purely for the large scale tile coding mode. + // The common camera frame header has to be already decoded. + if (!pbi->camera_frame_header_ready) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + cm->tiles.large_scale = 1; + av1_set_single_tile_decoding_mode(cm); + decoded_payload_size = + read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size, + p_data_end, &frame_decoding_finished); + if (cm->error.error_code != AOM_CODEC_OK) return -1; + break; + case OBU_PADDING: + decoded_payload_size = read_padding(&pbi->common, data, payload_size); + if (cm->error.error_code != AOM_CODEC_OK) return -1; + break; + default: + // Skip unrecognized OBUs + if (payload_size > 0 && + get_last_nonzero_byte(data, payload_size) == 0) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + decoded_payload_size = payload_size; + break; + } + + // Check that the signalled OBU size matches the actual amount of data read + if (decoded_payload_size > payload_size) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + + // If there are extra padding bytes, they should all be zero + while (decoded_payload_size < payload_size) { + uint8_t padding_byte = data[decoded_payload_size++]; + if (padding_byte != 0) { + cm->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return -1; + } + } + + data += payload_size; + } + + if (cm->error.error_code != AOM_CODEC_OK) return -1; + return frame_decoding_finished; +} diff --git a/libs/libaom/src/av1/decoder/obu.h b/libs/libaom/src/av1/decoder/obu.h new file mode 100644 index 000000000..d8ebe368e --- /dev/null +++ b/libs/libaom/src/av1/decoder/obu.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_DECODER_OBU_H_ +#define AOM_AV1_DECODER_OBU_H_ + +#include "aom/aom_codec.h" +#include "av1/decoder/decoder.h" + +// Try to decode one frame from a buffer. 
+// Returns 1 if we decoded a frame, +// 0 if we didn't decode a frame but that's okay +// (e.g., if there was a frame but we skipped it), +// or -1 on error +int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data, + const uint8_t *data_end, + const uint8_t **p_data_end); + +aom_codec_err_t aom_get_num_layers_from_operating_point_idc( + int operating_point_idc, unsigned int *number_spatial_layers, + unsigned int *number_temporal_layers); + +#endif // AOM_AV1_DECODER_OBU_H_ diff --git a/libs/libaom/src/av1/encoder/aq_complexity.c b/libs/libaom/src/av1/encoder/aq_complexity.c new file mode 100644 index 000000000..36580063d --- /dev/null +++ b/libs/libaom/src/av1/encoder/aq_complexity.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <limits.h> +#include <math.h> + +#include "av1/encoder/aq_complexity.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/encodeframe.h" +#include "av1/common/seg_common.h" +#include "av1/encoder/segmentation.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/system_state.h" + +#define AQ_C_SEGMENTS 5 +#define DEFAULT_AQ2_SEG 3 // Neutral Q segment +#define AQ_C_STRENGTHS 3 +static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { 1.75, 1.25, 1.05, 1.00, 0.90 }, + { 2.00, 1.50, 1.15, 1.00, 0.85 }, + { 2.50, 1.75, 1.25, 1.00, 0.80 } +}; +static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { 0.15, 0.30, 0.55, 2.00, 100.0 }, + { 0.20, 0.40, 0.65, 2.00, 100.0 }, + { 0.25, 0.50, 0.75, 2.00, 100.0 } }; +static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = { + { -4.0, -3.0, -2.0, 100.00, 100.0 }, + { -3.5, -2.5, -1.5, 100.00, 100.0 }, + { -3.0, -2.0, -1.0, 100.00, 100.0 } }; + +static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) { + // Approximate base quantizer (truncated to int) + const int base_quant = av1_ac_quant_QTX(q_index, 0, bit_depth) / 4; + return (base_quant > 10) + (base_quant > 25); +} + +static bool is_frame_aq_enabled(const AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + + return frame_is_intra_only(cm) || cm->features.error_resilient_mode || + cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); +} + +// Segmentation only makes sense if the target bits per SB is above a threshold. +// Below this the overheads will usually outweigh any benefit. +static bool is_sb_aq_enabled(const AV1_COMP *const cpi) { + return cpi->rc.sb64_target_rate >= 256; +} + +void av1_setup_in_frame_q_adj(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int base_qindex = cm->quant_params.base_qindex; + struct segmentation *const seg = &cm->seg; + const int resolution_change = + cm->prev_frame && (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height); + + // Make SURE use of floating point in this function is safe.
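+ // (aom_clear_system_state() resets the x87/MMX register state on x86
+ // builds so the double arithmetic below behaves as expected.)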
+ aom_clear_system_state(); + + if (resolution_change) { + memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + av1_clearall_segfeatures(seg); + av1_disable_segmentation(seg); + return; + } + + if (is_frame_aq_enabled(cpi)) { + int segment; + const int aq_strength = + get_aq_c_strength(base_qindex, cm->seq_params.bit_depth); + + // Clear down the segment map. + memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG, + cm->mi_params.mi_rows * cm->mi_params.mi_cols); + + av1_clearall_segfeatures(seg); + + if (!is_sb_aq_enabled(cpi)) { + av1_disable_segmentation(seg); + return; + } + + av1_enable_segmentation(seg); + + // Default segment "Q" feature is disabled so it defaults to the baseline Q. + av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q); + + // Use some of the segments for in-frame Q adjustment. + for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) { + int qindex_delta; + + if (segment == DEFAULT_AQ2_SEG) continue; + + qindex_delta = av1_compute_qdelta_by_rate( + &cpi->rc, cm->current_frame.frame_type, base_qindex, + aq_c_q_adj_factor[aq_strength][segment], cm->seq_params.bit_depth); + + // For AQ complexity mode, we don't allow Q0 in a segment if the base + // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment + // Q delta is sometimes applied without going back around the rd loop. + // This could lead to an illegal combination of partition size and q. + if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) { + qindex_delta = -base_qindex + 1; + } + if ((base_qindex + qindex_delta) > 0) { + av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q); + av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta); + } + } + } +} + +#define DEFAULT_LV_THRESH 10.0 +#define MIN_DEFAULT_LV_THRESH 8.0 +// Select a segment for the current block. +// The choice of segment for a block depends on the ratio of the projected +// bits for the block vs a target average and its spatial complexity. +void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs, + int mi_row, int mi_col, int projected_rate) { + if ((!is_frame_aq_enabled(cpi)) || (!is_sb_aq_enabled(cpi))) return; + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + + const int mi_offset = mi_row * cm->mi_params.mi_cols + mi_col; + const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, mi_size_wide[bs]); + const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, mi_size_high[bs]); + int x, y; + int i; + unsigned char segment; + + if (0) { + segment = DEFAULT_AQ2_SEG; + } else { + // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh). + // It is converted to bits << AV1_PROB_COST_SHIFT units. + const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis) + << AV1_PROB_COST_SHIFT; + const int denom = cm->seq_params.mib_size * cm->seq_params.mib_size; + const int target_rate = (int)(num / denom); + double logvar; + double low_var_thresh; + const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex, + cm->seq_params.bit_depth); + + aom_clear_system_state(); + low_var_thresh = + (is_stat_consumption_stage_twopass(cpi)) + ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH) + : DEFAULT_LV_THRESH; + + av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs); + logvar = av1_log_block_var(cpi, mb, bs); + + segment = AQ_C_SEGMENTS - 1; // Just in case no break out below. + for (i = 0; i < AQ_C_SEGMENTS; ++i) { + // Test rate against a threshold value and variance against a threshold.
+ // Increasing segment number (higher variance and complexity) = higher Q. + if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) && + (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) { + segment = i; + break; + } + } + } + + // Fill in the entries in the segment map corresponding to this SB64. + for (y = 0; y < ymis; y++) { + for (x = 0; x < xmis; x++) { + cpi->enc_seg.map[mi_offset + y * cm->mi_params.mi_cols + x] = segment; + } + } +} diff --git a/libs/libaom/src/av1/encoder/aq_complexity.h b/libs/libaom/src/av1/encoder/aq_complexity.h new file mode 100644 index 000000000..3421d74c9 --- /dev/null +++ b/libs/libaom/src/av1/encoder/aq_complexity.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ +#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/common/enums.h" + +struct AV1_COMP; +struct macroblock; + +// Select a segment for the current block. +void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *, + BLOCK_SIZE bs, int mi_row, int mi_col, + int projected_rate); + +// This function sets up a set of segments with delta Q values around +// the baseline frame quantizer. +void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_ diff --git a/libs/libaom/src/av1/encoder/aq_cyclicrefresh.c b/libs/libaom/src/av1/encoder/aq_cyclicrefresh.c new file mode 100644 index 000000000..b8884942a --- /dev/null +++ b/libs/libaom/src/av1/encoder/aq_cyclicrefresh.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <limits.h> +#include <math.h> + +#include "av1/common/seg_common.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/segmentation.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/system_state.h" + +CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) { + size_t last_coded_q_map_size; + CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr)); + if (cr == NULL) return NULL; + + cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map)); + if (cr->map == NULL) { + av1_cyclic_refresh_free(cr); + return NULL; + } + last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map); + cr->last_coded_q_map = aom_malloc(last_coded_q_map_size); + if (cr->last_coded_q_map == NULL) { + av1_cyclic_refresh_free(cr); + return NULL; + } + assert(MAXQ <= 255); + memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size); + cr->avg_frame_low_motion = 0.0; + return cr; +} + +void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) { + if (cr != NULL) { + aom_free(cr->map); + aom_free(cr->last_coded_q_map); + aom_free(cr); + } +} + +// Check if this coding block, of size bsize, should be considered for refresh +// (lower-qp coding). Decision can be based on various factors, such as +// size of the coding block (i.e., below min_block size rejected), coding +// mode, and rate/distortion. +static int candidate_refresh_aq(const CYCLIC_REFRESH *cr, + const MB_MODE_INFO *mbmi, int64_t rate, + int64_t dist, int bsize) { + MV mv = mbmi->mv[0].as_mv; + // Reject the block for lower-qp coding if projected distortion + // is above the threshold, and any of the following is true: + // 1) mode uses large mv + // 2) mode is an intra-mode + // Otherwise accept for refresh. + if (dist > cr->thresh_dist_sb && + (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh || + mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh || + !is_inter_block(mbmi))) + return CR_SEGMENT_ID_BASE; + else if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb && + is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 && + cr->rate_boost_fac > 10) + // More aggressive delta-q for bigger blocks with zero motion. + return CR_SEGMENT_ID_BOOST2; + else + return CR_SEGMENT_ID_BOOST1; +} + +// Compute delta-q for the segment. +static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) { + const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const RATE_CONTROL *const rc = &cpi->rc; + int deltaq = + av1_compute_qdelta_by_rate(rc, cpi->common.current_frame.frame_type, q, + rate_factor, cpi->common.seq_params.bit_depth); + if ((-deltaq) > cr->max_qdelta_perc * q / 100) { + deltaq = -cr->max_qdelta_perc * q / 100; + } + return deltaq; +} + +// For the just encoded frame, estimate the bits, incorporating the delta-q +// from the non-base segments. For now ignore the effect of multiple segments +// (with different delta-q). Note this function is called in the postencode +// (called from rc_update_rate_correction_factors()). +int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi, + double correction_factor) { + const AV1_COMMON *const cm = &cpi->common; + const FRAME_TYPE frame_type = cm->current_frame.frame_type; + const int base_qindex = cm->quant_params.base_qindex; + const int bit_depth = cm->seq_params.bit_depth; + const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int mbs = cm->mi_params.MBs; + const int num4x4bl = mbs << 4; + // Weight for non-base segments: use actual number of blocks refreshed in + // previous/just encoded frame.
Note: the number of blocks here is in 4x4 units. + const double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl; + const double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl; + // Take segment weighted average for estimated bits. + const int estimated_bits = + (int)((1.0 - weight_segment1 - weight_segment2) * + av1_estimate_bits_at_q(frame_type, base_qindex, mbs, + correction_factor, bit_depth) + + weight_segment1 * av1_estimate_bits_at_q( + frame_type, base_qindex + cr->qindex_delta[1], + mbs, correction_factor, bit_depth) + + weight_segment2 * av1_estimate_bits_at_q( + frame_type, base_qindex + cr->qindex_delta[2], + mbs, correction_factor, bit_depth)); + return estimated_bits; +} + +// Prior to encoding the frame, estimate the bits per mb, for a given q = i and +// a corresponding delta-q (for segment 1). This function is called from +// rc_regulate_q() to set the base qp index. +// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or +// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding. +int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i, + double correction_factor) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int bits_per_mb; + int num4x4bl = cm->mi_params.MBs << 4; + // Weight for segment prior to encoding: take the average of the target + // number for the frame to be encoded and the actual from the previous frame. + double weight_segment = + (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks + + cr->actual_num_seg2_blocks) >> + 1) / + num4x4bl; + // Compute delta-q corresponding to qindex i. + int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + // Take segment weighted average for bits per mb. + bits_per_mb = + (int)((1.0 - weight_segment) * + av1_rc_bits_per_mb(cm->current_frame.frame_type, i, + correction_factor, + cm->seq_params.bit_depth) + + weight_segment * av1_rc_bits_per_mb(cm->current_frame.frame_type, + i + deltaq, correction_factor, + cm->seq_params.bit_depth)); + return bits_per_mb; +} + +// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), +// check if we should reset the segment_id, and update the cyclic_refresh map +// and segmentation map. +void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, + MB_MODE_INFO *const mbmi, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int64_t rate, int64_t dist, int skip) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw); + const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh); + const int block_index = mi_row * cm->mi_params.mi_cols + mi_col; + const int refresh_this_block = + candidate_refresh_aq(cr, mbmi, rate, dist, bsize); + // Default is to not update the refresh map. + int new_map_value = cr->map[block_index]; + + // If this block is labeled for refresh, check if we should reset the + // segment_id. + if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) { + mbmi->segment_id = refresh_this_block; + // Reset segment_id if the block will be skipped. + if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE; + } + + // Update the cyclic refresh map, to be used for setting segmentation map + // for the next frame. If the block will be refreshed this frame, mark it + // as clean.
The magnitude of the negative value influences how long before we consider + // it for refresh again. + if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) { + new_map_value = -cr->time_for_refresh; + } else if (refresh_this_block) { + // Else if it is accepted as a candidate for refresh, and has not already + // been refreshed (marked as 1) then mark it as a candidate for cleanup + // for future time (marked as 0), otherwise don't update it. + if (cr->map[block_index] == 1) new_map_value = 0; + } else { + // Leave it marked as a block that is not a candidate for refresh. + new_map_value = 1; + } + + // Update entries in the cyclic refresh map with new_map_value, and + // copy mbmi->segment_id into global segmentation map. + for (int y = 0; y < ymis; y++) + for (int x = 0; x < xmis; x++) { + int map_offset = block_index + y * cm->mi_params.mi_cols + x; + cr->map[map_offset] = new_map_value; + cpi->enc_seg.map[map_offset] = mbmi->segment_id; + } +} + +// Update some stats after frame encoding is done. +void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + unsigned char *const seg_map = cpi->enc_seg.map; + cr->cnt_zeromv = 0; + cr->actual_num_seg1_blocks = 0; + cr->actual_num_seg2_blocks = 0; + for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row++) { + for (int mi_col = 0; mi_col < mi_params->mi_cols; mi_col++) { + MB_MODE_INFO **mi = + mi_params->mi_grid_base + mi_row * mi_params->mi_stride + mi_col; + MV mv = mi[0]->mv[0].as_mv; + if (cm->seg.enabled) { + int map_index = mi_row * mi_params->mi_cols + mi_col; + if (cyclic_refresh_segment_id(seg_map[map_index]) == + CR_SEGMENT_ID_BOOST1) + cr->actual_num_seg1_blocks++; + else if (cyclic_refresh_segment_id(seg_map[map_index]) == + CR_SEGMENT_ID_BOOST2) + cr->actual_num_seg2_blocks++; + } + // Count blocks with (near) zero motion. + if (is_inter_block(mi[0]) && abs(mv.row) < 16 && abs(mv.col) < 16) + cr->cnt_zeromv++; + } + } + cr->cnt_zeromv = + 100 * cr->cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols); + cr->avg_frame_low_motion = + (3 * cr->avg_frame_low_motion + (double)cr->cnt_zeromv) / 4; +} + +// Set golden frame update interval, for 1 pass CBR mode. +void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) { + RATE_CONTROL *const rc = &cpi->rc; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // Set minimum gf_interval for GF update to a multiple of the refresh period, + // with some max limit. Depending on past encoding stats, GF flag may be + // reset and update may not occur until next baseline_gf_interval. + if (cr->percent_refresh > 0) + rc->baseline_gf_interval = AOMMIN(2 * (100 / cr->percent_refresh), 40); + else + rc->baseline_gf_interval = 20; + if (cr->avg_frame_low_motion < 40) rc->baseline_gf_interval = 8; +} + +// Update the segmentation map, and related quantities: cyclic refresh map, +// refresh sb_index, and target number of blocks to be refreshed. +// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to +// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock. +// Blocks labeled as BOOST1 may later get set to BOOST2 (during the +// encoding of the superblock).
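+// Refresh-map value convention (from the update logic above): 1 means the
+// block is not a refresh candidate, 0 means it is a candidate, and a
+// negative value means it was recently refreshed and counts up by one per
+// map update until it becomes a candidate again.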
+static void cyclic_refresh_update_map(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + unsigned char *const seg_map = cpi->enc_seg.map; + int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; + int xmis, ymis, x, y; + memset(seg_map, CR_SEGMENT_ID_BASE, mi_params->mi_rows * mi_params->mi_cols); + sb_cols = (mi_params->mi_cols + cm->seq_params.mib_size - 1) / + cm->seq_params.mib_size; + sb_rows = (mi_params->mi_rows + cm->seq_params.mib_size - 1) / + cm->seq_params.mib_size; + sbs_in_frame = sb_cols * sb_rows; + // Number of target blocks to get the q delta (segment 1). + block_count = + cr->percent_refresh * mi_params->mi_rows * mi_params->mi_cols / 100; + // Set the segmentation map: cycle through the superblocks, starting at + // cr->sb_index, and stopping when either block_count blocks have been found + // to be refreshed, or we have passed through the whole frame. + if (cr->sb_index >= sbs_in_frame) cr->sb_index = 0; + assert(cr->sb_index < sbs_in_frame); + i = cr->sb_index; + cr->target_num_seg_blocks = 0; + do { + int sum_map = 0; + // Get the mi_row/mi_col corresponding to superblock index i. + int sb_row_index = (i / sb_cols); + int sb_col_index = i - sb_row_index * sb_cols; + int mi_row = sb_row_index * cm->seq_params.mib_size; + int mi_col = sb_col_index * cm->seq_params.mib_size; + // TODO(any): Ensure the population of + // cpi->common.features.allow_screen_content_tools and use the same instead + // of cpi->oxcf.content == AOM_CONTENT_SCREEN + int qindex_thresh = cpi->oxcf.content == AOM_CONTENT_SCREEN + ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, + cm->quant_params.base_qindex) + : 0; + assert(mi_row >= 0 && mi_row < mi_params->mi_rows); + assert(mi_col >= 0 && mi_col < mi_params->mi_cols); + bl_index = mi_row * mi_params->mi_cols + mi_col; + // Loop through all MI blocks in superblock and update map. + xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params.mib_size); + ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params.mib_size); + for (y = 0; y < ymis; y++) { + for (x = 0; x < xmis; x++) { + const int bl_index2 = bl_index + y * mi_params->mi_cols + x; + // If the block is a candidate for cleanup then mark it + // for possible boost/refresh (segment 1). The segment id may get + // reset to 0 later if the block gets coded as anything other than + // GLOBALMV. + if (cr->map[bl_index2] == 0) { + if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++; + } else if (cr->map[bl_index2] < 0) { + cr->map[bl_index2]++; + } + } + } + // Enforce constant segment over superblock. + // If segment is at least half of superblock, set to 1. + if (sum_map >= xmis * ymis / 2) { + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) { + seg_map[bl_index + y * mi_params->mi_cols + x] = CR_SEGMENT_ID_BOOST1; + } + cr->target_num_seg_blocks += xmis * ymis; + } + i++; + if (i == sbs_in_frame) { + i = 0; + } + } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index); + cr->sb_index = i; +} + +// Set cyclic refresh parameters. +void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) { + // TODO(marpan): Parameters need to be tuned.
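+ // Rough worked example of the defaults below: percent_refresh = 10
+ // targets ~10% of the frame per cycle, so with zero motion a block is
+ // revisited about every 100 / percent_refresh = 10 frames.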
+ const RATE_CONTROL *const rc = &cpi->rc; + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int num4x4bl = cm->mi_params.MBs << 4; + int target_refresh = 0; + double weight_segment_target = 0; + double weight_segment = 0; + int qp_thresh = AOMMIN(20, rc->best_quality << 1); + int qp_max_thresh = 118 * MAXQ >> 7; + cr->apply_cyclic_refresh = 1; + if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf) || + cpi->svc.temporal_layer_id > 0 || + rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || + (rc->frames_since_key > 20 && + rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) || + (cr->avg_frame_low_motion < 45 && rc->frames_since_key > 40)) { + cr->apply_cyclic_refresh = 0; + return; + } + cr->percent_refresh = 10; + cr->max_qdelta_perc = 60; + cr->time_for_refresh = 0; + cr->motion_thresh = 32; + cr->rate_boost_fac = 15; + // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4) + // periods of the refresh cycle, after a key frame. + // Account for larger interval on base layer for temporal layers. + if (cr->percent_refresh > 0 && + rc->frames_since_key < 400 / cr->percent_refresh) { + cr->rate_ratio_qdelta = 3.0; + } else { + cr->rate_ratio_qdelta = 2.0; + } + // Adjust some parameters for low resolutions. + if (cm->width * cm->height <= 352 * 288) { + if (rc->avg_frame_bandwidth < 3000) { + cr->motion_thresh = 16; + cr->rate_boost_fac = 13; + } else { + cr->max_qdelta_perc = 70; + cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.5); + } + } + if (cpi->oxcf.rc_mode == AOM_VBR) { + // To be adjusted for VBR mode, e.g., based on gf period and boost. + // For now use smaller qp-delta (than CBR), no second boosted seg, and + // turn off refresh on a golden refresh (since it's already boosted). + cr->percent_refresh = 10; + cr->rate_ratio_qdelta = 1.5; + cr->rate_boost_fac = 10; + if (cpi->refresh_golden_frame == 1) { + cr->percent_refresh = 0; + cr->rate_ratio_qdelta = 1.0; + } + } + // Weight for segment prior to encoding: take the average of the target + // number for the frame to be encoded and the actual from the previous frame. + // Use the target if it's less. To be used for setting the base qp for the + // frame in av1_rc_regulate_q. + target_refresh = + cr->percent_refresh * cm->mi_params.mi_rows * cm->mi_params.mi_cols / 100; + weight_segment_target = (double)(target_refresh) / num4x4bl; + weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks + + cr->actual_num_seg2_blocks) >> + 1) / + num4x4bl; + if (weight_segment_target < 7 * weight_segment / 8) + weight_segment = weight_segment_target; + cr->weight_segment = weight_segment; +} + +// Setup cyclic background refresh: set delta q and segmentation map. +void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + struct segmentation *const seg = &cm->seg; + int resolution_change = + cm->prev_frame && (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height); + if (resolution_change) av1_cyclic_refresh_reset_resize(cpi); + if (cm->current_frame.frame_number == 0) cr->low_content_avg = 0.0; + if (!cr->apply_cyclic_refresh) { + // Set segmentation map to 0 and disable.
+ unsigned char *const seg_map = cpi->enc_seg.map; + memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + av1_disable_segmentation(&cm->seg); + if (cm->current_frame.frame_type == KEY_FRAME) { + memset(cr->last_coded_q_map, MAXQ, + cm->mi_params.mi_rows * cm->mi_params.mi_cols * + sizeof(*cr->last_coded_q_map)); + cr->sb_index = 0; + } + return; + } else { + const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex, + cm->seq_params.bit_depth); + aom_clear_system_state(); + // Set rate threshold to some multiple (set to 2 for now) of the target + // rate (target is given by sb64_target_rate and scaled by 256). + cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2; + // Distortion threshold, quadratic in Q, scale factor to be adjusted. + // q will not exceed 457, so (q * q) is within 32bit; see: + // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[]. + cr->thresh_dist_sb = ((int64_t)(q * q)) << 2; + + // Set up segmentation. + // Clear down the segment map. + av1_enable_segmentation(&cm->seg); + av1_clearall_segfeatures(seg); + + // Note: setting temporal_update has no effect, as the seg-map coding method + // (temporal or spatial) is determined in + // av1_choose_segmap_coding_method(), + // based on the coding cost of each method. When error_resilient mode is + // on, the last_frame_seg_map is set to 0, so if temporal coding is used, + // it is relative to an all-zero previous map. + // seg->temporal_update = 0; + + // Segment BASE "Q" feature is disabled so it defaults to the baseline Q. + av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q); + // Use segment BOOST1 for in-frame Q adjustment. + av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q); + // Use segment BOOST2 for more aggressive in-frame Q adjustment. + av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q); + + // Set the q delta for segment BOOST1. + const CommonQuantParams *const quant_params = &cm->quant_params; + int qindex_delta = + compute_deltaq(cpi, quant_params->base_qindex, cr->rate_ratio_qdelta); + cr->qindex_delta[1] = qindex_delta; + + // Compute rd-mult for segment BOOST1. + const int qindex2 = clamp( + quant_params->base_qindex + quant_params->y_dc_delta_q + qindex_delta, + 0, MAXQ); + cr->rdmult = av1_compute_rd_mult(cpi, qindex2); + + av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta); + + // Set a more aggressive (higher) q delta for segment BOOST2. + qindex_delta = compute_deltaq( + cpi, quant_params->base_qindex, + AOMMIN(CR_MAX_RATE_TARGET_RATIO, + 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta)); + cr->qindex_delta[2] = qindex_delta; + av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); + + // Update the segmentation and refresh map. + cyclic_refresh_update_map(cpi); + } +} + +int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) { + return cr->rdmult; +} + +void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + memset(cr->map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + cr->sb_index = 0; + cpi->refresh_golden_frame = 1; +} diff --git a/libs/libaom/src/av1/encoder/aq_cyclicrefresh.h b/libs/libaom/src/av1/encoder/aq_cyclicrefresh.h new file mode 100644 index 000000000..ee62f6aaa --- /dev/null +++ b/libs/libaom/src/av1/encoder/aq_cyclicrefresh.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ +#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_ + +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The segment ids used in cyclic refresh: from base (no boost) to increasing +// boost (higher delta-qp). +#define CR_SEGMENT_ID_BASE 0 +#define CR_SEGMENT_ID_BOOST1 1 +#define CR_SEGMENT_ID_BOOST2 2 + +// Maximum rate target ratio for setting segment delta-qp. +#define CR_MAX_RATE_TARGET_RATIO 4.0 + +struct CYCLIC_REFRESH { + // Percentage of blocks per frame that are targeted as candidates + // for cyclic refresh. + int percent_refresh; + // Maximum q-delta as percentage of base q. + int max_qdelta_perc; + // Superblock starting index for cycling through the frame. + int sb_index; + // Controls how long a block will need to wait to be refreshed again, in + // excess of the cycle time, i.e., in the case of all zero motion, a block + // will be refreshed every (100/percent_refresh + time_for_refresh) frames. + int time_for_refresh; + // Target number of (4x4) blocks that are set for delta-q. + int target_num_seg_blocks; + // Actual number of (4x4) blocks that were applied delta-q. + int actual_num_seg1_blocks; + int actual_num_seg2_blocks; + // RD multiplier for segment 1. + int rdmult; + // Cyclic refresh map. + int8_t *map; + // Map of the last q a block was coded at. + uint8_t *last_coded_q_map; + // Thresholds applied to the projected rate/distortion of the coding block, + // when deciding whether the block should be refreshed. + int64_t thresh_rate_sb; + int64_t thresh_dist_sb; + // Threshold applied to the motion vector (in units of 1/8 pel) of the + // coding block, when deciding whether the block should be refreshed. + int16_t motion_thresh; + // Rate target ratio to set q delta. + double rate_ratio_qdelta; + // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2. + int rate_boost_fac; + double low_content_avg; + int qindex_delta[3]; + double weight_segment; + int apply_cyclic_refresh; + int cnt_zeromv; + double avg_frame_low_motion; +}; + +struct AV1_COMP; + +typedef struct CYCLIC_REFRESH CYCLIC_REFRESH; + +CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols); + +void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr); + +// Estimate the bits, incorporating the delta-q from segment 1, after encoding +// the frame. +int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi, + double correction_factor); + +// Estimate the bits per mb, for a given q = i and a corresponding delta-q +// (for segment 1), prior to encoding the frame. +int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i, + double correction_factor); + +// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col), +// check if we should reset the segment_id, and update the cyclic_refresh map +// and segmentation map.
+void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
+                                       MB_MODE_INFO *const mbmi, int mi_row,
+                                       int mi_col, BLOCK_SIZE bsize,
+                                       int64_t rate, int64_t dist, int skip);
+
+// Update some stats after the frame has been encoded.
+void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi);
+
+// Set golden frame update interval, for 1 pass CBR mode.
+void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
+
+// Set/update global/frame level refresh parameters.
+void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
+
+// Set up cyclic background refresh: set delta q and segmentation map.
+void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi);
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi);
+
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+  return segment_id == CR_SEGMENT_ID_BOOST1 ||
+         segment_id == CR_SEGMENT_ID_BOOST2;
+}
+
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+  if (segment_id == CR_SEGMENT_ID_BOOST1)
+    return CR_SEGMENT_ID_BOOST1;
+  else if (segment_id == CR_SEGMENT_ID_BOOST2)
+    return CR_SEGMENT_ID_BOOST2;
+  else
+    return CR_SEGMENT_ID_BASE;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/libs/libaom/src/av1/encoder/aq_variance.c b/libs/libaom/src/av1/encoder/aq_variance.c
new file mode 100644
index 000000000..4176da292
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/aq_variance.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/dwt.h"
+#include "aom_ports/system_state.h"
+
+static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
+                                                 0.9, .8,  .7,  .6 };
+
+static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5,  2.0, 1.5, 1.0,
+                                                        0.75, 1.0, 1.0, 1.0 };
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+#define ENERGY_IN_BOUNDS(energy) \
+  assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+DECLARE_ALIGNED(16, static const uint16_t,
+                av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
+
+void av1_vaq_frame_setup(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const int base_qindex = cm->quant_params.base_qindex;
+  struct segmentation *seg = &cm->seg;
+  int i;
+
+  int resolution_change =
+      cm->prev_frame && (cm->width != cm->prev_frame->width ||
+                         cm->height != cm->prev_frame->height);
+  int avg_energy = (int)(cpi->twopass.mb_av_energy - 2);
+  double avg_ratio;
+  if (avg_energy > 7) avg_energy = 7;
+  if (avg_energy < 0) avg_energy = 0;
+  avg_ratio = rate_ratio[avg_energy];
+
+  if (resolution_change) {
+    memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
+    av1_clearall_segfeatures(seg);
+    aom_clear_system_state();
+    av1_disable_segmentation(seg);
+    return;
+  }
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      cpi->refresh_alt_ref_frame ||
+      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+    cpi->vaq_refresh = 1;
+
+    av1_enable_segmentation(seg);
+    av1_clearall_segfeatures(seg);
+
+    aom_clear_system_state();
+
+    for (i = 0; i < MAX_SEGMENTS; ++i) {
+      // Set the average segment's rate ratio to 1.0 and adjust the other
+      // segments around it.
+      int qindex_delta = av1_compute_qdelta_by_rate(
+          &cpi->rc, cm->current_frame.frame_type, base_qindex,
+          rate_ratio[i] / avg_ratio, cm->seq_params.bit_depth);
+
+      // We don't allow qindex 0 in a segment if the base value is not 0.
+      // Q index 0 (lossless) implies 4x4 encoding only, and in AQ mode a
+      // segment Q delta is sometimes applied without going back around the
+      // rd loop. This could lead to an illegal combination of partition size
+      // and q.
+      if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+        qindex_delta = -base_qindex + 1;
+      }
+
+      av1_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+      av1_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+    }
+  }
+}
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  // This function returns a score for the block's local variance, calculated
+  // as: the sum of the log of the (4x4) variances of each subblock of the
+  // current block (x, bs), multiplied by 32 and divided by the number of
+  // pixels in the block. It is used for segmentation to avoid situations in
+  // which a large block with a gentle gradient gets marked as high variance
+  // even though each subblock has a low variance. This allows us to assign
+  // the same segment number to the same sorts of area regardless of how the
+  // partitioning goes.
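+  //
+  // Illustrative example (hypothetical numbers): a smooth 64x64 ramp whose
+  // 4x4 subblocks each have a per-pixel variance near 0 contributes roughly
+  // log(1.0 + 0) = 0 per subblock, so the averaged score stays low, whereas
+  // a single variance taken over the whole ramp would be large.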
+
+  MACROBLOCKD *xd = &x->e_mbd;
+  double var = 0;
+  unsigned int sse;
+  int i, j;
+
+  int right_overflow =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+  int bottom_overflow =
+      (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+  const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+  const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+  aom_clear_system_state();
+
+  for (i = 0; i < bh; i += 4) {
+    for (j = 0; j < bw; j += 4) {
+      if (is_cur_buf_hbd(xd)) {
+        var +=
+            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                          x->plane[0].src.stride,
+                          CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
+                          16);
+      } else {
+        var +=
+            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                          x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
+                          16);
+      }
+    }
+  }
+  // Use the average of the 4x4 log variances. For 8-bit input the range is
+  // 0 - 9.704121561.
+  var /= (bw / 4 * bh / 4);
+  if (var > 7) var = 7;
+
+  aom_clear_system_state();
+  return (int)(var);
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
+
+static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int stride = x->plane[0].src.stride;
+  uint8_t *buf = x->plane[0].src.buf;
+  const int bw = MI_SIZE * mi_size_wide[bs];
+  const int bh = MI_SIZE * mi_size_high[bs];
+  const int hbd = is_cur_buf_hbd(xd);
+
+  int var = 0;
+  for (int r = 0; r < bh; r += 8)
+    for (int c = 0; c < bw; c += 8) {
+      var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride, hbd);
+    }
+
+  return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
+}
+
+double av1_log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+  unsigned int haar_sad = haar_ac_energy(x, bs);
+  aom_clear_system_state();
+  return log(haar_sad + 1.0);
+}
+
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bs) {
+  double energy, energy_midpoint;
+  aom_clear_system_state();
+  energy_midpoint = (is_stat_consumption_stage_twopass(cpi))
+                        ? cpi->twopass.frame_avg_haar_energy
+                        : DEFAULT_E_MIDPOINT;
+  energy = av1_log_block_wavelet_energy(x, bs) - energy_midpoint;
+  return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
+}
+
+int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
+                                                int block_var_level) {
+  int rate_level;
+  const AV1_COMMON *const cm = &cpi->common;
+
+  if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
+    ENERGY_IN_BOUNDS(block_var_level);
+    rate_level = SEGMENT_ID(block_var_level);
+  } else {
+    rate_level = block_var_level;
+  }
+  const int base_qindex = cm->quant_params.base_qindex;
+  int qindex_delta = av1_compute_qdelta_by_rate(
+      &cpi->rc, cm->current_frame.frame_type, base_qindex,
+      deltaq_rate_ratio[rate_level], cm->seq_params.bit_depth);
+
+  if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
+    qindex_delta = -base_qindex + 1;
+  }
+  return base_qindex + qindex_delta;
+}
diff --git a/libs/libaom/src/av1/encoder/aq_variance.h b/libs/libaom/src/av1/encoder/aq_variance.h
new file mode 100644
index 000000000..543eb0b51
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/aq_variance.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_
+#define AOM_AV1_ENCODER_AQ_VARIANCE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_vaq_frame_setup(AV1_COMP *cpi);
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
+                                                int block_var_level);
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bs);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_AQ_VARIANCE_H_
diff --git a/libs/libaom/src/av1/encoder/arm/neon/av1_error_neon.c b/libs/libaom/src/av1/encoder/arm/neon/av1_error_neon.c
new file mode 100644
index 000000000..22da1a8d6
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/arm/neon/av1_error_neon.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "av1/common/arm/mem_neon.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  int64x2_t error = vdupq_n_s64(0);
+  int64x2_t sqcoeff = vdupq_n_s64(0);
+
+  assert(block_size >= 8);
+  assert((block_size % 8) == 0);
+
+  do {
+    const int16x8_t c = load_tran_low_to_s16q(coeff);
+    const int16x8_t d = load_tran_low_to_s16q(dqcoeff);
+    const int16x8_t diff = vsubq_s16(c, d);
+    const int16x4_t diff_lo = vget_low_s16(diff);
+    const int16x4_t diff_hi = vget_high_s16(diff);
+    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
+    // accumulating them in 64-bits.
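+    // (Worked through: |diff| < 2^15, so each square is < 2^30; vmlal_s16
+    // adds two such squares per 32-bit lane, staying below 2^31, and
+    // vaddl_s32 widens to 64 bits before the running sum can overflow.)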
+    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
+    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
+    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
+    error = vaddq_s64(error, err2);
+
+    const int16x4_t coeff_lo = vget_low_s16(c);
+    const int16x4_t coeff_hi = vget_high_s16(c);
+    const int32x4_t sqcoeff0 = vmull_s16(coeff_lo, coeff_lo);
+    const int32x4_t sqcoeff1 = vmlal_s16(sqcoeff0, coeff_hi, coeff_hi);
+    const int64x2_t sqcoeff2 =
+        vaddl_s32(vget_low_s32(sqcoeff1), vget_high_s32(sqcoeff1));
+    sqcoeff = vaddq_s64(sqcoeff, sqcoeff2);
+
+    coeff += 8;
+    dqcoeff += 8;
+    block_size -= 8;
+  } while (block_size != 0);
+
+#if defined(__aarch64__)
+  *ssz = vaddvq_s64(sqcoeff);
+  return vaddvq_s64(error);
+#else
+  *ssz = vgetq_lane_s64(sqcoeff, 0) + vgetq_lane_s64(sqcoeff, 1);
+  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
+#endif
+}
+
+int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff,
+                                int block_size) {
+  int64x2_t error = vdupq_n_s64(0);
+
+  assert(block_size >= 8);
+  assert((block_size % 8) == 0);
+
+  do {
+    const int16x8_t c = vld1q_s16(coeff);
+    const int16x8_t d = vld1q_s16(dqcoeff);
+    const int16x8_t diff = vsubq_s16(c, d);
+    const int16x4_t diff_lo = vget_low_s16(diff);
+    const int16x4_t diff_hi = vget_high_s16(diff);
+    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
+    // accumulating them in 64-bits.
+    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
+    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
+    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
+    error = vaddq_s64(error, err2);
+    coeff += 8;
+    dqcoeff += 8;
+    block_size -= 8;
+  } while (block_size != 0);
+
+  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
+}
diff --git a/libs/libaom/src/av1/encoder/arm/neon/quantize_neon.c b/libs/libaom/src/av1/encoder/arm/neon/quantize_neon.c
new file mode 100644
index 000000000..c2f50a217
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/arm/neon/quantize_neon.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/arm/mem_neon.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  // TODO(jingning): Decide whether these arguments are needed once the
+  // quantization process is finalized.
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  (void)scan;
+
+  // Quantization pass: All coefficients with index >= zero_flag are
+  // skippable. Note: zero_flag can be zero.
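+  // In scalar terms, each lane below computes, per coefficient:
+  //   sign    = coeff >> 15          (all ones if negative)
+  //   abs_q   = ((|coeff| + round) * quant) >> 16
+  //   qcoeff  = (abs_q ^ sign) - sign
+  //   dqcoeff = qcoeff * dequant
+  // and eob is tracked as 1 + the largest iscan position with qcoeff != 0.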
+ int i; + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + // process dc and the first seven ac coeffs + { + const int16x8_t v_iscan = vld1q_s16(&iscan[0]); + const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff); + store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + // now process the rest of the ac coeffs + for (i = 8; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&iscan[i]); + const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + store_s16q_to_tran_low(&qcoeff_ptr[i], v_qcoeff); + store_s16q_to_tran_low(&dqcoeff_ptr[i], v_dqcoeff); + } +#ifdef __aarch64__ + *eob_ptr = vmaxvq_s16(v_eobmax_76543210); +#else + { + const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), + vget_high_s16(v_eobmax_76543210)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + *eob_ptr = 
(uint16_t)vget_lane_s16(v_eobmax_final, 0); + } +#endif // __aarch64__ +} + +static INLINE void calculate_dqcoeff_lp_and_store(const int16x8_t qcoeff, + const int16x8_t dequant, + int16_t *dqcoeff) { + const int32x4_t dqcoeff_0 = + vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + const int32x4_t dqcoeff_1 = + vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1))); +} + +void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t count, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + // process dc and the first seven ac coeffs + { + const int16x8_t v_iscan = vld1q_s16(&scan[0]); + const int16x8_t v_coeff = vld1q_s16(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + calculate_dqcoeff_lp_and_store(v_qcoeff, v_dequant, dqcoeff_ptr); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + vst1q_s16(qcoeff_ptr, v_qcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + // now process the rest of the ac coeffs + for (int i = 8; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&scan[i]); + const int16x8_t v_coeff = vld1q_s16(coeff_ptr + i); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + calculate_dqcoeff_lp_and_store(v_qcoeff, v_dequant, dqcoeff_ptr + i); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, 
v_nz_iscan);
+    vst1q_s16(qcoeff_ptr + i, v_qcoeff);
+  }
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
+#else
+  {
+    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
+                                             vget_high_s16(v_eobmax_76543210));
+    const int64x1_t v_eobmax_xx32 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+    const int16x4_t v_eobmax_tmp =
+        vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+    const int64x1_t v_eobmax_xxx3 =
+        vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+    const int16x4_t v_eobmax_final =
+        vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+  }
+#endif  // __aarch64__
+}
diff --git a/libs/libaom/src/av1/encoder/av1_fwd_txfm1d.c b/libs/libaom/src/av1/encoder/av1_fwd_txfm1d.c
new file mode 100644
index 000000000..6601c19ab
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/av1_fwd_txfm1d.c
@@ -0,0 +1,1885 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+
+void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
+  const int32_t size = 4;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[4];
+
+  // stage 0;
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+  // stage 1;
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[3];
+  bf1[1] = input[1] + input[2];
+  bf1[2] = -input[2] + input[1];
+  bf1[3] = -input[3] + input[0];
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr(cos_bit);
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[2];
+  bf1[2] = bf0[1];
+  bf1[3] = bf0[3];
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit,
+               const int8_t *stage_range) {
+  const int32_t size = 8;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[8];
+
+  // stage 0;
+  av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+  // stage 1;
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[7];
+  bf1[1] = input[1] + input[6];
+  bf1[2] = input[2] + input[5];
+  bf1[3] = input[3] + input[4];
+  bf1[4] = -input[4] + input[3];
+  bf1[5] = -input[5] + input[2];
+  bf1[6] = -input[6] + input[1];
+  bf1[7] = -input[7] + input[0];
+  av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr(cos_bit);
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] +
bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[4]; + bf1[2] = bf0[2]; + bf1[3] = bf0[6]; + bf1[4] = bf0[1]; + bf1[5] = bf0[5]; + bf1[6] = bf0[3]; + bf1[7] = bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 16; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[15]; + bf1[1] = input[1] + input[14]; + bf1[2] = input[2] + input[13]; + bf1[3] = input[3] + input[12]; + bf1[4] = input[4] + input[11]; + bf1[5] = input[5] + input[10]; + bf1[6] = input[6] + input[9]; + bf1[7] = input[7] + input[8]; + bf1[8] = -input[8] + input[7]; + bf1[9] = -input[9] + input[6]; + bf1[10] = -input[10] + input[5]; + bf1[11] = -input[11] + input[4]; + bf1[12] = -input[12] + input[3]; + bf1[13] = -input[13] + input[2]; + bf1[14] = -input[14] + input[1]; + bf1[15] = -input[15] + input[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; 
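+  // (Note: half_btf(w0, in0, w1, in1, bit), from av1/common/av1_txfm.h, is
+  // the rotation butterfly round_shift(w0 * in0 + w1 * in1, bit), with
+  // cospi[] holding cosine constants scaled by 2^cos_bit.)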
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[8]; + bf1[2] = bf0[4]; + bf1[3] = bf0[12]; + bf1[4] = bf0[2]; + bf1[5] = bf0[10]; + bf1[6] = bf0[6]; + bf1[7] = bf0[14]; + bf1[8] = bf0[1]; + bf1[9] = bf0[9]; + bf1[10] = bf0[5]; + bf1[11] = bf0[13]; + bf1[12] = bf0[3]; + bf1[13] = bf0[11]; + bf1[14] = bf0[7]; + bf1[15] = bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const 
int32_t size = 32; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[32]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[31]; + bf1[1] = input[1] + input[30]; + bf1[2] = input[2] + input[29]; + bf1[3] = input[3] + input[28]; + bf1[4] = input[4] + input[27]; + bf1[5] = input[5] + input[26]; + bf1[6] = input[6] + input[25]; + bf1[7] = input[7] + input[24]; + bf1[8] = input[8] + input[23]; + bf1[9] = input[9] + input[22]; + bf1[10] = input[10] + input[21]; + bf1[11] = input[11] + input[20]; + bf1[12] = input[12] + input[19]; + bf1[13] = input[13] + input[18]; + bf1[14] = input[14] + input[17]; + bf1[15] = input[15] + input[16]; + bf1[16] = -input[16] + input[15]; + bf1[17] = -input[17] + input[14]; + bf1[18] = -input[18] + input[13]; + bf1[19] = -input[19] + input[12]; + bf1[20] = -input[20] + input[11]; + bf1[21] = -input[21] + input[10]; + bf1[22] = -input[22] + input[9]; + bf1[23] = -input[23] + input[8]; + bf1[24] = -input[24] + input[7]; + bf1[25] = -input[25] + input[6]; + bf1[26] = -input[26] + input[5]; + bf1[27] = -input[27] + input[4]; + bf1[28] = -input[28] + input[3]; + bf1[29] = -input[29] + input[2]; + bf1[30] = -input[30] + input[1]; + bf1[31] = -input[31] + input[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[15]; + bf1[1] = bf0[1] + bf0[14]; + bf1[2] = bf0[2] + bf0[13]; + bf1[3] = bf0[3] + bf0[12]; + bf1[4] = bf0[4] + bf0[11]; + bf1[5] = bf0[5] + bf0[10]; + bf1[6] = bf0[6] + bf0[9]; + bf1[7] = bf0[7] + bf0[8]; + bf1[8] = -bf0[8] + bf0[7]; + bf1[9] = -bf0[9] + bf0[6]; + bf1[10] = -bf0[10] + bf0[5]; + bf1[11] = -bf0[11] + bf0[4]; + bf1[12] = -bf0[12] + bf0[3]; + bf1[13] = -bf0[13] + bf0[2]; + bf1[14] = -bf0[14] + bf0[1]; + bf1[15] = -bf0[15] + bf0[0]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[23]; + bf1[17] = bf0[17] + bf0[22]; + bf1[18] = bf0[18] + bf0[21]; + 
bf1[19] = bf0[19] + bf0[20]; + bf1[20] = -bf0[20] + bf0[19]; + bf1[21] = -bf0[21] + bf0[18]; + bf1[22] = -bf0[22] + bf0[17]; + bf1[23] = -bf0[23] + bf0[16]; + bf1[24] = -bf0[24] + bf0[31]; + bf1[25] = -bf0[25] + bf0[30]; + bf1[26] = -bf0[26] + bf0[29]; + bf1[27] = -bf0[27] + bf0[28]; + bf1[28] = bf0[28] + bf0[27]; + bf1[29] = bf0[29] + bf0[26]; + bf1[30] = bf0[30] + bf0[25]; + bf1[31] = bf0[31] + bf0[24]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; + bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[19]; + bf1[17] = bf0[17] + bf0[18]; + bf1[18] = -bf0[18] + bf0[17]; + bf1[19] = -bf0[19] + bf0[16]; + bf1[20] = -bf0[20] + bf0[23]; + bf1[21] = -bf0[21] + bf0[22]; + bf1[22] = bf0[22] + bf0[21]; + bf1[23] = bf0[23] + bf0[20]; + bf1[24] = bf0[24] + bf0[27]; + bf1[25] = bf0[25] + bf0[26]; + bf1[26] = -bf0[26] + bf0[25]; + bf1[27] = -bf0[27] + bf0[24]; + bf1[28] = -bf0[28] + bf0[31]; + bf1[29] = -bf0[29] + bf0[30]; + bf1[30] = bf0[30] + bf0[29]; + bf1[31] = bf0[31] + bf0[28]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + 
bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + bf1[16] = bf0[16] + bf0[17]; + bf1[17] = -bf0[17] + bf0[16]; + bf1[18] = -bf0[18] + bf0[19]; + bf1[19] = bf0[19] + bf0[18]; + bf1[20] = bf0[20] + bf0[21]; + bf1[21] = -bf0[21] + bf0[20]; + bf1[22] = -bf0[22] + bf0[23]; + bf1[23] = bf0[23] + bf0[22]; + bf1[24] = bf0[24] + bf0[25]; + bf1[25] = -bf0[25] + bf0[24]; + bf1[26] = -bf0[26] + bf0[27]; + bf1[27] = bf0[27] + bf0[26]; + bf1[28] = bf0[28] + bf0[29]; + bf1[29] = -bf0[29] + bf0[28]; + bf1[30] = -bf0[30] + bf0[31]; + bf1[31] = bf0[31] + bf0[30]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); + bf1[23] = 
half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); + bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[16]; + bf1[2] = bf0[8]; + bf1[3] = bf0[24]; + bf1[4] = bf0[4]; + bf1[5] = bf0[20]; + bf1[6] = bf0[12]; + bf1[7] = bf0[28]; + bf1[8] = bf0[2]; + bf1[9] = bf0[18]; + bf1[10] = bf0[10]; + bf1[11] = bf0[26]; + bf1[12] = bf0[6]; + bf1[13] = bf0[22]; + bf1[14] = bf0[14]; + bf1[15] = bf0[30]; + bf1[16] = bf0[1]; + bf1[17] = bf0[17]; + bf1[18] = bf0[9]; + bf1[19] = bf0[25]; + bf1[20] = bf0[5]; + bf1[21] = bf0[21]; + bf1[22] = bf0[13]; + bf1[23] = bf0[29]; + bf1[24] = bf0[3]; + bf1[25] = bf0[19]; + bf1[26] = bf0[11]; + bf1[27] = bf0[27]; + bf1[28] = bf0[7]; + bf1[29] = bf0[23]; + bf1[30] = bf0[15]; + bf1[31] = bf0[31]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + int bit = cos_bit; + const int32_t *sinpi = sinpi_arr(bit); + int32_t x0, x1, x2, x3; + int32_t s0, s1, s2, s3, s4, s5, s6, s7; + + // stage 0 + av1_range_check_buf(0, input, input, 4, stage_range[0]); + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + // stage 1 + s0 = range_check_value(sinpi[1] * x0, bit + stage_range[1]); + s1 = range_check_value(sinpi[4] * x0, bit + stage_range[1]); + s2 = range_check_value(sinpi[2] * x1, bit + stage_range[1]); + s3 = range_check_value(sinpi[1] * x1, bit + stage_range[1]); + s4 = range_check_value(sinpi[3] * x2, bit + stage_range[1]); + s5 = range_check_value(sinpi[4] * x3, bit + stage_range[1]); + s6 = range_check_value(sinpi[2] * x3, bit + stage_range[1]); + s7 = range_check_value(x0 + x1, stage_range[1]); + + // stage 2 + s7 = range_check_value(s7 - x3, stage_range[2]); + + // stage 3 + x0 = range_check_value(s0 + s2, bit + stage_range[3]); + x1 = range_check_value(sinpi[3] * s7, bit + stage_range[3]); + x2 = range_check_value(s1 - s3, bit + stage_range[3]); + x3 = range_check_value(s4, bit + stage_range[3]); + + // stage 4 + x0 = range_check_value(x0 + s5, bit + stage_range[4]); + x2 = range_check_value(x2 + s6, bit + stage_range[4]); + + // stage 5 + s0 = range_check_value(x0 + x3, bit + stage_range[5]); + s1 = range_check_value(x1, bit + stage_range[5]); + s2 = range_check_value(x2 - x3, bit + stage_range[5]); + s3 = range_check_value(x2 - x0, bit + stage_range[5]); + + // stage 6 + s3 = range_check_value(s3 + x3, bit + stage_range[6]); + + // 1-D transform scaling factor is sqrt(2). 
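+  // (round_shift(x, bit) computes (x + 2^(bit - 1)) >> bit, a rounded right
+  // shift that removes the sinpi[] fixed-point scaling applied above.)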
+ output[0] = round_shift(s0, bit); + output[1] = round_shift(s1, bit); + output[2] = round_shift(s2, bit); + output[3] = round_shift(s3, bit); + av1_range_check_buf(6, input, output, 4, stage_range[6]); +} + +void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 8; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[8]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + assert(output != input); + bf1 = output; + bf1[0] = input[0]; + bf1[1] = -input[7]; + bf1[2] = -input[3]; + bf1[3] = input[4]; + bf1[4] = -input[1]; + bf1[5] = input[6]; + bf1[6] = input[2]; + bf1[7] = -input[5]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[2]; + bf1[1] = bf0[1] + bf0[3]; + bf1[2] = bf0[0] - bf0[2]; + bf1[3] = bf0[1] - bf0[3]; + bf1[4] = bf0[4] + bf0[6]; + bf1[5] = bf0[5] + bf0[7]; + bf1[6] = bf0[4] - bf0[6]; + bf1[7] = bf0[5] - bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[4]; + bf1[1] = bf0[1] + bf0[5]; + bf1[2] = bf0[2] + bf0[6]; + bf1[3] = bf0[3] + bf0[7]; + bf1[4] = bf0[0] - bf0[4]; + bf1[5] = bf0[1] - bf0[5]; + bf1[6] = bf0[2] - bf0[6]; + bf1[7] = bf0[3] - bf0[7]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[1]; + bf1[1] = bf0[6]; + bf1[2] = bf0[3]; + bf1[3] = bf0[4]; + bf1[4] = bf0[5]; + bf1[5] = bf0[2]; + bf1[6] = bf0[7]; + bf1[7] = bf0[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fadst16(const int32_t *input, int32_t 
*output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 16; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[16]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + assert(output != input); + bf1 = output; + bf1[0] = input[0]; + bf1[1] = -input[15]; + bf1[2] = -input[7]; + bf1[3] = input[8]; + bf1[4] = -input[3]; + bf1[5] = input[12]; + bf1[6] = input[4]; + bf1[7] = -input[11]; + bf1[8] = -input[1]; + bf1[9] = input[14]; + bf1[10] = input[6]; + bf1[11] = -input[9]; + bf1[12] = input[2]; + bf1[13] = -input[13]; + bf1[14] = -input[5]; + bf1[15] = input[10]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit); + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit); + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[2]; + bf1[1] = bf0[1] + bf0[3]; + bf1[2] = bf0[0] - bf0[2]; + bf1[3] = bf0[1] - bf0[3]; + bf1[4] = bf0[4] + bf0[6]; + bf1[5] = bf0[5] + bf0[7]; + bf1[6] = bf0[4] - bf0[6]; + bf1[7] = bf0[5] - bf0[7]; + bf1[8] = bf0[8] + bf0[10]; + bf1[9] = bf0[9] + bf0[11]; + bf1[10] = bf0[8] - bf0[10]; + bf1[11] = bf0[9] - bf0[11]; + bf1[12] = bf0[12] + bf0[14]; + bf1[13] = bf0[13] + bf0[15]; + bf1[14] = bf0[12] - bf0[14]; + bf1[15] = bf0[13] - bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit); + bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit); + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[4]; + bf1[1] = bf0[1] + bf0[5]; + bf1[2] = bf0[2] + bf0[6]; + bf1[3] = bf0[3] + bf0[7]; + bf1[4] = bf0[0] - bf0[4]; + bf1[5] = bf0[1] - bf0[5]; + bf1[6] = bf0[2] - bf0[6]; + bf1[7] = bf0[3] - bf0[7]; + bf1[8] = bf0[8] + bf0[12]; + bf1[9] = bf0[9] + bf0[13]; + bf1[10] = bf0[10] + bf0[14]; + bf1[11] = bf0[11] + bf0[15]; + bf1[12] = bf0[8] - bf0[12]; + bf1[13] = bf0[9] - bf0[13]; + bf1[14] = 
bf0[10] - bf0[14]; + bf1[15] = bf0[11] - bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit); + bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit); + bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[8]; + bf1[1] = bf0[1] + bf0[9]; + bf1[2] = bf0[2] + bf0[10]; + bf1[3] = bf0[3] + bf0[11]; + bf1[4] = bf0[4] + bf0[12]; + bf1[5] = bf0[5] + bf0[13]; + bf1[6] = bf0[6] + bf0[14]; + bf1[7] = bf0[7] + bf0[15]; + bf1[8] = bf0[0] - bf0[8]; + bf1[9] = bf0[1] - bf0[9]; + bf1[10] = bf0[2] - bf0[10]; + bf1[11] = bf0[3] - bf0[11]; + bf1[12] = bf0[4] - bf0[12]; + bf1[13] = bf0[5] - bf0[13]; + bf1[14] = bf0[6] - bf0[14]; + bf1[15] = bf0[7] - bf0[15]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit); + bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit); + bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit); + bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit); + bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit); + bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit); + bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit); + bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit); + bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit); + bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit); + bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit); + bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit); + bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit); + bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit); + bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[1]; + bf1[1] = bf0[14]; + bf1[2] = bf0[3]; + bf1[3] = bf0[12]; + bf1[4] = bf0[5]; + bf1[5] = bf0[10]; + bf1[6] = bf0[7]; + bf1[7] = bf0[8]; + bf1[8] = bf0[9]; + bf1[9] = bf0[6]; + bf1[10] = bf0[11]; + bf1[11] = bf0[4]; + bf1[12] = bf0[13]; + bf1[13] = bf0[2]; + bf1[14] = bf0[15]; + bf1[15] = bf0[0]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} + +void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 4; ++i) + output[i] = round_shift((int64_t)input[i] * NewSqrt2, NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 
32); + av1_range_check_buf(0, input, output, 4, stage_range[0]); +} + +void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 8; ++i) output[i] = input[i] * 2; + av1_range_check_buf(0, input, output, 8, stage_range[0]); +} + +void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 16; ++i) + output[i] = round_shift((int64_t)input[i] * 2 * NewSqrt2, NewSqrt2Bits); + assert(stage_range[0] + NewSqrt2Bits <= 32); + av1_range_check_buf(0, input, output, 16, stage_range[0]); +} + +void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) output[i] = input[i] * 4; + av1_range_check_buf(0, input, output, 32, stage_range[0]); +} + +void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range) { + const int32_t size = 64; + const int32_t *cospi; + + int32_t stage = 0; + int32_t *bf0, *bf1; + int32_t step[64]; + + // stage 0; + av1_range_check_buf(stage, input, input, size, stage_range[stage]); + + // stage 1; + stage++; + bf1 = output; + bf1[0] = input[0] + input[63]; + bf1[1] = input[1] + input[62]; + bf1[2] = input[2] + input[61]; + bf1[3] = input[3] + input[60]; + bf1[4] = input[4] + input[59]; + bf1[5] = input[5] + input[58]; + bf1[6] = input[6] + input[57]; + bf1[7] = input[7] + input[56]; + bf1[8] = input[8] + input[55]; + bf1[9] = input[9] + input[54]; + bf1[10] = input[10] + input[53]; + bf1[11] = input[11] + input[52]; + bf1[12] = input[12] + input[51]; + bf1[13] = input[13] + input[50]; + bf1[14] = input[14] + input[49]; + bf1[15] = input[15] + input[48]; + bf1[16] = input[16] + input[47]; + bf1[17] = input[17] + input[46]; + bf1[18] = input[18] + input[45]; + bf1[19] = input[19] + input[44]; + bf1[20] = input[20] + input[43]; + bf1[21] = input[21] + input[42]; + bf1[22] = input[22] + input[41]; + bf1[23] = input[23] + input[40]; + bf1[24] = input[24] + input[39]; + bf1[25] = input[25] + input[38]; + bf1[26] = input[26] + input[37]; + bf1[27] = input[27] + input[36]; + bf1[28] = input[28] + input[35]; + bf1[29] = input[29] + input[34]; + bf1[30] = input[30] + input[33]; + bf1[31] = input[31] + input[32]; + bf1[32] = -input[32] + input[31]; + bf1[33] = -input[33] + input[30]; + bf1[34] = -input[34] + input[29]; + bf1[35] = -input[35] + input[28]; + bf1[36] = -input[36] + input[27]; + bf1[37] = -input[37] + input[26]; + bf1[38] = -input[38] + input[25]; + bf1[39] = -input[39] + input[24]; + bf1[40] = -input[40] + input[23]; + bf1[41] = -input[41] + input[22]; + bf1[42] = -input[42] + input[21]; + bf1[43] = -input[43] + input[20]; + bf1[44] = -input[44] + input[19]; + bf1[45] = -input[45] + input[18]; + bf1[46] = -input[46] + input[17]; + bf1[47] = -input[47] + input[16]; + bf1[48] = -input[48] + input[15]; + bf1[49] = -input[49] + input[14]; + bf1[50] = -input[50] + input[13]; + bf1[51] = -input[51] + input[12]; + bf1[52] = -input[52] + input[11]; + bf1[53] = -input[53] + input[10]; + bf1[54] = -input[54] + input[9]; + bf1[55] = -input[55] + input[8]; + bf1[56] = -input[56] + input[7]; + bf1[57] = -input[57] + input[6]; + bf1[58] = -input[58] + input[5]; + bf1[59] = -input[59] + input[4]; + bf1[60] = -input[60] + input[3]; + bf1[61] = -input[61] + input[2]; + bf1[62] = -input[62] + input[1]; + bf1[63] = -input[63] + input[0]; + av1_range_check_buf(stage, input, bf1, size, 
stage_range[stage]); + + // stage 2 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[31]; + bf1[1] = bf0[1] + bf0[30]; + bf1[2] = bf0[2] + bf0[29]; + bf1[3] = bf0[3] + bf0[28]; + bf1[4] = bf0[4] + bf0[27]; + bf1[5] = bf0[5] + bf0[26]; + bf1[6] = bf0[6] + bf0[25]; + bf1[7] = bf0[7] + bf0[24]; + bf1[8] = bf0[8] + bf0[23]; + bf1[9] = bf0[9] + bf0[22]; + bf1[10] = bf0[10] + bf0[21]; + bf1[11] = bf0[11] + bf0[20]; + bf1[12] = bf0[12] + bf0[19]; + bf1[13] = bf0[13] + bf0[18]; + bf1[14] = bf0[14] + bf0[17]; + bf1[15] = bf0[15] + bf0[16]; + bf1[16] = -bf0[16] + bf0[15]; + bf1[17] = -bf0[17] + bf0[14]; + bf1[18] = -bf0[18] + bf0[13]; + bf1[19] = -bf0[19] + bf0[12]; + bf1[20] = -bf0[20] + bf0[11]; + bf1[21] = -bf0[21] + bf0[10]; + bf1[22] = -bf0[22] + bf0[9]; + bf1[23] = -bf0[23] + bf0[8]; + bf1[24] = -bf0[24] + bf0[7]; + bf1[25] = -bf0[25] + bf0[6]; + bf1[26] = -bf0[26] + bf0[5]; + bf1[27] = -bf0[27] + bf0[4]; + bf1[28] = -bf0[28] + bf0[3]; + bf1[29] = -bf0[29] + bf0[2]; + bf1[30] = -bf0[30] + bf0[1]; + bf1[31] = -bf0[31] + bf0[0]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = bf0[37]; + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit); + bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit); + bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit); + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = bf0[58]; + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 3 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[15]; + bf1[1] = bf0[1] + bf0[14]; + bf1[2] = bf0[2] + bf0[13]; + bf1[3] = bf0[3] + bf0[12]; + bf1[4] = bf0[4] + bf0[11]; + bf1[5] = bf0[5] + bf0[10]; + bf1[6] = bf0[6] + bf0[9]; + bf1[7] = bf0[7] + bf0[8]; + bf1[8] = -bf0[8] + bf0[7]; + bf1[9] = -bf0[9] + bf0[6]; + bf1[10] = -bf0[10] + bf0[5]; + bf1[11] = -bf0[11] + bf0[4]; + bf1[12] = -bf0[12] + bf0[3]; + bf1[13] = -bf0[13] + bf0[2]; + bf1[14] = -bf0[14] + bf0[1]; + bf1[15] = -bf0[15] + bf0[0]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit); + bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], 
cos_bit); + bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit); + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[47]; + bf1[33] = bf0[33] + bf0[46]; + bf1[34] = bf0[34] + bf0[45]; + bf1[35] = bf0[35] + bf0[44]; + bf1[36] = bf0[36] + bf0[43]; + bf1[37] = bf0[37] + bf0[42]; + bf1[38] = bf0[38] + bf0[41]; + bf1[39] = bf0[39] + bf0[40]; + bf1[40] = -bf0[40] + bf0[39]; + bf1[41] = -bf0[41] + bf0[38]; + bf1[42] = -bf0[42] + bf0[37]; + bf1[43] = -bf0[43] + bf0[36]; + bf1[44] = -bf0[44] + bf0[35]; + bf1[45] = -bf0[45] + bf0[34]; + bf1[46] = -bf0[46] + bf0[33]; + bf1[47] = -bf0[47] + bf0[32]; + bf1[48] = -bf0[48] + bf0[63]; + bf1[49] = -bf0[49] + bf0[62]; + bf1[50] = -bf0[50] + bf0[61]; + bf1[51] = -bf0[51] + bf0[60]; + bf1[52] = -bf0[52] + bf0[59]; + bf1[53] = -bf0[53] + bf0[58]; + bf1[54] = -bf0[54] + bf0[57]; + bf1[55] = -bf0[55] + bf0[56]; + bf1[56] = bf0[56] + bf0[55]; + bf1[57] = bf0[57] + bf0[54]; + bf1[58] = bf0[58] + bf0[53]; + bf1[59] = bf0[59] + bf0[52]; + bf1[60] = bf0[60] + bf0[51]; + bf1[61] = bf0[61] + bf0[50]; + bf1[62] = bf0[62] + bf0[49]; + bf1[63] = bf0[63] + bf0[48]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 4 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0] + bf0[7]; + bf1[1] = bf0[1] + bf0[6]; + bf1[2] = bf0[2] + bf0[5]; + bf1[3] = bf0[3] + bf0[4]; + bf1[4] = -bf0[4] + bf0[3]; + bf1[5] = -bf0[5] + bf0[2]; + bf1[6] = -bf0[6] + bf0[1]; + bf1[7] = -bf0[7] + bf0[0]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit); + bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit); + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[23]; + bf1[17] = bf0[17] + bf0[22]; + bf1[18] = bf0[18] + bf0[21]; + bf1[19] = bf0[19] + bf0[20]; + bf1[20] = -bf0[20] + bf0[19]; + bf1[21] = -bf0[21] + bf0[18]; + bf1[22] = -bf0[22] + bf0[17]; + bf1[23] = -bf0[23] + bf0[16]; + bf1[24] = -bf0[24] + bf0[31]; + bf1[25] = -bf0[25] + bf0[30]; + bf1[26] = -bf0[26] + bf0[29]; + bf1[27] = -bf0[27] + bf0[28]; + bf1[28] = bf0[28] + bf0[27]; + bf1[29] = bf0[29] + bf0[26]; + bf1[30] = bf0[30] + bf0[25]; + bf1[31] = bf0[31] + bf0[24]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = bf0[34]; + bf1[35] = bf0[35]; + bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit); + bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit); + bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit); + bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit); + bf1[44] = bf0[44]; + bf1[45] = bf0[45]; + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = bf0[50]; + bf1[51] = bf0[51]; + bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit); + bf1[53] = 
half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit); + bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit); + bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit); + bf1[60] = bf0[60]; + bf1[61] = bf0[61]; + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 5 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0] + bf0[3]; + bf1[1] = bf0[1] + bf0[2]; + bf1[2] = -bf0[2] + bf0[1]; + bf1[3] = -bf0[3] + bf0[0]; + bf1[4] = bf0[4]; + bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit); + bf1[7] = bf0[7]; + bf1[8] = bf0[8] + bf0[11]; + bf1[9] = bf0[9] + bf0[10]; + bf1[10] = -bf0[10] + bf0[9]; + bf1[11] = -bf0[11] + bf0[8]; + bf1[12] = -bf0[12] + bf0[15]; + bf1[13] = -bf0[13] + bf0[14]; + bf1[14] = bf0[14] + bf0[13]; + bf1[15] = bf0[15] + bf0[12]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit); + bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit); + bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit); + bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit); + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit); + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[39]; + bf1[33] = bf0[33] + bf0[38]; + bf1[34] = bf0[34] + bf0[37]; + bf1[35] = bf0[35] + bf0[36]; + bf1[36] = -bf0[36] + bf0[35]; + bf1[37] = -bf0[37] + bf0[34]; + bf1[38] = -bf0[38] + bf0[33]; + bf1[39] = -bf0[39] + bf0[32]; + bf1[40] = -bf0[40] + bf0[47]; + bf1[41] = -bf0[41] + bf0[46]; + bf1[42] = -bf0[42] + bf0[45]; + bf1[43] = -bf0[43] + bf0[44]; + bf1[44] = bf0[44] + bf0[43]; + bf1[45] = bf0[45] + bf0[42]; + bf1[46] = bf0[46] + bf0[41]; + bf1[47] = bf0[47] + bf0[40]; + bf1[48] = bf0[48] + bf0[55]; + bf1[49] = bf0[49] + bf0[54]; + bf1[50] = bf0[50] + bf0[53]; + bf1[51] = bf0[51] + bf0[52]; + bf1[52] = -bf0[52] + bf0[51]; + bf1[53] = -bf0[53] + bf0[50]; + bf1[54] = -bf0[54] + bf0[49]; + bf1[55] = -bf0[55] + bf0[48]; + bf1[56] = -bf0[56] + bf0[63]; + bf1[57] = -bf0[57] + bf0[62]; + bf1[58] = -bf0[58] + bf0[61]; + bf1[59] = -bf0[59] + bf0[60]; + bf1[60] = bf0[60] + bf0[59]; + bf1[61] = bf0[61] + bf0[58]; + bf1[62] = bf0[62] + bf0[57]; + bf1[63] = bf0[63] + bf0[56]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 6 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit); + bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit); + bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit); + bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit); + bf1[4] = bf0[4] + bf0[5]; + bf1[5] = -bf0[5] + bf0[4]; + bf1[6] = -bf0[6] + bf0[7]; + bf1[7] = bf0[7] + bf0[6]; 
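
A quick aside on the primitive doing the heavy lifting in every stage here: half_btf() (defined upstream in av1/common/av1_txfm.h) is a rounded fixed-point rotation, with cospi_arr(cos_bit) supplying weights cospi[j] = round(cos(PI * j / 128) * 2^cos_bit). A minimal sketch of the arithmetic, assuming round_shift() rounds to nearest as upstream does (half_btf_sketch is an illustrative name, not part of the patch):

    #include <stdint.h>

    /* Weighted butterfly: multiply two inputs by cosine weights scaled
     * by 2^bit, then shift the sum back down with round-to-nearest
     * (bit is the cos_bit fixed-point precision, always > 0 here). */
    static int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                                   int32_t in1, int bit) {
      const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1;
      return (int32_t)((sum + ((int64_t)1 << (bit - 1))) >> bit);
    }
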
+ bf1[8] = bf0[8]; + bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit); + bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit); + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit); + bf1[15] = bf0[15]; + bf1[16] = bf0[16] + bf0[19]; + bf1[17] = bf0[17] + bf0[18]; + bf1[18] = -bf0[18] + bf0[17]; + bf1[19] = -bf0[19] + bf0[16]; + bf1[20] = -bf0[20] + bf0[23]; + bf1[21] = -bf0[21] + bf0[22]; + bf1[22] = bf0[22] + bf0[21]; + bf1[23] = bf0[23] + bf0[20]; + bf1[24] = bf0[24] + bf0[27]; + bf1[25] = bf0[25] + bf0[26]; + bf1[26] = -bf0[26] + bf0[25]; + bf1[27] = -bf0[27] + bf0[24]; + bf1[28] = -bf0[28] + bf0[31]; + bf1[29] = -bf0[29] + bf0[30]; + bf1[30] = bf0[30] + bf0[29]; + bf1[31] = bf0[31] + bf0[28]; + bf1[32] = bf0[32]; + bf1[33] = bf0[33]; + bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit); + bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit); + bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit); + bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit); + bf1[38] = bf0[38]; + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = bf0[41]; + bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit); + bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit); + bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit); + bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit); + bf1[46] = bf0[46]; + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = bf0[49]; + bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit); + bf1[54] = bf0[54]; + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = bf0[57]; + bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit); + bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit); + bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit); + bf1[62] = bf0[62]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 7 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit); + bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit); + bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit); + bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit); + bf1[8] = bf0[8] + bf0[9]; + bf1[9] = -bf0[9] + bf0[8]; + bf1[10] = -bf0[10] + bf0[11]; + bf1[11] = bf0[11] + bf0[10]; + bf1[12] = bf0[12] + bf0[13]; + bf1[13] = -bf0[13] + bf0[12]; + bf1[14] = -bf0[14] + bf0[15]; + bf1[15] = bf0[15] + bf0[14]; + bf1[16] = bf0[16]; + bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit); + bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit); + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit); + bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit); + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], 
cos_bit); + bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit); + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit); + bf1[31] = bf0[31]; + bf1[32] = bf0[32] + bf0[35]; + bf1[33] = bf0[33] + bf0[34]; + bf1[34] = -bf0[34] + bf0[33]; + bf1[35] = -bf0[35] + bf0[32]; + bf1[36] = -bf0[36] + bf0[39]; + bf1[37] = -bf0[37] + bf0[38]; + bf1[38] = bf0[38] + bf0[37]; + bf1[39] = bf0[39] + bf0[36]; + bf1[40] = bf0[40] + bf0[43]; + bf1[41] = bf0[41] + bf0[42]; + bf1[42] = -bf0[42] + bf0[41]; + bf1[43] = -bf0[43] + bf0[40]; + bf1[44] = -bf0[44] + bf0[47]; + bf1[45] = -bf0[45] + bf0[46]; + bf1[46] = bf0[46] + bf0[45]; + bf1[47] = bf0[47] + bf0[44]; + bf1[48] = bf0[48] + bf0[51]; + bf1[49] = bf0[49] + bf0[50]; + bf1[50] = -bf0[50] + bf0[49]; + bf1[51] = -bf0[51] + bf0[48]; + bf1[52] = -bf0[52] + bf0[55]; + bf1[53] = -bf0[53] + bf0[54]; + bf1[54] = bf0[54] + bf0[53]; + bf1[55] = bf0[55] + bf0[52]; + bf1[56] = bf0[56] + bf0[59]; + bf1[57] = bf0[57] + bf0[58]; + bf1[58] = -bf0[58] + bf0[57]; + bf1[59] = -bf0[59] + bf0[56]; + bf1[60] = -bf0[60] + bf0[63]; + bf1[61] = -bf0[61] + bf0[62]; + bf1[62] = bf0[62] + bf0[61]; + bf1[63] = bf0[63] + bf0[60]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 8 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit); + bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit); + bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit); + bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit); + bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit); + bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit); + bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit); + bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit); + bf1[16] = bf0[16] + bf0[17]; + bf1[17] = -bf0[17] + bf0[16]; + bf1[18] = -bf0[18] + bf0[19]; + bf1[19] = bf0[19] + bf0[18]; + bf1[20] = bf0[20] + bf0[21]; + bf1[21] = -bf0[21] + bf0[20]; + bf1[22] = -bf0[22] + bf0[23]; + bf1[23] = bf0[23] + bf0[22]; + bf1[24] = bf0[24] + bf0[25]; + bf1[25] = -bf0[25] + bf0[24]; + bf1[26] = -bf0[26] + bf0[27]; + bf1[27] = bf0[27] + bf0[26]; + bf1[28] = bf0[28] + bf0[29]; + bf1[29] = -bf0[29] + bf0[28]; + bf1[30] = -bf0[30] + bf0[31]; + bf1[31] = bf0[31] + bf0[30]; + bf1[32] = bf0[32]; + bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit); + bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit); + bf1[35] = bf0[35]; + bf1[36] = bf0[36]; + bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit); + bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit); + bf1[39] = bf0[39]; + bf1[40] = bf0[40]; + bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit); + bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit); + bf1[43] = bf0[43]; + bf1[44] = bf0[44]; + bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit); + bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit); + bf1[47] = bf0[47]; + bf1[48] = bf0[48]; + bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[52], bf0[50], 
cospi[12], bf0[45], cos_bit); + bf1[51] = bf0[51]; + bf1[52] = bf0[52]; + bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit); + bf1[55] = bf0[55]; + bf1[56] = bf0[56]; + bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit); + bf1[59] = bf0[59]; + bf1[60] = bf0[60]; + bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit); + bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit); + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 9 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit); + bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit); + bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit); + bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit); + bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit); + bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit); + bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit); + bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit); + bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit); + bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit); + bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit); + bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit); + bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit); + bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit); + bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit); + bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit); + bf1[32] = bf0[32] + bf0[33]; + bf1[33] = -bf0[33] + bf0[32]; + bf1[34] = -bf0[34] + bf0[35]; + bf1[35] = bf0[35] + bf0[34]; + bf1[36] = bf0[36] + bf0[37]; + bf1[37] = -bf0[37] + bf0[36]; + bf1[38] = -bf0[38] + bf0[39]; + bf1[39] = bf0[39] + bf0[38]; + bf1[40] = bf0[40] + bf0[41]; + bf1[41] = -bf0[41] + bf0[40]; + bf1[42] = -bf0[42] + bf0[43]; + bf1[43] = bf0[43] + bf0[42]; + bf1[44] = bf0[44] + bf0[45]; + bf1[45] = -bf0[45] + bf0[44]; + bf1[46] = -bf0[46] + bf0[47]; + bf1[47] = bf0[47] + bf0[46]; + bf1[48] = bf0[48] + bf0[49]; + bf1[49] = -bf0[49] + bf0[48]; + bf1[50] = -bf0[50] + bf0[51]; + bf1[51] = bf0[51] + bf0[50]; + bf1[52] = bf0[52] + bf0[53]; + bf1[53] = -bf0[53] + bf0[52]; + bf1[54] = -bf0[54] + bf0[55]; + bf1[55] = bf0[55] + bf0[54]; + bf1[56] = bf0[56] + bf0[57]; + bf1[57] = -bf0[57] + bf0[56]; + bf1[58] = -bf0[58] + bf0[59]; + bf1[59] = bf0[59] + bf0[58]; + bf1[60] = bf0[60] + bf0[61]; + bf1[61] = -bf0[61] + bf0[60]; + bf1[62] = -bf0[62] + bf0[63]; + bf1[63] = bf0[63] + bf0[62]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 10 + stage++; + cospi = cospi_arr(cos_bit); + bf0 = output; + bf1 = step; + bf1[0] = bf0[0]; + bf1[1] = bf0[1]; + bf1[2] = bf0[2]; + bf1[3] = bf0[3]; + bf1[4] = bf0[4]; + bf1[5] = bf0[5]; + bf1[6] = bf0[6]; + bf1[7] = bf0[7]; + bf1[8] = bf0[8]; + 
bf1[9] = bf0[9]; + bf1[10] = bf0[10]; + bf1[11] = bf0[11]; + bf1[12] = bf0[12]; + bf1[13] = bf0[13]; + bf1[14] = bf0[14]; + bf1[15] = bf0[15]; + bf1[16] = bf0[16]; + bf1[17] = bf0[17]; + bf1[18] = bf0[18]; + bf1[19] = bf0[19]; + bf1[20] = bf0[20]; + bf1[21] = bf0[21]; + bf1[22] = bf0[22]; + bf1[23] = bf0[23]; + bf1[24] = bf0[24]; + bf1[25] = bf0[25]; + bf1[26] = bf0[26]; + bf1[27] = bf0[27]; + bf1[28] = bf0[28]; + bf1[29] = bf0[29]; + bf1[30] = bf0[30]; + bf1[31] = bf0[31]; + bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit); + bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit); + bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit); + bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit); + bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit); + bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit); + bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit); + bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit); + bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit); + bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit); + bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit); + bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit); + bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit); + bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit); + bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit); + bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit); + bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit); + bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit); + bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit); + bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit); + bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit); + bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit); + bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit); + bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit); + bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit); + bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit); + bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit); + bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit); + bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit); + bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit); + bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit); + bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit); + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); + + // stage 11 + stage++; + bf0 = step; + bf1 = output; + bf1[0] = bf0[0]; + bf1[1] = bf0[32]; + bf1[2] = bf0[16]; + bf1[3] = bf0[48]; + bf1[4] = bf0[8]; + bf1[5] = bf0[40]; + bf1[6] = bf0[24]; + bf1[7] = bf0[56]; + bf1[8] = bf0[4]; + bf1[9] = bf0[36]; + bf1[10] = bf0[20]; + bf1[11] = bf0[52]; + bf1[12] = bf0[12]; + bf1[13] = bf0[44]; + bf1[14] = bf0[28]; + bf1[15] = bf0[60]; + bf1[16] = bf0[2]; + bf1[17] = bf0[34]; + bf1[18] = bf0[18]; + bf1[19] = bf0[50]; + bf1[20] = bf0[10]; + bf1[21] = bf0[42]; + bf1[22] = bf0[26]; + bf1[23] = bf0[58]; + bf1[24] = bf0[6]; + bf1[25] = bf0[38]; + bf1[26] = bf0[22]; + bf1[27] = bf0[54]; + bf1[28] = bf0[14]; + bf1[29] = bf0[46]; + bf1[30] = bf0[30]; + bf1[31] = bf0[62]; + bf1[32] = 
bf0[1]; + bf1[33] = bf0[33]; + bf1[34] = bf0[17]; + bf1[35] = bf0[49]; + bf1[36] = bf0[9]; + bf1[37] = bf0[41]; + bf1[38] = bf0[25]; + bf1[39] = bf0[57]; + bf1[40] = bf0[5]; + bf1[41] = bf0[37]; + bf1[42] = bf0[21]; + bf1[43] = bf0[53]; + bf1[44] = bf0[13]; + bf1[45] = bf0[45]; + bf1[46] = bf0[29]; + bf1[47] = bf0[61]; + bf1[48] = bf0[3]; + bf1[49] = bf0[35]; + bf1[50] = bf0[19]; + bf1[51] = bf0[51]; + bf1[52] = bf0[11]; + bf1[53] = bf0[43]; + bf1[54] = bf0[27]; + bf1[55] = bf0[59]; + bf1[56] = bf0[7]; + bf1[57] = bf0[39]; + bf1[58] = bf0[23]; + bf1[59] = bf0[55]; + bf1[60] = bf0[15]; + bf1[61] = bf0[47]; + bf1[62] = bf0[31]; + bf1[63] = bf0[63]; + av1_range_check_buf(stage, input, bf1, size, stage_range[stage]); +} diff --git a/libs/libaom/src/av1/encoder/av1_fwd_txfm1d.h b/libs/libaom/src/av1/encoder/av1_fwd_txfm1d.h new file mode 100644 index 000000000..9ef54fe4d --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_fwd_txfm1d.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ +#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ + +#include "av1/common/av1_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fdct4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct32(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fdct64(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst4(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst8(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fadst16(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit, + const int8_t *stage_range); +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_ diff --git a/libs/libaom/src/av1/encoder/av1_fwd_txfm1d_cfg.h b/libs/libaom/src/av1/encoder/av1_fwd_txfm1d_cfg.h new file mode 100644 index 000000000..2777cc25b --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_fwd_txfm1d_cfg.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ +#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ +#include "av1/common/enums.h" +#include "av1/encoder/av1_fwd_txfm1d.h" +extern const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL]; +extern const int8_t av1_fwd_cos_bit_col[5][5]; +extern const int8_t av1_fwd_cos_bit_row[5][5]; +#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_ diff --git a/libs/libaom/src/av1/encoder/av1_fwd_txfm2d.c b/libs/libaom/src/av1/encoder/av1_fwd_txfm2d.c new file mode 100644 index 000000000..bcb829d79 --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_fwd_txfm2d.c @@ -0,0 +1,429 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/txfm_common.h" +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/av1_fwd_txfm1d.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" + +static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT4: return av1_fdct4; + case TXFM_TYPE_DCT8: return av1_fdct8; + case TXFM_TYPE_DCT16: return av1_fdct16; + case TXFM_TYPE_DCT32: return av1_fdct32; + case TXFM_TYPE_DCT64: return av1_fdct64; + case TXFM_TYPE_ADST4: return av1_fadst4; + case TXFM_TYPE_ADST8: return av1_fadst8; + case TXFM_TYPE_ADST16: return av1_fadst16; + case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c; + case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c; + case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c; + case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c; + default: assert(0); return NULL; + } +} + +void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, + const TXFM_2D_FLIP_CFG *cfg, int bd) { + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) { + stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1; + } + + // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning + for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) { + stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1; + } +} + +static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output, + const int stride, const TXFM_2D_FLIP_CFG *cfg, + int32_t *buf, int bd) { + int c, r; + // Note when assigning txfm_size_col, we use the txfm_size from the + // row configuration and vice versa. This is intentionally done to + // accurately perform rectangular transforms.
When the transform is + // rectangular, the number of columns will be the same as the + // txfm_size stored in the row cfg struct. It will make no difference + // for square transforms. + const int txfm_size_col = tx_size_wide[cfg->tx_size]; + const int txfm_size_row = tx_size_high[cfg->tx_size]; + // Take the shift from the larger dimension in the rectangular case. + const int8_t *shift = cfg->shift; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM); + assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM); + av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd); + + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + // use output buffer as temp buffer + int32_t *temp_in = output; + int32_t *temp_out = output + txfm_size_row; + + // Columns + for (c = 0; c < txfm_size_col; ++c) { + if (cfg->ud_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c]; + } else { + for (r = 0; r < txfm_size_row; ++r) + // flip upside down + temp_in[r] = input[(txfm_size_row - r - 1) * stride + c]; + } + av1_round_shift_array(temp_in, txfm_size_row, -shift[0]); + txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col); + av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); + if (cfg->lr_flip == 0) { + for (r = 0; r < txfm_size_row; ++r) + buf[r * txfm_size_col + c] = temp_out[r]; + } else { + for (r = 0; r < txfm_size_row; ++r) + // flip from left to right + buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r]; + } + } + + // Rows + for (r = 0; r < txfm_size_row; ++r) { + txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col, + cos_bit_row, stage_range_row); + av1_round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]); + if (abs(rect_type) == 1) { + // Multiply everything by Sqrt2 if the transform is rectangular and the + // size difference is a factor of 2. 
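
Two clarifying notes on the rectangular handling (constants as defined in av1/common/av1_txfm.h). First, the col/row swap described at the top of fwd_txfm2d_c() means that, e.g., TX_4X8 (4 samples wide, 8 tall) runs a length-8 transform down each of its 4 columns and then a length-4 transform across each of its 8 rows. Second, NewSqrt2Bits is 12 and NewSqrt2 is 5793 = round(sqrt(2) * 2^12), so the loop below computes (v * 5793 + 2048) >> 12 per coefficient; e.g. v = 1000 becomes (5793000 + 2048) >> 12 = 1414, about 1000 * sqrt(2), which keeps the overall transform scale at a power of two when the two 1-D lengths differ by a factor of 2.
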
+ for (c = 0; c < txfm_size_col; ++c) { + output[r * txfm_size_col + c] = round_shift( + (int64_t)output[r * txfm_size_col + c] * NewSqrt2, NewSqrt2Bits); + } + } + } +} + +void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[8 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[4 * 4]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[8 * 8]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void 
av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[16 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[32 * 32]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); +} + +void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 64]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + + // Zero out top-right 32x32 area. + for (int row = 0; row < 32; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Zero out the bottom 64x32 area. + memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output)); + // Re-pack non-zero coeffs in the first 32x32 indices. + for (int row = 1; row < 32; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out the bottom 32x32 area. + memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output)); + // Note: no repacking needed here. +} + +void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 32]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + + // Zero out right 32x32 area. + for (int row = 0; row < 32; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Re-pack non-zero coeffs in the first 32x32 indices. + for (int row = 1; row < 32; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out the bottom 16x32 area. + memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); + // Note: no repacking needed here. +} + +void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd) { + int32_t txfm_buf[64 * 16]; + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg); + fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd); + // Zero out right 32x16 area. + for (int row = 0; row < 16; ++row) { + memset(output + row * 64 + 32, 0, 32 * sizeof(*output)); + } + // Re-pack non-zero coeffs in the first 32x16 indices. 
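
Why the in-place re-pack in these 64-wide cases is safe to run top-down: for each row r >= 1 the destination span [32r, 32r + 32) ends exactly where the source span [64r, 64r + 32) begins, and row 0 already sits at offset 0, so no memcpy() call reads data that an earlier iteration has overwritten.
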
+ for (int row = 1; row < 16; ++row) { + memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output)); + } +} + +static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 }; +static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 }; +static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 }; +static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 }; +static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 }; +static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 }; +static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 }; +static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 }; +static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 }; + +const int8_t *av1_fwd_txfm_shift_ls[TX_SIZES_ALL] = { + fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32, + fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16, + fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64, + fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32, + fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16, +}; + +const int8_t av1_fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { + { 13, 13, 13, 0, 0 }, + { 13, 13, 13, 12, 0 }, + { 13, 13, 13, 12, 13 }, + { 0, 13, 13, 12, 13 }, + { 0, 0, 13, 12, 13 } + }; + +const int8_t av1_fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { + { 13, 13, 12, 0, 0 }, + { 13, 13, 13, 12, 0 }, + { 13, 13, 12, 13, 12 }, + { 0, 12, 13, 12, 11 }, + { 0, 0, 12, 11, 10 } + }; + +static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 }; +static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 }; +static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 }; +static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 }; +static const int8_t fdct64_range_mult2[12] = { 0, 2, 4, 6, 8, 10, + 11, 11, 11, 11, 11, 11 }; + +static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 }; +static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 }; +static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 }; + +static const int8_t fidtx4_range_mult2[1] = { 1 }; +static const int8_t fidtx8_range_mult2[1] = { 2 }; +static const int8_t fidtx16_range_mult2[1] = { 3 }; +static const int8_t fidtx32_range_mult2[1] = { 4 }; + +#if 0 +const int8_t fwd_idtx_range_row[MAX_TXWH_IDX /*txw_idx*/] + [MAX_TXWH_IDX /*txh_idx*/] = { { 2, 4, 5, 0, 0 }, + { 3, 4, 5, 6, 0 }, + { 4, 5, 6, 7, 8 }, + { 0, 5, 6, 7, 8 }, + { 0, 0, 7, 8, + 9 } }; +#endif + +static const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = { + fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2, + fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2, + fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2, + fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2 +}; + +static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) { + av1_zero(cfg->stage_range_col); + av1_zero(cfg->stage_range_row); + + const int8_t *range_mult2_col = 
fwd_txfm_range_mult2_list[cfg->txfm_type_col]; + if (cfg->txfm_type_col != TXFM_TYPE_INVALID) { + int stage_num_col = cfg->stage_num_col; + for (int i = 0; i < stage_num_col; ++i) + cfg->stage_range_col[i] = (range_mult2_col[i] + 1) >> 1; + } + + if (cfg->txfm_type_row != TXFM_TYPE_INVALID) { + int stage_num_row = cfg->stage_num_row; + const int8_t *range_mult2_row = + fwd_txfm_range_mult2_list[cfg->txfm_type_row]; + for (int i = 0; i < stage_num_row; ++i) { + cfg->stage_range_row[i] = + (range_mult2_col[cfg->stage_num_col - 1] + range_mult2_row[i] + 1) >> + 1; + } + } +} + +void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size, + TXFM_2D_FLIP_CFG *cfg) { + assert(cfg != NULL); + cfg->tx_size = tx_size; + set_flip_cfg(tx_type, cfg); + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + cfg->shift = av1_fwd_txfm_shift_ls[tx_size]; + cfg->cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + cfg->cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col]; + cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row]; + cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col]; + cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row]; + set_fwd_txfm_non_scale_range(cfg); +} diff --git a/libs/libaom/src/av1/encoder/av1_multi_thread.c b/libs/libaom/src/av1/encoder/av1_multi_thread.c new file mode 100644 index 000000000..d170b0c28 --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_multi_thread.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
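
Backing up to the range tables just above: set_fwd_txfm_non_scale_range() halves the doubled per-stage growth with (x + 1) >> 1, the *_mult2 tables storing range growth in half-bit steps (each value is twice the growth in bits). For an 8-point DCT column pass, fdct8_range_mult2[] = { 0, 2, 4, 5, 5, 5 } yields stage ranges { 0, 1, 2, 3, 3, 3 } extra bits, and each row-pass entry folds in the column pass's final growth before halving, so an 8x8 DCT row pass gets (5 + { 0, 2, 4, 5, 5, 5 } + 1) >> 1 = { 3, 4, 5, 5, 5, 5 }.
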
+ */ + +#include + +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/av1_multi_thread.h" + +void av1_row_mt_mem_alloc(AV1_COMP *cpi, int max_sb_rows) { + struct AV1Common *cm = &cpi->common; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_row, tile_col; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + multi_thread_ctxt->allocated_tile_cols = tile_cols; + multi_thread_ctxt->allocated_tile_rows = tile_rows; + multi_thread_ctxt->allocated_sb_rows = max_sb_rows; + + // Allocate memory for row based multi-threading + for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows; + tile_row++) { + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = + &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + + tile_col]; + av1_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_sb_rows); + if (cpi->oxcf.cdf_update_mode) + CHECK_MEM_ERROR( + cm, this_tile->row_ctx, + (FRAME_CONTEXT *)aom_memalign( + 16, + AOMMAX(1, (av1_get_sb_cols_in_tile(cm, this_tile->tile_info) - + 1)) * + sizeof(*this_tile->row_ctx))); + } + } +} + +void av1_row_mt_mem_dealloc(AV1_COMP *cpi) { + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_col; + int tile_row; + + // Free row based multi-threading sync memory + for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows; + tile_row++) { + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = + &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + + tile_col]; + av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); + if (cpi->oxcf.cdf_update_mode) aom_free(this_tile->row_ctx); + } + } + multi_thread_ctxt->allocated_sb_rows = 0; + multi_thread_ctxt->allocated_tile_cols = 0; + multi_thread_ctxt->allocated_tile_rows = 0; +} diff --git a/libs/libaom/src/av1/encoder/av1_multi_thread.h b/libs/libaom/src/av1/encoder/av1_multi_thread.h new file mode 100644 index 000000000..2a1cc7d6d --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_multi_thread.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AV1_ENCODER_AV1_MULTI_THREAD_H +#define AV1_ENCODER_AV1_MULTI_THREAD_H + +#include "av1/encoder/encoder.h" + +void av1_row_mt_mem_alloc(AV1_COMP *cpi, int max_sb_rows); + +void av1_row_mt_mem_dealloc(AV1_COMP *cpi); + +#endif // AV1_ENCODER_AV1_MULTI_THREAD_H diff --git a/libs/libaom/src/av1/encoder/av1_quantize.c b/libs/libaom/src/av1/encoder/av1_quantize.c new file mode 100644 index 000000000..569784a2a --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_quantize.c @@ -0,0 +1,789 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/idct.h" +#include "av1/common/quant_common.h" +#include "av1/common/scan.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rd.h" + +void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) { + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + *eob_ptr = 0; +} + +static void quantize_fp_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, int log_scale) { + int i, eob = -1; + const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale) }; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (qm_ptr == NULL && iqm_ptr == NULL) { + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]); + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32 = 0; + if ((abs_coeff << (1 + log_scale)) >= thresh) { + abs_coeff = + clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX); + tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); + if (tmp32) { + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = + (tmp32 * dequant_ptr[rc != 0]) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + } + } + if (tmp32) eob = i; + } + } else { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const qm_val_t wt = qm_ptr ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32 = 0; + if (abs_coeff * wt >= + (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { + abs_coeff += rounding[rc != 0]; + abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX); + tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; + } + + if (tmp32) eob = i; + } + } + *eob_ptr = eob + 1; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_quantize_fp_helper_c( + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, int log_scale) { + int i; + int eob = -1; + const int shift = 16 - log_scale; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)iscan; + + if (qm_ptr || iqm_ptr) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const int coeff_sign = AOMSIGN(coeff); + const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int abs_qcoeff = 0; + if (abs_coeff * wt >= + (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { + const int64_t tmp = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + abs_qcoeff = + (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = i; + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } else { + const int log_scaled_round_arr[2] = { + ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale), + }; + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int rc01 = (rc != 0); + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int log_scaled_round = log_scaled_round_arr[rc01]; + if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) { + const int quant = quant_ptr[rc01]; + const int dequant = dequant_ptr[rc01]; + const int64_t tmp = (int64_t)abs_coeff + log_scaled_round; + const int abs_qcoeff = (int)((tmp * quant) >> shift); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + if (abs_qcoeff) eob = i; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + } else { + 
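
The guard a few lines up, (abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01], is equivalent to abs_coeff >= dequant / 2^(1 + log_scale): a coefficient smaller than half a (scaled) dequantization step is expected to quantize to zero, so this branch writes the zeros directly instead of running the multiply-and-shift.
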
qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + } + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); +} + +void av1_quantize_lp_c(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan) { + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (int i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) eob = i; + } + *eob_ptr = eob + 1; +} + +void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); +} + +void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); +} + +void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + if (qm_ptr != NULL && iqm_ptr != NULL) { + quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, 
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 2: + av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + default: assert(0); + } + } +} + +void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + if (qparam->use_quant_b_adapt) { + // TODO(sarahparker) These quantize_b optimizations need SIMD + // implementations + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_quantize_b_adaptive_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX, + p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); + break; + case 1: + aom_quantize_b_32x32_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_quantize_b_64x64_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } + } else { + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 2: + aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + default: assert(0); + } + } + } +} + +static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp; + int eob = -1; + int32_t tmp32; + int dequant; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int iwt = iqm_ptr != NULL ? 
iqm_ptr[rc] : (1 << AOM_QM_BITS); + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (tmp32) eob = 0; + } + *eob_ptr = eob + 1; +} + +void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + (void)sc; + assert(qparam->log_scale >= 0 && qparam->log_scale < (3)); + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, + p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0], + eob_ptr, qm_ptr, iqm_ptr, qparam->log_scale); +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + if (qm_ptr != NULL && iqm_ptr != NULL) { + highbd_quantize_fp_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qparam->log_scale); + } +} + +void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + if (qparam->use_quant_b_adapt) { + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_highbd_quantize_b_adaptive_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_highbd_quantize_b_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 1: + aom_highbd_quantize_b_32x32_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_highbd_quantize_b_64x64_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } + } else { + if (qm_ptr != NULL && iqm_ptr != NULL) { + aom_highbd_quantize_b_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + 
p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + aom_highbd_quantize_b_32x32( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_highbd_quantize_b_64x64( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } + } +} + +static INLINE void highbd_quantize_dc( + const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) { + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[0]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + const int64_t tmpw = tmp * wt; + const int abs_qcoeff = + (int)((tmpw * quant) >> (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + const int dequant = + (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} + +void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam) { + // obsolete skip_block + const int skip_block = 0; + const qm_val_t *qm_ptr = qparam->qmatrix; + const qm_val_t *iqm_ptr = qparam->iqmatrix; + (void)sc; + + highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round_QTX, + p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX[0], eob_ptr, qm_ptr, iqm_ptr, + qparam->log_scale); +} + +void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + int log_scale) { + highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, + log_scale); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static void invert_quant(int16_t *quant, int16_t *shift, int d) { + uint32_t t; + int l, m; + t = d; + for (l = 0; t > 1; l++) t >>= 1; + m = 1 + (1 << (16 + l)) / d; + *quant = (int16_t)(m - (1 << 16)); + *shift = 1 << (16 - l); 
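+  // Worked example (values chosen for illustration): for d = 12 the loop
+  // leaves l = 3, so m = 1 + (1 << 19) / 12 = 43691, giving
+  // *quant = (int16_t)(43691 - 65536) = -21845 and *shift = 1 << 13 = 8192.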
+} + +static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) { + const int quant = av1_dc_quant_QTX(q, 0, bit_depth); + switch (bit_depth) { + case AOM_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); + case AOM_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); + case AOM_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80); + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +} + +void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, + int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, + int v_ac_delta_q, QUANTS *const quants, + Dequants *const deq) { + int i, q, quant_QTX; + + for (q = 0; q < QINDEX_RANGE; q++) { + const int qzbin_factor = get_qzbin_factor(q, bit_depth); + const int qrounding_factor = q == 0 ? 64 : 48; + + for (i = 0; i < 2; ++i) { + int qrounding_factor_fp = 64; + // y quantizer with TX scale + quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, 0, bit_depth); + invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], + quant_QTX); + quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->y_dequant_QTX[q][i] = quant_QTX; + + // u quantizer with TX scale + quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth) + : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth); + invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i], + quant_QTX); + quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX; + quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7; + quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7); + quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7; + deq->u_dequant_QTX[q][i] = quant_QTX; + + // v quantizer with TX scale + quant_QTX = i == 0 ? 
av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth)
+                         : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth);
+      invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i],
+                   quant_QTX);
+      quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX;
+      quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+      quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+      quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+      deq->v_dequant_QTX[q][i] = quant_QTX;
+    }
+
+    for (i = 2; i < 8; i++) {  // 8: SIMD width
+      quants->y_quant[q][i] = quants->y_quant[q][1];
+      quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+      quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
+      quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+      quants->y_zbin[q][i] = quants->y_zbin[q][1];
+      quants->y_round[q][i] = quants->y_round[q][1];
+      deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1];
+
+      quants->u_quant[q][i] = quants->u_quant[q][1];
+      quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1];
+      quants->u_round_fp[q][i] = quants->u_round_fp[q][1];
+      quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1];
+      quants->u_zbin[q][i] = quants->u_zbin[q][1];
+      quants->u_round[q][i] = quants->u_round[q][1];
+      deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1];
+      quants->v_quant[q][i] = quants->v_quant[q][1];
+      quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1];
+      quants->v_round_fp[q][i] = quants->v_round_fp[q][1];
+      quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1];
+      quants->v_zbin[q][i] = quants->v_zbin[q][1];
+      quants->v_round[q][i] = quants->v_round[q][1];
+      deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1];
+    }
+  }
+}
+
+void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params,
+                        const CommonQuantParams *quant_params,
+                        aom_bit_depth_t bit_depth) {
+  QUANTS *const quants = &enc_quant_dequant_params->quants;
+  Dequants *const dequants = &enc_quant_dequant_params->dequants;
+  av1_build_quantizer(bit_depth, quant_params->y_dc_delta_q,
+                      quant_params->u_dc_delta_q, quant_params->u_ac_delta_q,
+                      quant_params->v_dc_delta_q, quant_params->v_ac_delta_q,
+                      quants, dequants);
+}
+
+void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
+                               int segment_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonQuantParams *const quant_params = &cm->quant_params;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const QUANTS *const quants = &cpi->enc_quant_dequant_params.quants;
+  const Dequants *const dequants = &cpi->enc_quant_dequant_params.dequants;
+
+  const int current_qindex =
+      AOMMAX(0, AOMMIN(QINDEX_RANGE - 1,
+                       cm->delta_q_info.delta_q_present_flag
+                           ? quant_params->base_qindex + xd->delta_qindex
+                           : quant_params->base_qindex));
+  const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
+  const int rdmult =
+      av1_compute_rd_mult(cpi, qindex + quant_params->y_dc_delta_q);
+  const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id);
+
+  // Y
+  const int qmlevel_y =
+      use_qmatrix ?
quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1; + x->plane[0].quant_QTX = quants->y_quant[qindex]; + x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex]; + x->plane[0].round_fp_QTX = quants->y_round_fp[qindex]; + x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex]; + x->plane[0].zbin_QTX = quants->y_zbin[qindex]; + x->plane[0].round_QTX = quants->y_round[qindex]; + x->plane[0].dequant_QTX = dequants->y_dequant_QTX[qindex]; + memcpy(&xd->plane[0].seg_qmatrix[segment_id], + quant_params->gqmatrix[qmlevel_y][0], + sizeof(quant_params->gqmatrix[qmlevel_y][0])); + memcpy(&xd->plane[0].seg_iqmatrix[segment_id], + quant_params->giqmatrix[qmlevel_y][0], + sizeof(quant_params->giqmatrix[qmlevel_y][0])); + + // U + const int qmlevel_u = + use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1; + x->plane[1].quant_QTX = quants->u_quant[qindex]; + x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex]; + x->plane[1].round_fp_QTX = quants->u_round_fp[qindex]; + x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex]; + x->plane[1].zbin_QTX = quants->u_zbin[qindex]; + x->plane[1].round_QTX = quants->u_round[qindex]; + x->plane[1].dequant_QTX = dequants->u_dequant_QTX[qindex]; + memcpy(&xd->plane[1].seg_qmatrix[segment_id], + quant_params->gqmatrix[qmlevel_u][1], + sizeof(quant_params->gqmatrix[qmlevel_u][1])); + memcpy(&xd->plane[1].seg_iqmatrix[segment_id], + quant_params->giqmatrix[qmlevel_u][1], + sizeof(quant_params->giqmatrix[qmlevel_u][1])); + // V + const int qmlevel_v = + use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1; + x->plane[2].quant_QTX = quants->v_quant[qindex]; + x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex]; + x->plane[2].round_fp_QTX = quants->v_round_fp[qindex]; + x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex]; + x->plane[2].zbin_QTX = quants->v_zbin[qindex]; + x->plane[2].round_QTX = quants->v_round[qindex]; + x->plane[2].dequant_QTX = dequants->v_dequant_QTX[qindex]; + memcpy(&xd->plane[2].seg_qmatrix[segment_id], + quant_params->gqmatrix[qmlevel_v][2], + sizeof(quant_params->gqmatrix[qmlevel_v][2])); + memcpy(&xd->plane[2].seg_iqmatrix[segment_id], + quant_params->giqmatrix[qmlevel_v][2], + sizeof(quant_params->giqmatrix[qmlevel_v][2])); + x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); + x->qindex = qindex; + + set_error_per_bit(x, rdmult); + + av1_initialize_me_consts(cpi, x, qindex); +} + +void av1_frame_init_quantizer(AV1_COMP *cpi) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id); +} + +void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel, + int q) { + // quantizer has to be reinitialized with av1_init_quantizer() if any + // delta_q changes. 
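+  // Note: the u/v qmatrix levels below are derived from base_qindex plus the
+  // per-plane AC deltas, which this function currently forces to zero.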
+ CommonQuantParams *quant_params = &cm->quant_params; + quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q); + quant_params->y_dc_delta_q = 0; + quant_params->u_dc_delta_q = 0; + quant_params->u_ac_delta_q = 0; + quant_params->v_dc_delta_q = 0; + quant_params->v_ac_delta_q = 0; + quant_params->qmatrix_level_y = + aom_get_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel); + quant_params->qmatrix_level_u = + aom_get_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q, + min_qmlevel, max_qmlevel); + + if (!cm->seq_params.separate_uv_delta_q) + quant_params->qmatrix_level_v = quant_params->qmatrix_level_u; + else + quant_params->qmatrix_level_v = + aom_get_qmlevel(quant_params->base_qindex + quant_params->v_ac_delta_q, + min_qmlevel, max_qmlevel); +} + +// Table that converts 0-63 Q-range values passed in outside to the Qindex +// range used internally. +static const int quantizer_to_qindex[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, + 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100, + 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, + 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204, + 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255, +}; + +int av1_quantizer_to_qindex(int quantizer) { + return quantizer_to_qindex[quantizer]; +} + +int av1_qindex_to_quantizer(int qindex) { + int quantizer; + + for (quantizer = 0; quantizer < 64; ++quantizer) + if (quantizer_to_qindex[quantizer] >= qindex) return quantizer; + + return 63; +} diff --git a/libs/libaom/src/av1/encoder/av1_quantize.h b/libs/libaom/src/av1/encoder/av1_quantize.h new file mode 100644 index 000000000..40fb4bee8 --- /dev/null +++ b/libs/libaom/src/av1/encoder/av1_quantize.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_ +#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_ + +#include "config/aom_config.h" + +#include "av1/common/quant_common.h" +#include "av1/common/scan.h" +#include "av1/encoder/block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define EOB_FACTOR 325 +#define SKIP_EOB_FACTOR_ADJUST 200 + +typedef struct QUANT_PARAM { + int log_scale; + TX_SIZE tx_size; + const qm_val_t *qmatrix; + const qm_val_t *iqmatrix; + int use_quant_b_adapt; + int use_optimize_b; + int xform_quant_idx; +} QUANT_PARAM; + +typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +// The QUANTS structure is used only for internal quantizer setup in +// av1_quantize.c. +// All of its fields use the same coefficient shift/scaling at TX. 
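+// Each [QINDEX_RANGE][8] row is laid out as { dc, ac, ac, ... }: entry 0 is
+// the DC value, entry 1 the AC value, and av1_build_quantizer() replicates
+// the AC value into entries 2..7 so SIMD quantizers can load a full vector
+// of multipliers with a single aligned read.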
+typedef struct { + // 0: dc 1: ac 2-8: ac repeated to SIMD width + DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]); + + // TODO(jingning): in progress of re-working the quantization. will decide + // if we want to deprecate the current use of y_quant. + DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]); + + DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]); + DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]); +} QUANTS; + +// The Dequants structure is used only for internal quantizer setup in +// av1_quantize.c. +// Fields are suffixed according to whether or not they're expressed in +// the same coefficient shift/precision as TX or a fixed Q3 format. +typedef struct { + DECLARE_ALIGNED(16, int16_t, + y_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, + u_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width + DECLARE_ALIGNED(16, int16_t, + v_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width +} Dequants; + +typedef struct { + // Quantization parameters for internal quantizer setup. + QUANTS quants; + // Dequantization parameters for internal quantizer setup. 
+ Dequants dequants; +} EncQuantDequantParams; + +struct AV1_COMP; +struct AV1Common; + +void av1_frame_init_quantizer(struct AV1_COMP *cpi); + +void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x, + int segment_id); + +void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q, + int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q, + int v_ac_delta_q, QUANTS *const quants, + Dequants *const deq); + +void av1_init_quantizer(EncQuantDequantParams *const enc_quant_dequant_params, + const CommonQuantParams *quant_params, + aom_bit_depth_t bit_depth); + +void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel, + int max_qmlevel, int q); + +int av1_quantizer_to_qindex(int quantizer); + +int av1_qindex_to_quantizer(int qindex); + +void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr); + +void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, const QUANT_PARAM *qparam); + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); + +void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, const MACROBLOCK_PLANE *p, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, + const SCAN_ORDER *sc, + const QUANT_PARAM *qparam); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_ diff --git a/libs/libaom/src/av1/encoder/bitstream.c b/libs/libaom/src/av1/encoder/bitstream.c new file mode 100644 index 000000000..daa8ce1fc --- /dev/null +++ b/libs/libaom/src/av1/encoder/bitstream.c @@ -0,0 +1,3925 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_ports/system_state.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG
+
+#include "av1/common/cdef.h"
+#include "av1/common/cfl.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+
+#define ENC_MISMATCH_DEBUG 0
+
+static INLINE void write_uniform(aom_writer *w, int n, int v) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (l == 0) return;
+  if (v < m) {
+    aom_write_literal(w, v, l - 1);
+  } else {
+    aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+    aom_write_literal(w, (v - m) & 1, 1);
+  }
+}
+
+static AOM_INLINE void loop_restoration_write_sb_coeffs(
+    const AV1_COMMON *const cm, MACROBLOCKD *xd, const RestorationUnitInfo *rui,
+    aom_writer *const w, int plane, FRAME_COUNTS *counts);
+
+static AOM_INLINE void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,
+                                             const MB_MODE_INFO *mi,
+                                             const MB_MODE_INFO *above_mi,
+                                             const MB_MODE_INFO *left_mi,
+                                             PREDICTION_MODE mode,
+                                             aom_writer *w) {
+  assert(!is_intrabc_block(mi));
+  (void)mi;
+  aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi),
+                   INTRA_MODES);
+}
+
+static AOM_INLINE void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,
+                                        FRAME_CONTEXT *ec_ctx,
+                                        const int16_t mode_ctx) {
+  const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+
+  aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2);
+
+  if (mode != NEWMV) {
+    const int16_t zeromv_ctx =
+        (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+    aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2);
+
+    if (mode != GLOBALMV) {
+      int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+      aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2);
+    }
+  }
+}
+
+static AOM_INLINE void write_drl_idx(
+    FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,
+    const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) {
+  assert(mbmi->ref_mv_idx < 3);
+
+  const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
+  if (new_mv) {
+    int idx;
+    for (idx = 0; idx < 2; ++idx) {
+      if (mbmi_ext_frame->ref_mv_count > idx + 1) {
+        uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx);
+
+        aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx],
+                         2);
+        if (mbmi->ref_mv_idx == idx) return;
+      }
+    }
+    return;
+  }
+
+  if (have_nearmv_in_inter_mode(mbmi->mode)) {
+    int idx;
+    // TODO(jingning): Temporary solution to compensate the NEARESTMV offset.
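+    // Illustrative trace (assumed values): with ref_mv_count = 4 and
+    // ref_mv_idx = 1, the loop below writes a 1 at idx = 1 and a 0 at
+    // idx = 2, telling the decoder to stop at the second NEARMV candidate.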
+ for (idx = 1; idx < 3; ++idx) { + if (mbmi_ext_frame->ref_mv_count > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext_frame->weight, idx); + aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1), + ec_ctx->drl_cdf[drl_ctx], 2); + if (mbmi->ref_mv_idx == (idx - 1)) return; + } + } + return; + } +} + +static AOM_INLINE void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w, + PREDICTION_MODE mode, + const int16_t mode_ctx) { + assert(is_inter_compound_mode(mode)); + aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode), + xd->tile_ctx->inter_compound_mode_cdf[mode_ctx], + INTER_COMPOUND_MODES); +} + +static AOM_INLINE void write_tx_size_vartx(MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, + TX_SIZE tx_size, int depth, + int blk_row, int blk_col, + aom_writer *w) { + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; + const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); + const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + if (depth == MAX_VARTX_DEPTH) { + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + return; + } + + const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, + mbmi->sb_type, tx_size); + const int txb_size_index = + av1_get_txb_size_index(mbmi->sb_type, blk_row, blk_col); + const int write_txfm_partition = + tx_size == mbmi->inter_tx_size[txb_size_index]; + if (write_txfm_partition) { + aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2); + + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, tx_size, tx_size); + // TODO(yuec): set correct txfm partition update for qttx + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + + aom_write_symbol(w, 1, ec_ctx->txfm_partition_cdf[ctx], 2); + + if (sub_txs == TX_4X4) { + txfm_partition_update(xd->above_txfm_context + blk_col, + xd->left_txfm_context + blk_row, sub_txs, tx_size); + return; + } + + assert(bsw > 0 && bsh > 0); + for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) + for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) { + int offsetr = blk_row + row; + int offsetc = blk_col + col; + write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w); + } + } +} + +static AOM_INLINE void write_selected_tx_size(const MACROBLOCKD *xd, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + if (block_signals_txsize(bsize)) { + const TX_SIZE tx_size = mbmi->tx_size; + const int tx_size_ctx = get_tx_size_context(xd); + const int depth = tx_size_to_depth(tx_size, bsize); + const int max_depths = bsize_to_max_depth(bsize); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + + assert(depth >= 0 && depth <= max_depths); + assert(!is_inter_block(mbmi)); + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi))); + + aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], + max_depths + 1); + } +} + +static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, const MB_MODE_INFO *mi, aom_writer *w) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 1; + } else { + const int skip = mi->skip; + const int ctx = av1_get_skip_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + 
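+    // The skip flag costs one binary symbol coded against a context-selected
+    // CDF; segments with SEG_LVL_SKIP active return 1 above without spending
+    // any bits.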
aom_write_symbol(w, skip, ec_ctx->skip_cdfs[ctx], 2); + return skip; + } +} + +static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd, + int segment_id, const MB_MODE_INFO *mi, + aom_writer *w) { + if (!cm->current_frame.skip_mode_info.skip_mode_flag) return 0; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { + return 0; + } + const int skip_mode = mi->skip_mode; + if (!is_comp_ref_allowed(mi->sb_type)) { + assert(!skip_mode); + return 0; + } + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + // These features imply single-reference mode, while skip mode implies + // compound reference. Hence, the two are mutually exclusive. + // In other words, skip_mode is implicitly 0 here. + assert(!skip_mode); + return 0; + } + const int ctx = av1_get_skip_mode_context(xd); + aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2); + return skip_mode; +} + +static AOM_INLINE void write_is_inter(const AV1_COMMON *cm, + const MACROBLOCKD *xd, int segment_id, + aom_writer *w, const int is_inter) { + if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + assert(is_inter); + return; + } + const int ctx = av1_get_intra_inter_context(xd); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2); + } +} + +static AOM_INLINE void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi, + aom_writer *w) { + MOTION_MODE last_motion_mode_allowed = + cm->features.switchable_motion_mode + ? motion_mode_allowed(cm->global_motion, xd, mbmi, + cm->features.allow_warped_motion) + : SIMPLE_TRANSLATION; + assert(mbmi->motion_mode <= last_motion_mode_allowed); + switch (last_motion_mode_allowed) { + case SIMPLE_TRANSLATION: break; + case OBMC_CAUSAL: + aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL, + xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2); + break; + default: + aom_write_symbol(w, mbmi->motion_mode, + xd->tile_ctx->motion_mode_cdf[mbmi->sb_type], + MOTION_MODES); + } +} + +static AOM_INLINE void write_delta_qindex(const MACROBLOCKD *xd, + int delta_qindex, aom_writer *w) { + int sign = delta_qindex < 0; + int abs = sign ? -delta_qindex : delta_qindex; + int rem_bits, thr; + int smallval = abs < DELTA_Q_SMALL ? 1 : 0; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf, + DELTA_Q_PROBS + 1); + + if (!smallval) { + rem_bits = get_msb(abs - 1); + thr = (1 << rem_bits) + 1; + aom_write_literal(w, rem_bits - 1, 3); + aom_write_literal(w, abs - thr, rem_bits); + } + if (abs > 0) { + aom_write_bit(w, sign); + } +} + +static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm, + const MACROBLOCKD *xd, int lf_id, + int delta_lflevel, aom_writer *w) { + int sign = delta_lflevel < 0; + int abs = sign ? -delta_lflevel : delta_lflevel; + int rem_bits, thr; + int smallval = abs < DELTA_LF_SMALL ? 1 : 0; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (cm->delta_q_info.delta_lf_multi) { + assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT + : FRAME_LF_COUNT - 2)); + aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), + ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1); + } else { + aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf, + DELTA_LF_PROBS + 1); + } + + if (!smallval) { + rem_bits = get_msb(abs - 1); + thr = (1 << rem_bits) + 1; + aom_write_literal(w, rem_bits - 1, 3); + aom_write_literal(w, abs - thr, rem_bits); + } + if (abs > 0) { + aom_write_bit(w, sign); + } +} + +static AOM_INLINE void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp, + int n, int num) { + const TOKENEXTRA *p = *tp; + write_uniform(w, n, p->token); // The first color index. + ++p; + --num; + for (int i = 0; i < num; ++i) { + aom_write_symbol(w, p->token, p->color_map_cdf, n); + ++p; + } + *tp = p; +} + +static AOM_INLINE void pack_txb_tokens( + aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, const TOKENEXTRA **tp, + const TOKENEXTRA *const tok_end, MACROBLOCKD *xd, MB_MODE_INFO *mbmi, + int plane, BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth, int block, + int blk_row, int blk_col, TX_SIZE tx_size, TOKEN_STATS *token_stats) { + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE plane_tx_size = + plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y) + : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row, + blk_col)]; + + if (tx_size == plane_tx_size || plane) { + av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, block, tx_size); +#if CONFIG_RD_DEBUG + TOKEN_STATS tmp_token_stats; + init_token_stats(&tmp_token_stats); + token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost; + token_stats->cost += tmp_token_stats.cost; +#endif + } else { + const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int bsw = tx_size_wide_unit[sub_txs]; + const int bsh = tx_size_high_unit[sub_txs]; + const int step = bsh * bsw; + + assert(bsw > 0 && bsh > 0); + + for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) { + for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw) { + const int offsetr = blk_row + r; + const int offsetc = blk_col + c; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize, + bit_depth, block, offsetr, offsetc, sub_txs, + token_stats); + block += step; + } + } + } +} + +static INLINE void set_spatial_segment_id( + const CommonModeInfoParams *const mi_params, uint8_t *segment_ids, + BLOCK_SIZE bsize, int mi_row, int mi_col, int segment_id) { + const int mi_offset = mi_row * mi_params->mi_cols + mi_col; + const int bw = mi_size_wide[bsize]; + const int bh = mi_size_high[bsize]; + const int xmis = AOMMIN(mi_params->mi_cols - mi_col, bw); + const int ymis = AOMMIN(mi_params->mi_rows - mi_row, bh); + + for (int y = 0; y < ymis; ++y) { + for (int x = 0; x < xmis; ++x) { + segment_ids[mi_offset + y * mi_params->mi_cols + x] = segment_id; + } + } +} + +int av1_neg_interleave(int x, int ref, int max) { + assert(x < max); + const int diff = x - ref; + if (!ref) return x; + if (ref >= (max - 1)) return -x + max - 1; + if (2 * ref < max) { + if (abs(diff) <= ref) { + if (diff > 0) + return (diff << 1) - 1; + else + return ((-diff) << 1); + } + return x; + } else { + if (abs(diff) < (max - ref)) { + if 
(diff > 0) + return (diff << 1) - 1; + else + return ((-diff) << 1); + } + return (max - x) - 1; + } +} + +static AOM_INLINE void write_segment_id( + AV1_COMP *cpi, const MB_MODE_INFO *const mbmi, aom_writer *w, + const struct segmentation *seg, struct segmentation_probs *segp, int skip) { + if (!seg->enabled || !seg->update_map) return; + + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + int cdf_num; + const int pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + if (skip) { + // Still need to transmit tx size for intra blocks even if skip is + // true. Changing segment_id may make the tx size become invalid, e.g + // changing from lossless to lossy. + assert(is_inter_block(mbmi) || !cpi->enc_seg.has_lossless_segment); + + set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, + mbmi->sb_type, mi_row, mi_col, pred); + set_spatial_segment_id(&cm->mi_params, cpi->enc_seg.map, mbmi->sb_type, + mi_row, mi_col, pred); + /* mbmi is read only but we need to update segment_id */ + ((MB_MODE_INFO *)mbmi)->segment_id = pred; + return; + } + + const int coded_id = + av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1); + aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; + aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS); + set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, mbmi->sb_type, + mi_row, mi_col, mbmi->segment_id); +} + +#define WRITE_REF_BIT(bname, pname) \ + aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2) + +// This function encodes the reference frame +static AOM_INLINE void write_ref_frames(const AV1_COMMON *cm, + const MACROBLOCKD *xd, aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_compound = has_second_ref(mbmi); + const int segment_id = mbmi->segment_id; + + // If segment level coding of this signal is disabled... + // or the segment allows multiple reference frame options + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + assert(!is_compound); + assert(mbmi->ref_frame[0] == + get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)); + } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) || + segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) { + assert(!is_compound); + assert(mbmi->ref_frame[0] == LAST_FRAME); + } else { + // does the feature use compound prediction or not + // (if not specified at the frame/segment level) + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + if (is_comp_ref_allowed(mbmi->sb_type)) + aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2); + } else { + assert((!is_compound) == + (cm->current_frame.reference_mode == SINGLE_REFERENCE)); + } + + if (is_compound) { + const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) + ? 
UNIDIR_COMP_REFERENCE + : BIDIR_COMP_REFERENCE; + aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd), + 2); + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = mbmi->ref_frame[0] == BWDREF_FRAME; + WRITE_REF_BIT(bit, uni_comp_ref_p); + + if (!bit) { + assert(mbmi->ref_frame[0] == LAST_FRAME); + const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME || + mbmi->ref_frame[1] == GOLDEN_FRAME; + WRITE_REF_BIT(bit1, uni_comp_ref_p1); + if (bit1) { + const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, uni_comp_ref_p2); + } + } else { + assert(mbmi->ref_frame[1] == ALTREF_FRAME); + } + + return; + } + + assert(comp_ref_type == BIDIR_COMP_REFERENCE); + + const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME || + mbmi->ref_frame[0] == LAST3_FRAME); + WRITE_REF_BIT(bit, comp_ref_p); + + if (!bit) { + const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME; + WRITE_REF_BIT(bit1, comp_ref_p1); + } else { + const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME; + WRITE_REF_BIT(bit2, comp_ref_p2); + } + + const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME; + WRITE_REF_BIT(bit_bwd, comp_bwdref_p); + + if (!bit_bwd) { + WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1); + } + + } else { + const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME && + mbmi->ref_frame[0] >= BWDREF_FRAME); + WRITE_REF_BIT(bit0, single_ref_p1); + + if (bit0) { + const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME; + WRITE_REF_BIT(bit1, single_ref_p2); + + if (!bit1) { + WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6); + } + } else { + const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME || + mbmi->ref_frame[0] == GOLDEN_FRAME); + WRITE_REF_BIT(bit2, single_ref_p3); + + if (!bit2) { + const int bit3 = mbmi->ref_frame[0] != LAST_FRAME; + WRITE_REF_BIT(bit3, single_ref_p4); + } else { + const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME; + WRITE_REF_BIT(bit4, single_ref_p5); + } + } + } + } +} + +static AOM_INLINE void write_filter_intra_mode_info( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi, + aom_writer *w) { + if (av1_filter_intra_allowed(cm, mbmi)) { + aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra, + xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2); + if (mbmi->filter_intra_mode_info.use_filter_intra) { + const FILTER_INTRA_MODE mode = + mbmi->filter_intra_mode_info.filter_intra_mode; + aom_write_symbol(w, mode, xd->tile_ctx->filter_intra_mode_cdf, + FILTER_INTRA_MODES); + } + } +} + +static AOM_INLINE void write_angle_delta(aom_writer *w, int angle_delta, + aom_cdf_prob *cdf) { + aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf, + 2 * MAX_ANGLE_DELTA + 1); +} + +static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm, + const MACROBLOCKD *xd, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!av1_is_interp_needed(xd)) { + int_interpfilters filters = av1_broadcast_interp_filter( + av1_unswitchable_filter(cm->features.interp_filter)); + assert(mbmi->interp_filters.as_int == filters.as_int); + (void)filters; + return; + } + if (cm->features.interp_filter == SWITCHABLE) { + int dir; + for (dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = + av1_extract_interp_filter(mbmi->interp_filters, dir); + aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx], + SWITCHABLE_FILTERS); + ++cm->cur_frame->interp_filter_selected[filter]; + if 
(cm->seq_params.enable_dual_filter == 0) return; + } + } +} + +// Transmit color values with delta encoding. Write the first value as +// literal, and the deltas between each value and the previous one. "min_val" is +// the smallest possible value of the deltas. +static AOM_INLINE void delta_encode_palette_colors(const int *colors, int num, + int bit_depth, int min_val, + aom_writer *w) { + if (num <= 0) return; + assert(colors[0] < (1 << bit_depth)); + aom_write_literal(w, colors[0], bit_depth); + if (num == 1) return; + int max_delta = 0; + int deltas[PALETTE_MAX_SIZE]; + memset(deltas, 0, sizeof(deltas)); + for (int i = 1; i < num; ++i) { + assert(colors[i] < (1 << bit_depth)); + const int delta = colors[i] - colors[i - 1]; + deltas[i - 1] = delta; + assert(delta >= min_val); + if (delta > max_delta) max_delta = delta; + } + const int min_bits = bit_depth - 3; + int bits = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits); + assert(bits <= bit_depth); + int range = (1 << bit_depth) - colors[0] - min_val; + aom_write_literal(w, bits - min_bits, 2); + for (int i = 0; i < num - 1; ++i) { + aom_write_literal(w, deltas[i] - min_val, bits); + range -= deltas[i]; + bits = AOMMIN(bits, av1_ceil_log2(range)); + } +} + +// Transmit luma palette color values. First signal if each color in the color +// cache is used. Those colors that are not in the cache are transmitted with +// delta encoding. +static AOM_INLINE void write_palette_colors_y( + const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { + const int n = pmi->palette_size[0]; + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = + av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n, + cache_color_found, out_cache_colors); + int n_in_cache = 0; + for (int i = 0; i < n_cache && n_in_cache < n; ++i) { + const int found = cache_color_found[i]; + aom_write_bit(w, found); + n_in_cache += found; + } + assert(n_in_cache + n_out_cache == n); + delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 1, w); +} + +// Write chroma palette color values. U channel is handled similarly to the luma +// channel. For v channel, either use delta encoding or transmit raw values +// directly, whichever costs less. +static AOM_INLINE void write_palette_colors_uv( + const MACROBLOCKD *const xd, const PALETTE_MODE_INFO *const pmi, + int bit_depth, aom_writer *w) { + const int n = pmi->palette_size[1]; + const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE; + const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE; + // U channel colors. + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = av1_index_color_cache( + color_cache, n_cache, colors_u, n, cache_color_found, out_cache_colors); + int n_in_cache = 0; + for (int i = 0; i < n_cache && n_in_cache < n; ++i) { + const int found = cache_color_found[i]; + aom_write_bit(w, found); + n_in_cache += found; + } + delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 0, w); + + // V channel colors. Don't use color cache as the colors are not sorted. 
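+  // Illustrative cost check (assumed values): with bit_depth = 8, n = 4,
+  // bits_v = 3 and zero_count = 1, delta coding costs
+  // 2 + 8 + (3 + 1) * (4 - 1) - 1 = 21 bits versus 8 * 4 = 32 bits raw,
+  // so the delta path below wins.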
+ const int max_val = 1 << bit_depth; + int zero_count = 0, min_bits_v = 0; + int bits_v = + av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v); + const int rate_using_delta = + 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; + const int rate_using_raw = bit_depth * n; + if (rate_using_delta < rate_using_raw) { // delta encoding + assert(colors_v[0] < (1 << bit_depth)); + aom_write_bit(w, 1); + aom_write_literal(w, bits_v - min_bits_v, 2); + aom_write_literal(w, colors_v[0], bit_depth); + for (int i = 1; i < n; ++i) { + assert(colors_v[i] < (1 << bit_depth)); + if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit. + aom_write_literal(w, 0, bits_v); + continue; + } + const int delta = abs((int)colors_v[i] - colors_v[i - 1]); + const int sign_bit = colors_v[i] < colors_v[i - 1]; + if (delta <= max_val - delta) { + aom_write_literal(w, delta, bits_v); + aom_write_bit(w, sign_bit); + } else { + aom_write_literal(w, max_val - delta, bits_v); + aom_write_bit(w, !sign_bit); + } + } + } else { // Transmit raw values. + aom_write_bit(w, 0); + for (int i = 0; i < n; ++i) { + assert(colors_v[i] < (1 << bit_depth)); + aom_write_literal(w, colors_v[i], bit_depth); + } + } +} + +static AOM_INLINE void write_palette_mode_info(const AV1_COMMON *cm, + const MACROBLOCKD *xd, + const MB_MODE_INFO *const mbmi, + aom_writer *w) { + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE bsize = mbmi->sb_type; + assert(av1_allow_palette(cm->features.allow_screen_content_tools, bsize)); + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + + if (mbmi->mode == DC_PRED) { + const int n = pmi->palette_size[0]; + const int palette_y_mode_ctx = av1_get_palette_mode_ctx(xd); + aom_write_symbol( + w, n > 0, + xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_y_mode_ctx], 2); + if (n > 0) { + aom_write_symbol(w, n - PALETTE_MIN_SIZE, + xd->tile_ctx->palette_y_size_cdf[bsize_ctx], + PALETTE_SIZES); + write_palette_colors_y(xd, pmi, cm->seq_params.bit_depth, w); + } + } + + const int uv_dc_pred = + num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref; + if (uv_dc_pred) { + const int n = pmi->palette_size[1]; + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + aom_write_symbol(w, n > 0, + xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2); + if (n > 0) { + aom_write_symbol(w, n - PALETTE_MIN_SIZE, + xd->tile_ctx->palette_uv_size_cdf[bsize_ctx], + PALETTE_SIZES); + write_palette_colors_uv(xd, pmi, cm->seq_params.bit_depth, w); + } + } +} + +void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, + TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w) { + MB_MODE_INFO *mbmi = xd->mi[0]; + const FeatureFlags *const features = &cm->features; + const int is_inter = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, is_inter, features->reduced_tx_set_used) > 1 && + ((!cm->seg.enabled && cm->quant_params.base_qindex > 0) || + (cm->seg.enabled && xd->qindex[mbmi->segment_id] > 0)) && + !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + const TxSetType tx_set_type = av1_get_ext_tx_set_type( + tx_size, is_inter, features->reduced_tx_set_used); + const int eset = + get_ext_tx_set(tx_size, is_inter, features->reduced_tx_set_used); + // eset == 0 should correspond to a set with only DCT_DCT and there + // is no need to send the tx_type + 
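+    // A tx_type symbol is therefore only coded when the active set offers a
+    // real choice; what is written below is the index of tx_type within that
+    // set, not the raw TX_TYPE value.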
assert(eset > 0); + assert(av1_ext_tx_used[tx_set_type][tx_type]); + if (is_inter) { + aom_write_symbol(w, av1_ext_tx_ind[tx_set_type][tx_type], + ec_ctx->inter_ext_tx_cdf[eset][square_tx_size], + av1_num_ext_tx_set[tx_set_type]); + } else { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = + fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]; + else + intra_dir = mbmi->mode; + aom_write_symbol( + w, av1_ext_tx_ind[tx_set_type][tx_type], + ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir], + av1_num_ext_tx_set[tx_set_type]); + } + } +} + +static AOM_INLINE void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, + BLOCK_SIZE bsize, + PREDICTION_MODE mode, + aom_writer *w) { + aom_write_symbol(w, mode, frame_ctx->y_mode_cdf[size_group_lookup[bsize]], + INTRA_MODES); +} + +static AOM_INLINE void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx, + UV_PREDICTION_MODE uv_mode, + PREDICTION_MODE y_mode, + CFL_ALLOWED_TYPE cfl_allowed, + aom_writer *w) { + aom_write_symbol(w, uv_mode, frame_ctx->uv_mode_cdf[cfl_allowed][y_mode], + UV_INTRA_MODES - !cfl_allowed); +} + +static AOM_INLINE void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, + uint8_t idx, int8_t joint_sign, + aom_writer *w) { + aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS); + // Magnitudes are only signaled for nonzero codes. + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + aom_write_symbol(w, CFL_IDX_U(idx), cdf_u, CFL_ALPHABET_SIZE); + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + aom_write_symbol(w, CFL_IDX_V(idx), cdf_v, CFL_ALPHABET_SIZE); + } +} + +static AOM_INLINE void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, + aom_writer *w, int skip) { + if (cm->features.coded_lossless || cm->features.allow_intrabc) return; + + // At the start of a superblock, mark that we haven't yet written CDEF + // strengths for any of the CDEF units contained in this superblock. + const int sb_mask = (cm->seq_params.mib_size - 1); + const int mi_row_in_sb = (xd->mi_row & sb_mask); + const int mi_col_in_sb = (xd->mi_col & sb_mask); + if (mi_row_in_sb == 0 && mi_col_in_sb == 0) { + xd->cdef_transmitted[0] = xd->cdef_transmitted[1] = + xd->cdef_transmitted[2] = xd->cdef_transmitted[3] = false; + } + + // CDEF unit size is 64x64 irrespective of the superblock size. + const int cdef_size = 1 << (6 - MI_SIZE_LOG2); + + // Find index of this CDEF unit in this superblock. + const int index_mask = cdef_size; + const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0); + const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0); + const int index = (cm->seq_params.sb_size == BLOCK_128X128) + ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb + : 0; + + // Write CDEF strength to the first non-skip coding block in this CDEF unit. + if (!xd->cdef_transmitted[index] && !skip) { + // CDEF strength for this CDEF unit needs to be stored in the MB_MODE_INFO + // of the 1st block in this CDEF unit. 
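+    // Worked example of the masking below (assuming MI_SIZE_LOG2 == 2):
+    // cdef_size is 1 << 4 == 16 mi units (64 luma pixels), first_block_mask
+    // is ~15, and e.g. mi_row == 21 maps to 21 & ~15 == 16, the row of the
+    // first block in this CDEF unit.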
+ const int first_block_mask = ~(cdef_size - 1); + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int grid_idx = + get_mi_grid_idx(mi_params, xd->mi_row & first_block_mask, + xd->mi_col & first_block_mask); + const MB_MODE_INFO *const mbmi = mi_params->mi_grid_base[grid_idx]; + aom_write_literal(w, mbmi->cdef_strength, cm->cdef_info.cdef_bits); + xd->cdef_transmitted[index] = true; + } +} + +static AOM_INLINE void write_inter_segment_id( + AV1_COMP *cpi, aom_writer *w, const struct segmentation *const seg, + struct segmentation_probs *const segp, int skip, int preskip) { + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + AV1_COMMON *const cm = &cpi->common; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + if (seg->update_map) { + if (preskip) { + if (!seg->segid_preskip) return; + } else { + if (seg->segid_preskip) return; + if (skip) { + write_segment_id(cpi, mbmi, w, seg, segp, 1); + if (seg->temporal_update) mbmi->seg_id_predicted = 0; + return; + } + } + if (seg->temporal_update) { + const int pred_flag = mbmi->seg_id_predicted; + aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd); + aom_write_symbol(w, pred_flag, pred_cdf, 2); + if (!pred_flag) { + write_segment_id(cpi, mbmi, w, seg, segp, 0); + } + if (pred_flag) { + set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map, + mbmi->sb_type, mi_row, mi_col, mbmi->segment_id); + } + } else { + write_segment_id(cpi, mbmi, w, seg, segp, 0); + } + } +} + +// If delta q is present, writes delta_q index. +// Also writes delta_q loop filter levels, if present. +static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip, + aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + + if (delta_q_info->delta_q_present_flag) { + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int super_block_upper_left = + ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) && + ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0); + + if ((bsize != cm->seq_params.sb_size || skip == 0) && + super_block_upper_left) { + assert(mbmi->current_qindex > 0); + const int reduced_delta_qindex = + (mbmi->current_qindex - xd->current_qindex) / + delta_q_info->delta_q_res; + write_delta_qindex(xd, reduced_delta_qindex, w); + xd->current_qindex = mbmi->current_qindex; + if (delta_q_info->delta_lf_present_flag) { + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + int reduced_delta_lflevel = + (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / + delta_q_info->delta_lf_res; + write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w); + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; + } + } else { + int reduced_delta_lflevel = + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / + delta_q_info->delta_lf_res; + write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w); + xd->delta_lf_from_base = mbmi->delta_lf_from_base; + } + } + } + } +} + +static AOM_INLINE void write_intra_prediction_modes(AV1_COMP *cpi, + int is_keyframe, + aom_writer *w) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const PREDICTION_MODE mode = mbmi->mode; + const BLOCK_SIZE bsize = mbmi->sb_type; + + // Y mode. + if (is_keyframe) { + const MB_MODE_INFO *const above_mi = xd->above_mbmi; + const MB_MODE_INFO *const left_mi = xd->left_mbmi; + write_intra_y_mode_kf(ec_ctx, mbmi, above_mi, left_mi, mode, w); + } else { + write_intra_y_mode_nonkf(ec_ctx, bsize, mode, w); + } + + // Y angle delta. + const int use_angle_delta = av1_use_angle_delta(bsize); + if (use_angle_delta && av1_is_directional_mode(mode)) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y], + ec_ctx->angle_delta_cdf[mode - V_PRED]); + } + + // UV mode and UV angle delta. + if (!cm->seq_params.monochrome && xd->is_chroma_ref) { + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w); + if (uv_mode == UV_CFL_PRED) + write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w); + if (use_angle_delta && av1_is_directional_mode(get_uv_mode(uv_mode))) { + write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV], + ec_ctx->angle_delta_cdf[uv_mode - V_PRED]); + } + } + + // Palette. + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { + write_palette_mode_info(cm, xd, mbmi, w); + } + + // Filter intra. + write_filter_intra_mode_info(cm, xd, mbmi, w); +} + +static INLINE int16_t mode_context_analyzer( + const int16_t mode_context, const MV_REFERENCE_FRAME *const rf) { + if (rf[1] <= INTRA_FRAME) return mode_context; + + const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK; + const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + + const int16_t comp_ctx = compound_mode_ctx_map[refmv_ctx >> 1][AOMMIN( + newmv_ctx, COMP_NEWMV_CTXS - 1)]; + return comp_ctx; +} + +static INLINE int_mv get_ref_mv_from_stack( + int ref_idx, const MV_REFERENCE_FRAME *ref_frame, int ref_mv_idx, + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame) { + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack; + + if (ref_frame[1] > INTRA_FRAME) { + assert(ref_idx == 0 || ref_idx == 1); + return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv + : curr_ref_mv_stack[ref_mv_idx].this_mv; + } + + assert(ref_idx == 0); + return ref_mv_idx < mbmi_ext_frame->ref_mv_count + ? 
curr_ref_mv_stack[ref_mv_idx].this_mv + : mbmi_ext_frame->global_mvs[ref_frame_type]; +} + +static INLINE int_mv get_ref_mv(const MACROBLOCK *x, int ref_idx) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; + } + return get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx, + x->mbmi_ext_frame); +} + +static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = x->mbmi_ext_frame; + const PREDICTION_MODE mode = mbmi->mode; + const int segment_id = mbmi->segment_id; + const BLOCK_SIZE bsize = mbmi->sb_type; + const int allow_hp = cm->features.allow_high_precision_mv; + const int is_inter = is_inter_block(mbmi); + const int is_compound = has_second_ref(mbmi); + int ref; + + write_inter_segment_id(cpi, w, seg, segp, 0, 1); + + write_skip_mode(cm, xd, segment_id, mbmi, w); + + assert(IMPLIES(mbmi->skip_mode, mbmi->skip)); + const int skip = + mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w); + + write_inter_segment_id(cpi, w, seg, segp, skip, 0); + + write_cdef(cm, xd, w, skip); + + write_delta_q_params(cpi, skip, w); + + if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter); + + if (mbmi->skip_mode) return; + + if (!is_inter) { + write_intra_prediction_modes(cpi, 0, w); + } else { + int16_t mode_ctx; + + av1_collect_neighbors_ref_counts(xd); + + write_ref_frames(cm, xd, w); + + mode_ctx = + mode_context_analyzer(mbmi_ext_frame->mode_context, mbmi->ref_frame); + + // If segment skip is not enabled code the mode. 
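+    // For intuition: compound modes such as NEW_NEWMV take the
+    // write_inter_compound_mode path below, single-reference modes such as
+    // NEWMV take write_inter_mode, and a DRL index is only coded for modes
+    // that can choose among multiple reference MV candidates.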
+ if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { + if (is_inter_compound_mode(mode)) + write_inter_compound_mode(xd, w, mode, mode_ctx); + else if (is_inter_singleref_mode(mode)) + write_inter_mode(w, mode, ec_ctx, mode_ctx); + + if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode)) + write_drl_idx(ec_ctx, mbmi, mbmi_ext_frame, w); + else + assert(mbmi->ref_mv_idx == 0); + } + + if (mode == NEWMV || mode == NEW_NEWMV) { + for (ref = 0; ref < 1 + is_compound; ++ref) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = get_ref_mv(x, ref); + av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc, + allow_hp); + } + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = get_ref_mv(x, 1); + av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp); + } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { + nmv_context *nmvc = &ec_ctx->nmvc; + const int_mv ref_mv = get_ref_mv(x, 0); + av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp); + } + + if (cpi->common.current_frame.reference_mode != COMPOUND_REFERENCE && + cpi->common.seq_params.enable_interintra_compound && + is_interintra_allowed(mbmi)) { + const int interintra = mbmi->ref_frame[1] == INTRA_FRAME; + const int bsize_group = size_group_lookup[bsize]; + aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2); + if (interintra) { + aom_write_symbol(w, mbmi->interintra_mode, + ec_ctx->interintra_mode_cdf[bsize_group], + INTERINTRA_MODES); + if (av1_is_wedge_used(bsize)) { + aom_write_symbol(w, mbmi->use_wedge_interintra, + ec_ctx->wedge_interintra_cdf[bsize], 2); + if (mbmi->use_wedge_interintra) { + aom_write_symbol(w, mbmi->interintra_wedge_index, + ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES); + } + } + } + } + + if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w); + + // First write idx to indicate current compound inter prediction mode group + // Group A (0): dist_wtd_comp, compound_average + // Group B (1): interintra, compound_diffwtd, wedge + if (has_second_ref(mbmi)) { + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + + if (masked_compound_used) { + const int ctx_comp_group_idx = get_comp_group_idx_context(xd); + aom_write_symbol(w, mbmi->comp_group_idx, + ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2); + } else { + assert(mbmi->comp_group_idx == 0); + } + + if (mbmi->comp_group_idx == 0) { + if (mbmi->compound_idx) + assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE); + + if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) { + const int comp_index_ctx = get_comp_index_context(cm, xd); + aom_write_symbol(w, mbmi->compound_idx, + ec_ctx->compound_index_cdf[comp_index_ctx], 2); + } else { + assert(mbmi->compound_idx == 1); + } + } else { + assert(cpi->common.current_frame.reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + assert(masked_compound_used); + // compound_diffwtd, wedge + assert(mbmi->interinter_comp.type == COMPOUND_WEDGE || + mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) + aom_write_symbol(w, mbmi->interinter_comp.type - COMPOUND_WEDGE, + ec_ctx->compound_type_cdf[bsize], + MASKED_COMPOUND_TYPES); + + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + 
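+          // Illustrative note: the wedge index selects one of
+          // MAX_WEDGE_TYPES (16) predefined oblique partitions of the
+          // block, and the sign bit written after it swaps which predictor
+          // covers which side of the wedge.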
aom_write_symbol(w, mbmi->interinter_comp.wedge_index, + ec_ctx->wedge_idx_cdf[bsize], MAX_WEDGE_TYPES); + aom_write_bit(w, mbmi->interinter_comp.wedge_sign); + } else { + assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD); + aom_write_literal(w, mbmi->interinter_comp.mask_type, + MAX_DIFFWTD_MASK_BITS); + } + } + } + write_mb_interp_filter(cm, xd, w); + } +} + +static AOM_INLINE void write_intrabc_info( + MACROBLOCKD *xd, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, + aom_writer *w) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + int use_intrabc = is_intrabc_block(mbmi); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, use_intrabc, ec_ctx->intrabc_cdf, 2); + if (use_intrabc) { + assert(mbmi->mode == DC_PRED); + assert(mbmi->uv_mode == UV_DC_PRED); + assert(mbmi->motion_mode == SIMPLE_TRANSLATION); + int_mv dv_ref = mbmi_ext_frame->ref_mv_stack[0].this_mv; + av1_encode_dv(w, &mbmi->mv[0].as_mv, &dv_ref.as_mv, &ec_ctx->ndvc); + } +} + +static AOM_INLINE void write_mb_modes_kf( + AV1_COMP *cpi, MACROBLOCKD *xd, + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + const struct segmentation *const seg = &cm->seg; + struct segmentation_probs *const segp = &ec_ctx->seg; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + + if (seg->segid_preskip && seg->update_map) + write_segment_id(cpi, mbmi, w, seg, segp, 0); + + const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w); + + if (!seg->segid_preskip && seg->update_map) + write_segment_id(cpi, mbmi, w, seg, segp, skip); + + write_cdef(cm, xd, w, skip); + + write_delta_q_params(cpi, skip, w); + + if (av1_allow_intrabc(cm)) { + write_intrabc_info(xd, mbmi_ext_frame, w); + if (is_intrabc_block(mbmi)) return; + } + + write_intra_prediction_modes(cpi, 1, w); +} + +#if CONFIG_RD_DEBUG +static AOM_INLINE void dump_mode_info(MB_MODE_INFO *mi) { + printf("\nmi->mi_row == %d\n", mi->mi_row); + printf("&& mi->mi_col == %d\n", mi->mi_col); + printf("&& mi->sb_type == %d\n", mi->sb_type); + printf("&& mi->tx_size == %d\n", mi->tx_size); + printf("&& mi->mode == %d\n", mi->mode); +} + +static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, + int plane) { + if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) { + int r, c; + printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n", + plane, rd_stats->txb_coeff_cost[plane], token_stats->cost); + printf("rd txb_coeff_cost_map\n"); + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { + printf("%d ", rd_stats->txb_coeff_cost_map[plane][r][c]); + } + printf("\n"); + } + + printf("pack txb_coeff_cost_map\n"); + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { + printf("%d ", token_stats->txb_coeff_cost_map[r][c]); + } + printf("\n"); + } + return 1; + } + return 0; +} +#endif + +#if ENC_MISMATCH_DEBUG +static AOM_INLINE void enc_dump_logs( + const AV1_COMMON *const cm, + const MBMIExtFrameBufferInfo *const mbmi_ext_info, int mi_row, int mi_col) { + const MB_MODE_INFO *const mbmi = *( + cm->mi_params.mi_grid_base + (mi_row * cm->mi_params.mi_stride + mi_col)); + const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_frame = + mbmi_ext_info->frame_base + get_mi_ext_idx(mi_row, mi_col, + cm->mi_params.mi_alloc_bsize, + mbmi_ext_info->stride); + if (is_inter_block(mbmi)) { +#define FRAME_TO_CHECK 11 + if (cm->current_frame.frame_number == FRAME_TO_CHECK && + cm->show_frame == 1) 
{ + const BLOCK_SIZE bsize = mbmi->sb_type; + + int_mv mv[2] = { 0 }; + const int is_comp_ref = has_second_ref(mbmi); + + for (int ref = 0; ref < 1 + is_comp_ref; ++ref) + mv[ref].as_mv = mbmi->mv[ref].as_mv; + + if (!is_comp_ref) { + mv[1].as_int = 0; + } + + const int16_t mode_ctx = + is_comp_ref ? 0 + : mode_context_analyzer(mbmi_ext_frame->mode_context, + mbmi->ref_frame); + + const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK; + int16_t zeromv_ctx = -1; + int16_t refmv_ctx = -1; + + if (mbmi->mode != NEWMV) { + zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mbmi->mode != GLOBALMV) + refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK; + } + + printf( + "=== ENCODER ===: " + "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, " + "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, " + "ref[1]=%d, motion_mode=%d, mode_ctx=%d, " + "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n", + cm->current_frame.frame_number, mi_row, mi_col, mbmi->skip_mode, + mbmi->mode, bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col, + mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0], + mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, + zeromv_ctx, refmv_ctx, mbmi->tx_size); + } + } +} +#endif // ENC_MISMATCH_DEBUG + +static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, aom_writer *w) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + MB_MODE_INFO *m = xd->mi[0]; + + if (frame_is_intra_only(cm)) { + write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext_frame, w); + } else { + // has_subpel_mv_component needs the ref frame buffers set up to look + // up if they are scaled. has_subpel_mv_component is in turn needed by + // write_switchable_interp_filter, which is called by pack_inter_mode_mvs. 
+ set_ref_ptrs(cm, xd, m->ref_frame[0], m->ref_frame[1]); + +#if ENC_MISMATCH_DEBUG + enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col); +#endif // ENC_MISMATCH_DEBUG + + pack_inter_mode_mvs(cpi, w); + } +} + +static AOM_INLINE void write_inter_txb_coeff( + AV1_COMMON *const cm, MACROBLOCK *const x, MB_MODE_INFO *const mbmi, + aom_writer *w, const TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + TOKEN_STATS *token_stats, const int row, const int col, int *block, + const int plane) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize = mbmi->sb_type; + assert(bsize < BLOCK_SIZES_ALL); + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + const int bkw = tx_size_wide_unit[max_tx_size]; + const int bkh = tx_size_high_unit[max_tx_size]; + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, ss_x, ss_y); + const int num_4x4_w = mi_size_wide[plane_bsize]; + const int num_4x4_h = mi_size_high[plane_bsize]; + const int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + const int mu_blocks_high = mi_size_high[max_unit_bsize]; + const int unit_height = AOMMIN(mu_blocks_high + (row >> ss_y), num_4x4_h); + const int unit_width = AOMMIN(mu_blocks_wide + (col >> ss_x), num_4x4_w); + for (int blk_row = row >> ss_y; blk_row < unit_height; blk_row += bkh) { + for (int blk_col = col >> ss_x; blk_col < unit_width; blk_col += bkw) { + pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize, + cm->seq_params.bit_depth, *block, blk_row, blk_col, + max_tx_size, token_stats); + *block += step; + } + } +} + +static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, aom_writer *w, + const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + + assert(!mbmi->skip); + + const int is_inter = is_inter_block(mbmi); + if (!is_inter) { + av1_write_coeffs_mb(cm, x, w, bsize); + } else { + int block[MAX_MB_PLANE] = { 0 }; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int num_4x4_w = mi_size_wide[bsize]; + const int num_4x4_h = mi_size_high[bsize]; + TOKEN_STATS token_stats; + init_token_stats(&token_stats); + + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + assert(max_unit_bsize == get_plane_block_size(BLOCK_64X64, + xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(num_4x4_w, mu_blocks_wide); + mu_blocks_high = AOMMIN(num_4x4_h, mu_blocks_high); + + const int num_planes = av1_num_planes(cm); + for (int row = 0; row < num_4x4_h; row += mu_blocks_high) { + for (int col = 0; col < num_4x4_w; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats, row, + col, &block[plane], plane); + } + } + } +#if CONFIG_RD_DEBUG + for (int plane = 0; plane < num_planes; ++plane) { + if (mbmi->sb_type 
>= BLOCK_8X8 && + rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) { + dump_mode_info(mbmi); + assert(0); + } + } +#endif // CONFIG_RD_DEBUG + } +} + +static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile, + aom_writer *w, const TOKENEXTRA **tok, + const TOKENEXTRA *const tok_end, + int mi_row, int mi_col) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + const int grid_idx = mi_row * mi_params->mi_stride + mi_col; + xd->mi = mi_params->mi_grid_base + grid_idx; + cpi->td.mb.mbmi_ext_frame = + cpi->mbmi_ext_info.frame_base + + get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, + cpi->mbmi_ext_info.stride); + xd->tx_type_map = mi_params->tx_type_map + grid_idx; + xd->tx_type_map_stride = mi_params->mi_stride; + + const MB_MODE_INFO *mbmi = xd->mi[0]; + const BLOCK_SIZE bsize = mbmi->sb_type; + assert(bsize <= cm->seq_params.sb_size || + (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL)); + + const int bh = mi_size_high[bsize]; + const int bw = mi_size_wide[bsize]; + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows, + mi_params->mi_cols); + + xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + write_mbmi_b(cpi, w); + + for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) { + const uint8_t palette_size_plane = + mbmi->palette_mode_info.palette_size[plane]; + assert(!mbmi->skip_mode || !palette_size_plane); + if (palette_size_plane > 0) { + assert(mbmi->use_intrabc == 0); + assert(av1_allow_palette(cm->features.allow_screen_content_tools, + mbmi->sb_type)); + assert(!plane || xd->is_chroma_ref); + int rows, cols; + av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows, + &cols); + assert(*tok < tok_end); + pack_map_tokens(w, tok, palette_size_plane, rows * cols); + } + } + + const int is_inter_tx = is_inter_block(mbmi); + const int skip = mbmi->skip; + const int segment_id = mbmi->segment_id; + if (cm->features.tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) && + !(is_inter_tx && skip) && !xd->lossless[segment_id]) { + if (is_inter_tx) { // This implies skip flag is 0. 
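+      // For intuition (hypothetical shape): if the maximum transform unit
+      // covered a quarter of the block, the double loop below would visit
+      // four units, and write_tx_size_vartx would recursively signal any
+      // further splits inside each unit.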
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0); + const int txbh = tx_size_high_unit[max_tx_size]; + const int txbw = tx_size_wide_unit[max_tx_size]; + const int width = mi_size_wide[bsize]; + const int height = mi_size_high[bsize]; + for (int idy = 0; idy < height; idy += txbh) { + for (int idx = 0; idx < width; idx += txbw) { + write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w); + } + } + } else { + write_selected_tx_size(xd, w); + set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, 0, xd); + } + } else { + set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height, skip && is_inter_tx, + xd); + } + + if (!mbmi->skip) { + write_tokens_b(cpi, w, tok, tok_end); + } +} + +static AOM_INLINE void write_partition(const AV1_COMMON *const cm, + const MACROBLOCKD *const xd, int hbs, + int mi_row, int mi_col, PARTITION_TYPE p, + BLOCK_SIZE bsize, aom_writer *w) { + const int is_partition_point = bsize >= BLOCK_8X8; + + if (!is_partition_point) return; + + const int has_rows = (mi_row + hbs) < cm->mi_params.mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_params.mi_cols; + const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + + if (!has_rows && !has_cols) { + assert(p == PARTITION_SPLIT); + return; + } + + if (has_rows && has_cols) { + aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx], + partition_cdf_length(bsize)); + } else if (!has_rows && has_cols) { + assert(p == PARTITION_SPLIT || p == PARTITION_HORZ); + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); + aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); + } else { + assert(has_rows && !has_cols); + assert(p == PARTITION_SPLIT || p == PARTITION_VERT); + assert(bsize > BLOCK_8X8); + aom_cdf_prob cdf[2]; + partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx], bsize); + aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2); + } +} + +static AOM_INLINE void write_modes_sb( + AV1_COMP *const cpi, const TileInfo *const tile, aom_writer *const w, + const TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + assert(bsize < BLOCK_SIZES_ALL); + const int hbs = mi_size_wide[bsize] / 2; + const int quarter_step = mi_size_wide[bsize] / 4; + int i; + const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; + + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + int rcol0, rcol1, rrow0, rrow1; + if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize, + &rcol0, &rcol1, &rrow0, &rrow1)) { + const int rstride = cm->rst_info[plane].horz_units_per_tile; + for (int rrow = rrow0; rrow < rrow1; ++rrow) { + for (int rcol = rcol0; rcol < rcol1; ++rcol) { + const int runit_idx = rcol + rrow * rstride; + const RestorationUnitInfo *rui = + &cm->rst_info[plane].unit_info[runit_idx]; + loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, + cpi->td.counts); + } + } + } + } + + write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w); + switch (partition) { + case PARTITION_NONE: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + break; + case PARTITION_HORZ: + write_modes_b(cpi, 
tile, w, tok, tok_end, mi_row, mi_col); + if (mi_row + hbs < mi_params->mi_rows) + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + if (mi_col + hbs < mi_params->mi_cols) + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_SPLIT: + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize); + write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs, + subsize); + break; + case PARTITION_HORZ_A: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + break; + case PARTITION_VERT_B: + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs); + write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_HORZ_4: + for (i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= mi_params->mi_rows) break; + + write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col); + } + break; + case PARTITION_VERT_4: + for (i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= mi_params->mi_cols) break; + + write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col); + } + break; + default: assert(0); + } + + // update partition context + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +} + +static AOM_INLINE void write_modes(AV1_COMP *const cpi, + const TileInfo *const tile, + aom_writer *const w, int tile_row, + int tile_col) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + const int mi_row_start = tile->mi_row_start; + const int mi_row_end = tile->mi_row_end; + const int mi_col_start = tile->mi_col_start; + const int mi_col_end = tile->mi_col_end; + const int num_planes = av1_num_planes(cm); + + av1_zero_above_context(cm, xd, mi_col_start, mi_col_end, tile->tile_row); + av1_init_above_context(&cm->above_contexts, num_planes, tile->tile_row, xd); + + if (cpi->common.delta_q_info.delta_q_present_flag) { + xd->current_qindex = cpi->common.quant_params.base_qindex; + if (cpi->common.delta_q_info.delta_lf_present_flag) { + av1_reset_loop_filter_delta(xd, num_planes); + } + } + + for (int mi_row = mi_row_start; mi_row < mi_row_end; + mi_row += cm->seq_params.mib_size) { + const int sb_row_in_tile = + (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2; + const TOKENEXTRA *tok = + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start; + const TOKENEXTRA *tok_end = + tok + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count; + + av1_zero_left_context(xd); + + for (int mi_col = mi_col_start; mi_col < mi_col_end; + mi_col += 
cm->seq_params.mib_size) { + cpi->td.mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); + write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col, + cm->seq_params.sb_size); + } + assert(tok == cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop); + } +} + +static AOM_INLINE void encode_restoration_mode( + AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { + assert(!cm->features.all_lossless); + if (!cm->seq_params.enable_restoration) return; + if (cm->features.allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + int all_none = 1, chroma_none = 1; + for (int p = 0; p < num_planes; ++p) { + RestorationInfo *rsi = &cm->rst_info[p]; + if (rsi->frame_restoration_type != RESTORE_NONE) { + all_none = 0; + chroma_none &= p == 0; + } + switch (rsi->frame_restoration_type) { + case RESTORE_NONE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 0); + break; + case RESTORE_WIENER: + aom_wb_write_bit(wb, 1); + aom_wb_write_bit(wb, 0); + break; + case RESTORE_SGRPROJ: + aom_wb_write_bit(wb, 1); + aom_wb_write_bit(wb, 1); + break; + case RESTORE_SWITCHABLE: + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, 1); + break; + default: assert(0); + } + } + if (!all_none) { + assert(cm->seq_params.sb_size == BLOCK_64X64 || + cm->seq_params.sb_size == BLOCK_128X128); + const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64; + + RestorationInfo *rsi = &cm->rst_info[0]; + + assert(rsi->restoration_unit_size >= sb_size); + assert(RESTORATION_UNITSIZE_MAX == 256); + + if (sb_size == 64) { + aom_wb_write_bit(wb, rsi->restoration_unit_size > 64); + } + if (rsi->restoration_unit_size > 64) { + aom_wb_write_bit(wb, rsi->restoration_unit_size > 128); + } + } + + if (num_planes > 1) { + int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y); + if (s && !chroma_none) { + aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size != + cm->rst_info[0].restoration_unit_size); + assert(cm->rst_info[1].restoration_unit_size == + cm->rst_info[0].restoration_unit_size || + cm->rst_info[1].restoration_unit_size == + (cm->rst_info[0].restoration_unit_size >> s)); + assert(cm->rst_info[2].restoration_unit_size == + cm->rst_info[1].restoration_unit_size); + } else if (!s) { + assert(cm->rst_info[1].restoration_unit_size == + cm->rst_info[0].restoration_unit_size); + assert(cm->rst_info[2].restoration_unit_size == + cm->rst_info[1].restoration_unit_size); + } + } +} + +static AOM_INLINE void write_wiener_filter(int wiener_win, + const WienerInfo *wiener_info, + WienerInfo *ref_wiener_info, + aom_writer *wb) { + if (wiener_win == WIENER_WIN) + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV); + else + assert(wiener_info->vfilter[0] == 0 && + wiener_info->vfilter[WIENER_WIN - 1] == 0); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV); + if (wiener_win == WIENER_WIN) + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + 
ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV); + else + assert(wiener_info->hfilter[0] == 0 && + wiener_info->hfilter[WIENER_WIN - 1] == 0); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV); + aom_write_primitive_refsubexpfin( + wb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV); + memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info)); +} + +static AOM_INLINE void write_sgrproj_filter(const SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info, + aom_writer *wb) { + aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS); + const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; + + if (params->r[0] == 0) { + assert(sgrproj_info->xqd[0] == 0); + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + } else if (params->r[1] == 0) { + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + } else { + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + aom_write_primitive_refsubexpfin( + wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + } + + memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info)); +} + +static AOM_INLINE void loop_restoration_write_sb_coeffs( + const AV1_COMMON *const cm, MACROBLOCKD *xd, const RestorationUnitInfo *rui, + aom_writer *const w, int plane, FRAME_COUNTS *counts) { + const RestorationInfo *rsi = cm->rst_info + plane; + RestorationType frame_rtype = rsi->frame_restoration_type; + if (frame_rtype == RESTORE_NONE) return; + + (void)counts; + assert(!cm->features.all_lossless); + + const int wiener_win = (plane > 0) ? 
WIENER_WIN_CHROMA : WIENER_WIN; + WienerInfo *ref_wiener_info = &xd->wiener_info[plane]; + SgrprojInfo *ref_sgrproj_info = &xd->sgrproj_info[plane]; + RestorationType unit_rtype = rui->restoration_type; + + if (frame_rtype == RESTORE_SWITCHABLE) { + aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES); +#if CONFIG_ENTROPY_STATS + ++counts->switchable_restore[unit_rtype]; +#endif + switch (unit_rtype) { + case RESTORE_WIENER: + write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w); + break; + case RESTORE_SGRPROJ: + write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w); + break; + default: assert(unit_rtype == RESTORE_NONE); break; + } + } else if (frame_rtype == RESTORE_WIENER) { + aom_write_symbol(w, unit_rtype != RESTORE_NONE, + xd->tile_ctx->wiener_restore_cdf, 2); +#if CONFIG_ENTROPY_STATS + ++counts->wiener_restore[unit_rtype != RESTORE_NONE]; +#endif + if (unit_rtype != RESTORE_NONE) { + write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w); + } + } else if (frame_rtype == RESTORE_SGRPROJ) { + aom_write_symbol(w, unit_rtype != RESTORE_NONE, + xd->tile_ctx->sgrproj_restore_cdf, 2); +#if CONFIG_ENTROPY_STATS + ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE]; +#endif + if (unit_rtype != RESTORE_NONE) { + write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w); + } + } +} + +// Only write out the ref delta section if any of the elements +// will signal a delta. +static bool is_mode_ref_delta_meaningful(AV1_COMMON *cm) { + struct loopfilter *lf = &cm->lf; + if (!lf->mode_ref_delta_update) { + return 0; + } + const RefCntBuffer *buf = get_primary_ref_frame_buf(cm); + int8_t last_ref_deltas[REF_FRAMES]; + int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; + if (buf == NULL) { + av1_set_default_ref_deltas(last_ref_deltas); + av1_set_default_mode_deltas(last_mode_deltas); + } else { + memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES); + memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS); + } + for (int i = 0; i < REF_FRAMES; i++) { + if (lf->ref_deltas[i] != last_ref_deltas[i]) { + return true; + } + } + for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) { + if (lf->mode_deltas[i] != last_mode_deltas[i]) { + return true; + } + } + return false; +} + +static AOM_INLINE void encode_loopfilter(AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + assert(!cm->features.coded_lossless); + if (cm->features.allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + struct loopfilter *lf = &cm->lf; + + // Encode the loop filter level and type + aom_wb_write_literal(wb, lf->filter_level[0], 6); + aom_wb_write_literal(wb, lf->filter_level[1], 6); + if (num_planes > 1) { + if (lf->filter_level[0] || lf->filter_level[1]) { + aom_wb_write_literal(wb, lf->filter_level_u, 6); + aom_wb_write_literal(wb, lf->filter_level_v, 6); + } + } + aom_wb_write_literal(wb, lf->sharpness_level, 3); + + aom_wb_write_bit(wb, lf->mode_ref_delta_enabled); + + // Write out loop filter deltas applied at the MB level based on mode or + // ref frame (if they are enabled), only if there is information to write. 
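+  // Hypothetical example: if only lf->ref_deltas[LAST_FRAME] differs from
+  // the inherited value, the loops below write one "changed" bit plus a
+  // 6-bit inverse-signed literal for that entry, and a single zero bit for
+  // every unchanged ref/mode delta.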
+ int meaningful = is_mode_ref_delta_meaningful(cm); + aom_wb_write_bit(wb, meaningful); + if (!meaningful) { + return; + } + + const RefCntBuffer *buf = get_primary_ref_frame_buf(cm); + int8_t last_ref_deltas[REF_FRAMES]; + int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; + if (buf == NULL) { + av1_set_default_ref_deltas(last_ref_deltas); + av1_set_default_mode_deltas(last_mode_deltas); + } else { + memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES); + memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS); + } + for (int i = 0; i < REF_FRAMES; i++) { + const int delta = lf->ref_deltas[i]; + const int changed = delta != last_ref_deltas[i]; + aom_wb_write_bit(wb, changed); + if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); + } + for (int i = 0; i < MAX_MODE_LF_DELTAS; i++) { + const int delta = lf->mode_deltas[i]; + const int changed = delta != last_mode_deltas[i]; + aom_wb_write_bit(wb, changed); + if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6); + } +} + +static AOM_INLINE void encode_cdef(const AV1_COMMON *cm, + struct aom_write_bit_buffer *wb) { + assert(!cm->features.coded_lossless); + if (!cm->seq_params.enable_cdef) return; + if (cm->features.allow_intrabc) return; + const int num_planes = av1_num_planes(cm); + int i; + aom_wb_write_literal(wb, cm->cdef_info.cdef_damping - 3, 2); + aom_wb_write_literal(wb, cm->cdef_info.cdef_bits, 2); + for (i = 0; i < cm->cdef_info.nb_cdef_strengths; i++) { + aom_wb_write_literal(wb, cm->cdef_info.cdef_strengths[i], + CDEF_STRENGTH_BITS); + if (num_planes > 1) + aom_wb_write_literal(wb, cm->cdef_info.cdef_uv_strengths[i], + CDEF_STRENGTH_BITS); + } +} + +static AOM_INLINE void write_delta_q(struct aom_write_bit_buffer *wb, + int delta_q) { + if (delta_q != 0) { + aom_wb_write_bit(wb, 1); + aom_wb_write_inv_signed_literal(wb, delta_q, 6); + } else { + aom_wb_write_bit(wb, 0); + } +} + +static AOM_INLINE void encode_quantization( + const CommonQuantParams *const quant_params, int num_planes, + bool separate_uv_delta_q, struct aom_write_bit_buffer *wb) { + aom_wb_write_literal(wb, quant_params->base_qindex, QINDEX_BITS); + write_delta_q(wb, quant_params->y_dc_delta_q); + if (num_planes > 1) { + int diff_uv_delta = + (quant_params->u_dc_delta_q != quant_params->v_dc_delta_q) || + (quant_params->u_ac_delta_q != quant_params->v_ac_delta_q); + if (separate_uv_delta_q) aom_wb_write_bit(wb, diff_uv_delta); + write_delta_q(wb, quant_params->u_dc_delta_q); + write_delta_q(wb, quant_params->u_ac_delta_q); + if (diff_uv_delta) { + write_delta_q(wb, quant_params->v_dc_delta_q); + write_delta_q(wb, quant_params->v_ac_delta_q); + } + } + aom_wb_write_bit(wb, quant_params->using_qmatrix); + if (quant_params->using_qmatrix) { + aom_wb_write_literal(wb, quant_params->qmatrix_level_y, QM_LEVEL_BITS); + aom_wb_write_literal(wb, quant_params->qmatrix_level_u, QM_LEVEL_BITS); + if (!separate_uv_delta_q) + assert(quant_params->qmatrix_level_u == quant_params->qmatrix_level_v); + else + aom_wb_write_literal(wb, quant_params->qmatrix_level_v, QM_LEVEL_BITS); + } +} + +static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, + struct aom_write_bit_buffer *wb) { + int i, j; + struct segmentation *seg = &cm->seg; + + aom_wb_write_bit(wb, seg->enabled); + if (!seg->enabled) return; + + // Write update flags + if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) { + assert(seg->update_map == 1); + seg->temporal_update = 0; + assert(seg->update_data == 1); + } else { + aom_wb_write_bit(wb, seg->update_map); + if (seg->update_map) 
{
+      // Select the coding strategy (temporal or spatial)
+      av1_choose_segmap_coding_method(cm, xd);
+      aom_wb_write_bit(wb, seg->temporal_update);
+    }
+    aom_wb_write_bit(wb, seg->update_data);
+  }
+
+  // Segmentation data
+  if (seg->update_data) {
+    for (i = 0; i < MAX_SEGMENTS; i++) {
+      for (j = 0; j < SEG_LVL_MAX; j++) {
+        const int active = segfeature_active(seg, i, j);
+        aom_wb_write_bit(wb, active);
+        if (active) {
+          const int data_max = av1_seg_feature_data_max(j);
+          const int data_min = -data_max;
+          const int ubits = get_unsigned_bits(data_max);
+          const int data = clamp(get_segdata(seg, i, j), data_min, data_max);
+
+          if (av1_is_segfeature_signed(j)) {
+            aom_wb_write_inv_signed_literal(wb, data, ubits);
+          } else {
+            aom_wb_write_literal(wb, data, ubits);
+          }
+        }
+      }
+    }
+  }
+}
+
+static AOM_INLINE void write_frame_interp_filter(
+    InterpFilter filter, struct aom_write_bit_buffer *wb) {
+  aom_wb_write_bit(wb, filter == SWITCHABLE);
+  if (filter != SWITCHABLE)
+    aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS);
+}
+
+// Same function as write_uniform but writing to the uncompressed header wb
+static AOM_INLINE void wb_write_uniform(struct aom_write_bit_buffer *wb, int n,
+                                        int v) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (l == 0) return;
+  if (v < m) {
+    aom_wb_write_literal(wb, v, l - 1);
+  } else {
+    aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1);
+    aom_wb_write_literal(wb, (v - m) & 1, 1);
+  }
+}
+
+static AOM_INLINE void write_tile_info_max_tile(
+    const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+  int width_mi =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params.mib_size_log2);
+  int height_mi =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2);
+  int width_sb = width_mi >> cm->seq_params.mib_size_log2;
+  int height_sb = height_mi >> cm->seq_params.mib_size_log2;
+  int size_sb, i;
+  const CommonTileParams *const tiles = &cm->tiles;
+
+  aom_wb_write_bit(wb, tiles->uniform_spacing);
+
+  if (tiles->uniform_spacing) {
+    int ones = tiles->log2_cols - tiles->min_log2_cols;
+    while (ones--) {
+      aom_wb_write_bit(wb, 1);
+    }
+    if (tiles->log2_cols < tiles->max_log2_cols) {
+      aom_wb_write_bit(wb, 0);
+    }
+
+    // rows
+    ones = tiles->log2_rows - tiles->min_log2_rows;
+    while (ones--) {
+      aom_wb_write_bit(wb, 1);
+    }
+    if (tiles->log2_rows < tiles->max_log2_rows) {
+      aom_wb_write_bit(wb, 0);
+    }
+  } else {
+    // Explicit tiles with configurable tile widths and heights
+    // columns
+    for (i = 0; i < tiles->cols; i++) {
+      size_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
+      wb_write_uniform(wb, AOMMIN(width_sb, tiles->max_width_sb), size_sb - 1);
+      width_sb -= size_sb;
+    }
+    assert(width_sb == 0);
+
+    // rows
+    for (i = 0; i < tiles->rows; i++) {
+      size_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i];
+      wb_write_uniform(wb, AOMMIN(height_sb, tiles->max_height_sb),
+                       size_sb - 1);
+      height_sb -= size_sb;
+    }
+    assert(height_sb == 0);
+  }
+}
+
+static AOM_INLINE void write_tile_info(const AV1_COMMON *const cm,
+                                       struct aom_write_bit_buffer *saved_wb,
+                                       struct aom_write_bit_buffer *wb) {
+  write_tile_info_max_tile(cm, wb);
+
+  *saved_wb = *wb;
+  if (cm->tiles.rows * cm->tiles.cols > 1) {
+    // tile id used for cdf update
+    aom_wb_write_literal(wb, 0, cm->tiles.log2_cols + cm->tiles.log2_rows);
+    // Number of bytes in tile size - 1
+    aom_wb_write_literal(wb, 3, 2);
+  }
+}
+
+static AOM_INLINE void write_ext_tile_info(
+    const AV1_COMMON *const cm, struct aom_write_bit_buffer *saved_wb,
+    struct aom_write_bit_buffer *wb) {
+  // This information is stored as a separate byte.
+  int mod = wb->bit_offset % CHAR_BIT;
+  if (mod > 0) aom_wb_write_literal(wb, 0, CHAR_BIT - mod);
+  assert(aom_wb_is_byte_aligned(wb));
+
+  *saved_wb = *wb;
+  if (cm->tiles.rows * cm->tiles.cols > 1) {
+    // Note that the last item in the uncompressed header is the data
+    // describing tile configuration.
+    // Number of bytes in tile column size - 1
+    aom_wb_write_literal(wb, 0, 2);
+    // Number of bytes in tile size - 1
+    aom_wb_write_literal(wb, 0, 2);
+  }
+}
+
+// Stores the location and size of a tile's data in the bitstream. Used for
+// later identifying identical tiles
+typedef struct TileBufferEnc {
+  uint8_t *data;
+  size_t size;
+} TileBufferEnc;
+
+static INLINE int find_identical_tile(
+    const int tile_row, const int tile_col,
+    TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) {
+  const MV32 candidate_offset[1] = { { 1, 0 } };
+  const uint8_t *const cur_tile_data =
+      tile_buffers[tile_row][tile_col].data + 4;
+  const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size;
+
+  int i;
+
+  if (tile_row == 0) return 0;
+
+  // TODO(yunqingwang): For now, only the above tile is checked and used.
+  // More candidates, such as the left tile, can be added later.
+  for (i = 0; i < 1; i++) {
+    int row_offset = candidate_offset[0].row;
+    int col_offset = candidate_offset[0].col;
+    int row = tile_row - row_offset;
+    int col = tile_col - col_offset;
+    const uint8_t *tile_data;
+    TileBufferEnc *candidate;
+
+    if (row < 0 || col < 0) continue;
+
+    const uint32_t tile_hdr = mem_get_le32(tile_buffers[row][col].data);
+
+    // Read out tile-copy-mode bit:
+    if ((tile_hdr >> 31) == 1) {
+      // The candidate is a copy tile itself: the offset is stored in bits
+      // 30 through 24 inclusive.
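+      // Hypothetical example: tile_hdr == 0x83000040 has bit 31 set, so the
+      // candidate is itself a copy tile, and (tile_hdr >> 24) & 0x7f == 3
+      // redirects the search three tile rows further up.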
+      row_offset += (tile_hdr >> 24) & 0x7f;
+      row = tile_row - row_offset;
+    }
+
+    candidate = &tile_buffers[row][col];
+
+    if (row_offset >= 128 || candidate->size != cur_tile_size) continue;
+
+    tile_data = candidate->data + 4;
+
+    if (memcmp(tile_data, cur_tile_data, cur_tile_size) != 0) continue;
+
+    // Identical tile found
+    assert(row_offset > 0);
+    return row_offset;
+  }
+
+  // No identical tile found
+  return 0;
+}
+
+static AOM_INLINE void write_render_size(const AV1_COMMON *cm,
+                                         struct aom_write_bit_buffer *wb) {
+  const int scaling_active = av1_resize_scaled(cm);
+  aom_wb_write_bit(wb, scaling_active);
+  if (scaling_active) {
+    aom_wb_write_literal(wb, cm->render_width - 1, 16);
+    aom_wb_write_literal(wb, cm->render_height - 1, 16);
+  }
+}
+
+static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm,
+                                            struct aom_write_bit_buffer *wb) {
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  if (!seq_params->enable_superres) {
+    assert(cm->superres_scale_denominator == SCALE_NUMERATOR);
+    return;
+  }
+
+  // First bit is whether to scale or not
+  if (cm->superres_scale_denominator == SCALE_NUMERATOR) {
+    aom_wb_write_bit(wb, 0);  // no scaling
+  } else {
+    aom_wb_write_bit(wb, 1);  // scaling, write scale factor
+    assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN);
+    assert(cm->superres_scale_denominator <
+           SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS));
+    aom_wb_write_literal(
+        wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN,
+        SUPERRES_SCALE_BITS);
+  }
+}
+
+static AOM_INLINE void write_frame_size(const AV1_COMMON *cm,
+                                        int frame_size_override,
+                                        struct aom_write_bit_buffer *wb) {
+  const int coded_width = cm->superres_upscaled_width - 1;
+  const int coded_height = cm->superres_upscaled_height - 1;
+
+  if (frame_size_override) {
+    const SequenceHeader *seq_params = &cm->seq_params;
+    int num_bits_width = seq_params->num_bits_width;
+    int num_bits_height = seq_params->num_bits_height;
+    aom_wb_write_literal(wb, coded_width, num_bits_width);
+    aom_wb_write_literal(wb, coded_height, num_bits_height);
+  }
+
+  write_superres_scale(cm, wb);
+  write_render_size(cm, wb);
+}
+
+static AOM_INLINE void write_frame_size_with_refs(
+    const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
+  int found = 0;
+
+  MV_REFERENCE_FRAME ref_frame;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame);
+
+    if (cfg != NULL) {
+      found = cm->superres_upscaled_width == cfg->y_crop_width &&
+              cm->superres_upscaled_height == cfg->y_crop_height;
+      found &= cm->render_width == cfg->render_width &&
+               cm->render_height == cfg->render_height;
+    }
+    aom_wb_write_bit(wb, found);
+    if (found) {
+      write_superres_scale(cm, wb);
+      break;
+    }
+  }
+
+  if (!found) {
+    int frame_size_override = 1;  // Always equal to 1 in this function
+    write_frame_size(cm, frame_size_override, wb);
+  }
+}
+
+static AOM_INLINE void write_profile(BITSTREAM_PROFILE profile,
+                                     struct aom_write_bit_buffer *wb) {
+  assert(profile >= PROFILE_0 && profile < MAX_PROFILES);
+  aom_wb_write_literal(wb, profile, PROFILE_BITS);
+}
+
+static AOM_INLINE void write_bitdepth(const SequenceHeader *const seq_params,
+                                      struct aom_write_bit_buffer *wb) {
+  // Profile 0/1: [0] for 8-bit, [1] for 10-bit
+  // Profile 2: [0] for 8-bit, [10] for 10-bit, [11] for 12-bit
+  aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_8 ?
0 : 1); + if (seq_params->profile == PROFILE_2 && seq_params->bit_depth != AOM_BITS_8) { + aom_wb_write_bit(wb, seq_params->bit_depth == AOM_BITS_10 ? 0 : 1); + } +} + +static AOM_INLINE void write_color_config( + const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { + write_bitdepth(seq_params, wb); + const int is_monochrome = seq_params->monochrome; + // monochrome bit + if (seq_params->profile != PROFILE_1) + aom_wb_write_bit(wb, is_monochrome); + else + assert(!is_monochrome); + if (seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED && + seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED && + seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED) { + aom_wb_write_bit(wb, 0); // No color description present + } else { + aom_wb_write_bit(wb, 1); // Color description present + aom_wb_write_literal(wb, seq_params->color_primaries, 8); + aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8); + aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8); + } + if (is_monochrome) { + // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] + aom_wb_write_bit(wb, seq_params->color_range); + return; + } + if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + assert(seq_params->profile == PROFILE_1 || + (seq_params->profile == PROFILE_2 && + seq_params->bit_depth == AOM_BITS_12)); + } else { + // 0: [16, 235] (i.e. xvYCC), 1: [0, 255] + aom_wb_write_bit(wb, seq_params->color_range); + if (seq_params->profile == PROFILE_0) { + // 420 only + assert(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1); + } else if (seq_params->profile == PROFILE_1) { + // 444 only + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + } else if (seq_params->profile == PROFILE_2) { + if (seq_params->bit_depth == AOM_BITS_12) { + // 420, 444 or 422 + aom_wb_write_bit(wb, seq_params->subsampling_x); + if (seq_params->subsampling_x == 0) { + assert(seq_params->subsampling_y == 0 && + "4:4:0 subsampling not allowed in AV1"); + } else { + aom_wb_write_bit(wb, seq_params->subsampling_y); + } + } else { + // 422 only + assert(seq_params->subsampling_x == 1 && + seq_params->subsampling_y == 0); + } + } + if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0); + } + if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) { + aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2); + } + } + aom_wb_write_bit(wb, seq_params->separate_uv_delta_q); +} + +static AOM_INLINE void write_timing_info_header( + const aom_timing_info_t *const timing_info, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal(wb, timing_info->num_units_in_display_tick, 32); + aom_wb_write_unsigned_literal(wb, timing_info->time_scale, 32); + aom_wb_write_bit(wb, timing_info->equal_picture_interval); + if (timing_info->equal_picture_interval) { + aom_wb_write_uvlc(wb, timing_info->num_ticks_per_picture - 1); + } +} + +static AOM_INLINE void write_decoder_model_info( + const aom_dec_model_info_t *const decoder_model_info, + struct aom_write_bit_buffer *wb) { + aom_wb_write_literal( + wb, decoder_model_info->encoder_decoder_buffer_delay_length - 1, 5); + aom_wb_write_unsigned_literal( + wb, decoder_model_info->num_units_in_decoding_tick, 32); + 
aom_wb_write_literal(wb, decoder_model_info->buffer_removal_time_length - 1, + 5); + aom_wb_write_literal( + wb, decoder_model_info->frame_presentation_time_length - 1, 5); +} + +static AOM_INLINE void write_dec_model_op_parameters( + const aom_dec_model_op_parameters_t *op_params, int buffer_delay_length, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal(wb, op_params->decoder_buffer_delay, + buffer_delay_length); + aom_wb_write_unsigned_literal(wb, op_params->encoder_buffer_delay, + buffer_delay_length); + aom_wb_write_bit(wb, op_params->low_delay_mode_flag); +} + +static AOM_INLINE void write_tu_pts_info(AV1_COMMON *const cm, + struct aom_write_bit_buffer *wb) { + aom_wb_write_unsigned_literal( + wb, cm->frame_presentation_time, + cm->seq_params.decoder_model_info.frame_presentation_time_length); +} + +static AOM_INLINE void write_film_grain_params( + const AV1_COMP *const cpi, struct aom_write_bit_buffer *wb) { + const AV1_COMMON *const cm = &cpi->common; + const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params; + + aom_wb_write_bit(wb, pars->apply_grain); + if (!pars->apply_grain) return; + + aom_wb_write_literal(wb, pars->random_seed, 16); + + if (cm->current_frame.frame_type == INTER_FRAME) + aom_wb_write_bit(wb, pars->update_parameters); + + if (!pars->update_parameters) { + int ref_frame, ref_idx; + for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) { + ref_idx = get_ref_frame_map_idx(cm, ref_frame); + assert(ref_idx != INVALID_IDX); + const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx]; + if (buf->film_grain_params_present && + av1_check_grain_params_equiv(pars, &buf->film_grain_params)) { + break; + } + } + assert(ref_frame < REF_FRAMES); + aom_wb_write_literal(wb, ref_idx, 3); + return; + } + + // Scaling function parameters + aom_wb_write_literal(wb, pars->num_y_points, 4); // max 14 + for (int i = 0; i < pars->num_y_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8); + } + + if (!cm->seq_params.monochrome) { + aom_wb_write_bit(wb, pars->chroma_scaling_from_luma); + } else { + assert(!pars->chroma_scaling_from_luma); + } + + if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma || + ((cm->seq_params.subsampling_x == 1) && + (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) { + assert(pars->num_cb_points == 0 && pars->num_cr_points == 0); + } else { + aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10 + for (int i = 0; i < pars->num_cb_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8); + } + + aom_wb_write_literal(wb, pars->num_cr_points, 4); // max 10 + for (int i = 0; i < pars->num_cr_points; i++) { + aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8); + aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8); + } + } + + aom_wb_write_literal(wb, pars->scaling_shift - 8, 2); // 8 + value + + // AR coefficients + // Only sent if the corresponding scaling function has + // more than 0 points + + aom_wb_write_literal(wb, pars->ar_coeff_lag, 2); + + int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (pars->num_y_points > 0) ++num_pos_chroma; + + if (pars->num_y_points) + for (int i = 0; i < num_pos_luma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8); + + if (pars->num_cb_points || pars->chroma_scaling_from_luma) + for (int i = 0;
i < num_pos_chroma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8); + + if (pars->num_cr_points || pars->chroma_scaling_from_luma) + for (int i = 0; i < num_pos_chroma; i++) + aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8); + + aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2); // 6 + value + + aom_wb_write_literal(wb, pars->grain_scale_shift, 2); + + if (pars->num_cb_points) { + aom_wb_write_literal(wb, pars->cb_mult, 8); + aom_wb_write_literal(wb, pars->cb_luma_mult, 8); + aom_wb_write_literal(wb, pars->cb_offset, 9); + } + + if (pars->num_cr_points) { + aom_wb_write_literal(wb, pars->cr_mult, 8); + aom_wb_write_literal(wb, pars->cr_luma_mult, 8); + aom_wb_write_literal(wb, pars->cr_offset, 9); + } + + aom_wb_write_bit(wb, pars->overlap_flag); + + aom_wb_write_bit(wb, pars->clip_to_restricted_range); +} + +static AOM_INLINE void write_sb_size(const SequenceHeader *const seq_params, + struct aom_write_bit_buffer *wb) { + (void)seq_params; + (void)wb; + assert(seq_params->mib_size == mi_size_wide[seq_params->sb_size]); + assert(seq_params->mib_size == 1 << seq_params->mib_size_log2); + assert(seq_params->sb_size == BLOCK_128X128 || + seq_params->sb_size == BLOCK_64X64); + aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0); +} + +static AOM_INLINE void write_sequence_header( + const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { + aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4); + aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4); + aom_wb_write_literal(wb, seq_params->max_frame_width - 1, + seq_params->num_bits_width); + aom_wb_write_literal(wb, seq_params->max_frame_height - 1, + seq_params->num_bits_height); + + if (!seq_params->reduced_still_picture_hdr) { + aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag); + if (seq_params->frame_id_numbers_present_flag) { + // We must always have delta_frame_id_length < frame_id_length, + // in order for a frame to be referenced with a unique delta. + // Avoid wasting bits by using a coding that enforces this restriction.
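+      // Example: delta_frame_id_length = 14 is coded as 14 - 2 = 12 in 4 bits,
+      // and frame_id_length = 15 as 15 - 14 - 1 = 0 in 3 bits, so any pair a
+      // decoder can parse already satisfies the restriction.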
+ aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4); + aom_wb_write_literal( + wb, + seq_params->frame_id_length - seq_params->delta_frame_id_length - 1, + 3); + } + } + + write_sb_size(seq_params, wb); + + aom_wb_write_bit(wb, seq_params->enable_filter_intra); + aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter); + + if (!seq_params->reduced_still_picture_hdr) { + aom_wb_write_bit(wb, seq_params->enable_interintra_compound); + aom_wb_write_bit(wb, seq_params->enable_masked_compound); + aom_wb_write_bit(wb, seq_params->enable_warped_motion); + aom_wb_write_bit(wb, seq_params->enable_dual_filter); + + aom_wb_write_bit(wb, seq_params->order_hint_info.enable_order_hint); + + if (seq_params->order_hint_info.enable_order_hint) { + aom_wb_write_bit(wb, seq_params->order_hint_info.enable_dist_wtd_comp); + aom_wb_write_bit(wb, seq_params->order_hint_info.enable_ref_frame_mvs); + } + if (seq_params->force_screen_content_tools == 2) { + aom_wb_write_bit(wb, 1); + } else { + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, seq_params->force_screen_content_tools); + } + if (seq_params->force_screen_content_tools > 0) { + if (seq_params->force_integer_mv == 2) { + aom_wb_write_bit(wb, 1); + } else { + aom_wb_write_bit(wb, 0); + aom_wb_write_bit(wb, seq_params->force_integer_mv); + } + } else { + assert(seq_params->force_integer_mv == 2); + } + if (seq_params->order_hint_info.enable_order_hint) + aom_wb_write_literal( + wb, seq_params->order_hint_info.order_hint_bits_minus_1, 3); + } + + aom_wb_write_bit(wb, seq_params->enable_superres); + aom_wb_write_bit(wb, seq_params->enable_cdef); + aom_wb_write_bit(wb, seq_params->enable_restoration); +} + +static AOM_INLINE void write_global_motion_params( + const WarpedMotionParams *params, const WarpedMotionParams *ref_params, + struct aom_write_bit_buffer *wb, int allow_hp) { + const TransformationType type = params->wmtype; + + aom_wb_write_bit(wb, type != IDENTITY); + if (type != IDENTITY) { + aom_wb_write_bit(wb, type == ROTZOOM); + if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION); + } + + if (type >= ROTZOOM) { + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + } + + if (type >= AFFINE) { + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (params->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + } + + if (type >= TRANSLATION) { + const int trans_bits = (type == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + const int trans_prec_diff = (type == TRANSLATION) + ? 
GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + aom_wb_write_signed_primitive_refsubexpfin( + wb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[0] >> trans_prec_diff), + (params->wmmat[0] >> trans_prec_diff)); + aom_wb_write_signed_primitive_refsubexpfin( + wb, (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_params->wmmat[1] >> trans_prec_diff), + (params->wmmat[1] >> trans_prec_diff)); + } +} + +static AOM_INLINE void write_global_motion(AV1_COMP *cpi, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + int frame; + for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { + const WarpedMotionParams *ref_params = + cm->prev_frame ? &cm->prev_frame->global_motion[frame] + : &default_warp_params; + write_global_motion_params(&cm->global_motion[frame], ref_params, wb, + cm->features.allow_high_precision_mv); + // TODO(sarahparker, debargha): The logic in the commented out code below + // does not work currently and causes mismatches when resize is on. + // Fix it before turning the optimization back on. + /* + YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_yv12_buf(cpi, frame); + if (cpi->source->y_crop_width == ref_buf->y_crop_width && + cpi->source->y_crop_height == ref_buf->y_crop_height) { + write_global_motion_params(&cm->global_motion[frame], + &cm->prev_frame->global_motion[frame], wb, + cm->features.allow_high_precision_mv); + } else { + assert(cm->global_motion[frame].wmtype == IDENTITY && + "Invalid warp type for frames of different resolutions"); + } + */ + /* + printf("Frame %d/%d: Enc Ref %d: %d %d %d %d\n", + cm->current_frame.frame_number, cm->show_frame, frame, + cm->global_motion[frame].wmmat[0], + cm->global_motion[frame].wmmat[1], cm->global_motion[frame].wmmat[2], + cm->global_motion[frame].wmmat[3]); + */ + } +} + +static int check_frame_refs_short_signaling(AV1_COMMON *const cm) { + // Check whether all references are distinct frames. + const RefCntBuffer *seen_bufs[FRAME_BUFFERS] = { NULL }; + int num_refs = 0; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + int seen = 0; + for (int i = 0; i < num_refs; i++) { + if (seen_bufs[i] == buf) { + seen = 1; + break; + } + } + if (!seen) seen_bufs[num_refs++] = buf; + } + } + + // We only turn on frame_refs_short_signaling when all references are + // distinct. + if (num_refs < INTER_REFS_PER_FRAME) { + // This indicates that more than one reference frame points to the same + // reference buffer, i.e. two or more references are duplicates. + return 0; + } + + // Check whether the encoder side ref frame choices are aligned with those + // to be derived at the decoder side. + int remapped_ref_idx_decoder[REF_FRAMES]; + + const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME); + const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + + // Set up the frame refs mapping indexes according to the + // frame_refs_short_signaling policy. + av1_set_frame_refs(cm, remapped_ref_idx_decoder, lst_map_idx, gld_map_idx); + + // We only turn on frame_refs_short_signaling when the encoder side decision + // on ref frames is identical to that at the decoder side. + int frame_refs_short_signaling = 1; + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) { + // Compare the buffer index between two reference frames indexed + // respectively by the encoder and the decoder side decisions.
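+    // Any mismatch below means the decoder-derived mapping would differ from
+    // the encoder's choice, so short signaling must stay off for this frame.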
+ RefCntBuffer *ref_frame_buf_new = NULL; + if (remapped_ref_idx_decoder[ref_idx] != INVALID_IDX) { + ref_frame_buf_new = cm->ref_frame_map[remapped_ref_idx_decoder[ref_idx]]; + } + if (get_ref_frame_buf(cm, LAST_FRAME + ref_idx) != ref_frame_buf_new) { + frame_refs_short_signaling = 0; + break; + } + } + +#if 0 // For debug + printf("\nFrame=%d: \n", cm->current_frame.frame_number); + printf("***frame_refs_short_signaling=%d\n", frame_refs_short_signaling); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + printf("enc_ref(map_idx=%d)=%d, vs. " + "dec_ref(map_idx=%d)=%d\n", + get_ref_frame_map_idx(cm, ref_frame), ref_frame, + cm->remapped_ref_idx[ref_frame - LAST_FRAME], + ref_frame); + } +#endif // 0 + + return frame_refs_short_signaling; +} + +// New function based on HLS R18 +static AOM_INLINE void write_uncompressed_header_obu( + AV1_COMP *cpi, struct aom_write_bit_buffer *saved_wb, + struct aom_write_bit_buffer *wb) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + const CommonQuantParams *quant_params = &cm->quant_params; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + CurrentFrame *const current_frame = &cm->current_frame; + FeatureFlags *const features = &cm->features; + + current_frame->frame_refs_short_signaling = 0; + + if (seq_params->still_picture) { + assert(cm->show_existing_frame == 0); + assert(cm->show_frame == 1); + assert(current_frame->frame_type == KEY_FRAME); + } + if (!seq_params->reduced_still_picture_hdr) { + if (encode_show_existing_frame(cm)) { + aom_wb_write_bit(wb, 1); // show_existing_frame + aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); + + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) { + write_tu_pts_info(cm, wb); + } + if (seq_params->frame_id_numbers_present_flag) { + int frame_id_len = seq_params->frame_id_length; + int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; + aom_wb_write_literal(wb, display_frame_id, frame_id_len); + } + return; + } else { + aom_wb_write_bit(wb, 0); // show_existing_frame + } + + aom_wb_write_literal(wb, current_frame->frame_type, 2); + + aom_wb_write_bit(wb, cm->show_frame); + if (cm->show_frame) { + if (seq_params->decoder_model_info_present_flag && + seq_params->timing_info.equal_picture_interval == 0) + write_tu_pts_info(cm, wb); + } else { + aom_wb_write_bit(wb, cm->showable_frame); + } + if (frame_is_sframe(cm)) { + assert(features->error_resilient_mode); + } else if (!(current_frame->frame_type == KEY_FRAME && cm->show_frame)) { + aom_wb_write_bit(wb, features->error_resilient_mode); + } + } + aom_wb_write_bit(wb, features->disable_cdf_update); + + if (seq_params->force_screen_content_tools == 2) { + aom_wb_write_bit(wb, features->allow_screen_content_tools); + } else { + assert(features->allow_screen_content_tools == + seq_params->force_screen_content_tools); + } + + if (features->allow_screen_content_tools) { + if (seq_params->force_integer_mv == 2) { + aom_wb_write_bit(wb, features->cur_frame_force_integer_mv); + } else { + assert(features->cur_frame_force_integer_mv == + seq_params->force_integer_mv); + } + } else { + assert(features->cur_frame_force_integer_mv == 0); + } + + int frame_size_override_flag = 0; + + if (seq_params->reduced_still_picture_hdr) { + assert(cm->superres_upscaled_width == seq_params->max_frame_width && + cm->superres_upscaled_height == seq_params->max_frame_height); + } else { + if 
(seq_params->frame_id_numbers_present_flag) { + int frame_id_len = seq_params->frame_id_length; + aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); + } + + if (cm->superres_upscaled_width > seq_params->max_frame_width || + cm->superres_upscaled_height > seq_params->max_frame_height) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Frame dimensions are larger than the maximum values"); + } + + frame_size_override_flag = + frame_is_sframe(cm) + ? 1 + : (cm->superres_upscaled_width != seq_params->max_frame_width || + cm->superres_upscaled_height != seq_params->max_frame_height); + if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag); + + if (seq_params->order_hint_info.enable_order_hint) + aom_wb_write_literal( + wb, current_frame->order_hint, + seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + + if (!features->error_resilient_mode && !frame_is_intra_only(cm)) { + aom_wb_write_literal(wb, features->primary_ref_frame, PRIMARY_REF_BITS); + } + } + + if (seq_params->decoder_model_info_present_flag) { + aom_wb_write_bit(wb, cm->buffer_removal_time_present); + if (cm->buffer_removal_time_present) { + for (int op_num = 0; + op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) { + if (seq_params->op_params[op_num].decoder_model_param_present_flag) { + if (((seq_params->operating_point_idc[op_num] >> + cm->temporal_layer_id) & + 0x1 && + (seq_params->operating_point_idc[op_num] >> + (cm->spatial_layer_id + 8)) & + 0x1) || + seq_params->operating_point_idc[op_num] == 0) { + aom_wb_write_unsigned_literal( + wb, cm->buffer_removal_times[op_num], + seq_params->decoder_model_info.buffer_removal_time_length); + cm->buffer_removal_times[op_num]++; + if (cm->buffer_removal_times[op_num] == 0) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "buffer_removal_time overflowed"); + } + } + } + } + } + } + + // Shown keyframes and switch-frames automatically refresh all reference + // frames. For all other frame types, we need to write refresh_frame_flags. + if ((current_frame->frame_type == KEY_FRAME && !cm->show_frame) || + current_frame->frame_type == INTER_FRAME || + current_frame->frame_type == INTRA_ONLY_FRAME) + aom_wb_write_literal(wb, current_frame->refresh_frame_flags, REF_FRAMES); + + if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xff) { + // Write all ref frame order hints if error_resilient_mode == 1 + if (features->error_resilient_mode && + seq_params->order_hint_info.enable_order_hint) { + for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { + aom_wb_write_literal( + wb, cm->ref_frame_map[ref_idx]->order_hint, + seq_params->order_hint_info.order_hint_bits_minus_1 + 1); + } + } + } + + if (current_frame->frame_type == KEY_FRAME) { + write_frame_size(cm, frame_size_override_flag, wb); + assert(!av1_superres_scaled(cm) || !features->allow_intrabc); + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + aom_wb_write_bit(wb, features->allow_intrabc); + } else { + if (current_frame->frame_type == INTRA_ONLY_FRAME) { + write_frame_size(cm, frame_size_override_flag, wb); + assert(!av1_superres_scaled(cm) || !features->allow_intrabc); + if (features->allow_screen_content_tools && !av1_superres_scaled(cm)) + aom_wb_write_bit(wb, features->allow_intrabc); + } else if (current_frame->frame_type == INTER_FRAME || + frame_is_sframe(cm)) { + MV_REFERENCE_FRAME ref_frame; + + // NOTE: Error resilient mode turns off frame_refs_short_signaling + // automatically.
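+      // The short-signaling path is compiled out by default; set the macro
+      // below to 1 to re-enable the experiment.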
+#define FRAME_REFS_SHORT_SIGNALING 0 +#if FRAME_REFS_SHORT_SIGNALING + current_frame->frame_refs_short_signaling = + seq_params->order_hint_info.enable_order_hint; +#endif // FRAME_REFS_SHORT_SIGNALING + + if (current_frame->frame_refs_short_signaling) { + // NOTE(zoeliu@google.com): + // An example solution for encoder-side implementation on frame refs + // short signaling, which is only turned on when the encoder side + // decision on ref frames is identical to that at the decoder side. + current_frame->frame_refs_short_signaling = + check_frame_refs_short_signaling(cm); + } + + if (seq_params->order_hint_info.enable_order_hint) + aom_wb_write_bit(wb, current_frame->frame_refs_short_signaling); + + if (current_frame->frame_refs_short_signaling) { + const int lst_ref = get_ref_frame_map_idx(cm, LAST_FRAME); + aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2); + + const int gld_ref = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2); + } + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + assert(get_ref_frame_map_idx(cm, ref_frame) != INVALID_IDX); + if (!current_frame->frame_refs_short_signaling) + aom_wb_write_literal(wb, get_ref_frame_map_idx(cm, ref_frame), + REF_FRAMES_LOG2); + if (seq_params->frame_id_numbers_present_flag) { + int i = get_ref_frame_map_idx(cm, ref_frame); + int frame_id_len = seq_params->frame_id_length; + int diff_len = seq_params->delta_frame_id_length; + int delta_frame_id_minus_1 = + ((cm->current_frame_id - cm->ref_frame_id[i] + + (1 << frame_id_len)) % + (1 << frame_id_len)) - + 1; + if (delta_frame_id_minus_1 < 0 || + delta_frame_id_minus_1 >= (1 << diff_len)) { + aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + "Invalid delta_frame_id_minus_1"); + } + aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len); + } + } + + if (!features->error_resilient_mode && frame_size_override_flag) { + write_frame_size_with_refs(cm, wb); + } else { + write_frame_size(cm, frame_size_override_flag, wb); + } + + if (!features->cur_frame_force_integer_mv) + aom_wb_write_bit(wb, features->allow_high_precision_mv); + write_frame_interp_filter(features->interp_filter, wb); + aom_wb_write_bit(wb, features->switchable_motion_mode); + if (frame_might_allow_ref_frame_mvs(cm)) { + aom_wb_write_bit(wb, features->allow_ref_frame_mvs); + } else { + assert(features->allow_ref_frame_mvs == 0); + } + } + } + + const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) && + !(features->disable_cdf_update); + if (cm->tiles.large_scale) + assert(features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); + + if (might_bwd_adapt) { + aom_wb_write_bit( + wb, features->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); + } + + write_tile_info(cm, saved_wb, wb); + encode_quantization(quant_params, av1_num_planes(cm), + cm->seq_params.separate_uv_delta_q, wb); + encode_segmentation(cm, xd, wb); + + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0); + if (quant_params->base_qindex > 0) { + aom_wb_write_bit(wb, delta_q_info->delta_q_present_flag); + if (delta_q_info->delta_q_present_flag) { + aom_wb_write_literal(wb, get_msb(delta_q_info->delta_q_res), 2); + xd->current_qindex = quant_params->base_qindex; + if (features->allow_intrabc) + assert(delta_q_info->delta_lf_present_flag == 0); + else + aom_wb_write_bit(wb, delta_q_info->delta_lf_present_flag); + if (delta_q_info->delta_lf_present_flag) { + 
aom_wb_write_literal(wb, get_msb(delta_q_info->delta_lf_res), 2); + aom_wb_write_bit(wb, delta_q_info->delta_lf_multi); + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + } + } + } + + if (features->all_lossless) { + assert(!av1_superres_scaled(cm)); + } else { + if (!features->coded_lossless) { + encode_loopfilter(cm, wb); + encode_cdef(cm, wb); + } + encode_restoration_mode(cm, wb); + } + + // Write TX mode + if (features->coded_lossless) + assert(features->tx_mode == ONLY_4X4); + else + aom_wb_write_bit(wb, features->tx_mode == TX_MODE_SELECT); + + if (!frame_is_intra_only(cm)) { + const int use_hybrid_pred = + current_frame->reference_mode == REFERENCE_MODE_SELECT; + + aom_wb_write_bit(wb, use_hybrid_pred); + } + + if (current_frame->skip_mode_info.skip_mode_allowed) + aom_wb_write_bit(wb, current_frame->skip_mode_info.skip_mode_flag); + + if (frame_might_allow_warped_motion(cm)) + aom_wb_write_bit(wb, features->allow_warped_motion); + else + assert(!features->allow_warped_motion); + + aom_wb_write_bit(wb, features->reduced_tx_set_used); + + if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); + + if (seq_params->film_grain_params_present && + (cm->show_frame || cm->showable_frame)) + write_film_grain_params(cpi, wb); + + if (cm->tiles.large_scale) write_ext_tile_info(cm, saved_wb, wb); +} + +static int choose_size_bytes(uint32_t size, int spare_msbs) { + // Choose the number of bytes required to represent size, without + // using the 'spare_msbs' number of most significant bits. + + // Make sure we will fit in 4 bytes to start with. + if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1; + + // Normalise to 32 bits + size <<= spare_msbs; + + if (size >> 24 != 0) + return 4; + else if (size >> 16 != 0) + return 3; + else if (size >> 8 != 0) + return 2; + else + return 1; +} + +static AOM_INLINE void mem_put_varsize(uint8_t *const dst, const int sz, + const int val) { + switch (sz) { + case 1: dst[0] = (uint8_t)(val & 0xff); break; + case 2: mem_put_le16(dst, val); break; + case 3: mem_put_le24(dst, val); break; + case 4: mem_put_le32(dst, val); break; + default: assert(0 && "Invalid size"); break; + } +} + +static int remux_tiles(const CommonTileParams *const tiles, uint8_t *dst, + const uint32_t data_size, const uint32_t max_tile_size, + const uint32_t max_tile_col_size, + int *const tile_size_bytes, + int *const tile_col_size_bytes) { + // Choose the tile size bytes (tsb) and tile column size bytes (tcsb) + int tsb; + int tcsb; + + if (tiles->large_scale) { + // The top bit in the tile size field indicates tile copy mode, so we + // have 1 less bit to code the tile size + tsb = choose_size_bytes(max_tile_size, 1); + tcsb = choose_size_bytes(max_tile_col_size, 0); + } else { + tsb = choose_size_bytes(max_tile_size, 0); + tcsb = 4; // This is ignored + (void)max_tile_col_size; + } + + assert(tsb > 0); + assert(tcsb > 0); + + *tile_size_bytes = tsb; + *tile_col_size_bytes = tcsb; + if (tsb == 4 && tcsb == 4) return data_size; + + uint32_t wpos = 0; + uint32_t rpos = 0; + + if (tiles->large_scale) { + int tile_row; + int tile_col; + + for (tile_col = 0; tile_col < tiles->cols; tile_col++) { + // Every column but the last has a column header + if (tile_col < tiles->cols - 1) { + uint32_t tile_col_size = mem_get_le32(dst + rpos); + rpos += 4; + + // Adjust the tile column size by the number of bytes removed + // from the tile size fields.
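+        // Example: with tsb = 2 and 4 tile rows, each size field shrinks from
+        // 4 to 2 bytes, so the column size drops by (4 - 2) * 4 = 8 bytes.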
+ tile_col_size -= (4 - tsb) * tiles->rows; + + mem_put_varsize(dst + wpos, tcsb, tile_col_size); + wpos += tcsb; + } + + for (tile_row = 0; tile_row < tiles->rows; tile_row++) { + // Every row, including the last, has a header + uint32_t tile_header = mem_get_le32(dst + rpos); + rpos += 4; + + // If this is a copy tile, we need to shift the MSB to the + // top bit of the new, narrower size field, and there is no data to copy. + if (tile_header >> 31 != 0) { + if (tsb < 4) tile_header >>= 32 - 8 * tsb; + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; + } else { + mem_put_varsize(dst + wpos, tsb, tile_header); + wpos += tsb; + + tile_header += AV1_MIN_TILE_SIZE_BYTES; + memmove(dst + wpos, dst + rpos, tile_header); + rpos += tile_header; + wpos += tile_header; + } + } + } + + assert(rpos > wpos); + assert(rpos == data_size); + + return wpos; + } + const int n_tiles = tiles->cols * tiles->rows; + int n; + + for (n = 0; n < n_tiles; n++) { + int tile_size; + + if (n == n_tiles - 1) { + tile_size = data_size - rpos; + } else { + tile_size = mem_get_le32(dst + rpos); + rpos += 4; + mem_put_varsize(dst + wpos, tsb, tile_size); + tile_size += AV1_MIN_TILE_SIZE_BYTES; + wpos += tsb; + } + + memmove(dst + wpos, dst + rpos, tile_size); + + rpos += tile_size; + wpos += tile_size; + } + + assert(rpos > wpos); + assert(rpos == data_size); + + return wpos; +} + +uint32_t av1_write_obu_header(AV1LevelParams *const level_params, + OBU_TYPE obu_type, int obu_extension, + uint8_t *const dst) { + if (level_params->keep_level_stats && + (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER)) + ++level_params->frame_header_count; + + struct aom_write_bit_buffer wb = { dst, 0 }; + uint32_t size = 0; + + aom_wb_write_literal(&wb, 0, 1); // forbidden bit. + aom_wb_write_literal(&wb, (int)obu_type, 4); + aom_wb_write_literal(&wb, obu_extension ?
1 : 0, 1); + aom_wb_write_literal(&wb, 1, 1); // obu_has_payload_length_field + aom_wb_write_literal(&wb, 0, 1); // reserved + + if (obu_extension) { + aom_wb_write_literal(&wb, obu_extension & 0xFF, 8); + } + + size = aom_wb_bytes_written(&wb); + return size; +} + +int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, + uint8_t *dest) { + const size_t offset = obu_header_size; + size_t coded_obu_size = 0; + const uint32_t obu_size = (uint32_t)obu_payload_size; + assert(obu_size == obu_payload_size); + + if (aom_uleb_encode(obu_size, sizeof(obu_size), dest + offset, + &coded_obu_size) != 0) { + return AOM_CODEC_ERROR; + } + + return AOM_CODEC_OK; +} + +static size_t obu_memmove(size_t obu_header_size, size_t obu_payload_size, + uint8_t *data) { + const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size); + const size_t move_dst_offset = length_field_size + obu_header_size; + const size_t move_src_offset = obu_header_size; + const size_t move_size = obu_payload_size; + memmove(data + move_dst_offset, data + move_src_offset, move_size); + return length_field_size; +} + +static AOM_INLINE void add_trailing_bits(struct aom_write_bit_buffer *wb) { + if (aom_wb_is_byte_aligned(wb)) { + aom_wb_write_literal(wb, 0x80, 8); + } else { + // assumes that the other bits are already 0s + aom_wb_write_bit(wb, 1); + } +} + +static AOM_INLINE void write_bitstream_level(AV1_LEVEL seq_level_idx, + struct aom_write_bit_buffer *wb) { + assert(is_valid_seq_level_idx(seq_level_idx)); + aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS); +} + +uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, + uint8_t *const dst) { + struct aom_write_bit_buffer wb = { dst, 0 }; + uint32_t size = 0; + + write_profile(seq_params->profile, &wb); + + // Still picture or not + aom_wb_write_bit(&wb, seq_params->still_picture); + assert(IMPLIES(!seq_params->still_picture, + !seq_params->reduced_still_picture_hdr)); + // whether to use reduced still picture header + aom_wb_write_bit(&wb, seq_params->reduced_still_picture_hdr); + + if (seq_params->reduced_still_picture_hdr) { + assert(seq_params->timing_info_present == 0); + assert(seq_params->decoder_model_info_present_flag == 0); + assert(seq_params->display_model_info_present_flag == 0); + write_bitstream_level(seq_params->seq_level_idx[0], &wb); + } else { + aom_wb_write_bit( + &wb, seq_params->timing_info_present); // timing info present flag + + if (seq_params->timing_info_present) { + // timing_info + write_timing_info_header(&seq_params->timing_info, &wb); + aom_wb_write_bit(&wb, seq_params->decoder_model_info_present_flag); + if (seq_params->decoder_model_info_present_flag) { + write_decoder_model_info(&seq_params->decoder_model_info, &wb); + } + } + aom_wb_write_bit(&wb, seq_params->display_model_info_present_flag); + aom_wb_write_literal(&wb, seq_params->operating_points_cnt_minus_1, + OP_POINTS_CNT_MINUS_1_BITS); + int i; + for (i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { + aom_wb_write_literal(&wb, seq_params->operating_point_idc[i], + OP_POINTS_IDC_BITS); + write_bitstream_level(seq_params->seq_level_idx[i], &wb); + if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0) + aom_wb_write_bit(&wb, seq_params->tier[i]); + if (seq_params->decoder_model_info_present_flag) { + aom_wb_write_bit( + &wb, seq_params->op_params[i].decoder_model_param_present_flag); + if (seq_params->op_params[i].decoder_model_param_present_flag) { + write_dec_model_op_parameters( + &seq_params->op_params[i], + 
seq_params->decoder_model_info + .encoder_decoder_buffer_delay_length, + &wb); + } + } + if (seq_params->display_model_info_present_flag) { + aom_wb_write_bit( + &wb, seq_params->op_params[i].display_model_param_present_flag); + if (seq_params->op_params[i].display_model_param_present_flag) { + assert(seq_params->op_params[i].initial_display_delay <= 10); + aom_wb_write_literal( + &wb, seq_params->op_params[i].initial_display_delay - 1, 4); + } + } + } + } + write_sequence_header(seq_params, &wb); + + write_color_config(seq_params, &wb); + + aom_wb_write_bit(&wb, seq_params->film_grain_params_present); + + add_trailing_bits(&wb); + + size = aom_wb_bytes_written(&wb); + return size; +} + +static uint32_t write_frame_header_obu(AV1_COMP *cpi, + struct aom_write_bit_buffer *saved_wb, + uint8_t *const dst, + int append_trailing_bits) { + struct aom_write_bit_buffer wb = { dst, 0 }; + write_uncompressed_header_obu(cpi, saved_wb, &wb); + if (append_trailing_bits) add_trailing_bits(&wb); + return aom_wb_bytes_written(&wb); +} + +static uint32_t write_tile_group_header(uint8_t *const dst, int start_tile, + int end_tile, int tiles_log2, + int tile_start_and_end_present_flag) { + struct aom_write_bit_buffer wb = { dst, 0 }; + uint32_t size = 0; + + if (!tiles_log2) return size; + + aom_wb_write_bit(&wb, tile_start_and_end_present_flag); + + if (tile_start_and_end_present_flag) { + aom_wb_write_literal(&wb, start_tile, tiles_log2); + aom_wb_write_literal(&wb, end_tile, tiles_log2); + } + + size = aom_wb_bytes_written(&wb); + return size; +} + +typedef struct { + uint8_t *frame_header; + size_t obu_header_byte_offset; + size_t total_length; +} FrameHeaderInfo; + +extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size, + const char *filename); + +static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, + struct aom_write_bit_buffer *saved_wb, + uint8_t obu_extension_header, + const FrameHeaderInfo *fh_info, + int *const largest_tile_id) { + AV1_COMMON *const cm = &cpi->common; + const CommonTileParams *const tiles = &cm->tiles; + AV1LevelParams *const level_params = &cpi->level_params; + aom_writer mode_bc; + int tile_row, tile_col; + // Store the location and size of each tile's data in the bitstream: + TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; + uint32_t total_size = 0; + const int tile_cols = tiles->cols; + const int tile_rows = tiles->rows; + unsigned int tile_size = 0; + unsigned int max_tile_size = 0; + unsigned int max_tile_col_size = 0; + const int n_log2_tiles = tiles->log2_rows + tiles->log2_cols; + // Fixed size tile groups for the moment + const int num_tg_hdrs = cpi->num_tg; + const int tg_size = + (tiles->large_scale) + ? 1 + : (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs; + int tile_count = 0; + int curr_tg_data_size = 0; + uint8_t *data = dst; + int new_tg = 1; + const int have_tiles = tile_cols * tile_rows > 1; + int first_tg = 1; + + *largest_tile_id = 0; + + if (tiles->large_scale) { + // For large_scale_tile case, we always have only one tile group, so it can + // be written as an OBU_FRAME. + const OBU_TYPE obu_type = OBU_FRAME; + const uint32_t tg_hdr_size = + av1_write_obu_header(level_params, obu_type, 0, data); + data += tg_hdr_size; + + const uint32_t frame_header_size = + write_frame_header_obu(cpi, saved_wb, data, 0); + data += frame_header_size; + total_size += frame_header_size; + + // (yunqing) This test ensures the correctness of large scale tile coding. 
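+    // With ext_tile_debug on, the uncompressed frame header is dumped to
+    // ./fhNNN (NNN = frame number) for offline inspection.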
+ if (cpi->oxcf.ext_tile_debug) { + char fn[20] = "./fh"; + fn[4] = cm->current_frame.frame_number / 100 + '0'; + fn[5] = (cm->current_frame.frame_number % 100) / 10 + '0'; + fn[6] = (cm->current_frame.frame_number % 10) + '0'; + fn[7] = '\0'; + av1_print_uncompressed_frame_header(data - frame_header_size, + frame_header_size, fn); + } + + int tile_size_bytes = 0; + int tile_col_size_bytes = 0; + + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileInfo tile_info; + const int is_last_col = (tile_col == tile_cols - 1); + const uint32_t col_offset = total_size; + + av1_tile_set_col(&tile_info, cm, tile_col); + + // The last column does not have a column header + if (!is_last_col) total_size += 4; + + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; + const int data_offset = have_tiles ? 4 : 0; + const int tile_idx = tile_row * tile_cols + tile_col; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + av1_tile_set_row(&tile_info, cm, tile_row); + + buf->data = dst + total_size + tg_hdr_size; + + // If CONFIG_EXT_TILE = 1, every tile in the row has a header, + // even for the last one, unless no tiling is used at all. + total_size += data_offset; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; + mode_bc.allow_update_cdf = !tiles->large_scale; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; + aom_start_encode(&mode_bc, buf->data + data_offset); + write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col); + aom_stop_encode(&mode_bc); + tile_size = mode_bc.pos; + buf->size = tile_size; + + // Record the maximum tile size we see, so we can compact headers later. + if (tile_size > max_tile_size) { + max_tile_size = tile_size; + *largest_tile_id = tile_cols * tile_row + tile_col; + } + + if (have_tiles) { + // tile header: size of this tile, or copy offset + uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES; + const int tile_copy_mode = + ((AOMMAX(tiles->width, tiles->height) << MI_SIZE_LOG2) <= 256) + ? 1 + : 0; + + // If tile_copy_mode = 1, check if this tile is a copy tile. + // Copy tiles are very unlikely on key frames, so skip the search + // there to avoid unnecessary work. + if (cm->current_frame.frame_type != KEY_FRAME && tile_copy_mode) { + const int identical_tile_offset = + find_identical_tile(tile_row, tile_col, tile_buffers); + + // Indicate a copy-tile by setting the most significant bit. + // The row-offset to copy from is stored in the highest byte. + // remux_tiles will move these around later + if (identical_tile_offset > 0) { + tile_size = 0; + tile_header = identical_tile_offset | 0x80; + tile_header <<= 24; + } + } + + mem_put_le32(buf->data, tile_header); + } + + total_size += tile_size; + } + + if (!is_last_col) { + uint32_t col_size = total_size - col_offset - 4; + mem_put_le32(dst + col_offset + tg_hdr_size, col_size); + + // Record the maximum tile column size we see. + max_tile_col_size = AOMMAX(max_tile_col_size, col_size); + } + } + + if (have_tiles) { + total_size = remux_tiles(tiles, data, total_size - frame_header_size, + max_tile_size, max_tile_col_size, + &tile_size_bytes, &tile_col_size_bytes); + total_size += frame_header_size; + } + + // In EXT_TILE case, only use 1 tile group. Follow the obu syntax, write the + // current tile group size before the tile data (including tile column headers). + // Tile group size doesn't include the bytes storing tg size.
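+  // obu_memmove() shifts the payload right by the size of the uleb-encoded
+  // length field, which is then written between the OBU header and the payload.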
+ total_size += tg_hdr_size; + const uint32_t obu_payload_size = total_size - tg_hdr_size; + const size_t length_field_size = + obu_memmove(tg_hdr_size, obu_payload_size, dst); + if (av1_write_uleb_obu_size(tg_hdr_size, obu_payload_size, dst) != + AOM_CODEC_OK) { + assert(0); + } + total_size += (uint32_t)length_field_size; + saved_wb->bit_buffer += length_field_size; + + // Now fill in the gaps in the uncompressed header. + if (have_tiles) { + assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4); + aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2); + + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); + aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); + } + return total_size; + } + + uint32_t obu_header_size = 0; + uint8_t *tile_data_start = dst + total_size; + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, tile_row); + + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + const int tile_idx = tile_row * tile_cols + tile_col; + TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col]; + TileDataEnc *this_tile = &cpi->tile_data[tile_idx]; + int is_last_tile_in_tg = 0; + + if (new_tg) { + data = dst + total_size; + + // A new tile group begins at this tile. Write the obu header and + // tile group header + const OBU_TYPE obu_type = + (num_tg_hdrs == 1) ? OBU_FRAME : OBU_TILE_GROUP; + curr_tg_data_size = av1_write_obu_header(level_params, obu_type, + obu_extension_header, data); + obu_header_size = curr_tg_data_size; + + if (num_tg_hdrs == 1) { + curr_tg_data_size += write_frame_header_obu( + cpi, saved_wb, data + curr_tg_data_size, 0); + } + curr_tg_data_size += write_tile_group_header( + data + curr_tg_data_size, tile_idx, + AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1), + n_log2_tiles, cpi->num_tg > 1); + total_size += curr_tg_data_size; + tile_data_start += curr_tg_data_size; + new_tg = 0; + tile_count = 0; + } + tile_count++; + av1_tile_set_col(&tile_info, cm, tile_col); + + if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) { + is_last_tile_in_tg = 1; + new_tg = 1; + } else { + is_last_tile_in_tg = 0; + } + + buf->data = dst + total_size; + + // The last tile of the tile group does not have a header. + if (!is_last_tile_in_tg) total_size += 4; + + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; + mode_bc.allow_update_cdf = 1; + mode_bc.allow_update_cdf = + mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; + const int num_planes = av1_num_planes(cm); + av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes); + + aom_start_encode(&mode_bc, dst + total_size); + write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col); + aom_stop_encode(&mode_bc); + tile_size = mode_bc.pos; + assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); + + curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 
0 : 4)); + buf->size = tile_size; + if (tile_size > max_tile_size) { + *largest_tile_id = tile_cols * tile_row + tile_col; + max_tile_size = tile_size; + } + + if (!is_last_tile_in_tg) { + // size of this tile + mem_put_le32(buf->data, tile_size - AV1_MIN_TILE_SIZE_BYTES); + } else { + // write current tile group size + const uint32_t obu_payload_size = curr_tg_data_size - obu_header_size; + const size_t length_field_size = + obu_memmove(obu_header_size, obu_payload_size, data); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + assert(0); + } + curr_tg_data_size += (int)length_field_size; + total_size += (uint32_t)length_field_size; + tile_data_start += length_field_size; + if (num_tg_hdrs == 1) { + // if this tg is combined with the frame header then update the saved + // frame header base offset according to the length field size + saved_wb->bit_buffer += length_field_size; + } + + if (!first_tg && cm->features.error_resilient_mode) { + // Make room for a duplicate Frame Header OBU. + memmove(data + fh_info->total_length, data, curr_tg_data_size); + + // Insert a copy of the Frame Header OBU. + memcpy(data, fh_info->frame_header, fh_info->total_length); + + // Force context update tile to be the first tile in error + // resilient mode as the duplicate frame headers will have + // context_update_tile_id set to 0 + *largest_tile_id = 0; + + // Rewrite the OBU header to change the OBU type to Redundant Frame + // Header. + av1_write_obu_header(level_params, OBU_REDUNDANT_FRAME_HEADER, + obu_extension_header, + &data[fh_info->obu_header_byte_offset]); + + data += fh_info->total_length; + + curr_tg_data_size += (int)(fh_info->total_length); + total_size += (uint32_t)(fh_info->total_length); + } + first_tg = 0; + } + + total_size += tile_size; + } + } + + if (have_tiles) { + // Fill in context_update_tile_id indicating the tile to use for the + // cdf update. The encoder currently sets it to the largest tile + // (though this is up to the encoder) + aom_wb_overwrite_literal(saved_wb, *largest_tile_id, + tiles->log2_cols + tiles->log2_rows); + // If there is more than one tile group, tile_size_bytes takes the default + // value 4 and does not need to be set. For a single tile group it is set + // in the section below. + if (num_tg_hdrs == 1) { + int tile_size_bytes = 4, unused; + const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst); + const uint32_t tile_data_size = total_size - tile_data_offset; + + total_size = + remux_tiles(tiles, tile_data_start, tile_data_size, max_tile_size, + max_tile_col_size, &tile_size_bytes, &unused); + total_size += tile_data_offset; + assert(tile_size_bytes >= 1 && tile_size_bytes <= 4); + + aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2); + + // Update the OBU length if remux_tiles() reduced the size.
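+      // Decode the old uleb length field, re-encode the new payload size in
+      // place, and close any byte gap left behind with a memmove.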
+ uint64_t payload_size; + size_t length_field_size; + int res = + aom_uleb_decode(dst + obu_header_size, total_size - obu_header_size, + &payload_size, &length_field_size); + assert(res == 0); + (void)res; + + const uint64_t new_payload_size = + total_size - obu_header_size - length_field_size; + if (new_payload_size != payload_size) { + size_t new_length_field_size; + res = aom_uleb_encode(new_payload_size, length_field_size, + dst + obu_header_size, &new_length_field_size); + assert(res == 0); + if (new_length_field_size < length_field_size) { + const size_t src_offset = obu_header_size + length_field_size; + const size_t dst_offset = obu_header_size + new_length_field_size; + memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size); + total_size -= (int)(length_field_size - new_length_field_size); + } + } + } + } + return total_size; +} + +static size_t av1_write_metadata_obu(const aom_metadata_t *metadata, + uint8_t *const dst) { + size_t coded_metadata_size = 0; + const uint64_t metadata_type = (uint64_t)metadata->type; + if (aom_uleb_encode(metadata_type, sizeof(metadata_type), dst, + &coded_metadata_size) != 0) { + return 0; + } + memcpy(dst + coded_metadata_size, metadata->payload, metadata->sz); + // Add trailing bits. + dst[coded_metadata_size + metadata->sz] = 0x80; + return (uint32_t)(coded_metadata_size + metadata->sz + 1); +} + +static size_t av1_write_metadata_array(AV1_COMP *const cpi, uint8_t *dst) { + if (!cpi->source) return 0; + AV1_COMMON *const cm = &cpi->common; + aom_metadata_array_t *arr = cpi->source->metadata; + if (!arr) return 0; + size_t obu_header_size = 0; + size_t obu_payload_size = 0; + size_t total_bytes_written = 0; + size_t length_field_size = 0; + for (size_t i = 0; i < arr->sz; i++) { + aom_metadata_t *current_metadata = arr->metadata_array[i]; + if (current_metadata && current_metadata->payload) { + if ((cm->current_frame.frame_type == KEY_FRAME && + current_metadata->insert_flag == AOM_MIF_KEY_FRAME) || + (cm->current_frame.frame_type != KEY_FRAME && + current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) || + current_metadata->insert_flag == AOM_MIF_ANY_FRAME) { + obu_header_size = + av1_write_obu_header(&cpi->level_params, OBU_METADATA, 0, dst); + obu_payload_size = + av1_write_metadata_obu(current_metadata, dst + obu_header_size); + length_field_size = obu_memmove(obu_header_size, obu_payload_size, dst); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) == + AOM_CODEC_OK) { + const size_t obu_size = obu_header_size + obu_payload_size; + dst += obu_size + length_field_size; + total_bytes_written += obu_size + length_field_size; + } else { + aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + "Error writing metadata OBU size"); + } + } + } + } + return total_bytes_written; +} + +int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, + int *const largest_tile_id) { + uint8_t *data = dst; + uint32_t data_size; + AV1_COMMON *const cm = &cpi->common; + AV1LevelParams *const level_params = &cpi->level_params; + uint32_t obu_header_size = 0; + uint32_t obu_payload_size = 0; + FrameHeaderInfo fh_info = { NULL, 0, 0 }; + const uint8_t obu_extension_header = + cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0; + + // If no non-zero delta_q has been used, reset delta_q_present_flag + if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) { + cm->delta_q_info.delta_q_present_flag = 0; + } + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_reset_write(); +#endif + + 
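+  // frame_header_count feeds the level stats; av1_write_obu_header()
+  // increments it for every frame header OBU written below.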
level_params->frame_header_count = 0; + + // The TD is now written outside the frame encode loop + + // write sequence header obu if KEY_FRAME, preceded by its uleb-encoded size + if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) { + obu_header_size = + av1_write_obu_header(level_params, OBU_SEQUENCE_HEADER, 0, data); + + obu_payload_size = + av1_write_sequence_header_obu(&cm->seq_params, data + obu_header_size); + const size_t length_field_size = + obu_memmove(obu_header_size, obu_payload_size, data); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + data += obu_header_size + obu_payload_size + length_field_size; + } + + // write metadata obus before the frame obu that has the show_frame flag set + if (cm->show_frame) data += av1_write_metadata_array(cpi, data); + + const int write_frame_header = + (cpi->num_tg > 1 || encode_show_existing_frame(cm)); + struct aom_write_bit_buffer saved_wb; + if (write_frame_header) { + // Write Frame Header OBU. + fh_info.frame_header = data; + obu_header_size = av1_write_obu_header(level_params, OBU_FRAME_HEADER, + obu_extension_header, data); + obu_payload_size = + write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1); + + const size_t length_field_size = + obu_memmove(obu_header_size, obu_payload_size, data); + if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + fh_info.obu_header_byte_offset = 0; + fh_info.total_length = + obu_header_size + obu_payload_size + length_field_size; + data += fh_info.total_length; + + // Since length_field_size is determined adaptively after frame header + // encoding, saved_wb must be adjusted accordingly. + saved_wb.bit_buffer += length_field_size; + } + + if (encode_show_existing_frame(cm)) { + data_size = 0; + } else { + // Each tile group obu will be preceded by the uleb-encoded size of the + // tile group obu + data_size = write_tiles_in_tg_obus( + cpi, data, &saved_wb, obu_extension_header, &fh_info, largest_tile_id); + } + data += data_size; + *size = data - dst; + return AOM_CODEC_OK; +} diff --git a/libs/libaom/src/av1/encoder/bitstream.h b/libs/libaom/src/av1/encoder/bitstream.h new file mode 100644 index 000000000..45151e25e --- /dev/null +++ b/libs/libaom/src/av1/encoder/bitstream.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_BITSTREAM_H_ +#define AOM_AV1_ENCODER_BITSTREAM_H_ + +#ifdef __cplusplus extern "C" { +#endif + +#include "av1/encoder/encoder.h" + +struct aom_write_bit_buffer; + +// Writes only the OBU Sequence Header payload, and returns the size of the +// payload written to 'dst'. This function does not write the OBU header, the +// optional extension, or the OBU size to 'dst'. +uint32_t av1_write_sequence_header_obu(const SequenceHeader *seq_params, + uint8_t *const dst); + +// Writes the OBU header byte, and the OBU header extension byte when +// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
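+// When 'obu_extension' is non-zero, its low byte is written as the extension
+// byte: temporal_layer_id in the top 3 bits and spatial_layer_id in the next
+// 2, as packed in av1_pack_bitstream().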
+uint32_t av1_write_obu_header(AV1LevelParams *const level_params, + OBU_TYPE obu_type, int obu_extension, + uint8_t *const dst); + +int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size, + uint8_t *dest); + +int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, + int *const largest_tile_id); + +void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, + TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_BITSTREAM_H_ diff --git a/libs/libaom/src/av1/encoder/block.h b/libs/libaom/src/av1/encoder/block.h new file mode 100644 index 000000000..5a74567a4 --- /dev/null +++ b/libs/libaom/src/av1/encoder/block.h @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_BLOCK_H_ +#define AOM_AV1_ENCODER_BLOCK_H_ + +#include "av1/common/entropymv.h" +#include "av1/common/entropy.h" +#include "av1/common/mvref_common.h" + +#include "av1/encoder/enc_enums.h" +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/partition_cnn_weights.h" +#endif + +#include "av1/encoder/hash.h" + +#ifdef __cplusplus extern "C" { +#endif + +#define MC_FLOW_BSIZE_1D 16 +#define MC_FLOW_NUM_PELS (MC_FLOW_BSIZE_1D * MC_FLOW_BSIZE_1D) +#define MAX_MC_FLOW_BLK_IN_SB (MAX_SB_SIZE / MC_FLOW_BSIZE_1D) +#define MAX_WINNER_MODE_COUNT_INTRA 3 +#define MAX_WINNER_MODE_COUNT_INTER 1 +typedef struct { + MB_MODE_INFO mbmi; + RD_STATS rd_cost; + int64_t rd; + int rate_y; + int rate_uv; + uint8_t color_index_map[64 * 64]; + THR_MODES mode_index; +} WinnerModeStats; + +typedef struct { + unsigned int sse; + int sum; + unsigned int var; +} DIFF; + +enum { + NO_TRELLIS_OPT, // No trellis optimization + FULL_TRELLIS_OPT, // Trellis optimization in all stages + FINAL_PASS_TRELLIS_OPT, // Trellis optimization in only the final encode pass + NO_ESTIMATE_YRD_TRELLIS_OPT // Disable trellis in estimate_yrd_for_sb +} UENUM1BYTE(TRELLIS_OPT_TYPE); + +typedef struct macroblock_plane { + DECLARE_ALIGNED(32, int16_t, src_diff[MAX_SB_SQUARE]); + tran_low_t *qcoeff; + tran_low_t *coeff; + uint16_t *eobs; + uint8_t *txb_entropy_ctx; + struct buf_2d src; + + // Quantizer settings + // These are used/accessed only in the quantization process + // RDO does not / must not depend on any of these values + // All values below share the coefficient scale/shift used in TX + const int16_t *quant_fp_QTX; + const int16_t *round_fp_QTX; + const int16_t *quant_QTX; + const int16_t *quant_shift_QTX; + const int16_t *zbin_QTX; + const int16_t *round_QTX; + const int16_t *dequant_QTX; +} MACROBLOCK_PLANE; + +typedef struct { + int txb_skip_cost[TXB_SKIP_CONTEXTS][2]; + int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3]; + int base_cost[SIG_COEF_CONTEXTS][8]; + int eob_extra_cost[EOB_COEF_CONTEXTS][2]; + int dc_sign_cost[DC_SIGN_CONTEXTS][2]; + int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1]; +} LV_MAP_COEFF_COST; + +typedef struct { + int eob_cost[2][11]; +} LV_MAP_EOB_COST; + +typedef struct { + tran_low_t
tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]; + uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; + // Transform block entropy contexts. + // Bits 0~3: txb_skip_ctx; bits 4~5: dc_sign_ctx. + uint8_t entropy_ctx[MAX_MB_PLANE] + [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)]; +} CB_COEFF_BUFFER; + +typedef struct { + // TODO(angiebird): Reduce the buffer size according to sb_type + CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE]; + uint16_t weight[MODE_CTX_REF_FRAMES][USABLE_REF_MV_STACK_SIZE]; + int_mv global_mvs[REF_FRAMES]; + int16_t mode_context[MODE_CTX_REF_FRAMES]; + uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; +} MB_MODE_INFO_EXT; + +// Structure to store best mode information at frame level. This +// frame level information will be used during bitstream preparation stage. +typedef struct { + CANDIDATE_MV ref_mv_stack[USABLE_REF_MV_STACK_SIZE]; + uint16_t weight[USABLE_REF_MV_STACK_SIZE]; + // TODO(Ravi/Remya): Reduce the buffer size of global_mvs + int_mv global_mvs[REF_FRAMES]; + int cb_offset; + int16_t mode_context; + uint8_t ref_mv_count; +} MB_MODE_INFO_EXT_FRAME; + +typedef struct { + uint8_t best_palette_color_map[MAX_PALETTE_SQUARE]; + int kmeans_data_buf[2 * MAX_PALETTE_SQUARE]; +} PALETTE_BUFFER; + +typedef struct { + TX_SIZE tx_size; + TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN]; + uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + RD_STATS rd_stats; + uint32_t hash_value; +} MB_RD_INFO; + +#define RD_RECORD_BUFFER_LEN 8 +typedef struct { + MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN]; // Circular buffer. + int index_start; + int num; + CRC32C crc_calculator; // Hash function. +} MB_RD_RECORD; + +typedef struct { + int64_t dist; + int64_t sse; + int rate; + uint16_t eob; + TX_TYPE tx_type; + uint16_t entropy_context; + uint8_t txb_entropy_ctx; + uint8_t valid; + uint8_t fast; // This is not being used now. + uint8_t perform_block_coeff_opt; +} TXB_RD_INFO; + +#define TX_SIZE_RD_RECORD_BUFFER_LEN 256 +typedef struct { + uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN]; + TXB_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN]; + int index_start; + int num; +} TXB_RD_RECORD; + +typedef struct tx_size_rd_info_node { + TXB_RD_INFO *rd_info_array; // Points to array of size TX_TYPES. + struct tx_size_rd_info_node *children[4]; +} TXB_RD_INFO_NODE; + +// Simple translation rd state for prune_comp_search_by_single_result +typedef struct { + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t skip; + uint8_t disable_skip; + uint8_t early_skipped; +} SimpleRDState; + +// 4: NEAREST, NEW, NEAR, GLOBAL +#define SINGLE_REF_MODES ((REF_FRAMES - 1) * 4) + +#define MAX_COMP_RD_STATS 64 +typedef struct { + int32_t rate[COMPOUND_TYPES]; + int64_t dist[COMPOUND_TYPES]; + int32_t model_rate[COMPOUND_TYPES]; + int64_t model_dist[COMPOUND_TYPES]; + int comp_rs2[COMPOUND_TYPES]; + int_mv mv[2]; + MV_REFERENCE_FRAME ref_frames[2]; + PREDICTION_MODE mode; + int_interpfilters filter; + int ref_mv_idx; + int is_global[2]; + INTERINTER_COMPOUND_DATA interinter_comp; +} COMP_RD_STATS; + +// Struct for buffers used by av1_compound_type_rd() function. +// For sizes and alignment of these arrays, refer to +// alloc_compound_type_rd_buffers() function. 
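+// residual1 (src - pred1) and diff10 (pred1 - pred0) hold signed pixel
+// differences, hence int16_t rather than uint8_t.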
+typedef struct { + uint8_t *pred0; + uint8_t *pred1; + int16_t *residual1; // src - pred1 + int16_t *diff10; // pred1 - pred0 + uint8_t *tmp_best_mask_buf; // backup of the best segmentation mask +} CompoundTypeRdBuffers; + +enum { + MV_COST_ENTROPY, // Use the entropy rate of the mv as the cost + MV_COST_L1_LOWRES, // Use the l1 norm of the mv as the cost (<480p) + MV_COST_L1_MIDRES, // Use the l1 norm of the mv as the cost (>=480p) + MV_COST_L1_HDRES, // Use the l1 norm of the mv as the cost (>=720p) + MV_COST_NONE // Use 0 as the cost irrespective of the current mv +} UENUM1BYTE(MV_COST_TYPE); + +struct inter_modes_info; +typedef struct macroblock MACROBLOCK; +struct macroblock { + struct macroblock_plane plane[MAX_MB_PLANE]; + + // Determine if one would go with reduced complexity transform block + // search model to select prediction modes, or full complexity model + // to select transform kernel. + int rd_model; + + // prune_comp_search_by_single_result (3:MAX_REF_MV_SEARCH) + SimpleRDState simple_rd_state[SINGLE_REF_MODES][3]; + + // Inter macroblock RD search info. + MB_RD_RECORD mb_rd_record; + + // Inter transform block RD search info. for square TX sizes. + TXB_RD_RECORD txb_rd_record_8X8[(MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)]; + TXB_RD_RECORD txb_rd_record_16X16[(MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)]; + TXB_RD_RECORD txb_rd_record_32X32[(MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)]; + TXB_RD_RECORD txb_rd_record_64X64[(MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)]; + + // Intra transform block RD search info. for square TX sizes. + TXB_RD_RECORD txb_rd_record_intra; + + MACROBLOCKD e_mbd; + MB_MODE_INFO_EXT *mbmi_ext; + MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame; + // Array of mode stats for winner mode processing + WinnerModeStats winner_mode_stats[AOMMAX(MAX_WINNER_MODE_COUNT_INTRA, + MAX_WINNER_MODE_COUNT_INTER)]; + int winner_mode_count; + int skip_block; + int qindex; + + // The equivalent error at the current rdmult of one whole bit (not one + // bitcost unit). + int errorperbit; + // The equivalent SAD error of one (whole) bit at the current quantizer + // for large blocks. + int sadperbit; + int rdmult; + int mb_energy; + int sb_energy_level; + + unsigned int txb_split_count; +#if CONFIG_SPEED_STATS + unsigned int tx_search_count; +#endif // CONFIG_SPEED_STATS + + // These are set to their default values at the beginning, and then adjusted + // further in the encoding process. + BLOCK_SIZE min_partition_size; + BLOCK_SIZE max_partition_size; + + unsigned int max_mv_context[REF_FRAMES]; + unsigned int source_variance; + unsigned int simple_motion_pred_sse; + unsigned int pred_sse[REF_FRAMES]; + int pred_mv_sad[REF_FRAMES]; + int best_pred_mv_sad; + + int nmv_vec_cost[MV_JOINTS]; + int nmv_costs[2][MV_VALS]; + int nmv_costs_hp[2][MV_VALS]; + int *nmvcost[2]; + int *nmvcost_hp[2]; + int **mv_cost_stack; + + int32_t *wsrc_buf; + int32_t *mask_buf; + uint8_t *above_pred_buf; + uint8_t *left_pred_buf; + + PALETTE_BUFFER *palette_buffer; + CompoundTypeRdBuffers comp_rd_buffer; + + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; + + FRAME_CONTEXT *row_ctx; + // This context will be used to update color_map_cdf pointer which would be + // used during pack bitstream. For single thread and tile-multithreading case + // this pointer will be the same as xd->tile_ctx, but for the case of row-mt: + // xd->tile_ctx will point to a temporary context while tile_pb_ctx will point + // to the accurate tile context.
+ FRAME_CONTEXT *tile_pb_ctx; + + struct inter_modes_info *inter_modes_info; + + // Contains the hash table, hash function, and buffer used for intrabc + IntraBCHashInfo intrabc_hash_info; + + // These define limits to motion vector components to prevent them + // from extending outside the UMV borders + FullMvLimits mv_limits; + + uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + + // Force the coding block to skip transform and quantization. + int force_skip; + int skip_cost[SKIP_CONTEXTS][2]; + + int skip_mode; // 0: off; 1: on + int skip_mode_cost[SKIP_CONTEXTS][2]; + + LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES]; + LV_MAP_EOB_COST eob_costs[7][2]; + uint16_t cb_offset; + + // mode costs + int intra_inter_cost[INTRA_INTER_CONTEXTS][2]; + + int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES]; + int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2]; + int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2]; + int refmv_mode_cost[REFMV_MODE_CONTEXTS][2]; + int drl_mode_cost0[DRL_MODE_CONTEXTS][2]; + + int comp_inter_cost[COMP_INTER_CONTEXTS][2]; + int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2]; + int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS] + [CDF_SIZE(COMP_REFERENCE_TYPES)]; + int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1] + [CDF_SIZE(2)]; + // Cost for signaling ref_frame[0] (LAST_FRAME, LAST2_FRAME, LAST3_FRAME or + // GOLDEN_FRAME) in bidir-comp mode. + int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2]; + // Cost for signaling ref_frame[1] (ALTREF_FRAME, ALTREF2_FRAME, or + // BWDREF_FRAME) in bidir-comp mode. + int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2]; + int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; + int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES]; + int wedge_idx_cost[BLOCK_SIZES_ALL][16]; + int interintra_cost[BLOCK_SIZE_GROUPS][2]; + int wedge_interintra_cost[BLOCK_SIZES_ALL][2]; + int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; + int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES]; + int motion_mode_cost1[BLOCK_SIZES_ALL][2]; + int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; + int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; + int filter_intra_cost[BLOCK_SIZES_ALL][2]; + int filter_intra_mode_cost[FILTER_INTRA_MODES]; + int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; + int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; + int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; + int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2]; + // The rate associated with each alpha codeword + int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE]; + int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; + int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2]; + int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES]; + int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES] + [TX_TYPES]; + int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; + int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES]; + int wiener_restore_cost[2]; + int sgrproj_restore_cost[2]; + int intrabc_cost[2]; + + // Used to store sub partition's 
choices. + MV pred_mv[REF_FRAMES]; + + // Ref frames that are selected by square partition blocks within a super- + // block, in MI resolution. They can be used to prune ref frames for + // rectangular blocks. + int picked_ref_frames_mask[32 * 32]; + + // use default transform and skip transform type search for intra modes + int use_default_intra_tx_type; + // use default transform and skip transform type search for inter modes + int use_default_inter_tx_type; + int comp_idx_cost[COMP_INDEX_CONTEXTS][2]; + int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2]; + int must_find_valid_partition; + int recalc_luma_mc_data; // Flag to indicate recalculation of MC data during + // interpolation filter search + int prune_mode; + uint32_t tx_domain_dist_threshold; + int use_transform_domain_distortion; + // The likelihood of an edge existing in the block (using partial Canny edge + // detection). For reference, 556 is the value returned for a solid + // vertical black/white edge. + uint16_t edge_strength; + // The strongest edge strength seen along the x/y axis. + uint16_t edge_strength_x; + uint16_t edge_strength_y; + uint8_t compound_idx; + + // [Saved stat index] + COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS]; + int comp_rd_stats_idx; + + CB_COEFF_BUFFER *cb_coef_buff; + + // Threshold used to decide the applicability of R-D optimization of + // quantized coeffs + uint32_t coeff_opt_dist_threshold; + +#if !CONFIG_REALTIME_ONLY + int quad_tree_idx; + int cnn_output_valid; + float cnn_buffer[CNN_OUT_BUF_SIZE]; + float log_q; +#endif + int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES]; + // 0 - 128x128 + // 1-2 - 128x64 + // 3-4 - 64x128 + // 5-8 - 64x64 + // 9-16 - 64x32 + // 17-24 - 32x64 + // 25-40 - 32x32 + // 41-104 - 16x16 + uint8_t variance_low[105]; + uint8_t content_state_sb; + // Strong color activity detection. Used in REALTIME coding mode to enhance + // the visual quality at the boundary of moving color objects. + uint8_t color_sensitivity[2]; + int nonrd_prune_ref_frame_search; + + // Used to control the tx size search evaluation for mode processing + // (normal/winner mode) + int tx_size_search_method; + // This tx_mode_search_type is used internally by the encoder, and is not + // written to the bitstream. It determines what kind of tx_mode should be + // searched. For example, we might set it to TX_MODE_LARGEST to find a good + // candidate, then use TX_MODE_SELECT on it + TX_MODE tx_mode_search_type; + + // Used to control aggressiveness of skip flag prediction for mode processing + // (normal/winner mode) + unsigned int predict_skip_level; + + // Copy out this SB's TPL block stats. + int valid_cost_b; + int64_t inter_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB]; + int64_t intra_cost_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB]; + int_mv mv_b[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB] + [INTER_REFS_PER_FRAME]; + int cost_stride; + + // The type of mv cost used during motion search + MV_COST_TYPE mv_cost_type; + + uint8_t search_ref_frame[REF_FRAMES]; + +#if CONFIG_AV1_HIGHBITDEPTH + void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride); + void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride, + int eob); +#else + void (*fwd_txfm4x4)(const int16_t *input, int16_t *output, int stride); + void (*inv_txfm_add)(const int16_t *input, uint8_t *dest, int stride, + int eob); +#endif +}; + +// Only consider full SB, MC_FLOW_BSIZE_1D = 16. 
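+// e.g. a full 64x64 superblock holds (64 / 16) * (64 / 16) = 16 such blocks +// and a 128x128 superblock holds (128 / 16) * (128 / 16) = 64, matching the +// constants returned below.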
+static INLINE int tpl_blocks_in_sb(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_64X64: return 16; + case BLOCK_128X128: return 64; + default: assert(0); + } + return -1; +} + +static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { + static const char LUT[BLOCK_SIZES_ALL] = { + 0, // BLOCK_4X4 + 1, // BLOCK_4X8 + 1, // BLOCK_8X4 + 0, // BLOCK_8X8 + 1, // BLOCK_8X16 + 1, // BLOCK_16X8 + 0, // BLOCK_16X16 + 1, // BLOCK_16X32 + 1, // BLOCK_32X16 + 0, // BLOCK_32X32 + 1, // BLOCK_32X64 + 1, // BLOCK_64X32 + 0, // BLOCK_64X64 + 0, // BLOCK_64X128 + 0, // BLOCK_128X64 + 0, // BLOCK_128X128 + 1, // BLOCK_4X16 + 1, // BLOCK_16X4 + 1, // BLOCK_8X32 + 1, // BLOCK_32X8 + 1, // BLOCK_16X64 + 1, // BLOCK_64X16 + }; + + return LUT[bsize]; +} + +static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + return is_rect_tx_allowed_bsize(mbmi->sb_type) && + !xd->lossless[mbmi->segment_id]; +} + +static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) { + TX_SIZE ctx_size = max_txsize_rect_lookup[bsize]; + int depth = 0; + while (tx_size != ctx_size) { + depth++; + ctx_size = sub_tx_size_map[ctx_size]; + assert(depth <= MAX_TX_DEPTH); + } + return depth; +} + +static INLINE void set_blk_skip(MACROBLOCK *x, int plane, int blk_idx, + int skip) { + if (skip) + x->blk_skip[blk_idx] |= 1UL << plane; + else + x->blk_skip[blk_idx] &= ~(1UL << plane); +#ifndef NDEBUG + // Set chroma planes to uninitialized states when luma is set to check if + // it will be set later + if (plane == 0) { + x->blk_skip[blk_idx] |= 1UL << (1 + 4); + x->blk_skip[blk_idx] |= 1UL << (2 + 4); + } + + // Clear the initialization checking bit + x->blk_skip[blk_idx] &= ~(1UL << (plane + 4)); +#endif +} + +static INLINE int is_blk_skip(MACROBLOCK *x, int plane, int blk_idx) { +#ifndef NDEBUG + // Check if this is initialized + assert(!(x->blk_skip[blk_idx] & (1UL << (plane + 4)))); + + // The magic number is 0x77, this is to test if there is garbage data + assert((x->blk_skip[blk_idx] & 0x88) == 0); +#endif + return (x->blk_skip[blk_idx] >> plane) & 1; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_BLOCK_H_ diff --git a/libs/libaom/src/av1/encoder/blockiness.c b/libs/libaom/src/av1/encoder/blockiness.c new file mode 100644 index 000000000..f7cff9e53 --- /dev/null +++ b/libs/libaom/src/av1/encoder/blockiness.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/av1_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/common.h" +#include "av1/common/filter.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" + +static int horizontal_filter(const uint8_t *s) { + return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6; +} + +static int vertical_filter(const uint8_t *s, int p) { + return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6; +} + +static int variance(int sum, int sum_squared, int size) { + return sum_squared / size - (sum / size) * (sum / size); +} +// Calculate a blockiness level for a vertical block edge. +// This function returns a new blockiness metric that's defined as + +// p0 p1 p2 p3 +// q0 q1 q2 q3 +// block edge -> +// r0 r1 r2 r3 +// s0 s1 s2 s3 + +// blockiness = p0*-2+q0*6+r0*-6+s0*2 + +// p1*-2+q1*6+r1*-6+s1*2 + +// p2*-2+q2*6+r2*-6+s2*2 + +// p3*-2+q3*6+r3*-6+s3*2 ; + +// reconstructed_blockiness = abs(blockiness from reconstructed buffer - +// blockiness from source buffer,0) +// +// I make the assumption that flat blocks are much more visible than high +// contrast blocks. As such, I scale the result of the blockiness calc +// by dividing the blockiness by the variance of the pixels on either side +// of the edge as follows: +// var_0 = (q0^2+q1^2+q2^2+q3^2) - ((q0 + q1 + q2 + q3) / 4 )^2 +// var_1 = (r0^2+r1^2+r2^2+r3^2) - ((r0 + r1 + r2 + r3) / 4 )^2 +// The returned blockiness is the scaled value +// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ; +static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r, + int rp, int size) { + int s_blockiness = 0; + int r_blockiness = 0; + int sum_0 = 0; + int sum_sq_0 = 0; + int sum_1 = 0; + int sum_sq_1 = 0; + int i; + int var_0; + int var_1; + for (i = 0; i < size; ++i, s += sp, r += rp) { + s_blockiness += horizontal_filter(s); + r_blockiness += horizontal_filter(r); + sum_0 += s[0]; + sum_sq_0 += s[0] * s[0]; + sum_1 += s[-1]; + sum_sq_1 += s[-1] * s[-1]; + } + var_0 = variance(sum_0, sum_sq_0, size); + var_1 = variance(sum_1, sum_sq_1, size); + r_blockiness = abs(r_blockiness); + s_blockiness = abs(s_blockiness); + + if (r_blockiness > s_blockiness) + return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); + else + return 0; +} + +// Calculate a blockiness level for a horizontal block edge +// same as above. +static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r, + int rp, int size) { + int s_blockiness = 0; + int r_blockiness = 0; + int sum_0 = 0; + int sum_sq_0 = 0; + int sum_1 = 0; + int sum_sq_1 = 0; + int i; + int var_0; + int var_1; + for (i = 0; i < size; ++i, ++s, ++r) { + s_blockiness += vertical_filter(s, sp); + r_blockiness += vertical_filter(r, rp); + sum_0 += s[0]; + sum_sq_0 += s[0] * s[0]; + sum_1 += s[-sp]; + sum_sq_1 += s[-sp] * s[-sp]; + } + var_0 = variance(sum_0, sum_sq_0, size); + var_1 = variance(sum_1, sum_sq_1, size); + r_blockiness = abs(r_blockiness); + s_blockiness = abs(s_blockiness); + + if (r_blockiness > s_blockiness) + return (r_blockiness - s_blockiness) / (1 + var_0 + var_1); + else + return 0; +} + +// This function returns the blockiness for the entire frame currently by +// looking at all borders in steps of 4. 
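+// As a hypothetical worked example (values invented for illustration): if the +// source rows are flat at 100 across a 4-sample vertical edge, each row's +// filter output is 0 and both source variance terms are 0; if the +// reconstruction steps from 90 to 110 at that edge, each row contributes +// (110 - 90) * 2 + (90 - 110) * 6 = -80, so |r_blockiness| = 320 over the 4 +// rows and the edge scores (320 - 0) / (1 + 0 + 0) = 320 before the final +// per-frame normalization below.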
+double av1_get_blockiness(const unsigned char *img1, int img1_pitch, + const unsigned char *img2, int img2_pitch, int width, + int height) { + double blockiness = 0; + int i, j; + aom_clear_system_state(); + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4) { + if (i > 0 && i < height && j > 0 && j < width) { + blockiness += + blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4); + blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j, + img2_pitch, 4); + } + } + } + blockiness /= width * height / 16; + return blockiness; +} diff --git a/libs/libaom/src/av1/encoder/cnn.c b/libs/libaom/src/av1/encoder/cnn.c new file mode 100644 index 000000000..5d8a236a0 --- /dev/null +++ b/libs/libaom/src/av1/encoder/cnn.c @@ -0,0 +1,1144 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <math.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "av1/encoder/cnn.h" +#include "av1/common/av1_common_int.h" + +#define CLAMPINDEX(a, hi) ((a) < 0 ? 0 : ((a) >= (hi) ? ((hi)-1) : (a))) + +typedef struct { + const float **input; + int in_width; + int in_height; + int in_stride; + const CNN_LAYER_CONFIG *layer_config; + float **output; + int out_stride; + int start_idx; + int th_step; +} CONVOLVE_OPS; + +typedef float (*activation_fn)(float); + +static float softsign(float x) { return x / (float)(fabsf(x) + 1.0); } + +static float relu(float x) { return (x < 0) ?
0 : x; } + +static float identity(float x) { return x; } + +typedef struct { + int allocsize; + int channels; + int width, height, stride; + float *buf[CNN_MAX_CHANNELS]; +} TENSOR; + +static void init_tensor(TENSOR *tensor) { memset(tensor, 0, sizeof(*tensor)); } + +static void free_tensor(TENSOR *tensor) { + if (tensor->allocsize) { + aom_free(tensor->buf[0]); + tensor->buf[0] = NULL; + tensor->allocsize = 0; + } +} + +static void realloc_tensor(TENSOR *tensor, int channels, int width, + int height) { + const int newallocsize = channels * width * height; + if (tensor->allocsize < newallocsize) { + free_tensor(tensor); + tensor->buf[0] = + (float *)aom_malloc(sizeof(*tensor->buf[0]) * newallocsize); + tensor->allocsize = newallocsize; + } + tensor->width = width; + tensor->height = height; + tensor->stride = width; + tensor->channels = channels; + for (int c = 1; c < channels; ++c) + tensor->buf[c] = &tensor->buf[0][c * width * height]; +} + +static void copy_tensor(const TENSOR *src, int copy_channels, int dst_offset, + TENSOR *dst) { + assert(src->width == dst->width); + assert(src->height == dst->height); + assert(copy_channels <= src->channels); + if (src->stride == dst->width && dst->stride == dst->width) { + for (int c = 0; c < copy_channels; ++c) { + memcpy(dst->buf[dst_offset + c], src->buf[c], + sizeof(*dst->buf[0]) * src->width * src->height); + } + } else { + for (int c = 0; c < copy_channels; ++c) { + for (int r = 0; r < dst->height; ++r) { + memcpy(&dst->buf[dst_offset + c][r * dst->stride], + &src->buf[c][r * src->stride], + dst->width * sizeof(*dst->buf[c])); + } + } + } +} + +static void assign_tensor(TENSOR *tensor, float *buf[CNN_MAX_CHANNELS], + int channels, int width, int height, int stride) { + tensor->allocsize = 0; + tensor->channels = channels; + tensor->width = width; + tensor->height = height; + tensor->stride = stride; + if (buf) { + for (int c = 0; c < channels; ++c) tensor->buf[c] = buf[c]; + } else { + for (int c = 0; c < channels; ++c) tensor->buf[c] = NULL; + } +} + +static void swap_tensor(TENSOR *t1, TENSOR *t2) { + TENSOR t = *t1; + *t1 = *t2; + *t2 = t; +} + +// The concatenated tensor goes into dst with first the channels in +// original dst followed by the channels in the src +static void concat_tensor(const TENSOR *src, TENSOR *dst) { + assert(src->width == dst->width); + assert(src->height == dst->height); + + const int dst_channels = dst->channels; + const int channels = dst->channels + src->channels; + const int newallocsize = channels * dst->width * dst->height; + if (dst->allocsize < newallocsize) { + TENSOR t; + init_tensor(&t); + // allocate new buffers and copy first the dst channels + realloc_tensor(&t, channels, dst->width, dst->height); + copy_tensor(dst, dst->channels, 0, &t); + // Swap the tensors and free the old buffers + swap_tensor(dst, &t); + free_tensor(&t); + } + for (int c = 1; c < channels; ++c) + dst->buf[c] = &dst->buf[0][c * dst->width * dst->height]; + // Copy the channels in src after the first dst_channels channels. 
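+ // For example (hypothetical sizes): concatenating a 2-channel src into a + // 3-channel dst yields 5 channels, laid out as dst[0..2] followed by + // src[0] and src[1] at channel indices 3 and 4.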
+ copy_tensor(src, src->channels, dst_channels, dst); +} + +int check_tensor_equal_dims(TENSOR *t1, TENSOR *t2) { + return (t1->width == t2->width && t1->height == t2->height); +} + +int check_tensor_equal_size(TENSOR *t1, TENSOR *t2) { + return (t1->channels == t2->channels && t1->width == t2->width && + t1->height == t2->height); +} + +static void find_layer_output_size(int in_width, int in_height, + const CNN_LAYER_CONFIG *layer_config, + int *out_width, int *out_height) { + if (!layer_config->deconvolve) { + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + case PADDING_SAME_REPLICATE: + *out_width = (in_width + layer_config->skip_width - 1) / + layer_config->skip_width; + *out_height = (in_height + layer_config->skip_height - 1) / + layer_config->skip_height; + break; + case PADDING_VALID: + *out_width = + (in_width - layer_config->filter_width + layer_config->skip_width) / + layer_config->skip_width; + *out_height = (in_height - layer_config->filter_height + + layer_config->skip_height) / + layer_config->skip_height; + break; + default: assert(0 && "Unknown padding type"); + } + } else { + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + case PADDING_SAME_REPLICATE: + *out_width = in_width * layer_config->skip_width; + *out_height = in_height * layer_config->skip_height; + break; + case PADDING_VALID: + *out_width = (in_width - 1) * layer_config->skip_width + + layer_config->filter_width; + *out_height = (in_height - 1) * layer_config->skip_height + + layer_config->filter_height; + break; + default: assert(0 && "Unknown padding type"); + } + } +} + +void find_cnn_out_channels(const CNN_LAYER_CONFIG *layer_config, + int channels_per_branch[]) { + int branch = layer_config->branch; + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + if (layer_config->branch_copy_type == BRANCH_INPUT) { + channels_per_branch[b] = layer_config->in_channels; + } else if (layer_config->branch_copy_type == BRANCH_OUTPUT) { + channels_per_branch[b] = layer_config->out_channels; + } else if (layer_config->branch_copy_type == BRANCH_COMBINED) { + channels_per_branch[b] = layer_config->out_channels; + for (int c = 0; c < CNN_MAX_BRANCHES; ++c) { + if ((branch_config->branches_to_combine & (1 << c)) && c != branch) { + assert(channels_per_branch[c] > 0); + channels_per_branch[b] += channels_per_branch[c]; + } + } + } + } + } + channels_per_branch[branch] = layer_config->out_channels; + for (int c = 0; c < CNN_MAX_BRANCHES; ++c) { + if ((branch_config->branches_to_combine & (1 << c)) && c != branch) { + assert(channels_per_branch[c] > 0); + channels_per_branch[branch] += channels_per_branch[c]; + } + } +} + +#if CONFIG_DEBUG +static INLINE int cnn_has_at_least_one_output(const CNN_CONFIG *cnn_config) { + const int num_layers = cnn_config->num_layers; + const CNN_LAYER_CONFIG *layer_configs = cnn_config->layer_config; + + for (int idx = 0; idx < num_layers; idx++) { + if (layer_configs[idx].output_num != -1) { + return 1; + } + } + return 0; +} +#endif + +void av1_find_cnn_output_size(int in_width, int in_height, + const CNN_CONFIG *cnn_config, int *out_width, + int *out_height, int *out_channels) { + int channels_per_branch[CNN_MAX_BRANCHES] = { 0 }; + int i_width[CNN_MAX_BRANCHES] = { 0 }; + int i_height[CNN_MAX_BRANCHES] = { 0 }; + i_width[0] = in_width + cnn_config->ext_width * 2; + i_height[0] = in_height + cnn_config->ext_height * 2; + +#if CONFIG_DEBUG + 
assert(cnn_has_at_least_one_output(cnn_config)); +#endif + + for (int i = 0; i < cnn_config->num_layers; ++i) { + const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[i]; + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + const int branch = layer_config->branch; + int o_width = 0, o_height = 0; + + if (layer_config->branch_copy_type == BRANCH_INPUT) { + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + assert(i_width[branch] > 0 && i_height[branch] > 0); + i_width[b] = i_width[branch]; + i_height[b] = i_height[branch]; + } + } + } + + find_layer_output_size(i_width[branch], i_height[branch], layer_config, + &o_width, &o_height); + i_width[branch] = o_width; + i_height[branch] = o_height; + + if (layer_config->branch_copy_type == BRANCH_OUTPUT) { + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + i_width[b] = o_width; + i_height[b] = o_height; + } + } + } + + find_cnn_out_channels(layer_config, channels_per_branch); + + const int output_num = layer_config->output_num; + if (output_num != -1) { // Current layer is an output layer + out_width[output_num] = o_width; + out_height[output_num] = o_height; + out_channels[output_num] = channels_per_branch[layer_config->branch]; + } + } +} + +activation_fn get_activation(ACTIVATION layer_activation) { + switch (layer_activation) { + case NONE: return identity; + case RELU: return relu; + case SOFTSIGN: return softsign; + case SIGMOID: + assert(0 && "Sigmoid has not been supported in CNN."); // TO DO + return NULL; + default: assert(0 && "Unknown activation type"); return NULL; + } +} + +static INLINE int get_start_shift_convolve(int width, int filt_width, + int stride) { + const int mod = (width % stride); + const int filt_off = (filt_width - 1) / 2; + const int dif = (mod ? mod - 1 : stride - 1); + return AOMMIN((dif + (filt_width % 2)) / 2, filt_off); +} + +void av1_cnn_add_c(float **output, int channels, int width, int height, + int stride, const float **add) { + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] += add[c][i * stride + j]; + } +} + +void av1_cnn_activate_c(float **output, int channels, int width, int height, + int stride, ACTIVATION layer_activation) { + activation_fn activation = get_activation(layer_activation); + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] = activation(output[c][i * stride + j]); + } +} + +static void copy_active_tensor_to_branches(const TENSOR *layer_active_tensor, + const CNN_LAYER_CONFIG *layer_config, + int branch, TENSOR branch_output[]) { + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->input_to_branches & (1 << b)) && b != branch) { + // Copy layer's active tensor to output tensor of branch b if set in + // mask. The output becomes the input of the first layer of the branch + // because the layer of the branch is not the first layer. + int copy_channels = branch_config->channels_to_copy > 0 + ? 
branch_config->channels_to_copy + : layer_active_tensor->channels; + realloc_tensor(&branch_output[b], copy_channels, + layer_active_tensor->width, layer_active_tensor->height); + copy_tensor(layer_active_tensor, copy_channels, 0, &branch_output[b]); + } + } +} + +static int convolve_layer(void *arg1, void *arg2) { + const CONVOLVE_OPS *convolve_ops = arg1; + (void)arg2; + av1_cnn_convolve( + convolve_ops->input, convolve_ops->in_width, convolve_ops->in_height, + convolve_ops->in_stride, convolve_ops->layer_config, convolve_ops->output, + convolve_ops->out_stride, convolve_ops->start_idx, convolve_ops->th_step); + return 1; +} + +static void convolve_layer_mt(const float **input, int in_width, int in_height, + int in_stride, + const CNN_LAYER_CONFIG *layer_config, + const CNN_THREAD_DATA *thread_data, + float **output, int out_stride) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + const int num_workers = thread_data->num_workers; + + CONVOLVE_OPS convolve_ops[CNN_MAX_THREADS]; + for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) { + AVxWorker *const worker = &thread_data->workers[th]; + winterface->reset(worker); + + CONVOLVE_OPS convolve_op = { input, in_width, in_height, + in_stride, layer_config, output, + out_stride, th, num_workers }; + convolve_ops[th] = convolve_op; + worker->hook = convolve_layer; + worker->data1 = &(convolve_ops[th]); + worker->data2 = NULL; + + // Start convolving. + if (th == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait until all workers have finished. + for (int th = 0; th < AOMMIN(num_workers, CNN_MAX_THREADS); ++th) { + winterface->sync(&thread_data->workers[th]); + } +} + +void av1_cnn_convolve_c(const float **input, int in_width, int in_height, + int in_stride, const CNN_LAYER_CONFIG *layer_config, + float **output, int out_stride, int start_idx, + int step) { + assert(!layer_config->deconvolve); + const int cstep = layer_config->in_channels * layer_config->out_channels; + const int filter_height_half = layer_config->filter_height >> 1; + const int filter_width_half = layer_config->filter_width >> 1; + const int channel_step = AOMMAX(step, 1); + + if (layer_config->maxpool && + (layer_config->skip_height > 1 || layer_config->skip_width > 1)) { + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = 0, u = 0; h < in_height; + h += layer_config->skip_height, ++u) { + for (int w = 0, v = 0; w < in_width; + w += layer_config->skip_width, ++v) { + for (int hh = h; + hh < AOMMIN(in_height, h + layer_config->skip_height); + ++hh) { + for (int ww = w; + ww < AOMMIN(in_width, w + layer_config->skip_width); + ++ww) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int ii = hh + l - filter_height_half; + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int jj = ww + m - filter_width_half; + if (ii < 0 || ii >= in_height || jj < 0 || + jj >= in_width) + continue; + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + const float a = sum; + if (h == hh && w == ww) + output[i][u * out_stride + v] = a; + else + output[i][u * out_stride + v] = + AOMMAX(output[i][u * out_stride + v], a); + } + } + } + } + } + break; + case PADDING_SAME_REPLICATE: + for (int i = 0; i < 
layer_config->out_channels; ++i) { + for (int h = 0, u = 0; h < in_height; + h += layer_config->skip_height, ++u) { + for (int w = 0, v = 0; w < in_width; + w += layer_config->skip_width, ++v) { + for (int hh = h; + hh < AOMMIN(in_height, h + layer_config->skip_height); + ++hh) { + for (int ww = w; + ww < AOMMIN(in_width, w + layer_config->skip_width); + ++ww) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int ii = + CLAMPINDEX(hh + l - filter_height_half, in_height); + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int jj = + CLAMPINDEX(ww + m - filter_width_half, in_width); + assert(ii >= 0 && ii < in_height && jj >= 0 && + jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + const float a = sum; + if (h == hh && w == ww) + output[i][u * out_stride + v] = a; + else + output[i][u * out_stride + v] = + AOMMAX(output[i][u * out_stride + v], a); + } + } + } + } + } + break; + case PADDING_VALID: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = 0, u = 0; + h < in_height - layer_config->filter_height + 1; + h += layer_config->skip_height, ++u) { + for (int w = 0, v = 0; + w < in_width - layer_config->filter_width + 1; + w += layer_config->skip_width, ++v) { + for (int hh = h; + hh < AOMMIN(in_height, h + layer_config->skip_height); + ++hh) { + for (int ww = w; + ww < AOMMIN(in_width, w + layer_config->skip_width); + ++ww) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int ii = hh + l; + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int jj = ww + m; + assert(ii >= 0 && ii < in_height && jj >= 0 && + jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + const float a = sum; + if (h == hh && w == ww) + output[i][u * out_stride + v] = a; + else + output[i][u * out_stride + v] = + AOMMAX(output[i][u * out_stride + v], a); + } + } + } + } + } + break; + default: assert(0 && "Unknown padding type"); + } + } else { + // Results in element-wise matrix multiplication. 
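+ // i.e. with a 1x1 filter each output pixel reduces to + // out[i] = bias[i] + sum over k of weights[k * out_channels + i] * in[k], + // the (in_channels x out_channels) weight matrix applied independently at + // every spatial position, which the fast path below exploits.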
+ if (layer_config->filter_height == 1 && layer_config->filter_width == 1) { + const int start_h = get_start_shift_convolve( + in_height, layer_config->filter_height, layer_config->skip_height); + const int start_w = + get_start_shift_convolve(in_width, layer_config->filter_width, + layer_config->skip_width) + + start_idx * layer_config->skip_width; + const int out_w_step = AOMMAX(step, 1); + const int in_w_step = layer_config->skip_width * out_w_step; + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int h = start_h, u = 0; h < in_height; + h += layer_config->skip_height, ++u) { + const int in_h = h * in_stride; + const int out_h = u * out_stride + start_idx; + for (int w = start_w, out_index = out_h; w < in_width; + w += in_w_step, out_index += out_w_step) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + sum += layer_config->weights[k * layer_config->out_channels + i] * + input[k][in_h + w]; + } + output[i][out_index] = sum; + } + } + } + return; + } + const int ii_shift = + filter_height_half - (layer_config->filter_height - 1) % 2; + const int jj_shift = + filter_width_half - (layer_config->filter_width - 1) % 2; + switch (layer_config->pad) { + case PADDING_SAME_ZERO: { + const int start_h = get_start_shift_convolve( + in_height, layer_config->filter_height, layer_config->skip_height); + const int start_w = get_start_shift_convolve( + in_width, layer_config->filter_width, layer_config->skip_width); + const int end_ii_shift = filter_height_half + 1; + const int end_jj_shift = filter_width_half + 1; + // *_filter_margin stores the number of pixels along a dimension in the + // intersection of the complement of the image in the extended image + // and the filter. + const int top_filter_margin = layer_config->filter_width * ii_shift; + const int right_filter_margin = end_jj_shift - in_width; + for (int i = start_idx; i < layer_config->out_channels; + i += channel_step) { + for (int h = start_h, u = 0; h < in_height; + h += layer_config->skip_height, ++u) { + const int out_h = u * out_stride; + const int top_cstep = + AOMMAX(0, top_filter_margin - h * layer_config->filter_width) * + cstep + + i; + const int start_ii = AOMMAX(0, h - ii_shift); + const int end_ii = AOMMIN(in_height, h + end_ii_shift); + for (int w = start_w, out_index = out_h; w < in_width; + w += layer_config->skip_width, ++out_index) { + const int left_cstep = AOMMAX(0, jj_shift - w) * cstep; + const int right_cstep = + AOMMAX(0, right_filter_margin + w) * cstep; + const int start_jj = AOMMAX(0, w - jj_shift); + const int end_jj = AOMMIN(in_width, w + end_jj_shift); + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + top_cstep; + for (int ii = start_ii; ii < end_ii; ++ii) { + off += left_cstep; + for (int jj = start_jj; jj < end_jj; ++jj, off += cstep) { + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + off += right_cstep; + } + } + output[i][out_index] = sum; + } + } + } + break; + } + case PADDING_SAME_REPLICATE: { + // h and w are shifted to an offset coordinate system to reduce in-loop + // computation. 
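+ // (Shifting start_h/start_w by ii_shift/jj_shift up front lets the tap + // loops below run directly over [h, h + filter_height) and + // [w, w + filter_width), with CLAMPINDEX supplying the edge replication, + // rather than re-deriving the filter origin for every output pixel.)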
+ const int start_h = + get_start_shift_convolve(in_height, layer_config->filter_height, + layer_config->skip_height) - + ii_shift; + const int start_w = + get_start_shift_convolve(in_width, layer_config->filter_width, + layer_config->skip_width) - + jj_shift; + const int end_h = in_height - ii_shift; + const int end_w = in_width - jj_shift; + for (int i = start_idx; i < layer_config->out_channels; + i += channel_step) { + for (int h = start_h, u = 0; h < end_h; + h += layer_config->skip_height, ++u) { + const int out_h = u * out_stride; + const int upper_ii_index = layer_config->filter_height + h; + for (int w = start_w, out_index = out_h; w < end_w; + w += layer_config->skip_width, ++out_index) { + const int upper_jj_index = layer_config->filter_width + w; + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int ii = h; ii < upper_ii_index; ++ii) { + const int clamped_ii = CLAMPINDEX(ii, in_height); + for (int jj = w; jj < upper_jj_index; ++jj) { + const int clamped_jj = CLAMPINDEX(jj, in_width); + assert(clamped_ii >= 0 && clamped_ii < in_height && + clamped_jj >= 0 && clamped_jj < in_width); + sum += layer_config->weights[off] * + input[k][clamped_ii * in_stride + clamped_jj]; + off += cstep; + } + } + } + output[i][out_index] = sum; + } + } + } + break; + } + case PADDING_VALID: { + for (int i = start_idx; i < layer_config->out_channels; + i += channel_step) { + for (int h = 0, u = 0; + h < in_height - layer_config->filter_height + 1; + h += layer_config->skip_height, ++u) { + const int out_h = u * out_stride; + const int upper_ii_index = layer_config->filter_height + h; + for (int w = 0, out_index = out_h; + w < in_width - layer_config->filter_width + 1; + w += layer_config->skip_width, ++out_index) { + const int upper_jj_index = layer_config->filter_width + w; + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int ii = h; ii < upper_ii_index; ++ii) { + for (int jj = w; jj < upper_jj_index; ++jj) { + assert(ii >= 0 && ii < in_height && jj >= 0 && + jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + off += cstep; + } + } + } + output[i][out_index] = sum; + } + } + } + break; + } + default: assert(0 && "Unknown padding type"); + } + } +} + +static INLINE int get_start_shift_deconvolve(int filt_width, int stride) { + const int dif = AOMMAX(filt_width - stride, 0); + return dif / 2; +} + +void av1_cnn_batchnorm_c(float **image, int channels, int width, int height, + int stride, const float *gamma, const float *beta, + const float *mean, const float *std) { + assert(gamma && beta && mean && std && "batchnorm has null parameter!"); + for (int ch = 0; ch < channels; ch++) { + const float ch_gamma = gamma[ch]; + const float ch_beta = beta[ch]; + const float ch_mean = mean[ch]; + const float ch_std = std[ch]; + float *image_row = image[ch]; + + for (int row = 0; row < height; row++) { + for (int col = 0; col < width; col++) { + image_row[col] = + ch_gamma * (image_row[col] - ch_mean) / ch_std + ch_beta; + } + image_row += stride; + } + } +} + +void av1_cnn_deconvolve_c(const float **input, int in_width, int in_height, + int in_stride, const CNN_LAYER_CONFIG *layer_config, + float **output, int out_stride) { + assert(layer_config->deconvolve); + + const int cstep = layer_config->in_channels * layer_config->out_channels; + + int out_width = 0; + int out_height = 0; +
find_layer_output_size(in_width, in_height, layer_config, &out_width, + &out_height); + switch (layer_config->pad) { + case PADDING_SAME_ZERO: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int u = 0; u < out_height; ++u) { + for (int v = 0; v < out_width; ++v) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int h = + u - l + + get_start_shift_deconvolve(layer_config->filter_height, + layer_config->skip_height); + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int w = + v - m + + get_start_shift_deconvolve(layer_config->filter_width, + layer_config->skip_width); + if ((h % layer_config->skip_height) != 0 || + (w % layer_config->skip_width) != 0) + continue; + const int ii = h / layer_config->skip_height; + const int jj = w / layer_config->skip_width; + if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) + continue; + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + output[i][u * out_stride + v] = sum; + } + } + } + break; + case PADDING_SAME_REPLICATE: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int u = 0; u < out_height; ++u) { + for (int v = 0; v < out_width; ++v) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int h = + u - l + + get_start_shift_deconvolve(layer_config->filter_height, + layer_config->skip_height); + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int w = + v - m + + get_start_shift_deconvolve(layer_config->filter_width, + layer_config->skip_width); + if ((h % layer_config->skip_height) != 0 || + (w % layer_config->skip_width) != 0) + continue; + const int ii = + CLAMPINDEX(h / layer_config->skip_height, in_height); + const int jj = + CLAMPINDEX(w / layer_config->skip_width, in_width); + assert(ii >= 0 && ii < in_height && jj >= 0 && jj < in_width); + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + output[i][u * out_stride + v] = sum; + } + } + } + break; + case PADDING_VALID: + for (int i = 0; i < layer_config->out_channels; ++i) { + for (int u = 0; u < out_height; ++u) { + for (int v = 0; v < out_width; ++v) { + float sum = layer_config->bias[i]; + for (int k = 0; k < layer_config->in_channels; ++k) { + int off = k * layer_config->out_channels + i; + for (int l = 0; l < layer_config->filter_height; ++l) { + const int h = u - l; + for (int m = 0; m < layer_config->filter_width; + ++m, off += cstep) { + const int w = v - m; + if ((h % layer_config->skip_height) != 0 || + (w % layer_config->skip_width) != 0) + continue; + const int ii = h / layer_config->skip_height; + const int jj = w / layer_config->skip_width; + if (ii < 0 || ii >= in_height || jj < 0 || jj >= in_width) + continue; + sum += layer_config->weights[off] * + input[k][ii * in_stride + jj]; + } + } + } + output[i][u * out_stride + v] = sum; + } + } + } + break; + default: assert(0 && "Unknown padding type"); + } +} + +void av1_cnn_predict_c(const float **input, int in_width, int in_height, + int in_stride, const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + CNN_MULTI_OUT *output_struct) { + TENSOR tensor1[CNN_MAX_BRANCHES] = { 0 }; + TENSOR tensor2[CNN_MAX_BRANCHES] = { 0 }; + + float
**output[CNN_MAX_BRANCHES]; + const int *out_chs = output_struct->output_channels; + output[0] = output_struct->output_buffer; + for (int out_idx = 1; out_idx < output_struct->num_outputs; out_idx++) { + output[out_idx] = output[out_idx - 1] + out_chs[out_idx - 1]; + } + + int i_width = in_width; + int i_height = in_height; + int o_width = 0, o_height = 0; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + init_tensor(&tensor1[b]); + init_tensor(&tensor2[b]); + } + + const int *out_stride = output_struct->output_strides; + for (int layer = 0; layer < cnn_config->num_layers; ++layer) { + const CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer]; + const int branch = layer_config->branch; + const CNN_BRANCH_CONFIG *branch_config = &layer_config->branch_config; + + // Allocate input tensor + if (layer == 0) { // First layer + assert(branch == 0); // First layer must be primary branch + assign_tensor(&tensor1[branch], (float **)input, + layer_config->in_channels, in_width, in_height, in_stride); + } else { // Non-first layer + // Swap tensor1 and tensor2 + swap_tensor(&tensor1[branch], &tensor2[branch]); + + i_width = tensor1[branch].width; + i_height = tensor1[branch].height; + } + + // Allocate output tensor + find_layer_output_size(i_width, i_height, layer_config, &o_width, + &o_height); + const int output_num = layer_config->output_num; + if (output_num == -1) { // Non-output layer + realloc_tensor(&tensor2[branch], layer_config->out_channels, o_width, + o_height); + } else { // Output layer + free_tensor(&tensor2[branch]); + assign_tensor(&tensor2[branch], output[output_num], + layer_config->out_channels, o_width, o_height, + out_stride[output_num]); + } + + // If we are combining branches make sure that the branch to combine + // is different from the current branch. 
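+ // e.g. if branch == 1 and branch_combine_type is BRANCH_ADD or BRANCH_CAT, + // bit 1 of branches_to_combine must be clear.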
+ assert(IMPLIES(layer_config->branch_combine_type != BRANCH_NOC, + !(branch_config->branches_to_combine & (1 << branch)))); + + if (layer_config->branch_copy_type == BRANCH_INPUT) { + copy_active_tensor_to_branches(&tensor1[branch], layer_config, branch, + tensor2); + } + // Check consistency of input and output channels + assert(tensor1[branch].channels == layer_config->in_channels); + assert(tensor2[branch].channels == layer_config->out_channels); + + // Convolve/Deconvolve + if (!cnn_config->layer_config[layer].deconvolve) { + if (thread_data->num_workers > 1) { + convolve_layer_mt((const float **)tensor1[branch].buf, + tensor1[branch].width, tensor1[branch].height, + tensor1[branch].stride, layer_config, thread_data, + tensor2[branch].buf, tensor2[branch].stride); + } else { + av1_cnn_convolve((const float **)tensor1[branch].buf, + tensor1[branch].width, tensor1[branch].height, + tensor1[branch].stride, layer_config, + tensor2[branch].buf, tensor2[branch].stride, 0, 1); + } + } else { + av1_cnn_deconvolve((const float **)tensor1[branch].buf, + tensor1[branch].width, tensor1[branch].height, + tensor1[branch].stride, layer_config, + tensor2[branch].buf, tensor2[branch].stride); + } + + if (layer_config->branch_copy_type == BRANCH_OUTPUT) { + copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch, + tensor2); + } + + // Add tensors from other branches if needed + if (layer_config->branch_combine_type == BRANCH_ADD) { + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_size(&tensor2[b], &tensor2[branch])); + av1_cnn_add(tensor2[branch].buf, tensor2[branch].channels, + tensor2[branch].width, tensor2[branch].height, + tensor2[branch].stride, (const float **)tensor2[b].buf); + } + } + } + + // Non-linearity + if (layer_config->activation != IDENTITY) + av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels, + tensor2[branch].width, tensor2[branch].height, + tensor2[branch].stride, layer_config->activation); + + if (layer_config->bn_params.bn_gamma) { + av1_cnn_batchnorm( + tensor2[branch].buf, tensor2[branch].channels, tensor2[branch].width, + tensor2[branch].height, tensor2[branch].stride, + layer_config->bn_params.bn_gamma, layer_config->bn_params.bn_beta, + layer_config->bn_params.bn_mean, layer_config->bn_params.bn_std); + } + + // Concatenate tensors + if (layer_config->branch_combine_type == BRANCH_CAT) { + if (output_num == -1) { // Non-output layer + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); + assert(tensor2[b].channels > 0); + concat_tensor(&tensor2[b], &tensor2[branch]); + } + } + } else { // Output layer + const int existing_channels = tensor2[branch].channels; + int num_chs = existing_channels; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); + // Needed only to assign the new channel buffers + num_chs += tensor2[b].channels; + } + } + assign_tensor(&tensor2[branch], output[output_num], num_chs, o_width, + o_height, out_stride[output_num]); + + num_chs = existing_channels; + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + if ((branch_config->branches_to_combine & (1 << b)) && b != branch) { + assert(check_tensor_equal_dims(&tensor2[b], &tensor2[branch])); + // Needed only to assign the new 
channel buffers + copy_tensor(&tensor2[b], tensor2[b].channels, num_chs, + &tensor2[branch]); + num_chs += tensor2[b].channels; + } + } + } + } + + if (layer_config->branch_copy_type == BRANCH_COMBINED) { + copy_active_tensor_to_branches(&tensor2[branch], layer_config, branch, + tensor2); + } + } + + for (int b = 0; b < CNN_MAX_BRANCHES; ++b) { + free_tensor(&tensor1[b]); + free_tensor(&tensor2[b]); + } +} + +// Assume output already has proper allocation +// Assume input image buffers all have same resolution and strides +void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height, + int stride, const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + CNN_MULTI_OUT *output) { + const float max_val = 255.0; + + const int in_width = width + 2 * cnn_config->ext_width; + const int in_height = height + 2 * cnn_config->ext_height; + const int in_channels = cnn_config->layer_config[0].in_channels; + float *inputs[CNN_MAX_CHANNELS]; + float *input_ = + (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_)); + const int in_stride = in_width; + + for (int c = 0; c < in_channels; ++c) { + inputs[c] = input_ + c * in_stride * in_height; + float *input = + inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width; + + if (cnn_config->strict_bounds) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; + // extend left and right + for (int i = 0; i < height; ++i) { + for (int j = -cnn_config->ext_width; j < 0; ++j) + input[i * in_stride + j] = input[i * in_stride]; + for (int j = width; j < width + cnn_config->ext_width; ++j) + input[i * in_stride + j] = input[i * in_stride + width - 1]; + } + // extend top and bottom + for (int i = -cnn_config->ext_height; i < 0; ++i) + memcpy(&input[i * in_stride - cnn_config->ext_width], + &input[-cnn_config->ext_width], in_width * sizeof(*input)); + for (int i = height; i < height + cnn_config->ext_height; ++i) + memcpy(&input[i * in_stride - cnn_config->ext_width], + &input[(height - 1) * in_stride - cnn_config->ext_width], + in_width * sizeof(*input)); + } else { + for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height; + ++i) + for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width; + ++j) + input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; + } + } + av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride, + cnn_config, thread_data, output); + + aom_free(input_); +} + +// Assume output already has proper allocation +// Assume input image buffers all have same resolution and strides +void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height, + int stride, + const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + int bit_depth, + CNN_MULTI_OUT *output) { + const float max_val = (float)((1 << bit_depth) - 1); + + const int in_width = width + 2 * cnn_config->ext_width; + const int in_height = height + 2 * cnn_config->ext_height; + const int in_channels = cnn_config->layer_config[0].in_channels; + float *inputs[CNN_MAX_CHANNELS]; + float *input_ = + (float *)aom_malloc(in_width * in_height * in_channels * sizeof(*input_)); + const int in_stride = in_width; + + for (int c = 0; c < in_channels; ++c) { + inputs[c] = input_ + c * in_stride * in_height; + float *input = + inputs[c] + cnn_config->ext_height * in_stride + cnn_config->ext_width; + + if (cnn_config->strict_bounds) { + for (int i = 0; i < height; ++i) + for (int j = 
0; j < width; ++j) + input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; + // extend left and right + for (int i = 0; i < height; ++i) { + for (int j = -cnn_config->ext_width; j < 0; ++j) + input[i * in_stride + j] = input[i * in_stride]; + for (int j = width; j < width + cnn_config->ext_width; ++j) + input[i * in_stride + j] = input[i * in_stride + width - 1]; + } + // extend top and bottom + for (int i = -cnn_config->ext_height; i < 0; ++i) + memcpy(&input[i * in_stride - cnn_config->ext_width], + &input[-cnn_config->ext_width], in_width * sizeof(*input)); + for (int i = height; i < height + cnn_config->ext_height; ++i) + memcpy(&input[i * in_stride - cnn_config->ext_width], + &input[(height - 1) * in_stride - cnn_config->ext_width], + in_width * sizeof(*input)); + } else { + for (int i = -cnn_config->ext_height; i < height + cnn_config->ext_height; + ++i) + for (int j = -cnn_config->ext_width; j < width + cnn_config->ext_width; + ++j) + input[i * in_stride + j] = (float)dgd[c][i * stride + j] / max_val; + } + } + + av1_cnn_predict((const float **)inputs, in_width, in_height, in_stride, + cnn_config, thread_data, output); + + aom_free(input_); +} + +// Assume output already has proper allocation +// Assume input image buffers all have same resolution and strides +void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride, + const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, float **output, + int out_stride) { + int out_width = 0, out_height = 0, out_channels = 0; + av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height, + &out_channels); + const int output_chs[1] = { out_channels }; + const int output_strides[1] = { out_stride }; + CNN_MULTI_OUT output_struct = { .output_channels = output_chs, + .output_strides = output_strides, + .output_buffer = output }; + av1_cnn_predict_img_multi_out(dgd, width, height, stride, cnn_config, + thread_data, &output_struct); +} + +// Assume output already has proper allocation +// Assume input image buffers all have same resolution and strides +void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height, + int stride, const CNN_CONFIG *cnn_config, + const CNN_THREAD_DATA *thread_data, + int bit_depth, float **output, int out_stride) { + int out_width = 0, out_height = 0, out_channels = 0; + av1_find_cnn_output_size(width, height, cnn_config, &out_width, &out_height, + &out_channels); + const int output_chs[1] = { out_channels }; + const int output_strides[1] = { out_stride }; + CNN_MULTI_OUT output_struct = { .output_channels = output_chs, + .output_strides = output_strides, + .output_buffer = output }; + av1_cnn_predict_img_multi_out_highbd(dgd, width, height, stride, cnn_config, + thread_data, bit_depth, &output_struct); +} diff --git a/libs/libaom/src/av1/encoder/cnn.h b/libs/libaom/src/av1/encoder/cnn.h new file mode 100644 index 000000000..706be4447 --- /dev/null +++ b/libs/libaom/src/av1/encoder/cnn.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_COMMON_CNN_H_
+#define AOM_AV1_COMMON_CNN_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <math.h>
+
+#include "aom_util/aom_thread.h"
+#include "config/av1_rtcd.h"
+
+struct AV1Common;
+
+#define CNN_MAX_HIDDEN_LAYERS 64
+#define CNN_MAX_LAYERS (CNN_MAX_HIDDEN_LAYERS + 1)
+#define CNN_MAX_CHANNELS 256
+#define CNN_MAX_BRANCHES 4
+#define CNN_MAX_THREADS 32
+
+#define NO_BRANCH_CONFIG \
+  { 0, 0, 0 }
+#define NO_BN_PARAMS \
+  { NULL, NULL, NULL, NULL }
+
+enum {
+  PADDING_SAME_ZERO,       // tensorflow's SAME padding with pixels outside
+                           // the image area assumed to be 0 (default)
+  PADDING_SAME_REPLICATE,  // tensorflow's SAME padding with pixels outside
+                           // the image area replicated from the closest edge
+  PADDING_VALID            // tensorflow's VALID padding
+} UENUM1BYTE(PADDING_TYPE);
+
+// enum { NONE, RELU, SOFTSIGN } UENUM1BYTE(ACTIVATION);
+
+// Times when the input tensor may be copied to the branches given in
+// input_to_branches.
+// BRANCH_NO_COPY: doesn't copy any tensor.
+// BRANCH_INPUT: copies the input tensor to branches.
+// BRANCH_OUTPUT: copies the convolved tensor to branches.
+// BRANCH_COMBINED: copies the combined (after convolving and branch combining)
+//   tensor. If no combinations happen at this layer, then this option
+//   has the same effect as BRANCH_OUTPUT.
+enum {
+  BRANCH_NO_COPY,
+  BRANCH_INPUT,
+  BRANCH_OUTPUT,
+  BRANCH_COMBINED
+} UENUM1BYTE(BRANCH_COPY);
+
+// Types of combining branches with the output of the current layer:
+// BRANCH_NOC: no branch combining
+// BRANCH_ADD: Add the previously stored branch tensor to the output of the
+//   layer
+// BRANCH_CAT: Concatenate the branch tensor to the output of the layer
+enum { BRANCH_NOC, BRANCH_ADD, BRANCH_CAT } UENUM1BYTE(BRANCH_COMBINE);
+
+// The parameters used to scale each channel in batch
+// normalization. The processing is done on a per-channel basis.
+// e.g. bn_mean[c] is the mean for all pixels in channel c. This
+// is always applied after activation. The output is given by
+// out[c,i,j] = norm[c,i,j] * bn_gamma[c] + bn_beta[c] where
+// norm[c,i,j] = (in[c,i,j] - bn_mean[c]) / bn_std[c]
+// here we assume that the effect of variance_epsilon is already
+// taken into account when bn_std is calculated. The pointers
+// need to be either all zero or all valid. If all zero, then
+// batchnorm is disabled, else batchnorm is applied.
+struct CNN_BATCHNORM_PARAMS {
+  const float *bn_gamma;
+  const float *bn_beta;
+  const float *bn_mean;
+  const float *bn_std;
+};
+
+struct CNN_BRANCH_CONFIG {
+  int input_to_branches;  // If nonzero, copy the active tensor to the current
+                          // layer and store for future use in branches
+                          // specified in the field as a binary mask. For
+                          // example, if input_to_branches = 0x06, it means the
+                          // input tensor to the current branch is copied to
+                          // branches 1 and 2 (where 0 represents the primary
+                          // branch). One restriction is that the mask
+                          // cannot indicate copying to the current branch.
+  int channels_to_copy;   // Number of channels to copy to the branches given
+                          // in input_to_branches. If greater than 0, only
+                          // copies the channels up to the given index;
+                          // otherwise all channels of the active tensor are
+                          // copied.
+  int branches_to_combine;  // mask of branches to combine with the output of
+                            // the current layer, if
+                            // branch_combine_type != BRANCH_NOC
+                            // For example, if branches_to_combine = 0x0A,
+                            // it means that branches 1 and 3 are combined
+                            // with the current branch.
+};
+
+struct CNN_LAYER_CONFIG {
+  int in_channels;
+  int filter_width;
+  int filter_height;
+  int out_channels;
+  int skip_width;
+  int skip_height;
+  int maxpool;            // whether to use maxpool or not (only effective when
+                          // skip_width or skip_height is > 1)
+  const float *weights;   // array of length filter_height x filter_width x
+                          // in_channels x out_channels where the innermost
+                          // scan is out_channels and the outermost scan is
+                          // filter_height.
+  const float *bias;      // array of length out_channels
+  PADDING_TYPE pad;       // padding type
+  ACTIVATION activation;  // the activation function to use after convolution
+  int deconvolve;         // whether this is a deconvolution layer.
+                          // 0: If skip_width or skip_height is > 1, then we
+                          // reduce resolution
+                          // 1: If skip_width or skip_height is > 1, then we
+                          // increase resolution
+  int branch;             // branch index in [0, CNN_MAX_BRANCHES - 1], where
+                          // 0 refers to the primary branch.
+  BRANCH_COPY branch_copy_type;
+  BRANCH_COMBINE branch_combine_type;
+  struct CNN_BRANCH_CONFIG branch_config;
+  struct CNN_BATCHNORM_PARAMS
+      bn_params;   // A struct that contains the parameters
+                   // used for batch normalization.
+  int output_num;  // The output buffer idx to which the layer output is
+                   // written. Set to -1 to disable writing it to the output. In
+                   // the case that branch_combine_type is BRANCH_CAT, all
+                   // concatenated channels will be written to output. In the
+                   // case of BRANCH_ADD, the output will be the result of
+                   // summation.
+};
+
+struct CNN_CONFIG {
+  int num_layers;  // number of CNN layers ( = number of hidden layers + 1)
+  int is_residue;  // whether the output activation is a residue
+  int ext_width, ext_height;  // extension horizontally and vertically
+  int strict_bounds;          // whether the input bounds are strict or not.
+                              // If strict, the extension area is filled by
+                              // replication; if not strict, image data is
+                              // assumed available beyond the bounds.
+  CNN_LAYER_CONFIG layer_config[CNN_MAX_LAYERS];
+};
+
+struct CNN_THREAD_DATA {
+  int num_workers;
+  AVxWorker *workers;
+};
+
+struct CNN_MULTI_OUT {
+  int num_outputs;
+  const int *output_channels;
+  const int *output_strides;
+  float **output_buffer;
+};
+
+// Function to return the size of the output
+void av1_find_cnn_output_size(int in_width, int in_height,
+                              const CNN_CONFIG *cnn_config, int *out_width,
+                              int *out_height, int *out_channels);
+
+// Prediction functions from a set of input image buffers. These functions
+// support CNNs with multiple outputs.
+void av1_cnn_predict_img_multi_out(uint8_t **dgd, int width, int height,
+                                   int stride, const CNN_CONFIG *cnn_config,
+                                   const CNN_THREAD_DATA *thread_data,
+                                   struct CNN_MULTI_OUT *output);
+void av1_cnn_predict_img_multi_out_highbd(uint16_t **dgd, int width, int height,
+                                          int stride,
+                                          const CNN_CONFIG *cnn_config,
+                                          const CNN_THREAD_DATA *thread_data,
+                                          int bit_depth, CNN_MULTI_OUT *output);
+
+// Prediction functions from a set of input image buffers. These functions
+// only support a single output.
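+//
+// Editor's note: a minimal, hypothetical usage sketch for the single-output
+// entry point below (not part of the library; buffer names and the
+// single-threaded CNN_THREAD_DATA value are assumptions):
+//
+//   int ow = 0, oh = 0, oc = 0;
+//   av1_find_cnn_output_size(width, height, cnn_config, &ow, &oh, &oc);
+//   float *buf = (float *)aom_malloc(sizeof(*buf) * ow * oh * oc);
+//   float *out_planes[CNN_MAX_CHANNELS];  // one plane pointer per channel
+//   for (int c = 0; c < oc; ++c) out_planes[c] = buf + c * ow * oh;
+//   const CNN_THREAD_DATA td = { 1, NULL };  // no worker pool
+//   av1_cnn_predict_img(dgd, width, height, stride, cnn_config, &td,
+//                       out_planes, /*out_stride=*/ow);
+//   aom_free(buf);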
+void av1_cnn_predict_img(uint8_t **dgd, int width, int height, int stride,
+                         const CNN_CONFIG *cnn_config,
+                         const CNN_THREAD_DATA *thread_data, float **output,
+                         int out_stride);
+void av1_cnn_predict_img_highbd(uint16_t **dgd, int width, int height,
+                                int stride, const CNN_CONFIG *cnn_config,
+                                const CNN_THREAD_DATA *thread_data,
+                                int bit_depth, float **output, int out_stride);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_COMMON_CNN_H_
diff --git a/libs/libaom/src/av1/encoder/compound_type.c b/libs/libaom/src/av1/encoder/compound_type.c
new file mode 100644
index 000000000..42095b79e
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/compound_type.c
@@ -0,0 +1,1508 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/pred_common.h"
+#include "av1/encoder/compound_type.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tx_search.h"
+
+typedef int64_t (*pick_interinter_mask_type)(
+    const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+    const uint8_t *const p0, const uint8_t *const p1,
+    const int16_t *const residual1, const int16_t *const diff10,
+    uint64_t *best_sse);
+
+// Checks if the characteristics of the search match
+static INLINE int is_comp_rd_match(const AV1_COMP *const cpi,
+                                   const MACROBLOCK *const x,
+                                   const COMP_RD_STATS *st,
+                                   const MB_MODE_INFO *const mi,
+                                   int32_t *comp_rate, int64_t *comp_dist,
+                                   int32_t *comp_model_rate,
+                                   int64_t *comp_model_dist, int *comp_rs2) {
+  // TODO(ranjit): Ensure that compound type search always uses the regular
+  // filter and check if the following check can be removed
+  // Check if the interp filter matches with the previous case
+  if (st->filter.as_int != mi->interp_filters.as_int) return 0;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  // Match MV and reference indices
+  for (int i = 0; i < 2; ++i) {
+    if ((st->ref_frames[i] != mi->ref_frame[i]) ||
+        (st->mv[i].as_int != mi->mv[i].as_int)) {
+      return 0;
+    }
+    const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[i]];
+    if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0;
+  }
+
+  // Store the stats for COMPOUND_AVERAGE and COMPOUND_DISTWTD
+  for (int comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD;
+       comp_type++) {
+    comp_rate[comp_type] = st->rate[comp_type];
+    comp_dist[comp_type] = st->dist[comp_type];
+    comp_model_rate[comp_type] = st->model_rate[comp_type];
+    comp_model_dist[comp_type] = st->model_dist[comp_type];
+    comp_rs2[comp_type] = st->comp_rs2[comp_type];
+  }
+
+  // For compound wedge/segment, reuse data only if NEWMV is not present in
+  // either of the directions
+  if ((!have_newmv_in_inter_mode(mi->mode) &&
+       !have_newmv_in_inter_mode(st->mode)) ||
+      (cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)) {
+    memcpy(&comp_rate[COMPOUND_WEDGE], &st->rate[COMPOUND_WEDGE],
+           sizeof(comp_rate[COMPOUND_WEDGE]) * 2);
+    memcpy(&comp_dist[COMPOUND_WEDGE], &st->dist[COMPOUND_WEDGE],
+           sizeof(comp_dist[COMPOUND_WEDGE]) * 2);
+    memcpy(&comp_model_rate[COMPOUND_WEDGE], &st->model_rate[COMPOUND_WEDGE],
+           sizeof(comp_model_rate[COMPOUND_WEDGE]) * 2);
+    memcpy(&comp_model_dist[COMPOUND_WEDGE], &st->model_dist[COMPOUND_WEDGE],
+           sizeof(comp_model_dist[COMPOUND_WEDGE]) * 2);
+    memcpy(&comp_rs2[COMPOUND_WEDGE], &st->comp_rs2[COMPOUND_WEDGE],
+           sizeof(comp_rs2[COMPOUND_WEDGE]) * 2);
+  }
+  return 1;
+}
+
+// Checks if a similar compound type search case was accounted for earlier
+// If found, returns the relevant rd data
+static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi,
+                                        const MACROBLOCK *x,
+                                        const MB_MODE_INFO *const mbmi,
+                                        int32_t *comp_rate, int64_t *comp_dist,
+                                        int32_t *comp_model_rate,
+                                        int64_t *comp_model_dist, int *comp_rs2,
+                                        int *match_index) {
+  for (int j = 0; j < x->comp_rd_stats_idx; ++j) {
+    if (is_comp_rd_match(cpi, x, &x->comp_rd_stats[j], mbmi, comp_rate,
+                         comp_dist, comp_model_rate, comp_model_dist,
+                         comp_rs2)) {
+      *match_index = j;
+      return 1;
+    }
+  }
+  return 0;  // no match result found
+}
+
+static INLINE bool enable_wedge_search(MACROBLOCK *const x,
+                                       const AV1_COMP *const cpi) {
+  // Enable wedge search if source variance and edge strength are above
+  // the thresholds.
+  return x->source_variance >
+             cpi->sf.inter_sf.disable_wedge_search_var_thresh &&
+         x->edge_strength > cpi->sf.inter_sf.disable_wedge_search_edge_thresh;
+}
+
+static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x,
+                                                  const AV1_COMP *const cpi) {
+  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interinter_wedge &&
+         !cpi->sf.inter_sf.disable_interinter_wedge;
+}
+
+static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x,
+                                                  const AV1_COMP *const cpi) {
+  return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interintra_wedge &&
+         !cpi->sf.inter_sf.disable_wedge_interintra_search;
+}
+
+static int8_t estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                  const BLOCK_SIZE bsize, const uint8_t *pred0,
+                                  int stride0, const uint8_t *pred1,
+                                  int stride1) {
+  static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = {
+    // 4X4
+    BLOCK_INVALID,
+    // 4X8, 8X4, 8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+    // 8X16, 16X8, 16X16
+    BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+    // 16X32, 32X16, 32X32
+    BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+    // 32X64, 64X32, 64X64
+    BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+    // 64x128, 128x64, 128x128
+    BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+    // 4X16, 16X4, 8X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+    // 32X8, 16X64, 64X16
+    BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
+  };
+  const struct macroblock_plane *const p = &x->plane[0];
+  const uint8_t *src = p->src.buf;
+  int src_stride = p->src.stride;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int bw_by2 = bw >> 1;
+  const int bh_by2 = bh >> 1;
+  uint32_t esq[2][2];
+  int64_t tl, br;
+
+  const BLOCK_SIZE f_index = split_qtr[bsize];
+  assert(f_index != BLOCK_INVALID);
+
+  if (is_cur_buf_hbd(&x->e_mbd)) {
+    pred0 = CONVERT_TO_BYTEPTR(pred0);
+    pred1 = CONVERT_TO_BYTEPTR(pred1);
+  }
+
+  // Residual variance computation over relevant quadrants in order to
+  // find TL + BR, TL = sum(1st,2nd,3rd) quadrants of (pred0 - pred1),
+  // BR = sum(2nd,3rd,4th) quadrants of (pred1 - pred0)
+  // The 2nd and 3rd quadrants cancel out in TL + BR
+  // Hence TL + BR = 1st quadrant of (pred0-pred1) + 4th of (pred1-pred0)
+  // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants)
+  // for all codebooks; experiment with other quadrant combinations for
+  // 0, 90 and 135 degrees also.
+  cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+  cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                          pred0 + bh_by2 * stride0 + bw_by2, stride0,
+                          &esq[0][1]);
+  cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+  cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                          pred1 + bh_by2 * stride1 + bw_by2, stride1,
+                          &esq[1][1]);
+
+  tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]);
+  br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]);
+  return (tl + br > 0);
+}
+
+// Choose the best wedge index and sign
+static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+                          const BLOCK_SIZE bsize, const uint8_t *const p0,
+                          const int16_t *const residual1,
+                          const int16_t *const diff10,
+                          int8_t *const best_wedge_sign,
+                          int8_t *const best_wedge_index, uint64_t *best_sse) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const src = &x->plane[0].src;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int N = bw * bh;
+  assert(N >= 64);
+  int rate;
+  int64_t dist;
+  int64_t rd, best_rd = INT64_MAX;
+  int8_t wedge_index;
+  int8_t wedge_sign;
+  const int8_t wedge_types = get_wedge_types_lookup(bsize);
+  const uint8_t *mask;
+  uint64_t sse;
+  const int hbd = is_cur_buf_hbd(xd);
+  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+
+  DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]);  // src - pred0
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (hbd) {
+    aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
+                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+  } else {
+    aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+  }
+#else
+  (void)hbd;
+  aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+#endif
+
+  int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
+                        (int64_t)aom_sum_squares_i16(residual1, N)) *
+                       (1 << WEDGE_WEIGHT_BITS) / 2;
+  int16_t *ds = residual0;
+
+  av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
+
+  for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+    mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
+
+    wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+    mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+    sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+    sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+                                                  &rate, &dist);
+    // int rate2;
+    // int64_t dist2;
+    // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2);
+    // printf("sse %"PRId64": legacy: %d %"PRId64", curvfit %d %"PRId64"\n",
+    //        sse, rate, dist, rate2, dist2); dist = dist2;
+    // rate = rate2;
+
+    rate += x->wedge_idx_cost[bsize][wedge_index];
+    rd = RDCOST(x->rdmult, rate, dist);
+
+    if (rd < best_rd) {
+      *best_wedge_index = wedge_index;
+      *best_wedge_sign = wedge_sign;
+      best_rd = rd;
+      *best_sse = sse;
+    }
+  }
+
+  return best_rd -
+         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
+// Choose the best wedge index for the specified sign
+static int64_t pick_wedge_fixed_sign(
+    const AV1_COMP *const cpi, const MACROBLOCK *const x,
+    const BLOCK_SIZE bsize, const int16_t *const residual1,
+    const int16_t *const diff10, const int8_t wedge_sign,
+    int8_t *const best_wedge_index, uint64_t *best_sse) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+
+  const int bw =
block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int N = bw * bh; + assert(N >= 64); + int rate; + int64_t dist; + int64_t rd, best_rd = INT64_MAX; + int8_t wedge_index; + const int8_t wedge_types = get_wedge_types_lookup(bsize); + const uint8_t *mask; + uint64_t sse; + const int hbd = is_cur_buf_hbd(xd); + const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; + for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); + sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N); + sse = ROUND_POWER_OF_TWO(sse, bd_round); + + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + rate += x->wedge_idx_cost[bsize][wedge_index]; + rd = RDCOST(x->rdmult, rate, dist); + + if (rd < best_rd) { + *best_wedge_index = wedge_index; + best_rd = rd; + *best_sse = sse; + } + } + return best_rd - + RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0); +} + +static int64_t pick_interinter_wedge( + const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize, + const uint8_t *const p0, const uint8_t *const p1, + const int16_t *const residual1, const int16_t *const diff10, + uint64_t *best_sse) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int bw = block_size_wide[bsize]; + + int64_t rd; + int8_t wedge_index = -1; + int8_t wedge_sign = 0; + + assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); + assert(cpi->common.seq_params.enable_masked_compound); + + if (cpi->sf.inter_sf.fast_wedge_sign_estimate) { + wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw); + rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, wedge_sign, + &wedge_index, best_sse); + } else { + rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &wedge_sign, + &wedge_index, best_sse); + } + + mbmi->interinter_comp.wedge_sign = wedge_sign; + mbmi->interinter_comp.wedge_index = wedge_index; + return rd; +} + +static int64_t pick_interinter_seg(const AV1_COMP *const cpi, + MACROBLOCK *const x, const BLOCK_SIZE bsize, + const uint8_t *const p0, + const uint8_t *const p1, + const int16_t *const residual1, + const int16_t *const diff10, + uint64_t *best_sse) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const int N = 1 << num_pels_log2_lookup[bsize]; + int rate; + int64_t dist; + DIFFWTD_MASK_TYPE cur_mask_type; + int64_t best_rd = INT64_MAX; + DIFFWTD_MASK_TYPE best_mask_type = 0; + const int hbd = is_cur_buf_hbd(xd); + const int bd_round = hbd ? 
(xd->bd - 8) * 2 : 0; + DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); + uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask }; + // try each mask type and its inverse + for (cur_mask_type = 0; cur_mask_type < DIFFWTD_MASK_TYPES; cur_mask_type++) { + // build mask and inverse + if (hbd) + av1_build_compound_diffwtd_mask_highbd( + tmp_mask[cur_mask_type], cur_mask_type, CONVERT_TO_BYTEPTR(p0), bw, + CONVERT_TO_BYTEPTR(p1), bw, bh, bw, xd->bd); + else + av1_build_compound_diffwtd_mask(tmp_mask[cur_mask_type], cur_mask_type, + p0, bw, p1, bw, bh, bw); + + // compute rd for mask + uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10, + tmp_mask[cur_mask_type], N); + sse = ROUND_POWER_OF_TWO(sse, bd_round); + + model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N, + &rate, &dist); + const int64_t rd0 = RDCOST(x->rdmult, rate, dist); + + if (rd0 < best_rd) { + best_mask_type = cur_mask_type; + best_rd = rd0; + *best_sse = sse; + } + } + mbmi->interinter_comp.mask_type = best_mask_type; + if (best_mask_type == DIFFWTD_38_INV) { + memcpy(xd->seg_mask, seg_mask, N * 2); + } + return best_rd; +} + +static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + const BLOCK_SIZE bsize, + const uint8_t *const p0, + const uint8_t *const p1) { + const MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(av1_is_wedge_used(bsize)); + assert(cpi->common.seq_params.enable_interintra_compound); + + const struct buf_2d *const src = &x->plane[0].src; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1 + DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0 +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(p1), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw, + CONVERT_TO_BYTEPTR(p0), bw, xd->bd); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw); + aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw); + } +#else + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, p1, bw); + aom_subtract_block(bh, bw, diff10, bw, p1, bw, p0, bw); +#endif + int8_t wedge_index = -1; + uint64_t sse; + int64_t rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10, 0, + &wedge_index, &sse); + + mbmi->interintra_wedge_index = wedge_index; + return rd; +} + +static AOM_INLINE void get_inter_predictors_masked_compound( + MACROBLOCK *x, const BLOCK_SIZE bsize, uint8_t **preds0, uint8_t **preds1, + int16_t *residual1, int16_t *diff10, int *strides) { + MACROBLOCKD *xd = &x->e_mbd; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + // get inter predictors to use for masked compound modes + av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 0, preds0, + strides); + av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, 1, preds1, + strides); + const struct buf_2d *const src = &x->plane[0].src; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, + CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd); + aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1), + bw, CONVERT_TO_BYTEPTR(*preds0), bw, xd->bd); + } else { + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, 
+ bw); + aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw); + } +#else + aom_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, *preds1, bw); + aom_subtract_block(bh, bw, diff10, bw, *preds1, bw, *preds0, bw); +#endif +} + +// Computes the rd cost for the given interintra mode and updates the best +static INLINE void compute_best_interintra_mode( + const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd, + MACROBLOCK *const x, const int *const interintra_mode_cost, + const BUFFER_SET *orig_dst, uint8_t *intrapred, const uint8_t *tmp_buf, + INTERINTRA_MODE *best_interintra_mode, int64_t *best_interintra_rd, + INTERINTRA_MODE interintra_mode, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + int rate, skip_txfm_sb; + int64_t dist, skip_sse_sb; + const int bw = block_size_wide[bsize]; + mbmi->interintra_mode = interintra_mode; + int rmode = interintra_mode_cost[interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](cpi, bsize, x, xd, 0, 0, &rate, &dist, + &skip_txfm_sb, &skip_sse_sb, NULL, + NULL, NULL); + int64_t rd = RDCOST(x->rdmult, rate + rmode, dist); + if (rd < *best_interintra_rd) { + *best_interintra_rd = rd; + *best_interintra_mode = mbmi->interintra_mode; + } +} + +static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, + MACROBLOCK *x, int64_t ref_best_rd, + RD_STATS *rd_stats) { + MACROBLOCKD *const xd = &x->e_mbd; + if (ref_best_rd < 0) return INT64_MAX; + av1_subtract_plane(x, bs, 0); + x->rd_model = LOW_TXFM_RD; + const int skip_trellis = (cpi->optimize_seg_arr[xd->mi[0]->segment_id] == + NO_ESTIMATE_YRD_TRELLIS_OPT); + const int64_t rd = + av1_uniform_txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, + max_txsize_rect_lookup[bs], FTXS_NONE, skip_trellis); + x->rd_model = FULL_TXFM_RD; + if (rd != INT64_MAX) { + const int skip_ctx = av1_get_skip_context(xd); + if (rd_stats->skip) { + const int s1 = x->skip_cost[skip_ctx][1]; + rd_stats->rate = s1; + } else { + const int s0 = x->skip_cost[skip_ctx][0]; + rd_stats->rate += s0; + } + } + return rd; +} + +// Computes the rd_threshold for smooth interintra rd search. 
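+// The budget is the best rd seen so far, loosened by the
+// INTER_INTRA_RD_THRESH_SHIFT / INTER_INTRA_RD_THRESH_SCALE pair via
+// get_rd_thresh_from_best_rd(), minus the rate-only cost of signaling the
+// mode, RDCOST(x->rdmult, total_mode_rate, 0), so that what remains is the
+// budget available to the transform search alone.
+//
+// Editor's sketch of the underlying Lagrangian arithmetic (an assumption
+// about intent, not the exact macro internals): with rd(rate, dist) roughly
+// dist + lambda * rate, a total budget T leaves T - lambda * total_mode_rate
+// for residual coding.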
+static AOM_INLINE int64_t compute_rd_thresh(MACROBLOCK *const x, + int total_mode_rate, + int64_t ref_best_rd) { + const int64_t rd_thresh = get_rd_thresh_from_best_rd( + ref_best_rd, (1 << INTER_INTRA_RD_THRESH_SHIFT), + INTER_INTRA_RD_THRESH_SCALE); + const int64_t mode_rd = RDCOST(x->rdmult, total_mode_rate, 0); + return (rd_thresh - mode_rd); +} + +// Computes the best wedge interintra mode +static AOM_INLINE int64_t compute_best_wedge_interintra( + const AV1_COMP *const cpi, MB_MODE_INFO *mbmi, MACROBLOCKD *xd, + MACROBLOCK *const x, const int *const interintra_mode_cost, + const BUFFER_SET *orig_dst, uint8_t *intrapred_, uint8_t *tmp_buf_, + int *best_mode, int *best_wedge_index, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const int bw = block_size_wide[bsize]; + int64_t best_interintra_rd_wedge = INT64_MAX; + int64_t best_total_rd = INT64_MAX; + uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); + for (INTERINTRA_MODE mode = 0; mode < INTERINTRA_MODES; ++mode) { + mbmi->interintra_mode = mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + int64_t rd = pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + const int rate_overhead = + interintra_mode_cost[mode] + + x->wedge_idx_cost[bsize][mbmi->interintra_wedge_index]; + const int64_t total_rd = rd + RDCOST(x->rdmult, rate_overhead, 0); + if (total_rd < best_total_rd) { + best_total_rd = total_rd; + best_interintra_rd_wedge = rd; + *best_mode = mbmi->interintra_mode; + *best_wedge_index = mbmi->interintra_wedge_index; + } + } + return best_interintra_rd_wedge; +} + +int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + HandleInterModeArgs *args, int64_t ref_best_rd, + int *rate_mv, int *tmp_rate2, + const BUFFER_SET *orig_dst) { + const int try_smooth_interintra = cpi->oxcf.enable_smooth_interintra && + !cpi->sf.inter_sf.disable_smooth_interintra; + const int is_wedge_used = av1_is_wedge_used(bsize); + const int try_wedge_interintra = + is_wedge_used && enable_wedge_interintra_search(x, cpi); + if (!try_smooth_interintra && !try_wedge_interintra) return -1; + + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + int64_t rd = INT64_MAX; + const int bw = block_size_wide[bsize]; + DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]); + uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_); + uint8_t *intrapred = get_buf_by_bd(xd, intrapred_); + const int *const interintra_mode_cost = + x->interintra_mode_cost[size_group_lookup[bsize]]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // Single reference inter prediction + mbmi->ref_frame[1] = NONE_FRAME; + xd->plane[0].dst.buf = tmp_buf; + xd->plane[0].dst.stride = bw; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + const int num_planes = av1_num_planes(cm); + + // Restore the buffers for intra prediction + restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = INTRA_FRAME; + INTERINTRA_MODE best_interintra_mode = + args->inter_intra_mode[mbmi->ref_frame[0]]; + + // Compute smooth_interintra + int64_t best_interintra_rd_nowedge = INT64_MAX; + int best_mode_rate = INT_MAX; + if (try_smooth_interintra) { + mbmi->use_wedge_interintra = 0; + int interintra_mode_reuse = 1; + if (cpi->sf.inter_sf.reuse_inter_intra_mode == 0 || + best_interintra_mode == 
INTERINTRA_MODES) { + interintra_mode_reuse = 0; + int64_t best_interintra_rd = INT64_MAX; + for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES; + ++cur_mode) { + if ((!cpi->oxcf.enable_smooth_intra || + cpi->sf.intra_sf.disable_smooth_intra) && + cur_mode == II_SMOOTH_PRED) + continue; + compute_best_interintra_mode(cpi, mbmi, xd, x, interintra_mode_cost, + orig_dst, intrapred, tmp_buf, + &best_interintra_mode, &best_interintra_rd, + cur_mode, bsize); + } + args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode; + } + assert(IMPLIES(!cpi->oxcf.enable_smooth_interintra || + cpi->sf.inter_sf.disable_smooth_interintra, + best_interintra_mode != II_SMOOTH_PRED)); + // Recompute prediction if required + if (interintra_mode_reuse || best_interintra_mode != INTERINTRA_MODES - 1) { + mbmi->interintra_mode = best_interintra_mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + + // Compute rd cost for best smooth_interintra + RD_STATS rd_stats; + const int rmode = interintra_mode_cost[best_interintra_mode] + + (is_wedge_used ? x->wedge_interintra_cost[bsize][0] : 0); + const int total_mode_rate = rmode + *rate_mv; + const int64_t rd_thresh = + compute_rd_thresh(x, total_mode_rate, ref_best_rd); + rd = estimate_yrd_for_sb(cpi, bsize, x, rd_thresh, &rd_stats); + if (rd != INT64_MAX) { + rd = RDCOST(x->rdmult, total_mode_rate + rd_stats.rate, rd_stats.dist); + } else { + return -1; + } + best_interintra_rd_nowedge = rd; + best_mode_rate = rmode; + // Return early if best_interintra_rd_nowedge not good enough + if (ref_best_rd < INT64_MAX && + (best_interintra_rd_nowedge >> INTER_INTRA_RD_THRESH_SHIFT) * + INTER_INTRA_RD_THRESH_SCALE > + ref_best_rd) { + return -1; + } + } + + // Compute wedge interintra + int64_t best_interintra_rd_wedge = INT64_MAX; + if (try_wedge_interintra) { + mbmi->use_wedge_interintra = 1; + if (!cpi->sf.inter_sf.fast_interintra_wedge_search) { + // Exhaustive search of all wedge and mode combinations. 
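+      // Editor's note: the helper below scans every INTERINTRA_MODE and, for
+      // each mode, every wedge index at the fixed sign used by interintra
+      // (see pick_interintra_wedge), i.e. on the order of INTERINTRA_MODES x
+      // get_wedge_types_lookup(bsize) candidates (16 wedge shapes for the
+      // eligible block sizes), keeping the pair with the lowest rd plus
+      // signaling overhead.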
+ int best_mode = 0; + int best_wedge_index = 0; + best_interintra_rd_wedge = compute_best_wedge_interintra( + cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred_, + tmp_buf_, &best_mode, &best_wedge_index, bsize); + mbmi->interintra_mode = best_mode; + mbmi->interintra_wedge_index = best_wedge_index; + if (best_mode != INTERINTRA_MODES - 1) { + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + } + } else if (!try_smooth_interintra) { + if (best_interintra_mode == INTERINTRA_MODES) { + mbmi->interintra_mode = INTERINTRA_MODES - 1; + best_interintra_mode = INTERINTRA_MODES - 1; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + // Pick wedge mask based on INTERINTRA_MODES - 1 + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + // Find the best interintra mode for the chosen wedge mask + for (INTERINTRA_MODE cur_mode = 0; cur_mode < INTERINTRA_MODES; + ++cur_mode) { + compute_best_interintra_mode( + cpi, mbmi, xd, x, interintra_mode_cost, orig_dst, intrapred, + tmp_buf, &best_interintra_mode, &best_interintra_rd_wedge, + cur_mode, bsize); + } + args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode; + mbmi->interintra_mode = best_interintra_mode; + + // Recompute prediction if required + if (best_interintra_mode != INTERINTRA_MODES - 1) { + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + } + } else { + // Pick wedge mask for the best interintra mode (reused) + mbmi->interintra_mode = best_interintra_mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + } + } else { + // Pick wedge mask for the best interintra mode from smooth_interintra + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + } + + const int rate_overhead = + interintra_mode_cost[mbmi->interintra_mode] + + x->wedge_idx_cost[bsize][mbmi->interintra_wedge_index] + + x->wedge_interintra_cost[bsize][1]; + best_interintra_rd_wedge += RDCOST(x->rdmult, rate_overhead + *rate_mv, 0); + + const int_mv mv0 = mbmi->mv[0]; + int_mv tmp_mv = mv0; + rd = INT64_MAX; + int tmp_rate_mv = 0; + // Refine motion vector for NEWMV case. + if (have_newmv_in_inter_mode(mbmi->mode)) { + int rate_sum, skip_txfm_sb; + int64_t dist_sum, skip_sse_sb; + // get negative of mask + const uint8_t *mask = + av1_get_contiguous_soft_mask(mbmi->interintra_wedge_index, 1, bsize); + av1_compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, intrapred, + mask, bw, &tmp_rate_mv, 0); + if (mbmi->mv[0].as_int != tmp_mv.as_int) { + mbmi->mv[0].as_int = tmp_mv.as_int; + // Set ref_frame[1] to NONE_FRAME temporarily so that the intra + // predictor is not calculated again in av1_enc_build_inter_predictor(). 
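+      // Editor's note: with ref_frame[1] == INTRA_FRAME the builder would
+      // regenerate the intra predictor and blend it in; flipping to
+      // NONE_FRAME yields the pure inter prediction, which is then blended
+      // explicitly with the cached intrapred via av1_combine_interintra().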
+ mbmi->ref_frame[1] = NONE_FRAME; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + mbmi->ref_frame[1] = INTRA_FRAME; + av1_combine_interintra(xd, bsize, 0, xd->plane[AOM_PLANE_Y].dst.buf, + xd->plane[AOM_PLANE_Y].dst.stride, intrapred, + bw); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &skip_txfm_sb, + &skip_sse_sb, NULL, NULL, NULL); + rd = + RDCOST(x->rdmult, tmp_rate_mv + rate_overhead + rate_sum, dist_sum); + } + } + if (rd >= best_interintra_rd_wedge) { + tmp_mv.as_int = mv0.as_int; + tmp_rate_mv = *rate_mv; + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + } + // Evaluate closer to true rd + RD_STATS rd_stats; + const int64_t mode_rd = RDCOST(x->rdmult, rate_overhead + tmp_rate_mv, 0); + const int64_t tmp_rd_thresh = best_interintra_rd_nowedge - mode_rd; + rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats); + if (rd != INT64_MAX) { + rd = RDCOST(x->rdmult, rate_overhead + tmp_rate_mv + rd_stats.rate, + rd_stats.dist); + } else { + if (best_interintra_rd_nowedge == INT64_MAX) return -1; + } + best_interintra_rd_wedge = rd; + if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { + mbmi->mv[0].as_int = tmp_mv.as_int; + *tmp_rate2 += tmp_rate_mv - *rate_mv; + *rate_mv = tmp_rate_mv; + best_mode_rate = rate_overhead; + } else { + mbmi->use_wedge_interintra = 0; + mbmi->interintra_mode = best_interintra_mode; + mbmi->mv[0].as_int = mv0.as_int; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } + } + + if (best_interintra_rd_nowedge == INT64_MAX && + best_interintra_rd_wedge == INT64_MAX) { + return -1; + } + + *tmp_rate2 += best_mode_rate; + + if (num_planes > 1) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_U, num_planes - 1); + } + return 0; +} + +static void alloc_compound_type_rd_buffers_no_check( + CompoundTypeRdBuffers *const bufs) { + bufs->pred0 = + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)); + bufs->pred1 = + (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)); + bufs->residual1 = + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)); + bufs->diff10 = + (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)); + bufs->tmp_best_mask_buf = (uint8_t *)aom_malloc( + 2 * MAX_SB_SQUARE * sizeof(*bufs->tmp_best_mask_buf)); +} + +// Computes the valid compound_types to be evaluated +static INLINE int compute_valid_comp_types( + MACROBLOCK *x, const AV1_COMP *const cpi, int *try_average_and_distwtd_comp, + BLOCK_SIZE bsize, int masked_compound_used, int mode_search_mask, + COMPOUND_TYPE *valid_comp_types) { + const AV1_COMMON *cm = &cpi->common; + int valid_type_count = 0; + int comp_type, valid_check; + int8_t enable_masked_type[MASKED_COMPOUND_TYPES] = { 0, 0 }; + + const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE)); + const int try_distwtd_comp = + ((mode_search_mask & (1 << COMPOUND_DISTWTD)) && + cm->seq_params.order_hint_info.enable_dist_wtd_comp == 1 && + cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED); + *try_average_and_distwtd_comp = try_average_comp && try_distwtd_comp; + + // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases + for (comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD; + comp_type++) { + valid_check = + (comp_type == COMPOUND_AVERAGE) ? 
try_average_comp : try_distwtd_comp; + if (!*try_average_and_distwtd_comp && valid_check && + is_interinter_compound_used(comp_type, bsize)) + valid_comp_types[valid_type_count++] = comp_type; + } + // Check if COMPOUND_WEDGE and COMPOUND_DIFFWTD are valid cases + if (masked_compound_used) { + // enable_masked_type[0] corresponds to COMPOUND_WEDGE + // enable_masked_type[1] corresponds to COMPOUND_DIFFWTD + enable_masked_type[0] = enable_wedge_interinter_search(x, cpi); + enable_masked_type[1] = cpi->oxcf.enable_diff_wtd_comp; + for (comp_type = COMPOUND_WEDGE; comp_type <= COMPOUND_DIFFWTD; + comp_type++) { + if ((mode_search_mask & (1 << comp_type)) && + is_interinter_compound_used(comp_type, bsize) && + enable_masked_type[comp_type - COMPOUND_WEDGE]) + valid_comp_types[valid_type_count++] = comp_type; + } + } + return valid_type_count; +} + +// Calculates the cost for compound type mask +static INLINE void calc_masked_type_cost(MACROBLOCK *x, BLOCK_SIZE bsize, + int comp_group_idx_ctx, + int comp_index_ctx, + int masked_compound_used, + int *masked_type_cost) { + av1_zero_array(masked_type_cost, COMPOUND_TYPES); + // Account for group index cost when wedge and/or diffwtd prediction are + // enabled + if (masked_compound_used) { + // Compound group index of average and distwtd is 0 + // Compound group index of wedge and diffwtd is 1 + masked_type_cost[COMPOUND_AVERAGE] += + x->comp_group_idx_cost[comp_group_idx_ctx][0]; + masked_type_cost[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_AVERAGE]; + masked_type_cost[COMPOUND_WEDGE] += + x->comp_group_idx_cost[comp_group_idx_ctx][1]; + masked_type_cost[COMPOUND_DIFFWTD] += masked_type_cost[COMPOUND_WEDGE]; + } + + // Compute the cost to signal compound index/type + masked_type_cost[COMPOUND_AVERAGE] += x->comp_idx_cost[comp_index_ctx][1]; + masked_type_cost[COMPOUND_DISTWTD] += x->comp_idx_cost[comp_index_ctx][0]; + masked_type_cost[COMPOUND_WEDGE] += x->compound_type_cost[bsize][0]; + masked_type_cost[COMPOUND_DIFFWTD] += x->compound_type_cost[bsize][1]; +} + +// Updates mbmi structure with the relevant compound type info +static INLINE void update_mbmi_for_compound_type(MB_MODE_INFO *mbmi, + COMPOUND_TYPE cur_type) { + mbmi->interinter_comp.type = cur_type; + mbmi->comp_group_idx = (cur_type >= COMPOUND_WEDGE); + mbmi->compound_idx = (cur_type != COMPOUND_DISTWTD); +} + +// When match is found, populate the compound type data +// and calculate the rd cost using the stored stats and +// update the mbmi appropriately. 
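+//
+// For reference, the mapping applied by update_mbmi_for_compound_type()
+// above (and relied on below) puts the four compound types onto the two
+// signaled syntax elements as follows:
+//
+//   type              comp_group_idx  compound_idx
+//   COMPOUND_AVERAGE        0              1
+//   COMPOUND_DISTWTD        0              0
+//   COMPOUND_WEDGE          1              1
+//   COMPOUND_DIFFWTD        1              1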
+static INLINE int populate_reuse_comp_type_data( + const MACROBLOCK *x, MB_MODE_INFO *mbmi, + BEST_COMP_TYPE_STATS *best_type_stats, int_mv *cur_mv, int32_t *comp_rate, + int64_t *comp_dist, int *comp_rs2, int *rate_mv, int64_t *rd, + int match_index) { + const int winner_comp_type = + x->comp_rd_stats[match_index].interinter_comp.type; + if (comp_rate[winner_comp_type] == INT_MAX) + return best_type_stats->best_compmode_interinter_cost; + update_mbmi_for_compound_type(mbmi, winner_comp_type); + mbmi->interinter_comp = x->comp_rd_stats[match_index].interinter_comp; + *rd = RDCOST( + x->rdmult, + comp_rs2[winner_comp_type] + *rate_mv + comp_rate[winner_comp_type], + comp_dist[winner_comp_type]); + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + return comp_rs2[winner_comp_type]; +} + +// Updates rd cost and relevant compound type data for the best compound type +static INLINE void update_best_info(const MB_MODE_INFO *const mbmi, int64_t *rd, + BEST_COMP_TYPE_STATS *best_type_stats, + int64_t best_rd_cur, + int64_t comp_model_rd_cur, int rs2) { + *rd = best_rd_cur; + best_type_stats->comp_best_model_rd = comp_model_rd_cur; + best_type_stats->best_compound_data = mbmi->interinter_comp; + best_type_stats->best_compmode_interinter_cost = rs2; +} + +// Updates best_mv for masked compound types +static INLINE void update_mask_best_mv(const MB_MODE_INFO *const mbmi, + int_mv *best_mv, int_mv *cur_mv, + const COMPOUND_TYPE cur_type, + int *best_tmp_rate_mv, int tmp_rate_mv, + const SPEED_FEATURES *const sf) { + if (cur_type == COMPOUND_WEDGE || + (sf->inter_sf.enable_interinter_diffwtd_newmv_search && + cur_type == COMPOUND_DIFFWTD)) { + *best_tmp_rate_mv = tmp_rate_mv; + best_mv[0].as_int = mbmi->mv[0].as_int; + best_mv[1].as_int = mbmi->mv[1].as_int; + } else { + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; + } +} + +// Choose the better of the two COMPOUND_AVERAGE, +// COMPOUND_DISTWTD based on modeled cost +static int find_best_avg_distwtd_comp_type(MACROBLOCK *x, int *comp_model_rate, + int64_t *comp_model_dist, + int rate_mv, int64_t *best_rd) { + int64_t est_rd[2]; + est_rd[COMPOUND_AVERAGE] = + RDCOST(x->rdmult, comp_model_rate[COMPOUND_AVERAGE] + rate_mv, + comp_model_dist[COMPOUND_AVERAGE]); + est_rd[COMPOUND_DISTWTD] = + RDCOST(x->rdmult, comp_model_rate[COMPOUND_DISTWTD] + rate_mv, + comp_model_dist[COMPOUND_DISTWTD]); + int best_type = (est_rd[COMPOUND_AVERAGE] <= est_rd[COMPOUND_DISTWTD]) + ? 
COMPOUND_AVERAGE + : COMPOUND_DISTWTD; + *best_rd = est_rd[best_type]; + return best_type; +} + +static INLINE void save_comp_rd_search_stat( + MACROBLOCK *x, const MB_MODE_INFO *const mbmi, const int32_t *comp_rate, + const int64_t *comp_dist, const int32_t *comp_model_rate, + const int64_t *comp_model_dist, const int_mv *cur_mv, const int *comp_rs2) { + const int offset = x->comp_rd_stats_idx; + if (offset < MAX_COMP_RD_STATS) { + COMP_RD_STATS *const rd_stats = x->comp_rd_stats + offset; + memcpy(rd_stats->rate, comp_rate, sizeof(rd_stats->rate)); + memcpy(rd_stats->dist, comp_dist, sizeof(rd_stats->dist)); + memcpy(rd_stats->model_rate, comp_model_rate, sizeof(rd_stats->model_rate)); + memcpy(rd_stats->model_dist, comp_model_dist, sizeof(rd_stats->model_dist)); + memcpy(rd_stats->comp_rs2, comp_rs2, sizeof(rd_stats->comp_rs2)); + memcpy(rd_stats->mv, cur_mv, sizeof(rd_stats->mv)); + memcpy(rd_stats->ref_frames, mbmi->ref_frame, sizeof(rd_stats->ref_frames)); + rd_stats->mode = mbmi->mode; + rd_stats->filter = mbmi->interp_filters; + rd_stats->ref_mv_idx = mbmi->ref_mv_idx; + const MACROBLOCKD *const xd = &x->e_mbd; + for (int i = 0; i < 2; ++i) { + const WarpedMotionParams *const wm = + &xd->global_motion[mbmi->ref_frame[i]]; + rd_stats->is_global[i] = is_global_mv_block(mbmi, wm->wmtype); + } + memcpy(&rd_stats->interinter_comp, &mbmi->interinter_comp, + sizeof(rd_stats->interinter_comp)); + ++x->comp_rd_stats_idx; + } +} + +static INLINE int get_interinter_compound_mask_rate( + const MACROBLOCK *const x, const MB_MODE_INFO *const mbmi) { + const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD + if (compound_type == COMPOUND_WEDGE) { + return av1_is_wedge_used(mbmi->sb_type) + ? 
av1_cost_literal(1) + + x->wedge_idx_cost[mbmi->sb_type] + [mbmi->interinter_comp.wedge_index] + : 0; + } else { + assert(compound_type == COMPOUND_DIFFWTD); + return av1_cost_literal(1); + } +} + +// Takes a backup of rate, distortion and model_rd for future reuse +static INLINE void backup_stats(COMPOUND_TYPE cur_type, int32_t *comp_rate, + int64_t *comp_dist, int32_t *comp_model_rate, + int64_t *comp_model_dist, int rate_sum, + int64_t dist_sum, RD_STATS *rd_stats, + int *comp_rs2, int rs2) { + comp_rate[cur_type] = rd_stats->rate; + comp_dist[cur_type] = rd_stats->dist; + comp_model_rate[cur_type] = rate_sum; + comp_model_dist[cur_type] = dist_sum; + comp_rs2[cur_type] = rs2; +} + +static int64_t masked_compound_type_rd( + const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, + const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2, + int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, + uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides, + int mode_rate, int64_t rd_thresh, int *calc_pred_masked_compound, + int32_t *comp_rate, int64_t *comp_dist, int32_t *comp_model_rate, + int64_t *comp_model_dist, const int64_t comp_best_model_rd, + int64_t *const comp_model_rd_cur, int *comp_rs2, int64_t ref_skip_rd) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int64_t best_rd_cur = INT64_MAX; + int64_t rd = INT64_MAX; + const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + // This function will be called only for COMPOUND_WEDGE and COMPOUND_DIFFWTD + assert(compound_type == COMPOUND_WEDGE || compound_type == COMPOUND_DIFFWTD); + int rate_sum, tmp_skip_txfm_sb; + int64_t dist_sum, tmp_skip_sse_sb; + pick_interinter_mask_type pick_interinter_mask[2] = { pick_interinter_wedge, + pick_interinter_seg }; + + // TODO(any): Save pred and mask calculation as well into records. However + // this may increase memory requirements as compound segment mask needs to be + // stored in each record. 
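+  // Editor's note on the trade-off: caching the mask as well would add
+  // 2 * MAX_SB_SQUARE bytes per record (2 * 128 * 128 = 32 KiB at the
+  // largest superblock size) for every COMP_RD_STATS entry kept per block,
+  // which is why only rates, distortions and model costs are cached today.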
+  if (*calc_pred_masked_compound) {
+    get_inter_predictors_masked_compound(x, bsize, preds0, preds1, residual1,
+                                         diff10, strides);
+    *calc_pred_masked_compound = 0;
+  }
+  if (cpi->sf.inter_sf.prune_wedge_pred_diff_based &&
+      compound_type == COMPOUND_WEDGE) {
+    unsigned int sse;
+    if (is_cur_buf_hbd(xd))
+      (void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
+                                  CONVERT_TO_BYTEPTR(*preds1), *strides, &sse);
+    else
+      (void)cpi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides, &sse);
+    const unsigned int mse =
+        ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
+    // If the two predictors are very similar, skip wedge compound mode search
+    if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) {
+      *comp_model_rd_cur = INT64_MAX;
+      return INT64_MAX;
+    }
+  }
+  // Function pointer to pick the appropriate mask
+  // compound_type == COMPOUND_WEDGE, calls pick_interinter_wedge()
+  // compound_type == COMPOUND_DIFFWTD, calls pick_interinter_seg()
+  uint64_t cur_sse = UINT64_MAX;
+  best_rd_cur = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
+      cpi, x, bsize, *preds0, *preds1, residual1, diff10, &cur_sse);
+  *rs2 += get_interinter_compound_mask_rate(x, mbmi);
+  best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
+  assert(cur_sse != UINT64_MAX);
+  int64_t skip_rd_cur = RDCOST(x->rdmult, *rs2 + rate_mv, (cur_sse << 4));
+
+  // Although the true rate_mv might be different after motion search, it is
+  // unlikely to be the best mode considering the transform rd cost and other
+  // mode overhead cost
+  int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
+  if (mode_rd > rd_thresh) {
+    *comp_model_rd_cur = INT64_MAX;
+    return INT64_MAX;
+  }
+
+  // Check if the mode is good enough based on skip rd
+  // TODO(nithya): Handle wedge_newmv_search if extending for lower speed
+  // setting
+  if (cpi->sf.inter_sf.txfm_rd_gate_level) {
+    int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd_cur,
+                                    cpi->sf.inter_sf.txfm_rd_gate_level, 1);
+    if (!eval_txfm) {
+      *comp_model_rd_cur = INT64_MAX;
+      return INT64_MAX;
+    }
+  }
+
+  // Compute the cost if a matching record is not found; else, reuse data
+  if (comp_rate[compound_type] == INT_MAX) {
+    // Check whether a new MV search for wedge is to be done
+    int wedge_newmv_search =
+        have_newmv_in_inter_mode(this_mode) &&
+        (compound_type == COMPOUND_WEDGE) &&
+        (!cpi->sf.inter_sf.disable_interinter_wedge_newmv_search);
+    int diffwtd_newmv_search =
+        cpi->sf.inter_sf.enable_interinter_diffwtd_newmv_search &&
+        compound_type == COMPOUND_DIFFWTD &&
+        have_newmv_in_inter_mode(this_mode);
+
+    // Search for a new MV if needed and build the predictor
+    if (wedge_newmv_search) {
+      *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                           bsize, this_mode);
+      const int mi_row = xd->mi_row;
+      const int mi_col = xd->mi_col;
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+    } else if (diffwtd_newmv_search) {
+      *out_rate_mv = av1_interinter_compound_motion_search(cpi, x, cur_mv,
+                                                           bsize, this_mode);
+      // we need to update the mask according to the new motion vector
+      CompoundTypeRdBuffers tmp_buf;
+      int64_t tmp_rd = INT64_MAX;
+      alloc_compound_type_rd_buffers_no_check(&tmp_buf);
+
+      uint8_t *tmp_preds0[1] = { tmp_buf.pred0 };
+      uint8_t *tmp_preds1[1] = { tmp_buf.pred1 };
+
+      get_inter_predictors_masked_compound(x, bsize, tmp_preds0, tmp_preds1,
+                                           tmp_buf.residual1, tmp_buf.diff10,
+                                           strides);
+
+      tmp_rd = pick_interinter_mask[compound_type - COMPOUND_WEDGE](
+          cpi, x, bsize, *tmp_preds0, *tmp_preds1, tmp_buf.residual1,
+          tmp_buf.diff10, &cur_sse);
+      // we can reuse rs2 here
+      tmp_rd += RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0);
+
+      if (tmp_rd >= best_rd_cur) {
+        // restore the motion vector
+        mbmi->mv[0].as_int = cur_mv[0].as_int;
+        mbmi->mv[1].as_int = cur_mv[1].as_int;
+        *out_rate_mv = rate_mv;
+        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+                                                 strides, preds1, strides);
+      } else {
+        // build the final prediction using the updated mv
+        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, tmp_preds0,
+                                                 strides, tmp_preds1, strides);
+      }
+      av1_release_compound_type_rd_buffers(&tmp_buf);
+    } else {
+      *out_rate_mv = rate_mv;
+      av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+                                               preds1, strides);
+    }
+    // Get the RD cost from model RD
+    model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+        cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, &tmp_skip_txfm_sb,
+        &tmp_skip_sse_sb, NULL, NULL, NULL);
+    rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+    *comp_model_rd_cur = rd;
+    // Override with the best if the current is worse than the best for new MV
+    if (wedge_newmv_search) {
+      if (rd >= best_rd_cur) {
+        mbmi->mv[0].as_int = cur_mv[0].as_int;
+        mbmi->mv[1].as_int = cur_mv[1].as_int;
+        *out_rate_mv = rate_mv;
+        av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+                                                 strides, preds1, strides);
+        *comp_model_rd_cur = best_rd_cur;
+      }
+    }
+    if (cpi->sf.inter_sf.prune_comp_type_by_model_rd &&
+        (*comp_model_rd_cur > comp_best_model_rd) &&
+        comp_best_model_rd != INT64_MAX) {
+      *comp_model_rd_cur = INT64_MAX;
+      return INT64_MAX;
+    }
+    // Compute RD cost for the current type
+    RD_STATS rd_stats;
+    const int64_t tmp_mode_rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv, 0);
+    const int64_t tmp_rd_thresh = rd_thresh - tmp_mode_rd;
+    rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &rd_stats);
+    if (rd != INT64_MAX) {
+      rd =
+          RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist);
+      // Backup rate and distortion for future reuse
+      backup_stats(compound_type, comp_rate, comp_dist, comp_model_rate,
+                   comp_model_dist, rate_sum, dist_sum, &rd_stats, comp_rs2,
+                   *rs2);
+    }
+  } else {
+    // Reuse data as a matching record is found
+    assert(comp_dist[compound_type] != INT64_MAX);
+    // When disable_interinter_wedge_newmv_search is set, motion refinement is
+    // disabled.
Hence rate and distortion can be reused in this case as well + assert(IMPLIES(have_newmv_in_inter_mode(this_mode), + cpi->sf.inter_sf.disable_interinter_wedge_newmv_search)); + assert(mbmi->mv[0].as_int == cur_mv[0].as_int); + assert(mbmi->mv[1].as_int == cur_mv[1].as_int); + *out_rate_mv = rate_mv; + // Calculate RD cost based on stored stats + rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type], + comp_dist[compound_type]); + // Recalculate model rdcost with the updated rate + *comp_model_rd_cur = + RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_model_rate[compound_type], + comp_model_dist[compound_type]); + } + return rd; +} + +// scaling values to be used for gating wedge/compound segment based on best +// approximate rd +static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 }; +static int comp_type_rd_threshold_div[3] = { 3, 16, 16 }; + +int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask, + int masked_compound_used, const BUFFER_SET *orig_dst, + const BUFFER_SET *tmp_dst, + const CompoundTypeRdBuffers *buffers, int *rate_mv, + int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t ref_skip_rd, int *is_luma_interp_done, + int64_t rd_thresh) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + const int bw = block_size_wide[bsize]; + int rs2; + int_mv best_mv[2]; + int best_tmp_rate_mv = *rate_mv; + BEST_COMP_TYPE_STATS best_type_stats; + // Initializing BEST_COMP_TYPE_STATS + best_type_stats.best_compound_data.type = COMPOUND_AVERAGE; + best_type_stats.best_compmode_interinter_cost = 0; + best_type_stats.comp_best_model_rd = INT64_MAX; + + uint8_t *preds0[1] = { buffers->pred0 }; + uint8_t *preds1[1] = { buffers->pred1 }; + int strides[1] = { bw }; + int tmp_rate_mv; + const int num_pix = 1 << num_pels_log2_lookup[bsize]; + const int mask_len = 2 * num_pix * sizeof(uint8_t); + COMPOUND_TYPE cur_type; + // Local array to store the mask cost for different compound types + int masked_type_cost[COMPOUND_TYPES]; + + int calc_pred_masked_compound = 1; + int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX }; + int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + int comp_rs2[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + int32_t comp_model_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, + INT_MAX }; + int64_t comp_model_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX }; + int match_index = 0; + const int match_found = + find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, comp_rs2, &match_index); + best_mv[0].as_int = cur_mv[0].as_int; + best_mv[1].as_int = cur_mv[1].as_int; + *rd = INT64_MAX; + int rate_sum, tmp_skip_txfm_sb; + int64_t dist_sum, tmp_skip_sse_sb; + + // Local array to store the valid compound types to be evaluated in the core + // loop + COMPOUND_TYPE valid_comp_types[COMPOUND_TYPES] = { + COMPOUND_AVERAGE, COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD + }; + int valid_type_count = 0; + int try_average_and_distwtd_comp = 0; + // compute_valid_comp_types() returns the number of valid compound types to be + // evaluated and populates the same in the local array valid_comp_types[]. 
+ // It also sets the flag 'try_average_and_distwtd_comp' + valid_type_count = compute_valid_comp_types( + x, cpi, &try_average_and_distwtd_comp, bsize, masked_compound_used, + mode_search_mask, valid_comp_types); + + // The following context indices are independent of compound type + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); + const int comp_index_ctx = get_comp_index_context(cm, xd); + + // Populates masked_type_cost local array for the 4 compound types + calc_masked_type_cost(x, bsize, comp_group_idx_ctx, comp_index_ctx, + masked_compound_used, masked_type_cost); + + int64_t comp_model_rd_cur = INT64_MAX; + int64_t best_rd_cur = INT64_MAX; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // If the match is found, calculate the rd cost using the + // stored stats and update the mbmi appropriately. + if (match_found && cpi->sf.inter_sf.reuse_compound_type_decision) { + return populate_reuse_comp_type_data(x, mbmi, &best_type_stats, cur_mv, + comp_rate, comp_dist, comp_rs2, + rate_mv, rd, match_index); + } + // Special handling if both compound_average and compound_distwtd + // are to be searched. In this case, first estimate between the two + // modes and then call estimate_yrd_for_sb() only for the better of + // the two. + if (try_average_and_distwtd_comp) { + int est_rate[2]; + int64_t est_dist[2], est_rd; + COMPOUND_TYPE best_type; + // Since modelled rate and dist are separately stored, + // compute better of COMPOUND_AVERAGE and COMPOUND_DISTWTD + // using the stored stats. + if ((comp_model_rate[COMPOUND_AVERAGE] != INT_MAX) && + comp_model_rate[COMPOUND_DISTWTD] != INT_MAX) { + // Choose the better of the COMPOUND_AVERAGE, + // COMPOUND_DISTWTD on modeled cost. + best_type = find_best_avg_distwtd_comp_type( + x, comp_model_rate, comp_model_dist, *rate_mv, &est_rd); + update_mbmi_for_compound_type(mbmi, best_type); + if (comp_rate[best_type] != INT_MAX) + best_rd_cur = RDCOST( + x->rdmult, + masked_type_cost[best_type] + *rate_mv + comp_rate[best_type], + comp_dist[best_type]); + comp_model_rd_cur = est_rd; + // Update stats for best compound type + if (best_rd_cur < *rd) { + update_best_info(mbmi, rd, &best_type_stats, best_rd_cur, + comp_model_rd_cur, masked_type_cost[best_type]); + } + restore_dst_buf(xd, *tmp_dst, 1); + } else { + int64_t sse_y[COMPOUND_DISTWTD + 1]; + // Calculate model_rd for COMPOUND_AVERAGE and COMPOUND_DISTWTD + for (int comp_type = COMPOUND_AVERAGE; comp_type <= COMPOUND_DISTWTD; + comp_type++) { + update_mbmi_for_compound_type(mbmi, comp_type); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + model_rd_sb_fn[MODELRD_CURVFIT]( + cpi, bsize, x, xd, 0, 0, &est_rate[comp_type], &est_dist[comp_type], + NULL, NULL, NULL, NULL, NULL); + est_rate[comp_type] += masked_type_cost[comp_type]; + comp_model_rate[comp_type] = est_rate[comp_type]; + comp_model_dist[comp_type] = est_dist[comp_type]; + sse_y[comp_type] = x->pred_sse[xd->mi[0]->ref_frame[0]]; + if (comp_type == COMPOUND_AVERAGE) { + *is_luma_interp_done = 1; + restore_dst_buf(xd, *tmp_dst, 1); + } + } + // Choose the better of the two based on modeled cost and call + // estimate_yrd_for_sb() for that one. 
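+      // Editor's illustration with made-up numbers: if the modeled costs are
+      // 9000 for COMPOUND_AVERAGE and 9500 for COMPOUND_DISTWTD, only
+      // COMPOUND_AVERAGE proceeds to the far more expensive transform search
+      // in estimate_yrd_for_sb(); the loser is pruned on the model alone.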
+ best_type = find_best_avg_distwtd_comp_type( + x, comp_model_rate, comp_model_dist, *rate_mv, &est_rd); + update_mbmi_for_compound_type(mbmi, best_type); + if (best_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *orig_dst, 1); + rs2 = masked_type_cost[best_type]; + RD_STATS est_rd_stats; + const int64_t mode_rd = RDCOST(x->rdmult, rs2 + *rate_mv, 0); + const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd; + int64_t est_rd_ = INT64_MAX; + int eval_txfm = 1; + // Check if the mode is good enough based on skip rd + if (cpi->sf.inter_sf.txfm_rd_gate_level) { + int64_t skip_rd = + RDCOST(x->rdmult, rs2 + *rate_mv, (sse_y[best_type] << 4)); + eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd, + cpi->sf.inter_sf.txfm_rd_gate_level, 1); + } + // Evaluate further if skip rd is low enough + if (eval_txfm) { + est_rd_ = + estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, &est_rd_stats); + } + + if (est_rd_ != INT64_MAX) { + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + // Backup rate and distortion for future reuse + backup_stats(best_type, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, est_rate[best_type], est_dist[best_type], + &est_rd_stats, comp_rs2, rs2); + comp_model_rd_cur = est_rd; + } + if (best_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); + // Update stats for best compound type + if (best_rd_cur < *rd) { + update_best_info(mbmi, rd, &best_type_stats, best_rd_cur, + comp_model_rd_cur, rs2); + } + } + } + + // If COMPOUND_AVERAGE is not valid, use the spare buffer + if (valid_comp_types[0] != COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); + + // Loop over valid compound types + for (int i = 0; i < valid_type_count; i++) { + cur_type = valid_comp_types[i]; + comp_model_rd_cur = INT64_MAX; + tmp_rate_mv = *rate_mv; + best_rd_cur = INT64_MAX; + + // Case COMPOUND_AVERAGE and COMPOUND_DISTWTD + if (cur_type < COMPOUND_WEDGE) { + update_mbmi_for_compound_type(mbmi, cur_type); + rs2 = masked_type_cost[cur_type]; + const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); + if (mode_rd < ref_best_rd) { + // Reuse data if matching record is found + if (comp_rate[cur_type] == INT_MAX) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1; + + // Compute RD cost for the current type + RD_STATS est_rd_stats; + const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh) - mode_rd; + int64_t est_rd = INT64_MAX; + int eval_txfm = 1; + // Check if the mode is good enough based on skip rd + if (cpi->sf.inter_sf.txfm_rd_gate_level) { + int64_t sse_y = compute_sse_plane(x, xd, PLANE_TYPE_Y, bsize); + int64_t skip_rd = RDCOST(x->rdmult, rs2 + *rate_mv, (sse_y << 4)); + eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd, skip_rd, + cpi->sf.inter_sf.txfm_rd_gate_level, 1); + } + // Evaluate further if skip rd is low enough + if (eval_txfm) { + est_rd = estimate_yrd_for_sb(cpi, bsize, x, tmp_rd_thresh, + &est_rd_stats); + } + + if (est_rd != INT64_MAX) { + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + comp_model_rd_cur = + RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); + + // Backup rate and distortion for future reuse + backup_stats(cur_type, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, 
rate_sum, dist_sum, &est_rd_stats, + comp_rs2, rs2); + } + } else { + // Calculate RD cost based on stored stats + assert(comp_dist[cur_type] != INT64_MAX); + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type], + comp_dist[cur_type]); + // Recalculate model rdcost with the updated rate + comp_model_rd_cur = + RDCOST(x->rdmult, rs2 + *rate_mv + comp_model_rate[cur_type], + comp_model_dist[cur_type]); + } + } + // use spare buffer for following compound type try + if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); + } else { + // Handle masked compound types + update_mbmi_for_compound_type(mbmi, cur_type); + rs2 = masked_type_cost[cur_type]; + // Factors to control gating of compound type selection based on best + // approximate rd so far + const int max_comp_type_rd_threshold_mul = + comp_type_rd_threshold_mul[cpi->sf.inter_sf + .prune_comp_type_by_comp_avg]; + const int max_comp_type_rd_threshold_div = + comp_type_rd_threshold_div[cpi->sf.inter_sf + .prune_comp_type_by_comp_avg]; + // Evaluate COMPOUND_WEDGE / COMPOUND_DIFFWTD if approximated cost is + // within threshold + int64_t approx_rd = ((*rd / max_comp_type_rd_threshold_div) * + max_comp_type_rd_threshold_mul); + + if (approx_rd < ref_best_rd) { + const int64_t tmp_rd_thresh = AOMMIN(*rd, rd_thresh); + best_rd_cur = masked_compound_type_rd( + cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst, + &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10, + strides, rd_stats->rate, tmp_rd_thresh, &calc_pred_masked_compound, + comp_rate, comp_dist, comp_model_rate, comp_model_dist, + best_type_stats.comp_best_model_rd, &comp_model_rd_cur, comp_rs2, + ref_skip_rd); + } + } + // Update stats for best compound type + if (best_rd_cur < *rd) { + update_best_info(mbmi, rd, &best_type_stats, best_rd_cur, + comp_model_rd_cur, rs2); + if (masked_compound_used && cur_type >= COMPOUND_WEDGE) { + memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len); + if (have_newmv_in_inter_mode(this_mode)) + update_mask_best_mv(mbmi, best_mv, cur_mv, cur_type, + &best_tmp_rate_mv, tmp_rate_mv, &cpi->sf); + } + } + // reset to original mvs for next iteration + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + } + if (mbmi->interinter_comp.type != best_type_stats.best_compound_data.type) { + mbmi->comp_group_idx = + (best_type_stats.best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1; + mbmi->compound_idx = + !(best_type_stats.best_compound_data.type == COMPOUND_DISTWTD); + mbmi->interinter_comp = best_type_stats.best_compound_data; + memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len); + } + if (have_newmv_in_inter_mode(this_mode)) { + mbmi->mv[0].as_int = best_mv[0].as_int; + mbmi->mv[1].as_int = best_mv[1].as_int; + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + rd_stats->rate += best_tmp_rate_mv - *rate_mv; + *rate_mv = best_tmp_rate_mv; + } + } + restore_dst_buf(xd, *orig_dst, 1); + if (!match_found) + save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rate, + comp_model_dist, cur_mv, comp_rs2); + return best_type_stats.best_compmode_interinter_cost; +} diff --git a/libs/libaom/src/av1/encoder/compound_type.h b/libs/libaom/src/av1/encoder/compound_type.h new file mode 100644 index 000000000..f2bd857c9 --- /dev/null +++ b/libs/libaom/src/av1/encoder/compound_type.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_COMPOUND_TYPE_H_ +#define AOM_AV1_ENCODER_COMPOUND_TYPE_H_ + +#include "av1/encoder/encoder.h" +#include "av1/encoder/interp_search.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Structure to store the compound type related stats for best compound type +typedef struct { + INTERINTER_COMPOUND_DATA best_compound_data; + int64_t comp_best_model_rd; + int best_compmode_interinter_cost; +} BEST_COMP_TYPE_STATS; + +int av1_handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, MB_MODE_INFO *mbmi, + HandleInterModeArgs *args, int64_t ref_best_rd, + int *rate_mv, int *tmp_rate2, + const BUFFER_SET *orig_dst); + +int av1_compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *cur_mv, int mode_search_mask, + int masked_compound_used, const BUFFER_SET *orig_dst, + const BUFFER_SET *tmp_dst, + const CompoundTypeRdBuffers *buffers, int *rate_mv, + int64_t *rd, RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t ref_skip_rd, int *is_luma_interp_done, + int64_t rd_thresh); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_COMPOUND_TYPE_H_ diff --git a/libs/libaom/src/av1/encoder/context_tree.c b/libs/libaom/src/av1/encoder/context_tree.c new file mode 100644 index 000000000..9b5b1cbf9 --- /dev/null +++ b/libs/libaom/src/av1/encoder/context_tree.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/encoder/context_tree.h" +#include "av1/encoder/encoder.h" + +static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = { + BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128, +}; + +typedef struct { + tran_low_t *coeff_buf[MAX_MB_PLANE]; + tran_low_t *qcoeff_buf[MAX_MB_PLANE]; + tran_low_t *dqcoeff_buf[MAX_MB_PLANE]; +} PC_TREE_SHARED_BUFFERS; + +static AOM_INLINE void alloc_mode_context(AV1_COMMON *cm, int num_pix, + PICK_MODE_CONTEXT *ctx, + PC_TREE_SHARED_BUFFERS *shared_bufs) { + const int num_planes = av1_num_planes(cm); + int i; + const int num_blk = num_pix / 16; + ctx->num_4x4_blk = num_blk; + + CHECK_MEM_ERROR(cm, ctx->blk_skip, + aom_calloc(num_blk, sizeof(*ctx->blk_skip))); + CHECK_MEM_ERROR(cm, ctx->tx_type_map, + aom_calloc(num_blk, sizeof(*ctx->tx_type_map))); + for (i = 0; i < num_planes; ++i) { + ctx->coeff[i] = shared_bufs->coeff_buf[i]; + ctx->qcoeff[i] = shared_bufs->qcoeff_buf[i]; + ctx->dqcoeff[i] = shared_bufs->dqcoeff_buf[i]; + CHECK_MEM_ERROR(cm, ctx->eobs[i], + aom_memalign(32, num_blk * sizeof(*ctx->eobs[i]))); + CHECK_MEM_ERROR( + cm, ctx->txb_entropy_ctx[i], + aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i]))); + } + + if (num_pix <= MAX_PALETTE_SQUARE) { + for (i = 0; i < 2; ++i) { + CHECK_MEM_ERROR( + cm, ctx->color_index_map[i], + aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i]))); + } + } +} + +static AOM_INLINE void free_mode_context(PICK_MODE_CONTEXT *ctx, + const int num_planes) { + int i; + aom_free(ctx->blk_skip); + ctx->blk_skip = 0; + aom_free(ctx->tx_type_map); + ctx->tx_type_map = 0; + for (i = 0; i < num_planes; ++i) { + ctx->coeff[i] = 0; + ctx->qcoeff[i] = 0; + ctx->dqcoeff[i] = 0; + aom_free(ctx->eobs[i]); + ctx->eobs[i] = 0; + aom_free(ctx->txb_entropy_ctx[i]); + ctx->txb_entropy_ctx[i] = 0; + } + + for (i = 0; i < 2; ++i) { + aom_free(ctx->color_index_map[i]); + ctx->color_index_map[i] = 0; + } +} + +static AOM_INLINE void alloc_tree_contexts( + AV1_COMMON *cm, PC_TREE *tree, int num_pix, int is_leaf, + PC_TREE_SHARED_BUFFERS *shared_bufs) { + alloc_mode_context(cm, num_pix, &tree->none, shared_bufs); + + if (is_leaf) return; + + alloc_mode_context(cm, num_pix / 2, &tree->horizontal[0], shared_bufs); + alloc_mode_context(cm, num_pix / 2, &tree->vertical[0], shared_bufs); + + alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1], shared_bufs); + alloc_mode_context(cm, num_pix / 2, &tree->vertical[1], shared_bufs); + + alloc_mode_context(cm, num_pix / 4, &tree->horizontala[0], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->horizontala[1], shared_bufs); + alloc_mode_context(cm, num_pix / 2, &tree->horizontala[2], shared_bufs); + + alloc_mode_context(cm, num_pix / 2, &tree->horizontalb[0], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[1], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[2], shared_bufs); + + alloc_mode_context(cm, num_pix / 4, &tree->verticala[0], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->verticala[1], shared_bufs); + alloc_mode_context(cm, num_pix / 2, &tree->verticala[2], shared_bufs); + + alloc_mode_context(cm, num_pix / 2, &tree->verticalb[0], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->verticalb[1], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->verticalb[2], shared_bufs); + + for (int i = 0; i < 4; ++i) { + alloc_mode_context(cm, num_pix / 4, &tree->horizontal4[i], shared_bufs); + alloc_mode_context(cm, num_pix / 4, &tree->vertical4[i], shared_bufs); + } +} 
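+
+// For example (the sizes follow directly from alloc_tree_contexts() above),
+// a non-leaf 64x64 node with num_pix = 4096 allocates:
+//   none:                          4096 pixels (the full block)
+//   horizontal[2] / vertical[2]:   2048 pixels each (the two halves)
+//   horizontala/b, verticala/b:    one half plus two quarters
+//                                  (2048 + 1024 + 1024 pixels)
+//   horizontal4[4] / vertical4[4]: 1024 pixels each (64x16 / 16x64 strips)
+// Each context in turn sizes its per-4x4 arrays as num_pix / 16 entries
+// (see alloc_mode_context() above).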
+
+static AOM_INLINE void free_tree_contexts(PC_TREE *tree, const int num_planes) {
+  int i;
+  for (i = 0; i < 3; i++) {
+    free_mode_context(&tree->horizontala[i], num_planes);
+    free_mode_context(&tree->horizontalb[i], num_planes);
+    free_mode_context(&tree->verticala[i], num_planes);
+    free_mode_context(&tree->verticalb[i], num_planes);
+  }
+  for (i = 0; i < 4; ++i) {
+    free_mode_context(&tree->horizontal4[i], num_planes);
+    free_mode_context(&tree->vertical4[i], num_planes);
+  }
+  free_mode_context(&tree->none, num_planes);
+  free_mode_context(&tree->horizontal[0], num_planes);
+  free_mode_context(&tree->horizontal[1], num_planes);
+  free_mode_context(&tree->vertical[0], num_planes);
+  free_mode_context(&tree->vertical[1], num_planes);
+}
+
+// Computes the number of pc_tree nodes to be allocated or freed, as
+// determined by the superblock size (BLOCK_128X128 or BLOCK_64X64).
+static AOM_INLINE int get_pc_tree_nodes(const int is_sb_size_128,
+                                        int stat_generation_stage) {
+  const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
+  const int tree_nodes =
+      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+  return tree_nodes;
+}
+
+// Sets up a tree of contexts. At each square partition level there are
+// contexts for none, horizontal, vertical, and split, along with a
+// block_size value and a selected block_size that represents the state of
+// the search.
+void av1_setup_pc_tree(AV1_COMP *const cpi, ThreadData *td) {
+  AV1_COMMON *const cm = &cpi->common;
+  int i, j, stat_generation_stage = is_stat_generation_stage(cpi);
+  const int is_sb_size_128 = cm->seq_params.sb_size == BLOCK_128X128;
+  const int tree_nodes =
+      get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+  int pc_tree_index = 0;
+  PC_TREE *this_pc;
+  PC_TREE_SHARED_BUFFERS shared_bufs;
+  int square_index = 1;
+  int nodes;
+
+  aom_free(td->pc_tree);
+  CHECK_MEM_ERROR(cm, td->pc_tree,
+                  aom_calloc(tree_nodes, sizeof(*td->pc_tree)));
+  this_pc = &td->pc_tree[0];
+
+  for (i = 0; i < 3; i++) {
+    const int max_num_pix = MAX_SB_SIZE * MAX_SB_SIZE;
+    CHECK_MEM_ERROR(cm, td->tree_coeff_buf[i],
+                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    CHECK_MEM_ERROR(cm, td->tree_qcoeff_buf[i],
+                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    CHECK_MEM_ERROR(cm, td->tree_dqcoeff_buf[i],
+                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    shared_bufs.coeff_buf[i] = td->tree_coeff_buf[i];
+    shared_bufs.qcoeff_buf[i] = td->tree_qcoeff_buf[i];
+    shared_bufs.dqcoeff_buf[i] = td->tree_dqcoeff_buf[i];
+  }
+
+  if (!stat_generation_stage) {
+    const int leaf_factor = is_sb_size_128 ? 4 : 1;
+    const int leaf_nodes = 256 * leaf_factor;
+
+    // Sets up all the leaf nodes in the tree.
+    for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
+      PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+      tree->block_size = square[0];
+      alloc_tree_contexts(cm, tree, 16, 1, &shared_bufs);
+    }
+
+    // Each node has 4 leaf nodes; fill each block_size level of the tree
+    // from the leaves to the root.
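+    // For example, a 64x64 superblock gives 256 + 64 + 16 + 4 + 1 = 341
+    // nodes (the sum of 4^k for k = 0..4), and a 128x128 superblock adds
+    // one more leaf level of 4^5 = 1024 nodes for 1365 in total, matching
+    // get_pc_tree_nodes() above.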
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) { + for (i = 0; i < nodes; ++i) { + PC_TREE *const tree = &td->pc_tree[pc_tree_index]; + alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0, + &shared_bufs); + tree->block_size = square[square_index]; + for (j = 0; j < 4; j++) tree->split[j] = this_pc++; + ++pc_tree_index; + } + ++square_index; + } + } else { + // Allocation for firstpass/LAP stage + // TODO(Mufaddal): refactor square_index to use a common block_size macro + // from firstpass.c + PC_TREE *const tree = &td->pc_tree[pc_tree_index]; + square_index = 2; + alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 1, &shared_bufs); + tree->block_size = square[square_index]; + } + + // Set up the root node for the applicable superblock size + td->pc_root = &td->pc_tree[tree_nodes - 1]; +#if CONFIG_INTERNAL_STATS + td->pc_root->none.best_mode_index = THR_INVALID; +#endif // CONFIG_INTERNAL_STATS +} + +void av1_free_pc_tree(const AV1_COMP *const cpi, ThreadData *td, + const int num_planes, BLOCK_SIZE sb_size) { + int stat_generation_stage = is_stat_generation_stage(cpi); + if (td->pc_tree != NULL) { + const int is_sb_size_128 = sb_size == BLOCK_128X128; + const int tree_nodes = + get_pc_tree_nodes(is_sb_size_128, stat_generation_stage); + for (int i = 0; i < tree_nodes; ++i) { + free_tree_contexts(&td->pc_tree[i], num_planes); + } + for (int i = 0; i < 3; ++i) { + aom_free(td->tree_coeff_buf[i]); + aom_free(td->tree_qcoeff_buf[i]); + aom_free(td->tree_dqcoeff_buf[i]); + td->tree_coeff_buf[i] = NULL; + td->tree_qcoeff_buf[i] = NULL; + td->tree_dqcoeff_buf[i] = NULL; + } + aom_free(td->pc_tree); + td->pc_tree = NULL; + } +} + +void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, + PICK_MODE_CONTEXT *src_ctx) { + dst_ctx->mic = src_ctx->mic; + dst_ctx->mbmi_ext_best = src_ctx->mbmi_ext_best; + + dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk; + dst_ctx->skippable = src_ctx->skippable; +#if CONFIG_INTERNAL_STATS + dst_ctx->best_mode_index = src_ctx->best_mode_index; +#endif // CONFIG_INTERNAL_STATS + + memcpy(dst_ctx->blk_skip, src_ctx->blk_skip, + sizeof(uint8_t) * src_ctx->num_4x4_blk); + av1_copy_array(dst_ctx->tx_type_map, src_ctx->tx_type_map, + src_ctx->num_4x4_blk); + + dst_ctx->hybrid_pred_diff = src_ctx->hybrid_pred_diff; + dst_ctx->comp_pred_diff = src_ctx->comp_pred_diff; + dst_ctx->single_pred_diff = src_ctx->single_pred_diff; + + dst_ctx->rd_stats = src_ctx->rd_stats; + dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready; + + memcpy(dst_ctx->pred_mv, src_ctx->pred_mv, sizeof(MV) * REF_FRAMES); + + dst_ctx->partition = src_ctx->partition; +} diff --git a/libs/libaom/src/av1/encoder/context_tree.h b/libs/libaom/src/av1/encoder/context_tree.h new file mode 100644 index 000000000..a39979413 --- /dev/null +++ b/libs/libaom/src/av1/encoder/context_tree.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_ +#define AOM_AV1_ENCODER_CONTEXT_TREE_H_ + +#include "config/aom_config.h" + +#include "av1/common/blockd.h" +#include "av1/encoder/block.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct AV1Common; +struct ThreadData; + +// Structure to hold snapshot of coding context during the mode picking process +typedef struct { + MB_MODE_INFO mic; + MB_MODE_INFO_EXT_FRAME mbmi_ext_best; + uint8_t *color_index_map[2]; + uint8_t *blk_skip; + + tran_low_t *coeff[MAX_MB_PLANE]; + tran_low_t *qcoeff[MAX_MB_PLANE]; + tran_low_t *dqcoeff[MAX_MB_PLANE]; + uint16_t *eobs[MAX_MB_PLANE]; + uint8_t *txb_entropy_ctx[MAX_MB_PLANE]; + uint8_t *tx_type_map; + + int num_4x4_blk; + // For current partition, only if all Y, U, and V transform blocks' + // coefficients are quantized to 0, skippable is set to 1. + int skippable; +#if CONFIG_INTERNAL_STATS + THR_MODES best_mode_index; +#endif // CONFIG_INTERNAL_STATS + int hybrid_pred_diff; + int comp_pred_diff; + int single_pred_diff; + + RD_STATS rd_stats; + + int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has + // been made. + + // motion vector cache for adaptive motion search control in partition + // search loop + MV pred_mv[REF_FRAMES]; + PARTITION_TYPE partition; +} PICK_MODE_CONTEXT; + +typedef struct PC_TREE { + PARTITION_TYPE partitioning; + BLOCK_SIZE block_size; + PICK_MODE_CONTEXT none; + PICK_MODE_CONTEXT horizontal[2]; + PICK_MODE_CONTEXT vertical[2]; + PICK_MODE_CONTEXT horizontala[3]; + PICK_MODE_CONTEXT horizontalb[3]; + PICK_MODE_CONTEXT verticala[3]; + PICK_MODE_CONTEXT verticalb[3]; + PICK_MODE_CONTEXT horizontal4[4]; + PICK_MODE_CONTEXT vertical4[4]; + struct PC_TREE *split[4]; + int index; + + // Simple motion search_features + FULLPEL_MV start_mvs[REF_FRAMES]; + unsigned int sms_none_feat[2]; + unsigned int sms_rect_feat[8]; + int sms_none_valid; + int sms_rect_valid; +} PC_TREE; + +void av1_setup_pc_tree(struct AV1_COMP *const cpi, struct ThreadData *td); +void av1_free_pc_tree(const struct AV1_COMP *const cpi, struct ThreadData *td, + const int num_planes, BLOCK_SIZE sb_size); +void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx, + PICK_MODE_CONTEXT *src_ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_ diff --git a/libs/libaom/src/av1/encoder/corner_detect.c b/libs/libaom/src/av1/encoder/corner_detect.c new file mode 100644 index 000000000..597bb30fc --- /dev/null +++ b/libs/libaom/src/av1/encoder/corner_detect.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "third_party/fastfeat/fast.h"
+
+#include "av1/encoder/corner_detect.h"
+
+// Fast_9 wrapper
+#define FAST_BARRIER 18
+int av1_fast_corner_detect(unsigned char *buf, int width, int height,
+                           int stride, int *points, int max_points) {
+  int num_points;
+  xy *const frm_corners_xy = aom_fast9_detect_nonmax(buf, width, height, stride,
+                                                     FAST_BARRIER, &num_points);
+  num_points = (num_points <= max_points ? num_points : max_points);
+  if (num_points > 0 && frm_corners_xy) {
+    memcpy(points, frm_corners_xy, sizeof(*frm_corners_xy) * num_points);
+    free(frm_corners_xy);
+    return num_points;
+  }
+  free(frm_corners_xy);
+  return 0;
+}
diff --git a/libs/libaom/src/av1/encoder/corner_detect.h b/libs/libaom/src/av1/encoder/corner_detect.h
new file mode 100644
index 000000000..15062f265
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/corner_detect.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CORNER_DETECT_H_
+#define AOM_AV1_ENCODER_CORNER_DETECT_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+int av1_fast_corner_detect(unsigned char *buf, int width, int height,
+                           int stride, int *points, int max_points);
+
+#endif  // AOM_AV1_ENCODER_CORNER_DETECT_H_
diff --git a/libs/libaom/src/av1/encoder/corner_match.c b/libs/libaom/src/av1/encoder/corner_match.c
new file mode 100644
index 000000000..12f633b4f
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/corner_match.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/system_state.h"
+#include "av1/encoder/corner_match.h"
+
+#define SEARCH_SZ 9
+#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2)
+
+#define THRESHOLD_NCC 0.75
+
+/* Compute var(im) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of im,
+   centered at (x, y).
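+
+   In effect, with N = MATCH_SZ_SQ samples x_i this returns
+     N * sum(x_i^2) - (sum(x_i))^2 = N^2 * var(x),
+   kept in integer form so that no division is needed. Its square root is
+   what av1_determine_correspondence() below multiplies with THRESHOLD_NCC,
+   compensating for the partial normalization in
+   av1_compute_cross_correlation().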
+*/ +static double compute_variance(unsigned char *im, int stride, int x, int y) { + int sum = 0; + int sumsq = 0; + int var; + int i, j; + for (i = 0; i < MATCH_SZ; ++i) + for (j = 0; j < MATCH_SZ; ++j) { + sum += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; + sumsq += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] * + im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; + } + var = sumsq * MATCH_SZ_SQ - sum * sum; + return (double)var; +} + +/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the + correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows + of each image, centered at (x1, y1) and (x2, y2) respectively. +*/ +double av1_compute_cross_correlation_c(unsigned char *im1, int stride1, int x1, + int y1, unsigned char *im2, int stride2, + int x2, int y2) { + int v1, v2; + int sum1 = 0; + int sum2 = 0; + int sumsq2 = 0; + int cross = 0; + int var2, cov; + int i, j; + for (i = 0; i < MATCH_SZ; ++i) + for (j = 0; j < MATCH_SZ; ++j) { + v1 = im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)]; + v2 = im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)]; + sum1 += v1; + sum2 += v2; + sumsq2 += v2 * v2; + cross += v1 * v2; + } + var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; + cov = cross * MATCH_SZ_SQ - sum1 * sum2; + aom_clear_system_state(); + return cov / sqrt((double)var2); +} + +static int is_eligible_point(int pointx, int pointy, int width, int height) { + return (pointx >= MATCH_SZ_BY2 && pointy >= MATCH_SZ_BY2 && + pointx + MATCH_SZ_BY2 < width && pointy + MATCH_SZ_BY2 < height); +} + +static int is_eligible_distance(int point1x, int point1y, int point2x, + int point2y, int width, int height) { + const int thresh = (width < height ? height : width) >> 4; + return ((point1x - point2x) * (point1x - point2x) + + (point1y - point2y) * (point1y - point2y)) <= thresh * thresh; +} + +static void improve_correspondence(unsigned char *frm, unsigned char *ref, + int width, int height, int frm_stride, + int ref_stride, + Correspondence *correspondences, + int num_correspondences) { + int i; + for (i = 0; i < num_correspondences; ++i) { + int x, y, best_x = 0, best_y = 0; + double best_match_ncc = 0.0; + for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) { + for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { + double match_ncc; + if (!is_eligible_point(correspondences[i].rx + x, + correspondences[i].ry + y, width, height)) + continue; + if (!is_eligible_distance(correspondences[i].x, correspondences[i].y, + correspondences[i].rx + x, + correspondences[i].ry + y, width, height)) + continue; + match_ncc = av1_compute_cross_correlation( + frm, frm_stride, correspondences[i].x, correspondences[i].y, ref, + ref_stride, correspondences[i].rx + x, correspondences[i].ry + y); + if (match_ncc > best_match_ncc) { + best_match_ncc = match_ncc; + best_y = y; + best_x = x; + } + } + } + correspondences[i].rx += best_x; + correspondences[i].ry += best_y; + } + for (i = 0; i < num_correspondences; ++i) { + int x, y, best_x = 0, best_y = 0; + double best_match_ncc = 0.0; + for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) + for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { + double match_ncc; + if (!is_eligible_point(correspondences[i].x + x, + correspondences[i].y + y, width, height)) + continue; + if (!is_eligible_distance( + correspondences[i].x + x, correspondences[i].y + y, + correspondences[i].rx, correspondences[i].ry, width, height)) + continue; + match_ncc = av1_compute_cross_correlation( + ref, ref_stride, 
correspondences[i].rx, correspondences[i].ry, frm, + frm_stride, correspondences[i].x + x, correspondences[i].y + y); + if (match_ncc > best_match_ncc) { + best_match_ncc = match_ncc; + best_y = y; + best_x = x; + } + } + correspondences[i].x += best_x; + correspondences[i].y += best_y; + } +} + +int av1_determine_correspondence(unsigned char *frm, int *frm_corners, + int num_frm_corners, unsigned char *ref, + int *ref_corners, int num_ref_corners, + int width, int height, int frm_stride, + int ref_stride, int *correspondence_pts) { + // TODO(sarahparker) Improve this to include 2-way match + int i, j; + Correspondence *correspondences = (Correspondence *)correspondence_pts; + int num_correspondences = 0; + for (i = 0; i < num_frm_corners; ++i) { + double best_match_ncc = 0.0; + double template_norm; + int best_match_j = -1; + if (!is_eligible_point(frm_corners[2 * i], frm_corners[2 * i + 1], width, + height)) + continue; + for (j = 0; j < num_ref_corners; ++j) { + double match_ncc; + if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width, + height)) + continue; + if (!is_eligible_distance(frm_corners[2 * i], frm_corners[2 * i + 1], + ref_corners[2 * j], ref_corners[2 * j + 1], + width, height)) + continue; + match_ncc = av1_compute_cross_correlation( + frm, frm_stride, frm_corners[2 * i], frm_corners[2 * i + 1], ref, + ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]); + if (match_ncc > best_match_ncc) { + best_match_ncc = match_ncc; + best_match_j = j; + } + } + // Note: We want to test if the best correlation is >= THRESHOLD_NCC, + // but need to account for the normalization in + // av1_compute_cross_correlation. + template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i], + frm_corners[2 * i + 1]); + if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) { + correspondences[num_correspondences].x = frm_corners[2 * i]; + correspondences[num_correspondences].y = frm_corners[2 * i + 1]; + correspondences[num_correspondences].rx = ref_corners[2 * best_match_j]; + correspondences[num_correspondences].ry = + ref_corners[2 * best_match_j + 1]; + num_correspondences++; + } + } + improve_correspondence(frm, ref, width, height, frm_stride, ref_stride, + correspondences, num_correspondences); + return num_correspondences; +} diff --git a/libs/libaom/src/av1/encoder/corner_match.h b/libs/libaom/src/av1/encoder/corner_match.h new file mode 100644 index 000000000..3cf6de159 --- /dev/null +++ b/libs/libaom/src/av1/encoder/corner_match.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_AV1_ENCODER_CORNER_MATCH_H_
+#define AOM_AV1_ENCODER_CORNER_MATCH_H_
+
+#include <stdio.h>
+#include <memory.h>
+#include <stdlib.h>
+
+#define MATCH_SZ 13
+#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2)
+#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)
+
+typedef struct {
+  int x, y;
+  int rx, ry;
+} Correspondence;
+
+int av1_determine_correspondence(unsigned char *frm, int *frm_corners,
+                                 int num_frm_corners, unsigned char *ref,
+                                 int *ref_corners, int num_ref_corners,
+                                 int width, int height, int frm_stride,
+                                 int ref_stride, int *correspondence_pts);
+
+#endif  // AOM_AV1_ENCODER_CORNER_MATCH_H_
diff --git a/libs/libaom/src/av1/encoder/cost.c b/libs/libaom/src/av1/encoder/cost.c
new file mode 100644
index 000000000..323e2aed5
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/cost.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/common/entropy.h"
+
+// round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)); i = 128~255.
+const uint16_t av1_prob_cost[128] = {
+  512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435,
+  430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361,
+  356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294,
+  289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232,
+  228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175,
+  171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122,
+  119, 115, 112, 109, 105, 102, 99,  95,  92,  89,  86,  82,  79,  76,  73,
+  70,  66,  63,  60,  57,  54,  51,  48,  45,  42,  38,  35,  32,  29,  26,
+  23,  20,  18,  15,  12,  9,   6,   3,
+};
+
+void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+                              const int *inv_map) {
+  int i;
+  aom_cdf_prob prev_cdf = 0;
+  for (i = 0;; ++i) {
+    // Probability of symbol i in units of 1/2^15, from the CDF difference.
+    aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf;
+    p15 = (p15 < EC_MIN_PROB) ? EC_MIN_PROB : p15;
+    prev_cdf = AOM_ICDF(cdf[i]);
+
+    if (inv_map)
+      costs[inv_map[i]] = av1_cost_symbol(p15);
+    else
+      costs[i] = av1_cost_symbol(p15);
+
+    // Stop once we reach the end of the CDF
+    if (cdf[i] == AOM_ICDF(CDF_PROB_TOP)) break;
+  }
+}
diff --git a/libs/libaom/src/av1/encoder/cost.h b/libs/libaom/src/av1/encoder/cost.h
new file mode 100644
index 000000000..be0241a82
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/cost.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_ENCODER_COST_H_ +#define AOM_AV1_ENCODER_COST_H_ + +#include "aom_dsp/prob.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +extern const uint16_t av1_prob_cost[128]; + +// The factor to scale from cost in bits to cost in av1_prob_cost units. +#define AV1_PROB_COST_SHIFT 9 + +// Cost of coding an n bit literal, using 128 (i.e. 50%) probability +// for each bit. +#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT)) + +// Calculate the cost of a symbol with probability p15 / 2^15 +static INLINE int av1_cost_symbol(aom_cdf_prob p15) { + // p15 can be out of range [1, CDF_PROB_TOP - 1]. Clamping it, so that the + // following cost calculation works correctly. Otherwise, if p15 = + // CDF_PROB_TOP, shift would be -1, and "p15 << shift" would be wrong. + p15 = (aom_cdf_prob)clamp(p15, 1, CDF_PROB_TOP - 1); + assert(0 < p15 && p15 < CDF_PROB_TOP); + const int shift = CDF_PROB_BITS - 1 - get_msb(p15); + const int prob = get_prob(p15 << shift, CDF_PROB_TOP); + assert(prob >= 128); + return av1_prob_cost[prob - 128] + av1_cost_literal(shift); +} + +void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf, + const int *inv_map); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_COST_H_ diff --git a/libs/libaom/src/av1/encoder/dwt.c b/libs/libaom/src/av1/encoder/dwt.c new file mode 100644 index 000000000..04088b25f --- /dev/null +++ b/libs/libaom/src/av1/encoder/dwt.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/dwt.h"
+
+// Note: block length must be even for this implementation
+static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass,
+                            tran_low_t *highpass) {
+  int n;
+  tran_low_t r, *a, *b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  // (5,3) lifting, predict step: highpass = odd sample minus the rounded
+  // mean of its even neighbors; the lowpass lane temporarily holds 2x the
+  // even samples.
+  while (--n) {
+    *a++ = (r = *x++) * 2;
+    *b++ = *x - ((r + x[1] + 1) >> 1);
+    x++;
+  }
+  *a = (r = *x++) * 2;
+  *b = *x - r;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  // Update step: fold the rounded mean of adjacent highpass samples back
+  // into the lowpass lane.
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass,
+                            tran_low_t *highpass) {
+  int n;
+  tran_low_t r, *a, *b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *a++ = (r = *x++);
+    *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2;
+    x++;
+  }
+  *a = (r = *x++);
+  *b = (*x - r + 1) >> 1;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
+                                          uint8_t *x, int pitch_x,
+                                          tran_low_t *c, int pitch_c,
+                                          int dwt_scale_bits, int hbd) {
+  int lv, i, j, nh, nw, hh = height, hw = width;
+  tran_low_t buffer[2 * DWT_MAX_LENGTH];
+
+  if (hbd) {
+    uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j++) {
+        c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
+      }
+    }
+  } else {
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j++) {
+        c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits;
+      }
+    }
+  }
+
+  for (lv = 0; lv < levels; lv++) {
+    nh = hh;
+    hh = (hh + 1) >> 1;
+    nw = hw;
+    hw = (hw + 1) >> 1;
+    if ((nh < 2) || (nw < 2)) return;
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
+      analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+    }
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++) buffer[i + nh] = c[i * pitch_c + j];
+      analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
+      for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i];
+    }
+  }
+}
+
+void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
+                               int hbd) {
+  dyadic_analyze_53_uint8_input(4, 8, 8, input, stride, output, 8, 2, hbd);
+}
+
+int av1_haar_ac_sad(tran_low_t *output, int bw, int bh, int stride) {
+  int acsad = 0;
+
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) {
+      if (r >= bh / 2 || c >= bw / 2) acsad += abs(output[r * stride + c]);
+    }
+  return acsad;
+}
+
+uint64_t av1_dct_ac_sad(tran_low_t *output, int bw, int bh, int stride) {
+  uint64_t acsad = 0;
+
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) {
+      if (r > 0 || c > 0) acsad += abs(output[r * stride + c]);
+    }
+
+  return acsad;
+}
+
+uint32_t av1_variance(uint8_t *input, int bw, int bh, int stride) {
+  int sum = 0;
+  uint32_t sse = 0;
+
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) {
+      sum += input[r * stride + c];
+      sse += input[r * stride + c] * input[r * stride + c];
+    }
+  return sse - (uint32_t)(((int64_t)sum * sum) / (bw * bh));
+}
+
+int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd) {
+  tran_low_t output[64];
+
+  av1_fdwt8x8_uint8_input_c(input, output, stride, hbd);
+  return av1_haar_ac_sad(output, 8, 8, 8);
+}
diff --git a/libs/libaom/src/av1/encoder/dwt.h b/libs/libaom/src/av1/encoder/dwt.h
new file mode 100644
index 000000000..37306c6a5
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/dwt.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_DWT_H_ +#define AOM_AV1_ENCODER_DWT_H_ + +#include "av1/common/common.h" +#include "av1/common/enums.h" + +#define DWT_MAX_LENGTH 64 + +void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride); +void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride, + int hbd); +int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd); + +#endif // AOM_AV1_ENCODER_DWT_H_ diff --git a/libs/libaom/src/av1/encoder/enc_enums.h b/libs/libaom/src/av1/encoder/enc_enums.h new file mode 100644 index 000000000..5a0651483 --- /dev/null +++ b/libs/libaom/src/av1/encoder/enc_enums.h @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENC_ENUMS_H_ +#define AOM_AV1_ENCODER_ENC_ENUMS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// This enumerator type needs to be kept aligned with the mode order in +// const MODE_DEFINITION av1_mode_defs[MAX_MODES] used in the rd code. 
+enum { + THR_NEARESTMV, + THR_NEARESTL2, + THR_NEARESTL3, + THR_NEARESTB, + THR_NEARESTA2, + THR_NEARESTA, + THR_NEARESTG, + + THR_NEWMV, + THR_NEWL2, + THR_NEWL3, + THR_NEWB, + THR_NEWA2, + THR_NEWA, + THR_NEWG, + + THR_NEARMV, + THR_NEARL2, + THR_NEARL3, + THR_NEARB, + THR_NEARA2, + THR_NEARA, + THR_NEARG, + + THR_GLOBALMV, + THR_GLOBALL2, + THR_GLOBALL3, + THR_GLOBALB, + THR_GLOBALA2, + THR_GLOBALA, + THR_GLOBALG, + + THR_COMP_NEAREST_NEARESTLA, + THR_COMP_NEAREST_NEARESTL2A, + THR_COMP_NEAREST_NEARESTL3A, + THR_COMP_NEAREST_NEARESTGA, + THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTGB, + THR_COMP_NEAREST_NEARESTLA2, + THR_COMP_NEAREST_NEARESTL2A2, + THR_COMP_NEAREST_NEARESTL3A2, + THR_COMP_NEAREST_NEARESTGA2, + THR_COMP_NEAREST_NEARESTLL2, + THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, + THR_COMP_NEAREST_NEARESTBA, + + THR_COMP_NEAR_NEARLA, + THR_COMP_NEW_NEARESTLA, + THR_COMP_NEAREST_NEWLA, + THR_COMP_NEW_NEARLA, + THR_COMP_NEAR_NEWLA, + THR_COMP_NEW_NEWLA, + THR_COMP_GLOBAL_GLOBALLA, + + THR_COMP_NEAR_NEARL2A, + THR_COMP_NEW_NEARESTL2A, + THR_COMP_NEAREST_NEWL2A, + THR_COMP_NEW_NEARL2A, + THR_COMP_NEAR_NEWL2A, + THR_COMP_NEW_NEWL2A, + THR_COMP_GLOBAL_GLOBALL2A, + + THR_COMP_NEAR_NEARL3A, + THR_COMP_NEW_NEARESTL3A, + THR_COMP_NEAREST_NEWL3A, + THR_COMP_NEW_NEARL3A, + THR_COMP_NEAR_NEWL3A, + THR_COMP_NEW_NEWL3A, + THR_COMP_GLOBAL_GLOBALL3A, + + THR_COMP_NEAR_NEARGA, + THR_COMP_NEW_NEARESTGA, + THR_COMP_NEAREST_NEWGA, + THR_COMP_NEW_NEARGA, + THR_COMP_NEAR_NEWGA, + THR_COMP_NEW_NEWGA, + THR_COMP_GLOBAL_GLOBALGA, + + THR_COMP_NEAR_NEARLB, + THR_COMP_NEW_NEARESTLB, + THR_COMP_NEAREST_NEWLB, + THR_COMP_NEW_NEARLB, + THR_COMP_NEAR_NEWLB, + THR_COMP_NEW_NEWLB, + THR_COMP_GLOBAL_GLOBALLB, + + THR_COMP_NEAR_NEARL2B, + THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEW_NEARL2B, + THR_COMP_NEAR_NEWL2B, + THR_COMP_NEW_NEWL2B, + THR_COMP_GLOBAL_GLOBALL2B, + + THR_COMP_NEAR_NEARL3B, + THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEW_NEARL3B, + THR_COMP_NEAR_NEWL3B, + THR_COMP_NEW_NEWL3B, + THR_COMP_GLOBAL_GLOBALL3B, + + THR_COMP_NEAR_NEARGB, + THR_COMP_NEW_NEARESTGB, + THR_COMP_NEAREST_NEWGB, + THR_COMP_NEW_NEARGB, + THR_COMP_NEAR_NEWGB, + THR_COMP_NEW_NEWGB, + THR_COMP_GLOBAL_GLOBALGB, + + THR_COMP_NEAR_NEARLA2, + THR_COMP_NEW_NEARESTLA2, + THR_COMP_NEAREST_NEWLA2, + THR_COMP_NEW_NEARLA2, + THR_COMP_NEAR_NEWLA2, + THR_COMP_NEW_NEWLA2, + THR_COMP_GLOBAL_GLOBALLA2, + + THR_COMP_NEAR_NEARL2A2, + THR_COMP_NEW_NEARESTL2A2, + THR_COMP_NEAREST_NEWL2A2, + THR_COMP_NEW_NEARL2A2, + THR_COMP_NEAR_NEWL2A2, + THR_COMP_NEW_NEWL2A2, + THR_COMP_GLOBAL_GLOBALL2A2, + + THR_COMP_NEAR_NEARL3A2, + THR_COMP_NEW_NEARESTL3A2, + THR_COMP_NEAREST_NEWL3A2, + THR_COMP_NEW_NEARL3A2, + THR_COMP_NEAR_NEWL3A2, + THR_COMP_NEW_NEWL3A2, + THR_COMP_GLOBAL_GLOBALL3A2, + + THR_COMP_NEAR_NEARGA2, + THR_COMP_NEW_NEARESTGA2, + THR_COMP_NEAREST_NEWGA2, + THR_COMP_NEW_NEARGA2, + THR_COMP_NEAR_NEWGA2, + THR_COMP_NEW_NEWGA2, + THR_COMP_GLOBAL_GLOBALGA2, + + THR_COMP_NEAR_NEARLL2, + THR_COMP_NEW_NEARESTLL2, + THR_COMP_NEAREST_NEWLL2, + THR_COMP_NEW_NEARLL2, + THR_COMP_NEAR_NEWLL2, + THR_COMP_NEW_NEWLL2, + THR_COMP_GLOBAL_GLOBALLL2, + + THR_COMP_NEAR_NEARLL3, + THR_COMP_NEW_NEARESTLL3, + THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEW_NEARLL3, + THR_COMP_NEAR_NEWLL3, + THR_COMP_NEW_NEWLL3, + THR_COMP_GLOBAL_GLOBALLL3, + + THR_COMP_NEAR_NEARLG, + THR_COMP_NEW_NEARESTLG, + THR_COMP_NEAREST_NEWLG, + THR_COMP_NEW_NEARLG, + 
THR_COMP_NEAR_NEWLG,
+  THR_COMP_NEW_NEWLG,
+  THR_COMP_GLOBAL_GLOBALLG,
+
+  THR_COMP_NEAR_NEARBA,
+  THR_COMP_NEW_NEARESTBA,
+  THR_COMP_NEAREST_NEWBA,
+  THR_COMP_NEW_NEARBA,
+  THR_COMP_NEAR_NEWBA,
+  THR_COMP_NEW_NEWBA,
+  THR_COMP_GLOBAL_GLOBALBA,
+
+  THR_DC,
+  THR_PAETH,
+  THR_SMOOTH,
+  THR_SMOOTH_V,
+  THR_SMOOTH_H,
+  THR_H_PRED,
+  THR_V_PRED,
+  THR_D135_PRED,
+  THR_D203_PRED,
+  THR_D157_PRED,
+  THR_D67_PRED,
+  THR_D113_PRED,
+  THR_D45_PRED,
+
+  MAX_MODES,
+  SINGLE_REF_MODE_START = THR_NEARESTMV,
+  SINGLE_REF_MODE_END = THR_COMP_NEAREST_NEARESTLA,
+  NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START,
+  THR_MODE_START = THR_NEARESTMV,
+  THR_MODE_END = MAX_MODES,
+  THR_INVALID = 255
+} UENUM1BYTE(THR_MODES);
+
+enum {
+  THR_LAST,
+  THR_LAST2,
+  THR_LAST3,
+  THR_BWDR,
+  THR_ALTR2,
+  THR_GOLD,
+  THR_ALTR,
+
+  THR_COMP_LA,
+  THR_COMP_L2A,
+  THR_COMP_L3A,
+  THR_COMP_GA,
+
+  THR_COMP_LB,
+  THR_COMP_L2B,
+  THR_COMP_L3B,
+  THR_COMP_GB,
+
+  THR_COMP_LA2,
+  THR_COMP_L2A2,
+  THR_COMP_L3A2,
+  THR_COMP_GA2,
+
+  THR_INTRA,
+
+  MAX_REFS
+} UENUM1BYTE(THR_MODES_SUB8X8);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENC_ENUMS_H_
diff --git a/libs/libaom/src/av1/encoder/encode_strategy.c b/libs/libaom/src/av1/encoder/encode_strategy.c
new file mode 100644
index 000000000..8eb73d8d3
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/encode_strategy.c
@@ -0,0 +1,1322 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/tpl_model.h"
+
+#define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1)
+
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  EncodeFrameParams *const frame_params,
+                                  const FRAME_UPDATE_TYPE type,
+                                  int force_refresh_all) {
+  // NOTE(weitinglin): Should we define another function to take care of
+  // cpi->rc.is_$Source_Type to make this function as it is in the comment?
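+  //
+  // Summary of the switch below (refresh_golden / refresh_bwd / refresh_alt):
+  //   KF_UPDATE:            1 1 1      LF_UPDATE:        0 0 0
+  //   GF_UPDATE:            1 0 0      OVERLAY_UPDATE:   1 0 0 (*)
+  //   ARF_UPDATE:           0 0 1      INTNL_ARF_UPDATE: 0 1 0
+  //   INTNL_OVERLAY_UPDATE: 0 0 0 (*)
+  //   (*) also sets cpi->rc.is_src_frame_alt_ref = 1.
+  // External refresh flags and force_refresh_all may override the result.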
+
+  const ExternalFlags *const ext_flags = &cpi->ext_flags;
+  cpi->rc.is_src_frame_alt_ref = 0;
+
+  switch (type) {
+    case KF_UPDATE:
+      frame_params->refresh_golden_frame = 1;
+      frame_params->refresh_bwd_ref_frame = 1;
+      frame_params->refresh_alt_ref_frame = 1;
+      break;
+
+    case LF_UPDATE:
+      frame_params->refresh_golden_frame = 0;
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 0;
+      break;
+
+    case GF_UPDATE:
+      frame_params->refresh_golden_frame = 1;
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 0;
+      break;
+
+    case OVERLAY_UPDATE:
+      frame_params->refresh_golden_frame = 1;
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 0;
+
+      cpi->rc.is_src_frame_alt_ref = 1;
+      break;
+
+    case ARF_UPDATE:
+      frame_params->refresh_golden_frame = 0;
+      // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 1;
+      break;
+
+    case INTNL_OVERLAY_UPDATE:
+      frame_params->refresh_golden_frame = 0;
+      frame_params->refresh_bwd_ref_frame = 0;
+      frame_params->refresh_alt_ref_frame = 0;
+
+      cpi->rc.is_src_frame_alt_ref = 1;
+      break;
+
+    case INTNL_ARF_UPDATE:
+      frame_params->refresh_golden_frame = 0;
+      frame_params->refresh_bwd_ref_frame = 1;
+      frame_params->refresh_alt_ref_frame = 0;
+      break;
+
+    default: assert(0); break;
+  }
+
+  if (ext_flags->refresh_frame_flags_pending &&
+      (!is_stat_generation_stage(cpi))) {
+    frame_params->refresh_golden_frame = ext_flags->refresh_golden_frame;
+    frame_params->refresh_alt_ref_frame = ext_flags->refresh_alt_ref_frame;
+    frame_params->refresh_bwd_ref_frame = ext_flags->refresh_bwd_ref_frame;
+  }
+
+  if (force_refresh_all) {
+    frame_params->refresh_golden_frame = 1;
+    frame_params->refresh_bwd_ref_frame = 1;
+    frame_params->refresh_alt_ref_frame = 1;
+  }
+}
+
+static void set_additional_frame_flags(const AV1_COMMON *const cm,
+                                       unsigned int *const frame_flags) {
+  if (frame_is_intra_only(cm)) {
+    *frame_flags |= FRAMEFLAGS_INTRAONLY;
+  }
+  if (frame_is_sframe(cm)) {
+    *frame_flags |= FRAMEFLAGS_SWITCH;
+  }
+  if (cm->features.error_resilient_mode) {
+    *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT;
+  }
+}
+
+static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+  if (cpi->common.show_frame) {
+    if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+        cpi->common.current_frame.frame_type == KEY_FRAME) {
+      // If this is a show_existing_frame with a source other than altref,
+      // or if it is not a displayed forward keyframe, the keyframe update
+      // counters were incremented when it was originally encoded.
+      cpi->rc.frames_since_key++;
+      cpi->rc.frames_to_key--;
+    }
+  }
+}
+
+static INLINE int is_frame_droppable(const SVC *const svc,
+                                     const ExternalFlags *const ext_flags) {
+  // A droppable frame is only used with external refresh flags; a VoD
+  // setting won't trigger this use case.
+  if (svc->external_ref_frame_config)
+    return svc->non_reference_frame;
+  else if (ext_flags->refresh_frame_flags_pending)
+    return !(ext_flags->refresh_alt_ref_frame ||
+             ext_flags->refresh_alt2_ref_frame ||
+             ext_flags->refresh_bwd_ref_frame ||
+             ext_flags->refresh_golden_frame || ext_flags->refresh_last_frame);
+  else
+    return 0;
+}
+
+static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is dropped.
+ // We should fix the cpi->common.show_frame flag + // instead of checking the other condition to update the counter properly. + if (cpi->common.show_frame || + is_frame_droppable(&cpi->svc, &cpi->ext_flags)) { + // Decrement count down till next gf + if (cpi->rc.frames_till_gf_update_due > 0) + cpi->rc.frames_till_gf_update_due--; + } +} + +static INLINE void update_gf_group_index(AV1_COMP *cpi) { + // Increment the gf group index ready for the next frame. If this is + // a show_existing_frame with a source other than altref, or if it is not + // a displayed forward keyframe, the index was incremented when it was + // originally encoded. + if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref || + cpi->common.current_frame.frame_type == KEY_FRAME) { + ++cpi->gf_group.index; + } +} + +static void update_rc_counts(AV1_COMP *cpi) { + update_keyframe_counters(cpi); + update_frames_till_gf_update(cpi); + update_gf_group_index(cpi); +} + +static void set_ext_overrides(AV1_COMMON *const cm, + EncodeFrameParams *const frame_params, + ExternalFlags *const ext_flags) { + // Overrides the defaults with the externally supplied values with + // av1_update_reference() and av1_update_entropy() calls + // Note: The overrides are valid only for the next frame passed + // to av1_encode_lowlevel() + + if (ext_flags->use_s_frame) { + frame_params->frame_type = S_FRAME; + } + + if (ext_flags->refresh_frame_context_pending) { + cm->features.refresh_frame_context = ext_flags->refresh_frame_context; + ext_flags->refresh_frame_context_pending = 0; + } + cm->features.allow_ref_frame_mvs = ext_flags->use_ref_frame_mvs; + + frame_params->error_resilient_mode = ext_flags->use_error_resilient; + // A keyframe is already error resilient and keyframes with + // error_resilient_mode interferes with the use of show_existing_frame + // when forward reference keyframes are enabled. + frame_params->error_resilient_mode &= frame_params->frame_type != KEY_FRAME; + // For bitstream conformance, s-frames must be error-resilient + frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME; +} + +static int get_current_frame_ref_type( + const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) { + // We choose the reference "type" of this frame from the flags which indicate + // which reference frames will be refreshed by it. More than one of these + // flags may be set, so the order here implies an order of precedence. This is + // just used to choose the primary_ref_frame (as the most recent reference + // buffer of the same reference-type as the current frame) + + (void)frame_params; + // TODO(jingning): This table should be a lot simpler with the new + // ARF system in place. Keep frame_params for the time being as we are + // still evaluating a few design options. + switch (cpi->gf_group.layer_depth[cpi->gf_group.index]) { + case 0: return 0; + case 1: return 1; + case MAX_ARF_LAYERS: + case MAX_ARF_LAYERS + 1: return 4; + default: return 7; + } +} + +static int choose_primary_ref_frame( + const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) { + const AV1_COMMON *const cm = &cpi->common; + + const int intra_only = frame_params->frame_type == KEY_FRAME || + frame_params->frame_type == INTRA_ONLY_FRAME; + if (intra_only || frame_params->error_resilient_mode || cpi->use_svc || + cpi->ext_flags.use_primary_ref_none) { + return PRIMARY_REF_NONE; + } + + // In large scale case, always use Last frame's frame contexts. 
+ // Note(yunqing): In other cases, primary_ref_frame is chosen based on + // cpi->gf_group.layer_depth[cpi->gf_group.index], which also controls + // frame bit allocation. + if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME); + + // Find the most recent reference frame with the same reference type as the + // current frame + const int current_ref_type = get_current_frame_ref_type(cpi, frame_params); + int wanted_fb = cpi->fb_of_context_type[current_ref_type]; + + int primary_ref_frame = PRIMARY_REF_NONE; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) { + primary_ref_frame = ref_frame - LAST_FRAME; + } + } + + return primary_ref_frame; +} + +static void update_fb_of_context_type( + const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params, + int *const fb_of_context_type) { + const AV1_COMMON *const cm = &cpi->common; + const int current_frame_ref_type = + get_current_frame_ref_type(cpi, frame_params); + + if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || + cpi->ext_flags.use_primary_ref_none) { + for (int i = 0; i < REF_FRAMES; i++) { + fb_of_context_type[i] = -1; + } + fb_of_context_type[current_frame_ref_type] = + cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME) + : get_ref_frame_map_idx(cm, ALTREF_FRAME); + } + + if (!encode_show_existing_frame(cm)) { + // Refresh fb_of_context_type[]: see encoder.h for explanation + if (cm->current_frame.frame_type == KEY_FRAME) { + // All ref frames are refreshed, pick one that will live long enough + fb_of_context_type[current_frame_ref_type] = 0; + } else { + // If more than one frame is refreshed, it doesn't matter which one we + // pick so pick the first. LST sometimes doesn't refresh any: this is ok + + for (int i = 0; i < REF_FRAMES; i++) { + if (cm->current_frame.refresh_frame_flags & (1 << i)) { + fb_of_context_type[current_frame_ref_type] = i; + break; + } + } + } + } +} + +static int get_order_offset(const GF_GROUP *const gf_group, + const EncodeFrameParams *const frame_params) { + // shown frame by definition has order offset 0 + // show_existing_frame ignores order_offset and simply takes the order_hint + // from the reference frame being shown. + if (frame_params->show_frame || frame_params->show_existing_frame) return 0; + + const int arf_offset = + AOMMIN((MAX_GF_INTERVAL - 1), gf_group->arf_src_offset[gf_group->index]); + return AOMMIN((MAX_GF_INTERVAL - 1), arf_offset); +} + +static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) { + TimeStamps *time_stamps = &cpi->time_stamps; + int64_t this_duration; + int step = 0; + + // Clear down mmx registers + aom_clear_system_state(); + + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { + cpi->framerate = cpi->svc.base_framerate; + av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height); + return; + } + + if (ts_start == time_stamps->first_ever) { + this_duration = ts_end - ts_start; + step = 1; + } else { + int64_t last_duration = + time_stamps->prev_end_seen - time_stamps->prev_start_seen; + + this_duration = ts_end - time_stamps->prev_end_seen; + + // do a step update if the duration changes by 10% + if (last_duration) + step = (int)((this_duration - last_duration) * 10 / last_duration); + } + + if (this_duration) { + if (step) { + av1_new_framerate(cpi, 10000000.0 / this_duration); + } else { + // Average this frame's rate into the last second's average + // frame rate. 
If we haven't seen 1 second yet, then average
+      // over the whole interval seen.
+      const double interval =
+          AOMMIN((double)(ts_end - time_stamps->first_ever), 10000000.0);
+      double avg_duration = 10000000.0 / cpi->framerate;
+      avg_duration *= (interval - avg_duration + this_duration);
+      avg_duration /= interval;
+
+      av1_new_framerate(cpi, 10000000.0 / avg_duration);
+    }
+  }
+  time_stamps->prev_start_seen = ts_start;
+  time_stamps->prev_end_seen = ts_end;
+}
+
+// If this is an alt-ref, returns the offset of the source frame used
+// as the arf midpoint. Otherwise, returns 0.
+static int get_arf_src_index(GF_GROUP *gf_group, int pass) {
+  int arf_src_index = 0;
+  if (pass != 1) arf_src_index = gf_group->arf_src_offset[gf_group->index];
+  return arf_src_index;
+}
+
+// Called if this frame is an ARF or internal ARF; forward keyframes are also
+// handled here. *code_arf is set to 1 if the ARF was temporally filtered, so
+// that the correct post-filter buffer (cpi->alt_ref_buffer) can be used.
+static struct lookahead_entry *setup_arf_frame(
+    AV1_COMP *const cpi, const int arf_src_index, int *code_arf,
+    EncodeFrameParams *const frame_params, int *show_existing_alt_ref) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+#if !CONFIG_REALTIME_ONLY
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+#endif
+
+  assert(arf_src_index <= rc->frames_to_key);
+  *code_arf = 0;
+
+  struct lookahead_entry *source =
+      av1_lookahead_peek(cpi->lookahead, arf_src_index, cpi->compressor_stage);
+
+  if (source != NULL) {
+    cm->showable_frame = 1;
+
+    // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
+    if (arf_src_index == rc->frames_to_key) {
+      // Skip temporal filtering and mark as intra_only if we have a fwd_kf
+      cpi->no_show_kf = 1;
+    } else {
+#if !CONFIG_REALTIME_ONLY
+      if (oxcf->arnr_max_frames > 0) {
+        // Produce the filtered ARF frame.
+        cm->current_frame.frame_type = INTER_FRAME;
+        FRAME_UPDATE_TYPE frame_update_type =
+            get_frame_update_type(&cpi->gf_group);
+        av1_configure_buffer_updates(cpi, frame_params, frame_update_type, 0);
+        *code_arf =
+            av1_temporal_filter(cpi, arf_src_index, show_existing_alt_ref);
+        if (*code_arf) {
+          aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
+        }
+      }
+#else
+      (void)show_existing_alt_ref;
+#endif
+    }
+    frame_params->show_frame = 0;
+  }
+  rc->source_alt_ref_pending = 0;
+  return source;
+}
+
+// Determine whether there is a forced keyframe pending in the lookahead buffer
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+                               const int up_to_index,
+                               const COMPRESSOR_STAGE compressor_stage) {
+  for (int i = 0; i <= up_to_index; i++) {
+    const struct lookahead_entry *e =
+        av1_lookahead_peek(lookahead, i, compressor_stage);
+    if (e == NULL) {
+      // We have reached the end of the lookahead buffer and not early-returned
+      // so there isn't a forced key-frame pending.
+      return -1;
+    } else if (e->flags == AOM_EFLAG_FORCE_KF) {
+      return (i + 1);
+    } else {
+      continue;
+    }
+  }
+  return -1;  // No forced key-frame found within up_to_index entries.
+}
+
+// Check if we should encode an ARF or internal ARF. If not, try a LAST.
+// Do some setup associated with the chosen source.
+// code_arf, flush, last_source and show_existing_alt_ref are outputs.
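+// Example: if the GF group schedules an ARF 16 frames ahead but the caller
+// forced a keyframe (AOM_EFLAG_FORCE_KF) within that range, the ARF is
+// abandoned (arf_src_index is reset to 0) and the lookahead is flushed
+// instead, unless rc_mode is AOM_Q.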
+// Return the frame source, or NULL if we couldn't find one +static struct lookahead_entry *choose_frame_source( + AV1_COMP *const cpi, int *const code_arf, int *const flush, + struct lookahead_entry **last_source, EncodeFrameParams *const frame_params, + int *show_existing_alt_ref) { + AV1_COMMON *const cm = &cpi->common; + struct lookahead_entry *source = NULL; + *code_arf = 0; + + // Should we encode an alt-ref frame. + int arf_src_index = get_arf_src_index(&cpi->gf_group, cpi->oxcf.pass); + // TODO(Aasaipriya): Forced key frames need to be fixed when rc_mode != AOM_Q + if (arf_src_index && + (is_forced_keyframe_pending(cpi->lookahead, arf_src_index, + cpi->compressor_stage) != -1) && + cpi->oxcf.rc_mode != AOM_Q) { + arf_src_index = 0; + *flush = 1; + } + + if (arf_src_index) + source = setup_arf_frame(cpi, arf_src_index, code_arf, frame_params, + show_existing_alt_ref); + + if (!source) { + // Get last frame source. + if (cm->current_frame.frame_number > 0) { + *last_source = + av1_lookahead_peek(cpi->lookahead, -1, cpi->compressor_stage); + } + // Read in the source frame. + source = av1_lookahead_pop(cpi->lookahead, *flush, cpi->compressor_stage); + if (source == NULL) return NULL; + frame_params->show_frame = 1; + } + return source; +} + +// Don't allow a show_existing_frame to coincide with an error resilient or +// S-Frame. An exception can be made in the case of a keyframe, since it does +// not depend on any previous frames. +static int allow_show_existing(const AV1_COMP *const cpi, + unsigned int frame_flags) { + if (cpi->common.current_frame.frame_number == 0) return 0; + + const struct lookahead_entry *lookahead_src = + av1_lookahead_peek(cpi->lookahead, 0, cpi->compressor_stage); + if (lookahead_src == NULL) return 1; + + const int is_error_resilient = + cpi->oxcf.error_resilient_mode || + (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT); + const int is_s_frame = + cpi->oxcf.s_frame_mode || (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME); + const int is_key_frame = + (cpi->rc.frames_to_key == 0) || (frame_flags & FRAMEFLAGS_KEY); + return !(is_error_resilient || is_s_frame) || is_key_frame; +} + +// Update frame_flags to tell the encoder's caller what sort of frame was +// encoded. 
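+// For example, a shown key frame reports FRAMEFLAGS_KEY, a frame refreshing
+// the golden and alt-ref buffers reports FRAMEFLAGS_GOLDEN | FRAMEFLAGS_ALTREF,
+// and a show_existing_frame clears all four flags.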
+static void update_frame_flags(AV1_COMP *cpi, unsigned int *frame_flags) { + if (encode_show_existing_frame(&cpi->common)) { + *frame_flags &= ~FRAMEFLAGS_GOLDEN; + *frame_flags &= ~FRAMEFLAGS_BWDREF; + *frame_flags &= ~FRAMEFLAGS_ALTREF; + *frame_flags &= ~FRAMEFLAGS_KEY; + return; + } + + if (cpi->refresh_golden_frame == 1) { + *frame_flags |= FRAMEFLAGS_GOLDEN; + } else { + *frame_flags &= ~FRAMEFLAGS_GOLDEN; + } + + if (cpi->refresh_alt_ref_frame == 1) { + *frame_flags |= FRAMEFLAGS_ALTREF; + } else { + *frame_flags &= ~FRAMEFLAGS_ALTREF; + } + + if (cpi->refresh_bwd_ref_frame == 1) { + *frame_flags |= FRAMEFLAGS_BWDREF; + } else { + *frame_flags &= ~FRAMEFLAGS_BWDREF; + } + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + *frame_flags |= FRAMEFLAGS_KEY; + } else { + *frame_flags &= ~FRAMEFLAGS_KEY; + } +} + +#define DUMP_REF_FRAME_IMAGES 0 + +#if DUMP_REF_FRAME_IMAGES == 1 +static int dump_one_image(AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *const ref_buf, + char *file_name) { + int h; + FILE *f_ref = NULL; + + if (ref_buf == NULL) { + printf("Frame data buffer is NULL.\n"); + return AOM_CODEC_MEM_ERROR; + } + + if ((f_ref = fopen(file_name, "wb")) == NULL) { + printf("Unable to open file %s to write.\n", file_name); + return AOM_CODEC_MEM_ERROR; + } + + // --- Y --- + for (h = 0; h < cm->height; ++h) { + fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref); + } + // --- U --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), + f_ref); + } + // --- V --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), + f_ref); + } + + fclose(f_ref); + + return AOM_CODEC_OK; +} + +static void dump_ref_frame_images(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MV_REFERENCE_FRAME ref_frame; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + char file_name[256] = ""; + snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv", + cm->current_frame.frame_number, ref_frame); + dump_one_image(cm, get_ref_frame_yv12_buf(cpi, ref_frame), file_name); + } +} +#endif // DUMP_REF_FRAME_IMAGES == 1 + +int av1_get_refresh_ref_frame_map(int refresh_frame_flags) { + int ref_map_index = INVALID_IDX; + + for (ref_map_index = 0; ref_map_index < REF_FRAMES; ++ref_map_index) + if ((refresh_frame_flags >> ref_map_index) & 1) break; + + return ref_map_index; +} + +static void update_arf_stack(int ref_map_index, + RefBufferStack *ref_buffer_stack) { + if (ref_buffer_stack->arf_stack_size >= 0) { + if (ref_buffer_stack->arf_stack[0] == ref_map_index) + stack_pop(ref_buffer_stack->arf_stack, &ref_buffer_stack->arf_stack_size); + } + + if (ref_buffer_stack->lst_stack_size) { + for (int i = ref_buffer_stack->lst_stack_size - 1; i >= 0; --i) { + if (ref_buffer_stack->lst_stack[i] == ref_map_index) { + for (int idx = i; idx < ref_buffer_stack->lst_stack_size - 1; ++idx) + ref_buffer_stack->lst_stack[idx] = + ref_buffer_stack->lst_stack[idx + 1]; + ref_buffer_stack->lst_stack[ref_buffer_stack->lst_stack_size - 1] = + INVALID_IDX; + --ref_buffer_stack->lst_stack_size; + } + } + } + + if (ref_buffer_stack->gld_stack_size) { + for (int i = ref_buffer_stack->gld_stack_size - 1; i >= 0; --i) { + if (ref_buffer_stack->gld_stack[i] == ref_map_index) { + for (int idx = i; idx < ref_buffer_stack->gld_stack_size - 1; ++idx) + ref_buffer_stack->gld_stack[idx] = + ref_buffer_stack->gld_stack[idx + 1]; + 
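+        // The matching entry was shifted out above; invalidate the duplicated
+        // tail slot and shrink the stack.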
ref_buffer_stack->gld_stack[ref_buffer_stack->gld_stack_size - 1] = + INVALID_IDX; + --ref_buffer_stack->gld_stack_size; + } + } + } +} + +// Update reference frame stack info. +void av1_update_ref_frame_map(AV1_COMP *cpi, + FRAME_UPDATE_TYPE frame_update_type, + int show_existing_frame, int ref_map_index, + RefBufferStack *ref_buffer_stack) { + AV1_COMMON *const cm = &cpi->common; + // TODO(jingning): Consider the S-frame same as key frame for the + // reference frame tracking purpose. The logic might be better + // expressed than converting the frame update type. + if (frame_is_sframe(cm)) frame_update_type = KEY_FRAME; + + if (is_frame_droppable(&cpi->svc, &cpi->ext_flags)) return; + + switch (frame_update_type) { + case KEY_FRAME: + if (show_existing_frame) + ref_map_index = stack_pop(ref_buffer_stack->arf_stack, + &ref_buffer_stack->arf_stack_size); + stack_reset(ref_buffer_stack->lst_stack, + &ref_buffer_stack->lst_stack_size); + stack_reset(ref_buffer_stack->gld_stack, + &ref_buffer_stack->gld_stack_size); + stack_reset(ref_buffer_stack->arf_stack, + &ref_buffer_stack->arf_stack_size); + stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size, + ref_map_index); + break; + case GF_UPDATE: + update_arf_stack(ref_map_index, ref_buffer_stack); + stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size, + ref_map_index); + // For nonrd_mode: update LAST as well on GF_UPDATE frame. + if (cpi->sf.rt_sf.use_nonrd_pick_mode) + stack_push(ref_buffer_stack->lst_stack, + &ref_buffer_stack->lst_stack_size, ref_map_index); + break; + case LF_UPDATE: + update_arf_stack(ref_map_index, ref_buffer_stack); + stack_push(ref_buffer_stack->lst_stack, &ref_buffer_stack->lst_stack_size, + ref_map_index); + break; + case ARF_UPDATE: + case INTNL_ARF_UPDATE: + update_arf_stack(ref_map_index, ref_buffer_stack); + stack_push(ref_buffer_stack->arf_stack, &ref_buffer_stack->arf_stack_size, + ref_map_index); + break; + case OVERLAY_UPDATE: + ref_map_index = stack_pop(ref_buffer_stack->arf_stack, + &ref_buffer_stack->arf_stack_size); + stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size, + ref_map_index); + break; + case INTNL_OVERLAY_UPDATE: + ref_map_index = stack_pop(ref_buffer_stack->arf_stack, + &ref_buffer_stack->arf_stack_size); + stack_push(ref_buffer_stack->lst_stack, &ref_buffer_stack->lst_stack_size, + ref_map_index); + break; + default: assert(0 && "unknown type"); + } + return; +} + +static int get_free_ref_map_index(const RefBufferStack *ref_buffer_stack) { + for (int idx = 0; idx < REF_FRAMES; ++idx) { + int is_free = 1; + for (int i = 0; i < ref_buffer_stack->arf_stack_size; ++i) { + if (ref_buffer_stack->arf_stack[i] == idx) { + is_free = 0; + break; + } + } + + for (int i = 0; i < ref_buffer_stack->lst_stack_size; ++i) { + if (ref_buffer_stack->lst_stack[i] == idx) { + is_free = 0; + break; + } + } + + for (int i = 0; i < ref_buffer_stack->gld_stack_size; ++i) { + if (ref_buffer_stack->gld_stack[i] == idx) { + is_free = 0; + break; + } + } + + if (is_free) return idx; + } + return INVALID_IDX; +} + +int av1_get_refresh_frame_flags(const AV1_COMP *const cpi, + const EncodeFrameParams *const frame_params, + FRAME_UPDATE_TYPE frame_update_type, + const RefBufferStack *const ref_buffer_stack) { + const AV1_COMMON *const cm = &cpi->common; + const ExternalFlags *const ext_flags = &cpi->ext_flags; + const SVC *const svc = &cpi->svc; + // Switch frames and shown key-frames overwrite all reference slots + if ((frame_params->frame_type == KEY_FRAME 
&& frame_params->show_frame) || + frame_params->frame_type == S_FRAME) + return 0xFF; + + // show_existing_frames don't actually send refresh_frame_flags so set the + // flags to 0 to keep things consistent. + if (frame_params->show_existing_frame && + (!frame_params->error_resilient_mode || + frame_params->frame_type == KEY_FRAME)) { + return 0; + } + + if (is_frame_droppable(svc, ext_flags)) return 0; + + int refresh_mask = 0; + + if (ext_flags->refresh_frame_flags_pending) { + if (svc->external_ref_frame_config) { + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { + int ref_frame_map_idx = svc->ref_idx[i]; + refresh_mask |= svc->refresh[ref_frame_map_idx] << ref_frame_map_idx; + } + return refresh_mask; + } + // Unfortunately the encoder interface reflects the old refresh_*_frame + // flags so we have to replicate the old refresh_frame_flags logic here in + // order to preserve the behaviour of the flag overrides. + int ref_frame_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_flags->refresh_last_frame << ref_frame_map_idx; + + ref_frame_map_idx = get_ref_frame_map_idx(cm, EXTREF_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_flags->refresh_bwd_ref_frame << ref_frame_map_idx; + + ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF2_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_flags->refresh_alt2_ref_frame << ref_frame_map_idx; + + if (frame_update_type == OVERLAY_UPDATE) { + ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_flags->refresh_golden_frame << ref_frame_map_idx; + } else { + ref_frame_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_flags->refresh_golden_frame << ref_frame_map_idx; + + ref_frame_map_idx = get_ref_frame_map_idx(cm, ALTREF_FRAME); + if (ref_frame_map_idx != INVALID_IDX) + refresh_mask |= ext_flags->refresh_alt_ref_frame << ref_frame_map_idx; + } + return refresh_mask; + } + + // Search for the open slot to store the current frame. 
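+  // If no slot is free, the switch below falls back to a slot that is already
+  // held by one of the reference stacks, chosen according to the update type.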
+ int free_fb_index = get_free_ref_map_index(ref_buffer_stack); + switch (frame_update_type) { + case KF_UPDATE: + case GF_UPDATE: + if (free_fb_index != INVALID_IDX) { + refresh_mask = 1 << free_fb_index; + } else { + if (ref_buffer_stack->gld_stack_size) + refresh_mask = + 1 << ref_buffer_stack + ->gld_stack[ref_buffer_stack->gld_stack_size - 1]; + else + refresh_mask = + 1 << ref_buffer_stack + ->lst_stack[ref_buffer_stack->lst_stack_size - 1]; + } + break; + case LF_UPDATE: + if (free_fb_index != INVALID_IDX) { + refresh_mask = 1 << free_fb_index; + } else { + if (ref_buffer_stack->lst_stack_size >= 2) + refresh_mask = + 1 << ref_buffer_stack + ->lst_stack[ref_buffer_stack->lst_stack_size - 1]; + else if (ref_buffer_stack->gld_stack_size >= 2) + refresh_mask = + 1 << ref_buffer_stack + ->gld_stack[ref_buffer_stack->gld_stack_size - 1]; + else + assert(0 && "No ref map index found"); + } + break; + case ARF_UPDATE: + if (free_fb_index != INVALID_IDX) { + refresh_mask = 1 << free_fb_index; + } else { + if (ref_buffer_stack->gld_stack_size >= 3) + refresh_mask = + 1 << ref_buffer_stack + ->gld_stack[ref_buffer_stack->gld_stack_size - 1]; + else if (ref_buffer_stack->lst_stack_size >= 2) + refresh_mask = + 1 << ref_buffer_stack + ->lst_stack[ref_buffer_stack->lst_stack_size - 1]; + else + assert(0 && "No ref map index found"); + } + break; + case INTNL_ARF_UPDATE: + if (free_fb_index != INVALID_IDX) { + refresh_mask = 1 << free_fb_index; + } else { + refresh_mask = + 1 << ref_buffer_stack + ->lst_stack[ref_buffer_stack->lst_stack_size - 1]; + } + break; + case OVERLAY_UPDATE: break; + case INTNL_OVERLAY_UPDATE: break; + default: assert(0); break; + } + + return refresh_mask; +} + +#if !CONFIG_REALTIME_ONLY +void setup_mi(AV1_COMP *const cpi, YV12_BUFFER_CONFIG *src) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + + av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params.sb_size); + + av1_setup_block_planes(xd, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, num_planes); + + set_mi_offsets(&cm->mi_params, xd, 0, 0); +} + +// Apply temporal filtering to key frames and encode the filtered frame. +// If the current frame is not key frame, this function is identical to +// av1_encode(). +static int denoise_and_encode(AV1_COMP *const cpi, uint8_t *const dest, + EncodeFrameInput *const frame_input, + EncodeFrameParams *const frame_params, + EncodeFrameResults *const frame_results) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + + // Decide whether to apply temporal filtering to the source frame. + int apply_filtering = + frame_params->frame_type == KEY_FRAME && + oxcf->enable_keyframe_filtering && !is_stat_generation_stage(cpi) && + !frame_params->show_existing_frame && + cpi->rc.frames_to_key > TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME && + !is_lossless_requested(oxcf) && oxcf->arnr_max_frames > 0; + if (apply_filtering) { + const double y_noise_level = av1_estimate_noise_from_single_plane( + frame_input->source, 0, cm->seq_params.bit_depth); + apply_filtering = y_noise_level > 0; + } + + // Save the pointer to the original source image. + YV12_BUFFER_CONFIG *source_kf_buffer = frame_input->source; + + // Apply filtering to key frame. + if (apply_filtering) { + // Initialization for frame motion estimation. 
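+    // (Temporal filtering below performs a motion search against neighboring
+    // source frames, so mi buffers, speed features, the quantizer and frame
+    // sign bias must be initialized as if a frame were being encoded.)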
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + av1_init_mi_buffers(&cm->mi_params); + setup_mi(cpi, frame_input->source); + av1_init_macroblockd(cm, xd, NULL); + memset( + cpi->mbmi_ext_info.frame_base, 0, + cpi->mbmi_ext_info.alloc_size * sizeof(*cpi->mbmi_ext_info.frame_base)); + + av1_set_speed_features_framesize_independent(cpi, oxcf->speed); + av1_set_speed_features_framesize_dependent(cpi, oxcf->speed); + av1_set_rd_speed_thresholds(cpi); + av1_setup_frame_buf_refs(cm); + av1_setup_frame_sign_bias(cm); + av1_frame_init_quantizer(cpi); + av1_setup_past_independence(cm); + + if (!frame_params->show_frame) { + int arf_src_index = get_arf_src_index(&cpi->gf_group, cpi->oxcf.pass); + av1_temporal_filter(cpi, -1 * arf_src_index, NULL); + } else { + av1_temporal_filter(cpi, -1, NULL); + } + aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm)); + // Use the filtered frame for encoding. + frame_input->source = &cpi->alt_ref_buffer; + // Copy metadata info to alt-ref buffer. + aom_remove_metadata_from_frame_buffer(frame_input->source); + aom_copy_metadata_to_frame_buffer(frame_input->source, + source_kf_buffer->metadata); + } + + if (frame_params->frame_type == KEY_FRAME && !is_stat_generation_stage(cpi) && + oxcf->enable_tpl_model && oxcf->lag_in_frames > 0 && + frame_params->show_frame) { + av1_tpl_setup_stats(cpi, 0, frame_params, frame_input); + } + + if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + // Set frame_input source to true source for psnr calculation. + if (apply_filtering) { + cpi->source = source_kf_buffer; + cpi->unscaled_source = source_kf_buffer; + } + + return AOM_CODEC_OK; +} +#endif // !CONFIG_REALTIME_ONLY + +static INLINE int find_unused_ref_frame(const int *used_ref_frames, + const int *stack, int stack_size) { + for (int i = 0; i < stack_size; ++i) { + const int this_ref = stack[i]; + int ref_idx = 0; + for (ref_idx = 0; ref_idx <= ALTREF_FRAME - LAST_FRAME; ++ref_idx) { + if (this_ref == used_ref_frames[ref_idx]) break; + } + + // not in use + if (ref_idx > ALTREF_FRAME - LAST_FRAME) return this_ref; + } + + return INVALID_IDX; +} + +void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack) { + AV1_COMMON *cm = &cpi->common; + int *const remapped_ref_idx = cm->remapped_ref_idx; + int *const arf_stack = ref_buffer_stack->arf_stack; + int *const lst_stack = ref_buffer_stack->lst_stack; + int *const gld_stack = ref_buffer_stack->gld_stack; + const int arf_stack_size = ref_buffer_stack->arf_stack_size; + const int lst_stack_size = ref_buffer_stack->lst_stack_size; + const int gld_stack_size = ref_buffer_stack->gld_stack_size; + + // Initialization + for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX; + + if (arf_stack_size) { + remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] = arf_stack[arf_stack_size - 1]; + + if (arf_stack_size > 1) + remapped_ref_idx[BWDREF_FRAME - LAST_FRAME] = arf_stack[0]; + + if (arf_stack_size > 2) + remapped_ref_idx[ALTREF2_FRAME - LAST_FRAME] = arf_stack[1]; + } + + if (lst_stack_size) { + remapped_ref_idx[LAST_FRAME - LAST_FRAME] = lst_stack[0]; + + if (lst_stack_size > 1) + remapped_ref_idx[LAST2_FRAME - LAST_FRAME] = lst_stack[1]; + } + + if (gld_stack_size) { + remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] = gld_stack[0]; + + if (gld_stack_size > 1) { + if (arf_stack_size <= 1) + remapped_ref_idx[BWDREF_FRAME - LAST_FRAME] = gld_stack[1]; + else + remapped_ref_idx[LAST3_FRAME - LAST_FRAME] = gld_stack[1]; + } + } + + 
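+  // Fill any remaining unmapped slots with buffers that are not yet in use,
+  // searching the arf, then gld, then lst stacks; as a last resort reuse
+  // gld_stack[0].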
for (int idx = ALTREF_FRAME - LAST_FRAME; idx >= 0; --idx) { + int ref_map_index = remapped_ref_idx[idx]; + + if (ref_map_index != INVALID_IDX) continue; + + ref_map_index = + find_unused_ref_frame(remapped_ref_idx, arf_stack, arf_stack_size); + + if (ref_map_index == INVALID_IDX) { + ref_map_index = + find_unused_ref_frame(remapped_ref_idx, gld_stack, gld_stack_size); + } + + if (ref_map_index == INVALID_IDX) { + ref_map_index = + find_unused_ref_frame(remapped_ref_idx, lst_stack, lst_stack_size); + } + + if (ref_map_index != INVALID_IDX) + remapped_ref_idx[idx] = ref_map_index; + else + remapped_ref_idx[idx] = ref_buffer_stack->gld_stack[0]; + } +} + +int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, + uint8_t *const dest, unsigned int *frame_flags, + int64_t *const time_stamp, int64_t *const time_end, + const aom_rational64_t *const timestamp_ratio, + int flush) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + GF_GROUP *gf_group = &cpi->gf_group; + ExternalFlags *const ext_flags = &cpi->ext_flags; + + EncodeFrameInput frame_input; + EncodeFrameParams frame_params; + EncodeFrameResults frame_results; + memset(&frame_input, 0, sizeof(frame_input)); + memset(&frame_params, 0, sizeof(frame_params)); + memset(&frame_results, 0, sizeof(frame_results)); + + // TODO(sarahparker) finish bit allocation for one pass pyramid + if (has_no_stats_stage(cpi) && oxcf->rc_mode != AOM_Q) { + cpi->oxcf.gf_max_pyr_height = + AOMMIN(cpi->oxcf.gf_max_pyr_height, USE_ALTREF_FOR_ONE_PASS); + cpi->oxcf.gf_min_pyr_height = + AOMMIN(cpi->oxcf.gf_min_pyr_height, cpi->oxcf.gf_max_pyr_height); + } + + if (!is_stat_generation_stage(cpi)) { + // If this is a forward keyframe, mark as a show_existing_frame + if (cpi->oxcf.fwd_kf_enabled && (gf_group->index == gf_group->size) && + gf_group->update_type[1] == ARF_UPDATE && cpi->rc.frames_to_key == 0) { + frame_params.show_existing_frame = 1; + } else { + frame_params.show_existing_frame = + ((oxcf->enable_overlay == 0 || cpi->sf.hl_sf.disable_overlay_frames || + cpi->show_existing_alt_ref) && + gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) || + gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE; + } + frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags); + + // Reset show_existing_alt_ref decision to 0 after it is used. + if (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE) { + cpi->show_existing_alt_ref = 0; + } + } else { + frame_params.show_existing_frame = 0; + } + + int code_arf = 0; + struct lookahead_entry *source = NULL; + struct lookahead_entry *last_source = NULL; + if (frame_params.show_existing_frame) { + source = av1_lookahead_pop(cpi->lookahead, flush, cpi->compressor_stage); + frame_params.show_frame = 1; + } else { + int show_existing_alt_ref = 0; + source = choose_frame_source(cpi, &code_arf, &flush, &last_source, + &frame_params, &show_existing_alt_ref); + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) + cpi->show_existing_alt_ref = show_existing_alt_ref; + } + + if (source == NULL) { // If no source was found, we can't encode a frame. +#if !CONFIG_REALTIME_ONLY + if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { + av1_end_first_pass(cpi); /* get last stats packet */ + cpi->twopass.first_pass_done = 1; + } +#endif + return -1; + } + + frame_input.source = code_arf ? &cpi->alt_ref_buffer : &source->img; + frame_input.last_source = last_source != NULL ? 
&last_source->img : NULL; + frame_input.ts_duration = source->ts_end - source->ts_start; + // Save unfiltered source. It is used in av1_get_second_pass_params(). + cpi->unfiltered_source = frame_input.source; + + *time_stamp = source->ts_start; + *time_end = source->ts_end; + if (source->ts_start < cpi->time_stamps.first_ever) { + cpi->time_stamps.first_ever = source->ts_start; + cpi->time_stamps.prev_end_seen = source->ts_start; + } + + av1_apply_encoding_flags(cpi, source->flags); + if (!frame_params.show_existing_frame) + *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; + + // Shown frames and arf-overlay frames need frame-rate considering + if (frame_params.show_frame) + adjust_frame_rate(cpi, source->ts_start, source->ts_end); + + if (!frame_params.show_existing_frame) { + if (cpi->film_grain_table) { + cm->cur_frame->film_grain_params_present = aom_film_grain_table_lookup( + cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */, + &cm->film_grain_params); + } else { + cm->cur_frame->film_grain_params_present = + cm->seq_params.film_grain_params_present; + } + // only one operating point supported now + const int64_t pts64 = ticks_to_timebase_units(timestamp_ratio, *time_stamp); + if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR; + cm->frame_presentation_time = (uint32_t)pts64; + } + +#if CONFIG_REALTIME_ONLY + av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags); +#else + if (has_no_stats_stage(cpi) && oxcf->mode == REALTIME && + oxcf->lag_in_frames == 0) + av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags); + else if (!is_stat_generation_stage(cpi)) + av1_get_second_pass_params(cpi, &frame_params, &frame_input, *frame_flags); +#endif + FRAME_UPDATE_TYPE frame_update_type = get_frame_update_type(gf_group); + + if (frame_params.show_existing_frame && + frame_params.frame_type != KEY_FRAME) { + // Force show-existing frames to be INTER, except forward keyframes + frame_params.frame_type = INTER_FRAME; + } + + // TODO(david.turner@argondesign.com): Move all the encode strategy + // (largely near av1_get_compressed_data) in here + + // TODO(david.turner@argondesign.com): Change all the encode strategy to + // modify frame_params instead of cm or cpi. + + // Per-frame encode speed. In theory this can vary, but things may have been + // written assuming speed-level will not change within a sequence, so this + // parameter should be used with caution. 
+ frame_params.speed = oxcf->speed; + + // Work out some encoding parameters specific to the pass: + if (has_no_stats_stage(cpi) && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + av1_cyclic_refresh_update_parameters(cpi); + } else if (is_stat_generation_stage(cpi)) { + cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&cpi->oxcf); + const int kf_requested = (cm->current_frame.frame_number == 0 || + (*frame_flags & FRAMEFLAGS_KEY)); + if (kf_requested && frame_update_type != OVERLAY_UPDATE && + frame_update_type != INTNL_OVERLAY_UPDATE) { + frame_params.frame_type = KEY_FRAME; + } else { + frame_params.frame_type = INTER_FRAME; + } + } else if (is_stat_consumption_stage(cpi)) { +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_w(); +#endif +#if TXCOEFF_COST_TIMER + cm->txcoeff_cost_timer = 0; + cm->txcoeff_cost_count = 0; +#endif + } + + if (!is_stat_generation_stage(cpi)) + set_ext_overrides(cm, &frame_params, ext_flags); + + // Shown keyframes and S frames refresh all reference buffers + const int force_refresh_all = + ((frame_params.frame_type == KEY_FRAME && frame_params.show_frame) || + frame_params.frame_type == S_FRAME) && + !frame_params.show_existing_frame; + + av1_configure_buffer_updates(cpi, &frame_params, frame_update_type, + force_refresh_all); + + if (!is_stat_generation_stage(cpi)) { + const RefCntBuffer *ref_frames[INTER_REFS_PER_FRAME]; + const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME]; + + if (!ext_flags->refresh_frame_flags_pending) { + av1_get_ref_frames(cpi, &cpi->ref_buffer_stack); + } else if (cpi->svc.external_ref_frame_config) { + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) + cm->remapped_ref_idx[i] = cpi->svc.ref_idx[i]; + } + + // Get the reference frames + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + ref_frames[i] = get_ref_frame_buf(cm, ref_frame_priority_order[i]); + ref_frame_buf[i] = ref_frames[i] != NULL ? &ref_frames[i]->buf : NULL; + } + // Work out which reference frame slots may be used. + frame_params.ref_frame_flags = get_ref_frame_flags( + &cpi->sf, ref_frame_buf, ext_flags->ref_frame_flags); + + frame_params.primary_ref_frame = + choose_primary_ref_frame(cpi, &frame_params); + frame_params.order_offset = get_order_offset(&cpi->gf_group, &frame_params); + + frame_params.refresh_frame_flags = av1_get_refresh_frame_flags( + cpi, &frame_params, frame_update_type, &cpi->ref_buffer_stack); + + frame_params.existing_fb_idx_to_show = + frame_params.show_existing_frame + ? (frame_update_type == INTNL_OVERLAY_UPDATE + ? get_ref_frame_map_idx(cm, BWDREF_FRAME) + : get_ref_frame_map_idx(cm, ALTREF_FRAME)) + : INVALID_IDX; + } + + // The way frame_params->remapped_ref_idx is setup is a placeholder. + // Currently, reference buffer assignment is done by update_ref_frame_map() + // which is called by high-level strategy AFTER encoding a frame. It modifies + // cm->remapped_ref_idx. If you want to use an alternative method to + // determine reference buffer assignment, just put your assignments into + // frame_params->remapped_ref_idx here and they will be used when encoding + // this frame. If frame_params->remapped_ref_idx is setup independently of + // cm->remapped_ref_idx then update_ref_frame_map() will have no effect. 
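+  // A hypothetical override (identifiers illustrative only) would be:
+  //   frame_params.remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] = my_gld_idx;
+  // in place of the plain copy below.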
+ memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx, + REF_FRAMES * sizeof(*cm->remapped_ref_idx)); + + cpi->td.mb.e_mbd.delta_qindex = 0; + + if (!frame_params.show_existing_frame) { + cm->quant_params.using_qmatrix = cpi->oxcf.using_qm; +#if !CONFIG_REALTIME_ONLY + if (oxcf->lag_in_frames > 0 && !is_stat_generation_stage(cpi)) { + if (cpi->gf_group.index == 1 && cpi->oxcf.enable_tpl_model) { + av1_configure_buffer_updates(cpi, &frame_params, frame_update_type, 0); + av1_set_frame_size(cpi, cm->width, cm->height); + av1_tpl_setup_stats(cpi, 0, &frame_params, &frame_input); + assert(cpi->num_gf_group_show_frames == 1); + } + } +#endif + } + +#if CONFIG_REALTIME_ONLY + if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } +#else + if (denoise_and_encode(cpi, dest, &frame_input, &frame_params, + &frame_results) != AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } +#endif // CONFIG_REALTIME_ONLY + if (!is_stat_generation_stage(cpi)) + cpi->num_gf_group_show_frames += frame_params.show_frame; + + if (!is_stat_generation_stage(cpi)) { + // First pass doesn't modify reference buffer assignment or produce frame + // flags + update_frame_flags(cpi, frame_flags); + if (!ext_flags->refresh_frame_flags_pending) { + int ref_map_index = + av1_get_refresh_ref_frame_map(cm->current_frame.refresh_frame_flags); + av1_update_ref_frame_map(cpi, frame_update_type, cm->show_existing_frame, + ref_map_index, &cpi->ref_buffer_stack); + } + } + +#if !CONFIG_REALTIME_ONLY + if (!is_stat_generation_stage(cpi)) { +#if TXCOEFF_COST_TIMER + cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer; + fprintf(stderr, + "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld " + "in us\n", + cm->txcoeff_cost_count, cm->txcoeff_cost_timer, + cm->cum_txcoeff_cost_timer); +#endif + av1_twopass_postencode_update(cpi); + } +#endif // !CONFIG_REALTIME_ONLY + + if (!is_stat_generation_stage(cpi)) { + update_fb_of_context_type(cpi, &frame_params, cpi->fb_of_context_type); + set_additional_frame_flags(cm, frame_flags); + update_rc_counts(cpi); + } + + // Unpack frame_results: + *size = frame_results.size; + + // Leave a signal for a higher level caller about if this frame is droppable + if (*size > 0) { + cpi->droppable = is_frame_droppable(&cpi->svc, ext_flags); + } + + if (cpi->use_svc) av1_save_layer_context(cpi); + + return AOM_CODEC_OK; +} diff --git a/libs/libaom/src/av1/encoder/encode_strategy.h b/libs/libaom/src/av1/encoder/encode_strategy.h new file mode 100644 index 000000000..b05224ba1 --- /dev/null +++ b/libs/libaom/src/av1/encoder/encode_strategy.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+#define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "aom/aom_encoder.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+
+// This function will implement high-level encode strategy, choosing frame type,
+// frame placement, etc. It populates an EncodeFrameParams struct with the
+// results of these decisions and then calls av1_encode()
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+                        uint8_t *const dest, unsigned int *frame_flags,
+                        int64_t *const time_stamp, int64_t *const time_end,
+                        const aom_rational64_t *const timestamp_ratio,
+                        int flush);
+
+// Set individual buffer update flags based on frame reference type.
+// force_refresh_all is used when we have a KEY_FRAME or S_FRAME. It forces all
+// refresh_*_frame flags to be set, because we refresh all buffers in this case.
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  EncodeFrameParams *const frame_params,
+                                  const FRAME_UPDATE_TYPE type,
+                                  int force_refresh_all);
+
+int av1_get_refresh_frame_flags(const AV1_COMP *const cpi,
+                                const EncodeFrameParams *const frame_params,
+                                FRAME_UPDATE_TYPE frame_update_type,
+                                const RefBufferStack *const ref_buffer_stack);
+
+int av1_get_refresh_ref_frame_map(int refresh_frame_flags);
+
+void av1_update_ref_frame_map(AV1_COMP *cpi,
+                              FRAME_UPDATE_TYPE frame_update_type,
+                              int show_existing_frame, int ref_map_index,
+                              RefBufferStack *ref_buffer_stack);
+
+void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack);
+
+int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+                               const int up_to_index,
+                               const COMPRESSOR_STAGE compressor_stage);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
diff --git a/libs/libaom/src/av1/encoder/encodeframe.c b/libs/libaom/src/av1/encoder/encodeframe.c
new file mode 100644
index 000000000..53b47d49e
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/encodeframe.c
@@ -0,0 +1,6475 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/system_state.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif  // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/corner_detect.h"
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/partition_strategy.h"
+#if !CONFIG_REALTIME_ONLY
+#include "av1/encoder/partition_model_weights.h"
+#endif
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/var_based_part.h"
+
+#if CONFIG_TUNE_VMAF
+#include "av1/encoder/tune_vmaf.h"
+#endif
+
+static AOM_INLINE void encode_superblock(const AV1_COMP *const cpi,
+                                         TileDataEnc *tile_data, ThreadData *td,
+                                         TOKENEXTRA **t, RUN_TYPE dry_run,
+                                         BLOCK_SIZE bsize, int *rate);
+
+// This is used as a reference when computing the source variance for the
+// purposes of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+// which will be faster.
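+// (A buffer of constant mid-gray, 128: the variance of a source block against
+// this flat reference measures the source block's own activity.)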
+const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = { + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, + 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4 +}; + +static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 
16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, + 128 * 16, 128 * 16 +}; + +typedef struct { + ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE]; + ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE]; + PARTITION_CONTEXT sa[MAX_MIB_SIZE]; + PARTITION_CONTEXT sl[MAX_MIB_SIZE]; + TXFM_CONTEXT *p_ta; + TXFM_CONTEXT *p_tl; + TXFM_CONTEXT ta[MAX_MIB_SIZE]; + TXFM_CONTEXT tl[MAX_MIB_SIZE]; +} RD_SEARCH_MACROBLOCK_CONTEXT; + +enum { PICK_MODE_RD = 0, PICK_MODE_NONRD }; + +enum { + SB_SINGLE_PASS, // Single pass encoding: all ctxs get updated normally + SB_DRY_PASS, // First pass of multi-pass: does not update the ctxs + SB_WET_PASS // Second pass of multi-pass: finalize and update the ctx +} UENUM1BYTE(SB_MULTI_PASS_MODE); + +// This struct is used to store the statistics used by sb-level multi-pass +// encoding. Currently, this is only used to make a copy of the state before we +// perform the first pass +typedef struct SB_FIRST_PASS_STATS { + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_COUNTS rd_count; + + int split_count; + FRAME_COUNTS fc; + InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; + int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES]; + int current_qindex; + +#if CONFIG_INTERNAL_STATS + unsigned int mode_chosen_counts[MAX_MODES]; +#endif // CONFIG_INTERNAL_STATS +} SB_FIRST_PASS_STATS; + +unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs) { + unsigned int sse; + const unsigned int var = + cpi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +} + +unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { + unsigned int var, sse; + assert(bd == 8 || bd == 10 || bd == 12); + const int off_index = (bd - 8) >> 1; + const uint16_t *high_var_offs[3] = { AV1_HIGH_VAR_OFFS_8, + AV1_HIGH_VAR_OFFS_10, + AV1_HIGH_VAR_OFFS_12 }; + var = + cpi->fn_ptr[bs].vf(ref->buf, ref->stride, + CONVERT_TO_BYTEPTR(high_var_offs[off_index]), 0, &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +} + +static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi, + const struct buf_2d *ref, + int mi_row, int mi_col, + BLOCK_SIZE bs) { + unsigned int sse, var; + uint8_t *last_y; + const YV12_BUFFER_CONFIG *last = + get_ref_frame_yv12_buf(&cpi->common, LAST_FRAME); + + assert(last != NULL); + last_y = + &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE]; + var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse); + return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); +} + +static BLOCK_SIZE get_rd_var_based_fixed_partition(AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col) { + unsigned int var = get_sby_perpixel_diff_variance( + cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64); + if (var < 8) + return BLOCK_64X64; + else if (var < 128) + return BLOCK_32X32; + else if (var < 2048) + return BLOCK_16X16; + else + return BLOCK_8X8; +} + +static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) { + 
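+  // The multiplier tracks the quantizer actually in effect for this
+  // superblock: base qindex plus the superblock's delta-q and the luma DC
+  // delta.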
const AV1_COMMON *const cm = &cpi->common; + const CommonQuantParams *quant_params = &cm->quant_params; + return av1_compute_rd_mult(cpi, quant_params->base_qindex + xd->delta_qindex + + quant_params->y_dc_delta_q); +} + +static AOM_INLINE void set_ssim_rdmult(const AV1_COMP *const cpi, + MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult) { + const AV1_COMMON *const cm = &cpi->common; + + const int bsize_base = BLOCK_16X16; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + int row, col; + double num_of_mi = 0.0; + double geom_mean_of_scale = 0.0; + + assert(cpi->oxcf.tuning == AOM_TUNE_SSIM); + + aom_clear_system_state(); + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->ssim_rdmult_scaling_factors[index]); + num_of_mi += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); + *rdmult = AOMMAX(*rdmult, 0); + set_error_per_bit(x, *rdmult); + aom_clear_system_state(); +} + +static int get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int orig_rdmult) { + const AV1_COMMON *const cm = &cpi->common; + assert(IMPLIES(cpi->gf_group.size > 0, + cpi->gf_group.index < cpi->gf_group.size)); + const int tpl_idx = cpi->gf_group.index; + const TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx]; + MACROBLOCKD *const xd = &x->e_mbd; + const int deltaq_rdmult = set_deltaq_rdmult(cpi, xd); + if (tpl_frame->is_valid == 0) return deltaq_rdmult; + if (!is_frame_tpl_eligible((AV1_COMP *)cpi)) return deltaq_rdmult; + if (tpl_idx >= MAX_LAG_BUFFERS) return deltaq_rdmult; + if (cpi->superres_mode != SUPERRES_NONE) return deltaq_rdmult; + if (cpi->oxcf.aq_mode != NO_AQ) return deltaq_rdmult; + + const int bsize_base = BLOCK_16X16; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + int row, col; + double base_block_count = 0.0; + double geom_mean_of_scale = 0.0; + aom_clear_system_state(); + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->tpl_sb_rdmult_scaling_factors[index]); + base_block_count += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / base_block_count); + int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5); + rdmult = AOMMAX(rdmult, 0); + set_error_per_bit(x, rdmult); + aom_clear_system_state(); + if (bsize == cm->seq_params.sb_size) { + const 
int rdmult_sb = set_deltaq_rdmult(cpi, xd); + assert(rdmult_sb == rdmult); + (void)rdmult_sb; + } + return rdmult; +} + +static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + int8_t segment_id) { + const AV1_COMMON *const cm = &cpi->common; + av1_init_plane_quantizers(cpi, x, segment_id); + aom_clear_system_state(); + const int segment_qindex = + av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex); + return av1_compute_rd_mult(cpi, + segment_qindex + cm->quant_params.y_dc_delta_q); +} + +static AOM_INLINE void setup_block_rdmult(const AV1_COMP *const cpi, + MACROBLOCK *const x, int mi_row, + int mi_col, BLOCK_SIZE bsize, + AQ_MODE aq_mode, MB_MODE_INFO *mbmi) { + x->rdmult = cpi->rd.RDMULT; + + if (aq_mode != NO_AQ) { + assert(mbmi != NULL); + if (aq_mode == VARIANCE_AQ) { + if (cpi->vaq_refresh) { + const int energy = bsize <= BLOCK_16X16 + ? x->mb_energy + : av1_log_block_var(cpi, x, bsize); + mbmi->segment_id = energy; + } + x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); + } else if (aq_mode == COMPLEXITY_AQ) { + x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id); + } else if (aq_mode == CYCLIC_REFRESH_AQ) { + // If segment is boosted, use rdmult for that segment. + if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) + x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + } + } + + const AV1_COMMON *const cm = &cpi->common; + if (cm->delta_q_info.delta_q_present_flag && + !cpi->sf.rt_sf.use_nonrd_pick_mode) { + x->rdmult = get_hier_tpl_rdmult(cpi, x, bsize, mi_row, mi_col, x->rdmult); + } + + if (cpi->oxcf.tuning == AOM_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } +#if CONFIG_TUNE_VMAF + if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) { + av1_set_vmaf_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } +#endif +} + +static AOM_INLINE void set_offsets_without_segment_id( + const AV1_COMP *const cpi, const TileInfo *const tile, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + assert(bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + + set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, + mi_row, mi_col); + + set_entropy_context(xd, mi_row, mi_col, num_planes); + xd->above_txfm_context = cm->above_contexts.txfm[tile->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + + // Set up destination pointers. + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); + + // Set up limit values for MV components. + // Mv beyond the range do not produce new/different prediction block. + av1_set_mv_limits(&cm->mi_params, &x->mv_limits, mi_row, mi_col, mi_height, + mi_width, cpi->oxcf.border_in_pixels); + + set_plane_n4(xd, mi_width, mi_height, num_planes); + + // Set up distance of MB to edge of frame in 1/8th pel units. + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, + cm->mi_params.mi_rows, cm->mi_params.mi_cols); + + // Set up source buffers. 
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); + + // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs() + xd->tile = *tile; +} + +static AOM_INLINE void set_offsets(const AV1_COMP *const cpi, + const TileInfo *const tile, + MACROBLOCK *const x, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; + + set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + + // Setup segment ID. + mbmi = xd->mi[0]; + mbmi->segment_id = 0; + if (seg->enabled) { + if (seg->enabled && !cpi->vaq_refresh) { + const uint8_t *const map = + seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; + mbmi->segment_id = + map ? get_segment_id(&cm->mi_params, map, bsize, mi_row, mi_col) : 0; + } + av1_init_plane_quantizers(cpi, x, mbmi->segment_id); + } +} + +static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts, + const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + int dir; + for (dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); + ++counts->switchable_interp[ctx][filter]; + } +} + +static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd, + const MB_MODE_INFO *mbmi) { + int dir; + for (dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + InterpFilter filter = av1_extract_interp_filter(mbmi->interp_filters, dir); + update_cdf(xd->tile_ctx->switchable_interp_cdf[ctx], filter, + SWITCHABLE_FILTERS); + } +} + +static AOM_INLINE void update_global_motion_used(PREDICTION_MODE mode, + BLOCK_SIZE bsize, + const MB_MODE_INFO *mbmi, + RD_COUNTS *rdc) { + if (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) { + const int num_4x4s = mi_size_wide[bsize] * mi_size_high[bsize]; + int ref; + for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + rdc->global_motion_used[mbmi->ref_frame[ref]] += num_4x4s; + } + } +} + +static AOM_INLINE void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi, + const TX_MODE tx_mode) { + MACROBLOCKD *const xd = &x->e_mbd; + if (xd->lossless[mbmi->segment_id]) { + mbmi->tx_size = TX_4X4; + } else if (tx_mode != TX_MODE_SELECT) { + mbmi->tx_size = tx_size_from_tx_mode(mbmi->sb_type, tx_mode); + } else { + BLOCK_SIZE bsize = mbmi->sb_type; + TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize); + mbmi->tx_size = (TX_SIZE)TXSIZEMAX(mbmi->tx_size, min_tx_size); + } + if (is_inter_block(mbmi)) { + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + } + const int stride = xd->tx_type_map_stride; + const int bw = mi_size_wide[mbmi->sb_type]; + for (int row = 0; row < mi_size_high[mbmi->sb_type]; ++row) { + memset(xd->tx_type_map + row * stride, DCT_DCT, + bw * sizeof(xd->tx_type_map[0])); + } + av1_zero(x->blk_skip); + x->force_skip = 0; +} + +// This function will copy the best reference mode information from +// MB_MODE_INFO_EXT_FRAME to MB_MODE_INFO_EXT. 
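+// Only the entries for the given ref_frame_type are restored; the frame-level
+// struct keeps just the winning reference's MV stack (plus global MVs) to
+// save memory.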
+static INLINE void copy_mbmi_ext_frame_to_mbmi_ext( + MB_MODE_INFO_EXT *mbmi_ext, + const MB_MODE_INFO_EXT_FRAME *const mbmi_ext_best, uint8_t ref_frame_type) { + memcpy(mbmi_ext->ref_mv_stack[ref_frame_type], mbmi_ext_best->ref_mv_stack, + sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); + memcpy(mbmi_ext->weight[ref_frame_type], mbmi_ext_best->weight, + sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); + mbmi_ext->mode_context[ref_frame_type] = mbmi_ext_best->mode_context; + mbmi_ext->ref_mv_count[ref_frame_type] = mbmi_ext_best->ref_mv_count; + memcpy(mbmi_ext->global_mvs, mbmi_ext_best->global_mvs, + sizeof(mbmi_ext->global_mvs)); +} + +static AOM_INLINE void update_state(const AV1_COMP *const cpi, ThreadData *td, + const PICK_MODE_CONTEXT *const ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RUN_TYPE dry_run) { + int i, x_idx, y; + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + RD_COUNTS *const rdc = &td->rd_counts; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const MB_MODE_INFO *const mi = &ctx->mic; + MB_MODE_INFO *const mi_addr = xd->mi[0]; + const struct segmentation *const seg = &cm->seg; + const int bw = mi_size_wide[mi->sb_type]; + const int bh = mi_size_high[mi->sb_type]; + const int mis = mi_params->mi_stride; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + + assert(mi->sb_type == bsize); + + *mi_addr = *mi; + copy_mbmi_ext_frame_to_mbmi_ext(x->mbmi_ext, &ctx->mbmi_ext_best, + av1_ref_frame_type(ctx->mic.ref_frame)); + + memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + + x->force_skip = ctx->rd_stats.skip; + + xd->tx_type_map = ctx->tx_type_map; + xd->tx_type_map_stride = mi_size_wide[bsize]; + // If not dry_run, copy the transform type data into the frame level buffer. + // Encoder will fetch tx types when writing bitstream. + if (!dry_run) { + const int grid_idx = get_mi_grid_idx(mi_params, mi_row, mi_col); + uint8_t *const tx_type_map = mi_params->tx_type_map + grid_idx; + const int mi_stride = mi_params->mi_stride; + for (int blk_row = 0; blk_row < bh; ++blk_row) { + av1_copy_array(tx_type_map + blk_row * mi_stride, + xd->tx_type_map + blk_row * xd->tx_type_map_stride, bw); + } + xd->tx_type_map = tx_type_map; + xd->tx_type_map_stride = mi_stride; + } + + // If segmentation in use + if (seg->enabled) { + // For in frame complexity AQ copy the segment id from the segment map. + if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + const uint8_t *const map = + seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; + mi_addr->segment_id = + map ? get_segment_id(mi_params, map, bsize, mi_row, mi_col) : 0; + reset_tx_size(x, mi_addr, x->tx_mode_search_type); + } + // Else for cyclic refresh mode update the segment map, set the segment id + // and then update the quantizer. 
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+      av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize,
+                                        ctx->rd_stats.rate, ctx->rd_stats.dist,
+                                        x->force_skip);
+    }
+    if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
+      mi_addr->uv_mode = UV_DC_PRED;
+  }
+
+  for (i = 0; i < num_planes; ++i) {
+    p[i].coeff = ctx->coeff[i];
+    p[i].qcoeff = ctx->qcoeff[i];
+    pd[i].dqcoeff = ctx->dqcoeff[i];
+    p[i].eobs = ctx->eobs[i];
+    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  }
+  for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+  // Restore the coding context of the MB to that which was in place
+  // when the mode was picked for it.
+  for (y = 0; y < mi_height; y++) {
+    for (x_idx = 0; x_idx < mi_width; x_idx++) {
+      if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
+          (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+        xd->mi[x_idx + y * mis] = mi_addr;
+      }
+    }
+  }
+
+  if (cpi->oxcf.aq_mode) av1_init_plane_quantizers(cpi, x, mi_addr->segment_id);
+
+  if (dry_run) return;
+
+#if CONFIG_INTERNAL_STATS
+  {
+    unsigned int *const mode_chosen_counts =
+        (unsigned int *)cpi->mode_chosen_counts;  // Cast const away.
+    if (frame_is_intra_only(cm)) {
+      static const int kf_mode_index[] = {
+        THR_DC /*DC_PRED*/,
+        THR_V_PRED /*V_PRED*/,
+        THR_H_PRED /*H_PRED*/,
+        THR_D45_PRED /*D45_PRED*/,
+        THR_D135_PRED /*D135_PRED*/,
+        THR_D113_PRED /*D113_PRED*/,
+        THR_D157_PRED /*D157_PRED*/,
+        THR_D203_PRED /*D203_PRED*/,
+        THR_D67_PRED /*D67_PRED*/,
+        THR_SMOOTH,   /*SMOOTH_PRED*/
+        THR_SMOOTH_V, /*SMOOTH_V_PRED*/
+        THR_SMOOTH_H, /*SMOOTH_H_PRED*/
+        THR_PAETH /*PAETH_PRED*/,
+      };
+      ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
+    } else {
+      // Note how often each mode is chosen as best.
+      ++mode_chosen_counts[ctx->best_mode_index];
+    }
+  }
+#endif
+  if (!frame_is_intra_only(cm)) {
+    if (is_inter_block(mi_addr)) {
+      // TODO(sarahparker): global motion stats need to be handled per-tile
+      // to be compatible with tile-based threading.
+      update_global_motion_used(mi_addr->mode, bsize, mi_addr, rdc);
+    }
+
+    if (cm->features.interp_filter == SWITCHABLE &&
+        mi_addr->motion_mode != WARPED_CAUSAL &&
+        !is_nontrans_global_motion(xd, xd->mi[0])) {
+      update_filter_type_count(td->counts, xd, mi_addr);
+    }
+
+    rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+    rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+    rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+  }
+
+  const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col);
+  const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row);
+  if (cm->seq_params.order_hint_info.enable_ref_frame_mvs)
+    av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+}
+
+void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col, const int num_planes,
+                          BLOCK_SIZE bsize) {
+  // Set current frame pointer.
+  x->e_mbd.cur_buf = src;
+
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
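+  // Editorial sketch (assumed helper, not the libaom API, compiled out):
+  // setup_pred_plane() below resolves a mi position to a per-plane pixel
+  // pointer; the essential arithmetic is mi units -> pixels (MI_SIZE == 4),
+  // then a shift by the plane's chroma subsampling, e.g.:
+#if 0
+  static const uint8_t *sketch_plane_ptr(const uint8_t *buf, int stride,
+                                         int mi_row, int mi_col, int ss_x,
+                                         int ss_y) {
+    const int x = (mi_col * 4) >> ss_x;  // mi units -> pixels -> plane coords
+    const int y = (mi_row * 4) >> ss_y;
+    return buf + y * stride + x;
+  }
+#endif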
+ for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); i++) { + const int is_uv = i > 0; + setup_pred_plane( + &x->plane[i].src, bsize, src->buffers[i], src->crop_widths[is_uv], + src->crop_heights[is_uv], src->strides[is_uv], mi_row, mi_col, NULL, + x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y); + } +} + +static EdgeInfo edge_info(const struct buf_2d *ref, const BLOCK_SIZE bsize, + const bool high_bd, const int bd) { + const int width = block_size_wide[bsize]; + const int height = block_size_high[bsize]; + // Implementation requires width to be a multiple of 8. It also requires + // height to be a multiple of 4, but this is always the case. + assert(height % 4 == 0); + if (width % 8 != 0) { + EdgeInfo ei = { .magnitude = 0, .x = 0, .y = 0 }; + return ei; + } + return av1_edge_exists(ref->buf, ref->stride, width, height, high_bd, bd); +} + +static int use_pb_simple_motion_pred_sse(const AV1_COMP *const cpi) { + // TODO(debargha, yuec): Not in use, need to implement a speed feature + // utilizing this data point, and replace '0' by the corresponding speed + // feature flag. + return 0 && !frame_is_intra_only(&cpi->common); +} + +static void hybrid_intra_mode_search(AV1_COMP *cpi, MACROBLOCK *const x, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx) { + // TODO(jianj): Investigate the failure of ScalabilityTest in AOM_Q mode, + // which sets base_qindex to 0 on keyframe. + if (cpi->oxcf.rc_mode != AOM_CBR || !cpi->sf.rt_sf.hybrid_intra_pickmode || + bsize < BLOCK_16X16) + av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + else + av1_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); +} + +static AOM_INLINE void pick_sb_modes(AV1_COMP *const cpi, + TileDataEnc *tile_data, + MACROBLOCK *const x, int mi_row, + int mi_col, RD_STATS *rd_cost, + PARTITION_TYPE partition, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, RD_STATS best_rd, + int pick_mode_type) { + if (best_rd.rdcost < 0) { + ctx->rd_stats.rdcost = INT64_MAX; + ctx->rd_stats.skip = 0; + av1_invalid_rd_stats(rd_cost); + return; + } + + set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize); + + if (ctx->rd_mode_is_ready) { + assert(ctx->mic.sb_type == bsize); + assert(ctx->mic.partition == partition); + rd_cost->rate = ctx->rd_stats.rate; + rd_cost->dist = ctx->rd_stats.dist; + rd_cost->rdcost = ctx->rd_stats.rdcost; + return; + } + + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + const AQ_MODE aq_mode = cpi->oxcf.aq_mode; + int i; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_sb_modes_time); +#endif + + aom_clear_system_state(); + + mbmi = xd->mi[0]; + mbmi->sb_type = bsize; + mbmi->partition = partition; + +#if CONFIG_RD_DEBUG + mbmi->mi_row = mi_row; + mbmi->mi_col = mi_col; +#endif + + xd->tx_type_map = x->tx_type_map; + xd->tx_type_map_stride = mi_size_wide[bsize]; + + for (i = 0; i < num_planes; ++i) { + p[i].coeff = ctx->coeff[i]; + p[i].qcoeff = ctx->qcoeff[i]; + pd[i].dqcoeff = ctx->dqcoeff[i]; + p[i].eobs = ctx->eobs[i]; + p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; + } + + for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; + + ctx->skippable = 0; + // Set to zero to make sure we do not use the previous encoded frame stats + mbmi->skip = 0; + // Reset skip mode flag. 
+ mbmi->skip_mode = 0; + + if (is_cur_buf_hbd(xd)) { + x->source_variance = av1_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + x->source_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } + if (use_pb_simple_motion_pred_sse(cpi)) { + const FULLPEL_MV start_mv = kZeroFullMv; + unsigned int var = 0; + av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, start_mv, 0, + &x->simple_motion_pred_sse, &var); + } + + // If the threshold for disabling wedge search is zero, it means the feature + // should not be used. Use a value that will always succeed in the check. + if (cpi->sf.inter_sf.disable_wedge_search_edge_thresh == 0) { + x->edge_strength = UINT16_MAX; + x->edge_strength_x = UINT16_MAX; + x->edge_strength_y = UINT16_MAX; + } else { + EdgeInfo ei = + edge_info(&x->plane[0].src, bsize, is_cur_buf_hbd(xd), xd->bd); + x->edge_strength = ei.magnitude; + x->edge_strength_x = ei.x; + x->edge_strength_y = ei.y; + } + + // Initialize default mode evaluation params + set_mode_eval_params(cpi, x, DEFAULT_EVAL); + + // Save rdmult before it might be changed, so it can be restored later. + const int orig_rdmult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode, mbmi); + // Set error per bit for current rdmult + set_error_per_bit(x, x->rdmult); + av1_rd_cost_update(x->rdmult, &best_rd); + + // Find best coding mode & reconstruct the MB so it is available + // as a predictor for MBs that follow in the SB + if (frame_is_intra_only(cm)) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_rd_pick_intra_mode_sb_time); +#endif + switch (pick_mode_type) { + case PICK_MODE_RD: + av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd.rdcost); + break; + case PICK_MODE_NONRD: + hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx); + break; + default: assert(0 && "Unknown pick mode type."); + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_rd_pick_intra_mode_sb_time); +#endif + } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_rd_pick_inter_mode_sb_time); +#endif + if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, + rd_cost, bsize, ctx, best_rd.rdcost); + } else { + // TODO(kyslov): do the same for pick_inter_mode_sb_seg_skip + switch (pick_mode_type) { + case PICK_MODE_RD: + av1_rd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx, + best_rd.rdcost); + break; + case PICK_MODE_NONRD: + av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx, + best_rd.rdcost); + break; + default: assert(0 && "Unknown pick mode type."); + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_rd_pick_inter_mode_sb_time); +#endif + } + + // Examine the resulting rate and for AQ mode 2 make a segment choice. + if (rd_cost->rate != INT_MAX && aq_mode == COMPLEXITY_AQ && + bsize >= BLOCK_16X16) { + av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); + } + + x->rdmult = orig_rdmult; + + // TODO(jingning) The rate-distortion optimization flow needs to be + // refactored to provide proper exit/return handle. 
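+  // Editorial note: a failed search is signalled with sentinel stats
+  // (rate == INT_MAX, rdcost == INT64_MAX) rather than an error path, as the
+  // TODO above observes. A minimal sketch of the convention, with lambda
+  // scaling and rounding simplified (compiled out):
+#if 0
+  static int64_t sketch_rdcost(int rdmult, int rate, int64_t dist) {
+    if (rate == INT_MAX) return INT64_MAX;  // invalid: loses every comparison
+    return (int64_t)rdmult * rate + dist;   // simplified, no rounding shift
+  }
+#endif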
+ if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX; + + ctx->rd_stats.rate = rd_cost->rate; + ctx->rd_stats.dist = rd_cost->dist; + ctx->rd_stats.rdcost = rd_cost->rdcost; + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_sb_modes_time); +#endif +} + +static AOM_INLINE void update_inter_mode_stats(FRAME_CONTEXT *fc, + FRAME_COUNTS *counts, + PREDICTION_MODE mode, + int16_t mode_context) { + (void)counts; + + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + if (mode == NEWMV) { +#if CONFIG_ENTROPY_STATS + ++counts->newmv_mode[mode_ctx][0]; +#endif + update_cdf(fc->newmv_cdf[mode_ctx], 0, 2); + return; + } + +#if CONFIG_ENTROPY_STATS + ++counts->newmv_mode[mode_ctx][1]; +#endif + update_cdf(fc->newmv_cdf[mode_ctx], 1, 2); + + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + if (mode == GLOBALMV) { +#if CONFIG_ENTROPY_STATS + ++counts->zeromv_mode[mode_ctx][0]; +#endif + update_cdf(fc->zeromv_cdf[mode_ctx], 0, 2); + return; + } + +#if CONFIG_ENTROPY_STATS + ++counts->zeromv_mode[mode_ctx][1]; +#endif + update_cdf(fc->zeromv_cdf[mode_ctx], 1, 2); + + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; +#if CONFIG_ENTROPY_STATS + ++counts->refmv_mode[mode_ctx][mode != NEARESTMV]; +#endif + update_cdf(fc->refmv_cdf[mode_ctx], mode != NEARESTMV, 2); +} + +static AOM_INLINE void update_palette_cdf(MACROBLOCKD *xd, + const MB_MODE_INFO *const mbmi, + FRAME_COUNTS *counts) { + FRAME_CONTEXT *fc = xd->tile_ctx; + const BLOCK_SIZE bsize = mbmi->sb_type; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int palette_bsize_ctx = av1_get_palette_bsize_ctx(bsize); + + (void)counts; + + if (mbmi->mode == DC_PRED) { + const int n = pmi->palette_size[0]; + const int palette_mode_ctx = av1_get_palette_mode_ctx(xd); + +#if CONFIG_ENTROPY_STATS + ++counts->palette_y_mode[palette_bsize_ctx][palette_mode_ctx][n > 0]; +#endif + update_cdf(fc->palette_y_mode_cdf[palette_bsize_ctx][palette_mode_ctx], + n > 0, 2); + if (n > 0) { +#if CONFIG_ENTROPY_STATS + ++counts->palette_y_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; +#endif + update_cdf(fc->palette_y_size_cdf[palette_bsize_ctx], + n - PALETTE_MIN_SIZE, PALETTE_SIZES); + } + } + + if (mbmi->uv_mode == UV_DC_PRED) { + const int n = pmi->palette_size[1]; + const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0); + +#if CONFIG_ENTROPY_STATS + ++counts->palette_uv_mode[palette_uv_mode_ctx][n > 0]; +#endif + update_cdf(fc->palette_uv_mode_cdf[palette_uv_mode_ctx], n > 0, 2); + + if (n > 0) { +#if CONFIG_ENTROPY_STATS + ++counts->palette_uv_size[palette_bsize_ctx][n - PALETTE_MIN_SIZE]; +#endif + update_cdf(fc->palette_uv_size_cdf[palette_bsize_ctx], + n - PALETTE_MIN_SIZE, PALETTE_SIZES); + } + } +} + +static AOM_INLINE void sum_intra_stats(const AV1_COMMON *const cm, + FRAME_COUNTS *counts, MACROBLOCKD *xd, + const MB_MODE_INFO *const mbmi, + const MB_MODE_INFO *above_mi, + const MB_MODE_INFO *left_mi, + const int intraonly) { + FRAME_CONTEXT *fc = xd->tile_ctx; + const PREDICTION_MODE y_mode = mbmi->mode; + (void)counts; + const BLOCK_SIZE bsize = mbmi->sb_type; + + if (intraonly) { +#if CONFIG_ENTROPY_STATS + const PREDICTION_MODE above = av1_above_block_mode(above_mi); + const PREDICTION_MODE left = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[above]; + const int left_ctx = intra_mode_context[left]; + ++counts->kf_y_mode[above_ctx][left_ctx][y_mode]; +#endif // CONFIG_ENTROPY_STATS + update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES); + } 
else { +#if CONFIG_ENTROPY_STATS + ++counts->y_mode[size_group_lookup[bsize]][y_mode]; +#endif // CONFIG_ENTROPY_STATS + update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES); + } + + if (av1_filter_intra_allowed(cm, mbmi)) { + const int use_filter_intra_mode = + mbmi->filter_intra_mode_info.use_filter_intra; +#if CONFIG_ENTROPY_STATS + ++counts->filter_intra[mbmi->sb_type][use_filter_intra_mode]; + if (use_filter_intra_mode) { + ++counts + ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode]; + } +#endif // CONFIG_ENTROPY_STATS + update_cdf(fc->filter_intra_cdfs[mbmi->sb_type], use_filter_intra_mode, 2); + if (use_filter_intra_mode) { + update_cdf(fc->filter_intra_mode_cdf, + mbmi->filter_intra_mode_info.filter_intra_mode, + FILTER_INTRA_MODES); + } + } + if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->angle_delta[mbmi->mode - V_PRED] + [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA]; +#endif + update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED], + mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA, + 2 * MAX_ANGLE_DELTA + 1); + } + + if (!xd->is_chroma_ref) return; + + const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode; + const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd); +#if CONFIG_ENTROPY_STATS + ++counts->uv_mode[cfl_allowed][y_mode][uv_mode]; +#endif // CONFIG_ENTROPY_STATS + update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode, + UV_INTRA_MODES - !cfl_allowed); + if (uv_mode == UV_CFL_PRED) { + const int8_t joint_sign = mbmi->cfl_alpha_signs; + const uint8_t idx = mbmi->cfl_alpha_idx; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_sign[joint_sign]; +#endif + update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS); + if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)]; +#endif + update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE); + } + if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) { + aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + +#if CONFIG_ENTROPY_STATS + ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)]; +#endif + update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE); + } + } + if (av1_is_directional_mode(get_uv_mode(uv_mode)) && + av1_use_angle_delta(bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->angle_delta[uv_mode - UV_V_PRED] + [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA]; +#endif + update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED], + mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA, + 2 * MAX_ANGLE_DELTA + 1); + } + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { + update_palette_cdf(xd, mbmi, counts); + } +} + +static AOM_INLINE void update_stats(const AV1_COMMON *const cm, + ThreadData *td) { + MACROBLOCK *x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const CurrentFrame *const current_frame = &cm->current_frame; + const BLOCK_SIZE bsize = mbmi->sb_type; + FRAME_CONTEXT *fc = xd->tile_ctx; + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + + if (current_frame->skip_mode_info.skip_mode_flag && !seg_ref_active && + is_comp_ref_allowed(bsize)) { + const int skip_mode_ctx = av1_get_skip_mode_context(xd); +#if CONFIG_ENTROPY_STATS + td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++; 
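+    // Note (editorial): the raw counters kept under CONFIG_ENTROPY_STATS feed
+    // offline statistics tooling only; the update_cdf() calls are what adapt
+    // the symbol probabilities actually used by the entropy coder.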
+#endif + update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2); + } + + if (!mbmi->skip_mode && !seg_ref_active) { + const int skip_ctx = av1_get_skip_context(xd); +#if CONFIG_ENTROPY_STATS + td->counts->skip[skip_ctx][mbmi->skip]++; +#endif + update_cdf(fc->skip_cdfs[skip_ctx], mbmi->skip, 2); + } + +#if CONFIG_ENTROPY_STATS + // delta quant applies to both intra and inter + const int super_block_upper_left = + ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) && + ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0); + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + if (delta_q_info->delta_q_present_flag && + (bsize != cm->seq_params.sb_size || !mbmi->skip) && + super_block_upper_left) { + const int dq = + (mbmi->current_qindex - xd->current_qindex) / delta_q_info->delta_q_res; + const int absdq = abs(dq); + for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) { + td->counts->delta_q[i][1]++; + } + if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++; + if (delta_q_info->delta_lf_present_flag) { + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + const int delta_lf = (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / + delta_q_info->delta_lf_res; + const int abs_delta_lf = abs(delta_lf); + for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { + td->counts->delta_lf_multi[lf_id][i][1]++; + } + if (abs_delta_lf < DELTA_LF_SMALL) + td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++; + } + } else { + const int delta_lf = + (mbmi->delta_lf_from_base - xd->delta_lf_from_base) / + delta_q_info->delta_lf_res; + const int abs_delta_lf = abs(delta_lf); + for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) { + td->counts->delta_lf[i][1]++; + } + if (abs_delta_lf < DELTA_LF_SMALL) + td->counts->delta_lf[abs_delta_lf][0]++; + } + } + } +#endif + + if (!is_inter_block(mbmi)) { + sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi, + frame_is_intra_only(cm)); + } + + if (av1_allow_intrabc(cm)) { + update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2); +#if CONFIG_ENTROPY_STATS + ++td->counts->intrabc[is_intrabc_block(mbmi)]; +#endif // CONFIG_ENTROPY_STATS + } + + if (frame_is_intra_only(cm) || mbmi->skip_mode) return; + + FRAME_COUNTS *const counts = td->counts; + const int inter_block = is_inter_block(mbmi); + + if (!seg_ref_active) { +#if CONFIG_ENTROPY_STATS + counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++; +#endif + update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)], + inter_block, 2); + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. + if (inter_block) { + const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; + const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1]; + if (current_frame->reference_mode == REFERENCE_MODE_SELECT) { + if (is_comp_ref_allowed(bsize)) { +#if CONFIG_ENTROPY_STATS + counts->comp_inter[av1_get_reference_mode_context(xd)] + [has_second_ref(mbmi)]++; +#endif // CONFIG_ENTROPY_STATS + update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi), 2); + } + } + + if (has_second_ref(mbmi)) { + const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi) + ? 
UNIDIR_COMP_REFERENCE + : BIDIR_COMP_REFERENCE; + update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type, + COMP_REFERENCE_TYPES); +#if CONFIG_ENTROPY_STATS + counts->comp_ref_type[av1_get_comp_reference_type_context(xd)] + [comp_ref_type]++; +#endif // CONFIG_ENTROPY_STATS + + if (comp_ref_type == UNIDIR_COMP_REFERENCE) { + const int bit = (ref0 == BWDREF_FRAME); + update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts + ->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (!bit) { + const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME); + update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2); +#if CONFIG_ENTROPY_STATS + counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1] + [bit1]++; +#endif // CONFIG_ENTROPY_STATS + if (bit1) { + update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd), + ref1 == GOLDEN_FRAME, 2); +#if CONFIG_ENTROPY_STATS + counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)][2] + [ref1 == GOLDEN_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } + } else { + const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME); + update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (!bit) { + update_cdf(av1_get_pred_cdf_comp_ref_p1(xd), ref0 == LAST2_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1] + [ref0 == LAST2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } else { + update_cdf(av1_get_pred_cdf_comp_ref_p2(xd), ref0 == GOLDEN_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2] + [ref0 == GOLDEN_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd), ref1 == ALTREF_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0] + [ref1 == ALTREF_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + if (ref1 != ALTREF_FRAME) { + update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd), + ref1 == ALTREF2_FRAME, 2); +#if CONFIG_ENTROPY_STATS + counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1] + [ref1 == ALTREF2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } + } else { + const int bit = (ref0 >= BWDREF_FRAME); + update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (bit) { + assert(ref0 <= ALTREF_FRAME); + update_cdf(av1_get_pred_cdf_single_ref_p2(xd), ref0 == ALTREF_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1] + [ref0 == ALTREF_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + if (ref0 != ALTREF_FRAME) { + update_cdf(av1_get_pred_cdf_single_ref_p6(xd), + ref0 == ALTREF2_FRAME, 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5] + [ref0 == ALTREF2_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } else { + const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME); + update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++; +#endif // CONFIG_ENTROPY_STATS + if (!bit1) { + update_cdf(av1_get_pred_cdf_single_ref_p4(xd), ref0 != LAST_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3] + 
[ref0 != LAST_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } else { + update_cdf(av1_get_pred_cdf_single_ref_p5(xd), ref0 != LAST3_FRAME, + 2); +#if CONFIG_ENTROPY_STATS + counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4] + [ref0 != LAST3_FRAME]++; +#endif // CONFIG_ENTROPY_STATS + } + } + } + + if (cm->seq_params.enable_interintra_compound && + is_interintra_allowed(mbmi)) { + const int bsize_group = size_group_lookup[bsize]; + if (mbmi->ref_frame[1] == INTRA_FRAME) { +#if CONFIG_ENTROPY_STATS + counts->interintra[bsize_group][1]++; +#endif + update_cdf(fc->interintra_cdf[bsize_group], 1, 2); +#if CONFIG_ENTROPY_STATS + counts->interintra_mode[bsize_group][mbmi->interintra_mode]++; +#endif + update_cdf(fc->interintra_mode_cdf[bsize_group], + mbmi->interintra_mode, INTERINTRA_MODES); + if (av1_is_wedge_used(bsize)) { +#if CONFIG_ENTROPY_STATS + counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++; +#endif + update_cdf(fc->wedge_interintra_cdf[bsize], + mbmi->use_wedge_interintra, 2); + if (mbmi->use_wedge_interintra) { +#if CONFIG_ENTROPY_STATS + counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++; +#endif + update_cdf(fc->wedge_idx_cdf[bsize], mbmi->interintra_wedge_index, + 16); + } + } + } else { +#if CONFIG_ENTROPY_STATS + counts->interintra[bsize_group][0]++; +#endif + update_cdf(fc->interintra_cdf[bsize_group], 0, 2); + } + } + + const MOTION_MODE motion_allowed = + cm->features.switchable_motion_mode + ? motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->features.allow_warped_motion) + : SIMPLE_TRANSLATION; + if (mbmi->ref_frame[1] != INTRA_FRAME) { + if (motion_allowed == WARPED_CAUSAL) { +#if CONFIG_ENTROPY_STATS + counts->motion_mode[bsize][mbmi->motion_mode]++; +#endif + update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode, + MOTION_MODES); + } else if (motion_allowed == OBMC_CAUSAL) { +#if CONFIG_ENTROPY_STATS + counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++; +#endif + update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL, 2); + } + } + + if (has_second_ref(mbmi)) { + assert(current_frame->reference_mode != SINGLE_REFERENCE && + is_inter_compound_mode(mbmi->mode) && + mbmi->motion_mode == SIMPLE_TRANSLATION); + + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + if (masked_compound_used) { + const int comp_group_idx_ctx = get_comp_group_idx_context(xd); +#if CONFIG_ENTROPY_STATS + ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx]; +#endif + update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx], + mbmi->comp_group_idx, 2); + } + + if (mbmi->comp_group_idx == 0) { + const int comp_index_ctx = get_comp_index_context(cm, xd); +#if CONFIG_ENTROPY_STATS + ++counts->compound_index[comp_index_ctx][mbmi->compound_idx]; +#endif + update_cdf(fc->compound_index_cdf[comp_index_ctx], mbmi->compound_idx, + 2); + } else { + assert(masked_compound_used); + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { +#if CONFIG_ENTROPY_STATS + ++counts->compound_type[bsize][mbmi->interinter_comp.type - + COMPOUND_WEDGE]; +#endif + update_cdf(fc->compound_type_cdf[bsize], + mbmi->interinter_comp.type - COMPOUND_WEDGE, + MASKED_COMPOUND_TYPES); + } + } + } + if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { + if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { +#if CONFIG_ENTROPY_STATS + counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++; +#endif + update_cdf(fc->wedge_idx_cdf[bsize], + mbmi->interinter_comp.wedge_index, 16); + } + } + } + } + + if 
(inter_block && cm->features.interp_filter == SWITCHABLE && + mbmi->motion_mode != WARPED_CAUSAL && + !is_nontrans_global_motion(xd, mbmi)) { + update_filter_type_cdf(xd, mbmi); + } + if (inter_block && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const PREDICTION_MODE mode = mbmi->mode; + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); + if (has_second_ref(mbmi)) { +#if CONFIG_ENTROPY_STATS + ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)]; +#endif + update_cdf(fc->inter_compound_mode_cdf[mode_ctx], + INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES); + } else { + update_inter_mode_stats(fc, counts, mode, mode_ctx); + } + + const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV; + if (new_mv) { + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + for (int idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + const uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx, 2); +#if CONFIG_ENTROPY_STATS + ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx]; +#endif + if (mbmi->ref_mv_idx == idx) break; + } + } + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + for (int idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + const uint8_t drl_ctx = + av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + update_cdf(fc->drl_cdf[drl_ctx], mbmi->ref_mv_idx != idx - 1, 2); +#if CONFIG_ENTROPY_STATS + ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1]; +#endif + if (mbmi->ref_mv_idx == idx - 1) break; + } + } + } + if (have_newmv_in_inter_mode(mbmi->mode)) { + const int allow_hp = cm->features.cur_frame_force_integer_mv + ? 
MV_SUBPEL_NONE + : cm->features.allow_high_precision_mv; + if (new_mv) { + for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, + allow_hp); + } + } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAR_NEWMV) { + const int ref = 1; + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, + allow_hp); + } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEW_NEARMV) { + const int ref = 0; + const int_mv ref_mv = av1_get_ref_mv(x, ref); + av1_update_mv_stats(&mbmi->mv[ref].as_mv, &ref_mv.as_mv, &fc->nmvc, + allow_hp); + } + } + } +} + +static AOM_INLINE void restore_context(MACROBLOCK *x, + const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes) { + MACROBLOCKD *xd = &x->e_mbd; + int p; + const int num_4x4_blocks_wide = mi_size_wide[bsize]; + const int num_4x4_blocks_high = mi_size_high[bsize]; + int mi_width = mi_size_wide[bsize]; + int mi_height = mi_size_high[bsize]; + for (p = 0; p < num_planes; p++) { + int tx_col = mi_col; + int tx_row = mi_row & MAX_MIB_MASK; + memcpy( + xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x), + ctx->a + num_4x4_blocks_wide * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); + memcpy(xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y), + ctx->l + num_4x4_blocks_high * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); + } + memcpy(xd->above_partition_context + mi_col, ctx->sa, + sizeof(*xd->above_partition_context) * mi_width); + memcpy(xd->left_partition_context + (mi_row & MAX_MIB_MASK), ctx->sl, + sizeof(xd->left_partition_context[0]) * mi_height); + xd->above_txfm_context = ctx->p_ta; + xd->left_txfm_context = ctx->p_tl; + memcpy(xd->above_txfm_context, ctx->ta, + sizeof(*xd->above_txfm_context) * mi_width); + memcpy(xd->left_txfm_context, ctx->tl, + sizeof(*xd->left_txfm_context) * mi_height); +} + +static AOM_INLINE void save_context(const MACROBLOCK *x, + RD_SEARCH_MACROBLOCK_CONTEXT *ctx, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const int num_planes) { + const MACROBLOCKD *xd = &x->e_mbd; + int p; + int mi_width = mi_size_wide[bsize]; + int mi_height = mi_size_high[bsize]; + + // buffer the above/left context information of the block in search. 
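+  // Note on the shifts below (editorial): a subsampled chroma plane spans the
+  // same mi range with half the entropy-context entries per axis, so the byte
+  // count saved per plane is (sizeof(ENTROPY_CONTEXT) * n) >> subsampling,
+  // e.g. 16 luma entries correspond to 8 chroma entries in 4:2:0.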
+ for (p = 0; p < num_planes; ++p) { + int tx_col = mi_col; + int tx_row = mi_row & MAX_MIB_MASK; + memcpy( + ctx->a + mi_width * p, + xd->above_entropy_context[p] + (tx_col >> xd->plane[p].subsampling_x), + (sizeof(ENTROPY_CONTEXT) * mi_width) >> xd->plane[p].subsampling_x); + memcpy(ctx->l + mi_height * p, + xd->left_entropy_context[p] + (tx_row >> xd->plane[p].subsampling_y), + (sizeof(ENTROPY_CONTEXT) * mi_height) >> xd->plane[p].subsampling_y); + } + memcpy(ctx->sa, xd->above_partition_context + mi_col, + sizeof(*xd->above_partition_context) * mi_width); + memcpy(ctx->sl, xd->left_partition_context + (mi_row & MAX_MIB_MASK), + sizeof(xd->left_partition_context[0]) * mi_height); + memcpy(ctx->ta, xd->above_txfm_context, + sizeof(*xd->above_txfm_context) * mi_width); + memcpy(ctx->tl, xd->left_txfm_context, + sizeof(*xd->left_txfm_context) * mi_height); + ctx->p_ta = xd->above_txfm_context; + ctx->p_tl = xd->left_txfm_context; +} + +static AOM_INLINE void encode_b(const AV1_COMP *const cpi, + TileDataEnc *tile_data, ThreadData *td, + TOKENEXTRA **tp, int mi_row, int mi_col, + RUN_TYPE dry_run, BLOCK_SIZE bsize, + PARTITION_TYPE partition, + PICK_MODE_CONTEXT *const ctx, int *rate) { + TileInfo *const tile = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + + set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + const int origin_mult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL); + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->partition = partition; + update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run); + + if (!dry_run) { + x->mbmi_ext_frame->cb_offset = x->cb_offset; + assert(x->cb_offset < + (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size])); + } + + encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate); + + if (!dry_run) { + const AV1_COMMON *const cm = &cpi->common; + x->cb_offset += block_size_wide[bsize] * block_size_high[bsize]; + if (bsize == cpi->common.seq_params.sb_size && mbmi->skip == 1 && + cm->delta_q_info.delta_lf_present_flag) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; + mbmi->delta_lf_from_base = xd->delta_lf_from_base; + } + if (has_second_ref(mbmi)) { + if (mbmi->compound_idx == 0 || + mbmi->interinter_comp.type == COMPOUND_AVERAGE) + mbmi->comp_group_idx = 0; + else + mbmi->comp_group_idx = 1; + } + + // delta quant applies to both intra and inter + const int super_block_upper_left = + ((mi_row & (cm->seq_params.mib_size - 1)) == 0) && + ((mi_col & (cm->seq_params.mib_size - 1)) == 0); + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + if (delta_q_info->delta_q_present_flag && + (bsize != cm->seq_params.sb_size || !mbmi->skip) && + super_block_upper_left) { + xd->current_qindex = mbmi->current_qindex; + if (delta_q_info->delta_lf_present_flag) { + if (delta_q_info->delta_lf_multi) { + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; + } + } else { + xd->delta_lf_from_base = mbmi->delta_lf_from_base; + } + } + } + + RD_COUNTS *rdc = &td->rd_counts; + if (mbmi->skip_mode) { + assert(!frame_is_intra_only(cm)); + rdc->skip_mode_used_flag = 1; + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + assert(has_second_ref(mbmi)); + rdc->compound_ref_used_flag = 1; + } + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } else { + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active) { + // If the segment reference feature is enabled we have only a single + // reference frame allowed for the segment so exclude it from + // the reference frame counts used to work out probabilities. + if (is_inter_block(mbmi)) { + av1_collect_neighbors_ref_counts(xd); + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + if (has_second_ref(mbmi)) { + // This flag is also updated for 4x4 blocks + rdc->compound_ref_used_flag = 1; + } + } + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + } + } + } + + if (tile_data->allow_update_cdf) update_stats(&cpi->common, td); + + // Gather obmc and warped motion count to update the probability. + if ((!cpi->sf.inter_sf.disable_obmc && + cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) || + (cm->features.allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0)) { + const int inter_block = is_inter_block(mbmi); + const int seg_ref_active = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); + if (!seg_ref_active && inter_block) { + const MOTION_MODE motion_allowed = + cm->features.switchable_motion_mode + ? motion_mode_allowed(xd->global_motion, xd, mbmi, + cm->features.allow_warped_motion) + : SIMPLE_TRANSLATION; + + if (mbmi->ref_frame[1] != INTRA_FRAME) { + if (motion_allowed >= OBMC_CAUSAL) { + td->rd_counts.obmc_used[bsize][mbmi->motion_mode == OBMC_CAUSAL]++; + } + if (motion_allowed == WARPED_CAUSAL) { + td->rd_counts.warped_used[mbmi->motion_mode == WARPED_CAUSAL]++; + } + } + } + } + } + // TODO(Ravi/Remya): Move this copy function to a better logical place + // This function will copy the best mode information from block + // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This + // frame level buffer (cpi->mbmi_ext_info.frame_base) will be used during + // bitstream preparation. + av1_copy_mbmi_ext_to_mbmi_ext_frame(x->mbmi_ext_frame, x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + x->rdmult = origin_mult; +} + +static AOM_INLINE void encode_sb(const AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int mi_row, int mi_col, RUN_TYPE dry_run, + BLOCK_SIZE bsize, PC_TREE *pc_tree, + int *rate) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + assert(bsize < BLOCK_SIZES_ALL); + const int hbs = mi_size_wide[bsize] / 2; + const int is_partition_root = bsize >= BLOCK_8X8; + const int ctx = is_partition_root + ? 
partition_plane_context(xd, mi_row, mi_col, bsize) + : -1; + const PARTITION_TYPE partition = pc_tree->partitioning; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + int quarter_step = mi_size_wide[bsize] / 4; + int i; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return; + + if (!dry_run && ctx >= 0) { + const int has_rows = (mi_row + hbs) < mi_params->mi_rows; + const int has_cols = (mi_col + hbs) < mi_params->mi_cols; + + if (has_rows && has_cols) { +#if CONFIG_ENTROPY_STATS + td->counts->partition[ctx][partition]++; +#endif + + if (tile_data->allow_update_cdf) { + FRAME_CONTEXT *fc = xd->tile_ctx; + update_cdf(fc->partition_cdf[ctx], partition, + partition_cdf_length(bsize)); + } + } + } + + switch (partition) { + case PARTITION_NONE: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->none, rate); + break; + case PARTITION_VERT: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->vertical[0], rate); + if (mi_col + hbs < mi_params->mi_cols) { + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + partition, &pc_tree->vertical[1], rate); + } + break; + case PARTITION_HORZ: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->horizontal[0], rate); + if (mi_row + hbs < mi_params->mi_rows) { + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + partition, &pc_tree->horizontal[1], rate); + } + break; + case PARTITION_SPLIT: + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize, + pc_tree->split[0], rate); + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize, + pc_tree->split[1], rate); + encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize, + pc_tree->split[2], rate); + encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run, + subsize, pc_tree->split[3], rate); + break; + + case PARTITION_HORZ_A: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, + partition, &pc_tree->horizontala[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + partition, &pc_tree->horizontala[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize, + partition, &pc_tree->horizontala[2], rate); + break; + case PARTITION_HORZ_B: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->horizontalb[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + partition, &pc_tree->horizontalb[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, + bsize2, partition, &pc_tree->horizontalb[2], rate); + break; + case PARTITION_VERT_A: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2, + partition, &pc_tree->verticala[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2, + partition, &pc_tree->verticala[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize, + partition, &pc_tree->verticala[2], rate); + + break; + case PARTITION_VERT_B: + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->verticalb[0], rate); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2, + partition, &pc_tree->verticalb[1], rate); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run, + 
bsize2, partition, &pc_tree->verticalb[2], rate); + break; + case PARTITION_HORZ_4: + for (i = 0; i < 4; ++i) { + int this_mi_row = mi_row + i * quarter_step; + if (i > 0 && this_mi_row >= mi_params->mi_rows) break; + + encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize, + partition, &pc_tree->horizontal4[i], rate); + } + break; + case PARTITION_VERT_4: + for (i = 0; i < 4; ++i) { + int this_mi_col = mi_col + i * quarter_step; + if (i > 0 && this_mi_col >= mi_params->mi_cols) break; + encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize, + partition, &pc_tree->vertical4[i], rate); + } + break; + default: assert(0 && "Invalid partition type."); break; + } + + update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); +} + +static AOM_INLINE void set_partial_sb_partition( + const AV1_COMMON *const cm, MB_MODE_INFO *mi, int bh_in, int bw_in, + int mi_rows_remaining, int mi_cols_remaining, BLOCK_SIZE bsize, + MB_MODE_INFO **mib) { + int bh = bh_in; + int r, c; + for (r = 0; r < cm->seq_params.mib_size; r += bh) { + int bw = bw_in; + for (c = 0; c < cm->seq_params.mib_size; c += bw) { + const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c); + const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c); + mib[grid_index] = mi + mi_index; + mib[grid_index]->sb_type = find_partition_size( + bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw); + } + } +} + +// This function attempts to set all mode info entries in a given superblock +// to the same block partition size. +// However, at the bottom and right borders of the image the requested size +// may not be allowed in which case this code attempts to choose the largest +// allowable partition. +static AOM_INLINE void set_fixed_partitioning(AV1_COMP *cpi, + const TileInfo *const tile, + MB_MODE_INFO **mib, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int mi_rows_remaining = tile->mi_row_end - mi_row; + const int mi_cols_remaining = tile->mi_col_end - mi_col; + MB_MODE_INFO *const mi_upper_left = + mi_params->mi_alloc + get_alloc_mi_idx(mi_params, mi_row, mi_col); + int bh = mi_size_high[bsize]; + int bw = mi_size_wide[bsize]; + + assert(bsize >= mi_params->mi_alloc_bsize && + "Attempted to use bsize < mi_params->mi_alloc_bsize"); + assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0)); + + // Apply the requested partition size to the SB if it is all "in image" + if ((mi_cols_remaining >= cm->seq_params.mib_size) && + (mi_rows_remaining >= cm->seq_params.mib_size)) { + for (int block_row = 0; block_row < cm->seq_params.mib_size; + block_row += bh) { + for (int block_col = 0; block_col < cm->seq_params.mib_size; + block_col += bw) { + const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col); + const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col); + mib[grid_index] = mi_upper_left + mi_index; + mib[grid_index]->sb_type = bsize; + } + } + } else { + // Else this is a partial SB. 
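+    // Editorial sketch (not the real find_partition_size(), compiled out):
+    // the call below walks the grid assigning, per position, the largest size
+    // whose mi footprint still fits the remaining rows/cols. Assuming square
+    // sizes only and MI_SIZE == 4, the fitting rule is roughly:
+#if 0
+    static BLOCK_SIZE sketch_fit(int mi_rows_left, int mi_cols_left) {
+      const int m = AOMMIN(mi_rows_left, mi_cols_left);
+      if (m >= 32) return BLOCK_128X128;  // 128 / MI_SIZE == 32 mi units
+      if (m >= 16) return BLOCK_64X64;
+      if (m >= 8) return BLOCK_32X32;
+      if (m >= 4) return BLOCK_16X16;
+      return BLOCK_8X8;
+    }
+#endif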
+    set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining,
+                             mi_cols_remaining, bsize, mib);
+  }
+}
+
+static AOM_INLINE void rd_use_partition(
+    AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MB_MODE_INFO **mib,
+    TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate,
+    int64_t *dist, int do_recon, PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int num_planes = av1_num_planes(cm);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  int i;
+  const int pl = (bsize >= BLOCK_8X8)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+  RD_STATS last_part_rdc, none_rdc, chosen_rdc, invalid_rdc;
+  BLOCK_SIZE sub_subsize = BLOCK_4X4;
+  int splits_below = 0;
+  BLOCK_SIZE bs_type = mib[0]->sb_type;
+  PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  av1_invalid_rd_stats(&last_part_rdc);
+  av1_invalid_rd_stats(&none_rdc);
+  av1_invalid_rd_stats(&chosen_rdc);
+  av1_invalid_rd_stats(&invalid_rdc);
+
+  pc_tree->partitioning = partition;
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+    set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
+  }
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
+      (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 2 ||
+       (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 &&
+        cm->quant_params.base_qindex > 190 && bsize <= BLOCK_32X32 &&
+        !frame_is_intra_only(cm)))) {
+    // Check if any of the sub blocks are further split.
+    if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
+      sub_subsize = get_partition_subsize(subsize, PARTITION_SPLIT);
+      splits_below = 1;
+      for (i = 0; i < 4; i++) {
+        int jj = i >> 1, ii = i & 0x01;
+        MB_MODE_INFO *this_mi = mib[jj * hbs * mi_params->mi_stride + ii * hbs];
+        if (this_mi && this_mi->sb_type >= sub_subsize) {
+          splits_below = 0;
+        }
+      }
+    }
+
+    // If the partition is not none, try none unless each of the 4 splits is
+    // split even further.
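+    // Editorial note: the PARTITION_NONE probe below is skipped only when
+    // splits_below stayed 1, i.e. all four quadrants themselves chose sizes
+    // below sub_subsize, in which case re-merging is very unlikely to win.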
+ if (partition != PARTITION_NONE && !splits_below && + mi_row + hbs < mi_params->mi_rows && + mi_col + hbs < mi_params->mi_cols) { + pc_tree->partitioning = PARTITION_NONE; + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, + PARTITION_NONE, bsize, ctx_none, invalid_rdc, PICK_MODE_RD); + + if (none_rdc.rate < INT_MAX) { + none_rdc.rate += x->partition_cost[pl][PARTITION_NONE]; + none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); + } + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + mib[0]->sb_type = bs_type; + pc_tree->partitioning = partition; + } + } + + switch (partition) { + case PARTITION_NONE: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_NONE, bsize, ctx_none, invalid_rdc, PICK_MODE_RD); + break; + case PARTITION_HORZ: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_HORZ, subsize, &pc_tree->horizontal[0], + invalid_rdc, PICK_MODE_RD); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_row + hbs < mi_params->mi_rows) { + RD_STATS tmp_rdc; + const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0]; + av1_init_rd_stats(&tmp_rdc); + update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, + NULL); + pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, + PARTITION_HORZ, subsize, &pc_tree->horizontal[1], + invalid_rdc, PICK_MODE_RD); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; + } + break; + case PARTITION_VERT: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_VERT, subsize, &pc_tree->vertical[0], invalid_rdc, + PICK_MODE_RD); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_col + hbs < mi_params->mi_cols) { + RD_STATS tmp_rdc; + const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0]; + av1_init_rd_stats(&tmp_rdc); + update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, + NULL); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc, + PARTITION_VERT, subsize, + &pc_tree->vertical[bsize > BLOCK_8X8], invalid_rdc, + PICK_MODE_RD); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; + } + break; + case PARTITION_SPLIT: + if (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 && + none_rdc.rate < INT_MAX && none_rdc.skip == 1) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate = 0; + last_part_rdc.dist = 0; + last_part_rdc.rdcost = 0; + for (i = 0; i < 4; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + int jj = i >> 1, ii = i & 0x01; + RD_STATS tmp_rdc; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + + av1_init_rd_stats(&tmp_rdc); + rd_use_partition(cpi, td, tile_data, + mib + jj * hbs * mi_params->mi_stride + ii * hbs, tp, + mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate, + &tmp_rdc.dist, i != 3, pc_tree->split[i]); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + 
last_part_rdc.dist += tmp_rdc.dist; + } + break; + case PARTITION_VERT_A: + case PARTITION_VERT_B: + case PARTITION_HORZ_A: + case PARTITION_HORZ_B: + case PARTITION_HORZ_4: + case PARTITION_VERT_4: + assert(0 && "Cannot handle extended partition types"); + default: assert(0); break; + } + + if (last_part_rdc.rate < INT_MAX) { + last_part_rdc.rate += x->partition_cost[pl][partition]; + last_part_rdc.rdcost = + RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist); + } + + if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION && + cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) && + partition != PARTITION_SPLIT && bsize > BLOCK_8X8 && + (mi_row + bs < mi_params->mi_rows || + mi_row + hbs == mi_params->mi_rows) && + (mi_col + bs < mi_params->mi_cols || + mi_col + hbs == mi_params->mi_cols)) { + BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + chosen_rdc.rate = 0; + chosen_rdc.dist = 0; + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + pc_tree->partitioning = PARTITION_SPLIT; + + // Split partition. + for (i = 0; i < 4; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + RD_STATS tmp_rdc; + + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + + save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + pc_tree->split[i]->partitioning = PARTITION_NONE; + pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc, + PARTITION_SPLIT, split_subsize, &pc_tree->split[i]->none, + invalid_rdc, PICK_MODE_RD); + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&chosen_rdc); + break; + } + + chosen_rdc.rate += tmp_rdc.rate; + chosen_rdc.dist += tmp_rdc.dist; + + if (i != 3) + encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, + OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL); + + chosen_rdc.rate += x->partition_cost[pl][PARTITION_NONE]; + } + if (chosen_rdc.rate < INT_MAX) { + chosen_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT]; + chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist); + } + } + + // If last_part is better set the partitioning to that. + if (last_part_rdc.rdcost < chosen_rdc.rdcost) { + mib[0]->sb_type = bsize; + if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition; + chosen_rdc = last_part_rdc; + } + // If none was better set the partitioning to that. + if (none_rdc.rdcost < chosen_rdc.rdcost) { + if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; + chosen_rdc = none_rdc; + } + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + // We must have chosen a partitioning and encoding or we'll fail later on. + // No other opportunities for success. 
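+  // Editorial note: the assertion below holds because every candidate that
+  // failed the search carries the INT_MAX/INT64_MAX sentinels from
+  // av1_invalid_rd_stats(), so a finite chosen_rdc implies a complete,
+  // encodable partitioning was found for the superblock.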
+  if (bsize == cm->seq_params.sb_size)
+    assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+  if (do_recon) {
+    if (bsize == cm->seq_params.sb_size) {
+      // NOTE: To get estimate for rate due to the tokens, use:
+      // int rate_coeffs = 0;
+      // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+      //           bsize, pc_tree, &rate_coeffs);
+      x->cb_offset = 0;
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+                pc_tree, NULL);
+    } else {
+      encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+                pc_tree, NULL);
+    }
+  }
+
+  *rate = chosen_rdc.rate;
+  *dist = chosen_rdc.dist;
+  x->rdmult = orig_rdmult;
+}
+
+static int is_leaf_split_partition(AV1_COMMON *cm, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  assert(bsize >= BLOCK_8X8);
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+  for (int i = 0; i < 4; i++) {
+    int x_idx = (i & 1) * hbs;
+    int y_idx = (i >> 1) * hbs;
+    if ((mi_row + y_idx >= cm->mi_params.mi_rows) ||
+        (mi_col + x_idx >= cm->mi_params.mi_cols))
+      return 0;
+    if (get_partition(cm, mi_row + y_idx, mi_col + x_idx, subsize) !=
+            PARTITION_NONE &&
+        subsize != BLOCK_8X8)
+      return 0;
+  }
+  return 1;
+}
+
+static AOM_INLINE int do_split_check(BLOCK_SIZE bsize) {
+  return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32);
+}
+
+static AOM_INLINE void nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
+                                           MB_MODE_INFO **mib, TOKENEXTRA **tp,
+                                           int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize,
+                                           PC_TREE *pc_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  // Only square blocks from 8x8 to 128x128 are supported
+  assert(bsize >= BLOCK_8X8 && bsize <= BLOCK_128X128);
+  const int bs = mi_size_wide[bsize];
+  const int hbs = bs / 2;
+  const PARTITION_TYPE partition =
+      (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+                           : PARTITION_NONE;
+  BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  assert(subsize <= BLOCK_LARGEST);
+  const int pl = (bsize >= BLOCK_8X8)
+                     ? partition_plane_context(xd, mi_row, mi_col, bsize)
+                     : 0;
+
+  RD_STATS dummy_cost;
+  av1_invalid_rd_stats(&dummy_cost);
+  RD_STATS invalid_rd;
+  av1_invalid_rd_stats(&invalid_rd);
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  pc_tree->partitioning = partition;
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  switch (partition) {
+    case PARTITION_NONE:
+      if (cpi->sf.rt_sf.nonrd_check_partition_split && do_split_check(bsize) &&
+          !frame_is_intra_only(cm)) {
+        RD_STATS split_rdc, none_rdc, block_rdc;
+        RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+
+        av1_init_rd_stats(&split_rdc);
+        av1_invalid_rd_stats(&none_rdc);
+
+        save_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+        subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
+                      PICK_MODE_NONRD);
+        none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
+        none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+        for (int i = 0; i < 4; i++) {
+          av1_invalid_rd_stats(&block_rdc);
+          const int x_idx = (i & 1) * hbs;
+          const int y_idx = (i >> 1) * hbs;
+          if (mi_row + y_idx >= mi_params->mi_rows ||
+              mi_col + x_idx >= mi_params->mi_cols)
+            continue;
+          xd->above_txfm_context =
+              cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx;
+          xd->left_txfm_context =
+              xd->left_txfm_context_buffer + ((mi_row + y_idx) & MAX_MIB_MASK);
+          pc_tree->split[i]->partitioning = PARTITION_NONE;
+          pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+                        &block_rdc, PARTITION_NONE, subsize,
+                        &pc_tree->split[i]->none, invalid_rd, PICK_MODE_NONRD);
+          split_rdc.rate += block_rdc.rate;
+          split_rdc.dist += block_rdc.dist;
+
+          encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1,
+                   subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+        }
+        split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
+        split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist);
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3);
+
+        if (none_rdc.rdcost < split_rdc.rdcost) {
+          mib[0]->sb_type = bsize;
+          pc_tree->partitioning = PARTITION_NONE;
+          encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+                   &pc_tree->none, NULL);
+        } else {
+          mib[0]->sb_type = subsize;
+          pc_tree->partitioning = PARTITION_SPLIT;
+          for (int i = 0; i < 4; i++) {
+            const int x_idx = (i & 1) * hbs;
+            const int y_idx = (i >> 1) * hbs;
+            if (mi_row + y_idx >= mi_params->mi_rows ||
+                mi_col + x_idx >= mi_params->mi_cols)
+              continue;
+
+            encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 0,
+                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+          }
+        }
+
+      } else {
+        pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                      PARTITION_NONE, bsize, &pc_tree->none, invalid_rd,
+                      PICK_MODE_NONRD);
+        encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition,
+                 &pc_tree->none, NULL);
+      }
+      break;
+    case PARTITION_VERT:
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost,
+                    PARTITION_VERT, subsize, &pc_tree->vertical[0], invalid_rd,
+                    PICK_MODE_NONRD);
+      encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize,
+               PARTITION_VERT, &pc_tree->vertical[0], NULL);
+      if (mi_col + hbs < mi_params->mi_cols && bsize > BLOCK_8X8) {
+
pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &dummy_cost, + PARTITION_VERT, subsize, &pc_tree->vertical[1], + invalid_rd, PICK_MODE_NONRD); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, 0, subsize, + PARTITION_VERT, &pc_tree->vertical[1], NULL); + } + break; + case PARTITION_HORZ: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &dummy_cost, + PARTITION_HORZ, subsize, &pc_tree->horizontal[0], + invalid_rd, PICK_MODE_NONRD); + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, subsize, + PARTITION_HORZ, &pc_tree->horizontal[0], NULL); + + if (mi_row + hbs < mi_params->mi_rows && bsize > BLOCK_8X8) { + pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &dummy_cost, + PARTITION_HORZ, subsize, &pc_tree->horizontal[1], + invalid_rd, PICK_MODE_NONRD); + encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, 0, subsize, + PARTITION_HORZ, &pc_tree->horizontal[1], NULL); + } + break; + case PARTITION_SPLIT: + if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode && + is_leaf_split_partition(cm, mi_row, mi_col, bsize) && + !frame_is_intra_only(cm) && bsize <= BLOCK_32X32) { + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_STATS split_rdc, none_rdc; + av1_invalid_rd_stats(&split_rdc); + av1_invalid_rd_stats(&none_rdc); + save_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + pc_tree->partitioning = PARTITION_NONE; + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, + PARTITION_NONE, bsize, &pc_tree->none, invalid_rd, + PICK_MODE_NONRD); + none_rdc.rate += x->partition_cost[pl][PARTITION_NONE]; + none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist); + restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + if (cpi->sf.rt_sf.nonrd_check_partition_merge_mode != 2 || + none_rdc.skip != 1 || pc_tree->none.mic.mode == NEWMV) { + av1_init_rd_stats(&split_rdc); + for (int i = 0; i < 4; i++) { + RD_STATS block_rdc; + av1_invalid_rd_stats(&block_rdc); + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col + x_idx; + xd->left_txfm_context = xd->left_txfm_context_buffer + + ((mi_row + y_idx) & MAX_MIB_MASK); + pc_tree->split[i]->partitioning = PARTITION_NONE; + pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, + &block_rdc, PARTITION_NONE, subsize, + &pc_tree->split[i]->none, invalid_rd, + PICK_MODE_NONRD); + split_rdc.rate += block_rdc.rate; + split_rdc.dist += block_rdc.dist; + + encode_b(cpi, tile_data, td, tp, mi_row + y_idx, mi_col + x_idx, 1, + subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL); + } + restore_context(x, &x_ctx, mi_row, mi_col, bsize, 3); + split_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT]; + split_rdc.rdcost = RDCOST(x->rdmult, split_rdc.rate, split_rdc.dist); + } + if (none_rdc.rdcost < split_rdc.rdcost) { + mib[0]->sb_type = bsize; + pc_tree->partitioning = PARTITION_NONE; + encode_b(cpi, tile_data, td, tp, mi_row, mi_col, 0, bsize, partition, + &pc_tree->none, NULL); + } else { + mib[0]->sb_type = subsize; + pc_tree->partitioning = PARTITION_SPLIT; + for (int i = 0; i < 4; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + if ((mi_row + y_idx >= mi_params->mi_rows) || + (mi_col + x_idx >= mi_params->mi_cols)) + continue; + + encode_b(cpi, tile_data, td, 
tp, mi_row + y_idx, mi_col + x_idx, 0,
+                     subsize, PARTITION_NONE, &pc_tree->split[i]->none, NULL);
+          }
+        }
+      } else {
+        for (int i = 0; i < 4; i++) {
+          int x_idx = (i & 1) * hbs;
+          int y_idx = (i >> 1) * hbs;
+          int jj = i >> 1, ii = i & 0x01;
+          if ((mi_row + y_idx >= mi_params->mi_rows) ||
+              (mi_col + x_idx >= mi_params->mi_cols))
+            continue;
+          nonrd_use_partition(cpi, td, tile_data,
+                              mib + jj * hbs * mi_params->mi_stride + ii * hbs,
+                              tp, mi_row + y_idx, mi_col + x_idx, subsize,
+                              pc_tree->split[i]);
+        }
+      }
+      break;
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+    case PARTITION_HORZ_4:
+    case PARTITION_VERT_4:
+      assert(0 && "Cannot handle extended partition types");
+    default: assert(0); break;
+  }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static const FIRSTPASS_STATS *read_one_frame_stats(const TWO_PASS *p, int frm) {
+  assert(frm >= 0);
+  if (frm < 0 ||
+      p->stats_buf_ctx->stats_in_start + frm > p->stats_buf_ctx->stats_in_end) {
+    return NULL;
+  }
+
+  return &p->stats_buf_ctx->stats_in_start[frm];
+}
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+static int active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_params.mi_rows;
+  int is_active_h_edge = 0;
+
+  // For two-pass, account for any formatting bars detected.
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+        &cpi->twopass, cm->current_frame.display_order_hint);
+    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    top_edge += (int)(this_frame_stats->inactive_zone_rows * 4);
+
+    bottom_edge -= (int)(this_frame_stats->inactive_zone_rows * 4);
+    bottom_edge = AOMMAX(top_edge, bottom_edge);
+  }
+
+  if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+    is_active_h_edge = 1;
+  }
+  return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_params.mi_cols;
+  int is_active_v_edge = 0;
+
+  // For two-pass, account for any formatting bars detected.
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    const AV1_COMMON *const cm = &cpi->common;
+    const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
+        &cpi->twopass, cm->current_frame.display_order_hint);
+    if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB column.
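+    // (The first-pass inactive zone is reported in 16x16 MB units; one MB
+    // spans four 4-sample mi units, hence the factor of 4 below.)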
+ left_edge += (int)(this_frame_stats->inactive_zone_cols * 4); + + right_edge -= (int)(this_frame_stats->inactive_zone_cols * 4); + right_edge = AOMMAX(left_edge, right_edge); + } + + if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) || + ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) { + is_active_v_edge = 1; + } + return is_active_v_edge; +} +#endif // !CONFIG_REALTIME_ONLY + +static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { + memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); +} + +static INLINE void load_pred_mv(MACROBLOCK *x, + const PICK_MODE_CONTEXT *const ctx) { + memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); +} + +#if !CONFIG_REALTIME_ONLY +// Try searching for an encoding for the given subblock. Returns zero if the +// rdcost is already too high (to tell the caller not to bother searching for +// encodings of further subblocks) +static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, int is_last, + int mi_row, int mi_col, BLOCK_SIZE subsize, + RD_STATS best_rdcost, RD_STATS *sum_rdc, + PARTITION_TYPE partition, + PICK_MODE_CONTEXT *prev_ctx, + PICK_MODE_CONTEXT *this_ctx) { + MACROBLOCK *const x = &td->mb; + const int orig_mult = x->rdmult; + setup_block_rdmult(cpi, x, mi_row, mi_col, subsize, NO_AQ, NULL); + + av1_rd_cost_update(x->rdmult, &best_rdcost); + if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, prev_ctx); + + RD_STATS rdcost_remaining; + av1_rd_stats_subtraction(x->rdmult, &best_rdcost, sum_rdc, &rdcost_remaining); + RD_STATS this_rdc; + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, partition, + subsize, this_ctx, rdcost_remaining, PICK_MODE_RD); + + if (this_rdc.rate == INT_MAX) { + sum_rdc->rdcost = INT64_MAX; + } else { + sum_rdc->rate += this_rdc.rate; + sum_rdc->dist += this_rdc.dist; + av1_rd_cost_update(x->rdmult, sum_rdc); + } + + if (sum_rdc->rdcost >= best_rdcost.rdcost) { + x->rdmult = orig_mult; + return 0; + } + + if (!is_last) { + update_state(cpi, td, this_ctx, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL); + } + + x->rdmult = orig_mult; + return 1; +} + +static bool rd_test_partition3(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + PC_TREE *pc_tree, RD_STATS *best_rdc, + PICK_MODE_CONTEXT ctxs[3], + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize, PARTITION_TYPE partition, + int mi_row0, int mi_col0, BLOCK_SIZE subsize0, + int mi_row1, int mi_col1, BLOCK_SIZE subsize1, + int mi_row2, int mi_col2, BLOCK_SIZE subsize2) { + const MACROBLOCK *const x = &td->mb; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + RD_STATS sum_rdc; + av1_init_rd_stats(&sum_rdc); + sum_rdc.rate = x->partition_cost[pl][partition]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row0, mi_col0, subsize0, + *best_rdc, &sum_rdc, partition, ctx, &ctxs[0])) + return false; + + if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row1, mi_col1, subsize1, + *best_rdc, &sum_rdc, partition, &ctxs[0], &ctxs[1])) + return false; + + if (!rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2, + *best_rdc, &sum_rdc, partition, &ctxs[1], &ctxs[2])) + return false; + + av1_rd_cost_update(x->rdmult, &sum_rdc); + if (sum_rdc.rdcost >= best_rdc->rdcost) return false; + sum_rdc.rdcost = RDCOST(x->rdmult, 
sum_rdc.rate, sum_rdc.dist);
+  if (sum_rdc.rdcost >= best_rdc->rdcost) return false;
+
+  *best_rdc = sum_rdc;
+  pc_tree->partitioning = partition;
+  return true;
+}
+
+static AOM_INLINE void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+  pc_tree->partitioning = PARTITION_NONE;
+  pc_tree->none.rd_stats.skip = 0;
+
+  if (bsize >= BLOCK_8X8) {
+    BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+    for (int idx = 0; idx < 4; ++idx)
+      reset_partition(pc_tree->split[idx], subsize);
+  }
+}
+
+// Record the ref frames that have been selected by square partition blocks.
+static AOM_INLINE void update_picked_ref_frames_mask(MACROBLOCK *const x,
+                                                     int ref_type,
+                                                     BLOCK_SIZE bsize,
+                                                     int mib_size, int mi_row,
+                                                     int mi_col) {
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  const int sb_size_mask = mib_size - 1;
+  const int mi_row_in_sb = mi_row & sb_size_mask;
+  const int mi_col_in_sb = mi_col & sb_size_mask;
+  const int mi_size = mi_size_wide[bsize];
+  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) {
+    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) {
+      x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type;
+    }
+  }
+}
+
+// Structure to keep win flags for HORZ and VERT partition evaluations.
+typedef struct {
+  bool horz_win;
+  bool vert_win;
+} RD_RECT_PART_WIN_INFO;
+
+// Decide whether to evaluate the AB partition implied by rect_part and the
+// given split indices, based on the results of the split and HORZ/VERT
+// evaluations.
+int evaluate_ab_partition_based_on_split(
+    PC_TREE *pc_tree, PARTITION_TYPE rect_part,
+    RD_RECT_PART_WIN_INFO *rect_part_win_info, int qindex, int split_idx1,
+    int split_idx2) {
+  int num_win = 0;
+  // Threshold for the number of winners.
+  // Conservative pruning for high quantizers.
+  const int num_win_thresh = AOMMIN(3 * (2 * (MAXQ - qindex) / MAXQ), 3);
+  bool sub_part_win = (rect_part_win_info == NULL)
+                          ? (pc_tree->partitioning == rect_part)
+                          : (rect_part == PARTITION_HORZ)
+                                ? rect_part_win_info->horz_win
+                                : rect_part_win_info->vert_win;
+  num_win += (sub_part_win) ? 1 : 0;
+  num_win +=
+      (pc_tree->split[split_idx1]->partitioning == PARTITION_NONE) ? 1 : 0;
+  num_win +=
+      (pc_tree->split[split_idx2]->partitioning == PARTITION_NONE) ? 1 : 0;
+  if (num_win < num_win_thresh) {
+    return 0;
+  }
+  return 1;
+}
+
+// Searches for the best partition pattern for a block based on the
+// rate-distortion cost, and returns a bool value to indicate whether a valid
+// partition pattern is found. The partition can recursively go down to
+// the smallest block size.
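+// The candidates are evaluated in the order PARTITION_NONE, PARTITION_SPLIT
+// (recursively), the rectangular partitions (HORZ/VERT), the AB partitions
+// (HORZ_A/B, VERT_A/B) and the 1:4 partitions (HORZ_4/VERT_4), with speed
+// features optionally pruning candidates or terminating the search between
+// stages.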
+// +// Inputs: +// cpi: the global compressor setting +// td: thread data +// tile_data: tile data +// tp: the pointer to the start token +// mi_row: row coordinate of the block in a step size of MI_SIZE +// mi_col: column coordinate of the block in a step size of MI_SIZE +// bsize: block size +// max_sq_part: the largest square block size for prediction blocks +// min_sq_part: the smallest square block size for prediction blocks +// rd_cost: the pointer to the final rd cost of the current block +// best_rdc: the upper bound of rd cost for a valid partition +// pc_tree: the pointer to the PC_TREE node storing the picked partitions +// and mode info for the current block +// none_rd: the pointer to the rd cost in the case of not splitting the +// current block +// multi_pass_mode: SB_SINGLE_PASS/SB_DRY_PASS/SB_WET_PASS +// rect_part_win_info: the pointer to a struct storing whether horz/vert +// partition outperforms previously tested partitions +// +// Output: +// a bool value indicating whether a valid partition is found +static bool rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + BLOCK_SIZE max_sq_part, BLOCK_SIZE min_sq_part, + RD_STATS *rd_cost, RD_STATS best_rdc, + PC_TREE *pc_tree, int64_t *none_rd, + SB_MULTI_PASS_MODE multi_pass_mode, + RD_RECT_PART_WIN_INFO *rect_part_win_info) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int num_planes = av1_num_planes(cm); + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_step = mi_size_wide[bsize] / 2; + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + const TOKENEXTRA *const tp_orig = *tp; + PICK_MODE_CONTEXT *ctx_none = &pc_tree->none; + int tmp_partition_cost[PARTITION_TYPES]; + BLOCK_SIZE subsize; + RD_STATS this_rdc, sum_rdc; + const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8); + int do_square_split = bsize_at_least_8x8; + const int pl = bsize_at_least_8x8 + ? 
partition_plane_context(xd, mi_row, mi_col, bsize) + : 0; + const int *partition_cost = x->partition_cost[pl]; + + int do_rectangular_split = cpi->oxcf.enable_rect_partitions; + int64_t cur_none_rd = 0; + int64_t split_rd[4] = { 0, 0, 0, 0 }; + int64_t horz_rd[2] = { 0, 0 }; + int64_t vert_rd[2] = { 0, 0 }; + int prune_horz = 0; + int prune_vert = 0; + int terminate_partition_search = 0; + + int split_ctx_is_ready[2] = { 0, 0 }; + int horz_ctx_is_ready = 0; + int vert_ctx_is_ready = 0; + BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT); + // Initialise HORZ and VERT win flags as true for all split partitions + RD_RECT_PART_WIN_INFO split_part_rect_win[4] = { + { true, true }, { true, true }, { true, true }, { true, true } + }; + + bool found_best_partition = false; + if (best_rdc.rdcost < 0) { + av1_invalid_rd_stats(rd_cost); + return found_best_partition; + } + + if (frame_is_intra_only(cm) && bsize == BLOCK_64X64) { + x->quad_tree_idx = 0; + x->cnn_output_valid = 0; + } + + if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0; + + // Override skipping rectangular partition operations for edge blocks + const int has_rows = (mi_row + mi_step < mi_params->mi_rows); + const int has_cols = (mi_col + mi_step < mi_params->mi_cols); + const int xss = x->e_mbd.plane[1].subsampling_x; + const int yss = x->e_mbd.plane[1].subsampling_y; + + if (none_rd) *none_rd = 0; + int partition_none_allowed = has_rows && has_cols; + int partition_horz_allowed = + has_cols && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions && + get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), xss, + yss) != BLOCK_INVALID; + int partition_vert_allowed = + has_rows && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions && + get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), xss, + yss) != BLOCK_INVALID; + + (void)*tp_orig; + +#if CONFIG_COLLECT_PARTITION_STATS + int partition_decisions[EXT_PARTITION_TYPES] = { 0 }; + int partition_attempts[EXT_PARTITION_TYPES] = { 0 }; + int64_t partition_times[EXT_PARTITION_TYPES] = { 0 }; + struct aom_usec_timer partition_timer = { 0 }; + int partition_timer_on = 0; +#if CONFIG_COLLECT_PARTITION_STATS == 2 + PartitionStats *part_stats = &cpi->partition_stats; +#endif +#endif + + // Override partition costs at the edges of the frame in the same + // way as in read_partition (see decodeframe.c) + if (!(has_rows && has_cols)) { + assert(bsize_at_least_8x8 && pl >= 0); + const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl]; + const int max_cost = av1_cost_symbol(0); + for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = max_cost; + if (has_cols) { + // At the bottom, the two possibilities are HORZ and SPLIT + aom_cdf_prob bot_cdf[2]; + partition_gather_vert_alike(bot_cdf, partition_cdf, bsize); + static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map); + } else if (has_rows) { + // At the right, the two possibilities are VERT and SPLIT + aom_cdf_prob rhs_cdf[2]; + partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize); + static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT }; + av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map); + } else { + // At the bottom right, we always split + tmp_partition_cost[PARTITION_SPLIT] = 0; + } + + partition_cost = tmp_partition_cost; + } + +#ifndef NDEBUG + // Nothing should rely on the default value of this array (which is just + // leftover from 
encoding the previous block). Set it to a fixed pattern
+  // when debugging.
+  // bit 0, 1, 2 are blk_skip of each plane
+  // bit 4, 5, 6 are initialization checking of each plane
+  memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
+#endif  // NDEBUG
+
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+  av1_init_rd_stats(&this_rdc);
+
+  set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+  // Save rdmult before it might be changed, so it can be restored later.
+  const int orig_rdmult = x->rdmult;
+  setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+
+  av1_rd_cost_update(x->rdmult, &best_rdc);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+    x->mb_energy = av1_log_block_var(cpi, x, bsize);
+
+  if (bsize > cpi->sf.part_sf.use_square_partition_only_threshold) {
+    partition_horz_allowed &= !has_rows;
+    partition_vert_allowed &= !has_cols;
+  }
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+  const int try_intra_cnn_split =
+      !cpi->is_screen_content_type && frame_is_intra_only(cm) &&
+      cpi->sf.part_sf.intra_cnn_split &&
+      cm->seq_params.sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 &&
+      bsize >= BLOCK_8X8 &&
+      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
+      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols;
+
+  if (try_intra_cnn_split) {
+    av1_intra_mode_cnn_partition(
+        &cpi->common, x, bsize, x->quad_tree_idx, &partition_none_allowed,
+        &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split,
+        &do_square_split);
+  }
+
+  // Use simple_motion_search to prune partitions. This must be done prior to
+  // PARTITION_SPLIT to propagate the initial mvs to a smaller blocksize.
+  const int try_split_only =
+      !cpi->is_screen_content_type &&
+      cpi->sf.part_sf.simple_motion_search_split && do_square_split &&
+      bsize >= BLOCK_8X8 &&
+      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
+      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols &&
+      !frame_is_intra_only(cm) && !av1_superres_scaled(cm);
+
+  if (try_split_only) {
+    av1_simple_motion_search_based_split(
+        cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_none_allowed,
+        &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split,
+        &do_square_split);
+  }
+
+  const int try_prune_rect =
+      !cpi->is_screen_content_type &&
+      cpi->sf.part_sf.simple_motion_search_prune_rect &&
+      !frame_is_intra_only(cm) && do_rectangular_split &&
+      (do_square_split || partition_none_allowed ||
+       (prune_horz && prune_vert)) &&
+      (partition_horz_allowed || partition_vert_allowed) && bsize >= BLOCK_8X8;
+
+  if (try_prune_rect) {
+    av1_simple_motion_search_prune_rect(
+        cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_horz_allowed,
+        &partition_vert_allowed, &prune_horz, &prune_vert);
+  }
+
+  // Max and min square partition levels are defined as the partition nodes that
+  // the recursive function rd_pick_partition() can reach. To implement this:
+  // only PARTITION_NONE is allowed if the current node equals min_sq_part,
+  // only PARTITION_SPLIT is allowed if the current node exceeds max_sq_part.
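+  // For example, with min_sq_part == BLOCK_8X8 and max_sq_part == BLOCK_64X64
+  // a 128x128 node is forced to split, while an 8x8 node may only use
+  // PARTITION_NONE (unless it straddles the picture boundary, in which case
+  // the split flag is inherited below).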
+  assert(block_size_wide[min_sq_part] == block_size_high[min_sq_part]);
+  assert(block_size_wide[max_sq_part] == block_size_high[max_sq_part]);
+  assert(min_sq_part <= max_sq_part);
+  assert(block_size_wide[bsize] == block_size_high[bsize]);
+  const int max_partition_size = block_size_wide[max_sq_part];
+  const int min_partition_size = block_size_wide[min_sq_part];
+  const int blksize = block_size_wide[bsize];
+  assert(min_partition_size <= max_partition_size);
+  const int is_le_min_sq_part = blksize <= min_partition_size;
+  const int is_gt_max_sq_part = blksize > max_partition_size;
+  if (is_gt_max_sq_part) {
+    // If the current block size is larger than max, only allow split.
+    partition_none_allowed = 0;
+    partition_horz_allowed = 0;
+    partition_vert_allowed = 0;
+    do_square_split = 1;
+  } else if (is_le_min_sq_part) {
+    // If the current block size is less than or equal to min, only allow none
+    // when the block is valid (fully inside the picture); only allow split
+    // otherwise.
+    partition_horz_allowed = 0;
+    partition_vert_allowed = 0;
+    // Only disable square split when the current block is not at the picture
+    // boundary; otherwise, inherit the square split flag from the previous
+    // logic.
+    if (has_rows && has_cols) do_square_split = 0;
+    partition_none_allowed = !do_square_split;
+  }
+
+BEGIN_PARTITION_SEARCH:
+  if (x->must_find_valid_partition) {
+    do_square_split = bsize_at_least_8x8 && (blksize > min_partition_size);
+    partition_none_allowed =
+        has_rows && has_cols && (blksize >= min_partition_size);
+    partition_horz_allowed =
+        has_cols && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
+        (blksize > min_partition_size) &&
+        get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ), xss,
+                             yss) != BLOCK_INVALID;
+    partition_vert_allowed =
+        has_rows && bsize_at_least_8x8 && cpi->oxcf.enable_rect_partitions &&
+        (blksize > min_partition_size) &&
+        get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT), xss,
+                             yss) != BLOCK_INVALID;
+    terminate_partition_search = 0;
+  }
+
+  // Partition block source pixel variance.
+  unsigned int pb_source_variance = UINT_MAX;
+
+  // Partition block SSE after simple motion compensation; not in use now,
+  // but will be used for upcoming speed features.
+  unsigned int pb_simple_motion_pred_sse = UINT_MAX;
+  (void)pb_simple_motion_pred_sse;
+
+  // PARTITION_NONE
+  if (is_le_min_sq_part && has_rows && has_cols) partition_none_allowed = 1;
+  assert(terminate_partition_search == 0);
+  int64_t part_none_rd = INT64_MAX;
+  if (cpi->is_screen_content_type)
+    partition_none_allowed = has_rows && has_cols;
+  if (partition_none_allowed && !is_gt_max_sq_part) {
+    int pt_cost = 0;
+    if (bsize_at_least_8x8) {
+      pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+                    ? partition_cost[PARTITION_NONE]
+                    : 0;
+    }
+    RD_STATS partition_rdcost;
+    av1_init_rd_stats(&partition_rdcost);
+    partition_rdcost.rate = pt_cost;
+    av1_rd_cost_update(x->rdmult, &partition_rdcost);
+    RD_STATS best_remain_rdcost;
+    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &partition_rdcost,
+                             &best_remain_rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_remain_rdcost.rdcost >= 0) {
+      partition_attempts[PARTITION_NONE] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE,
+                  bsize, ctx_none, best_remain_rdcost, PICK_MODE_RD);
+    av1_rd_cost_update(x->rdmult, &this_rdc);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_NONE] += time;
+      partition_timer_on = 0;
+    }
+#endif
+    pb_source_variance = x->source_variance;
+    pb_simple_motion_pred_sse = x->simple_motion_pred_sse;
+    if (none_rd) *none_rd = this_rdc.rdcost;
+    cur_none_rd = this_rdc.rdcost;
+    if (this_rdc.rate != INT_MAX) {
+      if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) {
+        const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame);
+        update_picked_ref_frames_mask(x, ref_type, bsize,
+                                      cm->seq_params.mib_size, mi_row, mi_col);
+      }
+      if (bsize_at_least_8x8) {
+        this_rdc.rate += pt_cost;
+        this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+      }
+
+      part_none_rd = this_rdc.rdcost;
+      if (this_rdc.rdcost < best_rdc.rdcost) {
+        // Adjust the dist breakout threshold according to the partition size.
+        const int64_t dist_breakout_thr =
+            cpi->sf.part_sf.partition_search_breakout_dist_thr >>
+            ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+             (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
+        const int rate_breakout_thr =
+            cpi->sf.part_sf.partition_search_breakout_rate_thr *
+            num_pels_log2_lookup[bsize];
+
+        best_rdc = this_rdc;
+        found_best_partition = true;
+        if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
+
+        if (!frame_is_intra_only(cm) &&
+            (do_square_split || do_rectangular_split) &&
+            !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
+          const int use_ml_based_breakout =
+              bsize <= cpi->sf.part_sf.use_square_partition_only_threshold &&
+              bsize > BLOCK_4X4 && xd->bd == 8;
+          if (use_ml_based_breakout) {
+            if (av1_ml_predict_breakout(cpi, bsize, x, &this_rdc,
+                                        pb_source_variance)) {
+              do_square_split = 0;
+              do_rectangular_split = 0;
+            }
+          }
+
+          // If all y, u, v transform blocks in this partition are skippable,
+          // and the dist & rate are within the thresholds, the partition
+          // search is terminated for the current branch of the partition
+          // search tree. The dist & rate thresholds are set to 0 at speed 0
+          // to disable the early termination at that speed.
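+          // (The shift applied to dist_breakout_thr above scales it with
+          // block area: assuming MAX_SB_SIZE_LOG2 == 7, a 128x128 block keeps
+          // the full threshold and each quartering of the area shifts it
+          // right by two more bits, down to a shift of 8 for an 8x8 block.)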
+ if (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr) { + do_square_split = 0; + do_rectangular_split = 0; + } + } + + if (cpi->sf.part_sf.simple_motion_search_early_term_none && + cm->show_frame && !frame_is_intra_only(cm) && + bsize >= BLOCK_16X16 && mi_row + mi_step < mi_params->mi_rows && + mi_col + mi_step < mi_params->mi_cols && + this_rdc.rdcost < INT64_MAX && this_rdc.rdcost >= 0 && + this_rdc.rate < INT_MAX && this_rdc.rate >= 0 && + (do_square_split || do_rectangular_split)) { + av1_simple_motion_search_early_term_none(cpi, x, pc_tree, mi_row, + mi_col, bsize, &this_rdc, + &terminate_partition_search); + } + } + } + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + // store estimated motion vector + if (cpi->sf.mv_sf.adaptive_motion_search) store_pred_mv(x, ctx_none); + + // PARTITION_SPLIT + int64_t part_split_rd = INT64_MAX; + if ((!terminate_partition_search && do_square_split) || is_gt_max_sq_part) { + av1_init_rd_stats(&sum_rdc); + subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + sum_rdc.rate = partition_cost[PARTITION_SPLIT]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); + + int idx; +#if CONFIG_COLLECT_PARTITION_STATS + if (best_rdc.rdcost - sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_SPLIT] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } +#endif + for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) { + const int x_idx = (idx & 1) * mi_step; + const int y_idx = (idx >> 1) * mi_step; + + if (mi_row + y_idx >= mi_params->mi_rows || + mi_col + x_idx >= mi_params->mi_cols) + continue; + + if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none); + + pc_tree->split[idx]->index = idx; + int64_t *p_split_rd = &split_rd[idx]; + + RD_STATS best_remain_rdcost; + av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc, + &best_remain_rdcost); + + int curr_quad_tree_idx = 0; + if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { + curr_quad_tree_idx = x->quad_tree_idx; + x->quad_tree_idx = 4 * curr_quad_tree_idx + idx + 1; + } + if (!rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, + mi_col + x_idx, subsize, max_sq_part, min_sq_part, + &this_rdc, best_remain_rdcost, pc_tree->split[idx], + p_split_rd, multi_pass_mode, + &split_part_rect_win[idx])) { + av1_invalid_rd_stats(&sum_rdc); + break; + } + if (frame_is_intra_only(cm) && bsize <= BLOCK_64X64) { + x->quad_tree_idx = curr_quad_tree_idx; + } + + sum_rdc.rate += this_rdc.rate; + sum_rdc.dist += this_rdc.dist; + av1_rd_cost_update(x->rdmult, &sum_rdc); + if (idx <= 1 && (bsize <= BLOCK_8X8 || + pc_tree->split[idx]->partitioning == PARTITION_NONE)) { + const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic; + const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + // Neither palette mode nor cfl predicted + if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) { + if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1; + } + } + } +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_SPLIT] += time; + partition_timer_on = 0; + } +#endif + const int reached_last_index = (idx == 4); + + part_split_rd = sum_rdc.rdcost; + if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + 
found_best_partition = true;
+        pc_tree->partitioning = PARTITION_SPLIT;
+      }
+    } else if (cpi->sf.part_sf.less_rectangular_check_level > 0) {
+      // Skip the rectangular partition test when partition type none gives
+      // better rd than partition type split.
+      if (cpi->sf.part_sf.less_rectangular_check_level == 2 || idx <= 2) {
+        const int partition_none_valid = cur_none_rd > 0;
+        const int partition_none_better = cur_none_rd < sum_rdc.rdcost;
+        do_rectangular_split &=
+            !(partition_none_valid && partition_none_better);
+      }
+    }
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  }  // if (do_split)
+
+  if (cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+      !frame_is_intra_only(cm) && !terminate_partition_search &&
+      do_rectangular_split &&
+      (partition_horz_allowed || partition_vert_allowed)) {
+    av1_ml_early_term_after_split(cpi, x, pc_tree, bsize, best_rdc.rdcost,
+                                  part_none_rd, part_split_rd, split_rd, mi_row,
+                                  mi_col, &terminate_partition_search);
+  }
+
+  if (!cpi->sf.part_sf.ml_early_term_after_part_split_level &&
+      cpi->sf.part_sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
+      (partition_horz_allowed || partition_vert_allowed) &&
+      !(prune_horz || prune_vert) && !terminate_partition_search) {
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+    av1_ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
+                                split_rd, &prune_horz, &prune_vert);
+  }
+
+  // PARTITION_HORZ
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz_allowed));
+  if (!terminate_partition_search && partition_horz_allowed && !prune_horz &&
+      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) &&
+      !is_gt_max_sq_part) {
+    av1_init_rd_stats(&sum_rdc);
+    subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+    if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+    sum_rdc.rate = partition_cost[PARTITION_HORZ];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+    RD_STATS best_remain_rdcost;
+    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                             &best_remain_rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_remain_rdcost.rdcost >= 0) {
+      partition_attempts[PARTITION_HORZ] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_HORZ,
+                  subsize, &pc_tree->horizontal[0], best_remain_rdcost,
+                  PICK_MODE_RD);
+    av1_rd_cost_update(x->rdmult, &this_rdc);
+
+    if (this_rdc.rate == INT_MAX) {
+      sum_rdc.rdcost = INT64_MAX;
+    } else {
+      sum_rdc.rate += this_rdc.rate;
+      sum_rdc.dist += this_rdc.dist;
+      av1_rd_cost_update(x->rdmult, &sum_rdc);
+    }
+    horz_rd[0] = this_rdc.rdcost;
+
+    if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) {
+      const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
+      const MB_MODE_INFO *const mbmi = &pc_tree->horizontal[0].mic;
+      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      // Neither palette mode nor cfl predicted.
+      if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+        if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1;
+      }
+      update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
+
+      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
+
+      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                               &best_remain_rdcost);
+
+      pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+                    PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+                    best_remain_rdcost, PICK_MODE_RD);
+      av1_rd_cost_update(x->rdmult, &this_rdc);
+      horz_rd[1] = this_rdc.rdcost;
+
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        av1_rd_cost_update(x->rdmult, &sum_rdc);
+      }
+    }
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_HORZ] += time;
+      partition_timer_on = 0;
+    }
+#endif
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+      if (sum_rdc.rdcost < best_rdc.rdcost) {
+        best_rdc = sum_rdc;
+        found_best_partition = true;
+        pc_tree->partitioning = PARTITION_HORZ;
+      }
+    } else {
+      // Update the HORZ win flag.
+      if (rect_part_win_info != NULL) {
+        rect_part_win_info->horz_win = false;
+      }
+    }
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  }
+
+  // PARTITION_VERT
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert_allowed));
+  if (!terminate_partition_search && partition_vert_allowed && !prune_vert &&
+      (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step)) &&
+      !is_gt_max_sq_part) {
+    av1_init_rd_stats(&sum_rdc);
+    subsize = get_partition_subsize(bsize, PARTITION_VERT);
+
+    if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+    sum_rdc.rate = partition_cost[PARTITION_VERT];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+    RD_STATS best_remain_rdcost;
+    av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                             &best_remain_rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_remain_rdcost.rdcost >= 0) {
+      partition_attempts[PARTITION_VERT] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_VERT,
+                  subsize, &pc_tree->vertical[0], best_remain_rdcost,
+                  PICK_MODE_RD);
+    av1_rd_cost_update(x->rdmult, &this_rdc);
+
+    if (this_rdc.rate == INT_MAX) {
+      sum_rdc.rdcost = INT64_MAX;
+    } else {
+      sum_rdc.rate += this_rdc.rate;
+      sum_rdc.dist += this_rdc.dist;
+      av1_rd_cost_update(x->rdmult, &sum_rdc);
+    }
+    vert_rd[0] = this_rdc.rdcost;
+    if (sum_rdc.rdcost < best_rdc.rdcost && has_cols) {
+      const MB_MODE_INFO *const mbmi = &pc_tree->vertical[0].mic;
+      const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+      // Neither palette mode nor cfl predicted.
+      if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+        if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1;
+      }
+      update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
+      encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, subsize, NULL);
+
+      if (cpi->sf.mv_sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+      av1_rd_stats_subtraction(x->rdmult, &best_rdc, &sum_rdc,
+                               &best_remain_rdcost);
+      pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+                    PARTITION_VERT, subsize, &pc_tree->vertical[1],
+                    best_remain_rdcost, PICK_MODE_RD);
+      av1_rd_cost_update(x->rdmult, &this_rdc);
+      vert_rd[1] = this_rdc.rdcost;
+
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        av1_rd_cost_update(x->rdmult, &sum_rdc);
+      }
+    }
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_VERT] += time;
+
partition_timer_on = 0; + } +#endif + + av1_rd_cost_update(x->rdmult, &sum_rdc); + if (sum_rdc.rdcost < best_rdc.rdcost) { + best_rdc = sum_rdc; + found_best_partition = true; + pc_tree->partitioning = PARTITION_VERT; + } else { + // Update VERT win flag + if (rect_part_win_info != NULL) { + rect_part_win_info->vert_win = false; + } + } + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + if (pb_source_variance == UINT_MAX) { + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); + if (is_cur_buf_hbd(xd)) { + pb_source_variance = av1_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + pb_source_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } + } + + if (use_pb_simple_motion_pred_sse(cpi) && + pb_simple_motion_pred_sse == UINT_MAX) { + const FULLPEL_MV start_mv = kZeroFullMv; + unsigned int var = 0; + + av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, start_mv, 0, + &pb_simple_motion_pred_sse, &var); + } + + assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !do_rectangular_split)); + + const int ext_partition_allowed = + do_rectangular_split && + bsize > cpi->sf.part_sf.ext_partition_eval_thresh && has_rows && has_cols; + + // The standard AB partitions are allowed whenever ext-partition-types are + // allowed + int horzab_partition_allowed = + ext_partition_allowed & cpi->oxcf.enable_ab_partitions; + int vertab_partition_allowed = + ext_partition_allowed & cpi->oxcf.enable_ab_partitions; + + if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 1) { + // TODO(debargha,huisu@google.com): may need to tune the threshold for + // pb_source_variance. + horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + (pc_tree->partitioning == PARTITION_NONE && + pb_source_variance < 32) || + pc_tree->partitioning == PARTITION_SPLIT); + vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || + (pc_tree->partitioning == PARTITION_NONE && + pb_source_variance < 32) || + pc_tree->partitioning == PARTITION_SPLIT); + } else { + horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ || + pc_tree->partitioning == PARTITION_SPLIT); + vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT || + pc_tree->partitioning == PARTITION_SPLIT); + } + horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0); + horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0); + vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0); + vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0); + split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0); + split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0); + split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0); + split_rd[3] = (split_rd[3] < INT64_MAX ? 
split_rd[3] : 0); + } + int horza_partition_allowed = horzab_partition_allowed; + int horzb_partition_allowed = horzab_partition_allowed; + if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1]; + const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3]; + switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + case 1: + horza_partition_allowed &= (horz_a_rd / 16 * 14 < best_rdc.rdcost); + horzb_partition_allowed &= (horz_b_rd / 16 * 14 < best_rdc.rdcost); + break; + case 2: + default: + horza_partition_allowed &= (horz_a_rd / 16 * 15 < best_rdc.rdcost); + horzb_partition_allowed &= (horz_b_rd / 16 * 15 < best_rdc.rdcost); + break; + } + } + + int verta_partition_allowed = vertab_partition_allowed; + int vertb_partition_allowed = vertab_partition_allowed; + if (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2]; + const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3]; + switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) { + case 1: + verta_partition_allowed &= (vert_a_rd / 16 * 14 < best_rdc.rdcost); + vertb_partition_allowed &= (vert_b_rd / 16 * 14 < best_rdc.rdcost); + break; + case 2: + default: + verta_partition_allowed &= (vert_a_rd / 16 * 15 < best_rdc.rdcost); + vertb_partition_allowed &= (vert_b_rd / 16 * 15 < best_rdc.rdcost); + break; + } + } + + if (cpi->sf.part_sf.ml_prune_ab_partition && ext_partition_allowed && + partition_horz_allowed && partition_vert_allowed) { + // TODO(huisu@google.com): x->source_variance may not be the current + // block's variance. The correct one to use is pb_source_variance. Need to + // re-train the model to fix it. + av1_ml_prune_ab_partition( + bsize, pc_tree->partitioning, get_unsigned_bits(x->source_variance), + best_rdc.rdcost, horz_rd, vert_rd, split_rd, &horza_partition_allowed, + &horzb_partition_allowed, &verta_partition_allowed, + &vertb_partition_allowed); + } + + horza_partition_allowed &= cpi->oxcf.enable_ab_partitions; + horzb_partition_allowed &= cpi->oxcf.enable_ab_partitions; + verta_partition_allowed &= cpi->oxcf.enable_ab_partitions; + vertb_partition_allowed &= cpi->oxcf.enable_ab_partitions; + + if (cpi->sf.part_sf.prune_ab_partition_using_split_info && + horza_partition_allowed) { + horza_partition_allowed &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1); + } + + // PARTITION_HORZ_A + if (!terminate_partition_search && partition_horz_allowed && + horza_partition_allowed && !is_gt_max_sq_part) { + subsize = get_partition_subsize(bsize, PARTITION_HORZ_A); + pc_tree->horizontala[0].rd_mode_is_ready = 0; + pc_tree->horizontala[1].rd_mode_is_ready = 0; + pc_tree->horizontala[2].rd_mode_is_ready = 0; + if (split_ctx_is_ready[0]) { + av1_copy_tree_context(&pc_tree->horizontala[0], &pc_tree->split[0]->none); + pc_tree->horizontala[0].mic.partition = PARTITION_HORZ_A; + pc_tree->horizontala[0].rd_mode_is_ready = 1; + if (split_ctx_is_ready[1]) { + av1_copy_tree_context(&pc_tree->horizontala[1], + &pc_tree->split[1]->none); + pc_tree->horizontala[1].mic.partition = PARTITION_HORZ_A; + pc_tree->horizontala[1].rd_mode_is_ready = 1; + } + } +#if CONFIG_COLLECT_PARTITION_STATS + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_A]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (best_rdc.rdcost - 
tmp_sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_HORZ_A] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } + } +#endif + found_best_partition |= rd_test_partition3( + cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala, + ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A, mi_row, mi_col, + bsize2, mi_row, mi_col + mi_step, bsize2, mi_row + mi_step, mi_col, + subsize); +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_HORZ_A] += time; + partition_timer_on = 0; + } +#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + if (cpi->sf.part_sf.prune_ab_partition_using_split_info && + horzb_partition_allowed) { + horzb_partition_allowed &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3); + } + + // PARTITION_HORZ_B + if (!terminate_partition_search && partition_horz_allowed && + horzb_partition_allowed && !is_gt_max_sq_part) { + subsize = get_partition_subsize(bsize, PARTITION_HORZ_B); + pc_tree->horizontalb[0].rd_mode_is_ready = 0; + pc_tree->horizontalb[1].rd_mode_is_ready = 0; + pc_tree->horizontalb[2].rd_mode_is_ready = 0; + if (horz_ctx_is_ready) { + av1_copy_tree_context(&pc_tree->horizontalb[0], &pc_tree->horizontal[0]); + pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B; + pc_tree->horizontalb[0].rd_mode_is_ready = 1; + } +#if CONFIG_COLLECT_PARTITION_STATS + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_B]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_HORZ_B] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } + } +#endif + found_best_partition |= rd_test_partition3( + cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb, + ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_B, mi_row, mi_col, + subsize, mi_row + mi_step, mi_col, bsize2, mi_row + mi_step, + mi_col + mi_step, bsize2); + +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_HORZ_B] += time; + partition_timer_on = 0; + } +#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + if (cpi->sf.part_sf.prune_ab_partition_using_split_info && + verta_partition_allowed) { + verta_partition_allowed &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2); + } + + // PARTITION_VERT_A + if (!terminate_partition_search && partition_vert_allowed && + verta_partition_allowed && !is_gt_max_sq_part) { + subsize = get_partition_subsize(bsize, PARTITION_VERT_A); + pc_tree->verticala[0].rd_mode_is_ready = 0; + pc_tree->verticala[1].rd_mode_is_ready = 0; + pc_tree->verticala[2].rd_mode_is_ready = 0; + if (split_ctx_is_ready[0]) { + av1_copy_tree_context(&pc_tree->verticala[0], &pc_tree->split[0]->none); + pc_tree->verticala[0].mic.partition = PARTITION_VERT_A; + pc_tree->verticala[0].rd_mode_is_ready = 1; + } +#if CONFIG_COLLECT_PARTITION_STATS + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_A]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (best_rdc.rdcost - 
tmp_sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_VERT_A] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } + } +#endif + found_best_partition |= rd_test_partition3( + cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala, + ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A, mi_row, mi_col, + bsize2, mi_row + mi_step, mi_col, bsize2, mi_row, mi_col + mi_step, + subsize); +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_VERT_A] += time; + partition_timer_on = 0; + } +#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + if (cpi->sf.part_sf.prune_ab_partition_using_split_info && + vertb_partition_allowed) { + vertb_partition_allowed &= evaluate_ab_partition_based_on_split( + pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3); + } + + // PARTITION_VERT_B + if (!terminate_partition_search && partition_vert_allowed && + vertb_partition_allowed && !is_gt_max_sq_part) { + subsize = get_partition_subsize(bsize, PARTITION_VERT_B); + pc_tree->verticalb[0].rd_mode_is_ready = 0; + pc_tree->verticalb[1].rd_mode_is_ready = 0; + pc_tree->verticalb[2].rd_mode_is_ready = 0; + if (vert_ctx_is_ready) { + av1_copy_tree_context(&pc_tree->verticalb[0], &pc_tree->vertical[0]); + pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B; + pc_tree->verticalb[0].rd_mode_is_ready = 1; + } +#if CONFIG_COLLECT_PARTITION_STATS + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_B]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (!frame_is_intra_only(cm) && + best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_VERT_B] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } + } +#endif + found_best_partition |= rd_test_partition3( + cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb, + ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B, mi_row, mi_col, + subsize, mi_row, mi_col + mi_step, bsize2, mi_row + mi_step, + mi_col + mi_step, bsize2); +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_VERT_B] += time; + partition_timer_on = 0; + } +#endif + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + } + + // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or + // PARTITION_VERT_4 for this block. This is almost the same as + // ext_partition_allowed, except that we don't allow 128x32 or 32x128 + // blocks, so we require that bsize is not BLOCK_128X128. 
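+  // The get_plane_block_size() checks below additionally rule out 1:4 shapes
+  // whose chroma block would be invalid under the active subsampling (e.g.
+  // with 4:2:0 sampling, a 16x4 luma block would map to 8x2 chroma, which is
+  // not a valid block size).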
+  const int partition4_allowed = cpi->oxcf.enable_1to4_partitions &&
+                                 ext_partition_allowed &&
+                                 bsize != BLOCK_128X128;
+
+  int partition_horz4_allowed =
+      partition4_allowed && partition_horz_allowed &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4), xss,
+                           yss) != BLOCK_INVALID;
+  int partition_vert4_allowed =
+      partition4_allowed && partition_vert_allowed &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4), xss,
+                           yss) != BLOCK_INVALID;
+  if (cpi->sf.part_sf.prune_ext_partition_types_search_level == 2) {
+    partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+                                pc_tree->partitioning == PARTITION_HORZ_A ||
+                                pc_tree->partitioning == PARTITION_HORZ_B ||
+                                pc_tree->partitioning == PARTITION_SPLIT ||
+                                pc_tree->partitioning == PARTITION_NONE);
+    partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+                                pc_tree->partitioning == PARTITION_VERT_A ||
+                                pc_tree->partitioning == PARTITION_VERT_B ||
+                                pc_tree->partitioning == PARTITION_SPLIT ||
+                                pc_tree->partitioning == PARTITION_NONE);
+  }
+  if (cpi->sf.part_sf.ml_prune_4_partition && partition4_allowed &&
+      partition_horz_allowed && partition_vert_allowed) {
+    av1_ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning,
+                             best_rdc.rdcost, horz_rd, vert_rd, split_rd,
+                             &partition_horz4_allowed, &partition_vert4_allowed,
+                             pb_source_variance, mi_row, mi_col);
+  }
+
+  if (blksize < (min_partition_size << 2)) {
+    partition_horz4_allowed = 0;
+    partition_vert4_allowed = 0;
+  }
+
+  if (cpi->sf.part_sf.prune_4_partition_using_split_info &&
+      (partition_horz4_allowed || partition_vert4_allowed)) {
+    // Count of child blocks in which HORZ or VERT partition has won.
+    int num_child_horz_win = 0, num_child_vert_win = 0;
+    for (int idx = 0; idx < 4; idx++) {
+      num_child_horz_win += (split_part_rect_win[idx].horz_win) ? 1 : 0;
+      num_child_vert_win += (split_part_rect_win[idx].vert_win) ? 1 : 0;
+    }
+
+    // Prune HORZ4/VERT4 partitions based on the number of HORZ/VERT winners
+    // of split partitions.
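+    // (num_win_thresh below ranges from 3 at qindex 0 down to 1 at
+    // qindex == MAXQ.)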
+    // Conservative pruning for high quantizers.
+    const int num_win_thresh = AOMMIN(3 * (MAXQ - x->qindex) / MAXQ + 1, 3);
+    if (num_child_horz_win < num_win_thresh) {
+      partition_horz4_allowed = 0;
+    }
+    if (num_child_vert_win < num_win_thresh) {
+      partition_vert4_allowed = 0;
+    }
+  }
+
+  // PARTITION_HORZ_4
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz4_allowed));
+  if (!terminate_partition_search && partition_horz4_allowed && has_rows &&
+      (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) &&
+      !is_gt_max_sq_part) {
+    av1_init_rd_stats(&sum_rdc);
+    const int quarter_step = mi_size_high[bsize] / 4;
+    PICK_MODE_CONTEXT *ctx_prev = ctx_none;
+
+    subsize = get_partition_subsize(bsize, PARTITION_HORZ_4);
+    sum_rdc.rate = partition_cost[PARTITION_HORZ_4];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+      partition_attempts[PARTITION_HORZ_4] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    for (int i = 0; i < 4; ++i) {
+      const int this_mi_row = mi_row + i * quarter_step;
+
+      if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
+
+      PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
+
+      ctx_this->rd_mode_is_ready = 0;
+      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row,
+                           mi_col, subsize, best_rdc, &sum_rdc,
+                           PARTITION_HORZ_4, ctx_prev, ctx_this)) {
+        av1_invalid_rd_stats(&sum_rdc);
+        break;
+      }
+
+      ctx_prev = ctx_this;
+    }
+
+    av1_rd_cost_update(x->rdmult, &sum_rdc);
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
+      found_best_partition = true;
+      pc_tree->partitioning = PARTITION_HORZ_4;
+    }
+
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_HORZ_4] += time;
+      partition_timer_on = 0;
+    }
+#endif
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  }
+
+  // PARTITION_VERT_4
+  assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert4_allowed));
+  if (!terminate_partition_search && partition_vert4_allowed && has_cols &&
+      (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step)) &&
+      !is_gt_max_sq_part) {
+    av1_init_rd_stats(&sum_rdc);
+    const int quarter_step = mi_size_wide[bsize] / 4;
+    PICK_MODE_CONTEXT *ctx_prev = ctx_none;
+
+    subsize = get_partition_subsize(bsize, PARTITION_VERT_4);
+    sum_rdc.rate = partition_cost[PARTITION_VERT_4];
+    sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+      partition_attempts[PARTITION_VERT_4] += 1;
+      aom_usec_timer_start(&partition_timer);
+      partition_timer_on = 1;
+    }
+#endif
+    for (int i = 0; i < 4; ++i) {
+      const int this_mi_col = mi_col + i * quarter_step;
+
+      if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
+
+      PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
+
+      ctx_this->rd_mode_is_ready = 0;
+      if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row,
+                           this_mi_col, subsize, best_rdc, &sum_rdc,
+                           PARTITION_VERT_4, ctx_prev, ctx_this)) {
+        av1_invalid_rd_stats(&sum_rdc);
+        break;
+      }
+
+      ctx_prev = ctx_this;
+    }
+
+    av1_rd_cost_update(x->rdmult, &sum_rdc);
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = sum_rdc;
+      found_best_partition = true;
+      pc_tree->partitioning = PARTITION_VERT_4;
+    }
+#if CONFIG_COLLECT_PARTITION_STATS
+    if (partition_timer_on) {
+      aom_usec_timer_mark(&partition_timer);
+      int64_t time = aom_usec_timer_elapsed(&partition_timer);
+      partition_times[PARTITION_VERT_4] += time;
+      partition_timer_on = 0;
+    }
+#endif
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+  }
+
+  if (bsize == cm->seq_params.sb_size && !found_best_partition) {
+    // Did not find a valid partition; go back and search again with fewer
+    // constraints on which partition types to search.
+    x->must_find_valid_partition = 1;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+    part_stats->partition_redo += 1;
+#endif
+    goto BEGIN_PARTITION_SEARCH;
+  }
+
+  *rd_cost = best_rdc;
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
+    partition_decisions[pc_tree->partitioning] += 1;
+  }
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 1
+  // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each
+  // prediction block.
+  FILE *f = fopen("data.csv", "a");
+  fprintf(f, "%d,%d,%d,", bsize, cm->show_frame, frame_is_intra_only(cm));
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%d,", partition_decisions[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%d,", partition_attempts[idx]);
+  }
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    fprintf(f, "%" PRId64 ",", partition_times[idx]);
+  }
+  fprintf(f, "\n");
+  fclose(f);
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+  // If CONFIG_COLLECT_PARTITION_STATS is 2, then we print out the stats for
+  // the whole clip, so we need to pass the information upstream to the
+  // encoder.
+  const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize);
+  int *agg_attempts = part_stats->partition_attempts[bsize_idx];
+  int *agg_decisions = part_stats->partition_decisions[bsize_idx];
+  int64_t *agg_times = part_stats->partition_times[bsize_idx];
+  for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+    agg_attempts[idx] += partition_attempts[idx];
+    agg_decisions[idx] += partition_decisions[idx];
+    agg_times[idx] += partition_times[idx];
+  }
+#endif
+
+  if (found_best_partition && pc_tree->index != 3) {
+    if (bsize == cm->seq_params.sb_size) {
+      const int emit_output = multi_pass_mode != SB_DRY_PASS;
+      const RUN_TYPE run_type = emit_output ?
OUTPUT_ENABLED : DRY_RUN_NORMAL; + + x->cb_offset = 0; + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize, + pc_tree, NULL); + } else { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + } + + if (bsize == cm->seq_params.sb_size) { + assert(best_rdc.rate < INT_MAX); + assert(best_rdc.dist < INT64_MAX); + } else { + assert(tp_orig == *tp); + } + + x->rdmult = orig_rdmult; + return found_best_partition; +} +#endif // !CONFIG_REALTIME_ONLY +#undef NUM_SIMPLE_MOTION_FEATURES + +#if !CONFIG_REALTIME_ONLY + +static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int analysis_type, + int mi_row, int mi_col, int orig_rdmult) { + AV1_COMMON *const cm = &cpi->common; + assert(IMPLIES(cpi->gf_group.size > 0, + cpi->gf_group.index < cpi->gf_group.size)); + const int tpl_idx = cpi->gf_group.index; + TplParams *const tpl_data = &cpi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + int tpl_stride = tpl_frame->stride; + int64_t intra_cost = 0; + int64_t mc_dep_cost = 0; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + if (tpl_frame->is_valid == 0) return orig_rdmult; + + if (!is_frame_tpl_eligible(cpi)) return orig_rdmult; + + if (cpi->gf_group.index >= MAX_LAG_BUFFERS) return orig_rdmult; + + int64_t mc_count = 0, mc_saved = 0; + int mi_count = 0; + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int step = 1 << block_mis_log2; + for (int row = mi_row; row < mi_row + mi_high; row += step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += step) { + if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue; + TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost += this_stats->recrf_dist << RDDIV_BITS; + mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; + mc_count += this_stats->mc_count; + mc_saved += this_stats->mc_saved; + mi_count++; + } + } + + aom_clear_system_state(); + + double beta = 1.0; + if (analysis_type == 0) { + if (mc_dep_cost > 0 && intra_cost > 0) { + const double r0 = cpi->rd.r0; + const double rk = (double)intra_cost / mc_dep_cost; + beta = (r0 / rk); + } + } else if (analysis_type == 1) { + const double mc_count_base = (mi_count * cpi->rd.mc_count_base); + beta = (mc_count + 1.0) / (mc_count_base + 1.0); + beta = pow(beta, 0.5); + } else if (analysis_type == 2) { + const double mc_saved_base = (mi_count * cpi->rd.mc_saved_base); + beta = (mc_saved + 1.0) / (mc_saved_base + 1.0); + beta = pow(beta, 0.5); + } + + int rdmult = av1_get_adaptive_rdmult(cpi, beta); + + aom_clear_system_state(); + + rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2); + rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2); + + rdmult = AOMMAX(1, rdmult); + + return rdmult; +} + +static int get_tpl_stats_b(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int64_t *intra_cost_b, + int64_t *inter_cost_b, + int_mv mv_b[][INTER_REFS_PER_FRAME], int *stride) { + if (!cpi->oxcf.enable_tpl_model) return 0; + if (cpi->superres_mode != SUPERRES_NONE) return 0; + 
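// TPL stats are also unavailable for key frames and for overlay updates, + // so those frame types bail out below as well. + 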
if (cpi->common.current_frame.frame_type == KEY_FRAME) return 0; + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE) + return 0; + assert(IMPLIES(cpi->gf_group.size > 0, + cpi->gf_group.index < cpi->gf_group.size)); + + AV1_COMMON *const cm = &cpi->common; + const int gf_group_index = cpi->gf_group.index; + TplParams *const tpl_data = &cpi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + + if (tpl_frame->is_valid == 0) return 0; + if (gf_group_index >= MAX_LAG_BUFFERS) return 0; + + int mi_count = 0; + int count = 0; + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + // mi_cols_sr is mi_cols in the superres case. + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + // The TPL store unit size is not the same as the motion estimation unit + // size. Here we always use the motion estimation size to avoid picking up + // repeated inter/intra costs. + const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D); + const int step = mi_size_wide[tpl_bsize]; + assert(mi_size_wide[tpl_bsize] == mi_size_high[tpl_bsize]); + + // Stride is only based on the SB size, and we fill in values for every + // 16x16 block in an SB. + *stride = (mi_col_end_sr - mi_col_sr) / step; + + for (int row = mi_row; row < mi_row + mi_high; row += step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += step) { + // Handle partial SBs, so that no invalid values are used later.
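+ // Out-of-frame positions are filled with INT64_MAX costs and INVALID_MV + // markers below so they are never mistaken for real stats.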
+ if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) { + inter_cost_b[count] = INT64_MAX; + intra_cost_b[count] = INT64_MAX; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + mv_b[count][i].as_int = INVALID_MV; + } + count++; + continue; + } + + TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + inter_cost_b[count] = this_stats->inter_cost; + intra_cost_b[count] = this_stats->intra_cost; + memcpy(mv_b[count], this_stats->mv, sizeof(this_stats->mv)); + mi_count++; + count++; + } + } + + return mi_count; +} + +// analysis_type 0: Use mc_dep_cost and intra_cost +// analysis_type 1: Use count of best inter predictor chosen +// analysis_type 2: Use cost reduction from intra to inter for best inter +// predictor chosen +static int get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + assert(IMPLIES(cpi->gf_group.size > 0, + cpi->gf_group.index < cpi->gf_group.size)); + const int tpl_idx = cpi->gf_group.index; + TplParams *const tpl_data = &cpi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + int tpl_stride = tpl_frame->stride; + int64_t intra_cost = 0; + int64_t mc_dep_cost = 0; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + const int base_qindex = cm->quant_params.base_qindex; + + if (tpl_frame->is_valid == 0) return base_qindex; + + if (!is_frame_tpl_eligible(cpi)) return base_qindex; + + if (cpi->gf_group.index >= MAX_LAG_BUFFERS) return base_qindex; + + int64_t mc_count = 0, mc_saved = 0; + int mi_count = 0; + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + const int step = 1 << block_mis_log2; + for (int row = mi_row; row < mi_row + mi_high; row += step) { + for (int col = mi_col_sr; col < mi_col_end_sr; col += step) { + if (row >= cm->mi_params.mi_rows || col >= mi_cols_sr) continue; + TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost += this_stats->recrf_dist << RDDIV_BITS; + mc_dep_cost += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; + mc_count += this_stats->mc_count; + mc_saved += this_stats->mc_saved; + mi_count++; + } + } + + aom_clear_system_state(); + + int offset = 0; + double beta = 1.0; + if (mc_dep_cost > 0 && intra_cost > 0) { + const double r0 = cpi->rd.r0; + const double rk = (double)intra_cost / mc_dep_cost; + beta = (r0 / rk); + assert(beta > 0.0); + } + offset = av1_get_deltaq_offset(cpi, base_qindex, beta); + aom_clear_system_state(); + + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1); + offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1); + int qindex = cm->quant_params.base_qindex + offset; + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + + return qindex; +} + +static AOM_INLINE void setup_delta_q(AV1_COMP *const cpi, ThreadData *td, + MACROBLOCK *const x, + const TileInfo *const tile_info, + int mi_row, int mi_col, int num_planes) { + AV1_COMMON *const 
cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const DeltaQInfo *const delta_q_info = &cm->delta_q_info; + assert(delta_q_info->delta_q_present_flag); + + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + // Delta-q modulation based on variance + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size); + + int current_qindex = cm->quant_params.base_qindex; + if (cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL) { + if (DELTA_Q_PERCEPTUAL_MODULATION == 1) { + const int block_wavelet_energy_level = + av1_block_wavelet_energy_level(cpi, x, sb_size); + x->sb_energy_level = block_wavelet_energy_level; + current_qindex = av1_compute_q_from_energy_level_deltaq_mode( + cpi, block_wavelet_energy_level); + } else { + const int block_var_level = av1_log_block_var(cpi, x, sb_size); + x->sb_energy_level = block_var_level; + current_qindex = + av1_compute_q_from_energy_level_deltaq_mode(cpi, block_var_level); + } + } else if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE && + cpi->oxcf.enable_tpl_model) { + // Setup deltaq based on tpl stats + current_qindex = get_q_for_deltaq_objective(cpi, sb_size, mi_row, mi_col); + } + + const int delta_q_res = delta_q_info->delta_q_res; + // Right now aq only works with tpl model. So if tpl is disabled, we set the + // current_qindex to base_qindex. + if (cpi->oxcf.enable_tpl_model && cpi->oxcf.deltaq_mode != NO_DELTA_Q) { + current_qindex = + clamp(current_qindex, delta_q_res, 256 - delta_q_info->delta_q_res); + } else { + current_qindex = cm->quant_params.base_qindex; + } + + MACROBLOCKD *const xd = &x->e_mbd; + const int sign_deltaq_index = + current_qindex - xd->current_qindex >= 0 ? 1 : -1; + const int deltaq_deadzone = delta_q_res / 4; + const int qmask = ~(delta_q_res - 1); + int abs_deltaq_index = abs(current_qindex - xd->current_qindex); + abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask; + current_qindex = xd->current_qindex + sign_deltaq_index * abs_deltaq_index; + current_qindex = AOMMAX(current_qindex, MINQ + 1); + assert(current_qindex > 0); + + xd->delta_qindex = current_qindex - cm->quant_params.base_qindex; + set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + xd->mi[0]->current_qindex = current_qindex; + av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id); + + // keep track of any non-zero delta-q used + td->deltaq_used |= (xd->delta_qindex != 0); + + if (cpi->oxcf.deltalf_mode) { + const int delta_lf_res = delta_q_info->delta_lf_res; + const int lfmask = ~(delta_lf_res - 1); + const int delta_lf_from_base = + ((xd->delta_qindex / 2 + delta_lf_res / 2) & lfmask); + const int8_t delta_lf = + (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER); + const int frame_lf_count = + av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + const int mib_size = cm->seq_params.mib_size; + + // pre-set the delta lf for loop filter. 
Note that this value is set + // before mi is assigned for each block in the current superblock. + for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) { + for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) { + const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k); + mi_params->mi_grid_base[grid_idx]->delta_lf_from_base = delta_lf; + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) { + mi_params->mi_grid_base[grid_idx]->delta_lf[lf_id] = delta_lf; + } + } + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +#define AVG_CDF_WEIGHT_LEFT 3 +#define AVG_CDF_WEIGHT_TOP_RIGHT 1 + +static AOM_INLINE void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, + aom_cdf_prob *cdf_ptr_tr, int num_cdfs, + int cdf_stride, int nsymbs, int wt_left, + int wt_tr) { + for (int i = 0; i < num_cdfs; i++) { + for (int j = 0; j <= nsymbs; j++) { + cdf_ptr_left[i * cdf_stride + j] = + (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left + + (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr + + ((wt_left + wt_tr) / 2)) / + (wt_left + wt_tr)); + assert(cdf_ptr_left[i * cdf_stride + j] >= 0 && + cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP); + } + } +} + +#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \ + AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs)) + +#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride) \ + do { \ + aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left; \ + aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr; \ + int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob); \ + int num_cdfs = array_size / cdf_stride; \ + avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \ + wt_left, wt_tr); \ + } while (0) + +static AOM_INLINE void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, + int wt_left, int wt_tr) { + AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4); + for (int i = 0; i < 2; i++) { + AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf, + MV_CLASSES); + AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf, + nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE); + AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE); + AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf, + nmv_tr->comps[i].class0_hp_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf, + CLASS0_SIZE); + AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2); + } +} + +// In case of row-based multi-threading of the encoder, since we always +// keep a top-right sync, we can average the top-right SB's CDFs and +// the left SB's CDFs and use the result for the current SB's encoding to +// improve performance. This function performs the averaging of the CDFs +// and is used only when row-mt is enabled in the encoder.
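+// For example, with the weights defined above (AVG_CDF_WEIGHT_LEFT = 3, +// AVG_CDF_WEIGHT_TOP_RIGHT = 1), avg_cdf_symbol() maps each CDF entry to +// (3 * left + tr + 2) / 4, a rounded weighted mean biased toward the left SB.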
+static AOM_INLINE void avg_cdf_symbols(FRAME_CONTEXT *ctx_left, + FRAME_CONTEXT *ctx_tr, int wt_left, + int wt_tr) { + AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2); + AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2); + AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2); + AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5); + AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6); + AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7); + AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8); + AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9); + AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10); + AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11); + AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3); + AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4); + AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE); + AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2); + AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2); + AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2); + AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2); + AVERAGE_CDF(ctx_left->inter_compound_mode_cdf, + ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES); + AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf, + MASKED_COMPOUND_TYPES); + AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16); + AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2); + AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2); + AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf, + INTERINTRA_MODES); + AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES); + AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2); + AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf, + PALETTE_SIZES); + AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf, + PALETTE_SIZES); + for (int j = 0; j < PALETTE_SIZES; j++) { + int nsymbs = j + PALETTE_MIN_SIZE; + AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j], + ctx_tr->palette_y_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j], + ctx_tr->palette_uv_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + } + AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2); + AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2); + AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2); + AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2); + AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2); + AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2); + AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2); + AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2); + AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2); + AVERAGE_CDF(ctx_left->skip_cdfs, ctx_tr->skip_cdfs, 2); + AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2); + avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr); + avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr); + 
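// nmvc above holds the regular motion vector CDFs and ndvc the intra + // block copy DV CDFs; the intra-BC enable flag CDF is averaged next. + 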
AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2); + AVERAGE_CDF(ctx_left->seg.tree_cdf, ctx_tr->seg.tree_cdf, MAX_SEGMENTS); + AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2); + AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf, + ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS); + AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2); + AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf, + FILTER_INTRA_MODES); + AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES); + AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2); + AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2); + AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES); + AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0], + UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES)); + AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES); + for (int i = 0; i < PARTITION_CONTEXTS; i++) { + if (i < 4) { + AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4, + CDF_SIZE(10)); + } else if (i < 16) { + AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10); + } else { + AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8, + CDF_SIZE(10)); + } + } + AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf, + SWITCHABLE_FILTERS); + AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES); + AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf, + 2 * MAX_ANGLE_DELTA + 1); + AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH, + CDF_SIZE(MAX_TX_DEPTH + 1)); + AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1); + AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1); + for (int i = 0; i < FRAME_LF_COUNT; i++) { + AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i], + DELTA_LF_PROBS + 1); + } + AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2, + CDF_SIZE(TX_TYPES)); + AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS); + AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf, + CFL_ALPHABET_SIZE); +} + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col) { + const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size; + const int orig_rdmult = cpi->rd.RDMULT; + + assert(IMPLIES(cpi->gf_group.size > 0, + cpi->gf_group.index < cpi->gf_group.size)); + const int gf_group_index = cpi->gf_group.index; + if (cpi->oxcf.enable_tpl_model && cpi->oxcf.aq_mode == NO_AQ && + cpi->oxcf.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 && + cpi->gf_group.update_type[gf_group_index] == ARF_UPDATE) { + 
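// Analysis type 0 (the third argument below) derives the adjusted rdmult + // from the TPL ratio r0 / rk in get_rdmult_delta(). + 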
const int dr = + get_rdmult_delta(cpi, sb_size, 0, mi_row, mi_col, orig_rdmult); + x->rdmult = dr; + } +} +#endif + +static void source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int shift) { + unsigned int tmp_sse; + unsigned int tmp_variance; + const BLOCK_SIZE bsize = BLOCK_64X64; + uint8_t *src_y = cpi->source->y_buffer; + int src_ystride = cpi->source->y_stride; + uint8_t *last_src_y = cpi->last_source->y_buffer; + int last_src_ystride = cpi->last_source->y_stride; + uint64_t avg_source_sse_threshold = 100000; // ~5*5*(64*64) + uint64_t avg_source_sse_threshold_high = 1000000; // ~15*15*(64*64) + uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / (64*64)) ~1.5 +#if CONFIG_AV1_HIGHBITDEPTH + MACROBLOCKD *xd = &x->e_mbd; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) return; +#endif + src_y += shift; + last_src_y += shift; + tmp_variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y, + last_src_ystride, &tmp_sse); + // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12) + // Detect large lighting change. + if (tmp_variance < (tmp_sse >> 1) && (tmp_sse - tmp_variance) > sum_sq_thresh) + x->content_state_sb = kLowVarHighSumdiff; + else if (tmp_sse < avg_source_sse_threshold) + x->content_state_sb = kLowSad; + else if (tmp_sse > avg_source_sse_threshold_high) + x->content_state_sb = kHighSad; +} + +static AOM_INLINE void encode_nonrd_sb(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, + PC_TREE *const pc_root, TOKENEXTRA **tp, + const int mi_row, const int mi_col, + const int seg_skip) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + const SPEED_FEATURES *const sf = &cpi->sf; + const TileInfo *const tile_info = &tile_data->tile_info; + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + if (sf->rt_sf.source_metrics_sb_nonrd && sb_size == BLOCK_64X64 && + cpi->svc.number_spatial_layers <= 1 && + cm->current_frame.frame_type != KEY_FRAME) { + int shift = cpi->source->y_stride * (mi_row << 2) + (mi_col << 2); + source_content_sb(cpi, x, shift); + } + if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + const BLOCK_SIZE bsize = + seg_skip ? 
sb_size : sf->part_sf.always_this_block_size; + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + } else if (cpi->partition_search_skippable_frame) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + const BLOCK_SIZE bsize = + get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) { + set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size); + av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col); + } + assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip || + cpi->partition_search_skippable_frame || + sf->part_sf.partition_search_type == VAR_BASED_PARTITION); + td->mb.cb_offset = 0; + nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + pc_root); +} + +// Memset the mbmis in the current superblock to 0 +static INLINE void reset_mbmi(CommonModeInfoParams *const mi_params, + BLOCK_SIZE sb_size, int mi_row, int mi_col) { + // size of the sb in units of mi (BLOCK_4X4) + const int sb_size_mi = mi_size_wide[sb_size]; + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + // size of the sb in units of the allocated mi size + const int sb_size_alloc_mi = mi_size_wide[sb_size] / mi_alloc_size_1d; + assert(mi_params->mi_alloc_stride % sb_size_alloc_mi == 0 && + "mi is not allocated as a multiple of sb!"); + assert(mi_params->mi_stride % sb_size_mi == 0 && + "mi_grid_base is not allocated as a multiple of sb!"); + + const int mi_rows = mi_size_high[sb_size]; + for (int cur_mi_row = 0; cur_mi_row < mi_rows; cur_mi_row++) { + assert(get_mi_grid_idx(mi_params, 0, mi_col + mi_alloc_size_1d) < + mi_params->mi_stride); + const int mi_grid_idx = + get_mi_grid_idx(mi_params, mi_row + cur_mi_row, mi_col); + const int alloc_mi_idx = + get_alloc_mi_idx(mi_params, mi_row + cur_mi_row, mi_col); + memset(&mi_params->mi_grid_base[mi_grid_idx], 0, + sb_size_mi * sizeof(*mi_params->mi_grid_base)); + memset(&mi_params->tx_type_map[mi_grid_idx], 0, + sb_size_mi * sizeof(*mi_params->tx_type_map)); + if (cur_mi_row % mi_alloc_size_1d == 0) { + memset(&mi_params->mi_alloc[alloc_mi_idx], 0, + sb_size_alloc_mi * sizeof(*mi_params->mi_alloc)); + } + } +} + +static INLINE void backup_sb_state(SB_FIRST_PASS_STATS *sb_fp_stats, + const AV1_COMP *cpi, ThreadData *td, + const TileDataEnc *tile_data, int mi_row, + int mi_col) { + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const TileInfo *tile_info = &tile_data->tile_info; + + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + + xd->above_txfm_context = + cm->above_contexts.txfm[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + save_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes); + + sb_fp_stats->rd_count = cpi->td.rd_counts; + sb_fp_stats->split_count = cpi->td.mb.txb_split_count; + + sb_fp_stats->fc = *td->counts; + + memcpy(sb_fp_stats->inter_mode_rd_models, tile_data->inter_mode_rd_models, + sizeof(sb_fp_stats->inter_mode_rd_models)); + + memcpy(sb_fp_stats->thresh_freq_fact, x->thresh_freq_fact, + sizeof(sb_fp_stats->thresh_freq_fact)); + + const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); + sb_fp_stats->current_qindex = + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex; + +#if CONFIG_INTERNAL_STATS + 
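// Also snapshot the mode decision counters so a dry pass does not + // distort the internal statistics. + 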
memcpy(sb_fp_stats->mode_chosen_counts, cpi->mode_chosen_counts, + sizeof(sb_fp_stats->mode_chosen_counts)); +#endif // CONFIG_INTERNAL_STATS +} + +static INLINE void restore_sb_state(const SB_FIRST_PASS_STATS *sb_fp_stats, + AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, int mi_row, + int mi_col) { + MACROBLOCK *x = &td->mb; + + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + + restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size, num_planes); + + cpi->td.rd_counts = sb_fp_stats->rd_count; + cpi->td.mb.txb_split_count = sb_fp_stats->split_count; + + *td->counts = sb_fp_stats->fc; + + memcpy(tile_data->inter_mode_rd_models, sb_fp_stats->inter_mode_rd_models, + sizeof(sb_fp_stats->inter_mode_rd_models)); + memcpy(x->thresh_freq_fact, sb_fp_stats->thresh_freq_fact, + sizeof(sb_fp_stats->thresh_freq_fact)); + + const int alloc_mi_idx = get_alloc_mi_idx(&cm->mi_params, mi_row, mi_col); + cm->mi_params.mi_alloc[alloc_mi_idx].current_qindex = + sb_fp_stats->current_qindex; + +#if CONFIG_INTERNAL_STATS + memcpy(cpi->mode_chosen_counts, sb_fp_stats->mode_chosen_counts, + sizeof(sb_fp_stats->mode_chosen_counts)); +#endif // CONFIG_INTERNAL_STATS +} + +#if !CONFIG_REALTIME_ONLY +static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row, + int mi_col) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MACROBLOCK *x = &td->mb; + const int frame_idx = cpi->gf_group.index; + TplParams *const tpl_data = &cpi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + + av1_zero(x->search_ref_frame); + + if (tpl_frame->is_valid == 0) return; + if (!is_frame_tpl_eligible(cpi)) return; + if (frame_idx >= MAX_LAG_BUFFERS) return; + if (cpi->superres_mode != SUPERRES_NONE) return; + if (cpi->oxcf.aq_mode != NO_AQ) return; + + const int is_overlay = cpi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE; + if (is_overlay) { + memset(x->search_ref_frame, 1, sizeof(x->search_ref_frame)); + return; + } + + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 }; + const int step = 1 << block_mis_log2; + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const int mi_row_end = + AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows); + const int mi_col_end = + AOMMIN(mi_size_wide[sb_size] + mi_col, mi_params->mi_cols); + + for (int row = mi_row; row < mi_row_end; row += step) { + for (int col = mi_col; col < mi_col_end; col += step) { + const TplDepStats *this_stats = + &tpl_stats[av1_tpl_ptr_pos(row, col, tpl_stride, block_mis_log2)]; + int64_t tpl_pred_error[INTER_REFS_PER_FRAME] = { 0 }; + // Find the winner ref frame idx for the current block + int64_t best_inter_cost = this_stats->pred_error[0]; + int best_rf_idx = 0; + for (int idx = 1; idx < INTER_REFS_PER_FRAME; ++idx) { + if ((this_stats->pred_error[idx] < best_inter_cost) && + (this_stats->pred_error[idx] != 0)) { + best_inter_cost = this_stats->pred_error[idx]; + best_rf_idx = idx; + } + } + // tpl_pred_error is the pred_error reduction of best_ref w.r.t. + // LAST_FRAME. 
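+ // Since best_rf_idx has the smallest pred_error, the difference computed + // below is non-positive; more negative totals rank a reference higher.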
+ tpl_pred_error[best_rf_idx] = this_stats->pred_error[best_rf_idx] - + this_stats->pred_error[LAST_FRAME - 1]; + + for (int rf_idx = 1; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) + inter_cost[rf_idx] += tpl_pred_error[rf_idx]; + } + } + + int rank_index[INTER_REFS_PER_FRAME - 1]; + for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) { + rank_index[idx] = idx + 1; + for (int i = idx; i > 0; --i) { + if (inter_cost[rank_index[i - 1]] > inter_cost[rank_index[i]]) { + const int tmp = rank_index[i - 1]; + rank_index[i - 1] = rank_index[i]; + rank_index[i] = tmp; + } + } + } + + x->search_ref_frame[INTRA_FRAME] = 1; + x->search_ref_frame[LAST_FRAME] = 1; + + int cutoff_ref = 0; + for (int idx = 0; idx < INTER_REFS_PER_FRAME - 1; ++idx) { + x->search_ref_frame[rank_index[idx] + LAST_FRAME] = 1; + if (idx > 2) { + if (!cutoff_ref) { + // If the predictive coding gain is smaller than that of the previous, + // more relevant frame by a certain amount, discard this frame and all + // the frames after it. + if (llabs(inter_cost[rank_index[idx]]) < + llabs(inter_cost[rank_index[idx - 1]]) / 8 || + inter_cost[rank_index[idx]] == 0) + cutoff_ref = 1; + } + + if (cutoff_ref) x->search_ref_frame[rank_index[idx] + LAST_FRAME] = 0; + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +// This function initializes the stats for encode_rd_sb. +static INLINE void init_encode_rd_sb(AV1_COMP *cpi, ThreadData *td, + const TileDataEnc *tile_data, + PC_TREE *pc_root, RD_STATS *rd_cost, + int mi_row, int mi_col, + int gather_tpl_data) { + const AV1_COMMON *cm = &cpi->common; + const TileInfo *tile_info = &tile_data->tile_info; + MACROBLOCK *x = &td->mb; + + const SPEED_FEATURES *sf = &cpi->sf; + const int use_simple_motion_search = + (sf->part_sf.simple_motion_search_split || + sf->part_sf.simple_motion_search_prune_rect || + sf->part_sf.simple_motion_search_early_term_none || + sf->part_sf.ml_early_term_after_part_split_level) && + !frame_is_intra_only(cm); + if (use_simple_motion_search) { + init_simple_motion_search_mvs(pc_root); + } + +#if !CONFIG_REALTIME_ONLY + init_ref_frame_space(cpi, td, mi_row, mi_col); + x->sb_energy_level = 0; + x->cnn_output_valid = 0; + if (gather_tpl_data) { + if (cm->delta_q_info.delta_q_present_flag) { + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes); + av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col); + } + if (cpi->oxcf.enable_tpl_model) { + adjust_rdmult_tpl_model(cpi, x, mi_row, mi_col); + } + } +#else + (void)tile_info; + (void)mi_row; + (void)mi_col; + (void)gather_tpl_data; +#endif + + // Reset hash state for transform/mode rd hash information + reset_hash_records(x, cpi->sf.tx_sf.use_inter_txb_hash); + av1_zero(x->picked_ref_frames_mask); + av1_zero(x->pred_mv); + av1_invalid_rd_stats(rd_cost); +} + +static AOM_INLINE void encode_rd_sb(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, + PC_TREE *const pc_root, TOKENEXTRA **tp, + const int mi_row, const int mi_col, + const int seg_skip) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + const SPEED_FEATURES *const sf = &cpi->sf; + const TileInfo *const tile_info = &tile_data->tile_info; + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + + get_mi_grid_idx(&cm->mi_params, mi_row, mi_col); + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + int dummy_rate; + int64_t dummy_dist; + RD_STATS dummy_rdc; + +#if CONFIG_REALTIME_ONLY + (void)seg_skip; +#endif // CONFIG_REALTIME_ONLY + + 
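// The trailing 1 below asks init_encode_rd_sb() to gather TPL data for + // this superblock (delta-q setup and rdmult adjustment, where enabled). + 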
init_encode_rd_sb(cpi, td, tile_data, pc_root, &dummy_rdc, mi_row, mi_col, 1); + + if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) { + set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, sb_size); + av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + &dummy_rate, &dummy_dist, 1, pc_root); + } +#if !CONFIG_REALTIME_ONLY + else if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + const BLOCK_SIZE bsize = + seg_skip ? sb_size : sf->part_sf.always_this_block_size; + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + &dummy_rate, &dummy_dist, 1, pc_root); + } else if (cpi->partition_search_skippable_frame) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + const BLOCK_SIZE bsize = + get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col); + set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); + rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + &dummy_rate, &dummy_dist, 1, pc_root); + } else { + // No stats for overlay frames. Exclude key frame. + x->valid_cost_b = + get_tpl_stats_b(cpi, sb_size, mi_row, mi_col, x->intra_cost_b, + x->inter_cost_b, x->mv_b, &x->cost_stride); + + reset_partition(pc_root, sb_size); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_partition_time); +#endif + BLOCK_SIZE max_sq_size = x->max_partition_size; + BLOCK_SIZE min_sq_size = x->min_partition_size; + + if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) { + float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f }; + + av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features); + max_sq_size = AOMMAX( + AOMMIN(av1_predict_max_partition(cpi, x, features), max_sq_size), + min_sq_size); + } + + const int num_passes = cpi->oxcf.sb_multipass_unit_test ? 2 : 1; + + if (num_passes == 1) { + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc, + pc_root, NULL, SB_SINGLE_PASS, NULL); + } else { + // First pass + SB_FIRST_PASS_STATS sb_fp_stats; + backup_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col); + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc, + pc_root, NULL, SB_DRY_PASS, NULL); + + // Second pass + init_encode_rd_sb(cpi, td, tile_data, pc_root, &dummy_rdc, mi_row, mi_col, + 0); + reset_mbmi(&cm->mi_params, sb_size, mi_row, mi_col); + reset_partition(pc_root, sb_size); + + restore_sb_state(&sb_fp_stats, cpi, td, tile_data, mi_row, mi_col); + + rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, + max_sq_size, min_sq_size, &dummy_rdc, dummy_rdc, + pc_root, NULL, SB_WET_PASS, NULL); + } + // Reset to 0 so that it wouldn't be used elsewhere mistakenly. + x->valid_cost_b = 0; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_partition_time); +#endif + } +#endif // !CONFIG_REALTIME_ONLY + + // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile. 
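+ // Until then, the data fit below runs only when the frame has exactly one + // tile.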
+ if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && + cm->tiles.cols == 1 && cm->tiles.rows == 1) { + av1_inter_mode_data_fit(tile_data, x->rdmult); + } +} + +static AOM_INLINE void set_cost_upd_freq(AV1_COMP *cpi, ThreadData *td, + const TileInfo *const tile_info, + const int mi_row, const int mi_col) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + + switch (cpi->oxcf.coeff_cost_upd_freq) { + case COST_UPD_TILE: // Tile level + if (mi_row != tile_info->mi_row_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SBROW: // SB row level in tile + if (mi_col != tile_info->mi_col_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SB: // SB level + if (cpi->sf.inter_sf.disable_sb_level_coeff_cost_upd && + mi_col != tile_info->mi_col_start) + break; + av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes); + break; + default: assert(0); + } + + switch (cpi->oxcf.mode_cost_upd_freq) { + case COST_UPD_TILE: // Tile level + if (mi_row != tile_info->mi_row_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SBROW: // SB row level in tile + if (mi_col != tile_info->mi_col_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SB: // SB level + av1_fill_mode_rates(cm, x, xd->tile_ctx); + break; + default: assert(0); + } + switch (cpi->oxcf.mv_cost_upd_freq) { + case COST_UPD_OFF: break; + case COST_UPD_TILE: // Tile level + if (mi_row != tile_info->mi_row_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SBROW: // SB row level in tile + if (mi_col != tile_info->mi_col_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SB: // SB level + if (cpi->sf.inter_sf.disable_sb_level_mv_cost_upd && + mi_col != tile_info->mi_col_start) + break; + av1_fill_mv_costs(xd->tile_ctx, cm->features.cur_frame_force_integer_mv, + cm->features.allow_high_precision_mv, x); + break; + default: assert(0); + } +} + +static AOM_INLINE void encode_sb_row(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, int mi_row, + TOKENEXTRA **tp) { + AV1_COMMON *const cm = &cpi->common; + const TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_data->tile_info); + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + const int mib_size = cm->seq_params.mib_size; + const int mib_size_log2 = cm->seq_params.mib_size_log2; + const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2; + const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_time); +#endif + + // Initialize the left context for the new SB row + av1_zero_left_context(xd); + + // Reset delta for every tile + if (mi_row == tile_info->mi_row_start || cpi->row_mt) { + if (cm->delta_q_info.delta_q_present_flag) + xd->current_qindex = cm->quant_params.base_qindex; + if (cm->delta_q_info.delta_lf_present_flag) { + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + } + } + reset_thresh_freq_fact(x); + + // Code each SB in the row + for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0; + mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) { + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile); + if (tile_data->allow_update_cdf && (cpi->row_mt == 1) && + (tile_info->mi_row_start != mi_row)) { + if ((tile_info->mi_col_start == mi_col)) { + // restore frame 
context of 1st column sb + memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx)); + } else { + int wt_left = AVG_CDF_WEIGHT_LEFT; + int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT; + if (tile_info->mi_col_end > (mi_col + mib_size)) + avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile, wt_left, + wt_tr); + else + avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1, + wt_left, wt_tr); + } + } + + set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col); + + x->color_sensitivity[0] = 0; + x->color_sensitivity[1] = 0; + x->content_state_sb = 0; + + PC_TREE *const pc_root = td->pc_root; + pc_root->index = 0; + + xd->cur_frame_force_integer_mv = cm->features.cur_frame_force_integer_mv; + td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col); + x->source_variance = UINT_MAX; + x->simple_motion_pred_sse = UINT_MAX; + + const struct segmentation *const seg = &cm->seg; + int seg_skip = 0; + if (seg->enabled) { + const uint8_t *const map = + seg->update_map ? cpi->enc_seg.map : cm->last_frame_seg_map; + const int segment_id = + map ? get_segment_id(&cm->mi_params, map, sb_size, mi_row, mi_col) + : 0; + seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP); + } + + if (use_nonrd_mode) { + encode_nonrd_sb(cpi, td, tile_data, pc_root, tp, mi_row, mi_col, + seg_skip); + } else { + encode_rd_sb(cpi, td, tile_data, pc_root, tp, mi_row, mi_col, seg_skip); + } + + if (tile_data->allow_update_cdf && (cpi->row_mt == 1) && + (tile_info->mi_row_end > (mi_row + mib_size))) { + if (sb_cols_in_tile == 1) + memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx)); + else if (sb_col_in_tile >= 1) + memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx, + sizeof(*xd->tile_ctx)); + } + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, + sb_col_in_tile, sb_cols_in_tile); + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_time); +#endif +} + +static AOM_INLINE void init_encode_frame_mb_context(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + + // Copy data over into macro block data structures. 
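+ // Source planes are attached at mi position (0, 0); the block planes pick + // up the per-plane subsampling from the sequence parameters.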
+ av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, + cm->seq_params.sb_size); + + av1_setup_block_planes(xd, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, num_planes); +} + +void av1_alloc_tile_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + if (cpi->tile_data != NULL) aom_free(cpi->tile_data); + CHECK_MEM_ERROR( + cm, cpi->tile_data, + aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data))); + + cpi->allocated_tiles = tile_cols * tile_rows; +} + +void av1_init_tile_data(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tile_col, tile_row; + TOKENEXTRA *pre_tok = cpi->tile_tok[0][0]; + TOKENLIST *tplist = cpi->tplist[0][0]; + unsigned int tile_tok = 0; + int tplist_count = 0; + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileInfo *const tile_info = &tile_data->tile_info; + av1_tile_init(tile_info, cm, tile_row, tile_col); + + cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; + pre_tok = cpi->tile_tok[tile_row][tile_col]; + tile_tok = allocated_tokens( + *tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); + cpi->tplist[tile_row][tile_col] = tplist + tplist_count; + tplist = cpi->tplist[tile_row][tile_col]; + tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info); + tile_data->allow_update_cdf = !cm->tiles.large_scale; + tile_data->allow_update_cdf = + tile_data->allow_update_cdf && !cm->features.disable_cdf_update; + tile_data->tctx = *cm->fc; + } + } +} + +void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row, + int tile_col, int mi_row) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const int tile_cols = cm->tiles.cols; + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + TOKENEXTRA *tok = NULL; + const int sb_row_in_tile = + (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2; + const int tile_mb_cols = + (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2; + const int num_mb_rows_in_sb = + ((1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4; + + get_start_tok(cpi, tile_row, tile_col, mi_row, &tok, + cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok; + + encode_sb_row(cpi, td, this_tile, mi_row, &tok); + + cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok; + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count = + (unsigned int)(cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop - + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start); + + assert( + (unsigned int)(tok - + cpi->tplist[tile_row][tile_col][sb_row_in_tile].start) <= + get_token_alloc(num_mb_rows_in_sb, tile_mb_cols, + cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes)); + + (void)tile_mb_cols; + (void)num_mb_rows_in_sb; +} + +void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, + int tile_col) { + AV1_COMMON *const cm = &cpi->common; + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; + const TileInfo *const tile_info = &this_tile->tile_info; + + if 
(!cpi->sf.rt_sf.use_nonrd_pick_mode) av1_inter_mode_data_init(this_tile); + + av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start, + tile_info->mi_col_end, tile_row); + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, + &td->mb.e_mbd); + + if (cpi->oxcf.enable_cfl_intra) cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params); + + av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator); + + for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; + mi_row += cm->seq_params.mib_size) { + av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row); + } +} + +static AOM_INLINE void encode_tiles(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int tile_col, tile_row; + + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) + av1_alloc_tile_data(cpi); + + av1_init_tile_data(cpi); + + for (tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; + cpi->td.intrabc_used = 0; + cpi->td.deltaq_used = 0; + cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; + cpi->td.mb.tile_pb_ctx = &this_tile->tctx; + av1_encode_tile(cpi, &cpi->td, tile_row, tile_col); + cpi->intrabc_used |= cpi->td.intrabc_used; + cpi->deltaq_used |= cpi->td.deltaq_used; + } + } +} + +#define GLOBAL_TRANS_TYPES_ENC 3 // highest motion model to search +static int gm_get_params_cost(const WarpedMotionParams *gm, + const WarpedMotionParams *ref_gm, int allow_hp) { + int params_cost = 0; + int trans_bits, trans_prec_diff; + switch (gm->wmtype) { + case AFFINE: + case ROTZOOM: + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS), + (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF)); + if (gm->wmtype >= AFFINE) { + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF), + (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF)); + params_cost += aom_count_signed_primitive_refsubexpfin( + GM_ALPHA_MAX + 1, SUBEXPFIN_K, + (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - + (1 << GM_ALPHA_PREC_BITS), + (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS)); + } + AOM_FALLTHROUGH_INTENDED; + case TRANSLATION: + trans_bits = (gm->wmtype == TRANSLATION) + ? GM_ABS_TRANS_ONLY_BITS - !allow_hp + : GM_ABS_TRANS_BITS; + trans_prec_diff = (gm->wmtype == TRANSLATION) + ? 
GM_TRANS_ONLY_PREC_DIFF + !allow_hp + : GM_TRANS_PREC_DIFF; + params_cost += aom_count_signed_primitive_refsubexpfin( + (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_gm->wmmat[0] >> trans_prec_diff), + (gm->wmmat[0] >> trans_prec_diff)); + params_cost += aom_count_signed_primitive_refsubexpfin( + (1 << trans_bits) + 1, SUBEXPFIN_K, + (ref_gm->wmmat[1] >> trans_prec_diff), + (gm->wmmat[1] >> trans_prec_diff)); + AOM_FALLTHROUGH_INTENDED; + case IDENTITY: break; + default: assert(0); + } + return (params_cost << AV1_PROB_COST_SHIFT); +} + +static int do_gm_search_logic(SPEED_FEATURES *const sf, int frame) { + (void)frame; + switch (sf->gm_sf.gm_search_type) { + case GM_FULL_SEARCH: return 1; + case GM_REDUCED_REF_SEARCH_SKIP_L2_L3: + return !(frame == LAST2_FRAME || frame == LAST3_FRAME); + case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2: + return !(frame == LAST2_FRAME || frame == LAST3_FRAME || + (frame == ALTREF2_FRAME)); + case GM_DISABLE_SEARCH: return 0; + default: assert(0); + } + return 1; +} + +// Set the relative distance of a reference frame w.r.t. current frame +static AOM_INLINE void set_rel_frame_dist(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; + MV_REFERENCE_FRAME ref_frame; + int min_past_dist = INT32_MAX, min_future_dist = INT32_MAX; + cpi->nearest_past_ref = NONE_FRAME; + cpi->nearest_future_ref = NONE_FRAME; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + cpi->ref_relative_dist[ref_frame - LAST_FRAME] = 0; + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + int dist = av1_encoder_get_relative_dist( + order_hint_info, + cm->cur_frame->ref_display_order_hint[ref_frame - LAST_FRAME], + cm->current_frame.display_order_hint); + cpi->ref_relative_dist[ref_frame - LAST_FRAME] = dist; + // Get the nearest ref_frame in the past + if (abs(dist) < min_past_dist && dist < 0) { + cpi->nearest_past_ref = ref_frame; + min_past_dist = abs(dist); + } + // Get the nearest ref_frame in the future + if (dist < min_future_dist && dist > 0) { + cpi->nearest_future_ref = ref_frame; + min_future_dist = dist; + } + } + } +} + +static INLINE int refs_are_one_sided(const AV1_COMMON *cm) { + assert(!frame_is_intra_only(cm)); + + int one_sided_refs = 1; + for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); + if (buf == NULL) continue; + + const int ref_display_order_hint = buf->display_order_hint; + if (av1_encoder_get_relative_dist( + &cm->seq_params.order_hint_info, ref_display_order_hint, + (int)cm->current_frame.display_order_hint) > 0) { + one_sided_refs = 0; // bwd reference + break; + } + } + return one_sided_refs; +} + +static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm, + int ref_order_hint[2]) { + const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; + ref_order_hint[0] = ref_order_hint[1] = 0; + if (!skip_mode_info->skip_mode_allowed) return; + + const RefCntBuffer *const buf_0 = + get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_0); + const RefCntBuffer *const buf_1 = + get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_1); + assert(buf_0 != NULL && buf_1 != NULL); + + ref_order_hint[0] = buf_0->order_hint; + ref_order_hint[1] = buf_1->order_hint; +} + +static int check_skip_mode_enabled(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + + av1_setup_skip_mode_allowed(cm); + if 
(!cm->current_frame.skip_mode_info.skip_mode_allowed) return 0; + + // Turn off skip mode if the temporal distances of the reference pair to the + // current frame are different by more than 1 frame. + const int cur_offset = (int)cm->current_frame.order_hint; + int ref_offset[2]; + get_skip_mode_ref_offsets(cm, ref_offset); + const int cur_to_ref0 = get_relative_dist(&cm->seq_params.order_hint_info, + cur_offset, ref_offset[0]); + const int cur_to_ref1 = abs(get_relative_dist(&cm->seq_params.order_hint_info, + cur_offset, ref_offset[1])); + if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0; + + // High Latency: Turn off skip mode if all refs are fwd. + if (cpi->all_one_sided_refs && cpi->oxcf.lag_in_frames > 0) return 0; + + static const int flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + const int ref_frame[2] = { + cm->current_frame.skip_mode_info.ref_frame_idx_0 + LAST_FRAME, + cm->current_frame.skip_mode_info.ref_frame_idx_1 + LAST_FRAME + }; + if (!(cpi->ref_frame_flags & flag_list[ref_frame[0]]) || + !(cpi->ref_frame_flags & flag_list[ref_frame[1]])) + return 0; + + return 1; +} + +// Function to decide if we can skip the global motion parameter computation +// for a particular ref frame +static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) { + if ((ref_frame == LAST3_FRAME || ref_frame == LAST2_FRAME) && + cm->global_motion[GOLDEN_FRAME].wmtype != IDENTITY) { + return get_relative_dist( + &cm->seq_params.order_hint_info, + cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME], + cm->cur_frame->ref_order_hints[GOLDEN_FRAME - LAST_FRAME]) <= 0; + } + return 0; +} + +static AOM_INLINE void set_default_interp_skip_flags( + const AV1_COMMON *cm, InterpSearchFlags *interp_search_flags) { + const int num_planes = av1_num_planes(cm); + interp_search_flags->default_interp_skip_flags = + (num_planes == 1) ? 
INTERP_SKIP_LUMA_EVAL_CHROMA + : INTERP_SKIP_LUMA_SKIP_CHROMA; +} + +// TODO(Remya): Can include erroradv_prod_tr[] for threshold calculation +static INLINE int64_t calc_erroradv_threshold(AV1_COMP *cpi, + int64_t ref_frame_error) { + if (!cpi->sf.gm_sf.disable_adaptive_warp_error_thresh) + return (int64_t)( + ref_frame_error * erroradv_tr[cpi->sf.gm_sf.gm_erroradv_type] + 0.5); + else + return INT64_MAX; +} + +static void compute_global_motion_for_ref_frame( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, + int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer, + MotionModel *params_by_motion, uint8_t *segment_map, + const int segment_map_w, const int segment_map_h, + const WarpedMotionParams *ref_params) { + ThreadData *const td = &cpi->td; + MACROBLOCK *const x = &td->mb; + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + int i; + // clang-format off + static const double kIdentityParams[MAX_PARAMDIM - 1] = { + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0 + }; + // clang-format on + WarpedMotionParams tmp_wm_params; + const double *params_this_motion; + int inliers_by_motion[RANSAC_NUM_MOTIONS]; + assert(ref_buf[frame] != NULL); + if (*num_frm_corners < 0) { + // compute interest points using FAST features + *num_frm_corners = av1_fast_corner_detect( + frm_buffer, cpi->source->y_width, cpi->source->y_height, + cpi->source->y_stride, frm_corners, MAX_CORNERS); + } + TransformationType model; + + aom_clear_system_state(); + + // TODO(sarahparker, debargha): Explore do_adaptive_gm_estimation = 1 + const int do_adaptive_gm_estimation = 0; + + const int ref_frame_dist = get_relative_dist( + &cm->seq_params.order_hint_info, cm->current_frame.order_hint, + cm->cur_frame->ref_order_hints[frame - LAST_FRAME]); + const GlobalMotionEstimationType gm_estimation_type = + cm->seq_params.order_hint_info.enable_order_hint && + abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation + ? GLOBAL_MOTION_DISFLOW_BASED + : GLOBAL_MOTION_FEATURE_BASED; + for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) { + int64_t best_warp_error = INT64_MAX; + // Initially set all params to identity. 
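+ // Resetting num_inliers as well ensures that a stale candidate from the + // previous model type cannot be mistaken for a valid fit.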
+ for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) { + memcpy(params_by_motion[i].params, kIdentityParams, + (MAX_PARAMDIM - 1) * sizeof(*(params_by_motion[i].params))); + params_by_motion[i].num_inliers = 0; + } + + av1_compute_global_motion( + model, frm_buffer, cpi->source->y_width, cpi->source->y_height, + cpi->source->y_stride, frm_corners, *num_frm_corners, ref_buf[frame], + cpi->common.seq_params.bit_depth, gm_estimation_type, inliers_by_motion, + params_by_motion, RANSAC_NUM_MOTIONS); + int64_t ref_frame_error = 0; + for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) { + if (inliers_by_motion[i] == 0) continue; + + params_this_motion = params_by_motion[i].params; + av1_convert_model_to_params(params_this_motion, &tmp_wm_params); + + if (tmp_wm_params.wmtype != IDENTITY) { + av1_compute_feature_segmentation_map( + segment_map, segment_map_w, segment_map_h, + params_by_motion[i].inliers, params_by_motion[i].num_inliers); + + ref_frame_error = av1_segmented_frame_error( + is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer, + ref_buf[frame]->y_stride, cpi->source->y_buffer, + cpi->source->y_width, cpi->source->y_height, cpi->source->y_stride, + segment_map, segment_map_w); + + int64_t erroradv_threshold = + calc_erroradv_threshold(cpi, ref_frame_error); + + const int64_t warp_error = av1_refine_integerized_param( + &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), xd->bd, + ref_buf[frame]->y_buffer, ref_buf[frame]->y_width, + ref_buf[frame]->y_height, ref_buf[frame]->y_stride, + cpi->source->y_buffer, cpi->source->y_width, cpi->source->y_height, + cpi->source->y_stride, GM_REFINEMENT_COUNT, best_warp_error, + segment_map, segment_map_w, erroradv_threshold); + + if (warp_error < best_warp_error) { + best_warp_error = warp_error; + // Save the wm_params modified by + // av1_refine_integerized_param() rather than motion index to + // avoid rerunning refine() below. + memcpy(&(cm->global_motion[frame]), &tmp_wm_params, + sizeof(WarpedMotionParams)); + } + } + } + if (cm->global_motion[frame].wmtype <= AFFINE) + if (!av1_get_shear_params(&cm->global_motion[frame])) + cm->global_motion[frame] = default_warp_params; + + if (cm->global_motion[frame].wmtype == TRANSLATION) { + cm->global_motion[frame].wmmat[0] = + convert_to_trans_prec(cm->features.allow_high_precision_mv, + cm->global_motion[frame].wmmat[0]) * + GM_TRANS_ONLY_DECODE_FACTOR; + cm->global_motion[frame].wmmat[1] = + convert_to_trans_prec(cm->features.allow_high_precision_mv, + cm->global_motion[frame].wmmat[1]) * + GM_TRANS_ONLY_DECODE_FACTOR; + } + + if (cm->global_motion[frame].wmtype == IDENTITY) continue; + + if (ref_frame_error == 0) continue; + + // If the best error advantage found doesn't meet the threshold for + // this motion type, revert to IDENTITY. 
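+    // [Editorial note, not part of the original change] Illustration of the
+    // threshold test below, with hypothetical numbers: the ratio
+    // best_warp_error / ref_frame_error measures how much warping improves on
+    // the unwarped segmented error (smaller is better), traded off against
+    // gm_get_params_cost(). For example, if best_warp_error = 900 and
+    // ref_frame_error = 1000, the ratio 0.9 must clear the per-type criteria
+    // inside av1_is_enough_erroradvantage() or the model is reverted to
+    // default_warp_params.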
+ if (!av1_is_enough_erroradvantage( + (double)best_warp_error / ref_frame_error, + gm_get_params_cost(&cm->global_motion[frame], ref_params, + cm->features.allow_high_precision_mv), + cpi->sf.gm_sf.gm_erroradv_type)) { + cm->global_motion[frame] = default_warp_params; + } + + if (cm->global_motion[frame].wmtype != IDENTITY) break; + } + + aom_clear_system_state(); +} + +typedef struct { + int distance; + MV_REFERENCE_FRAME frame; +} FrameDistPair; + +static INLINE void update_valid_ref_frames_for_gm( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], + FrameDistPair *past_ref_frame, FrameDistPair *future_ref_frame, + int *num_past_ref_frames, int *num_future_ref_frames) { + AV1_COMMON *const cm = &cpi->common; + const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; + for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) { + const MV_REFERENCE_FRAME ref_frame[2] = { frame, NONE_FRAME }; + RefCntBuffer *buf = get_ref_frame_buf(cm, frame); + const int ref_disabled = + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]); + ref_buf[frame] = NULL; + cm->global_motion[frame] = default_warp_params; + // Skip global motion estimation for invalid ref frames + if (buf == NULL || + (ref_disabled && cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE)) { + cpi->gm_info.params_cost[frame] = 0; + continue; + } else { + ref_buf[frame] = &buf->buf; + } + + if (ref_buf[frame]->y_crop_width == cpi->source->y_crop_width && + ref_buf[frame]->y_crop_height == cpi->source->y_crop_height && + do_gm_search_logic(&cpi->sf, frame) && + !prune_ref_by_selective_ref_frame( + cpi, NULL, ref_frame, cm->cur_frame->ref_display_order_hint) && + !(cpi->sf.gm_sf.selective_ref_gm && skip_gm_frame(cm, frame))) { + assert(ref_buf[frame] != NULL); + int relative_frame_dist = av1_encoder_get_relative_dist( + order_hint_info, buf->display_order_hint, + cm->cur_frame->display_order_hint); + // Populate past and future ref frames + if (relative_frame_dist <= 0) { + past_ref_frame[*num_past_ref_frames].distance = + abs(relative_frame_dist); + past_ref_frame[*num_past_ref_frames].frame = frame; + (*num_past_ref_frames)++; + } else { + future_ref_frame[*num_future_ref_frames].distance = + abs(relative_frame_dist); + future_ref_frame[*num_future_ref_frames].frame = frame; + (*num_future_ref_frames)++; + } + } + } +} + +static INLINE void compute_gm_for_valid_ref_frames( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], int frame, + int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer, + MotionModel *params_by_motion, uint8_t *segment_map, + const int segment_map_w, const int segment_map_h) { + AV1_COMMON *const cm = &cpi->common; + GlobalMotionInfo *const gm_info = &cpi->gm_info; + const WarpedMotionParams *ref_params = + cm->prev_frame ? 
&cm->prev_frame->global_motion[frame] + : &default_warp_params; + + compute_global_motion_for_ref_frame( + cpi, ref_buf, frame, num_frm_corners, frm_corners, frm_buffer, + params_by_motion, segment_map, segment_map_w, segment_map_h, ref_params); + + gm_info->params_cost[frame] = + gm_get_params_cost(&cm->global_motion[frame], ref_params, + cm->features.allow_high_precision_mv) + + gm_info->type_cost[cm->global_motion[frame].wmtype] - + gm_info->type_cost[IDENTITY]; +} + +static int compare_distance(const void *a, const void *b) { + const int diff = + ((FrameDistPair *)a)->distance - ((FrameDistPair *)b)->distance; + if (diff > 0) + return 1; + else if (diff < 0) + return -1; + return 0; +} + +static INLINE void compute_global_motion_for_references( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES], + FrameDistPair reference_frame[REF_FRAMES - 1], int num_ref_frames, + int *num_frm_corners, int *frm_corners, unsigned char *frm_buffer, + MotionModel *params_by_motion, uint8_t *segment_map, + const int segment_map_w, const int segment_map_h) { + AV1_COMMON *const cm = &cpi->common; + // Compute global motion w.r.t. reference frames starting from the nearest ref + // frame in a given direction + for (int frame = 0; frame < num_ref_frames; frame++) { + int ref_frame = reference_frame[frame].frame; + compute_gm_for_valid_ref_frames(cpi, ref_buf, ref_frame, num_frm_corners, + frm_corners, frm_buffer, params_by_motion, + segment_map, segment_map_w, segment_map_h); + // If global motion w.r.t. current ref frame is + // INVALID/TRANSLATION/IDENTITY, skip the evaluation of global motion w.r.t + // the remaining ref frames in that direction. The below exit is disabled + // when ref frame distance w.r.t. current frame is zero. E.g.: + // source_alt_ref_frame w.r.t. ARF frames + if (cpi->sf.gm_sf.prune_ref_frame_for_gm_search && + reference_frame[frame].distance != 0 && + cm->global_motion[ref_frame].wmtype != ROTZOOM) + break; + } +} + +static AOM_INLINE void setup_prune_ref_frame_mask(AV1_COMP *cpi) { + if (!cpi->sf.rt_sf.use_nonrd_pick_mode && + cpi->sf.inter_sf.selective_ref_frame >= 2) { + AV1_COMMON *const cm = &cpi->common; + const OrderHintInfo *const order_hint_info = + &cm->seq_params.order_hint_info; + const int cur_frame_display_order_hint = + cm->current_frame.display_order_hint; + unsigned int *ref_display_order_hint = + cm->cur_frame->ref_display_order_hint; + const int arf2_dist = av1_encoder_get_relative_dist( + order_hint_info, ref_display_order_hint[ALTREF2_FRAME - LAST_FRAME], + cur_frame_display_order_hint); + const int bwd_dist = av1_encoder_get_relative_dist( + order_hint_info, ref_display_order_hint[BWDREF_FRAME - LAST_FRAME], + cur_frame_display_order_hint); + + for (int ref_idx = REF_FRAMES; ref_idx < MODE_CTX_REF_FRAMES; ++ref_idx) { + MV_REFERENCE_FRAME rf[2]; + av1_set_ref_frame(rf, ref_idx); + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) || + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]])) { + continue; + } + + if (!cpi->all_one_sided_refs) { + int ref_dist[2]; + for (int i = 0; i < 2; ++i) { + ref_dist[i] = av1_encoder_get_relative_dist( + order_hint_info, ref_display_order_hint[rf[i] - LAST_FRAME], + cur_frame_display_order_hint); + } + + // One-sided compound is used only when all reference frames are + // one-sided. 
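+        // [Editorial note, not part of the original change] The sign test
+        // below appears to read: ref_dist[i] > 0 means a future reference and
+        // ref_dist[i] <= 0 a past one, so equal signs mean both references
+        // lie on the same side of the current frame. E.g. ref_dist = { -2, -1 }
+        // (both past) gets the pair pruned, while ref_dist = { -1, +1 }
+        // (one past, one future) keeps it.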
+ if ((ref_dist[0] > 0) == (ref_dist[1] > 0)) { + cpi->prune_ref_frame_mask |= 1 << ref_idx; + } + } + + if (cpi->sf.inter_sf.selective_ref_frame >= 4 && + (rf[0] == ALTREF2_FRAME || rf[1] == ALTREF2_FRAME) && + (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME])) { + // Check if both ALTREF2_FRAME and BWDREF_FRAME are future references. + if (arf2_dist > 0 && bwd_dist > 0 && bwd_dist <= arf2_dist) { + // Drop ALTREF2_FRAME as a reference if BWDREF_FRAME is a closer + // reference to the current frame than ALTREF2_FRAME + cpi->prune_ref_frame_mask |= 1 << ref_idx; + } + } + } + } +} + +#define CHECK_PRECOMPUTED_REF_FRAME_MAP 0 + +static AOM_INLINE void encode_frame_internal(AV1_COMP *cpi) { + ThreadData *const td = &cpi->td; + MACROBLOCK *const x = &td->mb; + AV1_COMMON *const cm = &cpi->common; + CommonModeInfoParams *const mi_params = &cm->mi_params; + FeatureFlags *const features = &cm->features; + MACROBLOCKD *const xd = &x->e_mbd; + RD_COUNTS *const rdc = &cpi->td.rd_counts; + GlobalMotionInfo *const gm_info = &cpi->gm_info; + FrameProbInfo *const frame_probs = &cpi->frame_probs; + IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info; + int i; + + if (!cpi->sf.rt_sf.use_nonrd_pick_mode) { + mi_params->setup_mi(mi_params); + } + + set_mi_offsets(mi_params, xd, 0, 0); + +#if CONFIG_AV1_HIGHBITDEPTH + x->fwd_txfm4x4 = aom_fdct4x4; +#else + x->fwd_txfm4x4 = aom_fdct4x4_lp; +#endif + + av1_zero(*td->counts); + av1_zero(rdc->comp_pred_diff); + av1_zero(rdc->tx_type_used); + av1_zero(rdc->obmc_used); + av1_zero(rdc->warped_used); + + // Reset the flag. + cpi->intrabc_used = 0; + // Need to disable intrabc when superres is selected + if (av1_superres_scaled(cm)) { + features->allow_intrabc = 0; + } + + features->allow_intrabc &= (cpi->oxcf.enable_intrabc); + + if (features->allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + if (frame_probs->warped_probs[update_type] < + cpi->sf.inter_sf.prune_warped_prob_thresh) + features->allow_warped_motion = 0; + } + + int hash_table_created = 0; + if (!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) && + !cpi->sf.rt_sf.use_nonrd_pick_mode) { + // TODO(any): move this outside of the recoding loop to avoid recalculating + // the hash table. 
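+    // [Editorial note, not part of the original change] Reading aid for the
+    // buffers allocated below: hash values are built bottom-up (2x2, then
+    // 4x4, ... up to the superblock size), and block_hash_values[k] /
+    // is_block_same[k] act as ping-pong source/destination pairs via src_idx
+    // and dst_idx, so only two pyramid levels are resident at a time.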
+ // add to hash table + const int pic_width = cpi->source->y_crop_width; + const int pic_height = cpi->source->y_crop_height; + uint32_t *block_hash_values[2][2]; + int8_t *is_block_same[2][3]; + int k, j; + + for (k = 0; k < 2; k++) { + for (j = 0; j < 2; j++) { + CHECK_MEM_ERROR(cm, block_hash_values[k][j], + aom_malloc(sizeof(uint32_t) * pic_width * pic_height)); + } + + for (j = 0; j < 3; j++) { + CHECK_MEM_ERROR(cm, is_block_same[k][j], + aom_malloc(sizeof(int8_t) * pic_width * pic_height)); + } + } + + av1_hash_table_init(intrabc_hash_info); + av1_hash_table_create(&intrabc_hash_info->intrabc_hash_table); + hash_table_created = 1; + av1_generate_block_2x2_hash_value(intrabc_hash_info, cpi->source, + block_hash_values[0], is_block_same[0]); + // Hash data generated for screen contents is used for intraBC ME + const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize]; + const int max_sb_size = + (1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)); + int src_idx = 0; + for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) { + const int dst_idx = !src_idx; + av1_generate_block_hash_value( + intrabc_hash_info, cpi->source, size, block_hash_values[src_idx], + block_hash_values[dst_idx], is_block_same[src_idx], + is_block_same[dst_idx]); + if (size >= min_alloc_size) { + av1_add_to_hash_map_by_row_with_precal_data( + &intrabc_hash_info->intrabc_hash_table, block_hash_values[dst_idx], + is_block_same[dst_idx][2], pic_width, pic_height, size); + } + } + + for (k = 0; k < 2; k++) { + for (j = 0; j < 2; j++) { + aom_free(block_hash_values[k][j]); + } + + for (j = 0; j < 3; j++) { + aom_free(is_block_same[k][j]); + } + } + } + + const CommonQuantParams *quant_params = &cm->quant_params; + for (i = 0; i < MAX_SEGMENTS; ++i) { + const int qindex = + cm->seg.enabled ? av1_get_qindex(&cm->seg, i, quant_params->base_qindex) + : quant_params->base_qindex; + xd->lossless[i] = + qindex == 0 && quant_params->y_dc_delta_q == 0 && + quant_params->u_dc_delta_q == 0 && quant_params->u_ac_delta_q == 0 && + quant_params->v_dc_delta_q == 0 && quant_params->v_ac_delta_q == 0; + if (xd->lossless[i]) cpi->enc_seg.has_lossless_segment = 1; + xd->qindex[i] = qindex; + if (xd->lossless[i]) { + cpi->optimize_seg_arr[i] = NO_TRELLIS_OPT; + } else { + cpi->optimize_seg_arr[i] = cpi->sf.rd_sf.optimize_coefficients; + } + } + features->coded_lossless = is_coded_lossless(cm, xd); + features->all_lossless = features->coded_lossless && !av1_superres_scaled(cm); + + // Fix delta q resolution for the moment + cm->delta_q_info.delta_q_res = 0; + if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE; + else if (cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL) + cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL; + // Set delta_q_present_flag before it is used for the first time + cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES; + cm->delta_q_info.delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q; + + // Turn off cm->delta_q_info.delta_q_present_flag if objective delta_q is used + // for ineligible frames. That effectively will turn off row_mt usage. + // Note objective delta_q and tpl eligible frames are only altref frames + // currently. 
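+  // [Editorial note, not part of the original change] In other words, the
+  // gate below appears to clear the flag unless this frame actually went
+  // through TPL (currently only altref frames do), so per-superblock delta-q
+  // that would never be exercised is not signaled.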
+  if (cm->delta_q_info.delta_q_present_flag) {
+    if (cpi->oxcf.deltaq_mode == DELTA_Q_OBJECTIVE &&
+        !is_frame_tpl_eligible(cpi))
+      cm->delta_q_info.delta_q_present_flag = 0;
+  }
+
+  // Reset delta_q_used flag
+  cpi->deltaq_used = 0;
+
+  cm->delta_q_info.delta_lf_present_flag =
+      cm->delta_q_info.delta_q_present_flag && cpi->oxcf.deltalf_mode;
+  cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
+
+  // update delta_q_present_flag and delta_lf_present_flag based on
+  // base_qindex
+  cm->delta_q_info.delta_q_present_flag &= quant_params->base_qindex > 0;
+  cm->delta_q_info.delta_lf_present_flag &= quant_params->base_qindex > 0;
+
+  av1_frame_init_quantizer(cpi);
+  av1_initialize_rd_consts(cpi);
+  av1_initialize_me_consts(cpi, x, quant_params->base_qindex);
+
+  init_encode_frame_mb_context(cpi);
+  set_default_interp_skip_flags(cm, &cpi->interp_search_flags);
+  if (cm->prev_frame && cm->prev_frame->seg.enabled)
+    cm->last_frame_seg_map = cm->prev_frame->seg_map;
+  else
+    cm->last_frame_seg_map = NULL;
+  if (features->allow_intrabc || features->coded_lossless) {
+    av1_set_default_ref_deltas(cm->lf.ref_deltas);
+    av1_set_default_mode_deltas(cm->lf.mode_deltas);
+  } else if (cm->prev_frame) {
+    memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
+    memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas,
+           MAX_MODE_LF_DELTAS);
+  }
+  memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES);
+  memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+
+  cpi->all_one_sided_refs =
+      frame_is_intra_only(cm) ? 0 : refs_are_one_sided(cm);
+
+  cpi->prune_ref_frame_mask = 0;
+  // Figure out which ref frames can be skipped at frame level.
+  setup_prune_ref_frame_mask(cpi);
+
+  x->txb_split_count = 0;
+#if CONFIG_SPEED_STATS
+  x->tx_search_count = 0;
+#endif  // CONFIG_SPEED_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_compute_global_motion_time);
+#endif
+  av1_zero(rdc->global_motion_used);
+  av1_zero(gm_info->params_cost);
+  if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
+      cpi->oxcf.enable_global_motion && !gm_info->search_done) {
+    YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
+    MotionModel params_by_motion[RANSAC_NUM_MOTIONS];
+    for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) {
+      memset(&params_by_motion[m], 0, sizeof(params_by_motion[m]));
+      params_by_motion[m].inliers =
+          aom_malloc(sizeof(*(params_by_motion[m].inliers)) * 2 * MAX_CORNERS);
+    }
+
+    int num_frm_corners = -1;
+    int frm_corners[2 * MAX_CORNERS];
+    unsigned char *frm_buffer = cpi->source->y_buffer;
+    if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
+      // The frame buffer is 16-bit, so we need to convert to 8 bits for the
+      // following code. We cache the result until the frame is released.
+ frm_buffer = + av1_downconvert_frame(cpi->source, cpi->common.seq_params.bit_depth); + } + const int segment_map_w = + (cpi->source->y_width + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG; + const int segment_map_h = + (cpi->source->y_height + WARP_ERROR_BLOCK) >> WARP_ERROR_BLOCK_LOG; + + uint8_t *segment_map = + aom_malloc(sizeof(*segment_map) * segment_map_w * segment_map_h); + memset(segment_map, 0, + sizeof(*segment_map) * segment_map_w * segment_map_h); + + FrameDistPair future_ref_frame[REF_FRAMES - 1] = { + { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME }, + { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME }, + { -1, NONE_FRAME } + }; + FrameDistPair past_ref_frame[REF_FRAMES - 1] = { + { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME }, + { -1, NONE_FRAME }, { -1, NONE_FRAME }, { -1, NONE_FRAME }, + { -1, NONE_FRAME } + }; + int num_past_ref_frames = 0; + int num_future_ref_frames = 0; + // Populate ref_buf for valid ref frames in global motion + update_valid_ref_frames_for_gm(cpi, ref_buf, past_ref_frame, + future_ref_frame, &num_past_ref_frames, + &num_future_ref_frames); + + // Sort the ref frames in the ascending order of their distance from the + // current frame + qsort(past_ref_frame, num_past_ref_frames, sizeof(past_ref_frame[0]), + compare_distance); + qsort(future_ref_frame, num_future_ref_frames, sizeof(future_ref_frame[0]), + compare_distance); + + // Compute global motion w.r.t. past reference frames + if (num_past_ref_frames > 0) + compute_global_motion_for_references( + cpi, ref_buf, past_ref_frame, num_past_ref_frames, &num_frm_corners, + frm_corners, frm_buffer, params_by_motion, segment_map, segment_map_w, + segment_map_h); + + // Compute global motion w.r.t. future reference frames + if (num_future_ref_frames > 0) + compute_global_motion_for_references( + cpi, ref_buf, future_ref_frame, num_future_ref_frames, + &num_frm_corners, frm_corners, frm_buffer, params_by_motion, + segment_map, segment_map_w, segment_map_h); + + aom_free(segment_map); + + gm_info->search_done = 1; + for (int m = 0; m < RANSAC_NUM_MOTIONS; m++) { + aom_free(params_by_motion[m].inliers); + } + } + memcpy(cm->cur_frame->global_motion, cm->global_motion, + REF_FRAMES * sizeof(WarpedMotionParams)); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_compute_global_motion_time); +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_setup_motion_field_time); +#endif + if (features->allow_ref_frame_mvs) av1_setup_motion_field(cm); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_setup_motion_field_time); +#endif + + cm->current_frame.skip_mode_info.skip_mode_flag = + check_skip_mode_enabled(cpi); + + cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write_dummy; + cpi->row_mt = 0; + + if (cpi->oxcf.row_mt && (cpi->oxcf.max_threads > 1)) { + cpi->row_mt = 1; + cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write; + av1_encode_tiles_row_mt(cpi); + } else { + if (AOMMIN(cpi->oxcf.max_threads, cm->tiles.cols * cm->tiles.rows) > 1) + av1_encode_tiles_mt(cpi); + else + encode_tiles(cpi); + } + + // If intrabc is allowed but never selected, reset the allow_intrabc flag. 
+ if (features->allow_intrabc && !cpi->intrabc_used) { + features->allow_intrabc = 0; + } + if (features->allow_intrabc) { + cm->delta_q_info.delta_lf_present_flag = 0; + } + + if (cm->delta_q_info.delta_q_present_flag && cpi->deltaq_used == 0) { + cm->delta_q_info.delta_q_present_flag = 0; + } + + // Set the transform size appropriately before bitstream creation + const MODE_EVAL_TYPE eval_type = + cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch + ? WINNER_MODE_EVAL + : DEFAULT_EVAL; + const TX_SIZE_SEARCH_METHOD tx_search_type = + cpi->winner_mode_params.tx_size_search_methods[eval_type]; + assert(cpi->oxcf.enable_tx64 || tx_search_type != USE_LARGESTALL); + features->tx_mode = select_tx_mode(cm, tx_search_type); + + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + + for (i = 0; i < TX_SIZES_ALL; i++) { + int sum = 0; + int j; + int left = 1024; + + for (j = 0; j < TX_TYPES; j++) + sum += cpi->td.rd_counts.tx_type_used[i][j]; + + for (j = TX_TYPES - 1; j >= 0; j--) { + const int new_prob = + sum ? 1024 * cpi->td.rd_counts.tx_type_used[i][j] / sum + : (j ? 0 : 1024); + int prob = + (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1; + left -= prob; + if (j == 0) prob += left; + frame_probs->tx_type_probs[update_type][i][j] = prob; + } + } + } + + if (!cpi->sf.inter_sf.disable_obmc && + cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) { + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + + for (i = 0; i < BLOCK_SIZES_ALL; i++) { + int sum = 0; + for (int j = 0; j < 2; j++) sum += cpi->td.rd_counts.obmc_used[i][j]; + + const int new_prob = + sum ? 128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0; + frame_probs->obmc_probs[update_type][i] = + (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1; + } + } + + if (features->allow_warped_motion && + cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + int sum = 0; + for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i]; + const int new_prob = sum ? 128 * cpi->td.rd_counts.warped_used[1] / sum : 0; + frame_probs->warped_probs[update_type] = + (frame_probs->warped_probs[update_type] + new_prob) >> 1; + } + + if (cm->current_frame.frame_type != KEY_FRAME && + cpi->sf.interp_sf.adaptive_interp_filter_search == 2 && + features->interp_filter == SWITCHABLE) { + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) { + int sum = 0; + int j; + int left = 1536; + + for (j = 0; j < SWITCHABLE_FILTERS; j++) { + sum += cpi->td.counts->switchable_interp[i][j]; + } + + for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) { + const int new_prob = + sum ? 1536 * cpi->td.counts->switchable_interp[i][j] / sum + : (j ? 
0 : 1536);
+          int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+                      new_prob) >>
+                     1;
+          left -= prob;
+          if (j == 0) prob += left;
+          frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+        }
+      }
+    }
+
+  if ((!is_stat_generation_stage(cpi) && av1_use_hash_me(cpi) &&
+       !cpi->sf.rt_sf.use_nonrd_pick_mode) ||
+      hash_table_created) {
+    av1_hash_table_destroy(&intrabc_hash_info->intrabc_hash_table);
+  }
+}
+
+void av1_encode_frame(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  FeatureFlags *const features = &cm->features;
+  const int num_planes = av1_num_planes(cm);
+  // Indicates whether or not to use a default reduced set for ext-tx
+  // rather than the potential full set of 16 transforms
+  features->reduced_tx_set_used = cpi->oxcf.reduced_tx_type_set;
+
+  // Make sure segment_id is no larger than last_active_segid.
+  if (cm->seg.enabled && cm->seg.update_map) {
+    const int mi_rows = cm->mi_params.mi_rows;
+    const int mi_cols = cm->mi_params.mi_cols;
+    const int last_active_segid = cm->seg.last_active_segid;
+    uint8_t *map = cpi->enc_seg.map;
+    for (int mi_row = 0; mi_row < mi_rows; ++mi_row) {
+      for (int mi_col = 0; mi_col < mi_cols; ++mi_col) {
+        map[mi_col] = AOMMIN(map[mi_col], last_active_segid);
+      }
+      map += mi_cols;
+    }
+  }
+
+  av1_setup_frame_buf_refs(cm);
+  enforce_max_ref_frames(cpi, &cpi->ref_frame_flags);
+  set_rel_frame_dist(cpi);
+  av1_setup_frame_sign_bias(cm);
+
+#if CHECK_PRECOMPUTED_REF_FRAME_MAP
+  GF_GROUP *gf_group = &cpi->gf_group;
+  // TODO(yuec): The check is disabled on OVERLAY frames for now, because info
+  // in cpi->gf_group has been refreshed for the next GOP when the check is
+  // performed for OVERLAY frames. Since we do not yet support inter-GOP ref
+  // frame map computation, the precomputed ref map for an OVERLAY frame is
+  // all -1 at this point (although it is meaningful before gf_group is
+  // refreshed).
+ if (!frame_is_intra_only(cm) && gf_group->index != 0) { + const RefCntBuffer *const golden_buf = get_ref_frame_buf(cm, GOLDEN_FRAME); + + if (golden_buf) { + const int golden_order_hint = golden_buf->order_hint; + + for (int ref = LAST_FRAME; ref < EXTREF_FRAME; ++ref) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); + const int ref_disp_idx_precomputed = + gf_group->ref_frame_disp_idx[gf_group->index][ref - LAST_FRAME]; + + (void)ref_disp_idx_precomputed; + + if (buf != NULL) { + const int ref_disp_idx = + get_relative_dist(&cm->seq_params.order_hint_info, + buf->order_hint, golden_order_hint); + + if (ref_disp_idx >= 0) + assert(ref_disp_idx == ref_disp_idx_precomputed); + else + assert(ref_disp_idx_precomputed == -1); + } else { + assert(ref_disp_idx_precomputed == -1); + } + } + } + } +#endif + +#if CONFIG_MISMATCH_DEBUG + mismatch_reset_frame(num_planes); +#else + (void)num_planes; +#endif + + if (cpi->sf.hl_sf.frame_parameter_update) { + RD_COUNTS *const rdc = &cpi->td.rd_counts; + + if (frame_is_intra_only(cm)) + current_frame->reference_mode = SINGLE_REFERENCE; + else + current_frame->reference_mode = REFERENCE_MODE_SELECT; + + features->interp_filter = SWITCHABLE; + if (cm->tiles.large_scale) features->interp_filter = EIGHTTAP_REGULAR; + + features->switchable_motion_mode = 1; + + rdc->compound_ref_used_flag = 0; + rdc->skip_mode_used_flag = 0; + + encode_frame_internal(cpi); + + if (current_frame->reference_mode == REFERENCE_MODE_SELECT) { + // Use a flag that includes 4x4 blocks + if (rdc->compound_ref_used_flag == 0) { + current_frame->reference_mode = SINGLE_REFERENCE; +#if CONFIG_ENTROPY_STATS + av1_zero(cpi->td.counts->comp_inter); +#endif // CONFIG_ENTROPY_STATS + } + } + // Re-check on the skip mode status as reference mode may have been + // changed. 
+    SkipModeInfo *const skip_mode_info = &current_frame->skip_mode_info;
+    if (frame_is_intra_only(cm) ||
+        current_frame->reference_mode == SINGLE_REFERENCE) {
+      skip_mode_info->skip_mode_allowed = 0;
+      skip_mode_info->skip_mode_flag = 0;
+    }
+    if (skip_mode_info->skip_mode_flag && rdc->skip_mode_used_flag == 0)
+      skip_mode_info->skip_mode_flag = 0;
+
+    if (!cm->tiles.large_scale) {
+      if (features->tx_mode == TX_MODE_SELECT &&
+          cpi->td.mb.txb_split_count == 0)
+        features->tx_mode = TX_MODE_LARGEST;
+    }
+  } else {
+    encode_frame_internal(cpi);
+  }
+}
+
+static AOM_INLINE void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+                                         FRAME_COUNTS *counts, TX_SIZE tx_size,
+                                         int depth, int blk_row, int blk_col,
+                                         uint8_t allow_update_cdf) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+                                   xd->left_txfm_context + blk_row,
+                                   mbmi->sb_type, tx_size);
+  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+  assert(tx_size > TX_4X4);
+
+  if (depth == MAX_VARTX_DEPTH) {
+    // Don't add to counts in this case
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+    return;
+  }
+
+  if (tx_size == plane_tx_size) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->txfm_partition[ctx][0];
+#endif
+    if (allow_update_cdf)
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 0, 2);
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+  } else {
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->txfm_partition[ctx][1];
+#endif
+    if (allow_update_cdf)
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[ctx], 1, 2);
+    ++x->txb_split_count;
+
+    if (sub_txs == TX_4X4) {
+      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + blk_col,
+                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+      return;
+    }
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        int offsetr = row;
+        int offsetc = col;
+
+        update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
+                          blk_col + offsetc, allow_update_cdf);
+      }
+    }
+  }
+}
+
+static AOM_INLINE void tx_partition_count_update(const AV1_COMMON *const cm,
+                                                 MACROBLOCK *x,
+                                                 BLOCK_SIZE plane_bsize,
+                                                 FRAME_COUNTS *td_counts,
+                                                 uint8_t allow_update_cdf) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      update_txfm_count(x, xd, td_counts, max_tx_size, 0, idy, idx,
+                        allow_update_cdf);
+    }
+  }
+}
+
+static AOM_INLINE void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size,
+                                        int blk_row, int blk_col) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int max_blocks_high = max_block_high(xd, bsize, 0);
+  const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+  const int txb_size_index = av1_get_txb_size_index(bsize, blk_row, blk_col);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[txb_size_index];
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  if (tx_size == plane_tx_size) {
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+
+  } else {
+    if (tx_size == TX_8X8) {
+      mbmi->inter_tx_size[txb_size_index] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + blk_col,
+                            xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+      return;
+    }
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+        set_txfm_context(xd, sub_txs, offsetr, offsetc);
+      }
+    }
+  }
+}
+
+static AOM_INLINE void tx_partition_set_contexts(const AV1_COMMON *const cm,
+                                                 MACROBLOCKD *xd,
+                                                 BLOCK_SIZE plane_bsize) {
+  const int mi_width = mi_size_wide[plane_bsize];
+  const int mi_height = mi_size_high[plane_bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+
+  xd->above_txfm_context =
+      cm->above_contexts.txfm[xd->tile.tile_row] + xd->mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (xd->mi_row & MAX_MIB_MASK);
+
+  for (int idy = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      set_txfm_context(xd, max_tx_size, idy, idx);
+    }
+  }
+}
+
+static AOM_INLINE void encode_superblock(const AV1_COMP *const cpi,
+                                         TileDataEnc *tile_data, ThreadData *td,
+                                         TOKENEXTRA **t, RUN_TYPE dry_run,
+                                         BLOCK_SIZE bsize, int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO **mi_4x4 = xd->mi;
+  MB_MODE_INFO *mbmi = mi_4x4[0];
+  const int seg_skip =
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+  const int mis = cm->mi_params.mi_stride;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const int is_inter = is_inter_block(mbmi);
+
+  // Initialize tx_mode and tx_size_search_method
+  set_tx_size_search_method(
+      cm, &cpi->winner_mode_params, x,
+      cpi->sf.winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  if (!is_inter) {
+    xd->cfl.store_y = store_cfl_required(cm, xd);
+    mbmi->skip = 1;
+    for (int plane = 0; plane < num_planes; ++plane) {
+      av1_encode_intra_block_plane(cpi, x, bsize, plane, dry_run,
+                                   cpi->optimize_seg_arr[mbmi->segment_id]);
+    }
+
+    // If there is at least one lossless segment, force the skip for intra
+    // block to be 0, in order to avoid the segment_id being changed in
+    // write_segment_id().
+ if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map && + cpi->enc_seg.has_lossless_segment) + mbmi->skip = 0; + + xd->cfl.store_y = 0; + if (av1_allow_palette(cm->features.allow_screen_content_tools, bsize)) { + for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) { + if (mbmi->palette_mode_info.palette_size[plane] > 0) { + if (!dry_run) { + av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size, + PALETTE_MAP, tile_data->allow_update_cdf, + td->counts); + } else if (dry_run == DRY_RUN_COSTCOEFFS) { + rate += + av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP); + } + } + } + } + + av1_update_txb_context(cpi, td, dry_run, bsize, + tile_data->allow_update_cdf); + } else { + int ref; + const int is_compound = has_second_ref(mbmi); + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + for (ref = 0; ref < 1 + is_compound; ++ref) { + const YV12_BUFFER_CONFIG *cfg = + get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]); + assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); + av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, + xd->block_ref_scale_factors[ref], num_planes); + } + int start_plane = (cpi->sf.rt_sf.reuse_inter_pred_nonrd) ? 1 : 0; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + start_plane, av1_num_planes(cm) - 1); + if (mbmi->motion_mode == OBMC_CAUSAL) { + assert(cpi->oxcf.enable_obmc == 1); + av1_build_obmc_inter_predictors_sb(cm, xd); + } + +#if CONFIG_MISMATCH_DEBUG + if (dry_run == OUTPUT_ENABLED) { + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + continue; + mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, + cm->current_frame.order_hint, plane, pixel_c, + pixel_r, pd->width, pd->height, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } + } +#else + (void)num_planes; +#endif + + av1_encode_sb(cpi, x, bsize, dry_run); + av1_tokenize_sb_vartx(cpi, td, dry_run, bsize, rate, + tile_data->allow_update_cdf); + } + + if (!dry_run) { + if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi)) td->intrabc_used = 1; + if (x->tx_mode_search_type == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id] && mbmi->sb_type > BLOCK_4X4 && + !(is_inter && (mbmi->skip || seg_skip))) { + if (is_inter) { + tx_partition_count_update(cm, x, bsize, td->counts, + tile_data->allow_update_cdf); + } else { + if (mbmi->tx_size != max_txsize_rect_lookup[bsize]) + ++x->txb_split_count; + if (block_signals_txsize(bsize)) { + const int tx_size_ctx = get_tx_size_context(xd); + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(mbmi->tx_size, bsize); + const int max_depths = bsize_to_max_depth(bsize); + + if (tile_data->allow_update_cdf) + update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx], + depth, max_depths + 1); +#if CONFIG_ENTROPY_STATS + ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth]; +#endif + } + } + assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi))); + } else { + int i, j; + TX_SIZE intra_tx_size; + // The new intra coding scheme requires no change of transform size + if (is_inter) { + if (xd->lossless[mbmi->segment_id]) { + intra_tx_size = TX_4X4; + } else { + intra_tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type); + } + } else { + intra_tx_size = 
mbmi->tx_size; + } + + for (j = 0; j < mi_height; j++) + for (i = 0; i < mi_width; i++) + if (mi_col + i < cm->mi_params.mi_cols && + mi_row + j < cm->mi_params.mi_rows) + mi_4x4[mis * j + i]->tx_size = intra_tx_size; + + if (intra_tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count; + } + } + + if (x->tx_mode_search_type == TX_MODE_SELECT && + block_signals_txsize(mbmi->sb_type) && is_inter && + !(mbmi->skip || seg_skip) && !xd->lossless[mbmi->segment_id]) { + if (dry_run) tx_partition_set_contexts(cm, xd, bsize); + } else { + TX_SIZE tx_size = mbmi->tx_size; + // The new intra coding scheme requires no change of transform size + if (is_inter) { + if (xd->lossless[mbmi->segment_id]) { + tx_size = TX_4X4; + } else { + tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type); + } + } else { + tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4; + } + mbmi->tx_size = tx_size; + set_txfm_ctxs(tx_size, xd->width, xd->height, + (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd); + } + + if (is_inter_block(mbmi) && !xd->is_chroma_ref && is_cfl_allowed(xd)) { + cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size); + } +} diff --git a/libs/libaom/src/av1/encoder/encodeframe.h b/libs/libaom/src/av1/encoder/encodeframe.h new file mode 100644 index 000000000..e4c484105 --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodeframe.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_ +#define AOM_AV1_ENCODER_ENCODEFRAME_H_ + +#include "aom/aom_integer.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define DELTA_Q_PERCEPTUAL_MODULATION \ + 1 // 0: variance based + // 1: wavelet AC energy based + +struct macroblock; +struct yv12_buffer_config; +struct AV1_COMP; +struct ThreadData; + +void av1_setup_src_planes(struct macroblock *x, + const struct yv12_buffer_config *src, int mi_row, + int mi_col, const int num_planes, BLOCK_SIZE bsize); + +void av1_encode_frame(struct AV1_COMP *cpi); + +void av1_alloc_tile_data(struct AV1_COMP *cpi); +void av1_init_tile_data(struct AV1_COMP *cpi); +void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row, + int tile_col); +void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col, int mi_row); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_ diff --git a/libs/libaom/src/av1/encoder/encodemb.c b/libs/libaom/src/av1/encoder/encodemb.c new file mode 100644 index 000000000..ec3336229 --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodemb.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/bitwriter.h" +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + +#include "av1/common/cfl.h" +#include "av1/common/idct.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/scan.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" + +void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols, + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src8, ptrdiff_t src_stride, + const uint8_t *pred8, ptrdiff_t pred_stride) { + assert(rows >= 4 && cols >= 4); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, + pred8, pred_stride, xd->bd); + return; + } +#endif + (void)xd; + aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8, + pred_stride); +} + +void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, + int blk_col, int blk_row, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; + const int diff_stride = block_size_wide[plane_bsize]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const int tx1d_width = tx_size_wide[tx_size]; + const int tx1d_height = tx_size_high[tx_size]; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; + int16_t *src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2]; + av1_subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src, + src_stride, dst, dst_stride); +} + +void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) { + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; + assert(plane_bsize < BLOCK_SIZES_ALL); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + const MACROBLOCKD *xd = &x->e_mbd; + + av1_subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); +} + +int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int fast_mode, + int *rate_cost) { + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + const int eob = p->eobs[block]; + const int segment_id = xd->mi[0]->segment_id; + + if (eob == 0 || !cpi->optimize_seg_arr[segment_id] || + xd->lossless[segment_id]) { + *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size); + return eob; + } + + return av1_optimize_txb_new(cpi, x, plane, block, tx_size, tx_type, txb_ctx, + rate_cost, cpi->oxcf.sharpness, fast_mode); +} + +// Hyper-parameters for dropout optimization, based on 
the following logic.
+// TODO(yjshen): These settings are tuned by experiments. They may still be
+// optimized for better performance.
+// (1) Coefficients which are large enough will ALWAYS be kept.
+const tran_low_t DROPOUT_COEFF_MAX = 2;  // Max dropout-able coefficient.
+// (2) Continuous coefficients will ALWAYS be kept. Here rigorous continuity is
+//     NOT required. For example, `5 0 0 0 7` is treated as two continuous
+//     coefficients if three zeros do not fulfill the dropout condition.
+const int DROPOUT_CONTINUITY_MAX = 2;  // Max dropout-able continuous coeff.
+// (3) Dropout operation is NOT applicable to blocks with large or small
+//     quantization index.
+const int DROPOUT_Q_MAX = 128;
+const int DROPOUT_Q_MIN = 16;
+// (4) Recall that dropout optimization will forcibly set some quantized
+//     coefficients to zero. The key logic for determining whether a
+//     coefficient should be dropped is to check the number of continuous
+//     zeros before AND after this coefficient. The exact number of zeros for
+//     judgement depends on block size and quantization index. More
+//     concretely, block size determines the base number of zeros, while
+//     quantization index determines the multiplier. Intuitively, a larger
+//     block requires more zeros, and a larger quantization index also
+//     requires more zeros (more information is lost when using a larger
+//     quantization index).
+const int DROPOUT_BEFORE_BASE_MAX = 32;  // Max base number for leading zeros.
+const int DROPOUT_BEFORE_BASE_MIN = 16;  // Min base number for leading zeros.
+const int DROPOUT_AFTER_BASE_MAX = 32;   // Max base number for trailing zeros.
+const int DROPOUT_AFTER_BASE_MIN = 16;   // Min base number for trailing zeros.
+const int DROPOUT_MULTIPLIER_MAX = 8;    // Max multiplier on number of zeros.
+const int DROPOUT_MULTIPLIER_MIN = 2;    // Min multiplier on number of zeros.
+const int DROPOUT_MULTIPLIER_Q_BASE = 32;  // Base Q to compute multiplier.
+
+void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+                        TX_TYPE tx_type, int qindex) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  const struct macroblock_plane *const p = &mb->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+  tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
+  const int tx_width = tx_size_wide[tx_size];
+  const int tx_height = tx_size_high[tx_size];
+  const int max_eob = av1_get_max_eob(tx_size);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+
+  // Early return if `qindex` is out of range.
+  if (qindex > DROPOUT_Q_MAX || qindex < DROPOUT_Q_MIN) {
+    return;
+  }
+
+  // Compute number of zeros used for dropout judgement.
+  const int base_size = AOMMAX(tx_width, tx_height);
+  const int multiplier = CLIP(qindex / DROPOUT_MULTIPLIER_Q_BASE,
+                              DROPOUT_MULTIPLIER_MIN, DROPOUT_MULTIPLIER_MAX);
+  const int dropout_num_before =
+      multiplier *
+      CLIP(base_size, DROPOUT_BEFORE_BASE_MIN, DROPOUT_BEFORE_BASE_MAX);
+  const int dropout_num_after =
+      multiplier *
+      CLIP(base_size, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX);
+
+  // Early return if there are not enough non-zero coefficients.
+  if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before) {
+    return;
+  }
+
+  int count_zeros_before = 0;
+  int count_zeros_after = 0;
+  int count_nonzeros = 0;
+  // Index of the first non-zero coefficient after a sufficient number of
+  // continuous zeros. If it equals `-1`, the number of leading zeros hasn't
+  // reached `dropout_num_before`.
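+  // [Editorial note, not part of the original change] Worked example of the
+  // thresholds computed above: with qindex = 64 and a 16x16 transform,
+  // multiplier = CLIP(64 / 32, 2, 8) = 2 and base_size = 16, giving
+  // dropout_num_before = 2 * CLIP(16, 16, 32) = 32 and dropout_num_after =
+  // 32, so a small coefficient survives unless roughly 32 continuous zeros
+  // surround it on each side.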
+ int idx = -1; + int eob = 0; // New end of block. + + for (int i = 0; i < p->eobs[block]; ++i) { + const int scan_idx = scan_order->scan[i]; + if (qcoeff[scan_idx] > DROPOUT_COEFF_MAX) { // Keep large coefficients. + count_zeros_before = 0; + count_zeros_after = 0; + idx = -1; + eob = i + 1; + } else if (qcoeff[scan_idx] == 0) { // Count zeros. + if (idx == -1) { + ++count_zeros_before; + } else { + ++count_zeros_after; + } + } else { // Count non-zeros. + if (count_zeros_before >= dropout_num_before) { + idx = (idx == -1) ? i : idx; + ++count_nonzeros; + } else { + count_zeros_before = 0; + eob = i + 1; + } + } + + // Handle continuity. + if (count_nonzeros > DROPOUT_CONTINUITY_MAX) { + count_zeros_before = 0; + count_zeros_after = 0; + idx = -1; + eob = i + 1; + } + + // Handle the trailing zeros after original end of block. + if (idx != -1 && i == p->eobs[block] - 1) { + count_zeros_after += (max_eob - p->eobs[block]); + } + + // Set redundant coefficients to zeros if needed. + if (count_zeros_after >= dropout_num_after) { + for (int j = idx; j <= i; ++j) { + qcoeff[scan_order->scan[j]] = 0; + dqcoeff[scan_order->scan[j]] = 0; + } + count_zeros_before += (i - idx + 1); + count_zeros_after = 0; + count_nonzeros = 0; + } else if (i == p->eobs[block] - 1) { + eob = i + 1; + } + } + + if (eob != p->eobs[block]) { + p->eobs[block] = eob; + p->txb_entropy_ctx[block] = + (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, eob); + } +} + +// Settings for optimization type. NOTE: To set optimization type for all intra +// frames, both `KEY_BLOCK_OPT_TYPE` and `INTRA_BLOCK_OPT_TYPE` should be set. +// TODO(yjshen): These settings are hard-coded and look okay for now. They +// should be made configurable later. +// Blocks of key frames ONLY. +const OPT_TYPE KEY_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; +// Blocks of intra frames (key frames EXCLUSIVE). +const OPT_TYPE INTRA_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT; +// Blocks of inter frames. (NOTE: Dropout optimization is DISABLED by default +// if trellis optimization is on for inter frames.) 
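+// [Editorial note, not part of the original change] For orientation, these
+// constants are consumed in encode_block() later in this file, roughly as
+// sketched here (disabled, illustrative only):
+#if 0
+  const bool do_trellis = INTER_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+                          INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT;
+  const bool do_dropout = INTER_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+                          INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT;
+  // Dropout then runs only when trellis (use_optimize_b) is off, which is
+  // what the NOTE above means by "disabled by default".
+#endif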
+const OPT_TYPE INTER_BLOCK_OPT_TYPE = TRELLIS_DROPOUT_OPT;
+
+enum {
+  QUANT_FUNC_LOWBD = 0,
+  QUANT_FUNC_HIGHBD = 1,
+  QUANT_FUNC_TYPES = 2
+} UENUM1BYTE(QUANT_FUNC);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static AV1_QUANT_FACADE
+    quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
+      { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade },
+      { av1_quantize_b_facade, av1_highbd_quantize_b_facade },
+      { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade },
+      { NULL, NULL }
+    };
+#else
+static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES] = {
+  av1_quantize_fp_facade, av1_quantize_b_facade, av1_quantize_dc_facade, NULL
+};
+#endif
+
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+                     int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+                     QUANT_PARAM *qparam) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const SCAN_ORDER *const scan_order =
+      get_scan(txfm_param->tx_size, txfm_param->tx_type);
+  const int block_offset = BLOCK_OFFSET(block);
+  tran_low_t *const coeff = p->coeff + block_offset;
+  tran_low_t *const qcoeff = p->qcoeff + block_offset;
+  tran_low_t *const dqcoeff = pd->dqcoeff + block_offset;
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = block_size_wide[plane_bsize];
+
+  const int src_offset = (blk_row * diff_stride + blk_col);
+  const int16_t *src_diff = &p->src_diff[src_offset << MI_SIZE_LOG2];
+
+  av1_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+
+  if (qparam->xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+    const int n_coeffs = av1_get_max_eob(txfm_param->tx_size);
+    if (LIKELY(!x->skip_block)) {
+#if CONFIG_AV1_HIGHBITDEPTH
+      quant_func_list[qparam->xform_quant_idx][txfm_param->is_hbd](
+          coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
+#else
+      quant_func_list[qparam->xform_quant_idx](
+          coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, qparam);
+#endif
+    } else {
+      av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
+    }
+  }
+  // If use_optimize_b is true, av1_optimize_b will be called, so the entropy
+  // ctx cannot be updated now (that is performed in optimize_b).
+  if (qparam->use_optimize_b) {
+    p->txb_entropy_ctx[block] = 0;
+  } else {
+    p->txb_entropy_ctx[block] =
+        (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+  }
+  return;
+}
+
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+                     TX_TYPE tx_type, TxfmParam *txfm_param) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  txfm_param->tx_type = tx_type;
+  txfm_param->tx_size = tx_size;
+  txfm_param->lossless = xd->lossless[mbmi->segment_id];
+  txfm_param->tx_set_type = av1_get_ext_tx_set_type(
+      tx_size, is_inter_block(mbmi), cm->features.reduced_tx_set_used);
+
+  txfm_param->bd = xd->bd;
+  txfm_param->is_hbd = is_cur_buf_hbd(xd);
+}
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+                     int use_quant_b_adapt, QUANT_PARAM *qparam) {
+  qparam->log_scale = av1_get_tx_scale(tx_size);
+  qparam->tx_size = tx_size;
+
+  qparam->use_quant_b_adapt = use_quant_b_adapt;
+
+  // TODO(bohanli): optimize_b and the quantization idx are related, but the
+  // relationship is buried and complicated across different encoding stages.
+ // Should have a unified function to derive quant_idx, rather than + // determine and pass in the quant_idx + qparam->use_optimize_b = use_optimize_b; + qparam->xform_quant_idx = xform_quant_idx; + + qparam->qmatrix = NULL; + qparam->iqmatrix = NULL; +} +void av1_setup_qmatrix(const CommonQuantParams *quant_params, + const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, + TX_TYPE tx_type, QUANT_PARAM *qparam) { + qparam->qmatrix = av1_get_qmatrix(quant_params, xd, plane, tx_size, tx_type); + qparam->iqmatrix = + av1_get_iqmatrix(quant_params, xd, plane, tx_size, tx_type); +} + +static void encode_block(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg, + RUN_TYPE dry_run) { + (void)dry_run; + struct encode_b_args *const args = arg; + const AV1_COMP *const cpi = args->cpi; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block); + uint8_t *dst; + ENTROPY_CONTEXT *a, *l; + int dummy_rate_cost = 0; + + const int bw = mi_size_wide[plane_bsize]; + dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; + + a = &args->ta[blk_col]; + l = &args->tl[blk_row]; + + TX_TYPE tx_type = DCT_DCT; + if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) { + tx_type = av1_get_tx_type(xd, pd->plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + TxfmParam txfm_param; + QUANT_PARAM quant_param; + const int use_trellis = is_trellis_used(args->enable_optimize_b, dry_run); + int quant_idx; + if (use_trellis) + quant_idx = AV1_XFORM_QUANT_FP; + else + quant_idx = + USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP; + av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param); + av1_setup_quant(tx_size, use_trellis, quant_idx, cpi->oxcf.quant_b_adapt, + &quant_param); + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type, + &quant_param); + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + // Whether trellis or dropout optimization is required for inter frames. + const bool do_trellis = INTER_BLOCK_OPT_TYPE == TRELLIS_OPT || + INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT; + const bool do_dropout = INTER_BLOCK_OPT_TYPE == DROPOUT_OPT || + INTER_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT; + + if (quant_param.use_optimize_b && do_trellis) { + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, + args->cpi->sf.rd_sf.trellis_eob_fast, &dummy_rate_cost); + } + if (!quant_param.use_optimize_b && do_dropout) { + av1_dropout_qcoeff(x, plane, block, tx_size, tx_type, + cm->quant_params.base_qindex); + } + } else { + p->eobs[block] = 0; + p->txb_entropy_ctx[block] = 0; + } + + av1_set_txb_context(x, plane, block, tx_size, a, l); + + if (p->eobs[block]) { + *(args->skip) = 0; + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + pd->dst.stride, p->eobs[block], + cm->features.reduced_tx_set_used); + } + + // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0 + // case. It is possible that certain collision in hash index would cause + // the assertion failure. 
To further optimize the rate-distortion
+  // performance, we need to re-visit this part and enable this assert
+  // again.
+  if (p->eobs[block] == 0 && plane == 0) {
+#if 0
+    if (args->cpi->oxcf.aq_mode == NO_AQ &&
+        args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+      // TODO(jingning,angiebird,huisu@google.com): enable txk_check when
+      // enable_optimize_b is true to detect potential RD bug.
+      const uint8_t disable_txk_check = args->enable_optimize_b;
+      if (!disable_txk_check) {
+        assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] ==
+               DCT_DCT);
+      }
+    }
+#endif
+    update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+  }
+
+#if CONFIG_MISMATCH_DEBUG
+  if (dry_run == OUTPUT_ENABLED) {
+    int pixel_c, pixel_r;
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int blk_w = block_size_wide[bsize];
+    int blk_h = block_size_high[bsize];
+    mi_to_pixel_loc(&pixel_c, &pixel_r, xd->mi_col, xd->mi_row, blk_col,
+                    blk_row, pd->subsampling_x, pd->subsampling_y);
+    mismatch_record_block_tx(dst, pd->dst.stride, cm->current_frame.order_hint,
+                             plane, pixel_c, pixel_r, blk_w, blk_h,
+                             xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+  }
+#endif
+}
+
+static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               void *arg, RUN_TYPE dry_run) {
+  struct encode_b_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  const TX_SIZE plane_tx_size =
+      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
+  if (!plane) {
+    assert(tx_size_wide[tx_size] >= tx_size_wide[plane_tx_size] &&
+           tx_size_high[tx_size] >= tx_size_high[plane_tx_size]);
+  }
+
+  if (tx_size == plane_tx_size || plane) {
+    encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg,
+                 dry_run);
+  } else {
+    assert(tx_size < TX_SIZES_ALL);
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
+    assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
+    // This is the square transform block partition entry point.
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    const int step = bsh * bsw;
+    assert(bsw > 0 && bsh > 0);
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+        encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
+                           arg, dry_run);
+        block += step;
+      }
+    }
+  }
+}
+
+void av1_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  // Block and transform sizes, in units of 4x4 blocks, log 2 ("*_b"):
+  // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8.
+  // The transform size varies per plane; look it up in a common way.
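
The body that follows walks the plane in two levels: the outer loops step through 64x64 processing units, and the inner loops step one transform block at a time, both clipped to the visible part of the plane. A minimal standalone sketch of the same walk follows, with all sizes in 4x4 units; every toy_* name is invented for illustration and is not a libaom API.

#include <stdio.h>

#define TOY_MIN(a, b) ((a) < (b) ? (a) : (b))

/* Visit every transform block in a plane: outer loops advance in
   processing units (mu_rows x mu_cols), inner loops advance one
   transform block (txh x txw) at a time, clipped to the visible area. */
static void toy_foreach_tx_block(int max_rows, int max_cols, int mu_rows,
                                 int mu_cols, int txh, int txw) {
  int block = 0;
  const int step = txw * txh; /* coefficient-offset advance per block */
  for (int r = 0; r < max_rows; r += mu_rows) {
    const int unit_h = TOY_MIN(r + mu_rows, max_rows);
    for (int c = 0; c < max_cols; c += mu_cols) {
      const int unit_w = TOY_MIN(c + mu_cols, max_cols);
      for (int br = r; br < unit_h; br += txh) {
        for (int bc = c; bc < unit_w; bc += txw) {
          printf("visit block %d at (%d,%d)\n", block, br, bc);
          block += step;
        }
      }
    }
  }
}

int main(void) {
  /* A 32x32-pixel plane (8x8 in 4x4 units), 16x16 processing units,
     8x8 transform blocks. */
  toy_foreach_tx_block(8, 8, 4, 4, 2, 2);
  return 0;
}
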
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const uint8_t txw_unit = tx_size_wide_unit[tx_size]; + const uint8_t txh_unit = tx_size_high_unit[tx_size]; + const int step = txw_unit * txh_unit; + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y); + const int mu_blocks_wide = + AOMMIN(mi_size_wide[max_unit_bsize], max_blocks_wide); + const int mu_blocks_high = + AOMMIN(mi_size_high[max_unit_bsize], max_blocks_high); + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + int i = 0; + for (int r = 0; r < max_blocks_high; r += mu_blocks_high) { + const int unit_height = AOMMIN(mu_blocks_high + r, max_blocks_high); + // Skip visiting the sub blocks that are wholly within the UMV. + for (int c = 0; c < max_blocks_wide; c += mu_blocks_wide) { + const int unit_width = AOMMIN(mu_blocks_wide + c, max_blocks_wide); + for (int blk_row = r; blk_row < unit_height; blk_row += txh_unit) { + for (int blk_col = c; blk_col < unit_width; blk_col += txw_unit) { + visit(plane, i, blk_row, blk_col, plane_bsize, tx_size, arg); + i += step; + } + } + } + } +} + +typedef struct encode_block_pass1_args { + AV1_COMP *cpi; + MACROBLOCK *x; +} encode_block_pass1_args; + +static void encode_block_pass1(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + encode_block_pass1_args *args = (encode_block_pass1_args *)arg; + AV1_COMP *cpi = args->cpi; + AV1_COMMON *cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *const dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block); + + uint8_t *dst; + dst = &pd->dst.buf[(blk_row * pd->dst.stride + blk_col) << MI_SIZE_LOG2]; + + TxfmParam txfm_param; + QUANT_PARAM quant_param; + + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt, + &quant_param); + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, DCT_DCT, + &quant_param); + + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + if (p->eobs[block] > 0) { + txfm_param.eob = p->eobs[block]; + if (txfm_param.is_hbd) { + av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); + return; + } + av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param); + } +} + +void av1_encode_sby_pass1(AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize) { + encode_block_pass1_args args = { cpi, x }; + av1_subtract_plane(x, bsize, 0); + av1_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, + encode_block_pass1, &args); +} + +void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + RUN_TYPE dry_run) { + assert(bsize < BLOCK_SIZES_ALL); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->skip = 1; + if (x->force_skip) return; + + struct optimize_ctx ctx; + struct encode_b_args arg = { + cpi, x, &ctx, &mbmi->skip, + NULL, NULL, dry_run, cpi->optimize_seg_arr[mbmi->segment_id] + }; + const AV1_COMMON 
*const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int subsampling_x = pd->subsampling_x; + const int subsampling_y = pd->subsampling_y; + if (plane && !xd->is_chroma_ref) break; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, subsampling_x, subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; + const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); + const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size]; + const int bw = mi_size_wide[txb_size]; + const int bh = mi_size_high[txb_size]; + int block = 0; + const int step = + tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; + av1_get_entropy_contexts(plane_bsize, pd, ctx.ta[plane], ctx.tl[plane]); + av1_subtract_plane(x, plane_bsize, plane); + arg.ta = ctx.ta[plane]; + arg.tl = ctx.tl[plane]; + const BLOCK_SIZE max_unit_bsize = + get_plane_block_size(BLOCK_64X64, subsampling_x, subsampling_y); + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide); + mu_blocks_high = AOMMIN(mi_height, mu_blocks_high); + + for (int idy = 0; idy < mi_height; idy += mu_blocks_high) { + for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) { + int blk_row, blk_col; + const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height); + const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width); + for (blk_row = idy; blk_row < unit_height; blk_row += bh) { + for (blk_col = idx; blk_col < unit_width; blk_col += bw) { + encode_block_inter(plane, block, blk_row, blk_col, plane_bsize, + max_tx_size, &arg, dry_run); + block += step; + } + } + } + } + } +} + +static void encode_block_intra_and_set_context(int plane, int block, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, + arg); + + struct encode_b_args *const args = arg; + MACROBLOCK *x = args->x; + ENTROPY_CONTEXT *a = &args->ta[blk_col]; + ENTROPY_CONTEXT *l = &args->tl[blk_row]; + av1_set_txb_context(x, plane, block, tx_size, a, l); +} + +void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct encode_b_args *const args = arg; + const AV1_COMP *const cpi = args->cpi; + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block); + PLANE_TYPE plane_type = get_plane_type(plane); + uint16_t *eob = &p->eobs[block]; + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + int dummy_rate_cost = 0; + + av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); + + TX_TYPE tx_type = DCT_DCT; + const int bw = mi_size_wide[plane_bsize]; + if (plane == 0 && is_blk_skip(x, plane, blk_row * bw + blk_col)) { + *eob = 0; + p->txb_entropy_ctx[block] = 0; + } else { + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); + + const ENTROPY_CONTEXT *a = &args->ta[blk_col]; + const ENTROPY_CONTEXT *l = 
&args->tl[blk_row];
+    tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
+                              cm->features.reduced_tx_set_used);
+    TxfmParam txfm_param;
+    QUANT_PARAM quant_param;
+    const int use_trellis =
+        is_trellis_used(args->enable_optimize_b, args->dry_run);
+    int quant_idx;
+    if (use_trellis)
+      quant_idx = AV1_XFORM_QUANT_FP;
+    else
+      quant_idx =
+          USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP;
+
+    av1_setup_xform(cm, x, tx_size, tx_type, &txfm_param);
+    av1_setup_quant(tx_size, use_trellis, quant_idx, cpi->oxcf.quant_b_adapt,
+                    &quant_param);
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+
+    // Whether trellis or dropout optimization is required: KEY_BLOCK_OPT_TYPE
+    // governs blocks in intra-only frames, INTRA_BLOCK_OPT_TYPE governs intra
+    // blocks in inter frames.
+    const bool do_trellis = (frame_is_intra_only(cm) &&
+                             (KEY_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+                              KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+                            (!frame_is_intra_only(cm) &&
+                             (INTRA_BLOCK_OPT_TYPE == TRELLIS_OPT ||
+                              INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+    const bool do_dropout = (frame_is_intra_only(cm) &&
+                             (KEY_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+                              KEY_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT)) ||
+                            (!frame_is_intra_only(cm) &&
+                             (INTRA_BLOCK_OPT_TYPE == DROPOUT_OPT ||
+                              INTRA_BLOCK_OPT_TYPE == TRELLIS_DROPOUT_OPT));
+
+    if (quant_param.use_optimize_b && do_trellis) {
+      TXB_CTX txb_ctx;
+      get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+      av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+                     args->cpi->sf.rd_sf.trellis_eob_fast, &dummy_rate_cost);
+    }
+    if (do_dropout) {
+      av1_dropout_qcoeff(x, plane, block, tx_size, tx_type,
+                         cm->quant_params.base_qindex);
+    }
+  }
+
+  if (*eob) {
+    av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+                                dst_stride, *eob,
+                                cm->features.reduced_tx_set_used);
+  }
+
+  // TODO(jingning): Temporarily disable txk_type check for eob=0 case.
+  // It is possible that certain collision in hash index would cause
+  // the assertion failure. To further optimize the rate-distortion
+  // performance, we need to re-visit this part and enable this assert
+  // again.
+  if (*eob == 0 && plane == 0) {
+#if 0
+    if (args->cpi->oxcf.aq_mode == NO_AQ
+        && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+      assert(xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col] ==
+             DCT_DCT);
+    }
+#endif
+    update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT);
+  }
+
+  // For intra mode, skipped blocks are so rare that transmitting skip=1 is
+  // very expensive.
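
As the do_trellis/do_dropout selection above shows, the OPT_TYPE policy constants act as a pair of flags: TRELLIS_OPT enables only trellis, DROPOUT_OPT only dropout, and TRELLIS_DROPOUT_OPT both, with dropout running after trellis to prune stray nonzero coefficients trellis may leave behind. A minimal standalone sketch of that decoding, using a toy mirror of the OPT_TYPE enum declared in encodemb.h (all toy_* names are invented):

#include <stdbool.h>
#include <stdio.h>

/* Toy mirror of the OPT_TYPE policy enum from encodemb.h. */
enum toy_opt_type { TOY_NONE, TOY_TRELLIS, TOY_DROPOUT, TOY_TRELLIS_DROPOUT };

int main(void) {
  for (int t = TOY_NONE; t <= TOY_TRELLIS_DROPOUT; ++t) {
    const bool do_trellis = (t == TOY_TRELLIS || t == TOY_TRELLIS_DROPOUT);
    const bool do_dropout = (t == TOY_DROPOUT || t == TOY_TRELLIS_DROPOUT);
    printf("policy %d: trellis=%d dropout=%d\n", t, do_trellis, do_dropout);
  }
  return 0;
}
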
+ *(args->skip) = 0; + + if (plane == AOM_PLANE_Y && xd->cfl.store_y) { + cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); + } +} + +void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run, + TRELLIS_OPT_TYPE enable_optimize_b) { + assert(bsize < BLOCK_SIZES_ALL); + const MACROBLOCKD *const xd = &x->e_mbd; + if (plane && !xd->is_chroma_ref) return; + + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + ENTROPY_CONTEXT ta[MAX_MIB_SIZE] = { 0 }; + ENTROPY_CONTEXT tl[MAX_MIB_SIZE] = { 0 }; + struct encode_b_args arg = { cpi, x, NULL, &(xd->mi[0]->skip), + ta, tl, dry_run, enable_optimize_b }; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + if (enable_optimize_b) { + av1_get_entropy_contexts(plane_bsize, pd, ta, tl); + } + av1_foreach_transformed_block_in_plane( + xd, plane_bsize, plane, encode_block_intra_and_set_context, &arg); +} diff --git a/libs/libaom/src/av1/encoder/encodemb.h b/libs/libaom/src/av1/encoder/encodemb.h new file mode 100644 index 000000000..a337c83db --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodemb.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODEMB_H_ +#define AOM_AV1_ENCODER_ENCODEMB_H_ + +#include "config/aom_config.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/txb_common.h" +#include "av1/encoder/block.h" +#include "av1/encoder/tokenize.h" +#ifdef __cplusplus +extern "C" { +#endif + +struct optimize_ctx { + ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE]; + ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE]; +}; + +struct encode_b_args { + const struct AV1_COMP *cpi; + MACROBLOCK *x; + struct optimize_ctx *ctx; + int8_t *skip; + ENTROPY_CONTEXT *ta; + ENTROPY_CONTEXT *tl; + RUN_TYPE dry_run; + TRELLIS_OPT_TYPE enable_optimize_b; +}; + +enum { + AV1_XFORM_QUANT_FP = 0, + AV1_XFORM_QUANT_B = 1, + AV1_XFORM_QUANT_DC = 2, + AV1_XFORM_QUANT_SKIP_QUANT, + AV1_XFORM_QUANT_TYPES, +} UENUM1BYTE(AV1_XFORM_QUANT); + +// Available optimization types to optimize the quantized coefficients. +enum { + NONE_OPT = 0, // No optimization. + TRELLIS_OPT = 1, // Trellis optimization. See `av1_optimize_b()`. + DROPOUT_OPT = 2, // Dropout optimization. See `av1_dropout_qcoeff()`. + TRELLIS_DROPOUT_OPT = 3 // Perform dropout after trellis optimization. 
+} UENUM1BYTE(OPT_TYPE);
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                   RUN_TYPE dry_run);
+
+void av1_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE plane_bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg);
+
+void av1_encode_sby_pass1(struct AV1_COMP *cpi, MACROBLOCK *x,
+                          BLOCK_SIZE bsize);
+
+void av1_setup_xform(const AV1_COMMON *cm, MACROBLOCK *x, TX_SIZE tx_size,
+                     TX_TYPE tx_type, TxfmParam *txfm_param);
+void av1_setup_quant(TX_SIZE tx_size, int use_optimize_b, int xform_quant_idx,
+                     int use_quant_b_adapt, QUANT_PARAM *qparam);
+void av1_setup_qmatrix(const CommonQuantParams *quant_params,
+                       const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+                       TX_TYPE tx_type, QUANT_PARAM *qparam);
+
+void av1_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+                     int blk_col, BLOCK_SIZE plane_bsize, TxfmParam *txfm_param,
+                     QUANT_PARAM *qparam);
+
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+                   int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                   const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost);
+
+// This function can be used as (i) a further optimization to reduce the
+// redundancy of quantized coefficients (a.k.a. `qcoeff`) after trellis
+// optimization, or (ii) an alternative to trellis optimization in high-speed
+// compression mode (e.g., real-time mode under speed-6) due to its low time
+// complexity. The rationale is to drop quantized coefficients that are likely
+// redundant because they sit isolated among runs of zeros. NOTE: This
+// algorithm is not as accurate as trellis optimization, since the
+// hyper-parameters are hard-coded rather than searched dynamically. More
+// adaptive logic may improve the performance. This function can be applied to
+// all or part of the block cells.
+// Inputs:
+//   mb: Pointer to the MACROBLOCK to perform dropout on.
+//   plane: Index of the plane to which the target block belongs.
+//   block: Index of the target block.
+//   tx_size: Transform size of the target block.
+//   tx_type: Transform type of the target block. This field is particularly
+//            used to find out the scan order of the block.
+//   qindex: Quantization index used for the target block. In general, all
+//           blocks in the same plane share the same quantization index. This
+//           field is particularly used to determine how many zeros should be
+//           used to drop out a coefficient.
+// Returns:
+//   Nothing will be returned, but `qcoeff`, `dqcoeff`, `eob`, as well as
+//   `txb_entropy_ctx`, which `mb` points to, may be modified by this function.
+void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, + TX_TYPE tx_type, int qindex); + +void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols, + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src8, ptrdiff_t src_stride, + const uint8_t *pred8, ptrdiff_t pred_stride); + +void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, + int blk_col, int blk_row, TX_SIZE tx_size); + +void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane); + +static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, ENTROPY_CONTEXT *a, + ENTROPY_CONTEXT *l) { + const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block]; + memset(a, ctx, tx_size_wide_unit[tx_size] * sizeof(*a)); + memset(l, ctx, tx_size_high_unit[tx_size] * sizeof(*l)); +} + +void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); + +void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int plane, RUN_TYPE dry_run, + TRELLIS_OPT_TYPE enable_optimize_b); + +static INLINE int is_trellis_used(TRELLIS_OPT_TYPE optimize_b, + RUN_TYPE dry_run) { + if (optimize_b == NO_TRELLIS_OPT) return false; + if (optimize_b == FINAL_PASS_TRELLIS_OPT && dry_run != OUTPUT_ENABLED) + return false; + return true; +} +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEMB_H_ diff --git a/libs/libaom/src/av1/encoder/encodemv.c b/libs/libaom/src/av1/encoder/encodemv.c new file mode 100644 index 000000000..167e9c0a3 --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodemv.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "av1/common/common.h" +#include "av1/common/entropymode.h" + +#include "av1/encoder/cost.h" +#include "av1/encoder/encodemv.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/bitops.h" + +static void update_mv_component_stats(int comp, nmv_component *mvcomp, + MvSubpelPrecision precision) { + assert(comp != 0); + int offset; + const int sign = comp < 0; + const int mag = sign ? -comp : comp; + const int mv_class = av1_get_mv_class(mag - 1, &offset); + const int d = offset >> 3; // int mv data + const int fr = (offset >> 1) & 3; // fractional mv data + const int hp = offset & 1; // high precision mv data + + // Sign + update_cdf(mvcomp->sign_cdf, sign, 2); + + // Class + update_cdf(mvcomp->classes_cdf, mv_class, MV_CLASSES); + + // Integer bits + if (mv_class == MV_CLASS_0) { + update_cdf(mvcomp->class0_cdf, d, CLASS0_SIZE); + } else { + const int n = mv_class + CLASS0_BITS - 1; // number of bits + for (int i = 0; i < n; ++i) + update_cdf(mvcomp->bits_cdf[i], (d >> i) & 1, 2); + } + // Fractional bits + if (precision > MV_SUBPEL_NONE) { + aom_cdf_prob *fp_cdf = + mv_class == MV_CLASS_0 ? 
mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf; + update_cdf(fp_cdf, fr, MV_FP_SIZE); + } + + // High precision bit + if (precision > MV_SUBPEL_LOW_PRECISION) { + aom_cdf_prob *hp_cdf = + mv_class == MV_CLASS_0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf; + update_cdf(hp_cdf, hp, 2); + } +} + +void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx, + MvSubpelPrecision precision) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); + + update_cdf(mvctx->joints_cdf, j, MV_JOINTS); + + if (mv_joint_vertical(j)) + update_mv_component_stats(diff.row, &mvctx->comps[0], precision); + + if (mv_joint_horizontal(j)) + update_mv_component_stats(diff.col, &mvctx->comps[1], precision); +} + +static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp, + MvSubpelPrecision precision) { + assert(comp != 0); + int offset; + const int sign = comp < 0; + const int mag = sign ? -comp : comp; + const int mv_class = av1_get_mv_class(mag - 1, &offset); + const int d = offset >> 3; // int mv data + const int fr = (offset >> 1) & 3; // fractional mv data + const int hp = offset & 1; // high precision mv data + + // Sign + aom_write_symbol(w, sign, mvcomp->sign_cdf, 2); + + // Class + aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES); + + // Integer bits + if (mv_class == MV_CLASS_0) { + aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE); + } else { + int i; + const int n = mv_class + CLASS0_BITS - 1; // number of bits + for (i = 0; i < n; ++i) + aom_write_symbol(w, (d >> i) & 1, mvcomp->bits_cdf[i], 2); + } + // Fractional bits + if (precision > MV_SUBPEL_NONE) { + aom_write_symbol( + w, fr, + mv_class == MV_CLASS_0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf, + MV_FP_SIZE); + } + + // High precision bit + if (precision > MV_SUBPEL_LOW_PRECISION) + aom_write_symbol( + w, hp, mv_class == MV_CLASS_0 ? 
mvcomp->class0_hp_cdf : mvcomp->hp_cdf, + 2); +} + +static void build_nmv_component_cost_table(int *mvcost, + const nmv_component *const mvcomp, + MvSubpelPrecision precision) { + int i, v; + int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE]; + int bits_cost[MV_OFFSET_BITS][2]; + int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE]; + int class0_hp_cost[2], hp_cost[2]; + + av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL); + av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL); + av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL); + for (i = 0; i < MV_OFFSET_BITS; ++i) { + av1_cost_tokens_from_cdf(bits_cost[i], mvcomp->bits_cdf[i], NULL); + } + + for (i = 0; i < CLASS0_SIZE; ++i) + av1_cost_tokens_from_cdf(class0_fp_cost[i], mvcomp->class0_fp_cdf[i], NULL); + av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL); + + if (precision > MV_SUBPEL_LOW_PRECISION) { + av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL); + av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL); + } + mvcost[0] = 0; + for (v = 1; v <= MV_MAX; ++v) { + int z, c, o, d, e, f, cost = 0; + z = v - 1; + c = av1_get_mv_class(z, &o); + cost += class_cost[c]; + d = (o >> 3); /* int mv data */ + f = (o >> 1) & 3; /* fractional pel mv data */ + e = (o & 1); /* high precision mv data */ + if (c == MV_CLASS_0) { + cost += class0_cost[d]; + } else { + const int b = c + CLASS0_BITS - 1; /* number of bits */ + for (i = 0; i < b; ++i) cost += bits_cost[i][((d >> i) & 1)]; + } + if (precision > MV_SUBPEL_NONE) { + if (c == MV_CLASS_0) { + cost += class0_fp_cost[d][f]; + } else { + cost += fp_cost[f]; + } + if (precision > MV_SUBPEL_LOW_PRECISION) { + if (c == MV_CLASS_0) { + cost += class0_hp_cost[e]; + } else { + cost += hp_cost[e]; + } + } + } + mvcost[v] = cost + sign_cost[0]; + mvcost[-v] = cost + sign_cost[1]; + } +} + +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx, int usehp) { + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); + // If the mv_diff is zero, then we should have used near or nearest instead. + assert(j != MV_JOINT_ZERO); + if (cpi->common.features.cur_frame_force_integer_mv) { + usehp = MV_SUBPEL_NONE; + } + aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); + if (mv_joint_vertical(j)) + encode_mv_component(w, diff.row, &mvctx->comps[0], usehp); + + if (mv_joint_horizontal(j)) + encode_mv_component(w, diff.col, &mvctx->comps[1], usehp); + + // If auto_mv_step_size is enabled then keep track of the largest + // motion vector component used. + if (cpi->sf.mv_sf.auto_mv_step_size) { + int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3; + cpi->mv_search_params.max_mv_magnitude = + AOMMAX(maxv, cpi->mv_search_params.max_mv_magnitude); + } +} + +void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx) { + // DV and ref DV should not have sub-pel. 
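
The component coding above (encode_mv_component() and the cost-table builder) shares one magnitude decomposition: a signed component in 1/8-pel units is split into a sign, a class, and an offset whose bit 0 is the 1/8-pel bit, bits 1-2 the fractional bits, and bits 3+ the integer part. A standalone round-trip sketch of that decomposition follows; the toy_* helpers are invented, and the constants assume CLASS0_SIZE == 2 as in AV1 (the class-10 cap is omitted for brevity).

#include <stdio.h>

/* floor(log2(n)) for n > 0. */
static int toy_msb(unsigned int n) {
  int b = 0;
  while (n >>= 1) ++b;
  return b;
}

/* First magnitude-minus-one value covered by class c. */
static int toy_class_base(int c) { return c ? (2 << (c + 2)) : 0; }

int main(void) {
  const int comp = -37; /* one MV-difference component, 1/8-pel units */
  const int sign = comp < 0;
  const int mag = sign ? -comp : comp;
  const int z = mag - 1; /* magnitude 0 is signalled by the joint type */
  const int c = (z >> 3) ? toy_msb(z >> 3) : 0; /* MV class */
  const int offset = z - toy_class_base(c);
  printf("sign=%d class=%d int=%d frac=%d hp=%d\n", sign, c, offset >> 3,
         (offset >> 1) & 3, offset & 1);
  /* Reassemble to show the decomposition is lossless. */
  const int back = toy_class_base(c) + offset + 1;
  printf("reassembled: %d\n", sign ? -back : back);
  return 0;
}
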
+ assert((mv->col & 7) == 0); + assert((mv->row & 7) == 0); + assert((ref->col & 7) == 0); + assert((ref->row & 7) == 0); + const MV diff = { mv->row - ref->row, mv->col - ref->col }; + const MV_JOINT_TYPE j = av1_get_mv_joint(&diff); + + aom_write_symbol(w, j, mvctx->joints_cdf, MV_JOINTS); + if (mv_joint_vertical(j)) + encode_mv_component(w, diff.row, &mvctx->comps[0], MV_SUBPEL_NONE); + + if (mv_joint_horizontal(j)) + encode_mv_component(w, diff.col, &mvctx->comps[1], MV_SUBPEL_NONE); +} + +void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context *ctx, + MvSubpelPrecision precision) { + av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL); + build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], precision); + build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], precision); +} + +int_mv av1_get_ref_mv_from_stack(int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext) { + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const CANDIDATE_MV *curr_ref_mv_stack = + mbmi_ext->ref_mv_stack[ref_frame_type]; + + if (ref_frame[1] > INTRA_FRAME) { + assert(ref_idx == 0 || ref_idx == 1); + return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv + : curr_ref_mv_stack[ref_mv_idx].this_mv; + } + + assert(ref_idx == 0); + return ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type] + ? curr_ref_mv_stack[ref_mv_idx].this_mv + : mbmi_ext->global_mvs[ref_frame_type]; +} + +int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) { + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; + } + return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, ref_mv_idx, + x->mbmi_ext); +} + +void av1_find_best_ref_mvs_from_stack(int allow_hp, + const MB_MODE_INFO_EXT *mbmi_ext, + MV_REFERENCE_FRAME ref_frame, + int_mv *nearest_mv, int_mv *near_mv, + int is_integer) { + const int ref_idx = 0; + MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; + *nearest_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 0, mbmi_ext); + lower_mv_precision(&nearest_mv->as_mv, allow_hp, is_integer); + *near_mv = av1_get_ref_mv_from_stack(ref_idx, ref_frames, 1, mbmi_ext); + lower_mv_precision(&near_mv->as_mv, allow_hp, is_integer); +} diff --git a/libs/libaom/src/av1/encoder/encodemv.h b/libs/libaom/src/av1/encoder/encodemv.h new file mode 100644 index 000000000..0d130143e --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodemv.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_ENCODEMV_H_ +#define AOM_AV1_ENCODER_ENCODEMV_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx, int usehp); + +void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx, + MvSubpelPrecision precision); + +void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2], + const nmv_context *mvctx, + MvSubpelPrecision precision); + +void av1_update_mv_count(ThreadData *td); + +void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref, + nmv_context *mvctx); +int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx); +int_mv av1_get_ref_mv_from_stack(int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + int ref_mv_idx, + const MB_MODE_INFO_EXT *mbmi_ext); +void av1_find_best_ref_mvs_from_stack(int allow_hp, + const MB_MODE_INFO_EXT *mbmi_ext, + MV_REFERENCE_FRAME ref_frame, + int_mv *nearest_mv, int_mv *near_mv, + int is_integer); + +static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) { + // row: Z col: Z | MV_JOINT_ZERO (0) + // row: Z col: NZ | MV_JOINT_HNZVZ (1) + // row: NZ col: Z | MV_JOINT_HZVNZ (2) + // row: NZ col: NZ | MV_JOINT_HNZVNZ (3) + return (!!mv->col) | ((!!mv->row) << 1); +} + +static INLINE int av1_mv_class_base(MV_CLASS_TYPE c) { + return c ? CLASS0_SIZE << (c + 2) : 0; +} + +// If n != 0, returns the floor of log base 2 of n. If n == 0, returns 0. +static INLINE uint8_t av1_log_in_base_2(unsigned int n) { + // get_msb() is only valid when n != 0. + return n == 0 ? 0 : get_msb(n); +} + +static INLINE MV_CLASS_TYPE av1_get_mv_class(int z, int *offset) { + const MV_CLASS_TYPE c = (z >= CLASS0_SIZE * 4096) + ? MV_CLASS_10 + : (MV_CLASS_TYPE)av1_log_in_base_2(z >> 3); + if (offset) *offset = z - av1_mv_class_base(c); + return c; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODEMV_H_ diff --git a/libs/libaom/src/av1/encoder/encoder.c b/libs/libaom/src/av1/encoder/encoder.c new file mode 100644 index 000000000..6406afd4a --- /dev/null +++ b/libs/libaom/src/av1/encoder/encoder.c @@ -0,0 +1,7187 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#if CONFIG_DENOISE +#include "aom_dsp/grain_table.h" +#include "aom_dsp/noise_util.h" +#include "aom_dsp/noise_model.h" +#endif +#include "aom_dsp/psnr.h" +#if CONFIG_INTERNAL_STATS +#include "aom_dsp/ssim.h" +#endif +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" +#include "aom_scale/aom_scale.h" +#if CONFIG_BITSTREAM_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#include "av1/common/alloccommon.h" +#include "av1/common/cdef.h" +#include "av1/common/filter.h" +#include "av1/common/idct.h" +#include "av1/common/reconinter.h" +#include "av1/common/reconintra.h" +#include "av1/common/resize.h" +#include "av1/common/tile_common.h" + +#include "av1/encoder/av1_multi_thread.h" +#include "av1/encoder/aq_complexity.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/context_tree.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/grain_test_vectors.h" +#include "av1/encoder/hash_motion.h" +#include "av1/encoder/mv_prec.h" +#include "av1/encoder/pass2_strategy.h" +#include "av1/encoder/picklpf.h" +#include "av1/encoder/pickrst.h" +#include "av1/encoder/random.h" +#include "av1/encoder/ratectrl.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/segmentation.h" +#include "av1/encoder/speed_features.h" +#include "av1/encoder/tpl_model.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/var_based_part.h" + +#if CONFIG_TUNE_VMAF +#include "av1/encoder/tune_vmaf.h" +#endif + +#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7 + +#if CONFIG_ENTROPY_STATS +FRAME_COUNTS aggregate_fc; +#endif // CONFIG_ENTROPY_STATS + +#define AM_SEGMENT_ID_INACTIVE 7 +#define AM_SEGMENT_ID_ACTIVE 0 + +// #define OUTPUT_YUV_REC +#ifdef OUTPUT_YUV_SKINMAP +FILE *yuv_skinmap_file = NULL; +#endif +#ifdef OUTPUT_YUV_REC +FILE *yuv_rec_file; +#define FILE_NAME_LEN 100 +#endif + +const int default_tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES] = { + { { 221, 189, 214, 292, 0, 0, 0, 0, 0, 2, 38, 68, 0, 0, 0, 0 }, + { 262, 203, 216, 239, 0, 0, 0, 0, 0, 1, 37, 66, 0, 0, 0, 0 }, + { 315, 231, 239, 226, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 222, 188, 214, 287, 0, 0, 0, 0, 0, 2, 50, 61, 0, 0, 0, 0 }, + { 256, 182, 205, 282, 0, 0, 0, 0, 0, 2, 21, 76, 0, 0, 0, 0 }, + { 281, 214, 217, 222, 0, 0, 0, 0, 0, 1, 48, 41, 0, 0, 0, 0 }, + { 263, 194, 225, 225, 0, 0, 0, 0, 0, 2, 15, 100, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 170, 192, 242, 293, 0, 0, 0, 0, 0, 1, 68, 58, 0, 0, 0, 0 }, + { 199, 210, 213, 291, 0, 0, 0, 0, 0, 1, 14, 96, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 106, 69, 107, 278, 9, 15, 20, 45, 49, 23, 23, 88, 36, 74, 25, 57 }, + { 105, 72, 81, 98, 45, 49, 47, 50, 56, 72, 30, 81, 33, 95, 27, 83 }, + { 211, 105, 109, 120, 57, 62, 43, 49, 52, 58, 42, 116, 0, 0, 0, 0 }, + { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 131, 57, 98, 172, 19, 40, 37, 64, 69, 22, 41, 52, 51, 77, 35, 59 }, + { 176, 83, 93, 202, 22, 24, 28, 47, 50, 16, 12, 93, 26, 76, 17, 59 }, + { 136, 72, 89, 95, 46, 59, 47, 56, 61, 68, 35, 51, 32, 82, 26, 69 }, + { 122, 80, 87, 105, 49, 47, 46, 46, 57, 52, 13, 90, 19, 103, 15, 93 }, + { 1009, 0, 0, 0, 0, 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0 }, + { 1011, 0, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 202, 20, 84, 114, 14, 60, 41, 79, 99, 21, 41, 15, 50, 84, 34, 66 }, + { 196, 44, 23, 72, 30, 22, 28, 57, 67, 13, 4, 165, 15, 148, 9, 131 }, + { 882, 0, 0, 0, 0, 0, 0, 0, 0, 142, 0, 0, 0, 0, 0, 0 }, + { 840, 0, 0, 0, 0, 0, 0, 0, 0, 184, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } }, + { { 213, 110, 141, 269, 12, 16, 15, 19, 21, 11, 38, 68, 22, 29, 16, 24 }, + { 216, 119, 128, 143, 38, 41, 26, 30, 31, 30, 42, 70, 23, 36, 19, 32 }, + { 367, 149, 154, 154, 38, 35, 17, 21, 21, 10, 22, 36, 0, 0, 0, 0 }, + { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 219, 96, 127, 191, 21, 40, 25, 32, 34, 18, 45, 45, 33, 39, 26, 33 }, + { 296, 99, 122, 198, 23, 21, 19, 24, 25, 13, 20, 64, 23, 32, 18, 27 }, + { 275, 128, 142, 143, 35, 48, 23, 30, 29, 18, 42, 36, 18, 23, 14, 20 }, + { 239, 132, 166, 175, 36, 27, 19, 21, 24, 14, 13, 85, 9, 31, 8, 25 }, + { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, + { 1022, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 
}, + { 309, 25, 79, 59, 25, 80, 34, 53, 61, 25, 49, 23, 43, 64, 36, 59 }, + { 270, 57, 40, 54, 50, 42, 41, 53, 56, 28, 17, 81, 45, 86, 34, 70 }, + { 1005, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, 0, 0, 0, 0, 0 }, + { 992, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 133, 63, 55, 83, 57, 87, 58, 72, 68, 16, 24, 35, 29, 105, 25, 114 }, + { 131, 75, 74, 60, 71, 77, 65, 66, 73, 33, 21, 79, 20, 83, 18, 78 }, + { 276, 95, 82, 58, 86, 93, 63, 60, 64, 17, 38, 92, 0, 0, 0, 0 }, + { 1006, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 147, 49, 75, 78, 50, 97, 60, 67, 76, 17, 42, 35, 31, 93, 27, 80 }, + { 157, 49, 58, 75, 61, 52, 56, 67, 69, 12, 15, 79, 24, 119, 11, 120 }, + { 178, 69, 83, 77, 69, 85, 72, 77, 77, 20, 35, 40, 25, 48, 23, 46 }, + { 174, 55, 64, 57, 73, 68, 62, 61, 75, 15, 12, 90, 17, 99, 16, 86 }, + { 1008, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0 }, + { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 266, 31, 63, 64, 21, 52, 39, 54, 63, 30, 52, 31, 48, 89, 46, 75 }, + { 272, 26, 32, 44, 29, 31, 32, 53, 51, 13, 13, 88, 22, 153, 16, 149 }, + { 923, 0, 0, 0, 0, 0, 0, 0, 0, 101, 0, 0, 0, 0, 0, 0 }, + { 969, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }, + { { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 } }, + { { 158, 92, 125, 298, 12, 15, 20, 29, 31, 12, 29, 67, 34, 44, 23, 35 }, + { 147, 94, 103, 123, 45, 48, 38, 41, 46, 48, 37, 78, 33, 63, 27, 53 }, + { 268, 126, 125, 136, 54, 53, 31, 38, 38, 33, 35, 87, 0, 0, 0, 0 }, + { 1018, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 159, 72, 103, 194, 20, 35, 37, 50, 56, 21, 39, 40, 51, 61, 38, 48 }, + { 259, 86, 95, 188, 32, 20, 25, 34, 37, 13, 12, 85, 25, 53, 17, 43 }, + { 189, 99, 113, 123, 45, 59, 37, 46, 48, 44, 39, 41, 31, 47, 26, 37 }, + { 175, 110, 113, 128, 58, 38, 33, 33, 43, 29, 13, 100, 14, 68, 12, 57 }, + 
{ 1017, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0 }, + { 1019, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 208, 22, 84, 101, 21, 59, 44, 70, 90, 25, 59, 13, 64, 67, 49, 48 }, + { 277, 52, 32, 63, 43, 26, 33, 48, 54, 11, 6, 130, 18, 119, 11, 101 }, + { 963, 0, 0, 0, 0, 0, 0, 0, 0, 61, 0, 0, 0, 0, 0, 0 }, + { 979, 0, 0, 0, 0, 0, 0, 0, 0, 45, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 1024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } } +}; + +const int default_obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 106, 90, 90, 97, 67, 59, 70, 28, + 30, 38, 16, 16, 16, 0, 0, 44, 50, 26, 25 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 98, 93, 97, 68, 82, 85, 33, 30, + 33, 16, 16, 16, 16, 0, 0, 43, 37, 26, 16 }, + { 0, 0, 0, 91, 80, 76, 78, 55, 49, 24, 16, + 16, 16, 16, 16, 16, 0, 0, 29, 45, 16, 38 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 103, 89, 89, 89, 62, 63, 76, 34, + 35, 32, 19, 16, 16, 0, 0, 49, 55, 29, 19 } +}; + +const int default_warped_probs[FRAME_UPDATE_TYPES] = { 64, 64, 64, 64, + 64, 64, 64 }; + +// TODO(yunqing): the default probs can be trained later from better +// performance. +const int default_switchable_interp_probs[FRAME_UPDATE_TYPES] + [SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS] = { + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 
}, + { 512, 512, 512 }, + { 512, 512, 512 } }, + { { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 }, + { 512, 512, 512 } } + }; + +static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) { + switch (mode) { + case NORMAL: + *hr = 1; + *hs = 1; + break; + case FOURFIVE: + *hr = 4; + *hs = 5; + break; + case THREEFIVE: + *hr = 3; + *hs = 5; + break; + case ONETWO: + *hr = 1; + *hs = 2; + break; + default: + *hr = 1; + *hs = 1; + assert(0); + break; + } +} + +// Mark all inactive blocks as active. Other segmentation features may be set +// so memset cannot be used, instead only inactive blocks should be reset. +static void suppress_active_map(AV1_COMP *cpi) { + unsigned char *const seg_map = cpi->enc_seg.map; + int i; + if (cpi->active_map.enabled || cpi->active_map.update) + for (i = 0; + i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; ++i) + if (seg_map[i] == AM_SEGMENT_ID_INACTIVE) + seg_map[i] = AM_SEGMENT_ID_ACTIVE; +} + +static void apply_active_map(AV1_COMP *cpi) { + struct segmentation *const seg = &cpi->common.seg; + unsigned char *const seg_map = cpi->enc_seg.map; + const unsigned char *const active_map = cpi->active_map.map; + int i; + + assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE); + + if (frame_is_intra_only(&cpi->common)) { + cpi->active_map.enabled = 0; + cpi->active_map.update = 1; + } + + if (cpi->active_map.update) { + if (cpi->active_map.enabled) { + for (i = 0; + i < cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; + ++i) + if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i]; + av1_enable_segmentation(seg); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); + av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V); + + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H, + -MAX_LOOP_FILTER); + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V, + -MAX_LOOP_FILTER); + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U, + -MAX_LOOP_FILTER); + av1_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V, + -MAX_LOOP_FILTER); + } else { + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_V); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_U); + av1_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_V); + if (seg->enabled) { + seg->update_data = 1; + seg->update_map = 1; + } + } + cpi->active_map.update = 0; + } +} + +int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols) { + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + if (rows == mi_params->mb_rows && cols == mi_params->mb_cols) { + unsigned char *const active_map_8x8 = cpi->active_map.map; + const int mi_rows = mi_params->mi_rows; + const int mi_cols = mi_params->mi_cols; + const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 
1 : 2;
+    const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
+    cpi->active_map.update = 1;
+    if (new_map_16x16) {
+      int r, c;
+      for (r = 0; r < mi_rows; ++r) {
+        for (c = 0; c < mi_cols; ++c) {
+          active_map_8x8[r * mi_cols + c] =
+              new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)]
+                  ? AM_SEGMENT_ID_ACTIVE
+                  : AM_SEGMENT_ID_INACTIVE;
+        }
+      }
+      cpi->active_map.enabled = 1;
+    } else {
+      cpi->active_map.enabled = 0;
+    }
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols) {
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  if (rows == mi_params->mb_rows && cols == mi_params->mb_cols &&
+      new_map_16x16) {
+    unsigned char *const seg_map_8x8 = cpi->enc_seg.map;
+    const int mi_rows = mi_params->mi_rows;
+    const int mi_cols = mi_params->mi_cols;
+    const int row_scale = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
+    const int col_scale = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
+
+    memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+    if (cpi->active_map.enabled) {
+      int r, c;
+      for (r = 0; r < mi_rows; ++r) {
+        for (c = 0; c < mi_cols; ++c) {
+          // Cyclic refresh segments are considered active despite not having
+          // AM_SEGMENT_ID_ACTIVE
+          new_map_16x16[(r >> row_scale) * cols + (c >> col_scale)] |=
+              seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE;
+        }
+      }
+    }
+    return 0;
+  } else {
+    return -1;
+  }
+}
+
+// Compute the energy of the horizontal frequency components in a frame
+// by calculating the 16x4 horizontal DCT. This is used to decide the
+// superresolution parameters.
+static void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
+  uint64_t freq_energy[16] = { 0 };
+  const YV12_BUFFER_CONFIG *buf = cpi->source;
+  const int bd = cpi->td.mb.e_mbd.bd;
+  const int width = buf->y_crop_width;
+  const int height = buf->y_crop_height;
+  DECLARE_ALIGNED(16, int32_t, coeff[16 * 4]);
+  int n = 0;
+  memset(freq_energy, 0, sizeof(freq_energy));
+  if (buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const int16_t *src16 = (const int16_t *)CONVERT_TO_SHORTPTR(buf->y_buffer);
+    for (int i = 0; i < height - 4; i += 4) {
+      for (int j = 0; j < width - 16; j += 16) {
+        av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride,
+                            H_DCT, bd);
+        for (int k = 1; k < 16; ++k) {
+          const uint64_t this_energy =
+              ((int64_t)coeff[k] * coeff[k]) +
+              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+              ((int64_t)coeff[k + 48] * coeff[k + 48]);
+          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
+        }
+        n++;
+      }
+    }
+  } else {
+    assert(bd == 8);
+    DECLARE_ALIGNED(16, int16_t, src16[16 * 4]);
+    for (int i = 0; i < height - 4; i += 4) {
+      for (int j = 0; j < width - 16; j += 16) {
+        for (int ii = 0; ii < 4; ++ii)
+          for (int jj = 0; jj < 16; ++jj)
+            src16[ii * 16 + jj] =
+                buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)];
+        av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd);
+        for (int k = 1; k < 16; ++k) {
+          const uint64_t this_energy =
+              ((int64_t)coeff[k] * coeff[k]) +
+              ((int64_t)coeff[k + 16] * coeff[k + 16]) +
+              ((int64_t)coeff[k + 32] * coeff[k + 32]) +
+              ((int64_t)coeff[k + 48] * coeff[k + 48]);
+          freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2);
+        }
+        n++;
+      }
+    }
+  }
+  if (n) {
+    for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n;
+    // Convert to cumulative energy
+    for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
+  } else {
+    for (int k = 1; k < 16; ++k) energy[k] = 1e+20;
+  }
+}
+
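analyze_hor_freq() finishes by converting per-frequency averages into cumulative tail sums: after the backward loop, energy[k] holds the total energy at horizontal frequency k and above, which the superres logic can test against a threshold. A standalone sketch of that accumulation step, with made-up energy values rather than real encoder output:

#include <stdio.h>

int main(void) {
  /* energy[k]: mean energy of horizontal frequency k (toy numbers). */
  double energy[16] = { 0,   9.0, 7.5, 6.0, 4.0, 3.0, 2.0, 1.5,
                        1.0, 0.8, 0.6, 0.4, 0.3, 0.2, 0.1, 0.05 };
  /* Same backward loop as above: energy[k] becomes the sum over >= k. */
  for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
  /* Little energy in the high tail suggests the frame can be downscaled
     (superres) without losing much detail. */
  for (int k = 1; k < 16; ++k) printf("E[>=%2d] = %.2f\n", k, energy[k]);
  return 0;
}
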
+static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + + if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64) + return BLOCK_64X64; + if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128) + return BLOCK_128X128; + + assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC); + + if (cpi->svc.number_spatial_layers > 1) { + // Use the configured size (top resolution) for spatial layers. + return AOMMIN(cpi->oxcf.width, cpi->oxcf.height) > 480 ? BLOCK_128X128 + : BLOCK_64X64; + } + + // TODO(any): Possibly could improve this with a heuristic. + // When superres / resize is on, 'cm->width / height' can change between + // calls, so we don't apply this heuristic there. + // Things break if superblock size changes between the first pass and second + // pass encoding, which is why this heuristic is not configured as a + // speed-feature. + if (cpi->oxcf.superres_mode == SUPERRES_NONE && + cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 1) { + return AOMMIN(cm->width, cm->height) > 480 ? BLOCK_128X128 : BLOCK_64X64; + } + + return BLOCK_128X128; +} + +static void setup_frame(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + // Set up entropy context depending on frame type. The decoder mandates + // the use of the default context, index 0, for keyframes and inter + // frames where the error_resilient_mode or intra_only flag is set. For + // other inter-frames the encoder currently uses only two contexts; + // context 1 for ALTREF frames and context 0 for the others. + + if (frame_is_intra_only(cm) || cm->features.error_resilient_mode || + cpi->ext_flags.use_primary_ref_none) { + av1_setup_past_independence(cm); + } + + if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) || + frame_is_sframe(cm)) { + if (!cpi->seq_params_locked) { + set_sb_size(&cm->seq_params, select_sb_size(cpi)); + } + } else { + const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm); + if (primary_ref_buf == NULL) { + av1_setup_past_independence(cm); + cm->seg.update_map = 1; + cm->seg.update_data = 1; + } else { + *cm->fc = primary_ref_buf->frame_context; + } + } + + av1_zero(cm->cur_frame->interp_filter_selected); + cm->prev_frame = get_primary_ref_frame_buf(cm); + cpi->vaq_refresh = 0; +} + +static void set_mb_mi(CommonModeInfoParams *mi_params, int width, int height) { + // Ensure that the decoded width and height are both multiples of + // 8 luma pixels (note: this may only be a multiple of 4 chroma pixels if + // subsampling is used). + // This simplifies the implementation of various experiments, + // eg. cdef, which operates on units of 8x8 luma pixels. 
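
The alignment performed just below rounds the coded dimensions up to multiples of 8 luma pixels and then converts them to mode-info units of 4x4 pixels (MI_SIZE_LOG2 == 2). A standalone sketch of that arithmetic; the macro body is reproduced as assumed from aom_dsp/aom_dsp_common.h, so verify against the tree:

#include <stdio.h>

/* Round `value` up to a multiple of 2^n (assumed aom definition). */
#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

int main(void) {
  const int width = 1918, height = 817; /* odd-ball frame size */
  const int aligned_w = ALIGN_POWER_OF_TWO(width, 3);  /* -> 1920 */
  const int aligned_h = ALIGN_POWER_OF_TWO(height, 3); /* -> 824 */
  /* One mode-info unit covers 4x4 luma pixels. */
  printf("mi_cols=%d mi_rows=%d\n", aligned_w >> 2, aligned_h >> 2);
  return 0;
}
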
+ const int aligned_width = ALIGN_POWER_OF_TWO(width, 3); + const int aligned_height = ALIGN_POWER_OF_TWO(height, 3); + + mi_params->mi_cols = aligned_width >> MI_SIZE_LOG2; + mi_params->mi_rows = aligned_height >> MI_SIZE_LOG2; + mi_params->mi_stride = calc_mi_size(mi_params->mi_cols); + + mi_params->mb_cols = (mi_params->mi_cols + 2) >> 2; + mi_params->mb_rows = (mi_params->mi_rows + 2) >> 2; + mi_params->MBs = mi_params->mb_rows * mi_params->mb_cols; + + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + mi_params->mi_alloc_stride = + (mi_params->mi_stride + mi_alloc_size_1d - 1) / mi_alloc_size_1d; + + assert(mi_size_wide[mi_params->mi_alloc_bsize] == + mi_size_high[mi_params->mi_alloc_bsize]); + +#if CONFIG_LPF_MASK + av1_alloc_loop_filter_mask(mi_params); +#endif +} + +static void enc_set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height) { + const int is_4k_or_larger = AOMMIN(width, height) >= 2160; + mi_params->mi_alloc_bsize = is_4k_or_larger ? BLOCK_8X8 : BLOCK_4X4; + + set_mb_mi(mi_params, width, height); +} + +static void stat_stage_set_mb_mi(CommonModeInfoParams *mi_params, int width, + int height) { + mi_params->mi_alloc_bsize = BLOCK_16X16; + + set_mb_mi(mi_params, width, height); +} + +static void enc_setup_mi(CommonModeInfoParams *mi_params) { + const int mi_grid_size = + mi_params->mi_stride * calc_mi_size(mi_params->mi_rows); + memset(mi_params->mi_alloc, 0, + mi_params->mi_alloc_size * sizeof(*mi_params->mi_alloc)); + memset(mi_params->mi_grid_base, 0, + mi_grid_size * sizeof(*mi_params->mi_grid_base)); + memset(mi_params->tx_type_map, 0, + mi_grid_size * sizeof(*mi_params->tx_type_map)); +} + +static void enc_free_mi(CommonModeInfoParams *mi_params) { + aom_free(mi_params->mi_alloc); + mi_params->mi_alloc = NULL; + aom_free(mi_params->mi_grid_base); + mi_params->mi_grid_base = NULL; + mi_params->mi_alloc_size = 0; + aom_free(mi_params->tx_type_map); + mi_params->tx_type_map = NULL; +} + +void av1_initialize_enc(void) { + av1_rtcd(); + aom_dsp_rtcd(); + aom_scale_rtcd(); + av1_init_intra_predictors(); + av1_init_me_luts(); + av1_rc_init_minq_luts(); + av1_init_wedge_masks(); +} + +static void dealloc_context_buffers_ext(MBMIExtFrameBufferInfo *mbmi_ext_info) { + if (mbmi_ext_info->frame_base) { + aom_free(mbmi_ext_info->frame_base); + mbmi_ext_info->frame_base = NULL; + mbmi_ext_info->alloc_size = 0; + } +} + +static void alloc_context_buffers_ext(AV1_COMMON *cm, + MBMIExtFrameBufferInfo *mbmi_ext_info) { + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + const int mi_alloc_size_1d = mi_size_wide[mi_params->mi_alloc_bsize]; + const int mi_alloc_rows = + (mi_params->mi_rows + mi_alloc_size_1d - 1) / mi_alloc_size_1d; + const int mi_alloc_cols = + (mi_params->mi_cols + mi_alloc_size_1d - 1) / mi_alloc_size_1d; + const int new_ext_mi_size = mi_alloc_rows * mi_alloc_cols; + + if (new_ext_mi_size > mbmi_ext_info->alloc_size) { + dealloc_context_buffers_ext(mbmi_ext_info); + CHECK_MEM_ERROR( + cm, mbmi_ext_info->frame_base, + aom_calloc(new_ext_mi_size, sizeof(*mbmi_ext_info->frame_base))); + mbmi_ext_info->alloc_size = new_ext_mi_size; + } + // The stride needs to be updated regardless of whether new allocation + // happened or not. 
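
alloc_context_buffers_ext() above follows a grow-only pattern: it reallocates only when the required element count exceeds the recorded capacity, while the stride assignment that follows runs on every call because a frame can change shape without growing. A standalone sketch of the same pattern using plain calloc/free (the ext_buf type and names are invented, not libaom APIs):

#include <stdlib.h>

typedef struct {
  int *base;
  int alloc_size; /* capacity, in elements */
  int stride;     /* elements per row for the current frame shape */
} ext_buf;

static int ext_buf_resize(ext_buf *buf, int rows, int cols) {
  const int needed = rows * cols;
  if (needed > buf->alloc_size) {
    free(buf->base);
    buf->base = calloc(needed, sizeof(*buf->base));
    if (!buf->base) {
      buf->alloc_size = 0;
      return -1;
    }
    buf->alloc_size = needed;
  }
  buf->stride = cols; /* refreshed even when no reallocation happened */
  return 0;
}

int main(void) {
  ext_buf buf = { 0, 0, 0 };
  ext_buf_resize(&buf, 30, 40); /* allocates 1200 elements */
  ext_buf_resize(&buf, 40, 30); /* same area: no realloc, stride changes */
  free(buf.base);
  return 0;
}
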
+  mbmi_ext_info->stride = mi_alloc_cols;
+}
+
+static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
+  pars->num_cr_points = 0;
+  pars->cr_mult = 0;
+  pars->cr_luma_mult = 0;
+  memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr));
+  memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr));
+  pars->num_cb_points = 0;
+  pars->cb_mult = 0;
+  pars->cb_luma_mult = 0;
+  pars->chroma_scaling_from_luma = 0;
+  memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb));
+  memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb));
+}
+
+static void update_film_grain_parameters(struct AV1_COMP *cpi,
+                                         const AV1EncoderConfig *oxcf) {
+  AV1_COMMON *const cm = &cpi->common;
+  cpi->oxcf = *oxcf;
+
+  if (cpi->film_grain_table) {
+    aom_film_grain_table_free(cpi->film_grain_table);
+    aom_free(cpi->film_grain_table);
+    cpi->film_grain_table = NULL;
+  }
+
+  if (oxcf->film_grain_test_vector) {
+    cm->seq_params.film_grain_params_present = 1;
+    if (cm->current_frame.frame_type == KEY_FRAME) {
+      memcpy(&cm->film_grain_params,
+             film_grain_test_vectors + oxcf->film_grain_test_vector - 1,
+             sizeof(cm->film_grain_params));
+      if (oxcf->monochrome)
+        reset_film_grain_chroma_params(&cm->film_grain_params);
+      cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
+      if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) {
+        cm->film_grain_params.clip_to_restricted_range = 0;
+      }
+    }
+  } else if (oxcf->film_grain_table_filename) {
+    cm->seq_params.film_grain_params_present = 1;
+
+    cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+    memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t));
+
+    aom_film_grain_table_read(cpi->film_grain_table,
+                              oxcf->film_grain_table_filename, &cm->error);
+  } else {
+#if CONFIG_DENOISE
+    cm->seq_params.film_grain_params_present = (cpi->oxcf.noise_level > 0);
+#else
+    cm->seq_params.film_grain_params_present = 0;
+#endif
+    memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
+  }
+}
+
+static void dealloc_compressor_data(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
+  dealloc_context_buffers_ext(&cpi->mbmi_ext_info);
+
+  aom_free(cpi->tile_data);
+  cpi->tile_data = NULL;
+
+  // Delete segmentation map
+  aom_free(cpi->enc_seg.map);
+  cpi->enc_seg.map = NULL;
+
+  av1_cyclic_refresh_free(cpi->cyclic_refresh);
+  cpi->cyclic_refresh = NULL;
+
+  aom_free(cpi->active_map.map);
+  cpi->active_map.map = NULL;
+
+  aom_free(cpi->ssim_rdmult_scaling_factors);
+  cpi->ssim_rdmult_scaling_factors = NULL;
+
+  aom_free(cpi->tpl_rdmult_scaling_factors);
+  cpi->tpl_rdmult_scaling_factors = NULL;
+
+  aom_free(cpi->tpl_sb_rdmult_scaling_factors);
+  cpi->tpl_sb_rdmult_scaling_factors = NULL;
+
+#if CONFIG_TUNE_VMAF
+  aom_free(cpi->vmaf_rdmult_scaling_factors);
+  cpi->vmaf_rdmult_scaling_factors = NULL;
+#endif
+
+  aom_free(cpi->td.mb.above_pred_buf);
+  cpi->td.mb.above_pred_buf = NULL;
+
+  aom_free(cpi->td.mb.left_pred_buf);
+  cpi->td.mb.left_pred_buf = NULL;
+
+  aom_free(cpi->td.mb.wsrc_buf);
+  cpi->td.mb.wsrc_buf = NULL;
+
+  aom_free(cpi->td.mb.inter_modes_info);
+  cpi->td.mb.inter_modes_info = NULL;
+
+  for (int i = 0; i < 2; i++)
+    for (int j = 0; j < 2; j++) {
+      aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]);
+      cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j] = NULL;
+    }
+  aom_free(cpi->td.mb.mask_buf);
+  cpi->td.mb.mask_buf = NULL;
+
+  aom_free(cm->tpl_mvs);
+  cm->tpl_mvs = NULL;
+
+  aom_free(cpi->td.mb.mbmi_ext);
+  cpi->td.mb.mbmi_ext = NULL;
+
+  if
(cpi->td.vt64x64) { + aom_free(cpi->td.vt64x64); + cpi->td.vt64x64 = NULL; + } + + av1_free_ref_frame_buffers(cm->buffer_pool); + av1_free_txb_buf(cpi); + av1_free_context_buffers(cm); + + aom_free_frame_buffer(&cpi->last_frame_uf); + av1_free_restoration_buffers(cm); + aom_free_frame_buffer(&cpi->trial_frame_rst); + aom_free_frame_buffer(&cpi->scaled_source); + aom_free_frame_buffer(&cpi->scaled_last_source); + aom_free_frame_buffer(&cpi->alt_ref_buffer); + av1_lookahead_destroy(cpi->lookahead); + + aom_free(cpi->tile_tok[0][0]); + cpi->tile_tok[0][0] = 0; + + aom_free(cpi->tplist[0][0]); + cpi->tplist[0][0] = NULL; + + av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size); + + aom_free(cpi->td.mb.palette_buffer); + av1_release_compound_type_rd_buffers(&cpi->td.mb.comp_rd_buffer); + aom_free(cpi->td.mb.tmp_conv_dst); + for (int j = 0; j < 2; ++j) { + aom_free(cpi->td.mb.tmp_obmc_bufs[j]); + } + +#if CONFIG_DENOISE + if (cpi->denoise_and_model) { + aom_denoise_and_model_free(cpi->denoise_and_model); + cpi->denoise_and_model = NULL; + } +#endif + if (cpi->film_grain_table) { + aom_film_grain_table_free(cpi->film_grain_table); + cpi->film_grain_table = NULL; + } + + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + aom_free(cpi->level_params.level_info[i]); + } + + if (cpi->use_svc) av1_free_svc_cyclic_refresh(cpi); +} + +static void configure_static_seg_features(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + struct segmentation *const seg = &cm->seg; + + int high_q = (int)(rc->avg_q > 48.0); + int qi_delta; + + // Disable and clear down for KF + if (cm->current_frame.frame_type == KEY_FRAME) { + // Clear down the global segmentation map + memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + seg->update_map = 0; + seg->update_data = 0; + + // Disable segmentation + av1_disable_segmentation(seg); + + // Clear down the segment features. + av1_clearall_segfeatures(seg); + } else if (cpi->refresh_alt_ref_frame) { + // If this is an alt ref frame + // Clear down the global segmentation map + memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + seg->update_map = 0; + seg->update_data = 0; + + // Disable segmentation and individual segment features by default + av1_disable_segmentation(seg); + av1_clearall_segfeatures(seg); + + // If segmentation was enabled set those features needed for the + // arf itself. 
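+    // (The 0.875 factor below asks for roughly a 12.5% lower average Q for
+    // the ARF itself; av1_compute_qdelta maps that target back onto a
+    // quantizer-index delta for SEG_LVL_ALT_Q.)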
+ if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + + qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, + cm->seq_params.bit_depth); + av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2); + + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V); + + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + } + } else if (seg->enabled) { + // All other frames if segmentation has been enabled + + // First normal frame in a valid gf or alt ref group + if (rc->frames_since_golden == 0) { + // Set up segment features for normal frames in an arf group + if (rc->source_alt_ref_active) { + seg->update_map = 0; + seg->update_data = 1; + + qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, + cm->seq_params.bit_depth); + av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2); + av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2); + + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U); + av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V); + + // Segment coding disabled for compred testing + if (high_q) { + av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + av1_enable_segfeature(seg, 1, SEG_LVL_SKIP); + } + } else { + // Disable segmentation and clear down features if alt ref + // is not active for this group + + av1_disable_segmentation(seg); + + memset(cpi->enc_seg.map, 0, + cm->mi_params.mi_rows * cm->mi_params.mi_cols); + + seg->update_map = 0; + seg->update_data = 0; + + av1_clearall_segfeatures(seg); + } + } else if (rc->is_src_frame_alt_ref) { + // Special case where we are coding over the top of a previous + // alt ref frame. + // Segment coding disabled for compred testing + + // Enable ref frame features for segment 0 as well + av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME); + av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + + // All mbs should use ALTREF_FRAME + av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME); + av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); + av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME); + av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + + // Skip all MBs if high Q (0,0 mv and skip coeffs) + if (high_q) { + av1_enable_segfeature(seg, 0, SEG_LVL_SKIP); + av1_enable_segfeature(seg, 1, SEG_LVL_SKIP); + } + // Enable data update + seg->update_data = 1; + } else { + // All other frames. + + // No updates.. leave things as they are. 
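+      // (With update_map == 0 the segment map is inherited from the primary
+      // reference frame rather than re-coded, and update_data == 0 keeps the
+      // previously signalled segment feature data.)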
+ seg->update_map = 0; + seg->update_data = 0; + } + } +} + +static void update_reference_segmentation_map(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + MB_MODE_INFO **mi_4x4_ptr = mi_params->mi_grid_base; + uint8_t *cache_ptr = cm->cur_frame->seg_map; + + for (int row = 0; row < mi_params->mi_rows; row++) { + MB_MODE_INFO **mi_4x4 = mi_4x4_ptr; + uint8_t *cache = cache_ptr; + for (int col = 0; col < mi_params->mi_cols; col++, mi_4x4++, cache++) + cache[0] = mi_4x4[0]->segment_id; + mi_4x4_ptr += mi_params->mi_stride; + cache_ptr += mi_params->mi_cols; + } +} + +static void alloc_altref_frame_buffer(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + + // TODO(agrange) Check if ARF is enabled and skip allocation if not. + if (aom_realloc_frame_buffer( + &cpi->alt_ref_buffer, oxcf->width, oxcf->height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate altref buffer"); +} + +static void alloc_util_frame_buffers(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + const int byte_alignment = cm->features.byte_alignment; + if (aom_realloc_frame_buffer( + &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate last frame buffer"); + + if (aom_realloc_frame_buffer( + &cpi->trial_frame_rst, cm->superres_upscaled_width, + cm->superres_upscaled_height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + AOM_RESTORATION_FRAME_BORDER, byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate trial restored frame buffer"); + + if (aom_realloc_frame_buffer( + &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate scaled source buffer"); + + if (aom_realloc_frame_buffer( + &cpi->scaled_last_source, cm->width, cm->height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate scaled last source buffer"); +} + +static void alloc_compressor_data(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + + if (av1_alloc_context_buffers(cm, cm->width, cm->height)) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + + int mi_rows_aligned_to_sb = + ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2); + int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2; + + if (!is_stat_generation_stage(cpi)) { + av1_alloc_txb_buf(cpi); + + alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info); + } + + aom_free(cpi->tile_tok[0][0]); + aom_free(cpi->tplist[0][0]); + + if 
(!is_stat_generation_stage(cpi)) { + unsigned int tokens = + get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols, + MAX_SB_SIZE_LOG2, num_planes); + CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0], + aom_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); + + CHECK_MEM_ERROR(cm, cpi->tplist[0][0], + aom_calloc(sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS, + sizeof(*cpi->tplist[0][0]))); + } + + av1_setup_pc_tree(cpi, &cpi->td); +} + +void av1_new_framerate(AV1_COMP *cpi, double framerate) { + cpi->framerate = framerate < 0.1 ? 30 : framerate; + av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height); +} + +double av1_get_compression_ratio(const AV1_COMMON *const cm, + size_t encoded_frame_size) { + const int upscaled_width = cm->superres_upscaled_width; + const int height = cm->height; + const int luma_pic_size = upscaled_width * height; + const SequenceHeader *const seq_params = &cm->seq_params; + const BITSTREAM_PROFILE profile = seq_params->profile; + const int pic_size_profile_factor = + profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36); + encoded_frame_size = + (encoded_frame_size > 129 ? encoded_frame_size - 128 : 1); + const size_t uncompressed_frame_size = + (luma_pic_size * pic_size_profile_factor) >> 3; + return uncompressed_frame_size / (double)encoded_frame_size; +} + +static void set_tile_info(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const SequenceHeader *const seq_params = &cm->seq_params; + CommonTileParams *const tiles = &cm->tiles; + int i, start_sb; + + av1_get_tile_limits(cm); + + // configure tile columns + if (cpi->oxcf.tile_width_count == 0 || cpi->oxcf.tile_height_count == 0) { + tiles->uniform_spacing = 1; + tiles->log2_cols = AOMMAX(cpi->oxcf.tile_columns, tiles->min_log2_cols); + tiles->log2_cols = AOMMIN(tiles->log2_cols, tiles->max_log2_cols); + } else { + int mi_cols = + ALIGN_POWER_OF_TWO(mi_params->mi_cols, seq_params->mib_size_log2); + int sb_cols = mi_cols >> seq_params->mib_size_log2; + int size_sb, j = 0; + tiles->uniform_spacing = 0; + for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) { + tiles->col_start_sb[i] = start_sb; + size_sb = cpi->oxcf.tile_widths[j++]; + if (j >= cpi->oxcf.tile_width_count) j = 0; + start_sb += AOMMIN(size_sb, tiles->max_width_sb); + } + tiles->cols = i; + tiles->col_start_sb[i] = sb_cols; + } + av1_calculate_tile_cols(seq_params, mi_params->mi_rows, mi_params->mi_cols, + tiles); + + // configure tile rows + if (tiles->uniform_spacing) { + tiles->log2_rows = AOMMAX(cpi->oxcf.tile_rows, tiles->min_log2_rows); + tiles->log2_rows = AOMMIN(tiles->log2_rows, tiles->max_log2_rows); + } else { + int mi_rows = + ALIGN_POWER_OF_TWO(mi_params->mi_rows, seq_params->mib_size_log2); + int sb_rows = mi_rows >> seq_params->mib_size_log2; + int size_sb, j = 0; + for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) { + tiles->row_start_sb[i] = start_sb; + size_sb = cpi->oxcf.tile_heights[j++]; + if (j >= cpi->oxcf.tile_height_count) j = 0; + start_sb += AOMMIN(size_sb, tiles->max_height_sb); + } + tiles->rows = i; + tiles->row_start_sb[i] = sb_rows; + } + av1_calculate_tile_rows(seq_params, mi_params->mi_rows, tiles); +} + +static void update_frame_size(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + + // We need to reallocate the context buffers here in case we need more mis. 
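+  // ("More mis" = more 4x4 mode-info units: a mid-stream resize to a larger
+  // frame needs a larger mi grid, so the buffers are re-sized below from the
+  // new cm->width / cm->height.)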
+ if (av1_alloc_context_buffers(cm, cm->width, cm->height)) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + av1_init_mi_buffers(&cm->mi_params); + + av1_init_macroblockd(cm, xd, NULL); + + if (!is_stat_generation_stage(cpi)) + alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info); + set_tile_info(cpi); +} + +static void init_buffer_indices(ForceIntegerMVInfo *const force_intpel_info, + int *const remapped_ref_idx) { + int fb_idx; + for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx) + remapped_ref_idx[fb_idx] = fb_idx; + force_intpel_info->rate_index = 0; + force_intpel_info->rate_size = 0; +} + +static INLINE int does_level_match(int width, int height, double fps, + int lvl_width, int lvl_height, + double lvl_fps, int lvl_dim_mult) { + const int64_t lvl_luma_pels = lvl_width * lvl_height; + const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps; + const int64_t luma_pels = width * height; + const double display_sample_rate = luma_pels * fps; + return luma_pels <= lvl_luma_pels && + display_sample_rate <= lvl_display_sample_rate && + width <= lvl_width * lvl_dim_mult && + height <= lvl_height * lvl_dim_mult; +} + +static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm, + const AV1EncoderConfig *oxcf) { + // TODO(any): This is a placeholder function that only addresses dimensions + // and max display sample rates. + // Need to add checks for max bit rate, max decoded luma sample rate, header + // rate, etc. that are not covered by this function. + AV1_LEVEL level = SEQ_LEVEL_MAX; + if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 512, + 288, 30.0, 4)) { + level = SEQ_LEVEL_2_0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 704, 396, 30.0, 4)) { + level = SEQ_LEVEL_2_1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 1088, 612, 30.0, 4)) { + level = SEQ_LEVEL_3_0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 1376, 774, 30.0, 4)) { + level = SEQ_LEVEL_3_1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 2048, 1152, 30.0, 3)) { + level = SEQ_LEVEL_4_0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 2048, 1152, 60.0, 3)) { + level = SEQ_LEVEL_4_1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 4096, 2176, 30.0, 2)) { + level = SEQ_LEVEL_5_0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 4096, 2176, 60.0, 2)) { + level = SEQ_LEVEL_5_1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 4096, 2176, 120.0, 2)) { + level = SEQ_LEVEL_5_2; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 8192, 4352, 30.0, 2)) { + level = SEQ_LEVEL_6_0; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 8192, 4352, 60.0, 2)) { + level = SEQ_LEVEL_6_1; + } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, + 8192, 4352, 120.0, 2)) { + level = SEQ_LEVEL_6_2; + } + + SequenceHeader *const seq_params = &cm->seq_params; + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + seq->seq_level_idx[i] = level; + // Set the maximum parameters for bitrate and buffer size for this profile, + // level, and tier + seq_params->op_params[i].bitrate = av1_max_level_bitrate( + cm->seq_params.profile, seq->seq_level_idx[i], seq->tier[i]); + // Level with seq_level_idx = 31 returns a high 
"dummy" bitrate to pass the + // check + if (seq_params->op_params[i].bitrate == 0) + aom_internal_error( + &cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "AV1 does not support this combination of profile, level, and tier."); + // Buffer size in bits/s is bitrate in bits/s * 1 s + seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate; + } +} + +static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, + const AV1EncoderConfig *oxcf, int use_svc) { + seq->still_picture = (oxcf->force_video_mode == 0) && (oxcf->limit == 1); + seq->reduced_still_picture_hdr = seq->still_picture; + seq->reduced_still_picture_hdr &= !oxcf->full_still_picture_hdr; + seq->force_screen_content_tools = (oxcf->mode == REALTIME) ? 0 : 2; + seq->force_integer_mv = 2; + seq->order_hint_info.enable_order_hint = oxcf->enable_order_hint; + seq->frame_id_numbers_present_flag = + !(seq->still_picture && seq->reduced_still_picture_hdr) && + !oxcf->large_scale_tile && oxcf->error_resilient_mode && !use_svc; + if (seq->still_picture && seq->reduced_still_picture_hdr) { + seq->order_hint_info.enable_order_hint = 0; + seq->force_screen_content_tools = 2; + seq->force_integer_mv = 2; + } + seq->order_hint_info.order_hint_bits_minus_1 = + seq->order_hint_info.enable_order_hint + ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1 + : -1; + + seq->max_frame_width = + oxcf->forced_max_frame_width ? oxcf->forced_max_frame_width : oxcf->width; + seq->max_frame_height = oxcf->forced_max_frame_height + ? oxcf->forced_max_frame_height + : oxcf->height; + seq->num_bits_width = + (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1; + seq->num_bits_height = + (seq->max_frame_height > 1) ? get_msb(seq->max_frame_height - 1) + 1 : 1; + assert(seq->num_bits_width <= 16); + assert(seq->num_bits_height <= 16); + + seq->frame_id_length = FRAME_ID_LENGTH; + seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH; + + seq->enable_dual_filter = oxcf->enable_dual_filter; + seq->order_hint_info.enable_dist_wtd_comp = oxcf->enable_dist_wtd_comp; + seq->order_hint_info.enable_dist_wtd_comp &= + seq->order_hint_info.enable_order_hint; + seq->order_hint_info.enable_ref_frame_mvs = oxcf->enable_ref_frame_mvs; + seq->order_hint_info.enable_ref_frame_mvs &= + seq->order_hint_info.enable_order_hint; + seq->enable_superres = oxcf->enable_superres; + seq->enable_cdef = oxcf->enable_cdef; + seq->enable_restoration = oxcf->enable_restoration; + seq->enable_warped_motion = oxcf->enable_warped_motion; + seq->enable_interintra_compound = oxcf->enable_interintra_comp; + seq->enable_masked_compound = oxcf->enable_masked_comp; + seq->enable_intra_edge_filter = oxcf->enable_intra_edge_filter; + seq->enable_filter_intra = oxcf->enable_filter_intra; + + set_bitstream_level_tier(seq, cm, oxcf); + + if (seq->operating_points_cnt_minus_1 == 0) { + seq->operating_point_idc[0] = 0; + } else { + // Set operating_point_idc[] such that the i=0 point corresponds to the + // highest quality operating point (all layers), and subsequent + // operarting points (i > 0) are lower quality corresponding to + // skip decoding enhancement layers (temporal first). 
+ int i = 0; + assert(seq->operating_points_cnt_minus_1 == + (int)(cm->number_spatial_layers * cm->number_temporal_layers - 1)); + for (unsigned int sl = 0; sl < cm->number_spatial_layers; sl++) { + for (unsigned int tl = 0; tl < cm->number_temporal_layers; tl++) { + seq->operating_point_idc[i] = + (~(~0u << (cm->number_spatial_layers - sl)) << 8) | + ~(~0u << (cm->number_temporal_layers - tl)); + i++; + } + } + } +} + +static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = &cm->seq_params; + ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; + + cpi->oxcf = *oxcf; + cpi->framerate = oxcf->init_framerate; + + seq_params->profile = oxcf->profile; + seq_params->bit_depth = oxcf->bit_depth; + seq_params->use_highbitdepth = oxcf->use_highbitdepth; + seq_params->color_primaries = oxcf->color_primaries; + seq_params->transfer_characteristics = oxcf->transfer_characteristics; + seq_params->matrix_coefficients = oxcf->matrix_coefficients; + seq_params->monochrome = oxcf->monochrome; + seq_params->chroma_sample_position = oxcf->chroma_sample_position; + seq_params->color_range = oxcf->color_range; + seq_params->timing_info_present = oxcf->timing_info_present; + seq_params->timing_info.num_units_in_display_tick = + oxcf->timing_info.num_units_in_display_tick; + seq_params->timing_info.time_scale = oxcf->timing_info.time_scale; + seq_params->timing_info.equal_picture_interval = + oxcf->timing_info.equal_picture_interval; + seq_params->timing_info.num_ticks_per_picture = + oxcf->timing_info.num_ticks_per_picture; + + seq_params->display_model_info_present_flag = + oxcf->display_model_info_present_flag; + seq_params->decoder_model_info_present_flag = + oxcf->decoder_model_info_present_flag; + if (oxcf->decoder_model_info_present_flag) { + // set the decoder model parameters in schedule mode + seq_params->decoder_model_info.num_units_in_decoding_tick = + oxcf->buffer_model.num_units_in_decoding_tick; + cm->buffer_removal_time_present = 1; + av1_set_aom_dec_model_info(&seq_params->decoder_model_info); + av1_set_dec_model_op_parameters(&seq_params->op_params[0]); + } else if (seq_params->timing_info_present && + seq_params->timing_info.equal_picture_interval && + !seq_params->decoder_model_info_present_flag) { + // set the decoder model parameters in resource availability mode + av1_set_resource_availability_parameters(&seq_params->op_params[0]); + } else { + seq_params->op_params[0].initial_display_delay = + 10; // Default value (not signaled) + } + + if (seq_params->monochrome) { + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 1; + } else if (seq_params->color_primaries == AOM_CICP_CP_BT_709 && + seq_params->transfer_characteristics == AOM_CICP_TC_SRGB && + seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) { + seq_params->subsampling_x = 0; + seq_params->subsampling_y = 0; + } else { + if (seq_params->profile == 0) { + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 1; + } else if (seq_params->profile == 1) { + seq_params->subsampling_x = 0; + seq_params->subsampling_y = 0; + } else { + if (seq_params->bit_depth == AOM_BITS_12) { + seq_params->subsampling_x = oxcf->chroma_subsampling_x; + seq_params->subsampling_y = oxcf->chroma_subsampling_y; + } else { + seq_params->subsampling_x = 1; + seq_params->subsampling_y = 0; + } + } + } + + cm->width = oxcf->width; + cm->height = oxcf->height; + set_sb_size(seq_params, + select_sb_size(cpi)); // set sb size 
before allocations + alloc_compressor_data(cpi); + + update_film_grain_parameters(cpi, oxcf); + + // Single thread case: use counts in common. + cpi->td.counts = &cpi->counts; + + // Set init SVC parameters. + cpi->use_svc = 0; + cpi->svc.external_ref_frame_config = 0; + cpi->svc.non_reference_frame = 0; + cpi->svc.number_spatial_layers = 1; + cpi->svc.number_temporal_layers = 1; + cm->number_spatial_layers = 1; + cm->number_temporal_layers = 1; + cm->spatial_layer_id = 0; + cm->temporal_layer_id = 0; + + // change includes all joint functionality + av1_change_config(cpi, oxcf); + + cpi->ref_frame_flags = 0; + + // Reset resize pending flags + resize_pending_params->width = 0; + resize_pending_params->height = 0; + + init_buffer_indices(&cpi->force_intpel_info, cm->remapped_ref_idx); +} + +static void set_rc_buffer_sizes(RATE_CONTROL *rc, + const AV1EncoderConfig *oxcf) { + const int64_t bandwidth = oxcf->target_bandwidth; + const int64_t starting = oxcf->starting_buffer_level_ms; + const int64_t optimal = oxcf->optimal_buffer_level_ms; + const int64_t maximum = oxcf->maximum_buffer_size_ms; + + rc->starting_buffer_level = starting * bandwidth / 1000; + rc->optimal_buffer_level = + (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000; + rc->maximum_buffer_size = + (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000; +} + +#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].jsdaf = JSDAF; \ + cpi->fn_ptr[BT].jsvaf = JSVAF; + +#define MAKE_BFP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ + int source_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \ + } + +#define MAKE_BFP_SADAVG_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \ + 4; \ + } + +#define MAKE_BFP_SAD4D_WRAPPER(fnname) \ + static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + } \ + static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + 
unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 2; \ + } \ + static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ + const uint8_t *const ref_ptr[], int ref_stride, \ + unsigned int *sad_array) { \ + int i; \ + fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ + for (i = 0; i < 4; i++) sad_array[i] >>= 4; \ + } + +#define MAKE_BFP_JSADAVG_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred, \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ + jcp_param) >> \ + 4; \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d) 
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d) + +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d) +MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16) +MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg) +MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d) + +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg) +#endif // CONFIG_AV1_HIGHBITDEPTH + +#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \ + cpi->fn_ptr[BT].msdf = MCSDF; \ + cpi->fn_ptr[BT].msvf = MCSVF; + +#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask); \ + } \ + static unsigned int fnname##_bits10( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask) >> \ + 2; \ + } \ + static unsigned int fnname##_bits12( \ + const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred_ptr, const uint8_t *m, \ + int m_stride, int invert_mask) { \ + return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \ + second_pred_ptr, m, m_stride, invert_mask) >> \ + 4; \ + } + +#if CONFIG_AV1_HIGHBITDEPTH 
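+// As with the plain SAD wrappers above, the generated _bits10/_bits12
+// variants shift the masked SAD down by 2 or 4 bits so that high-bitdepth
+// costs stay on roughly the same scale as their 8-bit counterparts during
+// rate-distortion comparisons.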
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64) +MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16) +#endif + +#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \ + cpi->fn_ptr[BT].osdf = OSDF; \ + cpi->fn_ptr[BT].ovf = OVF; \ + cpi->fn_ptr[BT].osvf = OSVF; + +#define MAKE_OBFP_SAD_WRAPPER(fnname) \ + static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk); \ + } \ + static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk) >> 2; \ + } \ + static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *msk) { \ + return fnname(ref, ref_stride, wsrc, msk) >> 4; \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64) +MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16) + +static void highbd_set_var_fns(AV1_COMP *const cpi) { + AV1_COMMON *const cm = &cpi->common; + if (cm->seq_params.use_highbitdepth) { + switch (cm->seq_params.bit_depth) { + case AOM_BITS_8: + HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8, + aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16, + aom_highbd_8_sub_pixel_variance64x16, + aom_highbd_8_sub_pixel_avg_variance64x16, + 
aom_highbd_sad64x16x4d_bits8, + aom_highbd_dist_wtd_sad64x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16) + + HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8, + aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64, + aom_highbd_8_sub_pixel_variance16x64, + aom_highbd_8_sub_pixel_avg_variance16x64, + aom_highbd_sad16x64x4d_bits8, + aom_highbd_dist_wtd_sad16x64_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64) + + HIGHBD_BFP( + BLOCK_32X8, aom_highbd_sad32x8_bits8, aom_highbd_sad32x8_avg_bits8, + aom_highbd_8_variance32x8, aom_highbd_8_sub_pixel_variance32x8, + aom_highbd_8_sub_pixel_avg_variance32x8, + aom_highbd_sad32x8x4d_bits8, aom_highbd_dist_wtd_sad32x8_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8) + + HIGHBD_BFP( + BLOCK_8X32, aom_highbd_sad8x32_bits8, aom_highbd_sad8x32_avg_bits8, + aom_highbd_8_variance8x32, aom_highbd_8_sub_pixel_variance8x32, + aom_highbd_8_sub_pixel_avg_variance8x32, + aom_highbd_sad8x32x4d_bits8, aom_highbd_dist_wtd_sad8x32_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32) + + HIGHBD_BFP( + BLOCK_16X4, aom_highbd_sad16x4_bits8, aom_highbd_sad16x4_avg_bits8, + aom_highbd_8_variance16x4, aom_highbd_8_sub_pixel_variance16x4, + aom_highbd_8_sub_pixel_avg_variance16x4, + aom_highbd_sad16x4x4d_bits8, aom_highbd_dist_wtd_sad16x4_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4) + + HIGHBD_BFP( + BLOCK_4X16, aom_highbd_sad4x16_bits8, aom_highbd_sad4x16_avg_bits8, + aom_highbd_8_variance4x16, aom_highbd_8_sub_pixel_variance4x16, + aom_highbd_8_sub_pixel_avg_variance4x16, + aom_highbd_sad4x16x4d_bits8, aom_highbd_dist_wtd_sad4x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16) + + HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8, + aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16, + aom_highbd_8_sub_pixel_variance32x16, + aom_highbd_8_sub_pixel_avg_variance32x16, + aom_highbd_sad32x16x4d_bits8, + aom_highbd_dist_wtd_sad32x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16) + + HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8, + aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32, + aom_highbd_8_sub_pixel_variance16x32, + aom_highbd_8_sub_pixel_avg_variance16x32, + aom_highbd_sad16x32x4d_bits8, + aom_highbd_dist_wtd_sad16x32_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32) + + HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8, + aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32, + aom_highbd_8_sub_pixel_variance64x32, + aom_highbd_8_sub_pixel_avg_variance64x32, + aom_highbd_sad64x32x4d_bits8, + aom_highbd_dist_wtd_sad64x32_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32) + + HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8, + aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64, + aom_highbd_8_sub_pixel_variance32x64, + aom_highbd_8_sub_pixel_avg_variance32x64, + aom_highbd_sad32x64x4d_bits8, + aom_highbd_dist_wtd_sad32x64_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64) + + HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8, + aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32, + aom_highbd_8_sub_pixel_variance32x32, + aom_highbd_8_sub_pixel_avg_variance32x32, + aom_highbd_sad32x32x4d_bits8, + aom_highbd_dist_wtd_sad32x32_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32) + + HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8, + aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64, + aom_highbd_8_sub_pixel_variance64x64, + aom_highbd_8_sub_pixel_avg_variance64x64, 
+ aom_highbd_sad64x64x4d_bits8, + aom_highbd_dist_wtd_sad64x64_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64) + + HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8, + aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16, + aom_highbd_8_sub_pixel_variance16x16, + aom_highbd_8_sub_pixel_avg_variance16x16, + aom_highbd_sad16x16x4d_bits8, + aom_highbd_dist_wtd_sad16x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16) + + HIGHBD_BFP( + BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8, + aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8, + aom_highbd_8_sub_pixel_avg_variance16x8, + aom_highbd_sad16x8x4d_bits8, aom_highbd_dist_wtd_sad16x8_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8) + + HIGHBD_BFP( + BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8, + aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16, + aom_highbd_8_sub_pixel_avg_variance8x16, + aom_highbd_sad8x16x4d_bits8, aom_highbd_dist_wtd_sad8x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16) + + HIGHBD_BFP( + BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8, + aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8, + aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x4d_bits8, + aom_highbd_dist_wtd_sad8x8_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8) + + HIGHBD_BFP( + BLOCK_8X4, aom_highbd_sad8x4_bits8, aom_highbd_sad8x4_avg_bits8, + aom_highbd_8_variance8x4, aom_highbd_8_sub_pixel_variance8x4, + aom_highbd_8_sub_pixel_avg_variance8x4, aom_highbd_sad8x4x4d_bits8, + aom_highbd_dist_wtd_sad8x4_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4) + + HIGHBD_BFP( + BLOCK_4X8, aom_highbd_sad4x8_bits8, aom_highbd_sad4x8_avg_bits8, + aom_highbd_8_variance4x8, aom_highbd_8_sub_pixel_variance4x8, + aom_highbd_8_sub_pixel_avg_variance4x8, aom_highbd_sad4x8x4d_bits8, + aom_highbd_dist_wtd_sad4x8_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8) + + HIGHBD_BFP( + BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8, + aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4, + aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x4d_bits8, + aom_highbd_dist_wtd_sad4x4_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4) + + HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8, + aom_highbd_sad128x128_avg_bits8, + aom_highbd_8_variance128x128, + aom_highbd_8_sub_pixel_variance128x128, + aom_highbd_8_sub_pixel_avg_variance128x128, + aom_highbd_sad128x128x4d_bits8, + aom_highbd_dist_wtd_sad128x128_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128) + + HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8, + aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64, + aom_highbd_8_sub_pixel_variance128x64, + aom_highbd_8_sub_pixel_avg_variance128x64, + aom_highbd_sad128x64x4d_bits8, + aom_highbd_dist_wtd_sad128x64_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64) + + HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8, + aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128, + aom_highbd_8_sub_pixel_variance64x128, + aom_highbd_8_sub_pixel_avg_variance64x128, + aom_highbd_sad64x128x4d_bits8, + aom_highbd_dist_wtd_sad64x128_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128) + + HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8, + aom_highbd_8_masked_sub_pixel_variance128x128) + HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8, + 
aom_highbd_8_masked_sub_pixel_variance128x64) + HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8, + aom_highbd_8_masked_sub_pixel_variance64x128) + HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8, + aom_highbd_8_masked_sub_pixel_variance64x64) + HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8, + aom_highbd_8_masked_sub_pixel_variance64x32) + HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8, + aom_highbd_8_masked_sub_pixel_variance32x64) + HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8, + aom_highbd_8_masked_sub_pixel_variance32x32) + HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8, + aom_highbd_8_masked_sub_pixel_variance32x16) + HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8, + aom_highbd_8_masked_sub_pixel_variance16x32) + HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8, + aom_highbd_8_masked_sub_pixel_variance16x16) + HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8, + aom_highbd_8_masked_sub_pixel_variance8x16) + HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8, + aom_highbd_8_masked_sub_pixel_variance16x8) + HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8, + aom_highbd_8_masked_sub_pixel_variance8x8) + HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8, + aom_highbd_8_masked_sub_pixel_variance4x8) + HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8, + aom_highbd_8_masked_sub_pixel_variance8x4) + HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8, + aom_highbd_8_masked_sub_pixel_variance4x4) + HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits8, + aom_highbd_8_masked_sub_pixel_variance64x16) + HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits8, + aom_highbd_8_masked_sub_pixel_variance16x64) + HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits8, + aom_highbd_8_masked_sub_pixel_variance32x8) + HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits8, + aom_highbd_8_masked_sub_pixel_variance8x32) + HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits8, + aom_highbd_8_masked_sub_pixel_variance16x4) + HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits8, + aom_highbd_8_masked_sub_pixel_variance4x16) + HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8, + aom_highbd_obmc_variance128x128, + aom_highbd_obmc_sub_pixel_variance128x128) + HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits8, + aom_highbd_obmc_variance128x64, + aom_highbd_obmc_sub_pixel_variance128x64) + HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8, + aom_highbd_obmc_variance64x128, + aom_highbd_obmc_sub_pixel_variance64x128) + HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8, + aom_highbd_obmc_variance64x64, + aom_highbd_obmc_sub_pixel_variance64x64) + HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits8, + aom_highbd_obmc_variance64x32, + aom_highbd_obmc_sub_pixel_variance64x32) + HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits8, + aom_highbd_obmc_variance32x64, + aom_highbd_obmc_sub_pixel_variance32x64) + HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits8, + aom_highbd_obmc_variance32x32, + aom_highbd_obmc_sub_pixel_variance32x32) + HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits8, + aom_highbd_obmc_variance32x16, + aom_highbd_obmc_sub_pixel_variance32x16) + HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits8, + aom_highbd_obmc_variance16x32, + aom_highbd_obmc_sub_pixel_variance16x32) + HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits8, + aom_highbd_obmc_variance16x16, + aom_highbd_obmc_sub_pixel_variance16x16) + 
HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits8,
+ aom_highbd_obmc_variance8x16,
+ aom_highbd_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits8,
+ aom_highbd_obmc_variance16x8,
+ aom_highbd_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits8,
+ aom_highbd_obmc_variance8x8,
+ aom_highbd_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits8,
+ aom_highbd_obmc_variance4x8,
+ aom_highbd_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits8,
+ aom_highbd_obmc_variance8x4,
+ aom_highbd_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8,
+ aom_highbd_obmc_variance4x4,
+ aom_highbd_obmc_sub_pixel_variance4x4)
+ HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits8,
+ aom_highbd_obmc_variance64x16,
+ aom_highbd_obmc_sub_pixel_variance64x16)
+ HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits8,
+ aom_highbd_obmc_variance16x64,
+ aom_highbd_obmc_sub_pixel_variance16x64)
+ HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits8,
+ aom_highbd_obmc_variance32x8,
+ aom_highbd_obmc_sub_pixel_variance32x8)
+ HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits8,
+ aom_highbd_obmc_variance8x32,
+ aom_highbd_obmc_sub_pixel_variance8x32)
+ HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits8,
+ aom_highbd_obmc_variance16x4,
+ aom_highbd_obmc_sub_pixel_variance16x4)
+ HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits8,
+ aom_highbd_obmc_variance4x16,
+ aom_highbd_obmc_sub_pixel_variance4x16)
+ break;
+
+ case AOM_BITS_10:
+ HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits10,
+ aom_highbd_sad64x16_avg_bits10, aom_highbd_10_variance64x16,
+ aom_highbd_10_sub_pixel_variance64x16,
+ aom_highbd_10_sub_pixel_avg_variance64x16,
+ aom_highbd_sad64x16x4d_bits10,
+ aom_highbd_dist_wtd_sad64x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16);
+
+ HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10,
+ aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64,
+ aom_highbd_10_sub_pixel_variance16x64,
+ aom_highbd_10_sub_pixel_avg_variance16x64,
+ aom_highbd_sad16x64x4d_bits10,
+ aom_highbd_dist_wtd_sad16x64_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64);
+
+ HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10,
+ aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8,
+ aom_highbd_10_sub_pixel_variance32x8,
+ aom_highbd_10_sub_pixel_avg_variance32x8,
+ aom_highbd_sad32x8x4d_bits10,
+ aom_highbd_dist_wtd_sad32x8_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8);
+
+ HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10,
+ aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32,
+ aom_highbd_10_sub_pixel_variance8x32,
+ aom_highbd_10_sub_pixel_avg_variance8x32,
+ aom_highbd_sad8x32x4d_bits10,
+ aom_highbd_dist_wtd_sad8x32_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32);
+
+ HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10,
+ aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4,
+ aom_highbd_10_sub_pixel_variance16x4,
+ aom_highbd_10_sub_pixel_avg_variance16x4,
+ aom_highbd_sad16x4x4d_bits10,
+ aom_highbd_dist_wtd_sad16x4_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4);
+
+ HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10,
+ aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16,
+ aom_highbd_10_sub_pixel_variance4x16,
+ aom_highbd_10_sub_pixel_avg_variance4x16,
+ aom_highbd_sad4x16x4d_bits10,
+ aom_highbd_dist_wtd_sad4x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16);
+
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10,
+ aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16,
+ aom_highbd_10_sub_pixel_variance32x16,
+ aom_highbd_10_sub_pixel_avg_variance32x16,
+ aom_highbd_sad32x16x4d_bits10,
+ aom_highbd_dist_wtd_sad32x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16);
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10,
+ aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32,
+ aom_highbd_10_sub_pixel_variance16x32,
+ aom_highbd_10_sub_pixel_avg_variance16x32,
+ aom_highbd_sad16x32x4d_bits10,
+ aom_highbd_dist_wtd_sad16x32_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32);
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10,
+ aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32,
+ aom_highbd_10_sub_pixel_variance64x32,
+ aom_highbd_10_sub_pixel_avg_variance64x32,
+ aom_highbd_sad64x32x4d_bits10,
+ aom_highbd_dist_wtd_sad64x32_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32);
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10,
+ aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64,
+ aom_highbd_10_sub_pixel_variance32x64,
+ aom_highbd_10_sub_pixel_avg_variance32x64,
+ aom_highbd_sad32x64x4d_bits10,
+ aom_highbd_dist_wtd_sad32x64_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64);
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10,
+ aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32,
+ aom_highbd_10_sub_pixel_variance32x32,
+ aom_highbd_10_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x4d_bits10,
+ aom_highbd_dist_wtd_sad32x32_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32);
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10,
+ aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64,
+ aom_highbd_10_sub_pixel_variance64x64,
+ aom_highbd_10_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x4d_bits10,
+ aom_highbd_dist_wtd_sad64x64_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64);
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10,
+ aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16,
+ aom_highbd_10_sub_pixel_variance16x16,
+ aom_highbd_10_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x4d_bits10,
+ aom_highbd_dist_wtd_sad16x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16);
+
+ HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10,
+ aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8,
+ aom_highbd_10_sub_pixel_variance16x8,
+ aom_highbd_10_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x4d_bits10,
+ aom_highbd_dist_wtd_sad16x8_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8);
+
+ HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10,
+ aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16,
+ aom_highbd_10_sub_pixel_variance8x16,
+ aom_highbd_10_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x4d_bits10,
+ aom_highbd_dist_wtd_sad8x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16);
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10,
+ aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8,
+ aom_highbd_10_sub_pixel_avg_variance8x8,
+ aom_highbd_sad8x8x4d_bits10, aom_highbd_dist_wtd_sad8x8_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8);
+
+ HIGHBD_BFP(
+ BLOCK_8X4, aom_highbd_sad8x4_bits10, aom_highbd_sad8x4_avg_bits10,
+ aom_highbd_10_variance8x4, aom_highbd_10_sub_pixel_variance8x4,
+ aom_highbd_10_sub_pixel_avg_variance8x4,
+ aom_highbd_sad8x4x4d_bits10, aom_highbd_dist_wtd_sad8x4_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4);
+
+ HIGHBD_BFP(
+ BLOCK_4X8, aom_highbd_sad4x8_bits10, aom_highbd_sad4x8_avg_bits10,
+ aom_highbd_10_variance4x8, aom_highbd_10_sub_pixel_variance4x8,
+ aom_highbd_10_sub_pixel_avg_variance4x8,
+ aom_highbd_sad4x8x4d_bits10, aom_highbd_dist_wtd_sad4x8_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8);
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10,
+ aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4,
+ aom_highbd_10_sub_pixel_avg_variance4x4,
+ aom_highbd_sad4x4x4d_bits10, aom_highbd_dist_wtd_sad4x4_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4);
+
+ HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits10,
+ aom_highbd_sad128x128_avg_bits10,
+ aom_highbd_10_variance128x128,
+ aom_highbd_10_sub_pixel_variance128x128,
+ aom_highbd_10_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x4d_bits10,
+ aom_highbd_dist_wtd_sad128x128_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128);
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10,
+ aom_highbd_sad128x64_avg_bits10,
+ aom_highbd_10_variance128x64,
+ aom_highbd_10_sub_pixel_variance128x64,
+ aom_highbd_10_sub_pixel_avg_variance128x64,
+ aom_highbd_sad128x64x4d_bits10,
+ aom_highbd_dist_wtd_sad128x64_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64);
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10,
+ aom_highbd_sad64x128_avg_bits10,
+ aom_highbd_10_variance64x128,
+ aom_highbd_10_sub_pixel_variance64x128,
+ aom_highbd_10_sub_pixel_avg_variance64x128,
+ aom_highbd_sad64x128x4d_bits10,
+ aom_highbd_dist_wtd_sad64x128_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128);
+
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
+ aom_highbd_10_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x128)
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
+ aom_highbd_10_masked_sub_pixel_variance4x4)
+ HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x16)
+ HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x64)
+ HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x8)
+ HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x32)
+ HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x4)
+ HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance4x16)
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10,
+ aom_highbd_10_obmc_variance128x128,
+ aom_highbd_10_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits10,
+ aom_highbd_10_obmc_variance128x64,
+ aom_highbd_10_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10,
+ aom_highbd_10_obmc_variance64x128,
+ aom_highbd_10_obmc_sub_pixel_variance64x128)
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10,
+ aom_highbd_10_obmc_variance64x64,
+ aom_highbd_10_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits10,
+ aom_highbd_10_obmc_variance64x32,
+ aom_highbd_10_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits10,
+ aom_highbd_10_obmc_variance32x64,
+ aom_highbd_10_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits10,
+ aom_highbd_10_obmc_variance32x32,
+ aom_highbd_10_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits10,
+ aom_highbd_10_obmc_variance32x16,
+ aom_highbd_10_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits10,
+ aom_highbd_10_obmc_variance16x32,
+ aom_highbd_10_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits10,
+ aom_highbd_10_obmc_variance16x16,
+ aom_highbd_10_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits10,
+ aom_highbd_10_obmc_variance8x16,
+ aom_highbd_10_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits10,
+ aom_highbd_10_obmc_variance16x8,
+ aom_highbd_10_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits10,
+ aom_highbd_10_obmc_variance8x8,
+ aom_highbd_10_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits10,
+ aom_highbd_10_obmc_variance4x8,
+ aom_highbd_10_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits10,
+ aom_highbd_10_obmc_variance8x4,
+ aom_highbd_10_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10,
+ aom_highbd_10_obmc_variance4x4,
+ aom_highbd_10_obmc_sub_pixel_variance4x4)
+
+ HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits10,
+ aom_highbd_10_obmc_variance64x16,
+ aom_highbd_10_obmc_sub_pixel_variance64x16)
+
+ HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits10,
+ aom_highbd_10_obmc_variance16x64,
+ aom_highbd_10_obmc_sub_pixel_variance16x64)
+
+ HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits10,
+ aom_highbd_10_obmc_variance32x8,
+ aom_highbd_10_obmc_sub_pixel_variance32x8)
+
+ HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits10,
+ aom_highbd_10_obmc_variance8x32,
+ aom_highbd_10_obmc_sub_pixel_variance8x32)
+
+ HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits10,
+ aom_highbd_10_obmc_variance16x4,
+ aom_highbd_10_obmc_sub_pixel_variance16x4)
+
+ HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits10,
+ aom_highbd_10_obmc_variance4x16,
+ aom_highbd_10_obmc_sub_pixel_variance4x16)
+ break;
+
+ case AOM_BITS_12:
+ HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits12,
+ aom_highbd_sad64x16_avg_bits12, aom_highbd_12_variance64x16,
+ aom_highbd_12_sub_pixel_variance64x16,
+ aom_highbd_12_sub_pixel_avg_variance64x16,
+ aom_highbd_sad64x16x4d_bits12,
+ aom_highbd_dist_wtd_sad64x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16);
+
+ HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12,
+ aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64,
+ aom_highbd_12_sub_pixel_variance16x64,
+ aom_highbd_12_sub_pixel_avg_variance16x64,
+ aom_highbd_sad16x64x4d_bits12,
+ aom_highbd_dist_wtd_sad16x64_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64);
+
+ HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12,
+ aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8,
+ aom_highbd_12_sub_pixel_variance32x8,
+ aom_highbd_12_sub_pixel_avg_variance32x8,
+ aom_highbd_sad32x8x4d_bits12,
+ aom_highbd_dist_wtd_sad32x8_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8);
+
+ HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12,
+ aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32,
+ aom_highbd_12_sub_pixel_variance8x32,
+ aom_highbd_12_sub_pixel_avg_variance8x32,
+ aom_highbd_sad8x32x4d_bits12,
+ aom_highbd_dist_wtd_sad8x32_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32);
+
+ HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12,
+ aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4,
+ aom_highbd_12_sub_pixel_variance16x4,
+ aom_highbd_12_sub_pixel_avg_variance16x4,
+ aom_highbd_sad16x4x4d_bits12,
+ aom_highbd_dist_wtd_sad16x4_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4);
+
+ HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12,
+ aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16,
+ aom_highbd_12_sub_pixel_variance4x16,
+ aom_highbd_12_sub_pixel_avg_variance4x16,
+ aom_highbd_sad4x16x4d_bits12,
+ aom_highbd_dist_wtd_sad4x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16);
+
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12,
+ aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16,
+ aom_highbd_12_sub_pixel_variance32x16,
+ aom_highbd_12_sub_pixel_avg_variance32x16,
+ aom_highbd_sad32x16x4d_bits12,
+ aom_highbd_dist_wtd_sad32x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16);
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12,
+ aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32,
+ aom_highbd_12_sub_pixel_variance16x32,
+ aom_highbd_12_sub_pixel_avg_variance16x32,
+ aom_highbd_sad16x32x4d_bits12,
+ aom_highbd_dist_wtd_sad16x32_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32);
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12,
+ aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32,
+ aom_highbd_12_sub_pixel_variance64x32,
+ aom_highbd_12_sub_pixel_avg_variance64x32,
+ aom_highbd_sad64x32x4d_bits12,
+ aom_highbd_dist_wtd_sad64x32_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32);
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12,
+ aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64,
+ aom_highbd_12_sub_pixel_variance32x64,
+ aom_highbd_12_sub_pixel_avg_variance32x64,
+ aom_highbd_sad32x64x4d_bits12,
+ aom_highbd_dist_wtd_sad32x64_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64);
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12,
+ aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32,
+ aom_highbd_12_sub_pixel_variance32x32,
+ aom_highbd_12_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x4d_bits12,
+ aom_highbd_dist_wtd_sad32x32_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32);
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12,
+ aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64,
+ aom_highbd_12_sub_pixel_variance64x64,
+ aom_highbd_12_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x4d_bits12,
+ aom_highbd_dist_wtd_sad64x64_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64);
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12,
+ aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16,
+ aom_highbd_12_sub_pixel_variance16x16,
+ aom_highbd_12_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x4d_bits12,
+ aom_highbd_dist_wtd_sad16x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16);
+
+ HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12,
+ aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8,
+ aom_highbd_12_sub_pixel_variance16x8,
+ aom_highbd_12_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x4d_bits12,
+ aom_highbd_dist_wtd_sad16x8_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8);
+
+ HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12,
+ aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16,
+ aom_highbd_12_sub_pixel_variance8x16,
+ aom_highbd_12_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x4d_bits12,
+ aom_highbd_dist_wtd_sad8x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16);
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12,
+ aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8,
+ aom_highbd_12_sub_pixel_avg_variance8x8,
+ aom_highbd_sad8x8x4d_bits12, aom_highbd_dist_wtd_sad8x8_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8);
+
+ HIGHBD_BFP(
+ BLOCK_8X4, aom_highbd_sad8x4_bits12, aom_highbd_sad8x4_avg_bits12,
+ aom_highbd_12_variance8x4, aom_highbd_12_sub_pixel_variance8x4,
+ aom_highbd_12_sub_pixel_avg_variance8x4,
+ aom_highbd_sad8x4x4d_bits12, aom_highbd_dist_wtd_sad8x4_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4);
+
+ HIGHBD_BFP(
+ BLOCK_4X8, aom_highbd_sad4x8_bits12, aom_highbd_sad4x8_avg_bits12,
+ aom_highbd_12_variance4x8, aom_highbd_12_sub_pixel_variance4x8,
+ aom_highbd_12_sub_pixel_avg_variance4x8,
+ aom_highbd_sad4x8x4d_bits12, aom_highbd_dist_wtd_sad4x8_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8);
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12,
+ aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4,
+ aom_highbd_12_sub_pixel_avg_variance4x4,
+ aom_highbd_sad4x4x4d_bits12, aom_highbd_dist_wtd_sad4x4_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4);
+
+ HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits12,
+ aom_highbd_sad128x128_avg_bits12,
+ aom_highbd_12_variance128x128,
+ aom_highbd_12_sub_pixel_variance128x128,
+ aom_highbd_12_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x4d_bits12,
+ aom_highbd_dist_wtd_sad128x128_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128);
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12,
+ aom_highbd_sad128x64_avg_bits12,
+ aom_highbd_12_variance128x64,
+ aom_highbd_12_sub_pixel_variance128x64,
+ aom_highbd_12_sub_pixel_avg_variance128x64,
+ aom_highbd_sad128x64x4d_bits12,
+ aom_highbd_dist_wtd_sad128x64_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64);
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12,
+ aom_highbd_sad64x128_avg_bits12,
+ aom_highbd_12_variance64x128,
+ aom_highbd_12_sub_pixel_variance64x128,
+ aom_highbd_12_sub_pixel_avg_variance64x128,
+ aom_highbd_sad64x128x4d_bits12,
+ aom_highbd_dist_wtd_sad64x128_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128);
+
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
+ aom_highbd_12_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x128)
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
+ aom_highbd_12_masked_sub_pixel_variance4x4)
+ HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x16)
+ HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x64)
+ HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x8)
+ HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x32)
+ HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x4)
+ HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance4x16)
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12,
+ aom_highbd_12_obmc_variance128x128,
+ aom_highbd_12_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits12,
+ aom_highbd_12_obmc_variance128x64,
+ aom_highbd_12_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12,
+ aom_highbd_12_obmc_variance64x128,
+ aom_highbd_12_obmc_sub_pixel_variance64x128)
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12,
+ aom_highbd_12_obmc_variance64x64,
+ aom_highbd_12_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits12,
+ aom_highbd_12_obmc_variance64x32,
+ aom_highbd_12_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits12,
+ aom_highbd_12_obmc_variance32x64,
+ aom_highbd_12_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits12,
+ aom_highbd_12_obmc_variance32x32,
+ aom_highbd_12_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits12,
+ aom_highbd_12_obmc_variance32x16,
+ aom_highbd_12_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits12,
+ aom_highbd_12_obmc_variance16x32,
+ aom_highbd_12_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits12,
+ aom_highbd_12_obmc_variance16x16,
+ aom_highbd_12_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits12,
+ aom_highbd_12_obmc_variance8x16,
+ aom_highbd_12_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits12,
+ aom_highbd_12_obmc_variance16x8,
+ aom_highbd_12_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits12,
+ aom_highbd_12_obmc_variance8x8,
+ aom_highbd_12_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits12,
+ aom_highbd_12_obmc_variance4x8,
+ aom_highbd_12_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits12,
+ aom_highbd_12_obmc_variance8x4,
+ aom_highbd_12_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12,
+ aom_highbd_12_obmc_variance4x4,
+ aom_highbd_12_obmc_sub_pixel_variance4x4)
+ HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits12,
+ aom_highbd_12_obmc_variance64x16,
+ aom_highbd_12_obmc_sub_pixel_variance64x16)
+ HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits12,
+ aom_highbd_12_obmc_variance16x64,
+ aom_highbd_12_obmc_sub_pixel_variance16x64)
+ HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits12,
+ aom_highbd_12_obmc_variance32x8,
+ aom_highbd_12_obmc_sub_pixel_variance32x8)
+ HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits12,
+ aom_highbd_12_obmc_variance8x32,
+ aom_highbd_12_obmc_sub_pixel_variance8x32)
+ HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits12,
+ aom_highbd_12_obmc_variance16x4,
+ aom_highbd_12_obmc_sub_pixel_variance16x4)
+ HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits12,
+ aom_highbd_12_obmc_variance4x16,
+ aom_highbd_12_obmc_sub_pixel_variance4x16)
+ break;
+
+ default:
+ assert(0 &&
+ "cm->seq_params.bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ }
+ }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+static void realloc_segmentation_maps(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+
+ // Create the encoder segmentation map and set all entries to 0
+ aom_free(cpi->enc_seg.map);
+ CHECK_MEM_ERROR(cm, cpi->enc_seg.map,
+ aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+
+ // Create a map used for cyclic background refresh.
+ if (cpi->cyclic_refresh) av1_cyclic_refresh_free(cpi->cyclic_refresh);
+ CHECK_MEM_ERROR(
+ cm, cpi->cyclic_refresh,
+ av1_cyclic_refresh_alloc(mi_params->mi_rows, mi_params->mi_cols));
+
+ // Create a map used to mark inactive areas.
+ aom_free(cpi->active_map.map);
+ CHECK_MEM_ERROR(cm, cpi->active_map.map,
+ aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
+}
+
+static AOM_INLINE void set_tpl_stats_block_size(int width, int height,
+ uint8_t *block_mis_log2) {
+ const int is_720p_or_larger = AOMMIN(width, height) >= 720;
+
+ // 0: 4x4, 1: 8x8, 2: 16x16
+ *block_mis_log2 = is_720p_or_larger ? 2 : 1;
+}
+
+void av1_alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
+ CompoundTypeRdBuffers *const bufs) {
+ CHECK_MEM_ERROR(
+ cm, bufs->pred0,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
+ CHECK_MEM_ERROR(
+ cm, bufs->pred1,
+ (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
+ CHECK_MEM_ERROR(
+ cm, bufs->residual1,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
+ CHECK_MEM_ERROR(
+ cm, bufs->diff10,
+ (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
+ CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
+ (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
+ sizeof(*bufs->tmp_best_mask_buf)));
+}
+
+void av1_release_compound_type_rd_buffers(CompoundTypeRdBuffers *const bufs) {
+ aom_free(bufs->pred0);
+ aom_free(bufs->pred1);
+ aom_free(bufs->residual1);
+ aom_free(bufs->diff10);
+ aom_free(bufs->tmp_best_mask_buf);
+ av1_zero(*bufs);  // Set all pointers to NULL for safety.
+}
+
+static void config_target_level(AV1_COMP *const cpi, AV1_LEVEL target_level,
+ int tier) {
+ aom_clear_system_state();
+
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ SequenceHeader *const seq_params = &cpi->common.seq_params;
+
+ // Adjust target bitrate to be no larger than 70% of level limit.
+ const BITSTREAM_PROFILE profile = seq_params->profile;
+ const double level_bitrate_limit =
+ av1_get_max_bitrate_for_level(target_level, tier, profile);
+ const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70);
+ oxcf->target_bandwidth = AOMMIN(oxcf->target_bandwidth, max_bitrate);
+ // Also need to update cpi->twopass.bits_left.
+ TWO_PASS *const twopass = &cpi->twopass;
+ FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats;
+ if (stats != NULL)
+ cpi->twopass.bits_left =
+ (int64_t)(stats->duration * cpi->oxcf.target_bandwidth / 10000000.0);
+
+ // Adjust max over-shoot percentage.
+ oxcf->over_shoot_pct = 0;
+
+ // Adjust max quantizer.
+ oxcf->worst_allowed_q = 255;
+
+ // Adjust number of tiles and tile columns to be under level limit.
+ int max_tiles, max_tile_cols;
+ av1_get_max_tiles_for_level(target_level, &max_tiles, &max_tile_cols);
+ while (oxcf->tile_columns > 0 && (1 << oxcf->tile_columns) > max_tile_cols) {
+ --oxcf->tile_columns;
+ }
+ const int tile_cols = (1 << oxcf->tile_columns);
+ while (oxcf->tile_rows > 0 &&
+ tile_cols * (1 << oxcf->tile_rows) > max_tiles) {
+ --oxcf->tile_rows;
+ }
+
+ // Adjust min compression ratio.
+ const int still_picture = seq_params->still_picture;
+ const double min_cr =
+ av1_get_min_cr_for_level(target_level, tier, still_picture);
+ oxcf->min_cr = AOMMAX(oxcf->min_cr, (unsigned int)(min_cr * 100));
+}
+
+void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = &cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ RATE_CONTROL *const rc = &cpi->rc;
+ MACROBLOCK *const x = &cpi->td.mb;
+ AV1LevelParams *const level_params = &cpi->level_params;
+
+ if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
+ seq_params->bit_depth = oxcf->bit_depth;
+ seq_params->color_primaries = oxcf->color_primaries;
+ seq_params->transfer_characteristics = oxcf->transfer_characteristics;
+ seq_params->matrix_coefficients = oxcf->matrix_coefficients;
+ seq_params->monochrome = oxcf->monochrome;
+ seq_params->chroma_sample_position = oxcf->chroma_sample_position;
+ seq_params->color_range = oxcf->color_range;
+
+ assert(IMPLIES(seq_params->profile <= PROFILE_1,
+ seq_params->bit_depth <= AOM_BITS_10));
+
+ seq_params->timing_info_present = oxcf->timing_info_present;
+ seq_params->timing_info.num_units_in_display_tick =
+ oxcf->timing_info.num_units_in_display_tick;
+ seq_params->timing_info.time_scale = oxcf->timing_info.time_scale;
+ seq_params->timing_info.equal_picture_interval =
+ oxcf->timing_info.equal_picture_interval;
+ seq_params->timing_info.num_ticks_per_picture =
+ oxcf->timing_info.num_ticks_per_picture;
+
+ seq_params->display_model_info_present_flag =
+ oxcf->display_model_info_present_flag;
+ seq_params->decoder_model_info_present_flag =
+ oxcf->decoder_model_info_present_flag;
+ if (oxcf->decoder_model_info_present_flag) {
+ // set the decoder model parameters in schedule mode
+ seq_params->decoder_model_info.num_units_in_decoding_tick =
+ oxcf->buffer_model.num_units_in_decoding_tick;
+ cm->buffer_removal_time_present = 1;
+ av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
+ av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
+ } else if (seq_params->timing_info_present &&
+ seq_params->timing_info.equal_picture_interval &&
+ !seq_params->decoder_model_info_present_flag) {
+ // set the decoder model parameters in resource availability mode
+ av1_set_resource_availability_parameters(&seq_params->op_params[0]);
+ } else {
+ seq_params->op_params[0].initial_display_delay =
+ 10;  // Default value (not signaled)
+ }
+
+ update_film_grain_parameters(cpi, oxcf);
+
+ cpi->oxcf = *oxcf;
+ cpi->superres_mode = oxcf->superres_mode;  // default
+ x->e_mbd.bd = (int)seq_params->bit_depth;
+ x->e_mbd.global_motion = cm->global_motion;
+
+ memcpy(level_params->target_seq_level_idx, cpi->oxcf.target_seq_level_idx,
+ sizeof(level_params->target_seq_level_idx));
+ level_params->keep_level_stats = 0;
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ if (level_params->target_seq_level_idx[i] <= SEQ_LEVELS) {
+ level_params->keep_level_stats |= 1u << i;
+ if (!level_params->level_info[i]) {
+ CHECK_MEM_ERROR(cm, level_params->level_info[i],
+ aom_calloc(1, sizeof(*level_params->level_info[i])));
+ }
+ }
+ }
+
+ // TODO(huisu@): level targeting currently only works for the 0th operating
+ // point, so scalable coding is not supported yet.
+ if (level_params->target_seq_level_idx[0] < SEQ_LEVELS) {
+ // Adjust encoder config in order to meet target level.
+ config_target_level(cpi, level_params->target_seq_level_idx[0],
+ seq_params->tier[0]);
+ }
+
+ if ((has_no_stats_stage(cpi)) && (oxcf->rc_mode == AOM_Q)) {
+ rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+ }
+
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+
+ cm->features.refresh_frame_context = (oxcf->frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ if (oxcf->large_scale_tile)
+ cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ if (x->palette_buffer == NULL) {
+ CHECK_MEM_ERROR(cm, x->palette_buffer,
+ aom_memalign(16, sizeof(*x->palette_buffer)));
+ }
+
+ if (x->comp_rd_buffer.pred0 == NULL) {
+ av1_alloc_compound_type_rd_buffers(cm, &x->comp_rd_buffer);
+ }
+
+ if (x->tmp_conv_dst == NULL) {
+ CHECK_MEM_ERROR(
+ cm, x->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
+ x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
+ }
+ for (int i = 0; i < 2; ++i) {
+ if (x->tmp_obmc_bufs[i] == NULL) {
+ CHECK_MEM_ERROR(cm, x->tmp_obmc_bufs[i],
+ aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*x->tmp_obmc_bufs[i])));
+ x->e_mbd.tmp_obmc_bufs[i] = x->tmp_obmc_bufs[i];
+ }
+ }
+
+ av1_reset_segment_features(cm);
+
+ av1_set_high_precision_mv(cpi, 1, 0);
+
+ set_rc_buffer_sizes(rc, &cpi->oxcf);
+
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
+ rc->buffer_level = AOMMIN(rc->buffer_level, rc->maximum_buffer_size);
+
+ // Set up frame rate and related parameters rate control values.
+ av1_new_framerate(cpi, cpi->framerate);
+
+ // Set absolute upper and lower quality limits
+ rc->worst_quality = cpi->oxcf.worst_allowed_q;
+ rc->best_quality = cpi->oxcf.best_allowed_q;
+
+ cm->features.interp_filter =
+ oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
+ cm->features.switchable_motion_mode = 1;
+
+ if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
+ cm->render_width = cpi->oxcf.render_width;
+ cm->render_height = cpi->oxcf.render_height;
+ } else {
+ cm->render_width = cpi->oxcf.width;
+ cm->render_height = cpi->oxcf.height;
+ }
+ cm->width = cpi->oxcf.width;
+ cm->height = cpi->oxcf.height;
+
+ int sb_size = seq_params->sb_size;
+ // Superblock size should not be updated after the first key frame.
+ if (!cpi->seq_params_locked) {
+ set_sb_size(&cm->seq_params, select_sb_size(cpi));
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i)
+ seq_params->tier[i] = (oxcf->tier_mask >> i) & 1;
+ }
+
+ if (cpi->initial_width || sb_size != seq_params->sb_size) {
+ if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
+ seq_params->sb_size != sb_size) {
+ av1_free_context_buffers(cm);
+ av1_free_pc_tree(cpi, &cpi->td, num_planes, (BLOCK_SIZE)sb_size);
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0;
+ }
+ }
+ update_frame_size(cpi);
+
+ rc->is_src_frame_alt_ref = 0;
+
+ set_tile_info(cpi);
+
+ if (!cpi->svc.external_ref_frame_config)
+ cpi->ext_flags.refresh_frame_flags_pending = 0;
+ cpi->ext_flags.refresh_frame_context_pending = 0;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ highbd_set_var_fns(cpi);
+#endif
+
+ // Init sequence level coding tools
+ // This should not be called after the first key frame.
+ if (!cpi->seq_params_locked) {
+ seq_params->operating_points_cnt_minus_1 =
+ (cm->number_spatial_layers > 1 || cm->number_temporal_layers > 1)
+ ? cm->number_spatial_layers * cm->number_temporal_layers - 1
+ : 0;
+ init_seq_coding_tools(&cm->seq_params, cm, oxcf, cpi->use_svc);
+ }
+
+ if (cpi->use_svc)
+ av1_update_layer_context_change_config(cpi, oxcf->target_bandwidth);
+}
+
+static INLINE void setup_tpl_buffers(AV1_COMMON *const cm,
+ TplParams *const tpl_data) {
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ set_tpl_stats_block_size(cm->width, cm->height,
+ &tpl_data->tpl_stats_block_mis_log2);
+ const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+
+ for (int frame = 0; frame < MAX_LENGTH_TPL_FRAME_STATS; ++frame) {
+ const int mi_cols =
+ ALIGN_POWER_OF_TWO(mi_params->mi_cols, MAX_MIB_SIZE_LOG2);
+ const int mi_rows =
+ ALIGN_POWER_OF_TWO(mi_params->mi_rows, MAX_MIB_SIZE_LOG2);
+
+ tpl_data->tpl_stats_buffer[frame].is_valid = 0;
+ tpl_data->tpl_stats_buffer[frame].width = mi_cols >> block_mis_log2;
+ tpl_data->tpl_stats_buffer[frame].height = mi_rows >> block_mis_log2;
+ tpl_data->tpl_stats_buffer[frame].stride =
+ tpl_data->tpl_stats_buffer[frame].width;
+ tpl_data->tpl_stats_buffer[frame].mi_rows = mi_params->mi_rows;
+ tpl_data->tpl_stats_buffer[frame].mi_cols = mi_params->mi_cols;
+ }
+
+ for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
+ CHECK_MEM_ERROR(
+ cm, tpl_data->tpl_stats_pool[frame],
+ aom_calloc(tpl_data->tpl_stats_buffer[frame].width *
+ tpl_data->tpl_stats_buffer[frame].height,
+ sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr)));
+ if (aom_alloc_frame_buffer(
+ &tpl_data->tpl_rec_pool[frame], cm->width, cm->height,
+ cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
+ cm->seq_params.use_highbitdepth, AOM_ENC_NO_SCALE_BORDER,
+ cm->features.byte_alignment))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+
+ tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1];
+}
+
+static INLINE void init_frame_info(FRAME_INFO *frame_info,
+ const AV1_COMMON *const cm) {
+ const CommonModeInfoParams *const mi_params = &cm->mi_params;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ frame_info->frame_width = cm->width;
+ frame_info->frame_height = cm->height;
+ frame_info->mi_cols = mi_params->mi_cols;
+ frame_info->mi_rows = mi_params->mi_rows;
+ frame_info->mb_cols = mi_params->mb_cols;
+ frame_info->mb_rows = mi_params->mb_rows;
+ frame_info->num_mbs = mi_params->MBs;
+ frame_info->bit_depth = seq_params->bit_depth;
+ frame_info->subsampling_x = seq_params->subsampling_x;
+ frame_info->subsampling_y = seq_params->subsampling_y;
+}
+
+AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, BufferPool *const pool,
+ FIRSTPASS_STATS *frame_stats_buf,
+ COMPRESSOR_STAGE stage, int num_lap_buffers,
+ int lap_lag_in_frames,
+ STATS_BUFFER_CTX *stats_buf_context) {
+ AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
+ AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
+
+ if (!cm) return NULL;
+
+ av1_zero(*cpi);
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cm->error.jmp)) {
+ cm->error.setjmp = 0;
+ av1_remove_compressor(cpi);
+ return 0;
+ }
+
+ cm->error.setjmp = 1;
+ cpi->lap_enabled = num_lap_buffers > 0;
+ cpi->compressor_stage = stage;
+
+ CommonModeInfoParams *const mi_params = &cm->mi_params;
+ mi_params->free_mi = enc_free_mi;
+ mi_params->setup_mi = enc_setup_mi;
+ mi_params->set_mb_mi = (oxcf->pass == 1 || cpi->compressor_stage == LAP_STAGE)
+ ? stat_stage_set_mb_mi
+ : enc_set_mb_mi;
+
+ mi_params->mi_alloc_bsize = BLOCK_4X4;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(
+ cm, cm->default_frame_context,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context)));
+ memset(cm->fc, 0, sizeof(*cm->fc));
+ memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
+
+ cpi->common.buffer_pool = pool;
+
+ init_config(cpi, oxcf);
+ if (cpi->compressor_stage == LAP_STAGE) {
+ cpi->oxcf.lag_in_frames = lap_lag_in_frames;
+ }
+
+ av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+
+ cpi->rc.enable_scenecut_detection = 1;
+ if (cpi->lap_enabled &&
+ (num_lap_buffers < (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)))
+ cpi->rc.enable_scenecut_detection = 0;
+ init_frame_info(&cpi->frame_info, cm);
+
+ cm->current_frame.frame_number = 0;
+ cm->current_frame_id = -1;
+ cpi->seq_params_locked = 0;
+ cpi->partition_search_skippable_frame = 0;
+ cpi->tile_data = NULL;
+ cpi->last_show_frame_buf = NULL;
+ realloc_segmentation_maps(cpi);
+
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+ cpi->b_calculate_blockiness = 1;
+ cpi->b_calculate_consistency = 1;
+ cpi->total_inconsistency = 0;
+ cpi->psnr.worst = 100.0;
+ cpi->worst_ssim = 100.0;
+
+ cpi->count = 0;
+ cpi->bytes = 0;
+#if CONFIG_SPEED_STATS
+ cpi->tx_search_count = 0;
+#endif  // CONFIG_SPEED_STATS
+
+ if (cpi->b_calculate_psnr) {
+ cpi->total_sq_error = 0;
+ cpi->total_samples = 0;
+ cpi->tot_recode_hits = 0;
+ cpi->summed_quality = 0;
+ cpi->summed_weights = 0;
+ }
+
+ cpi->fastssim.worst = 100.0;
+ cpi->psnrhvs.worst = 100.0;
+
+ if (cpi->b_calculate_blockiness) {
+ cpi->total_blockiness = 0;
+ cpi->worst_blockiness = 0.0;
+ }
+
+ if (cpi->b_calculate_consistency) {
+ CHECK_MEM_ERROR(
+ cm, cpi->ssim_vars,
+ aom_malloc(sizeof(*cpi->ssim_vars) * 4 * cpi->common.mi_params.mi_rows *
+ cpi->common.mi_params.mi_cols));
+ cpi->worst_consistency = 100.0;
+ }
+#endif
+#if CONFIG_ENTROPY_STATS
+ av1_zero(aggregate_fc);
+#endif  // CONFIG_ENTROPY_STATS
+
+ cpi->time_stamps.first_ever = INT64_MAX;
+
+#ifdef OUTPUT_YUV_SKINMAP
+ yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
+#ifdef OUTPUT_YUV_REC
+ yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+
+ assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS);
+ int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS);
+ for (int i = 0; i < size; i++)
+ cpi->twopass.frame_stats_arr[i] = &frame_stats_buf[i];
+
+ cpi->twopass.stats_buf_ctx = stats_buf_context;
+ cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start;
+
+#if !CONFIG_REALTIME_ONLY
+ if (is_stat_consumption_stage(cpi)) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+
+ if (!cpi->lap_enabled) {
+ /*Re-initialize to stats buffer, populated by application in the case of
+ * two pass*/
+ cpi->twopass.stats_buf_ctx->stats_in_start = oxcf->two_pass_stats_in.buf;
+ cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start;
+ cpi->twopass.stats_buf_ctx->stats_in_end =
+ &cpi->twopass.stats_buf_ctx->stats_in_start[packets - 1];
+
+ av1_init_second_pass(cpi);
+ } else {
+ av1_init_single_pass_lap(cpi);
+ }
+ }
+#endif
+
+ int sb_mi_size = av1_get_sb_mi_size(cm);
+
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.above_pred_buf,
+ (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*cpi->td.mb.above_pred_buf)));
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.left_pred_buf,
+ (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*cpi->td.mb.left_pred_buf)));
+
+ CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
+
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.inter_modes_info,
+ (InterModesInfo *)aom_malloc(sizeof(*cpi->td.mb.inter_modes_info)));
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0])));
+
+ cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0;
+
+ CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
+
+ CHECK_MEM_ERROR(cm, cpi->td.mb.mbmi_ext,
+ aom_calloc(sb_mi_size, sizeof(*cpi->td.mb.mbmi_ext)));
+
+ av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
+ av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
+
+ {
+ const int bsize = BLOCK_16X16;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (mi_params->mi_cols + w - 1) / w;
+ const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->tpl_rdmult_scaling_factors)));
+ CHECK_MEM_ERROR(cm, cpi->tpl_sb_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->tpl_sb_rdmult_scaling_factors)));
+ }
+
+ {
+ const int bsize = BLOCK_16X16;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (mi_params->mi_cols + w - 1) / w;
+ const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->ssim_rdmult_scaling_factors)));
+ }
+
+#if CONFIG_TUNE_VMAF
+ {
+ const int bsize = BLOCK_64X64;
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ const int num_cols = (mi_params->mi_cols + w - 1) / w;
+ const int num_rows = (mi_params->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(cm, cpi->vmaf_rdmult_scaling_factors,
+ aom_calloc(num_rows * num_cols,
+ sizeof(*cpi->vmaf_rdmult_scaling_factors)));
+ cpi->last_frame_unsharp_amount = 0.0;
+ }
+#endif
+
+ if (!is_stat_generation_stage(cpi)) {
+ setup_tpl_buffers(cm, &cpi->tpl_data);
+ }
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+ av1_zero(cpi->partition_stats);
+#endif
+
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].jsdaf = JSDAF; \
+ cpi->fn_ptr[BT].jsvaf = JSVAF;
+
+ BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
+ aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
+ aom_sad4x16x4d, aom_dist_wtd_sad4x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x16)
+
+ BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
+ aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
+ aom_sad16x4x4d, aom_dist_wtd_sad16x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x4)
+
+ BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
+ aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
+ aom_sad8x32x4d, aom_dist_wtd_sad8x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x32)
+
+ BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
+ aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
+ aom_sad32x8x4d, aom_dist_wtd_sad32x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x8)
+
+ BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
+ aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
+ aom_sad16x64x4d, aom_dist_wtd_sad16x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x64)
+
+ BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
+ aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
+ aom_sad64x16x4d, aom_dist_wtd_sad64x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x16)
+
+ BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
+ aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
+ aom_sad128x128x4d, aom_dist_wtd_sad128x128_avg,
+ aom_dist_wtd_sub_pixel_avg_variance128x128)
+
+ BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
+ aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
+ aom_sad128x64x4d, aom_dist_wtd_sad128x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance128x64)
+
+ BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
+ aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
+ aom_sad64x128x4d, aom_dist_wtd_sad64x128_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x128)
+
+ BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
+ aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
+ aom_sad32x16x4d, aom_dist_wtd_sad32x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x16)
+
+ BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
+ aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
+ aom_sad16x32x4d, aom_dist_wtd_sad16x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x32)
+
+ BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
+ aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
+ aom_sad64x32x4d, aom_dist_wtd_sad64x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x32)
+
+ BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
+ aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
+ aom_sad32x64x4d, aom_dist_wtd_sad32x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x64)
+
+ BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
+ aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
+ aom_sad32x32x4d, aom_dist_wtd_sad32x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x32)
+
+ BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
+ aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
+ aom_sad64x64x4d, aom_dist_wtd_sad64x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x64)
+
+ BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
+ aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
+ aom_sad16x16x4d, aom_dist_wtd_sad16x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x16)
+
+ BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
+ aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
+ aom_sad16x8x4d, aom_dist_wtd_sad16x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x8)
+
+ BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
+ aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
+ aom_sad8x16x4d, aom_dist_wtd_sad8x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x16)
+
+ BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
+ aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
+ aom_dist_wtd_sad8x8_avg, aom_dist_wtd_sub_pixel_avg_variance8x8)
+
+ BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
+ aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
+ aom_dist_wtd_sad8x4_avg, aom_dist_wtd_sub_pixel_avg_variance8x4)
+
+ BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
+ aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
+ aom_dist_wtd_sad4x8_avg, aom_dist_wtd_sub_pixel_avg_variance4x8)
+
+ BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
+ aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
+ aom_dist_wtd_sad4x4_avg, aom_dist_wtd_sub_pixel_avg_variance4x4)
+
+#define OBFP(BT, OSDF, OVF, OSVF) \
+ cpi->fn_ptr[BT].osdf = OSDF; \
+ cpi->fn_ptr[BT].ovf = OVF; \
+ cpi->fn_ptr[BT].osvf = OSVF;
+
+ OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
+ aom_obmc_sub_pixel_variance128x128)
+ OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64,
+ aom_obmc_sub_pixel_variance128x64)
+ OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128,
+ aom_obmc_sub_pixel_variance64x128)
+ OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64,
+ aom_obmc_sub_pixel_variance64x64)
+ OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32,
+ aom_obmc_sub_pixel_variance64x32)
+ OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64,
+ aom_obmc_sub_pixel_variance32x64)
+ OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32,
+ aom_obmc_sub_pixel_variance32x32)
+ OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16,
+ aom_obmc_sub_pixel_variance32x16)
+ OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32,
+ aom_obmc_sub_pixel_variance16x32)
+ OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16,
+ aom_obmc_sub_pixel_variance16x16)
+ OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8,
+ aom_obmc_sub_pixel_variance16x8)
+ OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16,
+ aom_obmc_sub_pixel_variance8x16)
+ OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8,
+ aom_obmc_sub_pixel_variance8x8)
+ OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8,
+ aom_obmc_sub_pixel_variance4x8)
+ OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4,
+ aom_obmc_sub_pixel_variance8x4)
+ OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4,
+ aom_obmc_sub_pixel_variance4x4)
+ OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16,
+ aom_obmc_sub_pixel_variance4x16)
+ OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4,
+ aom_obmc_sub_pixel_variance16x4)
+ OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32,
+ aom_obmc_sub_pixel_variance8x32)
+ OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8,
+ aom_obmc_sub_pixel_variance32x8)
+ OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64,
+ aom_obmc_sub_pixel_variance16x64)
+ OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16,
+ aom_obmc_sub_pixel_variance64x16)
+
+#define MBFP(BT, MCSDF, MCSVF) \
+ cpi->fn_ptr[BT].msdf = MCSDF; \
+ cpi->fn_ptr[BT].msvf = MCSVF;
+
+ MBFP(BLOCK_128X128, aom_masked_sad128x128,
+ aom_masked_sub_pixel_variance128x128)
+ MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64)
+ MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128)
+ MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64)
+ MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32)
+ MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64)
+ MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_sub_pixel_variance32x32)
+ MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_sub_pixel_variance32x16)
+ MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_sub_pixel_variance16x32)
+ MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_sub_pixel_variance16x16)
+ MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_sub_pixel_variance16x8)
+ MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_sub_pixel_variance8x16)
+ MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_sub_pixel_variance8x8)
+ MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8)
+ MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4)
+ MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4)
+
+ MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16)
+
+ MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4)
+
+ MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32)
+
+ MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8)
+
+ MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64)
+
+ MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+ highbd_set_var_fns(cpi);
+#endif
+
+ /* av1_init_quantizer() is first called here. Add check in
+ * av1_frame_init_quantizer() so that av1_init_quantizer is only
+ * called later when needed. This will avoid unnecessary calls of
+ * av1_init_quantizer() for every frame.
+ */
+ av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
+ cm->seq_params.bit_depth);
+ av1_qm_init(&cm->quant_params, av1_num_planes(cm));
+
+ av1_loop_filter_init(cm);
+ cm->superres_scale_denominator = SCALE_NUMERATOR;
+ cm->superres_upscaled_width = oxcf->width;
+ cm->superres_upscaled_height = oxcf->height;
+ av1_loop_restoration_precal();
+
+ cm->error.setjmp = 0;
+
+ return cpi;
+}
+
+#if CONFIG_INTERNAL_STATS
+#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+#define SNPRINT2(H, T, V) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+#endif  // CONFIG_INTERNAL_STATS
+
+void av1_remove_compressor(AV1_COMP *cpi) {
+ AV1_COMMON *cm;
+ TplParams *const tpl_data = &cpi->tpl_data;
+ int t;
+
+ if (!cpi) return;
+
+ cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ if (cm->current_frame.frame_number > 0) {
+#if CONFIG_ENTROPY_STATS
+ if (!is_stat_generation_stage(cpi)) {
+ fprintf(stderr, "Writing counts.stt\n");
+ FILE *f = fopen("counts.stt", "wb");
+ fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f);
+ fclose(f);
+ }
+#endif  // CONFIG_ENTROPY_STATS
+#if CONFIG_INTERNAL_STATS
+ aom_clear_system_state();
+
+ if (!is_stat_generation_stage(cpi)) {
+ char headings[512] = { 0 };
+ char results[512] = { 0 };
+ FILE *f = fopen("opsnr.stt", "a");
+ double time_encoded =
+ (cpi->time_stamps.prev_end_seen - cpi->time_stamps.first_ever) /
+ 10000000.000;
+ double total_encode_time =
+ (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
+ const double dr =
+ (double)cpi->bytes * (double)8 / (double)1000 / time_encoded;
+ const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+ const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+ const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
+
+ if (cpi->b_calculate_psnr) {
+ const double total_psnr = aom_sse_to_psnr(
+ (double)cpi->total_samples, peak, (double)cpi->total_sq_error);
+ const double total_ssim =
+ 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+ snprintf(headings, sizeof(headings),
+ "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+ "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+ "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+ "AVPsrnY\tAPsnrCb\tAPsnrCr");
+ snprintf(results, sizeof(results),
+ "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f",
+ dr, cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr,
+ cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr, total_ssim,
+ total_ssim, cpi->fastssim.stat[STAT_ALL] / cpi->count,
+ cpi->psnrhvs.stat[STAT_ALL] / cpi->count, cpi->psnr.worst,
+ cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst,
+ cpi->psnr.stat[STAT_Y] / cpi->count,
+ cpi->psnr.stat[STAT_U] / cpi->count,
+ cpi->psnr.stat[STAT_V] / cpi->count);
+
+ if (cpi->b_calculate_blockiness) {
+ SNPRINT(headings, "\t Block\tWstBlck");
+ SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
+ }
+
+ if (cpi->b_calculate_consistency) {
+ double consistency =
+ aom_sse_to_psnr((double)cpi->total_samples, peak,
+ (double)cpi->total_inconsistency);
+
+ SNPRINT(headings, "\tConsist\tWstCons");
+ SNPRINT2(results, "\t%7.3f", consistency);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
+ }
+
+ SNPRINT(headings, "\t Time\tRcErr\tAbsErr");
+ SNPRINT2(results, "\t%8.0f", total_encode_time);
+ SNPRINT2(results, "\t%7.2f", rate_err);
+ SNPRINT2(results, "\t%7.2f", fabs(rate_err));
+
+ fprintf(f, "%s\tAPsnr611\n", headings);
+ fprintf(f, "%s\t%7.3f\n", results,
+ (6 * cpi->psnr.stat[STAT_Y] + cpi->psnr.stat[STAT_U] +
+ cpi->psnr.stat[STAT_V]) /
+ (cpi->count * 8));
+ }
+
+ fclose(f);
+ }
+#endif  // CONFIG_INTERNAL_STATS
+#if CONFIG_SPEED_STATS
+ if (!is_stat_generation_stage(cpi)) {
+ fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count);
+ }
+#endif  // CONFIG_SPEED_STATS
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+ if (!is_stat_generation_stage(cpi)) {
+ av1_print_partition_stats(&cpi->partition_stats);
+ }
+#endif
+ }
+
+ for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
+ aom_free(tpl_data->tpl_stats_pool[frame]);
+ aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
+ }
+
+ for (t = cpi->num_workers - 1; t >= 0; --t) {
+ AVxWorker *const worker = &cpi->workers[t];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+ // Deallocate allocated threads.
+ aom_get_worker_interface()->end(worker);
+
+ // Deallocate allocated thread data.
+ aom_free(thread_data->td->tctx);
+ if (t > 0) {
+ aom_free(thread_data->td->palette_buffer);
+ aom_free(thread_data->td->tmp_conv_dst);
+ av1_release_compound_type_rd_buffers(&thread_data->td->comp_rd_buffer);
+ for (int j = 0; j < 2; ++j) {
+ aom_free(thread_data->td->tmp_obmc_bufs[j]);
+ }
+ aom_free(thread_data->td->above_pred_buf);
+ aom_free(thread_data->td->left_pred_buf);
+ aom_free(thread_data->td->wsrc_buf);
+ aom_free(thread_data->td->vt64x64);
+
+ aom_free(thread_data->td->inter_modes_info);
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ aom_free(thread_data->td->hash_value_buffer[x][y]);
+ thread_data->td->hash_value_buffer[x][y] = NULL;
+ }
+ }
+ aom_free(thread_data->td->mask_buf);
+ aom_free(thread_data->td->counts);
+ av1_free_pc_tree(cpi, thread_data->td, num_planes,
+ cm->seq_params.sb_size);
+ aom_free(thread_data->td->mbmi_ext);
+ aom_free(thread_data->td);
+ }
+ }
+#if CONFIG_MULTITHREAD
+ if (cpi->row_mt_mutex_ != NULL) {
+ pthread_mutex_destroy(cpi->row_mt_mutex_);
+ aom_free(cpi->row_mt_mutex_);
+ }
+#endif
+ av1_row_mt_mem_dealloc(cpi);
+ aom_free(cpi->tile_thr_data);
+ aom_free(cpi->workers);
+
+ if (cpi->num_workers > 1) {
+ av1_loop_filter_dealloc(&cpi->lf_row_sync);
+ av1_loop_restoration_dealloc(&cpi->lr_row_sync, cpi->num_workers);
+ }
+
+ dealloc_compressor_data(cpi);
+
+#if CONFIG_INTERNAL_STATS
+ aom_free(cpi->ssim_vars);
+ cpi->ssim_vars = NULL;
+#endif  // CONFIG_INTERNAL_STATS
+
+ av1_remove_common(cm);
+#if CONFIG_HTB_TRELLIS
+ if (cpi->sf.use_hash_based_trellis) hbt_destroy();
+#endif  // CONFIG_HTB_TRELLIS
+ av1_free_ref_frame_buffers(cm->buffer_pool);
+
+ aom_free(cpi);
+
+#ifdef OUTPUT_YUV_SKINMAP
+ fclose(yuv_skinmap_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+ fclose(yuv_rec_file);
+#endif
+}
+
+static void generate_psnr_packet(AV1_COMP *cpi) {
+ struct aom_codec_cx_pkt pkt;
+ int i;
+ PSNR_STATS psnr;
+#if CONFIG_AV1_HIGHBITDEPTH
+ const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth;
+ const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
+ aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr,
+ bit_depth, in_bit_depth);
+#else
+ aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+#endif
+
+ for (i = 0; i < 4; ++i) {
+ pkt.data.psnr.samples[i] = psnr.samples[i];
+ pkt.data.psnr.sse[i] = psnr.sse[i];
+ pkt.data.psnr.psnr[i] = psnr.psnr[i];
+ }
+ pkt.kind = AOM_CODEC_PSNR_PKT;
+ aom_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags) {
+ if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1;
+
+ *ext_ref_frame_flags = ref_frame_flags;
+ return 0;
+}
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
+ if (cfg) {
+ aom_yv12_copy_frame(cfg, sd, num_planes);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame(cm, idx);
+ if (cfg) {
+ aom_yv12_copy_frame(sd, cfg, num_planes);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int av1_update_entropy(bool *ext_refresh_frame_context,
+ bool *ext_refresh_frame_context_pending, bool update) {
+ *ext_refresh_frame_context = update;
+ *ext_refresh_frame_context_pending = 1;
+ return 0;
+}
+
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
+// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
+// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
+// not denoise the UV channels at this time. If ever we implement UV channel
+// denoising we will have to modify this.
+void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
+ uint8_t *src = s->y_buffer;
+ int h = s->y_height;
+
+ do {
+ fwrite(src, s->y_width, 1, f);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
+ } while (--h);
+}
+#endif
+
+#ifdef OUTPUT_YUV_REC
+void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+ uint8_t *src = s->y_buffer;
+ int h = cm->height;
+ if (yuv_rec_file == NULL) return;
+ if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+
+ do {
+ fwrite(src16, s->y_width, 2, yuv_rec_file);
+ src16 += s->y_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+ return;
+ }
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_rec_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+}
+#endif  // OUTPUT_YUV_REC
+
+#define GM_RECODE_LOOP_NUM4X4_FACTOR 192
+static int recode_loop_test_global_motion(
+ WarpedMotionParams *const global_motion,
+ const int *const global_motion_used, int *const gm_params_cost) {
+ int i;
+ int recode = 0;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ if (global_motion[i].wmtype != IDENTITY &&
+ global_motion_used[i] * GM_RECODE_LOOP_NUM4X4_FACTOR <
+ gm_params_cost[i]) {
+ global_motion[i] = default_warp_params;
+ assert(global_motion[i].wmtype == IDENTITY);
+ gm_params_cost[i] = 0;
+ recode = 1;
+ // TODO(sarahparker): The earlier condition for recoding here was:
+ //
"recode |= (rdc->global_motion_used[i] > 0);". Can we bring something + // similar to that back to speed up global motion? + } + } + return recode; +} + +// Function to test for conditions that indicate we should loop +// back and recode a frame. +static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q, + int maxq, int minq) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi); + int force_recode = 0; + + if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE) || + (frame_is_kfgfarf && + (cpi->sf.hl_sf.recode_loop == ALLOW_RECODE_KFARFGF))) { + // TODO(agrange) high_limit could be greater than the scale-down threshold. + if ((rc->projected_frame_size > high_limit && q < maxq) || + (rc->projected_frame_size < low_limit && q > minq)) { + force_recode = 1; + } else if (cpi->oxcf.rc_mode == AOM_CQ) { + // Deal with frame undershoot and whether or not we are + // below the automatically set cq level. + if (q > oxcf->cq_level && + rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) { + force_recode = 1; + } + } + } + return force_recode; +} + +static void scale_references(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MV_REFERENCE_FRAME ref_frame; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1). + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + BufferPool *const pool = cm->buffer_pool; + const YV12_BUFFER_CONFIG *const ref = + get_ref_frame_yv12_buf(cm, ref_frame); + + if (ref == NULL) { + cpi->scaled_ref_buf[ref_frame - 1] = NULL; + continue; + } + + if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { + // Replace the reference buffer with a copy having a thicker border, + // if the reference buffer is higher resolution than the current + // frame, and the border is thin. + if ((ref->y_crop_width > cm->width || + ref->y_crop_height > cm->height) && + ref->border < AOM_BORDER_IN_PIXELS) { + RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame); + if (aom_yv12_realloc_with_new_border( + &ref_fb->buf, AOM_BORDER_IN_PIXELS, + cm->features.byte_alignment, num_planes) != 0) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + } + int force_scaling = 0; + RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1]; + if (new_fb == NULL) { + const int new_fb_idx = get_free_fb(cm); + if (new_fb_idx == INVALID_IDX) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Unable to find free frame buffer"); + } + force_scaling = 1; + new_fb = &pool->frame_bufs[new_fb_idx]; + } + + if (force_scaling || new_fb->buf.y_crop_width != cm->width || + new_fb->buf.y_crop_height != cm->height) { + if (aom_realloc_frame_buffer( + &new_fb->buf, cm->width, cm->height, + cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, + cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS, + cm->features.byte_alignment, NULL, NULL, NULL)) { + if (force_scaling) { + // Release the reference acquired in the get_free_fb() call above. 
+ --new_fb->ref_count; + } + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + av1_resize_and_extend_frame( + ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes); + cpi->scaled_ref_buf[ref_frame - 1] = new_fb; + alloc_frame_mvs(cm, new_fb); + } + } else { + RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame); + buf->buf.y_crop_width = ref->y_crop_width; + buf->buf.y_crop_height = ref->y_crop_height; + cpi->scaled_ref_buf[ref_frame - 1] = buf; + ++buf->ref_count; + } + } else { + if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL; + } + } +} + +static void release_scaled_references(AV1_COMP *cpi) { + // TODO(isbs): only refresh the necessary frames, rather than all of them + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + RefCntBuffer *const buf = cpi->scaled_ref_buf[i]; + if (buf != NULL) { + --buf->ref_count; + cpi->scaled_ref_buf[i] = NULL; + } + } +} + +static void set_mv_search_params(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params; + const int max_mv_def = AOMMAX(cm->width, cm->height); + + // Default based on max resolution. + mv_search_params->mv_step_param = av1_init_search_range(max_mv_def); + + if (cpi->sf.mv_sf.auto_mv_step_size) { + if (frame_is_intra_only(cm)) { + // Initialize max_mv_magnitude for use in the first INTER frame + // after a key/intra-only frame. + mv_search_params->max_mv_magnitude = max_mv_def; + } else { + // Use cpi->max_mv_magnitude == -1 to exclude first pass case. + if (cm->show_frame && mv_search_params->max_mv_magnitude != -1) { + // Allow mv_steps to correspond to twice the max mv magnitude found + // in the previous frame, capped by the default max_mv_magnitude based + // on resolution. + mv_search_params->mv_step_param = av1_init_search_range( + AOMMIN(max_mv_def, 2 * mv_search_params->max_mv_magnitude)); + } + mv_search_params->max_mv_magnitude = -1; + } + } +} + +void av1_set_screen_content_options(const AV1_COMP *cpi, + FeatureFlags *features) { + const AV1_COMMON *const cm = &cpi->common; + + if (cm->seq_params.force_screen_content_tools != 2) { + features->allow_screen_content_tools = features->allow_intrabc = + cm->seq_params.force_screen_content_tools; + return; + } + + if (cpi->oxcf.mode == REALTIME) { + assert(cm->seq_params.reduced_still_picture_hdr); + features->allow_screen_content_tools = features->allow_intrabc = 0; + return; + } + + if (cpi->oxcf.content == AOM_CONTENT_SCREEN) { + features->allow_screen_content_tools = features->allow_intrabc = 1; + return; + } + + // Estimate if the source frame is screen content, based on the portion of + // blocks that have few luma colors. + const uint8_t *src = cpi->unfiltered_source->y_buffer; + assert(src != NULL); + const int use_hbd = cpi->unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH; + const int stride = cpi->unfiltered_source->y_stride; + const int width = cpi->unfiltered_source->y_width; + const int height = cpi->unfiltered_source->y_height; + const int bd = cm->seq_params.bit_depth; + const int blk_w = 16; + const int blk_h = 16; + // These threshold values are selected experimentally. + const int color_thresh = 4; + const unsigned int var_thresh = 0; + // Counts of blocks with no more than color_thresh colors. + int counts_1 = 0; + // Counts of blocks with no more than color_thresh colors and variance larger + // than var_thresh. 
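+  // (With var_thresh == 0 this counts low-color blocks that are not
+  // completely flat, e.g. anti-aliased text on a solid background.)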
+  int counts_2 = 0;
+
+  for (int r = 0; r + blk_h <= height; r += blk_h) {
+    for (int c = 0; c + blk_w <= width; c += blk_w) {
+      int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+      const uint8_t *const this_src = src + r * stride + c;
+      const int n_colors =
+          use_hbd ? av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd,
+                                            count_buf)
+                  : av1_count_colors(this_src, stride, blk_w, blk_h, count_buf);
+      if (n_colors > 1 && n_colors <= color_thresh) {
+        ++counts_1;
+        struct buf_2d buf;
+        buf.stride = stride;
+        buf.buf = (uint8_t *)this_src;
+        const unsigned int var =
+            use_hbd
+                ? av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16, bd)
+                : av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16);
+        if (var > var_thresh) ++counts_2;
+      }
+    }
+  }
+
+  // The threshold values are selected experimentally.
+  features->allow_screen_content_tools =
+      counts_1 * blk_h * blk_w * 10 > width * height;
+  // IntraBC would force loop filters off, so we use more strict rules that
+  // also require that the block has high variance.
+  features->allow_intrabc = features->allow_screen_content_tools &&
+                            counts_2 * blk_h * blk_w * 12 > width * height;
+}
+
+static void set_size_independent_vars(AV1_COMP *cpi) {
+  int i;
+  AV1_COMMON *const cm = &cpi->common;
+  for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+    cm->global_motion[i] = default_warp_params;
+  }
+  cpi->gm_info.search_done = 0;
+
+  av1_set_speed_features_framesize_independent(cpi, cpi->speed);
+  av1_set_rd_speed_thresholds(cpi);
+  cm->features.interp_filter = SWITCHABLE;
+  cm->features.switchable_motion_mode = 1;
+}
+
+#if !CONFIG_REALTIME_ONLY
+double av1_get_gfu_boost_projection_factor(double min_factor, double max_factor,
+                                           int frame_count) {
+  double factor = sqrt((double)frame_count);
+  factor = AOMMIN(factor, max_factor);
+  factor = AOMMAX(factor, min_factor);
+  factor = (200.0 + 10.0 * factor);
+  return factor;
+}
+
+static int get_gfu_boost_from_r0_lap(double min_factor, double max_factor,
+                                     double r0, int frames_to_key) {
+  double factor = av1_get_gfu_boost_projection_factor(min_factor, max_factor,
+                                                      frames_to_key);
+  const int boost = (int)rint(factor / r0);
+  return boost;
+}
+
+double av1_get_kf_boost_projection_factor(int frame_count) {
+  double factor = sqrt((double)frame_count);
+  factor = AOMMIN(factor, 10.0);
+  factor = AOMMAX(factor, 4.0);
+  factor = (75.0 + 14.0 * factor);
+  return factor;
+}
+
+static int get_kf_boost_from_r0(double r0, int frames_to_key) {
+  double factor = av1_get_kf_boost_projection_factor(frames_to_key);
+  const int boost = (int)rint(factor / r0);
+  return boost;
+}
+#endif
+
+#define MIN_BOOST_COMBINE_FACTOR 4.0
+#define MAX_BOOST_COMBINE_FACTOR 12.0
+int combine_prior_with_tpl_boost(double min_factor, double max_factor,
+                                 int prior_boost, int tpl_boost,
+                                 int frames_to_key) {
+  double factor = sqrt((double)frames_to_key);
+  double range = max_factor - min_factor;
+  factor = AOMMIN(factor, max_factor);
+  factor = AOMMAX(factor, min_factor);
+  factor -= min_factor;
+  int boost =
+      (int)((factor * prior_boost + (range - factor) * tpl_boost) / range);
+  return boost;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void process_tpl_stats_frame(AV1_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  AV1_COMMON *const cm = &cpi->common;
+
+  assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size));
+
+  const int tpl_idx = gf_group->index;
+  TplParams *const tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
+  TplDepStats *tpl_stats =
tpl_frame->tpl_stats_ptr; + + if (tpl_frame->is_valid) { + int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int64_t mc_saved_base = 0; + int64_t mc_count_base = 0; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + for (int row = 0; row < cm->mi_params.mi_rows; row += step) { + for (int col = 0; col < mi_cols_sr; col += step) { + TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost_base += (this_stats->recrf_dist << RDDIV_BITS); + mc_dep_cost_base += + (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; + mc_count_base += this_stats->mc_count; + mc_saved_base += this_stats->mc_saved; + } + } + + if (mc_dep_cost_base == 0) { + tpl_frame->is_valid = 0; + } else { + aom_clear_system_state(); + cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; + if (is_frame_arf_and_tpl_eligible(gf_group)) { + cpi->rd.arf_r0 = cpi->rd.r0; + if (cpi->lap_enabled) { + double min_boost_factor = sqrt(cpi->rc.baseline_gf_interval); + const int gfu_boost = get_gfu_boost_from_r0_lap( + min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.arf_r0, + cpi->rc.num_stats_required_for_gfu_boost); + // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost, + // gfu_boost); + cpi->rc.gfu_boost = combine_prior_with_tpl_boost( + min_boost_factor, MAX_BOOST_COMBINE_FACTOR, cpi->rc.gfu_boost, + gfu_boost, cpi->rc.num_stats_used_for_gfu_boost); + } else { + const int gfu_boost = (int)(200.0 / cpi->rd.r0); + cpi->rc.gfu_boost = combine_prior_with_tpl_boost( + MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR, + cpi->rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key); + } + } else if (frame_is_intra_only(cm)) { + // TODO(debargha): Turn off q adjustment for kf temporarily to + // reduce impact on speed of encoding. Need to investigate how + // to mitigate the issue. + if (cpi->oxcf.rc_mode == AOM_Q) { + const int kf_boost = + get_kf_boost_from_r0(cpi->rd.r0, cpi->rc.frames_to_key); + if (cpi->lap_enabled) { + cpi->rc.kf_boost = combine_prior_with_tpl_boost( + MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR, + cpi->rc.kf_boost, kf_boost, + cpi->rc.num_stats_used_for_kf_boost); + } else { + cpi->rc.kf_boost = combine_prior_with_tpl_boost( + MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR, + cpi->rc.kf_boost, kf_boost, cpi->rc.frames_to_key); + } + } + } + cpi->rd.mc_count_base = (double)mc_count_base / + (cm->mi_params.mi_rows * cm->mi_params.mi_cols); + cpi->rd.mc_saved_base = (double)mc_saved_base / + (cm->mi_params.mi_rows * cm->mi_params.mi_cols); + aom_clear_system_state(); + } + } +} +#endif // !CONFIG_REALTIME_ONLY + +static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, + int *top_index) { + AV1_COMMON *const cm = &cpi->common; + + // Setup variables that depend on the dimensions of the frame. + av1_set_speed_features_framesize_dependent(cpi, cpi->speed); + +#if !CONFIG_REALTIME_ONLY + if (cpi->oxcf.enable_tpl_model && is_frame_tpl_eligible(cpi)) { + process_tpl_stats_frame(cpi); + av1_tpl_rdmult_setup(cpi); + } +#endif + + // Decide q and q bounds. + *q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height, + cpi->gf_group.index, bottom_index, top_index); + + // Configure experimental use of segmentation for enhanced coding of + // static regions if indicated. 
+ // Only allowed in the second pass of a two pass encode, as it requires + // lagged coding, and if the relevant speed feature flag is set. + if (is_stat_consumption_stage_twopass(cpi) && + cpi->sf.hl_sf.static_segmentation) + configure_static_seg_features(cpi); +} + +static void init_motion_estimation(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params; + const int y_stride = cpi->scaled_source.y_stride; + const int y_stride_src = + ((cpi->oxcf.width != cm->width || cpi->oxcf.height != cm->height) || + av1_superres_scaled(cm)) + ? y_stride + : cpi->lookahead->buf->img.y_stride; + int fpf_y_stride = cm->cur_frame != NULL ? cm->cur_frame->buf.y_stride + : cpi->scaled_source.y_stride; + + // Update if ss_cfg is uninitialized or the current frame has a new stride + const int should_update = + !mv_search_params->ss_cfg[SS_CFG_SRC].stride || + !mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD].stride || + (y_stride != mv_search_params->ss_cfg[SS_CFG_SRC].stride); + + if (!should_update) { + return; + } + + if (cpi->sf.mv_sf.search_method == DIAMOND) { + av1_init_dsmotion_compensation(&mv_search_params->ss_cfg[SS_CFG_SRC], + y_stride); + av1_init_dsmotion_compensation(&mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD], + y_stride_src); + } else { + av1_init3smotion_compensation(&mv_search_params->ss_cfg[SS_CFG_SRC], + y_stride); + av1_init3smotion_compensation(&mv_search_params->ss_cfg[SS_CFG_LOOKAHEAD], + y_stride_src); + } + av1_init_motion_fpf(&mv_search_params->ss_cfg[SS_CFG_FPF], fpf_y_stride); +} + +#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0 +static void set_restoration_unit_size(int width, int height, int sx, int sy, + RestorationInfo *rst) { + (void)width; + (void)height; + (void)sx; + (void)sy; +#if COUPLED_CHROMA_FROM_LUMA_RESTORATION + int s = AOMMIN(sx, sy); +#else + int s = 0; +#endif // !COUPLED_CHROMA_FROM_LUMA_RESTORATION + + if (width * height > 352 * 288) + rst[0].restoration_unit_size = RESTORATION_UNITSIZE_MAX; + else + rst[0].restoration_unit_size = (RESTORATION_UNITSIZE_MAX >> 1); + rst[1].restoration_unit_size = rst[0].restoration_unit_size >> s; + rst[2].restoration_unit_size = rst[1].restoration_unit_size; +} + +static void init_ref_frame_bufs(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + int i; + BufferPool *const pool = cm->buffer_pool; + cm->cur_frame = NULL; + for (i = 0; i < REF_FRAMES; ++i) { + cm->ref_frame_map[i] = NULL; + } + for (i = 0; i < FRAME_BUFFERS; ++i) { + pool->frame_bufs[i].ref_count = 0; + } +} + +void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, + int subsampling_x, int subsampling_y) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = &cm->seq_params; + + if (!cpi->initial_width || seq_params->use_highbitdepth != use_highbitdepth || + seq_params->subsampling_x != subsampling_x || + seq_params->subsampling_y != subsampling_y) { + seq_params->subsampling_x = subsampling_x; + seq_params->subsampling_y = subsampling_y; + seq_params->use_highbitdepth = use_highbitdepth; + + av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); + av1_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed); + + if (!is_stat_generation_stage(cpi)) { + alloc_altref_frame_buffer(cpi); + alloc_util_frame_buffers(cpi); + } + init_ref_frame_bufs(cpi); + + init_motion_estimation(cpi); // TODO(agrange) This can be removed. 
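+    // (Note: the fields below cache the dimensions the compressor was first
+    // configured with; av1_set_size_literal() validates later resolution
+    // changes against them and reallocates when a larger size is requested.)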
+ + cpi->initial_width = cm->width; + cpi->initial_height = cm->height; + cpi->initial_mbs = cm->mi_params.MBs; + } +} + +// Returns 1 if the assigned width or height was <= 0. +int av1_set_size_literal(AV1_COMP *cpi, int width, int height) { + AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + av1_check_initial_width(cpi, cm->seq_params.use_highbitdepth, + cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y); + + if (width <= 0 || height <= 0) return 1; + + cm->width = width; + cm->height = height; + + if (cpi->initial_width && cpi->initial_height && + (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) { + av1_free_context_buffers(cm); + av1_free_pc_tree(cpi, &cpi->td, num_planes, cm->seq_params.sb_size); + alloc_compressor_data(cpi); + realloc_segmentation_maps(cpi); + cpi->initial_width = cpi->initial_height = 0; + } + update_frame_size(cpi); + + return 0; +} + +void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + int ref_frame; + + if (width != cm->width || height != cm->height) { + // There has been a change in the encoded frame size + av1_set_size_literal(cpi, width, height); + // Recalculate 'all_lossless' in case super-resolution was (un)selected. + cm->features.all_lossless = + cm->features.coded_lossless && !av1_superres_scaled(cm); + } + set_mv_search_params(cpi); + + if (is_stat_consumption_stage(cpi)) { + av1_set_target_rate(cpi, cm->width, cm->height); + } + + alloc_frame_mvs(cm, cm->cur_frame); + + // Allocate above context buffers + CommonContexts *const above_contexts = &cm->above_contexts; + if (above_contexts->num_planes < av1_num_planes(cm) || + above_contexts->num_mi_cols < cm->mi_params.mi_cols || + above_contexts->num_tile_rows < cm->tiles.rows) { + av1_free_above_context_buffers(above_contexts); + if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows, + cm->mi_params.mi_cols, + av1_num_planes(cm))) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate context buffers"); + } + + // Reset the frame pointers to the current frame size. 
+  if (aom_realloc_frame_buffer(
+          &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+          NULL))
+    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate frame buffer");
+
+  const int frame_width = cm->superres_upscaled_width;
+  const int frame_height = cm->superres_upscaled_height;
+  set_restoration_unit_size(frame_width, frame_height,
+                            seq_params->subsampling_x,
+                            seq_params->subsampling_y, cm->rst_info);
+  for (int i = 0; i < num_planes; ++i)
+    cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+
+  av1_alloc_restoration_buffers(cm);
+  if (!is_stat_generation_stage(cpi)) alloc_util_frame_buffers(cpi);
+  init_motion_estimation(cpi);
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    if (buf != NULL) {
+      struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
+      av1_setup_scale_factors_for_frame(sf, buf->buf.y_crop_width,
+                                        buf->buf.y_crop_height, cm->width,
+                                        cm->height);
+      if (av1_is_scaled(sf)) aom_extend_frame_borders(&buf->buf, num_planes);
+    }
+  }
+
+  av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
+                                    cm->width, cm->height);
+
+  set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
+  // Choose an arbitrary random number
+  static unsigned int seed = 56789;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
+  uint8_t new_denom = SCALE_NUMERATOR;
+
+  if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR;
+  switch (oxcf->resize_mode) {
+    case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break;
+    case RESIZE_FIXED:
+      if (cpi->common.current_frame.frame_type == KEY_FRAME)
+        new_denom = oxcf->resize_kf_scale_denominator;
+      else
+        new_denom = oxcf->resize_scale_denominator;
+      break;
+    case RESIZE_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+    default: assert(0);
+  }
+  return new_denom;
+}
+
+#if CONFIG_SUPERRES_IN_RECODE
+static int superres_in_recode_allowed(const AV1_COMP *const cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  // Empirically found to not be beneficial for AOM_Q mode and image coding.
+  return oxcf->superres_mode == SUPERRES_AUTO &&
+         (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ) &&
+         cpi->rc.frames_to_key > 1;
+}
+#endif  // CONFIG_SUPERRES_IN_RECODE
+
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO 0.012
+#define SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME 0.008
+#define SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME 0.008
+#define SUPERRES_ENERGY_BY_AC_THRESH 0.2
+
+static double get_energy_by_q2_thresh(const GF_GROUP *gf_group,
+                                      const RATE_CONTROL *rc) {
+  // TODO(now): Return keyframe thresh * factor based on frame type / pyramid
+  // level.
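+  // (These thresholds are compared against horizontal-frequency energy
+  // scaled by q^2 in get_superres_denom_from_qindex_energy(); a key frame
+  // that forms a group of its own (frames_to_key <= 1) gets the more
+  // permissive _SOLO value.)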
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + return SUPERRES_ENERGY_BY_Q2_THRESH_ARFFRAME; + } else if (gf_group->update_type[gf_group->index] == KF_UPDATE) { + if (rc->frames_to_key <= 1) + return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME_SOLO; + else + return SUPERRES_ENERGY_BY_Q2_THRESH_KEYFRAME; + } else { + assert(0); + } + return 0; +} + +static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy, + double threshq, + double threshp) { + const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8); + const double tq = threshq * q * q; + const double tp = threshp * energy[1]; + const double thresh = AOMMIN(tq, tp); + int k; + for (k = SCALE_NUMERATOR * 2; k > SCALE_NUMERATOR; --k) { + if (energy[k - 1] > thresh) break; + } + return 3 * SCALE_NUMERATOR - k; +} + +static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex, + int sr_kf, int sr_arf) { + // Use superres for Key-frames and Alt-ref frames only. + const GF_GROUP *gf_group = &cpi->gf_group; + if (gf_group->update_type[gf_group->index] != KF_UPDATE && + gf_group->update_type[gf_group->index] != ARF_UPDATE) { + return SCALE_NUMERATOR; + } + if (gf_group->update_type[gf_group->index] == KF_UPDATE && !sr_kf) { + return SCALE_NUMERATOR; + } + if (gf_group->update_type[gf_group->index] == ARF_UPDATE && !sr_arf) { + return SCALE_NUMERATOR; + } + + double energy[16]; + analyze_hor_freq(cpi, energy); + + const double energy_by_q2_thresh = + get_energy_by_q2_thresh(gf_group, &cpi->rc); + int denom = get_superres_denom_from_qindex_energy( + qindex, energy, energy_by_q2_thresh, SUPERRES_ENERGY_BY_AC_THRESH); + /* + printf("\nenergy = ["); + for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]); + printf("]\n"); + printf("boost = %d\n", + (gf_group->update_type[gf_group->index] == KF_UPDATE) + ? cpi->rc.kf_boost + : cpi->rc.gfu_boost); + printf("denom = %d\n", denom); + */ +#if CONFIG_SUPERRES_IN_RECODE + if (superres_in_recode_allowed(cpi)) { + assert(cpi->superres_mode != SUPERRES_NONE); + // Force superres to be tried in the recode loop, as full-res is also going + // to be tried anyway. + denom = AOMMAX(denom, SCALE_NUMERATOR + 1); + } +#endif // CONFIG_SUPERRES_IN_RECODE + return denom; +} + +// If true, SUPERRES_AUTO mode will exhaustively search over all superres +// denominators for all frames (except overlay and internal overlay frames). +#define SUPERRES_RECODE_ALL_RATIOS 0 + +static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { + // Choose an arbitrary random number + static unsigned int seed = 34567; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR; + uint8_t new_denom = SCALE_NUMERATOR; + + // Make sure that superres mode of the frame is consistent with the + // sequence-level flag. + assert(IMPLIES(oxcf->superres_mode != SUPERRES_NONE, + cpi->common.seq_params.enable_superres)); + assert(IMPLIES(!cpi->common.seq_params.enable_superres, + oxcf->superres_mode == SUPERRES_NONE)); + // Make sure that superres mode for current encoding is consistent with user + // provided superres mode. + assert(IMPLIES(oxcf->superres_mode != SUPERRES_AUTO, + cpi->superres_mode == oxcf->superres_mode)); + + // Note: we must look at the current superres_mode to be tried in 'cpi' here, + // not the user given mode in 'oxcf'. 
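+  // (Denominators run from SCALE_NUMERATOR (8, i.e. unscaled) up to 16 (a 2x
+  // horizontal downscale): coded_width = width * SCALE_NUMERATOR / denom, so
+  // for example denom == 12 codes a 1920-wide frame at 1280.)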
+ switch (cpi->superres_mode) { + case SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break; + case SUPERRES_FIXED: + if (cpi->common.current_frame.frame_type == KEY_FRAME) + new_denom = oxcf->superres_kf_scale_denominator; + else + new_denom = oxcf->superres_scale_denominator; + break; + case SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break; + case SUPERRES_QTHRESH: { + // Do not use superres when screen content tools are used. + if (cpi->common.features.allow_screen_content_tools) break; + if (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ) + av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height); + + // Now decide the use of superres based on 'q'. + int bottom_index, top_index; + const int q = av1_rc_pick_q_and_bounds( + cpi, &cpi->rc, cpi->oxcf.width, cpi->oxcf.height, cpi->gf_group.index, + &bottom_index, &top_index); + + const int qthresh = (frame_is_intra_only(&cpi->common)) + ? oxcf->superres_kf_qthresh + : oxcf->superres_qthresh; + if (q <= qthresh) { + new_denom = SCALE_NUMERATOR; + } else { + new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1); + } + break; + } + case SUPERRES_AUTO: { + // Do not use superres when screen content tools are used. + if (cpi->common.features.allow_screen_content_tools) break; + if (oxcf->rc_mode == AOM_VBR || oxcf->rc_mode == AOM_CQ) + av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height); + + // Now decide the use of superres based on 'q'. + int bottom_index, top_index; + const int q = av1_rc_pick_q_and_bounds( + cpi, &cpi->rc, cpi->oxcf.width, cpi->oxcf.height, cpi->gf_group.index, + &bottom_index, &top_index); + + const int qthresh = 128; + if (q <= qthresh) { + new_denom = SCALE_NUMERATOR; + } else { +#if SUPERRES_RECODE_ALL_RATIOS + if (cpi->common.current_frame.frame_type == KEY_FRAME) + new_denom = oxcf->superres_kf_scale_denominator; + else + new_denom = oxcf->superres_scale_denominator; +#else + new_denom = get_superres_denom_for_qindex(cpi, q, 1, 1); +#endif // SUPERRES_RECODE_ALL_RATIOS + } + break; + } + default: assert(0); + } + return new_denom; +} + +static int dimension_is_ok(int orig_dim, int resized_dim, int denom) { + return (resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2); +} + +static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) { + // Only need to check the width, as scaling is horizontal only. + (void)oheight; + return dimension_is_ok(owidth, rsz->resize_width, rsz->superres_denom); +} + +static int validate_size_scales(RESIZE_MODE resize_mode, + SUPERRES_MODE superres_mode, int owidth, + int oheight, size_params_type *rsz) { + if (dimensions_are_ok(owidth, oheight, rsz)) { // Nothing to do. + return 1; + } + + // Calculate current resize scale. + int resize_denom = + AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width), + DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height)); + + if (resize_mode != RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) { + // Alter superres scale as needed to enforce conformity. + rsz->superres_denom = + (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom; + if (!dimensions_are_ok(owidth, oheight, rsz)) { + if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom; + } + } else if (resize_mode == RESIZE_RANDOM && superres_mode != SUPERRES_RANDOM) { + // Alter resize scale as needed to enforce conformity. 
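+    // (Conformity is the dimension_is_ok() rule above: after resize and
+    // superres combined, the coded width must stay at least half the
+    // original, i.e. resize_width * SCALE_NUMERATOR >= owidth * denom / 2.)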
+ resize_denom = + (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom; + rsz->resize_width = owidth; + rsz->resize_height = oheight; + av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, + resize_denom); + if (!dimensions_are_ok(owidth, oheight, rsz)) { + if (resize_denom > SCALE_NUMERATOR) { + --resize_denom; + rsz->resize_width = owidth; + rsz->resize_height = oheight; + av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, + resize_denom); + } + } + } else if (resize_mode == RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) { + // Alter both resize and superres scales as needed to enforce conformity. + do { + if (resize_denom > rsz->superres_denom) + --resize_denom; + else + --rsz->superres_denom; + rsz->resize_width = owidth; + rsz->resize_height = oheight; + av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height, + resize_denom); + } while (!dimensions_are_ok(owidth, oheight, rsz) && + (resize_denom > SCALE_NUMERATOR || + rsz->superres_denom > SCALE_NUMERATOR)); + } else { // We are allowed to alter neither resize scale nor superres + // scale. + return 0; + } + return dimensions_are_ok(owidth, oheight, rsz); +} + +// Calculates resize and superres params for next frame +static size_params_type calculate_next_size_params(AV1_COMP *cpi) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + ResizePendingParams *resize_pending_params = &cpi->resize_pending_params; + size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR }; + int resize_denom = SCALE_NUMERATOR; + if (has_no_stats_stage(cpi) && cpi->use_svc && + cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) { + rsz.resize_width = cpi->common.width; + rsz.resize_height = cpi->common.height; + return rsz; + } + if (is_stat_generation_stage(cpi)) return rsz; + if (resize_pending_params->width && resize_pending_params->height) { + rsz.resize_width = resize_pending_params->width; + rsz.resize_height = resize_pending_params->height; + resize_pending_params->width = resize_pending_params->height = 0; + } else { + resize_denom = calculate_next_resize_scale(cpi); + rsz.resize_width = oxcf->width; + rsz.resize_height = oxcf->height; + av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height, + resize_denom); + } + rsz.superres_denom = calculate_next_superres_scale(cpi); + if (!validate_size_scales(oxcf->resize_mode, cpi->superres_mode, oxcf->width, + oxcf->height, &rsz)) + assert(0 && "Invalid scale parameters"); + return rsz; +} + +static void setup_frame_size_from_params(AV1_COMP *cpi, + const size_params_type *rsz) { + int encode_width = rsz->resize_width; + int encode_height = rsz->resize_height; + + AV1_COMMON *cm = &cpi->common; + cm->superres_upscaled_width = encode_width; + cm->superres_upscaled_height = encode_height; + cm->superres_scale_denominator = rsz->superres_denom; + av1_calculate_scaled_superres_size(&encode_width, &encode_height, + rsz->superres_denom); + av1_set_frame_size(cpi, encode_width, encode_height); +} + +void av1_setup_frame_size(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + // Reset superres params from previous frame. 
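+  // (Start from the no-superres default so calculate_next_size_params()
+  // works from a clean state rather than the previous frame's denominator.)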
+ cm->superres_scale_denominator = SCALE_NUMERATOR; + const size_params_type rsz = calculate_next_size_params(cpi); + setup_frame_size_from_params(cpi, &rsz); + + assert(av1_is_min_tile_width_satisfied(cm)); +} + +static void superres_post_encode(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + + if (!av1_superres_scaled(cm)) return; + + assert(cpi->oxcf.enable_superres); + assert(!is_lossless_requested(&cpi->oxcf)); + assert(!cm->features.all_lossless); + + av1_superres_upscale(cm, NULL); + + // If regular resizing is occurring the source will need to be downscaled to + // match the upscaled superres resolution. Otherwise the original source is + // used. + if (!av1_resize_scaled(cm)) { + cpi->source = cpi->unscaled_source; + if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source; + } else { + assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width); + assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height); + // Do downscale. cm->(width|height) has been updated by + // av1_superres_upscale + if (aom_realloc_frame_buffer( + &cpi->scaled_source, cm->superres_upscaled_width, + cm->superres_upscaled_height, cm->seq_params.subsampling_x, + cm->seq_params.subsampling_y, cm->seq_params.use_highbitdepth, + AOM_BORDER_IN_PIXELS, cm->features.byte_alignment, NULL, NULL, + NULL)) + aom_internal_error( + &cm->error, AOM_CODEC_MEM_ERROR, + "Failed to reallocate scaled source buffer for superres"); + assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width); + assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height); + av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source, + (int)cm->seq_params.bit_depth, num_planes); + cpi->source = &cpi->scaled_source; + } +} + +static void cdef_restoration_frame(AV1_COMP *cpi, AV1_COMMON *cm, + MACROBLOCKD *xd, int use_restoration, + int use_cdef) { + if (use_restoration) + av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0); + + if (use_cdef) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, cdef_time); +#endif + // Find CDEF parameters + av1_cdef_search(&cm->cur_frame->buf, cpi->source, cm, xd, + cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult); + + // Apply the filter + av1_cdef_frame(&cm->cur_frame->buf, cm, xd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, cdef_time); +#endif + } else { + cm->cdef_info.cdef_bits = 0; + cm->cdef_info.cdef_strengths[0] = 0; + cm->cdef_info.nb_cdef_strengths = 1; + cm->cdef_info.cdef_uv_strengths[0] = 0; + } + + superres_post_encode(cpi); + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loop_restoration_time); +#endif + if (use_restoration) { + av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1); + av1_pick_filter_restoration(cpi->source, cpi); + if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || + cm->rst_info[1].frame_restoration_type != RESTORE_NONE || + cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { + if (cpi->num_workers > 1) + av1_loop_restoration_filter_frame_mt(&cm->cur_frame->buf, cm, 0, + cpi->workers, cpi->num_workers, + &cpi->lr_row_sync, &cpi->lr_ctxt); + else + av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0, + &cpi->lr_ctxt); + } + } else { + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, 
                    loop_restoration_time);
+#endif
+}
+
+static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+  assert(IMPLIES(is_lossless_requested(&cpi->oxcf),
+                 cm->features.coded_lossless && cm->features.all_lossless));
+
+  const int use_loopfilter =
+      !cm->features.coded_lossless && !cm->tiles.large_scale;
+  const int use_cdef = cm->seq_params.enable_cdef &&
+                       !cm->features.coded_lossless && !cm->tiles.large_scale;
+  const int use_restoration = cm->seq_params.enable_restoration &&
+                              !cm->features.all_lossless &&
+                              !cm->tiles.large_scale;
+
+  struct loopfilter *lf = &cm->lf;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, loop_filter_time);
+#endif
+  if (use_loopfilter) {
+    aom_clear_system_state();
+    av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_sf.lpf_pick);
+  } else {
+    lf->filter_level[0] = 0;
+    lf->filter_level[1] = 0;
+  }
+
+  if (lf->filter_level[0] || lf->filter_level[1]) {
+    if (cpi->num_workers > 1)
+      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
+#if CONFIG_LPF_MASK
+                               0,
+#endif
+                               cpi->workers, cpi->num_workers,
+                               &cpi->lf_row_sync);
+    else
+      av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd,
+#if CONFIG_LPF_MASK
+                            0,
+#endif
+                            0, num_planes, 0);
+  }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, loop_filter_time);
+#endif
+
+  cdef_restoration_frame(cpi, cm, xd, use_restoration, use_cdef);
+}
+
+static void fix_interp_filter(InterpFilter *const interp_filter,
+                              const FRAME_COUNTS *const counts) {
+  if (*interp_filter == SWITCHABLE) {
+    // Check to see if only one of the filters is actually used
+    int count[SWITCHABLE_FILTERS] = { 0 };
+    int num_filters_used = 0;
+    for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+      for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+        count[i] += counts->switchable_interp[j][i];
+      num_filters_used += (count[i] > 0);
+    }
+    if (num_filters_used == 1) {
+      // Only one filter is used. So set the filter at frame level
+      for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+        if (count[i]) {
+          if (i == EIGHTTAP_REGULAR) *interp_filter = i;
+          break;
+        }
+      }
+    }
+  }
+}
+
+static void finalize_encoded_frame(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+
+  if (!cm->seq_params.reduced_still_picture_hdr &&
+      encode_show_existing_frame(cm)) {
+    RefCntBuffer *const frame_to_show =
+        cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+    if (frame_to_show == NULL) {
+      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                         "Buffer does not contain a reconstructed frame");
+    }
+    assert(frame_to_show->ref_count > 0);
+    assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+  }
+
+  if (!encode_show_existing_frame(cm) &&
+      cm->seq_params.film_grain_params_present &&
+      (cm->show_frame || cm->showable_frame)) {
+    // Copy the current frame's film grain params to its corresponding
+    // RefCntBuffer slot.
+    cm->cur_frame->film_grain_params = cm->film_grain_params;
+
+    // We must update the parameters if this is not an INTER_FRAME
+    if (current_frame->frame_type != INTER_FRAME)
+      cm->cur_frame->film_grain_params.update_parameters = 1;
+
+    // Iterate the random seed for the next frame.
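+    // (The increment is odd, so repeated addition cycles through all 2^16
+    // seed values; a seed that wraps to 0 is remapped below to stay nonzero.)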
+ cm->film_grain_params.random_seed += 3381; + if (cm->film_grain_params.random_seed == 0) + cm->film_grain_params.random_seed = 7391; + } + + // Initialise all tiles' contexts from the global frame context + for (int tile_col = 0; tile_col < cm->tiles.cols; tile_col++) { + for (int tile_row = 0; tile_row < cm->tiles.rows; tile_row++) { + const int tile_idx = tile_row * cm->tiles.cols + tile_col; + cpi->tile_data[tile_idx].tctx = *cm->fc; + } + } + + fix_interp_filter(&cm->features.interp_filter, cpi->td.counts); +} + +static int get_regulated_q_overshoot(AV1_COMP *const cpi, int q_low, int q_high, + int top_index, int bottom_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + + int q_regulated = + av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + AOMMAX(q_high, top_index), cm->width, cm->height); + + int retries = 0; + while (q_regulated < q_low && retries < 10) { + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + q_regulated = + av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + AOMMAX(q_high, top_index), cm->width, cm->height); + retries++; + } + return q_regulated; +} + +static int get_regulated_q_undershoot(AV1_COMP *const cpi, int q_high, + int top_index, int bottom_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + top_index, cm->width, cm->height); + + int retries = 0; + while (q_regulated > q_high && retries < 10) { + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + top_index, cm->width, cm->height); + retries++; + } + return q_regulated; +} + +// Called after encode_with_recode_loop() has just encoded a frame and packed +// its bitstream. This function works out whether we under- or over-shot +// our bitrate target and adjusts q as appropriate. 
Also decides whether +// or not we should do another recode loop, indicated by *loop +static void recode_loop_update_q( + AV1_COMP *const cpi, int *const loop, int *const q, int *const q_low, + int *const q_high, const int top_index, const int bottom_index, + int *const undershoot_seen, int *const overshoot_seen, + int *const low_cr_seen, const int loop_at_this_size) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + *loop = 0; + + const int min_cr = cpi->oxcf.min_cr; + if (min_cr > 0) { + aom_clear_system_state(); + const double compression_ratio = + av1_get_compression_ratio(cm, rc->projected_frame_size >> 3); + const double target_cr = min_cr / 100.0; + if (compression_ratio < target_cr) { + *low_cr_seen = 1; + if (*q < rc->worst_quality) { + const double cr_ratio = target_cr / compression_ratio; + const int projected_q = AOMMAX(*q + 1, (int)(*q * cr_ratio * cr_ratio)); + *q = AOMMIN(AOMMIN(projected_q, *q + 32), rc->worst_quality); + *q_low = AOMMAX(*q, *q_low); + *q_high = AOMMAX(*q, *q_high); + *loop = 1; + } + } + if (*low_cr_seen) return; + } + + if (cpi->oxcf.rc_mode == AOM_Q) return; + + const int last_q = *q; + int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0; + av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, + &frame_under_shoot_limit, + &frame_over_shoot_limit); + if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; + + if (cm->current_frame.frame_type == KEY_FRAME && rc->this_key_frame_forced && + rc->projected_frame_size < rc->max_frame_bandwidth) { + int64_t kf_err; + const int64_t high_err_target = cpi->ambient_err; + const int64_t low_err_target = cpi->ambient_err >> 1; + +#if CONFIG_AV1_HIGHBITDEPTH + if (cm->seq_params.use_highbitdepth) { + kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); + } else { + kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); + } +#else + kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); +#endif + // Prevent possible divide by zero error below for perfect KF + kf_err += !kf_err; + + // The key frame is not good enough or we can afford + // to make it better without undue risk of popping. + if ((kf_err > high_err_target && + rc->projected_frame_size <= frame_over_shoot_limit) || + (kf_err > low_err_target && + rc->projected_frame_size <= frame_under_shoot_limit)) { + // Lower q_high + *q_high = AOMMAX(*q - 1, *q_low); + + // Adjust Q + *q = (int)((*q * high_err_target) / kf_err); + *q = AOMMIN(*q, (*q_high + *q_low) >> 1); + } else if (kf_err < low_err_target && + rc->projected_frame_size >= frame_under_shoot_limit) { + // The key frame is much better than the previous frame + // Raise q_low + *q_low = AOMMIN(*q + 1, *q_high); + + // Adjust Q + *q = (int)((*q * low_err_target) / kf_err); + *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1); + } + + // Clamp Q to upper and lower limits: + *q = clamp(*q, *q_low, *q_high); + *loop = (*q != last_q); + return; + } + + if (recode_loop_test(cpi, frame_over_shoot_limit, frame_under_shoot_limit, *q, + AOMMAX(*q_high, top_index), bottom_index)) { + // Is the projected frame size out of range and are we allowed + // to attempt to recode. + + // Frame size out of permitted range: + // Update correction factor & compute new Q to try... + // Frame is too large + if (rc->projected_frame_size > rc->this_frame_target) { + // Special case if the projected size is > the max allowed. 
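+      // (q_high is scaled in the q domain by the overshoot ratio: a frame
+      // projected at, say, 2x max_frame_bandwidth roughly doubles
+      // q_val_high before it is mapped back to a q-index.)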
+      if (*q == *q_high &&
+          rc->projected_frame_size >= rc->max_frame_bandwidth) {
+        const double q_val_high_current =
+            av1_convert_qindex_to_q(*q_high, cm->seq_params.bit_depth);
+        const double q_val_high_new =
+            q_val_high_current *
+            ((double)rc->projected_frame_size / rc->max_frame_bandwidth);
+        *q_high = av1_find_qindex(q_val_high_new, cm->seq_params.bit_depth,
+                                  rc->best_quality, rc->worst_quality);
+      }
+
+      // Raise Qlow to at least the current value
+      *q_low = AOMMIN(*q + 1, *q_high);
+
+      if (*undershoot_seen || loop_at_this_size > 2 ||
+          (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
+        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+        *q = (*q_high + *q_low + 1) / 2;
+      } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) {
+        const int q_mid = (*q_high + *q_low + 1) / 2;
+        const int q_regulated = get_regulated_q_overshoot(
+            cpi, *q_low, *q_high, top_index, bottom_index);
+        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+        // transition between loop_at_this_size < 2 and loop_at_this_size > 2.
+        *q = (q_mid + q_regulated + 1) / 2;
+      } else {
+        *q = get_regulated_q_overshoot(cpi, *q_low, *q_high, top_index,
+                                       bottom_index);
+      }
+
+      *overshoot_seen = 1;
+    } else {
+      // Frame is too small
+      *q_high = AOMMAX(*q - 1, *q_low);
+
+      if (*overshoot_seen || loop_at_this_size > 2 ||
+          (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
+        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+        *q = (*q_high + *q_low) / 2;
+      } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) {
+        const int q_mid = (*q_high + *q_low) / 2;
+        const int q_regulated =
+            get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
+        // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+        // transition between loop_at_this_size < 2 and loop_at_this_size > 2.
+        *q = (q_mid + q_regulated) / 2;
+
+        // Special case reset for qlow for constrained quality.
+        // This should only trigger where there is very substantial
+        // undershoot on a frame and the auto cq level is above
+        // the user passed in value.
+        if (cpi->oxcf.rc_mode == AOM_CQ && q_regulated < *q_low) {
+          *q_low = *q;
+        }
+      } else {
+        *q = get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
+
+        // Special case reset for qlow for constrained quality.
+        // This should only trigger where there is very substantial
+        // undershoot on a frame and the auto cq level is above
+        // the user passed in value.
+ if (cpi->oxcf.rc_mode == AOM_CQ && *q < *q_low) { + *q_low = *q; + } + } + + *undershoot_seen = 1; + } + + // Clamp Q to upper and lower limits: + *q = clamp(*q, *q_low, *q_high); + } + + *loop = (*q != last_q); +} + +static int get_interp_filter_selected(const AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref, + InterpFilter ifilter) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); + if (buf == NULL) return 0; + return buf->interp_filter_selected[ifilter]; +} + +static uint16_t setup_interp_filter_search_mask(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + int ref_total[REF_FRAMES] = { 0 }; + uint16_t mask = ALLOW_ALL_INTERP_FILT_MASK; + + if (cpi->last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame) + return mask; + + for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) { + for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP; + ++ifilter) { + ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter); + } + } + int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] + + ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] + + ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]); + + for (InterpFilter ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP; + ++ifilter) { + int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30; + if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) { + int filter_score = + get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 + + get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 + + get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10; + if (filter_score < ref_total_total) { + DUAL_FILTER_TYPE filt_type = ifilter + SWITCHABLE_FILTERS * ifilter; + reset_interp_filter_allowed_mask(&mask, filt_type); + } + } + } + return mask; +} + +#if !CONFIG_REALTIME_ONLY +#define STRICT_PSNR_DIFF_THRESH 0.9 +// Encode key frame with/without screen content tools to determine whether +// screen content tools should be enabled for this key frame group or not. +// The first encoding is without screen content tools. +// The second encoding is with screen content tools. +// We compare the psnr and frame size to make the decision. +static void screen_content_tools_determination( + AV1_COMP *cpi, const int allow_screen_content_tools_orig_decision, + const int allow_intrabc_orig_decision, + const int is_screen_content_type_orig_decision, const int pass, + int *projected_size_pass, PSNR_STATS *psnr) { + AV1_COMMON *const cm = &cpi->common; + FeatureFlags *const features = &cm->features; + projected_size_pass[pass] = cpi->rc.projected_frame_size; +#if CONFIG_AV1_HIGHBITDEPTH + const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass], + bit_depth, in_bit_depth); +#else + aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr[pass]); +#endif + if (pass != 1) return; + + const double psnr_diff = psnr[1].psnr[0] - psnr[0].psnr[0]; + const int is_sc_encoding_much_better = psnr_diff > STRICT_PSNR_DIFF_THRESH; + if (is_sc_encoding_much_better) { + // Use screen content tools, if we get coding gain. 
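+    // ("Much better" here means the screen-content pass beat the regular
+    // pass by more than STRICT_PSNR_DIFF_THRESH (0.9 dB) in overall PSNR.)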
+    features->allow_screen_content_tools = 1;
+    features->allow_intrabc = cpi->intrabc_used;
+    cpi->is_screen_content_type = 1;
+  } else {
+    // Use original screen content decision.
+    features->allow_screen_content_tools =
+        allow_screen_content_tools_orig_decision;
+    features->allow_intrabc = allow_intrabc_orig_decision;
+    cpi->is_screen_content_type = is_screen_content_type_orig_decision;
+  }
+}
+
+// Set some encoding parameters to make the encoding process fast.
+// A fixed block partition size and a large q are used.
+static void set_encoding_params_for_screen_content(AV1_COMP *cpi,
+                                                   const int pass) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (pass == 0) {
+    // In the first pass, encode without screen content tools.
+    // Use a high q, and a fixed block size for fast encoding.
+    cm->features.allow_screen_content_tools = 0;
+    cm->features.allow_intrabc = 0;
+    cpi->is_screen_content_type = 0;
+    cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+    cpi->sf.part_sf.always_this_block_size = BLOCK_32X32;
+    return;
+  }
+  assert(pass == 1);
+  // In the second pass, encode with screen content tools.
+  // Use a high q, and a fixed block size for fast encoding.
+  cm->features.allow_screen_content_tools = 1;
+  // TODO(chengchen): turning intrabc on could lead to a data race issue.
+  // cm->allow_intrabc = 1;
+  cpi->is_screen_content_type = 1;
+  cpi->sf.part_sf.partition_search_type = FIXED_PARTITION;
+  cpi->sf.part_sf.always_this_block_size = BLOCK_32X32;
+}
+
+// Determines whether to use screen content tools for the key frame group.
+// This function modifies "cm->features.allow_screen_content_tools",
+// "cm->features.allow_intrabc" and "cpi->is_screen_content_type".
+static void determine_sc_tools_with_encoding(AV1_COMP *cpi, const int q_orig) {
+  AV1_COMMON *const cm = &cpi->common;
+  // Variables to help determine if we should allow screen content tools.
+  int projected_size_pass[3] = { 0 };
+  PSNR_STATS psnr[3];
+  const int is_key_frame = cm->current_frame.frame_type == KEY_FRAME;
+  const int allow_screen_content_tools_orig_decision =
+      cm->features.allow_screen_content_tools;
+  const int allow_intrabc_orig_decision = cm->features.allow_intrabc;
+  const int is_screen_content_type_orig_decision = cpi->is_screen_content_type;
+  // Turn off the encoding trial for forward key frame and superres.
+  if (cpi->sf.rt_sf.use_nonrd_pick_mode || cpi->oxcf.fwd_kf_enabled ||
+      cpi->superres_mode != SUPERRES_NONE || cpi->oxcf.mode == REALTIME ||
+      is_screen_content_type_orig_decision || !is_key_frame) {
+    return;
+  }
+
+  // TODO(chengchen): multiple encoding passes for the lossless mode are time
+  // consuming. Find a better way to determine whether screen content tools
+  // should be used for lossless coding.
+  // Use a high q and a fixed partition to do quick encoding.
+  const int q_for_screen_content_quick_run =
+      is_lossless_requested(&cpi->oxcf) ? q_orig : AOMMAX(q_orig, 244);
+  const int partition_search_type_orig = cpi->sf.part_sf.partition_search_type;
+  const BLOCK_SIZE fixed_partition_block_size_orig =
+      cpi->sf.part_sf.always_this_block_size;
+
+  // Setup necessary params for encoding, including frame source, etc.
+ aom_clear_system_state(); + + cpi->source = + av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source); + if (cpi->unscaled_last_source != NULL) { + cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source); + } + + setup_frame(cpi); + + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + + // The two encoding passes aim to help determine whether to use screen + // content tools, with a high q and fixed partition. + for (int pass = 0; pass < 2; ++pass) { + set_encoding_params_for_screen_content(cpi, pass); +#if CONFIG_TUNE_VMAF + if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING || + cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) { + av1_set_quantizer( + cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, + av1_get_vmaf_base_qindex(cpi, q_for_screen_content_quick_run)); + } else { +#endif + av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, + q_for_screen_content_quick_run); +#if CONFIG_TUNE_VMAF + } +#endif + av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); + if (cpi->oxcf.deltaq_mode != NO_DELTA_Q) + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params.bit_depth); + + av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run, + 0); + // transform / motion compensation build reconstruction frame + av1_encode_frame(cpi); + // Screen content decision + screen_content_tools_determination( + cpi, allow_screen_content_tools_orig_decision, + allow_intrabc_orig_decision, is_screen_content_type_orig_decision, pass, + projected_size_pass, psnr); + } + + // Set partition speed feature back. + cpi->sf.part_sf.partition_search_type = partition_search_type_orig; + cpi->sf.part_sf.always_this_block_size = fixed_partition_block_size_orig; +} +#endif // CONFIG_REALTIME_ONLY + +static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + GlobalMotionInfo *const gm_info = &cpi->gm_info; + const int allow_recode = (cpi->sf.hl_sf.recode_loop != DISALLOW_RECODE); + // Must allow recode if minimum compression ratio is set. + assert(IMPLIES(cpi->oxcf.min_cr > 0, allow_recode)); + + set_size_independent_vars(cpi); + if (is_stat_consumption_stage_twopass(cpi) && + cpi->sf.interp_sf.adaptive_interp_filter_search) + cpi->interp_search_flags.interp_filter_search_mask = + setup_interp_filter_search_mask(cpi); + cpi->source->buf_8bit_valid = 0; + + av1_setup_frame_size(cpi); + +#if CONFIG_SUPERRES_IN_RECODE + if (superres_in_recode_allowed(cpi) && cpi->superres_mode != SUPERRES_NONE && + cm->superres_scale_denominator == SCALE_NUMERATOR) { + // Superres mode is currently enabled, but the denominator selected will + // disable superres. So no need to continue, as we will go through another + // recode loop for full-resolution after this anyway. 
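+    // The caller (encode_with_recode_loop_and_filter) treats this -1 as
+    // "encoding intentionally skipped", not as a codec error.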
+ return -1; + } +#endif // CONFIG_SUPERRES_IN_RECODE + + int top_index = 0, bottom_index = 0; + int q = 0, q_low = 0, q_high = 0; + set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + q_low = bottom_index; + q_high = top_index; + if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) { + const int num_64x64_blocks = + (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; + if (cpi->td.vt64x64) { + if (num_64x64_blocks != cpi->td.num_64x64_blocks) { + aom_free(cpi->td.vt64x64); + cpi->td.vt64x64 = NULL; + } + } + if (!cpi->td.vt64x64) { + CHECK_MEM_ERROR(cm, cpi->td.vt64x64, + aom_malloc(sizeof(*cpi->td.vt64x64) * num_64x64_blocks)); + cpi->td.num_64x64_blocks = num_64x64_blocks; + } + } + + if (cm->current_frame.frame_type == KEY_FRAME) { + FrameProbInfo *const frame_probs = &cpi->frame_probs; + + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + av1_copy(frame_probs->tx_type_probs, default_tx_type_probs); + } + + if (!cpi->sf.inter_sf.disable_obmc && + cpi->sf.inter_sf.prune_obmc_prob_thresh > 0) { + av1_copy(frame_probs->obmc_probs, default_obmc_probs); + } + + if (cpi->sf.inter_sf.prune_warped_prob_thresh > 0) { + av1_copy(frame_probs->warped_probs, default_warped_probs); + } + + if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) { + av1_copy(frame_probs->switchable_interp_probs, + default_switchable_interp_probs); + } + } +#if !CONFIG_REALTIME_ONLY + // Determine whether to use screen content tools using two fast encoding. + determine_sc_tools_with_encoding(cpi, q); +#endif // CONFIG_REALTIME_ONLY + +#if CONFIG_COLLECT_COMPONENT_TIMING + printf("\n Encoding a frame:"); +#endif + + // Loop variables + int loop = 0; + int loop_count = 0; + int loop_at_this_size = 0; + int overshoot_seen = 0; + int undershoot_seen = 0; + int low_cr_seen = 0; + int last_loop_allow_hp = 0; + + do { + loop = 0; + aom_clear_system_state(); + + // if frame was scaled calculate global_motion_search again if already + // done + if (loop_count > 0 && cpi->source && gm_info->search_done) { + if (cpi->source->y_crop_width != cm->width || + cpi->source->y_crop_height != cm->height) { + gm_info->search_done = 0; + } + } + cpi->source = + av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source); + if (cpi->unscaled_last_source != NULL) { + cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source); + } + + if (!frame_is_intra_only(cm)) { + if (loop_count > 0) { + release_scaled_references(cpi); + } + scale_references(cpi); + } +#if CONFIG_TUNE_VMAF + if (cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING || + cpi->oxcf.tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) { + av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, + av1_get_vmaf_base_qindex(cpi, q)); + } else { +#endif + av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, q); +#if CONFIG_TUNE_VMAF + } +#endif + av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed); + + if (cpi->oxcf.deltaq_mode != NO_DELTA_Q) + av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params, + cm->seq_params.bit_depth); + + av1_set_variance_partition_thresholds(cpi, q, 0); + + // printf("Frame %d/%d: q = %d, frame_type = %d superres_denom = %d\n", + // cm->current_frame.frame_number, cm->show_frame, q, + // cm->current_frame.frame_type, cm->superres_scale_denominator); + + if (loop_count == 0) { + setup_frame(cpi); + } else if (get_primary_ref_frame_buf(cm) == NULL) { + // Base q-index may have 
changed, so we need to assign proper default coef + // probs before every iteration. + av1_default_coef_probs(cm); + av1_setup_frame_contexts(cm); + } + + if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + av1_vaq_frame_setup(cpi); + } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + av1_setup_in_frame_q_adj(cpi); + } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && !allow_recode) { + suppress_active_map(cpi); + av1_cyclic_refresh_setup(cpi); + apply_active_map(cpi); + } + + if (cm->seg.enabled) { + if (!cm->seg.update_data && cm->prev_frame) { + segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + cm->seg.enabled = cm->prev_frame->seg.enabled; + } else { + av1_calculate_segdata(&cm->seg); + } + } else { + memset(&cm->seg, 0, sizeof(cm->seg)); + } + segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + cm->cur_frame->seg.enabled = cm->seg.enabled; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_encode_frame_time); +#endif + // Set the motion vector precision based on mv stats from the last coded + // frame. + if (!frame_is_intra_only(cm)) { + av1_pick_and_set_high_precision_mv(cpi, q); + + // If the precision has changed during different iteration of the loop, + // then we need to reset the global motion vectors + if (loop_count > 0 && + cm->features.allow_high_precision_mv != last_loop_allow_hp) { + gm_info->search_done = 0; + } + last_loop_allow_hp = cm->features.allow_high_precision_mv; + } + + // transform / motion compensation build reconstruction frame + av1_encode_frame(cpi); +#if !CONFIG_REALTIME_ONLY + // Reset the mv_stats in case we are interrupted by an intraframe or an + // overlay frame. + if (cpi->mv_stats.valid) { + av1_zero(cpi->mv_stats); + } + // Gather the mv_stats for the next frame + if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && + av1_frame_allows_smart_mv(cpi)) { + av1_collect_mv_stats(cpi, q); + } +#endif // !CONFIG_REALTIME_ONLY + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_encode_frame_time); +#endif + + aom_clear_system_state(); + + // Dummy pack of the bitstream using up to date stats to get an + // accurate estimate of output frame size to determine if we need + // to recode. + const int do_dummy_pack = + (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF && + cpi->oxcf.rc_mode != AOM_Q) || + cpi->oxcf.min_cr > 0; + if (do_dummy_pack) { + finalize_encoded_frame(cpi); + int largest_tile_id = 0; // Output from bitstream: unused here + if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + rc->projected_frame_size = (int)(*size) << 3; + } + + if (allow_recode) { + // Update q and decide whether to do a recode loop + recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index, + bottom_index, &undershoot_seen, &overshoot_seen, + &low_cr_seen, loop_at_this_size); + } + + // Special case for overlay frame. + if (loop && rc->is_src_frame_alt_ref && + rc->projected_frame_size < rc->max_frame_bandwidth) { + loop = 0; + } + + if (allow_recode && !cpi->sf.gm_sf.gm_disable_recode && + recode_loop_test_global_motion(cm->global_motion, + cpi->td.rd_counts.global_motion_used, + gm_info->params_cost)) { + loop = 1; + } + + if (loop) { + ++loop_count; + ++loop_at_this_size; + +#if CONFIG_INTERNAL_STATS + ++cpi->tot_recode_hits; +#endif + } +#if CONFIG_COLLECT_COMPONENT_TIMING + if (loop) printf("\n Recoding:"); +#endif + } while (loop); + + // Update some stats from cyclic refresh. 
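+  // Running this after the recode loop means the update is based on the
+  // final, accepted encode of the frame rather than a rejected trial.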
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && !frame_is_intra_only(cm)) + av1_cyclic_refresh_postencode(cpi); + + return AOM_CODEC_OK; +} + +static int encode_with_recode_loop_and_filter(AV1_COMP *cpi, size_t *size, + uint8_t *dest, int64_t *sse, + int64_t *rate, + int *largest_tile_id) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_with_recode_loop_time); +#endif + int err = encode_with_recode_loop(cpi, size, dest); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_with_recode_loop_time); +#endif + if (err != AOM_CODEC_OK) { + if (err == -1) { + // special case as described in encode_with_recode_loop(). + // Encoding was skipped. + err = AOM_CODEC_OK; + if (sse != NULL) *sse = INT64_MAX; + if (rate != NULL) *rate = INT64_MAX; + *largest_tile_id = 0; + } + return err; + } + +#ifdef OUTPUT_YUV_SKINMAP + if (cpi->common.current_frame.frame_number > 1) { + av1_compute_skin_map(cpi, yuv_skinmap_file); + } +#endif // OUTPUT_YUV_SKINMAP + + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = &cm->seq_params; + + // Special case code to reduce pulsing when key frames are forced at a + // fixed interval. Note the reconstruction error if it is the frame before + // the force key frame + if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) { +#if CONFIG_AV1_HIGHBITDEPTH + if (seq_params->use_highbitdepth) { + cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); + } else { + cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); + } +#else + cpi->ambient_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); +#endif + } + + cm->cur_frame->buf.color_primaries = seq_params->color_primaries; + cm->cur_frame->buf.transfer_characteristics = + seq_params->transfer_characteristics; + cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; + cm->cur_frame->buf.monochrome = seq_params->monochrome; + cm->cur_frame->buf.chroma_sample_position = + seq_params->chroma_sample_position; + cm->cur_frame->buf.color_range = seq_params->color_range; + cm->cur_frame->buf.render_width = cm->render_width; + cm->cur_frame->buf.render_height = cm->render_height; + + // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned + // off. + + // Pick the loop filter level for the frame. + if (!cm->features.allow_intrabc) { + loopfilter_frame(cpi, cm); + } else { + cm->lf.filter_level[0] = 0; + cm->lf.filter_level[1] = 0; + cm->cdef_info.cdef_bits = 0; + cm->cdef_info.cdef_strengths[0] = 0; + cm->cdef_info.nb_cdef_strengths = 1; + cm->cdef_info.cdef_uv_strengths[0] = 0; + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; + } + + // TODO(debargha): Fix mv search range on encoder side + // aom_extend_frame_inner_borders(&cm->cur_frame->buf, av1_num_planes(cm)); + aom_extend_frame_borders(&cm->cur_frame->buf, av1_num_planes(cm)); + +#ifdef OUTPUT_YUV_REC + aom_write_one_yuv_frame(cm, &cm->cur_frame->buf); +#endif + + finalize_encoded_frame(cpi); + // Build the bitstream +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_pack_bitstream_final_time); +#endif + if (av1_pack_bitstream(cpi, dest, size, largest_tile_id) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_pack_bitstream_final_time); +#endif + + // Compute sse and rate. + if (sse != NULL) { +#if CONFIG_AV1_HIGHBITDEPTH + *sse = (seq_params->use_highbitdepth) + ? 
aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf)
+               : aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#else
+    *sse = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+#endif
+  }
+  if (rate != NULL) {
+    const int64_t bits = (*size << 3);
+    *rate = (bits << 5);  // To match scale.
+  }
+  return AOM_CODEC_OK;
+}
+
+#if CONFIG_SUPERRES_IN_RECODE
+
+static void save_cur_buf(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+  const YV12_BUFFER_CONFIG *ybf = &cm->cur_frame->buf;
+  memset(&cc->copy_buffer, 0, sizeof(cc->copy_buffer));
+  if (aom_alloc_frame_buffer(&cc->copy_buffer, ybf->y_crop_width,
+                             ybf->y_crop_height, ybf->subsampling_x,
+                             ybf->subsampling_y,
+                             ybf->flags & YV12_FLAG_HIGHBITDEPTH, ybf->border,
+                             cm->features.byte_alignment) != AOM_CODEC_OK) {
+    aom_internal_error(
+        &cm->error, AOM_CODEC_MEM_ERROR,
+        "Failed to allocate copy buffer for saving coding context");
+  }
+  aom_yv12_copy_frame(ybf, &cc->copy_buffer, av1_num_planes(cm));
+}
+
+// Coding context that only needs to be saved when the recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoration).
+static void save_extra_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+
+  cc->lf = cm->lf;
+  cc->cdef_info = cm->cdef_info;
+  cc->rc = cpi->rc;
+}
+
+static void save_all_coding_context(AV1_COMP *cpi) {
+  save_cur_buf(cpi);
+  save_extra_coding_context(cpi);
+  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+static void restore_cur_buf(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+  aom_yv12_copy_frame(&cc->copy_buffer, &cm->cur_frame->buf,
+                      av1_num_planes(cm));
+}
+
+// Coding context that only needs to be restored when the recode loop includes
+// filtering (deblocking, CDEF, superres post-encode upscale and/or loop
+// restoration).
+static void restore_extra_coding_context(AV1_COMP *cpi) {
+  CODING_CONTEXT *const cc = &cpi->coding_context;
+  AV1_COMMON *cm = &cpi->common;
+  cm->lf = cc->lf;
+  cm->cdef_info = cc->cdef_info;
+  cpi->rc = cc->rc;
+}
+
+static void restore_all_coding_context(AV1_COMP *cpi) {
+  restore_cur_buf(cpi);
+  restore_extra_coding_context(cpi);
+  if (!frame_is_intra_only(&cpi->common)) release_scaled_references(cpi);
+}
+
+static void release_copy_buffer(CODING_CONTEXT *cc) {
+  aom_free_frame_buffer(&cc->copy_buffer);
+}
+
+static int encode_with_and_without_superres(AV1_COMP *cpi, size_t *size,
+                                            uint8_t *dest,
+                                            int *largest_tile_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  assert(cm->seq_params.enable_superres);
+  assert(superres_in_recode_allowed(cpi));
+  aom_codec_err_t err = AOM_CODEC_OK;
+  save_all_coding_context(cpi);
+
+  // Encode with superres.
+#if SUPERRES_RECODE_ALL_RATIOS
+  AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  int64_t superres_sses[SCALE_NUMERATOR];
+  int64_t superres_rates[SCALE_NUMERATOR];
+  int superres_largest_tile_ids[SCALE_NUMERATOR];
+  // Use superres for Key-frames and Alt-ref frames only.
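+  // Overlay and internal-overlay updates are excluded below; every other
+  // update type gets one trial encode per denominator in
+  // (SCALE_NUMERATOR, 2 * SCALE_NUMERATOR].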
+ const GF_GROUP *const gf_group = &cpi->gf_group; + if (gf_group->update_type[gf_group->index] != OVERLAY_UPDATE && + gf_group->update_type[gf_group->index] != INTNL_OVERLAY_UPDATE) { + for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; + ++denom) { + oxcf->superres_scale_denominator = denom; + oxcf->superres_kf_scale_denominator = denom; + const int this_index = denom - (SCALE_NUMERATOR + 1); + err = encode_with_recode_loop_and_filter( + cpi, size, dest, &superres_sses[this_index], + &superres_rates[this_index], &superres_largest_tile_ids[this_index]); + if (err != AOM_CODEC_OK) return err; + restore_all_coding_context(cpi); + } + // Reset. + oxcf->superres_scale_denominator = SCALE_NUMERATOR; + oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR; + } else { + for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; + ++denom) { + const int this_index = denom - (SCALE_NUMERATOR + 1); + superres_sses[this_index] = INT64_MAX; + superres_rates[this_index] = INT64_MAX; + } + } +#else + int64_t sse1 = INT64_MAX; + int64_t rate1 = INT64_MAX; + int largest_tile_id1; + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse1, &rate1, + &largest_tile_id1); + if (err != AOM_CODEC_OK) return err; + restore_all_coding_context(cpi); +#endif // SUPERRES_RECODE_ALL_RATIOS + + // Encode without superres. + int64_t sse2 = INT64_MAX; + int64_t rate2 = INT64_MAX; + int largest_tile_id2; + cpi->superres_mode = SUPERRES_NONE; // To force full-res. + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse2, &rate2, + &largest_tile_id2); + cpi->superres_mode = cpi->oxcf.superres_mode; // Reset. + assert(cpi->oxcf.superres_mode == SUPERRES_AUTO); + if (err != AOM_CODEC_OK) return err; + + // Note: Both use common rdmult based on base qindex of fullres. + const int64_t rdmult = + av1_compute_rd_mult_based_on_qindex(cpi, cm->quant_params.base_qindex); + +#if SUPERRES_RECODE_ALL_RATIOS + // Find the best rdcost among all superres denoms. + double proj_rdcost1 = DBL_MAX; + int64_t sse1 = INT64_MAX; + int64_t rate1 = INT64_MAX; + int largest_tile_id1 = 0; + (void)sse1; + (void)rate1; + (void)largest_tile_id1; + int best_denom = -1; + for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR; ++denom) { + const int this_index = denom - (SCALE_NUMERATOR + 1); + const int64_t this_sse = superres_sses[this_index]; + const int64_t this_rate = superres_rates[this_index]; + const int this_largest_tile_id = superres_largest_tile_ids[this_index]; + const double this_rdcost = RDCOST_DBL(rdmult, this_rate, this_sse); + if (this_rdcost < proj_rdcost1) { + sse1 = this_sse; + rate1 = this_rate; + largest_tile_id1 = this_largest_tile_id; + proj_rdcost1 = this_rdcost; + best_denom = denom; + } + } +#else + const double proj_rdcost1 = RDCOST_DBL(rdmult, rate1, sse1); +#endif // SUPERRES_RECODE_ALL_RATIOS + const double proj_rdcost2 = RDCOST_DBL(rdmult, rate2, sse2); + + // Re-encode with superres if it's better. + if (proj_rdcost1 < proj_rdcost2) { + restore_all_coding_context(cpi); + // TODO(urvang): We should avoid rerunning the recode loop by saving + // previous output+state, or running encode only for the selected 'q' in + // previous step. +#if SUPERRES_RECODE_ALL_RATIOS + // Again, temporarily force the best denom. 
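+    // (Both fields are reset to SCALE_NUMERATOR again right after the
+    // re-encode below.)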
+ oxcf->superres_scale_denominator = best_denom; + oxcf->superres_kf_scale_denominator = best_denom; +#endif // SUPERRES_RECODE_ALL_RATIOS + int64_t sse3 = INT64_MAX; + int64_t rate3 = INT64_MAX; + err = encode_with_recode_loop_and_filter(cpi, size, dest, &sse3, &rate3, + largest_tile_id); + assert(sse1 == sse3); + assert(rate1 == rate3); + assert(largest_tile_id1 == *largest_tile_id); +#if SUPERRES_RECODE_ALL_RATIOS + // Reset. + oxcf->superres_scale_denominator = SCALE_NUMERATOR; + oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR; +#endif // SUPERRES_RECODE_ALL_RATIOS + } else { + *largest_tile_id = largest_tile_id2; + } + + release_copy_buffer(&cpi->coding_context); + + return err; +} +#endif // CONFIG_SUPERRES_IN_RECODE + +#define DUMP_RECON_FRAMES 0 + +#if DUMP_RECON_FRAMES == 1 +// NOTE(zoeliu): For debug - Output the filtered reconstructed video. +static void dump_filtered_recon_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const CurrentFrame *const current_frame = &cm->current_frame; + const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf; + + if (recon_buf == NULL) { + printf("Frame %d is not ready.\n", current_frame->frame_number); + return; + } + + static const int flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + printf( + "\n***Frame=%d (frame_offset=%d, show_frame=%d, " + "show_existing_frame=%d) " + "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[", + current_frame->frame_number, current_frame->order_hint, cm->show_frame, + cm->show_existing_frame); + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + const int ref_offset = buf != NULL ? (int)buf->order_hint : -1; + printf(" %d(%c)", ref_offset, + (cpi->ref_frame_flags & flag_list[ref_frame]) ? 
'Y' : 'N'); + } + printf(" ]\n"); + + if (!cm->show_frame) { + printf("Frame %d is a no show frame, so no image dump.\n", + current_frame->frame_number); + return; + } + + int h; + char file_name[256] = "/tmp/enc_filtered_recon.yuv"; + FILE *f_recon = NULL; + + if (current_frame->frame_number == 0) { + if ((f_recon = fopen(file_name, "wb")) == NULL) { + printf("Unable to open file %s to write.\n", file_name); + return; + } + } else { + if ((f_recon = fopen(file_name, "ab")) == NULL) { + printf("Unable to open file %s to append.\n", file_name); + return; + } + } + printf( + "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, " + "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, " + "refresh_alt_ref_frame=%d, " + "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n", + current_frame->frame_number, cpi->gf_group.index, + cpi->gf_group.update_type[cpi->gf_group.index], current_frame->order_hint, + cm->show_frame, cm->show_existing_frame, cpi->rc.source_alt_ref_active, + cpi->refresh_alt_ref_frame, recon_buf->y_stride, recon_buf->uv_stride, + cm->width, cm->height); +#if 0 + int ref_frame; + printf("get_ref_frame_map_idx: ["); + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) + printf(" %d", get_ref_frame_map_idx(cm, ref_frame)); + printf(" ]\n"); +#endif // 0 + + // --- Y --- + for (h = 0; h < cm->height; ++h) { + fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width, + f_recon); + } + // --- U --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1), + f_recon); + } + // --- V --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1), + f_recon); + } + + fclose(f_recon); +} +#endif // DUMP_RECON_FRAMES + +static int is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture, + const YV12_BUFFER_CONFIG *last_picture, + ForceIntegerMVInfo *const force_intpel_info) { + aom_clear_system_state(); + // check use hash ME + int k; + + const int block_size = FORCE_INT_MV_DECISION_BLOCK_SIZE; + const double threshold_current = 0.8; + const double threshold_average = 0.95; + const int max_history_size = 32; + int T = 0; // total block + int C = 0; // match with collocated block + int S = 0; // smooth region but not match with collocated block + + const int pic_width = cur_picture->y_width; + const int pic_height = cur_picture->y_height; + for (int i = 0; i + block_size <= pic_height; i += block_size) { + for (int j = 0; j + block_size <= pic_width; j += block_size) { + const int x_pos = j; + const int y_pos = i; + int match = 1; + T++; + + // check whether collocated block match with current + uint8_t *p_cur = cur_picture->y_buffer; + uint8_t *p_ref = last_picture->y_buffer; + int stride_cur = cur_picture->y_stride; + int stride_ref = last_picture->y_stride; + p_cur += (y_pos * stride_cur + x_pos); + p_ref += (y_pos * stride_ref + x_pos); + + if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur); + uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref); + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p16_cur[tmpX] != p16_ref[tmpX]) { + match = 0; + } + } + p16_cur += stride_cur; + p16_ref += stride_ref; + } + } else { + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p_cur[tmpX] != p_ref[tmpX]) { + match = 0; + } + } + p_cur 
+= stride_cur; + p_ref += stride_ref; + } + } + + if (match) { + C++; + continue; + } + + if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos, + y_pos) || + av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) { + S++; + continue; + } + } + } + + assert(T > 0); + double cs_rate = ((double)(C + S)) / ((double)(T)); + + force_intpel_info->cs_rate_array[force_intpel_info->rate_index] = cs_rate; + + force_intpel_info->rate_index = + (force_intpel_info->rate_index + 1) % max_history_size; + force_intpel_info->rate_size++; + force_intpel_info->rate_size = + AOMMIN(force_intpel_info->rate_size, max_history_size); + + if (cs_rate < threshold_current) { + return 0; + } + + if (C == T) { + return 1; + } + + double cs_average = 0.0; + + for (k = 0; k < force_intpel_info->rate_size; k++) { + cs_average += force_intpel_info->cs_rate_array[k]; + } + cs_average /= force_intpel_info->rate_size; + + if (cs_average < threshold_average) { + return 0; + } + + if ((T - C - S) < 0) { + return 1; + } + + if (cs_average > 1.01) { + return 1; + } + + return 0; +} + +// Refresh reference frame buffers according to refresh_frame_flags. +static void refresh_reference_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + // All buffers are refreshed for shown keyframes and S-frames. + + for (int ref_frame = 0; ref_frame < REF_FRAMES; ref_frame++) { + if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) { + assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame); + } + } +} + +static void set_mb_ssim_rdmult_scaling(AV1_COMP *cpi) { + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + uint8_t *y_buffer = cpi->source->y_buffer; + const int y_stride = cpi->source->y_stride; + const int block_size = BLOCK_16X16; + + const int num_mi_w = mi_size_wide[block_size]; + const int num_mi_h = mi_size_high[block_size]; + const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h; + double log_sum = 0.0; + const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH; + + // Loop through each 16x16 block. + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + double var = 0.0, num_of_var = 0.0; + const int index = row * num_cols + col; + + // Loop through each 8x8 block. + for (int mi_row = row * num_mi_h; + mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h; + mi_row += 2) { + for (int mi_col = col * num_mi_w; + mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w; + mi_col += 2) { + struct buf_2d buf; + const int row_offset_y = mi_row << 2; + const int col_offset_y = mi_col << 2; + + buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y; + buf.stride = y_stride; + + if (use_hbd) { + var += av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8, + xd->bd); + } else { + var += av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8); + } + + num_of_var += 1.0; + } + } + var = var / num_of_var; + + // Curve fitting with an exponential model on all 16x16 blocks from the + // midres dataset. 
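+      // The constants below come from a saturating-exponential fit of the
+      // form a * (1 - exp(-b * var)) + c, so the scaling factor levels off
+      // for high-variance blocks.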
+ var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222; + cpi->ssim_rdmult_scaling_factors[index] = var; + log_sum += log(var); + } + } + log_sum = exp(log_sum / (double)(num_rows * num_cols)); + + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + cpi->ssim_rdmult_scaling_factors[index] /= log_sum; + } + } +} + +extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc, + const char *filename); + +static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, + uint8_t *dest) { + AV1_COMMON *const cm = &cpi->common; + SequenceHeader *const seq_params = &cm->seq_params; + CurrentFrame *const current_frame = &cm->current_frame; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + struct segmentation *const seg = &cm->seg; + FeatureFlags *const features = &cm->features; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_frame_to_data_rate_time); +#endif + + // frame type has been decided outside of this function call + cm->cur_frame->frame_type = current_frame->frame_type; + + cm->tiles.large_scale = cpi->oxcf.large_scale_tile; + cm->tiles.single_tile_decoding = cpi->oxcf.single_tile_decoding; + + features->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm); + // features->allow_ref_frame_mvs needs to be written into the frame header + // while cm->tiles.large_scale is 1, therefore, "cm->tiles.large_scale=1" case + // is separated from frame_might_allow_ref_frame_mvs(). + features->allow_ref_frame_mvs &= !cm->tiles.large_scale; + + features->allow_warped_motion = + cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm); + + cpi->last_frame_type = current_frame->frame_type; + + if (encode_show_existing_frame(cm)) { + finalize_encoded_frame(cpi); + // Build the bitstream + int largest_tile_id = 0; // Output from bitstream: unused here + if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; + + if (seq_params->frame_id_numbers_present_flag && + current_frame->frame_type == KEY_FRAME) { + // Displaying a forward key-frame, so reset the ref buffer IDs + int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; + for (int i = 0; i < REF_FRAMES; i++) + cm->ref_frame_id[i] = display_frame_id; + } + + cpi->seq_params_locked = 1; + +#if DUMP_RECON_FRAMES == 1 + // NOTE(zoeliu): For debug - Output the filtered reconstructed video. + dump_filtered_recon_frames(cpi); +#endif // DUMP_RECON_FRAMES + + // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., + // for the purpose to verify no mismatch between encoder and decoder. + if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame; + + refresh_reference_frames(cpi); + + // Since we allocate a spot for the OVERLAY frame in the gf group, we need + // to do post-encoding update accordingly. 
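+  // The show_existing_frame path returns early just below, so this is the
+  // only point where the OVERLAY frame's size is fed back to rate control.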
+ if (cpi->rc.is_src_frame_alt_ref) { + av1_set_target_rate(cpi, cm->width, cm->height); + av1_rc_postencode_update(cpi, *size); + } + + ++current_frame->frame_number; + + return AOM_CODEC_OK; + } + + // Work out whether to force_integer_mv this frame + if (!is_stat_generation_stage(cpi) && + cpi->common.features.allow_screen_content_tools && + !frame_is_intra_only(cm)) { + if (cpi->common.seq_params.force_integer_mv == 2) { + // Adaptive mode: see what previous frame encoded did + if (cpi->unscaled_last_source != NULL) { + features->cur_frame_force_integer_mv = is_integer_mv( + cpi->source, cpi->unscaled_last_source, &cpi->force_intpel_info); + } else { + cpi->common.features.cur_frame_force_integer_mv = 0; + } + } else { + cpi->common.features.cur_frame_force_integer_mv = + cpi->common.seq_params.force_integer_mv; + } + } else { + cpi->common.features.cur_frame_force_integer_mv = 0; + } + + // Set default state for segment based loop filter update flags. + cm->lf.mode_ref_delta_update = 0; + + // Set various flags etc to special state if it is a key frame. + if (frame_is_intra_only(cm) || frame_is_sframe(cm)) { + // Reset the loop filter deltas and segmentation map. + av1_reset_segment_features(cm); + + // If segmentation is enabled force a map update for key frames. + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; + } + + // The alternate reference frame cannot be active for a key frame. + cpi->rc.source_alt_ref_active = 0; + } + if (cpi->oxcf.mtu == 0) { + cpi->num_tg = cpi->oxcf.num_tile_groups; + } else { + // Use a default value for the purposes of weighting costs in probability + // updates + cpi->num_tg = DEFAULT_MAX_NUM_TG; + } + + // For 1 pass CBR, check if we are dropping this frame. + // Never drop on key frame. + if (has_no_stats_stage(cpi) && oxcf->rc_mode == AOM_CBR && + current_frame->frame_type != KEY_FRAME) { + if (av1_rc_drop_frame(cpi)) { + av1_setup_frame_size(cpi); + av1_rc_postencode_update_drop_frame(cpi); + release_scaled_references(cpi); + return AOM_CODEC_OK; + } + } + + if (oxcf->tuning == AOM_TUNE_SSIM) set_mb_ssim_rdmult_scaling(cpi); + +#if CONFIG_TUNE_VMAF + if (oxcf->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + oxcf->tuning == AOM_TUNE_VMAF_MAX_GAIN) { + av1_set_mb_vmaf_rdmult_scaling(cpi); + } +#endif + + aom_clear_system_state(); + +#if CONFIG_INTERNAL_STATS + memset(cpi->mode_chosen_counts, 0, + MAX_MODES * sizeof(*cpi->mode_chosen_counts)); +#endif + + if (seq_params->frame_id_numbers_present_flag) { + /* Non-normative definition of current_frame_id ("frame counter" with + * wraparound) */ + if (cm->current_frame_id == -1) { + int lsb, msb; + /* quasi-random initialization of current_frame_id for a key frame */ + if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) { + lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff; + msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff; + } else { + lsb = cpi->source->y_buffer[0] & 0xff; + msb = cpi->source->y_buffer[1] & 0xff; + } + cm->current_frame_id = + ((msb << 8) + lsb) % (1 << seq_params->frame_id_length); + + // S_frame is meant for stitching different streams of different + // resolutions together, so current_frame_id must be the + // same across different streams of the same content current_frame_id + // should be the same and not random. 
0x37 is a chosen number as start + // point + if (cpi->oxcf.sframe_enabled) cm->current_frame_id = 0x37; + } else { + cm->current_frame_id = + (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) % + (1 << seq_params->frame_id_length); + } + } + + switch (cpi->oxcf.cdf_update_mode) { + case 0: // No CDF update for any frames(4~6% compression loss). + features->disable_cdf_update = 1; + break; + case 1: // Enable CDF update for all frames. + features->disable_cdf_update = 0; + break; + case 2: + // Strategically determine at which frames to do CDF update. + // Currently only enable CDF update for all-intra and no-show frames(1.5% + // compression loss). + // TODO(huisu@google.com): design schemes for various trade-offs between + // compression quality and decoding speed. + features->disable_cdf_update = + (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1; + break; + } + seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr; + + int largest_tile_id = 0; +#if CONFIG_SUPERRES_IN_RECODE + if (superres_in_recode_allowed(cpi)) { + if (encode_with_and_without_superres(cpi, size, dest, &largest_tile_id) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + } else { +#endif // CONFIG_SUPERRES_IN_RECODE + if (encode_with_recode_loop_and_filter(cpi, size, dest, NULL, NULL, + &largest_tile_id) != AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } +#if CONFIG_SUPERRES_IN_RECODE + } +#endif // CONFIG_SUPERRES_IN_RECODE + + cpi->seq_params_locked = 1; + + // Update reference frame ids for reference frames this frame will overwrite + if (seq_params->frame_id_numbers_present_flag) { + for (int i = 0; i < REF_FRAMES; i++) { + if ((current_frame->refresh_frame_flags >> i) & 1) { + cm->ref_frame_id[i] = cm->current_frame_id; + } + } + } + +#if DUMP_RECON_FRAMES == 1 + // NOTE(zoeliu): For debug - Output the filtered reconstructed video. + dump_filtered_recon_frames(cpi); +#endif // DUMP_RECON_FRAMES + + if (cm->seg.enabled) { + if (cm->seg.update_map) { + update_reference_segmentation_map(cpi); + } else if (cm->last_frame_seg_map) { + memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map, + cm->mi_params.mi_cols * cm->mi_params.mi_rows * sizeof(uint8_t)); + } + } + + if (frame_is_intra_only(cm) == 0) { + release_scaled_references(cpi); + } + + // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., + // for the purpose to verify no mismatch between encoder and decoder. + if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame; + + refresh_reference_frames(cpi); + +#if CONFIG_ENTROPY_STATS + av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts); +#endif // CONFIG_ENTROPY_STATS + + if (features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { + *cm->fc = cpi->tile_data[largest_tile_id].tctx; + av1_reset_cdf_symbol_counters(cm->fc); + } + if (!cm->tiles.large_scale) { + cm->cur_frame->frame_context = *cm->fc; + } + + if (cpi->oxcf.ext_tile_debug) { + // (yunqing) This test ensures the correctness of large scale tile coding. + if (cm->tiles.large_scale && is_stat_consumption_stage(cpi)) { + char fn[20] = "./fc"; + fn[4] = current_frame->frame_number / 100 + '0'; + fn[5] = (current_frame->frame_number % 100) / 10 + '0'; + fn[6] = (current_frame->frame_number % 10) + '0'; + fn[7] = '\0'; + av1_print_frame_contexts(cm->fc, fn); + } + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_frame_to_data_rate_time); + + // Print out timing information. 
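+  // Accumulate this frame's component times into the running totals, then
+  // clear the per-frame counters for the next frame.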
+  int i;
+  fprintf(stderr, "\n Frame number: %d, Frame type: %s, Show Frame: %d\n",
+          cm->current_frame.frame_number,
+          get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame);
+  for (i = 0; i < kTimingComponents; i++) {
+    cpi->component_time[i] += cpi->frame_component_time[i];
+    fprintf(stderr, " %s: %" PRId64 " us (total: %" PRId64 " us)\n",
+            get_component_name(i), cpi->frame_component_time[i],
+            cpi->component_time[i]);
+    cpi->frame_component_time[i] = 0;
+  }
+#endif
+
+  cpi->last_frame_type = current_frame->frame_type;
+
+  av1_rc_postencode_update(cpi, *size);
+
+  // Clear the one-shot update flags for the segmentation map and mode/ref
+  // loop filter deltas.
+  cm->seg.update_map = 0;
+  cm->seg.update_data = 0;
+  cm->lf.mode_ref_delta_update = 0;
+
+  // A droppable frame might not be shown, but it always takes a spot in the
+  // gf group. Therefore, even when it is not shown, we still need to update
+  // the countdown.
+
+  if (cm->show_frame) {
+    // Don't increment frame counters if this was an altref buffer update,
+    // not a real frame.
+    ++current_frame->frame_number;
+  }
+
+  return AOM_CODEC_OK;
+}
+
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+               const EncodeFrameInput *const frame_input,
+               const EncodeFrameParams *const frame_params,
+               EncodeFrameResults *const frame_results) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+
+  cpi->unscaled_source = frame_input->source;
+  cpi->source = frame_input->source;
+  cpi->unscaled_last_source = frame_input->last_source;
+
+  current_frame->refresh_frame_flags = frame_params->refresh_frame_flags;
+  cm->features.error_resilient_mode = frame_params->error_resilient_mode;
+  cm->features.primary_ref_frame = frame_params->primary_ref_frame;
+  cm->current_frame.frame_type = frame_params->frame_type;
+  cm->show_frame = frame_params->show_frame;
+  cpi->ref_frame_flags = frame_params->ref_frame_flags;
+  cpi->speed = frame_params->speed;
+  cm->show_existing_frame = frame_params->show_existing_frame;
+  cpi->existing_fb_idx_to_show = frame_params->existing_fb_idx_to_show;
+
+  memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx,
+         REF_FRAMES * sizeof(*cm->remapped_ref_idx));
+
+  cpi->refresh_golden_frame = frame_params->refresh_golden_frame;
+  cpi->refresh_bwd_ref_frame = frame_params->refresh_bwd_ref_frame;
+  cpi->refresh_alt_ref_frame = frame_params->refresh_alt_ref_frame;
+
+  if (current_frame->frame_type == KEY_FRAME && cm->show_frame)
+    current_frame->frame_number = 0;
+
+  current_frame->order_hint =
+      current_frame->frame_number + frame_params->order_offset;
+  current_frame->display_order_hint = current_frame->order_hint;
+  current_frame->order_hint %=
+      (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1));
+
+  if (is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+    av1_first_pass(cpi, frame_input->ts_duration);
+#endif
+  } else if (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) {
+    if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) !=
+        AOM_CODEC_OK) {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    return AOM_CODEC_ERROR;
+  }
+
+  return AOM_CODEC_OK;
+}
+
+#if CONFIG_DENOISE
+static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
+                            int block_size, float noise_level,
+                            int64_t time_stamp, int64_t end_time) {
+  AV1_COMMON *const cm = &cpi->common;
+  if (!cpi->denoise_and_model) {
+    cpi->denoise_and_model = aom_denoise_and_model_alloc(
+        cm->seq_params.bit_depth, block_size, noise_level);
+    if (!cpi->denoise_and_model) {
+
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating denoise and model"); + return -1; + } + } + if (!cpi->film_grain_table) { + cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table)); + if (!cpi->film_grain_table) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating grain table"); + return -1; + } + memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table)); + } + if (aom_denoise_and_model_run(cpi->denoise_and_model, sd, + &cm->film_grain_params)) { + if (cm->film_grain_params.apply_grain) { + aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time, + &cm->film_grain_params); + } + } + return 0; +} +#endif + +int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time) { + AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + int res = 0; + const int subsampling_x = sd->subsampling_x; + const int subsampling_y = sd->subsampling_y; + const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; + +#if CONFIG_TUNE_VMAF + if (!is_stat_generation_stage(cpi) && + cpi->oxcf.tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING) { + av1_vmaf_frame_preprocessing(cpi, sd); + } + if (!is_stat_generation_stage(cpi) && + cpi->oxcf.tuning == AOM_TUNE_VMAF_MAX_GAIN) { + av1_vmaf_blk_preprocessing(cpi, sd); + } +#endif + +#if CONFIG_INTERNAL_STATS + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif +#if CONFIG_DENOISE + if (cpi->oxcf.noise_level > 0) + if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size, + cpi->oxcf.noise_level, time_stamp, end_time) < 0) + res = -1; +#endif // CONFIG_DENOISE + + if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, + use_highbitdepth, frame_flags)) + res = -1; +#if CONFIG_INTERNAL_STATS + aom_usec_timer_mark(&timer); + cpi->time_receive_data += aom_usec_timer_elapsed(&timer); +#endif + if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome && + (subsampling_x != 1 || subsampling_y != 1)) { + aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + "Non-4:2:0 color format requires profile 1 or 2"); + res = -1; + } + if ((seq_params->profile == PROFILE_1) && + !(subsampling_x == 0 && subsampling_y == 0)) { + aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + "Profile 1 requires 4:4:4 color format"); + res = -1; + } + if ((seq_params->profile == PROFILE_2) && + (seq_params->bit_depth <= AOM_BITS_10) && + !(subsampling_x == 1 && subsampling_y == 0)) { + aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, + "Profile 2 bit-depth < 10 requires 4:2:2 color format"); + res = -1; + } + + return res; +} + +#if CONFIG_INTERNAL_STATS +extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch, + const unsigned char *img2, int img2_pitch, + int width, int height); + +static void adjust_image_stat(double y, double u, double v, double all, + ImageStat *s) { + s->stat[STAT_Y] += y; + s->stat[STAT_U] += u; + s->stat[STAT_V] += v; + s->stat[STAT_ALL] += all; + s->worst = AOMMIN(s->worst, all); +} + +static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { + AV1_COMMON *const cm = &cpi->common; + double samples = 0.0; + const uint32_t in_bit_depth = cpi->oxcf.input_bit_depth; + const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; + +#if CONFIG_INTER_STATS_ONLY + if (cm->current_frame.frame_type == KEY_FRAME) return; // skip key frame +#endif + cpi->bytes += frame_bytes; + if (cm->show_frame) { + const 
YV12_BUFFER_CONFIG *orig = cpi->source; + const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; + double y, u, v, frame_all; + + cpi->count++; + if (cpi->b_calculate_psnr) { + PSNR_STATS psnr; + double frame_ssim2 = 0.0, weight = 0.0; + aom_clear_system_state(); +#if CONFIG_AV1_HIGHBITDEPTH + aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth); +#else + aom_calc_psnr(orig, recon, &psnr); +#endif + adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0], + &cpi->psnr); + cpi->total_sq_error += psnr.sse[0]; + cpi->total_samples += psnr.samples[0]; + samples = psnr.samples[0]; + // TODO(yaowu): unify these two versions into one. + if (cm->seq_params.use_highbitdepth) + frame_ssim2 = + aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth); + else + frame_ssim2 = aom_calc_ssim(orig, recon, &weight); + + cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2); + cpi->summed_quality += frame_ssim2 * weight; + cpi->summed_weights += weight; + +#if 0 + { + FILE *f = fopen("q_used.stt", "a"); + double y2 = psnr.psnr[1]; + double u2 = psnr.psnr[2]; + double v2 = psnr.psnr[3]; + double frame_psnr2 = psnr.psnr[0]; + fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", + cm->current_frame.frame_number, y2, u2, v2, + frame_psnr2, frame_ssim2); + fclose(f); + } +#endif + } + if (cpi->b_calculate_blockiness) { + if (!cm->seq_params.use_highbitdepth) { + const double frame_blockiness = + av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer, + recon->y_stride, orig->y_width, orig->y_height); + cpi->worst_blockiness = AOMMAX(cpi->worst_blockiness, frame_blockiness); + cpi->total_blockiness += frame_blockiness; + } + + if (cpi->b_calculate_consistency) { + if (!cm->seq_params.use_highbitdepth) { + const double this_inconsistency = aom_get_ssim_metrics( + orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, + orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1); + + const double peak = (double)((1 << in_bit_depth) - 1); + const double consistency = + aom_sse_to_psnr(samples, peak, cpi->total_inconsistency); + if (consistency > 0.0) + cpi->worst_consistency = + AOMMIN(cpi->worst_consistency, consistency); + cpi->total_inconsistency += this_inconsistency; + } + } + } + + frame_all = + aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); + adjust_image_stat(y, u, v, frame_all, &cpi->fastssim); + frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth); + adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs); + } +} +#endif // CONFIG_INTERNAL_STATS +int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, + size_t *size, uint8_t *dest, int64_t *time_stamp, + int64_t *time_end, int flush, + const aom_rational64_t *timestamp_ratio) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + +#if CONFIG_BITSTREAM_DEBUG + assert(cpi->oxcf.max_threads == 0 && + "bitstream debug tool does not support multithreading"); + bitstream_queue_record_write(); + aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number * 2 + + cm->show_frame); +#endif + if (cpi->use_svc && cm->number_spatial_layers > 1) { + av1_one_pass_cbr_svc_start_layer(cpi); + } + + cm->showable_frame = 0; + *size = 0; +#if CONFIG_INTERNAL_STATS + struct aom_usec_timer cmptimer; + aom_usec_timer_start(&cmptimer); +#endif + av1_set_high_precision_mv(cpi, 1, 0); + + // Normal defaults + cm->features.refresh_frame_context = oxcf->frame_parallel_decoding_mode + ? 
REFRESH_FRAME_CONTEXT_DISABLED + : REFRESH_FRAME_CONTEXT_BACKWARD; + if (oxcf->large_scale_tile) + cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + + // Initialize fields related to forward keyframes + cpi->no_show_kf = 0; + + if (assign_cur_frame_new_fb(cm) == NULL) return AOM_CODEC_ERROR; + + const int result = + av1_encode_strategy(cpi, size, dest, frame_flags, time_stamp, time_end, + timestamp_ratio, flush); + if (result != AOM_CODEC_OK && result != -1) { + return AOM_CODEC_ERROR; + } else if (result == -1) { + // Returning -1 indicates no frame encoded; more input is required + return -1; + } +#if CONFIG_INTERNAL_STATS + aom_usec_timer_mark(&cmptimer); + cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer); +#endif // CONFIG_INTERNAL_STATS + if (cpi->b_calculate_psnr) { + if (cm->show_existing_frame || + (!is_stat_generation_stage(cpi) && cm->show_frame)) { + generate_psnr_packet(cpi); + } + } + +#if CONFIG_TUNE_VMAF + if (!is_stat_generation_stage(cpi) && + (oxcf->tuning == AOM_TUNE_VMAF_WITH_PREPROCESSING || + oxcf->tuning == AOM_TUNE_VMAF_WITHOUT_PREPROCESSING || + oxcf->tuning == AOM_TUNE_VMAF_MAX_GAIN)) { + av1_update_vmaf_curve(cpi, cpi->source, &cpi->common.cur_frame->buf); + } +#endif + + if (cpi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) { + // Initialize level info. at the beginning of each sequence. + if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) { + av1_init_level_info(cpi); + } + av1_update_level_info(cpi, *size, *time_stamp, *time_end); + } + +#if CONFIG_INTERNAL_STATS + if (!is_stat_generation_stage(cpi)) { + compute_internal_stats(cpi, (int)(*size)); + } +#endif // CONFIG_INTERNAL_STATS +#if CONFIG_SPEED_STATS + if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) { + cpi->tx_search_count += cpi->td.mb.tx_search_count; + cpi->td.mb.tx_search_count = 0; + } +#endif // CONFIG_SPEED_STATS + + aom_clear_system_state(); + + return AOM_CODEC_OK; +} + +int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) { + AV1_COMMON *cm = &cpi->common; + if (!cm->show_frame) { + return -1; + } else { + int ret; + if (cm->cur_frame != NULL) { + *dest = cm->cur_frame->buf; + dest->y_width = cm->width; + dest->y_height = cm->height; + dest->uv_width = cm->width >> cm->seq_params.subsampling_x; + dest->uv_height = cm->height >> cm->seq_params.subsampling_y; + ret = 0; + } else { + ret = -1; + } + aom_clear_system_state(); + return ret; + } +} + +int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) { + if (cpi->last_show_frame_buf == NULL) return -1; + + *frame = cpi->last_show_frame_buf->buf; + return 0; +} + +static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + return a->y_height == b->y_height && a->y_width == b->y_width && + a->uv_height == b->uv_height && a->uv_width == b->uv_width && + a->y_stride == b->y_stride && a->uv_stride == b->uv_stride && + a->border == b->border && + (a->flags & YV12_FLAG_HIGHBITDEPTH) == + (b->flags & YV12_FLAG_HIGHBITDEPTH); +} + +aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, + YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *sd) { + const int num_planes = av1_num_planes(cm); + if (!equal_dimensions_and_border(new_frame, sd)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Incorrect buffer dimensions"); + else + aom_yv12_copy_frame(new_frame, sd, num_planes); + + return cm->error.error_code; +} + +int av1_set_internal_size(AV1EncoderConfig *const oxcf, + ResizePendingParams 
*resize_pending_params, + AOM_SCALING horiz_mode, AOM_SCALING vert_mode) { + int hr = 0, hs = 0, vr = 0, vs = 0; + + if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1; + + Scale2Ratio(horiz_mode, &hr, &hs); + Scale2Ratio(vert_mode, &vr, &vs); + + // always go to the next whole number + resize_pending_params->width = (hs - 1 + oxcf->width * hr) / hs; + resize_pending_params->height = (vs - 1 + oxcf->height * vr) / vs; + + return 0; +} + +int av1_get_quantizer(AV1_COMP *cpi) { + return cpi->common.quant_params.base_qindex; +} + +int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) { + size_t output_size = 0; + size_t total_bytes_read = 0; + size_t remaining_size = *frame_size; + uint8_t *buff_ptr = buffer; + + // go through each OBUs + while (total_bytes_read < *frame_size) { + uint8_t saved_obu_header[2]; + uint64_t obu_payload_size; + size_t length_of_payload_size; + size_t length_of_obu_size; + uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 2 : 1; + size_t obu_bytes_read = obu_header_size; // bytes read for current obu + + // save the obu header (1 or 2 bytes) + memmove(saved_obu_header, buff_ptr, obu_header_size); + // clear the obu_has_size_field + saved_obu_header[0] = saved_obu_header[0] & (~0x2); + + // get the payload_size and length of payload_size + if (aom_uleb_decode(buff_ptr + obu_header_size, remaining_size, + &obu_payload_size, &length_of_payload_size) != 0) { + return AOM_CODEC_ERROR; + } + obu_bytes_read += length_of_payload_size; + + // calculate the length of size of the obu header plus payload + length_of_obu_size = + aom_uleb_size_in_bytes((uint64_t)(obu_header_size + obu_payload_size)); + + // move the rest of data to new location + memmove(buff_ptr + length_of_obu_size + obu_header_size, + buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read); + obu_bytes_read += (size_t)obu_payload_size; + + // write the new obu size + const uint64_t obu_size = obu_header_size + obu_payload_size; + size_t coded_obu_size; + if (aom_uleb_encode(obu_size, sizeof(obu_size), buff_ptr, + &coded_obu_size) != 0) { + return AOM_CODEC_ERROR; + } + + // write the saved (modified) obu_header following obu size + memmove(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size); + + total_bytes_read += obu_bytes_read; + remaining_size -= obu_bytes_read; + buff_ptr += length_of_obu_size + obu_size; + output_size += length_of_obu_size + (size_t)obu_size; + } + + *frame_size = output_size; + return AOM_CODEC_OK; +} + +static void svc_set_updates_external_ref_frame_config( + ExternalFlags *const ext_flags, SVC *const svc) { + ext_flags->refresh_frame_flags_pending = 1; + ext_flags->refresh_last_frame = svc->refresh[svc->ref_idx[0]]; + ext_flags->refresh_golden_frame = svc->refresh[svc->ref_idx[3]]; + ext_flags->refresh_bwd_ref_frame = svc->refresh[svc->ref_idx[4]]; + ext_flags->refresh_alt2_ref_frame = svc->refresh[svc->ref_idx[5]]; + ext_flags->refresh_alt_ref_frame = svc->refresh[svc->ref_idx[6]]; + svc->non_reference_frame = 1; + for (int i = 0; i < REF_FRAMES; i++) { + if (svc->refresh[i] == 1) { + svc->non_reference_frame = 0; + break; + } + } +} + +static int svc_set_references_external_ref_frame_config(AV1_COMP *cpi) { + // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). 
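+  // Start from "all references allowed" and clear the bit for each slot the
+  // external ref-frame config marked as unused.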
+ int ref = AOM_REFFRAME_ALL; + for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { + if (!cpi->svc.reference[i]) ref ^= (1 << i); + } + return ref; +} + +void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { + // TODO(yunqingwang): For what references to use, external encoding flags + // should be consistent with internal reference frame selection. Need to + // ensure that there is not conflict between the two. In AV1 encoder, the + // priority rank for 7 reference frames are: LAST, ALTREF, LAST2, LAST3, + // GOLDEN, BWDREF, ALTREF2. + + ExternalFlags *const ext_flags = &cpi->ext_flags; + ext_flags->ref_frame_flags = AOM_REFFRAME_ALL; + if (flags & + (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2)) { + int ref = AOM_REFFRAME_ALL; + + if (flags & AOM_EFLAG_NO_REF_LAST) ref ^= AOM_LAST_FLAG; + if (flags & AOM_EFLAG_NO_REF_LAST2) ref ^= AOM_LAST2_FLAG; + if (flags & AOM_EFLAG_NO_REF_LAST3) ref ^= AOM_LAST3_FLAG; + + if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG; + + if (flags & AOM_EFLAG_NO_REF_ARF) { + ref ^= AOM_ALT_FLAG; + ref ^= AOM_BWD_FLAG; + ref ^= AOM_ALT2_FLAG; + } else { + if (flags & AOM_EFLAG_NO_REF_BWD) ref ^= AOM_BWD_FLAG; + if (flags & AOM_EFLAG_NO_REF_ARF2) ref ^= AOM_ALT2_FLAG; + } + + av1_use_as_reference(&ext_flags->ref_frame_flags, ref); + } else { + if (cpi->svc.external_ref_frame_config) { + int ref = svc_set_references_external_ref_frame_config(cpi); + av1_use_as_reference(&ext_flags->ref_frame_flags, ref); + } + } + + if (flags & + (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) { + int upd = AOM_REFFRAME_ALL; + + // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag. 
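+    // Clearing AOM_LAST_FLAG here therefore suppresses LAST2/LAST3 updates
+    // as well.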
+ if (flags & AOM_EFLAG_NO_UPD_LAST) upd ^= AOM_LAST_FLAG; + + if (flags & AOM_EFLAG_NO_UPD_GF) upd ^= AOM_GOLD_FLAG; + + if (flags & AOM_EFLAG_NO_UPD_ARF) { + upd ^= AOM_ALT_FLAG; + upd ^= AOM_BWD_FLAG; + upd ^= AOM_ALT2_FLAG; + } + + ext_flags->refresh_last_frame = (upd & AOM_LAST_FLAG) != 0; + ext_flags->refresh_golden_frame = (upd & AOM_GOLD_FLAG) != 0; + ext_flags->refresh_alt_ref_frame = (upd & AOM_ALT_FLAG) != 0; + ext_flags->refresh_bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0; + ext_flags->refresh_alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0; + ext_flags->refresh_frame_flags_pending = 1; + } else { + if (cpi->svc.external_ref_frame_config) + svc_set_updates_external_ref_frame_config(ext_flags, &cpi->svc); + else + ext_flags->refresh_frame_flags_pending = 0; + } + + ext_flags->use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs & + ((flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0); + ext_flags->use_error_resilient = cpi->oxcf.error_resilient_mode | + ((flags & AOM_EFLAG_ERROR_RESILIENT) != 0); + ext_flags->use_s_frame = + cpi->oxcf.s_frame_mode | ((flags & AOM_EFLAG_SET_S_FRAME) != 0); + ext_flags->use_primary_ref_none = + (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0; + + if (flags & AOM_EFLAG_NO_UPD_ENTROPY) { + av1_update_entropy(&ext_flags->refresh_frame_context, + &ext_flags->refresh_frame_context_pending, 0); + } +} + +aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) { + if (!cpi) return NULL; + + uint8_t header_buf[512] = { 0 }; + const uint32_t sequence_header_size = + av1_write_sequence_header_obu(&cpi->common.seq_params, &header_buf[0]); + assert(sequence_header_size <= sizeof(header_buf)); + if (sequence_header_size == 0) return NULL; + + const size_t obu_header_size = 1; + const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size); + const size_t payload_offset = obu_header_size + size_field_size; + + if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL; + memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size); + + if (av1_write_obu_header(&cpi->level_params, OBU_SEQUENCE_HEADER, 0, + &header_buf[0]) != obu_header_size) { + return NULL; + } + + size_t coded_size_field_size = 0; + if (aom_uleb_encode(sequence_header_size, size_field_size, + &header_buf[obu_header_size], + &coded_size_field_size) != 0) { + return NULL; + } + assert(coded_size_field_size == size_field_size); + + aom_fixed_buf_t *global_headers = + (aom_fixed_buf_t *)malloc(sizeof(*global_headers)); + if (!global_headers) return NULL; + + const size_t global_header_buf_size = + obu_header_size + size_field_size + sequence_header_size; + + global_headers->buf = malloc(global_header_buf_size); + if (!global_headers->buf) { + free(global_headers); + return NULL; + } + + memcpy(global_headers->buf, &header_buf[0], global_header_buf_size); + global_headers->sz = global_header_buf_size; + return global_headers; +} diff --git a/libs/libaom/src/av1/encoder/encoder.h b/libs/libaom/src/av1/encoder/encoder.h new file mode 100644 index 000000000..82d00cb76 --- /dev/null +++ b/libs/libaom/src/av1/encoder/encoder.h @@ -0,0 +1,1965 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_H_
+#define AOM_AV1_ENCODER_ENCODER_H_
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomcx.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/enums.h"
+#include "av1/common/resize.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/timing.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/level.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/svc_layercontext.h"
+#include "av1/encoder/tokenize.h"
+
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_dsp/variance.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Number of frames required to test for scene cut detection
+#define SCENE_CUT_KEY_TEST_INTERVAL 16
+
+// Rational number with an int64 numerator
+// This structure holds a fractional value
+typedef struct aom_rational64 {
+  int64_t num;       // fraction numerator
+  int den;           // fraction denominator
+} aom_rational64_t;  // alias for struct aom_rational64
+
+typedef struct {
+#if CONFIG_SUPERRES_IN_RECODE
+  struct loopfilter lf;
+  CdefInfo cdef_info;
+  YV12_BUFFER_CONFIG copy_buffer;
+  RATE_CONTROL rc;
+#endif  // CONFIG_SUPERRES_IN_RECODE
+} CODING_CONTEXT;
+
+enum {
+  NORMAL = 0,
+  FOURFIVE = 1,
+  THREEFIVE = 2,
+  ONETWO = 3
+} UENUM1BYTE(AOM_SCALING);
+
+enum {
+  // Good Quality Fast Encoding. The encoder balances quality with the amount of
+  // time it takes to encode the output. Speed setting controls how fast.
+  GOOD,
+  // Realtime Fast Encoding. Will force some restrictions on bitrate
+  // constraints.
+  REALTIME
+} UENUM1BYTE(MODE);
+
+enum {
+  FRAMEFLAGS_KEY = 1 << 0,
+  FRAMEFLAGS_GOLDEN = 1 << 1,
+  FRAMEFLAGS_BWDREF = 1 << 2,
+  // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME
+  FRAMEFLAGS_ALTREF = 1 << 3,
+  FRAMEFLAGS_INTRAONLY = 1 << 4,
+  FRAMEFLAGS_SWITCH = 1 << 5,
+  FRAMEFLAGS_ERROR_RESILIENT = 1 << 6,
+} UENUM1BYTE(FRAMETYPE_FLAGS);
+
+enum {
+  NO_AQ = 0,
+  VARIANCE_AQ = 1,
+  COMPLEXITY_AQ = 2,
+  CYCLIC_REFRESH_AQ = 3,
+  AQ_MODE_COUNT  // This should always be the last member of the enum
+} UENUM1BYTE(AQ_MODE);
+enum {
+  NO_DELTA_Q = 0,
+  DELTA_Q_OBJECTIVE = 1,   // Modulation to improve objective quality
+  DELTA_Q_PERCEPTUAL = 2,  // Modulation to improve perceptual quality
+  DELTA_Q_MODE_COUNT       // This should always be the last member of the enum
+} UENUM1BYTE(DELTAQ_MODE);
+
+enum {
+  RESIZE_NONE = 0,    // No frame resizing allowed.
+  RESIZE_FIXED = 1,   // All frames are coded at the specified scale.
+  RESIZE_RANDOM = 2,  // All frames are coded at a random scale.
+  RESIZE_MODES
+} UENUM1BYTE(RESIZE_MODE);
+
+enum {
+  SUPERRES_NONE,    // No frame superres allowed.
+  SUPERRES_FIXED,   // All frames are coded at the specified scale,
+                    // and super-resolved.
+  SUPERRES_RANDOM,   // All frames are coded at a random scale,
+                     // and super-resolved.
+  SUPERRES_QTHRESH,  // Superres scale for a frame is determined based on
+                     // q_index.
+  SUPERRES_AUTO,     // Automatically select superres for appropriate frames.
+  SUPERRES_MODES
+} UENUM1BYTE(SUPERRES_MODE);
+
+typedef enum {
+  kInvalid = 0,
+  kLowSad = 1,
+  kHighSad = 2,
+  kLowVarHighSumdiff = 3,
+} CONTENT_STATE_SB;
+
+enum {
+  SS_CFG_SRC = 0,
+  SS_CFG_LOOKAHEAD = 1,
+  SS_CFG_FPF = 2,
+  SS_CFG_TOTAL = 3
+} UENUM1BYTE(SS_CFG_OFFSET);
+
+// TODO(jingning): This needs to be cleaned up next.
+#define MAX_LENGTH_TPL_FRAME_STATS (MAX_TOTAL_BUFFERS + REF_FRAMES + 1)
+
+typedef struct TplDepStats {
+  int64_t intra_cost;
+  int64_t inter_cost;
+  int64_t srcrf_dist;
+  int64_t recrf_dist;
+  int64_t srcrf_rate;
+  int64_t recrf_rate;
+  int64_t mc_dep_rate;
+  int64_t mc_dep_dist;
+  int_mv mv[INTER_REFS_PER_FRAME];
+  int ref_frame_index;
+  int64_t pred_error[INTER_REFS_PER_FRAME];
+  int64_t mc_count;
+  int64_t mc_saved;
+} TplDepStats;
+
+typedef struct TplDepFrame {
+  uint8_t is_valid;
+  TplDepStats *tpl_stats_ptr;
+  const YV12_BUFFER_CONFIG *gf_picture;
+  YV12_BUFFER_CONFIG *rec_picture;
+  int ref_map_index[REF_FRAMES];
+  int stride;
+  int width;
+  int height;
+  int mi_rows;
+  int mi_cols;
+  unsigned int frame_display_index;
+  int base_rdmult;
+} TplDepFrame;
+
+typedef struct TplParams {
+  // Block granularity of tpl score storage.
+  uint8_t tpl_stats_block_mis_log2;
+
+  // Buffer to store the frame level tpl information for each frame in a gf
+  // group. tpl_stats_buffer[i] stores the tpl information of ith frame in a gf
+  // group
+  TplDepFrame tpl_stats_buffer[MAX_LENGTH_TPL_FRAME_STATS];
+
+  // Buffer to store tpl stats at block granularity.
+  // tpl_stats_pool[i][j] stores the tpl stats of jth block of ith frame in a gf
+  // group.
+  TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS];
+
+  // Buffer to store tpl reconstructed frame.
+  // tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group.
+  YV12_BUFFER_CONFIG tpl_rec_pool[MAX_LAG_BUFFERS];
+
+  // Pointer to tpl_stats_buffer.
+  TplDepFrame *tpl_frame;
+} TplParams;
+
+typedef enum {
+  COST_UPD_SB,
+  COST_UPD_SBROW,
+  COST_UPD_TILE,
+  COST_UPD_OFF,
+} COST_UPDATE_TYPE;
+
+#define TPL_DEP_COST_SCALE_LOG2 4
+
+typedef struct AV1EncoderConfig {
+  BITSTREAM_PROFILE profile;
+  aom_bit_depth_t bit_depth;     // Codec bit-depth.
+  int width;                     // width of data passed to the compressor
+  int height;                    // height of data passed to the compressor
+  int forced_max_frame_width;    // forced maximum width of frame (if != 0)
+  int forced_max_frame_height;   // forced maximum height of frame (if != 0)
+  unsigned int input_bit_depth;  // Input bit depth.
+  double init_framerate;         // set to passed in framerate
+  int64_t target_bandwidth;      // bandwidth to be used in bits per second
+
+  int noise_sensitivity;  // pre-processing blur: recommendation 0
+  int sharpness;          // sharpening output: recommendation 0
+  int speed;
+  // maximum allowed bitrate for any intra frame in % of bitrate target.
+  unsigned int rc_max_intra_bitrate_pct;
+  // maximum allowed bitrate for any inter frame in % of bitrate target.
+  unsigned int rc_max_inter_bitrate_pct;
+  // percent of rate boost for golden frame in CBR mode.
+  unsigned int gf_cbr_boost_pct;
+
+  MODE mode;
+  int pass;
+
+  // Key Framing Operations
+  int auto_key;  // autodetect cut scenes and set the keyframes
+  int key_freq;  // maximum distance to key frame.
+ int sframe_dist; + int sframe_mode; + int sframe_enabled; + int lag_in_frames; // how many frames lag before we start encoding + int fwd_kf_enabled; + + // ---------------------------------------------------------------- + // DATARATE CONTROL OPTIONS + + // vbr, cbr, constrained quality or constant quality + enum aom_rc_mode rc_mode; + + // buffer targeting aggressiveness + int under_shoot_pct; + int over_shoot_pct; + + // buffering parameters + int64_t starting_buffer_level_ms; + int64_t optimal_buffer_level_ms; + int64_t maximum_buffer_size_ms; + + // Frame drop threshold. + int drop_frames_water_mark; + + // controlling quality + int fixed_q; + int worst_allowed_q; + int best_allowed_q; + int cq_level; + int enable_chroma_deltaq; + AQ_MODE aq_mode; // Adaptive Quantization mode + DELTAQ_MODE deltaq_mode; + int deltalf_mode; + int enable_cdef; + int enable_restoration; + int force_video_mode; + int enable_obmc; + int disable_trellis_quant; + int using_qm; + int qm_y; + int qm_u; + int qm_v; + int qm_minlevel; + int qm_maxlevel; + unsigned int num_tile_groups; + unsigned int mtu; + + // Internal frame size scaling. + RESIZE_MODE resize_mode; + uint8_t resize_scale_denominator; + uint8_t resize_kf_scale_denominator; + + // Frame Super-Resolution size scaling. + SUPERRES_MODE superres_mode; + uint8_t superres_scale_denominator; + uint8_t superres_kf_scale_denominator; + int superres_qthresh; + int superres_kf_qthresh; + + // Enable feature to reduce the frame quantization every x frames. + int frame_periodic_boost; + + // two pass datarate control + int two_pass_vbrbias; // two pass datarate control tweaks + int two_pass_vbrmin_section; + int two_pass_vbrmax_section; + // END DATARATE CONTROL OPTIONS + // ---------------------------------------------------------------- + + int enable_auto_arf; + int enable_auto_brf; // (b)ackward (r)ef (f)rame + + /* Bitfield defining the error resiliency features to enable. + * Can provide decodable frames after losses in previous + * frames and decodable partitions after losses in the same frame. + */ + unsigned int error_resilient_mode; + + unsigned int s_frame_mode; + + /* Bitfield defining the parallel decoding mode where the + * decoding in successive frames may be conducted in parallel + * just by decoding the frame headers. 
+ */ + unsigned int frame_parallel_decoding_mode; + + unsigned int limit; + + int arnr_max_frames; + int arnr_strength; + + int min_gf_interval; + int max_gf_interval; + int gf_min_pyr_height; + int gf_max_pyr_height; + + int row_mt; + int tile_columns; + int tile_rows; + int tile_width_count; + int tile_height_count; + int tile_widths[MAX_TILE_COLS]; + int tile_heights[MAX_TILE_ROWS]; + + int enable_tpl_model; + int enable_keyframe_filtering; + + int max_threads; + + aom_fixed_buf_t two_pass_stats_in; + + aom_tune_metric tuning; + const char *vmaf_model_path; + aom_tune_content content; + int use_highbitdepth; + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; + aom_chroma_sample_position_t chroma_sample_position; + int color_range; + int render_width; + int render_height; + int timing_info_present; + aom_timing_info_t timing_info; + int decoder_model_info_present_flag; + int display_model_info_present_flag; + int buffer_removal_time_present; + aom_dec_model_info_t buffer_model; + int film_grain_test_vector; + const char *film_grain_table_filename; + + uint8_t cdf_update_mode; + aom_superblock_size_t superblock_size; + unsigned int large_scale_tile; + unsigned int single_tile_decoding; + uint8_t monochrome; + unsigned int full_still_picture_hdr; + int enable_dual_filter; + unsigned int motion_vector_unit_test; + unsigned int sb_multipass_unit_test; + unsigned int ext_tile_debug; + int enable_rect_partitions; + int enable_ab_partitions; + int enable_1to4_partitions; + int min_partition_size; + int max_partition_size; + int enable_intra_edge_filter; + int enable_tx64; + int enable_flip_idtx; + int enable_order_hint; + int enable_dist_wtd_comp; + int enable_ref_frame_mvs; + unsigned int max_reference_frames; + int enable_reduced_reference_set; + unsigned int allow_ref_frame_mvs; + int enable_masked_comp; + int enable_onesided_comp; + int enable_interintra_comp; + int enable_smooth_interintra; + int enable_diff_wtd_comp; + int enable_interinter_wedge; + int enable_interintra_wedge; + int enable_global_motion; + int enable_warped_motion; + int allow_warped_motion; + int enable_filter_intra; + int enable_smooth_intra; + int enable_paeth_intra; + int enable_cfl_intra; + int enable_superres; + int enable_overlay; + int enable_palette; + int enable_intrabc; + int enable_angle_delta; + unsigned int save_as_annexb; + +#if CONFIG_DENOISE + float noise_level; + int noise_block_size; +#endif + + unsigned int chroma_subsampling_x; + unsigned int chroma_subsampling_y; + int reduced_tx_type_set; + int use_intra_dct_only; + int use_inter_dct_only; + int use_intra_default_tx_only; + int quant_b_adapt; + COST_UPDATE_TYPE coeff_cost_upd_freq; + COST_UPDATE_TYPE mode_cost_upd_freq; + COST_UPDATE_TYPE mv_cost_upd_freq; + int border_in_pixels; + AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; + // Bit mask to specify which tier each of the 32 possible operating points + // conforms to. + unsigned int tier_mask; + // If true, encoder will use fixed QP offsets, that are either: + // - Given by the user, and stored in 'fixed_qp_offsets' array, OR + // - Picked automatically from cq_level. + int use_fixed_qp_offsets; + // List of QP offsets for: keyframe, ALTREF, and 3 levels of internal ARFs. + // If any of these values are negative, fixed offsets are disabled. + // Uses internal q range. 
+ double fixed_qp_offsets[FIXED_QP_OFFSET_COUNT]; + // min_cr / 100 is the target minimum compression ratio for each frame. + unsigned int min_cr; + const cfg_options_t *encoder_cfg; +} AV1EncoderConfig; + +static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) { + return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; +} + +typedef struct { + // obmc_probs[i][j] is the probability of OBMC being the best motion mode for + // jth block size and ith frame update type, averaged over past frames. If + // obmc_probs[i][j] < thresh, then OBMC search is pruned. + int obmc_probs[FRAME_UPDATE_TYPES][BLOCK_SIZES_ALL]; + + // warped_probs[i] is the probability of warped motion being the best motion + // mode for ith frame update type, averaged over past frames. If + // warped_probs[i] < thresh, then warped motion search is pruned. + int warped_probs[FRAME_UPDATE_TYPES]; + + // tx_type_probs[i][j][k] is the probability of kth tx_type being the best + // for jth transform size and ith frame update type, averaged over past + // frames. If tx_type_probs[i][j][k] < thresh, then transform search for that + // type is pruned. + int tx_type_probs[FRAME_UPDATE_TYPES][TX_SIZES_ALL][TX_TYPES]; + + // switchable_interp_probs[i][j][k] is the probability of kth interpolation + // filter being the best for jth filter context and ith frame update type, + // averaged over past frames. If switchable_interp_probs[i][j][k] < thresh, + // then interpolation filter search is pruned for that case. + int switchable_interp_probs[FRAME_UPDATE_TYPES][SWITCHABLE_FILTER_CONTEXTS] + [SWITCHABLE_FILTERS]; +} FrameProbInfo; + +typedef struct FRAME_COUNTS { +// Note: This structure should only contain 'unsigned int' fields, or +// aggregates built solely from 'unsigned int' fields/elements +#if CONFIG_ENTROPY_STATS + unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES]; + unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1]; + unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES]; + unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES]; + unsigned int cfl_sign[CFL_JOINT_SIGNS]; + unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE]; + unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2]; + unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2]; + unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES]; + unsigned int palette_y_color_index[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + unsigned int palette_uv_color_index[PALETTE_SIZES] + [PALETTE_COLOR_INDEX_CONTEXTS] + [PALETTE_COLORS]; + unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES]; + unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2]; + unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES] + [EOB_COEF_CONTEXTS][2]; + unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2]; + unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS] + [2]; + unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2]; + unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5]; + unsigned int eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6]; + unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7]; + unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8]; + unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9]; + unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10]; + unsigned int 
eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11];
+  unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+                              [LEVEL_CONTEXTS][BR_CDF_SIZE];
+  unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+                               [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2];
+  unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+                                   [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1];
+  unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
+  unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2];
+  unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
+  unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
+  unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+  unsigned int wedge_idx[BLOCK_SIZES_ALL][16];
+  unsigned int interintra[BLOCK_SIZE_GROUPS][2];
+  unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+  unsigned int wedge_interintra[BLOCK_SIZES_ALL][2];
+  unsigned int compound_type[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
+  unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES];
+  unsigned int obmc[BLOCK_SIZES_ALL][2];
+  unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
+  unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
+  unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2];
+  unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2];
+  unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2];
+  unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2];
+  unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2];
+  unsigned int intrabc[2];
+
+  unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+  unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1];
+  unsigned int skip_mode[SKIP_MODE_CONTEXTS][2];
+  unsigned int skip[SKIP_CONTEXTS][2];
+  unsigned int compound_index[COMP_INDEX_CONTEXTS][2];
+  unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2];
+  unsigned int delta_q[DELTA_Q_PROBS][2];
+  unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2];
+  unsigned int delta_lf[DELTA_LF_PROBS][2];
+
+  unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+  unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+                           [TX_TYPES];
+  unsigned int filter_intra_mode[FILTER_INTRA_MODES];
+  unsigned int filter_intra[BLOCK_SIZES_ALL][2];
+  unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES];
+  unsigned int wiener_restore[2];
+  unsigned int sgrproj_restore[2];
+#endif  // CONFIG_ENTROPY_STATS
+
+  unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+                                [SWITCHABLE_FILTERS];
+} FRAME_COUNTS;
+
+#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
+
+typedef struct {
+  int ready;
+  double a;
+  double b;
+  double dist_mean;
+  double ld_mean;
+  double sse_mean;
+  double sse_sse_mean;
+  double sse_ld_mean;
+  int num;
+  double dist_sum;
+  double ld_sum;
+  double sse_sum;
+  double sse_sse_sum;
+  double sse_ld_sum;
+} InterModeRdModel;
+
+typedef struct {
+  int idx;
+  int64_t rd;
+} RdIdxPair;
+// TODO(angiebird): This is an estimated size. We still need to figure out the
+// maximum number of modes.
+#define MAX_INTER_MODES 1024 +typedef struct inter_modes_info { + int num; + MB_MODE_INFO mbmi_arr[MAX_INTER_MODES]; + int mode_rate_arr[MAX_INTER_MODES]; + int64_t sse_arr[MAX_INTER_MODES]; + int64_t est_rd_arr[MAX_INTER_MODES]; + RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES]; + RD_STATS rd_cost_arr[MAX_INTER_MODES]; + RD_STATS rd_cost_y_arr[MAX_INTER_MODES]; + RD_STATS rd_cost_uv_arr[MAX_INTER_MODES]; +} InterModesInfo; + +// Encoder row synchronization +typedef struct AV1RowMTSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + // Allocate memory to store the sb/mb block index in each row. + int *cur_col; + int sync_range; + int rows; +} AV1RowMTSync; + +typedef struct AV1RowMTInfo { + int current_mi_row; + int num_threads_working; +} AV1RowMTInfo; + +typedef struct { + // TODO(kyslov): consider changing to 64bit + + // This struct is used for computing variance in choose_partitioning(), where + // the max number of samples within a superblock is 32x32 (with 4x4 avg). + // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32 + // * 32 = 2^26). For high bitdepth we need to consider changing this to 64 bit + uint32_t sum_square_error; + int32_t sum_error; + int log2_count; + int variance; +} VPartVar; + +typedef struct { + VPartVar none; + VPartVar horz[2]; + VPartVar vert[2]; +} VPVariance; + +typedef struct { + VPVariance part_variances; + VPartVar split[4]; +} VP4x4; + +typedef struct { + VPVariance part_variances; + VP4x4 split[4]; +} VP8x8; + +typedef struct { + VPVariance part_variances; + VP8x8 split[4]; +} VP16x16; + +typedef struct { + VPVariance part_variances; + VP16x16 split[4]; +} VP32x32; + +typedef struct { + VPVariance part_variances; + VP32x32 split[4]; +} VP64x64; + +typedef struct { + VPVariance part_variances; + VP64x64 *split; +} VP128x128; + +typedef struct { + // Thresholds for variance based partitioning. If block variance > threshold, + // then that block is forced to split. + // thresholds[0] - threshold for 128x128; + // thresholds[1] - threshold for 64x64; + // thresholds[2] - threshold for 32x32; + // thresholds[3] - threshold for 16x16; + // thresholds[4] - threshold for 8x8; + int64_t thresholds[5]; + + // MinMax variance threshold for 8x8 sub blocks of a 16x16 block. If actual + // minmax > threshold_minmax, the 16x16 is forced to split. + int64_t threshold_minmax; +} VarBasedPartitionInfo; + +// TODO(jingning) All spatially adaptive variables should go to TileDataEnc. +typedef struct TileDataEnc { + TileInfo tile_info; + CFL_CTX cfl; + DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); + FRAME_CONTEXT *row_ctx; + uint8_t allow_update_cdf; + InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; + AV1RowMTSync row_mt_sync; + AV1RowMTInfo row_mt_info; +} TileDataEnc; + +typedef struct { + TOKENEXTRA *start; + TOKENEXTRA *stop; + unsigned int count; +} TOKENLIST; + +typedef struct MultiThreadHandle { + int allocated_tile_rows; + int allocated_tile_cols; + int allocated_sb_rows; + int thread_id_to_tile_id[MAX_NUM_THREADS]; // Mapping of threads to tiles +} MultiThreadHandle; + +typedef struct RD_COUNTS { + int64_t comp_pred_diff[REFERENCE_MODES]; + // Stores number of 4x4 blocks using global motion per reference frame. 
+ int global_motion_used[REF_FRAMES]; + int compound_ref_used_flag; + int skip_mode_used_flag; + int tx_type_used[TX_SIZES_ALL][TX_TYPES]; + int obmc_used[BLOCK_SIZES_ALL][2]; + int warped_used[2]; +} RD_COUNTS; + +typedef struct ThreadData { + MACROBLOCK mb; + RD_COUNTS rd_counts; + FRAME_COUNTS *counts; + PC_TREE *pc_tree; + PC_TREE *pc_root; + tran_low_t *tree_coeff_buf[MAX_MB_PLANE]; + tran_low_t *tree_qcoeff_buf[MAX_MB_PLANE]; + tran_low_t *tree_dqcoeff_buf[MAX_MB_PLANE]; + InterModesInfo *inter_modes_info; + uint32_t *hash_value_buffer[2][2]; + int32_t *wsrc_buf; + int32_t *mask_buf; + uint8_t *above_pred_buf; + uint8_t *left_pred_buf; + PALETTE_BUFFER *palette_buffer; + CompoundTypeRdBuffers comp_rd_buffer; + CONV_BUF_TYPE *tmp_conv_dst; + uint8_t *tmp_obmc_bufs[2]; + int intrabc_used; + int deltaq_used; + FRAME_CONTEXT *tctx; + MB_MODE_INFO_EXT *mbmi_ext; + VP64x64 *vt64x64; + int32_t num_64x64_blocks; +} ThreadData; + +struct EncWorkerData; + +typedef struct ActiveMap { + int enabled; + int update; + unsigned char *map; +} ActiveMap; + +typedef struct { + // cs_rate_array[i] is the fraction of blocks in a frame which either match + // with the collocated block or are smooth, where i is the rate_index. + double cs_rate_array[32]; + // rate_index is used to index cs_rate_array. + int rate_index; + // rate_size is the total number of entries populated in cs_rate_array. + int rate_size; +} ForceIntegerMVInfo; + +#if CONFIG_INTERNAL_STATS +// types of stats +enum { + STAT_Y, + STAT_U, + STAT_V, + STAT_ALL, + NUM_STAT_TYPES // This should always be the last member of the enum +} UENUM1BYTE(StatType); + +typedef struct IMAGE_STAT { + double stat[NUM_STAT_TYPES]; + double worst; +} ImageStat; +#endif // CONFIG_INTERNAL_STATS + +typedef struct { + int ref_count; + YV12_BUFFER_CONFIG buf; +} EncRefCntBuffer; + +typedef struct { + // Buffer to store mode information at mi_alloc_bsize (4x4 or 8x8) level for + // use in bitstream preparation. frame_base[mi_row * stride + mi_col] stores + // the mode information of block (mi_row,mi_col). + MB_MODE_INFO_EXT_FRAME *frame_base; + // Size of frame_base buffer. + int alloc_size; + // Stride of frame_base buffer. + int stride; +} MBMIExtFrameBufferInfo; + +#if CONFIG_COLLECT_PARTITION_STATS == 2 +typedef struct PartitionStats { + int partition_decisions[6][EXT_PARTITION_TYPES]; + int partition_attempts[6][EXT_PARTITION_TYPES]; + int64_t partition_times[6][EXT_PARTITION_TYPES]; + + int partition_redo; +} PartitionStats; +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING +#include "aom_ports/aom_timer.h" +// Adjust the following to add new components. 
+enum {
+  encode_frame_to_data_rate_time,
+  encode_with_recode_loop_time,
+  loop_filter_time,
+  cdef_time,
+  loop_restoration_time,
+  av1_pack_bitstream_final_time,
+  av1_encode_frame_time,
+  av1_compute_global_motion_time,
+  av1_setup_motion_field_time,
+  encode_sb_time,
+  rd_pick_partition_time,
+  rd_pick_sb_modes_time,
+  av1_rd_pick_intra_mode_sb_time,
+  av1_rd_pick_inter_mode_sb_time,
+  handle_intra_mode_time,
+  do_tx_search_time,
+  handle_newmv_time,
+  compound_type_rd_time,
+  interpolation_filter_search_time,
+  motion_mode_rd_time,
+  kTimingComponents,
+} UENUM1BYTE(TIMING_COMPONENT);
+
+static INLINE char const *get_component_name(int index) {
+  switch (index) {
+    case encode_frame_to_data_rate_time:
+      return "encode_frame_to_data_rate_time";
+    case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
+    case loop_filter_time: return "loop_filter_time";
+    case cdef_time: return "cdef_time";
+    case loop_restoration_time: return "loop_restoration_time";
+    case av1_pack_bitstream_final_time: return "av1_pack_bitstream_final_time";
+    case av1_encode_frame_time: return "av1_encode_frame_time";
+    case av1_compute_global_motion_time:
+      return "av1_compute_global_motion_time";
+    case av1_setup_motion_field_time: return "av1_setup_motion_field_time";
+    case encode_sb_time: return "encode_sb_time";
+    case rd_pick_partition_time: return "rd_pick_partition_time";
+    case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
+    case av1_rd_pick_intra_mode_sb_time:
+      return "av1_rd_pick_intra_mode_sb_time";
+    case av1_rd_pick_inter_mode_sb_time:
+      return "av1_rd_pick_inter_mode_sb_time";
+    case handle_intra_mode_time: return "handle_intra_mode_time";
+    case do_tx_search_time: return "do_tx_search_time";
+    case handle_newmv_time: return "handle_newmv_time";
+    case compound_type_rd_time: return "compound_type_rd_time";
+    case interpolation_filter_search_time:
+      return "interpolation_filter_search_time";
+    case motion_mode_rd_time: return "motion_mode_rd_time";
+    default: assert(0);
+  }
+  return "error";
+}
+#endif
+
+// The maximum number of internal ARFs except ALTREF_FRAME
+#define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
+
+typedef struct {
+  // Array to store the cost for signalling each global motion model.
+  // type_cost[i] stores the cost of signalling the ith Global Motion model.
+  int type_cost[TRANS_TYPES];
+
+  // Array to store the cost for signalling a particular global motion model
+  // for each reference frame. params_cost[i] stores the cost of signalling
+  // global motion for the ith reference frame.
+  int params_cost[REF_FRAMES];
+
+  // Flag to indicate if global motion search needs to be rerun.
+  bool search_done;
+} GlobalMotionInfo;
+
+typedef struct {
+  // Stores the default value of skip flag depending on chroma format
+  // Set as 1 for monochrome and 3 for other color formats
+  int default_interp_skip_flags;
+  // Filter mask to allow certain interp_filter type.
+  uint16_t interp_filter_search_mask;
+} InterpSearchFlags;
+
+typedef struct {
+  // Largest MV component used in a frame.
+  // The value from the previous frame is used to set the full pixel search
+  // range for the current frame.
+  int max_mv_magnitude;
+  // Parameter indicating initial search window to be used in full-pixel search.
+  // Range [0, MAX_MVSEARCH_STEPS-2]. Lower value indicates larger window.
+  int mv_step_param;
+  // Pointer to sub-pixel search function.
+  // In encoder: av1_find_best_sub_pixel_tree
+  //             av1_find_best_sub_pixel_tree_pruned
+  //             av1_find_best_sub_pixel_tree_pruned_more
+  //             av1_find_best_sub_pixel_tree_pruned_evenmore
+  // In MV unit test: av1_return_max_sub_pixel_mv
+  //                  av1_return_min_sub_pixel_mv
+  fractional_mv_step_fp *find_fractional_mv_step;
+  // Search site configuration for full-pel MV search.
+  // ss_cfg[SS_CFG_SRC]: Used in tpl, rd/non-rd inter mode loop, simple motion
+  // search.
+  // ss_cfg[SS_CFG_LOOKAHEAD]: Used in intraBC, temporal filter
+  // ss_cfg[SS_CFG_FPF]: Used during first pass and lookahead
+  search_site_config ss_cfg[SS_CFG_TOTAL];
+} MotionVectorSearchParams;
+
+typedef struct {
+  // When resize is triggered externally, the desired dimensions are stored in
+  // this struct until used in the next frame to be coded. These values are
+  // effective only for one frame and are reset after they are used.
+  int width;
+  int height;
+} ResizePendingParams;
+
+typedef struct {
+  // Threshold of transform domain distortion
+  // Index 0: Default mode evaluation, Winner mode processing is not applicable
+  // (Eg : IntraBc).
+  // Index 1: Mode evaluation.
+  // Index 2: Winner mode evaluation.
+  // Index 1 and 2 are applicable when enable_winner_mode_for_use_tx_domain_dist
+  // speed feature is ON
+  unsigned int tx_domain_dist_threshold[MODE_EVAL_TYPES];
+
+  // Factor to control R-D optimization of coeffs based on block
+  // mse.
+  // Index 0: Default mode evaluation, Winner mode processing is not applicable
+  // (Eg : IntraBc). Index 1: Mode evaluation.
+  // Index 2: Winner mode evaluation
+  // Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed
+  // feature is ON
+  unsigned int coeff_opt_dist_threshold[MODE_EVAL_TYPES];
+
+  // Transform size to be used in transform search
+  // Index 0: Default mode evaluation, Winner mode processing is not applicable
+  // (Eg : IntraBc).
+  // Index 1: Mode evaluation. Index 2: Winner mode evaluation
+  // Index 1 and 2 are applicable when enable_winner_mode_for_tx_size_srch speed
+  // feature is ON
+  TX_SIZE_SEARCH_METHOD tx_size_search_methods[MODE_EVAL_TYPES];
+
+  // Transform domain distortion levels
+  // Index 0: Default mode evaluation, Winner mode processing is not applicable
+  // (Eg : IntraBc).
+  // Index 1: Mode evaluation. Index 2: Winner mode evaluation
+  // Index 1 and 2 are applicable when enable_winner_mode_for_use_tx_domain_dist
+  // speed feature is ON
+  unsigned int use_transform_domain_distortion[MODE_EVAL_TYPES];
+
+  // Predict transform skip levels to be used for default, mode and winner mode
+  // evaluation. Index 0: Default mode evaluation, Winner mode processing is not
+  // applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation
+  unsigned int predict_skip_level[MODE_EVAL_TYPES];
+} WinnerModeParams;
+
+typedef struct {
+  // Bit mask to disable certain reference frame types.
+  int ref_frame_flags;
+
+  // Flags to determine which reference buffers are refreshed by this frame.
+  // When set, the encoder will update the particular reference frame buffer
+  // with the contents of the current frame.
+  bool refresh_last_frame;
+  bool refresh_golden_frame;
+  bool refresh_bwd_ref_frame;
+  bool refresh_alt2_ref_frame;
+  bool refresh_alt_ref_frame;
+
+  // Flag to indicate that an update of the refresh frame flags from the
+  // external interface is pending.
+  bool refresh_frame_flags_pending;
+
+  // Flag to enable updating of frame contexts at the end of a frame decode.
+  bool refresh_frame_context;
+
+  // Flag to indicate that an update of refresh_frame_context from the external
+  // interface is pending.
+  bool refresh_frame_context_pending;
+
+  // Flag to enable temporal MV prediction.
+  bool use_ref_frame_mvs;
+
+  // Flag to code the frame as error-resilient.
+  bool use_error_resilient;
+
+  // Flag to code the frame as an s-frame.
+  bool use_s_frame;
+
+  // Flag to set the frame's primary_ref_frame to PRIMARY_REF_NONE.
+  bool use_primary_ref_none;
+} ExternalFlags;
+
+typedef struct {
+  int arf_stack[FRAME_BUFFERS];
+  int arf_stack_size;
+  int lst_stack[FRAME_BUFFERS];
+  int lst_stack_size;
+  int gld_stack[FRAME_BUFFERS];
+  int gld_stack_size;
+} RefBufferStack;
+
+typedef struct {
+  // Some misc info
+  int high_prec;
+  int q;
+  int order;
+
+  // MV counters
+  int inter_count;
+  int intra_count;
+  int default_mvs;
+  int mv_joint_count[4];
+  int last_bit_zero;
+  int last_bit_nonzero;
+
+  // Keep track of the rates
+  int total_mv_rate;
+  int hp_total_mv_rate;
+  int lp_total_mv_rate;
+
+  // Texture info
+  int horz_text;
+  int vert_text;
+  int diag_text;
+
+  // Whether the current struct contains valid data
+  int valid;
+} MV_STATS;
+
+typedef struct {
+  int frame_width;
+  int frame_height;
+  int mi_rows;
+  int mi_cols;
+  int mb_rows;
+  int mb_cols;
+  int num_mbs;
+  aom_bit_depth_t bit_depth;
+  int subsampling_x;
+  int subsampling_y;
+} FRAME_INFO;
+
+typedef struct {
+  // 3-bit number containing the segment affiliation for each 4x4 block in the
+  // frame. map[y * stride + x] contains the segment id of the 4x4 block at
+  // (x,y) position.
+  uint8_t *map;
+  // Flag to indicate if current frame has lossless segments or not.
+  // 1: frame has at least one lossless segment.
+  // 0: frame has no lossless segments.
+  bool has_lossless_segment;
+} EncSegmentationInfo;
+
+typedef struct {
+  // Start time stamp of the previous frame
+  int64_t prev_start_seen;
+  // End time stamp of the previous frame
+  int64_t prev_end_seen;
+  // Start time stamp of the first frame
+  int64_t first_ever;
+} TimeStamps;
+
+typedef struct AV1_COMP {
+  // Quantization and dequantization parameters for internal quantizer setup
+  // in the encoder.
+  EncQuantDequantParams enc_quant_dequant_params;
+  ThreadData td;
+  FRAME_COUNTS counts;
+
+  // Holds buffer storing mode information at 4x4/8x8 level.
+  MBMIExtFrameBufferInfo mbmi_ext_info;
+
+  CB_COEFF_BUFFER *coeff_buffer_base;
+  AV1_COMMON common;
+  AV1EncoderConfig oxcf;
+  struct lookahead_ctx *lookahead;
+  int no_show_kf;
+
+  TRELLIS_OPT_TYPE optimize_seg_arr[MAX_SEGMENTS];
+
+  YV12_BUFFER_CONFIG *source;
+  YV12_BUFFER_CONFIG *last_source;  // NULL for first frame and alt_ref frames
+  YV12_BUFFER_CONFIG *unscaled_source;
+  YV12_BUFFER_CONFIG scaled_source;
+  YV12_BUFFER_CONFIG *unscaled_last_source;
+  YV12_BUFFER_CONFIG scaled_last_source;
+  YV12_BUFFER_CONFIG *unfiltered_source;
+
+  TplParams tpl_data;
+
+  // For a still frame, this flag is set to 1 to skip partition search.
+  int partition_search_skippable_frame;
+
+  // Variables related to forcing integer mv decisions for the current frame.
+  ForceIntegerMVInfo force_intpel_info;
+
+  unsigned int row_mt;
+  RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME];
+
+  RefCntBuffer *last_show_frame_buf;  // last show frame buffer
+
+  // refresh_*_frame are boolean flags. If 'refresh_xyz_frame' is true, then
+  // after the current frame is encoded, the XYZ reference frame gets refreshed
+  // (updated) to be the current frame.
+  //
+  // Note: Usually at most one of these refresh flags is true at a time.
+  // But a key-frame is special, for which all the flags are true at once.
+  int refresh_golden_frame;
+  int refresh_bwd_ref_frame;
+  int refresh_alt_ref_frame;
+
+  // For each type of reference frame, this contains the index of a reference
+  // frame buffer for a reference frame of the same type. We use this to
+  // choose our primary reference frame (which is the most recent reference
+  // frame of the same type as the current frame).
+  int fb_of_context_type[REF_FRAMES];
+
+  // Flags signalled by the external interface at frame level.
+  ExternalFlags ext_flags;
+
+  YV12_BUFFER_CONFIG last_frame_uf;
+  YV12_BUFFER_CONFIG trial_frame_rst;
+
+  // Ambient reconstruction error target for forced key frames
+  int64_t ambient_err;
+
+  RD_OPT rd;
+
+  CODING_CONTEXT coding_context;
+
+  // Parameters related to global motion search.
+  GlobalMotionInfo gm_info;
+
+  // Parameters related to winner mode processing.
+  WinnerModeParams winner_mode_params;
+
+  // Frame time stamps
+  TimeStamps time_stamps;
+
+  RATE_CONTROL rc;
+  double framerate;
+
+  struct aom_codec_pkt_list *output_pkt_list;
+
+  int ref_frame_flags;
+
+  // speed is passed as a per-frame parameter into the encoder
+  int speed;
+  // sf contains fine-grained config set internally based on speed
+  SPEED_FEATURES sf;
+
+  // Parameters for motion vector search process.
+  MotionVectorSearchParams mv_search_params;
+
+  int all_one_sided_refs;
+
+  // Segmentation related information for current frame.
+  EncSegmentationInfo enc_seg;
+
+  CYCLIC_REFRESH *cyclic_refresh;
+  ActiveMap active_map;
+
+  aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+
+#if CONFIG_INTERNAL_STATS
+  uint64_t time_receive_data;
+  uint64_t time_compress_data;
+#endif
+
+  // number of show frames encoded in current gf_group
+  int num_gf_group_show_frames;
+
+  TWO_PASS twopass;
+
+  GF_GROUP gf_group;
+
+  // To control the reference frame buffer and selection.
+  RefBufferStack ref_buffer_stack;
+
+  YV12_BUFFER_CONFIG alt_ref_buffer;
+
+  // Tells whether the OVERLAY frame shows an existing alt_ref frame.
+  int show_existing_alt_ref;
+
+#if CONFIG_INTERNAL_STATS
+  unsigned int mode_chosen_counts[MAX_MODES];
+
+  int count;
+  uint64_t total_sq_error;
+  uint64_t total_samples;
+  ImageStat psnr;
+
+  double total_blockiness;
+  double worst_blockiness;
+
+  int bytes;
+  double summed_quality;
+  double summed_weights;
+  unsigned int tot_recode_hits;
+  double worst_ssim;
+
+  ImageStat fastssim;
+  ImageStat psnrhvs;
+
+  int b_calculate_blockiness;
+  int b_calculate_consistency;
+
+  double total_inconsistency;
+  double worst_consistency;
+  Ssimv *ssim_vars;
+  Metrics metrics;
+#endif
+  int b_calculate_psnr;
+#if CONFIG_SPEED_STATS
+  unsigned int tx_search_count;
+#endif  // CONFIG_SPEED_STATS
+
+  int droppable;
+
+  FRAME_INFO frame_info;
+
+  int initial_width;
+  int initial_height;
+  int initial_mbs;  // Number of MBs in the full-size frame; to be used to
+                    // normalize the firstpass stats. This will differ from the
+                    // number of MBs in the current frame when the frame is
+                    // scaled.
+  // Resize related parameters
+  ResizePendingParams resize_pending_params;
+
+  TileDataEnc *tile_data;
+  int allocated_tiles;  // Keep track of memory allocated for tiles.
+
+  TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+  TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+  // Whether sequence parameters have already been transmitted and locked.
+  // Once locked, av1_change_config cannot change the seq
+  // parameters.
+  int seq_params_locked;
+
+  // VARIANCE_AQ segment map refresh
+  int vaq_refresh;
+
+  // Thresholds for variance based partitioning.
+  VarBasedPartitionInfo vbp_info;
+
+  // Probabilities for pruning of various AV1 tools.
+  FrameProbInfo frame_probs;
+
+  // Multi-threading
+  int num_workers;
+  AVxWorker *workers;
+  struct EncWorkerData *tile_thr_data;
+  int existing_fb_idx_to_show;
+  int internal_altref_allowed;
+  // A flag to indicate if intrabc is ever used in current frame.
+  int intrabc_used;
+
+  // Tables to calculate IntraBC MV cost.
+  IntraBCMVCosts dv_costs;
+
+  // Mark which ref frames can be skipped for encoding current frame during RDO.
+  int prune_ref_frame_mask;
+
+  AV1LfSync lf_row_sync;
+  AV1LrSync lr_row_sync;
+  AV1LrStruct lr_ctxt;
+
+  aom_film_grain_table_t *film_grain_table;
+#if CONFIG_DENOISE
+  struct aom_denoise_and_model_t *denoise_and_model;
+#endif
+
+  // Flags related to interpolation filter search.
+  InterpSearchFlags interp_search_flags;
+
+  MultiThreadHandle multi_thread_ctxt;
+  void (*row_mt_sync_read_ptr)(AV1RowMTSync *const, int, int);
+  void (*row_mt_sync_write_ptr)(AV1RowMTSync *const, int, int, const int);
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *row_mt_mutex_;
+#endif
+  // Set if screen content is detected or relevant tools are enabled
+  int is_screen_content_type;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+  PartitionStats partition_stats;
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  // component_time[] are initialized to zero when the encoder starts.
+  uint64_t component_time[kTimingComponents];
+  struct aom_usec_timer component_timer[kTimingComponents];
+  // frame_component_time[] are initialized to zero at the beginning of each
+  // frame.
+  uint64_t frame_component_time[kTimingComponents];
+#endif
+
+  // Parameters for AV1 bitstream levels.
+  AV1LevelParams level_params;
+
+  // whether any non-zero delta_q was actually used
+  int deltaq_used;
+
+  // Indicates the true relative distance of ref frame w.r.t. current frame
+  int ref_relative_dist[INTER_REFS_PER_FRAME];
+
+  // Indicates the nearest references w.r.t. current frame in past and future
+  int8_t nearest_past_ref;
+  int8_t nearest_future_ref;
+
+  // TODO(sdeng): consider merging the following arrays.
+  double *tpl_rdmult_scaling_factors;
+  double *tpl_sb_rdmult_scaling_factors;
+  double *ssim_rdmult_scaling_factors;
+
+#if CONFIG_TUNE_VMAF
+  double *vmaf_rdmult_scaling_factors;
+  double last_frame_ysse;
+  double last_frame_vmaf;
+  double last_frame_unsharp_amount;
+#endif
+
+  int use_svc;
+  SVC svc;
+
+  int lap_enabled;
+  COMPRESSOR_STAGE compressor_stage;
+
+  // Some motion vector stats from the last encoded frame to help us decide what
+  // precision to use to encode the current frame.
+  MV_STATS mv_stats;
+
+  // Frame type of the last frame. May be used in some heuristics for speeding
+  // up the encoding.
+  FRAME_TYPE last_frame_type;
+  int num_tg;
+
+  // Super-resolution mode currently being used by the encoder.
+  // This may / may not be same as user-supplied mode in oxcf->superres_mode
+  // (when we are recoding to try multiple options for example).
+  SUPERRES_MODE superres_mode;
+} AV1_COMP;
+
+typedef struct {
+  YV12_BUFFER_CONFIG *source;
+  YV12_BUFFER_CONFIG *last_source;
+  int64_t ts_duration;
+} EncodeFrameInput;
+
+// EncodeFrameParams contains per-frame encoding parameters decided upon by
+// av1_encode_strategy() and passed down to av1_encode()
+struct EncodeFrameParams {
+  int error_resilient_mode;
+  FRAME_TYPE frame_type;
+  int primary_ref_frame;
+  int order_offset;
+  int show_frame;
+  int refresh_frame_flags;
+
+  int show_existing_frame;
+  int existing_fb_idx_to_show;
+
+  // Bitmask of which reference buffers may be referenced by this frame
+  int ref_frame_flags;
+
+  // Reference buffer assignment for this frame.
+  int remapped_ref_idx[REF_FRAMES];
+
+  // Flags which determine which reference buffers are refreshed by this frame
+  int refresh_golden_frame;
+  int refresh_bwd_ref_frame;
+  int refresh_alt_ref_frame;
+
+  // Speed level to use for this frame: a bigger number means faster.
+  int speed;
+};
+typedef struct EncodeFrameParams EncodeFrameParams;
+
+// EncodeFrameResults contains information about the result of encoding a
+// single frame
+typedef struct {
+  size_t size;  // Size of resulting bitstream
+} EncodeFrameResults;
+
+// Must not be called more than once.
+void av1_initialize_enc(void);
+
+struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
+                                       BufferPool *const pool,
+                                       FIRSTPASS_STATS *frame_stats_buf,
+                                       COMPRESSOR_STAGE stage,
+                                       int num_lap_buffers,
+                                       int lap_lag_in_frames,
+                                       STATS_BUFFER_CTX *stats_buf_context);
+void av1_remove_compressor(AV1_COMP *cpi);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf);
+
+void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+                             int subsampling_x, int subsampling_y);
+
+// Receives a frame's worth of data. The caller can assume that a copy of this
+// frame is made and not just a copy of the pointer.
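+// An illustrative, non-normative call sequence follows; 'raw', 'pts', 'dur',
+// 'buf', 'buf_size' and 'timebase' are hypothetical caller-side variables,
+// and a zero return is assumed here to mean success:
+//   av1_receive_raw_frame(cpi, 0, &raw, pts, pts + dur);
+//   int64_t t0, t1;
+//   unsigned int fflags = 0;
+//   size_t sz = buf_size;
+//   if (av1_get_compressed_data(cpi, &fflags, &sz, buf, &t0, &t1,
+//                               /*flush=*/0, timebase) == 0) {
+//     // 'sz' now holds the coded size in bytes; 'buf' holds the bitstream.
+//   }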
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time_stamp);
+
+int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
+                            size_t *size, uint8_t *dest, int64_t *time_stamp,
+                            int64_t *time_end, int flush,
+                            const aom_rational64_t *timebase);
+
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+               const EncodeFrameInput *const frame_input,
+               const EncodeFrameParams *const frame_params,
+               EncodeFrameResults *const frame_results);
+
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *new_frame,
+                                       YV12_BUFFER_CONFIG *sd);
+
+int av1_use_as_reference(int *ext_ref_frame_flags, int ref_frame_flags);
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+int av1_set_size_literal(AV1_COMP *cpi, int width, int height);
+
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height);
+
+int av1_update_entropy(bool *ext_refresh_frame_context,
+                       bool *ext_refresh_frame_context_pending, bool update);
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_set_internal_size(AV1EncoderConfig *const oxcf,
+                          ResizePendingParams *resize_pending_params,
+                          AOM_SCALING horiz_mode, AOM_SCALING vert_mode);
+
+int av1_get_quantizer(struct AV1_COMP *cpi);
+
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
+
+void av1_alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
+                                        CompoundTypeRdBuffers *const bufs);
+void av1_release_compound_type_rd_buffers(CompoundTypeRdBuffers *const bufs);
+
+// Set screen content options.
+// This function estimates whether to use screen content tools, by counting
+// the portion of blocks that have few luma colors.
+// Modifies:
+//   cpi->common.allow_screen_content_tools
+//   cpi->common.allow_intrabc
+// However, the estimation is not accurate and may misclassify videos.
+// A slower but more accurate approach that determines whether to use screen
+// content tools is employed later. See determine_sc_tools_with_encoding().
+void av1_set_screen_content_options(const struct AV1_COMP *cpi,
+                                    FeatureFlags *features);
+
+// TODO(jingning): Move these functions as primitive members for the new cpi
+// class.
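+
+// The stack helpers below treat an int array as a stack whose top is index 0:
+// stack_push() shifts existing entries up and writes the new item at [0],
+// stack_pop() removes from the front, and stack_pop_end() from the back.
+// A minimal illustrative sketch (the local array 's' is hypothetical, not
+// part of this header):
+//   int s[FRAME_BUFFERS], n = 0;
+//   stack_push(s, &n, 7);  // s = {7},    n = 1
+//   stack_push(s, &n, 9);  // s = {9, 7}, n = 2
+//   stack_pop(s, &n);      // returns 9;  s = {7}, n = 1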
+static INLINE void stack_push(int *stack, int *stack_size, int item) {
+  for (int i = *stack_size - 1; i >= 0; --i) stack[i + 1] = stack[i];
+  stack[0] = item;
+  ++*stack_size;
+}
+
+static INLINE int stack_pop(int *stack, int *stack_size) {
+  if (*stack_size <= 0) return -1;
+
+  int item = stack[0];
+  // Shift only the remaining valid entries; reading stack[*stack_size] would
+  // step one past the last valid element when the stack is full.
+  for (int i = 0; i < *stack_size - 1; ++i) stack[i] = stack[i + 1];
+  --*stack_size;
+
+  return item;
+}
+
+static INLINE int stack_pop_end(int *stack, int *stack_size) {
+  int item = stack[*stack_size - 1];
+  stack[*stack_size - 1] = -1;
+  --*stack_size;
+
+  return item;
+}
+
+static INLINE void stack_reset(int *stack, int *stack_size) {
+  for (int i = 0; i < *stack_size; ++i) stack[i] = INVALID_IDX;
+  *stack_size = 0;
+}
+
+// av1 uses 10,000,000 ticks/second as its time stamp resolution
+#define TICKS_PER_SEC 10000000LL
+
+static INLINE int64_t
+timebase_units_to_ticks(const aom_rational64_t *timestamp_ratio, int64_t n) {
+  return n * timestamp_ratio->num / timestamp_ratio->den;
+}
+
+static INLINE int64_t
+ticks_to_timebase_units(const aom_rational64_t *timestamp_ratio, int64_t n) {
+  int64_t round = timestamp_ratio->num / 2;
+  if (round > 0) --round;
+  return (n * timestamp_ratio->den + round) / timestamp_ratio->num;
+}
+
+static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+
+  return frame_is_intra_only(&cpi->common) || update_type == ARF_UPDATE ||
+         update_type == GF_UPDATE;
+}
+
+// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD.
+static INLINE int av1_use_hash_me(const AV1_COMP *const cpi) {
+  return (cpi->common.features.allow_screen_content_tools &&
+          cpi->common.features.allow_intrabc &&
+          frame_is_intra_only(&cpi->common));
+}
+
+static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf(
+    const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame) {
+  const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+  return buf != NULL ? &buf->buf : NULL;
+}
+
+static INLINE int enc_is_ref_frame_buf(const AV1_COMMON *const cm,
+                                       const RefCntBuffer *const frame_buf) {
+  MV_REFERENCE_FRAME ref_frame;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    if (buf == NULL) continue;
+    if (frame_buf == buf) break;
+  }
+  return (ref_frame <= ALTREF_FRAME);
+}
+
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) {
+  assert(buf != NULL);
+  ensure_mv_buffer(buf, cm);
+  buf->width = cm->width;
+  buf->height = cm->height;
+}
+
+// Token buffer is only used for palette tokens.
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
+                                           int sb_size_log2,
+                                           const int num_planes) {
+  // Calculate the maximum number of superblocks in the image.
+  const int shift = sb_size_log2 - 4;
+  const int sb_size = 1 << sb_size_log2;
+  const int sb_size_square = sb_size * sb_size;
+  const int sb_rows = ALIGN_POWER_OF_TWO(mb_rows, shift) >> shift;
+  const int sb_cols = ALIGN_POWER_OF_TWO(mb_cols, shift) >> shift;
+
+  // One palette token for each pixel. There can be palettes on two planes.
+  const int sb_palette_toks = AOMMIN(2, num_planes) * sb_size_square;
+
+  return sb_rows * sb_cols * sb_palette_toks;
+}
+
+// Get the allocated token size for a tile. It does the same calculation as in
+// the frame token allocation.
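+// Illustrative arithmetic (values assumed, not normative): with 128x128
+// superblocks, sb_size_log2 is 7, so a superblock covers 128 * 128 = 16384
+// pixels and, when at least two planes are coded, reserves
+// 2 * 16384 = 32768 palette tokens; the total allocation is that count times
+// the number of superblock rows and columns covering the tile.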
+static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2,
+                                            int num_planes) {
+  int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 2) >> 2;
+  int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 2) >> 2;
+
+  return get_token_alloc(tile_mb_rows, tile_mb_cols, sb_size_log2, num_planes);
+}
+
+static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
+                                 int mi_row, TOKENEXTRA **tok, int sb_size_log2,
+                                 int num_planes) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int tile_cols = cm->tiles.cols;
+  TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+  const TileInfo *const tile_info = &this_tile->tile_info;
+
+  const int tile_mb_cols =
+      (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+  const int tile_mb_row = (mi_row - tile_info->mi_row_start + 2) >> 2;
+
+  *tok = cpi->tile_tok[tile_row][tile_col] +
+         get_token_alloc(tile_mb_row, tile_mb_cols, sb_size_log2, num_planes);
+}
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
+
+#define ALT_MIN_LAG 3
+static INLINE int is_altref_enabled(const AV1_COMP *const cpi) {
+  return cpi->oxcf.lag_in_frames >= ALT_MIN_LAG && cpi->oxcf.enable_auto_arf;
+}
+
+// Check if we are in the statistics generation stage
+static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) {
+  assert(IMPLIES(cpi->compressor_stage == LAP_STAGE,
+                 cpi->oxcf.pass == 0 && cpi->lap_enabled));
+  return (cpi->oxcf.pass == 1 || (cpi->compressor_stage == LAP_STAGE));
+}
+// Check if we are in the two-pass statistics consumption stage
+static INLINE int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) {
+  return (cpi->oxcf.pass == 2);
+}
+
+// Check if we are in the statistics consumption stage
+static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) {
+  return (is_stat_consumption_stage_twopass(cpi) ||
+          (cpi->oxcf.pass == 0 && (cpi->compressor_stage == ENCODE_STAGE) &&
+           cpi->lap_enabled));
+}
+
+// Check if the current stage has statistics
+static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) {
+  assert(IMPLIES(!cpi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
+  return (cpi->oxcf.pass == 0 && !cpi->lap_enabled);
+}
+
+// Returns the size of the frame stats buffer
+static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) {
+  /* if lookahead is enabled return num_lap_buffers else num_lag_buffers */
+  return (num_lap_buffer > 0 ? num_lap_buffer + 1 : num_lag_buffer);
+}
+
+// TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
+
+static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                MV_REFERENCE_FRAME ref0,
+                                MV_REFERENCE_FRAME ref1) {
+  xd->block_ref_scale_factors[0] =
+      get_ref_scale_factors_const(cm, ref0 >= LAST_FRAME ? ref0 : 1);
+  xd->block_ref_scale_factors[1] =
+      get_ref_scale_factors_const(cm, ref1 >= LAST_FRAME ? ref1 : 1);
+}
+
+static INLINE int get_chessboard_index(int frame_index) {
+  return frame_index & 0x1;
+}
+
+static INLINE const int *cond_cost_list_const(const struct AV1_COMP *cpi,
+                                              const int *cost_list) {
+  const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE &&
+                            cpi->sf.mv_sf.use_fullpel_costlist;
+  return use_cost_list ? cost_list : NULL;
+}
+
+static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
+  const int use_cost_list = cpi->sf.mv_sf.subpel_search_method != SUBPEL_TREE &&
+                            cpi->sf.mv_sf.use_fullpel_costlist;
+  return use_cost_list ? cost_list : NULL;
+}
+
+// Compression ratio of the current frame.
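+// Assumed semantics (see the definition for the exact formula): the raw frame
+// size implied by the frame geometry and bit depth divided by
+// 'encoded_frame_size', so larger values mean stronger compression. E.g., a
+// 1920x1080 4:2:0 8-bit frame is about 3.1 MB raw, so a 31 kB encoding would
+// give a ratio of roughly 100.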
+double av1_get_compression_ratio(const AV1_COMMON *const cm, + size_t encoded_frame_size); + +void av1_new_framerate(AV1_COMP *cpi, double framerate); + +void av1_setup_frame_size(AV1_COMP *cpi); + +#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) + +// Returns 1 if a frame is scaled and 0 otherwise. +static INLINE int av1_resize_scaled(const AV1_COMMON *cm) { + return !(cm->superres_upscaled_width == cm->render_width && + cm->superres_upscaled_height == cm->render_height); +} + +static INLINE int av1_frame_scaled(const AV1_COMMON *cm) { + return !av1_superres_scaled(cm) && av1_resize_scaled(cm); +} + +// Don't allow a show_existing_frame to coincide with an error resilient +// frame. An exception can be made for a forward keyframe since it has no +// previous dependencies. +static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) { + return cm->show_existing_frame && (!cm->features.error_resilient_mode || + cm->current_frame.frame_type == KEY_FRAME); +} + +// Get index into the 'cpi->mbmi_ext_info.frame_base' array for the given +// 'mi_row' and 'mi_col'. +static INLINE int get_mi_ext_idx(const int mi_row, const int mi_col, + const BLOCK_SIZE mi_alloc_bsize, + const int mbmi_ext_stride) { + const int mi_ext_size_1d = mi_size_wide[mi_alloc_bsize]; + const int mi_ext_row = mi_row / mi_ext_size_1d; + const int mi_ext_col = mi_col / mi_ext_size_1d; + return mi_ext_row * mbmi_ext_stride + mi_ext_col; +} + +// Lighter version of set_offsets that only sets the mode info +// pointers. +static INLINE void set_mode_info_offsets( + const CommonModeInfoParams *const mi_params, + const MBMIExtFrameBufferInfo *const mbmi_ext_info, MACROBLOCK *const x, + MACROBLOCKD *const xd, int mi_row, int mi_col) { + set_mi_offsets(mi_params, xd, mi_row, mi_col); + const int ext_idx = get_mi_ext_idx(mi_row, mi_col, mi_params->mi_alloc_bsize, + mbmi_ext_info->stride); + x->mbmi_ext_frame = mbmi_ext_info->frame_base + ext_idx; +} + +// Check to see if the given partition size is allowed for a specified number +// of mi block rows and columns remaining in the image. +// If not then return the largest allowed partition size +static INLINE BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, + int cols_left, int *bh, int *bw) { + int int_size = (int)bsize; + if (rows_left <= 0 || cols_left <= 0) { + return AOMMIN(bsize, BLOCK_8X8); + } else { + for (; int_size > 0; int_size -= 3) { + *bh = mi_size_high[int_size]; + *bw = mi_size_wide[int_size]; + if ((*bh <= rows_left) && (*bw <= cols_left)) { + break; + } + } + } + return (BLOCK_SIZE)int_size; +} + +static const uint8_t av1_ref_frame_flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + +// When more than 'max_allowed_refs' are available, we reduce the number of +// reference frames one at a time based on this order. +static const MV_REFERENCE_FRAME disable_order[] = { + LAST3_FRAME, + LAST2_FRAME, + ALTREF2_FRAME, + GOLDEN_FRAME, +}; + +static INLINE int get_max_allowed_ref_frames(const AV1_COMP *cpi) { + const unsigned int max_allowed_refs_for_given_speed = + (cpi->sf.inter_sf.selective_ref_frame >= 3) ? 
INTER_REFS_PER_FRAME - 1 + : INTER_REFS_PER_FRAME; + return AOMMIN(max_allowed_refs_for_given_speed, + cpi->oxcf.max_reference_frames); +} + +static const MV_REFERENCE_FRAME + ref_frame_priority_order[INTER_REFS_PER_FRAME] = { + LAST_FRAME, ALTREF_FRAME, BWDREF_FRAME, GOLDEN_FRAME, + ALTREF2_FRAME, LAST2_FRAME, LAST3_FRAME, + }; + +static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf, + const YV12_BUFFER_CONFIG **ref_frames, + const int ext_ref_frame_flags) { + // cpi->ext_flags.ref_frame_flags allows certain reference types to be + // disabled by the external interface. These are set by + // av1_apply_encoding_flags(). Start with what the external interface allows, + // then suppress any reference types which we have found to be duplicates. + int flags = ext_ref_frame_flags; + + for (int i = 1; i < INTER_REFS_PER_FRAME; ++i) { + const YV12_BUFFER_CONFIG *const this_ref = ref_frames[i]; + // If this_ref has appeared before, mark the corresponding ref frame as + // invalid. For nonrd mode, only disable GOLDEN_FRAME if it's the same + // as LAST_FRAME or ALTREF_FRAME (if ALTREF is being used in nonrd). + int index = (sf->rt_sf.use_nonrd_pick_mode && + ref_frame_priority_order[i] == GOLDEN_FRAME) + ? (1 + sf->rt_sf.use_nonrd_altref_frame) + : i; + for (int j = 0; j < index; ++j) { + if (this_ref == ref_frames[j]) { + flags &= ~(1 << (ref_frame_priority_order[i] - 1)); + break; + } + } + } + return flags; +} + +// Enforce the number of references for each arbitrary frame based on user +// options and speed. +static AOM_INLINE void enforce_max_ref_frames(AV1_COMP *cpi, + int *ref_frame_flags) { + MV_REFERENCE_FRAME ref_frame; + int total_valid_refs = 0; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + if (*ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + total_valid_refs++; + } + } + + const int max_allowed_refs = get_max_allowed_ref_frames(cpi); + + for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) { + const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i]; + + if (!(*ref_frame_flags & av1_ref_frame_flag_list[ref_frame_to_disable])) { + continue; + } + + switch (ref_frame_to_disable) { + case LAST3_FRAME: *ref_frame_flags &= ~AOM_LAST3_FLAG; break; + case LAST2_FRAME: *ref_frame_flags &= ~AOM_LAST2_FLAG; break; + case ALTREF2_FRAME: *ref_frame_flags &= ~AOM_ALT2_FLAG; break; + case GOLDEN_FRAME: *ref_frame_flags &= ~AOM_GOLD_FLAG; break; + default: assert(0); + } + --total_valid_refs; + } + assert(total_valid_refs <= max_allowed_refs); +} + +// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon +// failure. When a non-NULL aom_fixed_buf_t pointer is returned by this +// function, the memory must be freed by the caller. Both the buf member of the +// aom_fixed_buf_t, and the aom_fixed_buf_t pointer itself must be freed. Memory +// returned must be freed via call to free(). +// +// Note: The OBU returned is in Low Overhead Bitstream Format. Specifically, +// the obu_has_size_field bit is set, and the buffer contains the obu_size +// field. 
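+//
+// A minimal usage sketch (write_to_container() is a hypothetical sink for the
+// header bytes; error handling omitted):
+//   aom_fixed_buf_t *global_headers = av1_get_global_headers(cpi);
+//   if (global_headers != NULL) {
+//     write_to_container(global_headers->buf, global_headers->sz);
+//     free(global_headers->buf);
+//     free(global_headers);
+//   }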
+aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi); + +#define MAX_GFUBOOST_FACTOR 10.0 +#define MIN_GFUBOOST_FACTOR 4.0 +double av1_get_gfu_boost_projection_factor(double min_factor, double max_factor, + int frame_count); +double av1_get_kf_boost_projection_factor(int frame_count); + +#define ENABLE_KF_TPL 1 +#define MAX_PYR_LEVEL_FROMTOP_DELTAQ 0 + +static INLINE int is_frame_kf_and_tpl_eligible(AV1_COMP *const cpi) { + AV1_COMMON *cm = &cpi->common; + return (cm->current_frame.frame_type == KEY_FRAME) && cm->show_frame && + (cpi->rc.frames_to_key > 1); +} + +static INLINE int is_frame_arf_and_tpl_eligible(const GF_GROUP *gf_group) { + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + return update_type == ARF_UPDATE || update_type == GF_UPDATE; +} + +static INLINE int is_frame_tpl_eligible(AV1_COMP *const cpi) { +#if ENABLE_KF_TPL + return is_frame_kf_and_tpl_eligible(cpi) || + is_frame_arf_and_tpl_eligible(&cpi->gf_group); +#else + return is_frame_arf_and_tpl_eligible(&cpi->gf_group); +#endif // ENABLE_KF_TPL +} + +// Get update type of the current frame. +static INLINE FRAME_UPDATE_TYPE +get_frame_update_type(const GF_GROUP *gf_group) { + return gf_group->update_type[gf_group->index]; +} + +static INLINE int av1_pixels_to_mi(int pixels) { + return ALIGN_POWER_OF_TWO(pixels, 3) >> MI_SIZE_LOG2; +} + +#if CONFIG_COLLECT_PARTITION_STATS == 2 +static INLINE void av1_print_partition_stats(PartitionStats *part_stats) { + FILE *f = fopen("partition_stats.csv", "w"); + if (!f) { + return; + } + + fprintf(f, "bsize,redo,"); + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "decision_%d,", part); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "attempt_%d,", part); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "time_%d,", part); + } + fprintf(f, "\n"); + + const int bsizes[6] = { 128, 64, 32, 16, 8, 4 }; + + for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) { + fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo); + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%d,", part_stats->partition_decisions[bsize_idx][part]); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%d,", part_stats->partition_attempts[bsize_idx][part]); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%ld,", part_stats->partition_times[bsize_idx][part]); + } + fprintf(f, "\n"); + } + fclose(f); +} + +static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) { + assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 || + bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 || + bsize == BLOCK_4X4); + switch (bsize) { + case BLOCK_128X128: return 0; + case BLOCK_64X64: return 1; + case BLOCK_32X32: return 2; + case BLOCK_16X16: return 3; + case BLOCK_8X8: return 4; + case BLOCK_4X4: return 5; + default: assert(0 && "Invalid bsize for partition_stats."); return -1; + } +} +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING +static INLINE void start_timing(AV1_COMP *cpi, int component) { + aom_usec_timer_start(&cpi->component_timer[component]); +} +static INLINE void end_timing(AV1_COMP *cpi, int component) { + aom_usec_timer_mark(&cpi->component_timer[component]); + cpi->frame_component_time[component] += + aom_usec_timer_elapsed(&cpi->component_timer[component]); +} +static INLINE char const *get_frame_type_enum(int type) { + switch (type) { + case 0: return "KEY_FRAME"; + case 1: return "INTER_FRAME"; 
+ case 2: return "INTRA_ONLY_FRAME"; + case 3: return "S_FRAME"; + default: assert(0); + } + return "error"; +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODER_H_ diff --git a/libs/libaom/src/av1/encoder/encodetxb.c b/libs/libaom/src/av1/encoder/encodetxb.c new file mode 100644 index 000000000..825d52a7a --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodetxb.c @@ -0,0 +1,2261 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/encodetxb.h" + +#include "aom_ports/mem.h" +#include "av1/common/blockd.h" +#include "av1/common/idct.h" +#include "av1/common/pred_common.h" +#include "av1/common/scan.h" +#include "av1/encoder/bitstream.h" +#include "av1/encoder/cost.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/hash.h" +#include "av1/encoder/rdopt.h" +#include "av1/encoder/tokenize.h" + +#if CONFIG_HTB_TRELLIS +static int hbt_needs_init = 1; +static CRC32C crc_calculator; +static const int HBT_EOB = 16; // also the length in opt_qcoeff +static const int HBT_TABLE_SIZE = 65536; // 16 bit: holds 65536 'arrays' +static const int HBT_ARRAY_LENGTH = 256; // 8 bit: 256 entries +// If removed in hbt_create_hashes or increased beyond int8_t, widen deltas type +static const int HBT_KICKOUT = 3; + +typedef struct OptTxbQcoeff { + // Use larger type if larger/no kickout value is used in hbt_create_hashes + int8_t deltas[16]; + uint32_t hbt_qc_hash; + uint32_t hbt_ctx_hash; + int init; + int rate_cost; +} OptTxbQcoeff; + +OptTxbQcoeff *hbt_hash_table; +#endif // CONFIG_HTB_TRELLIS + +typedef struct LevelDownStats { + int update; + tran_low_t low_qc; + tran_low_t low_dqc; + int64_t dist0; + int rate; + int rate_low; + int64_t dist; + int64_t dist_low; + int64_t rd; + int64_t rd_low; + int64_t nz_rd; + int64_t rd_diff; + int cost_diff; + int64_t dist_diff; + int new_eob; +} LevelDownStats; + +static INLINE int get_dqv(const int16_t *dequant, int coeff_idx, + const qm_val_t *iqmatrix) { + int dqv = dequant[!!coeff_idx]; + if (iqmatrix != NULL) + dqv = + ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + return dqv; +} + +void av1_alloc_txb_buf(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) * + ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1); + + av1_free_txb_buf(cpi); + // TODO(jingning): This should be further reduced. + CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base, + aom_memalign(32, sizeof(*cpi->coeff_buffer_base) * size)); +} + +void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); } + +static void write_golomb(aom_writer *w, int level) { + int x = level + 1; + int i = x; + int length = 0; + + while (i) { + i >>= 1; + ++length; + } + assert(length > 0); + + for (i = 0; i < length - 1; ++i) aom_write_bit(w, 0); + + for (i = length - 1; i >= 0; --i) aom_write_bit(w, (x >> i) & 0x01); +} + +static INLINE tran_low_t get_lower_coeff(tran_low_t qc) { + if (qc == 0) { + return 0; + } + return qc > 0 ? 
qc - 1 : qc + 1; +} + +static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int coeff_idx, + int dqv, int shift, + const qm_val_t *iqmatrix) { + int sign = qc < 0 ? -1 : 1; + if (iqmatrix != NULL) + dqv = + ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + return sign * ((abs(qc) * dqv) >> shift); +} + +static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff, + int shift) { + const int64_t diff = (tcoeff - dqcoeff) * (1 << shift); + const int64_t error = diff * diff; + return error; +} + +static const int8_t eob_to_pos_small[33] = { + 0, 1, 2, // 0-2 + 3, 3, // 3-4 + 4, 4, 4, 4, // 5-8 + 5, 5, 5, 5, 5, 5, 5, 5, // 9-16 + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32 +}; + +static const int8_t eob_to_pos_large[17] = { + 6, // place holder + 7, // 33-64 + 8, 8, // 65-128 + 9, 9, 9, 9, // 129-256 + 10, 10, 10, 10, 10, 10, 10, 10, // 257-512 + 11 // 513- +}; + +static INLINE int get_eob_pos_token(const int eob, int *const extra) { + int t; + + if (eob < 33) { + t = eob_to_pos_small[eob]; + } else { + const int e = AOMMIN((eob - 1) >> 5, 16); + t = eob_to_pos_large[e]; + } + + *extra = eob - av1_eob_group_start[t]; + + return t; +} + +#if CONFIG_ENTROPY_STATS +void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, PLANE_TYPE plane, + FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { +#else +void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class, + PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx, + uint8_t allow_update_cdf) { +#endif + int eob_extra; + const int eob_pt = get_eob_pos_token(eob, &eob_extra); + TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 
0 : 1; + + switch (eob_multi_size) { + case 0: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5); + break; + case 1: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6); + break; + case 2: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) + update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7); + break; + case 3: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1, + 8); + } + break; + case 4: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1, + 9); + } + break; + case 5: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1, + 10); + } + break; + case 6: + default: +#if CONFIG_ENTROPY_STATS + ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1]; +#endif + if (allow_update_cdf) { + update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1, + 11); + } + break; + } + + if (av1_eob_offset_bits[eob_pt] > 0) { + int eob_ctx = eob_pt - 3; + int eob_shift = av1_eob_offset_bits[eob_pt] - 1; + int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; +#if CONFIG_ENTROPY_STATS + counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2); + } +} + +static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs, + const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) { + int eob_extra; + const int eob_pt = get_eob_pos_token(eob, &eob_extra); + int eob_cost = 0; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1; + eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1]; + + if (av1_eob_offset_bits[eob_pt] > 0) { + const int eob_ctx = eob_pt - 3; + const int eob_shift = av1_eob_offset_bits[eob_pt] - 1; + const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit]; + const int offset_bits = av1_eob_offset_bits[eob_pt]; + if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1); + } + return eob_cost; +} + +static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx, + const int (*dc_sign_cost)[2], + int dc_sign_ctx) { + if (coeff_idx == 0) { + const int sign = (qc < 0) ? 
1 : 0; + return dc_sign_cost[dc_sign_ctx][sign]; + } + return av1_cost_literal(1); +} + +static const int golomb_bits_cost[32] = { + 0, 512, 512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5, + 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, + 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, + 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9 +}; +static const int golomb_cost_diff[32] = { + 0, 512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0, + 512 * 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static INLINE int get_golomb_cost(int abs_qc) { + if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { + const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + const int length = get_msb(r) + 1; + return av1_cost_literal(2 * length - 1); + } + return 0; +} + +static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps, + int *diff) { + const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + int golomb_bits = 0; + if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) + *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1]; + + if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) { + int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + if (r < 32) { + golomb_bits = golomb_bits_cost[r]; + *diff += golomb_cost_diff[r]; + } else { + golomb_bits = get_golomb_cost(level); + *diff += (r & (r - 1)) == 0 ? 1024 : 0; + } + } + + return coeff_lps[base_range] + golomb_bits; +} + +static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) { + const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + return coeff_lps[base_range] + get_golomb_cost(level); +} + +static int get_coeff_cost(const tran_low_t qc, const int scan_idx, + const int is_eob, const TxbInfo *const txb_info, + const LV_MAP_COEFF_COST *const txb_costs, + const int coeff_ctx, const TX_CLASS tx_class) { + const TXB_CTX *const txb_ctx = txb_info->txb_ctx; + const int is_nz = (qc != 0); + const tran_low_t abs_qc = abs(qc); + int cost = 0; + const int16_t *const scan = txb_info->scan_order->scan; + const int pos = scan[scan_idx]; + + if (is_eob) { + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + } else { + cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + } + if (is_nz) { + cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost, + txb_ctx->dc_sign_ctx); + + if (abs_qc > NUM_BASE_LEVELS) { + const int ctx = + get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[ctx]); + } + } + return cost; +} + +static INLINE int get_nz_map_ctx(const uint8_t *const levels, + const int coeff_idx, const int bwl, + const int height, const int scan_idx, + const int is_eob, const TX_SIZE tx_size, + const TX_CLASS tx_class) { + if (is_eob) { + if (scan_idx == 0) return 0; + if (scan_idx <= (height << bwl) / 8) return 1; + if (scan_idx <= (height << bwl) / 4) return 2; + return 3; + } + const int stats = + get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class); + return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class); +} + +static void get_dist_cost_stats(LevelDownStats *const stats, const int scan_idx, + const int is_eob, + const LV_MAP_COEFF_COST *const txb_costs, + const TxbInfo *const txb_info, + const TX_CLASS tx_class) { + const int16_t *const scan = txb_info->scan_order->scan; + const int coeff_idx = scan[scan_idx]; + const tran_low_t qc = 
txb_info->qcoeff[coeff_idx]; + const uint8_t *const levels = txb_info->levels; + stats->new_eob = -1; + stats->update = 0; + stats->rd_low = 0; + stats->rd = 0; + stats->nz_rd = 0; + stats->dist_low = 0; + stats->rate_low = 0; + stats->low_qc = 0; + + const tran_low_t tqc = txb_info->tcoeff[coeff_idx]; + const int dqv = txb_info->dequant[coeff_idx != 0]; + const int coeff_ctx = + get_nz_map_ctx(levels, coeff_idx, txb_info->bwl, txb_info->height, + scan_idx, is_eob, txb_info->tx_size, tx_class); + const int qc_cost = get_coeff_cost(qc, scan_idx, is_eob, txb_info, txb_costs, + coeff_ctx, tx_class); + assert(qc != 0); + const tran_low_t dqc = qcoeff_to_dqcoeff(qc, coeff_idx, dqv, txb_info->shift, + txb_info->iqmatrix); + const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift); + + // distortion difference when coefficient is quantized to 0 + const tran_low_t dqc0 = + qcoeff_to_dqcoeff(0, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix); + + stats->dist0 = get_coeff_dist(tqc, dqc0, txb_info->shift); + stats->dist = dqc_dist - stats->dist0; + stats->rate = qc_cost; + + stats->rd = RDCOST(txb_info->rdmult, stats->rate, stats->dist); + + stats->low_qc = get_lower_coeff(qc); + + if (is_eob && stats->low_qc == 0) { + stats->rd_low = stats->rd; // disable selection of low_qc in this case. + } else { + if (stats->low_qc == 0) { + stats->dist_low = 0; + } else { + stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, coeff_idx, dqv, + txb_info->shift, txb_info->iqmatrix); + const int64_t low_dqc_dist = + get_coeff_dist(tqc, stats->low_dqc, txb_info->shift); + stats->dist_low = low_dqc_dist - stats->dist0; + } + const int low_qc_cost = + get_coeff_cost(stats->low_qc, scan_idx, is_eob, txb_info, txb_costs, + coeff_ctx, tx_class); + stats->rate_low = low_qc_cost; + stats->rd_low = RDCOST(txb_info->rdmult, stats->rate_low, stats->dist_low); + } +} + +static void get_dist_cost_stats_with_eob( + LevelDownStats *const stats, const int scan_idx, + const LV_MAP_COEFF_COST *const txb_costs, const TxbInfo *const txb_info, + const TX_CLASS tx_class) { + const int is_eob = 0; + get_dist_cost_stats(stats, scan_idx, is_eob, txb_costs, txb_info, tx_class); + + const int16_t *const scan = txb_info->scan_order->scan; + const int coeff_idx = scan[scan_idx]; + const tran_low_t qc = txb_info->qcoeff[coeff_idx]; + const int coeff_ctx_temp = get_nz_map_ctx( + txb_info->levels, coeff_idx, txb_info->bwl, txb_info->height, scan_idx, 1, + txb_info->tx_size, tx_class); + const int qc_eob_cost = get_coeff_cost(qc, scan_idx, 1, txb_info, txb_costs, + coeff_ctx_temp, tx_class); + int64_t rd_eob = RDCOST(txb_info->rdmult, qc_eob_cost, stats->dist); + if (stats->low_qc != 0) { + const int low_qc_eob_cost = + get_coeff_cost(stats->low_qc, scan_idx, 1, txb_info, txb_costs, + coeff_ctx_temp, tx_class); + int64_t rd_eob_low = + RDCOST(txb_info->rdmult, low_qc_eob_cost, stats->dist_low); + rd_eob = (rd_eob > rd_eob_low) ? 
rd_eob_low : rd_eob; + } + + stats->nz_rd = AOMMIN(stats->rd_low, stats->rd) - rd_eob; +} + +static INLINE void update_qcoeff(const int coeff_idx, const tran_low_t qc, + const TxbInfo *const txb_info) { + txb_info->qcoeff[coeff_idx] = qc; + txb_info->levels[get_padded_idx(coeff_idx, txb_info->bwl)] = + (uint8_t)clamp(abs(qc), 0, INT8_MAX); +} + +static INLINE void update_coeff(const int coeff_idx, const tran_low_t qc, + const TxbInfo *const txb_info) { + update_qcoeff(coeff_idx, qc, txb_info); + const int dqv = txb_info->dequant[coeff_idx != 0]; + txb_info->dqcoeff[coeff_idx] = qcoeff_to_dqcoeff( + qc, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix); +} + +void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = width + TX_PAD_HOR; + uint8_t *ls = levels; + + memset(levels + stride * height, 0, + sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); + + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + *ls++ = (uint8_t)clamp(abs(coeff[i * width + j]), 0, INT8_MAX); + } + for (int j = 0; j < TX_PAD_HOR; j++) { + *ls++ = 0; + } + } +} + +void av1_get_nz_map_contexts_c(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int bwl = get_txb_bwl(tx_size); + const int height = get_txb_high(tx_size); + for (int i = 0; i < eob; ++i) { + const int pos = scan[i]; + coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bwl, height, i, + i == eob - 1, tx_size, tx_class); + } +} + +void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x, + aom_writer *w, int blk_row, int blk_col, int plane, + int block, TX_SIZE tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; + const int txb_offset = + x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; + const uint16_t eob = eob_txb[block]; + const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; + const int txb_skip_ctx = entropy_ctx[block] & TXB_SKIP_CTX_MASK; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2); + if (eob == 0) return; + + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_TYPE tx_type = + av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + // Only y plane's tx_type is transmitted + if (plane == 0) { + av1_write_tx_type(cm, xd, tx_type, tx_size, w); + } + + int eob_extra; + const int eob_pt = get_eob_pos_token(eob, &eob_extra); + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 
0 : 1; + switch (eob_multi_size) { + case 0: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5); + break; + case 1: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6); + break; + case 2: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7); + break; + case 3: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8); + break; + case 4: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9); + break; + case 5: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10); + break; + default: + aom_write_symbol(w, eob_pt - 1, + ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11); + break; + } + + const int eob_offset_bits = av1_eob_offset_bits[eob_pt]; + if (eob_offset_bits > 0) { + const int eob_ctx = eob_pt - 3; + int eob_shift = eob_offset_bits - 1; + int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + aom_write_symbol(w, bit, + ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2); + for (int i = 1; i < eob_offset_bits; i++) { + eob_shift = eob_offset_bits - 1 - i; + bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; + aom_write_bit(w, bit); + } + } + + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + const tran_low_t *tcoeff_txb = + cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset; + const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block); + av1_txb_init_levels(tcoeff, width, height, levels); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); + + const int bwl = get_txb_bwl(tx_size); + for (int c = eob - 1; c >= 0; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = tcoeff[pos]; + const tran_low_t level = abs(v); + + if (c == eob - 1) { + aom_write_symbol( + w, AOMMIN(level, 3) - 1, + ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3); + } else { + aom_write_symbol(w, AOMMIN(level, 3), + ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx], + 4); + } + if (level > NUM_BASE_LEVELS) { + // level is above 1. + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + aom_cdf_prob *cdf = + ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx]; + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + aom_write_symbol(w, k, cdf, BR_CDF_SIZE); + if (k < BR_CDF_SIZE - 1) break; + } + } + } + + // Loop to code all signs in the transform block, + // starting with the sign of DC (if applicable) + for (int c = 0; c < eob; ++c) { + const tran_low_t v = tcoeff[scan[c]]; + const tran_low_t level = abs(v); + const int sign = (v < 0) ? 
1 : 0; + if (level) { + if (c == 0) { + const int dc_sign_ctx = + (entropy_ctx[block] >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK; + aom_write_symbol(w, sign, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], + 2); + } else { + aom_write_bit(w, sign); + } + if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS) + write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS); + } + } +} + +typedef struct encode_txb_args { + const AV1_COMMON *cm; + MACROBLOCK *x; + aom_writer *w; +} ENCODE_TXB_ARGS; + +void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, + aom_writer *w, BLOCK_SIZE bsize) { + MACROBLOCKD *xd = &x->e_mbd; + const int num_planes = av1_num_planes(cm); + int block[MAX_MB_PLANE] = { 0 }; + int row, col; + assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x, + xd->plane[0].subsampling_y)); + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + const BLOCK_SIZE max_unit_bsize = BLOCK_64X64; + int mu_blocks_wide = mi_size_wide[max_unit_bsize]; + int mu_blocks_high = mi_size_high[max_unit_bsize]; + mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide); + mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high); + + for (row = 0; row < max_blocks_high; row += mu_blocks_high) { + for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) { + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + const int step = stepr * stepc; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int unit_height = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y); + const int unit_width = ROUND_POWER_OF_TWO( + AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x); + for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height; + blk_row += stepr) { + for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width; + blk_col += stepc) { + av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane, + block[plane], tx_size); + block[plane] += step; + } + } + } + } + } +} + +// TODO(angiebird): use this function whenever it's possible +static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd, + int plane, TX_SIZE tx_size, TX_TYPE tx_type, + int reduced_tx_set_used) { + if (plane > 0) return 0; + + const TX_SIZE square_tx_size = txsize_sqr_map[tx_size]; + + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 && + !xd->lossless[xd->mi[0]->segment_id]) { + const int ext_tx_set = + get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used); + if (is_inter) { + if (ext_tx_set > 0) + return x->inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type]; + } else { + if (ext_tx_set > 0) { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode]; + else + intra_dir = mbmi->mode; + return x->intra_tx_type_costs[ext_tx_set][square_tx_size][intra_dir] + [tx_type]; + } + } + } + return 0; +} + +static INLINE void update_coeff_eob_fast(int *eob, int shift, + const int16_t *dequant_ptr, + const int16_t *scan, + const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr) { + // TODO(sarahparker) make this work for aomqm + int eob_out = *eob; 
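+  // zbin below is a "zero bin" threshold of dequant * (1 + 70/128), roughly
+  // 1.55x the dequant step. Trailing coefficients whose scaled magnitude falls
+  // inside this bin (or that are already zero) are zeroed out and the
+  // end-of-block position is shortened accordingly.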
+ int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7), + dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) }; + + for (int i = *eob - 1; i >= 0; i--) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) { + eob_out--; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } else { + break; + } + } + + *eob = eob_out; +} + +static AOM_FORCE_INLINE int warehouse_efficients_txb( + const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, + const struct macroblock_plane *p, const int eob, + const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs, + const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class, + int reduced_tx_set_used) { + const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block); + const int txb_skip_ctx = txb_ctx->txb_skip_ctx; + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const int16_t *const scan = scan_order->scan; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *const eob_costs = + &x->eob_costs[eob_multi_size][plane_type]; + int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; + + av1_txb_init_levels(qcoeff, width, height, levels); + + cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used); + + cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class); + + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); + + const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] = + coeff_costs->lps_cost; + int c = eob - 1; + { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const int sign = AOMSIGN(v); + const int level = (v ^ sign) - sign; + const int coeff_ctx = coeff_contexts[pos]; + cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1]; + + if (v) { + // sign bit cost + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx_eob(pos, bwl, tx_class); + cost += get_br_cost(level, lps_cost[ctx]); + } + if (c) { + cost += av1_cost_literal(1); + } else { + const int sign01 = (sign ^ sign) - sign; + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; + return cost; + } + } + } + const int(*base_cost)[8] = coeff_costs->base_cost; + for (c = eob - 2; c >= 1; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = qcoeff[pos]; + const int level = abs(v); + cost += base_cost[coeff_ctx][AOMMIN(level, 3)]; + if (v) { + // sign bit cost + cost += av1_cost_literal(1); + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bwl, tx_class); + cost += get_br_cost(level, lps_cost[ctx]); + } + } + } + // c == 0 after previous loop + { + const int pos = scan[c]; + const tran_low_t v = qcoeff[pos]; + const int coeff_ctx = coeff_contexts[pos]; + const int sign = AOMSIGN(v); + const int level = (v ^ sign) - sign; + cost += base_cost[coeff_ctx][AOMMIN(level, 3)]; + + if (v) { + // sign bit cost + const int sign01 = (sign ^ sign) - 
sign; + const int dc_sign_ctx = txb_ctx->dc_sign_ctx; + cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; + if (level > NUM_BASE_LEVELS) { + const int ctx = get_br_ctx(levels, pos, bwl, tx_class); + cost += get_br_cost(level, lps_cost[ctx]); + } + } + } + return cost; +} + +static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian( + const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob, + const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs, + const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class, + int reduced_tx_set_used) { + const int txb_skip_ctx = txb_ctx->txb_skip_ctx; + + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *const eob_costs = + &x->eob_costs[eob_multi_size][plane_type]; + int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; + + cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used); + + cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class); + + cost += av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type); + return cost; +} + +// Look up table of individual cost of coefficient by its quantization level. +// determined based on Laplacian distribution conditioned on estimated context +static const int costLUT[15] = { -1143, 53, 545, 825, 1031, + 1209, 1393, 1577, 1763, 1947, + 2132, 2317, 2501, 2686, 2871 }; +static const int const_term = (1 << AV1_PROB_COST_SHIFT); +static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000; +int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type) { + assert(plane == 0); + + int cost = 0; + const struct macroblock_plane *p = &x->plane[plane]; + const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); + const int16_t *scan = scan_order->scan; + tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block); + + int eob = p->eobs[block]; + + // coeffs + int c = eob - 1; + // eob + { + const int pos = scan[c]; + const tran_low_t v = abs(qcoeff[pos]) - 1; + cost += (v << (AV1_PROB_COST_SHIFT + 2)); + } + // other coeffs + for (c = eob - 2; c >= 0; c--) { + const int pos = scan[c]; + const tran_low_t v = abs(qcoeff[pos]); + const int idx = AOMMIN(v, 14); + + cost += costLUT[idx]; + } + + // const_term does not contain DC, and log(e) does not contain eob, so both + // (eob-1) + cost += (const_term + loge_par) * (eob - 1); + + return cost; +} + +int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int reduced_tx_set_used) { + const struct macroblock_plane *p = &x->plane[plane]; + const int eob = p->eobs[block]; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + const LV_MAP_COEFF_COST *const coeff_costs = + &x->coeff_costs[txs_ctx][plane_type]; + if (eob == 0) { + return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + } + + const MACROBLOCKD *const xd = &x->e_mbd; + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + + return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, p, eob, + plane_type, coeff_costs, xd, tx_type, + tx_class, reduced_tx_set_used); +} + +int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, + const int reduced_tx_set_used, + const int 
adjust_eob) { + const struct macroblock_plane *p = &x->plane[plane]; + int eob = p->eobs[block]; + + if (adjust_eob) { + const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); + const int16_t *scan = scan_order->scan; + tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block); + const MACROBLOCKD *xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block); + update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan, + tcoeff, qcoeff, dqcoeff); + p->eobs[block] = eob; + } + + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const PLANE_TYPE plane_type = get_plane_type(plane); + const LV_MAP_COEFF_COST *const coeff_costs = + &x->coeff_costs[txs_ctx][plane_type]; + if (eob == 0) { + return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + } + + const MACROBLOCKD *const xd = &x->e_mbd; + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + + return warehouse_efficients_txb_laplacian( + x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd, + tx_type, tx_class, reduced_tx_set_used); +} + +static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, int *rate_cost) { + int update = 0; + if (txb_info->eob == 0) return update; + const int16_t *const scan = txb_info->scan_order->scan; + // forward optimize the nz_map` + const int init_eob = txb_info->eob; + const TX_CLASS tx_class = tx_type_to_class[txb_info->tx_type]; + const int eob_cost = + get_eob_cost(init_eob, txb_eob_costs, txb_costs, tx_class); + + // backward optimize the level-k map + int accu_rate = eob_cost; + int64_t accu_dist = 0; + int64_t prev_eob_rd_cost = INT64_MAX; + int64_t cur_eob_rd_cost = 0; + + { + const int si = init_eob - 1; + const int coeff_idx = scan[si]; + LevelDownStats stats; + get_dist_cost_stats(&stats, si, si == init_eob - 1, txb_costs, txb_info, + tx_class); + if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) { + update = 1; + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; + } + } + + int si = init_eob - 2; + int8_t has_nz_tail = 0; + // eob is not fixed + for (; si >= 0 && has_nz_tail < 2; --si) { + assert(si != init_eob - 1); + const int coeff_idx = scan[si]; + tran_low_t qc = txb_info->qcoeff[coeff_idx]; + + if (qc == 0) { + const int coeff_ctx = + get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl, + txb_info->tx_size, tx_class); + accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + LevelDownStats stats; + get_dist_cost_stats_with_eob(&stats, si, txb_costs, txb_info, tx_class); + // check if it is better to make this the last significant coefficient + int cur_eob_rate = + get_eob_cost(si + 1, txb_eob_costs, txb_costs, tx_class); + cur_eob_rd_cost = RDCOST(txb_info->rdmult, cur_eob_rate, 0); + prev_eob_rd_cost = + RDCOST(txb_info->rdmult, accu_rate, accu_dist) + stats.nz_rd; + if (cur_eob_rd_cost <= prev_eob_rd_cost) { + update = 1; + for (int j = si + 1; j < txb_info->eob; j++) { + const int coeff_pos_j = scan[j]; + update_coeff(coeff_pos_j, 0, txb_info); + } + txb_info->eob = si + 1; + + // rerun cost calculation due to change of eob + accu_rate = cur_eob_rate; + accu_dist = 0; + get_dist_cost_stats(&stats, si, 1, txb_costs, txb_info, tx_class); + if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) { + update = 1; + 
update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; + } + + // reset non zero tail when new eob is found + has_nz_tail = 0; + } else { + int bUpdCoeff = 0; + if (stats.rd_low < stats.rd) { + if ((si < txb_info->eob - 1)) { + bUpdCoeff = 1; + update = 1; + } + } else { + ++has_nz_tail; + } + + if (bUpdCoeff) { + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; + } + } + } + } // for (si) + + // eob is fixed + for (; si >= 0; --si) { + assert(si != init_eob - 1); + const int coeff_idx = scan[si]; + tran_low_t qc = txb_info->qcoeff[coeff_idx]; + + if (qc == 0) { + const int coeff_ctx = + get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl, + txb_info->tx_size, tx_class); + accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + LevelDownStats stats; + get_dist_cost_stats(&stats, si, 0, txb_costs, txb_info, tx_class); + + int bUpdCoeff = 0; + if (stats.rd_low < stats.rd) { + if ((si < txb_info->eob - 1)) { + bUpdCoeff = 1; + update = 1; + } + } + if (bUpdCoeff) { + update_coeff(coeff_idx, stats.low_qc, txb_info); + accu_rate += stats.rate_low; + accu_dist += stats.dist_low; + } else { + accu_rate += stats.rate; + accu_dist += stats.dist; + } + } + } // for (si) + + int non_zero_blk_rate = + txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][0]; + prev_eob_rd_cost = + RDCOST(txb_info->rdmult, accu_rate + non_zero_blk_rate, accu_dist); + + int zero_blk_rate = + txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][1]; + int64_t zero_blk_rd_cost = RDCOST(txb_info->rdmult, zero_blk_rate, 0); + if (zero_blk_rd_cost <= prev_eob_rd_cost) { + update = 1; + for (int j = 0; j < txb_info->eob; j++) { + const int coeff_pos_j = scan[j]; + update_coeff(coeff_pos_j, 0, txb_info); + } + txb_info->eob = 0; + } + + // record total rate cost + *rate_cost = zero_blk_rd_cost <= prev_eob_rd_cost + ? zero_blk_rate + : accu_rate + non_zero_blk_rate; + + if (txb_info->eob > 0) { + *rate_cost += txb_info->tx_type_cost; + } + + return update; +} + +#if CONFIG_HTB_TRELLIS +static void hbt_init() { + hbt_hash_table = + aom_malloc(sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH); + memset(hbt_hash_table, 0, + sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH); + av1_crc32c_calculator_init(&crc_calculator); // 31 bit: qc & ctx + + hbt_needs_init = 0; +} + +void hbt_destroy() { aom_free(hbt_hash_table); } + +static int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash, + TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, + const struct macroblock_plane *p, int block, + int fast_mode, int *rate_cost) { + (void)fast_mode; + const int16_t *scan = txb_info->scan_order->scan; + int prev_eob = txb_info->eob; + assert(HBT_EOB <= 16); // Lengthen array if allowing longer eob. + int32_t prev_coeff[16]; + for (int i = 0; i < prev_eob; i++) { + prev_coeff[i] = txb_info->qcoeff[scan[i]]; + } + for (int i = prev_eob; i < HBT_EOB; i++) { + prev_coeff[i] = 0; // For compiler piece of mind. 
+ } + + av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height, + txb_info->levels); + + const int update = + optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost); + + // Overwrite old entry + uint16_t hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE; + uint16_t hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .rate_cost = *rate_cost; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index].init = 1; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_qc_hash = hbt_qc_hash; + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_ctx_hash = hbt_ctx_hash; + assert(prev_eob >= txb_info->eob); // eob can't get longer + for (int i = 0; i < txb_info->eob; i++) { + // Record how coeff changed. Convention: towards zero is negative. + if (txb_info->qcoeff[scan[i]] > 0) + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = txb_info->qcoeff[scan[i]] - prev_coeff[i]; + else + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = prev_coeff[i] - txb_info->qcoeff[scan[i]]; + } + for (int i = txb_info->eob; i < prev_eob; i++) { + // If eob got shorter, record that all after it changed to zero. + if (prev_coeff[i] > 0) + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = -prev_coeff[i]; + else + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = prev_coeff[i]; + } + for (int i = prev_eob; i < HBT_EOB; i++) { + // Record 'no change' after optimized coefficients run out. + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] = 0; + } + + if (update) { + p->eobs[block] = txb_info->eob; + p->txb_entropy_ctx[block] = av1_get_txb_entropy_context( + txb_info->qcoeff, txb_info->scan_order, txb_info->eob); + } + return txb_info->eob; +} + +static int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index, + TxbInfo *txb_info, const struct macroblock_plane *p, + int block, int *rate_cost) { + const int16_t *scan = txb_info->scan_order->scan; + int new_eob = 0; + int update = 0; + + for (int i = 0; i < txb_info->eob; i++) { + // Delta convention is negatives go towards zero, so only apply those ones. 
+ if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i] < 0) { + if (txb_info->qcoeff[scan[i]] > 0) + txb_info->qcoeff[scan[i]] += + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i]; + else + txb_info->qcoeff[scan[i]] -= + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .deltas[i]; + + update = 1; + update_coeff(scan[i], txb_info->qcoeff[scan[i]], txb_info); + } + if (txb_info->qcoeff[scan[i]]) new_eob = i + 1; + } + + // Rate_cost can be calculated here instead (av1_cost_coeffs_txb), but + // it is expensive and gives little benefit as long as qc_hash is high bit + *rate_cost = + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .rate_cost; + + if (update) { + txb_info->eob = new_eob; + p->eobs[block] = txb_info->eob; + p->txb_entropy_ctx[block] = av1_get_txb_entropy_context( + txb_info->qcoeff, txb_info->scan_order, txb_info->eob); + } + + return txb_info->eob; +} + +static int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash, + TxbInfo *txb_info, + const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, + const struct macroblock_plane *p, int block, + int fast_mode, int *rate_cost) { + // Check for qcoeff match + int hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH; + int hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE; + + if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_qc_hash == hbt_qc_hash && + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .hbt_ctx_hash == hbt_ctx_hash && + hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index] + .init) { + return hbt_hash_hit(hbt_table_index, hbt_array_index, txb_info, p, block, + rate_cost); + } else { + return hbt_hash_miss(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs, + txb_eob_costs, p, block, fast_mode, rate_cost); + } +} + +static int hbt_create_hashes(TxbInfo *txb_info, + const LV_MAP_COEFF_COST *txb_costs, + const LV_MAP_EOB_COST *txb_eob_costs, + const struct macroblock_plane *p, int block, + int fast_mode, int *rate_cost) { + // Initialize hash table if needed. + if (hbt_needs_init) { + hbt_init(); + } + + //// Hash creation + uint8_t txb_hash_data[256]; // Asserts below to ensure enough space. + const int16_t *scan = txb_info->scan_order->scan; + uint8_t chunk = 0; + int hash_data_index = 0; + + // Make qc_hash. + int packing_index = 0; // needed for packing. + for (int i = 0; i < txb_info->eob; i++) { + tran_low_t prechunk = txb_info->qcoeff[scan[i]]; + + // Softening: Improves speed. Aligns with signed deltas. + if (prechunk < 0) prechunk *= -1; + + // Early kick out: Don't apply feature if there are large coeffs: + // If this kickout value is removed or raised beyond int8_t, + // widen deltas type in OptTxbQcoeff struct. + assert((int8_t)HBT_KICKOUT == HBT_KICKOUT); // If not, widen types. 
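+    // A coefficient magnitude above HBT_KICKOUT would not fit the 2-bit
+    // packing used for the hash below, so fall back to the full trellis
+    // optimization and return.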
+ if (prechunk > HBT_KICKOUT) { + av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height, + txb_info->levels); + + const int update = + optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost); + + if (update) { + p->eobs[block] = txb_info->eob; + p->txb_entropy_ctx[block] = av1_get_txb_entropy_context( + txb_info->qcoeff, txb_info->scan_order, txb_info->eob); + } + return txb_info->eob; + } + + // Since coeffs are 0 to 3, only 2 bits are needed: pack into bytes + if (packing_index == 0) txb_hash_data[hash_data_index] = 0; + chunk = prechunk << packing_index; + packing_index += 2; + txb_hash_data[hash_data_index] |= chunk; + + // Full byte: + if (packing_index == 8) { + packing_index = 0; + hash_data_index++; + } + } + // Needed when packing_index != 0, to include final byte. + hash_data_index++; + assert(hash_data_index <= 64); + // 31 bit qc_hash: index to array + uint32_t hbt_qc_hash = + av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index); + + // Make ctx_hash. + hash_data_index = 0; + tran_low_t prechunk; + + for (int i = 0; i < txb_info->eob; i++) { + // Save as magnitudes towards or away from zero. + if (txb_info->tcoeff[scan[i]] >= 0) + prechunk = txb_info->tcoeff[scan[i]] - txb_info->dqcoeff[scan[i]]; + else + prechunk = txb_info->dqcoeff[scan[i]] - txb_info->tcoeff[scan[i]]; + + chunk = prechunk & 0xff; + txb_hash_data[hash_data_index++] = chunk; + } + + // Extra ctx data: + // Include dequants. + txb_hash_data[hash_data_index++] = txb_info->dequant[0] & 0xff; + txb_hash_data[hash_data_index++] = txb_info->dequant[1] & 0xff; + chunk = txb_info->txb_ctx->txb_skip_ctx & 0xff; + txb_hash_data[hash_data_index++] = chunk; + chunk = txb_info->txb_ctx->dc_sign_ctx & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // eob + chunk = txb_info->eob & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // rdmult (int64) + chunk = txb_info->rdmult & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // tx_type + chunk = txb_info->tx_type & 0xff; + txb_hash_data[hash_data_index++] = chunk; + // base_eob_cost + for (int i = 1; i < 3; i++) { // i = 0 are softened away + for (int j = 0; j < SIG_COEF_CONTEXTS_EOB; j++) { + chunk = (txb_costs->base_eob_cost[j][i] & 0xff00) >> 8; + txb_hash_data[hash_data_index++] = chunk; + } + } + // eob_cost + for (int i = 0; i < 11; i++) { + for (int j = 0; j < 2; j++) { + chunk = (txb_eob_costs->eob_cost[j][i] & 0xff00) >> 8; + txb_hash_data[hash_data_index++] = chunk; + } + } + // dc_sign_cost + for (int i = 0; i < 2; i++) { + for (int j = 0; j < DC_SIGN_CONTEXTS; j++) { + chunk = (txb_costs->dc_sign_cost[j][i] & 0xff00) >> 8; + txb_hash_data[hash_data_index++] = chunk; + } + } + + assert(hash_data_index <= 256); + // 31 bit ctx_hash: used to index table + uint32_t hbt_ctx_hash = + av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index); + //// End hash creation + + return hbt_search_match(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs, + txb_eob_costs, p, block, fast_mode, rate_cost); +} +#endif // CONFIG_HTB_TRELLIS + +static AOM_FORCE_INLINE int get_two_coeff_cost_simple( + int ci, tran_low_t abs_qc, int coeff_ctx, + const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class, + const uint8_t *levels, int *cost_low) { + // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) + // and not the last (scan_idx != eob - 1) + assert(ci > 0); + int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + int diff = 0; + if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc 
+ 4]; + if (abs_qc) { + cost += av1_cost_literal(1); + if (abs_qc > NUM_BASE_LEVELS) { + const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class); + int brcost_diff = 0; + cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx], + &brcost_diff); + diff += brcost_diff; + } + } + *cost_low = cost - diff; + + return cost; +} + +static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign, + int coeff_ctx, int dc_sign_ctx, + const LV_MAP_COEFF_COST *txb_costs, + int bwl, TX_CLASS tx_class) { + int cost = 0; + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + if (abs_qc != 0) { + if (ci == 0) { + cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; + } else { + cost += av1_cost_literal(1); + } + if (abs_qc > NUM_BASE_LEVELS) { + int br_ctx; + br_ctx = get_br_ctx_eob(ci, bwl, tx_class); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]); + } + } + return cost; +} + +static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc, + int sign, int coeff_ctx, + int dc_sign_ctx, + const LV_MAP_COEFF_COST *txb_costs, + int bwl, TX_CLASS tx_class, + const uint8_t *levels) { + int cost = 0; + if (is_last) { + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + } else { + cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + } + if (abs_qc != 0) { + if (ci == 0) { + cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; + } else { + cost += av1_cost_literal(1); + } + if (abs_qc > NUM_BASE_LEVELS) { + int br_ctx; + if (is_last) + br_ctx = get_br_ctx_eob(ci, bwl, tx_class); + else + br_ctx = get_br_ctx(levels, ci, bwl, tx_class); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]); + } + } + return cost; +} + +static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv, + int shift, tran_low_t *qc_low, + tran_low_t *dqc_low) { + tran_low_t abs_qc_low = abs_qc - 1; + *qc_low = (-sign ^ abs_qc_low) + sign; + assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low); + tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; + *dqc_low = (-sign ^ abs_dqc_low) + sign; + assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low); +} + +static INLINE void update_coeff_general( + int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size, + TX_CLASS tx_class, int bwl, int height, int64_t rdmult, int shift, + int dc_sign_ctx, const int16_t *dequant, const int16_t *scan, + const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, + const qm_val_t *iqmatrix) { + const int dqv = get_dqv(dequant, scan[si], iqmatrix); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int is_last = si == (eob - 1); + const int coeff_ctx = get_lower_levels_ctx_general( + is_last, si, bwl, height, levels, ci, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + const int sign = (qc < 0) ? 
1 : 0; + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int64_t dist = get_coeff_dist(tqc, dqc, shift); + const int64_t dist0 = get_coeff_dist(tqc, 0, shift); + const int rate = + get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + const int64_t rd = RDCOST(rdmult, rate, dist); + + tran_low_t qc_low, dqc_low; + tran_low_t abs_qc_low; + int64_t dist_low, rd_low; + int rate_low; + if (abs_qc == 1) { + abs_qc_low = qc_low = dqc_low = 0; + dist_low = dist0; + rate_low = txb_costs->base_cost[coeff_ctx][0]; + } else { + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + abs_qc_low = abs_qc - 1; + dist_low = get_coeff_dist(tqc, dqc_low, shift); + rate_low = + get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + } + + rd_low = RDCOST(rdmult, rate_low, dist_low); + if (rd_low < rd) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); + *accu_rate += rate_low; + *accu_dist += dist_low - dist0; + } else { + *accu_rate += rate; + *accu_dist += dist - dist0; + } + } +} + +static AOM_FORCE_INLINE void update_coeff_simple( + int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class, + int bwl, int64_t rdmult, int shift, const int16_t *dequant, + const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs, + const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff, + uint8_t *levels, const qm_val_t *iqmatrix) { + const int dqv = get_dqv(dequant, scan[si], iqmatrix); + (void)eob; + // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) + // and not the last (scan_idx != eob - 1) + assert(si != eob - 1); + assert(si > 0); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int coeff_ctx = + get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + const tran_low_t abs_qc = abs(qc); + const tran_low_t abs_tqc = abs(tcoeff[ci]); + const tran_low_t abs_dqc = abs(dqcoeff[ci]); + int rate_low = 0; + const int rate = get_two_coeff_cost_simple( + ci, abs_qc, coeff_ctx, txb_costs, bwl, tx_class, levels, &rate_low); + if (abs_dqc < abs_tqc) { + *accu_rate += rate; + return; + } + + const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift); + const int64_t rd = RDCOST(rdmult, rate, dist); + + const tran_low_t abs_qc_low = abs_qc - 1; + const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; + const int64_t dist_low = get_coeff_dist(abs_tqc, abs_dqc_low, shift); + const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low); + + if (rd_low < rd) { + const int sign = (qc < 0) ? 
1 : 0; + qcoeff[ci] = (-sign ^ abs_qc_low) + sign; + dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign; + levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); + *accu_rate += rate_low; + } else { + *accu_rate += rate; + } + } +} + +static AOM_FORCE_INLINE void update_coeff_eob( + int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci, + int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height, + int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant, + const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs, + const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness, + const qm_val_t *iqmatrix) { + const int dqv = get_dqv(dequant, scan[si], iqmatrix); + assert(si != *eob - 1); + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const int coeff_ctx = + get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class); + if (qc == 0) { + *accu_rate += txb_costs->base_cost[coeff_ctx][0]; + } else { + int lower_level = 0; + const tran_low_t abs_qc = abs(qc); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int sign = (qc < 0) ? 1 : 0; + const int64_t dist0 = get_coeff_dist(tqc, 0, shift); + int64_t dist = get_coeff_dist(tqc, dqc, shift) - dist0; + int rate = + get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx, + txb_costs, bwl, tx_class, levels); + int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist); + + tran_low_t qc_low, dqc_low; + tran_low_t abs_qc_low; + int64_t dist_low, rd_low; + int rate_low; + if (abs_qc == 1) { + abs_qc_low = 0; + dqc_low = qc_low = 0; + dist_low = 0; + rate_low = txb_costs->base_cost[coeff_ctx][0]; + rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist); + } else { + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + abs_qc_low = abs_qc - 1; + dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0; + rate_low = + get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low); + } + + int lower_level_new_eob = 0; + const int new_eob = si + 1; + const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bwl, height, si); + const int new_eob_cost = + get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class); + int rate_coeff_eob = + new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob, + dc_sign_ctx, txb_costs, bwl, + tx_class); + int64_t dist_new_eob = dist; + int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob); + + if (abs_qc_low > 0) { + const int rate_coeff_eob_low = + new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign, + coeff_ctx_new_eob, dc_sign_ctx, + txb_costs, bwl, tx_class); + const int64_t dist_new_eob_low = dist_low; + const int64_t rd_new_eob_low = + RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low); + if (rd_new_eob_low < rd_new_eob) { + lower_level_new_eob = 1; + rd_new_eob = rd_new_eob_low; + rate_coeff_eob = rate_coeff_eob_low; + dist_new_eob = dist_new_eob_low; + } + } + + if (rd_low < rd) { + lower_level = 1; + rd = rd_low; + rate = rate_low; + dist = dist_low; + } + + if (sharpness == 0 && rd_new_eob < rd) { + for (int ni = 0; ni < *nz_num; ++ni) { + int last_ci = nz_ci[ni]; + levels[get_padded_idx(last_ci, bwl)] = 0; + qcoeff[last_ci] = 0; + dqcoeff[last_ci] = 0; + } + *eob = new_eob; + *nz_num = 0; + *accu_rate = rate_coeff_eob; + *accu_dist = dist_new_eob; + lower_level = lower_level_new_eob; + } 
else { + *accu_rate += rate; + *accu_dist += dist; + } + + if (lower_level) { + qcoeff[ci] = qc_low; + dqcoeff[ci] = dqc_low; + levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); + } + if (qcoeff[ci]) { + nz_ci[*nz_num] = ci; + ++*nz_num; + } + } +} + +static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob, + int nz_num, int *nz_ci, int64_t rdmult, + int skip_cost, int non_skip_cost, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + int sharpness) { + const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist); + const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0); + if (sharpness == 0 && rd_new_eob < rd) { + for (int i = 0; i < nz_num; ++i) { + const int ci = nz_ci[i]; + qcoeff[ci] = 0; + dqcoeff[ci] = 0; + // no need to set up levels because this is the last step + // levels[get_padded_idx(ci, bwl)] = 0; + } + *accu_rate = 0; + *eob = 0; + } +} + +int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int *rate_cost, + int sharpness, int fast_mode) { + MACROBLOCKD *xd = &x->e_mbd; + struct macroblockd_plane *pd = &xd->plane[plane]; + const struct macroblock_plane *p = &x->plane[plane]; + const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); + const int16_t *scan = scan_order->scan; + const int shift = av1_get_tx_scale(tx_size); + int eob = p->eobs[block]; + const int16_t *dequant = p->dequant_QTX; + const qm_val_t *iqmatrix = + av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type); + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + block_offset; + tran_low_t *dqcoeff = pd->dqcoeff + block_offset; + const tran_low_t *tcoeff = p->coeff + block_offset; + + // This function is not called if eob = 0. + assert(eob > 0); + + if (fast_mode) { + update_coeff_eob_fast(&eob, shift, dequant, scan, tcoeff, qcoeff, dqcoeff); + p->eobs[block] = eob; + if (eob == 0) { + *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size); + return eob; + } + } + + const AV1_COMMON *cm = &cpi->common; + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const MB_MODE_INFO *mbmi = xd->mi[0]; + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + assert(width == (1 << bwl)); + const int is_inter = is_inter_block(mbmi); + const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type]; + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST *txb_eob_costs = + &x->eob_costs[eob_multi_size][plane_type]; + + const int rshift = + (sharpness + + (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4 + ? 7 - mbmi->segment_id + : 2) + + (cpi->oxcf.aq_mode != VARIANCE_AQ && + cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL && + cm->delta_q_info.delta_q_present_flag && x->sb_energy_level < 0 + ? 
(3 - x->sb_energy_level) + : 0)); + const int64_t rdmult = + (((int64_t)x->rdmult * + (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) + + 2) >> + rshift; + + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + + if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels); + + // TODO(angirbird): check iqmatrix + + const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0]; + const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; + const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class); + int accu_rate = eob_cost; + int64_t accu_dist = 0; + int si = eob - 1; + const int ci = scan[si]; + const tran_low_t qc = qcoeff[ci]; + const tran_low_t abs_qc = abs(qc); + const int sign = qc < 0; + const int max_nz_num = 2; + int nz_num = 1; + int nz_ci[3] = { ci, 0, 0 }; + if (abs_qc >= 2) { + update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class, + bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx, + dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, + levels, iqmatrix); + --si; + } else { + assert(abs_qc == 1); + const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, si); + accu_rate += + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx, + txb_costs, bwl, tx_class); + const tran_low_t tqc = tcoeff[ci]; + const tran_low_t dqc = dqcoeff[ci]; + const int64_t dist = get_coeff_dist(tqc, dqc, shift); + const int64_t dist0 = get_coeff_dist(tqc, 0, shift); + accu_dist += dist - dist0; + --si; + } + +#define UPDATE_COEFF_EOB_CASE(tx_class_literal) \ + case tx_class_literal: \ + for (; si >= 0 && nz_num <= max_nz_num && !fast_mode; --si) { \ + update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \ + tx_size, tx_class_literal, bwl, height, \ + txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \ + txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \ + levels, sharpness, iqmatrix); \ + } \ + break; + switch (tx_class) { + UPDATE_COEFF_EOB_CASE(TX_CLASS_2D); + UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ); + UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT); +#undef UPDATE_COEFF_EOB_CASE + default: assert(false); + } + + if (si == -1 && nz_num <= max_nz_num) { + update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost, + non_skip_cost, qcoeff, dqcoeff, sharpness); + } + +#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \ + case tx_class_literal: \ + for (; si >= 1; --si) { \ + update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bwl, \ + rdmult, shift, dequant, scan, txb_costs, tcoeff, \ + qcoeff, dqcoeff, levels, iqmatrix); \ + } \ + break; + switch (tx_class) { + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D); + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ); + UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT); +#undef UPDATE_COEFF_SIMPLE_CASE + default: assert(false); + } + + // DC position + if (si == 0) { + // no need to update accu_dist because it's not used after this point + int64_t dummy_dist = 0; + update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class, + bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx, + dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff, + levels, iqmatrix); + } + + const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type, + cm->features.reduced_tx_set_used); + if (eob == 0) + accu_rate += skip_cost; + else + accu_rate += non_skip_cost + tx_type_cost; + + p->eobs[block] = eob; + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]); + + *rate_cost = 
accu_rate; + return eob; +} + +// This function is deprecated, but we keep it here because hash trellis +// is not integrated with av1_optimize_txb_new yet +int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int blk_row, int blk_col, int block, TX_SIZE tx_size, + TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) { + const AV1_COMMON *cm = &cpi->common; + const int reduced_tx_set_used = cm->features.reduced_tx_set_used; + MACROBLOCKD *const xd = &x->e_mbd; + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, + tx_size, reduced_tx_set_used); + const MB_MODE_INFO *mbmi = xd->mi[0]; + const struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const int eob = p->eobs[block]; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + block_offset; + tran_low_t *dqcoeff = pd->dqcoeff + block_offset; + const tran_low_t *tcoeff = p->coeff + block_offset; + const int16_t *dequant = p->dequant_QTX; + const int seg_eob = av1_get_max_eob(tx_size); + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const int is_inter = is_inter_block(mbmi); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type]; + const int eob_multi_size = txsize_log2_minus4[tx_size]; + const LV_MAP_EOB_COST txb_eob_costs = + x->eob_costs[eob_multi_size][plane_type]; + + const int shift = av1_get_tx_scale(tx_size); + const int64_t rdmult = + (((int64_t)x->rdmult * plane_rd_mult[is_inter][plane_type] + << (2 * (xd->bd - 8))) + + 2) >> + 2; + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + const qm_val_t *iqmatrix = + av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type); + assert(width == (1 << bwl)); + const int tx_type_cost = + get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used); + TxbInfo txb_info = { + qcoeff, levels, dqcoeff, tcoeff, dequant, shift, tx_size, + txs_ctx, tx_type, bwl, width, height, eob, seg_eob, + scan_order, txb_ctx, rdmult, iqmatrix, tx_type_cost, + }; + +#if CONFIG_HTB_TRELLIS + // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls + // by storing the coefficient deltas in a hash table. 
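+  // hbt_create_hashes() (defined above) derives two 31-bit CRC32-C keys: a
+  // qc_hash over the quantized coefficients (clamped to the 0..3 range and
+  // packed four per byte, two bits each), and a ctx_hash over the
+  // coefficient deltas plus the surrounding context (dequants, skip/sign
+  // contexts, eob, rdmult, tx_type, and slices of the cost tables). As an
+  // illustration (not from the source): coefficients {1, 3, 0, 2} pack into
+  // the single byte 1 | (3 << 2) | (0 << 4) | (2 << 6) = 0x8D before
+  // hashing.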
+ // Currently disabled in speedfeatures.c + if (eob <= HBT_EOB && eob > 0 && cpi->sf.use_hash_based_trellis) { + return hbt_create_hashes(&txb_info, txb_costs, &txb_eob_costs, p, block, + fast_mode, rate_cost); + } +#else + (void)fast_mode; +#endif // CONFIG_HTB_TRELLIS + av1_txb_init_levels(qcoeff, width, height, levels); + + const int update = + optimize_txb(&txb_info, txb_costs, &txb_eob_costs, rate_cost); + + if (update) { + p->eobs[block] = txb_info.eob; + p->txb_entropy_ctx[block] = + av1_get_txb_entropy_context(qcoeff, scan_order, txb_info.eob); + } + return txb_info.eob; +} + +int av1_get_txb_entropy_context(const tran_low_t *qcoeff, + const SCAN_ORDER *scan_order, int eob) { + const int16_t *const scan = scan_order->scan; + int cul_level = 0; + int c; + + if (eob == 0) return 0; + for (c = 0; c < eob; ++c) { + cul_level += abs(qcoeff[scan[c]]); + if (cul_level > COEFF_CONTEXT_MASK) break; + } + + cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level); + set_dc_sign(&cul_level, qcoeff[0]); + + return cul_level; +} + +static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm, + MACROBLOCKD *xd, int blk_row, int blk_col, + int plane, TX_SIZE tx_size, + FRAME_COUNTS *counts, + uint8_t allow_update_cdf) { + MB_MODE_INFO *mbmi = xd->mi[0]; + int is_inter = is_inter_block(mbmi); + const int reduced_tx_set_used = cm->features.reduced_tx_set_used; + FRAME_CONTEXT *fc = xd->tile_ctx; +#if !CONFIG_ENTROPY_STATS + (void)counts; +#endif // !CONFIG_ENTROPY_STATS + + // Only y plane's tx_type is updated + if (plane > 0) return; + const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col, + tx_size, reduced_tx_set_used); + if (is_inter) { + if (cpi->oxcf.use_inter_dct_only) { + assert(tx_type == DCT_DCT); + } + } else { + if (cpi->oxcf.use_intra_dct_only) { + assert(tx_type == DCT_DCT); + } else if (cpi->oxcf.use_intra_default_tx_only) { + const TX_TYPE default_type = get_default_tx_type( + PLANE_TYPE_Y, xd, tx_size, cpi->is_screen_content_type); + (void)default_type; + assert(tx_type == default_type); + } + } + + if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 && + cm->quant_params.base_qindex > 0 && !mbmi->skip && + !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used); + if (eset > 0) { + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set_used); + if (is_inter) { + if (allow_update_cdf) { + update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]], + av1_ext_tx_ind[tx_set_type][tx_type], + av1_num_ext_tx_set[tx_set_type]); + } +#if CONFIG_ENTROPY_STATS + ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]] + [av1_ext_tx_ind[tx_set_type][tx_type]]; +#endif // CONFIG_ENTROPY_STATS + } else { + PREDICTION_MODE intra_dir; + if (mbmi->filter_intra_mode_info.use_filter_intra) + intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info + .filter_intra_mode]; + else + intra_dir = mbmi->mode; +#if CONFIG_ENTROPY_STATS + ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir] + [av1_ext_tx_ind[tx_set_type][tx_type]]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf( + fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir], + av1_ext_tx_ind[tx_set_type][tx_type], + av1_num_ext_tx_set[tx_set_type]); + } + } + } + } +} + +void av1_update_and_record_txb_context(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct tokenize_b_args 
*const args = arg; + const AV1_COMP *cpi = args->cpi; + const AV1_COMMON *cm = &cpi->common; + ThreadData *const td = args->td; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const int eob = p->eobs[block]; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *qcoeff = p->qcoeff + block_offset; + const PLANE_TYPE plane_type = pd->plane_type; + const TX_TYPE tx_type = + av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type); + tran_low_t *tcoeff; + assert(args->dry_run != DRY_RUN_COSTCOEFFS); + if (args->dry_run == OUTPUT_ENABLED) { + MB_MODE_INFO *mbmi = xd->mi[0]; + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, + pd->above_entropy_context + blk_col, + pd->left_entropy_context + blk_row, &txb_ctx); + const int bwl = get_txb_bwl(tx_size); + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const uint8_t allow_update_cdf = args->allow_update_cdf; + const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size); + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; +#if CONFIG_ENTROPY_STATS + int cdf_idx = cm->coef_cdf_category; + ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) { + update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], + eob == 0, 2); + } + + CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff; + const int txb_offset = + x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset; + uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset; + entropy_ctx[block] = txb_ctx.txb_skip_ctx; + eob_txb[block] = eob; + + if (eob == 0) { + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, + blk_row); + return; + } + const int segment_id = mbmi->segment_id; + const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); + tran_low_t *tcoeff_txb = + cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset; + tcoeff = tcoeff_txb + block_offset; + memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob); + + uint8_t levels_buf[TX_PAD_2D]; + uint8_t *const levels = set_levels(levels_buf, width); + av1_txb_init_levels(tcoeff, width, height, levels); + update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size, + td->counts, allow_update_cdf); + + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const int16_t *const scan = scan_order->scan; + + // record tx type usage + td->rd_counts.tx_type_used[tx_size][tx_type]++; + +#if CONFIG_ENTROPY_STATS + av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx, + td->counts, allow_update_cdf); +#else + av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx, + allow_update_cdf); +#endif + + DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]); + av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, + coeff_contexts); + + for (int c = eob - 1; c >= 0; --c) { + const int pos = scan[c]; + const int coeff_ctx = coeff_contexts[pos]; + const tran_low_t v = qcoeff[pos]; + const tran_low_t level = abs(v); + + if (allow_update_cdf) { + if (c == eob - 1) { + assert(coeff_ctx < 4); + update_cdf( + ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx], + AOMMIN(level, 3) - 1, 3); + } else { + 
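+          // Positions before the last significant coefficient may be zero,
+          // so the base CDF below has four symbols (clamped levels 0..3);
+          // at eob - 1 the level is known to be nonzero, which is why the
+          // EOB variant above codes level - 1 over three symbols.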
update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx], + AOMMIN(level, 3), 4); + } + } + if (c == eob - 1) { + assert(coeff_ctx < 4); +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3) - 1]; + } else { + ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type] + [coeff_ctx][AOMMIN(level, 3)]; +#endif + } + if (level > NUM_BASE_LEVELS) { + const int base_range = level - 1 - NUM_BASE_LEVELS; + const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { + const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); + if (allow_update_cdf) { + update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx], + k, BR_CDF_SIZE); + } + for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) { +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type] + [lps][br_ctx][lps == k]; +#endif // CONFIG_ENTROPY_STATS + if (lps == k) break; + } +#if CONFIG_ENTROPY_STATS + ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)] + [plane_type][br_ctx][k]; +#endif + if (k < BR_CDF_SIZE - 1) break; + } + } + } + // Update the context needed to code the DC sign (if applicable) + if (tcoeff[0] != 0) { + const int dc_sign = (tcoeff[0] < 0) ? 1 : 0; + const int dc_sign_ctx = txb_ctx.dc_sign_ctx; +#if CONFIG_ENTROPY_STATS + ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign]; +#endif // CONFIG_ENTROPY_STATS + if (allow_update_cdf) + update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2); + entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT; + } + } else { + tcoeff = qcoeff; + } + const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob); + av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, + blk_col, blk_row); +} + +void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td, + RUN_TYPE dry_run, BLOCK_SIZE bsize, + uint8_t allow_update_cdf) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run }; + if (mbmi->skip) { + av1_reset_entropy_context(xd, bsize, num_planes); + return; + } + + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ss_x = pd->subsampling_x; + const int ss_y = pd->subsampling_y; + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y); + av1_foreach_transformed_block_in_plane( + xd, plane_bsize, plane, av1_update_and_record_txb_context, &arg); + } +} + +CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, + int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + const int mib_size_log2 = cm->seq_params.mib_size_log2; + const int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1; + const int offset = + (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); + return cpi->coeff_buffer_base + offset; +} diff --git a/libs/libaom/src/av1/encoder/encodetxb.h b/libs/libaom/src/av1/encoder/encodetxb.h new file mode 100644 index 000000000..7122895d1 --- /dev/null +++ b/libs/libaom/src/av1/encoder/encodetxb.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODETXB_H_ +#define AOM_AV1_ENCODER_ENCODETXB_H_ + +#include "config/aom_config.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/txb_common.h" +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "aom_dsp/bitwriter.h" +#ifdef __cplusplus +extern "C" { +#endif + +#define TXB_SKIP_CTX_MASK 15 +#define DC_SIGN_CTX_SHIFT 4 +#define DC_SIGN_CTX_MASK 3 + +typedef struct TxbInfo { + tran_low_t *qcoeff; + uint8_t *levels; // absolute values and clamped to 255. + tran_low_t *dqcoeff; + const tran_low_t *tcoeff; + const int16_t *dequant; + int shift; + TX_SIZE tx_size; + TX_SIZE txs_ctx; + TX_TYPE tx_type; + int bwl; + int width; + int height; + int eob; + int seg_eob; + const SCAN_ORDER *scan_order; + TXB_CTX *txb_ctx; + int64_t rdmult; + const qm_val_t *iqmatrix; + int tx_type_cost; +} TxbInfo; + +void av1_alloc_txb_buf(AV1_COMP *cpi); +void av1_free_txb_buf(AV1_COMP *cpi); +int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block, + const TX_SIZE tx_size, const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int reduced_tx_set_used); +int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, + const int reduced_tx_set_used, + const int adjust_eob); +int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane, + const int block, const TX_SIZE tx_size, + const TX_TYPE tx_type); +void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x, + aom_writer *w, int blk_row, int blk_col, int plane, + int block, TX_SIZE tx_size); +void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, + aom_writer *w, BLOCK_SIZE bsize); +int av1_get_txb_entropy_context(const tran_low_t *qcoeff, + const SCAN_ORDER *scan_order, int eob); +void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td, + RUN_TYPE dry_run, BLOCK_SIZE bsize, + uint8_t allow_update_cdf); +void av1_update_and_record_txb_context(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg); +#if CONFIG_HTB_TRELLIS +void hbt_destroy(); +#endif // CONFIG_HTB_TRELLIS +int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, int *rate_cost, + int sharpness, int fast_mode); + +CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row, + int mi_col); + +// These numbers are empirically obtained. +static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { + { 17, 13 }, + { 16, 10 }, +}; + +#ifdef __cplusplus +} +#endif + +#endif // AOM_AV1_ENCODER_ENCODETXB_H_ diff --git a/libs/libaom/src/av1/encoder/ethread.c b/libs/libaom/src/av1/encoder/ethread.c new file mode 100644 index 000000000..693270b87 --- /dev/null +++ b/libs/libaom/src/av1/encoder/ethread.c @@ -0,0 +1,729 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/av1_multi_thread.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ethread.h" +#include "av1/encoder/rdopt.h" +#include "aom_dsp/aom_dsp_common.h" + +static AOM_INLINE void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { + for (int i = 0; i < REFERENCE_MODES; i++) + td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i]; + + for (int i = 0; i < REF_FRAMES; i++) + td->rd_counts.global_motion_used[i] += + td_t->rd_counts.global_motion_used[i]; + + td->rd_counts.compound_ref_used_flag |= + td_t->rd_counts.compound_ref_used_flag; + td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag; + + for (int i = 0; i < TX_SIZES_ALL; i++) { + for (int j = 0; j < TX_TYPES; j++) + td->rd_counts.tx_type_used[i][j] += td_t->rd_counts.tx_type_used[i][j]; + } + + for (int i = 0; i < BLOCK_SIZES_ALL; i++) { + for (int j = 0; j < 2; j++) { + td->rd_counts.obmc_used[i][j] += td_t->rd_counts.obmc_used[i][j]; + } + } + + for (int i = 0; i < 2; i++) { + td->rd_counts.warped_used[i] += td_t->rd_counts.warped_used[i]; + } +} + +static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &cpi->td.mb.e_mbd; + const int mib_size = cm->seq_params.mib_size; + const int frame_lf_count = + av1_num_planes(cm) > 1 ? 
FRAME_LF_COUNT : FRAME_LF_COUNT - 2; + for (int row = 0; row < cm->tiles.rows; row++) { + for (int col = 0; col < cm->tiles.cols; col++) { + TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col]; + const TileInfo *const tile_info = &tile_data->tile_info; + for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; + mi_row += mib_size) { + if (mi_row == tile_info->mi_row_start) + av1_reset_loop_filter_delta(xd, av1_num_planes(cm)); + for (int mi_col = tile_info->mi_col_start; + mi_col < tile_info->mi_col_end; mi_col += mib_size) { + const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col; + MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str; + MB_MODE_INFO *mbmi = mi[0]; + if (mbmi->skip == 1 && (mbmi->sb_type == cm->seq_params.sb_size)) { + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id]; + mbmi->delta_lf_from_base = xd->delta_lf_from_base; + } else { + if (cm->delta_q_info.delta_lf_multi) { + for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) + xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id]; + } else { + xd->delta_lf_from_base = mbmi->delta_lf_from_base; + } + } + } + } + } + } +} + +void av1_row_mt_sync_read_dummy(struct AV1RowMTSyncData *const row_mt_sync, + int r, int c) { + (void)row_mt_sync; + (void)r; + (void)c; + return; +} + +void av1_row_mt_sync_write_dummy(struct AV1RowMTSyncData *const row_mt_sync, + int r, int c, const int cols) { + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; + return; +} + +void av1_row_mt_sync_read(AV1RowMTSync *const row_mt_sync, int r, int c) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + + if (r) { + pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > row_mt_sync->cur_col[r - 1] - nsync) { + pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +void av1_row_mt_sync_write(AV1RowMTSync *const row_mt_sync, int r, int c, + const int cols) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + int cur; + // Only signal when there are enough encoded blocks for next row to run. + int sig = 1; + + if (c < cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&row_mt_sync->mutex_[r]); + + row_mt_sync->cur_col[r] = cur; + + pthread_cond_signal(&row_mt_sync->cond_[r]); + pthread_mutex_unlock(&row_mt_sync->mutex_[r]); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; +#endif // CONFIG_MULTITHREAD +} + +// Allocate memory for row synchronization +void av1_row_mt_sync_mem_alloc(AV1RowMTSync *row_mt_sync, AV1_COMMON *cm, + int rows) { + row_mt_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(cm, row_mt_sync->mutex_, + aom_malloc(sizeof(*row_mt_sync->mutex_) * rows)); + if (row_mt_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&row_mt_sync->mutex_[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, row_mt_sync->cond_, + aom_malloc(sizeof(*row_mt_sync->cond_) * rows)); + if (row_mt_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&row_mt_sync->cond_[i], NULL); + } + } + } +#endif // CONFIG_MULTITHREAD + + CHECK_MEM_ERROR(cm, row_mt_sync->cur_col, + aom_malloc(sizeof(*row_mt_sync->cur_col) * rows)); + + // Set up nsync. 
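+  // With sync_range == 1, av1_row_mt_sync_read() lets a thread start
+  // superblock column c of row r only once row r - 1 has signalled column
+  // c + 1, i.e. each row trails the one above it by at least one
+  // superblock (the usual top-right dependency).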
+ row_mt_sync->sync_range = 1; +} + +// Deallocate row based multi-threading synchronization related mutex and data +void av1_row_mt_sync_mem_dealloc(AV1RowMTSync *row_mt_sync) { + if (row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + + if (row_mt_sync->mutex_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_mutex_destroy(&row_mt_sync->mutex_[i]); + } + aom_free(row_mt_sync->mutex_); + } + if (row_mt_sync->cond_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_cond_destroy(&row_mt_sync->cond_[i]); + } + aom_free(row_mt_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + aom_free(row_mt_sync->cur_col); + // clear the structure as the source of this call may be dynamic change + // in tiles in which case this call will be followed by an _alloc() + // which may fail. + av1_zero(*row_mt_sync); + } +} + +static AOM_INLINE void assign_tile_to_thread( + MultiThreadHandle *multi_thread_ctxt, int num_tiles, int num_workers) { + int tile_id = 0; + int i; + + for (i = 0; i < num_workers; i++) { + multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++; + if (tile_id == num_tiles) tile_id = 0; + } +} + +static int get_next_job(AV1_COMP *const cpi, int *current_mi_row, + int cur_tile_id) { + AV1_COMMON *const cm = &cpi->common; + TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id]; + AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info; + + if (row_mt_info->current_mi_row < this_tile->tile_info.mi_row_end) { + *current_mi_row = row_mt_info->current_mi_row; + row_mt_info->num_threads_working++; + row_mt_info->current_mi_row += cm->seq_params.mib_size; + return 1; + } + return 0; +} + +static AOM_INLINE void switch_tile_and_get_next_job(AV1_COMP *const cpi, + int *cur_tile_id, + int *current_mi_row, + int *end_of_frame) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + int tile_id = -1; // Stores the tile ID with minimum proc done + int max_mis_to_encode = 0; + int min_num_threads_working = INT_MAX; + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *this_tile = &cpi->tile_data[tile_index]; + AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info; + int num_sb_rows_in_tile = + av1_get_sb_rows_in_tile(cm, this_tile->tile_info); + int num_sb_cols_in_tile = + av1_get_sb_cols_in_tile(cm, this_tile->tile_info); + int theoretical_limit_on_threads = + AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile); + int num_threads_working = row_mt_info->num_threads_working; + if (num_threads_working < theoretical_limit_on_threads) { + int num_mis_to_encode = + this_tile->tile_info.mi_row_end - row_mt_info->current_mi_row; + + // Tile to be processed by this thread is selected on the basis of + // availability of jobs: + // 1) If jobs are available, tile to be processed is chosen on the + // basis of minimum number of threads working for that tile. If two or + // more tiles have same number of threads working for them, then the + // tile with maximum number of jobs available will be chosen. + // 2) If no jobs are available, then end_of_frame is reached. 
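+        // For example (illustrative numbers, not from the source): if
+        // tile 0 has two threads working with eight SB rows left and
+        // tile 1 has one thread working with three rows left, tile 1 is
+        // chosen; the smaller thread count decides, and the remaining job
+        // count only breaks ties.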
+ if (num_mis_to_encode > 0) { + if (num_threads_working < min_num_threads_working) { + min_num_threads_working = num_threads_working; + max_mis_to_encode = 0; + } + if (num_threads_working == min_num_threads_working && + num_mis_to_encode > max_mis_to_encode) { + tile_id = tile_index; + max_mis_to_encode = num_mis_to_encode; + } + } + } + } + } + if (tile_id == -1) { + *end_of_frame = 1; + } else { + // Update the cur ID to the next tile ID that will be processed, + // which will be the least processed tile + *cur_tile_id = tile_id; + get_next_job(cpi, current_mi_row, *cur_tile_id); + } +} + +static int enc_row_mt_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + AV1_COMP *const cpi = thread_data->cpi; + AV1_COMMON *const cm = &cpi->common; + + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + (void)unused; + + assert(cur_tile_id != -1); + + int end_of_frame = 0; + while (1) { + int current_mi_row = -1; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(cpi->row_mt_mutex_); +#endif + if (!get_next_job(cpi, ¤t_mi_row, cur_tile_id)) { + // No jobs are available for the current tile. Query for the status of + // other tiles and get the next job if available + switch_tile_and_get_next_job(cpi, &cur_tile_id, ¤t_mi_row, + &end_of_frame); + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(cpi->row_mt_mutex_); +#endif + if (end_of_frame == 1) break; + + TileDataEnc *const this_tile = &cpi->tile_data[cur_tile_id]; + int tile_row = this_tile->tile_info.tile_row; + int tile_col = this_tile->tile_info.tile_col; + + assert(current_mi_row != -1 && + current_mi_row <= this_tile->tile_info.mi_row_end); + + ThreadData *td = thread_data->td; + + td->mb.e_mbd.tile_ctx = td->tctx; + td->mb.tile_pb_ctx = &this_tile->tctx; + if (this_tile->allow_update_cdf) { + td->mb.row_ctx = this_tile->row_ctx; + if (current_mi_row == this_tile->tile_info.mi_row_start) + memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT)); + } else { + memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT)); + } + + av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row, + &td->mb.e_mbd); + + cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params); + av1_crc32c_calculator_init(&td->mb.mb_rd_record.crc_calculator); + + av1_encode_sb_row(cpi, td, tile_row, tile_col, current_mi_row); +#if CONFIG_MULTITHREAD + pthread_mutex_lock(cpi->row_mt_mutex_); +#endif + this_tile->row_mt_info.num_threads_working--; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(cpi->row_mt_mutex_); +#endif + } + + return 1; +} + +static int enc_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + AV1_COMP *const cpi = thread_data->cpi; + const AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int t; + + (void)unused; + + for (t = thread_data->start; t < tile_rows * tile_cols; + t += cpi->num_workers) { + int tile_row = t / tile_cols; + int tile_col = t % tile_cols; + + TileDataEnc *const this_tile = + &cpi->tile_data[tile_row * cm->tiles.cols + tile_col]; + thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx; + thread_data->td->mb.tile_pb_ctx = &this_tile->tctx; + av1_encode_tile(cpi, thread_data->td, tile_row, tile_col); + } + + return 1; +} + +static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) { + AV1_COMMON *const 
cm = &cpi->common; + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int sb_mi_size = av1_get_sb_mi_size(cm); + + CHECK_MEM_ERROR(cm, cpi->workers, + aom_malloc(num_workers * sizeof(*cpi->workers))); + + CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + aom_calloc(num_workers, sizeof(*cpi->tile_thr_data))); + +#if CONFIG_MULTITHREAD + if (cpi->oxcf.row_mt == 1) { + if (cpi->row_mt_mutex_ == NULL) { + CHECK_MEM_ERROR(cm, cpi->row_mt_mutex_, + aom_malloc(sizeof(*(cpi->row_mt_mutex_)))); + if (cpi->row_mt_mutex_) pthread_mutex_init(cpi->row_mt_mutex_, NULL); + } + } +#endif + + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; + + ++cpi->num_workers; + winterface->init(worker); + worker->thread_name = "aom enc worker"; + + thread_data->cpi = cpi; + thread_data->thread_id = i; + + if (i > 0) { + // Allocate thread data. + CHECK_MEM_ERROR(cm, thread_data->td, + aom_memalign(32, sizeof(*thread_data->td))); + av1_zero(*thread_data->td); + + // Set up pc_tree. + thread_data->td->pc_tree = NULL; + av1_setup_pc_tree(cpi, thread_data->td); + + CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf, + (uint8_t *)aom_memalign( + 16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->above_pred_buf))); + CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf, + (uint8_t *)aom_memalign( + 16, MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->left_pred_buf))); + + CHECK_MEM_ERROR( + cm, thread_data->td->wsrc_buf, + (int32_t *)aom_memalign( + 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf))); + + CHECK_MEM_ERROR(cm, thread_data->td->inter_modes_info, + (InterModesInfo *)aom_malloc( + sizeof(*thread_data->td->inter_modes_info))); + + for (int x = 0; x < 2; x++) + for (int y = 0; y < 2; y++) + CHECK_MEM_ERROR( + cm, thread_data->td->hash_value_buffer[x][y], + (uint32_t *)aom_malloc( + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*thread_data->td->hash_value_buffer[0][0]))); + + CHECK_MEM_ERROR( + cm, thread_data->td->mask_buf, + (int32_t *)aom_memalign( + 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf))); + // Allocate frame counters in thread data. + CHECK_MEM_ERROR(cm, thread_data->td->counts, + aom_calloc(1, sizeof(*thread_data->td->counts))); + + // Allocate buffers used by palette coding mode. + CHECK_MEM_ERROR( + cm, thread_data->td->palette_buffer, + aom_memalign(16, sizeof(*thread_data->td->palette_buffer))); + + av1_alloc_compound_type_rd_buffers(cm, &thread_data->td->comp_rd_buffer); + + CHECK_MEM_ERROR( + cm, thread_data->td->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*thread_data->td->tmp_conv_dst))); + for (int j = 0; j < 2; ++j) { + CHECK_MEM_ERROR( + cm, thread_data->td->tmp_obmc_bufs[j], + aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*thread_data->td->tmp_obmc_bufs[j]))); + } + + CHECK_MEM_ERROR( + cm, thread_data->td->mbmi_ext, + aom_calloc(sb_mi_size, sizeof(*thread_data->td->mbmi_ext))); + + if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) { + const int num_64x64_blocks = + (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; + CHECK_MEM_ERROR( + cm, thread_data->td->vt64x64, + aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks)); + } + + // Create threads + if (!winterface->reset(worker)) + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Tile encoder thread creation failed"); + } else { + // Main thread acts as a worker and uses the thread data in cpi. 
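+      // Worker 0 therefore needs no separate allocation;
+      // launch_enc_workers() below runs it synchronously via
+      // winterface->execute() rather than spawning a thread with
+      // winterface->launch().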
+ thread_data->td = &cpi->td; + } + if (cpi->oxcf.row_mt == 1) + CHECK_MEM_ERROR( + cm, thread_data->td->tctx, + (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx))); + winterface->sync(worker); + } +} + +static AOM_INLINE void launch_enc_workers(AV1_COMP *cpi, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + // Encode a frame + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Set the starting tile for each thread. + thread_data->start = i; + + if (i == 0) + winterface->execute(worker); + else + winterface->launch(worker); + } +} + +static AOM_INLINE void sync_enc_workers(AV1_COMP *cpi, int num_workers) { + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + int had_error = 0; + + // Encoding ends. + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->workers[i]; + had_error |= !winterface->sync(worker); + } + + if (had_error) + aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + "Failed to encode tile data"); +} + +static AOM_INLINE void accumulate_counters_enc_workers(AV1_COMP *cpi, + int num_workers) { + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + cpi->intrabc_used |= thread_data->td->intrabc_used; + cpi->deltaq_used |= thread_data->td->deltaq_used; + + // Accumulate counters. + if (i > 0) { + av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts); + accumulate_rd_opt(&cpi->td, thread_data->td); + cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count; +#if CONFIG_SPEED_STATS + cpi->td.mb.tx_search_count += thread_data->td->mb.tx_search_count; +#endif // CONFIG_SPEED_STATS + } + } +} + +static AOM_INLINE void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, + int num_workers) { + for (int i = num_workers - 1; i >= 0; i--) { + AVxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; + + worker->hook = hook; + worker->data1 = thread_data; + worker->data2 = NULL; + + thread_data->td->intrabc_used = 0; + thread_data->td->deltaq_used = 0; + + // Before encoding a frame, copy the thread data from cpi. 
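+    // The struct assignment below copies cpi->td.mb wholesale, which would
+    // leave its scratch-buffer pointers aimed at the main thread's
+    // allocations; the stores that follow re-point them at this worker's
+    // own buffers.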
+ if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf; + thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf; + thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf; + + thread_data->td->mb.inter_modes_info = thread_data->td->inter_modes_info; + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + memcpy(thread_data->td->hash_value_buffer[x][y], + cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y], + AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*thread_data->td->hash_value_buffer[0][0])); + thread_data->td->mb.intrabc_hash_info.hash_value_buffer[x][y] = + thread_data->td->hash_value_buffer[x][y]; + } + } + thread_data->td->mb.mask_buf = thread_data->td->mask_buf; + thread_data->td->mb.mbmi_ext = thread_data->td->mbmi_ext; + } + if (thread_data->td->counts != &cpi->counts) { + memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts)); + } + + if (i > 0) { + thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer; + thread_data->td->mb.comp_rd_buffer = thread_data->td->comp_rd_buffer; + thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->mb.tmp_obmc_bufs[j] = + thread_data->td->tmp_obmc_bufs[j]; + } + + thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst; + for (int j = 0; j < 2; ++j) { + thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] = + thread_data->td->mb.tmp_obmc_bufs[j]; + } + } + } +} + +void av1_encode_tiles_mt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols * tile_rows); + + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) + av1_alloc_tile_data(cpi); + + av1_init_tile_data(cpi); + // Only run once to create threads and allocate thread data. + if (cpi->num_workers == 0) { + create_enc_workers(cpi, num_workers); + } else { + num_workers = AOMMIN(num_workers, cpi->num_workers); + } + prepare_enc_workers(cpi, enc_worker_hook, num_workers); + launch_enc_workers(cpi, num_workers); + sync_enc_workers(cpi, num_workers); + accumulate_counters_enc_workers(cpi, num_workers); +} + +// Accumulate frame counts. FRAME_COUNTS consist solely of 'unsigned int' +// members, so we treat it as an array, and sum over the whole length. 
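+// This flattening is sound only while FRAME_COUNTS really holds nothing but
+// unsigned int members (and thus no internal padding). A static assertion
+// that sizeof(FRAME_COUNTS) is a multiple of sizeof(unsigned int) would be
+// a cheap compile-time guard (a sketch of an extra check, not present in
+// the source).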
+void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts, + const FRAME_COUNTS *counts) { + unsigned int *const acc = (unsigned int *)acc_counts; + const unsigned int *const cnt = (const unsigned int *)counts; + + const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int); + + for (unsigned int i = 0; i < n_counts; i++) acc[i] += cnt[i]; +} + +void av1_encode_tiles_row_mt(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int num_workers = 0; + int total_num_threads_row_mt = 0; + int max_sb_rows = 0; + + if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { + av1_row_mt_mem_dealloc(cpi); + av1_alloc_tile_data(cpi); + } + + av1_init_tile_data(cpi); + + for (int row = 0; row < tile_rows; row++) { + for (int col = 0; col < tile_cols; col++) { + TileDataEnc *tile_data = &cpi->tile_data[row * cm->tiles.cols + col]; + int num_sb_rows_in_tile = + av1_get_sb_rows_in_tile(cm, tile_data->tile_info); + int num_sb_cols_in_tile = + av1_get_sb_cols_in_tile(cm, tile_data->tile_info); + total_num_threads_row_mt += + AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile); + max_sb_rows = AOMMAX(max_sb_rows, num_sb_rows_in_tile); + } + } + // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of + // post-processing stages in encoder is quiet low, so limiting the number of + // threads to the theoretical limit in row-mt does not have much impact on + // post-processing multi-threading stage. Need to revisit this when + // post-processing time starts shooting up. + num_workers = AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt); + + if (multi_thread_ctxt->allocated_tile_cols != tile_cols || + multi_thread_ctxt->allocated_tile_rows != tile_rows || + multi_thread_ctxt->allocated_sb_rows != max_sb_rows) { + av1_row_mt_mem_dealloc(cpi); + av1_row_mt_mem_alloc(cpi, max_sb_rows); + } + + memset(multi_thread_ctxt->thread_id_to_tile_id, -1, + sizeof(*multi_thread_ctxt->thread_id_to_tile_id) * MAX_NUM_THREADS); + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + int tile_id = tile_row * tile_cols + tile_col; + TileDataEnc *this_tile = &cpi->tile_data[tile_id]; + + // Initialize cur_col to -1 for all rows. + memset(this_tile->row_mt_sync.cur_col, -1, + sizeof(*this_tile->row_mt_sync.cur_col) * max_sb_rows); + this_tile->row_mt_info.current_mi_row = this_tile->tile_info.mi_row_start; + this_tile->row_mt_info.num_threads_working = 0; + + av1_inter_mode_data_init(this_tile); + av1_zero_above_context(cm, &cpi->td.mb.e_mbd, + this_tile->tile_info.mi_col_start, + this_tile->tile_info.mi_col_end, tile_row); + } + } + + // Only run once to create threads and allocate thread data. 
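+  // On later frames the pool created here is reused, so num_workers is
+  // clamped below to the number of workers actually created on the first
+  // call.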
+ if (cpi->num_workers == 0) { + create_enc_workers(cpi, num_workers); + } else { + num_workers = AOMMIN(num_workers, cpi->num_workers); + } + assign_tile_to_thread(multi_thread_ctxt, tile_cols * tile_rows, num_workers); + prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers); + launch_enc_workers(cpi, num_workers); + sync_enc_workers(cpi, num_workers); + if (cm->delta_q_info.delta_lf_present_flag) update_delta_lf_for_row_mt(cpi); + accumulate_counters_enc_workers(cpi, num_workers); +} diff --git a/libs/libaom/src/av1/encoder/ethread.h b/libs/libaom/src/av1/encoder/ethread.h new file mode 100644 index 000000000..183075950 --- /dev/null +++ b/libs/libaom/src/av1/encoder/ethread.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ETHREAD_H_ +#define AOM_AV1_ENCODER_ETHREAD_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct ThreadData; +struct AV1RowMTSyncData; + +typedef struct EncWorkerData { + struct AV1_COMP *cpi; + struct ThreadData *td; + int start; + int thread_id; +} EncWorkerData; + +void av1_row_mt_sync_read(AV1RowMTSync *const row_mt_sync, int r, int c); +void av1_row_mt_sync_write(AV1RowMTSync *const row_mt_sync, int r, int c, + const int cols); + +void av1_row_mt_sync_read_dummy(struct AV1RowMTSyncData *const row_mt_sync, + int r, int c); +void av1_row_mt_sync_write_dummy(struct AV1RowMTSyncData *const row_mt_sync, + int r, int c, const int cols); + +void av1_row_mt_sync_mem_dealloc(AV1RowMTSync *row_mt_sync); +// Allocate memory for row based multi-threading synchronization. +void av1_row_mt_sync_mem_alloc(AV1RowMTSync *row_mt_sync, struct AV1Common *cm, + int rows); + +void av1_encode_tiles_mt(struct AV1_COMP *cpi); +void av1_encode_tiles_row_mt(struct AV1_COMP *cpi); + +void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts, + const struct FRAME_COUNTS *counts); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ETHREAD_H_ diff --git a/libs/libaom/src/av1/encoder/extend.c b/libs/libaom/src/av1/encoder/extend.c new file mode 100644 index 000000000..934cf5644 --- /dev/null +++ b/libs/libaom/src/av1/encoder/extend.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#include "av1/common/common.h" +#include "av1/encoder/extend.h" + +static void copy_and_extend_plane(const uint8_t *src, int src_pitch, + uint8_t *dst, int dst_pitch, int w, int h, + int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, linesize; + + // copy the left and right most columns out + const uint8_t *src_ptr1 = src; + const uint8_t *src_ptr2 = src + w - 1; + uint8_t *dst_ptr1 = dst - extend_left; + uint8_t *dst_ptr2 = dst + w; + + for (i = 0; i < h; i++) { + memset(dst_ptr1, src_ptr1[0], extend_left); + memcpy(dst_ptr1 + extend_left, src_ptr1, w); + memset(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; + } + + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst + dst_pitch * h - extend_left; + linesize = extend_left + extend_right + w; + + for (i = 0; i < extend_top; i++) { + memcpy(dst_ptr1, src_ptr1, linesize); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + memcpy(dst_ptr2, src_ptr2, linesize); + dst_ptr2 += dst_pitch; + } +} + +static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch, + uint8_t *dst8, int dst_pitch, int w, + int h, int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, linesize; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + // copy the left and right most columns out + const uint16_t *src_ptr1 = src; + const uint16_t *src_ptr2 = src + w - 1; + uint16_t *dst_ptr1 = dst - extend_left; + uint16_t *dst_ptr2 = dst + w; + + for (i = 0; i < h; i++) { + aom_memset16(dst_ptr1, src_ptr1[0], extend_left); + memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0])); + aom_memset16(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; + } + + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst + dst_pitch * h - extend_left; + linesize = extend_left + extend_right + w; + + for (i = 0; i < extend_top; i++) { + memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0])); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0])); + dst_ptr2 += dst_pitch; + } +} + +void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + // Extend src frame in buffer + const int et_y = dst->border; + const int el_y = dst->border; + const int er_y = + AOMMAX(src->y_width + dst->border, ALIGN_POWER_OF_TWO(src->y_width, 6)) - + src->y_crop_width; + const int eb_y = AOMMAX(src->y_height + dst->border, + ALIGN_POWER_OF_TWO(src->y_height, 6)) - + src->y_crop_height; + const int uv_width_subsampling = (src->uv_width != src->y_width); + const int uv_height_subsampling = (src->uv_height != src->y_height); + const int et_uv = et_y >> uv_height_subsampling; + const int el_uv = el_y >> uv_width_subsampling; + const int eb_uv = eb_y >> uv_height_subsampling; + const int er_uv = er_y >>
uv_width_subsampling; + + if (src->flags & YV12_FLAG_HIGHBITDEPTH) { + highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_crop_width, + src->y_crop_height, et_y, el_y, eb_y, er_y); + if (src->u_buffer) { + highbd_copy_and_extend_plane( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + } + if (src->v_buffer) { + highbd_copy_and_extend_plane( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + } + return; + } + + copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, src->y_crop_width, src->y_crop_height, + et_y, el_y, eb_y, er_y); + if (src->u_buffer) { + copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src->uv_crop_width, + src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + } + if (src->v_buffer) { + copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, src->uv_crop_width, + src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv); + } +} diff --git a/libs/libaom/src/av1/encoder/extend.h b/libs/libaom/src/av1/encoder/extend.h new file mode 100644 index 000000000..b8cc5b9d2 --- /dev/null +++ b/libs/libaom/src/av1/encoder/extend.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_EXTEND_H_ +#define AOM_AV1_ENCODER_EXTEND_H_ + +#include "aom_scale/yv12config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_EXTEND_H_ diff --git a/libs/libaom/src/av1/encoder/firstpass.c b/libs/libaom/src/av1/encoder/firstpass.c new file mode 100644 index 000000000..0955510ca --- /dev/null +++ b/libs/libaom/src/av1/encoder/firstpass.c @@ -0,0 +1,1065 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <limits.h> +#include <math.h> +#include <stdio.h> + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_scale_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/variance.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" +#include "aom_scale/aom_scale.h" +#include "aom_scale/yv12config.h" + +#include "av1/common/entropymv.h" +#include "av1/common/quant_common.h" +#include "av1/common/reconinter.h" // av1_setup_dst_planes() +#include "av1/common/txb_common.h" +#include "av1/encoder/aq_variance.h" +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/block.h" +#include "av1/encoder/dwt.h" +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/rd.h" +#include "av1/encoder/reconinter_enc.h" + +#define OUTPUT_FPF 0 + +#define FIRST_PASS_Q 10.0 +#define INTRA_MODE_PENALTY 1024 +#define NEW_MV_MODE_PENALTY 32 +#define DARK_THRESH 64 + +#define NCOUNT_INTRA_THRESH 8192 +#define NCOUNT_INTRA_FACTOR 3 + +static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats, + struct aom_codec_pkt_list *pktlist) { + struct aom_codec_cx_pkt pkt; + pkt.kind = AOM_CODEC_STATS_PKT; + pkt.data.twopass_stats.buf = stats; + pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS); + if (pktlist != NULL) aom_codec_pkt_list_add(pktlist, &pkt); + +// TEMP debug code +#if OUTPUT_FPF + { + FILE *fpfile; + fpfile = fopen("firstpass.stt", "a"); + + fprintf(fpfile, + "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf" + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" + "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf\n", + stats->frame, stats->weight, stats->intra_error, stats->coded_error, + stats->sr_coded_error, stats->pcnt_inter, stats->pcnt_motion, + stats->pcnt_second_ref, stats->pcnt_neutral, stats->intra_skip_pct, + stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr, + stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv, + stats->MVcv, stats->mv_in_out_count, stats->new_mv_count, + stats->count, stats->duration); + fclose(fpfile); + } +#endif +} + +void av1_twopass_zero_stats(FIRSTPASS_STATS *section) { + section->frame = 0.0; + section->weight = 0.0; + section->intra_error = 0.0; + section->frame_avg_wavelet_energy = 0.0; + section->coded_error = 0.0; + section->sr_coded_error = 0.0; + section->pcnt_inter = 0.0; + section->pcnt_motion = 0.0; + section->pcnt_second_ref = 0.0; + section->pcnt_neutral = 0.0; + section->intra_skip_pct = 0.0; + section->inactive_zone_rows = 0.0; + section->inactive_zone_cols = 0.0; + section->MVr = 0.0; + section->mvr_abs = 0.0; + section->MVc = 0.0; + section->mvc_abs = 0.0; + section->MVrv = 0.0; + section->MVcv = 0.0; + section->mv_in_out_count = 0.0; + section->new_mv_count = 0.0; + section->count = 0.0; + section->duration = 1.0; +} + +static AOM_INLINE void accumulate_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame) { + section->frame += frame->frame; + section->weight += frame->weight; + section->intra_error += frame->intra_error; + section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy; + section->coded_error += frame->coded_error; + section->sr_coded_error += frame->sr_coded_error; + section->pcnt_inter += frame->pcnt_inter; + section->pcnt_motion += frame->pcnt_motion; + section->pcnt_second_ref +=
frame->pcnt_second_ref; + section->pcnt_neutral += frame->pcnt_neutral; + section->intra_skip_pct += frame->intra_skip_pct; + section->inactive_zone_rows += frame->inactive_zone_rows; + section->inactive_zone_cols += frame->inactive_zone_cols; + section->MVr += frame->MVr; + section->mvr_abs += frame->mvr_abs; + section->MVc += frame->MVc; + section->mvc_abs += frame->mvc_abs; + section->MVrv += frame->MVrv; + section->MVcv += frame->MVcv; + section->mv_in_out_count += frame->mv_in_out_count; + section->new_mv_count += frame->new_mv_count; + section->count += frame->count; + section->duration += frame->duration; +} + +void av1_end_first_pass(AV1_COMP *cpi) { + if (cpi->twopass.stats_buf_ctx->total_stats) + output_stats(cpi->twopass.stats_buf_ctx->total_stats, cpi->output_pkt_list); +} + +static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_8X8: return aom_mse8x8; + case BLOCK_16X8: return aom_mse16x8; + case BLOCK_8X16: return aom_mse8x16; + default: return aom_mse16x16; + } +} + +static unsigned int get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref) { + unsigned int sse; + const aom_variance_fn_t fn = get_block_variance_fn(bsize); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, + int bd) { + switch (bd) { + default: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_8_mse8x8; + case BLOCK_16X8: return aom_highbd_8_mse16x8; + case BLOCK_8X16: return aom_highbd_8_mse8x16; + default: return aom_highbd_8_mse16x16; + } + break; + case 10: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_10_mse8x8; + case BLOCK_16X8: return aom_highbd_10_mse16x8; + case BLOCK_8X16: return aom_highbd_10_mse8x16; + default: return aom_highbd_10_mse16x16; + } + break; + case 12: + switch (bsize) { + case BLOCK_8X8: return aom_highbd_12_mse8x8; + case BLOCK_16X8: return aom_highbd_12_mse16x8; + case BLOCK_8X16: return aom_highbd_12_mse8x16; + default: return aom_highbd_12_mse16x16; + } + break; + } +} + +static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize, + const struct buf_2d *src, + const struct buf_2d *ref, + int bd) { + unsigned int sse; + const aom_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd); + fn(src->buf, src->stride, ref->buf, ref->stride, &sse); + return sse; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +// Refine the motion search range according to the frame dimension +// for first pass test. 
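+// (Illustrative: the loop below doubles dim until it reaches +// MAX_FULL_PEL_VAL, so sr grows by one each time the frame's smaller +// dimension halves; smaller frames therefore end up with a larger +// step_param = 3 + sr in first_pass_motion_search, coarsening the initial +// search step.)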
+static int get_search_range(const AV1_COMP *cpi) { + int sr = 0; + const int dim = AOMMIN(cpi->initial_width, cpi->initial_height); + + while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr; + return sr; +} + +static AOM_INLINE void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, + const MV *ref_mv, + FULLPEL_MV *best_mv, + int *best_motion_err) { + MACROBLOCKD *const xd = &x->e_mbd; + FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv); + int tmp_err; + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; + const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; + const int sr = get_search_range(cpi); + const int step_param = 3 + sr; + + const search_site_config *first_pass_search_sites = + &cpi->mv_search_params.ss_cfg[SS_CFG_FPF]; + FULLPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv, + first_pass_search_sites); + ms_params.search_method = NSTEP; + + FULLPEL_MV this_best_mv; + tmp_err = av1_full_pixel_search(start_mv, &ms_params, step_param, NULL, + &this_best_mv, NULL); + + if (tmp_err < INT_MAX) { + tmp_err = av1_get_mvpred_sse(x, &this_best_mv, ref_mv, &v_fn_ptr) + + new_mv_mode_penalty; + } + + if (tmp_err < *best_motion_err) { + *best_motion_err = tmp_err; + *best_mv = this_best_mv; + } +} + +static BLOCK_SIZE get_bsize(const CommonModeInfoParams *const mi_params, + int mb_row, int mb_col) { + if (mi_size_wide[BLOCK_16X16] * mb_col + mi_size_wide[BLOCK_8X8] < + mi_params->mi_cols) { + return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] < + mi_params->mi_rows + ? BLOCK_16X16 + : BLOCK_16X8; + } else { + return mi_size_wide[BLOCK_16X16] * mb_row + mi_size_wide[BLOCK_8X8] < + mi_params->mi_rows + ? BLOCK_8X16 + : BLOCK_8X8; + } +} + +static int find_fp_qindex(aom_bit_depth_t bit_depth) { + return av1_find_qindex(FIRST_PASS_Q, bit_depth, 0, QINDEX_RANGE - 1); +} + +static double raw_motion_error_stdev(int *raw_motion_err_list, + int raw_motion_err_counts) { + int64_t sum_raw_err = 0; + double raw_err_avg = 0; + double raw_err_stdev = 0; + if (raw_motion_err_counts == 0) return 0; + + int i; + for (i = 0; i < raw_motion_err_counts; i++) { + sum_raw_err += raw_motion_err_list[i]; + } + raw_err_avg = (double)sum_raw_err / raw_motion_err_counts; + for (i = 0; i < raw_motion_err_counts; i++) { + raw_err_stdev += (raw_motion_err_list[i] - raw_err_avg) * + (raw_motion_err_list[i] - raw_err_avg); + } + // Calculate the standard deviation for the motion error of all the inter + // blocks of the 0,0 motion using the last source + // frame as the reference. + raw_err_stdev = sqrt(raw_err_stdev / raw_motion_err_counts); + return raw_err_stdev; +} + +// This structure contains several key parameters to be accumulated for this +// frame. +typedef struct { + // Intra prediction error. + int64_t intra_error; + // Average wavelet energy computed using Discrete Wavelet Transform (DWT). + int64_t frame_avg_wavelet_energy; + // Best of intra pred error and inter pred error using last frame as ref. + int64_t coded_error; + // Best of intra pred error and inter pred error using golden frame as ref. + int64_t sr_coded_error; + // Best of intra pred error and inter pred error using altref frame as ref. + int64_t tr_coded_error; + // Count of motion vectors. + int mv_count; + // Count of blocks that pick inter prediction (inter pred error is smaller + // than intra pred error). + int inter_count; + // Count of blocks that pick second ref (golden frame).
+ int second_ref_count; + // Count of blocks that pick third ref (altref frame). + int third_ref_count; + // Count of blocks where the inter and intra are very close and very low. + double neutral_count; + // Count of blocks where intra error is very small. + int intra_skip_count; + // Start row. + int image_data_start_row; + // Count of unique non-zero motion vectors. + int new_mv_count; + // Sum of inward motion vectors. + int sum_in_vectors; + // Sum of motion vector row. + int sum_mvr; + // Sum of motion vector column. + int sum_mvc; + // Sum of absolute value of motion vector row. + int sum_mvr_abs; + // Sum of absolute value of motion vector column. + int sum_mvc_abs; + // Sum of the square of motion vector row. + int64_t sum_mvrs; + // Sum of the square of motion vector column. + int64_t sum_mvcs; + // A factor calculated using intra pred error. + double intra_factor; + // A factor that measures brightness. + double brightness_factor; +} FRAME_STATS; + +#define UL_INTRA_THRESH 50 +#define INVALID_ROW -1 +// Computes and returns the intra pred error of a block. +// intra pred error: sum of squared error of the intra predicted residual. +// Inputs: +// cpi: the encoder setting. Only a few params in it will be used. +// this_frame: the current frame buffer. +// tile: tile information (not used in first pass, already init to zero) +// mb_row: row index in the unit of first pass block size. +// mb_col: column index in the unit of first pass block size. +// y_offset: the offset of y frame buffer, indicating the starting point of +// the current block. +// uv_offset: the offset of u and v frame buffer, indicating the starting +// point of the current block. +// fp_block_size: first pass block size. +// qindex: quantization step size to encode the frame. +// stats: frame encoding stats. +// Modifies: +// stats->intra_skip_count +// stats->image_data_start_row +// stats->intra_factor +// stats->brightness_factor +// stats->intra_error +// stats->frame_avg_wavelet_energy +// Returns: +// this_intra_error. 
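+// (Note on the function below: use_dc_pred is true exactly when one of mb_row +// and mb_col is zero and the other is not, i.e. for blocks on the top row or +// leftmost column other than the top-left corner; only those blocks get the +// larger TX_16X16/TX_8X8 transform size, all others use TX_4X4.)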
+static int firstpass_intra_prediction( + AV1_COMP *cpi, YV12_BUFFER_CONFIG *const this_frame, + const TileInfo *const tile, const int mb_row, const int mb_col, + const int y_offset, const int uv_offset, const BLOCK_SIZE fp_block_size, + const int qindex, FRAME_STATS *const stats) { + const AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const SequenceHeader *const seq_params = &cm->seq_params; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mb_scale = mi_size_wide[fp_block_size]; + const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + const int num_planes = av1_num_planes(cm); + const BLOCK_SIZE bsize = get_bsize(mi_params, mb_row, mb_col); + + aom_clear_system_state(); + set_mi_offsets(mi_params, xd, mb_row * mb_scale, mb_col * mb_scale); + xd->plane[0].dst.buf = this_frame->y_buffer + y_offset; + xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset; + xd->plane[2].dst.buf = this_frame->v_buffer + uv_offset; + xd->left_available = (mb_col != 0); + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + set_mi_row_col(xd, tile, mb_row * mb_scale, mi_size_high[bsize], + mb_col * mb_scale, mi_size_wide[bsize], mi_params->mi_rows, + mi_params->mi_cols); + set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes); + xd->mi[0]->segment_id = 0; + xd->lossless[xd->mi[0]->segment_id] = (qindex == 0); + xd->mi[0]->mode = DC_PRED; + xd->mi[0]->tx_size = + use_dc_pred ? (bsize >= fp_block_size ? TX_16X16 : TX_8X8) : TX_4X4; + + av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0); + int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff); + + if (this_intra_error < UL_INTRA_THRESH) { + ++stats->intra_skip_count; + } else if ((mb_col > 0) && (stats->image_data_start_row == INVALID_ROW)) { + stats->image_data_start_row = mb_row; + } + + if (seq_params->use_highbitdepth) { + switch (seq_params->bit_depth) { + case AOM_BITS_8: break; + case AOM_BITS_10: this_intra_error >>= 4; break; + case AOM_BITS_12: this_intra_error >>= 8; break; + default: + assert(0 && + "seq_params->bit_depth should be AOM_BITS_8, " + "AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + } + + aom_clear_system_state(); + double log_intra = log(this_intra_error + 1.0); + if (log_intra < 10.0) { + stats->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05); + } else { + stats->intra_factor += 1.0; + } + + int level_sample; + if (seq_params->use_highbitdepth) { + level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; + } else { + level_sample = x->plane[0].src.buf[0]; + } + if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) { + stats->brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample)); + } else { + stats->brightness_factor += 1.0; + } + + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). + // We do not have special cases in first pass for 0,0 and nearest etc so + // all inter modes carry an overhead cost estimate for the mv. + // When the error score is very low this causes us to pick all or lots of + // INTRA modes and throw lots of key frames. + // This penalty adds a cost matching that of a 0,0 mv to the intra case. + this_intra_error += INTRA_MODE_PENALTY; + + // Accumulate the intra error. 
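+ // (The (int64_t) accumulation below matters: a single 16x16 block can + // contribute an SSE near 255^2 * 256, about 16.6 million, so a 32-bit + // sum could overflow after only a few hundred worst-case blocks.)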
+ stats->intra_error += (int64_t)this_intra_error; + + const int hbd = is_cur_buf_hbd(xd); + const int stride = x->plane[0].src.stride; + uint8_t *buf = x->plane[0].src.buf; + for (int r8 = 0; r8 < 2; ++r8) { + for (int c8 = 0; c8 < 2; ++c8) { + stats->frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input( + buf + c8 * 8 + r8 * 8 * stride, stride, hbd); + } + } + + return this_intra_error; +} + +// Returns the sum of squared error between source and reference blocks. +static int get_prediction_error_bitdepth(const int is_high_bitdepth, + const int bitdepth, + const BLOCK_SIZE block_size, + const struct buf_2d *src, + const struct buf_2d *ref) { + (void)is_high_bitdepth; + (void)bitdepth; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_high_bitdepth) { + return highbd_get_prediction_error(block_size, src, ref, bitdepth); + } +#endif // CONFIG_AV1_HIGHBITDEPTH + return get_prediction_error(block_size, src, ref); +} + +// Accumulates motion vector stats. +// Modifies member variables of "stats". +static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv, + const int mb_row, const int mb_col, + const int mb_rows, const int mb_cols, + MV *last_mv, FRAME_STATS *stats) { + if (is_zero_mv(&best_mv)) return; + + ++stats->mv_count; + // Non-zero vector, was it different from the last non zero vector? + if (!is_equal_mv(&best_mv, last_mv)) ++stats->new_mv_count; + *last_mv = best_mv; + + // Does the row vector point inwards or outwards? + if (mb_row < mb_rows / 2) { + if (mv.row > 0) { + --stats->sum_in_vectors; + } else if (mv.row < 0) { + ++stats->sum_in_vectors; + } + } else if (mb_row > mb_rows / 2) { + if (mv.row > 0) { + ++stats->sum_in_vectors; + } else if (mv.row < 0) { + --stats->sum_in_vectors; + } + } + + // Does the col vector point inwards or outwards? + if (mb_col < mb_cols / 2) { + if (mv.col > 0) { + --stats->sum_in_vectors; + } else if (mv.col < 0) { + ++stats->sum_in_vectors; + } + } else if (mb_col > mb_cols / 2) { + if (mv.col > 0) { + ++stats->sum_in_vectors; + } else if (mv.col < 0) { + --stats->sum_in_vectors; + } + } +} + +#define LOW_MOTION_ERROR_THRESH 25 +// Computes and returns the inter prediction error from the last frame. +// Computes inter prediction errors from the golden and alt ref frames and +// updates stats accordingly. +// Inputs: +// cpi: the encoder setting. Only a few params in it will be used. +// last_frame: the frame buffer of the last frame. +// golden_frame: the frame buffer of the golden frame. +// alt_ref_frame: the frame buffer of the alt ref frame. +// mb_row: row index in the unit of first pass block size. +// mb_col: column index in the unit of first pass block size. +// recon_yoffset: the y offset of the reconstructed frame buffer, +// indicating the starting point of the current block. +// recon_uvoffset: the u/v offset of the reconstructed frame buffer, +// indicating the starting point of the current block. +// src_yoffset: the y offset of the source frame buffer. +// alt_ref_frame_yoffset: the y offset of the alt ref frame buffer. +// fp_block_size: first pass block size. +// this_intra_error: the intra prediction error of this block. +// raw_motion_err_counts: the count of raw motion vectors. +// raw_motion_err_list: the array that records the raw motion error. +// best_ref_mv: best reference mv found so far. +// last_mv: last mv. +// stats: frame encoding stats. +// Modifies: +// raw_motion_err_list +// best_ref_mv +// last_mv +// stats: many member params in it.
+// Returns: +// this_inter_error +static int firstpass_inter_prediction( + AV1_COMP *cpi, const YV12_BUFFER_CONFIG *const last_frame, + const YV12_BUFFER_CONFIG *const golden_frame, + const YV12_BUFFER_CONFIG *const alt_ref_frame, const int mb_row, + const int mb_col, const int recon_yoffset, const int recon_uvoffset, + const int src_yoffset, const int alt_ref_frame_yoffset, + const BLOCK_SIZE fp_block_size, const int this_intra_error, + const int raw_motion_err_counts, int *raw_motion_err_list, MV *best_ref_mv, + MV *last_mv, FRAME_STATS *stats) { + int this_inter_error = this_intra_error; + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + CurrentFrame *const current_frame = &cm->current_frame; + MACROBLOCK *const x = &cpi->td.mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int is_high_bitdepth = is_cur_buf_hbd(xd); + const int bitdepth = xd->bd; + const int mb_scale = mi_size_wide[fp_block_size]; + const BLOCK_SIZE bsize = get_bsize(mi_params, mb_row, mb_col); + const int fp_block_size_height = block_size_wide[fp_block_size]; + // Assume 0,0 motion with no mv overhead. + FULLPEL_MV mv = kZeroFullMv; + FULLPEL_MV tmp_mv = kZeroFullMv; + xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset; + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + av1_set_mv_col_limits(mi_params, &x->mv_limits, (mb_col << 2), + (fp_block_size_height >> MI_SIZE_LOG2), + cpi->oxcf.border_in_pixels); + + int motion_error = + get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, + &x->plane[0].src, &xd->plane[0].pre[0]); + + // Compute the motion error of the 0,0 motion using the last source + // frame as the reference. Skip the further motion search on + // reconstructed frame if this error is small. + struct buf_2d unscaled_last_source_buf_2d; + unscaled_last_source_buf_2d.buf = + cpi->unscaled_last_source->y_buffer + src_yoffset; + unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; + const int raw_motion_error = get_prediction_error_bitdepth( + is_high_bitdepth, bitdepth, bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); + raw_motion_err_list[raw_motion_err_counts] = raw_motion_error; + + // TODO(pengchong): Replace the hard-coded threshold + if (raw_motion_error > LOW_MOTION_ERROR_THRESH) { + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. + first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error); + + // If the current best reference mv is not centered on 0,0 then do a + // 0,0 based search as well. + if (!is_zero_mv(best_ref_mv)) { + int tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err); + + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv = tmp_mv; + } + } + + // Motion search in 2nd reference frame. + int gf_motion_error = motion_error; + if ((current_frame->frame_number > 1) && golden_frame != NULL) { + // Assume 0,0 motion with no mv overhead. 
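+ // (Note: unlike the last-frame search above, which starts from + // best_ref_mv, this golden-frame search always starts from the zero mv.)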
+ xd->plane[0].pre[0].buf = golden_frame->y_buffer + recon_yoffset; + xd->plane[0].pre[0].stride = golden_frame->y_stride; + gf_motion_error = + get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, + &x->plane[0].src, &xd->plane[0].pre[0]); + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &gf_motion_error); + } + if (gf_motion_error < motion_error && gf_motion_error < this_intra_error) { + ++stats->second_ref_count; + } + // In accumulating a score for the 2nd reference frame take the + // best of the motion predicted score and the intra coded error + // (just as is done in the accumulation of "coded_error" for + // the last frame). + if ((current_frame->frame_number > 1) && golden_frame != NULL) { + stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error); + } else { + // TODO(chengchen): I believe logically this should also be changed to + // stats->sr_coded_error += AOMMIN(gf_motion_error, this_intra_error). + stats->sr_coded_error += motion_error; + } + + // Motion search in 3rd reference frame. + int alt_motion_error = motion_error; + if (alt_ref_frame != NULL) { + xd->plane[0].pre[0].buf = alt_ref_frame->y_buffer + alt_ref_frame_yoffset; + xd->plane[0].pre[0].stride = alt_ref_frame->y_stride; + alt_motion_error = + get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize, + &x->plane[0].src, &xd->plane[0].pre[0]); + first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &alt_motion_error); + } + if (alt_motion_error < motion_error && alt_motion_error < gf_motion_error && + alt_motion_error < this_intra_error) { + ++stats->third_ref_count; + } + // In accumulating a score for the 3rd reference frame take the + // best of the motion predicted score and the intra coded error + // (just as is done in the accumulation of "coded_error" for + // the last frame). + if (alt_ref_frame != NULL) { + stats->tr_coded_error += AOMMIN(alt_motion_error, this_intra_error); + } else { + // TODO(chengchen): I believe logically this should also be changed to + // stats->tr_coded_error += AOMMIN(alt_motion_error, this_intra_error). + stats->tr_coded_error += motion_error; + } + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset; + xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset; + } else { + stats->sr_coded_error += motion_error; + stats->tr_coded_error += motion_error; + } + + // Start by assuming that intra mode is best. + best_ref_mv->row = 0; + best_ref_mv->col = 0; + + if (motion_error <= this_intra_error) { + aom_clear_system_state(); + + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. + if (((this_intra_error - INTRA_MODE_PENALTY) * 9 <= motion_error * 10) && + (this_intra_error < (2 * INTRA_MODE_PENALTY))) { + stats->neutral_count += 1.0; + // Also track cases where the intra is not much worse than the inter + // and use this in limiting the GF/arf group length.
+ } else if ((this_intra_error > NCOUNT_INTRA_THRESH) && + (this_intra_error < (NCOUNT_INTRA_FACTOR * motion_error))) { + stats->neutral_count += + (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error); + } + + const MV best_mv = get_mv_from_fullmv(&mv); + this_inter_error = motion_error; + xd->mi[0]->mode = NEWMV; + xd->mi[0]->mv[0].as_mv = best_mv; + xd->mi[0]->tx_size = TX_4X4; + xd->mi[0]->ref_frame[0] = LAST_FRAME; + xd->mi[0]->ref_frame[1] = NONE_FRAME; + av1_enc_build_inter_predictor(cm, xd, mb_row * mb_scale, mb_col * mb_scale, + NULL, bsize, AOM_PLANE_Y, AOM_PLANE_Y); + av1_encode_sby_pass1(cpi, x, bsize); + stats->sum_mvr += best_mv.row; + stats->sum_mvr_abs += abs(best_mv.row); + stats->sum_mvc += best_mv.col; + stats->sum_mvc_abs += abs(best_mv.col); + stats->sum_mvrs += best_mv.row * best_mv.row; + stats->sum_mvcs += best_mv.col * best_mv.col; + ++stats->inter_count; + + *best_ref_mv = best_mv; + accumulate_mv_stats(best_mv, mv, mb_row, mb_col, mi_params->mb_rows, + mi_params->mb_cols, last_mv, stats); + } + + return this_inter_error; +} + +// Updates the first pass stats of this frame. +// Input: +// cpi: the encoder setting. Only a few params in it will be used. +// stats: stats accumulated for this frame. +// raw_err_stdev: the standard deviation for the motion error of all the +// inter blocks of the (0,0) motion using the last source +// frame as the reference. +// frame_number: current frame number. +// ts_duration: Duration of the frame / collection of frames. +// Updates: +// twopass->total_stats: the accumulated stats. +// twopass->stats_buf_ctx->stats_in_end: the pointer to the current stats, +// update its value and its position +// in the buffer. +static void update_firstpass_stats(AV1_COMP *cpi, + const FRAME_STATS *const stats, + const double raw_err_stdev, + const int frame_number, + const int64_t ts_duration) { + TWO_PASS *twopass = &cpi->twopass; + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end; + FIRSTPASS_STATS fps; + // The minimum error here ensures some bit allocation to frames even + // in static regions. The allocation per MB declines for larger formats + // where the typical "real" energy per MB also falls. + // Initial estimate here uses sqrt(mbs) to define the min_err, where the + // number of mbs is proportional to the image area. + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ?
cpi->initial_mbs + : mi_params->MBs; + const double min_err = 200 * sqrt(num_mbs); + + fps.weight = stats->intra_factor * stats->brightness_factor; + fps.frame = frame_number; + fps.coded_error = (double)(stats->coded_error >> 8) + min_err; + fps.sr_coded_error = (double)(stats->sr_coded_error >> 8) + min_err; + fps.tr_coded_error = (double)(stats->tr_coded_error >> 8) + min_err; + fps.intra_error = (double)(stats->intra_error >> 8) + min_err; + fps.frame_avg_wavelet_energy = (double)stats->frame_avg_wavelet_energy; + fps.count = 1.0; + fps.pcnt_inter = (double)stats->inter_count / num_mbs; + fps.pcnt_second_ref = (double)stats->second_ref_count / num_mbs; + fps.pcnt_third_ref = (double)stats->third_ref_count / num_mbs; + fps.pcnt_neutral = (double)stats->neutral_count / num_mbs; + fps.intra_skip_pct = (double)stats->intra_skip_count / num_mbs; + fps.inactive_zone_rows = (double)stats->image_data_start_row; + fps.inactive_zone_cols = (double)0; // TODO(paulwilkins): fix + fps.raw_error_stdev = raw_err_stdev; + + if (stats->mv_count > 0) { + fps.MVr = (double)stats->sum_mvr / stats->mv_count; + fps.mvr_abs = (double)stats->sum_mvr_abs / stats->mv_count; + fps.MVc = (double)stats->sum_mvc / stats->mv_count; + fps.mvc_abs = (double)stats->sum_mvc_abs / stats->mv_count; + fps.MVrv = ((double)stats->sum_mvrs - + ((double)stats->sum_mvr * stats->sum_mvr / stats->mv_count)) / + stats->mv_count; + fps.MVcv = ((double)stats->sum_mvcs - + ((double)stats->sum_mvc * stats->sum_mvc / stats->mv_count)) / + stats->mv_count; + fps.mv_in_out_count = (double)stats->sum_in_vectors / (stats->mv_count * 2); + fps.new_mv_count = stats->new_mv_count; + fps.pcnt_motion = (double)stats->mv_count / num_mbs; + } else { + fps.MVr = 0.0; + fps.mvr_abs = 0.0; + fps.MVc = 0.0; + fps.mvc_abs = 0.0; + fps.MVrv = 0.0; + fps.MVcv = 0.0; + fps.mv_in_out_count = 0.0; + fps.new_mv_count = 0.0; + fps.pcnt_motion = 0.0; + } + + // TODO(paulwilkins): Handle the case when duration is set to 0, or + // something less than the full time between subsequent values of + // cpi->source_time_stamp. + fps.duration = (double)ts_duration; + + // We will store the stats inside the persistent twopass struct (and NOT the + // local variable 'fps'), and then cpi->output_pkt_list will point to it. 
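+ // (The MVrv/MVcv computations above use the one-pass variance identity + // Var(x) = E[x^2] - E[x]^2 on the running sums, avoiding a second pass + // over the motion vectors.)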
+ *this_frame_stats = fps; + output_stats(this_frame_stats, cpi->output_pkt_list); + if (cpi->twopass.stats_buf_ctx->total_stats != NULL) { + accumulate_stats(cpi->twopass.stats_buf_ctx->total_stats, &fps); + } + /*In the case of two pass, first pass uses it as a circular buffer, + * when LAP is enabled it is used as a linear buffer*/ + twopass->stats_buf_ctx->stats_in_end++; + if ((cpi->oxcf.pass == 1) && (twopass->stats_buf_ctx->stats_in_end >= + twopass->stats_buf_ctx->stats_in_buf_end)) { + twopass->stats_buf_ctx->stats_in_end = + twopass->stats_buf_ctx->stats_in_start; + } +} + +static void print_reconstruction_frame( + const YV12_BUFFER_CONFIG *const last_frame, int frame_number, + int do_print) { + if (!do_print) return; + + char filename[512]; + FILE *recon_file; + snprintf(filename, sizeof(filename), "enc%04d.yuv", frame_number); + + if (frame_number == 0) { + recon_file = fopen(filename, "wb"); + } else { + recon_file = fopen(filename, "ab"); + } + + fwrite(last_frame->buffer_alloc, last_frame->frame_size, 1, recon_file); + fclose(recon_file); +} + +#define FIRST_PASS_ALT_REF_DISTANCE 16 +void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) { + MACROBLOCK *const x = &cpi->td.mb; + AV1_COMMON *const cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + CurrentFrame *const current_frame = &cm->current_frame; + const SequenceHeader *const seq_params = &cm->seq_params; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none; + MV last_mv = kZeroMv; + const int qindex = find_fp_qindex(seq_params->bit_depth); + // Detect if the key frame is screen content type. + if (frame_is_intra_only(cm)) { + FeatureFlags *const features = &cm->features; + av1_set_screen_content_options(cpi, features); + cpi->is_screen_content_type = features->allow_screen_content_tools; + } + // First pass coding proceeds in raster scan order with unit size of 16x16. + const BLOCK_SIZE fp_block_size = BLOCK_16X16; + const int fp_block_size_width = block_size_high[fp_block_size]; + const int fp_block_size_height = block_size_wide[fp_block_size]; + int *raw_motion_err_list; + int raw_motion_err_counts = 0; + CHECK_MEM_ERROR(cm, raw_motion_err_list, + aom_calloc(mi_params->mb_rows * mi_params->mb_cols, + sizeof(*raw_motion_err_list))); + // Tiling is ignored in the first pass. + TileInfo tile; + av1_tile_init(&tile, cm, 0, 0); + FRAME_STATS stats = { 0 }; + stats.image_data_start_row = INVALID_ROW; + + const YV12_BUFFER_CONFIG *const last_frame = + get_ref_frame_yv12_buf(cm, LAST_FRAME); + const YV12_BUFFER_CONFIG *golden_frame = + get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + const YV12_BUFFER_CONFIG *alt_ref_frame = NULL; + const int alt_ref_offset = + FIRST_PASS_ALT_REF_DISTANCE - + (current_frame->frame_number % FIRST_PASS_ALT_REF_DISTANCE); + if (alt_ref_offset < FIRST_PASS_ALT_REF_DISTANCE) { + const struct lookahead_entry *const alt_ref_frame_buffer = + av1_lookahead_peek(cpi->lookahead, alt_ref_offset, + cpi->compressor_stage); + if (alt_ref_frame_buffer != NULL) { + alt_ref_frame = &alt_ref_frame_buffer->img; + } + } + YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf; + // First pass code requires valid last and new frame buffers. 
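+ // (An intra-only frame has no last reference, hence the weaker assertion + // on last_frame below.)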
+ assert(this_frame != NULL); + assert(frame_is_intra_only(cm) || (last_frame != NULL)); + + av1_setup_frame_size(cpi); + aom_clear_system_state(); + + set_mi_offsets(mi_params, xd, 0, 0); + xd->mi[0]->sb_type = fp_block_size; + + // Do not use periodic key frames. + cpi->rc.frames_to_key = INT_MAX; + + av1_set_quantizer(cm, cpi->oxcf.qm_minlevel, cpi->oxcf.qm_maxlevel, qindex); + + av1_setup_block_planes(xd, seq_params->subsampling_x, + seq_params->subsampling_y, num_planes); + + av1_setup_src_planes(x, cpi->source, 0, 0, num_planes, fp_block_size); + av1_setup_dst_planes(xd->plane, seq_params->sb_size, this_frame, 0, 0, 0, + num_planes); + + if (!frame_is_intra_only(cm)) { + av1_setup_pre_planes(xd, 0, last_frame, 0, 0, NULL, num_planes); + } + + set_mi_offsets(mi_params, xd, 0, 0); + + // Don't store luma on the first pass since chroma is not computed + xd->cfl.store_y = 0; + av1_frame_init_quantizer(cpi); + + for (int i = 0; i < num_planes; ++i) { + x->plane[i].coeff = ctx->coeff[i]; + x->plane[i].qcoeff = ctx->qcoeff[i]; + x->plane[i].eobs = ctx->eobs[i]; + x->plane[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i]; + xd->plane[i].dqcoeff = ctx->dqcoeff[i]; + } + + av1_init_mv_probs(cm); + av1_initialize_rd_consts(cpi); + + const int src_y_stride = cpi->source->y_stride; + const int recon_y_stride = this_frame->y_stride; + const int recon_uv_stride = this_frame->uv_stride; + const int uv_mb_height = + fp_block_size_height >> (this_frame->y_height > this_frame->uv_height); + + for (int mb_row = 0; mb_row < mi_params->mb_rows; ++mb_row) { + MV best_ref_mv = kZeroMv; + + // Reset above block coeffs. + xd->up_available = (mb_row != 0); + int recon_yoffset = (mb_row * recon_y_stride * fp_block_size_height); + int src_yoffset = (mb_row * src_y_stride * fp_block_size_height); + int recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height); + int alt_ref_frame_yoffset = + (alt_ref_frame != NULL) + ? mb_row * alt_ref_frame->y_stride * fp_block_size_height + : -1; + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + av1_set_mv_row_limits(mi_params, &x->mv_limits, (mb_row << 2), + (fp_block_size_height >> MI_SIZE_LOG2), + cpi->oxcf.border_in_pixels); + + for (int mb_col = 0; mb_col < mi_params->mb_cols; ++mb_col) { + int this_intra_error = firstpass_intra_prediction( + cpi, this_frame, &tile, mb_row, mb_col, recon_yoffset, recon_uvoffset, + fp_block_size, qindex, &stats); + + if (!frame_is_intra_only(cm)) { + const int this_inter_error = firstpass_inter_prediction( + cpi, last_frame, golden_frame, alt_ref_frame, mb_row, mb_col, + recon_yoffset, recon_uvoffset, src_yoffset, alt_ref_frame_yoffset, + fp_block_size, this_intra_error, raw_motion_err_counts, + raw_motion_err_list, &best_ref_mv, &last_mv, &stats); + stats.coded_error += this_inter_error; + ++raw_motion_err_counts; + } else { + stats.sr_coded_error += this_intra_error; + stats.tr_coded_error += this_intra_error; + stats.coded_error += this_intra_error; + } + + // Adjust to the next column of MBs. + x->plane[0].src.buf += fp_block_size_width; + x->plane[1].src.buf += uv_mb_height; + x->plane[2].src.buf += uv_mb_height; + + recon_yoffset += fp_block_size_width; + src_yoffset += fp_block_size_width; + recon_uvoffset += uv_mb_height; + alt_ref_frame_yoffset += fp_block_size_width; + } + // Adjust to the next row of MBs.
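+ // (The inner loop advanced each plane's src pointer by one block width per + // column, so adding block_height * stride and subtracting block_width * + // mb_cols rewinds each pointer to column 0 of the next block row. Reusing + // plane[1].src.stride for plane[2] below is benign: u and v share + // uv_stride in YV12_BUFFER_CONFIG.)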
+ x->plane[0].src.buf += fp_block_size_height * x->plane[0].src.stride - + fp_block_size_width * mi_params->mb_cols; + x->plane[1].src.buf += uv_mb_height * x->plane[1].src.stride - + uv_mb_height * mi_params->mb_cols; + x->plane[2].src.buf += uv_mb_height * x->plane[1].src.stride - + uv_mb_height * mi_params->mb_cols; + } + const double raw_err_stdev = + raw_motion_error_stdev(raw_motion_err_list, raw_motion_err_counts); + aom_free(raw_motion_err_list); + + // Clamp the image start to rows/2. This number of rows is discarded top + // and bottom as dead data so rows / 2 means the frame is blank. + if ((stats.image_data_start_row > mi_params->mb_rows / 2) || + (stats.image_data_start_row == INVALID_ROW)) { + stats.image_data_start_row = mi_params->mb_rows / 2; + } + // Exclude any image dead zone + if (stats.image_data_start_row > 0) { + stats.intra_skip_count = + AOMMAX(0, stats.intra_skip_count - + (stats.image_data_start_row * mi_params->mb_cols * 2)); + } + + TWO_PASS *twopass = &cpi->twopass; + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : mi_params->MBs; + stats.intra_factor = stats.intra_factor / (double)num_mbs; + stats.brightness_factor = stats.brightness_factor / (double)num_mbs; + FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end; + update_firstpass_stats(cpi, &stats, raw_err_stdev, + current_frame->frame_number, ts_duration); + + // Copy the previous Last Frame back into gf buffer if the prediction is good + // enough... but also don't allow it to lag too far. + if ((twopass->sr_update_lag > 3) || + ((current_frame->frame_number > 0) && + (this_frame_stats->pcnt_inter > 0.20) && + ((this_frame_stats->intra_error / + DOUBLE_DIVIDE_CHECK(this_frame_stats->coded_error)) > 2.0))) { + if (golden_frame != NULL) { + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)], + cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]); + } + twopass->sr_update_lag = 1; + } else { + ++twopass->sr_update_lag; + } + + aom_extend_frame_borders(this_frame, num_planes); + + // The frame we just compressed now becomes the last frame. + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)], cm->cur_frame); + + // Special case for the first frame. Copy into the GF buffer as a second + // reference. + if (current_frame->frame_number == 0 && + get_ref_frame_map_idx(cm, GOLDEN_FRAME) != INVALID_IDX) { + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)], + cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]); + } + + print_reconstruction_frame(last_frame, current_frame->frame_number, + /*do_print=*/0); + + ++current_frame->frame_number; +} diff --git a/libs/libaom/src/av1/encoder/firstpass.h b/libs/libaom/src/av1/encoder/firstpass.h new file mode 100644 index 000000000..99d444539 --- /dev/null +++ b/libs/libaom/src/av1/encoder/firstpass.h @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_ +#define AOM_AV1_ENCODER_FIRSTPASS_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/common/enums.h" +#include "av1/encoder/lookahead.h" +#include "av1/encoder/ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) + +#define MIN_ZERO_MOTION 0.95 +#define MAX_SR_CODED_ERROR 40 +#define MAX_RAW_ERR_VAR 2000 +#define MIN_MV_IN_OUT 0.4 + +#define VLOW_MOTION_THRESHOLD 950 + +typedef struct { + // Frame number in display order, if stats are for a single frame. + // No real meaning for a collection of frames. + double frame; + // Weight assigned to this frame (or total weight for the collection of + // frames) currently based on intra factor and brightness factor. This is used + // to distribute bits between easier and harder frames. + double weight; + // Intra prediction error. + double intra_error; + // Average wavelet energy computed using Discrete Wavelet Transform (DWT). + double frame_avg_wavelet_energy; + // Best of intra pred error and inter pred error using last frame as ref. + double coded_error; + // Best of intra pred error and inter pred error using golden frame as ref. + double sr_coded_error; + // Best of intra pred error and inter pred error using altref frame as ref. + double tr_coded_error; + // Percentage of blocks with inter pred error < intra pred error. + double pcnt_inter; + // Percentage of blocks using (inter prediction and) non-zero motion vectors. + double pcnt_motion; + // Percentage of blocks where golden frame was better than last or intra: + // inter pred error using golden frame < inter pred error using last frame and + // inter pred error using golden frame < intra pred error + double pcnt_second_ref; + // Percentage of blocks where altref frame was better than intra, last, golden + double pcnt_third_ref; + // Percentage of blocks where intra and inter prediction errors were very + // close. Note that this is a 'weighted count', that is, the blocks may be + // weighted by how close the two errors were. + double pcnt_neutral; + // Percentage of blocks that have almost no intra error residual + // (i.e. are in effect completely flat and untextured in the intra + // domain). In natural videos this is uncommon, but it is much more + // common in animations, graphics and screen content, so may be used + // as a signal to detect these types of content. + double intra_skip_pct; + // Image mask rows top and bottom. + double inactive_zone_rows; + // Image mask columns at left and right edges. + double inactive_zone_cols; + // Average of row motion vectors. + double MVr; + // Mean of absolute value of row motion vectors. + double mvr_abs; + // Mean of column motion vectors. + double MVc; + // Mean of absolute value of column motion vectors. + double mvc_abs; + // Variance of row motion vectors. + double MVrv; + // Variance of column motion vectors. + double MVcv; + // Value in range [-1,1] indicating fraction of row and column motion vectors + // that point inwards (negative MV value) or outwards (positive MV value). + // For example, a value of 1 indicates all row/column MVs are inwards. + double mv_in_out_count; + // Count of unique non-zero motion vectors. + double new_mv_count; + // Duration of the frame / collection of frames. + double duration; + // 1.0 if stats are for a single frame, OR + // Number of frames in this collection for which the stats are accumulated.
+ double count; + // standard deviation for (0, 0) motion prediction error + double raw_error_stdev; +} FIRSTPASS_STATS; + +#define FC_ANIMATION_THRESH 0.15 +enum { + FC_NORMAL = 0, + FC_GRAPHICS_ANIMATION = 1, + FRAME_CONTENT_TYPES = 2 +} UENUM1BYTE(FRAME_CONTENT_TYPE); + +typedef struct { + unsigned char index; + FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH]; + unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH]; + // The number of frames displayed so far within the GOP at a given coding + // frame. + unsigned char cur_frame_idx[MAX_STATIC_GF_GROUP_LENGTH]; + unsigned char frame_disp_idx[MAX_STATIC_GF_GROUP_LENGTH]; + int ref_frame_disp_idx[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES]; + int ref_frame_gop_idx[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES]; + + // TODO(jingning): Unify the data structure used here after the new control + // mechanism is in place. + int layer_depth[MAX_STATIC_GF_GROUP_LENGTH]; + int arf_boost[MAX_STATIC_GF_GROUP_LENGTH]; + int max_layer_depth; + int max_layer_depth_allowed; + // This is currently only populated for AOM_Q mode + unsigned char q_val[MAX_STATIC_GF_GROUP_LENGTH]; + int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH]; + int size; +} GF_GROUP; + +typedef struct { + FIRSTPASS_STATS *stats_in_start; + FIRSTPASS_STATS *stats_in_end; + FIRSTPASS_STATS *stats_in_buf_end; + FIRSTPASS_STATS *total_stats; + FIRSTPASS_STATS *total_left_stats; +} STATS_BUFFER_CTX; + +typedef struct { + unsigned int section_intra_rating; + // Circular queue of first pass stats stored for most recent frames. + // cpi->output_pkt_list[i].data.twopass_stats.buf points to actual data stored + // here. + FIRSTPASS_STATS *frame_stats_arr[MAX_LAP_BUFFERS + 1]; + int frame_stats_next_idx; // Index to next unused element in frame_stats_arr. + const FIRSTPASS_STATS *stats_in; + STATS_BUFFER_CTX *stats_buf_ctx; + int first_pass_done; + int64_t bits_left; + double modified_error_min; + double modified_error_max; + double modified_error_left; + double mb_av_energy; + double frame_avg_haar_energy; + + // An indication of the content type of the current frame + FRAME_CONTENT_TYPE fr_content_type; + + // Projected total bits available for a key frame group of frames + int64_t kf_group_bits; + + // Error score of frames still to be coded in kf group + int64_t kf_group_error_left; + + // Over time correction for bits per macro block estimation + double bpm_factor; + + // Record of target and actual bits spent in current ARF group + int rolling_arf_group_target_bits; + int rolling_arf_group_actual_bits; + + int sr_update_lag; + + int kf_zeromotion_pct; + int last_kfgroup_zeromotion_pct; + int extend_minq; + int extend_maxq; + int extend_minq_fast; +} TWO_PASS; + +struct AV1_COMP; +struct EncodeFrameParams; +struct AV1EncoderConfig; + +void av1_rc_get_first_pass_params(struct AV1_COMP *cpi); +void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration); +void av1_end_first_pass(struct AV1_COMP *cpi); + +void av1_twopass_zero_stats(FIRSTPASS_STATS *section); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_FIRSTPASS_H_ diff --git a/libs/libaom/src/av1/encoder/global_motion.c b/libs/libaom/src/av1/encoder/global_motion.c new file mode 100644 index 000000000..9623ec301 --- /dev/null +++ b/libs/libaom/src/av1/encoder/global_motion.c @@ -0,0 +1,1014 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <math.h> +#include <assert.h> + +#include "config/aom_dsp_rtcd.h" + +#include "av1/encoder/global_motion.h" + +#include "av1/common/convolve.h" +#include "av1/common/resize.h" +#include "av1/common/warped_motion.h" + +#include "av1/encoder/segmentation.h" +#include "av1/encoder/corner_detect.h" +#include "av1/encoder/corner_match.h" +#include "av1/encoder/ransac.h" + +#define MIN_INLIER_PROB 0.1 + +#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR) + +// Border over which to compute the global motion +#define ERRORADV_BORDER 0 + +// Number of pyramid levels in disflow computation +#define N_LEVELS 2 +// Size of square patches in the disflow dense grid +#define PATCH_SIZE 8 +// Center point of square patch +#define PATCH_CENTER ((PATCH_SIZE + 1) >> 1) +// Step size between patches, lower value means greater patch overlap +#define PATCH_STEP 1 +// Minimum size of border padding for disflow +#define MIN_PAD 7 +// Warp error convergence threshold for disflow +#define DISFLOW_ERROR_TR 0.01 +// Max number of iterations if warp convergence is not found +#define DISFLOW_MAX_ITR 10 + +// Struct for an image pyramid +typedef struct { + int n_levels; + int pad_size; + int has_gradient; + int widths[N_LEVELS]; + int heights[N_LEVELS]; + int strides[N_LEVELS]; + int level_loc[N_LEVELS]; + unsigned char *level_buffer; + double *level_dx_buffer; + double *level_dy_buffer; +} ImagePyramid; + +int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost, + int erroradv_type) { + assert(erroradv_type < GM_ERRORADV_TR_TYPES); + return best_erroradvantage < erroradv_tr[erroradv_type] && + best_erroradvantage * params_cost < erroradv_prod_tr[erroradv_type]; +} + +static void convert_to_params(const double *params, int32_t *model) { + int i; + int alpha_present = 0; + model[0] = (int32_t)floor(params[0] * (1 << GM_TRANS_PREC_BITS) + 0.5); + model[1] = (int32_t)floor(params[1] * (1 << GM_TRANS_PREC_BITS) + 0.5); + model[0] = (int32_t)clamp(model[0], GM_TRANS_MIN, GM_TRANS_MAX) * + GM_TRANS_DECODE_FACTOR; + model[1] = (int32_t)clamp(model[1], GM_TRANS_MIN, GM_TRANS_MAX) * + GM_TRANS_DECODE_FACTOR; + + for (i = 2; i < 6; ++i) { + const int diag_value = ((i == 2 || i == 5) ?
(1 << GM_ALPHA_PREC_BITS) : 0); + model[i] = (int32_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5); + model[i] = + (int32_t)clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX); + alpha_present |= (model[i] != 0); + model[i] = (model[i] + diag_value) * GM_ALPHA_DECODE_FACTOR; + } + for (; i < 8; ++i) { + model[i] = (int32_t)floor(params[i] * (1 << GM_ROW3HOMO_PREC_BITS) + 0.5); + model[i] = (int32_t)clamp(model[i], GM_ROW3HOMO_MIN, GM_ROW3HOMO_MAX) * + GM_ROW3HOMO_DECODE_FACTOR; + alpha_present |= (model[i] != 0); + } + + if (!alpha_present) { + if (abs(model[0]) < MIN_TRANS_THRESH && abs(model[1]) < MIN_TRANS_THRESH) { + model[0] = 0; + model[1] = 0; + } + } +} + +void av1_convert_model_to_params(const double *params, + WarpedMotionParams *model) { + convert_to_params(params, model->wmmat); + model->wmtype = get_wmtype(model); + model->invalid = 0; +} + +// Adds some offset to a global motion parameter and handles +// all of the necessary precision shifts, clamping, and +// zero-centering. +static int32_t add_param_offset(int param_index, int32_t param_value, + int32_t offset) { + const int scale_vals[3] = { GM_TRANS_PREC_DIFF, GM_ALPHA_PREC_DIFF, + GM_ROW3HOMO_PREC_DIFF }; + const int clamp_vals[3] = { GM_TRANS_MAX, GM_ALPHA_MAX, GM_ROW3HOMO_MAX }; + // type of param: 0 - translation, 1 - affine, 2 - homography + const int param_type = (param_index < 2 ? 0 : (param_index < 6 ? 1 : 2)); + const int is_one_centered = (param_index == 2 || param_index == 5); + + // Make parameter zero-centered and offset the shift that was done to make + // it compatible with the warped model + param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >> + scale_vals[param_type]; + // Add desired offset to the rescaled/zero-centered parameter + param_value += offset; + // Clamp the parameter so it does not overflow the number of bits allotted + // to it in the bitstream + param_value = (int32_t)clamp(param_value, -clamp_vals[param_type], + clamp_vals[param_type]); + // Rescale the parameter to WARPEDMODEL_PRECISION_BITS so it is compatible + // with the warped motion library + param_value *= (1 << scale_vals[param_type]); + + // Undo the zero-centering step if necessary + return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS); +} + +static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) { + switch (wmtype) { + case IDENTITY: + wm->wmmat[0] = 0; + wm->wmmat[1] = 0; + AOM_FALLTHROUGH_INTENDED; + case TRANSLATION: + wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS; + wm->wmmat[3] = 0; + AOM_FALLTHROUGH_INTENDED; + case ROTZOOM: + wm->wmmat[4] = -wm->wmmat[3]; + wm->wmmat[5] = wm->wmmat[2]; + AOM_FALLTHROUGH_INTENDED; + case AFFINE: wm->wmmat[6] = wm->wmmat[7] = 0; break; + default: assert(0); + } + wm->wmtype = wmtype; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static int64_t highbd_warp_error( + WarpedMotionParams *wm, const uint16_t *const ref, int width, int height, + int stride, const uint16_t *const dst, int p_col, int p_row, int p_width, + int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, + int64_t best_error, uint8_t *segment_map, int segment_map_stride) { + int64_t gm_sumerr = 0; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; + + ConvolveParams conv_params = get_conv_params(0, 0, bd); + conv_params.use_dist_wtd_comp_avg = 0; + for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { + for (int j = 
p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + // avoid warping extra 8x8 blocks in the padded region of the frame + // when p_width and p_height are not multiples of WARP_ERROR_BLOCK + const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); + const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); + highbd_warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, + warp_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y, + bd, &conv_params); + gm_sumerr += av1_calc_highbd_frame_error(tmp, WARP_ERROR_BLOCK, + dst + j + i * p_stride, warp_w, + warp_h, p_stride, bd); + if (gm_sumerr > best_error) return INT64_MAX; + } + } + return gm_sumerr; +} +#endif + +static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, + int width, int height, int stride, + const uint8_t *const dst, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, + int64_t best_error, uint8_t *segment_map, + int segment_map_stride) { + int64_t gm_sumerr = 0; + int warp_w, warp_h; + const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK); + const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); + DECLARE_ALIGNED(16, uint8_t, tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]); + ConvolveParams conv_params = get_conv_params(0, 0, 8); + conv_params.use_dist_wtd_comp_avg = 0; + + for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { + for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { + int seg_x = j >> WARP_ERROR_BLOCK_LOG; + int seg_y = i >> WARP_ERROR_BLOCK_LOG; + // Only compute the error if this block contains inliers from the motion + // model + if (!segment_map[seg_y * segment_map_stride + seg_x]) continue; + // avoid warping extra 8x8 blocks in the padded region of the frame + // when p_width and p_height are not multiples of WARP_ERROR_BLOCK + warp_w = AOMMIN(error_bsize_w, p_col + p_width - j); + warp_h = AOMMIN(error_bsize_h, p_row + p_height - i); + warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h, + WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params); + + gm_sumerr += + av1_calc_frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride, + warp_w, warp_h, p_stride); + if (gm_sumerr > best_error) return INT64_MAX; + } + } + return gm_sumerr; +} + +int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd, + const uint8_t *ref, int width, int height, int stride, + uint8_t *dst, int p_col, int p_row, int p_width, + int p_height, int p_stride, int subsampling_x, + int subsampling_y, int64_t best_error, + uint8_t *segment_map, int segment_map_stride) { + if (wm->wmtype <= AFFINE) + if (!av1_get_shear_params(wm)) return INT64_MAX; +#if CONFIG_AV1_HIGHBITDEPTH + if (use_hbd) + return highbd_warp_error(wm, CONVERT_TO_SHORTPTR(ref), width, height, + stride, CONVERT_TO_SHORTPTR(dst), p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, best_error, segment_map, + segment_map_stride); +#endif + (void)use_hbd; + (void)bd; + return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width, + p_height, p_stride, subsampling_x, subsampling_y, + best_error, segment_map, segment_map_stride); +} + +// Factors used to calculate the thresholds for av1_warp_error +static double 
thresh_factors[GM_REFINEMENT_COUNT] = { 1.25, 1.20, 1.15, 1.10, + 1.05 }; + +static INLINE int64_t calc_approx_erroradv_threshold( + double scaling_factor, int64_t erroradv_threshold) { + return erroradv_threshold < + (int64_t)(((double)INT64_MAX / scaling_factor) + 0.5) + ? (int64_t)(scaling_factor * erroradv_threshold + 0.5) + : INT64_MAX; +} + +int64_t av1_refine_integerized_param( + WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd, + uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst, + int d_width, int d_height, int d_stride, int n_refinements, + int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride, + int64_t erroradv_threshold) { + static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; + const int border = ERRORADV_BORDER; + int i = 0, p; + int n_params = max_trans_model_params[wmtype]; + int32_t *param_mat = wm->wmmat; + int64_t step_error, best_error; + int32_t step; + int32_t *param; + int32_t curr_param; + int32_t best_param; + + force_wmtype(wm, wmtype); + best_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, d_stride, 0, + 0, best_frame_error, segment_map, segment_map_stride); + best_error = AOMMIN(best_error, best_frame_error); + step = 1 << (n_refinements - 1); + for (i = 0; i < n_refinements; i++, step >>= 1) { + int64_t error_adv_thresh = + calc_approx_erroradv_threshold(thresh_factors[i], erroradv_threshold); + for (p = 0; p < n_params; ++p) { + int step_dir = 0; + // Skip searches for parameters that are forced to be 0 + param = param_mat + p; + curr_param = *param; + best_param = curr_param; + // look to the left + *param = add_param_offset(p, curr_param, -step); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, d_stride, + 0, 0, AOMMIN(best_error, error_adv_thresh), + segment_map, segment_map_stride); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + step_dir = -1; + } + + // look to the right + *param = add_param_offset(p, curr_param, step); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, d_stride, + 0, 0, AOMMIN(best_error, error_adv_thresh), + segment_map, segment_map_stride); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + step_dir = 1; + } + *param = best_param; + + // look to the direction chosen above repeatedly until error increases + // for the biggest step size + while (step_dir) { + *param = add_param_offset(p, best_param, step * step_dir); + step_error = + av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride, + dst + border * d_stride + border, border, border, + d_width - 2 * border, d_height - 2 * border, + d_stride, 0, 0, AOMMIN(best_error, error_adv_thresh), + segment_map, segment_map_stride); + if (step_error < best_error) { + best_error = step_error; + best_param = *param; + } else { + *param = best_param; + step_dir = 0; + } + } + } + } + force_wmtype(wm, wmtype); + wm->wmtype = get_wmtype(wm); + return best_error; +} + +unsigned char *av1_downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth) { + int i, j; + uint16_t *orig_buf = CONVERT_TO_SHORTPTR(frm->y_buffer); + uint8_t *buf_8bit = frm->y_buffer_8bit; + 
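// A quick sketch of the mapping performed below (descriptive note, not
+  // upstream): with bit_depth == 10, each 16-bit sample is narrowed by a
+  // plain right shift, e.g. mid-gray 512 becomes 512 >> (10 - 8) == 128.
+  // The 8-bit copy is cached in y_buffer_8bit and reused until
+  // buf_8bit_valid is cleared.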
+  assert(buf_8bit);
+  if (!frm->buf_8bit_valid) {
+    for (i = 0; i < frm->y_height; ++i) {
+      for (j = 0; j < frm->y_width; ++j) {
+        buf_8bit[i * frm->y_stride + j] =
+            orig_buf[i * frm->y_stride + j] >> (bit_depth - 8);
+      }
+    }
+    frm->buf_8bit_valid = 1;
+  }
+  return buf_8bit;
+}
+
+static void get_inliers_from_indices(MotionModel *params,
+                                     int *correspondences) {
+  int *inliers_tmp = (int *)aom_malloc(2 * MAX_CORNERS * sizeof(*inliers_tmp));
+  memset(inliers_tmp, 0, 2 * MAX_CORNERS * sizeof(*inliers_tmp));
+
+  for (int i = 0; i < params->num_inliers; i++) {
+    int index = params->inliers[i];
+    inliers_tmp[2 * i] = correspondences[4 * index];
+    inliers_tmp[2 * i + 1] = correspondences[4 * index + 1];
+  }
+  memcpy(params->inliers, inliers_tmp, sizeof(*inliers_tmp) * 2 * MAX_CORNERS);
+  aom_free(inliers_tmp);
+}
+
+#define FEAT_COUNT_TR 3
+#define SEG_COUNT_TR 0.40
+void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
+                                          int height, int *inliers,
+                                          int num_inliers) {
+  int seg_count = 0;
+  memset(segment_map, 0, sizeof(*segment_map) * width * height);
+
+  for (int i = 0; i < num_inliers; i++) {
+    int x = inliers[i * 2];
+    int y = inliers[i * 2 + 1];
+    int seg_x = x >> WARP_ERROR_BLOCK_LOG;
+    int seg_y = y >> WARP_ERROR_BLOCK_LOG;
+    segment_map[seg_y * width + seg_x] += 1;
+  }
+
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      uint8_t feat_count = segment_map[i * width + j];
+      segment_map[i * width + j] = (feat_count >= FEAT_COUNT_TR);
+      seg_count += (segment_map[i * width + j]);
+    }
+  }
+
+  // If this motion does not make up a large enough portion of the frame,
+  // use the unsegmented version of the error metric
+  if (seg_count < (width * height * SEG_COUNT_TR))
+    memset(segment_map, 1, width * height * sizeof(*segment_map));
+}
+
+static int compute_global_motion_feature_based(
+    TransformationType type, unsigned char *frm_buffer, int frm_width,
+    int frm_height, int frm_stride, int *frm_corners, int num_frm_corners,
+    YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion,
+    MotionModel *params_by_motion, int num_motions) {
+  int i;
+  int num_ref_corners;
+  int num_correspondences;
+  int *correspondences;
+  int ref_corners[2 * MAX_CORNERS];
+  unsigned char *ref_buffer = ref->y_buffer;
+  RansacFunc ransac = av1_get_ransac_type(type);
+
+  if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
+    ref_buffer = av1_downconvert_frame(ref, bit_depth);
+  }
+
+  num_ref_corners =
+      av1_fast_corner_detect(ref_buffer, ref->y_width, ref->y_height,
+                             ref->y_stride, ref_corners, MAX_CORNERS);
+
+  // find correspondences between the two images
+  correspondences =
+      (int *)malloc(num_frm_corners * 4 * sizeof(*correspondences));
+  num_correspondences = av1_determine_correspondence(
+      frm_buffer, (int *)frm_corners, num_frm_corners, ref_buffer,
+      (int *)ref_corners, num_ref_corners, frm_width, frm_height, frm_stride,
+      ref->y_stride, correspondences);
+
+  ransac(correspondences, num_correspondences, num_inliers_by_motion,
+         params_by_motion, num_motions);
+
+  // Set num_inliers = 0 for motions with too few inliers so they are ignored.
+  for (i = 0; i < num_motions; ++i) {
+    if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences ||
+        num_correspondences == 0) {
+      num_inliers_by_motion[i] = 0;
+    } else {
+      get_inliers_from_indices(&params_by_motion[i], correspondences);
+    }
+  }
+
+  free(correspondences);
+
+  // Return true if any one of the motions has inliers.
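+  // (Descriptive note: 1 means at least one candidate model kept enough
+  // inliers to be usable; 0 means no usable global motion model was found
+  // for this reference frame.)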
+ for (i = 0; i < num_motions; ++i) { + if (num_inliers_by_motion[i] > 0) return 1; + } + return 0; +} + +// Don't use points around the frame border since they are less reliable +static INLINE int valid_point(int x, int y, int width, int height) { + return (x > (PATCH_SIZE + PATCH_CENTER)) && + (x < (width - PATCH_SIZE - PATCH_CENTER)) && + (y > (PATCH_SIZE + PATCH_CENTER)) && + (y < (height - PATCH_SIZE - PATCH_CENTER)); +} + +static int determine_disflow_correspondence(int *frm_corners, + int num_frm_corners, double *flow_u, + double *flow_v, int width, + int height, int stride, + double *correspondences) { + int num_correspondences = 0; + int x, y; + for (int i = 0; i < num_frm_corners; ++i) { + x = frm_corners[2 * i]; + y = frm_corners[2 * i + 1]; + if (valid_point(x, y, width, height)) { + correspondences[4 * num_correspondences] = x; + correspondences[4 * num_correspondences + 1] = y; + correspondences[4 * num_correspondences + 2] = x + flow_u[y * stride + x]; + correspondences[4 * num_correspondences + 3] = y + flow_v[y * stride + x]; + num_correspondences++; + } + } + return num_correspondences; +} + +static double getCubicValue(double p[4], double x) { + return p[1] + 0.5 * x * + (p[2] - p[0] + + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + + x * (3.0 * (p[1] - p[2]) + p[3] - p[0]))); +} + +static void get_subcolumn(unsigned char *ref, double col[4], int stride, int x, + int y_start) { + int i; + for (i = 0; i < 4; ++i) { + col[i] = ref[(i + y_start) * stride + x]; + } +} + +static double bicubic(unsigned char *ref, double x, double y, int stride) { + double arr[4]; + int k; + int i = (int)x; + int j = (int)y; + for (k = 0; k < 4; ++k) { + double arr_temp[4]; + get_subcolumn(ref, arr_temp, stride, i + k - 1, j - 1); + arr[k] = getCubicValue(arr_temp, y - j); + } + return getCubicValue(arr, x - i); +} + +// Interpolate a warped block using bicubic interpolation when possible +static unsigned char interpolate(unsigned char *ref, double x, double y, + int width, int height, int stride) { + if (x < 0 && y < 0) + return ref[0]; + else if (x < 0 && y > height - 1) + return ref[(height - 1) * stride]; + else if (x > width - 1 && y < 0) + return ref[width - 1]; + else if (x > width - 1 && y > height - 1) + return ref[(height - 1) * stride + (width - 1)]; + else if (x < 0) { + int v; + int i = (int)y; + double a = y - i; + if (y > 1 && y < height - 2) { + double arr[4]; + get_subcolumn(ref, arr, stride, 0, i - 1); + return clamp((int)(getCubicValue(arr, a) + 0.5), 0, 255); + } + v = (int)(ref[i * stride] * (1 - a) + ref[(i + 1) * stride] * a + 0.5); + return clamp(v, 0, 255); + } else if (y < 0) { + int v; + int j = (int)x; + double b = x - j; + if (x > 1 && x < width - 2) { + double arr[4] = { ref[j - 1], ref[j], ref[j + 1], ref[j + 2] }; + return clamp((int)(getCubicValue(arr, b) + 0.5), 0, 255); + } + v = (int)(ref[j] * (1 - b) + ref[j + 1] * b + 0.5); + return clamp(v, 0, 255); + } else if (x > width - 1) { + int v; + int i = (int)y; + double a = y - i; + if (y > 1 && y < height - 2) { + double arr[4]; + get_subcolumn(ref, arr, stride, width - 1, i - 1); + return clamp((int)(getCubicValue(arr, a) + 0.5), 0, 255); + } + v = (int)(ref[i * stride + width - 1] * (1 - a) + + ref[(i + 1) * stride + width - 1] * a + 0.5); + return clamp(v, 0, 255); + } else if (y > height - 1) { + int v; + int j = (int)x; + double b = x - j; + if (x > 1 && x < width - 2) { + int row = (height - 1) * stride; + double arr[4] = { ref[row + j - 1], ref[row + j], ref[row + j + 1], + ref[row + j + 2] }; + 
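// Descriptive note: cubic fit along x through the four bottom-row
+      // samples gathered above, clamped to the 8-bit range below.
+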
return clamp((int)(getCubicValue(arr, b) + 0.5), 0, 255); + } + v = (int)(ref[(height - 1) * stride + j] * (1 - b) + + ref[(height - 1) * stride + j + 1] * b + 0.5); + return clamp(v, 0, 255); + } else if (x > 1 && y > 1 && x < width - 2 && y < height - 2) { + return clamp((int)(bicubic(ref, x, y, stride) + 0.5), 0, 255); + } else { + int i = (int)y; + int j = (int)x; + double a = y - i; + double b = x - j; + int v = (int)(ref[i * stride + j] * (1 - a) * (1 - b) + + ref[i * stride + j + 1] * (1 - a) * b + + ref[(i + 1) * stride + j] * a * (1 - b) + + ref[(i + 1) * stride + j + 1] * a * b); + return clamp(v, 0, 255); + } +} + +// Warps a block using flow vector [u, v] and computes the mse +static double compute_warp_and_error(unsigned char *ref, unsigned char *frm, + int width, int height, int stride, int x, + int y, double u, double v, int16_t *dt) { + int i, j; + unsigned char warped; + double x_w, y_w; + double mse = 0; + int16_t err = 0; + for (i = y; i < y + PATCH_SIZE; ++i) + for (j = x; j < x + PATCH_SIZE; ++j) { + x_w = (double)j + u; + y_w = (double)i + v; + warped = interpolate(ref, x_w, y_w, width, height, stride); + err = warped - frm[j + i * stride]; + mse += err * err; + dt[(i - y) * PATCH_SIZE + (j - x)] = err; + } + + mse /= (PATCH_SIZE * PATCH_SIZE); + return mse; +} + +// Computes the components of the system of equations used to solve for +// a flow vector. This includes: +// 1.) The hessian matrix for optical flow. This matrix is in the +// form of: +// +// M = |sum(dx * dx) sum(dx * dy)| +// |sum(dx * dy) sum(dy * dy)| +// +// 2.) b = |sum(dx * dt)| +// |sum(dy * dt)| +// Where the sums are computed over a square window of PATCH_SIZE. +static INLINE void compute_flow_system(const double *dx, int dx_stride, + const double *dy, int dy_stride, + const int16_t *dt, int dt_stride, + double *M, double *b) { + for (int i = 0; i < PATCH_SIZE; i++) { + for (int j = 0; j < PATCH_SIZE; j++) { + M[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j]; + M[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j]; + M[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j]; + + b[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j]; + b[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j]; + } + } + + M[2] = M[1]; +} + +// Solves a general Mx = b where M is a 2x2 matrix and b is a 2x1 matrix +static INLINE void solve_2x2_system(const double *M, const double *b, + double *output_vec) { + double M_0 = M[0]; + double M_3 = M[3]; + double det = (M_0 * M_3) - (M[1] * M[2]); + if (det < 1e-5) { + // Handle singular matrix + // TODO(sarahparker) compare results using pseudo inverse instead + M_0 += 1e-10; + M_3 += 1e-10; + det = (M_0 * M_3) - (M[1] * M[2]); + } + const double det_inv = 1 / det; + const double mult_b0 = det_inv * b[0]; + const double mult_b1 = det_inv * b[1]; + output_vec[0] = M_3 * mult_b0 - M[1] * mult_b1; + output_vec[1] = -M[2] * mult_b0 + M_0 * mult_b1; +} + +/* +static INLINE void image_difference(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int16_t *dst, int dst_stride, int height, + int width) { + const int block_unit = 8; + // Take difference in 8x8 blocks to make use of optimized diff function + for (int i = 0; i < height; i += block_unit) { + for (int j = 0; j < width; j += block_unit) { + aom_subtract_block(block_unit, block_unit, dst + i * dst_stride + j, + dst_stride, src + i * src_stride + j, src_stride, + ref + i * ref_stride + j, ref_stride); + } + } +} +*/ + +// Compute an image gradient using a sobel filter. 
+// If dir == 1, compute the x gradient. If dir == 0, compute y. This function +// assumes the images have been padded so that they can be processed in units +// of 8. +static INLINE void sobel_xy_image_gradient(const uint8_t *src, int src_stride, + double *dst, int dst_stride, + int height, int width, int dir) { + double norm = 1.0; + // TODO(sarahparker) experiment with doing this over larger block sizes + const int block_unit = 8; + // Filter in 8x8 blocks to eventually make use of optimized convolve function + for (int i = 0; i < height; i += block_unit) { + for (int j = 0; j < width; j += block_unit) { + av1_convolve_2d_sobel_y_c(src + i * src_stride + j, src_stride, + dst + i * dst_stride + j, dst_stride, + block_unit, block_unit, dir, norm); + } + } +} + +static ImagePyramid *alloc_pyramid(int width, int height, int pad_size, + int compute_gradient) { + ImagePyramid *pyr = aom_malloc(sizeof(*pyr)); + pyr->has_gradient = compute_gradient; + // 2 * width * height is the upper bound for a buffer that fits + // all pyramid levels + padding for each level + const int buffer_size = sizeof(*pyr->level_buffer) * 2 * width * height + + (width + 2 * pad_size) * 2 * pad_size * N_LEVELS; + pyr->level_buffer = aom_malloc(buffer_size); + memset(pyr->level_buffer, 0, buffer_size); + + if (compute_gradient) { + const int gradient_size = + sizeof(*pyr->level_dx_buffer) * 2 * width * height + + (width + 2 * pad_size) * 2 * pad_size * N_LEVELS; + pyr->level_dx_buffer = aom_malloc(gradient_size); + pyr->level_dy_buffer = aom_malloc(gradient_size); + memset(pyr->level_dx_buffer, 0, gradient_size); + memset(pyr->level_dy_buffer, 0, gradient_size); + } + return pyr; +} + +static void free_pyramid(ImagePyramid *pyr) { + aom_free(pyr->level_buffer); + if (pyr->has_gradient) { + aom_free(pyr->level_dx_buffer); + aom_free(pyr->level_dy_buffer); + } + aom_free(pyr); +} + +static INLINE void update_level_dims(ImagePyramid *frm_pyr, int level) { + frm_pyr->widths[level] = frm_pyr->widths[level - 1] >> 1; + frm_pyr->heights[level] = frm_pyr->heights[level - 1] >> 1; + frm_pyr->strides[level] = frm_pyr->widths[level] + 2 * frm_pyr->pad_size; + // Point the beginning of the next level buffer to the correct location inside + // the padded border + frm_pyr->level_loc[level] = + frm_pyr->level_loc[level - 1] + + frm_pyr->strides[level - 1] * + (2 * frm_pyr->pad_size + frm_pyr->heights[level - 1]); +} + +// Compute coarse to fine pyramids for a frame +static void compute_flow_pyramids(unsigned char *frm, const int frm_width, + const int frm_height, const int frm_stride, + int n_levels, int pad_size, int compute_grad, + ImagePyramid *frm_pyr) { + int cur_width, cur_height, cur_stride, cur_loc; + assert((frm_width >> n_levels) > 0); + assert((frm_height >> n_levels) > 0); + + // Initialize first level + frm_pyr->n_levels = n_levels; + frm_pyr->pad_size = pad_size; + frm_pyr->widths[0] = frm_width; + frm_pyr->heights[0] = frm_height; + frm_pyr->strides[0] = frm_width + 2 * frm_pyr->pad_size; + // Point the beginning of the level buffer to the location inside + // the padded border + frm_pyr->level_loc[0] = + frm_pyr->strides[0] * frm_pyr->pad_size + frm_pyr->pad_size; + // This essentially copies the original buffer into the pyramid buffer + // without the original padding + av1_resize_plane(frm, frm_height, frm_width, frm_stride, + frm_pyr->level_buffer + frm_pyr->level_loc[0], + frm_pyr->heights[0], frm_pyr->widths[0], + frm_pyr->strides[0]); + + if (compute_grad) { + cur_width = frm_pyr->widths[0]; + cur_height = 
frm_pyr->heights[0];
+    cur_stride = frm_pyr->strides[0];
+    cur_loc = frm_pyr->level_loc[0];
+    assert(frm_pyr->has_gradient && frm_pyr->level_dx_buffer != NULL &&
+           frm_pyr->level_dy_buffer != NULL);
+    // Compute the x gradient
+    sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+                            frm_pyr->level_dx_buffer + cur_loc, cur_stride,
+                            cur_height, cur_width, 1);
+
+    // Compute the y gradient
+    sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+                            frm_pyr->level_dy_buffer + cur_loc, cur_stride,
+                            cur_height, cur_width, 0);
+  }
+
+  // Start at the finest level and resize down to the coarsest level
+  for (int level = 1; level < n_levels; ++level) {
+    update_level_dims(frm_pyr, level);
+    cur_width = frm_pyr->widths[level];
+    cur_height = frm_pyr->heights[level];
+    cur_stride = frm_pyr->strides[level];
+    cur_loc = frm_pyr->level_loc[level];
+
+    av1_resize_plane(frm_pyr->level_buffer + frm_pyr->level_loc[level - 1],
+                     frm_pyr->heights[level - 1], frm_pyr->widths[level - 1],
+                     frm_pyr->strides[level - 1],
+                     frm_pyr->level_buffer + cur_loc, cur_height, cur_width,
+                     cur_stride);
+
+    if (compute_grad) {
+      assert(frm_pyr->has_gradient && frm_pyr->level_dx_buffer != NULL &&
+             frm_pyr->level_dy_buffer != NULL);
+      // Compute the x gradient
+      sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+                              frm_pyr->level_dx_buffer + cur_loc, cur_stride,
+                              cur_height, cur_width, 1);
+
+      // Compute the y gradient
+      sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+                              frm_pyr->level_dy_buffer + cur_loc, cur_stride,
+                              cur_height, cur_width, 0);
+    }
+  }
+}
+
+static INLINE void compute_flow_at_point(unsigned char *frm, unsigned char *ref,
+                                         double *dx, double *dy, int x, int y,
+                                         int width, int height, int stride,
+                                         double *u, double *v) {
+  double M[4] = { 0 };
+  double b[2] = { 0 };
+  double tmp_output_vec[2] = { 0 };
+  double error = 0;
+  int16_t dt[PATCH_SIZE * PATCH_SIZE];
+  double o_u = *u;
+  double o_v = *v;
+
+  for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+    error = compute_warp_and_error(ref, frm, width, height, stride, x, y, *u,
+                                   *v, dt);
+    if (error <= DISFLOW_ERROR_TR) break;
+    compute_flow_system(dx, stride, dy, stride, dt, PATCH_SIZE, M, b);
+    solve_2x2_system(M, b, tmp_output_vec);
+    *u += tmp_output_vec[0];
+    *v += tmp_output_vec[1];
+  }
+  // Reject the solve if either flow component drifted by more than a patch
+  // width from its starting value.
+  if (fabs(*u - o_u) > PATCH_SIZE || fabs(*v - o_v) > PATCH_SIZE) {
+    *u = o_u;
+    *v = o_v;
+  }
+}
+
+// Note: flow_u and flow_v must be zero-initialized by the caller.
+static void compute_flow_field(ImagePyramid *frm_pyr, ImagePyramid *ref_pyr,
+                               double *flow_u, double *flow_v) {
+  int cur_width, cur_height, cur_stride, cur_loc, patch_loc, patch_center;
+  double *u_upscale =
+      aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+  double *v_upscale =
+      aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+
+  assert(frm_pyr->n_levels == ref_pyr->n_levels);
+
+  // Compute flow field from coarsest to finest level of the pyramid
+  for (int level = frm_pyr->n_levels - 1; level >= 0; --level) {
+    cur_width = frm_pyr->widths[level];
+    cur_height = frm_pyr->heights[level];
+    cur_stride = frm_pyr->strides[level];
+    cur_loc = frm_pyr->level_loc[level];
+
+    for (int i = PATCH_SIZE; i < cur_height - PATCH_SIZE; i += PATCH_STEP) {
+      for (int j = PATCH_SIZE; j < cur_width - PATCH_SIZE; j += PATCH_STEP) {
+        patch_loc = i * cur_stride + j;
+        patch_center = patch_loc + PATCH_CENTER * cur_stride + PATCH_CENTER;
+        compute_flow_at_point(frm_pyr->level_buffer + cur_loc,
+                              ref_pyr->level_buffer + cur_loc,
+
frm_pyr->level_dx_buffer + cur_loc + patch_loc, + frm_pyr->level_dy_buffer + cur_loc + patch_loc, j, + i, cur_width, cur_height, cur_stride, + flow_u + patch_center, flow_v + patch_center); + } + } + // TODO(sarahparker) Replace this with upscale function in resize.c + if (level > 0) { + int h_upscale = frm_pyr->heights[level - 1]; + int w_upscale = frm_pyr->widths[level - 1]; + int s_upscale = frm_pyr->strides[level - 1]; + for (int i = 0; i < h_upscale; ++i) { + for (int j = 0; j < w_upscale; ++j) { + u_upscale[j + i * s_upscale] = + flow_u[(int)(j >> 1) + (int)(i >> 1) * cur_stride]; + v_upscale[j + i * s_upscale] = + flow_v[(int)(j >> 1) + (int)(i >> 1) * cur_stride]; + } + } + memcpy(flow_u, u_upscale, + frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u)); + memcpy(flow_v, v_upscale, + frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v)); + } + } + aom_free(u_upscale); + aom_free(v_upscale); +} + +static int compute_global_motion_disflow_based( + TransformationType type, unsigned char *frm_buffer, int frm_width, + int frm_height, int frm_stride, int *frm_corners, int num_frm_corners, + YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion, + MotionModel *params_by_motion, int num_motions) { + unsigned char *ref_buffer = ref->y_buffer; + const int ref_width = ref->y_width; + const int ref_height = ref->y_height; + const int pad_size = AOMMAX(PATCH_SIZE, MIN_PAD); + int num_correspondences; + double *correspondences; + RansacFuncDouble ransac = av1_get_ransac_double_prec_type(type); + assert(frm_width == ref_width); + assert(frm_height == ref_height); + + // Ensure the number of pyramid levels will work with the frame resolution + const int msb = + frm_width < frm_height ? get_msb(frm_width) : get_msb(frm_height); + const int n_levels = AOMMIN(msb, N_LEVELS); + + if (ref->flags & YV12_FLAG_HIGHBITDEPTH) { + ref_buffer = av1_downconvert_frame(ref, bit_depth); + } + + // TODO(sarahparker) We will want to do the source pyramid computation + // outside of this function so it doesn't get recomputed for every + // reference. We also don't need to compute every pyramid level for the + // reference in advance, since lower levels can be overwritten once their + // flow field is computed and upscaled. I'll add these optimizations + // once the full implementation is working. 
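+  //
+  // In outline, the code below (descriptive summary): (1) builds image and
+  // gradient pyramids for the source frame and an image pyramid for the
+  // reference, (2) runs the iterative patch-based flow solve from the
+  // coarsest level down, upscaling the field between levels, (3) turns the
+  // per-corner flow vectors into point correspondences, and (4) fits the
+  // requested motion model to the correspondences with RANSAC.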
+ // Allocate frm image pyramids + int compute_gradient = 1; + ImagePyramid *frm_pyr = + alloc_pyramid(frm_width, frm_height, pad_size, compute_gradient); + compute_flow_pyramids(frm_buffer, frm_width, frm_height, frm_stride, n_levels, + pad_size, compute_gradient, frm_pyr); + // Allocate ref image pyramids + compute_gradient = 0; + ImagePyramid *ref_pyr = + alloc_pyramid(ref_width, ref_height, pad_size, compute_gradient); + compute_flow_pyramids(ref_buffer, ref_width, ref_height, ref->y_stride, + n_levels, pad_size, compute_gradient, ref_pyr); + + double *flow_u = + aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u)); + double *flow_v = + aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v)); + + memset(flow_u, 0, + frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u)); + memset(flow_v, 0, + frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v)); + + compute_flow_field(frm_pyr, ref_pyr, flow_u, flow_v); + + // find correspondences between the two images using the flow field + correspondences = aom_malloc(num_frm_corners * 4 * sizeof(*correspondences)); + num_correspondences = determine_disflow_correspondence( + frm_corners, num_frm_corners, flow_u, flow_v, frm_width, frm_height, + frm_pyr->strides[0], correspondences); + ransac(correspondences, num_correspondences, num_inliers_by_motion, + params_by_motion, num_motions); + + free_pyramid(frm_pyr); + free_pyramid(ref_pyr); + aom_free(correspondences); + aom_free(flow_u); + aom_free(flow_v); + // Set num_inliers = 0 for motions with too few inliers so they are ignored. + for (int i = 0; i < num_motions; ++i) { + if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) { + num_inliers_by_motion[i] = 0; + } + } + + // Return true if any one of the motions has inliers. + for (int i = 0; i < num_motions; ++i) { + if (num_inliers_by_motion[i] > 0) return 1; + } + return 0; +} + +int av1_compute_global_motion(TransformationType type, + unsigned char *frm_buffer, int frm_width, + int frm_height, int frm_stride, int *frm_corners, + int num_frm_corners, YV12_BUFFER_CONFIG *ref, + int bit_depth, + GlobalMotionEstimationType gm_estimation_type, + int *num_inliers_by_motion, + MotionModel *params_by_motion, int num_motions) { + switch (gm_estimation_type) { + case GLOBAL_MOTION_FEATURE_BASED: + return compute_global_motion_feature_based( + type, frm_buffer, frm_width, frm_height, frm_stride, frm_corners, + num_frm_corners, ref, bit_depth, num_inliers_by_motion, + params_by_motion, num_motions); + case GLOBAL_MOTION_DISFLOW_BASED: + return compute_global_motion_disflow_based( + type, frm_buffer, frm_width, frm_height, frm_stride, frm_corners, + num_frm_corners, ref, bit_depth, num_inliers_by_motion, + params_by_motion, num_motions); + default: assert(0 && "Unknown global motion estimation type"); + } + return 0; +} diff --git a/libs/libaom/src/av1/encoder/global_motion.h b/libs/libaom/src/av1/encoder/global_motion.h new file mode 100644 index 000000000..0a6d0ecac --- /dev/null +++ b/libs/libaom/src/av1/encoder/global_motion.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/mv.h"
+#include "av1/common/warped_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_CORNERS 4096
+#define RANSAC_NUM_MOTIONS 1
+#define GM_REFINEMENT_COUNT 5
+
+typedef enum {
+  GLOBAL_MOTION_FEATURE_BASED,
+  GLOBAL_MOTION_DISFLOW_BASED,
+} GlobalMotionEstimationType;
+
+unsigned char *av1_downconvert_frame(YV12_BUFFER_CONFIG *frm, int bit_depth);
+
+typedef struct {
+  double params[MAX_PARAMDIM - 1];
+  int *inliers;
+  int num_inliers;
+} MotionModel;
+
+void av1_convert_model_to_params(const double *params,
+                                 WarpedMotionParams *model);
+
+// TODO(sarahparker) These need to be retuned for speed 0 and 1 to
+// maximize gains from segmented error metric
+static const double erroradv_tr[] = { 0.65, 0.60, 0.65 };
+static const double erroradv_prod_tr[] = { 20000, 18000, 16000 };
+
+int av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost,
+                                 int erroradv_type);
+
+void av1_compute_feature_segmentation_map(uint8_t *segment_map, int width,
+                                          int height, int *inliers,
+                                          int num_inliers);
+
+// Returns the error between the result of applying motion 'wm' to the frame
+// described by 'ref' and the frame described by 'dst'.
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+                       const uint8_t *ref, int width, int height, int stride,
+                       uint8_t *dst, int p_col, int p_row, int p_width,
+                       int p_height, int p_stride, int subsampling_x,
+                       int subsampling_y, int64_t best_error,
+                       uint8_t *segment_map, int segment_map_stride);
+
+// Returns the av1_warp_error between "dst" and the result of applying the
+// motion params that result from fine-tuning "wm" to "ref". Note that "wm" is
+// modified in place.
+int64_t av1_refine_integerized_param(
+    WarpedMotionParams *wm, TransformationType wmtype, int use_hbd, int bd,
+    uint8_t *ref, int r_width, int r_height, int r_stride, uint8_t *dst,
+    int d_width, int d_height, int d_stride, int n_refinements,
+    int64_t best_frame_error, uint8_t *segment_map, int segment_map_stride,
+    int64_t erroradv_threshold);
+
+/*
+  Computes "num_motions" candidate global motion parameters between two frames.
+  The array "params_by_motion" should be length 8 * "num_motions". The ordering
+  of each set of parameters is best described by the homography:
+
+        [x'     (m2 m3 m0   [x
+    z .  y'  =   m4 m5 m1 *  y
+         1]      m6 m7 1)    1]
+
+  where m{i} represents the ith value in any given set of parameters.
+
+  "num_inliers" should be length "num_motions", and will be populated with the
+  number of inlier feature points for each motion. Params for which the
+  num_inliers entry is 0 should be ignored by the caller.
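+
+  As a purely illustrative example (not taken from the library), the set
+  { m0, m1, m2, m3, m4, m5, m6, m7 } = { 2, 3, 1, 0, 0, 1, 0, 0 } gives
+  z = m6 * x + m7 * y + 1 = 1, so x' = (m2 * x + m3 * y + m0) / z = x + 2
+  and y' = (m4 * x + m5 * y + m1) / z = y + 3, i.e. a pure translation by
+  (2, 3).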
+*/
+int av1_compute_global_motion(TransformationType type,
+                              unsigned char *frm_buffer, int frm_width,
+                              int frm_height, int frm_stride, int *frm_corners,
+                              int num_frm_corners, YV12_BUFFER_CONFIG *ref,
+                              int bit_depth,
+                              GlobalMotionEstimationType gm_estimation_type,
+                              int *num_inliers_by_motion,
+                              MotionModel *params_by_motion, int num_motions);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // AOM_AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/libs/libaom/src/av1/encoder/gop_structure.c b/libs/libaom/src/av1/encoder/gop_structure.c
new file mode 100644
index 000000000..1ed71a0f9
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/gop_structure.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/av1_common_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+
+// Set parameters for frames between 'start' and 'end' (excluding both
+// endpoints).
+static void set_multi_layer_params(const TWO_PASS *twopass,
+                                   GF_GROUP *const gf_group, RATE_CONTROL *rc,
+                                   FRAME_INFO *frame_info, int start, int end,
+                                   int *cur_frame_idx, int *frame_ind,
+                                   int arf_ind, int layer_depth) {
+  const int num_frames_to_process = end - start - 1;
+  assert(num_frames_to_process >= 0);
+  if (num_frames_to_process == 0) return;
+
+  // Either we are at the last level of the pyramid, or we don't have enough
+  // frames between 'start' and 'end' to create one more level.
+  if (layer_depth > gf_group->max_layer_depth_allowed ||
+      num_frames_to_process < 3) {
+    // Leaf nodes.
+    while (++start < end) {
+      gf_group->update_type[*frame_ind] = LF_UPDATE;
+      gf_group->arf_src_offset[*frame_ind] = 0;
+      ++*cur_frame_idx;
+      gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+      gf_group->frame_disp_idx[*frame_ind] = start;
+      gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
+      gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(
+          twopass, rc, frame_info, start, end - start, 0, NULL, NULL);
+      gf_group->max_layer_depth =
+          AOMMAX(gf_group->max_layer_depth, layer_depth);
+      ++(*frame_ind);
+    }
+  } else {
+    const int m = (start + end) / 2;
+
+    // Internal ARF.
+    gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+    gf_group->arf_src_offset[*frame_ind] = m - start - 1;
+    gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+    gf_group->frame_disp_idx[*frame_ind] = m;
+    gf_group->layer_depth[*frame_ind] = layer_depth;
+
+    // Get the boost factor for intermediate ARF frames.
+    gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(
+        twopass, rc, frame_info, m, end - m, m - start, NULL, NULL);
+    ++(*frame_ind);
+
+    // Frames displayed before this internal ARF.
+    set_multi_layer_params(twopass, gf_group, rc, frame_info, start, m,
+                           cur_frame_idx, frame_ind, 1, layer_depth + 1);
+
+    // Overlay for internal ARF.
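+    // (Descriptive note: this entry is emitted after the subtree of frames
+    // displayed before the internal ARF; it shares display index m with
+    // that ARF and marks the point where the frame is actually shown.)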
+ gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx; + gf_group->frame_disp_idx[*frame_ind] = m; + gf_group->arf_boost[*frame_ind] = 0; + gf_group->layer_depth[*frame_ind] = layer_depth; + ++(*frame_ind); + + // Frames displayed after this internal ARF. + set_multi_layer_params(twopass, gf_group, rc, frame_info, m, end, + cur_frame_idx, frame_ind, arf_ind, layer_depth + 1); + } +} + +static int construct_multi_layer_gf_structure( + AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group, + RATE_CONTROL *rc, FRAME_INFO *const frame_info, int gf_interval, + FRAME_UPDATE_TYPE first_frame_update_type) { + int frame_index = 0; + + // Keyframe / Overlay frame / Golden frame. + assert(gf_interval >= 1); + assert(first_frame_update_type == KF_UPDATE || + first_frame_update_type == OVERLAY_UPDATE || + first_frame_update_type == GF_UPDATE); + + gf_group->update_type[frame_index] = first_frame_update_type; + gf_group->arf_src_offset[frame_index] = 0; + gf_group->cur_frame_idx[frame_index] = 0; + gf_group->layer_depth[frame_index] = + first_frame_update_type == OVERLAY_UPDATE ? MAX_ARF_LAYERS + 1 : 0; + gf_group->max_layer_depth = 0; + ++frame_index; + + // ALTREF. + const int use_altref = gf_group->max_layer_depth_allowed > 0; + if (use_altref) { + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->arf_src_offset[frame_index] = gf_interval - 1; + gf_group->cur_frame_idx[frame_index] = 0; + gf_group->frame_disp_idx[frame_index] = gf_interval; + gf_group->layer_depth[frame_index] = 1; + gf_group->arf_boost[frame_index] = cpi->rc.gfu_boost; + gf_group->max_layer_depth = 1; + ++frame_index; + } + + int cur_frame_index = 0; + // Rest of the frames. + set_multi_layer_params(twopass, gf_group, rc, frame_info, 0, gf_interval, + &cur_frame_index, &frame_index, 0, use_altref + 1); + + // The end frame will be Overlay frame for an ARF GOP; otherwise set it to + // be GF, for consistency, which will be updated in the next GOP. + gf_group->update_type[frame_index] = use_altref ? 
OVERLAY_UPDATE : GF_UPDATE; + gf_group->arf_src_offset[frame_index] = 0; + return frame_index; +} + +#define CHECK_GF_PARAMETER 0 +#if CHECK_GF_PARAMETER +void check_frame_params(GF_GROUP *const gf_group, int gf_interval) { + static const char *update_type_strings[FRAME_UPDATE_TYPES] = { + "KF_UPDATE", "LF_UPDATE", "GF_UPDATE", + "ARF_UPDATE", "OVERLAY_UPDATE", "INTNL_OVERLAY_UPDATE", + "INTNL_ARF_UPDATE" + }; + FILE *fid = fopen("GF_PARAMS.txt", "a"); + + fprintf(fid, "\ngf_interval = {%d}\n", gf_interval); + for (int i = 0; i < gf_group->size; ++i) { + fprintf(fid, "#%2d : %s %d %d %d %d\n", i, + update_type_strings[gf_group->update_type[i]], + gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i], + gf_group->arf_update_idx[i], gf_group->pyramid_level[i]); + } + + fprintf(fid, "number of nodes in each level: \n"); + for (int i = 0; i < gf_group->pyramid_height; ++i) { + fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]); + } + fprintf(fid, "\n"); + fclose(fid); +} +#endif // CHECK_GF_PARAMETER + +#define REF_IDX(ref) ((ref)-LAST_FRAME) + +static INLINE void reset_ref_frame_idx(int *ref_idx, int reset_value) { + for (int i = 0; i < REF_FRAMES; ++i) ref_idx[i] = reset_value; +} + +static INLINE void set_ref_frame_disp_idx(GF_GROUP *const gf_group) { + for (int i = 0; i < gf_group->size; ++i) { + for (int ref = 0; ref < INTER_REFS_PER_FRAME + 1; ++ref) { + int ref_gop_idx = gf_group->ref_frame_gop_idx[i][ref]; + if (ref_gop_idx == -1) { + gf_group->ref_frame_disp_idx[i][ref] = -1; + } else { + gf_group->ref_frame_disp_idx[i][ref] = + gf_group->frame_disp_idx[ref_gop_idx]; + } + } + } +} + +static void set_gop_ref_frame_map(GF_GROUP *const gf_group) { + // Initialize the reference slots as all -1. + for (int frame_idx = 0; frame_idx < gf_group->size; ++frame_idx) + reset_ref_frame_idx(gf_group->ref_frame_gop_idx[frame_idx], -1); + + // Set the map for frames in the current gop + for (int frame_idx = 0; frame_idx < gf_group->size; ++frame_idx) { + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[frame_idx]; + // TODO(yuec): need to figure out how to determine + // (1) whether a KEY_FRAME has show_frame on + // (2) whether a frame with INTNL_OVERLAY_UPDATE type has + // show_existing_frame on + const int show_frame = + update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE; + const int show_existing_frame = + update_type == OVERLAY_UPDATE || update_type == INTNL_OVERLAY_UPDATE; + + int this_ref_map[INTER_REFS_PER_FRAME + 1]; + memcpy(this_ref_map, gf_group->ref_frame_gop_idx[frame_idx], + sizeof(this_ref_map)); + int *next_ref_map = &gf_group->ref_frame_gop_idx[frame_idx + 1][0]; + + switch (update_type) { + case KF_UPDATE: + if (show_frame) { + reset_ref_frame_idx(this_ref_map, frame_idx); + } else { + this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx; + this_ref_map[REF_IDX(EXTREF_FRAME)] = frame_idx; + this_ref_map[REF_IDX(ALTREF2_FRAME)] = frame_idx; + this_ref_map[REF_IDX(GOLDEN_FRAME)] = frame_idx; + this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx; + } + break; + case LF_UPDATE: this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx; break; + case GF_UPDATE: + this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx; + this_ref_map[REF_IDX(GOLDEN_FRAME)] = frame_idx; + break; + case OVERLAY_UPDATE: + this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx; + break; + case ARF_UPDATE: this_ref_map[REF_IDX(ALTREF_FRAME)] = frame_idx; break; + case INTNL_OVERLAY_UPDATE: + if (!show_existing_frame) + this_ref_map[REF_IDX(LAST3_FRAME)] = frame_idx; + break; + case INTNL_ARF_UPDATE: + 
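// Descriptive note: an internal ARF parks its reconstruction in the
+        // EXTREF slot; the rotation toward BWDREF/ALTREF2 for subsequent
+        // frames happens in the second switch below.
+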
this_ref_map[REF_IDX(EXTREF_FRAME)] = frame_idx; + break; + default: assert(0); break; + } + + memcpy(next_ref_map, this_ref_map, sizeof(this_ref_map)); + + switch (update_type) { + case LF_UPDATE: + case GF_UPDATE: + next_ref_map[REF_IDX(LAST3_FRAME)] = this_ref_map[REF_IDX(LAST2_FRAME)]; + next_ref_map[REF_IDX(LAST2_FRAME)] = this_ref_map[REF_IDX(LAST_FRAME)]; + next_ref_map[REF_IDX(LAST_FRAME)] = this_ref_map[REF_IDX(LAST3_FRAME)]; + break; + case INTNL_OVERLAY_UPDATE: + if (!show_existing_frame) { + next_ref_map[REF_IDX(LAST3_FRAME)] = + this_ref_map[REF_IDX(LAST2_FRAME)]; + next_ref_map[REF_IDX(LAST2_FRAME)] = + this_ref_map[REF_IDX(LAST_FRAME)]; + next_ref_map[REF_IDX(LAST_FRAME)] = + this_ref_map[REF_IDX(LAST3_FRAME)]; + } else { + next_ref_map[REF_IDX(LAST_FRAME)] = + this_ref_map[REF_IDX(BWDREF_FRAME)]; + next_ref_map[REF_IDX(LAST2_FRAME)] = + this_ref_map[REF_IDX(LAST_FRAME)]; + next_ref_map[REF_IDX(LAST3_FRAME)] = + this_ref_map[REF_IDX(LAST2_FRAME)]; + next_ref_map[REF_IDX(BWDREF_FRAME)] = + this_ref_map[REF_IDX(ALTREF2_FRAME)]; + next_ref_map[REF_IDX(ALTREF2_FRAME)] = + this_ref_map[REF_IDX(EXTREF_FRAME)]; + next_ref_map[REF_IDX(EXTREF_FRAME)] = + this_ref_map[REF_IDX(LAST3_FRAME)]; + } + break; + case INTNL_ARF_UPDATE: + if (!show_existing_frame) { + next_ref_map[REF_IDX(BWDREF_FRAME)] = + this_ref_map[REF_IDX(EXTREF_FRAME)]; + next_ref_map[REF_IDX(ALTREF2_FRAME)] = + this_ref_map[REF_IDX(BWDREF_FRAME)]; + next_ref_map[REF_IDX(EXTREF_FRAME)] = + this_ref_map[REF_IDX(ALTREF2_FRAME)]; + } + break; + case OVERLAY_UPDATE: + next_ref_map[REF_IDX(ALTREF_FRAME)] = + this_ref_map[REF_IDX(GOLDEN_FRAME)]; + next_ref_map[REF_IDX(GOLDEN_FRAME)] = + this_ref_map[REF_IDX(ALTREF_FRAME)]; + break; + default: break; + } + } + + // Set the map in display order index by converting from gop indices in the + // above map + set_ref_frame_disp_idx(gf_group); +} + +void av1_gop_setup_structure(AV1_COMP *cpi, + const EncodeFrameParams *const frame_params) { + RATE_CONTROL *const rc = &cpi->rc; + GF_GROUP *const gf_group = &cpi->gf_group; + TWO_PASS *const twopass = &cpi->twopass; + FRAME_INFO *const frame_info = &cpi->frame_info; + const int key_frame = (frame_params->frame_type == KEY_FRAME); + const FRAME_UPDATE_TYPE first_frame_update_type = + key_frame ? KF_UPDATE + : rc->source_alt_ref_active ? OVERLAY_UPDATE : GF_UPDATE; + gf_group->size = construct_multi_layer_gf_structure( + cpi, twopass, gf_group, rc, frame_info, rc->baseline_gf_interval, + first_frame_update_type); + + set_gop_ref_frame_map(gf_group); + +#if CHECK_GF_PARAMETER + check_frame_params(gf_group, rc->baseline_gf_interval); +#endif +} diff --git a/libs/libaom/src/av1/encoder/gop_structure.h b/libs/libaom/src/av1/encoder/gop_structure.h new file mode 100644 index 000000000..0c775c7b4 --- /dev/null +++ b/libs/libaom/src/av1/encoder/gop_structure.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_GOP_STRUCTURE_H_ +#define AOM_AV1_ENCODER_GOP_STRUCTURE_H_ + +#include "av1/common/av1_common_int.h" +#include "av1/encoder/ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct EncodeFrameParams; + +#define MIN_ARF_GF_BOOST 240 +#define NORMAL_BOOST 100 + +// Set up the Group-Of-Pictures structure for this GF_GROUP. This involves +// deciding where to place the various FRAME_UPDATE_TYPEs in the group. It does +// this primarily by setting the contents of +// cpi->twopass.gf_group.update_type[]. +void av1_gop_setup_structure( + struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params); + +int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc, + FRAME_INFO *frame_info, int offset, int f_frames, + int b_frames, int *num_fpstats_used, + int *num_fpstats_required); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_GOP_STRUCTURE_H_ diff --git a/libs/libaom/src/av1/encoder/grain_test_vectors.h b/libs/libaom/src/av1/encoder/grain_test_vectors.h new file mode 100644 index 000000000..945dc3733 --- /dev/null +++ b/libs/libaom/src/av1/encoder/grain_test_vectors.h @@ -0,0 +1,781 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ +#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ + +/* Test vectors for emulation of different film grain types. + * Note that bit depth would be derived from the bitstream and + * not signaled in film grain metadata. The parameters are valid + * for any bit depth. 
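+ *
+ * Each entry below is an aom_film_grain_t initializer; the inline
+ * comments name the fields (scaling points per plane, AR coefficients,
+ * chroma multipliers/offsets, overlap and clipping flags, random seed).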
+ */ +static aom_film_grain_t film_grain_test_vectors[16] = { + /* Test 1 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 16, 0 }, + { 25, 136 }, + { 33, 144 }, + { 41, 160 }, + { 48, 168 }, + { 56, 136 }, + { 67, 128 }, + { 82, 144 }, + { 97, 152 }, + { 113, 144 }, + { 128, 176 }, + { 143, 168 }, + { 158, 176 }, + { 178, 184 } }, + 14 /* num_points_y */, + { { 16, 0 }, + { 20, 64 }, + { 28, 88 }, + { 60, 104 }, + { 90, 136 }, + { 105, 160 }, + { 134, 168 }, + { 168, 208 } }, + 8 /* num_cb_points */, + { { 16, 0 }, + { 28, 96 }, + { 56, 80 }, + { 66, 96 }, + { 80, 104 }, + { 108, 96 }, + { 122, 112 }, + { 137, 112 }, + { 169, 176 } }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, + { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, + { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, + 8 /* ar_coeff_shift */, + 247 /* cb_mult */, + 192 /* cb_luma_mult */, + 18 /* cb_offset */, + 229 /* cr_mult */, + 192 /* cr_luma_mult */, + 54 /* cr_offset */, + 0 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /* chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 2 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 2 /* num_points_y */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_cb_points */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 3 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 192 }, { 255, 192 } }, + 2 /* num_points_y */, + { { 0, 128 }, { 255, 128 } }, + 2 /* num_cb_points */, + { { 0, 128 }, { 255, 128 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, + -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, + }, + { + 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19, + -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 1 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 4 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 24, 137 }, + { 53, 146 }, + { 63, 155 }, + { 78, 155 }, + { 107, 150 }, + { 122, 147 }, + { 136, 147 }, + { 166, 153 }, + }, + 9 /* num_points_y */, + { + { 16, 0 }, + { 20, 72 }, + { 27, 82 }, + { 33, 91 }, + { 69, 121 }, + { 95, 143 }, + { 108, 154 }, + { 134, 169 }, + { 147, 177 }, + }, + 9 /* num_cb_points */, + { + { 16, 0 }, + { 24, 95 }, + { 54, 93 
}, + { 65, 94 }, + { 79, 98 }, + { 109, 107 }, + { 124, 119 }, + { 139, 136 }, + { 169, 170 }, + }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, + 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, + }, + { + -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, + -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, + }, + { + 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, + -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, + }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 5 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 64 }, { 255, 64 } }, + 2 /* num_points_y */, + { + { 0, 96 }, + { 32, 90 }, + { 64, 83 }, + { 96, 76 }, + { 128, 68 }, + { 159, 59 }, + { 191, 48 }, + { 223, 34 }, + { 255, 0 }, + }, + 9 /* num_cb_points */, + { + { 0, 0 }, + { 32, 34 }, + { 64, 48 }, + { 96, 59 }, + { 128, 68 }, + { 159, 76 }, + { 191, 83 }, + { 223, 90 }, + { 255, 96 }, + }, + 9 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + -2, 2, -5, 7, -6, 4, -2, -1, 1, -2, 0, -2, 2, + -3, -5, 13, -13, 6, -14, 8, -1, 18, -36, 58, 0, + }, + { + -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2, + 0, 1, 0, -7, 50, -8, -2, 2, 2, 2, -4, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 1063 /* random_seed */ + }, + /* Test 6 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 96 }, + { 20, 92 }, + { 39, 88 }, + { 59, 84 }, + { 78, 80 }, + { 98, 75 }, + { 118, 70 }, + { 137, 65 }, + { 157, 60 }, + { 177, 53 }, + { 196, 46 }, + { 216, 38 }, + { 235, 27 }, + { 255, 0 }, + }, + 14 /* num_points_y */, + { { 0, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 } }, + 0 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 2754 /* random_seed */ + }, + /* Test 7 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 0 }, + { 20, 27 }, + { 39, 38 }, + { 59, 46 }, + { 78, 53 }, + { 98, 60 }, + { 118, 65 }, + { 137, 70 }, + { 157, 75 }, + { 177, 80 }, + { 196, 84 }, + { 216, 88 }, + { 235, 92 }, + { 255, 96 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 2 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, 
-3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 8 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 2 /* num_points_y */, + { { 0, 62 }, { 255, 62 } }, + 2 /* num_cb_points */, + { { 0, 62 }, { 255, 62 } }, + 2 /* num_cr_points */, + 11 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, + -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, + }, + { + 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, + -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 9 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 48 }, { 255, 48 } }, + 2 /* num_points_y */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cb_points */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 10 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 48 }, { 255, 48 } }, + 2 /* num_points_y */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cb_points */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, + { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 11 */ + { + 1 /* apply_grain */, + 0 /* update_parameters */, + { { 0, 32 }, { 255, 32 } }, + 2 /* num_points_y */, + { + { 0, 48 }, + { 32, 45 }, + { 64, 42 }, + { 96, 38 }, + { 128, 34 }, + { 159, 29 }, + { 191, 24 }, + { 223, 17 }, + { 255, 0 }, + }, + 9 /* num_cb_points */, + { + { 0, 0 }, + { 32, 17 }, + { 64, 24 }, + { 96, 29 }, + { 128, 34 }, + { 159, 38 }, + { 191, 42 }, + { 223, 45 }, + { 255, 48 }, + }, + 
9 /* num_cr_points */, + 10 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42, + 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113, + }, + { + -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5, + -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0, + }, + { + 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2, + -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0, + }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 1357 /* random_seed */ + }, + /* Test 12 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 24, 49 }, + { 39, 69 }, + { 46, 84 }, + { 53, 91 }, + { 63, 100 }, + { 78, 114 }, + { 92, 134 }, + { 164, 139 }, + }, + 9 /* num_points_y */, + { + { 16, 0 }, + { 20, 31 }, + { 26, 42 }, + { 33, 54 }, + { 40, 65 }, + { 47, 72 }, + { 56, 85 }, + { 84, 123 }, + { 152, 157 }, + }, + 9 /* num_cb_points */, + { + { 16, 0 }, + { 25, 14 }, + { 39, 33 }, + { 47, 40 }, + { 54, 47 }, + { 64, 62 }, + { 79, 76 }, + { 94, 83 }, + { 167, 101 }, + }, + 9 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 }, + { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 }, + { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 0 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 13 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 48 }, + { 20, 46 }, + { 39, 44 }, + { 59, 42 }, + { 78, 40 }, + { 98, 38 }, + { 118, 35 }, + { 137, 33 }, + { 157, 30 }, + { 177, 27 }, + { 196, 23 }, + { 216, 19 }, + { 235, 13 }, + { 255, 0 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 14 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 0, 0 }, + { 20, 13 }, + { 39, 19 }, + { 59, 23 }, + { 78, 27 }, + { 98, 30 }, + { 118, 33 }, + { 137, 35 }, + { 157, 38 }, + { 177, 40 }, + { 196, 42 }, + { 216, 44 }, + { 235, 46 }, + { 255, 48 }, + }, + 14 /* num_points_y */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cb_points */, + { { 0, 0 }, { 255, 0 } }, + 0 /* num_cr_points */, + 10 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + 8 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 
/* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 1 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 15 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { { 0, 96 }, { 255, 96 } }, + 1 /* num_points_y */, + { { 0, 96 }, { 255, 96 } }, + 0 /* num_cb_points */, + { { 0, 96 }, { 255, 96 } }, + 0 /* num_cr_points */, + 11 /* scaling_shift */, + 2 /* ar_coeff_lag */, + { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 }, + { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 }, + { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 1 /*chroma_scaling_from_luma*/, + 0 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, + /* Test 16 */ + { + 1 /* apply_grain */, + 1 /* update_parameters */, + { + { 16, 0 }, + { 58, 126 }, + { 87, 120 }, + { 97, 122 }, + { 112, 125 }, + { 126, 131 }, + { 141, 139 }, + { 199, 153 }, + }, + 8 /* num_points_y */, + { + { 16, 0 }, + { 59, 68 }, + { 66, 76 }, + { 73, 82 }, + { 79, 85 }, + { 86, 86 }, + { 151, 95 }, + { 192, 101 }, + }, + 8 /* num_cb_points */, + { + { 16, 0 }, + { 59, 64 }, + { 89, 80 }, + { 99, 86 }, + { 114, 90 }, + { 129, 93 }, + { 144, 97 }, + { 203, 85 }, + }, + 8 /* num_cr_points */, + 10 /* scaling_shift */, + 3 /* ar_coeff_lag */, + { + 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25, + 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66, + }, + { + 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6, + -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69, + }, + { + 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8, + -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55, + }, + 7 /* ar_coeff_shift */, + 128 /* cb_mult */, + 192 /* cb_luma_mult */, + 256 /* cb_offset */, + 128 /* cr_mult */, + 192 /* cr_luma_mult */, + 256 /* cr_offset */, + 1 /* overlap_flag */, + 0 /* clip_to_restricted_range */, + 8 /* bit_depth */, + 0 /*chroma_scaling_from_luma*/, + 2 /* grain_scale_shift*/, + 45231 /* random_seed */ + }, +}; +#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_ diff --git a/libs/libaom/src/av1/encoder/hash.c b/libs/libaom/src/av1/encoder/hash.c new file mode 100644 index 000000000..3091037eb --- /dev/null +++ b/libs/libaom/src/av1/encoder/hash.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/encoder/hash.h" + +static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator, + uint8_t *pData, uint32_t dataLength) { + for (uint32_t i = 0; i < dataLength; i++) { + const uint8_t index = (uint8_t)( + (p_crc_calculator->remainder >> (p_crc_calculator->bits - 8)) ^ + pData[i]); + p_crc_calculator->remainder <<= 8; + p_crc_calculator->remainder ^= p_crc_calculator->table[index]; + } +} + +static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) { + p_crc_calculator->remainder = 0; +} + +static uint32_t crc_calculator_get_crc(CRC_CALCULATOR *p_crc_calculator) { + return p_crc_calculator->remainder & p_crc_calculator->final_result_mask; +} + +static void crc_calculator_init_table(CRC_CALCULATOR *p_crc_calculator) { + const uint32_t high_bit = 1 << (p_crc_calculator->bits - 1); + const uint32_t byte_high_bit = 1 << (8 - 1); + + for (uint32_t value = 0; value < 256; value++) { + uint32_t remainder = 0; + for (uint8_t mask = byte_high_bit; mask != 0; mask >>= 1) { + if (value & mask) { + remainder ^= high_bit; + } + + if (remainder & high_bit) { + remainder <<= 1; + remainder ^= p_crc_calculator->trunc_poly; + } else { + remainder <<= 1; + } + } + p_crc_calculator->table[value] = remainder; + } +} + +void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, + uint32_t truncPoly) { + p_crc_calculator->remainder = 0; + p_crc_calculator->bits = bits; + p_crc_calculator->trunc_poly = truncPoly; + p_crc_calculator->final_result_mask = (1 << bits) - 1; + crc_calculator_init_table(p_crc_calculator); +} + +uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, + int length) { + crc_calculator_reset(p_crc_calculator); + crc_calculator_process_data(p_crc_calculator, p, length); + return crc_calculator_get_crc(p_crc_calculator); +} + +/* CRC-32C (iSCSI) polynomial in reversed bit order. */ +#define POLY 0x82f63b78 + +/* Construct table for software CRC-32C calculation. */ +void av1_crc32c_calculator_init(CRC32C *p_crc32c) { + uint32_t crc; + + for (int n = 0; n < 256; n++) { + crc = n; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + p_crc32c->table[0][n] = crc; + } + for (int n = 0; n < 256; n++) { + crc = p_crc32c->table[0][n]; + for (int k = 1; k < 8; k++) { + crc = p_crc32c->table[0][crc & 0xff] ^ (crc >> 8); + p_crc32c->table[k][n] = crc; + } + } +} + +/* Table-driven software version as a fall-back. This is about 15 times slower + than using the hardware instructions. This assumes little-endian integers, + as is the case on Intel processors that the assembler code here is for. 
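For reference, a minimal standalone sketch of driving the table-driven CRC defined above; the 24-bit width and the 0x5D6DCB polynomial mirror what av1_hash_table_init() in hash_motion.c passes in, and the pixel buffer is arbitrary illustration data:

#include <stdint.h>
#include <stdio.h>

#include "av1/encoder/hash.h"

int main(void) {
  CRC_CALCULATOR calc;
  // 24-bit CRC using the first of the two intra-block-copy polynomials.
  av1_crc_calculator_init(&calc, 24, 0x5D6DCB);
  uint8_t block[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
  // av1_get_crc_value() resets the remainder, feeds the bytes through the
  // lookup table, and masks the result to 24 bits via final_result_mask.
  const uint32_t crc = av1_get_crc_value(&calc, block, (int)sizeof(block));
  printf("crc24 = 0x%06x\n", crc);
  return 0;
}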
*/ +uint32_t av1_get_crc32c_value_c(void *c, uint8_t *buf, size_t len) { + const uint8_t *next = (const uint8_t *)(buf); + uint64_t crc; + CRC32C *p = (CRC32C *)c; + crc = 0 ^ 0xffffffff; + while (len && ((uintptr_t)next & 7) != 0) { + crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + while (len >= 8) { + crc ^= *(uint64_t *)next; + crc = p->table[7][crc & 0xff] ^ p->table[6][(crc >> 8) & 0xff] ^ + p->table[5][(crc >> 16) & 0xff] ^ p->table[4][(crc >> 24) & 0xff] ^ + p->table[3][(crc >> 32) & 0xff] ^ p->table[2][(crc >> 40) & 0xff] ^ + p->table[1][(crc >> 48) & 0xff] ^ p->table[0][crc >> 56]; + next += 8; + len -= 8; + } + while (len) { + crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + return (uint32_t)crc ^ 0xffffffff; +} diff --git a/libs/libaom/src/av1/encoder/hash.h b/libs/libaom/src/av1/encoder/hash.h new file mode 100644 index 000000000..d8e8cc3a0 --- /dev/null +++ b/libs/libaom/src/av1/encoder/hash.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_HASH_H_ +#define AOM_AV1_ENCODER_HASH_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _crc_calculator { + uint32_t remainder; + uint32_t trunc_poly; + uint32_t bits; + uint32_t table[256]; + uint32_t final_result_mask; +} CRC_CALCULATOR; + +// Initialize the crc calculator. It must be executed at least once before +// calling av1_get_crc_value(). +void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, + uint32_t truncPoly); +uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, + int length); + +// CRC32C: POLY = 0x82f63b78; +typedef struct _CRC32C { + /* Table for a quadword-at-a-time software crc. */ + uint32_t table[8][256]; +} CRC32C; + +// init table for software version crc32c +void av1_crc32c_calculator_init(CRC32C *p_crc32c); + +#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096) + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_HASH_H_ diff --git a/libs/libaom/src/av1/encoder/hash_motion.c b/libs/libaom/src/av1/encoder/hash_motion.c new file mode 100644 index 000000000..310cde886 --- /dev/null +++ b/libs/libaom/src/av1/encoder/hash_motion.c @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
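The software CRC-32C fallback above can be exercised directly; a sketch, with the prototype written out by hand since av1_get_crc32c_value_c() is normally reached through the rtcd dispatch. "123456789" is the conventional CRC check string; its CRC-32C check value is 0xE3069283:

#include <stdint.h>
#include <stdio.h>

#include "av1/encoder/hash.h"

// Normally resolved via config/av1_rtcd.h; declared directly for this sketch.
uint32_t av1_get_crc32c_value_c(void *c, uint8_t *buf, size_t len);

int main(void) {
  CRC32C ctx;
  av1_crc32c_calculator_init(&ctx);  // builds the 8x256 slicing tables
  const uint8_t buf[] = "123456789";
  const uint32_t v = av1_get_crc32c_value_c(&ctx, (uint8_t *)buf, 9);
  printf("crc32c = 0x%08x\n", v);  // expected 0xe3069283 on little-endian
  return 0;
}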
 */ + +#include <assert.h> + +#include "config/av1_rtcd.h" + +#include "av1/encoder/block.h" +#include "av1/encoder/hash.h" +#include "av1/encoder/hash_motion.h" + +#define kSrcBits 16 +#define kBlockSizeBits 3 +#define kMaxAddr (1 << (kSrcBits + kBlockSizeBits)) + +// TODO(youzhou@microsoft.com): is screen content with bit depth higher than 8 +// supported? If yes, fix this function +static void get_pixels_in_1D_char_array_by_block_2x2(const uint8_t *y_src, + int stride, + uint8_t *p_pixels_in1D) { + const uint8_t *p_pel = y_src; + int index = 0; + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + p_pixels_in1D[index++] = p_pel[j]; + } + p_pel += stride; + } +} + +static void get_pixels_in_1D_short_array_by_block_2x2(const uint16_t *y_src, + int stride, + uint16_t *p_pixels_in1D) { + const uint16_t *p_pel = y_src; + int index = 0; + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 2; j++) { + p_pixels_in1D[index++] = p_pel[j]; + } + p_pel += stride; + } +} + +static int is_block_2x2_row_same_value(const uint8_t *p) { + if (p[0] != p[1] || p[2] != p[3]) { + return 0; + } + return 1; +} + +static int is_block16_2x2_row_same_value(const uint16_t *p) { + if (p[0] != p[1] || p[2] != p[3]) { + return 0; + } + return 1; +} + +static int is_block_2x2_col_same_value(const uint8_t *p) { + if ((p[0] != p[2]) || (p[1] != p[3])) { + return 0; + } + return 1; +} + +static int is_block16_2x2_col_same_value(const uint16_t *p) { + if ((p[0] != p[2]) || (p[1] != p[3])) { + return 0; + } + return 1; +} + +// The hash value hash_value1 consists of two parts: the first 3 bits relate +// to the block size and the remaining 16 bits are the CRC value. This +// function is used to get the first 3 bits. +static int hash_block_size_to_index(int block_size) { + switch (block_size) { + case 4: return 0; + case 8: return 1; + case 16: return 2; + case 32: return 3; + case 64: return 4; + case 128: return 5; + default: return -1; + } +} + +void av1_hash_table_init(IntraBCHashInfo *intrabc_hash_info) { + if (!intrabc_hash_info->g_crc_initialized) { + av1_crc_calculator_init(&intrabc_hash_info->crc_calculator1, 24, 0x5D6DCB); + av1_crc_calculator_init(&intrabc_hash_info->crc_calculator2, 24, 0x864CFB); + intrabc_hash_info->g_crc_initialized = 1; + } + intrabc_hash_info->intrabc_hash_table.p_lookup_table = NULL; +} + +void av1_hash_table_clear_all(hash_table *p_hash_table) { + if (p_hash_table->p_lookup_table == NULL) { + return; + } + for (int i = 0; i < kMaxAddr; i++) { + if (p_hash_table->p_lookup_table[i] != NULL) { + aom_vector_destroy(p_hash_table->p_lookup_table[i]); + aom_free(p_hash_table->p_lookup_table[i]); + p_hash_table->p_lookup_table[i] = NULL; + } + } +} + +void av1_hash_table_destroy(hash_table *p_hash_table) { + av1_hash_table_clear_all(p_hash_table); + aom_free(p_hash_table->p_lookup_table); + p_hash_table->p_lookup_table = NULL; +} + +void av1_hash_table_create(hash_table *p_hash_table) { + if (p_hash_table->p_lookup_table != NULL) { + av1_hash_table_clear_all(p_hash_table); + return; + } + p_hash_table->p_lookup_table = + (Vector **)aom_malloc(sizeof(p_hash_table->p_lookup_table[0]) * kMaxAddr); + memset(p_hash_table->p_lookup_table, 0, + sizeof(p_hash_table->p_lookup_table[0]) * kMaxAddr); +} + +static void hash_table_add_to_table(hash_table *p_hash_table, + uint32_t hash_value, + block_hash *curr_block_hash) { + if (p_hash_table->p_lookup_table[hash_value] == NULL) { + p_hash_table->p_lookup_table[hash_value] = + aom_malloc(sizeof(p_hash_table->p_lookup_table[0][0])); +
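+    // Buckets are created lazily: the first time a hash_value lands in this
+    // slot, a Vector sized for 10 entries is set up and later hits simply
+    // append. The kMaxAddr (1 << 19) slots are indexed by hash_value1, i.e.
+    // the 3-bit block-size index concatenated with the low 16 CRC bits.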
aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10, + sizeof(curr_block_hash[0])); + aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], + curr_block_hash); + } else { + aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], + curr_block_hash); + } +} + +int32_t av1_hash_table_count(const hash_table *p_hash_table, + uint32_t hash_value) { + if (p_hash_table->p_lookup_table[hash_value] == NULL) { + return 0; + } else { + return (int32_t)(p_hash_table->p_lookup_table[hash_value]->size); + } +} + +Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, + uint32_t hash_value) { + assert(av1_hash_table_count(p_hash_table, hash_value) > 0); + return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]); +} + +int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, + uint32_t hash_value2) { + if (p_hash_table->p_lookup_table[hash_value1] == NULL) { + return 0; + } + Iterator iterator = + aom_vector_begin(p_hash_table->p_lookup_table[hash_value1]); + Iterator last = aom_vector_end(p_hash_table->p_lookup_table[hash_value1]); + for (; !aom_iterator_equals(&iterator, &last); + aom_iterator_increment(&iterator)) { + if ((*(block_hash *)aom_iterator_get(&iterator)).hash_value2 == + hash_value2) { + return 1; + } + } + return 0; +} + +void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intrabc_hash_info, + const YV12_BUFFER_CONFIG *picture, + uint32_t *pic_block_hash[2], + int8_t *pic_block_same_info[3]) { + const int width = 2; + const int height = 2; + const int x_end = picture->y_crop_width - width + 1; + const int y_end = picture->y_crop_height - height + 1; + CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; + CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; + + const int length = width * 2; + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t p[4]; + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + get_pixels_in_1D_short_array_by_block_2x2( + CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride + + x_pos, + picture->y_stride, p); + pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(p); + pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(p); + + pic_block_hash[0][pos] = + av1_get_crc_value(calc_1, (uint8_t *)p, length * sizeof(p[0])); + pic_block_hash[1][pos] = + av1_get_crc_value(calc_2, (uint8_t *)p, length * sizeof(p[0])); + pos++; + } + pos += width - 1; + } + } else { + uint8_t p[4]; + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + get_pixels_in_1D_char_array_by_block_2x2( + picture->y_buffer + y_pos * picture->y_stride + x_pos, + picture->y_stride, p); + pic_block_same_info[0][pos] = is_block_2x2_row_same_value(p); + pic_block_same_info[1][pos] = is_block_2x2_col_same_value(p); + + pic_block_hash[0][pos] = + av1_get_crc_value(calc_1, p, length * sizeof(p[0])); + pic_block_hash[1][pos] = + av1_get_crc_value(calc_2, p, length * sizeof(p[0])); + pos++; + } + pos += width - 1; + } + } +} + +void av1_generate_block_hash_value(IntraBCHashInfo *intrabc_hash_info, + const YV12_BUFFER_CONFIG *picture, + int block_size, + uint32_t *src_pic_block_hash[2], + uint32_t *dst_pic_block_hash[2], + int8_t *src_pic_block_same_info[3], + int8_t *dst_pic_block_same_info[3]) { + CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; + CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; + + const int pic_width = picture->y_crop_width; + const int 
x_end = picture->y_crop_width - block_size + 1; + const int y_end = picture->y_crop_height - block_size + 1; + + const int src_size = block_size >> 1; + const int quad_size = block_size >> 2; + + uint32_t p[4]; + const int length = sizeof(p); + + int pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + p[0] = src_pic_block_hash[0][pos]; + p[1] = src_pic_block_hash[0][pos + src_size]; + p[2] = src_pic_block_hash[0][pos + src_size * pic_width]; + p[3] = src_pic_block_hash[0][pos + src_size * pic_width + src_size]; + dst_pic_block_hash[0][pos] = + av1_get_crc_value(calc_1, (uint8_t *)p, length); + + p[0] = src_pic_block_hash[1][pos]; + p[1] = src_pic_block_hash[1][pos + src_size]; + p[2] = src_pic_block_hash[1][pos + src_size * pic_width]; + p[3] = src_pic_block_hash[1][pos + src_size * pic_width + src_size]; + dst_pic_block_hash[1][pos] = + av1_get_crc_value(calc_2, (uint8_t *)p, length); + + dst_pic_block_same_info[0][pos] = + src_pic_block_same_info[0][pos] && + src_pic_block_same_info[0][pos + quad_size] && + src_pic_block_same_info[0][pos + src_size] && + src_pic_block_same_info[0][pos + src_size * pic_width] && + src_pic_block_same_info[0][pos + src_size * pic_width + quad_size] && + src_pic_block_same_info[0][pos + src_size * pic_width + src_size]; + + dst_pic_block_same_info[1][pos] = + src_pic_block_same_info[1][pos] && + src_pic_block_same_info[1][pos + src_size] && + src_pic_block_same_info[1][pos + quad_size * pic_width] && + src_pic_block_same_info[1][pos + quad_size * pic_width + src_size] && + src_pic_block_same_info[1][pos + src_size * pic_width] && + src_pic_block_same_info[1][pos + src_size * pic_width + src_size]; + pos++; + } + pos += block_size - 1; + } + + if (block_size >= 4) { + const int size_minus_1 = block_size - 1; + pos = 0; + for (int y_pos = 0; y_pos < y_end; y_pos++) { + for (int x_pos = 0; x_pos < x_end; x_pos++) { + dst_pic_block_same_info[2][pos] = + (!dst_pic_block_same_info[0][pos] && + !dst_pic_block_same_info[1][pos]) || + (((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0)); + pos++; + } + pos += block_size - 1; + } + } +} + +void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table, + uint32_t *pic_hash[2], + int8_t *pic_is_same, + int pic_width, int pic_height, + int block_size) { + const int x_end = pic_width - block_size + 1; + const int y_end = pic_height - block_size + 1; + + const int8_t *src_is_added = pic_is_same; + const uint32_t *src_hash[2] = { pic_hash[0], pic_hash[1] }; + + int add_value = hash_block_size_to_index(block_size); + assert(add_value >= 0); + add_value <<= kSrcBits; + const int crc_mask = (1 << kSrcBits) - 1; + + for (int x_pos = 0; x_pos < x_end; x_pos++) { + for (int y_pos = 0; y_pos < y_end; y_pos++) { + const int pos = y_pos * pic_width + x_pos; + // valid data + if (src_is_added[pos]) { + block_hash curr_block_hash; + curr_block_hash.x = x_pos; + curr_block_hash.y = y_pos; + + const uint32_t hash_value1 = (src_hash[0][pos] & crc_mask) + add_value; + curr_block_hash.hash_value2 = src_hash[1][pos]; + + hash_table_add_to_table(p_hash_table, hash_value1, &curr_block_hash); + } + } + } +} + +int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start) { + const int stride = picture->y_stride; + const uint8_t *p = picture->y_buffer + y_start * stride + x_start; + + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); + for (int i = 0; i < block_size; i++) 
{ + for (int j = 1; j < block_size; j++) { + if (p16[j] != p16[0]) { + return 0; + } + } + p16 += stride; + } + } else { + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p[j] != p[0]) { + return 0; + } + } + p += stride; + } + } + + return 1; +} + +int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start) { + const int stride = picture->y_stride; + const uint8_t *p = picture->y_buffer + y_start * stride + x_start; + + if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { + const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p16[j * stride + i] != p16[i]) { + return 0; + } + } + } + } else { + for (int i = 0; i < block_size; i++) { + for (int j = 1; j < block_size; j++) { + if (p[j * stride + i] != p[i]) { + return 0; + } + } + } + } + return 1; +} + +void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info, + const uint8_t *y_src, int stride, int block_size, + uint32_t *hash_value1, uint32_t *hash_value2, + int use_highbitdepth) { + int add_value = hash_block_size_to_index(block_size); + assert(add_value >= 0); + add_value <<= kSrcBits; + const int crc_mask = (1 << kSrcBits) - 1; + + CRC_CALCULATOR *calc_1 = &intrabc_hash_info->crc_calculator1; + CRC_CALCULATOR *calc_2 = &intrabc_hash_info->crc_calculator2; + uint32_t **buf_1 = intrabc_hash_info->hash_value_buffer[0]; + uint32_t **buf_2 = intrabc_hash_info->hash_value_buffer[1]; + + // 2x2 subblock hash values in current CU + int sub_block_in_width = (block_size >> 1); + if (use_highbitdepth) { + uint16_t pixel_to_hash[4]; + uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src); + for (int y_pos = 0; y_pos < block_size; y_pos += 2) { + for (int x_pos = 0; x_pos < block_size; x_pos += 2) { + int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); + get_pixels_in_1D_short_array_by_block_2x2( + y16_src + y_pos * stride + x_pos, stride, pixel_to_hash); + assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + buf_1[0][pos] = av1_get_crc_value(calc_1, (uint8_t *)pixel_to_hash, + sizeof(pixel_to_hash)); + buf_2[0][pos] = av1_get_crc_value(calc_2, (uint8_t *)pixel_to_hash, + sizeof(pixel_to_hash)); + } + } + } else { + uint8_t pixel_to_hash[4]; + for (int y_pos = 0; y_pos < block_size; y_pos += 2) { + for (int x_pos = 0; x_pos < block_size; x_pos += 2) { + int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); + get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, + stride, pixel_to_hash); + assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + buf_1[0][pos] = + av1_get_crc_value(calc_1, pixel_to_hash, sizeof(pixel_to_hash)); + buf_2[0][pos] = + av1_get_crc_value(calc_2, pixel_to_hash, sizeof(pixel_to_hash)); + } + } + } + + int src_sub_block_in_width = sub_block_in_width; + sub_block_in_width >>= 1; + + int src_idx = 1; + int dst_idx = 0; + + // 4x4 subblock hash values to current block hash values + uint32_t to_hash[4]; + for (int sub_width = 4; sub_width <= block_size; sub_width *= 2) { + src_idx = 1 - src_idx; + dst_idx = 1 - dst_idx; + + int dst_pos = 0; + for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) { + for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) { + int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1); + + assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + assert(srcPos + src_sub_block_in_width + 1 < + AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); + to_hash[0] = 
buf_1[src_idx][srcPos]; + to_hash[1] = buf_1[src_idx][srcPos + 1]; + to_hash[2] = buf_1[src_idx][srcPos + src_sub_block_in_width]; + to_hash[3] = buf_1[src_idx][srcPos + src_sub_block_in_width + 1]; + + buf_1[dst_idx][dst_pos] = + av1_get_crc_value(calc_1, (uint8_t *)to_hash, sizeof(to_hash)); + + to_hash[0] = buf_2[src_idx][srcPos]; + to_hash[1] = buf_2[src_idx][srcPos + 1]; + to_hash[2] = buf_2[src_idx][srcPos + src_sub_block_in_width]; + to_hash[3] = buf_2[src_idx][srcPos + src_sub_block_in_width + 1]; + buf_2[dst_idx][dst_pos] = + av1_get_crc_value(calc_2, (uint8_t *)to_hash, sizeof(to_hash)); + dst_pos++; + } + } + + src_sub_block_in_width = sub_block_in_width; + sub_block_in_width >>= 1; + } + + *hash_value1 = (buf_1[dst_idx][0] & crc_mask) + add_value; + *hash_value2 = buf_2[dst_idx][0]; +} diff --git a/libs/libaom/src/av1/encoder/hash_motion.h b/libs/libaom/src/av1/encoder/hash_motion.h new file mode 100644 index 000000000..e4ea1f394 --- /dev/null +++ b/libs/libaom/src/av1/encoder/hash_motion.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_ +#define AOM_AV1_ENCODER_HASH_MOTION_H_ + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_scale/yv12config.h" +#include "av1/encoder/hash.h" +#include "third_party/vector/vector.h" +#ifdef __cplusplus +extern "C" { +#endif + +// Block size used for force_integer_mv decisions +#define FORCE_INT_MV_DECISION_BLOCK_SIZE 8 + +// store a block's hash info. 
+// x and y are the position from the top left of the picture +// hash_value2 is used to store the second hash value +typedef struct _block_hash { + int16_t x; + int16_t y; + uint32_t hash_value2; +} block_hash; + +typedef struct _hash_table { + Vector **p_lookup_table; +} hash_table; + +struct intrabc_hash_info; + +typedef struct intrabc_hash_info { + // buffer for hash value calculation of a block + // used only in av1_get_block_hash_value() + // [first hash/second hash] + // [two buffers used ping-pong] + uint32_t *hash_value_buffer[2][2]; + hash_table intrabc_hash_table; + + CRC_CALCULATOR crc_calculator1; + CRC_CALCULATOR crc_calculator2; + int g_crc_initialized; +} IntraBCHashInfo; + +void av1_hash_table_init(IntraBCHashInfo *intra_bc_hash_info); +void av1_hash_table_clear_all(hash_table *p_hash_table); +void av1_hash_table_destroy(hash_table *p_hash_table); +void av1_hash_table_create(hash_table *p_hash_table); +int32_t av1_hash_table_count(const hash_table *p_hash_table, + uint32_t hash_value); +Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, + uint32_t hash_value); +int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, + uint32_t hash_value2); +void av1_generate_block_2x2_hash_value(IntraBCHashInfo *intra_bc_hash_info, + const YV12_BUFFER_CONFIG *picture, + uint32_t *pic_block_hash[2], + int8_t *pic_block_same_info[3]); +void av1_generate_block_hash_value(IntraBCHashInfo *intra_bc_hash_info, + const YV12_BUFFER_CONFIG *picture, + int block_size, + uint32_t *src_pic_block_hash[2], + uint32_t *dst_pic_block_hash[2], + int8_t *src_pic_block_same_info[3], + int8_t *dst_pic_block_same_info[3]); +void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table, + uint32_t *pic_hash[2], + int8_t *pic_is_same, + int pic_width, int pic_height, + int block_size); + +// check whether the block starting at (x_start, y_start) with size +// block_size x block_size has the same color in all rows +int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start); +// check whether the block starting at (x_start, y_start) with size +// block_size x block_size has the same color in all columns +int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, + int block_size, int x_start, int y_start); + +void av1_get_block_hash_value(IntraBCHashInfo *intrabc_hash_info, + const uint8_t *y_src, int stride, int block_size, + uint32_t *hash_value1, uint32_t *hash_value2, + int use_highbitdepth); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_HASH_MOTION_H_ diff --git a/libs/libaom/src/av1/encoder/hybrid_fwd_txfm.c b/libs/libaom/src/av1/encoder/hybrid_fwd_txfm.c new file mode 100644 index 000000000..06990857a --- /dev/null +++ b/libs/libaom/src/av1/encoder/hybrid_fwd_txfm.c @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
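Taken together, the header implies the following call order; a rough sketch only (the encoder additionally allocates the hash_value_buffer arrays and feeds whole frames through av1_generate_block_hash_value(), both elided here):

#include "av1/encoder/hash_motion.h"

// 'info' is assumed zero-initialized; y_src/stride point at valid luma pixels.
static void intrabc_hash_sketch(IntraBCHashInfo *info, const uint8_t *y_src,
                                int stride) {
  av1_hash_table_init(info);  // seeds the two 24-bit CRC calculators once
  av1_hash_table_create(&info->intrabc_hash_table);  // allocates the buckets
  uint32_t h1, h2;
  // Hash one 8x8 candidate block (low bit depth path); requires
  // info->hash_value_buffer to have been allocated beforehand.
  av1_get_block_hash_value(info, y_src, stride, 8, &h1, &h2,
                           /*use_highbitdepth=*/0);
  if (av1_has_exact_match(&info->intrabc_hash_table, h1, h2)) {
    // A previously inserted block with identical content exists.
  }
  av1_hash_table_destroy(&info->intrabc_hash_table);
}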
+ */ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +#include "av1/common/idct.h" +#include "av1/encoder/hybrid_fwd_txfm.h" + +/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per + pixel. */ +void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { + int i; + tran_high_t a1, b1, c1, d1, e1; + const int16_t *ip_pass0 = input; + const tran_low_t *ip = NULL; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip_pass0[0 * stride]; + b1 = ip_pass0[1 * stride]; + c1 = ip_pass0[2 * stride]; + d1 = ip_pass0[3 * stride]; + + a1 += b1; + d1 = d1 - c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)a1; + op[4] = (tran_low_t)c1; + op[8] = (tran_low_t)d1; + op[12] = (tran_low_t)b1; + + ip_pass0++; + op++; + } + ip = output; + op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0]; + b1 = ip[1]; + c1 = ip[2]; + d1 = ip[3]; + + a1 += b1; + d1 -= c1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= c1; + d1 += b1; + op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR); + op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR); + op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR); + op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR); + + ip += 4; + op += 4; + } +} + +void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, + int stride) { + av1_fwht4x4_c(input, output, stride); +} + +static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + if (txfm_param->lossless) { + assert(tx_type == DCT_DCT); + av1_highbd_fwht4x4(src_diff, coeff, diff_stride); + return; + } + av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_4x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_8x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_16x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_32x16(src_diff, dst_coeff, diff_stride, 
txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_16x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_32x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + av1_fwd_txfm2d_8x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} + +static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + int32_t *dst_coeff = (int32_t *)coeff; + const TX_TYPE tx_type = txfm_param->tx_type; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_32x32(src_diff, dst_coeff, diff_stride, tx_type, bd); +} + +static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_32x64(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + bd); +} + +static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x32(src_diff, dst_coeff, diff_stride, txfm_param->tx_type, + bd); +} + +static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_16x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x16(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(txfm_param->tx_type == DCT_DCT); + int32_t *dst_coeff = (int32_t *)coeff; + 
const int bd = txfm_param->bd; + av1_fwd_txfm2d_64x64(src_diff, dst_coeff, diff_stride, DCT_DCT, bd); +} + +void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, + TxfmParam *txfm_param) { + if (txfm_param->bd == 8) + av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); + else + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); +} + +void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param); +} + +void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); + const TX_SIZE tx_size = txfm_param->tx_size; + switch (tx_size) { + case TX_64X64: + highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X64: + highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_64X32: + highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X64: + highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_64X16: + highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X32: + highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X16: + highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X8: + highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_4X8: + highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X4: + highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X16: + highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X8: + highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X32: + highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X16: + highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_4X4: + highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_4X16: + highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_16X4: + highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_8X32: + highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param); + break; + case TX_32X8: + highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param); + break; + default: assert(0); break; + } +} diff --git a/libs/libaom/src/av1/encoder/hybrid_fwd_txfm.h b/libs/libaom/src/av1/encoder/hybrid_fwd_txfm.h new file mode 100644 index 000000000..daabc7119 --- /dev/null +++ b/libs/libaom/src/av1/encoder/hybrid_fwd_txfm.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
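As a quick sanity check on av1_fwht4x4_c() above: for a constant input block both passes fold all of the energy into the DC term. A standalone sketch (assuming tran_low_t is int32_t, as in libaom's AV1 build, and UNIT_QUANT_FACTOR is 4, i.e. 1 << UNIT_QUANT_SHIFT):

#include <stdint.h>
#include <stdio.h>

typedef int32_t tran_low_t;  // assumed to match libaom's AV1 definition
void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);

int main(void) {
  int16_t in[16];
  tran_low_t out[16];
  for (int i = 0; i < 16; ++i) in[i] = 1;  // constant 4x4 block
  av1_fwht4x4_c(in, out, /*stride=*/4);
  // The column pass yields [2 0 0 0] per column; the row pass then leaves a
  // lone DC of 4 * UNIT_QUANT_FACTOR = 16, with the other 15 coefficients 0.
  printf("DC = %d\n", (int)out[0]);
  return 0;
}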
+ */ + +#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ +#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ + +#include "config/aom_config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride, + TxfmParam *txfm_param); + +void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_ diff --git a/libs/libaom/src/av1/encoder/interp_search.c b/libs/libaom/src/av1/encoder/interp_search.c new file mode 100644 index 000000000..6b7317be7 --- /dev/null +++ b/libs/libaom/src/av1/encoder/interp_search.c @@ -0,0 +1,753 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/pred_common.h" +#include "av1/encoder/interp_search.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/rdopt_utils.h" +#include "av1/encoder/reconinter_enc.h" + +// return mv_diff +static INLINE int is_interp_filter_good_match( + const INTERPOLATION_FILTER_STATS *st, MB_MODE_INFO *const mi, + int skip_level) { + const int is_comp = has_second_ref(mi); + int i; + + for (i = 0; i < 1 + is_comp; ++i) { + if (st->ref_frames[i] != mi->ref_frame[i]) return INT_MAX; + } + + if (skip_level == 1 && is_comp) { + if (st->comp_type != mi->interinter_comp.type) return INT_MAX; + if (st->compound_idx != mi->compound_idx) return INT_MAX; + } + + int mv_diff = 0; + for (i = 0; i < 1 + is_comp; ++i) { + mv_diff += abs(st->mv[i].as_mv.row - mi->mv[i].as_mv.row) + + abs(st->mv[i].as_mv.col - mi->mv[i].as_mv.col); + } + return mv_diff; +} + +static INLINE int save_interp_filter_search_stat( + MB_MODE_INFO *const mbmi, int64_t rd, unsigned int pred_sse, + INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx) { + if (interp_filter_stats_idx < MAX_INTERP_FILTER_STATS) { + INTERPOLATION_FILTER_STATS stat = { mbmi->interp_filters, + { mbmi->mv[0], mbmi->mv[1] }, + { mbmi->ref_frame[0], + mbmi->ref_frame[1] }, + mbmi->interinter_comp.type, + mbmi->compound_idx, + rd, + pred_sse }; + interp_filter_stats[interp_filter_stats_idx] = stat; + interp_filter_stats_idx++; + } + return interp_filter_stats_idx; +} + +static INLINE int find_interp_filter_in_stats( + MB_MODE_INFO *const mbmi, INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx, int skip_level) { + // [skip_levels][single or comp] + const int thr[2][2] = { { 0, 0 }, { 3, 7 } }; + const int is_comp = has_second_ref(mbmi); + + // Find good enough match. + // TODO(yunqing): Separate single-ref mode and comp mode stats for fast + // search. + int best = INT_MAX; + int match = -1; + for (int j = 0; j < interp_filter_stats_idx; ++j) { + const INTERPOLATION_FILTER_STATS *st = &interp_filter_stats[j]; + const int mv_diff = is_interp_filter_good_match(st, mbmi, skip_level); + // Exact match is found. 
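+    // A zero mv_diff means the stored stat was produced with identical
+    // reference frames and MVs, so its filter decision can be reused as is;
+    // otherwise the closest stat within the skip-level threshold is taken.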
+ if (mv_diff == 0) { + match = j; + break; + } else if (mv_diff < best && mv_diff <= thr[skip_level - 1][is_comp]) { + best = mv_diff; + match = j; + } + } + + if (match != -1) { + mbmi->interp_filters = interp_filter_stats[match].filters; + return match; + } + return -1; // no match result found +} + +int av1_find_interp_filter_match( + MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi, + const InterpFilter assign_filter, const int need_search, + INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx) { + int match_found_idx = -1; + if (cpi->sf.interp_sf.use_interp_filter && need_search) + match_found_idx = find_interp_filter_in_stats( + mbmi, interp_filter_stats, interp_filter_stats_idx, + cpi->sf.interp_sf.use_interp_filter); + + if (!need_search || match_found_idx == -1) + set_default_interp_filters(mbmi, assign_filter); + return match_found_idx; +} + +static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2], + int num_planes) { + const BUFFER_SET *buf0 = dst_bufs[0]; + dst_bufs[0] = dst_bufs[1]; + dst_bufs[1] = buf0; + restore_dst_buf(xd, *dst_bufs[0], num_planes); +} + +static INLINE int get_switchable_rate(MACROBLOCK *const x, + const int_interpfilters filters, + const int ctx[2]) { + int inter_filter_cost; + const InterpFilter filter0 = filters.as_filters.y_filter; + const InterpFilter filter1 = filters.as_filters.x_filter; + inter_filter_cost = x->switchable_interp_costs[ctx[0]][filter0]; + inter_filter_cost += x->switchable_interp_costs[ctx[1]][filter1]; + return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; +} + +// Build inter predictor and calculate model rd +// for a given plane. +static INLINE void interp_model_rd_eval( + MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int plane_from, int plane_to, + RD_STATS *rd_stats, int is_skip_build_pred) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + RD_STATS tmp_rd_stats; + av1_init_rd_stats(&tmp_rd_stats); + + // Skip building the inter predictor if it is already available. + if (!is_skip_build_pred) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + plane_from, plane_to); + } + + model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model + ? MODELRD_LEGACY + : MODELRD_TYPE_INTERP_FILTER]( + cpi, bsize, x, xd, plane_from, plane_to, &tmp_rd_stats.rate, + &tmp_rd_stats.dist, &tmp_rd_stats.skip, &tmp_rd_stats.sse, NULL, NULL, + NULL); + + av1_merge_rd_stats(rd_stats, &tmp_rd_stats); +} + +// Calculate the rdcost of a given interpolation filter. +static INLINE int64_t interpolation_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, + RD_STATS *rd_stats_luma, RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], int filter_idx, const int switchable_ctx[2], + const int skip_pred) { + const AV1_COMMON *cm = &cpi->common; + const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + RD_STATS this_rd_stats_luma, this_rd_stats; + + // Initialize rd_stats structures to default values.
+ av1_init_rd_stats(&this_rd_stats_luma); + this_rd_stats = *rd_stats_luma; + const int_interpfilters last_best = mbmi->interp_filters; + mbmi->interp_filters = filter_sets[filter_idx]; + const int tmp_rs = + get_switchable_rate(x, mbmi->interp_filters, switchable_ctx); + + int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0); + if (min_rd > *rd) { + mbmi->interp_filters = last_best; + return 0; + } + + (void)tile_data; + + assert(skip_pred != 2); + assert((rd_stats_luma->rate >= 0) && (rd_stats->rate >= 0)); + assert((rd_stats_luma->dist >= 0) && (rd_stats->dist >= 0)); + assert((rd_stats_luma->sse >= 0) && (rd_stats->sse >= 0)); + assert((rd_stats_luma->skip == 0) || (rd_stats_luma->skip == 1)); + assert((rd_stats->skip == 0) || (rd_stats->skip == 1)); + assert((skip_pred >= 0) && + (skip_pred <= interp_search_flags->default_interp_skip_flags)); + + // When skip pred is equal to default_interp_skip_flags, + // skip both luma and chroma MC. + // For mono-chrome images: + // num_planes = 1 and cpi->default_interp_skip_flags = 1, + // skip_pred = 1: skip both luma and chroma + // skip_pred = 0: Evaluate luma and as num_planes=1, + // skip chroma evaluation + int tmp_skip_pred = + (skip_pred == interp_search_flags->default_interp_skip_flags) + ? INTERP_SKIP_LUMA_SKIP_CHROMA + : skip_pred; + + switch (tmp_skip_pred) { + case INTERP_EVAL_LUMA_EVAL_CHROMA: + // skip_pred = 0: Evaluate both luma and chroma. + // Luma MC + interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y, + &this_rd_stats_luma, 0); + this_rd_stats = this_rd_stats_luma; +#if CONFIG_COLLECT_RD_STATS == 3 + RD_STATS rd_stats_y; + av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 3 + AOM_FALLTHROUGH_INTENDED; + case INTERP_SKIP_LUMA_EVAL_CHROMA: + // skip_pred = 1: skip luma evaluation (retain previous best luma stats) + // and do chroma evaluation. 
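+      // The loop below re-costs the accumulated rate/distortion against the
+      // running best *rd before each additional plane, abandoning this
+      // filter as soon as it can no longer win.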
+ for (int plane = 1; plane < num_planes; ++plane) { + int64_t tmp_rd = + RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist); + if (tmp_rd >= *rd) { + mbmi->interp_filters = last_best; + return 0; + } + interp_model_rd_eval(x, cpi, bsize, orig_dst, plane, plane, + &this_rd_stats, 0); + } + break; + case INTERP_SKIP_LUMA_SKIP_CHROMA: + // both luma and chroma evaluation is skipped + this_rd_stats = *rd_stats; + break; + case INTERP_EVAL_INVALID: + default: assert(0); return 0; + } + int64_t tmp_rd = + RDCOST(x->rdmult, tmp_rs + this_rd_stats.rate, this_rd_stats.dist); + + if (tmp_rd < *rd) { + *rd = tmp_rd; + *switchable_rate = tmp_rs; + if (skip_pred != interp_search_flags->default_interp_skip_flags) { + if (skip_pred == INTERP_EVAL_LUMA_EVAL_CHROMA) { + // Overwrite the data as current filter is the best one + *rd_stats_luma = this_rd_stats_luma; + *rd_stats = this_rd_stats; + // As luma MC data is computed, no need to recompute after the search + x->recalc_luma_mc_data = 0; + } else if (skip_pred == INTERP_SKIP_LUMA_EVAL_CHROMA) { + // As luma MC data is not computed, update of luma data can be skipped + *rd_stats = this_rd_stats; + // As luma MC data is not recomputed and current filter is the best, + // indicate the possibility of recomputing MC data + // If current buffer contains valid MC data, toggle to indicate that + // luma MC data needs to be recomputed + x->recalc_luma_mc_data ^= 1; + } + swap_dst_buf(xd, dst_bufs, num_planes); + } + return 1; + } + mbmi->interp_filters = last_best; + return 0; +} + +static INLINE INTERP_PRED_TYPE is_pred_filter_search_allowed( + const AV1_COMP *const cpi, MACROBLOCKD *xd, BLOCK_SIZE bsize, + int_interpfilters *af, int_interpfilters *lf) { + const AV1_COMMON *cm = &cpi->common; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + const int bsl = mi_size_wide_log2[bsize]; + int is_horiz_eq = 0, is_vert_eq = 0; + + if (above_mbmi && is_inter_block(above_mbmi)) + *af = above_mbmi->interp_filters; + + if (left_mbmi && is_inter_block(left_mbmi)) *lf = left_mbmi->interp_filters; + + if (af->as_filters.x_filter != INTERP_INVALID) + is_horiz_eq = af->as_filters.x_filter == lf->as_filters.x_filter; + if (af->as_filters.y_filter != INTERP_INVALID) + is_vert_eq = af->as_filters.y_filter == lf->as_filters.y_filter; + + INTERP_PRED_TYPE pred_filter_type = (is_vert_eq << 1) + is_horiz_eq; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int pred_filter_enable = + cpi->sf.interp_sf.cb_pred_filter_search + ? 
(((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_frame.frame_number)) & + 0x1 + : 0; + pred_filter_enable &= is_horiz_eq || is_vert_eq; + // pred_filter_search = 0: pred_filter is disabled + // pred_filter_search = 1: pred_filter is enabled and only horz pred matching + // pred_filter_search = 2: pred_filter is enabled and only vert pred matching + // pred_filter_search = 3: pred_filter is enabled and + // both vert, horz pred matching + return pred_filter_enable * pred_filter_type; +} + +static DUAL_FILTER_TYPE find_best_interp_rd_facade( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, + RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], + const int skip_pred, uint16_t allow_interp_mask, int is_w4_or_h4) { + int tmp_skip_pred = skip_pred; + DUAL_FILTER_TYPE best_filt_type = REG_REG; + + // If no filters are set to be evaluated, return from the function. + if (allow_interp_mask == 0x0) return best_filt_type; + // When the block width or height is 4, skip the pred evaluation of SHARP_SHARP + tmp_skip_pred = is_w4_or_h4 + ? cpi->interp_search_flags.default_interp_skip_flags + : skip_pred; + + // Loop over all filter types and evaluate only the allowed filter types + for (int filt_type = SHARP_SHARP; filt_type >= REG_REG; --filt_type) { + const int is_filter_allowed = + get_interp_filter_allowed_mask(allow_interp_mask, filt_type); + if (is_filter_allowed) + if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd, + rd_stats_y, rd_stats, switchable_rate, + dst_bufs, filt_type, switchable_ctx, + tmp_skip_pred)) + best_filt_type = filt_type; + tmp_skip_pred = skip_pred; + } + return best_filt_type; +} + +static INLINE void pred_dual_interp_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y, + RD_STATS *rd_stats, int *const switchable_rate, + const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], + const int skip_pred, INTERP_PRED_TYPE pred_filt_type, int_interpfilters *af, + int_interpfilters *lf) { + (void)lf; + assert(pred_filt_type > INTERP_HORZ_NEQ_VERT_NEQ); + assert(pred_filt_type < INTERP_PRED_TYPE_ALL); + uint16_t allowed_interp_mask = 0; + + if (pred_filt_type == INTERP_HORZ_EQ_VERT_NEQ) { + // pred_filter_search = 1: Only horizontal filter is matching + allowed_interp_mask = + av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.x_filter]; + } else if (pred_filt_type == INTERP_HORZ_NEQ_VERT_EQ) { + // pred_filter_search = 2: Only vertical filter is matching + allowed_interp_mask = + av1_interp_dual_filt_mask[pred_filt_type - 1][af->as_filters.y_filter]; + } else { + // pred_filter_search = 3: Both horizontal and vertical filter are matching + int filt_type = + af->as_filters.x_filter + af->as_filters.y_filter * SWITCHABLE_FILTERS; + set_interp_filter_allowed_mask(&allowed_interp_mask, filt_type); + } + // REG_REG has already been evaluated at the beginning + reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG); + find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd, rd_stats_y, + rd_stats, switchable_rate, dst_bufs, + switchable_ctx, skip_pred, allowed_interp_mask, 0); +} +// Evaluate dual filter type +// a) Using above, left block interp filter +// b) Find the best horizontal filter and +// then evaluate
+// Evaluate the dual filter type:
+// a) Using the above and left blocks' interp filters.
+// b) Find the best horizontal filter and then evaluate the
+//    corresponding vertical filters.
+static INLINE void fast_dual_interp_filter_rd(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats, int *const switchable_rate,
+    const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+    const int skip_hor, const int skip_ver) {
+  const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ;
+  int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID);
+  int_interpfilters lf = af;
+
+  if (!have_newmv_in_inter_mode(mbmi->mode)) {
+    pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf);
+  }
+
+  if (pred_filter_type) {
+    pred_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                               rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                               switchable_ctx, (skip_hor & skip_ver),
+                               pred_filter_type, &af, &lf);
+  } else {
+    const int bw = block_size_wide[bsize];
+    const int bh = block_size_high[bsize];
+    int best_dual_mode = 0;
+    int skip_pred =
+        bw <= 4 ? interp_search_flags->default_interp_skip_flags : skip_hor;
+    // TODO(any): Make use of find_best_interp_rd_facade()
+    // if the speed impact is negligible.
+    for (int i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
+      if (interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                                  rd_stats_y, rd_stats, switchable_rate,
+                                  dst_bufs, i, switchable_ctx, skip_pred)) {
+        best_dual_mode = i;
+      }
+      skip_pred = skip_hor;
+    }
+    // From the best horizontal mode, check the corresponding vertical modes.
+    skip_pred =
+        bh <= 4 ? interp_search_flags->default_interp_skip_flags : skip_ver;
+    for (int i = (best_dual_mode + (SWITCHABLE_FILTERS * 2));
+         i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) {
+      interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                              rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                              i, switchable_ctx, skip_pred);
+      skip_pred = skip_ver;
+    }
+  }
+}
+
+// Find the best interp filter if dual_interp_filter = 0.
+static INLINE void find_best_non_dual_interp_filter(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const orig_dst, int64_t *const rd, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats, int *const switchable_rate,
+    const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2],
+    const int skip_ver, const int skip_hor) {
+  const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+  int8_t i;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  uint16_t interp_filter_search_mask =
+      interp_search_flags->interp_filter_search_mask;
+
+  if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
+    const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group);
+    const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0);
+    const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1);
+    const int *switchable_interp_p0 =
+        cpi->frame_probs.switchable_interp_probs[update_type][ctx0];
+    const int *switchable_interp_p1 =
+        cpi->frame_probs.switchable_interp_probs[update_type][ctx1];
+
+    static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 };
+    const int thresh = thr[update_type];
+    for (i = 0; i < SWITCHABLE_FILTERS; i++) {
+      // For the non-dual case, the probabilities for the two directions
+      // should be identical.
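+      // For reference (an illustrative note, not upstream documentation):
+      // since i + SWITCHABLE_FILTERS * i == 4 * i when SWITCHABLE_FILTERS is
+      // 3, this pruning only ever touches the diagonal dual filter types
+      //   0 (REG_REG), 4 (SMOOTH_SMOOTH) and 8 (SHARP_SHARP),
+      // i.e. the pairs with equal horizontal and vertical filters, which are
+      // the only candidates evaluated in the non-dual search below.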
+      assert(switchable_interp_p0[i] == switchable_interp_p1[i]);
+      if (switchable_interp_p0[i] < thresh &&
+          switchable_interp_p1[i] < thresh) {
+        DUAL_FILTER_TYPE filt_type = i + SWITCHABLE_FILTERS * i;
+        reset_interp_filter_allowed_mask(&interp_filter_search_mask, filt_type);
+      }
+    }
+  }
+
+  // Regular filter evaluation should have been done already, and hence it
+  // should be the winner so far.
+  assert(x->e_mbd.mi[0]->interp_filters.as_int == filter_sets[0].as_int);
+  if ((skip_hor & skip_ver) !=
+      interp_search_flags->default_interp_skip_flags) {
+    INTERP_PRED_TYPE pred_filter_type = INTERP_HORZ_NEQ_VERT_NEQ;
+    int_interpfilters af = av1_broadcast_interp_filter(INTERP_INVALID);
+    int_interpfilters lf = af;
+
+    pred_filter_type = is_pred_filter_search_allowed(cpi, xd, bsize, &af, &lf);
+    if (pred_filter_type) {
+      assert(af.as_filters.x_filter != INTERP_INVALID);
+      int filter_idx = SWITCHABLE * af.as_filters.x_filter;
+      // This assert checks that (filter_x == filter_y) for the non-dual
+      // filter case.
+      assert(filter_sets[filter_idx].as_filters.x_filter ==
+             filter_sets[filter_idx].as_filters.y_filter);
+      if (cpi->sf.interp_sf.adaptive_interp_filter_search &&
+          !(get_interp_filter_allowed_mask(interp_filter_search_mask,
+                                           filter_idx))) {
+        return;
+      }
+      if (filter_idx) {
+        interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                                rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                                filter_idx, switchable_ctx,
+                                (skip_hor & skip_ver));
+      }
+      return;
+    }
+  }
+  // Reuse the regular filter's modeled rd data for the sharp filter in the
+  // following cases:
+  // 1) When bsize is 4x4.
+  // 2) When block width is 4 (i.e. 4x8/4x16 blocks) and the MV in the
+  //    vertical direction is full-pel.
+  // 3) When block height is 4 (i.e. 8x4/16x4 blocks) and the MV in the
+  //    horizontal direction is full-pel.
+  // TODO(any): Optimize cases 2 and 3 further if the luma MV in the relevant
+  // direction alone is full-pel.
+
+  if ((bsize == BLOCK_4X4) ||
+      (block_size_wide[bsize] == 4 &&
+       skip_ver == interp_search_flags->default_interp_skip_flags) ||
+      (block_size_high[bsize] == 4 &&
+       skip_hor == interp_search_flags->default_interp_skip_flags)) {
+    int skip_pred = skip_hor & skip_ver;
+    uint16_t allowed_interp_mask = 0;
+
+    // The REG_REG filter type is evaluated beforehand, hence skip it.
+    set_interp_filter_allowed_mask(&allowed_interp_mask, SHARP_SHARP);
+    set_interp_filter_allowed_mask(&allowed_interp_mask, SMOOTH_SMOOTH);
+    if (cpi->sf.interp_sf.adaptive_interp_filter_search)
+      allowed_interp_mask &= interp_filter_search_mask;
+
+    find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd,
+                               rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                               switchable_ctx, skip_pred, allowed_interp_mask,
+                               1);
+  } else {
+    int skip_pred = (skip_hor & skip_ver);
+    for (i = (SWITCHABLE_FILTERS + 1); i < DUAL_FILTER_SET_SIZE;
+         i += (SWITCHABLE_FILTERS + 1)) {
+      // This assert checks that (filter_x == filter_y) for the non-dual
+      // filter case.
+      assert(filter_sets[i].as_filters.x_filter ==
+             filter_sets[i].as_filters.y_filter);
+      if (cpi->sf.interp_sf.adaptive_interp_filter_search &&
+          !(get_interp_filter_allowed_mask(interp_filter_search_mask, i))) {
+        continue;
+      }
+      interpolation_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                              rd_stats_y, rd_stats, switchable_rate, dst_bufs,
+                              i, switchable_ctx, skip_pred);
+      // In the first iteration, the smooth filter is evaluated. If the smooth
+      // filter (which is less sharp) is the winner among the regular and
+      // smooth filters, the sharp filter evaluation is skipped.
+      // TODO(any): Refine this gating based on modelled rd only (i.e., by not
+      // accounting for the switchable filter rate).
+      if (cpi->sf.interp_sf.skip_sharp_interp_filter_search &&
+          skip_pred != interp_search_flags->default_interp_skip_flags) {
+        if (mbmi->interp_filters.as_int == filter_sets[SMOOTH_SMOOTH].as_int)
+          break;
+      }
+    }
+  }
+}
+
+static INLINE void calc_interp_skip_pred_flag(MACROBLOCK *const x,
+                                              const AV1_COMP *const cpi,
+                                              int *skip_hor, int *skip_ver) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int num_planes = av1_num_planes(cm);
+  const int is_compound = has_second_ref(mbmi);
+  assert(is_intrabc_block(mbmi) == 0);
+  for (int ref = 0; ref < 1 + is_compound; ++ref) {
+    const struct scale_factors *const sf =
+        get_ref_scale_factors_const(cm, mbmi->ref_frame[ref]);
+    // TODO(any): Refine the skip flag calculation considering scaling.
+    if (av1_is_scaled(sf)) {
+      *skip_hor = 0;
+      *skip_ver = 0;
+      break;
+    }
+    const MV mv = mbmi->mv[ref].as_mv;
+    int skip_hor_plane = 0;
+    int skip_ver_plane = 0;
+    for (int plane_idx = 0; plane_idx < AOMMAX(1, (num_planes - 1));
+         ++plane_idx) {
+      struct macroblockd_plane *const pd = &xd->plane[plane_idx];
+      const int bw = pd->width;
+      const int bh = pd->height;
+      const MV mv_q4 = clamp_mv_to_umv_border_sb(
+          xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+      const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+      const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+      skip_hor_plane |= ((sub_x == 0) << plane_idx);
+      skip_ver_plane |= ((sub_y == 0) << plane_idx);
+    }
+    *skip_hor &= skip_hor_plane;
+    *skip_ver &= skip_ver_plane;
+    // It is not valid that "luma MV is sub-pel, whereas chroma MV is not".
+    assert(*skip_hor != 2);
+    assert(*skip_ver != 2);
+  }
+  // When the compound prediction type is compound segment wedge, luma MC and
+  // chroma MC need to go hand in hand, as the mask generated during luma MC
+  // is required for chroma MC. If skip_hor = 0 and skip_ver = 1, the mask
+  // used for chroma MC during the vertical filter decision may be incorrect,
+  // as the temporary MC evaluation overwrites the mask.
+  // Set skip_ver to 0 in this case so that the mask is populated during
+  // luma MC.
+  if (is_compound && mbmi->compound_idx == 1 &&
+      mbmi->interinter_comp.type == COMPOUND_DIFFWTD) {
+    assert(mbmi->comp_group_idx == 1);
+    if (*skip_hor == 0 && *skip_ver == 1) *skip_ver = 0;
+  }
+}
+
+int64_t av1_interpolation_filter_search(
+    MACROBLOCK *const x, const AV1_COMP *const cpi,
+    const TileDataEnc *tile_data, BLOCK_SIZE bsize,
+    const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
+    int64_t *const rd, int *const switchable_rate, int *skip_build_pred,
+    HandleInterModeArgs *args, int64_t ref_best_rd) {
+  const AV1_COMMON *cm = &cpi->common;
+  const InterpSearchFlags *interp_search_flags = &cpi->interp_search_flags;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int need_search =
+      av1_is_interp_needed(xd) && !cpi->sf.rt_sf.skip_interp_filter_search;
+  const int ref_frame = xd->mi[0]->ref_frame[0];
+  RD_STATS rd_stats_luma, rd_stats;
+
+  // Initialization of the rd_stats structures with default values.
+  av1_init_rd_stats(&rd_stats_luma);
+  av1_init_rd_stats(&rd_stats);
+
+  int match_found_idx = -1;
+  const InterpFilter assign_filter = cm->features.interp_filter;
+
+  match_found_idx = av1_find_interp_filter_match(
+      mbmi, cpi, assign_filter, need_search, args->interp_filter_stats,
+      args->interp_filter_stats_idx);
+
+  if (match_found_idx != -1) {
+    *rd = args->interp_filter_stats[match_found_idx].rd;
+    x->pred_sse[ref_frame] =
+        args->interp_filter_stats[match_found_idx].pred_sse;
+    return 0;
+  }
+
+  int switchable_ctx[2];
+  switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
+  switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
+  *switchable_rate =
+      get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+
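+  // For reference (a sketch, not the exact macro): RDCOST() combines a rate
+  // term and a distortion term into a single lambda-weighted cost, roughly
+  //   int64_t rdcost(int rdmult, int rate, int64_t dist) {
+  //     return (((int64_t)rate * rdmult) >> 9) + (dist << 7);
+  //   }
+  // with the exact definition living in av1/encoder/rd.h; *switchable_rate
+  // computed above enters the rate term of every candidate evaluated below.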
+  // Do the MC evaluation for the default filter_type.
+  // Luma MC
+  interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_Y, AOM_PLANE_Y,
+                       &rd_stats_luma, *skip_build_pred);
+
+#if CONFIG_COLLECT_RD_STATS == 3
+  RD_STATS rd_stats_y;
+  av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+  PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
+#endif  // CONFIG_COLLECT_RD_STATS == 3
+  // Chroma MC
+  if (num_planes > 1) {
+    interp_model_rd_eval(x, cpi, bsize, orig_dst, AOM_PLANE_U, AOM_PLANE_V,
+                         &rd_stats, *skip_build_pred);
+  }
+  *skip_build_pred = 1;
+
+  av1_merge_rd_stats(&rd_stats, &rd_stats_luma);
+
+  assert(rd_stats.rate >= 0);
+
+  *rd = RDCOST(x->rdmult, *switchable_rate + rd_stats.rate, rd_stats.dist);
+  x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4);
+
+  if (assign_filter != SWITCHABLE || match_found_idx != -1) {
+    return 0;
+  }
+  if (!need_search) {
+    int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+    assert(mbmi->interp_filters.as_int == filters.as_int);
+    (void)filters;
+    return 0;
+  }
+  if (args->modelled_rd != NULL) {
+    if (has_second_ref(mbmi)) {
+      const int ref_mv_idx = mbmi->ref_mv_idx;
+      MV_REFERENCE_FRAME *refs = mbmi->ref_frame;
+      const int mode0 = compound_ref0_mode(mbmi->mode);
+      const int mode1 = compound_ref1_mode(mbmi->mode);
+      const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+                                 args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+      if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) {
+        return INT64_MAX;
+      }
+    }
+  }
+
+  x->recalc_luma_mc_data = 0;
+  // skip_flag is a 2-bit value (in binary form): setting the 0th bit
+  // corresponds to skipping luma MC, and setting the 1st bit corresponds to
+  // skipping chroma MC.
+  // skip_flag = 0 corresponds to "Don't skip luma and chroma MC".
+  // skip_flag = 1 corresponds to "Skip luma MC only".
+  // skip_flag = 2 is not a valid case.
+  // skip_flag = 3 corresponds to "Skip both luma and chroma MC".
+  int skip_hor = interp_search_flags->default_interp_skip_flags;
+  int skip_ver = interp_search_flags->default_interp_skip_flags;
+  calc_interp_skip_pred_flag(x, cpi, &skip_hor, &skip_ver);
+
+  // Do the interp_filter search.
+  restore_dst_buf(xd, *tmp_dst, num_planes);
+  const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst };
+  // Evaluate dual interp filters.
+  if (cm->seq_params.enable_dual_filter) {
+    if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) {
+      fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
+                                 &rd_stats_luma, &rd_stats, switchable_rate,
+                                 dst_bufs, switchable_ctx, skip_hor, skip_ver);
+    } else {
+      // Use the full interpolation filter search.
+      uint16_t allowed_interp_mask = ALLOW_ALL_INTERP_FILT_MASK;
+      // The REG_REG filter type is evaluated beforehand, so the loop is
+      // repeated over REG_SMOOTH to SHARP_SHARP for the full interpolation
+      // filter search.
+      reset_interp_filter_allowed_mask(&allowed_interp_mask, REG_REG);
+      find_best_interp_rd_facade(x, cpi, tile_data, bsize, orig_dst, rd,
+                                 &rd_stats_luma, &rd_stats, switchable_rate,
+                                 dst_bufs, switchable_ctx,
+                                 (skip_hor & skip_ver), allowed_interp_mask, 0);
+    }
+  } else {
+    // Evaluate non-dual interp filters.
+    find_best_non_dual_interp_filter(
+        x, cpi, tile_data, bsize, orig_dst, rd, &rd_stats_luma, &rd_stats,
+        switchable_rate, dst_bufs, switchable_ctx, skip_ver, skip_hor);
+  }
+  swap_dst_buf(xd, dst_bufs, num_planes);
+  // Recompute the final MC data if required.
+  if (x->recalc_luma_mc_data == 1) {
+    // Recomputing the final luma MC data is required only if it was skipped
+    // in either of the directions. The condition below is necessary, but not
+    // sufficient.
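+    // A hedged note on the invariant (illustrative, not upstream
+    // documentation): recalc_luma_mc_data is only toggled on the
+    // INTERP_SKIP_LUMA_EVAL_CHROMA path of interpolation_filter_rd(), which
+    // is reachable only when luma MC was skippable in at least one
+    // direction, i.e. something like
+    //   int may_need_recalc(int skip_hor, int skip_ver) {
+    //     return (skip_hor == 1) || (skip_ver == 1);
+    //   }
+    // must hold, which is exactly what the assert below checks.
+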
assert((skip_hor == 1) || (skip_ver == 1)); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + } + x->pred_sse[ref_frame] = (unsigned int)(rd_stats_luma.sse >> 4); + + // save search results + if (cpi->sf.interp_sf.use_interp_filter) { + assert(match_found_idx == -1); + args->interp_filter_stats_idx = save_interp_filter_search_stat( + mbmi, *rd, x->pred_sse[ref_frame], args->interp_filter_stats, + args->interp_filter_stats_idx); + } + return 0; +} diff --git a/libs/libaom/src/av1/encoder/interp_search.h b/libs/libaom/src/av1/encoder/interp_search.h new file mode 100644 index 000000000..401e14f5b --- /dev/null +++ b/libs/libaom/src/av1/encoder/interp_search.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ +#define AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ + +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/rdopt_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_INTERP_FILTER_STATS 128 +#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) + +typedef struct { + int_interpfilters filters; + int_mv mv[2]; + int8_t ref_frames[2]; + COMPOUND_TYPE comp_type; + int compound_idx; + int64_t rd; + unsigned int pred_sse; +} INTERPOLATION_FILTER_STATS; + +typedef struct { + // OBMC secondary prediction buffers and respective strides + uint8_t *above_pred_buf[MAX_MB_PLANE]; + int above_pred_stride[MAX_MB_PLANE]; + uint8_t *left_pred_buf[MAX_MB_PLANE]; + int left_pred_stride[MAX_MB_PLANE]; + int_mv (*single_newmv)[REF_FRAMES]; + // Pointer to array of motion vectors to use for each ref and their rates + // Should point to first of 2 arrays in 2D array + int (*single_newmv_rate)[REF_FRAMES]; + int (*single_newmv_valid)[REF_FRAMES]; + // Pointer to array of predicted rate-distortion + // Should point to first of 2 arrays in 2D array + int64_t (*modelled_rd)[MAX_REF_MV_SEARCH][REF_FRAMES]; + int ref_frame_cost; + int single_comp_cost; + int64_t (*simple_rd)[MAX_REF_MV_SEARCH][REF_FRAMES]; + int skip_motion_mode; + INTERINTRA_MODE *inter_intra_mode; + int single_ref_first_pass; + SimpleRDState *simple_rd_state; + // [comp_idx][saved stat_idx] + INTERPOLATION_FILTER_STATS interp_filter_stats[MAX_INTERP_FILTER_STATS]; + int interp_filter_stats_idx; +} HandleInterModeArgs; + +static const int_interpfilters filter_sets[DUAL_FILTER_SET_SIZE] = { + { 0x00000000 }, { 0x00010000 }, { 0x00020000 }, // y = 0 + { 0x00000001 }, { 0x00010001 }, { 0x00020001 }, // y = 1 + { 0x00000002 }, { 0x00010002 }, { 0x00020002 }, // y = 2 +}; + +int av1_find_interp_filter_match( + MB_MODE_INFO *const mbmi, const AV1_COMP *const cpi, + const InterpFilter assign_filter, const int need_search, + INTERPOLATION_FILTER_STATS *interp_filter_stats, + int interp_filter_stats_idx); + +int64_t av1_interpolation_filter_search( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, 
BLOCK_SIZE bsize, + const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst, + int64_t *const rd, int *const switchable_rate, int *skip_build_pred, + HandleInterModeArgs *args, int64_t ref_best_rd); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_INTERP_FILTER_SEARCH_H_ diff --git a/libs/libaom/src/av1/encoder/intra_mode_search.c b/libs/libaom/src/av1/encoder/intra_mode_search.c new file mode 100644 index 000000000..43192a945 --- /dev/null +++ b/libs/libaom/src/av1/encoder/intra_mode_search.c @@ -0,0 +1,2132 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/intra_mode_search.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/palette.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconintra.h" +#include "av1/encoder/tx_search.h" + +static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = { + DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED, + SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED, + D67_PRED, D113_PRED, D45_PRED, +}; + +static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = { + UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED, + UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED, + UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED, + UV_D113_PRED, UV_D45_PRED, +}; + +#define BINS 32 +static const float intra_hog_model_bias[DIRECTIONAL_MODES] = { + 0.450578f, 0.695518f, -0.717944f, -0.639894f, + -0.602019f, -0.453454f, 0.055857f, -0.465480f, +}; + +static const float intra_hog_model_weights[BINS * DIRECTIONAL_MODES] = { + -3.076402f, -3.757063f, -3.275266f, -3.180665f, -3.452105f, -3.216593f, + -2.871212f, -3.134296f, -1.822324f, -2.401411f, -1.541016f, -1.195322f, + -0.434156f, 0.322868f, 2.260546f, 3.368715f, 3.989290f, 3.308487f, + 2.277893f, 0.923793f, 0.026412f, -0.385174f, -0.718622f, -1.408867f, + -1.050558f, -2.323941f, -2.225827f, -2.585453f, -3.054283f, -2.875087f, + -2.985709f, -3.447155f, 3.758139f, 3.204353f, 2.170998f, 0.826587f, + -0.269665f, -0.702068f, -1.085776f, -2.175249f, -1.623180f, -2.975142f, + -2.779629f, -3.190799f, -3.521900f, -3.375480f, -3.319355f, -3.897389f, + -3.172334f, -3.594528f, -2.879132f, -2.547777f, -2.921023f, -2.281844f, + -1.818988f, -2.041771f, -0.618268f, -1.396458f, -0.567153f, -0.285868f, + -0.088058f, 0.753494f, 2.092413f, 3.215266f, -3.300277f, -2.748658f, + -2.315784f, -2.423671f, -2.257283f, -2.269583f, -2.196660f, -2.301076f, + -2.646516f, -2.271319f, -2.254366f, -2.300102f, -2.217960f, -2.473300f, + -2.116866f, -2.528246f, -3.314712f, -1.701010f, -0.589040f, -0.088077f, + 0.813112f, 1.702213f, 2.653045f, 3.351749f, 3.243554f, 3.199409f, + 2.437856f, 1.468854f, 0.533039f, -0.099065f, -0.622643f, -2.200732f, + -4.228861f, -2.875263f, -1.273956f, -0.433280f, 0.803771f, 1.975043f, + 3.179528f, 3.939064f, 3.454379f, 3.689386f, 3.116411f, 1.970991f, + 0.798406f, -0.628514f, -1.252546f, -2.825176f, -4.090178f, -3.777448f, + -3.227314f, -3.479403f, -3.320569f, -3.159372f, -2.729202f, -2.722341f, + 
-3.054913f, -2.742923f, -2.612703f, -2.662632f, -2.907314f, -3.117794f,
+  -3.102660f, -3.970972f, -4.891357f, -3.935582f, -3.347758f, -2.721924f,
+  -2.219011f, -1.702391f, -0.866529f, -0.153743f, 0.107733f,  1.416882f,
+  2.572884f,  3.607755f,  3.974820f,  3.997783f,  2.970459f,  0.791687f,
+  -1.478921f, -1.228154f, -1.216955f, -1.765932f, -1.951003f, -1.985301f,
+  -1.975881f, -1.985593f, -2.422371f, -2.419978f, -2.531288f, -2.951853f,
+  -3.071380f, -3.277027f, -3.373539f, -4.462010f, -0.967888f, 0.805524f,
+  2.794130f,  3.685984f,  3.745195f,  3.252444f,  2.316108f,  1.399146f,
+  -0.136519f, -0.162811f, -1.004357f, -1.667911f, -1.964662f, -2.937579f,
+  -3.019533f, -3.942766f, -5.102767f, -3.882073f, -3.532027f, -3.451956f,
+  -2.944015f, -2.643064f, -2.529872f, -2.077290f, -2.809965f, -1.803734f,
+  -1.783593f, -1.662585f, -1.415484f, -1.392673f, -0.788794f, -1.204819f,
+  -1.998864f, -1.182102f, -0.892110f, -1.317415f, -1.359112f, -1.522867f,
+  -1.468552f, -1.779072f, -2.332959f, -2.160346f, -2.329387f, -2.631259f,
+  -2.744936f, -3.052494f, -2.787363f, -3.442548f, -4.245075f, -3.032172f,
+  -2.061609f, -1.768116f, -1.286072f, -0.706587f, -0.192413f, 0.386938f,
+  0.716997f,  1.481393f,  2.216702f,  2.737986f,  3.109809f,  3.226084f,
+  2.490098f,  -0.095827f, -3.864816f, -3.507248f, -3.128925f, -2.908251f,
+  -2.883836f, -2.881411f, -2.524377f, -2.624478f, -2.399573f, -2.367718f,
+  -1.918255f, -1.926277f, -1.694584f, -1.723790f, -0.966491f, -1.183115f,
+  -1.430687f, 0.872896f,  2.766550f,  3.610080f,  3.578041f,  3.334928f,
+  2.586680f,  1.895721f,  1.122195f,  0.488519f,  -0.140689f, -0.799076f,
+  -1.222860f, -1.502437f, -1.900969f, -3.206816f,
+};
+
+static void generate_hog(const uint8_t *src, int stride, int rows, int cols,
+                         float *hist) {
+  const float step = (float)PI / BINS;
+  float total = 0.1f;
+  src += stride;
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint8_t *above = &src[c - stride];
+      const uint8_t *below = &src[c + stride];
+      const uint8_t *left = &src[c - 1];
+      const uint8_t *right = &src[c + 1];
+      // Calculate the gradient using Sobel filters.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      if (dx == 0 && dy == 0) continue;
+      const int temp = abs(dx) + abs(dy);
+      if (!temp) continue;
+      total += temp;
+      if (dx == 0) {
+        hist[0] += temp / 2;
+        hist[BINS - 1] += temp / 2;
+      } else {
+        const float angle = atanf(dy * 1.0f / dx);
+        int idx = (int)roundf(angle / step) + BINS / 2;
+        idx = AOMMIN(idx, BINS - 1);
+        idx = AOMMAX(idx, 0);
+        hist[idx] += temp;
+      }
+    }
+    src += stride;
+  }
+
+  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+}
+
+static void generate_hog_hbd(const uint8_t *src8, int stride, int rows,
+                             int cols, float *hist) {
+  const float step = (float)PI / BINS;
+  float total = 0.1f;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  src += stride;
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint16_t *above = &src[c - stride];
+      const uint16_t *below = &src[c + stride];
+      const uint16_t *left = &src[c - 1];
+      const uint16_t *right = &src[c + 1];
+      // Calculate the gradient using Sobel filters.
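+      // For reference (standard 3x3 Sobel kernels; an illustrative note):
+      //        [ -1  0 +1 ]            [ -1 -2 -1 ]
+      //   dx = [ -2  0 +2 ] * I,  dy = [  0  0  0 ] * I
+      //        [ -1  0 +1 ]            [ +1 +2 +1 ]
+      // The orientation atanf(dy / dx), in (-PI/2, PI/2), is quantized into
+      // one of BINS histogram bins and weighted by the magnitude |dx| + |dy|.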
+ const int dx = (right[-stride] + 2 * right[0] + right[stride]) - + (left[-stride] + 2 * left[0] + left[stride]); + const int dy = (below[-1] + 2 * below[0] + below[1]) - + (above[-1] + 2 * above[0] + above[1]); + if (dx == 0 && dy == 0) continue; + const int temp = abs(dx) + abs(dy); + if (!temp) continue; + total += temp; + if (dx == 0) { + hist[0] += temp / 2; + hist[BINS - 1] += temp / 2; + } else { + const float angle = atanf(dy * 1.0f / dx); + int idx = (int)roundf(angle / step) + BINS / 2; + idx = AOMMIN(idx, BINS - 1); + idx = AOMMAX(idx, 0); + hist[idx] += temp; + } + } + src += stride; + } + + for (int i = 0; i < BINS; ++i) hist[i] /= total; +} + +static void prune_intra_mode_with_hog(const MACROBLOCK *x, BLOCK_SIZE bsize, + float th, + uint8_t *directional_mode_skip_mask) { + aom_clear_system_state(); + + const int bh = block_size_high[bsize]; + const int bw = block_size_wide[bsize]; + const MACROBLOCKD *xd = &x->e_mbd; + const int rows = + (xd->mb_to_bottom_edge >= 0) ? bh : (xd->mb_to_bottom_edge >> 3) + bh; + const int cols = + (xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw; + const int src_stride = x->plane[0].src.stride; + const uint8_t *src = x->plane[0].src.buf; + float hist[BINS] = { 0.0f }; + if (is_cur_buf_hbd(xd)) { + generate_hog_hbd(src, src_stride, rows, cols, hist); + } else { + generate_hog(src, src_stride, rows, cols, hist); + } + + for (int i = 0; i < DIRECTIONAL_MODES; ++i) { + float this_score = intra_hog_model_bias[i]; + const float *weights = &intra_hog_model_weights[i * BINS]; + for (int j = 0; j < BINS; ++j) { + this_score += weights[j] * hist[j]; + } + if (this_score < th) directional_mode_skip_mask[i + 1] = 1; + } + + aom_clear_system_state(); +} + +#undef BINS + +// Model based RD estimation for luma intra blocks. +static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mode_cost) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + RD_STATS this_rd_stats; + int row, col; + int64_t temp_sse, this_rd; + TX_SIZE tx_size = tx_size_from_tx_mode(bsize, x->tx_mode_search_type); + const int stepr = tx_size_high_unit[tx_size]; + const int stepc = tx_size_wide_unit[tx_size]; + const int max_blocks_wide = max_block_wide(xd, bsize, 0); + const int max_blocks_high = max_block_high(xd, bsize, 0); + mbmi->tx_size = tx_size; + // Prediction. + for (row = 0; row < max_blocks_high; row += stepr) { + for (col = 0; col < max_blocks_wide; col += stepc) { + av1_predict_intra_block_facade(cm, xd, 0, col, row, tx_size); + } + } + // RD estimation. + model_rd_sb_fn[cpi->sf.rt_sf.use_simple_rd_model ? 
MODELRD_LEGACY
+                                                   : MODELRD_TYPE_INTRA](
+      cpi, bsize, x, xd, 0, 0, &this_rd_stats.rate, &this_rd_stats.dist,
+      &this_rd_stats.skip, &temp_sse, NULL, NULL, NULL);
+  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+    mode_cost +=
+        x->angle_delta_cost[mbmi->mode - V_PRED]
+                           [MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y]];
+  }
+  if (mbmi->mode == DC_PRED &&
+      av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) {
+    if (mbmi->filter_intra_mode_info.use_filter_intra) {
+      const int mode = mbmi->filter_intra_mode_info.filter_intra_mode;
+      mode_cost += x->filter_intra_cost[mbmi->sb_type][1] +
+                   x->filter_intra_mode_cost[mode];
+    } else {
+      mode_cost += x->filter_intra_cost[mbmi->sb_type][0];
+    }
+  }
+  this_rd =
+      RDCOST(x->rdmult, this_rd_stats.rate + mode_cost, this_rd_stats.dist);
+  return this_rd;
+}
+
+// Update the intra model yrd and prune the current mode if the new estimate
+// y_rd > 1.5 * best_model_rd.
+static AOM_INLINE int model_intra_yrd_and_prune(const AV1_COMP *const cpi,
+                                                MACROBLOCK *x, BLOCK_SIZE bsize,
+                                                int mode_info_cost,
+                                                int64_t *best_model_rd) {
+  const int64_t this_model_rd = intra_model_yrd(cpi, x, bsize, mode_info_cost);
+  if (*best_model_rd != INT64_MAX &&
+      this_model_rd > *best_model_rd + (*best_model_rd >> 1)) {
+    return 1;
+  } else if (this_model_rd < *best_model_rd) {
+    *best_model_rd = this_model_rd;
+  }
+  return 0;
+}
+
+// Run the RD calculation with a given luma intra prediction angle and return
+// the RD cost. Update the best mode info if the RD cost is the best so far.
+static int64_t calc_rd_given_intra_angle(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
+    int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
+    RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
+    int64_t *best_rd, int64_t *best_model_rd, uint8_t *best_tx_type_map,
+    uint8_t *best_blk_skip, int skip_model_rd) {
+  RD_STATS tokenonly_rd_stats;
+  int64_t this_rd;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int n4 = bsize_to_num_blk(bsize);
+  assert(!is_inter_block(mbmi));
+  mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
+  if (!skip_model_rd) {
+    if (model_intra_yrd_and_prune(cpi, x, bsize, mode_cost, best_model_rd)) {
+      return INT64_MAX;
+    }
+  }
+  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                    best_rd_in);
+  if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
+
+  int this_rate =
+      mode_cost + tokenonly_rd_stats.rate +
+      x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
+  this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+  if (this_rd < *best_rd) {
+    memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
+    av1_copy_array(best_tx_type_map, xd->tx_type_map, n4);
+    *best_rd = this_rd;
+    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y];
+    *best_tx_size = mbmi->tx_size;
+    *rate = this_rate;
+    rd_stats->rate = tokenonly_rd_stats.rate;
+    rd_stats->dist = tokenonly_rd_stats.dist;
+    rd_stats->skip = tokenonly_rd_stats.skip;
+  }
+  return this_rd;
+}
+
+static INLINE int write_uniform_cost(int n, int v) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  if (l == 0) return 0;
+  if (v < m)
+    return av1_cost_literal(l - 1);
+  else
+    return av1_cost_literal(l);
+}
+
+// Return the rate cost for the luma prediction mode info of intra blocks.
+static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, + int mode_cost) { + int total_rate = mode_cost; + const int use_palette = mbmi->palette_mode_info.palette_size[0] > 0; + const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra; + const int use_intrabc = mbmi->use_intrabc; + // Can only activate one mode. + assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc + + use_filter_intra) <= 1); + const int try_palette = av1_allow_palette( + cpi->common.features.allow_screen_content_tools, mbmi->sb_type); + if (try_palette && mbmi->mode == DC_PRED) { + const MACROBLOCKD *xd = &x->e_mbd; + const int bsize_ctx = av1_get_palette_bsize_ctx(bsize); + const int mode_ctx = av1_get_palette_mode_ctx(xd); + total_rate += x->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette]; + if (use_palette) { + const uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + const int plt_size = mbmi->palette_mode_info.palette_size[0]; + int palette_mode_cost = + x->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] + + write_uniform_cost(plt_size, color_map[0]); + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 0, color_cache); + palette_mode_cost += + av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache, + n_cache, cpi->common.seq_params.bit_depth); + palette_mode_cost += + av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP); + total_rate += palette_mode_cost; + } + } + if (av1_filter_intra_allowed(&cpi->common, mbmi)) { + total_rate += x->filter_intra_cost[mbmi->sb_type][use_filter_intra]; + if (use_filter_intra) { + total_rate += x->filter_intra_mode_cost[mbmi->filter_intra_mode_info + .filter_intra_mode]; + } + } + if (av1_is_directional_mode(mbmi->mode)) { + if (av1_use_angle_delta(bsize)) { + total_rate += x->angle_delta_cost[mbmi->mode - V_PRED] + [MAX_ANGLE_DELTA + + mbmi->angle_delta[PLANE_TYPE_Y]]; + } + } + if (av1_allow_intrabc(&cpi->common)) + total_rate += x->intrabc_cost[use_intrabc]; + return total_rate; +} + +// Return the rate cost for chroma prediction mode info. of intra blocks. +static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x, + const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, + int mode_cost) { + int total_rate = mode_cost; + const int use_palette = mbmi->palette_mode_info.palette_size[1] > 0; + const UV_PREDICTION_MODE mode = mbmi->uv_mode; + // Can only activate one mode. 
+  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+
+  const int try_palette = av1_allow_palette(
+      cpi->common.features.allow_screen_content_tools, mbmi->sb_type);
+  if (try_palette && mode == UV_DC_PRED) {
+    const PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+    total_rate +=
+        x->palette_uv_mode_cost[pmi->palette_size[0] > 0][use_palette];
+    if (use_palette) {
+      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+      const int plt_size = pmi->palette_size[1];
+      const MACROBLOCKD *xd = &x->e_mbd;
+      const uint8_t *const color_map = xd->plane[1].color_index_map;
+      int palette_mode_cost =
+          x->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+      palette_mode_cost += av1_palette_color_cost_uv(
+          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
+      palette_mode_cost +=
+          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_mode_cost;
+    }
+  }
+  if (av1_is_directional_mode(get_uv_mode(mode))) {
+    if (av1_use_angle_delta(bsize)) {
+      total_rate +=
+          x->angle_delta_cost[mode - V_PRED][mbmi->angle_delta[PLANE_TYPE_UV] +
+                                             MAX_ANGLE_DELTA];
+    }
+  }
+  return total_rate;
+}
+
+// Return 1 if a filter intra mode is selected; return 0 otherwise.
+static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, int *skippable,
+                                    BLOCK_SIZE bsize, int mode_cost,
+                                    int64_t *best_rd, int64_t *best_model_rd,
+                                    PICK_MODE_CONTEXT *ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  int filter_intra_selected_flag = 0;
+  FILTER_INTRA_MODE mode;
+  TX_SIZE best_tx_size = TX_8X8;
+  FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  (void)ctx;
+  av1_zero(filter_intra_mode_info);
+  mbmi->filter_intra_mode_info.use_filter_intra = 1;
+  mbmi->mode = DC_PRED;
+  mbmi->palette_mode_info.palette_size[0] = 0;
+
+  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+    int64_t this_rd;
+    RD_STATS tokenonly_rd_stats;
+    mbmi->filter_intra_mode_info.filter_intra_mode = mode;
+
+    if (model_intra_yrd_and_prune(cpi, x, bsize, mode_cost, best_model_rd)) {
+      continue;
+    }
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                      *best_rd);
+    if (tokenonly_rd_stats.rate == INT_MAX) continue;
+    const int this_rate =
+        tokenonly_rd_stats.rate +
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
+    this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+    // Collect mode stats for multiwinner mode processing.
+    const int txfm_search_done = 1;
+    store_winner_mode_stats(
+        &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+        txfm_search_done);
+    if (this_rd < *best_rd) {
+      *best_rd = this_rd;
+      best_tx_size = mbmi->tx_size;
+      filter_intra_mode_info = mbmi->filter_intra_mode_info;
+      av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      *rate = this_rate;
+      *rate_tokenonly = tokenonly_rd_stats.rate;
+      *distortion = tokenonly_rd_stats.dist;
+      *skippable = tokenonly_rd_stats.skip;
+      filter_intra_selected_flag = 1;
+    }
+  }
+
+  if (filter_intra_selected_flag) {
+    mbmi->mode = DC_PRED;
+    mbmi->tx_size = best_tx_size;
+    mbmi->filter_intra_mode_info =
filter_intra_mode_info; + av1_copy_array(ctx->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + return 1; + } else { + return 0; + } +} + +int av1_count_colors(const uint8_t *src, int stride, int rows, int cols, + int *val_count) { + const int max_pix_val = 1 << 8; + memset(val_count, 0, max_pix_val * sizeof(val_count[0])); + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + const int this_val = src[r * stride + c]; + assert(this_val < max_pix_val); + ++val_count[this_val]; + } + } + int n = 0; + for (int i = 0; i < max_pix_val; ++i) { + if (val_count[i]) ++n; + } + return n; +} + +int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols, + int bit_depth, int *val_count) { + assert(bit_depth <= 12); + const int max_pix_val = 1 << bit_depth; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + memset(val_count, 0, max_pix_val * sizeof(val_count[0])); + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + const int this_val = src[r * stride + c]; + assert(this_val < max_pix_val); + if (this_val >= max_pix_val) return 0; + ++val_count[this_val]; + } + } + int n = 0; + for (int i = 0; i < max_pix_val; ++i) { + if (val_count[i]) ++n; + } + return n; +} + +// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x +// new_height'. Extra rows and columns are filled in by copying last valid +// row/column. +static AOM_INLINE void extend_palette_color_map(uint8_t *const color_map, + int orig_width, int orig_height, + int new_width, int new_height) { + int j; + assert(new_width >= orig_width); + assert(new_height >= orig_height); + if (new_width == orig_width && new_height == orig_height) return; + + for (j = orig_height - 1; j >= 0; --j) { + memmove(color_map + j * new_width, color_map + j * orig_width, orig_width); + // Copy last column to extra columns. + memset(color_map + j * new_width + orig_width, + color_map[j * new_width + orig_width - 1], new_width - orig_width); + } + // Copy last row to extra rows. + for (j = orig_height; j < new_height; ++j) { + memcpy(color_map + j * new_width, color_map + (orig_height - 1) * new_width, + new_width); + } +} + +// Bias toward using colors in the cache. +// TODO(huisu): Try other schemes to improve compression. +static AOM_INLINE void optimize_palette_colors(uint16_t *color_cache, + int n_cache, int n_colors, + int stride, int *centroids) { + if (n_cache <= 0) return; + for (int i = 0; i < n_colors * stride; i += stride) { + int min_diff = abs(centroids[i] - (int)color_cache[0]); + int idx = 0; + for (int j = 1; j < n_cache; ++j) { + const int this_diff = abs(centroids[i] - color_cache[j]); + if (this_diff < min_diff) { + min_diff = this_diff; + idx = j; + } + } + if (min_diff <= 1) centroids[i] = color_cache[idx]; + } +} + +// Given the base colors as specified in centroids[], calculate the RD cost +// of palette mode. 
+static AOM_INLINE void palette_rd_y( + const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi, + BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n, + uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi, + uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd, + int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, + int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip, + uint8_t *tx_type_map, int *beat_best_pallette_rd) { + optimize_palette_colors(color_cache, n_cache, n, 1, centroids); + const int num_unique_colors = av1_remove_duplicates(centroids, n); + if (num_unique_colors < PALETTE_MIN_SIZE) { + // Too few unique colors to create a palette. And DC_PRED will work + // well for that case anyway. So skip. + return; + } + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + if (cpi->common.seq_params.use_highbitdepth) { + for (int i = 0; i < num_unique_colors; ++i) { + pmi->palette_colors[i] = clip_pixel_highbd( + (int)centroids[i], cpi->common.seq_params.bit_depth); + } + } else { + for (int i = 0; i < num_unique_colors; ++i) { + pmi->palette_colors[i] = clip_pixel(centroids[i]); + } + } + pmi->palette_size[0] = num_unique_colors; + MACROBLOCKD *const xd = &x->e_mbd; + uint8_t *const color_map = xd->plane[0].color_index_map; + int block_width, block_height, rows, cols; + av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows, + &cols); + av1_calc_indices(data, centroids, color_map, rows * cols, num_unique_colors, + 1); + extend_palette_color_map(color_map, cols, rows, block_width, block_height); + + const int palette_mode_cost = + intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost); + if (model_intra_yrd_and_prune(cpi, x, bsize, palette_mode_cost, + best_model_rd)) { + return; + } + + RD_STATS tokenonly_rd_stats; + av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize, + *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) return; + int this_rate = tokenonly_rd_stats.rate + palette_mode_cost; + int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) { + tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size); + } + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, NULL, NULL, NULL, THR_DC, color_map, bsize, + this_rd, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, + txfm_search_done); + if (this_rd < *best_rd) { + *best_rd = this_rd; + // Setting beat_best_rd flag because current mode rd is better than best_rd. 
+    // This flag needs to be updated only for palette evaluation in key
+    // frames.
+    if (beat_best_rd) *beat_best_rd = 1;
+    memcpy(best_palette_color_map, color_map,
+           block_width * block_height * sizeof(color_map[0]));
+    *best_mbmi = *mbmi;
+    memcpy(blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+    av1_copy_array(tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    if (rate) *rate = this_rate;
+    if (rate_tokenonly) *rate_tokenonly = tokenonly_rd_stats.rate;
+    if (distortion) *distortion = tokenonly_rd_stats.dist;
+    if (skippable) *skippable = tokenonly_rd_stats.skip;
+    if (beat_best_pallette_rd) *beat_best_pallette_rd = 1;
+  }
+}
+
+static AOM_INLINE int perform_top_color_coarse_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data,
+    const int *const top_colors, int start_n, int end_n, int step_size,
+    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
+    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+    uint8_t *tx_type_map) {
+  int centroids[PALETTE_MAX_SIZE];
+  int n = start_n;
+  int top_color_winner = end_n + 1;
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Record n as the winner if the current palette colors win.
+    if (beat_best_pallette_rd) top_color_winner = n;
+    n += step_size;
+    if (n > end_n) break;
+  }
+  return top_color_winner;
+}
+
+static AOM_INLINE int perform_k_means_coarse_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
+    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+    int data_points) {
+  int centroids[PALETTE_MAX_SIZE];
+  const int max_itr = 50;
+  int n = start_n;
+  int k_means_winner = end_n + 1;
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) {
+      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+    }
+    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Record n as the winner if the current palette colors win.
+    if (beat_best_pallette_rd) k_means_winner = n;
+    n += step_size;
+    if (n > end_n) break;
+  }
+  return k_means_winner;
+}
+
+// Perform a palette search over the top colors, from the minimum (or maximum)
+// number of palette colors with a step size of +1 (or -1).
+static AOM_INLINE int perform_top_color_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors,
+    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map) {
+  int centroids[PALETTE_MAX_SIZE];
+  int n = start_n;
+  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
+         (step_size == 2));
+  assert(IMPLIES(step_size == -1, start_n > end_n));
+  assert(IMPLIES(step_size == 1, start_n < end_n));
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Return early if the current palette colors are not winning.
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
+        !beat_best_pallette_rd)
+      return n;
+    n += step_size;
+    if (n == end_n) break;
+  }
+  return n;
+}
+// Perform a k-means based palette search, from the minimum (or maximum)
+// number of palette colors with a step size of +1 (or -1).
+static AOM_INLINE int perform_k_means_palette_search(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lb, int ub,
+    int start_n, int end_n, int step_size, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
+    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
+    int data_points) {
+  int centroids[PALETTE_MAX_SIZE];
+  const int max_itr = 50;
+  int n = start_n;
+  assert((step_size == -1) || (step_size == 1) || (step_size == 0) ||
+         (step_size == 2));
+  assert(IMPLIES(step_size == -1, start_n > end_n));
+  assert(IMPLIES(step_size == 1, start_n < end_n));
+  while (1) {
+    int beat_best_pallette_rd = 0;
+    for (int i = 0; i < n; ++i) {
+      centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+    }
+    av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
+    palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
+                 color_cache, n_cache, best_mbmi, best_palette_color_map,
+                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
+                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
+                 &beat_best_pallette_rd);
+    // Return early if the current palette colors are not winning.
+    if ((cpi->sf.intra_sf.prune_palette_search_level == 2) &&
+        !beat_best_pallette_rd)
+      return n;
+    n += step_size;
+    if (n == end_n) break;
+  }
+  return n;
+}
+
+#define START_N_STAGE2(x)                         \
+  ((x == PALETTE_MIN_SIZE) ? PALETTE_MIN_SIZE + 1 \
+                           : AOMMAX(x - 1, PALETTE_MIN_SIZE));
+#define END_N_STAGE2(x, end_n) \
+  ((x == end_n) ? x - 1 : AOMMIN(x + 1, PALETTE_MAX_SIZE));
+
+static AOM_INLINE void update_start_end_stage_2(int *start_n_stage2,
+                                                int *end_n_stage2,
+                                                int *step_size_stage2,
+                                                int winner, int end_n) {
+  *start_n_stage2 = START_N_STAGE2(winner);
+  *end_n_stage2 = END_N_STAGE2(winner, end_n);
+  *step_size_stage2 = *end_n_stage2 - *start_n_stage2;
+}
+
+// The start index and step size below are chosen to evaluate unique
+// candidates in the neighbor search, in case a winner candidate is found in
+// the coarse search. For example:
+// 1) 8 colors (end_n = 8): 2,3,4,5,6,7,8. start_n is chosen as 2 and the
+//    step size is chosen as 3. Therefore, the coarse search will evaluate 2,
+//    5 and 8. If the winner is found at 5, then 4 and 6 are evaluated.
+//    Similarly, for 2 (3) and 8 (7).
+// 2) 7 colors (end_n = 7): 2,3,4,5,6,7. If start_n is chosen as 2 (the same
+//    as for 8 colors) then the step size should also be 2, to cover all
+//    candidates. The coarse search will evaluate 2, 4 and 6. If the winner
+//    is either 2 or 4, 3 will be evaluated. Instead, if start_n = 3 and
+//    step_size = 3, the coarse search will evaluate 3 and 6. For the winner,
+//    unique neighbors (3: 2,4 or 6: 5,7) would be evaluated.
+
+// Start index for the coarse palette search for dominant colors and k-means.
+static const uint8_t start_n_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+                                                                    3, 3, 2,
+                                                                    3, 3, 2 };
+// Step size for the coarse palette search for dominant colors and k-means.
+static const uint8_t step_size_lookup_table[PALETTE_MAX_SIZE + 1] = { 0, 0, 0,
+                                                                      3, 3, 3,
+                                                                      3, 3, 3 };
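+// For reference (derived from the two tables above; an illustrative note):
+// the coarse stage evaluates the following palette sizes per end_n,
+//   end_n = 3 or 4: { 3 }      end_n = 5: { 2, 5 }
+//   end_n = 6 or 7: { 3, 6 }   end_n = 8: { 2, 5, 8 }
+// after which update_start_end_stage_2() probes the +/-1 neighbors of the
+// winner; e.g. a winner of 5 with end_n = 8 re-runs sizes 4 and 6.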
+static void rd_pick_palette_intra_sby(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+    int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+    int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
+    int64_t *distortion, int *skippable, int *beat_best_rd,
+    PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip, uint8_t *tx_type_map) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools,
+                           bsize));
+
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *const src = x->plane[0].src.buf;
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+  const SequenceHeader *const seq_params = &cpi->common.seq_params;
+  const int is_hbd = seq_params->use_highbitdepth;
+  const int bit_depth = seq_params->bit_depth;
+  int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+  int colors;
+  if (is_hbd) {
+    colors = av1_count_colors_highbd(src, src_stride, rows, cols, bit_depth,
+                                     count_buf);
+  } else {
+    colors = av1_count_colors(src, src_stride, rows, cols, count_buf);
+  }
+
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  if (colors > 1 && colors <= 64) {
+    int *const data = x->palette_buffer->kmeans_data_buf;
+    int centroids[PALETTE_MAX_SIZE];
+    int lb, ub;
+    if (is_hbd) {
+      int *data_pt = data;
+      const uint16_t *src_pt = CONVERT_TO_SHORTPTR(src);
+      lb = ub = src_pt[0];
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          const int val = src_pt[c];
+          data_pt[c] = val;
+          lb = AOMMIN(lb, val);
+          ub = AOMMAX(ub, val);
+        }
+        src_pt += src_stride;
+        data_pt += cols;
+      }
+    } else {
+      int *data_pt = data;
+      const uint8_t *src_pt = src;
+      lb = ub = src[0];
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          const int val = src_pt[c];
+          data_pt[c] = val;
+          lb = AOMMIN(lb, val);
+          ub = AOMMAX(ub, val);
+        }
+        src_pt += src_stride;
+        data_pt += cols;
+      }
+    }
+
+    mbmi->mode = DC_PRED;
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+    uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+    const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+
+    // Find the dominant colors, stored in top_colors[].
+ int top_colors[PALETTE_MAX_SIZE] = { 0 }; + for (int i = 0; i < AOMMIN(colors, PALETTE_MAX_SIZE); ++i) { + int max_count = 0; + for (int j = 0; j < (1 << bit_depth); ++j) { + if (count_buf[j] > max_count) { + max_count = count_buf[j]; + top_colors[i] = j; + } + } + assert(max_count > 0); + count_buf[top_colors[i]] = 0; + } + + // Try the dominant colors directly. + // TODO(huisu@google.com): Try to avoid duplicate computation in cases + // where the dominant colors and the k-means results are similar. + if ((cpi->sf.intra_sf.prune_palette_search_level == 1) && + (colors > PALETTE_MIN_SIZE)) { + const int end_n = AOMMIN(colors, PALETTE_MAX_SIZE); + assert(PALETTE_MAX_SIZE == 8); + assert(PALETTE_MIN_SIZE == 2); + // Choose the start index and step size for coarse search based on number + // of colors + const int start_n = start_n_lookup_table[end_n]; + const int step_size = step_size_lookup_table[end_n]; + // Perform top color coarse palette search to find the winner candidate + const int top_color_winner = perform_top_color_coarse_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n, end_n, + step_size, color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map); + // Evaluate neighbors for the winner color (if winner is found) in the + // above coarse search for dominant colors + if (top_color_winner <= end_n) { + int start_n_stage2, end_n_stage2, step_size_stage2; + update_start_end_stage_2(&start_n_stage2, &end_n_stage2, + &step_size_stage2, top_color_winner, end_n); + // perform finer search for the winner candidate + perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n_stage2, + end_n_stage2 + step_size_stage2, step_size_stage2, color_cache, + n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd, + rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map); + } + // K-means clustering. 
+ // Perform k-means coarse palette search to find the winner candidate + const int k_means_winner = perform_k_means_coarse_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n, + step_size, color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, + rows * cols); + // Evaluate neighbors for the winner color (if winner is found) in the + // above coarse search for k-means + if (k_means_winner <= end_n) { + int start_n_stage2, end_n_stage2, step_size_stage2; + update_start_end_stage_2(&start_n_stage2, &end_n_stage2, + &step_size_stage2, k_means_winner, end_n); + // perform finer search for the winner candidate + perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n_stage2, + end_n_stage2 + step_size_stage2, step_size_stage2, color_cache, + n_cache, best_mbmi, best_palette_color_map, best_rd, best_model_rd, + rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, color_map, rows * cols); + } + } else { + const int start_n = AOMMIN(colors, PALETTE_MAX_SIZE), + end_n = PALETTE_MIN_SIZE; + // Perform top color palette search from start_n + const int top_color_winner = perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, start_n, + end_n - 1, -1, color_cache, n_cache, best_mbmi, + best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly, + distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map); + + if (top_color_winner > end_n) { + // Perform top color palette search in reverse order for the remaining + // colors + perform_top_color_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, end_n, + top_color_winner, 1, color_cache, n_cache, best_mbmi, + best_palette_color_map, best_rd, best_model_rd, rate, + rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map); + } + // K-means clustering. + if (colors == PALETTE_MIN_SIZE) { + // Special case: These colors automatically become the centroids. 
+ assert(colors == 2); + centroids[0] = lb; + centroids[1] = ub; + palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors, + color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, best_model_rd, rate, rate_tokenonly, distortion, + skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map, + NULL); + } else { + // Perform k-means palette search from start_n + const int k_means_winner = perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, start_n, end_n - 1, + -1, color_cache, n_cache, best_mbmi, best_palette_color_map, + best_rd, best_model_rd, rate, rate_tokenonly, distortion, skippable, + beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map, + rows * cols); + if (k_means_winner > end_n) { + // Perform k-means palette search in reverse order for the remaining + // colors + perform_k_means_palette_search( + cpi, x, mbmi, bsize, dc_mode_cost, data, lb, ub, end_n, + k_means_winner, 1, color_cache, n_cache, best_mbmi, + best_palette_color_map, best_rd, best_model_rd, rate, + rate_tokenonly, distortion, skippable, beat_best_rd, ctx, + best_blk_skip, tx_type_map, color_map, rows * cols); + } + } + } + } + + if (best_mbmi->palette_mode_info.palette_size[0] > 0) { + memcpy(color_map, best_palette_color_map, + block_width * block_height * sizeof(best_palette_color_map[0])); + } + *mbmi = *best_mbmi; +} + +static AOM_INLINE void rd_pick_palette_intra_sbuv( + const AV1_COMP *const cpi, MACROBLOCK *x, int dc_mode_cost, + uint8_t *best_palette_color_map, MB_MODE_INFO *const best_mbmi, + int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion, + int *skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + assert(av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mbmi->sb_type)); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const BLOCK_SIZE bsize = mbmi->sb_type; + const SequenceHeader *const seq_params = &cpi->common.seq_params; + int this_rate; + int64_t this_rd; + int colors_u, colors_v, colors; + const int src_stride = x->plane[1].src.stride; + const uint8_t *const src_u = x->plane[1].src.buf; + const uint8_t *const src_v = x->plane[2].src.buf; + uint8_t *const color_map = xd->plane[1].color_index_map; + RD_STATS tokenonly_rd_stats; + int plane_block_width, plane_block_height, rows, cols; + av1_get_block_dimensions(bsize, 1, xd, &plane_block_width, + &plane_block_height, &rows, &cols); + + mbmi->uv_mode = UV_DC_PRED; + + int count_buf[1 << 12]; // Maximum (1 << 12) color levels. + if (seq_params->use_highbitdepth) { + colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols, + seq_params->bit_depth, count_buf); + colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols, + seq_params->bit_depth, count_buf); + } else { + colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf); + colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf); + } + + uint16_t color_cache[2 * PALETTE_MAX_SIZE]; + const int n_cache = av1_get_palette_cache(xd, 1, color_cache); + + colors = colors_u > colors_v ? 
colors_u : colors_v; + if (colors > 1 && colors <= 64) { + int r, c, n, i, j; + const int max_itr = 50; + int lb_u, ub_u, val_u; + int lb_v, ub_v, val_v; + int *const data = x->palette_buffer->kmeans_data_buf; + int centroids[2 * PALETTE_MAX_SIZE]; + + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u); + uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v); + if (seq_params->use_highbitdepth) { + lb_u = src_u16[0]; + ub_u = src_u16[0]; + lb_v = src_v16[0]; + ub_v = src_v16[0]; + } else { + lb_u = src_u[0]; + ub_u = src_u[0]; + lb_v = src_v[0]; + ub_v = src_v[0]; + } + + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) { + if (seq_params->use_highbitdepth) { + val_u = src_u16[r * src_stride + c]; + val_v = src_v16[r * src_stride + c]; + data[(r * cols + c) * 2] = val_u; + data[(r * cols + c) * 2 + 1] = val_v; + } else { + val_u = src_u[r * src_stride + c]; + val_v = src_v[r * src_stride + c]; + data[(r * cols + c) * 2] = val_u; + data[(r * cols + c) * 2 + 1] = val_v; + } + if (val_u < lb_u) + lb_u = val_u; + else if (val_u > ub_u) + ub_u = val_u; + if (val_v < lb_v) + lb_v = val_v; + else if (val_v > ub_v) + ub_v = val_v; + } + } + + for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2; + --n) { + for (i = 0; i < n; ++i) { + centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2; + centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2; + } + av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr); + optimize_palette_colors(color_cache, n_cache, n, 2, centroids); + // Sort the U channel colors in ascending order. + for (i = 0; i < 2 * (n - 1); i += 2) { + int min_idx = i; + int min_val = centroids[i]; + for (j = i + 2; j < 2 * n; j += 2) + if (centroids[j] < min_val) min_val = centroids[j], min_idx = j; + if (min_idx != i) { + int temp_u = centroids[i], temp_v = centroids[i + 1]; + centroids[i] = centroids[min_idx]; + centroids[i + 1] = centroids[min_idx + 1]; + centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v; + } + } + av1_calc_indices(data, centroids, color_map, rows * cols, n, 2); + extend_palette_color_map(color_map, cols, rows, plane_block_width, + plane_block_height); + pmi->palette_size[1] = n; + for (i = 1; i < 3; ++i) { + for (j = 0; j < n; ++j) { + if (seq_params->use_highbitdepth) + pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd( + (int)centroids[j * 2 + i - 1], seq_params->bit_depth); + else + pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = + clip_pixel((int)centroids[j * 2 + i - 1]); + } + } + + av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd); + if (tokenonly_rd_stats.rate == INT_MAX) continue; + this_rate = tokenonly_rd_stats.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost); + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + if (this_rd < *best_rd) { + *best_rd = this_rd; + *best_mbmi = *mbmi; + memcpy(best_palette_color_map, color_map, + plane_block_width * plane_block_height * + sizeof(best_palette_color_map[0])); + *rate = this_rate; + *distortion = tokenonly_rd_stats.dist; + *rate_tokenonly = tokenonly_rd_stats.rate; + *skippable = tokenonly_rd_stats.skip; + } + } + } + if (best_mbmi->palette_mode_info.palette_size[1] > 0) { + memcpy(color_map, best_palette_color_map, + plane_block_width * plane_block_height * + sizeof(best_palette_color_map[0])); + } +} + +void av1_restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + PALETTE_MODE_INFO *const pmi = 
&mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  int *const data = x->palette_buffer->kmeans_data_buf;
+  int centroids[2 * PALETTE_MAX_SIZE];
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  int r, c;
+  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+  int plane_block_width, plane_block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+                           &plane_block_height, &rows, &cols);
+
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      if (cpi->common.seq_params.use_highbitdepth) {
+        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+      } else {
+        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+      }
+    }
+  }
+
+  for (r = 1; r < 3; ++r) {
+    for (c = 0; c < pmi->palette_size[1]; ++c) {
+      centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+    }
+  }
+
+  av1_calc_indices(data, centroids, color_map, rows * cols,
+                   pmi->palette_size[1], 2);
+  extend_palette_color_map(color_map, cols, rows, plane_block_width,
+                           plane_block_height);
+}
+
+static AOM_INLINE void choose_intra_uv_mode(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    TX_SIZE max_tx_size, int *rate_uv, int *rate_uv_tokenonly, int64_t *dist_uv,
+    int *skip_uv, UV_PREDICTION_MODE *mode_uv) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  // Use an estimated rd for uv_intra based on DC_PRED if the
+  // appropriate speed flag is set.
+  init_sbuv_mode(mbmi);
+  if (!xd->is_chroma_ref) {
+    *rate_uv = 0;
+    *rate_uv_tokenonly = 0;
+    *dist_uv = 0;
+    *skip_uv = 1;
+    *mode_uv = UV_DC_PRED;
+    return;
+  }
+
+  // Only store reconstructed luma when there's chroma RDO. When there's no
+  // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+  xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+  if (xd->cfl.store_y) {
+    // Restore reconstructed luma values.
+    av1_encode_intra_block_plane(cpi, x, mbmi->sb_type, AOM_PLANE_Y,
+                                 DRY_RUN_NORMAL,
+                                 cpi->optimize_seg_arr[mbmi->segment_id]);
+    xd->cfl.store_y = 0;
+  }
+  av1_rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
+                              skip_uv, bsize, max_tx_size);
+  *mode_uv = mbmi->uv_mode;
+}
+
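// Editorial aside: the two routines below (and their luma counterparts later
// in this file) share a two-pass sweep over angle offsets: pass 1 evaluates
// the even deltas 0, +/-2, ..., and pass 2 evaluates an odd delta only when
// one of its even neighbors came close to the running best. A self-contained
// toy sketch of that control flow; toy_cost is a hypothetical stand-in for
// the real RD evaluation, and TOY_MAX_DELTA mirrors MAX_ANGLE_DELTA (= 3):
#include <stdint.h>

#define TOY_MAX_DELTA 3

static int64_t toy_cost(int delta) {
  // Convex toy cost with its minimum at an odd delta, so pass 2 matters.
  return 100 + (int64_t)(delta - 1) * (delta - 1);
}

static int toy_pick_angle_delta(void) {
  // Padded by one slot on each side so the neighbor lookups in pass 2 never
  // index out of bounds (same trick as rd_cost[2 * (MAX_ANGLE_DELTA + 2)]).
  int64_t cost[2 * TOY_MAX_DELTA + 3];
  const int off = TOY_MAX_DELTA + 1;
  for (int i = 0; i < 2 * TOY_MAX_DELTA + 3; ++i) cost[i] = INT64_MAX;
  int64_t best = INT64_MAX;
  int best_delta = 0;
  // Pass 1: even offsets, including zero.
  for (int d = -TOY_MAX_DELTA; d <= TOY_MAX_DELTA; ++d) {
    if (d & 1) continue;
    cost[d + off] = toy_cost(d);
    if (cost[d + off] < best) {
      best = cost[d + off];
      best_delta = d;
    }
  }
  // Pass 2: odd offsets, pruned when both even neighbors were clearly worse
  // than the running best.
  for (int d = -TOY_MAX_DELTA; d <= TOY_MAX_DELTA; ++d) {
    if (!(d & 1)) continue;
    const int64_t thresh = best + (best >> 5);
    if (cost[d - 1 + off] > thresh && cost[d + 1 + off] > thresh) continue;
    const int64_t c = toy_cost(d);
    if (c < best) {
      best = c;
      best_delta = d;
    }
  }
  return best_delta;  // 1 for this toy cost
}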
+// Run RD calculation with given chroma intra prediction angle, and return
+// the RD cost. Update the best mode info if the RD cost is the best so far.
+static int64_t pick_intra_angle_routine_sbuv(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+    int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
+    int *best_angle_delta, int64_t *best_rd) {
+  MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+  assert(!is_inter_block(mbmi));
+  int this_rate;
+  int64_t this_rd;
+  RD_STATS tokenonly_rd_stats;
+
+  if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in))
+    return INT64_MAX;
+  this_rate = tokenonly_rd_stats.rate +
+              intra_mode_info_cost_uv(cpi, x, mbmi, bsize, rate_overhead);
+  this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+  if (this_rd < *best_rd) {
+    *best_rd = this_rd;
+    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV];
+    *rate = this_rate;
+    rd_stats->rate = tokenonly_rd_stats.rate;
+    rd_stats->dist = tokenonly_rd_stats.dist;
+    rd_stats->skip = tokenonly_rd_stats.skip;
+  }
+  return this_rd;
+}
+
+// With given chroma directional intra prediction mode, pick the best angle
+// delta. Return true if an RD cost that is smaller than the input one is
+// found.
+static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE bsize, int rate_overhead,
+                                    int64_t best_rd, int *rate,
+                                    RD_STATS *rd_stats) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+  int i, angle_delta, best_angle_delta = 0;
+  int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+
+  rd_stats->rate = INT_MAX;
+  rd_stats->skip = 0;
+  rd_stats->dist = INT64_MAX;
+  for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
+  for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+    for (i = 0; i < 2; ++i) {
+      best_rd_in = (best_rd == INT64_MAX)
+                       ? INT64_MAX
+                       : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
+      mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+      this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
                                              best_rd_in, rate, rd_stats,
+                                              &best_angle_delta, &best_rd);
+      rd_cost[2 * angle_delta + i] = this_rd;
+      if (angle_delta == 0) {
+        if (this_rd == INT64_MAX) return 0;
+        rd_cost[1] = this_rd;
+        break;
+      }
+    }
+  }
+
+  assert(best_rd != INT64_MAX);
+  for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+    int64_t rd_thresh;
+    for (i = 0; i < 2; ++i) {
+      int skip_search = 0;
+      rd_thresh = best_rd + (best_rd >> 5);
+      if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+          rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+        skip_search = 1;
+      if (!skip_search) {
+        mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+        pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+                                      rate, rd_stats, &best_angle_delta,
+                                      &best_rd);
+      }
+    }
+  }
+
+  mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta;
+  return rd_stats->rate != INT_MAX;
+}
+
+#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
+  (plane == CFL_PRED_U ?
a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1) +static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi, + TX_SIZE tx_size, int64_t best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const MACROBLOCKD_PLANE *pd = &xd->plane[AOM_PLANE_U]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(mbmi->sb_type, pd->subsampling_x, pd->subsampling_y); + + assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra); + assert(plane_bsize < BLOCK_SIZES_ALL); + if (!xd->lossless[mbmi->segment_id]) { + assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]); + assert(block_size_high[plane_bsize] == tx_size_high[tx_size]); + } + + xd->cfl.use_dc_pred_cache = 1; + const int64_t mode_rd = + RDCOST(x->rdmult, + x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0); + int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; + int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; +#if CONFIG_DEBUG + int best_rate_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES]; +#endif // CONFIG_DEBUG + + const int skip_trellis = 0; + for (int plane = 0; plane < CFL_PRED_PLANES; plane++) { + RD_STATS rd_stats; + av1_init_rd_stats(&rd_stats); + for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { + best_rd_uv[joint_sign][plane] = INT64_MAX; + best_c[joint_sign][plane] = 0; + } + // Collect RD stats for an alpha value of zero in this plane. + // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid. + for (int i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) { + const int8_t joint_sign = + PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i); + if (i == CFL_SIGN_NEG) { + mbmi->cfl_alpha_idx = 0; + mbmi->cfl_alpha_signs = joint_sign; + av1_txfm_rd_in_plane( + x, cpi, &rd_stats, best_rd, 0, plane + 1, plane_bsize, tx_size, + cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, skip_trellis); + if (rd_stats.rate == INT_MAX) break; + } + const int alpha_rate = x->cfl_cost[joint_sign][plane][0]; + best_rd_uv[joint_sign][plane] = + RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist); +#if CONFIG_DEBUG + best_rate_uv[joint_sign][plane] = rd_stats.rate; +#endif // CONFIG_DEBUG + } + } + + int8_t best_joint_sign = -1; + + for (int plane = 0; plane < CFL_PRED_PLANES; plane++) { + for (int pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) { + int progress = 0; + for (int c = 0; c < CFL_ALPHABET_SIZE; c++) { + int flag = 0; + RD_STATS rd_stats; + if (c > 2 && progress < c) break; + av1_init_rd_stats(&rd_stats); + for (int i = 0; i < CFL_SIGNS; i++) { + const int8_t joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i); + if (i == 0) { + mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c; + mbmi->cfl_alpha_signs = joint_sign; + av1_txfm_rd_in_plane( + x, cpi, &rd_stats, best_rd, 0, plane + 1, plane_bsize, tx_size, + cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, skip_trellis); + if (rd_stats.rate == INT_MAX) break; + } + const int alpha_rate = x->cfl_cost[joint_sign][plane][c]; + int64_t this_rd = + RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist); + if (this_rd >= best_rd_uv[joint_sign][plane]) continue; + best_rd_uv[joint_sign][plane] = this_rd; + best_c[joint_sign][plane] = c; +#if CONFIG_DEBUG + best_rate_uv[joint_sign][plane] = rd_stats.rate; +#endif // CONFIG_DEBUG + flag = 2; + if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue; + this_rd += mode_rd + best_rd_uv[joint_sign][!plane]; + if (this_rd >= best_rd) continue; + best_rd = this_rd; + best_joint_sign = joint_sign; + } + progress += flag; + } + } + } + + int best_rate_overhead = INT_MAX; + 
uint8_t ind = 0; + if (best_joint_sign >= 0) { + const int u = best_c[best_joint_sign][CFL_PRED_U]; + const int v = best_c[best_joint_sign][CFL_PRED_V]; + ind = (u << CFL_ALPHABET_SIZE_LOG2) + v; + best_rate_overhead = x->cfl_cost[best_joint_sign][CFL_PRED_U][u] + + x->cfl_cost[best_joint_sign][CFL_PRED_V][v]; +#if CONFIG_DEBUG + xd->cfl.rate = x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] + + best_rate_overhead + + best_rate_uv[best_joint_sign][CFL_PRED_U] + + best_rate_uv[best_joint_sign][CFL_PRED_V]; +#endif // CONFIG_DEBUG + } else { + best_joint_sign = 0; + } + + mbmi->cfl_alpha_idx = ind; + mbmi->cfl_alpha_signs = best_joint_sign; + xd->cfl.use_dc_pred_cache = 0; + xd->cfl.dc_pred_is_cached[0] = 0; + xd->cfl.dc_pred_is_cached[1] = 0; + return best_rate_overhead; +} + +int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, + BLOCK_SIZE bsize, TX_SIZE max_tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + MB_MODE_INFO best_mbmi = *mbmi; + int64_t best_rd = INT64_MAX, this_rd; + + for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) { + int this_rate; + RD_STATS tokenonly_rd_stats; + UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx]; + const int is_directional_mode = av1_is_directional_mode(get_uv_mode(mode)); + if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] & + (1 << mode))) + continue; + if (!cpi->oxcf.enable_smooth_intra && mode >= UV_SMOOTH_PRED && + mode <= UV_SMOOTH_H_PRED) + continue; + + if (!cpi->oxcf.enable_paeth_intra && mode == UV_PAETH_PRED) continue; + + mbmi->uv_mode = mode; + int cfl_alpha_rate = 0; + if (mode == UV_CFL_PRED) { + if (!is_cfl_allowed(xd) || !cpi->oxcf.enable_cfl_intra) continue; + assert(!is_directional_mode); + const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd); + if (cfl_alpha_rate == INT_MAX) continue; + } + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type) && + cpi->oxcf.enable_angle_delta) { + const int rate_overhead = + x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode]; + if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd, + &this_rate, &tokenonly_rd_stats)) + continue; + } else { + if (!av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) { + continue; + } + } + const int mode_cost = + x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] + + cfl_alpha_rate; + this_rate = tokenonly_rd_stats.rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost); + if (mode == UV_CFL_PRED) { + assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra); +#if CONFIG_DEBUG + if (!xd->lossless[mbmi->segment_id]) + assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost); +#endif // CONFIG_DEBUG + } + this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist); + + if (this_rd < best_rd) { + best_mbmi = *mbmi; + best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = tokenonly_rd_stats.rate; + *distortion = tokenonly_rd_stats.dist; + *skippable = tokenonly_rd_stats.skip; + } + } + + const int try_palette = + cpi->oxcf.enable_palette && + av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mbmi->sb_type); + if (try_palette) { + uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map; + rd_pick_palette_intra_sbuv( + cpi, x, + 
x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED], + best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly, + distortion, skippable); + } + + *mbmi = best_mbmi; + // Make sure we actually chose a mode + assert(best_rd < INT64_MAX); + return best_rd; +} + +int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *this_rd_cost, PICK_MODE_CONTEXT *ctx, + BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi, + PALETTE_MODE_INFO *const pmi, + unsigned int *ref_costs_single, + IntraModeSearchState *intra_search_state, + int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + int rate2 = 0; + int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd, + best_model_rd_palette = INT64_MAX; + int skippable = 0; + TX_SIZE uv_tx = TX_4X4; + uint8_t *const best_palette_color_map = + x->palette_buffer->best_palette_color_map; + uint8_t *const color_map = xd->plane[0].color_index_map; + MB_MODE_INFO best_mbmi_palette = *mbmi; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + + mbmi->mode = DC_PRED; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + RD_STATS rd_stats_y; + av1_invalid_rd_stats(&rd_stats_y); + rd_pick_palette_intra_sby( + cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette, + best_palette_color_map, &best_rd_palette, &best_model_rd_palette, + &rd_stats_y.rate, NULL, &rd_stats_y.dist, &rd_stats_y.skip, NULL, ctx, + best_blk_skip, best_tx_type_map); + if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) { + this_rd_cost->rdcost = INT64_MAX; + return skippable; + } + + memcpy(x->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize)); + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + memcpy(color_map, best_palette_color_map, + rows * cols * sizeof(best_palette_color_map[0])); + + skippable = rd_stats_y.skip; + distortion2 = rd_stats_y.dist; + rate2 = rd_stats_y.rate + ref_costs_single[INTRA_FRAME]; + if (num_planes > 1) { + uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); + if (intra_search_state->rate_uv_intra == INT_MAX) { + choose_intra_uv_mode( + cpi, x, bsize, uv_tx, &intra_search_state->rate_uv_intra, + &intra_search_state->rate_uv_tokenonly, &intra_search_state->dist_uvs, + &intra_search_state->skip_uvs, &intra_search_state->mode_uv); + intra_search_state->pmi_uv = *pmi; + intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; + } + mbmi->uv_mode = intra_search_state->mode_uv; + pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1]; + if (pmi->palette_size[1] > 0) { + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } + mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta; + skippable = skippable && intra_search_state->skip_uvs; + distortion2 += intra_search_state->dist_uvs; + rate2 += intra_search_state->rate_uv_intra; + } + + if (skippable) { + rate2 -= rd_stats_y.rate; + if (num_planes > 1) rate2 -= intra_search_state->rate_uv_tokenonly; + rate2 += x->skip_cost[av1_get_skip_context(xd)][1]; + } else { + rate2 += x->skip_cost[av1_get_skip_context(xd)][0]; + } + 
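// Editorial aside: the branch above is the usual skip-flag accounting --
// when the block is coded as skip, the token rates are removed and replaced
// by the cost of signaling skip = 1; otherwise the skip = 0 cost is added.
// The resulting rate then enters the Lagrangian rate-distortion cost that
// RDCOST computes just below. A hedged sketch of that accounting in floating
// point (the real RDCOST macro uses fixed-point arithmetic, and all names
// here are illustrative, not libaom API):
static double toy_rd_cost(double lambda, int rate, double dist) {
  return lambda * rate + dist;  // lower is better
}

static double toy_palette_mode_cost(double lambda, int token_rate,
                                    int overhead_rate, int skip0_rate,
                                    int skip1_rate, int is_skip, double dist) {
  // Token rate is only paid when the block is not coded as skip.
  const int rate = is_skip ? overhead_rate + skip1_rate
                           : overhead_rate + token_rate + skip0_rate;
  return toy_rd_cost(lambda, rate, dist);
}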
this_rd = RDCOST(x->rdmult, rate2, distortion2); + this_rd_cost->rate = rate2; + this_rd_cost->dist = distortion2; + this_rd_cost->rdcost = this_rd; + return skippable; +} + +// Given selected prediction mode, search for the best tx type and size. +static AOM_INLINE int intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, const int *bmode_costs, + int64_t *best_rd, int *rate, + int *rate_tokenonly, int64_t *distortion, + int *skippable, MB_MODE_INFO *best_mbmi, + PICK_MODE_CONTEXT *ctx) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + RD_STATS rd_stats; + // In order to improve txfm search avoid rd based breakouts during winner + // mode evaluation. Hence passing ref_best_rd as a maximum value + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); + if (rd_stats.rate == INT_MAX) return 0; + int this_rate_tokenonly = rd_stats.rate; + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) { + // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size + // in the tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size); + } + const int this_rate = + rd_stats.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]); + const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist); + if (this_rd < *best_rd) { + *best_mbmi = *mbmi; + *best_rd = this_rd; + *rate = this_rate; + *rate_tokenonly = this_rate_tokenonly; + *distortion = rd_stats.dist; + *skippable = rd_stats.skip; + av1_copy_array(ctx->blk_skip, x->blk_skip, ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + return 1; + } + return 0; +} + +// With given luma directional intra prediction mode, pick the best angle delta +// Return the RD cost corresponding to the best angle delta. +static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int mode_cost, + int64_t best_rd, int64_t *best_model_rd, + int skip_model_rd_for_zero_deg) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + + int best_angle_delta = 0; + int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)]; + TX_SIZE best_tx_size = mbmi->tx_size; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + + for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX; + + int first_try = 1; + for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + for (int i = 0; i < 2; ++i) { + const int64_t best_rd_in = + (best_rd == INT64_MAX) ? INT64_MAX + : (best_rd + (best_rd >> (first_try ? 
3 : 5))); + const int64_t this_rd = calc_rd_given_intra_angle( + cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta, + MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, + &best_rd, best_model_rd, best_tx_type_map, best_blk_skip, + (skip_model_rd_for_zero_deg & !angle_delta)); + rd_cost[2 * angle_delta + i] = this_rd; + if (first_try && this_rd == INT64_MAX) return best_rd; + first_try = 0; + if (angle_delta == 0) { + rd_cost[1] = this_rd; + break; + } + } + } + + assert(best_rd != INT64_MAX); + for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) { + for (int i = 0; i < 2; ++i) { + int skip_search = 0; + const int64_t rd_thresh = best_rd + (best_rd >> 5); + if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh && + rd_cost[2 * (angle_delta - 1) + i] > rd_thresh) + skip_search = 1; + if (!skip_search) { + calc_rd_given_intra_angle( + cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta, + MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size, + &best_rd, best_model_rd, best_tx_type_map, best_blk_skip, 0); + } + } + } + + if (rd_stats->rate != INT_MAX) { + mbmi->tx_size = best_tx_size; + mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta; + const int n4 = bsize_to_num_blk(bsize); + memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * n4); + av1_copy_array(xd->tx_type_map, best_tx_type_map, n4); + } + return best_rd; +} + +int64_t av1_handle_intra_mode(IntraModeSearchState *intra_search_state, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int ref_frame_cost, + const PICK_MODE_CONTEXT *ctx, int disable_skip, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int64_t best_rd, + int64_t *best_intra_rd, int8_t best_mbmode_skip) { + const AV1_COMMON *cm = &cpi->common; + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(mbmi->ref_frame[0] == INTRA_FRAME); + const PREDICTION_MODE mode = mbmi->mode; + const int mode_cost = + x->mbmode_cost[size_group_lookup[bsize]][mode] + ref_frame_cost; + const int intra_cost_penalty = av1_get_intra_cost_penalty( + cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q, + cm->seq_params.bit_depth); + const int skip_ctx = av1_get_skip_context(xd); + + int known_rate = mode_cost; + known_rate += ref_frame_cost; + if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty; + known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]); + const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0); + if (known_rd > best_rd) { + intra_search_state->skip_intra_modes = 1; + return INT64_MAX; + } + + const int is_directional_mode = av1_is_directional_mode(mode); + if (is_directional_mode && av1_use_angle_delta(bsize) && + cpi->oxcf.enable_angle_delta) { + if (sf->intra_sf.intra_pruning_with_hog && + !intra_search_state->angle_stats_ready) { + prune_intra_mode_with_hog(x, bsize, + cpi->sf.intra_sf.intra_pruning_with_hog_thresh, + intra_search_state->directional_mode_skip_mask); + intra_search_state->angle_stats_ready = 1; + } + if (intra_search_state->directional_mode_skip_mask[mode]) return INT64_MAX; + av1_init_rd_stats(rd_stats_y); + rd_stats_y->rate = INT_MAX; + int64_t model_rd = INT64_MAX; + int rate_dummy; + rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, mode_cost, + best_rd, &model_rd, 0); + + } else { + av1_init_rd_stats(rd_stats_y); + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, 
bsize, best_rd); + } + + // Pick filter intra modes. + if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + int try_filter_intra = 0; + int64_t best_rd_so_far = INT64_MAX; + if (rd_stats_y->rate != INT_MAX) { + const int tmp_rate = + rd_stats_y->rate + x->filter_intra_cost[bsize][0] + mode_cost; + best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist); + try_filter_intra = (best_rd_so_far / 2) <= best_rd; + } else { + try_filter_intra = !best_mbmode_skip; + } + + if (try_filter_intra) { + RD_STATS rd_stats_y_fi; + int filter_intra_selected_flag = 0; + TX_SIZE best_tx_size = mbmi->tx_size; + FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + memcpy(best_blk_skip, x->blk_skip, + sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + mbmi->filter_intra_mode_info.use_filter_intra = 1; + for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED; + fi_mode < FILTER_INTRA_MODES; ++fi_mode) { + mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode; + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y_fi, bsize, + best_rd); + if (rd_stats_y_fi.rate == INT_MAX) continue; + const int this_rate_tmp = + rd_stats_y_fi.rate + + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost); + const int64_t this_rd_tmp = + RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist); + + if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > best_rd) { + break; + } + if (this_rd_tmp < best_rd_so_far) { + best_tx_size = mbmi->tx_size; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + memcpy(best_blk_skip, x->blk_skip, + sizeof(best_blk_skip[0]) * ctx->num_4x4_blk); + best_fi_mode = fi_mode; + *rd_stats_y = rd_stats_y_fi; + filter_intra_selected_flag = 1; + best_rd_so_far = this_rd_tmp; + } + } + + mbmi->tx_size = best_tx_size; + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); + memcpy(x->blk_skip, best_blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + + if (filter_intra_selected_flag) { + mbmi->filter_intra_mode_info.use_filter_intra = 1; + mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode; + } else { + mbmi->filter_intra_mode_info.use_filter_intra = 0; + } + } + } + + if (rd_stats_y->rate == INT_MAX) return INT64_MAX; + + const int mode_cost_y = + intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost); + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_uv); + const int num_planes = av1_num_planes(cm); + if (num_planes > 1) { + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int try_palette = + cpi->oxcf.enable_palette && + av1_allow_palette(cm->features.allow_screen_content_tools, + mbmi->sb_type); + const TX_SIZE uv_tx = av1_get_tx_size(AOM_PLANE_U, xd); + if (intra_search_state->rate_uv_intra == INT_MAX) { + const int rate_y = + rd_stats_y->skip ? 
x->skip_cost[skip_ctx][1] : rd_stats_y->rate; + const int64_t rdy = + RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist); + if (best_rd < (INT64_MAX / 2) && rdy > (best_rd + (best_rd >> 2))) { + intra_search_state->skip_intra_modes = 1; + return INT64_MAX; + } + choose_intra_uv_mode( + cpi, x, bsize, uv_tx, &intra_search_state->rate_uv_intra, + &intra_search_state->rate_uv_tokenonly, &intra_search_state->dist_uvs, + &intra_search_state->skip_uvs, &intra_search_state->mode_uv); + if (try_palette) intra_search_state->pmi_uv = *pmi; + intra_search_state->uv_angle_delta = mbmi->angle_delta[PLANE_TYPE_UV]; + + const int uv_rate = intra_search_state->rate_uv_tokenonly; + const int64_t uv_dist = intra_search_state->dist_uvs; + const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist); + if (uv_rd > best_rd) { + intra_search_state->skip_intra_modes = 1; + return INT64_MAX; + } + } + + rd_stats_uv->rate = intra_search_state->rate_uv_tokenonly; + rd_stats_uv->dist = intra_search_state->dist_uvs; + rd_stats_uv->skip = intra_search_state->skip_uvs; + rd_stats->skip = rd_stats_y->skip && rd_stats_uv->skip; + mbmi->uv_mode = intra_search_state->mode_uv; + if (try_palette) { + pmi->palette_size[1] = intra_search_state->pmi_uv.palette_size[1]; + memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, + intra_search_state->pmi_uv.palette_colors + PALETTE_MAX_SIZE, + 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0])); + } + mbmi->angle_delta[PLANE_TYPE_UV] = intra_search_state->uv_angle_delta; + } + + rd_stats->rate = rd_stats_y->rate + mode_cost_y; + if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) { + // av1_pick_uniform_tx_size_type_yrd above includes the cost of the tx_size + // in the tokenonly rate, but for intra blocks, tx_size is always coded + // (prediction granularity), so we account for it in the full rate, + // not the tokenonly rate. + rd_stats_y->rate -= tx_size_cost(x, bsize, mbmi->tx_size); + } + if (num_planes > 1 && xd->is_chroma_ref) { + const int uv_mode_cost = + x->intra_uv_mode_cost[is_cfl_allowed(xd)][mode][mbmi->uv_mode]; + rd_stats->rate += + rd_stats_uv->rate + + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost); + } + if (mode != DC_PRED && mode != PAETH_PRED) { + rd_stats->rate += intra_cost_penalty; + } + + // Intra block is always coded as non-skip + rd_stats->skip = 0; + rd_stats->dist = rd_stats_y->dist + rd_stats_uv->dist; + // Add in the cost of the no skip flag. + rd_stats->rate += x->skip_cost[skip_ctx][0]; + // Calculate the final RD estimate for this mode. 
+ const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + // Keep record of best intra rd + if (this_rd < *best_intra_rd) { + *best_intra_rd = this_rd; + intra_search_state->best_intra_mode = mode; + } + + if (sf->intra_sf.skip_intra_in_interframe) { + if (best_rd < (INT64_MAX / 2) && this_rd > (best_rd + (best_rd >> 1))) + intra_search_state->skip_intra_modes = 1; + } + + if (!disable_skip) { + for (int i = 0; i < REFERENCE_MODES; ++i) { + intra_search_state->best_pred_rd[i] = + AOMMIN(intra_search_state->best_pred_rd[i], this_rd); + } + } + return this_rd; +} + +// This function is used only for intra_only frames +int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, + int *rate, int *rate_tokenonly, + int64_t *distortion, int *skippable, + BLOCK_SIZE bsize, int64_t best_rd, + PICK_MODE_CONTEXT *ctx) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(!is_inter_block(mbmi)); + int64_t best_model_rd = INT64_MAX; + int is_directional_mode; + uint8_t directional_mode_skip_mask[INTRA_MODES] = { 0 }; + // Flag to check rd of any intra mode is better than best_rd passed to this + // function + int beat_best_rd = 0; + const int *bmode_costs; + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + const int try_palette = + cpi->oxcf.enable_palette && + av1_allow_palette(cpi->common.features.allow_screen_content_tools, + mbmi->sb_type); + uint8_t *best_palette_color_map = + try_palette ? x->palette_buffer->best_palette_color_map : NULL; + const MB_MODE_INFO *above_mi = xd->above_mbmi; + const MB_MODE_INFO *left_mi = xd->left_mbmi; + const PREDICTION_MODE A = av1_above_block_mode(above_mi); + const PREDICTION_MODE L = av1_left_block_mode(left_mi); + const int above_ctx = intra_mode_context[A]; + const int left_ctx = intra_mode_context[L]; + bmode_costs = x->y_mode_costs[above_ctx][left_ctx]; + + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + if (cpi->sf.intra_sf.intra_pruning_with_hog) { + prune_intra_mode_with_hog(x, bsize, + cpi->sf.intra_sf.intra_pruning_with_hog_thresh, + directional_mode_skip_mask); + } + mbmi->filter_intra_mode_info.use_filter_intra = 0; + pmi->palette_size[0] = 0; + + // Set params for mode evaluation + set_mode_eval_params(cpi, x, MODE_EVAL); + + MB_MODE_INFO best_mbmi = *mbmi; + av1_zero(x->winner_mode_stats); + x->winner_mode_count = 0; + + /* Y Search for intra prediction mode */ + for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) { + RD_STATS this_rd_stats; + int this_rate, this_rate_tokenonly, s; + int64_t this_distortion, this_rd; + mbmi->mode = intra_rd_search_mode_order[mode_idx]; + if ((!cpi->oxcf.enable_smooth_intra || + cpi->sf.intra_sf.disable_smooth_intra) && + (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED || + mbmi->mode == SMOOTH_V_PRED)) + continue; + if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue; + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + + if (model_intra_yrd_and_prune(cpi, x, bsize, bmode_costs[mbmi->mode], + &best_model_rd)) { + continue; + } + + is_directional_mode = av1_is_directional_mode(mbmi->mode); + if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; + if (is_directional_mode && av1_use_angle_delta(bsize) && + cpi->oxcf.enable_angle_delta) { + this_rd_stats.rate = INT_MAX; + rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize, + bmode_costs[mbmi->mode], best_rd, &best_model_rd, + 1); + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd); 
+    }
+    this_rate_tokenonly = this_rd_stats.rate;
+    this_distortion = this_rd_stats.dist;
+    s = this_rd_stats.skip;
+
+    if (this_rate_tokenonly == INT_MAX) continue;
+
+    if (!xd->lossless[mbmi->segment_id] &&
+        block_signals_txsize(mbmi->sb_type)) {
+      // av1_pick_uniform_tx_size_type_yrd above includes the cost of the
+      // tx_size in the tokenonly rate, but for intra blocks, tx_size is always
+      // coded (prediction granularity), so we account for it in the full rate,
+      // not the tokenonly rate.
+      this_rate_tokenonly -= tx_size_cost(x, bsize, mbmi->tx_size);
+    }
+    this_rate =
+        this_rd_stats.rate +
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
+    this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
+    // Collect mode stats for multiwinner mode processing
+    const int txfm_search_done = 1;
+    store_winner_mode_stats(
+        &cpi->common, x, mbmi, NULL, NULL, NULL, 0, NULL, bsize, this_rd,
+        cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+        txfm_search_done);
+    if (this_rd < best_rd) {
+      best_mbmi = *mbmi;
+      best_rd = this_rd;
+      // Setting beat_best_rd flag because current mode rd is better than
+      // best_rd passed to this function
+      beat_best_rd = 1;
+      *rate = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion = this_distortion;
+      *skippable = s;
+      memcpy(ctx->blk_skip, x->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+    }
+  }
+
+  if (try_palette) {
+    rd_pick_palette_intra_sby(
+        cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map,
+        &best_rd, &best_model_rd, rate, rate_tokenonly, distortion, skippable,
+        &beat_best_rd, ctx, ctx->blk_skip, ctx->tx_type_map);
+  }
+
+  if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
+    if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
+                                 skippable, bsize, bmode_costs[DC_PRED],
+                                 &best_rd, &best_model_rd, ctx)) {
+      best_mbmi = *mbmi;
+    }
+  }
+  // If no mode beats the best_rd passed into this function, winner mode
+  // processing is unnecessary; return INT64_MAX to signal that no best mode
+  // was identified.
+  if (!beat_best_rd) return INT64_MAX;
+
+  // In multi-winner mode processing, perform the tx search for the few best
+  // modes identified during mode evaluation. Winner mode processing uses the
+  // best tx configuration for the tx search (a toy sketch of the winner-list
+  // bookkeeping follows).
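// A toy version of keeping the N best (mode, rd) pairs sorted by rd; the
// struct and helper are hypothetical, standing in for x->winner_mode_stats
// and store_winner_mode_stats() rather than reproducing them:
typedef struct {
  int mode;
  long long rd;
} ToyWinner;

// Insert `cand` into `list` (kept sorted by ascending rd, capacity max_n)
// and return the new element count.
static int toy_store_winner(ToyWinner *list, int count, int max_n,
                            ToyWinner cand) {
  if (count == max_n && cand.rd >= list[max_n - 1].rd) return count;
  int i = (count < max_n) ? count : max_n - 1;
  while (i > 0 && list[i - 1].rd > cand.rd) {
    list[i] = list[i - 1];
    --i;
  }
  list[i] = cand;
  return (count < max_n) ? count + 1 : count;
}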
+ if (cpi->sf.winner_mode_sf.enable_multiwinner_mode_process) { + int best_mode_idx = 0; + int block_width, block_height; + uint8_t *color_map_dst = xd->plane[PLANE_TYPE_Y].color_index_map; + av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width, + &block_height, NULL, NULL); + + for (int mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) { + *mbmi = x->winner_mode_stats[mode_idx].mbmi; + if (is_winner_mode_processing_enabled(cpi, mbmi, mbmi->mode)) { + // Restore color_map of palette mode before winner mode processing + if (mbmi->palette_mode_info.palette_size[0] > 0) { + uint8_t *color_map_src = + x->winner_mode_stats[mode_idx].color_index_map; + memcpy(color_map_dst, color_map_src, + block_width * block_height * sizeof(*color_map_src)); + } + // Set params for winner mode evaluation + set_mode_eval_params(cpi, x, WINNER_MODE_EVAL); + + // Winner mode processing + // If previous searches use only the default tx type/no R-D optimization + // of quantized coeffs, do an extra search for the best tx type/better + // R-D optimization of quantized coeffs + if (intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, + rate_tokenonly, distortion, skippable, &best_mbmi, + ctx)) + best_mode_idx = mode_idx; + } + } + // Copy color_map of palette mode for final winner mode + if (best_mbmi.palette_mode_info.palette_size[0] > 0) { + uint8_t *color_map_src = + x->winner_mode_stats[best_mode_idx].color_index_map; + memcpy(color_map_dst, color_map_src, + block_width * block_height * sizeof(*color_map_src)); + } + } else { + // If previous searches use only the default tx type/no R-D optimization of + // quantized coeffs, do an extra search for the best tx type/better R-D + // optimization of quantized coeffs + if (is_winner_mode_processing_enabled(cpi, mbmi, best_mbmi.mode)) { + // Set params for winner mode evaluation + set_mode_eval_params(cpi, x, WINNER_MODE_EVAL); + *mbmi = best_mbmi; + intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, + rate_tokenonly, distortion, skippable, &best_mbmi, ctx); + } + } + *mbmi = best_mbmi; + av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk); + return best_rd; +} diff --git a/libs/libaom/src/av1/encoder/intra_mode_search.h b/libs/libaom/src/av1/encoder/intra_mode_search.h new file mode 100644 index 000000000..4b5d31c3e --- /dev/null +++ b/libs/libaom/src/av1/encoder/intra_mode_search.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+#define AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct IntraModeSearchState {
+  int skip_intra_modes;
+  PREDICTION_MODE best_intra_mode;
+  int angle_stats_ready;
+  uint8_t directional_mode_skip_mask[INTRA_MODES];
+  int rate_uv_intra;
+  int rate_uv_tokenonly;
+  int64_t dist_uvs;
+  int skip_uvs;
+  UV_PREDICTION_MODE mode_uv;
+  PALETTE_MODE_INFO pmi_uv;
+  int8_t uv_angle_delta;
+  int64_t best_pred_rd[REFERENCE_MODES];
+} IntraModeSearchState;
+
+void av1_restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x);
+int av1_search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x,
+                            RD_STATS *this_rd_cost, PICK_MODE_CONTEXT *ctx,
+                            BLOCK_SIZE bsize, MB_MODE_INFO *const mbmi,
+                            PALETTE_MODE_INFO *const pmi,
+                            unsigned int *ref_costs_single,
+                            IntraModeSearchState *intra_search_state,
+                            int64_t best_rd);
+
+int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, int *skippable,
+                                    BLOCK_SIZE bsize, TX_SIZE max_tx_size);
+
+int64_t av1_handle_intra_mode(IntraModeSearchState *intra_search_state,
+                              const AV1_COMP *cpi, MACROBLOCK *x,
+                              BLOCK_SIZE bsize, int ref_frame_cost,
+                              const PICK_MODE_CONTEXT *ctx, int disable_skip,
+                              RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+                              RD_STATS *rd_stats_uv, int64_t best_rd,
+                              int64_t *best_intra_rd, int8_t best_mbmode_skip);
+
+int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                   int *rate, int *rate_tokenonly,
+                                   int64_t *distortion, int *skippable,
+                                   BLOCK_SIZE bsize, int64_t best_rd,
+                                   PICK_MODE_CONTEXT *ctx);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_INTRA_MODE_SEARCH_H_
diff --git a/libs/libaom/src/av1/encoder/k_means_template.h b/libs/libaom/src/av1/encoder/k_means_template.h
new file mode 100644
index 000000000..9e526b88b
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/k_means_template.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#ifndef AV1_K_MEANS_DIM
+#error "This template requires AV1_K_MEANS_DIM to be defined"
+#endif
+
+#define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
+#define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
+
+static int RENAME(calc_dist)(const int *p1, const int *p2) {
+  int dist = 0;
+  for (int i = 0; i < AV1_K_MEANS_DIM; ++i) {
+    const int diff = p1[i] - p2[i];
+    dist += diff * diff;
+  }
+  return dist;
+}
+
+void RENAME(av1_calc_indices)(const int *data, const int *centroids,
+                              uint8_t *indices, int n, int k) {
+  for (int i = 0; i < n; ++i) {
+    int min_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM, centroids);
+    indices[i] = 0;
+    for (int j = 1; j < k; ++j) {
+      const int this_dist = RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
+                                              centroids + j * AV1_K_MEANS_DIM);
+      if (this_dist < min_dist) {
+        min_dist = this_dist;
+        indices[i] = j;
+      }
+    }
+  }
+}
+
+static void RENAME(calc_centroids)(const int *data, int *centroids,
+                                   const uint8_t *indices, int n, int k) {
+  int i, j;
+  int count[PALETTE_MAX_SIZE] = { 0 };
+  unsigned int rand_state = (unsigned int)data[0];
+  assert(n <= 32768);
+  memset(centroids, 0, sizeof(centroids[0]) * k * AV1_K_MEANS_DIM);
+
+  for (i = 0; i < n; ++i) {
+    const int index = indices[i];
+    assert(index < k);
+    ++count[index];
+    for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
+      centroids[index * AV1_K_MEANS_DIM + j] += data[i * AV1_K_MEANS_DIM + j];
+    }
+  }
+
+  for (i = 0; i < k; ++i) {
+    if (count[i] == 0) {
+      memcpy(centroids + i * AV1_K_MEANS_DIM,
+             data + (lcg_rand16(&rand_state) % n) * AV1_K_MEANS_DIM,
+             sizeof(centroids[0]) * AV1_K_MEANS_DIM);
+    } else {
+      for (j = 0; j < AV1_K_MEANS_DIM; ++j) {
+        centroids[i * AV1_K_MEANS_DIM + j] =
+            DIVIDE_AND_ROUND(centroids[i * AV1_K_MEANS_DIM + j], count[i]);
+      }
+    }
+  }
+}
+
+static int64_t RENAME(calc_total_dist)(const int *data, const int *centroids,
+                                       const uint8_t *indices, int n, int k) {
+  int64_t dist = 0;
+  (void)k;
+  for (int i = 0; i < n; ++i) {
+    dist += RENAME(calc_dist)(data + i * AV1_K_MEANS_DIM,
+                              centroids + indices[i] * AV1_K_MEANS_DIM);
+  }
+  return dist;
+}
+
+void RENAME(av1_k_means)(const int *data, int *centroids, uint8_t *indices,
+                         int n, int k, int max_itr) {
+  int pre_centroids[2 * PALETTE_MAX_SIZE];
+  uint8_t pre_indices[MAX_SB_SQUARE];
+
+  RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+  int64_t this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
+
+  for (int i = 0; i < max_itr; ++i) {
+    const int64_t pre_dist = this_dist;
+    memcpy(pre_centroids, centroids,
+           sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM);
+    memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
+
+    RENAME(calc_centroids)(data, centroids, indices, n, k);
+    RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+    this_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
+
+    if (this_dist > pre_dist) {
+      memcpy(centroids, pre_centroids,
+             sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM);
+      memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
+      break;
+    }
+    if (!memcmp(centroids, pre_centroids,
+                sizeof(pre_centroids[0]) * k * AV1_K_MEANS_DIM))
+      break;
+  }
+}
+#undef RENAME_
+#undef RENAME
diff --git a/libs/libaom/src/av1/encoder/level.c b/libs/libaom/src/av1/encoder/level.c
new file mode 100644
index 000000000..3403a3a84
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/level.c
@@ -0,0 +1,1184 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_ports/system_state.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/level.h" + +#define UNDEFINED_LEVEL \ + { \ + .level = SEQ_LEVEL_MAX, .max_picture_size = 0, .max_h_size = 0, \ + .max_v_size = 0, .max_display_rate = 0, .max_decode_rate = 0, \ + .max_header_rate = 0, .main_mbps = 0, .high_mbps = 0, .main_cr = 0, \ + .high_cr = 0, .max_tiles = 0, .max_tile_cols = 0 \ + } + +static const AV1LevelSpec av1_level_defs[SEQ_LEVELS] = { + { .level = SEQ_LEVEL_2_0, + .max_picture_size = 147456, + .max_h_size = 2048, + .max_v_size = 1152, + .max_display_rate = 4423680L, + .max_decode_rate = 5529600L, + .max_header_rate = 150, + .main_mbps = 1.5, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 8, + .max_tile_cols = 4 }, + { .level = SEQ_LEVEL_2_1, + .max_picture_size = 278784, + .max_h_size = 2816, + .max_v_size = 1584, + .max_display_rate = 8363520L, + .max_decode_rate = 10454400L, + .max_header_rate = 150, + .main_mbps = 3.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 8, + .max_tile_cols = 4 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_3_0, + .max_picture_size = 665856, + .max_h_size = 4352, + .max_v_size = 2448, + .max_display_rate = 19975680L, + .max_decode_rate = 24969600L, + .max_header_rate = 150, + .main_mbps = 6.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 16, + .max_tile_cols = 6 }, + { .level = SEQ_LEVEL_3_1, + .max_picture_size = 1065024, + .max_h_size = 5504, + .max_v_size = 3096, + .max_display_rate = 31950720L, + .max_decode_rate = 39938400L, + .max_header_rate = 150, + .main_mbps = 10.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 16, + .max_tile_cols = 6 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_4_0, + .max_picture_size = 2359296, + .max_h_size = 6144, + .max_v_size = 3456, + .max_display_rate = 70778880L, + .max_decode_rate = 77856768L, + .max_header_rate = 300, + .main_mbps = 12.0, + .high_mbps = 30.0, + .main_cr = 4.0, + .high_cr = 4.0, + .max_tiles = 32, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_4_1, + .max_picture_size = 2359296, + .max_h_size = 6144, + .max_v_size = 3456, + .max_display_rate = 141557760L, + .max_decode_rate = 155713536L, + .max_header_rate = 300, + .main_mbps = 20.0, + .high_mbps = 50.0, + .main_cr = 4.0, + .high_cr = 4.0, + .max_tiles = 32, + .max_tile_cols = 8 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_5_0, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 267386880L, + .max_decode_rate = 273715200L, + .max_header_rate = 300, + .main_mbps = 30.0, + .high_mbps = 100.0, + .main_cr = 6.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_1, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 534773760L, + .max_decode_rate = 547430400L, + .max_header_rate = 300, + .main_mbps = 40.0, + .high_mbps = 160.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols 
= 8 }, + { .level = SEQ_LEVEL_5_2, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 1069547520L, + .max_decode_rate = 1094860800L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_3, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 1069547520L, + .max_decode_rate = 1176502272L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_6_0, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 1069547520L, + .max_decode_rate = 1176502272L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_1, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 2139095040L, + .max_decode_rate = 2189721600L, + .max_header_rate = 300, + .main_mbps = 100.0, + .high_mbps = 480.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_2, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 4278190080L, + .max_decode_rate = 4379443200L, + .max_header_rate = 300, + .main_mbps = 160.0, + .high_mbps = 800.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_3, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 4278190080L, + .max_decode_rate = 4706009088L, + .max_header_rate = 300, + .main_mbps = 160.0, + .high_mbps = 800.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, +}; + +typedef enum { + LUMA_PIC_SIZE_TOO_LARGE, + LUMA_PIC_H_SIZE_TOO_LARGE, + LUMA_PIC_V_SIZE_TOO_LARGE, + LUMA_PIC_H_SIZE_TOO_SMALL, + LUMA_PIC_V_SIZE_TOO_SMALL, + TOO_MANY_TILE_COLUMNS, + TOO_MANY_TILES, + TILE_RATE_TOO_HIGH, + TILE_TOO_LARGE, + SUPERRES_TILE_WIDTH_TOO_LARGE, + CROPPED_TILE_WIDTH_TOO_SMALL, + CROPPED_TILE_HEIGHT_TOO_SMALL, + TILE_WIDTH_INVALID, + FRAME_HEADER_RATE_TOO_HIGH, + DISPLAY_RATE_TOO_HIGH, + DECODE_RATE_TOO_HIGH, + CR_TOO_SMALL, + TILE_SIZE_HEADER_RATE_TOO_HIGH, + BITRATE_TOO_HIGH, + DECODER_MODEL_FAIL, + + TARGET_LEVEL_FAIL_IDS, + TARGET_LEVEL_OK, +} TARGET_LEVEL_FAIL_ID; + +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { + "The picture size is too large.", + "The picture width is too large.", + "The picture height is too large.", + "The picture width is too small.", + "The picture height is too small.", + "Too many tile columns are used.", + "Too many tiles are used.", + "The tile rate is too high.", + "The tile size is too large.", + "The superres tile width is too large.", + "The cropped tile width is less than 8.", + "The cropped tile height is less than 8.", + "The tile width is invalid.", + "The frame header rate is too high.", + "The display luma sample rate is too high.", + "The decoded luma sample rate is too high.", + "The compression ratio is too small.", + "The product of max tile size and header rate is too high.", + "The bitrate is too high.", + "The decoder model fails.", +}; + +static double get_max_bitrate(const AV1LevelSpec *const 
+static double get_max_bitrate(const AV1LevelSpec *const level_spec, int tier,
+                              BITSTREAM_PROFILE profile) {
+  if (level_spec->level < SEQ_LEVEL_4_0) tier = 0;
+  const double bitrate_basis =
+      (tier ? level_spec->high_mbps : level_spec->main_mbps) * 1e6;
+  const double bitrate_profile_factor =
+      profile == PROFILE_0 ? 1.0 : (profile == PROFILE_1 ? 2.0 : 3.0);
+  return bitrate_basis * bitrate_profile_factor;
+}
+
+double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
+                                     BITSTREAM_PROFILE profile) {
+  assert(is_valid_seq_level_idx(level_index));
+  return get_max_bitrate(&av1_level_defs[level_index], tier, profile);
+}
+
+void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles,
+                                 int *const max_tile_cols) {
+  assert(is_valid_seq_level_idx(level_index));
+  const AV1LevelSpec *const level_spec = &av1_level_defs[level_index];
+  *max_tiles = level_spec->max_tiles;
+  *max_tile_cols = level_spec->max_tile_cols;
+}
+
+// We assume time t to be valid if and only if t >= 0.0.
+// So INVALID_TIME can be defined as anything less than 0.
+#define INVALID_TIME (-1.0)
+
+// This corresponds to "free_buffer" in the spec.
+static void release_buffer(DECODER_MODEL *const decoder_model, int idx) {
+  assert(idx >= 0 && idx < BUFFER_POOL_MAX_SIZE);
+  FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx];
+  this_buffer->decoder_ref_count = 0;
+  this_buffer->player_ref_count = 0;
+  this_buffer->display_index = -1;
+  this_buffer->presentation_time = INVALID_TIME;
+}
+
+static void initialize_buffer_pool(DECODER_MODEL *const decoder_model) {
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    release_buffer(decoder_model, i);
+  }
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    decoder_model->vbi[i] = -1;
+  }
+}
+
+static int get_free_buffer(DECODER_MODEL *const decoder_model) {
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    if (this_buffer->decoder_ref_count == 0 &&
+        this_buffer->player_ref_count == 0)
+      return i;
+  }
+  return -1;
+}
+
+static void update_ref_buffers(DECODER_MODEL *const decoder_model, int idx,
+                               int refresh_frame_flags) {
+  FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[idx];
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    if (refresh_frame_flags & (1 << i)) {
+      const int pre_idx = decoder_model->vbi[i];
+      if (pre_idx != -1) {
+        --decoder_model->frame_buffer_pool[pre_idx].decoder_ref_count;
+      }
+      decoder_model->vbi[i] = idx;
+      ++this_buffer->decoder_ref_count;
+    }
+  }
+}
+
+// The time (in seconds) required to decode a frame.
+static double time_to_decode_frame(const AV1_COMMON *const cm,
+                                   int64_t max_decode_rate) {
+  if (cm->show_existing_frame) return 0.0;
+
+  const FRAME_TYPE frame_type = cm->current_frame.frame_type;
+  int luma_samples = 0;
+  if (frame_type == KEY_FRAME || frame_type == INTRA_ONLY_FRAME) {
+    luma_samples = cm->superres_upscaled_width * cm->height;
+  } else {
+    const int spatial_layer_dimensions_present_flag = 0;
+    if (spatial_layer_dimensions_present_flag) {
+      assert(0 && "Spatial layer dimensions not supported yet.");
+    } else {
+      const SequenceHeader *const seq_params = &cm->seq_params;
+      const int max_frame_width = seq_params->max_frame_width;
+      const int max_frame_height = seq_params->max_frame_height;
+      luma_samples = max_frame_width * max_frame_height;
+    }
+  }
+
+  return luma_samples / (double)max_decode_rate;
+}
+
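In update_ref_buffers() above, refresh_frame_flags is a bitmask over the REF_FRAMES (8) reference slots: bit i set means slot i is repointed at the newly decoded buffer. A self-contained sketch of that slot-update rule, with hypothetical names (illustrative, not part of the patch):

#include <stdio.h>

#define EXAMPLE_REF_FRAMES 8

/* Point every slot whose bit is set in the mask at buffer `idx`. */
static void example_update_slots(int vbi[EXAMPLE_REF_FRAMES], int idx,
                                 int refresh_frame_flags) {
  for (int i = 0; i < EXAMPLE_REF_FRAMES; ++i) {
    if (refresh_frame_flags & (1 << i)) vbi[i] = idx;
  }
}

int main(void) {
  int vbi[EXAMPLE_REF_FRAMES] = { -1, -1, -1, -1, -1, -1, -1, -1 };
  example_update_slots(vbi, 3, 0xFF); /* 0xFF refreshes all 8 slots. */
  example_update_slots(vbi, 5, 0x01); /* 0x01 refreshes slot 0 only. */
  printf("%d %d\n", vbi[0], vbi[1]);  /* Prints: 5 3 */
  return 0;
}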
+// Release frame buffers that are no longer needed for decode or display.
+// It corresponds to "start_decode_at_removal_time" in the spec.
+static void release_processed_frames(DECODER_MODEL *const decoder_model,
+                                     double removal_time) {
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    FRAME_BUFFER *const this_buffer = &decoder_model->frame_buffer_pool[i];
+    if (this_buffer->player_ref_count > 0) {
+      if (this_buffer->presentation_time >= 0.0 &&
+          this_buffer->presentation_time <= removal_time) {
+        this_buffer->player_ref_count = 0;
+        if (this_buffer->decoder_ref_count == 0) {
+          release_buffer(decoder_model, i);
+        }
+      }
+    }
+  }
+}
+
+static int frames_in_buffer_pool(const DECODER_MODEL *const decoder_model) {
+  int frames_in_pool = 0;
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    if (this_buffer->decoder_ref_count > 0 ||
+        this_buffer->player_ref_count > 0) {
+      ++frames_in_pool;
+    }
+  }
+  return frames_in_pool;
+}
+
+static double get_presentation_time(const DECODER_MODEL *const decoder_model,
+                                    int display_index) {
+  if (decoder_model->mode == SCHEDULE_MODE) {
+    assert(0 && "SCHEDULE_MODE NOT SUPPORTED");
+    return INVALID_TIME;
+  } else {
+    const double initial_presentation_delay =
+        decoder_model->initial_presentation_delay;
+    // Can't decide presentation time until the initial presentation delay is
+    // known.
+    if (initial_presentation_delay < 0.0) return INVALID_TIME;
+
+    return initial_presentation_delay +
+           display_index * decoder_model->num_ticks_per_picture *
+               decoder_model->display_clock_tick;
+  }
+}
+
+#define MAX_TIME 1e16
+double time_next_buffer_is_free(const DECODER_MODEL *const decoder_model) {
+  if (decoder_model->num_decoded_frame == 0) {
+    return (double)decoder_model->decoder_buffer_delay / 90000.0;
+  }
+
+  double buf_free_time = MAX_TIME;
+  for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    if (this_buffer->decoder_ref_count == 0) {
+      if (this_buffer->player_ref_count == 0) {
+        return decoder_model->current_time;
+      }
+      const double presentation_time = this_buffer->presentation_time;
+      if (presentation_time >= 0.0 && presentation_time < buf_free_time) {
+        buf_free_time = presentation_time;
+      }
+    }
+  }
+  return buf_free_time < MAX_TIME ? buf_free_time : INVALID_TIME;
+}
+#undef MAX_TIME
+
+static double get_removal_time(const DECODER_MODEL *const decoder_model) {
+  if (decoder_model->mode == SCHEDULE_MODE) {
+    assert(0 && "SCHEDULE_MODE IS NOT SUPPORTED YET");
+    return INVALID_TIME;
+  } else {
+    return time_next_buffer_is_free(decoder_model);
+  }
+}
+
+void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model) {
+  printf(
+      "\n status %d, num_frame %3d, num_decoded_frame %3d, "
+      "num_shown_frame %3d, current time %6.2f, frames in buffer %2d, "
+      "presentation delay %6.2f, total interval %6.2f\n",
+      decoder_model->status, decoder_model->num_frame,
+      decoder_model->num_decoded_frame, decoder_model->num_shown_frame,
+      decoder_model->current_time, frames_in_buffer_pool(decoder_model),
+      decoder_model->initial_presentation_delay,
+      decoder_model->dfg_interval_queue.total_interval);
+  for (int i = 0; i < 10; ++i) {
+    const FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[i];
+    printf("buffer %d, decode count %d, display count %d, present time %6.4f\n",
+           i, this_buffer->decoder_ref_count, this_buffer->player_ref_count,
+           this_buffer->presentation_time);
+  }
+}
+
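Buffer delays in this model are expressed in ticks of a 90 kHz clock, which is why time_next_buffer_is_free() above divides by 90000.0. A quick sketch of the conversion, using the constants that av1_decoder_model_init() below hard-codes (names here are hypothetical):

#include <stdio.h>

int main(void) {
  /* decoder_buffer_delay = 70000 ticks of 1/90000 s, as set below. */
  const int decoder_buffer_delay = 70000;
  const double first_removal_time = (double)decoder_buffer_delay / 90000.0;
  printf("first removal at %.4f s\n", first_removal_time); /* ~0.7778 s */
  return 0;
}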
+// op_index is the operating point index.
+void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level,
+                            int op_index, DECODER_MODEL *const decoder_model) {
+  aom_clear_system_state();
+
+  decoder_model->status = DECODER_MODEL_OK;
+  decoder_model->level = level;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  decoder_model->bit_rate = get_max_bitrate(
+      av1_level_defs + level, seq_params->tier[op_index], seq_params->profile);
+
+  // TODO(huisu or anyone): implement SCHEDULE_MODE.
+  decoder_model->mode = RESOURCE_MODE;
+  decoder_model->encoder_buffer_delay = 20000;
+  decoder_model->decoder_buffer_delay = 70000;
+  decoder_model->is_low_delay_mode = false;
+
+  decoder_model->first_bit_arrival_time = 0.0;
+  decoder_model->last_bit_arrival_time = 0.0;
+  decoder_model->coded_bits = 0;
+
+  decoder_model->removal_time = INVALID_TIME;
+  decoder_model->presentation_time = INVALID_TIME;
+  decoder_model->decode_samples = 0;
+  decoder_model->display_samples = 0;
+  decoder_model->max_decode_rate = 0.0;
+  decoder_model->max_display_rate = 0.0;
+
+  decoder_model->num_frame = -1;
+  decoder_model->num_decoded_frame = -1;
+  decoder_model->num_shown_frame = -1;
+  decoder_model->current_time = 0.0;
+
+  initialize_buffer_pool(decoder_model);
+
+  DFG_INTERVAL_QUEUE *const dfg_interval_queue =
+      &decoder_model->dfg_interval_queue;
+  dfg_interval_queue->total_interval = 0.0;
+  dfg_interval_queue->head = 0;
+  dfg_interval_queue->size = 0;
+
+  if (seq_params->timing_info_present) {
+    decoder_model->num_ticks_per_picture =
+        seq_params->timing_info.num_ticks_per_picture;
+    decoder_model->display_clock_tick =
+        seq_params->timing_info.num_units_in_display_tick /
+        seq_params->timing_info.time_scale;
+  } else {
+    decoder_model->num_ticks_per_picture = 1;
+    decoder_model->display_clock_tick = 1.0 / cpi->framerate;
+  }
+
+  decoder_model->initial_display_delay =
+      seq_params->op_params[op_index].initial_display_delay;
+  decoder_model->initial_presentation_delay = INVALID_TIME;
+  decoder_model->decode_rate = av1_level_defs[level].max_decode_rate;
+}
+
+void av1_decoder_model_process_frame(const AV1_COMP *const cpi,
+                                     size_t coded_bits,
+                                     DECODER_MODEL *const decoder_model) {
+  if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) return;
+
+  aom_clear_system_state();
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int luma_pic_size = cm->superres_upscaled_width * cm->height;
+  const int show_existing_frame = cm->show_existing_frame;
+  const int show_frame = cm->show_frame || show_existing_frame;
+  ++decoder_model->num_frame;
+  if (!show_existing_frame) ++decoder_model->num_decoded_frame;
+  if (show_frame) ++decoder_model->num_shown_frame;
+  decoder_model->coded_bits += coded_bits;
+
+  int display_idx = -1;
+  if (show_existing_frame) {
+    display_idx = decoder_model->vbi[cpi->existing_fb_idx_to_show];
+    if (display_idx < 0) {
+      decoder_model->status = DECODE_EXISTING_FRAME_BUF_EMPTY;
+      return;
+    }
+    if (decoder_model->frame_buffer_pool[display_idx].frame_type == KEY_FRAME) {
+      update_ref_buffers(decoder_model, display_idx, 0xFF);
+    }
+  } else {
+    const double removal_time = get_removal_time(decoder_model);
+    if (removal_time < 0.0) {
+      decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
+      return;
+    }
+
+    const int previous_decode_samples = decoder_model->decode_samples;
+    const double previous_removal_time = decoder_model->removal_time;
+    assert(previous_removal_time < removal_time);
+    decoder_model->removal_time = removal_time;
+    decoder_model->decode_samples = luma_pic_size;
+    const double this_decode_rate =
+        previous_decode_samples / (removal_time - previous_removal_time);
+    decoder_model->max_decode_rate =
+        AOMMAX(decoder_model->max_decode_rate, this_decode_rate);
+
+    // A frame with show_existing_frame being false indicates the end of a DFG.
+    // Update the bits arrival time of this DFG.
+    const double buffer_delay = (decoder_model->encoder_buffer_delay +
+                                 decoder_model->decoder_buffer_delay) /
+                                90000.0;
+    const double latest_arrival_time = removal_time - buffer_delay;
+    decoder_model->first_bit_arrival_time =
+        AOMMAX(decoder_model->last_bit_arrival_time, latest_arrival_time);
+    decoder_model->last_bit_arrival_time =
+        decoder_model->first_bit_arrival_time +
+        (double)decoder_model->coded_bits / decoder_model->bit_rate;
+    // Smoothing buffer underflows if the last bit arrives after the removal
+    // time.
+    if (decoder_model->last_bit_arrival_time > removal_time &&
+        !decoder_model->is_low_delay_mode) {
+      decoder_model->status = SMOOTHING_BUFFER_UNDERFLOW;
+      return;
+    }
+    // Reset the coded bits for the next DFG.
+    decoder_model->coded_bits = 0;
+
+    // Check if the smoothing buffer overflows.
+    DFG_INTERVAL_QUEUE *const queue = &decoder_model->dfg_interval_queue;
+    if (queue->size >= DFG_INTERVAL_QUEUE_SIZE) {
+      assert(0);
+    }
+    const double first_bit_arrival_time =
+        decoder_model->first_bit_arrival_time;
+    const double last_bit_arrival_time = decoder_model->last_bit_arrival_time;
+    // Remove the DFGs with removal time earlier than last_bit_arrival_time.
+    while (queue->buf[queue->head].removal_time <= last_bit_arrival_time &&
+           queue->size > 0) {
+      if (queue->buf[queue->head].removal_time - first_bit_arrival_time +
+              queue->total_interval >
+          1.0) {
+        decoder_model->status = SMOOTHING_BUFFER_OVERFLOW;
+        return;
+      }
+      queue->total_interval -= queue->buf[queue->head].last_bit_arrival_time -
+                               queue->buf[queue->head].first_bit_arrival_time;
+      queue->head = (queue->head + 1) % DFG_INTERVAL_QUEUE_SIZE;
+      --queue->size;
+    }
+    // Push current DFG into the queue.
+    const int queue_index =
+        (queue->head + queue->size++) % DFG_INTERVAL_QUEUE_SIZE;
+    queue->buf[queue_index].first_bit_arrival_time = first_bit_arrival_time;
+    queue->buf[queue_index].last_bit_arrival_time = last_bit_arrival_time;
+    queue->buf[queue_index].removal_time = removal_time;
+    queue->total_interval += last_bit_arrival_time - first_bit_arrival_time;
+    // The smoothing buffer can hold at most "bit_rate" bits, which is
+    // equivalent to 1 second of total interval.
+    if (queue->total_interval > 1.0) {
+      decoder_model->status = SMOOTHING_BUFFER_OVERFLOW;
+      return;
+    }
+
+    release_processed_frames(decoder_model, removal_time);
+    decoder_model->current_time =
+        removal_time + time_to_decode_frame(cm, decoder_model->decode_rate);
+
+    const int cfbi = get_free_buffer(decoder_model);
+    if (cfbi < 0) {
+      decoder_model->status = DECODE_FRAME_BUF_UNAVAILABLE;
+      return;
+    }
+    const CurrentFrame *const current_frame = &cm->current_frame;
+    decoder_model->frame_buffer_pool[cfbi].frame_type =
+        cm->current_frame.frame_type;
+    display_idx = cfbi;
+    update_ref_buffers(decoder_model, cfbi, current_frame->refresh_frame_flags);
+
+    if (decoder_model->initial_presentation_delay < 0.0) {
+      // Display can begin after required number of frames have been buffered.
+      if (frames_in_buffer_pool(decoder_model) >=
+          decoder_model->initial_display_delay) {
+        decoder_model->initial_presentation_delay =
+            decoder_model->current_time;
+        // Update presentation time for each shown frame in the frame buffer.
+        for (int i = 0; i < BUFFER_POOL_MAX_SIZE; ++i) {
+          FRAME_BUFFER *const this_buffer =
+              &decoder_model->frame_buffer_pool[i];
+          if (this_buffer->player_ref_count == 0) continue;
+          assert(this_buffer->display_index >= 0);
+          this_buffer->presentation_time =
+              get_presentation_time(decoder_model, this_buffer->display_index);
+        }
+      }
+    }
+  }
+
+  // Display.
+  if (show_frame) {
+    assert(display_idx >= 0 && display_idx < BUFFER_POOL_MAX_SIZE);
+    FRAME_BUFFER *const this_buffer =
+        &decoder_model->frame_buffer_pool[display_idx];
+    ++this_buffer->player_ref_count;
+    this_buffer->display_index = decoder_model->num_shown_frame;
+    const double presentation_time =
+        get_presentation_time(decoder_model, this_buffer->display_index);
+    this_buffer->presentation_time = presentation_time;
+    if (presentation_time >= 0.0 &&
+        decoder_model->current_time > presentation_time) {
+      decoder_model->status = DISPLAY_FRAME_LATE;
+      return;
+    }
+
+    const int previous_display_samples = decoder_model->display_samples;
+    const double previous_presentation_time = decoder_model->presentation_time;
+    decoder_model->display_samples = luma_pic_size;
+    decoder_model->presentation_time = presentation_time;
+    if (presentation_time >= 0.0 && previous_presentation_time >= 0.0) {
+      assert(previous_presentation_time < presentation_time);
+      const double this_display_rate =
+          previous_display_samples /
+          (presentation_time - previous_presentation_time);
+      decoder_model->max_display_rate =
+          AOMMAX(decoder_model->max_display_rate, this_display_rate);
+    }
+  }
+}
+
+void av1_init_level_info(AV1_COMP *cpi) {
+  for (int op_index = 0; op_index < MAX_NUM_OPERATING_POINTS; ++op_index) {
+    AV1LevelInfo *const this_level_info =
+        cpi->level_params.level_info[op_index];
+    if (!this_level_info) continue;
+    memset(this_level_info, 0, sizeof(*this_level_info));
+    AV1LevelSpec *const level_spec = &this_level_info->level_spec;
+    level_spec->level = SEQ_LEVEL_MAX;
+    AV1LevelStats *const level_stats = &this_level_info->level_stats;
+    level_stats->min_cropped_tile_width = INT_MAX;
+    level_stats->min_cropped_tile_height = INT_MAX;
+    level_stats->min_frame_width = INT_MAX;
+    level_stats->min_frame_height = INT_MAX;
+    level_stats->tile_width_is_valid = 1;
+    level_stats->min_cr = 1e8;
+
+    FrameWindowBuffer *const frame_window_buffer =
+        &this_level_info->frame_window_buffer;
+    frame_window_buffer->num = 0;
+    frame_window_buffer->start = 0;
+
+    const AV1_COMMON *const cm = &cpi->common;
+    const int upscaled_width = cm->superres_upscaled_width;
+    const int height = cm->height;
+    const int pic_size = upscaled_width * height;
+    for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) {
+      DECODER_MODEL *const this_model =
+          &this_level_info->decoder_models[level];
+      const AV1LevelSpec *const spec = &av1_level_defs[level];
+      if (upscaled_width > spec->max_h_size || height > spec->max_v_size ||
+          pic_size > spec->max_picture_size) {
+        // Turn off decoder model for this level as the frame size already
+        // exceeds level constraints.
+        this_model->status = DECODER_MODEL_DISABLED;
+      } else {
+        av1_decoder_model_init(cpi, level, op_index, this_model);
+      }
+    }
+  }
+}
+
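get_min_cr() below scales the tier's base compression ratio by how fast samples are decoded relative to the level's maximum display rate, with a floor of 0.8. A worked sketch with hypothetical names (illustrative only; figures taken from the SEQ_LEVEL_5_1 entry above):

#include <stdio.h>

static double example_min_cr(double cr_basis, double decoded_sample_rate,
                             double max_display_rate) {
  const double speed_adj = decoded_sample_rate / max_display_rate;
  const double min_cr = cr_basis * speed_adj;
  return min_cr > 0.8 ? min_cr : 0.8; /* 0.8 floor, as in get_min_cr() */
}

int main(void) {
  /* SEQ_LEVEL_5_1 Main tier: main_cr = 8.0, max_display_rate = 534773760,
     max_decode_rate = 547430400; result is about 8.189. */
  printf("%.3f\n", example_min_cr(8.0, 547430400.0, 534773760.0));
  return 0;
}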
+static double get_min_cr(const AV1LevelSpec *const level_spec, int tier,
+                         int is_still_picture, int64_t decoded_sample_rate) {
+  if (is_still_picture) return 0.8;
+  if (level_spec->level < SEQ_LEVEL_4_0) tier = 0;
+  const double min_cr_basis = tier ? level_spec->high_cr : level_spec->main_cr;
+  const double speed_adj =
+      (double)decoded_sample_rate / level_spec->max_display_rate;
+  return AOMMAX(min_cr_basis * speed_adj, 0.8);
+}
+
+double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier,
+                                int is_still_picture) {
+  assert(is_valid_seq_level_idx(level_index));
+  const AV1LevelSpec *const level_spec = &av1_level_defs[level_index];
+  return get_min_cr(level_spec, tier, is_still_picture,
+                    level_spec->max_decode_rate);
+}
+
+static void get_temporal_parallel_params(int scalability_mode_idc,
+                                         int *temporal_parallel_num,
+                                         int *temporal_parallel_denom) {
+  if (scalability_mode_idc < 0) {
+    *temporal_parallel_num = 1;
+    *temporal_parallel_denom = 1;
+    return;
+  }
+
+  // TODO(huisu@): handle scalability cases.
+  if (scalability_mode_idc == SCALABILITY_SS) {
+    (void)scalability_mode_idc;
+  } else {
+    (void)scalability_mode_idc;
+  }
+}
+
+#define MAX_TILE_SIZE (4096 * 2304)
+#define MIN_CROPPED_TILE_WIDTH 8
+#define MIN_CROPPED_TILE_HEIGHT 8
+#define MIN_FRAME_WIDTH 16
+#define MIN_FRAME_HEIGHT 16
+#define MAX_TILE_SIZE_HEADER_RATE_PRODUCT 588251136
+
+static TARGET_LEVEL_FAIL_ID check_level_constraints(
+    const AV1LevelInfo *const level_info, AV1_LEVEL level, int tier,
+    int is_still_picture, BITSTREAM_PROFILE profile, int check_bitrate) {
+  const DECODER_MODEL *const decoder_model =
+      &level_info->decoder_models[level];
+  const DECODER_MODEL_STATUS decoder_model_status = decoder_model->status;
+  if (decoder_model_status != DECODER_MODEL_OK &&
+      decoder_model_status != DECODER_MODEL_DISABLED) {
+    return DECODER_MODEL_FAIL;
+  }
+
+  const AV1LevelSpec *const level_spec = &level_info->level_spec;
+  const AV1LevelSpec *const target_level_spec = &av1_level_defs[level];
+  const AV1LevelStats *const level_stats = &level_info->level_stats;
+  TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK;
+  do {
+    if (level_spec->max_picture_size > target_level_spec->max_picture_size) {
+      fail_id = LUMA_PIC_SIZE_TOO_LARGE;
+      break;
+    }
+
+    if (level_spec->max_h_size > target_level_spec->max_h_size) {
+      fail_id = LUMA_PIC_H_SIZE_TOO_LARGE;
+      break;
+    }
+
+    if (level_spec->max_v_size > target_level_spec->max_v_size) {
+      fail_id = LUMA_PIC_V_SIZE_TOO_LARGE;
+      break;
+    }
+
+    if (level_spec->max_tile_cols > target_level_spec->max_tile_cols) {
+      fail_id = TOO_MANY_TILE_COLUMNS;
+      break;
+    }
+
+    if (level_spec->max_tiles > target_level_spec->max_tiles) {
+      fail_id = TOO_MANY_TILES;
+      break;
+    }
+
+    if (level_spec->max_header_rate > target_level_spec->max_header_rate) {
+      fail_id = FRAME_HEADER_RATE_TOO_HIGH;
+      break;
+    }
+
+    if (decoder_model->max_display_rate >
+        (double)target_level_spec->max_display_rate) {
+      fail_id = DISPLAY_RATE_TOO_HIGH;
+      break;
+    }
+
+    // TODO(huisu): we are not using max decode rate calculated by the decoder
+    // model because the model in resource availability mode always returns
+    // MaxDecodeRate (as in the level definitions) as the max decode rate.
+    if (level_spec->max_decode_rate > target_level_spec->max_decode_rate) {
+      fail_id = DECODE_RATE_TOO_HIGH;
+      break;
+    }
+
+    if (level_spec->max_tile_rate > target_level_spec->max_tiles * 120) {
+      fail_id = TILE_RATE_TOO_HIGH;
+      break;
+    }
+
+    if (level_stats->max_tile_size > MAX_TILE_SIZE) {
+      fail_id = TILE_TOO_LARGE;
+      break;
+    }
+
+    if (level_stats->max_superres_tile_width > MAX_TILE_WIDTH) {
+      fail_id = SUPERRES_TILE_WIDTH_TOO_LARGE;
+      break;
+    }
+
+    if (level_stats->min_cropped_tile_width < MIN_CROPPED_TILE_WIDTH) {
+      fail_id = CROPPED_TILE_WIDTH_TOO_SMALL;
+      break;
+    }
+
+    if (level_stats->min_cropped_tile_height < MIN_CROPPED_TILE_HEIGHT) {
+      fail_id = CROPPED_TILE_HEIGHT_TOO_SMALL;
+      break;
+    }
+
+    if (level_stats->min_frame_width < MIN_FRAME_WIDTH) {
+      fail_id = LUMA_PIC_H_SIZE_TOO_SMALL;
+      break;
+    }
+
+    if (level_stats->min_frame_height < MIN_FRAME_HEIGHT) {
+      fail_id = LUMA_PIC_V_SIZE_TOO_SMALL;
+      break;
+    }
+
+    if (!level_stats->tile_width_is_valid) {
+      fail_id = TILE_WIDTH_INVALID;
+      break;
+    }
+
+    const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture,
+                                     level_spec->max_decode_rate);
+    if (level_stats->min_cr < min_cr) {
+      fail_id = CR_TOO_SMALL;
+      break;
+    }
+
+    if (check_bitrate) {
+      // Check average bitrate instead of max_bitrate.
+      const double bitrate_limit =
+          get_max_bitrate(target_level_spec, tier, profile);
+      const double avg_bitrate = level_stats->total_compressed_size * 8.0 /
+                                 level_stats->total_time_encoded;
+      if (avg_bitrate > bitrate_limit) {
+        fail_id = BITRATE_TOO_HIGH;
+        break;
+      }
+    }
+
+    if (target_level_spec->level > SEQ_LEVEL_5_1) {
+      int temporal_parallel_num;
+      int temporal_parallel_denom;
+      const int scalability_mode_idc = -1;
+      get_temporal_parallel_params(scalability_mode_idc,
+                                   &temporal_parallel_num,
+                                   &temporal_parallel_denom);
+      const int val =
+          level_stats->max_tile_size * level_spec->max_header_rate *
+          temporal_parallel_denom / temporal_parallel_num;
+      if (val > MAX_TILE_SIZE_HEADER_RATE_PRODUCT) {
+        fail_id = TILE_SIZE_HEADER_RATE_TOO_HIGH;
+        break;
+      }
+    }
+  } while (0);
+
+  return fail_id;
+}
+
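get_tile_stats() below rescales each tile's coded width back to its upscaled (superres) width as tile_width * denominator / SCALE_NUMERATOR, where SCALE_NUMERATOR is 8 in libaom. A quick sketch of that arithmetic (hypothetical names, illustrative only):

#include <stdio.h>

#define EXAMPLE_SCALE_NUMERATOR 8 /* SCALE_NUMERATOR in libaom */

int main(void) {
  const int coded_tile_width = 1920;
  const int superres_denominator = 16; /* 2x horizontal downscale in coding */
  const int superres_tile_width =
      coded_tile_width * superres_denominator / EXAMPLE_SCALE_NUMERATOR;
  printf("%d\n", superres_tile_width); /* 3840, the upscaled width */
  return 0;
}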
+static void get_tile_stats(const AV1_COMMON *const cm,
+                           const TileDataEnc *const tile_data,
+                           int *max_tile_size, int *max_superres_tile_width,
+                           int *min_cropped_tile_width,
+                           int *min_cropped_tile_height,
+                           int *tile_width_valid) {
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  const int superres_scale_denominator = cm->superres_scale_denominator;
+
+  *max_tile_size = 0;
+  *max_superres_tile_width = 0;
+  *min_cropped_tile_width = INT_MAX;
+  *min_cropped_tile_height = INT_MAX;
+  *tile_width_valid = 1;
+
+  for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+    for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+      const TileInfo *const tile_info =
+          &tile_data[tile_row * cm->tiles.cols + tile_col].tile_info;
+      const int tile_width =
+          (tile_info->mi_col_end - tile_info->mi_col_start) * MI_SIZE;
+      const int tile_height =
+          (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE;
+      const int tile_size = tile_width * tile_height;
+      *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+
+      const int superres_tile_width =
+          tile_width * superres_scale_denominator / SCALE_NUMERATOR;
+      *max_superres_tile_width =
+          AOMMAX(*max_superres_tile_width, superres_tile_width);
+
+      const int cropped_tile_width =
+          cm->width - tile_info->mi_col_start * MI_SIZE;
+      const int cropped_tile_height =
+          cm->height - tile_info->mi_row_start * MI_SIZE;
+      *min_cropped_tile_width =
+          AOMMIN(*min_cropped_tile_width, cropped_tile_width);
+      *min_cropped_tile_height =
+          AOMMIN(*min_cropped_tile_height, cropped_tile_height);
+
+      const int is_right_most_tile =
+          tile_info->mi_col_end == cm->mi_params.mi_cols;
+      if (!is_right_most_tile) {
+        if (av1_superres_scaled(cm))
+          *tile_width_valid &= tile_width >= 128;
+        else
+          *tile_width_valid &= tile_width >= 64;
+      }
+    }
+  }
+}
+
+static int store_frame_record(int64_t ts_start, int64_t ts_end,
+                              size_t encoded_size, int pic_size,
+                              int frame_header_count, int tiles,
+                              int show_frame, int show_existing_frame,
+                              FrameWindowBuffer *const buffer) {
+  if (buffer->num < FRAME_WINDOW_SIZE) {
+    ++buffer->num;
+  } else {
+    buffer->start = (buffer->start + 1) % FRAME_WINDOW_SIZE;
+  }
+  const int new_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+  FrameRecord *const record = &buffer->buf[new_idx];
+  record->ts_start = ts_start;
+  record->ts_end = ts_end;
+  record->encoded_size_in_bytes = encoded_size;
+  record->pic_size = pic_size;
+  record->frame_header_count = frame_header_count;
+  record->tiles = tiles;
+  record->show_frame = show_frame;
+  record->show_existing_frame = show_existing_frame;
+
+  return new_idx;
+}
+
+// Count the number of frames encoded in the last "duration" ticks, in display
+// time.
+static int count_frames(const FrameWindowBuffer *const buffer,
+                        int64_t duration) {
+  const int current_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+  // Assume current frame is shown frame.
+  assert(buffer->buf[current_idx].show_frame);
+
+  const int64_t current_time = buffer->buf[current_idx].ts_end;
+  const int64_t time_limit = AOMMAX(current_time - duration, 0);
+  int num_frames = 1;
+  int index = current_idx - 1;
+  for (int i = buffer->num - 2; i >= 0; --i, --index, ++num_frames) {
+    if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+    const FrameRecord *const record = &buffer->buf[index];
+    if (!record->show_frame) continue;
+    const int64_t ts_start = record->ts_start;
+    if (ts_start < time_limit) break;
+  }
+
+  return num_frames;
+}
+
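store_frame_record() and count_frames() above treat FrameWindowBuffer as a ring: the newest record lives at (start + num - 1) % FRAME_WINDOW_SIZE, and once the window is full the oldest record is dropped by advancing start. A compact, self-contained sketch of that indexing with a shrunken window size (hypothetical names, illustrative only):

#include <stdio.h>

#define EXAMPLE_WINDOW 4 /* stand-in for FRAME_WINDOW_SIZE (256) */

int main(void) {
  int start = 0, num = 0;
  for (int frame = 0; frame < 6; ++frame) {
    if (num < EXAMPLE_WINDOW) ++num;
    else start = (start + 1) % EXAMPLE_WINDOW;
    const int newest = (start + num - 1) % EXAMPLE_WINDOW;
    printf("frame %d -> slot %d (start %d)\n", frame, newest, start);
  }
  /* Frames 4 and 5 wrap around to slots 0 and 1 as `start` advances. */
  return 0;
}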
+// Scan previously encoded frames and update level metrics accordingly.
+static void scan_past_frames(const FrameWindowBuffer *const buffer,
+                             int num_frames_to_scan,
+                             AV1LevelSpec *const level_spec,
+                             AV1LevelStats *const level_stats) {
+  const int num_frames_in_buffer = buffer->num;
+  int index = (buffer->start + num_frames_in_buffer - 1) % FRAME_WINDOW_SIZE;
+  int frame_headers = 0;
+  int tiles = 0;
+  int64_t display_samples = 0;
+  int64_t decoded_samples = 0;
+  size_t encoded_size_in_bytes = 0;
+  for (int i = 0; i < AOMMIN(num_frames_in_buffer, num_frames_to_scan); ++i) {
+    const FrameRecord *const record = &buffer->buf[index];
+    if (!record->show_existing_frame) {
+      frame_headers += record->frame_header_count;
+      decoded_samples += record->pic_size;
+    }
+    if (record->show_frame) {
+      display_samples += record->pic_size;
+    }
+    tiles += record->tiles;
+    encoded_size_in_bytes += record->encoded_size_in_bytes;
+    --index;
+    if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+  }
+  level_spec->max_header_rate =
+      AOMMAX(level_spec->max_header_rate, frame_headers);
+  // TODO(huisu): we can now compute max display rate with the decoder model,
+  // so these couple of lines can be removed. Keep them here for a while for
+  // debugging purposes.
+  level_spec->max_display_rate =
+      AOMMAX(level_spec->max_display_rate, display_samples);
+  level_spec->max_decode_rate =
+      AOMMAX(level_spec->max_decode_rate, decoded_samples);
+  level_spec->max_tile_rate = AOMMAX(level_spec->max_tile_rate, tiles);
+  level_stats->max_bitrate =
+      AOMMAX(level_stats->max_bitrate, (int)encoded_size_in_bytes * 8);
+}
+
+void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
+                           int64_t ts_end) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AV1LevelParams *const level_params = &cpi->level_params;
+
+  const int upscaled_width = cm->superres_upscaled_width;
+  const int width = cm->width;
+  const int height = cm->height;
+  const int tile_cols = cm->tiles.cols;
+  const int tile_rows = cm->tiles.rows;
+  const int tiles = tile_cols * tile_rows;
+  const int luma_pic_size = upscaled_width * height;
+  const int frame_header_count = level_params->frame_header_count;
+  const int show_frame = cm->show_frame;
+  const int show_existing_frame = cm->show_existing_frame;
+
+  int max_tile_size;
+  int min_cropped_tile_width;
+  int min_cropped_tile_height;
+  int max_superres_tile_width;
+  int tile_width_is_valid;
+  get_tile_stats(cm, cpi->tile_data, &max_tile_size, &max_superres_tile_width,
+                 &min_cropped_tile_width, &min_cropped_tile_height,
+                 &tile_width_is_valid);
+
+  aom_clear_system_state();
+  const double compression_ratio = av1_get_compression_ratio(cm, size);
+  const double total_time_encoded =
+      (cpi->time_stamps.prev_end_seen - cpi->time_stamps.first_ever) /
+      (double)TICKS_PER_SEC;
+
+  const int temporal_layer_id = cm->temporal_layer_id;
+  const int spatial_layer_id = cm->spatial_layer_id;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const BITSTREAM_PROFILE profile = seq_params->profile;
+  const int is_still_picture = seq_params->still_picture;
+  // update level_stats
+  // TODO(kyslov@) fix the implementation according to buffer model
+  for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; ++i) {
+    if (!is_in_operating_point(seq_params->operating_point_idc[i],
+                               temporal_layer_id, spatial_layer_id) ||
+        !((level_params->keep_level_stats >> i) & 1)) {
+      continue;
+    }
+
+    AV1LevelInfo *const level_info = level_params->level_info[i];
+    assert(level_info != NULL);
+    AV1LevelStats *const level_stats = &level_info->level_stats;
+
+    level_stats->max_tile_size =
+        AOMMAX(level_stats->max_tile_size, max_tile_size);
+    level_stats->max_superres_tile_width =
+        AOMMAX(level_stats->max_superres_tile_width, max_superres_tile_width);
+    level_stats->min_cropped_tile_width =
+        AOMMIN(level_stats->min_cropped_tile_width, min_cropped_tile_width);
+    level_stats->min_cropped_tile_height =
+        AOMMIN(level_stats->min_cropped_tile_height, min_cropped_tile_height);
+    level_stats->tile_width_is_valid &= tile_width_is_valid;
+    level_stats->min_frame_width = AOMMIN(level_stats->min_frame_width, width);
+    level_stats->min_frame_height =
+        AOMMIN(level_stats->min_frame_height, height);
+    level_stats->min_cr = AOMMIN(level_stats->min_cr, compression_ratio);
+    level_stats->total_compressed_size += (double)size;
+
+    // update level_spec
+    // TODO(kyslov@) update all spec fields
+    AV1LevelSpec *const level_spec = &level_info->level_spec;
+    level_spec->max_picture_size =
+        AOMMAX(level_spec->max_picture_size, luma_pic_size);
+    level_spec->max_h_size =
+        AOMMAX(level_spec->max_h_size, cm->superres_upscaled_width);
+    level_spec->max_v_size = AOMMAX(level_spec->max_v_size, height);
+    level_spec->max_tile_cols = AOMMAX(level_spec->max_tile_cols, tile_cols);
+    level_spec->max_tiles = AOMMAX(level_spec->max_tiles, tiles);
+
+    // Store info of the current frame into the FrameWindowBuffer.
+    FrameWindowBuffer *const buffer = &level_info->frame_window_buffer;
+    store_frame_record(ts_start, ts_end, size, luma_pic_size,
+                       frame_header_count, tiles, show_frame,
+                       show_existing_frame, buffer);
+    if (show_frame) {
+      // Count the number of frames encoded in the past 1 second.
+      const int encoded_frames_in_last_second =
+          count_frames(buffer, TICKS_PER_SEC);
+      scan_past_frames(buffer, encoded_frames_in_last_second, level_spec,
+                       level_stats);
+      level_stats->total_time_encoded = total_time_encoded;
+    }
+
+    DECODER_MODEL *const decoder_models = level_info->decoder_models;
+    for (AV1_LEVEL level = SEQ_LEVEL_2_0; level < SEQ_LEVELS; ++level) {
+      av1_decoder_model_process_frame(cpi, size << 3, &decoder_models[level]);
+    }
+
+    // Check whether target level is met.
+    const AV1_LEVEL target_level = level_params->target_seq_level_idx[i];
+    if (target_level < SEQ_LEVELS) {
+      assert(is_valid_seq_level_idx(target_level));
+      const int tier = seq_params->tier[i];
+      const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+          level_info, target_level, tier, is_still_picture, profile, 0);
+      if (fail_id != TARGET_LEVEL_OK) {
+        const int target_level_major = 2 + (target_level >> 2);
+        const int target_level_minor = target_level & 3;
+        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                           "Failed to encode to the target level %d_%d. %s",
+                           target_level_major, target_level_minor,
+                           level_fail_messages[fail_id]);
+      }
+    }
+  }
+}
+
+aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
+                                      const AV1LevelParams *level_params,
+                                      int *seq_level_idx) {
+  const int is_still_picture = seq_params->still_picture;
+  const BITSTREAM_PROFILE profile = seq_params->profile;
+  for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+    seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
+    if (!((level_params->keep_level_stats >> op) & 1)) continue;
+    const int tier = seq_params->tier[op];
+    const AV1LevelInfo *const level_info = level_params->level_info[op];
+    assert(level_info != NULL);
+    for (int level = 0; level < SEQ_LEVELS; ++level) {
+      if (!is_valid_seq_level_idx(level)) continue;
+      const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+          level_info, level, tier, is_still_picture, profile, 1);
+      if (fail_id == TARGET_LEVEL_OK) {
+        seq_level_idx[op] = level;
+        break;
+      }
+    }
+  }
+
+  return AOM_CODEC_OK;
+}
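The sequence level index encodes major.minor as shown in the error path above: major = 2 + (idx >> 2), minor = idx & 3. A tiny sketch of the mapping (illustrative only):

#include <stdio.h>

int main(void) {
  /* Index 0 -> 2.0, 5 -> 3.1, 10 -> 4.2, 15 -> 5.3. */
  for (int idx = 0; idx < 16; idx += 5) {
    printf("seq_level_idx %2d -> level %d.%d\n", idx, 2 + (idx >> 2), idx & 3);
  }
  return 0;
}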
diff --git a/libs/libaom/src/av1/encoder/level.h b/libs/libaom/src/av1/encoder/level.h
new file mode 100644
index 000000000..5e0cce200
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/level.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_LEVEL_H_
+#define AOM_AV1_ENCODER_LEVEL_H_
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+
+// AV1 Level Specifications
+typedef struct {
+  AV1_LEVEL level;
+  int max_picture_size;
+  int max_h_size;
+  int max_v_size;
+  int max_header_rate;
+  int max_tile_rate;
+  int max_tiles;
+  int max_tile_cols;
+  int64_t max_display_rate;
+  int64_t max_decode_rate;
+  double main_mbps;
+  double high_mbps;
+  double main_cr;
+  double high_cr;
+} AV1LevelSpec;
+
+typedef struct {
+  int64_t ts_start;
+  int64_t ts_end;
+  size_t encoded_size_in_bytes;
+  int pic_size;
+  int frame_header_count;
+  int tiles;
+  int show_frame;
+  int show_existing_frame;
+} FrameRecord;
+
+// Record frame info in a rolling window.
+#define FRAME_WINDOW_SIZE 256
+typedef struct {
+  FrameRecord buf[FRAME_WINDOW_SIZE];
+  int num;    // Number of FrameRecords stored in the buffer.
+  int start;  // Buffer index of the first FrameRecord.
+} FrameWindowBuffer;
+
+typedef struct {
+  int max_bitrate;  // Max bitrate in any 1-second window, in bps.
+  int max_tile_size;
+  int max_superres_tile_width;
+  int min_cropped_tile_width;
+  int min_cropped_tile_height;
+  int tile_width_is_valid;
+  int min_frame_width;
+  int min_frame_height;
+  double total_compressed_size;  // In bytes.
+  double total_time_encoded;     // In seconds.
+  double min_cr;
+} AV1LevelStats;
+
+// The following data structures are for the decoder model.
+typedef struct {
+  int decoder_ref_count;
+  int player_ref_count;
+  int display_index;
+  FRAME_TYPE frame_type;
+  double presentation_time;
+} FRAME_BUFFER;
+
+// Interval of bits transmission for a DFG (Decodable Frame Group).
+typedef struct {
+  double first_bit_arrival_time;  // Time when the first bit arrives.
+  double last_bit_arrival_time;   // Time when the last bit arrives.
+  // Removal time means the time when the bits to be decoded are removed from
+  // the smoothing buffer. Removal time is essentially the time when the
+  // decoding of the frame starts.
+  double removal_time;
+} DFG_INTERVAL;
+
+#define DFG_INTERVAL_QUEUE_SIZE 64
+typedef struct {
+  int head;
+  int size;
+  double total_interval;
+  DFG_INTERVAL buf[DFG_INTERVAL_QUEUE_SIZE];
+} DFG_INTERVAL_QUEUE;
+
+enum {
+  RESOURCE_MODE = 0,  // Resource availability mode.
+  SCHEDULE_MODE       // Decoding schedule mode.
+} UENUM1BYTE(DECODER_MODEL_MODE);
+
+enum {
+  DECODER_MODEL_OK = 0,
+  DECODE_BUFFER_AVAILABLE_LATE,
+  DECODE_FRAME_BUF_UNAVAILABLE,
+  DECODE_EXISTING_FRAME_BUF_EMPTY,
+  DISPLAY_FRAME_LATE,
+  SMOOTHING_BUFFER_UNDERFLOW,
+  SMOOTHING_BUFFER_OVERFLOW,
+  DECODER_MODEL_DISABLED
+} UENUM1BYTE(DECODER_MODEL_STATUS);
+
+#define BUFFER_POOL_MAX_SIZE 10
+typedef struct {
+  DECODER_MODEL_STATUS status;
+  DECODER_MODEL_MODE mode;
+  bool is_low_delay_mode;
+  AV1_LEVEL level;
+  int encoder_buffer_delay;  // In units of 1/90000 seconds.
+  int decoder_buffer_delay;  // In units of 1/90000 seconds.
+  int num_ticks_per_picture;
+  int initial_display_delay;  // In units of frames.
+  int64_t decode_rate;
+  double display_clock_tick;          // In units of seconds.
+  double current_time;                // In units of seconds.
+  double initial_presentation_delay;  // In units of seconds.
+  double bit_rate;                    // Bits per second.
+
+  int num_frame;
+  int num_decoded_frame;
+  int num_shown_frame;
+  int vbi[REF_FRAMES];  // Virtual buffer index.
+  FRAME_BUFFER frame_buffer_pool[BUFFER_POOL_MAX_SIZE];
+  DFG_INTERVAL_QUEUE dfg_interval_queue;
+
+  // Information for the DFG (Decodable Frame Group) being processed.
+  double first_bit_arrival_time;
+  double last_bit_arrival_time;
+  size_t coded_bits;
+
+  // Information for the frame being processed.
+  double removal_time;
+  double presentation_time;
+  int decode_samples;
+  int display_samples;
+
+  double max_display_rate;
+  double max_decode_rate;
+} DECODER_MODEL;
+
+typedef struct {
+  AV1LevelStats level_stats;
+  AV1LevelSpec level_spec;
+  FrameWindowBuffer frame_window_buffer;
+  DECODER_MODEL decoder_models[SEQ_LEVELS];
+} AV1LevelInfo;
+
+typedef struct AV1LevelParams {
+  // Specifies the level that the coded video sequence conforms to for each
+  // operating point.
+  AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+  // Bit mask to indicate whether to keep level stats for corresponding
+  // operating points.
+  uint32_t keep_level_stats;
+  // Level information for each operating point.
+  AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS];
+  // Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
+  int frame_header_count;
+} AV1LevelParams;
+
+static INLINE int is_in_operating_point(int operating_point,
+                                        int temporal_layer_id,
+                                        int spatial_layer_id) {
+  if (!operating_point) return 1;
+
+  return ((operating_point >> temporal_layer_id) & 1) &&
+         ((operating_point >> (spatial_layer_id + 8)) & 1);
+}
+
+void av1_init_level_info(struct AV1_COMP *cpi);
+
+void av1_update_level_info(struct AV1_COMP *cpi, size_t size, int64_t ts_start,
+                           int64_t ts_end);
+
+// Return sequence level indices in seq_level_idx[MAX_NUM_OPERATING_POINTS].
+aom_codec_err_t av1_get_seq_level_idx(const SequenceHeader *seq_params,
+                                      const AV1LevelParams *level_params,
+                                      int *seq_level_idx);
+
+// Print the status of the decoder model (for debugging).
+void av1_decoder_model_print_status(const DECODER_MODEL *const decoder_model);
+
+void av1_decoder_model_init(const struct AV1_COMP *const cpi, AV1_LEVEL level,
+                            int op_index, DECODER_MODEL *const decoder_model);
+
+void av1_decoder_model_process_frame(const struct AV1_COMP *const cpi,
+                                     size_t coded_bits,
+                                     DECODER_MODEL *const decoder_model);
+
+// Return max bitrate (bps) for given level.
+double av1_get_max_bitrate_for_level(AV1_LEVEL level_index, int tier,
+                                     BITSTREAM_PROFILE profile);
+
+// Get max number of tiles and tile columns for given level.
+void av1_get_max_tiles_for_level(AV1_LEVEL level_index, int *const max_tiles,
+                                 int *const max_tile_cols);
+
+// Return minimum compression ratio for given level.
+double av1_get_min_cr_for_level(AV1_LEVEL level_index, int tier,
+                                int is_still_picture);
+#endif  // AOM_AV1_ENCODER_LEVEL_H_
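is_in_operating_point() above decodes the operating_point_idc layout: temporal-layer bits occupy the low byte and spatial-layer bits start at bit 8, with zero meaning "applies to the whole stream". A self-contained sketch of the same test (hypothetical names, illustrative only):

#include <stdio.h>

static int example_in_op(int idc, int temporal_id, int spatial_id) {
  if (!idc) return 1; /* 0 means the point covers the whole stream */
  return ((idc >> temporal_id) & 1) && ((idc >> (spatial_id + 8)) & 1);
}

int main(void) {
  const int idc = 0x0103; /* temporal layers 0-1, spatial layer 0 */
  printf("%d %d\n", example_in_op(idc, 1, 0), example_in_op(idc, 2, 0));
  /* Prints: 1 0 (temporal layer 2 is outside this operating point) */
  return 0;
}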
diff --git a/libs/libaom/src/av1/encoder/lookahead.c b/libs/libaom/src/av1/encoder/lookahead.c
new file mode 100644
index 000000000..0f7c81989
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/lookahead.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+#include "av1/common/common.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/lookahead.h"
+
+/* Return the buffer at the given absolute index and increment the index */
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) {
+  int index = *idx;
+  struct lookahead_entry *buf = ctx->buf + index;
+
+  assert(index < ctx->max_sz);
+  if (++index >= ctx->max_sz) index -= ctx->max_sz;
+  *idx = index;
+  return buf;
+}
+
+void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
+  if (ctx) {
+    if (ctx->buf) {
+      int i;
+
+      for (i = 0; i < ctx->max_sz; i++) aom_free_frame_buffer(&ctx->buf[i].img);
+      free(ctx->buf);
+    }
+    free(ctx);
+  }
+}
+
+struct lookahead_ctx *av1_lookahead_init(
+    unsigned int width, unsigned int height, unsigned int subsampling_x,
+    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+    const int border_in_pixels, int byte_alignment, int num_lap_buffers) {
+  struct lookahead_ctx *ctx = NULL;
+  int lag_in_frames = AOMMAX(1, depth);
+
+  // Add the lags to depth and clamp
+  depth += num_lap_buffers;
+  depth = clamp(depth, 1, MAX_TOTAL_BUFFERS);
+
+  // Allocate memory to keep previous source frames available.
+  depth += MAX_PRE_FRAMES;
+
+  // Allocate the lookahead structures
+  ctx = calloc(1, sizeof(*ctx));
+  if (ctx) {
+    unsigned int i;
+    ctx->max_sz = depth;
+    ctx->read_ctxs[ENCODE_STAGE].pop_sz = ctx->max_sz - MAX_PRE_FRAMES;
+    ctx->read_ctxs[ENCODE_STAGE].valid = 1;
+    if (num_lap_buffers) {
+      ctx->read_ctxs[LAP_STAGE].pop_sz = lag_in_frames;
+      ctx->read_ctxs[LAP_STAGE].valid = 1;
+    }
+    ctx->buf = calloc(depth, sizeof(*ctx->buf));
+    if (!ctx->buf) goto fail;
+    for (i = 0; i < depth; i++) {
+      aom_free_frame_buffer(&ctx->buf[i].img);
+      if (aom_realloc_frame_buffer(&ctx->buf[i].img, width, height,
+                                   subsampling_x, subsampling_y,
+                                   use_highbitdepth, border_in_pixels,
+                                   byte_alignment, NULL, NULL, NULL))
+        goto fail;
+    }
+  }
+  return ctx;
+fail:
+  av1_lookahead_destroy(ctx);
+  return NULL;
+}
+
+int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+                       int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+                       aom_enc_frame_flags_t flags) {
+  struct lookahead_entry *buf;
+  int width = src->y_crop_width;
+  int height = src->y_crop_height;
+  int uv_width = src->uv_crop_width;
+  int uv_height = src->uv_crop_height;
+  int subsampling_x = src->subsampling_x;
+  int subsampling_y = src->subsampling_y;
+  int larger_dimensions, new_dimensions;
+
+  assert(ctx->read_ctxs[ENCODE_STAGE].valid == 1);
+  if (ctx->read_ctxs[ENCODE_STAGE].sz + 1 + MAX_PRE_FRAMES > ctx->max_sz)
+    return 1;
+  ctx->read_ctxs[ENCODE_STAGE].sz++;
+  if (ctx->read_ctxs[LAP_STAGE].valid) {
+    ctx->read_ctxs[LAP_STAGE].sz++;
+  }
+  buf = pop(ctx, &ctx->write_idx);
+
+  new_dimensions = width != buf->img.y_crop_width ||
+                   height != buf->img.y_crop_height ||
+                   uv_width != buf->img.uv_crop_width ||
+                   uv_height != buf->img.uv_crop_height;
+  larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
+                      uv_width > buf->img.uv_width ||
+                      uv_height > buf->img.uv_height;
+  assert(!larger_dimensions || new_dimensions);
+
+  if (larger_dimensions) {
+    YV12_BUFFER_CONFIG new_img;
+    memset(&new_img, 0, sizeof(new_img));
+    if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+                               subsampling_y, use_highbitdepth,
+                               AOM_BORDER_IN_PIXELS, 0))
+      return 1;
+    aom_free_frame_buffer(&buf->img);
+    buf->img = new_img;
+  } else if (new_dimensions) {
+    buf->img.y_crop_width = src->y_crop_width;
+    buf->img.y_crop_height = src->y_crop_height;
+    buf->img.uv_crop_width = src->uv_crop_width;
+    buf->img.uv_crop_height = src->uv_crop_height;
+    buf->img.subsampling_x = src->subsampling_x;
+    buf->img.subsampling_y = src->subsampling_y;
+  }
+  // Partial copy not implemented yet
+  av1_copy_and_extend_frame(src, &buf->img);
+
+  buf->ts_start = ts_start;
+  buf->ts_end = ts_end;
+  buf->flags = flags;
+  aom_remove_metadata_from_frame_buffer(&buf->img);
+  aom_copy_metadata_to_frame_buffer(&buf->img, src->metadata);
+  return 0;
+}
+
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
+                                          COMPRESSOR_STAGE stage) {
+  struct lookahead_entry *buf = NULL;
+  if (ctx) {
+    struct read_ctx *read_ctx = &ctx->read_ctxs[stage];
+    assert(read_ctx->valid == 1);
+    if (read_ctx->sz && (drain || read_ctx->sz == read_ctx->pop_sz)) {
+      buf = pop(ctx, &read_ctx->read_idx);
+      read_ctx->sz--;
+    }
+  }
+  return buf;
+}
+
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
+                                           COMPRESSOR_STAGE stage) {
+  struct lookahead_entry *buf = NULL;
+  struct read_ctx *read_ctx = NULL;
+  if (ctx == NULL) {
+    return buf;
+  }
+
+  read_ctx = &ctx->read_ctxs[stage];
+  assert(read_ctx->valid == 1);
+  if (index >= 0) {
+    // Forward peek
+    if (index < read_ctx->sz) {
+      index += read_ctx->read_idx;
+      if (index >= ctx->max_sz) index -= ctx->max_sz;
+      buf = ctx->buf + index;
+    }
+  } else if (index < 0) {
+    // Backward peek
+    if (-index <= MAX_PRE_FRAMES) {
+      index += (int)(read_ctx->read_idx);
+      if (index < 0) index += (int)(ctx->max_sz);
+      buf = ctx->buf + index;
+    }
+  }
+
+  return buf;
+}
+
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
+                                 COMPRESSOR_STAGE stage) {
+  struct read_ctx *read_ctx = NULL;
+  assert(ctx != NULL);
+
+  read_ctx = &ctx->read_ctxs[stage];
+  assert(read_ctx->valid == 1);
+  return read_ctx->sz;
+}
+
+int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage) {
+  struct read_ctx *read_ctx = NULL;
+  assert(ctx != NULL);
+
+  read_ctx = &ctx->read_ctxs[stage];
+  assert(read_ctx->valid == 1);
+  return read_ctx->pop_sz;
+}
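A hedged sketch of the intended call pattern for the queue above. It uses only functions declared in this patch, but assumes it is compiled and linked inside the encoder tree; the parameter values are placeholders and error handling is minimal:

#include "av1/encoder/lookahead.h"

/* Illustrative only: push one source frame and drain it back out. */
static int example_feed_one_frame(struct lookahead_ctx *ctx,
                                  YV12_BUFFER_CONFIG *src, int64_t pts) {
  /* One timestamp tick per frame, no flags, 8-bit path. */
  if (av1_lookahead_push(ctx, src, pts, pts + 1, /*use_highbitdepth=*/0,
                         /*flags=*/0))
    return -1;
  /* drain=1 returns a frame even before the queue reaches pop_sz. */
  struct lookahead_entry *ent =
      av1_lookahead_pop(ctx, /*drain=*/1, ENCODE_STAGE);
  return ent != NULL ? 0 : -1;
}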
diff --git a/libs/libaom/src/av1/encoder/lookahead.h b/libs/libaom/src/av1/encoder/lookahead.h
new file mode 100644
index 000000000..03693d383
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/lookahead.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_
+#define AOM_AV1_ENCODER_LOOKAHEAD_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LAG_BUFFERS 35
+#define MAX_LAP_BUFFERS 35
+#define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS)
+#define LAP_LAG_IN_FRAMES 17
+
+struct lookahead_entry {
+  YV12_BUFFER_CONFIG img;
+  int64_t ts_start;
+  int64_t ts_end;
+  aom_enc_frame_flags_t flags;
+};
+
+// The max of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
+enum { ENCODE_STAGE, LAP_STAGE, MAX_STAGES } UENUM1BYTE(COMPRESSOR_STAGE);
+
+struct read_ctx {
+  int sz;       /* Number of buffers currently in the queue */
+  int read_idx; /* Read index */
+  int pop_sz;   /* Size to check for pop condition */
+  int valid;    /* Is this ctx valid? */
+};
+
+struct lookahead_ctx {
+  int max_sz;                            /* Absolute size of the queue */
+  int write_idx;                         /* Write index */
+  struct read_ctx read_ctxs[MAX_STAGES]; /* Read context */
+  struct lookahead_entry *buf;           /* Buffer list */
+};
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ */
+struct lookahead_ctx *av1_lookahead_init(
+    unsigned int width, unsigned int height, unsigned int subsampling_x,
+    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+    const int border_in_pixels, int byte_alignment, int num_lap_buffers);
+
+/**\brief Destroys the lookahead stage
+ */
+void av1_lookahead_destroy(struct lookahead_ctx *ctx);
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * \param[in] ctx               Pointer to the lookahead context
+ * \param[in] src               Pointer to the image to enqueue
+ * \param[in] ts_start          Timestamp for the start of this frame
+ * \param[in] ts_end            Timestamp for the end of this frame
+ * \param[in] use_highbitdepth  Whether the source is high bit depth
+ * \param[in] flags             Flags set on this frame
+ */
+int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+                       int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+                       aom_enc_frame_flags_t flags);
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx    Pointer to the lookahead context
+ * \param[in] drain  Flag indicating the buffer should be drained
+ *                   (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ */
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain,
+                                          COMPRESSOR_STAGE stage);
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx    Pointer to the lookahead context
+ * \param[in] index  Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ */
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx, int index,
+                                           COMPRESSOR_STAGE stage);
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx  Pointer to the lookahead context
+ */
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx,
+                                 COMPRESSOR_STAGE stage);
+
+int av1_lookahead_pop_sz(struct lookahead_ctx *ctx, COMPRESSOR_STAGE stage);
+
+#ifdef __cplusplus
}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_LOOKAHEAD_H_
diff --git a/libs/libaom/src/av1/encoder/mathutils.h b/libs/libaom/src/av1/encoder/mathutils.h
new file mode 100644
index 000000000..64f936176
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/mathutils.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MATHUTILS_H_
+#define AOM_AV1_ENCODER_MATHUTILS_H_
+
+#include <assert.h>
+#include <math.h>
+#include <memory.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static const double TINY_NEAR_ZERO = 1.0E-16;
+
+// Solves Ax = b, where x and b are column vectors of size nx1 and A is nxn
+static INLINE int linsolve(int n, double *A, int stride, double *b,
+                           double *x) {
+  int i, j, k;
+  double c;
+  // Forward elimination
+  for (k = 0; k < n - 1; k++) {
+    // Bring the largest magnitude to the diagonal position
+    for (i = n - 1; i > k; i--) {
+      if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) {
+        for (j = 0; j < n; j++) {
+          c = A[i * stride + j];
+          A[i * stride + j] = A[(i - 1) * stride + j];
+          A[(i - 1) * stride + j] = c;
+        }
+        c = b[i];
+        b[i] = b[i - 1];
+        b[i - 1] = c;
+      }
+    }
+    for (i = k; i < n - 1; i++) {
+      if (fabs(A[k * stride + k]) < TINY_NEAR_ZERO) return 0;
+      c = A[(i + 1) * stride + k] / A[k * stride + k];
+      for (j = 0; j < n; j++) A[(i + 1) * stride + j] -= c * A[k * stride + j];
+      b[i + 1] -= c * b[k];
+    }
+  }
+  // Backward substitution
+  for (i = n - 1; i >= 0; i--) {
+    if (fabs(A[i * stride + i]) < TINY_NEAR_ZERO) return 0;
+    c = 0;
+    for (j = i + 1; j <= n - 1; j++) c += A[i * stride + j] * x[j];
+    x[i] = (b[i] - c) / A[i * stride + i];
+  }
+
+  return 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Least-squares
+// Solves for n-dim x in a least squares sense to minimize |Ax - b|^2
+// The solution is simply x = (A'A)^-1 A'b or simply the solution for
+// the system: A'A x = A'b
+static INLINE int least_squares(int n, double *A, int rows, int stride,
+                                double *b, double *scratch, double *x) {
+  int i, j, k;
+  double *scratch_ = NULL;
+  double *AtA, *Atb;
+  if (!scratch) {
+    scratch_ = (double *)aom_malloc(sizeof(*scratch) * n * (n + 1));
+    scratch = scratch_;
+  }
+  AtA = scratch;
+  Atb = scratch + n * n;
+
+  for (i = 0; i < n; ++i) {
+    for (j = i; j < n; ++j) {
+      AtA[i * n + j] = 0.0;
+      for (k = 0; k < rows; ++k)
+        AtA[i * n + j] += A[k * stride + i] * A[k * stride + j];
+      AtA[j * n + i] = AtA[i * n + j];
+    }
+    Atb[i] = 0;
+    for (k = 0; k < rows; ++k) Atb[i] += A[k * stride + i] * b[k];
+  }
+  int ret = linsolve(n, AtA, n, Atb, x);
+  if (scratch_) aom_free(scratch_);
+  return ret;
+}
+
+// Matrix multiply
+static INLINE void multiply_mat(const double *m1, const double *m2,
+                                double *res, const int m1_rows,
+                                const int inner_dim, const int m2_cols) {
+  double sum;
+
+  int row, col, inner;
+  for (row = 0; row < m1_rows; ++row) {
+    for (col = 0; col < m2_cols; ++col) {
+      sum = 0;
+      for (inner = 0; inner < inner_dim; ++inner)
+        sum += m1[row * inner_dim + inner] * m2[inner * m2_cols + col];
+      *(res++) = sum;
+    }
+  }
+}
+
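linsolve() above performs Gaussian elimination with partial pivoting, modifying A and b in place. A self-contained check on a 2x2 system (illustrative only; assumes <stdio.h> is available, as included above):

/* Solve [2 1; 1 3] x = [5; 10]: expected solution x = (1, 3). */
static void example_linsolve_check(void) {
  double A[4] = { 2, 1, 1, 3 };
  double b[2] = { 5, 10 };
  double x[2];
  if (linsolve(2, A, /*stride=*/2, b, x)) {
    printf("x = (%.1f, %.1f)\n", x[0], x[1]); /* Prints: x = (1.0, 3.0) */
  }
}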
+//
+// The functions below are needed only for homography computation.
+// Remove if the homography models are not used.
+//
+///////////////////////////////////////////////////////////////////////////////
+// svdcmp
+// Adapted from Numerical Recipes in C
+
+static INLINE double sign(double a, double b) {
+  return ((b) >= 0 ? fabs(a) : -fabs(a));
+}
+
+static INLINE double pythag(double a, double b) {
+  double ct;
+  const double absa = fabs(a);
+  const double absb = fabs(b);
+
+  if (absa > absb) {
+    ct = absb / absa;
+    return absa * sqrt(1.0 + ct * ct);
+  } else {
+    ct = absa / absb;
+    return (absb == 0) ? 0 : absb * sqrt(1.0 + ct * ct);
+  }
+}
+
+static INLINE int svdcmp(double **u, int m, int n, double w[], double **v) {
+  const int max_its = 30;
+  int flag, i, its, j, jj, k, l, nm;
+  double anorm, c, f, g, h, s, scale, x, y, z;
+  double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1));
+  g = scale = anorm = 0.0;
+  for (i = 0; i < n; i++) {
+    l = i + 1;
+    rv1[i] = scale * g;
+    g = s = scale = 0.0;
+    if (i < m) {
+      for (k = i; k < m; k++) scale += fabs(u[k][i]);
+      if (scale != 0.) {
+        for (k = i; k < m; k++) {
+          u[k][i] /= scale;
+          s += u[k][i] * u[k][i];
+        }
+        f = u[i][i];
+        g = -sign(sqrt(s), f);
+        h = f * g - s;
+        u[i][i] = f - g;
+        for (j = l; j < n; j++) {
+          for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j];
+          f = s / h;
+          for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+        }
+        for (k = i; k < m; k++) u[k][i] *= scale;
+      }
+    }
+    w[i] = scale * g;
+    g = s = scale = 0.0;
+    if (i < m && i != n - 1) {
+      for (k = l; k < n; k++) scale += fabs(u[i][k]);
+      if (scale != 0.) {
+        for (k = l; k < n; k++) {
+          u[i][k] /= scale;
+          s += u[i][k] * u[i][k];
+        }
+        f = u[i][l];
+        g = -sign(sqrt(s), f);
+        h = f * g - s;
+        u[i][l] = f - g;
+        for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
+        for (j = l; j < m; j++) {
+          for (s = 0.0, k = l; k < n; k++) s += u[j][k] * u[i][k];
+          for (k = l; k < n; k++) u[j][k] += s * rv1[k];
+        }
+        for (k = l; k < n; k++) u[i][k] *= scale;
+      }
+    }
+    anorm = fmax(anorm, (fabs(w[i]) + fabs(rv1[i])));
+  }
+
+  for (i = n - 1; i >= 0; i--) {
+    if (i < n - 1) {
+      if (g != 0.) {
+        for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g;
+        for (j = l; j < n; j++) {
+          for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j];
+          for (k = l; k < n; k++) v[k][j] += s * v[k][i];
+        }
+      }
+      for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0;
+    }
+    v[i][i] = 1.0;
+    g = rv1[i];
+    l = i;
+  }
+  for (i = AOMMIN(m, n) - 1; i >= 0; i--) {
+    l = i + 1;
+    g = w[i];
+    for (j = l; j < n; j++) u[i][j] = 0.0;
+    if (g != 0.) {
{ + g = 1.0 / g; + for (j = l; j < n; j++) { + for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j]; + f = (s / u[i][i]) * g; + for (k = i; k < m; k++) u[k][j] += f * u[k][i]; + } + for (j = i; j < m; j++) u[j][i] *= g; + } else { + for (j = i; j < m; j++) u[j][i] = 0.0; + } + ++u[i][i]; + } + for (k = n - 1; k >= 0; k--) { + for (its = 0; its < max_its; its++) { + flag = 1; + for (l = k; l >= 0; l--) { + nm = l - 1; + if ((double)(fabs(rv1[l]) + anorm) == anorm || nm < 0) { + flag = 0; + break; + } + if ((double)(fabs(w[nm]) + anorm) == anorm) break; + } + if (flag) { + c = 0.0; + s = 1.0; + for (i = l; i <= k; i++) { + f = s * rv1[i]; + rv1[i] = c * rv1[i]; + if ((double)(fabs(f) + anorm) == anorm) break; + g = w[i]; + h = pythag(f, g); + w[i] = h; + h = 1.0 / h; + c = g * h; + s = -f * h; + for (j = 0; j < m; j++) { + y = u[j][nm]; + z = u[j][i]; + u[j][nm] = y * c + z * s; + u[j][i] = z * c - y * s; + } + } + } + z = w[k]; + if (l == k) { + if (z < 0.0) { + w[k] = -z; + for (j = 0; j < n; j++) v[j][k] = -v[j][k]; + } + break; + } + if (its == max_its - 1) { + aom_free(rv1); + return 1; + } + assert(k > 0); + x = w[l]; + nm = k - 1; + y = w[nm]; + g = rv1[nm]; + h = rv1[k]; + f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y); + g = pythag(f, 1.0); + f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x; + c = s = 1.0; + for (j = l; j <= nm; j++) { + i = j + 1; + g = rv1[i]; + y = w[i]; + h = s * g; + g = c * g; + z = pythag(f, h); + rv1[j] = z; + c = f / z; + s = h / z; + f = x * c + g * s; + g = g * c - x * s; + h = y * s; + y *= c; + for (jj = 0; jj < n; jj++) { + x = v[jj][j]; + z = v[jj][i]; + v[jj][j] = x * c + z * s; + v[jj][i] = z * c - x * s; + } + z = pythag(f, h); + w[j] = z; + if (z != 0.) { + z = 1.0 / z; + c = f * z; + s = h * z; + } + f = c * g + s * y; + x = c * y - s * g; + for (jj = 0; jj < m; jj++) { + y = u[jj][j]; + z = u[jj][i]; + u[jj][j] = y * c + z * s; + u[jj][i] = z * c - y * s; + } + } + rv1[l] = 0.0; + rv1[k] = f; + w[k] = x; + } + } + aom_free(rv1); + return 0; +} + +static INLINE int SVD(double *U, double *W, double *V, double *matx, int M, + int N) { + // Assumes allocation for U is MxN + double **nrU = (double **)aom_malloc((M) * sizeof(*nrU)); + double **nrV = (double **)aom_malloc((N) * sizeof(*nrV)); + int problem, i; + + problem = !(nrU && nrV); + if (!problem) { + for (i = 0; i < M; i++) { + nrU[i] = &U[i * N]; + } + for (i = 0; i < N; i++) { + nrV[i] = &V[i * N]; + } + } else { + if (nrU) aom_free(nrU); + if (nrV) aom_free(nrV); + return 1; + } + + /* copy from given matx into nrU */ + for (i = 0; i < M; i++) { + memcpy(&(nrU[i][0]), matx + N * i, N * sizeof(*matx)); + } + + /* HERE IT IS: do SVD */ + if (svdcmp(nrU, M, N, W, nrV)) { + aom_free(nrU); + aom_free(nrV); + return 1; + } + + /* aom_free Numerical Recipes arrays */ + aom_free(nrU); + aom_free(nrV); + + return 0; +} + +#endif // AOM_AV1_ENCODER_MATHUTILS_H_ diff --git a/libs/libaom/src/av1/encoder/mcomp.c b/libs/libaom/src/av1/encoder/mcomp.c new file mode 100644 index 000000000..43f7f5c6c --- /dev/null +++ b/libs/libaom/src/av1/encoder/mcomp.c @@ -0,0 +1,3391 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static INLINE void init_mv_cost_params(MV_COST_PARAMS *mv_cost_params,
+                                       const MACROBLOCK *x, const MV *ref_mv) {
+  mv_cost_params->ref_mv = ref_mv;
+  mv_cost_params->full_ref_mv = get_fullmv_from_mv(ref_mv);
+  mv_cost_params->error_per_bit = x->errorperbit;
+  mv_cost_params->sad_per_bit = x->sadperbit;
+  mv_cost_params->mvjcost = x->nmv_vec_cost;
+  mv_cost_params->mvcost[0] = x->mv_cost_stack[0];
+  mv_cost_params->mvcost[1] = x->mv_cost_stack[1];
+  mv_cost_params->mv_cost_type = x->mv_cost_type;
+}
+
+static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {
+  ms_buffers->ref = &x->e_mbd.plane[0].pre[0];
+  ms_buffers->src = &x->plane[0].src;
+
+  av1_set_ms_compound_refs(ms_buffers, NULL, NULL, 0, 0);
+
+  ms_buffers->wsrc = x->wsrc_buf;
+  ms_buffers->obmc_mask = x->mask_buf;
+}
+
+void av1_make_default_fullpel_ms_params(
+    FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const struct AV1_COMP *cpi,
+    const MACROBLOCK *x, BLOCK_SIZE bsize, const MV *ref_mv,
+    const search_site_config *search_sites) {
+  // High level params
+  ms_params->bsize = bsize;
+  ms_params->vfp = &cpi->fn_ptr[bsize];
+
+  init_ms_buffers(&ms_params->ms_buffers, x);
+
+  ms_params->search_method = cpi->sf.mv_sf.search_method;
+  ms_params->search_sites = search_sites;
+
+  ms_params->mesh_patterns[0] = cpi->sf.mv_sf.mesh_patterns;
+  ms_params->mesh_patterns[1] = cpi->sf.mv_sf.intrabc_mesh_patterns;
+  ms_params->force_mesh_thresh = cpi->sf.mv_sf.exhaustive_searches_thresh;
+  ms_params->prune_mesh_search = cpi->sf.mv_sf.prune_mesh_search;
+  ms_params->run_mesh_search = 0;
+
+  ms_params->is_intra_mode = 0;
+
+  ms_params->fast_obmc_search = cpi->sf.mv_sf.obmc_full_pixel_search_level;
+
+  ms_params->mv_limits = x->mv_limits;
+  av1_set_mv_search_range(&ms_params->mv_limits, ref_mv);
+
+  // Mvcost params
+  init_mv_cost_params(&ms_params->mv_cost_params, x, ref_mv);
+}
+
+void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                       const struct AV1_COMP *cpi,
+                                       const MACROBLOCK *x, BLOCK_SIZE bsize,
+                                       const MV *ref_mv, const int *cost_list) {
+  const AV1_COMMON *cm = &cpi->common;
+  // High level params
+  ms_params->allow_hp = cm->features.allow_high_precision_mv;
+  ms_params->forced_stop = cpi->sf.mv_sf.subpel_force_stop;
+  ms_params->iters_per_step = cpi->sf.mv_sf.subpel_iters_per_step;
+  ms_params->cost_list = cond_cost_list_const(cpi, cost_list);
+
+  av1_set_subpel_mv_search_range(&ms_params->mv_limits, &x->mv_limits, ref_mv);
+
+  // Mvcost params
+  init_mv_cost_params(&ms_params->mv_cost_params, x, ref_mv);
+
+  // Subpel variance params
+  ms_params->var_params.vfp = &cpi->fn_ptr[bsize];
+  ms_params->var_params.subpel_search_type =
+      cpi->sf.mv_sf.use_accurate_subpel_search;
+  ms_params->var_params.w = block_size_wide[bsize];
+  ms_params->var_params.h = block_size_high[bsize];
+
+  // Ref and src buffers
+  MSBuffers *ms_buffers = &ms_params->var_params.ms_buffers;
+  init_ms_buffers(ms_buffers, x);
+}
+
+static INLINE int get_offset_from_fullmv(const FULLPEL_MV *mv, int stride) {
+  return mv->row * stride + mv->col;
+}
+
+static INLINE const uint8_t *get_buf_from_fullmv(const struct buf_2d *buf,
+                                                 const FULLPEL_MV *mv) {
+  return &buf->buf[get_offset_from_fullmv(mv, buf->stride)];
+}
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv) {
+  int col_min =
+      GET_MV_RAWPEL(mv->col) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
+  int row_min =
+      GET_MV_RAWPEL(mv->row) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
+  int col_max = GET_MV_RAWPEL(mv->col) + MAX_FULL_PEL_VAL;
+  int row_max = GET_MV_RAWPEL(mv->row) + MAX_FULL_PEL_VAL;
+
+  col_min = AOMMAX(col_min, GET_MV_RAWPEL(MV_LOW) + 1);
+  row_min = AOMMAX(row_min, GET_MV_RAWPEL(MV_LOW) + 1);
+  col_max = AOMMIN(col_max, GET_MV_RAWPEL(MV_UPP) - 1);
+  row_max = AOMMIN(row_max, GET_MV_RAWPEL(MV_UPP) - 1);
+
+  // Get intersection of UMV window and valid MV window to reduce # of checks
+  // in diamond search.
+  if (mv_limits->col_min < col_min) mv_limits->col_min = col_min;
+  if (mv_limits->col_max > col_max) mv_limits->col_max = col_max;
+  if (mv_limits->row_min < row_min) mv_limits->row_min = row_min;
+  if (mv_limits->row_max > row_max) mv_limits->row_max = row_max;
+}
+
+int av1_init_search_range(int size) {
+  int sr = 0;
+  // Minimum search size no matter what the passed in value.
+  size = AOMMAX(16, size);
+
+  while ((size << sr) < MAX_FULL_PEL_VAL) sr++;
+
+  sr = AOMMIN(sr, MAX_MVSEARCH_STEPS - 2);
+  return sr;
+}
+
+// ============================================================================
+//  Cost of motion vectors
+// ============================================================================
+// TODO(any): Adaptively adjust the regularization strength based on image size
+// and motion activity instead of using hard-coded values. It seems like we
+// roughly halve the lambda for each increase in resolution.
+// These are multipliers used to perform regularization in motion compensation
+// when x->mv_cost_type is set to MV_COST_L1.
+// LOWRES
+#define SSE_LAMBDA_LOWRES 2   // Used by mv_cost_err_fn
+#define SAD_LAMBDA_LOWRES 32  // Used by mvsad_err_cost during full pixel search
+// MIDRES
+#define SSE_LAMBDA_MIDRES 0   // Used by mv_cost_err_fn
+#define SAD_LAMBDA_MIDRES 15  // Used by mvsad_err_cost during full pixel search
+// HDRES
+#define SSE_LAMBDA_HDRES 1  // Used by mv_cost_err_fn
+#define SAD_LAMBDA_HDRES 8  // Used by mvsad_err_cost during full pixel search
+
+// Returns the rate of encoding the current motion vector based on the
+// joint_cost and comp_cost. joint_cost covers the cost of transmitting
+// JOINT_MV, and comp_cost covers the cost of transmitting the actual motion
+// vector.
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+                          const int *const comp_cost[2]) {
+  return joint_cost[av1_get_mv_joint(mv)] + comp_cost[0][mv->row] +
+         comp_cost[1][mv->col];
+}
+
+#define CONVERT_TO_CONST_MVCOST(ptr) ((const int *const *)(ptr))
+// Returns the cost of encoding the motion vector diff := *mv - *ref. The cost
+// is defined as the rate required to encode diff * weight, rounded to the
+// nearest 2 ** 7.
+// This is NOT used during motion compensation.
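
To make the weighting and rounding concrete: ROUND_POWER_OF_TWO(x, 7) computes
round(x / 128), so a weight of 128 returns the rate unchanged while other
weights rescale it in 1/128 steps. A standalone sketch (the rate value and the
weights here are illustrative, not taken from the encoder):

#include <stdio.h>

// Same shape as libaom's ROUND_POWER_OF_TWO(value, n): round(value / 2^n).
static int round_power_of_two(int value, int n) {
  return (value + (1 << (n - 1))) >> n;
}

int main(void) {
  const int rate = 600;  // hypothetical mv_cost() output for some diff
  printf("weight 128 -> %d\n", round_power_of_two(rate * 128, 7));  // 600
  printf("weight 108 -> %d\n", round_power_of_two(rate * 108, 7));  // 506
  return 0;
}
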
+int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, + int *mvcost[2], int weight) { + const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col }; + return ROUND_POWER_OF_TWO( + mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * weight, 7); +} + +// Returns the cost of using the current mv during the motion search. This is +// used when var is used as the error metric. +#define PIXEL_TRANSFORM_ERROR_SCALE 4 +static INLINE int mv_err_cost(const MV *mv, const MV *ref_mv, + const int *mvjcost, const int *const mvcost[2], + int error_per_bit, MV_COST_TYPE mv_cost_type) { + const MV diff = { mv->row - ref_mv->row, mv->col - ref_mv->col }; + const MV abs_diff = { abs(diff.row), abs(diff.col) }; + + switch (mv_cost_type) { + case MV_COST_ENTROPY: + if (mvcost) { + return (int)ROUND_POWER_OF_TWO_64( + (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit, + RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT + + PIXEL_TRANSFORM_ERROR_SCALE); + } + return 0; + case MV_COST_L1_LOWRES: + return (SSE_LAMBDA_LOWRES * (abs_diff.row + abs_diff.col)) >> 3; + case MV_COST_L1_MIDRES: + return (SSE_LAMBDA_MIDRES * (abs_diff.row + abs_diff.col)) >> 3; + case MV_COST_L1_HDRES: + return (SSE_LAMBDA_HDRES * (abs_diff.row + abs_diff.col)) >> 3; + case MV_COST_NONE: return 0; + default: assert(0 && "Invalid rd_cost_type"); return 0; + } +} + +static INLINE int mv_err_cost_(const MV *mv, + const MV_COST_PARAMS *mv_cost_params) { + return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost, + mv_cost_params->mvcost, mv_cost_params->error_per_bit, + mv_cost_params->mv_cost_type); +} + +// Returns the cost of using the current mv during the motion search. This is +// only used during full pixel motion search when sad is used as the error +// metric +static INLINE int mvsad_err_cost(const FULLPEL_MV *mv, const FULLPEL_MV *ref_mv, + const int *mvjcost, const int *const mvcost[2], + int sad_per_bit, MV_COST_TYPE mv_cost_type) { + const MV diff = { GET_MV_SUBPEL(mv->row - ref_mv->row), + GET_MV_SUBPEL(mv->col - ref_mv->col) }; + + switch (mv_cost_type) { + case MV_COST_ENTROPY: + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(&diff, mvjcost, CONVERT_TO_CONST_MVCOST(mvcost)) * + sad_per_bit, + AV1_PROB_COST_SHIFT); + case MV_COST_L1_LOWRES: + return (SAD_LAMBDA_LOWRES * (abs(diff.row) + abs(diff.col))) >> 3; + case MV_COST_L1_MIDRES: + return (SAD_LAMBDA_MIDRES * (abs(diff.row) + abs(diff.col))) >> 3; + case MV_COST_L1_HDRES: + return (SAD_LAMBDA_HDRES * (abs(diff.row) + abs(diff.col))) >> 3; + case MV_COST_NONE: return 0; + default: assert(0 && "Invalid rd_cost_type"); return 0; + } +} + +static INLINE int mvsad_err_cost_(const FULLPEL_MV *mv, + const MV_COST_PARAMS *mv_cost_params) { + return mvsad_err_cost(mv, &mv_cost_params->full_ref_mv, + mv_cost_params->mvjcost, mv_cost_params->mvcost, + mv_cost_params->sad_per_bit, + mv_cost_params->mv_cost_type); +} + +// ============================================================================= +// Fullpixel Motion Search: Translational +// ============================================================================= +#define MAX_PATTERN_SCALES 11 +#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale +#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates + +void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) { + int ss_count = 0; + int stage_index = MAX_MVSEARCH_STEPS - 1; + + cfg->ss[stage_index][0].mv.col = cfg->ss[stage_index][0].mv.row = 0; + cfg->ss[stage_index][0].offset = 
0; + cfg->stride = stride; + + for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) { + int num_search_pts = 8; + + const FULLPEL_MV ss_mvs[13] = { + { 0, 0 }, { -radius, 0 }, { radius, 0 }, + { 0, -radius }, { 0, radius }, { -radius, -radius }, + { radius, radius }, { -radius, radius }, { radius, -radius }, + }; + + int i; + for (i = 0; i <= num_search_pts; ++i) { + search_site *const ss = &cfg->ss[stage_index][i]; + ss->mv = ss_mvs[i]; + ss->offset = get_offset_from_fullmv(&ss->mv, stride); + } + cfg->searches_per_step[stage_index] = num_search_pts; + cfg->radius[stage_index] = radius; + --stage_index; + ++ss_count; + } + cfg->ss_count = ss_count; +} + +void av1_init_motion_fpf(search_site_config *cfg, int stride) { + int ss_count = 0; + int stage_index = MAX_MVSEARCH_STEPS - 1; + + cfg->ss[stage_index][0].mv.col = cfg->ss[stage_index][0].mv.row = 0; + cfg->ss[stage_index][0].offset = 0; + cfg->stride = stride; + + for (int radius = MAX_FIRST_STEP; radius > 0; radius /= 2) { + // Generate offsets for 8 search sites per step. + int tan_radius = AOMMAX((int)(0.41 * radius), 1); + int num_search_pts = 12; + if (radius == 1) num_search_pts = 8; + + const FULLPEL_MV ss_mvs[13] = { + { 0, 0 }, + { -radius, 0 }, + { radius, 0 }, + { 0, -radius }, + { 0, radius }, + { -radius, -tan_radius }, + { radius, tan_radius }, + { -tan_radius, radius }, + { tan_radius, -radius }, + { -radius, tan_radius }, + { radius, -tan_radius }, + { tan_radius, radius }, + { -tan_radius, -radius }, + }; + + int i; + for (i = 0; i <= num_search_pts; ++i) { + search_site *const ss = &cfg->ss[stage_index][i]; + ss->mv = ss_mvs[i]; + ss->offset = get_offset_from_fullmv(&ss->mv, stride); + } + cfg->searches_per_step[stage_index] = num_search_pts; + cfg->radius[stage_index] = radius; + --stage_index; + ++ss_count; + } + cfg->ss_count = ss_count; +} + +void av1_init3smotion_compensation(search_site_config *cfg, int stride) { + int ss_count = 0; + int stage_index = 0; + cfg->stride = stride; + int radius = 1; + for (stage_index = 0; stage_index < 15; ++stage_index) { + int tan_radius = AOMMAX((int)(0.41 * radius), 1); + int num_search_pts = 12; + if (radius <= 5) { + tan_radius = radius; + num_search_pts = 8; + } + const FULLPEL_MV ss_mvs[13] = { + { 0, 0 }, + { -radius, 0 }, + { radius, 0 }, + { 0, -radius }, + { 0, radius }, + { -radius, -tan_radius }, + { radius, tan_radius }, + { -tan_radius, radius }, + { tan_radius, -radius }, + { -radius, tan_radius }, + { radius, -tan_radius }, + { tan_radius, radius }, + { -tan_radius, -radius }, + }; + + for (int i = 0; i <= num_search_pts; ++i) { + search_site *const ss = &cfg->ss[stage_index][i]; + ss->mv = ss_mvs[i]; + ss->offset = get_offset_from_fullmv(&ss->mv, stride); + } + cfg->searches_per_step[stage_index] = num_search_pts; + cfg->radius[stage_index] = radius; + ++ss_count; + if (stage_index < 12) + radius = (int)AOMMAX((radius * 1.5 + 0.5), radius + 1); + } + cfg->ss_count = ss_count; +} + +// Checks whether the mv is within range of the mv_limits +static INLINE int check_bounds(const FullMvLimits *mv_limits, int row, int col, + int range) { + return ((row - range) >= mv_limits->row_min) & + ((row + range) <= mv_limits->row_max) & + ((col - range) >= mv_limits->col_min) & + ((col + range) <= mv_limits->col_max); +} + +static INLINE int get_mvpred_var_cost( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const MV sub_this_mv = get_mv_from_fullmv(this_mv); + const struct buf_2d *const 
src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + const int ref_stride = ref->stride; + + unsigned unused; + int bestsme; + + bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv), + ref_stride, &unused); + + bestsme += mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params); + + return bestsme; +} + +static INLINE int get_mvpred_sad(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct buf_2d *const src, + const uint8_t *const ref_address, + const int ref_stride) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + + return vfp->sdf(src_buf, src_stride, ref_address, ref_stride); +} + +static INLINE int get_mvpred_compound_var_cost( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + const int ref_stride = ref->stride; + + const uint8_t *mask = ms_params->ms_buffers.mask; + const uint8_t *second_pred = ms_params->ms_buffers.second_pred; + const int mask_stride = ms_params->ms_buffers.mask_stride; + const int invert_mask = ms_params->ms_buffers.inv_mask; + unsigned unused; + int bestsme; + + if (mask) { + bestsme = vfp->msvf(src_buf, src_stride, 0, 0, + get_buf_from_fullmv(ref, this_mv), ref_stride, + second_pred, mask, mask_stride, invert_mask, &unused); + } else if (second_pred) { + bestsme = vfp->svaf(get_buf_from_fullmv(ref, this_mv), ref_stride, 0, 0, + src_buf, src_stride, &unused, second_pred); + } else { + bestsme = vfp->vf(src_buf, src_stride, get_buf_from_fullmv(ref, this_mv), + ref_stride, &unused); + } + + const MV sub_this_mv = get_mv_from_fullmv(this_mv); + bestsme += mv_err_cost_(&sub_this_mv, &ms_params->mv_cost_params); + + return bestsme; +} + +static INLINE int get_mvpred_compound_sad( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct buf_2d *const src, const uint8_t *const ref_address, + const int ref_stride) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + + const uint8_t *mask = ms_params->ms_buffers.mask; + const uint8_t *second_pred = ms_params->ms_buffers.second_pred; + const int mask_stride = ms_params->ms_buffers.mask_stride; + const int invert_mask = ms_params->ms_buffers.inv_mask; + + if (mask) { + return vfp->msdf(src_buf, src_stride, ref_address, ref_stride, second_pred, + mask, mask_stride, invert_mask); + } else if (second_pred) { + return vfp->sdaf(src_buf, src_stride, ref_address, ref_stride, second_pred); + } else { + return vfp->sdf(src_buf, src_stride, ref_address, ref_stride); + } +} + +// Calculates and returns a sad+mvcost list around an integer best pel during +// fullpixel motion search. The resulting list can be used to speed up subpel +// motion search later. +#define USE_SAD_COSTLIST 1 + +// calc_int_cost_list uses var to populate the costlist, which is more accurate +// than sad but slightly slower. 
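
Both cost-list builders below fill a five-entry cross around the best full-pel
MV, in the order the subpel search expects. A small standalone sketch of the
indexing (the neighbor table matches the neighbors[] arrays below):

#include <stdio.h>

int main(void) {
  // cost_list[0] is the centre; entries 1..4 are left, below, right, above.
  const struct { int row, col; } neighbors[4] = {
    { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
  };
  const char *names[4] = { "left", "below", "right", "above" };
  for (int i = 0; i < 4; i++)
    printf("cost_list[%d]: delta { %2d, %2d } (%s)\n", i + 1, neighbors[i].row,
           neighbors[i].col, names[i]);
  return 0;
}
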
+static AOM_FORCE_INLINE void calc_int_cost_list(
+    const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    int *cost_list) {
+  static const FULLPEL_MV neighbors[4] = {
+    { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+  };
+  const int br = best_mv.row;
+  const int bc = best_mv.col;
+
+  cost_list[0] = get_mvpred_var_cost(ms_params, &best_mv);
+
+  if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+    for (int i = 0; i < 4; i++) {
+      const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
+                                       bc + neighbors[i].col };
+      cost_list[i + 1] = get_mvpred_var_cost(ms_params, &neighbor_mv);
+    }
+  } else {
+    for (int i = 0; i < 4; i++) {
+      const FULLPEL_MV neighbor_mv = { br + neighbors[i].row,
+                                       bc + neighbors[i].col };
+      if (!av1_is_fullmv_in_range(&ms_params->mv_limits, neighbor_mv)) {
+        cost_list[i + 1] = INT_MAX;
+      } else {
+        cost_list[i + 1] = get_mvpred_var_cost(ms_params, &neighbor_mv);
+      }
+    }
+  }
+}
+
+// calc_int_sad_list uses sad to populate the costlist, which is less accurate
+// than var but faster.
+static AOM_FORCE_INLINE void calc_int_sad_list(
+    const FULLPEL_MV best_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+    int *cost_list, int costlist_has_sad) {
+  static const FULLPEL_MV neighbors[4] = {
+    { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }
+  };
+  const struct buf_2d *const src = ms_params->ms_buffers.src;
+  const struct buf_2d *const ref = ms_params->ms_buffers.ref;
+  const int ref_stride = ref->stride;
+  const int br = best_mv.row;
+  const int bc = best_mv.col;
+
+  assert(av1_is_fullmv_in_range(&ms_params->mv_limits, best_mv));
+
+  // Refresh the costlist if it does not contain valid sad
+  if (!costlist_has_sad) {
+    cost_list[0] = get_mvpred_sad(
+        ms_params, src, get_buf_from_fullmv(ref, &best_mv), ref_stride);
+
+    if (check_bounds(&ms_params->mv_limits, br, bc, 1)) {
+      for (int i = 0; i < 4; i++) {
+        const FULLPEL_MV this_mv = { br + neighbors[i].row,
+                                     bc + neighbors[i].col };
+        cost_list[i + 1] = get_mvpred_sad(
+            ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+      }
+    } else {
+      for (int i = 0; i < 4; i++) {
+        const FULLPEL_MV this_mv = { br + neighbors[i].row,
+                                     bc + neighbors[i].col };
+        if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+          cost_list[i + 1] = INT_MAX;
+        } else {
+          cost_list[i + 1] = get_mvpred_sad(
+              ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+        }
+      }
+    }
+  }
+
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  cost_list[0] += mvsad_err_cost_(&best_mv, mv_cost_params);
+
+  for (int idx = 0; idx < 4; idx++) {
+    if (cost_list[idx + 1] != INT_MAX) {
+      const FULLPEL_MV this_mv = { br + neighbors[idx].row,
+                                   bc + neighbors[idx].col };
+      cost_list[idx + 1] += mvsad_err_cost_(&this_mv, mv_cost_params);
+    }
+  }
+}
+
+#define CHECK_BETTER                                                      \
+  if (thissad < bestsad) {                                                \
+    int tmp_thissad = thissad;                                            \
+    if (use_mvcost) thissad += mvsad_err_cost_(&this_mv, mv_cost_params); \
+    if (thissad < bestsad) {                                              \
+      raw_bestsad = tmp_thissad;                                          \
+      bestsad = thissad;                                                  \
+      best_site = i;                                                      \
+    }                                                                     \
+  }
+
+// Generic pattern search function that searches over multiple scales.
+// Each scale can have a different number of candidates and shape of +// candidates as indicated in the num_candidates and candidates arrays +// passed into this function +static int pattern_search( + FULLPEL_MV start_mv, const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, const int do_init_search, + const int num_candidates[MAX_PATTERN_SCALES], + const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES], + int *cost_list, FULLPEL_MV *best_mv) { + static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = { + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, + }; + int i, s, t; + + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const int ref_stride = ref->stride; + const int last_is_4 = num_candidates[0] == 4; + int br, bc; + int bestsad = INT_MAX, raw_bestsad = INT_MAX; + int thissad; + int k = -1; + const int use_mvcost = ms_params->mv_cost_params.mv_cost_type != MV_COST_NONE; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + assert(search_param < MAX_MVSEARCH_STEPS); + int best_init_s = search_param_to_steps[search_param]; + // adjust ref_mv to make sure it is within MV range + clamp_fullmv(&start_mv, &ms_params->mv_limits); + br = start_mv.row; + bc = start_mv.col; + if (cost_list != NULL) { + cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = + INT_MAX; + } + int costlist_has_sad = 0; + + // Work out the start point for the search + raw_bestsad = get_mvpred_sad(ms_params, src, + get_buf_from_fullmv(ref, &start_mv), ref_stride); + bestsad = raw_bestsad + mvsad_err_cost_(&start_mv, mv_cost_params); + + // Search all possible scales up to the search param around the center point + // pick the scale of the point that is best as the starting scale of + // further steps around it. + if (do_init_search) { + s = best_init_s; + best_init_s = -1; + for (t = 0; t <= s; ++t) { + int best_site = -1; + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << t)) { + for (i = 0; i < num_candidates[t]; i++) { + const FULLPEL_MV this_mv = { br + candidates[t][i].row, + bc + candidates[t][i].col }; + thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[t]; i++) { + const FULLPEL_MV this_mv = { br + candidates[t][i].row, + bc + candidates[t][i].col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) continue; + thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } + if (best_site == -1) { + continue; + } else { + best_init_s = t; + k = best_site; + } + } + if (best_init_s != -1) { + br += candidates[best_init_s][k].row; + bc += candidates[best_init_s][k].col; + } + } + + // If the center point is still the best, just skip this and move to + // the refinement step. 
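
Throughout the scale and refinement loops here, the CHECK_BETTER macro keeps
the inner comparison cheap: the motion-vector rate is only computed and added
for candidates whose raw SAD already beats the incumbent total. A minimal
standalone sketch of that early-out (mv_rate() is a hypothetical stand-in for
mvsad_err_cost_()):

#include <stdlib.h>

static int mv_rate(int drow, int dcol) { return (abs(drow) + abs(dcol)) >> 1; }

// Mirrors CHECK_BETTER: the rate term is only paid when the cheap raw-SAD
// test passes, and the candidate is accepted only if sad + rate still wins.
static void check_better(int this_sad, int drow, int dcol, int *best_cost,
                         int *best_raw_sad) {
  if (this_sad < *best_cost) {
    const int total = this_sad + mv_rate(drow, dcol);
    if (total < *best_cost) {
      *best_raw_sad = this_sad;
      *best_cost = total;
    }
  }
}

int main(void) {
  int best_cost = 100, best_raw_sad = 100;
  check_better(90, 4, 2, &best_cost, &best_raw_sad);  // 90 + 3 = 93: accepted
  return best_cost == 93 ? 0 : 1;
}
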
+ if (best_init_s != -1) { + const int last_s = (last_is_4 && cost_list != NULL); + int best_site = -1; + s = best_init_s; + + for (; s >= last_s; s--) { + // No need to search all points the 1st time if initial search was used + if (!do_init_search || s != best_init_s) { + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < num_candidates[s]; i++) { + const FULLPEL_MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + const FULLPEL_MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) + continue; + thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } + + if (best_site == -1) { + continue; + } else { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } + } + + do { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1; + + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const FULLPEL_MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } else { + for (i = 0; i < PATTERN_CANDIDATES_REF; i++) { + const FULLPEL_MV this_mv = { + br + candidates[s][next_chkpts_indices[i]].row, + bc + candidates[s][next_chkpts_indices[i]].col + }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) + continue; + thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + k = next_chkpts_indices[best_site]; + br += candidates[s][k].row; + bc += candidates[s][k].col; + } + } while (best_site != -1); + } + + // Note: If we enter the if below, then cost_list must be non-NULL. + if (s == 0) { + cost_list[0] = raw_bestsad; + costlist_has_sad = 1; + if (!do_init_search || s != best_init_s) { + if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) { + for (i = 0; i < num_candidates[s]; i++) { + const FULLPEL_MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + cost_list[i + 1] = thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } else { + for (i = 0; i < num_candidates[s]; i++) { + const FULLPEL_MV this_mv = { br + candidates[s][i].row, + bc + candidates[s][i].col }; + if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) + continue; + cost_list[i + 1] = thissad = get_mvpred_sad( + ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride); + CHECK_BETTER + } + } + + if (best_site != -1) { + br += candidates[s][best_site].row; + bc += candidates[s][best_site].col; + k = best_site; + } + } + while (best_site != -1) { + int next_chkpts_indices[PATTERN_CANDIDATES_REF]; + best_site = -1; + next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1; + next_chkpts_indices[1] = k; + next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 
0 : k + 1;
+        cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+        cost_list[((k + 2) % 4) + 1] = cost_list[0];
+        cost_list[0] = raw_bestsad;
+
+        if (check_bounds(&ms_params->mv_limits, br, bc, 1 << s)) {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const FULLPEL_MV this_mv = {
+              br + candidates[s][next_chkpts_indices[i]].row,
+              bc + candidates[s][next_chkpts_indices[i]].col
+            };
+            cost_list[next_chkpts_indices[i] + 1] = thissad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+            CHECK_BETTER
+          }
+        } else {
+          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+            const FULLPEL_MV this_mv = {
+              br + candidates[s][next_chkpts_indices[i]].row,
+              bc + candidates[s][next_chkpts_indices[i]].col
+            };
+            if (!av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) {
+              cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
+              continue;
+            }
+            cost_list[next_chkpts_indices[i] + 1] = thissad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &this_mv), ref_stride);
+            CHECK_BETTER
+          }
+        }
+
+        if (best_site != -1) {
+          k = next_chkpts_indices[best_site];
+          br += candidates[s][k].row;
+          bc += candidates[s][k].col;
+        }
+      }
+    }
+  }
+
+  best_mv->row = br;
+  best_mv->col = bc;
+
+  // Returns the one-away integer pel cost/sad around the best as follows:
+  // cost_list[0]: cost/sad at the best integer pel
+  // cost_list[1]: cost/sad at delta {0, -1} (left)   from the best integer pel
+  // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel
+  // cost_list[3]: cost/sad at delta { 0, 1} (right)  from the best integer pel
+  // cost_list[4]: cost/sad at delta {-1, 0} (top)    from the best integer pel
+  if (cost_list) {
+    if (USE_SAD_COSTLIST) {
+      calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad);
+    } else {
+      calc_int_cost_list(*best_mv, ms_params, cost_list);
+    }
+  }
+  best_mv->row = br;
+  best_mv->col = bc;
+
+  const int var_cost = get_mvpred_var_cost(ms_params, best_mv);
+  return var_cost;
+}
+#undef CHECK_BETTER
+
+// For the following foo_search, the input arguments are:
+// x: The struct used to hold a bunch of random configs.
+// start_mv: where we are starting our motion search
+// search_param: how many steps to skip in our motion search. For example,
+// a value 3 suggests that 3 search steps have already taken place prior to
+// this function call, so we jump directly to step 4 of the search process
+// sad_per_bit: a multiplier used to convert rate to sad cost
+// do_init_search: if on, do an initial search of all possible scales around
+// the start_mv, and then pick the best scale.
+// cost_list: used to hold the cost around the best full mv so we can use it
+// to speed up subpel search later.
+// vfp: a function pointer to the simd function so we can compute the cost +// efficiently +// ref_mv: the reference mv used to compute the mv cost +static int hex_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv) { + // First scale has 8-closest points, the rest have 6 points in hex shape + // at increasing scales + static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6 }; + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 }, + { -1, 0 } }, + { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } }, + { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } }, + { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } }, + { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } }, + { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 }, + { -32, 0 } }, + { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 }, + { -64, 0 } }, + { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 }, + { -128, 0 } }, + { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 }, + { -256, 0 } }, + { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 }, + { -512, 0 } }, + { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 }, + { -512, 1024 }, { -1024, 0 } }, + }; + /* clang-format on */ + return pattern_search(start_mv, ms_params, search_param, do_init_search, + hex_num_candidates, hex_candidates, cost_list, best_mv); +} + +static int bigdia_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv) { + // First scale has 4-closest points, the rest have 8 points in diamond + // shape at increasing scales + static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = { + 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const MV + bigdia_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } }, + { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 }, + { -1, 1 }, { -2, 0 } }, + { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 }, + { -2, 2 }, { -4, 0 } }, + { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 }, + { -4, 4 }, { -8, 0 } }, + { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 }, + { -8, 8 }, { -16, 0 } }, + { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 }, + { 0, 32 }, { -16, 16 }, { -32, 0 } }, + { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 }, + { 0, 64 }, { -32, 32 }, { -64, 0 } }, + { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 }, + { 0, 128 }, { -64, 64 }, { -128, 0 } }, + { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 }, + { 0, 256 }, { -128, 128 }, { -256, 0 } }, + { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 }, + { 0, 512 }, { -256, 256 }, { -512, 0 } }, + { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 }, + { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } }, + }; + /* clang-format on */ + return 
pattern_search(start_mv, ms_params, search_param, do_init_search, + bigdia_num_candidates, bigdia_candidates, cost_list, + best_mv); +} + +static int square_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv) { + // All scales have 8 closest points in square shape + static const int square_num_candidates[MAX_PATTERN_SCALES] = { + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + }; + // Note that the largest candidate step at each scale is 2^scale + /* clang-format off */ + static const MV + square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = { + { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, + { -1, 1 }, { -1, 0 } }, + { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 }, + { -2, 2 }, { -2, 0 } }, + { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 }, + { -4, 4 }, { -4, 0 } }, + { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 }, + { -8, 8 }, { -8, 0 } }, + { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 }, + { 0, 16 }, { -16, 16 }, { -16, 0 } }, + { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 }, + { 0, 32 }, { -32, 32 }, { -32, 0 } }, + { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 }, + { 0, 64 }, { -64, 64 }, { -64, 0 } }, + { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 }, + { 0, 128 }, { -128, 128 }, { -128, 0 } }, + { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 }, + { 0, 256 }, { -256, 256 }, { -256, 0 } }, + { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 }, + { 0, 512 }, { -512, 512 }, { -512, 0 } }, + { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 }, + { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } }, + }; + /* clang-format on */ + return pattern_search(start_mv, ms_params, search_param, do_init_search, + square_num_candidates, square_candidates, cost_list, + best_mv); +} + +static int fast_hex_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv) { + return hex_search(start_mv, ms_params, + AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param), + do_init_search, cost_list, best_mv); +} + +static int fast_dia_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, const int do_init_search, + int *cost_list, FULLPEL_MV *best_mv) { + return bigdia_search(start_mv, ms_params, + AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param), + do_init_search, cost_list, best_mv); +} + +static int diamond_search_sad(FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int search_param, int *num00, + FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) { + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + + const int ref_stride = ref->stride; + const uint8_t *best_address; + + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const uint8_t *mask = ms_params->ms_buffers.mask; + const uint8_t *second_pred = ms_params->ms_buffers.second_pred; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + + const search_site_config *cfg = ms_params->search_sites; + + unsigned int bestsad = INT_MAX; + int best_site = 0; + int is_off_center = 0; + + clamp_fullmv(&start_mv, &ms_params->mv_limits); + + // search_param determines the length 
of the initial step and hence the number + // of iterations. + const int tot_steps = cfg->ss_count - search_param; + + *num00 = 0; + *best_mv = start_mv; + + // Check the starting position + best_address = get_buf_from_fullmv(ref, &start_mv); + bestsad = get_mvpred_compound_sad(ms_params, src, best_address, ref_stride); + bestsad += mvsad_err_cost_(best_mv, &ms_params->mv_cost_params); + + int next_step_size = tot_steps > 2 ? cfg->radius[tot_steps - 2] : 1; + for (int step = tot_steps - 1; step >= 0; --step) { + const search_site *ss = cfg->ss[step]; + best_site = 0; + if (step > 0) next_step_size = cfg->radius[step - 1]; + + int all_in = 1, j; + // Trap illegal vectors + all_in &= best_mv->row + ss[1].mv.row >= ms_params->mv_limits.row_min; + all_in &= best_mv->row + ss[2].mv.row <= ms_params->mv_limits.row_max; + all_in &= best_mv->col + ss[3].mv.col >= ms_params->mv_limits.col_min; + all_in &= best_mv->col + ss[4].mv.col <= ms_params->mv_limits.col_max; + + // TODO(anyone): Implement 4 points search for msdf&sdaf + if (all_in && !mask && !second_pred) { + const uint8_t *src_buf = src->buf; + const int src_stride = src->stride; + for (int idx = 1; idx <= cfg->searches_per_step[step]; idx += 4) { + unsigned char const *block_offset[4]; + unsigned int sads[4]; + + for (j = 0; j < 4; j++) + block_offset[j] = ss[idx + j].offset + best_address; + + vfp->sdx4df(src_buf, src_stride, block_offset, ref_stride, sads); + for (j = 0; j < 4; j++) { + if (sads[j] < bestsad) { + const FULLPEL_MV this_mv = { best_mv->row + ss[idx + j].mv.row, + best_mv->col + ss[idx + j].mv.col }; + unsigned int thissad = + sads[j] + mvsad_err_cost_(&this_mv, mv_cost_params); + if (thissad < bestsad) { + bestsad = thissad; + best_site = idx + j; + } + } + } + } + } else { + for (int idx = 1; idx <= cfg->searches_per_step[step]; idx++) { + const FULLPEL_MV this_mv = { best_mv->row + ss[idx].mv.row, + best_mv->col + ss[idx].mv.col }; + + if (av1_is_fullmv_in_range(&ms_params->mv_limits, this_mv)) { + const uint8_t *const check_here = ss[idx].offset + best_address; + unsigned int thissad; + + thissad = + get_mvpred_compound_sad(ms_params, src, check_here, ref_stride); + + if (thissad < bestsad) { + thissad += mvsad_err_cost_(&this_mv, mv_cost_params); + if (thissad < bestsad) { + bestsad = thissad; + best_site = idx; + } + } + } + } + } + + if (best_site != 0) { + if (second_best_mv) { + *second_best_mv = *best_mv; + } + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + is_off_center = 1; + } + + if (is_off_center == 0) (*num00)++; + + if (best_site == 0) { + while (next_step_size == cfg->radius[step] && step > 2) { + ++(*num00); + --step; + next_step_size = cfg->radius[step - 1]; + } + } + } + + return bestsad; +} + +/* do_refine: If last step (1-away) of n-step search doesn't pick the center + point as the best match, we will do a final 1-away diamond + refining search */ +static int full_pixel_diamond(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, int *cost_list, + FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) { + const search_site_config *cfg = ms_params->search_sites; + int thissme, n, num00 = 0; + int bestsme = diamond_search_sad(start_mv, ms_params, step_param, &n, best_mv, + second_best_mv); + + if (bestsme < INT_MAX) { + bestsme = get_mvpred_compound_var_cost(ms_params, best_mv); + } + + // If there won't be more n-step search, check to see if refining search is + // needed. 
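
The interplay with num00 is easy to miss here: diamond_search_sad() reports,
via num00, how many consecutive step sizes left the centre as the best match,
and the loop below skips restarting the search at exactly those step sizes. A
toy illustration of the skip logic (the counts are made up):

#include <stdio.h>

int main(void) {
  const int further_steps = 5;  // stand-in for cfg->ss_count - 1 - step_param
  int n = 0;
  int num00 = 2;  // first pass reported two centre-best step sizes
  while (n < further_steps) {
    ++n;
    if (num00) {
      num00--;  // step size already proven centre-best; skip the restart
    } else {
      printf("restart diamond search at step_param + %d\n", n);
    }
  }
  return 0;
}
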
+ const int further_steps = cfg->ss_count - 1 - step_param; + while (n < further_steps) { + ++n; + + if (num00) { + num00--; + } else { + // TODO(chiyotsai@google.com): There is another bug here where the second + // best mv gets incorrectly overwritten. Fix it later. + FULLPEL_MV tmp_best_mv; + thissme = diamond_search_sad(start_mv, ms_params, step_param + n, &num00, + &tmp_best_mv, second_best_mv); + + if (thissme < INT_MAX) { + thissme = get_mvpred_compound_var_cost(ms_params, &tmp_best_mv); + } + + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = tmp_best_mv; + } + } + } + + // Return cost list. + if (cost_list) { + if (USE_SAD_COSTLIST) { + const int costlist_has_sad = 0; + calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad); + } else { + calc_int_cost_list(*best_mv, ms_params, cost_list); + } + } + return bestsme; +} + +// Exhaustive motion search around a given centre position with a given +// step size. +static int exhaustive_mesh_search(FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int range, const int step, + FULLPEL_MV *best_mv, + FULLPEL_MV *second_best_mv) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const struct buf_2d *const src = ms_params->ms_buffers.src; + const struct buf_2d *const ref = ms_params->ms_buffers.ref; + const int ref_stride = ref->stride; + unsigned int best_sad = INT_MAX; + int r, c, i; + int start_col, end_col, start_row, end_row; + int col_step = (step > 1) ? step : 4; + + assert(step >= 1); + + clamp_fullmv(&start_mv, &ms_params->mv_limits); + *best_mv = start_mv; + best_sad = get_mvpred_sad(ms_params, src, get_buf_from_fullmv(ref, &start_mv), + ref_stride); + best_sad += mvsad_err_cost_(&start_mv, mv_cost_params); + start_row = AOMMAX(-range, ms_params->mv_limits.row_min - start_mv.row); + start_col = AOMMAX(-range, ms_params->mv_limits.col_min - start_mv.col); + end_row = AOMMIN(range, ms_params->mv_limits.row_max - start_mv.row); + end_col = AOMMIN(range, ms_params->mv_limits.col_max - start_mv.col); + + for (r = start_row; r <= end_row; r += step) { + for (c = start_col; c <= end_col; c += col_step) { + // Step > 1 means we are not checking every location in this pass. 
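
Note col_step above: when step == 1 the column loop still advances four at a
time, because the else-branch below batches four adjacent columns per sdx4df
call. A standalone sketch of the traversal (range and step chosen arbitrarily):

#include <stdio.h>

int main(void) {
  const int range = 8, step = 4;
  const int col_step = (step > 1) ? step : 4;  // as in exhaustive_mesh_search
  int probes = 0;
  for (int r = -range; r <= range; r += step)
    for (int c = -range; c <= range; c += col_step) ++probes;
  printf("%d probes for range %d, step %d\n", probes, range, step);  // 25
  return 0;
}
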
+      if (step > 1) {
+        const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c };
+        unsigned int sad = get_mvpred_sad(
+            ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost_(&mv, mv_cost_params);
+          if (sad < best_sad) {
+            best_sad = sad;
+            if (second_best_mv) {
+              *second_best_mv = *best_mv;
+            }
+            *best_mv = mv;
+          }
+        }
+      } else {
+        // 4 sads in a single call if we are checking every location
+        if (c + 3 <= end_col) {
+          unsigned int sads[4];
+          const uint8_t *addrs[4];
+          for (i = 0; i < 4; ++i) {
+            const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+            addrs[i] = get_buf_from_fullmv(ref, &mv);
+          }
+          vfp->sdx4df(src->buf, src->stride, addrs, ref_stride, sads);
+
+          for (i = 0; i < 4; ++i) {
+            if (sads[i] < best_sad) {
+              const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+              const unsigned int sad =
+                  sads[i] + mvsad_err_cost_(&mv, mv_cost_params);
+              if (sad < best_sad) {
+                best_sad = sad;
+                if (second_best_mv) {
+                  *second_best_mv = *best_mv;
+                }
+                *best_mv = mv;
+              }
+            }
+          }
+        } else {
+          for (i = 0; i < end_col - c; ++i) {
+            const FULLPEL_MV mv = { start_mv.row + r, start_mv.col + c + i };
+            unsigned int sad = get_mvpred_sad(
+                ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride);
+            if (sad < best_sad) {
+              sad += mvsad_err_cost_(&mv, mv_cost_params);
+              if (sad < best_sad) {
+                best_sad = sad;
+                if (second_best_mv) {
+                  *second_best_mv = *best_mv;
+                }
+                *best_mv = mv;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return best_sad;
+}
+
+// Runs a limited-range exhaustive mesh search using a pattern set
+// according to the encode speed profile.
+static int full_pixel_exhaustive(const FULLPEL_MV start_mv,
+                                 const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 const struct MESH_PATTERN *const mesh_patterns,
+                                 int *cost_list, FULLPEL_MV *best_mv,
+                                 FULLPEL_MV *second_best_mv) {
+  const int kMinRange = 7;
+  const int kMaxRange = 256;
+  const int kMinInterval = 1;
+
+  int bestsme;
+  int i;
+  int interval = mesh_patterns[0].interval;
+  int range = mesh_patterns[0].range;
+  int baseline_interval_divisor;
+
+  *best_mv = start_mv;
+
+  // Trap illegal values for interval and range for this function.
+  if ((range < kMinRange) || (range > kMaxRange) || (interval < kMinInterval) ||
+      (interval > range))
+    return INT_MAX;
+
+  baseline_interval_divisor = range / interval;
+
+  // Check size of proposed first range against magnitude of the centre
+  // value used as a starting point.
+  range = AOMMAX(range, (5 * AOMMAX(abs(best_mv->row), abs(best_mv->col))) / 4);
+  range = AOMMIN(range, kMaxRange);
+  interval = AOMMAX(interval, range / baseline_interval_divisor);
+
+  // initial search
+  bestsme = exhaustive_mesh_search(*best_mv, ms_params, range, interval,
+                                   best_mv, second_best_mv);
+
+  if ((interval > kMinInterval) && (range > kMinRange)) {
+    // Progressive searches with range and step size decreasing each time
+    // till we reach a step size of 1. Then break out.
+    for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // First pass with coarser step and longer range
+      bestsme = exhaustive_mesh_search(
+          *best_mv, ms_params, mesh_patterns[i].range,
+          mesh_patterns[i].interval, best_mv, second_best_mv);
+
+      if (mesh_patterns[i].interval == 1) break;
+    }
+  }
+
+  if (bestsme < INT_MAX) {
+    bestsme = get_mvpred_var_cost(ms_params, best_mv);
+  }
+
+  // Return cost list.
+ if (cost_list) { + if (USE_SAD_COSTLIST) { + const int costlist_has_sad = 0; + calc_int_sad_list(*best_mv, ms_params, cost_list, costlist_has_sad); + } else { + calc_int_cost_list(*best_mv, ms_params, cost_list); + } + } + return bestsme; +} + +// This function is called when we do joint motion search in comp_inter_inter +// mode, or when searching for one component of an ext-inter compound mode. +int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const FULLPEL_MV start_mv, FULLPEL_MV *best_mv) { + static const search_neighbors neighbors[8] = { + { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 }, + { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 }, + { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 }, + { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 }, + { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 }, + { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 }, + { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 }, + { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 } + }; + + uint8_t do_refine_search_grid[SEARCH_GRID_STRIDE_8P * + SEARCH_GRID_STRIDE_8P] = { 0 }; + int grid_center = SEARCH_GRID_CENTER_8P; + int grid_coord = grid_center; + + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const FullMvLimits *mv_limits = &ms_params->mv_limits; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const struct buf_2d *src = ms_buffers->src; + const struct buf_2d *ref = ms_buffers->ref; + const int ref_stride = ref->stride; + + *best_mv = start_mv; + clamp_fullmv(best_mv, mv_limits); + + unsigned int best_sad = get_mvpred_compound_sad( + ms_params, src, get_buf_from_fullmv(ref, best_mv), ref_stride); + best_sad += mvsad_err_cost_(best_mv, mv_cost_params); + + do_refine_search_grid[grid_coord] = 1; + + for (int i = 0; i < SEARCH_RANGE_8P; ++i) { + int best_site = -1; + + for (int j = 0; j < 8; ++j) { + grid_coord = grid_center + neighbors[j].coord_offset; + if (do_refine_search_grid[grid_coord] == 1) { + continue; + } + const FULLPEL_MV mv = { best_mv->row + neighbors[j].coord.row, + best_mv->col + neighbors[j].coord.col }; + + do_refine_search_grid[grid_coord] = 1; + if (av1_is_fullmv_in_range(mv_limits, mv)) { + unsigned int sad; + sad = get_mvpred_compound_sad( + ms_params, src, get_buf_from_fullmv(ref, &mv), ref_stride); + if (sad < best_sad) { + sad += mvsad_err_cost_(&mv, mv_cost_params); + + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + best_mv->row += neighbors[best_site].coord.row; + best_mv->col += neighbors[best_site].coord.col; + grid_center += neighbors[best_site].coord_offset; + } + } + return best_sad; +} + +int av1_full_pixel_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, int *cost_list, + FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv) { + const BLOCK_SIZE bsize = ms_params->bsize; + const SEARCH_METHODS search_method = ms_params->search_method; + + const int is_intra_mode = ms_params->is_intra_mode; + int run_mesh_search = ms_params->run_mesh_search; + + int var = 0; + MARK_MV_INVALID(best_mv); + if (second_best_mv) { + MARK_MV_INVALID(second_best_mv); + } + + assert(ms_params->ms_buffers.second_pred == NULL && + ms_params->ms_buffers.mask == NULL && + "av1_full_pixel_search does not support compound pred"); + + if (cost_list) { + cost_list[0] = INT_MAX; + cost_list[1] = INT_MAX; + cost_list[2] = INT_MAX; + cost_list[3] = INT_MAX; + cost_list[4] = INT_MAX; + } + + switch (search_method) { + case FAST_DIAMOND: + var = 
fast_dia_search(start_mv, ms_params, step_param, 0, cost_list,
+                            best_mv);
+      break;
+    case FAST_HEX:
+      var = fast_hex_search(start_mv, ms_params, step_param, 0, cost_list,
+                            best_mv);
+      break;
+    case HEX:
+      var = hex_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+      break;
+    case SQUARE:
+      var =
+          square_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+      break;
+    case BIGDIA:
+      var =
+          bigdia_search(start_mv, ms_params, step_param, 1, cost_list, best_mv);
+      break;
+    case NSTEP:
+    case DIAMOND:
+      var = full_pixel_diamond(start_mv, ms_params, step_param, cost_list,
+                               best_mv, second_best_mv);
+      break;
+    default: assert(0 && "Invalid search method.");
+  }
+
+  // Should we allow a follow-on exhaustive search?
+  if (!run_mesh_search && search_method == NSTEP) {
+    int exhaustive_thr = ms_params->force_mesh_thresh;
+    exhaustive_thr >>=
+        10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+    // Threshold variance for an exhaustive full search.
+    if (var > exhaustive_thr) run_mesh_search = 1;
+  }
+
+  // TODO(yunqing): the following is used to reduce mesh search in temporal
+  // filtering. Can extend it to intrabc.
+  if (!is_intra_mode && ms_params->prune_mesh_search) {
+    const int full_pel_mv_diff = AOMMAX(abs(start_mv.row - best_mv->row),
+                                        abs(start_mv.col - best_mv->col));
+    if (full_pel_mv_diff <= 4) {
+      run_mesh_search = 0;
+    }
+  }
+
+  if (run_mesh_search) {
+    int var_ex;
+    FULLPEL_MV tmp_mv_ex;
+    // Pick the mesh pattern for exhaustive search based on the toolset
+    // (intraBC or non-intraBC).
+    // TODO(chiyotsai@google.com): There is a bug here where the second best mv
+    // gets overwritten without actually comparing the rdcost.
+    const MESH_PATTERN *const mesh_patterns =
+        ms_params->mesh_patterns[is_intra_mode];
+    // TODO(chiyotsai@google.com): the second best mv is not set correctly by
+    // full_pixel_exhaustive, which can incorrectly override it.
+ var_ex = full_pixel_exhaustive(*best_mv, ms_params, mesh_patterns, + cost_list, &tmp_mv_ex, second_best_mv); + if (var_ex < var) { + var = var_ex; + *best_mv = tmp_mv_ex; + } + } + + return var; +} + +int av1_intrabc_hash_search(const AV1_COMP *cpi, const MACROBLOCKD *xd, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + IntraBCHashInfo *intrabc_hash_info, + FULLPEL_MV *best_mv) { + if (!av1_use_hash_me(cpi)) return INT_MAX; + + const BLOCK_SIZE bsize = ms_params->bsize; + const int block_width = block_size_wide[bsize]; + const int block_height = block_size_high[bsize]; + + if (block_width != block_height) return INT_MAX; + + const FullMvLimits *mv_limits = &ms_params->mv_limits; + const MSBuffers *ms_buffer = &ms_params->ms_buffers; + + const uint8_t *src = ms_buffer->src->buf; + const int src_stride = ms_buffer->src->stride; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int x_pos = mi_col * MI_SIZE; + const int y_pos = mi_row * MI_SIZE; + + uint32_t hash_value1, hash_value2; + int best_hash_cost = INT_MAX; + + // for the hashMap + hash_table *ref_frame_hash = &intrabc_hash_info->intrabc_hash_table; + + av1_get_block_hash_value(intrabc_hash_info, src, src_stride, block_width, + &hash_value1, &hash_value2, is_cur_buf_hbd(xd)); + + const int count = av1_hash_table_count(ref_frame_hash, hash_value1); + if (count <= 1) { + return INT_MAX; + } + + Iterator iterator = av1_hash_get_first_iterator(ref_frame_hash, hash_value1); + for (int i = 0; i < count; i++, aom_iterator_increment(&iterator)) { + block_hash ref_block_hash = *(block_hash *)(aom_iterator_get(&iterator)); + if (hash_value2 == ref_block_hash.hash_value2) { + // Make sure the prediction is from valid area. + const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos), + GET_MV_SUBPEL(ref_block_hash.x - x_pos) }; + if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize, + cpi->common.seq_params.mib_size_log2)) + continue; + + FULLPEL_MV hash_mv; + hash_mv.col = ref_block_hash.x - x_pos; + hash_mv.row = ref_block_hash.y - y_pos; + if (!av1_is_fullmv_in_range(mv_limits, hash_mv)) continue; + const int refCost = get_mvpred_var_cost(ms_params, &hash_mv); + if (refCost < best_hash_cost) { + best_hash_cost = refCost; + *best_mv = hash_mv; + } + } + } + + return best_hash_cost; +} + +static int vector_match(int16_t *ref, int16_t *src, int bwl) { + int best_sad = INT_MAX; + int this_sad; + int d; + int center, offset = 0; + int bw = 4 << bwl; // redundant variable, to be changed in the experiments. 
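
vector_match() is a coarse-to-fine 1-D search: the stride-16 scan just below
picks a coarse offset, and the later loops refine it with probe pairs at +/-8,
+/-4, +/-2 and +/-1 around the running best. A compact standalone equivalent
of the refinement stages (toy_cost() is a hypothetical stand-in for
aom_vector_var() on the projected samples):

#include <stdio.h>
#include <stdlib.h>

static int toy_cost(int pos) { return abs(pos - 21); }  // minimum at pos 21

// Each stage probes +/-radius around the previous stage's best position.
static int refine_1d(int (*cost)(int), int center, int best, int bw) {
  static const int radii[] = { 8, 4, 2, 1 };
  for (int i = 0; i < 4; ++i) {
    const int base = center;
    for (int d = -radii[i]; d <= radii[i]; d += 2 * radii[i]) {
      const int pos = base + d;
      if (pos < 0 || pos > bw) continue;  // stay inside the projection
      const int c = cost(pos);
      if (c < best) {
        best = c;
        center = pos;
      }
    }
  }
  return center - (bw >> 1);  // offset relative to the block centre
}

int main(void) {
  // With bw = 32, suppose the coarse scan stopped at position 16.
  printf("offset = %d\n", refine_1d(toy_cost, 16, toy_cost(16), 32));  // 5
  return 0;
}
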
+ for (d = 0; d <= bw; d += 16) { + this_sad = aom_vector_var(&ref[d], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + offset = d; + } + } + center = offset; + + for (d = -8; d <= 8; d += 16) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -4; d <= 4; d += 8) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -2; d <= 2; d += 4) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -1; d <= 1; d += 2) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + + return (center - (bw >> 1)); +} + +// A special fast version of motion search used in rt mode +unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col, const MV *ref_mv) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mi = xd->mi[0]; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; + DECLARE_ALIGNED(16, int16_t, hbuf[256]); + DECLARE_ALIGNED(16, int16_t, vbuf[256]); + DECLARE_ALIGNED(16, int16_t, src_hbuf[128]); + DECLARE_ALIGNED(16, int16_t, src_vbuf[128]); + int idx; + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + const int search_width = bw << 1; + const int search_height = bh << 1; + const int src_stride = x->plane[0].src.stride; + const int ref_stride = xd->plane[0].pre[0].stride; + uint8_t const *ref_buf, *src_buf; + int_mv *best_int_mv = &xd->mi[0]->mv[0]; + unsigned int best_sad, tmp_sad, this_sad[4]; + const int norm_factor = 3 + (bw >> 5); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]); + static const MV search_pos[4] = { + { -1, 0 }, + { 0, -1 }, + { 0, 1 }, + { 1, 0 }, + }; + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; + av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL, + MAX_MB_PLANE); + } + + if (xd->bd != 8) { + unsigned int sad; + best_int_mv->as_fullmv = kZeroFullMv; + sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + return sad; + } + + // Set up prediction 1-D reference set + ref_buf = xd->plane[0].pre[0].buf - (bw >> 1); + for (idx = 0; idx < search_width; idx += 16) { + aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh); + ref_buf += 16; + } + + ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride; + for (idx = 0; idx < search_height; ++idx) { + vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor; + ref_buf += ref_stride; + } + + // Set up src 1-D reference set + for (idx = 0; idx < bw; idx += 16) { + src_buf = x->plane[0].src.buf + idx; + aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh); + } + + src_buf = x->plane[0].src.buf; + for (idx = 0; idx < bh; ++idx) { + src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor; + src_buf += src_stride; + } + + // Find the best match per 1-D search + best_int_mv->as_fullmv.col = + vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize]); + best_int_mv->as_fullmv.row = + vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize]); + + FULLPEL_MV this_mv = best_int_mv->as_fullmv; + src_buf = x->plane[0].src.buf; + ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); + best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + + { + const uint8_t *const pos[4] = { + ref_buf - ref_stride, + ref_buf - 1, + ref_buf + 1, + ref_buf + ref_stride, + }; + + cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); + } + + for (idx = 0; idx < 4; ++idx) { + if (this_sad[idx] < best_sad) { + best_sad = this_sad[idx]; + best_int_mv->as_fullmv.row = search_pos[idx].row + this_mv.row; + best_int_mv->as_fullmv.col = search_pos[idx].col + this_mv.col; + } + } + + if (this_sad[0] < this_sad[3]) + this_mv.row -= 1; + else + this_mv.row += 1; + + if (this_sad[1] < this_sad[2]) + this_mv.col -= 1; + else + this_mv.col += 1; + + ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv); + + tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + if (best_sad > tmp_sad) { + best_int_mv->as_fullmv = this_mv; + best_sad = tmp_sad; + } + + convert_fullmv_to_mv(best_int_mv); + + SubpelMvLimits subpel_mv_limits; + av1_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + clamp_mv(&best_int_mv->as_mv, &subpel_mv_limits); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + + return best_sad; +} + +// ============================================================================= +// Fullpixel Motion Search: OBMC +// ============================================================================= +static INLINE int get_obmc_mvpred_var( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV *this_mv) { + const aom_variance_fn_ptr_t *vfp = ms_params->vfp; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const struct buf_2d *ref_buf = ms_buffers->ref; + + const MV mv = 
get_mv_from_fullmv(this_mv); + unsigned int unused; + + return vfp->ovf(get_buf_from_fullmv(ref_buf, this_mv), ref_buf->stride, wsrc, + mask, &unused) + + mv_err_cost_(&mv, mv_cost_params); +} + +static int obmc_refining_search_sad( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV *best_mv) { + const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const struct buf_2d *ref_buf = ms_buffers->ref; + const FULLPEL_MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + const int kSearchRange = 8; + + unsigned int best_sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, best_mv), + ref_buf->stride, wsrc, mask) + + mvsad_err_cost_(best_mv, mv_cost_params); + + for (int i = 0; i < kSearchRange; i++) { + int best_site = -1; + + for (int j = 0; j < 4; j++) { + const FULLPEL_MV mv = { best_mv->row + neighbors[j].row, + best_mv->col + neighbors[j].col }; + if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) { + unsigned int sad = fn_ptr->osdf(get_buf_from_fullmv(ref_buf, &mv), + ref_buf->stride, wsrc, mask); + if (sad < best_sad) { + sad += mvsad_err_cost_(&mv, mv_cost_params); + + if (sad < best_sad) { + best_sad = sad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + best_mv->row += neighbors[best_site].row; + best_mv->col += neighbors[best_site].col; + } + } + return best_sad; +} + +static int obmc_diamond_search_sad( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, FULLPEL_MV start_mv, + FULLPEL_MV *best_mv, int search_param, int *num00) { + const aom_variance_fn_ptr_t *fn_ptr = ms_params->vfp; + const search_site_config *cfg = ms_params->search_sites; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const MSBuffers *ms_buffers = &ms_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const struct buf_2d *const ref_buf = ms_buffers->ref; + // search_param determines the length of the initial step and hence the number + // of iterations + // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = + // (MAX_FIRST_STEP/4) pel... etc. 
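+  // num00 counts step levels that finish with the best mv still at the
+  // starting position; the caller uses it to skip re-running the search at
+  // those finer scales.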
+ + const int tot_steps = MAX_MVSEARCH_STEPS - 1 - search_param; + const uint8_t *best_address, *init_ref; + int best_sad = INT_MAX; + int best_site = 0; + int step; + + clamp_fullmv(&start_mv, &ms_params->mv_limits); + best_address = init_ref = get_buf_from_fullmv(ref_buf, &start_mv); + *num00 = 0; + *best_mv = start_mv; + + // Check the starting position + best_sad = fn_ptr->osdf(best_address, ref_buf->stride, wsrc, mask) + + mvsad_err_cost_(best_mv, mv_cost_params); + + for (step = tot_steps; step >= 0; --step) { + const search_site *const ss = cfg->ss[step]; + best_site = 0; + for (int idx = 1; idx <= cfg->searches_per_step[step]; ++idx) { + const FULLPEL_MV mv = { best_mv->row + ss[idx].mv.row, + best_mv->col + ss[idx].mv.col }; + if (av1_is_fullmv_in_range(&ms_params->mv_limits, mv)) { + int sad = fn_ptr->osdf(best_address + ss[idx].offset, ref_buf->stride, + wsrc, mask); + if (sad < best_sad) { + sad += mvsad_err_cost_(&mv, mv_cost_params); + + if (sad < best_sad) { + best_sad = sad; + best_site = idx; + } + } + } + } + + if (best_site != 0) { + best_mv->row += ss[best_site].mv.row; + best_mv->col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + } else if (best_address == init_ref) { + (*num00)++; + } + } + return best_sad; +} + +static int obmc_full_pixel_diamond( + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, const FULLPEL_MV start_mv, + int step_param, int do_refine, FULLPEL_MV *best_mv) { + const search_site_config *cfg = ms_params->search_sites; + FULLPEL_MV tmp_mv; + int thissme, n, num00 = 0; + int bestsme = + obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, step_param, &n); + if (bestsme < INT_MAX) bestsme = get_obmc_mvpred_var(ms_params, &tmp_mv); + *best_mv = tmp_mv; + + // If there won't be more n-step search, check to see if refining search is + // needed. + const int further_steps = cfg->ss_count - 1 - step_param; + if (n > further_steps) do_refine = 0; + + while (n < further_steps) { + ++n; + + if (num00) { + num00--; + } else { + thissme = obmc_diamond_search_sad(ms_params, start_mv, &tmp_mv, + step_param + n, &num00); + if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv); + + // check to see if refining search is needed. 
+ if (num00 > further_steps - n) do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = tmp_mv; + } + } + } + + // final 1-away diamond refining search + if (do_refine) { + tmp_mv = *best_mv; + thissme = obmc_refining_search_sad(ms_params, &tmp_mv); + if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, &tmp_mv); + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = tmp_mv; + } + } + return bestsme; +} + +int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, FULLPEL_MV *best_mv) { + if (!ms_params->fast_obmc_search) { + const int do_refine = 1; + const int bestsme = obmc_full_pixel_diamond(ms_params, start_mv, step_param, + do_refine, best_mv); + return bestsme; + } else { + *best_mv = start_mv; + clamp_fullmv(best_mv, &ms_params->mv_limits); + int thissme = obmc_refining_search_sad(ms_params, best_mv); + if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(ms_params, best_mv); + return thissme; + } +} + +// ============================================================================= +// Subpixel Motion Search: Translational +// ============================================================================= +#define INIT_SUBPEL_STEP_SIZE (4) +/* + * To avoid the penalty for crossing cache-line read, preload the reference + * area in a small buffer, which is aligned to make sure there won't be crossing + * cache-line read while reading from this buffer. This reduced the cpu + * cycles spent on reading ref data in sub-pixel filter functions. + * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x + * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we + * could reduce the area. + */ + +// Returns the subpel offset used by various subpel variance functions [m]sv[a]f +static INLINE int get_subpel_part(int x) { return x & 7; } + +// Gets the address of the ref buffer at subpel location (r, c), rounded to the +// nearest fullpel precision toward - \infty + +static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, + const MV mv) { + const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3); + return &buf->buf[offset]; +} + +// Estimates the variance of prediction residue using bilinear filter for fast +// search. +static INLINE int estimated_pref_error( + const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const uint8_t *src = ms_buffers->src->buf; + const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); + const int src_stride = ms_buffers->src->stride; + const int ref_stride = ms_buffers->ref->stride; + const uint8_t *second_pred = ms_buffers->second_pred; + const uint8_t *mask = ms_buffers->mask; + const int mask_stride = ms_buffers->mask_stride; + const int invert_mask = ms_buffers->inv_mask; + + const int subpel_x_q3 = get_subpel_part(this_mv->col); + const int subpel_y_q3 = get_subpel_part(this_mv->row); + + if (second_pred == NULL) { + return vfp->svf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + sse); + } else if (mask) { + return vfp->msvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + second_pred, mask, mask_stride, invert_mask, sse); + } else { + return vfp->svaf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, src_stride, + sse, second_pred); + } +} + +// Calculates the variance of prediction residue. 
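+// Unlike estimated_pref_error() above, this builds the exact sub-pixel
+// prediction (via the aom_*upsampled_pred helpers, honoring
+// subpel_search_type) before measuring the variance: slower, but accurate.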
+static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm, + const MV *this_mv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const uint8_t *src = ms_buffers->src->buf; + const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); + const int src_stride = ms_buffers->src->stride; + const int ref_stride = ms_buffers->ref->stride; + const uint8_t *second_pred = ms_buffers->second_pred; + const uint8_t *mask = ms_buffers->mask; + const int mask_stride = ms_buffers->mask_stride; + const int invert_mask = ms_buffers->inv_mask; + const int w = var_params->w; + const int h = var_params->h; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int subpel_x_q3 = get_subpel_part(this_mv->col); + const int subpel_y_q3 = get_subpel_part(this_mv->row); + + unsigned int besterr; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); + uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16); + if (second_pred != NULL) { + if (mask) { + aom_highbd_comp_mask_upsampled_pred( + xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride, + invert_mask, xd->bd, subpel_search_type); + } else { + aom_highbd_comp_avg_upsampled_pred( + xd, cm, mi_row, mi_col, this_mv, pred8, second_pred, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd, + subpel_search_type); + } + } else { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + xd->bd, subpel_search_type); + } + besterr = vfp->vf(pred8, w, src, src_stride, sse); + } else { + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); + if (second_pred != NULL) { + if (mask) { + aom_comp_mask_upsampled_pred( + xd, cm, mi_row, mi_col, this_mv, pred, second_pred, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, mask, mask_stride, + invert_mask, subpel_search_type); + } else { + aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, + second_pred, w, h, subpel_x_q3, subpel_y_q3, + ref, ref_stride, subpel_search_type); + } + } else { + aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search_type); + } + + besterr = vfp->vf(pred, w, src, src_stride, sse); + } +#else + DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]); + if (second_pred != NULL) { + if (mask) { + aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, + second_pred, w, h, subpel_x_q3, subpel_y_q3, + ref, ref_stride, mask, mask_stride, + invert_mask, subpel_search_type); + } else { + aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, + second_pred, w, h, subpel_x_q3, subpel_y_q3, + ref, ref_stride, subpel_search_type); + } + } else { + aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3, + subpel_y_q3, ref, ref_stride, subpel_search_type); + } + + besterr = vfp->vf(pred, w, src, src_stride, sse); +#endif + return besterr; +} + +// Estimates whether this_mv is better than best_mv. This function incorporates +// both prediction error and residue into account. It is suffixed "fast" because +// it uses bilinear filter to estimate the prediction. 
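+// The total cost compared against *besterr is thismse plus the rate cost of
+// signaling the mv (mv_err_cost_); best_mv and the output stats are updated
+// only when that total improves.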
+static INLINE unsigned int check_better_fast( + const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int *has_better_mv) { + unsigned int cost; + if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { + unsigned int sse; + int thismse = estimated_pref_error(this_mv, var_params, &sse); + cost = mv_err_cost_(this_mv, mv_cost_params); + cost += thismse; + + if (cost < *besterr) { + *besterr = cost; + *best_mv = *this_mv; + *distortion = thismse; + *sse1 = sse; + *has_better_mv |= 1; + } + } else { + cost = INT_MAX; + } + return cost; +} + +// Checks whether this_mv is better than best_mv. This function incorporates +// both prediction error and residue into account. +static AOM_FORCE_INLINE unsigned int check_better( + MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv, + const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int *is_better) { + unsigned int cost; + if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { + unsigned int sse; + int thismse; + thismse = upsampled_pref_error(xd, cm, this_mv, var_params, &sse); + cost = mv_err_cost_(this_mv, mv_cost_params); + cost += thismse; + if (cost < *besterr) { + *besterr = cost; + *best_mv = *this_mv; + *distortion = thismse; + *sse1 = sse; + *is_better |= 1; + } + } else { + cost = INT_MAX; + } + return cost; +} + +static INLINE MV get_best_diag_step(int step_size, unsigned int left_cost, + unsigned int right_cost, + unsigned int up_cost, + unsigned int down_cost) { + const MV diag_step = { up_cost <= down_cost ? -step_size : step_size, + left_cost <= right_cost ? -step_size : step_size }; + + return diag_step; +} + +// Searches the four cardinal direction for a better mv, then follows up with a +// search in the best quadrant. This uses bilinear filter to speed up the +// calculation. 
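+// The single diagonal probe combines the cheaper of left/right with the
+// cheaper of up/down, as picked by get_best_diag_step().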
+static AOM_FORCE_INLINE MV first_level_check_fast( + const MV this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + // Check the four cardinal directions + const MV left_mv = { this_mv.row, this_mv.col - hstep }; + int dummy = 0; + const unsigned int left = + check_better_fast(&left_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV right_mv = { this_mv.row, this_mv.col + hstep }; + const unsigned int right = + check_better_fast(&right_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV top_mv = { this_mv.row - hstep, this_mv.col }; + const unsigned int up = + check_better_fast(&top_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy); + + const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; + const unsigned int down = + check_better_fast(&bottom_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + check_better_fast(&diag_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy); + + return diag_step; +} + +// Performs a following up search after first_level_check_fast is called. This +// performs two extra chess pattern searches in the best quadrant. +static AOM_FORCE_INLINE void second_level_check_fast( + const MV this_mv, const MV diag_step, MV *best_mv, int hstep, + const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + assert(diag_step.row == hstep || diag_step.row == -hstep); + assert(diag_step.col == hstep || diag_step.col == -hstep); + const int tr = this_mv.row; + const int tc = this_mv.col; + const int br = best_mv->row; + const int bc = best_mv->col; + int dummy = 0; + if (tr != br && tc != bc) { + assert(diag_step.col == bc - tc); + assert(diag_step.row == br - tr); + const MV chess_mv_1 = { br, bc + diag_step.col }; + const MV chess_mv_2 = { br + diag_step.row, bc }; + check_better_fast(&chess_mv_1, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + check_better_fast(&chess_mv_2, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + } else if (tr == br && tc != bc) { + assert(diag_step.col == bc - tc); + // Continue searching in the best direction + const MV bottom_long_mv = { br + hstep, bc + diag_step.col }; + const MV top_long_mv = { br - hstep, bc + diag_step.col }; + check_better_fast(&bottom_long_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + check_better_fast(&top_long_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + // Search in the direction opposite of the best quadrant + const MV rev_mv = { br - diag_step.row, bc }; + check_better_fast(&rev_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy); + } else if (tr != br && tc == bc) { + assert(diag_step.row == br - tr); + // Continue searching in the best direction + const MV right_long_mv = { br + diag_step.row, bc + 
hstep }; + const MV left_long_mv = { br + diag_step.row, bc - hstep }; + check_better_fast(&right_long_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + check_better_fast(&left_long_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + // Search in the direction opposite of the best quadrant + const MV rev_mv = { br, bc - diag_step.col }; + check_better_fast(&rev_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy); + } +} + +// Combines first level check and second level check when applicable. This first +// searches the four cardinal directions, and perform several +// diagonal/chess-pattern searches in the best quadrant. +static AOM_FORCE_INLINE void two_level_checks_fast( + const MV this_mv, MV *best_mv, int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int iters) { + const MV diag_step = + first_level_check_fast(this_mv, best_mv, hstep, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion); + if (iters > 1) { + second_level_check_fast(this_mv, diag_step, best_mv, hstep, mv_limits, + var_params, mv_cost_params, besterr, sse1, + distortion); + } +} + +static AOM_FORCE_INLINE MV +first_level_check(MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, + MV *best_mv, const int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + int dummy = 0; + const MV left_mv = { this_mv.row, this_mv.col - hstep }; + const MV right_mv = { this_mv.row, this_mv.col + hstep }; + const MV top_mv = { this_mv.row - hstep, this_mv.col }; + const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; + + const unsigned int left = + check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int right = + check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int up = + check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int down = + check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, mv_cost_params, + besterr, sse1, distortion, &dummy); + + return diag_step; +} + +// A newer version of second level check that gives better quality. 
+// TODO(chiyotsai@google.com): evaluate this on subpel_search_types different +// from av1_find_best_sub_pixel_tree +static AOM_FORCE_INLINE void second_level_check_v2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step, + MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + assert(best_mv->row == this_mv.row + diag_step.row || + best_mv->col == this_mv.col + diag_step.col); + if (CHECK_MV_EQUAL(this_mv, *best_mv)) { + return; + } else if (this_mv.row == best_mv->row) { + // Search away from diagonal step since diagonal search did not provide any + // improvement + diag_step.row *= -1; + } else if (this_mv.col == best_mv->col) { + diag_step.col *= -1; + } + + const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col }; + const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col }; + const MV diag_bias_mv = { best_mv->row + diag_step.row, + best_mv->col + diag_step.col }; + int has_better_mv = 0; + + if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { + check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv); + check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &has_better_mv); + } + } else { + check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + } + } +} + +// Gets the error at the beginning when the mv has fullpel precision +static unsigned int setup_center_error( + const MACROBLOCKD *xd, const MV *bestmv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + const int w = var_params->w; + const int h = var_params->h; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const uint8_t *src = ms_buffers->src->buf; + const uint8_t *y = get_buf_from_mv(ms_buffers->ref, *bestmv); + const int src_stride = ms_buffers->src->stride; + const int y_stride = ms_buffers->ref->stride; + const uint8_t *second_pred = ms_buffers->second_pred; + const uint8_t *mask = ms_buffers->mask; + const int mask_stride = ms_buffers->mask_stride; + const int invert_mask = ms_buffers->inv_mask; + + unsigned int besterr; + + if (second_pred != NULL) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); + uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16); + if (mask) { + aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, + mask, mask_stride, invert_mask); + } else { + aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + } + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); + } else { + DECLARE_ALIGNED(16, 
uint8_t, comp_pred[MAX_SB_SQUARE]); + if (mask) { + aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask, + mask_stride, invert_mask); + } else { + aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + } + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); + } +#else + (void)xd; + DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]); + if (mask) { + aom_comp_mask_pred(comp_pred, second_pred, w, h, y, y_stride, mask, + mask_stride, invert_mask); + } else { + aom_comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + } + besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); +#endif + } else { + besterr = vfp->vf(y, y_stride, src, src_stride, sse1); + } + *distortion = besterr; + besterr += mv_err_cost_(bestmv, mv_cost_params); + return besterr; +} + +// Gets the error at the beginning when the mv has fullpel precision +static unsigned int upsampled_setup_center_error( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *bestmv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { + unsigned int besterr = upsampled_pref_error(xd, cm, bestmv, var_params, sse1); + *distortion = besterr; + besterr += mv_err_cost_(bestmv, mv_cost_params); + return besterr; +} + +static INLINE int divide_and_round(int n, int d) { + return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); +} + +static INLINE int is_cost_list_wellbehaved(const int *cost_list) { + return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] && + cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4]; +} + +// Returns surface minima estimate at given precision in 1/2^n bits. +// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C +// For a given set of costs S0, S1, S2, S3, S4 at points +// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively, +// the solution for the location of the minima (x0, y0) is given by: +// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0), +// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). +// The code below is an integerized version of that. +static AOM_INLINE void get_cost_surf_min(const int *cost_list, int *ir, int *ic, + int bits) { + *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)), + (cost_list[1] - 2 * cost_list[0] + cost_list[3])); + *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)), + (cost_list[4] - 2 * cost_list[0] + cost_list[2])); +} + +// Checks the list of mvs searched in the last iteration and see if we are +// repeating it. If so, return 1. Otherwise we update the last_mv_search_list +// with current_mv and return 0. 
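+// In the callers below, hitting a repeated mv aborts the search with INT_MAX,
+// which appears intended to avoid retracing a search already performed.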
+static INLINE int check_repeated_mv_and_update(int_mv *last_mv_search_list,
+                                               const MV current_mv, int iter) {
+  if (last_mv_search_list) {
+    if (CHECK_MV_EQUAL(last_mv_search_list[iter].as_mv, current_mv)) {
+      return 1;
+    }
+
+    last_mv_search_list[iter].as_mv = current_mv;
+  }
+  return 0;
+}
+
+int av1_find_best_sub_pixel_tree_pruned_evenmore(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
+    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)cm;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching for. Iter 0 corresponds to
+  // fullpel mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                               distortion);
+
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+    int ir, ic;
+    int dummy = 0;
+    get_cost_surf_min(cost_list, &ir, &ic, 2);
+    if (ir != 0 || ic != 0) {
+      const MV this_mv = { start_mv.row + 2 * ir, start_mv.col + 2 * ic };
+      check_better_fast(&this_mv, bestmv, mv_limits, var_params, mv_cost_params,
+                        &besterr, sse1, distortion, &dummy);
+    }
+  } else {
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+
+    // Each subsequent iteration checks at least one point (two, if the
+    // diagonal was selected) in common with the last iteration before moving
+    // to the next, finer precision.
+    if (forced_stop != HALF_PEL) {
+      if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+        return INT_MAX;
+      }
+      iter++;
+
+      hstep >>= 1;
+      start_mv = *bestmv;
+      two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                            mv_cost_params, &besterr, sse1, distortion,
+                            iters_per_step);
+    }
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned_more(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
+    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)cm;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are 
currently searching for. Iter 0 corresponds to
+  // fullpel mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                               distortion);
+
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+    int ir, ic;
+    get_cost_surf_min(cost_list, &ir, &ic, 1);
+    if (ir != 0 || ic != 0) {
+      const MV this_mv = { start_mv.row + ir * hstep,
+                           start_mv.col + ic * hstep };
+      int dummy = 0;
+      check_better_fast(&this_mv, bestmv, mv_limits, var_params, mv_cost_params,
+                        &besterr, sse1, distortion, &dummy);
+    }
+  } else {
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  // Each subsequent iteration checks at least one point (two, if the
+  // diagonal was selected) in common with the last iteration before moving
+  // to the next, finer precision.
+  if (forced_stop != HALF_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  return besterr;
+}
+
+int av1_find_best_sub_pixel_tree_pruned(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm,
+    const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv,
+    int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) {
+  (void)cm;
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const int *cost_list = ms_params->cost_list;
+  const SubpelMvLimits *mv_limits = &ms_params->mv_limits;
+  const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params;
+  const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params;
+
+  // The iteration we are currently searching for. Iter 0 corresponds to
+  // fullpel mv, iter 1 to half pel, and so on
+  int iter = 0;
+  int hstep = INIT_SUBPEL_STEP_SIZE;  // Step size, initialized to 4/8=1/2 pel
+  unsigned int besterr = INT_MAX;
+  *bestmv = start_mv;
+
+  besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1,
+                               distortion);
+  if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+    return INT_MAX;
+  }
+  iter++;
+
+  if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+      cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+      cost_list[4] != INT_MAX) {
+    const unsigned int whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+                                  (cost_list[2] < cost_list[4] ? 
0 : 2);
+
+    const MV left_mv = { start_mv.row, start_mv.col - hstep };
+    const MV right_mv = { start_mv.row, start_mv.col + hstep };
+    const MV bottom_mv = { start_mv.row + hstep, start_mv.col };
+    const MV top_mv = { start_mv.row - hstep, start_mv.col };
+
+    const MV bottom_left_mv = { start_mv.row + hstep, start_mv.col - hstep };
+    const MV bottom_right_mv = { start_mv.row + hstep, start_mv.col + hstep };
+    const MV top_left_mv = { start_mv.row - hstep, start_mv.col - hstep };
+    const MV top_right_mv = { start_mv.row - hstep, start_mv.col + hstep };
+
+    int dummy = 0;
+
+    switch (whichdir) {
+      case 0:  // bottom left quadrant
+        check_better_fast(&left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+      case 1:  // bottom right quadrant
+        check_better_fast(&right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&bottom_right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+      case 2:  // top left quadrant
+        check_better_fast(&left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_left_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+      case 3:  // top right quadrant
+        check_better_fast(&right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        check_better_fast(&top_right_mv, bestmv, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion, &dummy);
+        break;
+    }
+  } else {
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  // Each subsequent iteration checks at least one point (two, if the
+  // diagonal was selected) in common with the last iteration before moving
+  // to the next, finer precision.
+  if (forced_stop != HALF_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  if (allow_hp && forced_stop == EIGHTH_PEL) {
+    if (check_repeated_mv_and_update(last_mv_search_list, *bestmv, iter)) {
+      return INT_MAX;
+    }
+    iter++;
+
+    hstep >>= 1;
+    start_mv = *bestmv;
+    two_level_checks_fast(start_mv, bestmv, hstep, mv_limits, var_params,
+                          mv_cost_params, &besterr, sse1, distortion,
+                          iters_per_step);
+  }
+
+  return besterr;
+}
+
+int av1_find_best_sub_pixel_tree(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                 const SUBPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                 MV start_mv, MV *bestmv, int *distortion,
+                                 unsigned int *sse1,
+                                 int_mv *last_mv_search_list) {
+  const int allow_hp = ms_params->allow_hp;
+  const int forced_stop = ms_params->forced_stop;
+  const int iters_per_step = ms_params->iters_per_step;
+  const 
MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + const SUBPEL_SEARCH_TYPE subpel_search_type = + ms_params->var_params.subpel_search_type; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + // How many steps to take. A round of 0 means fullpel search only, 1 means + // half-pel, and so on. + const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp); + int hstep = INIT_SUBPEL_STEP_SIZE; // Step size, initialized to 4/8=1/2 pel + + unsigned int besterr = INT_MAX; + + *bestmv = start_mv; + + if (subpel_search_type != USE_2_TAPS_ORIG) { + besterr = upsampled_setup_center_error(xd, cm, bestmv, var_params, + mv_cost_params, sse1, distortion); + } else { + besterr = setup_center_error(xd, bestmv, var_params, mv_cost_params, sse1, + distortion); + } + + for (int iter = 0; iter < round; ++iter) { + MV iter_center_mv = *bestmv; + if (check_repeated_mv_and_update(last_mv_search_list, iter_center_mv, + iter)) { + return INT_MAX; + } + + MV diag_step; + if (subpel_search_type != USE_2_TAPS_ORIG) { + diag_step = first_level_check(xd, cm, iter_center_mv, bestmv, hstep, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + } else { + diag_step = first_level_check_fast(iter_center_mv, bestmv, hstep, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + } + + // Check diagonal sub-pixel position + if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) { + second_level_check_v2(xd, cm, iter_center_mv, diag_step, bestmv, + mv_limits, var_params, mv_cost_params, &besterr, + sse1, distortion); + } + + hstep >>= 1; + } + + return besterr; +} + +// Note(yunqingwang): The following 2 functions are only used in the motion +// vector unit test, which return extreme motion vectors allowed by the MV +// limits. +// Returns the maximum MV. +int av1_return_max_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, MV *bestmv, int *distortion, + unsigned int *sse1, + int_mv *last_mv_search_list) { + (void)xd; + (void)cm; + (void)start_mv; + (void)sse1; + (void)distortion; + (void)last_mv_search_list; + + const int allow_hp = ms_params->allow_hp; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + bestmv->row = mv_limits->row_max; + bestmv->col = mv_limits->col_max; + + unsigned int besterr = 0; + + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp, 0); + return besterr; +} + +// Returns the minimum MV. +int av1_return_min_sub_pixel_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, MV *bestmv, int *distortion, + unsigned int *sse1, + int_mv *last_mv_search_list) { + (void)xd; + (void)cm; + (void)start_mv; + (void)sse1; + (void)distortion; + (void)last_mv_search_list; + + const int allow_hp = ms_params->allow_hp; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + bestmv->row = mv_limits->row_min; + bestmv->col = mv_limits->col_min; + + unsigned int besterr = 0; + // In the sub-pel motion search, if hp is not used, then the last bit of mv + // has to be 0. + lower_mv_precision(bestmv, allow_hp, 0); + return besterr; +} + +// Computes the cost of the current predictor by going through the whole +// av1_enc_build_inter_predictor pipeline. This is mainly used by warped mv +// during motion_mode_rd. 
We are going through the whole +// av1_enc_build_inter_predictor because we might have changed the interpolation +// filter, etc before motion_mode_rd is called. +static INLINE unsigned int compute_motion_cost( + MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, BLOCK_SIZE bsize, + const MV *this_mv) { + unsigned int mse; + unsigned int sse; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + const MSBuffers *ms_buffers = &var_params->ms_buffers; + + const uint8_t *const src = ms_buffers->src->buf; + const int src_stride = ms_buffers->src->stride; + const uint8_t *const dst = xd->plane[0].dst.buf; + const int dst_stride = xd->plane[0].dst.stride; + const aom_variance_fn_ptr_t *vfp = ms_params->var_params.vfp; + + mse = vfp->vf(dst, dst_stride, src, src_stride, &sse); + mse += mv_err_cost_(this_mv, &ms_params->mv_cost_params); + return mse; +} + +// Refines MV in a small range +unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + BLOCK_SIZE bsize, const int *pts0, + const int *pts_inref0, int total_samples) { + MB_MODE_INFO *mbmi = xd->mi[0]; + static const MV neighbors[8] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 }, + { 0, -2 }, { 2, 0 }, { 0, 2 }, { -2, 0 } }; + MV *best_mv = &mbmi->mv[0].as_mv; + + WarpedMotionParams best_wm_params = mbmi->wm_params; + int best_num_proj_ref = mbmi->num_proj_ref; + unsigned int bestmse; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + const int start = ms_params->allow_hp ? 0 : 4; + + // Calculate the center position's error + assert(av1_is_subpelmv_in_range(mv_limits, *best_mv)); + bestmse = compute_motion_cost(xd, cm, ms_params, bsize, best_mv); + + // MV search + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + for (int ite = 0; ite < 2; ++ite) { + int best_idx = -1; + + for (int idx = start; idx < start + 4; ++idx) { + unsigned int thismse; + + MV this_mv = { best_mv->row + neighbors[idx].row, + best_mv->col + neighbors[idx].col }; + if (av1_is_subpelmv_in_range(mv_limits, this_mv)) { + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + if (total_samples > 1) + mbmi->num_proj_ref = + av1_selectSamples(&this_mv, pts, pts_inref, total_samples, bsize); + + if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, + this_mv.row, this_mv.col, &mbmi->wm_params, + mi_row, mi_col)) { + thismse = compute_motion_cost(xd, cm, ms_params, bsize, &this_mv); + + if (thismse < bestmse) { + best_idx = idx; + best_wm_params = mbmi->wm_params; + best_num_proj_ref = mbmi->num_proj_ref; + bestmse = thismse; + } + } + } + } + + if (best_idx == -1) break; + + if (best_idx >= 0) { + best_mv->row += neighbors[best_idx].row; + best_mv->col += neighbors[best_idx].col; + } + } + + mbmi->wm_params = best_wm_params; + mbmi->num_proj_ref = best_num_proj_ref; + return bestmse; +} +// ============================================================================= +// Subpixel Motion Search: OBMC +// ============================================================================= +// Estimates the variance of prediction residue +static INLINE int estimate_obmc_pref_error( + const MV *this_mv, const 
SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const int32_t *src = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); + const int ref_stride = ms_buffers->ref->stride; + + const int subpel_x_q3 = get_subpel_part(this_mv->col); + const int subpel_y_q3 = get_subpel_part(this_mv->row); + + return vfp->osvf(ref, ref_stride, subpel_x_q3, subpel_y_q3, src, mask, sse); +} + +// Calculates the variance of prediction residue +static int upsampled_obmc_pref_error(MACROBLOCKD *xd, const AV1_COMMON *cm, + const MV *this_mv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + unsigned int *sse) { + const aom_variance_fn_ptr_t *vfp = var_params->vfp; + const SUBPEL_SEARCH_TYPE subpel_search_type = var_params->subpel_search_type; + const int w = var_params->w; + const int h = var_params->h; + + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const uint8_t *ref = get_buf_from_mv(ms_buffers->ref, *this_mv); + const int ref_stride = ms_buffers->ref->stride; + + const int subpel_x_q3 = get_subpel_part(this_mv->col); + const int subpel_y_q3 = get_subpel_part(this_mv->row); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + unsigned int besterr; + DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]); +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred8, w, h, + subpel_x_q3, subpel_y_q3, ref, ref_stride, xd->bd, + subpel_search_type); + besterr = vfp->ovf(pred8, w, wsrc, mask, sse); + } else { + aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3, + subpel_y_q3, ref, ref_stride, subpel_search_type); + + besterr = vfp->ovf(pred, w, wsrc, mask, sse); + } +#else + aom_upsampled_pred(xd, cm, mi_row, mi_col, this_mv, pred, w, h, subpel_x_q3, + subpel_y_q3, ref, ref_stride, subpel_search_type); + + besterr = vfp->ovf(pred, w, wsrc, mask, sse); +#endif + return besterr; +} + +static unsigned int setup_obmc_center_error( + const MV *this_mv, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { + // TODO(chiyotsai@google.com): There might be a bug here where we didn't use + // get_buf_from_mv(ref, *this_mv). + const MSBuffers *ms_buffers = &var_params->ms_buffers; + const int32_t *wsrc = ms_buffers->wsrc; + const int32_t *mask = ms_buffers->obmc_mask; + const uint8_t *ref = ms_buffers->ref->buf; + const int ref_stride = ms_buffers->ref->stride; + unsigned int besterr = + var_params->vfp->ovf(ref, ref_stride, wsrc, mask, sse1); + *distortion = besterr; + besterr += mv_err_cost_(this_mv, mv_cost_params); + return besterr; +} + +static unsigned int upsampled_setup_obmc_center_error( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV *this_mv, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *sse1, int *distortion) { + unsigned int besterr = + upsampled_obmc_pref_error(xd, cm, this_mv, var_params, sse1); + *distortion = besterr; + besterr += mv_err_cost_(this_mv, mv_cost_params); + return besterr; +} + +// Estimates the variance of prediction residue +// TODO(chiyotsai@google.com): the cost does does not match the cost in +// mv_cost_. 
Investigate this later. +static INLINE int estimate_obmc_mvcost(const MV *this_mv, + const MV_COST_PARAMS *mv_cost_params) { + const MV *ref_mv = mv_cost_params->ref_mv; + const int *mvjcost = mv_cost_params->mvjcost; + const int *const *mvcost = mv_cost_params->mvcost; + const int error_per_bit = mv_cost_params->error_per_bit; + const MV_COST_TYPE mv_cost_type = mv_cost_params->mv_cost_type; + const MV diff_mv = { GET_MV_SUBPEL(this_mv->row - ref_mv->row), + GET_MV_SUBPEL(this_mv->col - ref_mv->col) }; + + switch (mv_cost_type) { + case MV_COST_ENTROPY: + return (unsigned)((mv_cost(&diff_mv, mvjcost, + CONVERT_TO_CONST_MVCOST(mvcost)) * + error_per_bit + + 4096) >> + 13); + case MV_COST_NONE: return 0; + default: + assert(0 && "L1 norm is not tuned for estimated obmc mvcost"); + return 0; + } +} + +// Estimates whether this_mv is better than best_mv. This function incorporates +// both prediction error and residue into account. +static INLINE unsigned int obmc_check_better_fast( + const MV *this_mv, MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int *has_better_mv) { + unsigned int cost; + if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { + unsigned int sse; + const int thismse = estimate_obmc_pref_error(this_mv, var_params, &sse); + + cost = estimate_obmc_mvcost(this_mv, mv_cost_params); + cost += thismse; + + if (cost < *besterr) { + *besterr = cost; + *best_mv = *this_mv; + *distortion = thismse; + *sse1 = sse; + *has_better_mv |= 1; + } + } else { + cost = INT_MAX; + } + return cost; +} + +// Estimates whether this_mv is better than best_mv. This function incorporates +// both prediction error and residue into account. 
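+// Unlike obmc_check_better_fast() above, this evaluates the upsampled
+// prediction error and uses the exact mv_err_cost_() rate term.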
+static INLINE unsigned int obmc_check_better( + MACROBLOCKD *xd, const AV1_COMMON *cm, const MV *this_mv, MV *best_mv, + const SubpelMvLimits *mv_limits, const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion, int *has_better_mv) { + unsigned int cost; + if (av1_is_subpelmv_in_range(mv_limits, *this_mv)) { + unsigned int sse; + const int thismse = + upsampled_obmc_pref_error(xd, cm, this_mv, var_params, &sse); + cost = mv_err_cost_(this_mv, mv_cost_params); + + cost += thismse; + + if (cost < *besterr) { + *besterr = cost; + *best_mv = *this_mv; + *distortion = thismse; + *sse1 = sse; + *has_better_mv |= 1; + } + } else { + cost = INT_MAX; + } + return cost; +} + +static AOM_FORCE_INLINE MV obmc_first_level_check( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV *best_mv, + const int hstep, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + int dummy = 0; + const MV left_mv = { this_mv.row, this_mv.col - hstep }; + const MV right_mv = { this_mv.row, this_mv.col + hstep }; + const MV top_mv = { this_mv.row - hstep, this_mv.col }; + const MV bottom_mv = { this_mv.row + hstep, this_mv.col }; + + if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { + const unsigned int left = + obmc_check_better(xd, cm, &left_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int right = + obmc_check_better(xd, cm, &right_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int up = + obmc_check_better(xd, cm, &top_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + const unsigned int down = + obmc_check_better(xd, cm, &bottom_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + obmc_check_better(xd, cm, &diag_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + return diag_step; + } else { + const unsigned int left = obmc_check_better_fast( + &left_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, + distortion, &dummy); + const unsigned int right = obmc_check_better_fast( + &right_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, + sse1, distortion, &dummy); + + const unsigned int up = obmc_check_better_fast( + &top_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, sse1, + distortion, &dummy); + + const unsigned int down = obmc_check_better_fast( + &bottom_mv, best_mv, mv_limits, var_params, mv_cost_params, besterr, + sse1, distortion, &dummy); + + const MV diag_step = get_best_diag_step(hstep, left, right, up, down); + const MV diag_mv = { this_mv.row + diag_step.row, + this_mv.col + diag_step.col }; + + // Check the diagonal direction with the best mv + obmc_check_better_fast(&diag_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, &dummy); + + return diag_step; + } +} + +// A newer version of second level check for obmc that gives better quality. 
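+// Mirrors second_level_check_v2() above, substituting the OBMC prediction
+// error and cost helpers.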
+static AOM_FORCE_INLINE void obmc_second_level_check_v2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, const MV this_mv, MV diag_step, + MV *best_mv, const SubpelMvLimits *mv_limits, + const SUBPEL_SEARCH_VAR_PARAMS *var_params, + const MV_COST_PARAMS *mv_cost_params, unsigned int *besterr, + unsigned int *sse1, int *distortion) { + assert(best_mv->row == this_mv.row + diag_step.row || + best_mv->col == this_mv.col + diag_step.col); + if (CHECK_MV_EQUAL(this_mv, *best_mv)) { + return; + } else if (this_mv.row == best_mv->row) { + // Search away from diagonal step since diagonal search did not provide any + // improvement + diag_step.row *= -1; + } else if (this_mv.col == best_mv->col) { + diag_step.col *= -1; + } + + const MV row_bias_mv = { best_mv->row + diag_step.row, best_mv->col }; + const MV col_bias_mv = { best_mv->row, best_mv->col + diag_step.col }; + const MV diag_bias_mv = { best_mv->row + diag_step.row, + best_mv->col + diag_step.col }; + int has_better_mv = 0; + + if (var_params->subpel_search_type != USE_2_TAPS_ORIG) { + obmc_check_better(xd, cm, &row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + obmc_check_better(xd, cm, &col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + obmc_check_better(xd, cm, &diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + } + } else { + obmc_check_better_fast(&row_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + obmc_check_better_fast(&col_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + + // Do an additional search if the second iteration gives a better mv + if (has_better_mv) { + obmc_check_better_fast(&diag_bias_mv, best_mv, mv_limits, var_params, + mv_cost_params, besterr, sse1, distortion, + &has_better_mv); + } + } +} + +int av1_find_best_obmc_sub_pixel_tree_up( + MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, MV start_mv, MV *bestmv, + int *distortion, unsigned int *sse1, int_mv *last_mv_search_list) { + (void)last_mv_search_list; + const int allow_hp = ms_params->allow_hp; + const int forced_stop = ms_params->forced_stop; + const int iters_per_step = ms_params->iters_per_step; + const MV_COST_PARAMS *mv_cost_params = &ms_params->mv_cost_params; + const SUBPEL_SEARCH_VAR_PARAMS *var_params = &ms_params->var_params; + const SUBPEL_SEARCH_TYPE subpel_search_type = + ms_params->var_params.subpel_search_type; + const SubpelMvLimits *mv_limits = &ms_params->mv_limits; + + int hstep = INIT_SUBPEL_STEP_SIZE; + const int round = AOMMIN(FULL_PEL - forced_stop, 3 - !allow_hp); + + unsigned int besterr = INT_MAX; + *bestmv = start_mv; + + if (subpel_search_type != USE_2_TAPS_ORIG) + besterr = upsampled_setup_obmc_center_error( + xd, cm, bestmv, var_params, mv_cost_params, sse1, distortion); + else + besterr = setup_obmc_center_error(bestmv, var_params, mv_cost_params, sse1, + distortion); + + for (int iter = 0; iter < round; ++iter) { + MV iter_center_mv = *bestmv; + MV diag_step = obmc_first_level_check(xd, cm, iter_center_mv, bestmv, hstep, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + + if (!CHECK_MV_EQUAL(iter_center_mv, *bestmv) && iters_per_step > 1) { + obmc_second_level_check_v2(xd, 
cm, iter_center_mv, diag_step, bestmv, + mv_limits, var_params, mv_cost_params, + &besterr, sse1, distortion); + } + hstep >>= 1; + } + + return besterr; +} + +// ============================================================================= +// Public cost function: mv_cost + pred error +// ============================================================================= +int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv, + const MV *ref_mv, const aom_variance_fn_ptr_t *vfp) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const MV mv = get_mv_from_fullmv(best_mv); + const MV_COST_TYPE mv_cost_type = x->mv_cost_type; + unsigned int sse, var; + + var = vfp->vf(what->buf, what->stride, get_buf_from_fullmv(in_what, best_mv), + in_what->stride, &sse); + (void)var; + + return sse + mv_err_cost(&mv, ref_mv, x->nmv_vec_cost, + CONVERT_TO_CONST_MVCOST(x->mv_cost_stack), + x->errorperbit, mv_cost_type); +} + +static INLINE int get_mvpred_av_var(const MV_COST_PARAMS *mv_cost_params, + const FULLPEL_MV best_mv, + const uint8_t *second_pred, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, + const struct buf_2d *pre) { + const struct buf_2d *const what = src; + const struct buf_2d *const in_what = pre; + const MV mv = get_mv_from_fullmv(&best_mv); + unsigned int unused; + + return vfp->svaf(get_buf_from_fullmv(in_what, &best_mv), in_what->stride, 0, + 0, what->buf, what->stride, &unused, second_pred) + + mv_err_cost_(&mv, mv_cost_params); +} + +static INLINE int get_mvpred_mask_var( + const MV_COST_PARAMS *mv_cost_params, const FULLPEL_MV best_mv, + const uint8_t *second_pred, const uint8_t *mask, int mask_stride, + int invert_mask, const aom_variance_fn_ptr_t *vfp, const struct buf_2d *src, + const struct buf_2d *pre) { + const struct buf_2d *const what = src; + const struct buf_2d *const in_what = pre; + const MV mv = get_mv_from_fullmv(&best_mv); + unsigned int unused; + + return vfp->msvf(what->buf, what->stride, 0, 0, + get_buf_from_fullmv(in_what, &best_mv), in_what->stride, + second_pred, mask, mask_stride, invert_mask, &unused) + + mv_err_cost_(&mv, mv_cost_params); +} + +int av1_get_mvpred_compound_var(const MV_COST_PARAMS *mv_cost_params, + const FULLPEL_MV best_mv, + const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, int invert_mask, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, + const struct buf_2d *pre) { + if (mask) { + return get_mvpred_mask_var(mv_cost_params, best_mv, second_pred, mask, + mask_stride, invert_mask, vfp, src, pre); + } else { + return get_mvpred_av_var(mv_cost_params, best_mv, second_pred, vfp, src, + pre); + } +} diff --git a/libs/libaom/src/av1/encoder/mcomp.h b/libs/libaom/src/av1/encoder/mcomp.h new file mode 100644 index 000000000..73135d859 --- /dev/null +++ b/libs/libaom/src/av1/encoder/mcomp.h @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_MCOMP_H_ +#define AOM_AV1_ENCODER_MCOMP_H_ + +#include "av1/common/mv.h" +#include "av1/encoder/block.h" + +#include "aom_dsp/variance.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The maximum number of steps in a step search given the largest +// allowed initial step +#define MAX_MVSEARCH_STEPS 11 +// Max full pel mv specified in the unit of full pixel +// Enable the use of motion vector in range [-1023, 1023]. +#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1) +// Maximum size of the first step in full pel units +#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1)) + +#define SEARCH_RANGE_8P 3 +#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1) +#define SEARCH_GRID_CENTER_8P \ + (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P) + +// motion search site +typedef struct search_site { + FULLPEL_MV mv; + int offset; +} search_site; + +typedef struct search_site_config { + search_site ss[MAX_MVSEARCH_STEPS * 2][16 + 1]; + int ss_count; + int searches_per_step[MAX_MVSEARCH_STEPS * 2]; + int radius[MAX_MVSEARCH_STEPS * 2]; + int stride; +} search_site_config; + +typedef struct { + FULLPEL_MV coord; + int coord_offset; +} search_neighbors; + +struct AV1_COMP; +struct SPEED_FEATURES; + +// ============================================================================= +// Cost functions +// ============================================================================= +typedef struct { + const MV *ref_mv; + FULLPEL_MV full_ref_mv; + const int *mvjcost; + const int *mvcost[2]; + int error_per_bit; + int sad_per_bit; + MV_COST_TYPE mv_cost_type; +} MV_COST_PARAMS; + +int av1_mv_bit_cost(const MV *mv, const MV *ref_mv, const int *mvjcost, + int *mvcost[2], int weight); + +int av1_get_mvpred_sse(const MACROBLOCK *x, const FULLPEL_MV *best_mv, + const MV *ref_mv, const aom_variance_fn_ptr_t *vfp); +int av1_get_mvpred_compound_var(const MV_COST_PARAMS *ms_params, + const FULLPEL_MV best_mv, + const uint8_t *second_pred, const uint8_t *mask, + int mask_stride, int invert_mask, + const aom_variance_fn_ptr_t *vfp, + const struct buf_2d *src, + const struct buf_2d *pre); + +// ============================================================================= +// Motion Search +// ============================================================================= +typedef struct { + // The reference buffer + const struct buf_2d *ref; + + // The source and predictors/mask used by translational search + const struct buf_2d *src; + const uint8_t *second_pred; + const uint8_t *mask; + int mask_stride; + int inv_mask; + + // The weighted source and mask used by OBMC + const int32_t *wsrc; + const int32_t *obmc_mask; +} MSBuffers; + +static INLINE void av1_set_ms_compound_refs(MSBuffers *ms_buffers, + const uint8_t *second_pred, + const uint8_t *mask, + int mask_stride, int invert_mask) { + ms_buffers->second_pred = second_pred; + ms_buffers->mask = mask; + ms_buffers->mask_stride = mask_stride; + ms_buffers->inv_mask = invert_mask; +} + +// ============================================================================= +// Fullpixel Motion Search +// ============================================================================= +enum { + DIAMOND = 0, + NSTEP = 1, + HEX = 2, + BIGDIA = 3, + SQUARE = 4, + FAST_HEX = 5, + FAST_DIAMOND = 6 +} UENUM1BYTE(SEARCH_METHODS); + +// This struct holds fullpixel motion search parameters that should be constant +// during the search +typedef struct { + BLOCK_SIZE bsize; + const aom_variance_fn_ptr_t *vfp; + + MSBuffers 
ms_buffers;
+
+  SEARCH_METHODS search_method;
+  const search_site_config *search_sites;
+  FullMvLimits mv_limits;
+
+  int run_mesh_search;    // Sets mesh search unless it got pruned by
+                          // prune_mesh_search.
+  int prune_mesh_search;  // Disables mesh search if the best_mv after a normal
+                          // search is close to the start_mv.
+  int force_mesh_thresh;  // Forces mesh search if the residue variance is
+                          // higher than the threshold.
+  const struct MESH_PATTERN *mesh_patterns[2];
+
+  int is_intra_mode;
+
+  int fast_obmc_search;
+
+  // For calculating mv cost
+  MV_COST_PARAMS mv_cost_params;
+} FULLPEL_MOTION_SEARCH_PARAMS;
+
+void av1_make_default_fullpel_ms_params(FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                                        const struct AV1_COMP *cpi,
+                                        const MACROBLOCK *x, BLOCK_SIZE bsize,
+                                        const MV *ref_mv,
+                                        const search_site_config *search_sites);
+
+// Sets up configs for fullpixel diamond search
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
+// Sets up configs for firstpass motion search
+void av1_init_motion_fpf(search_site_config *cfg, int stride);
+// Sets up configs for all other types of motion search
+void av1_init3smotion_compensation(search_site_config *cfg, int stride);
+
+// Sets up limit values for MV components.
+// MVs beyond this range do not produce a new/different prediction block.
+static INLINE void av1_set_mv_row_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_row, int mi_height, int border) {
+  const int min1 = -(mi_row * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+  const int min2 = -(((mi_row + mi_height) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+  mv_limits->row_min = AOMMAX(min1, min2);
+  const int max1 = (mi_params->mi_rows - mi_row - mi_height) * MI_SIZE +
+                   border - 2 * AOM_INTERP_EXTEND;
+  const int max2 =
+      (mi_params->mi_rows - mi_row) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+  mv_limits->row_max = AOMMIN(max1, max2);
+}
+
+static INLINE void av1_set_mv_col_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_col, int mi_width, int border) {
+  const int min1 = -(mi_col * MI_SIZE + border - 2 * AOM_INTERP_EXTEND);
+  const int min2 = -(((mi_col + mi_width) * MI_SIZE) + 2 * AOM_INTERP_EXTEND);
+  mv_limits->col_min = AOMMAX(min1, min2);
+  const int max1 = (mi_params->mi_cols - mi_col - mi_width) * MI_SIZE + border -
+                   2 * AOM_INTERP_EXTEND;
+  const int max2 =
+      (mi_params->mi_cols - mi_col) * MI_SIZE + 2 * AOM_INTERP_EXTEND;
+  mv_limits->col_max = AOMMIN(max1, max2);
+}
+
+static INLINE void av1_set_mv_limits(
+    const CommonModeInfoParams *const mi_params, FullMvLimits *mv_limits,
+    int mi_row, int mi_col, int mi_height, int mi_width, int border) {
+  av1_set_mv_row_limits(mi_params, mv_limits, mi_row, mi_height, border);
+  av1_set_mv_col_limits(mi_params, mv_limits, mi_col, mi_width, border);
+}
+
+void av1_set_mv_search_range(FullMvLimits *mv_limits, const MV *mv);
+
+int av1_init_search_range(int size);
+
+unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi,
+                                           MACROBLOCK *x, BLOCK_SIZE bsize,
+                                           int mi_row, int mi_col,
+                                           const MV *ref_mv);
+
+int av1_refining_search_8p_c(const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                             const FULLPEL_MV start_mv, FULLPEL_MV *best_mv);
+
+int av1_full_pixel_search(const FULLPEL_MV start_mv,
+                          const FULLPEL_MOTION_SEARCH_PARAMS *ms_params,
+                          const int step_param, int *cost_list,
+                          FULLPEL_MV *best_mv, FULLPEL_MV *second_best_mv);
+
+int av1_intrabc_hash_search(const struct AV1_COMP *cpi, const MACROBLOCKD *xd,
+                            const
FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + IntraBCHashInfo *intrabc_hash_info, + FULLPEL_MV *best_mv); + +int av1_obmc_full_pixel_search(const FULLPEL_MV start_mv, + const FULLPEL_MOTION_SEARCH_PARAMS *ms_params, + const int step_param, FULLPEL_MV *best_mv); + +static INLINE int av1_is_fullmv_in_range(const FullMvLimits *mv_limits, + FULLPEL_MV mv) { + return (mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) && + (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max); +} +// ============================================================================= +// Subpixel Motion Search +// ============================================================================= +enum { + EIGHTH_PEL, + QUARTER_PEL, + HALF_PEL, + FULL_PEL +} UENUM1BYTE(SUBPEL_FORCE_STOP); + +typedef struct { + const aom_variance_fn_ptr_t *vfp; + SUBPEL_SEARCH_TYPE subpel_search_type; + // Source and reference buffers + MSBuffers ms_buffers; + int w, h; +} SUBPEL_SEARCH_VAR_PARAMS; + +// This struct holds subpixel motion search parameters that should be constant +// during the search +typedef struct { + // High level motion search settings + int allow_hp; + const int *cost_list; + SUBPEL_FORCE_STOP forced_stop; + int iters_per_step; + SubpelMvLimits mv_limits; + + // For calculating mv cost + MV_COST_PARAMS mv_cost_params; + + // Distortion calculation params + SUBPEL_SEARCH_VAR_PARAMS var_params; +} SUBPEL_MOTION_SEARCH_PARAMS; + +void av1_make_default_subpel_ms_params(SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + const struct AV1_COMP *cpi, + const MACROBLOCK *x, BLOCK_SIZE bsize, + const MV *ref_mv, const int *cost_list); + +typedef int(fractional_mv_step_fp)(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + MV start_mv, MV *bestmv, int *distortion, + unsigned int *sse1, + int_mv *last_mv_search_list); + +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more; +extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_evenmore; +extern fractional_mv_step_fp av1_return_max_sub_pixel_mv; +extern fractional_mv_step_fp av1_return_min_sub_pixel_mv; +extern fractional_mv_step_fp av1_find_best_obmc_sub_pixel_tree_up; + +unsigned int av1_refine_warped_mv(MACROBLOCKD *xd, const AV1_COMMON *const cm, + const SUBPEL_MOTION_SEARCH_PARAMS *ms_params, + BLOCK_SIZE bsize, const int *pts0, + const int *pts_inref0, int total_samples); + +static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) { + for (int z = 0; z < 3; z++) { + fractional_best_mv[z].as_int = INVALID_MV; + } +} + +static INLINE void av1_set_subpel_mv_search_range(SubpelMvLimits *subpel_limits, + const FullMvLimits *mv_limits, + const MV *ref_mv) { + const int max_mv = GET_MV_SUBPEL(MAX_FULL_PEL_VAL); + const int minc = + AOMMAX(GET_MV_SUBPEL(mv_limits->col_min), ref_mv->col - max_mv); + const int maxc = + AOMMIN(GET_MV_SUBPEL(mv_limits->col_max), ref_mv->col + max_mv); + const int minr = + AOMMAX(GET_MV_SUBPEL(mv_limits->row_min), ref_mv->row - max_mv); + const int maxr = + AOMMIN(GET_MV_SUBPEL(mv_limits->row_max), ref_mv->row + max_mv); + + subpel_limits->col_min = AOMMAX(MV_LOW + 1, minc); + subpel_limits->col_max = AOMMIN(MV_UPP - 1, maxc); + subpel_limits->row_min = AOMMAX(MV_LOW + 1, minr); + subpel_limits->row_max = AOMMIN(MV_UPP - 1, maxr); +} + +static INLINE int av1_is_subpelmv_in_range(const SubpelMvLimits *mv_limits, + MV mv) { + return 
(mv.col >= mv_limits->col_min) && (mv.col <= mv_limits->col_max) && + (mv.row >= mv_limits->row_min) && (mv.row <= mv_limits->row_max); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MCOMP_H_ diff --git a/libs/libaom/src/av1/encoder/mips/msa/error_msa.c b/libs/libaom/src/av1/encoder/mips/msa/error_msa.c new file mode 100644 index 000000000..2e86dee43 --- /dev/null +++ b/libs/libaom/src/av1/encoder/mips/msa/error_msa.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "aom_dsp/mips/macros_msa.h" + +#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \ + static int64_t block_error_##BSize##size_msa( \ + const int16_t *coeff_ptr, const int16_t *dq_coeff_ptr, int64_t *ssz) { \ + int64_t err = 0; \ + uint32_t loop_cnt; \ + v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \ + v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \ + v2i64 sq_coeff_r, sq_coeff_l; \ + v2i64 err0, err_dup0, err1, err_dup1; \ + \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, sq_coeff_r, \ + sq_coeff_l); \ + DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + \ + for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \ + coeff = LD_SH(coeff_ptr); \ + dq_coeff = LD_SH(dq_coeff_ptr); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff = LD_SH(coeff_ptr + 8); \ + dq_coeff = LD_SH(dq_coeff_ptr + 8); \ + UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \ + ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \ + HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \ + DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \ + DPADD_SD2_SD(diff_r, diff_l, err0, err1); \ + \ + coeff_ptr += 16; \ + dq_coeff_ptr += 16; \ + } \ + \ + err_dup0 = __msa_splati_d(sq_coeff_r, 1); \ + err_dup1 = __msa_splati_d(sq_coeff_l, 1); \ + sq_coeff_r += err_dup0; \ + sq_coeff_l += err_dup1; \ + *ssz = __msa_copy_s_d(sq_coeff_r, 0); \ + *ssz += __msa_copy_s_d(sq_coeff_l, 0); \ + \ + err_dup0 = __msa_splati_d(err0, 1); \ + err_dup1 = __msa_splati_d(err1, 1); \ + err0 += err_dup0; \ + err1 += err_dup1; \ + err = __msa_copy_s_d(err0, 0); \ + err += __msa_copy_s_d(err1, 0); \ + \ + return err; \ + } + +/* clang-format off */ +BLOCK_ERROR_BLOCKSIZE_MSA(16) 
+BLOCK_ERROR_BLOCKSIZE_MSA(64)
+BLOCK_ERROR_BLOCKSIZE_MSA(256)
+BLOCK_ERROR_BLOCKSIZE_MSA(1024)
+/* clang-format on */
+
+int64_t av1_block_error_msa(const tran_low_t *coeff_ptr,
+                            const tran_low_t *dq_coeff_ptr, intptr_t blk_size,
+                            int64_t *ssz) {
+  int64_t err;
+  const int16_t *coeff = (const int16_t *)coeff_ptr;
+  const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr;
+
+  switch (blk_size) {
+    case 16: err = block_error_16size_msa(coeff, dq_coeff, ssz); break;
+    case 64: err = block_error_64size_msa(coeff, dq_coeff, ssz); break;
+    case 256: err = block_error_256size_msa(coeff, dq_coeff, ssz); break;
+    case 1024: err = block_error_1024size_msa(coeff, dq_coeff, ssz); break;
+    default:
+      err = av1_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
+      break;
+  }
+
+  return err;
+}
diff --git a/libs/libaom/src/av1/encoder/mips/msa/fdct4x4_msa.c b/libs/libaom/src/av1/encoder/mips/msa/fdct4x4_msa.c
new file mode 100644
index 000000000..085c08bfb
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/mips/msa/fdct4x4_msa.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+
+void av1_fwht4x4_msa(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  v8i16 in0, in1, in2, in3, in4;
+
+  LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+  in0 += in1;
+  in3 -= in2;
+  in4 = (in0 - in3) >> 1;
+  SUB2(in4, in1, in4, in2, in1, in2);
+  in0 -= in2;
+  in3 += in1;
+
+  TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+
+  in0 += in2;
+  in1 -= in3;
+  in4 = (in0 - in1) >> 1;
+  SUB2(in4, in2, in4, in3, in2, in3);
+  in0 -= in3;
+  in1 += in2;
+
+  SLLI_4V(in0, in1, in2, in3, 2);
+
+  TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
+
+  ST4x2_UB(in0, output, 4);
+  ST4x2_UB(in3, output + 4, 4);
+  ST4x2_UB(in1, output + 8, 4);
+  ST4x2_UB(in2, output + 12, 4);
+}
diff --git a/libs/libaom/src/av1/encoder/mips/msa/temporal_filter_msa.c b/libs/libaom/src/av1/encoder/mips/msa/temporal_filter_msa.c
new file mode 100644
index 000000000..effa75b83
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/mips/msa/temporal_filter_msa.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "config/av1_rtcd.h" + +#include "aom_dsp/mips/macros_msa.h" + +static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride, + uint8_t *frm2_ptr, int32_t filt_sth, + int32_t filt_wgt, uint32_t *acc, + uint16_t *cnt) { + uint32_t row; + uint64_t f0, f1, f2, f3; + v16i8 frm2, frm1 = { 0 }; + v16i8 frm4, frm3 = { 0 }; + v16u8 frm_r, frm_l; + v8i16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 2; row--;) { + LD4(frm1_ptr, stride, f0, f1, f2, f3); + frm1_ptr += (4 * stride); + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 32; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + INSERT_D2_SB(f0, f1, frm1); + INSERT_D2_SB(f2, f3, frm3); + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w, + mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + 
mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+    UNPCK_UB_SH(frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+         mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+  }
+}
+
+static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
+                                             uint8_t *frm2_ptr,
+                                             int32_t filt_sth, int32_t filt_wgt,
+                                             uint32_t *acc, uint16_t *cnt) {
+  uint32_t row;
+  v16i8 frm1, frm2, frm3, frm4;
+  v16u8 frm_r, frm_l;
+  v16i8 zero = { 0 };
+  v8u16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+
+  for (row = 8; row--;) {
+    LD_SB2(frm1_ptr, stride, frm1, frm3);
+    frm1_ptr += stride;
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 16;
+
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+         mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w,
mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w, + mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, + mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + frm1_ptr += stride; + frm2_ptr += 16; + } +} + +// TODO(yunqing) The following optimization is not used since c code changes. +void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, + uint8_t *frame2_ptr, uint32_t blk_w, + uint32_t blk_h, int32_t strength, + int32_t filt_wgt, uint32_t *accu, + uint16_t *cnt) { + if (8 == (blk_w * blk_h)) { + temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength, + filt_wgt, accu, cnt); + } else if (16 == (blk_w * blk_h)) { + temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, strength, + filt_wgt, accu, cnt); + } else { + av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, + strength, filt_wgt, accu, cnt); + } +} diff --git a/libs/libaom/src/av1/encoder/misc_model_weights.h b/libs/libaom/src/av1/encoder/misc_model_weights.h new file mode 100644 index 000000000..f00aeabcf --- /dev/null +++ b/libs/libaom/src/av1/encoder/misc_model_weights.h @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +#define MV_PREC_FEATURE_SIZE 18 + +#define NUM_DNN_LAYERS 1 +#define NUM_DNN_FEATURES MV_PREC_FEATURE_SIZE +#define MV_PREC_LAYER_SIZE_0 32 +#define NUM_LOGITS 1 + +const float av1_mv_prec_mean[MV_PREC_FEATURE_SIZE] = { 143.67358891063745f, + 141.6251917346238f, + 0.36313633945679064f, + 0.0028162791958822085f, + 0.000484820537626698f, + 0.002769969388939025f, + 0.0f, + 0.00031274626720947577f, + 0.00020578555375160075f, + 0.0007075246732697733f, + 0.000539641029909925f, + 0.0013939401375906984f, + 4.985394760423499f, + 4.985394760423499f, + 4.9992148717283085f, + 5.143739822380163f, + 5.518483124004564f, + 87.63597847427077f }; + +const float av1_mv_prec_std[MV_PREC_FEATURE_SIZE] = { 66.86256140247244f, + 68.04472572607503f, + 13.23247674430399f, + 0.0029123438396921955f, + 0.0015331406169374737f, + 0.0029149813096313775f, + 1.0f, + 0.00047501102871357813f, + 0.00030025962993117947f, + 0.0009861163580391207f, + 0.0012157593528004055f, + 0.002004954948490521f, + 6.539447500484038f, + 6.539447500484038f, + 6.396589058279465f, + 3.4870155874262516f, + 3.8911353973740535f, + 112.07985259573601f }; + +const float av1_mv_prec_nn_weights_layer_0[] = { -0.13008492159557145f, + -0.1483527373474774f, + 0.08112076098858864f, + -0.9582568679627453f, + -0.34794757171071206f, + 0.6465225723304947f, + 0.0f, + 0.06754171885839604f, + 0.27156803620541214f, + 0.10635231245664407f, + -0.031183926995968583f, + 0.048122572260291f, + -0.19498534230045128f, + -0.2614116319273316f, + -0.3223762845136331f, + -1.2063368350609205f, + -0.523333556911706f, + 1.075632260890728f, + 0.48989726814387946f, + -0.34816466111070477f, + 0.41668357610256473f, + -1.0973562848791671f, + 0.04183921854389494f, + -0.9123815389260476f, + 0.0f, + 0.859965047744027f, + 0.1962095804679813f, + 0.2606564339077058f, + 0.26695868715184895f, + 0.5319308568326692f, + -0.23717505799723165f, + -0.43127224481782567f, + -0.3214545776203726f, + 0.5850852241402176f, + -0.26705531612587813f, + -0.5786016766610093f, + 0.9360519909983003f, + 0.20771329289016555f, + -0.027614159544811823f, + -1.175022807046164f, + -0.07578967497693835f, + 0.6890172485324256f, + 0.0f, + -0.008008338164988263f, + -0.08064800010158935f, + -0.22606910981666667f, + 0.4541586669210879f, + 0.07731527661370792f, + -0.6744475941247964f, + -0.2625842448396184f, + 1.7018613444303785f, + -0.08622229073162656f, + 0.041858142814941275f, + -0.24575964090386415f, + -0.046626044730994964f, + 0.7608713064175202f, + -0.23330119070907146f, + -0.10115510984500826f, + 0.9722537349192069f, + 0.11718554254290829f, + 0.0f, + 0.2075123446014759f, + 0.09465167310768637f, + 0.7609896851963016f, + 0.4441038581385328f, + 0.26064144727430955f, + -0.14678625366485035f, + -0.03597014452200524f, + 0.3128680867196166f, + 1.102496797385966f, + 0.06642253233084111f, + -1.2665494483407629f, + 0.09049412632000911f, + -1.1160621999565095f, + 0.043420275255913035f, + -0.8811412259978966f, + 0.21076234632287777f, + 0.16571534463543866f, + 0.0f, + -0.7324075176473275f, + -0.3677622514459495f, + 0.3273532243056415f, + 0.22922161936797775f, + 0.8204766691058087f, + 0.02982161033720488f, + 0.5266419954188112f, + -1.0032154963302191f, + 0.7007602969763729f, + 0.37196355167990885f, + -0.7608579453228548f, + 0.08568111584781847f, + 0.07011061059123677f, + 0.3233263598082507f, + -0.08249928295410253f, + 0.08220165761319252f, + 
0.22148722752246794f, + 0.0f, + 0.6122392701743506f, + -0.26429838296378333f, + 0.31958081620005463f, + -0.006027177397853826f, + -0.3088310785887994f, + -0.5436192046707807f, + -0.011080356757423306f, + 0.12632650770008413f, + -0.45097913215234525f, + 1.8008072867127298f, + -0.7630029654575501f, + -0.4054774329826579f, + 0.40386074452544535f, + -0.18541426257453025f, + 0.2444879765079863f, + -0.6216724756115081f, + 0.27030299321302f, + 0.0f, + -0.6835848952967989f, + -0.7914184320964815f, + -0.6761595019582928f, + -1.009565565604081f, + -0.1904242439353305f, + 0.4463417126318631f, + 0.6025503823452971f, + 0.5149990860115566f, + 1.0242970663937634f, + 0.037947306826401385f, + 0.07039339786212848f, + 0.14273796789711987f, + 0.168103961425691f, + 1.6596066376811978f, + 0.19321092229384657f, + -0.3710750388148514f, + -0.01717015559410288f, + 0.0f, + 0.3005688477942597f, + 0.23877080653829577f, + 0.2718594552971173f, + 0.3885402571589898f, + 0.32999531945669247f, + -0.6134460954213243f, + -0.13972265462799183f, + -0.07180089575716991f, + -1.014572598188105f, + 0.0717207322809836f, + 0.34896157745155615f, + -0.27127687591403f, + -0.5058651212773623f, + -1.5442435628306925f, + -0.6399784724734707f, + 0.6274301429074947f, + -0.4645750072767051f, + 0.0f, + -0.2406726815244178f, + -0.06321214115916597f, + 0.312856714253404f, + 0.16459514124116134f, + 0.3993579604809623f, + -0.15232044351561913f, + -0.5613743948568469f, + 0.7219801372223262f, + 0.2936857469624009f, + 0.7823466656034087f, + -0.12416947814098349f, + -0.36413756654028345f, + -0.07992098796866462f, + -0.7395722879842416f, + 0.8639913543220514f, + -0.311931773757945f, + -1.7308240470400613f, + 0.0f, + 0.394499716712104f, + 0.6511462819539963f, + -0.0722425275974144f, + 0.13490818194661386f, + 0.055319135836378035f, + 0.15389577508097013f, + 0.28958598328870605f, + -0.14608429470539772f, + 0.09488817462478298f, + -0.17231294096622088f, + 0.6721115415911466f, + -0.05664621150536103f, + 0.03291799673669331f, + 0.02845382711057482f, + -0.9953563446999164f, + -0.17994298220605923f, + 0.6560824519337476f, + 0.0f, + -0.30990646375917935f, + 0.17215517202874f, + 0.2026816225170481f, + 0.22011958747715601f, + 0.3562520768889686f, + -0.18436559057189175f, + 0.1733377147302066f, + 0.02818276995640877f, + -0.29703005574859076f, + -0.3310652639215064f, + -1.6091173258529277f, + 0.45461585790028003f, + -0.5078643334592593f, + -0.338997374732338f, + 0.4688619590359733f, + 0.627099126828289f, + -0.5249801376494249f, + 0.0f, + 0.34465498218272883f, + 0.009891680630908135f, + -0.27244020967349f, + 0.05404589867626979f, + -0.06220329325739666f, + -0.13365376464759104f, + -0.13098573553512366f, + 0.11434198976289106f, + 0.6740951247574676f, + 1.3381727185724581f, + -1.4865773213251936f, + 0.05809898701966341f, + 0.25380780261023456f, + 1.2716367496512722f, + 0.1768290070780598f, + -0.07554828135356352f, + 0.8180570085344856f, + 0.0f, + 1.0788448980077463f, + 0.0651938742459459f, + 0.3807672030015587f, + 0.6144792680268445f, + 0.011660612214908059f, + -0.018306023765580288f, + 0.44140813809926516f, + -0.13411994195502386f, + 0.15920368955127778f, + -0.19382358417849888f, + -0.08802147969690055f, + -0.019731052733814477f, + 0.1104744229169665f, + -0.195834419735958f, + -0.5005295046454347f, + -0.17041241868229032f, + -0.471942117351489f, + 0.0f, + -0.3599073304761372f, + -0.2745532782968519f, + -0.8323064841106417f, + -0.88355885384943f, + -0.02826466859020679f, + 0.06977870308805256f, + 0.11926112095374196f, + 1.367382707959643f, + -0.06119843162964051f, + 
-0.5331395268889569f, + -1.2155531584240624f, + -0.01896651779524327f, + 0.10591845408571081f, + -0.010632842156504733f, + 0.6150787968629282f, + -0.4191690185896091f, + -0.9961718918346271f, + 0.0f, + 0.23370364516013867f, + 0.4156033072362998f, + 0.1261005546633433f, + 0.0812413884532226f, + -0.008894337353937203f, + 0.07984447025056046f, + -0.1258098052766725f, + -0.40245475467767916f, + 1.78188906675019f, + -1.1544387954232302f, + -0.41768781481273387f, + 0.6791211165341995f, + -0.4175127856183446f, + -0.07353219159767788f, + -0.2888813577574072f, + -0.7107767892597061f, + -1.0450031091195449f, + 0.0f, + -0.9221599545079143f, + -0.6747876356740621f, + 0.30241454354872105f, + 0.4924965303373908f, + -0.14042722740054084f, + 0.27744210409350445f, + -0.14788270997426836f, + -0.9081467469237995f, + -0.04513115674995093f, + -0.5254168669125793f, + -0.6999012037974789f, + 0.434661246306547f, + -0.7193303957246092f, + -0.9117952623409744f, + -1.5097267865916142f, + -0.20779888103770922f, + 0.4935562480901218f, + 0.0f, + 0.18303393908923593f, + 0.34753722677570037f, + 0.29291001533177663f, + 0.3832351878354224f, + 0.3295194956120599f, + -0.32398033003617527f, + -0.31570906736433746f, + 0.23657779050372962f, + 0.9510794465234161f, + -0.5122243902568278f, + 0.08652112725315658f, + 0.2246634353717998f, + -0.9032595595582497f, + -0.8936484034533545f, + 0.6012969720865752f, + -0.6454216646117924f, + -1.1753786049658332f, + 0.0f, + -0.4360545677728656f, + -0.6586237455328507f, + -0.34347301697886656f, + -0.8909724651992144f, + -0.24378721818350263f, + 0.6179733359297576f, + 0.0661661181742234f, + -0.14120142044993794f, + -0.07732699885498932f, + 1.0221355882357506f, + 0.44514798994115284f, + -0.7371569579959046f, + -0.7212499572378936f, + 0.7453626921081045f, + 0.5478757761345768f, + -0.39411232789985384f, + 0.7200542656743857f, + 0.0f, + -0.11790869453118827f, + -0.12317030713581928f, + -0.4207902738133338f, + 0.15895105878327986f, + 0.304261777102111f, + 0.11450744587017621f, + -0.11470709991317944f, + 0.5949222371739038f, + 0.6549518619412444f, + -0.24390606570422838f, + -0.4212796009440803f, + -0.6269666206320964f, + -0.5421193969807078f, + -0.12297772128652287f, + 0.021517257619930424f, + 0.25462855095544523f, + -0.22107798187348246f, + 0.0f, + 0.5204516300095662f, + 0.2837402841862462f, + 0.11310823283285916f, + 0.8944351685018025f, + 0.17487203235834015f, + -0.5271221928634433f, + -0.19516594503423199f, + 0.452456617580365f, + 1.2456272242706414f, + 0.24166615894862817f, + 0.09411429305204502f, + -0.2730072283327243f, + -0.8129383770918172f, + -0.24093254193486136f, + 0.5696499174142177f, + -0.11110805836073044f, + -0.3968204166235694f, + 0.0f, + -0.04388165369378549f, + -0.005631266017272595f, + -0.02574211858479705f, + 0.06230399626660669f, + 0.17677671232932785f, + 0.5172871274400965f, + 0.4919150085620063f, + -1.597656637582941f, + 0.02415185715719143f, + -0.17945446376668306f, + -0.39340600199798886f, + 0.25013205256886845f, + 0.05972330340308685f, + 0.1359911505596489f, + -0.02341033271820833f, + 0.15726074644063684f, + 0.47512625913020357f, + 0.0f, + 0.7327341664835779f, + -0.3689092312320013f, + 0.4571824787436036f, + 0.6215465537945456f, + 0.0944111296842023f, + -0.12571956176607574f, + -0.2507235674395462f, + -0.09579602654351593f, + 1.4463357293728496f, + 0.749153535856049f, + -0.5553955120807588f, + -0.09622771929369946f, + -0.2598697420394813f, + -0.964691815299676f, + -0.8289963178173902f, + 0.7112949291983329f, + -0.8667009730492162f, + 0.0f, + -0.48698304169042794f, + 
-0.18786095669893707f, + -0.11425249263203247f, + -0.3693391011684809f, + 0.09933145842585253f, + 0.2568559685298844f, + 0.7048512233651738f, + 0.6056238412407038f, + -0.4355558119826642f, + 0.17318931883915484f, + 0.6481333496429564f, + -0.45728823054344486f, + -0.006325004538589701f, + 0.45609864075494927f, + -0.6199385981116988f, + 0.035105808783046165f, + 0.1203147963894839f, + 0.0f, + 0.383402190836527f, + 0.048429009055370106f, + 0.5887186439275204f, + -0.20538767641607814f, + -0.031237879611002117f, + 0.3140759860883231f, + 0.24447070584999556f, + 0.7271263905705878f, + 0.8432799162434237f, + -0.11530577554199217f, + -0.7781023892314718f, + 0.05359488822710336f, + 0.5624870388700809f, + 0.5134656523208906f, + 0.18304041423438375f, + -0.04237421156328257f, + -0.20759809886942207f, + 0.0f, + -0.06249337454975615f, + 0.10081284533873777f, + 0.3894374350259183f, + 1.518217777528342f, + -0.9100037950171563f, + 0.17796906121831477f, + -0.2892167255357892f, + 0.6117902467884032f, + 0.13332120964959573f, + -0.3487155932849374f, + -0.32920583745734694f, + 0.08242631209809854f, + -0.24920225708110588f, + 0.8401757259392635f, + 0.11729108681358365f, + 0.11222925752499184f, + -0.027078490721459958f, + 0.0f, + 0.726132375517389f, + 0.72220359881096f, + 0.5721582611845177f, + 0.15139162075524315f, + 0.6676549461551197f, + -0.321449586554697f, + -0.10141104515219895f, + -0.09711123988777906f, + 0.9623356184776928f, + -0.7941822373167173f, + -0.9373923554119346f, + 0.4573241832354059f, + -0.42029139056126147f, + 0.2675223459380999f, + -0.5487300191551386f, + 0.2236621891916084f, + 0.11692039230044018f, + 0.0f, + 0.1758399202780961f, + 0.676447587678781f, + 0.5945412815881029f, + 0.5669863357359594f, + 0.8433565415303922f, + -0.30300550790708036f, + -0.43332881999693673f, + -0.4996522695731392f, + -0.2084930815451962f, + 0.27765278702463786f, + 1.0886848763946915f, + -0.0739433655813831f, + -0.4762801579229192f, + -0.2490825339320731f, + -1.8820479350439439f, + -0.4251592225775914f, + -0.3992922365484464f, + 0.0f, + 0.19598917760218867f, + 0.4860238022746914f, + 0.3364528828641281f, + 0.3350950865226741f, + 0.2773654548632006f, + -0.30547262140782566f, + 0.028649620490728344f, + -0.11763407628280315f, + 0.6237318502627169f, + -0.3958952632477945f, + 0.14797171297835243f, + 0.45821729624747465f, + -0.8687137170773626f, + 0.06989667196937126f, + -0.5752606929478727f, + 0.16986945686358412f, + 0.6925071596817824f, + 0.0f, + 0.4991250796183003f, + 0.03424654896322111f, + 0.6153698611882319f, + 0.5070872444849457f, + 0.43615747516328135f, + -0.7870352838659244f, + -0.6424101231965247f, + -0.7005774876651399f, + 0.79983115431488f, + 0.15720357955596242f, + -1.408372612176309f, + -0.039294695217213765f, + 0.6979415372962309f, + 0.27403316751965656f, + 1.2844596102619275f, + -0.2781534150257364f, + 0.3248437714908865f, + 0.0f, + 0.4364362371752831f, + -0.2548580911485434f, + -0.19578001373349452f, + -0.04597194387828005f, + -0.010035156855533233f, + 0.0415941475251266f, + 0.07929549739797387f, + -0.060629652912508866f, + 0.5977303008711333f, + -1.4404008068066554f, + 0.8555694790197376f, + -0.03693438534401856f, + 0.17761411164512408f, + -0.11858304304109235f, + -1.4241324353471327f, + 0.1533849765389186f, + 0.7650643783126995f, + 0.0f, + -0.0639949379280401f, + 0.4288617817939563f, + 0.4235508646885404f, + 0.3419843254383798f, + -0.015992360660098768f, + -0.773247697505441f, + -0.4908452922015917f, + 0.9868134897291486f, + -0.5078689994742608f, + 1.05632043744864f, + -0.38867419409275117f, + 
-0.0065547696858664194f,
+                                                 -0.3056003173415037f,
+                                                 -0.333762331930102f,
+                                                 0.4459671174011671f,
+                                                 0.08219092584580244f,
+                                                 -0.08099158579518179f,
+                                                 0.0f,
+                                                 -0.1568180656346373f,
+                                                 -0.061962372393910135f,
+                                                 0.14065868174859464f,
+                                                 -0.055925712798972765f,
+                                                 0.05136117465820622f,
+                                                 0.0907831030477633f,
+                                                 0.19518110495319604f,
+                                                 -0.7470794578145956f,
+                                                 1.5945999734733545f,
+                                                 -0.4351697502345834f,
+                                                 -0.33253649399571805f };
+
+const float av1_mv_prec_nn_bias_layer_0[] = {
+  -0.651213833993862f, -1.1243309933417809f, -0.2123880023097051f,
+  0.23095477452877616f, -0.6668057665893545f, 0.3082268148379634f,
+  -0.3344916753975844f, -0.20920185606857844f, 0.6057933917964854f,
+  0.5031857662559803f, -1.5380096313468152f, -0.4457245344804041f,
+  1.82368055812373f, 0.7973912064077963f, 0.25706500555622913f,
+  0.1394695119825382f, 0.4508811973450553f, -0.5408959545111782f,
+  1.064829233697863f, 0.3733268644246235f, 1.1173169029905483f,
+  -0.2012817466400134f, -0.16628447748302294f, 1.3086000088940826f,
+  0.7267092979664235f, -0.9097857006590555f, -0.7564259343863077f,
+  -0.49844128036716173f, -0.4675729246975423f, -0.03626154526362181f,
+  -0.41957330902404616f, -0.9658160514319954f
+};
+
+const float av1_mv_prec_nn_weights_layer_1[] = {
+  1.5017296484510276f, 1.044216918060133f, -1.066541411740906f,
+  -0.7762965171172661f, -0.9814396609661653f, 0.9334065847340715f,
+  0.7117244268817873f, -0.7695942296628597f, 0.7892157680137047f,
+  -0.5786309358654476f, -2.4444494892027264f, 1.1666759262637185f,
+  -0.9699580532370483f, 0.5849682956422552f, -1.0372272986941953f,
+  -0.5005014627824439f, 1.1816204711740521f, -1.2204867615892114f,
+  0.4510263977504913f, 0.35567865078585165f, -0.7811389330738839f,
+  -0.6643977800301099f, -0.6283287371705794f, 0.790873821018048f,
+  0.8861643352684585f, 0.6438840651522237f, 0.6677191546466089f,
+  0.9703715021995785f, 1.250893534236489f, 0.7733742028067933f,
+  -1.249673977776904f, -1.2890127265725608f
+};
+
+const float av1_mv_prec_nn_bias_layer_1[] = { -0.341771735378258f };
+
+static const NN_CONFIG av1_mv_prec_dnn_config = {
+  NUM_DNN_FEATURES,
+  NUM_LOGITS,
+  NUM_DNN_LAYERS,
+  { MV_PREC_LAYER_SIZE_0 },
+  {
+    av1_mv_prec_nn_weights_layer_0,
+    av1_mv_prec_nn_weights_layer_1,
+  },
+  {
+    av1_mv_prec_nn_bias_layer_0,
+    av1_mv_prec_nn_bias_layer_1,
+  },
+};
+#undef NUM_DNN_LAYERS
+#undef NUM_DNN_FEATURES
+#undef MV_PREC_LAYER_SIZE_0
+#undef NUM_LOGITS
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_MISC_MODEL_WEIGHTS_H_
diff --git a/libs/libaom/src/av1/encoder/ml.c b/libs/libaom/src/av1/encoder/ml.c
new file mode 100644
index 000000000..57228ec91
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/ml.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/encoder/ml.h"
+
+void av1_nn_output_prec_reduce(float *const output, int num_output) {
+  const int prec_bits = 11;
+  const int prec = 1 << prec_bits;
+  const float inv_prec = (float)(1.0 / prec);
+  for (int i = 0; i < num_output; i++) {
+    output[i] = ((int)(output[i] * prec + 0.5)) * inv_prec;
+  }
+}
+
+// Calculate prediction based on the given input features and neural net
+// config. Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each
+// hidden layer.
+void av1_nn_predict_c(const float *input_nodes,
+                      const NN_CONFIG *const nn_config, int reduce_prec,
+                      float *const output) {
+  int num_input_nodes = nn_config->num_inputs;
+  int buf_index = 0;
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+
+  // Propagate hidden layers.
+  const int num_layers = nn_config->num_hidden_layers;
+  assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+  for (int layer = 0; layer < num_layers; ++layer) {
+    const float *layer_weights = nn_config->weights[layer];
+    const float *layer_bias = nn_config->bias[layer];
+    float *output_nodes = buf[buf_index];
+    const int num_output_nodes = nn_config->num_hidden_nodes[layer];
+    assert(num_output_nodes < NN_MAX_NODES_PER_LAYER);
+    for (int node = 0; node < num_output_nodes; ++node) {
+      float val = layer_bias[node];
+      for (int i = 0; i < num_input_nodes; ++i)
+        val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
+      // ReLU as activation function.
+      val = val > 0.0f ? val : 0.0f;  // Could use AOMMAX().
+      output_nodes[node] = val;
+    }
+    num_input_nodes = num_output_nodes;
+    input_nodes = output_nodes;
+    buf_index = 1 - buf_index;
+  }
+
+  // Final output layer.
+  const float *layer_weights = nn_config->weights[num_layers];
+  const float *layer_bias = nn_config->bias[num_layers];
+  for (int node = 0; node < nn_config->num_outputs; ++node) {
+    float val = layer_bias[node];
+    for (int i = 0; i < num_input_nodes; ++i)
+      val += layer_weights[node * num_input_nodes + i] * input_nodes[i];
+    output[node] = val;
+  }
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
+}
+
+#if CONFIG_NN_V2
+// Applies the ReLU activation to one fc layer:
+// output[i] = Max(input[i], 0.0f)
+static float *nn_relu(const float *input, FC_LAYER *layer) {
+  for (int i = 0; i < layer->num_outputs; ++i) {
+    layer->output[i] = AOMMAX(input[i], 0.0f);
+  }
+
+  return layer->output;
+}
+
+// Applies the sigmoid activation to one fc layer:
+// output[i] = 1/(1+exp(-input[i]))
+static float *nn_sigmoid(const float *input, FC_LAYER *layer) {
+  for (int i = 0; i < layer->num_outputs; ++i) {
+    const float tmp = AOMMIN(AOMMAX(input[i], -10.0f), 10.0f);
+    layer->output[i] = 1.0f / (1.0f + expf(-tmp));
+  }
+
+  return layer->output;
+}
+
+// Forward prediction in one fc layer, used in function av1_nn_predict_v2
+static float *nn_fc_forward(const float *input, FC_LAYER *layer) {
+  const float *weights = layer->weights;
+  const float *bias = layer->bias;
+  assert(layer->num_outputs < NN_MAX_NODES_PER_LAYER);
+  // fc
+  for (int node = 0; node < layer->num_outputs; ++node) {
+    float val = bias[node];
+    for (int i = 0; i < layer->num_inputs; ++i) val += weights[i] * input[i];
+    layer->output[node] = val;
+    weights += layer->num_inputs;
+  }
+
+  // activation
+  switch (layer->activation) {
+    case NONE: return layer->output;
+    case RELU: return nn_relu(layer->output, layer);
+    case SIGMOID: return nn_sigmoid(layer->output, layer);
+    case SOFTSIGN:
+      assert(0 && "Softsign is not supported in NN.");  // TODO
+      return NULL;
+    default:
+      assert(0 && "Unknown activation");
+      return NULL;
+  }
+}
+
+void av1_nn_predict_v2(const float *feature, NN_CONFIG_V2 *nn_config,
+                       int reduce_prec, float *output) {
+  const float *input_nodes = feature;
+
+  // Propagate the layers.
+  const int num_layers = nn_config->num_hidden_layers;
+  assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+  for (int i = 0; i < num_layers; ++i) {
+    input_nodes = nn_fc_forward(input_nodes, nn_config->layer + i);
+    assert(nn_config->layer[i + 1].num_inputs ==
+           nn_config->layer[i].num_outputs);
+  }
+
+  // Final layer
+  input_nodes = nn_fc_forward(input_nodes, nn_config->layer + num_layers);
+  assert(nn_config->layer[num_layers].num_outputs == nn_config->num_logits);
+  // Copy the final layer output
+  memcpy(output, input_nodes, sizeof(*input_nodes) * nn_config->num_logits);
+  if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_logits);
+}
+#endif  // CONFIG_NN_V2
+
+void av1_nn_softmax(const float *input, float *output, int n) {
+  // Softmax function is invariant to adding the same constant
+  // to all input values, so we subtract the maximum input to avoid
+  // possible overflow.
+  float max_inp = input[0];
+  for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]);
+  float sum_out = 0.0f;
+  for (int i = 0; i < n; i++) {
+    // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
+    const float normalized_input = AOMMAX(input[i] - max_inp, -10.0f);
+    output[i] = (float)exp(normalized_input);
+    sum_out += output[i];
+  }
+  for (int i = 0; i < n; i++) output[i] /= sum_out;
+}
diff --git a/libs/libaom/src/av1/encoder/ml.h b/libs/libaom/src/av1/encoder/ml.h
new file mode 100644
index 000000000..62d543d6b
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/ml.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ML_H_
+#define AOM_AV1_ENCODER_ML_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/av1_rtcd.h"
+
+#define NN_MAX_HIDDEN_LAYERS 10
+#define NN_MAX_NODES_PER_LAYER 128
+
+struct NN_CONFIG {
+  int num_inputs;         // Number of input nodes, i.e. features.
+  int num_outputs;        // Number of output nodes.
+  int num_hidden_layers;  // Number of hidden layers, maximum 10.
+  // Number of nodes for each hidden layer.
+  int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS];
+  // Weight parameters, indexed by layer.
+  const float *weights[NN_MAX_HIDDEN_LAYERS + 1];
+  // Bias parameters, indexed by layer.
+  const float *bias[NN_MAX_HIDDEN_LAYERS + 1];
+};
+// Typedef from struct NN_CONFIG to NN_CONFIG is in rtcd_defs
+
+#if CONFIG_NN_V2
+// Fully-connected layer configuration
+struct FC_LAYER {
+  const int num_inputs;   // Number of input nodes, i.e. features.
+  const int num_outputs;  // Number of output nodes.
+
+  float *weights;               // Weight parameters.
+  float *bias;                  // Bias parameters.
+  const ACTIVATION activation;  // Activation function.
+
+  float *output;  // The output array.
+  float *dY;      // Gradient of outputs.
+  float *dW;      // Gradient of weights.
+  float *db;      // Gradient of bias.
+};
+
+// NN configuration structure, version 2
+struct NN_CONFIG_V2 {
+  const int num_hidden_layers;  // Number of hidden layers, max = 10.
+  FC_LAYER layer[NN_MAX_HIDDEN_LAYERS + 1];  // The layer array
+  const int num_logits;  // Number of output nodes.
+  float *logits;         // Raw prediction (same as output of final layer)
+  const LOSS loss;       // Loss function
+};
+
+// Calculate prediction based on the given input features and neural net
+// config. Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each
+// hidden layer.
+void av1_nn_predict_v2(const float *features, NN_CONFIG_V2 *nn_config,
+                       int reduce_prec, float *output);
+#endif  // CONFIG_NN_V2
+
+// Applies the softmax normalization function to the input
+// to get a valid probability distribution in the output:
+// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
+void av1_nn_softmax(const float *input, float *output, int n);
+
+// Applies a precision reduction to the output of av1_nn_predict to prevent
+// mismatches between C and SIMD implementations.
+void av1_nn_output_prec_reduce(float *const output, int num_output);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_ML_H_
diff --git a/libs/libaom/src/av1/encoder/mode_prune_model_weights.h b/libs/libaom/src/av1/encoder/mode_prune_model_weights.h
new file mode 100644
index 000000000..98ec36808
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/mode_prune_model_weights.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define NUM_HIDDEN_LAYERS_12 1 +#define NUM_FEATURES_12 6 +#define NUM_LAYER_0_UNITS_12 24 +#define NUM_LOGITS_12 2 + +static const float av1_intrap_hiddenlayer_0_kernel_12[] = { + 7.28372f, -1.3333898f, -1.3180022f, -0.007156151f, -0.40799126f, + -0.57538104f, -31.81647f, 6.7057495f, 6.351472f, -0.029544508f, + 0.026801195f, 1.12863f, -0.70769817f, -0.24183524f, 0.0649113f, + -0.7189517f, 0.21791299f, 0.12840256f, -0.56424767f, 0.16924907f, + 0.4605501f, -0.170895f, -0.60358995f, -0.15383226f, -4.0523643f, + 0.6961917f, 1.3100256f, -0.4189354f, 0.37264112f, -0.14555685f, + 10.628014f, 8.184437f, 8.941916f, -0.011731001f, -0.45127156f, + 0.42704004f, 36.84277f, 8.988796f, 8.844238f, 0.00030091056f, + -0.022038324f, 1.3566176f, -8.863219f, -0.84811693f, -1.0908632f, + 0.00023130262f, -1.0698471f, -6.755927f, 7.1711984f, 4.7216063f, + 3.5099216f, -0.6650184f, 0.5935173f, -0.6696286f, 11.8595295f, + 0.3001874f, 0.29822728f, 0.04319222f, -1.203178f, 1.1210147f, + 0.035045594f, -0.20559944f, -0.015388541f, -0.7857941f, -0.94100875f, + -0.1278549f, -19.22603f, 7.9466896f, 6.5048656f, -0.22195444f, + 0.19061874f, 1.3927288f, -8.896529f, -0.48146892f, -1.6098932f, + -0.0030235797f, -0.6533787f, -2.1333003f, -22.256454f, -4.934058f, + -4.4707212f, -0.015831878f, -0.4243649f, -2.776269f, -0.23762038f, + 0.1820098f, -0.51865315f, -1.1893421f, 0.34969202f, 0.10636194f, + 14.545696f, 1.3849198f, 2.6815193f, -0.5145498f, 0.45948258f, + -0.8842355f, -0.9111363f, -0.39652422f, 0.077266276f, -0.68084997f, + 0.4593515f, -0.28872707f, -6.936231f, 1.12253f, 1.7616503f, + -0.014069137f, -0.0052156276f, -4.5095444f, 6.2076726f, -0.058755957f, + -0.4675936f, -0.13039507f, 0.12094394f, -0.07285393f, 68.26125f, + 7.4893136f, 8.770954f, 0.020274093f, -0.027877754f, 1.6579602f, + -0.1825479f, 0.34832543f, 0.07472531f, -0.44812247f, -1.0941806f, + -0.16749863f, 1.1394324f, 0.47983396f, -0.99983627f, -0.00064249727f, + -1.3345739f, -0.057157427f, -18.14875f, 16.506035f, 15.539248f, + 0.013191509f, -0.021674965f, -25.006235f, 0.51220596f, 0.7334426f, + 0.81836903f, -1.0443225f, 0.4459505f, -1.2045046f +}; + +static const float av1_intrap_hiddenlayer_0_bias_12[] = { + -4.154915f, 14.33833f, 0.0f, 0.0f, 2.0440118f, 12.40922f, + -16.77514f, 0.5879813f, 3.2305415f, 0.8303539f, 0.0f, 14.488708f, + 2.94393f, 1.874383f, 0.0f, -0.53140444f, 0.0f, 1.8456234f, + -0.55427986f, -19.856262f, 0.0f, 0.17281002f, 48.31631f, 0.0f +}; + +static const float av1_intrap_logits_kernel_12[] = { + 0.26843873f, -0.09576241f, 0.34427166f, 0.09914787f, -0.10275399f, + 0.02999484f, -0.1467772f, 0.11594324f, 0.29200763f, 0.0067976206f, + 0.050393578f, -0.018694371f, 0.3333476f, 0.2127221f, 0.35128218f, + 0.19968672f, 0.08099991f, 0.084850654f, -0.16045967f, 0.30286232f, + 0.6164765f, -0.27140254f, 0.08210814f, 0.34852806f, 0.25028184f, + -0.12188078f, 0.16310331f, 0.31253803f, -0.10792341f, 0.065858394f, + -0.1349708f, 0.08948815f, 0.31905392f, 0.03680656f, -0.05040944f, + -0.051539157f, 0.3211852f, 0.2137136f, 0.45037416f, 0.22748767f, + -0.10978614f, 0.06475646f, -0.16954158f, 0.32831904f, 0.16479677f, + -0.30020145f, 0.066221856f, 0.37213042f +}; + +static const float av1_intrap_logits_bias_12[] = { 0.95783f, -0.95823103f }; + +static const NN_CONFIG av1_intrap_nn_config = { + NUM_FEATURES_12, + NUM_LOGITS_12, + NUM_HIDDEN_LAYERS_12, + { + NUM_LAYER_0_UNITS_12, + }, + { + 
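+      // Layer order matches NN_CONFIG::weights[]: index 0 holds the
+      // hidden-layer kernel, index 1 the output (logits) kernel; the bias
+      // arrays below follow the same layout.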
av1_intrap_hiddenlayer_0_kernel_12, + av1_intrap_logits_kernel_12, + }, + { + av1_intrap_hiddenlayer_0_bias_12, + av1_intrap_logits_bias_12, + }, +}; + +#undef NUM_HIDDEN_LAYERS_12 +#undef NUM_FEATURES_12 +#undef NUM_LAYER_0_UNITS_12 +#undef NUM_LOGITS_12 + +#define NUM_HIDDEN_LAYERS_15 1 +#define NUM_FEATURES_15 6 +#define NUM_LAYER_0_UNITS_15 24 +#define NUM_LOGITS_15 2 + +static const float av1_intraph_hiddenlayer_0_kernel_15[] = { + -0.77480125f, 0.3219551f, -0.015702145f, -0.5310235f, 0.5254026f, + -1.1522819f, 2.682016f, 0.08001052f, -0.2539285f, 0.04711023f, + -0.81296307f, 0.2675382f, 0.1952474f, -0.0664705f, 1.2989824f, + -0.3150117f, -0.8022715f, 0.045423955f, -27.584324f, -2.5608704f, + -3.2280366f, 0.05272543f, -0.47141576f, -0.07644298f, -53.77942f, + -22.393923f, -23.027853f, -0.00015186476f, -0.010696465f, 2.7064638f, + -22.776028f, 11.514891f, 11.138167f, -0.001243723f, -0.4802433f, + -8.758646f, 0.26398206f, -0.23485385f, 0.27586034f, -0.004954741f, + -0.4935232f, -0.017607696f, 69.56049f, -1.1756641f, -0.052366666f, + -0.38052833f, 0.32474658f, 0.04634263f, 0.8583235f, -0.528438f, + -0.7868907f, -0.4757781f, 0.4620985f, -0.70621157f, 231.40195f, + 6.805205f, 9.420295f, 0.02585775f, -0.03480937f, 1.3577378f, + 0.1758226f, 15.056758f, 14.437874f, -0.1305005f, 0.115103304f, + 0.21297209f, 55.821743f, -6.611156f, -6.8552365f, -0.011928095f, + -0.2042175f, 1.2557873f, -1.0722278f, -0.2683614f, 0.48318478f, + -0.73739994f, 0.54055226f, -0.03224738f, -0.06767959f, -0.21015017f, + 0.29171246f, -0.6937296f, -1.2342545f, -0.41278538f, -37.9365f, + 17.68424f, 16.263042f, -0.074828684f, 0.06607806f, -0.16763286f, + 13.594707f, 0.6152676f, -0.4371223f, -0.8365592f, 0.8273623f, + -1.2126317f, 0.1216157f, -1.3002136f, -0.18856938f, -0.2589358f, + -0.76897144f, 0.21777137f, -122.25033f, -0.23490006f, -3.1238277f, + -0.13916978f, 0.08576391f, -1.7391548f, -116.24812f, 14.906071f, + 13.468357f, 0.02332889f, -0.034617376f, -18.506111f, 0.7500542f, + -1.1882535f, 0.40848416f, -0.28434393f, -0.71471655f, -0.29188696f, + -0.46588746f, -0.17324813f, -0.62460244f, -1.1801276f, 0.28993344f, + -0.22072886f, 129.2688f, -0.33782578f, -0.34836572f, -0.034112718f, + -0.023666814f, -0.5865087f, -33.484146f, 1.1431375f, 0.56056374f, + -0.0049730353f, -0.24347587f, -1.3003352f, 0.88973033f, 0.8499571f, + -0.5678484f, -0.39009875f, -0.062105156f, -0.13965102f +}; + +static const float av1_intraph_hiddenlayer_0_bias_15[] = { + 0.0f, -0.2926711f, 0.0f, -1.0303509f, -27.459345f, 12.412848f, + 0.0f, -2.5971522f, -0.02733541f, -19.881912f, 14.391992f, -8.249469f, + 0.0f, 0.0f, 13.676118f, -0.6472994f, -0.07189449f, 1.1986839f, + 52.479107f, 0.0f, 0.0f, -3.0187025f, 1.4435643f, 0.0f +}; + +static const float av1_intraph_logits_kernel_15[] = { + 0.05390722f, -0.06859513f, 0.036842898f, 0.190772f, 0.13623567f, + 0.09321194f, 0.2314745f, -0.13958375f, -0.3058229f, -0.0104543045f, + 0.11336068f, -0.276115f, 0.00470723f, -0.49123898f, -0.15988174f, + 0.087681435f, 0.022517204f, 0.073877744f, 0.2968856f, -0.1401399f, + -0.38788354f, -0.26005393f, -0.39564916f, -0.16195515f, 0.2680102f, + -0.032179773f, -0.35758728f, 0.25819537f, 0.11468631f, 0.13573235f, + -0.2672175f, 0.016490124f, 0.048118807f, 0.020319486f, 0.07892215f, + -0.21821865f, 0.08434734f, 0.3129456f, -0.18215221f, 0.08884877f, + -0.35621428f, 0.11405768f, 0.27370325f, 0.14956686f, 0.01604587f, + -0.18334487f, -0.42385718f, -0.08033409f +}; + +static const float av1_intraph_logits_bias_15[] = { 0.83619016f, -0.8340626f }; + +static const NN_CONFIG 
av1_intrap_hd_nn_config = { + NUM_FEATURES_15, + NUM_LOGITS_15, + NUM_HIDDEN_LAYERS_15, + { + NUM_LAYER_0_UNITS_15, + }, + { + av1_intraph_hiddenlayer_0_kernel_15, + av1_intraph_logits_kernel_15, + }, + { + av1_intraph_hiddenlayer_0_bias_15, + av1_intraph_logits_bias_15, + }, +}; + +#undef NUM_HIDDEN_LAYERS_15 +#undef NUM_FEATURES_15 +#undef NUM_LAYER_0_UNITS_15 +#undef NUM_LOGITS_15 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MODE_PRUNE_MODEL_WEIGHTS_H_ diff --git a/libs/libaom/src/av1/encoder/model_rd.h b/libs/libaom/src/av1/encoder/model_rd.h new file mode 100644 index 000000000..c353c8f85 --- /dev/null +++ b/libs/libaom/src/av1/encoder/model_rd.h @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MODEL_RD_H_ +#define AOM_AV1_ENCODER_MODEL_RD_H_ + +#include "aom/aom_integer.h" +#include "av1/encoder/block.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/pustats.h" +#include "av1/encoder/rdopt_utils.h" +#include "aom_ports/system_state.h" +#include "config/aom_dsp_rtcd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// 0: Legacy model +// 1: Curve fit model +// 2: Surface fit model +// 3: DNN regression model +// 4: Full rd model +#define MODELRD_TYPE_INTERP_FILTER 1 +#define MODELRD_TYPE_TX_SEARCH_PRUNE 1 +#define MODELRD_TYPE_MASKED_COMPOUND 1 +#define MODELRD_TYPE_INTERINTRA 1 +#define MODELRD_TYPE_INTRA 1 +#define MODELRD_TYPE_MOTION_MODE_RD 1 + +typedef void (*model_rd_for_sb_type)(const AV1_COMP *const cpi, + BLOCK_SIZE bsize, MACROBLOCK *x, + MACROBLOCKD *xd, int plane_from, + int plane_to, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, + int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist); +typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + BLOCK_SIZE plane_bsize, int plane, + int64_t sse, int num_samples, int *rate, + int64_t *dist); + +static int64_t calculate_sse(MACROBLOCKD *const xd, + const struct macroblock_plane *p, + struct macroblockd_plane *pd, const int bw, + const int bh) { + int64_t sse = 0; + const int shift = xd->bd - 8; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + bw, bh); + } else { + sse = + aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); + } +#else + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); +#endif + sse = ROUND_POWER_OF_TWO(sse, shift * 2); + return sse; +} + +static AOM_INLINE int64_t compute_sse_plane(MACROBLOCK *x, MACROBLOCKD *xd, + int plane, const BLOCK_SIZE bsize) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, + &bh); + + int64_t sse = calculate_sse(xd, p, pd, bw, bh); + + 
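+  // calculate_sse() already rescaled the result back to 8-bit precision via
+  // ROUND_POWER_OF_TWO(sse, shift * 2).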
+  return sse;
+}
+
+static AOM_INLINE void model_rd_from_sse(const AV1_COMP *const cpi,
+                                         const MACROBLOCK *const x,
+                                         BLOCK_SIZE plane_bsize, int plane,
+                                         int64_t sse, int num_samples,
+                                         int *rate, int64_t *dist) {
+  (void)num_samples;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+
+  // Fast approximation of the modeling function.
+  if (cpi->sf.rd_sf.simple_model_rd_from_var) {
+    const int64_t square_error = sse;
+    int quantizer = p->dequant_QTX[1] >> dequant_shift;
+    if (quantizer < 120)
+      *rate = (int)AOMMIN(
+          (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT),
+          INT_MAX);
+    else
+      *rate = 0;
+    assert(*rate >= 0);
+    *dist = (square_error * quantizer) >> 8;
+  } else {
+    av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize],
+                                 p->dequant_QTX[1] >> dequant_shift, rate,
+                                 dist);
+  }
+  *dist <<= 4;
+}
+
+// Fits curves for rate and distortion using log2(sse_norm / qstep^2) as the
+// feature.
+static AOM_INLINE void model_rd_with_curvfit(const AV1_COMP *const cpi,
+                                             const MACROBLOCK *const x,
+                                             BLOCK_SIZE plane_bsize, int plane,
+                                             int64_t sse, int num_samples,
+                                             int *rate, int64_t *dist) {
+  (void)cpi;
+  (void)plane_bsize;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int qstep = AOMMAX(p->dequant_QTX[1] >> dequant_shift, 1);
+
+  if (sse == 0) {
+    if (rate) *rate = 0;
+    if (dist) *dist = 0;
+    return;
+  }
+  aom_clear_system_state();
+  const double sse_norm = (double)sse / num_samples;
+  const double qstepsqr = (double)qstep * qstep;
+  const double xqr = log2(sse_norm / qstepsqr);
+  double rate_f, dist_by_sse_norm_f;
+  av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f,
+                       &dist_by_sse_norm_f);
+
+  const double dist_f = dist_by_sse_norm_f * sse_norm;
+  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+  int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+  aom_clear_system_state();
+
+  // Check if skip is better
+  if (rate_i == 0) {
+    dist_i = sse << 4;
+  } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+             RDCOST(x->rdmult, 0, sse << 4)) {
+    rate_i = 0;
+    dist_i = sse << 4;
+  }
+
+  if (rate) *rate = rate_i;
+  if (dist) *dist = dist_i;
+}
+
+static AOM_INLINE void model_rd_for_sb(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum,
+    int *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate,
+    int64_t *plane_sse, int64_t *plane_dist) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
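+  // That division is folded into model_rd_from_sse() via
+  // p->dequant_QTX[1] >> dequant_shift, where dequant_shift is 3 (i.e. /8)
+  // at 8-bit depth.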
+ int plane; + const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + assert(bsize < BLOCK_SIZES_ALL); + + for (plane = plane_from; plane <= plane_to; ++plane) { + if (plane && !xd->is_chroma_ref) break; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + assert(plane_bsize < BLOCK_SIZES_ALL); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + int64_t sse; + int rate; + int64_t dist; + + sse = calculate_sse(xd, p, pd, bw, bh); + + model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + assert(rate_sum >= 0); + } + + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + rate_sum = AOMMIN(rate_sum, INT_MAX); + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +static AOM_INLINE void model_rd_for_sb_with_curvfit( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, int *plane_rate, + int64_t *plane_sse, int64_t *plane_dist) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
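+  // As in model_rd_for_sb(), the divide-by-8 is folded into the dequant
+  // shift that model_rd_with_curvfit() applies when it derives qstep.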
+ const int ref = xd->mi[0]->ref_frame[0]; + + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + if (plane && !xd->is_chroma_ref) break; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + int64_t dist, sse; + int rate; + int bw, bh; + const struct macroblock_plane *const p = &x->plane[plane]; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, + &bw, &bh); + + sse = calculate_sse(xd, p, pd, bw, bh); + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; + } + + if (skip_txfm_sb) *skip_txfm_sb = rate_sum == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; +} + +enum { MODELRD_LEGACY, MODELRD_CURVFIT, MODELRD_TYPES } UENUM1BYTE(ModelRdType); + +static const model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = { + model_rd_for_sb, model_rd_for_sb_with_curvfit +}; + +static const model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = { + model_rd_from_sse, model_rd_with_curvfit +}; + +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_AV1_ENCODER_MODEL_RD_H_ diff --git a/libs/libaom/src/av1/encoder/motion_search_facade.c b/libs/libaom/src/av1/encoder/motion_search_facade.c new file mode 100644 index 000000000..8db1423e7 --- /dev/null +++ b/libs/libaom/src/av1/encoder/motion_search_facade.c @@ -0,0 +1,861 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "aom_ports/system_state.h" + +#include "av1/common/reconinter.h" + +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/mcomp.h" +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/partition_strategy.h" +#include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/tpl_model.h" + +#define RIGHT_SHIFT_MV(x) (((x) + 3 + ((x) >= 0)) >> 3) + +typedef struct { + FULLPEL_MV fmv; + int weight; +} cand_mv_t; + +static int compare_weight(const void *a, const void *b) { + const int diff = ((cand_mv_t *)a)->weight - ((cand_mv_t *)b)->weight; + if (diff < 0) + return 1; + else if (diff > 0) + return -1; + return 0; +} + +void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int ref_idx, int *rate_mv, + int search_range, inter_mode_info *mode_info, + int_mv *best_mv) { + MACROBLOCKD *xd = &x->e_mbd; + const AV1_COMMON *cm = &cpi->common; + const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params; + const int num_planes = av1_num_planes(cm); + MB_MODE_INFO *mbmi = xd->mi[0]; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; + int bestsme = INT_MAX; + const int ref = mbmi->ref_frame[ref_idx]; + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + if (scaled_ref_frame) { + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // full-pixel motion search code to be used without additional + // modifications. + for (int i = 0; i < num_planes; i++) { + backup_yv12[i] = xd->plane[i].pre[ref_idx]; + } + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + // Work out the size of the first step in the mv step search. + // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc. + int step_param; + if (cpi->sf.mv_sf.auto_mv_step_size && cm->show_frame) { + // Take the weighted average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = (av1_init_search_range(x->max_mv_context[ref]) + + mv_search_params->mv_step_param) / + 2; + } else { + step_param = mv_search_params->mv_step_param; + } + + if (cpi->sf.mv_sf.adaptive_motion_search && bsize < cm->seq_params.sb_size) { + int boffset = + 2 * (mi_size_wide_log2[cm->seq_params.sb_size] - + AOMMIN(mi_size_high_log2[bsize], mi_size_wide_log2[bsize])); + step_param = AOMMAX(step_param, boffset); + } + + if (cpi->sf.mv_sf.adaptive_motion_search) { + int bwl = mi_size_wide_log2[bsize]; + int bhl = mi_size_high_log2[bsize]; + int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4); + + if (tlevel < 5) { + step_param += 2; + step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 1); + } + + // prev_mv_sad is not setup for dynamically scaled frames. + if (cpi->oxcf.resize_mode != RESIZE_RANDOM) { + int i; + for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { + if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { + x->pred_mv[ref].row = 0; + x->pred_mv[ref].col = 0; + best_mv->as_int = INVALID_MV; + + if (scaled_ref_frame) { + // Swap back the original buffers before returning. 
+          for (int j = 0; j < num_planes; ++j)
+            xd->plane[j].pre[ref_idx] = backup_yv12[j];
+        }
+        return;
+      }
+    }
+  }
+}
+
+  const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
+  FULLPEL_MV start_mv;
+  if (mbmi->motion_mode != SIMPLE_TRANSLATION)
+    start_mv = get_fullmv_from_mv(&mbmi->mv[0].as_mv);
+  else
+    start_mv = get_fullmv_from_mv(&ref_mv);
+
+  // cand stores start_mv and all possible MVs in a SB.
+  cand_mv_t cand[MAX_MC_FLOW_BLK_IN_SB * MAX_MC_FLOW_BLK_IN_SB + 1] = {
+    { { 0, 0 }, 0 }
+  };
+  cand[0].fmv = start_mv;
+  int cnt = 1;
+  int total_weight = 0;
+
+  if (!cpi->sf.mv_sf.full_pixel_search_level &&
+      mbmi->motion_mode == SIMPLE_TRANSLATION) {
+    if (x->valid_cost_b) {
+      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+      const int tplw = mi_size_wide[tpl_bsize];
+      const int tplh = mi_size_high[tpl_bsize];
+      const int nw = mi_size_wide[bsize] / tplw;
+      const int nh = mi_size_high[bsize] / tplh;
+
+      if (nw >= 1 && nh >= 1) {
+        const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size];
+        const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size];
+        const int start = of_h / tplh * x->cost_stride + of_w / tplw;
+        int valid = 1;
+
+        // Assign large weight to start_mv, so it is always tested.
+        cand[0].weight = nw * nh;
+
+        for (int k = 0; k < nh; k++) {
+          for (int l = 0; l < nw; l++) {
+            const int_mv mv =
+                x->mv_b[start + k * x->cost_stride + l][ref - LAST_FRAME];
+            if (mv.as_int == INVALID_MV) {
+              valid = 0;
+              break;
+            }
+
+            const FULLPEL_MV fmv = { GET_MV_RAWPEL(mv.as_mv.row),
+                                     GET_MV_RAWPEL(mv.as_mv.col) };
+            int unique = 1;
+            for (int m = 0; m < cnt; m++) {
+              if (RIGHT_SHIFT_MV(fmv.row) == RIGHT_SHIFT_MV(cand[m].fmv.row) &&
+                  RIGHT_SHIFT_MV(fmv.col) == RIGHT_SHIFT_MV(cand[m].fmv.col)) {
+                unique = 0;
+                cand[m].weight++;
+                break;
+              }
+            }
+
+            if (unique) {
+              cand[cnt].fmv = fmv;
+              cand[cnt].weight = 1;
+              cnt++;
+            }
+          }
+          if (!valid) break;
+        }
+
+        if (valid) {
+          total_weight = 2 * nh * nw;
+          if (cnt > 2) qsort(cand, cnt, sizeof(cand[0]), &compare_weight);
+        }
+      }
+    }
+  }
+
+  // Further reduce the search range.
+  if (search_range < INT_MAX) {
+    const search_site_config *ss_cfg = &mv_search_params->ss_cfg[SS_CFG_SRC];
+    // Max step_param is ss_cfg->ss_count.
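+    // A larger step_param starts the search at a smaller radius, so raising
+    // it below shrinks the effective search range.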
+    if (search_range < 1) {
+      step_param = ss_cfg->ss_count;
+    } else {
+      while (ss_cfg->radius[ss_cfg->ss_count - step_param - 1] >
+                 (search_range << 1) &&
+             ss_cfg->ss_count - step_param - 1 > 0)
+        step_param++;
+    }
+  }
+
+  int cost_list[5];
+  int_mv second_best_mv;
+  best_mv->as_int = second_best_mv.as_int = INVALID_MV;
+
+  const search_site_config *src_search_sites =
+      &mv_search_params->ss_cfg[SS_CFG_SRC];
+  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv,
+                                     src_search_sites);
+
+  switch (mbmi->motion_mode) {
+    case SIMPLE_TRANSLATION: {
+      int sum_weight = 0;
+
+      for (int m = 0; m < cnt; m++) {
+        FULLPEL_MV smv = cand[m].fmv;
+        FULLPEL_MV this_best_mv, this_second_best_mv;
+
+        int thissme = av1_full_pixel_search(
+            smv, &full_ms_params, step_param, cond_cost_list(cpi, cost_list),
+            &this_best_mv, &this_second_best_mv);
+
+        if (thissme < bestsme) {
+          bestsme = thissme;
+          best_mv->as_fullmv = this_best_mv;
+          second_best_mv.as_fullmv = this_second_best_mv;
+        }
+
+        sum_weight += cand[m].weight;
+        if (m >= 2 || 4 * sum_weight > 3 * total_weight) break;
+      }
+    } break;
+    case OBMC_CAUSAL:
+      bestsme = av1_obmc_full_pixel_search(start_mv, &full_ms_params,
+                                           step_param, &best_mv->as_fullmv);
+      break;
+    default: assert(0 && "Invalid motion mode!\n");
+  }
+
+  if (scaled_ref_frame) {
+    // Swap back the original buffers for subpel motion search.
+    for (int i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[ref_idx] = backup_yv12[i];
+    }
+  }
+
+  // Terminate search with the current ref_idx if we have already encountered
+  // another ref_mv in the drl such that:
+  //  1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION
+  //     search process as the current fullpel_mv.
+  //  2. The rate needed to encode the current fullpel_mv is larger than that
+  //     for the other ref_mv.
+  if (cpi->sf.inter_sf.skip_repeated_full_newmv &&
+      mbmi->motion_mode == SIMPLE_TRANSLATION &&
+      best_mv->as_int != INVALID_MV) {
+    int_mv this_mv;
+    this_mv.as_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+    const int ref_mv_idx = mbmi->ref_mv_idx;
+    const int this_mv_rate =
+        av1_mv_bit_cost(&this_mv.as_mv, &ref_mv, x->nmv_vec_cost,
+                        x->mv_cost_stack, MV_COST_WEIGHT);
+    mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int;
+    mode_info[ref_mv_idx].full_mv_rate = this_mv_rate;
+
+    for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
+      // Check if the motion search result is the same as a previous result.
+      if (this_mv.as_int == mode_info[prev_ref_idx].full_search_mv.as_int) {
+        // Compare the rate cost
+        const int prev_rate_cost = mode_info[prev_ref_idx].full_mv_rate +
+                                   mode_info[prev_ref_idx].drl_cost;
+        const int this_rate_cost =
+            this_mv_rate + mode_info[ref_mv_idx].drl_cost;
+
+        if (prev_rate_cost <= this_rate_cost) {
+          // If the current rate_cost is no better than the previous
+          // rate_cost, terminate the search. Since av1_single_motion_search
+          // is only called by handle_new_mv in SIMPLE_TRANSLATION mode, we
+          // set the best_mv to INVALID mv to signal that we wish to terminate
+          // search for the current mode.
+ best_mv->as_int = INVALID_MV; + return; + } + } + } + } + + if (cpi->common.features.cur_frame_force_integer_mv) { + convert_fullmv_to_mv(best_mv); + } + + const int use_fractional_mv = + bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0; + if (use_fractional_mv) { + int_mv fractional_ms_list[3]; + av1_set_fractional_mv(fractional_ms_list); + int dis; /* TODO: use dis in distortion calculation later. */ + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv); + + switch (mbmi->motion_mode) { + case SIMPLE_TRANSLATION: + if (cpi->sf.mv_sf.use_accurate_subpel_search) { + const int try_second = second_best_mv.as_int != INVALID_MV && + second_best_mv.as_int != best_mv->as_int; + const int best_mv_var = mv_search_params->find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &dis, + &x->pred_sse[ref], fractional_ms_list); + + if (try_second) { + MV this_best_mv; + subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv); + if (av1_is_subpelmv_in_range(&ms_params.mv_limits, + subpel_start_mv)) { + const int this_var = mv_search_params->find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis, + &x->pred_sse[ref], fractional_ms_list); + if (this_var < best_mv_var) best_mv->as_mv = this_best_mv; + } + } + } else { + mv_search_params->find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &dis, + &x->pred_sse[ref], NULL); + } + break; + case OBMC_CAUSAL: + av1_find_best_obmc_sub_pixel_tree_up(xd, cm, &ms_params, + subpel_start_mv, &best_mv->as_mv, + &dis, &x->pred_sse[ref], NULL); + break; + default: assert(0 && "Invalid motion mode!\n"); + } + } + *rate_mv = av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + + if (cpi->sf.mv_sf.adaptive_motion_search && + mbmi->motion_mode == SIMPLE_TRANSLATION) + x->pred_mv[ref] = best_mv->as_mv; +} + +void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, + int *rate_mv) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + const int plane = 0; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + // This function should only ever be called for compound modes + assert(has_second_ref(mbmi)); + const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] }; + const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] }; + int_mv ref_mv[2]; + int ite, ref; + + // Get the prediction block from the 'other' reference frame. + const int_interpfilters interp_filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + InterPredParams inter_pred_params; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + // Do joint motion search in compound mode to get more accurate mv. + struct buf_2d backup_yv12[2][MAX_MB_PLANE]; + int last_besterr[2] = { INT_MAX, INT_MAX }; + const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = { + av1_get_scaled_ref_frame(cpi, refs[0]), + av1_get_scaled_ref_frame(cpi, refs[1]) + }; + + // Prediction buffer from second frame. 
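+  // Sized in bytes for the worst case (16-bit samples); get_buf_by_bd()
+  // returns a correctly typed view for the current bit depth.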
+ DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]); + uint8_t *second_pred = get_buf_by_bd(xd, second_pred16); + int_mv best_mv; + + // Allow joint search multiple times iteratively for each reference frame + // and break out of the search loop if it couldn't find a better mv. + for (ite = 0; ite < 4; ite++) { + struct buf_2d ref_yv12[2]; + int bestsme = INT_MAX; + int id = ite % 2; // Even iterations search in the first reference frame, + // odd iterations search in the second. The predictor + // found for the 'other' reference frame is factored in. + if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) { + if (cur_mv[id].as_int == init_mv[id].as_int) { + break; + } else { + int_mv cur_int_mv, init_int_mv; + cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3; + cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3; + init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3; + init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3; + if (cur_int_mv.as_int == init_int_mv.as_int) { + break; + } + } + } + for (ref = 0; ref < 2; ++ref) { + ref_mv[ref] = av1_get_ref_mv(x, ref); + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + if (scaled_ref_frame[ref]) { + int i; + for (i = 0; i < num_planes; i++) + backup_yv12[ref][i] = xd->plane[i].pre[ref]; + av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, + NULL, num_planes); + } + } + + assert(IMPLIES(scaled_ref_frame[0] != NULL, + cm->width == scaled_ref_frame[0]->y_crop_width && + cm->height == scaled_ref_frame[0]->y_crop_height)); + assert(IMPLIES(scaled_ref_frame[1] != NULL, + cm->width == scaled_ref_frame[1]->y_crop_width && + cm->height == scaled_ref_frame[1]->y_crop_height)); + + // Initialize based on (possibly scaled) prediction buffers. + ref_yv12[0] = xd->plane[plane].pre[0]; + ref_yv12[1] = xd->plane[plane].pre[1]; + + av1_init_inter_params(&inter_pred_params, pw, ph, mi_row * MI_SIZE, + mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, + &cm->sf_identity, &ref_yv12[!id], interp_filters); + inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd); + + // Since we have scaled the reference frames to match the size of the + // current frame we must use a unit scaling factor during mode selection. + av1_enc_build_one_inter_predictor(second_pred, pw, &cur_mv[!id].as_mv, + &inter_pred_params); + + const int order_idx = id != 0; + av1_dist_wtd_comp_weight_assign( + cm, mbmi, order_idx, &xd->jcp_param.fwd_offset, + &xd->jcp_param.bck_offset, &xd->jcp_param.use_dist_wtd_comp_avg, 1); + + // Do full-pixel compound motion search on the current reference frame. + if (id) xd->plane[plane].pre[0] = ref_yv12[id]; + + // Make motion search params + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, + &ref_mv[id].as_mv, NULL); + av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask, + mask_stride, id); + + // Use the mv result from the single mode as mv predictor. + const FULLPEL_MV start_fullmv = get_fullmv_from_mv(&cur_mv[id].as_mv); + + // Small-range full-pixel motion search. 
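+    // The start MV comes from the single-prediction search above, so a
+    // small refinement neighborhood is normally sufficient here.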
+ bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv, + &best_mv.as_fullmv); + + if (bestsme < INT_MAX) { + bestsme = av1_get_mvpred_compound_var( + &full_ms_params.mv_cost_params, best_mv.as_fullmv, second_pred, mask, + mask_stride, id, &cpi->fn_ptr[bsize], &x->plane[0].src, + &ref_yv12[id]); + } + + // Restore the pointer to the first (possibly scaled) prediction buffer. + if (id) xd->plane[plane].pre[0] = ref_yv12[0]; + + for (ref = 0; ref < 2; ++ref) { + if (scaled_ref_frame[ref]) { + // Swap back the original buffers for subpel motion search. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[ref] = backup_yv12[ref][i]; + } + // Re-initialize based on unscaled prediction buffers. + ref_yv12[ref] = xd->plane[plane].pre[ref]; + } + } + + // Do sub-pixel compound motion search on the current reference frame. + if (id) xd->plane[plane].pre[0] = ref_yv12[id]; + + if (cpi->common.features.cur_frame_force_integer_mv) { + convert_fullmv_to_mv(&best_mv); + } + if (bestsme < INT_MAX && + cpi->common.features.cur_frame_force_integer_mv == 0) { + int dis; /* TODO: use dis in distortion calculation later. */ + unsigned int sse; + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, + &ref_mv[id].as_mv, NULL); + av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred, + mask, mask_stride, id); + ms_params.forced_stop = EIGHTH_PEL; + MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + bestsme = cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, &sse, NULL); + } + + // Restore the pointer to the first prediction buffer. + if (id) xd->plane[plane].pre[0] = ref_yv12[0]; + if (bestsme < last_besterr[id]) { + cur_mv[id] = best_mv; + last_besterr[id] = bestsme; + } else { + break; + } + } + + *rate_mv = 0; + + for (ref = 0; ref < 2; ++ref) { + const int_mv curr_ref_mv = av1_get_ref_mv(x, ref); + *rate_mv += + av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + } +} + +// Search for the best mv for one component of a compound, +// given that the other component is fixed. +void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *this_mv, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + int *rate_mv, int ref_idx) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int ref = mbmi->ref_frame[ref_idx]; + const int_mv ref_mv = av1_get_ref_mv(x, ref_idx); + struct macroblockd_plane *const pd = &xd->plane[0]; + + struct buf_2d backup_yv12[MAX_MB_PLANE]; + const YV12_BUFFER_CONFIG *const scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + + // Check that this is either an interinter or an interintra block + assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi))); + + // Store the first prediction buffer. + struct buf_2d orig_yv12; + struct buf_2d ref_yv12 = pd->pre[ref_idx]; + if (ref_idx) { + orig_yv12 = pd->pre[0]; + pd->pre[0] = pd->pre[ref_idx]; + } + + if (scaled_ref_frame) { + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // full-pixel motion search code to be used without additional + // modifications. 
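+    // Save the unscaled plane buffers so they can be restored after the
+    // search.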
+ for (int i = 0; i < num_planes; i++) { + backup_yv12[i] = xd->plane[i].pre[ref_idx]; + } + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + int bestsme = INT_MAX; + int_mv best_mv; + + // Make motion search params + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, + &ref_mv.as_mv, NULL); + av1_set_ms_compound_refs(&full_ms_params.ms_buffers, second_pred, mask, + mask_stride, ref_idx); + + // Use the mv result from the single mode as mv predictor. + const FULLPEL_MV start_fullmv = get_fullmv_from_mv(this_mv); + + // Small-range full-pixel motion search. + bestsme = av1_refining_search_8p_c(&full_ms_params, start_fullmv, + &best_mv.as_fullmv); + + if (bestsme < INT_MAX) { + bestsme = av1_get_mvpred_compound_var( + &full_ms_params.mv_cost_params, best_mv.as_fullmv, second_pred, mask, + mask_stride, ref_idx, &cpi->fn_ptr[bsize], &x->plane[0].src, &ref_yv12); + } + + if (scaled_ref_frame) { + // Swap back the original buffers for subpel motion search. + for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[ref_idx] = backup_yv12[i]; + } + } + + if (cpi->common.features.cur_frame_force_integer_mv) { + convert_fullmv_to_mv(&best_mv); + } + const int use_fractional_mv = + bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0; + if (use_fractional_mv) { + int dis; /* TODO: use dis in distortion calculation later. */ + unsigned int sse; + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv.as_mv, + NULL); + av1_set_ms_compound_refs(&ms_params.var_params.ms_buffers, second_pred, + mask, mask_stride, ref_idx); + ms_params.forced_stop = EIGHTH_PEL; + MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + bestsme = cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, &sse, NULL); + } + + // Restore the pointer to the first unscaled prediction buffer. 
+ if (ref_idx) pd->pre[0] = orig_yv12; + + if (bestsme < INT_MAX) *this_mv = best_mv.as_mv; + + *rate_mv = 0; + + *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); +} + +static AOM_INLINE void build_second_inter_pred(const AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, + const MV *other_mv, int ref_idx, + uint8_t *second_pred) { + const AV1_COMMON *const cm = &cpi->common; + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x); + const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y); + + // This function should only ever be called for compound modes + assert(has_second_ref(mbmi)); + + const int plane = 0; + struct buf_2d ref_yv12 = xd->plane[plane].pre[!ref_idx]; + + struct scale_factors sf; + av1_setup_scale_factors_for_frame(&sf, ref_yv12.width, ref_yv12.height, + cm->width, cm->height); + + InterPredParams inter_pred_params; + + av1_init_inter_params(&inter_pred_params, pw, ph, p_row, p_col, + pd->subsampling_x, pd->subsampling_y, xd->bd, + is_cur_buf_hbd(xd), 0, &sf, &ref_yv12, + mbmi->interp_filters); + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + + // Get the prediction block from the 'other' reference frame. + av1_enc_build_one_inter_predictor(second_pred, pw, other_mv, + &inter_pred_params); + + av1_dist_wtd_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset, + &xd->jcp_param.bck_offset, + &xd->jcp_param.use_dist_wtd_comp_avg, 1); +} + +// Wrapper for av1_compound_single_motion_search, for the common case +// where the second prediction is also an inter mode. +void av1_compound_single_motion_search_interinter( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx) { + MACROBLOCKD *xd = &x->e_mbd; + // This function should only ever be called for compound modes + assert(has_second_ref(xd->mi[0])); + + // Prediction buffer from second frame. 
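+  // Allocated as uint16_t for the high-bitdepth case; CONVERT_TO_BYTEPTR()
+  // below exposes it as the byte pointer the prediction code expects.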
+ DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); + uint8_t *second_pred; + if (is_cur_buf_hbd(xd)) + second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); + else + second_pred = (uint8_t *)second_pred_alloc_16; + + MV *this_mv = &cur_mv[ref_idx].as_mv; + const MV *other_mv = &cur_mv[!ref_idx].as_mv; + build_second_inter_pred(cpi, x, bsize, other_mv, ref_idx, second_pred); + av1_compound_single_motion_search(cpi, x, bsize, this_mv, second_pred, mask, + mask_stride, rate_mv, ref_idx); +} + +static AOM_INLINE void do_masked_motion_search_indexed( + const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize, + int_mv *tmp_mv, int *rate_mv, int which) { + // NOTE: which values: 0 - 0 only, 1 - 1 only, 2 - both + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + BLOCK_SIZE sb_type = mbmi->sb_type; + const uint8_t *mask; + const int mask_stride = block_size_wide[bsize]; + + mask = av1_get_compound_type_mask(comp_data, sb_type); + + tmp_mv[0].as_int = cur_mv[0].as_int; + tmp_mv[1].as_int = cur_mv[1].as_int; + if (which == 0 || which == 1) { + av1_compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mask, + mask_stride, rate_mv, which); + } else if (which == 2) { + av1_joint_motion_search(cpi, x, bsize, tmp_mv, mask, mask_stride, rate_mv); + } +} + +int av1_interinter_compound_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *x, + const int_mv *const cur_mv, + const BLOCK_SIZE bsize, + const PREDICTION_MODE this_mode) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int_mv tmp_mv[2]; + int tmp_rate_mv = 0; + mbmi->interinter_comp.seg_mask = xd->seg_mask; + const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp; + + if (this_mode == NEW_NEWMV) { + do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, + tmp_mv, &tmp_rate_mv, 2); + mbmi->mv[0].as_int = tmp_mv[0].as_int; + mbmi->mv[1].as_int = tmp_mv[1].as_int; + } else if (this_mode >= NEAREST_NEWMV && this_mode <= NEW_NEARMV) { + // which = 1 if this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV + // which = 0 if this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV + int which = (NEWMV == compound_ref1_mode(this_mode)); + do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, + tmp_mv, &tmp_rate_mv, which); + mbmi->mv[which].as_int = tmp_mv[which].as_int; + } + return tmp_rate_mv; +} + +int_mv av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, + int mi_col, BLOCK_SIZE bsize, int ref, + FULLPEL_MV start_mv, int num_planes, + int use_subpixel) { + assert(num_planes == 1 && + "Currently simple_motion_search only supports luma plane"); + assert(!frame_is_intra_only(&cpi->common) && + "Simple motion search only enabled for non-key frames"); + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + + set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize); + + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->sb_type = bsize; + mbmi->ref_frame[0] = ref; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + struct buf_2d backup_yv12; + // ref_mv is used to calculate the cost of the motion vector + const MV ref_mv = kZeroMv; + const int step_param = 
cpi->mv_search_params.mv_step_param; + const search_site_config *src_search_sites = + &cpi->mv_search_params.ss_cfg[SS_CFG_SRC]; + int cost_list[5]; + const int ref_idx = 0; + int var; + int_mv best_mv; + + av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col, + get_ref_scale_factors(cm, ref), num_planes); + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + if (scaled_ref_frame) { + backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx]; + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, &ref_mv, + src_search_sites); + + var = av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), + &best_mv.as_fullmv, NULL); + + const int use_subpel_search = + var < INT_MAX && !cpi->common.features.cur_frame_force_integer_mv && + use_subpixel; + if (scaled_ref_frame) { + xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12; + } + if (use_subpel_search) { + int not_used = 0; + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + // TODO(yunqing): integrate this into av1_make_default_subpel_ms_params(). + ms_params.forced_stop = cpi->sf.mv_sf.simple_motion_subpel_force_stop; + + MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + + cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &best_mv.as_mv, ¬_used, + &x->pred_sse[ref], NULL); + } else { + // Manually convert from units of pixel to 1/8-pixels if we are not doing + // subpel search + convert_fullmv_to_mv(&best_mv); + } + + mbmi->mv[0] = best_mv; + + // Get a copy of the prediction output + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + + aom_clear_system_state(); + + if (scaled_ref_frame) { + xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12; + } + + return best_mv; +} + +int_mv av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, BLOCK_SIZE bsize, + const FULLPEL_MV start_mv, int use_subpixel, + unsigned int *sse, unsigned int *var) { + MACROBLOCKD *xd = &x->e_mbd; + const MV_REFERENCE_FRAME ref = + cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME; + + int_mv best_mv = av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, + start_mv, 1, use_subpixel); + + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const uint8_t *dst = xd->plane[0].dst.buf; + const int dst_stride = xd->plane[0].dst.stride; + + *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse); + + return best_mv; +} diff --git a/libs/libaom/src/av1/encoder/motion_search_facade.h b/libs/libaom/src/av1/encoder/motion_search_facade.h new file mode 100644 index 000000000..3b86e9376 --- /dev/null +++ b/libs/libaom/src/av1/encoder/motion_search_facade.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_MOTION_SEARCH_H_ +#define AOM_AV1_ENCODER_MOTION_SEARCH_H_ + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + int64_t rd; + int drl_cost; + + int rate_mv; + int_mv mv; + + int_mv full_search_mv; + int full_mv_rate; +} inter_mode_info; + +void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int ref_idx, int *rate_mv, + int search_range, inter_mode_info *mode_info, + int_mv *best_mv); + +void av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, + int *rate_mv); + +int av1_interinter_compound_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *x, + const int_mv *const cur_mv, + const BLOCK_SIZE bsize, + const PREDICTION_MODE this_mode); + +void av1_compound_single_motion_search_interinter( + const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv, + const uint8_t *mask, int mask_stride, int *rate_mv, int ref_idx); + +void av1_compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *this_mv, + const uint8_t *second_pred, + const uint8_t *mask, int mask_stride, + int *rate_mv, int ref_idx); + +// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame +// ref. Note that this sets the offset of mbmi, so we will need to reset it +// after calling this function. +int_mv av1_simple_motion_search(struct AV1_COMP *const cpi, MACROBLOCK *x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int ref, FULLPEL_MV start_mv, int num_planes, + int use_subpixel); + +// Performs a simple motion search to calculate the sse and var of the residue +int_mv av1_simple_motion_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + const FULLPEL_MV start_mv, int use_subpixel, + unsigned int *sse, unsigned int *var); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_MOTION_SEARCH_H_ diff --git a/libs/libaom/src/av1/encoder/mv_prec.c b/libs/libaom/src/av1/encoder/mv_prec.c new file mode 100644 index 000000000..8fcbde98e --- /dev/null +++ b/libs/libaom/src/av1/encoder/mv_prec.c @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_config.h" + +#include "aom_ports/system_state.h" + +#include "av1/encoder/encodemv.h" +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/misc_model_weights.h" +#endif // !CONFIG_REALTIME_ONLY +#include "av1/encoder/mv_prec.h" + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE int_mv get_ref_mv_for_mv_stats( + const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame, + int ref_idx) { + int ref_mv_idx = mbmi->ref_mv_idx; + if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV) { + assert(has_second_ref(mbmi)); + ref_mv_idx += 1; + } + + const MV_REFERENCE_FRAME *ref_frames = mbmi->ref_frame; + const int8_t ref_frame_type = av1_ref_frame_type(ref_frames); + const CANDIDATE_MV *curr_ref_mv_stack = mbmi_ext_frame->ref_mv_stack; + + if (ref_frames[1] > INTRA_FRAME) { + assert(ref_idx == 0 || ref_idx == 1); + return ref_idx ? curr_ref_mv_stack[ref_mv_idx].comp_mv + : curr_ref_mv_stack[ref_mv_idx].this_mv; + } + + assert(ref_idx == 0); + return ref_mv_idx < mbmi_ext_frame->ref_mv_count + ? curr_ref_mv_stack[ref_mv_idx].this_mv + : mbmi_ext_frame->global_mvs[ref_frame_type]; +} + +static AOM_INLINE int get_symbol_cost(const aom_cdf_prob *cdf, int symbol) { + const aom_cdf_prob cur_cdf = AOM_ICDF(cdf[symbol]); + const aom_cdf_prob prev_cdf = symbol ? AOM_ICDF(cdf[symbol - 1]) : 0; + const aom_cdf_prob p15 = AOMMAX(cur_cdf - prev_cdf, EC_MIN_PROB); + + return av1_cost_symbol(p15); +} + +static AOM_INLINE int keep_one_comp_stat(MV_STATS *mv_stats, int comp_val, + int comp_idx, const AV1_COMP *cpi, + int *rates) { + assert(comp_val != 0 && "mv component should not have zero value!"); + const int sign = comp_val < 0; + const int mag = sign ? -comp_val : comp_val; + const int mag_minus_1 = mag - 1; + int offset; + const int mv_class = av1_get_mv_class(mag_minus_1, &offset); + const int int_part = offset >> 3; // int mv data + const int frac_part = (offset >> 1) & 3; // fractional mv data + const int high_part = offset & 1; // high precision mv data + const int use_hp = cpi->common.features.allow_high_precision_mv; + int r_idx = 0; + + const MACROBLOCK *const x = &cpi->td.mb; + const MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + nmv_context *nmvc = &ec_ctx->nmvc; + nmv_component *mvcomp_ctx = nmvc->comps; + nmv_component *cur_mvcomp_ctx = &mvcomp_ctx[comp_idx]; + aom_cdf_prob *sign_cdf = cur_mvcomp_ctx->sign_cdf; + aom_cdf_prob *class_cdf = cur_mvcomp_ctx->classes_cdf; + aom_cdf_prob *class0_cdf = cur_mvcomp_ctx->class0_cdf; + aom_cdf_prob(*bits_cdf)[3] = cur_mvcomp_ctx->bits_cdf; + aom_cdf_prob *frac_part_cdf = mv_class + ? (cur_mvcomp_ctx->fp_cdf) + : (cur_mvcomp_ctx->class0_fp_cdf[int_part]); + aom_cdf_prob *high_part_cdf = + mv_class ? 
(cur_mvcomp_ctx->hp_cdf) : (cur_mvcomp_ctx->class0_hp_cdf); + + const int sign_rate = get_symbol_cost(sign_cdf, sign); + rates[r_idx++] = sign_rate; + update_cdf(sign_cdf, sign, 2); + + const int class_rate = get_symbol_cost(class_cdf, mv_class); + rates[r_idx++] = class_rate; + update_cdf(class_cdf, mv_class, MV_CLASSES); + + int int_bit_rate = 0; + if (mv_class == MV_CLASS_0) { + int_bit_rate = get_symbol_cost(class0_cdf, int_part); + update_cdf(class0_cdf, int_part, CLASS0_SIZE); + } else { + const int n = mv_class + CLASS0_BITS - 1; // number of bits + for (int i = 0; i < n; ++i) { + int_bit_rate += get_symbol_cost(bits_cdf[i], (int_part >> i) & 1); + update_cdf(bits_cdf[i], (int_part >> i) & 1, 2); + } + } + rates[r_idx++] = int_bit_rate; + const int frac_part_rate = get_symbol_cost(frac_part_cdf, frac_part); + rates[r_idx++] = frac_part_rate; + update_cdf(frac_part_cdf, frac_part, MV_FP_SIZE); + const int high_part_rate = + use_hp ? get_symbol_cost(high_part_cdf, high_part) : 0; + if (use_hp) { + update_cdf(high_part_cdf, high_part, 2); + } + rates[r_idx++] = high_part_rate; + + mv_stats->last_bit_zero += !high_part; + mv_stats->last_bit_nonzero += high_part; + const int total_rate = + (sign_rate + class_rate + int_bit_rate + frac_part_rate + high_part_rate); + return total_rate; +} + +static AOM_INLINE void keep_one_mv_stat(MV_STATS *mv_stats, const MV *ref_mv, + const MV *cur_mv, const AV1_COMP *cpi) { + const MACROBLOCK *const x = &cpi->td.mb; + const MACROBLOCKD *const xd = &x->e_mbd; + FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + nmv_context *nmvc = &ec_ctx->nmvc; + aom_cdf_prob *joint_cdf = nmvc->joints_cdf; + const int use_hp = cpi->common.features.allow_high_precision_mv; + + const MV diff = { cur_mv->row - ref_mv->row, cur_mv->col - ref_mv->col }; + const int mv_joint = av1_get_mv_joint(&diff); + // TODO(chiyotsai@google.com): Estimate hp_diff when we are using lp + const MV hp_diff = diff; + const int hp_mv_joint = av1_get_mv_joint(&hp_diff); + const MV truncated_diff = { (diff.row / 2) * 2, (diff.col / 2) * 2 }; + const MV lp_diff = use_hp ? truncated_diff : diff; + const int lp_mv_joint = av1_get_mv_joint(&lp_diff); + + aom_clear_system_state(); + const int mv_joint_rate = get_symbol_cost(joint_cdf, mv_joint); + const int hp_mv_joint_rate = get_symbol_cost(joint_cdf, hp_mv_joint); + const int lp_mv_joint_rate = get_symbol_cost(joint_cdf, lp_mv_joint); + + update_cdf(joint_cdf, mv_joint, MV_JOINTS); + + mv_stats->total_mv_rate += mv_joint_rate; + mv_stats->hp_total_mv_rate += hp_mv_joint_rate; + mv_stats->lp_total_mv_rate += lp_mv_joint_rate; + mv_stats->mv_joint_count[mv_joint]++; + + for (int comp_idx = 0; comp_idx < 2; comp_idx++) { + const int comp_val = comp_idx ? diff.col : diff.row; + const int hp_comp_val = comp_idx ? hp_diff.col : hp_diff.row; + const int lp_comp_val = comp_idx ? lp_diff.col : lp_diff.row; + int rates[5]; + av1_zero_array(rates, 5); + + const int comp_rate = + comp_val ? keep_one_comp_stat(mv_stats, comp_val, comp_idx, cpi, rates) + : 0; + // TODO(chiyotsai@google.com): Properly get hp rate when use_hp is false + const int hp_rate = + hp_comp_val ? rates[0] + rates[1] + rates[2] + rates[3] + rates[4] : 0; + const int lp_rate = + lp_comp_val ? 
rates[0] + rates[1] + rates[2] + rates[3] : 0; + + mv_stats->total_mv_rate += comp_rate; + mv_stats->hp_total_mv_rate += hp_rate; + mv_stats->lp_total_mv_rate += lp_rate; + } +} + +static AOM_INLINE void collect_mv_stats_b(MV_STATS *mv_stats, + const AV1_COMP *cpi, int mi_row, + int mi_col) { + const AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + + if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) { + return; + } + + const MB_MODE_INFO *mbmi = + mi_params->mi_grid_base[mi_row * mi_params->mi_stride + mi_col]; + const MB_MODE_INFO_EXT_FRAME *mbmi_ext_frame = + cpi->mbmi_ext_info.frame_base + + get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize, + cpi->mbmi_ext_info.stride); + + if (!is_inter_block(mbmi)) { + mv_stats->intra_count++; + return; + } + mv_stats->inter_count++; + + const PREDICTION_MODE mode = mbmi->mode; + const int is_compound = has_second_ref(mbmi); + + if (mode == NEWMV || mode == NEW_NEWMV) { + // All mvs are new + for (int ref_idx = 0; ref_idx < 1 + is_compound; ++ref_idx) { + const MV ref_mv = + get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv; + const MV cur_mv = mbmi->mv[ref_idx].as_mv; + keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi); + } + } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV || + mode == NEW_NEARESTMV || mode == NEW_NEARMV) { + // has exactly one new_mv + mv_stats->default_mvs += 1; + + const int ref_idx = (mode == NEAREST_NEWMV || mode == NEAR_NEWMV); + const MV ref_mv = + get_ref_mv_for_mv_stats(mbmi, mbmi_ext_frame, ref_idx).as_mv; + const MV cur_mv = mbmi->mv[ref_idx].as_mv; + + keep_one_mv_stat(mv_stats, &ref_mv, &cur_mv, cpi); + } else { + // No new_mv + mv_stats->default_mvs += 1 + is_compound; + } + + // Add texture information + const BLOCK_SIZE bsize = mbmi->sb_type; + const int num_rows = block_size_high[bsize]; + const int num_cols = block_size_wide[bsize]; + const int y_stride = cpi->source->y_stride; + const int px_row = 4 * mi_row, px_col = 4 * mi_col; + const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH; + const int bd = cm->seq_params.bit_depth; + if (buf_is_hbd) { + uint16_t *source_buf = + CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col; + for (int row = 0; row < num_rows - 1; row++) { + for (int col = 0; col < num_cols - 1; col++) { + const int offset = row * y_stride + col; + const int horz_diff = + abs(source_buf[offset + 1] - source_buf[offset]) >> (bd - 8); + const int vert_diff = + abs(source_buf[offset + y_stride] - source_buf[offset]) >> (bd - 8); + mv_stats->horz_text += horz_diff; + mv_stats->vert_text += vert_diff; + mv_stats->diag_text += horz_diff * vert_diff; + } + } + } else { + uint8_t *source_buf = cpi->source->y_buffer + px_row * y_stride + px_col; + for (int row = 0; row < num_rows - 1; row++) { + for (int col = 0; col < num_cols - 1; col++) { + const int offset = row * y_stride + col; + const int horz_diff = abs(source_buf[offset + 1] - source_buf[offset]); + const int vert_diff = + abs(source_buf[offset + y_stride] - source_buf[offset]); + mv_stats->horz_text += horz_diff; + mv_stats->vert_text += vert_diff; + mv_stats->diag_text += horz_diff * vert_diff; + } + } + } +} + +// Split block +static AOM_INLINE void collect_mv_stats_sb(MV_STATS *mv_stats, + const AV1_COMP *cpi, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *cm = &cpi->common; + + if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) + return; + + 
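+  // Walk the partition tree: look up how this block was split and visit each
+  // leaf block via collect_mv_stats_b() (hbs/qbs below are the half and
+  // quarter block sizes in mi units).
+ 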
const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize); + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + + const int hbs = mi_size_wide[bsize] / 2; + const int qbs = mi_size_wide[bsize] / 4; + switch (partition) { + case PARTITION_NONE: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + break; + case PARTITION_HORZ: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + break; + case PARTITION_VERT: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + break; + case PARTITION_SPLIT: + collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, subsize); + collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col + hbs, subsize); + collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col, subsize); + collect_mv_stats_sb(mv_stats, cpi, mi_row + hbs, mi_col + hbs, subsize); + break; + case PARTITION_HORZ_A: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + break; + case PARTITION_HORZ_B: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_VERT_A: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + break; + case PARTITION_VERT_B: + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col); + collect_mv_stats_b(mv_stats, cpi, mi_row, mi_col + hbs); + collect_mv_stats_b(mv_stats, cpi, mi_row + hbs, mi_col + hbs); + break; + case PARTITION_HORZ_4: + for (int i = 0; i < 4; ++i) { + const int this_mi_row = mi_row + i * qbs; + collect_mv_stats_b(mv_stats, cpi, this_mi_row, mi_col); + } + break; + case PARTITION_VERT_4: + for (int i = 0; i < 4; ++i) { + const int this_mi_col = mi_col + i * qbs; + collect_mv_stats_b(mv_stats, cpi, mi_row, this_mi_col); + } + break; + default: assert(0); + } +} + +static AOM_INLINE void collect_mv_stats_tile(MV_STATS *mv_stats, + const AV1_COMP *cpi, + const TileInfo *tile_info) { + const AV1_COMMON *cm = &cpi->common; + const int mi_row_start = tile_info->mi_row_start; + const int mi_row_end = tile_info->mi_row_end; + const int mi_col_start = tile_info->mi_col_start; + const int mi_col_end = tile_info->mi_col_end; + const int sb_size_mi = cm->seq_params.mib_size; + BLOCK_SIZE sb_size = cm->seq_params.sb_size; + for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) { + for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) { + collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size); + } + } +} + +void av1_collect_mv_stats(AV1_COMP *cpi, int current_q) { + MV_STATS *mv_stats = &cpi->mv_stats; + const AV1_COMMON *cm = &cpi->common; + const int tile_cols = cm->tiles.cols; + const int tile_rows = cm->tiles.rows; + + for (int tile_row = 0; tile_row < tile_rows; tile_row++) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, tile_row); + for (int tile_col = 0; tile_col < tile_cols; tile_col++) { + const int tile_idx = tile_row * tile_cols + tile_col; + av1_tile_set_col(&tile_info, cm, tile_col); + cpi->tile_data[tile_idx].tctx = *cm->fc; + cpi->td.mb.e_mbd.tile_ctx = &cpi->tile_data[tile_idx].tctx; + collect_mv_stats_tile(mv_stats, cpi, &tile_info); + } + } + + mv_stats->q = current_q; + 
mv_stats->order = cpi->common.current_frame.order_hint; + mv_stats->valid = 1; +} + +static AOM_INLINE int get_smart_mv_prec(AV1_COMP *cpi, const MV_STATS *mv_stats, + int current_q) { + const AV1_COMMON *cm = &cpi->common; + const int order_hint = cpi->common.current_frame.order_hint; + const int order_diff = order_hint - mv_stats->order; + aom_clear_system_state(); + const float area = (float)(cm->width * cm->height); + float features[MV_PREC_FEATURE_SIZE] = { + (float)current_q, + (float)mv_stats->q, + (float)order_diff, + mv_stats->inter_count / area, + mv_stats->intra_count / area, + mv_stats->default_mvs / area, + mv_stats->mv_joint_count[0] / area, + mv_stats->mv_joint_count[1] / area, + mv_stats->mv_joint_count[2] / area, + mv_stats->mv_joint_count[3] / area, + mv_stats->last_bit_zero / area, + mv_stats->last_bit_nonzero / area, + mv_stats->total_mv_rate / area, + mv_stats->hp_total_mv_rate / area, + mv_stats->lp_total_mv_rate / area, + mv_stats->horz_text / area, + mv_stats->vert_text / area, + mv_stats->diag_text / area, + }; + + for (int f_idx = 0; f_idx < MV_PREC_FEATURE_SIZE; f_idx++) { + features[f_idx] = + (features[f_idx] - av1_mv_prec_mean[f_idx]) / av1_mv_prec_std[f_idx]; + } + float score = 0.0f; + + av1_nn_predict(features, &av1_mv_prec_dnn_config, 1, &score); + + const int use_high_hp = score >= 0.0f; + return use_high_hp; +} +#endif // !CONFIG_REALTIME_ONLY + +void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex) { + int use_hp = qindex < HIGH_PRECISION_MV_QTHRESH; + + if (cpi->sf.hl_sf.high_precision_mv_usage == QTR_ONLY) { + use_hp = 0; + } +#if !CONFIG_REALTIME_ONLY + else if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA && + av1_frame_allows_smart_mv(cpi) && cpi->mv_stats.valid) { + use_hp = get_smart_mv_prec(cpi, &cpi->mv_stats, qindex); + } +#endif // !CONFIG_REALTIME_ONLY + + av1_set_high_precision_mv(cpi, use_hp, + cpi->common.features.cur_frame_force_integer_mv); +} diff --git a/libs/libaom/src/av1/encoder/mv_prec.h b/libs/libaom/src/av1/encoder/mv_prec.h new file mode 100644 index 000000000..8df8b96dc --- /dev/null +++ b/libs/libaom/src/av1/encoder/mv_prec.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_MV_PREC_H_ +#define AOM_AV1_ENCODER_MV_PREC_H_ + +#include "av1/encoder/encoder.h" +#include "av1/encoder/speed_features.h" + +// Q threshold for high precision mv. 
+#define HIGH_PRECISION_MV_QTHRESH 128 +#if !CONFIG_REALTIME_ONLY +void av1_collect_mv_stats(AV1_COMP *cpi, int current_q); + +static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) { + const int gf_group_index = cpi->gf_group.index; + const int gf_update_type = cpi->gf_group.update_type[gf_group_index]; + return !frame_is_intra_only(&cpi->common) && + !(gf_update_type == INTNL_OVERLAY_UPDATE || + gf_update_type == OVERLAY_UPDATE); +} +#endif // !CONFIG_REALTIME_ONLY + +static AOM_INLINE void av1_set_high_precision_mv( + AV1_COMP *cpi, int allow_high_precision_mv, + int cur_frame_force_integer_mv) { + MACROBLOCK *const x = &cpi->td.mb; + const int copy_hp = cpi->common.features.allow_high_precision_mv = + allow_high_precision_mv && !cur_frame_force_integer_mv; + x->nmvcost[0] = &x->nmv_costs[0][MV_MAX]; + x->nmvcost[1] = &x->nmv_costs[1][MV_MAX]; + x->nmvcost_hp[0] = &x->nmv_costs_hp[0][MV_MAX]; + x->nmvcost_hp[1] = &x->nmv_costs_hp[1][MV_MAX]; + int *(*src)[2] = copy_hp ? &x->nmvcost_hp : &x->nmvcost; + x->mv_cost_stack = *src; +} + +void av1_pick_and_set_high_precision_mv(AV1_COMP *cpi, int qindex); + +#endif // AOM_AV1_ENCODER_MV_PREC_H_ diff --git a/libs/libaom/src/av1/encoder/nonrd_pickmode.c b/libs/libaom/src/av1/encoder/nonrd_pickmode.c new file mode 100644 index 000000000..a1180015c --- /dev/null +++ b/libs/libaom/src/av1/encoder/nonrd_pickmode.c @@ -0,0 +1,2182 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/encoder/model_rd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+
+extern int g_pick_inter_mode_cnt;
+typedef struct {
+  uint8_t *data;
+  int stride;
+  int in_use;
+} PRED_BUFFER;
+
+typedef struct {
+  PRED_BUFFER *best_pred;
+  PREDICTION_MODE best_mode;
+  TX_SIZE best_tx_size;
+  TX_SIZE best_intra_tx_size;
+  MV_REFERENCE_FRAME best_ref_frame;
+  MV_REFERENCE_FRAME best_second_ref_frame;
+  uint8_t best_mode_skip_txfm;
+  int_interpfilters best_pred_filter;
+} BEST_PICKMODE;
+
+typedef struct {
+  MV_REFERENCE_FRAME ref_frame;
+  PREDICTION_MODE pred_mode;
+} REF_MODE;
+
+static const int pos_shift_16x16[4][4] = {
+  { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
+};
+
+#define RT_INTER_MODES 9
+static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
+  { LAST_FRAME, NEARESTMV },   { LAST_FRAME, NEARMV },
+  { LAST_FRAME, NEWMV },       { GOLDEN_FRAME, NEARESTMV },
+  { GOLDEN_FRAME, NEARMV },    { GOLDEN_FRAME, NEWMV },
+  { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV },
+  { ALTREF_FRAME, NEWMV }
+};
+
+static const THR_MODES mode_idx[REF_FRAMES][4] = {
+  { THR_DC, THR_V_PRED, THR_H_PRED, THR_SMOOTH },
+  { THR_NEARESTMV, THR_NEARMV, THR_GLOBALMV, THR_NEWMV },
+  { THR_NEARESTL2, THR_NEARL2, THR_GLOBALL2, THR_NEWL2 },
+  { THR_NEARESTL3, THR_NEARL3, THR_GLOBALL3, THR_NEWL3 },
+  { THR_NEARESTG, THR_NEARG, THR_GLOBALMV, THR_NEWG },
+};
+
+static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED,
+                                                   SMOOTH_PRED };
+
+static INLINE int mode_offset(const PREDICTION_MODE mode) {
+  if (mode >= NEARESTMV) {
+    return INTER_OFFSET(mode);
+  } else {
+    switch (mode) {
+      case DC_PRED: return 0;
+      case V_PRED: return 1;
+      case H_PRED: return 2;
+      case SMOOTH_PRED: return 3;
+      default: assert(0); return -1;
+    }
+  }
+}
+
+enum {
+  //  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+  INTER_NEAREST = (1 << NEARESTMV),
+  INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+  INTER_NEAREST_NEAR = (1 << NEARESTMV) | (1 << NEARMV),
+  INTER_NEAR_NEW = (1 << NEARMV) | (1 << NEWMV),
+};
+
+static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
+  bp->best_mode = NEARESTMV;
+  bp->best_ref_frame = LAST_FRAME;
+  bp->best_tx_size = TX_8X8;
+  bp->best_intra_tx_size = TX_8X8;
+  bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+  bp->best_mode_skip_txfm = 0;
+  bp->best_second_ref_frame = NONE_FRAME;
+  bp->best_pred = NULL;
+}
+
+static int combined_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                  int_mv *tmp_mv, int *rate_mv,
+                                  int64_t best_rd_sofar, int use_base_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MB_MODE_INFO *mi = xd->mi[0];
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+  int step_param = cpi->mv_search_params.mv_step_param;
+  FULLPEL_MV start_mv;
+  const int ref = mi->ref_frame[0];
+  const MV ref_mv = av1_get_ref_mv(x, mi->ref_mv_idx).as_mv;
+  MV center_mv;
+  int dis;
+  int
rv = 0; + int cost_list[5]; + int search_subpel = 1; + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; + av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + start_mv = get_fullmv_from_mv(&ref_mv); + + if (!use_base_mv) + center_mv = ref_mv; + else + center_mv = tmp_mv->as_mv; + + const search_site_config *src_search_sites = + &cpi->mv_search_params.ss_cfg[SS_CFG_SRC]; + FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize, ¢er_mv, + src_search_sites); + + av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), &tmp_mv->as_fullmv, + NULL); + + // calculate the bit cost on motion vector + MV mvp_full = get_mv_from_fullmv(&tmp_mv->as_fullmv); + + *rate_mv = av1_mv_bit_cost(&mvp_full, &ref_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + + // TODO(kyslov) Account for Rate Mode! + rv = !(RDCOST(x->rdmult, (*rate_mv), 0) > best_rd_sofar); + + if (rv && search_subpel) { + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + MV subpel_start_mv = get_mv_from_fullmv(&tmp_mv->as_fullmv); + cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, subpel_start_mv, &tmp_mv->as_mv, &dis, + &x->pred_sse[ref], NULL); + + *rate_mv = av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + } + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + return rv; +} + +static int search_new_mv(AV1_COMP *cpi, MACROBLOCK *x, + int_mv frame_mv[][REF_FRAMES], + MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int best_pred_sad, int *rate_mv, RD_STATS *best_rdc) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mi = xd->mi[0]; + AV1_COMMON *cm = &cpi->common; + if (ref_frame > LAST_FRAME && gf_temporal_ref && + cpi->oxcf.rc_mode == AOM_CBR) { + int tmp_sad; + int dis; + int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + + if (bsize < BLOCK_16X16) return -1; + + tmp_sad = av1_int_pro_motion_estimation( + cpi, x, bsize, mi_row, mi_col, + &x->mbmi_ext->ref_mv_stack[ref_frame][0].this_mv.as_mv); + + if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1; + if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1; + + frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int; + int_mv best_mv = mi->mv[0]; + best_mv.as_mv.row >>= 3; + best_mv.as_mv.col >>= 3; + MV ref_mv = av1_get_ref_mv(x, 0).as_mv; + + *rate_mv = + av1_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, &ref_mv, + x->nmv_vec_cost, x->mv_cost_stack, MV_COST_WEIGHT); + frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; + frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; + + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &ref_mv, + cost_list); + MV start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + cpi->mv_search_params.find_fractional_mv_step( + xd, cm, &ms_params, start_mv, &best_mv.as_mv, &dis, + &x->pred_sse[ref_frame], NULL); + 
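+    // Keep the sub-pel refined MV as this reference's NEWMV candidate.
+ 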
frame_mv[NEWMV][ref_frame].as_int = best_mv.as_int; + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + + return 0; +} + +static INLINE void find_predictors( + AV1_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, + int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES], int *ref_frame_skip_mask, + const int flag_list[4], TileDataEnc *tile_data, + struct buf_2d yv12_mb[8][MAX_MB_PLANE], BLOCK_SIZE bsize, + int force_skip_low_temp_var) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame); + const int num_planes = av1_num_planes(cm); + (void)tile_data; + + x->pred_mv_sad[ref_frame] = INT_MAX; + frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; + // TODO(kyslov) this needs various further optimizations. to be continued.. + if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) { + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, ref_frame); + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + av1_find_best_ref_mvs_from_stack( + cm->features.allow_high_precision_mv, mbmi_ext, ref_frame, + &frame_mv[NEARESTMV][ref_frame], &frame_mv[NEARMV][ref_frame], 0); + // Early exit for non-LAST frame if force_skip_low_temp_var is set. 
+ if (!av1_is_scaled(sf) && bsize >= BLOCK_8X8 && + !(force_skip_low_temp_var && ref_frame != LAST_FRAME)) { + av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, + bsize); + } + } else { + *ref_frame_skip_mask |= (1 << ref_frame); + } + av1_count_overlappable_neighbors(cm, xd); + mbmi->num_proj_ref = 1; +} + +static void estimate_single_ref_frame_costs(const AV1_COMMON *cm, + const MACROBLOCKD *xd, + const MACROBLOCK *x, int segment_id, + unsigned int *ref_costs_single) { + int seg_ref_active = + segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (seg_ref_active) { + memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single)); + } else { + int intra_inter_ctx = av1_get_intra_inter_context(xd); + ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0]; + unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1]; + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + ref_costs_single[i] = base_cost; + + const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd); + const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd); + const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd); + const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd); + const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd); + const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd); + + // Determine cost of a single ref frame, where frame types are represented + // by a tree: + // Level 0: add cost whether this ref is a forward or backward ref + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + + // Level 1: if this ref is forward ref, + // add cost whether it is last/last2 or last3/golden + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1]; + + // Level 1: if this ref is backward ref + // then add cost whether this ref is altref or backward ref + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1]; + + // Level 2: further add cost whether this ref is last or last2 + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1]; + + // Level 2: last3 or golden + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1]; + + // Level 2: bwdref or altref2 + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1]; + } +} + +static void estimate_comp_ref_frame_costs( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x, + int segment_id, unsigned int (*ref_costs_comp)[REF_FRAMES]) { + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; 
++ref_frame) + memset(ref_costs_comp[ref_frame], 0, + REF_FRAMES * sizeof((*ref_costs_comp)[0])); + } else { + int intra_inter_ctx = av1_get_intra_inter_context(xd); + unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1]; + + if (cm->current_frame.reference_mode != SINGLE_REFERENCE) { + // Similar to single ref, determine cost of compound ref frames. + // cost_compound_refs = cost_first_ref + cost_second_ref + const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd); + const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd); + const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd); + const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd); + const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd); + + const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd); + unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 }; + + ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] = + ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1]; + ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0; + ref_bicomp_costs[ALTREF_FRAME] = 0; + + // cost of first ref frame + ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1]; + ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1]; + + ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0]; + ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1]; + + ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0]; + ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1]; + + // cost of second ref frame + ref_bicomp_costs[BWDREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1]; + + ref_bicomp_costs[BWDREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1]; + + // cost: if one ref frame is forward ref, the other ref is backward ref + for (int ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + for (int ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) { + ref_costs_comp[ref0][ref1] = + ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1]; + } + } + + // cost: if both ref frames are the same side. 
+ const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd); + const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd); + const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd); + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0]; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0]; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1]; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1]; + } else { + for (int ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + for (int ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) + ref_costs_comp[ref0][ref1] = 512; + } + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512; + } + } +} + +static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *const x, unsigned int var, + unsigned int sse) { + MACROBLOCKD *const xd = &x->e_mbd; + TX_SIZE tx_size; + if (x->tx_mode_search_type == TX_MODE_SELECT) { + if (sse > (var << 2)) + tx_size = AOMMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[x->tx_mode_search_type]); + else + tx_size = TX_8X8; + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) + tx_size = TX_8X8; + else if (tx_size > TX_16X16) + tx_size = TX_16X16; + } else { + tx_size = AOMMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[x->tx_mode_search_type]); + } + + if (x->tx_mode_search_type != ONLY_4X4 && bsize > BLOCK_32X32) + tx_size = TX_16X16; + + return AOMMIN(tx_size, TX_16X16); +} + +static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2, + 2, 2, 3, 3, 3, 4, + 4, 4, 5, 5 }; +static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = { 0, 1, 0, 1, 2, 1, + 2, 3, 2, 3, 4, 3, + 4, 5, 4, 5 }; + +static void block_variance(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int w, int h, + unsigned int *sse, int *sum, int block_size, + uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) { + int i, j, k = 0; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + aom_get8x8var(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse8x8[k], + &sum8x8[k]); + *sse += sse8x8[k]; + *sum += sum8x8[k]; + var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6); + k++; + } + } +} + +static void calculate_variance(int bw, int bh, TX_SIZE tx_size, + unsigned int *sse_i, int *sum_i, + unsigned int *var_o, unsigned int *sse_o, + int *sum_o) { + const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size]; + const int nw = 1 << (bw - b_width_log2_lookup[unit_size]); + const int nh = 1 << (bh - b_height_log2_lookup[unit_size]); + int i, j, k = 0; + + for (i = 0; i < nh; i += 2) { + for 
(j = 0; j < nw; j += 2) {
+      sse_o[k] = sse_i[i * nw + j] + sse_i[i * nw + j + 1] +
+                 sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
+      sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
+                 sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
+      var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
+                                       (b_width_log2_lookup[unit_size] +
+                                        b_height_log2_lookup[unit_size] + 6));
+      k++;
+    }
+  }
+}
+
+// Adjust the ac_thr according to speed, width, height and normalized sum
+static int ac_thr_factor(const int speed, const int width, const int height,
+                         const int norm_sum) {
+  if (speed >= 8 && norm_sum < 5) {
+    if (width <= 640 && height <= 480)
+      return 4;
+    else
+      return 2;
+  }
+  return 1;
+}
+
+static void model_skip_for_sb_y_large(AV1_COMP *cpi, BLOCK_SIZE bsize,
+                                      int mi_row, int mi_col, MACROBLOCK *x,
+                                      MACROBLOCKD *xd, int *out_rate,
+                                      int64_t *out_dist, unsigned int *var_y,
+                                      unsigned int *sse_y, int *early_term,
+                                      int calculate_rd) {
+  // Note our transform coeffs are 8 times an orthogonal transform.
+  // Hence quantizer step is also 8 times. To get effective quantizer
+  // we need to divide by 8 before sending to modeling function.
+  unsigned int sse;
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const uint32_t dc_quant = p->dequant_QTX[0];
+  const uint32_t ac_quant = p->dequant_QTX[1];
+  const int64_t dc_thr = dc_quant * dc_quant >> 6;
+  int64_t ac_thr = ac_quant * ac_quant >> 6;
+  unsigned int var;
+  int sum;
+
+  const int bw = b_width_log2_lookup[bsize];
+  const int bh = b_height_log2_lookup[bsize];
+  const int num8x8 = 1 << (bw + bh - 2);
+  unsigned int sse8x8[256] = { 0 };
+  int sum8x8[256] = { 0 };
+  unsigned int var8x8[256] = { 0 };
+  TX_SIZE tx_size;
+  int k;
+  // Calculate variance for whole partition, and also save 8x8 blocks' variance
+  // to be used in following transform skipping test.
+  block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
+  var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+
+  *var_y = var;
+  *sse_y = sse;
+
+  ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
+                          cpi->common.height, abs(sum) >> (bw + bh));
+
+  tx_size = calculate_tx_size(cpi, bsize, x, var, sse);
+  // The code below for setting skip flag assumes transform size of at least 8x8,
+  // so force this lower limit on transform.
+  if (tx_size < TX_8X8) tx_size = TX_8X8;
+  xd->mi[0]->tx_size = tx_size;
+
+  // Evaluate if the partition block is a skippable block in Y plane.
+  {
+    unsigned int sse16x16[64] = { 0 };
+    int sum16x16[64] = { 0 };
+    unsigned int var16x16[64] = { 0 };
+    const int num16x16 = num8x8 >> 2;
+
+    unsigned int sse32x32[16] = { 0 };
+    int sum32x32[16] = { 0 };
+    unsigned int var32x32[16] = { 0 };
+    const int num32x32 = num8x8 >> 4;
+
+    int ac_test = 1;
+    int dc_test = 1;
+    const int num = (tx_size == TX_8X8)
+                        ? num8x8
+                        : ((tx_size == TX_16X16) ? num16x16 : num32x32);
+    const unsigned int *sse_tx =
+        (tx_size == TX_8X8) ? sse8x8
+                            : ((tx_size == TX_16X16) ? sse16x16 : sse32x32);
+    const unsigned int *var_tx =
+        (tx_size == TX_8X8) ? var8x8
+                            : ((tx_size == TX_16X16) ?
var16x16 : var32x32); + + // Calculate variance if tx_size > TX_8X8 + if (tx_size >= TX_16X16) + calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16, + sum16x16); + if (tx_size == TX_32X32) + calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32, + sse32x32, sum32x32); + + // Skipping test + *early_term = 0; + for (k = 0; k < num; k++) + // Check if all ac coefficients can be quantized to zero. + if (!(var_tx[k] < ac_thr || var == 0)) { + ac_test = 0; + break; + } + + for (k = 0; k < num; k++) + // Check if dc coefficient can be quantized to zero. + if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) { + dc_test = 0; + break; + } + + if (ac_test && dc_test) { + int skip_uv[2] = { 0 }; + unsigned int var_uv[2]; + unsigned int sse_uv[2]; + AV1_COMMON *const cm = &cpi->common; + // Transform skipping test in UV planes. + for (int i = 1; i <= 2; i++) { + int j = i - 1; + skip_uv[j] = 1; + if (x->color_sensitivity[j]) { + skip_uv[j] = 0; + struct macroblock_plane *const puv = &x->plane[i]; + struct macroblockd_plane *const puvd = &xd->plane[i]; + const BLOCK_SIZE uv_bsize = get_plane_block_size( + bsize, puvd->subsampling_x, puvd->subsampling_y); + // Adjust these thresholds for UV. + const int64_t uv_dc_thr = + (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> 3; + const int64_t uv_ac_thr = + (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> 3; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, i, + i); + var_uv[j] = cpi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride, + puvd->dst.buf, puvd->dst.stride, + &sse_uv[j]); + if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && + (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) + skip_uv[j] = 1; + else + break; + } + } + if (skip_uv[0] & skip_uv[1]) { + *early_term = 1; + } + } + } + if (calculate_rd && out_dist != NULL && out_rate != NULL) { + if (!*early_term) { + const int bwide = block_size_wide[bsize]; + const int bhigh = block_size_high[bsize]; + + model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, + out_rate, out_dist); + } + + if (*early_term) { + *out_rate = 0; + *out_dist = sse << 4; + } + } +} + +static void model_rd_for_sb_y(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, + int64_t *skip_sse_sb, unsigned int *var_y, + unsigned int *sse_y, int calculate_rd) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
+ const int ref = xd->mi[0]->ref_frame[0]; + + assert(bsize < BLOCK_SIZES_ALL); + + struct macroblock_plane *const p = &x->plane[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + unsigned int sse; + int rate; + int64_t dist; + + unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse); + xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse); + + if (calculate_rd) { + const int bwide = block_size_wide[bsize]; + const int bhigh = block_size_high[bsize]; + model_rd_with_curvfit(cpi, x, bsize, AOM_PLANE_Y, sse, bwide * bhigh, &rate, + &dist); + } else { + rate = INT_MAX; // this will be overwritten later with block_yrd + dist = INT_MAX; + } + *var_y = var; + *sse_y = sse; + x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + assert(rate >= 0); + + if (skip_txfm_sb) *skip_txfm_sb = rate == 0; + if (skip_sse_sb) *skip_sse_sb = sse << 4; + rate = AOMMIN(rate, INT_MAX); + *out_rate_sum = (int)rate; + *out_dist_sum = dist; +} + +static void block_yrd(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, + RD_STATS *this_rdc, int *skippable, int64_t *sse, + BLOCK_SIZE bsize, TX_SIZE tx_size) { + MACROBLOCKD *xd = &x->e_mbd; + const struct macroblockd_plane *pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[0]; + const int num_4x4_w = mi_size_wide[bsize]; + const int num_4x4_h = mi_size_high[bsize]; + const int step = 1 << (tx_size << 1); + const int block_step = (1 << tx_size); + int block = 0; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5); + int eob_cost = 0; + const int bw = 4 * num_4x4_w; + const int bh = 4 * num_4x4_h; + + (void)mi_row; + (void)mi_col; + (void)cpi; + +#if CONFIG_AV1_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride, + x->e_mbd.bd); + } else { + aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); + } +#else + aom_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); +#endif + + *skippable = 1; + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. 
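+  // First pass: transform (Hadamard or 4x4 txfm) and quantize each tx block;
+  // the block is skippable only if every eob is zero. Columns at or beyond
+  // max_blocks_wide lie in the border and are skipped.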
+ for (int r = 0; r < max_blocks_high; r += block_step) { + for (int c = 0; c < num_4x4_w; c += block_step) { + if (c < max_blocks_wide) { + const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size]; + const int block_offset = BLOCK_OFFSET(block); +#if CONFIG_AV1_HIGHBITDEPTH + tran_low_t *const coeff = p->coeff + block_offset; + tran_low_t *const qcoeff = p->qcoeff + block_offset; + tran_low_t *const dqcoeff = pd->dqcoeff + block_offset; +#else + int16_t *const low_coeff = (int16_t *)p->coeff + block_offset; + int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset; + int16_t *const low_dqcoeff = (int16_t *)pd->dqcoeff + block_offset; +#endif + uint16_t *const eob = &p->eobs[block]; + const int diff_stride = bw; + const int16_t *src_diff; + src_diff = &p->src_diff[(r * diff_stride + c) << 2]; + + switch (tx_size) { + case TX_64X64: + assert(0); // Not implemented + break; + case TX_32X32: + assert(0); // Not used + break; +#if CONFIG_AV1_HIGHBITDEPTH + case TX_16X16: + aom_hadamard_16x16(src_diff, diff_stride, coeff); + av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, + dqcoeff, p->dequant_QTX, eob, scan_order->scan, + scan_order->iscan); + break; + case TX_8X8: + aom_hadamard_8x8(src_diff, diff_stride, coeff); + av1_quantize_fp(coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, + dqcoeff, p->dequant_QTX, eob, scan_order->scan, + scan_order->iscan); + break; +#else + case TX_16X16: + aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff); + av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, + p->quant_fp_QTX, low_qcoeff, low_dqcoeff, + p->dequant_QTX, eob, scan_order->scan); + break; + case TX_8X8: + aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff); + av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, + low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, + scan_order->scan); + break; + default: + assert(tx_size == TX_4X4); + x->fwd_txfm4x4(src_diff, low_coeff, diff_stride); + av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX, + low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, + scan_order->scan); + break; +#endif + } + *skippable &= (*eob == 0); + eob_cost += 1; + } + block += step; + } + } + this_rdc->skip = *skippable; + this_rdc->rate = 0; + if (*sse < INT64_MAX) { + *sse = (*sse << 6) >> 2; + if (*skippable) { + this_rdc->dist = *sse; + return; + } + } + + block = 0; + this_rdc->dist = 0; + for (int r = 0; r < max_blocks_high; r += block_step) { + for (int c = 0; c < num_4x4_w; c += block_step) { + if (c < max_blocks_wide) { + const int block_offset = BLOCK_OFFSET(block); + uint16_t *const eob = &p->eobs[block]; +#if CONFIG_AV1_HIGHBITDEPTH + int64_t dummy; + tran_low_t *const coeff = p->coeff + block_offset; + tran_low_t *const qcoeff = p->qcoeff + block_offset; + tran_low_t *const dqcoeff = pd->dqcoeff + block_offset; + + if (*eob == 1) + this_rdc->rate += (int)abs(qcoeff[0]); + else if (*eob > 1) + this_rdc->rate += aom_satd(qcoeff, step << 4); + + this_rdc->dist += + av1_block_error(coeff, dqcoeff, step << 4, &dummy) >> 2; +#else + int16_t *const low_coeff = (int16_t *)p->coeff + block_offset; + int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset; + int16_t *const low_dqcoeff = (int16_t *)pd->dqcoeff + block_offset; + + if (*eob == 1) + this_rdc->rate += (int)abs(low_qcoeff[0]); + else if (*eob > 1) + this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4); + + this_rdc->dist += + av1_block_error_lp(low_coeff, 
low_dqcoeff, step << 4) >> 2; +#endif + } + block += step; + } + } + + // If skippable is set, rate gets clobbered later. + this_rdc->rate <<= (2 + AV1_PROB_COST_SHIFT); + this_rdc->rate += (eob_cost << AV1_PROB_COST_SHIFT); +} + +static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE pred_mode, + MV_REFERENCE_FRAME ref_frame0, + MV_REFERENCE_FRAME ref_frame1, + const AV1_COMMON *cm) { + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + mbmi->ref_mv_idx = 0; + mbmi->mode = pred_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = ref_frame0; + mbmi->ref_frame[1] = ref_frame1; + pmi->palette_size[0] = 0; + pmi->palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->num_proj_ref = 1; + mbmi->interintra_mode = 0; + set_default_interp_filters(mbmi, cm->features.interp_filter); +} + +#if CONFIG_INTERNAL_STATS +static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, + int mode_index) { +#else +static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { +#endif // CONFIG_INTERNAL_STATS + MACROBLOCKD *const xd = &x->e_mbd; + + // Take a snapshot of the coding context so it can be + // restored if we decide to encode this way + ctx->rd_stats.skip = x->force_skip; + memset(ctx->blk_skip, 0, sizeof(ctx->blk_skip[0]) * ctx->num_4x4_blk); + memset(ctx->tx_type_map, DCT_DCT, + sizeof(ctx->tx_type_map[0]) * ctx->num_4x4_blk); + ctx->skippable = x->force_skip; +#if CONFIG_INTERNAL_STATS + ctx->best_mode_index = mode_index; +#endif // CONFIG_INTERNAL_STATS + ctx->mic = *xd->mi[0]; + ctx->skippable = x->force_skip; + av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + ctx->comp_pred_diff = 0; + ctx->hybrid_pred_diff = 0; + ctx->single_pred_diff = 0; +} + +static int get_pred_buffer(PRED_BUFFER *p, int len) { + for (int i = 0; i < len; i++) { + if (!p[i].in_use) { + p[i].in_use = 1; + return i; + } + } + return -1; +} + +static void free_pred_buffer(PRED_BUFFER *p) { + if (p != NULL) p->in_use = 0; +} + +static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode, + int16_t mode_context) { + if (is_inter_compound_mode(mode)) { + return x + ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; + } + + int mode_cost = 0; + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + + assert(is_inter_mode(mode)); + + if (mode == NEWMV) { + mode_cost = x->newmv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost = x->newmv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + + if (mode == GLOBALMV) { + mode_cost += x->zeromv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost += x->zeromv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; + return mode_cost; + } + } +} + +static void newmv_diff_bias(MACROBLOCKD *xd, PREDICTION_MODE this_mode, + RD_STATS *this_rdc, BLOCK_SIZE bsize, int mv_row, + int mv_col, int speed, uint32_t spatial_variance) { + // Bias against MVs associated with NEWMV mode that are very different from + // top/left neighbors. 
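+  // (The MV is compared against the rounded average of the valid above/left
+  // neighbor MVs; a large deviation inflates this_rdc->rdcost.)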
+ if (this_mode == NEWMV) { + int al_mv_average_row; + int al_mv_average_col; + int left_row, left_col; + int row_diff, col_diff; + int above_mv_valid = 0; + int left_mv_valid = 0; + int above_row = 0; + int above_col = 0; + + if (xd->above_mbmi) { + above_mv_valid = xd->above_mbmi->mv[0].as_int != INVALID_MV; + above_row = xd->above_mbmi->mv[0].as_mv.row; + above_col = xd->above_mbmi->mv[0].as_mv.col; + } + if (xd->left_mbmi) { + left_mv_valid = xd->left_mbmi->mv[0].as_int != INVALID_MV; + left_row = xd->left_mbmi->mv[0].as_mv.row; + left_col = xd->left_mbmi->mv[0].as_mv.col; + } + if (above_mv_valid && left_mv_valid) { + al_mv_average_row = (above_row + left_row + 1) >> 1; + al_mv_average_col = (above_col + left_col + 1) >> 1; + } else if (above_mv_valid) { + al_mv_average_row = above_row; + al_mv_average_col = above_col; + } else if (left_mv_valid) { + al_mv_average_row = left_row; + al_mv_average_col = left_col; + } else { + al_mv_average_row = al_mv_average_col = 0; + } + row_diff = al_mv_average_row - mv_row; + col_diff = al_mv_average_col - mv_col; + if (row_diff > 80 || row_diff < -80 || col_diff > 80 || col_diff < -80) { + if (bsize >= BLOCK_32X32) + this_rdc->rdcost = this_rdc->rdcost << 1; + else + this_rdc->rdcost = 5 * this_rdc->rdcost >> 2; + } + } else { + // Bias for speed >= 8 for low spatial variance. + if (speed >= 8 && spatial_variance < 150 && + (mv_row > 64 || mv_row < -64 || mv_col > 64 || mv_col < -64)) + this_rdc->rdcost = 5 * this_rdc->rdcost >> 2; + } +} + +static void model_rd_for_sb_uv(AV1_COMP *cpi, BLOCK_SIZE plane_bsize, + MACROBLOCK *x, MACROBLOCKD *xd, + RD_STATS *this_rdc, unsigned int *var_y, + unsigned int *sse_y, int start_plane, + int stop_plane) { + // Note our transform coeffs are 8 times an orthogonal transform. + // Hence quantizer step is also 8 times. To get effective quantizer + // we need to divide by 8 before sending to modeling function. 
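+  // (Hence the dc_quant >> 3 and ac_quant >> 3 passed to the model below.)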
+ unsigned int sse; + int rate; + int64_t dist; + int i; + uint32_t tot_var = *var_y; + uint32_t tot_sse = *sse_y; + + this_rdc->rate = 0; + this_rdc->dist = 0; + this_rdc->skip = 0; + + for (i = start_plane; i <= stop_plane; ++i) { + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const uint32_t dc_quant = p->dequant_QTX[0]; + const uint32_t ac_quant = p->dequant_QTX[1]; + const BLOCK_SIZE bs = plane_bsize; + unsigned int var; + if (!x->color_sensitivity[i - 1]) continue; + + var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, &sse); + assert(sse >= var); + tot_var += var; + tot_sse += sse; + + av1_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs], + dc_quant >> 3, &rate, &dist); + + this_rdc->rate += rate >> 1; + this_rdc->dist += dist << 3; + + av1_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3, + &rate, &dist); + + this_rdc->rate += rate; + this_rdc->dist += dist << 4; + } + + if (this_rdc->rate == 0) { + this_rdc->skip = 1; + } + + if (RDCOST(x->rdmult, this_rdc->rate, this_rdc->dist) >= + RDCOST(x->rdmult, 0, ((int64_t)tot_sse) << 4)) { + this_rdc->rate = 0; + this_rdc->dist = tot_sse << 4; + this_rdc->skip = 1; + } + + *var_y = tot_var; + *sse_y = tot_sse; +} + +struct estimate_block_intra_args { + AV1_COMP *cpi; + MACROBLOCK *x; + PREDICTION_MODE mode; + int skippable; + RD_STATS *rdc; +}; + +static void estimate_block_intra(int plane, int block, int row, int col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + struct estimate_block_intra_args *const args = arg; + AV1_COMP *const cpi = args->cpi; + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size]; + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + const int64_t src_stride = p->src.stride; + const int64_t dst_stride = pd->dst.stride; + RD_STATS this_rdc; + + (void)block; + + p->src.buf = &src_buf_base[4 * (row * src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)]; + + av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size); + + if (plane == 0) { + int64_t this_sse = INT64_MAX; + block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, &this_sse, bsize_tx, + AOMMIN(tx_size, TX_16X16)); + } else { + unsigned int var = 0; + unsigned int sse = 0; + model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane, + plane); + } + + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; + args->rdc->rate += this_rdc.rate; + args->rdc->dist += this_rdc.dist; +} + +static INLINE void update_thresh_freq_fact(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, + MV_REFERENCE_FRAME ref_frame, + THR_MODES best_mode_idx, + PREDICTION_MODE mode) { + THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)]; + int *freq_fact = &x->thresh_freq_fact[bsize][thr_mode_idx]; + if (thr_mode_idx == best_mode_idx) { + *freq_fact -= (*freq_fact >> 4); + } else { + *freq_fact = + AOMMIN(*freq_fact + RD_THRESH_INC, + cpi->sf.inter_sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT); + } +} + +static INLINE int get_force_skip_low_temp_var_small_sb(uint8_t *variance_low, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + // Relative indices of MB inside the superblock. 
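+  // (A small superblock is 64x64 samples, i.e. 16x16 mi units, hence & 0xF.)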
+ const int mi_x = mi_row & 0xF; + const int mi_y = mi_col & 0xF; + // Relative indices of 16x16 block inside the superblock. + const int i = mi_x >> 2; + const int j = mi_y >> 2; + int force_skip_low_temp_var = 0; + // Set force_skip_low_temp_var based on the block size and block offset. + switch (bsize) { + case BLOCK_64X64: force_skip_low_temp_var = variance_low[0]; break; + case BLOCK_64X32: + if (!mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[1]; + } else if (!mi_y && mi_x) { + force_skip_low_temp_var = variance_low[2]; + } + break; + case BLOCK_32X64: + if (!mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[3]; + } else if (mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[4]; + } + break; + case BLOCK_32X32: + if (!mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[5]; + } else if (mi_y && !mi_x) { + force_skip_low_temp_var = variance_low[6]; + } else if (!mi_y && mi_x) { + force_skip_low_temp_var = variance_low[7]; + } else if (mi_y && mi_x) { + force_skip_low_temp_var = variance_low[8]; + } + break; + case BLOCK_32X16: + case BLOCK_16X32: + case BLOCK_16X16: + force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]]; + break; + default: break; + } + + return force_skip_low_temp_var; +} + +static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + int force_skip_low_temp_var = 0; + int x, y; + x = (mi_col & 0x1F) >> 4; + // y = (mi_row & 0x1F) >> 4; + // const int idx64 = (y << 1) + x; + y = (mi_row & 0x17) >> 3; + const int idx64 = y + x; + + x = (mi_col & 0xF) >> 3; + // y = (mi_row & 0xF) >> 3; + // const int idx32 = (y << 1) + x; + y = (mi_row & 0xB) >> 2; + const int idx32 = y + x; + + x = (mi_col & 0x7) >> 2; + // y = (mi_row & 0x7) >> 2; + // const int idx16 = (y << 1) + x; + y = (mi_row & 0x5) >> 1; + const int idx16 = y + x; + // Set force_skip_low_temp_var based on the block size and block offset. + switch (bsize) { + case BLOCK_128X128: force_skip_low_temp_var = variance_low[0]; break; + case BLOCK_128X64: + assert((mi_col & 0x1F) == 0); + force_skip_low_temp_var = variance_low[1 + ((mi_row & 0x1F) != 0)]; + break; + case BLOCK_64X128: + assert((mi_row & 0x1F) == 0); + force_skip_low_temp_var = variance_low[3 + ((mi_col & 0x1F) != 0)]; + break; + case BLOCK_64X64: + // Location of this 64x64 block inside the 128x128 superblock + force_skip_low_temp_var = variance_low[5 + idx64]; + break; + case BLOCK_64X32: + x = (mi_col & 0x1F) >> 4; + y = (mi_row & 0x1F) >> 3; + /* + .---------------.---------------. 
+      | x=0,y=0,idx=0 | x=1,y=0,idx=2 |
+      :---------------+---------------:
+      | x=0,y=1,idx=1 | x=1,y=1,idx=3 |
+      :---------------+---------------:
+      | x=0,y=2,idx=4 | x=1,y=2,idx=6 |
+      :---------------+---------------:
+      | x=0,y=3,idx=5 | x=1,y=3,idx=7 |
+      '---------------'---------------'
+      */
+      const int idx64x32 = (x << 1) + (y % 2) + ((y >> 1) << 2);
+      force_skip_low_temp_var = variance_low[9 + idx64x32];
+      break;
+    case BLOCK_32X64:
+      x = (mi_col & 0x1F) >> 3;
+      y = (mi_row & 0x1F) >> 4;
+      const int idx32x64 = (y << 2) + x;
+      force_skip_low_temp_var = variance_low[17 + idx32x64];
+      break;
+    case BLOCK_32X32:
+      force_skip_low_temp_var = variance_low[25 + (idx64 << 2) + idx32];
+      break;
+    case BLOCK_32X16:
+    case BLOCK_16X32:
+    case BLOCK_16X16:
+      force_skip_low_temp_var =
+          variance_low[41 + (idx64 << 4) + (idx32 << 2) + idx16];
+      break;
+    default: break;
+  }
+  return force_skip_low_temp_var;
+}
+
+#define FILTER_SEARCH_SIZE 2
+static void search_filter_ref(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
+                              int mi_row, int mi_col, PRED_BUFFER *tmp,
+                              BLOCK_SIZE bsize, int reuse_inter_pred,
+                              PRED_BUFFER **this_mode_pred, unsigned int *var_y,
+                              unsigned int *sse_y, int *this_early_term,
+                              int use_model_yrd_large, int64_t *sse_block_yrd) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const int bw = block_size_wide[bsize];
+  int pf_rate[FILTER_SEARCH_SIZE] = { 0 };
+  int64_t pf_dist[FILTER_SEARCH_SIZE] = { 0 };
+  unsigned int pf_var[FILTER_SEARCH_SIZE] = { 0 };
+  unsigned int pf_sse[FILTER_SEARCH_SIZE] = { 0 };
+  int64_t pf_sse_block_yrd[FILTER_SEARCH_SIZE] = { 0 };
+  TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE] = { 0 };
+  PRED_BUFFER *current_pred = *this_mode_pred;
+  int skip_txfm[FILTER_SEARCH_SIZE] = { 0 };
+  int best_skip = 0;
+  int best_early_term = 0;
+  int64_t best_cost = INT64_MAX;
+  int best_filter_index = -1;
+  InterpFilter filters[FILTER_SEARCH_SIZE] = { EIGHTTAP_REGULAR,
+                                               EIGHTTAP_SMOOTH };
+  int i;
+  for (i = 0; i < FILTER_SEARCH_SIZE; ++i) {
+    int64_t cost;
+    InterpFilter filter = filters[i];
+    mi->interp_filters = av1_broadcast_interp_filter(filter);
+    av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+    if (use_model_yrd_large)
+      model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &pf_rate[i],
+                                &pf_dist[i], &pf_var[i], &pf_sse[i],
+                                this_early_term, 1);
+    else
+      model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[i], &pf_dist[i],
+                        &skip_txfm[i], NULL, &pf_var[i], &pf_sse[i], 1);
+    pf_rate[i] += av1_get_switchable_rate(x, xd, cm->features.interp_filter);
+    cost = RDCOST(x->rdmult, pf_rate[i], pf_dist[i]);
+    pf_tx_size[i] = mi->tx_size;
+    if (cost < best_cost) {
+      best_filter_index = i;
+      best_cost = cost;
+      best_skip = skip_txfm[i];
+      best_early_term = *this_early_term;
+      if (reuse_inter_pred) {
+        if (*this_mode_pred != current_pred) {
+          free_pred_buffer(*this_mode_pred);
+          *this_mode_pred = current_pred;
+        }
+        current_pred = &tmp[get_pred_buffer(tmp, 3)];
+        pd->dst.buf = current_pred->data;
+        pd->dst.stride = bw;
+      }
+    }
+  }
+  assert(best_filter_index >= 0 && best_filter_index < FILTER_SEARCH_SIZE);
+  if (reuse_inter_pred && *this_mode_pred != current_pred)
+    free_pred_buffer(current_pred);
+
+  mi->interp_filters = av1_broadcast_interp_filter(filters[best_filter_index]);
+  mi->tx_size = pf_tx_size[best_filter_index];
+  this_rdc->rate = pf_rate[best_filter_index];
+  this_rdc->dist = pf_dist[best_filter_index];
+  *var_y = pf_var[best_filter_index];
+ 
*sse_y = pf_sse[best_filter_index]; + *sse_block_yrd = pf_sse_block_yrd[best_filter_index]; + this_rdc->skip = (best_skip || best_early_term); + *this_early_term = best_early_term; + if (reuse_inter_pred) { + pd->dst.buf = (*this_mode_pred)->data; + pd->dst.stride = (*this_mode_pred)->stride; + } else if (best_filter_index < FILTER_SEARCH_SIZE - 1) { + av1_enc_build_inter_predictor_y(xd, mi_row, mi_col); + } +} + +#define COLLECT_PICK_MODE_STAT 0 + +#if COLLECT_PICK_MODE_STAT +typedef struct _mode_search_stat { + int32_t num_blocks[BLOCK_SIZES]; + int64_t avg_block_times[BLOCK_SIZES]; + int32_t num_searches[BLOCK_SIZES][MB_MODE_COUNT]; + int32_t num_nonskipped_searches[BLOCK_SIZES][MB_MODE_COUNT]; + int64_t search_times[BLOCK_SIZES][MB_MODE_COUNT]; + int64_t nonskipped_search_times[BLOCK_SIZES][MB_MODE_COUNT]; + struct aom_usec_timer timer1; + struct aom_usec_timer timer2; +} mode_search_stat; +#endif // COLLECT_PICK_MODE_STAT + +static void compute_intra_yprediction(const AV1_COMMON *cm, + PREDICTION_MODE mode, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd) { + struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[0]; + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + int plane = 0; + int row, col; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); + const int max_blocks_high = max_block_high(xd, plane_bsize, plane); + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (row = 0; row < max_blocks_high; row += (1 << tx_size)) { + // Skip visiting the sub blocks that are wholly within the UMV. 
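+    // Note: row and col are in units of 4x4 blocks and advance one transform
+    // block at a time (1 << tx_size 4x4 units), which is why the source and
+    // destination offsets below are scaled by 4 to get pixel offsets.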
+    for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
+      p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
+      pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
+      av1_predict_intra_block(cm, xd, block_size_wide[bsize],
+                              block_size_high[bsize], tx_size, mode, 0, 0,
+                              FILTER_INTRA_MODES, pd->dst.buf, dst_stride,
+                              pd->dst.buf, dst_stride, 0, 0, plane);
+    }
+  }
+  p->src.buf = src_buf_base;
+  pd->dst.buf = dst_buf_base;
+}
+
+void av1_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  RD_STATS this_rdc, best_rdc;
+  struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+  const TX_SIZE intra_tx_size =
+      AOMMIN(max_txsize_lookup[bsize],
+             tx_mode_to_biggest_tx_size[x->tx_mode_search_type]);
+  int *bmode_costs;
+  const MB_MODE_INFO *above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *left_mi = xd->left_mbmi;
+  const PREDICTION_MODE A = av1_above_block_mode(above_mi);
+  const PREDICTION_MODE L = av1_left_block_mode(left_mi);
+  bmode_costs = x->y_mode_costs[A][L];
+
+  av1_invalid_rd_stats(&best_rdc);
+  av1_invalid_rd_stats(&this_rdc);
+
+  init_mbmi(mi, DC_PRED, INTRA_FRAME, NONE_FRAME, cm);
+  mi->mv[0].as_int = mi->mv[1].as_int = INVALID_MV;
+
+  // Change the limit of this loop to add other intra prediction
+  // mode tests.
+  for (int i = 0; i < 4; ++i) {
+    PREDICTION_MODE this_mode = intra_mode_list[i];
+    this_rdc.dist = this_rdc.rate = 0;
+    args.mode = this_mode;
+    args.skippable = 1;
+    args.rdc = &this_rdc;
+    mi->tx_size = intra_tx_size;
+    av1_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra,
+                                           &args);
+    if (args.skippable) {
+      this_rdc.rate = av1_cost_symbol(av1_get_skip_cdf(xd)[1]);
+    } else {
+      this_rdc.rate += av1_cost_symbol(av1_get_skip_cdf(xd)[0]);
+    }
+    this_rdc.rate += bmode_costs[this_mode];
+    this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+
+    if (this_rdc.rdcost < best_rdc.rdcost) {
+      best_rdc = this_rdc;
+      mi->mode = this_mode;
+    }
+  }
+
+  *rd_cost = best_rdc;
+
+#if CONFIG_INTERNAL_STATS
+  store_coding_context(x, ctx, mi->mode);
+#else
+  store_coding_context(x, ctx);
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
+                                  MACROBLOCK *x, RD_STATS *rd_cost,
+                                  BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                                  int64_t best_rd_so_far) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+
+  BEST_PICKMODE best_pickmode;
+  int inter_mode_mask[BLOCK_SIZES];
+#if COLLECT_PICK_MODE_STAT
+  static mode_search_stat ms_stat;
+#endif
+  MV_REFERENCE_FRAME ref_frame;
+  MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame;
+  int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+  uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
+  struct buf_2d yv12_mb[8][MAX_MB_PLANE];
+  static const int flag_list[8] = { 0, AOM_LAST_FLAG, 0, 0, AOM_GOLD_FLAG, 0,
+                                    0, AOM_ALT_FLAG };
+  RD_STATS this_rdc, best_rdc;
+  // var_y and sse_y are saved to be used later in skip checking.
+  unsigned int sse_y = UINT_MAX;
+  unsigned int var_y = UINT_MAX;
+  const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize];
+  const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
+  InterpFilter filter_ref;
+  int ref_frame_skip_mask = 0;
+  int best_pred_sad = INT_MAX;
+  int best_early_term = 0;
+  unsigned int ref_costs_single[REF_FRAMES],
+      ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  int force_skip_low_temp_var = 0;
+  int skip_ref_find_pred[8] = { 0 };
+  unsigned int sse_zeromv_norm = UINT_MAX;
+  const unsigned int thresh_skip_golden = 500;
+  int gf_temporal_ref = 0;
+  const struct segmentation *const seg = &cm->seg;
+  int num_inter_modes = RT_INTER_MODES;
+  unsigned char segment_id = mi->segment_id;
+  PRED_BUFFER tmp[4];
+  DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 128 * 128]);
+  PRED_BUFFER *this_mode_pred = NULL;
+  const int reuse_inter_pred =
+      cpi->sf.rt_sf.reuse_inter_pred_nonrd && cm->seq_params.bit_depth == 8;
+  const int bh = block_size_high[bsize];
+  const int bw = block_size_wide[bsize];
+  const int pixels_in_block = bh * bw;
+  struct buf_2d orig_dst = pd->dst;
+  const CommonQuantParams *quant_params = &cm->quant_params;
+#if COLLECT_PICK_MODE_STAT
+  aom_usec_timer_start(&ms_stat.timer2);
+#endif
+  int intra_cost_penalty = av1_get_intra_cost_penalty(
+      quant_params->base_qindex, quant_params->y_dc_delta_q,
+      cm->seq_params.bit_depth);
+  int64_t inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
+  const int perform_intra_pred = cpi->sf.rt_sf.check_intra_pred_nonrd;
+  int use_modeled_non_rd_cost = 0;
+  int enable_filter_search = 0;
+  InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
+  int64_t thresh_sad_pred = INT64_MAX;
+
+  (void)best_rd_so_far;
+
+  init_best_pickmode(&best_pickmode);
+
+  for (int i = 0; i < BLOCK_SIZES; ++i) inter_mode_mask[i] = INTER_ALL;
+
+  // TODO(kyslov) Move this to Speed Features
+  inter_mode_mask[BLOCK_128X128] = INTER_NEAREST_NEAR;
+
+  struct scale_factors *const sf_last = get_ref_scale_factors(cm, LAST_FRAME);
+  struct scale_factors *const sf_golden =
+      get_ref_scale_factors(cm, GOLDEN_FRAME);
+  gf_temporal_ref = 1;
+  // For temporal long term prediction, check that the golden reference is at
+  // the same scale as the last reference; otherwise disable it.
+  if ((sf_last->x_scale_fp != sf_golden->x_scale_fp) ||
+      (sf_last->y_scale_fp != sf_golden->y_scale_fp)) {
+    gf_temporal_ref = 0;
+  }
+
+  av1_collect_neighbors_ref_counts(xd);
+
+  estimate_single_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single);
+  if (cpi->sf.rt_sf.use_comp_ref_nonrd)
+    estimate_comp_ref_frame_costs(cm, xd, x, segment_id, ref_costs_comp);
+
+  memset(&mode_checked[0][0], 0, MB_MODE_COUNT * REF_FRAMES);
+  if (reuse_inter_pred) {
+    for (int i = 0; i < 3; i++) {
+      tmp[i].data = &pred_buf[pixels_in_block * i];
+      tmp[i].stride = bw;
+      tmp[i].in_use = 0;
+    }
+    tmp[3].data = pd->dst.buf;
+    tmp[3].stride = pd->dst.stride;
+    tmp[3].in_use = 0;
+  }
+
+  x->force_skip = 0;
+
+  // Instead of using av1_get_pred_context_switchable_interp(xd) to assign
+  // filter_ref, we use a less strict condition on assigning filter_ref.
+  // This is to reduce the probability of entering the flow of not assigning
+  // filter_ref and then skipping the filter search.
+  filter_ref = cm->features.interp_filter;
+
+  // Initialize mode decisions.
+  av1_invalid_rd_stats(&best_rdc);
+  av1_invalid_rd_stats(&this_rdc);
+  av1_invalid_rd_stats(rd_cost);
+  mi->sb_type = bsize;
+  mi->ref_frame[0] = NONE_FRAME;
+  mi->ref_frame[1] = NONE_FRAME;
+
+  usable_ref_frame =
+      cpi->sf.rt_sf.use_nonrd_altref_frame ? ALTREF_FRAME : GOLDEN_FRAME;
+
+  if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
+    skip_ref_find_pred[GOLDEN_FRAME] = 1;
+    if (!cpi->sf.rt_sf.use_nonrd_altref_frame) usable_ref_frame = LAST_FRAME;
+  }
+
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+  if (cpi->sf.rt_sf.short_circuit_low_temp_var &&
+      x->nonrd_prune_ref_frame_search) {
+    if (is_small_sb)
+      force_skip_low_temp_var = get_force_skip_low_temp_var_small_sb(
+          &x->variance_low[0], mi_row, mi_col, bsize);
+    else
+      force_skip_low_temp_var = get_force_skip_low_temp_var(
+          &x->variance_low[0], mi_row, mi_col, bsize);
+    // If force_skip_low_temp_var is set, skip the golden reference.
+    if (force_skip_low_temp_var) {
+      usable_ref_frame = LAST_FRAME;
+    }
+  }
+
+  // If the segment reference frame feature is enabled and it's set to GOLDEN
+  // reference, then make sure we don't skip checking GOLDEN; this is to
+  // prevent the possibility of not picking any mode.
+  if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+      get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+    usable_ref_frame = GOLDEN_FRAME;
+    skip_ref_find_pred[GOLDEN_FRAME] = 0;
+  }
+
+  for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME;
+       ref_frame_iter <= usable_ref_frame; ++ref_frame_iter) {
+    // Skip find_predictors if the reference frame is not in the
+    // ref_frame_flags (i.e., not used as a reference for this frame).
+    skip_ref_find_pred[ref_frame_iter] =
+        !(cpi->ref_frame_flags & flag_list[ref_frame_iter]);
+    if (!skip_ref_find_pred[ref_frame_iter]) {
+      find_predictors(cpi, x, ref_frame_iter, frame_mv, &ref_frame_skip_mask,
+                      flag_list, tile_data, yv12_mb, bsize,
+                      force_skip_low_temp_var);
+    }
+  }
+
+  thresh_sad_pred = ((int64_t)x->pred_mv_sad[LAST_FRAME]) << 1;
+  // Increase the threshold for less aggressive pruning.
+  if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search == 1)
+    thresh_sad_pred += (x->pred_mv_sad[LAST_FRAME] >> 2);
+
+  const int large_block = bsize >= BLOCK_32X32;
+  const int use_model_yrd_large =
+      cpi->oxcf.rc_mode == AOM_CBR && large_block &&
+      !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
+      quant_params->base_qindex && cm->seq_params.bit_depth == 8;
+
+#if COLLECT_PICK_MODE_STAT
+  ms_stat.num_blocks[bsize]++;
+#endif
+  init_mbmi(mi, DC_PRED, NONE_FRAME, NONE_FRAME, cm);
+  mi->tx_size =
+      AOMMIN(AOMMIN(max_txsize_lookup[bsize],
+                    tx_mode_to_biggest_tx_size[x->tx_mode_search_type]),
+             TX_16X16);
+
+  // TODO(marpan): Look into reducing these conditions. For now constrain
+  // it to avoid significant bdrate loss.
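+  // The check below enables the modeled (non-RD) cost only for mid-to-high
+  // QP, sufficiently textured sources, and small blocks, and only when the
+  // superblock content state is neither low-variance-high-sumdiff nor
+  // high-SAD.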
+ if (cpi->sf.rt_sf.use_modeled_non_rd_cost && + quant_params->base_qindex > 120 && x->source_variance > 100 && + bsize <= BLOCK_16X16 && x->content_state_sb != kLowVarHighSumdiff && + x->content_state_sb != kHighSad) + use_modeled_non_rd_cost = 1; + + if (cpi->sf.rt_sf.use_nonrd_filter_search) { + enable_filter_search = 1; + if (cpi->sf.interp_sf.cb_pred_filter_search) { + const int bsl = mi_size_wide_log2[bsize]; + enable_filter_search = + (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_frame.frame_number)) & + 0x1; + } + if (x->source_variance <= + cpi->sf.interp_sf.disable_filter_search_var_thresh) + enable_filter_search = 0; + } + + for (int idx = 0; idx < num_inter_modes; ++idx) { + int rate_mv = 0; + int mode_rd_thresh; + int mode_index; + int64_t this_sse; + int is_skippable; + int this_early_term = 0; + int skip_this_mv = 0; + int comp_pred = 0; + int force_mv_inter_layer = 0; + PREDICTION_MODE this_mode; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + second_ref_frame = NONE_FRAME; + + this_mode = ref_mode_set[idx].pred_mode; + ref_frame = ref_mode_set[idx].ref_frame; + +#if COLLECT_PICK_MODE_STAT + aom_usec_timer_start(&ms_stat.timer1); + ms_stat.num_searches[bsize][this_mode]++; +#endif + mi->mode = this_mode; + mi->ref_frame[0] = ref_frame; + + if (ref_frame > usable_ref_frame) continue; + if (skip_ref_find_pred[ref_frame]) continue; + + // Skip non-zero motion for SVC if skip_nonzeromv_ref is set. + if (cpi->use_svc && frame_mv[this_mode][ref_frame].as_int != 0) { + if (ref_frame == LAST_FRAME && cpi->svc.skip_nonzeromv_last) + continue; + else if (ref_frame == GOLDEN_FRAME && cpi->svc.skip_nonzeromv_gf) + continue; + } + + // If the segment reference frame feature is enabled then do nothing if the + // current ref frame is not allowed. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) + continue; + + if (ref_frame != LAST_FRAME && cpi->oxcf.rc_mode == AOM_CBR && + sse_zeromv_norm < thresh_skip_golden && this_mode == NEWMV) + continue; + + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + + if (!(inter_mode_mask[bsize] & (1 << this_mode))) continue; + + // Skip testing non-LAST if this flag is set. + if (x->nonrd_prune_ref_frame_search) { + if (x->nonrd_prune_ref_frame_search > 1 && ref_frame != LAST_FRAME && + (bsize > BLOCK_64X64 || (bsize > BLOCK_16X16 && this_mode == NEWMV))) + continue; + + if (ref_frame != LAST_FRAME && this_mode == NEARMV) continue; + } + + // Skip non-zeromv mode search for non-LAST frame if force_skip_low_temp_var + // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped + // later. + if (!force_mv_inter_layer && force_skip_low_temp_var && + ref_frame != LAST_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) { + continue; + } + +#if 0 + if (x->content_state_sb != kVeryHighSad && + (cpi->sf.short_circuit_low_temp_var >= 2 || + (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) + && force_skip_low_temp_var && ref_frame == LAST_FRAME && this_mode == + NEWMV) { + continue; + } +#endif + + // Disable this drop out case if the ref frame segment level feature is + // enabled for this segment. This is to prevent the possibility that we + // end up unable to pick any mode. + if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) { + // Check for skipping GOLDEN and ALTREF based pred_mv_sad. 
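+      // A non-LAST reference is pruned here when its predicted-MV SAD
+      // exceeds thresh_sad_pred, which was derived above as roughly twice
+      // the LAST-frame pred_mv_sad.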
+ if (cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0 && + x->pred_mv_sad[ref_frame] != INT_MAX && ref_frame != LAST_FRAME) { + if ((int64_t)(x->pred_mv_sad[ref_frame]) > thresh_sad_pred) + ref_frame_skip_mask |= (1 << ref_frame); + } + if (ref_frame_skip_mask & (1 << ref_frame)) continue; + } + + // Select prediction reference frames. + for (int i = 0; i < MAX_MB_PLANE; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + } + + mi->ref_frame[0] = ref_frame; + mi->ref_frame[1] = second_ref_frame; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; + mode_rd_thresh = best_pickmode.best_mode_skip_txfm + ? rd_threshes[mode_index] << 1 + : rd_threshes[mode_index]; + + // Increase mode_rd_thresh value for non-LAST for improved encoding + // speed + if (ref_frame != LAST_FRAME) { + mode_rd_thresh = mode_rd_thresh << 1; + if (ref_frame == GOLDEN_FRAME && cpi->rc.frames_since_golden > 4) + mode_rd_thresh = mode_rd_thresh << 1; + } + + if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + rd_thresh_freq_fact[mode_index])) + if (frame_mv[this_mode][ref_frame].as_int != 0) continue; + + if (this_mode == NEWMV && !force_mv_inter_layer) { + if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize, + mi_row, mi_col, best_pred_sad, &rate_mv, &best_rdc)) + continue; + } + + for (PREDICTION_MODE inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; + inter_mv_mode++) { + if (inter_mv_mode == this_mode || comp_pred) continue; + if (mode_checked[inter_mv_mode][ref_frame] && + frame_mv[this_mode][ref_frame].as_int == + frame_mv[inter_mv_mode][ref_frame].as_int) { + skip_this_mv = 1; + break; + } + } + + if (skip_this_mv) continue; + + mi->mode = this_mode; + mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; + mi->mv[1].as_int = 0; + if (reuse_inter_pred) { + if (!this_mode_pred) { + this_mode_pred = &tmp[3]; + } else { + this_mode_pred = &tmp[get_pred_buffer(tmp, 3)]; + pd->dst.buf = this_mode_pred->data; + pd->dst.stride = bw; + } + } +#if COLLECT_PICK_MODE_STAT + ms_stat.num_nonskipped_searches[bsize][this_mode]++; +#endif + if (enable_filter_search && + ((mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07)) && + (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search)) { + search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize, + reuse_inter_pred, &this_mode_pred, &var_y, &sse_y, + &this_early_term, use_model_yrd_large, &this_sse); + } else { + mi->interp_filters = + (filter_ref == SWITCHABLE) + ? 
av1_broadcast_interp_filter(default_interp_filter) + : av1_broadcast_interp_filter(filter_ref); + av1_enc_build_inter_predictor_y(xd, mi_row, mi_col); + if (use_model_yrd_large) { + model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, NULL, NULL, + &var_y, &sse_y, &this_early_term, + use_modeled_non_rd_cost); + } else { + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &this_rdc.skip, NULL, &var_y, &sse_y, + use_modeled_non_rd_cost); + } + } + + if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0) { + sse_zeromv_norm = + sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + } + + const int skip_ctx = av1_get_skip_context(xd); + const int skip_cost = x->skip_cost[skip_ctx][1]; + const int no_skip_cost = x->skip_cost[skip_ctx][0]; + if (!this_early_term) { + if (use_modeled_non_rd_cost) { + if (this_rdc.skip) { + this_rdc.rate = skip_cost; + } else { + this_rdc.rate += no_skip_cost; + } + } else { + this_sse = (int64_t)sse_y; + block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &is_skippable, &this_sse, + bsize, mi->tx_size); + if (this_rdc.skip) { + this_rdc.rate = skip_cost; + } else { + if (RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist) >= + RDCOST(x->rdmult, 0, + this_sse)) { // this_sse already multiplied by 16 in + // block_yrd + this_rdc.skip = 1; + this_rdc.rate = skip_cost; + this_rdc.dist = this_sse; + } else { + this_rdc.rate += no_skip_cost; + } + } + } + } else { + this_rdc.skip = 1; + this_rdc.rate = skip_cost; + this_rdc.dist = sse_y << 4; + } + + if (!this_early_term && + (x->color_sensitivity[0] || x->color_sensitivity[1])) { + RD_STATS rdc_uv; + const BLOCK_SIZE uv_bsize = get_plane_block_size( + bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y); + if (x->color_sensitivity[0]) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_U, AOM_PLANE_U); + } + if (x->color_sensitivity[1]) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_V, AOM_PLANE_V); + } + model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2); + this_rdc.rate += rdc_uv.rate; + this_rdc.dist += rdc_uv.dist; + this_rdc.skip = this_rdc.skip && rdc_uv.skip; + } + + // TODO(kyslov) account for UV prediction cost + this_rdc.rate += rate_mv; + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mi->ref_frame); + this_rdc.rate += cost_mv_ref(x, this_mode, mode_ctx); + + this_rdc.rate += ref_costs_single[ref_frame]; + + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + if (cpi->oxcf.rc_mode == AOM_CBR) { + newmv_diff_bias(xd, this_mode, &this_rdc, bsize, + frame_mv[this_mode][ref_frame].as_mv.row, + frame_mv[this_mode][ref_frame].as_mv.col, cpi->speed, + x->source_variance); + } + + mode_checked[this_mode][ref_frame] = 1; +#if COLLECT_PICK_MODE_STAT + aom_usec_timer_mark(&ms_stat.timer1); + ms_stat.nonskipped_search_times[bsize][this_mode] += + aom_usec_timer_elapsed(&ms_stat.timer1); +#endif + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + best_early_term = this_early_term; + best_pickmode.best_mode = this_mode; + best_pickmode.best_pred_filter = mi->interp_filters; + best_pickmode.best_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = ref_frame; + best_pickmode.best_mode_skip_txfm = this_rdc.skip; + best_pickmode.best_second_ref_frame = second_ref_frame; + if (reuse_inter_pred) { + free_pred_buffer(best_pickmode.best_pred); + best_pickmode.best_pred = this_mode_pred; + } + } else { + if 
(reuse_inter_pred) free_pred_buffer(this_mode_pred);
+    }
+    if (best_early_term && idx > 0) {
+      x->force_skip = 1;
+      break;
+    }
+  }
+
+  mi->mode = best_pickmode.best_mode;
+  mi->interp_filters = best_pickmode.best_pred_filter;
+  mi->tx_size = best_pickmode.best_tx_size;
+  memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size));
+  mi->ref_frame[0] = best_pickmode.best_ref_frame;
+  mi->mv[0].as_int =
+      frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int;
+  mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
+  x->force_skip = best_rdc.skip;
+
+  // Perform intra prediction search if the best SAD is above a certain
+  // threshold.
+  mi->angle_delta[PLANE_TYPE_Y] = 0;
+  mi->angle_delta[PLANE_TYPE_UV] = 0;
+  mi->filter_intra_mode_info.use_filter_intra = 0;
+
+  uint32_t spatial_var_thresh = 50;
+  int motion_thresh = 32;
+  // Adjust thresholds so that intra mode is more likely to be tested if the
+  // other references (golden, alt) are skipped/not checked.
+  if (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 &&
+      cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0) {
+    spatial_var_thresh = 150;
+    motion_thresh = 0;
+  }
+  int do_early_exit_rdthresh = 1;
+  // Some adjustments to checking intra mode based on source variance.
+  if (x->source_variance < spatial_var_thresh) {
+    // If the best inter mode has large motion or a non-LAST reference, reduce
+    // the intra cost penalty so that intra mode is more likely to be tested.
+    if (best_pickmode.best_ref_frame != LAST_FRAME ||
+        abs(mi->mv[0].as_mv.row) >= motion_thresh ||
+        abs(mi->mv[0].as_mv.col) >= motion_thresh) {
+      intra_cost_penalty = intra_cost_penalty >> 2;
+      inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
+      do_early_exit_rdthresh = 0;
+    }
+    // For big blocks, intra is worth checking (since only DC will be
+    // checked), even if best_early_term is set.
+    if (bsize >= BLOCK_32X32) best_early_term = 0;
+  }
+
+  if (best_rdc.rdcost == INT64_MAX ||
+      (perform_intra_pred && !best_early_term &&
+       best_rdc.rdcost > inter_mode_thresh &&
+       bsize <= cpi->sf.part_sf.max_intra_bsize)) {
+    int64_t this_sse = INT64_MAX;
+    struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+    PRED_BUFFER *const best_pred = best_pickmode.best_pred;
+    TX_SIZE intra_tx_size =
+        AOMMIN(AOMMIN(max_txsize_lookup[bsize],
+                      tx_mode_to_biggest_tx_size[x->tx_mode_search_type]),
+               TX_16X16);
+
+    if (reuse_inter_pred && best_pred != NULL) {
+      if (best_pred->data == orig_dst.buf) {
+        this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+        aom_convolve_copy(best_pred->data, best_pred->stride,
+                          this_mode_pred->data, this_mode_pred->stride, 0, 0, 0,
+                          0, bw, bh);
+        best_pickmode.best_pred = this_mode_pred;
+      }
+    }
+    pd->dst = orig_dst;
+
+    for (int i = 0; i < 4; ++i) {
+      const PREDICTION_MODE this_mode = intra_mode_list[i];
+      const THR_MODES mode_index =
+          mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+      const int mode_rd_thresh = rd_threshes[mode_index];
+
+      // Only check DC for blocks >= 32X32.
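+      // DC_PRED is mode 0 in the PREDICTION_MODE enum, so any nonzero mode
+      // here is a non-DC intra mode and is skipped for the large blocks.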
+ if (this_mode > 0 && bsize >= BLOCK_32X32) continue; + + if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, + rd_thresh_freq_fact[mode_index]) && + (do_early_exit_rdthresh || this_mode == SMOOTH_PRED)) { + continue; + } + const BLOCK_SIZE uv_bsize = get_plane_block_size( + bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y); + + mi->mode = this_mode; + mi->ref_frame[0] = INTRA_FRAME; + mi->ref_frame[1] = NONE_FRAME; + + this_rdc.dist = this_rdc.rate = 0; + args.mode = this_mode; + args.skippable = 1; + args.rdc = &this_rdc; + mi->tx_size = intra_tx_size; + compute_intra_yprediction(cm, this_mode, bsize, x, xd); + // Look into selecting tx_size here, based on prediction residual. + if (use_modeled_non_rd_cost) + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &this_rdc.skip, NULL, &var_y, &sse_y, 1); + else + block_yrd(cpi, x, mi_row, mi_col, &this_rdc, &args.skippable, &this_sse, + bsize, mi->tx_size); + // TODO(kyslov@) Need to account for skippable + if (x->color_sensitivity[0]) { + av1_foreach_transformed_block_in_plane(xd, uv_bsize, 1, + estimate_block_intra, &args); + } + if (x->color_sensitivity[1]) { + av1_foreach_transformed_block_in_plane(xd, uv_bsize, 2, + estimate_block_intra, &args); + } + + int mode_cost = 0; + if (av1_is_directional_mode(this_mode) && av1_use_angle_delta(bsize)) { + mode_cost += x->angle_delta_cost[this_mode - V_PRED] + [MAX_ANGLE_DELTA + + mi->angle_delta[PLANE_TYPE_Y]]; + } + if (this_mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) { + mode_cost += x->filter_intra_cost[bsize][0]; + } + this_rdc.rate += ref_costs_single[INTRA_FRAME]; + this_rdc.rate += intra_cost_penalty; + this_rdc.rate += mode_cost; + this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist); + + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + best_pickmode.best_mode = this_mode; + best_pickmode.best_intra_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = INTRA_FRAME; + best_pickmode.best_second_ref_frame = NONE_FRAME; + mi->uv_mode = this_mode; + mi->mv[0].as_int = INVALID_MV; + mi->mv[1].as_int = INVALID_MV; + } + } + + // Reset mb_mode_info to the best inter mode. + if (best_pickmode.best_ref_frame != INTRA_FRAME) { + mi->tx_size = best_pickmode.best_tx_size; + } else { + mi->tx_size = best_pickmode.best_intra_tx_size; + } + } + + pd->dst = orig_dst; + mi->mode = best_pickmode.best_mode; + mi->ref_frame[0] = best_pickmode.best_ref_frame; + mi->ref_frame[1] = best_pickmode.best_second_ref_frame; + + if (!is_inter_block(mi)) { + mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS); + } + + if (reuse_inter_pred && best_pickmode.best_pred != NULL) { + PRED_BUFFER *const best_pred = best_pickmode.best_pred; + if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { + aom_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, + pd->dst.stride, 0, 0, 0, 0, bw, bh); + } + } + if (cpi->sf.inter_sf.adaptive_rd_thresh) { + THR_MODES best_mode_idx = + mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)]; + if (best_pickmode.best_ref_frame == INTRA_FRAME) { + // Only consider the modes that are included in the intra_mode_list. 
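+      // The count is derived from the array length so the update loop stays
+      // in sync with intra_mode_list if the list is ever extended.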
+      int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE);
+      for (int i = 0; i < intra_modes; i++) {
+        update_thresh_freq_fact(cpi, x, bsize, INTRA_FRAME, best_mode_idx,
+                                intra_mode_list[i]);
+      }
+    } else {
+      for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
+        PREDICTION_MODE this_mode;
+        if (best_pickmode.best_ref_frame != ref_frame) continue;
+        for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+          update_thresh_freq_fact(cpi, x, bsize, ref_frame, best_mode_idx,
+                                  this_mode);
+        }
+      }
+    }
+  }
+
+#if CONFIG_INTERNAL_STATS
+  store_coding_context(x, ctx, mi->mode);
+#else
+  store_coding_context(x, ctx);
+#endif  // CONFIG_INTERNAL_STATS
+#if COLLECT_PICK_MODE_STAT
+  aom_usec_timer_mark(&ms_stat.timer2);
+  ms_stat.avg_block_times[bsize] += aom_usec_timer_elapsed(&ms_stat.timer2);
+  //
+  if ((mi_row + mi_size_high[bsize] >= (cpi->common.mi_params.mi_rows)) &&
+      (mi_col + mi_size_wide[bsize] >= (cpi->common.mi_params.mi_cols))) {
+    int i, j;
+    PREDICTION_MODE used_modes[3] = { NEARESTMV, NEARMV, NEWMV };
+    BLOCK_SIZE bss[5] = { BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
+                          BLOCK_128X128 };
+    int64_t total_time = 0l;
+    int32_t total_blocks = 0;
+
+    printf("\n");
+    for (i = 0; i < 5; i++) {
+      printf("BS(%d) Num %d, Avg_time %f: ", bss[i], ms_stat.num_blocks[bss[i]],
+             ms_stat.num_blocks[bss[i]] > 0
+                 ? (float)ms_stat.avg_block_times[bss[i]] /
+                       ms_stat.num_blocks[bss[i]]
+                 : 0);
+      total_time += ms_stat.avg_block_times[bss[i]];
+      total_blocks += ms_stat.num_blocks[bss[i]];
+      for (j = 0; j < 3; j++) {
+        printf("Mode %d, %d/%d tps %f ", used_modes[j],
+               ms_stat.num_nonskipped_searches[bss[i]][used_modes[j]],
+               ms_stat.num_searches[bss[i]][used_modes[j]],
+               ms_stat.num_nonskipped_searches[bss[i]][used_modes[j]] > 0
+                   ? (float)ms_stat
+                             .nonskipped_search_times[bss[i]][used_modes[j]] /
+                         ms_stat.num_nonskipped_searches[bss[i]][used_modes[j]]
+                   : 0l);
+      }
+      printf("\n");
+    }
+    printf("Total time = %ld. Total blocks = %d\n", total_time, total_blocks);
+  }
+  //
+#endif  // COLLECT_PICK_MODE_STAT
+  *rd_cost = best_rdc;
+}
diff --git a/libs/libaom/src/av1/encoder/palette.c b/libs/libaom/src/av1/encoder/palette.c
new file mode 100644
index 000000000..e61cd02ce
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/palette.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#define AV1_K_MEANS_DIM 1
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+#define AV1_K_MEANS_DIM 2
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+
+static int int_comparer(const void *a, const void *b) {
+  return (*(int *)a - *(int *)b);
+}
+
+int av1_remove_duplicates(int *centroids, int num_centroids) {
+  int num_unique;  // number of unique centroids
+  int i;
+  qsort(centroids, num_centroids, sizeof(*centroids), int_comparer);
+  // Remove duplicates.
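+  // A single pass over the sorted array suffices: keep centroids[0]
+  // implicitly and copy forward each value that differs from its predecessor.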
+ num_unique = 1; + for (i = 1; i < num_centroids; ++i) { + if (centroids[i] != centroids[i - 1]) { // found a new unique centroid + centroids[num_unique++] = centroids[i]; + } + } + return num_unique; +} + +static int delta_encode_cost(const int *colors, int num, int bit_depth, + int min_val) { + if (num <= 0) return 0; + int bits_cost = bit_depth; + if (num == 1) return bits_cost; + bits_cost += 2; + int max_delta = 0; + int deltas[PALETTE_MAX_SIZE]; + const int min_bits = bit_depth - 3; + for (int i = 1; i < num; ++i) { + const int delta = colors[i] - colors[i - 1]; + deltas[i - 1] = delta; + assert(delta >= min_val); + if (delta > max_delta) max_delta = delta; + } + int bits_per_delta = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits); + assert(bits_per_delta <= bit_depth); + int range = (1 << bit_depth) - colors[0] - min_val; + for (int i = 0; i < num - 1; ++i) { + bits_cost += bits_per_delta; + range -= deltas[i]; + bits_per_delta = AOMMIN(bits_per_delta, av1_ceil_log2(range)); + } + return bits_cost; +} + +int av1_index_color_cache(const uint16_t *color_cache, int n_cache, + const uint16_t *colors, int n_colors, + uint8_t *cache_color_found, int *out_cache_colors) { + if (n_cache <= 0) { + for (int i = 0; i < n_colors; ++i) out_cache_colors[i] = colors[i]; + return n_colors; + } + memset(cache_color_found, 0, n_cache * sizeof(*cache_color_found)); + int n_in_cache = 0; + int in_cache_flags[PALETTE_MAX_SIZE]; + memset(in_cache_flags, 0, sizeof(in_cache_flags)); + for (int i = 0; i < n_cache && n_in_cache < n_colors; ++i) { + for (int j = 0; j < n_colors; ++j) { + if (colors[j] == color_cache[i]) { + in_cache_flags[j] = 1; + cache_color_found[i] = 1; + ++n_in_cache; + break; + } + } + } + int j = 0; + for (int i = 0; i < n_colors; ++i) + if (!in_cache_flags[i]) out_cache_colors[j++] = colors[i]; + assert(j == n_colors - n_in_cache); + return j; +} + +int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi, + int bit_depth, int *zero_count, + int *min_bits) { + const int n = pmi->palette_size[1]; + const int max_val = 1 << bit_depth; + int max_d = 0; + *min_bits = bit_depth - 4; + *zero_count = 0; + for (int i = 1; i < n; ++i) { + const int delta = pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] - + pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1]; + const int v = abs(delta); + const int d = AOMMIN(v, max_val - v); + if (d > max_d) max_d = d; + if (d == 0) ++(*zero_count); + } + return AOMMAX(av1_ceil_log2(max_d + 1), *min_bits); +} + +int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi, + uint16_t *color_cache, int n_cache, + int bit_depth) { + const int n = pmi->palette_size[0]; + int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = + av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n, + cache_color_found, out_cache_colors); + const int total_bits = + n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 1); + return av1_cost_literal(total_bits); +} + +int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi, + uint16_t *color_cache, int n_cache, + int bit_depth) { + const int n = pmi->palette_size[1]; + int total_bits = 0; + // U channel palette color cost. 
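+  // The U palette colors start at offset PALETTE_MAX_SIZE in palette_colors;
+  // they are matched against the cache and delta-coded the same way as the
+  // luma palette, but with a minimum delta of 0.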
+ int out_cache_colors[PALETTE_MAX_SIZE]; + uint8_t cache_color_found[2 * PALETTE_MAX_SIZE]; + const int n_out_cache = av1_index_color_cache( + color_cache, n_cache, pmi->palette_colors + PALETTE_MAX_SIZE, n, + cache_color_found, out_cache_colors); + total_bits += + n_cache + delta_encode_cost(out_cache_colors, n_out_cache, bit_depth, 0); + + // V channel palette color cost. + int zero_count = 0, min_bits_v = 0; + const int bits_v = + av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v); + const int bits_using_delta = + 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count; + const int bits_using_raw = bit_depth * n; + total_bits += 1 + AOMMIN(bits_using_delta, bits_using_raw); + return av1_cost_literal(total_bits); +} diff --git a/libs/libaom/src/av1/encoder/palette.h b/libs/libaom/src/av1/encoder/palette.h new file mode 100644 index 000000000..8b88c4755 --- /dev/null +++ b/libs/libaom/src/av1/encoder/palette.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PALETTE_H_ +#define AOM_AV1_ENCODER_PALETTE_H_ + +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim + +void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const int *data, + const int *centroids, + uint8_t *indices, int n, int k); +void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const int *data, + const int *centroids, + uint8_t *indices, int n, int k); +void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int *data, int *centroids, + uint8_t *indices, int n, int k, + int max_itr); +void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int *data, int *centroids, + uint8_t *indices, int n, int k, + int max_itr); + +// Given 'n' 'data' points and 'k' 'centroids' each of dimension 'dim', +// calculate the centroid 'indices' for the data points. +static INLINE void av1_calc_indices(const int *data, const int *centroids, + uint8_t *indices, int n, int k, int dim) { + if (dim == 1) { + AV1_K_MEANS_RENAME(av1_calc_indices, 1)(data, centroids, indices, n, k); + } else if (dim == 2) { + AV1_K_MEANS_RENAME(av1_calc_indices, 2)(data, centroids, indices, n, k); + } else { + assert(0 && "Untemplated k means dimension"); + } +} + +// Given 'n' 'data' points and an initial guess of 'k' 'centroids' each of +// dimension 'dim', runs up to 'max_itr' iterations of k-means algorithm to get +// updated 'centroids' and the centroid 'indices' for elements in 'data'. +// Note: the output centroids are rounded off to nearest integers. 
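+// Usage sketch (hypothetical values): for dim == 2, 'data' holds n
+// interleaved two-component points (e.g. n (U, V) pairs for a chroma
+// palette) and 'centroids' holds k such pairs; on return, indices[i] is the
+// id of the centroid assigned to data point i.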
+static INLINE void av1_k_means(const int *data, int *centroids,
+                               uint8_t *indices, int n, int k, int dim,
+                               int max_itr) {
+  if (dim == 1) {
+    AV1_K_MEANS_RENAME(av1_k_means, 1)(data, centroids, indices, n, k,
+                                       max_itr);
+  } else if (dim == 2) {
+    AV1_K_MEANS_RENAME(av1_k_means, 2)(data, centroids, indices, n, k,
+                                       max_itr);
+  } else {
+    assert(0 && "Untemplated k means dimension");
+  }
+}
+
+// Given a list of centroids, returns the number of unique centroids 'k', and
+// puts these unique centroids in the first 'k' indices of the 'centroids'
+// array. Ideally, the centroids should be rounded to integers before calling
+// this method.
+int av1_remove_duplicates(int *centroids, int num_centroids);
+
+// Given a color cache and a set of base colors, find if each cache color is
+// present in the base colors, record the binary results in
+// "cache_color_found". Record the colors that are not in the color cache in
+// "out_cache_colors".
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+                          const uint16_t *colors, int n_colors,
+                          uint8_t *cache_color_found, int *out_cache_colors);
+
+// Return the number of bits used to transmit each v palette color delta;
+// set "zero_count" to the number of zero deltas.
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+                                 int bit_depth, int *zero_count,
+                                 int *min_bits);
+
+// Return the rate cost for transmitting luma palette color values.
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+                             uint16_t *color_cache, int n_cache,
+                             int bit_depth);
+
+// Return the rate cost for transmitting chroma palette color values.
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+                              uint16_t *color_cache, int n_cache,
+                              int bit_depth);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_PALETTE_H_
diff --git a/libs/libaom/src/av1/encoder/partition_cnn_weights.h b/libs/libaom/src/av1/encoder/partition_cnn_weights.h
new file mode 100644
index 000000000..504038c63
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/partition_cnn_weights.h
@@ -0,0 +1,2139 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ +#define AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/cnn.h" +#include "av1/encoder/ml.h" + +#define CNN_BRANCH_0_OUT_CH 20 +#define CNN_BRANCH_1_OUT_CH 4 +#define CNN_BRANCH_2_OUT_CH 20 +#define CNN_BRANCH_3_OUT_CH 20 +#define CNN_TOT_OUT_CH \ + (((CNN_BRANCH_0_OUT_CH) + (CNN_BRANCH_1_OUT_CH) + (CNN_BRANCH_2_OUT_CH) + \ + (CNN_BRANCH_3_OUT_CH))) +#define CNN_BRANCH_0_OUT_SIZE (CNN_BRANCH_0_OUT_CH) +#define CNN_BRANCH_1_OUT_SIZE ((CNN_BRANCH_1_OUT_CH)*2 * 2) +#define CNN_BRANCH_2_OUT_SIZE ((CNN_BRANCH_2_OUT_CH)*4 * 4) +#define CNN_BRANCH_3_OUT_SIZE ((CNN_BRANCH_3_OUT_CH)*8 * 8) +#define CNN_OUT_BUF_SIZE \ + (((CNN_BRANCH_0_OUT_SIZE) + (CNN_BRANCH_1_OUT_SIZE) + \ + (CNN_BRANCH_2_OUT_SIZE) + (CNN_BRANCH_3_OUT_SIZE))) + +#define NUM_DNN_BRANCHES 4 +#define NUM_CNN_LAYERS 5 +#define BRANCH_0_NUM_DNN_LAYERS 2 +#define BRANCH_1_NUM_DNN_LAYERS 2 +#define BRANCH_2_NUM_DNN_LAYERS 2 +#define BRANCH_3_NUM_DNN_LAYERS 2 +#define CNN_LAYER_0_HEIGHT 5 +#define CNN_LAYER_0_WIDTH 5 +#define CNN_LAYER_0_IN_CH 1 +#define CNN_LAYER_0_OUT_CH 20 +#define CNN_LAYER_0_HORZ_STRIDE 4 +#define CNN_LAYER_0_VERT_STRIDE 4 +#define CNN_LAYER_1_HEIGHT 2 +#define CNN_LAYER_1_WIDTH 2 +#define CNN_LAYER_1_IN_CH 20 +#define CNN_LAYER_1_OUT_CH 20 +#define CNN_LAYER_1_HORZ_STRIDE 2 +#define CNN_LAYER_1_VERT_STRIDE 2 +#define CNN_LAYER_2_HEIGHT 2 +#define CNN_LAYER_2_WIDTH 2 +#define CNN_LAYER_2_IN_CH 20 +#define CNN_LAYER_2_OUT_CH 20 +#define CNN_LAYER_2_HORZ_STRIDE 2 +#define CNN_LAYER_2_VERT_STRIDE 2 +#define CNN_LAYER_3_HEIGHT 2 +#define CNN_LAYER_3_WIDTH 2 +#define CNN_LAYER_3_IN_CH 20 +#define CNN_LAYER_3_OUT_CH 4 +#define CNN_LAYER_3_HORZ_STRIDE 2 +#define CNN_LAYER_3_VERT_STRIDE 2 +#define CNN_LAYER_4_HEIGHT 2 +#define CNN_LAYER_4_WIDTH 2 +#define CNN_LAYER_4_IN_CH 4 +#define CNN_LAYER_4_OUT_CH 20 +#define CNN_LAYER_4_HORZ_STRIDE 2 +#define CNN_LAYER_4_VERT_STRIDE 2 +#define BRANCH_0_NUM_DNN_FEATURES 37 +#define BRANCH_0_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_0_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_0_NUM_LOGITS 1 +#define BRANCH_1_NUM_DNN_FEATURES 25 +#define BRANCH_1_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_1_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_1_NUM_LOGITS 1 +#define BRANCH_2_NUM_DNN_FEATURES 25 +#define BRANCH_2_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_2_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_2_NUM_LOGITS 1 +#define BRANCH_3_NUM_DNN_FEATURES 41 +#define BRANCH_3_NUM_DNN_LAYER_0_UNITS 16 +#define BRANCH_3_NUM_DNN_LAYER_1_UNITS 24 +#define BRANCH_3_NUM_LOGITS 1 + +static const float av1_intra_mode_cnn_partition_cnn_layer_0_kernel[] = { + 0.131894f, -0.593536f, -0.212935f, -0.00220011f, -0.396949f, + 0.287753f, -0.91875f, -0.0095057f, 0.804197f, -0.395239f, + 0.516604f, 1.16439f, 0.445784f, -0.163349f, 0.746488f, + -0.33891f, -0.562652f, 0.481403f, 0.755378f, -0.200753f, + 0.0784307f, 0.105657f, 0.0205673f, -0.524089f, -0.476146f, + -0.161206f, -0.65079f, 0.137474f, 0.28584f, 0.508768f, + -0.643386f, 0.227068f, -0.899507f, -0.413382f, 0.631466f, + 0.398203f, -0.544392f, 0.825155f, 0.671847f, -0.249779f, + 0.323121f, 0.125357f, -0.719564f, -0.0714854f, -0.168472f, + -0.213246f, -0.674525f, 0.330148f, -0.138414f, 0.20462f, + -0.518571f, -0.15091f, -0.605116f, -0.448732f, -0.475599f, + 0.738f, -0.328526f, 0.755035f, 0.969414f, -0.321039f, + -0.23068f, 0.408567f, -0.377813f, -0.273974f, 1.0684f, + 0.373968f, -0.450305f, 0.439258f, -0.381846f, -0.267331f, + 0.30613f, -0.39369f, 0.622438f, 
-0.52877f, -0.334991f, + 0.263193f, -0.402121f, 0.64142f, 0.793048f, -0.0231174f, + -0.68474f, -0.293338f, -0.737511f, -0.462654f, 0.474629f, + 0.141397f, -0.152529f, 0.345879f, -0.499991f, 0.00174024f, + 0.337387f, -0.131151f, 0.427385f, -0.457449f, -0.879614f, + -0.425908f, -0.263172f, 0.0344974f, 1.07861f, -0.00416662f, + 0.0208952f, 0.233905f, 0.765965f, 0.0423685f, -0.117554f, + -0.248237f, 0.49848f, -0.845131f, 0.223648f, -0.838709f, + 0.5834f, 0.309956f, -0.0625093f, -0.619619f, 0.918957f, + 0.358271f, -0.668459f, 0.518783f, -0.418963f, -0.206788f, + 0.364983f, -0.0396087f, 0.624309f, -0.138679f, -0.142453f, + 0.28309f, 0.895092f, -0.215713f, 0.439025f, 0.659333f, + -0.366025f, -0.413518f, 0.66657f, -0.265919f, 0.473471f, + -1.0729f, -0.526702f, 0.2838f, 0.367648f, -0.61242f, + 0.121656f, 0.547727f, -0.0636793f, -0.33006f, -0.306604f, + -0.00897731f, 0.688242f, 0.0944626f, 0.321508f, 0.0437392f, + -0.560035f, -0.768334f, 0.0571051f, -0.0427601f, -0.0437806f, + -0.816209f, -0.395829f, 0.293733f, 0.217645f, -0.646428f, + 0.132448f, -0.435806f, -0.0556814f, 0.0218857f, 0.348525f, + -0.17296f, 0.669057f, 0.638604f, -0.0995596f, -0.024099f, + -0.262332f, -0.548975f, 0.357894f, 0.43873f, -0.688234f, + -0.425519f, 0.190986f, -0.074778f, 0.294232f, -0.548969f, + -0.731198f, 0.03616f, -0.475969f, -0.306075f, -0.111929f, + -0.234146f, 0.612669f, 0.882254f, -0.622893f, 0.262431f, + 0.465242f, 0.245384f, -0.811016f, 0.501798f, -0.925875f, + 0.264373f, 0.307766f, -0.26872f, 0.113027f, -0.158875f, + 0.0711483f, 0.220275f, -0.0699022f, -0.0111303f, -0.435384f, + -0.720014f, 0.593484f, -0.964082f, 0.750925f, 0.252433f, + 0.964332f, -0.256904f, -0.421715f, -0.403851f, -0.188081f, + 0.694014f, -1.00183f, 0.798921f, 0.0603123f, 0.213814f, + 0.739642f, -0.0203375f, 0.72569f, -0.260224f, 0.0199516f, + -0.322451f, 0.318204f, -0.38392f, 0.740994f, -0.265215f, + -0.54541f, -0.51479f, -0.458397f, 0.519564f, 0.0509182f, + 0.0363331f, -0.293051f, 0.317714f, -0.327488f, -0.0840401f, + 0.318437f, -0.619403f, 0.641094f, -0.288435f, -0.260185f, + 0.181083f, -0.169294f, 0.292645f, 0.140405f, 0.0572885f, + -0.637428f, -0.102616f, 0.288955f, 0.817314f, 0.116855f, + 0.635532f, 0.283334f, -0.236391f, -0.305035f, -0.217365f, + -0.033021f, -0.455858f, 0.439922f, -0.104039f, 0.373376f, + 0.310659f, 0.388789f, 0.266341f, 0.0746306f, -0.428192f, + -0.202695f, -0.347625f, 0.00585741f, 0.366203f, 0.221413f, + 0.518856f, 0.57245f, -0.375071f, -0.2436f, -0.511895f, + -1.03708f, 0.681455f, -0.111544f, -0.183563f, 0.109729f, + -0.422646f, -0.529777f, 0.747473f, -0.270223f, -0.11435f, + 0.378931f, 0.420456f, 0.236331f, 0.49261f, -0.0666801f, + 0.0475846f, 0.906095f, -0.4146f, -0.020588f, -0.653285f, + 0.135335f, 0.543846f, -0.309061f, 0.11899f, -0.639168f, + -0.719994f, -0.219706f, -0.645631f, -0.829049f, -0.0114746f, + 0.834604f, 0.0378035f, 0.107957f, 0.546929f, -0.674395f, + -0.854817f, -1.1443f, 0.223413f, -0.326324f, 0.440971f, + 0.383582f, -0.495084f, 0.280091f, -0.53116f, 0.0333923f, + -0.354339f, -0.0449156f, -0.538896f, -0.753355f, 0.463995f, + 0.000969967f, -0.2832f, 0.587276f, 0.853094f, -0.481985f, + -0.138202f, 0.180989f, -0.349044f, -0.417534f, 0.455591f, + 0.287332f, 0.251496f, 0.381416f, 0.339632f, -0.0825727f, + 0.352739f, 0.161697f, -0.319764f, -0.258015f, 0.668833f, + -0.553303f, -0.578815f, -0.3758f, 0.289f, 0.247368f, + 0.00681103f, 0.421092f, -0.191033f, -0.425868f, -0.1239f, + 0.0540422f, -0.0856856f, 0.481168f, -0.0283741f, -0.196018f, + 0.230923f, -0.145288f, 0.52188f, 0.00628462f, -0.604556f, + 
-0.562879f, 0.319282f, 0.323799f, 0.453941f, 0.271129f, + -0.0520196f, 0.684571f, -0.391779f, -0.404614f, 0.134097f, + -0.825482f, 0.0913949f, 0.483543f, 0.159084f, 0.301637f, + 0.427013f, 0.196153f, 0.460091f, -0.730573f, -0.12278f, + 0.221665f, 0.674622f, -0.623363f, -0.0761517f, 0.637979f, + -0.468498f, 0.527276f, -0.596894f, -0.34675f, -0.251241f, + 0.418533f, -0.476696f, -0.901267f, -0.0088241f, -0.12421f, + -0.660316f, -0.0222117f, -0.470898f, -1.10739f, -0.441645f, + 0.39516f, -0.0117906f, 0.254122f, 0.00722599f, -1.00697f, + 0.48908f, -0.122287f, -0.378608f, -0.339145f, 0.682463f, + 0.305606f, 0.453628f, -0.49923f, -0.791388f, -0.202515f, + 0.23214f, -0.434209f, -0.778283f, -0.538015f, 0.145769f, + 0.446281f, -0.339329f, -0.198478f, -0.183717f, -0.855441f, + -0.105778f, 0.575067f, -0.18592f, -0.348094f, 0.740614f, + 0.041549f, -0.109663f, 0.0434492f, 0.245242f, -1.22192f, + 0.685896f, -0.208115f, -0.0616216f, -1.00552f, 0.31045f, + -0.184394f, 0.466705f, -0.0984364f, -0.506252f, 0.144874f, + 0.357038f, 0.675221f, -0.822171f, -0.52729f, 0.991212f, + 0.432422f, 0.383493f, -0.372395f, 0.35651f, -0.25369f, + 0.660208f, -0.117745f, -0.142433f, -0.724115f, -1.0035f, + -0.59178f, 0.563444f, -0.282531f, -0.599989f, 0.507424f, + -0.782875f, 0.755029f, -0.754962f, -0.617825f, 0.565984f, + -0.826878f, -0.456563f, 0.0212161f, 0.469867f, -0.144864f, + 0.225748f, -0.279029f, 0.21052f, -0.440183f, 0.936069f, + 0.170595f, 0.40966f, 0.452453f, -0.576006f, 1.50696f, + 0.649049f, 0.094957f, -0.167706f, -0.258342f, 0.59269f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_0_bias[] = { + 0.00475215f, -0.00362332f, -0.00317542f, 0.190083f, 0.0488147f, + -0.0268093f, -0.00432231f, 0.0112229f, 0.0626653f, -0.0025698f, + 0.0018675f, -0.00368139f, -0.00159125f, -0.00034354f, 0.311437f, + 0.000136436f, 0.0667295f, 0.0251274f, 0.00226553f, -0.000638344f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_1_kernel[] = { + 0.228403f, 0.241933f, 0.181079f, 0.101728f, 0.278455f, + -0.222078f, 0.387578f, 0.0847356f, -0.0737012f, 0.26518f, + -1.0817f, 0.0404161f, -0.805199f, 0.336576f, -0.541494f, + 0.246264f, 0.116597f, -0.756804f, -0.914136f, 0.410265f, + 0.413294f, 0.07873f, 0.450017f, -0.264346f, 0.549095f, + 1.03755f, -0.203542f, 1.61018f, 0.374131f, 0.402515f, + -2.36115f, 0.116427f, -0.172157f, -0.231482f, -0.905736f, + -0.0183059f, -0.575746f, 0.110348f, -0.268018f, 0.140399f, + 0.427196f, 0.0718528f, 0.247936f, -0.326661f, 0.150404f, + -0.659979f, -0.157148f, 0.00826241f, -0.679275f, -0.131564f, + -1.04822f, 1.06039f, -0.207898f, 0.510167f, 0.484233f, + 0.138972f, -0.0801639f, -0.184416f, 0.0741107f, -0.0299281f, + 0.112263f, 0.380071f, -0.0185269f, -0.0821188f, 0.918796f, + -0.576106f, 0.593007f, 0.479446f, 0.0440703f, 0.322379f, + 0.176783f, -0.147111f, 0.0953247f, -0.636377f, 0.0702104f, + 0.130979f, 0.293892f, -0.0112124f, -0.040347f, -0.16034f, + 0.3252f, -0.586802f, 0.601786f, -0.487148f, -0.458777f, + 0.463835f, 0.144942f, 0.00339965f, -0.779966f, 0.0585298f, + -1.20758f, -0.275614f, 0.292346f, -0.132781f, 0.337892f, + -0.357677f, 1.48511f, 0.172907f, -0.148668f, 0.243184f, + -0.503392f, -0.0791543f, 0.0265389f, -0.102267f, 0.213294f, + 0.0657801f, 0.156996f, 0.0891168f, 0.120805f, 0.261285f, + -0.343025f, -0.0792235f, -0.106415f, 0.133878f, -0.112981f, + -0.00151126f, -0.0643829f, 0.0458938f, -0.0452731f, -0.00147422f, + 0.1871f, -0.0208793f, 0.0752037f, 0.0794674f, 0.167666f, + 0.198028f, -0.361015f, -0.0661721f, -0.10672f, -0.0773641f, + -1.15856f, -0.516443f, 
-0.322702f, 0.15668f, 0.0075841f, + -0.157731f, 0.270926f, -0.241551f, 0.0169097f, -0.0263953f, + -0.303556f, -0.239237f, 0.117792f, -0.137871f, 0.122054f, + -0.587381f, 0.112938f, 0.0867262f, -0.27909f, -0.203622f, + -0.622195f, 0.42623f, 0.670704f, 0.190826f, -0.304979f, + -0.570075f, -0.240699f, 0.43744f, 0.632896f, -0.563846f, + -0.0160434f, -0.0709745f, 0.816662f, 0.269999f, -0.358734f, + 0.193644f, 1.19339f, -0.118223f, -0.363291f, -0.723616f, + -1.58825f, 0.0222856f, 0.769852f, 0.322713f, 0.0857619f, + -0.669756f, -1.08414f, 1.18593f, 0.486166f, -0.520646f, + 0.0861854f, -0.134197f, 0.258337f, 0.223345f, 0.697639f, + -0.57261f, 0.54031f, 0.892644f, 0.497572f, -0.287076f, + -1.95928f, -0.0568128f, -0.253335f, 0.00233392f, -0.192787f, + -0.115203f, -0.0975649f, 0.277954f, 0.000704534f, -0.315884f, + 0.309583f, 0.357458f, 0.0939298f, -0.072701f, 0.433045f, + -0.536938f, 0.534523f, 0.184585f, -0.0415175f, -0.120909f, + -1.2622f, 0.412449f, -0.114741f, 0.290453f, -0.441671f, + -0.0242497f, -0.20746f, 0.139019f, -0.422668f, -0.146732f, + -0.688828f, -0.00339426f, 0.04166f, 0.41755f, 0.405675f, + 0.562564f, 0.0216812f, 0.0271391f, 0.215227f, 0.328183f, + -1.6442f, -0.827838f, 0.115491f, 0.0951442f, -0.133779f, + -0.0482928f, 0.203177f, 0.322953f, -0.513259f, 0.0676788f, + -0.0877928f, 0.224448f, 0.451957f, 0.314243f, 0.307403f, + 0.35653f, 0.0286278f, 2.27554f, 0.569313f, -0.0488753f, + -2.48809f, 0.274555f, -0.248375f, -0.635634f, -0.187663f, + 0.1827f, -0.409634f, -0.0280568f, -0.207119f, -0.208192f, + -0.410268f, -0.017669f, 0.134856f, 0.434551f, 0.165201f, + 0.584608f, -0.389997f, -0.088713f, 0.118087f, 0.00210905f, + -1.07698f, -0.520967f, -0.198742f, 0.190255f, -0.162639f, + 0.0122759f, 0.460774f, -0.684633f, -0.149512f, 0.167556f, + -0.295034f, -0.0650964f, 0.0868653f, -0.691352f, 0.089795f, + 0.0620608f, 0.0531289f, 0.0124286f, 0.151921f, 1.51067f, + -0.10586f, -0.0311871f, 0.114706f, 0.0565205f, -0.159634f, + -0.423987f, -0.226896f, 0.0605352f, -0.36324f, -0.142205f, + -0.252249f, 0.0666312f, 0.316655f, 0.00687196f, 0.131079f, + -0.128281f, -0.293468f, 1.3327f, 0.542277f, -0.060088f, + -1.73475f, 0.0542297f, -0.227522f, -0.376004f, -0.147028f, + 0.0228252f, 0.0569538f, -0.0796497f, 0.0937596f, -0.0660153f, + -0.979219f, -0.377322f, 0.0523787f, 0.467299f, 0.0824278f, + 0.437147f, 0.263637f, 0.0325681f, 0.303581f, 0.353479f, + -0.142369f, -0.394797f, 0.597185f, 0.116482f, -0.0782593f, + 0.364539f, -0.30396f, 0.119016f, -0.0022429f, -0.044292f, + -0.0110531f, 0.233571f, 0.000975879f, 0.447332f, -0.0320396f, + 0.541609f, 0.14232f, 0.163905f, 0.848609f, 0.19954f, + -0.186591f, -0.44465f, -0.431672f, 0.159037f, -0.129977f, + -0.141778f, 0.246818f, -0.197539f, -0.70115f, 0.185449f, + 0.400274f, -0.0350744f, 0.239727f, -0.290504f, 0.0698443f, + -0.180374f, -0.759591f, -0.0569088f, -0.50246f, -0.0986616f, + -0.892114f, 0.306737f, -0.133937f, 0.285625f, 0.495471f, + -0.686222f, -0.168647f, -0.0926158f, 0.351772f, -0.0215394f, + 0.361223f, 0.0657142f, 0.268229f, -0.616299f, 0.0564718f, + -0.294013f, -0.588019f, 0.0234195f, -0.426863f, -0.511253f, + -0.72177f, 0.420903f, 0.0987506f, 0.309368f, 0.523532f, + 1.06073f, -0.33028f, 0.0818142f, 0.0130354f, 0.0180882f, + 0.0316898f, -0.416614f, -0.566344f, -0.163083f, 0.285085f, + -0.0534352f, 0.385496f, 0.151068f, -0.208295f, -0.175648f, + 0.0476705f, 0.190428f, -0.643391f, 0.484004f, -0.421836f, + -0.19829f, -0.227574f, -0.0869152f, 1.09881f, 0.345129f, + -0.236732f, -0.381935f, -1.46271f, 0.465914f, 0.610375f, + 0.689968f, -0.688546f, 1.95033f, 
0.420946f, 0.0282428f, + 0.147823f, 0.669393f, 0.429085f, -0.328385f, -0.150439f, + -0.419097f, -0.828102f, 0.248743f, 0.24644f, 0.0186131f, + -0.384319f, -0.126294f, -0.417067f, 0.271483f, -0.0128456f, + -0.881351f, 0.152581f, 0.185584f, -0.745827f, 0.0551359f, + 0.127083f, 0.936983f, -0.0225341f, 0.575861f, 0.767417f, + -0.140867f, -0.762518f, 0.422446f, -0.0611973f, 0.0515641f, + -0.144168f, -0.298882f, 0.308461f, 0.0208704f, 0.213872f, + -0.258708f, 1.13186f, 0.314083f, -0.347536f, -0.137768f, + 0.653953f, -0.217883f, -0.56112f, -0.864661f, 0.488836f, + 0.268133f, -0.548664f, -0.765226f, 0.117082f, 0.326798f, + -0.678246f, 0.477785f, -1.27584f, 0.198912f, -0.710395f, + 1.39096f, -0.411577f, -0.55119f, 0.51092f, -0.295023f, + 0.245983f, -0.0957192f, -0.312001f, 0.0175991f, 0.524423f, + -0.126379f, 0.124687f, -1.53945f, -0.342856f, 0.514072f, + 0.400884f, -0.00581101f, -0.219327f, 0.0977873f, 0.337551f, + -0.058603f, 0.20034f, 0.0429945f, 0.676803f, -0.273585f, + -0.173435f, -0.581596f, 0.226263f, -0.0946223f, -0.060088f, + -0.0100809f, -0.022242f, -0.22218f, -0.030463f, -0.141389f, + -0.190757f, -0.00526518f, -0.77519f, -0.0825695f, 0.308403f, + 0.262792f, -0.601842f, 0.0783697f, 0.197527f, 0.0714048f, + 0.0392629f, -0.388628f, 0.172541f, -0.0222009f, 0.252096f, + 0.0728652f, 0.173632f, 0.192914f, -0.00969965f, 0.0530136f, + -0.00765759f, 0.440234f, -0.0943323f, 0.112319f, 0.0878737f, + -0.739021f, 0.385305f, 0.133334f, -0.396697f, 0.177818f, + -0.0712558f, 0.516923f, 0.102174f, 0.17158f, -0.211068f, + 0.295795f, -0.36198f, 0.179087f, -0.845744f, -0.242514f, + -1.49073f, 0.272702f, 0.59011f, -0.408184f, -0.0731313f, + 0.234643f, 0.589642f, -0.100778f, 0.516921f, -0.700154f, + 0.316432f, 0.36117f, 0.0380282f, 0.480101f, -0.0975487f, + 0.941452f, 0.231705f, -0.151182f, -1.20305f, 0.28255f, + -0.0427662f, -0.00717175f, -0.842085f, -0.357376f, 0.545581f, + -0.290714f, 0.741498f, 1.00377f, 0.483864f, 0.150405f, + 0.0834512f, -0.10031f, 0.424054f, -0.0223491f, -0.0696701f, + -0.134479f, -0.747227f, 0.422208f, 0.123858f, -0.392624f, + -0.0299847f, -0.0376142f, -0.392536f, -0.0343114f, 0.298224f, + -0.375899f, 0.693119f, 0.27909f, -0.53463f, 0.105459f, + -0.0267383f, 0.5094f, -0.411557f, 0.451749f, -0.348479f, + -0.0497316f, -0.353913f, -0.14858f, 0.241838f, 0.331039f, + 0.756607f, -0.0701661f, -0.827264f, -0.367772f, 0.447201f, + 0.834616f, -0.00497265f, -0.0557285f, 0.055088f, -0.300115f, + -0.143833f, -1.07838f, -0.106896f, 0.16945f, 0.0170324f, + 0.108754f, 0.335893f, -0.0923708f, 0.450209f, -0.0713308f, + -0.0233037f, -0.0129902f, -1.40664f, -0.0996218f, 0.711236f, + 0.400716f, 0.227871f, 2.01499f, 0.572926f, 0.135673f, + -0.0340458f, -0.316736f, 0.24257f, -0.700768f, -0.194985f, + 0.312011f, -0.179599f, 0.128114f, 0.0725977f, -0.193816f, + 0.352143f, 0.070641f, -0.467808f, -0.399047f, 0.10136f, + 0.671574f, -0.553965f, 0.105729f, 0.210383f, 0.065048f, + 0.248198f, -0.731674f, 0.588725f, -0.308237f, 0.24511f, + 0.00608906f, 0.170906f, 0.246175f, 0.149521f, 0.106071f, + 0.160246f, 0.118487f, -0.104102f, 0.872823f, 0.227478f, + 0.0182631f, -0.115083f, 0.0142445f, 0.307947f, -0.884925f, + 0.0767105f, 0.0414042f, -0.448021f, -0.0400193f, -0.0765448f, + -0.411931f, -0.199624f, 0.333371f, 0.17267f, -0.0431816f, + 0.190826f, -0.0758961f, -1.02831f, -0.0414525f, 0.605374f, + -0.0188181f, -0.2207f, 1.30004f, -0.207005f, -0.0333617f, + 0.227145f, 0.105059f, -0.0473393f, -0.448752f, -0.0342152f, + -0.0244812f, 0.220329f, 0.0313591f, -0.0902074f, -0.0731945f, + 0.88488f, 0.306306f, -0.275613f, 
-0.476372f, 0.00678104f, + 0.442029f, 0.122049f, 0.118042f, 0.270527f, -0.462538f, + 0.0665021f, -0.260255f, 0.209182f, 0.162321f, 0.0629934f, + -0.244896f, -0.078863f, 0.655585f, -0.0506617f, -0.487128f, + 0.118765f, -0.34408f, 0.0930615f, -0.365632f, -0.0670776f, + 0.44428f, 0.286734f, 0.146608f, 0.686757f, -0.0738428f, + -0.10034f, -0.928438f, -0.172601f, -0.0959575f, -0.010532f, + 0.277549f, 0.28773f, -0.318883f, 0.71254f, 0.273593f, + -0.382845f, -0.0104587f, -0.647769f, 0.25541f, 0.194625f, + 0.265197f, -0.750938f, -0.0650515f, -0.567092f, 0.070613f, + 0.209531f, 0.429699f, 0.130676f, 0.514914f, 0.615778f, + 0.594535f, -0.0878778f, 0.40593f, -0.303383f, 0.0907863f, + -0.320068f, 0.0137162f, -0.303424f, 0.594207f, -0.236524f, + -0.692627f, -0.990063f, -0.0262934f, 0.222375f, 0.503412f, + 0.220224f, 0.676871f, -0.150996f, 0.379777f, 0.841339f, + -1.05981f, 0.259943f, -0.781745f, 0.0346478f, 0.115791f, + -0.25171f, -0.00872158f, 0.395561f, -0.0849893f, -1.20134f, + -0.313938f, 0.789542f, 0.159606f, -0.782095f, -0.229754f, + 0.266687f, -0.0354282f, -0.3041f, 0.0338618f, -0.390001f, + -0.28362f, -0.436144f, 0.777351f, 0.855321f, 0.653338f, + -0.0382912f, -0.204577f, 1.13828f, 0.220395f, -4.60853f, + 0.575694f, 0.0453189f, 1.76567f, 0.466151f, -0.366109f, + 0.594717f, 0.278891f, -0.750676f, -0.332739f, -0.942304f, + 0.280363f, 0.284561f, 0.209326f, 0.238347f, -0.0124311f, + -0.439463f, -0.036186f, 0.165997f, 0.374717f, -0.481148f, + -0.626417f, 0.0223598f, 0.039337f, -0.379918f, 0.211046f, + 0.0795812f, 0.863355f, -0.341448f, 0.421494f, 0.410477f, + -0.117025f, -0.511108f, 0.565193f, -0.063582f, -0.031349f, + -0.0750174f, 0.387941f, 0.541266f, 0.0919753f, 1.05041f, + 0.263004f, 0.289006f, 0.0439694f, -1.22439f, -0.247832f, + 0.260967f, 0.355794f, 0.599694f, -0.69418f, 0.372805f, + -0.161731f, 0.0720574f, 0.0394657f, 0.122772f, -0.458067f, + -0.370826f, -1.34495e-05f, -0.373404f, 0.0245539f, -2.3472f, + -2.61448f, 0.264794f, 0.0601582f, -0.968597f, -0.196022f, + -0.727067f, 0.167346f, 0.517478f, 0.0035377f, 0.777219f, + 0.553128f, 0.727211f, 0.606202f, -0.495604f, 2.41445f, + 0.465214f, -0.0443004f, 0.142972f, 0.141459f, -0.17771f, + 0.0156117f, 0.169264f, 0.0428022f, -0.164827f, -0.240632f, + 0.215289f, -0.213134f, -0.184163f, 0.0161321f, -0.20025f, + -0.0311616f, 0.00292108f, -0.0131921f, 0.0437664f, -0.104817f, + -0.131906f, 0.0822771f, 0.237307f, -0.347567f, -1.2485f, + 0.253616f, -0.442217f, 0.0514077f, 0.337561f, -0.0147658f, + -0.132888f, -0.643821f, 0.445573f, -0.0146213f, 0.235511f, + 0.53583f, -0.640644f, 0.0280044f, 0.00628834f, 0.143885f, + 0.380077f, -0.542342f, 0.363101f, 0.0647334f, -0.476556f, + -0.822676f, 0.482454f, -0.0467326f, -0.253083f, 0.116726f, + 0.317333f, 0.548131f, -0.234667f, 0.579923f, -0.420683f, + 0.595613f, -0.279864f, -0.753204f, -0.516844f, -0.436574f, + -0.120682f, -0.278939f, 0.752202f, -0.183443f, -0.14632f, + -0.0344068f, 0.127638f, -0.225245f, 0.489391f, 0.145082f, + -0.73672f, 0.980065f, -0.0367412f, 0.40632f, -0.802509f, + 0.356897f, 0.366172f, 1.23858f, -0.978381f, -0.684924f, + -0.0870693f, -0.353628f, 0.695788f, -0.244593f, -1.8897f, + -0.257803f, 0.686937f, 0.405155f, -0.125696f, 0.258075f, + 0.570584f, -0.439481f, -0.59798f, 0.0745711f, -0.235162f, + 0.133048f, -0.243033f, 0.0415527f, -0.00118735f, 0.00980514f, + -0.297429f, -0.144983f, 0.463093f, 0.0965441f, -0.338508f, + -0.651077f, 0.817577f, -0.0364773f, -0.388465f, 0.113288f, + 0.231198f, 0.316208f, -0.592201f, 0.530376f, -0.431434f, + 0.0200985f, 0.104303f, -0.130705f, 0.4374f, 0.362342f, 
+ 0.70641f, 0.20037f, 0.309128f, -0.484535f, -1.18469f, + 0.513893f, 0.201236f, -0.022396f, 0.179638f, -0.361289f, + -0.0794946f, -1.04704f, -0.0281103f, 0.0494822f, 0.00196415f, + 0.0625478f, -0.229033f, 0.12018f, 0.542629f, -0.222423f, + -0.0123321f, -0.0988525f, 0.773192f, -0.192218f, -3.19156f, + 0.300606f, 0.462751f, 2.2968f, 0.137182f, 0.132539f, + 0.165884f, 0.128818f, -0.155856f, -0.558538f, -0.231742f, + -0.244377f, -0.442397f, 0.250947f, 0.0850658f, -0.00820139f, + 0.391284f, 0.17453f, 0.306003f, -0.531499f, -0.624451f, + 0.564584f, -0.343953f, -0.0278713f, 0.212664f, -0.135969f, + -0.0179867f, -0.687887f, 0.371065f, -0.0537029f, 0.0499509f, + 0.0980684f, -0.0438569f, 0.186731f, 0.182105f, 0.172254f, + -0.149446f, -0.0247637f, 0.148098f, 1.20772f, -0.136664f, + 0.00983112f, 0.0181381f, -0.0147549f, -0.0846561f, -0.827022f, + 0.00207177f, 0.0478215f, 0.0652549f, 0.0898219f, -0.0224959f, + -0.0274246f, 0.0166498f, -0.0211715f, -0.502932f, 0.0961452f, + 0.251206f, -0.0623632f, 0.741566f, 0.0078449f, -2.99162f, + -0.187244f, 0.0743479f, 1.46425f, 0.0737923f, 0.0133544f, + 0.20922f, -0.178671f, -0.0528492f, -0.526717f, 0.0282125f, + -0.0363201f, 0.37406f, -0.303658f, -0.066803f, 0.132237f, + 0.962057f, -0.399733f, 0.191765f, -0.452606f, -0.348732f, + 0.444939f, 0.153025f, 0.0796317f, 0.265985f, -0.319638f, + 0.0278161f, -0.333734f, 0.226108f, 0.147895f, -0.124066f, + -0.37306f, 0.19541f, 0.200175f, -0.0593244f, 0.0333887f, + -0.0284278f, 0.462491f, 0.0686487f, -0.332435f, -0.437166f, + 0.302795f, 0.100542f, 0.0265019f, 0.767212f, -0.140621f, + 0.11558f, -0.70584f, -0.00017415f, 0.00793092f, -0.0490901f, + 0.0598338f, 0.484876f, -0.13025f, 0.660349f, 0.147503f, + -0.462766f, 0.0843824f, 0.218493f, 0.310921f, -0.162284f, + 0.210404f, -0.788799f, 0.0698512f, -0.484799f, 0.0311505f, + -0.308243f, 0.417298f, 0.0593723f, 0.208908f, 0.451437f, + 0.354546f, -0.0700888f, -0.281678f, -0.311177f, 0.00914652f, + -0.372084f, 0.135036f, 0.185393f, 0.461347f, -0.114241f, + -0.402347f, -0.692327f, 0.0376155f, -0.200267f, 0.565963f, + -0.0627442f, 0.429677f, 0.170514f, 0.350565f, 0.699528f, + -0.948126f, -0.364205f, 0.348878f, -0.137832f, -0.0791649f, + -0.0462295f, -0.255078f, -0.398509f, 0.136783f, -0.0164628f, + -0.555472f, 0.690396f, 0.147715f, 0.000523095f, 0.14874f, + 0.524804f, 0.162974f, 0.797599f, 0.277473f, -0.500696f, + 0.189917f, -0.333309f, 0.00613646f, -1.07817f, 0.0470502f, + 0.210766f, 0.159768f, -0.447774f, -0.252968f, -1.72739f, + 0.0658259f, -0.448747f, 2.26511f, 0.349651f, 0.157232f, + 0.956842f, 0.856676f, 0.149227f, -0.626957f, -0.566771f, + -0.0980846f, 0.351668f, -0.362741f, -0.0272282f, -0.113632f, + 0.366015f, -0.00790003f, -0.458632f, -0.31157f, -0.182257f, + -0.953975f, 0.0583582f, 0.164721f, -0.900107f, -0.115542f, + 0.0654192f, 0.99056f, -0.247976f, 0.48254f, 0.670196f, + 0.098585f, -0.212855f, 0.310072f, 0.0894616f, 0.151944f, + 0.119629f, -0.26735f, 0.162257f, -0.0305818f, 0.681526f, + -0.229847f, 1.01556f, 0.29132f, 0.740113f, 0.0703937f, + 0.537892f, -0.18653f, -0.0252359f, -0.420014f, 0.197631f, + -0.176629f, 0.00674754f, 0.301288f, -0.162816f, 0.636235f, + -0.341362f, 0.197296f, -0.589747f, -0.749363f, -0.277197f, + -1.27291f, -0.0857908f, -0.147591f, -0.0956297f, -0.109097f, + 0.0717554f, 0.359078f, 0.301457f, 0.486934f, -0.260955f, + -0.126821f, 1.55756f, 0.477469f, -1.45363f, 1.42198f, + -0.360847f, -0.0211924f, -0.0184957f, -0.110706f, -0.152136f, + 0.104703f, 0.267615f, 0.127392f, 0.172996f, 0.258326f, + 0.268578f, -0.431123f, -0.114419f, 0.0101172f, 
-0.195671f, + 0.0792025f, -0.151505f, -0.064077f, 0.0479777f, -0.141882f, + 0.121492f, -0.139132f, -0.348252f, 0.341043f, -0.565367f, + -0.0791259f, -0.781086f, 0.0140045f, 0.571094f, -0.00875077f, + 0.217132f, -0.202345f, 0.157213f, 0.228445f, 0.366612f, + -0.529989f, 0.42241f, -0.540538f, -0.0425556f, -0.207774f, + -0.0663941f, 0.37836f, -0.0650245f, -0.0828694f, -0.0835478f, + -0.795512f, 0.470268f, 0.1551f, -0.69017f, -0.116735f, + 0.157614f, 0.555973f, -0.293311f, 0.245428f, -0.0853701f, + -0.449278f, -0.0551647f, -0.00137429f, 0.709439f, -0.456796f, + 0.132062f, -0.0449484f, -0.308599f, 0.180608f, -2.24196f, + 0.421478f, -0.640946f, -0.460397f, -0.920628f, -0.184949f, + -0.0416982f, 0.6484f, -0.22806f, 0.412229f, -0.468079f, + -0.72372f, -0.347698f, -1.3899f, 0.631876f, 0.0611046f, + 0.0294258f, -0.128091f, -0.205615f, 0.355348f, -0.267725f, + -0.644835f, 0.435879f, 0.517477f, -0.338123f, -0.157764f, + 0.32762f, -0.166454f, 0.221007f, -0.0438278f, -0.0777725f, + 0.10986f, 0.941545f, -0.542284f, -0.172312f, -0.256597f, + -0.0181391f, 0.220623f, -0.432456f, 0.0164074f, 0.250226f, + -0.522576f, 0.783109f, 0.198703f, -0.784554f, -0.0929628f, + 0.326861f, 0.470293f, 0.442684f, 0.271879f, -0.108256f, + 0.0483558f, -0.403151f, 0.36183f, -0.268186f, 0.270851f, + -0.696826f, -0.166037f, -0.354658f, 0.405977f, -0.473447f, + 0.649689f, -0.0863114f, -0.147319f, 0.0869966f, 0.319792f, + 0.493026f, -1.07456f, 0.354751f, 0.114605f, -0.120647f, + -0.238315f, 0.0290955f, -0.355299f, -0.45381f, 0.0812865f, + -0.0180434f, 0.00861318f, -0.892943f, -0.0127801f, -1.66398f, + 0.290505f, 0.126832f, 2.08173f, -0.0454847f, -0.162481f, + 1.07426f, 0.228566f, 0.280528f, -0.537625f, -0.175288f, + -0.118012f, 0.649114f, -0.349926f, -0.0189864f, -0.30934f, + -0.363178f, -0.119822f, -0.22656f, 0.484513f, -0.173269f, + 0.41987f, -0.448517f, -0.0950466f, 0.482443f, 0.061558f, + 0.4219f, -0.536388f, 0.0781972f, 0.212489f, 0.104229f, + -0.0792804f, 0.402066f, -0.676313f, -0.2272f, -0.16379f, + 0.260145f, -0.0504658f, -0.0826579f, -1.37749f, 0.00790747f, + 0.0841031f, -0.0671308f, -0.00301736f, -0.386206f, 0.190311f, + 0.0702639f, 0.0643968f, 0.133741f, -0.0141555f, -0.0365324f, + 0.87028f, 0.207894f, -0.421266f, 0.689256f, 0.145037f, + -0.270796f, 0.212604f, -0.345326f, 0.0074631f, -1.72379f, + 0.0672097f, -0.273153f, 1.30503f, -1.01324f, 0.00284696f, + 0.851459f, 0.176847f, 0.30948f, -0.57144f, -0.0596695f, + -0.111189f, 0.130361f, -0.298286f, 0.0567591f, -0.0885215f, + -0.847601f, 0.238624f, -0.162391f, 0.452357f, -0.0192713f, + 0.226661f, 0.0762922f, -0.0894055f, 0.332702f, 0.424484f, + 0.0443207f, -0.162345f, -0.601036f, 0.280527f, -0.137362f, + 0.266345f, 0.729438f, -0.887182f, 0.152943f, -0.573548f, + -0.0201383f, -0.56521f, 0.033582f, 0.300284f, -0.144472f, + 0.633026f, 0.30866f, 0.0653073f, 0.316901f, 0.0721326f, + 0.192252f, -0.833162f, 0.194292f, -0.08663f, -0.189401f, + -0.178242f, 0.111488f, 0.522487f, -0.65497f, 0.457049f, + 0.390654f, 0.0522936f, -0.39712f, -0.293717f, -0.374656f, + -0.118916f, -0.853076f, -0.0829578f, -0.17335f, -0.0218694f, + 0.367968f, 0.478469f, 0.0913813f, 0.519251f, 0.803526f, + -0.272516f, -0.341329f, 0.0897285f, 0.247653f, 0.000898686f, + 0.313196f, 0.000587979f, -0.314189f, -0.449439f, -0.0291611f, + -0.356287f, -0.722904f, -0.0480958f, -0.523758f, -0.576146f, + 0.133754f, 0.616921f, -0.085494f, 0.487487f, 0.745129f, + 0.993267f, 0.256555f, 0.0822743f, 0.0411971f, 0.139388f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_1_bias[] = { + 0.00447951f, 0.0202534f, 
0.00970833f, -0.00460874f, 0.0942288f, + -0.0534704f, 0.00829869f, -0.0255174f, -0.0809143f, 0.00169117f, + 0.0177427f, 0.0259387f, 0.0291077f, -0.0267599f, 0.100275f, + -0.00389366f, 0.0315499f, 0.0265846f, -0.000206604f, 0.0302221f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_2_kernel[] = { + 0.153048f, 0.0725422f, 0.068901f, -0.475608f, 0.0736706f, + -0.134076f, 0.229289f, 0.0217921f, 0.0449205f, -1.00002f, + 0.149133f, 0.0497258f, 0.118988f, 0.0741764f, 0.0385486f, + 0.225181f, 0.012966f, 0.155593f, -3.07175f, -0.0641051f, + 0.09161f, 0.0259005f, -0.209998f, -0.420298f, 0.0587126f, + 0.00352744f, 0.0451313f, -0.049384f, 0.11516f, 0.083135f, + 0.103675f, -0.0185604f, 0.0623248f, -0.0993726f, 0.0448522f, + 0.0134017f, -0.294776f, -0.251924f, 0.0712635f, -0.0764298f, + -0.463766f, -0.0295011f, -0.579168f, 0.573853f, -0.00596607f, + 0.0237762f, -0.0500104f, -0.0969275f, 0.155573f, 0.0515382f, + -0.178454f, -0.154008f, -0.278299f, -0.166421f, 0.0149533f, + -0.0700236f, 0.239287f, -1.19545f, -0.0744625f, 0.143037f, + 0.141874f, 0.086302f, 0.0838633f, -0.454179f, 0.120308f, + -0.0896718f, 0.254909f, 0.0714462f, 0.00471098f, -0.869494f, + 0.209407f, 0.138285f, 0.0816641f, 0.0666266f, 0.0848555f, + 0.173313f, 0.0695633f, 0.285667f, -3.15384f, 0.00140275f, + -0.969824f, -0.0318689f, -0.00487396f, 0.412541f, 0.0263593f, + -0.249824f, 0.0897776f, 0.0208836f, -0.0982745f, -0.16049f, + -0.12719f, -0.186166f, 0.102338f, 0.273931f, -0.0886306f, + -0.19513f, -0.0135712f, -0.194127f, -0.0834291f, 0.426623f, + -0.0705446f, 0.0327476f, 0.0800862f, 0.478757f, -0.00849111f, + -0.554911f, -0.0489312f, -0.184029f, -0.227428f, 0.159989f, + -0.0677731f, -0.0901436f, 0.00308696f, -0.352243f, 0.278715f, + 0.306374f, -0.0772054f, -0.0122733f, -0.0693457f, 0.074365f, + -0.267458f, -0.123612f, -0.495954f, 0.552604f, -0.103951f, + -0.121771f, 0.179966f, -0.377947f, -1.35472f, 0.153294f, + -0.445284f, -0.089813f, -0.00529807f, 0.254047f, -0.0378426f, + 0.114597f, -0.143052f, 0.0815258f, -0.10528f, 0.00833533f, + -0.117508f, 0.129052f, 0.0706719f, -1.39506f, 0.0124731f, + 0.109831f, -0.0744156f, 0.181612f, 0.0787894f, 0.0293352f, + 0.494929f, 0.00997207f, -0.585882f, -0.0844138f, -0.00864134f, + -0.109943f, 0.0713114f, 0.14883f, 0.0610554f, 0.204145f, + -0.00390313f, 0.0184763f, -0.111387f, 0.175442f, -0.0840215f, + -0.178785f, -0.0693612f, -0.254507f, -0.191549f, 0.501561f, + -0.0858995f, -0.164921f, 0.0250706f, -0.0916282f, 0.247085f, + 0.13877f, -0.419487f, -0.295065f, -0.213812f, -0.10362f, + 0.138243f, 0.086985f, 0.113633f, -0.459273f, 0.12388f, + -0.139296f, 0.253792f, 0.0421624f, 0.0665065f, -0.977282f, + 0.199927f, 0.115194f, 0.099045f, 0.0534806f, 0.089283f, + 0.0815367f, 0.150901f, 0.253458f, -3.24825f, -0.0118163f, + -0.544565f, 0.0201825f, -0.0682201f, 0.759028f, 0.00479696f, + -0.00625607f, 0.058007f, -0.0811189f, -0.114617f, -0.0998578f, + 0.133312f, 0.0246256f, -0.0167416f, 0.196118f, 0.109823f, + 0.109489f, 0.474682f, -0.763475f, 0.0818745f, 0.0798777f, + -0.0994905f, -0.00138143f, -0.108563f, 0.697289f, -0.103702f, + -0.306085f, -0.0996705f, -0.142618f, -0.130989f, 0.0813303f, + -0.0909275f, -0.10786f, -0.0280431f, 0.206877f, -1.70798f, + 0.525568f, 0.559891f, -0.166132f, -0.227574f, -0.150955f, + 0.0849226f, 0.00497342f, -0.168667f, -0.282575f, 0.00537805f, + -0.0185572f, 0.0607167f, -0.0534948f, -0.0215776f, -0.14825f, + -0.0164577f, -0.0611978f, 0.0347562f, 0.286917f, 0.226598f, + 0.149497f, -0.478101f, -0.246006f, 0.0663239f, -0.121728f, + 0.267087f, 0.0802681f, -0.184741f, 
-0.558267f, 0.0437066f, + 0.13816f, -0.0710939f, 0.0725697f, 0.339857f, 0.161069f, + 0.304871f, 0.108138f, 0.193396f, 0.0891607f, -0.0701939f, + -0.182038f, -0.451873f, -0.233883f, 0.0444747f, 0.0436545f, + -0.245894f, -0.0721136f, 0.309013f, 0.278996f, 0.0259377f, + 0.0278116f, 0.0686773f, -0.271237f, 0.235082f, -0.0778285f, + -0.456541f, -0.109303f, -0.074565f, -0.407301f, -0.162191f, + -0.801819f, 0.372435f, -0.559083f, -0.039189f, 0.0477762f, + 0.0875363f, 0.0699926f, 0.116552f, -0.308217f, 0.0341607f, + -0.14202f, 0.135517f, 0.0316971f, 0.153297f, -0.759722f, + 0.12849f, 0.114229f, 0.0814893f, 0.275402f, 0.0403976f, + 0.0357503f, 0.212295f, 0.0673998f, -2.59822f, -0.0475021f, + -0.0594725f, 0.0659163f, 0.0469717f, -0.0370461f, -0.12863f, + -0.381743f, -0.0445055f, -0.106843f, -0.0880648f, 0.00591106f, + 0.235514f, -0.165162f, -0.0696645f, 0.115374f, 0.245558f, + 0.192049f, -0.388628f, -0.48291f, 0.154313f, -0.160207f, + 0.125928f, 0.122039f, 0.0713794f, -0.161244f, 0.128082f, + -0.234659f, 0.0680219f, 0.0597933f, 0.208421f, -0.163623f, + 0.196873f, 0.156603f, 0.184179f, -0.278331f, -0.0481286f, + 0.0828152f, 0.247004f, 0.0915582f, -0.0906229f, -0.20376f, + 0.136593f, 0.0740336f, -0.0134935f, -0.355048f, 0.0898485f, + -0.0962068f, 0.185804f, -0.0145596f, 0.0966589f, -0.515784f, + 0.121602f, 0.0320428f, 0.11093f, -0.0559421f, 0.0355484f, + 0.192128f, 0.0500888f, 0.133641f, -1.73282f, -0.0624599f, + 0.122524f, 0.0757292f, -0.0974648f, -0.193649f, 0.0561096f, + 0.0159959f, 0.0334472f, -0.0168832f, -0.12386f, -0.112419f, + 0.19552f, 0.0308502f, 0.0537643f, -0.0181012f, 0.0392183f, + 0.0461833f, -0.52623f, -0.238252f, 0.0821762f, -0.212384f, + 0.112901f, 0.096063f, 0.0540225f, 0.0773583f, 0.143045f, + -0.101551f, 0.282418f, 0.0176749f, -0.00244542f, -0.780154f, + -0.254428f, -5.82215f, 0.106638f, 0.11746f, 0.0486823f, + 0.164562f, 0.0303006f, 0.229614f, -2.41845f, -0.117122f, + 0.0451654f, 0.0237383f, -0.208731f, 0.0721137f, 0.0761163f, + -0.0569416f, -0.00830511f, -0.045256f, 0.14535f, -0.0189222f, + -0.283363f, -3.15502f, 0.0971161f, -0.035913f, 0.00813281f, + 0.0187974f, -0.361573f, -0.302067f, 0.118014f, -0.0956148f, + -0.596567f, 0.0105443f, -0.49019f, -0.0801959f, 0.0322344f, + -0.0280032f, 0.0555038f, -0.111495f, -0.0994456f, 0.0178021f, + 0.0358362f, 1.07063f, -0.0833138f, 0.0621246f, 0.0637157f, + 0.0999207f, 0.191975f, -1.2811f, 0.0341681f, 0.14818f, + 0.0957259f, 0.109909f, 0.0566115f, 0.0585633f, 0.179939f, + -0.104372f, 0.309091f, 0.0172941f, 0.0243182f, -0.935252f, + -0.296257f, -5.83634f, 0.0899249f, 0.455347f, 0.129505f, + 0.220212f, 0.0214801f, 0.284802f, -2.94585f, -0.0805413f, + -1.01819f, 0.00534034f, -0.057203f, 0.0869331f, 0.0207575f, + -0.124479f, -0.0465806f, 0.0894252f, 0.32203f, 0.0858497f, + 0.25178f, 0.0932205f, 0.0888455f, 0.233153f, -0.446398f, + -0.00791233f, 0.0909603f, -0.0904397f, 0.131835f, 0.475597f, + -0.1236f, 0.0231622f, 0.138602f, -0.097731f, -0.0282484f, + -0.549095f, -0.0457428f, -0.0895407f, -0.293965f, 0.166872f, + 0.46719f, 0.236254f, 0.0615991f, 0.499236f, 0.540366f, + 0.402035f, 0.0606324f, -0.0499928f, -0.0155198f, 0.0994403f, + -0.14773f, -0.183433f, -0.612093f, -0.334201f, -0.110877f, + -0.143441f, 0.05815f, -0.318586f, -0.344235f, 0.199593f, + 0.51109f, -0.252281f, -0.028834f, 0.0615421f, 0.0623699f, + 0.210745f, -0.236448f, 0.166279f, 0.127516f, -0.0971157f, + -0.204389f, 0.208112f, 0.0377023f, 0.271837f, -0.00859528f, + 0.0797081f, -0.00582115f, 0.140018f, -0.384865f, -0.0853243f, + -0.586727f, -0.0664489f, -0.631436f, -0.245828f, 
-0.0647894f, + -0.171912f, -0.0801706f, 0.0731614f, -0.11725f, 0.281478f, + -0.03047f, 0.0363488f, -0.0481651f, -0.326329f, -0.0155898f, + -0.428316f, -0.0989367f, -0.271902f, -0.00263837f, 0.366168f, + 0.325989f, 0.165463f, 0.0668512f, -0.142202f, 0.419992f, + 0.164971f, -0.515479f, -0.187585f, -0.151783f, -0.0682468f, + 0.0910191f, 0.117086f, 0.106579f, 0.0961825f, 0.162148f, + -0.129645f, 0.301039f, 0.000320343f, -0.0558097f, -0.844295f, + -0.218919f, -5.7571f, 0.0982612f, 0.238955f, 0.0703565f, + 0.0969388f, 0.107202f, 0.321585f, -3.00594f, -0.058755f, + -0.620004f, 0.052114f, 0.128423f, -0.177673f, -0.00341509f, + -0.146756f, -0.0414309f, -0.0893262f, -0.0584779f, -0.129552f, + 0.127629f, 0.13275f, -0.0973342f, -0.215617f, 0.0724309f, + 0.0102229f, 0.178137f, -0.943374f, -0.171465f, 0.304949f, + -0.0963836f, -0.0346437f, -0.138667f, -0.234184f, 0.0344159f, + -0.319592f, -0.0990766f, -0.16065f, 0.369432f, 0.194911f, + 0.363348f, -0.356009f, -0.00736217f, 0.241788f, -2.21311f, + 0.704816f, 0.697019f, 0.129186f, -0.132799f, -0.11861f, + 0.0383451f, 0.0247782f, -0.12687f, 0.0256552f, 0.048413f, + 0.00660549f, 0.0457962f, -0.012819f, 0.115991f, -0.1117f, + -0.291045f, -0.646138f, 0.0813613f, 0.112063f, 0.191675f, + 0.120835f, -0.444267f, -0.340385f, 0.0391936f, -0.151132f, + 0.184419f, 0.124998f, -0.14089f, 0.214087f, 0.00108535f, + 0.119611f, 0.0236965f, 0.0715074f, -0.225997f, -0.0126552f, + -0.459214f, -0.490444f, 0.173716f, 0.355811f, -0.13607f, + -0.191091f, -0.530085f, -0.400666f, 0.011221f, 0.10527f, + -0.11498f, -0.011864f, 0.364376f, 0.0319587f, -0.0528563f, + 0.0353899f, 0.0393453f, -0.289211f, -0.347785f, -0.0417157f, + 0.545848f, 0.741785f, -0.0732565f, -1.29687f, -0.0433128f, + -1.44162f, 0.318894f, -0.377784f, 0.123751f, -0.00444347f, + 0.0957118f, 0.0893616f, 0.0911595f, 0.092917f, 0.127681f, + -0.159929f, 0.190417f, -0.0297948f, -0.00132599f, -0.742756f, + -0.0364169f, -4.00108f, 0.0784767f, 0.223048f, 0.0430138f, + 0.0180493f, 0.212842f, 0.122987f, -2.83267f, -0.0641464f, + -0.173247f, 0.100946f, 0.0804885f, 0.0172631f, 0.0877408f, + -0.353222f, 0.0108262f, -0.0452121f, -0.116127f, 0.268154f, + -0.132587f, -0.27481f, -0.0316914f, 0.0610525f, 0.439691f, + 0.00966415f, -0.78962f, -0.424823f, -0.0214365f, -0.113846f, + 0.100793f, 0.126482f, 0.0415354f, 0.0427995f, 0.14273f, + -0.315674f, 0.110095f, 0.0061568f, 0.0320474f, -0.3596f, + -0.12533f, -1.28837f, 0.174673f, -0.235912f, 0.00495439f, + 0.0695473f, 0.266489f, 0.049248f, 0.0868526f, -0.0685969f, + 0.102984f, 0.0924639f, -0.027535f, 0.0709277f, 0.155776f, + -0.190944f, 0.188273f, -0.00897471f, 0.0964232f, -0.475822f, + -0.209374f, -5.00252f, 0.103495f, 0.110698f, 0.00682092f, + 0.208586f, 0.0489575f, 0.0966254f, -1.42973f, -0.0645128f, + 0.0515961f, 0.0571281f, -0.0992321f, 0.00791648f, 0.0087609f, + 0.0607367f, 0.0315705f, 0.0183317f, 0.0756087f, -0.0292847f, + -0.212932f, -0.782259f, 0.0899944f, 0.102677f, 0.0681135f, + 0.0447764f, -0.481969f, -0.221459f, 0.0794475f, -0.229157f, + 0.136781f, 0.0832359f, 0.0297807f, -0.00287225f, -5.97897f, + -0.0960581f, 0.250945f, -0.00133314f, -0.112396f, -0.856922f, + 0.115776f, 0.124536f, 0.0914194f, -0.160775f, 0.128684f, + 0.106718f, 0.100665f, 0.139579f, -0.86141f, -0.190323f, + 0.0884896f, 0.0363845f, -0.19831f, 0.121601f, 0.0264453f, + -0.00557822f, 0.0720238f, -0.0140132f, -0.166814f, -0.266214f, + 0.00500545f, 0.0146905f, 0.126035f, 0.0812372f, 0.0615973f, + 0.0766063f, -0.420156f, -0.126157f, -0.0284299f, -0.112513f, + -0.567008f, -0.0100263f, -0.607567f, 0.193053f, 
0.0067527f, + -0.0753897f, 0.00134269f, -0.0512249f, -0.161661f, 0.0667741f, + -0.113702f, -0.071606f, -0.300563f, 0.276479f, -0.155318f, + -0.0512306f, 0.0896443f, -0.987911f, 0.0440889f, 0.430958f, + 0.175427f, 0.101385f, 0.0303662f, 0.0672653f, -6.62463f, + -0.10475f, 0.228249f, -0.00482173f, -0.0608713f, -0.895836f, + 0.187976f, 0.162173f, 0.0747544f, 0.219953f, 0.0682489f, + 0.142665f, 0.100287f, 0.301887f, -1.97736f, -0.295001f, + -1.0733f, -0.0562668f, -0.0604295f, 0.0304073f, 0.194274f, + -0.243593f, 0.0727137f, 0.0610967f, -0.0692415f, -0.02967f, + 0.055633f, 0.0192402f, 0.105841f, 0.102236f, -0.0757102f, + -0.0067639f, 0.0102317f, -0.257959f, -0.0638652f, 0.45521f, + -0.114967f, 0.0921177f, 0.223796f, 0.277072f, -0.0613282f, + -0.564693f, -0.151333f, -0.158035f, 0.228491f, 0.12997f, + -0.192625f, -0.125344f, 0.0983258f, -0.931206f, 0.618715f, + 0.273759f, -0.145527f, -0.099431f, -0.119551f, 0.0663484f, + -0.161419f, -0.202377f, -0.545393f, 0.0917645f, 0.042263f, + -0.17117f, -0.178622f, -0.336977f, 0.866715f, 0.0376922f, + -0.319728f, -0.127406f, 0.0599384f, 0.268804f, -0.0331844f, + 0.355326f, -0.103902f, 0.0425935f, 0.00525512f, -0.133687f, + -0.122695f, 0.145582f, 0.139013f, -0.0053352f, 0.0313566f, + 0.327295f, -0.0117993f, 0.233524f, 0.162388f, -0.0793262f, + 0.454543f, 0.0442224f, -0.742673f, -0.144882f, 0.0874983f, + -0.0707259f, 0.0219869f, 0.201728f, 0.0204537f, 0.0788857f, + -0.0374329f, 0.0724169f, 0.0743593f, -0.0193526f, -0.313546f, + -0.418882f, -0.0815754f, -0.197144f, 0.305053f, 0.330196f, + -0.131006f, -0.00113249f, 0.0750458f, -0.541764f, 0.299935f, + 0.308516f, -0.20547f, -0.333066f, 0.0285833f, 0.191147f, + 0.160372f, 0.0724649f, 0.0426326f, 0.153046f, -6.59656f, + -0.081237f, 0.219163f, 0.0147081f, -0.0109837f, -1.01487f, + 0.170055f, 0.163386f, 0.106413f, 0.150188f, 0.0688875f, + 0.0541359f, 0.156307f, 0.178844f, -1.51054f, -0.149477f, + -0.504503f, 0.017878f, -0.181821f, -0.0999659f, 0.0484548f, + -0.32211f, 0.0406744f, 0.0017627f, 0.0220593f, 0.0900512f, + -0.561625f, 0.107279f, -0.0861521f, -0.0862376f, 0.0816765f, + 0.168072f, 0.150063f, -0.816825f, -0.13569f, 0.557555f, + -0.155265f, 0.025135f, -0.109304f, -0.0487062f, -0.00347487f, + -0.454803f, -0.0394371f, -0.214597f, -0.248898f, 0.286501f, + -0.249246f, -0.138935f, 0.00391409f, -0.122544f, -2.14993f, + 0.588942f, 0.541231f, 0.0154047f, -0.359742f, 0.0520729f, + 0.0667058f, 0.0418163f, -0.132533f, -0.184759f, 0.0546118f, + -0.131198f, 0.109664f, -0.0714679f, -0.114163f, -0.243081f, + -0.0405089f, 0.0342795f, 0.0801825f, -0.268408f, 0.192207f, + 0.0800494f, -0.586539f, -0.118155f, -0.0508569f, -0.193987f, + 0.261478f, 0.105719f, -0.125361f, -0.0956201f, 0.0233802f, + 0.271098f, 0.0113352f, 0.0910447f, 0.00628244f, -0.071722f, + 0.21439f, 0.0747191f, 0.207765f, -0.0782454f, -0.0151716f, + -0.196505f, -0.44798f, -0.228597f, 0.0549039f, -0.120715f, + -0.19388f, -0.0768461f, 0.361102f, 0.122936f, -0.0334211f, + -0.202503f, -0.0450776f, -0.272345f, 0.662321f, 0.109247f, + -0.218026f, -0.0669386f, -0.0864701f, -0.633421f, -0.158007f, + -1.10778f, 0.351211f, -0.541458f, -0.0171707f, 0.149606f, + 0.106105f, 0.0880349f, 0.0968455f, 0.113269f, -5.01949f, + -0.106404f, 0.175578f, -0.030045f, -0.0267249f, -0.563713f, + 0.173885f, 0.130772f, 0.0334519f, 0.0770157f, 0.0394389f, + -0.0290326f, 0.220003f, 0.180901f, -1.62203f, -0.151858f, + -0.202386f, -0.0067836f, 0.0287665f, -0.194183f, -0.239834f, + -0.484159f, 0.00671722f, -0.122459f, 0.0808959f, -0.263769f, + -0.015066f, -0.0429868f, -0.111255f, -0.231872f, 
0.219659f, + -0.0437412f, -0.536618f, -0.477831f, 0.0421895f, -0.0815851f, + 0.119638f, 0.0786293f, -0.000668378f, 0.0305567f, -0.0868189f, + -0.178327f, 0.0799657f, 0.0280923f, -0.211395f, -0.464577f, + 0.216912f, 0.0761976f, 0.160288f, -0.416372f, -0.10286f, + -0.0733786f, 0.261033f, 0.0493698f, 0.143137f, -0.179979f, + 0.15655f, 0.0897976f, -0.0258041f, -0.152852f, -6.15512f, + -0.118917f, 0.227283f, -0.0514043f, -0.0786432f, -0.523485f, + 0.1644f, 0.0869001f, 0.0984082f, -0.428288f, 0.0791992f, + 0.141904f, 0.0652073f, 0.104429f, -0.775125f, -0.121479f, + 0.0841637f, 0.0135705f, -0.208863f, -0.0629523f, 0.0455794f, + 0.0513898f, -0.0147657f, 0.0401145f, 0.0660079f, 0.0210609f, + -0.0151801f, 0.0562111f, 0.140308f, -0.0196394f, 0.0230753f, + -0.0336115f, -0.422411f, -0.196974f, -0.0405748f, -0.283428f, + 0.15458f, 0.0876296f, 0.0314038f, 0.16389f, -7.01385f, + -0.117146f, 0.197273f, -0.0400688f, 0.0143951f, -0.964007f, + -0.0618919f, 0.0406891f, 0.07992f, -0.144132f, 0.116416f, + 0.0326838f, 0.103641f, 0.171805f, -1.05158f, -0.182589f, + 0.116991f, 0.0530774f, -0.212454f, -0.016727f, -0.0565992f, + 0.0712873f, 0.0445466f, -0.000107032f, -0.121449f, -0.15148f, + 0.0220338f, 0.0762024f, 0.12253f, 0.0622466f, 0.0835822f, + 0.0465119f, -0.388743f, -0.34665f, -0.0720734f, -0.101581f, + -0.630565f, -0.0512685f, -0.520541f, 0.0530119f, -0.0245276f, + -0.19116f, -0.0144446f, -0.0604486f, 0.187251f, -0.021341f, + -0.217823f, 0.0510256f, -0.197946f, 0.060955f, -0.0617316f, + 0.0741673f, 0.117591f, -1.47844f, -0.0911093f, 0.359225f, + 0.145027f, 0.127513f, 0.0617905f, 0.141154f, -7.63868f, + -0.0808127f, 0.274843f, 0.00693195f, -0.0283113f, -0.853871f, + -0.15737f, 0.0858904f, 0.0746279f, 0.109912f, 0.193775f, + 0.0698094f, 0.174159f, 0.259556f, -1.49885f, -0.156706f, + -1.04113f, -0.0329546f, -0.0491449f, -0.0304125f, 0.0514892f, + -0.244284f, 0.126814f, -0.0387081f, -0.153173f, -0.0566748f, + 0.294111f, -0.0170534f, 0.102381f, 0.447606f, -0.0613267f, + -0.0636869f, -0.0347599f, -0.259572f, -0.0657846f, 0.454352f, + -0.169453f, -0.00177987f, 0.133279f, -0.0863932f, -0.134423f, + -0.475107f, -0.00448962f, -0.214607f, 0.111413f, 0.194377f, + -0.0710837f, 0.0562353f, 0.0401193f, 0.248595f, 0.538374f, + 0.449469f, -0.39111f, 0.0125057f, 0.0448811f, -0.00707751f, + -0.164894f, -0.317516f, -0.56231f, -0.270262f, 0.127016f, + -0.12092f, -0.0881587f, -0.323908f, 0.872344f, 0.103391f, + 0.267971f, -0.155088f, -0.0136683f, 0.309517f, 0.119901f, + 0.271307f, -0.188463f, 0.185121f, -0.142777f, -0.110535f, + -0.163107f, 0.175502f, 0.0801924f, 0.240499f, 0.0874759f, + 0.308907f, -0.00222504f, 0.193366f, 0.109018f, -0.0772158f, + -0.520675f, 0.0259432f, -0.736666f, -0.296579f, 0.043486f, + -0.128932f, 0.0417669f, 0.125747f, 0.157879f, 0.112857f, + -0.0595681f, 0.0611936f, -0.042125f, -0.270338f, 0.120072f, + -0.36675f, -0.0347962f, -0.119539f, 0.0873369f, 0.296432f, + -0.069501f, -0.0383859f, 0.0913597f, -0.40747f, 0.234276f, + 0.332536f, -0.732132f, -0.312291f, 0.137759f, 0.227593f, + 0.14165f, 0.129068f, 0.102734f, 0.135818f, -7.35883f, + -0.101533f, 0.256027f, -0.0142278f, -0.0561601f, -1.09899f, + -0.106538f, 0.0612256f, 0.099487f, -0.0605983f, 0.134311f, + 0.052226f, 0.143672f, 0.219944f, -1.47539f, -0.101828f, + -0.429979f, 0.010478f, -0.0132605f, 0.103363f, 0.0267373f, + -0.338865f, 0.0090188f, 0.0810085f, -0.124368f, -0.0133776f, + 0.595666f, -0.00162201f, -0.212444f, -0.26342f, 0.0913656f, + -0.106279f, 0.414515f, -0.709901f, -0.00198859f, 0.305288f, + -0.188536f, -0.0377482f, -0.131909f, -0.116099f, 
-0.236827f, + -0.36356f, 0.0179455f, -0.202143f, -0.00395508f, 0.177363f, + 0.0630679f, -0.145173f, -0.0558639f, -0.44879f, -1.55687f, + 0.473398f, 0.50531f, -0.0656231f, -0.137197f, 0.064707f, + 0.122083f, 0.0321111f, -0.167096f, 0.0406581f, -0.0793592f, + -0.0777081f, 0.0321379f, -0.0108834f, -0.0652323f, -0.102918f, + 0.0178664f, 0.0781873f, 0.0613189f, -0.04177f, 0.159566f, + 0.15134f, -0.445996f, -0.384905f, 0.0951659f, -0.175046f, + 0.255746f, 0.177047f, -0.150632f, 0.200522f, 0.00778549f, + 0.232168f, -0.0304652f, 0.083155f, -0.125395f, -0.0203289f, + -0.23874f, 0.0349836f, 0.231701f, -0.14849f, -0.204272f, + -0.198309f, -0.364955f, -0.228428f, 0.0614142f, -0.040976f, + -0.227785f, -0.0898404f, 0.271566f, -0.209196f, 0.0226431f, + -0.0911715f, 0.0840369f, -0.299411f, -0.529182f, 0.0622292f, + 0.202475f, 0.0155583f, -0.083114f, 0.124253f, -0.22721f, + -1.02565f, 0.193961f, -0.54287f, -0.00849364f, 0.11124f, + 0.0993531f, 0.120621f, 0.0959537f, 0.136274f, -5.23358f, + -0.107433f, 0.155286f, -0.0136043f, -0.0246768f, -0.631187f, + -0.0493852f, 0.0446751f, 0.0588353f, 0.160766f, -0.0354385f, + -0.0672548f, 0.243743f, 0.186004f, -1.20199f, -0.151872f, + -0.0760096f, -0.00775123f, -0.0122227f, 0.0891327f, -0.377876f, + -0.469926f, -0.134715f, -0.0969362f, 0.212542f, 0.0871489f, + 0.164638f, -0.0485785f, -0.167754f, -0.515052f, 0.13821f, + 0.0515572f, -0.430691f, -0.394719f, 0.143947f, -0.00670816f, + 0.129623f, 0.140299f, 0.0336978f, 0.153545f, -0.350927f, + -0.213485f, 0.0344809f, 0.0405889f, 0.0749967f, -0.369352f, + -0.109398f, 0.0350649f, 0.190893f, -0.284106f, -0.185376f, + 0.0105842f, 0.263692f, 0.160429f, 0.0998209f, -0.127779f, + 0.140558f, 0.108968f, -0.0122672f, 0.102875f, -5.72172f, + -0.161288f, 0.135935f, -0.0143087f, 0.106556f, -0.649813f, + -0.123049f, -0.0108861f, 0.102918f, -0.298137f, 0.0329013f, + 0.100763f, 0.12018f, 0.100782f, -0.648036f, -0.111122f, + 0.12363f, 0.0211952f, -0.225201f, 0.0506021f, 0.0167621f, + 0.0608759f, -0.0245646f, 0.0503477f, -0.0972749f, -0.0415155f, + -0.00578366f, -0.0977591f, 0.124867f, 0.0134788f, -0.0375816f, + -0.00581233f, -0.272292f, -0.250393f, 0.024511f, -0.184891f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_2_bias[] = { + 0.182474f, 0.0223202f, 0.204111f, 0.0573683f, 0.111143f, + 0.0800926f, -0.0364215f, 0.192371f, 0.00498262f, 0.302543f, + 0.0133081f, 0.119719f, 0.237522f, -0.266705f, 0.129427f, + 0.0695857f, 0.22068f, 0.231667f, 0.405829f, -0.0972567f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_3_kernel[] = { + -0.0393876f, -0.269924f, -0.0703231f, -0.0236484f, 0.170478f, + 0.245566f, 0.175963f, 0.104194f, -0.0490501f, -0.157605f, + -0.0275165f, -0.0169499f, -0.250725f, 0.215203f, -0.00733655f, + 0.0111298f, 0.205606f, 0.928046f, 0.15139f, 0.0955483f, + -0.015115f, -0.126643f, 0.0957605f, -0.140178f, -0.0246866f, + 0.097097f, 0.116287f, 0.177746f, 0.0570021f, -0.0518686f, + -0.0446482f, -0.0125318f, 0.0116092f, 0.102431f, 0.0898519f, + 0.0870372f, -0.843274f, 0.383311f, -0.102761f, -0.0246494f, + 0.0312555f, 0.19472f, 0.111573f, 0.0920392f, -0.0555618f, + 0.326461f, 0.219357f, -0.133727f, -0.118399f, -0.0611432f, + -0.169931f, 0.123733f, -0.204607f, 0.082592f, 0.0323181f, + 0.201618f, -0.00388867f, -0.053583f, 0.0266333f, -0.0951787f, + -0.0358283f, -0.0649549f, 0.0119263f, -0.11812f, 0.209851f, + -0.036616f, -0.014911f, -0.138096f, -0.139664f, -0.207395f, + 0.0128848f, -0.201816f, 0.0899419f, 0.343308f, -0.0096243f, + -0.212605f, -0.0905284f, -0.0597114f, -0.055261f, -0.0653405f, + 
0.0330484f, -0.27681f, -0.0994095f, -0.0468272f, 0.145713f, + 0.267216f, 0.185335f, 0.1798f, -0.0437882f, -0.200401f, + -0.0398117f, -0.0736501f, -0.166349f, 0.203316f, 0.0710647f, + 0.061825f, 0.281131f, 0.733323f, 0.215488f, 0.00145659f, + -0.138995f, -0.0833713f, 0.107809f, -0.105343f, -0.0672139f, + 0.101852f, 0.135455f, 0.132903f, 0.0312017f, -0.0643586f, + -0.0274546f, -0.0687466f, -0.020233f, 0.109444f, 0.0774587f, + 0.139497f, -0.800587f, 0.325783f, -0.0546695f, -0.092003f, + -0.0773301f, 0.189672f, 0.0604666f, 0.0939425f, 0.679495f, + 0.114789f, -0.161153f, 0.12843f, -0.0345385f, -0.134641f, + -0.153995f, 0.0823055f, -0.0349296f, 0.0299183f, -0.0606872f, + 0.137588f, 0.0449805f, -0.0555399f, -0.00553351f, -0.120719f, + -0.204701f, -0.0739813f, 0.0584115f, -0.104833f, -0.110989f, + 0.00845446f, 0.0630702f, -0.147861f, 0.0268545f, -0.216419f, + 0.00531986f, -0.206641f, 0.253082f, 0.413215f, -0.05909f, + -0.0939983f, -0.116818f, -0.0450892f, -0.0551134f, -0.00696931f, + -0.113003f, -0.289192f, -0.00884866f, -0.0365724f, 0.0401887f, + 0.238622f, 0.149151f, 0.175751f, -0.157425f, -0.138924f, + -0.0277598f, -0.0285915f, 0.10165f, 0.209532f, 0.0862249f, + 0.0256428f, 0.623204f, -0.0941196f, 0.20345f, -0.132869f, + 0.00947298f, -0.14753f, 0.103918f, -0.161799f, 0.125566f, + 0.10916f, 0.115446f, 0.135627f, -0.0181667f, -0.0734694f, + -0.0154729f, -0.085849f, -0.000427605f, 0.113614f, 0.0776308f, + 0.111899f, -0.214917f, 0.393234f, -0.132223f, 0.020783f, + -0.074902f, 0.217477f, 0.107883f, 0.109466f, 0.146609f, + 0.317061f, 0.074379f, -0.0505457f, -0.0503772f, -0.0678954f, + -0.220003f, 0.114878f, 0.176014f, -0.00657996f, -0.0875497f, + 0.065582f, 0.00238612f, -0.063395f, 0.0295323f, -0.127126f, + 0.099813f, -0.115452f, 0.0106309f, -0.179632f, -0.0436553f, + 0.0120295f, 0.0652713f, -0.131512f, -0.081714f, -0.205363f, + -0.0374944f, -0.196707f, 0.680568f, -0.00991824f, -0.0212223f, + -0.186258f, -0.432361f, -0.0291303f, -0.0475983f, -0.071383f, + -0.0116416f, -0.28257f, -0.0635272f, -0.0576546f, -0.280129f, + 0.286528f, 0.199997f, 0.192851f, 0.323829f, -0.185006f, + -0.04791f, -0.0882187f, -0.0496895f, 0.293135f, 0.125539f, + 0.0341828f, 0.993452f, 0.0369177f, 0.0453796f, 0.0329807f, + 0.157673f, -0.153195f, 0.122383f, -0.161983f, -0.317619f, + 0.105129f, 0.155673f, 0.152489f, 0.0685417f, -0.0595907f, + -0.026657f, -0.0954336f, -0.0359557f, 0.105617f, 0.0825066f, + 0.100189f, -0.22125f, 0.382508f, -0.0247677f, -0.115807f, + -0.0639787f, 0.177786f, 0.0566206f, 0.0496389f, 1.31533f, + 0.0482907f, -0.118743f, 0.190632f, 0.172867f, -0.108446f, + -0.200186f, 0.122572f, 0.0897468f, 0.0155328f, -0.0380217f, + 0.125161f, -0.141723f, -0.023157f, 0.0270805f, -0.101961f, + 0.12358f, -0.0866255f, 0.00306761f, -0.131764f, -0.461118f, + -0.00803936f, 0.0895496f, -0.153905f, 0.207623f, -0.249099f, + -0.0198487f, -0.160013f, 0.81136f, -0.109978f, -0.0880332f, + -0.0761368f, -0.0755881f, -0.0384827f, -0.0554777f, -0.0750048f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_3_bias[] = { + 0.0106809f, 0.136699f, 0.285316f, 0.395746f +}; + +static const float av1_intra_mode_cnn_partition_cnn_layer_4_kernel[] = { + -0.0161019f, -0.088871f, 0.0463358f, -0.198037f, 0.038122f, + 0.0135483f, -0.196641f, -0.433531f, 0.527972f, -0.143716f, + 0.558627f, 0.459889f, 0.322864f, -0.491514f, -0.190915f, + -0.0765601f, 0.210329f, 0.689389f, -0.100415f, -1.8788f, + 0.2228f, 0.292781f, -0.954838f, -0.0788763f, -0.131402f, + -0.17154f, 0.049934f, -0.0541183f, -0.530529f, -0.666165f, + 0.195492f, 0.218548f, 
-0.314895f, 0.0749444f, -0.191344f, + 0.349469f, 0.00811248f, -0.760157f, 0.0707434f, -0.0719285f, + -0.264495f, -0.432009f, -0.432686f, 0.155738f, -0.020197f, + 0.19278f, -0.658335f, -0.273143f, -0.286079f, 0.243402f, + 0.497701f, 0.0121003f, -0.666308f, 0.028172f, -0.547901f, + -0.11755f, 0.322028f, 0.0878274f, -0.0328334f, 0.311816f, + 0.0951026f, -1.11429f, -0.0417486f, 0.123467f, -0.0910681f, + -0.0154255f, 0.311201f, -0.0156158f, -0.600437f, 0.0274156f, + -0.174907f, -1.29313f, -0.178656f, 0.596556f, -0.421725f, + -0.289137f, 0.529297f, 0.114833f, -0.0155887f, -0.308232f, + -0.0228361f, 0.184017f, 0.138232f, 0.146347f, -0.117867f, + 0.248351f, -0.282846f, -0.18058f, 0.348355f, -0.415754f, + 0.0657168f, 0.431728f, -0.231043f, -0.186745f, 0.137401f, + -0.282329f, -0.159678f, 0.754262f, 0.037824f, -1.68521f, + -0.290175f, 0.289588f, -0.18683f, -0.300385f, 0.285449f, + -0.00386456f, 0.0563485f, -0.376541f, 0.159899f, -0.697312f, + 0.0284389f, 0.437307f, 0.3968f, -0.372082f, -0.232535f, + 0.394629f, 0.00315248f, -0.38374f, 0.0311291f, -0.624353f, + 0.498083f, -0.342663f, -0.125978f, 0.186797f, 0.187723f, + 0.149335f, -0.82727f, -0.0740974f, -0.659039f, 0.42671f, + -0.448835f, 0.150677f, 0.830742f, -0.233148f, -0.65308f, + -0.0878935f, -0.407797f, -0.511826f, -0.0739023f, 0.506305f, + -0.187451f, 0.0284968f, -0.822238f, 0.362523f, -0.270865f, + 0.032335f, 0.560413f, -0.00388247f, -0.446333f, 0.163147f, + -0.409633f, -0.372575f, 0.306993f, 0.55953f, -0.24362f, + -0.0929369f, -0.520298f, -0.444022f, 0.186077f, -0.0942208f, + 0.624049f, -0.429625f, -0.869528f, 0.405257f, -0.120445f, + 0.537685f, -0.3911f, 0.142142f, 0.0913808f, -0.00375967f, + 0.382781f, 0.60505f, -0.271608f, -0.0630436f, -0.150625f, + -0.0124598f, 0.0132878f, 0.138475f, -0.106264f, -0.416581f, + -0.518415f, 0.185127f, -0.464622f, -0.0102925f, 0.0389567f, + 0.406439f, -0.0414264f, -0.366185f, -0.511867f, -0.650255f, + 0.278252f, 0.0270234f, 0.262788f, -0.0294793f, 0.12651f, + 0.421537f, 0.0300837f, 0.0742187f, 0.281954f, -0.122069f, + -0.450145f, -0.312206f, -0.402633f, -0.0868137f, 0.190433f, + -0.149602f, -0.175029f, 0.00900023f, -0.266596f, 0.21721f, + -0.245079f, -1.09798f, 0.319409f, -0.337938f, 0.358514f, + 0.0771549f, 0.447087f, -0.305507f, -0.285492f, 0.383896f, + 0.145933f, -0.264944f, -0.118486f, 0.068805f, -0.194231f, + -1.79133f, 0.363408f, -0.17434f, -0.229629f, 0.132188f, + 0.207548f, -0.876264f, 0.265634f, 0.139332f, 0.236206f, + -0.0145184f, 0.562865f, 0.526612f, -0.0333508f, -0.421885f, + 0.273485f, -0.110882f, 0.425557f, 0.513303f, -0.422322f, + 0.0563155f, -0.0409693f, 0.194768f, -0.419828f, -0.107195f, + -1.19224f, 0.48552f, 0.132782f, -0.00932096f, -0.225484f, + -0.428484f, -0.0392684f, 0.750697f, 0.337615f, 0.158476f, + 0.413484f, 0.326017f, -0.757107f, -0.183962f, 0.00884361f, + 0.126507f, -0.0751588f, -0.308782f, -0.104237f, -0.703877f, + -0.491806f, -0.204251f, -0.317212f, 0.0815479f, 0.296323f, + 0.219632f, -0.039859f, 0.556257f, 0.176144f, -0.0750654f, + -0.106419f, 0.00400385f, -0.172266f, 0.000178763f, 0.146532f, + 0.255202f, -0.427235f, -0.182198f, -0.256557f, 0.260255f, + -0.0143364f, 0.0868664f, -0.564373f, -0.0876947f, 0.726289f, + 0.0160001f, -0.381562f, -0.638214f, -0.803803f, 0.25945f, + -0.371542f, -0.419611f, 0.238617f, 0.371834f, -0.226777f, + -0.894602f, 0.37458f, -0.354866f, 0.0249312f, 0.142374f, + 0.433813f, -0.0218183f, -0.33248f, 0.107223f, 0.390823f, + -0.0271108f, -0.616878f, -0.604984f, 0.517269f, -0.293573f +}; + +static const float 
av1_intra_mode_cnn_partition_cnn_layer_4_bias[] = { + -0.290371f, -0.0560272f, -0.118144f, -0.270583f, 0.401388f, + -0.308677f, 0.150729f, -0.0324442f, -0.135937f, 0.0875581f, + 0.0206493f, -0.212682f, -0.0266535f, -0.326656f, 0.0185105f, + -1.01429f, -0.00315052f, -0.0273938f, -0.0263379f, -0.171702f +}; + +static const CNN_CONFIG av1_intra_mode_cnn_partition_cnn_config = { + NUM_CNN_LAYERS, // num_layers + 0, // is_residue + 0, // ext_width + 0, // ext_height + 0, // strict_bounds + { + { + CNN_LAYER_0_IN_CH, // in_channels + CNN_LAYER_0_WIDTH, // filter_width + CNN_LAYER_0_WIDTH, // filter_height + CNN_LAYER_0_OUT_CH, // out_channels + CNN_LAYER_0_HORZ_STRIDE, // skip_width + CNN_LAYER_0_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_0_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_0_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + -1, // output_num + }, + { + CNN_LAYER_1_IN_CH, // in_channels + CNN_LAYER_1_WIDTH, // filter_width + CNN_LAYER_1_WIDTH, // filter_height + CNN_LAYER_1_OUT_CH, // out_channels + CNN_LAYER_1_HORZ_STRIDE, // skip_width + CNN_LAYER_1_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_1_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_1_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 3, // output_num + }, + { + CNN_LAYER_2_IN_CH, // in_channels + CNN_LAYER_2_WIDTH, // filter_width + CNN_LAYER_2_WIDTH, // filter_height + CNN_LAYER_2_OUT_CH, // out_channels + CNN_LAYER_2_HORZ_STRIDE, // skip_width + CNN_LAYER_2_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_2_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_2_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 2, // output_num + }, + { + CNN_LAYER_3_IN_CH, // in_channels + CNN_LAYER_3_WIDTH, // filter_width + CNN_LAYER_3_WIDTH, // filter_height + CNN_LAYER_3_OUT_CH, // out_channels + CNN_LAYER_3_HORZ_STRIDE, // skip_width + CNN_LAYER_3_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_3_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_3_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 1, // output_num + }, + { + CNN_LAYER_4_IN_CH, // in_channels + CNN_LAYER_4_WIDTH, // filter_width + CNN_LAYER_4_WIDTH, // filter_height + CNN_LAYER_4_OUT_CH, // out_channels + CNN_LAYER_4_HORZ_STRIDE, // skip_width + CNN_LAYER_4_VERT_STRIDE, // skip_height + 0, // maxpool + av1_intra_mode_cnn_partition_cnn_layer_4_kernel, // weights + av1_intra_mode_cnn_partition_cnn_layer_4_bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + NO_BRANCH_CONFIG, // branch_config + NO_BN_PARAMS, // bn_params + 0, 
// output_num + }, + }, +}; + +static const float + av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel[] = { + 0.604356f, -0.236007f, 0.342172f, 0.531397f, -0.635698f, + -0.591573f, 0.833872f, 0.492814f, -0.100308f, 0.186385f, + 0.202779f, 0.263578f, 0.330001f, -0.15531f, 0.879584f, + -0.0048796f, 0.490796f, 0.242254f, -0.292211f, -0.696912f, + 0.746664f, 0.129371f, -0.0122443f, 0.196234f, -0.251605f, + -0.385617f, 0.157707f, 0.699963f, 0.0432536f, -0.11141f, + -0.0353473f, -0.0364045f, -0.113556f, -0.520842f, 0.231248f, + 0.230638f, -0.323852f, -1.08633f, -0.0469168f, -0.481821f, + 0.366838f, 0.189627f, -0.0637262f, -0.484917f, -0.109874f, + 0.292237f, 0.368702f, -0.183896f, -0.109038f, -1.22613f, + -0.880355f, -1.63768f, 0.337426f, -0.940994f, 0.413097f, + -0.37879f, -0.480525f, -0.594819f, -0.0172653f, -0.499436f, + -0.298395f, -0.840181f, -0.0758645f, -0.772089f, -0.232727f, + -0.815968f, 0.160785f, -0.0767165f, 0.0064244f, -0.540491f, + 0.417776f, -0.384337f, -0.497377f, 0.68414f, 0.00797514f, + 0.262626f, 0.203732f, 0.702047f, 0.0617544f, 0.0878249f, + -0.315032f, -0.0169776f, 0.403986f, 0.815872f, 0.135388f, + 0.0858594f, 0.169172f, -0.638227f, -1.65268f, -0.0476042f, + -0.982685f, 0.45707f, -0.0577537f, 0.367329f, 0.176513f, + -0.356454f, 0.0979095f, -0.277476f, 0.257271f, -0.333451f, + 0.0241497f, 0.0671127f, 0.221216f, 0.106065f, 0.537151f, + 0.0257329f, 0.265559f, -0.348353f, 0.285569f, -0.0610511f, + -1.59334f, -1.63826f, -0.164898f, -0.36605f, -0.489304f, + 0.729241f, 0.0197627f, 0.200291f, -0.231506f, -0.255715f, + -0.0932264f, -0.728793f, 0.468297f, -1.09592f, -0.079791f, + -1.76531f, -0.182904f, -2.05897f, -0.371894f, 0.207124f, + 0.255029f, 0.186501f, -0.005805f, 0.00160733f, -0.178206f, + -0.352757f, -0.164741f, -0.557583f, -0.559692f, -0.00731467f, + 0.149326f, 0.409735f, 0.22083f, -0.332572f, -0.1741f, + -0.0519008f, -0.266402f, 0.294031f, -2.4453f, 0.339851f, + -0.573747f, -5.97783f, -0.084142f, 0.20286f, -0.576038f, + -0.111081f, 0.101238f, -5.83427f, -1.98537f, 0.322796f, + -0.60171f, 0.212412f, 0.247176f, 0.603694f, -0.54357f, + -0.693439f, 0.250725f, -4.31988f, 0.0935924f, 0.43669f, + -0.139706f, -0.158391f, 0.244309f, 0.619213f, -0.309154f, + -0.135341f, 0.475815f, -0.290804f, -0.109038f, -0.0937104f, + 0.0385907f, -0.29105f, -0.0597651f, -0.451187f, -1.51821f, + 0.141772f, 0.822204f, -0.729661f, -0.109908f, 0.178217f, + -0.750278f, 0.113762f, -0.0959985f, 0.066579f, -0.104209f, + -0.951378f, 1.4087f, -1.13175f, -1.09103f, -1.50416f, + -0.182273f, -1.80129f, -0.152135f, 0.356931f, 0.205591f, + 0.183148f, -0.498671f, -0.183034f, -0.176428f, 0.395706f, + -0.589908f, -0.318276f, -0.421162f, 0.658766f, -0.186752f, + 0.0656253f, 0.248002f, 0.289618f, -0.458111f, -0.130789f, + -0.542988f, 0.405804f, -0.35364f, -0.311927f, 0.218339f, + 0.309215f, -0.130347f, -0.0257543f, 0.0413234f, -0.190205f, + -0.242382f, 0.819886f, -0.255157f, -0.181219f, -0.290903f, + -0.301995f, -0.0469988f, 0.702936f, 0.209122f, 0.0234243f, + 0.598637f, 0.0305196f, 0.0423457f, -0.618799f, 0.0190867f, + 0.420584f, -0.224752f, -0.410077f, 0.127854f, 0.395261f, + -0.393685f, -0.282822f, 0.0289504f, 0.0406515f, -0.511531f, + -0.497611f, 0.0252715f, 0.0812549f, 0.80205f, 1.29084f, + 0.764972f, 0.561258f, -0.23499f, 0.217594f, -0.690935f, + -0.26607f, 0.357955f, 0.391608f, 0.448352f, 0.458586f, + -0.790071f, 0.719959f, -0.468052f, 1.24579f, 0.220705f, + 0.284044f, 0.141346f, 0.246687f, 0.147826f, -0.403557f, + -0.00648195f, 0.398034f, -0.100464f, -0.77107f, -0.188274f, + -0.219245f, -0.0330375f, 
0.367585f, -0.220391f, 0.308736f, + 0.221399f, 0.340292f, 0.037597f, 0.606083f, 0.665634f, + -0.755529f, -0.95989f, -0.243673f, 0.233709f, -0.454628f, + -0.110952f, 0.776062f, 0.731136f, -0.140422f, 0.19261f, + 0.355086f, 0.975026f, 0.190936f, 0.776205f, 0.982781f, + 0.555569f, 0.42382f, -0.409721f, 0.25053f, -0.271328f, + 0.859941f, -0.0210901f, 0.0176916f, -0.562895f, -0.0787431f, + -0.861032f, -0.34022f, -0.571995f, 0.205436f, 0.346968f, + 0.377033f, -1.08484f, 0.297007f, -1.01693f, 0.189463f, + -0.483242f, 0.147058f, 0.0159503f, 0.0908779f, -0.46962f, + 0.174024f, -0.490704f, -0.383501f, -0.0507626f, 0.00902188f, + -0.202495f, 0.205047f, 0.0562261f, -0.143371f, 0.219524f, + -0.317294f, -0.0575756f, -0.0595825f, -0.000625279f, -0.278864f, + -0.0516874f, -0.225259f, 0.429046f, -0.0952421f, 0.0799135f, + -0.122883f, -0.262308f, -0.481006f, -0.0466122f, -0.402822f, + 0.150595f, -0.0919558f, -0.356765f, -0.199222f, 0.219389f, + -0.214452f, -0.196361f, -0.095758f, -0.115891f, -0.143777f, + 0.549843f, -0.113036f, 0.764895f, -0.0114812f, -0.0684054f, + -0.98045f, -0.0170634f, 0.247719f, -0.18718f, -0.381566f, + 0.150758f, -0.526257f, 1.00851f, 0.776634f, 1.69728f, + -0.303058f, 0.228967f, -0.414134f, 0.0858226f, -0.285472f, + 0.431459f, 0.315318f, 0.587835f, 0.335737f, -0.0222039f, + 0.18945f, 0.274008f, 0.609263f, 0.320232f, -0.214137f, + -0.0297668f, 0.0439046f, -0.52821f, -0.0127375f, 0.431885f, + 0.508846f, -0.329189f, -0.166778f, -0.94338f, -0.358807f, + 0.208641f, -0.517986f, -0.128278f, 0.693464f, -0.24408f, + -0.0669412f, -0.410287f, 0.0444145f, -0.264179f, 0.143884f, + 0.276842f, 0.498934f, -0.682557f, -0.217198f, -0.8249f, + -0.40446f, -0.115376f, 0.417934f, 0.65605f, -0.00570035f, + -0.365742f, -0.367625f, 0.526824f, -0.0164913f, -0.255998f, + 0.247292f, 0.0846536f, 0.109302f, -0.302996f, 0.160564f, + 0.0228132f, 0.035211f, -0.236951f, 0.493801f, 1.37315f, + -0.182348f, 0.234437f, -0.256906f, 0.12523f, 0.667113f, + -0.437981f, -0.0721831f, 0.303976f, -0.041336f, -0.145894f, + -0.733741f, 0.436056f, 0.368542f, -0.149072f, -0.290281f, + 0.0946743f, -0.0579292f, 0.264539f, 0.170048f, 0.262411f, + 0.049679f, 0.371369f, 0.760675f, 0.482157f, -0.0196783f, + 0.260888f, 0.948856f, 0.170228f, -0.134432f, -0.942235f, + -1.23226f, -0.373963f, -0.0381773f, -0.17947f, 0.00947998f, + 0.01086f, 0.389578f, -0.380389f, -0.0865851f, -0.220328f, + -0.171901f, -0.384325f, -0.0787615f, 0.392678f, 0.123392f, + -0.0895824f, 0.00480886f, -0.162918f, 0.214336f, -0.00147339f, + 0.203899f, -0.00292344f, -0.148594f, 0.0425697f, -0.306896f, + -0.342225f, -0.45088f, -0.184454f, -0.00923638f, -0.521993f, + -0.334464f, 0.156497f, -0.0856832f, -0.277661f, -0.0721105f, + -0.488781f, -0.509543f, -0.012664f, 0.0940558f, -0.29869f, + 0.0434843f, -0.0178945f, -0.0525666f, -0.303178f, 0.713507f, + -0.137413f, -0.170289f, -0.142942f, -0.316002f, 0.229125f, + -0.277585f, 0.0125026f, 0.508316f, -1.20614f, -0.915129f, + -1.63389f, -0.454604f, -0.893951f, -0.447403f, -0.751423f, + 1.3886f, 0.617818f, 0.611458f, -0.884173f, -0.7779f, + -0.608639f, -0.164759f, -0.631846f, -0.176894f, -0.459361f, + -0.187119f, 0.173283f, -0.477191f, -0.156736f, 0.182675f, + 0.598854f, -0.489941f, -0.420493f, -0.162002f, 0.344418f, + 0.33832f, -0.187463f, -0.388721f, -0.0733151f, -0.138835f, + 0.313699f, 0.0625967f, -0.291488f, 0.114088f, -0.356843f, + 0.197506f, 0.0320749f, 1.16745f, -0.36081f, 1.63416f, + 0.198392f, 1.13928f, -0.317971f, 0.531019f, 0.526518f, + 0.185814f, 0.0923607f, 0.192858f, -0.234378f, 0.18091f, + -0.228837f, 0.397216f, 
0.581501f, 0.284376f, -0.130434f, + 0.20076f, 0.242662f, -0.0480872f, 0.131746f, 0.362712f, + 0.0146821f, 0.475679f + }; + +static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias[] = { + 0.477356f, 0.385222f, 0.389122f, 0.539506f, -0.0272558f, 0.581605f, + -0.800961f, 0.142229f, 0.117549f, -0.0724944f, 0.102095f, -0.71319f, + -0.0162434f, -0.132858f, 0.543411f, -0.626599f +}; + +static const float + av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel[] = { + 0.195436f, -0.623354f, 1.27907f, 0.270071f, -0.677612f, + 0.0266141f, 0.272991f, -0.425446f, 0.891889f, -0.299836f, + -0.611825f, -0.0322273f, 0.185276f, 0.238639f, -0.150954f, + 0.083495f, -0.472106f, 0.573506f, 1.16465f, -0.154947f, + 0.640631f, -1.59467f, -9.8166f, -0.22889f, -0.189912f, + 0.227052f, -0.540787f, 0.0840873f, -3.04293f, -0.0209975f, + -6.10979f, -5.92801f, 0.288467f, -0.169476f, 0.0527948f, + -1.21202f, -0.280915f, 0.290863f, -0.601877f, 0.0598784f, + -0.592136f, -0.535588f, -0.0434018f, -0.653223f, 0.00339129f, + -0.133273f, 0.279463f, 0.483879f, 0.463664f, -0.14174f, + -1.56354f, 0.560043f, -1.44639f, 0.673528f, -0.108418f, + -0.707313f, 0.49633f, -0.0321971f, 0.411475f, -0.382184f, + -0.965501f, -0.0507655f, 0.540415f, -0.977297f, 0.370382f, + -0.375683f, 0.0844529f, -2.0002f, -0.346289f, 0.621251f, + -0.489855f, 0.191252f, -0.576629f, -0.35773f, 0.023167f, + 0.180793f, -0.417864f, 0.0587254f, 0.167824f, 0.0612058f, + -0.712108f, 0.155614f, 0.900036f, -0.480124f, 0.146117f, + 0.467011f, 0.412525f, 0.312724f, 0.551826f, -0.179601f, + 0.706261f, 0.00674965f, -0.495221f, 0.140829f, -0.0619195f, + -0.0697912f, 0.511967f, -0.0318237f, -0.285946f, -0.28608f, + 0.0894142f, 0.234351f, -0.272328f, -0.350369f, -0.392605f, + 0.287318f, 0.310426f, 0.293524f, 0.357681f, -0.157868f, + 0.149652f, -0.259363f, 0.192941f, -0.850096f, 0.456507f, + 0.387857f, -0.491187f, -0.0541993f, -0.28118f, 0.193991f, + -0.0956664f, 0.0679829f, 0.0341118f, 0.141826f, 0.271538f, + -0.285295f, -0.68666f, 0.306414f, 0.600678f, 0.494801f, + -1.11907f, 0.524849f, 0.151169f, 0.474068f, -0.43441f, + -0.229138f, 0.0345483f, 0.682888f, -0.471534f, -0.0457066f, + -2.36721f, 0.446407f, 0.20396f, -1.17868f, 0.815363f, + -1.13897f, 0.397217f, -0.593796f, -6.95512f, 0.650695f, + 0.771657f, 0.15227f, -0.824519f, 0.617854f, -0.295353f, + -0.101207f, 0.600989f, -0.550653f, -0.722371f, 0.292006f, + -0.451891f, 0.54544f, 0.354278f, 0.0136258f, 0.192003f, + 0.258275f, -0.0443647f, 0.0928186f, 0.667775f, 0.239558f, + 0.0523887f, 0.71586f, 0.292563f, 0.362479f, 0.373453f, + 0.250638f, -0.423037f, -0.486574f, -0.619397f, 0.343888f, + 0.974971f, 0.574218f, 0.273989f, -0.209956f, -0.274333f, + 0.0553766f, 0.263918f, 0.733824f, 0.038713f, -0.0788992f, + 0.292014f, 0.111808f, -0.197507f, 0.593668f, -0.0245337f, + 0.0873662f, 0.530997f, 0.620717f, 0.310697f, -1.54861f, + 1.12915f, 0.0991346f, -0.59214f, 0.422325f, -0.0157936f, + 0.380975f, 0.626403f, 0.268064f, -0.615231f, -1.43172f, + 0.0928048f, 0.0949026f, -0.470912f, -0.0867527f, -0.0381206f, + 0.178393f, -1.13737f, 0.12798f, 0.258214f, -0.803364f, + 0.177506f, 0.542718f, 0.660656f, 0.145091f, 0.183056f, + -0.47338f, 0.469287f, 0.10832f, 0.0994899f, -0.402719f, + 0.157287f, 0.523071f, -0.324493f, 0.343599f, 0.664839f, + -0.0375519f, -0.279238f, -0.0722333f, 0.395344f, -0.289316f, + 0.0259298f, -0.843245f, -0.160021f, 0.741429f, -1.38726f, + -0.2969f, -0.240443f, 0.247731f, -1.04088f, -0.280454f, + -0.237054f, -0.759227f, 0.0456369f, -0.647453f, -1.02372f, + -0.200395f, -0.546839f, -0.104226f, 
-0.152727f, -0.56685f, + -0.0559663f, -0.425494f, -0.610679f, -0.987096f, -0.575138f, + -0.0887979f, 0.463646f, -1.041f, -0.49412f, -0.175298f, + -0.463296f, -0.955177f, 0.17852f, -1.10694f, 0.181991f, + -0.18998f, 0.227818f, 0.688237f, -1.10444f, 0.549108f, + -0.171849f, -0.245614f, 0.120624f, 1.29571f, 0.607116f, + 0.00809927f, 0.1041f, -1.22918f, -0.212948f, 0.430239f, + -1.57341f, 0.482054f, 0.275905f, 0.939785f, -1.0209f, + -0.355534f, 0.397337f, -0.0593077f, -0.239603f, 0.475483f, + -0.999101f, -0.140578f, 1.04787f, -0.591981f, -0.306989f, + -0.879012f, -0.994715f, 0.0343158f, 0.218509f, 0.34704f, + 0.0672934f, -0.178941f, 0.20509f, -0.360031f, 0.161241f, + -0.324775f, -0.359531f, -0.0657085f, -0.864422f, -0.444865f, + 0.597095f, -0.948691f, 0.240001f, -0.783159f, -0.569422f, + 0.974205f, -1.04539f, 0.345915f, -0.681558f, -0.246047f, + 0.256174f, 0.493667f, 0.681324f, 0.155613f, 0.773309f, + -0.647027f, -0.214744f, -0.474202f, -0.661092f, -1.02316f, + 0.0572593f, -0.437082f, -0.119874f, -0.464877f, -0.58067f, + -0.218029f, 0.319516f, -0.378983f, -0.0698695f, 0.554693f, + -0.537875f, 0.126429f, -0.145113f, -0.594312f, -0.218021f, + -0.703569f, 0.0720548f, 0.261054f, -0.81438f, 0.249921f, + 0.165296f, -0.079028f, -0.322647f, 0.134458f, 0.0975046f, + 0.538594f, -0.250126f, 0.142309f, 0.526486f, 0.0532615f, + -0.383332f, -0.38143f, -0.101611f, 0.519776f, -0.278364f, + -0.23287f, -0.29139f, 0.22353f, 0.472085f, 0.366264f, + 0.741187f, 0.42019f, 0.0676459f, -0.230008f + }; + +static const float av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias[] = { + -0.48603f, -0.578556f, 0.257639f, 0.459915f, 0.178156f, -1.16663f, + 0.828891f, 0.620291f, 0.413257f, -1.00508f, -0.574179f, -1.20623f, + -0.377837f, -0.0360333f, 0.681536f, 0.137189f, -0.458718f, 0.387131f, + 0.0233112f, 0.126045f, 0.361304f, 0.655317f, 0.413134f, 0.769947f +}; + +static const float av1_intra_mode_cnn_partition_branch_0_logits_kernel[] = { + 0.67244f, -2.59179f, 0.50425f, -1.86481f, 1.15891f, -1.26447f, + 0.761081f, 0.645117f, -1.78594f, -0.872703f, -0.192054f, -1.82359f, + -0.560935f, 0.838959f, 0.502264f, -1.28958f, -0.205551f, 0.635671f, + -1.12619f, -1.68277f, 0.83361f, 1.57235f, 1.15839f, 0.35345f +}; + +static const float av1_intra_mode_cnn_partition_branch_0_logits_bias[] = { + 1.14463f +}; + +static const float + av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel[] = { + 0.364612f, 0.237868f, -0.192821f, 0.12364f, 0.522205f, + -0.205785f, -0.503288f, -0.426503f, -0.083073f, 0.0164429f, + 0.184278f, -0.426055f, 0.0717997f, -0.261968f, 0.176412f, + -0.101226f, 0.0400285f, -0.332051f, 0.344385f, 0.189565f, + 0.441162f, 0.330462f, -0.719857f, -1.14209f, 0.557831f, + 0.104756f, 0.0562001f, -0.465923f, -0.344592f, -0.191554f, + -0.0656866f, -0.640162f, 0.419388f, 0.409308f, -1.68632f, + -1.10829f, 0.105485f, -0.14561f, -0.944738f, 0.104629f, + -0.146837f, 0.538823f, -0.153157f, 0.321081f, -1.77714f, + -0.0559296f, 0.324136f, -0.497023f, -1.15793f, -0.740144f, + -0.0888472f, 0.010059f, -0.18394f, -0.234405f, -0.10586f, + 0.130958f, -0.101944f, -0.186483f, -0.447049f, -0.900026f, + 0.128444f, 0.401696f, 0.128509f, 0.123778f, 0.062168f, + -0.321755f, -0.0691584f, 0.254468f, -0.115212f, -0.848885f, + 0.817005f, 0.0615853f, 0.153363f, 0.513855f, 0.789225f, + 0.356168f, 0.371613f, 0.269541f, 0.268173f, 0.220481f, + -0.109063f, -0.00620798f, -0.0334622f, 0.236267f, -0.0235294f, + -0.0800253f, 0.0294184f, 0.047131f, -0.224047f, 0.0890737f, + -0.356293f, 0.0989534f, 0.16799f, 0.498266f, 0.612581f, + -0.372897f, 
-0.75125f, 0.77698f, 1.1032f, -0.0764679f, + 0.0266299f, 0.309532f, 0.461305f, 0.0193521f, -0.0939161f, + -0.276156f, -0.102714f, -0.0828328f, 0.40003f, 0.122542f, + 0.0867203f, -0.170738f, 0.0850642f, -0.130762f, 0.082324f, + -0.115218f, -0.0244491f, 0.0434331f, 0.216453f, 0.443733f, + -0.173679f, -0.161617f, 0.316209f, -0.689656f, -1.52007f, + -0.421018f, 0.430833f, -0.00734122f, 0.284499f, -0.0207885f, + 0.0572024f, -0.878942f, 0.388264f, 0.0191589f, -0.123415f, + -0.0461196f, -0.0444461f, -0.00383171f, 0.0945655f, -0.0597219f, + -0.374918f, 0.0182124f, 0.523083f, 0.00519547f, 0.80513f, + -0.221433f, -1.30591f, -0.416917f, -0.718173f, 0.622999f, + 0.941798f, 0.0477536f, 0.0303772f, 0.268078f, 0.414778f, + 0.394325f, 0.299733f, -0.583208f, 0.309379f, 0.416581f, + 0.0299948f, -0.409145f, -0.161557f, -0.214082f, -0.0098119f, + 0.221912f, 0.107135f, 0.0692518f, 0.00490957f, 0.107613f, + -0.368404f, -0.548006f, 0.208274f, 0.550475f, 0.643678f, + -1.65859f, 0.095938f, -0.0434245f, -0.0792685f, 0.838109f, + -0.0138653f, -0.527573f, -0.123472f, -0.235618f, -0.677401f, + -0.125877f, -0.175604f, -0.203196f, 0.113478f, -0.228323f, + -0.53539f, 0.134458f, 0.0534899f, -0.213006f, -0.138679f, + -2.15023f, 0.186303f, 0.48566f, -1.22301f, -0.240982f, + -0.486836f, -0.121181f, -0.131382f, -0.0320283f, 0.278828f, + 0.342581f, -0.182257f, -0.365193f, -0.226351f, 0.108928f, + -0.100159f, 0.448355f, -0.0768947f, 0.0633719f, -0.104786f, + 0.0456653f, 0.0965752f, 0.156403f, -0.157337f, 0.212259f, + 0.317939f, 0.124193f, -0.329475f, 0.206868f, -2.15986f, + -0.108385f, -0.396769f, -0.0317231f, -0.271524f, -0.184697f, + 0.662615f, 0.412926f, -0.0217462f, -0.0285475f, -0.118826f, + 0.0252706f, -0.137091f, 0.198973f, 0.329509f, -0.0831966f, + -0.621237f, 0.0896179f, 0.805261f, -0.019675f, 0.962452f, + 0.307433f, 0.892168f, -0.537587f, -2.46145f, 0.125606f, + 0.920491f, 0.219462f, 0.292765f, -0.748238f, -0.0537239f, + -0.224326f, 0.505492f, 0.176426f, 0.0343168f, 0.16708f, + -0.581393f, 0.951726f, -1.1777f, -0.561914f, -1.53288f, + 0.864567f, -1.19648f, -1.24141f, -0.334688f, -0.622026f, + 0.666876f, -0.197005f, -0.600507f, -0.851924f, 0.492299f, + 0.31078f, -0.0736115f, 0.030999f, -6.02463e-05f, -0.0604341f, + -0.0254238f, 0.139222f, 0.333235f, 0.366534f, -0.191982f, + -0.0156092f, 0.44234f, -0.0193213f, 0.0938745f, -0.015709f, + -0.12043f, 0.00895591f, 0.0464401f, 0.0530699f, -0.623018f, + -1.23372f, -0.538647f, -1.12389f, 0.26742f, 0.548694f, + 0.00540655f, -0.219703f, 0.314894f, -0.573463f, -0.241555f, + 0.441851f, 0.422491f, 0.253785f, -0.384683f, 0.0370165f, + 0.226669f, 0.245587f, 0.215265f, -0.122272f, 0.0492235f, + 0.000658591f, -0.312877f, 0.436487f, -0.229199f, -0.174373f, + 0.904268f, -0.855845f, -0.877293f, -0.65409f, 0.313795f, + 0.461748f, -0.737766f, -0.228523f, 0.182181f, 0.334522f, + 0.0629676f, -0.151087f, 0.178798f, -0.325809f, -0.331672f, + 0.0865837f, -0.0684225f, 0.0252008f, -0.0820631f, 0.0481863f, + 0.209473f, -0.0242151f, -0.0898919f, -0.163828f, -0.164282f, + 0.581888f, 0.816896f, 0.0607674f, 0.364855f, -0.346512f, + -0.764174f, 0.595561f, 0.302872f, 0.206361f, 0.106917f, + -0.972338f, 0.176948f, 0.6415f, -0.131897f, -0.155802f, + 0.216337f, -0.342511f, 0.123743f, -0.123014f, 0.0205439f, + 0.15173f, -0.23801f, -1.00387f, 0.651328f, 0.237439f, + -0.542952f, 1.066f, -0.161107f, -0.593545f, 0.219343f, + -0.178094f, 0.0789992f, 0.428332f, 0.23827f, -0.327421f, + 0.416144f, 0.00394653f, 0.052046f, -0.238289f, 0.405942f, + 0.00141984f, 0.161017f, 0.077111f, 0.0823985f, 0.0981208f, + 0.109949f, 
-0.0428502f, 0.343629f, -0.722978f, -0.375269f, + -0.111634f, -0.271523f, 0.712093f, 0.684904f, -0.572331f + }; + +static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias[] = { + 0.583367f, -0.202004f, -0.207626f, 0.412451f, -0.258311f, 0.0304954f, + -0.102458f, 0.450087f, -0.376851f, -0.338702f, 0.335226f, 0.889072f, + 0.502411f, 0.649282f, 0.15345f, -0.0109896f +}; + +static const float + av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel[] = { + 0.0214882f, -0.934339f, -0.173335f, 0.8362f, -0.764234f, + 0.525163f, 0.409749f, 0.821539f, -0.784157f, -0.455593f, + 0.446099f, 0.406756f, 0.479242f, -0.814038f, -0.419332f, + 0.328869f, -0.340707f, 0.133219f, 0.0320347f, 0.25089f, + -0.324917f, -0.0684265f, 0.0377777f, -0.262556f, 0.673458f, + -0.0291454f, -0.417957f, -1.0075f, -0.481537f, 0.922105f, + -0.000516239f, -0.40034f, 0.242067f, -0.43178f, 0.32001f, + 0.143599f, -0.345172f, 0.126093f, 0.148518f, -1.12151f, + -1.03435f, 0.551691f, -0.310001f, -0.323194f, -0.595128f, + -0.395689f, 0.737268f, -0.729227f, 0.590804f, -0.590022f, + -1.01427f, -0.521159f, -0.617579f, 1.07292f, -0.613047f, + -0.619093f, 0.335268f, 0.473753f, -0.795027f, 1.24635f, + -0.556193f, 0.241046f, -0.0354181f, -0.354215f, 0.716752f, + -0.00200745f, -1.25171f, -0.440731f, -0.763918f, -0.588614f, + -0.183901f, -0.396056f, 0.226903f, 0.921471f, 1.10465f, + 0.207053f, 0.57681f, -0.555699f, 0.235469f, -0.92149f, + 0.625808f, 0.29653f, -0.81775f, -0.307889f, -1.41384f, + -0.136205f, -0.365314f, -0.516741f, 0.748052f, 0.617947f, + 0.0973239f, 0.839607f, 0.530668f, -0.227032f, -0.449044f, + -1.04725f, -0.244363f, -0.396888f, -0.146161f, 0.359789f, + 0.0436599f, 1.21645f, -0.336069f, 0.0534646f, -0.00200328f, + 0.658551f, -0.156142f, -1.0728f, 0.0951015f, 0.234837f, + -0.380525f, 0.041783f, -0.269273f, 0.0386013f, -0.455589f, + -0.174338f, 0.0345251f, 0.17116f, -0.507642f, 0.210453f, + 0.739987f, -0.0438776f, 0.570145f, -0.118811f, 0.0548662f, + 0.153458f, -0.89887f, 0.493704f, 0.283351f, 0.785441f, + -0.586002f, -0.0616167f, -0.714328f, -0.145941f, -0.449656f, + 0.850117f, 0.279997f, 0.204143f, -0.31356f, 0.947057f, + -0.135787f, 0.747071f, 0.0145968f, -0.81414f, 0.431009f, + -0.275824f, -0.342928f, -0.0528272f, -0.592183f, 0.433915f, + -0.251752f, -0.311815f, -1.47533f, -1.43677f, 0.0698436f, + 1.01341f, 0.305063f, -0.252003f, -0.428915f, -0.00104153f, + -0.368267f, -0.354523f, -0.27956f, -0.771664f, 0.232092f, + -0.428495f, 0.424952f, -0.343229f, 0.196899f, -0.761084f, + -0.0110293f, -0.335361f, 0.571637f, -0.423489f, -0.52773f, + 0.0108043f, -0.504715f, -1.1419f, -0.402904f, -0.160747f, + -0.329184f, 0.375374f, -1.02604f, -0.601371f, 0.631652f, + 0.0742486f, -0.464765f, 0.467445f, 0.240562f, -0.38211f, + -0.459004f, 0.704196f, 0.021357f, 0.860785f, -1.16731f, + -0.479029f, -0.139644f, -0.444087f, 0.322326f, -0.25455f, + 0.874399f, 0.477696f, 0.0464487f, 1.20658f, 0.0993356f, + 0.00682712f, -0.10163f, -0.371765f, -0.629513f, -0.679196f, + -0.193935f, 0.47405f, -0.18238f, 0.254918f, -0.35306f, + -0.375611f, 0.119771f, -0.257282f, -0.565124f, 0.162667f, + -0.356128f, 0.870351f, 0.241847f, -0.264712f, -0.384322f, + 0.31807f, 0.211621f, -0.180767f, 0.764944f, 0.368646f, + 0.186111f, 1.02458f, -0.494252f, -0.483375f, -0.699664f, + 0.00415657f, -0.189376f, -0.677103f, -0.030319f, 0.667087f, + 0.810951f, -0.488237f, -0.387355f, -0.726579f, -0.304763f, + 1.10392f, -0.775977f, -0.247731f, 0.532396f, 1.24089f, + 0.206621f, -0.670568f, -1.08142f, -0.342503f, 0.189854f, + -0.200846f, 0.784204f, 0.641112f, 
-0.509346f, 0.0805264f, + -1.40006f, 0.322084f, -0.823739f, -1.12965f, -0.215668f, + 0.099673f, 0.425966f, 0.771697f, 0.338834f, 0.345364f, + -0.297826f, -0.176746f, -0.297299f, -1.80029f, -0.178348f, + 0.421194f, -0.19155f, 0.417653f, 0.374441f, -0.135654f, + -0.895843f, 0.220647f, 0.368264f, 0.369233f, 0.382707f, + 0.0800511f, 0.542053f, 0.318896f, -0.385539f, 0.313305f, + -1.01166f, -0.222379f, -1.53708f, 1.32407f, -0.665444f, + -0.102348f, 0.0410504f, -0.616825f, 1.3108f, 0.405902f, + 1.27777f, 0.0630558f, -0.172696f, 0.16224f, -1.10111f, + -3.31326f, -0.242566f, 0.831422f, 0.917397f, 0.311749f, + -0.238613f, 0.438007f, -0.407089f, -0.0202555f, -1.82502f, + -0.907965f, -0.300031f, -0.616669f, -0.767921f, 0.285919f, + -0.112019f, 0.252677f, 0.350892f, 0.000214244f, 0.315915f, + 0.260344f, 0.327362f, -0.0211213f, -0.41241f, 0.0418355f, + 0.103328f, -0.0158439f, -0.230505f, -0.0215114f, 0.266739f, + -0.234376f, -0.352583f, 0.0709437f, -0.90649f, -0.535843f, + 1.21322f, -1.05144f, -0.983682f, -0.189956f, 1.14208f, + -0.0188492f, -0.254821f, -0.463214f, -0.708714f, 0.0447348f, + -0.220831f, 0.476299f, 0.102544f, 1.1173f, -0.36981f, + -0.814102f, 0.103604f, -0.247871f, 0.0610701f, -0.356616f, + -0.144093f, 1.66496f, 0.180206f, -1.04384f, -0.65883f, + 0.0290771f, -0.622728f, 0.761523f, -0.909091f, -0.0340348f, + 0.666895f, -0.0232575f, 0.962643f, -2.50103f, -1.69745f, + -0.0482305f, 0.771811f, -1.32233f, -0.778722f, -0.203309f, + 0.395875f, -0.171812f, 0.253794f, 0.432799f + }; + +static const float av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias[] = { + -0.152159f, 0.552347f, -0.806068f, 0.227901f, 0.335896f, 0.180785f, + 0.75277f, 0.982208f, 0.409823f, -0.17755f, -0.125365f, 0.738114f, + 0.202331f, 0.751737f, -0.360511f, 0.149254f, 0.085073f, -0.214542f, + 0.529727f, -0.0348777f, -2.13162f, -0.893332f, -0.136952f, -0.71258f +}; + +static const float av1_intra_mode_cnn_partition_branch_1_logits_kernel[] = { + -0.632145f, 0.738727f, -0.750737f, -0.931571f, -1.79763f, -2.31153f, + 0.912733f, 0.879995f, -1.00602f, -1.02467f, 0.0536835f, 1.76011f, + -0.898546f, 1.06959f, 1.60471f, -1.7312f, -0.877168f, -0.681185f, + -1.57286f, -1.16038f, -4.11303f, -3.06351f, -3.02536f, -2.92186f +}; + +static const float av1_intra_mode_cnn_partition_branch_1_logits_bias[] = { + 1.33207f +}; + +static const float + av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel[] = { + 0.0419551f, 0.0924078f, -0.153084f, 0.191642f, 0.069586f, + -0.530661f, 0.431968f, 0.000453838f, 0.793047f, 0.0161817f, + -0.476075f, -0.156638f, -0.219066f, 0.372716f, -0.0642299f, + 0.156813f, -0.105819f, -0.0519422f, 0.149935f, 0.295544f, + 0.192037f, -0.0450383f, 0.828794f, -0.0510661f, -1.22549f, + -0.100293f, -0.178274f, 0.0304427f, -0.0664097f, -0.0438936f, + 0.948248f, 0.425486f, -0.238206f, 1.3744f, 0.336897f, + 0.0760769f, -0.583508f, 0.0735519f, -0.117024f, 0.0501598f, + 0.332212f, 0.199531f, 0.424764f, 0.206712f, 0.342868f, + 0.592673f, -0.0961148f, -0.190113f, -0.155027f, 0.00789871f, + -0.0514839f, -0.416154f, -0.290309f, 0.407541f, 0.48534f, + 0.126564f, 0.0709566f, -0.0469664f, 0.735403f, -0.365963f, + 0.150295f, -0.50147f, 0.021383f, 0.76514f, 0.0085721f, + -0.416384f, 1.22268f, 0.0832438f, 0.367813f, -0.12012f, + 0.823183f, -0.0525972f, -0.325526f, -0.0983032f, 0.370128f, + 0.368778f, 0.138971f, -0.0397997f, 0.411058f, -0.0400404f, + 0.588437f, -0.29963f, -0.107992f, -1.75238f, -0.274387f, + 0.430418f, 0.495152f, 0.283172f, -0.441166f, 0.195339f, + -0.436182f, -0.252613f, 0.176204f, -0.126541f, -0.474833f, + 
-0.0721603f, -0.496599f, -0.0608464f, 0.0333451f, -0.0621485f, + 0.0843859f, 0.0637854f, -0.145291f, 0.14876f, 0.181665f, + -0.675805f, 0.294903f, 0.301118f, -0.225957f, 0.0105897f, + -0.136427f, -0.555925f, -0.158853f, -0.216779f, 0.0612481f, + -0.107158f, 0.352451f, 0.140536f, -0.0148237f, 0.189371f, + -0.091046f, -0.0476226f, 0.366054f, -0.0723413f, 0.389883f, + -0.0213411f, 0.0279539f, 0.194827f, -0.271502f, -0.166474f, + 0.0690549f, 0.0584665f, 0.0198415f, -0.442348f, 0.1571f, + -0.113463f, -0.16822f, -0.0580659f, -0.13441f, -0.0022386f, + 0.251521f, -0.160494f, -0.0753547f, 0.0897289f, 0.137917f, + 0.129836f, 0.0816833f, -0.626288f, 0.0643293f, -1.20001f, + 0.085631f, -0.195602f, 0.251244f, 0.0321744f, 0.0493178f, + -0.220616f, 0.724075f, -0.00831514f, 2.00319f, 0.407932f, + 0.0710799f, -0.166128f, 0.0126611f, -0.229644f, -0.0984299f, + 0.632041f, -0.0946141f, 0.295315f, 0.100934f, 0.184883f, + -0.236173f, 0.158081f, 0.195775f, 0.413542f, 0.789801f, + 0.767741f, 0.166275f, -0.348271f, -0.384074f, -0.291648f, + -0.119899f, 0.0368354f, 0.0751987f, 1.04217f, -0.159002f, + -2.71592f, -0.788502f, -1.06268f, 0.536057f, 0.0575876f, + 1.06811f, 0.12033f, 0.198578f, -0.0419196f, 0.0631388f, + 0.623138f, -0.142226f, 1.33129f, 0.0868059f, -0.0287825f, + 0.139378f, -0.143037f, 0.307452f, 0.0363987f, -0.0976368f, + 0.040544f, 0.0269327f, -0.0845524f, 0.0674699f, 0.104501f, + -0.0351155f, 0.167071f, 0.00986971f, 0.10284f, 0.0300016f, + 0.192601f, 0.0397177f, 0.0251346f, -0.00912908f, -0.0452825f, + 0.0164356f, -0.0275149f, 0.194846f, 0.0943608f, 1.61674f, + 0.0124345f, 0.523787f, 0.0397258f, -0.17208f, -0.147808f, + -1.23583f, 0.676385f, 0.551994f, 0.0233041f, 0.0116391f, + -0.466706f, 0.154725f, -0.207371f, 0.606662f, 0.247286f, + 0.31216f, 0.173765f, -0.268033f, 0.224422f, 0.314649f, + 0.481922f, -0.190604f, -0.0129162f, 0.270552f, 0.135195f, + 0.0927735f, -0.226099f, 0.53897f, 0.103309f, -0.0257271f, + -0.0246776f, 0.442013f, -0.179246f, -1.02581f, 0.206176f, + -0.326365f, 0.391623f, -0.103549f, 0.115645f, 0.0269328f, + -0.584517f, -0.237502f, 0.157996f, 0.0447407f, -0.161f, + -0.126072f, -0.148967f, -0.416347f, 0.0236496f, -1.12612f, + 0.0120709f, -0.00979376f, 0.0507126f, -0.172262f, 0.0697059f, + -0.212334f, 0.335731f, -0.0301362f, -0.839583f, -0.238539f, + 0.0636752f, -0.0467217f, -0.0372118f, -0.144615f, -0.161773f, + -0.648242f, 0.158197f, -0.051471f, -0.0615805f, -0.0426936f, + -0.0745554f, 0.358975f, 0.358297f, 0.0568553f, -1.14383f, + -0.103955f, 0.728194f, -0.224945f, -0.31659f, -0.204458f, + 0.171763f, -0.465666f, 0.899234f, -0.37042f, -0.0894774f, + 0.11478f, -0.334957f, 0.0896514f, 0.413251f, 0.359471f, + 1.41597f, 0.558082f, 0.153486f, 0.0270558f, -0.0178797f, + 0.124983f, -0.12273f, -1.04516f, -0.125375f, 0.370336f, + -0.209423f, -0.36816f, -0.66077f, -0.0180773f, -0.628921f, + -0.178542f, 0.0346841f, 0.0319309f, -0.470138f, 0.172763f, + 0.0798846f, -0.259737f, -0.652461f, -0.386283f, -0.474447f, + -0.924054f, -0.0154613f, -0.613712f, -0.138068f, -0.337842f, + 0.217921f, -0.0711405f, 0.000404091f, -0.703766f, 0.0364683f, + 0.150173f, 0.0126249f, 0.170594f, 0.0371879f, -0.0862515f, + -0.23454f, -0.0144143f, 0.164947f, 0.45591f, 0.115703f, + 0.069752f, -0.011993f, 0.0402097f, 0.00697581f, 0.0811613f, + 0.384752f, 0.341977f, 0.06087f, 0.0590107f, 0.00812679f, + 0.121211f, -0.0612108f, 0.167851f, 0.195781f, -1.62162f, + 0.336292f, -0.0772523f, -0.310786f, 0.188257f, -0.0325804f, + -0.240098f, 0.158748f, -0.265264f, 3.19593f, -0.449251f, + -1.33102f, -0.482856f, -0.435731f, 0.300808f, 
0.346503f, + 2.67378f, -0.152379f, 0.219322f, -0.146119f, -0.0584806f, + -0.0276895f, -0.21955f, -0.479179f, -0.689545f, 0.152799f + }; + +static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias[] = { + -0.296575f, 0.101072f, -0.208429f, 0.111585f, 0.699552f, -0.379484f, + 0.313244f, -0.746369f, 0.867757f, 0.457318f, -0.0190943f, -0.290745f, + 0.45592f, -0.160465f, -0.634243f, 0.0829737f +}; + +static const float + av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel[] = { + 0.27511f, -2.14172f, 1.25755f, -0.554772f, 0.589508f, + 0.228307f, 0.0754914f, 1.07061f, 0.293323f, 0.65162f, + -0.272016f, -1.33519f, -0.606759f, -0.57827f, 0.368807f, + -1.48668f, 0.162439f, 0.0821667f, 0.225535f, -0.795996f, + 0.0328293f, 0.975476f, -0.187514f, 2.47069f, -1.5638f, + -0.461524f, 0.00310062f, 1.1556f, -0.286206f, 0.00426021f, + 0.585836f, 0.900007f, 0.384055f, 0.189435f, -0.157291f, + -0.0710573f, -0.0663986f, -0.710772f, -0.669136f, -0.379493f, + -1.2634f, -0.377524f, 0.824094f, 0.312308f, 0.125368f, + -0.382737f, 0.637109f, 0.61907f, -0.741184f, 0.00257198f, + -0.0151343f, -0.669826f, -0.439855f, 0.564852f, -0.0588036f, + -1.38123f, -1.1126f, 0.701831f, 0.198686f, 0.266866f, + 0.270172f, -0.692401f, 0.272533f, -1.70914f, 0.66064f, + 0.0886659f, -0.132233f, 0.270531f, -0.479581f, 0.704338f, + -0.307039f, -0.111792f, -2.05753f, -0.231749f, 0.300528f, + 0.383266f, -0.130857f, -0.373944f, 1.21025f, 0.704655f, + -0.589422f, 0.267185f, -0.109065f, -0.195991f, 0.20209f, + -0.0676526f, -0.183926f, 0.164894f, 0.0877923f, 0.565943f, + -0.0610466f, -0.86354f, -0.80853f, -0.176111f, -1.45016f, + -2.29078f, -0.124524f, -0.139305f, -0.187858f, -0.0250151f, + -0.572544f, 0.185336f, -0.69275f, -0.430354f, -0.30861f, + -0.754258f, -0.468221f, -0.160487f, -0.766692f, -0.636418f, + -0.71016f, 0.576125f, -0.240476f, -0.954556f, -0.104693f, + 0.155557f, -0.840224f, -0.685457f, -0.0346927f, -0.644882f, + -1.92475f, -0.314544f, 0.463569f, 0.323569f, -0.990124f, + -0.213658f, 0.407183f, 1.19797f, -4.77004f, -0.0613379f, + -2.40345f, -0.0591791f, -0.477622f, -0.303556f, 0.104077f, + -0.974128f, -0.035172f, 1.47064f, 0.233727f, -0.0754056f, + 0.158553f, 0.0614361f, -1.38865f, 0.690729f, 0.568455f, + 0.205866f, -0.0236852f, -0.0921077f, -0.538954f, 0.336613f, + -0.427115f, 0.791754f, -1.819f, -0.404432f, 0.670242f, + -0.0343869f, -0.37191f, 0.0271262f, 0.988161f, -0.547343f, + 0.925304f, 0.548079f, -0.430343f, -0.214109f, 0.242013f, + 1.39027f, 0.37648f, -1.63524f, -0.158864f, -0.572779f, + -0.766801f, -2.62032f, 0.47799f, -1.12025f, -0.115283f, + 1.22349f, -0.262132f, -0.151274f, 0.390483f, -0.496482f, + 1.06166f, -0.183052f, 0.54647f, 0.847486f, 0.0229506f, + 0.653309f, -0.020736f, -1.27453f, 0.48386f, -0.366625f, + -0.515725f, -1.31196f, 0.140701f, -0.183636f, 0.000413912f, + 0.300993f, -0.849529f, -0.59764f, -0.212992f, -0.933365f, + -1.4054f, -0.091982f, 0.41695f, 0.264004f, -0.26379f, + -0.0738219f, 0.434052f, 1.16617f, -0.639624f, -0.146465f, + 0.0409936f, -0.900182f, 0.73517f, 0.805746f, -0.208088f, + 1.74459f, -0.0592751f, 0.624865f, -0.62325f, -0.446315f, + 0.150526f, 0.0526697f, 0.374254f, -0.658043f, 1.02623f, + -0.941758f, 0.381217f, -0.359448f, 0.160051f, 0.556455f, + 0.239382f, 0.75851f, 0.437583f, -0.122221f, 0.746136f, + 0.218286f, -0.426729f, 0.0353903f, -0.830513f, -0.877586f, + 0.488077f, -0.132354f, -0.180756f, 0.736163f, -0.202934f, + -0.882534f, 0.166305f, 0.183122f, 0.0599858f, 0.442687f, + 0.0522908f, -1.17755f, -1.03733f, 0.392363f, 0.672718f, + -1.44704f, 0.360623f, 0.390298f, 
-0.213968f, 0.169783f, + -0.717536f, -0.830984f, -0.445049f, 0.196772f, -0.730634f, + -1.09497f, 0.344012f, -0.292802f, -0.67966f, 0.138515f, + -0.361803f, 0.936778f, -0.189802f, 0.197777f, -0.367507f, + -0.293653f, 0.447759f, -0.409245f, -0.687568f, -0.431301f, + -0.271234f, -0.585413f, -0.936414f, -0.396049f, -0.29388f, + -0.0930843f, 0.0179339f, 0.262463f, -0.166598f, 0.0171466f, + -0.329641f, 0.39343f, 0.657445f, -0.579052f, -0.312444f, + -0.0915881f, -0.432622f, -0.247645f, 0.485749f, -0.602508f, + -0.347936f, 0.287353f, 0.288705f, 0.168397f, 0.568228f, + -0.493586f, 1.04155f, -0.097956f, 0.658928f, -0.561007f, + 0.0457783f, 2.12744f, 0.182683f, -0.690282f, 0.183302f, + 0.0309499f, -0.722251f, 0.0660448f, -0.333277f, 0.198929f, + -0.724102f, -0.405597f, 0.614868f, -0.292862f, 0.886513f, + 0.142353f, -1.48934f, -0.97273f, 0.199683f, 0.522121f, + 0.0877478f, -0.172593f, -1.58858f, 0.113191f, -0.436178f, + 0.640895f, -0.504676f, 0.0658654f, -0.361301f, 0.604323f, + 0.315196f, -0.423021f, -0.323484f, -0.563163f, 0.118989f, + -0.404508f, -0.0550995f, -0.0359236f, -0.126574f, -0.357288f, + -0.0494502f, 1.04959f, -0.31646f, -0.0376684f, -0.300744f, + -0.135016f, 0.102696f, -0.392333f, -1.17502f, 0.505227f, + 0.337608f, -0.348831f, -0.420815f, 0.202791f, -0.154264f, + -0.563686f, 0.0942187f, 0.353862f, 0.0303509f, -0.132794f, + 0.420746f, 0.143529f, 0.455822f, -1.28348f, -1.35662f, + -0.850688f, -1.76361f, -0.717546f, 0.443111f, 0.227155f, + -0.863307f, -0.452033f, -0.278151f, 1.86233f + }; + +static const float av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias[] = { + -0.103218f, -0.359587f, 0.619666f, -0.473497f, -0.649803f, 0.86992f, + -0.115561f, 0.335114f, -0.285044f, -0.59295f, 0.24497f, 0.611583f, + 0.38568f, 0.137913f, -0.281191f, -0.0107777f, 0.487236f, -0.262363f, + 0.696962f, 0.121565f, 0.312511f, 0.430916f, 0.694134f, 0.393632f +}; + +static const float av1_intra_mode_cnn_partition_branch_2_logits_kernel[] = { + -2.42496f, -1.239f, 0.832673f, 1.56923f, -2.6175f, -1.42492f, + -0.311387f, -1.94237f, 0.54071f, -2.50391f, 0.352205f, -0.96572f, + 1.47144f, -2.04702f, -1.12372f, -0.709186f, 0.812238f, 0.310389f, + 0.789163f, -0.65236f, 1.77018f, 0.273867f, 1.19506f, 1.07022f +}; + +static const float av1_intra_mode_cnn_partition_branch_2_logits_bias[] = { + 0.953424f +}; + +static const float + av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel[] = { + 0.0485154f, 0.0496279f, 0.0268229f, -0.0584843f, -0.166928f, + 0.0316731f, -0.0895094f, -0.0433243f, -0.00893639f, -0.0886265f, + -0.0345622f, -0.235395f, -0.213754f, -0.00212398f, 0.0218857f, + -0.0054983f, -0.0248236f, 0.081822f, -0.0355708f, -0.0795593f, + -0.106995f, -0.0596378f, 0.0350686f, -0.133863f, -0.00582928f, + 0.114963f, 0.193906f, -0.00419085f, 0.0430529f, -0.128318f, + 0.0614715f, -0.000952935f, -0.0345722f, -0.109459f, 0.074204f, + -0.0865131f, 0.0649158f, -0.0942417f, -0.10122f, -0.047551f, + -1.27825f, -0.0125456f, -0.019722f, -0.152058f, 0.280306f, + -0.121231f, -0.0565484f, 0.0959188f, 0.0603919f, 0.0457468f, + 0.967589f, 0.105892f, -0.118326f, 0.198933f, 0.163437f, + -0.056824f, -0.0302956f, -0.07366f, -0.681407f, -0.0781575f, + 0.255732f, -0.0712105f, 0.177882f, 0.709206f, -0.232457f, + 1.33809f, -0.0328557f, 0.0572231f, -1.01361f, 0.130676f, + -0.205159f, 0.975398f, 0.356293f, 0.0766364f, -0.297397f, + -0.0261066f, -0.0933549f, 0.0568851f, -0.0123034f, -0.0433538f, + 0.131003f, 0.890705f, 0.0084565f, 0.00547395f, 0.00157634f, + 0.0047937f, -0.0511092f, 0.0300034f, -0.00604993f, -0.0133502f, + 
-0.000274302f, 0.129728f, -0.00532916f, 0.0855351f, 0.136885f, + 0.0175562f, -0.0123633f, -0.000512229f, -0.019924f, -0.0316328f, + 0.422972f, 0.0460336f, 0.0170841f, -0.00086795f, -0.0655137f, + 0.0287308f, -0.0375644f, -0.0329215f, -0.0273072f, 0.0241426f, + -0.0429052f, 0.0221593f, -0.063881f, -0.0347391f, -6.44339e-07f, + 0.0476934f, -0.0150068f, 0.0146403f, -0.0653099f, 0.0107635f, + 0.012407f, 0.0048935f, 1.50975f, 0.322256f, 0.17881f, + 0.0943775f, -0.100583f, -0.367022f, -0.156525f, -0.0397161f, + 0.0752784f, -0.00219022f, -0.887456f, 0.0153415f, -0.0148185f, + -0.56435f, 0.163996f, -0.0221024f, -0.0115872f, -0.0529284f, + 0.156838f, -1.13813f, -0.207863f, -0.00484959f, 0.135719f, + 0.131004f, 0.0417939f, 0.31453f, 0.121719f, -0.101515f, + 0.267951f, 0.219727f, 0.0398821f, 0.0713504f, 3.65918e-06f, + -0.00659998f, 0.477343f, -0.128426f, 0.0648877f, 0.111884f, + 0.224552f, 0.0617426f, 0.117742f, 0.031377f, 0.0586865f, + -0.459293f, 0.100211f, -0.14127f, 0.624412f, 0.014659f, + -1.41807f, -0.382452f, -0.695931f, -0.103153f, 0.145808f, + 0.333526f, -0.256367f, 0.096842f, 0.102458f, -0.181224f, + 0.729272f, 0.151177f, 1.46729f, 0.111044f, -4.28813f, + 0.0178379f, 0.47641f, -6.57533f, 0.0633335f, 0.496934f, + -0.154657f, -9.07298e-05f, 0.848937f, -5.40143f, 0.375685f, + 0.23586f, -0.166591f, -0.0191648f, -0.039862f, -3.25093f, + 0.168472f, -0.260317f, -5.51548f, 0.0575334f, 0.328979f, + 0.112644f, 0.231339f, -0.122641f, 0.0567331f, 1.19541f, + -0.038735f, 0.0630576f, 0.176668f, 0.0757184f, -0.833104f, + 0.133669f, 0.982669f, 0.0311783f, 0.0908558f, -0.10065f, + -0.0386599f, -0.231587f, -0.83876f, -0.347148f, 0.225529f, + -1.29625f, 0.0806834f, 0.369648f, -1.63367f, 0.118057f, + -0.311948f, 0.95022f, -0.354807f, -0.648657f, -1.72048f, + 0.260397f, 0.915555f, 0.057737f, -0.162019f, -0.453543f, + -1.70388f, -0.311632f, -0.731593f, -0.678089f, 0.10438f, + -0.293911f, 0.144864f, 0.039212f, 0.0289241f, -0.0685266f, + 0.634592f, -0.0798614f, -0.119197f, -0.00517433f, -0.04653f, + -0.127568f, -0.0582645f, 0.0735302f, -0.0946823f, 0.00865585f, + 0.0115748f, 0.0194847f, 0.0455664f, 0.181006f, -0.0824601f, + 0.0869093f, 0.264767f, -0.0750432f, 0.135136f, 0.316511f, + 0.399015f, 0.0994808f, -0.166944f, -0.102126f, 0.457858f, + 0.300488f, 0.467582f, 0.830244f, -0.0511439f, -0.522892f, + -0.183049f, 0.2626f, 0.118382f, 0.241674f, 0.250399f, + -0.0963507f, -0.83231f, -0.227699f, -0.133314f, 0.231718f, + -0.0700274f, 0.891311f, 0.224742f, -0.572836f, 0.402798f, + -0.191576f, 0.740922f, -0.00374073f, 0.658178f, -0.209364f, + -0.416259f, 0.166297f, 0.0095577f, -0.0876076f, 0.424954f, + 0.265226f, -0.129343f, -0.203146f, -0.194637f, -0.818142f, + -0.164152f, -0.368962f, 0.273373f, 0.599927f, -0.19859f, + 0.0939651f, -0.12458f, -0.751816f, -0.302997f, -0.139176f, + -0.372737f, 0.332704f, -0.206045f, -0.00593763f, -0.452363f, + -0.2704f, -0.198846f, 0.0976308f, -0.216124f, 0.110122f, + -0.220342f, 0.00763426f, -0.0272775f, -0.190395f, -0.0359411f, + -0.0395759f, 0.000941162f, -1.49959f, 0.0914233f, 0.448346f, + -0.420435f, -0.0102102f, -0.0757978f, -0.0177687f, -0.0231492f, + -0.142125f, 1.31774f, 0.0269368f, 0.134566f, 0.152079f, + -0.139933f, 0.139226f, -0.214467f, -0.194446f, -0.555893f, + 0.271197f, -0.111047f, 0.0888069f, -0.198121f, 0.0871713f, + 0.100612f, 0.429782f, -0.3787f, 0.123147f, -0.12538f, + 0.235678f, 0.139237f, 0.223326f, 0.85806f, -0.00554756f, + 0.285095f, 0.0954683f, 0.0464989f, 0.100806f, -0.0211297f, + 0.121672f, 0.242473f, 0.0810475f, -0.834356f, 0.119629f, + 0.111338f, -0.227126f, 
0.159296f, -0.0584685f, -0.108265f, + -0.0909221f, -0.21749f, 0.0929309f, -0.176815f, 0.178067f, + -0.0025905f, 0.317883f, 0.313045f, 0.26774f, -0.589329f, + -1.19882f, -0.285513f, -0.109478f, 0.309441f, -0.0604479f, + 0.947461f, -0.142342f, -0.9086f, -0.814788f, 0.184588f, + -0.0736317f, 0.276237f, 0.13132f, -0.3931f, -0.381744f, + -0.0122719f, 0.0246101f, -0.0920412f, 0.11331f, -0.110355f, + 0.00848064f, 0.0931248f, -0.0638655f, -4.30869e-05f, -0.300367f, + 0.0489508f, 0.464441f, -0.0466243f, -0.0137732f, 0.0099241f, + -0.223972f, 0.188966f, -0.653173f, -0.354322f, 0.189237f, + -0.624276f, -1.46218f, -0.075161f, -0.516172f, 0.40993f, + 0.291178f, -1.95088f, -0.0352157f, 0.196354f, -0.335897f, + 0.0857039f, 0.605319f, -1.12923f, -0.638387f, 1.41868f, + 0.0955757f, -0.00913477f, 0.315935f, -0.671223f, -0.851436f, + -0.157464f, -0.296763f, 0.182277f, -0.139309f, 0.232789f, + 0.869562f, 0.248894f, 0.242709f, 0.195479f, 0.106153f, + 0.358881f, 0.167443f, 0.982987f, 0.104767f, -0.033925f, + -0.0263185f, 0.0045304f, 0.0722479f, -0.111307f, 0.00128896f, + 0.406128f, -0.00944947f, 0.121592f, 0.546284f, -0.00175696f, + 0.776588f, 0.238846f, 0.064469f, 0.27082f, 0.269187f, + 0.0294455f, 0.62364f, -0.27872f, -0.0488013f, 0.229024f, + 0.154457f, 0.0445898f, 0.349943f, 0.0710998f, 0.0820674f, + 0.0279449f, 0.172826f, -0.122156f, -0.164688f, 0.0292124f, + 0.0496112f, -0.741762f, 0.0673926f, 0.108159f, -0.0942327f, + -0.0562883f, 0.558231f, 0.0552399f, 0.211393f, 0.0376817f, + -0.275788f, 0.0548436f, 0.212732f, 0.163603f, 0.0663363f, + -0.0252315f, 0.164533f, 0.0826088f, 0.0301389f, 0.345705f, + -0.0378046f, -0.139581f, 1.30162f, 1.23551f, -0.446693f, + 0.682534f, -0.0831157f, -0.0121595f, 1.50505f, 0.0839017f, + -0.953413f, 0.0820985f, -0.125556f, 0.699796f, -0.140453f, + 0.168438f, -0.110966f, 0.173806f, 0.114683f, 0.132502f, + -0.0453539f, -0.133096f, 0.511947f, -0.180657f, -0.0298605f, + 0.291437f, -0.0275017f, -0.229703f, -0.0504205f, 0.559622f, + 0.384601f, 0.111024f, -0.0773559f, -0.0591752f, -0.0866182f, + -0.189437f, -0.262345f, -0.0372182f, 0.149925f, 0.154644f, + -0.188298f, 0.236949f, -0.199328f, -0.378909f, -0.680128f, + 0.277184f, -0.172784f, 0.184717f, -0.23899f, 0.0712069f, + 0.0235425f, 0.4225f, -0.441487f, 0.177434f, -0.298303f, + 0.295696f, 0.17346f, 0.220542f, -0.680116f, 0.00266223f, + -0.0408459f, -0.15486f, 0.24335f, 0.237258f, -0.0283245f, + 0.19703f, -0.100027f, 0.0554843f, -1.03081f, 0.151745f, + 0.538582f, 0.370368f, 0.196683f, 0.0222123f, -0.0831401f, + -0.0832803f, -0.286743f, -0.686003f, 0.0995004f, 0.148901f, + -0.0436037f, -0.316508f, 0.00391835f, -0.228452f, 0.940058f, + 0.520047f, -0.334211f, 0.652142f, -0.0755971f, 0.0965123f, + -0.98191f, 0.394096f, -0.420466f, 0.327284f, -0.134651f, + 0.849297f, -0.523372f, 0.010327f, 0.133636f, 0.298119f, + -0.257389f, 0.0376153f, -0.198298f, 0.0736235f, 0.608809f, + 0.0291836f, -0.290005f, -0.141316f, 0.0184599f, 0.0554437f, + 0.0621519f, 0.485276f, 0.617062f, -0.0924811f, -0.0120834f, + 0.0817611f, 0.100421f, -0.0153553f, -0.135958f, -0.0185322f, + -0.395803f, -0.204862f, 0.547916f, -0.438117f, 0.0229788f, + 0.406981f, 0.795584f, -2.02756f, -0.8355f, -0.386789f, + 0.00968368f, 1.2147f, -0.740869f, -1.18415f, -0.954918f, + -0.541142f, 0.0596003f, 0.107189f, -0.411708f, -0.964593f, + 0.511906f + }; + +static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias[] = { + -0.485545f, 0.131552f, 0.796833f, -0.157582f, -0.0948124f, 0.00818613f, + -0.485562f, 0.3826f, -0.0839326f, 0.170998f, 0.279545f, -0.287143f, + 
0.184986f, -0.0719864f, 0.19748f, 0.404145f +}; + +static const float + av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel[] = { + 1.30172f, 0.720189f, 0.261675f, -0.466201f, 1.21773f, + 0.495525f, 0.62398f, 0.44567f, -0.330993f, -0.269798f, + 0.835161f, -0.294874f, 0.186981f, 0.0162467f, 0.367654f, + 0.658468f, 1.08325f, 1.01558f, 0.12783f, -0.280581f, + 2.2204f, 0.0337286f, -0.403649f, -0.230908f, -0.35188f, + 0.437712f, -0.103634f, -0.645929f, 1.17407f, 0.157385f, + 0.212438f, 1.41874f, 0.284242f, -0.493105f, 1.0703f, + 0.00632116f, 1.18222f, -0.26003f, 0.276795f, -0.823156f, + 0.29577f, -0.157467f, -0.18092f, 0.0237336f, 0.205715f, + -0.295679f, 0.165443f, -0.628279f, 1.00804f, 0.361232f, + 0.646155f, -0.028651f, 1.64317f, 0.334251f, -1.50713f, + -1.51685f, -0.488522f, 0.169694f, -0.593176f, -0.372682f, + -1.50223f, 0.35076f, -0.24641f, -0.237189f, 0.190502f, + -0.948191f, -0.303346f, 0.45108f, -0.794368f, -2.3116f, + 0.404008f, -2.67269f, -0.941992f, -0.45336f, 0.0655987f, + -0.288432f, 0.106068f, 0.286978f, 0.121403f, 0.462739f, + 0.0130292f, 0.240597f, -2.30983f, -0.453309f, -0.149335f, + 0.856424f, -0.186576f, 0.769961f, -0.0657097f, -0.976188f, + 0.972971f, -0.532728f, -0.699334f, -0.168803f, 0.361945f, + 0.950769f, 1.5368f, -0.223899f, 1.17547f, -0.281483f, + 0.533619f, 0.315344f, 0.0854543f, 0.464701f, 0.346828f, + 0.271794f, -0.0185388f, 0.109517f, 0.371662f, -0.10852f, + 0.244092f, 0.491959f, -0.750281f, 1.41865f, -3.51221f, + 0.298194f, -0.0790832f, -0.134158f, -0.424084f, 0.189593f, + -0.238361f, -0.407872f, -0.366222f, -0.606813f, -0.230498f, + 0.387248f, -0.102734f, -0.190544f, -1.43649f, 0.141338f, + -0.0438917f, 0.204628f, 1.57033f, 0.0366937f, -0.14733f, + 0.048198f, -0.122631f, 0.183354f, 0.0658753f, -0.243381f, + 0.0246889f, -0.768798f, -0.0644054f, 0.775073f, 1.63419f, + 0.491624f, 0.21898f, -0.358944f, 3.31304f, 0.0195916f, + 0.236174f, 0.530704f, 0.140124f, 0.0736778f, -0.27361f, + -0.598836f, -1.01659f, 0.361765f, 0.00455986f, -0.345222f, + 1.68731f, 0.764082f, 0.193555f, 0.322782f, 1.19801f, + 0.538935f, -0.0393231f, -0.0248292f, -0.151168f, 0.479879f, + -0.208582f, 0.22798f, 0.335473f, -0.00295455f, 0.139539f, + 0.400814f, 0.478307f, -0.189376f, 0.540084f, 0.466072f, + 0.920231f, 0.398774f, -0.472403f, -0.0431972f, -0.581665f, + -0.990058f, 0.258995f, -0.0148889f, 0.27105f, 0.340334f, + 0.223576f, -0.0405193f, -1.23888f, -1.45229f, -1.44543f, + -0.376146f, 0.132601f, -0.4064f, -0.583611f, -0.374588f, + 0.0659428f, 0.325652f, -0.338456f, 0.253767f, -0.0181164f, + 0.681732f, 0.222041f, 0.837496f, 1.09735f, 0.156328f, + 0.177236f, -0.702702f, 0.473689f, 0.322118f, 0.43343f, + 0.315441f, -0.40798f, 0.0811291f, 0.631431f, 0.361929f, + 0.0723276f, 0.0164498f, 0.0293847f, 0.156406f, -1.10453f, + 0.837977f, -1.03449f, -0.348408f, 1.71953f, -0.401765f, + 0.64272f, -0.182438f, -0.233954f, 0.364597f, 0.269177f, + -0.578512f, 0.397216f, 0.0425122f, -0.258728f, 1.41621f, + -0.688768f, 0.0944726f, 0.253163f, -0.989037f, 1.72726f, + 1.15976f, -0.0460612f, 0.534186f, -0.136814f, 0.49327f, + 0.115744f, -0.633052f, -0.433855f, -1.01874f, -0.324035f, + 0.489487f, 1.08696f, 0.836376f, -0.423477f, -0.421309f, + 1.07348f, 0.323266f, 0.717604f, 0.366422f, 0.32983f, + 0.336583f, 0.749292f, -0.210666f, 0.387101f, -0.583376f, + 0.0391101f, -1.07537f, 0.914591f, -0.51303f, 1.15023f, + -0.0378782f, 0.262889f, -0.841128f, 0.41619f, -0.669704f, + -0.109995f, 1.01825f, -0.194853f, 0.120739f, 0.627889f, + -0.00269221f, 0.751152f, -0.529865f, -1.50238f, 0.184521f, + 0.795464f, 0.106099f, 
1.83117f, 0.0883305f, 0.306844f, + -0.0671504f, -0.169306f, -0.214575f, -0.121606f, -0.234965f, + 0.109752f, -0.35831f, -0.07894f, 0.497203f, -2.63013f, + 0.815608f, -0.193593f, -0.62292f, 0.338941f, 0.0970922f, + -0.531178f, 0.723346f, 0.35063f, 0.182647f, -0.257013f, + 0.784924f, -0.217915f, -0.0797363f, -0.399706f, -0.485602f, + 1.23155f, 0.345998f, 0.322949f, -0.168196f, -0.173313f, + 0.282205f, 0.45117f, 0.918706f, -0.046172f, -0.0873883f, + 0.56103f, -0.485768f, 0.546199f, 0.254997f, 0.394296f, + 0.607178f, 0.667532f, -0.343883f, 0.374402f, -0.531439f, + 2.27782f, -1.13255f, 0.505867f, -0.514742f, 0.998571f, + -1.60984f, -0.172873f, -0.0604094f, 0.719791f, -0.733982f, + 0.348905f, 1.39008f, -0.895343f, -0.677064f, -1.84221f, + 0.0434018f, -0.534794f, 0.0434753f, -0.266576f, 0.268099f, + -0.242935f, 0.00166289f, 0.0263789f, -0.224794f, -0.113493f, + -0.236397f, 0.0879936f, 0.510895f, -0.511789f, -1.48962f, + -2.78268f, -0.0495784f, -0.0343907f, 0.440459f, -0.364209f, + 0.833223f, -0.0589337f, 0.00181418f, 0.455499f, 0.101762f, + -1.16424f, 0.270405f, 0.219033f, -4.91105f + }; + +static const float av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias[] = { + -0.40114f, -0.372342f, -0.216186f, -0.240014f, -0.341773f, -0.344489f, + -0.113037f, 0.198479f, 0.482958f, -0.630072f, -0.728704f, -0.171963f, + 0.519883f, 0.253003f, -0.121618f, -0.0569875f, -0.485568f, -0.147577f, + 0.533305f, -0.587251f, -0.120837f, -0.483953f, 0.445641f, -0.125136f +}; + +static const float av1_intra_mode_cnn_partition_branch_3_logits_kernel[] = { + -1.57431f, -1.09069f, 1.67996f, -0.669702f, 0.499807f, -3.03145f, + -0.878135f, 0.637818f, -1.58419f, -3.79756f, 0.62755f, -0.446646f, + 0.653269f, -0.667854f, -2.19774f, -3.53349f, 2.6107f, -0.685892f, + -1.2603f, -0.89707f, -0.715551f, 0.382202f, 2.09574f, 0.469386f +}; + +static const float av1_intra_mode_cnn_partition_branch_3_logits_bias[] = { + -0.022787f +}; + +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_0_dnn_config = { + BRANCH_0_NUM_DNN_FEATURES, + BRANCH_0_NUM_LOGITS, + BRANCH_0_NUM_DNN_LAYERS, + { + BRANCH_0_NUM_DNN_LAYER_0_UNITS, + BRANCH_0_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_0_logits_kernel, + }, + { + av1_intra_mode_cnn_partition_branch_0_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_0_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_0_logits_bias, + }, +}; +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_1_dnn_config = { + BRANCH_1_NUM_DNN_FEATURES, + BRANCH_1_NUM_LOGITS, + BRANCH_1_NUM_DNN_LAYERS, + { + BRANCH_1_NUM_DNN_LAYER_0_UNITS, + BRANCH_1_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_1_logits_kernel, + }, + { + av1_intra_mode_cnn_partition_branch_1_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_1_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_1_logits_bias, + }, +}; +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_2_dnn_config = { + BRANCH_2_NUM_DNN_FEATURES, + BRANCH_2_NUM_LOGITS, + BRANCH_2_NUM_DNN_LAYERS, + { + BRANCH_2_NUM_DNN_LAYER_0_UNITS, + BRANCH_2_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_2_logits_kernel, + }, + { + 
av1_intra_mode_cnn_partition_branch_2_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_2_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_2_logits_bias, + }, +}; +static const NN_CONFIG av1_intra_mode_cnn_partition_branch_3_dnn_config = { + BRANCH_3_NUM_DNN_FEATURES, + BRANCH_3_NUM_LOGITS, + BRANCH_3_NUM_DNN_LAYERS, + { + BRANCH_3_NUM_DNN_LAYER_0_UNITS, + BRANCH_3_NUM_DNN_LAYER_1_UNITS, + }, + { + av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_kernel, + av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_kernel, + av1_intra_mode_cnn_partition_branch_3_logits_kernel, + }, + { + av1_intra_mode_cnn_partition_branch_3_dnn_layer_0_bias, + av1_intra_mode_cnn_partition_branch_3_dnn_layer_1_bias, + av1_intra_mode_cnn_partition_branch_3_logits_bias, + }, +}; + +#undef NUM_DNN_BRANCHES +#undef NUM_CNN_LAYERS +#undef BRANCH_0_NUM_DNN_LAYERS +#undef BRANCH_1_NUM_DNN_LAYERS +#undef BRANCH_2_NUM_DNN_LAYERS +#undef BRANCH_3_NUM_DNN_LAYERS +#undef CNN_LAYER_0_HEIGHT +#undef CNN_LAYER_0_WIDTH +#undef CNN_LAYER_0_IN_CH +#undef CNN_LAYER_0_OUT_CH +#undef CNN_LAYER_0_HORZ_STRIDE +#undef CNN_LAYER_0_VERT_STRIDE +#undef CNN_LAYER_1_HEIGHT +#undef CNN_LAYER_1_WIDTH +#undef CNN_LAYER_1_IN_CH +#undef CNN_LAYER_1_OUT_CH +#undef CNN_LAYER_1_HORZ_STRIDE +#undef CNN_LAYER_1_VERT_STRIDE +#undef CNN_LAYER_2_HEIGHT +#undef CNN_LAYER_2_WIDTH +#undef CNN_LAYER_2_IN_CH +#undef CNN_LAYER_2_OUT_CH +#undef CNN_LAYER_2_HORZ_STRIDE +#undef CNN_LAYER_2_VERT_STRIDE +#undef CNN_LAYER_3_HEIGHT +#undef CNN_LAYER_3_WIDTH +#undef CNN_LAYER_3_IN_CH +#undef CNN_LAYER_3_OUT_CH +#undef CNN_LAYER_3_HORZ_STRIDE +#undef CNN_LAYER_3_VERT_STRIDE +#undef CNN_LAYER_4_HEIGHT +#undef CNN_LAYER_4_WIDTH +#undef CNN_LAYER_4_IN_CH +#undef CNN_LAYER_4_OUT_CH +#undef CNN_LAYER_4_HORZ_STRIDE +#undef CNN_LAYER_4_VERT_STRIDE +#undef BRANCH_0_NUM_DNN_FEATURES +#undef BRANCH_0_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_0_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_0_NUM_LOGITS +#undef BRANCH_1_NUM_DNN_FEATURES +#undef BRANCH_1_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_1_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_1_NUM_LOGITS +#undef BRANCH_2_NUM_DNN_FEATURES +#undef BRANCH_2_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_2_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_2_NUM_LOGITS +#undef BRANCH_3_NUM_DNN_FEATURES +#undef BRANCH_3_NUM_DNN_LAYER_0_UNITS +#undef BRANCH_3_NUM_DNN_LAYER_1_UNITS +#undef BRANCH_3_NUM_LOGITS + +static const float av1_intra_mode_cnn_partition_split_thresh_hdres[5] = { + 100.000000f, 4.750139f, 1.655964f, 3.711212f, 0.963839f, +}; + +static const float av1_intra_mode_cnn_partition_no_split_thresh_hdres[5] = { + -100.000000f, -2.404842f, -3.858223f, -2.041206f, -1.573735f, +}; + +static const float av1_intra_mode_cnn_partition_split_thresh_midres[5] = { + 100.000000f, 3.218737f, 2.657764f, 0.868458f, 2.454447f, +}; + +static const float av1_intra_mode_cnn_partition_no_split_thresh_midres[5] = { + -100.000000f, -3.842426f, -4.005076f, -3.642994f, -2.467197f, +}; + +static const float av1_intra_mode_cnn_partition_split_thresh_lowres[5] = { + 100.000000f, 1.890757f, 2.658417f, 1.450626f, 1.833180f, +}; + +static const float av1_intra_mode_cnn_partition_no_split_thresh_lowres[5] = { + -100.000000f, -4.100921f, -4.564202f, -5.695176f, -1.483546f, +}; + +static const float av1_intra_mode_cnn_partition_mean[1] = { + 1.191922f, +}; + +static const float av1_intra_mode_cnn_partition_std[1] = { + 1.730044f, +}; + +static const int quad_to_linear_0[1] = { 0 }; +static const int quad_to_linear_1[4] = { 0, 1, 2, 3 }; +static const int quad_to_linear_2[16] = { 0, 1, 4, 5, 2, 
3, 6, 7, + 8, 9, 12, 13, 10, 11, 14, 15 }; +static const int quad_to_linear_3[64] = { + 0, 1, 8, 9, 2, 3, 10, 11, 16, 17, 24, 25, 18, 19, 26, 27, + 4, 5, 12, 13, 6, 7, 14, 15, 20, 21, 28, 29, 22, 23, 30, 31, + 32, 33, 40, 41, 34, 35, 42, 43, 48, 49, 56, 57, 50, 51, 58, 59, + 36, 37, 44, 45, 38, 39, 46, 47, 52, 53, 60, 61, 54, 55, 62, 63 +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PARTITION_CNN_WEIGHTS_H_ diff --git a/libs/libaom/src/av1/encoder/partition_model_weights.h b/libs/libaom/src/av1/encoder/partition_model_weights.h new file mode 100644 index 000000000..71c1ace78 --- /dev/null +++ b/libs/libaom/src/av1/encoder/partition_model_weights.h @@ -0,0 +1,5646 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +// TODO(chiyotsai@google.com): The performance of these models is getting worse +// due to the changes in the encoder. We should retrain the models here to get +// better performance once we have the time. + +#define FEATURE_SIZE 10 +#define LABEL_SIZE 16 +// nn model for ab partition pruning, 128x128. +static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = { + -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f, -0.469759f, + 0.426152f, 0.489798f, 0.469865f, 0.773821f, 0.088517f, 0.074585f, + 0.838754f, 0.048449f, -0.007584f, 0.638968f, 0.233305f, -0.319236f, + -0.257124f, -0.170869f, 0.137180f, 0.114852f, -0.721241f, -0.947962f, + -0.411298f, 0.494306f, -0.060435f, -0.648421f, -0.126624f, 0.072686f, + -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f, -0.189925f, + 0.134361f, -0.258070f, -0.177558f, 0.158049f, 0.168668f, -0.062919f, + 0.341986f, 0.038100f, -0.435577f, -0.321255f, 0.203213f, 0.213061f, + 0.533304f, 0.359296f, -0.079558f, 0.004637f, 0.663904f, 0.043779f, + 0.383018f, 1.136559f, -0.084155f, 0.333057f, -0.199011f, 0.152059f, + -0.078419f, -0.167752f, -0.093651f, 0.083171f, -0.190143f, 0.086195f, + -0.280632f, -0.160663f, -0.017298f, 0.122628f, -0.138116f, 0.062927f, + 0.222462f, 0.626979f, 0.426928f, 0.117170f, -0.240457f, 0.053750f, + 0.038017f, 0.007359f, -0.017595f, 0.101407f, 0.332891f, 0.074933f, + 0.306498f, 0.219380f, -0.151638f, -0.247976f, 0.343405f, 0.121256f, + 0.049173f, 0.171474f, -0.139608f, -1.016599f, -0.345553f, -0.901138f, + 0.243401f, 0.059928f, -0.089396f, -0.195565f, 0.364705f, -0.020400f, + -1.383672f, 0.413018f, 0.536950f, -0.020904f, -1.335306f, -0.732290f, + 0.102885f, 0.315290f, -0.208521f, -0.081811f, 0.182300f, 0.125712f, + -0.593833f, -0.220639f, -0.314155f, 0.188327f, 0.118503f, 0.524427f, + -1.083859f, -1.130640f, 0.390352f, -0.045591f, 0.113160f, -0.009149f, + -0.096183f, 0.115829f, 0.377752f, 0.318396f, -0.591983f, 0.004797f, + -0.497377f, -0.342248f, 0.079546f, -0.025249f, -0.295972f, 0.615501f, + -0.464372f, 0.418315f, -0.173556f, 0.105217f, 0.298073f, 0.082478f, + 0.033223f, 0.977341f, -0.372982f,
-0.052337f, 0.154124f, 0.396787f, + 0.536654f, -0.139061f, -0.223702f, 0.229666f, -0.846766f, 0.107723f, + 0.563839f, -0.483141f, 0.304813f, -0.765283f, 0.070964f, 0.151101f, + 0.275188f, 0.490303f, 1.175892f, 0.085377f, -0.191200f, 0.544532f, + -0.365075f, 0.167546f, 0.052183f, -0.220529f, -0.212227f, -0.144988f, + -0.273356f, -0.062023f, 0.103993f, -0.238493f, -0.161204f, -0.054611f, + -0.166672f, 0.128327f, 0.461751f, -0.545822f, 0.739798f, 0.594386f, + -0.163192f, -0.332501f, 0.363834f, -0.065043f, 0.474812f, -0.138811f, + 0.170924f, -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f, + 0.340591f, 0.041783f, 0.055419f, 0.015155f, -0.981830f, -1.355237f, + 0.347516f, 1.155327f, 0.081319f, 0.274163f, -0.327230f, -0.113478f, + 0.556552f, -0.055986f, 0.217318f, -0.445351f, 0.325759f, 0.526547f, + -0.657434f, -0.572214f, -0.037087f, 0.081384f, 0.064518f, 0.014892f, + 0.215279f, 1.834504f, -0.242107f, 0.079810f, 0.129558f, 0.079588f, + -0.035189f, -0.221745f, -0.163414f, 0.043978f, -1.028662f, -0.623609f, + 1.130336f, 0.664661f, -0.063975f, -0.415863f, 0.018581f, 0.157758f, + 0.200570f, 0.063420f, 0.901039f, -0.746286f, 0.196230f, -0.290592f, + 0.042373f, -0.502500f, 0.183638f, 0.103394f, -0.298858f, 0.145436f, + 0.196916f, 0.108319f, -0.448572f, -0.881385f, 0.302497f, 0.121679f, + -0.021327f, 0.025150f, 0.481306f, -0.359634f, 0.350257f, -0.228647f, + -0.669860f, 0.260025f, -0.034182f, 0.619247f, -0.158826f, -0.405864f, + 0.674112f, -0.027885f, -0.325274f, -0.241492f, 0.036024f, -0.437685f, + -0.091458f, -0.109295f, -0.350676f, 0.044706f, 0.297059f, 0.016290f, + 1.121203f, 1.289062f, -1.299476f, -1.129221f, 0.103752f, 0.131302f, + -0.263265f, 0.222155f, -0.229908f, 0.013922f, -0.226001f, -0.248383f, + -0.004415f, -0.020958f, 0.055634f, 0.086200f, 0.114556f, -0.184061f, + -0.096210f, -0.146466f, -0.249618f, -0.195998f, 0.088758f, 0.023781f, + -0.264460f, 0.157026f, -0.235228f, -0.102564f, 0.043463f, -0.187823f, + -0.257500f, -0.199049f, -0.242210f, 0.030448f, 0.221604f, 0.151804f, + -0.100404f, -0.073931f, 0.144749f, -0.001572f, -1.438079f, -0.233716f, + 0.733422f, 1.727080f, -0.036397f, 0.027551f, 0.425321f, 0.085703f, + 0.031186f, 0.032333f, -0.675130f, 1.437733f, -0.202392f, -0.525003f, + 0.087048f, 0.328194f, -0.079989f, -0.391088f, -0.238732f, -0.120660f, + -0.139600f, 0.154665f, 0.026202f, -0.233501f, -0.009046f, -0.149187f, + -0.199646f, 0.115375f, 0.209762f, -0.014875f, 0.124038f, -0.119985f, + 1.079625f, -0.461513f, 0.614114f, 0.021003f, 0.439449f, -0.824834f, + -0.299701f, 0.193817f, -0.870551f, -1.262313f, -0.079517f, 0.341570f, + 0.305310f, -0.089721f, -0.317314f, -0.075631f, 0.127172f, -0.208635f, + 1.191922f, 0.163141f, 0.564285f, 0.286352f, 0.480865f, 0.173094f, + -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f, 0.090258f, + -0.016099f, 0.193230f, 0.188061f, 0.398144f, 0.722781f, 0.769949f, + 0.025442f, -0.162016f, 0.070192f, -0.056946f, -0.100957f, -0.219934f, + -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f, -0.017493f, + 0.527446f, 0.083605f, 0.588318f, 0.878215f, 0.028747f, -0.146479f, + -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f, -0.101340f, + -0.027733f, -0.282611f, 0.265366f, 0.082362f, -0.265420f, -0.131124f, + 0.166303f, 0.040194f, -0.100710f, 0.579151f, -0.530136f, 0.163422f, + -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f, -0.090302f, + 1.723272f, 0.552370f, -0.295954f, -0.439095f, -0.266730f, 0.027936f, + 0.539616f, -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f, + -0.121620f, -0.205517f, -0.180144f, 
-0.264208f, 0.151500f, -0.159378f, + 0.029145f, -0.050892f, -0.223407f, -0.246239f, 0.043152f, -0.018460f, + 0.169972f, -0.187769f, -0.034670f, -0.238330f, 0.288070f, -0.093243f, + -0.437105f, -0.573376f, 0.660073f, 0.285727f, 0.408470f, 0.158475f, + 0.032699f, 0.056280f, -0.237176f, -0.083003f, 0.105598f, -0.169522f, + -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f, + 0.029124f, 0.009580f, -0.252034f, 0.103087f, 1.156561f, 0.603848f, + -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f, 0.076095f, + 1.490819f, 0.415893f, -0.277788f, -0.115787f, 0.093750f, 0.270726f, + -0.395983f, -0.353742f, 0.034605f, 0.005342f, 0.184537f, 0.086445f, + 0.156417f, 1.476367f, 0.122587f, 0.002145f, 0.431057f, -0.381184f, + -1.646457f, -0.014009f, -0.671224f, 0.193726f, -0.019247f, -0.031267f, + -0.046208f, 0.298733f, 0.064734f, 0.616984f, 0.039381f, 0.182722f, + -0.116670f, 0.233093f, -1.214374f, -0.817970f, -0.064394f, -0.584783f, + 0.077697f, -0.266720f, 0.130875f, -0.235295f, -0.265754f, -0.159999f, + -0.250114f, -0.183017f, 0.194403f, -0.105808f, -0.169215f, -0.240866f, + -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f, + -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f, + 0.881460f, -0.678603f, 0.008666f, -0.252053f, -0.341035f, -0.175290f, + 0.183012f, 0.385991f, 0.079888f, -0.014039f, -0.148653f, 0.671778f, + -0.130219f, 1.086467f, 0.129267f, -0.040400f, -0.201221f, -0.077005f, + 0.015890f, 0.000781f, 0.137764f, 1.389546f, 0.172152f, 0.047279f, + -0.042783f, 0.127740f, 0.141467f, -0.335738f, -1.396392f, 0.031496f, + 0.357385f, 0.343602f, -0.714553f, 0.311014f, 0.132845f, 0.061149f, + 0.006796f, 0.568106f, -0.255949f, 0.104134f, -0.993447f, 0.298135f, + -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f, + 0.068481f, 0.036240f, -0.495801f, 0.180574f, -0.766129f, 0.886967f, + -0.568868f, -0.936062f, -0.418886f, -0.058735f, -0.511964f, -0.438596f, + 0.019016f, -0.015837f, 0.600197f, 0.429773f, 0.315026f, 0.319667f, + 0.214617f, -0.017316f, 0.270257f, -0.040524f, 0.695803f, -0.015223f, + -1.554965f, 0.356997f, -1.472428f, 0.024637f, -0.562958f, 0.870351f, + 0.193635f, 0.036063f, 0.328638f, 0.200274f, -1.634707f, 0.110534f, + 0.420104f, -0.072042f, -0.006404f, 0.171680f, +}; + +static const float av1_ab_partition_nn_bias_128_layer0[64] = { + 0.643147f, -1.348826f, 0.431627f, 0.000000f, 0.102717f, -0.772628f, + -0.034351f, -0.761977f, -0.638397f, 0.541969f, -0.391311f, 0.563076f, + 0.148553f, 0.267217f, -0.788092f, 0.544573f, -0.546280f, 0.000000f, + -0.446945f, 0.127732f, 0.270624f, -0.219435f, -1.220203f, 0.324584f, + 0.110885f, 0.276547f, 0.179726f, -0.375160f, 0.026401f, -0.032595f, + 0.000000f, -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f, + 0.476453f, -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f, + 0.915351f, -0.209962f, 0.000000f, -0.025731f, 0.218288f, 0.000000f, + 0.047726f, -0.813077f, -1.263281f, 0.239087f, 0.278614f, -0.030753f, + 0.000000f, 0.346744f, -0.948543f, -1.174211f, 0.216377f, 0.498913f, + 0.853918f, 0.002504f, -0.190403f, 0.452050f, +}; + +static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = { + 0.179769f, 1.499417f, -0.445135f, -0.142278f, -0.337661f, 0.682064f, + -0.203213f, 0.302171f, 0.226877f, -0.422169f, 1.687586f, 0.783773f, + 0.220995f, 0.253482f, 0.370435f, -1.342775f, 0.337229f, -0.271473f, + 0.291796f, 1.362227f, -1.751397f, -0.086178f, 0.725496f, -0.118597f, + 0.227963f, -0.501577f, 0.223849f, -0.122421f, -0.123437f, -0.051045f, 
+ -0.020115f, 0.212711f, 0.246025f, 0.088120f, -0.168995f, 1.740190f, + -0.195098f, 0.680339f, -0.589572f, -0.075244f, 0.878766f, 0.064092f, + -3.548527f, 0.001660f, 0.107926f, -0.169501f, -0.455212f, 0.123045f, + -1.836998f, 0.330365f, 1.301475f, 0.454761f, -0.576552f, -0.190761f, + 0.208459f, 0.618483f, 1.383364f, 0.970718f, 0.390174f, 0.406252f, + -0.564519f, -0.312062f, 1.345712f, -0.151873f, 0.109290f, 0.408847f, + 0.391243f, 0.152024f, 0.181764f, -0.036263f, -0.160466f, 0.153595f, + 0.049163f, -0.753012f, -1.804062f, 0.347475f, -2.746580f, 0.575618f, + 0.261799f, 0.210505f, -0.302054f, -0.109872f, 0.199506f, -1.182971f, + 0.723668f, 0.177758f, -0.338202f, 0.254396f, -0.220023f, 0.043504f, + 0.669866f, -0.040816f, -0.402730f, 0.017990f, 0.215523f, -0.216816f, + 0.454826f, -0.726067f, -0.018750f, -0.928679f, 0.154315f, -0.465641f, + 0.144566f, -0.030064f, -0.054667f, -0.154055f, 0.625384f, 1.323795f, + -0.159496f, 0.097072f, -0.463197f, -0.057938f, 0.750290f, -0.233061f, + 0.412631f, -0.535223f, -0.151423f, -0.154583f, 0.024721f, -0.494448f, + 0.230594f, -0.980138f, -0.653968f, 0.126079f, 0.051814f, -0.053219f, + -0.421708f, -0.228853f, 0.237885f, 0.888157f, 0.059655f, 0.241295f, + 0.210443f, 0.228238f, 0.119127f, -0.051989f, -0.355408f, 0.182215f, + 0.244277f, -0.104577f, -0.558035f, -0.023270f, 0.054571f, 0.700646f, + -0.223006f, 0.115523f, 0.023391f, 0.437264f, 0.709477f, -0.531212f, + -0.094731f, 0.328161f, -0.105418f, -0.133511f, 0.497168f, -0.030948f, + -0.407132f, -0.043943f, 0.155505f, 0.251945f, 0.205010f, 0.167160f, + 0.083654f, -0.636810f, 0.401315f, -0.398414f, 0.290046f, 0.206846f, + 0.042218f, 0.168150f, 0.843181f, -0.671242f, -0.202392f, -0.073301f, + 0.142895f, 0.237466f, 0.212145f, -0.091828f, 0.187038f, -0.720841f, + -0.616069f, -0.238021f, 0.065365f, 0.434119f, 0.179023f, -0.040107f, + -0.430734f, -0.297368f, 0.575954f, 0.382619f, -0.709787f, -0.320810f, + 0.242342f, -0.047614f, 0.705216f, 0.098077f, 0.357179f, 0.046017f, + 0.115074f, -0.412305f, -0.272304f, 0.048096f, -0.803811f, 0.275000f, + 0.642198f, 0.180286f, -0.087178f, -0.112707f, -0.394443f, 0.201989f, + 0.241759f, -1.038870f, 0.728124f, 0.800559f, -1.296268f, 0.198612f, + -0.053478f, 0.414344f, -0.510529f, 0.124179f, -2.219115f, -0.074583f, + -0.143055f, 0.001697f, 0.810811f, -0.657140f, 0.186818f, -0.936414f, + 0.539578f, -0.308244f, -0.126624f, -0.204767f, 0.091145f, -0.049340f, + 0.252014f, 0.394582f, 0.018764f, -0.060377f, -0.019133f, 0.064083f, + 0.069211f, -0.526693f, 0.209850f, -0.481466f, -0.468302f, -0.100407f, + 0.241018f, -1.037781f, 0.038539f, -2.113840f, -0.974895f, 0.163187f, + 0.425132f, -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f, + -0.745175f, -0.177077f, 0.217658f, 0.381431f, -0.052338f, 0.087176f, + -0.165972f, 0.085937f, 0.472564f, -0.796627f, -2.453307f, 0.569664f, + -0.233010f, -0.192134f, 0.064339f, -0.111411f, -0.262469f, -0.410022f, + 0.519993f, -0.684620f, 0.393460f, -0.277753f, -0.153624f, 0.528984f, + -0.415558f, -0.445863f, 0.588512f, -0.142439f, -0.132127f, 0.199776f, + -0.579284f, 0.119488f, -0.033590f, -0.503846f, -0.674979f, 0.335125f, + 0.020519f, 0.233973f, -0.297998f, -0.051511f, 0.518626f, -0.412782f, + -0.074045f, 0.130523f, 0.465751f, -0.117795f, 2.535813f, 0.352108f, + -0.499228f, 0.379784f, 0.056699f, 0.173142f, -0.076519f, -0.026666f, + 0.017834f, 0.492333f, 0.093364f, 0.037867f, -0.165420f, -0.356429f, + -0.562334f, 0.057656f, -0.307544f, 0.085857f, -0.559851f, 0.107230f, + -0.398633f, 0.152618f, -0.216835f, -0.024539f, 0.026044f, -0.249519f, + 
-0.563594f, -0.746025f, 0.025265f, -0.298888f, -0.185243f, 0.058794f, + 0.233696f, -0.115223f, 0.144617f, -0.864390f, 0.619944f, -0.023980f, + 0.019481f, 0.225252f, 0.416552f, -0.115993f, 0.935387f, 0.744386f, + 0.053353f, -0.052582f, -0.065650f, 0.228488f, -0.032042f, -0.371252f, + -0.003638f, -0.736984f, -0.203776f, 0.030922f, -0.065577f, -0.031643f, + -0.049253f, -0.054640f, 0.787134f, 0.545414f, -0.140297f, -0.124274f, + -0.110011f, -0.029552f, 0.657005f, 0.214973f, -0.374300f, 0.251642f, + 0.276591f, 0.030566f, -0.145470f, 0.350579f, -0.356436f, -0.052694f, + -0.063966f, -0.751008f, -1.042392f, 0.328892f, -0.425058f, -0.421571f, + -0.571889f, -1.141472f, -0.125216f, 0.212713f, -0.485170f, -0.088791f, + 0.124589f, 0.023237f, 0.077635f, 0.020901f, -0.271402f, -0.321424f, + -0.513946f, -0.867872f, -0.284593f, 0.106276f, 0.220192f, -0.143532f, + -0.014648f, 0.073402f, 0.327256f, -0.139803f, 0.168763f, 0.048199f, + -0.122526f, 0.111713f, -0.134257f, 0.810364f, -0.085222f, -0.259221f, + -0.239349f, 0.044448f, 0.205031f, 0.413113f, -0.107720f, -0.018816f, + -0.247741f, -0.004963f, 0.041170f, -0.158019f, 0.134839f, 0.129502f, + 0.800488f, -1.041584f, -0.129336f, 0.170834f, 0.566586f, -0.230443f, + 0.437937f, -0.149922f, -0.046665f, -0.094646f, 0.200070f, 0.072943f, + -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f, -0.444731f, + -0.100877f, 0.545196f, -1.786626f, -0.482946f, 0.500509f, -0.843257f, + 0.200374f, 0.045103f, -0.575718f, -0.164335f, -0.232522f, -0.021825f, + -0.139490f, 0.356058f, -0.352075f, 0.061751f, -0.200616f, -1.180921f, + -0.181355f, -0.137459f, 0.247574f, 0.181541f, 0.184314f, -0.961482f, + 0.493615f, 0.910261f, -2.279238f, 0.648631f, -0.055526f, -0.037137f, + 0.038643f, 0.136609f, -0.819373f, -0.040840f, -0.265989f, 0.006877f, + 0.454651f, -0.595323f, -0.099500f, -0.263717f, 0.150456f, 0.245077f, + -0.268666f, 0.162232f, -0.516451f, -0.024501f, 0.188046f, -0.002262f, + 0.261319f, 0.004173f, 0.746982f, 0.174761f, 0.470447f, -0.159558f, + -0.385240f, 0.023084f, -0.133520f, -0.220607f, -0.018731f, -0.373558f, + -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, -0.160740f, + -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f, -0.153361f, + 0.334394f, -0.569472f, -0.198118f, 0.255922f, 0.104717f, -0.065179f, + 0.111879f, -0.447237f, 1.373623f, -0.190191f, -0.063311f, 0.337529f, + -0.138800f, 0.057009f, -0.137006f, 0.641378f, 0.883147f, -0.679655f, + 0.267717f, -0.351602f, -0.135225f, 0.229398f, -0.513225f, -1.120345f, + 0.528786f, -0.051081f, 0.086653f, 0.140141f, -0.563969f, 0.333402f, + -0.174745f, 0.321093f, -0.438641f, -0.005131f, 0.247415f, 0.110120f, + -0.076308f, -0.083244f, 0.838944f, -0.113043f, -0.013258f, -0.175028f, + -0.179941f, 0.272676f, -0.047946f, -0.088076f, -0.450031f, 0.053929f, + -0.083549f, -0.089952f, -0.186253f, 0.257483f, 0.011019f, 0.586435f, + 0.060580f, -0.052078f, 0.090277f, -0.780869f, 0.969811f, -0.025349f, + -0.281917f, 0.014857f, 0.231863f, -0.228601f, -0.003861f, 0.226550f, + 0.141825f, -0.102171f, -0.010387f, 0.220378f, -2.561975f, -0.497071f, + -0.315117f, 0.371981f, 0.138247f, 0.625031f, -0.308133f, -0.217876f, + 0.005615f, -0.860179f, 0.747491f, 0.006356f, -0.057024f, -0.483189f, + 0.055592f, -0.316834f, 0.069858f, 0.218788f, -0.200044f, 0.227588f, + 0.215496f, -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f, + -0.152512f, -0.332995f, 0.129053f, 0.178668f, -0.302694f, 0.030678f, + 0.925896f, 0.964375f, 0.169021f, -0.218657f, -0.627204f, 0.206437f, + -0.521336f, 0.176206f, 0.142733f, 0.139248f, 
0.411682f, 0.181544f, + 0.224850f, -0.935547f, -0.558208f, 0.348096f, 0.342129f, -0.389340f, + -0.236308f, -0.132099f, 0.073642f, 0.089391f, -0.306901f, -0.397842f, + 0.444282f, 0.074623f, -0.051075f, -0.106617f, -0.184037f, -0.239046f, + -0.138761f, 0.120794f, -0.647577f, -0.336471f, 0.527899f, -0.164234f, + -0.028354f, 1.083678f, -0.251534f, -0.145903f, -0.182783f, 0.070976f, + -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f, + -1.152632f, 0.383685f, -0.105895f, -0.096829f, 0.118382f, 0.047447f, + -0.019051f, 0.310180f, -0.162793f, -0.029574f, 0.058054f, -0.636017f, + 0.490639f, 0.158347f, -0.385701f, -0.147057f, 1.285825f, -1.276083f, + -0.021795f, -0.101600f, 0.163254f, 0.267160f, -2.317864f, -0.098598f, + -0.296337f, -0.309017f, 0.164127f, -0.270012f, -0.071187f, -0.262270f, + 0.075415f, -0.368328f, 0.186728f, -0.158031f, 0.481663f, 0.515950f, + -0.162551f, 0.497981f, 0.262196f, 0.168479f, 0.726066f, -0.243856f, + -0.058998f, 0.140168f, 0.053242f, -0.624623f, -0.249480f, 0.055197f, + -1.376804f, 0.417571f, 0.203784f, 0.174370f, -0.155531f, -0.029400f, + -0.491473f, 0.079811f, -0.080123f, 1.345900f, 0.637077f, 0.434862f, + -1.787438f, 0.005756f, -0.362706f, 0.179458f, -0.288263f, 0.516788f, + -0.921248f, 0.043794f, -0.137729f, -0.196171f, -0.046295f, -0.793781f, + -0.156532f, -0.132566f, 0.517989f, -0.154321f, -0.054174f, -0.077900f, + -0.373316f, -0.117718f, 0.188986f, -0.476188f, -0.245312f, 0.181439f, + -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f, + -0.135429f, 0.125766f, -0.081314f, -0.350894f, -0.163165f, -1.936507f, + -0.205966f, 0.031472f, 0.744446f, -0.006680f, -0.837551f, 0.605862f, + -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f, -0.183586f, + -0.010307f, 0.099373f, -0.228278f, 0.175236f, -0.000133f, 0.104491f, + -1.540545f, -0.570971f, -0.252885f, 0.483036f, 0.052531f, 0.260214f, + -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f, -1.775975f, + -0.298634f, 0.323626f, -0.373579f, -0.872977f, 0.619574f, 0.026862f, + -0.122531f, -0.084698f, -2.436297f, 0.483996f, -0.203640f, -0.302157f, + -0.150666f, -0.238320f, 0.089250f, 0.236485f, -0.668654f, -0.122863f, + 0.491152f, -0.226444f, -0.181248f, 0.120158f, 0.294027f, 0.250056f, + 0.307601f, 0.357875f, -1.746455f, -0.175670f, 0.385447f, -0.108808f, + -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f, + 0.607555f, -0.489426f, 0.150624f, 0.598114f, -0.128816f, -0.445793f, + -0.066524f, -0.254380f, 0.227106f, -0.406495f, -0.121632f, -0.275960f, + -0.136494f, 0.339457f, -1.318132f, -0.417572f, -2.614077f, 0.324603f, + -0.001211f, 0.375192f, -0.473448f, -0.162510f, 0.099329f, -0.277965f, + 0.101221f, -0.060263f, 0.121867f, -1.042140f, 0.440851f, 0.078898f, + -0.209007f, -0.243699f, 0.715197f, -0.093997f, 0.086022f, -0.178203f, + -2.275496f, -0.098413f, 0.199352f, -0.526791f, -0.162086f, -0.197806f, + -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f, 0.416236f, + 0.064082f, 0.197655f, 0.340871f, -0.186645f, -0.291498f, 0.433938f, + -1.110063f, 0.003751f, 0.392738f, 0.069360f, 0.102088f, -0.302128f, + -1.518457f, 0.106939f, 0.404527f, -0.306868f, -0.286928f, 0.729276f, + -0.531710f, 0.745048f, -0.168837f, -1.953886f, -0.258828f, -0.190252f, + 0.241877f, -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f, + -0.489957f, 0.100850f, 0.323999f, -0.802837f, -0.462408f, -0.079350f, + -0.029374f, 0.131213f, -0.825032f, 0.040202f, 0.351821f, 0.002869f, + -0.132516f, -0.471264f, -0.297002f, 0.263913f, 0.033478f, 0.146161f, + 0.533229f, 
-0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f, + 0.005151f, 0.018584f, -0.029771f, -0.396038f, -0.159236f, 0.038691f, + -1.197056f, 0.146302f, 0.226840f, -0.852126f, 0.031214f, 0.108880f, + 0.562000f, -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f, + 0.515073f, -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f, + 0.347673f, 0.623379f, 0.722067f, -0.492458f, -0.513263f, 0.585167f, + 0.721518f, -0.693499f, 0.343725f, -0.273861f, -0.040230f, -0.785664f, + -0.157500f, -0.308445f, 0.054062f, 0.600131f, -0.860887f, 0.434470f, + -0.191382f, -0.306150f, -0.243965f, 0.705444f, 0.007789f, -0.146154f, + -0.054499f, -0.073500f, -1.067364f, 0.404936f, -2.864590f, 0.182323f, + 0.326126f, 0.102405f, -0.135800f, 1.128095f, -0.012267f, -0.023996f, + -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f, -0.498361f, + 0.083560f, -0.210074f, 0.019225f, -0.201614f, -0.904760f, 0.181421f, + 0.586384f, -0.177706f, 0.065471f, 0.168552f, 0.054705f, 0.045241f, + 0.048057f, -0.410957f, -2.188854f, -0.169812f, 0.015521f, 0.176856f, + -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f, 0.010454f, + 0.823643f, -0.119781f, -0.098359f, 0.093119f, +}; + +static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = { + -0.433195f, -0.120488f, -0.116721f, 0.112134f, 0.118170f, -0.259769f, + -0.077530f, 0.394044f, 0.279167f, -0.317988f, 0.189538f, 0.314776f, + 0.325655f, -0.107123f, 0.591049f, 0.358744f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_128_layer0, + av1_ab_partition_nn_weights_128_layer1, + }, + { + av1_ab_partition_nn_bias_128_layer0, + av1_ab_partition_nn_bias_128_layer1, + }, +}; + +// nn model for ab partition pruning, 64x64. 
+static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = { + -0.495347f, -0.049498f, -0.026804f, 0.030474f, -0.289308f, -0.264193f, + -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f, + -0.038217f, 0.014872f, -0.289728f, -0.233577f, -0.415875f, -0.343615f, + -0.442543f, -0.482492f, 0.073510f, 0.007503f, 2.162329f, -0.362849f, + 2.145915f, -0.883135f, 0.185636f, -0.062859f, -0.465574f, -0.486205f, + -0.056710f, -0.330642f, -0.321860f, 0.042321f, -0.348965f, 0.003542f, + -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f, + 0.246622f, 0.199651f, -0.663420f, -0.154152f, -1.220383f, 0.047138f, + 0.816811f, 0.083247f, -0.218839f, 0.038143f, -0.063436f, 0.015517f, + -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f, + 0.050425f, -0.221723f, -0.256942f, -0.287285f, 0.144011f, -0.033245f, + 0.083649f, 0.119428f, -0.056706f, -0.117805f, 0.021866f, -0.257300f, + -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f, -0.347247f, + 0.042539f, -0.302697f, 1.652316f, 0.000701f, -0.482843f, -0.160332f, + -0.450099f, 0.212399f, -4.715360f, -5.336774f, -5.375758f, -6.048339f, + 0.085956f, -0.037767f, 1.052409f, -0.931924f, -2.221907f, 0.268946f, + 0.015512f, 1.237094f, -1.092185f, 0.418247f, -0.082143f, -0.076914f, + -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f, + -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f, + -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f, 0.019839f, + 0.451127f, 0.004376f, 1.410392f, 3.255835f, -0.344815f, 0.145202f, + 0.204132f, 0.171948f, -0.527736f, -0.110353f, 0.901448f, 0.003238f, + -3.822090f, 0.235462f, 1.024823f, -0.821244f, 0.876056f, 2.553762f, + -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f, + -0.246040f, 0.039430f, -0.071769f, -0.118847f, -0.304053f, -0.281541f, + -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f, + 0.084721f, 0.168089f, -0.272169f, -0.204998f, -0.008303f, -0.173998f, + 0.079376f, -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f, + 0.066176f, -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f, + -0.385845f, 0.119769f, -0.006567f, -0.382126f, -0.214221f, 0.038449f, + -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f, -0.114423f, + -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f, + -0.279011f, -0.008132f, 0.208463f, 0.020569f, -0.206803f, -0.213408f, + -0.206131f, -0.290245f, 0.069701f, -0.000371f, -0.307572f, -0.451785f, + -0.300838f, -0.453186f, -0.301691f, 0.046327f, -0.312668f, 0.058272f, + -0.303131f, -0.376252f, 0.108384f, -0.086623f, -0.100630f, -0.027330f, + -0.003969f, 0.089502f, -0.200722f, -0.107889f, 0.061843f, -0.008478f, + -0.265057f, -0.271132f, -0.073562f, 0.129337f, -0.283698f, -0.353414f, + 0.076420f, -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f, + -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f, + -0.011932f, -0.585700f, 0.253212f, -1.061900f, -0.205116f, -0.336407f, + -0.762199f, 0.577737f, 0.230832f, 0.434440f, -0.096713f, 0.038552f, + -0.147800f, -0.213553f, 0.041740f, -0.281907f, -0.026154f, -0.082356f, + -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f, + -0.391963f, -0.467392f, 0.027453f, -0.394761f, -0.045544f, 0.076052f, + 0.483985f, 0.067093f, 0.141361f, 0.576772f, 0.859718f, 2.566515f, + -0.025476f, 0.769738f, -0.680235f, -1.683309f, -2.394131f, -0.000714f, + -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f, + 
0.551148f, 1.777227f, -0.461630f, 0.043093f, 0.012293f, -0.255841f, + -0.097070f, -0.371156f, -0.146323f, -0.015508f, -0.103873f, -0.087476f, + -0.297266f, -0.128699f, -0.149555f, 0.016534f, -0.375498f, -0.346759f, + -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f, + -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f, + -0.048421f, -0.144133f, 0.889073f, 0.012606f, 3.007608f, -0.602584f, + -1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f, -1.867208f, + -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f, 0.119141f, + -0.230715f, 0.083247f, 0.020367f, -0.128629f, -0.217455f, -0.159640f, + 1.815952f, -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f, + 0.662971f, 0.486475f, 0.159746f, -0.018932f, 3.692397f, 1.384353f, + -0.401984f, -0.248380f, -0.140861f, 0.215248f, -0.023711f, 0.059679f, + -0.072260f, 0.004271f, 0.039545f, -0.347971f, -0.081851f, -0.474896f, + -0.181572f, 0.066736f, -0.157822f, -0.163760f, -0.171113f, -0.089935f, + -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f, + -0.102701f, -0.312336f, 0.149831f, 0.007229f, -0.155700f, -0.173611f, + 4.074261f, 1.342306f, -1.272712f, 1.570899f, -0.545093f, -0.317605f, + -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f, + -0.239130f, -0.067211f, 0.041957f, -0.039234f, -1.003587f, -0.094412f, + 0.532512f, -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f, + 0.419466f, 0.492122f, -0.004368f, -0.022096f, -1.115132f, 0.150886f, + 2.396852f, 2.660000f, -0.376537f, 0.468628f, 0.149413f, -0.074898f, + -0.067154f, 0.021245f, 0.127857f, 0.294189f, 0.508056f, 0.390232f, + -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f, + -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f, + -0.398724f, -0.372068f, -0.234279f, 0.017799f, -0.424760f, -0.646717f, + -0.047568f, 2.924664f, -0.644165f, 0.359349f, -0.294800f, 0.591746f, + -0.404710f, -0.092358f, -0.250729f, 0.030829f, -0.147149f, -0.476023f, + -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f, + -0.377052f, -0.449899f, -0.056452f, 0.138081f, -0.085350f, -0.308391f, + 0.106661f, 0.176234f, 0.258869f, -0.230172f, -0.233029f, -0.241208f, + -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, -0.561326f, + -0.158114f, -0.223167f, -0.026689f, 0.051863f, 0.212834f, -0.304714f, + -0.169071f, -0.193695f, -0.075682f, -0.170860f, -0.241008f, -0.044648f, + 0.280815f, -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f, + -0.380595f, 0.109504f, -0.111141f, -0.437685f, -0.094459f, 0.144206f, + -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f, + -0.418429f, -0.183240f, 0.031319f, -0.095785f, -0.315447f, 0.069404f, + -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f, + -0.178198f, 0.177208f, 0.134688f, -0.081933f, -0.229452f, -0.208872f, + 0.026287f, -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f, + -0.267238f, -0.494125f, -0.056255f, 0.053715f, -0.487754f, 0.014818f, + 0.087383f, -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f, + -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f, 0.010186f, + -0.001587f, 0.086735f, -2.465718f, 1.482185f, 1.621193f, -2.081680f, + 1.386553f, -3.204335f, -0.267111f, -0.004508f, 0.164712f, 0.274147f, + 1.724306f, -2.273659f, 0.749574f, -0.891905f, 0.105965f, -0.030428f, + -0.416018f, -0.300762f, 0.122911f, -0.316908f, -0.292504f, 0.138666f, + -0.161327f, -0.042143f, -0.249128f, 0.149210f, -0.088987f, -0.654101f, + 
-1.501843f, 0.216777f, 0.955914f, 0.524158f, -1.642561f, -1.643626f, + 0.864797f, -0.425451f, -2.115764f, -0.012502f, 0.065172f, 1.297270f, + 0.018845f, 1.167276f, -0.470970f, -0.244995f, 0.374782f, -1.811056f, + -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f, + -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f, + -0.370683f, 0.172816f, -0.265069f, 0.194321f, -0.273478f, 0.037442f, + -0.235552f, -0.078625f, -0.447541f, 0.016836f, -0.271123f, -0.171481f, + -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f, + -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f, + 0.230343f, -0.034318f, -0.022687f, -0.047090f, +}; + +static const float av1_ab_partition_nn_bias_64_layer0[64] = { + -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f, -0.150944f, + -0.075727f, -0.208414f, 1.054996f, 0.713758f, -0.300051f, -0.151482f, + -2.443570f, 0.430590f, -0.129001f, -0.160733f, -0.230547f, -0.143228f, + -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f, + 0.578161f, -0.220318f, -0.210107f, -3.111584f, 0.604419f, -0.232622f, + -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f, -2.535531f, + -0.209783f, -0.211189f, -2.766337f, 0.000000f, 0.450177f, -1.754884f, + 3.262664f, -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f, + -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f, + -0.305430f, 0.739171f, 0.991277f, -0.088150f, 0.086313f, -0.023379f, + -0.125366f, -0.063576f, -0.212169f, -0.047463f, +}; + +static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = { + -0.036800f, 0.528721f, 0.490767f, 0.144409f, 1.103640f, 0.361910f, + -0.180069f, 0.068033f, -14.868382f, 0.359013f, 0.322567f, -0.199212f, + 0.906164f, -0.488254f, 0.149653f, -0.216394f, -0.099347f, 0.004936f, + -0.111391f, 0.074848f, -0.041709f, 0.147627f, -0.018905f, 0.096116f, + 0.184817f, -0.016241f, 0.115739f, 2.376754f, 0.637097f, 0.052954f, + 0.136428f, 0.225267f, -0.181873f, -0.142876f, 0.684048f, 0.658791f, + 0.105795f, 0.241705f, 1.381114f, -0.209379f, 1.145949f, 0.795293f, + -9.361877f, 0.198302f, 0.539600f, 0.092317f, -0.081695f, 0.200777f, + 0.102334f, 0.081583f, 0.060948f, -0.025110f, 0.160951f, -0.020170f, + 0.234006f, -0.029369f, 0.375036f, 0.270209f, -0.556529f, 1.402949f, + 0.101777f, -0.027331f, 0.004502f, -0.153166f, -0.116651f, 0.151573f, + -0.022187f, 0.144044f, -0.108719f, -0.129942f, -0.270321f, 0.227363f, + 1.892330f, -0.661052f, -0.219398f, -0.229417f, -0.856438f, -1.196988f, + -0.081774f, 0.078847f, -0.207057f, -0.048947f, 0.152073f, -0.243056f, + -0.233329f, -0.288689f, -0.158333f, -0.141177f, -0.715436f, 0.016947f, + -0.093752f, 0.204984f, -1.209782f, 0.155683f, 0.092239f, 0.146495f, + 0.813146f, -0.027757f, 0.330982f, 2.173948f, -0.028867f, -0.141815f, + 0.292708f, -0.204794f, 0.014496f, 1.032799f, 1.312155f, 0.107020f, + 0.824752f, -0.013945f, 0.184829f, -0.041633f, 0.215300f, -0.476088f, + -0.053213f, 0.126862f, -0.020777f, 0.082893f, -0.223727f, -0.923063f, + 0.466529f, 0.082140f, -0.845758f, -1.140791f, -0.262033f, 0.138491f, + 0.151717f, -0.182479f, -0.131128f, 0.055411f, 0.106771f, 0.125552f, + 0.297184f, -0.257403f, -0.059884f, -0.274903f, 2.694357f, -0.108244f, + 0.025377f, 0.043092f, -0.558317f, 3.517159f, -0.270833f, -0.240676f, + 0.205100f, -0.057068f, -0.140445f, -0.193449f, -0.030061f, -0.286762f, + -0.467523f, -0.012647f, 0.190564f, 0.022394f, -0.101479f, 0.339684f, + -0.902743f, -0.169578f, -0.178029f, -0.041836f, -3.952108f, -0.028298f, + 
-0.221137f, -0.733895f, -0.223895f, 0.039012f, 0.687867f, 0.021423f, + 0.113063f, 0.676087f, -0.961000f, -0.064847f, 0.712856f, -0.192765f, + -0.001132f, 0.016689f, -0.236020f, -0.766186f, -0.175729f, 0.012879f, + -0.251064f, -0.105523f, -0.039212f, -0.347584f, 0.304352f, -0.034174f, + -0.364258f, -0.685252f, -0.266115f, -0.247345f, -0.155905f, 0.152283f, + -0.156315f, 0.174082f, -0.757654f, 0.102303f, -2.192316f, -0.245815f, + 0.119882f, -0.086542f, 1.987246f, -1.353163f, -0.374813f, -0.233504f, + -1.980895f, 0.692093f, -0.168351f, 0.172700f, -0.009052f, -0.015734f, + 0.106679f, -0.060472f, -0.256813f, -0.074874f, -0.207488f, -0.329515f, + -0.418268f, -0.017940f, -0.036081f, 0.064719f, -1.488016f, 0.020591f, + -0.176325f, -0.141074f, 0.944494f, 0.150237f, -0.249805f, -0.277280f, + 0.012686f, 0.132483f, 0.116123f, 0.013737f, -0.116091f, 0.750340f, + 3.251343f, -0.188864f, 1.096992f, 0.058467f, -0.041433f, -0.037937f, + -0.133294f, -0.137908f, -0.171132f, 0.106362f, 0.069383f, -0.052662f, + -0.177883f, -0.408049f, 0.680221f, -0.117035f, -0.904240f, -1.395228f, + 0.154527f, 0.134427f, 0.022767f, -0.158886f, -0.230316f, 0.161096f, + 0.362213f, -0.235060f, -0.941620f, 0.055912f, -0.049458f, -0.166632f, + 0.481418f, 0.930146f, 0.041108f, 0.033674f, 1.372066f, -1.847709f, + 0.003324f, 0.259534f, 0.177014f, -0.202761f, -0.262017f, -0.190852f, + -0.102839f, 0.028338f, 0.187193f, -0.041684f, 0.123973f, -0.198576f, + -0.110369f, -1.431400f, 0.208369f, -0.302370f, -0.248549f, 0.062985f, + 0.673409f, 0.036662f, -0.711340f, -0.120584f, -0.189789f, 0.098812f, + 2.947819f, 0.216567f, -0.414472f, -0.181742f, 1.873779f, -0.222726f, + -0.782870f, 0.007889f, 0.015062f, -0.554328f, 0.182928f, -0.191430f, + 0.123636f, -0.215460f, -0.225245f, 0.251516f, -0.013025f, -1.359595f, + -0.750602f, 0.342667f, -0.141899f, -0.687493f, -0.072639f, 0.048018f, + -0.242107f, -0.031917f, -0.287472f, -0.046088f, 0.832197f, -0.016576f, + -1.553349f, -0.216341f, 0.023077f, -0.410867f, 4.243743f, -0.514878f, + -0.066007f, -0.160696f, -0.262678f, -0.648790f, -0.430586f, 0.199940f, + -0.202496f, -0.222241f, -0.016406f, -0.121473f, 0.000828f, -0.081584f, + -0.152641f, -0.190166f, 0.644400f, 0.040196f, -0.302104f, -1.143654f, + -0.160327f, -0.320780f, -0.187006f, 0.037311f, 0.440618f, -0.070733f, + -0.117785f, 1.527539f, -0.419310f, 0.001300f, 1.389956f, -0.036366f, + -0.269203f, 0.612265f, 2.721897f, -0.086836f, -0.446999f, 0.012525f, + -0.078317f, -0.287052f, -0.111188f, -0.085181f, -0.164667f, -0.010466f, + -0.569722f, -0.018888f, -0.101663f, -1.147130f, -0.465204f, 0.114524f, + -2.192402f, -0.221325f, 0.375748f, 0.206284f, -0.261548f, -0.246257f, + -0.143004f, -0.069981f, -0.057306f, -0.116481f, -0.435903f, -0.314970f, + 0.013210f, -0.010175f, 4.630571f, -0.473226f, -0.197199f, -0.028204f, + 0.122907f, 2.475548f, 0.025011f, -0.092603f, -0.127561f, -0.151330f, + -0.077295f, 0.245016f, -0.045005f, 0.183396f, -0.330556f, -0.384887f, + 0.356374f, -0.016618f, -0.463353f, -1.291546f, -0.071986f, -0.311599f, + 0.072385f, -0.430786f, -2.094788f, 0.202733f, -0.910109f, -1.336543f, + -0.086800f, -0.096413f, 1.544383f, 0.031860f, -0.796211f, 0.762786f, + 3.250022f, -0.441798f, -0.698537f, 0.062839f, 0.033525f, -0.362996f, + 0.027022f, -1.131264f, -0.228926f, 0.053885f, -0.338628f, 0.155037f, + -0.046844f, -0.888172f, -0.241767f, 0.084965f, -0.617743f, -0.049896f, + -0.036894f, -0.304783f, -0.002639f, 0.137957f, 0.052121f, -0.131161f, + -0.117200f, -0.253380f, -0.205561f, -0.302450f, -0.047397f, -0.330518f, + 3.613420f, -1.525951f, 
-0.026738f, 0.209150f, -2.103534f, 2.019689f, + -0.366199f, -0.095260f, 0.027417f, -0.242512f, 0.162579f, 0.052113f, + -0.293851f, -0.068138f, -0.005799f, -0.344696f, -0.114824f, -0.431107f, + -0.120058f, -1.139926f, -1.048379f, 0.036446f, -0.323020f, -0.432945f, + 0.454151f, -0.140058f, 0.050649f, -0.094900f, -0.017278f, -0.238719f, + 1.193153f, 0.120447f, -0.496061f, 0.917431f, 2.936126f, -0.115521f, + -0.347397f, -0.435325f, -0.004383f, -0.211864f, 0.162383f, -1.040726f, + 0.089537f, -0.128579f, -0.133505f, 0.107129f, -0.435657f, -0.180388f, + 0.043650f, 0.018709f, -0.773242f, -0.687192f, -0.120633f, -0.063626f, + 0.029912f, 0.113972f, -0.403502f, -0.127640f, -0.269625f, 0.129794f, + -0.188539f, 0.041641f, 0.029769f, -0.198374f, 1.401407f, 0.353887f, + -0.219925f, 0.260515f, 1.157034f, -2.992044f, -0.097618f, -0.064417f, + -0.203626f, -0.008217f, -0.112339f, -0.227407f, -0.155118f, 0.247705f, + -0.012304f, -0.248447f, -0.913463f, -0.064788f, -0.214619f, -0.251761f, + -0.386861f, -0.040574f, -0.163219f, -0.100700f, 1.488274f, -0.071684f, + -0.033626f, -0.006497f, -0.246945f, -0.145221f, -3.747390f, 0.149609f, + -0.263326f, -0.297385f, -1.039896f, -0.083174f, -0.025473f, -0.235586f, + -0.001087f, 0.254286f, 0.265106f, 0.007325f, 0.199239f, 0.134103f, + -0.578211f, -0.259801f, -0.062373f, 2.368348f, 0.560556f, -0.252260f, + 0.889997f, -0.447872f, -0.059218f, -0.095315f, -0.061667f, 0.183580f, + -0.157479f, 0.055387f, -0.831734f, 0.007606f, -1.104906f, 0.301180f, + -0.117115f, 0.212959f, 4.727223f, -0.243833f, -0.397495f, -0.025021f, + -0.367587f, -2.082058f, -0.217699f, 0.148111f, 0.252430f, 0.111088f, + -0.260692f, 0.095124f, -0.407774f, -0.322169f, 0.002927f, 0.126169f, + -1.272325f, -0.279772f, -0.373680f, -0.485177f, -0.605458f, 0.021225f, + -0.092031f, -0.226585f, 1.895162f, 0.037866f, -0.275475f, 1.614360f, + -0.014972f, -0.277679f, -3.449082f, -0.092060f, -0.747873f, 0.020716f, + 2.776178f, -0.049963f, 0.183999f, -0.295259f, -0.028868f, 0.221895f, + 0.001265f, 0.336823f, 0.219372f, 0.112824f, 0.408132f, -0.017940f, + -0.311666f, 1.489606f, -0.058093f, -0.305659f, -0.491933f, -0.143847f, + 0.166115f, 0.042867f, -0.123447f, -0.087099f, -0.305395f, -0.365079f, + -0.755801f, -0.160649f, 0.736260f, -0.008611f, 0.095836f, -0.017345f, + 5.697515f, -0.498971f, -0.125280f, 0.199907f, 0.300053f, 0.605026f, + -0.228225f, -0.259523f, 0.016384f, 0.146973f, 0.210258f, 0.226766f, + -0.075178f, -0.050924f, 0.188496f, -0.415266f, -0.484880f, -0.236384f, + 0.071931f, -0.331863f, -0.601243f, -0.232479f, -0.285272f, 0.123789f, + -1.341333f, 0.037082f, -0.315202f, -1.587215f, -0.271576f, 0.003216f, + -4.437186f, -0.256205f, -0.576589f, -0.114147f, 2.153916f, -0.369618f, + 0.271415f, 0.145036f, -0.158731f, -0.240938f, -0.187369f, 0.036325f, + 0.254771f, 0.211488f, -0.240297f, 0.098417f, -0.415011f, 2.334793f, + -0.127252f, 0.020069f, -0.168755f, -0.448922f, -0.219207f, 0.016232f, + -0.221935f, -0.269500f, -0.100636f, 0.102545f, -0.809376f, -0.054979f, + 0.360713f, -0.326541f, 0.112933f, 0.138073f, 4.229404f, -0.763801f, + -0.305429f, 0.199955f, -1.787713f, 0.272866f, 0.109895f, 0.138466f, + -0.250259f, -0.167162f, -0.212588f, -0.217589f, -0.067125f, -0.077490f, + -0.208970f, -0.006863f, -0.671146f, -0.298320f, -0.165509f, 0.044597f, + -1.408624f, -0.213957f, -0.220947f, 0.129718f, 1.316777f, -0.098928f, + -0.008121f, -0.558293f, -0.297290f, -0.218873f, -4.346638f, -0.228174f, + -0.204710f, -0.388864f, 2.697919f, 0.025260f, 0.857020f, 0.009921f, + 0.036915f, -0.320275f, -0.087937f, 0.022636f, 0.236667f, 
0.135496f, + -0.059616f, -0.192955f, 0.009470f, 2.139589f, -0.200449f, 0.129818f, + 1.017444f, -0.608299f, 0.257914f, -0.134306f, -0.033327f, 0.002855f, + -0.338598f, 0.015559f, 0.117362f, -0.166760f, 0.086903f, -0.167666f, + 0.193523f, 0.033852f, -1.147686f, 0.489468f, -0.006969f, 0.125630f, + 1.557907f, -1.604449f, -0.071114f, 0.096178f, 0.007065f, 0.200013f, + 0.213393f, 0.168466f, -0.100568f, -0.117861f, -0.161542f, -0.072561f, + -1.069871f, -0.470138f, -0.352578f, -1.503513f, -0.001394f, -0.380109f, + 0.065089f, -0.281668f, 0.988953f, -0.002778f, -0.659026f, -0.470692f, + -0.407292f, 0.011710f, -1.362085f, 0.184738f, -0.135786f, -1.374241f, + 4.487930f, -0.067274f, -0.956404f, -0.233995f, 0.224527f, -0.454556f, + 0.037900f, -0.281658f, 0.208224f, -0.254753f, 0.045740f, 0.051444f, + -0.388281f, 0.257112f, -0.485030f, -0.082659f, 0.148103f, -1.007456f, + -0.022295f, 0.036984f, -0.369401f, -0.076943f, -0.007636f, -0.293022f, + 0.470466f, 0.199012f, -2.158182f, 0.036577f, -0.014725f, -0.229516f, + 2.236929f, 0.030945f, -0.400045f, 0.109348f, 0.214691f, -0.891516f, + -0.251379f, -0.217358f, 0.013733f, 0.205573f, -0.151725f, -0.191782f, + -0.339630f, -0.163905f, -0.119191f, -0.032516f, 0.503015f, 0.025772f, + 0.029094f, -1.146153f, 0.216723f, -0.330023f, 0.064695f, -0.262521f, + 0.425612f, -0.093080f, -0.489648f, 1.051293f, -0.092332f, 0.095557f, + -0.874132f, 0.218483f, -0.127648f, -1.605802f, 2.763617f, -0.186734f, + -1.243166f, -0.193514f, -0.173748f, 0.337822f, 0.183873f, -0.251594f, + -0.211582f, 0.144081f, 0.029620f, -0.024853f, -0.385140f, 0.467341f, + -0.928316f, -0.195442f, 0.917783f, 0.357084f, 0.174445f, -0.073659f, + -0.012811f, -0.115420f, -0.181147f, -0.364449f, -0.567395f, -0.012969f, + -1.680714f, 0.065323f, 0.198063f, -0.244201f, 1.428545f, -0.432539f, + -0.208931f, -0.091205f, 0.957125f, 0.813519f, -0.262677f, 0.246852f, + 0.015536f, 0.055026f, 0.067054f, 0.262103f, -0.358115f, -0.095206f, + -0.267522f, -0.402710f, -0.680397f, -0.123627f, -0.385590f, -1.504680f, + -0.169513f, -0.215338f, 0.043633f, -0.079052f, -0.464410f, 0.122894f, + -0.278231f, -2.456445f, -0.159917f, -0.015597f, -0.735449f, -0.078854f, + -0.400290f, -1.153870f, 3.657228f, -0.287093f, -1.174355f, -0.102001f, + -0.288281f, 0.185209f, -0.145228f, -0.200449f, -0.099914f, -0.138354f, + 0.254428f, -0.161751f, -0.118206f, 0.296043f, -0.482613f, 0.080932f, + 1.097605f, -0.010190f, 0.232439f, 0.447617f, -0.133508f, 0.115763f, + -0.388589f, 0.174695f, -0.236014f, 0.006284f, -1.374129f, 0.092015f, + -0.241419f, -0.231667f, 2.763950f, -0.922932f, -0.061605f, 0.208740f, + -1.597190f, 1.353325f, -0.198528f, 0.250498f, -0.013950f, -0.203861f, + -0.254563f, 0.081931f, -0.413369f, 0.011844f, 0.080961f, -0.231161f, + -1.234909f, -0.440843f, -0.174980f, -0.315283f, -0.337474f, -0.123243f, + -0.310001f, -0.271028f, 0.364179f, 0.022845f, -0.535517f, -0.772936f, + -0.188435f, 0.039667f, -0.807463f, 0.266550f, -0.288857f, -1.630789f, + 1.280155f, 0.065712f, -0.279960f, -0.300056f, 0.258440f, -0.073781f, + 0.213878f, 0.042196f, 0.021360f, 0.211698f, -0.003751f, -0.192673f, + -0.137008f, 0.247878f, -0.470604f, 0.073164f, 1.523241f, 0.734755f, + -0.114126f, -0.193834f, -0.025759f, 0.263183f, +}; + +static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = { + -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f, + -0.148074f, 0.923430f, -0.364770f, 0.203550f, 0.401216f, 0.938246f, + -0.872737f, 0.718723f, 0.703398f, 2.560015f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_64 = { + 
FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_64_layer0, + av1_ab_partition_nn_weights_64_layer1, + }, + { + av1_ab_partition_nn_bias_64_layer0, + av1_ab_partition_nn_bias_64_layer1, + }, +}; + +// nn model for ab partition pruning, 32x32. +static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = { + -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f, + -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f, + 0.344916f, -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f, + 0.411575f, -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f, + -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f, + 0.225887f, -0.000493f, 2.682241f, 0.871204f, 0.059014f, 0.803542f, + -1.407028f, -1.154669f, 1.388148f, -0.293348f, -0.003669f, -0.009607f, + 1.330030f, -0.337841f, 2.118617f, 1.033059f, -0.084788f, 0.212904f, + 0.082405f, -0.070579f, -0.494005f, -0.173392f, 0.039546f, -0.463865f, + 0.077163f, -0.434066f, 0.030835f, -0.427139f, -0.560520f, -0.031606f, + -0.368541f, -0.027458f, 0.370574f, 0.461418f, 1.087682f, -0.572137f, + -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f, + -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f, -0.156744f, + -0.267922f, 0.171216f, 0.110556f, 0.002954f, -0.200327f, -0.187663f, + 3.691601f, 1.234152f, 0.186315f, -0.125370f, -0.211235f, -0.554432f, + -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f, 0.012896f, + -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f, 0.016307f, + 0.384673f, -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f, + -0.169709f, 0.421681f, -0.033360f, -0.072817f, 0.003647f, -0.110632f, + -0.158651f, -0.095136f, 0.223759f, 0.165767f, -0.269129f, -0.196075f, + -0.023183f, -0.293420f, 0.014875f, 0.018688f, -0.153407f, -0.172009f, + -0.259947f, -0.124015f, 0.173653f, -0.089103f, -0.021001f, -0.334230f, + 0.027177f, 0.103371f, -0.183860f, -0.204051f, -0.023721f, -0.192297f, + -0.143771f, -0.247106f, 0.218116f, -0.013240f, 2.831783f, 1.483928f, + -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f, 0.234684f, + -0.119150f, -0.075182f, -0.330463f, 0.071503f, -0.254924f, -0.360071f, + -0.037022f, 0.063261f, -0.148759f, -0.238254f, -0.462018f, -0.027166f, + 0.065318f, -0.235743f, -0.257194f, -0.094784f, 0.022423f, 0.055925f, + 0.086672f, -0.021010f, 0.009965f, -0.001648f, -0.104917f, -0.387443f, + -0.102673f, -0.281706f, 0.145923f, -0.233391f, -0.378365f, -0.145584f, + -0.077751f, -0.121166f, 1.134565f, -0.097500f, -0.749202f, -0.544566f, + -1.361374f, -0.102494f, 1.089275f, 0.375299f, -0.105091f, 0.037641f, + -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f, + -0.339326f, -0.128217f, -0.282905f, 0.014937f, 1.067185f, -0.171764f, + 0.484458f, 0.396706f, -0.557055f, -0.891596f, -0.257839f, -0.720879f, + -0.218449f, -0.004755f, 1.572857f, 0.006229f, 1.962895f, -0.029746f, + -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f, + -1.263078f, -0.304560f, 1.072374f, 2.556429f, 0.312850f, 0.257488f, + -0.634264f, 0.156769f, -0.188943f, 0.040295f, -0.389915f, 0.085250f, + -0.248525f, 0.045667f, -0.776115f, -0.274680f, -0.448145f, -0.566161f, + -1.285316f, 0.079060f, 0.389124f, -0.510401f, -0.015299f, -0.664661f, + 0.099901f, -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f, + -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f, + -0.064569f, 
-0.156516f, 0.543522f, -0.005924f, 0.161432f, 0.974793f, + 0.273712f, 1.104850f, -0.290312f, 0.313417f, -0.125370f, 0.136234f, + -0.191227f, -0.165054f, 0.011872f, -0.298871f, 0.095740f, 0.142760f, + -0.215771f, -0.031437f, 0.101041f, -0.085620f, 0.435387f, 0.002786f, + 1.971375f, 0.018392f, -1.771940f, -0.401433f, 0.808263f, -3.350013f, + 2.296952f, -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f, + -0.276088f, -0.455907f, 0.266021f, 0.087348f, -0.146566f, 0.040492f, + -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f, + -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f, 0.002185f, + -4.225019f, 0.344025f, 0.728796f, -0.262936f, 1.383924f, 1.577300f, + -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f, + -0.097889f, -0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f, + 0.031970f, -0.373402f, -0.396079f, 0.045566f, 0.072595f, -0.222681f, + -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f, + -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f, + -0.205936f, -0.316275f, 0.103729f, -0.197893f, -0.128029f, -0.218796f, + -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f, + -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f, + -0.270504f, 0.234505f, 0.272144f, 0.266938f, -0.392395f, -0.011717f, + -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f, + -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f, + -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f, + -0.119432f, -0.222351f, 0.000450f, 0.208724f, -0.510526f, -0.144656f, + -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f, + 0.043714f, -0.235414f, 0.115594f, -0.195616f, -0.106693f, -0.124242f, + 0.083990f, 0.049110f, -0.196130f, -0.059860f, -0.464235f, -0.516443f, + -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f, + -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f, + -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f, + -0.023459f, -0.222538f, 0.028849f, -0.088038f, -0.301550f, -0.273566f, + 0.067295f, -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f, + -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f, + -0.210733f, -0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f, + -0.086147f, -0.430088f, 0.058466f, -0.152129f, -0.058411f, -0.236392f, + -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f, + -0.324501f, 0.000490f, -0.282167f, -0.073163f, -0.281452f, 0.047932f, + -0.175500f, 0.165220f, -0.276212f, 0.062153f, -0.217054f, -0.255487f, + -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f, + -0.132020f, -0.561184f, -0.308666f, -0.474053f, -0.219149f, -0.246558f, + -0.158325f, 0.151907f, -0.266835f, -0.144697f, -0.193960f, -0.046587f, + -0.220028f, -0.247355f, 0.135584f, 0.016511f, 0.367705f, -1.855877f, + 0.435622f, 0.444710f, -3.372301f, -3.030489f, 1.013267f, 0.380951f, + -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f, + -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f, + -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f, + -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f, + -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f, 1.189112f, + 1.458468f, -0.005876f, -0.927475f, 0.062038f, -1.170818f, 0.338227f, + -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f, + 
-0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f, + -0.310094f, 0.062721f, 0.251422f, -0.014350f, -1.282910f, 1.619560f, + 1.180566f, -0.032163f, -1.322951f, -0.603601f, 1.443710f, 0.654650f, + -0.393227f, 0.003536f, 0.029725f, -0.108925f, -0.053911f, 0.133977f, + -0.036145f, -0.168438f, 0.046989f, -0.331463f, -0.176983f, -0.311922f, + -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f, + -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f, -0.032867f, + -0.039424f, -0.063670f, 0.193808f, -0.303514f, -0.013376f, -0.057761f, + 0.187922f, 0.006938f, 0.031810f, 0.180594f, -1.198427f, 2.820662f, + 0.154986f, -0.375518f, 0.116925f, -0.795782f, -0.085139f, -0.079365f, + -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f, + -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f, + -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f, + -2.453587f, -0.045568f, -0.296932f, 0.613061f, -0.320284f, 0.191620f, + -0.827145f, -0.225277f, 0.275800f, 1.696635f, +}; + +static const float av1_ab_partition_nn_bias_32_layer0[64] = { + -0.176206f, 0.660189f, -0.186156f, -2.481963f, -1.564218f, -0.280424f, + 0.732684f, -0.135581f, -2.193132f, -0.172771f, 0.605001f, -0.060392f, + -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f, + 0.632779f, 0.005585f, 1.310169f, 1.392136f, -0.563860f, -0.051053f, + 0.660998f, -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f, + -0.177726f, 1.200859f, -0.178902f, -0.172620f, -0.184476f, -0.175559f, + 0.538503f, -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f, + -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f, -0.020116f, + -0.208096f, 0.000000f, 1.246166f, -0.225421f, -0.181555f, 0.861761f, + 1.172429f, -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f, + -1.384604f, -0.201713f, -0.271948f, 0.372351f, +}; + +static const float av1_ab_partition_nn_weights_32_layer1[64 * 16] = { + -0.037828f, 1.529029f, 0.004927f, 1.475763f, 0.627172f, 0.325872f, + -0.990757f, 0.129476f, 0.889958f, -0.082031f, 0.332133f, 0.074422f, + -0.176212f, -0.074355f, 0.774378f, 0.110987f, -0.155469f, 0.253310f, + 0.882538f, 0.253605f, 0.332436f, -5.389474f, 0.278470f, 0.168644f, + 0.914611f, 0.154165f, 0.809262f, -0.174734f, 0.923673f, 0.064716f, + -0.070228f, -0.228735f, 0.002312f, 0.112222f, -0.045502f, -0.046004f, + 0.514101f, 0.306480f, 0.021232f, -0.015955f, -0.288260f, 0.189177f, + -0.104158f, 0.103273f, 0.096910f, -0.086328f, 1.327289f, -0.154247f, + 0.056676f, -0.243327f, -0.646676f, 0.177221f, -0.086761f, 0.729729f, + -14.710893f, -0.044881f, 0.339003f, -0.134737f, 0.073621f, -0.162913f, + 1.215237f, 0.140723f, 0.138630f, 1.241719f, 0.204092f, -0.463080f, + -0.176086f, 1.125868f, 1.034814f, 0.225455f, -0.203421f, -0.078787f, + -0.527498f, 0.012491f, -0.563307f, -0.170792f, 0.002679f, 0.116153f, + 0.211348f, -0.191900f, -0.212505f, 0.263445f, -0.074679f, -0.081441f, + -0.815405f, 2.448215f, 0.781299f, 0.149542f, -1.045162f, 0.043014f, + 0.217381f, -0.094500f, -0.090427f, 0.025784f, -0.228906f, -2.741798f, + 0.230475f, -0.256112f, -0.103297f, 0.159121f, -0.229793f, -0.014883f, + -0.104131f, -0.123816f, 0.164148f, -0.052279f, -0.071845f, -0.041197f, + 0.208527f, -0.234197f, -0.542336f, 0.020053f, 0.088870f, 0.014346f, + 2.502164f, -0.010244f, -0.267792f, 0.844394f, 2.711486f, -0.015262f, + -0.868053f, -0.295704f, 0.222289f, -0.000286f, -0.352098f, -0.079000f, + 0.021267f, -0.721739f, -0.240558f, -0.384775f, 0.065974f, -2.161058f, + 0.195889f, 
0.268966f, -0.009329f, 0.014949f, 0.314943f, 0.235885f, + 0.072591f, -0.127120f, 0.150784f, 0.105697f, -1.297403f, -0.207509f, + -0.217688f, -0.076752f, 0.170952f, -0.294235f, 0.449973f, -1.712690f, + 0.860989f, 0.054757f, -0.812627f, -0.105316f, -0.736230f, -0.133192f, + -3.741608f, 0.495660f, -0.288936f, 4.654852f, -0.021305f, -0.308916f, + 0.049205f, -0.259996f, 0.114248f, -0.252647f, -0.253180f, -0.449314f, + 0.022979f, 0.063281f, -0.196154f, 0.078295f, -0.322317f, -0.145142f, + 0.300573f, 0.048385f, -0.254787f, 0.123939f, -1.263088f, -0.228565f, + -0.389061f, 0.391084f, 2.322438f, 0.075009f, 0.225743f, -0.198808f, + -0.280538f, -0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f, + -0.102756f, -1.760965f, 0.019149f, -0.867342f, 0.347141f, 0.031588f, + 0.302572f, -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f, + -0.108561f, -0.167077f, -2.851509f, -0.307116f, 0.202720f, -0.160280f, + -0.215525f, 0.064355f, -0.427220f, 1.516230f, 0.634453f, 0.099400f, + -1.013887f, -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f, + -0.160953f, 0.399036f, -0.030685f, -0.113619f, -0.184704f, 0.040519f, + -0.588252f, -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f, + -0.253959f, -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f, + -0.175087f, -0.055171f, 1.642014f, -0.192559f, -0.288147f, 0.610311f, + 4.688195f, -0.128728f, -0.914869f, -0.108286f, 0.013789f, 0.092125f, + 0.019770f, -0.178386f, 0.074164f, -1.152658f, -0.216738f, -0.277286f, + 0.012381f, 0.418259f, -0.680727f, -0.221917f, -0.485946f, 0.101672f, + 2.009457f, 0.054302f, 1.019838f, -0.116170f, 0.165134f, -0.112567f, + 0.852632f, -0.385796f, -0.108666f, 0.053181f, -0.311797f, -0.372875f, + -0.675717f, 2.409268f, -0.514720f, -0.214245f, -0.646596f, 0.009756f, + 0.203993f, 0.093617f, -0.301290f, 0.253551f, -0.128909f, -1.448442f, + -0.186823f, -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f, + -0.212084f, -0.137326f, 0.012505f, 0.087850f, -0.200413f, -0.394119f, + -0.132224f, 0.146917f, 0.155746f, 0.198725f, -0.322541f, 0.196391f, + -0.945500f, 0.036736f, -0.155646f, -0.677341f, 1.130545f, -0.339554f, + 0.411628f, -0.355813f, -0.249843f, 0.213694f, -2.035607f, 0.055694f, + -0.111669f, 0.408696f, -0.067043f, -0.048182f, 0.398110f, -0.067542f, + 1.459801f, 0.236833f, -0.178806f, 0.168758f, 0.492387f, 0.099691f, + -0.776680f, -0.172865f, 0.204225f, 0.193982f, 0.575685f, -0.062248f, + 0.011486f, 0.058571f, -0.493391f, 0.026893f, -0.900467f, 3.793129f, + -0.634613f, -0.064660f, -0.048262f, 0.361905f, 0.033641f, 0.245171f, + -0.064671f, 0.034954f, 0.204358f, -0.904023f, -0.052714f, -0.250134f, + 0.136700f, 0.000734f, -0.371720f, 0.226483f, 0.217958f, 0.060559f, + 0.180111f, 0.000970f, 0.079556f, -0.096775f, 0.093855f, -0.026224f, + -0.243664f, 0.004290f, 0.123281f, -0.239476f, 1.230374f, -0.107826f, + -0.101982f, -0.153917f, 5.464427f, 0.304375f, -0.809957f, 0.090564f, + -0.278416f, -0.245555f, -2.078421f, 0.243093f, -0.127666f, 0.052451f, + -0.126662f, -0.783505f, 0.025149f, -1.422675f, -0.207769f, -0.362547f, + 0.115310f, 0.133390f, 1.264754f, -0.027055f, -0.485312f, -0.240717f, + -0.239722f, 0.146818f, -1.265043f, -0.235553f, 0.267104f, -0.021357f, + -0.435949f, -0.309371f, 0.049920f, 1.302721f, -0.233978f, -0.097551f, + -0.240631f, -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f, + -0.029361f, 2.703590f, -0.430659f, 0.067927f, -0.387520f, -0.370630f, + -0.229236f, 0.085653f, -0.370956f, -0.065556f, -0.187859f, 0.068309f, + -0.109299f, -0.259898f, -0.103644f, -0.271199f, -0.209350f, 
0.140993f, + -0.196713f, -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f, + 5.963707f, -0.201157f, 0.726377f, -0.011076f, 0.010553f, -0.102918f, + -2.230088f, -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f, + -0.094735f, -1.381839f, 0.587298f, -0.173048f, 0.721360f, 0.241900f, + 0.764302f, -0.023609f, -1.173755f, 0.103912f, -0.185363f, 0.078435f, + -2.245062f, -0.127269f, 0.202234f, 0.158975f, -0.260909f, 0.098608f, + -0.348247f, 1.732502f, -0.412298f, -0.269602f, -0.425771f, -0.146243f, + -0.530730f, 0.125716f, -1.004419f, 0.145109f, -0.059289f, 1.096304f, + 0.012891f, 0.045033f, -0.306875f, 0.003514f, -0.176110f, 0.037544f, + -0.441537f, -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f, + -0.128894f, -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f, + 1.173404f, 0.088312f, -0.393568f, -0.175134f, 6.529819f, -0.326652f, + -0.631917f, -0.393476f, 0.057781f, -0.217748f, -1.781139f, -0.012614f, + -0.212621f, -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f, + -0.608744f, -0.265146f, 0.238517f, 0.066882f, -2.916806f, 0.054642f, + 0.282590f, 0.075248f, 0.010188f, -0.133486f, 0.985945f, -0.045849f, + -0.347564f, 0.057320f, -0.417920f, 0.063664f, 0.387062f, -2.692059f, + -0.535549f, 0.263736f, 0.327889f, -0.070273f, -0.775254f, 0.147250f, + 3.309425f, -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f, + 0.022907f, 0.138421f, -0.112159f, -0.288447f, -0.010799f, 0.056049f, + -0.036527f, 0.021525f, 0.106649f, -0.291883f, 0.088424f, -0.057773f, + -0.086031f, 0.015277f, -0.318505f, -0.269049f, -1.008913f, -0.224785f, + -0.025820f, -0.649037f, 0.706381f, 0.096410f, 0.643776f, -0.046743f, + -0.009654f, -0.024246f, 1.469255f, -0.183536f, -0.370046f, -0.048442f, + -0.376527f, -0.431264f, -0.245109f, -0.093951f, 0.203683f, -0.099872f, + 0.087210f, 0.160692f, -3.527694f, -0.068891f, -0.228994f, -0.231817f, + -0.241949f, 0.193613f, 0.979597f, -0.091259f, 0.414424f, -0.047341f, + -0.209582f, -0.295134f, -0.016824f, 0.460327f, -0.072671f, 0.246234f, + 0.235896f, 0.127238f, -1.068683f, 0.035648f, 2.254888f, 0.180105f, + -0.260098f, -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f, + -0.237916f, 0.031103f, -0.274063f, -0.049384f, -0.044917f, 0.102477f, + -0.342148f, -0.257558f, -0.346300f, 0.115333f, -0.115456f, 0.208354f, + -0.359301f, -0.167395f, 1.146514f, -0.177861f, -0.098658f, -0.444570f, + 6.759993f, -0.369772f, -0.831118f, 0.001866f, -0.073298f, -0.072095f, + 0.811902f, -0.431997f, -0.286587f, -0.269500f, 0.111492f, -0.525364f, + -0.351785f, -2.463474f, -1.852659f, 0.135325f, 0.138267f, 0.100643f, + -2.373278f, -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f, + -0.716424f, -0.031674f, 0.011147f, 0.057405f, -0.215873f, -0.094401f, + 0.573528f, -1.223820f, 0.414852f, -0.059053f, -0.076488f, -0.287168f, + -0.842640f, 0.174084f, -0.567186f, 0.336629f, -0.062514f, 2.075448f, + -0.061680f, -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f, + -0.049616f, -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f, + 0.141501f, -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f, + -1.521661f, -0.122639f, -0.015760f, -0.718912f, 5.877828f, 0.146916f, + 0.151767f, 0.220785f, -0.032298f, 0.230902f, 0.663943f, -0.252613f, + 0.057718f, -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f, + -1.031206f, -0.104136f, 0.389897f, 0.127602f, -2.667789f, -0.212366f, + -0.506262f, -0.009115f, -0.213202f, 0.076167f, -1.629405f, 0.055129f, + 0.375393f, -0.150272f, -0.241515f, -0.326497f, 0.100069f, 0.410703f, + 0.340622f, 
0.042437f, -0.349945f, 0.041176f, -1.178950f, 0.030992f, + 0.933908f, -0.035844f, -0.098660f, 1.030584f, -0.092043f, -0.355739f, + -0.305562f, 0.036161f, -0.049558f, -0.033225f, -0.403856f, -0.088276f, + 0.215493f, -0.149105f, -0.013363f, 0.025886f, -0.101306f, -0.205781f, + -1.072487f, -0.076019f, 0.077555f, 0.131003f, 1.267763f, -0.008954f, + -0.327617f, -0.246539f, 6.664081f, -0.404403f, -1.442489f, 0.191301f, + -0.336361f, 0.181156f, 0.833108f, 0.007879f, -0.194464f, -1.029408f, + -0.036268f, -0.927110f, -0.379190f, -0.293443f, -1.848579f, -0.242548f, + -0.065990f, 0.203160f, -0.291788f, 0.000680f, 0.587011f, -0.241289f, + 0.037034f, 0.000552f, 1.072308f, -0.387230f, -0.230050f, 0.292322f, + -0.720001f, 0.034109f, -0.467260f, 2.211644f, -1.839191f, -0.048797f, + -0.083469f, -0.334686f, -0.269056f, 0.051295f, 1.319904f, -0.035603f, + -0.018457f, -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f, + -0.305469f, -0.099011f, 0.014225f, -0.452772f, 0.170331f, -0.389312f, + -0.115084f, -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f, + -0.125137f, 0.067228f, -1.329271f, -0.117874f, -0.132499f, -0.218376f, + -0.588325f, -0.320024f, 0.085695f, -0.235047f, -0.217790f, 0.103015f, + -0.698644f, 0.017766f, -0.058299f, 0.199411f, -0.122485f, -0.563949f, + -0.349011f, -0.557045f, -0.131165f, 0.002281f, 0.118559f, -0.210302f, + -1.153815f, 0.116738f, -0.236007f, -0.003487f, -0.006885f, -0.244816f, + 0.953222f, 0.093748f, 0.266869f, 0.241869f, -0.860832f, -0.387012f, + -0.338986f, 2.097515f, -1.942512f, -0.298021f, 0.543911f, -0.043214f, + 0.082125f, -0.120242f, 0.712231f, 0.213327f, -0.301687f, -0.544011f, + -0.392131f, 0.004302f, 0.004825f, -0.317440f, -0.107518f, -0.293407f, + -0.159111f, -0.080367f, 0.132663f, -0.017726f, -0.237521f, -0.190297f, + -0.361633f, 0.200518f, -0.538296f, -0.027975f, -0.381704f, -0.016963f, + 0.630105f, -0.190997f, -0.287840f, -0.603488f, 3.605598f, -0.276614f, + -1.346383f, 0.186912f, -0.047575f, -0.189232f, -1.519072f, 0.097816f, + -0.223722f, 0.304924f, -0.213022f, -1.052433f, -0.322283f, -1.706734f, + -2.458027f, 0.237976f, 0.171050f, -0.103139f, -0.278689f, 0.329824f, + -0.262448f, -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f, + 0.091018f, -0.386471f, -0.723940f, 0.064956f, -0.057652f, 1.321024f, + -1.397418f, -0.143136f, 0.272468f, -0.030749f, 0.037324f, 0.069316f, + -0.904925f, -0.333693f, -0.117709f, 2.279598f, -0.428065f, -0.131157f, + -0.014288f, -0.402862f, -0.666090f, 0.017070f, -0.028333f, 0.002481f, + 0.197156f, -0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f, + -0.905007f, -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f, + -0.089948f, -0.936827f, 1.437569f, -0.388908f, 0.126170f, 0.186162f, + -0.018819f, -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f, + -0.230436f, -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f, + 0.378157f, 0.113377f, -0.850610f, 0.080245f, -0.087305f, -0.002852f, + 0.044408f, -0.188172f, -1.891998f, 0.092189f, 0.125325f, -0.105090f, + -0.848510f, -0.396308f, -0.384130f, 2.007509f, -1.480787f, -0.126946f, + 0.314767f, 0.000195f, -0.285628f, -0.110442f, -0.293948f, 0.258559f, + -0.417603f, 1.570705f, 0.092459f, -0.340974f, -0.284754f, -0.007801f, + -0.324610f, -0.004734f, -0.207716f, -0.057175f, 0.055467f, -0.210830f, + -0.113005f, -0.299177f, 0.068074f, 0.017929f, -2.897598f, -0.260074f, + -0.014422f, -0.206467f, 1.246997f, -0.372863f, -0.214160f, -0.114035f, + 5.805862f, 0.003611f, -1.340990f, -0.021085f, -0.260431f, -0.002720f, + -1.251640f, -0.353531f, 
-0.304009f, -0.153376f, +}; + +static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = { + -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f, + 0.001427f, 0.523607f, 0.225068f, -0.055273f, 1.019519f, 1.181880f, + -0.010198f, 0.130597f, 1.276752f, 2.028188f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_32_layer0, + av1_ab_partition_nn_weights_32_layer1, + }, + { + av1_ab_partition_nn_bias_32_layer0, + av1_ab_partition_nn_bias_32_layer1, + }, +}; + +// nn model for ab partition pruning, 16x16. +static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = { + 0.151902f, 0.007947f, -1.788454f, 0.431869f, -2.971387f, 0.923566f, + 1.632542f, -1.665136f, -0.338632f, -5.075884f, 0.398267f, 0.030467f, + 2.263534f, -0.045532f, -1.066128f, 0.915139f, -0.560500f, -3.293125f, + 2.072793f, -1.011414f, 0.122716f, -0.060169f, -0.388860f, 0.031019f, + -0.381861f, 0.001551f, -0.328472f, 0.038296f, -0.060398f, -0.375556f, + 0.209226f, 0.014764f, -1.443469f, -0.345486f, 2.409269f, 1.524846f, + -0.640666f, 1.322139f, -2.074771f, -0.580944f, -0.203960f, -0.072893f, + 0.329701f, 0.115339f, -1.339542f, 0.249024f, -0.421545f, -0.409151f, + -0.258293f, 0.836288f, -0.073685f, -0.009624f, 0.895712f, 0.320639f, + 0.451002f, -1.544558f, 0.193709f, -1.389012f, 1.305451f, 0.089795f, + 0.050338f, -0.017433f, -0.304667f, 0.500729f, 0.504346f, 0.073757f, + 0.582649f, -0.993623f, 1.766766f, -3.067265f, -0.415774f, -0.006036f, + -1.245281f, 0.253205f, -0.591245f, -0.626238f, 0.551852f, 0.593755f, + 0.491023f, 1.099384f, -0.348448f, 0.054564f, -0.451422f, -0.375781f, + -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f, + -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f, + 0.068066f, -0.374920f, 0.057536f, -0.189748f, 0.058375f, -0.267749f, + -0.147286f, -0.246153f, 0.006183f, -0.202029f, -0.059128f, 0.116852f, + 0.134719f, -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f, + -0.264499f, 0.155816f, -0.107255f, -0.056983f, -0.209771f, -0.099070f, + 0.007313f, -0.254124f, -0.231964f, -0.275972f, 0.032098f, -0.264564f, + -0.208743f, 0.155599f, -0.121511f, -0.156145f, -0.162315f, -0.059788f, + -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f, + -0.154114f, 0.017032f, -0.017364f, -0.233247f, 0.009918f, -0.179289f, + -0.190722f, 0.147106f, -0.063910f, -0.396872f, -0.263123f, -0.003850f, + -0.040718f, -0.324699f, 0.118660f, -0.170727f, -0.316788f, 0.100886f, + -0.202842f, 0.045371f, 0.150561f, -0.057054f, -0.308150f, 0.028346f, + -0.381473f, -0.195365f, 0.026221f, -0.281795f, 0.087204f, 0.047689f, + -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f, + -0.340273f, 0.048086f, 0.046103f, -0.121527f, 0.021697f, 0.054109f, + -0.002768f, -0.008461f, -2.297240f, 0.124651f, 3.621661f, -0.057120f, + -1.151656f, 2.296894f, -3.678720f, -0.290240f, 0.087683f, -0.186389f, + 0.007656f, -0.090236f, -0.245217f, 0.110389f, -0.251719f, -0.029084f, + -0.128203f, -0.100005f, -0.032779f, 0.007281f, -0.366596f, -0.267870f, + -0.215620f, 0.047687f, 0.010303f, 0.097980f, -0.191569f, -0.341162f, + 0.119249f, 0.026279f, -2.161546f, 0.459591f, 1.290566f, 1.791797f, + -0.409835f, 0.127081f, -1.156367f, 0.198286f, 0.099561f, -0.067445f, + -0.034352f, 0.017966f, -0.277380f, -0.057220f, -0.174198f, -0.014164f, + 0.146090f, -0.357530f, 0.097644f, -0.000932f, 
0.446603f, -0.066793f, + 2.448620f, 0.937617f, -1.232922f, 0.313183f, 0.816827f, -0.275115f, + -0.245205f, -0.126895f, 0.156668f, -0.186977f, -0.273505f, 0.013315f, + 0.168629f, -0.089084f, 0.006166f, -0.116107f, -0.199316f, -0.024010f, + -0.242303f, 0.011612f, -0.218485f, -0.229661f, -0.123922f, 0.136699f, + 0.006732f, -0.148718f, -0.164225f, 0.116063f, 1.587898f, 0.690519f, + 0.360566f, 0.009739f, -0.678702f, -0.046003f, 0.126984f, 0.605212f, + 1.240663f, -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f, + -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f, + -0.242255f, 0.137424f, -0.307490f, -0.084637f, -0.023812f, -0.196582f, + -0.078695f, 0.038257f, -0.012110f, -0.263521f, 0.009839f, -0.109125f, + -0.226036f, 0.060712f, 0.093671f, 0.153143f, 0.039116f, -0.290891f, + 0.227057f, -0.204633f, -0.207539f, -0.148242f, 0.046204f, -0.231268f, + -0.209315f, -0.307579f, -0.436556f, 0.023475f, 0.131793f, -0.038301f, + 1.650584f, 0.392570f, 1.446576f, 1.254380f, -0.516867f, -0.057116f, + 0.149320f, 0.414424f, -0.246309f, 0.003877f, -0.480238f, -1.037035f, + -0.830779f, -1.122244f, -0.408267f, -0.253956f, 0.382005f, 0.940609f, + -1.113370f, -0.018554f, 0.141064f, -0.182504f, 1.270707f, 0.414904f, + -0.216036f, 0.203831f, 0.450716f, -0.452909f, 0.139358f, -0.027143f, + 1.956892f, 1.643732f, -0.867839f, -0.620520f, -0.334607f, -0.519982f, + 0.205023f, 0.661159f, -0.000809f, 0.049033f, -0.348579f, -0.200338f, + -0.362144f, -0.346590f, -0.230096f, 0.180746f, -0.149954f, -0.253429f, + -0.378170f, -0.040724f, -0.041597f, 0.243659f, -0.472181f, 0.015401f, + -0.180376f, 0.153139f, -0.247738f, -0.010485f, -0.157158f, 0.016825f, + -0.238925f, -0.265798f, -0.318374f, 0.142352f, -0.210520f, 0.051928f, + -0.352190f, -0.179052f, -0.185498f, 0.025540f, -0.111667f, -0.235187f, + -0.215454f, 0.010931f, -0.238372f, -0.126659f, 0.075691f, -0.091167f, + -2.462379f, -0.007950f, -0.637990f, 0.285554f, -0.051275f, 0.282279f, + -0.744083f, -0.570646f, 0.592198f, 1.421332f, -0.256027f, -0.140315f, + 0.160247f, -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f, + -0.071228f, 0.055864f, -1.084764f, -0.263409f, 0.779266f, 0.228187f, + 0.375013f, 0.121204f, -0.656948f, 0.533561f, 0.272671f, -0.015423f, + -0.124180f, -0.009127f, 2.934838f, -0.150998f, 1.163152f, 0.081997f, + -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f, 0.024046f, + -1.451709f, 0.332558f, 0.990504f, 0.376290f, -1.466773f, -0.448439f, + -2.929108f, -4.255188f, 0.065238f, 0.019950f, 1.372393f, 0.444052f, + -2.538772f, 1.579767f, -0.464911f, -1.866114f, 1.053958f, 0.434467f, + -0.125964f, 0.034671f, 0.077116f, -0.138466f, -0.413395f, -0.223453f, + -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f, 0.037459f, + -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f, + -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f, -0.328036f, + -0.169790f, 0.036506f, 0.052572f, -0.183570f, -0.073617f, -0.244959f, + 0.266498f, 0.032846f, -1.902106f, 0.486078f, 2.414993f, 0.975182f, + -0.382875f, 1.647810f, -2.197017f, -0.890107f, 0.221287f, 0.010889f, + 3.817042f, 0.572728f, 0.092466f, 0.473337f, -1.634659f, -1.069455f, + 1.486776f, -1.023850f, 0.088184f, 0.008842f, 0.518202f, 0.270259f, + 1.757191f, -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f, + -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f, + -0.267836f, -0.319354f, -0.274975f, 0.068970f, -0.406467f, 0.044074f, + -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f, -0.177674f, + -0.090675f, 
-0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f, + -0.312272f, -0.222217f, -0.100548f, 0.106260f, -0.034655f, 0.135109f, + -0.021276f, 0.018177f, -0.353097f, -0.011128f, 0.061136f, -0.511662f, + -0.223236f, -0.308841f, 0.118789f, -0.154628f, -0.053178f, -0.055973f, + 0.013175f, -0.368337f, -0.090863f, -0.116920f, 0.178990f, -0.025278f, + -0.190553f, -0.238092f, 0.303943f, -0.024944f, 0.719373f, 0.384332f, + -0.378480f, -0.423316f, 0.709922f, 0.758514f, -1.559023f, -2.503173f, + 0.068652f, -0.234741f, -0.182932f, 0.037878f, 0.020684f, -0.174142f, + -0.182300f, -0.052796f, -0.219145f, 0.113028f, -1.041826f, 0.035317f, + 0.919904f, -0.676011f, 0.652297f, 1.456447f, -0.166904f, -0.861823f, + 0.895827f, 0.429821f, -0.180376f, -0.076587f, -0.273945f, -0.288990f, + -0.206692f, -0.080745f, -0.085444f, 0.186953f, -0.050135f, 0.044243f, + -0.391706f, -0.160498f, -0.292268f, 0.164060f, 0.412649f, 0.211611f, + -0.327294f, -0.919399f, 0.320297f, 0.385284f, -0.088848f, -0.072556f, + -0.384813f, -0.176267f, -0.065918f, 0.134724f, -0.231104f, -0.337707f, + -0.195442f, -0.263569f, 0.098090f, -0.341411f, -0.189211f, -0.439276f, + -0.404046f, 0.262491f, -0.311093f, -0.086454f, -0.013400f, -0.061447f, + -0.026945f, -0.112036f, -0.322985f, 0.078500f, -0.230205f, -0.344535f, + -0.021087f, 0.110220f, -0.128671f, 0.044219f, +}; + +static const float av1_ab_partition_nn_bias_16_layer0[64] = { + 2.936406f, -0.396539f, -0.110456f, -1.254954f, 0.785350f, 0.516290f, + -0.172341f, 0.254386f, -0.192465f, -0.106751f, -0.055518f, -0.094994f, + 0.000000f, -0.065018f, -0.004908f, -0.130483f, -0.119580f, -0.142072f, + 0.457446f, -0.125051f, -0.107712f, 0.714607f, -0.140809f, -1.788650f, + -0.087199f, 0.000000f, -1.290050f, 0.443930f, -0.110634f, -0.109380f, + -0.188213f, -1.414179f, 1.193579f, 0.388775f, -0.873193f, -0.110050f, + -0.072565f, -0.117050f, -0.119132f, 0.456959f, -0.132069f, 0.131974f, + 1.160474f, 1.746465f, 0.442628f, -0.188849f, -0.207794f, -0.108364f, + -0.856655f, -2.141620f, 0.335476f, -0.105508f, -0.212162f, -0.109319f, + -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f, -0.023908f, + 0.123809f, -0.109797f, 0.200510f, -0.147542f, +}; + +static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = { + -6.823716f, 1.406568f, -0.144009f, 2.228765f, 0.838336f, 0.738107f, + -0.319014f, -0.148756f, 0.240862f, -0.111089f, -0.004241f, 0.025758f, + -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f, 0.252994f, + -0.289443f, 0.194932f, 0.057467f, 0.724735f, 0.014063f, 1.361352f, + 0.025191f, 0.024274f, 0.231462f, -7.227959f, -0.094515f, 0.039946f, + 0.412719f, 0.812318f, 3.038903f, -0.286289f, 0.647482f, -0.115114f, + 0.053590f, 0.066069f, 0.153134f, 0.996250f, -0.125700f, 0.951365f, + -6.243494f, -4.827697f, 0.566320f, 0.239515f, -0.099702f, 0.054546f, + 1.847330f, 3.680076f, -3.049829f, -0.127709f, 0.068469f, -0.017794f, + 0.223864f, -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f, + -0.552073f, 0.043311f, 0.218668f, 0.033209f, -3.199210f, 0.193079f, + 0.321406f, 0.718307f, -0.181418f, -0.459612f, -1.981170f, 0.968496f, + -0.029757f, -0.130065f, 0.043782f, 0.072394f, -0.088686f, 0.025322f, + 0.129882f, 0.101324f, 0.335707f, 0.072714f, -2.079774f, 0.203997f, + 0.239321f, -0.301757f, 0.257845f, 1.288382f, -0.031275f, -0.234194f, + 0.310722f, 2.045469f, 0.034716f, 0.135638f, -0.251388f, 0.320071f, + -1.065301f, -0.322731f, -0.545028f, 0.226276f, 0.090799f, 0.019289f, + 0.048950f, -1.079300f, 0.231938f, 0.083683f, 4.762127f, 0.145037f, + -0.145549f, 0.075592f, 
0.172336f, 0.108175f, 0.333751f, 1.090501f, + 1.056114f, 0.047073f, 0.182052f, -0.081587f, 0.089900f, 0.339286f, + 2.049988f, 0.073585f, 0.537355f, -0.243322f, -0.010179f, -0.052601f, + -0.174915f, 0.117793f, 2.222990f, -2.520837f, -0.092699f, 1.199887f, + 0.138720f, 0.679918f, -0.463155f, -0.659496f, -0.109913f, -0.003398f, + 0.114633f, -0.128377f, 0.092970f, -0.107489f, -0.191078f, 0.185182f, + 0.216980f, -0.019343f, 3.443133f, 0.287953f, 0.099314f, 0.985958f, + 0.157268f, -0.606516f, 0.049418f, -0.221809f, -0.453081f, -0.344796f, + -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f, + -1.011192f, 0.022795f, 0.186363f, -0.076356f, -0.050932f, -0.165098f, + 0.168177f, -0.101596f, -5.270886f, 2.553943f, -0.440870f, -0.017494f, + 0.215208f, -0.017032f, 1.495915f, -4.304677f, 0.762211f, 0.182937f, + 0.254406f, -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f, + 0.737697f, -0.234989f, 0.168095f, 0.245118f, -0.077262f, 0.195718f, + 0.753302f, -1.637869f, 0.126227f, 0.982129f, -0.121444f, -0.295570f, + -1.215799f, 0.147867f, -0.068496f, 0.132726f, -0.005772f, -0.181774f, + 0.126513f, 0.204723f, -0.366123f, 0.103906f, -0.148053f, -0.075272f, + 0.243884f, -0.104828f, 0.198988f, 0.501034f, -0.112671f, 0.111421f, + 0.167508f, -0.117803f, -0.738624f, 2.046292f, 0.124011f, 0.057983f, + -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f, + 0.122417f, 0.060291f, -0.129033f, -0.843086f, 0.268241f, -0.399927f, + 1.585888f, 1.816393f, -0.631427f, 0.127826f, 0.088105f, 0.073488f, + 0.717694f, -1.497362f, 2.608528f, 0.066896f, -0.079230f, 0.223436f, + -0.010530f, 0.175310f, 1.120365f, 0.034391f, 0.835312f, 0.071652f, + -0.080615f, 0.111395f, 0.162742f, 0.079927f, -3.859582f, -0.638431f, + -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f, 0.931940f, + -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f, 0.102793f, + -0.048546f, 0.063545f, 0.023864f, -0.190863f, 1.934257f, -0.136286f, + -0.107916f, -0.637468f, 0.066449f, 1.089693f, -0.214047f, -0.265780f, + 0.899660f, -0.130333f, 0.288311f, -0.049024f, 0.090202f, 0.487969f, + 0.339704f, 0.858479f, 0.841253f, -0.184100f, -0.637070f, -0.125071f, + -0.077650f, -0.087877f, 0.202268f, -0.027300f, 2.842862f, -0.100698f, + -0.259080f, 0.260556f, 0.157912f, -0.070364f, 0.467190f, 1.200037f, + 1.419317f, -0.033588f, -0.227824f, 0.292617f, 0.228574f, 0.213839f, + -1.091099f, -0.022258f, -1.294681f, 0.136118f, 0.081652f, -0.185359f, + -0.039706f, 0.191407f, -2.053219f, -0.261934f, 0.047812f, -0.029536f, + -0.823869f, -1.090534f, -0.755890f, 0.441035f, -0.167945f, 0.231441f, + -0.135013f, -0.260762f, 0.256872f, 0.130339f, -0.243751f, 0.189760f, + -0.288454f, 0.145363f, 0.338490f, 0.403898f, -0.022814f, -1.263598f, + -0.101315f, 0.860135f, 0.136511f, 0.028942f, 0.574047f, 2.656370f, + 0.037587f, -0.188690f, -0.125312f, 1.100435f, -1.080402f, 0.380905f, + 0.004635f, 0.097144f, -0.214309f, 0.085552f, -0.285066f, -0.705134f, + -0.054704f, -0.319951f, 5.486626f, 0.958158f, -1.380585f, 0.223340f, + -0.169167f, -0.170697f, -0.216748f, 0.324232f, 2.684204f, -0.008490f, + -0.211052f, -0.201190f, 0.123466f, -0.000234f, 0.579907f, 0.096938f, + -0.042745f, 0.201855f, 0.157195f, -0.261440f, 0.029699f, -0.046599f, + 1.618216f, -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f, + 0.579699f, -0.100392f, 0.150694f, 0.061794f, 0.200425f, -0.062515f, + -0.179122f, 0.250112f, -0.344675f, -0.118359f, -0.095670f, 0.152311f, + 3.662276f, -0.154921f, -0.312991f, 0.972008f, -0.308596f, -0.190426f, + 0.133889f, -0.238673f, 
-0.094726f, 1.683835f, -0.215629f, -0.198890f, + -0.035278f, -0.367973f, -0.822435f, 0.240848f, -0.194656f, 0.034655f, + -0.079424f, 0.146670f, 0.026646f, -0.034507f, 0.059467f, -0.153109f, + -0.431033f, 2.552991f, -1.894091f, -0.180462f, -0.306839f, -0.025648f, + 1.026326f, -3.096230f, 1.346935f, 0.033633f, -0.181827f, 0.094376f, + 0.001696f, -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f, + 0.281795f, -0.127251f, 0.180776f, 0.067763f, 0.697124f, -1.040779f, + 0.111280f, 0.188351f, -0.340234f, -0.207790f, -0.720075f, -0.137409f, + -0.070310f, -0.032918f, -0.060787f, 0.131484f, -0.077845f, -0.258652f, + 0.056911f, -0.062034f, 0.007663f, -0.185100f, 1.340361f, 0.014096f, + -0.124602f, 0.194241f, 0.128383f, 0.360465f, 0.082979f, -0.050475f, + -0.519294f, 3.323262f, 0.067014f, 0.221203f, -0.085082f, -0.228606f, + -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f, + -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f, 1.790253f, + -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f, + 2.251166f, -0.146007f, 0.138527f, -0.003134f, 0.103665f, 0.006928f, + -0.240253f, -0.227464f, 0.578437f, -0.214724f, 0.503085f, 0.158093f, + 0.033091f, 0.008061f, 4.815371f, 2.132264f, 0.281850f, -2.288560f, + -0.145012f, 1.296832f, -0.362401f, -0.403252f, 0.109873f, 0.185746f, + 0.244764f, 0.172367f, -0.185588f, 0.139801f, -0.178254f, 0.068629f, + 0.358488f, -0.153969f, -6.433524f, 0.225983f, -0.138123f, -0.095971f, + -0.036089f, -1.400083f, 0.265908f, 0.257787f, 0.181144f, -1.647228f, + -0.136289f, -0.074206f, 0.122988f, -0.088895f, -1.266717f, 0.006010f, + 0.536681f, 0.263061f, -0.032207f, -0.155136f, 0.086431f, 0.441950f, + -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f, 0.117667f, + -0.000408f, 0.225719f, -2.199698f, 0.141447f, -1.459051f, 0.051315f, + 0.203228f, 0.354432f, -0.005775f, -0.028073f, -0.965817f, 0.231083f, + -0.666884f, 0.026283f, -0.317486f, 0.210754f, 0.123897f, 0.223827f, + 4.214405f, 1.457334f, -0.253945f, -1.306733f, -0.391235f, 0.451154f, + -1.553888f, -0.353429f, 0.069533f, 0.159278f, -0.173836f, -0.004952f, + -0.137033f, 0.127012f, 0.143600f, 0.051587f, -0.070549f, 0.066509f, + -5.776547f, 0.180021f, -0.189183f, -1.288504f, -0.233575f, -1.473873f, + 0.140940f, 0.144451f, -0.104534f, 2.089873f, -0.168168f, 0.110726f, + 0.132134f, -0.215223f, -1.682754f, 0.157757f, -0.146163f, 0.064882f, + 0.117313f, -0.038780f, -0.124720f, -0.501697f, 0.092047f, -0.233992f, + 3.324976f, 0.516601f, 1.294202f, 0.119989f, 0.061055f, 0.043420f, + -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f, + -0.282998f, -0.282705f, 0.073798f, 0.169851f, 0.135651f, 0.182677f, + -0.040220f, 0.132462f, -0.303120f, -0.230113f, 6.165739f, -0.258596f, + 0.024127f, -1.388283f, -0.006042f, 0.572600f, 0.348411f, -0.387376f, + -0.075845f, 0.122319f, -0.029616f, 0.077873f, 0.154763f, 0.049073f, + 0.018597f, 0.102688f, -0.204165f, 0.020734f, -1.389133f, -0.032854f, + -0.147561f, 0.853944f, 0.132100f, -3.259659f, 0.243745f, 0.181529f, + -0.738414f, 1.509994f, 0.023470f, -0.005329f, 0.066115f, -1.345081f, + -1.455402f, -0.172023f, -0.194625f, 0.071885f, -0.201742f, -0.262402f, + 0.077601f, -0.048938f, 0.257993f, -0.504029f, -2.032415f, 1.158880f, + 0.448647f, -0.025633f, 0.117586f, -0.072275f, -0.673744f, -3.854342f, + -0.983843f, 0.047766f, -0.017193f, -0.215775f, -0.158743f, -0.232042f, + -0.509112f, 0.148812f, 0.130122f, 0.006486f, -0.099016f, 0.022514f, + -0.486850f, -0.059623f, 4.012731f, 0.025454f, 0.029059f, -0.783546f, + -0.295260f, 
0.322521f, -0.473201f, -0.172100f, -0.100087f, -0.076516f, + -0.258367f, -0.112897f, 0.269364f, -0.065912f, 0.169022f, -0.178783f, + -0.095114f, 0.122089f, -2.790099f, -0.100431f, -0.087963f, -0.009431f, + -0.087819f, -2.774399f, -0.100757f, 0.013005f, -0.964533f, 3.236665f, + -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f, + -1.799262f, -0.365269f, 0.108611f, 0.037994f, 0.024747f, -1.073639f, + -0.203158f, -0.935006f, 1.880891f, 1.578385f, 0.726272f, -0.024546f, + -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f, 0.075451f, + 0.182899f, 0.092215f, -0.207347f, -0.030111f, 0.054316f, 0.192481f, + 0.594639f, -0.247694f, 0.547471f, -0.032094f, -0.065000f, 0.007198f, + 1.605377f, -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f, + 0.919365f, 0.599980f, 0.125545f, 0.265813f, 0.246884f, 0.095385f, + -0.260374f, -0.202916f, -0.042770f, 0.234967f, -0.233139f, -0.326994f, + -1.375256f, 0.121766f, 0.077433f, -1.103569f, 0.019497f, -1.029185f, + 0.253905f, 0.206569f, 0.187334f, -0.237089f, -0.294351f, 0.164137f, + 0.149696f, -0.749787f, -0.413433f, 0.976587f, 1.027976f, -0.285264f, + 0.209273f, -0.124762f, 0.050884f, 0.250764f, -0.082031f, -0.646520f, + 4.116680f, 0.437336f, 0.671684f, 0.129509f, -0.078462f, 0.014072f, + -0.678232f, 0.094831f, 1.125624f, 0.207070f, -0.154750f, -0.025780f, + -0.103030f, 0.118019f, -0.908186f, -0.263546f, -1.555324f, -0.236887f, + -0.217854f, -0.051790f, 0.017915f, 0.171001f, 1.355562f, 0.094603f, + -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f, + -0.298901f, 0.038162f, 0.251899f, 0.039612f, -0.022935f, -0.232308f, + -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f, + -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f, + 0.185269f, 1.465082f, 0.040240f, 0.112665f, 0.144329f, -0.286112f, + -0.617649f, 0.916177f, 0.221044f, -0.079867f, 0.170251f, -0.093638f, + -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f, 1.241179f, + 0.355922f, -0.170848f, -0.189168f, 0.080225f, -1.357793f, 0.190890f, + 0.976800f, -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f, + -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f, + -0.049715f, -0.178005f, 3.029985f, -1.141546f, 0.080066f, -1.932316f, + -0.641137f, -0.189564f, 0.935080f, 0.136119f, 0.015558f, -0.179331f, + 0.204571f, 0.020350f, 0.009362f, 0.108478f, 0.037076f, -0.049009f, + 0.081090f, -0.180202f, 1.455561f, -0.081559f, 0.059361f, 0.484971f, + 0.160923f, -2.170744f, -0.013204f, 0.126561f, -0.407122f, 1.223661f, + 0.044262f, 0.118044f, 0.058274f, -1.747100f, -0.171318f, 0.971374f, + 0.306995f, -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f, + -0.106479f, -0.907933f, 1.121231f, 1.673840f, -0.421458f, -0.021146f, + -0.254838f, 0.097632f, 0.235109f, -2.901782f, 0.289518f, -0.355459f, + -0.068264f, -0.179121f, 0.068560f, -0.047570f, -0.522523f, -0.228963f, + -1.037158f, -0.163723f, 0.280563f, -0.000868f, -0.197220f, -0.239329f, + 1.985274f, -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f, + -0.792024f, -0.114290f, 0.060969f, 0.104106f, -0.252123f, -0.150400f, + -0.133277f, 0.267147f, 0.274413f, 0.223744f, -0.180223f, -0.345415f, + -0.104883f, 0.119210f, -0.095041f, -0.301635f, 0.013175f, -2.128121f, + -0.147208f, -0.151509f, -0.692013f, 3.418555f, -0.016541f, 0.171511f, + 0.107159f, -1.516672f, 0.127408f, 0.687035f, -0.906486f, -0.145463f, + -0.169382f, -0.143906f, 0.125091f, -0.960645f, -0.180869f, -0.716908f, + 2.840951f, 1.904919f, -0.416268f, -0.425181f, -0.194697f, 
-0.075932f, + -0.950604f, -1.599800f, 0.943671f, -0.022744f, -0.270492f, 0.080843f, + -0.372916f, 0.047838f, -0.100300f, -0.026600f, 0.011733f, -0.226051f, + 0.172790f, -0.172982f, 0.041258f, -0.299379f, +}; + +static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = { + -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f, + 0.748430f, 0.203096f, 0.059317f, 0.418219f, 0.841294f, 0.402693f, + -0.658522f, 0.723479f, 0.544264f, 1.035225f, +}; + +static const NN_CONFIG av1_ab_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 64, // num_hidden_nodes + }, + { + av1_ab_partition_nn_weights_16_layer0, + av1_ab_partition_nn_weights_16_layer1, + }, + { + av1_ab_partition_nn_bias_16_layer0, + av1_ab_partition_nn_bias_16_layer1, + }, +}; + +#undef FEATURE_SIZE +#undef LABEL_SIZE + +#define FEATURE_SIZE 18 +#define LABEL_SIZE 4 + +static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = { + -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f, + 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f, + 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f, + 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f, + -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f, + -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f, + 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f, + 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f, + -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f, + -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f, + -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f, + -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f, + 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f, + 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f, + -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f, + -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f, + -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f, + -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f, + 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f, + -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f, + -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f, + -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f, + -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f, + -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f, + -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f, + 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f, + 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f, + -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f, + 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f, + -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f, + 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f, + 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f, + -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f, + -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f, + 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f, + -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 
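+/* All of the NN_CONFIG tables in this file describe small fully connected
+ * nets with a single hidden layer. A minimal sketch of how such a config is
+ * evaluated (ReLU on the hidden layer, linear outputs, layer-0 weights laid
+ * out row-major per hidden node), in the spirit of av1_nn_predict();
+ * nn_forward_sketch is a hypothetical name for illustration, not the
+ * library routine:
+ *
+ *   static void nn_forward_sketch(const float *in, const NN_CONFIG *cfg,
+ *                                 float *out) {
+ *     float hidden[64];  // large enough for every config in this file
+ *     const int nh = cfg->num_hidden_nodes[0];
+ *     for (int n = 0; n < nh; ++n) {
+ *       float v = cfg->bias[0][n];
+ *       for (int i = 0; i < cfg->num_inputs; ++i)
+ *         v += cfg->weights[0][n * cfg->num_inputs + i] * in[i];
+ *       hidden[n] = v > 0.0f ? v : 0.0f;  // ReLU
+ *     }
+ *     for (int n = 0; n < cfg->num_outputs; ++n) {
+ *       float v = cfg->bias[1][n];
+ *       for (int i = 0; i < nh; ++i)
+ *         v += cfg->weights[1][n * nh + i] * hidden[i];
+ *       out[n] = v;  // raw scores; callers apply sigmoid/softmax as needed
+ *     }
+ *   }
+ */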
0.064151f, + 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f, + -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f, + 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f, + -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f, + 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f, + 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f, + -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f, + 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f, + 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f, + 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f, + 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f, + -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f, + -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f, + -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f, + 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f, + 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f, + -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f, + -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f, + 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f, + -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f, + -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f, + -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f, + 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f, + -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f, + -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f, + 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f, + 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f, + 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f, + 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f, + -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f, + 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f, + -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f, + -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f, + -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f, + 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f, + 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f, +}; + +static const float av1_4_partition_nn_bias_16_layer0[24] = { + 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f, + -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f, + 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f, + -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f, +}; + +static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = { + -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f, + 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f, + -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f, + -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f, + 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f, + -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f, + -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f, + 0.120245f, 0.182920f, -1.117797f, -0.239594f, 
-0.296296f, -0.718093f, + 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f, + -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f, + 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f, + -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f, + 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f, + -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f, + -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f, + -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f, +}; + +static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = { + -0.462133f, + 0.465060f, + 0.062211f, + 0.401786f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 24, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_16_layer0, + av1_4_partition_nn_weights_16_layer1, + }, + { + av1_4_partition_nn_bias_16_layer0, + av1_4_partition_nn_bias_16_layer1, + }, +}; + +static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = { + -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f, + 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f, + -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f, + 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f, + -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f, + -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f, + -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f, + -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f, + -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f, + -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f, + 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f, + -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f, + -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f, + 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f, + -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f, + -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f, + -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f, + -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f, + -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f, + -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f, + 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f, + -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f, + -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f, + 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f, + 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f, + -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f, + 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f, + 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f, + -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f, + -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f, + -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f, + 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f, + -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f, + 0.731897f, 
-0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f, + -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f, + -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f, + 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f, + -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f, + 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f, + -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f, + -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f, + -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f, + -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f, + 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f, + 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f, + -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f, + -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f, + 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f, + 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f, + 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f, + 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f, + -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f, + 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f, + 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f, + -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f, + -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f, + -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f, + -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f, + -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f, + -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f, + -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f, + -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f, + 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f, + -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f, + 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f, + -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f, + 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f, + 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f, + 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f, + -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f, + 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f, + 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f, + 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f, + -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f, + 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f, + 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f, + -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f, + 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f, + -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f, + -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f, + -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f, + -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f, + -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 
0.181750f, + 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f, + -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f, + 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f, + -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f, + -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f, + 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f, + -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f, + -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f, + 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f, + -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f, + -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f, + -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f, + 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f, +}; + +static const float av1_4_partition_nn_bias_32_layer0[32] = { + 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f, + -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f, + -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f, + -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f, + -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f, + 0.109579f, -0.082685f, +}; + +static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = { + 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f, + 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f, + 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f, + -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f, + 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f, + 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f, + -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f, + 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f, + 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f, + 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f, + -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f, + 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f, + -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f, + -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f, + 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f, + -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f, + 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f, + 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f, + 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f, + 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f, + -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f, + -0.800926f, -0.134132f, +}; + +static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = { + -0.019518f, + 0.198546f, + 0.339015f, + -0.261961f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 32, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_32_layer0, + av1_4_partition_nn_weights_32_layer1, + }, + { + av1_4_partition_nn_bias_32_layer0, + av1_4_partition_nn_bias_32_layer1, + }, +}; + +static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = { + 
-0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f, + -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f, + 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f, + -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f, + -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f, + 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f, + 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f, + 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f, + 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f, + -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f, + -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f, + 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f, + -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f, + 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f, + -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f, + -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f, + 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f, + -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f, + 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f, + -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f, + -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f, + -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f, + -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f, + -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f, + -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f, + -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f, + -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f, + 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f, + 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f, + -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f, + -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f, + 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f, + -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f, + 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f, + -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f, + 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f, + 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f, + -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f, + -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f, + 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f, + 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f, + 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f, + 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f, + -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f, + -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f, + 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f, + -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f, + 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f, + -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f, + 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f, 
+ -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f, + -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f, + 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f, + -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f, + -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f, + -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f, + -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f, + -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f, + 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f, + 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f, + -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f, + -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f, + -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f, + 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f, + 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f, + -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f, + -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f, + 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f, + 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f, + 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f, + -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f, + 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f, +}; + +static const float av1_4_partition_nn_bias_64_layer0[24] = { + 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f, + -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f, + -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f, + -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f, +}; + +static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = { + -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f, + 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f, + 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f, + -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f, + -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f, + 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f, + -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f, + 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f, + 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f, + -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f, + -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f, + -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f, + 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f, + -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f, + -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f, + -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f, +}; + +static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = { + -0.478735f, + 0.292948f, + 0.293172f, + 0.040013f, +}; + +static const NN_CONFIG av1_4_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + 24, // num_hidden_nodes + }, + { + av1_4_partition_nn_weights_64_layer0, + 
av1_4_partition_nn_weights_64_layer1, + }, + { + av1_4_partition_nn_bias_64_layer0, + av1_4_partition_nn_bias_64_layer1, + }, +}; + +#undef FEATURE_SIZE +#undef LABEL_SIZE + +#define FEATURE_SIZE 4 +static const float + av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = { + -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f, + -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f, + 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f, + -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f, + -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f, + -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f, + -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f, + -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f, + 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f, + 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f, + -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f, + -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f, + 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f, + -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f, + -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f, + -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f, + 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f, + -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f, + -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f, + -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f, + 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f, + -0.007193f, -0.257836f, + }; + +static const float av1_partition_breakout_nn_bias_128_layer0[32] = { + 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f, + -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f, + 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f, + 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f, + -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f, + 0.429660f, -8.439470f, +}; + +static const float av1_partition_breakout_nn_weights_128_layer1[32] = { + -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f, + 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f, + 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f, + -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f, + -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f, + -0.039662f, 0.131499f, +}; + +static const float av1_partition_breakout_nn_bias_128_layer1[1] = { + 0.86678213f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_128 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_128_layer0, + av1_partition_breakout_nn_weights_128_layer1, + }, + { + av1_partition_breakout_nn_bias_128_layer0, + av1_partition_breakout_nn_bias_128_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = { + 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f, + -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f, + 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f, + 0.003208f, 
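+/* The av1_partition_breakout_* nets above and below differ from the
+ * partition-type nets: num_outputs is 1, so each emits a single raw score
+ * for "stop splitting at this block size". A hedged usage sketch --
+ * nn_forward_sketch is the hypothetical helper from the earlier comment,
+ * and the logistic mapping is an assumption (thresholding the raw score
+ * directly is equivalent):
+ *
+ *   float score;
+ *   nn_forward_sketch(features, &av1_partition_breakout_nnconfig_64,
+ *                     &score);
+ *   const float p_break = 1.0f / (1.0f + expf(-score));  // needs <math.h>
+ */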
-0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f, + -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f, + 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f, + 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f, + -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f, + 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f, + -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f, + -2.407131f, -0.062304f, 0.000874f, 0.108786f, + }; + +static const float av1_partition_breakout_nn_bias_64_layer0[16] = { + 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f, + -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f, + -0.337413f, 4.492778f, 0.000000f, 17.043072f, +}; + +static const float av1_partition_breakout_nn_weights_64_layer1[16] = { + -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f, + 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f, + -0.038572f, 0.307899f, -0.294283f, 0.118323f, +}; + +static const float av1_partition_breakout_nn_bias_64_layer1[1] = { + -1.33438122f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_64_layer0, + av1_partition_breakout_nn_weights_64_layer1, + }, + { + av1_partition_breakout_nn_bias_64_layer0, + av1_partition_breakout_nn_bias_64_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = { + -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f, + 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f, + -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f, + -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f, + -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f, + 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f, + 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f, + -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f, + -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f, + -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f, + -0.520814f, -0.045386f, -0.443123f, -0.484209f, + }; + +static const float av1_partition_breakout_nn_bias_32_layer0[16] = { + 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f, + 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f, + -0.423808f, 0.000000f, 6.352258f, -0.155787f, +}; + +static const float av1_partition_breakout_nn_weights_32_layer1[16] = { + 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f, + 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f, + -0.004171f, 0.157694f, 0.117845f, 0.272115f, +}; + +static const float av1_partition_breakout_nn_bias_32_layer1[1] = { + 0.09049262f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_32_layer0, + av1_partition_breakout_nn_weights_32_layer1, + }, + { + av1_partition_breakout_nn_bias_32_layer0, + av1_partition_breakout_nn_bias_32_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = { + 0.209371f, 0.028758f, 0.005764f, 
-0.384401f, -0.625777f, -0.005647f, + -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f, + -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f, + -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f, + -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f, + -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f, + -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f, + -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f, + -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f, + -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f, + -0.509287f, -0.048877f, -0.001512f, 0.077086f, + }; + +static const float av1_partition_breakout_nn_bias_16_layer0[16] = { + 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f, + 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f, + 5.625762f, 0.615822f, 0.040057f, 16.668884f, +}; + +static const float av1_partition_breakout_nn_weights_16_layer1[16] = { + -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f, + 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f, + 0.269773f, -0.021105f, -0.146698f, 0.188764f, +}; + +static const float av1_partition_breakout_nn_bias_16_layer1[1] = { + 1.60751927f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_16_layer0, + av1_partition_breakout_nn_weights_16_layer1, + }, + { + av1_partition_breakout_nn_bias_16_layer0, + av1_partition_breakout_nn_bias_16_layer1, + }, +}; + +static const float + av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = { + -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f, + 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f, + -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f, + -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f, + 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f, + -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f, + -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f, + -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f, + -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f, + -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f, + -0.596269f, 0.098494f, -0.005765f, 0.173652f, + }; + +static const float av1_partition_breakout_nn_bias_8_layer0[16] = { + 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f, + 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f, + 2.336705f, -0.278834f, 0.231905f, 7.954366f, +}; + +static const float av1_partition_breakout_nn_weights_8_layer1[16] = { + -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f, + -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f, + 0.055858f, 0.230970f, -0.056466f, 0.119780f, +}; + +static const float av1_partition_breakout_nn_bias_8_layer1[1] = { + 1.27784479f, +}; + +static const NN_CONFIG av1_partition_breakout_nnconfig_8 = { + FEATURE_SIZE, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, // num_hidden_nodes + }, + { + av1_partition_breakout_nn_weights_8_layer0, + av1_partition_breakout_nn_weights_8_layer1, + }, + { + av1_partition_breakout_nn_bias_8_layer0, + 
av1_partition_breakout_nn_bias_8_layer1, + }, +}; +#undef FEATURE_SIZE + +#define FEATURE_SIZE 9 // Input layer size +#define NUM_NODES 32 // Hidden layer size +#define LABEL_SIZE 3 // Output layer size + +static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE * + NUM_NODES] = { + 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f, + -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f, + 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f, + -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f, + 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f, + 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f, + 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f, + -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f, + 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f, + 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f, + -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f, + -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f, + 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f, + 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f, + -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f, + 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f, + -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f, + 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f, + 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f, + -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f, + -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f, + -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f, + 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f, + 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f, + -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f, + 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f, + -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f, + -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f, + -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f, + 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f, + -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f, + -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f, + -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f, + 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f, + 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f, + -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f, + 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f, + 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f, + 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f, + 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f, + -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f, + -1.08228f, +}; + +static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = { + 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, 
-0.12017f, 0.35362f, + -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f, + 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f, + -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f, + -0.22638f, 1.40940f, -0.09309f, 0.05828f, +}; + +static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f, + -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f, + -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f, + -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f, + -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f, + 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f, + -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f, + 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f, + 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f, + -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f, + -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f, + 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f, + -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f, + -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f, +}; + +static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = { + 1.70665f, + -0.77954f, + -0.92709f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_8 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_8_layer0, + av1_rect_partition_nn_weights_8_layer1 }, + { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 } +}; + +static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f, + -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f, + 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f, + -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f, + 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f, + -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f, + 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f, + 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f, + 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f, + -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f, + 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f, + 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f, + 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f, + 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f, + 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f, + -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f, + -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f, + 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f, + -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f, + -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f, + -0.22639f, -0.14739f, -0.20201f, 
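+/* The av1_rect_partition_* nets emit LABEL_SIZE == 3 scores; the label
+ * semantics live in the calling code, not in these tables. Turning the
+ * three raw scores into probabilities is typically done with a softmax
+ * (libaom provides av1_nn_softmax for this). A minimal, numerically
+ * stable sketch; softmax3_sketch is an illustrative name:
+ *
+ *   static void softmax3_sketch(const float score[3], float prob[3]) {
+ *     float m = score[0], sum = 0.0f;
+ *     for (int i = 1; i < 3; ++i) m = score[i] > m ? score[i] : m;
+ *     for (int i = 0; i < 3; ++i) sum += prob[i] = expf(score[i] - m);
+ *     for (int i = 0; i < 3; ++i) prob[i] /= sum;  // sums to 1
+ *   }
+ */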
-0.37507f, -1.30653f, 0.49570f, 1.03673f, + 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f, + 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f, + -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f, + -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f, + -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f, + 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f, + 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f, + -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f, + -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f, + -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f, + -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f, + -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f, + 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f, + 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f, + 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f, + -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f, + -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f, + 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f, + -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f, + -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f, + -0.05573f, +}; + +static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = { + -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f, + 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f, + 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f, + -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f, + -0.12044f, 1.65478f, -0.75153f, 1.18441f, +}; + +static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES * + LABEL_SIZE] = { + -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f, + 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f, + 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f, + 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f, + -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f, + 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f, + 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f, + 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f, + 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f, + -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f, + -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f, + -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f, + 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f, + -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f, +}; + +static const float av1_rect_partition_nn_bias_16_layer1[3] = { + 2.68750f, + -1.31894f, + -1.36768f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_16 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_16_layer0, + av1_rect_partition_nn_weights_16_layer1 }, + { 
av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 } +}; + +static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE * + NUM_NODES] = { + -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f, + -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f, + -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f, + -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f, + -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f, + -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f, + -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f, + -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f, + -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f, + 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f, + -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f, + -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f, + 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f, + 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f, + -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f, + -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f, + 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f, + 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f, + 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f, + 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f, + 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f, + -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f, + 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f, + 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f, + -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f, + -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f, + -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f, + -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f, + -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f, + -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f, + 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f, + -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f, + -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f, + 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f, + 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f, + -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f, + 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f, + 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f, + 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f, + -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f, + -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f, + 0.33984f, +}; + +static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = { + -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f, + 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f, + 0.05660f, -0.32013f, 
0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f, + -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f, + -0.27602f, -1.98063f, 0.20816f, -0.01315f, +}; + +static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f, + -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f, + 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f, + -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f, + 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f, + 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f, + 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f, + 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f, + 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f, + -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f, + 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f, + -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f, + -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f, + -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f, +}; + +static const float av1_rect_partition_nn_bias_32_layer1[3] = { + 2.47332f, + -1.65756f, + -0.81573f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_32 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_32_layer0, + av1_rect_partition_nn_weights_32_layer1 }, + { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 } +}; + +static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE * + NUM_NODES] = { + 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f, + 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f, + 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f, + 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f, + 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f, + 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f, + 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f, + -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f, + 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f, + 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f, + -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f, + -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f, + -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f, + -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f, + 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f, + 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f, + 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f, + -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f, + -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f, + -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f, + -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f, + -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f, + 1.49530f, 1.05069f, 
-2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f, + 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f, + 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f, + -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f, + -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f, + 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f, + 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f, + 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f, + -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f, + -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f, + -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f, + 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f, + -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f, + -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f, + 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f, + -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f, + -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f, + 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f, + -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f, + 0.09101f, +}; + +static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = { + 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f, + -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f, + -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f, + -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f, + 0.59835f, -0.31269f, -0.30585f, -1.66212f, +}; + +static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES * + LABEL_SIZE] = { + 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f, + -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f, + 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f, + 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f, + 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f, + -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f, + -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f, + 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f, + -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f, + 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f, + -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f, + -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f, + -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f, + 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f, +}; + +static const float av1_rect_partition_nn_bias_64_layer1[3] = { + 0.32215f, + -0.57522f, + 0.25314f, +}; + +static const NN_CONFIG av1_rect_partition_nnconfig_64 = { + FEATURE_SIZE, // num_inputs + LABEL_SIZE, // num_outputs + 1, // num_hidden_layers + { + NUM_NODES, + }, // num_hidden_nodes + { av1_rect_partition_nn_weights_64_layer0, + av1_rect_partition_nn_weights_64_layer1 }, + { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 } +}; + +static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE * 
+ NUM_NODES] = { + -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f, + 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f, + 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f, + 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f, + -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f, + 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f, + 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f, + 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f, + 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f, + 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f, + -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f, + 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f, + -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f, + -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f, + 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f, + -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f, + -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f, + -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f, + -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f, + -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f, + -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f, + -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f, + -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f, + 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f, + 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f, + -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f, + -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f, + 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f, + 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f, + 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f, + -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f, + 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f, + -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f, + 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f, + -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f, + -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f, + 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f, + -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f, + -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f, + -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f, + 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f, + 2.02519f, +}; + +static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = { + 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f, + 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f, + -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f, + -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f, + 0.66120f, 0.61119f, -1.42293f, 
0.32676f,
+};
+
+static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES *
+                                                            LABEL_SIZE] = {
+  1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f,
+  4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f,
+  -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f,
+  3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f,
+  1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f,
+  0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f,
+  1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f,
+  2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f,
+  -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f,
+  -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f,
+  0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f,
+  5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f,
+  1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f,
+  3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer1[3] = {
+  1.09014f,
+  -0.53317f,
+  -0.55668f,
+};
+
+static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
+  FEATURE_SIZE,  // num_inputs
+  LABEL_SIZE,    // num_outputs
+  1,             // num_hidden_layers
+  {
+    NUM_NODES,
+  },  // num_hidden_nodes
+  { av1_rect_partition_nn_weights_128_layer0,
+    av1_rect_partition_nn_weights_128_layer1 },
+  { av1_rect_partition_nn_bias_128_layer0,
+    av1_rect_partition_nn_bias_128_layer1 }
+};
+#undef FEATURE_SIZE
+#undef NUM_NODES
+#undef LABEL_SIZE
+
+// Below are the models used for simple_motion_search_based_split
+// Thresholds
+// The first index level is aggressiveness, the second is frame resolution,
+// and the third is bsize (see the illustrative note after these tables)
+static const float av1_simple_motion_search_split_thresh[4][3][5] = {
+  // Aggressiveness = 0
+  {
+    // lowres
+    {
+      1.40402595879f,  // p = 0.8028197
+      4.72845183649f,  // p = 0.99123732
+      1.86517797783f,  // p = 0.86589934
+      1.58715223005f,  // p = 0.83021506
+      7.22695596987f,  // p = 0.9992738
+    },
+    // midres
+    {
+      5.839480f,  // p = 0.997098
+      1.877167f,  // p = 0.867285
+      3.073499f,  // p = 0.955783
+      1.405601f,  // p = 0.803071
+      2.555636f,  // p = 0.927951
+    },
+    // hdres
+    {
+      5.839480f,  // p = 0.997098
+      1.877167f,  // p = 0.867285
+      3.073499f,  // p = 0.955783
+      1.405601f,  // p = 0.803071
+      2.555636f,  // p = 0.927951
+    },
+  },
+  // Aggressiveness = 1
+  {
+    // Lowres
+    {
+      100.0000f,  // p = 1.000000
+      4.952535f,  // p = 0.992984
+      1.720880f,  // p = 0.848242
+      1.426233f,  // p = 0.806314
+      1.491905f,  // p = 0.816364
+    },
+    // Midres
+    {
+      100.0000f,  // p = 1.000000
+      3.137263f,  // p = 0.958404
+      2.703262f,  // p = 0.937219
+      1.877166f,  // p = 0.867285
+      2.221149f,  // p = 0.902133
+    },
+    // Hdres
+    {
+      4.417680f,  // p = 0.988082
+      3.086898f,  // p = 0.956349
+      3.966704f,  // p = 0.981416
+      1.532565f,  // p = 0.822381
+      3.449975f,  // p = 0.969230
+    },
+  },
+  // Aggressiveness = 2
+  {
+    // lowres
+    {
+      100.000000f,  // p = 0.998048
+      1.484020f,    // p = 0.815179
+      1.866781f,    // p = 0.866085
+      1.706711f,    // p = 0.846409
+      2.080369f,    // p = 0.888980
+    },
+    // midres
+    {
+      100.000000f,  // p = 0.0
+      3.265763f,    // p = 0.963235428881
+      2.024598f,    // p = 0.883355591569
+      1.846446f,    // p = 0.863709256976
+      2.240962f,    // p = 0.903868036126
+    },
+    // hdres
+    {
+      3.133026f,  // p = 0.958234684141
+      2.940954f,  // p = 0.949834204693
+      2.484544f,  // p = 0.923051170045
+      1.702972f,  //
p = 0.845922460525 + 1.655562f, // p = 0.839641385729 + }, + }, + // Aggressiveness = 3 + { + // lowres + { 100.000000f, 1.41409519484f, 0.606066095487f, 0.0993410805635f, + 0.762099214988f }, + // midres + { 100.000000f, 0.702207995397f, 0.503550081119f, 0.0403228785199f, + 0.557298794638f }, + // hdres + { 1.21895384144f, 1.26798450469f, 0.872537808115f, 0.975869438148f, + 1.86572095242f }, + }, +}; + +static const float av1_simple_motion_search_no_split_thresh[4][3][5] = { + // Aggressiveness = 0 + { + // lowres + { + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + -100.0f, // p = 0.0 + }, + // midres + { + -3.38168078f, // p = 0.032872917 + -4.08610739f, // p = 0.016526795 + -1.78302370f, // p = 0.15270848 + -100.000000f, // p = 0.0 + -100.000000f, // p = 0.0 + }, + // hdres + { + -100.000000f, // p = 0.0 + -100.000000f, // p = 0.0 + -2.98718897f, // p = 0.048008 + -100.000000f, // p = 0.0 + -3.33229488f, // p = 0.03447975 + }, + }, + // Aggressiveness = 1 + { + // Lowres + { + -100.0000f, // p = 0.0 + -4.893793f, // p = 0.007437 + -3.387766f, // p = 0.032680 + -2.982806f, // p = 0.048209 + -2.330372f, // p = 0.088639 + }, + // Midres + { + -100.0000f, // p = 0.000000 + -6.131853f, // p = 0.002168 + -2.346579f, // p = 0.087338 + -2.712849f, // p = 0.062219 + -3.195430f, // p = 0.039338 + }, + // Hdres + { + -3.491416f, // p = 0.029557 + -2.192853f, // p = 0.100394 + -3.620180f, // p = 0.026079 + -2.030855f, // p = 0.116001 + -2.797586f, // p = 0.057455 + }, + }, + // Aggressiveness = 2 + { + // lowres + { + -100.0000f, // p = 0.0 + -3.617350f, // p = 0.026151 + -5.902503f, // p = 0.002725 + -4.677840f, // p = 0.009213 + -2.168378f, // p = 0.102626 + }, + // midres + { + -100.0000f, // p = 0.0 + -3.204195f, // p = 0.0390081679555 + -2.354128f, // p = 0.0867382128969 + -2.523326f, // p = 0.0742390077132 + -3.112328f, // p = 0.0426016085803 + }, + // hdres + { + -5.047760f, // p = 0.00638270448225 + -3.414994f, // p = 0.0318301469487 + -5.628090f, // p = 0.00358255438917 + -2.122691f, // p = 0.10691083145 + -1.972387f, // p = 0.122132728355 + }, + }, + // Aggressiveness = 3 + { + // lowres + { -100.000000f, -2.04766486133f, -1.00442099188f, -1.15077982642f, + -1.0830321897f }, + // midres + { -100.000000f, -0.985686808303f, -0.757739584866f, -0.890120107569f, + -0.228236297886f }, + // hdres + { -1.03535679263f, -1.57431743203f, -0.564851540156f, -0.35442301663f, + -1.36741555171f }, + }, +}; + +static const float av1_simple_motion_search_split_mean_128[17] = { + 14.119120f, 14.087010f, 12.016185f, 11.966075f, 12.042454f, 11.994805f, + 12.152105f, 12.100394f, 12.178377f, 12.128937f, 4.779944f, 0.714786f, + 3.535450f, 3.566207f, 0.835913f, 3.315452f, 3.302908f, +}; + +static const float av1_simple_motion_search_split_std_128[17] = { + 1.832420f, 1.835338f, 2.019207f, 2.020793f, 2.008731f, 2.008403f, + 1.900999f, 1.907081f, 1.908915f, 1.913122f, 2.109345f, 0.451517f, + 1.407097f, 1.372501f, 0.370355f, 1.321495f, 1.319665f, +}; + +static const float av1_simple_motion_search_split_mean_64[17] = { + 12.363721f, 12.314348f, 10.404341f, 10.333541f, 10.405775f, 10.336996f, + 10.402246f, 10.330084f, 10.405584f, 10.334330f, 4.554232f, 0.896393f, + 2.819613f, 2.855845f, 0.926296f, 2.808782f, 2.798229f, +}; + +static const float av1_simple_motion_search_split_std_64[17] = { + 1.878920f, 1.882255f, 1.950167f, 1.953289f, 1.913869f, 1.914781f, + 1.920096f, 1.924454f, 1.880200f, 1.882499f, 2.050922f, 0.304750f, + 1.144391f, 1.125088f, 0.261289f, 1.145059f, 
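+// Illustrative note: the "p =" annotations in the
+// av1_simple_motion_search_split_thresh and
+// av1_simple_motion_search_no_split_thresh tables above are consistent with
+// reading each threshold as a logit, p = 1 / (1 + exp(-thresh)); e.g.
+// 1 / (1 + exp(-1.40402595879)) ~= 0.8028197. A minimal usage sketch, under
+// the assumption that the split model emits a raw logit score (the helper
+// below is hypothetical, not part of the library):
+//
+//   #include <math.h>
+//   // Returns 1 when the model clears the tuned confidence cutoff.
+//   static int smss_should_split(float logit, int agg, int res, int bs) {
+//     return logit > av1_simple_motion_search_split_thresh[agg][res][bs];
+//   }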
1.131215f, +}; + +static const float av1_simple_motion_search_split_mean_32[17] = { + 10.750278f, 10.679627f, 8.745625f, 8.644149f, 8.757436f, 8.656657f, + 8.759780f, 8.656299f, 8.772563f, 8.669839f, 4.208026f, 0.958573f, + 2.308769f, 2.347375f, 0.961685f, 2.323464f, 2.296322f, +}; + +static const float av1_simple_motion_search_split_std_32[17] = { + 1.879269f, 1.883531f, 1.935828f, 1.935677f, 1.915823f, 1.914773f, + 1.909733f, 1.910315f, 1.890451f, 1.890032f, 1.913318f, 0.199276f, + 0.988825f, 0.972115f, 0.191956f, 0.977131f, 0.951418f, +}; + +static const float av1_simple_motion_search_split_mean_16[17] = { + 9.076768f, 8.974986f, 7.078364f, 6.926072f, 7.088739f, 6.936111f, + 7.096697f, 6.942841f, 7.114978f, 6.961046f, 3.865480f, 0.982632f, + 1.886023f, 1.912892f, 0.981492f, 1.926059f, 1.891233f, +}; + +static const float av1_simple_motion_search_split_std_16[17] = { + 1.922965f, 1.925609f, 1.851980f, 1.847558f, 1.848410f, 1.843990f, + 1.843931f, 1.839582f, 1.840304f, 1.836144f, 1.760042f, 0.130639f, + 0.841086f, 0.833523f, 0.134780f, 0.840790f, 0.831309f, +}; + +static const float av1_simple_motion_search_split_mean_8[17] = { + 7.120238f, 6.957731f, 5.176309f, 4.889594f, 5.178396f, 4.886607f, + 5.195322f, 4.905566f, 5.198845f, 4.904745f, 3.648933f, 0.993198f, + 1.496831f, 1.520804f, 0.991864f, 1.489763f, 1.460761f, +}; + +static const float av1_simple_motion_search_split_std_8[17] = { + 1.698498f, 1.696000f, 1.629605f, 1.614641f, 1.632476f, 1.618831f, + 1.618352f, 1.603742f, 1.623089f, 1.609674f, 1.668587f, 0.082193f, + 0.759407f, 0.759684f, 0.089830f, 0.742797f, 0.730632f, +}; + +static const float *const av1_simple_motion_search_split_mean[5] = { + av1_simple_motion_search_split_mean_128, + av1_simple_motion_search_split_mean_64, + av1_simple_motion_search_split_mean_32, + av1_simple_motion_search_split_mean_16, + av1_simple_motion_search_split_mean_8, +}; + +static const float *const av1_simple_motion_search_split_std[5] = { + av1_simple_motion_search_split_std_128, av1_simple_motion_search_split_std_64, + av1_simple_motion_search_split_std_32, av1_simple_motion_search_split_std_16, + av1_simple_motion_search_split_std_8, +}; + +#define NUM_HIDDEN_LAYERS_128 1 +#define NUM_FEATURES_128 17 +#define NUM_LAYER_0_UNITS_128 20 +#define NUM_LOGITS_128 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_128[] = { + 0.24095f, -0.397761f, -0.388619f, -0.0629548f, -0.44577f, 0.688212f, + -0.20889f, -1.08227f, -0.0313894f, -0.615505f, -0.401839f, 0.40233f, + -0.171305f, 0.439803f, 1.58527f, -0.968535f, -1.29255f, 1.14846f, + 0.885777f, 0.116412f, -0.225704f, 0.316506f, 0.793951f, -0.63591f, + 0.097789f, -0.327027f, -0.778396f, -0.231667f, -0.9622f, 1.0044f, + 0.32594f, 0.179768f, -0.115529f, -0.499395f, -1.14727f, -1.26111f, + 0.269818f, -0.0882028f, -0.349107f, 0.100901f, 0.0249506f, 0.528929f, + 0.113961f, 0.929794f, 0.242494f, -0.122828f, -0.0477379f, 0.170659f, + 0.0500187f, 0.28859f, 0.78783f, 0.482412f, 0.795298f, 0.179517f, + 0.453911f, -0.298029f, -0.903332f, 0.510615f, 0.691994f, 0.433383f, + -0.140802f, -1.11635f, -0.547326f, 1.11318f, 0.71905f, 0.978538f, + 0.097444f, -0.0386012f, 0.713599f, 0.465164f, 0.391278f, -0.472864f, + 0.230224f, -0.279508f, 0.558192f, -0.468625f, 0.55995f, -0.57507f, + -1.39947f, -0.755819f, -1.04512f, -0.411552f, -0.830444f, -0.106571f, + -0.0972184f, 0.251842f, 0.269955f, 0.230492f, -0.290581f, -0.484799f, + 0.0151041f, 0.171047f, 0.829999f, -0.384581f, 0.220301f, -0.121687f, + 1.88848f, -0.482809f, -0.48185f, 1.34482f, -0.716438f, 
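+// Illustrative note: the *_split_mean/*_split_std tables above presumably
+// z-score the 17 input features before inference (an assumption from the
+// naming; the normalization step itself is not in this header). A minimal
+// sketch with a hypothetical helper name:
+//
+//   // bsize_idx follows the pointer arrays above: {128, 64, 32, 16, 8}.
+//   static void smss_normalize(const float *raw, int bsize_idx, float *out) {
+//     const float *mu = av1_simple_motion_search_split_mean[bsize_idx];
+//     const float *sd = av1_simple_motion_search_split_std[bsize_idx];
+//     for (int i = 0; i < 17; ++i) out[i] = (raw[i] - mu[i]) / sd[i];
+//   }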
-0.284482f, + -1.78592f, -1.29333f, 0.886867f, 0.80106f, 0.456415f, 0.649095f, + 0.231093f, 0.361562f, 0.290018f, 0.128009f, -0.196343f, 0.0607802f, + 0.576761f, -0.0413836f, 0.0300984f, -0.318998f, 0.204434f, -0.712524f, + 0.833394f, -0.81168f, 0.765488f, -0.720973f, 1.12866f, -0.838694f, + 1.295f, -0.159127f, 1.05404f, 0.736519f, 0.248662f, 0.229233f, + 0.0434302f, 0.0551856f, 0.197862f, 0.354823f, -0.32429f, -0.227353f, + -0.132198f, -0.438118f, -0.210401f, -0.81046f, 0.653555f, 0.826737f, + 0.154235f, 0.228945f, 0.123089f, 0.614964f, -0.0940471f, -0.00676807f, + 0.24996f, 0.949233f, 0.746526f, -0.044474f, 0.386414f, 0.503221f, + 0.155133f, -0.698848f, -0.735356f, -0.255091f, 0.413235f, -0.335295f, + -0.145757f, 0.326299f, -0.602629f, -0.844474f, -0.346722f, -0.42598f, + -0.491016f, -0.447732f, -0.965366f, -0.0242841f, 0.836606f, -0.104877f, + 1.23236f, 0.683986f, 0.787005f, -0.0253437f, 1.2145f, 1.29554f, + -1.24302f, -0.229495f, 0.439415f, 0.885087f, -0.408704f, -0.119299f, + -0.0960972f, 0.60148f, 0.683271f, -0.057129f, -0.180295f, -0.264815f, + -0.363184f, 0.638271f, 0.631083f, -0.252899f, -0.164364f, -1.31274f, + 0.354408f, 0.0429172f, 0.371154f, -1.0978f, 0.0433642f, -0.467394f, + -0.706572f, 1.57198f, -0.0701271f, 1.93149f, -0.446267f, 1.4519f, + -1.29567f, 0.309978f, -0.878062f, 0.891494f, 0.364005f, -0.209611f, + -0.125927f, 0.184097f, 0.0629695f, -0.43375f, -0.0980562f, 1.08547f, + 0.578312f, 0.16566f, -0.198852f, -0.241854f, -0.523934f, -0.206037f, + -0.867721f, 1.00041f, 1.09848f, -2.12562f, -0.19992f, -0.186128f, + -0.03507f, 0.0484884f, 0.160856f, 0.10802f, -0.805141f, -1.06902f, + 0.290363f, 0.0222096f, -0.849266f, 0.112932f, 0.148682f, -0.0457585f, + 1.139f, 1.79141f, 0.194122f, -0.342508f, -0.403572f, 0.133678f, + 0.217553f, -0.263759f, 0.18441f, 0.254529f, 0.0471115f, 0.733178f, + -0.416205f, 0.441447f, -0.443335f, 0.725005f, -0.78946f, 0.71301f, + -0.644969f, 1.5445f, 0.365277f, -0.455775f, -0.365066f, 0.4742f, + -0.381714f, -0.545794f, -0.0464861f, -0.222768f, -0.0106466f, -0.069743f, + 0.0335566f, 0.378348f, -0.249663f, 0.922286f, 0.125711f, -0.894619f, + 0.444682f, 0.447893f, -1.98936f, -1.41978f, 0.0406667f, -0.199928f, + -0.199786f, 0.463481f, 0.334931f, -0.396222f, -0.0732259f, 0.796684f, + -0.140817f, -0.26878f, 0.194642f, 0.895784f, -0.369976f, -2.26981f, + -0.0791776f, -0.0492268f, 0.6715f, 0.281805f, 0.0156664f, -0.779785f, + 0.17743f, 0.188786f, -0.588077f, -0.359153f, 0.258319f, 0.881688f, + 0.846894f, 1.00292f, 0.838134f, 0.680632f, 0.273098f, -0.329261f, + 0.217757f, -0.506726f, -0.336523f, -0.695875f, -0.252006f, 0.751216f, + 0.334409f, -0.0151467f, 0.0885474f, 0.0973114f, -0.248754f, -0.263716f, + 0.369906f, -0.213749f, -0.0355395f, -0.137799f, 2.43233f, -0.944233f, + -0.745167f, 0.318558f, 0.316608f, 0.568678f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_128[] = { + 0.821344f, 1.11542f, -1.24172f, 1.03642f, 1.13511f, + 1.16414f, -0.278655f, -1.35558f, -1.26788f, -1.63189f, + -0.323271f, 1.21319f, -0.888415f, 0.987145f, -1.16767f, + 0.255833f, -0.1392f, 1.43265f, -1.54952f, 1.65159f +}; + +static const float av1_simple_motion_search_split_logits_kernel_128[] = { + 0.3565753f, 0.5490161f, -1.015597f, 0.565366f, 0.751604f, + 0.922747f, -1.931846f, 1.759353f, -0.7362949f, 0.5707034f, + -1.092127f, 0.936767f, 2.034499f, 2.08148f, 0.9509507f, + -1.342504f, -0.834566f, 0.618184f, 0.844113f, 1.182693f +}; + +static const float av1_simple_motion_search_split_logits_bias_128[] = { + 1.819351f +}; + +static const NN_CONFIG 
av1_simple_motion_search_split_nn_config_128 = { + NUM_FEATURES_128, + NUM_LOGITS_128, + NUM_HIDDEN_LAYERS_128, + { + NUM_LAYER_0_UNITS_128, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_128, + av1_simple_motion_search_split_logits_kernel_128, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_128, + av1_simple_motion_search_split_logits_bias_128, + }, +}; + +#undef NUM_HIDDEN_LAYERS_128 +#undef NUM_FEATURES_128 +#undef NUM_LAYER_0_UNITS_128 +#undef NUM_LOGITS_128 + +#define NUM_HIDDEN_LAYERS_64 1 +#define NUM_FEATURES_64 17 +#define NUM_LAYER_0_UNITS_64 24 +#define NUM_LOGITS_64 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_64[] = { + -1.40663f, -0.851503f, -0.0613111f, 0.741591f, 0.302754f, + 0.184001f, 0.0474853f, 0.371096f, 0.0541624f, 0.381508f, + 0.355427f, 0.0428822f, 0.154916f, -0.00490099f, 0.025484f, + 0.0208921f, 0.140596f, -0.292525f, -0.459067f, -0.081393f, + 0.109824f, -0.290183f, 0.720236f, 0.385835f, -0.150643f, + -0.078518f, 0.0979819f, -0.102135f, 0.137152f, -0.0786457f, + 0.0171441f, 0.991338f, -0.546583f, -1.0714f, -0.0842851f, + 0.244072f, 0.427379f, 0.146775f, -0.921613f, -0.912093f, + 0.393566f, -0.232375f, 0.19963f, 0.312355f, 0.55659f, + -0.104714f, -0.137563f, 0.0985237f, 0.0788307f, -0.225514f, + 0.0228832f, -0.288733f, -0.00737685f, -0.711657f, -0.256796f, + 0.0869605f, 0.583977f, 0.384306f, 1.46692f, -0.741126f, + -0.21105f, -0.276604f, -0.0151463f, -0.0227997f, -0.0403232f, + 0.044122f, 0.0185784f, -0.0451951f, 0.00489513f, -0.387131f, + 0.0966724f, -0.599174f, -0.00243351f, -0.21439f, 0.302043f, + 0.130334f, -0.191251f, 0.863261f, -1.50112f, 0.00901057f, + 0.000324294f, -0.0572545f, 0.0117685f, -0.0734682f, -0.0570435f, + -0.126253f, 1.2313f, -0.328267f, 0.211788f, -0.175438f, + -0.0419298f, 0.166447f, -0.178739f, -0.326221f, -0.0439188f, + 1.01182f, -0.390678f, -0.426343f, 0.0944665f, -0.225042f, + -0.183344f, 0.0500763f, -0.377393f, -0.673401f, -0.436907f, + -0.00366876f, -0.363412f, 0.195194f, 0.250248f, -0.397193f, + -0.0917222f, -0.0221579f, 1.7693f, -0.0694484f, -0.0410764f, + -0.134571f, -0.159992f, -0.170359f, -0.249333f, -0.128056f, + -0.617054f, -0.808701f, -0.540642f, 0.396391f, 0.147787f, + 0.346916f, 0.709852f, 0.116064f, 0.0509731f, 0.073713f, + -0.365082f, -1.09287f, -0.618214f, 0.20545f, 0.126161f, + -0.140012f, 0.62592f, 0.316326f, -0.392765f, -0.15934f, + 0.337617f, -0.41669f, -0.295225f, 0.0602025f, -0.0150657f, + -0.319629f, 0.783729f, -0.0661199f, -0.362657f, 0.390042f, + -0.043614f, -0.0414596f, 0.121155f, -0.309775f, -0.284761f, + -0.243932f, 0.279855f, -0.266823f, 0.734824f, -0.164028f, + 0.261776f, -0.105585f, 0.10733f, -0.180469f, 1.18875f, + -1.12836f, -0.173008f, 0.150221f, 0.111598f, 0.148306f, + -1.2833f, -1.06346f, 0.233546f, 0.16432f, 0.00142378f, + 0.340574f, -0.0140885f, 0.634761f, -0.122096f, 0.821487f, + 0.421424f, -0.0256687f, -0.035503f, -0.0453547f, -0.0215179f, + -0.0671277f, -0.0486862f, -0.962761f, -0.208383f, 0.109573f, + -0.210668f, -0.176485f, 0.421279f, 0.41605f, 0.342084f, + 0.619364f, 0.103718f, -0.00341643f, 0.00266677f, 0.249089f, + -0.22848f, -0.0368968f, 1.12092f, -0.64912f, -0.456579f, + 0.477823f, 0.418345f, 1.41515f, 0.0936279f, 0.886155f, + -0.785656f, -0.217109f, -0.561829f, -0.286435f, -0.884068f, + -0.148839f, -0.282848f, 0.0683745f, 0.0962815f, -0.111975f, + 0.0509158f, -0.211274f, 0.744909f, -0.8982f, 0.315232f, + -0.78624f, 0.598387f, -0.530952f, 0.677357f, 0.0371339f, + 0.99209f, -0.681899f, -0.291416f, -0.224822f, -0.26049f, + -0.0436525f, 
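+// Illustrative note: each NN_CONFIG here describes a fully connected net
+// with a single hidden layer. A minimal forward-pass sketch for the
+// one-logit split models, assuming a ReLU hidden activation, node-major
+// kernel layout, and the member names weights/bias (all assumptions; this
+// is a reading of the struct, not the library's inference routine):
+//
+//   static float nn_forward_1hl(const NN_CONFIG *cfg, const float *in) {
+//     float hidden[32];  // large enough for every model in this file
+//     for (int j = 0; j < cfg->num_hidden_nodes[0]; ++j) {
+//       float s = cfg->bias[0][j];
+//       for (int i = 0; i < cfg->num_inputs; ++i)
+//         s += cfg->weights[0][j * cfg->num_inputs + i] * in[i];
+//       hidden[j] = s > 0.0f ? s : 0.0f;  // ReLU (assumed)
+//     }
+//     float logit = cfg->bias[1][0];
+//     for (int j = 0; j < cfg->num_hidden_nodes[0]; ++j)
+//       logit += cfg->weights[1][j] * hidden[j];
+//     return logit;  // compared against the split thresholds above
+//   }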
-0.380004f, -0.27187f, 0.534779f, 0.717939f, + 0.418197f, -0.152539f, -0.0684039f, -0.186308f, -0.0653121f, + 0.194145f, -0.196367f, 0.256997f, -0.726269f, -0.307672f, + -0.153362f, 0.450827f, 0.708842f, -0.0667079f, 0.555564f, + 0.0486892f, 0.0715072f, -0.7211f, -0.849797f, 0.0650271f, + 1.2747f, -0.646738f, -0.53042f, 0.182197f, 0.928203f, + 0.180621f, -0.00640791f, -0.171416f, 0.092688f, -0.391275f, + -0.0650657f, 0.0843773f, 0.170824f, 0.378085f, 0.0596657f, + 0.844398f, -1.3083f, -1.27828f, -0.199179f, 0.557855f, + 0.241479f, 0.385804f, 0.169533f, -0.0028072f, 0.0538041f, + 0.00136234f, 0.0130481f, 0.0349449f, -0.0366494f, -0.000474055f, + 0.437956f, 0.286724f, -0.298187f, 0.461967f, 0.43065f, + -0.0877194f, -0.19133f, 0.379121f, -0.687751f, -1.64077f, + -0.375191f, -0.336836f, -0.323904f, -0.101859f, 0.0126672f, + -0.346332f, 0.112303f, -0.863336f, 0.155538f, 0.366509f, + -0.0976829f, 0.635278f, -0.681967f, -0.527729f, 0.591839f, + 0.366678f, 0.189981f, 0.0208007f, -0.565809f, 0.70183f, + -0.282844f, -0.327485f, 0.347243f, -1.13014f, -0.373378f, + -0.514978f, 0.662994f, -0.144931f, 0.1402f, -0.820049f, + 0.711498f, 0.681156f, 1.06515f, -0.423409f, -0.0392664f, + 0.0675396f, -0.0508602f, 0.0431443f, 0.0212639f, -0.0279887f, + -0.62611f, -0.202064f, 0.701934f, 1.28452f, -0.00858481f, + -0.517249f, 0.0615832f, -0.260215f, 0.0949119f, -0.28423f, + -0.39573f, -0.0574246f, -0.318658f, 0.0601775f, -0.0629386f, + -0.134208f, 0.111686f, -0.23355f, 0.078667f, 0.741023f, + 0.828523f, -0.345067f, -0.315135f, -0.0957154f, 0.522825f, + -0.190057f, -0.473789f, -0.390489f, 0.200677f, -0.0271802f, + 0.110336f, 0.493302f, 0.663126f, 0.570148f, -0.380042f, + -0.437349f, -0.660884f, 0.301908f, 0.0644179f, 0.172494f, + 0.461917f, 0.330938f, -0.140041f, -0.0430205f, -1.51003f, + -0.410984f, -0.182161f, 0.0235313f, -0.364849f, 0.154183f, + -0.592465f, 0.272701f, 0.192389f, -0.0497777f, -0.924467f, + -0.179513f, -0.592217f, 0.436363f, -0.0716164f, 0.189094f, + -0.574697f, -0.304303f, 0.326441f, -0.0865553f, 0.735948f, + 0.266912f, 0.435824f, -0.123322f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_64[] = { + -1.19333f, 1.01834f, -1.10844f, 0.0454873f, -1.45506f, 0.580864f, + -0.040979f, -0.505681f, -1.15072f, 0.692697f, -0.520812f, -0.479384f, + 0.529652f, 0.507252f, -1.08619f, 0.0586375f, 0.0929614f, -0.46753f, + -0.701857f, -0.362933f, -0.291983f, -0.133933f, -0.0131351f, -0.267582f +}; + +static const float av1_simple_motion_search_split_logits_kernel_64[] = { + -3.32501f, 0.43082f, -1.060692f, 1.328908f, 0.8892894f, 0.6488833f, + -1.096516f, -0.664786f, -1.301339f, 0.508805f, -2.128406f, -0.757304f, + 0.383839f, 0.694763f, -0.591725f, 0.770385f, 1.021594f, 0.589181f, + -0.76238f, 1.488826f, 0.709135f, -0.575738f, 0.26421759f, -0.2484219f +}; + +static const float av1_simple_motion_search_split_logits_bias_64[] = { + 0.699037f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_64 = { + NUM_FEATURES_64, + NUM_LOGITS_64, + NUM_HIDDEN_LAYERS_64, + { + NUM_LAYER_0_UNITS_64, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_64, + av1_simple_motion_search_split_logits_kernel_64, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_64, + av1_simple_motion_search_split_logits_bias_64, + }, +}; + +#undef NUM_HIDDEN_LAYERS_64 +#undef NUM_FEATURES_64 +#undef NUM_LAYER_0_UNITS_64 +#undef NUM_LOGITS_64 + +#define NUM_HIDDEN_LAYERS_32 1 +#define NUM_FEATURES_32 17 +#define NUM_LAYER_0_UNITS_32 20 +#define NUM_LOGITS_32 1 + +static const float 
av1_simple_motion_search_split_hiddenlayer_0_kernel_32[] = { + -0.980626f, -0.946611f, 0.103761f, 0.408899f, 0.498149f, + 0.0490161f, 0.253279f, 0.332029f, 0.00367441f, 0.364401f, + -0.236433f, 0.0592119f, -0.0978848f, 0.159733f, -0.018052f, + -1.10726f, 1.16167f, -0.244982f, -0.147819f, -0.147095f, + 0.111404f, -0.349502f, 0.441178f, 0.0984191f, -0.135537f, + -0.0423312f, 0.0123079f, 0.358012f, -0.266796f, 0.0125811f, + 0.196563f, 0.337093f, -1.07266f, -1.25134f, 0.57337f, + -0.521717f, 0.259824f, 0.537383f, -0.463688f, -0.336128f, + 0.373385f, 0.483443f, -0.229293f, -0.33373f, -0.656021f, + 0.768647f, 0.179279f, 0.315415f, 0.187749f, 1.07839f, + 0.0626629f, -0.230299f, 0.662606f, -0.414154f, 0.459334f, + -0.6312f, 0.427704f, -0.249849f, 0.701056f, -0.707969f, + 0.057401f, 0.620434f, 0.665748f, -0.501356f, -0.230685f, + 0.0722371f, -0.0988625f, -0.114035f, -0.653799f, 0.571353f, + 0.268276f, 1.13251f, -1.0695f, -0.225607f, -0.984355f, + -0.42213f, 0.300422f, 1.21492f, -0.139931f, -0.000726004f, + 0.045964f, -0.0817352f, -0.0278813f, -0.0102341f, -0.0144087f, + -0.475882f, 1.20682f, -0.359919f, 0.277189f, -0.166401f, + 0.599211f, -0.129872f, 0.574211f, -0.247573f, 0.824405f, + -1.53329f, -0.202151f, -0.328698f, -0.516322f, -0.281416f, + -0.383651f, -0.252862f, -0.43185f, 0.456802f, -0.430055f, + -0.55245f, -0.6884f, -0.541456f, -0.281376f, 1.10425f, + -0.140706f, 1.59816f, -0.0343895f, -0.00920039f, -0.0307667f, + 0.0560132f, -0.0340302f, -0.10848f, 0.0593314f, -0.951795f, + 0.876831f, -1.00548f, -0.566244f, 0.430061f, 1.10109f, + -0.634212f, -0.0755369f, -0.108953f, 1.03191f, 0.109036f, + -0.0415309f, 0.0681162f, -0.0611775f, -0.0231938f, 0.0973158f, + -0.0558169f, -0.823484f, -0.918509f, 0.16756f, 0.27087f, + 0.286074f, 0.174069f, 0.1304f, 0.386074f, 0.433953f, + 0.0291467f, -1.74087f, 0.0296094f, -0.00793714f, -0.13041f, + 0.00990992f, -0.0137848f, -0.0742606f, -0.251029f, -0.645316f, + 0.640029f, 0.550607f, 0.470097f, 0.549451f, -0.285723f, + -0.164759f, -0.128166f, -0.391496f, -0.80287f, 0.0769472f, + 1.34391f, 0.0215005f, 0.0669497f, 0.131919f, 0.291674f, + 0.0952889f, -0.677953f, -0.364054f, 0.144823f, 0.246198f, + -0.12393f, 0.363661f, 0.215091f, -0.239658f, 0.18491f, + 0.118703f, 0.0064156f, 1.38619f, -1.3845f, 0.0567323f, + 1.20812f, -0.720374f, -1.92158f, -1.48657f, 0.335601f, + 0.409379f, 0.373618f, 0.231274f, 0.292194f, 0.368619f, + 0.2398f, 0.473579f, 0.83402f, -0.0133751f, -0.00344358f, + 2.20688e-05f, 0.00836757f, 0.00405377f, 0.0110539f, -0.260154f, + 0.192112f, -0.666986f, 0.302875f, -0.113302f, 0.17882f, + -0.221493f, 0.146161f, -0.448697f, 0.584187f, 0.122109f, + 0.989981f, -1.14706f, -0.734042f, 0.0638213f, 0.213357f, + 0.068543f, -0.808558f, 0.404741f, 0.808313f, 1.57523f, + -0.113448f, 0.254102f, -0.350065f, -0.615f, 0.0753549f, + -0.540936f, -0.0250732f, -0.225681f, -0.161384f, 0.0128342f, + -0.0933368f, -0.286904f, 0.130133f, -0.874747f, 0.392585f, + -0.493135f, 0.169708f, 0.0909804f, 1.89921f, -0.469954f, + 0.65165f, -0.953401f, -0.21595f, -0.37479f, 0.0451146f, + 0.0234621f, -0.0596903f, -0.0682308f, -0.0830426f, 0.130011f, + -0.409141f, 0.0627038f, -0.581148f, -0.513922f, 0.631676f, + 0.0637034f, 0.0539081f, 0.0638872f, 0.515863f, -0.0123463f, + 0.177238f, 0.279506f, -0.930345f, 1.23726f, 0.202851f, + 0.708792f, -0.445086f, -0.0267075f, -0.913822f, -0.0714978f, + -0.281107f, -0.0770565f, -0.23086f, -0.165893f, -0.319683f, + 0.216235f, -0.490999f, 2.04841f, -0.0524071f, -0.239043f, + -0.0526375f, 0.023002f, -0.132685f, -0.155354f, -0.186503f, + -0.904296f, 
0.166478f, 0.063268f, -0.302842f, -0.27179f, + -0.428299f, 0.50193f, 0.480717f, -0.864275f, 0.317096f, + 0.40698f, 0.0286107f, 0.189432f, -0.0374374f, 0.0671728f, + 0.203681f, -0.457959f, -0.155776f, 0.340948f, 0.542841f, + 0.342675f, -0.000952399f, 0.470957f, 0.744418f, -1.11763f, + -0.658812f, -0.044832f, 0.0688237f, -0.357766f, 0.428662f, + -0.087152f, -0.291903f, 0.373244f, -0.587853f, 0.415895f, + -0.535694f, 0.621785f, -0.143648f, 0.0451373f, 0.00068827f, + 1.84432f, -1.26239f, -0.432087f, -0.152307f, 0.0293551f, + 0.184744f, -0.0173156f, -0.00572154f, -0.0305062f, -0.0900071f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_32[] = { + 0.160011f, 0.903856f, -0.13738f, 0.358221f, -0.0906044f, + -0.606558f, -0.0215651f, -0.03377f, -1.67017f, -0.144554f, + -0.201482f, -0.87719f, 0.639815f, -0.51976f, -0.309922f, + -1.33421f, 0.721328f, -0.889354f, -1.7158f, -0.285963f +}; + +static const float av1_simple_motion_search_split_logits_kernel_32[] = { + -0.2745374f, 0.333548f, -0.2437388f, 0.288009f, 0.55635f, + 0.4560176f, 0.2970518f, 0.391192f, 1.311854f, -0.231219f, + -0.2968651f, -1.819984f, 0.2775824f, 0.28929857f, 0.419126f, + -0.32868411f, -0.916399f, -0.1921077f, -0.617489f, 0.637953f +}; + +static const float av1_simple_motion_search_split_logits_bias_32[] = { + 0.208473f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_32 = { + NUM_FEATURES_32, + NUM_LOGITS_32, + NUM_HIDDEN_LAYERS_32, + { + NUM_LAYER_0_UNITS_32, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_32, + av1_simple_motion_search_split_logits_kernel_32, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_32, + av1_simple_motion_search_split_logits_bias_32, + }, +}; + +#undef NUM_HIDDEN_LAYERS_32 +#undef NUM_FEATURES_32 +#undef NUM_LAYER_0_UNITS_32 +#undef NUM_LOGITS_32 + +#define NUM_HIDDEN_LAYERS_16 1 +#define NUM_FEATURES_16 17 +#define NUM_LAYER_0_UNITS_16 20 +#define NUM_LOGITS_16 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_16[] = { + 0.0136957f, 0.182135f, -0.583394f, 0.0556956f, 0.211152f, + 0.168234f, -0.694203f, -0.678216f, 0.289943f, 1.00014f, + -0.0427784f, -0.0427538f, -0.0276009f, -0.00133608f, 0.0901944f, + 0.0674892f, 0.104068f, -0.308582f, -0.43596f, 0.855997f, + -0.223414f, 0.0390026f, 0.366492f, 0.216065f, -0.386863f, + -0.148823f, -0.297022f, 0.0529546f, -0.202885f, 1.26471f, + -0.861163f, -0.0949431f, 0.573627f, -0.00277083f, -0.616063f, + -0.626927f, 0.371583f, -0.411743f, 0.173387f, -0.209734f, + 0.293697f, -0.260714f, 0.442728f, -0.594486f, 1.38987f, + 0.208025f, -0.0433776f, 0.01173f, 0.921766f, -0.168379f, + 0.000697326f, 0.209967f, -0.304577f, 0.149551f, -0.196658f, + 0.389251f, -0.449106f, -0.456329f, 0.669073f, -0.163806f, + 0.083348f, -0.0783998f, 0.0678355f, 0.0510435f, 0.103964f, + 0.104537f, -0.778093f, -1.0641f, -0.626102f, -2.02131f, + 0.159591f, 0.254161f, -0.000362642f, 0.289859f, 0.192713f, + 0.139801f, -0.0251327f, 0.164002f, 1.22892f, -0.0852193f, + 0.0769487f, 0.0296408f, -0.0418688f, 0.0936023f, 0.0448523f, + 0.674015f, -0.0732944f, 0.313575f, -0.593432f, 0.642067f, + -1.06063f, 0.468223f, -0.769085f, -0.173798f, -0.175663f, + 0.692808f, 0.00753295f, -0.123327f, -0.0234937f, -0.0923153f, + 0.0216917f, -0.0690157f, -0.397488f, 0.426628f, 0.264475f, + 0.342074f, -0.139817f, 0.215915f, 0.422544f, -0.321102f, + 0.0355587f, 0.460193f, 0.0315326f, 0.080556f, -0.0256533f, + -0.0857874f, -0.488283f, -0.299653f, -0.245987f, 0.104383f, + 0.203731f, 0.328734f, 0.668104f, -0.586909f, 
-0.501335f, + -0.661292f, -0.359811f, 0.00951363f, 0.816315f, -0.0124104f, + 0.0545827f, 0.089863f, 0.0125486f, 0.043609f, -0.0259544f, + 0.0123911f, 0.12557f, -0.539875f, -0.0556721f, 0.16532f, + 0.265834f, -0.384171f, 0.646496f, 0.366147f, -0.111272f, + 0.262096f, -0.0845724f, 0.382724f, 0.165783f, 0.1025f, + 0.392988f, 0.290525f, 0.038659f, 0.540269f, -0.485586f, + -0.273065f, -0.154052f, -0.0896895f, -0.35394f, 0.193214f, + -0.423728f, 0.654576f, -0.373321f, 0.814914f, 0.026278f, + -0.0328304f, -0.220913f, -0.0442121f, 0.487545f, -0.509537f, + -0.777581f, -1.23886f, 0.223482f, 0.206009f, 0.20391f, + 0.194628f, 0.226762f, 0.171609f, -0.219037f, 0.557892f, + -0.312011f, 1.27709f, 0.064013f, 0.105384f, 0.0493933f, + 0.074059f, -0.0100078f, -0.0176888f, -0.440005f, 0.302922f, + -0.197456f, 0.296128f, -0.326647f, 0.305323f, -0.30696f, + 0.201951f, -0.15874f, -0.793042f, 0.0197254f, 0.0569867f, + -0.0295468f, -0.0215012f, 0.025855f, -0.0196102f, 0.215558f, + -0.253069f, 0.298469f, 0.261269f, 0.435305f, 0.0120354f, + -0.384789f, -0.2772f, 0.0366613f, -0.494994f, 0.149072f, + 1.32981f, -0.427717f, 0.43938f, -0.16375f, -0.444342f, + 0.548214f, 0.127955f, -1.24387f, 0.0863676f, 0.175071f, + 0.172673f, -0.0906204f, 0.444454f, -0.546669f, 0.215857f, + -0.100621f, 0.200699f, -0.0985915f, 0.134706f, -0.256396f, + 0.393427f, 0.119606f, -0.214278f, -0.0183637f, 0.194266f, + -0.238025f, 0.182203f, 0.599718f, 0.846933f, 0.0607852f, + -0.183434f, -0.723743f, -0.72414f, -0.124701f, 0.0227527f, + -0.0664636f, -0.0385867f, -0.0257377f, -0.149054f, 0.12077f, + 0.678029f, -0.624456f, 0.189644f, -0.518604f, 0.134397f, + -0.189777f, -0.309376f, -0.00377086f, 0.701132f, -0.170915f, + 0.00736111f, -0.121906f, 0.329136f, 0.165514f, 0.0328356f, + 0.171275f, 0.248619f, 0.247704f, -0.449933f, 0.0841684f, + 0.136982f, 0.122703f, -0.0169439f, -0.0726496f, 0.302648f, + -0.128556f, 0.0667425f, -0.289717f, -0.207532f, -1.20269f, + -0.68892f, 0.045259f, 0.0973945f, 0.0988314f, -0.944748f, + -0.180401f, 0.134331f, 0.033834f, 0.109023f, 0.265723f, + 0.38063f, -0.106518f, -0.0686953f, 0.3744f, -1.0957f, + 0.0302782f, 0.0515164f, 0.00188222f, 0.0014413f, -0.0404425f, + 0.0124618f, -0.0828645f, 0.506166f, -0.776352f, -0.405138f, + -0.123887f, 0.0732116f, 0.379928f, 0.604524f, -0.492317f, + 0.439191f, 0.0744193f, 0.389101f, 0.0604518f, 0.0943165f, + 0.0339942f, 0.0917975f, 0.0161988f, 0.512227f, 0.538021f, + -0.411495f, 0.307281f, 0.33746f, -0.218639f, 0.265742f, + 0.39738f, -0.12442f, 0.125236f, -0.0845223f, -0.150396f, + 0.0334878f, -0.00391915f, 0.0406864f, -0.0487059f, 0.0377073f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_16[] = { + 0.0535976f, -0.0130279f, 0.150146f, -0.511132f, -0.357698f, + 0.6719f, -1.27877f, -0.0208048f, 0.0961914f, 0.263603f, + 0.704574f, -1.48998f, 0.728063f, 0.941829f, -0.199981f, + 0.797802f, -0.29816f, -0.60894f, -0.116624f, -1.16723f +}; + +static const float av1_simple_motion_search_split_logits_kernel_16[] = { + 0.343153f, -0.2110482f, -0.487199f, 0.3274144f, -2.1975f, + -0.6051438f, 0.1901127f, 0.4741924f, -0.24029f, -0.185018f, + -0.652635f, 2.57714f, -0.31033031f, -0.307222f, 0.329035f, + -0.430181f, 0.3429f, 0.742292f, 0.3269808f, 0.4142165f +}; + +static const float av1_simple_motion_search_split_logits_bias_16[] = { + -0.783658f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_16 = { + NUM_FEATURES_16, + NUM_LOGITS_16, + NUM_HIDDEN_LAYERS_16, + { + NUM_LAYER_0_UNITS_16, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_16, 
+ av1_simple_motion_search_split_logits_kernel_16, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_16, + av1_simple_motion_search_split_logits_bias_16, + }, +}; + +#undef NUM_HIDDEN_LAYERS_16 +#undef NUM_FEATURES_16 +#undef NUM_LAYER_0_UNITS_16 +#undef NUM_LOGITS_16 + +#define NUM_HIDDEN_LAYERS_8 1 +#define NUM_FEATURES_8 17 +#define NUM_LAYER_0_UNITS_8 20 +#define NUM_LOGITS_8 1 + +static const float av1_simple_motion_search_split_hiddenlayer_0_kernel_8[] = { + 0.079443f, -1.04068f, 0.336819f, -0.20901f, 0.796251f, + 0.181066f, 0.0118876f, -0.207145f, 0.250671f, -0.402119f, + -0.0847227f, 1.88683f, 0.303469f, 0.0718458f, 0.0338589f, + 0.158896f, 0.0540238f, -0.385426f, 0.955925f, 0.424506f, + 0.492584f, -0.795058f, -0.248667f, -0.905349f, -0.316989f, + 0.545471f, 0.63762f, -0.232613f, -0.238947f, -0.395338f, + -0.322673f, -0.0761563f, -0.125357f, 0.0694415f, -0.371599f, + 0.358387f, -0.486841f, 0.403863f, -0.0295666f, 0.283074f, + -0.424396f, 0.156318f, -0.685355f, 0.6663f, 0.337949f, + 0.273198f, 0.517448f, 0.458911f, 0.157252f, 0.692096f, + 0.64965f, -0.23987f, -1.08431f, -0.252475f, -0.332614f, + -0.712291f, -0.380973f, 0.460545f, 0.48936f, 0.337601f, + 0.489223f, 1.65336f, -0.223585f, 0.17367f, -0.235057f, + -0.456773f, 0.327877f, -0.221192f, -0.940151f, -1.06616f, + 0.687084f, -0.109973f, 0.106636f, 0.445895f, 0.163432f, + 0.378306f, 0.201902f, 0.176811f, 0.693082f, 1.62156f, + -0.178346f, 0.455175f, 1.61943f, 0.231376f, 0.0890932f, + -0.889693f, -1.03298f, 0.778196f, -0.0289539f, 0.137848f, + 0.18707f, 0.171889f, 0.119157f, 0.24893f, -0.313628f, + 0.00250735f, -0.0758209f, 0.272974f, -0.229825f, 2.47926f, + -0.0354665f, 0.175366f, 0.0411555f, -1.52149f, -0.0258663f, + 0.253027f, -0.0520839f, -0.0189782f, 0.362387f, -0.371154f, + 0.622929f, 0.0447056f, 0.242529f, -0.168391f, 0.308935f, + -0.117294f, 2.16307f, 0.0673638f, 0.080771f, -0.460779f, + -0.940176f, 0.473266f, -0.0125302f, 0.475145f, -0.218187f, + 0.43258f, -0.0380196f, 0.413607f, -0.110856f, -1.52076f, + 0.0896812f, 0.246636f, -0.0612008f, 0.189583f, 0.0106902f, + -0.158403f, -0.629377f, -0.0634279f, -0.0864584f, -0.226568f, + -0.286234f, -0.0721132f, -0.43702f, 0.113702f, 0.433372f, + 0.743396f, 0.14312f, 0.29914f, 0.801188f, 0.7609f, + 0.385046f, 0.480314f, 0.171119f, -1.59058f, -1.18853f, + 0.150676f, 0.408123f, -0.00677924f, 0.398145f, 0.0914611f, + 0.176945f, 0.0677457f, 0.316478f, 0.998219f, -0.22618f, + 0.0756793f, -0.0156674f, 0.105716f, 0.0496245f, -0.0827133f, + -0.423119f, -0.161033f, 0.212962f, -0.234453f, 0.743366f, + 1.04108f, 0.0597604f, -0.285993f, -0.114829f, -0.557364f, + -0.840051f, 0.326509f, -0.192508f, -0.141769f, 0.370626f, + -0.126353f, 0.00672923f, 0.493623f, -0.852076f, 0.466798f, + -0.226436f, 0.259268f, -0.452662f, 0.0721126f, 0.0198245f, + 0.2048f, 0.02506f, 0.316194f, 0.814651f, 1.01288f, + -0.569607f, -0.0838994f, 1.37146f, -0.613135f, 0.441761f, + -0.643901f, 0.364269f, -0.147177f, 0.338001f, -0.332376f, + 0.518875f, -0.628964f, -0.291889f, -0.050736f, 0.108047f, + 1.05673f, 0.0479492f, 0.466756f, -0.0867334f, -0.0355575f, + 0.57626f, -0.227583f, -0.146421f, 0.0990489f, 0.117351f, + -0.103858f, -0.0336936f, 0.0201903f, -0.0766383f, -0.010211f, + 0.0400779f, 0.0725462f, 0.137142f, 0.478261f, 0.287869f, + 0.0882359f, -0.739754f, -0.853521f, -0.43703f, 0.316856f, + 0.27593f, 0.312149f, 0.175575f, 0.441839f, 0.264325f, + 0.0148051f, -0.005559f, 0.373176f, 0.933701f, -0.0197615f, + 0.0219723f, -0.0559883f, -0.103456f, -0.0323009f, 0.0773202f, + -0.390838f, 0.855488f, -0.596525f, 
-0.249093f, 0.124262f, + 0.220172f, 0.0552478f, 1.04041f, -0.960992f, -0.495255f, + -0.211612f, 0.350007f, -0.238998f, -0.0265068f, 0.384686f, + -0.0815808f, -0.0570019f, 0.123903f, -0.485114f, -0.00282573f, + -0.0649603f, 0.163719f, -0.469479f, -0.439713f, 0.0602562f, + -0.527993f, -0.111458f, 2.48686f, -0.180723f, 0.0553895f, + 0.0560679f, -0.0978928f, -0.216063f, 0.089457f, -1.5602f, + -1.62332f, -0.147388f, 0.736155f, 0.440409f, 0.243519f, + 0.0622638f, 0.522932f, 0.109686f, 0.422849f, 0.510589f, + 1.01116f, 0.174019f, 0.0191171f, -0.0717751f, -0.0068308f, + 0.172932f, -0.834888f, -0.635788f, 0.32012f, 0.298656f, + 0.274309f, -0.155456f, 0.1755f, -0.175171f, 0.343498f, + -0.122832f, -0.107696f, 0.279924f, -0.797633f, -0.344658f, + 0.162669f, 0.389092f, 0.644479f, -0.635216f, -0.181868f, + 0.0579244f, -0.0568976f, 0.433003f, -0.591067f, 0.71013f, + -0.165515f, 0.225725f, -0.358156f, 0.0541944f, 1.95485f, + -0.315223f, 0.61537f, -0.0401568f, 0.22811f, 0.271147f +}; + +static const float av1_simple_motion_search_split_hiddenlayer_0_bias_8[] = { + 1.63441f, -0.616459f, -0.437775f, -0.71669f, 1.56616f, 2.28109f, 1.64054f, + -1.51476f, 0.0274108f, 0.935156f, -0.966329f, 0.906069f, 1.19954f, -1.25867f, + -1.7376f, -0.594211f, 0.322242f, 0.438631f, -1.01682f, 1.30032f +}; + +static const float av1_simple_motion_search_split_logits_kernel_8[] = { + -0.463187f, 0.2936127f, 0.16762f, -0.1663271f, -0.292418f, + -0.421457f, -0.378265f, 1.053049f, 0.32432879f, -0.49775575f, + 0.427357f, -0.239251f, -0.1631546f, 0.335468f, 0.255371f, + 0.276901f, -0.665683f, -0.7021493f, 0.381513f, -0.1339761f +}; + +static const float av1_simple_motion_search_split_logits_bias_8[] = { + -1.739754f +}; + +static const NN_CONFIG av1_simple_motion_search_split_nn_config_8 = { + NUM_FEATURES_8, + NUM_LOGITS_8, + NUM_HIDDEN_LAYERS_8, + { + NUM_LAYER_0_UNITS_8, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_kernel_8, + av1_simple_motion_search_split_logits_kernel_8, + }, + { + av1_simple_motion_search_split_hiddenlayer_0_bias_8, + av1_simple_motion_search_split_logits_bias_8, + }, +}; + +#undef NUM_HIDDEN_LAYERS_8 +#undef NUM_FEATURES_8 +#undef NUM_LAYER_0_UNITS_8 +#undef NUM_LOGITS_8 + +static const NN_CONFIG *const av1_simple_motion_search_split_nn_config[5] = { + &av1_simple_motion_search_split_nn_config_128, + &av1_simple_motion_search_split_nn_config_64, + &av1_simple_motion_search_split_nn_config_32, + &av1_simple_motion_search_split_nn_config_16, + &av1_simple_motion_search_split_nn_config_8, +}; + +// Model based on simple_motion_search for pruning rect +// Thresholds. 
The first index level is aggressiveness, second is frame resolution,
+// third is bsize
+static const float av1_simple_motion_search_prune_rect_thresh[4][3][5] = {
+  // Aggressiveness = 0
+  {
+    // Lowres
+    { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+      0.000961189195907f, 0.0f },
+    // Midres
+    { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+      0.000961189195907f, 0.0f },
+    // Hdres
+    { 0.0288721601835f, 0.0281573780991f, 0.0225501403434f,
+      0.000961189195907f, 0.0f },
+  },
+  // Aggressiveness = 1
+  {
+    // Lowres
+    {
+      0.000000f,
+      0.116076f,
+      0.049759f,
+      0.057747f,
+      0.006001f,
+    },
+    // Midres
+    {
+      0.000000f,
+      0.017380f,
+      0.026077f,
+      0.078111f,
+      0.064477f,
+    },
+    // Hdres
+    {
+      0.002994f,
+      0.103093f,
+      0.076408f,
+      0.010456f,
+      0.187211f,
+    },
+  },
+  // Aggressiveness = 2
+  {
+    // Lowres
+    {
+      0.000000f,
+      0.003111f,
+      0.144294f,
+      0.144884f,
+      0.069924f,
+    },
+    // Midres
+    {
+      0.000000f,
+      0.013696f,
+      0.055203f,
+      0.152271f,
+      0.078886f,
+    },
+    // Hdres
+    {
+      0.030577f,
+      0.082486f,
+      0.040690f,
+      0.140924f,
+      0.067608f,
+    },
+  },
+  // Aggressiveness = 3
+  {
+    // Lowres
+    { 0.0f, 0.352338114654f, 0.171190796972f, 0.322629318068f,
+      0.287219697095f },
+    // Midres
+    { 0.0f, 0.30938393361f, 0.271772875141f, 0.240627957104f,
+      0.178833795641f },
+    // Hdres
+    { 0.285731215187f, 0.37521798723f, 0.142380566244f, 0.338288917819f,
+      0.21329309279f },
+  },
+};
+
+// Mean and std
+static const float av1_simple_motion_search_prune_rect_mean_128[25] = {
+  13.292176f, 13.231236f, 11.098058f, 11.049944f, 10.481336f,
+  10.431587f, 10.789337f, 10.732787f, 10.233817f, 10.173738f,
+  12.214045f, 12.157505f, 11.863353f, 11.802220f, 12.204053f,
+  12.152315f, 11.517566f, 11.465651f, 5.383040f, 0.757934f,
+  4.012611f, 4.052191f, 0.853365f, 3.954503f, 3.944135f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_128[25] = {
+  2.589217f, 2.559396f, 2.268402f, 2.282274f, 3.341234f, 3.341994f, 3.033007f,
+  3.041550f, 3.786247f, 3.784053f, 2.523459f, 2.511275f, 3.349364f, 3.340481f,
+  2.390149f, 2.384226f, 3.599467f, 3.587460f, 2.319911f, 0.428335f, 1.241087f,
+  1.208679f, 0.353742f, 1.228122f, 1.211777f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_64[25] = {
+  11.439831f, 11.382639f, 9.647134f, 9.578121f, 9.146770f,
+  9.084122f, 8.559063f, 8.499496f, 8.095865f, 8.041795f,
+  10.547537f, 10.486240f, 9.362147f, 9.308391f, 10.548071f,
+  10.484358f, 10.002225f, 9.944480f, 4.964504f, 0.897164f,
+  3.306144f, 3.351039f, 0.928582f, 3.319739f, 3.287726f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_64[25] = {
+  2.033404f, 2.050657f, 2.064671f, 2.081519f, 2.916312f, 2.914649f, 3.628949f,
+  3.618760f, 4.011421f, 3.996068f, 2.087127f, 2.103106f, 3.885277f, 3.876166f,
+  2.035599f, 2.052976f, 3.052501f, 3.050985f, 2.232998f, 0.303745f, 1.111161f,
+  1.081292f, 0.257521f, 1.112510f, 1.089404f,
+};
+
+static const float av1_simple_motion_search_prune_rect_mean_32[25] = {
+  9.862349f, 9.793658f, 8.043962f, 7.954083f, 8.058867f, 7.966165f, 8.046844f,
+  7.956817f, 8.061414f, 7.967906f, 8.966450f, 8.890165f, 8.968315f, 8.891513f,
+  8.953573f, 8.877070f, 8.974275f, 8.895363f, 4.387239f, 0.954143f, 2.701000f,
+  2.751266f, 0.963302f, 2.716584f, 2.709725f,
+};
+
+static const float av1_simple_motion_search_prune_rect_std_32[25] = {
+  1.971555f, 1.985517f, 1.935986f, 1.944743f, 1.924122f, 1.932169f, 1.943151f,
+  1.950612f, 1.931156f, 1.938242f, 1.987803f, 1.997670f, 2.000859f, 2.009913f,
+  1.938270f, 1.949277f, 1.922999f, 1.933145f, 1.991504f, 0.209175f,
0.973824f, + 0.952221f, 0.188018f, 0.985295f, 0.946228f, +}; + +static const float av1_simple_motion_search_prune_rect_mean_16[25] = { + 8.391692f, 8.303431f, 6.590342f, 6.459725f, 6.460719f, 6.333274f, 6.592615f, + 6.461661f, 6.464787f, 6.337191f, 7.499753f, 7.395166f, 7.503220f, 7.398344f, + 7.498312f, 7.395039f, 7.353743f, 7.253139f, 3.874267f, 0.979701f, 2.087404f, + 2.131698f, 0.981005f, 2.110868f, 2.106539f, +}; + +static const float av1_simple_motion_search_prune_rect_std_16[25] = { + 1.865867f, 1.870012f, 1.773885f, 1.770447f, 1.972922f, 1.961361f, 1.777224f, + 1.772864f, 1.974519f, 1.962281f, 1.831632f, 1.831837f, 1.837595f, 1.837008f, + 1.822791f, 1.822053f, 2.074991f, 2.067200f, 1.676261f, 0.141022f, 0.840297f, + 0.829935f, 0.136507f, 0.828972f, 0.808563f, +}; + +static const float av1_simple_motion_search_prune_rect_mean_8[25] = { + 6.997798f, 6.867032f, 5.134819f, 4.883330f, 5.134804f, 4.879707f, 5.140518f, + 4.886751f, 5.142186f, 4.885262f, 6.069946f, 5.896944f, 6.080442f, 5.906130f, + 6.077539f, 5.905929f, 6.083087f, 5.909298f, 3.552709f, 0.990654f, 1.497349f, + 1.531762f, 0.989606f, 1.496581f, 1.484139f, +}; + +static const float av1_simple_motion_search_prune_rect_std_8[25] = { + 1.727562f, 1.725050f, 1.633396f, 1.618773f, 1.633586f, 1.620657f, 1.620798f, + 1.604892f, 1.621570f, 1.607439f, 1.691024f, 1.684225f, 1.676065f, 1.668442f, + 1.680016f, 1.672452f, 1.677775f, 1.671586f, 1.451902f, 0.096223f, 0.751190f, + 0.754040f, 0.101419f, 0.738239f, 0.729455f, +}; + +static const float *const av1_simple_motion_search_prune_rect_mean[5] = { + av1_simple_motion_search_prune_rect_mean_128, + av1_simple_motion_search_prune_rect_mean_64, + av1_simple_motion_search_prune_rect_mean_32, + av1_simple_motion_search_prune_rect_mean_16, + av1_simple_motion_search_prune_rect_mean_8, +}; + +static const float *const av1_simple_motion_search_prune_rect_std[5] = { + av1_simple_motion_search_prune_rect_std_128, + av1_simple_motion_search_prune_rect_std_64, + av1_simple_motion_search_prune_rect_std_32, + av1_simple_motion_search_prune_rect_std_16, + av1_simple_motion_search_prune_rect_std_8, +}; + +#define NUM_HIDDEN_LAYERS_128 1 +#define NUM_FEATURES_128 25 +#define NUM_LAYER_0_UNITS_128 8 +#define NUM_LOGITS_128 4 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_128[] = { + -0.129103f, 0.457758f, -0.489986f, 0.65462f, -0.184312f, 3.81202f, + -0.444407f, -0.64198f, -0.575008f, 0.0311711f, 0.525243f, -20.892f, + 1.08811f, -65.0976f, -12.3973f, -1.38278f, -0.264233f, 0.241636f, + -10.6925f, -0.725414f, -18.8987f, -40.2284f, -16.08f, 0.995331f, + 1.47614f, -0.964864f, 0.405506f, 0.140449f, 0.459534f, -1.9093f, + 0.398452f, 0.696949f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_128[] = { + 1.22789f, -1.34527f, 0.759048f, 0.315086f, + 1.0834f, -1.58019f, -0.465158f, 1.20716f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_128[] = { + -0.668677f, 0.58694f, -0.417094f, 0.754735f, -0.7859f, + 0.377479f, -0.0415929f, -0.0140585f, -0.730001f, 0.747528f, + -0.135247f, 0.406505f, -0.234184f, 0.956362f, -0.637555f, + 0.791884f, 0.0303722f, 1.04424f, -0.727859f, -0.274321f, + -0.122986f, 0.066312f, -0.00559175f, -0.239643f, -0.0188767f, + -0.102787f, -0.262967f, 0.071882f, -0.283398f, 0.111607f, + -0.425826f, 0.02699f, 0.108873f, -0.180558f, -0.0794057f, + 0.29665f, -0.0252969f, -0.0266213f, -0.277462f, -0.361973f, + 0.512552f, 0.395011f, -0.225876f, 0.301924f, 0.136954f, + 0.507259f, 1.23425f, 0.0137135f, 0.662572f, 0.591583f, 
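+// Illustrative note: the prune-rect models take 25 features; the
+// *_prune_rect_mean/*_prune_rect_std tables above presumably z-score them
+// per block size, exactly as sketched for the 17-feature split models
+// earlier in this file.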
+ 0.101564f, 0.416805f, -0.645081f, -0.179086f, -0.36747f, + -0.332213f, 0.095177f, 0.220739f, -0.153256f, 0.706155f, + 0.161701f, 0.696815f, -1.21531f, -0.115059f, 0.486764f, + -0.396093f, 0.784883f, 0.535357f, -0.278021f, 0.143496f, + -0.44931f, -0.144543f, 0.319326f, 0.0190167f, -0.206295f, + 0.373995f, -0.247897f, -0.608095f, -0.41796f, -0.137129f, + -0.709562f, 0.678273f, 0.537607f, 0.557474f, 0.453308f, + 0.21405f, -0.0466495f, 0.519139f, -0.168832f, 0.902911f, + 0.681131f, -0.139876f, -0.2052f, -0.393271f, 0.262222f, + -0.246246f, -0.213993f, 0.646619f, 0.0496181f, -0.00354157f, + 0.822927f, 0.0939522f, 0.180738f, 0.118355f, 0.120456f, + -0.0472214f, -0.144958f, 0.173405f, -0.886644f, -0.0949769f, + -0.813518f, -0.3947f, -0.128021f, 0.356196f, 0.469169f, + -0.413702f, 1.04242f, 0.428853f, -0.387293f, 0.0850877f, + 0.279409f, -0.142276f, 0.0579376f, 0.211112f, 0.0703013f, + -1.9274f, -0.729147f, 0.534193f, 0.773586f, 0.922864f, + 0.642881f, 1.15127f, 0.621032f, 0.933942f, 1.01837f, + -0.660282f, -0.40059f, -1.11279f, -0.77088f, -0.43349f, + 0.202361f, -0.0840912f, 0.0935707f, 0.056333f, -0.0779369f, + 0.0173447f, -0.0104756f, 0.0115005f, -0.0195593f, 0.03592f, + -0.343454f, -0.618048f, 0.258172f, -0.412322f, -0.0463746f, + -0.0413654f, -0.0400194f, 0.615981f, -0.452094f, 0.644555f, + 0.0822476f, -0.359791f, -0.0904274f, 0.209427f, 0.0116338f, + -0.190978f, 0.890233f, 0.737769f, -1.66663f, -0.392605f, + 0.0785728f, -0.224553f, -0.128258f, -0.227227f, -0.0777773f, + 0.685976f, 0.347042f, -0.555325f, -0.249221f, 0.0919837f, + -0.0660016f, -0.272316f, 0.0390632f, -0.619624f, -0.0565801f, + 0.585026f, 0.597375f, 0.54114f, 0.593389f, 0.604391f, + 0.0820294f, -0.85339f, -1.40741f, -0.391675f, 0.0579205f, + -0.197626f, 0.130044f, -0.234488f, -0.0373991f, -0.0717973f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_128[] = { + 1.58571f, -4.6314f, -2.00273f, 0.543699f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_128 = { + NUM_FEATURES_128, + NUM_LOGITS_128, + NUM_HIDDEN_LAYERS_128, + { + NUM_LAYER_0_UNITS_128, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_128, + av1_simple_motion_search_prune_rect_logits_kernel_128, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_128, + av1_simple_motion_search_prune_rect_logits_bias_128, + }, +}; + +#undef NUM_HIDDEN_LAYERS_128 +#undef NUM_FEATURES_128 +#undef NUM_LAYER_0_UNITS_128 +#undef NUM_LOGITS_128 + +#define NUM_HIDDEN_LAYERS_64 1 +#define NUM_FEATURES_64 25 +#define NUM_LAYER_0_UNITS_64 32 +#define NUM_LOGITS_64 10 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_64[] = { + 0.10424f, -0.346025f, 0.534547f, -0.385925f, 2.58341f, -0.256414f, + -0.232498f, 0.329823f, -0.0777376f, -0.590939f, 0.062657f, -0.628252f, + 0.0934588f, 2.04029f, -0.224448f, 0.371168f, -0.385348f, -0.589883f, + -3.73627f, -0.943144f, 0.346409f, -0.211215f, -0.351008f, 0.418807f, + 0.943663f, 0.173267f, 1.16585f, -0.0840888f, 0.227464f, 0.374412f, + 0.0422597f, -0.338868f, 0.222576f, 0.431713f, 1.12366f, 0.00753411f, + 0.248412f, -0.0902425f, 0.542455f, -0.665629f, -0.311245f, -0.205639f, + -0.447149f, -0.0502733f, -0.290186f, -0.794384f, 0.0940881f, -0.0686117f, + -0.0199961f, -0.587965f, 0.777096f, -0.083381f, -1.21282f, 0.652959f, + -1.18238f, 0.539991f, 0.352497f, -0.540076f, -0.26222f, -0.568556f, + 0.409102f, -0.131146f, -0.407161f, -0.188287f, -0.478657f, 0.000401932f, + -0.689324f, 0.351064f, -1.43704f, -0.315185f, -0.868726f, 0.376341f, + -0.0566277f, 0.364831f, 
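+// Illustrative note: unlike the split models (one logit), the prune-rect
+// models emit several logits (4 for 128, 10 for 64 and 32 below). A
+// plausible sketch: softmax the logits into probabilities and prune any
+// rectangular partition candidate whose probability falls below the matching
+// entry of av1_simple_motion_search_prune_rect_thresh (helper name
+// hypothetical; the exact pruning rule is not shown in this header):
+//
+//   #include <math.h>
+//   static void smss_softmax(const float *logits, int n, float *prob) {
+//     float mx = logits[0], sum = 0.0f;
+//     for (int i = 1; i < n; ++i) mx = logits[i] > mx ? logits[i] : mx;
+//     for (int i = 0; i < n; ++i) {
+//       prob[i] = expf(logits[i] - mx);  // shift by max for stability
+//       sum += prob[i];
+//     }
+//     for (int i = 0; i < n; ++i) prob[i] /= sum;
+//   }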
0.611298f, -0.495253f, -0.0193132f, 0.617978f, + 0.189586f, -0.236758f, -0.608246f, -0.149017f, -1.78303f, 0.143023f, + 0.698386f, -0.994086f, -0.673327f, 0.233868f, 0.360425f, 0.0294123f, + -0.248683f, -0.148392f, 0.0861829f, -0.190843f, -0.414906f, 0.607378f, + -0.756715f, -0.511713f, -0.321556f, 1.0078f, -1.18141f, 0.519751f, + 0.834629f, -0.359343f, 0.612262f, -0.0730553f, 0.262935f, 0.488276f, + 0.387071f, -1.44123f, 1.08269f, 0.554402f, -0.069f, 0.14113f, + 0.323817f, 0.824314f, -0.431417f, -0.349448f, 0.950728f, -0.587836f, + -0.83914f, -0.10844f, 0.26602f, 0.831933f, -0.271315f, 0.231563f, + 0.417049f, 0.190627f, -0.0940667f, 0.255363f, -0.0741022f, -0.0987662f, + -0.847522f, 0.00287554f, 0.0615741f, -0.0832218f, 0.0847148f, -0.392843f, + -0.938068f, -0.10621f, -0.260859f, -0.825175f, -0.401039f, 0.315213f, + -0.108269f, 0.288036f, -8.66166f, -0.970752f, -0.66678f, -0.593405f, + -0.518294f, -0.138722f, -0.454698f, -0.22969f, -0.553006f, -0.440111f, + 0.462661f, -0.536854f, 0.0108295f, -0.522888f, 0.00111157f, 0.229999f, + 0.0267768f, 0.176266f, -1.57043f, 0.0318106f, 0.257534f, -0.198583f, + 0.175564f, -0.251465f, -0.262441f, -1.65283f, -0.319603f, -0.875282f, + -0.301303f, 0.0170948f, -0.227075f, 0.0299545f, -4.98346f, 0.470046f, + -1.28051f, -0.213809f, -0.486585f, -0.906463f, -0.169984f, -0.333153f, + -0.376733f, 0.108016f, 0.486744f, -0.186936f, -0.429259f, 0.056501f, + -0.266545f, 0.265447f, -0.137718f, -0.490687f, -0.935668f, -0.16229f, + -0.696932f, 0.173157f, 0.434959f, -0.140595f, 0.345845f, -1.08013f, + -0.0205929f, -0.815874f, -0.179812f, 0.02767f, -0.141727f, 0.471936f, + -7.29453f, -1.04362f, -0.745482f, -0.28725f, -0.214997f, -0.0850651f, + -0.748471f, 0.161325f, -1.04387f, -0.705305f, 0.489427f, -0.765373f, + -0.301576f, 0.0742467f, -0.331282f, 0.0372328f, -0.90298f, -0.0608646f, + -2.18756f, 0.170384f, -0.258357f, 0.106287f, -0.161684f, -0.103799f, + -0.127774f, -0.156313f, 0.0705286f, -0.977908f, -0.281191f, -0.056757f, + -0.309474f, 0.050476f, -9.78198f, -2.42795f, -0.289626f, -1.07579f, + -0.439256f, -1.09948f, -0.564671f, 0.0913182f, -0.417216f, -1.19909f, + 0.287063f, 0.402315f, -0.17646f, 0.540488f, 0.00840239f, 0.397492f, + 0.702393f, -0.10566f, 0.655296f, -0.0443876f, 0.154918f, -0.760479f, + -0.0523153f, -0.366199f, -1.08212f, -0.398556f, -0.415203f, -1.10488f, + 0.208349f, 0.27079f, 0.101546f, -0.205752f, -13.7923f, -0.218637f, + -1.10077f, 0.355735f, -0.306196f, 0.627434f, -0.473101f, -0.308027f, + -1.12724f, 0.301597f, 0.660785f, 0.0576217f, -0.155925f, -0.56107f, + -0.223537f, 0.114299f, -0.53803f, -0.252674f, -2.66103f, -0.185245f, + -0.314673f, 0.403337f, 0.679821f, -0.69231f, 0.506264f, -0.999705f, + -0.549097f, 0.353745f, 0.188249f, 0.414484f, -0.615853f, 0.525681f, + -5.23065f, -3.05174f, 1.02074f, -0.965499f, -0.158947f, 0.0436088f, + -0.485824f, 0.0375094f, -1.39985f, -0.481392f, 0.485785f, -0.24874f, + -0.359633f, 0.668108f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_64[] = { + 0.0735592f, -0.045064f, -0.0114103f, 1.39246f, -0.683467f, 0.155765f, + -0.667652f, -0.202425f, -0.585433f, -0.146752f, -0.0812931f, 0.580642f, + 0.578542f, -0.831916f, 0.610063f, 0.0101856f, -0.235863f, 0.538141f, + -2.91334f, -1.71887f, 0.126616f, 0.582497f, -0.438879f, 0.221833f, + 0.850773f, -0.280886f, 0.443233f, -0.0964873f, -0.216161f, 0.34413f, + 0.656818f, 0.0169274f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_64[] = { + -0.310947f, -0.232675f, 0.0171092f, 0.0834474f, 0.373977f, + 0.300429f, 
0.215072f, -0.454074f, 0.187565f, 0.282742f, + 0.562562f, -0.0419322f, 0.000978486f, -0.298267f, 0.216934f, + -0.388722f, -0.146866f, -0.275946f, 0.202361f, 0.225847f, + 1.42868f, 0.473127f, -0.145747f, -0.104986f, 0.153459f, + 0.69382f, 0.162266f, 0.0207715f, -0.45095f, -0.412071f, + -0.235109f, -0.130199f, 0.231741f, 0.460193f, 0.0378202f, + 0.429516f, 0.387691f, -0.272479f, 0.0723884f, -0.453914f, + -0.150618f, -0.10745f, -0.258615f, 0.0838312f, -0.00554958f, + 0.105377f, -0.0415479f, 0.13228f, 1.09044f, -0.73053f, + -0.422553f, -0.435842f, 0.211416f, 0.420332f, 0.0181353f, + -0.030891f, 0.522788f, 0.613526f, 0.374032f, 0.287986f, + -0.403118f, -0.287362f, -1.11523f, -0.577713f, -0.020228f, + 0.86465f, -0.0590579f, 0.341274f, -0.0115644f, -0.260236f, + 0.192123f, -0.0849825f, 0.0501709f, 0.444382f, 0.0762727f, + 0.0926596f, -0.101157f, -0.142787f, 0.40861f, 0.555805f, + -0.00614654f, -0.122846f, 0.203163f, 0.234266f, 0.409795f, + -0.0206245f, -0.224679f, 0.025081f, 0.518044f, -0.287186f, + 0.016494f, -0.0886331f, 0.236438f, -1.01032f, 0.118332f, + 0.364217f, 0.061438f, 0.0381303f, 0.128418f, 0.0257077f, + -0.975751f, -0.694894f, 0.00351914f, 0.278179f, 0.29363f, + 0.525576f, 0.0604849f, 0.531734f, 0.406643f, 0.812497f, + -0.403196f, -0.16664f, -0.620887f, -0.428194f, 0.275401f, + 0.432063f, -0.00378342f, 0.295758f, 0.105615f, -0.00683626f, + 0.00396146f, 0.00598654f, -0.0131701f, -0.0115787f, 0.00386643f, + -0.69686f, -0.139623f, -0.440817f, 0.0542873f, 0.217962f, + 0.527035f, -0.0201046f, 0.0471354f, 0.0271858f, -0.0775197f, + -0.309797f, 0.184879f, -0.232854f, -0.407081f, 0.706227f, + -0.0877534f, 0.306843f, 0.455075f, -0.333961f, 0.0759148f, + 0.0444791f, -0.0693626f, -0.0850289f, -0.513063f, -0.643971f, + -0.630279f, -0.153889f, 0.123315f, 0.00548238f, 0.170707f, + 0.734339f, -0.176988f, 0.322519f, 0.178365f, 0.183519f, + -0.698683f, -0.12043f, -0.349914f, -0.0696762f, -0.53986f, + -0.104738f, 1.05264f, 0.983568f, -0.109035f, 0.0113748f, + 0.0815189f, -0.0628812f, 0.0769389f, 0.010261f, 0.146573f, + -0.433194f, -0.211572f, -0.000397392f, 0.445325f, 0.145091f, + -0.0625902f, 0.29394f, 0.302315f, 0.0892226f, -0.209504f, + -0.0150374f, 0.242608f, 0.216223f, 0.366857f, 0.209829f, + -0.540035f, 0.117599f, -0.329315f, 0.0471133f, -0.0115449f, + -0.0638235f, 0.0527461f, 0.348149f, 0.360802f, 1.06624f, + -0.615991f, -0.341396f, 0.18972f, 0.0709888f, -0.0414466f, + -0.0193809f, 0.0938933f, 0.209058f, 0.575042f, 0.483608f, + -0.285875f, -0.115905f, -0.363637f, 0.375425f, 0.336217f, + 0.0336358f, -0.00265618f, -0.406854f, -0.792959f, -0.219354f, + 0.0331615f, 0.0298859f, -0.211446f, -0.00280773f, -0.194011f, + 0.262109f, 0.548076f, 0.120183f, -0.661603f, 0.241855f, + -0.501428f, 0.00102718f, -0.347331f, -0.58306f, 0.0977254f, + 0.117491f, 0.0840667f, 0.00693675f, 0.000600294f, 0.649569f, + -0.0553811f, -0.197198f, 0.397236f, -0.523737f, -0.564192f, + -0.374679f, -0.249344f, 0.00861428f, 0.00393439f, -0.0834608f, + 0.124389f, -0.0393049f, 0.0425391f, -0.153383f, -0.182346f, + 0.420953f, 0.464221f, 0.288984f, 0.570921f, -0.239965f, + 0.247239f, -0.083434f, 0.714418f, 0.986323f, -0.460244f, + -0.260993f, -0.947743f, -1.0789f, -0.0391231f, 0.612407f, + -0.0306767f, 0.281419f, 0.0072426f, -0.37623f, 0.188744f, + 0.221666f, -0.424914f, 0.29703f, 0.261715f, 0.277809f, + -0.0617616f, -0.000611999f, -0.0547053f, -0.0901018f, -0.347669f, + 0.856072f, 0.596675f, -0.467639f, -1.09324f, -0.184224f, + -0.56051f, -0.0144704f, 0.102894f, -0.122982f, -0.0020749f, + -0.0423487f, 0.0328702f, -0.0154263f, 
0.0349021f, -0.00315595f, + 0.0254802f, -0.729191f, 0.207296f, -0.0212349f, -0.207078f, + 0.20636f, -0.156883f, 0.429765f, -0.42672f, 0.138775f, + -0.0267343f, 0.631528f, 0.300646f, -0.4793f, -0.273833f, + -0.0135367f, -0.530819f, -0.534881f, 0.830896f, 0.0266992f, + 0.473744f, 0.210334f, 0.0234739f, 0.255394f, 0.123531f, + -0.489341f, -0.796627f, 0.372617f, 0.190136f, 0.275342f, + 0.739505f, 0.402354f, 0.782806f, 0.437374f, 1.04948f, + -0.55963f, 0.382704f, -0.698321f, 0.0817868f, -0.440108f, + -0.0635004f, -0.277851f, -0.524194f, 0.286157f, -0.01097f, + -0.0293145f, -0.0405071f, -0.035662f, -0.012871f, -0.0516409f, + -0.406671f, 0.709259f, -0.525177f, 0.521123f, -0.44813f, + 0.48412f, -0.0546513f, 0.305253f, -0.468328f, 0.316453f, + -0.36307f, 0.497515f, -0.0606276f, 0.315764f, -0.422066f, + 0.554025f, -0.679183f, 0.616914f, 0.00283324f, -0.000643824f, + 0.0639999f, 0.0488285f, -0.141031f, 0.068003f, -0.0792678f, + -0.425307f, -0.152235f, 0.269917f, -0.352327f, 0.44792f, + -0.116514f, -0.465868f, 0.154287f, 0.0161028f, -0.16848f, + -0.255487f, 0.189832f, 0.254883f, 0.0240822f, 0.432638f, + -0.136564f, 0.137036f, 0.0375734f, 0.989246f, -0.126287f, + 0.111416f, -0.0271002f, 0.718755f, -0.0412969f, 0.00645681f, + 0.253811f, -0.0186998f, 0.691971f, -0.282042f, -0.0783915f, + 0.274592f, -0.358449f, 0.34155f, -0.186374f, -0.136907f, + -0.192334f, -0.251168f, -0.100874f, -0.166578f, -0.336507f, + 0.402373f, 0.173695f, 0.108788f, 0.00885581f, -0.310063f, + 1.05545f, 0.0295867f, 0.180785f, -0.173469f, -0.469924f, + -0.224155f, 0.665862f, -0.126546f, 0.240691f, -0.0415301f, + -0.598534f, 0.0012723f, -0.122297f, -0.558947f, 0.268844f, + 0.241193f, 0.0524422f, -0.1683f, 0.575588f, -0.139012f, + 0.0636691f, -0.446709f, -0.094532f, 0.883809f, -0.112981f, + -0.224047f, 0.0811193f, -0.140571f, -0.09683f, -0.0796143f, + -0.102246f, -0.863392f, -0.0755124f, 0.23125f, -0.0301361f, + -0.153029f, -0.172238f, -0.0286382f, -0.338495f, -0.317216f, + -0.146629f, -0.242264f, -0.702306f, -0.285052f, 0.0623479f, + 0.265735f, 0.00674475f, 0.666196f, 0.883586f, 0.278416f, + -0.341692f, -0.509931f, -0.156263f, 0.635885f, -0.544143f, + -0.572632f, -0.213285f, 0.443396f, -0.268329f, 0.0638439f, + -0.185397f, 0.071126f, 0.386503f, -0.402212f, -0.140784f, + -0.411661f, 0.049398f, -0.0672907f, -0.267034f, -0.0560875f, + 0.0607937f, 0.0445484f, -0.547651f, 0.574718f, 0.417189f, + -0.0610166f, 0.0632293f, 0.391619f, -0.00671215f, -0.136883f, + -0.339346f, 0.0356183f, 0.511993f, 0.178676f, 0.286998f, + 0.136511f, -0.00796929f, 0.203985f, 0.0423532f, -0.175196f, + 0.378534f, 0.770417f, 0.593778f, 0.0256067f, -0.82394f, + -0.500691f, -0.425725f, -0.623708f, -0.0406241f, -0.00226464f, + 0.0207836f, 0.30732f, -0.00784268f, 0.0065445f, -0.0991039f, + -0.20871f, -0.206835f, 0.281219f, 0.119361f, 0.259346f, + -0.102713f, 0.186488f, -0.034455f, -0.00198392f, -0.279107f, + -0.638993f, -0.374404f, -0.48601f, -0.262345f, 0.624532f, + 0.620632f, -0.227014f, 0.433579f, -0.0455096f, 1.22123f, + -0.429156f, 0.12396f, 0.0815152f, -0.0837355f, 0.0282623f, + -0.407475f, 0.787321f, -0.434974f, 0.312904f, -0.230805f, + 0.213042f, -0.250929f, 0.302997f, -0.354709f, 0.0504905f, + -0.561706f, 0.595558f, 0.374951f, 0.802969f, -0.674902f, + 0.33136f, 0.156606f, 0.0218968f, -0.694188f, -0.0221949f, + -0.00639123f, 0.0146536f, 0.0104145f, 0.021635f, -0.0499428f, + -0.575116f, -0.239035f, -0.0588276f, 0.599722f, 0.541932f, + 0.437433f, 0.716268f, 0.193207f, 0.548351f, 0.326951f, + -0.197124f, 0.0355353f, -0.0952009f, -0.217265f, -0.389789f, + 
0.0528124f, -0.21334f, -0.190296f, -1.17367f, 0.108905f, + 0.109397f, -0.0192577f, 0.0343813f, 0.085004f, -0.0556737f, + -0.0411158f, -0.534989f, 0.0361896f, 0.124415f, 0.291603f, + -0.0311974f, -0.326726f, 0.343131f, 0.0276456f, -0.231827f, + -0.373894f, -0.208898f, -0.273011f, 0.061323f, -0.0910538f, + -0.30746f, -0.108644f, -0.190736f, 1.58048f, -0.0739711f, + -0.0623489f, -0.137967f, -0.0601359f, -0.133004f, -0.0857153f, + 0.00955987f, -0.365561f, -0.0329051f, 0.463463f, 0.14758f, + -0.512256f, -0.227463f, -0.26008f, -0.567777f, 0.0646234f, + 1.02161f, 0.66157f, -0.16733f, 0.264921f, -0.242036f, + 0.214622f, 0.0712054f, -0.260377f, 0.0849665f, 0.735094f, + 0.11001f, 0.297301f, -0.333342f, 0.066978f, -0.123625f, + 1.07596f, 0.401263f, 0.0800875f, -0.340862f, -0.115587f, + -0.32692f, -0.300842f, 0.0277397f, 0.0630788f, -0.261198f, + 0.428695f, -0.0544757f, -0.124511f, 0.036992f, 0.126322f, + 0.0317603f, 0.0820762f, 0.117277f, -1.14594f, -0.108076f, + -0.0258198f, -0.00337525f, -0.00512531f, 0.1274f, -0.0660535f, + -0.640733f, 0.197142f, 0.147278f, 0.489271f, 0.226507f, + -0.0668414f, 0.0946318f, 0.0994164f, -0.820516f, 0.512939f, + -0.305172f, -0.715187f, -0.195125f, 0.279346f, 0.462144f, + 0.913882f, -0.453879f, 0.0582033f, -0.462866f, 0.0538736f, + 0.0115737f, 0.00626993f, -0.0185185f, 0.0114601f, -0.0181164f, + 0.41588f, -0.0447331f, 0.611756f, 0.43385f, 0.834465f, + 0.122019f, -0.352983f, 0.340429f, -0.245425f, -0.365328f, + -0.521825f, 0.0371057f, 0.172188f, -0.387949f, 0.221054f, + 0.0126359f, 0.422958f, 0.584198f, -0.581498f, -0.019466f, + -0.0271737f, -0.0740885f, 0.00540879f, 0.186086f, -0.0324402f, + -0.563462f, -0.458759f, -0.425296f, -0.0118862f, -0.641508f, + 0.0132084f, 0.0581128f, 0.0231444f, 0.468587f, 0.258838f, + 0.0296665f, 0.0562801f, 0.630014f, 0.381816f, -0.269761f, + -0.135515f, 0.046186f, 1.07632f, -0.050616f, 0.104987f, + 0.29991f, 0.119316f, 0.117248f, 0.0795009f, 0.242573f, + 0.0416634f, -0.0577639f, -0.0974078f, 0.106255f, -0.13098f, + 0.0141486f, -0.00418257f, 0.144848f, -0.463934f, 0.0452591f, + 0.252617f, 0.205222f, -0.189843f, 0.0652245f, -0.135386f, + 0.0500646f, -0.200368f, -0.0142312f, -0.0286832f, -0.254355f, + -1.02752f, -0.73549f, 0.0364518f, 0.0416227f, -0.13185f, + -0.0886515f, -0.502314f, -0.102916f, 0.410911f, -0.355655f, + 0.400416f, -0.340217f, 0.208829f, 0.245972f, 0.149739f, + -0.49458f, 0.589482f, 0.550827f, 0.912709f, -0.351275f, + -0.128076f, -0.285172f, -0.672752f, 0.090583f, -0.245286f, + -0.737297f, -0.201515f, -0.025122f, -0.109854f, 0.36738f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_64[] = { + 0.346819f, 0.442965f, -0.0216032f, 0.0229235f, -0.402797f, + -0.666074f, -0.455388f, -0.00353411f, -0.595511f, -0.845667f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_64 = { + NUM_FEATURES_64, + NUM_LOGITS_64, + NUM_HIDDEN_LAYERS_64, + { + NUM_LAYER_0_UNITS_64, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_64, + av1_simple_motion_search_prune_rect_logits_kernel_64, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_64, + av1_simple_motion_search_prune_rect_logits_bias_64, + }, +}; + +#undef NUM_HIDDEN_LAYERS_64 +#undef NUM_FEATURES_64 +#undef NUM_LAYER_0_UNITS_64 +#undef NUM_LOGITS_64 + +#define NUM_HIDDEN_LAYERS_32 1 +#define NUM_FEATURES_32 25 +#define NUM_LAYER_0_UNITS_32 28 +#define NUM_LOGITS_32 10 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_32[] = { + 0.486581f, 0.340847f, -0.109226f, 0.467224f, -0.541561f, + 0.0943619f, 
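+    // The NN_CONFIG initializers in this file follow the field order declared
+    // in av1/encoder/ml.h: num_inputs, num_outputs, num_hidden_layers, the
+    // per-hidden-layer unit counts, then per-layer weight and bias pointers.
+    // Below is a minimal forward-pass sketch for these one-hidden-layer
+    // tables, assuming the node-major weight layout that av1_nn_predict() in
+    // av1/encoder/ml.c uses (hidden layer: dense + ReLU; output layer: dense,
+    // logits left unnormalized). nn_forward_sketch is a hypothetical helper
+    // for illustration, not a function in this tree:
+    //
+    //   static void nn_forward_sketch(const float *in, const NN_CONFIG *cfg,
+    //                                 float *logits) {
+    //     float hidden[128];  // large enough for every layer-0 size here
+    //     const int nh = cfg->num_hidden_nodes[0];
+    //     for (int j = 0; j < nh; ++j) {
+    //       float v = cfg->bias[0][j];
+    //       for (int i = 0; i < cfg->num_inputs; ++i)
+    //         v += cfg->weights[0][j * cfg->num_inputs + i] * in[i];
+    //       hidden[j] = v > 0.0f ? v : 0.0f;  // ReLU
+    //     }
+    //     for (int k = 0; k < cfg->num_outputs; ++k) {
+    //       float v = cfg->bias[1][k];
+    //       for (int i = 0; i < nh; ++i)
+    //         v += cfg->weights[1][k * nh + i] * hidden[i];
+    //       logits[k] = v;
+    //     }
+    //   }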
-0.429442f, -0.207442f, 0.959963f, 0.618666f, + -0.0636751f, 0.144508f, -0.0278289f, 0.332293f, -0.751493f, + 0.245438f, -0.917758f, 0.612128f, -0.32648f, 0.534618f, + -0.615239f, 2.71641f, 0.233759f, 0.820558f, -0.249758f, + -0.427783f, -0.359361f, 0.0375732f, 0.806973f, 0.352512f, + -0.0532192f, 0.0576861f, -0.464178f, -0.334877f, -0.697042f, + 0.0538218f, 0.0919659f, -0.00765812f, 0.0603847f, -0.460315f, + 0.37979f, -0.0867612f, -0.670683f, -0.188619f, -0.570586f, + 0.233418f, 0.153581f, 0.290905f, -0.624885f, -0.557842f, + -0.555567f, 0.463773f, -0.123909f, -0.277731f, 0.0374468f, + 0.409903f, 0.287638f, -0.593066f, -0.223434f, 0.154263f, + -0.250464f, -0.077696f, 0.229652f, -0.304174f, 0.308053f, + 0.33155f, -0.502825f, 0.361216f, -0.499294f, 0.00595444f, + -0.307201f, 0.5766f, -0.438384f, -0.093701f, -0.118586f, + 0.202337f, -0.486623f, 0.261552f, 0.139756f, -0.655642f, + -0.0627001f, -0.213053f, -0.243037f, 0.205918f, 0.0718368f, + 0.188041f, 0.141529f, -0.132239f, 0.425827f, -0.218353f, + 0.153114f, 0.33268f, 0.0226116f, 0.167394f, 0.269854f, + -0.457001f, 0.1973f, -0.526087f, 0.467528f, 0.290934f, + 1.16267f, 0.0823663f, -0.754389f, -0.83716f, 0.270157f, + -1.41229f, 0.148511f, -0.286832f, 0.664796f, 0.492254f, + 0.360567f, -0.533993f, 0.0435672f, -0.103001f, 0.220668f, + 0.594621f, -0.0213356f, -0.347638f, -0.694457f, 0.0759505f, + 0.161358f, -0.389384f, -0.0455192f, -0.61252f, -0.174173f, + -0.00788878f, -1.22487f, 0.332233f, -0.0457021f, -0.225918f, + -0.197657f, -0.115408f, -0.240589f, -2.05681f, 0.00914629f, + -1.92213f, 0.0268578f, -0.49076f, -0.0120123f, 0.291157f, + 0.267116f, -0.0775724f, 0.181115f, -0.392441f, -0.488114f, + -0.28842f, -0.115465f, 0.128974f, -0.0829899f, -0.14096f, + -0.140145f, -0.700281f, 0.0368945f, -0.437598f, 0.243485f, + -1.00301f, 0.332324f, 0.125014f, -0.0604481f, -0.0652028f, + -0.207295f, -1.0209f, -0.341525f, 0.191326f, -0.147578f, + 0.0878327f, 0.129827f, -0.0848319f, 0.187381f, -1.28663f, + 0.00537885f, -0.134277f, -0.0411126f, -0.3434f, -0.0456494f, + 0.37861f, 0.409095f, 0.237177f, -0.396855f, -0.205418f, + -1.31701f, -0.319032f, -0.123404f, -0.240005f, -0.305206f, + -0.0258176f, -0.26367f, -0.142396f, 0.191672f, -1.44061f, + 0.0554776f, -0.571839f, -0.284789f, -0.425677f, -0.0307376f, + 0.20275f, -0.223146f, 0.144612f, 0.0212636f, 0.0238303f, + -0.253802f, -0.188922f, -0.0637066f, -0.340836f, 0.124774f, + 0.130474f, -0.154099f, -0.0292733f, 0.158148f, -0.246989f, + -0.259059f, 0.220224f, 0.228449f, -0.41956f, -0.321848f, + -0.2396f, -0.316449f, -1.3363f, 0.0264099f, -1.46865f, + 0.113073f, 0.0722885f, -0.166986f, -0.164877f, 0.0360911f, + 0.534472f, -0.551152f, -0.328501f, 0.0781121f, -0.378112f, + -0.459502f, 0.28015f, -0.212302f, -0.521641f, 0.618993f, + -0.347709f, 0.266253f, -0.0280894f, 0.348511f, -0.0155031f, + -0.100693f, 0.0447673f, 0.277519f, -0.233998f, -0.0796738f, + -1.73644f, -0.160776f, 0.53092f, -0.180406f, 0.056447f, + 0.385356f, -0.262337f, -0.241479f, -0.271426f, -0.457354f, + -0.266788f, 0.367371f, -0.103065f, 0.47783f, -0.188327f, + -0.159636f, 0.00142907f, -0.409756f, 0.454889f, -0.24566f, + -0.0760084f, 0.286355f, 0.462102f, 0.0431695f, -0.127395f, + -0.200476f, -0.350557f, 0.217275f, -0.23975f, 0.255148f, + -0.280626f, 0.42476f, 0.157411f, 0.0358675f, -0.192591f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_32[] = { + 0.940498f, 0.15602f, -0.234831f, 0.0268585f, 0.144769f, 0.243081f, + 0.611406f, 0.366093f, 0.361868f, 0.39668f, 0.401479f, 0.369467f, + 0.0909503f, 0.710595f, 0.032786f, 
0.525891f, -1.0232f, 0.732557f, + -0.064425f, 0.865222f, -0.042917f, -0.237191f, -0.527006f, -0.0172101f, + 0.59681f, -0.472405f, 0.0969218f, -0.250624f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_32[] = { + 0.355607f, 0.126701f, -0.0825159f, 0.200675f, -0.011308f, + -0.280057f, 0.559816f, 0.142689f, 0.0422419f, -0.151692f, + -0.0275637f, -0.283101f, -0.20822f, -0.200394f, 0.465427f, + 0.344491f, -0.525319f, -0.358813f, -0.39767f, 0.0974486f, + 0.00559058f, -0.00546089f, 0.0506486f, 0.114475f, -0.0436463f, + -0.574152f, -0.376294f, 0.16563f, -0.0967032f, 0.00579838f, + 0.0639909f, -0.037129f, 0.407574f, -0.231428f, 0.489326f, + -0.221566f, -0.270382f, -0.784628f, -0.155502f, 0.481698f, + -0.0296057f, 0.431855f, 0.840807f, 0.112291f, 0.773874f, + -0.0610936f, -0.012892f, 0.365154f, 0.0267687f, -0.0751114f, + 0.25043f, 0.516472f, -0.186133f, -0.12762f, -0.168804f, + -0.146309f, 0.139314f, -0.367113f, -0.601079f, 0.0559856f, + 0.176081f, 0.22397f, 0.434113f, 0.0363256f, 0.313051f, + 0.0143976f, 0.190076f, 0.474607f, -0.681134f, -0.0709097f, + -0.253289f, -0.216277f, -0.0593789f, -0.107795f, -0.194842f, + 0.513945f, 0.239171f, -0.720561f, 0.0136723f, -0.391147f, + -0.272043f, -0.164766f, 0.124248f, 0.147178f, -0.35497f, + 0.397725f, -0.117603f, 0.262937f, -0.331964f, 0.182418f, + 0.315671f, -0.0385649f, 0.488769f, -0.334568f, 0.00596018f, + 0.0661557f, -0.0446985f, -0.0928255f, -0.0221032f, -0.019045f, + -0.20881f, 0.197907f, -0.381881f, 0.0598071f, -0.0434551f, + 0.159283f, -0.110631f, 0.266996f, -0.0265494f, 0.135199f, + -0.00833162f, 0.804482f, -0.114698f, -0.15066f, -0.479553f, + 0.448407f, -0.344069f, -0.0280952f, -0.208211f, -0.102269f, + -0.679066f, -0.37476f, -0.0228875f, 0.0535049f, 0.111015f, + -0.18125f, -0.167584f, 0.0110497f, 0.262723f, -0.413839f, + -0.0611238f, 0.358499f, 0.0807514f, 0.208254f, 0.214499f, + 0.11137f, -0.14262f, -0.0513973f, 0.243718f, -0.373716f, + -0.00413366f, 0.216501f, -0.164149f, -0.064935f, -0.0840282f, + 0.0566148f, 0.0377686f, 0.289835f, 0.769388f, 0.891198f, + -0.592739f, 0.40744f, -0.153095f, 0.657311f, 0.140737f, + 0.28209f, 0.158344f, 0.353546f, 0.0868246f, 0.116887f, + 0.402004f, 0.437184f, 0.589219f, 0.760594f, -0.575419f, + -0.754308f, -0.709219f, -0.297814f, -0.418609f, -0.0262104f, + 0.0411959f, 0.0597708f, -0.143728f, -0.136642f, 0.099614f, + -0.257601f, -0.2404f, 0.305893f, 0.254009f, -0.0301398f, + -0.0653091f, -0.459002f, -0.163404f, 0.123152f, -0.0284252f, + -0.457272f, 0.00788622f, -0.828399f, -0.0534199f, 0.586877f, + 0.982728f, 0.424581f, 0.0891856f, 0.383182f, -0.122053f, + 0.0808408f, -0.00384914f, -0.0560201f, -0.0524772f, -0.263444f, + -0.239287f, -0.882777f, 0.0180592f, -0.0948711f, -0.177946f, + 0.0296473f, 0.096082f, 0.0455604f, -0.108608f, 0.00777951f, + -0.140896f, 0.117187f, -0.342467f, -0.0691604f, 0.0761611f, + -0.0892053f, 0.111386f, -0.167456f, 1.40616f, -0.00478793f, + 0.00547665f, -0.0441829f, 0.0151323f, -0.0674099f, -0.0380578f, + 0.16072f, 0.31882f, 0.245486f, -0.424318f, 0.101845f, + -0.203343f, -0.197402f, -0.163025f, -0.0771961f, -0.264435f, + 0.319429f, 0.250076f, 0.782726f, 0.386003f, 0.00700673f, + -0.375715f, 0.151453f, -0.296265f, -0.560183f, -0.00767249f, + -0.109593f, -0.119419f, -0.0161516f, 0.0380283f, -0.156417f, + 0.131708f, 0.396268f, -0.221796f, 0.232099f, 0.128852f, + 0.0567268f, 0.297297f, 0.173269f, 0.213411f, 0.0384426f, + -0.290985f, -0.0426841f, -0.488292f, -0.087101f, -0.311582f, + 0.83009f, -0.153163f, 0.903335f, -1.15644f, -0.0378635f, + -0.0552129f, -0.126362f, 
-0.176945f, 0.0653115f, 0.0989368f, + -0.333543f, -0.330586f, 0.29775f, -0.103535f, 0.210824f, + -0.00300509f, 0.317105f, 0.216852f, 0.479718f, 0.0485808f, + -0.15662f, 0.718199f, 0.327513f, 0.115169f, -0.423598f, + -0.456633f, -0.575814f, -0.494454f, 0.304411f, 0.0493055f, + -0.381171f, 0.467251f, -0.122872f, -0.167441f, 0.017253f, + -0.0583646f, -0.1586f, 0.214046f, -0.0284424f, -0.217112f, + 0.606567f, -0.107533f, 0.36615f, -0.0709227f, 0.604761f, + -0.244657f, -0.296651f, -0.595611f, -0.156629f, -0.693468f, + -0.310603f, 0.499272f, 0.282941f, 0.295043f, -0.178704f, + 0.281186f, 0.014329f, -0.120819f, 0.154234f, 0.0131325f, + -0.472231f, -0.631281f, 0.422955f, 0.711432f, -0.118025f, + 0.0864996f, 0.343971f, -0.301477f, -0.246638f, 0.165068f, + 0.218044f, 0.224236f, -0.0848522f, 0.00671216f, 0.401141f, + -0.218857f, -0.0298495f, -0.135725f, -0.377618f, 0.022473f, + 0.106955f, -0.0582005f, 0.0468484f, -0.0217442f, 0.130911f, + -0.0926905f, 0.383007f, -0.159353f, -0.222711f, -0.0286419f, + 0.372315f, -0.469095f, 0.797571f, -0.301315f, 0.239327f, + -0.997507f, -0.363409f, 0.353717f, 0.676686f, -0.0500028f, + 0.0638539f, -0.431927f, 0.243852f, 0.000884826f, -0.00166585f, + 0.0613292f, -0.029558f, -0.0248432f, -0.0125607f, -0.0309674f, + -0.743308f, 0.0409806f, 0.0921015f, 0.167816f, 0.406849f, + 0.095677f, 0.0308913f, 0.139956f, -0.400472f, 0.396617f, + 0.936517f, 0.355057f, -0.423816f, -0.232472f, -0.220188f, + -0.399746f, -0.409623f, -0.158797f, 0.361153f, 0.0327019f, + 0.0690844f, -0.032197f, 0.0248558f, 0.00438518f, 0.0222724f, + -0.326832f, -0.314295f, 0.156563f, 0.0562703f, 0.332694f, + 0.299424f, 0.228206f, 0.322038f, 0.0136098f, 0.0060297f, + -0.165851f, -0.306512f, 0.0796508f, -0.37158f, 0.239395f, + -0.349442f, 0.198515f, -0.253854f, -1.13694f, 0.0202873f, + -0.0504009f, -0.130528f, -0.017126f, -0.0370001f, -0.087458f, + -0.119952f, -0.130404f, 0.0333733f, -0.184736f, 0.182162f, + 0.227776f, -0.166563f, -0.156162f, 0.118215f, -0.220183f, + 0.00474779f, -0.107792f, 0.260493f, 0.11884f, 0.156587f, + 0.303936f, -0.131788f, -0.314774f, 0.310606f, 0.0935523f, + 0.790767f, 0.26461f, 0.0236426f, 0.0629469f, 0.0344072f, + -0.151513f, 0.211498f, 0.0245435f, 0.0629973f, 0.052019f, + -0.03308f, 0.123487f, 0.0885027f, 0.159172f, -0.0510615f, + 0.0298033f, -0.130515f, -0.121799f, -0.104915f, 0.208822f, + -0.310496f, -0.314106f, 0.303307f, -0.0196736f, 0.0420045f, + 0.461777f, -0.433699f, 0.00345407f, 0.703139f, -0.655637f, + -0.210767f, -0.201278f, 0.163694f, -0.236534f, 0.300877f, + 0.0769982f, -0.282453f, 0.149721f, -0.0303466f, -0.191473f, + -0.406056f, -0.213472f, 0.1619f, -0.245953f, 0.00544399f, + -0.121434f, 0.193012f, -0.307165f, 1.45431f, -0.161468f, + -0.12444f, -0.146129f, -0.0528212f, -0.0925165f, -0.134528f, + -0.479475f, 0.315525f, 0.133845f, 0.382158f, -0.0799693f, + -0.151041f, 0.255772f, 0.409536f, -0.240663f, -0.323741f, + -0.205876f, 0.03699f, -0.217541f, 0.108511f, 0.640628f, + 0.705993f, -0.423899f, -0.78314f, -0.100733f, -0.00859087f, + 0.0251879f, 0.0458335f, 0.00210128f, -0.047576f, -0.0560518f, + -1.23869f, -0.829914f, 0.0346551f, 0.350505f, 0.193688f, + 0.459154f, 0.137898f, 0.503818f, 0.260867f, 0.649539f, + 0.0150802f, 0.0239274f, -0.276069f, -0.0621478f, -0.193106f, + -0.0375665f, -0.654529f, 0.189493f, 0.446625f, -0.0208265f, + 0.019838f, -0.0201955f, 0.00180428f, -0.0110678f, -0.0172414f, + 0.0276489f, -0.252882f, -0.0351807f, -0.0518874f, 0.279098f, + -0.245122f, 0.101287f, -0.114202f, -0.0812187f, 0.572429f, + -0.0821731f, 0.564183f, 0.0222552f, 0.190111f, 
-0.0417497f, + -0.00385925f, -0.182995f, -0.240482f, -0.291572f, -0.0450444f, + 0.0962974f, -0.165973f, -0.0954637f, -0.163841f, -0.833405f, + -1.31541f, -0.336473f, -0.0920702f, 0.816105f, 0.393377f, + 0.0340241f, -0.0844545f, 0.61729f, -0.17596f, 0.241149f, + -0.42825f, -0.59091f, -0.290702f, 0.0796465f, 0.0982819f, + 0.466934f, 0.261666f, 0.0373333f, 0.332509f, -0.0266694f, + -0.0476951f, -0.00642167f, -0.0132542f, -0.000320841f, 0.00475532f, + 0.000502778f, 0.296534f, -0.13297f, -0.113082f, -0.327923f, + 0.35901f, -0.302246f, 0.189799f, -0.37994f, 0.16107f, + -0.20414f, 0.548575f, -0.460821f, 0.591878f, -0.213113f, + -0.169373f, -0.07332f, 0.228841f, 0.682302f, -0.0665316f, + -0.142456f, -0.0873117f, 0.00607451f, 0.0376443f, 0.0536673f, + -0.0109536f, -0.400279f, 0.550058f, 0.820871f, -0.666373f, + -0.471962f, -0.315925f, -0.313142f, 0.952742f, 0.473928f, + -0.119006f, 0.153241f, -0.0383078f, 0.631869f, -0.343423f, + -0.233473f, -0.218195f, -0.077688f, -0.728291f, 0.0382408f, + -0.00662886f, -0.0419666f, 0.0309776f, -0.0281592f, 0.0154229f, + -0.198534f, 0.0206324f, 0.0152272f, -0.235067f, 0.0330486f, + 0.139198f, -0.0612118f, 0.133154f, -0.258675f, 0.0900275f, + -0.127771f, 0.157322f, -0.00767807f, -0.329258f, 0.327458f, + 0.0528581f, -0.181125f, 0.409995f, -0.162979f, -0.0193475f, + 0.186009f, 0.0519501f, 0.651877f, -0.37821f, -1.10341f, + -0.189776f, -0.0922788f, 0.460256f, 0.168011f, 0.440295f, + 0.478135f, 0.374573f, 0.384048f, 0.116953f, 0.68886f, + -0.427727f, -0.36676f, -0.500013f, -0.228685f, -0.218859f, + 0.208396f, -0.0173765f, -0.0680241f, -0.00538013f, -0.0674409f, + -0.092764f, 0.0295707f, -0.0462887f, -0.00636006f, 0.0334169f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_32[] = { + 0.176459f, 0.154405f, 0.281821f, 0.375264f, -0.882863f, + -0.240261f, -1.17075f, -0.280216f, -0.743836f, -0.317511f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_32 = { + NUM_FEATURES_32, + NUM_LOGITS_32, + NUM_HIDDEN_LAYERS_32, + { + NUM_LAYER_0_UNITS_32, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_32, + av1_simple_motion_search_prune_rect_logits_kernel_32, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_32, + av1_simple_motion_search_prune_rect_logits_bias_32, + }, +}; + +#undef NUM_HIDDEN_LAYERS_32 +#undef NUM_FEATURES_32 +#undef NUM_LAYER_0_UNITS_32 +#undef NUM_LOGITS_32 + +#define NUM_HIDDEN_LAYERS_16 1 +#define NUM_FEATURES_16 25 +#define NUM_LAYER_0_UNITS_16 32 +#define NUM_LOGITS_16 10 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_16[] = { + -0.520913f, 0.395611f, 0.0369091f, -0.318591f, -0.463252f, + 0.134992f, -0.43154f, -0.0739112f, -0.118817f, 0.476373f, + -0.281406f, 0.3413f, 0.456255f, 0.33307f, 0.2942f, + 0.1317f, 0.498113f, 1.95406f, -0.165726f, -0.219306f, + -0.302656f, -1.31157f, -0.433662f, 0.151716f, -0.214817f, + 0.504523f, -0.710049f, 0.359616f, -0.412695f, -0.103193f, + 0.341912f, 0.351378f, -0.181486f, 0.573862f, -0.0396254f, + -0.17855f, -0.276163f, 0.0367465f, -0.353905f, -0.204689f, + 0.309581f, -0.0439686f, -0.147855f, 0.152745f, 0.290871f, + 0.131049f, -0.27808f, -0.142997f, 0.207843f, -1.23074f, + -0.267714f, -0.336923f, 0.313781f, -0.61488f, -0.161984f, + 0.238059f, -0.0879942f, -0.085543f, -0.260156f, -0.13614f, + -0.242196f, 0.201216f, -0.248691f, 0.0936671f, -0.350522f, + -0.35002f, -0.156583f, -0.00579001f, 0.300578f, -0.341269f, + -0.290712f, 0.354802f, -0.31629f, 0.509107f, -0.236953f, + -0.0923519f, 0.544509f, -0.280991f, -0.017437f, 
-0.202721f, + -0.116388f, -0.7191f, 0.324586f, 0.254249f, 0.125505f, + 0.00658697f, -0.333322f, -0.126537f, -0.140004f, -0.0241202f, + -0.172466f, 0.210035f, -0.270833f, 0.0579044f, 0.0950352f, + -0.120382f, 0.063292f, -0.394925f, 0.482165f, 0.147753f, + 0.331465f, -0.187444f, 0.1083f, 0.414028f, 0.279238f, + -0.486889f, -0.674349f, -0.313656f, -0.131186f, -0.100662f, + 0.238191f, -1.19083f, -0.30667f, -2.4324f, 0.235311f, + 0.108605f, 1.67197f, 0.476157f, 0.30055f, 0.0839538f, + 0.408469f, -0.473517f, 0.560283f, -0.0188136f, 0.273824f, + -0.43707f, -0.0346978f, -0.438315f, -0.0196275f, -0.0567921f, + -0.220166f, 0.216175f, -0.0180461f, 0.0116429f, -0.0096949f, + -0.32613f, 0.176829f, -0.243563f, -0.240972f, -0.621819f, + -0.00619648f, -0.145525f, 0.124324f, -0.0306925f, 0.172208f, + -2.04631f, -0.200087f, -0.594135f, -0.352303f, -0.309826f, + 0.0922786f, -0.698371f, -0.0366823f, 0.0244036f, 0.338775f, + -0.115947f, 0.144971f, -0.0607037f, -0.762412f, 0.0125584f, + -0.262427f, -0.0830273f, -0.291252f, -0.176059f, -0.203983f, + 0.0871455f, -0.0894925f, 0.0426263f, -0.060001f, -0.542355f, + -0.407837f, -0.0419273f, 0.226608f, -0.114844f, 0.158733f, + -0.187237f, 0.113163f, -1.86337f, -0.367544f, -0.547048f, + -0.24192f, -0.226764f, 0.090912f, 0.819604f, 0.433766f, + -0.841657f, 0.446987f, -0.622761f, -0.0296385f, -0.130176f, + -0.0518136f, -0.640326f, -0.330107f, -0.137832f, -0.0119033f, + 0.39401f, 0.111331f, -0.141367f, -0.230289f, 0.171054f, + -0.924059f, -0.107317f, -0.347983f, 0.0261109f, 0.423002f, + -0.305817f, 0.247696f, 0.0436002f, 0.0305862f, -1.52448f, + -0.595587f, -0.155552f, -1.11949f, -0.513937f, 0.138347f, + -0.301487f, 0.352144f, -0.615801f, 0.0326701f, -0.215322f, + -0.0608176f, -0.416557f, -0.306073f, -0.441512f, -0.0569277f, + -0.709768f, -0.602527f, -0.311134f, 0.152471f, -0.255299f, + 0.354505f, 0.194464f, 0.0144251f, 0.110732f, -0.4452f, + -0.804814f, 0.205325f, -0.0957486f, 0.502684f, 0.09112f, + -0.533087f, -1.77979f, 0.556992f, -0.176157f, -0.642633f, + 0.11553f, -0.232561f, 0.161277f, -0.0631125f, -0.20759f, + 0.489253f, -0.067533f, 0.0231024f, -0.179831f, -0.272985f, + -0.390059f, 0.3089f, 0.185733f, -0.257065f, -0.508838f, + -0.550028f, 0.0665621f, -0.138288f, -0.413188f, 0.191193f, + -1.32969f, -0.431025f, 0.270242f, -0.340062f, 0.0817257f, + 0.0376051f, -0.18633f, 0.0828274f, 0.00670051f, -0.431295f, + -0.450316f, -0.173042f, -0.322248f, 0.370628f, 0.10019f, + 0.317293f, -0.266613f, 0.0752441f, -0.425656f, -0.112223f, + 0.557991f, -0.324368f, -0.195261f, -0.0526129f, -0.807472f, + -0.387466f, 0.192186f, 0.353213f, -0.120238f, 0.107686f, + 0.200678f, -0.75363f, 0.466857f, -0.282345f, -0.0849236f, + -0.0490695f, -0.00643182f, 0.123047f, -0.207805f, -0.130456f, + -1.09455f, 0.340973f, 0.334784f, 0.0706643f, -1.65681f, + -0.319952f, -0.198514f, -0.0787972f, 0.089524f, 0.0531034f, + -0.202705f, -0.0852339f, -0.62572f, -0.0734234f, -0.838088f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_16[] = { + -0.0616197f, 0.939947f, 0.521161f, 0.213886f, 0.130324f, -0.127443f, + -0.0538715f, 0.708746f, 0.445031f, 0.418781f, -0.114539f, 0.521941f, + 1.13719f, 0.606545f, -0.32193f, -0.150788f, 0.158487f, -0.224005f, + 0.654715f, 0.115729f, -0.286506f, -2.06223f, 0.0117697f, 0.503905f, + -0.102339f, 0.653256f, -0.813561f, 0.905235f, -0.417269f, -0.206265f, + 0.661496f, 0.95533f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_16[] = { + -0.203489f, 0.00686229f, -0.161414f, 0.0637276f, 0.27516f, + 0.512219f, 0.164205f, 
0.00326062f, -0.41914f, -0.400334f, + 0.554419f, 0.715772f, -0.295569f, -0.703503f, 0.0137744f, + -0.0934259f, 0.174234f, -0.148618f, -0.0360558f, -0.0986598f, + -0.138502f, -0.0770713f, 0.122922f, -0.00784415f, 0.0953234f, + -0.255754f, -0.310967f, 0.185306f, 0.464554f, 0.147338f, + -0.0612304f, 0.164783f, 0.301097f, 0.161364f, -0.12723f, + -0.0265984f, -0.471361f, 0.0578776f, -0.362865f, 0.425789f, + 0.402758f, -0.190235f, 0.00549738f, -0.570908f, 1.27206f, + 0.048868f, -0.0097675f, 0.0708324f, 0.0456103f, 0.0149062f, + -0.563032f, -0.420573f, 0.107278f, 0.0938258f, 0.142712f, + -0.00251036f, -0.250583f, 0.522272f, 0.0113175f, 0.126751f, + -0.433028f, -0.035542f, -0.536686f, -0.0668722f, 0.253094f, + 0.254007f, -0.435505f, 0.343001f, 0.0531542f, -0.361914f, + -0.102664f, 0.0404874f, 0.132686f, 0.0762298f, 0.0236971f, + -0.419454f, 0.230877f, -0.223714f, 0.037813f, 0.0818604f, + 0.383705f, -0.235028f, -0.0554801f, 0.429851f, 0.0845829f, + 0.166295f, 0.355111f, -0.421197f, 0.298949f, 0.0218224f, + 0.445705f, -0.392217f, -0.429578f, -0.076276f, -0.0963531f, + -0.631425f, -0.225977f, 8.06349e-06f, 0.0676679f, 0.0779651f, + 0.0706891f, 0.101377f, 0.517103f, 0.0945502f, -0.52522f, + -0.312022f, 0.0358089f, 0.616509f, -0.0507444f, -0.465814f, + -0.0326024f, 0.591298f, 0.188544f, -0.0633316f, -0.199987f, + 0.403118f, -0.511281f, -0.696263f, 0.112996f, 0.103875f, + 0.0495595f, -0.0107449f, 0.521539f, -0.0123823f, -0.0642751f, + 0.08548f, -0.0679207f, 0.526558f, 0.0651114f, -0.342643f, + -0.349934f, 0.307437f, 0.368763f, -0.194851f, -0.134117f, + 0.102448f, -0.0520666f, 0.0415824f, -0.175085f, 0.272685f, + 0.0675856f, 0.120627f, 0.391408f, -0.135249f, -0.357024f, + 0.019666f, -0.0622677f, 0.407427f, 0.22655f, -0.129432f, + -0.165327f, 0.004893f, 0.5479f, 0.0613981f, -0.479682f, + -0.144228f, -0.130106f, 0.206458f, -0.342086f, 0.12691f, + -0.113554f, 0.231164f, -0.051419f, 0.0401286f, -0.560429f, + -0.070609f, 0.420232f, 0.442465f, -0.237501f, -0.000293732f, + -1.017f, -0.210222f, 0.0157063f, 0.0488178f, 0.0734721f, + -0.52626f, -0.276441f, -0.521579f, 0.443532f, -0.0819051f, + -0.0732633f, -0.17999f, 0.258525f, -0.0374872f, 0.150115f, + 0.0510939f, 0.168116f, 0.473372f, 0.824489f, 0.302195f, + -0.348613f, 0.238569f, 0.176444f, -0.633945f, -0.0567195f, + -0.0305827f, -0.0551851f, 0.85822f, -0.0628099f, 0.0364294f, + -0.234823f, 0.179067f, 0.143208f, -0.0511014f, -0.404191f, + 0.428035f, 0.0235506f, 0.371991f, -0.312909f, 0.550933f, + -0.389265f, -0.271813f, -0.293461f, -0.583752f, 0.179991f, + 0.191698f, 0.659094f, 1.07941f, -0.509555f, -0.100638f, + 0.079988f, -0.0519107f, -0.112723f, -0.0663326f, 0.0353569f, + -0.795055f, -0.465999f, 0.283579f, 0.340913f, 0.152738f, + 0.294664f, 0.527839f, 0.187735f, 0.359461f, 0.164629f, + 0.107512f, 0.390402f, 0.236702f, 0.114674f, -0.525655f, + -0.555476f, -0.6589f, -0.266601f, -0.0946547f, 0.6306f, + 0.0248513f, 0.038497f, 0.432706f, -0.0715465f, 0.0410172f, + -0.115313f, -0.428684f, 0.136283f, 0.0913185f, 0.11277f, + 0.0968689f, -0.00437052f, 0.0888981f, 0.10304f, 0.02442f, + -0.211315f, 0.00981596f, -0.0974827f, 0.208611f, 0.140644f, + 0.0315567f, 0.350332f, -0.291049f, -0.0715449f, -0.352992f, + -0.858004f, 0.828658f, 0.439092f, 0.0151291f, 0.0503828f, + 0.0656112f, -0.710749f, -0.0951757f, 0.193908f, 0.00908018f, + 0.141486f, -0.0657711f, 0.099791f, 0.153729f, -0.419576f, + -0.892636f, -0.0449268f, -0.170786f, -0.156564f, 0.384511f, + 0.296565f, 0.0569815f, -0.103938f, 1.27479f, -0.0406475f, + 0.154083f, -0.186442f, 0.0282588f, 0.0312102f, -0.188994f, + 
0.284243f, -0.564693f, 0.425525f, -0.00924596f, 0.810003f, + 0.233812f, -0.0180273f, 0.121082f, -0.209096f, 0.151437f, + 0.286921f, -0.348095f, 0.174813f, -0.413798f, 0.108994f, + -0.34266f, -0.0337981f, -0.459f, -0.409812f, -0.0890104f, + 0.0834802f, -0.00259191f, -0.105914f, -0.164207f, 0.0697689f, + -0.312098f, -0.00650536f, -0.486758f, -0.248486f, 0.24314f, + -0.0857144f, 0.0884781f, -0.65615f, -0.121744f, 0.0709335f, + -0.0237193f, 0.10764f, -0.0409452f, -0.0824305f, 0.42329f, + 0.138258f, 0.502607f, 0.228545f, 0.0687789f, 0.0361586f, + 0.39074f, 0.0722654f, -0.0133148f, 0.283278f, 0.0743384f, + 0.310292f, -0.297675f, -0.359935f, 0.521021f, -0.10082f, + -0.272333f, 0.0120283f, 0.138118f, -0.123711f, -0.0711386f, + 0.0170747f, 0.831039f, 0.0509626f, 0.790608f, -0.0863406f, + -0.31962f, 0.0631013f, 0.0873453f, -0.472331f, -0.0826027f, + -0.241722f, 0.148835f, -0.131611f, 0.000195347f, -0.0615804f, + -0.838663f, -0.586979f, 0.247713f, 0.362254f, 0.492727f, + -0.132163f, 0.0516545f, 0.477838f, -0.0395182f, 0.0124993f, + -0.771514f, 0.0386912f, -0.118525f, -0.346172f, -0.265905f, + -0.175257f, -0.406287f, 0.393837f, 0.409096f, -0.408501f, + -0.0207146f, 0.0487809f, 0.0636982f, 0.0276368f, 0.0878249f, + 0.0425889f, 0.0868633f, 0.17423f, -0.128217f, -0.477068f, + -0.321294f, 0.0393771f, 0.00812823f, -0.350529f, -0.129012f, + 0.439953f, 0.396662f, 0.410475f, -0.123129f, -0.565966f, + 0.0298635f, -0.614611f, -0.477514f, 0.453651f, 0.0617068f, + 0.0530563f, 0.0479074f, 0.213551f, 0.039034f, 0.0449095f, + -1.06868f, -1.2654f, -0.175482f, 0.595068f, -0.230095f, + 0.719838f, -0.272148f, 0.696564f, 0.0485396f, 0.468584f, + 0.0695439f, -0.0842122f, -0.228978f, 0.161397f, -0.000441421f, + -0.0297514f, -0.250599f, 0.196656f, 0.608423f, -0.0112096f, + 0.0236881f, -0.00167311f, 0.0040709f, 0.015495f, 0.00757698f, + -0.165886f, 0.359767f, -0.0214696f, 0.377208f, 0.0303547f, + 0.0657094f, 0.140775f, 0.21867f, -0.203922f, 0.263878f, + -0.0529099f, 0.202438f, -0.243226f, 0.156659f, -0.627056f, + -0.845036f, -0.500873f, 0.172588f, 0.402972f, -0.147734f, + 0.151792f, -0.075579f, 0.443519f, 0.0311335f, -0.0328222f, + -0.0299781f, 0.435956f, -0.0987376f, 0.288402f, 0.135902f, + -0.173584f, -0.186255f, 0.224524f, -0.249645f, 0.123702f, + -0.0846244f, 0.491317f, 0.544846f, 0.338677f, -0.258885f, + -0.617434f, -0.629003f, -0.347233f, 0.181262f, -0.0606015f, + -0.537766f, 0.215089f, -0.334527f, 0.0488534f, 0.0577997f, + -1.12431f, -0.932292f, -0.11559f, 0.573715f, 0.151128f, + 0.693818f, -0.16956f, 0.802591f, -0.231531f, 1.04318f, + -0.476417f, 0.293452f, -0.610136f, 0.27506f, -0.384012f, + 0.305366f, -0.0540464f, -0.337583f, -0.174285f, 0.157248f, + 0.0477345f, -0.0229535f, 0.0475766f, -0.00603319f, 0.00856119f, + -0.702893f, -0.0579673f, 0.183024f, -0.166222f, 0.109763f, + -0.148019f, -0.258873f, -0.0820157f, -0.186716f, -0.449265f, + -0.0534138f, 0.15732f, 0.46357f, 0.00502591f, -0.0282085f, + 0.152277f, -0.855199f, -0.357115f, 0.0366159f, 0.0131101f, + -0.0407758f, 0.0462835f, 0.146309f, -0.00276278f, -0.0591814f, + -0.109437f, 0.506764f, -0.044421f, 0.465907f, 0.114444f, + -0.241053f, -0.362649f, -0.432615f, 0.199989f, -0.00635866f, + -0.521886f, 0.0958924f, -0.485725f, 0.0430527f, 0.069746f, + 0.681091f, -0.288144f, 0.505671f, 0.0489065f, -0.0373836f, + 0.266079f, 0.145173f, -0.011481f, -0.225074f, -0.754501f, + -0.122939f, -0.294213f, 0.334738f, 0.281561f, 0.558977f, + -0.21551f, -0.346507f, -0.0625635f, 0.0782034f, -0.236999f, + -0.803783f, -0.601117f, 0.091192f, 0.636122f, -0.250626f, + 0.0354961f, 
0.103915f, 0.508571f, 0.329911f, -0.0425999f, + -0.0867587f, -0.0385824f, 1.13914f, -0.0261992f, 0.00484478f, + 0.124603f, -0.012173f, -0.377358f, -0.243563f, 0.236094f, + 0.145663f, -0.132752f, 0.347497f, -0.529315f, 0.271632f, + -0.372805f, 0.0261836f, 0.126169f, 0.0941008f, 0.283773f, + 0.765701f, -0.226477f, -0.181549f, -0.306896f, 0.110165f, + -0.0784234f, -0.0827892f, -0.0374252f, -0.0950872f, -0.451015f, + -0.995793f, -0.452663f, 0.293338f, -0.380865f, 0.032683f, + 0.0178248f, 0.0699194f, -0.0811722f, -0.0866096f, 0.139289f, + 0.296604f, 0.192293f, -0.0589607f, -0.179878f, 0.00360266f, + -0.0905794f, 0.136744f, -0.191555f, 1.31877f, -0.0592033f, + -0.158766f, 0.0214746f, -0.190113f, -0.116671f, 0.0449292f, + -0.109533f, -0.709307f, 0.386424f, 0.40201f, 0.262211f, + -0.155244f, 0.233988f, -0.0166317f, 0.462665f, 0.0484462f, + 0.210902f, -0.352798f, 0.38698f, -0.228261f, -0.084309f, + -0.220751f, -0.170879f, -0.352617f, -1.24277f, 0.266004f, + -0.0125749f, -0.0380073f, 0.101838f, -0.0483024f, -0.0629178f, + -0.0695577f, -0.103439f, 0.242131f, -0.0796858f, 0.349718f, + -0.332045f, 0.0138352f, -0.380235f, -0.28717f, -0.176276f, + 0.865903f, 0.36593f, 0.243925f, -0.422289f, -0.117327f, + 0.21876f, 0.245393f, -0.426134f, -0.186077f, 0.0352515f, + -0.123742f, 0.249376f, 1.3281f, 0.0707771f, 0.071415f, + -0.286827f, -0.131691f, -0.270881f, -0.434378f, 0.376064f, + 0.35966f, 0.513374f, 0.439378f, -0.222716f, -0.5874f, + 0.487997f, -0.293271f, -0.184245f, -0.037256f, 0.17723f, + -0.438651f, 0.428184f, 0.112983f, -0.449287f, -0.0451963f, + 0.0854929f, 0.0735442f, -0.0148642f, -0.0586782f, -0.176455f, + -0.438979f, -0.127109f, 0.211478f, 0.388035f, -0.0372021f, + 0.220575f, 0.382144f, 0.302121f, 0.0857121f, 0.193445f, + -0.488858f, -0.195288f, -0.316184f, -0.314026f, -0.111956f, + 0.0744768f, 0.292709f, 0.30187f, -0.285506f, -0.105006f, + 0.0851402f, -0.082318f, 0.277518f, 0.725294f, -0.756304f, + 0.0155309f, -0.378542f, 0.293377f, -0.347252f, -0.338458f, + 0.221449f, -0.176443f, -0.131972f, 0.0129163f, -0.290649f, + 0.198596f, -0.0721333f, 0.620591f, 0.568736f, 0.174001f, + -0.205186f, -0.265606f, -0.249155f, 0.299163f, 1.11842f, + 0.17423f, 0.196417f, -0.014484f, 0.0735422f, 0.26329f, + 0.12284f, -0.750305f, -0.351337f, 0.121994f, -0.00542878f, + -0.295707f, -0.094124f, 0.300993f, 0.412408f, -0.170761f, + -0.0676329f, -0.106638f, -0.419785f, -0.43878f, 0.22421f, + 0.0339903f, 0.619851f, 0.0615381f, 0.514631f, 1.35424f, + -0.0679228f, -0.203457f, 0.131948f, -0.0041251f, -0.209054f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_16[] = { + 0.304025f, 0.131887f, 0.259279f, -0.561564f, -0.161729f, + -0.208036f, 0.102206f, -0.162937f, -1.42311f, -0.708305f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_16 = { + NUM_FEATURES_16, + NUM_LOGITS_16, + NUM_HIDDEN_LAYERS_16, + { + NUM_LAYER_0_UNITS_16, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_16, + av1_simple_motion_search_prune_rect_logits_kernel_16, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_16, + av1_simple_motion_search_prune_rect_logits_bias_16, + }, +}; + +#undef NUM_HIDDEN_LAYERS_16 +#undef NUM_FEATURES_16 +#undef NUM_LAYER_0_UNITS_16 +#undef NUM_LOGITS_16 + +#define NUM_HIDDEN_LAYERS_8 1 +#define NUM_FEATURES_8 25 +#define NUM_LAYER_0_UNITS_8 32 +#define NUM_LOGITS_8 4 + +static const float av1_simple_motion_search_prune_rect_logits_kernel_8[] = { + -0.266303f, -0.387676f, 0.204501f, -0.120842f, -0.0752326f, 0.0337739f, + 0.0243477f, -0.356748f, 
0.0143051f, -0.16403f, -0.139013f, 0.175003f, + -0.206754f, 0.349059f, 0.181763f, 0.212768f, -0.313783f, 0.182829f, + 0.00205376f, -0.939525f, -0.0992424f, 0.306254f, 0.083329f, -0.133137f, + -0.179022f, -0.0237902f, 0.0601026f, -0.216698f, -0.551149f, 0.081711f, + -0.442191f, 0.0680832f, -0.0353678f, 0.237704f, 0.23155f, -0.36097f, + 0.123389f, -0.288927f, 0.178133f, -0.152222f, -0.235648f, -0.0495293f, + -0.316522f, 0.034207f, 0.0463139f, -0.817825f, 0.417443f, -0.110984f, + -0.402371f, 0.0341694f, -0.37383f, 0.414532f, 0.093993f, 0.0039505f, + 0.0803175f, -0.511859f, -0.0154802f, 0.0979595f, 0.0909049f, -0.120938f, + -0.577382f, -0.155041f, -0.404295f, 0.122223f, -0.084703f, 0.00415336f, + 0.149135f, 0.113219f, 0.124236f, -0.240905f, 0.163909f, -0.154202f, + -0.208917f, 0.00200158f, -0.71796f, 0.105984f, -0.131996f, -0.539603f, + 0.223768f, -0.0710733f, -0.346679f, -0.0745909f, 0.171032f, 0.215701f, + 0.218519f, 0.105981f, -0.096209f, -0.166453f, -0.468894f, -0.401578f, + -0.239222f, 0.111382f, 0.38747f, -0.164734f, -0.175955f, 0.336621f, + -0.0305501f, -0.0576765f, 0.0672671f, -0.183692f, 0.412082f, -0.262951f, + -0.153429f, -0.128589f, -0.530472f, 0.0936412f, -1.08296f, -0.45147f, + 0.0714904f, -3.96842f, 0.438125f, -0.313945f, 0.231104f, -0.00183851f, + -0.0192768f, -0.637531f, -0.109296f, 0.0531702f, 0.00262162f, -0.615951f, + -0.546241f, -0.635305f, -0.0762367f, 0.0122019f, 0.423693f, -0.129142f, + -0.112242f, 0.295184f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_bias_8[] = { + -2.16023f, -3.12831f, -0.213206f, -2.97875f, -1.83791f, -2.84713f, + -0.909636f, -2.05893f, 0.00525274f, -1.51672f, -3.95017f, 1.82847f, + -0.853224f, -3.29503f, -0.537517f, 0.923106f, -3.18665f, -1.29905f, + 1.64506f, -1.99848f, -2.24315f, 0.408613f, 0.503671f, -3.83393f, + -2.88388f, -3.52337f, 1.46818f, -1.67169f, -3.83253f, 1.52644f, + -0.490783f, -0.415782f +}; + +static const float av1_simple_motion_search_prune_rect_layer_0_kernel_8[] = { + -0.702198f, -0.102148f, 0.0564545f, -0.0555548f, 0.16184f, + 0.0950792f, 0.136974f, -0.00824146f, 0.05746f, 0.0447542f, + 0.145978f, 0.0855769f, -0.041449f, 0.301347f, -0.0206691f, + -0.0662514f, -0.0525079f, -0.0998387f, -0.0891438f, 0.110545f, + -0.863098f, -1.83798f, 0.238818f, 0.127797f, 0.116872f, + -0.270655f, -0.21057f, 0.197013f, -0.123332f, 0.137104f, + -0.174766f, -0.00803025f, 0.0234369f, -0.0894175f, -0.0380927f, + 0.00827928f, -0.134148f, 0.110575f, -0.250173f, 0.116273f, + 0.0197749f, 0.270391f, 0.108437f, 0.173197f, -0.0650348f, + 0.0884626f, 0.262792f, 0.0649228f, 0.5573f, -2.81315f, + -0.479801f, -1.15825f, 0.0807932f, -0.19144f, 0.404016f, + -0.211521f, 0.233269f, -0.391414f, 0.160381f, -0.277233f, + 0.426354f, 0.156839f, 0.494315f, -0.214259f, -0.0132062f, + 0.148628f, -0.0899568f, 0.161845f, 0.467689f, 0.229474f, + 0.590634f, -0.705793f, -0.0486113f, -0.439088f, 0.994566f, + 0.679065f, 0.777869f, -0.225291f, -0.0303006f, -0.638782f, + -0.0824632f, -0.128561f, -0.327603f, 0.105624f, 0.567581f, + -0.396135f, -0.471028f, 0.181286f, 0.274604f, 0.180169f, + 0.0612144f, -0.865004f, 0.0306804f, 0.142985f, -0.0914358f, + -0.243284f, 0.358359f, -0.443847f, -0.371978f, 0.606933f, + -0.900408f, -0.52076f, 0.472118f, 0.0610973f, 0.152526f, + -0.550379f, 0.309331f, -0.141573f, 0.203046f, -0.231485f, + 0.505156f, 0.393224f, 0.435487f, -0.218681f, 0.123707f, + -0.270383f, -0.033565f, 0.210373f, -2.33967f, 0.367434f, + 0.0308118f, -0.205771f, 0.546141f, 0.19837f, 0.035648f, + -0.467007f, -1.50995f, -0.0314176f, 0.11762f, -0.15307f, + 
0.618257f, -0.139502f, 0.303386f, -0.00758681f, 0.228107f, + -0.594499f, -0.201984f, -0.239666f, 0.114878f, -0.922174f, + -0.530137f, -0.379366f, -0.319582f, 0.0889624f, -0.00544663f, + 0.316264f, -0.204262f, -0.0959358f, 0.23552f, 0.141369f, + -0.207129f, -1.04067f, -0.0780501f, 0.226768f, -0.246752f, + 0.0823105f, 0.114783f, 0.49315f, 0.0197732f, 0.705433f, + 0.158076f, -0.250584f, -0.157326f, -0.0439547f, -0.139047f, + 0.090531f, -0.38833f, 0.743143f, -1.47418f, -0.155009f, + 0.511466f, -0.726716f, -0.181075f, 0.450133f, -0.390204f, + 0.292725f, 0.00811462f, -0.347738f, 0.613381f, -0.237124f, + 0.750748f, -0.383123f, 0.410309f, -0.204166f, 0.667199f, + -0.313197f, 0.436059f, -0.607571f, 0.193681f, 0.409399f, + 0.631747f, -0.0454149f, 0.198232f, 0.345591f, -0.0137374f, + -0.307014f, -0.535515f, 0.764678f, -0.225686f, -0.451621f, + -2.75564f, -1.52877f, 0.0511933f, 0.905979f, 0.145029f, + 0.759615f, 0.130166f, 0.83827f, 0.0655081f, 1.07555f, + -0.529777f, 0.682967f, -0.412052f, 0.611947f, -0.83676f, + 0.940695f, -0.465681f, 0.51505f, -0.883659f, -0.105524f, + -0.0344173f, -0.0683618f, -0.00698688f, -0.139349f, 0.135741f, + -0.294455f, -0.377834f, -0.602084f, -1.00128f, 0.483291f, + 1.25327f, 0.178987f, 0.75068f, -0.520731f, -0.325517f, + 0.272032f, 0.144144f, -0.279453f, 0.564907f, 0.144036f, + 0.297448f, -0.504243f, -0.250508f, -1.26395f, 0.4816f, + 0.392771f, -0.389961f, -0.261585f, -0.127124f, -0.202945f, + -0.709716f, -0.174719f, 0.113613f, 0.477753f, -0.226659f, + 0.0697828f, -0.177994f, 0.300726f, -0.185504f, 0.339424f, + -0.316746f, 0.369693f, -0.339723f, -0.143886f, -0.0326589f, + -0.268761f, -0.241094f, 0.284876f, -0.0270867f, -0.207397f, + -1.42738f, 0.495612f, -0.0277732f, 0.199675f, 1.48638f, + -0.659257f, -1.28199f, 0.498702f, 0.140695f, 0.571152f, + 0.416368f, 0.14153f, 0.126876f, 0.521114f, -0.00150571f, + 0.375581f, 0.00537624f, 0.1286f, -0.332227f, 0.417663f, + -0.539023f, 0.217124f, -0.787111f, -0.0335266f, 1.56751f, + 0.0640563f, -0.158791f, 0.118195f, 0.000970493f, -0.0403852f, + -0.0572557f, -0.0201181f, -0.10255f, 0.63237f, 0.156662f, + 0.418696f, -0.274802f, -0.663923f, -0.375232f, -0.40846f, + 0.462092f, 1.2176f, -0.301532f, -0.779704f, -0.112876f, + 0.0806591f, -0.0141923f, 0.00960801f, -0.663557f, 0.0979948f, + -0.0575999f, -0.012847f, 0.0403853f, -0.133666f, -0.00330217f, + -0.931518f, -0.774599f, -0.21391f, 0.377601f, -0.183365f, + 0.299094f, 0.0238552f, 0.206716f, -0.18959f, 0.346013f, + -0.150991f, -0.192817f, -0.293962f, -0.0537604f, -0.0648171f, + -0.275941f, -0.144854f, -0.224092f, 2.43113f, 0.0422494f, + -0.047236f, -0.0262028f, 0.0282119f, -0.175553f, 0.0888502f, + 0.580682f, 0.951055f, -0.284441f, -0.120133f, -0.268058f, + -0.312083f, -0.411556f, 0.21431f, -0.28033f, 0.324851f, + -1.02787f, -0.936816f, -0.577628f, 0.544743f, 0.295807f, + 0.406157f, 0.447927f, 0.25369f, -0.811421f, -0.0424979f, + -0.189867f, 0.00778673f, -0.113587f, -0.116175f, -0.0542222f, + -1.80089f, -1.44175f, -0.35332f, 0.191314f, -0.236691f, + -0.0261926f, -0.502363f, 0.252278f, -0.485478f, 0.296495f, + 0.455612f, -0.0489631f, 0.227255f, 0.170975f, 0.473487f, + 0.257812f, 0.178048f, 0.2506f, 2.04637f, -0.173857f, + 0.0583379f, 0.00765589f, -0.025772f, -0.162666f, -0.016214f, + -0.607486f, -0.0808025f, 0.0551611f, -0.0772291f, 0.126421f, + 0.10869f, -0.0877463f, -0.111527f, -0.0775766f, 0.503886f, + -0.002757f, -0.0421354f, -0.247857f, 0.140827f, 0.383576f, + 0.228232f, -0.157877f, -0.0927911f, 0.344687f, 0.191181f, + 0.236533f, 0.00102869f, -0.0184502f, -1.4509f, -1.15945f, + 
-0.521978f, -0.643225f, 0.133139f, 0.0660321f, 0.0851957f, + 0.0303648f, 0.0296239f, 0.0455713f, 0.175647f, 0.080532f, + 0.0445691f, -0.257356f, -0.125602f, -0.138829f, -0.167057f, + -0.0992552f, -0.13944f, 0.507531f, 0.444997f, 0.221452f, + -0.308384f, -0.327554f, 0.13235f, 2.1487f, -1.15453f, + -0.280239f, -0.363582f, -0.00358745f, 0.012866f, 0.251088f, + 0.0676416f, 0.178492f, -0.136631f, 0.197938f, -0.078198f, + 0.812439f, 1.1173f, 0.712113f, 1.10124f, -0.836503f, + -1.22433f, -1.07894f, -1.29215f, 0.56057f, 2.23928f, + -0.419029f, 0.282178f, -0.0719266f, -0.172192f, 0.28034f, + -2.99124f, -2.01481f, 0.0688982f, 0.697466f, 0.00635555f, + 0.566069f, 0.047534f, 0.507755f, -0.00690707f, 0.712594f, + -0.191467f, 0.355733f, -0.480016f, 0.664669f, -0.390619f, + 0.351199f, -0.482342f, 0.325005f, 1.9089f, 0.155987f, + 0.17032f, 0.132729f, 0.0402649f, 0.146991f, 0.0314905f, + -0.775316f, -0.208892f, -0.105993f, 0.0181653f, -0.12735f, + 0.0897852f, 0.0470231f, 0.25807f, 0.127406f, -0.0893252f, + -0.279776f, 0.190844f, 0.110384f, -0.148833f, 0.025293f, + 0.239838f, 0.00932245f, 0.35103f, -0.128268f, -0.0536754f, + 0.506899f, -0.16793f, 0.0955582f, -2.01108f, 0.721433f, + -2.31413f, -2.08646f, 0.033315f, 0.689828f, -0.271213f, + 0.790425f, -0.114234f, 0.755325f, -0.211533f, 0.774544f, + -0.263268f, 0.795762f, -0.551455f, 0.953602f, -0.168454f, + 0.529055f, -0.768991f, 0.882371f, 0.29763f, -0.155017f, + 0.00464101f, 0.121093f, 0.948271f, 0.113138f, -0.110332f, + -2.0492f, -1.31322f, -0.129212f, 0.464778f, -0.181465f, + 0.618403f, 0.0627984f, 0.465228f, 0.165729f, 0.278277f, + -0.563276f, -0.358358f, -0.590638f, 0.0104993f, 0.731206f, + 0.752569f, 0.631615f, 0.811822f, 0.129804f, -0.0558327f, + 0.570081f, -0.417922f, -0.168275f, 0.0703671f, 0.269127f, + 0.240457f, -0.197159f, -0.00179261f, 0.220065f, 0.463511f, + 0.0714626f, -0.716477f, -0.441865f, -0.717028f, -0.149176f, + 0.452182f, 0.662699f, -0.906534f, -0.817133f, 0.237747f, + 0.26024f, -7.7441e-05f, 0.0934616f, 0.824641f, -0.0404494f, + -0.088297f, -0.157899f, 0.037408f, 0.132435f, -0.316155f, + -0.276785f, 0.0117868f, 0.185008f, 0.32369f, -0.465855f, + -0.302127f, 0.303289f, 0.338597f, -0.665408f, -0.507594f, + 0.526979f, 0.532091f, 0.234395f, 0.754063f, 0.116769f, + 0.0800309f, -0.939344f, -1.51269f, 1.4583f, 0.178444f, + 0.0106756f, -0.213468f, -0.00369439f, 0.071015f, -0.192798f, + -0.0933147f, -0.129901f, -0.368279f, -0.246564f, 0.126966f, + 0.478565f, -0.476246f, -0.762863f, 0.168883f, 0.536136f, + -0.272969f, 0.2573f, -0.161577f, 0.311428f, -0.777994f, + -1.29752f, 0.216046f, 0.329016f, 1.57265f, 0.168075f, + -0.192518f, 0.0829308f, -0.073533f, -0.0202034f, 0.114716f, + -0.34888f, -0.519215f, 0.190809f, 0.0138507f, 0.133635f, + 0.14194f, 0.410618f, -0.165106f, 0.214438f, 0.0438265f, + -0.8481f, -1.19182f, -1.07878f, -0.882217f, 0.45616f, + 0.977385f, 0.74929f, 0.918466f, 0.904704f, 0.041938f, + 0.0362776f, 0.0757255f, 1.14007f, 0.0516825f, -0.160068f, + 0.219535f, 0.638634f, -0.0284544f, -0.222849f, -0.0344915f, + -0.0350256f, -0.0504452f, -0.0458416f, 0.146099f, 0.0783083f, + 0.206579f, 0.241264f, 0.28401f, 0.0425312f, -0.802049f, + -0.746271f, -0.578969f, -0.078218f, 0.436176f, -0.281465f, + -2.5539f, 0.237868f, -0.121796f, 0.0715619f, 0.106992f, + -0.621862f, -0.167142f, 0.153716f, 0.0570912f, -0.06525f, + -0.923773f, 0.130759f, 0.0517066f, 0.0729862f, -0.873064f, + 0.0403328f, -0.186499f, -0.0831918f, -0.223723f, 0.144697f, + 0.212845f, 0.416876f, 0.361598f, 0.138229f, 0.0728777f, + -1.95419f, -0.00382816f, -0.0440387f, 0.433627f, 
0.44781f, + -1.05229f, -1.54506f, 0.564827f, -0.263456f, 0.296105f, + -0.158055f, 0.388274f, -0.366639f, 0.212006f, -0.245619f, + 0.593064f, 0.088727f, 0.410632f, -0.263462f, 0.507075f, + -0.0974155f, 0.275268f, -0.1293f, 0.136679f, 1.98276f, + 0.411766f, 0.391987f, 0.34283f, -0.114077f, 0.258462f, + -0.302443f, 0.301138f, -0.00726621f, 0.276441f, -0.291582f, + 0.66498f, -0.321451f, -0.332805f, 0.0943272f, 0.572253f, + -0.45818f, -0.0219593f, -0.151679f, 0.402033f, -1.15502f, + -0.882955f, 0.772904f, 0.88126f, -0.149555f, 0.709525f, + 0.350116f, -0.21531f, 0.797893f, 0.0230234f, 0.0203034f, + 0.2744f, 1.08273f, 0.039349f, 0.503909f, -0.45892f, + -0.579516f, -0.344058f, 0.390628f, -0.386941f, -0.430317f, + -0.0807066f, 0.435906f, 0.522996f, 0.724476f, -0.74371f, + -0.05376f, -0.340898f, -0.962646f, -0.0278005f, 0.0981149f, + -0.0811161f, 0.00237994f, 0.850042f, 0.0665473f, 0.134413f +}; + +static const float av1_simple_motion_search_prune_rect_logits_bias_8[] = { + 1.63404f, -0.715866f, -1.0132f, -2.08745f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_rect_nn_config_8 = { + NUM_FEATURES_8, + NUM_LOGITS_8, + NUM_HIDDEN_LAYERS_8, + { + NUM_LAYER_0_UNITS_8, + }, + { + av1_simple_motion_search_prune_rect_layer_0_kernel_8, + av1_simple_motion_search_prune_rect_logits_kernel_8, + }, + { + av1_simple_motion_search_prune_rect_layer_0_bias_8, + av1_simple_motion_search_prune_rect_logits_bias_8, + }, +}; + +#undef NUM_HIDDEN_LAYERS_8 +#undef NUM_FEATURES_8 +#undef NUM_LAYER_0_UNITS_8 +#undef NUM_LOGITS_8 + +static const NN_CONFIG + *const av1_simple_motion_search_prune_rect_nn_config[5] = { + &av1_simple_motion_search_prune_rect_nn_config_128, + &av1_simple_motion_search_prune_rect_nn_config_64, + &av1_simple_motion_search_prune_rect_nn_config_32, + &av1_simple_motion_search_prune_rect_nn_config_16, + &av1_simple_motion_search_prune_rect_nn_config_8, + }; + +// nn model for predicting max square partition level of a superblock +#define NUM_HIDDEN_LAYERS 1 +#define NUM_FEATURES 13 +#define NUM_LAYER_0_UNITS 48 +#define NUM_LOGITS 4 + +static const float av1_max_part_pred_logits_kernel[] = { + -0.304561f, 0.0885596f, -0.988539f, 1.08147f, 0.215213f, + 0.202965f, -0.828457f, -0.233945f, -0.0866977f, -0.115521f, + 0.02079f, 0.196491f, -0.0285075f, 0.05067f, -0.00872862f, + 0.00281844f, -0.238954f, 0.0253801f, 0.0257775f, 0.339269f, + 0.176174f, -0.152545f, -0.0588704f, -1.62275f, -0.189329f, + 0.0808033f, 0.233844f, -4.53798f, 0.674968f, -0.0361688f, + -0.0754075f, 1.16129f, -0.0188879f, 0.113255f, -3.04378f, + 0.814728f, -0.568517f, -0.00179383f, -3.61223f, -1.67535f, + -2.20417f, -0.197196f, 0.0507745f, -0.0909394f, -0.0507879f, + -1.27999f, -0.055623f, 0.0318497f, 0.192867f, 0.138726f, + 0.0443392f, -0.595075f, -0.166774f, 0.0882958f, -0.348161f, + 0.0214428f, -0.0599275f, -0.0995385f, -0.82358f, 0.141205f, + -0.053232f, 0.00508296f, -1.90872f, 1.15004f, -0.194219f, + 0.0229019f, -0.00354318f, 0.22016f, 0.154101f, -0.159231f, + -0.0446647f, -0.197503f, 0.0408453f, 0.197659f, 0.797858f, + -0.189722f, 0.343653f, 0.124666f, -1.03083f, 0.603059f, + 0.101565f, 0.0932993f, 0.462484f, 0.295984f, 1.11198f, + 0.143709f, -0.846232f, -0.464392f, -1.06058f, -0.124889f, + 0.0727475f, 1.18446f, -0.100302f, 0.0641918f, -0.101622f, + 0.10219f, 0.130189f, 0.0915623f, -0.166904f, -1.10606f, + -0.16726f, -0.146152f, 0.145443f, -0.177091f, -0.0215214f, + 0.0158506f, -0.553294f, 0.0784749f, -0.0416628f, -0.027785f, + 0.280027f, 0.484898f, -0.164225f, 0.0238317f, -0.0345254f, + 0.0410244f, 0.131529f, 
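+    // The av1_simple_motion_search_prune_rect_nn_config[] table above lists
+    // the per-block-size models in descending order (128, 64, 32, 16, 8), so
+    // callers select a model by block-size rank. For the max-partition
+    // predictor whose weights follow, a plausible usage sketch (hedged: the
+    // exact call sites and the av1_nn_predict() signature vary across libaom
+    // versions, and nn_forward_sketch is the hypothetical helper sketched
+    // earlier in this file) is to score the 13 features, normalize the 4
+    // logits with av1_nn_softmax() from av1/encoder/ml.c, and let the argmax
+    // cap the square partition sizes searched for the superblock:
+    //
+    //   float logits[4], probs[4];
+    //   nn_forward_sketch(features, &av1_max_part_pred_nn_config, logits);
+    //   av1_nn_softmax(logits, probs, /*n=*/4);
+    //   int best = 0;
+    //   for (int i = 1; i < 4; ++i)
+    //     if (probs[i] > probs[best]) best = i;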
0.0239622f, -0.0749436f, -0.0224914f, + 0.128926f, 0.224539f, 0.413297f, 0.0638572f, 0.103308f, + 0.0913242f, -0.119274f, 0.0163103f, 0.113828f, 0.119809f, + 0.297057f, -0.124889f, -0.533108f, -0.181408f, -0.129896f, + 0.0221064f, -0.0773281f, -0.0386467f, 0.0342961f, 0.126575f, + -0.24114f, 0.0735576f, 0.0524791f, 0.246896f, -0.130674f, + -0.03979f, 0.173639f, 1.95193f, -0.113029f, -0.0305852f, + -0.00671737f, 0.157159f, -0.00102858f, -0.543688f, 0.566772f, + 0.124124f, -0.0294064f, -0.0699021f, -0.0704103f, -0.766097f, + -0.0625802f, -0.0906173f, -0.0520414f, -0.0272724f, 0.283064f, + 0.236213f, -0.127319f, 0.019392f, 0.170042f, -0.0214542f, + 0.0740938f, 0.356578f, -0.236257f, 0.269021f, 0.114759f, + -0.641166f, 0.136308f, -0.0386959f, -0.112024f, -0.361209f, + 0.686095f, 0.183906f, 0.288656f, 0.182007f, 0.337458f, + 0.058974f, -0.305512f, -0.841708f, -0.243779f, -0.0614058f, + 0.208747f, 0.448697f +}; + +static const float av1_max_part_pred_layer_0_bias[] = { + -0.776544f, -2.0022f, -0.330294f, 2.47665f, 1.90206f, -1.61571f, + 0.536246f, 1.00455f, 5.24561f, 1.55111f, -0.816399f, -4.88703f, + -1.06417f, -1.15359f, -0.145289f, 1.91831f, 0.630915f, -1.94256f, + -3.35239f, -1.05007f, -1.05186f, 1.36824f, -5.2878f, 1.10482f, + -5.00077f, -0.0445198f, 3.41427f, 2.3439f, -0.413306f, -1.88152f, + -2.28638f, 8.24783f, -1.91961f, -1.49324f, 1.96599f, -6.32309f, + -0.332426f, -0.425506f, 4.06511f, 5.84386f, 4.15747f, 1.22402f, + 2.8512f, 2.53027f, 0.0170272f, -1.43966f, -0.997785f, 5.43064f +}; + +static const float av1_max_part_pred_logits_bias[] = { -4.25432f, 0.144758f, + 1.96217f, 0.728905f }; + +static const float av1_max_part_pred_layer_0_kernel[] = { + 0.992471f, 0.533006f, 0.143743f, -2.51788f, -0.468337f, + -0.201376f, -0.151834f, 0.479883f, 1.16061f, -0.278878f, + -0.814954f, -0.152405f, -0.0521608f, 0.797104f, -2.08912f, + 0.385839f, -2.22889f, -0.106858f, -0.239766f, -0.951128f, + -0.698753f, 0.0831051f, 1.1702f, 0.342834f, -0.0352795f, + -0.0847639f, -0.802086f, 0.258982f, 1.14174f, 0.645885f, + -1.19226f, -0.592888f, -0.343659f, 1.1912f, 1.45411f, + -1.22927f, 0.152858f, 0.00373585f, -1.60637f, 0.592611f, + 0.0857475f, -0.346147f, -0.150784f, -0.0817408f, -0.189918f, + -0.804952f, -1.33036f, -1.03307f, 0.0248769f, 0.16607f, + -2.896f, -2.1293f, 0.12293f, -0.173179f, -0.212128f, + -6.76221f, 0.033188f, 0.0231787f, 0.905957f, 0.0551327f, + -0.356276f, 0.0181795f, 0.0977523f, -0.0352873f, -0.0396386f, + 2.3241f, 0.0632874f, -0.11804f, -6.32521f, 0.0224659f, + -0.00188896f, 0.267992f, 0.272337f, 0.00936963f, 0.659969f, + -2.25707f, -0.0278229f, -0.0185089f, -1.14466f, 0.104827f, + 0.0435885f, 0.558586f, -0.00697004f, 0.0312611f, 0.540574f, + -0.568625f, 0.218608f, 0.378911f, -0.0289192f, -0.0734742f, + -1.08782f, -2.42069f, -0.0127239f, 0.0493651f, -1.15837f, + 0.261831f, 0.401824f, -1.04545f, 0.284173f, 0.784972f, + -0.511243f, -0.982599f, -0.106134f, -0.325964f, -1.44107f, + -1.42434f, -1.02402f, -1.52034f, 0.0737116f, 0.0462242f, + 0.628722f, -1.0405f, -0.113718f, 2.20573f, -4.33951f, + -0.0192695f, -0.0229314f, -1.89156f, 0.645942f, 0.375708f, + -1.97447f, -0.267014f, 0.0989443f, -0.450534f, -1.01737f, + -0.642416f, -0.0897288f, -2.08724f, -0.190965f, -0.279135f, + -0.830178f, 0.808754f, -0.139091f, 1.11004f, -0.454439f, + -0.479238f, -1.44001f, 0.0888059f, 0.885689f, -0.642505f, + -0.00773651f, -0.0265721f, -0.906346f, 1.68504f, 0.084257f, + -0.951101f, -8.06495f, 0.19231f, 0.16389f, -0.193678f, + 0.729837f, -1.98392f, -5.98513f, 3.32638f, -0.0658378f, + -0.0910426f, -0.666567f, 
-0.315339f, 0.123124f, -2.66375f, + -0.714852f, -0.136176f, -0.460166f, -0.567551f, -1.06193f, + -1.21389f, -0.83865f, 0.00280695f, -0.199519f, -0.534704f, + 0.419311f, -0.149008f, -3.68707f, 0.00285113f, -0.0718198f, + -1.41026f, -1.34155f, -0.538687f, -0.623666f, -2.56462f, + -0.0183333f, -0.323532f, -1.27141f, -0.0212039f, 0.198633f, + 0.459554f, -4.65103f, -1.01293f, -1.39512f, -0.289026f, + 0.208724f, -0.665226f, 1.13369f, -1.96734f, -1.45442f, + -3.46172f, 0.810681f, -0.603973f, 0.842764f, -3.90371f, + -0.394561f, -3.61363f, -2.88085f, 0.031645f, -0.23125f, + -2.63898f, -1.35314f, -0.46726f, 1.33145f, 1.20269f, + 1.38682f, -0.331637f, 0.069021f, 0.149523f, -1.24957f, + -0.878857f, -0.200368f, 0.465744f, 1.01365f, -0.0122221f, + -0.550586f, -1.12581f, -0.422132f, -0.0744868f, -2.4804f, + -1.07072f, -0.479006f, 0.101817f, -0.118947f, 0.341576f, + -1.0538f, -0.812346f, -1.13727f, -0.00939806f, 10.1571f, + -0.0441302f, 0.00280407f, -21.5044f, 0.0181152f, -0.0143246f, + 3.23462f, -1.38624f, -1.80416f, 4.89763f, -2.67364f, + 2.31771e-05f, 0.000393989f, 0.352204f, -0.193455f, 0.531455f, + 0.488757f, -0.442555f, -0.518528f, 0.431482f, -2.67727f, + -2.00626f, -0.39729f, -0.221494f, -0.0188888f, -0.0377649f, + -1.80169f, 0.0810332f, -0.0408335f, -1.28675f, -0.0353824f, + -0.666723f, -1.07281f, 0.252912f, -1.24547f, -1.7831f, + -1.14354f, -0.137662f, 0.00230182f, 0.736862f, 0.175872f, + -0.187556f, 0.43963f, -0.796524f, 0.056219f, -0.387874f, + 0.0710224f, -0.16548f, -0.100993f, 0.931481f, -3.20738f, + -0.0197576f, 0.266148f, -0.173909f, -0.337795f, -0.0682381f, + 0.176844f, 0.140286f, 1.12033f, 0.429064f, -2.24192f, + -1.54682f, 2.23646f, -0.0371138f, -0.0475339f, -3.21766f, + 0.0412858f, 0.387811f, 6.6711f, 0.140649f, 0.0559547f, + -0.802839f, 0.599977f, 0.64552f, -2.08103f, -0.503401f, + -0.0407036f, -0.0299199f, 0.0849445f, -0.111657f, -1.63462f, + 3.33762f, 0.0441394f, 0.0466889f, -0.951806f, 0.0723954f, + 0.00348661f, -1.36903f, 2.24625f, -0.0348915f, -0.0508893f, + -0.240891f, -0.120143f, -0.17991f, -2.09137f, 0.0150871f, + 0.0480333f, 1.72012f, 0.0309551f, -0.0370507f, -0.377075f, + 0.103916f, -0.0169255f, -0.0145395f, -4.02144f, 0.83193f, + -0.316502f, 6.3832f, -1.70038f, -1.97215f, -1.94501f, + 1.45479f, 0.711725f, -0.348496f, -0.279056f, -1.13396f, + -1.51744f, -0.853307f, 1.53131f, -0.0032358f, 1.41808f, + -1.32989f, -0.245221f, -0.161614f, -0.500845f, -0.449252f, + 0.0724151f, -0.116333f, -0.0946182f, -2.0945f, 0.0564572f, + 0.393261f, -1.06861f, -0.111458f, -0.839943f, -0.0880348f, + 0.0365742f, 0.415339f, -1.57494f, -0.713697f, 1.02349f, + -0.221371f, -0.0446281f, 1.89223f, -0.0811754f, -0.402773f, + -0.930987f, 0.0243194f, 0.0678332f, -0.0233014f, 0.165372f, + -0.44083f, -1.2404f, 0.35675f, -0.040916f, -0.0512548f, + -2.9071f, 0.861174f, -0.778133f, 2.14436f, -0.688427f, + -0.480371f, -1.69032f, 0.706687f, -0.281982f, -2.30451f, + 1.61541f, -0.0213638f, -0.740509f, -0.266677f, 0.0268434f, + -0.0116908f, -3.17595f, 0.0114825f, 0.0196997f, -0.144005f, + 0.0550181f, -0.851459f, -0.000285073f, -0.538441f, -0.0254868f, + -0.0104454f, -0.0661998f, -0.196469f, -0.346372f, -5.52892f, + -0.643683f, -0.622224f, -0.31463f, -0.555956f, -0.520132f, + -0.843166f, -2.59479f, -0.750195f, 0.00635995f, -0.338615f, + -0.216676f, -0.391544f, -1.62185f, -0.718471f, -0.475406f, + -0.782041f, -0.608824f, -1.09633f, -1.27308f, -0.560719f, + -0.207539f, -0.0196445f, -1.05519f, -0.575249f, -1.0642f, + 1.01615f, -0.873633f, -0.417953f, -0.428051f, 0.350259f, + -2.53833f, -2.72203f, 0.672846f, -0.503094f, 
-1.1374f, + 0.214291f, 0.013305f, 0.0112064f, 1.10532f, 0.030455f, + 0.0239614f, 0.628072f, 0.0539135f, -0.472441f, -0.688439f, + -0.32044f, -0.0234867f, -0.0158436f, -0.949314f, -0.0453161f, + -1.18306f, 0.626845f, -0.426925f, -0.688371f, 0.415062f, + 0.0640985f, -0.638387f, -2.01399f, -0.209744f, -0.762892f, + -0.0753296f, -0.879315f, -0.520433f, -0.111375f, 0.389742f, + -0.398862f, -0.643227f, -0.246396f, 0.0317051f, 1.06973f, + 0.413617f, 0.180506f, -0.0507897f, -0.00650435f, 0.620892f, + 0.046312f, 0.475032f, 0.906993f, -0.0388061f, -0.256271f, + -1.03323f, 0.0125266f, -0.31116f, -0.377611f, -0.0386407f, + -0.0232745f, -0.353644f, -2.27289f, 0.0571779f, -0.00865006f, + 1.65101f, 0.0175711f, 0.0184585f, 0.558458f, 0.2213f, + -0.285089f, 0.433445f, -0.427177f, -0.0103682f, -0.0101273f, + 0.214085f, -0.0459885f, 0.00761981f, 0.836381f, 0.0175293f, + 0.02508f, -1.51778f, 0.0143956f, -0.162589f, 0.595418f, + 0.21445f, -0.0335848f, -0.0136684f, -0.16686f, -0.14612f, + 0.0816238f, 0.499636f, 0.12458f, -2.41673f, -0.261721f, + -0.676805f, -1.88366f, 0.730462f, 0.69196f, -0.0288489f, + -2.38272f, 0.329876f, 0.014517f, -0.115145f, -3.48151f, + -0.00209072f, -0.0732377f, 0.820443f, -0.0118701f, 0.112145f, + 0.272315f, 0.137531f, -0.0200997f, -0.0397883f, -2.19458f, + 0.183554f, -0.639716f, 0.481605f, -0.621639f, -0.0980299f, + -0.710534f, -0.143105f, -6.77626f, -1.65139f, -2.37718f, + -0.533127f, -1.12574f, 3.34182f, -0.0758663f, 0.0334238f, + -9.48647f, 0.0674974f, 0.0507665f, 0.523007f, -0.0668f, + 0.5736f, -0.589761f, -1.1692f, -0.0236497f, -0.00828928f, + -0.265823f, 1.15284f, 0.307927f, -0.695308f, 0.13725f, + -0.20394f, -0.363965f, -0.331159f, -1.50927f, -1.20051f, + -0.0205825f, -0.0381859f, -0.0579876f, -1.6913f, -1.94626f, + 3.4214f, 3.3922f, -2.13798f, -0.679848f, -0.890735f, + 0.235017f, -0.253202f, -1.0571f, 1.40354f, 0.00719052f, + -1.54365f, -0.7289f, -1.05492f, 0.0238169f, -0.00543592f, + -0.0510353f, -0.175386f, -0.724207f, -0.788936f, 0.039976f, + 1.36966f, 0.869475f, -0.0302774f, -0.0537556f +}; + +static const NN_CONFIG av1_max_part_pred_nn_config = { + NUM_FEATURES, + NUM_LOGITS, + NUM_HIDDEN_LAYERS, + { + NUM_LAYER_0_UNITS, + }, + { + av1_max_part_pred_layer_0_kernel, + av1_max_part_pred_logits_kernel, + }, + { + av1_max_part_pred_layer_0_bias, + av1_max_part_pred_logits_bias, + }, +}; + +#undef NUM_HIDDEN_LAYERS +#undef NUM_FEATURES +#undef NUM_LAYER_0_UNITS +#undef NUM_LOGITS + +// Early termination in second pass +static const float av1_simple_motion_search_term_none_mean_128[28] = { + 12.661922f, 12.638062f, 10.896497f, 10.865719f, 10.978963f, 10.940105f, + 11.012235f, 10.972760f, 11.069924f, 11.018533f, 11.773865f, 11.747426f, + 11.891315f, 11.858107f, 11.793916f, 11.766356f, 11.874997f, 11.840164f, + 5.940535f, 0.770746f, 4.292692f, 4.309581f, 0.848423f, 4.292334f, + 4.298179f, 8.514713f, 14.911736f, 19.825352f, +}; + +static const float av1_simple_motion_search_term_none_std_128[28] = { + 1.796731f, 1.797056f, 1.898383f, 1.900753f, 1.846624f, 1.846953f, 1.906632f, + 1.908089f, 1.836533f, 1.835967f, 1.840262f, 1.840671f, 1.816836f, 1.817103f, + 1.879846f, 1.881333f, 1.803102f, 1.802654f, 2.263402f, 0.420354f, 1.117165f, + 1.083779f, 0.358611f, 1.101183f, 1.084938f, 2.462638f, 1.577009f, 1.574711f, +}; + +static const float av1_simple_motion_search_term_none_mean_64[28] = { + 10.904455f, 10.853546f, 9.247903f, 9.184479f, 9.251985f, 9.186686f, + 9.253490f, 9.190190f, 9.270079f, 9.204357f, 10.086511f, 10.031060f, + 10.100875f, 10.045429f, 10.069688f, 10.013173f, 
10.082980f, 10.024640f, + 4.888378f, 0.878113f, 3.598450f, 3.628491f, 0.925833f, 3.560971f, + 3.573322f, 8.807137f, 13.348477f, 18.269117f, +}; + +static const float av1_simple_motion_search_term_none_std_64[28] = { + 1.789300f, 1.787061f, 1.823519f, 1.820226f, 1.794643f, 1.788620f, 1.797194f, + 1.795135f, 1.777795f, 1.773634f, 1.794000f, 1.790377f, 1.772197f, 1.769692f, + 1.819050f, 1.817139f, 1.793577f, 1.789333f, 1.998251f, 0.327156f, 0.885748f, + 0.853767f, 0.262043f, 0.902435f, 0.860033f, 1.224865f, 1.603411f, 1.589296f, +}; + +static const float av1_simple_motion_search_term_none_mean_32[28] = { + 9.818970f, 9.751199f, 8.015079f, 7.927318f, 8.029113f, 7.938330f, 8.012570f, + 7.923719f, 8.033508f, 7.941911f, 8.933057f, 8.857422f, 8.935639f, 8.859187f, + 8.905495f, 8.829741f, 8.929428f, 8.851351f, 4.114069f, 0.954752f, 2.645082f, + 2.709703f, 0.964678f, 2.652077f, 2.673393f, 9.430499f, 11.922798f, 16.942251f, +}; + +static const float av1_simple_motion_search_term_none_std_32[28] = { + 1.737107f, 1.734327f, 1.727923f, 1.720244f, 1.721570f, 1.712775f, 1.718028f, + 1.710370f, 1.711612f, 1.702596f, 1.754856f, 1.748855f, 1.741871f, 1.736304f, + 1.722428f, 1.717380f, 1.713563f, 1.707582f, 1.761170f, 0.207847f, 0.900058f, + 0.862356f, 0.184593f, 0.903822f, 0.856120f, 1.529199f, 1.412085f, 1.453153f, +}; + +static const float av1_simple_motion_search_term_none_mean_16[28] = { + 8.998877f, 8.912468f, 7.085255f, 6.953476f, 7.086386f, 6.954091f, 7.088727f, + 6.955747f, 7.093955f, 6.960635f, 8.065050f, 7.961432f, 8.071631f, 7.967233f, + 8.041699f, 7.937715f, 8.046791f, 7.942183f, 3.833521f, 0.978421f, 1.901347f, + 1.950124f, 0.979418f, 1.928000f, 1.936727f, 9.773951f, 10.735227f, 15.949769f, +}; + +static const float av1_simple_motion_search_term_none_std_16[28] = { + 1.641193f, 1.640172f, 1.614794f, 1.608906f, 1.609571f, 1.603580f, 1.606928f, + 1.601246f, 1.599230f, 1.593529f, 1.633747f, 1.630219f, 1.625695f, 1.622547f, + 1.633827f, 1.630182f, 1.626607f, 1.622777f, 1.548838f, 0.145303f, 0.744550f, + 0.736552f, 0.141980f, 0.742979f, 0.736977f, 1.366255f, 1.258794f, 1.294309f, +}; + +static const float av1_simple_motion_search_term_none_model_128[] = { + -0.6106842357f, -1.0402954455f, 0.6054417656f, -0.2116623578f, + 0.2447714930f, 0.3782256209f, 0.5095592479f, -0.3275620904f, + 0.3886188013f, 0.2629499420f, -0.1979599415f, -0.5389565605f, + 0.1209207902f, -0.4913347466f, 0.3798542731f, -0.2812861709f, + -0.1049824167f, -0.1088672020f, 0.4059596517f, -0.1347896613f, + 0.2276868621f, 0.0506386970f, 0.0071088411f, 0.0467952100f, + 0.2091247458f, -0.7371964736f, 0.1368935545f, 0.3175247786f, + -0.5493146094f, +}; + +static const float av1_simple_motion_search_term_none_model_64[] = { + -0.4150046575f, -0.3954358561f, 0.1997997444f, 0.3395826831f, + 0.2827215753f, 0.3395683652f, 0.2483140395f, 0.2722216476f, + 0.2610308009f, 0.3724974359f, -0.0551479654f, -0.1721616359f, + -0.3459358629f, -0.0952524186f, -0.1428993840f, -0.0415654914f, + -0.3169539902f, -0.0269429900f, 0.9891530919f, -0.0125084982f, + 0.0972182377f, 0.0008889801f, 0.0205418050f, 0.0057237854f, + 0.1005222691f, -0.2851321920f, -1.5150336445f, 0.1893942436f, + -0.4337360901f, +}; + +static const float av1_simple_motion_search_term_none_model_32[] = { + -0.4667392852f, -0.3893302767f, 0.1603498635f, 0.2304974726f, + 0.1404975592f, 0.2505516225f, 0.1423053884f, 0.2189318406f, + 0.1379765409f, 0.2638241296f, -0.1342865463f, -0.0549054345f, + -0.1925223436f, -0.1142702769f, 0.0127811659f, 0.0868639997f, + -0.0643197251f, 0.0279496470f, 
0.9904395769f, -0.0095178685f, + 0.1179410649f, -0.0013411972f, 0.0095060660f, 0.0195730400f, + 0.0779717771f, -0.2498860763f, -0.8168817125f, -0.4798397348f, + -0.6609679881f, +}; + +static const float av1_simple_motion_search_term_none_model_16[] = { + -0.3021081992f, -0.4620153673f, 0.0448577479f, 0.1738455035f, + 0.0663209177f, 0.1629614573f, 0.0555168744f, 0.1631870212f, + 0.0425805150f, 0.1688564954f, 0.0434083772f, -0.0046603915f, + -0.0271580056f, -0.0183879127f, 0.1073730471f, 0.0314201476f, + 0.0576891756f, 0.0119723753f, 0.9084332022f, -0.0188429077f, + 0.0755089811f, -0.0172550234f, 0.0037663075f, 0.0022094472f, + 0.0500247894f, -0.2944572004f, -0.8908521199f, -0.2555515792f, + -0.5396254205f, +}; + +#define FEATURES 31 +#define HIDDEN_NODES 32 +static const float av1_early_term_after_split_nn_weights_64_layer0[] = { + -0.306296f, -0.691664f, 0.335148f, -0.298465f, -0.509241f, -0.632796f, + -0.527979f, -0.009904f, -0.503646f, -0.494002f, -0.575101f, 0.239911f, + -0.413312f, -0.622825f, -0.405448f, -0.419103f, -0.505903f, -0.392550f, + -0.240293f, 0.121749f, -0.489777f, -0.756647f, 0.001047f, -0.016528f, + 0.145714f, 0.172910f, 0.086197f, 0.162882f, -0.070588f, -0.077104f, + 0.502730f, -0.244954f, 0.265605f, -0.323994f, 0.223397f, -1.086453f, + 0.391886f, 0.200343f, 0.253878f, 0.018925f, 0.201819f, -0.205136f, + 0.427314f, 0.041155f, 0.070484f, 0.159925f, -0.057095f, -0.146544f, + -0.073792f, 0.152628f, 0.003986f, -0.515965f, -0.209754f, 0.037457f, + 0.070622f, -0.143571f, -0.059602f, 0.111734f, 0.319674f, 0.149894f, + -0.219883f, 0.206678f, 0.015809f, -0.210549f, 0.130156f, -0.189502f, + -0.850392f, -0.156363f, -0.060354f, 0.189044f, 0.266495f, 0.151305f, + -0.563677f, -0.354896f, 0.300637f, 0.257568f, -0.008359f, -0.535497f, + -0.003127f, 0.293054f, -0.020212f, -0.157278f, 0.229972f, -0.309799f, + -0.329927f, -0.077140f, 0.001177f, -0.024415f, 0.134044f, -0.181587f, + -0.135380f, 0.230989f, -0.281451f, 0.912282f, 0.511562f, -3.900779f, + -0.039917f, 1.956406f, -0.357589f, 0.292998f, -0.950158f, 0.422041f, + 0.526572f, 0.605746f, -0.147110f, 0.256576f, 0.090010f, 0.221641f, + 0.029763f, 0.351592f, 0.458324f, -0.005888f, 0.010521f, -0.389326f, + -0.094006f, -0.171489f, -0.013153f, 0.026333f, -0.454571f, -1.932891f, + -0.168211f, 0.051298f, -0.258061f, -0.028936f, -0.555937f, -0.475566f, + -0.304046f, -0.318113f, 0.099697f, -0.217145f, 0.139433f, -0.203986f, + -0.164012f, 0.051527f, 0.138603f, -0.085100f, -0.082887f, -0.242955f, + -0.663410f, -0.535772f, -0.181665f, -0.197883f, 0.071319f, 0.135086f, + 0.146200f, 0.184827f, -0.199041f, 0.162570f, -0.300167f, 0.017748f, + -0.140111f, 0.103553f, 0.206929f, 0.193446f, 0.123141f, -1.201898f, + -0.052254f, -0.750121f, 0.111741f, 0.204092f, -0.166266f, 0.124008f, + -0.455496f, 0.306035f, 0.275903f, 0.193599f, -0.730011f, 0.126808f, + 0.051059f, 0.103634f, -0.044334f, 0.048889f, 0.405228f, 0.574099f, + 0.061167f, 0.260576f, 0.070032f, -0.038040f, 0.229183f, -0.243269f, + -0.130116f, -0.538563f, -0.070199f, -0.129249f, -0.205153f, -0.268530f, + -0.290828f, -0.233006f, 0.068712f, 0.618085f, -0.407008f, 0.686868f, + 0.172247f, 0.826287f, -0.002672f, 0.239825f, -0.051548f, 0.420773f, + 0.218747f, 0.041057f, -0.071189f, 0.286987f, -0.113915f, 0.122561f, + 0.013979f, -0.049046f, 0.148175f, 0.031313f, -0.248601f, 0.209488f, + 0.069008f, 0.072763f, 0.332475f, 0.079986f, -0.151042f, -0.205110f, + -0.155550f, -0.510408f, 0.330429f, 0.577729f, 0.266524f, -0.378489f, + 0.228204f, 0.055318f, 0.117583f, -0.588557f, -0.778201f, 0.434622f, + -0.227820f, 
0.611642f, 0.170548f, 0.817761f, 0.006642f, -1.005794f, + -0.911490f, 1.633684f, -0.290664f, 0.308128f, 0.295986f, 0.243377f, + -0.001275f, -0.131156f, 0.275205f, -0.041865f, -0.201951f, -0.016380f, + 0.336604f, -0.258118f, 0.890810f, 0.441065f, -0.968006f, 0.135989f, + -1.447191f, 0.353426f, -0.343235f, 0.376837f, -0.071602f, -0.319639f, + -0.072347f, 0.547450f, -0.215380f, 0.182141f, -0.066186f, 0.033787f, + 0.257482f, 0.217428f, -0.130249f, 0.057525f, 0.263991f, 0.230664f, + -0.245113f, 0.048610f, -0.079955f, 0.251737f, -0.070368f, -0.017968f, + -0.151815f, 0.025945f, -0.257769f, 0.299735f, 0.077263f, -0.565526f, + 0.326263f, 0.096429f, 0.113414f, 0.092754f, -0.141908f, 0.172060f, + 0.393117f, -0.216755f, 0.331051f, -0.363369f, -0.113363f, -0.095164f, + -0.072784f, 0.214572f, 0.010993f, 0.209456f, 0.260381f, -0.314747f, + -0.422173f, -0.189963f, -0.225130f, 0.339448f, 0.153814f, 0.265616f, + -0.103575f, -0.123841f, -0.106236f, 0.155894f, -0.156264f, -1.361406f, + -0.040736f, -0.614998f, -0.468200f, -0.266505f, -0.342786f, -0.908088f, + 0.105758f, 0.040788f, -0.313589f, -1.359318f, 0.071329f, 0.176404f, + -0.476141f, 0.010108f, -0.201440f, -0.221167f, -0.197448f, -0.013927f, + -0.610270f, -0.607285f, 0.178070f, 0.174320f, 0.313115f, 0.026191f, + -0.112330f, 0.122338f, -0.367751f, 0.196794f, 0.153709f, -0.205454f, + -0.397471f, -1.879336f, -0.030129f, 0.143429f, -0.079832f, 0.435259f, + -1.729539f, 0.518301f, -0.141393f, 0.199399f, -1.914601f, 0.142865f, + -0.219899f, 0.508458f, 0.086365f, -0.220740f, -0.012507f, 1.263320f, + 0.042136f, 0.050922f, -0.329644f, -0.188198f, 0.251522f, 0.394731f, + -0.047866f, -0.260853f, -0.267207f, -0.248489f, 0.146474f, 0.359257f, + -0.427732f, -0.100652f, 0.192129f, 0.075572f, 0.916708f, 0.255747f, + 0.486384f, 0.127989f, -0.556449f, -0.484913f, 0.392298f, 0.045401f, + -0.839551f, -0.703619f, 0.069263f, -0.040720f, 0.542265f, 0.443739f, + 0.862552f, -0.021726f, 0.230858f, -0.261004f, -0.125697f, -0.106435f, + 0.002341f, 0.013904f, 0.011034f, 0.542296f, -0.284325f, 0.135736f, + 0.113882f, 0.040610f, -0.255485f, 0.224061f, -0.087140f, 0.127872f, + -0.002638f, 0.164889f, -0.335958f, -0.031166f, -0.393581f, 0.075455f, + 0.055995f, 0.087934f, -0.133859f, -0.342187f, 0.002492f, -0.340722f, + 0.058304f, 0.104165f, -0.142136f, -0.351111f, -0.158037f, -0.079924f, + -0.253209f, -0.092840f, -0.174646f, -0.202772f, -0.353438f, -0.031111f, + 0.076088f, -0.232091f, -0.070052f, 0.097595f, 0.063173f, -0.211195f, + 0.126478f, -0.178828f, 0.278723f, -0.070807f, -0.179783f, 0.034123f, + 0.035721f, -0.200431f, 0.170640f, 0.107933f, 0.226594f, -0.301499f, + -0.291096f, 0.228076f, -0.272951f, 0.002490f, -0.210707f, -0.128033f, + -0.194009f, -0.011347f, -0.256694f, -0.011841f, -0.005167f, -0.163203f, + -0.253796f, -0.198877f, -0.055827f, -0.882685f, -0.443471f, 0.349601f, + 0.749334f, -1.161845f, 0.505480f, 0.221733f, 0.210490f, -0.234984f, + 0.014183f, -0.510401f, 0.238692f, -0.134111f, 0.083844f, -0.478751f, + -0.088434f, 0.304063f, 0.150336f, -0.749682f, -0.081999f, 0.729739f, + 0.412508f, 0.132571f, 0.058306f, -0.047451f, -0.117435f, -0.445395f, + -0.005182f, -0.025757f, 0.175051f, -0.258194f, -0.150311f, -0.196533f, + -1.314316f, -0.428627f, 0.512451f, 0.045138f, -0.200925f, 0.081538f, + -0.346151f, -0.358197f, -0.422258f, -0.028542f, -0.383534f, -0.026163f, + -0.419858f, -0.154321f, 0.376970f, 0.094017f, 0.783520f, 0.110641f, + 0.077966f, -0.093064f, 0.160522f, -0.863041f, 0.086210f, 0.560764f, + 0.057032f, 0.159224f, 0.323068f, -0.173109f, 0.014042f, -0.126856f, + 
-0.128237f, -0.245273f, -0.317312f, -0.257597f, -0.181977f, 0.259485f, + -0.215834f, 0.062076f, -0.270596f, 0.271581f, -0.153486f, -0.247165f, + 0.079737f, -0.157049f, -0.027459f, -0.299397f, 0.136729f, -0.334192f, + -0.191722f, 0.145865f, -0.031324f, -0.307165f, -0.244923f, -0.228027f, + 0.063807f, 0.054965f, -0.005709f, -0.041977f, -0.276245f, 0.020003f, + 0.133323f, -0.145992f, -0.951030f, 0.414083f, -1.063323f, 0.137872f, + 0.104732f, -0.123728f, 0.542532f, 0.213654f, 0.542954f, 0.155619f, + 0.543072f, 0.399067f, 0.191402f, -0.102552f, -0.176734f, -0.136776f, + -0.012814f, -0.021298f, -0.802467f, -0.957481f, -0.238787f, -0.138482f, + 0.058331f, 0.126601f, 0.104420f, -0.148684f, 0.343218f, 0.093604f, + -0.055642f, -0.383918f, -0.045250f, -0.090480f, -0.155464f, 0.278299f, + 0.042791f, -0.029084f, -0.373861f, -0.073233f, -0.085172f, 0.186841f, + -0.070898f, -0.156415f, 0.112831f, -0.065931f, -0.353007f, 0.058453f, + -0.136982f, 0.233393f, 0.017240f, -0.018428f, 0.229104f, -0.371440f, + -0.262212f, 0.203075f, -0.263293f, 0.034413f, -0.299354f, 0.227269f, + 0.204977f, -0.118107f, -0.359832f, -0.068252f, 0.480105f, -0.214711f, + -0.614381f, 0.209048f, -0.456014f, -0.188819f, -0.220995f, -0.322104f, + -0.191457f, 0.420874f, -0.454919f, 0.023119f, 0.291700f, -0.532885f, + -0.032642f, 0.043271f, 0.133974f, 0.002399f, -0.179899f, -0.044158f, + -0.027078f, -0.350075f, 0.236766f, 0.346771f, -0.118534f, -0.421221f, + 0.019544f, 0.109349f, 0.141517f, 0.403561f, 0.409102f, 0.054555f, + -0.561751f, 0.577183f, -0.705156f, -0.231188f, -1.969772f, 0.172289f, + -0.048122f, 0.205671f, -0.667130f, -0.066870f, 0.202838f, -0.095538f, + -0.842651f, 0.254170f, 0.046256f, -0.271891f, -0.369254f, 0.492101f, + 0.001189f, -0.186525f, 0.188470f, -0.207072f, 0.030086f, -0.132904f, + 0.127001f, 0.116662f, -0.079246f, 0.227241f, -0.462178f, 0.446304f, + -1.660753f, 0.241832f, -0.288040f, 0.054663f, -0.435804f, 0.296782f, + -0.026421f, -0.115618f, 0.163416f, 0.834001f, 0.008019f, -0.014243f, + 0.524658f, 0.067894f, -0.253936f, -0.100657f, 1.285389f, -0.005952f, + 0.087134f, -0.088375f, -0.121866f, -0.171172f, 0.279463f, -0.598593f, + -0.727761f, 0.189831f, -0.822575f, -0.291141f, -0.012410f, -0.069999f, + 0.098842f, -0.218513f, 0.009494f, 0.100106f, -0.402884f, -0.299236f, + -0.345668f, -0.057739f, -0.213248f, -0.426661f, -0.360268f, -0.349860f, + -0.382177f, -0.357802f, -0.032030f, -0.110597f, -0.155442f, -0.418794f, + -0.012113f, -0.032962f, -0.450648f, 0.129060f, -0.135227f, -0.298593f, + 0.001435f, 0.278790f, -0.272945f, 0.162759f, -0.290208f, 0.058481f, + -0.490971f, 0.019630f, -0.210347f, 0.000520f, -0.340413f, 0.641562f, + 0.023104f, 0.194832f, -0.441894f, -0.253538f, -0.228332f, 0.423264f, + -1.094073f, -0.475657f, -0.238752f, 0.033910f, 0.440425f, 0.036320f, + 0.566989f, -0.065326f, -0.297939f, 0.406098f, 0.529561f, -0.113084f, + 0.141472f, -0.024462f, -0.179212f, 0.187801f, -0.235787f, -0.229624f, + 0.357791f, 0.061110f, -0.607788f, -1.713694f, -0.651041f, 1.734283f, + -0.334701f, 0.161687f, 0.010215f, 0.320708f, 0.169447f, 0.513558f, + 0.488340f, -0.619036f, -0.525441f, -1.144352f, -0.546154f, 0.669973f, + 0.327028f, -0.100539f, 0.012048f, -0.223013f, -0.239680f, 0.323035f, + 0.165950f, -0.155110f, 0.128664f, -0.157378f, -0.124490f, 0.291553f, + 0.055849f, -0.221664f, 0.077770f, -0.350658f, -0.181939f, 0.110230f, + -0.078219f, 0.007472f, -0.031620f, 0.007708f, -0.201794f, 0.017594f, + -0.027480f, 0.058884f, -0.369166f, -0.369770f, 0.181635f, -0.183318f, + -0.389184f, -0.256661f, 0.160107f, 0.037127f, 
-0.082573f, -0.095815f, + -0.322782f, 0.072528f, -0.348875f, 0.216247f, -0.161757f, -0.385502f, + -0.315738f, 0.020123f, -0.155609f, 0.114403f, -0.383232f, 0.629529f, + 0.066142f, 0.448392f, -0.389557f, -0.083315f, 0.829535f, -0.015531f, + -0.050728f, -0.325127f, 0.812992f, -0.196780f, 0.021060f, -0.952647f, + 0.006687f, -0.512715f, -0.066778f, 0.410067f, -0.116945f, -0.288283f, + 0.189334f, -0.083153f, 0.159980f, -0.068208f, 0.107358f, -0.154411f, + -0.068914f, 0.186816f, 0.032251f, 0.109242f, 0.134825f, 0.035101f, + -0.253175f, 0.157309f, -0.363597f, -0.138176f, -0.334141f, -0.172697f, + 0.045800f, -0.286057f, 0.173403f, -0.172444f, -0.117996f, -0.383848f, + -0.173303f, -0.258482f, -0.021404f, -0.017898f, -0.001970f, 0.003273f, + 0.056121f, 0.155046f, 0.044708f, -0.295609f, -0.211688f, -0.233229f, + -0.264980f, 0.145549f, 0.045323f, -0.027112f, 0.175638f, -0.207251f, + -0.055274f, 0.092706f, 0.086200f, -0.241340f, -0.147416f, 0.024510f, + -0.357194f, -0.181944f, -0.050104f, -0.079024f, -0.290473f, -0.169790f, + -0.277982f, -0.017781f, -0.004854f, -0.094132f, -0.348555f, 0.199291f, + -0.343989f, -0.319299f, -0.268935f, -0.021208f, 0.020938f, -0.090609f, + 0.006595f, -0.200790f, 0.171856f, -0.027766f, -0.032017f, -0.006745f, + 0.566426f, -0.096850f, 0.727633f, -0.408065f, -0.012436f, 0.005646f, + -0.305148f, -0.095075f, -0.391549f, -0.020378f, -0.236498f, -0.252773f, + -0.231385f, -0.203175f, 0.041903f, -0.373694f, 0.058239f, -0.101116f, + 0.183772f, 0.164523f, -0.099046f, -0.201272f, -0.394523f, -0.157517f, + 0.032079f, -0.381173f, -0.238496f, -0.037990f, -0.294553f, 0.141473f, + 0.100268f, -0.023806f, 0.004978f, 0.184916f, 0.142699f, -0.113240f, + -0.213364f, -0.160059f, -0.216263f, -0.406387f, -0.301140f, -0.406355f, + -0.113085f, -0.279699f, -0.267434f, 0.126263f, -0.260527f, -0.153904f, + -0.494653f, -0.355144f, 0.030549f, -0.216400f, -0.123363f, 0.189090f, + 0.219122f, 0.096677f, -0.202037f, -0.014489f, -0.137859f, -0.114184f, + -0.279423f, -0.270683f, +}; + +static const float av1_early_term_after_split_nn_bias_64_layer0[] = { + -0.491455f, 0.464538f, -0.005742f, -0.219951f, -0.073682f, 0.102027f, + 0.567071f, 0.441402f, 0.277521f, 0.314498f, -0.448199f, -0.065032f, + 0.488139f, -0.079632f, 0.000000f, 0.521555f, -0.151950f, -0.034616f, + 0.393438f, -0.072242f, -0.087343f, -0.571308f, 0.017372f, -0.126144f, + 0.372261f, -0.451537f, -0.140238f, -0.092377f, -0.074475f, -0.068879f, + -0.109614f, -0.164492f, +}; + +static const float av1_early_term_after_split_nn_weights_64_layer1[] = { + -0.373195f, -0.283141f, 0.416113f, 0.483659f, 0.230583f, 0.349197f, + -0.168582f, -0.813338f, -0.472369f, -0.173872f, 1.297845f, 0.339355f, + -0.828033f, 0.019617f, 0.118757f, -0.619360f, 0.282295f, -0.054116f, + -0.730596f, 0.068567f, -0.248707f, 0.461225f, 0.330224f, -0.287080f, + -0.458103f, 0.591852f, -0.008491f, 0.632119f, -0.007872f, 0.007869f, + -0.230698f, -0.011437f, +}; + +static const float av1_early_term_after_split_nn_bias_64_layer1[] = { + -0.55403697f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_64 = { + FEATURES, + 1, + 1, + { + HIDDEN_NODES, + }, + { + av1_early_term_after_split_nn_weights_64_layer0, + av1_early_term_after_split_nn_weights_64_layer1, + }, + { + av1_early_term_after_split_nn_bias_64_layer0, + av1_early_term_after_split_nn_bias_64_layer1, + }, +}; + +static const float av1_early_term_after_split_nn_weights_32_layer0[] = { + 0.026050f, -0.226531f, 0.308107f, -0.083744f, 0.201785f, 0.098562f, + 0.147595f, -0.495771f, -0.245741f, 0.201616f, -0.272070f, 
-0.579545f, + -0.127261f, -0.229588f, 0.250831f, -0.176929f, -0.031689f, 0.284718f, + 0.085845f, -0.285027f, 0.012304f, 0.382402f, -0.204591f, 0.272514f, + -0.065854f, -0.054228f, -0.231174f, -0.174504f, 0.258287f, 0.195689f, + 0.242530f, 0.023528f, -0.294242f, -0.272132f, 0.460180f, -0.731281f, + -0.208103f, 0.208204f, 0.348250f, 0.016328f, 0.043707f, -0.169551f, + 0.108521f, 0.226895f, -0.020471f, 0.102443f, 0.429640f, -0.252555f, + -0.218434f, -0.163665f, 0.175531f, 0.101588f, -0.135798f, -0.158102f, + 0.142565f, 0.128277f, 0.174985f, -0.100073f, 0.113967f, 0.223682f, + -0.145576f, -0.008443f, 0.112748f, -0.037845f, 0.076954f, -0.287137f, + -0.518185f, -0.106833f, 0.175359f, 0.031408f, 0.219069f, -0.294440f, + 0.007766f, 0.067754f, -0.049168f, -0.212368f, -0.261708f, 0.309252f, + 0.220859f, -0.274852f, -0.653157f, 0.083438f, -0.265386f, 0.174429f, + -0.116931f, -0.091594f, -0.244897f, -0.089015f, 0.274453f, 0.212890f, + 0.272053f, -0.425315f, -0.107726f, 0.294444f, -0.354629f, 0.104402f, + -0.307663f, 0.558430f, 0.140334f, -0.054831f, -0.449456f, 0.058274f, + -0.033768f, -0.354117f, -0.331618f, -0.411772f, 0.232064f, -0.079297f, + -0.638571f, 0.181823f, -0.039611f, 0.206310f, -0.659157f, -0.102930f, + -0.067303f, -0.176881f, -0.001038f, 0.091835f, 0.079739f, -0.121923f, + 0.211070f, 0.362719f, -0.154915f, -0.151876f, -0.165460f, 0.023469f, + -0.251036f, 0.210014f, -0.537125f, 0.156832f, -0.216987f, 0.062975f, + -0.198462f, 0.329123f, 0.125870f, 0.225830f, 0.086377f, -0.128773f, + -0.179673f, -0.074612f, 0.456645f, 0.021905f, -0.243140f, 0.059145f, + -0.273942f, -0.277822f, 0.154556f, -0.025459f, 0.227614f, -0.313076f, + 0.044705f, -0.019017f, 0.108999f, -0.020243f, -0.016373f, 0.560270f, + -0.064818f, 0.050880f, -0.218458f, 0.825699f, -0.534056f, -0.258253f, + 0.222073f, 0.013295f, 0.477870f, -0.386727f, 0.388509f, 0.004128f, + 0.451388f, -0.175788f, 0.264093f, -0.109812f, 0.358132f, 0.500992f, + -0.446933f, -0.222397f, 0.345834f, 0.370943f, -0.233115f, -0.047005f, + -0.111335f, -0.111586f, 0.026975f, -0.052191f, -0.111800f, -0.129782f, + 0.225132f, 0.102524f, 0.544557f, -0.111674f, -0.857884f, 0.133258f, + 0.310001f, 0.043829f, 0.104143f, 0.256493f, 0.242520f, -0.342082f, + 0.421447f, 0.124227f, 0.061542f, -0.090206f, 0.316681f, 0.353452f, + -0.918408f, -0.001903f, -0.052303f, -0.004816f, -0.446393f, -0.053038f, + 0.255725f, -0.126346f, 0.034095f, -0.240276f, -0.135918f, 0.095682f, + -0.147457f, -0.338216f, -0.200426f, 0.010265f, -0.243915f, -0.231375f, + -0.323924f, -0.014353f, 0.150252f, -0.264346f, 0.205303f, -0.194610f, + -0.282527f, 0.180555f, -0.000087f, 0.027240f, -0.000903f, -0.345877f, + -0.353274f, -0.311829f, 0.172985f, -0.111748f, -0.309380f, 0.108110f, + -0.260914f, -0.164990f, 0.183625f, -0.319692f, -0.096988f, 0.094147f, + -0.047062f, -0.080978f, 0.227387f, -0.000450f, -0.220159f, -0.211448f, + -0.020885f, -0.139646f, -0.086721f, 0.067928f, -0.033084f, -0.251996f, + 0.090317f, 0.086313f, -0.228420f, -0.111356f, -0.314304f, -0.223664f, + 0.188176f, -0.002360f, -0.029491f, -0.006000f, -0.075343f, 0.173699f, + -0.272800f, -0.238507f, -0.272071f, -0.015000f, -0.215305f, -0.192943f, + -0.038595f, 0.119537f, 0.260477f, -0.168014f, -0.172751f, 0.532861f, + -0.753250f, -0.017485f, -0.115541f, -0.109291f, -1.098943f, 0.418559f, + -0.532110f, 0.359323f, -0.254786f, 0.471316f, -0.545024f, 0.291912f, + -0.836939f, 0.443427f, -0.441709f, 0.168866f, -0.140372f, 0.546607f, + -0.315465f, 0.023328f, 0.137709f, -0.083492f, -0.049986f, -0.071302f, + -0.293680f, -0.105049f, 0.315317f, 
0.279569f, 0.220762f, 0.088161f, + -0.756456f, -0.074512f, 0.958318f, -0.332924f, -0.004906f, -0.629271f, + 0.212050f, 0.279123f, 0.311523f, -0.599580f, 0.516150f, 0.456952f, + 0.020255f, 0.247290f, -0.182670f, -0.335554f, 0.021203f, 0.131081f, + -0.208584f, 0.112530f, -0.198980f, 0.211583f, -0.101271f, -0.206453f, + -0.502688f, -0.294976f, -0.187019f, -0.114473f, 0.282050f, -0.165483f, + 0.094953f, -0.182578f, 0.055068f, 0.135605f, -0.266941f, -0.297556f, + 0.199181f, 0.015979f, -0.158659f, -0.226841f, 0.171306f, 0.013438f, + -0.286309f, -0.071753f, -0.170300f, -0.238188f, 0.093572f, -0.026230f, + -0.254502f, -0.297786f, -0.063480f, -0.300799f, -0.065644f, 0.074710f, + 0.248576f, -0.144425f, -0.113948f, -0.247297f, 0.276682f, 0.010963f, + -0.737786f, 0.026347f, 0.007830f, 0.753543f, 0.371904f, 0.305614f, + 0.105028f, 0.073530f, -0.119137f, 0.102352f, -0.080523f, 0.176366f, + -0.159457f, -0.339948f, 0.360131f, -0.007051f, -0.388378f, -0.101695f, + 0.663041f, -0.234486f, -0.142536f, -0.099931f, 0.041478f, 0.230425f, + 0.005743f, 0.154060f, 0.056233f, -0.080668f, -0.009754f, -0.194356f, + 0.185474f, -0.296474f, 0.192700f, 0.257767f, 0.348529f, 0.458265f, + 0.060276f, -0.130473f, 0.139889f, 0.310073f, -0.306869f, -0.272922f, + -0.259862f, 0.409207f, 0.431991f, -0.100357f, -0.050415f, -0.071830f, + -0.239665f, 0.153399f, 0.177192f, -0.611644f, -0.176114f, -0.022694f, + -0.033701f, -0.345842f, 0.015660f, 0.158931f, -0.097586f, 0.222001f, + 0.257887f, -0.171307f, -0.222607f, -0.245508f, -0.145742f, -0.096461f, + -0.010895f, 0.052815f, -0.265306f, -0.081059f, 0.219162f, -0.256084f, + -0.372676f, 0.148977f, 0.174831f, 0.086980f, 0.108518f, 0.074011f, + 0.038032f, -0.070856f, -0.109407f, 0.126174f, 0.022341f, -0.249786f, + -0.356164f, -0.202841f, -0.087437f, -0.133740f, 0.090956f, -0.017953f, + -0.028353f, 0.233621f, 0.109426f, 0.232798f, -0.104950f, -0.241798f, + -0.018995f, -0.167954f, 0.002473f, 0.060418f, -0.232717f, -0.195980f, + -0.283971f, -0.371881f, 0.219728f, 0.018072f, -0.166694f, -0.083301f, + -0.000616f, -0.212641f, -0.173158f, 0.222739f, -0.235302f, 0.237624f, + 0.222232f, -0.041235f, -0.342411f, 0.121194f, 0.211291f, -0.032237f, + -0.249401f, -0.291668f, 0.206055f, -0.148200f, 0.011824f, -0.272728f, + -0.194854f, 0.367175f, -0.257243f, 0.103433f, -0.231077f, 0.236734f, + 0.135733f, -0.362845f, 0.197147f, 0.242782f, -0.135289f, 0.123311f, + 0.259420f, -0.116278f, 0.127287f, 0.236789f, -0.097438f, 0.118073f, + 0.112796f, -0.035949f, 0.184408f, 0.200948f, -0.008859f, 0.195989f, + 0.161970f, -0.295320f, -0.330389f, 0.141034f, 0.066081f, -0.707857f, + 0.357037f, 0.149633f, 0.679877f, 0.548674f, 0.469076f, 0.194123f, + -0.209872f, -0.071764f, -0.126960f, 0.199420f, 0.327116f, -0.169053f, + -0.429156f, 0.443429f, -0.225530f, -0.130738f, -0.028351f, 0.644393f, + 0.049606f, -0.243602f, -0.409920f, 0.117028f, -0.258557f, 0.073865f, + -0.200454f, -0.139957f, -0.031314f, 0.162325f, 0.247221f, 0.071909f, + -0.336276f, 0.079922f, 0.192780f, -0.148882f, 0.133192f, -0.143177f, + -0.121327f, 0.126221f, -0.089521f, -0.181826f, 0.149923f, -0.280682f, + 0.391572f, 0.108990f, -0.445494f, -0.170787f, 0.225182f, 0.223313f, + -0.234828f, -0.071072f, -0.072673f, -0.093686f, 0.223892f, -0.049377f, + 0.057976f, 0.033558f, 0.068733f, -0.283353f, 0.217877f, 0.158093f, + -0.276761f, -0.097049f, -0.351913f, -0.383604f, 0.002863f, -0.474510f, + -0.096738f, 0.256940f, 0.234203f, -0.226667f, -0.260576f, -0.183403f, + -0.035578f, 0.141570f, 0.078764f, -0.028086f, 0.155800f, -0.251115f, + -0.286703f, -0.014739f, 
-0.072621f, -0.311506f, -0.048639f, 0.081621f, + 0.043057f, 0.068136f, -0.179903f, 0.143699f, -0.002571f, 0.239012f, + 0.197456f, 0.035745f, -0.311927f, 0.220320f, 0.102687f, -0.294105f, + 0.426740f, 0.209050f, 0.211907f, 0.083453f, 0.006578f, -0.143338f, + 0.003157f, 0.040295f, 0.234497f, 0.035344f, -0.163909f, 0.411115f, + 0.289453f, -0.075357f, -0.008884f, 0.469798f, -0.033304f, -0.153293f, + -0.229322f, -0.004162f, 0.113363f, 0.395381f, 0.067414f, -0.188966f, + -0.117424f, -0.166423f, 0.066839f, 0.595641f, -0.204782f, -0.451727f, + 0.198509f, -0.921583f, -0.246765f, -0.153411f, 0.046491f, 0.365906f, + 0.376710f, -0.017355f, -0.035232f, 0.138785f, -0.163918f, -0.283449f, + -0.094340f, 0.192127f, 0.154815f, 0.035787f, -0.029087f, 0.115649f, + -0.220133f, -0.452741f, 0.311667f, 0.157666f, 0.091401f, 0.236040f, + -0.168523f, 0.122176f, -0.219016f, -0.214856f, 0.172824f, -0.091810f, + 0.031520f, -0.857420f, 0.643446f, -0.017471f, 0.206082f, -0.933517f, + -0.020070f, -0.065091f, -0.117680f, -1.271870f, -0.069177f, -0.149409f, + 0.289970f, -0.889775f, -0.044741f, 0.232647f, -0.319416f, 0.073030f, + 0.278549f, 0.238782f, -0.202206f, 0.272540f, 0.201412f, 0.175574f, + -0.127971f, -0.253164f, -0.086352f, -0.005381f, 0.114714f, 0.505169f, + -0.175049f, -1.534280f, -0.320666f, -2.119298f, -0.023075f, -0.021259f, + -0.161019f, 0.344837f, 0.361958f, -0.097050f, 0.014375f, 0.267110f, + 0.341442f, -0.016688f, 0.073393f, 0.131500f, 0.246331f, 0.011059f, + 0.033597f, 0.014779f, -0.269366f, -0.504788f, 0.048651f, 0.295682f, + 0.237363f, 0.227484f, -0.235814f, -0.160530f, 0.182682f, -0.172999f, + -0.126630f, 0.168357f, -0.078729f, 0.052805f, 0.377021f, -0.004727f, + 0.230415f, -0.876673f, 0.458457f, 0.099401f, -0.019616f, 0.611982f, + -0.231508f, -0.070894f, -0.056142f, 0.548969f, -0.376599f, -0.600428f, + 0.241930f, -0.592893f, 0.189371f, 0.488651f, -0.092446f, -0.272569f, + 0.251643f, 0.315945f, -0.301468f, 0.112961f, 0.052119f, -0.066076f, + -0.082249f, 0.252805f, -0.195539f, 0.150386f, -0.865534f, 0.673447f, + 0.030177f, -0.438528f, -1.006174f, 0.575176f, -0.271656f, 0.035835f, + -1.056916f, 0.495267f, -0.092428f, -0.109511f, -0.192359f, 0.166669f, + -0.624326f, -0.000354f, -0.089075f, 0.176279f, -0.289347f, 0.021346f, + 0.020375f, 0.255282f, -0.045588f, 0.173675f, 0.100957f, -0.294373f, + 0.049303f, -0.134132f, -0.255731f, -0.025559f, -0.307463f, -0.205100f, + 0.079024f, 0.101113f, 0.135742f, -0.348869f, -0.026759f, -0.134155f, + -0.179275f, -0.054297f, -0.054948f, 0.029351f, 0.190560f, 0.102476f, + -0.025785f, 0.169442f, -0.271303f, 0.200667f, 0.099063f, 0.074767f, + -0.326533f, 0.044426f, -0.290251f, -0.082443f, -0.164482f, -0.349412f, + 0.045109f, -0.157330f, 0.165935f, 0.012672f, -0.059818f, 0.399140f, + -0.316620f, 0.386638f, -0.285399f, -0.296777f, -0.200473f, -0.144232f, + 0.251851f, -0.203768f, 0.001071f, -0.179063f, 0.248952f, -0.143029f, + 0.010423f, -0.030293f, -0.046786f, -0.196195f, -0.016845f, 0.295023f, + 0.322825f, 0.133683f, 0.017388f, 0.142467f, 0.221320f, 0.004059f, + -0.115770f, 0.143363f, 0.137972f, -0.272584f, 0.489366f, -0.091828f, + -0.014703f, 0.082332f, -0.476226f, -0.202859f, 0.356094f, -0.283049f, + 0.218086f, 0.202015f, 0.201724f, 0.012617f, 0.050720f, 0.255695f, + 0.244653f, 0.111296f, -0.151450f, -0.056210f, -0.757348f, 0.441724f, + -0.022455f, -0.244662f, 0.296205f, -0.421883f, -0.217386f, -0.254301f, + 0.409105f, -0.031309f, 0.050147f, -0.337170f, -0.106620f, -0.606455f, + 0.308024f, 0.298144f, 0.363993f, 0.704870f, -0.047292f, 0.166901f, + 0.105991f, -0.536757f, 
-0.424031f, -0.226034f, 0.213635f, -0.526754f, + 0.310990f, -0.116038f, 0.007775f, 0.538330f, -0.177912f, 0.445357f, + -0.290365f, 0.451169f, 0.030931f, 0.033388f, 0.209905f, -0.244492f, + -0.097792f, -0.246042f, 0.132047f, 0.032576f, 0.115516f, 0.022890f, + 0.093508f, -0.071840f, 0.362948f, -0.135245f, 0.659911f, -0.321413f, + 0.193118f, -0.795001f, -0.218311f, 0.024862f, 0.206172f, -0.832878f, + -0.255670f, 0.343402f, -0.275211f, -0.898363f, -0.025172f, 0.158565f, + 0.171347f, -0.127518f, -0.215156f, -0.159198f, 0.250355f, -0.132452f, + 0.061254f, -0.097544f, -0.223246f, 0.013183f, 0.239468f, 0.259017f, + -0.217739f, -0.032263f, 0.123755f, -0.701777f, 0.150049f, -0.555293f, + 0.062430f, -0.260304f, 0.494894f, -0.168702f, -0.134829f, -0.113989f, + 0.150092f, -0.060248f, 0.115711f, -0.277202f, 0.499811f, 0.417116f, + 0.191081f, -0.376432f, -0.321092f, 0.033992f, 0.057193f, 0.127077f, + -0.009042f, 0.014443f, 0.142808f, -0.124349f, 0.213087f, -0.381686f, + 0.129726f, -0.038396f, +}; + +static const float av1_early_term_after_split_nn_bias_32_layer0[] = { + -0.107171f, 0.060848f, -0.069480f, -0.121982f, 0.037637f, -0.291839f, + 0.102257f, -0.065889f, -0.032452f, 0.034171f, -0.073984f, -0.005236f, + 0.218820f, 0.132123f, -0.089621f, -0.067679f, 0.049368f, 0.329444f, + -0.184729f, 0.031702f, 0.009735f, -0.039964f, -0.018024f, -0.073031f, + -0.030166f, -0.191037f, -0.074862f, -0.076548f, 0.076537f, 0.216609f, + -0.078358f, -0.007740f, +}; + +static const float av1_early_term_after_split_nn_weights_32_layer1[] = { + 0.047869f, -0.231773f, -0.185663f, 0.460676f, -0.208182f, 0.590555f, + -0.622627f, 0.279377f, 0.351681f, 0.633504f, 1.069884f, 0.332449f, + -0.457703f, -0.435817f, -0.028853f, 0.327490f, -0.282469f, -0.975792f, + -0.062975f, -0.147187f, 0.348340f, -1.207116f, 0.516159f, -1.509626f, + -0.805072f, 0.522999f, 0.143671f, 0.304246f, -0.360720f, -0.612472f, + 0.260045f, -0.223243f, +}; + +static const float av1_early_term_after_split_nn_bias_32_layer1[] = { + -0.07571174f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_32 = { + FEATURES, + 1, + 1, + { + HIDDEN_NODES, + }, + { + av1_early_term_after_split_nn_weights_32_layer0, + av1_early_term_after_split_nn_weights_32_layer1, + }, + { + av1_early_term_after_split_nn_bias_32_layer0, + av1_early_term_after_split_nn_bias_32_layer1, + }, +}; + +static const float av1_early_term_after_split_nn_weights_16_layer0[] = { + -0.113798f, 0.053357f, -0.037947f, -0.477171f, 0.276517f, -0.349252f, + -0.177284f, 0.189597f, 0.141744f, 0.230207f, -0.328104f, 0.074328f, + 0.247717f, 0.233533f, 0.145167f, 0.018029f, -0.398725f, -0.226199f, + -0.309724f, 0.125279f, 0.194759f, 0.025531f, 0.349714f, -0.273944f, + 0.186871f, 0.181735f, -0.520614f, -0.264076f, 0.308207f, 0.157438f, + -0.137791f, -0.054582f, 0.125879f, 0.796218f, -0.897562f, 0.885439f, + 0.381640f, 0.106625f, -2.027456f, 0.000874f, 0.179581f, 0.013287f, + -2.329439f, -0.163169f, -0.136191f, 0.320108f, -2.318779f, -0.196722f, + -0.295721f, 0.203658f, -0.182275f, 0.615941f, 0.015762f, 0.257181f, + -0.115297f, 0.295774f, -0.026144f, -0.022686f, -0.219423f, -0.042861f, + 0.207647f, -0.057791f, 0.201671f, -0.169569f, 0.291492f, -0.994991f, + 0.137473f, 0.230948f, 0.505626f, -1.065860f, 0.275225f, -0.250861f, + 0.519466f, -1.217242f, -0.087384f, 0.053441f, 0.030729f, -1.702304f, + -0.034635f, 0.010177f, -0.035422f, -0.749979f, 0.355499f, 0.408166f, + -0.086883f, 0.017203f, 0.195706f, -0.218056f, -0.029153f, 0.367335f, + -0.061732f, -0.241068f, 0.078496f, -0.370346f, -0.124223f, 
-0.172708f, + 0.037971f, 0.038875f, -0.282489f, -0.266323f, -0.210864f, 0.214714f, + 0.234695f, -0.045625f, 0.015357f, -0.007464f, -0.362003f, -0.113465f, + 0.145141f, 0.238470f, -0.202664f, -0.286587f, -0.347112f, 0.054501f, + -0.190290f, -0.283256f, 0.062179f, 0.041165f, -0.006935f, -0.220351f, + -0.088800f, 0.220924f, -0.200982f, 0.058493f, -0.225175f, 0.057175f, + -0.618187f, 0.761023f, -0.743774f, -0.500599f, -0.584999f, 1.545211f, + 0.123055f, -0.106848f, -0.353057f, 1.552187f, 0.174104f, 0.068060f, + -0.449859f, 1.254299f, -0.161716f, -0.060630f, -0.230721f, 0.165976f, + -0.101582f, -0.422415f, 0.110384f, -0.130098f, 0.104428f, 0.083518f, + 0.031626f, 0.083048f, 0.158877f, 0.173340f, 0.063962f, 0.427845f, + 0.663268f, 0.376996f, 0.146435f, -0.091329f, 0.443447f, 0.518432f, + -0.182777f, -0.091313f, 0.331229f, 0.532604f, -0.187001f, 0.054774f, + 0.298068f, 0.502295f, -0.362378f, 0.054283f, 0.292806f, 0.168901f, + -0.214787f, 0.025637f, 0.458009f, -0.322714f, -0.264059f, 0.140313f, + -0.102696f, -0.431208f, -0.134450f, -0.545415f, 0.253851f, -0.009061f, + -0.050681f, 0.108681f, 0.043272f, -1.073133f, 0.206410f, 0.469576f, + 0.291494f, -2.021244f, -0.001183f, -0.067542f, 0.364907f, -2.470543f, + 0.049147f, -0.018868f, 0.658500f, -2.531048f, 0.275433f, -0.034224f, + -0.171386f, 0.096369f, 0.728069f, 0.272332f, 0.222255f, -0.030426f, + 0.026994f, 0.208928f, -0.173943f, -0.227581f, -0.214798f, 0.079341f, + 0.032344f, -0.253575f, -0.044353f, -0.239265f, -0.055852f, -0.162582f, + -0.086592f, 0.066487f, 0.337353f, -0.168704f, 0.015702f, 0.022607f, + 0.286647f, 0.218106f, 0.193319f, -0.358714f, 0.030796f, 0.007646f, + -0.045617f, 0.165007f, -0.284641f, -0.291812f, 0.207544f, 0.082823f, + -0.141907f, -0.331336f, -0.052908f, 0.120716f, 0.202521f, 0.232782f, + -0.348141f, -0.017332f, 1.191126f, -0.391987f, -0.154537f, -0.206551f, + -2.378690f, 0.057918f, -0.328183f, 2.151556f, 0.238803f, 0.164880f, + -0.480039f, 1.616200f, 0.260243f, 0.083704f, -0.174461f, 1.804634f, + 0.194810f, 0.223837f, 0.550107f, -0.068171f, -0.293435f, -0.186770f, + -0.364846f, 0.127181f, 0.105556f, -0.016202f, 0.278403f, -0.344995f, + -0.009761f, -0.082555f, 0.046731f, -0.301452f, 0.604259f, 0.055895f, + 0.049862f, 0.314249f, -0.305811f, -0.112937f, 0.658787f, -0.549288f, + -0.307567f, -0.460650f, -0.840643f, 0.082576f, 0.373711f, 0.138318f, + 0.336901f, 0.284984f, -0.281400f, 0.408210f, -0.449858f, 0.461054f, + 0.227629f, -0.131705f, 0.301769f, -0.278540f, 0.189290f, -0.269041f, + 0.111350f, -0.300257f, 0.436858f, -0.265920f, -0.211938f, 0.272631f, + 0.206291f, 0.253273f, -0.229776f, -0.031112f, -0.171183f, -0.109676f, + -0.202390f, -0.068857f, 0.182125f, -0.140523f, -0.308742f, -0.045840f, + 0.256545f, -0.262405f, 0.225951f, -0.287463f, -0.189203f, -0.055552f, + -0.052448f, -0.242839f, -0.278877f, 0.140920f, -0.175755f, 0.215402f, + -0.248841f, -0.264080f, -0.178303f, 0.147777f, 0.049460f, -0.279877f, + -0.539725f, -0.004622f, 0.182874f, 0.338814f, 0.265974f, 0.249851f, + -0.141154f, 0.157228f, -0.090972f, 0.179444f, 0.305255f, 0.127788f, + 0.123270f, 0.355320f, 0.076797f, 0.263495f, 0.235965f, -0.133816f, + 0.243624f, 0.227062f, -0.213629f, 0.002075f, 0.061203f, -0.077820f, + -0.008807f, -0.247324f, -0.051464f, -0.191894f, -0.238713f, -0.389526f, + -0.274248f, 0.053950f, -0.225750f, -0.367097f, -0.122391f, 0.181212f, + -0.411824f, -0.084241f, -0.302288f, 0.077860f, -0.187443f, -0.300262f, + 0.083156f, -0.392461f, -0.332320f, -0.346474f, 0.140658f, -0.283656f, + 0.120714f, -0.056577f, -0.280968f, 0.017795f, -0.024686f, 
0.073113f, + -0.346637f, 0.082567f, -0.036556f, -0.369730f, 0.081225f, -0.005211f, + 0.144886f, -0.003544f, 0.178307f, -0.366035f, -0.063887f, -0.191767f, + 0.105835f, -0.273978f, -0.266532f, -0.023984f, 0.039166f, 0.065848f, + -0.026802f, -0.268923f, 0.189659f, 0.086300f, 0.030718f, 0.216565f, + -0.130025f, -0.215687f, 0.146341f, -0.286438f, -0.394226f, -0.181509f, + -0.005612f, 0.186040f, 0.133491f, 0.032096f, -0.261609f, 0.074007f, + -0.042929f, -0.234479f, 0.189704f, 0.088395f, -0.003671f, -0.125055f, + -0.252418f, -0.086387f, 0.111197f, -0.297071f, -0.018793f, -0.031902f, + -0.333191f, -0.186279f, 0.039868f, 0.091419f, -0.264438f, -0.216150f, + -0.212550f, 0.203412f, -0.113028f, -0.197169f, -0.346771f, 0.086066f, + 0.091443f, -0.128507f, -0.007281f, -0.118389f, 0.003370f, -0.338661f, + 0.026739f, -0.063571f, -0.281567f, -0.166824f, 0.167455f, 0.216173f, + 0.199163f, 0.256314f, -0.222679f, 0.040282f, -0.154808f, -0.133943f, + -0.270163f, -0.357398f, 0.260373f, 0.176950f, -0.125162f, -0.085050f, + 0.226376f, -0.124585f, -0.324804f, 0.035536f, -0.133600f, 0.173450f, + 0.068107f, -0.337442f, 0.169629f, 0.047223f, 0.057878f, 0.055555f, + -0.317449f, -0.103768f, 0.080899f, -0.194759f, -1.137593f, 0.508999f, + 0.045372f, 1.746454f, 1.250347f, -0.342930f, -0.127821f, -0.220175f, + -0.417649f, -0.480595f, 0.071902f, 0.050231f, -0.562554f, -0.677866f, + -0.121416f, -0.247558f, -0.483876f, -0.504157f, 1.731953f, 0.572936f, + 0.047325f, 0.050619f, 0.112611f, -0.035393f, 0.052585f, -0.071076f, + -0.015798f, -0.050228f, -0.142875f, 0.189329f, 0.048833f, 0.503633f, + 0.249588f, 0.175492f, -0.137664f, -0.018533f, 0.288453f, -0.025644f, + 0.079131f, 0.195096f, -0.154039f, -0.104220f, -0.224072f, 0.095946f, + -0.208424f, 0.214745f, 0.056468f, 0.182603f, 0.341784f, -0.134664f, + -0.194050f, 0.058532f, -0.107336f, -0.087783f, -0.238795f, -0.387212f, + 0.049055f, -0.127417f, -0.299919f, -0.094371f, -0.011735f, -0.264753f, + 0.407375f, -0.462654f, -0.609488f, 0.027742f, -0.985512f, -0.109154f, + -0.423276f, 2.347960f, 0.129240f, 0.187610f, -0.057081f, 2.424892f, + 0.087666f, 0.106716f, -0.039379f, 2.764866f, 0.113309f, 0.028196f, + -0.582789f, 0.335385f, -0.538029f, -0.477337f, -0.114207f, 0.178829f, + 0.006276f, 0.123179f, 0.095101f, 0.139898f, -0.372074f, -0.111010f, + 0.136330f, 0.272900f, 0.126737f, -0.097808f, -0.363697f, 0.108665f, + -0.227749f, -0.083421f, 1.714677f, 0.451943f, 0.107931f, -0.392281f, + 1.615846f, 0.022307f, -0.247011f, 0.257703f, 1.039134f, 0.537789f, + 0.022177f, -0.271532f, 0.351350f, -0.399205f, -0.240534f, -0.315399f, + 0.026928f, -0.005618f, 0.053179f, -0.010277f, 0.000501f, 0.040896f, + -0.109160f, 0.018282f, 0.003887f, 0.199599f, 0.095349f, -0.337284f, + 0.169929f, -0.109409f, -0.166983f, 0.059908f, -0.226574f, -0.120114f, + 0.077329f, -0.333133f, -0.220936f, 0.114309f, -0.233965f, -0.281551f, + 0.042948f, 0.100940f, 0.116037f, -0.313122f, 0.215149f, -0.309057f, + -0.341052f, -0.294417f, -0.179722f, 0.010795f, 0.192053f, -0.275261f, + -0.033077f, 0.117348f, 0.090206f, 0.781573f, 0.602456f, -0.220296f, + 0.172159f, 0.758513f, 0.157910f, -0.217897f, -0.372659f, 0.031935f, + 0.791463f, 0.267195f, 0.931593f, -0.057349f, 0.405512f, -0.058512f, + -0.641663f, -0.076592f, 0.550227f, -0.024094f, 0.048218f, -0.289971f, + 0.180940f, 0.167533f, 0.052711f, -0.360726f, 0.019210f, -0.488879f, + 0.380498f, 0.151608f, -0.276895f, -0.596554f, 0.106076f, -0.245833f, + -0.048783f, 0.073823f, 0.098780f, 0.000211f, 0.113958f, -0.068964f, + -0.265533f, -0.185457f, 0.175586f, -0.163621f, -0.204919f, 
0.145802f, + -0.163421f, 0.129576f, -0.153486f, -0.105573f, 0.067289f, -0.213120f, + -0.286103f, 0.249543f, -0.044970f, -0.170464f, -0.105501f, -0.094765f, + -0.050734f, -0.369468f, 0.180020f, -0.363328f, -0.151654f, -0.262550f, + -0.424503f, 0.829032f, -0.559452f, 0.506837f, 0.143823f, 0.276660f, + -1.808608f, -0.259517f, -0.053945f, 0.035676f, -1.842195f, -0.065960f, + -0.069285f, 0.462022f, -2.319453f, -0.370299f, 0.183329f, -0.146412f, + -0.563875f, 0.305068f, 0.480904f, 0.044319f, -0.016098f, 0.168516f, + 0.114874f, -0.097621f, -0.030373f, 0.177700f, 0.181591f, -0.146003f, + -0.330853f, -0.259200f, 0.779319f, -1.517524f, 0.178781f, 0.135451f, + 0.088784f, -2.076089f, 0.628717f, -0.048685f, 0.281327f, -2.341596f, + 0.422171f, 0.006135f, 0.367096f, -1.663118f, 0.365253f, -0.072884f, + -0.197620f, -0.688634f, 0.477354f, 0.395841f, -0.098505f, 0.208709f, + -0.027523f, 0.127119f, 0.106274f, 0.114424f, -0.122877f, -0.087245f, + 0.086923f, -0.527398f, -0.342062f, -0.764662f, 0.713094f, -0.626453f, + -0.081454f, -0.087683f, 0.885047f, 0.323440f, -0.018579f, -0.217166f, + 1.617984f, -0.159038f, 0.265991f, -0.390313f, 1.933182f, -0.032431f, + -0.057513f, -0.300841f, 0.461248f, -0.072147f, -0.287052f, -0.078056f, + 0.011734f, 0.044013f, 0.177174f, 0.093400f, 0.028819f, 0.193686f, + -0.224853f, 0.268321f, -0.075059f, 0.074526f, -0.015618f, 0.165615f, + -0.276780f, -0.063908f, -0.369264f, -0.171497f, -0.173624f, -0.130743f, + -0.224625f, -0.124980f, -0.104482f, 0.076864f, -0.009631f, -0.164682f, + 0.150480f, -0.111880f, -0.260425f, 0.086234f, -0.176936f, -0.136771f, + -0.168867f, -0.405626f, -0.288716f, -0.128950f, -0.207327f, 0.015581f, + -0.109061f, -0.098970f, 0.090792f, -0.109623f, 0.349851f, 0.266341f, + -0.088602f, -0.108071f, 0.082519f, 0.472650f, -1.838758f, 0.456694f, + 0.119927f, 0.461077f, -2.860022f, 0.231495f, 0.235771f, 0.256424f, + -1.938516f, -0.188202f, -0.000832f, -0.518206f, 0.194644f, 0.505510f, + 0.615657f, 0.193760f, 0.224600f, 0.265732f, -0.121553f, -0.354597f, + -0.242414f, -0.276639f, -0.057591f, 0.026369f, -0.261148f, -0.356155f, + -0.149178f, -0.353566f, -0.340835f, -0.141776f, 0.076535f, 0.221299f, + -0.108857f, -0.156514f, 0.050901f, 0.058541f, -0.077141f, 0.071515f, + -0.333283f, -0.181489f, -0.212900f, -0.224698f, -0.174693f, -0.178665f, + -0.143374f, -0.091811f, 0.165161f, 0.060156f, -0.086103f, -0.039031f, + -0.377759f, -0.370533f, 0.074431f, 0.064192f, 0.186576f, 0.447858f, + -0.082260f, -0.020268f, -0.123089f, -0.402017f, 0.080500f, 0.176286f, + 2.850013f, 0.019385f, -0.225361f, -0.235315f, 1.654694f, -0.073978f, + -0.341412f, -1.187575f, 2.815900f, -0.228063f, -0.174547f, 0.623825f, + -0.010676f, 0.157189f, 0.111879f, -0.198965f, 0.051851f, 0.158396f, + 0.045194f, 0.293531f, -0.246714f, -0.351493f, 0.026954f, 0.076233f, + 0.420367f, 0.168154f, -0.131450f, 0.134487f, -0.288851f, -0.134553f, + 0.014902f, 0.756381f, 0.277713f, 0.190080f, -0.020869f, 1.446672f, + 0.029792f, -0.025927f, 0.060640f, 0.559864f, 0.422229f, 0.198459f, + 0.036167f, 0.029432f, 0.001882f, 0.038480f, -0.160528f, -0.288855f, + -0.310886f, 0.291296f, 0.190558f, -0.182816f, -0.002252f, 0.073101f, + -0.172245f, -0.305980f, 0.112492f, -0.422839f, -0.295999f, -0.078160f, + -0.173405f, -0.032819f, 0.373774f, -0.715223f, 0.018911f, 0.131753f, + -0.237364f, -0.128499f, -0.228406f, 0.341619f, 0.343552f, -0.521581f, + -0.263790f, 0.362502f, -0.018450f, 0.054233f, 0.183068f, 0.382772f, + 0.188811f, -0.627287f, 0.040399f, -0.487338f, -0.192591f, 0.247426f, + 0.154372f, -0.483994f, +}; + +static const 
float av1_early_term_after_split_nn_bias_16_layer0[] = { + -0.173976f, 0.305495f, 0.250981f, -0.067127f, -0.313100f, 0.242464f, + 0.315196f, -0.056052f, -0.241227f, -0.253308f, -0.002697f, 0.003687f, + -0.124421f, -0.090383f, -0.070366f, -0.064074f, -0.056115f, 0.123313f, + -0.239698f, -0.182082f, -0.065296f, 0.021503f, -0.036787f, 0.311861f, + 0.118135f, -0.320456f, -0.110719f, 0.220692f, -0.071727f, -0.088226f, + -0.110874f, -0.111671f, +}; + +static const float av1_early_term_after_split_nn_weights_16_layer1[] = { + -0.338573f, 0.398159f, 0.314774f, -0.037448f, -0.271950f, -0.774991f, + 0.950901f, -0.225380f, -1.841906f, -0.350379f, -0.079350f, 0.383148f, + -0.183676f, -0.313132f, -0.340820f, -0.309401f, -1.050540f, -0.432267f, + -0.657195f, 0.927632f, -0.040150f, 0.578920f, 0.212301f, 0.292495f, + 0.563590f, -0.205735f, 0.195877f, 0.582122f, -0.217860f, 1.613379f, + 0.313278f, -0.555802f, +}; + +static const float av1_early_term_after_split_nn_bias_16_layer1[] = { + 0.16553f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_16 = { + FEATURES, + 1, + 1, + { + HIDDEN_NODES, + }, + { + av1_early_term_after_split_nn_weights_16_layer0, + av1_early_term_after_split_nn_weights_16_layer1, + }, + { + av1_early_term_after_split_nn_bias_16_layer0, + av1_early_term_after_split_nn_bias_16_layer1, + }, +}; + +static const float av1_early_term_after_split_nn_weights_8_layer0[] = { + -0.719472f, 0.305806f, 0.855829f, 0.100094f, 0.412517f, 1.254673f, + 1.552105f, -5.890773f, -0.089957f, -0.016736f, 1.418074f, -5.393506f, + -0.028214f, 0.117758f, 1.479209f, -5.299794f, 0.171585f, -0.084182f, + -0.162105f, 0.388577f, -0.044319f, -0.025861f, 0.251782f, -0.181462f, + -0.101545f, -0.079999f, -0.033014f, -0.191627f, -0.032802f, -0.053404f, + 0.038038f, -0.119492f, 0.049104f, -0.344384f, -0.354513f, 0.036977f, + 0.017513f, -0.004025f, -0.163212f, -0.261999f, 0.146575f, 0.207541f, + 0.130365f, -0.252127f, 0.097419f, -0.231057f, -0.309421f, 0.347866f, + -0.064670f, -0.283171f, -0.244193f, -0.193323f, -0.226954f, -0.276194f, + -0.233553f, 0.156354f, -0.184009f, 0.344289f, -0.308058f, -0.205202f, + -0.325068f, 0.183820f, -0.361667f, -0.069559f, -0.121834f, -0.038357f, + -0.210043f, -0.266129f, 0.003188f, 0.074902f, -0.328843f, 0.293679f, + -0.234698f, -0.428268f, -0.308772f, -0.136538f, -0.008384f, -0.078227f, + 0.166074f, -0.262899f, 0.102114f, -0.323420f, 0.057064f, -0.203318f, + -0.397413f, -0.317324f, -0.307093f, 0.020574f, -0.188627f, 0.132529f, + 0.118992f, -0.487387f, -0.282975f, 0.573231f, -0.266071f, 0.125140f, + -0.970034f, 1.424008f, -0.487366f, -0.196415f, 3.680273f, -0.008407f, + 0.081109f, -0.187479f, 3.876021f, 0.159168f, 0.111721f, -0.337423f, + 3.901760f, 0.261268f, -0.245555f, -0.187632f, -0.324298f, 0.167234f, + 0.170986f, -0.473055f, 0.087016f, -0.003469f, 0.051035f, 0.251794f, + 0.153549f, 0.217609f, -0.326870f, -0.175511f, 0.637341f, -0.694837f, + -0.873487f, -0.186614f, -1.089884f, -0.607316f, -0.523519f, 5.256331f, + 0.071414f, 0.215265f, -0.835999f, 5.735746f, 0.300101f, 0.089626f, + -0.450261f, 5.608051f, 0.190491f, 0.110220f, -0.595360f, -0.446324f, + 0.311380f, 0.268812f, -0.339656f, -0.008708f, 0.011111f, -0.027557f, + 0.171534f, 0.000676f, 0.227232f, 0.033993f, 0.146684f, 0.094817f, + -0.175381f, -0.211927f, -0.362471f, 0.168834f, 0.264149f, -0.350538f, + -0.463249f, -0.288105f, 0.347155f, 0.183231f, -0.229732f, -0.252202f, + -0.218074f, -0.008769f, -0.156103f, 0.181233f, -0.354736f, 0.263270f, + -0.106636f, 0.081057f, 0.060634f, -0.046887f, 0.050468f, 0.071259f, + 
0.221287f, 0.199071f, -0.180185f, -0.406902f, -0.239351f, -0.034957f, + 0.369140f, 0.864600f, 0.233798f, 0.423612f, -0.468918f, 0.976987f, + 0.691198f, -1.597908f, 0.102926f, 0.305546f, 0.391196f, -3.909059f, + 0.333635f, 0.311561f, 0.738886f, -4.002001f, 0.236394f, -0.233141f, + 0.263342f, 0.679898f, 0.136233f, 0.254743f, -0.367571f, 0.066412f, + 0.001606f, -0.059542f, 0.051726f, -0.347145f, -0.045501f, -0.313847f, + -0.021952f, 1.386316f, -0.579139f, -1.275844f, -0.003493f, -1.716577f, + 0.250209f, 0.192086f, 4.177055f, 0.351835f, 0.338177f, 0.140163f, + 4.099592f, 0.321866f, -0.128153f, -0.360414f, 4.350767f, 0.025943f, + -0.116740f, -0.664107f, -0.064558f, -0.039553f, -0.208186f, -0.678774f, + 0.149441f, -0.019823f, 0.012759f, 0.404442f, -0.108881f, 0.067974f, + -0.188278f, 0.136327f, 0.109927f, -0.179270f, -0.272342f, 0.018064f, + -0.304216f, -0.469470f, 0.109310f, -0.326214f, 0.061909f, -0.278997f, + -0.352329f, -0.333770f, -0.186522f, -0.328567f, -0.206211f, -0.008804f, + 0.042441f, -0.126699f, -0.420399f, -0.033842f, 0.016773f, -0.273789f, + 0.081928f, -0.191552f, -0.179533f, -0.263070f, -0.471807f, 0.062601f, + -0.232576f, 0.082955f, -0.490080f, 0.073820f, -0.090384f, 0.035781f, + -0.158880f, -0.506793f, -0.069132f, 0.047602f, -0.349640f, -0.058389f, + -0.017387f, -0.194636f, -0.457227f, -0.143105f, 0.222045f, -0.548909f, + -0.131561f, 0.247196f, -0.207923f, 0.133056f, -0.509854f, -0.193685f, + -0.181327f, -0.242442f, 0.091821f, 0.114430f, -0.375233f, -0.015254f, + -0.336632f, -0.060279f, -0.169169f, -0.429914f, -0.036563f, -0.400560f, + -0.076332f, -0.186232f, -0.268491f, 0.075561f, -0.389082f, -0.077435f, + 0.352562f, -0.020086f, -0.338181f, -0.404629f, 0.254983f, 0.150477f, + -0.265903f, 0.003341f, 0.099969f, -0.211964f, -0.129372f, -0.166366f, + 0.327712f, -0.276234f, 0.140675f, -0.433677f, -0.163050f, -0.143578f, + -0.397840f, -0.422130f, -0.293835f, -0.075362f, -0.468375f, 1.021238f, + 1.394155f, -0.922486f, -1.350222f, 2.030201f, 0.057717f, 0.227650f, + -0.193179f, 0.037224f, 0.065555f, 0.020558f, -0.059205f, -0.023690f, + -0.008718f, 0.095976f, -0.549587f, -0.321164f, -0.243728f, 1.344381f, + -1.254107f, 0.294244f, -0.154737f, -0.152597f, 0.342419f, 0.301883f, + 0.069866f, -0.327766f, 0.209323f, -0.364913f, -0.005530f, -0.558972f, + 0.057684f, -0.309357f, -0.283325f, -0.278445f, -0.420115f, -0.418457f, + -0.391481f, -0.418460f, -0.003897f, -0.023744f, -0.312330f, -0.366213f, + 0.269628f, -0.274877f, -0.189988f, -0.419555f, -0.034033f, 0.192874f, + -0.135487f, -0.326108f, -0.039019f, 0.185029f, -0.264883f, -0.563447f, + -0.163532f, -0.447652f, -0.141851f, 0.001714f, -0.193184f, 0.032609f, + -0.112883f, 0.074599f, 0.490665f, 0.434764f, 0.021652f, -0.219618f, + 0.743267f, 0.147195f, -0.303479f, -0.097674f, 0.195813f, 0.704007f, + -1.290851f, 0.119701f, 0.224065f, 0.260246f, -0.580657f, -0.096201f, + -0.333214f, -0.586689f, 0.567178f, 0.157340f, -0.043184f, 0.194358f, + -0.026506f, -0.339894f, -0.571803f, -0.234828f, 0.147054f, -0.564178f, + -0.156933f, -0.366055f, -0.691687f, -0.187501f, 0.215834f, -0.346106f, + -0.256892f, 0.110915f, -0.337464f, -0.341474f, -0.216113f, 0.249445f, + -0.070175f, -0.412141f, 0.153458f, -0.081280f, 0.164669f, -0.356396f, + -0.294971f, -0.165121f, -0.133585f, -0.071467f, 0.295147f, -0.253233f, + -0.213833f, -0.343416f, -0.474344f, -0.304000f, -0.341379f, -0.331456f, + -0.393952f, -0.508004f, -0.569518f, -0.509864f, 0.121961f, 0.011957f, + 0.000498f, -0.201969f, -0.407195f, -0.414375f, -0.295846f, 0.247492f, + 0.124249f, -0.550804f, -0.420397f, 
-0.123462f, 0.333292f, -0.240230f, + -0.025604f, 0.337536f, -0.295006f, -0.272614f, -0.496850f, -0.278521f, + 0.234591f, -0.052775f, -0.014052f, -0.260078f, -0.279128f, -0.036385f, + 0.008714f, -0.064018f, -0.124873f, -0.334014f, +}; + +static const float av1_early_term_after_split_nn_bias_8_layer0[] = { + 1.202379f, -0.117005f, -0.135527f, -0.262255f, -0.443658f, -0.078981f, + 0.615653f, -0.124482f, -0.227768f, -0.227014f, -0.135898f, 0.143216f, + -0.225995f, 0.370877f, -0.214821f, -0.227752f, +}; + +static const float av1_early_term_after_split_nn_weights_8_layer1[] = { + 0.376594f, 0.266703f, -0.039847f, 1.680142f, -0.879939f, 0.286806f, + -0.378223f, -0.405295f, -0.021107f, 0.039188f, 0.259308f, 0.193091f, + 0.077994f, -0.269141f, 0.011180f, -0.019262f, +}; + +static const float av1_early_term_after_split_nn_bias_8_layer1[] = { + -1.29585564f, +}; + +static const NN_CONFIG av1_early_term_after_split_nnconfig_8 = { + FEATURES, + 1, + 1, + { + 16, + }, + { + av1_early_term_after_split_nn_weights_8_layer0, + av1_early_term_after_split_nn_weights_8_layer1, + }, + { + av1_early_term_after_split_nn_bias_8_layer0, + av1_early_term_after_split_nn_bias_8_layer1, + }, +}; +#undef FEATURES +#undef HIDDEN_NODES + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_ diff --git a/libs/libaom/src/av1/encoder/partition_strategy.c b/libs/libaom/src/av1/encoder/partition_strategy.c new file mode 100644 index 000000000..cc820ba24 --- /dev/null +++ b/libs/libaom/src/av1/encoder/partition_strategy.c @@ -0,0 +1,1288 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/system_state.h" + +#include "av1/common/enums.h" +#include "av1/common/reconinter.h" + +#if !CONFIG_REALTIME_ONLY +#include "av1/encoder/cnn.h" +#include "av1/encoder/partition_model_weights.h" +#include "av1/encoder/partition_cnn_weights.h" +#endif +#include "av1/encoder/encoder.h" + +#include "av1/encoder/motion_search_facade.h" +#include "av1/encoder/partition_strategy.h" +#include "av1/encoder/rdopt.h" + +#if !CONFIG_REALTIME_ONLY +static AOM_INLINE void simple_motion_search_prune_part_features( + AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row, + int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get); +#endif + +static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) { + switch (bsize) { + case BLOCK_128X128: return 0; + case BLOCK_64X64: return 1; + case BLOCK_32X32: return 2; + case BLOCK_16X16: return 3; + case BLOCK_8X8: return 4; + default: assert(0 && "Invalid bsize"); return -1; + } +} + +#if !CONFIG_REALTIME_ONLY +// TODO(chiyotsai@google.com): This is very much a work in progress. 
We still
+// need to do the following:
+// -- add support for hdres
+// -- add support for pruning rectangular partitions
+// -- use reconstructed pixels instead of source pixels for padding
+// -- use chroma pixels in addition to luma pixels
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                  int bsize, int quad_tree_idx,
+                                  int *partition_none_allowed,
+                                  int *partition_horz_allowed,
+                                  int *partition_vert_allowed,
+                                  int *do_rectangular_split,
+                                  int *do_square_split) {
+  assert(cm->seq_params.sb_size >= BLOCK_64X64 &&
+         "Invalid sb_size for intra_cnn!");
+  const int bsize_idx = convert_bsize_to_idx(bsize);
+
+  if (bsize == BLOCK_128X128) {
+    return;
+  }
+
+  // Precompute the CNN part and cache the result in MACROBLOCK
+  if (bsize == BLOCK_64X64 && !x->cnn_output_valid) {
+    aom_clear_system_state();
+    const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config;
+
+    // Prepare the output
+    const CNN_THREAD_DATA thread_data = { .num_workers = 1, .workers = NULL };
+    const int num_outputs = 4;
+    const int output_dims[4] = { 1, 2, 4, 8 };
+    const int out_chs[4] = { CNN_BRANCH_0_OUT_CH, CNN_BRANCH_1_OUT_CH,
+                             CNN_BRANCH_2_OUT_CH, CNN_BRANCH_3_OUT_CH };
+    float *output_buffer[CNN_TOT_OUT_CH];
+
+    float **cur_output_buf = output_buffer;
+    float *curr_buf_ptr = x->cnn_buffer;
+    for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
+      const int num_chs = out_chs[output_idx];
+      const int ch_size = output_dims[output_idx] * output_dims[output_idx];
+      for (int ch = 0; ch < num_chs; ch++) {
+        cur_output_buf[ch] = curr_buf_ptr;
+        curr_buf_ptr += ch_size;
+      }
+      cur_output_buf += num_chs;
+    }
+
+    CNN_MULTI_OUT output = {
+      .num_outputs = 4,
+      .output_channels = out_chs,
+      .output_strides = output_dims,
+      .output_buffer = output_buffer,
+    };
+
+    // Prepare the input
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const int bit_depth = xd->bd;
+    const int dc_q =
+        av1_dc_quant_QTX(x->qindex, 0, bit_depth) >> (bit_depth - 8);
+    x->log_q = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+    x->log_q = (x->log_q - av1_intra_mode_cnn_partition_mean[0]) /
+               av1_intra_mode_cnn_partition_std[0];
+
+    const int width = 65, height = 65,
+              stride = x->plane[AOM_PLANE_Y].src.stride;
+
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *image[1] = {
+        CONVERT_TO_SHORTPTR(x->plane[AOM_PLANE_Y].src.buf) - stride - 1
+      };
+
+      av1_cnn_predict_img_multi_out_highbd(image, width, height, stride,
+                                           cnn_config, &thread_data, bit_depth,
+                                           &output);
+    } else {
+      uint8_t *image[1] = { x->plane[AOM_PLANE_Y].src.buf - stride - 1 };
+
+      av1_cnn_predict_img_multi_out(image, width, height, stride, cnn_config,
+                                    &thread_data, &output);
+    }
+
+    x->cnn_output_valid = 1;
+  }
+
+  if (!x->cnn_output_valid) {
+    return;
+  }
+
+  const NN_CONFIG *dnn_configs[5] = {
+    NULL,
+    &av1_intra_mode_cnn_partition_branch_0_dnn_config,
+    &av1_intra_mode_cnn_partition_branch_1_dnn_config,
+    &av1_intra_mode_cnn_partition_branch_2_dnn_config,
+    &av1_intra_mode_cnn_partition_branch_3_dnn_config,
+  };
+
+  const NN_CONFIG *dnn_config = dnn_configs[bsize_idx];
+
+  aom_clear_system_state();
+  float dnn_features[100];
+  float logits[4] = { 0.0f };
+
+  const float *branch_0 = x->cnn_buffer;
+  const float *branch_1 = branch_0 + CNN_BRANCH_0_OUT_SIZE;
+  const float *branch_2 = branch_1 + CNN_BRANCH_1_OUT_SIZE;
+  const float *branch_3 = branch_2 + CNN_BRANCH_2_OUT_SIZE;
+
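+  // Layout note (this mirrors the pointer fill loop in the precompute block
+  // above): the four branch outputs sit back-to-back in x->cnn_buffer, with
+  // branch k holding its out_chs[k] feature maps contiguously, each map
+  // spanning output_dims[k] * output_dims[k] floats. The per-bsize packing
+  // below therefore indexes branch_k[spatial_idx + ch_idx * spatial_stride].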
+  if (bsize == BLOCK_64X64) {
+    int f_idx = 0;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_0_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_0[ch_idx];
+    }
+
+    const int spa_stride = 2 * 2;
+    for (int lin_idx = 0; lin_idx < spa_stride; lin_idx++) {
+      for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+        dnn_features[f_idx++] = branch_1[lin_idx + ch_idx * spa_stride];
+      }
+    }
+    dnn_features[f_idx++] = x->log_q;
+  } else if (bsize == BLOCK_32X32) {
+    int f_idx = 0;
+    for (int idx = 0; idx < CNN_BRANCH_0_OUT_CH; idx++) {
+      dnn_features[f_idx++] = branch_0[idx];
+    }
+
+    const int curr_lin_idx = quad_to_linear_1[quad_tree_idx - 1];
+    const int spa_stride = 2 * 2;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_1[curr_lin_idx + ch_idx * spa_stride];
+    }
+    dnn_features[f_idx++] = x->log_q;
+  } else if (bsize == BLOCK_16X16) {
+    int f_idx = 0;
+    const int prev_quad_idx = (quad_tree_idx - 1) / 4;
+    const int prev_lin_idx = quad_to_linear_1[prev_quad_idx - 1];
+    const int prev_spa_stride = 2 * 2;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_1_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_1[prev_lin_idx + ch_idx * prev_spa_stride];
+    }
+
+    const int curr_lin_idx = quad_to_linear_2[quad_tree_idx - 5];
+    const int spa_stride = 4 * 4;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_2[curr_lin_idx + ch_idx * spa_stride];
+    }
+    dnn_features[f_idx++] = x->log_q;
+  } else if (bsize == BLOCK_8X8) {
+    int f_idx = 0;
+    const int prev_quad_idx = (quad_tree_idx - 1) / 4;
+    const int prev_lin_idx = quad_to_linear_2[prev_quad_idx - 5];
+    const int prev_spa_stride = 4 * 4;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_2_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_2[prev_lin_idx + ch_idx * prev_spa_stride];
+    }
+
+    const int curr_lin_idx = quad_to_linear_3[quad_tree_idx - 21];
+    const int spa_stride = 8 * 8;
+    for (int ch_idx = 0; ch_idx < CNN_BRANCH_3_OUT_CH; ch_idx++) {
+      dnn_features[f_idx++] = branch_3[curr_lin_idx + ch_idx * spa_stride];
+    }
+    dnn_features[f_idx++] = x->log_q;
+  } else {
+    assert(0 && "Invalid bsize in intra_cnn partition");
+  }
+
+  // Make decision
+  av1_nn_predict(dnn_features, dnn_config, 1, logits);
+  aom_clear_system_state();
+
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  float split_only_thresh = 100.0f, no_split_thresh = -100.0f;
+  if (is_720p_or_larger) {
+    split_only_thresh =
+        av1_intra_mode_cnn_partition_split_thresh_hdres[bsize_idx];
+    no_split_thresh =
+        av1_intra_mode_cnn_partition_no_split_thresh_hdres[bsize_idx];
+  } else if (is_480p_or_larger) {
+    split_only_thresh =
+        av1_intra_mode_cnn_partition_split_thresh_midres[bsize_idx];
+    no_split_thresh =
+        av1_intra_mode_cnn_partition_no_split_thresh_midres[bsize_idx];
+  } else {
+    split_only_thresh =
+        av1_intra_mode_cnn_partition_split_thresh_lowres[bsize_idx];
+    no_split_thresh =
+        av1_intra_mode_cnn_partition_no_split_thresh_lowres[bsize_idx];
+  }
+
+  if (logits[0] > split_only_thresh) {
+    *partition_none_allowed = 0;
+    *partition_horz_allowed = 0;
+    *partition_vert_allowed = 0;
+    *do_rectangular_split = 0;
+  }
+
+  if (logits[0] < no_split_thresh) {
+    *do_square_split = 0;
+  }
+}
+
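+// For reference, a minimal sketch of the fully connected evaluation that
+// av1_nn_predict is assumed to perform on the NN_CONFIG tables defined in
+// partition_model_weights.h: each hidden layer is an affine transform
+// followed by a ReLU, and the final layer emits the raw logits that the
+// callers compare against their thresholds. Field names follow the NN_CONFIG
+// initializers above; the weight layout (row-major, one row per output node)
+// is an assumption. Illustrative only, kept out of the build.
+#if 0
+static void nn_forward_sketch(const float *input, const NN_CONFIG *cfg,
+                              float *output) {
+  float buf[2][128];  // scratch; assumes every layer has <= 128 nodes
+  const float *layer_in = input;
+  int num_in = cfg->num_inputs;
+  for (int layer = 0; layer <= cfg->num_hidden_layers; ++layer) {
+    const int is_output = (layer == cfg->num_hidden_layers);
+    const int num_out =
+        is_output ? cfg->num_outputs : cfg->num_hidden_nodes[layer];
+    const float *w = cfg->weights[layer];  // row-major: [num_out][num_in]
+    const float *b = cfg->bias[layer];
+    float *layer_out = is_output ? output : buf[layer & 1];
+    for (int node = 0; node < num_out; ++node) {
+      float val = b[node];
+      for (int i = 0; i < num_in; ++i) {
+        val += w[node * num_in + i] * layer_in[i];
+      }
+      // ReLU on hidden layers; the logits stay linear.
+      layer_out[node] = is_output ? val : (val > 0.0f ? val : 0.0f);
+    }
+    layer_in = layer_out;
+    num_in = num_out;
+  }
+}
+#endif
+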
convert_bsize_to_idx(bsize);
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  // res_idx is 0 for res < 480p, 1 for 480p, 2 for 720p+
+  const int res_idx = is_480p_or_larger + is_720p_or_larger;
+
+  assert(bsize_idx >= 0 && bsize_idx <= 4 &&
+         "Invalid bsize in simple_motion_search_based_split");
+
+  const float *ml_mean = av1_simple_motion_search_split_mean[bsize_idx];
+  const float *ml_std = av1_simple_motion_search_split_std[bsize_idx];
+  const NN_CONFIG *nn_config =
+      av1_simple_motion_search_split_nn_config[bsize_idx];
+  const int agg = cpi->sf.part_sf.simple_motion_search_prune_agg;
+
+  const float split_only_thresh =
+      av1_simple_motion_search_split_thresh[agg][res_idx][bsize_idx];
+  const float no_split_thresh =
+      av1_simple_motion_search_no_split_thresh[agg][res_idx][bsize_idx];
+
+  float features[FEATURE_SIZE_SMS_SPLIT] = { 0.0f };
+  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+                                           bsize, features,
+                                           FEATURE_SMS_SPLIT_MODEL_FLAG);
+  for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) {
+    features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx];
+  }
+
+  float score = 0.0f;
+
+  av1_nn_predict(features, nn_config, 1, &score);
+  aom_clear_system_state();
+
+  if (score > split_only_thresh) {
+    *partition_none_allowed = 0;
+    *partition_horz_allowed = 0;
+    *partition_vert_allowed = 0;
+    *do_rectangular_split = 0;
+  }
+
+  if (cpi->sf.part_sf.simple_motion_search_split >= 2 &&
+      score < no_split_thresh) {
+    *do_square_split = 0;
+  }
+}
+
+// Given a list of ref frames in refs, performs simple_motion_search on each
+// of the refs and returns the ref with the smallest sse. Returns -1 if none
+// of the refs in the list is available. Also stores the best sse and var in
+// best_sse and best_var, respectively. If save_mv is 0, don't update
+// start_mvs in pc_tree. If save_mv is 1, update start_mvs under pc_tree and
+// its subtrees.
+static int simple_motion_search_get_best_ref(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, const int *const refs, int num_refs,
+    int use_subpixel, int save_mv, unsigned int *best_sse,
+    unsigned int *best_var) {
+  const AV1_COMMON *const cm = &cpi->common;
+  int best_ref = -1;
+
+  if (mi_col >= cm->mi_params.mi_cols || mi_row >= cm->mi_params.mi_rows) {
+    // If the whole block is outside of the image, set the var and sse to 0.
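Stepping back to av1_simple_motion_search_based_split above: the single NN score is tested against two thresholds, which carves the score axis into three bands. A compact restatement of that gating (threshold values come from the av1_simple_motion_search_*_thresh tables; this is only a summary, not patch content):

  //   score > split_only_thresh : commit to PARTITION_SPLIT
  //                               (none/horz/vert/rect searches disabled)
  //   score < no_split_thresh   : rule out PARTITION_SPLIT
  //                               (only when simple_motion_search_split >= 2)
  //   otherwise                 : keep the full partition search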
+    *best_var = 0;
+    *best_sse = 0;
+
+    return best_ref;
+  }
+
+  // Otherwise, loop through the reference frames and find the one with the
+  // minimum SSE.
+  const MACROBLOCKD *xd = &x->e_mbd;
+
+  const int num_planes = 1;
+
+  *best_sse = INT_MAX;
+
+  for (int ref_idx = 0; ref_idx < num_refs; ref_idx++) {
+    const int ref = refs[ref_idx];
+
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) {
+      const FULLPEL_MV *start_mvs = pc_tree->start_mvs;
+      unsigned int curr_sse = 0, curr_var = 0;
+      int_mv best_mv =
+          av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
+                                   start_mvs[ref], num_planes, use_subpixel);
+      curr_var = cpi->fn_ptr[bsize].vf(
+          x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
+          xd->plane[0].dst.stride, &curr_sse);
+      if (curr_sse < *best_sse) {
+        *best_sse = curr_sse;
+        *best_var = curr_var;
+        best_ref = ref;
+      }
+
+      if (save_mv) {
+        pc_tree->start_mvs[ref].row = best_mv.as_mv.row / 8;
+        pc_tree->start_mvs[ref].col = best_mv.as_mv.col / 8;
+
+        if (bsize >= BLOCK_8X8) {
+          for (int r_idx = 0; r_idx < 4; r_idx++) {
+            // Propagate the new motion vectors to a lower level
+            PC_TREE *sub_tree = pc_tree->split[r_idx];
+            sub_tree->start_mvs[ref] = pc_tree->start_mvs[ref];
+          }
+        }
+      }
+    }
+  }
+
+  return best_ref;
+}
+
+// Collects features using simple_motion_search and stores them in features.
+// The features are also cached in PC_TREE. By default, the features collected
+// are the sse and var from the subblocks flagged by features_to_get.
+// Furthermore, if features is not NULL, then 7 more features are appended to
+// the end of features:
+// - log(1.0 + dc_q ** 2)
+// - whether an above macroblock exists
+// - width of above macroblock
+// - height of above macroblock
+// - whether a left macroblock exists
+// - width of left macroblock
+// - height of left macroblock
+static AOM_INLINE void simple_motion_search_prune_part_features(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, float *features, int features_to_get) {
+  const int w_mi = mi_size_wide[bsize];
+  const int h_mi = mi_size_high[bsize];
+  assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
+         cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+
+  // Setting up motion search
+  const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ?
ALTREF_FRAME + : LAST_FRAME }; + const int num_refs = 1; + const int use_subpixel = 1; + + // Doing whole block first to update the mv + if (!pc_tree->sms_none_valid && features_to_get & FEATURE_SMS_NONE_FLAG) { + simple_motion_search_get_best_ref(cpi, x, pc_tree, mi_row, mi_col, bsize, + ref_list, num_refs, use_subpixel, 1, + &pc_tree->sms_none_feat[0], + &pc_tree->sms_none_feat[1]); + pc_tree->sms_none_valid = 1; + } + + // Split subblocks + if (features_to_get & FEATURE_SMS_SPLIT_FLAG) { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + for (int r_idx = 0; r_idx < 4; r_idx++) { + const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2; + const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2; + PC_TREE *sub_tree = pc_tree->split[r_idx]; + + if (!sub_tree->sms_none_valid) { + simple_motion_search_get_best_ref( + cpi, x, sub_tree, sub_mi_row, sub_mi_col, subsize, ref_list, + num_refs, use_subpixel, 1, &sub_tree->sms_none_feat[0], + &sub_tree->sms_none_feat[1]); + sub_tree->sms_none_valid = 1; + } + } + } + + // Rectangular subblocks + if (!pc_tree->sms_rect_valid && features_to_get & FEATURE_SMS_RECT_FLAG) { + // Horz subblock + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); + for (int r_idx = 0; r_idx < 2; r_idx++) { + const int sub_mi_col = mi_col + 0; + const int sub_mi_row = mi_row + r_idx * h_mi / 2; + + simple_motion_search_get_best_ref( + cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, + use_subpixel, 0, &pc_tree->sms_rect_feat[2 * r_idx], + &pc_tree->sms_rect_feat[2 * r_idx + 1]); + } + + // Vert subblock + subsize = get_partition_subsize(bsize, PARTITION_VERT); + for (int r_idx = 0; r_idx < 2; r_idx++) { + const int sub_mi_col = mi_col + r_idx * w_mi / 2; + const int sub_mi_row = mi_row + 0; + + simple_motion_search_get_best_ref( + cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, + use_subpixel, 0, &pc_tree->sms_rect_feat[4 + 2 * r_idx], + &pc_tree->sms_rect_feat[4 + 2 * r_idx + 1]); + } + pc_tree->sms_rect_valid = 1; + } + + if (!features) return; + + aom_clear_system_state(); + int f_idx = 0; + if (features_to_get & FEATURE_SMS_NONE_FLAG) { + for (int sub_idx = 0; sub_idx < 2; sub_idx++) { + features[f_idx++] = logf(1.0f + pc_tree->sms_none_feat[sub_idx]); + } + } + + if (features_to_get & FEATURE_SMS_SPLIT_FLAG) { + for (int sub_idx = 0; sub_idx < 4; sub_idx++) { + PC_TREE *sub_tree = pc_tree->split[sub_idx]; + features[f_idx++] = logf(1.0f + sub_tree->sms_none_feat[0]); + features[f_idx++] = logf(1.0f + sub_tree->sms_none_feat[1]); + } + } + + if (features_to_get & FEATURE_SMS_RECT_FLAG) { + for (int sub_idx = 0; sub_idx < 8; sub_idx++) { + features[f_idx++] = logf(1.0f + pc_tree->sms_rect_feat[sub_idx]); + } + } + + const MACROBLOCKD *xd = &x->e_mbd; + set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize); + + // Q_INDEX + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f); + + // Neighbor stuff + const int has_above = !!xd->above_mbmi; + const int has_left = !!xd->left_mbmi; + const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->sb_type : bsize; + const BLOCK_SIZE left_bsize = has_left ? 
xd->left_mbmi->sb_type : bsize;
+  features[f_idx++] = (float)has_above;
+  features[f_idx++] = (float)mi_size_wide_log2[above_bsize];
+  features[f_idx++] = (float)mi_size_high_log2[above_bsize];
+  features[f_idx++] = (float)has_left;
+  features[f_idx++] = (float)mi_size_wide_log2[left_bsize];
+  features[f_idx++] = (float)mi_size_high_log2[left_bsize];
+}
+
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+                                         PC_TREE *pc_tree, int mi_row,
+                                         int mi_col, BLOCK_SIZE bsize,
+                                         int *partition_horz_allowed,
+                                         int *partition_vert_allowed,
+                                         int *prune_horz, int *prune_vert) {
+  aom_clear_system_state();
+  const AV1_COMMON *const cm = &cpi->common;
+  const int bsize_idx = convert_bsize_to_idx(bsize);
+  const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
+  // res_idx is 0 for lowres, 1 for 480p, 2 for 720p+
+  const int res_idx = is_480p_or_larger + is_720p_or_larger;
+
+  // Get model parameters
+  const NN_CONFIG *nn_config =
+      av1_simple_motion_search_prune_rect_nn_config[bsize_idx];
+  const float *ml_mean = av1_simple_motion_search_prune_rect_mean[bsize_idx],
+              *ml_std = av1_simple_motion_search_prune_rect_std[bsize_idx];
+
+  const int agg = cpi->sf.part_sf.simple_motion_search_prune_agg;
+  const float prune_thresh =
+      av1_simple_motion_search_prune_rect_thresh[agg][res_idx][bsize_idx];
+
+  // If there is no valid threshold, return immediately.
+  if (!nn_config || prune_thresh == 0.0f) {
+    return;
+  }
+
+  // Get features
+  float features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f };
+  simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+                                           bsize, features,
+                                           FEATURE_SMS_PRUNE_PART_FLAG);
+  for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) {
+    features[f_idx] = (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+  }
+
+  // Get probabilities
+  float scores[EXT_PARTITION_TYPES] = { 0.0f },
+        probs[EXT_PARTITION_TYPES] = { 0.0f };
+  const int num_classes = (bsize == BLOCK_128X128 || bsize == BLOCK_8X8)
+                              ? PARTITION_TYPES
+                              : EXT_PARTITION_TYPES;
+
+  av1_nn_predict(features, nn_config, 1, scores);
+  aom_clear_system_state();
+
+  av1_nn_softmax(scores, probs, num_classes);
+
+  // Determine if we should prune rectangular partitions.
+  if (cpi->sf.part_sf.simple_motion_search_prune_rect &&
+      !frame_is_intra_only(cm) &&
+      (*partition_horz_allowed || *partition_vert_allowed) &&
+      bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) {
+    *prune_horz = probs[PARTITION_HORZ] <= prune_thresh;
+    *prune_vert = probs[PARTITION_VERT] <= prune_thresh;
+  }
+}
+
+// Early terminates PARTITION_NONE using simple_motion_search features and the
+// rate, distortion, and rdcost of PARTITION_NONE. This is only called when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
+                                              MACROBLOCK *x, PC_TREE *pc_tree,
+                                              int mi_row, int mi_col,
+                                              BLOCK_SIZE bsize,
+                                              const RD_STATS *none_rdc,
+                                              int *early_terminate) {
+  // TODO(chiyotsai@google.com): There are other features we can extract from
+  // PARTITION_NONE. Play with this later.
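The body that follows gathers SMS and RD features and then applies what amounts to a single linear model over z-scored features, with the bias stored as the final model coefficient. A minimal standalone sketch of that scoring rule (the function name, sizes, and weights here are hypothetical, not part of the patch):

  static int linear_term_decision(const float *feat, const float *mean,
                                  const float *std, const float *model, int n) {
    float score = 0.0f;
    // z-score each feature and accumulate the weighted sum.
    for (int i = 0; i < n; i++)
      score += model[i] * (feat[i] - mean[i]) / std[i];
    score += model[n];     // the bias is stored after the n weights
    return score >= 0.0f;  // a non-negative score means terminate early
  }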
+ float features[FEATURE_SIZE_SMS_TERM_NONE] = { 0.0f }; + simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col, + bsize, features, + FEATURE_SMS_PRUNE_PART_FLAG); + int f_idx = FEATURE_SIZE_SMS_PRUNE_PART; + + features[f_idx++] = logf(1.0f + (float)none_rdc->rate); + features[f_idx++] = logf(1.0f + (float)none_rdc->dist); + features[f_idx++] = logf(1.0f + (float)none_rdc->rdcost); + + assert(f_idx == FEATURE_SIZE_SMS_TERM_NONE); + + const float *ml_mean = NULL; + const float *ml_std = NULL; + const float *ml_model = NULL; + + if (bsize == BLOCK_128X128) { + ml_mean = av1_simple_motion_search_term_none_mean_128; + ml_std = av1_simple_motion_search_term_none_std_128; + ml_model = av1_simple_motion_search_term_none_model_128; + } else if (bsize == BLOCK_64X64) { + ml_mean = av1_simple_motion_search_term_none_mean_64; + ml_std = av1_simple_motion_search_term_none_std_64; + ml_model = av1_simple_motion_search_term_none_model_64; + } else if (bsize == BLOCK_32X32) { + ml_mean = av1_simple_motion_search_term_none_mean_32; + ml_std = av1_simple_motion_search_term_none_std_32; + ml_model = av1_simple_motion_search_term_none_model_32; + } else if (bsize == BLOCK_16X16) { + ml_mean = av1_simple_motion_search_term_none_mean_16; + ml_std = av1_simple_motion_search_term_none_std_16; + ml_model = av1_simple_motion_search_term_none_model_16; + } else { + assert(0 && "Unexpected block size in simple_motion_term_none"); + } + + if (ml_model) { + float score = 0.0f; + for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) { + score += + ml_model[f_idx] * (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; + } + score += ml_model[FEATURE_SIZE_SMS_TERM_NONE]; + + if (score >= 0.0f) { + *early_terminate = 1; + } + } +} + +void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, + int mi_row, int mi_col, + float *features) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + + assert(sb_size == BLOCK_128X128); + + int f_idx = 0; + + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + aom_clear_system_state(); + const float log_q_sq = logf(1.0f + (float)(dc_q * dc_q) / 256.0f); + + // Perform full-pixel single motion search in Y plane of 16x16 mbs in the sb + float sum_mv_row_sq = 0; + float sum_mv_row = 0; + float min_abs_mv_row = FLT_MAX; + float max_abs_mv_row = 0; + + float sum_mv_col_sq = 0; + float sum_mv_col = 0; + float min_abs_mv_col = FLT_MAX; + float max_abs_mv_col = 0; + + float sum_log_sse_sq = 0; + float sum_log_sse = 0; + float min_log_sse = FLT_MAX; + float max_log_sse = 0; + + const BLOCK_SIZE mb_size = BLOCK_16X16; + const int mb_rows = block_size_high[sb_size] / block_size_high[mb_size]; + const int mb_cols = block_size_wide[sb_size] / block_size_wide[mb_size]; + const int mb_in_mi_size_high_log2 = mi_size_high_log2[mb_size]; + const int mb_in_mi_size_wide_log2 = mi_size_wide_log2[mb_size]; + + for (int mb_row = 0; mb_row < mb_rows; mb_row++) + for (int mb_col = 0; mb_col < mb_cols; mb_col++) { + const int this_mi_row = mi_row + (mb_row << mb_in_mi_size_high_log2); + const int this_mi_col = mi_col + (mb_col << mb_in_mi_size_wide_log2); + unsigned int sse = 0; + unsigned int var = 0; + const FULLPEL_MV start_mv = kZeroFullMv; + int_mv best_mv = av1_simple_motion_sse_var( + cpi, x, this_mi_row, this_mi_col, mb_size, start_mv, 0, &sse, &var); + + aom_clear_system_state(); + const float mv_row = (float)(best_mv.as_mv.row / 8); + const float mv_col = 
(float)(best_mv.as_mv.col / 8); + const float log_sse = logf(1.0f + (float)sse); + const float abs_mv_row = fabsf(mv_row); + const float abs_mv_col = fabsf(mv_col); + + sum_mv_row_sq += mv_row * mv_row; + sum_mv_row += mv_row; + sum_mv_col_sq += mv_col * mv_col; + sum_mv_col += mv_col; + + if (abs_mv_row < min_abs_mv_row) min_abs_mv_row = abs_mv_row; + if (abs_mv_row > max_abs_mv_row) max_abs_mv_row = abs_mv_row; + if (abs_mv_col < min_abs_mv_col) min_abs_mv_col = abs_mv_col; + if (abs_mv_col > max_abs_mv_col) max_abs_mv_col = abs_mv_col; + + sum_log_sse_sq += log_sse * log_sse; + sum_log_sse += log_sse; + if (log_sse < min_log_sse) min_log_sse = log_sse; + if (log_sse > max_log_sse) max_log_sse = log_sse; + } + aom_clear_system_state(); + const float avg_mv_row = sum_mv_row / 64.0f; + const float var_mv_row = sum_mv_row_sq / 64.0f - avg_mv_row * avg_mv_row; + + const float avg_mv_col = sum_mv_col / 64.0f; + const float var_mv_col = sum_mv_col_sq / 64.0f - avg_mv_col * avg_mv_col; + + const float avg_log_sse = sum_log_sse / 64.0f; + const float var_log_sse = sum_log_sse_sq / 64.0f - avg_log_sse * avg_log_sse; + + features[f_idx++] = avg_log_sse; + features[f_idx++] = avg_mv_col; + features[f_idx++] = avg_mv_row; + features[f_idx++] = log_q_sq; + features[f_idx++] = max_abs_mv_col; + features[f_idx++] = max_abs_mv_row; + features[f_idx++] = max_log_sse; + features[f_idx++] = min_abs_mv_col; + features[f_idx++] = min_abs_mv_row; + features[f_idx++] = min_log_sse; + features[f_idx++] = var_log_sse; + features[f_idx++] = var_mv_col; + features[f_idx++] = var_mv_row; + + assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED); +} + +BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x, + const float *features) { + float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }, + probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; + const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config; + + assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion != + NOT_IN_USE); + + aom_clear_system_state(); + av1_nn_predict(features, nn_config, 1, scores); + av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED); + + int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; + if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + DIRECT_PRED) { + result = 0; + float max_prob = probs[0]; + for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) { + if (probs[i] > max_prob) { + max_prob = probs[i]; + result = i; + } + } + } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + RELAXED_PRED) { + for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; + --result) { + if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { + probs[result] += probs[result + 1]; + } + if (probs[result] > 0.2) break; + } + } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion == + ADAPT_PRED) { + const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size; + MACROBLOCKD *const xd = &x->e_mbd; + // TODO(debargha): x->source_variance is unavailable at this point, + // so compute. The redundant recomputation later can be removed. + const unsigned int source_variance = + is_cur_buf_hbd(xd) + ? av1_high_get_sby_perpixel_variance(cpi, &x->plane[0].src, sb_size, + xd->bd) + : av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, sb_size); + if (source_variance > 16) { + const double thresh = source_variance < 128 ? 
0.05 : 0.1; + for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; + --result) { + if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { + probs[result] += probs[result + 1]; + } + if (probs[result] > thresh) break; + } + } + } + + return (BLOCK_SIZE)((result + 2) * 3); +} + +// Get the minimum partition block width and height(in log scale) under a +// PC_TREE. +static AOM_INLINE void get_min_bsize(const PC_TREE *pc_tree, int *min_bw, + int *min_bh) { + if (!pc_tree) return; + + const BLOCK_SIZE bsize = pc_tree->block_size; + if (bsize == BLOCK_4X4) { + *min_bw = 0; + *min_bh = 0; + return; + } + + PARTITION_TYPE part_type = pc_tree->partitioning; + if (part_type == PARTITION_INVALID) return; + + if (part_type == PARTITION_SPLIT) { + for (int i = 0; i < 4; ++i) { + get_min_bsize(pc_tree->split[i], min_bw, min_bh); + } + } else { + if (part_type == PARTITION_HORZ_A || part_type == PARTITION_HORZ_B || + part_type == PARTITION_VERT_A || part_type == PARTITION_VERT_B) + part_type = PARTITION_SPLIT; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, part_type); + if (subsize != BLOCK_INVALID) { + *min_bw = AOMMIN(*min_bw, mi_size_wide_log2[subsize]); + *min_bh = AOMMIN(*min_bh, mi_size_high_log2[subsize]); + } + } +} + +static INLINE void add_rd_feature(int64_t rd, int64_t best_rd, float *features, + int *feature_idx) { + const int rd_valid = rd > 0 && rd < INT64_MAX; + const float rd_ratio = rd_valid ? (float)rd / best_rd : 1.0f; + features[(*feature_idx)++] = (float)rd_valid; + features[(*feature_idx)++] = rd_ratio; +} + +#define FEATURES 31 +void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x, + PC_TREE *const pc_tree, BLOCK_SIZE bsize, + int64_t best_rd, int64_t part_none_rd, + int64_t part_split_rd, + int64_t *split_block_rd, int mi_row, + int mi_col, + int *const terminate_partition_search) { + if (best_rd <= 0 || best_rd == INT64_MAX || *terminate_partition_search) + return; + + const AV1_COMMON *const cm = &cpi->common; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const NN_CONFIG *nn_config = NULL; + float thresh = -1e6; + switch (bsize) { + case BLOCK_128X128: break; + case BLOCK_64X64: + nn_config = &av1_early_term_after_split_nnconfig_64; + thresh = is_480p_or_larger ? -2.0f : -1.2f; + break; + case BLOCK_32X32: + nn_config = &av1_early_term_after_split_nnconfig_32; + thresh = is_480p_or_larger ? -2.6f : -2.3f; + break; + case BLOCK_16X16: + nn_config = &av1_early_term_after_split_nnconfig_16; + thresh = is_480p_or_larger ? -2.0f : -2.4f; + break; + case BLOCK_8X8: + nn_config = &av1_early_term_after_split_nnconfig_8; + thresh = is_480p_or_larger ? -1.0f : -1.4f; + break; + case BLOCK_4X4: break; + default: + assert(0 && "Invalid block size in av1_ml_early_term_after_split()."); + break; + } + if (!nn_config) return; + + // Use more conservative threshold for level 1. 
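To make the gating concrete: the threshold is looked up per block size and resolution, then shifted for the more conservative speed level. For example, taking values from the switch above, BLOCK_32X32 at 480p or larger uses thresh = -2.6f; below speed-feature level 2 the adjustment that follows lowers it to -2.9f, so the NN score must express even less confidence before the partition search is terminated:

  //   BLOCK_32X32, >= 480p : thresh = -2.6  ->  level 1: thresh = -2.9
  //   terminate the search when score < thresh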
+ if (cpi->sf.part_sf.ml_early_term_after_part_split_level < 2) thresh -= 0.3f; + + const MACROBLOCKD *const xd = &x->e_mbd; + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + const int bs = block_size_wide[bsize]; + int f_idx = 0; + float features[FEATURES] = { 0.0f }; + + aom_clear_system_state(); + + features[f_idx++] = logf(1.0f + (float)dc_q / 4.0f); + features[f_idx++] = logf(1.0f + (float)best_rd / bs / bs / 1024.0f); + + add_rd_feature(part_none_rd, best_rd, features, &f_idx); + add_rd_feature(part_split_rd, best_rd, features, &f_idx); + + for (int i = 0; i < 4; ++i) { + add_rd_feature(split_block_rd[i], best_rd, features, &f_idx); + int min_bw = MAX_SB_SIZE_LOG2; + int min_bh = MAX_SB_SIZE_LOG2; + get_min_bsize(pc_tree->split[i], &min_bw, &min_bh); + features[f_idx++] = (float)min_bw; + features[f_idx++] = (float)min_bh; + } + + simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col, + bsize, NULL, + FEATURE_SMS_PRUNE_PART_FLAG); + + features[f_idx++] = logf(1.0f + (float)pc_tree->sms_none_feat[1]); + + features[f_idx++] = logf(1.0f + (float)pc_tree->split[0]->sms_none_feat[1]); + features[f_idx++] = logf(1.0f + (float)pc_tree->split[1]->sms_none_feat[1]); + features[f_idx++] = logf(1.0f + (float)pc_tree->split[2]->sms_none_feat[1]); + features[f_idx++] = logf(1.0f + (float)pc_tree->split[3]->sms_none_feat[1]); + + features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[1]); + features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[3]); + features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[5]); + features[f_idx++] = logf(1.0f + (float)pc_tree->sms_rect_feat[7]); + + assert(f_idx == FEATURES); + + float score = 0.0f; + av1_nn_predict(features, nn_config, 1, &score); + // Score is indicator of confidence that we should NOT terminate. + if (score < thresh) *terminate_partition_search = 1; +} +#undef FEATURES + +void av1_ml_prune_rect_partition(const AV1_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + int64_t best_rd, int64_t none_rd, + int64_t *split_rd, int *const dst_prune_horz, + int *const dst_prune_vert) { + if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; + best_rd = AOMMAX(best_rd, 1); + const NN_CONFIG *nn_config = NULL; + const float prob_thresholds[5] = { 0.01f, 0.01f, 0.004f, 0.002f, 0.002f }; + float cur_thresh = 0.0f; + switch (bsize) { + case BLOCK_8X8: + nn_config = &av1_rect_partition_nnconfig_8; + cur_thresh = prob_thresholds[0]; + break; + case BLOCK_16X16: + nn_config = &av1_rect_partition_nnconfig_16; + cur_thresh = prob_thresholds[1]; + break; + case BLOCK_32X32: + nn_config = &av1_rect_partition_nnconfig_32; + cur_thresh = prob_thresholds[2]; + break; + case BLOCK_64X64: + nn_config = &av1_rect_partition_nnconfig_64; + cur_thresh = prob_thresholds[3]; + break; + case BLOCK_128X128: + nn_config = &av1_rect_partition_nnconfig_128; + cur_thresh = prob_thresholds[4]; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + aom_clear_system_state(); + + // 1. 
Compute input features + float features[9]; + + // RD cost ratios + for (int i = 0; i < 5; i++) features[i] = 1.0f; + if (none_rd > 0 && none_rd < 1000000000) + features[0] = (float)none_rd / (float)best_rd; + for (int i = 0; i < 4; i++) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + features[1 + i] = (float)split_rd[i] / (float)best_rd; + } + + // Variance ratios + const MACROBLOCKD *const xd = &x->e_mbd; + int whole_block_variance; + if (is_cur_buf_hbd(xd)) { + whole_block_variance = av1_high_get_sby_perpixel_variance( + cpi, &x->plane[0].src, bsize, xd->bd); + } else { + whole_block_variance = + av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } + whole_block_variance = AOMMAX(whole_block_variance, 1); + + int split_variance[4]; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + struct buf_2d buf; + buf.stride = x->plane[0].src.stride; + const int bw = block_size_wide[bsize]; + for (int i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bw / 2; + const int y_idx = (i >> 1) * bw / 2; + buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride; + if (is_cur_buf_hbd(xd)) { + split_variance[i] = + av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd); + } else { + split_variance[i] = av1_get_sby_perpixel_variance(cpi, &buf, subsize); + } + } + + for (int i = 0; i < 4; i++) + features[5 + i] = (float)split_variance[i] / (float)whole_block_variance; + + // 2. Do the prediction and prune 0-2 partitions based on their probabilities + float raw_scores[3] = { 0.0f }; + av1_nn_predict(features, nn_config, 1, raw_scores); + aom_clear_system_state(); + float probs[3] = { 0.0f }; + av1_nn_softmax(raw_scores, probs, 3); + + // probs[0] is the probability of the fact that both rectangular partitions + // are worse than current best_rd + if (probs[1] <= cur_thresh) (*dst_prune_horz) = 1; + if (probs[2] <= cur_thresh) (*dst_prune_vert) = 1; +} + +// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be +// considered. +void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx, + int64_t best_rd, int64_t horz_rd[2], + int64_t vert_rd[2], int64_t split_rd[4], + int *const horza_partition_allowed, + int *const horzb_partition_allowed, + int *const verta_partition_allowed, + int *const vertb_partition_allowed) { + if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return; + const NN_CONFIG *nn_config = NULL; + switch (bsize) { + case BLOCK_8X8: nn_config = NULL; break; + case BLOCK_16X16: nn_config = &av1_ab_partition_nnconfig_16; break; + case BLOCK_32X32: nn_config = &av1_ab_partition_nnconfig_32; break; + case BLOCK_64X64: nn_config = &av1_ab_partition_nnconfig_64; break; + case BLOCK_128X128: nn_config = &av1_ab_partition_nnconfig_128; break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + + aom_clear_system_state(); + + // Generate features. 
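A preview of where the features assembled below end up: the AB model emits 16 scores, one per joint label, and each label index is itself a 4-bit allow-mask (bit 0 = HORZ_A, bit 1 = HORZ_B, bit 2 = VERT_A, bit 3 = VERT_B). Every label scoring within a block-size-dependent margin of the best label turns its partitions on. A sketch of that decoding (ab_allow_mask and the margin are hypothetical illustrations, not code from the patch):

  static unsigned ab_allow_mask(const int score[16], int margin) {
    int best = score[0];
    for (int i = 1; i < 16; i++) best = score[i] > best ? score[i] : best;
    unsigned mask = 0;
    for (int i = 0; i < 16; i++) {
      // The label index doubles as the 4-bit partition mask, so OR-ing the
      // surviving indices accumulates every allowed AB partition.
      if (score[i] >= best - margin) mask |= (unsigned)i;
    }
    return mask;
  }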
+ float features[10]; + int feature_index = 0; + features[feature_index++] = (float)part_ctx; + features[feature_index++] = (float)var_ctx; + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < 2; ++i) { + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < 2; ++i) { + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < 4; ++i) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features[feature_index++] = rd_ratio; + } + assert(feature_index == 10); + + // Calculate scores using the NN model. + float score[16] = { 0.0f }; + av1_nn_predict(features, nn_config, 1, score); + aom_clear_system_state(); + int int_score[16]; + int max_score = -1000; + for (int i = 0; i < 16; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = AOMMAX(int_score[i], max_score); + } + + // Make decisions based on the model scores. + int thresh = max_score; + switch (bsize) { + case BLOCK_16X16: thresh -= 150; break; + case BLOCK_32X32: thresh -= 100; break; + default: break; + } + *horza_partition_allowed = 0; + *horzb_partition_allowed = 0; + *verta_partition_allowed = 0; + *vertb_partition_allowed = 0; + for (int i = 0; i < 16; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) *horza_partition_allowed = 1; + if ((i >> 1) & 1) *horzb_partition_allowed = 1; + if ((i >> 2) & 1) *verta_partition_allowed = 1; + if ((i >> 3) & 1) *vertb_partition_allowed = 1; + } + } +} + +#define FEATURES 18 +#define LABELS 4 +// Use a ML model to predict if horz4 and vert4 should be considered. +void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int part_ctx, int64_t best_rd, + int64_t horz_rd[2], int64_t vert_rd[2], + int64_t split_rd[4], + int *const partition_horz4_allowed, + int *const partition_vert4_allowed, + unsigned int pb_source_variance, int mi_row, + int mi_col) { + if (best_rd >= 1000000000) return; + const NN_CONFIG *nn_config = NULL; + switch (bsize) { + case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break; + case BLOCK_32X32: nn_config = &av1_4_partition_nnconfig_32; break; + case BLOCK_64X64: nn_config = &av1_4_partition_nnconfig_64; break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config) return; + + aom_clear_system_state(); + + // Generate features. 
+ float features[FEATURES]; + int feature_index = 0; + features[feature_index++] = (float)part_ctx; + features[feature_index++] = (float)get_unsigned_bits(pb_source_variance); + + const int rdcost = (int)AOMMIN(INT_MAX, best_rd); + int sub_block_rdcost[8] = { 0 }; + int rd_index = 0; + for (int i = 0; i < 2; ++i) { + if (horz_rd[i] > 0 && horz_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)horz_rd[i]; + ++rd_index; + } + for (int i = 0; i < 2; ++i) { + if (vert_rd[i] > 0 && vert_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)vert_rd[i]; + ++rd_index; + } + for (int i = 0; i < 4; ++i) { + if (split_rd[i] > 0 && split_rd[i] < 1000000000) + sub_block_rdcost[rd_index] = (int)split_rd[i]; + ++rd_index; + } + for (int i = 0; i < 8; ++i) { + // Ratio between the sub-block RD and the whole-block RD. + float rd_ratio = 1.0f; + if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost) + rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost; + features[feature_index++] = rd_ratio; + } + + // Get variance of the 1:4 and 4:1 sub-blocks. + unsigned int horz_4_source_var[4] = { 0 }; + unsigned int vert_4_source_var[4] = { 0 }; + { + BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4); + BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4); + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, + av1_num_planes(&cpi->common), bsize); + const int src_stride = x->plane[0].src.stride; + uint8_t *src = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + + struct buf_2d horz_4_src, vert_4_src; + horz_4_src.stride = src_stride; + vert_4_src.stride = src_stride; + + for (int i = 0; i < 4; ++i) { + horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride; + vert_4_src.buf = src + i * block_size_wide[vert_4_bs]; + + if (is_cur_buf_hbd(xd)) { + horz_4_source_var[i] = av1_high_get_sby_perpixel_variance( + cpi, &horz_4_src, horz_4_bs, xd->bd); + vert_4_source_var[i] = av1_high_get_sby_perpixel_variance( + cpi, &vert_4_src, vert_4_bs, xd->bd); + } else { + horz_4_source_var[i] = + av1_get_sby_perpixel_variance(cpi, &horz_4_src, horz_4_bs); + vert_4_source_var[i] = + av1_get_sby_perpixel_variance(cpi, &vert_4_src, vert_4_bs); + } + } + } + + const float denom = (float)(pb_source_variance + 1); + const float low_b = 0.1f; + const float high_b = 10.0f; + for (int i = 0; i < 4; ++i) { + // Ratio between the 4:1 sub-block variance and the whole-block variance. + float var_ratio = (float)(horz_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features[feature_index++] = var_ratio; + } + for (int i = 0; i < 4; ++i) { + // Ratio between the 1:4 sub-block RD and the whole-block RD. + float var_ratio = (float)(vert_4_source_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features[feature_index++] = var_ratio; + } + assert(feature_index == FEATURES); + + // Calculate scores using the NN model. + float score[LABELS] = { 0.0f }; + av1_nn_predict(features, nn_config, 1, score); + aom_clear_system_state(); + int int_score[LABELS]; + int max_score = -1000; + for (int i = 0; i < LABELS; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = AOMMAX(int_score[i], max_score); + } + + // Make decisions based on the model scores. 
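The decision stage that follows mirrors the AB-partition case but with only four labels, where bit 0 of the label enables HORZ_4 and bit 1 enables VERT_4. A worked instance with hypothetical scaled scores and the BLOCK_64X64 margin of 200:

  //   scores = {120, -240, 90, -100}, thresh = 120 - 200 = -80
  //   survivors: label 0 (binary 00), label 2 (binary 10)
  //   -> partition_vert4_allowed = 1 (bit 1 of label 2),
  //      partition_horz4_allowed = 0 (no surviving label has bit 0 set)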
+ int thresh = max_score; + switch (bsize) { + case BLOCK_16X16: thresh -= 500; break; + case BLOCK_32X32: thresh -= 500; break; + case BLOCK_64X64: thresh -= 200; break; + default: break; + } + *partition_horz4_allowed = 0; + *partition_vert4_allowed = 0; + for (int i = 0; i < LABELS; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) *partition_horz4_allowed = 1; + if ((i >> 1) & 1) *partition_vert4_allowed = 1; + } + } +} +#undef FEATURES +#undef LABELS + +#define FEATURES 4 +int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + unsigned int pb_source_variance) { + const NN_CONFIG *nn_config = NULL; + int thresh = 0; + switch (bsize) { + case BLOCK_8X8: + nn_config = &av1_partition_breakout_nnconfig_8; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[0]; + break; + case BLOCK_16X16: + nn_config = &av1_partition_breakout_nnconfig_16; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[1]; + break; + case BLOCK_32X32: + nn_config = &av1_partition_breakout_nnconfig_32; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[2]; + break; + case BLOCK_64X64: + nn_config = &av1_partition_breakout_nnconfig_64; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[3]; + break; + case BLOCK_128X128: + nn_config = &av1_partition_breakout_nnconfig_128; + thresh = cpi->sf.part_sf.ml_partition_search_breakout_thresh[4]; + break; + default: assert(0 && "Unexpected bsize."); + } + if (!nn_config || thresh < 0) return 0; + + // Generate feature values. + float features[FEATURES]; + int feature_index = 0; + aom_clear_system_state(); + + const int num_pels_log2 = num_pels_log2_lookup[bsize]; + float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX); + rate_f = ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * + rate_f; + features[feature_index++] = rate_f; + + const float dist_f = + (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2); + features[feature_index++] = dist_f; + + features[feature_index++] = (float)pb_source_variance; + + const int dc_q = (int)x->plane[0].dequant_QTX[0]; + features[feature_index++] = (float)(dc_q * dc_q) / 256.0f; + assert(feature_index == FEATURES); + + // Calculate score using the NN model. + float score = 0.0f; + av1_nn_predict(features, nn_config, 1, &score); + aom_clear_system_state(); + + // Make decision. + return (int)(score * 100) >= thresh; +} +#undef FEATURES +#endif // !CONFIG_REALTIME_ONLY diff --git a/libs/libaom/src/av1/encoder/partition_strategy.h b/libs/libaom/src/av1/encoder/partition_strategy.h new file mode 100644 index 000000000..f9b4d8bfd --- /dev/null +++ b/libs/libaom/src/av1/encoder/partition_strategy.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+#define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encoder.h"
+
+#define FEATURE_SIZE_SMS_SPLIT_FAST 6
+#define FEATURE_SIZE_SMS_SPLIT 17
+#define FEATURE_SIZE_SMS_PRUNE_PART 25
+#define FEATURE_SIZE_SMS_TERM_NONE 28
+#define FEATURE_SIZE_FP_SMS_TERM_NONE 20
+#define FEATURE_SIZE_MAX_MIN_PART_PRED 13
+#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4
+
+#define FEATURE_SMS_NONE_FLAG 1
+#define FEATURE_SMS_SPLIT_FLAG (1 << 1)
+#define FEATURE_SMS_RECT_FLAG (1 << 2)
+
+#define FEATURE_SMS_PRUNE_PART_FLAG \
+  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG)
+#define FEATURE_SMS_SPLIT_MODEL_FLAG \
+  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG)
+
+void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                  int bsize, int label_idx,
+                                  int *partition_none_allowed,
+                                  int *partition_horz_allowed,
+                                  int *partition_vert_allowed,
+                                  int *do_rectangular_split,
+                                  int *do_square_split);
+
+// Performs a simple_motion_search with a single reference frame and extracts
+// the variance of residues. Then uses the features to determine whether we
+// want to go straight to splitting without trying PARTITION_NONE.
+void av1_simple_motion_search_based_split(
+    AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+    int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
+    int *partition_horz_allowed, int *partition_vert_allowed,
+    int *do_rectangular_split, int *do_square_split);
+
+// Performs a simple_motion_search with two reference frames and extracts
+// the variance of residues. Then uses the features to determine whether we
+// want to prune some partitions.
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+                                         PC_TREE *pc_tree, int mi_row,
+                                         int mi_col, BLOCK_SIZE bsize,
+                                         int *partition_horz_allowed,
+                                         int *partition_vert_allowed,
+                                         int *prune_horz, int *prune_vert);
+
+#if !CONFIG_REALTIME_ONLY
+// Early terminates PARTITION_NONE using simple_motion_search features and the
+// rate, distortion, and rdcost of PARTITION_NONE. This is only called when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
+                                              MACROBLOCK *x, PC_TREE *pc_tree,
+                                              int mi_row, int mi_col,
+                                              BLOCK_SIZE bsize,
+                                              const RD_STATS *none_rdc,
+                                              int *early_terminate);
+
+// Get the features for selecting the max and min partition size. Currently
+// this performs simple_motion_search on 16X16 subblocks of the current
+// superblock, and then extracts the statistics of sse and motion vectors as
+// features.
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+                                        int mi_row, int mi_col,
+                                        float *features);
+
+// Predict the maximum BLOCK_SIZE to be used to encode the current superblock.
+BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+                                     const float *features);
+
+// Attempts an early termination after PARTITION_SPLIT.
+void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
+                                   PC_TREE *const pc_tree, BLOCK_SIZE bsize,
+                                   int64_t best_rd, int64_t part_none_rd,
+                                   int64_t part_split_rd,
+                                   int64_t *split_block_rd, int mi_row,
+                                   int mi_col,
+                                   int *const terminate_partition_search);
+
+// Use the rdcost ratio and source var ratio to prune PARTITION_HORZ and
+// PARTITION_VERT.
+// TODO(chiyotsai@google.com): Currently this model does not use q value and has
+// no information about rectangular partitions. Preliminary experiments suggest
+// that we can get better performance by adding in q_index and rectangular
+// sse/var from SMS. We should retrain and tune this model later.
+void av1_ml_prune_rect_partition(const AV1_COMP *const cpi,
+                                 const MACROBLOCK *const x, BLOCK_SIZE bsize,
+                                 int64_t best_rd, int64_t none_rd,
+                                 int64_t *split_rd, int *const dst_prune_horz,
+                                 int *const dst_prune_vert);
+
+// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+void av1_ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
+                               int64_t best_rd, int64_t horz_rd[2],
+                               int64_t vert_rd[2], int64_t split_rd[4],
+                               int *const horza_partition_allowed,
+                               int *const horzb_partition_allowed,
+                               int *const verta_partition_allowed,
+                               int *const vertb_partition_allowed);
+
+// Use a ML model to predict if horz4 and vert4 should be considered.
+void av1_ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                              BLOCK_SIZE bsize, int part_ctx, int64_t best_rd,
+                              int64_t horz_rd[2], int64_t vert_rd[2],
+                              int64_t split_rd[4],
+                              int *const partition_horz4_allowed,
+                              int *const partition_vert4_allowed,
+                              unsigned int pb_source_variance, int mi_row,
+                              int mi_col);
+
+// ML-based partition search breakout after PARTITION_NONE
+int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                            const MACROBLOCK *const x,
+                            const RD_STATS *const rd_stats,
+                            unsigned int pb_source_variance);
+#endif  // !CONFIG_REALTIME_ONLY
+
+// A simplified version of set_offsets meant to be used for
+// simple_motion_search.
+static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi,
+                                                 MACROBLOCK *const x,
+                                                 int mi_row, int mi_col,
+                                                 BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+
+  set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+                        mi_row, mi_col);
+
+  // Set up destination pointers.
+  av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+                       num_planes);
+
+  // Set up limit values for MV components.
+  // MVs beyond the range do not produce a new/different prediction block.
+  av1_set_mv_limits(mi_params, &x->mv_limits, mi_row, mi_col, mi_height,
+                    mi_width, cpi->oxcf.border_in_pixels);
+
+  set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+  xd->mi_row = mi_row;
+  xd->mi_col = mi_col;
+
+  // Set up distance of MB to edge of frame in 1/8th pel units.
+  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+  xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
+  xd->mb_to_bottom_edge =
+      GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
+  xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE);
+  xd->mb_to_right_edge =
+      GET_MV_SUBPEL((mi_params->mi_cols - mi_width - mi_col) * MI_SIZE);
+
+  // Set up source buffers.
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+}
+
+static INLINE void init_simple_motion_search_mvs(PC_TREE *pc_tree) {
+  av1_zero(pc_tree->start_mvs);
+
+  av1_zero(pc_tree->sms_none_feat);
+  av1_zero(pc_tree->sms_rect_feat);
+  av1_zero(pc_tree->sms_none_valid);
+  av1_zero(pc_tree->sms_rect_valid);
+
+  if (pc_tree->block_size >= BLOCK_8X8) {
+    init_simple_motion_search_mvs(pc_tree->split[0]);
+    init_simple_motion_search_mvs(pc_tree->split[1]);
+    init_simple_motion_search_mvs(pc_tree->split[2]);
+    init_simple_motion_search_mvs(pc_tree->split[3]);
+  }
+}
+
+static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params,
+                             int mi_row, int mi_col, BLOCK_SIZE sb_size) {
+  const int sb_mi_wide = mi_size_wide[sb_size];
+  const int sb_mi_high = mi_size_high[sb_size];
+
+  return (mi_row + sb_mi_high) <= mi_params->mi_rows &&
+         (mi_col + sb_mi_wide) <= mi_params->mi_cols;
+}
+
+// Do not use this criterion for screen content videos, since screen content
+// videos could often find good predictors and the largest block size is
+// likely to be used.
+static INLINE int use_auto_max_partition(AV1_COMP *const cpi,
+                                         BLOCK_SIZE sb_size, int mi_row,
+                                         int mi_col) {
+  assert(IMPLIES(cpi->gf_group.size > 0,
+                 cpi->gf_group.index < cpi->gf_group.size));
+  AV1_COMMON *const cm = &cpi->common;
+  return !frame_is_intra_only(cm) && !cpi->is_screen_content_type &&
+         cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
+             NOT_IN_USE &&
+         sb_size == BLOCK_128X128 &&
+         is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size) &&
+         cpi->gf_group.update_type[cpi->gf_group.index] != OVERLAY_UPDATE &&
+         cpi->gf_group.update_type[cpi->gf_group.index] != INTNL_OVERLAY_UPDATE;
+}
+
+#endif  // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
diff --git a/libs/libaom/src/av1/encoder/pass2_strategy.c b/libs/libaom/src/av1/encoder/pass2_strategy.c
new file mode 100644
index 000000000..6adc1fbf9
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/pass2_strategy.c
@@ -0,0 +1,2895 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/av1_common_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/use_flat_gop_model_params.h"
+#include "av1/encoder/encode_strategy.h"
+
+#define DEFAULT_KF_BOOST 2300
+#define DEFAULT_GF_BOOST 2000
+#define GROUP_ADAPTIVE_MAXQ 1
+static void init_gf_stats(GF_GROUP_STATS *gf_stats);
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
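As a worked instance of the formula implemented just below (the input numbers are hypothetical): a letterboxed frame where 20% of MBs are intra-skip and 8 of 68 MB rows are inactive gives

  //   active_pct = 1.0 - (0.20 / 2 + (8 * 2) / 68.0)
  //              = 1.0 - (0.10 + 0.235) ~= 0.665

which already lies inside the [MIN_ACTIVE_AREA, MAX_ACTIVE_AREA] clamp.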
+#define MIN_ACTIVE_AREA 0.5 +#define MAX_ACTIVE_AREA 1.0 +static double calculate_active_area(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *this_frame) { + const double active_pct = + 1.0 - + ((this_frame->intra_skip_pct / 2) + + ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows)); + return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); +} + +// Calculate a modified Error used in distributing bits between easier and +// harder frames. +#define ACT_AREA_CORRECTION 0.5 +static double calculate_modified_err(const FRAME_INFO *frame_info, + const TWO_PASS *twopass, + const AV1EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame) { + const FIRSTPASS_STATS *const stats = twopass->stats_buf_ctx->total_stats; + if (stats == NULL) { + return 0; + } + const double av_weight = stats->weight / stats->count; + const double av_err = (stats->coded_error * av_weight) / stats->count; + double modified_error = + av_err * pow(this_frame->coded_error * this_frame->weight / + DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); + + // Correction for active area. Frames with a reduced active area + // (eg due to formatting bars) have a higher error per mb for the + // remaining active MBs. The correction here assumes that coding + // 0.5N blocks of complexity 2X is a little easier than coding N + // blocks of complexity X. + modified_error *= + pow(calculate_active_area(frame_info, this_frame), ACT_AREA_CORRECTION); + + return fclamp(modified_error, twopass->modified_error_min, + twopass->modified_error_max); +} + +// Resets the first pass file to the given position using a relative seek from +// the current position. +static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) { + p->stats_in = position; +} + +static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) { + if (p->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF; + + *fps = *p->stats_in; + ++p->stats_in; + return 1; +} + +static int input_stats_lap(TWO_PASS *p, FIRSTPASS_STATS *fps) { + if (p->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF; + + *fps = *p->stats_in; + /* Move old stats[0] out to accommodate for next frame stats */ + memmove(p->frame_stats_arr[0], p->frame_stats_arr[1], + (p->stats_buf_ctx->stats_in_end - p->stats_in - 1) * + sizeof(FIRSTPASS_STATS)); + p->stats_buf_ctx->stats_in_end--; + return 1; +} + +// Read frame stats at an offset from the current position. 
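The helper below returns NULL whenever the offset runs off either end of the stats buffer, so callers can probe lookahead frames without any bounds bookkeeping; detect_flash() later in this file is a typical caller. A usage sketch (the values and surrounding context are hypothetical):

  // Peek one frame ahead of the current first-pass position.
  const FIRSTPASS_STATS *next = read_frame_stats(twopass, 1);
  if (next != NULL && next->pcnt_second_ref > next->pcnt_inter) {
    // Likely recovering from a flash frame.
  }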
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) { + if ((offset >= 0 && p->stats_in + offset >= p->stats_buf_ctx->stats_in_end) || + (offset < 0 && p->stats_in + offset < p->stats_buf_ctx->stats_in_start)) { + return NULL; + } + + return &p->stats_in[offset]; +} + +static void subtract_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame) { + section->frame -= frame->frame; + section->weight -= frame->weight; + section->intra_error -= frame->intra_error; + section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy; + section->coded_error -= frame->coded_error; + section->sr_coded_error -= frame->sr_coded_error; + section->pcnt_inter -= frame->pcnt_inter; + section->pcnt_motion -= frame->pcnt_motion; + section->pcnt_second_ref -= frame->pcnt_second_ref; + section->pcnt_neutral -= frame->pcnt_neutral; + section->intra_skip_pct -= frame->intra_skip_pct; + section->inactive_zone_rows -= frame->inactive_zone_rows; + section->inactive_zone_cols -= frame->inactive_zone_cols; + section->MVr -= frame->MVr; + section->mvr_abs -= frame->mvr_abs; + section->MVc -= frame->MVc; + section->mvc_abs -= frame->mvc_abs; + section->MVrv -= frame->MVrv; + section->MVcv -= frame->MVcv; + section->mv_in_out_count -= frame->mv_in_out_count; + section->new_mv_count -= frame->new_mv_count; + section->count -= frame->count; + section->duration -= frame->duration; +} + +// This function returns the maximum target rate per frame. +static int frame_max_bits(const RATE_CONTROL *rc, + const AV1EncoderConfig *oxcf) { + int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth * + (int64_t)oxcf->two_pass_vbrmax_section) / + 100; + if (max_bits < 0) + max_bits = 0; + else if (max_bits > rc->max_frame_bandwidth) + max_bits = rc->max_frame_bandwidth; + + return (int)max_bits; +} + +static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75, + 0.80, 0.85, 0.90, + 0.95, 0.95, 0.95 }; +#define ERR_DIVISOR 96.0 +static double calc_correction_factor(double err_per_mb, int q) { + const double error_term = err_per_mb / ERR_DIVISOR; + const int index = q >> 5; + // Adjustment to power term based on qindex + const double power_term = + q_pow_term[index] + + (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0); + assert(error_term >= 0.0); + return fclamp(pow(error_term, power_term), 0.05, 5.0); +} + +static void twopass_update_bpm_factor(TWO_PASS *twopass) { + // Based on recent history adjust expectations of bits per macroblock. + double last_group_rate_err = + (double)twopass->rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); + last_group_rate_err = AOMMAX(0.25, AOMMIN(4.0, last_group_rate_err)); + twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; + twopass->bpm_factor = AOMMAX(0.25, AOMMIN(4.0, twopass->bpm_factor)); +} + +static int qbpm_enumerator(int rate_err_tol) { + return 1350000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75); +} + +// Similar to find_qindex_by_rate() function in ratectrl.c, but includes +// calculation of a correction_factor. 
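The search below relies on bits-per-mb being monotonically non-increasing in qindex, so it is a standard lower-bound bisection. Stripped of the correction factor, the shape is as follows (bits_per_mb_at() is a hypothetical stand-in for the rate model):

  // Find the lowest q in [best_q, worst_q] whose bits/mb meets the target.
  int low = best_q, high = worst_q;
  while (low < high) {
    const int mid = (low + high) >> 1;
    if (bits_per_mb_at(mid) > desired_bits_per_mb)
      low = mid + 1;  // still spending too many bits; move toward higher q
    else
      high = mid;
  }
  // On exit, low == high is the first qindex satisfying the rate target.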
+static int find_qindex_by_rate_with_correction( + int desired_bits_per_mb, aom_bit_depth_t bit_depth, double error_per_mb, + double group_weight_factor, int rate_err_tol, int best_qindex, + int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + + while (low < high) { + const int mid = (low + high) >> 1; + const double mid_factor = calc_correction_factor(error_per_mb, mid); + const double q = av1_convert_qindex_to_q(mid, bit_depth); + const int enumerator = qbpm_enumerator(rate_err_tol); + const int mid_bits_per_mb = + (int)((enumerator * mid_factor * group_weight_factor) / q); + + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } + } + return low; +} + +static int get_twopass_worst_quality(AV1_COMP *cpi, const double section_err, + double inactive_zone, + int section_target_bandwidth, + double group_weight_factor) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + inactive_zone = fclamp(inactive_zone, 0.0, 1.0); + + if (section_target_bandwidth <= 0) { + return rc->worst_quality; // Highest value allowed + } else { + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cpi->common.mi_params.MBs; + const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); + const double av_err_per_mb = section_err / active_mbs; + const int target_norm_bits_per_mb = + (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) / + active_mbs; + int rate_err_tol = + AOMMIN(cpi->oxcf.under_shoot_pct, cpi->oxcf.over_shoot_pct); + + twopass_update_bpm_factor(&cpi->twopass); + // Try and pick a max Q that will be high enough to encode the + // content at the given rate. + int q = find_qindex_by_rate_with_correction( + target_norm_bits_per_mb, cpi->common.seq_params.bit_depth, + av_err_per_mb, group_weight_factor, rate_err_tol, rc->best_quality, + rc->worst_quality); + + // Restriction on active max q for constrained quality mode. + if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level); + return q; + } +} + +#define SR_DIFF_PART 0.0015 +#define MOTION_AMP_PART 0.003 +#define INTRA_PART 0.005 +#define DEFAULT_DECAY_LIMIT 0.75 +#define LOW_SR_DIFF_TRHESH 0.1 +#define SR_DIFF_MAX 128.0 +#define NCOUNT_FRAME_II_THRESH 5.0 + +static double get_sr_decay_rate(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *frame) { + const int num_mbs = frame_info->num_mbs; + double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs; + double sr_decay = 1.0; + double modified_pct_inter; + double modified_pcnt_intra; + const double motion_amplitude_factor = + frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2); + + modified_pct_inter = frame->pcnt_inter; + if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < + (double)NCOUNT_FRAME_II_THRESH) { + modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral; + } + modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); + + if ((sr_diff > LOW_SR_DIFF_TRHESH)) { + sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX); + sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - + (MOTION_AMP_PART * motion_amplitude_factor) - + (INTRA_PART * modified_pcnt_intra); + } + return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter)); +} + +// This function gives an estimate of how badly we believe the prediction +// quality is decaying from frame to frame. 
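To make the combination in get_prediction_decay_rate below concrete (all inputs hypothetical): with pcnt_inter = 0.9, pcnt_motion = 0.4, and an sr_decay_rate of 0.8,

  //   zero_motion_factor = 0.95 * (0.9 - 0.4)^0.75       ~= 0.565
  //   decay = max(0.565, 0.8 + (1.0 - 0.8) * 0.565)      ~= 0.913

so a mostly static frame pair is estimated to retain roughly 91% of its prediction quality from one frame to the next.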
+static double get_zero_motion_factor(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *frame) { + const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion; + double sr_decay = get_sr_decay_rate(frame_info, frame); + return AOMMIN(sr_decay, zero_motion_pct); +} + +#define ZM_POWER_FACTOR 0.75 + +static double get_prediction_decay_rate(const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *next_frame) { + const double sr_decay_rate = get_sr_decay_rate(frame_info, next_frame); + const double zero_motion_factor = + (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion), + ZM_POWER_FACTOR)); + + return AOMMAX(zero_motion_factor, + (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); +} + +// Function to test for a condition where a complex transition is followed +// by a static section. For example in slide shows where there is a fade +// between slides. This is to help with more optimal kf and gf positioning. +static int detect_transition_to_still(TWO_PASS *const twopass, + const int min_gf_interval, + const int frame_interval, + const int still_interval, + const double loop_decay_rate, + const double last_decay_rate) { + // Break clause to detect very still sections after motion + // For example a static image after a fade or other transition + // instead of a clean scene cut. + if (frame_interval > min_gf_interval && loop_decay_rate >= 0.999 && + last_decay_rate < 0.9) { + int j; + // Look ahead a few frames to see if static condition persists... + for (j = 0; j < still_interval; ++j) { + const FIRSTPASS_STATS *stats = &twopass->stats_in[j]; + if (stats >= twopass->stats_buf_ctx->stats_in_end) break; + + if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break; + } + // Only if it does do we signal a transition to still. + return j == still_interval; + } + return 0; +} + +// This function detects a flash through the high relative pcnt_second_ref +// score in the frame following a flash frame. The offset passed in should +// reflect this. +static int detect_flash(const TWO_PASS *twopass, const int offset) { + const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset); + + // What we are looking for here is a situation where there is a + // brief break in prediction (such as a flash) but subsequent frames + // are reasonably well predicted by an earlier (pre flash) frame. + // The recovery after a flash is indicated by a high pcnt_second_ref + // compared to pcnt_inter. + return next_frame != NULL && + next_frame->pcnt_second_ref > next_frame->pcnt_inter && + next_frame->pcnt_second_ref >= 0.5; +} + +// Update the motion related elements to the GF arf boost calculation. +static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, + GF_GROUP_STATS *gf_stats) { + const double pct = stats->pcnt_motion; + + // Accumulate Motion In/Out of frame stats. + gf_stats->this_frame_mv_in_out = stats->mv_in_out_count * pct; + gf_stats->mv_in_out_accumulator += gf_stats->this_frame_mv_in_out; + gf_stats->abs_mv_in_out_accumulator += fabs(gf_stats->this_frame_mv_in_out); + + // Accumulate a measure of how uniform (or conversely how random) the motion + // field is (a ratio of abs(mv) / mv). + if (pct > 0.05) { + const double mvr_ratio = + fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr)); + const double mvc_ratio = + fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc)); + + gf_stats->mv_ratio_accumulator += + pct * (mvr_ratio < stats->mvr_abs ? 
mvr_ratio : stats->mvr_abs); + gf_stats->mv_ratio_accumulator += + pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs); + } +} + +static void accumulate_this_frame_stats(const FIRSTPASS_STATS *stats, + const double mod_frame_err, + GF_GROUP_STATS *gf_stats) { + gf_stats->gf_group_err += mod_frame_err; +#if GROUP_ADAPTIVE_MAXQ + gf_stats->gf_group_raw_error += stats->coded_error; +#endif + gf_stats->gf_group_skip_pct += stats->intra_skip_pct; + gf_stats->gf_group_inactive_zone_rows += stats->inactive_zone_rows; +} + +static void accumulate_next_frame_stats( + const FIRSTPASS_STATS *stats, const FRAME_INFO *frame_info, + TWO_PASS *const twopass, const int flash_detected, + const int frames_since_key, const int cur_idx, const int can_disable_arf, + const int min_gf_interval, GF_GROUP_STATS *gf_stats) { + accumulate_frame_motion_stats(stats, gf_stats); + // sum up the metric values of current gf group + gf_stats->avg_sr_coded_error += stats->sr_coded_error; + gf_stats->avg_tr_coded_error += stats->tr_coded_error; + gf_stats->avg_pcnt_second_ref += stats->pcnt_second_ref; + gf_stats->avg_pcnt_third_ref += stats->pcnt_third_ref; + gf_stats->avg_new_mv_count += stats->new_mv_count; + gf_stats->avg_wavelet_energy += stats->frame_avg_wavelet_energy; + if (fabs(stats->raw_error_stdev) > 0.000001) { + gf_stats->non_zero_stdev_count++; + gf_stats->avg_raw_err_stdev += stats->raw_error_stdev; + } + + // Accumulate the effect of prediction quality decay + if (!flash_detected) { + gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate; + gf_stats->loop_decay_rate = get_prediction_decay_rate(frame_info, stats); + + gf_stats->decay_accumulator = + gf_stats->decay_accumulator * gf_stats->loop_decay_rate; + + // Monitor for static sections. + if ((frames_since_key + cur_idx - 1) > 1) { + gf_stats->zero_motion_accumulator = + AOMMIN(gf_stats->zero_motion_accumulator, + get_zero_motion_factor(frame_info, stats)); + } + + // Break clause to detect very still sections after motion. For example, + // a static image after a fade or other transition. 
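+    // detect_transition_to_still() only fires after min_gf_interval, once the
+    // loop decay rate has jumped to >= 0.999 from below 0.9, and then checks
+    // that the next still_interval (here 5) frames remain effectively static.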
+ if (can_disable_arf && + detect_transition_to_still(twopass, min_gf_interval, cur_idx, 5, + gf_stats->loop_decay_rate, + gf_stats->last_loop_decay_rate)) { + gf_stats->allow_alt_ref = 0; + } + } +} + +static void average_gf_stats(const int total_frame, + const FIRSTPASS_STATS *last_stat, + GF_GROUP_STATS *gf_stats) { + if (total_frame) { + gf_stats->avg_sr_coded_error /= total_frame; + gf_stats->avg_tr_coded_error /= total_frame; + gf_stats->avg_pcnt_second_ref /= total_frame; + if (total_frame - 1) { + gf_stats->avg_pcnt_third_ref_nolast = + (gf_stats->avg_pcnt_third_ref - last_stat->pcnt_third_ref) / + (total_frame - 1); + } else { + gf_stats->avg_pcnt_third_ref_nolast = + gf_stats->avg_pcnt_third_ref / total_frame; + } + gf_stats->avg_pcnt_third_ref /= total_frame; + gf_stats->avg_new_mv_count /= total_frame; + gf_stats->avg_wavelet_energy /= total_frame; + } + + if (gf_stats->non_zero_stdev_count) + gf_stats->avg_raw_err_stdev /= gf_stats->non_zero_stdev_count; +} + +static void get_features_from_gf_stats(const GF_GROUP_STATS *gf_stats, + const GF_FRAME_STATS *first_frame, + const GF_FRAME_STATS *last_frame, + const int num_mbs, + const int constrained_gf_group, + const int kf_zeromotion_pct, + const int num_frames, float *features) { + *features++ = (float)gf_stats->abs_mv_in_out_accumulator; + *features++ = (float)(gf_stats->avg_new_mv_count / num_mbs); + *features++ = (float)gf_stats->avg_pcnt_second_ref; + *features++ = (float)gf_stats->avg_pcnt_third_ref; + *features++ = (float)gf_stats->avg_pcnt_third_ref_nolast; + *features++ = (float)(gf_stats->avg_sr_coded_error / num_mbs); + *features++ = (float)(gf_stats->avg_tr_coded_error / num_mbs); + *features++ = (float)(gf_stats->avg_wavelet_energy / num_mbs); + *features++ = (float)(constrained_gf_group); + *features++ = (float)gf_stats->decay_accumulator; + *features++ = (float)(first_frame->frame_coded_error / num_mbs); + *features++ = (float)(first_frame->frame_sr_coded_error / num_mbs); + *features++ = (float)(first_frame->frame_tr_coded_error / num_mbs); + *features++ = (float)(first_frame->frame_err / num_mbs); + *features++ = (float)(kf_zeromotion_pct); + *features++ = (float)(last_frame->frame_coded_error / num_mbs); + *features++ = (float)(last_frame->frame_sr_coded_error / num_mbs); + *features++ = (float)(last_frame->frame_tr_coded_error / num_mbs); + *features++ = (float)num_frames; + *features++ = (float)gf_stats->mv_ratio_accumulator; + *features++ = (float)gf_stats->non_zero_stdev_count; +} + +#define BOOST_FACTOR 12.5 +static double baseline_err_per_mb(const FRAME_INFO *frame_info) { + unsigned int screen_area = frame_info->frame_height * frame_info->frame_width; + + // Use a different error per mb factor for calculating boost for + // different formats. + if (screen_area <= 640 * 360) { + return 500.0; + } else { + return 1000.0; + } +} + +static double calc_frame_boost(const RATE_CONTROL *rc, + const FRAME_INFO *frame_info, + const FIRSTPASS_STATS *this_frame, + double this_frame_mv_in_out, double max_boost) { + double frame_boost; + const double lq = av1_convert_qindex_to_q(rc->avg_frame_qindex[INTER_FRAME], + frame_info->bit_depth); + const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5); + const double active_area = calculate_active_area(frame_info, this_frame); + int num_mbs = frame_info->num_mbs; + + // Correct for any inactive region in the image + num_mbs = (int)AOMMAX(1, num_mbs * active_area); + + // Underlying boost factor is based on inter error ratio. 
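+  // For example (hypothetical numbers): with a fully active frame whose
+  // intra_error / coded_error ratio is 4.0, the base boost below is roughly
+  // 4.0 * BOOST_FACTOR = 50.0, before the q correction and the max_boost cap.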
+  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * num_mbs,
+                       this_frame->intra_error * active_area) /
+                DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+  frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
+
+  // Increase boost for frames where new data is coming into the frame
+  // (e.g. zoom out). Slightly reduce boost if there is a net balance of
+  // motion out of the frame (zoom in). The range for this_frame_mv_in_out is
+  // -1.0 to +1.0.
+  if (this_frame_mv_in_out > 0.0)
+    frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+  // In the extreme case the boost is halved.
+  else
+    frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+  return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+static double calc_kf_frame_boost(const RATE_CONTROL *rc,
+                                  const FRAME_INFO *frame_info,
+                                  const FIRSTPASS_STATS *this_frame,
+                                  double *sr_accumulator, double max_boost) {
+  double frame_boost;
+  const double lq = av1_convert_qindex_to_q(rc->avg_frame_qindex[INTER_FRAME],
+                                            frame_info->bit_depth);
+  const double boost_q_correction = AOMMIN((0.50 + (lq * 0.015)), 2.00);
+  const double active_area = calculate_active_area(frame_info, this_frame);
+  int num_mbs = frame_info->num_mbs;
+
+  // Correct for any inactive region in the image
+  num_mbs = (int)AOMMAX(1, num_mbs * active_area);
+
+  // Underlying boost factor is based on inter error ratio.
+  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * num_mbs,
+                       this_frame->intra_error * active_area) /
+                DOUBLE_DIVIDE_CHECK(
+                    (this_frame->coded_error + *sr_accumulator) * active_area);
+
+  // Update the accumulator for second ref error difference.
+  // This is intended to give an indication of how much the coded error is
+  // increasing over time.
+  *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
+  *sr_accumulator = AOMMAX(0.0, *sr_accumulator);
+
+  // Q correction and scaling
+  // The 40.0 value here is an experimentally derived baseline minimum.
+  // This value is in line with the minimum per frame boost in the alt_ref
+  // boost calculation.
+  frame_boost = ((frame_boost + 40.0) * boost_q_correction);
+
+  return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
+
+static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost,
+                                   int frames_to_project,
+                                   int num_stats_used_for_gfu_boost) {
+  /*
+   * If frames_to_project is equal to num_stats_used_for_gfu_boost,
+   * it means that gfu_boost was calculated over frames_to_project to
+   * begin with (i.e., all stats required were available), hence return
+   * the original boost.
+   */
+  if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost;
+
+  double min_boost_factor = sqrt(rc->baseline_gf_interval);
+  // Get the current tpl factor (number of frames = frames_to_project).
+  double tpl_factor = av1_get_gfu_boost_projection_factor(
+      min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project);
+  // Get the tpl factor when number of frames = num_stats_used_for_gfu_boost.
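+  // The projected boost below is the original boost rescaled by the ratio of
+  // the two factors:
+  //   projected = gfu_boost * f(frames_to_project) / f(num_stats_used),
+  // where f() is av1_get_gfu_boost_projection_factor().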
+  double tpl_factor_num_stats = av1_get_gfu_boost_projection_factor(
+      min_boost_factor, MAX_GFUBOOST_FACTOR, num_stats_used_for_gfu_boost);
+  int projected_gfu_boost =
+      (int)rint((tpl_factor * gfu_boost) / tpl_factor_num_stats);
+  return projected_gfu_boost;
+}
+
+#define GF_MAX_BOOST 90.0
+#define MIN_DECAY_FACTOR 0.01
+int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
+                       FRAME_INFO *frame_info, int offset, int f_frames,
+                       int b_frames, int *num_fpstats_used,
+                       int *num_fpstats_required) {
+  int i;
+  GF_GROUP_STATS gf_stats;
+  init_gf_stats(&gf_stats);
+  double boost_score = (double)NORMAL_BOOST;
+  int arf_boost;
+  int flash_detected = 0;
+  if (num_fpstats_used) *num_fpstats_used = 0;
+
+  // Search forward from the proposed arf/next gf position.
+  for (i = 0; i < f_frames; ++i) {
+    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    if (this_frame == NULL) break;
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(this_frame, &gf_stats);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
+
+    // Accumulate the effect of prediction quality decay.
+    if (!flash_detected) {
+      gf_stats.decay_accumulator *=
+          get_prediction_decay_rate(frame_info, this_frame);
+      gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+                                       ? MIN_DECAY_FACTOR
+                                       : gf_stats.decay_accumulator;
+    }
+
+    boost_score +=
+        gf_stats.decay_accumulator *
+        calc_frame_boost(rc, frame_info, this_frame,
+                         gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+    if (num_fpstats_used) (*num_fpstats_used)++;
+  }
+
+  arf_boost = (int)boost_score;
+
+  // Reset for backward looking loop.
+  boost_score = 0.0;
+  init_gf_stats(&gf_stats);
+  // Search backward towards last gf position.
+  for (i = -1; i >= -b_frames; --i) {
+    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    if (this_frame == NULL) break;
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(this_frame, &gf_stats);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    flash_detected = detect_flash(twopass, i + offset) ||
+                     detect_flash(twopass, i + offset + 1);
+
+    // Cumulative effect of prediction quality decay.
+    if (!flash_detected) {
+      gf_stats.decay_accumulator *=
+          get_prediction_decay_rate(frame_info, this_frame);
+      gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
+                                       ? MIN_DECAY_FACTOR
+                                       : gf_stats.decay_accumulator;
+    }
+
+    boost_score +=
+        gf_stats.decay_accumulator *
+        calc_frame_boost(rc, frame_info, this_frame,
+                         gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
+    if (num_fpstats_used) (*num_fpstats_used)++;
+  }
+  arf_boost += (int)boost_score;
+
+  if (num_fpstats_required) {
+    *num_fpstats_required = f_frames + b_frames;
+    if (num_fpstats_used) {
+      arf_boost = get_projected_gfu_boost(rc, arf_boost, *num_fpstats_required,
+                                          *num_fpstats_used);
+    }
+  }
+
+  if (arf_boost < ((b_frames + f_frames) * 50))
+    arf_boost = ((b_frames + f_frames) * 50);
+
+  return arf_boost;
+}
+
+// Calculate a section intra ratio used in setting max loop filter.
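+// The ratio returned is sum(intra_error) / sum(coded_error) over up to
+// section_length frames, truncated to an integer; larger values indicate
+// that inter prediction is working well relative to intra for the section.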
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, + const FIRSTPASS_STATS *end, + int section_length) { + const FIRSTPASS_STATS *s = begin; + double intra_error = 0.0; + double coded_error = 0.0; + int i = 0; + + while (s < end && i < section_length) { + intra_error += s->intra_error; + coded_error += s->coded_error; + ++s; + ++i; + } + + return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error)); +} + +// Calculate the total bits to allocate in this GF/ARF group. +static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi, + double gf_group_err) { + const RATE_CONTROL *const rc = &cpi->rc; + const TWO_PASS *const twopass = &cpi->twopass; + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + int64_t total_group_bits; + + // Calculate the bits to be allocated to the group as a whole. + if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { + total_group_bits = (int64_t)(twopass->kf_group_bits * + (gf_group_err / twopass->kf_group_error_left)); + } else { + total_group_bits = 0; + } + + // Clamp odd edge cases. + total_group_bits = (total_group_bits < 0) + ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; + + // Clip based on user supplied data rate variability limit. + if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) + total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + + return total_group_bits; +} + +// Calculate the number of bits to assign to boosted frames in a group. +static int calculate_boost_bits(int frame_count, int boost, + int64_t total_group_bits) { + int allocation_chunks; + + // return 0 for invalid inputs (could arise e.g. through rounding errors) + if (!boost || (total_group_bits <= 0)) return 0; + + if (frame_count <= 0) return (int)(AOMMIN(total_group_bits, INT_MAX)); + + allocation_chunks = (frame_count * 100) + boost; + + // Prevent overflow. + if (boost > 1023) { + int divisor = boost >> 10; + boost /= divisor; + allocation_chunks /= divisor; + } + + // Calculate the number of extra bits for use in the boosted frame or frames. + return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), + 0); +} + +// Calculate the boost factor based on the number of bits assigned, i.e. the +// inverse of calculate_boost_bits(). +static int calculate_boost_factor(int frame_count, int bits, + int64_t total_group_bits) { + aom_clear_system_state(); + return (int)(100.0 * frame_count * bits / (total_group_bits - bits)); +} + +// Reduce the number of bits assigned to keyframe or arf if necessary, to +// prevent bitrate spikes that may break level constraints. +// frame_type: 0: keyframe; 1: arf. 
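+// For each operating point with a target level, a per-frame bit budget is
+// derived from the level's maximum bitrate; keyframes are capped at 8x and
+// ARFs at 4x that budget, and when a cap binds the corresponding boost is
+// recomputed from the capped allocation via calculate_boost_factor().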
+static int adjust_boost_bits_for_target_level(const AV1_COMP *const cpi, + RATE_CONTROL *const rc, + int bits_assigned, + int64_t group_bits, + int frame_type) { + const AV1_COMMON *const cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + const int temporal_layer_id = cm->temporal_layer_id; + const int spatial_layer_id = cm->spatial_layer_id; + for (int index = 0; index < seq_params->operating_points_cnt_minus_1 + 1; + ++index) { + if (!is_in_operating_point(seq_params->operating_point_idc[index], + temporal_layer_id, spatial_layer_id)) { + continue; + } + + const AV1_LEVEL target_level = + cpi->level_params.target_seq_level_idx[index]; + if (target_level >= SEQ_LEVELS) continue; + + assert(is_valid_seq_level_idx(target_level)); + + const double level_bitrate_limit = av1_get_max_bitrate_for_level( + target_level, seq_params->tier[0], seq_params->profile); + const int target_bits_per_frame = + (int)(level_bitrate_limit / cpi->framerate); + if (frame_type == 0) { + // Maximum bits for keyframe is 8 times the target_bits_per_frame. + const int level_enforced_max_kf_bits = target_bits_per_frame * 8; + if (bits_assigned > level_enforced_max_kf_bits) { + const int frames = rc->frames_to_key - 1; + rc->kf_boost = calculate_boost_factor( + frames, level_enforced_max_kf_bits, group_bits); + bits_assigned = calculate_boost_bits(frames, rc->kf_boost, group_bits); + } + } else if (frame_type == 1) { + // Maximum bits for arf is 4 times the target_bits_per_frame. + const int level_enforced_max_arf_bits = target_bits_per_frame * 4; + if (bits_assigned > level_enforced_max_arf_bits) { + rc->gfu_boost = calculate_boost_factor( + rc->baseline_gf_interval, level_enforced_max_arf_bits, group_bits); + bits_assigned = calculate_boost_bits(rc->baseline_gf_interval, + rc->gfu_boost, group_bits); + } + } else { + assert(0); + } + } + + return bits_assigned; +} + +// Compile time switch on alternate algorithm to allocate bits in ARF groups +// #define ALT_ARF_ALLOCATION +#ifdef ALT_ARF_ALLOCATION +double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0, 0.70, 0.55, 0.60, + 0.60, 1.0, 1.0 }; +static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc, + int64_t gf_group_bits, int gf_arf_bits, + int key_frame, int use_arf) { + int64_t total_group_bits = gf_group_bits; + int base_frame_bits; + const int gf_group_size = gf_group->size; + int layer_frames[MAX_ARF_LAYERS + 1] = { 0 }; + + // Subtract the extra bits set aside for ARF frames from the Group Total + if (use_arf || !key_frame) total_group_bits -= gf_arf_bits; + + if (rc->baseline_gf_interval) + base_frame_bits = (int)(total_group_bits / rc->baseline_gf_interval); + else + base_frame_bits = (int)1; + + // For key frames the frame target rate is already set and it + // is also the golden frame. + // === [frame_index == 0] === + int frame_index = 0; + if (!key_frame) { + if (rc->source_alt_ref_active) + gf_group->bit_allocation[frame_index] = 0; + else + gf_group->bit_allocation[frame_index] = + base_frame_bits + (int)(gf_arf_bits * layer_fraction[1]); + } + frame_index++; + + // Check the number of frames in each layer in case we have a + // non standard group length. 
+  int max_arf_layer = gf_group->max_layer_depth - 1;
+  for (int idx = frame_index; idx < gf_group_size; ++idx) {
+    if ((gf_group->update_type[idx] == ARF_UPDATE) ||
+        (gf_group->update_type[idx] == INTNL_ARF_UPDATE)) {
+      // max_arf_layer = AOMMAX(max_arf_layer, gf_group->layer_depth[idx]);
+      layer_frames[gf_group->layer_depth[idx]]++;
+    }
+  }
+
+  // Allocate extra bits to each ARF layer
+  int i;
+  int layer_extra_bits[MAX_ARF_LAYERS + 1] = { 0 };
+  for (i = 1; i <= max_arf_layer; ++i) {
+    double fraction = (i == max_arf_layer) ? 1.0 : layer_fraction[i];
+    layer_extra_bits[i] =
+        (int)((gf_arf_bits * fraction) / AOMMAX(1, layer_frames[i]));
+    gf_arf_bits -= (int)(gf_arf_bits * fraction);
+  }
+
+  // Now combine ARF layer and baseline bits to give total bits for each frame.
+  int arf_extra_bits;
+  for (int idx = frame_index; idx < gf_group_size; ++idx) {
+    switch (gf_group->update_type[idx]) {
+      case ARF_UPDATE:
+      case INTNL_ARF_UPDATE:
+        arf_extra_bits = layer_extra_bits[gf_group->layer_depth[idx]];
+        gf_group->bit_allocation[idx] = base_frame_bits + arf_extra_bits;
+        break;
+      case INTNL_OVERLAY_UPDATE:
+      case OVERLAY_UPDATE: gf_group->bit_allocation[idx] = 0; break;
+      default: gf_group->bit_allocation[idx] = base_frame_bits; break;
+    }
+  }
+
+  // Set the frame following the current GOP to 0 bit allocation. For ARF
+  // groups, this next frame will be the overlay frame, which is the first
+  // frame in the next GOP. For a GF group, the next GOP will overwrite the
+  // rate allocation. Setting this frame to use 0 bits (out of the current
+  // GOP budget) will simplify the logic in reference frame management.
+  gf_group->bit_allocation[gf_group_size] = 0;
+}
+#else
+static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc,
+                                   int64_t gf_group_bits, int gf_arf_bits,
+                                   int key_frame, int use_arf) {
+  int64_t total_group_bits = gf_group_bits;
+
+  // For key frames the frame target rate is already set and it
+  // is also the golden frame.
+  // === [frame_index == 0] ===
+  int frame_index = 0;
+  if (!key_frame) {
+    if (rc->source_alt_ref_active)
+      gf_group->bit_allocation[frame_index] = 0;
+    else
+      gf_group->bit_allocation[frame_index] = gf_arf_bits;
+  }
+
+  // Deduct the boost bits for arf (or gf if it is not a key frame)
+  // from the group total.
+  if (use_arf || !key_frame) total_group_bits -= gf_arf_bits;
+
+  frame_index++;
+
+  // Store the bits to spend on the ARF if there is one.
+  // === [frame_index == 1] ===
+  if (use_arf) {
+    gf_group->bit_allocation[frame_index] = gf_arf_bits;
+    ++frame_index;
+  }
+
+  const int gf_group_size = gf_group->size;
+  int arf_depth_bits[MAX_ARF_LAYERS + 1] = { 0 };
+  int arf_depth_count[MAX_ARF_LAYERS + 1] = { 0 };
+  int arf_depth_boost[MAX_ARF_LAYERS + 1] = { 0 };
+  int total_arfs = 0;
+  int total_overlays = rc->source_alt_ref_active;
+
+  for (int idx = 0; idx < gf_group_size; ++idx) {
+    if (gf_group->update_type[idx] == ARF_UPDATE ||
+        gf_group->update_type[idx] == INTNL_ARF_UPDATE ||
+        gf_group->update_type[idx] == LF_UPDATE) {
+      arf_depth_boost[gf_group->layer_depth[idx]] += gf_group->arf_boost[idx];
+      ++arf_depth_count[gf_group->layer_depth[idx]];
+    }
+  }
+
+  for (int idx = 2; idx <= MAX_ARF_LAYERS; ++idx) {
+    arf_depth_bits[idx] =
+        calculate_boost_bits(rc->baseline_gf_interval - total_arfs -
+                                 total_overlays - arf_depth_count[idx],
+                             arf_depth_boost[idx], total_group_bits);
+    total_group_bits -= arf_depth_bits[idx];
+    total_arfs += arf_depth_count[idx];
+  }
+
+  for (int idx = frame_index; idx < gf_group_size; ++idx) {
+    switch (gf_group->update_type[idx]) {
+      case ARF_UPDATE:
+      case INTNL_ARF_UPDATE:
+      case LF_UPDATE:
+        gf_group->bit_allocation[idx] =
+            (int)(((int64_t)arf_depth_bits[gf_group->layer_depth[idx]] *
+                   gf_group->arf_boost[idx]) /
+                  arf_depth_boost[gf_group->layer_depth[idx]]);
+        break;
+      case INTNL_OVERLAY_UPDATE:
+      case OVERLAY_UPDATE:
+      default: gf_group->bit_allocation[idx] = 0; break;
+    }
+  }
+
+  // Set the frame following the current GOP to 0 bit allocation. For ARF
+  // groups, this next frame will be the overlay frame, which is the first
+  // frame in the next GOP. For a GF group, the next GOP will overwrite the
+  // rate allocation. Setting this frame to use 0 bits (out of the current
+  // GOP budget) will simplify the logic in reference frame management.
+  gf_group->bit_allocation[gf_group_size] = 0;
+}
+#endif
+
+// Returns true if KF group and GF group both are almost completely static.
+static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion) {
+  return (gf_zero_motion >= 0.995) &&
+         (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
+}
+
+#define ARF_ABS_ZOOM_THRESH 4.4
+static INLINE int detect_gf_cut(AV1_COMP *cpi, int frame_index, int cur_start,
+                                int flash_detected, int active_max_gf_interval,
+                                int active_min_gf_interval,
+                                GF_GROUP_STATS *gf_stats) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  // Motion breakout threshold for loop below depends on image size.
+  const double mv_ratio_accumulator_thresh =
+      (cpi->initial_height + cpi->initial_width) / 4.0;
+
+  if (!flash_detected) {
+    // Break clause to detect very still sections after motion. For example,
+    // a static image after a fade or other transition.
+    if (detect_transition_to_still(
+            twopass, rc->min_gf_interval, frame_index - cur_start, 5,
+            gf_stats->loop_decay_rate, gf_stats->last_loop_decay_rate)) {
+      return 1;
+    }
+  }
+
+  // Some conditions to break out after min interval.
+  if (frame_index - cur_start >= active_min_gf_interval &&
+      // If possible don't break very close to a kf
+      (rc->frames_to_key - frame_index >= rc->min_gf_interval) &&
+      ((frame_index - cur_start) & 0x01) && !flash_detected &&
+      (gf_stats->mv_ratio_accumulator > mv_ratio_accumulator_thresh ||
+       gf_stats->abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) {
+    return 1;
+  }
+
+  // If almost totally static, we will not use the max GF length later,
+  // so we can continue for more frames.
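+  // Otherwise the group is cut as soon as its length exceeds
+  // active_max_gf_interval.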
+  if (((frame_index - cur_start) >= active_max_gf_interval + 1) &&
+      !is_almost_static(gf_stats->zero_motion_accumulator,
+                        twopass->kf_zeromotion_pct)) {
+    return 1;
+  }
+  return 0;
+}
+
+#define MAX_PAD_GF_CHECK 6  // padding length to check for gf length
+#define AVG_SI_THRES 0.6    // threshold for average silhouette
+#define GF_SHRINK_OUTPUT 0  // print output for gf length decision
+int determine_high_err_gf(double *errs, int *is_high, double *si, int len,
+                          double *ratio, int gf_start, int gf_end,
+                          int before_pad) {
+  (void)gf_start;
+  (void)gf_end;
+  (void)before_pad;
+  // alpha and beta control the threshold placement
+  // e.g. a smaller alpha makes the lower group more rigid
+  const double alpha = 0.5;
+  const double beta = 1 - alpha;
+  double mean = 0;
+  double mean_low = 0;
+  double mean_high = 0;
+  double prev_mean_low = 0;
+  double prev_mean_high = 0;
+  int count_low = 0;
+  int count_high = 0;
+  // calculate mean of errs
+  for (int i = 0; i < len; i++) {
+    mean += errs[i];
+  }
+  mean /= len;
+  // separate into two initial groups with greater / lower than mean
+  for (int i = 0; i < len; i++) {
+    if (errs[i] <= mean) {
+      is_high[i] = 0;
+      count_low++;
+      prev_mean_low += errs[i];
+    } else {
+      is_high[i] = 1;
+      count_high++;
+      prev_mean_high += errs[i];
+    }
+  }
+  prev_mean_low /= count_low;
+  prev_mean_high /= count_high;
+  // k-means to refine
+  int count = 0;
+  while (count < 10) {
+    // re-group
+    mean_low = 0;
+    mean_high = 0;
+    count_low = 0;
+    count_high = 0;
+    double thres = prev_mean_low * alpha + prev_mean_high * beta;
+    for (int i = 0; i < len; i++) {
+      if (errs[i] <= thres) {
+        is_high[i] = 0;
+        count_low++;
+        mean_low += errs[i];
+      } else {
+        is_high[i] = 1;
+        count_high++;
+        mean_high += errs[i];
+      }
+    }
+    mean_low /= count_low;
+    mean_high /= count_high;
+
+    // break if not changed much
+    if (fabs((mean_low - prev_mean_low) / (prev_mean_low + 0.00001)) <
+            0.00001 &&
+        fabs((mean_high - prev_mean_high) / (prev_mean_high + 0.00001)) <
+            0.00001)
+      break;
+
+    // update means
+    prev_mean_high = mean_high;
+    prev_mean_low = mean_low;
+
+    count++;
+  }
+
+  // count how many group-change jumps occur across the sequence
+  int num_change = 0;
+  for (int i = 0; i < len - 1; i++) {
+    if (is_high[i] != is_high[i + 1]) num_change++;
+  }
+
+  // get silhouette as a measure of the classification quality
+  double avg_si = 0;
+  // ai: avg dist of its own class, bi: avg dist to the other class
+  double ai, bi;
+  if (count_low > 1 && count_high > 1) {
+    for (int i = 0; i < len; i++) {
+      ai = 0;
+      bi = 0;
+      // calculate average distance to everyone in the same group
+      // and in the other group
+      for (int j = 0; j < len; j++) {
+        if (i == j) continue;
+        if (is_high[i] == is_high[j]) {
+          ai += fabs(errs[i] - errs[j]);
+        } else {
+          bi += fabs(errs[i] - errs[j]);
+        }
+      }
+      if (is_high[i] == 0) {
+        ai = ai / (count_low - 1);
+        bi = bi / count_high;
+      } else {
+        ai = ai / (count_high - 1);
+        bi = bi / count_low;
+      }
+      if (ai <= bi) {
+        si[i] = 1 - ai / (bi + 0.00001);
+      } else {
+        si[i] = bi / (ai + 0.00001) - 1;
+      }
+      avg_si += si[i];
+    }
+    avg_si /= len;
+  }
+
+  int reset = 0;
+  *ratio = mean_high / (mean_low + 0.00001);
+  // If the two groups are too similar, there are too many group changes, or
+  // the silhouette is too small, the classification is not confident; reset
+  // everything to 0 later so we fall back to the original decision.
+  if (*ratio < 1.3 || num_change > AOMMAX(len / 3, 6) ||
+      avg_si < AVG_SI_THRES) {
+    reset = 1;
+  }
+
+#if GF_SHRINK_OUTPUT
+  printf("\n");
+  for (int i = 0; i < len; i++) {
+    printf("%d: err %.1f, ishigh %d, si %.2f, (i=%d)\n",
+           gf_start + i - before_pad, errs[i], is_high[i], si[i], gf_end);
+  }
+  printf(
+      "count: %d, mean_high: %.1f, mean_low: %.1f, avg_si: %.2f, num_change: "
+      "%d, ratio %.2f, reset: %d\n",
+      count, mean_high, mean_low, avg_si, num_change,
+      mean_high / (mean_low + 0.000001), reset);
+#endif
+
+  if (reset) {
+    memset(is_high, 0, sizeof(is_high[0]) * len);
+    memset(si, 0, sizeof(si[0]) * len);
+  }
+  return reset;
+}
+
+#if GROUP_ADAPTIVE_MAXQ
+#define RC_FACTOR_MIN 0.75
+#define RC_FACTOR_MAX 1.25
+#endif  // GROUP_ADAPTIVE_MAXQ
+#define MIN_FWD_KF_INTERVAL 8
+#define MIN_SHRINK_LEN 6  // the minimum length of gf if we are shrinking
+#define SI_HIGH AVG_SI_THRES  // high quality classification
+#define SI_LOW 0.3            // very unsure classification
+// This function finds a low-error frame prior to the current last frame in
+// the gf group, and sets the last frame to it.
+// The resulting last frame is returned via *cur_last_ptr.
+// *cur_start_ptr and cut_pos[n] could also change due to shrinking
+// previous gf groups.
+void set_last_prev_low_err(int *cur_start_ptr, int *cur_last_ptr, int *cut_pos,
+                           int count_cuts, int before_pad, double ratio,
+                           int *is_high, double *si, int prev_lows) {
+  int n;
+  int cur_start = *cur_start_ptr;
+  int cur_last = *cur_last_ptr;
+  for (n = cur_last; n >= cur_start + MIN_SHRINK_LEN; n--) {
+    // try to find a point that is very likely to be good
+    if (is_high[n - cur_start + before_pad] == 0 &&
+        si[n - cur_start + before_pad] > SI_HIGH) {
+      *cur_last_ptr = n;
+      return;
+    }
+  }
+  // Could not find a low-err point, so try to find an "unsure"
+  // point at least.
+  for (n = cur_last; n >= cur_start + MIN_SHRINK_LEN; n--) {
+    if ((is_high[n - cur_start + before_pad] == 0) ||
+        (is_high[n - cur_start + before_pad] &&
+         si[n - cur_start + before_pad] < SI_LOW)) {
+      *cur_last_ptr = n;
+      return;
+    }
+  }
+  if (prev_lows) {
+    // try with shrinking previous all_zero interval
+    for (n = cur_start + MIN_SHRINK_LEN - 1; n > cur_start; n--) {
+      if (is_high[n - cur_start + before_pad] == 0 &&
+          si[n - cur_start + before_pad] > SI_HIGH) {
+        int tentative_start = n - MIN_SHRINK_LEN;
+        // check if the previous interval can shrink this much
+        int available =
+            tentative_start - cut_pos[count_cuts - 2] > MIN_SHRINK_LEN &&
+            cur_start - tentative_start < prev_lows;
+        // shrinking too aggressively may worsen performance
+        // set stricter thres for shorter length
+        double ratio_thres =
+            1.0 * (cur_start - tentative_start) / (double)(MIN_SHRINK_LEN) +
+            1.0;
+
+        if (available && (ratio > ratio_thres)) {
+          cut_pos[count_cuts - 1] = tentative_start;
+          *cur_start_ptr = tentative_start;
+          *cur_last_ptr = n;
+          return;
+        }
+      }
+    }
+  }
+  if (prev_lows) {
+    // try with shrinking previous all_zero interval with unsure points
+    for (n = cur_start + MIN_SHRINK_LEN - 1; n > cur_start; n--) {
+      if ((is_high[n - cur_start + before_pad] == 0) ||
+          (is_high[n - cur_start + before_pad] &&
+           si[n - cur_start + before_pad] < SI_LOW)) {
+        int tentative_start = n - MIN_SHRINK_LEN;
+        // check if the previous interval can shrink this much
+        int available =
+            tentative_start - cut_pos[count_cuts - 2] > MIN_SHRINK_LEN &&
+            cur_start - tentative_start < prev_lows;
+        // shrinking too aggressively may worsen performance
+        double ratio_thres =
+            1.0 * (cur_start - tentative_start) / (double)(MIN_SHRINK_LEN) +
+            1.0;
+
+        if (available && (ratio > ratio_thres)) {
+          cut_pos[count_cuts - 1] = tentative_start;
+          *cur_start_ptr = tentative_start;
+          *cur_last_ptr = n;
+          return;
+        }
+      }
+    }
+  }  // 
prev_lows + return; +} + +// This function decides the gf group length of future frames in batch +// rc->gf_intervals is modified to store the group lengths +static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length, + int max_intervals) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; + FRAME_INFO *frame_info = &cpi->frame_info; + int i; + + int flash_detected; + + aom_clear_system_state(); + av1_zero(next_frame); + + if (has_no_stats_stage(cpi)) { + for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) { + rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length); + } + rc->cur_gf_index = 0; + rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS; + return; + } + + // TODO(urvang): Try logic to vary min and max interval based on q. + const int active_min_gf_interval = rc->min_gf_interval; + const int active_max_gf_interval = + AOMMIN(rc->max_gf_interval, max_gop_length); + + i = 0; + max_intervals = cpi->lap_enabled ? 1 : max_intervals; + int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { 0 }; + int count_cuts = 1; + int cur_start = 0, cur_last; + int cut_here; + int prev_lows = 0; + GF_GROUP_STATS gf_stats; + init_gf_stats(&gf_stats); + while (count_cuts < max_intervals + 1) { + ++i; + + // reaches next key frame, break here + if (i >= rc->frames_to_key) { + cut_pos[count_cuts] = i - 1; + count_cuts++; + break; + } + + // reached maximum len, but nothing special yet (almost static) + // let's look at the next interval + if (i - cur_start >= rc->static_scene_max_gf_interval) { + cut_here = 1; + } else { + // reaches last frame, break + if (EOF == input_stats(twopass, &next_frame)) { + cut_pos[count_cuts] = i - 1; + count_cuts++; + break; + } + // Test for the case where there is a brief flash but the prediction + // quality back to an earlier frame is then restored. 
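+      // (detect_flash() keys off the next frame recovering via its second
+      // reference: pcnt_second_ref both above pcnt_inter and at least 0.5
+      // in the frame after the flash.)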
+      flash_detected = detect_flash(twopass, 0);
+      // TODO(bohanli): remove redundant accumulations here, or unify
+      // this and the ones in define_gf_group
+      accumulate_next_frame_stats(&next_frame, frame_info, twopass,
+                                  flash_detected, rc->frames_since_key, i, 0,
+                                  rc->min_gf_interval, &gf_stats);
+
+      cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected,
+                               active_max_gf_interval, active_min_gf_interval,
+                               &gf_stats);
+    }
+    if (cut_here) {
+      cur_last = i - 1;  // the current last frame in the gf group
+      // only try shrinking if the interval is smaller than
+      // active_max_gf_interval
+      if (cur_last - cur_start <= active_max_gf_interval) {
+        // determine in the current decided gop the higher and lower errs
+        int n;
+        double ratio;
+
+        // load neighboring coded errs
+        int is_high[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
+        double errs[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
+        double si[MAX_GF_INTERVAL + 1 + MAX_PAD_GF_CHECK * 2] = { 0 };
+        int before_pad =
+            AOMMIN(MAX_PAD_GF_CHECK, rc->frames_since_key - 1 + cur_start);
+        int after_pad =
+            AOMMIN(MAX_PAD_GF_CHECK, rc->frames_to_key - cur_last - 1);
+        for (n = cur_start - before_pad; n <= cur_last + after_pad; n++) {
+          if (start_pos + n - 1 > twopass->stats_buf_ctx->stats_in_end) {
+            after_pad = n - cur_last - 1;
+            assert(after_pad >= 0);
+            break;
+          } else if (start_pos + n - 1 <
+                     twopass->stats_buf_ctx->stats_in_start) {
+            before_pad = cur_start - n - 1;
+            continue;
+          }
+          errs[n + before_pad - cur_start] = (start_pos + n - 1)->coded_error;
+        }
+        const int len = before_pad + after_pad + cur_last - cur_start + 1;
+        const int reset = determine_high_err_gf(
+            errs, is_high, si, len, &ratio, cur_start, cur_last, before_pad);
+
+        // if the current frame may have high error, try shrinking
+        if (is_high[cur_last - cur_start + before_pad] == 1 ||
+            (!reset && si[cur_last - cur_start + before_pad] < SI_LOW)) {
+          // try not to cut in high err area
+          set_last_prev_low_err(&cur_start, &cur_last, cut_pos, count_cuts,
+                                before_pad, ratio, is_high, si, prev_lows);
+        }  // if current frame high error
+        // count how many trailing lower error frames we have in this decided
+        // gf group
+        prev_lows = 0;
+        for (n = cur_last - 1; n > cur_start + MIN_SHRINK_LEN; n--) {
+          if (is_high[n - cur_start + before_pad] == 0 &&
+              (si[n - cur_start + before_pad] > SI_HIGH || reset)) {
+            prev_lows++;
+          } else {
+            break;
+          }
+        }
+      }
+      cut_pos[count_cuts] = cur_last;
+      count_cuts++;
+
+      // reset pointers to the shrunken location
+      twopass->stats_in = start_pos + cur_last;
+      cur_start = cur_last;
+      i = cur_last;
+
+      // reset accumulators
+      init_gf_stats(&gf_stats);
+    }
+  }
+
+  // save intervals
+  rc->intervals_till_gf_calculate_due = count_cuts - 1;
+  for (int n = 1; n < count_cuts; n++) {
+    rc->gf_intervals[n - 1] = cut_pos[n] + 1 - cut_pos[n - 1];
+  }
+  rc->cur_gf_index = 0;
+  twopass->stats_in = start_pos;
+
+#if GF_SHRINK_OUTPUT
+  printf("\nf_to_key: %d, count_cut: %d.
", rc->frames_to_key, count_cuts); + for (int n = 0; n < count_cuts; n++) { + printf("%d ", cut_pos[n]); + } + printf("\n"); + + for (int n = 0; n < rc->intervals_till_gf_calculate_due; n++) { + printf("%d ", rc->gf_intervals[n]); + } + printf("\n\n"); +#endif +} + +static void correct_frames_to_key(AV1_COMP *cpi) { + int lookahead_size = + (int)av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage) + 1; + if (lookahead_size < + av1_lookahead_pop_sz(cpi->lookahead, cpi->compressor_stage)) { + cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size); + } +} + +static void define_gf_group_pass0(AV1_COMP *cpi, + const EncodeFrameParams *const frame_params) { + RATE_CONTROL *const rc = &cpi->rc; + GF_GROUP *const gf_group = &cpi->gf_group; + int target; + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + av1_cyclic_refresh_set_golden_update(cpi); + } else { + rc->baseline_gf_interval = rc->gf_intervals[rc->cur_gf_index]; + rc->intervals_till_gf_calculate_due--; + rc->cur_gf_index++; + } + + // correct frames_to_key when lookahead queue is flushing + correct_frames_to_key(cpi); + + if (rc->baseline_gf_interval > rc->frames_to_key) + rc->baseline_gf_interval = rc->frames_to_key; + + rc->gfu_boost = DEFAULT_GF_BOOST; + rc->constrained_gf_group = + (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0; + + gf_group->max_layer_depth_allowed = cpi->oxcf.gf_max_pyr_height; + + // Rare case when the look-ahead is less than the target GOP length, can't + // generate ARF frame. + if (rc->baseline_gf_interval > cpi->oxcf.lag_in_frames || + !is_altref_enabled(cpi) || rc->baseline_gf_interval < rc->min_gf_interval) + gf_group->max_layer_depth_allowed = 0; + + // Set up the structure of this Group-Of-Pictures (same as GF_GROUP) + av1_gop_setup_structure(cpi, frame_params); + + // Allocate bits to each of the frames in the GF group. + // TODO(sarahparker) Extend this to work with pyramid structure. + for (int cur_index = 0; cur_index < gf_group->size; ++cur_index) { + const FRAME_UPDATE_TYPE cur_update_type = gf_group->update_type[cur_index]; + if (cpi->oxcf.rc_mode == AOM_CBR) { + if (cur_update_type == KEY_FRAME) { + target = av1_calc_iframe_target_size_one_pass_cbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_cbr(cpi, cur_update_type); + } + } else { + if (cur_update_type == KEY_FRAME) { + target = av1_calc_iframe_target_size_one_pass_vbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_vbr(cpi, cur_update_type); + } + } + gf_group->bit_allocation[cur_index] = target; + } +} + +static INLINE void set_baseline_gf_interval(AV1_COMP *cpi, int arf_position, + int active_max_gf_interval, + int use_alt_ref, + int is_final_pass) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + // Set the interval until the next gf. + // If forward keyframes are enabled, ensure the final gf group obeys the + // MIN_FWD_KF_INTERVAL. 
+ if (cpi->oxcf.fwd_kf_enabled && use_alt_ref && + ((twopass->stats_in - arf_position + rc->frames_to_key) < + twopass->stats_buf_ctx->stats_in_end) && + cpi->rc.next_is_fwd_key) { + if (arf_position == rc->frames_to_key) { + rc->baseline_gf_interval = arf_position; + // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL + } else if ((rc->frames_to_key - arf_position < + AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) && + (rc->frames_to_key != arf_position)) { + // if possible, merge the last two gf groups + if (rc->frames_to_key <= active_max_gf_interval) { + rc->baseline_gf_interval = rc->frames_to_key; + if (is_final_pass) rc->intervals_till_gf_calculate_due = 0; + // if merging the last two gf groups creates a group that is too long, + // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL + } else { + rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL; + if (is_final_pass) rc->intervals_till_gf_calculate_due = 0; + } + } else { + rc->baseline_gf_interval = arf_position - rc->source_alt_ref_pending; + } + } else { + rc->baseline_gf_interval = arf_position - rc->source_alt_ref_pending; + } +} + +// initialize GF_GROUP_STATS +static void init_gf_stats(GF_GROUP_STATS *gf_stats) { + gf_stats->gf_group_err = 0.0; + gf_stats->gf_group_raw_error = 0.0; + gf_stats->gf_group_skip_pct = 0.0; + gf_stats->gf_group_inactive_zone_rows = 0.0; + + gf_stats->mv_ratio_accumulator = 0.0; + gf_stats->decay_accumulator = 1.0; + gf_stats->zero_motion_accumulator = 1.0; + gf_stats->loop_decay_rate = 1.0; + gf_stats->last_loop_decay_rate = 1.0; + gf_stats->this_frame_mv_in_out = 0.0; + gf_stats->mv_in_out_accumulator = 0.0; + gf_stats->abs_mv_in_out_accumulator = 0.0; + + gf_stats->avg_sr_coded_error = 0.0; + gf_stats->avg_tr_coded_error = 0.0; + gf_stats->avg_pcnt_second_ref = 0.0; + gf_stats->avg_pcnt_third_ref = 0.0; + gf_stats->avg_pcnt_third_ref_nolast = 0.0; + gf_stats->avg_new_mv_count = 0.0; + gf_stats->avg_wavelet_energy = 0.0; + gf_stats->avg_raw_err_stdev = 0.0; + gf_stats->non_zero_stdev_count = 0; + + gf_stats->allow_alt_ref = 0; +} + +// Analyse and define a gf/arf group. +#define MAX_GF_BOOST 5400 +static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, + const EncodeFrameParams *const frame_params, + int max_gop_length, int is_final_pass) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + AV1EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; + GF_GROUP *gf_group = &cpi->gf_group; + FRAME_INFO *frame_info = &cpi->frame_info; + int i; + + int flash_detected; + int64_t gf_group_bits; + const int is_intra_only = frame_params->frame_type == KEY_FRAME || + frame_params->frame_type == INTRA_ONLY_FRAME; + const int arf_active_or_kf = is_intra_only || rc->source_alt_ref_active; + + cpi->internal_altref_allowed = (oxcf->gf_max_pyr_height > 1); + + // Reset the GF group data structures unless this is a key + // frame in which case it will already have been done. 
+ if (!is_intra_only) { + av1_zero(cpi->gf_group); + } + + aom_clear_system_state(); + av1_zero(next_frame); + + if (has_no_stats_stage(cpi)) { + define_gf_group_pass0(cpi, frame_params); + return; + } + + // correct frames_to_key when lookahead queue is emptying + if (cpi->lap_enabled) { + correct_frames_to_key(cpi); + } + + GF_GROUP_STATS gf_stats; + init_gf_stats(&gf_stats); + GF_FRAME_STATS first_frame_stats, last_frame_stats; + + gf_stats.allow_alt_ref = is_altref_enabled(cpi); + const int can_disable_arf = (oxcf->gf_min_pyr_height == MIN_PYRAMID_LVL); + + // Load stats for the current frame. + double mod_frame_err = + calculate_modified_err(frame_info, twopass, oxcf, this_frame); + + // Note the error of the frame at the start of the group. This will be + // the GF frame error if we code a normal gf. + first_frame_stats.frame_err = mod_frame_err; + first_frame_stats.frame_coded_error = this_frame->coded_error; + first_frame_stats.frame_sr_coded_error = this_frame->sr_coded_error; + first_frame_stats.frame_tr_coded_error = this_frame->tr_coded_error; + + // If this is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. + if (arf_active_or_kf) { + gf_stats.gf_group_err -= first_frame_stats.frame_err; +#if GROUP_ADAPTIVE_MAXQ + gf_stats.gf_group_raw_error -= this_frame->coded_error; +#endif + gf_stats.gf_group_skip_pct -= this_frame->intra_skip_pct; + gf_stats.gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows; + } + + // TODO(urvang): Try logic to vary min and max interval based on q. + const int active_min_gf_interval = rc->min_gf_interval; + const int active_max_gf_interval = + AOMMIN(rc->max_gf_interval, max_gop_length); + + i = 0; + // get the determined gf group length from rc->gf_intervals + while (i < rc->gf_intervals[rc->cur_gf_index]) { + ++i; + // Accumulate error score of frames in this gf group. + mod_frame_err = + calculate_modified_err(frame_info, twopass, oxcf, this_frame); + // accumulate stats for this frame + accumulate_this_frame_stats(this_frame, mod_frame_err, &gf_stats); + + // read in the next frame + if (EOF == input_stats(twopass, &next_frame)) break; + + // Test for the case where there is a brief flash but the prediction + // quality back to an earlier frame is then restored. + flash_detected = detect_flash(twopass, 0); + + // accumulate stats for next frame + accumulate_next_frame_stats( + &next_frame, frame_info, twopass, flash_detected, rc->frames_since_key, + i, can_disable_arf, rc->min_gf_interval, &gf_stats); + + *this_frame = next_frame; + } + // save the errs for the last frame + last_frame_stats.frame_coded_error = next_frame.coded_error; + last_frame_stats.frame_sr_coded_error = next_frame.sr_coded_error; + last_frame_stats.frame_tr_coded_error = next_frame.tr_coded_error; + + if (is_final_pass) { + rc->intervals_till_gf_calculate_due--; + rc->cur_gf_index++; + } + + // Was the group length constrained by the requirement for a new KF? + rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; + + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cm->mi_params.MBs; + assert(num_mbs > 0); + + average_gf_stats(i, &next_frame, &gf_stats); + + // Disable internal ARFs for "still" gf groups. + // zero_motion_accumulator: minimum percentage of (0,0) motion; + // avg_sr_coded_error: average of the SSE per pixel of each frame; + // avg_raw_err_stdev: average of the standard deviation of (0,0) + // motion error per block of each frame. 
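+  // All three conditions below must hold to disable internal ARFs: near-total
+  // zero motion, low average per-mb second-ref error, and low variance of the
+  // zero-motion error, i.e. the group is effectively a still section.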
+  const int can_disable_internal_arfs =
+      (oxcf->gf_min_pyr_height <= MIN_PYRAMID_LVL + 1);
+  if (can_disable_internal_arfs &&
+      gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION &&
+      gf_stats.avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+      gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) {
+    cpi->internal_altref_allowed = 0;
+  }
+
+  int use_alt_ref;
+  if (can_disable_arf) {
+    use_alt_ref = !is_almost_static(gf_stats.zero_motion_accumulator,
+                                    twopass->kf_zeromotion_pct) &&
+                  gf_stats.allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
+                  (i >= MIN_GF_INTERVAL) &&
+                  (cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
+
+    // TODO(urvang): Improve and use model for VBR, CQ etc as well.
+    if (use_alt_ref && cpi->oxcf.rc_mode == AOM_Q &&
+        cpi->oxcf.cq_level <= 200) {
+      aom_clear_system_state();
+      float features[21];
+      get_features_from_gf_stats(
+          &gf_stats, &first_frame_stats, &last_frame_stats, num_mbs,
+          rc->constrained_gf_group, twopass->kf_zeromotion_pct, i, features);
+      // Infer using ML model.
+      float score;
+      av1_nn_predict(features, &av1_use_flat_gop_nn_config, 1, &score);
+      use_alt_ref = (score <= 0.0);
+    }
+  } else {
+    assert(cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
+    use_alt_ref =
+        gf_stats.allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i > 2);
+  }
+
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+#define REDUCE_GF_LENGTH_BY 1
+  int alt_offset = 0;
+  // The length reduction strategy is tweaked for certain cases, and doesn't
+  // work well for certain other cases.
+  const int allow_gf_length_reduction =
+      ((cpi->oxcf.rc_mode == AOM_Q && cpi->oxcf.cq_level <= 128) ||
+       !cpi->internal_altref_allowed) &&
+      !is_lossless_requested(&cpi->oxcf);
+
+  if (allow_gf_length_reduction && use_alt_ref) {
+    // Adjust the length of this gf group if one of the following conditions
+    // is met:
+    // 1: only one overlay frame left and this gf is too long
+    // 2: next gf group is too short to have an arf compared to the current gf
+
+    // maximum length of next gf group
+    const int next_gf_len = rc->frames_to_key - i;
+    const int single_overlay_left =
+        next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+    // the next gf is probably going to have an ARF but it will be shorter
+    // than this gf
+    const int unbalanced_gf =
+        i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+        next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+        next_gf_len + 1 >= rc->min_gf_interval;
+
+    if (single_overlay_left || unbalanced_gf) {
+      const int roll_back = REDUCE_GF_LENGTH_BY;
+      // Reduce length only if active_min_gf_interval will be respected later.
+      if (i - roll_back >= active_min_gf_interval + 1) {
+        alt_offset = -roll_back;
+        i -= roll_back;
+        if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
+      }
+    }
+  }
+
+  // Should we use the alternate reference frame?
+  if (use_alt_ref) {
+    rc->source_alt_ref_pending = 1;
+    gf_group->max_layer_depth_allowed = cpi->oxcf.gf_max_pyr_height;
+    set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref,
+                             is_final_pass);
+
+    const int forward_frames = (rc->frames_to_key - i >= i - 1)
+                                   ? i - 1
+                                   : AOMMAX(0, rc->frames_to_key - i);
+
+    // Calculate the boost for alt ref.
+    rc->gfu_boost = av1_calc_arf_boost(
+        twopass, rc, frame_info, alt_offset, forward_frames, (i - 1),
+        cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL,
+        cpi->lap_enabled ?
&rc->num_stats_required_for_gfu_boost : NULL);
+  } else {
+    reset_fpf_position(twopass, start_pos);
+    rc->source_alt_ref_pending = 0;
+    gf_group->max_layer_depth_allowed = 0;
+    set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref,
+                             is_final_pass);
+
+    rc->gfu_boost = AOMMIN(
+        MAX_GF_BOOST,
+        av1_calc_arf_boost(
+            twopass, rc, frame_info, alt_offset, (i - 1), 0,
+            cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL,
+            cpi->lap_enabled ? &rc->num_stats_required_for_gfu_boost : NULL));
+  }
+
+  // rc->gf_intervals assumes the usage of alt_ref, and therefore adds one
+  // overlay frame to the next gf. If no alt_ref is used, 1 frame should be
+  // subtracted from the next gf group.
+  // TODO(bohanli): should incorporate the usage of alt_ref into
+  // calculate_gf_length
+  if (is_final_pass && rc->source_alt_ref_pending == 0 &&
+      rc->intervals_till_gf_calculate_due > 0) {
+    rc->gf_intervals[rc->cur_gf_index]--;
+  }
+
+#define LAST_ALR_BOOST_FACTOR 0.2f
+  rc->arf_boost_factor = 1.0;
+  if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
+    // Reduce the boost of altref in the last gf group
+    if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
+        rc->frames_to_key - i == 0) {
+      rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+    }
+  }
+
+  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+  // Reset the file position.
+  reset_fpf_position(twopass, start_pos);
+
+  // Calculate the bits to be allocated to the gf/arf group as a whole
+  gf_group_bits = calculate_total_gf_group_bits(cpi, gf_stats.gf_group_err);
+  rc->gf_group_bits = gf_group_bits;
+
+#if GROUP_ADAPTIVE_MAXQ
+  // Calculate an estimate of the maxq needed for the group.
+  // We are more aggressive about correcting for sections
+  // where there could be significant overshoot than for easier
+  // sections where we do not wish to risk creating an overshoot
+  // of the allocated bit budget.
+  if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
+    const int vbr_group_bits_per_frame =
+        (int)(gf_group_bits / rc->baseline_gf_interval);
+    const double group_av_err =
+        gf_stats.gf_group_raw_error / rc->baseline_gf_interval;
+    const double group_av_skip_pct =
+        gf_stats.gf_group_skip_pct / rc->baseline_gf_interval;
+    const double group_av_inactive_zone =
+        ((gf_stats.gf_group_inactive_zone_rows * 2) /
+         (rc->baseline_gf_interval * (double)cm->mi_params.mb_rows));
+
+    int tmp_q;
+    // rc factor is a weight factor that corrects for local rate control drift.
+    double rc_factor = 1.0;
+    int64_t bits = cpi->oxcf.target_bandwidth;
+
+    if (bits > 0) {
+      int rate_error;
+
+      rate_error = (int)((rc->vbr_bits_off_target * 100) / bits);
+      rate_error = clamp(rate_error, -100, 100);
+      if (rate_error > 0) {
+        rc_factor = AOMMAX(RC_FACTOR_MIN, (double)(100 - rate_error) / 100.0);
+      } else {
+        rc_factor = AOMMIN(RC_FACTOR_MAX, (double)(100 - rate_error) / 100.0);
+      }
+    }
+
+    tmp_q = get_twopass_worst_quality(
+        cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+        vbr_group_bits_per_frame, rc_factor);
+    rc->active_worst_quality = AOMMAX(tmp_q, rc->active_worst_quality >> 1);
+  }
+#endif
+
+  // Adjust KF group bits and error remaining.
+  if (is_final_pass)
+    twopass->kf_group_error_left -= (int64_t)gf_stats.gf_group_err;
+
+  // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+  av1_gop_setup_structure(cpi, frame_params);
+
+  // Reset the file position.
+  reset_fpf_position(twopass, start_pos);
+
+  // Calculate a section intra ratio used in setting max loop filter.
+  if (frame_params->frame_type != KEY_FRAME) {
+    twopass->section_intra_rating = calculate_section_intra_ratio(
+        start_pos, twopass->stats_buf_ctx->stats_in_end,
+        rc->baseline_gf_interval);
+  }
+
+  // Reset rolling actual and target bits counters for ARF groups.
+  twopass->rolling_arf_group_target_bits = 1;
+  twopass->rolling_arf_group_actual_bits = 1;
+
+  av1_gop_bit_allocation(cpi, rc, gf_group,
+                         frame_params->frame_type == KEY_FRAME, use_alt_ref,
+                         gf_group_bits);
+}
+
+// #define FIXED_ARF_BITS
+#ifdef FIXED_ARF_BITS
+#define ARF_BITS_FRACTION 0.75
+#endif
+void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
+                            GF_GROUP *gf_group, int is_key_frame, int use_arf,
+                            int64_t gf_group_bits) {
+  // Calculate the extra bits to be used for boosted frame(s)
+#ifdef FIXED_ARF_BITS
+  int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits);
+#else
+  int gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
+                                         rc->gfu_boost, gf_group_bits);
+#endif
+
+  gf_arf_bits = adjust_boost_bits_for_target_level(cpi, rc, gf_arf_bits,
+                                                   gf_group_bits, 1);
+
+  // Allocate bits to each of the frames in the GF group.
+  allocate_gf_group_bits(gf_group, rc, gf_group_bits, gf_arf_bits, is_key_frame,
+                         use_arf);
+}
+
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letter box
+// format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case, even if the frame is not a scene cut, coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+// We adapt the threshold based on the number of frames in this key-frame
+// group so far.
+static double get_second_ref_usage_thresh(int frame_count_so_far) {
+  const int adapt_upto = 32;
+  const double min_second_ref_usage_thresh = 0.085;
+  const double second_ref_usage_thresh_max_delta = 0.035;
+  if (frame_count_so_far >= adapt_upto) {
+    return min_second_ref_usage_thresh + second_ref_usage_thresh_max_delta;
+  }
+  return min_second_ref_usage_thresh +
+         ((double)frame_count_so_far / (adapt_upto - 1)) *
+             second_ref_usage_thresh_max_delta;
+}
+
+static int test_candidate_kf(TWO_PASS *twopass,
+                             const FIRSTPASS_STATS *last_frame,
+                             const FIRSTPASS_STATS *this_frame,
+                             const FIRSTPASS_STATS *next_frame,
+                             int frame_count_so_far, enum aom_rc_mode rc_mode) {
+  int is_viable_kf = 0;
+  double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+  double modified_pcnt_inter =
+      this_frame->pcnt_inter - this_frame->pcnt_neutral;
+  const double second_ref_usage_thresh =
+      get_second_ref_usage_thresh(frame_count_so_far);
+
+  // Does the frame satisfy the primary criteria of a key frame?
+  // See above for an explanation of the test criteria.
+  // If so, then examine how well it predicts subsequent frames.
+  if (IMPLIES(rc_mode == AOM_Q, frame_count_so_far >= 3) &&
+      (this_frame->pcnt_second_ref < second_ref_usage_thresh) &&
+      (next_frame->pcnt_second_ref < second_ref_usage_thresh) &&
+      ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+       ((pcnt_intra > MIN_INTRA_LEVEL) &&
+        (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+        ((this_frame->intra_error /
+          DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+         KF_II_ERR_THRESHOLD) &&
+        ((fabs(last_frame->coded_error - this_frame->coded_error) /
+              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+          ERR_CHANGE_THRESHOLD) ||
+         (fabs(last_frame->intra_error - this_frame->intra_error) /
+              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+          ERR_CHANGE_THRESHOLD) ||
+         ((next_frame->intra_error /
+           DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+          II_IMPROVEMENT_THRESHOLD))))) {
+    int i;
+    const FIRSTPASS_STATS *start_pos = twopass->stats_in;
+    FIRSTPASS_STATS local_next_frame = *next_frame;
+    double boost_score = 0.0;
+    double old_boost_score = 0.0;
+    double decay_accumulator = 1.0;
+
+    // Examine how well the key frame predicts subsequent frames.
+    for (i = 0; i < SCENE_CUT_KEY_TEST_INTERVAL; ++i) {
+      double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
+                             DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+
+      if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
+
+      // Cumulative effect of decay in prediction quality.
+      if (local_next_frame.pcnt_inter > 0.85)
+        decay_accumulator *= local_next_frame.pcnt_inter;
+      else
+        decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
+
+      // Keep a running total.
+      boost_score += (decay_accumulator * next_iiratio);
+
+      // Test various breakout clauses.
+      if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+          (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) <
+            0.20) &&
+           (next_iiratio < 3.0)) ||
+          ((boost_score - old_boost_score) < 3.0) ||
+          (local_next_frame.intra_error < 200)) {
+        break;
+      }
+
+      old_boost_score = boost_score;
+
+      // Get the next frame details
+      if (EOF == input_stats(twopass, &local_next_frame)) break;
+    }
+
+    // If there is tolerable prediction for at least the next 3 frames, then
+    // break out; else discard this potential key frame and move on.
+    if (boost_score > 30.0 && (i > 3)) {
+      is_viable_kf = 1;
+    } else {
+      // Reset the file position
+      reset_fpf_position(twopass, start_pos);
+
+      is_viable_kf = 0;
+    }
+  }
+
+  return is_viable_kf;
+}
+
+#define FRAMES_TO_CHECK_DECAY 8
+#define KF_MIN_FRAME_BOOST 80.0
+#define KF_MAX_FRAME_BOOST 128.0
+#define MIN_KF_BOOST 600  // Minimum boost for non-static KF interval
+#define MAX_KF_BOOST 3200
+#define MIN_STATIC_KF_BOOST 5400  // Minimum boost for static KF interval
+
+static int detect_app_forced_key(AV1_COMP *cpi) {
+  if (cpi->oxcf.fwd_kf_enabled) cpi->rc.next_is_fwd_key = 1;
+  int num_frames_to_app_forced_key = is_forced_keyframe_pending(
+      cpi->lookahead, cpi->lookahead->max_sz, cpi->compressor_stage);
+  if (num_frames_to_app_forced_key != -1) cpi->rc.next_is_fwd_key = 0;
+  return num_frames_to_app_forced_key;
+}
+
+static int get_projected_kf_boost(AV1_COMP *cpi) {
+  /*
+   * If num_stats_used_for_kf_boost >= frames_to_key, then
+   * all stats needed for prior boost calculation are available.
+   * Hence projecting the prior boost is not needed in this case.
+ */ + if (cpi->rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key) + return cpi->rc.kf_boost; + + // Get the current tpl factor (number of frames = frames_to_key). + double tpl_factor = av1_get_kf_boost_projection_factor(cpi->rc.frames_to_key); + // Get the tpl factor when number of frames = num_stats_used_for_kf_boost. + double tpl_factor_num_stats = + av1_get_kf_boost_projection_factor(cpi->rc.num_stats_used_for_kf_boost); + int projected_kf_boost = + (int)rint((tpl_factor * cpi->rc.kf_boost) / tpl_factor_num_stats); + return projected_kf_boost; +} + +static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, + double *kf_group_err, + int num_frames_to_detect_scenecut) { + TWO_PASS *const twopass = &cpi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; + FIRSTPASS_STATS last_frame; + double decay_accumulator = 1.0; + int i = 0, j; + int frames_to_key = 1; + int frames_since_key = rc->frames_since_key + 1; + FRAME_INFO *const frame_info = &cpi->frame_info; + int num_stats_used_for_kf_boost = 1; + int scenecut_detected = 0; + + int num_frames_to_next_key = detect_app_forced_key(cpi); + + if (num_frames_to_detect_scenecut == 0) { + if (num_frames_to_next_key != -1) + return num_frames_to_next_key; + else + return rc->frames_to_key; + } + + if (num_frames_to_next_key != -1) + num_frames_to_detect_scenecut = + AOMMIN(num_frames_to_detect_scenecut, num_frames_to_next_key); + + // Initialize the decay rates for the recent frames to check + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; + + i = 0; + while (twopass->stats_in < twopass->stats_buf_ctx->stats_in_end && + frames_to_key < num_frames_to_detect_scenecut) { + // Accumulate total number of stats available till next key frame + num_stats_used_for_kf_boost++; + + // Accumulate kf group error. + if (kf_group_err != NULL) + *kf_group_err += + calculate_modified_err(frame_info, twopass, oxcf, this_frame); + + // Load the next frame's stats. + last_frame = *this_frame; + input_stats(twopass, this_frame); + + // Provided that we are not at the end of the file... + if (cpi->rc.enable_scenecut_detection && cpi->oxcf.auto_key && + twopass->stats_in < twopass->stats_buf_ctx->stats_in_end) { + double loop_decay_rate; + + // Check for a scene cut. + if (test_candidate_kf(twopass, &last_frame, this_frame, twopass->stats_in, + frames_since_key, oxcf->rc_mode)) { + scenecut_detected = 1; + break; + } + + // How fast is the prediction quality decaying? + loop_decay_rate = + get_prediction_decay_rate(frame_info, twopass->stats_in); + + // We want to know something about the recent past... rather than + // as used elsewhere where we are concerned with decay in prediction + // quality since the last GF or KF. + recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate; + decay_accumulator = 1.0; + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) + decay_accumulator *= recent_loop_decay[j]; + + // Special check for transition or high motion followed by a + // static scene. + if (detect_transition_to_still(twopass, rc->min_gf_interval, i, + cpi->oxcf.key_freq - i, loop_decay_rate, + decay_accumulator)) { + scenecut_detected = 1; + break; + } + + // Step on to the next frame. + ++frames_to_key; + ++frames_since_key; + + // If we don't have a real key frame within the next two + // key_freq intervals then break out of the loop. 
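+      // (The 2x cap below also bounds the rescan in find_next_key_frame(),
+      // which notes that it centers the extra key frame when the natural
+      // interval falls between 1x and 2x key_freq.)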
+      if (frames_to_key >= 2 * cpi->oxcf.key_freq) break;
+    } else {
+      ++frames_to_key;
+      ++frames_since_key;
+    }
+    ++i;
+  }
+
+  if (kf_group_err != NULL)
+    rc->num_stats_used_for_kf_boost = num_stats_used_for_kf_boost;
+
+  if (cpi->lap_enabled && !scenecut_detected)
+    frames_to_key = num_frames_to_next_key;
+
+  return frames_to_key;
+}
+
+static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const FIRSTPASS_STATS first_frame = *this_frame;
+  FIRSTPASS_STATS next_frame;
+  av1_zero(next_frame);
+
+  rc->frames_since_key = 0;
+
+  // Reset the GF group data structures.
+  av1_zero(*gf_group);
+
+  // Clear the alt ref active flag and last group multi arf flags as they
+  // can never be set for a key frame.
+  rc->source_alt_ref_active = 0;
+
+  // KF is always a GF so clear frames till next gf counter.
+  rc->frames_till_gf_update_due = 0;
+
+  rc->frames_to_key = 1;
+
+  if (has_no_stats_stage(cpi)) {
+    int num_frames_to_app_forced_key = detect_app_forced_key(cpi);
+    rc->this_key_frame_forced =
+        current_frame->frame_number != 0 && rc->frames_to_key == 0;
+    if (num_frames_to_app_forced_key != -1)
+      rc->frames_to_key = num_frames_to_app_forced_key;
+    else
+      rc->frames_to_key = AOMMAX(1, cpi->oxcf.key_freq);
+    correct_frames_to_key(cpi);
+    rc->kf_boost = DEFAULT_KF_BOOST;
+    rc->source_alt_ref_active = 0;
+    gf_group->update_type[0] = KF_UPDATE;
+    return;
+  }
+  int i;
+  const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+  int kf_bits = 0;
+  double zero_motion_accumulator = 1.0;
+  double boost_score = 0.0;
+  double kf_raw_err = 0.0;
+  double kf_mod_err = 0.0;
+  double kf_group_err = 0.0;
+  double sr_accumulator = 0.0;
+  int frames_to_key;
+  // Is this a forced key frame by interval?
+  rc->this_key_frame_forced = rc->next_key_frame_forced;
+
+  twopass->kf_group_bits = 0;        // Total bits available to kf group
+  twopass->kf_group_error_left = 0;  // Group modified error score.
+
+  kf_raw_err = this_frame->intra_error;
+  kf_mod_err = calculate_modified_err(frame_info, twopass, oxcf, this_frame);
+
+  frames_to_key =
+      define_kf_interval(cpi, this_frame, &kf_group_err, oxcf->key_freq);
+
+  if (frames_to_key != -1)
+    rc->frames_to_key = AOMMIN(oxcf->key_freq, frames_to_key);
+  else
+    rc->frames_to_key = oxcf->key_freq;
+
+  if (cpi->lap_enabled) correct_frames_to_key(cpi);
+
+  // If there is a max kf interval set by the user we must obey it.
+  // We already break out of the loop above at 2x max.
+  // This code centers the extra kf if the actual natural interval
+  // is between 1x and 2x.
+  if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
+    FIRSTPASS_STATS tmp_frame = first_frame;
+
+    rc->frames_to_key /= 2;
+
+    // Reset to the start of the group.
+    reset_fpf_position(twopass, start_position);
+
+    kf_group_err = 0.0;
+
+    // Rescan to get the correct error data for the forced kf group.
+ for (i = 0; i < rc->frames_to_key; ++i) { + kf_group_err += + calculate_modified_err(frame_info, twopass, oxcf, &tmp_frame); + if (EOF == input_stats(twopass, &tmp_frame)) break; + } + rc->next_key_frame_forced = 1; + } else if ((twopass->stats_in == twopass->stats_buf_ctx->stats_in_end && + is_stat_consumption_stage_twopass(cpi)) || + rc->frames_to_key >= cpi->oxcf.key_freq) { + rc->next_key_frame_forced = 1; + } else { + rc->next_key_frame_forced = 0; + } + + // Special case for the last key frame of the file. + if (twopass->stats_in >= twopass->stats_buf_ctx->stats_in_end) { + // Accumulate kf group error. + kf_group_err += + calculate_modified_err(frame_info, twopass, oxcf, this_frame); + } + + // Calculate the number of bits that should be assigned to the kf group. + if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { + // Maximum number of bits for a single normal frame (not key frame). + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + + // Maximum number of bits allocated to the key frame group. + int64_t max_grp_bits; + + // Default allocation based on bits left and relative + // complexity of the section. + twopass->kf_group_bits = (int64_t)( + twopass->bits_left * (kf_group_err / twopass->modified_error_left)); + + // Clip based on maximum per frame rate defined by the user. + max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; + if (twopass->kf_group_bits > max_grp_bits) + twopass->kf_group_bits = max_grp_bits; + } else { + twopass->kf_group_bits = 0; + } + twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits); + + // Reset the first pass file position. + reset_fpf_position(twopass, start_position); + + // Scan through the kf group collating various stats used to determine + // how many bits to spend on it. + boost_score = 0.0; + const double kf_max_boost = + cpi->oxcf.rc_mode == AOM_Q + ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST), + KF_MAX_FRAME_BOOST) + : KF_MAX_FRAME_BOOST; + for (i = 0; i < (rc->frames_to_key - 1); ++i) { + if (EOF == input_stats(twopass, &next_frame)) break; + + // Monitor for static sections. + // For the first frame in kf group, the second ref indicator is invalid. + if (i > 0) { + zero_motion_accumulator = + AOMMIN(zero_motion_accumulator, + get_zero_motion_factor(frame_info, &next_frame)); + } else { + zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion; + } + + // Not all frames in the group are necessarily used in calculating boost. + if ((sr_accumulator < (kf_raw_err * 1.50)) && + (i <= rc->max_gf_interval * 2)) { + double frame_boost; + double zm_factor; + + // Factor 0.75-1.25 based on how much of frame is static. + zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); + + if (i < 2) sr_accumulator = 0.0; + frame_boost = calc_kf_frame_boost(rc, frame_info, &next_frame, + &sr_accumulator, kf_max_boost); + boost_score += frame_boost * zm_factor; + } + } + + reset_fpf_position(twopass, start_position); + + // Store the zero motion percentage + twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); + + // Calculate a section intra ratio used in setting max loop filter. + twopass->section_intra_rating = calculate_section_intra_ratio( + start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key); + + rc->kf_boost = (int)boost_score; + + if (cpi->lap_enabled) { + rc->kf_boost = get_projected_kf_boost(cpi); + } + + // Special case for static / slide show content but don't apply + // if the kf group is very short. 
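+  // (zero_motion_accumulator holds the minimum of (pcnt_inter - pcnt_motion)
+  // seen across the group, so a value near 1.0 means nearly every block of
+  // every frame was coded as a zero-motion inter block.)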
+  if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) &&
+      (rc->frames_to_key > 8)) {
+    rc->kf_boost = AOMMAX(rc->kf_boost, MIN_STATIC_KF_BOOST);
+  } else {
+    // Apply various clamps for min and max boost.
+    rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
+    rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
+#ifdef STRICT_RC
+    rc->kf_boost = AOMMIN(rc->kf_boost, MAX_KF_BOOST);
+#endif
+  }
+
+  // Work out how many bits to allocate for the key frame itself.
+  kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
+                                 twopass->kf_group_bits);
+  // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost,
+  //        kf_bits, twopass->kf_zeromotion_pct);
+  kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits,
+                                               twopass->kf_group_bits, 0);
+
+  twopass->kf_group_bits -= kf_bits;
+
+  // Save the bits to spend on the key frame.
+  gf_group->bit_allocation[0] = kf_bits;
+  gf_group->update_type[0] = KF_UPDATE;
+
+  // Note the total error score of the kf group minus the key frame itself.
+  twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+  // Adjust the count of total modified error left.
+  // The count of bits left is adjusted elsewhere based on real coded frame
+  // sizes.
+  twopass->modified_error_left -= kf_group_err;
+}
+
+static int is_skippable_frame(const AV1_COMP *cpi) {
+  if (has_no_stats_stage(cpi)) return 0;
+  // If no non-zero motion vector was detected for the current frame in the
+  // first pass, and none was detected for its previous and forward frames,
+  // then this frame can be skipped for the partition check, and the
+  // partition size is assigned according to the variance.
+  const TWO_PASS *const twopass = &cpi->twopass;
+
+  return (!frame_is_intra_only(&cpi->common) &&
+          twopass->stats_in - 2 > twopass->stats_buf_ctx->stats_in_start &&
+          twopass->stats_in < twopass->stats_buf_ctx->stats_in_end &&
+          (twopass->stats_in - 1)->pcnt_inter -
+                  (twopass->stats_in - 1)->pcnt_motion ==
+              1 &&
+          (twopass->stats_in - 2)->pcnt_inter -
+                  (twopass->stats_in - 2)->pcnt_motion ==
+              1 &&
+          twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+}
+
+#define ARF_STATS_OUTPUT 0
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+#define DEFAULT_GRP_WEIGHT 1.0
+
+static void process_first_pass_stats(AV1_COMP *cpi,
+                                     FIRSTPASS_STATS *this_frame) {
+  AV1_COMMON *const cm = &cpi->common;
+  CurrentFrame *const current_frame = &cm->current_frame;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+
+  if (cpi->oxcf.rc_mode != AOM_Q && current_frame->frame_number == 0 &&
+      cpi->twopass.stats_buf_ctx->total_stats &&
+      cpi->twopass.stats_buf_ctx->total_left_stats) {
+    if (cpi->lap_enabled) {
+      /*
+       * Accumulate total_stats using the available limited number of stats,
+       * and assign it to total_left_stats.
+       */
+      *cpi->twopass.stats_buf_ctx->total_left_stats =
+          *cpi->twopass.stats_buf_ctx->total_stats;
+    }
+    const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count -
+                                  current_frame->frame_number);
+
+    // Special case code for first frame.
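+    // The per-frame section averages computed below feed
+    // get_twopass_worst_quality() to seed an active worst quality for the
+    // clip; e.g. section_target_bandwidth is simply bits_left / frames_left.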
+    const int section_target_bandwidth =
+        (int)(twopass->bits_left / frames_left);
+    const double section_length =
+        twopass->stats_buf_ctx->total_left_stats->count;
+    const double section_error =
+        twopass->stats_buf_ctx->total_left_stats->coded_error / section_length;
+    const double section_intra_skip =
+        twopass->stats_buf_ctx->total_left_stats->intra_skip_pct /
+        section_length;
+    const double section_inactive_zone =
+        (twopass->stats_buf_ctx->total_left_stats->inactive_zone_rows * 2) /
+        ((double)cm->mi_params.mb_rows * section_length);
+    const int tmp_q = get_twopass_worst_quality(
+        cpi, section_error, section_intra_skip + section_inactive_zone,
+        section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
+    rc->active_worst_quality = tmp_q;
+    rc->ni_av_qi = tmp_q;
+    rc->last_q[INTER_FRAME] = tmp_q;
+    rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
+    rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+    rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+  }
+
+  int err = 0;
+  if (cpi->lap_enabled) {
+    err = input_stats_lap(twopass, this_frame);
+  } else {
+    err = input_stats(twopass, this_frame);
+  }
+  if (err == EOF) return;
+
+  {
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                            ? cpi->initial_mbs
+                            : cm->mi_params.MBs;
+    // The multiplication by 256 reverses a scaling factor of (>> 8)
+    // applied when combining MB error values for the frame.
+    twopass->mb_av_energy = log((this_frame->intra_error / num_mbs) + 1.0);
+    twopass->frame_avg_haar_energy =
+        log((this_frame->frame_avg_wavelet_energy / num_mbs) + 1.0);
+  }
+
+  // Update the total stats remaining structure.
+  if (twopass->stats_buf_ctx->total_left_stats)
+    subtract_stats(twopass->stats_buf_ctx->total_left_stats, this_frame);
+
+  // Set the frame content type flag.
+  if (this_frame->intra_skip_pct >= FC_ANIMATION_THRESH)
+    twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
+  else
+    twopass->fr_content_type = FC_NORMAL;
+}
+
+static void setup_target_rate(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+
+  int target_rate = gf_group->bit_allocation[gf_group->index];
+
+  if (has_no_stats_stage(cpi)) {
+    av1_rc_set_frame_target(cpi, target_rate, cpi->common.width,
+                            cpi->common.height);
+  }
+
+  rc->base_frame_target = target_rate;
+}
+
+void av1_get_second_pass_params(AV1_COMP *cpi,
+                                EncodeFrameParams *const frame_params,
+                                const EncodeFrameInput *const frame_input,
+                                unsigned int frame_flags) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  AV1_COMMON *cm = &cpi->common;
+
+  if (frame_is_intra_only(cm)) {
+    FeatureFlags *const features = &cm->features;
+    av1_set_screen_content_options(cpi, features);
+    cpi->is_screen_content_type = features->allow_screen_content_tools;
+  }
+
+  if (is_stat_consumption_stage(cpi) && !twopass->stats_in) return;
+
+  if (rc->frames_till_gf_update_due > 0 && !(frame_flags & FRAMEFLAGS_KEY)) {
+    assert(gf_group->index < gf_group->size);
+    const int update_type = gf_group->update_type[gf_group->index];
+
+    setup_target_rate(cpi);
+
+    // If this is an arf frame then we don't want to read the stats file or
+    // advance the input pointer as we already have what we need.
+    if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE) {
+      if (cpi->no_show_kf) {
+        assert(update_type == ARF_UPDATE);
+        frame_params->frame_type = KEY_FRAME;
+      } else {
+        frame_params->frame_type = INTER_FRAME;
+      }
+
+      // Do the firstpass stats indicate that this frame is skippable for the
+      // partition search?
+      if (cpi->sf.part_sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+        cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+      }
+
+      return;
+    }
+  }
+
+  aom_clear_system_state();
+
+  if (cpi->oxcf.rc_mode == AOM_Q) rc->active_worst_quality = cpi->oxcf.cq_level;
+  FIRSTPASS_STATS this_frame;
+  av1_zero(this_frame);
+  // Read in and process the first pass stats for this frame (see
+  // process_first_pass_stats() above).
+  if (is_stat_consumption_stage(cpi)) {
+    process_first_pass_stats(cpi, &this_frame);
+  } else {
+    rc->active_worst_quality = cpi->oxcf.cq_level;
+  }
+
+  // Keyframe and section processing.
+  if (rc->frames_to_key == 0 || (frame_flags & FRAMEFLAGS_KEY)) {
+    FIRSTPASS_STATS this_frame_copy;
+    this_frame_copy = this_frame;
+    frame_params->frame_type = KEY_FRAME;
+    // Define next KF group and assign bits to it.
+    find_next_key_frame(cpi, &this_frame);
+    this_frame = this_frame_copy;
+  } else {
+    frame_params->frame_type = INTER_FRAME;
+    const int altref_enabled = is_altref_enabled(cpi);
+    const int sframe_dist = cpi->oxcf.sframe_dist;
+    const int sframe_mode = cpi->oxcf.sframe_mode;
+    const int sframe_enabled = cpi->oxcf.sframe_enabled;
+    const int update_type = gf_group->update_type[gf_group->index];
+    CurrentFrame *const current_frame = &cpi->common.current_frame;
+    if (sframe_enabled) {
+      if (altref_enabled) {
+        if (sframe_mode == 1) {
+          // sframe_mode == 1: insert the sframe if it matches an altref frame.
+          if (current_frame->frame_number % sframe_dist == 0 &&
+              current_frame->frame_number != 0 && update_type == ARF_UPDATE) {
+            frame_params->frame_type = S_FRAME;
+          }
+        } else {
+          // sframe_mode != 1: the sframe will be inserted at the next
+          // available altref frame.
+          if (current_frame->frame_number % sframe_dist == 0 &&
+              current_frame->frame_number != 0) {
+            rc->sframe_due = 1;
+          }
+          if (rc->sframe_due && update_type == ARF_UPDATE) {
+            frame_params->frame_type = S_FRAME;
+            rc->sframe_due = 0;
+          }
+        }
+      } else {
+        if (current_frame->frame_number % sframe_dist == 0 &&
+            current_frame->frame_number != 0) {
+          frame_params->frame_type = S_FRAME;
+        }
+      }
+    }
+  }
+
+  // Define a new GF/ARF group. (Should always enter here for key frames.)
+  if (rc->frames_till_gf_update_due == 0) {
+    assert(cpi->common.current_frame.frame_number == 0 ||
+           gf_group->index == gf_group->size);
+    const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+    int num_frames_to_detect_scenecut, frames_to_key;
+    if (cpi->lap_enabled && cpi->rc.enable_scenecut_detection)
+      num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
+    else
+      num_frames_to_detect_scenecut = 0;
+    frames_to_key = define_kf_interval(cpi, &this_frame, NULL,
+                                       num_frames_to_detect_scenecut);
+    reset_fpf_position(twopass, start_position);
+    if (frames_to_key != -1)
+      rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
+
+    int max_gop_length = (cpi->oxcf.lag_in_frames >= 32 &&
+                          is_stat_consumption_stage_twopass(cpi))
+                             ? MAX_GF_INTERVAL
+                             : MAX_GF_LENGTH_LAP;
+    if (rc->intervals_till_gf_calculate_due == 0) {
+      calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS);
+    }
+
+    if (max_gop_length > 16) {
+      if (rc->gf_intervals[rc->cur_gf_index] - 1 > 16) {
+        // calculate_gf_length() was previously called with
+        // max_gop_length = 32 to get the look-ahead gf intervals.
+        define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0);
+        if (!av1_tpl_setup_stats(cpi, 1, frame_params, frame_input)) {
+          // Tpl decides that a shorter gf interval is better.
+          // TODO(jingning): Remove redundant computations here.
+          max_gop_length = 16;
+          calculate_gf_length(cpi, max_gop_length, 1);
+        }
+      } else {
+        // Even based on 32 we still decide to use a short gf interval,
+        // so it is better to re-decide based on 16.
+        max_gop_length = 16;
+        calculate_gf_length(cpi, max_gop_length, 1);
+      }
+    }
+    define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 1);
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    cpi->num_gf_group_show_frames = 0;
+    assert(gf_group->index == 0);
+
+#if ARF_STATS_OUTPUT
+    {
+      FILE *fpfile;
+      fpfile = fopen("arf.stt", "a");
+      ++arf_count;
+      fprintf(fpfile, "%10d %10d %10d %10d %10d\n",
+              cpi->common.current_frame.frame_number,
+              rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
+              rc->gfu_boost);
+
+      fclose(fpfile);
+    }
+#endif
+  }
+  assert(gf_group->index < gf_group->size);
+
+  // Do the firstpass stats indicate that this frame is skippable for the
+  // partition search?
+  if (cpi->sf.part_sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+  }
+
+  setup_target_rate(cpi);
+}
+
+void av1_init_second_pass(AV1_COMP *cpi) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  FRAME_INFO *const frame_info = &cpi->frame_info;
+  double frame_rate;
+  FIRSTPASS_STATS *stats;
+
+  if (!twopass->stats_buf_ctx->stats_in_end) return;
+
+  stats = twopass->stats_buf_ctx->total_stats;
+
+  *stats = *twopass->stats_buf_ctx->stats_in_end;
+  *twopass->stats_buf_ctx->total_left_stats = *stats;
+
+  frame_rate = 10000000.0 * stats->count / stats->duration;
+  // Each frame can have a different duration, as the frame rate in the source
+  // isn't guaranteed to be constant. The frame rate prior to the first frame
+  // encoded in the second pass is a guess. However, the sum duration is not.
+  // It is calculated based on the actual durations of all frames from the
+  // first pass.
+  av1_new_framerate(cpi, frame_rate);
+  twopass->bits_left =
+      (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
+
+  // This variable monitors how far behind the second ref update is lagging.
+  twopass->sr_update_lag = 1;
+
+  // Scan the first pass file and calculate a modified total error based upon
+  // the bias/power function used to allocate bits.
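+  // A sketch of the weighting (assuming calculate_modified_err(), defined
+  // earlier in this file, takes the usual libaom two-pass form):
+  //   modified_err ~= clamp(av_err * pow(err / av_err, vbrbias / 100.0),
+  //                         modified_error_min, modified_error_max)
+  // so the bias exponent shifts bits between easy and hard sections.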
+ { + const double avg_error = + stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count); + const FIRSTPASS_STATS *s = twopass->stats_in; + double modified_error_total = 0.0; + twopass->modified_error_min = + (avg_error * oxcf->two_pass_vbrmin_section) / 100; + twopass->modified_error_max = + (avg_error * oxcf->two_pass_vbrmax_section) / 100; + while (s < twopass->stats_buf_ctx->stats_in_end) { + modified_error_total += + calculate_modified_err(frame_info, twopass, oxcf, s); + ++s; + } + twopass->modified_error_left = modified_error_total; + } + + // Reset the vbr bits off target counters + cpi->rc.vbr_bits_off_target = 0; + cpi->rc.vbr_bits_off_target_fast = 0; + + cpi->rc.rate_error_estimate = 0; + + // Static sequence monitor variables. + twopass->kf_zeromotion_pct = 100; + twopass->last_kfgroup_zeromotion_pct = 100; + + // Initialize bits per macro_block estimate correction factor. + twopass->bpm_factor = 1.0; + // Initialize actual and target bits counters for ARF groups so that + // at the start we have a neutral bpm adjustment. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; +} + +void av1_init_single_pass_lap(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->twopass; + + if (!twopass->stats_buf_ctx->stats_in_end) return; + + // This variable monitors how far behind the second ref update is lagging. + twopass->sr_update_lag = 1; + + twopass->bits_left = 0; + twopass->modified_error_min = 0.0; + twopass->modified_error_max = 0.0; + twopass->modified_error_left = 0.0; + + // Reset the vbr bits off target counters + cpi->rc.vbr_bits_off_target = 0; + cpi->rc.vbr_bits_off_target_fast = 0; + + cpi->rc.rate_error_estimate = 0; + + // Static sequence monitor variables. + twopass->kf_zeromotion_pct = 100; + twopass->last_kfgroup_zeromotion_pct = 100; + + // Initialize bits per macro_block estimate correction factor. + twopass->bpm_factor = 1.0; + // Initialize actual and target bits counters for ARF groups so that + // at the start we have a neutral bpm adjustment. + twopass->rolling_arf_group_target_bits = 1; + twopass->rolling_arf_group_actual_bits = 1; +} + +#define MINQ_ADJ_LIMIT 48 +#define MINQ_ADJ_LIMIT_CQ 20 +#define HIGH_UNDERSHOOT_RATIO 2 +void av1_twopass_postencode_update(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + const int bits_used = rc->base_frame_target; + + // VBR correction is done through rc->vbr_bits_off_target. Based on the + // sign of this value, a limited % adjustment is made to the target rate + // of subsequent frames, to try and push it back towards 0. This method + // is designed to prevent extreme behaviour at the end of a clip + // or group of frames. + rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; + twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0); + + // Target vs actual bits for this arf group. + twopass->rolling_arf_group_target_bits += rc->this_frame_target; + twopass->rolling_arf_group_actual_bits += rc->projected_frame_size; + + // Calculate the pct rc error. + if (rc->total_actual_bits) { + rc->rate_error_estimate = + (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits); + rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100); + } else { + rc->rate_error_estimate = 0; + } + + // Update the active best quality pyramid. 
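+  // This frame's base_qindex is recorded for its own pyramid depth and every
+  // deeper layer; later frames at those depths can consult it when choosing
+  // their own active best quality.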
+  if (!rc->is_src_frame_alt_ref) {
+    const int pyramid_level = cpi->gf_group.layer_depth[cpi->gf_group.index];
+    int i;
+    for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) {
+      rc->active_best_quality[i] = cpi->common.quant_params.base_qindex;
+      // if (pyramid_level >= 2) {
+      //   rc->active_best_quality[pyramid_level] =
+      //     AOMMAX(rc->active_best_quality[pyramid_level],
+      //            cpi->common.base_qindex);
+      // }
+    }
+  }
+
+#if 0
+  {
+    AV1_COMMON *cm = &cpi->common;
+    FILE *fpfile;
+    fpfile = fopen("details.stt", "a");
+    fprintf(fpfile,
+            "%10d %10d %10d %10" PRId64 " %10" PRId64
+            " %10d %10d %10d %10.4lf %10.4lf %10.4lf %10.4lf\n",
+            cm->current_frame.frame_number, rc->base_frame_target,
+            rc->projected_frame_size, rc->total_actual_bits,
+            rc->vbr_bits_off_target, rc->rate_error_estimate,
+            twopass->rolling_arf_group_target_bits,
+            twopass->rolling_arf_group_actual_bits,
+            (double)twopass->rolling_arf_group_actual_bits /
+                (double)twopass->rolling_arf_group_target_bits,
+            twopass->bpm_factor,
+            av1_convert_qindex_to_q(quant_params->base_qindex,
+                                    cm->seq_params.bit_depth),
+            av1_convert_qindex_to_q(rc->active_worst_quality,
+                                    cm->seq_params.bit_depth));
+    fclose(fpfile);
+  }
+#endif
+
+  if (cpi->common.current_frame.frame_type != KEY_FRAME) {
+    twopass->kf_group_bits -= bits_used;
+    twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+  }
+  twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
+
+  // If the rate control is drifting, consider adjustment to min or maxq.
+  if ((cpi->oxcf.rc_mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
+    const int maxq_adj_limit = rc->worst_quality - rc->active_worst_quality;
+    const int minq_adj_limit =
+        (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+
+    // Undershoot.
+    if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+      --twopass->extend_maxq;
+      if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+        ++twopass->extend_minq;
+      // Overshoot.
+    } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+      --twopass->extend_minq;
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        ++twopass->extend_maxq;
+    } else {
+      // Adjustment for extreme local overshoot.
+      if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+          rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+        ++twopass->extend_maxq;
+
+      // Unwind undershoot or overshoot adjustment.
+      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+        --twopass->extend_minq;
+      else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+        --twopass->extend_maxq;
+    }
+
+    twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
+    twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+    // If there is a big and unexpected undershoot then feed the extra
+    // bits back in quickly. One situation where this may happen is if a
+    // frame is unexpectedly almost perfectly predicted by the ARF or GF
+    // but not very well predicted by the previous frame.
+    if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+      int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+      if (rc->projected_frame_size < fast_extra_thresh) {
+        rc->vbr_bits_off_target_fast +=
+            fast_extra_thresh - rc->projected_frame_size;
+        rc->vbr_bits_off_target_fast =
+            AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+
+        // Fast adaptation of minQ if necessary to use up the extra bits.
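+        // (Arithmetic of the line below: a surplus equal to one average
+        // frame's bits maps to 8 qindex steps, and since the surplus is
+        // capped above at 4 * avg_frame_bandwidth, extend_minq_fast tops
+        // out at 32 before the clamp against minq_adj_limit.)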
+ if (rc->avg_frame_bandwidth) { + twopass->extend_minq_fast = + (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth); + } + twopass->extend_minq_fast = AOMMIN( + twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq); + } else if (rc->vbr_bits_off_target_fast) { + twopass->extend_minq_fast = AOMMIN( + twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq); + } else { + twopass->extend_minq_fast = 0; + } + } + } +} diff --git a/libs/libaom/src/av1/encoder/pass2_strategy.h b/libs/libaom/src/av1/encoder/pass2_strategy.h new file mode 100644 index 000000000..437fb8f79 --- /dev/null +++ b/libs/libaom/src/av1/encoder/pass2_strategy.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PASS2_STRATEGY_H_ +#define AOM_AV1_ENCODER_PASS2_STRATEGY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct EncodeFrameParams; +// structure of accumulated stats and features in a gf group +typedef struct { + double gf_group_err; + double gf_group_raw_error; + double gf_group_skip_pct; + double gf_group_inactive_zone_rows; + + double mv_ratio_accumulator; + double decay_accumulator; + double zero_motion_accumulator; + double loop_decay_rate; + double last_loop_decay_rate; + double this_frame_mv_in_out; + double mv_in_out_accumulator; + double abs_mv_in_out_accumulator; + + double avg_sr_coded_error; + double avg_tr_coded_error; + double avg_pcnt_second_ref; + double avg_pcnt_third_ref; + double avg_pcnt_third_ref_nolast; + double avg_new_mv_count; + double avg_wavelet_energy; + double avg_raw_err_stdev; + int non_zero_stdev_count; + + unsigned int allow_alt_ref; +} GF_GROUP_STATS; + +typedef struct { + double frame_err; + double frame_coded_error; + double frame_sr_coded_error; + double frame_tr_coded_error; +} GF_FRAME_STATS; + +void av1_init_second_pass(struct AV1_COMP *cpi); + +void av1_init_single_pass_lap(AV1_COMP *cpi); + +void av1_get_second_pass_params(struct AV1_COMP *cpi, + struct EncodeFrameParams *const frame_params, + const EncodeFrameInput *const frame_input, + unsigned int frame_flags); + +void av1_twopass_postencode_update(struct AV1_COMP *cpi); + +void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc, + GF_GROUP *gf_group, int is_key_frame, int use_arf, + int64_t gf_group_bits); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PASS2_STRATEGY_H_ diff --git a/libs/libaom/src/av1/encoder/pickcdef.c b/libs/libaom/src/av1/encoder/pickcdef.c new file mode 100644 index 000000000..a1092fd59 --- /dev/null +++ b/libs/libaom/src/av1/encoder/pickcdef.c @@ -0,0 +1,587 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/system_state.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cdef.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/encoder.h"
+
+#define REDUCED_PRI_STRENGTHS_LVL1 8
+#define REDUCED_PRI_STRENGTHS_LVL2 5
+
+#define REDUCED_TOTAL_STRENGTHS_LVL1 \
+  (REDUCED_PRI_STRENGTHS_LVL1 * CDEF_SEC_STRENGTHS)
+#define REDUCED_TOTAL_STRENGTHS_LVL2 \
+  (REDUCED_PRI_STRENGTHS_LVL2 * CDEF_SEC_STRENGTHS)
+#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+
+static const int priconv_lvl1[REDUCED_TOTAL_STRENGTHS_LVL1] = { 0, 1, 2, 3,
+                                                                5, 7, 10, 13 };
+static const int priconv_lvl2[REDUCED_TOTAL_STRENGTHS_LVL2] = { 0, 2, 4, 8,
+                                                                14 };
+static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
+  TOTAL_STRENGTHS, REDUCED_TOTAL_STRENGTHS_LVL1, REDUCED_TOTAL_STRENGTHS_LVL2,
+  TOTAL_STRENGTHS
+};
+
+// Get the primary strength value for the given index and search method.
+static INLINE int get_pri_strength(CDEF_PICK_METHOD pick_method, int pri_idx) {
+  switch (pick_method) {
+    case CDEF_FAST_SEARCH_LVL1: return priconv_lvl1[pri_idx];
+    case CDEF_FAST_SEARCH_LVL2: return priconv_lvl2[pri_idx];
+    default: assert(0 && "Invalid CDEF primary index"); return -1;
+  }
+}
+
+/* Search for the best strength to add as an option, knowing we
+   already selected nb_strengths options. */
+static uint64_t search_one(int *lev, int nb_strengths,
+                           uint64_t mse[][TOTAL_STRENGTHS], int sb_count,
+                           CDEF_PICK_METHOD pick_method) {
+  uint64_t tot_mse[TOTAL_STRENGTHS];
+  const int total_strengths = nb_cdef_strengths[pick_method];
+  int i, j;
+  uint64_t best_tot_mse = (uint64_t)1 << 63;
+  int best_id = 0;
+  memset(tot_mse, 0, sizeof(tot_mse));
+  for (i = 0; i < sb_count; i++) {
+    int gi;
+    uint64_t best_mse = (uint64_t)1 << 63;
+    /* Find best mse among already selected options. */
+    for (gi = 0; gi < nb_strengths; gi++) {
+      if (mse[i][lev[gi]] < best_mse) {
+        best_mse = mse[i][lev[gi]];
+      }
+    }
+    /* Find best mse when adding each possible new option. */
+    for (j = 0; j < total_strengths; j++) {
+      uint64_t best = best_mse;
+      if (mse[i][j] < best) best = mse[i][j];
+      tot_mse[j] += best;
+    }
+  }
+  for (j = 0; j < total_strengths; j++) {
+    if (tot_mse[j] < best_tot_mse) {
+      best_tot_mse = tot_mse[j];
+      best_id = j;
+    }
+  }
+  lev[nb_strengths] = best_id;
+  return best_tot_mse;
+}
+
+/* Search for the best luma+chroma strength to add as an option, knowing we
+   already selected nb_strengths options. */
+static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
+                                uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
+                                CDEF_PICK_METHOD pick_method) {
+  uint64_t tot_mse[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
+  int i, j;
+  uint64_t best_tot_mse = (uint64_t)1 << 63;
+  int best_id0 = 0;
+  int best_id1 = 0;
+  const int total_strengths = nb_cdef_strengths[pick_method];
+  memset(tot_mse, 0, sizeof(tot_mse));
+  for (i = 0; i < sb_count; i++) {
+    int gi;
+    uint64_t best_mse = (uint64_t)1 << 63;
+    /* Find best mse among already selected options. */
+    for (gi = 0; gi < nb_strengths; gi++) {
+      uint64_t curr = mse[0][i][lev0[gi]];
+      curr += mse[1][i][lev1[gi]];
+      if (curr < best_mse) {
+        best_mse = curr;
+      }
+    }
+    /* Find best mse when adding each possible new option. */
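+    /* (Illustrative cost note, ours, not from the source: the dual
+       luma+chroma search below evaluates every (j, k) pair, i.e.
+       total_strengths * total_strengths candidates per superblock, versus
+       total_strengths for the single-plane search_one() above.) */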
+    for (j = 0; j < total_strengths; j++) {
+      int k;
+      for (k = 0; k < total_strengths; k++) {
+        uint64_t best = best_mse;
+        uint64_t curr = mse[0][i][j];
+        curr += mse[1][i][k];
+        if (curr < best) best = curr;
+        tot_mse[j][k] += best;
+      }
+    }
+  }
+  for (j = 0; j < total_strengths; j++) {
+    int k;
+    for (k = 0; k < total_strengths; k++) {
+      if (tot_mse[j][k] < best_tot_mse) {
+        best_tot_mse = tot_mse[j][k];
+        best_id0 = j;
+        best_id1 = k;
+      }
+    }
+  }
+  lev0[nb_strengths] = best_id0;
+  lev1[nb_strengths] = best_id1;
+  return best_tot_mse;
+}
+
+/* Search for the set of strengths that minimizes mse. */
+static uint64_t joint_strength_search(int *best_lev, int nb_strengths,
+                                      uint64_t mse[][TOTAL_STRENGTHS],
+                                      int sb_count,
+                                      CDEF_PICK_METHOD pick_method) {
+  uint64_t best_tot_mse;
+  int fast = (pick_method == CDEF_FAST_SEARCH_LVL1 ||
+              pick_method == CDEF_FAST_SEARCH_LVL2);
+  int i;
+  best_tot_mse = (uint64_t)1 << 63;
+  /* Greedy search: add one strength option at a time. */
+  for (i = 0; i < nb_strengths; i++) {
+    best_tot_mse = search_one(best_lev, i, mse, sb_count, pick_method);
+  }
+  /* Trying to refine the greedy search by reconsidering each
+     already-selected option. */
+  if (!fast) {
+    for (i = 0; i < 4 * nb_strengths; i++) {
+      int j;
+      for (j = 0; j < nb_strengths - 1; j++) best_lev[j] = best_lev[j + 1];
+      best_tot_mse =
+          search_one(best_lev, nb_strengths - 1, mse, sb_count, pick_method);
+    }
+  }
+  return best_tot_mse;
+}
+
+/* Search for the set of luma+chroma strengths that minimizes mse. */
+static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1,
+                                           int nb_strengths,
+                                           uint64_t (**mse)[TOTAL_STRENGTHS],
+                                           int sb_count,
+                                           CDEF_PICK_METHOD pick_method) {
+  uint64_t best_tot_mse;
+  int i;
+  best_tot_mse = (uint64_t)1 << 63;
+  /* Greedy search: add one strength option at a time. */
+  for (i = 0; i < nb_strengths; i++) {
+    best_tot_mse =
+        search_one_dual(best_lev0, best_lev1, i, mse, sb_count, pick_method);
+  }
+  /* Trying to refine the greedy search by reconsidering each
+     already-selected option. */
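+  /* (Each refinement pass below drops the oldest selected pair, shifts the
+     rest down one slot, and re-runs search_one_dual() to re-pick the freed
+     slot; 4 * nb_strengths passes give every slot several chances to be
+     reconsidered.) */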
+  for (i = 0; i < 4 * nb_strengths; i++) {
+    int j;
+    for (j = 0; j < nb_strengths - 1; j++) {
+      best_lev0[j] = best_lev0[j + 1];
+      best_lev1[j] = best_lev1[j + 1];
+    }
+    best_tot_mse = search_one_dual(best_lev0, best_lev1, nb_strengths - 1, mse,
+                                   sb_count, pick_method);
+  }
+  return best_tot_mse;
+}
+
+typedef void (*copy_fn_t)(uint16_t *dst, int dstride, const void *src,
+                          int src_voffset, int src_hoffset, int sstride,
+                          int vsize, int hsize);
+typedef uint64_t (*compute_cdef_dist_t)(void *dst, int dstride, uint16_t *src,
+                                        cdef_list *dlist, int cdef_count,
+                                        BLOCK_SIZE bsize, int coeff_shift,
+                                        int row, int col);
+
+static void copy_sb16_16_highbd(uint16_t *dst, int dstride, const void *src,
+                                int src_voffset, int src_hoffset, int sstride,
+                                int vsize, int hsize) {
+  int r;
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR((uint8_t *)src);
+  const uint16_t *base = &src16[src_voffset * sstride + src_hoffset];
+  for (r = 0; r < vsize; r++)
+    memcpy(dst + r * dstride, base + r * sstride, hsize * sizeof(*base));
+}
+
+static void copy_sb16_16(uint16_t *dst, int dstride, const void *src,
+                         int src_voffset, int src_hoffset, int sstride,
+                         int vsize, int hsize) {
+  int r, c;
+  const uint8_t *src8 = (uint8_t *)src;
+  const uint8_t *base = &src8[src_voffset * sstride + src_hoffset];
+  for (r = 0; r < vsize; r++)
+    for (c = 0; c < hsize; c++)
+      dst[r * dstride + c] = (uint16_t)base[r * sstride + c];
+}
+
+static INLINE uint64_t mse_wxh_16bit_highbd(uint16_t *dst, int dstride,
+                                            uint16_t *src, int sstride, int w,
+                                            int h) {
+  uint64_t sum = 0;
+  int i, j;
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      int e = dst[i * dstride + j] - src[i * sstride + j];
+      sum += e * e;
+    }
+  }
+  return sum;
+}
+
+static INLINE uint64_t mse_wxh_16bit(uint8_t *dst, int dstride, uint16_t *src,
+                                     int sstride, int w, int h) {
+  uint64_t sum = 0;
+  int i, j;
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
+      sum += e * e;
+    }
+  }
+  return sum;
+}
+
+static INLINE void init_src_params(int *src_stride, int *width, int *height,
+                                   int *width_log2, int *height_log2,
+                                   BLOCK_SIZE bsize) {
+  *src_stride = block_size_wide[bsize];
+  *width = block_size_wide[bsize];
+  *height = block_size_high[bsize];
+  *width_log2 = MI_SIZE_LOG2 + mi_size_wide_log2[bsize];
+  *height_log2 = MI_SIZE_LOG2 + mi_size_high_log2[bsize];
+}
+
+/* Compute MSE only on the blocks we filtered.
*/ +static uint64_t compute_cdef_dist_highbd(void *dst, int dstride, uint16_t *src, + cdef_list *dlist, int cdef_count, + BLOCK_SIZE bsize, int coeff_shift, + int row, int col) { + assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_8X8); + uint64_t sum = 0; + int bi, bx, by; + uint16_t *dst16 = CONVERT_TO_SHORTPTR((uint8_t *)dst); + uint16_t *dst_buff = &dst16[row * dstride + col]; + int src_stride, width, height, width_log2, height_log2; + init_src_params(&src_stride, &width, &height, &width_log2, &height_log2, + bsize); + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + sum += mse_wxh_16bit_highbd( + &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride, + &src[bi << (height_log2 + width_log2)], src_stride, width, height); + } + return sum >> 2 * coeff_shift; +} + +static uint64_t compute_cdef_dist(void *dst, int dstride, uint16_t *src, + cdef_list *dlist, int cdef_count, + BLOCK_SIZE bsize, int coeff_shift, int row, + int col) { + assert(bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_8X8); + uint64_t sum = 0; + int bi, bx, by; + uint8_t *dst8 = (uint8_t *)dst; + uint8_t *dst_buff = &dst8[row * dstride + col]; + int src_stride, width, height, width_log2, height_log2; + init_src_params(&src_stride, &width, &height, &width_log2, &height_log2, + bsize); + for (bi = 0; bi < cdef_count; bi++) { + by = dlist[bi].by; + bx = dlist[bi].bx; + sum += mse_wxh_16bit( + &dst_buff[(by << height_log2) * dstride + (bx << width_log2)], dstride, + &src[bi << (height_log2 + width_log2)], src_stride, width, height); + } + return sum >> 2 * coeff_shift; +} + +static int sb_all_skip(const CommonModeInfoParams *const mi_params, int mi_row, + int mi_col) { + const int maxr = AOMMIN(mi_params->mi_rows - mi_row, MI_SIZE_64X64); + const int maxc = AOMMIN(mi_params->mi_cols - mi_col, MI_SIZE_64X64); + const int stride = mi_params->mi_stride; + MB_MODE_INFO **mbmi = mi_params->mi_grid_base + mi_row * stride + mi_col; + for (int r = 0; r < maxr; ++r, mbmi += stride) { + for (int c = 0; c < maxc; ++c) { + if (!mbmi[c]->skip) return 0; + } + } + return 1; +} + +static void pick_cdef_from_qp(AV1_COMMON *const cm) { + const int bd = cm->seq_params.bit_depth; + const int q = + av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8); + CdefInfo *const cdef_info = &cm->cdef_info; + cdef_info->cdef_bits = 0; + cdef_info->nb_cdef_strengths = 1; + cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6); + + int predicted_y_f1 = 0; + int predicted_y_f2 = 0; + int predicted_uv_f1 = 0; + int predicted_uv_f2 = 0; + aom_clear_system_state(); + if (!frame_is_intra_only(cm)) { + predicted_y_f1 = clamp((int)roundf(q * q * -0.0000023593946f + + q * 0.0068615186f + 0.02709886f), + 0, 15); + predicted_y_f2 = clamp((int)roundf(q * q * -0.00000057629734f + + q * 0.0013993345f + 0.03831067f), + 0, 3); + predicted_uv_f1 = clamp((int)roundf(q * q * -0.0000007095069f + + q * 0.0034628846f + 0.00887099f), + 0, 15); + predicted_uv_f2 = clamp((int)roundf(q * q * 0.00000023874085f + + q * 0.00028223585f + 0.05576307f), + 0, 3); + } else { + predicted_y_f1 = clamp( + (int)roundf(q * q * 0.0000033731974f + q * 0.008070594f + 0.0187634f), + 0, 15); + predicted_y_f2 = clamp( + (int)roundf(q * q * 0.0000029167343f + q * 0.0027798624f + 0.0079405f), + 0, 3); + predicted_uv_f1 = clamp( + (int)roundf(q * q * -0.0000130790995f + q * 0.012892405f - 0.00748388f), + 0, 15); + predicted_uv_f2 = clamp((int)roundf(q * q 
* 0.0000032651783f + + q * 0.00035520183f + 0.00228092f), + 0, 3); + } + cdef_info->cdef_strengths[0] = + predicted_y_f1 * CDEF_SEC_STRENGTHS + predicted_y_f2; + cdef_info->cdef_uv_strengths[0] = + predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2; + + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + MB_MODE_INFO **mbmi = mi_params->mi_grid_base; + for (int r = 0; r < nvfb; ++r) { + for (int c = 0; c < nhfb; ++c) { + mbmi[MI_SIZE_64X64 * c]->cdef_strength = 0; + } + mbmi += MI_SIZE_64X64 * mi_params->mi_stride; + } +} + +void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref, + AV1_COMMON *cm, MACROBLOCKD *xd, int pick_method, + int rdmult) { + if (pick_method == CDEF_PICK_FROM_Q) { + pick_cdef_from_qp(cm); + return; + } + + cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128]; + int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; + int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } }; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64; + int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index)); + const int damping = 3 + (cm->quant_params.base_qindex >> 6); + const int fast = (pick_method == CDEF_FAST_SEARCH_LVL1 || + pick_method == CDEF_FAST_SEARCH_LVL2); + const int total_strengths = nb_cdef_strengths[pick_method]; + DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]); + const int num_planes = av1_num_planes(cm); + av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0, + num_planes); + uint64_t(*mse[2])[TOTAL_STRENGTHS]; + mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb); + mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb); + + int bsize[3]; + int mi_wide_l2[3]; + int mi_high_l2[3]; + int xdec[3]; + int ydec[3]; + uint8_t *ref_buffer[3] = { ref->y_buffer, ref->u_buffer, ref->v_buffer }; + int ref_stride[3] = { ref->y_stride, ref->uv_stride, ref->uv_stride }; + + for (int pli = 0; pli < num_planes; pli++) { + xdec[pli] = xd->plane[pli].subsampling_x; + ydec[pli] = xd->plane[pli].subsampling_y; + bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4) + : (xdec[pli] ? 
BLOCK_4X8 : BLOCK_8X8); + mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x; + mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y; + } + + copy_fn_t copy_fn; + compute_cdef_dist_t compute_cdef_dist_fn; + + if (cm->seq_params.use_highbitdepth) { + copy_fn = copy_sb16_16_highbd; + compute_cdef_dist_fn = compute_cdef_dist_highbd; + } else { + copy_fn = copy_sb16_16; + compute_cdef_dist_fn = compute_cdef_dist; + } + + DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]); + uint16_t *const in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER; + const int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0); + int sb_count = 0; + for (int fbr = 0; fbr < nvfb; ++fbr) { + for (int fbc = 0; fbc < nhfb; ++fbc) { + // No filtering if the entire filter block is skipped + if (sb_all_skip(mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) + continue; + + const MB_MODE_INFO *const mbmi = + mi_params->mi_grid_base[MI_SIZE_64X64 * fbr * mi_params->mi_stride + + MI_SIZE_64X64 * fbc]; + if (((fbc & 1) && + (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64)) || + ((fbr & 1) && + (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_64X128))) + continue; + + int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr); + int hb_step = 1; + int vb_step = 1; + BLOCK_SIZE bs; + if (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64 || + mbmi->sb_type == BLOCK_64X128) { + bs = mbmi->sb_type; + if (bs == BLOCK_128X128 || bs == BLOCK_128X64) { + nhb = + AOMMIN(MI_SIZE_128X128, mi_params->mi_cols - MI_SIZE_64X64 * fbc); + hb_step = 2; + } + if (bs == BLOCK_128X128 || bs == BLOCK_64X128) { + nvb = + AOMMIN(MI_SIZE_128X128, mi_params->mi_rows - MI_SIZE_64X64 * fbr); + vb_step = 2; + } + } else { + bs = BLOCK_64X64; + } + + const int cdef_count = av1_cdef_compute_sb_list( + mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64, dlist, bs); + + const int yoff = CDEF_VBORDER * (fbr != 0); + const int xoff = CDEF_HBORDER * (fbc != 0); + int dirinit = 0; + for (int pli = 0; pli < num_planes; pli++) { + for (int i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE; + /* We avoid filtering the pixels for which some of the pixels to + average are outside the frame. We could change the filter instead, + but it would add special cases for any future vectorization. 
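+         (Illustrative note, ours, not from the source: for an interior
+         64x64 luma block the copy below spans 64 + 2 * CDEF_VBORDER rows
+         and 64 + 2 * CDEF_HBORDER columns; at a frame edge the
+         corresponding border term is dropped.)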
*/ + const int ysize = (nvb << mi_high_l2[pli]) + + CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff; + const int xsize = (nhb << mi_wide_l2[pli]) + + CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff; + const int row = fbr * MI_SIZE_64X64 << mi_high_l2[pli]; + const int col = fbc * MI_SIZE_64X64 << mi_wide_l2[pli]; + for (int gi = 0; gi < total_strengths; gi++) { + int pri_strength = gi / CDEF_SEC_STRENGTHS; + if (fast) pri_strength = get_pri_strength(pick_method, pri_strength); + const int sec_strength = gi % CDEF_SEC_STRENGTHS; + copy_fn(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE, + xd->plane[pli].dst.buf, row - yoff, col - xoff, + xd->plane[pli].dst.stride, ysize, xsize); + av1_cdef_filter_fb( + NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli], dir, + &dirinit, var, pli, dlist, cdef_count, pri_strength, + sec_strength + (sec_strength == 3), damping, coeff_shift); + const uint64_t curr_mse = compute_cdef_dist_fn( + ref_buffer[pli], ref_stride[pli], tmp_dst, dlist, cdef_count, + bsize[pli], coeff_shift, row, col); + if (pli < 2) + mse[pli][sb_count][gi] = curr_mse; + else + mse[1][sb_count][gi] += curr_mse; + } + } + sb_index[sb_count++] = + MI_SIZE_64X64 * fbr * mi_params->mi_stride + MI_SIZE_64X64 * fbc; + } + } + + /* Search for different number of signalling bits. */ + int nb_strength_bits = 0; + uint64_t best_rd = UINT64_MAX; + CdefInfo *const cdef_info = &cm->cdef_info; + for (int i = 0; i <= 3; i++) { + int best_lev0[CDEF_MAX_STRENGTHS]; + int best_lev1[CDEF_MAX_STRENGTHS] = { 0 }; + const int nb_strengths = 1 << i; + uint64_t tot_mse; + if (num_planes > 1) { + tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths, + mse, sb_count, pick_method); + } else { + tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count, + pick_method); + } + + const int total_bits = sb_count * i + nb_strengths * CDEF_STRENGTH_BITS * + (num_planes > 1 ? 
2 : 1);
+    const int rate_cost = av1_cost_literal(total_bits);
+    const uint64_t dist = tot_mse * 16;
+    const uint64_t rd = RDCOST(rdmult, rate_cost, dist);
+    if (rd < best_rd) {
+      best_rd = rd;
+      nb_strength_bits = i;
+      memcpy(cdef_info->cdef_strengths, best_lev0,
+             nb_strengths * sizeof(best_lev0[0]));
+      if (num_planes > 1) {
+        memcpy(cdef_info->cdef_uv_strengths, best_lev1,
+               nb_strengths * sizeof(best_lev1[0]));
+      }
+    }
+  }
+
+  cdef_info->cdef_bits = nb_strength_bits;
+  cdef_info->nb_cdef_strengths = 1 << nb_strength_bits;
+  for (int i = 0; i < sb_count; i++) {
+    uint64_t best_mse = UINT64_MAX;
+    int best_gi = 0;
+    for (int gi = 0; gi < cdef_info->nb_cdef_strengths; gi++) {
+      uint64_t curr = mse[0][i][cdef_info->cdef_strengths[gi]];
+      if (num_planes > 1) curr += mse[1][i][cdef_info->cdef_uv_strengths[gi]];
+      if (curr < best_mse) {
+        best_gi = gi;
+        best_mse = curr;
+      }
+    }
+    mi_params->mi_grid_base[sb_index[i]]->cdef_strength = best_gi;
+  }
+
+  if (fast) {
+    for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) {
+      const int luma_strength = cdef_info->cdef_strengths[j];
+      const int chroma_strength = cdef_info->cdef_uv_strengths[j];
+      int pri_strength;
+      pri_strength =
+          get_pri_strength(pick_method, luma_strength / CDEF_SEC_STRENGTHS);
+      cdef_info->cdef_strengths[j] = pri_strength * CDEF_SEC_STRENGTHS +
+                                     (luma_strength % CDEF_SEC_STRENGTHS);
+      pri_strength =
+          get_pri_strength(pick_method, chroma_strength / CDEF_SEC_STRENGTHS);
+      cdef_info->cdef_uv_strengths[j] = pri_strength * CDEF_SEC_STRENGTHS +
+                                        (chroma_strength % CDEF_SEC_STRENGTHS);
+    }
+  }
+
+  cdef_info->cdef_damping = damping;
+
+  aom_free(mse[0]);
+  aom_free(mse[1]);
+  aom_free(sb_index);
+}
diff --git a/libs/libaom/src/av1/encoder/picklpf.c b/libs/libaom/src/av1/encoder/picklpf.c
new file mode 100644
index 000000000..17c996551
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/picklpf.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/quant_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+
+static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
+                            YV12_BUFFER_CONFIG *dst_bc, int plane) {
+  switch (plane) {
+    case 0: aom_yv12_copy_y(src_bc, dst_bc); break;
+    case 1: aom_yv12_copy_u(src_bc, dst_bc); break;
+    case 2: aom_yv12_copy_v(src_bc, dst_bc); break;
+    default: assert(plane >= 0 && plane <= 2); break;
+  }
+}
+
+int av1_get_max_filter_level(const AV1_COMP *cpi) {
+  if (is_stat_consumption_stage_twopass(cpi)) {
+    return cpi->twopass.section_intra_rating > 8 ?
MAX_LOOP_FILTER * 3 / 4 + : MAX_LOOP_FILTER; + } else { + return MAX_LOOP_FILTER; + } +} + +static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, + AV1_COMP *const cpi, int filt_level, + int partial_frame, int plane, int dir) { + AV1_COMMON *const cm = &cpi->common; + int64_t filt_err; + + assert(plane >= 0 && plane <= 2); + int filter_level[2] = { filt_level, filt_level }; + if (plane == 0 && dir == 0) filter_level[1] = cm->lf.filter_level[1]; + if (plane == 0 && dir == 1) filter_level[0] = cm->lf.filter_level[0]; + + // set base filters for use of av1_get_filter_level when in DELTA_LF mode + switch (plane) { + case 0: + cm->lf.filter_level[0] = filter_level[0]; + cm->lf.filter_level[1] = filter_level[1]; + break; + case 1: cm->lf.filter_level_u = filter_level[0]; break; + case 2: cm->lf.filter_level_v = filter_level[0]; break; + } + + // TODO(any): please enable multi-thread and remove the flag when loop + // filter mask is compatible with multi-thread. + if (cpi->num_workers > 1) + av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane, + plane + 1, partial_frame, +#if CONFIG_LPF_MASK + 0, +#endif + cpi->workers, cpi->num_workers, &cpi->lf_row_sync); + else + av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, +#if CONFIG_LPF_MASK + 0, +#endif + plane, plane + 1, partial_frame); + + filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane, + cm->seq_params.use_highbitdepth); + + // Re-instate the unfiltered frame + yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane); + + return filt_err; +} + +static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + int partial_frame, + const int *last_frame_filter_level, + double *best_cost_ret, int plane, int dir) { + const AV1_COMMON *const cm = &cpi->common; + const int min_filter_level = 0; + const int max_filter_level = av1_get_max_filter_level(cpi); + int filt_direction = 0; + int64_t best_err; + int filt_best; + MACROBLOCK *x = &cpi->td.mb; + + // Start the search at the previous frame filter level unless it is now out of + // range. + int lvl; + switch (plane) { + case 0: + switch (dir) { + case 2: + lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >> + 1; + break; + case 0: + case 1: lvl = last_frame_filter_level[dir]; break; + default: assert(dir >= 0 && dir <= 2); return 0; + } + break; + case 1: lvl = last_frame_filter_level[2]; break; + case 2: lvl = last_frame_filter_level[3]; break; + default: assert(plane >= 0 && plane <= 2); return 0; + } + int filt_mid = clamp(lvl, min_filter_level, max_filter_level); + int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; + // Sum squared error at each filter level + int64_t ss_err[MAX_LOOP_FILTER + 1]; + + // Set each entry to -1 + memset(ss_err, 0xFF, sizeof(ss_err)); + yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane); + best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir); + filt_best = filt_mid; + ss_err[filt_mid] = best_err; + + while (filter_step > 0) { + const int filt_high = AOMMIN(filt_mid + filter_step, max_filter_level); + const int filt_low = AOMMAX(filt_mid - filter_step, min_filter_level); + + // Bias against raising loop filter in favor of lowering it. 
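+    // (Worked example for the line below: at filt_mid = 32 the shift is
+    // 15 - 32 / 8 = 11, so each unit of filter_step adds best_err / 2048 of
+    // bias; higher filter levels shrink the shift and so raise the bias.)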
+ int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; + + if ((is_stat_consumption_stage_twopass(cpi)) && + (cpi->twopass.section_intra_rating < 20)) + bias = (bias * cpi->twopass.section_intra_rating) / 20; + + // Bias less for large block sizes. + if (cm->features.tx_mode != ONLY_4X4) bias >>= 1; + + if (filt_direction <= 0 && filt_low != filt_mid) { + // Get Low filter error score + if (ss_err[filt_low] < 0) { + ss_err[filt_low] = + try_filter_frame(sd, cpi, filt_low, partial_frame, plane, dir); + } + // If value is close to the best so far then bias towards a lower loop + // filter value. + if (ss_err[filt_low] < (best_err + bias)) { + // Was it actually better than the previous best? + if (ss_err[filt_low] < best_err) { + best_err = ss_err[filt_low]; + } + filt_best = filt_low; + } + } + + // Now look at filt_high + if (filt_direction >= 0 && filt_high != filt_mid) { + if (ss_err[filt_high] < 0) { + ss_err[filt_high] = + try_filter_frame(sd, cpi, filt_high, partial_frame, plane, dir); + } + // Raise the filter value only if it is significantly better than the + // previous best; the bias counts against raising it. + if (ss_err[filt_high] < (best_err - bias)) { + best_err = ss_err[filt_high]; + filt_best = filt_high; + } + } + + // Halve the step distance if the best filter value was the same as last time + if (filt_best == filt_mid) { + filter_step /= 2; + filt_direction = 0; + } else { + filt_direction = (filt_best < filt_mid) ? -1 : 1; + filt_mid = filt_best; + } + } + + // Update best error + best_err = ss_err[filt_best]; + + if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err); + return filt_best; +} + +void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, + LPF_PICK_METHOD method) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + struct loopfilter *const lf = &cm->lf; + (void)sd; + + lf->sharpness_level = 0; + cpi->td.mb.rdmult = cpi->rd.RDMULT; + + if (method == LPF_PICK_MINIMAL_LPF) { + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; + } else if (method >= LPF_PICK_FROM_Q) { + const int min_filter_level = 0; + const int max_filter_level = av1_get_max_filter_level(cpi); + const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, + cm->seq_params.bit_depth); + // Based on test results for the rtc test set: + // 0.04590 boosted or 0.02295 non-boosted in 18-bit fixed point. + const int strength_boost_q_treshold = 700; + const int inter_frame_multiplier = + q > strength_boost_q_treshold ? 12034 : 6017; + // These values were determined by linear fitting the result of the + // searched level for 8 bit depth: + // Keyframes: filt_guess = q * 0.06699 - 1.60817 + // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225 + // + // And high bit depth separately: + // filt_guess = q * 0.316206 + 3.87252 + int filt_guess; + switch (cm->seq_params.bit_depth) { + case AOM_BITS_8: + filt_guess = + (cm->current_frame.frame_type == KEY_FRAME) + ?
ROUND_POWER_OF_TWO(q * 17563 - 421574, 18) + : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18); + break; + case AOM_BITS_10: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); + break; + case AOM_BITS_12: + filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); + break; + default: + assert(0 && + "bit_depth should be AOM_BITS_8, AOM_BITS_10 " + "or AOM_BITS_12"); + return; + } + if (cm->seq_params.bit_depth != AOM_BITS_8 && + cm->current_frame.frame_type == KEY_FRAME) + filt_guess -= 4; + // TODO(chengchen): retrain the model for Y, U, V filter levels + lf->filter_level[0] = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level[1] = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level); + lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level); + } else { + const int last_frame_filter_level[4] = { lf->filter_level[0], + lf->filter_level[1], + lf->filter_level_u, + lf->filter_level_v }; + + lf->filter_level[0] = lf->filter_level[1] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 0, 2); + if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) { + lf->filter_level[0] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 0, 0); + lf->filter_level[1] = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 0, 1); + } + + if (num_planes > 1) { + lf->filter_level_u = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 1, 0); + lf->filter_level_v = + search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, + last_frame_filter_level, NULL, 2, 0); + } + } +} diff --git a/libs/libaom/src/av1/encoder/picklpf.h b/libs/libaom/src/av1/encoder/picklpf.h new file mode 100644 index 000000000..357097ae1 --- /dev/null +++ b/libs/libaom/src/av1/encoder/picklpf.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PICKLPF_H_ +#define AOM_AV1_ENCODER_PICKLPF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/encoder.h" + +struct yv12_buffer_config; +struct AV1_COMP; +int av1_get_max_filter_level(const AV1_COMP *cpi); +void av1_pick_filter_level(const struct yv12_buffer_config *sd, + struct AV1_COMP *cpi, LPF_PICK_METHOD method); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PICKLPF_H_ diff --git a/libs/libaom/src/av1/encoder/pickrst.c b/libs/libaom/src/av1/encoder/pickrst.c new file mode 100644 index 000000000..ccbe1cc3e --- /dev/null +++ b/libs/libaom/src/av1/encoder/pickrst.c @@ -0,0 +1,1768 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <float.h> +#include <limits.h> +#include <math.h> + +#include "config/aom_scale_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_dsp/psnr.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/quant_common.h" +#include "av1/common/restoration.h" + +#include "av1/encoder/av1_quantize.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/mathutils.h" +#include "av1/encoder/picklpf.h" +#include "av1/encoder/pickrst.h" + +// When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed. +// When set to RESTORE_TYPES we allow switchable. +static const RestorationType force_restore_type = RESTORE_TYPES; + +// Number of Wiener iterations +#define NUM_WIENER_ITERS 5 + +// Penalty factor for use of dual sgr +#define DUAL_SGR_PENALTY_MULT 0.01 + +// Working precision for Wiener filter coefficients +#define WIENER_TAP_SCALE_FACTOR ((int64_t)1 << 16) + +#define SGRPROJ_EP_GRP1_START_IDX 0 +#define SGRPROJ_EP_GRP1_END_IDX 9 +#define SGRPROJ_EP_GRP1_SEARCH_COUNT 4 +#define SGRPROJ_EP_GRP2_3_SEARCH_COUNT 2 +static const int sgproj_ep_grp1_seed[SGRPROJ_EP_GRP1_SEARCH_COUNT] = { 0, 3, 6, + 9 }; +static const int sgproj_ep_grp2_3[SGRPROJ_EP_GRP2_3_SEARCH_COUNT][14] = { + { 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, -1, -1, -1, -1 }, + { 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15 } +}; + +typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, + int hstart, int width, int vstart, + int height); +typedef uint64_t (*var_part_extractor_type)(const YV12_BUFFER_CONFIG *a, + int hstart, int width, int vstart, + int height); + +#if CONFIG_AV1_HIGHBITDEPTH +#define NUM_EXTRACTORS (3 * (1 + 1)) +#else +#define NUM_EXTRACTORS 3 +#endif +static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = { + aom_get_y_sse_part, aom_get_u_sse_part, + aom_get_v_sse_part, +#if CONFIG_AV1_HIGHBITDEPTH + aom_highbd_get_y_sse_part, aom_highbd_get_u_sse_part, + aom_highbd_get_v_sse_part, +#endif +}; +static const var_part_extractor_type var_part_extractors[NUM_EXTRACTORS] = { + aom_get_y_var, aom_get_u_var, aom_get_v_var, +#if CONFIG_AV1_HIGHBITDEPTH + aom_highbd_get_y_var, aom_highbd_get_u_var, aom_highbd_get_v_var, +#endif +}; + +static int64_t sse_restoration_unit(const RestorationTileLimits *limits, + const YV12_BUFFER_CONFIG *src, + const YV12_BUFFER_CONFIG *dst, int plane, + int highbd) { + return sse_part_extractors[3 * highbd + plane]( + src, dst, limits->h_start, limits->h_end - limits->h_start, + limits->v_start, limits->v_end - limits->v_start); +} + +static uint64_t var_restoration_unit(const RestorationTileLimits *limits, + const YV12_BUFFER_CONFIG *src, int plane, + int highbd) { + return var_part_extractors[3 * highbd + plane]( + src, limits->h_start, limits->h_end - limits->h_start, limits->v_start, + limits->v_end - limits->v_start); +} + +typedef struct { + // The best coefficients for Wiener or Sgrproj restoration + WienerInfo
wiener; + SgrprojInfo sgrproj; + + // The sum of squared errors for this rtype. + int64_t sse[RESTORE_SWITCHABLE_TYPES]; + + // The rtype to use for this unit given a frame rtype as + // index. Indices: WIENER, SGRPROJ, SWITCHABLE. + RestorationType best_rtype[RESTORE_TYPES - 1]; + + // This flag will be set based on the speed feature + // 'prune_sgr_based_on_wiener'. 0 implies no pruning and 1 implies pruning. + uint8_t skip_sgr_eval; +} RestUnitSearchInfo; + +typedef struct { + const YV12_BUFFER_CONFIG *src; + YV12_BUFFER_CONFIG *dst; + + const AV1_COMMON *cm; + const MACROBLOCK *x; + int plane; + int plane_width; + int plane_height; + RestUnitSearchInfo *rusi; + + // Speed features + const SPEED_FEATURES *sf; + + uint8_t *dgd_buffer; + int dgd_stride; + const uint8_t *src_buffer; + int src_stride; + + // sse and bits are initialised by reset_rsc in search_rest_type + int64_t sse; + int64_t bits; + int tile_y0, tile_stripe0; + + // sgrproj and wiener are initialised by rsc_on_tile when starting the first + // tile in the frame. + SgrprojInfo sgrproj; + WienerInfo wiener; + AV1PixelRect tile_rect; +} RestSearchCtxt; + +static AOM_INLINE void rsc_on_tile(void *priv) { + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + set_default_sgrproj(&rsc->sgrproj); + set_default_wiener(&rsc->wiener); + rsc->tile_stripe0 = 0; +} + +static AOM_INLINE void reset_rsc(RestSearchCtxt *rsc) { + rsc->sse = 0; + rsc->bits = 0; +} + +static AOM_INLINE void init_rsc(const YV12_BUFFER_CONFIG *src, + const AV1_COMMON *cm, const MACROBLOCK *x, + const SPEED_FEATURES *sf, int plane, + RestUnitSearchInfo *rusi, + YV12_BUFFER_CONFIG *dst, RestSearchCtxt *rsc) { + rsc->src = src; + rsc->dst = dst; + rsc->cm = cm; + rsc->x = x; + rsc->plane = plane; + rsc->rusi = rusi; + rsc->sf = sf; + + const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf; + const int is_uv = plane != AOM_PLANE_Y; + rsc->plane_width = src->crop_widths[is_uv]; + rsc->plane_height = src->crop_heights[is_uv]; + rsc->src_buffer = src->buffers[plane]; + rsc->src_stride = src->strides[is_uv]; + rsc->dgd_buffer = dgd->buffers[plane]; + rsc->dgd_stride = dgd->strides[is_uv]; + rsc->tile_rect = av1_whole_frame_rect(cm, is_uv); + assert(src->crop_widths[is_uv] == dgd->crop_widths[is_uv]); + assert(src->crop_heights[is_uv] == dgd->crop_heights[is_uv]); +} + +static int64_t try_restoration_unit(const RestSearchCtxt *rsc, + const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + const RestorationUnitInfo *rui) { + const AV1_COMMON *const cm = rsc->cm; + const int plane = rsc->plane; + const int is_uv = plane > 0; + const RestorationInfo *rsi = &cm->rst_info[plane]; + RestorationLineBuffers rlbs; + const int bit_depth = cm->seq_params.bit_depth; + const int highbd = cm->seq_params.use_highbitdepth; + + const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf; + // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be + // also used in encoder. 
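+ // Note on the flag below (editorial, inferred from the TODO above):
+ // av1_loop_restoration_filter_unit() takes optimized_lr as its final
+ // argument, so passing 0 keeps the encoder's trial filtering on the
+ // plain (non-optimized) loop-restoration path described in the TODO.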
+ const int optimized_lr = 0; + + av1_loop_restoration_filter_unit( + limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0, + is_uv && cm->seq_params.subsampling_x, + is_uv && cm->seq_params.subsampling_y, highbd, bit_depth, + fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane], + rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr); + + return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd); +} + +int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int32_t *flt1, + int flt1_stride, int xq[2], + const sgr_params_type *params) { + int i, j; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); + assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt0[j] < (1 << 15) && flt0[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[0] * (flt0[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + } + } else if (params->r[1] > 0) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + assert(flt1[j] < (1 << 15) && flt1[j] > -(1 << 15)); + const int32_t u = (int32_t)(dat[j] << SGRPROJ_RST_BITS); + int32_t v = u << SGRPROJ_PRJ_BITS; + v += xq[1] * (flt1[j] - u); + const int32_t e = + ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt1 += flt1_stride; + } + } else { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t e = (int32_t)(dat[j]) - src[j]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + } + + return err; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], + const sgr_params_type *params) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int i, j; + int64_t err = 0; + const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); + if (params->r[0] > 0 && params->r[1] > 0) { + int xq0 = xq[0]; + int xq1 = xq[1]; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); + int32_t v0 = flt0[j] - u; + int32_t v1 = flt1[j] - u; + int32_t v = half; + v += xq0 * v0; + v += xq1 * v1; + const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; + err += ((int64_t)e * e); + } + dat += dat_stride; + flt0 += flt0_stride; + flt1 
+= flt1_stride; + src += src_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + int exq; + int32_t *flt; + int flt_stride; + if (params->r[0] > 0) { + exq = xq[0]; + flt = flt0; + flt_stride = flt0_stride; + } else { + exq = xq[1]; + flt = flt1; + flt_stride = flt1_stride; + } + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS); + int32_t v = half; + v += exq * (flt[j] - u); + const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; + err += ((int64_t)e * e); + } + dat += dat_stride; + flt += flt_stride; + src += src_stride; + } + } else { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int32_t d = dat[j]; + const int32_t s = src[j]; + const int32_t e = d - s; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + } + return err; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int use_highbitdepth, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int *xqd, + const sgr_params_type *params) { + int xq[2]; + av1_decode_xq(xqd, xq, params); + +#if CONFIG_AV1_HIGHBITDEPTH + if (use_highbitdepth) { + return av1_highbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); + + } else { + return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); + } +#else + (void)use_highbitdepth; + return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, xq, params); +#endif +} + +#define USE_SGRPROJ_REFINEMENT_SEARCH 1 +static int64_t finer_search_pixel_proj_error( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0, + int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd, + const sgr_params_type *params) { + int64_t err = get_pixel_proj_error( + src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); + (void)start_step; +#if USE_SGRPROJ_REFINEMENT_SEARCH + int64_t err2; + int tap_min[] = { SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MIN1 }; + int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 }; + for (int s = start_step; s >= 1; s >>= 1) { + for (int p = 0; p < 2; ++p) { + if ((params->r[0] == 0 && p == 0) || (params->r[1] == 0 && p == 1)) { + continue; + } + int skip = 0; + do { + if (xqd[p] - s >= tap_min[p]) { + xqd[p] -= s; + err2 = + get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); + if (err2 > err) { + xqd[p] += s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (xqd[p] + s <= tap_max[p]) { + xqd[p] += s; + err2 = + get_pixel_proj_error(src8, width, height, src_stride, dat8, + dat_stride, use_highbitdepth, flt0, + flt0_stride, flt1, flt1_stride, xqd, params); + if (err2 > err) { + xqd[p] -= s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } 
while (1); + } + } +#endif // USE_SGRPROJ_REFINEMENT_SEARCH + return err; +} + +static int64_t signed_rounded_divide(int64_t dividend, int64_t divisor) { + if (dividend < 0) + return (dividend - divisor / 2) / divisor; + else + return (dividend + divisor / 2) / divisor; +} + +static AOM_INLINE void calc_proj_params_r0_r1_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + H[1][1] += (int64_t)f2 * f2; + H[0][1] += (int64_t)f1 * f2; + C[0] += (int64_t)f1 * s; + C[1] += (int64_t)f2 * s; + } + } + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_r1_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + H[1][1] += (int64_t)f2 * f2; + H[0][1] += (int64_t)f1 * f2; + C[0] += (int64_t)f1 * s; + C[1] += (int64_t)f2 * s; + } + } + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, int64_t H[2][2], + int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + C[0] += (int64_t)f1 * s; + } + } + H[0][0] /= size; + C[0] /= size; +} + +static AOM_INLINE void calc_proj_params_r0_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) 
- u; + const int32_t f1 = (int32_t)flt0[i * flt0_stride + j] - u; + H[0][0] += (int64_t)f1 * f1; + C[0] += (int64_t)f1 * s; + } + } + H[0][0] /= size; + C[0] /= size; +} + +static AOM_INLINE void calc_proj_params_r1_c(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt1, + int flt1_stride, int64_t H[2][2], + int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[1][1] += (int64_t)f2 * f2; + C[1] += (int64_t)f2 * s; + } + } + H[1][1] /= size; + C[1] /= size; +} + +static AOM_INLINE void calc_proj_params_r1_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + const int32_t u = (int32_t)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS); + const int32_t s = + (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u; + const int32_t f2 = (int32_t)flt1[i * flt1_stride + j] - u; + H[1][1] += (int64_t)f2 * f2; + C[1] += (int64_t)f2 * s; + } + } + H[1][1] /= size; + C[1] /= size; +} + +// The function calls 3 subfunctions for the following cases : +// 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements +// of C and H need to be computed. +// 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are +// non-zero and need to be computed. +// 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. 
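+// Editorial sketch of the least-squares setup (stated only in terms of the
+// definitions used in this file): with u = dat << SGRPROJ_RST_BITS,
+// s = (src << SGRPROJ_RST_BITS) - u and f_k = flt_k - u, the helpers above
+// accumulate the normalized normal equations
+//   H[k][l] = sum(f_k * f_l) / size,   C[k] = sum(f_k * s) / size,
+// and get_proj_subspace() below solves H * xq = C (scaled by
+// 1 << SGRPROJ_PRJ_BITS) for the projection coefficients xq.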
+void av1_calc_proj_params_c(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, int32_t *flt1, + int flt1_stride, int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_c(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, flt1, flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_c(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_c(src8, width, height, src_stride, dat8, dat_stride, + flt1, flt1_stride, H, C); + } +} + +static AOM_INLINE void av1_calc_proj_params_high_bd_c( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], + const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt1, flt1_stride, H, C); + } +} + +static AOM_INLINE void get_proj_subspace(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int use_highbitdepth, int32_t *flt0, + int flt0_stride, int32_t *flt1, + int flt1_stride, int *xq, + const sgr_params_type *params) { + int64_t H[2][2] = { { 0, 0 }, { 0, 0 } }; + int64_t C[2] = { 0, 0 }; + + // Default values to be returned if the problem becomes ill-posed + xq[0] = 0; + xq[1] = 0; + + if (!use_highbitdepth) { + if ((width & 0x7) == 0) { + av1_calc_proj_params(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, flt1, flt1_stride, H, C, params); + } else { + av1_calc_proj_params_c(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, flt1, flt1_stride, H, C, + params); + } + } else { + av1_calc_proj_params_high_bd_c(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C, params); + } + + if (params->r[0] == 0) { + // H matrix is now only the scalar H[1][1] + // C vector is now only the scalar C[1] + const int64_t Det = H[1][1]; + if (Det == 0) return; // ill-posed, return default values + xq[0] = 0; + xq[1] = (int)signed_rounded_divide(C[1] * (1 << SGRPROJ_PRJ_BITS), Det); + } else if (params->r[1] == 0) { + // H matrix is now only the scalar H[0][0] + // C vector is now only the scalar C[0] + const int64_t Det = H[0][0]; + if (Det == 0) return; // ill-posed, return default values + xq[0] = (int)signed_rounded_divide(C[0] * (1 << SGRPROJ_PRJ_BITS), Det); + xq[1] = 0; + } else { + const int64_t Det = H[0][0] * H[1][1] - H[0][1] * H[1][0]; + if (Det == 0) return; // ill-posed, return default values + + // If scaling up dividend would overflow, instead scale down the divisor + const int64_t div1 = H[1][1] * C[0] - H[0][1] * C[1]; + if ((div1 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div1) || + (div1 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div1)) + xq[0] = (int)signed_rounded_divide(div1, Det / (1 << SGRPROJ_PRJ_BITS)); + else + xq[0] = (int)signed_rounded_divide(div1 * (1 << 
SGRPROJ_PRJ_BITS), Det); + + const int64_t div2 = H[0][0] * C[1] - H[1][0] * C[0]; + if ((div2 > 0 && INT64_MAX / (1 << SGRPROJ_PRJ_BITS) < div2) || + (div2 < 0 && INT64_MIN / (1 << SGRPROJ_PRJ_BITS) > div2)) + xq[1] = (int)signed_rounded_divide(div2, Det / (1 << SGRPROJ_PRJ_BITS)); + else + xq[1] = (int)signed_rounded_divide(div2 * (1 << SGRPROJ_PRJ_BITS), Det); + } +} + +static AOM_INLINE void encode_xq(int *xq, int *xqd, + const sgr_params_type *params) { + if (params->r[0] == 0) { + xqd[0] = 0; + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xq[1], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } else if (params->r[1] == 0) { + xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } else { + xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0); + xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - xqd[0] - xq[1], SGRPROJ_PRJ_MIN1, + SGRPROJ_PRJ_MAX1); + } +} + +// Apply the self-guided filter across an entire restoration unit. +static AOM_INLINE void apply_sgr(int sgr_params_idx, const uint8_t *dat8, + int width, int height, int dat_stride, + int use_highbd, int bit_depth, int pu_width, + int pu_height, int32_t *flt0, int32_t *flt1, + int flt_stride) { + for (int i = 0; i < height; i += pu_height) { + const int h = AOMMIN(pu_height, height - i); + int32_t *flt0_row = flt0 + i * flt_stride; + int32_t *flt1_row = flt1 + i * flt_stride; + const uint8_t *dat8_row = dat8 + i * dat_stride; + + // Iterate over the stripe in blocks of width pu_width + for (int j = 0; j < width; j += pu_width) { + const int w = AOMMIN(pu_width, width - j); + const int ret = av1_selfguided_restoration( + dat8_row + j, w, h, dat_stride, flt0_row + j, flt1_row + j, + flt_stride, sgr_params_idx, bit_depth, use_highbd); + (void)ret; + assert(!ret); + } + } +} + +static AOM_INLINE void compute_sgrproj_err( + const uint8_t *dat8, const int width, const int height, + const int dat_stride, const uint8_t *src8, const int src_stride, + const int use_highbitdepth, const int bit_depth, const int pu_width, + const int pu_height, const int ep, int32_t *flt0, int32_t *flt1, + const int flt_stride, int *exqd, int64_t *err) { + int exq[2]; + apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth, + pu_width, pu_height, flt0, flt1, flt_stride); + aom_clear_system_state(); + const sgr_params_type *const params = &av1_sgr_params[ep]; + get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride, + use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq, + params); + aom_clear_system_state(); + encode_xq(exq, exqd, params); + *err = finer_search_pixel_proj_error( + src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0, + flt_stride, flt1, flt_stride, 2, exqd, params); +} + +static AOM_INLINE void get_best_error(int64_t *besterr, const int64_t err, + const int *exqd, int *bestxqd, + int *bestep, const int ep) { + if (*besterr == -1 || err < *besterr) { + *bestep = ep; + *besterr = err; + bestxqd[0] = exqd[0]; + bestxqd[1] = exqd[1]; + } +} + +static SgrprojInfo search_selfguided_restoration( + const uint8_t *dat8, int width, int height, int dat_stride, + const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth, + int pu_width, int pu_height, int32_t *rstbuf, int enable_sgr_ep_pruning) { + int32_t *flt0 = rstbuf; + int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX; + int ep, idx, bestep = 0; + int64_t besterr = -1; + int exqd[2], bestxqd[2] = { 0, 0 }; + int flt_stride = ((width + 7) & ~7) + 
8; + assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) || + pu_width == RESTORATION_PROC_UNIT_SIZE); + assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) || + pu_height == RESTORATION_PROC_UNIT_SIZE); + if (!enable_sgr_ep_pruning) { + for (ep = 0; ep < SGRPROJ_PARAMS; ep++) { + int64_t err; + compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, + use_highbitdepth, bit_depth, pu_width, pu_height, ep, + flt0, flt1, flt_stride, exqd, &err); + get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); + } + } else { + // Evaluate the first four seed ep values in the first group + for (idx = 0; idx < SGRPROJ_EP_GRP1_SEARCH_COUNT; idx++) { + ep = sgproj_ep_grp1_seed[idx]; + int64_t err; + compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, + use_highbitdepth, bit_depth, pu_width, pu_height, ep, + flt0, flt1, flt_stride, exqd, &err); + get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); + } + // Evaluate the ep values to the left and right of the seed winner + int bestep_ref = bestep; + for (ep = bestep_ref - 1; ep < bestep_ref + 2; ep += 2) { + if (ep < SGRPROJ_EP_GRP1_START_IDX || ep > SGRPROJ_EP_GRP1_END_IDX) + continue; + int64_t err; + compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, + use_highbitdepth, bit_depth, pu_width, pu_height, ep, + flt0, flt1, flt_stride, exqd, &err); + get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); + } + // Evaluate the last two groups + for (idx = 0; idx < SGRPROJ_EP_GRP2_3_SEARCH_COUNT; idx++) { + ep = sgproj_ep_grp2_3[idx][bestep]; + int64_t err; + compute_sgrproj_err(dat8, width, height, dat_stride, src8, src_stride, + use_highbitdepth, bit_depth, pu_width, pu_height, ep, + flt0, flt1, flt_stride, exqd, &err); + get_best_error(&besterr, err, exqd, bestxqd, &bestep, ep); + } + } + + SgrprojInfo ret; + ret.ep = bestep; + ret.xqd[0] = bestxqd[0]; + ret.xqd[1] = bestxqd[1]; + return ret; +} + +static int count_sgrproj_bits(SgrprojInfo *sgrproj_info, + SgrprojInfo *ref_sgrproj_info) { + int bits = SGRPROJ_PARAMS_BITS; + const sgr_params_type *params = &av1_sgr_params[sgrproj_info->ep]; + if (params->r[0] > 0) + bits += aom_count_primitive_refsubexpfin( + SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, + sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0); + if (params->r[1] > 0) + bits += aom_count_primitive_refsubexpfin( + SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K, + ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, + sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1); + return bits; +} + +static AOM_INLINE void search_sgrproj(const RestorationTileLimits *limits, + const AV1PixelRect *tile, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; + + const MACROBLOCK *const x = rsc->x; + const AV1_COMMON *const cm = rsc->cm; + const int highbd = cm->seq_params.use_highbitdepth; + const int bit_depth = cm->seq_params.bit_depth; + + const int64_t bits_none = x->sgrproj_restore_cost[0]; + // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set + if (rusi->skip_sgr_eval) { + rsc->bits += bits_none; + rsc->sse += rusi->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_NONE; + rusi->sse[RESTORE_SGRPROJ] = INT64_MAX; + return; + } + + uint8_t *dgd_start = + rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start; + const uint8_t *src_start = + rsc->src_buffer + limits->v_start *
rsc->src_stride + limits->h_start; + + const int is_uv = rsc->plane > 0; + const int ss_x = is_uv && cm->seq_params.subsampling_x; + const int ss_y = is_uv && cm->seq_params.subsampling_y; + const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x; + const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y; + + rusi->sgrproj = search_selfguided_restoration( + dgd_start, limits->h_end - limits->h_start, + limits->v_end - limits->v_start, rsc->dgd_stride, src_start, + rsc->src_stride, highbd, bit_depth, procunit_width, procunit_height, + tmpbuf, rsc->sf->lpf_sf.enable_sgr_ep_pruning); + + RestorationUnitInfo rui; + rui.restoration_type = RESTORE_SGRPROJ; + rui.sgrproj_info = rusi->sgrproj; + + rusi->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, tile, &rui); + + const int64_t bits_sgr = x->sgrproj_restore_cost[1] + + (count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj) + << AV1_PROB_COST_SHIFT); + + double cost_none = + RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]); + double cost_sgr = + RDCOST_DBL(x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ]); + if (rusi->sgrproj.ep < 10) + cost_sgr *= + (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->lpf_sf.dual_sgr_penalty_level); + + RestorationType rtype = + (cost_sgr < cost_none) ? RESTORE_SGRPROJ : RESTORE_NONE; + rusi->best_rtype[RESTORE_SGRPROJ - 1] = rtype; + + rsc->sse += rusi->sse[rtype]; + rsc->bits += (cost_sgr < cost_none) ? bits_sgr : bits_none; + if (cost_sgr < cost_none) rsc->sgrproj = rusi->sgrproj; +} + +void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src, + int h_start, int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, int64_t *M, + int64_t *H) { + int i, j, k, l; + int16_t Y[WIENER_WIN2]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + memset(M, 0, sizeof(*M) * wiener_win2); + memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + for (i = v_start; i < v_end; i++) { + for (j = h_start; j < h_end; j++) { + const int16_t X = (int16_t)src[i * src_stride + j] - (int16_t)avg; + int idx = 0; + for (k = -wiener_halfwin; k <= wiener_halfwin; k++) { + for (l = -wiener_halfwin; l <= wiener_halfwin; l++) { + Y[idx] = (int16_t)dgd[(i + l) * dgd_stride + (j + k)] - (int16_t)avg; + idx++; + } + } + assert(idx == wiener_win2); + for (k = 0; k < wiener_win2; ++k) { + M[k] += (int32_t)Y[k] * X; + for (l = k; l < wiener_win2; ++l) { + // H is a symmetric matrix, so we only need to fill out the upper + // triangle here. We can copy it down to the lower triangle outside + // the (i, j) loops. 
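+ // Size note (illustrative): for the luma window wiener_win = 7,
+ // wiener_win2 = 49, so H is a 49 x 49 matrix and this inner loop
+ // accumulates only the 49 * 50 / 2 = 1225 upper-triangle entries per
+ // pixel instead of all 2401.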
+ H[k * wiener_win2 + l] += (int32_t)Y[k] * Y[l]; + } + } + } + } + for (k = 0; k < wiener_win2; ++k) { + for (l = k + 1; l < wiener_win2; ++l) { + H[l * wiener_win2 + k] = H[k * wiener_win2 + l]; + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +void av1_compute_stats_highbd_c(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + int i, j, k, l; + int32_t Y[WIENER_WIN2]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + memset(M, 0, sizeof(*M) * wiener_win2); + memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + for (i = v_start; i < v_end; i++) { + for (j = h_start; j < h_end; j++) { + const int32_t X = (int32_t)src[i * src_stride + j] - (int32_t)avg; + int idx = 0; + for (k = -wiener_halfwin; k <= wiener_halfwin; k++) { + for (l = -wiener_halfwin; l <= wiener_halfwin; l++) { + Y[idx] = (int32_t)dgd[(i + l) * dgd_stride + (j + k)] - (int32_t)avg; + idx++; + } + } + assert(idx == wiener_win2); + for (k = 0; k < wiener_win2; ++k) { + M[k] += (int64_t)Y[k] * X; + for (l = k; l < wiener_win2; ++l) { + // H is a symmetric matrix, so we only need to fill out the upper + // triangle here. We can copy it down to the lower triangle outside + // the (i, j) loops. + H[k * wiener_win2 + l] += (int64_t)Y[k] * Y[l]; + } + } + } + } + for (k = 0; k < wiener_win2; ++k) { + M[k] /= bit_depth_divider; + H[k * wiener_win2 + k] /= bit_depth_divider; + for (l = k + 1; l < wiener_win2; ++l) { + H[k * wiener_win2 + l] /= bit_depth_divider; + H[l * wiener_win2 + k] = H[k * wiener_win2 + l]; + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE int wrap_index(int i, int wiener_win) { + const int wiener_halfwin1 = (wiener_win >> 1) + 1; + return (i >= wiener_halfwin1 ? 
wiener_win - 1 - i : i); +} + +// Solve linear equations to find Wiener filter tap values +// Taps are output scaled by WIENER_FILT_STEP +static int linsolve_wiener(int n, int64_t *A, int stride, int64_t *b, + int32_t *x) { + for (int k = 0; k < n - 1; k++) { + // Partial pivoting: bring the row with the largest pivot to the top + for (int i = n - 1; i > k; i--) { + // If row i has a better (bigger) pivot than row (i-1), swap them + if (llabs(A[(i - 1) * stride + k]) < llabs(A[i * stride + k])) { + for (int j = 0; j < n; j++) { + const int64_t c = A[i * stride + j]; + A[i * stride + j] = A[(i - 1) * stride + j]; + A[(i - 1) * stride + j] = c; + } + const int64_t c = b[i]; + b[i] = b[i - 1]; + b[i - 1] = c; + } + } + // Forward elimination (convert A to row-echelon form) + for (int i = k; i < n - 1; i++) { + if (A[k * stride + k] == 0) return 0; + const int64_t c = A[(i + 1) * stride + k]; + const int64_t cd = A[k * stride + k]; + for (int j = 0; j < n; j++) { + A[(i + 1) * stride + j] -= c / 256 * A[k * stride + j] / cd * 256; + } + b[i + 1] -= c * b[k] / cd; + } + } + // Back-substitution + for (int i = n - 1; i >= 0; i--) { + if (A[i * stride + i] == 0) return 0; + int64_t c = 0; + for (int j = i + 1; j <= n - 1; j++) { + c += A[i * stride + j] * x[j] / WIENER_TAP_SCALE_FACTOR; + } + // Store filter taps x in scaled form. + x[i] = (int32_t)(WIENER_TAP_SCALE_FACTOR * (b[i] - c) / A[i * stride + i]); + } + + return 1; +} + +// Fix vector b, update vector a +static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc, + int64_t **Hc, int32_t *a, int32_t *b) { + int i, j; + int32_t S[WIENER_WIN]; + int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin1 = (wiener_win >> 1) + 1; + memset(A, 0, sizeof(A)); + memset(B, 0, sizeof(B)); + for (i = 0; i < wiener_win; i++) { + for (j = 0; j < wiener_win; ++j) { + const int jj = wrap_index(j, wiener_win); + A[jj] += Mc[i][j] * b[i] / WIENER_TAP_SCALE_FACTOR; + } + } + for (i = 0; i < wiener_win; i++) { + for (j = 0; j < wiener_win; j++) { + int k, l; + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + const int kk = wrap_index(k, wiener_win); + const int ll = wrap_index(l, wiener_win); + B[ll * wiener_halfwin1 + kk] += + Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] / + WIENER_TAP_SCALE_FACTOR * b[j] / WIENER_TAP_SCALE_FACTOR; + } + } + } + } + // Normalization enforcement in the system of equations itself + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + A[i] -= + A[wiener_halfwin1 - 1] * 2 + + B[i * wiener_halfwin1 + wiener_halfwin1 - 1] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)]; + } + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + for (j = 0; j < wiener_halfwin1 - 1; ++j) { + B[i * wiener_halfwin1 + j] -= + 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] + + B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + + (wiener_halfwin1 - 1)]); + } + } + if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) { + S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR; + for (i = wiener_halfwin1; i < wiener_win; ++i) { + S[i] = S[wiener_win - 1 - i]; + S[wiener_halfwin1 - 1] -= 2 * S[i]; + } + memcpy(a, S, wiener_win * sizeof(*a)); + } +} + +// Fix vector a, update vector b +static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc, + int64_t **Hc, int32_t *a, int32_t *b) { + int i, j; + int32_t S[WIENER_WIN]; + int64_t 
A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin1 = (wiener_win >> 1) + 1; + memset(A, 0, sizeof(A)); + memset(B, 0, sizeof(B)); + for (i = 0; i < wiener_win; i++) { + const int ii = wrap_index(i, wiener_win); + for (j = 0; j < wiener_win; j++) { + A[ii] += Mc[i][j] * a[j] / WIENER_TAP_SCALE_FACTOR; + } + } + + for (i = 0; i < wiener_win; i++) { + for (j = 0; j < wiener_win; j++) { + const int ii = wrap_index(i, wiener_win); + const int jj = wrap_index(j, wiener_win); + int k, l; + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + B[jj * wiener_halfwin1 + ii] += + Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] / + WIENER_TAP_SCALE_FACTOR * a[l] / WIENER_TAP_SCALE_FACTOR; + } + } + } + } + // Normalization enforcement in the system of equations itself + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + A[i] -= + A[wiener_halfwin1 - 1] * 2 + + B[i * wiener_halfwin1 + wiener_halfwin1 - 1] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)]; + } + for (i = 0; i < wiener_halfwin1 - 1; ++i) { + for (j = 0; j < wiener_halfwin1 - 1; ++j) { + B[i * wiener_halfwin1 + j] -= + 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] + + B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] - + 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + + (wiener_halfwin1 - 1)]); + } + } + if (linsolve_wiener(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) { + S[wiener_halfwin1 - 1] = WIENER_TAP_SCALE_FACTOR; + for (i = wiener_halfwin1; i < wiener_win; ++i) { + S[i] = S[wiener_win - 1 - i]; + S[wiener_halfwin1 - 1] -= 2 * S[i]; + } + memcpy(b, S, wiener_win * sizeof(*b)); + } +} + +static int wiener_decompose_sep_sym(int wiener_win, int64_t *M, int64_t *H, + int32_t *a, int32_t *b) { + static const int32_t init_filt[WIENER_WIN] = { + WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV, + WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV, + WIENER_FILT_TAP0_MIDV, + }; + int64_t *Hc[WIENER_WIN2]; + int64_t *Mc[WIENER_WIN]; + int i, j, iter; + const int plane_off = (WIENER_WIN - wiener_win) >> 1; + const int wiener_win2 = wiener_win * wiener_win; + for (i = 0; i < wiener_win; i++) { + a[i] = b[i] = + WIENER_TAP_SCALE_FACTOR / WIENER_FILT_STEP * init_filt[i + plane_off]; + } + for (i = 0; i < wiener_win; i++) { + Mc[i] = M + i * wiener_win; + for (j = 0; j < wiener_win; j++) { + Hc[i * wiener_win + j] = + H + i * wiener_win * wiener_win2 + j * wiener_win; + } + } + + iter = 1; + while (iter < NUM_WIENER_ITERS) { + update_a_sep_sym(wiener_win, Mc, Hc, a, b); + update_b_sep_sym(wiener_win, Mc, Hc, a, b); + iter++; + } + return 1; +} + +// Computes the function x'*H*x - x'*M for the learned 2D filter x, and compares +// against identity filters; Final score is defined as the difference between +// the function values +static int64_t compute_score(int wiener_win, int64_t *M, int64_t *H, + InterpKernel vfilt, InterpKernel hfilt) { + int32_t ab[WIENER_WIN * WIENER_WIN]; + int16_t a[WIENER_WIN], b[WIENER_WIN]; + int64_t P = 0, Q = 0; + int64_t iP = 0, iQ = 0; + int64_t Score, iScore; + int i, k, l; + const int plane_off = (WIENER_WIN - wiener_win) >> 1; + const int wiener_win2 = wiener_win * wiener_win; + + aom_clear_system_state(); + + a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = WIENER_FILT_STEP; + for (i = 0; i < WIENER_HALFWIN; ++i) { + a[i] = a[WIENER_WIN - i - 1] = vfilt[i]; + b[i] = b[WIENER_WIN - i - 1] = hfilt[i]; + a[WIENER_HALFWIN] -= 2 * a[i]; + b[WIENER_HALFWIN] 
-= 2 * b[i]; + } + memset(ab, 0, sizeof(ab)); + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) + ab[k * wiener_win + l] = a[l + plane_off] * b[k + plane_off]; + } + for (k = 0; k < wiener_win2; ++k) { + P += ab[k] * M[k] / WIENER_FILT_STEP / WIENER_FILT_STEP; + for (l = 0; l < wiener_win2; ++l) { + Q += ab[k] * H[k * wiener_win2 + l] * ab[l] / WIENER_FILT_STEP / + WIENER_FILT_STEP / WIENER_FILT_STEP / WIENER_FILT_STEP; + } + } + Score = Q - 2 * P; + + iP = M[wiener_win2 >> 1]; + iQ = H[(wiener_win2 >> 1) * wiener_win2 + (wiener_win2 >> 1)]; + iScore = iQ - 2 * iP; + + return Score - iScore; +} + +static AOM_INLINE void finalize_sym_filter(int wiener_win, int32_t *f, + InterpKernel fi) { + int i; + const int wiener_halfwin = (wiener_win >> 1); + + for (i = 0; i < wiener_halfwin; ++i) { + const int64_t dividend = f[i] * WIENER_FILT_STEP; + const int64_t divisor = WIENER_TAP_SCALE_FACTOR; + // Perform this division with proper rounding rather than truncation + if (dividend < 0) { + fi[i] = (int16_t)((dividend - (divisor / 2)) / divisor); + } else { + fi[i] = (int16_t)((dividend + (divisor / 2)) / divisor); + } + } + // Specialize for 7-tap filter + if (wiener_win == WIENER_WIN) { + fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV); + fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV); + fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV); + } else { + fi[2] = CLIP(fi[1], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV); + fi[1] = CLIP(fi[0], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV); + fi[0] = 0; + } + // Satisfy filter constraints + fi[WIENER_WIN - 1] = fi[0]; + fi[WIENER_WIN - 2] = fi[1]; + fi[WIENER_WIN - 3] = fi[2]; + // The central element has an implicit +WIENER_FILT_STEP + fi[3] = -2 * (fi[0] + fi[1] + fi[2]); +} + +static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info, + WienerInfo *ref_wiener_info) { + int bits = 0; + if (wiener_win == WIENER_WIN) + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV); + if (wiener_win == WIENER_WIN) + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1, + WIENER_FILT_TAP0_SUBEXP_K, + ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV, + wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1, + WIENER_FILT_TAP1_SUBEXP_K, + ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, + wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV); + bits += aom_count_primitive_refsubexpfin( + WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1, + WIENER_FILT_TAP2_SUBEXP_K, + ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, + wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV); + return bits; +} + +#define USE_WIENER_REFINEMENT_SEARCH 1 +static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc, + const 
RestorationTileLimits *limits, + const AV1PixelRect *tile, + RestorationUnitInfo *rui, + int wiener_win) { + const int plane_off = (WIENER_WIN - wiener_win) >> 1; + int64_t err = try_restoration_unit(rsc, limits, tile, rui); +#if USE_WIENER_REFINEMENT_SEARCH + int64_t err2; + int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV, + WIENER_FILT_TAP2_MINV }; + int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV, + WIENER_FILT_TAP2_MAXV }; + + WienerInfo *plane_wiener = &rui->wiener_info; + + // printf("err pre = %"PRId64"\n", err); + const int start_step = 4; + for (int s = start_step; s >= 1; s >>= 1) { + for (int p = plane_off; p < WIENER_HALFWIN; ++p) { + int skip = 0; + do { + if (plane_wiener->hfilter[p] - s >= tap_min[p]) { + plane_wiener->hfilter[p] -= s; + plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); + if (err2 > err) { + plane_wiener->hfilter[p] += s; + plane_wiener->hfilter[WIENER_WIN - p - 1] += s; + plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (plane_wiener->hfilter[p] + s <= tap_max[p]) { + plane_wiener->hfilter[p] += s; + plane_wiener->hfilter[WIENER_WIN - p - 1] += s; + plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); + if (err2 > err) { + plane_wiener->hfilter[p] -= s; + plane_wiener->hfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + } + for (int p = plane_off; p < WIENER_HALFWIN; ++p) { + int skip = 0; + do { + if (plane_wiener->vfilter[p] - s >= tap_min[p]) { + plane_wiener->vfilter[p] -= s; + plane_wiener->vfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); + if (err2 > err) { + plane_wiener->vfilter[p] += s; + plane_wiener->vfilter[WIENER_WIN - p - 1] += s; + plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; + } else { + err = err2; + skip = 1; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + if (skip) break; + do { + if (plane_wiener->vfilter[p] + s <= tap_max[p]) { + plane_wiener->vfilter[p] += s; + plane_wiener->vfilter[WIENER_WIN - p - 1] += s; + plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s; + err2 = try_restoration_unit(rsc, limits, tile, rui); + if (err2 > err) { + plane_wiener->vfilter[p] -= s; + plane_wiener->vfilter[WIENER_WIN - p - 1] -= s; + plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s; + } else { + err = err2; + // At the highest step size continue moving in the same direction + if (s == start_step) continue; + } + } + break; + } while (1); + } + } + // printf("err post = %"PRId64"\n", err); +#endif // USE_WIENER_REFINEMENT_SEARCH + return err; +} + +static AOM_INLINE void search_wiener(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)tmpbuf; + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; + + const MACROBLOCK *const x = rsc->x; + const 
int64_t bits_none = x->wiener_restore_cost[0]; + + // Skip Wiener search for low-variance content. + if (rsc->sf->lpf_sf.prune_wiener_based_on_src_var) { + const int scale[3] = { 0, 1, 2 }; + // Obtain the normalized Qscale + const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0, + rsc->cm->seq_params.bit_depth) >> + 3; + // Derive the threshold as sqr(normalized Qscale) * scale / 16. + const uint64_t thresh = + (qs * qs * scale[rsc->sf->lpf_sf.prune_wiener_based_on_src_var]) >> 4; + const int highbd = rsc->cm->seq_params.use_highbitdepth; + const uint64_t src_var = + var_restoration_unit(limits, rsc->src, rsc->plane, highbd); + // Do not perform Wiener search if source variance is lower than threshold + // or if the reconstruction error is zero + int prune_wiener = (src_var < thresh) || (rusi->sse[RESTORE_NONE] == 0); + if (prune_wiener) { + rsc->bits += bits_none; + rsc->sse += rusi->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rusi->sse[RESTORE_WIENER] = INT64_MAX; + if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) + rusi->skip_sgr_eval = 1; + return; + } + } + + const int wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; + + int reduced_wiener_win = wiener_win; + if (rsc->sf->lpf_sf.reduce_wiener_window_size) { + reduced_wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN_REDUCED : WIENER_WIN_CHROMA; + } + + int64_t M[WIENER_WIN2]; + int64_t H[WIENER_WIN2 * WIENER_WIN2]; + int32_t vfilter[WIENER_WIN], hfilter[WIENER_WIN]; + +#if CONFIG_AV1_HIGHBITDEPTH + const AV1_COMMON *const cm = rsc->cm; + if (cm->seq_params.use_highbitdepth) { + av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer, + rsc->src_buffer, limits->h_start, limits->h_end, + limits->v_start, limits->v_end, rsc->dgd_stride, + rsc->src_stride, M, H, cm->seq_params.bit_depth); + } else { + av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, + limits->h_start, limits->h_end, limits->v_start, + limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H); + } +#else + av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer, + limits->h_start, limits->h_end, limits->v_start, + limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H); +#endif + + if (!wiener_decompose_sep_sym(reduced_wiener_win, M, H, vfilter, hfilter)) { + rsc->bits += bits_none; + rsc->sse += rusi->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rusi->sse[RESTORE_WIENER] = INT64_MAX; + if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1; + return; + } + + RestorationUnitInfo rui; + memset(&rui, 0, sizeof(rui)); + rui.restoration_type = RESTORE_WIENER; + finalize_sym_filter(reduced_wiener_win, vfilter, rui.wiener_info.vfilter); + finalize_sym_filter(reduced_wiener_win, hfilter, rui.wiener_info.hfilter); + + // Filter score computes the value of the function x'*A*x - x'*b for the + // learned filter and compares it against the identity filter.
If there is no + // reduction in the function, the filter is reverted back to identity + if (compute_score(reduced_wiener_win, M, H, rui.wiener_info.vfilter, + rui.wiener_info.hfilter) > 0) { + rsc->bits += bits_none; + rsc->sse += rusi->sse[RESTORE_NONE]; + rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE; + rusi->sse[RESTORE_WIENER] = INT64_MAX; + if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) rusi->skip_sgr_eval = 1; + return; + } + + aom_clear_system_state(); + + rusi->sse[RESTORE_WIENER] = finer_tile_search_wiener( + rsc, limits, tile_rect, &rui, reduced_wiener_win); + rusi->wiener = rui.wiener_info; + + if (reduced_wiener_win != WIENER_WIN) { + assert(rui.wiener_info.vfilter[0] == 0 && + rui.wiener_info.vfilter[WIENER_WIN - 1] == 0); + assert(rui.wiener_info.hfilter[0] == 0 && + rui.wiener_info.hfilter[WIENER_WIN - 1] == 0); + } + + const int64_t bits_wiener = + x->wiener_restore_cost[1] + + (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener) + << AV1_PROB_COST_SHIFT); + + double cost_none = + RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]); + double cost_wiener = + RDCOST_DBL(x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER]); + + RestorationType rtype = + (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE; + rusi->best_rtype[RESTORE_WIENER - 1] = rtype; + + // Set 'skip_sgr_eval' based on rdcost ratio of RESTORE_WIENER and + // RESTORE_NONE or based on best_rtype + if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 1) { + rusi->skip_sgr_eval = cost_wiener > (1.01 * cost_none); + } else if (rsc->sf->lpf_sf.prune_sgr_based_on_wiener == 2) { + rusi->skip_sgr_eval = rusi->best_rtype[RESTORE_WIENER - 1] == RESTORE_NONE; + } + + rsc->sse += rusi->sse[rtype]; + rsc->bits += (cost_wiener < cost_none) ? bits_wiener : bits_none; + if (cost_wiener < cost_none) rsc->wiener = rusi->wiener; +} + +static AOM_INLINE void search_norestore(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)tile_rect; + (void)tmpbuf; + (void)rlbs; + + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; + + const int highbd = rsc->cm->seq_params.use_highbitdepth; + rusi->sse[RESTORE_NONE] = sse_restoration_unit( + limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd); + + rsc->sse += rusi->sse[RESTORE_NONE]; +} + +static AOM_INLINE void search_switchable(const RestorationTileLimits *limits, + const AV1PixelRect *tile_rect, + int rest_unit_idx, void *priv, + int32_t *tmpbuf, + RestorationLineBuffers *rlbs) { + (void)limits; + (void)tile_rect; + (void)tmpbuf; + (void)rlbs; + RestSearchCtxt *rsc = (RestSearchCtxt *)priv; + RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx]; + + const MACROBLOCK *const x = rsc->x; + + const int wiener_win = + (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA; + + double best_cost = 0; + int64_t best_bits = 0; + RestorationType best_rtype = RESTORE_NONE; + + for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) { + // Check for the condition that wiener or sgrproj search could not + // find a solution or the solution was worse than RESTORE_NONE. + // In either case the best_rtype will be set as RESTORE_NONE. These + // should be skipped from the test below. 
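+ // (Note: rusi->best_rtype[r - 1] == RESTORE_NONE means the per-type search + // already preferred RESTORE_NONE for this unit, so that candidate is fully + // represented by the r == RESTORE_NONE iteration of this loop.)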
+ if (r > RESTORE_NONE) { + if (rusi->best_rtype[r - 1] == RESTORE_NONE) continue; + } + + const int64_t sse = rusi->sse[r]; + int64_t coeff_pcost = 0; + switch (r) { + case RESTORE_NONE: coeff_pcost = 0; break; + case RESTORE_WIENER: + coeff_pcost = + count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener); + break; + case RESTORE_SGRPROJ: + coeff_pcost = count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj); + break; + default: assert(0); break; + } + const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT; + const int64_t bits = x->switchable_restore_cost[r] + coeff_bits; + double cost = RDCOST_DBL(x->rdmult, bits >> 4, sse); + if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10) + cost *= + (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->lpf_sf.dual_sgr_penalty_level); + if (r == 0 || cost < best_cost) { + best_cost = cost; + best_bits = bits; + best_rtype = r; + } + } + + rusi->best_rtype[RESTORE_SWITCHABLE - 1] = best_rtype; + + rsc->sse += rusi->sse[best_rtype]; + rsc->bits += best_bits; + if (best_rtype == RESTORE_WIENER) rsc->wiener = rusi->wiener; + if (best_rtype == RESTORE_SGRPROJ) rsc->sgrproj = rusi->sgrproj; +} + +static AOM_INLINE void copy_unit_info(RestorationType frame_rtype, + const RestUnitSearchInfo *rusi, + RestorationUnitInfo *rui) { + assert(frame_rtype > 0); + rui->restoration_type = rusi->best_rtype[frame_rtype - 1]; + if (rui->restoration_type == RESTORE_WIENER) + rui->wiener_info = rusi->wiener; + else + rui->sgrproj_info = rusi->sgrproj; +} + +static double search_rest_type(RestSearchCtxt *rsc, RestorationType rtype) { + static const rest_unit_visitor_t funs[RESTORE_TYPES] = { + search_norestore, search_wiener, search_sgrproj, search_switchable + }; + + reset_rsc(rsc); + rsc_on_tile(rsc); + + av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc, + &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL); + return RDCOST_DBL(rsc->x->rdmult, rsc->bits >> 4, rsc->sse); +} + +static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) { + const RestorationInfo *rsi = &cm->rst_info[plane]; + return rsi->units_per_tile; +} + +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + assert(!cm->features.all_lossless); + + int ntiles[2]; + for (int is_uv = 0; is_uv < 2; ++is_uv) + ntiles[is_uv] = rest_tiles_in_plane(cm, is_uv); + + assert(ntiles[1] <= ntiles[0]); + RestUnitSearchInfo *rusi = + (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * ntiles[0]); + + // If the restoration unit dimensions are not multiples of + // rsi->restoration_unit_size then some elements of the rusi array may be + // left uninitialised when we reach copy_unit_info(...). This is not a + // problem, as these elements are ignored later, but in order to quiet + // Valgrind's warnings we initialise the array below. + memset(rusi, 0, sizeof(*rusi) * ntiles[0]); + cpi->td.mb.rdmult = cpi->rd.RDMULT; + + RestSearchCtxt rsc; + const int plane_start = AOM_PLANE_Y; + const int plane_end = num_planes > 1 ? AOM_PLANE_V : AOM_PLANE_Y; + for (int plane = plane_start; plane <= plane_end; ++plane) { + init_rsc(src, &cpi->common, &cpi->td.mb, &cpi->sf, plane, rusi, + &cpi->trial_frame_rst, &rsc); + + const int plane_ntiles = ntiles[plane > 0]; + const RestorationType num_rtypes = + (plane_ntiles > 1) ? 
RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES; + + double best_cost = 0; + RestorationType best_rtype = RESTORE_NONE; + + const int highbd = rsc.cm->seq_params.use_highbitdepth; + if (!cpi->sf.lpf_sf.disable_loop_restoration_chroma || !plane) { + av1_extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height, + rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER, + highbd); + + for (RestorationType r = 0; r < num_rtypes; ++r) { + if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) && + (r != force_restore_type)) + continue; + + double cost = search_rest_type(&rsc, r); + + if (r == 0 || cost < best_cost) { + best_cost = cost; + best_rtype = r; + } + } + } + + cm->rst_info[plane].frame_restoration_type = best_rtype; + if (force_restore_type != RESTORE_TYPES) + assert(best_rtype == force_restore_type || best_rtype == RESTORE_NONE); + + if (best_rtype != RESTORE_NONE) { + for (int u = 0; u < plane_ntiles; ++u) { + copy_unit_info(best_rtype, &rusi[u], &cm->rst_info[plane].unit_info[u]); + } + } + } + + aom_free(rusi); +} diff --git a/libs/libaom/src/av1/encoder/pickrst.h b/libs/libaom/src/av1/encoder/pickrst.h new file mode 100644 index 000000000..eee30553d --- /dev/null +++ b/libs/libaom/src/av1/encoder/pickrst.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_ENCODER_PICKRST_H_ +#define AOM_AV1_ENCODER_PICKRST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/encoder.h" +#include "aom_ports/system_state.h" + +struct yv12_buffer_config; +struct AV1_COMP; + +static const uint8_t g_shuffle_stats_data[16] = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, +}; + +static const uint8_t g_shuffle_stats_highbd_data[32] = { + 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, + 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, +}; + +static INLINE uint8_t find_average(const uint8_t *src, int h_start, int h_end, + int v_start, int v_end, int stride) { + uint64_t sum = 0; + for (int i = v_start; i < v_end; i++) { + for (int j = h_start; j < h_end; j++) { + sum += src[i * stride + j]; + } + } + uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start)); + return (uint8_t)avg; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE uint16_t find_average_highbd(const uint16_t *src, int h_start, + int h_end, int v_start, int v_end, + int stride) { + uint64_t sum = 0; + for (int i = v_start; i < v_end; i++) { + for (int j = h_start; j < h_end; j++) { + sum += src[i * stride + j]; + } + } + uint64_t avg = sum / ((v_end - v_start) * (h_end - h_start)); + return (uint16_t)avg; +} +#endif + +void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PICKRST_H_ diff --git a/libs/libaom/src/av1/encoder/pustats.h b/libs/libaom/src/av1/encoder/pustats.h new file mode 100644 index 000000000..2e8710108 --- /dev/null +++ b/libs/libaom/src/av1/encoder/pustats.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PUSTATS_H_ +#define AOM_AV1_ENCODER_PUSTATS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +#define NUM_FEATURES_PUSTATS 8 +#define NUM_HIDDEN_LAYERS 2 +#define HIDDEN_LAYERS_0_NODES 12 +#define HIDDEN_LAYERS_1_NODES 10 +#define LOGITS_NODES 1 + +static const float + av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * + HIDDEN_LAYERS_0_NODES] = { + -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f, + -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f, + 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f, + 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f, + -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f, + -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f, + -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f, + -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f, + 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f, + -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f, + -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f, + -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f, + 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f, + -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f, + }; + +static const float + av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { + 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f, + 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f, + }; + +static const float + av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * + HIDDEN_LAYERS_1_NODES] = { + 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f, + 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f, + -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f, + 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f, + 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f, + -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f, + -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f, + -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f, + 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f, + 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f, + -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f, + -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f, + -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f, + 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f, + -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f, + -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f, + 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f, + -2.7566f, + }; + +static const float + av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { + 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f, + 6.1715f, 0.5094f, 7.6433f, 
-0.3992f, -1.3555f, + }; + +static const float + av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { + 4.3078f, -17.3497f, 0.0195f, 34.6032f, -5.0127f, + 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f, + }; + +static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = { + 4.5103f, +}; + +static const NN_CONFIG av1_pustats_rate_nnconfig = { + NUM_FEATURES_PUSTATS, // num_inputs + LOGITS_NODES, // num_outputs + NUM_HIDDEN_LAYERS, // num_hidden_layers + { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes + { + av1_pustats_rate_hiddenlayer_0_kernel, + av1_pustats_rate_hiddenlayer_1_kernel, + av1_pustats_rate_logits_kernel, + }, + { + av1_pustats_rate_hiddenlayer_0_bias, + av1_pustats_rate_hiddenlayer_1_bias, + av1_pustats_rate_logits_bias, + }, +}; + +static const float + av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS * + HIDDEN_LAYERS_0_NODES] = { + -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f, + 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f, + 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f, + 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f, + 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f, + -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f, + -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f, + -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f, + 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f, + 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f, + -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f, + -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f, + 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f, + -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f, + }; + +static const float + av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] = { + 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f, + 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f, + }; + +static const float + av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES * + HIDDEN_LAYERS_1_NODES] = { + -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f, + -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f, + 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f, + 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f, + -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f, + -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f, + -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f, + 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f, + -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f, + 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f, + 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f, + -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f, + 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f, + 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f, + 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f, + -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f, + -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f, + -0.4164f, + }; + +static const float + av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] = { + -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f, + 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f, + }; + +static 
const float + av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = { + -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f, + 0.9894f, -0.4342f, 0.7002f, 1.4363f, 0.6951f, + }; + +static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = { + 2.3371f, +}; + +static const NN_CONFIG av1_pustats_dist_nnconfig = { + NUM_FEATURES_PUSTATS, // num_inputs + LOGITS_NODES, // num_outputs + NUM_HIDDEN_LAYERS, // num_hidden_layers + { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes + { + av1_pustats_dist_hiddenlayer_0_kernel, + av1_pustats_dist_hiddenlayer_1_kernel, + av1_pustats_dist_logits_kernel, + }, + { + av1_pustats_dist_hiddenlayer_0_bias, + av1_pustats_dist_hiddenlayer_1_bias, + av1_pustats_dist_logits_bias, + }, +}; + +#undef NUM_HIDDEN_LAYERS +#undef HIDDEN_LAYERS_0_NODES +#undef HIDDEN_LAYERS_1_NODES +#undef LOGITS_NODES + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PUSTATS_H_ diff --git a/libs/libaom/src/av1/encoder/random.h b/libs/libaom/src/av1/encoder/random.h new file mode 100644 index 000000000..0bca39102 --- /dev/null +++ b/libs/libaom/src/av1/encoder/random.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RANDOM_H_ +#define AOM_AV1_ENCODER_RANDOM_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// Generate a random number in the range [0, 32768). +static INLINE unsigned int lcg_rand16(unsigned int *state) { + *state = (unsigned int)(*state * 1103515245ULL + 12345); + return *state / 65536 % 32768; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RANDOM_H_ diff --git a/libs/libaom/src/av1/encoder/ransac.c b/libs/libaom/src/av1/encoder/ransac.c new file mode 100644 index 000000000..07e1a5f5f --- /dev/null +++ b/libs/libaom/src/av1/encoder/ransac.c @@ -0,0 +1,820 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <memory.h> +#include <math.h> +#include <time.h> +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> + +#include "av1/encoder/ransac.h" +#include "av1/encoder/mathutils.h" +#include "av1/encoder/random.h" + +#define MAX_MINPTS 4 +#define MAX_DEGENERATE_ITER 10 +#define MINPTS_MULTIPLIER 5 + +#define INLIER_THRESHOLD 1.25 +#define MIN_TRIALS 20 + +//////////////////////////////////////////////////////////////////////////////// +// ransac +typedef int (*IsDegenerateFunc)(double *p); +typedef void (*NormalizeFunc)(double *p, int np, double *T); +typedef void (*DenormalizeFunc)(double *params, double *T1, double *T2); +typedef int (*FindTransformationFunc)(int points, double *points1, + double *points2, double *params); +typedef void (*ProjectPointsDoubleFunc)(double *mat, double *points, + double *proj, int n, int stride_points, + int stride_proj); + +static void project_points_double_translation(double *mat, double *points, + double *proj, int n, + int stride_points, + int stride_proj) { + int i; + for (i = 0; i < n; ++i) { + const double x = *(points++), y = *(points++); + *(proj++) = x + mat[0]; + *(proj++) = y + mat[1]; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +static void project_points_double_rotzoom(double *mat, double *points, + double *proj, int n, + int stride_points, int stride_proj) { + int i; + for (i = 0; i < n; ++i) { + const double x = *(points++), y = *(points++); + *(proj++) = mat[2] * x + mat[3] * y + mat[0]; + *(proj++) = -mat[3] * x + mat[2] * y + mat[1]; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +static void project_points_double_affine(double *mat, double *points, + double *proj, int n, int stride_points, + int stride_proj) { + int i; + for (i = 0; i < n; ++i) { + const double x = *(points++), y = *(points++); + *(proj++) = mat[2] * x + mat[3] * y + mat[0]; + *(proj++) = mat[4] * x + mat[5] * y + mat[1]; + points += stride_points - 2; + proj += stride_proj - 2; + } +} + +static void normalize_homography(double *pts, int n, double *T) { + double *p = pts; + double mean[2] = { 0, 0 }; + double msqe = 0; + double scale; + int i; + + assert(n > 0); + for (i = 0; i < n; ++i, p += 2) { + mean[0] += p[0]; + mean[1] += p[1]; + } + mean[0] /= n; + mean[1] /= n; + for (p = pts, i = 0; i < n; ++i, p += 2) { + p[0] -= mean[0]; + p[1] -= mean[1]; + msqe += sqrt(p[0] * p[0] + p[1] * p[1]); + } + msqe /= n; + scale = (msqe == 0 ?
1.0 : sqrt(2) / msqe); + T[0] = scale; + T[1] = 0; + T[2] = -scale * mean[0]; + T[3] = 0; + T[4] = scale; + T[5] = -scale * mean[1]; + T[6] = 0; + T[7] = 0; + T[8] = 1; + for (p = pts, i = 0; i < n; ++i, p += 2) { + p[0] *= scale; + p[1] *= scale; + } +} + +static void invnormalize_mat(double *T, double *iT) { + double is = 1.0 / T[0]; + double m0 = -T[2] * is; + double m1 = -T[5] * is; + iT[0] = is; + iT[1] = 0; + iT[2] = m0; + iT[3] = 0; + iT[4] = is; + iT[5] = m1; + iT[6] = 0; + iT[7] = 0; + iT[8] = 1; +} + +static void denormalize_homography(double *params, double *T1, double *T2) { + double iT2[9]; + double params2[9]; + invnormalize_mat(T2, iT2); + multiply_mat(params, T1, params2, 3, 3, 3); + multiply_mat(iT2, params2, params, 3, 3, 3); +} + +static void denormalize_affine_reorder(double *params, double *T1, double *T2) { + double params_denorm[MAX_PARAMDIM]; + params_denorm[0] = params[0]; + params_denorm[1] = params[1]; + params_denorm[2] = params[4]; + params_denorm[3] = params[2]; + params_denorm[4] = params[3]; + params_denorm[5] = params[5]; + params_denorm[6] = params_denorm[7] = 0; + params_denorm[8] = 1; + denormalize_homography(params_denorm, T1, T2); + params[0] = params_denorm[2]; + params[1] = params_denorm[5]; + params[2] = params_denorm[0]; + params[3] = params_denorm[1]; + params[4] = params_denorm[3]; + params[5] = params_denorm[4]; + params[6] = params[7] = 0; +} + +static void denormalize_rotzoom_reorder(double *params, double *T1, + double *T2) { + double params_denorm[MAX_PARAMDIM]; + params_denorm[0] = params[0]; + params_denorm[1] = params[1]; + params_denorm[2] = params[2]; + params_denorm[3] = -params[1]; + params_denorm[4] = params[0]; + params_denorm[5] = params[3]; + params_denorm[6] = params_denorm[7] = 0; + params_denorm[8] = 1; + denormalize_homography(params_denorm, T1, T2); + params[0] = params_denorm[2]; + params[1] = params_denorm[5]; + params[2] = params_denorm[0]; + params[3] = params_denorm[1]; + params[4] = -params[3]; + params[5] = params[2]; + params[6] = params[7] = 0; +} + +static void denormalize_translation_reorder(double *params, double *T1, + double *T2) { + double params_denorm[MAX_PARAMDIM]; + params_denorm[0] = 1; + params_denorm[1] = 0; + params_denorm[2] = params[0]; + params_denorm[3] = 0; + params_denorm[4] = 1; + params_denorm[5] = params[1]; + params_denorm[6] = params_denorm[7] = 0; + params_denorm[8] = 1; + denormalize_homography(params_denorm, T1, T2); + params[0] = params_denorm[2]; + params[1] = params_denorm[5]; + params[2] = params[5] = 1; + params[3] = params[4] = 0; + params[6] = params[7] = 0; +} + +static int find_translation(int np, double *pts1, double *pts2, double *mat) { + int i; + double sx, sy, dx, dy; + double sumx, sumy; + + double T1[9], T2[9]; + normalize_homography(pts1, np, T1); + normalize_homography(pts2, np, T2); + + sumx = 0; + sumy = 0; + for (i = 0; i < np; ++i) { + dx = *(pts2++); + dy = *(pts2++); + sx = *(pts1++); + sy = *(pts1++); + + sumx += dx - sx; + sumy += dy - sy; + } + mat[0] = sumx / np; + mat[1] = sumy / np; + denormalize_translation_reorder(mat, T1, T2); + return 0; +} + +static int find_rotzoom(int np, double *pts1, double *pts2, double *mat) { + const int np2 = np * 2; + double *a = (double *)aom_malloc(sizeof(*a) * (np2 * 5 + 20)); + double *b = a + np2 * 4; + double *temp = b + np2; + int i; + double sx, sy, dx, dy; + + double T1[9], T2[9]; + normalize_homography(pts1, np, T1); + normalize_homography(pts2, np, T2); + + for (i = 0; i < np; ++i) { + dx = *(pts2++); + dy = *(pts2++); 
+ sx = *(pts1++); + sy = *(pts1++); + + a[i * 2 * 4 + 0] = sx; + a[i * 2 * 4 + 1] = sy; + a[i * 2 * 4 + 2] = 1; + a[i * 2 * 4 + 3] = 0; + a[(i * 2 + 1) * 4 + 0] = sy; + a[(i * 2 + 1) * 4 + 1] = -sx; + a[(i * 2 + 1) * 4 + 2] = 0; + a[(i * 2 + 1) * 4 + 3] = 1; + + b[2 * i] = dx; + b[2 * i + 1] = dy; + } + if (!least_squares(4, a, np2, 4, b, temp, mat)) { + aom_free(a); + return 1; + } + denormalize_rotzoom_reorder(mat, T1, T2); + aom_free(a); + return 0; +} + +static int find_affine(int np, double *pts1, double *pts2, double *mat) { + assert(np > 0); + const int np2 = np * 2; + double *a = (double *)aom_malloc(sizeof(*a) * (np2 * 7 + 42)); + if (a == NULL) return 1; + double *b = a + np2 * 6; + double *temp = b + np2; + int i; + double sx, sy, dx, dy; + + double T1[9], T2[9]; + normalize_homography(pts1, np, T1); + normalize_homography(pts2, np, T2); + + for (i = 0; i < np; ++i) { + dx = *(pts2++); + dy = *(pts2++); + sx = *(pts1++); + sy = *(pts1++); + + a[i * 2 * 6 + 0] = sx; + a[i * 2 * 6 + 1] = sy; + a[i * 2 * 6 + 2] = 0; + a[i * 2 * 6 + 3] = 0; + a[i * 2 * 6 + 4] = 1; + a[i * 2 * 6 + 5] = 0; + a[(i * 2 + 1) * 6 + 0] = 0; + a[(i * 2 + 1) * 6 + 1] = 0; + a[(i * 2 + 1) * 6 + 2] = sx; + a[(i * 2 + 1) * 6 + 3] = sy; + a[(i * 2 + 1) * 6 + 4] = 0; + a[(i * 2 + 1) * 6 + 5] = 1; + + b[2 * i] = dx; + b[2 * i + 1] = dy; + } + if (!least_squares(6, a, np2, 6, b, temp, mat)) { + aom_free(a); + return 1; + } + denormalize_affine_reorder(mat, T1, T2); + aom_free(a); + return 0; +} + +static int get_rand_indices(int npoints, int minpts, int *indices, + unsigned int *seed) { + int i, j; + int ptr = lcg_rand16(seed) % npoints; + if (minpts > npoints) return 0; + indices[0] = ptr; + ptr = (ptr == npoints - 1 ? 0 : ptr + 1); + i = 1; + while (i < minpts) { + int index = lcg_rand16(seed) % npoints; + while (index) { + ptr = (ptr == npoints - 1 ? 0 : ptr + 1); + for (j = 0; j < i; ++j) { + if (indices[j] == ptr) break; + } + if (j == i) index--; + } + indices[i++] = ptr; + } + return 1; +} + +typedef struct { + int num_inliers; + double variance; + int *inlier_indices; +} RANSAC_MOTION; + +// Return -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise. 
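+// (Ordering note: a motion with more inliers always ranks first; ties are +// broken by the lower variance. The comparator is written for qsort(), which +// is used further below to sort the kept motions best-first.)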
+static int compare_motions(const void *arg_a, const void *arg_b) { + const RANSAC_MOTION *motion_a = (RANSAC_MOTION *)arg_a; + const RANSAC_MOTION *motion_b = (RANSAC_MOTION *)arg_b; + + if (motion_a->num_inliers > motion_b->num_inliers) return -1; + if (motion_a->num_inliers < motion_b->num_inliers) return 1; + if (motion_a->variance < motion_b->variance) return -1; + if (motion_a->variance > motion_b->variance) return 1; + return 0; +} + +static int is_better_motion(const RANSAC_MOTION *motion_a, + const RANSAC_MOTION *motion_b) { + return compare_motions(motion_a, motion_b) < 0; +} + +static void copy_points_at_indices(double *dest, const double *src, + const int *indices, int num_points) { + for (int i = 0; i < num_points; ++i) { + const int index = indices[i]; + dest[i * 2] = src[index * 2]; + dest[i * 2 + 1] = src[index * 2 + 1]; + } +} + +static const double kInfiniteVariance = 1e12; + +static void clear_motion(RANSAC_MOTION *motion, int num_points) { + motion->num_inliers = 0; + motion->variance = kInfiniteVariance; + memset(motion->inlier_indices, 0, + sizeof(*motion->inlier_indices) * num_points); +} + +static int ransac(const int *matched_points, int npoints, + int *num_inliers_by_motion, MotionModel *params_by_motion, + int num_desired_motions, int minpts, + IsDegenerateFunc is_degenerate, + FindTransformationFunc find_transformation, + ProjectPointsDoubleFunc projectpoints) { + int trial_count = 0; + int i = 0; + int ret_val = 0; + + unsigned int seed = (unsigned int)npoints; + + int indices[MAX_MINPTS] = { 0 }; + + double *points1, *points2; + double *corners1, *corners2; + double *image1_coord; + + // Store information for the num_desired_motions best transformations found + // and the worst motion among them, as well as the motion currently under + // consideration. + RANSAC_MOTION *motions, *worst_kept_motion = NULL; + RANSAC_MOTION current_motion; + + // Store the parameters and the indices of the inlier points for the motion + // currently under consideration. 
+ double params_this_motion[MAX_PARAMDIM]; + + double *cnp1, *cnp2; + + for (i = 0; i < num_desired_motions; ++i) { + num_inliers_by_motion[i] = 0; + } + if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) { + return 1; + } + + points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2); + points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2); + corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2); + corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2); + image1_coord = (double *)aom_malloc(sizeof(*image1_coord) * npoints * 2); + + motions = + (RANSAC_MOTION *)aom_malloc(sizeof(RANSAC_MOTION) * num_desired_motions); + for (i = 0; i < num_desired_motions; ++i) { + motions[i].inlier_indices = + (int *)aom_malloc(sizeof(*motions->inlier_indices) * npoints); + clear_motion(motions + i, npoints); + } + current_motion.inlier_indices = + (int *)aom_malloc(sizeof(*current_motion.inlier_indices) * npoints); + clear_motion(&current_motion, npoints); + + worst_kept_motion = motions; + + if (!(points1 && points2 && corners1 && corners2 && image1_coord && motions && + current_motion.inlier_indices)) { + ret_val = 1; + goto finish_ransac; + } + + cnp1 = corners1; + cnp2 = corners2; + for (i = 0; i < npoints; ++i) { + *(cnp1++) = *(matched_points++); + *(cnp1++) = *(matched_points++); + *(cnp2++) = *(matched_points++); + *(cnp2++) = *(matched_points++); + } + + while (MIN_TRIALS > trial_count) { + double sum_distance = 0.0; + double sum_distance_squared = 0.0; + + clear_motion(&current_motion, npoints); + + int degenerate = 1; + int num_degenerate_iter = 0; + + while (degenerate) { + num_degenerate_iter++; + if (!get_rand_indices(npoints, minpts, indices, &seed)) { + ret_val = 1; + goto finish_ransac; + } + + copy_points_at_indices(points1, corners1, indices, minpts); + copy_points_at_indices(points2, corners2, indices, minpts); + + degenerate = is_degenerate(points1); + if (num_degenerate_iter > MAX_DEGENERATE_ITER) { + ret_val = 1; + goto finish_ransac; + } + } + + if (find_transformation(minpts, points1, points2, params_this_motion)) { + trial_count++; + continue; + } + + projectpoints(params_this_motion, corners1, image1_coord, npoints, 2, 2); + + for (i = 0; i < npoints; ++i) { + double dx = image1_coord[i * 2] - corners2[i * 2]; + double dy = image1_coord[i * 2 + 1] - corners2[i * 2 + 1]; + double distance = sqrt(dx * dx + dy * dy); + + if (distance < INLIER_THRESHOLD) { + current_motion.inlier_indices[current_motion.num_inliers++] = i; + sum_distance += distance; + sum_distance_squared += distance * distance; + } + } + + if (current_motion.num_inliers >= worst_kept_motion->num_inliers && + current_motion.num_inliers > 1) { + double mean_distance; + mean_distance = sum_distance / ((double)current_motion.num_inliers); + current_motion.variance = + sum_distance_squared / ((double)current_motion.num_inliers - 1.0) - + mean_distance * mean_distance * ((double)current_motion.num_inliers) / + ((double)current_motion.num_inliers - 1.0); + if (is_better_motion(&current_motion, worst_kept_motion)) { + // This motion is better than the worst currently kept motion. Remember + // the inlier points and variance. The parameters for each kept motion + // will be recomputed later using only the inliers.
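+ // (The variance just computed is the unbiased sample estimate, + // sum_distance_squared / (n - 1) - mean^2 * n / (n - 1) with + // n = current_motion.num_inliers, built from the running sums + // accumulated over the inliers above.)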
+ worst_kept_motion->num_inliers = current_motion.num_inliers; + worst_kept_motion->variance = current_motion.variance; + memcpy(worst_kept_motion->inlier_indices, current_motion.inlier_indices, + sizeof(*current_motion.inlier_indices) * npoints); + assert(npoints > 0); + // Determine the new worst kept motion and its num_inliers and variance. + for (i = 0; i < num_desired_motions; ++i) { + if (is_better_motion(worst_kept_motion, &motions[i])) { + worst_kept_motion = &motions[i]; + } + } + } + } + trial_count++; + } + + // Sort the motions, best first. + qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions); + + // Recompute the motions using only the inliers. + for (i = 0; i < num_desired_motions; ++i) { + if (motions[i].num_inliers >= minpts) { + copy_points_at_indices(points1, corners1, motions[i].inlier_indices, + motions[i].num_inliers); + copy_points_at_indices(points2, corners2, motions[i].inlier_indices, + motions[i].num_inliers); + + find_transformation(motions[i].num_inliers, points1, points2, + params_by_motion[i].params); + + params_by_motion[i].num_inliers = motions[i].num_inliers; + memcpy(params_by_motion[i].inliers, motions[i].inlier_indices, + sizeof(*motions[i].inlier_indices) * npoints); + num_inliers_by_motion[i] = motions[i].num_inliers; + } + } + +finish_ransac: + aom_free(points1); + aom_free(points2); + aom_free(corners1); + aom_free(corners2); + aom_free(image1_coord); + aom_free(current_motion.inlier_indices); + for (i = 0; i < num_desired_motions; ++i) { + aom_free(motions[i].inlier_indices); + } + aom_free(motions); + + return ret_val; +} + +static int ransac_double_prec(const double *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions, int minpts, + IsDegenerateFunc is_degenerate, + FindTransformationFunc find_transformation, + ProjectPointsDoubleFunc projectpoints) { + int trial_count = 0; + int i = 0; + int ret_val = 0; + + unsigned int seed = (unsigned int)npoints; + + int indices[MAX_MINPTS] = { 0 }; + + double *points1, *points2; + double *corners1, *corners2; + double *image1_coord; + + // Store information for the num_desired_motions best transformations found + // and the worst motion among them, as well as the motion currently under + // consideration. + RANSAC_MOTION *motions, *worst_kept_motion = NULL; + RANSAC_MOTION current_motion; + + // Store the parameters and the indices of the inlier points for the motion + // currently under consideration. 
+ double params_this_motion[MAX_PARAMDIM]; + + double *cnp1, *cnp2; + + for (i = 0; i < num_desired_motions; ++i) { + num_inliers_by_motion[i] = 0; + } + if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) { + return 1; + } + + points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2); + points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2); + corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2); + corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2); + image1_coord = (double *)aom_malloc(sizeof(*image1_coord) * npoints * 2); + + motions = + (RANSAC_MOTION *)aom_malloc(sizeof(RANSAC_MOTION) * num_desired_motions); + for (i = 0; i < num_desired_motions; ++i) { + motions[i].inlier_indices = + (int *)aom_malloc(sizeof(*motions->inlier_indices) * npoints); + clear_motion(motions + i, npoints); + } + current_motion.inlier_indices = + (int *)aom_malloc(sizeof(*current_motion.inlier_indices) * npoints); + clear_motion(&current_motion, npoints); + + worst_kept_motion = motions; + + if (!(points1 && points2 && corners1 && corners2 && image1_coord && motions && + current_motion.inlier_indices)) { + ret_val = 1; + goto finish_ransac; + } + + cnp1 = corners1; + cnp2 = corners2; + for (i = 0; i < npoints; ++i) { + *(cnp1++) = *(matched_points++); + *(cnp1++) = *(matched_points++); + *(cnp2++) = *(matched_points++); + *(cnp2++) = *(matched_points++); + } + + while (MIN_TRIALS > trial_count) { + double sum_distance = 0.0; + double sum_distance_squared = 0.0; + + clear_motion(&current_motion, npoints); + + int degenerate = 1; + int num_degenerate_iter = 0; + + while (degenerate) { + num_degenerate_iter++; + if (!get_rand_indices(npoints, minpts, indices, &seed)) { + ret_val = 1; + goto finish_ransac; + } + + copy_points_at_indices(points1, corners1, indices, minpts); + copy_points_at_indices(points2, corners2, indices, minpts); + + degenerate = is_degenerate(points1); + if (num_degenerate_iter > MAX_DEGENERATE_ITER) { + ret_val = 1; + goto finish_ransac; + } + } + + if (find_transformation(minpts, points1, points2, params_this_motion)) { + trial_count++; + continue; + } + + projectpoints(params_this_motion, corners1, image1_coord, npoints, 2, 2); + + for (i = 0; i < npoints; ++i) { + double dx = image1_coord[i * 2] - corners2[i * 2]; + double dy = image1_coord[i * 2 + 1] - corners2[i * 2 + 1]; + double distance = sqrt(dx * dx + dy * dy); + + if (distance < INLIER_THRESHOLD) { + current_motion.inlier_indices[current_motion.num_inliers++] = i; + sum_distance += distance; + sum_distance_squared += distance * distance; + } + } + + if (current_motion.num_inliers >= worst_kept_motion->num_inliers && + current_motion.num_inliers > 1) { + double mean_distance; + mean_distance = sum_distance / ((double)current_motion.num_inliers); + current_motion.variance = + sum_distance_squared / ((double)current_motion.num_inliers - 1.0) - + mean_distance * mean_distance * ((double)current_motion.num_inliers) / + ((double)current_motion.num_inliers - 1.0); + if (is_better_motion(&current_motion, worst_kept_motion)) { + // This motion is better than the worst currently kept motion. Remember + // the inlier points and variance. The parameters for each kept motion + // will be recomputed later using only the inliers.
+ worst_kept_motion->num_inliers = current_motion.num_inliers; + worst_kept_motion->variance = current_motion.variance; + memcpy(worst_kept_motion->inlier_indices, current_motion.inlier_indices, + sizeof(*current_motion.inlier_indices) * npoints); + assert(npoints > 0); + // Determine the new worst kept motion and its num_inliers and variance. + for (i = 0; i < num_desired_motions; ++i) { + if (is_better_motion(worst_kept_motion, &motions[i])) { + worst_kept_motion = &motions[i]; + } + } + } + } + trial_count++; + } + + // Sort the motions, best first. + qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions); + + // Recompute the motions using only the inliers. + for (i = 0; i < num_desired_motions; ++i) { + if (motions[i].num_inliers >= minpts) { + copy_points_at_indices(points1, corners1, motions[i].inlier_indices, + motions[i].num_inliers); + copy_points_at_indices(points2, corners2, motions[i].inlier_indices, + motions[i].num_inliers); + + find_transformation(motions[i].num_inliers, points1, points2, + params_by_motion[i].params); + memcpy(params_by_motion[i].inliers, motions[i].inlier_indices, + sizeof(*motions[i].inlier_indices) * npoints); + } + num_inliers_by_motion[i] = motions[i].num_inliers; + } + +finish_ransac: + aom_free(points1); + aom_free(points2); + aom_free(corners1); + aom_free(corners2); + aom_free(image1_coord); + aom_free(current_motion.inlier_indices); + for (i = 0; i < num_desired_motions; ++i) { + aom_free(motions[i].inlier_indices); + } + aom_free(motions); + + return ret_val; +} + +static int is_collinear3(double *p1, double *p2, double *p3) { + static const double collinear_eps = 1e-3; + const double v = + (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]); + return fabs(v) < collinear_eps; +} + +static int is_degenerate_translation(double *p) { + return (p[0] - p[2]) * (p[0] - p[2]) + (p[1] - p[3]) * (p[1] - p[3]) <= 2; +} + +static int is_degenerate_affine(double *p) { + return is_collinear3(p, p + 2, p + 4); +} + +static int ransac_translation(int *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions) { + return ransac(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, + is_degenerate_translation, find_translation, + project_points_double_translation); +} + +static int ransac_rotzoom(int *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions) { + return ransac(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, is_degenerate_affine, + find_rotzoom, project_points_double_rotzoom); +} + +static int ransac_affine(int *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions) { + return ransac(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, is_degenerate_affine, + find_affine, project_points_double_affine); +} + +RansacFunc av1_get_ransac_type(TransformationType type) { + switch (type) { + case AFFINE: return ransac_affine; + case ROTZOOM: return ransac_rotzoom; + case TRANSLATION: return ransac_translation; + default: assert(0); return NULL; + } +} + +static int ransac_translation_double_prec(double *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions) { + return ransac_double_prec(matched_points, npoints, num_inliers_by_motion, + params_by_motion, 
num_desired_motions, 3, + is_degenerate_translation, find_translation, + project_points_double_translation); +} + +static int ransac_rotzoom_double_prec(double *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions) { + return ransac_double_prec(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, + is_degenerate_affine, find_rotzoom, + project_points_double_rotzoom); +} + +static int ransac_affine_double_prec(double *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, + int num_desired_motions) { + return ransac_double_prec(matched_points, npoints, num_inliers_by_motion, + params_by_motion, num_desired_motions, 3, + is_degenerate_affine, find_affine, + project_points_double_affine); +} + +RansacFuncDouble av1_get_ransac_double_prec_type(TransformationType type) { + switch (type) { + case AFFINE: return ransac_affine_double_prec; + case ROTZOOM: return ransac_rotzoom_double_prec; + case TRANSLATION: return ransac_translation_double_prec; + default: assert(0); return NULL; + } +} diff --git a/libs/libaom/src/av1/encoder/ransac.h b/libs/libaom/src/av1/encoder/ransac.h new file mode 100644 index 000000000..583d97152 --- /dev/null +++ b/libs/libaom/src/av1/encoder/ransac.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RANSAC_H_ +#define AOM_AV1_ENCODER_RANSAC_H_ + +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> +#include <assert.h> + +#include "av1/common/warped_motion.h" +#include "av1/encoder/global_motion.h" + +typedef int (*RansacFunc)(int *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, int num_motions); +typedef int (*RansacFuncDouble)(double *matched_points, int npoints, + int *num_inliers_by_motion, + MotionModel *params_by_motion, int num_motions); +RansacFunc av1_get_ransac_type(TransformationType type); +RansacFuncDouble av1_get_ransac_double_prec_type(TransformationType type); +#endif  // AOM_AV1_ENCODER_RANSAC_H_ diff --git a/libs/libaom/src/av1/encoder/ratectrl.c b/libs/libaom/src/av1/encoder/ratectrl.c new file mode 100644 index 000000000..433163f2e --- /dev/null +++ b/libs/libaom/src/av1/encoder/ratectrl.c @@ -0,0 +1,2117 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <assert.h> +#include <limits.h> +#include <math.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" + +#include "av1/common/alloccommon.h" +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/common/common.h" +#include "av1/common/entropymode.h" +#include "av1/common/quant_common.h" +#include "av1/common/seg_common.h" + +#include "av1/encoder/encodemv.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/gop_structure.h" +#include "av1/encoder/random.h" +#include "av1/encoder/ratectrl.h" + +#define USE_UNRESTRICTED_Q_IN_CQ_MODE 0 + +// Max rate target for 1080P and below encodes under normal circumstances +// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB +#define MAX_MB_RATE 250 +#define MAXRATE_1080P 2025000 + +#define MIN_BPB_FACTOR 0.005 +#define MAX_BPB_FACTOR 50 + +#define SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO 0 +#define SUPERRES_QADJ_PER_DENOM_KEYFRAME 2 +#define SUPERRES_QADJ_PER_DENOM_ARFFRAME 0 + +#define FRAME_OVERHEAD_BITS 200 +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + switch (bit_depth) { \ + case AOM_BITS_8: name = name##_8; break; \ + case AOM_BITS_10: name = name##_10; break; \ + case AOM_BITS_12: name = name##_12; break; \ + default: \ + assert(0 && \ + "bit_depth should be AOM_BITS_8, AOM_BITS_10" \ + " or AOM_BITS_12"); \ + name = NULL; \ + } \ + } while (0) + +// Tables relating active max Q to active min Q +static int kf_low_motion_minq_8[QINDEX_RANGE]; +static int kf_high_motion_minq_8[QINDEX_RANGE]; +static int arfgf_low_motion_minq_8[QINDEX_RANGE]; +static int arfgf_high_motion_minq_8[QINDEX_RANGE]; +static int inter_minq_8[QINDEX_RANGE]; +static int rtc_minq_8[QINDEX_RANGE]; + +static int kf_low_motion_minq_10[QINDEX_RANGE]; +static int kf_high_motion_minq_10[QINDEX_RANGE]; +static int arfgf_low_motion_minq_10[QINDEX_RANGE]; +static int arfgf_high_motion_minq_10[QINDEX_RANGE]; +static int inter_minq_10[QINDEX_RANGE]; +static int rtc_minq_10[QINDEX_RANGE]; +static int kf_low_motion_minq_12[QINDEX_RANGE]; +static int kf_high_motion_minq_12[QINDEX_RANGE]; +static int arfgf_low_motion_minq_12[QINDEX_RANGE]; +static int arfgf_high_motion_minq_12[QINDEX_RANGE]; +static int inter_minq_12[QINDEX_RANGE]; +static int rtc_minq_12[QINDEX_RANGE]; + +static int gf_high = 2400; +static int gf_low = 300; +#ifdef STRICT_RC +static int kf_high = 3200; +#else +static int kf_high = 5000; +#endif +static int kf_low = 400; + +// How many times fewer pixels there are to encode given the current scaling. +// Temporary replacement for rcf_mult and rate_thresh_mult. +static double resize_rate_factor(const AV1_COMP *cpi, int width, int height) { + return (double)(cpi->oxcf.width * cpi->oxcf.height) / (width * height); +} + +// Functions to compute the active minq lookup table entries based on a +// formulaic approach to facilitate easier adjustment of the Q tables. +// The formulae were derived from computing a 3rd order polynomial best +// fit to the original data (after plotting real maxq vs minq (not q index)) +static int get_minq_index(double maxq, double x3, double x2, double x1, + aom_bit_depth_t bit_depth) { + const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq); + + // Special case handling to deal with the step from q2.0 + // down to lossless mode represented by q 1.0.
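+ // (Worked example: with the kf_low_m coefficients used below, + // x3 = 0.000001, x2 = -0.0004, x1 = 0.150, and maxq = 200.0, + // minqtarget = ((0.000001 * 200 - 0.0004) * 200 + 0.150) * 200 = 22.0, + // which is above 2.0 and therefore falls through to av1_find_qindex().)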
+ if (minqtarget <= 2.0) return 0; + + return av1_find_qindex(minqtarget, bit_depth, 0, QINDEX_RANGE - 1); +} + +static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, + int *arfgf_high, int *inter, int *rtc, + aom_bit_depth_t bit_depth) { + int i; + for (i = 0; i < QINDEX_RANGE; i++) { + const double maxq = av1_convert_qindex_to_q(i, bit_depth); + kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); + kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth); + arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth); + arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); + inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth); + rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); + } +} + +void av1_rc_init_minq_luts(void) { + init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8, + arfgf_low_motion_minq_8, arfgf_high_motion_minq_8, + inter_minq_8, rtc_minq_8, AOM_BITS_8); + init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10, + arfgf_low_motion_minq_10, arfgf_high_motion_minq_10, + inter_minq_10, rtc_minq_10, AOM_BITS_10); + init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12, + arfgf_low_motion_minq_12, arfgf_high_motion_minq_12, + inter_minq_12, rtc_minq_12, AOM_BITS_12); +} + +// These functions use formulaic calculations to make playing with the +// quantizer tables easier. If necessary they can be replaced by lookup +// tables if and when things settle down in the experimental bitstream +double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) { + // Convert the index to a real Q value (scaled down to match old Q values) + switch (bit_depth) { + case AOM_BITS_8: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 4.0; + case AOM_BITS_10: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 16.0; + case AOM_BITS_12: return av1_ac_quant_QTX(qindex, 0, bit_depth) / 64.0; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1.0; + } +} + +int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, + double correction_factor, aom_bit_depth_t bit_depth) { + const double q = av1_convert_qindex_to_q(qindex, bit_depth); + int enumerator = frame_type == KEY_FRAME ? 2000000 : 1500000; + + assert(correction_factor <= MAX_BPB_FACTOR && + correction_factor >= MIN_BPB_FACTOR); + + // q based adjustment to baseline enumerator + return (int)(enumerator * correction_factor / q); +} + +int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, + double correction_factor, + aom_bit_depth_t bit_depth) { + const int bpm = + (int)(av1_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth)); + return AOMMAX(FRAME_OVERHEAD_BITS, + (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); +} + +int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target, + FRAME_UPDATE_TYPE frame_update_type) { + const RATE_CONTROL *rc = &cpi->rc; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + const int min_frame_target = + AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); + // Clip the frame target to the minimum setup value. + if (frame_update_type == OVERLAY_UPDATE || + frame_update_type == INTNL_OVERLAY_UPDATE) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer ensures that an appropriate + // number of bits will be spent if needed for constructed ARFs.
+ target = min_frame_target; + } else if (target < min_frame_target) { + target = min_frame_target; + } + + // Clip the frame target to the maximum allowed value. + if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + if (oxcf->rc_max_inter_bitrate_pct) { + const int max_rate = + rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; + target = AOMMIN(target, max_rate); + } + + return target; +} + +int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int target) { + const RATE_CONTROL *rc = &cpi->rc; + const AV1EncoderConfig *oxcf = &cpi->oxcf; + if (oxcf->rc_max_intra_bitrate_pct) { + const int max_rate = + rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100; + target = AOMMIN(target, max_rate); + } + if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + return target; +} + +// Update the buffer level for higher temporal layers, given the encoded current +// temporal layer. +static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { + const int current_temporal_layer = svc->temporal_layer_id; + for (int i = current_temporal_layer + 1; i < svc->number_temporal_layers; + ++i) { + const int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->bits_off_target += + (int)(lc->target_bandwidth / lc->framerate) - encoded_frame_size; + // Clip buffer level to maximum buffer size for the layer. + lrc->bits_off_target = + AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = lrc->bits_off_target; + } +} +// Update the buffer level: leaky bucket model. +static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) { + const AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + + // Non-viewable frames are a special case and are treated as pure overhead. + if (!cm->show_frame) + rc->bits_off_target -= encoded_frame_size; + else + rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; + + // Clip the buffer level to the maximum specified buffer size. 
+ rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = rc->bits_off_target; + + if (cpi->use_svc) update_layer_buffer_level(&cpi->svc, encoded_frame_size); +} + +int av1_rc_get_default_min_gf_interval(int width, int height, + double framerate) { + // Assume we do not need any constraint lower than 4K 20 fps + static const double factor_safe = 3840 * 2160 * 20.0; + const double factor = width * height * framerate; + const int default_interval = + clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL); + + if (factor <= factor_safe) + return default_interval; + else + return AOMMAX(default_interval, + (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5)); + // Note this logic makes: + // 4K24: 5 + // 4K30: 6 + // 4K60: 12 +} + +int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) { + int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75)); + interval += (interval & 0x01); // Round to even value + interval = AOMMAX(MAX_GF_INTERVAL, interval); + return AOMMAX(interval, min_gf_interval); +} + +void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { + int i; + + if (pass == 0 && oxcf->rc_mode == AOM_CBR) { + rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q; + rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q; + } else { + rc->avg_frame_qindex[KEY_FRAME] = + (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2; + rc->avg_frame_qindex[INTER_FRAME] = + (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2; + } + + rc->last_q[KEY_FRAME] = oxcf->best_allowed_q; + rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q; + + rc->buffer_level = rc->starting_buffer_level; + rc->bits_off_target = rc->starting_buffer_level; + + rc->rolling_target_bits = rc->avg_frame_bandwidth; + rc->rolling_actual_bits = rc->avg_frame_bandwidth; + rc->long_rolling_target_bits = rc->avg_frame_bandwidth; + rc->long_rolling_actual_bits = rc->avg_frame_bandwidth; + + rc->total_actual_bits = 0; + rc->total_target_bits = 0; + rc->total_target_vs_actual = 0; + + rc->frames_since_key = 8; // Sensible default for first frame. + rc->this_key_frame_forced = 0; + rc->next_key_frame_forced = 0; + rc->source_alt_ref_pending = 0; + rc->source_alt_ref_active = 0; + + rc->frames_till_gf_update_due = 0; + rc->ni_av_qi = oxcf->worst_allowed_q; + rc->ni_tot_qi = 0; + rc->ni_frames = 0; + + rc->tot_q = 0.0; + rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth); + + for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { + rc->rate_correction_factors[i] = 0.7; + } + rc->rate_correction_factors[KF_STD] = 1.0; + rc->min_gf_interval = oxcf->min_gf_interval; + rc->max_gf_interval = oxcf->max_gf_interval; + if (rc->min_gf_interval == 0) + rc->min_gf_interval = av1_rc_get_default_min_gf_interval( + oxcf->width, oxcf->height, oxcf->init_framerate); + if (rc->max_gf_interval == 0) + rc->max_gf_interval = av1_rc_get_default_max_gf_interval( + oxcf->init_framerate, rc->min_gf_interval); + rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; +} + +int av1_rc_drop_frame(AV1_COMP *cpi) { + const AV1EncoderConfig *oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; + + if (!oxcf->drop_frames_water_mark) { + return 0; + } else { + if (rc->buffer_level < 0) { + // Always drop if buffer is below 0. + return 1; + } else { + // If buffer is below drop_mark, for now just drop every other frame + // (starting with the next frame) until it increases back over drop_mark. 
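+ // (Illustrative trace: once decimation_factor settles at 1, + // decimation_count alternates between 1 and 0 on successive calls, so + // every other frame returns 1 and is dropped until the buffer recovers.)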
+ int drop_mark = + (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100); + if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { + --rc->decimation_factor; + } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) { + rc->decimation_factor = 1; + } + if (rc->decimation_factor > 0) { + if (rc->decimation_count > 0) { + --rc->decimation_count; + return 1; + } else { + rc->decimation_count = rc->decimation_factor; + return 0; + } + } else { + rc->decimation_count = 0; + return 0; + } + } + } +} + +static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1_COMMON *const cm = &cpi->common; + const int max_delta = 16; + const int change_avg_frame_bandwidth = + abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) > + 0.1 * (rc->avg_frame_bandwidth); + // If resolution changes or avg_frame_bandwidth significantly changed, + // then set this flag to indicate change in target bits per macroblock. + const int change_target_bits_mb = + cm->prev_frame && + (cm->width != cm->prev_frame->width || + cm->height != cm->prev_frame->height || change_avg_frame_bandwidth); + // Apply some control/clamp to QP under certain conditions. + if (cm->current_frame.frame_type != KEY_FRAME && !cpi->use_svc && + rc->frames_since_key > 1 && !change_target_bits_mb && + (!cpi->oxcf.gf_cbr_boost_pct || + !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame))) { + // Make sure q is between oscillating Qs to prevent resonance. + if (rc->rc_1_frame * rc->rc_2_frame == -1 && + rc->q_1_frame != rc->q_2_frame) { + q = clamp(q, AOMMIN(rc->q_1_frame, rc->q_2_frame), + AOMMAX(rc->q_1_frame, rc->q_2_frame)); + } + // Limit the decrease in Q from previous frame. + if (rc->q_1_frame - q > max_delta) q = rc->q_1_frame - max_delta; + } + // For single spatial layer: if resolution has increased push q closer + // to the active_worst to avoid excess overshoot. 
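+ // E.g. if q would be 80 and active_worst_quality is 160 after such a
+ // resolution jump, the midpoint (80 + 160) >> 1 == 120 is used instead,
+ // trading quality for a lower risk of overshoot on the larger frame.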
+ if (cpi->svc.number_spatial_layers <= 1 && cm->prev_frame && + (cm->width * cm->height > + 1.5 * cm->prev_frame->width * cm->prev_frame->height)) + q = (q + active_worst_quality) >> 1; + return AOMMAX(AOMMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality); +} + +static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = { + KF_STD, // KF_UPDATE + INTER_NORMAL, // LF_UPDATE + GF_ARF_STD, // GF_UPDATE + GF_ARF_STD, // ARF_UPDATE + INTER_NORMAL, // OVERLAY_UPDATE + INTER_NORMAL, // INTNL_OVERLAY_UPDATE + GF_ARF_LOW, // INTNL_ARF_UPDATE +}; + +static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group) { + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + assert(update_type < FRAME_UPDATE_TYPES); + return rate_factor_levels[update_type]; +} + +static double get_rate_correction_factor(const AV1_COMP *cpi, int width, + int height) { + const RATE_CONTROL *const rc = &cpi->rc; + double rcf; + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + rcf = rc->rate_correction_factors[KF_STD]; + } else if (is_stat_consumption_stage(cpi)) { + const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group); + rcf = rc->rate_correction_factors[rf_lvl]; + } else { + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !rc->is_src_frame_alt_ref && !cpi->use_svc && + (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + rcf = rc->rate_correction_factors[GF_ARF_STD]; + else + rcf = rc->rate_correction_factors[INTER_NORMAL]; + } + rcf *= resize_rate_factor(cpi, width, height); + return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR); +} + +static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int width, + int height) { + RATE_CONTROL *const rc = &cpi->rc; + + // Normalize RCF to account for the size-dependent scaling factor. + factor /= resize_rate_factor(cpi, width, height); + + factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR); + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + rc->rate_correction_factors[KF_STD] = factor; + } else if (is_stat_consumption_stage(cpi)) { + const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group); + rc->rate_correction_factors[rf_lvl] = factor; + } else { + if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && + !rc->is_src_frame_alt_ref && !cpi->use_svc && + (cpi->oxcf.rc_mode != AOM_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + rc->rate_correction_factors[GF_ARF_STD] = factor; + else + rc->rate_correction_factors[INTER_NORMAL] = factor; + } +} + +void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width, + int height) { + const AV1_COMMON *const cm = &cpi->common; + int correction_factor = 100; + double rate_correction_factor = + get_rate_correction_factor(cpi, width, height); + double adjustment_limit; + const int MBs = av1_get_MBs(width, height); + + int projected_size_based_on_q = 0; + + // Do not update the rate factors for arf overlay frames. + if (cpi->rc.is_src_frame_alt_ref) return; + + // Clear down mmx registers to allow floating point in what follows + aom_clear_system_state(); + + // Work out how big we would have expected the frame to be at this Q given + // the current correction factor. 
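+ // Roughly, the projection below is MBs * bits-per-mb at this Q scaled
+ // by the current correction factor; comparing it with the actual
+ // projected frame size yields the percentage correction applied
+ // further down.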
+ // Stay in double to avoid int overflow when values are large + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) { + projected_size_based_on_q = + av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor); + } else { + projected_size_based_on_q = av1_estimate_bits_at_q( + cm->current_frame.frame_type, cm->quant_params.base_qindex, MBs, + rate_correction_factor, cm->seq_params.bit_depth); + } + // Work out a size correction factor. + if (projected_size_based_on_q > FRAME_OVERHEAD_BITS) + correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) / + projected_size_based_on_q); + + // More heavily damped adjustment used if we have been oscillating either side + // of target. + if (correction_factor > 0) { + adjustment_limit = + 0.25 + 0.5 * AOMMIN(1, fabs(log10(0.01 * correction_factor))); + } else { + adjustment_limit = 0.75; + } + + cpi->rc.q_2_frame = cpi->rc.q_1_frame; + cpi->rc.q_1_frame = cm->quant_params.base_qindex; + cpi->rc.rc_2_frame = cpi->rc.rc_1_frame; + if (correction_factor > 110) + cpi->rc.rc_1_frame = -1; + else if (correction_factor < 90) + cpi->rc.rc_1_frame = 1; + else + cpi->rc.rc_1_frame = 0; + + if (correction_factor > 102) { + // We are not already at the worst allowable quality + correction_factor = + (int)(100 + ((correction_factor - 100) * adjustment_limit)); + rate_correction_factor = (rate_correction_factor * correction_factor) / 100; + // Keep rate_correction_factor within limits + if (rate_correction_factor > MAX_BPB_FACTOR) + rate_correction_factor = MAX_BPB_FACTOR; + } else if (correction_factor < 99) { + // We are not already at the best allowable quality + correction_factor = + (int)(100 - ((100 - correction_factor) * adjustment_limit)); + rate_correction_factor = (rate_correction_factor * correction_factor) / 100; + + // Keep rate_correction_factor within limits + if (rate_correction_factor < MIN_BPB_FACTOR) + rate_correction_factor = MIN_BPB_FACTOR; + } + + set_rate_correction_factor(cpi, rate_correction_factor, width, height); +} + +// Calculate rate for the given 'q'. +static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh, + double correction_factor, int q) { + const AV1_COMMON *const cm = &cpi->common; + return use_cyclic_refresh + ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor) + : av1_rc_bits_per_mb(cm->current_frame.frame_type, q, + correction_factor, cm->seq_params.bit_depth); +} + +// Similar to find_qindex_by_rate() function in ratectrl.c, but returns the q +// index with rate just above or below the desired rate, depending on which of +// the two rates is closer to the desired rate. +// Also, respects the selected aq_mode when computing the rate. +static int find_closest_qindex_by_rate(int desired_bits_per_mb, + const AV1_COMP *cpi, + double correction_factor, + int best_qindex, int worst_qindex) { + const int use_cyclic_refresh = cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->cyclic_refresh->apply_cyclic_refresh; + + // Find 'qindex' based on 'desired_bits_per_mb'. + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const int mid_bits_per_mb = + get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, mid); + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + + // Calculate rate difference of this q index from the desired rate. 
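+ // The search above returns the smallest qindex whose bits per mb is at
+ // or below the target; qindex - 1 (rate just above the target) is
+ // probed below, and whichever is closer in bits per mb wins.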
+ const int curr_q = low; + const int curr_bits_per_mb = + get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, curr_q); + const int curr_bit_diff = (curr_bits_per_mb <= desired_bits_per_mb) + ? desired_bits_per_mb - curr_bits_per_mb + : INT_MAX; + assert((curr_bit_diff != INT_MAX && curr_bit_diff >= 0) || + curr_q == worst_qindex); + + // Calculate rate difference for previous q index too. + const int prev_q = curr_q - 1; + int prev_bit_diff; + if (curr_bit_diff == INT_MAX || curr_q == best_qindex) { + prev_bit_diff = INT_MAX; + } else { + const int prev_bits_per_mb = + get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, prev_q); + assert(prev_bits_per_mb > desired_bits_per_mb); + prev_bit_diff = prev_bits_per_mb - desired_bits_per_mb; + } + + // Pick one of the two q indices, depending on which one has rate closer to + // the desired rate. + return (curr_bit_diff <= prev_bit_diff) ? curr_q : prev_q; +} + +int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame, + int active_best_quality, int active_worst_quality, + int width, int height) { + const int MBs = av1_get_MBs(width, height); + const double correction_factor = + get_rate_correction_factor(cpi, width, height); + const int target_bits_per_mb = + (int)(((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / MBs); + + int q = + find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor, + active_best_quality, active_worst_quality); + if (cpi->oxcf.rc_mode == AOM_CBR && has_no_stats_stage(cpi)) + return adjust_q_cbr(cpi, q, active_worst_quality); + + return q; +} + +static int get_active_quality(int q, int gfu_boost, int low, int high, + int *low_motion_minq, int *high_motion_minq) { + if (gfu_boost > high) { + return low_motion_minq[q]; + } else if (gfu_boost < low) { + return high_motion_minq[q]; + } else { + const int gap = high - low; + const int offset = high - gfu_boost; + const int qdiff = high_motion_minq[q] - low_motion_minq[q]; + const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap; + return low_motion_minq[q] + adjustment; + } +} + +static int get_kf_active_quality(const RATE_CONTROL *const rc, int q, + aom_bit_depth_t bit_depth) { + int *kf_low_motion_minq; + int *kf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq); + return get_active_quality(q, rc->kf_boost, kf_low, kf_high, + kf_low_motion_minq, kf_high_motion_minq); +} + +static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, + aom_bit_depth_t bit_depth) { + int *arfgf_low_motion_minq; + int *arfgf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); + return get_active_quality(q, rc->gfu_boost, gf_low, gf_high, + arfgf_low_motion_minq, arfgf_high_motion_minq); +} + +static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) { + int *arfgf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); + return arfgf_high_motion_minq[q]; +} + +static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + const unsigned int curr_frame = cpi->common.current_frame.frame_number; + int active_worst_quality; + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + active_worst_quality = + curr_frame == 0 ? 
rc->worst_quality : rc->last_q[KEY_FRAME] * 2; + } else { + if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_bwd_ref_frame || + cpi->refresh_alt_ref_frame)) { + active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4 + : rc->last_q[INTER_FRAME]; + } else { + active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 2 + : rc->last_q[INTER_FRAME] * 2; + } + } + return AOMMIN(active_worst_quality, rc->worst_quality); +} + +// Adjust active_worst_quality level based on buffer level. +static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) { + // Adjust active_worst_quality: If buffer is above the optimal/target level, + // bring active_worst_quality down depending on fullness of buffer. + // If buffer is below the optimal level, let the active_worst_quality go from + // ambient Q (at buffer = optimal level) to worst_quality level + // (at buffer = critical level). + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *rc = &cpi->rc; + // Buffer level below which we push active_worst to worst_quality. + int64_t critical_level = rc->optimal_buffer_level >> 3; + int64_t buff_lvl_step = 0; + int adjustment = 0; + int active_worst_quality; + int ambient_qp; + if (cm->current_frame.frame_type == KEY_FRAME) return rc->worst_quality; + // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] + // for the first few frames following key frame. These are both initialized + // to worst_quality and updated with (3/4, 1/4) average in postencode_update. + // So for first few frames following key, the qp of that key frame is weighted + // into the active_worst_quality setting. + ambient_qp = (cm->current_frame.frame_number < 5) + ? AOMMIN(rc->avg_frame_qindex[INTER_FRAME], + rc->avg_frame_qindex[KEY_FRAME]) + : rc->avg_frame_qindex[INTER_FRAME]; + active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4); + if (rc->buffer_level > rc->optimal_buffer_level) { + // Adjust down. + // Maximum limit for down adjustment, ~30%. + int max_adjustment_down = active_worst_quality / 3; + if (max_adjustment_down) { + buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) / + max_adjustment_down); + if (buff_lvl_step) + adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) / + buff_lvl_step); + active_worst_quality -= adjustment; + } + } else if (rc->buffer_level > critical_level) { + // Adjust up from ambient Q. + if (critical_level) { + buff_lvl_step = (rc->optimal_buffer_level - critical_level); + if (buff_lvl_step) { + adjustment = (int)((rc->worst_quality - ambient_qp) * + (rc->optimal_buffer_level - rc->buffer_level) / + buff_lvl_step); + } + active_worst_quality = ambient_qp + adjustment; + } + } else { + // Set to worst_quality if buffer is below critical level. 
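+ // Zone summary: buffer above optimal -> adjust down from the ambient
+ // cap (by up to ~30%); between critical (optimal >> 3) and optimal ->
+ // interpolate up from ambient toward worst_quality; at or below
+ // critical -> pin to worst_quality here.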
+ active_worst_quality = rc->worst_quality; + } + return active_worst_quality; +} + +static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width, + int height, int *bottom_index, + int *top_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const CurrentFrame *const current_frame = &cm->current_frame; + int active_best_quality; + int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); + int q; + int *rtc_minq; + const int bit_depth = cm->seq_params.bit_depth; + ASSIGN_MINQ_TABLE(bit_depth, rtc_minq); + + if (frame_is_intra_only(cm)) { + active_best_quality = rc->best_quality; + // Handle the special case for key frames forced when we have reached + // the maximum key frame interval. Here force the Q to a range + // based on the ambient Q to reduce the risk of popping. + if (rc->this_key_frame_forced) { + int qindex = rc->last_boosted_qindex; + double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); + int delta_qindex = av1_compute_qdelta(rc, last_boosted_q, + (last_boosted_q * 0.75), bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else if (current_frame->frame_number > 0) { + // not first frame of one pass and kf_boost is set + double q_adj_factor = 1.0; + double q_val; + + active_best_quality = + get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth); + + // Allow somewhat lower kf minq with small image formats. + if ((width * height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. + q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth); + active_best_quality += + av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth); + } + } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc && + cpi->oxcf.gf_cbr_boost_pct && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + // Use the lower of active_worst_quality and recent + // average Q as basis for GF/ARF best Q limit unless last frame was + // a key frame. + if (rc->frames_since_key > 1 && + rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) { + q = rc->avg_frame_qindex[INTER_FRAME]; + } else { + q = active_worst_quality; + } + active_best_quality = get_gf_active_quality(rc, q, bit_depth); + } else { + // Use the lower of active_worst_quality and recent/average Q. + if (current_frame->frame_number > 1) { + if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) + active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]]; + else + active_best_quality = rtc_minq[active_worst_quality]; + } else { + if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality) + active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]]; + else + active_best_quality = rtc_minq[active_worst_quality]; + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + // Limit Q range for the adaptive loop. 
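+ // For key frames the qdelta below moves top_index to the qindex that
+ // spends roughly 2x the bits per mb of active_worst_quality, i.e. a
+ // lower (better quality) cap than for inter frames.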
+ if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced && + !(current_frame->frame_number == 0)) { + int qdelta = 0; + aom_clear_system_state(); + qdelta = av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type, + active_worst_quality, 2.0, bit_depth); + *top_index = active_worst_quality + qdelta; + *top_index = AOMMAX(*top_index, *bottom_index); + } + + // Special case code to try and match quality with forced key frames + if (current_frame->frame_type == KEY_FRAME && rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality, width, height); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static int gf_group_pyramid_level(const GF_GROUP *gf_group, int gf_index) { + return gf_group->layer_depth[gf_index]; +} + +static int get_active_cq_level(const RATE_CONTROL *rc, + const AV1EncoderConfig *const oxcf, + int intra_only, SUPERRES_MODE superres_mode, + int superres_denom) { + static const double cq_adjust_threshold = 0.1; + int active_cq_level = oxcf->cq_level; + (void)intra_only; + if (oxcf->rc_mode == AOM_CQ || oxcf->rc_mode == AOM_Q) { + // printf("Superres %d %d %d = %d\n", superres_denom, intra_only, + // rc->frames_to_key, !(intra_only && rc->frames_to_key <= 1)); + if ((superres_mode == SUPERRES_QTHRESH || superres_mode == SUPERRES_AUTO) && + superres_denom != SCALE_NUMERATOR) { + int mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME_SOLO; + if (intra_only && rc->frames_to_key <= 1) { + mult = 0; + } else if (intra_only) { + mult = SUPERRES_QADJ_PER_DENOM_KEYFRAME; + } else { + mult = SUPERRES_QADJ_PER_DENOM_ARFFRAME; + } + active_cq_level = AOMMAX( + active_cq_level - ((superres_denom - SCALE_NUMERATOR) * mult), 0); + } + } + if (oxcf->rc_mode == AOM_CQ && rc->total_target_bits > 0) { + const double x = (double)rc->total_actual_bits / rc->total_target_bits; + if (x < cq_adjust_threshold) { + active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold); + } + } + return active_cq_level; +} + +static int get_q_using_fixed_offsets(const AV1EncoderConfig *const oxcf, + const RATE_CONTROL *const rc, + const GF_GROUP *const gf_group, + int gf_index, int cq_level, + int bit_depth) { + assert(oxcf->use_fixed_qp_offsets); + assert(oxcf->rc_mode == AOM_Q); + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_index]; + + int offset_idx = -1; + if (update_type == KF_UPDATE) { + if (rc->frames_to_key == 1) { + // Image / intra-only coding: ignore offsets. + return cq_level; + } + offset_idx = 0; + } else if (update_type == ARF_UPDATE || update_type == GF_UPDATE) { + offset_idx = 1; + } else if (update_type == INTNL_ARF_UPDATE) { + offset_idx = + AOMMIN(gf_group->layer_depth[gf_index], FIXED_QP_OFFSET_COUNT - 1); + } else { // Leaf level / overlay frame. + assert(update_type == LF_UPDATE || update_type == OVERLAY_UPDATE || + update_type == INTNL_OVERLAY_UPDATE); + return cq_level; // Directly Return worst quality allowed. 
+ }
+ assert(offset_idx >= 0 && offset_idx < FIXED_QP_OFFSET_COUNT);
+ assert(oxcf->fixed_qp_offsets[offset_idx] >= 0);
+
+ // Get qindex offset, by first converting to 'q' and then back.
+ const double q_val_orig = av1_convert_qindex_to_q(cq_level, bit_depth);
+ const double q_val_target =
+ AOMMAX(q_val_orig - oxcf->fixed_qp_offsets[offset_idx], 0.0);
+ const int delta_qindex =
+ av1_compute_qdelta(rc, q_val_orig, q_val_target, bit_depth);
+ return AOMMAX(cq_level + delta_qindex, 0);
+}
+
+static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int cq_level =
+ get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
+ cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params.bit_depth;
+
+ if (oxcf->use_fixed_qp_offsets) {
+ return get_q_using_fixed_offsets(oxcf, rc, &cpi->gf_group,
+ cpi->gf_group.index, cq_level, bit_depth);
+ }
+
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
+ int q;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+ if (frame_is_intra_only(cm)) {
+ if (oxcf->rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (rc->this_key_frame_forced) {
+ const int qindex = rc->last_boosted_qindex;
+ const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.75, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else { // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+
+ active_best_quality =
+ get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta on active_best_quality.
+ {
+ const double q_val =
+ av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ q = (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ ? rc->avg_frame_qindex[INTER_FRAME]
+ : rc->avg_frame_qindex[KEY_FRAME];
+ // For constrained quality don't allow Q less than the cq level
+ if (oxcf->rc_mode == AOM_CQ) {
+ if (q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ // Constrained quality uses a slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+ } else if (oxcf->rc_mode == AOM_Q) {
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ (cpi->refresh_alt_ref_frame)
+ ?
av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth) + : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else { + active_best_quality = get_gf_active_quality(rc, q, bit_depth); + } + } else { + if (oxcf->rc_mode == AOM_Q) { + const int qindex = cq_level; + const double q_val = av1_convert_qindex_to_q(qindex, bit_depth); + const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, + 0.70, 1.0, 0.85, 1.0 }; + const int delta_qindex = av1_compute_qdelta( + rc, q_val, + q_val * delta_rate[current_frame->frame_number % FIXED_GF_INTERVAL], + bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + } else { + // Use the lower of active_worst_quality and recent/average Q. + active_best_quality = (current_frame->frame_number > 1) + ? inter_minq[rc->avg_frame_qindex[INTER_FRAME]] + : inter_minq[rc->avg_frame_qindex[KEY_FRAME]]; + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) { + active_best_quality = cq_level; + } + } + } + + // Clip the active best and worst quality values to limits + active_best_quality = + clamp(active_best_quality, rc->best_quality, rc->worst_quality); + active_worst_quality = + clamp(active_worst_quality, active_best_quality, rc->worst_quality); + + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + + // Limit Q range for the adaptive loop. + { + int qdelta = 0; + aom_clear_system_state(); + if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced && + !(current_frame->frame_number == 0)) { + qdelta = av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type, + active_worst_quality, 2.0, bit_depth); + } else if (!rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + qdelta = + av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type, + active_worst_quality, 1.75, bit_depth); + } + *top_index = active_worst_quality + qdelta; + *top_index = AOMMAX(*top_index, *bottom_index); + } + + if (oxcf->rc_mode == AOM_Q) { + q = active_best_quality; + // Special case code to try and match quality with forced key frames + } else if ((current_frame->frame_type == KEY_FRAME) && + rc->this_key_frame_forced) { + q = rc->last_boosted_qindex; + } else { + q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, + active_worst_quality, width, height); + if (q > *top_index) { + // Special case when we are targeting the max allowed rate + if (rc->this_frame_target >= rc->max_frame_bandwidth) + *top_index = q; + else + q = *top_index; + } + } + + assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality); + assert(*bottom_index <= rc->worst_quality && + *bottom_index >= rc->best_quality); + assert(q <= rc->worst_quality && q >= rc->best_quality); + return q; +} + +static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = { + 1.00, // INTER_NORMAL + 1.50, // GF_ARF_LOW + 2.00, // GF_ARF_STD + 2.00, // KF_STD +}; + +int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) { + const RATE_FACTOR_LEVEL rf_lvl = get_rate_factor_level(&cpi->gf_group); + const FRAME_TYPE frame_type = (rf_lvl == KF_STD) ? 
KEY_FRAME : INTER_FRAME;
+ double rate_factor;
+
+ rate_factor = rate_factor_deltas[rf_lvl];
+ if (rf_lvl == GF_ARF_LOW) {
+ rate_factor -= (cpi->gf_group.layer_depth[cpi->gf_group.index] - 2) * 0.1;
+ rate_factor = AOMMAX(rate_factor, 1.0);
+ }
+ return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q, rate_factor,
+ cpi->common.seq_params.bit_depth);
+}
+
+// This unrestricted Q selection in CQ mode is useful when testing new features,
+// but may lead to Q being out of range under current RC restrictions.
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+static int rc_pick_q_and_bounds_one_pass_cq(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int cq_level =
+ get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
+ cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params.bit_depth;
+ const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth);
+ (void)width;
+ (void)height;
+ *top_index = q;
+ *bottom_index = q;
+
+ return q;
+}
+#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE
+
+#define STATIC_MOTION_THRESH 95
+static void get_intra_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
+ int height, int *active_best,
+ int *active_worst, int cq_level,
+ int is_fwd_kf) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int active_best_quality;
+ int active_worst_quality = *active_worst;
+ const int bit_depth = cm->seq_params.bit_depth;
+
+ if (rc->frames_to_key == 1 && oxcf->rc_mode == AOM_Q) {
+ // If the next frame is also a key frame or the current frame is the
+ // only frame in the sequence in AOM_Q mode, just use the cq_level
+ // as q.
+ active_best_quality = cq_level;
+ active_worst_quality = cq_level;
+ } else if (is_fwd_kf) {
+ // Handle the special case for forward reference key frames.
+ // Increase the boost because this keyframe is used as a forward and
+ // backward reference.
+ const int qindex = rc->last_boosted_qindex;
+ const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.25, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (rc->this_key_frame_forced) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ double last_boosted_q;
+ int delta_qindex;
+ int qindex;
+
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+ active_best_quality = qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 1.25, bit_depth);
+ active_worst_quality =
+ AOMMIN(qindex + delta_qindex, active_worst_quality);
+ } else {
+ qindex = rc->last_boosted_qindex;
+ last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 0.50, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ }
+ } else {
+ // Not forced keyframe.
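+ // Sketch of what follows: start from a kf-boost based baseline, cut it
+ // for near-static kf groups, then apply a q_adj_factor that accounts
+ // for small image formats and the kf zero-motion measure.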
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ // Baseline value derived from cpi->active_worst_quality and kf boost.
+ active_best_quality =
+ get_kf_active_quality(rc, active_worst_quality, bit_depth);
+
+ if (is_stat_consumption_stage_twopass(cpi) &&
+ cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+ active_best_quality /= 3;
+ }
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Make a further adjustment based on the kf zero motion measure.
+ if (is_stat_consumption_stage_twopass(cpi))
+ q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+
+ // Tweak active_best_quality for AOM_Q mode when superres is on, as this
+ // will be used directly as 'q' later.
+ if (oxcf->rc_mode == AOM_Q &&
+ (cpi->superres_mode == SUPERRES_QTHRESH ||
+ cpi->superres_mode == SUPERRES_AUTO) &&
+ cm->superres_scale_denominator != SCALE_NUMERATOR) {
+ active_best_quality =
+ AOMMAX(active_best_quality -
+ ((cm->superres_scale_denominator - SCALE_NUMERATOR) *
+ SUPERRES_QADJ_PER_DENOM_KEYFRAME),
+ 0);
+ }
+ }
+ *active_best = active_best_quality;
+ *active_worst = active_worst_quality;
+}
+
+static void adjust_active_best_and_worst_quality(const AV1_COMP *cpi,
+ const int is_intrl_arf_boost,
+ int *active_worst,
+ int *active_best) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const int bit_depth = cpi->common.seq_params.bit_depth;
+ int active_best_quality = *active_best;
+ int active_worst_quality = *active_worst;
+ // Extension to max or min Q if undershoot or overshoot is outside
+ // the permitted range.
+ if (cpi->oxcf.rc_mode != AOM_Q) {
+ if (frame_is_intra_only(cm) ||
+ (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || is_intrl_arf_boost ||
+ cpi->refresh_alt_ref_frame))) {
+ active_best_quality -=
+ (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
+ active_worst_quality += (cpi->twopass.extend_maxq / 2);
+ } else {
+ active_best_quality -=
+ (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
+ active_worst_quality += cpi->twopass.extend_maxq;
+ }
+ }
+
+ aom_clear_system_state();
+#ifndef STRICT_RC
+ // Static forced key frames Q restrictions dealt with elsewhere.
+ if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced ||
+ (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+ const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality);
+ active_worst_quality =
+ AOMMAX(active_worst_quality + qdelta, active_best_quality);
+ }
+#endif
+
+ // Modify active_best_quality for downscaled normal frames.
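+ // A rate ratio of 2.0 is used below, so the floor moves to the qindex
+ // that spends roughly twice the bits per mb, clamped at best_quality.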
+ if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
+ int qdelta = av1_compute_qdelta_by_rate(
+ rc, cm->current_frame.frame_type, active_best_quality, 2.0, bit_depth);
+ active_best_quality =
+ AOMMAX(active_best_quality + qdelta, rc->best_quality);
+ }
+
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *active_best = active_best_quality;
+ *active_worst = active_worst_quality;
+}
+
+static int get_q(const AV1_COMP *cpi, const int width, const int height,
+ const int active_worst_quality,
+ const int active_best_quality) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int q;
+
+ if (cpi->oxcf.rc_mode == AOM_Q ||
+ (frame_is_intra_only(cm) && !rc->this_key_frame_forced &&
+ cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
+ rc->frames_to_key > 1)) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames.
+ } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
+ // If static since last kf use better of last boosted and last kf q.
+ if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+ } else {
+ q = AOMMIN(rc->last_boosted_qindex,
+ (active_best_quality + active_worst_quality) / 2);
+ }
+ q = clamp(q, active_best_quality, active_worst_quality);
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > active_worst_quality) {
+ // Special case when we are targeting the max allowed rate.
+ if (rc->this_frame_target < rc->max_frame_bandwidth) {
+ q = active_worst_quality;
+ }
+ }
+ q = AOMMAX(q, active_best_quality);
+ }
+ return q;
+}
+
+// Returns |active_best_quality| for an inter frame.
+// The |active_best_quality| depends on different rate control modes:
+// VBR, Q, CQ, CBR.
+// The returned active_best_quality may be further adjusted in
+// adjust_active_best_and_worst_quality().
+static int get_active_best_quality(const AV1_COMP *const cpi,
+ const int active_worst_quality,
+ const int cq_level, const int gf_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int bit_depth = cm->seq_params.bit_depth;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *gf_group = &cpi->gf_group;
+ const int rc_mode = oxcf->rc_mode;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+ int active_best_quality = 0;
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+ const int is_leaf_frame = !(cpi->refresh_golden_frame ||
+ cpi->refresh_alt_ref_frame || is_intrl_arf_boost);
+ const int is_overlay_frame = rc->is_src_frame_alt_ref;
+
+ if (is_leaf_frame || is_overlay_frame) {
+ if (rc_mode == AOM_Q) return cq_level;
+
+ active_best_quality = inter_minq[active_worst_quality];
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ return active_best_quality;
+ }
+
+ // TODO(chengchen): can we remove this condition?
+ if (rc_mode == AOM_Q && !cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
+ return cq_level;
+ }
+
+ // Determine active_best_quality for frames that are not leaf or overlay.
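+ // Outline: base q is the lower of active_worst_quality and the recent
+ // inter average, gf boost then maps it through the minq tables, and
+ // arf_boost_factor interpolates toward the high-motion floor; internal
+ // ARFs are further averaged toward active_worst_quality per pyramid
+ // level.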
+ int q = active_worst_quality;
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = rc->avg_frame_qindex[INTER_FRAME];
+ }
+ if (rc_mode == AOM_CQ && q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ // Constrained quality uses a slightly lower active best.
+ if (rc_mode == AOM_CQ) active_best_quality = active_best_quality * 15 / 16;
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
+ active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+ if (!is_intrl_arf_boost) return active_best_quality;
+
+ if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = rc->arf_q;
+ int this_height = gf_group_pyramid_level(gf_group, gf_index);
+ while (this_height > 1) {
+ active_best_quality = (active_best_quality + active_worst_quality + 1) / 2;
+ --this_height;
+ }
+ return active_best_quality;
+}
+
+static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
+ int height, int gf_index,
+ int *bottom_index, int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *gf_group = &cpi->gf_group;
+ const int cq_level =
+ get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
+ cm->superres_scale_denominator);
+ const int bit_depth = cm->seq_params.bit_depth;
+
+ if (oxcf->use_fixed_qp_offsets) {
+ return get_q_using_fixed_offsets(oxcf, rc, gf_group, gf_group->index,
+ cq_level, bit_depth);
+ }
+
+ int active_best_quality = 0;
+ int active_worst_quality = rc->active_worst_quality;
+ int q;
+
+ const int is_intrl_arf_boost =
+ gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
+
+ if (frame_is_intra_only(cm)) {
+ const int is_fwd_kf =
+ cm->current_frame.frame_type == KEY_FRAME && cm->show_frame == 0;
+ get_intra_q_and_bounds_two_pass(cpi, width, height, &active_best_quality,
+ &active_worst_quality, cq_level, is_fwd_kf);
+#ifdef STRICT_RC
+ active_best_quality = 0;
+#endif
+ } else {
+#ifdef STRICT_RC
+ // Active best quality limited by previous layer.
+ const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index);
+ active_best_quality =
+ rc->active_best_quality[pyramid_level - 1] +
+ AOMMAX((rc->active_best_quality[pyramid_level - 1] / 10), 5);
+#else
+ active_best_quality =
+ get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index);
+#endif
+
+ // For alt_ref and GF frames (including internal arf frames) adjust the
+ // worst allowed quality as well. This ensures that even on hard
+ // sections we don't clamp the Q at the same value for arf frames and
+ // leaf (non arf) frames. This is important to the TPL model which assumes
+ // Q drops with each arf level.
+ if (!(rc->is_src_frame_alt_ref) &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame ||
+ is_intrl_arf_boost)) {
+ active_worst_quality =
+ (active_best_quality + (3 * active_worst_quality) + 2) / 4;
+ }
+ }
+
+ adjust_active_best_and_worst_quality(
+ cpi, is_intrl_arf_boost, &active_worst_quality, &active_best_quality);
+ q = get_q(cpi, width, height, active_worst_quality, active_best_quality);
+
+ // Special case when we are targeting the max allowed rate.
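+ // That is, when this_frame_target has been clipped to
+ // max_frame_bandwidth, the chosen q may stand above the normal worst
+ // bound rather than being clamped back down.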
+ if (rc->this_frame_target >= rc->max_frame_bandwidth &&
+ q > active_worst_quality) {
+ active_worst_quality = q;
+ }
+
+#ifdef STRICT_RC
+ *top_index = rc->worst_quality;
+#else
+ *top_index = active_worst_quality;
+#endif
+ *bottom_index = active_best_quality;
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+
+ return q;
+}
+
+int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, RATE_CONTROL *rc, int width,
+ int height, int gf_index, int *bottom_index,
+ int *top_index) {
+ int q;
+ // TODO(sarahparker) merge onepass vbr and altref q computation
+ // with two pass
+ const GF_GROUP *gf_group = &cpi->gf_group;
+ if ((cpi->oxcf.rc_mode != AOM_Q ||
+ gf_group->update_type[gf_index] == ARF_UPDATE) &&
+ has_no_stats_stage(cpi)) {
+ if (cpi->oxcf.rc_mode == AOM_CBR)
+ q = rc_pick_q_and_bounds_one_pass_cbr(cpi, width, height, bottom_index,
+ top_index);
+#if USE_UNRESTRICTED_Q_IN_CQ_MODE
+ else if (cpi->oxcf.rc_mode == AOM_CQ)
+ q = rc_pick_q_and_bounds_one_pass_cq(cpi, width, height, bottom_index,
+ top_index);
+#endif // USE_UNRESTRICTED_Q_IN_CQ_MODE
+ else
+ q = rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height, bottom_index,
+ top_index);
+ } else {
+ q = rc_pick_q_and_bounds_two_pass(cpi, width, height, gf_index,
+ bottom_index, top_index);
+ }
+ if (gf_group->update_type[gf_index] == ARF_UPDATE) rc->arf_q = q;
+
+ return q;
+}
+
+void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit) {
+ if (cpi->oxcf.rc_mode == AOM_Q) {
+ *frame_under_shoot_limit = 0;
+ *frame_over_shoot_limit = INT_MAX;
+ } else {
+ // For very small rate targets where the fractional adjustment
+ // may be tiny make sure there is at least a minimum range.
+ const int tolerance =
+ AOMMAX(100, (cpi->sf.hl_sf.recode_tolerance * frame_target) / 100);
+ *frame_under_shoot_limit = AOMMAX(frame_target - tolerance, 0);
+ *frame_over_shoot_limit =
+ AOMMIN(frame_target + tolerance, cpi->rc.max_frame_bandwidth);
+ }
+}
+
+void av1_rc_set_frame_target(AV1_COMP *cpi, int target, int width, int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ rc->this_frame_target = target;
+
+ // Modify frame size target when down-scaled.
+ if (av1_frame_scaled(cm))
+ rc->this_frame_target =
+ (int)(rc->this_frame_target * resize_rate_factor(cpi, width, height));
+
+ // Target rate per SB64 (including partial SB64s).
+ rc->sb64_target_rate =
+ (int)(((int64_t)rc->this_frame_target << 12) / (width * height));
+}
+
+static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
+ // This frame refreshes, meaning the next frames don't unless specified
+ // by the user.
+ RATE_CONTROL *const rc = &cpi->rc;
+ rc->frames_since_golden = 0;
+
+ // Mark the alt ref as done (setting to 0 means no further alt refs pending).
+ rc->source_alt_ref_pending = 0;
+
+ // Set the alternate reference frame active flag.
+ rc->source_alt_ref_active = 1;
+}
+
+static void update_golden_frame_stats(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ const GF_GROUP *const gf_group = &cpi->gf_group;
+
+ // Update the Golden frame usage counts.
+ if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
+ rc->frames_since_golden = 0;
+
+ // If we are not using alt ref in the upcoming group, clear the arf
+ // active flag. In the multi arf group case, if the index is not 0 then
+ // we are overlaying a mid group arf, so we should not reset the flag.
+ if (!rc->source_alt_ref_pending && (gf_group->index == 0))
+ rc->source_alt_ref_active = 0;
+ } else if (cpi->common.show_frame) {
+ rc->frames_since_golden++;
+ }
+}
+
+void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const GF_GROUP *const gf_group = &cpi->gf_group;
+
+ const int is_intrnl_arf =
+ gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
+
+ const int qindex = cm->quant_params.base_qindex;
+
+ // Update rate control heuristics
+ rc->projected_frame_size = (int)(bytes_used << 3);
+
+ // Post encode loop adjustment of Q prediction.
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+ // Keep a record of last Q and ambient average Q.
+ if (current_frame->frame_type == KEY_FRAME) {
+ rc->last_q[KEY_FRAME] = qindex;
+ rc->avg_frame_qindex[KEY_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+ } else {
+ if ((cpi->use_svc && cpi->oxcf.rc_mode == AOM_CBR) ||
+ (!rc->is_src_frame_alt_ref &&
+ !(cpi->refresh_golden_frame || is_intrnl_arf ||
+ cpi->refresh_alt_ref_frame))) {
+ rc->last_q[INTER_FRAME] = qindex;
+ rc->avg_frame_qindex[INTER_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+ rc->ni_frames++;
+ rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params.bit_depth);
+ rc->avg_q = rc->tot_q / rc->ni_frames;
+ // Calculate the average Q for normal inter frames (not key or GFU
+ // frames).
+ rc->ni_tot_qi += qindex;
+ rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+ }
+ }
+
+ // Keep a record of the last boosted (KF/GF/ARF) Q value.
+ // If the current frame is coded at a lower Q then we also update it.
+ // If all mbs in this group are skipped only update if the Q value is
+ // better than that already stored.
+ // This is used to help set quality in forced key frames to reduce popping.
+ if ((qindex < rc->last_boosted_qindex) ||
+ (current_frame->frame_type == KEY_FRAME) ||
+ (!rc->constrained_gf_group &&
+ (cpi->refresh_alt_ref_frame || is_intrnl_arf ||
+ (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+ rc->last_boosted_qindex = qindex;
+ }
+ if (current_frame->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex;
+
+ update_buffer_level(cpi, rc->projected_frame_size);
+ rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth;
+
+ // Rolling monitors of whether we are over- or under-spending, used to help
+ // regulate min and max Q in two pass.
+ if (av1_frame_scaled(cm))
+ rc->this_frame_target =
+ (int)(rc->this_frame_target /
+ resize_rate_factor(cpi, cm->width, cm->height));
+ if (current_frame->frame_type != KEY_FRAME) {
+ rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
+ rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+ rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
+ rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+ rc->long_rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
+ rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+ rc->long_rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
+ rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
+ }
+
+ // Actual bits spent
+ rc->total_actual_bits += rc->projected_frame_size;
+ rc->total_target_bits += cm->show_frame ?
rc->avg_frame_bandwidth : 0; + + rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; + + if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame && + (current_frame->frame_type != KEY_FRAME)) + // Update the alternate reference frame stats as appropriate. + update_alt_ref_frame_stats(cpi); + else + // Update the Golden frame stats as appropriate. + update_golden_frame_stats(cpi); + + if (current_frame->frame_type == KEY_FRAME) rc->frames_since_key = 0; + // if (current_frame->frame_number == 1 && cm->show_frame) + /* + rc->this_frame_target = + (int)(rc->this_frame_target / resize_rate_factor(cpi, cm->width, + cm->height)); + */ +} + +void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) { + // Update buffer level with zero size, update frame counters, and return. + update_buffer_level(cpi, 0); + cpi->rc.frames_since_key++; + cpi->rc.frames_to_key--; + cpi->rc.rc_2_frame = 0; + cpi->rc.rc_1_frame = 0; +} + +int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, + int best_qindex, int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const double mid_q = av1_convert_qindex_to_q(mid, bit_depth); + if (mid_q < desired_q) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + assert(av1_convert_qindex_to_q(low, bit_depth) >= desired_q || + low == worst_qindex); + return low; +} + +int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, + aom_bit_depth_t bit_depth) { + const int start_index = + av1_find_qindex(qstart, bit_depth, rc->best_quality, rc->worst_quality); + const int target_index = + av1_find_qindex(qtarget, bit_depth, rc->best_quality, rc->worst_quality); + return target_index - start_index; +} + +// Find q_index for the desired_bits_per_mb, within [best_qindex, worst_qindex], +// assuming 'correction_factor' is 1.0. +// To be precise, 'q_index' is the smallest integer, for which the corresponding +// bits per mb <= desired_bits_per_mb. +// If no such q index is found, returns 'worst_qindex'. +static int find_qindex_by_rate(int desired_bits_per_mb, + aom_bit_depth_t bit_depth, FRAME_TYPE frame_type, + int best_qindex, int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const int mid_bits_per_mb = + av1_rc_bits_per_mb(frame_type, mid, 1.0, bit_depth); + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + assert(av1_rc_bits_per_mb(frame_type, low, 1.0, bit_depth) <= + desired_bits_per_mb || + low == worst_qindex); + return low; +} + +int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, + int qindex, double rate_target_ratio, + aom_bit_depth_t bit_depth) { + // Look up the current projected bits per block for the base index + const int base_bits_per_mb = + av1_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth); + + // Find the target bits per mb based on the base value and given ratio. 
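+ // Worked example (hypothetical numbers): base_bits_per_mb == 400 and
+ // rate_target_ratio == 2.0 give target_bits_per_mb == 800; the search
+ // below then reports how far qindex must move to reach that rate.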
+ const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+
+ const int target_index =
+ find_qindex_by_rate(target_bits_per_mb, bit_depth, frame_type,
+ rc->best_quality, rc->worst_quality);
+ return target_index - qindex;
+}
+
+void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
+ RATE_CONTROL *const rc) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Special case code for 1 pass fixed Q mode tests
+ if ((has_no_stats_stage(cpi)) && (oxcf->rc_mode == AOM_Q)) {
+ rc->max_gf_interval = FIXED_GF_INTERVAL;
+ rc->min_gf_interval = FIXED_GF_INTERVAL;
+ rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ // Set maximum gf/arf interval.
+ rc->max_gf_interval = oxcf->max_gf_interval;
+ rc->min_gf_interval = oxcf->min_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->width, oxcf->height, cpi->framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ cpi->framerate, rc->min_gf_interval);
+ /*
+ * Extended max interval for genuinely static scenes like slide shows.
+ * The number of stats available in the case of LAP is limited,
+ * hence setting to max_gf_interval.
+ */
+ if (cpi->lap_enabled)
+ rc->static_scene_max_gf_interval = rc->max_gf_interval + 1;
+ else
+ rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
+
+ if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+ rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+ // Clamp min to max
+ rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
+ }
+}
+
+void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int vbr_max_bits;
+ const int MBs = av1_get_MBs(width, height);
+
+ rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+ rc->min_frame_bandwidth =
+ (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
+
+ rc->min_frame_bandwidth =
+ AOMMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+ // A maximum bitrate for a frame is defined.
+ // The baseline for this aligns with HW implementations that
+ // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+ // per 16x16 MB (averaged over a frame). However, this limit is extended if
+ // a very high rate is given on the command line or if the rate cannot
+ // be achieved because of a user-specified max q (e.g. when the user
+ // specifies a lossless encode).
+ vbr_max_bits =
+ (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
+ 100);
+ rc->max_frame_bandwidth =
+ AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+ av1_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR: adjustment to the frame target based on error from previous frames.
+static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
+ const int stats_count =
+ cpi->twopass.stats_buf_ctx->total_stats != NULL
+ ?
(int)cpi->twopass.stats_buf_ctx->total_stats->count
+ : 0;
+ const int frame_window = AOMMIN(
+ 16, (int)(stats_count - (int)cpi->common.current_frame.frame_number));
+
+ if (frame_window > 0) {
+ const int max_delta =
+ AOMMIN(abs((int)(vbr_bits_off_target / frame_window)),
+ (*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100);
+
+ // vbr_bits_off_target > 0 means we have extra bits to spend;
+ // vbr_bits_off_target < 0 means we are currently overshooting.
+ *this_frame_target += (vbr_bits_off_target >= 0) ? max_delta : -max_delta;
+ }
+
+ // Fast redistribution of bits arising from massive local undershoot.
+ // Don't do it for kf, arf, gf or overlay frames.
+ if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
+ rc->vbr_bits_off_target_fast) {
+ int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
+ int fast_extra_bits;
+ fast_extra_bits = (int)AOMMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+ fast_extra_bits = (int)AOMMIN(
+ fast_extra_bits,
+ AOMMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
+ *this_frame_target += (int)fast_extra_bits;
+ rc->vbr_bits_off_target_fast -= fast_extra_bits;
+ }
+}
+
+void av1_set_target_rate(AV1_COMP *cpi, int width, int height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target_rate = rc->base_frame_target;
+
+ // Correction to rate target based on prior over- or under-shoot.
+ if (cpi->oxcf.rc_mode == AOM_VBR || cpi->oxcf.rc_mode == AOM_CQ)
+ vbr_rate_correction(cpi, &target_rate);
+ av1_rc_set_frame_target(cpi, target_rate, width, height);
+}
+
+int av1_calc_pframe_target_size_one_pass_vbr(
+ const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) {
+ static const int af_ratio = 10;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int64_t target;
+#if USE_ALTREF_FOR_ONE_PASS
+ if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE ||
+ frame_update_type == ARF_UPDATE) {
+ target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval *
+ af_ratio) /
+ (rc->baseline_gf_interval + af_ratio - 1);
+ } else {
+ target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
+ (rc->baseline_gf_interval + af_ratio - 1);
+ }
+ if (target > INT_MAX) target = INT_MAX;
+#else
+ target = rc->avg_frame_bandwidth;
+#endif
+ return av1_rc_clamp_pframe_target_size(cpi, (int)target, frame_update_type);
+}
+
+int av1_calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+ static const int kf_ratio = 25;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const int target = rc->avg_frame_bandwidth * kf_ratio;
+ return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+int av1_calc_pframe_target_size_one_pass_cbr(
+ const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) {
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+ const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
+ int min_frame_target =
+ AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+ int target;
+
+ if (oxcf->gf_cbr_boost_pct) {
+ const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+ if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) {
+ target =
+ (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ } else {
+ target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ }
+ } else {
+ target = rc->avg_frame_bandwidth;
+ }
+ if
+  if (cpi->use_svc) {
+    // Note that for layers, avg_frame_bandwidth is the cumulative
+    // per-frame-bandwidth. For the target size of this frame, use the
+    // layer average frame size (i.e., non-cumulative per-frame-bw).
+    int layer =
+        LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id,
+                         cpi->svc.number_temporal_layers);
+    const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+    target = lc->avg_frame_size;
+    min_frame_target = AOMMAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
+  }
+  if (diff > 0) {
+    // Lower the target bandwidth for this frame.
+    const int pct_low = (int)AOMMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    target -= (target * pct_low) / 200;
+  } else if (diff < 0) {
+    // Increase the target bandwidth for this frame.
+    const int pct_high =
+        (int)AOMMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+    target += (target * pct_high) / 200;
+  }
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    const int max_rate =
+        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+    target = AOMMIN(target, max_rate);
+  }
+  return AOMMAX(min_frame_target, target);
+}
+
+int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  int target;
+  if (cpi->common.current_frame.frame_number == 0) {
+    target = ((rc->starting_buffer_level / 2) > INT_MAX)
+                 ? INT_MAX
+                 : (int)(rc->starting_buffer_level / 2);
+  } else {
+    int kf_boost = 32;
+    double framerate = cpi->framerate;
+
+    kf_boost = AOMMAX(kf_boost, (int)(2 * framerate - 16));
+    if (rc->frames_since_key < framerate / 2) {
+      kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
+    }
+    target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+  }
+  return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+static void set_reference_structure_one_pass_rt(AV1_COMP *cpi, int gf_update) {
+  AV1_COMMON *const cm = &cpi->common;
+  ExternalFlags *const ext_flags = &cpi->ext_flags;
+  SVC *const svc = &cpi->svc;
+  // Specify the reference prediction structure, for 1 layer nonrd mode.
+  // Current structure is to use 3 references (LAST, GOLDEN, ALTREF),
+  // where ALT_REF is always behind current by lag_alt frames, and GOLDEN is
+  // either updated on LAST with period baseline_gf_interval (fixed slot)
+  // or always behind current by lag_gld (gld_fixed_slot = 0, lag_gld <= 7).
+  const int gld_fixed_slot = 1;
+  const unsigned int lag_alt = 4;
+  int last_idx = 0;
+  int last_idx_refresh = 0;
+  int gld_idx = 0;
+  int alt_ref_idx = 0;
+  ext_flags->refresh_frame_flags_pending = 1;
+  svc->external_ref_frame_config = 1;
+  ext_flags->ref_frame_flags = 0;
+  ext_flags->refresh_last_frame = 1;
+  ext_flags->refresh_golden_frame = 0;
+  ext_flags->refresh_alt_ref_frame = 0;
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) svc->ref_idx[i] = 7;
+  for (int i = 0; i < REF_FRAMES; ++i) svc->refresh[i] = 0;
+  // Always reference LAST, GOLDEN, ALTREF
+  ext_flags->ref_frame_flags ^= AOM_LAST_FLAG;
+  ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+  ext_flags->ref_frame_flags ^= AOM_ALT_FLAG;
+  const int sh = 7 - gld_fixed_slot;
+  // Moving index slot for last: 0 - (sh - 1).
+  if (cm->current_frame.frame_number > 1)
+    last_idx = ((cm->current_frame.frame_number - 1) % sh);
+  // Moving index for refresh of last: one ahead for next frame.
+  last_idx_refresh = (cm->current_frame.frame_number % sh);
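With gld_fixed_slot = 1 we get sh = 6, so slots 0..5 rotate between LAST and its refresh target, GOLDEN stays parked in slot 6, and ALT_REF trails the current frame by lag_alt. A throwaway sketch that prints the rotation for the first few frames, using the same modulo arithmetic as the code; illustrative only, not part of the encoder:

#include <stdio.h>

int main(void) {
  const int sh = 6;               // 7 - gld_fixed_slot
  const unsigned int lag_alt = 4;
  for (unsigned int frame = 1; frame <= 8; ++frame) {
    int last_idx = (frame > 1) ? (int)((frame - 1) % sh) : 0;
    int last_idx_refresh = (int)(frame % sh);
    int alt_ref_idx = (frame > lag_alt) ? (int)((frame - lag_alt) % sh) : 0;
    printf("frame %u: LAST=%d refresh->%d ALT_REF=%d GOLDEN=6\n", frame,
           last_idx, last_idx_refresh, alt_ref_idx);
  }
  return 0;
}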
+  gld_idx = 6;
+  if (!gld_fixed_slot) {
+    gld_idx = 7;
+    const unsigned int lag_gld = 7;  // Must be <= 7.
+    // Moving index for gld_ref, lag behind current by gld_interval frames.
+    if (cm->current_frame.frame_number > lag_gld)
+      gld_idx = ((cm->current_frame.frame_number - lag_gld) % sh);
+  }
+  // Moving index for alt_ref, lag behind LAST by lag_alt frames.
+  if (cm->current_frame.frame_number > lag_alt)
+    alt_ref_idx = ((cm->current_frame.frame_number - lag_alt) % sh);
+  svc->ref_idx[0] = last_idx;          // LAST
+  svc->ref_idx[1] = last_idx_refresh;  // LAST2 (for refresh of last).
+  svc->ref_idx[3] = gld_idx;           // GOLDEN
+  svc->ref_idx[6] = alt_ref_idx;       // ALT_REF
+  // Refresh this slot, which will become LAST on next frame.
+  svc->refresh[last_idx_refresh] = 1;
+  // Update GOLDEN on period for fixed slot case.
+  if (gld_fixed_slot && gf_update) {
+    ext_flags->refresh_golden_frame = 1;
+    svc->refresh[gld_idx] = 1;
+  }
+}
+
+#define DEFAULT_KF_BOOST_RT 2300
+#define DEFAULT_GF_BOOST_RT 2000
+
+void av1_get_one_pass_rt_params(AV1_COMP *cpi,
+                                EncodeFrameParams *const frame_params,
+                                unsigned int frame_flags) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *const gf_group = &cpi->gf_group;
+  ResizePendingParams *const resize_pending_params =
+      &cpi->resize_pending_params;
+  int gf_update = 0;
+  int target;
+  const int resize_pending =
+      (resize_pending_params->width && resize_pending_params->height &&
+       (cm->width != resize_pending_params->width ||
+        cm->height != resize_pending_params->height));
+  // Turn this on to explicitly set the reference structure rather than
+  // relying on internal/default structure.
+  const int set_reference_structure = 1;
+  if (cpi->use_svc) {
+    av1_update_temporal_layer_framerate(cpi);
+    av1_restore_layer_context(cpi);
+  }
+  if ((!cpi->use_svc && rc->frames_to_key == 0) ||
+      (cpi->use_svc && cpi->svc.spatial_layer_id == 0 &&
+       cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) ||
+      (frame_flags & FRAMEFLAGS_KEY)) {
+    frame_params->frame_type = KEY_FRAME;
+    rc->this_key_frame_forced =
+        cm->current_frame.frame_number != 0 && rc->frames_to_key == 0;
+    rc->frames_to_key = cpi->oxcf.key_freq;
+    rc->kf_boost = DEFAULT_KF_BOOST_RT;
+    rc->source_alt_ref_active = 0;
+    gf_group->update_type[gf_group->index] = KF_UPDATE;
+    if (cpi->use_svc && cm->current_frame.frame_number > 0)
+      av1_svc_reset_temporal_layers(cpi, 1);
+  } else {
+    frame_params->frame_type = INTER_FRAME;
+    gf_group->update_type[gf_group->index] = LF_UPDATE;
+  }
+  // GF update based on frames_till_gf_update_due, also
+  // force update on resize pending frame.
+  if ((resize_pending || rc->frames_till_gf_update_due == 0) &&
+      cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+      av1_cyclic_refresh_set_golden_update(cpi);
+    else
+      rc->baseline_gf_interval = MAX_GF_INTERVAL;
+    if (rc->baseline_gf_interval > rc->frames_to_key)
+      rc->baseline_gf_interval = rc->frames_to_key;
+    rc->gfu_boost = DEFAULT_GF_BOOST_RT;
+    rc->constrained_gf_group =
+        (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    gf_group->index = 0;
+    // SVC does not use GF as periodic boost.
+    // TODO(marpan): Find better way to disable this for SVC.
+ if (cpi->use_svc) { + SVC *const svc = &cpi->svc; + rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1; + rc->gfu_boost = 1; + rc->constrained_gf_group = 0; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + for (int layer = 0; + layer < svc->number_spatial_layers * svc->number_temporal_layers; + ++layer) { + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + lc->rc.baseline_gf_interval = rc->baseline_gf_interval; + lc->rc.gfu_boost = rc->gfu_boost; + lc->rc.constrained_gf_group = rc->constrained_gf_group; + lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due; + lc->group_index = 0; + } + } + gf_group->size = rc->baseline_gf_interval; + gf_group->update_type[0] = + (frame_params->frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE; + gf_update = 1; + } + if (cpi->oxcf.rc_mode == AOM_CBR) { + if (frame_params->frame_type == KEY_FRAME) { + target = av1_calc_iframe_target_size_one_pass_cbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_cbr( + cpi, gf_group->update_type[gf_group->index]); + } + } else { + if (frame_params->frame_type == KEY_FRAME) { + target = av1_calc_iframe_target_size_one_pass_vbr(cpi); + } else { + target = av1_calc_pframe_target_size_one_pass_vbr( + cpi, gf_group->update_type[gf_group->index]); + } + } + av1_rc_set_frame_target(cpi, target, cm->width, cm->height); + rc->base_frame_target = target; + if (set_reference_structure && cpi->oxcf.speed >= 6 && + cm->number_spatial_layers == 1 && cm->number_temporal_layers == 1) + set_reference_structure_one_pass_rt(cpi, gf_update); + cm->current_frame.frame_type = frame_params->frame_type; +} diff --git a/libs/libaom/src/av1/encoder/ratectrl.h b/libs/libaom/src/av1/encoder/ratectrl.h new file mode 100644 index 000000000..c46378663 --- /dev/null +++ b/libs/libaom/src/av1/encoder/ratectrl.h @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RATECTRL_H_ +#define AOM_AV1_ENCODER_RATECTRL_H_ + +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" + +#include "aom_ports/mem.h" + +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Bits Per MB at different Q (Multiplied by 512) +#define BPER_MB_NORMBITS 9 + +// Use this macro to turn on/off use of alt-refs in one-pass mode. +#define USE_ALTREF_FOR_ONE_PASS 1 + +// Threshold used to define if a KF group is static (e.g. a slide show). +// Essentially, this means that no frame in the group has more than 1% of MBs +// that are not marked as coded with 0,0 motion in the first pass. +#define STATIC_KF_GROUP_THRESH 99 +#define STATIC_KF_GROUP_FLOAT_THRESH 0.99 + +// The maximum duration of a GF group that is static (e.g. a slide show). +#define MAX_STATIC_GF_GROUP_LENGTH 250 + +// Minimum and maximum height for the new pyramid structure. +// (Old structure supports height = 1, but does NOT support height = 4). 
+#define MIN_PYRAMID_LVL 0
+#define MAX_PYRAMID_LVL 4
+
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 32
+#define FIXED_GF_INTERVAL 8  // Used in some testing modes only
+#define MAX_GF_LENGTH_LAP 16
+
+#define MAX_NUM_GF_INTERVALS 15
+
+#define MAX_ARF_LAYERS 6
+// #define STRICT_RC
+
+typedef struct {
+  int resize_width;
+  int resize_height;
+  uint8_t superres_denom;
+} size_params_type;
+
+enum {
+  INTER_NORMAL,
+  GF_ARF_LOW,
+  GF_ARF_STD,
+  KF_STD,
+  RATE_FACTOR_LEVELS
+} UENUM1BYTE(RATE_FACTOR_LEVEL);
+
+enum {
+  KF_UPDATE,
+  LF_UPDATE,
+  GF_UPDATE,
+  ARF_UPDATE,
+  OVERLAY_UPDATE,
+  INTNL_OVERLAY_UPDATE,  // Internal Overlay Frame
+  INTNL_ARF_UPDATE,      // Internal Altref Frame
+  FRAME_UPDATE_TYPES
+} UENUM1BYTE(FRAME_UPDATE_TYPE);
+
+typedef struct {
+  // Rate targeting variables
+  int base_frame_target;  // A baseline frame target before adjustment
+                          // for previous under or over shoot.
+  int this_frame_target;  // Actual frame target after rc adjustment.
+
+  // gop bit budget
+  int64_t gf_group_bits;
+
+  int projected_frame_size;
+  int sb64_target_rate;
+  int last_q[FRAME_TYPES];  // Separate values for Intra/Inter
+  int last_boosted_qindex;  // Last boosted GF/KF/ARF q
+  int last_kf_qindex;       // Q index of the last key frame coded.
+
+  int gfu_boost;
+  int kf_boost;
+
+  double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+  int frames_since_golden;
+  int frames_till_gf_update_due;
+
+  // number of determined gf group length left
+  int intervals_till_gf_calculate_due;
+  // stores gf group length intervals
+  int gf_intervals[MAX_NUM_GF_INTERVALS];
+  // the current index in gf_intervals
+  int cur_gf_index;
+
+  int min_gf_interval;
+  int max_gf_interval;
+  int static_scene_max_gf_interval;
+  int baseline_gf_interval;
+  int constrained_gf_group;
+  int frames_to_key;
+  int frames_since_key;
+  int this_key_frame_forced;
+  int next_key_frame_forced;
+  int source_alt_ref_pending;
+  int source_alt_ref_active;
+  int is_src_frame_alt_ref;
+  int sframe_due;
+
+  int avg_frame_bandwidth;  // Average frame size target for clip
+  int min_frame_bandwidth;  // Minimum allocation used for any frame
+  int max_frame_bandwidth;  // Maximum burst rate allowed for a frame.
+  int prev_avg_frame_bandwidth;
+
+  int ni_av_qi;
+  int ni_tot_qi;
+  int ni_frames;
+  int avg_frame_qindex[FRAME_TYPES];
+  double tot_q;
+  double avg_q;
+
+  int64_t buffer_level;
+  int64_t bits_off_target;
+  int64_t vbr_bits_off_target;
+  int64_t vbr_bits_off_target_fast;
+
+  int decimation_factor;
+  int decimation_count;
+
+  int rolling_target_bits;
+  int rolling_actual_bits;
+
+  int long_rolling_target_bits;
+  int long_rolling_actual_bits;
+
+  int rate_error_estimate;
+
+  int64_t total_actual_bits;
+  int64_t total_target_bits;
+  int64_t total_target_vs_actual;
+
+  int worst_quality;
+  int best_quality;
+
+  int64_t starting_buffer_level;
+  int64_t optimal_buffer_level;
+  int64_t maximum_buffer_size;
+
+  // rate control history for last frame(1) and the frame before(2).
+  // -1: undershot
+  //  1: overshoot
+  //  0: not initialized.
+  int rc_1_frame;
+  int rc_2_frame;
+  int q_1_frame;
+  int q_2_frame;
+
+  float_t arf_boost_factor;
+  // Q index used for ALT frame
+  int arf_q;
+  int active_worst_quality;
+  int active_best_quality[MAX_ARF_LAYERS + 1];
+  int base_layer_qp;
+
+  // Total number of stats used only for kf_boost calculation.
+  int num_stats_used_for_kf_boost;
+  // Total number of stats used only for gfu_boost calculation.
+  int num_stats_used_for_gfu_boost;
+  // Total number of stats required by gfu_boost calculation.
+  int num_stats_required_for_gfu_boost;
+  int next_is_fwd_key;
+  int enable_scenecut_detection;
+} RATE_CONTROL;
+
+struct AV1_COMP;
+struct AV1EncoderConfig;
+
+void av1_rc_init(const struct AV1EncoderConfig *oxcf, int pass,
+                 RATE_CONTROL *rc);
+
+int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+                           double correction_factor, aom_bit_depth_t bit_depth);
+
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
+
+void av1_rc_init_minq_luts(void);
+
+int av1_rc_get_default_min_gf_interval(int width, int height,
+                                       double framerate);
+// Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
+
+// Generally at the high level, the following flow is expected
+// to be enforced for rate control:
+// First call per frame, one of:
+//   av1_rc_get_first_pass_params()
+//   av1_rc_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by one of:
+//   av1_rc_postencode_update()
+//   av1_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the av1_rc_get_..._params() functions and
+// updated during the av1_rc_postencode_update...() functions.
+// The only exceptions are av1_rc_drop_frame() and
+// av1_rc_update_rate_correction_factors() functions.
+
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+struct EncodeFrameParams;
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
+
+// Updates rate correction factors
+// Changes only the rate correction factors in the rate control structure.
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi, int width,
+                                           int height);
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int av1_rc_drop_frame(struct AV1_COMP *cpi);
+
+// Computes frame size bounds.
+void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
+                                      int this_frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit);
+
+// Picks q and q bounds given the target for bits
+int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, RATE_CONTROL *rc,
+                             int width, int height, int gf_index,
+                             int *bottom_index, int *top_index);
+
+// Estimates q to achieve a target bits per frame
+int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
+                      int active_best_quality, int active_worst_quality,
+                      int width, int height);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor, aom_bit_depth_t bit_depth);
+
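The call-ordering contract described in the comment above is easiest to see as a per-frame driver loop. A minimal sketch, assuming this header is included and using a hypothetical encode_one_frame() as a stand-in for the real encode path; only the av1_rc_* calls are actual declarations from this header:

// Hypothetical glue, not part of libaom: shows the documented call order.
// (Assumes "av1/encoder/ratectrl.h" has been included.)
uint64_t encode_one_frame(struct AV1_COMP *cpi);  // stand-in for the encoder

void rate_control_loop_sketch(struct AV1_COMP *cpi, int num_frames) {
  for (int f = 0; f < num_frames; ++f) {
    // 1) Set per-frame rate control parameters first, via one of the
    //    av1_rc_get_..._params() entry points depending on the pass.
    // 2) In 1-pass CBR, the controller may decide to drop this frame.
    if (av1_rc_drop_frame(cpi)) {
      av1_rc_postencode_update_drop_frame(cpi);
      continue;
    }
    // 3) Encode, then feed the actual size back into the controller.
    av1_rc_postencode_update(cpi, encode_one_frame(cpi));
  }
}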
+// Clamping utilities for bitrate targets for iframes and pframes.
+int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
+                                    int target);
+int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
+                                    int target, uint8_t frame_update_type);
+
+// Find q_index corresponding to desired_q, within [best_qindex, worst_qindex].
+// To be precise, 'q_index' is the smallest integer for which the corresponding
+// q >= desired_q.
+// If no such q index is found, returns 'worst_qindex'.
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+                    int best_qindex, int worst_qindex);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       aom_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio,
+                               aom_bit_depth_t bit_depth);
+
+int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q);
+
+void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height);
+
+void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi,
+                                  RATE_CONTROL *const rc);
+
+void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height);
+
+int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
+
+void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width,
+                             int height);
+
+int av1_calc_pframe_target_size_one_pass_vbr(
+    const struct AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+int av1_calc_iframe_target_size_one_pass_vbr(const struct AV1_COMP *const cpi);
+
+int av1_calc_pframe_target_size_one_pass_cbr(
+    const struct AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type);
+
+int av1_calc_iframe_target_size_one_pass_cbr(const struct AV1_COMP *cpi);
+
+void av1_get_one_pass_rt_params(struct AV1_COMP *cpi,
+                                struct EncodeFrameParams *const frame_params,
+                                unsigned int frame_flags);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_RATECTRL_H_
diff --git a/libs/libaom/src/av1/encoder/rd.c b/libs/libaom/src/av1/encoder/rd.c
new file mode 100644
index 000000000..e48c77119
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/rd.c
@@ -0,0 +1,1332 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/tokenize.h"
+
+#define RD_THRESH_POW 1.25
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for block size.
+// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = {
+  2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16
+};
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA]
+                                            [EXT_TX_SIZES] = {
+                                              { 1, 1, 1, 1 },  // unused
+                                              { 1, 1, 0, 0 },
+                                              { 0, 0, 1, 0 },
+                                            };
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER]
+                                            [EXT_TX_SIZES] = {
+                                              { 1, 1, 1, 1 },  // unused
+                                              { 1, 1, 0, 0 },
+                                              { 0, 0, 1, 0 },
+                                              { 0, 1, 1, 1 },
+                                            };
+
+static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA,
+                                                      EXT_TX_SETS_INTER)] = {
+  {
+      // Intra
+      EXT_TX_SET_DCTONLY,
+      EXT_TX_SET_DTT4_IDTX_1DDCT,
+      EXT_TX_SET_DTT4_IDTX,
+  },
+  {
+      // Inter
+      EXT_TX_SET_DCTONLY,
+      EXT_TX_SET_ALL16,
+      EXT_TX_SET_DTT9_IDTX_1DDCT,
+      EXT_TX_SET_DCT_IDTX,
+  },
+};
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
+                         FRAME_CONTEXT *fc) {
+  int i, j;
+
+  for (i = 0; i < PARTITION_CONTEXTS; ++i)
+    av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i], NULL);
+
+  if (cm->current_frame.skip_mode_info.skip_mode_flag) {
+    for (i = 0; i < SKIP_CONTEXTS; ++i) {
+      av1_cost_tokens_from_cdf(x->skip_mode_cost[i], fc->skip_mode_cdfs[i],
+                               NULL);
+    }
+  }
+
+  for (i = 0; i < SKIP_CONTEXTS; ++i) {
+    av1_cost_tokens_from_cdf(x->skip_cost[i], fc->skip_cdfs[i], NULL);
+  }
+
+  for (i = 0; i < KF_MODE_CONTEXTS; ++i)
+    for (j = 0; j < KF_MODE_CONTEXTS; ++j)
+      av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL);
+
+  for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+    av1_cost_tokens_from_cdf(x->mbmode_cost[i], fc->y_mode_cdf[i], NULL);
+  for (i = 0; i < CFL_ALLOWED_TYPES; ++i)
+    for (j = 0; j < INTRA_MODES; ++j)
+      av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i][j],
+                               fc->uv_mode_cdf[i][j], NULL);
+
+  av1_cost_tokens_from_cdf(x->filter_intra_mode_cost, fc->filter_intra_mode_cdf,
+                           NULL);
+  for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+    if (av1_filter_intra_allowed_bsize(cm, i))
+      av1_cost_tokens_from_cdf(x->filter_intra_cost[i],
+                               fc->filter_intra_cdfs[i], NULL);
+  }
+
+  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+    av1_cost_tokens_from_cdf(x->switchable_interp_costs[i],
+                             fc->switchable_interp_cdf[i], NULL);
+
+  for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) {
+    av1_cost_tokens_from_cdf(x->palette_y_size_cost[i],
+                             fc->palette_y_size_cdf[i], NULL);
+    av1_cost_tokens_from_cdf(x->palette_uv_size_cost[i],
+
fc->palette_uv_size_cdf[i], NULL); + for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) { + av1_cost_tokens_from_cdf(x->palette_y_mode_cost[i][j], + fc->palette_y_mode_cdf[i][j], NULL); + } + } + + for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->palette_uv_mode_cost[i], + fc->palette_uv_mode_cdf[i], NULL); + } + + for (i = 0; i < PALETTE_SIZES; ++i) { + for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) { + av1_cost_tokens_from_cdf(x->palette_y_color_cost[i][j], + fc->palette_y_color_index_cdf[i][j], NULL); + av1_cost_tokens_from_cdf(x->palette_uv_color_cost[i][j], + fc->palette_uv_color_index_cdf[i][j], NULL); + } + } + + int sign_cost[CFL_JOINT_SIGNS]; + av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL); + for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) { + int *cost_u = x->cfl_cost[joint_sign][CFL_PRED_U]; + int *cost_v = x->cfl_cost[joint_sign][CFL_PRED_V]; + if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) { + memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u)); + } else { + const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)]; + av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL); + } + if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) { + memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v)); + } else { + const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)]; + av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL); + } + for (int u = 0; u < CFL_ALPHABET_SIZE; u++) + cost_u[u] += sign_cost[joint_sign]; + } + + for (i = 0; i < MAX_TX_CATS; ++i) + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) + av1_cost_tokens_from_cdf(x->tx_size_cost[i][j], fc->tx_size_cdf[i][j], + NULL); + + for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->txfm_partition_cost[i], + fc->txfm_partition_cdf[i], NULL); + } + + for (i = TX_4X4; i < EXT_TX_SIZES; ++i) { + int s; + for (s = 1; s < EXT_TX_SETS_INTER; ++s) { + if (use_inter_ext_tx_for_txsize[s][i]) { + av1_cost_tokens_from_cdf( + x->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i], + av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]); + } + } + for (s = 1; s < EXT_TX_SETS_INTRA; ++s) { + if (use_intra_ext_tx_for_txsize[s][i]) { + for (j = 0; j < INTRA_MODES; ++j) { + av1_cost_tokens_from_cdf( + x->intra_tx_type_costs[s][i][j], fc->intra_ext_tx_cdf[s][i][j], + av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]); + } + } + } + } + for (i = 0; i < DIRECTIONAL_MODES; ++i) { + av1_cost_tokens_from_cdf(x->angle_delta_cost[i], fc->angle_delta_cdf[i], + NULL); + } + av1_cost_tokens_from_cdf(x->switchable_restore_cost, + fc->switchable_restore_cdf, NULL); + av1_cost_tokens_from_cdf(x->wiener_restore_cost, fc->wiener_restore_cdf, + NULL); + av1_cost_tokens_from_cdf(x->sgrproj_restore_cost, fc->sgrproj_restore_cdf, + NULL); + av1_cost_tokens_from_cdf(x->intrabc_cost, fc->intrabc_cdf, NULL); + + if (!frame_is_intra_only(cm)) { + for (i = 0; i < COMP_INTER_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_inter_cost[i], fc->comp_inter_cdf[i], + NULL); + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < SINGLE_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->single_ref_cost[i][j], + fc->single_ref_cdf[i][j], NULL); + } + } + + for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_ref_type_cost[i], + fc->comp_ref_type_cdf[i], NULL); + } + + for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) { + for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->uni_comp_ref_cost[i][j], + fc->uni_comp_ref_cdf[i][j], NULL); + } + } + + 
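Every loop in av1_fill_mode_rates() performs the same conversion: turning a 15-bit CDF into per-symbol bit costs via av1_cost_tokens_from_cdf(). A rough standalone sketch of that conversion, assuming costs in 1/512-bit units (AV1_PROB_COST_SHIFT is 9 in the encoder) and a floating-point log2 in place of the integer tables the real code uses:

#include <math.h>
#include <stdio.h>

#define CDF_TOP 32768  // 1 << 15, the CDF normalization used by AV1

// Illustrative only: cost of symbol i is roughly -log2(p_i) in 1/512-bit
// units; the encoder computes this with integer tables, not log2().
static void cost_tokens_from_cdf_sketch(int *costs, const int *cdf, int n) {
  int prev = 0;
  for (int i = 0; i < n; ++i) {
    const double p = (double)(cdf[i] - prev) / CDF_TOP;
    costs[i] = (int)lround(-log2(p) * 512);
    prev = cdf[i];
  }
}

int main(void) {
  const int cdf[3] = { 16384, 24576, 32768 };  // p = 1/2, 1/4, 1/4
  int costs[3];
  cost_tokens_from_cdf_sketch(costs, cdf, 3);
  for (int i = 0; i < 3; ++i) printf("cost[%d] = %d\n", i, costs[i]);
  // Expected: 512, 1024, 1024 (i.e. 1, 2 and 2 bits).
  return 0;
}

+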
for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < FWD_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->comp_ref_cost[i][j], fc->comp_ref_cdf[i][j], + NULL); + } + } + + for (i = 0; i < REF_CONTEXTS; ++i) { + for (j = 0; j < BWD_REFS - 1; ++j) { + av1_cost_tokens_from_cdf(x->comp_bwdref_cost[i][j], + fc->comp_bwdref_cdf[i][j], NULL); + } + } + + for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->intra_inter_cost[i], fc->intra_inter_cdf[i], + NULL); + } + + for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->newmv_mode_cost[i], fc->newmv_cdf[i], NULL); + } + + for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->zeromv_mode_cost[i], fc->zeromv_cdf[i], NULL); + } + + for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->refmv_mode_cost[i], fc->refmv_cdf[i], NULL); + } + + for (i = 0; i < DRL_MODE_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->drl_mode_cost0[i], fc->drl_cdf[i], NULL); + } + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) + av1_cost_tokens_from_cdf(x->inter_compound_mode_cost[i], + fc->inter_compound_mode_cdf[i], NULL); + for (i = 0; i < BLOCK_SIZES_ALL; ++i) + av1_cost_tokens_from_cdf(x->compound_type_cost[i], + fc->compound_type_cdf[i], NULL); + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + if (av1_is_wedge_used(i)) { + av1_cost_tokens_from_cdf(x->wedge_idx_cost[i], fc->wedge_idx_cdf[i], + NULL); + } + } + for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) { + av1_cost_tokens_from_cdf(x->interintra_cost[i], fc->interintra_cdf[i], + NULL); + av1_cost_tokens_from_cdf(x->interintra_mode_cost[i], + fc->interintra_mode_cdf[i], NULL); + } + for (i = 0; i < BLOCK_SIZES_ALL; ++i) { + av1_cost_tokens_from_cdf(x->wedge_interintra_cost[i], + fc->wedge_interintra_cdf[i], NULL); + } + for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { + av1_cost_tokens_from_cdf(x->motion_mode_cost[i], fc->motion_mode_cdf[i], + NULL); + } + for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) { + av1_cost_tokens_from_cdf(x->motion_mode_cost1[i], fc->obmc_cdf[i], NULL); + } + for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_idx_cost[i], fc->compound_index_cdf[i], + NULL); + } + for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) { + av1_cost_tokens_from_cdf(x->comp_group_idx_cost[i], + fc->comp_group_idx_cdf[i], NULL); + } + } +} + +// Values are now correlated to quantizer. +static int sad_per_bit_lut_8[QINDEX_RANGE]; +static int sad_per_bit_lut_10[QINDEX_RANGE]; +static int sad_per_bit_lut_12[QINDEX_RANGE]; + +static void init_me_luts_bd(int *bit16lut, int range, + aom_bit_depth_t bit_depth) { + int i; + // Initialize the sad lut tables using a formulaic calculation for now. + // This is to make it easier to resolve the impact of experimental changes + // to the quantizer tables. 
+ for (i = 0; i < range; i++) { + const double q = av1_convert_qindex_to_q(i, bit_depth); + bit16lut[i] = (int)(0.0418 * q + 2.4107); + } +} + +void av1_init_me_luts(void) { + init_me_luts_bd(sad_per_bit_lut_8, QINDEX_RANGE, AOM_BITS_8); + init_me_luts_bd(sad_per_bit_lut_10, QINDEX_RANGE, AOM_BITS_10); + init_me_luts_bd(sad_per_bit_lut_12, QINDEX_RANGE, AOM_BITS_12); +} + +static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, + 8, 8, 4, 4, 2, 2, 1, 0 }; +static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, + 128, 144, 144, + 128 }; + +int av1_compute_rd_mult_based_on_qindex(const AV1_COMP *cpi, int qindex) { + const int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); + int rdmult = q * q; + rdmult = rdmult * 3 + (rdmult * 2 / 3); + switch (cpi->common.seq_params.bit_depth) { + case AOM_BITS_8: break; + case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; + case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + return rdmult > 0 ? rdmult : 1; +} + +int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) { + int64_t rdmult = av1_compute_rd_mult_based_on_qindex(cpi, qindex); + if (is_stat_consumption_stage(cpi) && + (cpi->common.current_frame.frame_type != KEY_FRAME)) { + const GF_GROUP *const gf_group = &cpi->gf_group; + const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; + const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100)); + + rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7; + rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + } + return (int)rdmult; +} + +int av1_get_deltaq_offset(const AV1_COMP *cpi, int qindex, double beta) { + assert(beta > 0.0); + int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); + int newq = (int)rint(q / sqrt(beta)); + int orig_qindex = qindex; + if (newq < q) { + do { + qindex--; + q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); + } while (newq < q && qindex > 0); + } else { + do { + qindex++; + q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth); + } while (newq > q && qindex < MAXQ); + } + return qindex - orig_qindex; +} + +int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) { + assert(beta > 0.0); + const AV1_COMMON *cm = &cpi->common; + int64_t q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0, + cm->seq_params.bit_depth); + int64_t rdmult = 0; + + switch (cm->seq_params.bit_depth) { + case AOM_BITS_8: rdmult = (int)((88 * q * q / beta) / 24); break; + case AOM_BITS_10: + rdmult = ROUND_POWER_OF_TWO((int)((88 * q * q / beta) / 24), 4); + break; + default: + assert(cm->seq_params.bit_depth == AOM_BITS_12); + rdmult = ROUND_POWER_OF_TWO((int)((88 * q * q / beta) / 24), 8); + break; + } + + if (is_stat_consumption_stage(cpi) && + (cm->current_frame.frame_type != KEY_FRAME)) { + const GF_GROUP *const gf_group = &cpi->gf_group; + const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; + const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100)); + + rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7; + rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + } + if (rdmult < 1) rdmult = 1; + return (int)rdmult; +} + +static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) { + double q; + switch (bit_depth) { + case AOM_BITS_8: q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_8) / 4.0; break; + case AOM_BITS_10: + 
q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_10) / 16.0; + break; + case AOM_BITS_12: + q = av1_dc_quant_QTX(qindex, 0, AOM_BITS_12) / 64.0; + break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } + // TODO(debargha): Adjust the function below. + return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8); +} + +void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) { + switch (cpi->common.seq_params.bit_depth) { + case AOM_BITS_8: x->sadperbit = sad_per_bit_lut_8[qindex]; break; + case AOM_BITS_10: x->sadperbit = sad_per_bit_lut_10[qindex]; break; + case AOM_BITS_12: x->sadperbit = sad_per_bit_lut_12[qindex]; break; + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + } +} + +static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) { + int i, bsize, segment_id; + + for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { + const int qindex = clamp( + av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) + + cm->quant_params.y_dc_delta_q, + 0, MAXQ); + const int q = compute_rd_thresh_factor(qindex, cm->seq_params.bit_depth); + + for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + // Threshold here seems unnecessarily harsh but fine given actual + // range of values used for cpi->sf.thresh_mult[]. + const int t = q * rd_thresh_block_size_factor[bsize]; + const int thresh_max = INT_MAX / t; + + for (i = 0; i < MAX_MODES; ++i) + rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max + ? rd->thresh_mult[i] * t / 4 + : INT_MAX; + } + } +} + +void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc, + const int num_planes) { + const int nplanes = AOMMIN(num_planes, PLANE_TYPES); + for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) { + for (int plane = 0; plane < nplanes; ++plane) { + LV_MAP_EOB_COST *pcost = &x->eob_costs[eob_multi_size][plane]; + + for (int ctx = 0; ctx < 2; ++ctx) { + aom_cdf_prob *pcdf; + switch (eob_multi_size) { + case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break; + case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break; + case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break; + case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break; + case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break; + case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break; + case 6: + default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break; + } + av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL); + } + } + } + for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) { + for (int plane = 0; plane < nplanes; ++plane) { + LV_MAP_COEFF_COST *pcost = &x->coeff_costs[tx_size][plane]; + + for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx], + fc->txb_skip_cdf[tx_size][ctx], NULL); + + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; ++ctx) + av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx], + fc->coeff_base_eob_cdf[tx_size][plane][ctx], + NULL); + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->base_cost[ctx], + fc->coeff_base_cdf[tx_size][plane][ctx], NULL); + + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { + pcost->base_cost[ctx][4] = 0; + pcost->base_cost[ctx][5] = pcost->base_cost[ctx][1] + + av1_cost_literal(1) - + pcost->base_cost[ctx][0]; + pcost->base_cost[ctx][6] = + pcost->base_cost[ctx][2] - pcost->base_cost[ctx][1]; + pcost->base_cost[ctx][7] = + pcost->base_cost[ctx][3] - pcost->base_cost[ctx][2]; + } + + for (int ctx = 0; ctx < 
EOB_COEF_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx], + fc->eob_extra_cdf[tx_size][plane][ctx], NULL); + + for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx) + av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx], + fc->dc_sign_cdf[plane][ctx], NULL); + + for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { + int br_rate[BR_CDF_SIZE]; + int prev_cost = 0; + int i, j; + av1_cost_tokens_from_cdf( + br_rate, fc->coeff_br_cdf[AOMMIN(tx_size, TX_32X32)][plane][ctx], + NULL); + // printf("br_rate: "); + // for(j = 0; j < BR_CDF_SIZE; j++) + // printf("%4d ", br_rate[j]); + // printf("\n"); + for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) { + for (j = 0; j < BR_CDF_SIZE - 1; j++) { + pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j]; + } + prev_cost += br_rate[j]; + } + pcost->lps_cost[ctx][i] = prev_cost; + // printf("lps_cost: %d %d %2d : ", tx_size, plane, ctx); + // for (i = 0; i <= COEFF_BASE_RANGE; i++) + // printf("%5d ", pcost->lps_cost[ctx][i]); + // printf("\n"); + } + for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { + pcost->lps_cost[ctx][0 + COEFF_BASE_RANGE + 1] = + pcost->lps_cost[ctx][0]; + for (int i = 1; i <= COEFF_BASE_RANGE; ++i) { + pcost->lps_cost[ctx][i + COEFF_BASE_RANGE + 1] = + pcost->lps_cost[ctx][i] - pcost->lps_cost[ctx][i - 1]; + } + } + } + } +} + +void av1_fill_mv_costs(const FRAME_CONTEXT *fc, int integer_mv, int usehp, + MACROBLOCK *x) { + x->nmvcost[0] = &x->nmv_costs[0][MV_MAX]; + x->nmvcost[1] = &x->nmv_costs[1][MV_MAX]; + x->nmvcost_hp[0] = &x->nmv_costs_hp[0][MV_MAX]; + x->nmvcost_hp[1] = &x->nmv_costs_hp[1][MV_MAX]; + if (integer_mv) { + av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &fc->nmvc, + MV_SUBPEL_NONE); + x->mv_cost_stack = (int **)&x->nmvcost; + } else { + int *(*src)[2] = usehp ? &x->nmvcost_hp : &x->nmvcost; + x->mv_cost_stack = *src; + av1_build_nmv_cost_table( + x->nmv_vec_cost, usehp ? x->nmvcost_hp : x->nmvcost, &fc->nmvc, usehp); + } +} + +void av1_initialize_rd_consts(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->td.mb; + RD_OPT *const rd = &cpi->rd; + + aom_clear_system_state(); + + rd->RDMULT = av1_compute_rd_mult( + cpi, cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q); + + set_error_per_bit(x, rd->RDMULT); + + set_block_thresholds(cm, rd); + + if ((!cpi->sf.rt_sf.use_nonrd_pick_mode && + cpi->oxcf.mv_cost_upd_freq != COST_UPD_OFF) || + frame_is_intra_only(cm) || (cm->current_frame.frame_number & 0x07) == 1) + av1_fill_mv_costs(cm->fc, cm->features.cur_frame_force_integer_mv, + cm->features.allow_high_precision_mv, x); + + if (!cpi->sf.rt_sf.use_nonrd_pick_mode && frame_is_intra_only(cm) && + cm->features.allow_screen_content_tools && + !is_stat_generation_stage(cpi)) { + IntraBCMVCosts *const dv_costs = &cpi->dv_costs; + int *dvcost[2] = { &dv_costs->mv_component[0][MV_MAX], + &dv_costs->mv_component[1][MV_MAX] }; + av1_build_nmv_cost_table(dv_costs->joint_mv, dvcost, &cm->fc->ndvc, + MV_SUBPEL_NONE); + } + + if (!is_stat_generation_stage(cpi)) { + for (int i = 0; i < TRANS_TYPES; ++i) + // IDENTITY: 1 bit + // TRANSLATION: 3 bits + // ROTZOOM: 2 bits + // AFFINE: 3 bits + cpi->gm_info.type_cost[i] = (1 + (i > 0 ? (i == ROTZOOM ? 1 : 2) : 0)) + << AV1_PROB_COST_SHIFT; + } +} + +static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { + // NOTE: The tables below must be of the same size. + + // The functions described below are sampled at the four most significant + // bits of x^2 + 8 / 256. 
+ + // Normalized rate: + // This table models the rate for a Laplacian source with given variance + // when quantized with a uniform quantizer with given stepsize. The + // closed form expression is: + // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], + // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance), + // and H(x) is the binary entropy function. + static const int rate_tab_q10[] = { + 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, + 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, + 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, + 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, + 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, + 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424, + 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87, + 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6, + 5, 3, 2, 1, 1, 1, 0, 0, + }; + // Normalized distortion: + // This table models the normalized distortion for a Laplacian source + // with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) + // where x = qpstep / sqrt(variance). + // Note the actual distortion is Dn * variance. + static const int dist_tab_q10[] = { + 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, + 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, + 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, + 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142, + 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351, + 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659, + 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936, + 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017, + 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024, + }; + static const int xsq_iq_q10[] = { + 0, 4, 8, 12, 16, 20, 24, 28, 32, + 40, 48, 56, 64, 72, 80, 88, 96, 112, + 128, 144, 160, 176, 192, 208, 224, 256, 288, + 320, 352, 384, 416, 448, 480, 544, 608, 672, + 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504, + 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296, + 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136, + 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328, + 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736, + 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696, + 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808, + 180192, 196576, 212960, 229344, 245728, + }; + const int tmp = (xsq_q10 >> 2) + 8; + const int k = get_msb(tmp) - 3; + const int xq = (k << 3) + ((tmp >> k) & 0x7); + const int one_q10 = 1 << 10; + const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k); + const int b_q10 = one_q10 - a_q10; + *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; + *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; +} + +void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2, + unsigned int qstep, int *rate, + int64_t *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expressions are in: + // Hang and Chen, "Source Model for transform video coder and its + // application - Part I: Fundamental Theory", IEEE Trans. Circ. + // Sys. for Video Tech., April 1997. 
+ if (var == 0) { + *rate = 0; + *dist = 0; + } else { + int d_q10, r_q10; + static const uint32_t MAX_XSQ_Q10 = 245727; + const uint64_t xsq_q10_64 = + (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var; + const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10); + model_rd_norm(xsq_q10, &r_q10, &d_q10); + *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT); + *dist = (var * (int64_t)d_q10 + 512) >> 10; + } +} + +static double interp_cubic(const double *p, double x) { + return p[1] + 0.5 * x * + (p[2] - p[0] + + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + + x * (3.0 * (p[1] - p[2]) + p[3] - p[0]))); +} + +/* +static double interp_bicubic(const double *p, int p_stride, double x, + double y) { + double q[4]; + q[0] = interp_cubic(p, x); + q[1] = interp_cubic(p + p_stride, x); + q[2] = interp_cubic(p + 2 * p_stride, x); + q[3] = interp_cubic(p + 3 * p_stride, x); + return interp_cubic(q, y); +} +*/ + +static const uint8_t bsize_curvfit_model_cat_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 1, 1, 2, 2, 3, 3 +}; + +static int sse_norm_curvfit_model_cat_lookup(double sse_norm) { + return (sse_norm > 16.0); +} + +// Models distortion by sse using a logistic function on +// l = log2(sse / q^2) as: +// dbysse = 16 / (1 + k exp(l + c)) +static double get_dbysse_logistic(double l, double c, double k) { + const double A = 16.0; + const double dbysse = A / (1 + k * exp(l + c)); + return dbysse; +} + +// Models rate using a clamped linear function on +// l = log2(sse / q^2) as: +// rate = max(0, a + b * l) +static double get_rate_clamplinear(double l, double a, double b) { + const double rate = a + b * l; + return (rate < 0 ? 0 : rate); +} + +static const uint8_t bsize_surffit_model_cat_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 0, 0, 2, 2, 4, 4 +}; + +static const double surffit_rate_params[9][4] = { + { + 638.390212, + 2.253108, + 166.585650, + -3.939401, + }, + { + 5.256905, + 81.997240, + -1.321771, + 17.694216, + }, + { + -74.193045, + 72.431868, + -19.033152, + 15.407276, + }, + { + 416.770113, + 14.794188, + 167.686830, + -6.997756, + }, + { + 378.511276, + 9.558376, + 154.658843, + -6.635663, + }, + { + 277.818787, + 4.413180, + 150.317637, + -9.893038, + }, + { + 142.212132, + 11.542038, + 94.393964, + -5.518517, + }, + { + 219.100256, + 4.007421, + 108.932852, + -6.981310, + }, + { + 222.261971, + 3.251049, + 95.972916, + -5.609789, + }, +}; + +static const double surffit_dist_params[7] = { 1.475844, 4.328362, -5.680233, + -0.500994, 0.554585, 4.839478, + -0.695837 }; + +static void rate_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm, + double *rpar) { + const int cat = bsize_surffit_model_cat_lookup[bsize]; + rpar[0] = surffit_rate_params[cat][0] + surffit_rate_params[cat][1] * xm; + rpar[1] = surffit_rate_params[cat][2] + surffit_rate_params[cat][3] * xm; +} + +static void dist_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm, + double *dpar) { + (void)bsize; + const double *params = surffit_dist_params; + dpar[0] = params[0] + params[1] / (1 + exp((xm + params[2]) * params[3])); + dpar[1] = params[4] + params[5] * exp(params[6] * xm); +} + +void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm, + double yl, double *rate_f, double *distbysse_f) { + (void)sse_norm; + double rpar[2], dpar[2]; + rate_surffit_model_params_lookup(bsize, xm, rpar); + dist_surffit_model_params_lookup(bsize, xm, dpar); + + *rate_f = get_rate_clamplinear(yl, rpar[0], rpar[1]); + 
*distbysse_f = get_dbysse_logistic(yl, dpar[0], dpar[1]); +} + +static const double interp_rgrid_curv[4][65] = { + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 118.257702, 120.210658, 121.434853, 122.100487, + 122.377758, 122.436865, 72.290102, 96.974289, 101.652727, + 126.830141, 140.417377, 157.644879, 184.315291, 215.823873, + 262.300169, 335.919859, 420.624173, 519.185032, 619.854243, + 726.053595, 827.663369, 933.127475, 1037.988755, 1138.839609, + 1233.342933, 1333.508064, 1428.760126, 1533.396364, 1616.952052, + 1744.539319, 1803.413586, 1951.466618, 1994.227838, 2086.031680, + 2148.635443, 2239.068450, 2222.590637, 2338.859809, 2402.929011, + 2418.727875, 2435.342670, 2471.159469, 2523.187446, 2591.183827, + 2674.905840, 2774.110714, 2888.555675, 3017.997952, 3162.194773, + 3320.903365, 3493.880956, 3680.884773, 3881.672045, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 13.087244, 15.919735, 25.930313, 24.412411, + 28.567417, 29.924194, 30.857010, 32.742979, 36.382570, + 39.210386, 42.265690, 47.378572, 57.014850, 82.740067, + 137.346562, 219.968084, 316.781856, 415.643773, 516.706538, + 614.914364, 714.303763, 815.512135, 911.210485, 1008.501528, + 1109.787854, 1213.772279, 1322.922561, 1414.752579, 1510.505641, + 1615.741888, 1697.989032, 1780.123933, 1847.453790, 1913.742309, + 1960.828122, 2047.500168, 2085.454095, 2129.230668, 2158.171824, + 2182.231724, 2217.684864, 2269.589211, 2337.264824, 2420.618694, + 2519.557814, 2633.989178, 2763.819779, 2908.956609, 3069.306660, + 3244.776927, 3435.274401, 3640.706076, 3860.978945, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 4.656893, 5.123633, 5.594132, 6.162376, + 6.918433, 7.768444, 8.739415, 10.105862, 11.477328, + 13.236604, 15.421030, 19.093623, 25.801871, 46.724612, + 98.841054, 181.113466, 272.586364, 359.499769, 445.546343, + 525.944439, 605.188743, 681.793483, 756.668359, 838.486885, + 926.950356, 1015.482542, 1113.353926, 1204.897193, 1288.871992, + 1373.464145, 1455.746628, 1527.796460, 1588.475066, 1658.144771, + 1710.302500, 1807.563351, 1863.197608, 1927.281616, 1964.450872, + 2022.719898, 2100.041145, 2185.205712, 2280.993936, 2387.616216, + 2505.282950, 2634.204540, 2774.591385, 2926.653884, 3090.602436, + 3266.647443, 3454.999303, 3655.868416, 3869.465182, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.337370, 0.391916, 0.468839, 0.566334, + 0.762564, 1.069225, 1.384361, 1.787581, 2.293948, + 3.251909, 4.412991, 8.050068, 11.606073, 27.668092, + 65.227758, 128.463938, 202.097653, 262.715851, 312.464873, + 355.601398, 400.609054, 447.201352, 495.761568, 552.871938, + 619.067625, 691.984883, 773.753288, 860.628503, 946.262808, + 1019.805896, 1106.061360, 1178.422145, 1244.852258, 1302.173987, + 1399.650266, 1548.092912, 1545.928652, 1670.817500, 1694.523823, + 1779.195362, 1882.155494, 1990.662097, 2108.325181, 2235.456119, + 2372.366287, 2519.367059, 2676.769812, 2844.885918, 3024.026754, + 3214.503695, 3416.628115, 3630.711389, 3857.064892, 4096.000000, + }, +}; + +static const double interp_dgrid_curv[3][65] = { + { + 16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770, + 15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870, + 15.525918, 
15.483820, 15.373330, 15.126844, 14.637442, 14.184387, + 13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790, + 7.487633, 5.688649, 4.267515, 3.196300, 2.434201, 1.834064, + 1.369920, 1.035921, 0.775279, 0.574895, 0.427232, 0.314123, + 0.233236, 0.171440, 0.128188, 0.092762, 0.067569, 0.049324, + 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733, + 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848, + 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550, + 0.000348, 0.000193, 0.000085, 0.000021, 0.000000, + }, + { + 16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501, + 15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967, + 15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212, + 13.073692, 12.222005, 11.237799, 9.985848, 8.898823, 7.423519, + 5.995325, 4.773152, 3.744032, 2.938217, 2.294526, 1.762412, + 1.327145, 1.020728, 0.765535, 0.570548, 0.425833, 0.313825, + 0.232959, 0.171324, 0.128174, 0.092750, 0.067558, 0.049319, + 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733, + 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848, + 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550, + 0.000348, 0.000193, 0.000085, 0.000021, -0.000000, + }, +}; + +void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr, + double *rate_f, double *distbysse_f) { + const double x_start = -15.5; + const double x_end = 16.5; + const double x_step = 0.5; + const double epsilon = 1e-6; + const int rcat = bsize_curvfit_model_cat_lookup[bsize]; + const int dcat = sse_norm_curvfit_model_cat_lookup(sse_norm); + (void)x_end; + + xqr = AOMMAX(xqr, x_start + x_step + epsilon); + xqr = AOMMIN(xqr, x_end - x_step - epsilon); + const double x = (xqr - x_start) / x_step; + const int xi = (int)floor(x); + const double xo = x - xi; + + assert(xi > 0); + + const double *prate = &interp_rgrid_curv[rcat][(xi - 1)]; + *rate_f = interp_cubic(prate, xo); + const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)]; + *distbysse_f = interp_cubic(pdist, xo); +} + +static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { + const int num_4x4_w = mi_size_wide[plane_bsize]; + const int num_4x4_h = mi_size_high[plane_bsize]; + const ENTROPY_CONTEXT *const above = pd->above_entropy_context; + const ENTROPY_CONTEXT *const left = pd->left_entropy_context; + + memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w); + memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h); +} + +void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) { + assert(plane_bsize < BLOCK_SIZES_ALL); + get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left); +} + +void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, + int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) { + const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME }; + const int_mv ref_mv = + av1_get_ref_mv_from_stack(0, ref_frames, 0, x->mbmi_ext); + const int_mv ref_mv1 = + av1_get_ref_mv_from_stack(0, ref_frames, 1, x->mbmi_ext); + MV pred_mv[MAX_MV_REF_CANDIDATES + 1]; + int num_mv_refs = 0; + pred_mv[num_mv_refs++] = ref_mv.as_mv; + if (ref_mv.as_int != ref_mv1.as_int) { + pred_mv[num_mv_refs++] = ref_mv1.as_mv; + } + if (cpi->sf.mv_sf.adaptive_motion_search && + block_size < x->max_partition_size) { + 
pred_mv[num_mv_refs++] = x->pred_mv[ref_frame]; + } + + assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0]))); + + const uint8_t *const src_y_ptr = x->plane[0].src.buf; + int zero_seen = 0; + int best_sad = INT_MAX; + int max_mv = 0; + // Get the sad for each candidate reference mv. + for (int i = 0; i < num_mv_refs; ++i) { + const MV *this_mv = &pred_mv[i]; + const int fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3; + const int fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3; + max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3); + + if (fp_row == 0 && fp_col == 0 && zero_seen) continue; + zero_seen |= (fp_row == 0 && fp_col == 0); + + const uint8_t *const ref_y_ptr = + &ref_y_buffer[ref_y_stride * fp_row + fp_col]; + // Find sad for current vector. + const int this_sad = cpi->fn_ptr[block_size].sdf( + src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride); + // Note if it is the best so far. + if (this_sad < best_sad) { + best_sad = this_sad; + } + } + + // Note the index of the mv that worked best in the reference list. + x->max_mv_context[ref_frame] = max_mv; + x->pred_mv_sad[ref_frame] = best_sad; +} + +void av1_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + const struct scale_factors *scale, + const struct scale_factors *scale_uv, + const int num_planes) { + dst[0].buf = src->y_buffer; + dst[0].stride = src->y_stride; + dst[1].buf = src->u_buffer; + dst[2].buf = src->v_buffer; + dst[1].stride = dst[2].stride = src->uv_stride; + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + for (int i = 0; i < num_planes; ++i) { + setup_pred_plane(dst + i, xd->mi[0]->sb_type, dst[i].buf, + i ? src->uv_crop_width : src->y_crop_width, + i ? src->uv_crop_height : src->y_crop_height, + dst[i].stride, mi_row, mi_col, i ? scale_uv : scale, + xd->plane[i].subsampling_x, xd->plane[i].subsampling_y); + } +} + +YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi, + int ref_frame) { + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); + RefCntBuffer *const scaled_buf = cpi->scaled_ref_buf[ref_frame - 1]; + const RefCntBuffer *const ref_buf = + get_ref_frame_buf(&cpi->common, ref_frame); + return (scaled_buf != ref_buf && scaled_buf != NULL) ? &scaled_buf->buf + : NULL; +} + +int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd, + InterpFilter interp_filter) { + if (interp_filter == SWITCHABLE) { + const MB_MODE_INFO *const mbmi = xd->mi[0]; + int inter_filter_cost = 0; + int dir; + + for (dir = 0; dir < 2; ++dir) { + const int ctx = av1_get_pred_context_switchable_interp(xd, dir); + const InterpFilter filter = + av1_extract_interp_filter(mbmi->interp_filters, dir); + inter_filter_cost += x->switchable_interp_costs[ctx][filter]; + } + return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost; + } else { + return 0; + } +} + +void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { + RD_OPT *const rd = &cpi->rd; + + // Set baseline threshold values. 
+ av1_zero(rd->thresh_mult); + + rd->thresh_mult[THR_NEARESTMV] = 300; + rd->thresh_mult[THR_NEARESTL2] = 300; + rd->thresh_mult[THR_NEARESTL3] = 300; + rd->thresh_mult[THR_NEARESTB] = 300; + rd->thresh_mult[THR_NEARESTA2] = 300; + rd->thresh_mult[THR_NEARESTA] = 300; + rd->thresh_mult[THR_NEARESTG] = 300; + + rd->thresh_mult[THR_NEWMV] = 1000; + rd->thresh_mult[THR_NEWL2] = 1000; + rd->thresh_mult[THR_NEWL3] = 1000; + rd->thresh_mult[THR_NEWB] = 1000; + rd->thresh_mult[THR_NEWA2] = 1100; + rd->thresh_mult[THR_NEWA] = 1000; + rd->thresh_mult[THR_NEWG] = 1000; + + rd->thresh_mult[THR_NEARMV] = 1000; + rd->thresh_mult[THR_NEARL2] = 1000; + rd->thresh_mult[THR_NEARL3] = 1000; + rd->thresh_mult[THR_NEARB] = 1000; + rd->thresh_mult[THR_NEARA2] = 1000; + rd->thresh_mult[THR_NEARA] = 1000; + rd->thresh_mult[THR_NEARG] = 1000; + + rd->thresh_mult[THR_GLOBALMV] = 2200; + rd->thresh_mult[THR_GLOBALL2] = 2000; + rd->thresh_mult[THR_GLOBALL3] = 2000; + rd->thresh_mult[THR_GLOBALB] = 2400; + rd->thresh_mult[THR_GLOBALA2] = 2000; + rd->thresh_mult[THR_GLOBALG] = 2000; + rd->thresh_mult[THR_GLOBALA] = 2400; + + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] = 1100; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] = 800; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] = 900; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] = 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] = 1000; + + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] = 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] = 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] = 2000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] = 2000; + + rd->thresh_mult[THR_COMP_NEAR_NEARLA] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLA] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLA] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLA] = 1530; + rd->thresh_mult[THR_COMP_NEW_NEARLA] = 1870; + rd->thresh_mult[THR_COMP_NEW_NEWLA] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] = 2750; + + rd->thresh_mult[THR_COMP_NEAR_NEARL2A] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2A] = 1870; + rd->thresh_mult[THR_COMP_NEW_NEARL2A] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2A] = 1800; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL3A] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3A] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL3A] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3A] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] = 3000; + + rd->thresh_mult[THR_COMP_NEAR_NEARGA] = 1320; + rd->thresh_mult[THR_COMP_NEAREST_NEWGA] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGA] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGA] = 2040; + rd->thresh_mult[THR_COMP_NEW_NEARGA] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGA] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] = 2250; + + rd->thresh_mult[THR_COMP_NEAR_NEARLB] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLB] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTLB] = 1500; + 
rd->thresh_mult[THR_COMP_NEAR_NEWLB] = 1360; + rd->thresh_mult[THR_COMP_NEW_NEARLB] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLB] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] = 2250; + + rd->thresh_mult[THR_COMP_NEAR_NEARL2B] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2B] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL2B] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2B] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL3B] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3B] = 1870; + rd->thresh_mult[THR_COMP_NEW_NEARL3B] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3B] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARGB] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWGB] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGB] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGB] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARGB] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGB] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARLA2] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] = 1800; + rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWLA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARLA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWLA2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL2A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL2A2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] = 1440; + rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARL3A2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWL3A2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] = 2500; + + rd->thresh_mult[THR_COMP_NEAR_NEARGA2] = 1200; + rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] = 1500; + rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] = 1500; + rd->thresh_mult[THR_COMP_NEAR_NEWGA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEARGA2] = 1700; + rd->thresh_mult[THR_COMP_NEW_NEWGA2] = 2000; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] = 2750; + + rd->thresh_mult[THR_COMP_NEAR_NEARLL2] = 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] = 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] = 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLL2] = 2640; + rd->thresh_mult[THR_COMP_NEW_NEARLL2] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLL2] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] = 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARLL3] = 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] = 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] = 1800; + rd->thresh_mult[THR_COMP_NEAR_NEWLL3] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEARLL3] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEWLL3] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] = 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARLG] = 1760; + rd->thresh_mult[THR_COMP_NEAREST_NEWLG] = 2400; + rd->thresh_mult[THR_COMP_NEW_NEARESTLG] = 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWLG] = 1760; + 
rd->thresh_mult[THR_COMP_NEW_NEARLG] = 2640; + rd->thresh_mult[THR_COMP_NEW_NEWLG] = 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] = 3200; + + rd->thresh_mult[THR_COMP_NEAR_NEARBA] = 1600; + rd->thresh_mult[THR_COMP_NEAREST_NEWBA] = 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTBA] = 2000; + rd->thresh_mult[THR_COMP_NEAR_NEWBA] = 2200; + rd->thresh_mult[THR_COMP_NEW_NEARBA] = 1980; + rd->thresh_mult[THR_COMP_NEW_NEWBA] = 2640; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] = 3200; + + rd->thresh_mult[THR_DC] = 1000; + rd->thresh_mult[THR_PAETH] = 1000; + rd->thresh_mult[THR_SMOOTH] = 2200; + rd->thresh_mult[THR_SMOOTH_V] = 2000; + rd->thresh_mult[THR_SMOOTH_H] = 2000; + rd->thresh_mult[THR_H_PRED] = 2000; + rd->thresh_mult[THR_V_PRED] = 1800; + rd->thresh_mult[THR_D135_PRED] = 2500; + rd->thresh_mult[THR_D203_PRED] = 2000; + rd->thresh_mult[THR_D157_PRED] = 2500; + rd->thresh_mult[THR_D67_PRED] = 2000; + rd->thresh_mult[THR_D113_PRED] = 2500; + rd->thresh_mult[THR_D45_PRED] = 2500; +} + +void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, + int (*factor_buf)[MAX_MODES], + int use_adaptive_rd_thresh, BLOCK_SIZE bsize, + THR_MODES best_mode_index) { + assert(use_adaptive_rd_thresh > 0); + const THR_MODES top_mode = MAX_MODES; + const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT; + + const int bsize_is_1_to_4 = bsize > cm->seq_params.sb_size; + BLOCK_SIZE min_size, max_size; + if (bsize_is_1_to_4) { + // This part handles block sizes with 1:4 and 4:1 aspect ratios + // TODO(any): Experiment with threshold update for parent/child blocks + min_size = bsize; + max_size = bsize; + } else { + min_size = AOMMAX(bsize - 2, BLOCK_4X4); + max_size = AOMMIN(bsize + 2, (int)cm->seq_params.sb_size); + } + + for (THR_MODES mode = 0; mode < top_mode; ++mode) { + for (BLOCK_SIZE bs = min_size; bs <= max_size; ++bs) { + int *const fact = &factor_buf[bs][mode]; + if (mode == best_mode_index) { + *fact -= (*fact >> RD_THRESH_LOG_DEC_FACTOR); + } else { + *fact = AOMMIN(*fact + RD_THRESH_INC, max_rd_thresh_factor); + } + } + } +} + +int av1_get_intra_cost_penalty(int qindex, int qdelta, + aom_bit_depth_t bit_depth) { + const int q = av1_dc_quant_QTX(qindex, qdelta, bit_depth); + switch (bit_depth) { + case AOM_BITS_8: return 20 * q; + case AOM_BITS_10: return 5 * q; + case AOM_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2); + default: + assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12"); + return -1; + } +} diff --git a/libs/libaom/src/av1/encoder/rd.h b/libs/libaom/src/av1/encoder/rd.h new file mode 100644 index 000000000..1addbaeb9 --- /dev/null +++ b/libs/libaom/src/av1/encoder/rd.h @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_RD_H_
+#define AOM_AV1_ENCODER_RD_H_
+
+#include <limits.h>
+
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/cost.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS 7
+#define RD_EPB_SHIFT 6
+
+#define RDCOST(RM, R, D)                                            \
+  (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
+   ((D) * (1 << RDDIV_BITS)))
+
+#define RDCOST_NEG_R(RM, R, D)                                     \
+  (((D) * (1 << RDDIV_BITS)) -                                     \
+   ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT))
+
+#define RDCOST_DBL(RM, R, D)                                       \
+  (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
+   ((double)(D) * (1 << RDDIV_BITS)))
+
+#define QIDX_SKIP_THRESH 115
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
+
+// The fractional part of rd_thresh factor is stored with 5 bits. The maximum
+// factor that we allow is two, which is stored as 2 ** (5+1) = 64
+#define RD_THRESH_FAC_FRAC_BITS (5)
+#define RD_THRESH_FAC_FRAC_VAL (1 << (RD_THRESH_FAC_FRAC_BITS))
+#define RD_THRESH_MAX_FACT ((RD_THRESH_FAC_FRAC_VAL) << 1)
+#define RD_THRESH_LOG_DEC_FACTOR (4)
+#define RD_THRESH_INC (1)
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+enum {
+  // Default initialization when we are not using winner mode framework. e.g.
+  // intrabc
+  DEFAULT_EVAL = 0,
+  // Initialization for selecting winner mode
+  MODE_EVAL,
+  // Initialization for winner mode evaluation
+  WINNER_MODE_EVAL,
+  // All mode evaluation types
+  MODE_EVAL_TYPES,
+} UENUM1BYTE(MODE_EVAL_TYPE);
+
+typedef struct RD_OPT {
+  // Thresh_mult is used to set a threshold for the rd score. A higher value
+  // means that we will accept the best mode so far more often. This number
+  // is used in combination with the current block size, and thresh_freq_fact
+  // to pick a threshold.
+  int thresh_mult[MAX_MODES];
+
+  int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES];
+
+  int RDMULT;
+
+  double r0, arf_r0;
+  double mc_saved_base, mc_count_base;
+} RD_OPT;
+
+typedef struct {
+  // Cost of transmitting the actual motion vector.
+  // mv_component[0][i] is the cost of motion vector with vertical component
+  // (mv_row) equal to i - MV_MAX.
+  // mv_component[1][i] is the cost of motion vector with horizontal component
+  // (mv_col) equal to i - MV_MAX.
+  int mv_component[2][MV_VALS];
+
+  // joint_mv[i] is the cost of transmitting joint mv(MV_JOINT_TYPE) of
+  // type i.
+  // TODO(huisu@google.com): we can update dv_joint_cost per SB.
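+  // MV_JOINT_TYPE classifies which components of the MV are nonzero
+  // (neither, horizontal only, vertical only, or both).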
+ int joint_mv[MV_JOINTS]; +} IntraBCMVCosts; + +static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { +#if CONFIG_RD_DEBUG + int plane; +#endif + rd_stats->rate = 0; + rd_stats->dist = 0; + rd_stats->rdcost = 0; + rd_stats->sse = 0; + rd_stats->skip = 1; + rd_stats->zero_rate = 0; +#if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats->txb_coeff_cost[plane] = 0; + { + int r, c; + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) + rd_stats->txb_coeff_cost_map[plane][r][c] = 0; + } + } +#endif +} + +static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { +#if CONFIG_RD_DEBUG + int plane; +#endif + rd_stats->rate = INT_MAX; + rd_stats->dist = INT64_MAX; + rd_stats->rdcost = INT64_MAX; + rd_stats->sse = INT64_MAX; + rd_stats->skip = 0; + rd_stats->zero_rate = 0; +#if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats->txb_coeff_cost[plane] = INT_MAX; + { + int r, c; + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) + rd_stats->txb_coeff_cost_map[plane][r][c] = INT16_MAX; + } + } +#endif +} + +static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, + const RD_STATS *rd_stats_src) { + assert(rd_stats_dst->rate != INT_MAX && rd_stats_src->rate != INT_MAX); + rd_stats_dst->rate = (int)AOMMIN( + ((int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate), INT_MAX); + if (!rd_stats_dst->zero_rate) + rd_stats_dst->zero_rate = rd_stats_src->zero_rate; + rd_stats_dst->dist += rd_stats_src->dist; + rd_stats_dst->sse += rd_stats_src->sse; + rd_stats_dst->skip &= rd_stats_src->skip; +#if CONFIG_RD_DEBUG + // This may run into problems when monochrome video is + // encoded, as there will only be 1 plane + for (int plane = 0; plane < MAX_MB_PLANE; ++plane) { + rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane]; + { + // TODO(angiebird): optimize this part + int r, c; + int ref_txb_coeff_cost = 0; + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { + rd_stats_dst->txb_coeff_cost_map[plane][r][c] += + rd_stats_src->txb_coeff_cost_map[plane][r][c]; + ref_txb_coeff_cost += rd_stats_dst->txb_coeff_cost_map[plane][r][c]; + } + assert(ref_txb_coeff_cost == rd_stats_dst->txb_coeff_cost[plane]); + } + } +#endif +} + +static INLINE void av1_accumulate_rd_stats(RD_STATS *rd_stats, int64_t dist, + int rate, int skip, int64_t sse, + int zero_rate) { + assert(rd_stats->rate != INT_MAX && rate != INT_MAX); + rd_stats->rate += rate; + if (!rd_stats->zero_rate) rd_stats->zero_rate = zero_rate; + rd_stats->dist += dist; + rd_stats->skip &= skip; + rd_stats->sse += sse; +} + +static INLINE int64_t av1_calculate_rd_cost(int mult, int rate, int64_t dist) { + assert(mult >= 0); + if (rate >= 0) { + return RDCOST(mult, rate, dist); + } + return RDCOST_NEG_R(mult, -rate, dist); +} + +static INLINE void av1_rd_cost_update(int mult, RD_STATS *rd_cost) { + if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX && + rd_cost->rdcost < INT64_MAX) { + rd_cost->rdcost = av1_calculate_rd_cost(mult, rd_cost->rate, rd_cost->dist); + } else { + av1_invalid_rd_stats(rd_cost); + } +} + +static INLINE void av1_rd_stats_subtraction(int mult, + const RD_STATS *const left, + const RD_STATS *const right, + RD_STATS 
*result) { + if (left->rate == INT_MAX || right->rate == INT_MAX || + left->dist == INT64_MAX || right->dist == INT64_MAX || + left->rdcost == INT64_MAX || right->rdcost == INT64_MAX) { + av1_invalid_rd_stats(result); + } else { + result->rate = left->rate - right->rate; + result->dist = left->dist - right->dist; + result->rdcost = av1_calculate_rd_cost(mult, result->rate, result->dist); + } +} + +struct TileInfo; +struct TileDataEnc; +struct AV1_COMP; +struct macroblock; + +int av1_compute_rd_mult_based_on_qindex(const struct AV1_COMP *cpi, int qindex); + +int av1_compute_rd_mult(const struct AV1_COMP *cpi, int qindex); + +void av1_initialize_rd_consts(struct AV1_COMP *cpi); + +void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x, + int qindex); + +void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n, + unsigned int qstep, int *rate, int64_t *dist); + +void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr, + double *rate_f, double *distbysse_f); +void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm, + double yl, double *rate_f, double *distbysse_f); + +int av1_get_switchable_rate(const MACROBLOCK *x, const MACROBLOCKD *xd, + InterpFilter interp_filter); + +YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi, + int ref_frame); + +void av1_init_me_luts(void); + +void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx); + +void av1_get_entropy_contexts(BLOCK_SIZE plane_bsize, + const struct macroblockd_plane *pd, + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE], + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]); + +void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi); + +void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, + int (*fact)[MAX_MODES], int rd_thresh, + BLOCK_SIZE bsize, THR_MODES best_mode_index); + +static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) { + for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { + for (int j = 0; j < MAX_MODES; ++j) { + x->thresh_freq_fact[i][j] = RD_THRESH_FAC_FRAC_VAL; + } + } +} + +static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, + int thresh_fact) { + return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX; +} + +void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x, + uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame, + BLOCK_SIZE block_size); + +static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) { + x->errorperbit = rdmult >> RD_EPB_SHIFT; + x->errorperbit += (x->errorperbit == 0); +} + +// Get the threshold for R-D optimization of coefficients depending upon mode +// decision/winner mode processing +static INLINE uint32_t get_rd_opt_coeff_thresh( + const uint32_t *const coeff_opt_dist_threshold, + int enable_winner_mode_for_coeff_opt, int is_winner_mode) { + // Default initialization of threshold + uint32_t coeff_opt_thresh = coeff_opt_dist_threshold[DEFAULT_EVAL]; + // TODO(any): Experiment with coeff_opt_dist_threshold values when + // enable_winner_mode_for_coeff_opt is ON + // TODO(any): Skip the winner mode processing for blocks with lower residual + // energy as R-D optimization of coefficients would have been enabled during + // mode decision + if (enable_winner_mode_for_coeff_opt) { + // Use conservative threshold during mode decision and perform R-D + // optimization of coeffs always for winner modes + if (is_winner_mode) + coeff_opt_thresh = coeff_opt_dist_threshold[WINNER_MODE_EVAL]; + else + coeff_opt_thresh = coeff_opt_dist_threshold[MODE_EVAL]; + } + return coeff_opt_thresh; +} + +// Used to 
reset the state of tx/mb rd hash information +static INLINE void reset_hash_records(MACROBLOCK *const x, + int use_inter_txb_hash) { + int32_t record_idx; + + // Reset the state for use_inter_txb_hash + if (use_inter_txb_hash) { + for (record_idx = 0; + record_idx < ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)); record_idx++) + x->txb_rd_record_8X8[record_idx].num = + x->txb_rd_record_8X8[record_idx].index_start = 0; + for (record_idx = 0; + record_idx < ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)); record_idx++) + x->txb_rd_record_16X16[record_idx].num = + x->txb_rd_record_16X16[record_idx].index_start = 0; + for (record_idx = 0; + record_idx < ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)); record_idx++) + x->txb_rd_record_32X32[record_idx].num = + x->txb_rd_record_32X32[record_idx].index_start = 0; + for (record_idx = 0; + record_idx < ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)); record_idx++) + x->txb_rd_record_64X64[record_idx].num = + x->txb_rd_record_64X64[record_idx].index_start = 0; + } + + // Reset the state for use_intra_txb_hash + x->txb_rd_record_intra.num = x->txb_rd_record_intra.index_start = 0; + + // Reset the state for use_mb_rd_hash + x->mb_rd_record.num = x->mb_rd_record.index_start = 0; +} + +void av1_setup_pred_block(const MACROBLOCKD *xd, + struct buf_2d dst[MAX_MB_PLANE], + const YV12_BUFFER_CONFIG *src, + const struct scale_factors *scale, + const struct scale_factors *scale_uv, + const int num_planes); + +int av1_get_intra_cost_penalty(int qindex, int qdelta, + aom_bit_depth_t bit_depth); + +void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x, + FRAME_CONTEXT *fc); + +void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc, + const int num_planes); + +void av1_fill_mv_costs(const FRAME_CONTEXT *fc, int integer_mv, int usehp, + MACROBLOCK *x); + +int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta); + +int av1_get_deltaq_offset(const struct AV1_COMP *cpi, int qindex, double beta); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RD_H_ diff --git a/libs/libaom/src/av1/encoder/rdopt.c b/libs/libaom/src/av1/encoder/rdopt.c new file mode 100644 index 000000000..02afcd1ff --- /dev/null +++ b/libs/libaom/src/av1/encoder/rdopt.c @@ -0,0 +1,5505 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/txb_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/compound_type.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/interp_search.h"
+#include "av1/encoder/intra_mode_search.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/mode_prune_model_weights.h"
+#include "av1/encoder/model_rd.h"
+#include "av1/encoder/motion_search_facade.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/pustats.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/tx_search.h"
+
+#define LAST_NEW_MV_INDEX 6
+
+// Mode_threshold multiplication factor table for prune_inter_modes_if_skippable
+// The values are kept in Q12 format and the equation used to derive them is
+// (2.5 - ((float)x->qindex / MAXQ) * 1.5)
+#define MODE_THRESH_QBITS 12
+static const int mode_threshold_mul_factor[QINDEX_RANGE] = {
+  10240, 10216, 10192, 10168, 10144, 10120, 10095, 10071, 10047, 10023, 9999,
+  9975,  9951,  9927,  9903,  9879,  9854,  9830,  9806,  9782,  9758,  9734,
+  9710,  9686,  9662,  9638,  9614,  9589,  9565,  9541,  9517,  9493,  9469,
+  9445,  9421,  9397,  9373,  9349,  9324,  9300,  9276,  9252,  9228,  9204,
+  9180,  9156,  9132,  9108,  9083,  9059,  9035,  9011,  8987,  8963,  8939,
+  8915,  8891,  8867,  8843,  8818,  8794,  8770,  8746,  8722,  8698,  8674,
+  8650,  8626,  8602,  8578,  8553,  8529,  8505,  8481,  8457,  8433,  8409,
+  8385,  8361,  8337,  8312,  8288,  8264,  8240,  8216,  8192,  8168,  8144,
+  8120,  8096,  8072,  8047,  8023,  7999,  7975,  7951,  7927,  7903,  7879,
+  7855,  7831,  7806,  7782,  7758,  7734,  7710,  7686,  7662,  7638,  7614,
+  7590,  7566,  7541,  7517,  7493,  7469,  7445,  7421,  7397,  7373,  7349,
+  7325,  7301,  7276,  7252,  7228,  7204,  7180,  7156,  7132,  7108,  7084,
+  7060,  7035,  7011,  6987,  6963,  6939,  6915,  6891,  6867,  6843,  6819,
+  6795,  6770,  6746,  6722,  6698,  6674,  6650,  6626,  6602,  6578,  6554,
+  6530,  6505,  6481,  6457,  6433,  6409,  6385,  6361,  6337,  6313,  6289,
+  6264,  6240,  6216,  6192,  6168,  6144,  6120,  6096,  6072,  6048,  6024,
+  5999,  5975,  5951,  5927,  5903,  5879,  5855,  5831,  5807,  5783,  5758,
+  5734,  5710,  5686,  5662,  5638,  5614,  5590,  5566,  5542,  5518,  5493,
+  5469,  5445,  5421,  5397,  5373,  5349,  5325,  5301,  5277,  5253,  5228,
+  5204,  5180,  5156,  5132,  5108,  5084,  5060,  5036,
5012, 4987, 4963, + 4939, 4915, 4891, 4867, 4843, 4819, 4795, 4771, 4747, 4722, 4698, + 4674, 4650, 4626, 4602, 4578, 4554, 4530, 4506, 4482, 4457, 4433, + 4409, 4385, 4361, 4337, 4313, 4289, 4265, 4241, 4216, 4192, 4168, + 4144, 4120, 4096 +}; + +static const THR_MODES av1_default_mode_order[MAX_MODES] = { + THR_NEARESTMV, + THR_NEARESTL2, + THR_NEARESTL3, + THR_NEARESTB, + THR_NEARESTA2, + THR_NEARESTA, + THR_NEARESTG, + + THR_NEWMV, + THR_NEWL2, + THR_NEWL3, + THR_NEWB, + THR_NEWA2, + THR_NEWA, + THR_NEWG, + + THR_NEARMV, + THR_NEARL2, + THR_NEARL3, + THR_NEARB, + THR_NEARA2, + THR_NEARA, + THR_NEARG, + + THR_GLOBALMV, + THR_GLOBALL2, + THR_GLOBALL3, + THR_GLOBALB, + THR_GLOBALA2, + THR_GLOBALA, + THR_GLOBALG, + + THR_COMP_NEAREST_NEARESTLA, + THR_COMP_NEAREST_NEARESTL2A, + THR_COMP_NEAREST_NEARESTL3A, + THR_COMP_NEAREST_NEARESTGA, + THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTGB, + THR_COMP_NEAREST_NEARESTLA2, + THR_COMP_NEAREST_NEARESTL2A2, + THR_COMP_NEAREST_NEARESTL3A2, + THR_COMP_NEAREST_NEARESTGA2, + THR_COMP_NEAREST_NEARESTLL2, + THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, + THR_COMP_NEAREST_NEARESTBA, + + THR_COMP_NEAR_NEARLA, + THR_COMP_NEW_NEARESTLA, + THR_COMP_NEAREST_NEWLA, + THR_COMP_NEW_NEARLA, + THR_COMP_NEAR_NEWLA, + THR_COMP_NEW_NEWLA, + THR_COMP_GLOBAL_GLOBALLA, + + THR_COMP_NEAR_NEARL2A, + THR_COMP_NEW_NEARESTL2A, + THR_COMP_NEAREST_NEWL2A, + THR_COMP_NEW_NEARL2A, + THR_COMP_NEAR_NEWL2A, + THR_COMP_NEW_NEWL2A, + THR_COMP_GLOBAL_GLOBALL2A, + + THR_COMP_NEAR_NEARL3A, + THR_COMP_NEW_NEARESTL3A, + THR_COMP_NEAREST_NEWL3A, + THR_COMP_NEW_NEARL3A, + THR_COMP_NEAR_NEWL3A, + THR_COMP_NEW_NEWL3A, + THR_COMP_GLOBAL_GLOBALL3A, + + THR_COMP_NEAR_NEARGA, + THR_COMP_NEW_NEARESTGA, + THR_COMP_NEAREST_NEWGA, + THR_COMP_NEW_NEARGA, + THR_COMP_NEAR_NEWGA, + THR_COMP_NEW_NEWGA, + THR_COMP_GLOBAL_GLOBALGA, + + THR_COMP_NEAR_NEARLB, + THR_COMP_NEW_NEARESTLB, + THR_COMP_NEAREST_NEWLB, + THR_COMP_NEW_NEARLB, + THR_COMP_NEAR_NEWLB, + THR_COMP_NEW_NEWLB, + THR_COMP_GLOBAL_GLOBALLB, + + THR_COMP_NEAR_NEARL2B, + THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEW_NEARL2B, + THR_COMP_NEAR_NEWL2B, + THR_COMP_NEW_NEWL2B, + THR_COMP_GLOBAL_GLOBALL2B, + + THR_COMP_NEAR_NEARL3B, + THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEW_NEARL3B, + THR_COMP_NEAR_NEWL3B, + THR_COMP_NEW_NEWL3B, + THR_COMP_GLOBAL_GLOBALL3B, + + THR_COMP_NEAR_NEARGB, + THR_COMP_NEW_NEARESTGB, + THR_COMP_NEAREST_NEWGB, + THR_COMP_NEW_NEARGB, + THR_COMP_NEAR_NEWGB, + THR_COMP_NEW_NEWGB, + THR_COMP_GLOBAL_GLOBALGB, + + THR_COMP_NEAR_NEARLA2, + THR_COMP_NEW_NEARESTLA2, + THR_COMP_NEAREST_NEWLA2, + THR_COMP_NEW_NEARLA2, + THR_COMP_NEAR_NEWLA2, + THR_COMP_NEW_NEWLA2, + THR_COMP_GLOBAL_GLOBALLA2, + + THR_COMP_NEAR_NEARL2A2, + THR_COMP_NEW_NEARESTL2A2, + THR_COMP_NEAREST_NEWL2A2, + THR_COMP_NEW_NEARL2A2, + THR_COMP_NEAR_NEWL2A2, + THR_COMP_NEW_NEWL2A2, + THR_COMP_GLOBAL_GLOBALL2A2, + + THR_COMP_NEAR_NEARL3A2, + THR_COMP_NEW_NEARESTL3A2, + THR_COMP_NEAREST_NEWL3A2, + THR_COMP_NEW_NEARL3A2, + THR_COMP_NEAR_NEWL3A2, + THR_COMP_NEW_NEWL3A2, + THR_COMP_GLOBAL_GLOBALL3A2, + + THR_COMP_NEAR_NEARGA2, + THR_COMP_NEW_NEARESTGA2, + THR_COMP_NEAREST_NEWGA2, + THR_COMP_NEW_NEARGA2, + THR_COMP_NEAR_NEWGA2, + THR_COMP_NEW_NEWGA2, + THR_COMP_GLOBAL_GLOBALGA2, + + THR_COMP_NEAR_NEARLL2, + THR_COMP_NEW_NEARESTLL2, + THR_COMP_NEAREST_NEWLL2, + THR_COMP_NEW_NEARLL2, + THR_COMP_NEAR_NEWLL2, + THR_COMP_NEW_NEWLL2, + 
THR_COMP_GLOBAL_GLOBALLL2, + + THR_COMP_NEAR_NEARLL3, + THR_COMP_NEW_NEARESTLL3, + THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEW_NEARLL3, + THR_COMP_NEAR_NEWLL3, + THR_COMP_NEW_NEWLL3, + THR_COMP_GLOBAL_GLOBALLL3, + + THR_COMP_NEAR_NEARLG, + THR_COMP_NEW_NEARESTLG, + THR_COMP_NEAREST_NEWLG, + THR_COMP_NEW_NEARLG, + THR_COMP_NEAR_NEWLG, + THR_COMP_NEW_NEWLG, + THR_COMP_GLOBAL_GLOBALLG, + + THR_COMP_NEAR_NEARBA, + THR_COMP_NEW_NEARESTBA, + THR_COMP_NEAREST_NEWBA, + THR_COMP_NEW_NEARBA, + THR_COMP_NEAR_NEWBA, + THR_COMP_NEW_NEWBA, + THR_COMP_GLOBAL_GLOBALBA, + + THR_DC, + THR_PAETH, + THR_SMOOTH, + THR_SMOOTH_V, + THR_SMOOTH_H, + THR_H_PRED, + THR_V_PRED, + THR_D135_PRED, + THR_D203_PRED, + THR_D157_PRED, + THR_D67_PRED, + THR_D113_PRED, + THR_D45_PRED, +}; + +static int find_last_single_ref_mode_idx(const THR_MODES *mode_order) { + uint8_t mode_found[NUM_SINGLE_REF_MODES]; + av1_zero(mode_found); + int num_single_ref_modes_left = NUM_SINGLE_REF_MODES; + + for (int idx = 0; idx < MAX_MODES; idx++) { + const THR_MODES curr_mode = mode_order[idx]; + if (curr_mode < SINGLE_REF_MODE_END) { + num_single_ref_modes_left--; + } + if (!num_single_ref_modes_left) { + return idx; + } + } + return -1; +} + +typedef struct SingleInterModeState { + int64_t rd; + MV_REFERENCE_FRAME ref_frame; + int valid; +} SingleInterModeState; + +typedef struct InterModeSearchState { + int64_t best_rd; + int64_t best_skip_rd[2]; + MB_MODE_INFO best_mbmode; + int best_rate_y; + int best_rate_uv; + int best_mode_skippable; + int best_skip2; + THR_MODES best_mode_index; + int num_available_refs; + int64_t dist_refs[REF_FRAMES]; + int dist_order_refs[REF_FRAMES]; + int64_t mode_threshold[MAX_MODES]; + int64_t best_intra_rd; + unsigned int best_pred_sse; + int64_t best_pred_diff[REFERENCE_MODES]; + // Save a set of single_newmv for each checked ref_mv. + int_mv single_newmv[MAX_REF_MV_SEARCH][REF_FRAMES]; + int single_newmv_rate[MAX_REF_MV_SEARCH][REF_FRAMES]; + int single_newmv_valid[MAX_REF_MV_SEARCH][REF_FRAMES]; + int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES]; + // The rd of simple translation in single inter modes + int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SEARCH][REF_FRAMES]; + + // Single search results by [directions][modes][reference frames] + SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; + int single_state_cnt[2][SINGLE_INTER_MODE_NUM]; + SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM] + [FWD_REFS]; + int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM]; + MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; + IntraModeSearchState intra_search_state; +} InterModeSearchState; + +void av1_inter_mode_data_init(TileDataEnc *tile_data) { + for (int i = 0; i < BLOCK_SIZES_ALL; ++i) { + InterModeRdModel *md = &tile_data->inter_mode_rd_models[i]; + md->ready = 0; + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; + } +} + +static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int *est_residue_cost, + int64_t *est_dist) { + aom_clear_system_state(); + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (md->ready) { + if (sse < md->dist_mean) { + *est_residue_cost = 0; + *est_dist = sse; + } else { + *est_dist = (int64_t)round(md->dist_mean); + const double est_ld = md->a * sse + md->b; + // Clamp estimated rate cost by INT_MAX / 2. + // TODO(angiebird@google.com): find better solution than clamping. 
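+      // The fitted linear model est_ld = a * sse + b predicts the
+      // distortion-per-rate slope, so the residue rate is estimated as
+      // (sse - dist_mean) / est_ld; guard against a near-zero slope below.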
+ if (fabs(est_ld) < 1e-2) { + *est_residue_cost = INT_MAX / 2; + } else { + double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld); + if (est_residue_cost_dbl < 0) { + *est_residue_cost = 0; + } else { + *est_residue_cost = + (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2); + } + } + if (*est_residue_cost <= 0) { + *est_residue_cost = 0; + *est_dist = sse; + } + } + return 1; + } + return 0; +} + +void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) { + aom_clear_system_state(); + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + const int block_idx = inter_mode_data_block_idx(bsize); + InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (block_idx == -1) continue; + if ((md->ready == 0 && md->num < 200) || (md->ready == 1 && md->num < 64)) { + continue; + } else { + if (md->ready == 0) { + md->dist_mean = md->dist_sum / md->num; + md->ld_mean = md->ld_sum / md->num; + md->sse_mean = md->sse_sum / md->num; + md->sse_sse_mean = md->sse_sse_sum / md->num; + md->sse_ld_mean = md->sse_ld_sum / md->num; + } else { + const double factor = 3; + md->dist_mean = + (md->dist_mean * factor + (md->dist_sum / md->num)) / (factor + 1); + md->ld_mean = + (md->ld_mean * factor + (md->ld_sum / md->num)) / (factor + 1); + md->sse_mean = + (md->sse_mean * factor + (md->sse_sum / md->num)) / (factor + 1); + md->sse_sse_mean = + (md->sse_sse_mean * factor + (md->sse_sse_sum / md->num)) / + (factor + 1); + md->sse_ld_mean = + (md->sse_ld_mean * factor + (md->sse_ld_sum / md->num)) / + (factor + 1); + } + + const double my = md->ld_mean; + const double mx = md->sse_mean; + const double dx = sqrt(md->sse_sse_mean); + const double dxy = md->sse_ld_mean; + + md->a = (dxy - mx * my) / (dx * dx - mx * mx); + md->b = my - md->a * mx; + md->ready = 1; + + md->num = 0; + md->dist_sum = 0; + md->ld_sum = 0; + md->sse_sum = 0; + md->sse_sse_sum = 0; + md->sse_ld_sum = 0; + } + (void)rdmult; + } +} + +static AOM_INLINE void inter_mode_data_push(TileDataEnc *tile_data, + BLOCK_SIZE bsize, int64_t sse, + int64_t dist, int residue_cost) { + if (residue_cost == 0 || sse == dist) return; + const int block_idx = inter_mode_data_block_idx(bsize); + if (block_idx == -1) return; + InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize]; + if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) { + aom_clear_system_state(); + const double ld = (sse - dist) * 1. 
/ residue_cost; + ++rd_model->num; + rd_model->dist_sum += dist; + rd_model->ld_sum += ld; + rd_model->sse_sum += sse; + rd_model->sse_sse_sum += (double)sse * (double)sse; + rd_model->sse_ld_sum += sse * ld; + } +} + +static AOM_INLINE void inter_modes_info_push(InterModesInfo *inter_modes_info, + int mode_rate, int64_t sse, + int64_t rd, RD_STATS *rd_cost, + RD_STATS *rd_cost_y, + RD_STATS *rd_cost_uv, + const MB_MODE_INFO *mbmi) { + const int num = inter_modes_info->num; + assert(num < MAX_INTER_MODES); + inter_modes_info->mbmi_arr[num] = *mbmi; + inter_modes_info->mode_rate_arr[num] = mode_rate; + inter_modes_info->sse_arr[num] = sse; + inter_modes_info->est_rd_arr[num] = rd; + inter_modes_info->rd_cost_arr[num] = *rd_cost; + inter_modes_info->rd_cost_y_arr[num] = *rd_cost_y; + inter_modes_info->rd_cost_uv_arr[num] = *rd_cost_uv; + ++inter_modes_info->num; +} + +static int compare_rd_idx_pair(const void *a, const void *b) { + if (((RdIdxPair *)a)->rd == ((RdIdxPair *)b)->rd) { + return 0; + } else if (((const RdIdxPair *)a)->rd > ((const RdIdxPair *)b)->rd) { + return 1; + } else { + return -1; + } +} + +static AOM_INLINE void inter_modes_info_sort( + const InterModesInfo *inter_modes_info, RdIdxPair *rd_idx_pair_arr) { + if (inter_modes_info->num == 0) { + return; + } + for (int i = 0; i < inter_modes_info->num; ++i) { + rd_idx_pair_arr[i].idx = i; + rd_idx_pair_arr[i].rd = inter_modes_info->est_rd_arr[i]; + } + qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]), + compare_rd_idx_pair); +} + +// Similar to get_horver_correlation, but also takes into account first +// row/column, when computing horizontal/vertical correlation. +void av1_get_horver_correlation_full_c(const int16_t *diff, int stride, + int width, int height, float *hcorr, + float *vcorr) { + // The following notation is used: + // x - current pixel + // y - left neighbor pixel + // z - top neighbor pixel + int64_t x_sum = 0, x2_sum = 0, xy_sum = 0, xz_sum = 0; + int64_t x_firstrow = 0, x_finalrow = 0, x_firstcol = 0, x_finalcol = 0; + int64_t x2_firstrow = 0, x2_finalrow = 0, x2_firstcol = 0, x2_finalcol = 0; + + // First, process horizontal correlation on just the first row + x_sum += diff[0]; + x2_sum += diff[0] * diff[0]; + x_firstrow += diff[0]; + x2_firstrow += diff[0] * diff[0]; + for (int j = 1; j < width; ++j) { + const int16_t x = diff[j]; + const int16_t y = diff[j - 1]; + x_sum += x; + x_firstrow += x; + x2_sum += x * x; + x2_firstrow += x * x; + xy_sum += x * y; + } + + // Process vertical correlation in the first column + x_firstcol += diff[0]; + x2_firstcol += diff[0] * diff[0]; + for (int i = 1; i < height; ++i) { + const int16_t x = diff[i * stride]; + const int16_t z = diff[(i - 1) * stride]; + x_sum += x; + x_firstcol += x; + x2_sum += x * x; + x2_firstcol += x * x; + xz_sum += x * z; + } + + // Now process horiz and vert correlation through the rest unit + for (int i = 1; i < height; ++i) { + for (int j = 1; j < width; ++j) { + const int16_t x = diff[i * stride + j]; + const int16_t y = diff[i * stride + j - 1]; + const int16_t z = diff[(i - 1) * stride + j]; + x_sum += x; + x2_sum += x * x; + xy_sum += x * y; + xz_sum += x * z; + } + } + + for (int j = 0; j < width; ++j) { + x_finalrow += diff[(height - 1) * stride + j]; + x2_finalrow += + diff[(height - 1) * stride + j] * diff[(height - 1) * stride + j]; + } + for (int i = 0; i < height; ++i) { + x_finalcol += diff[i * stride + width - 1]; + x2_finalcol += diff[i * stride + width - 1] * diff[i * stride + width - 1]; + } + + 
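// Form the per-population sums: each direction drops the boundary
+  // column/row without a neighbor, so every population has
+  // height * (width - 1) (or (height - 1) * width) samples.
+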
int64_t xhor_sum = x_sum - x_finalcol; + int64_t xver_sum = x_sum - x_finalrow; + int64_t y_sum = x_sum - x_firstcol; + int64_t z_sum = x_sum - x_firstrow; + int64_t x2hor_sum = x2_sum - x2_finalcol; + int64_t x2ver_sum = x2_sum - x2_finalrow; + int64_t y2_sum = x2_sum - x2_firstcol; + int64_t z2_sum = x2_sum - x2_firstrow; + + const float num_hor = (float)(height * (width - 1)); + const float num_ver = (float)((height - 1) * width); + + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + + if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } else { + *hcorr = 1.0; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } else { + *vcorr = 1.0; + } +} + +static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x, + int64_t *sse_y) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y); + unsigned int sse; + + cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + &sse); + total_sse += sse; + if (!plane && sse_y) *sse_y = sse; + } + total_sse <<= 4; + return total_sse; +} + +int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int i; + int64_t error = 0, sqcoeff = 0; + + for (i = 0; i < block_size; i++) { + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += coeff[i] * coeff[i]; + } + + *ssz = sqcoeff; + return error; +} + +int64_t av1_block_error_lp_c(const int16_t *coeff, const int16_t *dqcoeff, + intptr_t block_size) { + int64_t error = 0; + + for (int i = 0; i < block_size; i++) { + const int diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + } + + return error; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_block_error_c(const tran_low_t *coeff, + const tran_low_t *dqcoeff, intptr_t block_size, + int64_t *ssz, int bd) { + int i; + int64_t error = 0, sqcoeff = 0; + int shift = 2 * (bd - 8); + int rounding = shift > 0 ? 
1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i++) { + const int64_t diff = coeff[i] - dqcoeff[i]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i]; + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} +#endif + +static int conditional_skipintra(PREDICTION_MODE mode, + PREDICTION_MODE best_intra_mode) { + if (mode == D113_PRED && best_intra_mode != V_PRED && + best_intra_mode != D135_PRED) + return 1; + if (mode == D67_PRED && best_intra_mode != V_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D203_PRED && best_intra_mode != H_PRED && + best_intra_mode != D45_PRED) + return 1; + if (mode == D157_PRED && best_intra_mode != H_PRED && + best_intra_mode != D135_PRED) + return 1; + return 0; +} + +static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode, + int16_t mode_context) { + if (is_inter_compound_mode(mode)) { + return x + ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)]; + } + + int mode_cost = 0; + int16_t mode_ctx = mode_context & NEWMV_CTX_MASK; + + assert(is_inter_mode(mode)); + + if (mode == NEWMV) { + mode_cost = x->newmv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost = x->newmv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK; + + if (mode == GLOBALMV) { + mode_cost += x->zeromv_mode_cost[mode_ctx][0]; + return mode_cost; + } else { + mode_cost += x->zeromv_mode_cost[mode_ctx][1]; + mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK; + mode_cost += x->refmv_mode_cost[mode_ctx][mode != NEARESTMV]; + return mode_cost; + } + } +} + +static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode, + int ref_idx) { + return ref_idx ? 
compound_ref1_mode(this_mode) + : compound_ref0_mode(this_mode); +} + +static AOM_INLINE void estimate_ref_frame_costs( + const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x, + int segment_id, unsigned int *ref_costs_single, + unsigned int (*ref_costs_comp)[REF_FRAMES]) { + int seg_ref_active = + segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + if (seg_ref_active) { + memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single)); + int ref_frame; + for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) + memset(ref_costs_comp[ref_frame], 0, + REF_FRAMES * sizeof((*ref_costs_comp)[0])); + } else { + int intra_inter_ctx = av1_get_intra_inter_context(xd); + ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0]; + unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1]; + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + ref_costs_single[i] = base_cost; + + const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd); + const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd); + const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd); + const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd); + const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd); + const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd); + + // Determine cost of a single ref frame, where frame types are represented + // by a tree: + // Level 0: add cost whether this ref is a forward or backward ref + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0]; + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1]; + + // Level 1: if this ref is forward ref, + // add cost whether it is last/last2 or last3/golden + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0]; + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1]; + + // Level 1: if this ref is backward ref + // then add cost whether this ref is altref or backward ref + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0]; + ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1]; + + // Level 2: further add cost whether this ref is last or last2 + ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0]; + ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1]; + + // Level 2: last3 or golden + ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0]; + ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1]; + + // Level 2: bwdref or altref2 + ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0]; + ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1]; + + if (cm->current_frame.reference_mode != SINGLE_REFERENCE) { + // Similar to single ref, determine cost of compound ref frames. 
+ // cost_compound_refs = cost_first_ref + cost_second_ref + const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd); + const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd); + const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd); + const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd); + const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd); + + const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd); + unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 }; + + ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] = + ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1]; + ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0; + ref_bicomp_costs[ALTREF_FRAME] = 0; + + // cost of first ref frame + ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0]; + ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1]; + ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1]; + + ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0]; + ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1]; + + ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0]; + ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1]; + + // cost of second ref frame + ref_bicomp_costs[BWDREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0]; + ref_bicomp_costs[ALTREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1]; + + ref_bicomp_costs[BWDREF_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0]; + ref_bicomp_costs[ALTREF2_FRAME] += + x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1]; + + // cost: if one ref frame is forward ref, the other ref is backward ref + int ref0, ref1; + for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) { + ref_costs_comp[ref0][ref1] = + ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1]; + } + } + + // cost: if both ref frames are the same side. 
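+      // (unidirectional compound prediction: only the LAST/LAST2, LAST/LAST3,
+      // LAST/GOLDEN and BWDREF/ALTREF pairs are assigned costs below)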
+ const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd); + const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd); + const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd); + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0]; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0]; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1]; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = + base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] + + x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1]; + } else { + int ref0, ref1; + for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) { + for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) + ref_costs_comp[ref0][ref1] = 512; + } + ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512; + ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512; + ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512; + ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512; + } + } +} + +static AOM_INLINE void store_coding_context( +#if CONFIG_INTERNAL_STATS + MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index, +#else + MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, +#endif // CONFIG_INTERNAL_STATS + int64_t comp_pred_diff[REFERENCE_MODES], int skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + // Take a snapshot of the coding context so it can be + // restored if we decide to encode this way + ctx->rd_stats.skip = x->force_skip; + ctx->skippable = skippable; +#if CONFIG_INTERNAL_STATS + ctx->best_mode_index = mode_index; +#endif // CONFIG_INTERNAL_STATS + ctx->mic = *xd->mi[0]; + av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE]; + ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE]; + ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT]; +} + +static AOM_INLINE void setup_buffer_ref_mvs_inter( + const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, + BLOCK_SIZE block_size, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref_frame); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, ref_frame); + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame); + assert(yv12 != NULL); + + if (scaled_ref_frame) { + // Setup pred block based on scaled reference, because av1_mv_pred() doesn't + // support scaling. 
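+    // Passing NULL scale factors here appears to treat the (already scaled)
+    // buffer as if it matched the coded frame size, so no re-scaling is done.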
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], scaled_ref_frame, NULL, NULL, + num_planes); + } else { + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); + } + + // Gets an initial list of candidate vectors from neighbours and orders them + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + // Further refinement that is encode side only to test the top few candidates + // in full and choose the best as the center point for subsequent searches. + // The current implementation doesn't support scaling. + av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12_mb[ref_frame][0].stride, + ref_frame, block_size); + + // Go back to unscaled reference. + if (scaled_ref_frame) { + // We had temporarily setup pred block based on scaled reference above. Go + // back to unscaled reference now, for subsequent use. + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, sf, sf, num_planes); + } +} + +#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3) + +// TODO(jingning): this mv clamping function should be block size dependent. +static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + const SubpelMvLimits mv_limits = { xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + + RIGHT_BOTTOM_MARGIN }; + clamp_mv(mv, &mv_limits); +} + +/* If the current mode shares the same mv with other modes with higher cost, + * skip this mode. 
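+ * For example, NEARMV can duplicate NEARESTMV when the reference MV stack
+ * is empty, or GLOBALMV when the stack has a single entry and the global
+ * motion model is no more complex than a translation.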
*/ +static int skip_repeated_mv(const AV1_COMMON *const cm, + const MACROBLOCK *const x, + PREDICTION_MODE this_mode, + const MV_REFERENCE_FRAME ref_frames[2], + InterModeSearchState *search_state) { + const int is_comp_pred = ref_frames[1] > INTRA_FRAME; + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames); + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + PREDICTION_MODE compare_mode = MB_MODE_COUNT; + if (!is_comp_pred) { + if (this_mode == NEARMV) { + if (ref_mv_count == 0) { + // NEARMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // NEARMV has the same motion vector as GLOBALMV + compare_mode = GLOBALMV; + } + } + if (this_mode == GLOBALMV) { + if (ref_mv_count == 0 && + cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) { + // GLOBALMV has the same motion vector as NEARESTMV + compare_mode = NEARESTMV; + } + if (ref_mv_count == 1) { + // GLOBALMV has the same motion vector as NEARMV + compare_mode = NEARMV; + } + } + + if (compare_mode != MB_MODE_COUNT) { + // Use modelled_rd to check whether compare mode was searched + if (search_state->modelled_rd[compare_mode][0][ref_frames[0]] != + INT64_MAX) { + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames); + const int compare_cost = cost_mv_ref(x, compare_mode, mode_ctx); + const int this_cost = cost_mv_ref(x, this_mode, mode_ctx); + + // Only skip if the mode cost is larger than compare mode cost + if (this_cost > compare_cost) { + search_state->modelled_rd[this_mode][0][ref_frames[0]] = + search_state->modelled_rd[compare_mode][0][ref_frames[0]]; + return 1; + } + } + } + } + return 0; +} + +static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv, + const AV1_COMMON *cm, + const MACROBLOCK *x) { + const MACROBLOCKD *const xd = &x->e_mbd; + *out_mv = in_mv; + lower_mv_precision(&out_mv->as_mv, cm->features.allow_high_precision_mv, + cm->features.cur_frame_force_integer_mv); + clamp_mv2(&out_mv->as_mv, xd); + return av1_is_fullmv_in_range(&x->mv_limits, + get_fullmv_from_mv(&out_mv->as_mv)); +} + +// To use single newmv directly for compound modes, need to clamp the mv to the +// valid mv range. Without this, encoder would generate out of range mv, and +// this is seen in 8k encoding. +static INLINE void clamp_mv_in_range(MACROBLOCK *const x, int_mv *mv, + int ref_idx) { + const int_mv ref_mv = av1_get_ref_mv(x, ref_idx); + SubpelMvLimits mv_limits; + + av1_set_subpel_mv_search_range(&mv_limits, &x->mv_limits, &ref_mv.as_mv); + clamp_mv(&mv->as_mv, &mv_limits); +} + +static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, int_mv *cur_mv, + int *const rate_mv, HandleInterModeArgs *const args, + inter_mode_info *mode_info) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_comp_pred = has_second_ref(mbmi); + const PREDICTION_MODE this_mode = mbmi->mode; + const int refs[2] = { mbmi->ref_frame[0], + mbmi->ref_frame[1] < 0 ? 
0 : mbmi->ref_frame[1] }; + const int ref_mv_idx = mbmi->ref_mv_idx; + + if (is_comp_pred) { + const int valid_mv0 = args->single_newmv_valid[ref_mv_idx][refs[0]]; + const int valid_mv1 = args->single_newmv_valid[ref_mv_idx][refs[1]]; + + if (this_mode == NEW_NEWMV) { + if (valid_mv0) { + cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int; + clamp_mv_in_range(x, &cur_mv[0], 0); + } + if (valid_mv1) { + cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int; + clamp_mv_in_range(x, &cur_mv[1], 1); + } + + // aomenc1 + if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize || + !valid_mv0 || !valid_mv1) { + av1_joint_motion_search(cpi, x, bsize, cur_mv, NULL, 0, rate_mv); + } else { + *rate_mv = 0; + for (int i = 0; i < 2; ++i) { + const int_mv ref_mv = av1_get_ref_mv(x, i); + *rate_mv += + av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + } + } + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + if (valid_mv1) { + cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int; + clamp_mv_in_range(x, &cur_mv[1], 1); + } + + // aomenc2 + if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize || + !valid_mv1) { + av1_compound_single_motion_search_interinter(cpi, x, bsize, cur_mv, + NULL, 0, rate_mv, 1); + } else { + const int_mv ref_mv = av1_get_ref_mv(x, 1); + *rate_mv = + av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + } + } else { + assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV); + if (valid_mv0) { + cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int; + clamp_mv_in_range(x, &cur_mv[0], 0); + } + + // aomenc3 + if (cpi->sf.inter_sf.comp_inter_joint_search_thresh <= bsize || + !valid_mv0) { + av1_compound_single_motion_search_interinter(cpi, x, bsize, cur_mv, + NULL, 0, rate_mv, 0); + } else { + const int_mv ref_mv = av1_get_ref_mv(x, 0); + *rate_mv = + av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv, x->nmv_vec_cost, + x->mv_cost_stack, MV_COST_WEIGHT); + } + } + } else { + // Single ref case. + const int ref_idx = 0; + int search_range = INT_MAX; + + if (cpi->sf.mv_sf.reduce_search_range && mbmi->ref_mv_idx > 0) { + const MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv; + int min_mv_diff = INT_MAX; + int best_match = -1; + MV prev_ref_mv[2] = { { 0 } }; + for (int idx = 0; idx < mbmi->ref_mv_idx; ++idx) { + prev_ref_mv[idx] = av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, + idx, x->mbmi_ext) + .as_mv; + const int ref_mv_diff = AOMMAX(abs(ref_mv.row - prev_ref_mv[idx].row), + abs(ref_mv.col - prev_ref_mv[idx].col)); + + if (min_mv_diff > ref_mv_diff) { + min_mv_diff = ref_mv_diff; + best_match = idx; + } + } + + if (min_mv_diff < (16 << 3)) { + if (args->single_newmv_valid[best_match][refs[0]]) { + search_range = min_mv_diff; + search_range += + AOMMAX(abs(args->single_newmv[best_match][refs[0]].as_mv.row - + prev_ref_mv[best_match].row), + abs(args->single_newmv[best_match][refs[0]].as_mv.col - + prev_ref_mv[best_match].col)); + // Get full pixel search range. 
+          search_range = (search_range + 4) >> 3;
+        }
+      }
+    }
+
+    int_mv best_mv;
+    av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range,
+                             mode_info, &best_mv);
+    if (best_mv.as_int == INVALID_MV) return INT64_MAX;
+
+    args->single_newmv[ref_mv_idx][refs[0]] = best_mv;
+    args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv;
+    args->single_newmv_valid[ref_mv_idx][refs[0]] = 1;
+    cur_mv[0].as_int = best_mv.as_int;
+  }
+
+  return 0;
+}
+
+// If the number of valid neighbours is 1:
+// 1) ROTZOOM parameters can be obtained reliably (2 parameters from
+//    one neighbouring MV).
+// 2) For IDENTITY/TRANSLATION cases, warp can perform better due to
+//    a different interpolation filter being used; however, the quality
+//    gains from this may be small.
+// Warp evaluation is skipped in both of the above cases.
+
+static int check_if_optimal_warp(const AV1_COMP *cpi,
+                                 WarpedMotionParams *wm_params,
+                                 int num_proj_ref) {
+  int is_valid_warp = 1;
+  if (cpi->sf.inter_sf.prune_warp_using_wmtype) {
+    TransformationType wmtype = get_wmtype(wm_params);
+    if (num_proj_ref == 1) {
+      if (wmtype != ROTZOOM) is_valid_warp = 0;
+    } else {
+      if (wmtype < ROTZOOM) is_valid_warp = 0;
+    }
+  }
+  return is_valid_warp;
+}
+
+static INLINE void update_mode_start_end_index(const AV1_COMP *const cpi,
+                                               int *mode_index_start,
+                                               int *mode_index_end,
+                                               int last_motion_mode_allowed,
+                                               int interintra_allowed,
+                                               int eval_motion_mode) {
+  *mode_index_start = (int)SIMPLE_TRANSLATION;
+  *mode_index_end = (int)last_motion_mode_allowed + interintra_allowed;
+  if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
+    if (!eval_motion_mode) {
+      *mode_index_end = (int)SIMPLE_TRANSLATION;
+    } else {
+      // Set the start index appropriately to process motion modes other than
+      // simple translation
+      *mode_index_start = 1;
+    }
+  }
+}
+
+// TODO(afergs): Refactor the MBMI references in here - there are four
+// TODO(afergs): Refactor optional args - add them to a struct or remove
+static int64_t motion_mode_rd(
+    const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x,
+    BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+    RD_STATS *rd_stats_uv, int *disable_skip, HandleInterModeArgs *const args,
+    int64_t ref_best_rd, int64_t *ref_skip_rd, int *rate_mv,
+    const BUFFER_SET *orig_dst, int64_t *best_est_rd, int do_tx_search,
+    InterModesInfo *inter_modes_info, int eval_motion_mode) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const FeatureFlags *const features = &cm->features;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_comp_pred = has_second_ref(mbmi);
+  const PREDICTION_MODE this_mode = mbmi->mode;
+  const int rate2_nocoeff = rd_stats->rate;
+  int best_xskip = 0, best_disable_skip = 0;
+  RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  const int rate_mv0 = *rate_mv;
+  const int interintra_allowed = cm->seq_params.enable_interintra_compound &&
+                                 is_interintra_allowed(mbmi) &&
+                                 mbmi->compound_idx;
+  int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
+
+  assert(mbmi->ref_frame[1] != INTRA_FRAME);
+  const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
+  (void)tile_data;
+  av1_invalid_rd_stats(&best_rd_stats);
+  aom_clear_system_state();
+  mbmi->num_proj_ref = 1;  // assume num_proj_ref >= 1
+  MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+  if (features->switchable_motion_mode) {
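+    // motion_mode_allowed() returns the most complex motion mode this block
+    // may use (SIMPLE_TRANSLATION < OBMC_CAUSAL < WARPED_CAUSAL); the mode
+    // loop below then walks every mode up to that bound (plus inter-intra,
+    // when allowed).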
last_motion_mode_allowed = motion_mode_allowed( + xd->global_motion, xd, mbmi, features->allow_warped_motion); + } + + if (last_motion_mode_allowed == WARPED_CAUSAL) { + mbmi->num_proj_ref = av1_findSamples(cm, xd, pts0, pts_inref0); + } + const int total_samples = mbmi->num_proj_ref; + if (total_samples == 0) { + last_motion_mode_allowed = OBMC_CAUSAL; + } + + const MB_MODE_INFO base_mbmi = *mbmi; + MB_MODE_INFO best_mbmi; + SimpleRDState *const simple_states = &args->simple_rd_state[mbmi->ref_mv_idx]; + const int interp_filter = features->interp_filter; + const int switchable_rate = + av1_is_interp_needed(xd) ? av1_get_switchable_rate(x, xd, interp_filter) + : 0; + int64_t best_rd = INT64_MAX; + int best_rate_mv = rate_mv0; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + int mode_index_start, mode_index_end; + update_mode_start_end_index(cpi, &mode_index_start, &mode_index_end, + last_motion_mode_allowed, interintra_allowed, + eval_motion_mode); + for (int mode_index = mode_index_start; mode_index <= mode_index_end; + mode_index++) { + if (args->skip_motion_mode && mode_index) continue; + if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans && + args->single_ref_first_pass && mode_index) + break; + int tmp_rate2 = rate2_nocoeff; + const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed; + int tmp_rate_mv = rate_mv0; + + *mbmi = base_mbmi; + if (is_interintra_mode) { + mbmi->motion_mode = SIMPLE_TRANSLATION; + } else { + mbmi->motion_mode = (MOTION_MODE)mode_index; + assert(mbmi->ref_frame[1] != INTRA_FRAME); + } + + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] < + cpi->sf.inter_sf.prune_obmc_prob_thresh; + if ((cpi->oxcf.enable_obmc == 0 || cpi->sf.inter_sf.disable_obmc || + cpi->sf.rt_sf.use_nonrd_pick_mode || prune_obmc) && + mbmi->motion_mode == OBMC_CAUSAL) + continue; + + if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) { + // SIMPLE_TRANSLATION mode: no need to recalculate. 
+ // The prediction is calculated before motion_mode_rd() is called in + // handle_inter_mode() + if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans && + !is_comp_pred) { + if (args->single_ref_first_pass == 0) { + if (simple_states->early_skipped) { + assert(simple_states->rd_stats.rdcost == INT64_MAX); + return INT64_MAX; + } + if (simple_states->rd_stats.rdcost != INT64_MAX) { + best_rd = simple_states->rd_stats.rdcost; + best_rd_stats = simple_states->rd_stats; + best_rd_stats_y = simple_states->rd_stats_y; + best_rd_stats_uv = simple_states->rd_stats_uv; + memcpy(best_blk_skip, simple_states->blk_skip, + sizeof(x->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, simple_states->tx_type_map, + xd->height * xd->width); + best_xskip = simple_states->skip; + best_disable_skip = simple_states->disable_skip; + best_mbmi = *mbmi; + } + continue; + } + simple_states->early_skipped = 0; + } + } else if (mbmi->motion_mode == OBMC_CAUSAL) { + const uint32_t cur_mv = mbmi->mv[0].as_int; + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { + av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL, + &mbmi->mv[0]); + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; + } + if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + 0, av1_num_planes(cm) - 1); + } + av1_build_obmc_inter_prediction( + cm, xd, args->above_pred_buf, args->above_pred_stride, + args->left_pred_buf, args->left_pred_stride); + } else if (mbmi->motion_mode == WARPED_CAUSAL) { + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + mbmi->motion_mode = WARPED_CAUSAL; + mbmi->wm_params.wmtype = DEFAULT_WMTYPE; + mbmi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + + memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0)); + memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0)); + // Select the samples according to motion vector difference + if (mbmi->num_proj_ref > 1) { + mbmi->num_proj_ref = av1_selectSamples( + &mbmi->mv[0].as_mv, pts, pts_inref, mbmi->num_proj_ref, bsize); + } + + if (!av1_find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize, + mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col, + &mbmi->wm_params, mi_row, mi_col)) { + // Refine MV for NEWMV mode + assert(!is_comp_pred); + if (have_newmv_in_inter_mode(this_mode)) { + const int_mv mv0 = mbmi->mv[0]; + const WarpedMotionParams wm_params0 = mbmi->wm_params; + const int num_proj_ref0 = mbmi->num_proj_ref; + + if (cpi->sf.inter_sf.prune_warp_using_wmtype) { + TransformationType wmtype = get_wmtype(&mbmi->wm_params); + if (wmtype < ROTZOOM) continue; + } + + const int_mv ref_mv = av1_get_ref_mv(x, 0); + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, + &ref_mv.as_mv, NULL); + + // Refine MV in a small range. + av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0, + total_samples); + + // Keep the refined MV and WM parameters. + if (mv0.as_int != mbmi->mv[0].as_int) { + tmp_rate_mv = av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv.as_mv, + x->nmv_vec_cost, x->mv_cost_stack, + MV_COST_WEIGHT); + if (cpi->sf.mv_sf.adaptive_motion_search) { + x->pred_mv[mbmi->ref_frame[0]] = mbmi->mv[0].as_mv; + } + tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; + } else { + // Restore the old MV and WM parameters. 
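+          // (av1_refine_warped_mv() ended on the starting MV, so the warp
+          // model it recomputed along the way is discarded in favour of the
+          // saved copies.)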
+ mbmi->mv[0] = mv0; + mbmi->wm_params = wm_params0; + mbmi->num_proj_ref = num_proj_ref0; + } + } else { + if (!check_if_optimal_warp(cpi, &mbmi->wm_params, mbmi->num_proj_ref)) + continue; + } + + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + } else { + continue; + } + } else if (is_interintra_mode) { + const int ret = + av1_handle_inter_intra_mode(cpi, x, bsize, mbmi, args, ref_best_rd, + &tmp_rate_mv, &tmp_rate2, orig_dst); + if (ret < 0) continue; + } + + // If we are searching newmv and the mv is the same as refmv, skip the + // current mode + if (this_mode == NEW_NEWMV) { + const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); + const int_mv ref_mv_1 = av1_get_ref_mv(x, 1); + if (mbmi->mv[0].as_int == ref_mv_0.as_int || + mbmi->mv[1].as_int == ref_mv_1.as_int) { + continue; + } + } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) { + const int_mv ref_mv_1 = av1_get_ref_mv(x, 1); + if (mbmi->mv[1].as_int == ref_mv_1.as_int) { + continue; + } + } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) { + const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); + if (mbmi->mv[0].as_int == ref_mv_0.as_int) { + continue; + } + } else if (this_mode == NEWMV) { + const int_mv ref_mv_0 = av1_get_ref_mv(x, 0); + if (mbmi->mv[0].as_int == ref_mv_0.as_int) { + continue; + } + } + + x->force_skip = 0; + rd_stats->dist = 0; + rd_stats->sse = 0; + rd_stats->skip = 1; + rd_stats->rate = tmp_rate2; + if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate; + if (interintra_allowed) { + rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]] + [mbmi->ref_frame[1] == INTRA_FRAME]; + } + if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) && + (mbmi->ref_frame[1] != INTRA_FRAME)) { + if (last_motion_mode_allowed == WARPED_CAUSAL) { + rd_stats->rate += x->motion_mode_cost[bsize][mbmi->motion_mode]; + } else { + rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode]; + } + } + + if (!do_tx_search) { + int64_t curr_sse = -1; + int64_t sse_y = -1; + int est_residue_cost = 0; + int64_t est_dist = 0; + int64_t est_rd = 0; + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + curr_sse = get_sse(cpi, x, &sse_y); + // Scale luma SSE as per bit depth so as to be consistent with + // model_rd_sb_fn and compound type rd + sse_y = ROUND_POWER_OF_TWO(sse_y, (xd->bd - 8) * 2); + const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse, + &est_residue_cost, &est_dist); + (void)has_est_rd; + assert(has_est_rd); + } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 || + cpi->sf.rt_sf.use_nonrd_pick_mode) { + model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD]( + cpi, bsize, x, xd, 0, num_planes - 1, &est_residue_cost, &est_dist, + NULL, &curr_sse, NULL, NULL, NULL); + sse_y = x->pred_sse[xd->mi[0]->ref_frame[0]]; + } + est_rd = RDCOST(x->rdmult, rd_stats->rate + est_residue_cost, est_dist); + if (est_rd * 0.80 > *best_est_rd) { + mbmi->ref_frame[1] = ref_frame_1; + continue; + } + const int mode_rate = rd_stats->rate; + rd_stats->rate += est_residue_cost; + rd_stats->dist = est_dist; + rd_stats->rdcost = est_rd; + if (rd_stats->rdcost < *best_est_rd) { + *best_est_rd = rd_stats->rdcost; + assert(sse_y >= 0); + ref_skip_rd[1] = cpi->sf.inter_sf.txfm_rd_gate_level + ? 
RDCOST(x->rdmult, mode_rate, (sse_y << 4)) + : INT64_MAX; + } + if (cm->current_frame.reference_mode == SINGLE_REFERENCE) { + if (!is_comp_pred) { + assert(curr_sse >= 0); + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, rd_stats, rd_stats_y, + rd_stats_uv, mbmi); + } + } else { + assert(curr_sse >= 0); + inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, + rd_stats->rdcost, rd_stats, rd_stats_y, + rd_stats_uv, mbmi); + } + mbmi->skip = 0; + } else { + int64_t skip_rd = INT64_MAX; + int64_t skip_rdy = INT64_MAX; + if (cpi->sf.inter_sf.txfm_rd_gate_level) { + // Check if the mode is good enough based on skip RD + int64_t sse_y = INT64_MAX; + int64_t curr_sse = get_sse(cpi, x, &sse_y); + // Scale luma SSE as per bit depth so as to be consistent with + // model_rd_sb_fn and compound type rd + sse_y = ROUND_POWER_OF_TWO(sse_y, (xd->bd - 8) * 2); + skip_rd = RDCOST(x->rdmult, rd_stats->rate, curr_sse); + skip_rdy = RDCOST(x->rdmult, rd_stats->rate, (sse_y << 4)); + int eval_txfm = check_txfm_eval(x, bsize, ref_skip_rd[0], skip_rd, + cpi->sf.inter_sf.txfm_rd_gate_level, 0); + if (!eval_txfm) continue; + } + + if (!av1_txfm_search(cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, + rd_stats->rate, ref_best_rd)) { + if (rd_stats_y->rate == INT_MAX && mode_index == 0) { + if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans && + !is_comp_pred) { + simple_states->early_skipped = 1; + } + return INT64_MAX; + } + continue; + } + + const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (curr_rd < ref_best_rd) { + ref_best_rd = curr_rd; + ref_skip_rd[0] = skip_rd; + ref_skip_rd[1] = skip_rdy; + } + *disable_skip = 0; + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + const int skip_ctx = av1_get_skip_context(xd); + inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse, + rd_stats->dist, + rd_stats_y->rate + rd_stats_uv->rate + + x->skip_cost[skip_ctx][mbmi->skip]); + } + } + + if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) { + if (is_nontrans_global_motion(xd, xd->mi[0])) { + mbmi->interp_filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + } + } + + const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (mode_index == 0) { + args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd; + if (!is_comp_pred) { + simple_states->rd_stats = *rd_stats; + simple_states->rd_stats.rdcost = tmp_rd; + simple_states->rd_stats_y = *rd_stats_y; + simple_states->rd_stats_uv = *rd_stats_uv; + memcpy(simple_states->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(simple_states->tx_type_map, xd->tx_type_map, + xd->height * xd->width); + simple_states->skip = mbmi->skip; + simple_states->disable_skip = *disable_skip; + } + } + if (mode_index == 0 || tmp_rd < best_rd) { + best_mbmi = *mbmi; + best_rd = tmp_rd; + best_rd_stats = *rd_stats; + best_rd_stats_y = *rd_stats_y; + best_rate_mv = tmp_rate_mv; + if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv; + memcpy(best_blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width); + best_xskip = mbmi->skip; + best_disable_skip = *disable_skip; + // TODO(anyone): evaluate the quality and speed trade-off of the early + // termination logic below. 
+ // if (best_xskip) break; + } + } + mbmi->ref_frame[1] = ref_frame_1; + *rate_mv = best_rate_mv; + if (best_rd == INT64_MAX) { + av1_invalid_rd_stats(rd_stats); + restore_dst_buf(xd, *orig_dst, num_planes); + return INT64_MAX; + } + *mbmi = best_mbmi; + *rd_stats = best_rd_stats; + *rd_stats_y = best_rd_stats_y; + if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv; + memcpy(x->blk_skip, best_blk_skip, + sizeof(x->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width); + x->force_skip = best_xskip; + *disable_skip = best_disable_skip; + + restore_dst_buf(xd, *orig_dst, num_planes); + return 0; +} + +static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, + MACROBLOCK *const x, BLOCK_SIZE bsize, + const BUFFER_SET *const orig_dst) { + assert(bsize < BLOCK_SIZES_ALL); + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, 0, + av1_num_planes(cm) - 1); + + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + av1_subtract_plane(x, plane_bsize, plane); + int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh) << 4; + total_sse += sse; + } + const int skip_mode_ctx = av1_get_skip_mode_context(xd); + rd_stats->dist = rd_stats->sse = total_sse; + rd_stats->rate = x->skip_mode_cost[skip_mode_ctx][1]; + rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + + restore_dst_buf(xd, *orig_dst, num_planes); + return 0; +} + +// Check NEARESTMV, NEARMV, GLOBALMV ref mvs for duplicate and skip the relevant +// mode +static INLINE int check_repeat_ref_mv(const MB_MODE_INFO_EXT *mbmi_ext, + int ref_idx, + const MV_REFERENCE_FRAME *ref_frame, + PREDICTION_MODE single_mode) { + const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + assert(single_mode != NEWMV); + if (single_mode == NEARESTMV) { + return 0; + } else if (single_mode == NEARMV) { + // when ref_mv_count = 0, NEARESTMV and NEARMV are same as GLOBALMV + // when ref_mv_count = 1, NEARMV is same as GLOBALMV + if (ref_mv_count < 2) return 1; + } else if (single_mode == GLOBALMV) { + // when ref_mv_count == 0, GLOBALMV is same as NEARESTMV + if (ref_mv_count == 0) return 1; + // when ref_mv_count == 1, NEARMV is same as GLOBALMV + else if (ref_mv_count == 1) + return 0; + + int stack_size = AOMMIN(USABLE_REF_MV_STACK_SIZE, ref_mv_count); + // Check GLOBALMV is matching with any mv in ref_mv_stack + for (int ref_mv_idx = 0; ref_mv_idx < stack_size; ref_mv_idx++) { + int_mv this_mv; + + if (ref_idx == 0) + this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv; + else + this_mv = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv; + + if (this_mv.as_int == mbmi_ext->global_mvs[ref_frame[ref_idx]].as_int) + return 1; + } + } + return 0; +} + +static INLINE int get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode, + int ref_idx, int ref_mv_idx, + int skip_repeated_ref_mv, + const MV_REFERENCE_FRAME *ref_frame, + 
                              const MB_MODE_INFO_EXT *mbmi_ext) {
+  const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx);
+  assert(is_inter_singleref_mode(single_mode));
+  if (single_mode == NEWMV) {
+    this_mv->as_int = INVALID_MV;
+  } else if (single_mode == GLOBALMV) {
+    if (skip_repeated_ref_mv &&
+        check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
+      return 0;
+    *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+  } else {
+    assert(single_mode == NEARMV || single_mode == NEARESTMV);
+    const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+    const int ref_mv_offset = single_mode == NEARESTMV ? 0 : ref_mv_idx + 1;
+    if (ref_mv_offset < mbmi_ext->ref_mv_count[ref_frame_type]) {
+      assert(ref_mv_offset >= 0);
+      if (ref_idx == 0) {
+        *this_mv =
+            mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].this_mv;
+      } else {
+        *this_mv =
+            mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv;
+      }
+    } else {
+      if (skip_repeated_ref_mv &&
+          check_repeat_ref_mv(mbmi_ext, ref_idx, ref_frame, single_mode))
+        return 0;
+      *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+    }
+  }
+  return 1;
+}
+
+// This function updates the non-new MVs for the current prediction mode.
+static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
+                               const AV1_COMMON *cm, const MACROBLOCK *x,
+                               int skip_repeated_ref_mv) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_comp_pred = has_second_ref(mbmi);
+
+  int ret = 1;
+  for (int i = 0; i < is_comp_pred + 1; ++i) {
+    int_mv this_mv;
+    this_mv.as_int = INVALID_MV;
+    ret = get_this_mv(&this_mv, this_mode, i, mbmi->ref_mv_idx,
+                      skip_repeated_ref_mv, mbmi->ref_frame, x->mbmi_ext);
+    if (!ret) return 0;
+    const PREDICTION_MODE single_mode = get_single_mode(this_mode, i);
+    if (single_mode == NEWMV) {
+      const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+      cur_mv[i] =
+          (i == 0) ?
x->mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] + .this_mv + : x->mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx] + .comp_mv; + } else { + ret &= clamp_and_check_mv(cur_mv + i, this_mv, cm, x); + } + } + return ret; +} + +static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi, + const MB_MODE_INFO_EXT *mbmi_ext, + const int (*const drl_mode_cost0)[2], + int8_t ref_frame_type) { + int cost = 0; + if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) { + for (int idx = 0; idx < 2; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != idx]; + if (mbmi->ref_mv_idx == idx) return cost; + } + } + return cost; + } + + if (have_nearmv_in_inter_mode(mbmi->mode)) { + for (int idx = 1; idx < 3; ++idx) { + if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) { + uint8_t drl_ctx = av1_drl_ctx(mbmi_ext->weight[ref_frame_type], idx); + cost += drl_mode_cost0[drl_ctx][mbmi->ref_mv_idx != (idx - 1)]; + if (mbmi->ref_mv_idx == (idx - 1)) return cost; + } + } + return cost; + } + return cost; +} + +static INLINE int is_single_newmv_valid(const HandleInterModeArgs *const args, + const MB_MODE_INFO *const mbmi, + PREDICTION_MODE this_mode) { + for (int ref_idx = 0; ref_idx < 2; ++ref_idx) { + const PREDICTION_MODE single_mode = get_single_mode(this_mode, ref_idx); + const MV_REFERENCE_FRAME ref = mbmi->ref_frame[ref_idx]; + if (single_mode == NEWMV && + args->single_newmv_valid[mbmi->ref_mv_idx][ref] == 0) { + return 0; + } + } + return 1; +} + +static int get_drl_refmv_count(const MACROBLOCK *const x, + const MV_REFERENCE_FRAME *ref_frame, + PREDICTION_MODE mode) { + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(ref_frame); + const int has_nearmv = have_nearmv_in_inter_mode(mode) ? 1 : 0; + const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + const int only_newmv = (mode == NEWMV || mode == NEW_NEWMV); + const int has_drl = + (has_nearmv && ref_mv_count > 2) || (only_newmv && ref_mv_count > 1); + const int ref_set = + has_drl ? AOMMIN(MAX_REF_MV_SEARCH, ref_mv_count - has_nearmv) : 1; + + return ref_set; +} + +// Whether this reference motion vector can be skipped, based on initial +// heuristics. +static bool ref_mv_idx_early_breakout(const AV1_COMP *const cpi, MACROBLOCK *x, + const HandleInterModeArgs *const args, + int64_t ref_best_rd, int ref_mv_idx) { + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + const int is_comp_pred = has_second_ref(mbmi); + if (sf->inter_sf.reduce_inter_modes && ref_mv_idx > 0) { + if (mbmi->ref_frame[0] == LAST2_FRAME || + mbmi->ref_frame[0] == LAST3_FRAME || + mbmi->ref_frame[1] == LAST2_FRAME || + mbmi->ref_frame[1] == LAST3_FRAME) { + const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; + if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] < + REF_CAT_LEVEL) { + return true; + } + } + // TODO(any): Experiment with reduce_inter_modes for compound prediction + if (sf->inter_sf.reduce_inter_modes >= 2 && !is_comp_pred && + have_newmv_in_inter_mode(mbmi->mode)) { + if (mbmi->ref_frame[0] != cpi->nearest_past_ref && + mbmi->ref_frame[0] != cpi->nearest_future_ref) { + const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 
1 : 0; + if (mbmi_ext->weight[ref_frame_type][ref_mv_idx + has_nearmv] < + REF_CAT_LEVEL) { + return true; + } + } + } + } + if (sf->inter_sf.prune_single_motion_modes_by_simple_trans && !is_comp_pred && + args->single_ref_first_pass == 0) { + if (args->simple_rd_state[ref_mv_idx].early_skipped) { + return true; + } + } + mbmi->ref_mv_idx = ref_mv_idx; + if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, mbmi->mode))) { + return true; + } + size_t est_rd_rate = args->ref_frame_cost + args->single_comp_cost; + const int drl_cost = + get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); + est_rd_rate += drl_cost; + if (RDCOST(x->rdmult, est_rd_rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + return true; + } + return false; +} + +// Compute the estimated RD cost for the motion vector with simple translation. +static int64_t simple_translation_pred_rd( + AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, + HandleInterModeArgs *args, int ref_mv_idx, inter_mode_info *mode_info, + int64_t ref_best_rd, BLOCK_SIZE bsize) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + const AV1_COMMON *cm = &cpi->common; + const int is_comp_pred = has_second_ref(mbmi); + + struct macroblockd_plane *p = xd->plane; + const BUFFER_SET orig_dst = { + { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, + { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + }; + av1_init_rd_stats(rd_stats); + + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + if (mbmi->ref_frame[1] == INTRA_FRAME) { + mbmi->ref_frame[1] = NONE_FRAME; + } + int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); + + mbmi->num_proj_ref = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->ref_mv_idx = ref_mv_idx; + + rd_stats->rate += args->ref_frame_cost + args->single_comp_cost; + const int drl_cost = + get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); + rd_stats->rate += drl_cost; + mode_info[ref_mv_idx].drl_cost = drl_cost; + + int_mv cur_mv[2]; + if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) { + return INT64_MAX; + } + assert(have_nearmv_in_inter_mode(mbmi->mode)); + for (int i = 0; i < is_comp_pred + 1; ++i) { + mbmi->mv[i].as_int = cur_mv[i].as_int; + } + const int ref_mv_cost = cost_mv_ref(x, mbmi->mode, mode_ctx); + rd_stats->rate += ref_mv_cost; + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd) { + return INT64_MAX; + } + + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->num_proj_ref = 0; + if (is_comp_pred) { + // Only compound_average + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + } + set_default_interp_filters(mbmi, cm->features.interp_filter); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + int est_rate; + int64_t est_dist; + model_rd_sb_fn[MODELRD_CURVFIT](cpi, bsize, x, xd, 0, 0, &est_rate, &est_dist, + NULL, NULL, NULL, NULL, NULL); + return RDCOST(x->rdmult, rd_stats->rate + est_rate, est_dist); +} + +// Represents a set of integers, from 0 to sizeof(int) * 8, as bits in +// an integer. 0 for the i-th bit means that integer is excluded, 1 means +// it is included. 
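+// For example, starting from mask = 0, calling mask_set_bit(&mask, 0) and
+// then mask_set_bit(&mask, 2) leaves mask == 0x5, after which
+// mask_check_bit(mask, 2) is true and mask_check_bit(mask, 1) is false.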
+static INLINE void mask_set_bit(int *mask, int index) { *mask |= (1 << index); } + +static INLINE bool mask_check_bit(int mask, int index) { + return (mask >> index) & 0x1; +} + +// Before performing the full MV search in handle_inter_mode, do a simple +// translation search and see if we can eliminate any motion vectors. +// Returns an integer where, if the i-th bit is set, it means that the i-th +// motion vector should be searched. This is only set for NEAR_MV. +static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, + HandleInterModeArgs *const args, + int64_t ref_best_rd, inter_mode_info *mode_info, + BLOCK_SIZE bsize, const int ref_set) { + AV1_COMMON *const cm = &cpi->common; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + + // Only search indices if they have some chance of being good. + int good_indices = 0; + for (int i = 0; i < ref_set; ++i) { + if (ref_mv_idx_early_breakout(cpi, x, args, ref_best_rd, i)) { + continue; + } + mask_set_bit(&good_indices, i); + } + + // Only prune in NEARMV mode, if the speed feature is set, and the block size + // is large enough. If these conditions are not met, return all good indices + // found so far. + if (!cpi->sf.inter_sf.prune_mode_search_simple_translation) + return good_indices; + if (!have_nearmv_in_inter_mode(this_mode)) return good_indices; + if (num_pels_log2_lookup[bsize] <= 6) return good_indices; + // Do not prune when there is internal resizing. TODO(elliottk) fix this + // so b/2384 can be resolved. + if (av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[0])) || + (mbmi->ref_frame[1] > 0 && + av1_is_scaled(get_ref_scale_factors(cm, mbmi->ref_frame[1])))) { + return good_indices; + } + + // Calculate the RD cost for the motion vectors using simple translation. + int64_t idx_rdcost[] = { INT64_MAX, INT64_MAX, INT64_MAX }; + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + // If this index is bad, ignore it. + if (!mask_check_bit(good_indices, ref_mv_idx)) { + continue; + } + idx_rdcost[ref_mv_idx] = simple_translation_pred_rd( + cpi, x, rd_stats, args, ref_mv_idx, mode_info, ref_best_rd, bsize); + } + // Find the index with the best RD cost. + int best_idx = 0; + for (int i = 1; i < MAX_REF_MV_SEARCH; ++i) { + if (idx_rdcost[i] < idx_rdcost[best_idx]) { + best_idx = i; + } + } + // Only include indices that are good and within a % of the best. + const double dth = has_second_ref(mbmi) ? 1.05 : 1.001; + // If the simple translation cost is not within this multiple of the + // best RD, skip it. Note that the cutoff is derived experimentally. 
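+  // As a concrete reading of the two cutoffs: with a single reference
+  // (dth == 1.001) an index survives only if its simple-translation RD is
+  // within 0.1% of the best index's, while for compound prediction
+  // (dth == 1.05) anything within 5% is kept; independently, an index is
+  // dropped when its estimate exceeds ref_dth times ref_best_rd.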
+ const double ref_dth = 5; + int result = 0; + for (int i = 0; i < ref_set; ++i) { + if (mask_check_bit(good_indices, i) && + (1.0 * idx_rdcost[i]) / idx_rdcost[best_idx] < dth && + (1.0 * idx_rdcost[i]) / ref_best_rd < ref_dth) { + mask_set_bit(&result, i); + } + } + return result; +} + +typedef struct motion_mode_candidate { + MB_MODE_INFO mbmi; + int rate_mv; + int rate2_nocoeff; + int skip_motion_mode; + int64_t rd_cost; +} motion_mode_candidate; + +typedef struct motion_mode_best_st_candidate { + motion_mode_candidate motion_mode_cand[MAX_WINNER_MOTION_MODES]; + int num_motion_mode_cand; +} motion_mode_best_st_candidate; + +// Checks if the current reference frame matches with neighbouring block's +// (top/left) reference frames +static AOM_INLINE int ref_match_found_in_nb_blocks(MB_MODE_INFO *cur_mbmi, + MB_MODE_INFO *nb_mbmi) { + MV_REFERENCE_FRAME nb_ref_frames[2] = { nb_mbmi->ref_frame[0], + nb_mbmi->ref_frame[1] }; + MV_REFERENCE_FRAME cur_ref_frames[2] = { cur_mbmi->ref_frame[0], + cur_mbmi->ref_frame[1] }; + const int is_cur_comp_pred = has_second_ref(cur_mbmi); + int match_found = 0; + + for (int i = 0; i < (is_cur_comp_pred + 1); i++) { + if ((cur_ref_frames[i] == nb_ref_frames[0]) || + (cur_ref_frames[i] == nb_ref_frames[1])) + match_found = 1; + } + return match_found; +} + +static AOM_INLINE int find_ref_match_in_above_nbs(const int total_mi_cols, + MACROBLOCKD *xd) { + if (!xd->up_available) return 0; + const int mi_col = xd->mi_col; + MB_MODE_INFO **cur_mbmi = xd->mi; + // prev_row_mi points into the mi array, starting at the beginning of the + // previous row. + MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride; + const int end_col = AOMMIN(mi_col + xd->width, total_mi_cols); + uint8_t mi_step; + for (int above_mi_col = mi_col; above_mi_col < end_col; + above_mi_col += mi_step) { + MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col; + mi_step = mi_size_wide[above_mi[0]->sb_type]; + int match_found = 0; + if (is_inter_block(*above_mi)) + match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *above_mi); + if (match_found) return 1; + } + return 0; +} + +static AOM_INLINE int find_ref_match_in_left_nbs(const int total_mi_rows, + MACROBLOCKD *xd) { + if (!xd->left_available) return 0; + const int mi_row = xd->mi_row; + MB_MODE_INFO **cur_mbmi = xd->mi; + // prev_col_mi points into the mi array, starting at the top of the + // previous column + MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride; + const int end_row = AOMMIN(mi_row + xd->height, total_mi_rows); + uint8_t mi_step; + for (int left_mi_row = mi_row; left_mi_row < end_row; + left_mi_row += mi_step) { + MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride; + mi_step = mi_size_high[left_mi[0]->sb_type]; + int match_found = 0; + if (is_inter_block(*left_mi)) + match_found = ref_match_found_in_nb_blocks(*cur_mbmi, *left_mi); + if (match_found) return 1; + } + return 0; +} + +typedef struct { + int64_t best_inter_cost; + int64_t ref_inter_cost[INTER_REFS_PER_FRAME]; +} PruneInfoFromTpl; + +#if !CONFIG_REALTIME_ONLY +// TODO(Remya): Check if get_tpl_stats_b() can be reused +static AOM_INLINE void get_block_level_tpl_stats( + AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int *valid_refs, + PruneInfoFromTpl *inter_cost_info_from_tpl) { + const GF_GROUP *const gf_group = &cpi->gf_group; + AV1_COMMON *const cm = &cpi->common; + + assert(IMPLIES(gf_group->size > 0, gf_group->index < gf_group->size)); + const int tpl_idx = gf_group->index; + TplParams *const tpl_data = 
&cpi->tpl_data; + const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + if (tpl_idx >= MAX_LAG_BUFFERS || !tpl_frame->is_valid) { + return; + } + + const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int mi_wide = mi_size_wide[bsize]; + const int mi_high = mi_size_high[bsize]; + const int tpl_stride = tpl_frame->stride; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const int mi_col_sr = + coded_to_superres_mi(mi_col, cm->superres_scale_denominator); + const int mi_col_end_sr = + coded_to_superres_mi(mi_col + mi_wide, cm->superres_scale_denominator); + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + for (int row = mi_row; row < AOMMIN(mi_row + mi_high, cm->mi_params.mi_rows); + row += step) { + for (int col = mi_col_sr; col < AOMMIN(mi_col_end_sr, mi_cols_sr); + col += step) { + const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + + // Sums up the inter cost of corresponding ref frames + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) { + inter_cost_info_from_tpl->ref_inter_cost[ref_idx] += + this_stats->pred_error[ref_idx]; + } + } + } + + // Computes the best inter cost (minimum inter_cost) + int64_t best_inter_cost = INT64_MAX; + for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ref_idx++) { + const int64_t cur_inter_cost = + inter_cost_info_from_tpl->ref_inter_cost[ref_idx]; + // For invalid ref frames, cur_inter_cost = 0 and has to be handled while + // calculating the minimum inter_cost + if (cur_inter_cost != 0 && (cur_inter_cost < best_inter_cost) && + valid_refs[ref_idx]) + best_inter_cost = cur_inter_cost; + } + inter_cost_info_from_tpl->best_inter_cost = best_inter_cost; +} +#endif + +static AOM_INLINE int prune_modes_based_on_tpl_stats( + PruneInfoFromTpl *inter_cost_info_from_tpl, const int *refs, int ref_mv_idx, + const PREDICTION_MODE this_mode, int prune_mode_level) { + const int have_newmv = have_newmv_in_inter_mode(this_mode); + if ((prune_mode_level < 3) && have_newmv) return 0; + + static const int prune_level_idx[3] = { 0, 1, 1 }; + const int prune_level = prune_level_idx[prune_mode_level - 1]; + int64_t cur_inter_cost; + + const int is_globalmv = + (this_mode == GLOBALMV) || (this_mode == GLOBAL_GLOBALMV); + const int prune_index = is_globalmv ? MAX_REF_MV_SEARCH : ref_mv_idx; + + // Thresholds used for pruning: + // Lower value indicates aggressive pruning and higher value indicates + // conservative pruning which is set based on ref_mv_idx and speed feature. + // 'prune_index' 0, 1, 2 corresponds to ref_mv indices 0, 1 and 2. 
prune_index + // 3 corresponds to GLOBALMV/GLOBAL_GLOBALMV + static const int tpl_inter_mode_prune_mul_factor[2][MAX_REF_MV_SEARCH + 1] = { + { 3, 3, 3, 2 }, { 3, 2, 2, 2 } + }; + + const int is_comp_pred = (refs[1] > INTRA_FRAME); + if (!is_comp_pred) { + cur_inter_cost = inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1]; + } else { + const int64_t inter_cost_ref0 = + inter_cost_info_from_tpl->ref_inter_cost[refs[0] - 1]; + const int64_t inter_cost_ref1 = + inter_cost_info_from_tpl->ref_inter_cost[refs[1] - 1]; + // Choose maximum inter_cost among inter_cost_ref0 and inter_cost_ref1 for + // more aggressive pruning + cur_inter_cost = AOMMAX(inter_cost_ref0, inter_cost_ref1); + } + + // Prune the mode if cur_inter_cost is greater than threshold times + // best_inter_cost + const int64_t best_inter_cost = inter_cost_info_from_tpl->best_inter_cost; + if (cur_inter_cost > + ((tpl_inter_mode_prune_mul_factor[prune_level][prune_index] * + best_inter_cost) >> + 1)) + return 1; + return 0; +} + +static int64_t handle_inter_mode( + AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x, + BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int *disable_skip, HandleInterModeArgs *args, + int64_t ref_best_rd, uint8_t *const tmp_buf, + const CompoundTypeRdBuffers *rd_buffers, int64_t *best_est_rd, + const int do_tx_search, InterModesInfo *inter_modes_info, + motion_mode_candidate *motion_mode_cand, int64_t *skip_rd, + PruneInfoFromTpl *inter_cost_info_from_tpl) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const int is_comp_pred = has_second_ref(mbmi); + const PREDICTION_MODE this_mode = mbmi->mode; + + const GF_GROUP *const gf_group = &cpi->gf_group; + const int tpl_idx = gf_group->index; + TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx]; + const int prune_modes_based_on_tpl = + cpi->sf.inter_sf.prune_inter_modes_based_on_tpl && + tpl_idx >= MAX_LAG_BUFFERS && tpl_frame->is_valid; + int i; + const int refs[2] = { mbmi->ref_frame[0], + (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) }; + int rate_mv = 0; + int64_t rd = INT64_MAX; + // do first prediction into the destination buffer. Do the next + // prediction into a temporary buffer. Then keep track of which one + // of these currently holds the best predictor, and use the other + // one for future predictions. In the end, copy from tmp_buf to + // dst if necessary. 
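+  // In sketch form, the strategy described above amounts to:
+  //   best = &orig_dst, scratch = &tmp_dst;
+  //   for each candidate: predict into *scratch; if better, swap the two;
+  //   at the end, copy from the tmp buffer into dst if best != &orig_dst.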
+ struct macroblockd_plane *p = xd->plane; + const BUFFER_SET orig_dst = { + { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, + { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + }; + const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE, + tmp_buf + 2 * MAX_SB_SQUARE }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } }; + + const int masked_compound_used = is_any_masked_compound_used(bsize) && + cm->seq_params.enable_masked_compound; + int64_t ret_val = INT64_MAX; + const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; + int64_t best_rd = INT64_MAX; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + MB_MODE_INFO best_mbmi = *mbmi; + int best_disable_skip = 0; + int best_xskip = 0; + int64_t newmv_ret_val = INT64_MAX; + inter_mode_info mode_info[MAX_REF_MV_SEARCH]; + + int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) | + (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD); + + // Do not prune the mode based on inter cost from tpl if the current ref frame + // is the winner ref in neighbouring blocks. + int ref_match_found_in_above_nb = 0; + int ref_match_found_in_left_nb = 0; + if (prune_modes_based_on_tpl) { + ref_match_found_in_above_nb = + find_ref_match_in_above_nbs(cm->mi_params.mi_cols, xd); + ref_match_found_in_left_nb = + find_ref_match_in_left_nbs(cm->mi_params.mi_rows, xd); + } + + // First, perform a simple translation search for each of the indices. If + // an index performs well, it will be fully searched here. + const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); + // Save MV results from first 2 ref_mv_idx. + int_mv save_mv[MAX_REF_MV_SEARCH - 1][2] = { { { 0 } } }; + int best_ref_mv_idx = -1; + const int idx_mask = ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, + mode_info, bsize, ref_set); + const int16_t mode_ctx = + av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); + const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx); + const int base_rate = + args->ref_frame_cost + args->single_comp_cost + ref_mv_cost; + for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) { + mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV; + mode_info[ref_mv_idx].mv.as_int = INVALID_MV; + mode_info[ref_mv_idx].rd = INT64_MAX; + + if (!mask_check_bit(idx_mask, ref_mv_idx)) { + // MV did not perform well in simple translation search. Skip it. + continue; + } + if (prune_modes_based_on_tpl && !ref_match_found_in_above_nb && + !ref_match_found_in_left_nb && (ref_best_rd != INT64_MAX)) { + if (prune_modes_based_on_tpl_stats( + inter_cost_info_from_tpl, refs, ref_mv_idx, this_mode, + cpi->sf.inter_sf.prune_inter_modes_based_on_tpl)) + continue; + } + av1_init_rd_stats(rd_stats); + + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME; + + mbmi->num_proj_ref = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->ref_mv_idx = ref_mv_idx; + + rd_stats->rate = base_rate; + const int drl_cost = + get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type); + rd_stats->rate += drl_cost; + mode_info[ref_mv_idx].drl_cost = drl_cost; + + int rs = 0; + int compmode_interinter_cost = 0; + + int_mv cur_mv[2]; + + // TODO(Cherma): Extend this speed feature to support compound mode + int skip_repeated_ref_mv = + is_comp_pred ? 
        0 : cpi->sf.inter_sf.skip_repeated_ref_mv;
+    if (!build_cur_mv(cur_mv, this_mode, cm, x, skip_repeated_ref_mv)) {
+      continue;
+    }
+
+    if (have_newmv_in_inter_mode(this_mode)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, handle_newmv_time);
+#endif
+      if (cpi->sf.inter_sf.prune_single_motion_modes_by_simple_trans &&
+          args->single_ref_first_pass == 0 && !is_comp_pred) {
+        const int ref0 = mbmi->ref_frame[0];
+        newmv_ret_val = args->single_newmv_valid[ref_mv_idx][ref0] ? 0 : 1;
+        cur_mv[0] = args->single_newmv[ref_mv_idx][ref0];
+        rate_mv = args->single_newmv_rate[ref_mv_idx][ref0];
+      } else {
+        newmv_ret_val =
+            handle_newmv(cpi, x, bsize, cur_mv, &rate_mv, args, mode_info);
+      }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, handle_newmv_time);
+#endif
+
+      if (newmv_ret_val != 0) continue;
+
+      rd_stats->rate += rate_mv;
+
+      if (cpi->sf.inter_sf.skip_repeated_newmv) {
+        if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) {
+          int skip = 0;
+          int this_rate_mv = 0;
+          for (i = 0; i < ref_mv_idx; ++i) {
+            // Check if the motion search result is the same as a previous
+            // result
+            if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int &&
+                args->single_newmv_valid[i][refs[0]]) {
+              // If the compared mode has no valid rd, it is unlikely this
+              // mode will be the best mode
+              if (mode_info[i].rd == INT64_MAX) {
+                skip = 1;
+                break;
+              }
+              // Compare the cost difference, including the drl cost and the
+              // mv cost
+              if (mode_info[i].mv.as_int != INVALID_MV) {
+                const int compare_cost =
+                    mode_info[i].rate_mv + mode_info[i].drl_cost;
+                const int_mv ref_mv = av1_get_ref_mv(x, 0);
+                this_rate_mv = av1_mv_bit_cost(
+                    &mode_info[i].mv.as_mv, &ref_mv.as_mv, x->nmv_vec_cost,
+                    x->mv_cost_stack, MV_COST_WEIGHT);
+                const int this_cost = this_rate_mv + drl_cost;
+
+                if (compare_cost <= this_cost) {
+                  skip = 1;
+                  break;
+                } else {
+                  // If the cost is less than the current best result, make
+                  // this the best and update the corresponding variables,
+                  // unless the best_mv is the same as ref_mv.
In this case we skip and + // rely on NEAR(EST)MV instead + if (best_mbmi.ref_mv_idx == i && + mode_info[i].mv.as_int != ref_mv.as_int) { + assert(best_rd != INT64_MAX); + best_mbmi.ref_mv_idx = ref_mv_idx; + motion_mode_cand->rate_mv = this_rate_mv; + best_rd_stats.rate += this_cost - compare_cost; + best_rd = RDCOST(x->rdmult, best_rd_stats.rate, + best_rd_stats.dist); + if (best_rd < ref_best_rd) ref_best_rd = best_rd; + break; + } + } + } + } + } + if (skip) { + const THR_MODES mode_enum = get_prediction_mode_idx( + best_mbmi.mode, best_mbmi.ref_frame[0], best_mbmi.ref_frame[1]); + // Collect mode stats for multiwinner mode processing + store_winner_mode_stats( + &cpi->common, x, &best_mbmi, &best_rd_stats, &best_rd_stats_y, + &best_rd_stats_uv, mode_enum, NULL, bsize, best_rd, + cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, + do_tx_search); + args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = + args->modelled_rd[this_mode][i][refs[0]]; + args->simple_rd[this_mode][ref_mv_idx][refs[0]] = + args->simple_rd[this_mode][i][refs[0]]; + mode_info[ref_mv_idx].rd = mode_info[i].rd; + mode_info[ref_mv_idx].rate_mv = this_rate_mv; + mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int; + + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + } + } + } + for (i = 0; i < is_comp_pred + 1; ++i) { + mbmi->mv[i].as_int = cur_mv[i].as_int; + } + + if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd && + mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) { + continue; + } + + if (cpi->sf.inter_sf.prune_ref_mv_idx_search && is_comp_pred) { + // TODO(yunqing): Move this part to a separate function when it is done. + // Store MV result. + if (ref_mv_idx < MAX_REF_MV_SEARCH - 1) { + for (i = 0; i < is_comp_pred + 1; ++i) + save_mv[ref_mv_idx][i].as_int = mbmi->mv[i].as_int; + } + // Skip the evaluation if an MV match is found. + if (ref_mv_idx > 0) { + int match = 0; + for (int idx = 0; idx < ref_mv_idx; ++idx) { + int mv_diff = 0; + for (i = 0; i < 1 + is_comp_pred; ++i) { + mv_diff += abs(save_mv[idx][i].as_mv.row - mbmi->mv[i].as_mv.row) + + abs(save_mv[idx][i].as_mv.col - mbmi->mv[i].as_mv.col); + } + + // If this mode is not the best one, and current MV is similar to + // previous stored MV, terminate this ref_mv_idx evaluation. 
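+          // Note: mv_diff accumulates |d_row| + |d_col| in 1/8-pel units over
+          // both MVs, so the mv_diff < 1 test below fires only when the
+          // current MVs are bit-identical to a previously stored pair, and
+          // only while no earlier index has improved ref_best_rd
+          // (best_ref_mv_idx == -1).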
+          if (best_ref_mv_idx == -1 && mv_diff < 1) {
+            match = 1;
+            break;
+          }
+        }
+        if (match == 1) continue;
+      }
+    }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, compound_type_rd_time);
+#endif
+    int skip_build_pred = 0;
+    const int mi_row = xd->mi_row;
+    const int mi_col = xd->mi_col;
+    if (is_comp_pred) {
+      // Find a matching interp filter or set to the default interp filter
+      const int need_search = av1_is_interp_needed(xd);
+      const InterpFilter assign_filter = cm->features.interp_filter;
+      int is_luma_interp_done = 0;
+      av1_find_interp_filter_match(mbmi, cpi, assign_filter, need_search,
+                                   args->interp_filter_stats,
+                                   args->interp_filter_stats_idx);
+
+      int64_t best_rd_compound;
+      int64_t rd_thresh;
+      const int comp_type_rd_shift = COMP_TYPE_RD_THRESH_SHIFT;
+      const int comp_type_rd_scale = COMP_TYPE_RD_THRESH_SCALE;
+      rd_thresh = get_rd_thresh_from_best_rd(
+          ref_best_rd, (1 << comp_type_rd_shift), comp_type_rd_scale);
+      compmode_interinter_cost = av1_compound_type_rd(
+          cpi, x, bsize, cur_mv, mode_search_mask, masked_compound_used,
+          &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound,
+          rd_stats, ref_best_rd, skip_rd[1], &is_luma_interp_done, rd_thresh);
+      if (ref_best_rd < INT64_MAX &&
+          (best_rd_compound >> comp_type_rd_shift) * comp_type_rd_scale >
+              ref_best_rd) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      }
+      // No need to call av1_enc_build_inter_predictor for luma if
+      // COMPOUND_AVERAGE is selected: it is the first candidate in
+      // av1_compound_type_rd, and the subsequent compound-type search uses
+      // the tmp_dst buffer.
+
+      if (mbmi->interinter_comp.type == COMPOUND_AVERAGE &&
+          is_luma_interp_done) {
+        if (num_planes > 1) {
+          av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+                                        bsize, AOM_PLANE_U, num_planes - 1);
+        }
+        skip_build_pred = 1;
+      }
+    }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, compound_type_rd_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, interpolation_filter_search_time);
+#endif
+    ret_val = av1_interpolation_filter_search(
+        x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
+        &skip_build_pred, args, ref_best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, interpolation_filter_search_time);
+#endif
+    if (args->modelled_rd != NULL && !is_comp_pred) {
+      args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+    }
+    if (ret_val != 0) {
+      restore_dst_buf(xd, orig_dst, num_planes);
+      continue;
+    } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout &&
+               ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+      restore_dst_buf(xd, orig_dst, num_planes);
+      continue;
+    }
+
+    if (args->modelled_rd != NULL) {
+      if (is_comp_pred) {
+        const int mode0 = compound_ref0_mode(this_mode);
+        const int mode1 = compound_ref1_mode(this_mode);
+        const int64_t mrd =
+            AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+                   args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+        if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
+          restore_dst_buf(xd, orig_dst, num_planes);
+          continue;
+        }
+      }
+    }
+    rd_stats->rate += compmode_interinter_cost;
+    if (skip_build_pred != 1) {
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize,
+                                    0, av1_num_planes(cm) - 1);
+    }
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, motion_mode_rd_time);
+#endif
+    int rate2_nocoeff = rd_stats->rate;
+    ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y,
+                             rd_stats_uv, disable_skip, args, ref_best_rd,
+                             skip_rd, &rate_mv, &orig_dst,
best_est_rd, + do_tx_search, inter_modes_info, 0); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, motion_mode_rd_time); +#endif + + mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int; + mode_info[ref_mv_idx].rate_mv = rate_mv; + if (ret_val != INT64_MAX) { + int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + mode_info[ref_mv_idx].rd = tmp_rd; + const THR_MODES mode_enum = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + // Collect mode stats for multiwinner mode processing + store_winner_mode_stats( + &cpi->common, x, mbmi, rd_stats, rd_stats_y, rd_stats_uv, mode_enum, + NULL, bsize, tmp_rd, + cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, do_tx_search); + if (tmp_rd < best_rd) { + best_rd_stats = *rd_stats; + best_rd_stats_y = *rd_stats_y; + best_rd_stats_uv = *rd_stats_uv; + best_rd = tmp_rd; + best_mbmi = *mbmi; + best_disable_skip = *disable_skip; + best_xskip = x->force_skip; + memcpy(best_blk_skip, x->blk_skip, + sizeof(best_blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, xd->tx_type_map, + xd->height * xd->width); + motion_mode_cand->rate_mv = rate_mv; + motion_mode_cand->rate2_nocoeff = rate2_nocoeff; + } + + if (tmp_rd < ref_best_rd) { + ref_best_rd = tmp_rd; + best_ref_mv_idx = ref_mv_idx; + } + } + restore_dst_buf(xd, orig_dst, num_planes); + } + + if (best_rd == INT64_MAX) return INT64_MAX; + + // re-instate status of the best choice + *rd_stats = best_rd_stats; + *rd_stats_y = best_rd_stats_y; + *rd_stats_uv = best_rd_stats_uv; + *mbmi = best_mbmi; + *disable_skip = best_disable_skip; + x->force_skip = best_xskip; + assert(IMPLIES(mbmi->comp_group_idx == 1, + mbmi->interinter_comp.type != COMPOUND_AVERAGE)); + memcpy(x->blk_skip, best_blk_skip, + sizeof(best_blk_skip[0]) * xd->height * xd->width); + av1_copy_array(xd->tx_type_map, best_tx_type_map, xd->height * xd->width); + + rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + + return rd_stats->rdcost; +} + +static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, + PICK_MODE_CONTEXT *ctx, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + if (!av1_allow_intrabc(cm) || !cpi->oxcf.enable_intrabc) return INT64_MAX; + const int num_planes = av1_num_planes(cm); + + MACROBLOCKD *const xd = &x->e_mbd; + const TileInfo *tile = &xd->tile; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int sb_row = mi_row >> cm->seq_params.mib_size_log2; + const int sb_col = mi_col >> cm->seq_params.mib_size_log2; + + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + MV_REFERENCE_FRAME ref_frame = INTRA_FRAME; + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. + av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + int_mv nearestmv, nearmv; + av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv, + 0); + + if (nearestmv.as_int == INVALID_MV) { + nearestmv.as_int = 0; + } + if (nearmv.as_int == INVALID_MV) { + nearmv.as_int = 0; + } + + int_mv dv_ref = nearestmv.as_int == 0 ? 
nearmv : nearestmv; + if (dv_ref.as_int == 0) { + av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row); + } + // Ref DV should not have sub-pel. + assert((dv_ref.as_mv.col & 7) == 0); + assert((dv_ref.as_mv.row & 7) == 0); + mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref; + + struct buf_2d yv12_mb[MAX_MB_PLANE]; + av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, NULL, NULL, num_planes); + for (int i = 0; i < num_planes; ++i) { + xd->plane[i].pre[0] = yv12_mb[i]; + } + + enum IntrabcMotionDirection { + IBC_MOTION_ABOVE, + IBC_MOTION_LEFT, + IBC_MOTION_DIRECTIONS + }; + + MB_MODE_INFO best_mbmi = *mbmi; + RD_STATS best_rdstats = *rd_stats; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; + uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + av1_copy_array(best_tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + + FULLPEL_MOTION_SEARCH_PARAMS fullms_params; + const search_site_config *lookahead_search_sites = + &cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD]; + av1_make_default_fullpel_ms_params(&fullms_params, cpi, x, bsize, + &dv_ref.as_mv, lookahead_search_sites); + fullms_params.is_intra_mode = 1; + + for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE; + dir < IBC_MOTION_DIRECTIONS; ++dir) { + switch (dir) { + case IBC_MOTION_ABOVE: + fullms_params.mv_limits.col_min = + (tile->mi_col_start - mi_col) * MI_SIZE; + fullms_params.mv_limits.col_max = + (tile->mi_col_end - mi_col) * MI_SIZE - w; + fullms_params.mv_limits.row_min = + (tile->mi_row_start - mi_row) * MI_SIZE; + fullms_params.mv_limits.row_max = + (sb_row * cm->seq_params.mib_size - mi_row) * MI_SIZE - h; + break; + case IBC_MOTION_LEFT: + fullms_params.mv_limits.col_min = + (tile->mi_col_start - mi_col) * MI_SIZE; + fullms_params.mv_limits.col_max = + (sb_col * cm->seq_params.mib_size - mi_col) * MI_SIZE - w; + // TODO(aconverse@google.com): Minimize the overlap between above and + // left areas. 
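+        // In rough terms: the ABOVE pass searches the full tile width in
+        // rows strictly above the current superblock row, while the LEFT
+        // pass searches columns left of the current superblock down to the
+        // bottom of the current SB row, so the two regions overlap in the
+        // top-left area.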
+        fullms_params.mv_limits.row_min =
+            (tile->mi_row_start - mi_row) * MI_SIZE;
+        int bottom_coded_mi_edge =
+            AOMMIN((sb_row + 1) * cm->seq_params.mib_size, tile->mi_row_end);
+        fullms_params.mv_limits.row_max =
+            (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
+        break;
+      default: assert(0);
+    }
+    // Save the tile/SB-derived limits so we can verify that
+    // av1_set_mv_search_range() only ever tightens them.
+    const FullMvLimits tmp_mv_limits = fullms_params.mv_limits;
+    (void)tmp_mv_limits;  // Quiet unused warnings when asserts compile out.
+    av1_set_mv_search_range(&fullms_params.mv_limits, &dv_ref.as_mv);
+    assert(fullms_params.mv_limits.col_min >= tmp_mv_limits.col_min);
+    assert(fullms_params.mv_limits.col_max <= tmp_mv_limits.col_max);
+    assert(fullms_params.mv_limits.row_min >= tmp_mv_limits.row_min);
+    assert(fullms_params.mv_limits.row_max <= tmp_mv_limits.row_max);
+
+    if (fullms_params.mv_limits.col_max < fullms_params.mv_limits.col_min ||
+        fullms_params.mv_limits.row_max < fullms_params.mv_limits.row_min) {
+      continue;
+    }
+
+    const int step_param = cpi->mv_search_params.mv_step_param;
+    const FULLPEL_MV start_mv = get_fullmv_from_mv(&dv_ref.as_mv);
+    IntraBCHashInfo *intrabc_hash_info = &x->intrabc_hash_info;
+    int_mv best_mv, best_hash_mv;
+
+    int bestsme = av1_full_pixel_search(start_mv, &fullms_params, step_param,
+                                        NULL, &best_mv.as_fullmv, NULL);
+    const int hashsme = av1_intrabc_hash_search(
+        cpi, xd, &fullms_params, intrabc_hash_info, &best_hash_mv.as_fullmv);
+    if (hashsme < bestsme) {
+      best_mv = best_hash_mv;
+      bestsme = hashsme;
+    }
+
+    if (bestsme == INT_MAX) continue;
+    const MV dv = get_mv_from_fullmv(&best_mv.as_fullmv);
+    if (!av1_is_fullmv_in_range(&fullms_params.mv_limits,
+                                get_fullmv_from_mv(&dv)))
+      continue;
+    if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
+                         cm->seq_params.mib_size_log2))
+      continue;
+
+    // DV should not have sub-pel.
+    assert((dv.col & 7) == 0);
+    assert((dv.row & 7) == 0);
+    memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
+    mbmi->filter_intra_mode_info.use_filter_intra = 0;
+    mbmi->use_intrabc = 1;
+    mbmi->mode = DC_PRED;
+    mbmi->uv_mode = UV_DC_PRED;
+    mbmi->motion_mode = SIMPLE_TRANSLATION;
+    mbmi->mv[0].as_mv = dv;
+    mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+    mbmi->skip = 0;
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                  av1_num_planes(cm) - 1);
+
+    const IntraBCMVCosts *const dv_costs = &cpi->dv_costs;
+    int *dvcost[2] = { (int *)&dv_costs->mv_component[0][MV_MAX],
+                       (int *)&dv_costs->mv_component[1][MV_MAX] };
+    // TODO(aconverse@google.com): The full motion field defining discount
+    // in MV_COST_WEIGHT is too large. Explore other values.
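+    // A sketch of the rate term computed below, assuming av1_mv_bit_cost()
+    // keeps its usual joint-plus-component shape:
+    //   mv_cost = joint_cost[joint(dv - dv_ref)] +
+    //             dvcost[0][diff.row] + dvcost[1][diff.col];
+    //   rate_mv = ROUND_POWER_OF_TWO(mv_cost * MV_COST_WEIGHT_SUB, 7);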
+ const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, dv_costs->joint_mv, + dvcost, MV_COST_WEIGHT_SUB); + const int rate_mode = x->intrabc_cost[1]; + RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv; + if (!av1_txfm_search(cpi, x, bsize, &rd_stats_yuv, &rd_stats_y, + &rd_stats_uv, rate_mode + rate_mv, INT64_MAX)) + continue; + rd_stats_yuv.rdcost = + RDCOST(x->rdmult, rd_stats_yuv.rate, rd_stats_yuv.dist); + if (rd_stats_yuv.rdcost < best_rd) { + best_rd = rd_stats_yuv.rdcost; + best_mbmi = *mbmi; + best_rdstats = rd_stats_yuv; + memcpy(best_blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(best_tx_type_map, xd->tx_type_map, xd->height * xd->width); + } + } + *mbmi = best_mbmi; + *rd_stats = best_rdstats; + memcpy(x->blk_skip, best_blk_skip, + sizeof(x->blk_skip[0]) * xd->height * xd->width); + av1_copy_array(xd->tx_type_map, best_tx_type_map, ctx->num_4x4_blk); +#if CONFIG_RD_DEBUG + mbmi->rd_stats = *rd_stats; +#endif + return best_rd; +} + +void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, int64_t best_rd) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int num_planes = av1_num_planes(cm); + int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; + int y_skip = 0, uv_skip = 0; + int64_t dist_y = 0, dist_uv = 0; + + ctx->rd_stats.skip = 0; + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->use_intrabc = 0; + mbmi->mv[0].as_int = 0; + mbmi->skip_mode = 0; + + const int64_t intra_yrd = + av1_rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y, + &y_skip, bsize, best_rd, ctx); + + // Initialize default mode evaluation params + set_mode_eval_params(cpi, x, DEFAULT_EVAL); + + if (intra_yrd < best_rd) { + // Only store reconstructed luma when there's chroma RDO. When there's no + // chroma RDO, the reconstructed luma will be stored in encode_superblock(). + xd->cfl.store_y = store_cfl_required_rdo(cm, x); + if (xd->cfl.store_y) { + // Restore reconstructed luma values. 
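+      // The winner's blk_skip and tx_type_map are copied back from ctx below
+      // so that the DRY_RUN_NORMAL luma encode reproduces exactly the
+      // reconstruction that the RD search scored; CfL then uses that luma
+      // for chroma RDO.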
+ memcpy(x->blk_skip, ctx->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + av1_copy_array(xd->tx_type_map, ctx->tx_type_map, ctx->num_4x4_blk); + av1_encode_intra_block_plane(cpi, x, bsize, AOM_PLANE_Y, DRY_RUN_NORMAL, + cpi->optimize_seg_arr[mbmi->segment_id]); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + xd->cfl.store_y = 0; + } + if (num_planes > 1) { + init_sbuv_mode(mbmi); + if (xd->is_chroma_ref) { + const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + av1_rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, + &dist_uv, &uv_skip, bsize, max_uv_tx_size); + } + } + + // Intra block is always coded as non-skip + rd_cost->rate = + rate_y + rate_uv + x->skip_cost[av1_get_skip_context(xd)][0]; + rd_cost->dist = dist_y + dist_uv; + rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); + rd_cost->skip = 0; + } else { + rd_cost->rate = INT_MAX; + } + + if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd) + best_rd = rd_cost->rdcost; + if (rd_pick_intrabc_mode_sb(cpi, x, ctx, rd_cost, bsize, best_rd) < best_rd) { + ctx->rd_stats.skip = mbmi->skip; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + assert(rd_cost->rate != INT_MAX); + } + if (rd_cost->rate == INT_MAX) return; + + ctx->mic = *xd->mi[0]; + av1_copy_mbmi_ext_to_mbmi_ext_frame(&ctx->mbmi_ext_best, x->mbmi_ext, + av1_ref_frame_type(xd->mi[0]->ref_frame)); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); +} + +static AOM_INLINE void calc_target_weighted_pred( + const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd, + const uint8_t *above, int above_stride, const uint8_t *left, + int left_stride); + +static AOM_INLINE void rd_pick_skip_mode( + RD_STATS *rd_cost, InterModeSearchState *search_state, + const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + const SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + + x->compound_idx = 1; // COMPOUND_AVERAGE + RD_STATS skip_mode_rd_stats; + av1_invalid_rd_stats(&skip_mode_rd_stats); + + if (skip_mode_info->ref_frame_idx_0 == INVALID_IDX || + skip_mode_info->ref_frame_idx_1 == INVALID_IDX) { + return; + } + + const MV_REFERENCE_FRAME ref_frame = + LAST_FRAME + skip_mode_info->ref_frame_idx_0; + const MV_REFERENCE_FRAME second_ref_frame = + LAST_FRAME + skip_mode_info->ref_frame_idx_1; + const PREDICTION_MODE this_mode = NEAREST_NEARESTMV; + const THR_MODES mode_index = + get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame); + + if (mode_index == THR_INVALID) { + return; + } + + if ((!cpi->oxcf.enable_onesided_comp || + cpi->sf.inter_sf.disable_onesided_comp) && + cpi->all_one_sided_refs) { + return; + } + + mbmi->mode = this_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = ref_frame; + mbmi->ref_frame[1] = second_ref_frame; + const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame); + if (x->mbmi_ext->ref_mv_count[ref_frame_type] == UINT8_MAX) { + if (x->mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX || + x->mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) { + return; + } + MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext; + av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + 
mbmi_ext->mode_context);
+    // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and
+    // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs.
+    av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_type);
+  }
+
+  assert(this_mode == NEAREST_NEARESTMV);
+  if (!build_cur_mv(mbmi->mv, this_mode, cm, x, 0)) {
+    return;
+  }
+
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+  mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+  mbmi->comp_group_idx = 0;
+  mbmi->compound_idx = x->compound_idx;
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->ref_mv_idx = 0;
+  mbmi->skip_mode = mbmi->skip = 1;
+
+  set_default_interp_filters(mbmi, cm->features.interp_filter);
+
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  for (int i = 0; i < num_planes; i++) {
+    xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+    xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+  }
+
+  BUFFER_SET orig_dst;
+  for (int i = 0; i < num_planes; i++) {
+    orig_dst.plane[i] = xd->plane[i].dst.buf;
+    orig_dst.stride[i] = xd->plane[i].dst.stride;
+  }
+
+  // Obtain the rdcost for skip_mode.
+  skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, &orig_dst);
+
+  // Compare the use of skip_mode with the best intra/inter mode obtained.
+  const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+  int64_t best_intra_inter_mode_cost = INT64_MAX;
+  if (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX) {
+    best_intra_inter_mode_cost =
+        RDCOST(x->rdmult, rd_cost->rate + x->skip_mode_cost[skip_mode_ctx][0],
+               rd_cost->dist);
+    // Account for non-skip mode rate in total rd stats.
+    rd_cost->rate += x->skip_mode_cost[skip_mode_ctx][0];
+    av1_rd_cost_update(x->rdmult, rd_cost);
+  }
+
+  if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost &&
+      (!xd->lossless[mbmi->segment_id] || skip_mode_rd_stats.dist == 0)) {
+    assert(mode_index != THR_INVALID);
+    search_state->best_mbmode = *mbmi;
+
+    search_state->best_mbmode.skip_mode = search_state->best_mbmode.skip = 1;
+    search_state->best_mbmode.mode = NEAREST_NEARESTMV;
+    search_state->best_mbmode.ref_frame[0] = mbmi->ref_frame[0];
+    search_state->best_mbmode.ref_frame[1] = mbmi->ref_frame[1];
+    search_state->best_mbmode.mv[0].as_int = mbmi->mv[0].as_int;
+    search_state->best_mbmode.mv[1].as_int = mbmi->mv[1].as_int;
+    search_state->best_mbmode.ref_mv_idx = 0;
+
+    // Set up tx_size related variables for skip-specific loop filtering.
+    search_state->best_mbmode.tx_size =
+        block_signals_txsize(bsize)
+            ? tx_size_from_tx_mode(bsize, x->tx_mode_search_type)
+            : max_txsize_rect_lookup[bsize];
+    memset(search_state->best_mbmode.inter_tx_size,
+           search_state->best_mbmode.tx_size,
+           sizeof(search_state->best_mbmode.inter_tx_size));
+    set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->width, xd->height,
+                  search_state->best_mbmode.skip && is_inter_block(mbmi), xd);
+
+    // Set up color-related variables for skip mode.
+ search_state->best_mbmode.uv_mode = UV_DC_PRED; + search_state->best_mbmode.palette_mode_info.palette_size[0] = 0; + search_state->best_mbmode.palette_mode_info.palette_size[1] = 0; + + search_state->best_mbmode.comp_group_idx = 0; + search_state->best_mbmode.compound_idx = x->compound_idx; + search_state->best_mbmode.interinter_comp.type = COMPOUND_AVERAGE; + search_state->best_mbmode.motion_mode = SIMPLE_TRANSLATION; + + search_state->best_mbmode.interintra_mode = + (INTERINTRA_MODE)(II_DC_PRED - 1); + search_state->best_mbmode.filter_intra_mode_info.use_filter_intra = 0; + + set_default_interp_filters(&search_state->best_mbmode, + cm->features.interp_filter); + + search_state->best_mode_index = mode_index; + + // Update rd_cost + rd_cost->rate = skip_mode_rd_stats.rate; + rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist; + rd_cost->rdcost = skip_mode_rd_stats.rdcost; + + search_state->best_rd = rd_cost->rdcost; + search_state->best_skip2 = 1; + search_state->best_mode_skippable = 1; + + x->force_skip = 1; + } +} + +// Get winner mode stats of given mode index +static AOM_INLINE MB_MODE_INFO *get_winner_mode_stats( + MACROBLOCK *x, MB_MODE_INFO *best_mbmode, RD_STATS *best_rd_cost, + int best_rate_y, int best_rate_uv, THR_MODES *best_mode_index, + RD_STATS **winner_rd_cost, int *winner_rate_y, int *winner_rate_uv, + THR_MODES *winner_mode_index, int enable_multiwinner_mode_process, + int mode_idx) { + MB_MODE_INFO *winner_mbmi; + if (enable_multiwinner_mode_process) { + assert(mode_idx >= 0 && mode_idx < x->winner_mode_count); + WinnerModeStats *winner_mode_stat = &x->winner_mode_stats[mode_idx]; + winner_mbmi = &winner_mode_stat->mbmi; + + *winner_rd_cost = &winner_mode_stat->rd_cost; + *winner_rate_y = winner_mode_stat->rate_y; + *winner_rate_uv = winner_mode_stat->rate_uv; + *winner_mode_index = winner_mode_stat->mode_index; + } else { + winner_mbmi = best_mbmode; + *winner_rd_cost = best_rd_cost; + *winner_rate_y = best_rate_y; + *winner_rate_uv = best_rate_uv; + *winner_mode_index = *best_mode_index; + } + return winner_mbmi; +} + +// speed feature: fast intra/inter transform type search +// Used for speed >= 2 +// When this speed feature is on, in rd mode search, only DCT is used. +// After the mode is determined, this function is called, to select +// transform types and get accurate rdcost. 
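+// With multi-winner mode processing enabled, up to winner_mode_count stored
+// candidates are re-evaluated below; otherwise only the single best mode is
+// refined (see get_winner_mode_stats()).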
+static AOM_INLINE void refine_winner_mode_tx( + const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, THR_MODES *best_mode_index, + MB_MODE_INFO *best_mbmode, struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], + int best_rate_y, int best_rate_uv, int *best_skip2, int winner_mode_count) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int64_t best_rd; + const int num_planes = av1_num_planes(cm); + + if (!is_winner_mode_processing_enabled(cpi, best_mbmode, best_mbmode->mode)) + return; + + // Set params for winner mode evaluation + set_mode_eval_params(cpi, x, WINNER_MODE_EVAL); + + // No best mode identified so far + if (*best_mode_index == THR_INVALID) return; + + best_rd = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist); + for (int mode_idx = 0; mode_idx < winner_mode_count; mode_idx++) { + RD_STATS *winner_rd_stats = NULL; + int winner_rate_y = 0, winner_rate_uv = 0; + THR_MODES winner_mode_index = 0; + + // TODO(any): Combine best mode and multi-winner mode processing paths + // Get winner mode stats for current mode index + MB_MODE_INFO *winner_mbmi = get_winner_mode_stats( + x, best_mbmode, rd_cost, best_rate_y, best_rate_uv, best_mode_index, + &winner_rd_stats, &winner_rate_y, &winner_rate_uv, &winner_mode_index, + cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, mode_idx); + + if (xd->lossless[winner_mbmi->segment_id] == 0 && + winner_mode_index != THR_INVALID && + is_winner_mode_processing_enabled(cpi, winner_mbmi, + winner_mbmi->mode)) { + RD_STATS rd_stats = *winner_rd_stats; + int skip_blk = 0; + RD_STATS rd_stats_y, rd_stats_uv; + const int skip_ctx = av1_get_skip_context(xd); + + *mbmi = *winner_mbmi; + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. 
+ for (int i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (has_second_ref(mbmi)) + xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + if (is_inter_mode(mbmi->mode)) { + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + if (mbmi->motion_mode == OBMC_CAUSAL) + av1_build_obmc_inter_predictors_sb(cm, xd); + + av1_subtract_plane(x, bsize, 0); + if (x->tx_mode_search_type == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id]) { + av1_pick_recursive_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + assert(rd_stats_y.rate != INT_MAX); + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + memset(mbmi->inter_tx_size, mbmi->tx_size, + sizeof(mbmi->inter_tx_size)); + for (int i = 0; i < xd->height * xd->width; ++i) + set_blk_skip(x, 0, i, rd_stats_y.skip); + } + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, + INT64_MAX); + } + + if (num_planes > 1) { + av1_txfm_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + } else { + av1_init_rd_stats(&rd_stats_uv); + } + + if (is_inter_mode(mbmi->mode) && + RDCOST(x->rdmult, + x->skip_cost[skip_ctx][0] + rd_stats_y.rate + rd_stats_uv.rate, + (rd_stats_y.dist + rd_stats_uv.dist)) > + RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], + (rd_stats_y.sse + rd_stats_uv.sse))) { + skip_blk = 1; + rd_stats_y.rate = x->skip_cost[skip_ctx][1]; + rd_stats_uv.rate = 0; + rd_stats_y.dist = rd_stats_y.sse; + rd_stats_uv.dist = rd_stats_uv.sse; + } else { + skip_blk = 0; + rd_stats_y.rate += x->skip_cost[skip_ctx][0]; + } + int this_rate = rd_stats.rate + rd_stats_y.rate + rd_stats_uv.rate - + winner_rate_y - winner_rate_uv; + int64_t this_rd = + RDCOST(x->rdmult, this_rate, (rd_stats_y.dist + rd_stats_uv.dist)); + if (best_rd > this_rd) { + *best_mbmode = *mbmi; + *best_mode_index = winner_mode_index; + av1_copy_array(ctx->blk_skip, x->blk_skip, ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + rd_cost->rate = this_rate; + rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist; + rd_cost->sse = rd_stats_y.sse + rd_stats_uv.sse; + rd_cost->rdcost = this_rd; + best_rd = this_rd; + *best_skip2 = skip_blk; + } + } + } +} + +typedef struct { + // Mask for each reference frame, specifying which prediction modes to NOT try + // during search. + uint32_t pred_modes[REF_FRAMES]; + // If ref_combo[i][j + 1] is true, do NOT try prediction using combination of + // reference frames (i, j). + // Note: indexing with 'j + 1' is due to the fact that 2nd reference can be -1 + // (NONE_FRAME). + bool ref_combo[REF_FRAMES][REF_FRAMES + 1]; +} mode_skip_mask_t; + +// Update 'ref_combo' mask to disable given 'ref' in single and compound modes. +static AOM_INLINE void disable_reference( + MV_REFERENCE_FRAME ref, bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { + for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { + ref_combo[ref][ref2 + 1] = true; + } +} + +// Update 'ref_combo' mask to disable all inter references except ALTREF. 
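+// (ALTREF_FRAME itself and INTRA_FRAME are the only references left enabled.)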
+static AOM_INLINE void disable_inter_references_except_altref( + bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { + disable_reference(LAST_FRAME, ref_combo); + disable_reference(LAST2_FRAME, ref_combo); + disable_reference(LAST3_FRAME, ref_combo); + disable_reference(GOLDEN_FRAME, ref_combo); + disable_reference(BWDREF_FRAME, ref_combo); + disable_reference(ALTREF2_FRAME, ref_combo); +} + +static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = { + { LAST_FRAME, NONE_FRAME }, { ALTREF_FRAME, NONE_FRAME }, + { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, NONE_FRAME }, + { INTRA_FRAME, NONE_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }, + { LAST_FRAME, GOLDEN_FRAME }, { LAST_FRAME, INTRA_FRAME }, + { LAST_FRAME, BWDREF_FRAME }, { LAST_FRAME, LAST3_FRAME }, + { GOLDEN_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, INTRA_FRAME }, + { BWDREF_FRAME, NONE_FRAME }, { BWDREF_FRAME, ALTREF_FRAME }, + { ALTREF_FRAME, INTRA_FRAME }, { BWDREF_FRAME, INTRA_FRAME }, +}; + +static const MV_REFERENCE_FRAME real_time_ref_combos[][2] = { + { LAST_FRAME, NONE_FRAME }, + { ALTREF_FRAME, NONE_FRAME }, + { GOLDEN_FRAME, NONE_FRAME }, + { INTRA_FRAME, NONE_FRAME } +}; + +typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET; + +static AOM_INLINE void default_skip_mask(mode_skip_mask_t *mask, + REF_SET ref_set) { + if (ref_set == REF_SET_FULL) { + // Everything available by default. + memset(mask, 0, sizeof(*mask)); + } else { + // All modes available by default. + memset(mask->pred_modes, 0, sizeof(mask->pred_modes)); + // All references disabled first. + for (MV_REFERENCE_FRAME ref1 = INTRA_FRAME; ref1 < REF_FRAMES; ++ref1) { + for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { + mask->ref_combo[ref1][ref2 + 1] = true; + } + } + const MV_REFERENCE_FRAME(*ref_set_combos)[2]; + int num_ref_combos; + + // Then enable reduced set of references explicitly. + switch (ref_set) { + case REF_SET_REDUCED: + ref_set_combos = reduced_ref_combos; + num_ref_combos = + (int)sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]); + break; + case REF_SET_REALTIME: + ref_set_combos = real_time_ref_combos; + num_ref_combos = + (int)sizeof(real_time_ref_combos) / sizeof(real_time_ref_combos[0]); + break; + default: assert(0); num_ref_combos = 0; + } + + for (int i = 0; i < num_ref_combos; ++i) { + const MV_REFERENCE_FRAME *const this_combo = ref_set_combos[i]; + mask->ref_combo[this_combo[0]][this_combo[1] + 1] = false; + } + } +} + +static AOM_INLINE void init_mode_skip_mask(mode_skip_mask_t *mask, + const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + unsigned char segment_id = mbmi->segment_id; + const SPEED_FEATURES *const sf = &cpi->sf; + REF_SET ref_set = REF_SET_FULL; + + if (sf->rt_sf.use_real_time_ref_set) + ref_set = REF_SET_REALTIME; + else if (cpi->oxcf.enable_reduced_reference_set) + ref_set = REF_SET_REDUCED; + + default_skip_mask(mask, ref_set); + + int min_pred_mv_sad = INT_MAX; + MV_REFERENCE_FRAME ref_frame; + if (ref_set == REF_SET_REALTIME) { + // For real-time encoding, we only look at a subset of ref frames. So the + // threshold for pruning should be computed from this subset as well. 
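+    // With real_time_ref_combos as defined above, only LAST_FRAME,
+    // GOLDEN_FRAME and ALTREF_FRAME contribute to min_pred_mv_sad; the
+    // INTRA_FRAME entry is skipped by the check below.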
+ const int num_rt_refs = + sizeof(real_time_ref_combos) / sizeof(*real_time_ref_combos); + for (int r_idx = 0; r_idx < num_rt_refs; r_idx++) { + const MV_REFERENCE_FRAME ref = real_time_ref_combos[r_idx][0]; + if (ref != INTRA_FRAME) { + min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref]); + } + } + } else { + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) + min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]); + } + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) { + // Skip checking missing reference in both single and compound reference + // modes. + disable_reference(ref_frame, mask->ref_combo); + } else { + // Skip fixed mv modes for poor references + if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) { + mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO; + } + } + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + // Reference not used for the segment. + disable_reference(ref_frame, mask->ref_combo); + } + } + // Note: We use the following drop-out only if the SEG_LVL_REF_FRAME feature + // is disabled for this segment. This is to prevent the possibility that we + // end up unable to pick any mode. + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. + if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { + disable_inter_references_except_altref(mask->ref_combo); + + mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO; + const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME }; + int_mv near_mv, nearest_mv, global_mv; + get_this_mv(&nearest_mv, NEARESTMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext); + get_this_mv(&near_mv, NEARMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext); + get_this_mv(&global_mv, GLOBALMV, 0, 0, 0, tmp_ref_frames, x->mbmi_ext); + + if (near_mv.as_int != global_mv.as_int) + mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV); + if (nearest_mv.as_int != global_mv.as_int) + mask->pred_modes[ALTREF_FRAME] |= (1 << NEARESTMV); + } + } + + if (cpi->rc.is_src_frame_alt_ref) { + if (sf->inter_sf.alt_ref_search_fp) { + assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]); + mask->pred_modes[ALTREF_FRAME] = 0; + disable_inter_references_except_altref(mask->ref_combo); + disable_reference(INTRA_FRAME, mask->ref_combo); + } + } + + if (sf->inter_sf.alt_ref_search_fp) { + if (!cm->show_frame && x->best_pred_mv_sad < INT_MAX) { + int sad_thresh = x->best_pred_mv_sad + (x->best_pred_mv_sad >> 3); + // Conservatively skip the modes w.r.t. 
BWDREF, ALTREF2 and ALTREF, if + // those are past frames + for (ref_frame = BWDREF_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_relative_dist[ref_frame - LAST_FRAME] < 0) + if (x->pred_mv_sad[ref_frame] > sad_thresh) + mask->pred_modes[ref_frame] |= INTER_ALL; + } + } + } + + if (sf->inter_sf.adaptive_mode_search) { + if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref && + cpi->rc.frames_since_golden >= 3) + if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME]) + mask->pred_modes[GOLDEN_FRAME] |= INTER_ALL; + } + + if (bsize > sf->part_sf.max_intra_bsize) { + disable_reference(INTRA_FRAME, mask->ref_combo); + } + + mask->pred_modes[INTRA_FRAME] |= + ~(sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]]); +} + +static AOM_INLINE void init_pred_buf(const MACROBLOCK *const x, + HandleInterModeArgs *const args) { + const MACROBLOCKD *const xd = &x->e_mbd; + if (is_cur_buf_hbd(xd)) { + const int len = sizeof(uint16_t); + args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf); + args->above_pred_buf[1] = + CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len); + args->above_pred_buf[2] = + CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len); + args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf); + args->left_pred_buf[1] = + CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len); + args->left_pred_buf[2] = + CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len); + } else { + args->above_pred_buf[0] = x->above_pred_buf; + args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1); + args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE; + args->left_pred_buf[0] = x->left_pred_buf; + args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1); + args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE; + } +} + +// Please add/modify parameter setting in this function, making it consistent +// and easy to read and maintain. 
+static AOM_INLINE void set_params_rd_pick_inter_mode( + const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, + BLOCK_SIZE bsize, mode_skip_mask_t *mode_skip_mask, int skip_ref_frame_mask, + unsigned int *ref_costs_single, unsigned int (*ref_costs_comp)[REF_FRAMES], + struct buf_2d (*yv12_mb)[MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + unsigned char segment_id = mbmi->segment_id; + + init_pred_buf(x, args); + av1_collect_neighbors_ref_counts(xd); + estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single, + ref_costs_comp); + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + MV_REFERENCE_FRAME ref_frame; + x->best_pred_mv_sad = INT_MAX; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + x->pred_mv_sad[ref_frame] = INT_MAX; + x->mbmi_ext->mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + if (mbmi->partition != PARTITION_NONE && + mbmi->partition != PARTITION_SPLIT) { + if (skip_ref_frame_mask & (1 << ref_frame)) { + int skip = 1; + for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { + if (!(skip_ref_frame_mask & (1 << r))) { + const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; + if (rf[0] == ref_frame || rf[1] == ref_frame) { + skip = 0; + break; + } + } + } + if (skip) continue; + } + } + assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL); + setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, yv12_mb); + } + // Store the best pred_mv_sad across all past frames + if (cpi->sf.inter_sf.alt_ref_search_fp && + cpi->ref_relative_dist[ref_frame - LAST_FRAME] < 0) + x->best_pred_mv_sad = + AOMMIN(x->best_pred_mv_sad, x->pred_mv_sad[ref_frame]); + } + // ref_frame = ALTREF_FRAME + if (!cpi->sf.rt_sf.use_real_time_ref_set) { + // No second reference on RT ref set, so no need to initialize + for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) { + x->mbmi_ext->mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; + const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES]; + if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) && + (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) { + continue; + } + + if (mbmi->partition != PARTITION_NONE && + mbmi->partition != PARTITION_SPLIT) { + if (skip_ref_frame_mask & (1 << ref_frame)) { + continue; + } + } + av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, + xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs, + mbmi_ext->mode_context); + // TODO(Ravi): Populate mbmi_ext->ref_mv_stack[ref_frame][4] and + // mbmi_ext->weight[ref_frame][4] inside av1_find_mv_refs. 
+ av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame); + } + } + + av1_count_overlappable_neighbors(cm, xd); + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] < + cpi->sf.inter_sf.prune_obmc_prob_thresh; + if (cpi->oxcf.enable_obmc && !cpi->sf.inter_sf.disable_obmc && !prune_obmc) { + if (check_num_overlappable_neighbors(mbmi) && + is_motion_variation_allowed_bsize(bsize)) { + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + av1_build_prediction_by_above_preds(cm, xd, args->above_pred_buf, + dst_width1, dst_height1, + args->above_pred_stride); + av1_build_prediction_by_left_preds(cm, xd, args->left_pred_buf, + dst_width2, dst_height2, + args->left_pred_stride); + const int num_planes = av1_num_planes(cm); + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, + mi_col, 0, num_planes); + calc_target_weighted_pred( + cm, x, xd, args->above_pred_buf[0], args->above_pred_stride[0], + args->left_pred_buf[0], args->left_pred_stride[0]); + } + } + + init_mode_skip_mask(mode_skip_mask, cpi, x, bsize); + + // Set params for mode evaluation + set_mode_eval_params(cpi, x, MODE_EVAL); + + x->comp_rd_stats_idx = 0; +} + +static AOM_INLINE void init_intra_mode_search_state( + IntraModeSearchState *intra_search_state) { + intra_search_state->skip_intra_modes = 0; + intra_search_state->best_intra_mode = DC_PRED; + intra_search_state->angle_stats_ready = 0; + av1_zero(intra_search_state->directional_mode_skip_mask); + intra_search_state->rate_uv_intra = INT_MAX; + av1_zero(intra_search_state->pmi_uv); + for (int i = 0; i < REFERENCE_MODES; ++i) + intra_search_state->best_pred_rd[i] = INT64_MAX; +} + +static AOM_INLINE void init_inter_mode_search_state( + InterModeSearchState *search_state, const AV1_COMP *cpi, + const MACROBLOCK *x, BLOCK_SIZE bsize, int64_t best_rd_so_far) { + init_intra_mode_search_state(&search_state->intra_search_state); + + search_state->best_rd = best_rd_so_far; + search_state->best_skip_rd[0] = INT64_MAX; + search_state->best_skip_rd[1] = INT64_MAX; + + av1_zero(search_state->best_mbmode); + + search_state->best_rate_y = INT_MAX; + + search_state->best_rate_uv = INT_MAX; + + search_state->best_mode_skippable = 0; + + search_state->best_skip2 = 0; + + search_state->best_mode_index = THR_INVALID; + + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const unsigned char segment_id = mbmi->segment_id; + + search_state->num_available_refs = 0; + memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs)); + memset(search_state->dist_order_refs, -1, + sizeof(search_state->dist_order_refs)); + + for (int i = 0; i <= LAST_NEW_MV_INDEX; ++i) + search_state->mode_threshold[i] = 0; + const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize]; + for (int i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i) + search_state->mode_threshold[i] = + ((int64_t)rd_threshes[i] * x->thresh_freq_fact[bsize][i]) >> + RD_THRESH_FAC_FRAC_BITS; + + search_state->best_intra_rd = INT64_MAX; + + search_state->best_pred_sse = UINT_MAX; + + av1_zero(search_state->single_newmv); + av1_zero(search_state->single_newmv_rate); + 
av1_zero(search_state->single_newmv_valid); + for (int i = 0; i < MB_MODE_COUNT; ++i) { + for (int j = 0; j < MAX_REF_MV_SEARCH; ++j) { + for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { + search_state->modelled_rd[i][j][ref_frame] = INT64_MAX; + search_state->simple_rd[i][j][ref_frame] = INT64_MAX; + } + } + } + + for (int dir = 0; dir < 2; ++dir) { + for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { + SingleInterModeState *state; + + state = &search_state->single_state[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + + state = &search_state->single_state_modelled[dir][mode][ref_frame]; + state->ref_frame = NONE_FRAME; + state->rd = INT64_MAX; + } + } + } + for (int dir = 0; dir < 2; ++dir) { + for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) { + for (int ref_frame = 0; ref_frame < FWD_REFS; ++ref_frame) { + search_state->single_rd_order[dir][mode][ref_frame] = NONE_FRAME; + } + } + } + av1_zero(search_state->single_state_cnt); + av1_zero(search_state->single_state_modelled_cnt); +} + +static bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask, + const MV_REFERENCE_FRAME *ref_frame, + const PREDICTION_MODE this_mode) { + if (mode_skip_mask->pred_modes[ref_frame[0]] & (1 << this_mode)) { + return true; + } + + return mode_skip_mask->ref_combo[ref_frame[0]][ref_frame[1] + 1]; +} + +static int inter_mode_compatible_skip(const AV1_COMP *cpi, const MACROBLOCK *x, + BLOCK_SIZE bsize, + PREDICTION_MODE curr_mode, + const MV_REFERENCE_FRAME *ref_frames) { + const int comp_pred = ref_frames[1] > INTRA_FRAME; + if (comp_pred) { + if (!is_comp_ref_allowed(bsize)) return 1; + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frames[1]])) { + return 1; + } + + const AV1_COMMON *const cm = &cpi->common; + if (frame_is_intra_only(cm)) return 1; + + const CurrentFrame *const current_frame = &cm->current_frame; + if (current_frame->reference_mode == SINGLE_REFERENCE) return 1; + + const struct segmentation *const seg = &cm->seg; + const unsigned char segment_id = x->e_mbd.mi[0]->segment_id; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. 
+    if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
+  }
+
+  if (ref_frames[0] > INTRA_FRAME && ref_frames[1] == INTRA_FRAME) {
+    // Mode must be compatible with interintra prediction.
+    if (!is_interintra_allowed_bsize(bsize)) return 1;
+    if (!is_interintra_allowed_mode(curr_mode)) return 1;
+  }
+
+  return 0;
+}
+
+static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x,
+                                        BLOCK_SIZE bsize, int mib_size) {
+  const int sb_size_mask = mib_size - 1;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  const int mi_row_in_sb = mi_row & sb_size_mask;
+  const int mi_col_in_sb = mi_col & sb_size_mask;
+  const int mi_w = mi_size_wide[bsize];
+  const int mi_h = mi_size_high[bsize];
+  int picked_ref_frames_mask = 0;
+  for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_h; ++i) {
+    for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_w; ++j) {
+      picked_ref_frames_mask |= x->picked_ref_frames_mask[i * 32 + j];
+    }
+  }
+  return picked_ref_frames_mask;
+}
+
+// Case 1: return 0, meaning do not skip this mode.
+// Case 2: return 1, meaning skip this mode completely.
+// Case 3: return 2, meaning skip compound only, but still try single motion
+// modes.
+static int inter_mode_search_order_independent_skip(
+    const AV1_COMP *cpi, const MACROBLOCK *x, mode_skip_mask_t *mode_skip_mask,
+    InterModeSearchState *search_state, int skip_ref_frame_mask,
+    PREDICTION_MODE mode, const MV_REFERENCE_FRAME *ref_frame) {
+  if (mask_says_skip(mode_skip_mask, ref_frame, mode)) {
+    return 1;
+  }
+
+  const int ref_type = av1_ref_frame_type(ref_frame);
+  if ((cpi->prune_ref_frame_mask >> ref_type) & 1) return 1;
+
+  // This is only used in motion vector unit test.
+  if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME)
+    return 1;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  if (skip_repeated_mv(cm, x, mode, ref_frame, search_state)) {
+    return 1;
+  }
+
+  const int comp_pred = ref_frame[1] > INTRA_FRAME;
+  if ((!cpi->oxcf.enable_onesided_comp ||
+       cpi->sf.inter_sf.disable_onesided_comp) &&
+      comp_pred && cpi->all_one_sided_refs) {
+    return 1;
+  }
+
+  const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  // If no valid mode has been found so far in PARTITION_NONE when finding a
+  // valid partition is required, do not skip mode.
+  if (search_state->best_rd == INT64_MAX &&
+      mbmi->partition == PARTITION_NONE && x->must_find_valid_partition)
+    return 0;
+
+  int skip_motion_mode = 0;
+  if (mbmi->partition != PARTITION_NONE &&
+      mbmi->partition != PARTITION_SPLIT) {
+    int skip_ref = skip_ref_frame_mask & (1 << ref_type);
+    if (ref_type <= ALTREF_FRAME && skip_ref) {
+      // Since the compound ref modes depend on the motion estimation results
+      // of two single ref modes (the best MV of a single ref mode is used as
+      // the start point), if the current single ref mode is marked skip, we
+      // need to check whether it will be used in compound ref modes.
+      for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+        if (skip_ref_frame_mask & (1 << r)) continue;
+        const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+        if (rf[0] == ref_type || rf[1] == ref_type) {
+          // Found a compound ref mode that is not skipped and contains the
+          // current single ref, so this single ref cannot be skipped
+          // completely. Just skip its motion mode search and still try its
+          // simple translation mode.
+ skip_motion_mode = 1; + skip_ref = 0; + break; + } + } + } + if (skip_ref) return 1; + } + + const SPEED_FEATURES *const sf = &cpi->sf; + if (ref_frame[0] == INTRA_FRAME) { + if (mode != DC_PRED) { + // Disable intra modes other than DC_PRED for blocks with low variance + // Threshold for intra skipping based on source variance + // TODO(debargha): Specialize the threshold for super block sizes + const unsigned int skip_intra_var_thresh = 64; + if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && + x->source_variance < skip_intra_var_thresh) + return 1; + } + } + + if (prune_ref_by_selective_ref_frame(cpi, x, ref_frame, + cm->cur_frame->ref_display_order_hint)) + return 1; + + if (skip_motion_mode) return 2; + + return 0; +} + +static INLINE void init_mbmi(MB_MODE_INFO *mbmi, PREDICTION_MODE curr_mode, + const MV_REFERENCE_FRAME *ref_frames, + const AV1_COMMON *cm) { + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + mbmi->ref_mv_idx = 0; + mbmi->mode = curr_mode; + mbmi->uv_mode = UV_DC_PRED; + mbmi->ref_frame[0] = ref_frames[0]; + mbmi->ref_frame[1] = ref_frames[1]; + pmi->palette_size[0] = 0; + pmi->palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1); + set_default_interp_filters(mbmi, cm->features.interp_filter); +} + +static AOM_INLINE void collect_single_states(MACROBLOCK *x, + InterModeSearchState *search_state, + const MB_MODE_INFO *const mbmi) { + int i, j; + const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0]; + const PREDICTION_MODE this_mode = mbmi->mode; + const int dir = ref_frame <= GOLDEN_FRAME ? 0 : 1; + const int mode_offset = INTER_OFFSET(this_mode); + const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode); + + // Simple rd + int64_t simple_rd = search_state->simple_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + const int64_t rd = + search_state->simple_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < simple_rd) simple_rd = rd; + } + + // Insertion sort of single_state + const SingleInterModeState this_state_s = { simple_rd, ref_frame, 1 }; + SingleInterModeState *state_s = search_state->single_state[dir][mode_offset]; + i = search_state->single_state_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_s[j - 1].rd > this_state_s.rd; --j) + state_s[j] = state_s[j - 1]; + state_s[j] = this_state_s; + search_state->single_state_cnt[dir][mode_offset]++; + + // Modelled rd + int64_t modelled_rd = search_state->modelled_rd[this_mode][0][ref_frame]; + for (int ref_mv_idx = 1; ref_mv_idx < ref_set; ++ref_mv_idx) { + const int64_t rd = + search_state->modelled_rd[this_mode][ref_mv_idx][ref_frame]; + if (rd < modelled_rd) modelled_rd = rd; + } + + // Insertion sort of single_state_modelled + const SingleInterModeState this_state_m = { modelled_rd, ref_frame, 1 }; + SingleInterModeState *state_m = + search_state->single_state_modelled[dir][mode_offset]; + i = search_state->single_state_modelled_cnt[dir][mode_offset]; + for (j = i; j > 0 && state_m[j - 1].rd > this_state_m.rd; --j) + state_m[j] = state_m[j - 1]; + state_m[j] = this_state_m; + search_state->single_state_modelled_cnt[dir][mode_offset]++; +} + +static AOM_INLINE void analyze_single_states( + const AV1_COMP *cpi, InterModeSearchState *search_state) { + const int prune_level = cpi->sf.inter_sf.prune_comp_search_by_single_result; + assert(prune_level >= 1); + 
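+  // A sketch of the pruning arithmetic used below: a state is invalidated
+  // when (rd >> 3) * prune_factor > best_rd, i.e. roughly when
+  // rd > best_rd * 8 / prune_factor. prune_level 1 uses factor 5 (~1.6x the
+  // best rd); prune_level >= 2 uses factor 6 (~1.33x).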
+  int i, j, dir, mode;
+
+  for (dir = 0; dir < 2; ++dir) {
+    int64_t best_rd;
+    SingleInterModeState(*state)[FWD_REFS];
+    const int prune_factor = prune_level >= 2 ? 6 : 5;
+
+    // Use the best rd of GLOBALMV or NEWMV to prune the unlikely
+    // reference frames for all the modes (NEARESTMV and NEARMV may not
+    // have the same motion vectors). Always keep the best of each mode
+    // because it might form the best possible combination with another mode.
+    state = search_state->single_state[dir];
+    best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+                     state[INTER_OFFSET(GLOBALMV)][0].rd);
+    for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      for (i = 1; i < search_state->single_state_cnt[dir][mode]; ++i) {
+        if (state[mode][i].rd != INT64_MAX &&
+            (state[mode][i].rd >> 3) * prune_factor > best_rd) {
+          state[mode][i].valid = 0;
+        }
+      }
+    }
+
+    state = search_state->single_state_modelled[dir];
+    best_rd = AOMMIN(state[INTER_OFFSET(NEWMV)][0].rd,
+                     state[INTER_OFFSET(GLOBALMV)][0].rd);
+    for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      for (i = 1; i < search_state->single_state_modelled_cnt[dir][mode];
+           ++i) {
+        if (state[mode][i].rd != INT64_MAX &&
+            (state[mode][i].rd >> 3) * prune_factor > best_rd) {
+          state[mode][i].valid = 0;
+        }
+      }
+    }
+  }
+
+  // Order by simple rd first, then by modelled rd.
+  for (dir = 0; dir < 2; ++dir) {
+    for (mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      const int state_cnt_s = search_state->single_state_cnt[dir][mode];
+      const int state_cnt_m =
+          search_state->single_state_modelled_cnt[dir][mode];
+      SingleInterModeState *state_s = search_state->single_state[dir][mode];
+      SingleInterModeState *state_m =
+          search_state->single_state_modelled[dir][mode];
+      int count = 0;
+      const int max_candidates = AOMMAX(state_cnt_s, state_cnt_m);
+      for (i = 0; i < state_cnt_s; ++i) {
+        if (state_s[i].rd == INT64_MAX) break;
+        if (state_s[i].valid) {
+          search_state->single_rd_order[dir][mode][count++] =
+              state_s[i].ref_frame;
+        }
+      }
+      if (count >= max_candidates) continue;
+
+      for (i = 0; i < state_cnt_m && count < max_candidates; ++i) {
+        if (state_m[i].rd == INT64_MAX) break;
+        if (!state_m[i].valid) continue;
+        const int ref_frame = state_m[i].ref_frame;
+        int match = 0;
+        // Check whether this ref_frame is already in the list.
+        for (j = 0; j < count; ++j) {
+          if (search_state->single_rd_order[dir][mode][j] == ref_frame) {
+            match = 1;
+            break;
+          }
+        }
+        if (match) continue;
+        // Check whether this ref_frame was invalidated in the simple rd pass.
+        int valid = 1;
+        for (j = 0; j < state_cnt_s; ++j) {
+          if (ref_frame == state_s[j].ref_frame) {
+            valid = state_s[j].valid;
+            break;
+          }
+        }
+        if (valid) {
+          search_state->single_rd_order[dir][mode][count++] = ref_frame;
+        }
+      }
+    }
+  }
+}
+
+static int compound_skip_get_candidates(
+    const AV1_COMP *cpi, const InterModeSearchState *search_state,
+    const int dir, const PREDICTION_MODE mode) {
+  const int mode_offset = INTER_OFFSET(mode);
+  const SingleInterModeState *state =
+      search_state->single_state[dir][mode_offset];
+  const SingleInterModeState *state_modelled =
+      search_state->single_state_modelled[dir][mode_offset];
+
+  int max_candidates = 0;
+  for (int i = 0; i < FWD_REFS; ++i) {
+    if (search_state->single_rd_order[dir][mode_offset][i] == NONE_FRAME)
+      break;
+    max_candidates++;
+  }
+
+  int candidates = max_candidates;
+  if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 2) {
+    candidates = AOMMIN(2, max_candidates);
+  }
+  if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 3) {
+    if (state[0].rd != INT64_MAX && state_modelled[0].rd != INT64_MAX &&
+        state[0].ref_frame == state_modelled[0].ref_frame)
+      candidates = 1;
+    if (mode == NEARMV || mode == GLOBALMV) candidates = 1;
+  }
+
+  if (cpi->sf.inter_sf.prune_comp_search_by_single_result >= 4) {
+    // Limit the number of candidates to 1 in each direction for compound
+    // prediction.
+    candidates = AOMMIN(1, candidates);
+  }
+  return candidates;
+}
+
+static int compound_skip_by_single_states(
+    const AV1_COMP *cpi, const InterModeSearchState *search_state,
+    const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame,
+    const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) {
+  const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame };
+  const int mode[2] = { compound_ref0_mode(this_mode),
+                        compound_ref1_mode(this_mode) };
+  const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) };
+  const int mode_dir[2] = { refs[0] <= GOLDEN_FRAME ? 0 : 1,
+                            refs[1] <= GOLDEN_FRAME ? 0 : 1 };
+  int ref_searched[2] = { 0, 0 };
+  int ref_mv_match[2] = { 1, 1 };
+  int i, j;
+
+  for (i = 0; i < 2; ++i) {
+    const SingleInterModeState *state =
+        search_state->single_state[mode_dir[i]][mode_offset[i]];
+    const int state_cnt =
+        search_state->single_state_cnt[mode_dir[i]][mode_offset[i]];
+    for (j = 0; j < state_cnt; ++j) {
+      if (state[j].ref_frame == refs[i]) {
+        ref_searched[i] = 1;
+        break;
+      }
+    }
+  }
+
+  const int ref_set = get_drl_refmv_count(x, refs, this_mode);
+  for (i = 0; i < 2; ++i) {
+    if (!ref_searched[i] || (mode[i] != NEARESTMV && mode[i] != NEARMV)) {
+      continue;
+    }
+    const MV_REFERENCE_FRAME single_refs[2] = { refs[i], NONE_FRAME };
+    for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ref_mv_idx++) {
+      int_mv single_mv;
+      int_mv comp_mv;
+      get_this_mv(&single_mv, mode[i], 0, ref_mv_idx, 0, single_refs,
+                  x->mbmi_ext);
+      get_this_mv(&comp_mv, this_mode, i, ref_mv_idx, 0, refs, x->mbmi_ext);
+      if (single_mv.as_int != comp_mv.as_int) {
+        ref_mv_match[i] = 0;
+        break;
+      }
+    }
+  }
+
+  for (i = 0; i < 2; ++i) {
+    if (!ref_searched[i] || !ref_mv_match[i]) continue;
+    const int candidates =
+        compound_skip_get_candidates(cpi, search_state, mode_dir[i], mode[i]);
+    const MV_REFERENCE_FRAME *ref_order =
+        search_state->single_rd_order[mode_dir[i]][mode_offset[i]];
+    int match = 0;
+    for (j = 0; j < candidates; ++j) {
+      if (refs[i] == ref_order[j]) {
+        match = 1;
+        break;
+      }
+    }
+    if (!match) return 1;
+  }
+
+  return 0;
+}
+
+// Check whether the ref frames of the current block match those of the given
+// block.
+static INLINE void match_ref_frame(const MB_MODE_INFO *const mbmi,
+                                   const MV_REFERENCE_FRAME *ref_frames,
+                                   int *const is_ref_match) {
+  if (is_inter_block(mbmi)) {
+    is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[0];
+    is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[0];
+    if (has_second_ref(mbmi)) {
+      is_ref_match[0] |= ref_frames[0] == mbmi->ref_frame[1];
+      is_ref_match[1] |= ref_frames[1] == mbmi->ref_frame[1];
+    }
+  }
+}
+
+// Prune compound mode using ref frames of neighbor blocks.
+static INLINE int compound_skip_using_neighbor_refs(
+    MACROBLOCKD *const xd, const PREDICTION_MODE this_mode,
+    const MV_REFERENCE_FRAME *ref_frames, int prune_compound_using_neighbors) {
+  // Exclude non-extended compound modes from pruning.
+  if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
+      this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
+    return 0;
+
+  int is_ref_match[2] = { 0 };  // 0 - match for forward refs
+                                // 1 - match for backward refs
+  // Check whether the ref frames of this block match the left neighbor's.
+  if (xd->left_available)
+    match_ref_frame(xd->left_mbmi, ref_frames, is_ref_match);
+
+  // Check whether the ref frames of this block match the above neighbor's.
+  if (xd->up_available)
+    match_ref_frame(xd->above_mbmi, ref_frames, is_ref_match);
+
+  // Combine ref frame match with neighbors in forward and backward refs.
+  const int track_ref_match = is_ref_match[0] + is_ref_match[1];
+
+  // Pruning based on ref frame match with neighbors.
+  if (track_ref_match >= prune_compound_using_neighbors) return 0;
+  return 1;
+}
+
+static int compare_int64(const void *a, const void *b) {
+  int64_t a64 = *((int64_t *)a);
+  int64_t b64 = *((int64_t *)b);
+  if (a64 < b64) {
+    return -1;
+  } else if (a64 == b64) {
+    return 0;
+  } else {
+    return 1;
+  }
+}
+
+static INLINE void update_search_state(
+    InterModeSearchState *search_state, RD_STATS *best_rd_stats_dst,
+    PICK_MODE_CONTEXT *ctx, const RD_STATS *new_best_rd_stats,
+    const RD_STATS *new_best_rd_stats_y, const RD_STATS *new_best_rd_stats_uv,
+    THR_MODES new_best_mode, const MACROBLOCK *x, int txfm_search_done) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int mode_is_intra =
+      (av1_mode_defs[new_best_mode].mode < INTRA_MODE_END);
+  const int skip = mbmi->skip && !mode_is_intra;
+
+  search_state->best_rd = new_best_rd_stats->rdcost;
+  search_state->best_mode_index = new_best_mode;
+  *best_rd_stats_dst = *new_best_rd_stats;
+  search_state->best_mbmode = *mbmi;
+  search_state->best_skip2 = skip;
+  search_state->best_mode_skippable = new_best_rd_stats->skip;
+  // When !txfm_search_done, new_best_rd_stats won't provide correct rate_y
+  // and rate_uv because the av1_txfm_search process is replaced by rd
+  // estimation. Therefore, we should avoid updating best_rate_y and
+  // best_rate_uv here. These two values will be updated when av1_txfm_search
+  // is called.
+  if (txfm_search_done) {
+    search_state->best_rate_y =
+        new_best_rd_stats_y->rate +
+        x->skip_cost[skip_ctx][new_best_rd_stats->skip || skip];
+    search_state->best_rate_uv = new_best_rd_stats_uv->rate;
+  }
+  memcpy(ctx->blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+  av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk);
+}
+
+// Find the best RD for a reference frame (among single reference modes)
+// and store +10% of it in the 0-th element in ref_frame_rd.
+static AOM_INLINE void find_top_ref(int64_t ref_frame_rd[REF_FRAMES]) {
+  assert(ref_frame_rd[0] == INT64_MAX);
+  int64_t ref_copy[REF_FRAMES - 1];
+  memcpy(ref_copy, ref_frame_rd + 1,
+         sizeof(ref_frame_rd[0]) * (REF_FRAMES - 1));
+  qsort(ref_copy, REF_FRAMES - 1, sizeof(int64_t), compare_int64);
+
+  int64_t cutoff = ref_copy[0];
+  // The cut-off is within 10% of the best.
+  if (cutoff != INT64_MAX) {
+    assert(cutoff < INT64_MAX / 200);
+    cutoff = (110 * cutoff) / 100;
+  }
+  ref_frame_rd[0] = cutoff;
+}
+
+// Check if either frame is within the cutoff.
+static INLINE bool in_single_ref_cutoff(int64_t ref_frame_rd[REF_FRAMES], + MV_REFERENCE_FRAME frame1, + MV_REFERENCE_FRAME frame2) { + assert(frame2 > 0); + return ref_frame_rd[frame1] <= ref_frame_rd[0] || + ref_frame_rd[frame2] <= ref_frame_rd[0]; +} + +static AOM_INLINE void evaluate_motion_mode_for_winner_candidates( + const AV1_COMP *const cpi, MACROBLOCK *const x, RD_STATS *const rd_cost, + HandleInterModeArgs *const args, TileDataEnc *const tile_data, + PICK_MODE_CONTEXT *const ctx, + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], + const motion_mode_best_st_candidate *const best_motion_mode_cands, + int do_tx_search, const BLOCK_SIZE bsize, int64_t *const best_est_rd, + InterModeSearchState *const search_state) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + InterModesInfo *const inter_modes_info = x->inter_modes_info; + const int num_best_cand = best_motion_mode_cands->num_motion_mode_cand; + + for (int cand = 0; cand < num_best_cand; cand++) { + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + av1_init_rd_stats(&rd_stats); + av1_init_rd_stats(&rd_stats_y); + av1_init_rd_stats(&rd_stats_uv); + int disable_skip = 0, rate_mv; + + rate_mv = best_motion_mode_cands->motion_mode_cand[cand].rate_mv; + args->skip_motion_mode = + best_motion_mode_cands->motion_mode_cand[cand].skip_motion_mode; + *mbmi = best_motion_mode_cands->motion_mode_cand[cand].mbmi; + rd_stats.rate = + best_motion_mode_cands->motion_mode_cand[cand].rate2_nocoeff; + + // Continue if the best candidate is compound. + if (!is_inter_singleref_mode(mbmi->mode)) continue; + + x->force_skip = 0; + const int mode_index = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + struct macroblockd_plane *p = xd->plane; + const BUFFER_SET orig_dst = { + { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, + { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, + }; + + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + args->simple_rd_state = x->simple_rd_state[mode_index]; + // Initialize motion mode to simple translation + // Calculation of switchable rate depends on it. 
+    mbmi->motion_mode = SIMPLE_TRANSLATION;
+    const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+    for (int i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+      if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+    }
+
+    int64_t skip_rd[2] = { search_state->best_skip_rd[0],
+                           search_state->best_skip_rd[1] };
+    int64_t ret_value = motion_mode_rd(
+        cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+        &disable_skip, args, search_state->best_rd, skip_rd, &rate_mv,
+        &orig_dst, best_est_rd, do_tx_search, inter_modes_info, 1);
+
+    if (ret_value != INT64_MAX) {
+      rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+      const THR_MODES mode_enum = get_prediction_mode_idx(
+          mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+      // Collect mode stats for multiwinner mode processing.
+      store_winner_mode_stats(
+          &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv,
+          mode_enum, NULL, bsize, rd_stats.rdcost,
+          cpi->sf.winner_mode_sf.enable_multiwinner_mode_process,
+          do_tx_search);
+      if (rd_stats.rdcost < search_state->best_rd) {
+        update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
+                            &rd_stats_uv, mode_enum, x, do_tx_search);
+        if (do_tx_search) search_state->best_skip_rd[0] = skip_rd[0];
+      }
+    }
+  }
+}
+
+// Arguments for speed feature pruning of inter mode search
+typedef struct {
+  int *skip_motion_mode;
+  mode_skip_mask_t *mode_skip_mask;
+  InterModeSearchState *search_state;
+  int skip_ref_frame_mask;
+  int reach_first_comp_mode;
+  int mode_thresh_mul_fact;
+  int *intra_mode_idx_ls;
+  int *intra_mode_num;
+  int prune_cpd_using_sr_stats_ready;
+} InterModeSFArgs;
+
+static int skip_inter_mode(AV1_COMP *cpi, MACROBLOCK *x,
+                           const BLOCK_SIZE bsize, int64_t *ref_frame_rd,
+                           int midx, InterModeSFArgs *args) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  // Get the actual prediction mode we are trying in this iteration.
+  const THR_MODES mode_enum = av1_default_mode_order[midx];
+  const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
+  const PREDICTION_MODE this_mode = mode_def->mode;
+  const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame;
+  const MV_REFERENCE_FRAME ref_frame = ref_frames[0];
+  const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
+  const int comp_pred = second_ref_frame > INTRA_FRAME;
+  const int last_single_ref_mode_idx =
+      find_last_single_ref_mode_idx(av1_default_mode_order);
+
+  // After we are done with the single reference modes, find the best
+  // single-ref RD and widen it by 10%. Only search compound modes that have
+  // a reference frame within that cutoff.
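+  // find_top_ref() places min(single-ref rd) * 110 / 100 in ref_frame_rd[0],
+  // so a compound pair survives only if at least one of its references has a
+  // single-ref rd within ~10% of the best (see in_single_ref_cutoff()).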
+  if (sf->inter_sf.prune_compound_using_single_ref &&
+      midx == last_single_ref_mode_idx + 1) {
+    find_top_ref(ref_frame_rd);
+    args->prune_cpd_using_sr_stats_ready = 1;
+  }
+
+  // Check if this mode should be skipped because it is incompatible with the
+  // current frame.
+  if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames))
+    return 1;
+  const int ret = inter_mode_search_order_independent_skip(
+      cpi, x, args->mode_skip_mask, args->search_state,
+      args->skip_ref_frame_mask, this_mode, mode_def->ref_frame);
+  if (ret == 1) return 1;
+  *(args->skip_motion_mode) = (ret == 2);
+
+  // We've reached the first compound prediction mode; get stats from the
+  // single reference predictors to help with pruning.
+  if (sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred &&
+      args->reach_first_comp_mode == 0) {
+    analyze_single_states(cpi, args->search_state);
+    args->reach_first_comp_mode = 1;
+  }
+
+  // Prune aggressively when the best mode so far is skippable.
+  int mul_fact = args->search_state->best_mode_skippable
+                     ? args->mode_thresh_mul_fact
+                     : (1 << MODE_THRESH_QBITS);
+  int64_t mode_threshold =
+      (args->search_state->mode_threshold[mode_enum] * mul_fact) >>
+      MODE_THRESH_QBITS;
+
+  if (args->search_state->best_rd < mode_threshold) return 1;
+
+  // Skip this compound mode based on the RD results from the single
+  // prediction modes.
+  if (sf->inter_sf.prune_comp_search_by_single_result > 0 && comp_pred) {
+    if (compound_skip_by_single_states(cpi, args->search_state, this_mode,
+                                       ref_frame, second_ref_frame, x))
+      return 1;
+  }
+
+  // Speed features to prune out INTRA frames
+  if (ref_frame == INTRA_FRAME) {
+    if ((!cpi->oxcf.enable_smooth_intra ||
+         sf->intra_sf.disable_smooth_intra) &&
+        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+         mbmi->mode == SMOOTH_V_PRED))
+      return 1;
+    if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) return 1;
+    if (sf->inter_sf.adaptive_mode_search > 1)
+      if ((x->source_variance << num_pels_log2_lookup[bsize]) >
+          args->search_state->best_pred_sse)
+        return 1;
+
+    // Intra modes will be handled in another loop later.
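+    // (They are queued in intra_mode_idx_ls[] here and evaluated after the
+    // main mode loop in av1_rd_pick_inter_mode_sb().)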
+ assert(*args->intra_mode_num < INTRA_MODES); + args->intra_mode_idx_ls[(*args->intra_mode_num)++] = mode_enum; + return 1; + } + + if (sf->inter_sf.prune_compound_using_single_ref && + args->prune_cpd_using_sr_stats_ready && comp_pred && + !in_single_ref_cutoff(ref_frame_rd, ref_frame, second_ref_frame)) { + return 1; + } + + if (sf->inter_sf.prune_compound_using_neighbors && comp_pred) { + if (compound_skip_using_neighbor_refs( + xd, this_mode, ref_frames, + sf->inter_sf.prune_compound_using_neighbors)) + return 1; + } + + return 0; +} + +static void record_best_compound(REFERENCE_MODE reference_mode, + RD_STATS *rd_stats, int comp_pred, int rdmult, + InterModeSearchState *search_state, + int compmode_cost) { + int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; + + if (reference_mode == REFERENCE_MODE_SELECT) { + single_rate = rd_stats->rate - compmode_cost; + hybrid_rate = rd_stats->rate; + } else { + single_rate = rd_stats->rate; + hybrid_rate = rd_stats->rate + compmode_cost; + } + + single_rd = RDCOST(rdmult, single_rate, rd_stats->dist); + hybrid_rd = RDCOST(rdmult, hybrid_rate, rd_stats->dist); + + if (!comp_pred) { + if (single_rd < + search_state->intra_search_state.best_pred_rd[SINGLE_REFERENCE]) + search_state->intra_search_state.best_pred_rd[SINGLE_REFERENCE] = + single_rd; + } else { + if (single_rd < + search_state->intra_search_state.best_pred_rd[COMPOUND_REFERENCE]) + search_state->intra_search_state.best_pred_rd[COMPOUND_REFERENCE] = + single_rd; + } + if (hybrid_rd < + search_state->intra_search_state.best_pred_rd[REFERENCE_MODE_SELECT]) + search_state->intra_search_state.best_pred_rd[REFERENCE_MODE_SELECT] = + hybrid_rd; +} + +// Indicates number of winner simple translation modes to be used +static const unsigned int num_winner_motion_modes[3] = { 0, 10, 3 }; + +// Adds a motion mode to the candidate list for motion_mode_for_winner_cand +// speed feature. This list consists of modes that have only searched +// SIMPLE_TRANSLATION. The final list will be used to search other motion +// modes after the initial RD search. 
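+// For example: with max_winner_motion_mode_cand = 3 and a full list of
+// rd_costs { 100, 200, 300 }, a new candidate with this_rd = 150 is inserted
+// at index 1; the memmove() below shifts the 200 entry to index 2,
+// overwriting the 300 entry and leaving { 100, 150, 200 }.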
+static void handle_winner_cand( + MB_MODE_INFO *const mbmi, + motion_mode_best_st_candidate *best_motion_mode_cands, + int max_winner_motion_mode_cand, int64_t this_rd, + motion_mode_candidate *motion_mode_cand, int skip_motion_mode) { + // Number of current motion mode candidates in list + const int num_motion_mode_cand = best_motion_mode_cands->num_motion_mode_cand; + int valid_motion_mode_cand_loc = num_motion_mode_cand; + + // find the best location to insert new motion mode candidate + for (int j = 0; j < num_motion_mode_cand; j++) { + if (this_rd < best_motion_mode_cands->motion_mode_cand[j].rd_cost) { + valid_motion_mode_cand_loc = j; + break; + } + } + + // Insert motion mode if location is found + if (valid_motion_mode_cand_loc < max_winner_motion_mode_cand) { + if (num_motion_mode_cand > 0 && + valid_motion_mode_cand_loc < max_winner_motion_mode_cand - 1) + memmove( + &best_motion_mode_cands + ->motion_mode_cand[valid_motion_mode_cand_loc + 1], + &best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc], + (AOMMIN(num_motion_mode_cand, max_winner_motion_mode_cand - 1) - + valid_motion_mode_cand_loc) * + sizeof(best_motion_mode_cands->motion_mode_cand[0])); + motion_mode_cand->mbmi = *mbmi; + motion_mode_cand->rd_cost = this_rd; + motion_mode_cand->skip_motion_mode = skip_motion_mode; + best_motion_mode_cands->motion_mode_cand[valid_motion_mode_cand_loc] = + *motion_mode_cand; + best_motion_mode_cands->num_motion_mode_cand = + AOMMIN(max_winner_motion_mode_cand, + best_motion_mode_cands->num_motion_mode_cand + 1); + } +} + +void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *x, RD_STATS *rd_cost, + const BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + AV1_COMMON *const cm = &cpi->common; + const FeatureFlags *const features = &cm->features; + const int num_planes = av1_num_planes(cm); + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + int i; + const int *comp_inter_cost = + x->comp_inter_cost[av1_get_reference_mode_context(xd)]; + + InterModeSearchState search_state; + init_inter_mode_search_state(&search_state, cpi, x, bsize, best_rd_so_far); + INTERINTRA_MODE interintra_modes[REF_FRAMES] = { + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES + }; + HandleInterModeArgs args = { { NULL }, + { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, + { NULL }, + { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }, + NULL, + NULL, + NULL, + search_state.modelled_rd, + INT_MAX, + INT_MAX, + search_state.simple_rd, + 0, + interintra_modes, + 1, + NULL, + { { { 0 }, { { 0 } }, { 0 }, 0, 0, 0, 0 } }, + 0 }; + // Indicates the appropriate number of simple translation winner modes for + // exhaustive motion mode evaluation + const int max_winner_motion_mode_cand = + num_winner_motion_modes[cpi->sf.winner_mode_sf + .motion_mode_for_winner_cand]; + assert(max_winner_motion_mode_cand <= MAX_WINNER_MOTION_MODES); + motion_mode_candidate motion_mode_cand; + motion_mode_best_st_candidate best_motion_mode_cands; + // Initializing the number of motion mode candidates to zero. 
+  best_motion_mode_cands.num_motion_mode_cand = 0;
+  for (i = 0; i < MAX_WINNER_MOTION_MODES; ++i)
+    best_motion_mode_cands.motion_mode_cand[i].rd_cost = INT64_MAX;
+
+  for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+
+  av1_invalid_rd_stats(rd_cost);
+
+  // Ref frames that are selected by square partition blocks.
+  int picked_ref_frames_mask = 0;
+  if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions &&
+      mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+    // prune_ref_frame_for_rect_partitions = 1 implies pruning only extended
+    // partition blocks. prune_ref_frame_for_rect_partitions >= 2
+    // implies pruning for vert, horiz and extended partition blocks.
+    if ((mbmi->partition != PARTITION_VERT &&
+         mbmi->partition != PARTITION_HORZ) ||
+        cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions >= 2) {
+      picked_ref_frames_mask =
+          fetch_picked_ref_frames_mask(x, bsize, cm->seq_params.mib_size);
+    }
+  }
+
+  // Skip ref frames that were never selected by square blocks.
+  const int skip_ref_frame_mask =
+      picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
+  mode_skip_mask_t mode_skip_mask;
+  unsigned int ref_costs_single[REF_FRAMES];
+  unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+  // init params, set frame modes, speed features
+  set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask,
+                                skip_ref_frame_mask, ref_costs_single,
+                                ref_costs_comp, yv12_mb);
+
+  int64_t best_est_rd = INT64_MAX;
+  const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+  // If do_tx_search is 0, only estimated RD should be computed.
+  // If do_tx_search is 1, all modes have TX search performed.
+  const int do_tx_search =
+      !((cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && md->ready) ||
+        (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 2 &&
+         num_pels_log2_lookup[bsize] > 8) ||
+        cpi->sf.rt_sf.force_tx_search_off);
+  InterModesInfo *inter_modes_info = x->inter_modes_info;
+  inter_modes_info->num = 0;
+
+  int intra_mode_num = 0;
+  int intra_mode_idx_ls[INTRA_MODES];
+
+  // Temporary buffers used by handle_inter_mode().
+  uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_obmc_bufs[0]);
+
+  // The best RD found for each reference frame, among single reference modes.
+  // Note that the 0-th element will contain a cut-off that is later used
+  // to determine if we should skip a compound mode.
+  int64_t ref_frame_rd[REF_FRAMES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+                                       INT64_MAX, INT64_MAX, INT64_MAX,
+                                       INT64_MAX, INT64_MAX };
+  const int skip_ctx = av1_get_skip_context(xd);
+
+  // Prepared stats used later to check if we could skip intra mode eval.
+  int64_t inter_cost = -1;
+  int64_t intra_cost = -1;
+  // Need to tweak the threshold for hdres speed 0 & 1.
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+
+  // Obtain the relevant tpl stats for pruning inter modes
+  PruneInfoFromTpl inter_cost_info_from_tpl;
+#if !CONFIG_REALTIME_ONLY
+  if (cpi->sf.inter_sf.prune_inter_modes_based_on_tpl) {
+    // x->search_ref_frame[id] = 1 => no pruning in
+    // prune_ref_by_selective_ref_frame()
+    // x->search_ref_frame[id] = 0 => ref frame can be pruned in
+    // prune_ref_by_selective_ref_frame()
+    // Populating valid_refs[idx] = 1 ensures that
+    // 'inter_cost_info_from_tpl.best_inter_cost' does not correspond to a
+    // pruned ref frame.
+ int valid_refs[INTER_REFS_PER_FRAME]; + for (MV_REFERENCE_FRAME frame = LAST_FRAME; frame < REF_FRAMES; frame++) { + const MV_REFERENCE_FRAME refs[2] = { frame, NONE_FRAME }; + valid_refs[frame - 1] = + x->search_ref_frame[frame] || + !prune_ref_by_selective_ref_frame( + cpi, x, refs, cm->cur_frame->ref_display_order_hint); + } + av1_zero(inter_cost_info_from_tpl); + get_block_level_tpl_stats(cpi, bsize, mi_row, mi_col, valid_refs, + &inter_cost_info_from_tpl); + } +#endif + const int do_pruning = + (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1; + if (do_pruning && sf->intra_sf.skip_intra_in_interframe) { + // Only consider full SB. + int len = tpl_blocks_in_sb(cm->seq_params.sb_size); + if (len == x->valid_cost_b) { + const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D); + const int tplw = mi_size_wide[tpl_bsize]; + const int tplh = mi_size_high[tpl_bsize]; + const int nw = mi_size_wide[bsize] / tplw; + const int nh = mi_size_high[bsize] / tplh; + if (nw >= 1 && nh >= 1) { + const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size]; + const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size]; + const int start = of_h / tplh * x->cost_stride + of_w / tplw; + + for (int k = 0; k < nh; k++) { + for (int l = 0; l < nw; l++) { + inter_cost += x->inter_cost_b[start + k * x->cost_stride + l]; + intra_cost += x->intra_cost_b[start + k * x->cost_stride + l]; + } + } + inter_cost /= nw * nh; + intra_cost /= nw * nh; + } + } + } + + // Initialize best mode stats for winner mode processing + av1_zero(x->winner_mode_stats); + x->winner_mode_count = 0; + store_winner_mode_stats( + &cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize, + best_rd_so_far, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, + 0); + + int mode_thresh_mul_fact = (1 << MODE_THRESH_QBITS); + if (sf->inter_sf.prune_inter_modes_if_skippable) { + // Higher multiplication factor values for lower quantizers. + mode_thresh_mul_fact = mode_threshold_mul_factor[x->qindex]; + } + + // Initialize arguments for mode loop speed features + InterModeSFArgs sf_args = { &args.skip_motion_mode, + &mode_skip_mask, + &search_state, + skip_ref_frame_mask, + 0, + mode_thresh_mul_fact, + intra_mode_idx_ls, + &intra_mode_num, + 0 }; + + // Here midx is just an iterator index that should not be used by itself + // except to keep track of the number of modes searched. It should be used + // with av1_default_mode_order to get the enum that defines the mode, which + // can be used with av1_mode_defs to get the prediction mode and the ref + // frames. + for (THR_MODES midx = THR_MODE_START; midx < THR_MODE_END; ++midx) { + // Get the actual prediction mode we are trying in this iteration + const THR_MODES mode_enum = av1_default_mode_order[midx]; + const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; + const PREDICTION_MODE this_mode = mode_def->mode; + const MV_REFERENCE_FRAME *ref_frames = mode_def->ref_frame; + + const MV_REFERENCE_FRAME ref_frame = ref_frames[0]; + const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1]; + const int is_single_pred = + ref_frame > INTRA_FRAME && second_ref_frame == NONE_FRAME; + const int comp_pred = second_ref_frame > INTRA_FRAME; + + init_mbmi(mbmi, this_mode, ref_frames, cm); + + x->force_skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + // Apply speed features to decide if this inter mode can be skipped + if (skip_inter_mode(cpi, x, bsize, ref_frame_rd, midx, &sf_args)) continue; + + // Select prediction reference frames. 
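+    // (yv12_mb[] was filled by set_params_rd_pick_inter_mode() above; pre[0]
+    // and pre[1] are pointed at the planes of the selected reference frames.)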
+ for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } + + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->ref_mv_idx = 0; + + const int64_t ref_best_rd = search_state.best_rd; + int disable_skip = 0; + RD_STATS rd_stats, rd_stats_y, rd_stats_uv; + av1_init_rd_stats(&rd_stats); + + const int ref_frame_cost = comp_pred + ? ref_costs_comp[ref_frame][second_ref_frame] + : ref_costs_single[ref_frame]; + const int compmode_cost = + is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0; + const int real_compmode_cost = + cm->current_frame.reference_mode == REFERENCE_MODE_SELECT + ? compmode_cost + : 0; + // Point to variables that are maintained between loop iterations + args.single_newmv = search_state.single_newmv; + args.single_newmv_rate = search_state.single_newmv_rate; + args.single_newmv_valid = search_state.single_newmv_valid; + args.single_comp_cost = real_compmode_cost; + args.ref_frame_cost = ref_frame_cost; + if (is_single_pred) { + args.simple_rd_state = x->simple_rd_state[mode_enum]; + } + + int64_t skip_rd[2] = { search_state.best_skip_rd[0], + search_state.best_skip_rd[1] }; + int64_t this_rd = handle_inter_mode( + cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, + &disable_skip, &args, ref_best_rd, tmp_buf, &x->comp_rd_buffer, + &best_est_rd, do_tx_search, inter_modes_info, &motion_mode_cand, + skip_rd, &inter_cost_info_from_tpl); + + if (sf->inter_sf.prune_comp_search_by_single_result > 0 && + is_inter_singleref_mode(this_mode) && args.single_ref_first_pass) { + collect_single_states(x, &search_state, mbmi); + } + + if (this_rd == INT64_MAX) continue; + + if (mbmi->skip) { + rd_stats_y.rate = 0; + rd_stats_uv.rate = 0; + } + + if (sf->inter_sf.prune_compound_using_single_ref && is_single_pred && + this_rd < ref_frame_rd[ref_frame]) { + ref_frame_rd[ref_frame] = this_rd; + } + + // Did this mode help, i.e., is it the new best mode + if (this_rd < search_state.best_rd) { + assert(IMPLIES(comp_pred, + cm->current_frame.reference_mode != SINGLE_REFERENCE)); + search_state.best_pred_sse = x->pred_sse[ref_frame]; + update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, + &rd_stats_uv, mode_enum, x, do_tx_search); + if (do_tx_search) search_state.best_skip_rd[0] = skip_rd[0]; + search_state.best_skip_rd[1] = skip_rd[1]; + } + if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) { + // Add this mode to motion mode candidate list for motion mode search + // if using motion_mode_for_winner_cand speed feature + handle_winner_cand(mbmi, &best_motion_mode_cands, + max_winner_motion_mode_cand, this_rd, + &motion_mode_cand, args.skip_motion_mode); + } + + /* keep record of best compound/single-only prediction */ + if (!disable_skip) { + record_best_compound(cm->current_frame.reference_mode, &rd_stats, + comp_pred, x->rdmult, &search_state, compmode_cost); + } + } + + if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) { + // For the single ref winner candidates, evaluate other motion modes (non + // simple translation). 
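+    // (So far these candidates have only been searched with
+    // SIMPLE_TRANSLATION; see the comment above handle_winner_cand().)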
+ evaluate_motion_mode_for_winner_candidates( + cpi, x, rd_cost, &args, tile_data, ctx, yv12_mb, + &best_motion_mode_cands, do_tx_search, bsize, &best_est_rd, + &search_state); + } + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, do_tx_search_time); +#endif + if (do_tx_search != 1) { + inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr); + search_state.best_rd = best_rd_so_far; + search_state.best_mode_index = THR_INVALID; + // Initialize best mode stats for winner mode processing + x->winner_mode_count = 0; + store_winner_mode_stats( + &cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID, NULL, bsize, + best_rd_so_far, cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, + do_tx_search); + inter_modes_info->num = + inter_modes_info->num < cpi->sf.rt_sf.num_inter_modes_for_tx_search + ? inter_modes_info->num + : cpi->sf.rt_sf.num_inter_modes_for_tx_search; + const int64_t top_est_rd = + inter_modes_info->num > 0 + ? inter_modes_info + ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx] + : INT64_MAX; + for (int j = 0; j < inter_modes_info->num; ++j) { + const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx; + *mbmi = inter_modes_info->mbmi_arr[data_idx]; + int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx]; + if (curr_est_rd * 0.80 > top_est_rd) break; + + x->force_skip = 0; + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. + const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + if (mbmi->motion_mode == OBMC_CAUSAL) { + av1_build_obmc_inter_predictors_sb(cm, xd); + } + + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + const int mode_rate = inter_modes_info->mode_rate_arr[data_idx]; + int64_t skip_rd = INT64_MAX; + if (cpi->sf.inter_sf.txfm_rd_gate_level) { + // Check if the mode is good enough based on skip RD + int64_t curr_sse = inter_modes_info->sse_arr[data_idx]; + skip_rd = RDCOST(x->rdmult, mode_rate, curr_sse); + int eval_txfm = + check_txfm_eval(x, bsize, search_state.best_skip_rd[0], skip_rd, + cpi->sf.inter_sf.txfm_rd_gate_level, 0); + if (!eval_txfm) continue; + } + + if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, + mode_rate, search_state.best_rd)) { + continue; + } else if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse, + rd_stats.dist, + rd_stats_y.rate + rd_stats_uv.rate + + x->skip_cost[skip_ctx][mbmi->skip]); + } + rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); + + const THR_MODES mode_enum = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, &rd_stats, &rd_stats_y, &rd_stats_uv, + mode_enum, NULL, bsize, rd_stats.rdcost, + cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, + txfm_search_done); + + if (rd_stats.rdcost < search_state.best_rd) { + update_search_state(&search_state, rd_cost, ctx, &rd_stats, &rd_stats_y, + &rd_stats_uv, mode_enum, x, txfm_search_done); + search_state.best_skip_rd[0] = skip_rd; + } + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, 
do_tx_search_time); +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_intra_mode_time); +#endif + + // Gate intra mode evaluation if best of inter is skip except when source + // variance is extremely low + if (sf->intra_sf.skip_intra_in_interframe && + (x->source_variance > sf->intra_sf.src_var_thresh_intra_skip)) { + if (inter_cost >= 0 && intra_cost >= 0) { + aom_clear_system_state(); + const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480) + ? &av1_intrap_nn_config + : &av1_intrap_hd_nn_config; + float nn_features[6]; + float scores[2] = { 0.0f }; + float probs[2] = { 0.0f }; + + nn_features[0] = (float)search_state.best_mbmode.skip; + nn_features[1] = (float)mi_size_wide_log2[bsize]; + nn_features[2] = (float)mi_size_high_log2[bsize]; + nn_features[3] = (float)intra_cost; + nn_features[4] = (float)inter_cost; + const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); + const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd); + nn_features[5] = (float)(ac_q_max / ac_q); + + av1_nn_predict(nn_features, nn_config, 1, scores); + aom_clear_system_state(); + av1_nn_softmax(scores, probs, 2); + + if (probs[1] > 0.8) search_state.intra_search_state.skip_intra_modes = 1; + } else if ((search_state.best_mbmode.skip) && + (sf->intra_sf.skip_intra_in_interframe >= 2)) { + search_state.intra_search_state.skip_intra_modes = 1; + } + } + + const int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME]; + for (int j = 0; j < intra_mode_num; ++j) { + if (sf->intra_sf.skip_intra_in_interframe && + search_state.intra_search_state.skip_intra_modes) + break; + const THR_MODES mode_enum = intra_mode_idx_ls[j]; + const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum]; + const PREDICTION_MODE this_mode = mode_def->mode; + + assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME); + assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME); + init_mbmi(mbmi, this_mode, av1_mode_defs[mode_enum].ref_frame, cm); + x->force_skip = 0; + + if (this_mode != DC_PRED) { + // Only search the oblique modes if the best so far is + // one of the neighboring directional modes + if ((sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) { + if (search_state.best_mode_index != THR_INVALID && + search_state.best_mbmode.ref_frame[0] > INTRA_FRAME) + continue; + } + if (sf->rt_sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra( + this_mode, search_state.intra_search_state.best_intra_mode)) + continue; + } + } + + RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv; + intra_rd_stats.rdcost = av1_handle_intra_mode( + &search_state.intra_search_state, cpi, x, bsize, intra_ref_frame_cost, + ctx, 0, &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv, + search_state.best_rd, &search_state.best_intra_rd, + search_state.best_mbmode.skip); + // Collect mode stats for multiwinner mode processing + const int txfm_search_done = 1; + store_winner_mode_stats( + &cpi->common, x, mbmi, &intra_rd_stats, &intra_rd_stats_y, + &intra_rd_stats_uv, mode_enum, NULL, bsize, intra_rd_stats.rdcost, + cpi->sf.winner_mode_sf.enable_multiwinner_mode_process, + txfm_search_done); + if (intra_rd_stats.rdcost < search_state.best_rd) { + update_search_state(&search_state, rd_cost, ctx, &intra_rd_stats, + &intra_rd_stats_y, &intra_rd_stats_uv, mode_enum, x, + txfm_search_done); + } + } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_intra_mode_time); +#endif + + int winner_mode_count = 
cpi->sf.winner_mode_sf.enable_multiwinner_mode_process + ? x->winner_mode_count + : 1; + // In effect only when fast tx search speed features are enabled. + refine_winner_mode_tx( + cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index, + &search_state.best_mbmode, yv12_mb, search_state.best_rate_y, + search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count); + + // Initialize default mode evaluation params + set_mode_eval_params(cpi, x, DEFAULT_EVAL); + + // Only try palette mode when the best mode so far is an intra mode. + const int try_palette = + cpi->oxcf.enable_palette && + av1_allow_palette(features->allow_screen_content_tools, mbmi->sb_type) && + !is_inter_mode(search_state.best_mbmode.mode); + PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; + RD_STATS this_rd_cost; + int this_skippable = 0; + if (try_palette) { + this_skippable = av1_search_palette_mode( + cpi, x, &this_rd_cost, ctx, bsize, mbmi, pmi, ref_costs_single, + &search_state.intra_search_state, search_state.best_rd); + if (this_rd_cost.rdcost < search_state.best_rd) { + search_state.best_mode_index = THR_DC; + mbmi->mv[0].as_int = 0; + rd_cost->rate = this_rd_cost.rate; + rd_cost->dist = this_rd_cost.dist; + rd_cost->rdcost = this_rd_cost.rdcost; + search_state.best_rd = rd_cost->rdcost; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 = 0; + search_state.best_mode_skippable = this_skippable; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + av1_copy_array(ctx->tx_type_map, xd->tx_type_map, ctx->num_4x4_blk); + } + } + + search_state.best_mbmode.skip_mode = 0; + if (cm->current_frame.skip_mode_info.skip_mode_flag && + is_comp_ref_allowed(bsize)) { + const struct segmentation *const seg = &cm->seg; + unsigned char segment_id = mbmi->segment_id; + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, yv12_mb); + } + } + + // Make sure that the ref_mv_idx is only nonzero when we're + // using a mode which can support ref_mv_idx + if (search_state.best_mbmode.ref_mv_idx != 0 && + !(search_state.best_mbmode.mode == NEWMV || + search_state.best_mbmode.mode == NEW_NEWMV || + have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) { + search_state.best_mbmode.ref_mv_idx = 0; + } + + if (search_state.best_mode_index == THR_INVALID || + search_state.best_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + const InterpFilter interp_filter = features->interp_filter; + assert((interp_filter == SWITCHABLE) || + (interp_filter == + search_state.best_mbmode.interp_filters.as_filters.y_filter) || + !is_inter_block(&search_state.best_mbmode)); + assert((interp_filter == SWITCHABLE) || + (interp_filter == + search_state.best_mbmode.interp_filters.as_filters.x_filter) || + !is_inter_block(&search_state.best_mbmode)); + + if (!cpi->rc.is_src_frame_alt_ref && cpi->sf.inter_sf.adaptive_rd_thresh) { + av1_update_rd_thresh_fact(cm, x->thresh_freq_fact, + sf->inter_sf.adaptive_rd_thresh, bsize, + search_state.best_mode_index); + } + + // macroblock modes + *mbmi = search_state.best_mbmode; + x->force_skip |= search_state.best_skip2; + + // Note: this section is needed since the mode may have been forced to + // GLOBALMV by the all-zero mode handling of ref-mv. 
+ if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) { + // Correct the interp filters for GLOBALMV + if (is_nontrans_global_motion(xd, xd->mi[0])) { + int_interpfilters filters = + av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter)); + assert(mbmi->interp_filters.as_int == filters.as_int); + (void)filters; + } + } + + for (i = 0; i < REFERENCE_MODES; ++i) { + if (search_state.intra_search_state.best_pred_rd[i] == INT64_MAX) { + search_state.best_pred_diff[i] = INT_MIN; + } else { + search_state.best_pred_diff[i] = + search_state.best_rd - + search_state.intra_search_state.best_pred_rd[i]; + } + } + + x->force_skip |= search_state.best_mode_skippable; + + assert(search_state.best_mode_index != THR_INVALID); + +#if CONFIG_INTERNAL_STATS + store_coding_context(x, ctx, search_state.best_mode_index, + search_state.best_pred_diff, + search_state.best_mode_skippable); +#else + store_coding_context(x, ctx, search_state.best_pred_diff, + search_state.best_mode_skippable); +#endif // CONFIG_INTERNAL_STATS + + if (pmi->palette_size[1] > 0) { + assert(try_palette); + av1_restore_uv_color_map(cpi, x); + } +} + +void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, + TileDataEnc *tile_data, MACROBLOCK *x, + int mi_row, int mi_col, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + const AV1_COMMON *const cm = &cpi->common; + const FeatureFlags *const features = &cm->features; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + unsigned char segment_id = mbmi->segment_id; + const int comp_pred = 0; + int i; + int64_t best_pred_diff[REFERENCE_MODES]; + unsigned int ref_costs_single[REF_FRAMES]; + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; + int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)]; + InterpFilter best_filter = SWITCHABLE; + int64_t this_rd = INT64_MAX; + int rate2 = 0; + const int64_t distortion2 = 0; + (void)mi_row; + (void)mi_col; + (void)tile_data; + + av1_collect_neighbors_ref_counts(xd); + + estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single, + ref_costs_comp); + + for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; + for (i = LAST_FRAME; i < REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX; + + rd_cost->rate = INT_MAX; + + assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)); + + mbmi->palette_mode_info.palette_size[0] = 0; + mbmi->palette_mode_info.palette_size[1] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->mode = GLOBALMV; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->uv_mode = UV_DC_PRED; + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) + mbmi->ref_frame[0] = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); + else + mbmi->ref_frame[0] = LAST_FRAME; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->mv[0].as_int = + gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]], + features->allow_high_precision_mv, bsize, mi_col, + mi_row, features->cur_frame_force_integer_mv) + .as_int; + mbmi->tx_size = max_txsize_lookup[bsize]; + x->force_skip = 1; + + mbmi->ref_mv_idx = 0; + + mbmi->motion_mode = SIMPLE_TRANSLATION; + av1_count_overlappable_neighbors(cm, xd); + if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) { + int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE]; + mbmi->num_proj_ref = av1_findSamples(cm, xd, pts, pts_inref); + // Select the samples according to motion vector difference + if (mbmi->num_proj_ref > 1) + mbmi->num_proj_ref = 
av1_selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref, + mbmi->num_proj_ref, bsize); + } + + const InterpFilter interp_filter = features->interp_filter; + set_default_interp_filters(mbmi, interp_filter); + + if (interp_filter != SWITCHABLE) { + best_filter = interp_filter; + } else { + best_filter = EIGHTTAP_REGULAR; + if (av1_is_interp_needed(xd) && + x->source_variance >= + cpi->sf.interp_sf.disable_filter_search_var_thresh) { + int rs; + int best_rs = INT_MAX; + for (i = 0; i < SWITCHABLE_FILTERS; ++i) { + mbmi->interp_filters = av1_broadcast_interp_filter(i); + rs = av1_get_switchable_rate(x, xd, interp_filter); + if (rs < best_rs) { + best_rs = rs; + best_filter = mbmi->interp_filters.as_filters.y_filter; + } + } + } + } + // Set the appropriate filter + mbmi->interp_filters = av1_broadcast_interp_filter(best_filter); + rate2 += av1_get_switchable_rate(x, xd, interp_filter); + + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) + rate2 += comp_inter_cost[comp_pred]; + + // Estimate the reference frame signaling cost and add it + // to the rolling cost variable. + rate2 += ref_costs_single[LAST_FRAME]; + this_rd = RDCOST(x->rdmult, rate2, distortion2); + + rd_cost->rate = rate2; + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + + if (this_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + assert((interp_filter == SWITCHABLE) || + (interp_filter == mbmi->interp_filters.as_filters.y_filter)); + + if (cpi->sf.inter_sf.adaptive_rd_thresh) { + av1_update_rd_thresh_fact(cm, x->thresh_freq_fact, + cpi->sf.inter_sf.adaptive_rd_thresh, bsize, + THR_GLOBALMV); + } + + av1_zero(best_pred_diff); + +#if CONFIG_INTERNAL_STATS + store_coding_context(x, ctx, THR_GLOBALMV, best_pred_diff, 0); +#else + store_coding_context(x, ctx, best_pred_diff, 0); +#endif // CONFIG_INTERNAL_STATS +} + +struct calc_target_weighted_pred_ctxt { + const MACROBLOCK *x; + const uint8_t *tmp; + int tmp_stride; + int overlap; +}; + +static INLINE void calc_target_weighted_pred_above( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *nb_mi, void *fun_ctxt, const int num_planes) { + (void)nb_mi; + (void)num_planes; + (void)rel_mi_row; + (void)dir; + + struct calc_target_weighted_pred_ctxt *ctxt = + (struct calc_target_weighted_pred_ctxt *)fun_ctxt; + + const int bw = xd->width << MI_SIZE_LOG2; + const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap); + + int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE); + int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE); + const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE; + const int is_hbd = is_cur_buf_hbd(xd); + + if (!is_hbd) { + for (int row = 0; row < ctxt->overlap; ++row) { + const uint8_t m0 = mask1d[row]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + for (int col = 0; col < op_mi_size * MI_SIZE; ++col) { + wsrc[col] = m1 * tmp[col]; + mask[col] = m0; + } + wsrc += bw; + mask += bw; + tmp += ctxt->tmp_stride; + } + } else { + const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp); + + for (int row = 0; row < ctxt->overlap; ++row) { + const uint8_t m0 = mask1d[row]; + const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0; + for (int col = 0; col < op_mi_size * MI_SIZE; ++col) { + wsrc[col] = m1 * tmp16[col]; + mask[col] = m0; + } + wsrc += bw; + mask += bw; + tmp16 += ctxt->tmp_stride; + } + } +} + +static INLINE void calc_target_weighted_pred_left( + MACROBLOCKD *xd, int rel_mi_row, int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO 
*nb_mi, void *fun_ctxt, const int num_planes) {
+  (void)nb_mi;
+  (void)num_planes;
+  (void)rel_mi_col;
+  (void)dir;
+
+  struct calc_target_weighted_pred_ctxt *ctxt =
+      (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+  const int bw = xd->width << MI_SIZE_LOG2;
+  const uint8_t *const mask1d = av1_get_obmc_mask(ctxt->overlap);
+
+  int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
+  int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw);
+  const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
+  const int is_hbd = is_cur_buf_hbd(xd);
+
+  if (!is_hbd) {
+    for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
+      for (int col = 0; col < ctxt->overlap; ++col) {
+        const uint8_t m0 = mask1d[col];
+        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+        wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+                    (tmp[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+        mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp += ctxt->tmp_stride;
+    }
+  } else {
+    const uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
+
+    for (int row = 0; row < op_mi_size * MI_SIZE; ++row) {
+      for (int col = 0; col < ctxt->overlap; ++col) {
+        const uint8_t m0 = mask1d[col];
+        const uint8_t m1 = AOM_BLEND_A64_MAX_ALPHA - m0;
+        wsrc[col] = (wsrc[col] >> AOM_BLEND_A64_ROUND_BITS) * m0 +
+                    (tmp16[col] << AOM_BLEND_A64_ROUND_BITS) * m1;
+        mask[col] = (mask[col] >> AOM_BLEND_A64_ROUND_BITS) * m0;
+      }
+      wsrc += bw;
+      mask += bw;
+      tmp16 += ctxt->tmp_stride;
+    }
+  }
+}
+
+// This function has a structure similar to av1_build_obmc_inter_prediction().
+//
+// The OBMC predictor is computed as:
+//
+//  PObmc(x,y) =
+//    AOM_BLEND_A64(Mh(x),
+//                  AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
+//                  PLeft(x, y))
+//
+// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
+// rounding, this can be written as:
+//
+//  AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * PObmc(x,y) =
+//    Mh(x) * Mv(y) * P(x,y) +
+//      Mh(x) * Cv(y) * PAbove(x,y) +
+//      AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// Where:
+//
+//  Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
+//  Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
+//
+// This function computes 'wsrc' and 'mask' as:
+//
+//  wsrc(x, y) =
+//    AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
+//      Mh(x) * Cv(y) * PAbove(x,y) -
+//      AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+//  mask(x, y) = Mh(x) * Mv(y)
+//
+// These can then be used to efficiently approximate the error for any
+// predictor P in the context of the provided neighbouring predictors by
+// computing:
+//
+//  error(x, y) =
+//    wsrc(x, y) - mask(x, y) * P(x, y) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
+//
+static AOM_INLINE void calc_target_weighted_pred(
+    const AV1_COMMON *cm, const MACROBLOCK *x, const MACROBLOCKD *xd,
+    const uint8_t *above, int above_stride, const uint8_t *left,
+    int left_stride) {
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const int bw = xd->width << MI_SIZE_LOG2;
+  const int bh = xd->height << MI_SIZE_LOG2;
+  int32_t *mask_buf = x->mask_buf;
+  int32_t *wsrc_buf = x->wsrc_buf;
+
+  const int is_hbd = is_cur_buf_hbd(xd);
+  const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
+
+  // plane 0 should not be sub-sampled
+  assert(xd->plane[0].subsampling_x == 0);
+  assert(xd->plane[0].subsampling_y == 0);
+
+  av1_zero_array(wsrc_buf, bw * bh);
+  for (int i = 0; i < bw * bh; ++i) mask_buf[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+  // handle above row
+  if (xd->up_available) {
+    const int overlap =
+        AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride,
+                                                   overlap };
+    foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd,
+                                  max_neighbor_obmc[mi_size_wide_log2[bsize]],
+                                  calc_target_weighted_pred_above, &ctxt);
+  }
+
+  for (int i = 0; i < bw * bh; ++i) {
+    wsrc_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+    mask_buf[i] *= AOM_BLEND_A64_MAX_ALPHA;
+  }
+
+  // handle left column
+  if (xd->left_available) {
+    const int overlap =
+        AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride,
+                                                   overlap };
+    foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd,
+                                 max_neighbor_obmc[mi_size_high_log2[bsize]],
+                                 calc_target_weighted_pred_left, &ctxt);
+  }
+
+  if (!is_hbd) {
+    const uint8_t *src = x->plane[0].src.buf;
+
+    for (int row = 0; row < bh; ++row) {
+      for (int col = 0; col < bw; ++col) {
+        wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+      }
+      wsrc_buf += bw;
+      src += x->plane[0].src.stride;
+    }
+  } else {
+    const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+    for (int row = 0; row < bh; ++row) {
+      for (int col = 0; col < bw; ++col) {
+        wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+      }
+      wsrc_buf += bw;
+      src += x->plane[0].src.stride;
+    }
+  }
+}
+
+/* Use standard 3x3 Sobel matrix. Macro so it can be used for either high or
+   low bit-depth arrays. */
+#define SOBEL_X(src, stride, i, j)                       \
+  ((src)[((i)-1) + (stride) * ((j)-1)] -                 \
+   (src)[((i) + 1) + (stride) * ((j)-1)] +  /* NOLINT */ \
+   2 * (src)[((i)-1) + (stride) * (j)] -    /* NOLINT */ \
+   2 * (src)[((i) + 1) + (stride) * (j)] +  /* NOLINT */ \
+   (src)[((i)-1) + (stride) * ((j) + 1)] -  /* NOLINT */ \
+   (src)[((i) + 1) + (stride) * ((j) + 1)]) /* NOLINT */
+#define SOBEL_Y(src, stride, i, j)                       \
+  ((src)[((i)-1) + (stride) * ((j)-1)] +                 \
+   2 * (src)[(i) + (stride) * ((j)-1)] +    /* NOLINT */ \
+   (src)[((i) + 1) + (stride) * ((j)-1)] -  /* NOLINT */ \
+   (src)[((i)-1) + (stride) * ((j) + 1)] -  /* NOLINT */ \
+   2 * (src)[(i) + (stride) * ((j) + 1)] -  /* NOLINT */ \
+   (src)[((i) + 1) + (stride) * ((j) + 1)]) /* NOLINT */
+
+sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j,
+                   bool high_bd) {
+  int16_t s_x;
+  int16_t s_y;
+  if (high_bd) {
+    const uint16_t *src = CONVERT_TO_SHORTPTR(input);
+    s_x = SOBEL_X(src, stride, i, j);
+    s_y = SOBEL_Y(src, stride, i, j);
+  } else {
+    s_x = SOBEL_X(input, stride, i, j);
+    s_y = SOBEL_Y(input, stride, i, j);
+  }
+  sobel_xy r = { .x = s_x, .y = s_y };
+  return r;
+}
+
+// 8-tap Gaussian convolution filter with sigma = 1.3; sums to 128, and
+// all coefficients must be even.
+DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 2, 12, 30, 40,
+                                                               30, 12, 2, 0 };
+
+void av1_gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
+                       uint8_t *dst, bool high_bd, int bd) {
+  ConvolveParams conv_params = get_conv_params(0, 0, bd);
+  InterpFilterParams filter = { .filter_ptr = gauss_filter,
+                                .taps = 8,
+                                .subpel_shifts = 0,
+                                .interp_filter = EIGHTTAP_REGULAR };
+  // Requirements from the vector-optimized implementations.
+  assert(h % 4 == 0);
+  assert(w % 8 == 0);
+  // Because we use an eight-tap filter, the stride should be at least 7 + w.
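+  // (An 8-tap horizontal pass reads up to taps - 1 = 7 samples beyond the
+  // output width on each row, hence the w + 7 requirement below. Note the
+  // gauss_filter taps above sum to 2 + 12 + 30 + 40 + 30 + 12 + 2 = 128.)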
+ assert(src_stride >= w + 7); +#if CONFIG_AV1_HIGHBITDEPTH + if (high_bd) { + av1_highbd_convolve_2d_sr(CONVERT_TO_SHORTPTR(src), src_stride, + CONVERT_TO_SHORTPTR(dst), w, w, h, &filter, + &filter, 0, 0, &conv_params, bd); + } else { + av1_convolve_2d_sr(src, src_stride, dst, w, w, h, &filter, &filter, 0, 0, + &conv_params); + } +#else + (void)high_bd; + av1_convolve_2d_sr(src, src_stride, dst, w, w, h, &filter, &filter, 0, 0, + &conv_params); +#endif +} + +static EdgeInfo edge_probability(const uint8_t *input, int w, int h, + bool high_bd, int bd) { + // The probability of an edge in the whole image is the same as the highest + // probability of an edge for any individual pixel. Use Sobel as the metric + // for finding an edge. + uint16_t highest = 0; + uint16_t highest_x = 0; + uint16_t highest_y = 0; + // Ignore the 1 pixel border around the image for the computation. + for (int j = 1; j < h - 1; ++j) { + for (int i = 1; i < w - 1; ++i) { + sobel_xy g = av1_sobel(input, w, i, j, high_bd); + // Scale down to 8-bit to get same output regardless of bit depth. + int16_t g_x = g.x >> (bd - 8); + int16_t g_y = g.y >> (bd - 8); + uint16_t magnitude = (uint16_t)sqrt(g_x * g_x + g_y * g_y); + highest = AOMMAX(highest, magnitude); + highest_x = AOMMAX(highest_x, g_x); + highest_y = AOMMAX(highest_y, g_y); + } + } + EdgeInfo ei = { .magnitude = highest, .x = highest_x, .y = highest_y }; + return ei; +} + +/* Uses most of the Canny edge detection algorithm to find if there are any + * edges in the image. + */ +EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h, + bool high_bd, int bd) { + if (w < 3 || h < 3) { + EdgeInfo n = { .magnitude = 0, .x = 0, .y = 0 }; + return n; + } + uint8_t *blurred; + if (high_bd) { + blurred = CONVERT_TO_BYTEPTR(aom_memalign(32, sizeof(uint16_t) * w * h)); + } else { + blurred = (uint8_t *)aom_memalign(32, sizeof(uint8_t) * w * h); + } + av1_gaussian_blur(src, src_stride, w, h, blurred, high_bd, bd); + // Skip the non-maximum suppression step in Canny edge detection. We just + // want a probability of an edge existing in the buffer, which is determined + // by the strongest edge in it -- we don't need to eliminate the weaker + // edges. Use Sobel for the edge detection. + EdgeInfo prob = edge_probability(blurred, w, h, high_bd, bd); + if (high_bd) { + aom_free(CONVERT_TO_SHORTPTR(blurred)); + } else { + aom_free(blurred); + } + return prob; +} diff --git a/libs/libaom/src/av1/encoder/rdopt.h b/libs/libaom/src/av1/encoder/rdopt.h new file mode 100644 index 000000000..c7c99ac4b --- /dev/null +++ b/libs/libaom/src/av1/encoder/rdopt.h @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_H_
+#define AOM_AV1_ENCODER_RDOPT_H_
+
+#include <stdbool.h>
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define COMP_TYPE_RD_THRESH_SCALE 11
+#define COMP_TYPE_RD_THRESH_SHIFT 4
+#define MAX_WINNER_MOTION_MODES 10
+
+struct TileInfo;
+struct macroblock;
+struct RD_STATS;
+
+// Returns the number of colors in 'src'.
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+                     int *val_count);
+// Same as av1_count_colors(), but for high-bitdepth mode.
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+                            int cols, int bit_depth, int *val_count);
+
+static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx,
+                                    int plane, TX_SIZE tx_size) {
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const LV_MAP_COEFF_COST *const coeff_costs =
+      &x->coeff_costs[txs_ctx][plane_type];
+  return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+}
+
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+                               struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+
+unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+                                           const struct buf_2d *ref,
+                                           BLOCK_SIZE bs);
+unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+                                                const struct buf_2d *ref,
+                                                BLOCK_SIZE bs, int bd);
+
+void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+                               struct TileDataEnc *tile_data,
+                               struct macroblock *x, struct RD_STATS *rd_cost,
+                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                               int64_t best_rd_so_far);
+
+void av1_pick_intra_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_cost,
+                         BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+
+void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+                                  struct TileDataEnc *tile_data,
+                                  struct macroblock *x,
+                                  struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                  PICK_MODE_CONTEXT *ctx,
+                                  int64_t best_rd_so_far);
+
+void av1_rd_pick_inter_mode_sb_seg_skip(
+    const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+    struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
+    BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+// The best edge strength seen in the block, as well as the best x and y
+// components of edge strength seen.
+typedef struct {
+  uint16_t magnitude;
+  uint16_t x;
+  uint16_t y;
+} EdgeInfo;
+
+/** Returns an integer indicating the strength of the edge.
+ * 0 means no edge found, 556 is the strength of a solid black/white edge,
+ * and the number may range higher if the signal is even stronger (e.g., on a
+ * corner). high_bd is a bool indicating the source should be treated
+ * as a 16-bit array. bd is the bit depth.
+ */
+EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
+                         bool high_bd, int bd);
+
+/** Applies a Gaussian blur with sigma = 1.3. Used by av1_edge_exists and
+ * tests.
+ */
+void av1_gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
+                       uint8_t *dst, bool high_bd, int bd);
+
+/* Applies standard 3x3 Sobel matrix.
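+ *
+ * As implemented by the SOBEL_X / SOBEL_Y macros in rdopt.c, the kernels are:
+ *
+ *        [ 1  0 -1 ]        [  1   2   1 ]
+ *   Gx = [ 2  0 -2 ],  Gy = [  0   0   0 ]
+ *        [ 1  0 -1 ]        [ -1  -2  -1 ]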
+ */
+typedef struct {
+  int16_t x;
+  int16_t y;
+} sobel_xy;
+
+sobel_xy av1_sobel(const uint8_t *input, int stride, int i, int j,
+                   bool high_bd);
+
+void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE int coded_to_superres_mi(int mi_col, int denom) {
+  return (mi_col * denom + SCALE_NUMERATOR / 2) / SCALE_NUMERATOR;
+}
+#endif
+
+static INLINE int av1_encoder_get_relative_dist(const OrderHintInfo *oh, int a,
+                                                int b) {
+  if (!oh->enable_order_hint) return 0;
+
+  assert(a >= 0 && b >= 0);
+  return (a - b);
+}
+
+// This function will return the number of mi's in a superblock.
+static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const cm) {
+  const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize];
+  int sb_mi_rows =
+      (mi_size_wide[cm->seq_params.sb_size] + mi_alloc_size_1d - 1) /
+      mi_alloc_size_1d;
+  assert(mi_size_wide[cm->seq_params.sb_size] ==
+         mi_size_high[cm->seq_params.sb_size]);
+  int sb_mi_size = sb_mi_rows * sb_mi_rows;
+
+  return sb_mi_size;
+}
+
+// This function will copy usable ref_mv_stack[ref_frame][4] and
+// weight[ref_frame][4] information from ref_mv_stack[ref_frame][8] and
+// weight[ref_frame][8].
+static INLINE void av1_copy_usable_ref_mv_stack_and_weight(
+    const MACROBLOCKD *xd, MB_MODE_INFO_EXT *const mbmi_ext,
+    MV_REFERENCE_FRAME ref_frame) {
+  memcpy(mbmi_ext->weight[ref_frame], xd->weight[ref_frame],
+         USABLE_REF_MV_STACK_SIZE * sizeof(xd->weight[0][0]));
+  memcpy(mbmi_ext->ref_mv_stack[ref_frame], xd->ref_mv_stack[ref_frame],
+         USABLE_REF_MV_STACK_SIZE * sizeof(xd->ref_mv_stack[0][0]));
+}
+
+// This function prunes the mode if either of its reference frames falls in
+// the pruning list.
+static INLINE int prune_ref(const MV_REFERENCE_FRAME *const ref_frame,
+                            const OrderHintInfo *const order_hint_info,
+                            const unsigned int *const ref_display_order_hint,
+                            const unsigned int frame_display_order_hint,
+                            const int *ref_frame_list) {
+  for (int i = 0; i < 2; i++) {
+    if (ref_frame_list[i] == NONE_FRAME) continue;
+
+    if (ref_frame[0] == ref_frame_list[i] ||
+        ref_frame[1] == ref_frame_list[i]) {
+      if (av1_encoder_get_relative_dist(
+              order_hint_info,
+              ref_display_order_hint[ref_frame_list[i] - LAST_FRAME],
+              frame_display_order_hint) < 0)
+        return 1;
+    }
+  }
+  return 0;
+}
+
+static INLINE int prune_ref_by_selective_ref_frame(
+    const AV1_COMP *const cpi, const MACROBLOCK *const x,
+    const MV_REFERENCE_FRAME *const ref_frame,
+    const unsigned int *const ref_display_order_hint) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  if (!sf->inter_sf.selective_ref_frame) return 0;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+  const int comp_pred = ref_frame[1] > INTRA_FRAME;
+
+  if (sf->inter_sf.selective_ref_frame >= 2 ||
+      (sf->inter_sf.selective_ref_frame == 1 && comp_pred)) {
+    int ref_frame_list[2] = { LAST3_FRAME, LAST2_FRAME };
+
+    if (x != NULL) {
+      if (x->search_ref_frame[LAST3_FRAME]) ref_frame_list[0] = NONE_FRAME;
+      if (x->search_ref_frame[LAST2_FRAME]) ref_frame_list[1] = NONE_FRAME;
+    }
+
+    if (prune_ref(ref_frame, order_hint_info, ref_display_order_hint,
+                  ref_display_order_hint[GOLDEN_FRAME - LAST_FRAME],
+                  ref_frame_list))
+      return 1;
+  }
+
+  if (sf->inter_sf.selective_ref_frame >= 3) {
+    int ref_frame_list[2] = { ALTREF2_FRAME, BWDREF_FRAME };
+
+    if (x != NULL) {
+      if (x->search_ref_frame[ALTREF2_FRAME]) ref_frame_list[0] =
NONE_FRAME; + if (x->search_ref_frame[BWDREF_FRAME]) ref_frame_list[1] = NONE_FRAME; + } + + if (prune_ref(ref_frame, order_hint_info, ref_display_order_hint, + ref_display_order_hint[LAST_FRAME - LAST_FRAME], + ref_frame_list)) + return 1; + } + + return 0; +} + +// This function will copy the best reference mode information from +// MB_MODE_INFO_EXT to MB_MODE_INFO_EXT_FRAME. +static INLINE void av1_copy_mbmi_ext_to_mbmi_ext_frame( + MB_MODE_INFO_EXT_FRAME *mbmi_ext_best, + const MB_MODE_INFO_EXT *const mbmi_ext, uint8_t ref_frame_type) { + memcpy(mbmi_ext_best->ref_mv_stack, mbmi_ext->ref_mv_stack[ref_frame_type], + sizeof(mbmi_ext->ref_mv_stack[USABLE_REF_MV_STACK_SIZE])); + memcpy(mbmi_ext_best->weight, mbmi_ext->weight[ref_frame_type], + sizeof(mbmi_ext->weight[USABLE_REF_MV_STACK_SIZE])); + mbmi_ext_best->mode_context = mbmi_ext->mode_context[ref_frame_type]; + mbmi_ext_best->ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type]; + memcpy(mbmi_ext_best->global_mvs, mbmi_ext->global_mvs, + sizeof(mbmi_ext->global_mvs)); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RDOPT_H_ diff --git a/libs/libaom/src/av1/encoder/rdopt_data_defs.h b/libs/libaom/src/av1/encoder/rdopt_data_defs.h new file mode 100644 index 000000000..ca7ef810f --- /dev/null +++ b/libs/libaom/src/av1/encoder/rdopt_data_defs.h @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ +#define AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +static const THR_MODES intra_to_mode_idx[INTRA_MODE_NUM] = { + THR_DC, // DC_PRED, + THR_V_PRED, // V_PRED, + THR_H_PRED, // H_PRED, + THR_D45_PRED, // D45_PRED, + THR_D135_PRED, // D135_PRED, + THR_D113_PRED, // D113_PRED, + THR_D157_PRED, // D157_PRED, + THR_D203_PRED, // D203_PRED, + THR_D67_PRED, // D67_PRED, + THR_SMOOTH, // SMOOTH_PRED, + THR_SMOOTH_V, // SMOOTH_V_PRED, + THR_SMOOTH_H, // SMOOTH_H_PRED, + THR_PAETH, // PAETH_PRED, +}; + +/* clang-format off */ +static const THR_MODES single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM] + [REF_FRAMES] = { + // NEARESTMV, + { THR_INVALID, THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3, + THR_NEARESTG, THR_NEARESTB, THR_NEARESTA2, THR_NEARESTA, }, + // NEARMV, + { THR_INVALID, THR_NEARMV, THR_NEARL2, THR_NEARL3, + THR_NEARG, THR_NEARB, THR_NEARA2, THR_NEARA, }, + // GLOBALMV, + { THR_INVALID, THR_GLOBALMV, THR_GLOBALL2, THR_GLOBALL3, + THR_GLOBALG, THR_GLOBALB, THR_GLOBALA2, THR_GLOBALA, }, + // NEWMV, + { THR_INVALID, THR_NEWMV, THR_NEWL2, THR_NEWL3, + THR_NEWG, THR_NEWB, THR_NEWA2, THR_NEWA, }, +}; +/* clang-format on */ + +/* clang-format off */ +static const THR_MODES comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES] + [REF_FRAMES] = { + // NEAREST_NEARESTMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAREST_NEARESTLL2, THR_COMP_NEAREST_NEARESTLL3, + THR_COMP_NEAREST_NEARESTLG, THR_COMP_NEAREST_NEARESTLB, + THR_COMP_NEAREST_NEARESTLA2, THR_COMP_NEAREST_NEARESTLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTL2B, + THR_COMP_NEAREST_NEARESTL2A2, THR_COMP_NEAREST_NEARESTL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTL3B, + THR_COMP_NEAREST_NEARESTL3A2, THR_COMP_NEAREST_NEARESTL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTGB, + THR_COMP_NEAREST_NEARESTGA2, THR_COMP_NEAREST_NEARESTGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEARESTBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEAR_NEARMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAR_NEARLL2, THR_COMP_NEAR_NEARLL3, + THR_COMP_NEAR_NEARLG, THR_COMP_NEAR_NEARLB, + THR_COMP_NEAR_NEARLA2, THR_COMP_NEAR_NEARLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARL2B, + THR_COMP_NEAR_NEARL2A2, THR_COMP_NEAR_NEARL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARL3B, + THR_COMP_NEAR_NEARL3A2, THR_COMP_NEAR_NEARL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARGB, + THR_COMP_NEAR_NEARGA2, THR_COMP_NEAR_NEARGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEARBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { 
THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEAREST_NEWMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAREST_NEWLL2, THR_COMP_NEAREST_NEWLL3, + THR_COMP_NEAREST_NEWLG, THR_COMP_NEAREST_NEWLB, + THR_COMP_NEAREST_NEWLA2, THR_COMP_NEAREST_NEWLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWL2B, + THR_COMP_NEAREST_NEWL2A2, THR_COMP_NEAREST_NEWL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWL3B, + THR_COMP_NEAREST_NEWL3A2, THR_COMP_NEAREST_NEWL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWGB, + THR_COMP_NEAREST_NEWGA2, THR_COMP_NEAREST_NEWGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAREST_NEWBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEW_NEARESTMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEW_NEARESTLL2, THR_COMP_NEW_NEARESTLL3, + THR_COMP_NEW_NEARESTLG, THR_COMP_NEW_NEARESTLB, + THR_COMP_NEW_NEARESTLA2, THR_COMP_NEW_NEARESTLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTL2B, + THR_COMP_NEW_NEARESTL2A2, THR_COMP_NEW_NEARESTL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTL3B, + THR_COMP_NEW_NEARESTL3A2, THR_COMP_NEW_NEARESTL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTGB, + THR_COMP_NEW_NEARESTGA2, THR_COMP_NEW_NEARESTGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARESTBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEAR_NEWMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEAR_NEWLL2, THR_COMP_NEAR_NEWLL3, + THR_COMP_NEAR_NEWLG, THR_COMP_NEAR_NEWLB, + THR_COMP_NEAR_NEWLA2, THR_COMP_NEAR_NEWLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWL2B, + THR_COMP_NEAR_NEWL2A2, THR_COMP_NEAR_NEWL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWL3B, + THR_COMP_NEAR_NEWL3A2, THR_COMP_NEAR_NEWL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWGB, + THR_COMP_NEAR_NEWGA2, THR_COMP_NEAR_NEWGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEAR_NEWBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEW_NEARMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, 
THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEW_NEARLL2, THR_COMP_NEW_NEARLL3, + THR_COMP_NEW_NEARLG, THR_COMP_NEW_NEARLB, + THR_COMP_NEW_NEARLA2, THR_COMP_NEW_NEARLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARL2B, + THR_COMP_NEW_NEARL2A2, THR_COMP_NEW_NEARL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARL3B, + THR_COMP_NEW_NEARL3A2, THR_COMP_NEW_NEARL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARGB, + THR_COMP_NEW_NEARGA2, THR_COMP_NEW_NEARGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEARBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // GLOBAL_GLOBALMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_GLOBAL_GLOBALLL2, THR_COMP_GLOBAL_GLOBALLL3, + THR_COMP_GLOBAL_GLOBALLG, THR_COMP_GLOBAL_GLOBALLB, + THR_COMP_GLOBAL_GLOBALLA2, THR_COMP_GLOBAL_GLOBALLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALL2B, + THR_COMP_GLOBAL_GLOBALL2A2, THR_COMP_GLOBAL_GLOBALL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALL3B, + THR_COMP_GLOBAL_GLOBALL3A2, THR_COMP_GLOBAL_GLOBALL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALGB, + THR_COMP_GLOBAL_GLOBALGA2, THR_COMP_GLOBAL_GLOBALGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_GLOBAL_GLOBALBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, + // NEW_NEWMV, + { + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, + THR_COMP_NEW_NEWLL2, THR_COMP_NEW_NEWLL3, + THR_COMP_NEW_NEWLG, THR_COMP_NEW_NEWLB, + THR_COMP_NEW_NEWLA2, THR_COMP_NEW_NEWLA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWL2B, + THR_COMP_NEW_NEWL2A2, THR_COMP_NEW_NEWL2A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWL3B, + THR_COMP_NEW_NEWL3A2, THR_COMP_NEW_NEWL3A, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWGB, + THR_COMP_NEW_NEWGA2, THR_COMP_NEW_NEWGA, }, + { THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, + THR_INVALID, THR_COMP_NEW_NEWBA, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + { THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, THR_INVALID, + THR_INVALID, THR_INVALID, THR_INVALID, }, + }, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RDOPT_DATA_DEFS_H_ diff --git a/libs/libaom/src/av1/encoder/rdopt_utils.h b/libs/libaom/src/av1/encoder/rdopt_utils.h new file mode 100644 index 000000000..53b410a22 --- /dev/null +++ 
b/libs/libaom/src/av1/encoder/rdopt_utils.h
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_UTILS_H_
+#define AOM_AV1_ENCODER_RDOPT_UTILS_H_
+
+#include "aom/aom_integer.h"
+#include "av1/encoder/block.h"
+#include "av1/common/cfl.h"
+#include "av1/common/pred_common.h"
+#include "av1/encoder/rdopt_data_defs.h"
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+#define MAX_REF_MV_SEARCH 3
+#define INTER_INTRA_RD_THRESH_SCALE 9
+#define INTER_INTRA_RD_THRESH_SHIFT 4
+
+typedef struct {
+  PREDICTION_MODE mode;
+  MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+// This array defines the mapping from the enums in THR_MODES to the actual
+// prediction modes and reference frames
+static const MODE_DEFINITION av1_mode_defs[MAX_MODES] = {
+  { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
+  { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
+  { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
+  { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
+  { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  { NEWMV, { LAST_FRAME, NONE_FRAME } },
+  { NEWMV, { LAST2_FRAME, NONE_FRAME } },
+  { NEWMV, { LAST3_FRAME, NONE_FRAME } },
+  { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
+  { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  { NEARMV, { LAST_FRAME, NONE_FRAME } },
+  { NEARMV, { LAST2_FRAME, NONE_FRAME } },
+  { NEARMV, { LAST3_FRAME, NONE_FRAME } },
+  { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+  { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
+  { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  { GLOBALMV, { LAST_FRAME, NONE_FRAME } },
+  { GLOBALMV, { LAST2_FRAME, NONE_FRAME } },
+  { GLOBALMV, { LAST3_FRAME, NONE_FRAME } },
+  { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } },
+  { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } },
+  { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } },
+  { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+  // TODO(zoeliu): May need to reconsider the order of the modes to check
+
+  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+  { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+  { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+  { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+  { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+  { NEAR_NEARMV, { LAST_FRAME, 
ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { 
LAST3_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } }, + + { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } }, + { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } }, + + { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } }, + { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } }, + + // intra modes + { DC_PRED, { INTRA_FRAME, NONE_FRAME } }, + { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { H_PRED, { INTRA_FRAME, NONE_FRAME } }, + { V_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D135_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D203_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D157_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D67_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D113_PRED, { INTRA_FRAME, NONE_FRAME } }, + { D45_PRED, { INTRA_FRAME, NONE_FRAME } }, +}; + +static AOM_INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst, + const int num_planes) { + for (int i = 0; i < num_planes; i++) { + xd->plane[i].dst.buf = dst.plane[i]; + xd->plane[i].dst.stride = dst.stride[i]; + } +} + +/* clang-format on */ +// Calculate rd threshold based on ref best rd and relevant scaling factors +static AOM_INLINE int64_t get_rd_thresh_from_best_rd(int64_t ref_best_rd, + int mul_factor, + int div_factor) { + int64_t rd_thresh = ref_best_rd; + if (div_factor != 0) { + rd_thresh = ref_best_rd < (div_factor * (INT64_MAX / mul_factor)) + ? 
((ref_best_rd / div_factor) * mul_factor) + : INT64_MAX; + } + return rd_thresh; +} + +static AOM_INLINE THR_MODES +get_prediction_mode_idx(PREDICTION_MODE this_mode, MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME second_ref_frame) { + if (this_mode < INTRA_MODE_END) { + assert(ref_frame == INTRA_FRAME); + assert(second_ref_frame == NONE_FRAME); + return intra_to_mode_idx[this_mode - INTRA_MODE_START]; + } + if (this_mode >= SINGLE_INTER_MODE_START && + this_mode < SINGLE_INTER_MODE_END) { + assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); + return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START] + [ref_frame]; + } + if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END) { + assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME)); + assert((second_ref_frame > INTRA_FRAME) && + (second_ref_frame <= ALTREF_FRAME)); + return comp_inter_to_mode_idx[this_mode - COMP_INTER_MODE_START][ref_frame] + [second_ref_frame]; + } + assert(0); + return THR_INVALID; +} + +static AOM_INLINE int inter_mode_data_block_idx(BLOCK_SIZE bsize) { + if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_4X16 || bsize == BLOCK_16X4) { + return -1; + } + return 1; +} + +// Get transform block visible dimensions cropped to the MI units. +static AOM_INLINE void get_txb_dimensions(const MACROBLOCKD *xd, int plane, + BLOCK_SIZE plane_bsize, int blk_row, + int blk_col, BLOCK_SIZE tx_bsize, + int *width, int *height, + int *visible_width, + int *visible_height) { + assert(tx_bsize <= plane_bsize); + const int txb_height = block_size_high[tx_bsize]; + const int txb_width = block_size_wide[tx_bsize]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + + // TODO(aconverse@google.com): Investigate using crop_width/height here rather + // than the MI size + if (xd->mb_to_bottom_edge >= 0) { + *visible_height = txb_height; + } else { + const int block_height = block_size_high[plane_bsize]; + const int block_rows = + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + block_height; + *visible_height = + clamp(block_rows - (blk_row << MI_SIZE_LOG2), 0, txb_height); + } + if (height) *height = txb_height; + + if (xd->mb_to_right_edge >= 0) { + *visible_width = txb_width; + } else { + const int block_width = block_size_wide[plane_bsize]; + const int block_cols = + (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + block_width; + *visible_width = + clamp(block_cols - (blk_col << MI_SIZE_LOG2), 0, txb_width); + } + if (width) *width = txb_width; +} + +static AOM_INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) { + int num_blk = 1 << (num_pels_log2_lookup[bsize] - 2 * MI_SIZE_LOG2); + return num_blk; +} + +static INLINE int check_txfm_eval(MACROBLOCK *const x, BLOCK_SIZE bsize, + int64_t best_skip_rd, int64_t skip_rd, + int level, int is_luma_only) { + int eval_txfm = 1; + // Derive aggressiveness factor for gating the transform search + // Lower value indicates more aggressiveness. 
Be more conservative (high
+  // value) for (i) low quantizers and (ii) regions where prediction is poor.
+  const int scale[5] = { INT_MAX, 4, 3, 3, 2 };
+  const int qslope = 2 * (!is_luma_only);
+  int aggr_factor = 1;
+  if (!is_luma_only) {
+    aggr_factor = AOMMAX(
+        1, ((MAXQ - x->qindex) * qslope + QINDEX_RANGE / 2) >> QINDEX_BITS);
+  }
+  if (best_skip_rd >
+      (x->source_variance << (num_pels_log2_lookup[bsize] + RDDIV_BITS)))
+    aggr_factor *= scale[level];
+  // For level setting 1, be more conservative for luma only case even when
+  // prediction is good
+  else if ((level <= 1) && !is_luma_only)
+    aggr_factor *= 2;
+
+  // Be more conservative for luma-only cases (called from compound type rd):
+  // best_skip_rd is computed after the interpolation filter search, while
+  // skip_rd is computed before it (with 8-bit prediction signals blended for
+  // WEDGE/DIFFWTD rather than 16-bit).
+  const int luma_mul[5] = { INT_MAX, 32, 29, 20, 17 };
+  int mul_factor = is_luma_only ? luma_mul[level] : 16;
+  int64_t rd_thresh =
+      (best_skip_rd == INT64_MAX)
+          ? best_skip_rd
+          : (int64_t)(best_skip_rd * aggr_factor * mul_factor >> 4);
+  if (skip_rd > rd_thresh) eval_txfm = 0;
+  return eval_txfm;
+}
+
+static TX_MODE select_tx_mode(
+    const AV1_COMMON *cm, const TX_SIZE_SEARCH_METHOD tx_size_search_method) {
+  if (cm->features.coded_lossless) return ONLY_4X4;
+  if (tx_size_search_method == USE_LARGESTALL) {
+    return TX_MODE_LARGEST;
+  } else {
+    assert(tx_size_search_method == USE_FULL_RD ||
+           tx_size_search_method == USE_FAST_RD);
+    return TX_MODE_SELECT;
+  }
+}
+// Checks the conditions to enable winner mode processing
+static INLINE int is_winner_mode_processing_enabled(
+    const struct AV1_COMP *cpi, MB_MODE_INFO *const mbmi,
+    const PREDICTION_MODE best_mode) {
+  const SPEED_FEATURES *sf = &cpi->sf;
+
+  // TODO(any): Move block independent condition checks to frame level
+  if (is_inter_block(mbmi)) {
+    if (is_inter_mode(best_mode) &&
+        sf->tx_sf.tx_type_search.fast_inter_tx_type_search &&
+        !cpi->oxcf.use_inter_dct_only)
+      return 1;
+  } else {
+    if (sf->tx_sf.tx_type_search.fast_intra_tx_type_search &&
+        !cpi->oxcf.use_intra_default_tx_only && !cpi->oxcf.use_intra_dct_only)
+      return 1;
+  }
+
+  // Check speed feature related to winner mode processing
+  if (sf->winner_mode_sf.enable_winner_mode_for_coeff_opt &&
+      cpi->optimize_seg_arr[mbmi->segment_id] != NO_TRELLIS_OPT &&
+      cpi->optimize_seg_arr[mbmi->segment_id] != FINAL_PASS_TRELLIS_OPT)
+    return 1;
+  if (sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch) return 1;
+
+  return 0;
+}
+
+static INLINE void set_tx_size_search_method(
+    const AV1_COMMON *cm, const WinnerModeParams *winner_mode_params,
+    MACROBLOCK *x, int enable_winner_mode_for_tx_size_srch,
+    int is_winner_mode) {
+  // Populate transform size search method/transform mode appropriately
+  x->tx_size_search_method =
+      winner_mode_params->tx_size_search_methods[DEFAULT_EVAL];
+  if (enable_winner_mode_for_tx_size_srch) {
+    if (is_winner_mode)
+      x->tx_size_search_method =
+          winner_mode_params->tx_size_search_methods[WINNER_MODE_EVAL];
+    else
+      x->tx_size_search_method =
+          winner_mode_params->tx_size_search_methods[MODE_EVAL];
+  }
+  x->tx_mode_search_type = select_tx_mode(cm, x->tx_size_search_method);
+}
+
+static INLINE void set_tx_type_prune(const SPEED_FEATURES *sf, MACROBLOCK *x,
+                                     int enable_winner_mode_tx_type_pruning,
+                                     int is_winner_mode) {
+  // Populate prune transform mode appropriately
+  x->prune_mode = sf->tx_sf.tx_type_search.prune_mode;
+  if (enable_winner_mode_tx_type_pruning) {
+    if 
(is_winner_mode) + x->prune_mode = NO_PRUNE; + else + x->prune_mode = PRUNE_2D_AGGRESSIVE; + } +} + +static INLINE void set_tx_domain_dist_params( + const WinnerModeParams *winner_mode_params, MACROBLOCK *x, + int enable_winner_mode_for_tx_domain_dist, int is_winner_mode) { + if (!enable_winner_mode_for_tx_domain_dist) { + x->use_transform_domain_distortion = + winner_mode_params->use_transform_domain_distortion[DEFAULT_EVAL]; + x->tx_domain_dist_threshold = + winner_mode_params->tx_domain_dist_threshold[DEFAULT_EVAL]; + return; + } + + if (is_winner_mode) { + x->use_transform_domain_distortion = + winner_mode_params->use_transform_domain_distortion[WINNER_MODE_EVAL]; + x->tx_domain_dist_threshold = + winner_mode_params->tx_domain_dist_threshold[WINNER_MODE_EVAL]; + } else { + x->use_transform_domain_distortion = + winner_mode_params->use_transform_domain_distortion[MODE_EVAL]; + x->tx_domain_dist_threshold = + winner_mode_params->tx_domain_dist_threshold[MODE_EVAL]; + } +} + +// This function sets mode parameters for different mode evaluation stages +static INLINE void set_mode_eval_params(const struct AV1_COMP *cpi, + MACROBLOCK *x, + MODE_EVAL_TYPE mode_eval_type) { + const AV1_COMMON *cm = &cpi->common; + const SPEED_FEATURES *sf = &cpi->sf; + const WinnerModeParams *winner_mode_params = &cpi->winner_mode_params; + + switch (mode_eval_type) { + case DEFAULT_EVAL: + x->use_default_inter_tx_type = 0; + x->use_default_intra_tx_type = 0; + x->predict_skip_level = + winner_mode_params->predict_skip_level[DEFAULT_EVAL]; + // Set default transform domain distortion type + set_tx_domain_dist_params(winner_mode_params, x, 0, 0); + + // Get default threshold for R-D optimization of coefficients + x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh( + winner_mode_params->coeff_opt_dist_threshold, 0, 0); + // Set default transform size search method + set_tx_size_search_method(cm, winner_mode_params, x, 0, 0); + // Set default transform type prune + set_tx_type_prune(sf, x, 0, 0); + break; + case MODE_EVAL: + x->use_default_intra_tx_type = + (cpi->sf.tx_sf.tx_type_search.fast_intra_tx_type_search || + cpi->oxcf.use_intra_default_tx_only); + x->use_default_inter_tx_type = + cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_search; + x->predict_skip_level = winner_mode_params->predict_skip_level[MODE_EVAL]; + + // Set transform domain distortion type for mode evaluation + set_tx_domain_dist_params( + winner_mode_params, x, + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 0); + + // Get threshold for R-D optimization of coefficients during mode + // evaluation + x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh( + winner_mode_params->coeff_opt_dist_threshold, + sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 0); + // Set the transform size search method for mode evaluation + set_tx_size_search_method( + cm, winner_mode_params, x, + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 0); + // Set transform type prune for mode evaluation + set_tx_type_prune( + sf, x, sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning, + 0); + break; + case WINNER_MODE_EVAL: + x->use_default_inter_tx_type = 0; + x->use_default_intra_tx_type = 0; + x->predict_skip_level = + winner_mode_params->predict_skip_level[WINNER_MODE_EVAL]; + + // Set transform domain distortion type for winner mode evaluation + set_tx_domain_dist_params( + winner_mode_params, x, + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist, 1); + + // Get threshold for R-D optimization of coefficients for 
winner mode
+      // evaluation
+      x->coeff_opt_dist_threshold = get_rd_opt_coeff_thresh(
+          winner_mode_params->coeff_opt_dist_threshold,
+          sf->winner_mode_sf.enable_winner_mode_for_coeff_opt, 1);
+      // Set the transform size search method for winner mode evaluation
+      set_tx_size_search_method(
+          cm, winner_mode_params, x,
+          sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch, 1);
+      // Set default transform type prune mode for winner mode evaluation
+      set_tx_type_prune(
+          sf, x, sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning,
+          1);
+
+      // Reset hash state for winner mode processing. Winner mode and
+      // subsequent transform/mode evaluations (palette/IntraBC) can't reuse
+      // old data as the decisions would have been sub-optimal
+      // TODO(any): Move the evaluation of palette/IntraBC modes before winner
+      // mode is processed and clean up the code below
+      reset_hash_records(x, cpi->sf.tx_sf.use_inter_txb_hash);
+
+      break;
+    default: assert(0);
+  }
+}
+
+// Similar to store_cfl_required(), but for use during the RDO process,
+// where we haven't yet determined whether this block uses CfL.
+static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm,
+                                                      const MACROBLOCK *x) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+
+  if (cm->seq_params.monochrome || x->skip_chroma_rd) return CFL_DISALLOWED;
+
+  if (!xd->is_chroma_ref) {
+    // For non-chroma-reference blocks, we should always store the luma pixels,
+    // in case the corresponding chroma-reference block uses CfL.
+    // Note that this can only happen for block sizes which are <8 on
+    // their shortest side, as otherwise they would be chroma reference
+    // blocks.
+    return CFL_ALLOWED;
+  }
+
+  // For chroma reference blocks, we should store data in the encoder iff we're
+  // allowed to try out CfL.
+  return is_cfl_allowed(xd);
+}
+
+static AOM_INLINE void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+}
+
+// Store best mode stats for winner mode processing
+static INLINE void store_winner_mode_stats(
+    const AV1_COMMON *const cm, MACROBLOCK *x, MB_MODE_INFO *mbmi,
+    RD_STATS *rd_cost, RD_STATS *rd_cost_y, RD_STATS *rd_cost_uv,
+    THR_MODES mode_index, uint8_t *color_map, BLOCK_SIZE bsize, int64_t this_rd,
+    int enable_multiwinner_mode_process, int txfm_search_done) {
+  WinnerModeStats *winner_mode_stats = x->winner_mode_stats;
+  int mode_idx = 0;
+  int is_palette_mode = mbmi->palette_mode_info.palette_size[PLANE_TYPE_Y] > 0;
+  // Mode stat is not required when multiwinner mode processing is disabled
+  if (!enable_multiwinner_mode_process) return;
+  // Ignore mode with maximum rd
+  if (this_rd == INT64_MAX) return;
+  // TODO(any): Winner mode processing is currently not applicable for palette
+  // mode in Inter frames. Clean up the following code once support is added
+  if (!frame_is_intra_only(cm) && is_palette_mode) return;
+
+  const int max_winner_mode_count = frame_is_intra_only(cm)
+                                        ? 
MAX_WINNER_MODE_COUNT_INTRA + : MAX_WINNER_MODE_COUNT_INTER; + assert(x->winner_mode_count >= 0 && + x->winner_mode_count <= max_winner_mode_count); + + if (x->winner_mode_count) { + // Find the mode which has higher rd cost than this_rd + for (mode_idx = 0; mode_idx < x->winner_mode_count; mode_idx++) + if (winner_mode_stats[mode_idx].rd > this_rd) break; + + if (mode_idx == max_winner_mode_count) { + // No mode has higher rd cost than this_rd + return; + } else if (mode_idx < max_winner_mode_count - 1) { + // Create a slot for current mode and move others to the next slot + memmove( + &winner_mode_stats[mode_idx + 1], &winner_mode_stats[mode_idx], + (max_winner_mode_count - mode_idx - 1) * sizeof(*winner_mode_stats)); + } + } + // Add a mode stat for winner mode processing + winner_mode_stats[mode_idx].mbmi = *mbmi; + winner_mode_stats[mode_idx].rd = this_rd; + winner_mode_stats[mode_idx].mode_index = mode_index; + + // Update rd stats required for inter frame + if (!frame_is_intra_only(cm) && rd_cost && rd_cost_y && rd_cost_uv) { + const MACROBLOCKD *xd = &x->e_mbd; + const int skip_ctx = av1_get_skip_context(xd); + const int is_intra_mode = av1_mode_defs[mode_index].mode < INTRA_MODE_END; + const int skip = mbmi->skip && !is_intra_mode; + + winner_mode_stats[mode_idx].rd_cost = *rd_cost; + if (txfm_search_done) { + winner_mode_stats[mode_idx].rate_y = + rd_cost_y->rate + x->skip_cost[skip_ctx][rd_cost->skip || skip]; + winner_mode_stats[mode_idx].rate_uv = rd_cost_uv->rate; + } + } + + if (color_map) { + // Store color_index_map for palette mode + const MACROBLOCKD *const xd = &x->e_mbd; + int block_width, block_height; + av1_get_block_dimensions(bsize, AOM_PLANE_Y, xd, &block_width, + &block_height, NULL, NULL); + memcpy(winner_mode_stats[mode_idx].color_index_map, color_map, + block_width * block_height * sizeof(color_map[0])); + } + + x->winner_mode_count = + AOMMIN(x->winner_mode_count + 1, max_winner_mode_count); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RDOPT_UTILS_H_ diff --git a/libs/libaom/src/av1/encoder/reconinter_enc.c b/libs/libaom/src/av1/encoder/reconinter_enc.c new file mode 100644 index 000000000..231b02091 --- /dev/null +++ b/libs/libaom/src/av1/encoder/reconinter_enc.c @@ -0,0 +1,407 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/reconinter_enc.h"
+
+static void enc_calc_subpel_params(const MV *const src_mv,
+                                   InterPredParams *const inter_pred_params,
+                                   MACROBLOCKD *xd, int mi_x, int mi_y, int ref,
+                                   uint8_t **pre, SubpelParams *subpel_params,
+                                   int *src_stride) {
+  // These are part of the function signature to use this function through a
+  // function pointer. See typedef of 'CalcSubpelParamsFunc'.
+  (void)xd;
+  (void)mi_x;
+  (void)mi_y;
+  (void)ref;
+
+  const struct scale_factors *sf = inter_pred_params->scale_factors;
+
+  struct buf_2d *pre_buf = &inter_pred_params->ref_frame_buf;
+  int ssx = inter_pred_params->subsampling_x;
+  int ssy = inter_pred_params->subsampling_y;
+  int orig_pos_y = inter_pred_params->pix_row << SUBPEL_BITS;
+  orig_pos_y += src_mv->row * (1 << (1 - ssy));
+  int orig_pos_x = inter_pred_params->pix_col << SUBPEL_BITS;
+  orig_pos_x += src_mv->col * (1 << (1 - ssx));
+  int pos_y = sf->scale_value_y(orig_pos_y, sf);
+  int pos_x = sf->scale_value_x(orig_pos_x, sf);
+  pos_x += SCALE_EXTRA_OFF;
+  pos_y += SCALE_EXTRA_OFF;
+
+  const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+  const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+  const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+  const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+  pos_y = clamp(pos_y, top, bottom);
+  pos_x = clamp(pos_x, left, right);
+
+  subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+  subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+  subpel_params->xs = sf->x_step_q4;
+  subpel_params->ys = sf->y_step_q4;
+  *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+         (pos_x >> SCALE_SUBPEL_BITS);
+  *src_stride = pre_buf->stride;
+}
+
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride,
+                                       const MV *src_mv,
+                                       InterPredParams *inter_pred_params) {
+  av1_build_one_inter_predictor(dst, dst_stride, src_mv, inter_pred_params,
+                                NULL /* xd */, 0 /* mi_x */, 0 /* mi_y */,
+                                0 /* ref */, enc_calc_subpel_params);
+}
+
+static void enc_build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                       int plane, const MB_MODE_INFO *mi,
+                                       int bw, int bh, int mi_x, int mi_y) {
+  av1_build_inter_predictors(cm, xd, plane, mi, 0 /* build_for_obmc */, bw, bh,
+                             mi_x, mi_y, enc_calc_subpel_params);
+}
+
+void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col) {
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+  InterPredParams inter_pred_params;
+
+  struct buf_2d *const dst_buf = &pd->dst;
+  uint8_t *const dst = dst_buf->buf;
+  const MV mv = xd->mi[0]->mv[0].as_mv;
+  const struct scale_factors *const sf = xd->block_ref_scale_factors[0];
+
+  av1_init_inter_params(&inter_pred_params, pd->width, pd->height, mi_y, mi_x,
+                        pd->subsampling_x, pd->subsampling_y, xd->bd,
+                        is_cur_buf_hbd(xd), false, sf, pd->pre,
+                        xd->mi[0]->interp_filters);
+
+  inter_pred_params.conv_params = get_conv_params_no_round(
+      0, AOM_PLANE_Y, xd->tmp_conv_dst, MAX_SB_SIZE, false, xd->bd);
+
+  inter_pred_params.conv_params.use_dist_wtd_comp_avg = 
0; + av1_enc_build_one_inter_predictor(dst, dst_buf->stride, &mv, + &inter_pred_params); +} + +void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + const BUFFER_SET *ctx, BLOCK_SIZE bsize, + int plane_from, int plane_to) { + for (int plane = plane_from; plane <= plane_to; ++plane) { + if (plane && !xd->is_chroma_ref) break; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + enc_build_inter_predictors(cm, xd, plane, xd->mi[0], xd->plane[plane].width, + xd->plane[plane].height, mi_x, mi_y); + + if (is_interintra_pred(xd->mi[0])) { + BUFFER_SET default_ctx = { + { xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf }, + { xd->plane[0].dst.stride, xd->plane[1].dst.stride, + xd->plane[2].dst.stride } + }; + if (!ctx) { + ctx = &default_ctx; + } + av1_build_interintra_predictor(cm, xd, xd->plane[plane].dst.buf, + xd->plane[plane].dst.stride, ctx, plane, + bsize); + } + } +} + +static INLINE void build_obmc_prediction(MACROBLOCKD *xd, int rel_mi_row, + int rel_mi_col, uint8_t op_mi_size, + int dir, MB_MODE_INFO *above_mbmi, + void *fun_ctxt, const int num_planes) { + struct build_prediction_ctxt *ctxt = (struct build_prediction_ctxt *)fun_ctxt; + av1_setup_address_for_obmc(xd, rel_mi_row, rel_mi_col, above_mbmi, ctxt, + num_planes); + + const int mi_x = (xd->mi_col + rel_mi_col) << MI_SIZE_LOG2; + const int mi_y = (xd->mi_row + rel_mi_row) << MI_SIZE_LOG2; + + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + + InterPredParams inter_pred_params; + + for (int j = 0; j < num_planes; ++j) { + const struct macroblockd_plane *pd = &xd->plane[j]; + int bw = 0, bh = 0; + + if (dir) { + // prepare left reference block size + bw = clamp(block_size_wide[bsize] >> (pd->subsampling_x + 1), 4, + block_size_wide[BLOCK_64X64] >> (pd->subsampling_x + 1)); + bh = (op_mi_size << MI_SIZE_LOG2) >> pd->subsampling_y; + } else { + // prepare above reference block size + bw = (op_mi_size * MI_SIZE) >> pd->subsampling_x; + bh = clamp(block_size_high[bsize] >> (pd->subsampling_y + 1), 4, + block_size_high[BLOCK_64X64] >> (pd->subsampling_y + 1)); + } + + if (av1_skip_u4x4_pred_in_obmc(bsize, pd, dir)) continue; + + const struct buf_2d *const pre_buf = &pd->pre[0]; + const MV mv = above_mbmi->mv[0].as_mv; + + av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, + pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0, + xd->block_ref_scale_factors[0], pre_buf, + above_mbmi->interp_filters); + inter_pred_params.conv_params = get_conv_params(0, j, xd->bd); + + av1_enc_build_one_inter_predictor(pd->dst.buf, pd->dst.stride, &mv, + &inter_pred_params); + } +} + +void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->up_available) return; + struct build_prediction_ctxt ctxt = { cm, tmp_buf, + tmp_width, tmp_height, + tmp_stride, xd->mb_to_right_edge }; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_above(cm, xd, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + build_obmc_prediction, &ctxt); +} + +void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]) { + if (!xd->left_available) return; + struct build_prediction_ctxt ctxt = { cm, tmp_buf, + 
tmp_width, tmp_height, + tmp_stride, xd->mb_to_bottom_edge }; + BLOCK_SIZE bsize = xd->mi[0]->sb_type; + foreach_overlappable_nb_left(cm, xd, + max_neighbor_obmc[mi_size_high_log2[bsize]], + build_obmc_prediction, &ctxt); +} + +void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd) { + const int num_planes = av1_num_planes(cm); + uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE]; + int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + + if (is_cur_buf_hbd(xd)) { + int len = sizeof(uint16_t); + dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); + dst_buf1[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * len); + dst_buf1[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2 * len); + dst_buf2[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1]); + dst_buf2[1] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * len); + dst_buf2[2] = + CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len); + } else { + dst_buf1[0] = xd->tmp_obmc_bufs[0]; + dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE; + dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2; + dst_buf2[0] = xd->tmp_obmc_bufs[1]; + dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE; + dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2; + } + + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + av1_build_prediction_by_above_preds(cm, xd, dst_buf1, dst_width1, dst_height1, + dst_stride1); + av1_build_prediction_by_left_preds(cm, xd, dst_buf2, dst_width2, dst_height2, + dst_stride2); + av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, &cm->cur_frame->buf, + mi_row, mi_col, 0, num_planes); + av1_build_obmc_inter_prediction(cm, xd, dst_buf1, dst_stride1, dst_buf2, + dst_stride2); +} + +void av1_build_inter_predictors_for_planes_single_buf( + MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref, + uint8_t *ext_dst[3], int ext_dst_stride[3]) { + assert(bsize < BLOCK_SIZES_ALL); + const MB_MODE_INFO *mi = xd->mi[0]; + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + WarpTypesAllowed warp_types; + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]]; + warp_types.global_warp_allowed = is_global_mv_block(mi, wm->wmtype); + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + for (int plane = plane_from; plane <= plane_to; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + + InterPredParams inter_pred_params; + + av1_init_inter_params(&inter_pred_params, bw, bh, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, + pd->subsampling_y, xd->bd, is_cur_buf_hbd(xd), 0, + xd->block_ref_scale_factors[ref], &pd->pre[ref], + mi->interp_filters); + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + av1_init_warp_params(&inter_pred_params, &warp_types, ref, xd, 
mi); + + uint8_t *const dst = get_buf_by_bd(xd, ext_dst[plane]); + const MV mv = mi->mv[ref].as_mv; + + av1_enc_build_one_inter_predictor(dst, ext_dst_stride[plane], &mv, + &inter_pred_params); + } +} + +static void build_masked_compound( + uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w) { + // Derive subsampling from h and w passed in. May be refactored to + // pass in subsampling factors directly. + const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, block_size_wide[sb_type], w, h, subw, subh); +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void build_masked_compound_highbd( + uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride, + const uint8_t *src1_8, int src1_stride, + const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h, + int w, int bd) { + // Derive subsampling from h and w passed in. May be refactored to + // pass in subsampling factors directly. + const int subh = (2 << mi_size_high_log2[sb_type]) == h; + const int subw = (2 << mi_size_wide_log2[sb_type]) == w; + const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); + // const uint8_t *mask = + // av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type); + aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, block_size_wide[sb_type], w, h, + subw, subh, bd); +} +#endif + +static void build_wedge_inter_predictor_from_buf( + MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0, + int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) { + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_compound = has_second_ref(mbmi); + MACROBLOCKD_PLANE *const pd = &xd->plane[plane]; + struct buf_2d *const dst_buf = &pd->dst; + uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + mbmi->interinter_comp.seg_mask = xd->seg_mask; + const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp; + const int is_hbd = is_cur_buf_hbd(xd); + + if (is_compound && is_masked_compound_type(comp_data->type)) { + if (!plane && comp_data->type == COMPOUND_DIFFWTD) { + if (is_hbd) { + av1_build_compound_diffwtd_mask_highbd( + comp_data->seg_mask, comp_data->mask_type, + CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd); + } else { + av1_build_compound_diffwtd_mask( + comp_data->seg_mask, comp_data->mask_type, ext_dst0, + ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w); + } + } +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + build_masked_compound_highbd( + dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, + mbmi->sb_type, h, w, xd->bd); + } else { + build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, + ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type, + h, w); + } +#else + build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, + ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type, + h, w); +#endif + } else { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, + dst, dst_buf->stride, NULL, 0, NULL, 0, w, h, + xd->bd); + } 
else { + aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, + 0, NULL, 0, w, h); + } +#else + aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, 0, + NULL, 0, w, h); +#endif + } +} + +void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[3], + int ext_dst_stride0[3], + uint8_t *ext_dst1[3], + int ext_dst_stride1[3]) { + int plane; + assert(bsize < BLOCK_SIZES_ALL); + for (plane = plane_from; plane <= plane_to; ++plane) { + const BLOCK_SIZE plane_bsize = get_plane_block_size( + bsize, xd->plane[plane].subsampling_x, xd->plane[plane].subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = block_size_high[plane_bsize]; + build_wedge_inter_predictor_from_buf( + xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane], + ext_dst1[plane], ext_dst_stride1[plane]); + } +} diff --git a/libs/libaom/src/av1/encoder/reconinter_enc.h b/libs/libaom/src/av1/encoder/reconinter_enc.h new file mode 100644 index 000000000..fdc1f31c8 --- /dev/null +++ b/libs/libaom/src/av1/encoder/reconinter_enc.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_ +#define AOM_AV1_ENCODER_RECONINTER_ENC_H_ + +#include "aom/aom_integer.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/reconinter.h" +#include "av1/common/warped_motion.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Build single or compound reference inter predictors for all planes. +// Can build inter-intra predictors, masked predictors etc as well. +void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + const BUFFER_SET *ctx, BLOCK_SIZE bsize, + int plane_from, int plane_to); + +void av1_enc_build_inter_predictor_y(MACROBLOCKD *xd, int mi_row, int mi_col); + +// Build one inter predictor. It is called for building predictor for single +// reference case, or just the 1st or 2nd reference in compound reference case. +// Can build both regular and masked predictors. 
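+// A minimal usage sketch (illustrative only; the local names `params`,
+// `pre_buf`, `sf`, etc. are hypothetical, with the arguments assumed to be
+// set up the same way the callers in reconinter_enc.c do it):
+//   InterPredParams params;
+//   av1_init_inter_params(&params, bw, bh, pos_row, pos_col, ssx, ssy,
+//                         bit_depth, use_hbd, 0, sf, pre_buf, interp_filters);
+//   params.conv_params = get_conv_params(0, plane, bit_depth);
+//   av1_enc_build_one_inter_predictor(dst, dst_stride, &mv, &params);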
+void av1_enc_build_one_inter_predictor(uint8_t *dst, int dst_stride, + const MV *src_mv, + InterPredParams *inter_pred_params); + +void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]); + +void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd, + uint8_t *tmp_buf[MAX_MB_PLANE], + int tmp_width[MAX_MB_PLANE], + int tmp_height[MAX_MB_PLANE], + int tmp_stride[MAX_MB_PLANE]); + +void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd); + +void av1_build_inter_predictors_for_planes_single_buf( + MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int ref, + uint8_t *ext_dst[3], int ext_dst_stride[3]); + +void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize, + int plane_from, int plane_to, + uint8_t *ext_dst0[3], + int ext_dst_stride0[3], + uint8_t *ext_dst1[3], + int ext_dst_stride1[3]); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_RECONINTER_ENC_H_ diff --git a/libs/libaom/src/av1/encoder/segmentation.c b/libs/libaom/src/av1/encoder/segmentation.c new file mode 100644 index 000000000..0c029c0e6 --- /dev/null +++ b/libs/libaom/src/av1/encoder/segmentation.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <limits.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/pred_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/segmentation.h"
+
+void av1_enable_segmentation(struct segmentation *seg) {
+  seg->enabled = 1;
+  seg->update_map = 1;
+  seg->update_data = 1;
+  seg->temporal_update = 0;
+}
+
+void av1_disable_segmentation(struct segmentation *seg) {
+  seg->enabled = 0;
+  seg->update_map = 0;
+  seg->update_data = 0;
+  seg->temporal_update = 0;
+}
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+                            SEG_LVL_FEATURES feature_id) {
+  seg->feature_mask[segment_id] &= ~(1 << feature_id);
+}
+
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+                       SEG_LVL_FEATURES feature_id) {
+  seg->feature_data[segment_id][feature_id] = 0;
+}
+
+static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                       const TileInfo *tile, MB_MODE_INFO **mi,
+                       unsigned *no_pred_segcounts,
+                       unsigned (*temporal_predictor_count)[2],
+                       unsigned *t_unpred_seg_counts, int bw, int bh,
+                       int mi_row, int mi_col) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+  xd->mi = mi;
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, mi_params->mi_rows,
+                 mi_params->mi_cols);
+
+  // Count the number of hits on each segment with no prediction
+  const int segment_id = xd->mi[0]->segment_id;
+  no_pred_segcounts[segment_id]++;
+
+  // Temporal prediction not allowed on key frames
+  if (cm->current_frame.frame_type != KEY_FRAME) {
+    const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+    // Test to see if the segment id matches the predicted value.
+    const int pred_segment_id =
+        cm->last_frame_seg_map
+            ? get_segment_id(mi_params, cm->last_frame_seg_map, bsize, mi_row,
+                             mi_col)
+            : 0;
+    const int pred_flag = pred_segment_id == segment_id;
+    const int pred_context = av1_get_pred_context_seg_id(xd);
+
+    // Store the prediction status for this mb and update counts
+    // as appropriate
+    xd->mi[0]->seg_id_predicted = pred_flag;
+    temporal_predictor_count[pred_context][pred_flag]++;
+
+    // Update the "unpredicted" segment count
+    if (!pred_flag) t_unpred_seg_counts[segment_id]++;
+  }
+}
+
+static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                          const TileInfo *tile, MB_MODE_INFO **mi,
+                          unsigned *no_pred_segcounts,
+                          unsigned (*temporal_predictor_count)[2],
+                          unsigned *t_unpred_seg_counts, int mi_row, int mi_col,
+                          BLOCK_SIZE bsize) {
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const int mis = mi_params->mi_stride;
+  const int bs = mi_size_wide[bsize], hbs = bs / 2;
+  PARTITION_TYPE partition;
+  const int qbs = bs / 4;
+
+  if (mi_row >= mi_params->mi_rows || mi_col >= mi_params->mi_cols) return;
+
+#define CSEGS(cs_bw, cs_bh, cs_rowoff, cs_coloff)                              \
+  count_segs(cm, xd, tile, mi + mis * (cs_rowoff) + (cs_coloff),               \
+             no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, \
+             (cs_bw), (cs_bh), mi_row + (cs_rowoff), mi_col + (cs_coloff));
+
+  if (bsize == BLOCK_8X8)
+    partition = PARTITION_NONE;
+  else
+    partition = get_partition(cm, mi_row, mi_col, bsize);
+  switch (partition) {
+    case PARTITION_NONE: CSEGS(bs, bs, 0, 0); break;
+    case PARTITION_HORZ:
+      CSEGS(bs, hbs, 0, 0);
+      CSEGS(bs, hbs, hbs, 0);
+      break;
+    case PARTITION_VERT:
+      CSEGS(hbs, bs, 0, 0);
+      CSEGS(hbs, bs, 0, hbs);
+      break;
+    case PARTITION_HORZ_A:
+      CSEGS(hbs, hbs, 0, 0);
+      CSEGS(hbs, hbs, 0, hbs);
+      CSEGS(bs, hbs, hbs, 0);
+      break;
+    case 
PARTITION_HORZ_B: + CSEGS(bs, hbs, 0, 0); + CSEGS(hbs, hbs, hbs, 0); + CSEGS(hbs, hbs, hbs, hbs); + break; + case PARTITION_VERT_A: + CSEGS(hbs, hbs, 0, 0); + CSEGS(hbs, hbs, hbs, 0); + CSEGS(hbs, bs, 0, hbs); + break; + case PARTITION_VERT_B: + CSEGS(hbs, bs, 0, 0); + CSEGS(hbs, hbs, 0, hbs); + CSEGS(hbs, hbs, hbs, hbs); + break; + case PARTITION_HORZ_4: + CSEGS(bs, qbs, 0, 0); + CSEGS(bs, qbs, qbs, 0); + CSEGS(bs, qbs, 2 * qbs, 0); + if (mi_row + 3 * qbs < mi_params->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0); + break; + + case PARTITION_VERT_4: + CSEGS(qbs, bs, 0, 0); + CSEGS(qbs, bs, 0, qbs); + CSEGS(qbs, bs, 0, 2 * qbs); + if (mi_col + 3 * qbs < mi_params->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs); + break; + + case PARTITION_SPLIT: { + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + int n; + assert(subsize < BLOCK_SIZES_ALL); + + for (n = 0; n < 4; n++) { + const int mi_dc = hbs * (n & 1); + const int mi_dr = hbs * (n >> 1); + + count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, + mi_row + mi_dr, mi_col + mi_dc, subsize); + } + } break; + default: assert(0); + } + +#undef CSEGS +} + +void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) { + struct segmentation *seg = &cm->seg; + struct segmentation_probs *segp = &cm->fc->seg; + int no_pred_cost; + int t_pred_cost = INT_MAX; + int tile_col, tile_row, mi_row, mi_col; + unsigned temporal_predictor_count[SEG_TEMPORAL_PRED_CTXS][2] = { { 0 } }; + unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 }; + unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 }; + (void)xd; + int scale_up = cm->prev_frame && (cm->width > cm->prev_frame->width || + cm->height > cm->prev_frame->height); + // First of all generate stats regarding how well the last segment map + // predicts this one + if (!scale_up) { + for (tile_row = 0; tile_row < cm->tiles.rows; tile_row++) { + TileInfo tile_info; + av1_tile_set_row(&tile_info, cm, tile_row); + for (tile_col = 0; tile_col < cm->tiles.cols; tile_col++) { + MB_MODE_INFO **mi_ptr; + av1_tile_set_col(&tile_info, cm, tile_col); + mi_ptr = cm->mi_params.mi_grid_base + + tile_info.mi_row_start * cm->mi_params.mi_stride + + tile_info.mi_col_start; + for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end; + mi_row += cm->seq_params.mib_size, + mi_ptr += cm->seq_params.mib_size * cm->mi_params.mi_stride) { + MB_MODE_INFO **mi = mi_ptr; + for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end; + mi_col += cm->seq_params.mib_size, + mi += cm->seq_params.mib_size) { + count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts, + temporal_predictor_count, t_unpred_seg_counts, mi_row, + mi_col, cm->seq_params.sb_size); + } + } + } + } + } + + int seg_id_cost[MAX_SEGMENTS]; + av1_cost_tokens_from_cdf(seg_id_cost, segp->tree_cdf, NULL); + no_pred_cost = 0; + for (int i = 0; i < MAX_SEGMENTS; ++i) + no_pred_cost += no_pred_segcounts[i] * seg_id_cost[i]; + + // Frames without past dependency cannot use temporal prediction + if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) { + int pred_flag_cost[SEG_TEMPORAL_PRED_CTXS][2]; + for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) + av1_cost_tokens_from_cdf(pred_flag_cost[i], segp->pred_cdf[i], NULL); + t_pred_cost = 0; + // Cost for signaling the prediction flag. 
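+    // Together with the unpredicted-segment-id term added below, this gives
+    // the estimated total bit cost of enabling temporal prediction
+    // (seg->temporal_update), which is compared against no_pred_cost at the
+    // end of this function.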
+ for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) { + for (int j = 0; j < 2; ++j) + t_pred_cost += temporal_predictor_count[i][j] * pred_flag_cost[i][j]; + } + // Cost for signaling the unpredicted segment id. + for (int i = 0; i < MAX_SEGMENTS; ++i) + t_pred_cost += t_unpred_seg_counts[i] * seg_id_cost[i]; + } + + // Now choose which coding method to use. + if (t_pred_cost < no_pred_cost) { + assert(!cm->features.error_resilient_mode); + seg->temporal_update = 1; + } else { + seg->temporal_update = 0; + } +} + +void av1_reset_segment_features(AV1_COMMON *cm) { + struct segmentation *seg = &cm->seg; + + // Set up default state for MB feature flags + seg->enabled = 0; + seg->update_map = 0; + seg->update_data = 0; + av1_clearall_segfeatures(seg); +} diff --git a/libs/libaom/src/av1/encoder/segmentation.h b/libs/libaom/src/av1/encoder/segmentation.h new file mode 100644 index 000000000..1ad13d66a --- /dev/null +++ b/libs/libaom/src/av1/encoder/segmentation.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_ +#define AOM_AV1_ENCODER_SEGMENTATION_H_ + +#include "av1/common/blockd.h" +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_enable_segmentation(struct segmentation *seg); +void av1_disable_segmentation(struct segmentation *seg); + +void av1_disable_segfeature(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); +void av1_clear_segdata(struct segmentation *seg, int segment_id, + SEG_LVL_FEATURES feature_id); + +void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd); + +void av1_reset_segment_features(AV1_COMMON *cm); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_SEGMENTATION_H_ diff --git a/libs/libaom/src/av1/encoder/speed_features.c b/libs/libaom/src/av1/encoder/speed_features.c new file mode 100644 index 000000000..e03faeccc --- /dev/null +++ b/libs/libaom/src/av1/encoder/speed_features.c @@ -0,0 +1,1322 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <limits.h> + +#include "av1/common/reconintra.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/speed_features.h" +#include "av1/encoder/rdopt.h" + +#include "aom_dsp/aom_dsp_common.h" + +#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method +// Max speed setting for tx domain evaluation +#define MAX_TX_DOMAIN_EVAL_SPEED 5 +static MESH_PATTERN + good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } }, + { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, + { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, + { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } }, + }; + +// TODO(huisu@google.com): These settings are pretty relaxed, tune them for +// each speed setting +static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { + { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } }, + { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, + { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } }, +}; + +// Threshold values to be used for pruning the txfm_domain_distortion +// based on block MSE +// Index 0: Default mode evaluation, Winner mode processing is not +// applicable (e.g., IntraBC). Index 1: Mode evaluation. +// Index 2: Winner mode evaluation. Index 1 and 2 are applicable when +// enable_winner_mode_for_use_tx_domain_dist speed feature is ON +// TODO(any): Experiment with the threshold logic based on a variance metric +static unsigned int tx_domain_dist_thresholds[3][MODE_EVAL_TYPES] = { + { UINT_MAX, UINT_MAX, UINT_MAX }, { 22026, 22026, 22026 }, { 0, 0, 0 } +}; + +// Transform domain distortion type to be used for default, mode and winner mode +// evaluation. Index 0: Default mode evaluation, Winner mode processing is not +// applicable (e.g., IntraBC). Index 1: Mode evaluation. Index 2: Winner mode +// evaluation. Index 1 and 2 are applicable when +// enable_winner_mode_for_use_tx_domain_dist speed feature is ON +static unsigned int tx_domain_dist_types[3][MODE_EVAL_TYPES] = { { 0, 2, 0 }, + { 1, 2, 0 }, + { 2, 2, 0 } }; + +// Threshold values to be used for disabling coeff RD-optimization +// based on block MSE / qstep^2. +// TODO(any): Experiment with the threshold logic based on a variance metric. +// For each row, the indices are as follows. +// Index 0: Default mode evaluation, Winner mode processing is not applicable +// (e.g., IntraBC) +// Index 1: Mode evaluation. +// Index 2: Winner mode evaluation. +// Index 1 and 2 are applicable when enable_winner_mode_for_coeff_opt speed +// feature is ON +// There are 6 levels with increasing speed, mapping to vertical indices. +static unsigned int coeff_opt_dist_thresholds[6][MODE_EVAL_TYPES] = { + { UINT_MAX, UINT_MAX, UINT_MAX }, + { 3200, 250, UINT_MAX }, + { 1728, 142, UINT_MAX }, + { 864, 142, UINT_MAX }, + { 432, 86, UINT_MAX }, + { 216, 86, UINT_MAX } +}; + +// Transform size to be used for default, mode and winner mode evaluation +// Index 0: Default mode evaluation, Winner mode processing is not applicable +// (e.g., IntraBC). Index 1: Mode evaluation. Index 2: Winner mode evaluation. 
+// Index 1 and 2 are applicable when enable_winner_mode_for_tx_size_srch speed +// feature is ON +static TX_SIZE_SEARCH_METHOD tx_size_search_methods[3][MODE_EVAL_TYPES] = { + { USE_FULL_RD, USE_LARGESTALL, USE_FULL_RD }, + { USE_FAST_RD, USE_LARGESTALL, USE_FULL_RD }, + { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD } +}; + +// Predict transform skip levels to be used for default, mode and winner mode +// evaluation. Index 0: Default mode evaluation, Winner mode processing is not +// applicable. Index 1: Mode evaluation, Index 2: Winner mode evaluation +// Values indicate the aggressiveness of skip flag prediction. +// 0 : no early skip prediction +// 1 : conservative early skip prediction using DCT_DCT +// 2 : early skip prediction based on SSE +static unsigned int predict_skip_levels[3][MODE_EVAL_TYPES] = { { 0, 0, 0 }, + { 1, 1, 1 }, + { 1, 2, 1 } }; + +// Intra only frames, golden frames (except alt ref overlays) and +// alt ref frames tend to be coded at a higher than ambient quality +static int frame_is_boosted(const AV1_COMP *cpi) { + return frame_is_kf_gf_arf(cpi); +} + +static BLOCK_SIZE dim_to_size(int dim) { + switch (dim) { + case 4: return BLOCK_4X4; + case 8: return BLOCK_8X8; + case 16: return BLOCK_16X16; + case 32: return BLOCK_32X32; + case 64: return BLOCK_64X64; + case 128: return BLOCK_128X128; + default: assert(0); return 0; + } +} + +static void set_good_speed_feature_framesize_dependent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160; + + if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; + if (is_720p_or_larger) + sf->part_sf.auto_max_partition_based_on_simple_motion = ADAPT_PRED; + else + sf->part_sf.auto_max_partition_based_on_simple_motion = RELAXED_PRED; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + sf->part_sf.auto_max_partition_based_on_simple_motion = DIRECT_PRED; + } + + if (is_4k_or_larger) { + sf->part_sf.default_min_partition_size = BLOCK_8X8; + } + + // TODO(huisu@google.com): train models for 720P and above. 
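+  // The ml_partition_search_breakout_thresh[] entries below are indexed by
+  // square partition size (BLOCK_8X8 up to BLOCK_128X128); -1 leaves the ML
+  // breakout disabled for that size, matching the default in init_part_sf().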
+ if (!is_720p_or_larger) { + sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->part_sf.ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64 + sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + sf->part_sf.ml_early_term_after_part_split_level = 1; + } + + if (speed >= 1) { + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128; + } else if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } + + if (!is_720p_or_larger) { + sf->part_sf.ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8 + sf->part_sf.ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16 + sf->part_sf.ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 + sf->part_sf.ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64 + sf->part_sf.ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + } + sf->part_sf.ml_early_term_after_part_split_level = 2; + } + + if (speed >= 2) { + if (is_720p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_64X64; + } else if (is_480p_or_larger) { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } else { + sf->part_sf.use_square_partition_only_threshold = BLOCK_32X32; + } + + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); + sf->part_sf.partition_search_breakout_rate_thr = 120; + } else { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 22); + sf->part_sf.partition_search_breakout_rate_thr = 100; + } + + if (is_720p_or_larger) { + sf->inter_sf.prune_obmc_prob_thresh = 16; + } else { + sf->inter_sf.prune_obmc_prob_thresh = 8; + } + + if (is_480p_or_larger) { + sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1; + } + } + + if (speed >= 3) { + sf->part_sf.ml_early_term_after_part_split_level = 0; + + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 25); + sf->part_sf.partition_search_breakout_rate_thr = 200; + } else { + sf->part_sf.max_intra_bsize = BLOCK_32X32; + sf->part_sf.partition_search_breakout_dist_thr = (1 << 23); + sf->part_sf.partition_search_breakout_rate_thr = 120; + } + } + + if (speed >= 4) { + if (is_720p_or_larger) { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 26); + } else { + sf->part_sf.partition_search_breakout_dist_thr = (1 << 24); + } + + if (is_480p_or_larger) { + sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2; + } + + sf->inter_sf.prune_obmc_prob_thresh = 16; + } + + if (speed >= 5) { + if (is_720p_or_larger) { + sf->inter_sf.prune_warped_prob_thresh = 16; + } else if (is_480p_or_larger) { + sf->inter_sf.prune_warped_prob_thresh = 8; + } + } +} + +static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi, + SPEED_FEATURES *const sf, + int speed) { + const AV1_COMMON *const cm = &cpi->common; + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; + const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360; + + (void)is_720p_or_larger; // Not used so far + + if (!is_360p_or_larger) { + if (speed >= 6) sf->rt_sf.force_tx_search_off = 1; + if (speed >= 8) { + sf->rt_sf.use_modeled_non_rd_cost = 0; + sf->rt_sf.use_nonrd_filter_search 
= 0; + } + } + if (is_360p_or_larger) { + if (speed >= 7) { + sf->interp_sf.disable_filter_search_var_thresh = 0; + } + } + if (!is_480p_or_larger) { + if (speed == 7) { + sf->rt_sf.nonrd_check_partition_merge_mode = 2; + } + if (speed >= 8) { + sf->mv_sf.subpel_search_method = SUBPEL_TREE; + + sf->rt_sf.estimate_motion_for_var_based_partition = 1; + } + } +} + +static void set_good_speed_features_framesize_independent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->gf_group; + const int boosted = frame_is_boosted(cpi); + const int is_boosted_arf2_bwd_type = + boosted || gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE; + const int allow_screen_content_tools = + cm->features.allow_screen_content_tools; + if (!cpi->oxcf.large_scale_tile) { + sf->hl_sf.high_precision_mv_usage = LAST_MV_DATA; + } + + // Speed 0 for all speed features that give neutral coding performance change. + sf->gm_sf.gm_disable_recode = 1; + sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3; + + sf->part_sf.less_rectangular_check_level = 1; + sf->part_sf.ml_prune_4_partition = 1; + sf->part_sf.ml_prune_ab_partition = 1; + sf->part_sf.ml_prune_rect_partition = 1; + sf->part_sf.prune_ext_partition_types_search_level = 1; + sf->part_sf.simple_motion_search_prune_rect = 1; + + sf->inter_sf.disable_wedge_search_edge_thresh = 0; + sf->inter_sf.disable_wedge_search_var_thresh = 0; + // TODO(debargha): Test, tweak and turn on either 1 or 2 + sf->inter_sf.inter_mode_rd_model_estimation = 1; + sf->inter_sf.model_based_post_interp_filter_breakout = 1; + sf->inter_sf.prune_compound_using_single_ref = 1; + sf->inter_sf.prune_mode_search_simple_translation = 1; + sf->inter_sf.prune_motion_mode_level = 1; + sf->inter_sf.prune_ref_frame_for_rect_partitions = + (boosted || (allow_screen_content_tools)) + ? 0 + : (is_boosted_arf2_bwd_type ? 1 : 2); + sf->inter_sf.prune_wedge_pred_diff_based = 1; + sf->inter_sf.reduce_inter_modes = 1; + sf->inter_sf.selective_ref_frame = 1; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH; + + sf->interp_sf.cb_pred_filter_search = 0; + sf->interp_sf.use_fast_interpolation_filter_search = 1; + + sf->intra_sf.intra_pruning_with_hog = 1; + sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f; + + sf->tx_sf.adaptive_txb_search_level = 1; + sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.model_based_prune_tx_search_level = 1; + sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1; + + sf->rt_sf.use_nonrd_pick_mode = 0; + sf->rt_sf.use_real_time_ref_set = 0; + + if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) + sf->mv_sf.exhaustive_searches_thresh = (1 << 24); + else + sf->mv_sf.exhaustive_searches_thresh = (1 << 25); + + sf->rd_sf.perform_coeff_opt = 1; + + if (speed >= 1) { + sf->gm_sf.disable_adaptive_warp_error_thresh = 0; + sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2; + sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1; + + sf->part_sf.intra_cnn_split = 1; + sf->part_sf.simple_motion_search_early_term_none = 1; + // TODO(Venkat): Clean-up frame type dependency for + // simple_motion_search_split in partition search function and set the + // speed feature accordingly + sf->part_sf.simple_motion_search_split = allow_screen_content_tools ? 
1 : 2; + + sf->mv_sf.exhaustive_searches_thresh <<= 1; + sf->mv_sf.obmc_full_pixel_search_level = 1; + sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS; + + sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1; + sf->inter_sf.prune_comp_search_by_single_result = boosted ? 2 : 1; + sf->inter_sf.prune_comp_type_by_comp_avg = 1; + sf->inter_sf.prune_comp_type_by_model_rd = boosted ? 0 : 1; + sf->inter_sf.prune_motion_mode_level = 2; + sf->inter_sf.prune_ref_frame_for_rect_partitions = + (frame_is_intra_only(&cpi->common) || (allow_screen_content_tools)) + ? 0 + : (boosted ? 1 : 2); + sf->inter_sf.reduce_inter_modes = boosted ? 1 : 2; + sf->inter_sf.reuse_inter_intra_mode = 1; + sf->inter_sf.selective_ref_frame = 2; + sf->inter_sf.skip_repeated_newmv = 1; + + sf->interp_sf.cb_pred_filter_search = 0; + sf->interp_sf.use_interp_filter = 1; + sf->intra_sf.prune_palette_search_level = 1; + + sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + sf->tx_sf.model_based_prune_tx_search_level = 0; + sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; + sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_FAST; + sf->tx_sf.tx_type_search.skip_tx_search = 1; + sf->tx_sf.use_intra_txb_hash = 1; + + sf->rd_sf.perform_coeff_opt = boosted ? 2 : 3; + sf->rd_sf.tx_domain_dist_level = boosted ? 1 : 2; + sf->rd_sf.tx_domain_dist_thres_level = 1; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1; + sf->lpf_sf.dual_sgr_penalty_level = 1; + sf->lpf_sf.enable_sgr_ep_pruning = 1; + + // TODO(any, yunqing): move this feature to speed 0. + sf->tpl_sf.skip_alike_starting_mv = 1; + } + + if (speed >= 2) { + sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_2; + + sf->part_sf.allow_partition_search_skip = 1; + + sf->mv_sf.auto_mv_step_size = 1; + sf->mv_sf.subpel_iters_per_step = 1; + + // TODO(chiyotsai@google.com): We can get 10% speed up if we move + // adaptive_rd_thresh to speed 1. But currently it performs poorly on some + // clips (e.g. 5% loss on dinner_1080p). We need to examine the sequence a + // bit more closely to figure out why. + sf->inter_sf.adaptive_rd_thresh = 1; + sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; + sf->inter_sf.disable_interinter_wedge_newmv_search = 1; + sf->inter_sf.disable_wedge_search_edge_thresh = 0; + sf->inter_sf.disable_wedge_search_var_thresh = 100; + sf->inter_sf.fast_interintra_wedge_search = 1; + sf->inter_sf.fast_wedge_sign_estimate = 1; + sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1; + sf->inter_sf.prune_compound_using_neighbors = 1; + sf->inter_sf.prune_comp_type_by_comp_avg = 2; + sf->inter_sf.prune_warp_using_wmtype = 1; + sf->inter_sf.selective_ref_frame = 3; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; + + // TODO(Sachin): Enable/Enhance this speed feature for speed 2 & 3 + sf->interp_sf.adaptive_interp_filter_search = 1; + sf->interp_sf.disable_dual_filter = 1; + sf->interp_sf.disable_filter_search_var_thresh = 100; + + sf->intra_sf.disable_smooth_intra = + !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key != 1); + + sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 
3 : 4; + + sf->lpf_sf.prune_wiener_based_on_src_var = 1; + sf->lpf_sf.prune_sgr_based_on_wiener = !allow_screen_content_tools; + } + + if (speed >= 3) { + sf->hl_sf.high_precision_mv_usage = CURRENT_Q; + sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; + + sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; + + sf->part_sf.less_rectangular_check_level = 2; + sf->part_sf.simple_motion_search_prune_agg = 1; + sf->part_sf.prune_4_partition_using_split_info = + !allow_screen_content_tools; + + // adaptive_motion_search breaks encoder multi-thread tests. + // The values in x->pred_mv[] differ for single and multi-thread cases. + // See aomedia:1778. + // sf->mv_sf.adaptive_motion_search = 1; + sf->mv_sf.full_pixel_search_level = 1; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; + sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS; + sf->mv_sf.search_method = DIAMOND; + + sf->inter_sf.disable_sb_level_mv_cost_upd = 1; + // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine + // it with cpi->sf.disable_wedge_search_var_thresh. + sf->inter_sf.disable_wedge_interintra_search = 1; + // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2 + // and clean-up the speed feature + sf->inter_sf.perform_best_rd_based_gating_for_chroma = 1; + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 1; + sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2; + sf->inter_sf.prune_motion_mode_level = boosted ? 2 : 3; + sf->inter_sf.selective_ref_frame = 4; + sf->inter_sf.skip_repeated_ref_mv = 1; + sf->inter_sf.skip_repeated_full_newmv = 1; + if (cpi->oxcf.enable_smooth_interintra) + sf->inter_sf.disable_smooth_interintra = boosted ? 0 : 1; + sf->inter_sf.reuse_compound_type_decision = 1; + sf->inter_sf.txfm_rd_gate_level = (boosted || allow_screen_content_tools) + ? 0 + : (is_boosted_arf2_bwd_type ? 1 : 2); + + sf->intra_sf.prune_palette_search_level = 2; + + sf->tpl_sf.skip_alike_starting_mv = 2; + sf->tpl_sf.prune_intra_modes = 1; + sf->tpl_sf.reduce_first_step_size = 6; + + sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3; + sf->tx_sf.tx_type_search.use_skip_flag_prediction = + allow_screen_content_tools ? 1 : 2; + + // TODO(any): Refactor the code related to following winner mode speed + // features + sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1; + // TODO(any): Experiment with this speed feature by enabling for key frames + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = + frame_is_intra_only(&cpi->common) ? 0 : 1; + sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = + !allow_screen_content_tools; + sf->winner_mode_sf.motion_mode_for_winner_cand = + boosted + ? 0 + : gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE ? 1 + : 2; + + // TODO(any): evaluate if these lpf features can be moved to speed 2. + sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 0 : 2; + sf->lpf_sf.disable_loop_restoration_chroma = + (boosted || allow_screen_content_tools) ? 
0 : 1; + sf->lpf_sf.reduce_wiener_window_size = !boosted; + sf->lpf_sf.prune_wiener_based_on_src_var = 2; + + sf->hl_sf.second_alt_ref_filtering = 0; + } + + if (speed >= 4) { + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + + sf->part_sf.simple_motion_search_prune_agg = 2; + sf->part_sf.prune_ab_partition_using_split_info = + !allow_screen_content_tools; + + sf->inter_sf.adaptive_mode_search = 1; + sf->inter_sf.alt_ref_search_fp = 1; + sf->inter_sf.prune_ref_mv_idx_search = 1; + sf->inter_sf.txfm_rd_gate_level = + (boosted || allow_screen_content_tools) ? 0 : 3; + + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2; + sf->inter_sf.prune_compound_using_neighbors = 2; + sf->inter_sf.disable_smooth_interintra = 1; + + sf->interp_sf.cb_pred_filter_search = 1; + sf->interp_sf.skip_sharp_interp_filter_search = 1; + sf->interp_sf.use_interp_filter = 2; + sf->interp_sf.adaptive_interp_filter_search = 2; + + sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; + sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; + sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; + sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; + // TODO(any): Experiment with this speed feature set to 2 for higher quality + // presets as well + sf->intra_sf.skip_intra_in_interframe = 2; + + sf->tx_sf.tx_type_search.enable_winner_mode_tx_type_pruning = 1; + sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; + sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_MORE; + sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1; + // TODO(any): Experiment with enabling of this speed feature as hash state + // is reset during winner mode processing + sf->tx_sf.use_intra_txb_hash = 0; + + sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 5; + sf->rd_sf.tx_domain_dist_thres_level = 2; + + // TODO(any): Extend multi-winner mode processing support for inter frames + sf->winner_mode_sf.enable_multiwinner_mode_process = + frame_is_intra_only(&cpi->common) ? 1 : 0; + sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1; + + sf->lpf_sf.cdef_pick_method = allow_screen_content_tools + ? CDEF_FAST_SEARCH_LVL1 + : CDEF_FAST_SEARCH_LVL2; + + // TODO(any): The following features have no impact on quality and speed, + // and are disabled. + // sf->part_sf.partition_search_breakout_rate_thr = 300; + // sf->interp_sf.disable_filter_search_var_thresh = 200; + // sf->rd_sf.use_fast_coef_costing = 1; + + // TODO(any): The following features give really bad quality/speed trade + // off. Needs to be re-worked. + // sf->mv_sf.search_method = BIGDIA; + // sf->inter_sf.adaptive_rd_thresh = 4; + // sf->rd_sf.tx_domain_dist_level = 2; + // sf->rt_sf.mode_search_skip_flags = + // (cm->current_frame.frame_type == KEY_FRAME) + // ? 0 + // : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + // FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR | + // FLAG_EARLY_TERMINATE; + } + + if (speed >= 5) { + sf->part_sf.simple_motion_search_prune_agg = 3; + sf->part_sf.ext_partition_eval_thresh = + allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16; + + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3; + sf->inter_sf.disable_interinter_wedge = 1; + sf->inter_sf.disable_obmc = 1; + sf->inter_sf.disable_onesided_comp = 1; + sf->inter_sf.txfm_rd_gate_level = + (boosted || allow_screen_content_tools) ? 
0 : 4; + sf->inter_sf.prune_inter_modes_if_skippable = 1; + + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL; + sf->lpf_sf.disable_lr_filter = 1; + + sf->mv_sf.simple_motion_subpel_force_stop = QUARTER_PEL; + sf->mv_sf.prune_mesh_search = 1; + sf->mv_sf.reduce_search_range = 1; + + sf->tpl_sf.subpel_force_stop = QUARTER_PEL; + } + + if (speed >= 6) { + } +} + +// TODO(kyslov): now this is very similar to +// set_good_speed_features_framesize_independent +// except it sets non-rd flag on speed8. This function will likely +// be modified in the future with RT-specific speed features +static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, + SPEED_FEATURES *sf, + int speed) { + AV1_COMMON *const cm = &cpi->common; + const int boosted = frame_is_boosted(cpi); + + // Speed 0 for all speed features that give neutral coding performance change. + sf->gm_sf.gm_disable_recode = 1; + sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3; + + sf->part_sf.less_rectangular_check_level = 1; + sf->part_sf.ml_prune_4_partition = 1; + sf->part_sf.ml_prune_ab_partition = 1; + sf->part_sf.ml_prune_rect_partition = 1; + sf->part_sf.prune_ext_partition_types_search_level = 1; + + // TODO(debargha): Test, tweak and turn on either 1 or 2 + sf->inter_sf.inter_mode_rd_model_estimation = 0; + sf->inter_sf.disable_wedge_search_edge_thresh = 0; + sf->inter_sf.disable_wedge_search_var_thresh = 0; + sf->inter_sf.model_based_post_interp_filter_breakout = 1; + sf->inter_sf.prune_compound_using_single_ref = 0; + sf->inter_sf.prune_mode_search_simple_translation = 1; + sf->inter_sf.prune_motion_mode_level = 1; + sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted; + sf->inter_sf.prune_wedge_pred_diff_based = 1; + sf->inter_sf.reduce_inter_modes = 1; + sf->inter_sf.selective_ref_frame = 1; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH; + + sf->interp_sf.cb_pred_filter_search = 0; + sf->interp_sf.use_fast_interpolation_filter_search = 1; + + sf->intra_sf.intra_pruning_with_hog = 1; + sf->intra_sf.intra_pruning_with_hog_thresh = -1.2f; + + sf->mv_sf.full_pixel_search_level = 1; + sf->mv_sf.exhaustive_searches_thresh = INT_MAX; + + sf->rt_sf.check_intra_pred_nonrd = 1; + sf->rt_sf.estimate_motion_for_var_based_partition = 1; + sf->rt_sf.hybrid_intra_pickmode = 0; + sf->rt_sf.nonrd_prune_ref_frame_search = 0; + sf->rt_sf.reuse_inter_pred_nonrd = 0; + sf->rt_sf.use_comp_ref_nonrd = 1; + sf->rt_sf.use_nonrd_filter_search = 1; + sf->rt_sf.use_nonrd_pick_mode = 0; + sf->rt_sf.use_real_time_ref_set = 0; + sf->tx_sf.adaptive_txb_search_level = 1; + sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.model_based_prune_tx_search_level = 1; + sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1; + + if (speed >= 1) { + sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_1; + sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2; + + sf->part_sf.prune_ext_partition_types_search_level = 2; + sf->part_sf.simple_motion_search_prune_rect = 1; + + sf->mv_sf.obmc_full_pixel_search_level = 1; + sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS; + + sf->inter_sf.prune_comp_search_by_single_result = 1; + sf->inter_sf.reuse_inter_intra_mode = 1; + sf->inter_sf.selective_ref_frame = 2; + sf->inter_sf.skip_repeated_newmv = 1; + sf->inter_sf.disable_wedge_search_var_thresh = 0; + sf->inter_sf.disable_wedge_search_edge_thresh = 0; + sf->inter_sf.prune_comp_type_by_comp_avg = 1; + sf->inter_sf.prune_motion_mode_level = 2; + sf->inter_sf.prune_single_motion_modes_by_simple_trans = 
1; + + sf->interp_sf.cb_pred_filter_search = 1; + sf->interp_sf.use_interp_filter = 1; + + sf->tx_sf.adaptive_txb_search_level = 2; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + sf->tx_sf.tx_size_search_lgr_block = 1; + sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000; + sf->tx_sf.tx_type_search.skip_tx_search = 1; + sf->tx_sf.use_intra_txb_hash = 1; + + sf->rd_sf.optimize_b_precheck = 1; + sf->rd_sf.tx_domain_dist_level = boosted ? 0 : 1; + sf->rd_sf.tx_domain_dist_thres_level = 1; + + sf->lpf_sf.dual_sgr_penalty_level = 1; + } + + if (speed >= 2) { + sf->gm_sf.gm_erroradv_type = GM_ERRORADV_TR_2; + + sf->part_sf.allow_partition_search_skip = 1; + sf->part_sf.partition_search_breakout_rate_thr = 80; + + sf->mv_sf.auto_mv_step_size = 1; + sf->mv_sf.subpel_iters_per_step = 1; + + sf->inter_sf.adaptive_rd_thresh = 1; + sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; + sf->inter_sf.disable_wedge_search_edge_thresh = 0; + sf->inter_sf.disable_wedge_search_var_thresh = 100; + sf->inter_sf.fast_wedge_sign_estimate = 1; + sf->inter_sf.prune_comp_type_by_comp_avg = 2; + sf->inter_sf.selective_ref_frame = 3; + sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; + + sf->interp_sf.adaptive_interp_filter_search = 1; + sf->interp_sf.cb_pred_filter_search = 0; + sf->interp_sf.disable_dual_filter = 1; + sf->interp_sf.disable_filter_search_var_thresh = 100; + + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.model_based_prune_tx_search_level = 0; + + sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1; + } + + if (speed >= 3) { + sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF; + + sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH; + + sf->part_sf.less_rectangular_check_level = 2; + + sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS; + // adaptive_motion_search breaks encoder multi-thread tests. + // The values in x->pred_mv[] differ for single and multi-thread cases. + // See aomedia:1778. + // sf->mv_sf.adaptive_motion_search = 1; + + sf->inter_sf.adaptive_rd_thresh = 2; + sf->inter_sf.disable_sb_level_mv_cost_upd = 1; + // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine + // it with cpi->sf.disable_wedge_search_var_thresh. + sf->inter_sf.disable_wedge_interintra_search = 1; + sf->inter_sf.prune_comp_search_by_single_result = 2; + sf->inter_sf.prune_motion_mode_level = boosted ? 2 : 3; + sf->inter_sf.prune_warp_using_wmtype = 1; + sf->inter_sf.selective_ref_frame = 4; + + sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_FAST; + + sf->rd_sf.tx_domain_dist_level = 1; + + sf->winner_mode_sf.tx_size_search_level = boosted ? 0 : 2; + } + + if (speed >= 4) { + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; + + sf->inter_sf.adaptive_mode_search = 1; + sf->inter_sf.alt_ref_search_fp = 1; + + sf->interp_sf.skip_sharp_interp_filter_search = 1; + + sf->tx_sf.tx_type_search.fast_inter_tx_type_search = 1; + sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1; + sf->tx_sf.use_intra_txb_hash = 0; + + sf->rd_sf.use_mb_rd_hash = 0; + + sf->winner_mode_sf.tx_size_search_level = frame_is_intra_only(cm) ? 
0 : 2; + } + + if (speed >= 5) { + sf->hl_sf.recode_loop = ALLOW_RECODE_KFMAXBW; + + sf->inter_sf.adaptive_rd_thresh = 4; + sf->interp_sf.disable_filter_search_var_thresh = 200; + + sf->rd_sf.use_fast_coef_costing = 1; + sf->rd_sf.tx_domain_dist_level = 2; + sf->rd_sf.tx_domain_dist_thres_level = 2; + sf->winner_mode_sf.tx_size_search_level = 1; + + sf->rt_sf.mode_search_skip_flags = + (cm->current_frame.frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR | + FLAG_EARLY_TERMINATE; + sf->hl_sf.frame_parameter_update = 0; + + sf->part_sf.default_max_partition_size = BLOCK_128X128; + sf->part_sf.default_min_partition_size = BLOCK_8X8; + sf->part_sf.max_intra_bsize = BLOCK_32X32; + sf->part_sf.partition_search_breakout_rate_thr = 500; + sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + sf->part_sf.adjust_var_based_rd_partitioning = 2; + + sf->mv_sf.search_method = FAST_DIAMOND; + sf->mv_sf.subpel_force_stop = QUARTER_PEL; + sf->mv_sf.use_fullpel_costlist = 1; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + + sf->inter_sf.adaptive_mode_search = 2; + sf->inter_sf.inter_mode_rd_model_estimation = 2; + + for (int i = 0; i < TX_SIZES; ++i) { + sf->intra_sf.intra_y_mode_mask[i] = INTRA_DC; + sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL; + } + + sf->tx_sf.tx_type_search.prune_mode = PRUNE_2D_MORE; + sf->tx_sf.use_inter_txb_hash = 0; + sf->tx_sf.refine_fast_tx_search_results = 0; + + sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT; + sf->rd_sf.simple_model_rd_from_var = 1; + + sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; + + sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + sf->rt_sf.num_inter_modes_for_tx_search = 5; + sf->rt_sf.skip_interp_filter_search = 1; + sf->rt_sf.use_comp_ref_nonrd = 0; + sf->rt_sf.use_real_time_ref_set = 1; + sf->rt_sf.use_simple_rd_model = 1; + } + + if (speed >= 6) { + sf->part_sf.adjust_var_based_rd_partitioning = 1; + } + + if (speed >= 7) { + sf->hl_sf.frame_parameter_update = 0; + + sf->part_sf.default_max_partition_size = BLOCK_128X128; + sf->part_sf.default_min_partition_size = BLOCK_8X8; + sf->part_sf.partition_search_type = VAR_BASED_PARTITION; + + sf->mv_sf.search_method = FAST_DIAMOND; + sf->mv_sf.subpel_force_stop = QUARTER_PEL; + sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED; + + sf->inter_sf.inter_mode_rd_model_estimation = 2; + + sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; + sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q; + + sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + sf->rt_sf.nonrd_prune_ref_frame_search = 1; + sf->rt_sf.reuse_inter_pred_nonrd = 0; + sf->rt_sf.short_circuit_low_temp_var = 0; + sf->rt_sf.skip_interp_filter_search = 0; + sf->rt_sf.use_comp_ref_nonrd = 0; + sf->rt_sf.use_nonrd_altref_frame = 1; + sf->rt_sf.use_nonrd_pick_mode = 1; + sf->rt_sf.nonrd_check_partition_merge_mode = 1; + sf->rt_sf.nonrd_check_partition_split = 0; + sf->rt_sf.hybrid_intra_pickmode = 1; + } + + if (speed >= 8) { + sf->rt_sf.estimate_motion_for_var_based_partition = 0; + sf->rt_sf.short_circuit_low_temp_var = 1; + sf->rt_sf.reuse_inter_pred_nonrd = 1; + sf->rt_sf.use_nonrd_altref_frame = 0; + sf->rt_sf.nonrd_prune_ref_frame_search = 2; + sf->rt_sf.nonrd_check_partition_merge_mode = 0; + sf->rt_sf.nonrd_check_partition_split = 0; + sf->rt_sf.use_modeled_non_rd_cost = 1; + sf->rt_sf.source_metrics_sb_nonrd = 1; + sf->interp_sf.cb_pred_filter_search = 1; + } +} + 
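+// Each init_*_sf() helper below resets one group of speed features to its
+// best-quality defaults. av1_set_speed_features_framesize_independent()
+// applies these defaults first and then layers the per-speed GOOD or
+// REALTIME overrides from the setter functions above on top of them.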
+static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) { + // best quality defaults + hl_sf->frame_parameter_update = 1; + hl_sf->recode_loop = ALLOW_RECODE; + hl_sf->disable_overlay_frames = 0; + hl_sf->adaptive_overlay_encoding = 1; + // Recode loop tolerance %. + hl_sf->recode_tolerance = 25; + hl_sf->high_precision_mv_usage = CURRENT_Q; + hl_sf->second_alt_ref_filtering = 1; +} + +static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) { + tpl_sf->prune_intra_modes = 0; + tpl_sf->reduce_first_step_size = 0; + tpl_sf->skip_alike_starting_mv = 0; + tpl_sf->subpel_force_stop = EIGHTH_PEL; +} + +static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) { + gm_sf->gm_erroradv_type = GM_ERRORADV_TR_0; + gm_sf->disable_adaptive_warp_error_thresh = 1; + gm_sf->selective_ref_gm = 1; + gm_sf->gm_search_type = GM_FULL_SEARCH; + gm_sf->gm_disable_recode = 0; + gm_sf->prune_ref_frame_for_gm_search = 0; +} + +static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) { + part_sf->partition_search_type = SEARCH_PARTITION; + part_sf->less_rectangular_check_level = 0; + part_sf->use_square_partition_only_threshold = BLOCK_128X128; + part_sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE; + part_sf->auto_min_partition_based_on_simple_motion = 0; + part_sf->default_max_partition_size = BLOCK_LARGEST; + part_sf->default_min_partition_size = BLOCK_4X4; + part_sf->adjust_var_based_rd_partitioning = 0; + part_sf->allow_partition_search_skip = 0; + part_sf->max_intra_bsize = BLOCK_LARGEST; + // This setting only takes effect when partition_search_type is set + // to FIXED_PARTITION. + part_sf->always_this_block_size = BLOCK_16X16; + part_sf->partition_search_breakout_dist_thr = 0; + part_sf->partition_search_breakout_rate_thr = 0; + part_sf->prune_ext_partition_types_search_level = 0; + part_sf->ml_prune_rect_partition = 0; + part_sf->ml_prune_ab_partition = 0; + part_sf->ml_prune_4_partition = 0; + part_sf->ml_early_term_after_part_split_level = 0; + for (int i = 0; i < PARTITION_BLOCK_SIZES; ++i) { + part_sf->ml_partition_search_breakout_thresh[i] = + -1; // -1 means not enabled. 
+ } + part_sf->simple_motion_search_prune_agg = 0; + part_sf->simple_motion_search_split = 0; + part_sf->simple_motion_search_prune_rect = 0; + part_sf->simple_motion_search_early_term_none = 0; + part_sf->intra_cnn_split = 0; + part_sf->ext_partition_eval_thresh = BLOCK_8X8; + part_sf->prune_4_partition_using_split_info = 0; + part_sf->prune_ab_partition_using_split_info = 0; +} + +static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) { + mv_sf->full_pixel_search_level = 0; + mv_sf->adaptive_motion_search = 0; + mv_sf->auto_mv_step_size = 0; + mv_sf->exhaustive_searches_thresh = 0; + mv_sf->obmc_full_pixel_search_level = 0; + mv_sf->prune_mesh_search = 0; + mv_sf->reduce_search_range = 0; + mv_sf->search_method = NSTEP; + mv_sf->simple_motion_subpel_force_stop = EIGHTH_PEL; + mv_sf->subpel_force_stop = EIGHTH_PEL; + mv_sf->subpel_iters_per_step = 2; + mv_sf->subpel_search_method = SUBPEL_TREE; + mv_sf->use_accurate_subpel_search = USE_8_TAPS; + mv_sf->use_fullpel_costlist = 0; +} + +static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) { + inter_sf->comp_inter_joint_search_thresh = BLOCK_4X4; + inter_sf->adaptive_rd_thresh = 0; + inter_sf->model_based_post_interp_filter_breakout = 0; + inter_sf->reduce_inter_modes = 0; + inter_sf->adaptive_mode_search = 0; + inter_sf->alt_ref_search_fp = 0; + inter_sf->selective_ref_frame = 0; + inter_sf->prune_ref_frame_for_rect_partitions = 0; + inter_sf->disable_wedge_search_edge_thresh = 0; + inter_sf->disable_wedge_search_var_thresh = 0; + inter_sf->fast_wedge_sign_estimate = 0; + inter_sf->prune_wedge_pred_diff_based = 0; + inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED; + inter_sf->reuse_inter_intra_mode = 0; + inter_sf->disable_sb_level_coeff_cost_upd = 0; + inter_sf->disable_sb_level_mv_cost_upd = 0; + inter_sf->prune_inter_modes_based_on_tpl = 0; + inter_sf->prune_comp_search_by_single_result = 0; + inter_sf->skip_repeated_ref_mv = 0; + inter_sf->skip_repeated_newmv = 0; + inter_sf->skip_repeated_full_newmv = 0; + inter_sf->prune_single_motion_modes_by_simple_trans = 0; + inter_sf->inter_mode_rd_model_estimation = 0; + inter_sf->prune_compound_using_single_ref = 0; + inter_sf->prune_compound_using_neighbors = 0; + inter_sf->disable_onesided_comp = 0; + inter_sf->prune_mode_search_simple_translation = 0; + inter_sf->prune_comp_type_by_comp_avg = 0; + inter_sf->disable_interinter_wedge_newmv_search = 0; + inter_sf->enable_interinter_diffwtd_newmv_search = 0; + inter_sf->disable_smooth_interintra = 0; + inter_sf->prune_motion_mode_level = 0; + inter_sf->prune_warp_using_wmtype = 0; + inter_sf->disable_wedge_interintra_search = 0; + inter_sf->fast_interintra_wedge_search = 0; + inter_sf->prune_comp_type_by_model_rd = 0; + inter_sf->perform_best_rd_based_gating_for_chroma = 0; + inter_sf->prune_obmc_prob_thresh = 0; + inter_sf->disable_obmc = 0; + inter_sf->disable_interinter_wedge = 0; + inter_sf->prune_ref_mv_idx_search = 0; + inter_sf->prune_warped_prob_thresh = 0; + inter_sf->reuse_compound_type_decision = 0; + inter_sf->txfm_rd_gate_level = 0; + inter_sf->prune_inter_modes_if_skippable = 0; +} + +static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) { + interp_sf->disable_filter_search_var_thresh = 0; + interp_sf->adaptive_interp_filter_search = 0; + interp_sf->use_fast_interpolation_filter_search = 0; + interp_sf->disable_dual_filter = 0; + interp_sf->use_interp_filter = 0; + interp_sf->skip_sharp_interp_filter_search = 0; +} + +static AOM_INLINE void 
init_intra_sf(INTRA_MODE_SPEED_FEATURES *intra_sf) { + intra_sf->skip_intra_in_interframe = 1; + intra_sf->intra_pruning_with_hog = 0; + intra_sf->src_var_thresh_intra_skip = 1; + intra_sf->prune_palette_search_level = 0; + + for (int i = 0; i < TX_SIZES; i++) { + intra_sf->intra_y_mode_mask[i] = INTRA_ALL; + intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL; + } + intra_sf->disable_smooth_intra = 0; +} + +static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) { + tx_sf->inter_tx_size_search_init_depth_sqr = 0; + tx_sf->inter_tx_size_search_init_depth_rect = 0; + tx_sf->intra_tx_size_search_init_depth_rect = 0; + tx_sf->intra_tx_size_search_init_depth_sqr = 0; + tx_sf->tx_size_search_lgr_block = 0; + tx_sf->model_based_prune_tx_search_level = 0; + tx_sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE; + tx_sf->tx_type_search.ml_tx_split_thresh = 8500; + tx_sf->tx_type_search.use_skip_flag_prediction = 1; + tx_sf->tx_type_search.use_reduced_intra_txset = 0; + tx_sf->tx_type_search.fast_intra_tx_type_search = 0; + tx_sf->tx_type_search.fast_inter_tx_type_search = 0; + tx_sf->tx_type_search.skip_tx_search = 0; + tx_sf->tx_type_search.prune_tx_type_using_stats = 0; + tx_sf->tx_type_search.prune_tx_type_est_rd = 0; + tx_sf->tx_type_search.enable_winner_mode_tx_type_pruning = 0; + tx_sf->txb_split_cap = 1; + tx_sf->adaptive_txb_search_level = 0; + tx_sf->use_intra_txb_hash = 0; + tx_sf->use_inter_txb_hash = 1; + tx_sf->refine_fast_tx_search_results = 1; +} + +static AOM_INLINE void init_rd_sf(RD_CALC_SPEED_FEATURES *rd_sf, + const AV1_COMP *cpi) { + if (cpi->oxcf.disable_trellis_quant == 3) { + rd_sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf) + ? NO_ESTIMATE_YRD_TRELLIS_OPT + : NO_TRELLIS_OPT; + } else if (cpi->oxcf.disable_trellis_quant == 2) { + rd_sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf) + ? 
FINAL_PASS_TRELLIS_OPT + : NO_TRELLIS_OPT; + } else if (cpi->oxcf.disable_trellis_quant == 0) { + if (is_lossless_requested(&cpi->oxcf)) { + rd_sf->optimize_coefficients = NO_TRELLIS_OPT; + } else { + rd_sf->optimize_coefficients = FULL_TRELLIS_OPT; + } + } else if (cpi->oxcf.disable_trellis_quant == 1) { + rd_sf->optimize_coefficients = NO_TRELLIS_OPT; + } else { + assert(0 && "Invalid disable_trellis_quant value"); + } + // TODO(sarahparker) Pair this with a speed setting once experiments are done + rd_sf->trellis_eob_fast = 0; + rd_sf->use_mb_rd_hash = 1; + rd_sf->optimize_b_precheck = 0; + rd_sf->use_fast_coef_costing = 0; + rd_sf->simple_model_rd_from_var = 0; + rd_sf->tx_domain_dist_level = 0; + rd_sf->tx_domain_dist_thres_level = 0; + rd_sf->use_hash_based_trellis = 0; + rd_sf->perform_coeff_opt = 0; +} + +static AOM_INLINE void init_winner_mode_sf( + WINNER_MODE_SPEED_FEATURES *winner_mode_sf) { + winner_mode_sf->motion_mode_for_winner_cand = 0; + // Set this at the appropriate speed levels + winner_mode_sf->tx_size_search_level = USE_FULL_RD; + winner_mode_sf->enable_winner_mode_for_coeff_opt = 0; + winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0; + winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0; + winner_mode_sf->enable_multiwinner_mode_process = 0; +} + +static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) { + lpf_sf->disable_loop_restoration_chroma = 0; + lpf_sf->prune_wiener_based_on_src_var = 0; + lpf_sf->prune_sgr_based_on_wiener = 0; + lpf_sf->enable_sgr_ep_pruning = 0; + lpf_sf->reduce_wiener_window_size = 0; + lpf_sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE; + lpf_sf->cdef_pick_method = CDEF_FULL_SEARCH; + // Set decoder side speed feature to use less dual sgr modes + lpf_sf->dual_sgr_penalty_level = 0; + lpf_sf->disable_lr_filter = 0; +} + +static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) { + rt_sf->mode_search_skip_flags = 0; + rt_sf->skip_interp_filter_search = 0; + rt_sf->force_tx_search_off = 0; + rt_sf->num_inter_modes_for_tx_search = INT_MAX; + rt_sf->use_simple_rd_model = 0; + rt_sf->nonrd_check_partition_merge_mode = 0; + rt_sf->nonrd_check_partition_split = 0; +} + +void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) { + SPEED_FEATURES *const sf = &cpi->sf; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + if (oxcf->mode == GOOD) { + set_good_speed_feature_framesize_dependent(cpi, sf, speed); + } else if (oxcf->mode == REALTIME) { + set_rt_speed_feature_framesize_dependent(cpi, sf, speed); + } + + // This is only used in motion vector unit test. 
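+  // (Mode 1 forces the search to return the maximum legal subpel MV and
+  // mode 2 the minimum, letting the unit test exercise both extremes.)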
+ if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->mv_search_params.find_fractional_mv_step = av1_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->mv_search_params.find_fractional_mv_step = av1_return_min_sub_pixel_mv; + + MACROBLOCK *const x = &cpi->td.mb; + AV1_COMMON *const cm = &cpi->common; + x->min_partition_size = AOMMAX(sf->part_sf.default_min_partition_size, + dim_to_size(cpi->oxcf.min_partition_size)); + x->max_partition_size = AOMMIN(sf->part_sf.default_max_partition_size, + dim_to_size(cpi->oxcf.max_partition_size)); + x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size); + x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size); +} + +void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) { + AV1_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCK *const x = &cpi->td.mb; + WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + int i; + + init_hl_sf(&sf->hl_sf); + init_tpl_sf(&sf->tpl_sf); + init_gm_sf(&sf->gm_sf); + init_part_sf(&sf->part_sf); + init_mv_sf(&sf->mv_sf); + init_inter_sf(&sf->inter_sf); + init_interp_sf(&sf->interp_sf); + init_intra_sf(&sf->intra_sf); + init_tx_sf(&sf->tx_sf); + init_rd_sf(&sf->rd_sf, cpi); + init_winner_mode_sf(&sf->winner_mode_sf); + init_lpf_sf(&sf->lpf_sf); + init_rt_sf(&sf->rt_sf); + + if (oxcf->mode == GOOD) + set_good_speed_features_framesize_independent(cpi, sf, speed); + else if (oxcf->mode == REALTIME) + set_rt_speed_features_framesize_independent(cpi, sf, speed); + + if (!cpi->seq_params_locked) { + cpi->common.seq_params.enable_dual_filter &= + !sf->interp_sf.disable_dual_filter; + cpi->common.seq_params.enable_restoration &= !sf->lpf_sf.disable_lr_filter; + } + + // sf->part_sf.partition_search_breakout_dist_thr is set assuming max 64x64 + // blocks. Normalise this if the blocks are bigger. + if (MAX_SB_SIZE_LOG2 > 6) { + sf->part_sf.partition_search_breakout_dist_thr <<= + 2 * (MAX_SB_SIZE_LOG2 - 6); + } + + const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED); + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mv_sf.mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_speed][i].range; + sf->mv_sf.mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_speed][i].interval; + } + + // Update the mesh pattern of exhaustive motion search for intraBC + // Though intraBC mesh pattern is populated for all frame types, it is used + // only for intra frames of screen contents + for (i = 0; i < MAX_MESH_STEP; ++i) { + sf->mv_sf.intrabc_mesh_patterns[i].range = + intrabc_mesh_patterns[mesh_speed][i].range; + sf->mv_sf.intrabc_mesh_patterns[i].interval = + intrabc_mesh_patterns[mesh_speed][i].interval; + } + + // Slow quant, dct and trellis not worthwhile for first pass + // so make sure they are always turned off. + if (is_stat_generation_stage(cpi)) + sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT; + + // No recode or trellis for 1 pass. 
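+  // (Trellis was already forced off above for the stats-generation stage;
+  // this additionally disables the recode loop for single-pass encodes.)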
+ if (oxcf->pass == 0) sf->hl_sf.recode_loop = DISALLOW_RECODE; + + MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params; + if (sf->mv_sf.subpel_search_method == SUBPEL_TREE) { + mv_search_params->find_fractional_mv_step = av1_find_best_sub_pixel_tree; + } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED) { + mv_search_params->find_fractional_mv_step = + av1_find_best_sub_pixel_tree_pruned; + } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) { + mv_search_params->find_fractional_mv_step = + av1_find_best_sub_pixel_tree_pruned_more; + } else if (sf->mv_sf.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) { + mv_search_params->find_fractional_mv_step = + av1_find_best_sub_pixel_tree_pruned_evenmore; + } + + x->min_partition_size = AOMMAX(sf->part_sf.default_min_partition_size, + dim_to_size(cpi->oxcf.min_partition_size)); + x->max_partition_size = AOMMIN(sf->part_sf.default_max_partition_size, + dim_to_size(cpi->oxcf.max_partition_size)); + x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size); + x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size); + + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test == 1) + mv_search_params->find_fractional_mv_step = av1_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + mv_search_params->find_fractional_mv_step = av1_return_min_sub_pixel_mv; + + // assert ensures that tx_domain_dist_level is accessed correctly + assert(cpi->sf.rd_sf.tx_domain_dist_thres_level >= 0 && + cpi->sf.rd_sf.tx_domain_dist_thres_level < 3); + memcpy(winner_mode_params->tx_domain_dist_threshold, + tx_domain_dist_thresholds[cpi->sf.rd_sf.tx_domain_dist_thres_level], + sizeof(winner_mode_params->tx_domain_dist_threshold)); + + assert(cpi->sf.rd_sf.tx_domain_dist_level >= 0 && + cpi->sf.rd_sf.tx_domain_dist_level < 3); + memcpy(winner_mode_params->use_transform_domain_distortion, + tx_domain_dist_types[cpi->sf.rd_sf.tx_domain_dist_level], + sizeof(winner_mode_params->use_transform_domain_distortion)); + + // assert ensures that coeff_opt_dist_thresholds is accessed correctly + assert(cpi->sf.rd_sf.perform_coeff_opt >= 0 && + cpi->sf.rd_sf.perform_coeff_opt < 6); + memcpy(winner_mode_params->coeff_opt_dist_threshold, + coeff_opt_dist_thresholds[cpi->sf.rd_sf.perform_coeff_opt], + sizeof(winner_mode_params->coeff_opt_dist_threshold)); + + // assert ensures that predict_skip_levels is accessed correctly + assert(cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction >= 0 && + cpi->sf.tx_sf.tx_type_search.use_skip_flag_prediction < 3); + memcpy(winner_mode_params->predict_skip_level, + predict_skip_levels[cpi->sf.tx_sf.tx_type_search + .use_skip_flag_prediction], + sizeof(winner_mode_params->predict_skip_level)); + + // assert ensures that tx_size_search_level is accessed correctly + assert(cpi->sf.winner_mode_sf.tx_size_search_level >= 0 && + cpi->sf.winner_mode_sf.tx_size_search_level < 3); + memcpy(winner_mode_params->tx_size_search_methods, + tx_size_search_methods[cpi->sf.winner_mode_sf.tx_size_search_level], + sizeof(winner_mode_params->tx_size_search_methods)); + + if (cpi->oxcf.row_mt == 1 && (cpi->oxcf.max_threads > 1)) { + if (sf->inter_sf.inter_mode_rd_model_estimation == 1) { + // Revert to type 2 + sf->inter_sf.inter_mode_rd_model_estimation = 2; + } + } +} + +// Override some speed features based on qindex +void av1_set_speed_features_qindex_dependent(AV1_COMP *cpi, int speed) { + AV1_COMMON *const cm = 
&cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params; + const int boosted = frame_is_boosted(cpi); + const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; + if (is_720p_or_larger && cpi->oxcf.mode == GOOD && speed == 0) { + if (cm->quant_params.base_qindex <= 80) { + sf->rd_sf.perform_coeff_opt = 2; + memcpy(winner_mode_params->coeff_opt_dist_threshold, + coeff_opt_dist_thresholds[sf->rd_sf.perform_coeff_opt], + sizeof(winner_mode_params->coeff_opt_dist_threshold)); + sf->part_sf.simple_motion_search_split = + cm->features.allow_screen_content_tools ? 1 : 2; + sf->tx_sf.inter_tx_size_search_init_depth_rect = 1; + sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1; + sf->tx_sf.intra_tx_size_search_init_depth_rect = 1; + } + } + + if (cpi->oxcf.mode == GOOD && speed >= 3) { + // Disable extended partitions for lower quantizers + if (cm->quant_params.base_qindex <= 100 && + !cm->features.allow_screen_content_tools && !boosted) { + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } + } + + if (cpi->oxcf.mode == GOOD && speed >= 4) { + // Disable extended partitions for lower quantizers + const int qindex_thresh = boosted ? 80 : 120; + if (cm->quant_params.base_qindex <= qindex_thresh && + !cm->features.allow_screen_content_tools && + !frame_is_intra_only(&cpi->common)) { + sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128; + } + } +} diff --git a/libs/libaom/src/av1/encoder/speed_features.h b/libs/libaom/src/av1/encoder/speed_features.h new file mode 100644 index 000000000..d12c3c02e --- /dev/null +++ b/libs/libaom/src/av1/encoder/speed_features.h @@ -0,0 +1,1034 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_ +#define AOM_AV1_ENCODER_SPEED_FEATURES_H_ + +#include "av1/common/enums.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_MESH_STEP 4 + +typedef struct MESH_PATTERN { + int range; + int interval; +} MESH_PATTERN; + +enum { + GM_FULL_SEARCH, + GM_REDUCED_REF_SEARCH_SKIP_L2_L3, + GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2, + GM_DISABLE_SEARCH +} UENUM1BYTE(GM_SEARCH_TYPE); + +enum { + GM_ERRORADV_TR_0, + GM_ERRORADV_TR_1, + GM_ERRORADV_TR_2, + GM_ERRORADV_TR_TYPES, +} UENUM1BYTE(GM_ERRORADV_TYPE); + +enum { + FULL_TXFM_RD, + LOW_TXFM_RD, +} UENUM1BYTE(TXFM_RD_MODEL); + +enum { + DIST_WTD_COMP_ENABLED, + DIST_WTD_COMP_SKIP_MV_SEARCH, + DIST_WTD_COMP_DISABLED, +} UENUM1BYTE(DIST_WTD_COMP_FLAG); + +enum { + INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) | + (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) | + (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) | + (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED), + UV_INTRA_ALL = + (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) | + (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) | + (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) | + (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) | + (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC = (1 << UV_DC_PRED), + UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED), + UV_INTRA_DC_PAETH_CFL = + (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED), + UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | + (1 << UV_H_PRED) | (1 << UV_CFL_PRED), + UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | + (1 << UV_V_PRED) | (1 << UV_H_PRED), + UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | + (1 << UV_V_PRED) | (1 << UV_H_PRED) | + (1 << UV_CFL_PRED), + INTRA_DC = (1 << DC_PRED), + INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED), + INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED), + INTRA_DC_PAETH_H_V = + (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED) +}; + +enum { + INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | + (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | + (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) | + (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV), + INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) | + (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) | + (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) | + (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | + (1 << NEAR_NEARMV), +}; + +enum { + DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | + (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST), + + DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT, + + DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA), + + LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) | + (1 << THR_ALTR) | (1 << THR_GOLD) +}; + +enum { + TXFM_CODING_SF = 1, + INTER_PRED_SF = 2, + INTRA_PRED_SF = 4, + PARTITION_SF = 8, + LOOP_FILTER_SF = 16, + RD_SKIP_SF = 32, + RESERVE_2_SF = 64, + RESERVE_3_SF = 128, +} UENUM1BYTE(DEV_SPEED_FEATURES); + +enum { + // No recode. + DISALLOW_RECODE = 0, + // Allow recode for KF and exceeding maximum frame bandwidth. 
+ ALLOW_RECODE_KFMAXBW = 1, + // Allow recode only for KF/ARF/GF frames. + ALLOW_RECODE_KFARFGF = 2, + // Allow recode for all frames based on bitrate constraints. + ALLOW_RECODE = 3, +} UENUM1BYTE(RECODE_LOOP_TYPE); + +enum { + SUBPEL_TREE = 0, + SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches + SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively + SUBPEL_TREE_PRUNED_EVENMORE = 3, // Prunes 1/2- and 1/4-pel searches + // Other methods to come +} UENUM1BYTE(SUBPEL_SEARCH_METHODS); + +enum { + USE_FULL_RD = 0, + USE_FAST_RD, + USE_LARGESTALL, +} UENUM1BYTE(TX_SIZE_SEARCH_METHOD); + +enum { + // Try the full image with different values. + LPF_PICK_FROM_FULL_IMAGE, + // Try the full image filter search with non-dual filter only. + LPF_PICK_FROM_FULL_IMAGE_NON_DUAL, + // Try a small portion of the image with different values. + LPF_PICK_FROM_SUBIMAGE, + // Estimate the level based on quantizer and frame type + LPF_PICK_FROM_Q, + // Pick 0 to disable LPF if LPF was enabled last frame + LPF_PICK_MINIMAL_LPF +} UENUM1BYTE(LPF_PICK_METHOD); + +enum { + CDEF_FULL_SEARCH, + CDEF_FAST_SEARCH_LVL1, // Search among a subset of all possible filters. + CDEF_FAST_SEARCH_LVL2, // Search reduced subset of filters than Level 1. + CDEF_PICK_FROM_Q, // Estimate filter strength based on quantizer. + CDEF_PICK_METHODS +} UENUM1BYTE(CDEF_PICK_METHOD); + +enum { + // Terminate search early based on distortion so far compared to + // qp step, distortion in the neighborhood of the frame, etc. + FLAG_EARLY_TERMINATE = 1 << 0, + + // Skips comp inter modes if the best so far is an intra mode. + FLAG_SKIP_COMP_BESTINTRA = 1 << 1, + + // Skips oblique intra modes if the best so far is an inter mode. + FLAG_SKIP_INTRA_BESTINTER = 1 << 3, + + // Skips oblique intra modes at angles 27, 63, 117, 153 if the best + // intra so far is not one of the neighboring directions. + FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4, + + // Skips intra modes other than DC_PRED if the source variance is small + FLAG_SKIP_INTRA_LOWVAR = 1 << 5, +} UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC); + +enum { + NO_PRUNE = 0, + // adaptively prunes the least perspective tx types out of all 16 + // (tuned to provide negligible quality loss) + PRUNE_2D_ACCURATE = 1, + // similar, but applies much more aggressive pruning to get better speed-up + PRUNE_2D_FAST = 2, + PRUNE_2D_MORE = 3, + // More aggressive pruning based on tx type score and allowed tx count + PRUNE_2D_AGGRESSIVE = 4, +} UENUM1BYTE(TX_TYPE_PRUNE_MODE); + +typedef struct { + TX_TYPE_PRUNE_MODE prune_mode; + int fast_intra_tx_type_search; + int fast_inter_tx_type_search; + + // prune two least frequently chosen transforms for each intra mode + int use_reduced_intra_txset; + + // Use a skip flag prediction model to detect blocks with skip = 1 early + // and avoid doing full TX type search for such blocks. + int use_skip_flag_prediction; + + // Threshold used by the ML based method to predict TX block split decisions. + int ml_tx_split_thresh; + + // skip remaining transform type search when we found the rdcost of skip is + // better than applying transform + int skip_tx_search; + + // Prune tx type search using previous frame stats. + int prune_tx_type_using_stats; + // Prune tx type search using estimated RDcost + int prune_tx_type_est_rd; + + // Flag used to control the winner mode processing for tx type pruning for + // inter blocks. It enables further tx type mode pruning based on ML model for + // mode evaluation and disables tx type mode pruning for winner mode + // processing. 
+  int enable_winner_mode_tx_type_pruning;
+} TX_TYPE_SEARCH;
+
+enum {
+  // Search partitions using RD criterion
+  SEARCH_PARTITION,
+
+  // Always use a fixed size partition
+  FIXED_PARTITION,
+
+  REFERENCE_PARTITION,
+
+  VAR_BASED_PARTITION
+} UENUM1BYTE(PARTITION_SEARCH_TYPE);
+
+enum {
+  NOT_IN_USE,
+  DIRECT_PRED,
+  RELAXED_PRED,
+  ADAPT_PRED
+} UENUM1BYTE(MAX_PART_PRED_MODE);
+
+enum {
+  LAST_MV_DATA,
+  CURRENT_Q,
+  QTR_ONLY,
+} UENUM1BYTE(MV_PREC_LOGIC);
+
+typedef struct HIGH_LEVEL_SPEED_FEATURES {
+  // Frame level coding parameter update
+  int frame_parameter_update;
+
+  RECODE_LOOP_TYPE recode_loop;
+
+  // This feature controls the tolerance vs target used in deciding whether to
+  // recode a frame. It has no meaning if recode is disabled.
+  int recode_tolerance;
+
+  // Determine how motion vector precision is chosen. The possibilities are:
+  // LAST_MV_DATA: use the mv data from the last coded frame
+  // CURRENT_Q: use the current q as a threshold
+  // QTR_ONLY: use quarter pel precision only.
+  MV_PREC_LOGIC high_precision_mv_usage;
+
+  // Whether to disable overlay frames for filtered Altref frames,
+  // overriding oxcf->enable_overlay flag set as 1.
+  int disable_overlay_frames;
+
+  // Enable/disable adaptively deciding whether or not to encode ALTREF overlay
+  // frame.
+  int adaptive_overlay_encoding;
+
+  // Always set to 0. If on it enables 0 cost background transmission
+  // (except for the initial transmission of the segmentation). The feature is
+  // disabled because the addition of very large block sizes makes the
+  // backgrounds very cheap to encode, and the segmentation we have
+  // adds overhead.
+  int static_segmentation;
+
+  // Enable/disable second_alt_ref temporal filtering.
+  int second_alt_ref_filtering;
+} HIGH_LEVEL_SPEED_FEATURES;
+
+typedef struct TPL_SPEED_FEATURES {
+  // Prune the intra modes search by tpl.
+  // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED.
+  // If set to 1, we only search DC_PRED, V_PRED, and H_PRED.
+  int prune_intra_modes;
+  // This parameter controls which step in the n-step process we start at.
+  int reduce_first_step_size;
+  // Skip motion estimation based on the precision of center MVs and the
+  // difference between center MVs.
+  // If set to 0, motion estimation is skipped for duplicate center MVs
+  // (default). If set to 1, motion estimation is skipped for duplicate
+  // full-pixel center MVs. If set to 2, motion estimation is skipped if the
+  // difference between center MVs is less than the threshold.
+  int skip_alike_starting_mv;
+
+  // When to stop subpel search.
+  SUBPEL_FORCE_STOP subpel_force_stop;
+} TPL_SPEED_FEATURES;
+
+typedef struct GLOBAL_MOTION_SPEED_FEATURES {
+  // Global motion warp error threshold
+  GM_ERRORADV_TYPE gm_erroradv_type;
+
+  // Disable adaptive threshold for global motion warp error
+  int disable_adaptive_warp_error_thresh;
+
+  // Do not compute the global motion parameters for a LAST2_FRAME or
+  // LAST3_FRAME if the GOLDEN_FRAME is closer and it has a non-identity
+  // global model.
+  int selective_ref_gm;
+
+  GM_SEARCH_TYPE gm_search_type;
+
+  // Whether to disable the global motion recode loop.
+  int gm_disable_recode;
+
+  // During global motion estimation, prune remaining reference frames in a
+  // given direction (past/future), if the evaluated ref_frame in that
+  // direction yields gm_type as INVALID/TRANSLATION/IDENTITY.
+  int prune_ref_frame_for_gm_search;
+} GLOBAL_MOTION_SPEED_FEATURES;
+
+typedef struct PARTITION_SPEED_FEATURES {
+  PARTITION_SEARCH_TYPE partition_search_type;
+
+  // Used if partition_search_type = FIXED_PARTITION
+  BLOCK_SIZE always_this_block_size;
+
+  // Prune extended partition types search
+  // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing
+  // aggressiveness of pruning in order.
+  int prune_ext_partition_types_search_level;
+
+  // Use an ML model to prune horz and vert partitions
+  int ml_prune_rect_partition;
+
+  // Use an ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
+  int ml_prune_ab_partition;
+
+  // Use an ML model to prune horz4 and vert4 partitions.
+  int ml_prune_4_partition;
+
+  // Use an ML model to adaptively terminate partition search after trying
+  // PARTITION_SPLIT. Can take values 0 - 2, 0 meaning not being enabled, and
+  // 1 - 2 increasing aggressiveness in order.
+  int ml_early_term_after_part_split_level;
+
+  // Skip rectangular partition test when partition type none gives better
+  // rd than partition type split. Can take values 0 - 2, 0 referring to no
+  // skipping, and 1 - 2 increasing aggressiveness of skipping in order.
+  int less_rectangular_check_level;
+
+  // Use square partition only beyond this block size.
+  BLOCK_SIZE use_square_partition_only_threshold;
+
+  // Sets min and max square partition levels for this superblock based on
+  // motion vector and prediction error distribution produced from 16x16
+  // simple motion search
+  MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion;
+  int auto_min_partition_based_on_simple_motion;
+
+  // Min and max square partition size we enable (block_size) as per auto
+  // min max, but also used by adjust partitioning, and pick_partitioning.
+  BLOCK_SIZE default_min_partition_size;
+  BLOCK_SIZE default_max_partition_size;
+
+  // Sets level of adjustment of variance-based partitioning during
+  // rd_use_partition: 0 - no partition adjustment, 1 - try to merge partitions
+  // for small blocks and high QP, 2 - always try to merge leaf partitions, 3 -
+  // try to merge and split leaf partitions
+  int adjust_var_based_rd_partitioning;
+
+  // Partition search early breakout thresholds.
+  int64_t partition_search_breakout_dist_thr;
+  int partition_search_breakout_rate_thr;
+
+  // Thresholds for ML based partition search breakout.
+  int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
+
+  // Allow skipping partition search for still image frame
+  int allow_partition_search_skip;
+
+  // The aggressiveness of pruning with simple_motion_search.
+  // Currently 0 is the lowest, and 2 the highest.
+  int simple_motion_search_prune_agg;
+
+  // Perform simple_motion_search on each possible subblock and use it to prune
+  // PARTITION_HORZ and PARTITION_VERT.
+  int simple_motion_search_prune_rect;
+
+  // Perform simple motion search before none_partition to decide if we
+  // want to remove all partitions other than PARTITION_SPLIT. If set to 0, this
+  // model is disabled. If set to 1, the model attempts to perform
+  // PARTITION_SPLIT only. If set to 2, the model also attempts to prune
+  // PARTITION_SPLIT.
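+  // (For example, av1_set_speed_features_qindex_dependent() sets this to 1
+  // for screen content and to 2 otherwise on 720p-or-larger, speed-0 frames
+  // at low qindex.)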
+ int simple_motion_search_split; + + // Use features from simple_motion_search to terminate prediction block + // partition after PARTITION_NONE + int simple_motion_search_early_term_none; + + // This variable controls the maximum block size where intra blocks can be + // used in inter frames. + // TODO(aconverse): Fold this into one of the other many mode skips + BLOCK_SIZE max_intra_bsize; + + // Use CNN with luma pixels on source frame on each of the 64x64 subblock to + // perform split/no_split decision on intra-frames. + int intra_cnn_split; + + // Disable extended partition search for lower block sizes. + int ext_partition_eval_thresh; + + // Prune 1:4 partition search based on winner info from split partitions + int prune_4_partition_using_split_info; + + // Prune AB partition search using split and HORZ/VERT info + int prune_ab_partition_using_split_info; +} PARTITION_SPEED_FEATURES; + +typedef struct MV_SPEED_FEATURES { + // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). + SEARCH_METHODS search_method; + + // If this is set to 1, we limit the motion search range to 2 times the + // largest motion vector found in the last frame. + int auto_mv_step_size; + + // Subpel_search_method can only be subpel_tree which does a subpixel + // logarithmic search that keeps stepping at 1/2 pixel units until + // you stop getting a gain, and then goes on to 1/4 and repeats + // the same process. Along the way it skips many diagonals. + SUBPEL_SEARCH_METHODS subpel_search_method; + + // Maximum number of steps in logarithmic subpel search before giving up. + int subpel_iters_per_step; + + // When to stop subpel search. + SUBPEL_FORCE_STOP subpel_force_stop; + + // When to stop subpel search in simple motion search. + SUBPEL_FORCE_STOP simple_motion_subpel_force_stop; + + // If true, sub-pixel search uses the exact convolve function used for final + // encoding and decoding; otherwise, it uses bilinear interpolation. + SUBPEL_SEARCH_TYPE use_accurate_subpel_search; + + // TODO(jingning): combine the related motion search speed features + // This allows us to use motion search at other sizes as a starting + // point for this motion search and limits the search range around it. + int adaptive_motion_search; + + // Threshold for allowing exhaustive motion search. + int exhaustive_searches_thresh; + + // Pattern to be used for any exhaustive mesh searches (except intraBC ME). + MESH_PATTERN mesh_patterns[MAX_MESH_STEP]; + + // Pattern to be used for exhaustive mesh searches of intraBC ME. + MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_STEP]; + + // Reduce single motion search range based on MV result of prior ref_mv_idx. + int reduce_search_range; + + // Prune mesh search. + int prune_mesh_search; + + // Use the rd cost around the best FULLPEL_MV to speed up subpel search + int use_fullpel_costlist; + + // Set the full pixel search level of obmc + // 0: obmc_full_pixel_diamond + // 1: obmc_refining_search_sad (faster) + int obmc_full_pixel_search_level; + + // Accurate full pixel motion search based on TPL stats. + int full_pixel_search_level; +} MV_SPEED_FEATURES; + +typedef struct INTER_MODE_SPEED_FEATURES { + // 2-pass inter mode model estimation where the preliminary pass skips + // transform search and uses a model to estimate rd, while the final pass + // computes the full transform search. 
Two types of models are supported: + // 0: not used + // 1: used with online dynamic rd model + // 2: used with static rd model + int inter_mode_rd_model_estimation; + + // Bypass transform search based on skip rd + int txfm_rd_gate_level; + + // Limit the inter mode tested in the RD loop + int reduce_inter_modes; + + // Adaptive prediction mode search + int adaptive_mode_search; + + // This variable is used to cap the maximum number of times we skip testing a + // mode to be evaluated. A high value means we will be faster. + int adaptive_rd_thresh; + + // Aggressively prune inter modes when best mode is skippable. + int prune_inter_modes_if_skippable; + + // Drop less likely to be picked reference frames in the RD search. + // Has five levels for now: 0, 1, 2, 3 and 4, where higher levels prune more + // aggressively than lower ones. (0 means no pruning). + int selective_ref_frame; + + // Prune reference frames for rectangular partitions. + // 0 implies no pruning + // 1 implies prune for extended partition + // 2 implies prune horiz, vert and extended partition + int prune_ref_frame_for_rect_partitions; + + int alt_ref_search_fp; + + // flag to skip NEWMV mode in drl if the motion search result is the same + int skip_repeated_newmv; + + // Skip the current ref_mv in NEW_MV mode if we have already encountered + // another ref_mv in the drl such that: + // 1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION + // search process as the current fullpel_mv. + // 2. The rate needed to encode the current fullpel_mv is larger than that + // for the other ref_mv. + int skip_repeated_full_newmv; + + // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV, + // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found + // TODO(any): Instead of skipping repeated ref mv, use the recalculated + // rd-cost based on mode rate and skip the mode evaluation + int skip_repeated_ref_mv; + + // Flag used to control the ref_best_rd based gating for chroma + int perform_best_rd_based_gating_for_chroma; + + // Skip certain motion modes (OBMC, warped, interintra) for single reference + // motion search, using the results of single ref SIMPLE_TRANSLATION + int prune_single_motion_modes_by_simple_trans; + + // Reuse the inter_intra_mode search result from NEARESTMV mode to other + // single ref modes + int reuse_inter_intra_mode; + + // prune wedge and compound segment approximate rd evaluation based on + // compound average modeled rd + int prune_comp_type_by_model_rd; + + // prune wedge and compound segment approximate rd evaluation based on + // compound average rd/ref_best_rd + int prune_comp_type_by_comp_avg; + + // Skip some ref frames in compound motion search by single motion search + // result. Has three levels for now: 0 referring to no skipping, and 1 - 3 + // increasing aggressiveness of skipping in order. + // Note: The search order might affect the result. It assumes that the single + // reference modes are searched before compound modes. It is better to search + // same single inter mode as a group. + int prune_comp_search_by_single_result; + + // If 1 we iterate finding a best reference for 2 ref frames together - via + // a log search that iterates 4 times (check around mv for last for best + // error of combined predictor then check around mv for alt). If 0 we + // we just use the best motion vector found for each frame by itself. 
+ BLOCK_SIZE comp_inter_joint_search_thresh; + + // Instead of performing a full MV search, do a simple translation first + // and only perform a full MV search on the motion vectors that performed + // well. + int prune_mode_search_simple_translation; + + // Only search compound modes with at least one "good" reference frame. + // A reference frame is good if, after looking at its performance among + // the single reference modes, it is one of the two best performers. + int prune_compound_using_single_ref; + + // Skip extended compound mode using ref frames of above and left neighbor + // blocks. + // 0 : no pruning + // 1 : prune extended compound mode (less aggressiveness) + // 2 : prune extended compound mode (high aggressiveness) + int prune_compound_using_neighbors; + + // Based on previous ref_mv_idx search result, prune the following search. + int prune_ref_mv_idx_search; + + // Disable one sided compound modes. + int disable_onesided_comp; + + // Prune/gate motion mode evaluation based on token based rd + // during transform search for inter blocks + // Values are 0 (not used) , 1 - 3 with progressively increasing + // aggressiveness + int prune_motion_mode_level; + + // Prune obmc search using previous frame stats. + int prune_obmc_prob_thresh; + + // Disable obmc. + int disable_obmc; + + // Gate warp evaluation for motions of type IDENTITY, + // TRANSLATION and AFFINE(based on number of warp neighbors) + int prune_warp_using_wmtype; + + // Prune warped motion search using previous frame stats. + int prune_warped_prob_thresh; + + // Enable/disable interintra wedge search. + int disable_wedge_interintra_search; + + // De-couple wedge and mode search during interintra RDO. + int fast_interintra_wedge_search; + + // Only enable wedge search if the edge strength is greater than + // this threshold. A value of 0 signals that this check is disabled. + unsigned int disable_wedge_search_edge_thresh; + + // Only enable wedge search if the variance is above this threshold. + unsigned int disable_wedge_search_var_thresh; + + // Whether fast wedge sign estimate is used + int fast_wedge_sign_estimate; + + // Whether to prune wedge search based on predictor difference + int prune_wedge_pred_diff_based; + + // Enable/disable ME for interinter wedge search. + int disable_interinter_wedge_newmv_search; + + // Enable/disable ME for interinter diffwtd search. PSNR BD-rate gain of + // ~0.1 on the lowres test set, but ~15% slower computation. + int enable_interinter_diffwtd_newmv_search; + + // Enable/disable smooth inter-intra mode + int disable_smooth_interintra; + + // Disable interinter_wedge + int disable_interinter_wedge; + + // Decide when and how to use joint_comp. + DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag; + + // Whether to override and disable sb level coeff cost updates, if + // cpi->oxcf.coeff_cost_upd_freq = COST_UPD_SB (i.e. set at SB level) + int disable_sb_level_coeff_cost_upd; + + // Whether to override and disable sb level mv cost updates, if + // cpi->oxcf.coeff_cost_upd_freq = COST_UPD_SB (i.e. set at SB level) + int disable_sb_level_mv_cost_upd; + + // Prune inter modes based on tpl stats + // 0 : no pruning + // 1 - 3 indicate increasing aggressiveness in order. 
+ int prune_inter_modes_based_on_tpl; + + // Model based breakout after interpolation filter search + // 0: no breakout + // 1: use model based rd breakout + int model_based_post_interp_filter_breakout; + + // Reuse compound type rd decision when exact match is found + // 0: No reuse + // 1: Reuse the compound type decision + int reuse_compound_type_decision; +} INTER_MODE_SPEED_FEATURES; + +typedef struct INTERP_FILTER_SPEED_FEATURES { + // A source variance threshold below which filter search is disabled + // Choose a very large value (UINT_MAX) to use 8-tap always + unsigned int disable_filter_search_var_thresh; + + // Do limited interpolation filter search for dual filters, since best choice + // usually includes EIGHTTAP_REGULAR. + int use_fast_interpolation_filter_search; + + // Disable dual filter + int disable_dual_filter; + + // Save results of av1_interpolation_filter_search for a block + // Check mv and ref_frames before search, if they are very close with previous + // saved results, filter search can be skipped. + int use_interp_filter; + + // skip sharp_filter evaluation based on regular and smooth filter rd for + // dual_filter=0 case + int skip_sharp_interp_filter_search; + + int cb_pred_filter_search; + + // adaptive interp_filter search to allow skip of certain filter types. + int adaptive_interp_filter_search; +} INTERP_FILTER_SPEED_FEATURES; + +typedef struct INTRA_MODE_SPEED_FEATURES { + // These bit masks allow you to enable or disable intra modes for each + // transform size separately. + int intra_y_mode_mask[TX_SIZES]; + int intra_uv_mode_mask[TX_SIZES]; + + // flag to allow skipping intra mode for inter frame prediction + int skip_intra_in_interframe; + + // variance threshold for intra mode gating when inter turned out to be skip + // in inter frame prediction + unsigned int src_var_thresh_intra_skip; + + // Prune intra mode candidates based on source block histogram of gradient. + int intra_pruning_with_hog; + + // TODO(anyone): tune intra_pruning_with_hog_thresh for various speeds. + float intra_pruning_with_hog_thresh; + + // Enable/disable smooth intra modes. + int disable_smooth_intra; + + // prune palette search + // 0: No pruning + // 1: Perform coarse search to prune the palette colors. For winner colors, + // neighbors are also evaluated using a finer search. + // 2: Perform 2 way palette search from max colors to min colors (and min + // colors to remaining colors) and terminate the search if current number of + // palette colors is not the winner. + int prune_palette_search_level; +} INTRA_MODE_SPEED_FEATURES; + +typedef struct TX_SPEED_FEATURES { + // Init search depth for square and rectangular transform partitions. + // Values: + // 0 - search full tree, 1: search 1 level, 2: search the highest level only + int inter_tx_size_search_init_depth_sqr; + int inter_tx_size_search_init_depth_rect; + int intra_tx_size_search_init_depth_sqr; + int intra_tx_size_search_init_depth_rect; + + // If any dimension of a coding block size above 64, always search the + // largest transform only, since the largest transform block size is 64x64. + int tx_size_search_lgr_block; + + TX_TYPE_SEARCH tx_type_search; + + // Skip split transform block partition when the collocated bigger block + // is selected as all zero coefficients. + int txb_split_cap; + + // Shortcut the transform block partition and type search when the target + // rdcost is relatively lower. 
+ // Values are 0 (not used) , or 1 - 2 with progressively increasing + // aggressiveness + int adaptive_txb_search_level; + + // Prune level for tx_size_type search for inter based on rd model + // 0: no pruning + // 1-2: progressively increasing aggressiveness of pruning + int model_based_prune_tx_search_level; + + // Use hash table to store intra(keyframe only) txb transform search results + // to avoid repeated search on the same residue signal. + int use_intra_txb_hash; + + // Use hash table to store inter txb transform search results + // to avoid repeated search on the same residue signal. + int use_inter_txb_hash; + + // Refine TX type after fast TX search. + int refine_fast_tx_search_results; +} TX_SPEED_FEATURES; + +typedef struct RD_CALC_SPEED_FEATURES { + // This feature controls whether we do the expensive context update and + // calculation in the rd coefficient costing loop. + int use_fast_coef_costing; + + // Fast approximation of av1_model_rd_from_var_lapndz + int simple_model_rd_from_var; + + // Whether to compute distortion in the image domain (slower but + // more accurate), or in the transform domain (faster but less acurate). + // 0: use image domain + // 1: use transform domain in tx_type search, and use image domain for + // RD_STATS + // 2: use transform domain + int tx_domain_dist_level; + + // Transform domain distortion threshold level + int tx_domain_dist_thres_level; + + // Trellis (dynamic programming) optimization of quantized values + TRELLIS_OPT_TYPE optimize_coefficients; + + // Use a hash table to store previously computed optimized qcoeffs from + // expensive calls to optimize_txb. + int use_hash_based_trellis; + + // Use hash table to store macroblock RD search results + // to avoid repeated search on the same residue signal. + int use_mb_rd_hash; + + // Flag used to control the speed of the eob selection in trellis. + int trellis_eob_fast; + + // Calculate RD cost before doing optimize_b, and skip if the cost is large. + int optimize_b_precheck; + + // Flag used to control the extent of coeff R-D optimization + int perform_coeff_opt; +} RD_CALC_SPEED_FEATURES; + +typedef struct WINNER_MODE_SPEED_FEATURES { + // Flag used to control the winner mode processing for better R-D optimization + // of quantized coeffs + int enable_winner_mode_for_coeff_opt; + + // Flag used to control the winner mode processing for transform size + // search method + int enable_winner_mode_for_tx_size_srch; + + // Control transform size search level + // Eval type: Default Mode Winner + // Level 0 : FULL RD LARGEST ALL FULL RD + // Level 1 : FAST RD LARGEST ALL FULL RD + // Level 2 : LARGEST ALL LARGEST ALL FULL RD + int tx_size_search_level; + + // Flag used to control the winner mode processing for use transform + // domain distortion + int enable_winner_mode_for_use_tx_domain_dist; + + // Flag used to enable processing of multiple winner modes + int enable_multiwinner_mode_process; + + // Motion mode for winner candidates: + // 0: speed feature OFF + // 1 / 2 : Use configured number of winner candidates + int motion_mode_for_winner_cand; +} WINNER_MODE_SPEED_FEATURES; + +typedef struct LOOP_FILTER_SPEED_FEATURES { + // This feature controls how the loop filter level is determined. + LPF_PICK_METHOD lpf_pick; + + // Control how the CDEF strength is determined. + CDEF_PICK_METHOD cdef_pick_method; + + // Decoder side speed feature to add penalty for use of dual-sgr filters. 
+ // Takes values 0 - 10, 0 indicating no penalty and each additional level + // adding a penalty of 1% + int dual_sgr_penalty_level; + + // prune sgr ep using binary search like mechanism + int enable_sgr_ep_pruning; + + // Disable loop restoration for Chroma plane + int disable_loop_restoration_chroma; + + // Prune RESTORE_WIENER evaluation based on source variance + // 0 : no pruning + // 1 : conservative pruning + // 2 : aggressive pruning + int prune_wiener_based_on_src_var; + + // Prune self-guided loop restoration based on wiener search results + // 0 : no pruning + // 1 : pruning based on rdcost ratio of RESTORE_WIENER and RESTORE_NONE + // 2 : pruning based on winner restoration type among RESTORE_WIENER and + // RESTORE_NONE + int prune_sgr_based_on_wiener; + + // Reduce the wiener filter win size for luma + int reduce_wiener_window_size; + + // Disable loop restoration filter + int disable_lr_filter; +} LOOP_FILTER_SPEED_FEATURES; + +typedef struct REAL_TIME_SPEED_FEATURES { + // check intra prediction for non-RD mode. + int check_intra_pred_nonrd; + + // Perform coarse ME before calculating variance in variance-based partition + int estimate_motion_for_var_based_partition; + + // For nonrd_use_partition: mode of extra check of leaf partition + // 0 - don't check merge + // 1 - always check merge + // 2 - check merge and prune checking final split + int nonrd_check_partition_merge_mode; + + // For nonrd_use_partition: check of leaf partition extra split + int nonrd_check_partition_split; + + // Implements various heuristics to skip searching modes + // The heuristics selected are based on flags + // defined in the MODE_SEARCH_SKIP_HEURISTICS enum + unsigned int mode_search_skip_flags; + + // For nonrd: Reduces ref frame search. + // 0 - low level of search prune in non last frames + // 1 - pruned search in non last frames + // 2 - more pruned search in non last frames + int nonrd_prune_ref_frame_search; + + // This flag controls the use of non-RD mode decision. + int use_nonrd_pick_mode; + + // Use ALTREF frame in non-RD mode decision. + int use_nonrd_altref_frame; + + // Use compound reference for non-RD mode. + int use_comp_ref_nonrd; + + // use reduced ref set for real-time mode + int use_real_time_ref_set; + + // Skip a number of expensive mode evaluations for blocks with very low + // temporal variance. + int short_circuit_low_temp_var; + + // Use modeled (currently CurvFit model) RDCost for fast non-RD mode + int use_modeled_non_rd_cost; + + // Reuse inter prediction in fast non-rd mode. + int reuse_inter_pred_nonrd; + + // Number of best inter modes to search transform. INT_MAX - search all. + int num_inter_modes_for_tx_search; + + // Forces TX search off for RDCost calulation. + int force_tx_search_off; + + // Use interpolation filter search in non-RD mode decision. + int use_nonrd_filter_search; + + // Use simplified RD model for interpolation search and Intra + int use_simple_rd_model; + + // If set forces interpolation filter to EIGHTTAP_REGULAR + int skip_interp_filter_search; + + // Use hybrid (rd for bsize < 16x16, otherwise nonrd) intra search for intra + // only frames. + int hybrid_intra_pickmode; + + // Compute variance/sse on source difference, prior to encoding superblock. + int source_metrics_sb_nonrd; +} REAL_TIME_SPEED_FEATURES; + +typedef struct SPEED_FEATURES { + /* + * Sequence/frame level speed features: + */ + HIGH_LEVEL_SPEED_FEATURES hl_sf; + + /* + * Speed features related to how tpl's searches are done. 
+ */ + TPL_SPEED_FEATURES tpl_sf; + + /* + * Global motion speed features: + */ + GLOBAL_MOTION_SPEED_FEATURES gm_sf; + + /* + * Partition search speed features: + */ + PARTITION_SPEED_FEATURES part_sf; + + /* + * Motion search speed features: + */ + MV_SPEED_FEATURES mv_sf; + + /* + * Inter mode search speed features: + */ + INTER_MODE_SPEED_FEATURES inter_sf; + + /* + * Interpolation filter search speed features: + */ + INTERP_FILTER_SPEED_FEATURES interp_sf; + + /* + * Intra mode search speed features: + */ + INTRA_MODE_SPEED_FEATURES intra_sf; + + /* + * Transform size/type search speed features: + */ + TX_SPEED_FEATURES tx_sf; + + /* + * RD calculation speed features: + */ + RD_CALC_SPEED_FEATURES rd_sf; + + /* + * Two-pass mode evaluation features: + */ + WINNER_MODE_SPEED_FEATURES winner_mode_sf; + + /* + * In-loop filter speed features: + */ + LOOP_FILTER_SPEED_FEATURES lpf_sf; + + /* + * Real-time mode speed features: + */ + REAL_TIME_SPEED_FEATURES rt_sf; +} SPEED_FEATURES; + +struct AV1_COMP; + +void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi, + int speed); +void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi, + int speed); +void av1_set_speed_features_qindex_dependent(struct AV1_COMP *cpi, int speed); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_ diff --git a/libs/libaom/src/av1/encoder/svc_layercontext.c b/libs/libaom/src/av1/encoder/svc_layercontext.c new file mode 100644 index 000000000..b72d8aa73 --- /dev/null +++ b/libs/libaom/src/av1/encoder/svc_layercontext.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <math.h>
+
+#include "av1/encoder/encoder.h"
+
+static void swap_ptr(void *a, void *b) {
+  void **a_p = (void **)a;
+  void **b_p = (void **)b;
+  void *c = *a_p;
+  *a_p = *b_p;
+  *b_p = c;
+}
+
+void av1_init_layer_context(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  SVC *const svc = &cpi->svc;
+  int mi_rows = cpi->common.mi_params.mi_rows;
+  int mi_cols = cpi->common.mi_params.mi_cols;
+  svc->base_framerate = 30.0;
+  svc->current_superframe = 0;
+
+  for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+      int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      RATE_CONTROL *const lrc = &lc->rc;
+      lrc->ni_av_qi = oxcf->worst_allowed_q;
+      lrc->total_actual_bits = 0;
+      lrc->total_target_vs_actual = 0;
+      lrc->ni_tot_qi = 0;
+      lrc->tot_q = 0.0;
+      lrc->avg_q = 0.0;
+      lrc->ni_frames = 0;
+      lrc->decimation_count = 0;
+      lrc->decimation_factor = 0;
+      lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
+      lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
+      for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+        lrc->rate_correction_factors[i] = 1.0;
+      }
+      lc->target_bandwidth = lc->layer_target_bitrate;
+      lrc->last_q[INTER_FRAME] = lrc->worst_quality;
+      lrc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality;
+      lrc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality;
+      lrc->buffer_level =
+          oxcf->starting_buffer_level_ms * lc->target_bandwidth / 1000;
+      lrc->bits_off_target = lrc->buffer_level;
+      // Initialize the cyclic refresh parameters. If spatial layers are used
+      // (i.e., ss_number_layers > 1), these need to be updated per spatial
+      // layer. Cyclic refresh is only applied on base temporal layer.
+      if (svc->number_spatial_layers > 1 && tl == 0) {
+        size_t last_coded_q_map_size;
+        lc->sb_index = 0;
+        lc->actual_num_seg1_blocks = 0;
+        lc->actual_num_seg2_blocks = 0;
+        lc->counter_encode_maxq_scene_change = 0;
+        CHECK_MEM_ERROR(cm, lc->map,
+                        aom_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
+        memset(lc->map, 0, mi_rows * mi_cols);
+        last_coded_q_map_size =
+            mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
+        CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
+                        aom_malloc(last_coded_q_map_size));
+        assert(MAXQ <= 255);
+        memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
+      }
+    }
+  }
+}
+
+// Update the layer context from a change_config() call.
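+// Each layer's rate-control limits are rescaled by that layer's share of the
+// total target bitrate (`bitrate_alloc` below). For example, with a total
+// target_bandwidth of 1,000,000 bps and a layer target of 250,000 bps,
+// bitrate_alloc is 0.25, so a stream-level starting_buffer_level of
+// 4,000,000 bits becomes 1,000,000 bits for that layer.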
+void av1_update_layer_context_change_config(AV1_COMP *const cpi, + const int64_t target_bandwidth) { + const RATE_CONTROL *const rc = &cpi->rc; + SVC *const svc = &cpi->svc; + int layer = 0; + int64_t spatial_layer_target = 0; + float bitrate_alloc = 1.0; + + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + svc->layer_context[layer].target_bandwidth = lc->layer_target_bitrate; + } + spatial_layer_target = svc->layer_context[layer].target_bandwidth; + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + LAYER_CONTEXT *const lc = + &svc->layer_context[sl * svc->number_temporal_layers + tl]; + RATE_CONTROL *const lrc = &lc->rc; + lc->spatial_layer_target_bandwidth = spatial_layer_target; + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + lrc->starting_buffer_level = + (int64_t)(rc->starting_buffer_level * bitrate_alloc); + lrc->optimal_buffer_level = + (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + lrc->maximum_buffer_size = + (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + lrc->bits_off_target = + AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = AOMMIN(lrc->buffer_level, lrc->maximum_buffer_size); + lc->framerate = cpi->framerate / lc->framerate_factor; + lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = rc->max_frame_bandwidth; + lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q); + lrc->best_quality = av1_quantizer_to_qindex(lc->min_q); + } + } +} + +static LAYER_CONTEXT *get_layer_context(AV1_COMP *const cpi) { + return &cpi->svc.layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id]; +} + +void av1_update_temporal_layer_framerate(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + RATE_CONTROL *const lrc = &lc->rc; + const int tl = svc->temporal_layer_id; + lc->framerate = cpi->framerate / lc->framerate_factor; + lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; + // Update the average layer frame size (non-cumulative per-frame-bw). + if (tl == 0) { + lc->avg_frame_size = lrc->avg_frame_bandwidth; + } else { + int prev_layer = svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id - 1; + LAYER_CONTEXT *const lcprev = &svc->layer_context[prev_layer]; + const double prev_layer_framerate = + cpi->framerate / lcprev->framerate_factor; + const int64_t prev_layer_target_bandwidth = lcprev->layer_target_bitrate; + lc->avg_frame_size = + (int)((lc->target_bandwidth - prev_layer_target_bandwidth) / + (lc->framerate - prev_layer_framerate)); + } +} + +void av1_restore_layer_context(AV1_COMP *const cpi) { + GF_GROUP *const gf_group = &cpi->gf_group; + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *const lc = get_layer_context(cpi); + const int old_frame_since_key = cpi->rc.frames_since_key; + const int old_frame_to_key = cpi->rc.frames_to_key; + // Restore layer rate control. + cpi->rc = lc->rc; + cpi->oxcf.target_bandwidth = lc->target_bandwidth; + gf_group->index = lc->group_index; + // Reset the frames_since_key and frames_to_key counters to their values + // before the layer restore. Keep these defined for the stream (not layer). 
+ cpi->rc.frames_since_key = old_frame_since_key; + cpi->rc.frames_to_key = old_frame_to_key; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, + // for the base temporal layer. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + swap_ptr(&cr->map, &lc->map); + swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map); + cr->sb_index = lc->sb_index; + cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks; + cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks; + } + svc->skip_nonzeromv_last = 0; + svc->skip_nonzeromv_gf = 0; + // For each reference (LAST/GOLDEN) set the skip_nonzero_last/gf frame flags. + // This is to skip testing nonzero-mv for that reference if it was last + // refreshed (i.e., buffer slot holding that reference was refreshed) on the + // previous spatial layer at the same time (current_superframe). + if (svc->external_ref_frame_config) { + int ref_frame_idx = svc->ref_idx[LAST_FRAME - 1]; + if (svc->buffer_time_index[ref_frame_idx] == svc->current_superframe && + svc->buffer_spatial_layer[ref_frame_idx] == svc->spatial_layer_id - 1) + svc->skip_nonzeromv_last = 1; + ref_frame_idx = svc->ref_idx[GOLDEN_FRAME - 1]; + if (svc->buffer_time_index[ref_frame_idx] == svc->current_superframe && + svc->buffer_spatial_layer[ref_frame_idx] == svc->spatial_layer_id - 1) + svc->skip_nonzeromv_gf = 1; + } +} + +void av1_save_layer_context(AV1_COMP *const cpi) { + GF_GROUP *const gf_group = &cpi->gf_group; + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *lc = get_layer_context(cpi); + lc->rc = cpi->rc; + lc->target_bandwidth = (int)cpi->oxcf.target_bandwidth; + lc->group_index = gf_group->index; + if (svc->spatial_layer_id == 0) svc->base_framerate = cpi->framerate; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, + // for the base temporal layer. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + signed char *temp = lc->map; + uint8_t *temp2 = lc->last_coded_q_map; + lc->map = cr->map; + cr->map = temp; + lc->last_coded_q_map = cr->last_coded_q_map; + cr->last_coded_q_map = temp2; + lc->sb_index = cr->sb_index; + lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; + lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; + } + // For any buffer slot that is refreshed, update it with + // the spatial_layer_id and the current_superframe. + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + // All slots are refreshed on KEY. 
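+    // Every slot then reports (current_superframe, spatial_layer_id); this
+    // is the bookkeeping that av1_restore_layer_context() reads to set the
+    // skip_nonzeromv_last/gf flags for the next spatial layer of the same
+    // superframe.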
+ for (unsigned int i = 0; i < REF_FRAMES; i++) { + svc->buffer_time_index[i] = svc->current_superframe; + svc->buffer_spatial_layer[i] = svc->spatial_layer_id; + } + } else if (cpi->svc.external_ref_frame_config) { + for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) { + int ref_frame_map_idx = svc->ref_idx[i]; + if (cpi->svc.refresh[ref_frame_map_idx]) { + svc->buffer_time_index[ref_frame_map_idx] = svc->current_superframe; + svc->buffer_spatial_layer[ref_frame_map_idx] = svc->spatial_layer_id; + } + } + } + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) + svc->current_superframe++; +} + +void av1_free_svc_cyclic_refresh(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + if (lc->map) aom_free(lc->map); + if (lc->last_coded_q_map) aom_free(lc->last_coded_q_map); + } + } +} + +// Reset on key frame: reset counters, references and buffer updates. +void av1_svc_reset_temporal_layers(AV1_COMP *const cpi, int is_key) { + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *lc = NULL; + for (int sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { + lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl]; + if (is_key) lc->frames_from_key_frame = 0; + } + } + av1_update_temporal_layer_framerate(cpi); + av1_restore_layer_context(cpi); +} + +static void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out) { + int w, h; + if (width_out == NULL || height_out == NULL || den == 0) return; + w = width_org * num / den; + h = height_org * num / den; + // Make height and width even. + w += w % 2; + h += h % 2; + *width_out = w; + *height_out = h; +} + +void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) { + SVC *const svc = &cpi->svc; + LAYER_CONTEXT *lc = NULL; + int width = 0, height = 0; + lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id]; + get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, + lc->scaling_factor_num, lc->scaling_factor_den, &width, + &height); + av1_set_size_literal(cpi, width, height); +} diff --git a/libs/libaom/src/av1/encoder/svc_layercontext.h b/libs/libaom/src/av1/encoder/svc_layercontext.h new file mode 100644 index 000000000..7cb85a3c9 --- /dev/null +++ b/libs/libaom/src/av1/encoder/svc_layercontext.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_ +#define AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_ + +#include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + RATE_CONTROL rc; + int framerate_factor; + int64_t layer_target_bitrate; + int scaling_factor_num; + int scaling_factor_den; + int64_t target_bandwidth; + int64_t spatial_layer_target_bandwidth; + double framerate; + int avg_frame_size; + int max_q; + int min_q; + int frames_from_key_frame; + // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame. + int sb_index; + int8_t *map; + uint8_t *last_coded_q_map; + int actual_num_seg1_blocks; + int actual_num_seg2_blocks; + int counter_encode_maxq_scene_change; + uint8_t speed; + unsigned char group_index; +} LAYER_CONTEXT; + +typedef struct SVC { + int spatial_layer_id; + int temporal_layer_id; + int number_spatial_layers; + int number_temporal_layers; + int external_ref_frame_config; + int non_reference_frame; + // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). + int reference[INTER_REFS_PER_FRAME]; + int ref_idx[INTER_REFS_PER_FRAME]; + int refresh[REF_FRAMES]; + double base_framerate; + unsigned int current_superframe; + unsigned int buffer_time_index[REF_FRAMES]; + unsigned char buffer_spatial_layer[REF_FRAMES]; + int skip_nonzeromv_last; + int skip_nonzeromv_gf; + // Layer context used for rate control in one pass temporal CBR mode or + // two pass spatial mode. + LAYER_CONTEXT layer_context[AOM_MAX_LAYERS]; +} SVC; + +struct AV1_COMP; + +// Initialize layer context data from init_config(). +void av1_init_layer_context(struct AV1_COMP *const cpi); + +// Update the layer context from a change_config() call. +void av1_update_layer_context_change_config(struct AV1_COMP *const cpi, + const int64_t target_bandwidth); + +// Prior to encoding the frame, update framerate-related quantities +// for the current temporal layer. +void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi); + +// Prior to encoding the frame, set the layer context, for the current layer +// to be encoded, to the cpi struct. +void av1_restore_layer_context(struct AV1_COMP *const cpi); + +// Save the layer context after encoding the frame. +void av1_save_layer_context(struct AV1_COMP *const cpi); + +void av1_free_svc_cyclic_refresh(struct AV1_COMP *const cpi); + +void av1_svc_reset_temporal_layers(struct AV1_COMP *const cpi, int is_key); + +void av1_one_pass_cbr_svc_start_layer(struct AV1_COMP *const cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_SVC_LAYERCONTEXT_H_ diff --git a/libs/libaom/src/av1/encoder/temporal_filter.c b/libs/libaom/src/av1/encoder/temporal_filter.c new file mode 100644 index 000000000..a637df559 --- /dev/null +++ b/libs/libaom/src/av1/encoder/temporal_filter.c @@ -0,0 +1,1338 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <math.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+
+// NOTE: All `tf` in this file means `temporal filtering`.
+
+// Does motion search for blocks in temporal filtering. This is the first step
+// for temporal filtering. More specifically, given a frame to be filtered and
+// another frame as reference, this function searches the reference frame for
+// the block most similar to the one in the frame to be filtered. The found
+// block will be further used for weighted averaging.
+// NOTE: Besides doing motion search for the entire block, this function will
+// also do motion search for each 1/4 sub-block to get more precise prediction.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   frame_to_filter: Pointer to the frame to be filtered.
+//   ref_frame: Pointer to the reference frame.
+//   block_size: Block size used for motion search.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   ref_mv: Reference motion vector, which is commonly inherited from the
+//           motion search result of the previous frame.
+//   subblock_mvs: Pointer to the result motion vectors for 4 sub-blocks.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+// Returns:
+//   Search error (MSE) of the entire block.
+static int tf_motion_search(AV1_COMP *cpi,
+                            const YV12_BUFFER_CONFIG *frame_to_filter,
+                            const YV12_BUFFER_CONFIG *ref_frame,
+                            const BLOCK_SIZE block_size, const int mb_row,
+                            const int mb_col, MV *ref_mv, MV *subblock_mvs,
+                            int *subblock_mses) {
+  // Frame information
+  const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);
+
+  // Block information (ONLY Y-plane is used for motion search).
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int y_stride = frame_to_filter->y_stride;
+  assert(y_stride == ref_frame->y_stride);
+  const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+
+  // Save input state.
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  const struct buf_2d ori_src_buf = mb->plane[0].src;
+  const struct buf_2d ori_pre_buf = mbd->plane[0].pre[0];
+  const MV_COST_TYPE ori_mv_cost_type = mb->mv_cost_type;
+
+  // Parameters used for motion search.
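+  // The search below runs in two stages: a full-pel NSTEP search (with mesh
+  // refinement enabled) seeded from `ref_mv`, followed by fractional
+  // refinement down to eighth-pel unless the frame forces integer MVs. The
+  // MV cost model is an L1 cost tiered by resolution (low/mid/high at the
+  // 480 and 720 thresholds), and regularization is dropped for the subpel
+  // refinement stage.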
+ FULLPEL_MOTION_SEARCH_PARAMS full_ms_params; + SUBPEL_MOTION_SEARCH_PARAMS ms_params; + + const search_site_config ss_cfg = + cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD]; + const SEARCH_METHODS full_search_method = NSTEP; + const int step_param = av1_init_search_range( + AOMMAX(frame_to_filter->y_crop_width, frame_to_filter->y_crop_height)); + const SUBPEL_SEARCH_TYPE subpel_search_type = USE_8_TAPS; + const int force_integer_mv = cpi->common.features.cur_frame_force_integer_mv; + const MV_COST_TYPE mv_cost_type = + min_frame_size >= 720 + ? MV_COST_L1_HDRES + : (min_frame_size >= 480 ? MV_COST_L1_MIDRES : MV_COST_L1_LOWRES); + + // Starting position for motion search. + FULLPEL_MV start_mv = get_fullmv_from_mv(ref_mv); + // Baseline position for motion search (used for rate distortion comparison). + const MV baseline_mv = kZeroMv; + + // Setup. + mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset; + mb->plane[0].src.stride = y_stride; + mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset; + mbd->plane[0].pre[0].stride = y_stride; + // Unused intermediate results for motion search. + unsigned int sse, error; + int distortion; + int cost_list[5]; + + // Do motion search. + // NOTE: In `av1_full_pixel_search()` and `find_fractional_mv_step()`, the + // searched result will be stored in `mb->best_mv`. + int_mv best_mv; + int block_mse = INT_MAX; + mb->mv_cost_type = mv_cost_type; + + av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size, + &baseline_mv, &ss_cfg); + full_ms_params.run_mesh_search = 1; + full_ms_params.search_method = full_search_method; + av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), &best_mv.as_fullmv, + NULL); + + // Since we are merely refining the result from full pixel search, we don't + // need regularization for subpel search + mb->mv_cost_type = MV_COST_NONE; + if (force_integer_mv == 1) { // Only do full search on the entire block. + const int mv_row = best_mv.as_mv.row; + const int mv_col = best_mv.as_mv.col; + best_mv.as_mv.row = GET_MV_SUBPEL(mv_row); + best_mv.as_mv.col = GET_MV_SUBPEL(mv_col); + const int mv_offset = mv_row * y_stride + mv_col; + error = cpi->fn_ptr[block_size].vf( + ref_frame->y_buffer + y_offset + mv_offset, y_stride, + frame_to_filter->y_buffer + y_offset, y_stride, &sse); + block_mse = DIVIDE_AND_ROUND(error, mb_pels); + mb->e_mbd.mi[0]->mv[0] = best_mv; + } else { // Do fractional search on the entire block and all sub-blocks. + av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size, + &baseline_mv, cost_list); + ms_params.forced_stop = EIGHTH_PEL; + ms_params.var_params.subpel_search_type = subpel_search_type; + MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + error = cpi->mv_search_params.find_fractional_mv_step( + &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv.as_mv, + &distortion, &sse, NULL); + block_mse = DIVIDE_AND_ROUND(error, mb_pels); + mb->e_mbd.mi[0]->mv[0] = best_mv; + *ref_mv = best_mv.as_mv; + // On 4 sub-blocks. 
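+    // (`ss_size_lookup[block_size][1][1]` halves both dimensions, so e.g. a
+    // 32x32 block is refined as four 16x16 sub-blocks, visited row-major:
+    //   subblock_idx:  0 1
+    //                  2 3
+    // which is also the layout of subblock_mvs[] and subblock_mses[].)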
+ const BLOCK_SIZE subblock_size = ss_size_lookup[block_size][1][1]; + const int subblock_height = block_size_high[subblock_size]; + const int subblock_width = block_size_wide[subblock_size]; + const int subblock_pels = subblock_height * subblock_width; + start_mv = get_fullmv_from_mv(ref_mv); + + int subblock_idx = 0; + for (int i = 0; i < mb_height; i += subblock_height) { + for (int j = 0; j < mb_width; j += subblock_width) { + const int offset = i * y_stride + j; + mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset; + mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset; + mb->mv_cost_type = mv_cost_type; + + av1_make_default_fullpel_ms_params( + &full_ms_params, cpi, mb, subblock_size, &baseline_mv, &ss_cfg); + full_ms_params.run_mesh_search = 1; + full_ms_params.search_method = full_search_method; + av1_full_pixel_search(start_mv, &full_ms_params, step_param, + cond_cost_list(cpi, cost_list), + &best_mv.as_fullmv, NULL); + + // Since we are merely refining the result from full pixel search, we + // don't need regularization for subpel search + mb->mv_cost_type = MV_COST_NONE; + av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size, + &baseline_mv, cost_list); + ms_params.forced_stop = EIGHTH_PEL; + ms_params.var_params.subpel_search_type = subpel_search_type; + subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv); + error = cpi->mv_search_params.find_fractional_mv_step( + &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, + &best_mv.as_mv, &distortion, &sse, NULL); + subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels); + subblock_mvs[subblock_idx] = best_mv.as_mv; + ++subblock_idx; + } + } + } + + // Restore input state. + mb->plane[0].src = ori_src_buf; + mbd->plane[0].pre[0] = ori_pre_buf; + mb->mv_cost_type = ori_mv_cost_type; + + return block_mse; +} + +// Helper function to get weight according to thresholds. +static INLINE int get_weight_by_thresh(const int value, const int low, + const int high) { + return value < low ? 2 : value < high ? 1 : 0; +} + +// Gets filter weight for blocks in temporal filtering. The weights will be +// assigned based on the motion search errors. +// NOTE: Besides assigning filter weight for the block, this function will also +// determine whether to split the entire block into 4 sub-blocks for further +// filtering. +// TODO(any): Many magic numbers are used in this function. They may be tuned +// to improve the performance. +// Inputs: +// block_mse: Motion search error (MSE) for the entire block. +// subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks. +// is_second_arf: Whether the to-filter frame is the second ARF. This field +// will affect the filter weight for the to-filter frame. +// subblock_filter_weights: Pointer to the assigned filter weight for each +// sub-block. If not using sub-blocks, the first +// element will be used for the entire block. +// Returns: Whether to use 4 sub-blocks to replace the original block. +static int tf_get_filter_weight(const int block_mse, const int *subblock_mses, + const int is_second_arf, + int *subblock_filter_weights) { + // `block_mse` is initialized as INT_MAX and will be overwritten after the + // motion search with reference frame, therefore INT_MAX can ONLY be accessed + // by to-filter frame. + if (block_mse == INT_MAX) { + const int weight = TF_ENABLE_PLANEWISE_STRATEGY + ? TF_PLANEWISE_FILTER_WEIGHT_SCALE + : is_second_arf ? 
+static INLINE int get_weight_by_thresh(const int value, const int low,
+                                       const int high) {
+  return value < low ? 2 : value < high ? 1 : 0;
+}
+
+// Gets filter weight for blocks in temporal filtering. The weights will be
+// assigned based on the motion search errors.
+// NOTE: Besides assigning filter weight for the block, this function will also
+// determine whether to split the entire block into 4 sub-blocks for further
+// filtering.
+// TODO(any): Many magic numbers are used in this function. They may be tuned
+// to improve the performance.
+// Inputs:
+//   block_mse: Motion search error (MSE) for the entire block.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+//   is_second_arf: Whether the to-filter frame is the second ARF. This field
+//                  will affect the filter weight for the to-filter frame.
+//   subblock_filter_weights: Pointer to the assigned filter weight for each
+//                            sub-block. If not using sub-blocks, the first
+//                            element will be used for the entire block.
+// Returns: Whether to use 4 sub-blocks to replace the original block.
+static int tf_get_filter_weight(const int block_mse, const int *subblock_mses,
+                                const int is_second_arf,
+                                int *subblock_filter_weights) {
+  // `block_mse` is initialized as INT_MAX and will be overwritten after the
+  // motion search with reference frame, therefore INT_MAX can ONLY be accessed
+  // by to-filter frame.
+  if (block_mse == INT_MAX) {
+    const int weight = TF_ENABLE_PLANEWISE_STRATEGY
+                           ? TF_PLANEWISE_FILTER_WEIGHT_SCALE
+                           : is_second_arf ? 64 : 32;
+    subblock_filter_weights[0] = subblock_filter_weights[1] =
+        subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
+    return 0;
+  }
+
+  const int thresh_low = is_second_arf ? 20 : 40;
+  const int thresh_high = is_second_arf ? 40 : 80;
+
+  int min_subblock_mse = INT_MAX;
+  int max_subblock_mse = INT_MIN;
+  int sum_subblock_mse = 0;
+  for (int i = 0; i < 4; ++i) {
+    sum_subblock_mse += subblock_mses[i];
+    min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
+    max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
+    subblock_filter_weights[i] =
+        get_weight_by_thresh(subblock_mses[i], thresh_low, thresh_high);
+  }
+
+  if (((block_mse * 15 < sum_subblock_mse * 4) &&
+       max_subblock_mse - min_subblock_mse < 48) ||
+      ((block_mse * 14 < sum_subblock_mse * 4) &&
+       max_subblock_mse - min_subblock_mse < 24)) {  // No split.
+    const int weight = get_weight_by_thresh(block_mse, thresh_low, thresh_high);
+    subblock_filter_weights[0] = subblock_filter_weights[1] =
+        subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
+    return 0;
+  } else {  // Do split.
+    return 1;
+  }
+}
+
+// Helper function to determine whether a frame is encoded with high bit-depth.
+static INLINE int is_frame_high_bitdepth(const YV12_BUFFER_CONFIG *frame) {
+  return (frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+}
+
+// Builds predictor for blocks in temporal filtering. This is the second step
+// for temporal filtering, which is to construct predictions from all reference
+// frames INCLUDING the frame to be filtered itself. These predictors are built
+// based on the motion search results (motion vector is set as 0 for the frame
+// to be filtered), and will be further used for weighted averaging.
+// Inputs:
+//   ref_frame: Pointer to the reference frame (or the frame to be filtered).
+//   mbd: Pointer to the block for filtering. Besides containing the
+//        subsampling information of all planes, this field also gives the
+//        searched motion vector for the entire block, i.e.,
+//        `mbd->mi[0]->mv[0]`. This vector should be 0 if the `ref_frame`
+//        itself is the frame to be filtered.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   scale: Scaling factor.
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//   subblock_mvs: The motion vectors for each sub-block (row-major order).
+//   pred: Pointer to the predictor to build.
+// Returns:
+//   Nothing will be returned. But the content to which `pred` points will be
+//   modified.
+static void tf_build_predictor(const YV12_BUFFER_CONFIG *ref_frame,
+                               const MACROBLOCKD *mbd,
+                               const BLOCK_SIZE block_size, const int mb_row,
+                               const int mb_col, const int num_planes,
+                               const struct scale_factors *scale,
+                               const int use_subblock, const MV *subblock_mvs,
+                               uint8_t *pred) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Information of the entire block.
+  const int mb_height = block_size_high[block_size];  // Height.
+  const int mb_width = block_size_wide[block_size];   // Width.
+  const int mb_pels = mb_height * mb_width;           // Number of pixels.
+  const int mb_y = mb_height * mb_row;                // Y-coord (Top-left).
+  const int mb_x = mb_width * mb_col;                 // X-coord (Top-left).
+  const int bit_depth = mbd->bd;                      // Bit depth.
+  const int is_intrabc = 0;                           // Is intra-copied?
+  const int mb_mv_row = mbd->mi[0]->mv[0].as_mv.row;  // Motion vector (y).
+  const int mb_mv_col = mbd->mi[0]->mv[0].as_mv.col;  // Motion vector (x).
+  const MV mb_mv = { (int16_t)mb_mv_row, (int16_t)mb_mv_col };
+  const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);
+
+  // Information of each sub-block (actually in use).
+  const int num_blocks = use_subblock ? 2 : 1;  // Num of blocks on each side.
+  const int block_height = mb_height >> (num_blocks - 1);  // Height.
+  const int block_width = mb_width >> (num_blocks - 1);    // Width.
+
+  // Default interpolation filters.
+  const int_interpfilters interp_filters =
+      av1_broadcast_interp_filter(MULTITAP_SHARP);
+
+  // Handle Y-plane, U-plane and V-plane (if needed) in sequence.
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_y = mbd->plane[plane].subsampling_y;
+    const int subsampling_x = mbd->plane[plane].subsampling_x;
+    // Information of each sub-block in current plane.
+    const int plane_h = mb_height >> subsampling_y;  // Plane height.
+    const int plane_w = mb_width >> subsampling_x;   // Plane width.
+    const int plane_y = mb_y >> subsampling_y;       // Y-coord (Top-left).
+    const int plane_x = mb_x >> subsampling_x;       // X-coord (Top-left).
+    const int h = block_height >> subsampling_y;     // Sub-block height.
+    const int w = block_width >> subsampling_x;      // Sub-block width.
+    const int is_y_plane = (plane == 0);             // Is Y-plane?
+
+    const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
+                                    ref_frame->widths[is_y_plane ? 0 : 1],
+                                    ref_frame->heights[is_y_plane ? 0 : 1],
+                                    ref_frame->strides[is_y_plane ? 0 : 1] };
+
+    // Handle entire block or sub-blocks if needed.
+    int subblock_idx = 0;
+    for (int i = 0; i < plane_h; i += h) {
+      for (int j = 0; j < plane_w; j += w) {
+        // Choose proper motion vector.
+        const MV mv = use_subblock ? subblock_mvs[subblock_idx] : mb_mv;
+        assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
+               mv.col >= INT16_MIN && mv.col <= INT16_MAX);
+
+        const int y = plane_y + i;
+        const int x = plane_x + j;
+
+        // Build predictor for each sub-block on current plane.
+        InterPredParams inter_pred_params;
+        av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
+                              subsampling_y, bit_depth, is_high_bitdepth,
+                              is_intrabc, scale, &ref_buf, interp_filters);
+        inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+        av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
+                                          plane_w, &mv, &inter_pred_params);
+
+        ++subblock_idx;
+      }
+    }
+    plane_offset += mb_pels;
+  }
+}
+
+// Computes temporal filter weights and accumulators for the frame to be
+// filtered. More concretely, the filter weights for all pixels are the same.
+// Inputs:
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes as well as the bit-depth.
+//   block_size: Size of the block.
+//   num_planes: Number of planes in the frame.
+//   filter_weight: Weight used for filtering.
+//   pred: Pointer to the well-built predictors.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `pred`
+//   point will be modified.
+void av1_apply_temporal_filter_self(const MACROBLOCKD *mbd,
+                                    const BLOCK_SIZE block_size,
+                                    const int num_planes,
+                                    const int filter_weight,
+                                    const uint8_t *pred, uint32_t *accum,
+                                    uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int is_high_bitdepth = is_cur_buf_hbd(mbd);
+  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_y = mbd->plane[plane].subsampling_y;
+    const int subsampling_x = mbd->plane[plane].subsampling_x;
+    const int h = mb_height >> subsampling_y;  // Plane height.
+    const int w = mb_width >> subsampling_x;   // Plane width.
+
+    int pred_idx = 0;
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        const int idx = plane_offset + pred_idx;  // Index with plane shift.
+        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+        accum[idx] += filter_weight * pred_value;
+        count[idx] += filter_weight;
+        ++pred_idx;
+      }
+    }
+    plane_offset += mb_pels;
+  }
+}
+
+// Function to compute pixel-wise squared difference between two buffers.
+// Inputs:
+//   ref: Pointer to reference buffer.
+//   ref_offset: Start position of reference buffer for computation.
+//   ref_stride: Stride for reference buffer.
+//   tgt: Pointer to target buffer.
+//   tgt_offset: Start position of target buffer for computation.
+//   tgt_stride: Stride for target buffer.
+//   height: Height of block for computation.
+//   width: Width of block for computation.
+//   is_high_bitdepth: Whether the two buffers point to high bit-depth frames.
+//   square_diff: Pointer to save the squared differences.
+// Returns:
+//   Nothing will be returned. But the content to which `square_diff` points
+//   will be modified.
+static INLINE void compute_square_diff(const uint8_t *ref, const int ref_offset,
+                                       const int ref_stride, const uint8_t *tgt,
+                                       const int tgt_offset,
+                                       const int tgt_stride, const int height,
+                                       const int width,
+                                       const int is_high_bitdepth,
+                                       uint32_t *square_diff) {
+  const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+  const uint16_t *tgt16 = CONVERT_TO_SHORTPTR(tgt);
+
+  int ref_idx = 0;
+  int tgt_idx = 0;
+  int idx = 0;
+  for (int i = 0; i < height; ++i) {
+    for (int j = 0; j < width; ++j) {
+      const uint16_t ref_value = is_high_bitdepth ? ref16[ref_offset + ref_idx]
+                                                  : ref[ref_offset + ref_idx];
+      const uint16_t tgt_value = is_high_bitdepth ? tgt16[tgt_offset + tgt_idx]
+                                                  : tgt[tgt_offset + tgt_idx];
+      const uint32_t diff = (ref_value > tgt_value) ? (ref_value - tgt_value)
+                                                    : (tgt_value - ref_value);
+      square_diff[idx] = diff * diff;
+
+      ++ref_idx;
+      ++tgt_idx;
+      ++idx;
+    }
+    ref_idx += (ref_stride - width);
+    tgt_idx += (tgt_stride - width);
+  }
+}
+
+// Function to adjust the filter weight when using the YUV strategy.
+// Inputs:
+//   filter_weight: Original filter weight.
+//   sum_square_diff: Sum of squared difference between input frame and
+//                    prediction. This field is computed pixel by pixel, and
+//                    is used as a reference for the filter weight adjustment.
+//   num_ref_pixels: Number of pixels used to compute the `sum_square_diff`.
+//                   This field should align with the above lookup tables
+//                   `filter_weight_adjustment_lookup_table_yuv` and
+//                   `highbd_filter_weight_adjustment_lookup_table_yuv`.
+//   strength: Strength for filter weight adjustment.
+// Returns:
+//   Adjusted filter weight which will finally be used for filtering.
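+// For example (hypothetical values, with TF_YUV_FILTER_WEIGHT_SCALE == 3):
+// sum_square_diff = 1024, num_ref_pixels = 16 and strength = 5 give
+// modifier = 1024 * 3 / 16 = 192, then (192 + 16) >> 5 = 6, so the function
+// returns (16 - 6) * filter_weight.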
+static INLINE int adjust_filter_weight_yuv(const int filter_weight,
+                                           const uint64_t sum_square_diff,
+                                           const int num_ref_pixels,
+                                           const int strength) {
+  int modifier =
+      (int)(AOMMIN(sum_square_diff * TF_YUV_FILTER_WEIGHT_SCALE, INT32_MAX)) /
+      num_ref_pixels;
+  const int rounding = (1 << strength) >> 1;
+  modifier = (modifier + rounding) >> strength;
+  return (modifier >= 16) ? 0 : (16 - modifier) * filter_weight;
+}
+
+// Applies temporal filter with YUV strategy.
+// Inputs:
+//   frame_to_filter: Pointer to the frame to be filtered, which is used as
+//                    reference to compute squared difference from the
+//                    predictor.
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all YUV planes.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   strength: Strength for filter weight adjustment.
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//   subblock_filter_weights: The filter weights for each sub-block (row-major
+//                            order). If `use_subblock` is set as 0, the first
+//                            weight will be applied to the entire block.
+//   pred: Pointer to the well-built predictors.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `pred`
+//   point will be modified.
+void av1_apply_temporal_filter_yuv_c(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const int strength, const int use_subblock,
+    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+
+  // Allocate memory for pixel-wise squared differences for all planes. They,
+  // regardless of the subsampling, are assigned with memory of size `mb_pels`.
+  uint32_t *square_diff =
+      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
+  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));
+
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    // Locate pixel on reference frame.
+    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+    const uint8_t *ref = frame_to_filter->buffers[plane];
+    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
+                        plane_w, plane_h, plane_w, is_high_bitdepth,
+                        square_diff + plane_offset);
+    plane_offset += mb_pels;
+  }
+
+  // Get window size for pixel-wise filtering.
+  assert(TF_YUV_FILTER_WINDOW_LENGTH % 2 == 1);
+  const int half_window = TF_YUV_FILTER_WINDOW_LENGTH >> 1;
+
+  // Handle planes in sequence.
+  plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_y = mbd->plane[plane].subsampling_y;
+    const int subsampling_x = mbd->plane[plane].subsampling_x;
+    const int h = mb_height >> subsampling_y;  // Plane height.
+    const int w = mb_width >> subsampling_x;   // Plane width.
+
+    // Perform filtering.
+    int pred_idx = 0;
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        // Non-local mean approach.
+        uint64_t sum_square_diff = 0;
+        int num_ref_pixels = 0;
+
+        for (int wi = -half_window; wi <= half_window; ++wi) {
+          for (int wj = -half_window; wj <= half_window; ++wj) {
+            const int y = i + wi;  // Y-coord on the current plane.
+            const int x = j + wj;  // X-coord on the current plane.
+            if (y >= 0 && y < h && x >= 0 && x < w) {
+              sum_square_diff += square_diff[plane_offset + y * w + x];
+              ++num_ref_pixels;
+            }
+          }
+        }
+
+        if (plane == 0) {  // Filter Y-plane using both U-plane and V-plane.
+          for (int p = 1; p < num_planes; ++p) {
+            const int ss_y_shift = mbd->plane[p].subsampling_y - subsampling_y;
+            const int ss_x_shift = mbd->plane[p].subsampling_x - subsampling_x;
+            const int yy = i >> ss_y_shift;  // Y-coord on UV-plane.
+            const int xx = j >> ss_x_shift;  // X-coord on UV-plane.
+            const int ww = w >> ss_x_shift;  // Width of UV-plane.
+            sum_square_diff += square_diff[p * mb_pels + yy * ww + xx];
+            ++num_ref_pixels;
+          }
+        } else {  // Filter U-plane and V-plane using Y-plane.
+          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
+          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              const int ww = w << ss_x_shift;         // Width of Y-plane.
+              sum_square_diff += square_diff[yy * ww + xx];
+              ++num_ref_pixels;
+            }
+          }
+        }
+
+        // Base filter weight estimated by motion search error.
+        const int subblock_idx =
+            use_subblock ? (i >= h / 2) * 2 + (j >= w / 2) : 0;
+        const int filter_weight = subblock_filter_weights[subblock_idx];
+
+        const int idx = plane_offset + pred_idx;  // Index with plane shift.
+        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+        const int adjusted_weight = adjust_filter_weight_yuv(
+            filter_weight, sum_square_diff, num_ref_pixels, strength);
+        accum[idx] += adjusted_weight * pred_value;
+        count[idx] += adjusted_weight;
+
+        ++pred_idx;
+      }
+    }
+    plane_offset += mb_pels;
+  }
+
+  aom_free(square_diff);
+}
+
+// Applies temporal filter with plane-wise strategy.
+// The strategy of filter weight adjustment is different from the function
+// `av1_apply_temporal_filter_yuv_c()`.
+// Inputs:
+//   frame_to_filter: Pointer to the frame to be filtered, which is used as
+//                    reference to compute squared difference from the
+//                    predictor.
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   noise_levels: Pointer to the noise levels of the to-filter frame,
+//                 estimated with each plane (in Y, U, V order).
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//   block_mse: Motion search error (MSE) for the entire block.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+//   q_factor: Quantization factor. This is actually the `q` defined in
+//             libaom, which is converted from `qindex`.
+//   pred: Pointer to the well-built predictors.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `pred`
+//   point will be modified.
+void av1_apply_temporal_filter_planewise_c(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_levels, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
+
+  // Allocate memory for pixel-wise squared differences for all planes. They,
+  // regardless of the subsampling, are assigned with memory of size `mb_pels`.
+  uint32_t *square_diff =
+      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
+  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));
+
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    // Locate pixel on reference frame.
+    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+    const uint8_t *ref = frame_to_filter->buffers[plane];
+    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
+                        plane_w, plane_h, plane_w, is_high_bitdepth,
+                        square_diff + plane_offset);
+    plane_offset += mb_pels;
+  }
+
+  // Get window size for pixel-wise filtering.
+  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH % 2 == 1);
+  const int half_window = TF_PLANEWISE_FILTER_WINDOW_LENGTH >> 1;
+
+  // Hyper-parameter for filter weight adjustment.
+  const int frame_height = frame_to_filter->heights[0]
+                           << mbd->plane[0].subsampling_y;
+  const int decay_control = frame_height >= 720 ? 4 : 3;
+
+  // Handle planes in sequence.
+  plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int subsampling_y = mbd->plane[plane].subsampling_y;
+    const int subsampling_x = mbd->plane[plane].subsampling_x;
+    const int h = mb_height >> subsampling_y;  // Plane height.
+    const int w = mb_width >> subsampling_x;   // Plane width.
+
+    // Perform filtering.
+    int pred_idx = 0;
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; ++j) {
+        // Non-local mean approach.
+        uint64_t sum_square_diff = 0;
+        int num_ref_pixels = 0;
+
+        for (int wi = -half_window; wi <= half_window; ++wi) {
+          for (int wj = -half_window; wj <= half_window; ++wj) {
+            const int y = CLIP(i + wi, 0, h - 1);  // Y-coord on current plane.
+            const int x = CLIP(j + wj, 0, w - 1);  // X-coord on current plane.
+            sum_square_diff += square_diff[plane_offset + y * w + x];
+            ++num_ref_pixels;
+          }
+        }
+
+        // Filter U-plane and V-plane using Y-plane. This is because motion
+        // search is only done on Y-plane, so the information from Y-plane will
+        // be more accurate.
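+        // For example, with 4:2:0 subsampling, ss_x_shift == ss_y_shift == 1
+        // for the chroma planes, so each chroma pixel below also accumulates
+        // the squared differences of its four (2x2) co-located luma pixels.
+        // (Illustration only; the shifts are derived from the actual
+        // subsampling of the input.)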
+        if (plane != 0) {
+          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
+          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
+          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
+            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
+              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
+              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
+              const int ww = w << ss_x_shift;         // Width of Y-plane.
+              sum_square_diff += square_diff[yy * ww + xx];
+              ++num_ref_pixels;
+            }
+          }
+        }
+
+        // Scale down the difference for high bit depth input.
+        if (mbd->bd > 8) sum_square_diff >>= (mbd->bd - 8) * (mbd->bd - 8);
+        const double window_error = (double)(sum_square_diff) / num_ref_pixels;
+        const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
+        const double block_error =
+            (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+
+        // Control factor for non-local mean approach.
+        const double r =
+            (double)decay_control * (0.7 + log(noise_levels[plane] + 1.0));
+        const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1);
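+        // Worked example (hypothetical values): with decay_control = 3 and a
+        // noise level of 0, r = 3 * (0.7 + log(1.0)) = 2.1; with q_factor =
+        // 16, q = min(256 / 256, 1) = 1.0. Then window_error = 20 and
+        // block_error = 40 give scaled_diff = -(20 + 4) / (2 * 2.1 * 2.1 *
+        // 1.0) ~= -2.72, i.e. an adjusted weight of about exp(-2.72) * 1000
+        // ~= 65.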
+        // Compute filter weight.
+        const double scaled_diff =
+            AOMMAX(-(window_error + block_error / 10) / (2 * r * r * q), -15.0);
+        const int adjusted_weight =
+            (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+
+        const int idx = plane_offset + pred_idx;  // Index with plane shift.
+        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
+        accum[idx] += adjusted_weight * pred_value;
+        count[idx] += adjusted_weight;
+
+        ++pred_idx;
+      }
+    }
+    plane_offset += mb_pels;
+  }
+
+  aom_free(square_diff);
+}
+
+// Computes temporal filter weights and accumulators from all reference frames
+// excluding the current frame to be filtered.
+// Inputs:
+//   frame_to_filter: Pointer to the frame to be filtered, which is used as
+//                    reference to compute squared difference from the
+//                    predictor.
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes and the bit-depth.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   strength: Strength for filter weight adjustment. (Used in YUV strategy)
+//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
+//                 (Used in YUV strategy)
+//   subblock_filter_weights: The filter weights for each sub-block (row-major
+//                            order). If `use_subblock` is set as 0, the first
+//                            weight will be applied to the entire block. (Used
+//                            in YUV strategy)
+//   noise_levels: Pointer to the noise levels of the to-filter frame,
+//                 estimated with each plane (in Y, U, V order). (Used in
+//                 plane-wise strategy)
+//   block_mse: Motion search error (MSE) for the entire block.
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
+//   q_factor: Quantization factor.
+//   pred: Pointer to the well-built predictors.
+//   accum: Pointer to the pixel-wise accumulator for filtering.
+//   count: Pointer to the pixel-wise counter for filtering.
+// Returns:
+//   Nothing will be returned. But the content to which `accum` and `pred`
+//   point will be modified.
+void av1_apply_temporal_filter_others(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const int strength, const int use_subblock,
+    const int *subblock_filter_weights, const double *noise_levels,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  if (TF_ENABLE_PLANEWISE_STRATEGY) {
+    // TODO(any): avx2 and sse2 version should be changed to align with C
+    // function before using.
+    if (is_frame_high_bitdepth(frame_to_filter) || block_size != BLOCK_32X32) {
+      av1_apply_temporal_filter_planewise_c(
+          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
+          accum, count);
+    } else {
+      av1_apply_temporal_filter_planewise(
+          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
+          accum, count);
+    }
+  } else {  // Commonly used for low-resolution video.
+    if (subblock_filter_weights[0] == 0 && subblock_filter_weights[1] == 0 &&
+        subblock_filter_weights[2] == 0 && subblock_filter_weights[3] == 0) {
+      return;
+    }
+    const int adj_strength = strength + 2 * (mbd->bd - 8);
+    if (num_planes == 3 && TF_YUV_FILTER_WEIGHT_SCALE == 3 &&
+        block_size != BLOCK_32X32) {
+      av1_apply_temporal_filter_yuv(frame_to_filter, mbd, block_size, mb_row,
+                                    mb_col, num_planes, adj_strength,
+                                    use_subblock, subblock_filter_weights, pred,
+                                    accum, count);
+    } else {
+      // TODO(any): sse4 version should be changed to align with C function
+      // before using.
+      av1_apply_temporal_filter_yuv_c(frame_to_filter, mbd, block_size, mb_row,
+                                      mb_col, num_planes, adj_strength,
+                                      use_subblock, subblock_filter_weights,
+                                      pred, accum, count);
+    }
+  }
+}
+
+// Normalizes the accumulated filtering result to produce the filtered frame.
+// Inputs:
+//   mbd: Pointer to the block for filtering, which is ONLY used to get
+//        subsampling information of all planes.
+//   block_size: Size of the block.
+//   mb_row: Row index of the block in the entire frame.
+//   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
+//   accum: Pointer to the pre-computed accumulator.
+//   count: Pointer to the pre-computed count.
+//   result_buffer: Pointer to result buffer.
+// Returns:
+//   Nothing will be returned. But the content to which `result_buffer` points
+//   will be modified.
+static void tf_normalize_filtered_frame(
+    const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
+    const int mb_col, const int num_planes, const uint32_t *accum,
+    const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
+  // Block information.
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int is_high_bitdepth = is_frame_high_bitdepth(result_buffer);
+
+  int plane_offset = 0;
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
+    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
+    const int frame_stride = result_buffer->strides[plane == 0 ? 0 : 1];
+    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
+    uint8_t *const buf = result_buffer->buffers[plane];
+    uint16_t *const buf16 = CONVERT_TO_SHORTPTR(buf);
+
+    int plane_idx = 0;             // Pixel index on current plane (block-base).
+    int frame_idx = frame_offset;  // Pixel index on the entire frame.
+    for (int i = 0; i < plane_h; ++i) {
+      for (int j = 0; j < plane_w; ++j) {
+        const int idx = plane_idx + plane_offset;
+        const uint16_t rounding = count[idx] >> 1;
+        if (is_high_bitdepth) {
+          buf16[frame_idx] =
+              (uint16_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+        } else {
+          buf[frame_idx] = (uint8_t)OD_DIVU(accum[idx] + rounding, count[idx]);
+        }
+        ++plane_idx;
+        ++frame_idx;
+      }
+      frame_idx += (frame_stride - plane_w);
+    }
+    plane_offset += mb_pels;
+  }
+}
+
+// Helper function to compute number of blocks on either side of the frame.
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
+  return (frame_length + mb_length - 1) / mb_length;
+}
+
+typedef struct {
+  int64_t sum;
+  int64_t sse;
+} FRAME_DIFF;
+
+// Does temporal filter for a particular frame.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   frames: Frame buffers used for temporal filtering.
+//   num_frames: Number of frames in the frame buffer.
+//   filter_frame_idx: Index of the frame to be filtered.
+//   is_key_frame: Whether the to-filter frame is a key frame.
+//   is_second_arf: Whether the to-filter frame is the second ARF. This field
+//                  is ONLY used for assigning filter weight.
+//   block_size: Block size used for temporal filtering.
+//   scale: Scaling factor.
+//   strength: Pre-estimated strength for filter weight adjustment.
+//   noise_levels: Pointer to the noise levels of the to-filter frame,
+//                 estimated with each plane (in Y, U, V order).
+// Returns:
+//   Difference between filtered frame and the original frame.
+static FRAME_DIFF tf_do_filtering(
+    AV1_COMP *cpi, YV12_BUFFER_CONFIG **frames, const int num_frames,
+    const int filter_frame_idx, const int is_key_frame,
+    const int is_second_arf, const BLOCK_SIZE block_size,
+    const struct scale_factors *scale, const int strength,
+    const double *noise_levels) {
+  // Basic information.
+  const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
+  const int frame_height = frame_to_filter->y_crop_height;
+  const int frame_width = frame_to_filter->y_crop_width;
+  const int mb_height = block_size_high[block_size];
+  const int mb_width = block_size_wide[block_size];
+  const int mb_pels = mb_height * mb_width;
+  const int mb_rows = get_num_blocks(frame_height, mb_height);
+  const int mb_cols = get_num_blocks(frame_width, mb_width);
+  const int num_planes = av1_num_planes(&cpi->common);
+  const int mi_h = mi_size_high_log2[block_size];
+  const int mi_w = mi_size_wide_log2[block_size];
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
+
+  // Save input state.
+  MACROBLOCK *const mb = &cpi->td.mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+  uint8_t *input_buffer[MAX_MB_PLANE];
+  for (int i = 0; i < num_planes; i++) {
+    input_buffer[i] = mbd->plane[i].pre[0].buf;
+  }
+  MB_MODE_INFO **input_mb_mode_info = mbd->mi;
+
+  // Setup.
+  mbd->block_ref_scale_factors[0] = scale;
+  mbd->block_ref_scale_factors[1] = scale;
+  // A temporary block info used to store state in temporal filtering process.
+  MB_MODE_INFO *tmp_mb_mode_info = (MB_MODE_INFO *)malloc(sizeof(MB_MODE_INFO));
+  memset(tmp_mb_mode_info, 0, sizeof(MB_MODE_INFO));
+  mbd->mi = &tmp_mb_mode_info;
+  mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+  // Allocate memory for predictor, accumulator and count.
+  uint8_t *pred8 = aom_memalign(32, num_planes * mb_pels * sizeof(uint8_t));
+  uint16_t *pred16 = aom_memalign(32, num_planes * mb_pels * sizeof(uint16_t));
+  uint32_t *accum = aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
+  uint16_t *count = aom_memalign(16, num_planes * mb_pels * sizeof(uint16_t));
+  memset(pred8, 0, num_planes * mb_pels * sizeof(pred8[0]));
+  memset(pred16, 0, num_planes * mb_pels * sizeof(pred16[0]));
+  uint8_t *const pred = is_high_bitdepth ? CONVERT_TO_BYTEPTR(pred16) : pred8;
+
+  // Do filtering.
+  FRAME_DIFF diff = { 0, 0 };
+  // Perform temporal filtering block by block.
+  for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
+    av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+                          (mb_row << mi_h), (mb_height >> MI_SIZE_LOG2),
+                          cpi->oxcf.border_in_pixels);
+    for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+      av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+                            (mb_col << mi_w), (mb_width >> MI_SIZE_LOG2),
+                            cpi->oxcf.border_in_pixels);
+      memset(accum, 0, num_planes * mb_pels * sizeof(accum[0]));
+      memset(count, 0, num_planes * mb_pels * sizeof(count[0]));
+      MV ref_mv = kZeroMv;  // Reference motion vector passed down along frames.
+      // Perform temporal filtering frame by frame.
+      for (int frame = 0; frame < num_frames; frame++) {
+        if (frames[frame] == NULL) continue;
+
+        // Motion search.
+        MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+        int subblock_filter_weights[4] = { 0, 0, 0, 0 };
+        int block_mse = INT_MAX;
+        int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+
+        if (frame == filter_frame_idx) {  // Frame to be filtered.
+          // Set motion vector as 0 for the frame to be filtered.
+          mbd->mi[0]->mv[0].as_mv = kZeroMv;
+          // Change ref_mv sign for following frames.
+          ref_mv.row *= -1;
+          ref_mv.col *= -1;
+        } else {  // Other reference frames.
+          block_mse = tf_motion_search(cpi, frame_to_filter, frames[frame],
+                                       block_size, mb_row, mb_col, &ref_mv,
+                                       subblock_mvs, subblock_mses);
+          // Do not pass down the reference motion vector if error is too
+          // large.
+          const int thresh = AOMMIN(frame_height, frame_width) >= 720 ? 12 : 3;
+          if (block_mse > (thresh << (mbd->bd - 8))) {
+            ref_mv = kZeroMv;
+          }
+        }
+
+        // Build predictor.
+        int use_subblock = tf_get_filter_weight(
+            block_mse, subblock_mses, is_second_arf, subblock_filter_weights);
+        tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
+                           num_planes, scale, use_subblock, subblock_mvs, pred);
+
+        // Perform weighted averaging.
+        if (frame == filter_frame_idx) {  // Frame to be filtered.
+          av1_apply_temporal_filter_self(mbd, block_size, num_planes,
+                                         subblock_filter_weights[0], pred,
+                                         accum, count);
+        } else {  // Other reference frames.
+          const FRAME_TYPE frame_type =
+              (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME
+                                                           : KEY_FRAME;
+          const int q_factor =
+              (int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[frame_type],
+                                           cpi->common.seq_params.bit_depth);
+          av1_apply_temporal_filter_others(
+              frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+              strength, use_subblock, subblock_filter_weights, noise_levels,
+              block_mse, subblock_mses, q_factor, pred, accum, count);
+        }
+      }
+
+      tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
+                                  accum, count, &cpi->alt_ref_buffer);
+
+      if (!is_key_frame && cpi->sf.hl_sf.adaptive_overlay_encoding) {
+        const int y_height = mb_height >> mbd->plane[0].subsampling_y;
+        const int y_width = mb_width >> mbd->plane[0].subsampling_x;
+        const int source_y_stride = frame_to_filter->y_stride;
+        const int filter_y_stride = cpi->alt_ref_buffer.y_stride;
+        const int source_offset =
+            mb_row * y_height * source_y_stride + mb_col * y_width;
+        const int filter_offset =
+            mb_row * y_height * filter_y_stride + mb_col * y_width;
+        unsigned int sse = 0;
+        cpi->fn_ptr[block_size].vf(frame_to_filter->y_buffer + source_offset,
+                                   source_y_stride,
+                                   cpi->alt_ref_buffer.y_buffer + filter_offset,
+                                   filter_y_stride, &sse);
+        diff.sum += sse;
+        diff.sse += sse * sse;
+      }
+    }
+  }
+
+  // Restore input state
+  for (int i = 0; i < num_planes; i++) {
+    mbd->plane[i].pre[0].buf = input_buffer[i];
+  }
+  mbd->mi = input_mb_mode_info;
+
+  free(tmp_mb_mode_info);
+  aom_free(pred8);
+  aom_free(pred16);
+  aom_free(accum);
+  aom_free(count);
+
+  return diff;
+}
+
+// A constant number, sqrt(pi / 2), used for noise estimation.
+static const double SQRT_PI_BY_2 = 1.25331413732;
+
+double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
+                                            const int plane,
+                                            const int bit_depth) {
+  const int is_y_plane = (plane == 0);
+  const int height = frame->crop_heights[is_y_plane ? 0 : 1];
+  const int width = frame->crop_widths[is_y_plane ? 0 : 1];
+  const int stride = frame->strides[is_y_plane ? 0 : 1];
+  const uint8_t *src = frame->buffers[plane];
+  const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+  const int is_high_bitdepth = is_frame_high_bitdepth(frame);
+
+  int64_t accum = 0;
+  int count = 0;
+  for (int i = 1; i < height - 1; ++i) {
+    for (int j = 1; j < width - 1; ++j) {
+      // Set up a small 3x3 matrix.
+      const int center_idx = i * stride + j;
+      int mat[3][3];
+      for (int ii = -1; ii <= 1; ++ii) {
+        for (int jj = -1; jj <= 1; ++jj) {
+          const int idx = center_idx + ii * stride + jj;
+          mat[ii + 1][jj + 1] = is_high_bitdepth ? src16[idx] : src[idx];
+        }
+      }
+      // Compute Sobel gradients.
+      const int Gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
+                     2 * (mat[1][0] - mat[1][2]);
+      const int Gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
+                     2 * (mat[0][1] - mat[2][1]);
+      const int Ga = ROUND_POWER_OF_TWO(abs(Gx) + abs(Gy), bit_depth - 8);
+      // Accumulate Laplacian.
+      if (Ga < NOISE_ESTIMATION_EDGE_THRESHOLD) {  // Only count smooth pixels.
+        const int v = 4 * mat[1][1] -
+                      2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
+                      (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
+        accum += ROUND_POWER_OF_TWO(abs(v), bit_depth - 8);
+        ++count;
+      }
+    }
+  }
+
+  // Return -1.0 (unreliable estimation) if there are too few smooth pixels.
+  return (count < 16) ? -1.0 : (double)accum / (6 * count) * SQRT_PI_BY_2;
+}
+
+// Estimates the strength for filter weight adjustment, which is used in YUV
+// strategy. This estimation is based on the pre-estimated noise level of the
+// to-filter frame.
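+// For example (hypothetical values): arnr_strength = 5 with a reliable noise
+// level of 1.0 (in [0.75, 1.75)) is first reduced to 4; an active quantizer
+// q = 10 then subtracts AOMMAX(0, (16 - 10) / 2) = 3, leaving 1 before the
+// final clip to [0, group_boost / 300].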
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   noise_level: Noise level of the to-filter frame, estimated with Y-plane.
+//   group_boost: Boost level for the current group of frames.
+// Returns:
+//   Estimated strength which will be used for filter weight adjustment.
+static int tf_estimate_strength(const AV1_COMP *cpi, const double noise_level,
+                                const int group_boost) {
+  int strength = cpi->oxcf.arnr_strength;
+
+  // Adjust the strength based on the estimated noise level.
+  if (noise_level > 0) {        // Adjust when the noise level is reliable.
+    if (noise_level < 0.75) {   // Noise level lies in range (0, 0.75).
+      strength = strength - 2;
+    } else if (noise_level < 1.75) {  // Noise level lies in range [0.75, 1.75).
+      strength = strength - 1;
+    } else if (noise_level < 4.0) {   // Noise level lies in range [1.75, 4.0).
+      strength = strength + 0;
+    } else {  // Noise level lies in range [4.0, +inf).
+      strength = strength + 1;
+    }
+  }
+
+  // Adjust the strength based on active max q.
+  const FRAME_TYPE frame_type =
+      (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME : KEY_FRAME;
+  const int q = (int)av1_convert_qindex_to_q(
+      cpi->rc.avg_frame_qindex[frame_type], cpi->common.seq_params.bit_depth);
+  strength = strength - AOMMAX(0, (16 - q) / 2);
+
+  return CLIP(strength, 0, group_boost / 300);
+}
+
+// Sets up the frame buffer for temporal filtering. Basically, this function
+// determines how many frames will be used for temporal filtering and then
+// groups them into a buffer.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   filter_frame_lookahead_idx: The index of the to-filter frame in the
+//                               lookahead buffer `cpi->lookahead`.
+//   is_second_arf: Whether the to-filter frame is the second ARF. This field
+//                  will affect the number of frames used for filtering.
+//   frames: Pointer to the frame buffer to setup.
+//   num_frames_for_filtering: Number of frames used for filtering.
+//   filter_frame_idx: Index of the to-filter frame in the setup frame buffer.
+// Returns:
+//   Nothing will be returned. But the frame buffer `frames`, number of frames
+//   in the buffer `num_frames_for_filtering`, and the index of the to-filter
+//   frame in the buffer `filter_frame_idx` will be updated in this function.
+static void tf_setup_filtering_buffer(const AV1_COMP *cpi,
+                                      const int filter_frame_lookahead_idx,
+                                      const int is_second_arf,
+                                      YV12_BUFFER_CONFIG **frames,
+                                      int *num_frames_for_filtering,
+                                      int *filter_frame_idx) {
+  int num_frames = 0;          // Number of frames used for filtering.
+  int num_frames_before = -1;  // Number of frames before the to-filter frame.
+  int filter_frame_offset;
+
+  if (filter_frame_lookahead_idx == -1) {  // Key frame.
+    num_frames = TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME;
+    num_frames_before = 0;
+    filter_frame_offset = filter_frame_lookahead_idx;
+  } else if (filter_frame_lookahead_idx < -1) {  // Key frame in one-pass mode.
+    num_frames = TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME;
+    num_frames_before = num_frames - 1;
+    filter_frame_offset = -filter_frame_lookahead_idx;
+  } else {
+    num_frames = cpi->oxcf.arnr_max_frames;
+    if (is_second_arf) {  // Only use 2 neighbours for the second ARF.
+      num_frames = AOMMIN(num_frames, 3);
+    }
+    if (num_frames > cpi->rc.gfu_boost / 150) {
+      num_frames = cpi->rc.gfu_boost / 150;
+      num_frames += !(num_frames & 1);
+    }
+    num_frames_before = AOMMIN(num_frames >> 1, filter_frame_lookahead_idx + 1);
+    const int lookahead_depth =
+        av1_lookahead_depth(cpi->lookahead, cpi->compressor_stage);
+    const int num_frames_after =
+        AOMMIN((num_frames - 1) >> 1,
+               lookahead_depth - filter_frame_lookahead_idx - 1);
+    num_frames = num_frames_before + 1 + num_frames_after;
+    filter_frame_offset = filter_frame_lookahead_idx;
+  }
+  *num_frames_for_filtering = num_frames;
+  *filter_frame_idx = num_frames_before;
+
+  // Setup the frame buffer.
+  for (int frame = 0; frame < num_frames; ++frame) {
+    const int lookahead_idx = frame - num_frames_before + filter_frame_offset;
+    struct lookahead_entry *buf = av1_lookahead_peek(
+        cpi->lookahead, lookahead_idx, cpi->compressor_stage);
+    frames[frame] = (buf == NULL) ? NULL : &buf->img;
+  }
+}
+
+int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
+                        int *show_existing_arf) {
+  // Basic information of the current frame.
+  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const uint8_t group_idx = gf_group->index;
+  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[group_idx];
+  // Filter one more ARF if the lookahead index is leq 7 (w.r.t. 9-th frame).
+  // This frame is ALWAYS a show existing frame.
+  const int is_second_arf = (update_type == INTNL_ARF_UPDATE) &&
+                            (filter_frame_lookahead_idx >= 7) &&
+                            cpi->sf.hl_sf.second_alt_ref_filtering;
+  // TODO(anyone): Currently, we enforce the filtering strength on internal
+  // ARFs except the second ARF to be zero. We should investigate in which case
+  // it is more beneficial to use non-zero strength filtering.
+  if (update_type == INTNL_ARF_UPDATE && !is_second_arf) {
+    return 0;
+  }
+
+  // TODO(yunqing): For INTNL_ARF_UPDATE type, the following me initialization
+  // is used somewhere unexpectedly. Should be resolved later.
+  // Initialize errorperbit, sadperbit16 and sadperbit4.
+  const int rdmult = av1_compute_rd_mult_based_on_qindex(cpi, TF_QINDEX);
+  set_error_per_bit(&cpi->td.mb, rdmult);
+  av1_initialize_me_consts(cpi, &cpi->td.mb, TF_QINDEX);
+  av1_fill_mv_costs(cpi->common.fc,
+                    cpi->common.features.cur_frame_force_integer_mv,
+                    cpi->common.features.allow_high_precision_mv, &cpi->td.mb);
+
+  // Setup frame buffer for filtering.
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
+  int num_frames_for_filtering = 0;
+  int filter_frame_idx = -1;
+  tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, is_second_arf,
+                            frames, &num_frames_for_filtering,
+                            &filter_frame_idx);
+
+  // Estimate noise and strength.
+  const int bit_depth = cpi->common.seq_params.bit_depth;
+  const int num_planes = av1_num_planes(&cpi->common);
+  double noise_levels[MAX_MB_PLANE] = { 0 };
+  for (int plane = 0; plane < num_planes; ++plane) {
+    noise_levels[plane] = av1_estimate_noise_from_single_plane(
+        frames[filter_frame_idx], plane, bit_depth);
+  }
+  const int strength =
+      tf_estimate_strength(cpi, noise_levels[0], cpi->rc.gfu_boost);
+  if (filter_frame_lookahead_idx >= 0) {
+    cpi->common.showable_frame =
+        (strength == 0 && num_frames_for_filtering == 1) || is_second_arf ||
+        (cpi->oxcf.enable_overlay == 0 || cpi->sf.hl_sf.disable_overlay_frames);
+  }
+
+  // Do filtering.
+  const int is_key_frame = (filter_frame_lookahead_idx < 0);
+  FRAME_DIFF diff = { 0, 0 };
+  if (num_frames_for_filtering > 0 && frames[0] != NULL) {
+    // Setup scaling factors. Scaling on each of the arnr frames is not
+    // supported.
+    // ARF is produced at the native frame size and resized when coded.
+    struct scale_factors sf;
+    av1_setup_scale_factors_for_frame(
+        &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+        frames[0]->y_crop_width, frames[0]->y_crop_height);
+    diff = tf_do_filtering(cpi, frames, num_frames_for_filtering,
+                           filter_frame_idx, is_key_frame, is_second_arf,
+                           TF_BLOCK_SIZE, &sf, strength, noise_levels);
+  }
+
+  if (is_key_frame) {  // Key frame should always be filtered.
+    return 1;
+  }
+
+  if ((show_existing_arf != NULL && cpi->sf.hl_sf.adaptive_overlay_encoding) ||
+      is_second_arf) {
+    const int frame_height = frames[filter_frame_idx]->y_crop_height;
+    const int frame_width = frames[filter_frame_idx]->y_crop_width;
+    const int block_height = block_size_high[TF_BLOCK_SIZE];
+    const int block_width = block_size_wide[TF_BLOCK_SIZE];
+    const int mb_rows = get_num_blocks(frame_height, block_height);
+    const int mb_cols = get_num_blocks(frame_width, block_width);
+    const int num_mbs = AOMMAX(1, mb_rows * mb_cols);
+    const float mean = (float)diff.sum / num_mbs;
+    const float std = (float)sqrt((float)diff.sse / num_mbs - mean * mean);
+
+    aom_clear_system_state();
+    // TODO(yunqing): This can be combined with TPL q calculation later.
+    cpi->rc.base_frame_target = gf_group->bit_allocation[group_idx];
+    av1_set_target_rate(cpi, cpi->common.width, cpi->common.height);
+    int top_index = 0;
+    int bottom_index = 0;
+    const int q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cpi->oxcf.width,
+                                           cpi->oxcf.height, group_idx,
+                                           &bottom_index, &top_index);
+    const int ac_q = av1_ac_quant_QTX(q, 0, bit_depth);
+    const float threshold = 0.7f * ac_q * ac_q;
+
+    if (!is_second_arf) {
+      *show_existing_arf = 0;
+      if (mean < threshold && std < mean * 1.2) {
+        *show_existing_arf = 1;
+      }
+      cpi->common.showable_frame |= *show_existing_arf;
+    } else {
+      // Use source frame if the filtered frame becomes very different.
+      if (!(mean < threshold && std < mean * 1.2)) {
+        return 0;
+      }
+    }
+  }
+
+  return 1;
+}
diff --git a/libs/libaom/src/av1/encoder/temporal_filter.h b/libs/libaom/src/av1/encoder/temporal_filter.h
new file mode 100644
index 000000000..5a6bde259
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/temporal_filter.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// TODO(any): These two variables are only used in avx2, sse2, sse4
+// implementations, where the block size is still hard coded. This should be
+// fixed to align with the c implementation.
+#define BH 32
+#define BW 32
+
+// Block size used in temporal filtering.
+#define TF_BLOCK_SIZE BLOCK_32X32
+
+// Window size for YUV temporal filtering.
+// This is particularly used for function `av1_apply_temporal_filter_yuv()`.
+#define TF_YUV_FILTER_WINDOW_LENGTH 3
+// A scale factor used in YUV temporal filtering for weight adjustment.
+#define TF_YUV_FILTER_WEIGHT_SCALE 3
+
+#define TF_ENABLE_PLANEWISE_STRATEGY 1
+// Window size for plane-wise temporal filtering.
+// This is particularly used for function
+// `av1_apply_temporal_filter_planewise()`.
+#define TF_PLANEWISE_FILTER_WINDOW_LENGTH 5
+// A scale factor used in plane-wise temporal filtering to raise the filter
+// weight from `double` with range [0, 1] to `int` with range [0, 1000].
+#define TF_PLANEWISE_FILTER_WEIGHT_SCALE 1000
+
+#define NOISE_ESTIMATION_EDGE_THRESHOLD 50
+// Estimates noise level from a given frame using a single plane (Y, U, or V).
+// This is an adaptation of the method in the following paper:
+// Shen-Chuan Tai, Shih-Ming Yang, "A fast method for image noise
+// estimation using Laplacian operator and adaptive edge detection",
+// Proc. 3rd International Symposium on Communications, Control and
+// Signal Processing, 2008, St Julians, Malta.
+// Inputs:
+//   frame: Pointer to the frame to estimate noise level from.
+//   plane: Index of the plane used for noise estimation. Commonly, 0 for
+//          Y-plane, 1 for U-plane, and 2 for V-plane.
+//   bit_depth: Actual bit-depth instead of the encoding bit-depth of the
+//              frame.
+// Returns:
+//   The estimated noise, or -1.0 if there are too few smooth pixels.
+double av1_estimate_noise_from_single_plane(const YV12_BUFFER_CONFIG *frame,
+                                            const int plane,
+                                            const int bit_depth);
+
+#define TF_QINDEX 128  // Q-index used in temporal filtering.
+#define TF_NUM_FILTERING_FRAMES_FOR_KEY_FRAME 7
+// Performs temporal filtering if needed.
+// NOTE: In this function, the lookahead index is different from the 0-based
+// real index. For example, if we want to filter the first frame in the
+// pre-fetched buffer `cpi->lookahead`, the lookahead index will be -1 instead
+// of 0. More concretely, 0 indicates the first LOOKAHEAD frame, which is the
+// second frame in the pre-fetched buffer. Another example: if we want to
+// filter the 17-th frame, which is an ARF, the lookahead index is 15 instead
+// of 16. Furthermore, a negative number is used for key frames in one-pass
+// mode, where a key frame is filtered with the frames before it instead of
+// after it. For example, -15 means to filter the 17-th frame, which is a key
+// frame in one-pass mode.
+// Inputs:
+//   cpi: Pointer to the composed information of input video.
+//   filter_frame_lookahead_idx: The index of the to-filter frame in the
+//                               lookahead buffer `cpi->lookahead`.
+//   show_existing_arf: Whether to show existing ARF. This field will be
+//                      updated in this function.
+// Returns:
+//   Whether temporal filtering is successfully done.
+int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
+                        int *show_existing_arf);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
diff --git a/libs/libaom/src/av1/encoder/tokenize.c b/libs/libaom/src/av1/encoder/tokenize.c
new file mode 100644
index 000000000..e67415349
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/tokenize.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/entropy.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
+                                 int plane, int calc_rate, int allow_update_cdf,
+                                 FRAME_COUNTS *counts, MapCdf map_pb_cdf) {
+  const uint8_t *const color_map = param->color_map;
+  MapCdf map_cdf = param->map_cdf;
+  ColorCost color_cost = param->color_cost;
+  const int plane_block_width = param->plane_width;
+  const int rows = param->rows;
+  const int cols = param->cols;
+  const int n = param->n_colors;
+  const int palette_size_idx = n - PALETTE_MIN_SIZE;
+  int this_rate = 0;
+  uint8_t color_order[PALETTE_MAX_SIZE];
+
+  (void)plane;
+  (void)counts;
+
+  for (int k = 1; k < rows + cols - 1; ++k) {
+    for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
+      int i = k - j;
+      int color_new_idx;
+      const int color_ctx = av1_get_palette_color_index_context(
+          color_map, plane_block_width, i, j, n, color_order, &color_new_idx);
+      assert(color_new_idx >= 0 && color_new_idx < n);
+      if (calc_rate) {
+        this_rate += (*color_cost)[palette_size_idx][color_ctx][color_new_idx];
+      } else {
+        (*t)->token = color_new_idx;
+        (*t)->color_map_cdf = map_pb_cdf[palette_size_idx][color_ctx];
+        ++(*t);
+        if (allow_update_cdf)
+          update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n);
+#if CONFIG_ENTROPY_STATS
+        if (plane) {
+          ++counts->palette_uv_color_index[palette_size_idx][color_ctx]
+                                          [color_new_idx];
+        } else {
+          ++counts->palette_y_color_index[palette_size_idx][color_ctx]
+                                         [color_new_idx];
+        }
+#endif
+      }
+    }
+  }
+  if (calc_rate) return this_rate;
+  return 0;
+}
+
+static void get_palette_params(const MACROBLOCK *const x, int plane,
+                               BLOCK_SIZE bsize, Av1ColorMapParam *params) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  params->color_map = xd->plane[plane].color_index_map;
+  params->map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
+                          : xd->tile_ctx->palette_y_color_index_cdf;
+  params->color_cost =
+      plane ? &x->palette_uv_color_cost : &x->palette_y_color_cost;
+  params->n_colors = pmi->palette_size[plane];
+  av1_get_block_dimensions(bsize, plane, xd, &params->plane_width, NULL,
+                           &params->rows, &params->cols);
+}
+
+static void get_color_map_params(const MACROBLOCK *const x, int plane,
+                                 BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                 COLOR_MAP_TYPE type,
+                                 Av1ColorMapParam *params) {
+  (void)tx_size;
+  memset(params, 0, sizeof(*params));
+  switch (type) {
+    case PALETTE_MAP: get_palette_params(x, plane, bsize, params); break;
+    default: assert(0 && "Invalid color map type"); return;
+  }
+}
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+                       TX_SIZE tx_size, COLOR_MAP_TYPE type) {
+  assert(plane == 0 || plane == 1);
+  Av1ColorMapParam color_map_params;
+  get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
+  MapCdf map_pb_cdf = plane ? x->tile_pb_ctx->palette_uv_color_index_cdf
+                            : x->tile_pb_ctx->palette_y_color_index_cdf;
+  return cost_and_tokenize_map(&color_map_params, NULL, plane, 1, 0, NULL,
+                               map_pb_cdf);
+}
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+                            TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            COLOR_MAP_TYPE type, int allow_update_cdf,
+                            FRAME_COUNTS *counts) {
+  assert(plane == 0 || plane == 1);
+  Av1ColorMapParam color_map_params;
+  get_color_map_params(x, plane, bsize, tx_size, type, &color_map_params);
+  // The first color index does not use context or entropy.
+  (*t)->token = color_map_params.color_map[0];
+  (*t)->color_map_cdf = NULL;
+  ++(*t);
+  MapCdf map_pb_cdf = plane ? x->tile_pb_ctx->palette_uv_color_index_cdf
+                            : x->tile_pb_ctx->palette_y_color_index_cdf;
+  cost_and_tokenize_map(&color_map_params, t, plane, 0, allow_update_cdf,
+                        counts, map_pb_cdf);
+}
+
+static void tokenize_vartx(ThreadData *td, TX_SIZE tx_size,
+                           BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
+                           int block, int plane, void *arg) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+  const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+  const TX_SIZE plane_tx_size =
+      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
+
+  if (tx_size == plane_tx_size || plane) {
+    plane_bsize = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
+                                       pd->subsampling_y);
+    av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
+                                      plane_bsize, tx_size, arg);
+
+  } else {
+    // Half the block size in transform block unit.
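+    // For example, sub_tx_size_map[TX_32X32] is TX_16X16, so a 32x32
+    // transform recurses into four 16x16 sub-transforms, each visited at the
+    // offsets computed below.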
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int bsw = tx_size_wide_unit[sub_txs];
+    const int bsh = tx_size_high_unit[sub_txs];
+    const int step = bsw * bsh;
+
+    assert(bsw > 0 && bsh > 0);
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+        tokenize_vartx(td, sub_txs, plane_bsize, offsetr, offsetc, block, plane,
+                       arg);
+        block += step;
+      }
+    }
+  }
+}
+
+void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td,
+                           RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+                           uint8_t allow_update_cdf) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_row = xd->mi_row;
+  const int mi_col = xd->mi_col;
+  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols)
+    return;
+
+  const int num_planes = av1_num_planes(cm);
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
+
+  if (mbmi->skip) {
+    av1_reset_entropy_context(xd, bsize, num_planes);
+    return;
+  }
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    if (plane && !xd->is_chroma_ref) break;
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const int ss_x = pd->subsampling_x;
+    const int ss_y = pd->subsampling_y;
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+    assert(plane_bsize < BLOCK_SIZES_ALL);
+    const int mi_width = mi_size_wide[plane_bsize];
+    const int mi_height = mi_size_high[plane_bsize];
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+    const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+    const int bw = mi_size_wide[txb_size];
+    const int bh = mi_size_high[txb_size];
+    int block = 0;
+    const int step =
+        tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+
+    const BLOCK_SIZE max_unit_bsize =
+        get_plane_block_size(BLOCK_64X64, ss_x, ss_y);
+    int mu_blocks_wide = mi_size_wide[max_unit_bsize];
+    int mu_blocks_high = mi_size_high[max_unit_bsize];
+
+    mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
+    mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);
+
+    for (int idy = 0; idy < mi_height; idy += mu_blocks_high) {
+      for (int idx = 0; idx < mi_width; idx += mu_blocks_wide) {
+        const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
+        const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
+        for (int blk_row = idy; blk_row < unit_height; blk_row += bh) {
+          for (int blk_col = idx; blk_col < unit_width; blk_col += bw) {
+            tokenize_vartx(td, max_tx_size, plane_bsize, blk_row, blk_col,
+                           block, plane, &arg);
+            block += step;
+          }
+        }
+      }
+    }
+  }
+  if (rate) *rate += arg.this_rate;
+}
diff --git a/libs/libaom/src/av1/encoder/tokenize.h b/libs/libaom/src/av1/encoder/tokenize.h
new file mode 100644
index 000000000..52caacbae
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/tokenize.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TOKENIZE_H_
+#define AOM_AV1_ENCODER_TOKENIZE_H_
+
+#include "av1/common/entropy.h"
+#include "av1/encoder/block.h"
+#include "aom_dsp/bitwriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  aom_cdf_prob *color_map_cdf;
+  uint8_t token;
+} TOKENEXTRA;
+
+struct AV1_COMP;
+struct ThreadData;
+struct FRAME_COUNTS;
+
+enum {
+  OUTPUT_ENABLED = 0,
+  DRY_RUN_NORMAL,
+  DRY_RUN_COSTCOEFFS,
+} UENUM1BYTE(RUN_TYPE);
+
+struct tokenize_b_args {
+  const struct AV1_COMP *cpi;
+  struct ThreadData *td;
+  int this_rate;
+  uint8_t allow_update_cdf;
+  RUN_TYPE dry_run;
+};
+
+// Note: in all the tokenize functions, rate (if non-NULL) is incremented
+// with the coefficient token cost only if dry_run == DRY_RUN_COSTCOEFFS;
+// otherwise rate is not incremented.
+void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
+                           RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+                           uint8_t allow_update_cdf);
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+                       TX_SIZE tx_size, COLOR_MAP_TYPE type);
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+                            TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            COLOR_MAP_TYPE type, int allow_update_cdf,
+                            struct FRAME_COUNTS *counts);
+
+static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
+                                 TX_SIZE tx_size) {
+  const int eob_max = av1_get_max_eob(tx_size);
+  return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_TOKENIZE_H_
diff --git a/libs/libaom/src/av1/encoder/tpl_model.c b/libs/libaom/src/av1/encoder/tpl_model.c
new file mode 100644
index 000000000..79b94f373
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/tpl_model.c
@@ -0,0 +1,1189 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
+ */
+
+#include <stdint.h>
+#include <float.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tpl_model.h"
+
+static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane,
+                                          const tran_low_t *coeff,
+                                          tran_low_t *qcoeff,
+                                          tran_low_t *dqcoeff, TX_SIZE tx_size,
+                                          uint16_t *eob, int64_t *recon_error,
+                                          int64_t *sse) {
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
+  int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+  const int shift = tx_size == TX_32X32 ? 0 : 2;
+
+  av1_quantize_fp(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+                  p->quant_shift_QTX, qcoeff, dqcoeff, p->dequant_QTX, eob,
+                  scan_order->scan, scan_order->iscan);
+
+  *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+  *recon_error = AOMMAX(*recon_error, 1);
+
+  *sse = (*sse) >> shift;
+  *sse = AOMMAX(*sse, 1);
+}
+
+static AOM_INLINE void tpl_fwd_txfm(const int16_t *src_diff, int bw,
+                                    tran_low_t *coeff, TX_SIZE tx_size,
+                                    int bit_depth, int is_hbd) {
+  TxfmParam txfm_param;
+  txfm_param.tx_type = DCT_DCT;
+  txfm_param.tx_size = tx_size;
+  txfm_param.lossless = 0;
+  txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+
+  txfm_param.bd = bit_depth;
+  txfm_param.is_hbd = is_hbd;
+  av1_fwd_txfm(src_diff, coeff, bw, &txfm_param);
+}
+
+static AOM_INLINE int64_t tpl_get_satd_cost(const MACROBLOCK *x,
+                                            int16_t *src_diff, int diff_stride,
+                                            const uint8_t *src, int src_stride,
+                                            const uint8_t *dst, int dst_stride,
+                                            tran_low_t *coeff, int bw, int bh,
+                                            TX_SIZE tx_size) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const int pix_num = bw * bh;
+
+  av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
+                     dst_stride);
+  tpl_fwd_txfm(src_diff, bw, coeff, tx_size, xd->bd, is_cur_buf_hbd(xd));
+  return aom_satd(coeff, pix_num);
+}
+
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+  const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
+
+  assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+
+  int rate_cost = 1;
+
+  for (int idx = 0; idx < eob; ++idx) {
+    int abs_level = abs(qcoeff[scan_order->scan[idx]]);
+    rate_cost += (int)(log(abs_level + 1.0) / log(2.0)) + 1;
+  }
+
+  return (rate_cost << AV1_PROB_COST_SHIFT);
+}
+
+static AOM_INLINE void txfm_quant_rdcost(
+    const MACROBLOCK *x, int16_t *src_diff, int diff_stride, uint8_t *src,
+    int src_stride, uint8_t *dst, int dst_stride, tran_low_t *coeff,
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size,
+    int *rate_cost, int64_t *recon_error, int64_t *sse) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  uint16_t eob;
+  av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
+                     dst_stride);
+  tpl_fwd_txfm(src_diff, diff_stride, coeff, tx_size, xd->bd,
+               is_cur_buf_hbd(xd));
+
+  get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error,
+                     sse);
+
+  *rate_cost = rate_estimator(qcoeff, eob, tx_size);
+
+  av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst,
+                              dst_stride, eob, 0);
+}
+
+static uint32_t motion_estimation(AV1_COMP *cpi, MACROBLOCK *x,
+                                  uint8_t *cur_frame_buf,
+                                  uint8_t *ref_frame_buf, int stride,
+                                  int stride_ref, BLOCK_SIZE bsize,
+                                  MV center_mv, int_mv *best_mv) {
+  AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TPL_SPEED_FEATURES *tpl_sf = &cpi->sf.tpl_sf;
+  int step_param;
+  uint32_t bestsme = UINT_MAX;
+  int distortion;
+  uint32_t sse;
+  int cost_list[5];
+  FULLPEL_MV start_mv = get_fullmv_from_mv(&center_mv);
+
+  // Setup frame pointers
+  x->plane[0].src.buf = cur_frame_buf;
+  x->plane[0].src.stride = stride;
+  xd->plane[0].pre[0].buf = ref_frame_buf;
+  xd->plane[0].pre[0].stride = stride_ref;
+
+  step_param = tpl_sf->reduce_first_step_size;
+  step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+  search_site_config *ss_cfg = &cpi->mv_search_params.ss_cfg[SS_CFG_SRC];
+  if (ss_cfg->stride != stride_ref)
+    ss_cfg = &cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
+
+  assert(ss_cfg->stride == stride_ref);
+
+  FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
+  av1_make_default_fullpel_ms_params(&full_ms_params, cpi, x, bsize,
+                                     &center_mv, ss_cfg);
+
+  av1_full_pixel_search(start_mv, &full_ms_params, step_param,
+                        cond_cost_list(cpi, cost_list), &best_mv->as_fullmv,
+                        NULL);
+
+  SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+  av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize, &center_mv,
+                                    cost_list);
+  ms_params.forced_stop = tpl_sf->subpel_force_stop;
+  ms_params.var_params.subpel_search_type = USE_2_TAPS;
+  ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
+  MV subpel_start_mv = get_mv_from_fullmv(&best_mv->as_fullmv);
+  bestsme = cpi->mv_search_params.find_fractional_mv_step(
+      xd, cm, &ms_params, subpel_start_mv, &best_mv->as_mv, &distortion, &sse,
+      NULL);
+
+  return bestsme;
+}
+
+static int is_alike_mv(int_mv candidate_mv, int_mv *center_mvs,
+                       int center_mvs_count, int skip_alike_starting_mv) {
+  // MV difference threshold is in 1/8 precision.
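+  // E.g. skip_alike_starting_mv == 1 selects thr = (8 << 3) = 64, so a
+  // candidate within 8 full pels of an existing center MV on both axes is
+  // treated as a duplicate and rejected.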
+  const int mv_diff_thr[3] = { 1, (8 << 3), (16 << 3) };
+  int thr = mv_diff_thr[skip_alike_starting_mv];
+  int i;
+
+  for (i = 0; i < center_mvs_count; i++) {
+    if (abs(center_mvs[i].as_mv.col - candidate_mv.as_mv.col) < thr &&
+        abs(center_mvs[i].as_mv.row - candidate_mv.as_mv.row) < thr)
+      return 1;
+  }
+
+  return 0;
+}
+
+static AOM_INLINE void mode_estimation(
+    AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, struct scale_factors *sf,
+    int frame_idx, int mi_row, int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
+    const YV12_BUFFER_CONFIG *ref_frame[],
+    const YV12_BUFFER_CONFIG *src_ref_frame[], TplDepStats *tpl_stats) {
+  AV1_COMMON *cm = &cpi->common;
+  const GF_GROUP *gf_group = &cpi->gf_group;
+
+  (void)gf_group;
+
+  TplParams *tpl_data = &cpi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
+  const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
+
+  const int bw = 4 << mi_size_wide_log2[bsize];
+  const int bh = 4 << mi_size_high_log2[bsize];
+  const int_interpfilters kernel =
+      av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+  int64_t best_intra_cost = INT64_MAX;
+  int64_t intra_cost;
+  PREDICTION_MODE best_mode = DC_PRED;
+
+  int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+  uint8_t *src_mb_buffer = xd->cur_buf->y_buffer + mb_y_offset;
+  const int src_stride = xd->cur_buf->y_stride;
+
+  const int dst_mb_offset =
+      mi_row * MI_SIZE * tpl_frame->rec_picture->y_stride + mi_col * MI_SIZE;
+  uint8_t *dst_buffer = tpl_frame->rec_picture->y_buffer + dst_mb_offset;
+  const int dst_buffer_stride = tpl_frame->rec_picture->y_stride;
+
+  // Temporary buffers
+  DECLARE_ALIGNED(32, uint8_t, predictor8[MC_FLOW_NUM_PELS * 2]);
+  DECLARE_ALIGNED(32, int16_t, src_diff[MC_FLOW_NUM_PELS]);
+  DECLARE_ALIGNED(32, tran_low_t, coeff[MC_FLOW_NUM_PELS]);
+  DECLARE_ALIGNED(32, tran_low_t, qcoeff[MC_FLOW_NUM_PELS]);
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MC_FLOW_NUM_PELS]);
+  DECLARE_ALIGNED(32, tran_low_t, best_coeff[MC_FLOW_NUM_PELS]);
+  uint8_t *predictor =
+      is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
+  int64_t recon_error = 1, sse = 1;
+
+  memset(tpl_stats, 0, sizeof(*tpl_stats));
+
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+                        mi_row, mi_col);
+  set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+                 cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+  set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
+               av1_num_planes(cm));
+  xd->mi[0]->sb_type = bsize;
+  xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+
+  // Intra prediction search
+  xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+  // Pre-load the bottom left line.
+  if (xd->left_available &&
+      mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) {
+#if CONFIG_AV1_HIGHBITDEPTH
+    if (is_cur_buf_hbd(xd)) {
+      uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer);
+      for (int i = 0; i < bw; ++i)
+        dst[(bw + i) * dst_buffer_stride - 1] =
+            dst[(bw - 1) * dst_buffer_stride - 1];
+    } else {
+      for (int i = 0; i < bw; ++i)
+        dst_buffer[(bw + i) * dst_buffer_stride - 1] =
+            dst_buffer[(bw - 1) * dst_buffer_stride - 1];
+    }
+#else
+    for (int i = 0; i < bw; ++i)
+      dst_buffer[(bw + i) * dst_buffer_stride - 1] =
+          dst_buffer[(bw - 1) * dst_buffer_stride - 1];
+#endif
+  }
+
+  // If cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED,
+  // H_PRED, and V_PRED.
+  const PREDICTION_MODE last_intra_mode =
+      cpi->sf.tpl_sf.prune_intra_modes ?
D45_PRED : INTRA_MODE_END; + for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode; + ++mode) { + av1_predict_intra_block(cm, xd, block_size_wide[bsize], + block_size_high[bsize], tx_size, mode, 0, 0, + FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, + predictor, bw, 0, 0, 0); + + intra_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + + if (intra_cost < best_intra_cost) { + best_intra_cost = intra_cost; + best_mode = mode; + } + } + + // Motion compensated prediction + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + int best_rf_idx = -1; + int_mv best_mv; + int64_t inter_cost; + int64_t best_inter_cost = INT64_MAX; + int rf_idx; + + best_mv.as_int = INVALID_MV; + + for (rf_idx = 0; rf_idx < INTER_REFS_PER_FRAME; ++rf_idx) { + if (ref_frame[rf_idx] == NULL || src_ref_frame[rf_idx] == NULL) { + tpl_stats->mv[rf_idx].as_int = INVALID_MV; + continue; + } + + const YV12_BUFFER_CONFIG *ref_frame_ptr = src_ref_frame[rf_idx]; + int ref_mb_offset = + mi_row * MI_SIZE * ref_frame_ptr->y_stride + mi_col * MI_SIZE; + uint8_t *ref_mb = ref_frame_ptr->y_buffer + ref_mb_offset; + int ref_stride = ref_frame_ptr->y_stride; + + int_mv best_rfidx_mv = { 0 }; + uint32_t bestsme = UINT32_MAX; + + int_mv center_mvs[4] = { { 0 } }; + int refmv_count = 1; + + if (xd->up_available) { + TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row - mi_height, mi_col, tpl_frame->stride, block_mis_log2)]; + if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, + cpi->sf.tpl_sf.skip_alike_starting_mv)) { + center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int; + ++refmv_count; + } + } + + if (xd->left_available) { + TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row, mi_col - mi_width, tpl_frame->stride, block_mis_log2)]; + if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, + cpi->sf.tpl_sf.skip_alike_starting_mv)) { + center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int; + ++refmv_count; + } + } + + if (xd->up_available && mi_col + mi_width < xd->tile.mi_col_end) { + TplDepStats *ref_tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row - mi_height, mi_col + mi_width, tpl_frame->stride, + block_mis_log2)]; + if (!is_alike_mv(ref_tpl_stats->mv[rf_idx], center_mvs, refmv_count, + cpi->sf.tpl_sf.skip_alike_starting_mv)) { + center_mvs[refmv_count].as_int = ref_tpl_stats->mv[rf_idx].as_int; + ++refmv_count; + } + } + + for (int idx = 0; idx < refmv_count; ++idx) { + int_mv this_mv; + uint32_t thissme = + motion_estimation(cpi, x, src_mb_buffer, ref_mb, src_stride, + ref_stride, bsize, center_mvs[idx].as_mv, &this_mv); + + if (thissme < bestsme) { + bestsme = thissme; + best_rfidx_mv = this_mv; + } + } + + tpl_stats->mv[rf_idx].as_int = best_rfidx_mv.as_int; + + struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer, + ref_frame_ptr->y_width, ref_frame_ptr->y_height, + ref_frame_ptr->y_stride }; + InterPredParams inter_pred_params; + av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE, + mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, + sf, &ref_buf, kernel); + inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd); + + av1_enc_build_one_inter_predictor(predictor, bw, &best_rfidx_mv.as_mv, + &inter_pred_params); + + inter_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride, + predictor, bw, coeff, bw, bh, tx_size); + // Store inter cost for each ref frame + 
tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost); + + if (inter_cost < best_inter_cost) { + memcpy(best_coeff, coeff, sizeof(best_coeff)); + best_rf_idx = rf_idx; + + best_inter_cost = inter_cost; + best_mv.as_int = best_rfidx_mv.as_int; + if (best_inter_cost < best_intra_cost) { + best_mode = NEWMV; + xd->mi[0]->ref_frame[0] = best_rf_idx + LAST_FRAME; + xd->mi[0]->mv[0].as_int = best_mv.as_int; + } + } + } + + if (best_inter_cost < INT64_MAX) { + uint16_t eob; + get_quantize_error(x, 0, best_coeff, qcoeff, dqcoeff, tx_size, &eob, + &recon_error, &sse); + + const int rate_cost = rate_estimator(qcoeff, eob, tx_size); + tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + } + + best_intra_cost = AOMMAX(best_intra_cost, 1); + if (frame_idx == 0) { + best_inter_cost = 0; + } else { + best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost); + } + tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2; + + tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2); + + // Final encode + if (is_inter_mode(best_mode)) { + const YV12_BUFFER_CONFIG *ref_frame_ptr = ref_frame[best_rf_idx]; + + InterPredParams inter_pred_params; + struct buf_2d ref_buf = { NULL, ref_frame_ptr->y_buffer, + ref_frame_ptr->y_width, ref_frame_ptr->y_height, + ref_frame_ptr->y_stride }; + av1_init_inter_params(&inter_pred_params, bw, bh, mi_row * MI_SIZE, + mi_col * MI_SIZE, 0, 0, xd->bd, is_cur_buf_hbd(xd), 0, + sf, &ref_buf, kernel); + inter_pred_params.conv_params = get_conv_params(0, 0, xd->bd); + + av1_enc_build_one_inter_predictor(dst_buffer, dst_buffer_stride, + &best_mv.as_mv, &inter_pred_params); + } else { + av1_predict_intra_block(cm, xd, block_size_wide[bsize], + block_size_high[bsize], tx_size, best_mode, 0, 0, + FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, + dst_buffer, dst_buffer_stride, 0, 0, 0); + } + + int rate_cost; + txfm_quant_rdcost(x, src_diff, bw, src_mb_buffer, src_stride, dst_buffer, + dst_buffer_stride, coeff, qcoeff, dqcoeff, bw, bh, tx_size, + &rate_cost, &recon_error, &sse); + + tpl_stats->recrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2); + tpl_stats->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + if (!is_inter_mode(best_mode)) { + tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2); + tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + } + tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist); + tpl_stats->recrf_rate = AOMMAX(tpl_stats->srcrf_rate, tpl_stats->recrf_rate); + + if (best_rf_idx >= 0) { + tpl_stats->mv[best_rf_idx].as_int = best_mv.as_int; + tpl_stats->ref_frame_index = best_rf_idx; + } + + for (int idy = 0; idy < mi_height; ++idy) { + for (int idx = 0; idx < mi_width; ++idx) { + if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > idx && + (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > idy) { + xd->mi[idx + idy * cm->mi_params.mi_stride] = xd->mi[0]; + } + } + } +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << mi_size_wide_log2[bsize]; + int bh = 4 << mi_size_high_log2[bsize]; + + switch (block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = 
grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) { + return (mi_row >> right_shift) * stride + (mi_col >> right_shift); +} + +static int64_t delta_rate_cost(int64_t delta_rate, int64_t recrf_dist, + int64_t srcrf_dist, int pix_num) { + double beta = (double)srcrf_dist / recrf_dist; + int64_t rate_cost = delta_rate; + + if (srcrf_dist <= 128) return rate_cost; + + double dr = + (double)(delta_rate >> (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT)) / + pix_num; + + double log_den = log(beta) / log(2.0) + 2.0 * dr; + + if (log_den > log(10.0) / log(2.0)) { + rate_cost = (int64_t)((log(1.0 / beta) * pix_num) / log(2.0) / 2.0); + rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT); + return rate_cost; + } + + double num = pow(2.0, log_den); + double den = num * beta + (1 - beta) * beta; + + rate_cost = (int64_t)((pix_num * log(num / den)) / log(2.0) / 2.0); + + rate_cost <<= (TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT); + + return rate_cost; +} + +static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row, + int mi_col, const BLOCK_SIZE bsize, + int frame_idx) { + TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx]; + TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr; + TplDepFrame *tpl_frame = tpl_data->tpl_frame; + const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2; + TplDepStats *tpl_stats_ptr = &tpl_ptr[av1_tpl_ptr_pos( + mi_row, mi_col, tpl_frame->stride, block_mis_log2)]; + + if (tpl_stats_ptr->ref_frame_index < 0) return; + const int ref_frame_index = tpl_stats_ptr->ref_frame_index; + TplDepFrame *ref_tpl_frame = + &tpl_frame[tpl_frame[frame_idx].ref_map_index[ref_frame_index]]; + TplDepStats *ref_stats_ptr = ref_tpl_frame->tpl_stats_ptr; + + if (tpl_frame[frame_idx].ref_map_index[ref_frame_index] < 0) return; + + const FULLPEL_MV full_mv = + get_fullmv_from_mv(&tpl_stats_ptr->mv[ref_frame_index].as_mv); + const int ref_pos_row = mi_row * MI_SIZE + full_mv.row; + const int ref_pos_col = mi_col * MI_SIZE + full_mv.col; + + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + int64_t cur_dep_dist = tpl_stats_ptr->recrf_dist - tpl_stats_ptr->srcrf_dist; + int64_t mc_dep_dist = (int64_t)( + tpl_stats_ptr->mc_dep_dist * + ((double)(tpl_stats_ptr->recrf_dist - tpl_stats_ptr->srcrf_dist) / + tpl_stats_ptr->recrf_dist)); + int64_t delta_rate = tpl_stats_ptr->recrf_rate - tpl_stats_ptr->srcrf_rate; + int64_t mc_dep_rate = + delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist, + tpl_stats_ptr->srcrf_dist, pix_num); + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < 
ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + const int step = 1 << block_mis_log2; + + for (int idy = 0; idy < mi_height; idy += step) { + for (int idx = 0; idx < mi_width; idx += step) { + TplDepStats *des_stats = &ref_stats_ptr[av1_tpl_ptr_pos( + ref_mi_row + idy, ref_mi_col + idx, ref_tpl_frame->stride, + block_mis_log2)]; + des_stats->mc_dep_dist += + ((cur_dep_dist + mc_dep_dist) * overlap_area) / pix_num; + des_stats->mc_dep_rate += + ((delta_rate + mc_dep_rate) * overlap_area) / pix_num; + + assert(overlap_area >= 0); + } + } + } + } +} + +static AOM_INLINE void tpl_model_update(TplParams *const tpl_data, int mi_row, + int mi_col, const BLOCK_SIZE bsize, + int frame_idx) { + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const BLOCK_SIZE tpl_block_size = + convert_length_to_bsize(MI_SIZE << tpl_data->tpl_stats_block_mis_log2); + + for (int idy = 0; idy < mi_height; idy += step) { + for (int idx = 0; idx < mi_width; idx += step) { + tpl_model_update_b(tpl_data, mi_row + idy, mi_col + idx, tpl_block_size, + frame_idx); + } + } +} + +static AOM_INLINE void tpl_model_store(TplDepStats *tpl_stats_ptr, int mi_row, + int mi_col, BLOCK_SIZE bsize, int stride, + const TplDepStats *src_stats, + uint8_t block_mis_log2) { + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + const int step = 1 << block_mis_log2; + + int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width); + int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width); + int64_t srcrf_dist = src_stats->srcrf_dist / (mi_height * mi_width); + int64_t recrf_dist = src_stats->recrf_dist / (mi_height * mi_width); + int64_t srcrf_rate = src_stats->srcrf_rate / (mi_height * mi_width); + int64_t recrf_rate = src_stats->recrf_rate / (mi_height * mi_width); + + intra_cost = AOMMAX(1, intra_cost); + inter_cost = AOMMAX(1, inter_cost); + srcrf_dist = AOMMAX(1, srcrf_dist); + recrf_dist = AOMMAX(1, recrf_dist); + srcrf_rate = AOMMAX(1, srcrf_rate); + recrf_rate = AOMMAX(1, recrf_rate); + + for (int idy = 0; idy < mi_height; idy += step) { + TplDepStats *tpl_ptr = &tpl_stats_ptr[av1_tpl_ptr_pos( + mi_row + idy, mi_col, stride, block_mis_log2)]; + for (int idx = 0; idx < mi_width; idx += step) { + tpl_ptr->intra_cost = intra_cost; + tpl_ptr->inter_cost = inter_cost; + tpl_ptr->srcrf_dist = srcrf_dist; + tpl_ptr->recrf_dist = recrf_dist; + tpl_ptr->srcrf_rate = srcrf_rate; + tpl_ptr->recrf_rate = recrf_rate; + memcpy(tpl_ptr->mv, src_stats->mv, sizeof(tpl_ptr->mv)); + memcpy(tpl_ptr->pred_error, src_stats->pred_error, + sizeof(tpl_ptr->pred_error)); + tpl_ptr->ref_frame_index = src_stats->ref_frame_index; + ++tpl_ptr; + } + } +} + +static AOM_INLINE void mc_flow_dispenser(AV1_COMP *cpi, int frame_idx, + int pframe_qindex) { + const GF_GROUP *gf_group = &cpi->gf_group; + if (frame_idx == gf_group->size) return; + TplParams *const tpl_data = &cpi->tpl_data; + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture; + const YV12_BUFFER_CONFIG *ref_frame[7] = { NULL, NULL, NULL, NULL, + NULL, NULL, NULL }; + const YV12_BUFFER_CONFIG 
*ref_frames_ordered[INTER_REFS_PER_FRAME]; + int ref_frame_flags; + const YV12_BUFFER_CONFIG *src_frame[7] = { NULL, NULL, NULL, NULL, + NULL, NULL, NULL }; + + AV1_COMMON *cm = &cpi->common; + const CommonModeInfoParams *const mi_params = &cm->mi_params; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + const BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D); + av1_tile_init(&xd->tile, cm, 0, 0); + + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + + // Setup scaling factor + av1_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); + + xd->cur_buf = this_frame; + + for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { + ref_frame[idx] = + tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]].rec_picture; + src_frame[idx] = + tpl_data->tpl_frame[tpl_frame->ref_map_index[idx]].gf_picture; + } + + // Store the reference frames based on priority order + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + ref_frames_ordered[i] = ref_frame[ref_frame_priority_order[i] - 1]; + } + + // Work out which reference frame slots may be used. + ref_frame_flags = get_ref_frame_flags(&cpi->sf, ref_frames_ordered, + cpi->ext_flags.ref_frame_flags); + + enforce_max_ref_frames(cpi, &ref_frame_flags); + + // Prune reference frames + for (idx = 0; idx < INTER_REFS_PER_FRAME; ++idx) { + if ((ref_frame_flags & (1 << idx)) == 0) { + ref_frame[idx] = NULL; + } + } + + // Make a temporary mbmi for tpl model + MB_MODE_INFO mbmi; + memset(&mbmi, 0, sizeof(mbmi)); + MB_MODE_INFO *mbmi_ptr = &mbmi; + xd->mi = &mbmi_ptr; + + xd->block_ref_scale_factors[0] = &sf; + + const int base_qindex = pframe_qindex; + // Get rd multiplier set up. + rdmult = (int)av1_compute_rd_mult(cpi, base_qindex); + if (rdmult < 1) rdmult = 1; + set_error_per_bit(x, rdmult); + av1_initialize_me_consts(cpi, x, base_qindex); + + tpl_frame->is_valid = 1; + + cm->quant_params.base_qindex = base_qindex; + av1_frame_init_quantizer(cpi); + + tpl_frame->base_rdmult = + av1_compute_rd_mult_based_on_qindex(cpi, pframe_qindex) / 6; + + for (mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) { + // Motion estimation row boundary + av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height, + cpi->oxcf.border_in_pixels); + xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE); + xd->mb_to_bottom_edge = + GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE); + for (mi_col = 0; mi_col < mi_params->mi_cols; mi_col += mi_width) { + TplDepStats tpl_stats; + + // Motion estimation column boundary + av1_set_mv_col_limits(mi_params, &x->mv_limits, mi_col, mi_width, + cpi->oxcf.border_in_pixels); + xd->mb_to_left_edge = -GET_MV_SUBPEL(mi_col * MI_SIZE); + xd->mb_to_right_edge = + GET_MV_SUBPEL(mi_params->mi_cols - mi_width - mi_col); + mode_estimation(cpi, x, xd, &sf, frame_idx, mi_row, mi_col, bsize, + tx_size, ref_frame, src_frame, &tpl_stats); + + // Motion flow dependency dispenser. 
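+      // tpl_model_store() below copies the block's stats (intra/inter cost,
+      // source/recon rate and distortion, MVs) into every
+      // (1 << tpl_stats_block_mis_log2)-mi unit the block covers, with the
+      // costs normalized to per-unit values.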
+      tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
+                      tpl_frame->stride, &tpl_stats,
+                      tpl_data->tpl_stats_block_mis_log2);
+    }
+  }
+}
+
+static void mc_flow_synthesizer(AV1_COMP *cpi, int frame_idx) {
+  AV1_COMMON *cm = &cpi->common;
+
+  const GF_GROUP *gf_group = &cpi->gf_group;
+  if (frame_idx == gf_group->size) return;
+
+  TplParams *const tpl_data = &cpi->tpl_data;
+
+  const BLOCK_SIZE bsize = convert_length_to_bsize(MC_FLOW_BSIZE_1D);
+  const int mi_height = mi_size_high[bsize];
+  const int mi_width = mi_size_wide[bsize];
+
+  for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += mi_height) {
+    for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += mi_width) {
+      if (frame_idx) {
+        tpl_model_update(tpl_data, mi_row, mi_col, bsize, frame_idx);
+      }
+    }
+  }
+}
+
+static AOM_INLINE void init_gop_frames_for_tpl(
+    AV1_COMP *cpi, const EncodeFrameParams *const init_frame_params,
+    GF_GROUP *gf_group, int gop_eval, int *tpl_group_frames,
+    const EncodeFrameInput *const frame_input, int *pframe_qindex) {
+  AV1_COMMON *cm = &cpi->common;
+  int cur_frame_idx = gf_group->index;
+  *pframe_qindex = 0;
+
+  RefBufferStack ref_buffer_stack = cpi->ref_buffer_stack;
+  EncodeFrameParams frame_params = *init_frame_params;
+  TplParams *const tpl_data = &cpi->tpl_data;
+
+  int ref_picture_map[REF_FRAMES];
+
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    if (frame_params.frame_type == KEY_FRAME || gop_eval) {
+      tpl_data->tpl_frame[-i - 1].gf_picture = NULL;
+      tpl_data->tpl_frame[-i - 1].rec_picture = NULL;
+      tpl_data->tpl_frame[-i - 1].frame_display_index = 0;
+    } else {
+      tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf;
+      tpl_data->tpl_frame[-i - 1].rec_picture = &cm->ref_frame_map[i]->buf;
+      tpl_data->tpl_frame[-i - 1].frame_display_index =
+          cm->ref_frame_map[i]->display_order_hint;
+    }
+
+    ref_picture_map[i] = -i - 1;
+  }
+
+  *tpl_group_frames = cur_frame_idx;
+
+  int gf_index;
+  int use_arf = gf_group->update_type[1] == ARF_UPDATE;
+  int anc_frame_offset = gf_group->cur_frame_idx[cur_frame_idx] + 1;
+  int process_frame_count = 0;
+  const int gop_length =
+      AOMMIN(gf_group->size - 1 + use_arf, MAX_LENGTH_TPL_FRAME_STATS - 1);
+  for (gf_index = cur_frame_idx; gf_index <= gop_length; ++gf_index) {
+    TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
+    FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index];
+
+    frame_params.show_frame = frame_update_type != ARF_UPDATE &&
+                              frame_update_type != INTNL_ARF_UPDATE;
+    frame_params.show_existing_frame =
+        frame_update_type == INTNL_OVERLAY_UPDATE ||
+        frame_update_type == OVERLAY_UPDATE;
+    frame_params.frame_type =
+        frame_update_type == KF_UPDATE ? KEY_FRAME : INTER_FRAME;
+
+    if (frame_update_type == LF_UPDATE)
+      *pframe_qindex = gf_group->q_val[gf_index];
+
+    if (gf_index == cur_frame_idx) {
+      tpl_frame->gf_picture = frame_input->source;
+      // frame display index = frame offset within the gf group + start frame
+      // of the gf group
+      tpl_frame->frame_display_index =
+          gf_group->frame_disp_idx[gf_index] +
+          cpi->common.current_frame.display_order_hint;
+    } else {
+      int frame_display_index = gf_index == gf_group->size
+                                    ?
cpi->rc.baseline_gf_interval + : gf_group->frame_disp_idx[gf_index]; + struct lookahead_entry *buf = av1_lookahead_peek( + cpi->lookahead, frame_display_index - anc_frame_offset, + cpi->compressor_stage); + if (buf == NULL) break; + tpl_frame->gf_picture = &buf->img; + // frame display index = frame offset within the gf group + start frame of + // the gf group + tpl_frame->frame_display_index = + frame_display_index + cpi->common.current_frame.display_order_hint; + } + + if (frame_update_type != OVERLAY_UPDATE && + frame_update_type != INTNL_OVERLAY_UPDATE) { + tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count]; + tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count]; + ++process_frame_count; + } + + av1_get_ref_frames(cpi, &ref_buffer_stack); + int refresh_mask = av1_get_refresh_frame_flags( + cpi, &frame_params, frame_update_type, &ref_buffer_stack); + + int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); + av1_update_ref_frame_map(cpi, frame_update_type, + frame_params.show_existing_frame, + refresh_frame_map_index, &ref_buffer_stack); + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + tpl_frame->ref_map_index[i - LAST_FRAME] = + ref_picture_map[cm->remapped_ref_idx[i - LAST_FRAME]]; + + if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index; + + ++*tpl_group_frames; + } + + if (cur_frame_idx == 0) return; + + int extend_frame_count = 0; + int extend_frame_length = + AOMMIN(cpi->rc.baseline_gf_interval, + cpi->rc.frames_to_key - cpi->rc.baseline_gf_interval); + int frame_display_index = cpi->rc.baseline_gf_interval + 1; + + for (; gf_index < MAX_LENGTH_TPL_FRAME_STATS && + extend_frame_count < extend_frame_length; + ++gf_index) { + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index]; + FRAME_UPDATE_TYPE frame_update_type = LF_UPDATE; + frame_params.show_frame = frame_update_type != ARF_UPDATE && + frame_update_type != INTNL_ARF_UPDATE; + frame_params.show_existing_frame = + frame_update_type == INTNL_OVERLAY_UPDATE; + frame_params.frame_type = INTER_FRAME; + + struct lookahead_entry *buf = av1_lookahead_peek( + cpi->lookahead, frame_display_index - anc_frame_offset, + cpi->compressor_stage); + + if (buf == NULL) break; + + tpl_frame->gf_picture = &buf->img; + tpl_frame->rec_picture = &tpl_data->tpl_rec_pool[process_frame_count]; + tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count]; + ++process_frame_count; + + // frame display index = frame offset within the gf group + start frame of + // the gf group + tpl_frame->frame_display_index = + frame_display_index + cpi->common.current_frame.display_order_hint; + + gf_group->update_type[gf_index] = LF_UPDATE; + gf_group->q_val[gf_index] = *pframe_qindex; + + av1_get_ref_frames(cpi, &ref_buffer_stack); + int refresh_mask = av1_get_refresh_frame_flags( + cpi, &frame_params, frame_update_type, &ref_buffer_stack); + int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask); + av1_update_ref_frame_map(cpi, frame_update_type, + frame_params.show_existing_frame, + refresh_frame_map_index, &ref_buffer_stack); + + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) + tpl_frame->ref_map_index[i - LAST_FRAME] = + ref_picture_map[cm->remapped_ref_idx[i - LAST_FRAME]]; + + tpl_frame->ref_map_index[ALTREF_FRAME - LAST_FRAME] = -1; + tpl_frame->ref_map_index[LAST3_FRAME - LAST_FRAME] = -1; + tpl_frame->ref_map_index[BWDREF_FRAME - LAST_FRAME] = -1; + tpl_frame->ref_map_index[ALTREF2_FRAME - LAST_FRAME] = -1; + + if (refresh_mask) 
ref_picture_map[refresh_frame_map_index] = gf_index; + + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_display_index; + } + + av1_get_ref_frames(cpi, &cpi->ref_buffer_stack); +} + +static AOM_INLINE void init_tpl_stats(TplParams *const tpl_data) { + for (int frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) { + TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx]; + memset(tpl_data->tpl_stats_pool[frame_idx], 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, + const EncodeFrameParams *const frame_params, + const EncodeFrameInput *const frame_input) { + AV1_COMMON *cm = &cpi->common; + GF_GROUP *gf_group = &cpi->gf_group; + int bottom_index, top_index; + EncodeFrameParams this_frame_params = *frame_params; + TplParams *const tpl_data = &cpi->tpl_data; + + if (cpi->superres_mode != SUPERRES_NONE) return 0; + + cm->current_frame.frame_type = frame_params->frame_type; + for (int gf_index = gf_group->index; gf_index < gf_group->size; ++gf_index) { + av1_configure_buffer_updates(cpi, &this_frame_params, + gf_group->update_type[gf_index], 0); + + cpi->refresh_golden_frame = this_frame_params.refresh_golden_frame; + cpi->refresh_bwd_ref_frame = this_frame_params.refresh_bwd_ref_frame; + cpi->refresh_alt_ref_frame = this_frame_params.refresh_alt_ref_frame; + + cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE && + gf_group->update_type[gf_index] != INTNL_ARF_UPDATE; + + gf_group->q_val[gf_index] = + av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height, gf_index, + &bottom_index, &top_index); + + cm->current_frame.frame_type = INTER_FRAME; + } + + int pframe_qindex; + int tpl_gf_group_frames; + init_gop_frames_for_tpl(cpi, frame_params, gf_group, gop_eval, + &tpl_gf_group_frames, frame_input, &pframe_qindex); + + cpi->rc.base_layer_qp = pframe_qindex; + + init_tpl_stats(tpl_data); + + // Backward propagation from tpl_group_frames to 1. 
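+  // Two passes follow: the forward loop below runs mc_flow_dispenser() on
+  // each frame in the TPL group to collect per-block intra/inter costs, and
+  // the reverse loop after it runs mc_flow_synthesizer() to propagate each
+  // block's dependency cost back into the frames it references.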
+ for (int frame_idx = gf_group->index; frame_idx < tpl_gf_group_frames; + ++frame_idx) { + if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE || + gf_group->update_type[frame_idx] == OVERLAY_UPDATE) + continue; + + mc_flow_dispenser(cpi, frame_idx, pframe_qindex); + + aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture, + av1_num_planes(cm)); + } + + for (int frame_idx = tpl_gf_group_frames - 1; frame_idx >= gf_group->index; + --frame_idx) { + if (gf_group->update_type[frame_idx] == INTNL_OVERLAY_UPDATE || + gf_group->update_type[frame_idx] == OVERLAY_UPDATE) + continue; + + mc_flow_synthesizer(cpi, frame_idx); + } + + av1_configure_buffer_updates(cpi, &this_frame_params, + gf_group->update_type[gf_group->index], 0); + cm->current_frame.frame_type = frame_params->frame_type; + cm->show_frame = frame_params->show_frame; + + if (cpi->common.tiles.large_scale) return 0; + if (gf_group->max_layer_depth_allowed == 0) return 1; + + double beta[2] = { 0.0 }; + for (int frame_idx = 1; frame_idx <= AOMMIN(tpl_gf_group_frames - 1, 2); + ++frame_idx) { + TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t mc_dep_cost_base = 0; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + for (int row = 0; row < cm->mi_params.mi_rows; row += step) { + for (int col = 0; col < mi_cols_sr; col += step) { + TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost_base += (this_stats->recrf_dist << RDDIV_BITS); + mc_dep_cost_base += + (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; + } + } + beta[frame_idx - 1] = (double)mc_dep_cost_base / intra_cost_base; + } + + // Allow larger GOP size if the base layer ARF has higher dependency factor + // than the intermediate ARF and both ARFs have reasonably high dependency + // factors. + return (beta[0] >= beta[1] + 0.7) && beta[0] > 3.0; +} + +void av1_tpl_rdmult_setup(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + const GF_GROUP *const gf_group = &cpi->gf_group; + const int tpl_idx = gf_group->index; + + assert(IMPLIES(gf_group->size > 0, tpl_idx < gf_group->size)); + + TplParams *const tpl_data = &cpi->tpl_data; + const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx]; + + if (!tpl_frame->is_valid) return; + if (cpi->superres_mode != SUPERRES_NONE) return; + + const TplDepStats *const tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + const int mi_cols_sr = av1_pixels_to_mi(cm->superres_upscaled_width); + + const int block_size = BLOCK_16X16; + const int num_mi_w = mi_size_wide[block_size]; + const int num_mi_h = mi_size_high[block_size]; + const int num_cols = (mi_cols_sr + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const double c = 1.2; + const int step = 1 << tpl_data->tpl_stats_block_mis_log2; + + aom_clear_system_state(); + + // Loop through each 'block_size' X 'block_size' block. + for (int row = 0; row < num_rows; row++) { + for (int col = 0; col < num_cols; col++) { + double intra_cost = 0.0, mc_dep_cost = 0.0; + // Loop through each mi block. 
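+      // The inner loops accumulate, for this 16x16 region, the recon
+      // distortion and the same distortion plus its propagated dependency
+      // cost; their ratio rk (below) is mapped to a per-block rdmult scale
+      // via rk / r0 + c.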
+ for (int mi_row = row * num_mi_h; mi_row < (row + 1) * num_mi_h; + mi_row += step) { + for (int mi_col = col * num_mi_w; mi_col < (col + 1) * num_mi_w; + mi_col += step) { + if (mi_row >= cm->mi_params.mi_rows || mi_col >= mi_cols_sr) continue; + const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos( + mi_row, mi_col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)]; + int64_t mc_dep_delta = + RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate, + this_stats->mc_dep_dist); + intra_cost += (double)(this_stats->recrf_dist << RDDIV_BITS); + mc_dep_cost += + (double)(this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta; + } + } + const double rk = intra_cost / mc_dep_cost; + const int index = row * num_cols + col; + cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c; + } + } + aom_clear_system_state(); +} + +void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x, + BLOCK_SIZE sb_size, int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + assert(IMPLIES(cpi->gf_group.size > 0, + cpi->gf_group.index < cpi->gf_group.size)); + const int tpl_idx = cpi->gf_group.index; + TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx]; + + if (tpl_frame->is_valid == 0) return; + if (!is_frame_tpl_eligible(cpi)) return; + if (tpl_idx >= MAX_LAG_BUFFERS) return; + if (cpi->superres_mode != SUPERRES_NONE) return; + if (cpi->oxcf.aq_mode != NO_AQ) return; + + const int bsize_base = BLOCK_16X16; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[sb_size] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[sb_size] + num_mi_h - 1) / num_mi_h; + int row, col; + + double base_block_count = 0.0; + double log_sum = 0.0; + + aom_clear_system_state(); + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + log_sum += log(cpi->tpl_rdmult_scaling_factors[index]); + base_block_count += 1.0; + } + } + + MACROBLOCKD *const xd = &x->e_mbd; + const CommonQuantParams *quant_params = &cm->quant_params; + const int orig_rdmult = av1_compute_rd_mult( + cpi, quant_params->base_qindex + quant_params->y_dc_delta_q); + const int new_rdmult = + av1_compute_rd_mult(cpi, quant_params->base_qindex + xd->delta_qindex + + quant_params->y_dc_delta_q); + const double scaling_factor = (double)new_rdmult / (double)orig_rdmult; + + double scale_adj = log(scaling_factor) - log_sum / base_block_count; + scale_adj = exp(scale_adj); + + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + cpi->tpl_sb_rdmult_scaling_factors[index] = + scale_adj * cpi->tpl_rdmult_scaling_factors[index]; + } + } + aom_clear_system_state(); +} diff --git a/libs/libaom/src/av1/encoder/tpl_model.h b/libs/libaom/src/av1/encoder/tpl_model.h new file mode 100644 index 000000000..11a61b649 --- /dev/null +++ b/libs/libaom/src/av1/encoder/tpl_model.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TPL_MODEL_H_ +#define AOM_AV1_ENCODER_TPL_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE BLOCK_SIZE convert_length_to_bsize(int length) { + switch (length) { + case 64: return BLOCK_64X64; + case 32: return BLOCK_32X32; + case 16: return BLOCK_16X16; + case 8: return BLOCK_8X8; + case 4: return BLOCK_4X4; + default: + assert(0 && "Invalid block size for tpl model"); + return BLOCK_16X16; + } +} + +int av1_tpl_setup_stats(AV1_COMP *cpi, int gop_eval, + const EncodeFrameParams *const frame_params, + const EncodeFrameInput *const frame_input); + +int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift); + +void av1_tpl_rdmult_setup(AV1_COMP *cpi); + +void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x, + BLOCK_SIZE sb_size, int mi_row, int mi_col); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TPL_MODEL_H_ diff --git a/libs/libaom/src/av1/encoder/tune_vmaf.c b/libs/libaom/src/av1/encoder/tune_vmaf.c new file mode 100644 index 000000000..997f78e27 --- /dev/null +++ b/libs/libaom/src/av1/encoder/tune_vmaf.c @@ -0,0 +1,794 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/encoder/tune_vmaf.h" + +#include "aom_dsp/psnr.h" +#include "aom_dsp/vmaf.h" +#include "aom_ports/system_state.h" +#include "av1/encoder/extend.h" +#include "av1/encoder/rdopt.h" + +static const double kBaselineVmaf = 97.42773; + +// TODO(sdeng): Add the SIMD implementation. 
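+// The helpers below apply a plain unsharp mask,
+//   dst = clamp(src + amount * (src - blurred)),
+// i.e. the high-frequency residual (source minus its Gaussian blur) is
+// scaled by `amount` and added back to sharpen the frame.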
+static AOM_INLINE void highbd_unsharp_rect(const uint16_t *source,
+                                           int source_stride,
+                                           const uint16_t *blurred,
+                                           int blurred_stride, uint16_t *dst,
+                                           int dst_stride, int w, int h,
+                                           double amount, int bit_depth) {
+  const int max_value = (1 << bit_depth) - 1;
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; ++j) {
+      const double val =
+          (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+      dst[j] = (uint16_t)clamp((int)(val + 0.5), 0, max_value);
+    }
+    source += source_stride;
+    blurred += blurred_stride;
+    dst += dst_stride;
+  }
+}
+
+static AOM_INLINE void unsharp_rect(const uint8_t *source, int source_stride,
+                                    const uint8_t *blurred, int blurred_stride,
+                                    uint8_t *dst, int dst_stride, int w, int h,
+                                    double amount) {
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < w; ++j) {
+      const double val =
+          (double)source[j] + amount * ((double)source[j] - (double)blurred[j]);
+      dst[j] = (uint8_t)clamp((int)(val + 0.5), 0, 255);
+    }
+    source += source_stride;
+    blurred += blurred_stride;
+    dst += dst_stride;
+  }
+}
+
+static AOM_INLINE void unsharp(const AV1_COMP *const cpi,
+                               const YV12_BUFFER_CONFIG *source,
+                               const YV12_BUFFER_CONFIG *blurred,
+                               const YV12_BUFFER_CONFIG *dst, double amount) {
+  const int bit_depth = cpi->td.mb.e_mbd.bd;
+  if (bit_depth > 8) {
+    highbd_unsharp_rect(CONVERT_TO_SHORTPTR(source->y_buffer), source->y_stride,
+                        CONVERT_TO_SHORTPTR(blurred->y_buffer),
+                        blurred->y_stride, CONVERT_TO_SHORTPTR(dst->y_buffer),
+                        dst->y_stride, source->y_width, source->y_height,
+                        amount, bit_depth);
+  } else {
+    unsharp_rect(source->y_buffer, source->y_stride, blurred->y_buffer,
+                 blurred->y_stride, dst->y_buffer, dst->y_stride,
+                 source->y_width, source->y_height, amount);
+  }
+}
+
+// 8-tap Gaussian convolution filter with sigma = 1.0, sums to 128;
+// all coefficients must be even.
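+// (Check: 0 + 8 + 30 + 52 + 30 + 8 + 0 + 0 = 128, and every tap is even.)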
+DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0, 8, 30, 52, + 30, 8, 0, 0 }; +static AOM_INLINE void gaussian_blur(const int bit_depth, + const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dst) { + const int block_size = BLOCK_128X128; + const int block_w = mi_size_wide[block_size] * 4; + const int block_h = mi_size_high[block_size] * 4; + const int num_cols = (source->y_width + block_w - 1) / block_w; + const int num_rows = (source->y_height + block_h - 1) / block_h; + int row, col; + + ConvolveParams conv_params = get_conv_params(0, 0, bit_depth); + InterpFilterParams filter = { .filter_ptr = gauss_filter, + .taps = 8, + .subpel_shifts = 0, + .interp_filter = EIGHTTAP_REGULAR }; + + for (row = 0; row < num_rows; ++row) { + for (col = 0; col < num_cols; ++col) { + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + + uint8_t *src_buf = + source->y_buffer + row_offset_y * source->y_stride + col_offset_y; + uint8_t *dst_buf = + dst->y_buffer + row_offset_y * dst->y_stride + col_offset_y; + + if (bit_depth > 8) { + av1_highbd_convolve_2d_sr( + CONVERT_TO_SHORTPTR(src_buf), source->y_stride, + CONVERT_TO_SHORTPTR(dst_buf), dst->y_stride, block_w, block_h, + &filter, &filter, 0, 0, &conv_params, bit_depth); + } else { + av1_convolve_2d_sr(src_buf, source->y_stride, dst_buf, dst->y_stride, + block_w, block_h, &filter, &filter, 0, 0, + &conv_params); + } + } + } +} + +static double frame_average_variance(const AV1_COMP *const cpi, + const YV12_BUFFER_CONFIG *const frame) { + const uint8_t *const y_buffer = frame->y_buffer; + const int y_stride = frame->y_stride; + const BLOCK_SIZE block_size = BLOCK_64X64; + + const int block_w = mi_size_wide[block_size] * 4; + const int block_h = mi_size_high[block_size] * 4; + int row, col; + const int bit_depth = cpi->td.mb.e_mbd.bd; + double var = 0.0, var_count = 0.0; + + // Loop through each block. 
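+  // Only whole 64x64 blocks are measured; a partial block at the right or
+  // bottom edge is skipped by the truncating division in the loop bounds.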
+ for (row = 0; row < frame->y_height / block_h; ++row) { + for (col = 0; col < frame->y_width / block_w; ++col) { + struct buf_2d buf; + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + + buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y; + buf.stride = y_stride; + + if (bit_depth > 8) { + var += av1_high_get_sby_perpixel_variance(cpi, &buf, block_size, + bit_depth); + } else { + var += av1_get_sby_perpixel_variance(cpi, &buf, block_size); + } + var_count += 1.0; + } + } + var /= var_count; + return var; +} + +static double cal_approx_vmaf(const AV1_COMP *const cpi, double source_variance, + YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const sharpened) { + const int bit_depth = cpi->td.mb.e_mbd.bd; + double new_vmaf; + aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, sharpened, bit_depth, + &new_vmaf); + const double sharpened_var = frame_average_variance(cpi, sharpened); + return source_variance / sharpened_var * (new_vmaf - kBaselineVmaf); +} + +static double find_best_frame_unsharp_amount_loop( + const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened, + double best_vmaf, const double baseline_variance, + const double unsharp_amount_start, const double step_size, + const int max_loop_count, const double max_amount) { + const double min_amount = 0.0; + int loop_count = 0; + double approx_vmaf = best_vmaf; + double unsharp_amount = unsharp_amount_start; + do { + best_vmaf = approx_vmaf; + unsharp_amount += step_size; + if (unsharp_amount > max_amount || unsharp_amount < min_amount) break; + unsharp(cpi, source, blurred, sharpened, unsharp_amount); + approx_vmaf = cal_approx_vmaf(cpi, baseline_variance, source, sharpened); + + loop_count++; + } while (approx_vmaf > best_vmaf && loop_count < max_loop_count); + unsharp_amount = + approx_vmaf > best_vmaf ? 
unsharp_amount : unsharp_amount - step_size; + return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount)); +} + +static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source, + YV12_BUFFER_CONFIG *const blurred, + const double unsharp_amount_start, + const double step_size, + const int max_loop_count, + const double max_filter_amount) { + const AV1_COMMON *const cm = &cpi->common; + const int width = source->y_width; + const int height = source->y_height; + + YV12_BUFFER_CONFIG sharpened; + memset(&sharpened, 0, sizeof(sharpened)); + aom_alloc_frame_buffer( + &sharpened, width, height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + + const double baseline_variance = frame_average_variance(cpi, source); + double unsharp_amount; + if (unsharp_amount_start <= step_size) { + unsharp_amount = find_best_frame_unsharp_amount_loop( + cpi, source, blurred, &sharpened, 0.0, baseline_variance, 0.0, + step_size, max_loop_count, max_filter_amount); + } else { + double a0 = unsharp_amount_start - step_size, a1 = unsharp_amount_start; + double v0, v1; + unsharp(cpi, source, blurred, &sharpened, a0); + v0 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); + unsharp(cpi, source, blurred, &sharpened, a1); + v1 = cal_approx_vmaf(cpi, baseline_variance, source, &sharpened); + if (fabs(v0 - v1) < 0.01) { + unsharp_amount = a0; + } else if (v0 > v1) { + unsharp_amount = find_best_frame_unsharp_amount_loop( + cpi, source, blurred, &sharpened, v0, baseline_variance, a0, + -step_size, max_loop_count, max_filter_amount); + } else { + unsharp_amount = find_best_frame_unsharp_amount_loop( + cpi, source, blurred, &sharpened, v1, baseline_variance, a1, + step_size, max_loop_count, max_filter_amount); + } + } + + aom_free_frame_buffer(&sharpened); + return unsharp_amount; +} + +void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source) { + aom_clear_system_state(); + const AV1_COMMON *const cm = &cpi->common; + const int bit_depth = cpi->td.mb.e_mbd.bd; + const int width = source->y_width; + const int height = source->y_height; + + YV12_BUFFER_CONFIG source_extended, blurred; + memset(&source_extended, 0, sizeof(source_extended)); + memset(&blurred, 0, sizeof(blurred)); + aom_alloc_frame_buffer( + &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer( + &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + + av1_copy_and_extend_frame(source, &source_extended); + gaussian_blur(bit_depth, &source_extended, &blurred); + aom_free_frame_buffer(&source_extended); + + const double best_frame_unsharp_amount = find_best_frame_unsharp_amount( + cpi, source, &blurred, cpi->last_frame_unsharp_amount, 0.05, 20, 1.01); + cpi->last_frame_unsharp_amount = best_frame_unsharp_amount; + + unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount); + aom_free_frame_buffer(&blurred); + aom_clear_system_state(); +} + +void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, + YV12_BUFFER_CONFIG *const source) { + aom_clear_system_state(); + const AV1_COMMON *const cm = &cpi->common; + const int width = source->y_width; + const int height = source->y_height; + const int bit_depth = cpi->td.mb.e_mbd.bd; + + YV12_BUFFER_CONFIG source_extended, blurred; + memset(&blurred, 0, sizeof(blurred)); + memset(&source_extended, 0, 
sizeof(source_extended)); + aom_alloc_frame_buffer( + &blurred, width, height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer( + &source_extended, width, height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + + av1_copy_and_extend_frame(source, &source_extended); + gaussian_blur(bit_depth, &source_extended, &blurred); + aom_free_frame_buffer(&source_extended); + + const double best_frame_unsharp_amount = find_best_frame_unsharp_amount( + cpi, source, &blurred, cpi->last_frame_unsharp_amount, 0.05, 20, 1.01); + cpi->last_frame_unsharp_amount = best_frame_unsharp_amount; + + const int block_size = BLOCK_64X64; + const int block_w = mi_size_wide[block_size] * 4; + const int block_h = mi_size_high[block_size] * 4; + const int num_cols = (source->y_width + block_w - 1) / block_w; + const int num_rows = (source->y_height + block_h - 1) / block_h; + double *best_unsharp_amounts = + aom_malloc(sizeof(*best_unsharp_amounts) * num_cols * num_rows); + memset(best_unsharp_amounts, 0, + sizeof(*best_unsharp_amounts) * num_cols * num_rows); + + YV12_BUFFER_CONFIG source_block, blurred_block; + memset(&source_block, 0, sizeof(source_block)); + memset(&blurred_block, 0, sizeof(blurred_block)); + aom_alloc_frame_buffer( + &source_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer( + &blurred_block, block_w, block_h, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + const int block_width = AOMMIN(width - col_offset_y, block_w); + const int block_height = AOMMIN(height - row_offset_y, block_h); + const int index = col + row * num_cols; + + if (bit_depth > 8) { + uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + + row_offset_y * source->y_stride + + col_offset_y; + uint16_t *frame_blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) + + row_offset_y * blurred.y_stride + + col_offset_y; + uint16_t *blurred_dst = CONVERT_TO_SHORTPTR(blurred_block.y_buffer); + uint16_t *src_dst = CONVERT_TO_SHORTPTR(source_block.y_buffer); + + // Copy block from source frame. + for (int i = 0; i < block_h; ++i) { + for (int j = 0; j < block_w; ++j) { + if (i >= block_height || j >= block_width) { + src_dst[j] = 0; + blurred_dst[j] = 0; + } else { + src_dst[j] = frame_src_buf[j]; + blurred_dst[j] = frame_blurred_buf[j]; + } + } + frame_src_buf += source->y_stride; + frame_blurred_buf += blurred.y_stride; + src_dst += source_block.y_stride; + blurred_dst += blurred_block.y_stride; + } + } else { + uint8_t *frame_src_buf = + source->y_buffer + row_offset_y * source->y_stride + col_offset_y; + uint8_t *frame_blurred_buf = + blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; + uint8_t *blurred_dst = blurred_block.y_buffer; + uint8_t *src_dst = source_block.y_buffer; + + // Copy block from source frame. 
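+        // Positions past block_width x block_height (blocks clipped at the
+        // frame edge) are zero-filled so the temporary buffers always hold a
+        // full block_w x block_h block.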
+ for (int i = 0; i < block_h; ++i) { + for (int j = 0; j < block_w; ++j) { + if (i >= block_height || j >= block_width) { + src_dst[j] = 0; + blurred_dst[j] = 0; + } else { + src_dst[j] = frame_src_buf[j]; + blurred_dst[j] = frame_blurred_buf[j]; + } + } + frame_src_buf += source->y_stride; + frame_blurred_buf += blurred.y_stride; + src_dst += source_block.y_stride; + blurred_dst += blurred_block.y_stride; + } + } + + best_unsharp_amounts[index] = find_best_frame_unsharp_amount( + cpi, &source_block, &blurred_block, best_frame_unsharp_amount, 0.1, 3, + 1.5); + } + } + + // Apply the best unsharp amounts. + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int row_offset_y = row * block_h; + const int col_offset_y = col * block_w; + const int block_width = AOMMIN(source->y_width - col_offset_y, block_w); + const int block_height = AOMMIN(source->y_height - row_offset_y, block_h); + const int index = col + row * num_cols; + + if (bit_depth > 8) { + uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) + + row_offset_y * source->y_stride + col_offset_y; + uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred.y_buffer) + + row_offset_y * blurred.y_stride + col_offset_y; + highbd_unsharp_rect(src_buf, source->y_stride, blurred_buf, + blurred.y_stride, src_buf, source->y_stride, + block_width, block_height, + best_unsharp_amounts[index], bit_depth); + } else { + uint8_t *src_buf = + source->y_buffer + row_offset_y * source->y_stride + col_offset_y; + uint8_t *blurred_buf = + blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; + unsharp_rect(src_buf, source->y_stride, blurred_buf, blurred.y_stride, + src_buf, source->y_stride, block_width, block_height, + best_unsharp_amounts[index]); + } + } + } + + aom_free_frame_buffer(&source_block); + aom_free_frame_buffer(&blurred_block); + aom_free_frame_buffer(&blurred); + aom_free(best_unsharp_amounts); + aom_clear_system_state(); +} + +typedef struct FrameData { + const YV12_BUFFER_CONFIG *source, *blurred; + int block_w, block_h, num_rows, num_cols, row, col, bit_depth; +} FrameData; + +// A callback function used to pass data to VMAF. +// Returns 0 after reading a frame. +// Returns 2 when there is no more frame to read.
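// Editor's note (illustrative sketch, not part of the patch): a driver such
// as aom_calc_vmaf_multi_frame() is expected to call this callback once per
// "frame" until it returns 2. Conceptually (hypothetical driver loop):
//
//   while (update_frame(ref_plane, main_plane, tmp_plane, stride_bytes,
//                       &frame_data) == 0) {
//     // Score one (reference, distorted) pair. Here every pair is the full
//     // source frame with exactly one block swapped for its blurred copy,
//     // so each call yields one per-block VMAF score in raster order.
//   }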
+static int update_frame(float *ref_data, float *main_data, float *temp_data, + int stride, void *user_data) { + FrameData *frames = (FrameData *)user_data; + const int width = frames->source->y_width; + const int height = frames->source->y_height; + const int row = frames->row; + const int col = frames->col; + const int num_rows = frames->num_rows; + const int num_cols = frames->num_cols; + const int block_w = frames->block_w; + const int block_h = frames->block_h; + const YV12_BUFFER_CONFIG *source = frames->source; + const YV12_BUFFER_CONFIG *blurred = frames->blurred; + const int bit_depth = frames->bit_depth; + const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8)); + (void)temp_data; + stride /= (int)sizeof(*ref_data); + + for (int i = 0; i < height; ++i) { + float *ref, *main; + ref = ref_data + i * stride; + main = main_data + i * stride; + if (bit_depth == 8) { + uint8_t *src; + src = source->y_buffer + i * source->y_stride; + for (int j = 0; j < width; ++j) { + ref[j] = main[j] = (float)src[j]; + } + } else { + uint16_t *src; + src = CONVERT_TO_SHORTPTR(source->y_buffer) + i * source->y_stride; + for (int j = 0; j < width; ++j) { + ref[j] = main[j] = scale_factor * (float)src[j]; + } + } + } + if (row < num_rows && col < num_cols) { + // Set current block + const int row_offset = row * block_h; + const int col_offset = col * block_w; + const int block_width = AOMMIN(width - col_offset, block_w); + const int block_height = AOMMIN(height - row_offset, block_h); + + float *main_buf = main_data + col_offset + row_offset * stride; + if (bit_depth == 8) { + uint8_t *blurred_buf = + blurred->y_buffer + row_offset * blurred->y_stride + col_offset; + for (int i = 0; i < block_height; ++i) { + for (int j = 0; j < block_width; ++j) { + main_buf[j] = (float)blurred_buf[j]; + } + main_buf += stride; + blurred_buf += blurred->y_stride; + } + } else { + uint16_t *blurred_buf = CONVERT_TO_SHORTPTR(blurred->y_buffer) + + row_offset * blurred->y_stride + col_offset; + for (int i = 0; i < block_height; ++i) { + for (int j = 0; j < block_width; ++j) { + main_buf[j] = scale_factor * (float)blurred_buf[j]; + } + main_buf += stride; + blurred_buf += blurred->y_stride; + } + } + + frames->col++; + if (frames->col >= num_cols) { + frames->col = 0; + frames->row++; + } + return 0; + } else { + return 2; + } +} + +void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + const int y_width = cpi->source->y_width; + const int y_height = cpi->source->y_height; + const int resized_block_size = BLOCK_32X32; + const int resize_factor = 2; + const int bit_depth = cpi->td.mb.e_mbd.bd; + + aom_clear_system_state(); + YV12_BUFFER_CONFIG resized_source; + memset(&resized_source, 0, sizeof(resized_source)); + aom_alloc_frame_buffer( + &resized_source, y_width / resize_factor, y_height / resize_factor, 1, 1, + cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + av1_resize_and_extend_frame(cpi->source, &resized_source, bit_depth, + av1_num_planes(cm)); + + const int resized_y_width = resized_source.y_width; + const int resized_y_height = resized_source.y_height; + const int resized_block_w = mi_size_wide[resized_block_size] * 4; + const int resized_block_h = mi_size_high[resized_block_size] * 4; + const int num_cols = + (resized_y_width + resized_block_w - 1) / resized_block_w; + const int num_rows = + (resized_y_height + resized_block_h - 1) / resized_block_h; + + YV12_BUFFER_CONFIG blurred; + memset(&blurred, 0, sizeof(blurred)); + 
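// Editor's note (illustrative aside, not part of the patch): the remainder
// of this routine (continued below) blurs the 2x-downscaled source, gathers
// one VMAF score per resized 32x32 block through update_frame(), and then
// maps each block's (MSE, VMAF drop) pair to an rdmult scaling factor. With
// dvmaf = kBaselineVmaf - vmaf, the fitted model below amounts to:
//
//   weight = 6.0 * (1.0 - exp(-0.05 * mse / dvmaf)) + 0.8;
//
// so a block whose distortion buys little VMAF (large mse/dvmaf) saturates
// toward 6.8 and is charged more per bit, while a VMAF-sensitive block stays
// near 0.8 and attracts more bits. av1_set_vmaf_rdmult() later folds the
// factors under a superblock into one multiplier through a geometric mean,
// effectively rdmult *= exp(mean(log(factor_i))).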
aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, 1, 1, + cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, + cm->features.byte_alignment); + gaussian_blur(bit_depth, &resized_source, &blurred); + + double *scores = aom_malloc(sizeof(*scores) * (num_rows * num_cols)); + memset(scores, 0, sizeof(*scores) * (num_rows * num_cols)); + FrameData frame_data; + frame_data.source = &resized_source; + frame_data.blurred = &blurred; + frame_data.block_w = resized_block_w; + frame_data.block_h = resized_block_h; + frame_data.num_rows = num_rows; + frame_data.num_cols = num_cols; + frame_data.row = 0; + frame_data.col = 0; + frame_data.bit_depth = bit_depth; + aom_calc_vmaf_multi_frame(&frame_data, cpi->oxcf.vmaf_model_path, + update_frame, resized_y_width, resized_y_height, + bit_depth, scores); + + // Loop through each 'block_size' block. + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + const int row_offset_y = row * resized_block_h; + const int col_offset_y = col * resized_block_w; + + uint8_t *const orig_buf = resized_source.y_buffer + + row_offset_y * resized_source.y_stride + + col_offset_y; + uint8_t *const blurred_buf = + blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y; + + const double vmaf = scores[index]; + const double dvmaf = kBaselineVmaf - vmaf; + unsigned int sse; + cpi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride, + blurred_buf, blurred.y_stride, &sse); + + const double mse = + (double)sse / (double)(resized_y_width * resized_y_height); + double weight; + const double eps = 0.01 / (num_rows * num_cols); + if (dvmaf < eps || mse < eps) { + weight = 1.0; + } else { + weight = mse / dvmaf; + } + + // Normalize it with a data fitted model. + weight = 6.0 * (1.0 - exp(-0.05 * weight)) + 0.8; + cpi->vmaf_rdmult_scaling_factors[index] = weight; + } + } + + aom_free_frame_buffer(&resized_source); + aom_free_frame_buffer(&blurred); + aom_free(scores); + aom_clear_system_state(); +} + +void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x, + const BLOCK_SIZE bsize, const int mi_row, + const int mi_col, int *const rdmult) { + const AV1_COMMON *const cm = &cpi->common; + + const int bsize_base = BLOCK_64X64; + const int num_mi_w = mi_size_wide[bsize_base]; + const int num_mi_h = mi_size_high[bsize_base]; + const int num_cols = (cm->mi_params.mi_cols + num_mi_w - 1) / num_mi_w; + const int num_rows = (cm->mi_params.mi_rows + num_mi_h - 1) / num_mi_h; + const int num_bcols = (mi_size_wide[bsize] + num_mi_w - 1) / num_mi_w; + const int num_brows = (mi_size_high[bsize] + num_mi_h - 1) / num_mi_h; + int row, col; + double num_of_mi = 0.0; + double geom_mean_of_scale = 0.0; + + aom_clear_system_state(); + for (row = mi_row / num_mi_w; + row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) { + for (col = mi_col / num_mi_h; + col < num_cols && col < mi_col / num_mi_h + num_bcols; ++col) { + const int index = row * num_cols + col; + geom_mean_of_scale += log(cpi->vmaf_rdmult_scaling_factors[index]); + num_of_mi += 1.0; + } + } + geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi); + + *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5); + *rdmult = AOMMAX(*rdmult, 0); + set_error_per_bit(x, *rdmult); + aom_clear_system_state(); +} + +// TODO(sdeng): replace them with the SIMD versions. 
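// Editor's note (illustrative aside, not part of the patch): the two helpers
// below compute a plain per-pixel mean absolute difference,
//
//   sad(a, b) = (1.0 / (w * h)) * sum over i,j of |a[i][j] - b[i][j]|,
//
// which calc_vmaf_motion_score() evaluates on Gaussian-blurred planes as a
// cheap stand-in for VMAF's motion feature; the returned score is
// min(sad(cur, last), sad(cur, next)).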
+static AOM_INLINE double highbd_image_sad_c(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h) { + double accum = 0.0; + int i, j; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + double img1px = src[i * src_stride + j]; + double img2px = ref[i * ref_stride + j]; + + accum += fabs(img1px - img2px); + } + } + + return accum / (double)(h * w); +} + +static AOM_INLINE double image_sad_c(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int w, + int h) { + double accum = 0.0; + int i, j; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + double img1px = src[i * src_stride + j]; + double img2px = ref[i * ref_stride + j]; + + accum += fabs(img1px - img2px); + } + } + + return accum / (double)(h * w); +} + +static AOM_INLINE double calc_vmaf_motion_score( + const AV1_COMP *const cpi, const AV1_COMMON *const cm, + const YV12_BUFFER_CONFIG *const cur, const YV12_BUFFER_CONFIG *const last, + const YV12_BUFFER_CONFIG *const next) { + const int y_width = cur->y_width; + const int y_height = cur->y_height; + YV12_BUFFER_CONFIG blurred_cur, blurred_last, blurred_next; + const int bit_depth = cpi->td.mb.e_mbd.bd; + + memset(&blurred_cur, 0, sizeof(blurred_cur)); + memset(&blurred_last, 0, sizeof(blurred_last)); + memset(&blurred_next, 0, sizeof(blurred_next)); + + aom_alloc_frame_buffer( + &blurred_cur, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer( + &blurred_last, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + aom_alloc_frame_buffer( + &blurred_next, y_width, y_height, 1, 1, cm->seq_params.use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment); + + gaussian_blur(bit_depth, cur, &blurred_cur); + gaussian_blur(bit_depth, last, &blurred_last); + if (next) gaussian_blur(bit_depth, next, &blurred_next); + + double motion1, motion2 = 65536.0; + if (bit_depth > 8) { + const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8)); + motion1 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer), + blurred_cur.y_stride, + CONVERT_TO_SHORTPTR(blurred_last.y_buffer), + blurred_last.y_stride, y_width, y_height) * + scale_factor; + if (next) { + motion2 = highbd_image_sad_c(CONVERT_TO_SHORTPTR(blurred_cur.y_buffer), + blurred_cur.y_stride, + CONVERT_TO_SHORTPTR(blurred_next.y_buffer), + blurred_next.y_stride, y_width, y_height) * + scale_factor; + } + } else { + motion1 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride, + blurred_last.y_buffer, blurred_last.y_stride, y_width, + y_height); + if (next) { + motion2 = image_sad_c(blurred_cur.y_buffer, blurred_cur.y_stride, + blurred_next.y_buffer, blurred_next.y_stride, + y_width, y_height); + } + } + + aom_free_frame_buffer(&blurred_cur); + aom_free_frame_buffer(&blurred_last); + aom_free_frame_buffer(&blurred_next); + + return AOMMIN(motion1, motion2); +} + +// Calculates the new qindex from the VMAF motion score. This is based on the +// observation: when the motion score becomes higher, the VMAF score of the +// same source and distorted frames would become higher. 
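// Editor's note (illustrative aside, not part of the patch): the routine
// below converts the motion score into a qindex offset through a data-fitted
// model. With approx_sse and approx_dvmaf carried over from the previously
// coded frame, the chain is roughly:
//
//   dvmaf = 26.11 * (1.0 - exp(-0.06 * motion));  // predicted VMAF drop
//   dsse  = dvmaf * approx_sse / approx_dvmaf;    // drop expressed as SSE
//   beta  = approx_sse / (dsse + approx_sse);     // rdmult-style ratio < 1
//   qindex = clamp(current_qindex +
//                  av1_get_deltaq_offset(cpi, current_qindex, beta),
//                  MINQ, MAXQ);
//
// High-motion frames predict a larger tolerable VMAF drop, giving a smaller
// beta and a higher qindex, i.e. fewer bits where VMAF is lenient.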
+int av1_get_vmaf_base_qindex(const AV1_COMP *const cpi, int current_qindex) { + const AV1_COMMON *const cm = &cpi->common; + if (cm->current_frame.frame_number == 0 || cpi->oxcf.pass == 1) { + return current_qindex; + } + const int bit_depth = cpi->td.mb.e_mbd.bd; + const double approx_sse = + cpi->last_frame_ysse / + (double)((1 << (bit_depth - 8)) * (1 << (bit_depth - 8))); + const double approx_dvmaf = kBaselineVmaf - cpi->last_frame_vmaf; + const double sse_threshold = + 0.01 * cpi->source->y_width * cpi->source->y_height; + const double vmaf_threshold = 0.01; + if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) { + return current_qindex; + } + aom_clear_system_state(); + const GF_GROUP *gf_group = &cpi->gf_group; + YV12_BUFFER_CONFIG *cur_buf = cpi->source; + int src_index = 0; + if (cm->show_frame == 0) { + src_index = gf_group->arf_src_offset[gf_group->index]; + struct lookahead_entry *cur_entry = + av1_lookahead_peek(cpi->lookahead, src_index, cpi->compressor_stage); + cur_buf = &cur_entry->img; + } + assert(cur_buf); + + const struct lookahead_entry *last_entry = + av1_lookahead_peek(cpi->lookahead, src_index - 1, cpi->compressor_stage); + const struct lookahead_entry *next_entry = + av1_lookahead_peek(cpi->lookahead, src_index + 1, cpi->compressor_stage); + const YV12_BUFFER_CONFIG *next_buf = &next_entry->img; + const YV12_BUFFER_CONFIG *last_buf = + cm->show_frame ? cpi->last_source : &last_entry->img; + + assert(last_buf); + + const double motion = + calc_vmaf_motion_score(cpi, cm, cur_buf, last_buf, next_buf); + + // Get dVMAF through a data fitted model. + const double dvmaf = 26.11 * (1.0 - exp(-0.06 * motion)); + const double dsse = dvmaf * approx_sse / approx_dvmaf; + + const double beta = approx_sse / (dsse + approx_sse); + const int offset = av1_get_deltaq_offset(cpi, current_qindex, beta); + int qindex = current_qindex + offset; + + qindex = AOMMIN(qindex, MAXQ); + qindex = AOMMAX(qindex, MINQ); + + aom_clear_system_state(); + return qindex; +} + +void av1_update_vmaf_curve(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *recon) { + const int bit_depth = cpi->td.mb.e_mbd.bd; + aom_calc_vmaf(cpi->oxcf.vmaf_model_path, source, recon, bit_depth, + &cpi->last_frame_vmaf); + if (bit_depth > 8) { + cpi->last_frame_ysse = (double)aom_highbd_get_y_sse(source, recon); + } else { + cpi->last_frame_ysse = (double)aom_get_y_sse(source, recon); + } +} diff --git a/libs/libaom/src/av1/encoder/tune_vmaf.h b/libs/libaom/src/av1/encoder/tune_vmaf.h new file mode 100644 index 000000000..c4cf07224 --- /dev/null +++ b/libs/libaom/src/av1/encoder/tune_vmaf.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_TUNE_VMAF_H_ +#define AOM_AV1_ENCODER_TUNE_VMAF_H_ + +#include "aom_scale/yv12config.h" +#include "av1/encoder/encoder.h" + +void av1_vmaf_blk_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source); + +void av1_vmaf_frame_preprocessing(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source); + +void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi); + +void av1_set_vmaf_rdmult(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, int *rdmult); + +int av1_get_vmaf_base_qindex(const AV1_COMP *cpi, int current_qindex); + +void av1_update_vmaf_curve(AV1_COMP *cpi, YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *recon); + +#endif // AOM_AV1_ENCODER_TUNE_VMAF_H_ diff --git a/libs/libaom/src/av1/encoder/tx_prune_model_weights.h b/libs/libaom/src/av1/encoder/tx_prune_model_weights.h new file mode 100644 index 000000000..76efe9382 --- /dev/null +++ b/libs/libaom/src/av1/encoder/tx_prune_model_weights.h @@ -0,0 +1,3320 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ +#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +/***************************CONFIG_NN_V2 (New)********************************/ +#if CONFIG_NN_V2 +// Tx type model for 4x4 block. 
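// Editor's note (illustrative sketch, not part of the patch): each
// NN_CONFIG_V2 below parameterizes a tiny two-layer MLP that scores the
// candidate transform types; the weights are consumed by the helpers in
// av1/encoder/ml.h. For the 4x4 models, the forward pass is conceptually:
//
//   float h[8], logits[4];
//   // layer 0: 4 inputs -> 8 hidden units, RELU activation
//   for (int o = 0; o < 8; ++o) {
//     float s = layer0_bias[o];
//     for (int i = 0; i < 4; ++i) s += layer0_weights[o * 4 + i] * x[i];
//     h[o] = s > 0.0f ? s : 0.0f;
//   }
//   // layer 1: 8 hidden -> 4 logits, activation NONE
//   for (int o = 0; o < 4; ++o) {
//     float s = layer1_bias[o];
//     for (int i = 0; i < 8; ++i) s += layer1_weights[o * 8 + i] * h[i];
//     logits[o] = s;
//   }
//
// The weight-layout indexing shown here is an assumption for illustration;
// per the SOFTMAX_CROSS_ENTROPY field, the logits are softmax-normalized
// into per-tx-type probabilities used for pruning.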
+static float av1_tx_type_nn_4x4_hor_layer0_weights[32] = { + -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f, + 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f, + -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f, + 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f, + 1.35792f, 0.27733f, 0.88660f, -0.68304f, +}; + +static float av1_tx_type_nn_4x4_hor_layer0_bias[8] = { + 1.38742f, 0.59540f, -1.37622f, 1.92114f, + 0.00000f, -0.38998f, -0.32726f, -0.15650f, +}; + +static float av1_tx_type_nn_4x4_hor_layer1_weights[32] = { + 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f, + -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f, + -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f, + 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f, + -0.26782f, -0.65416f, -0.10648f, 0.05568f, +}; + +static float av1_tx_type_nn_4x4_hor_layer1_bias[4] = { + 4.07177f, + 3.26961f, + 0.58083f, + 1.21199f, +}; + +static float av1_tx_type_nn_4x4_hor_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x4_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x4_hor_layer0_weights, // weights + av1_tx_type_nn_4x4_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x4_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x4_hor_layer1_weights, + av1_tx_type_nn_4x4_hor_layer1_bias, + NONE, + av1_tx_type_nn_4x4_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x4_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_4x4_ver_layer0_weights[32] = { + -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f, + 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f, + 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f, + 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f, + -0.06589f, -0.28142f, -0.33118f, 1.72227f, +}; + +static float av1_tx_type_nn_4x4_ver_layer0_bias[8] = { + -0.33685f, 0.22025f, 0.28140f, 0.56138f, + 0.93489f, -1.77048f, 1.34989f, -0.93747f, +}; + +static float av1_tx_type_nn_4x4_ver_layer1_weights[32] = { + -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f, + 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f, + -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f, + -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f, + -0.86315f, -0.53336f, 0.30320f, -1.32331f, +}; + +static float av1_tx_type_nn_4x4_ver_layer1_bias[4] = { + -1.31519f, + -3.26321f, + 1.71794f, + -1.90778f, +}; + +static float av1_tx_type_nn_4x4_ver_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x4_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x4_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x4_ver_layer0_weights, // weights + av1_tx_type_nn_4x4_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x4_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 
4, + av1_tx_type_nn_4x4_ver_layer1_weights, + av1_tx_type_nn_4x4_ver_layer1_bias, + NONE, + av1_tx_type_nn_4x4_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x4_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 4x8 block. +static float av1_tx_type_nn_4x8_hor_layer0_weights[32] = { + 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f, + 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f, + -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f, + -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f, + -1.35896f, -1.17121f, 1.68866f, 0.10357f, +}; + +static float av1_tx_type_nn_4x8_hor_layer0_bias[8] = { + 2.93391f, 0.66831f, -0.21419f, 0.00000f, + -0.72878f, 0.15127f, -1.46755f, 0.16658f, +}; + +static float av1_tx_type_nn_4x8_hor_layer1_weights[32] = { + -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f, + -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f, + 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f, + 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f, + -0.50191f, 0.18219f, 1.83664f, -0.75276f, +}; + +static float av1_tx_type_nn_4x8_hor_layer1_bias[4] = { + -1.17455f, + -2.26089f, + -1.79863f, + -2.26333f, +}; + +static float av1_tx_type_nn_4x8_hor_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x8_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x8_hor_layer0_weights, // weights + av1_tx_type_nn_4x8_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x8_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x8_hor_layer1_weights, + av1_tx_type_nn_4x8_hor_layer1_bias, + NONE, + av1_tx_type_nn_4x8_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x8_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_4x8_ver_layer0_weights[128] = { + -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f, + -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f, + -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f, + 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f, + 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f, + 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f, + -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f, + -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f, + 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f, + -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f, + -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f, + -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f, + 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f, + 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f, + -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f, + -0.09273f, 1.04249f, 0.79235f, 
1.13229f, 0.99617f, 0.03851f, 0.56334f, + 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f, + -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f, + -0.21958f, 0.05970f, +}; + +static float av1_tx_type_nn_4x8_ver_layer0_bias[16] = { + 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f, + 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f, + 0.08288f, 0.18195f, -0.79890f, 0.10047f, +}; + +static float av1_tx_type_nn_4x8_ver_layer1_weights[64] = { + -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f, + -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f, + -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f, + -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f, + 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f, + 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f, + -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f, + -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f, + -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f, + -1.01848f, +}; + +static float av1_tx_type_nn_4x8_ver_layer1_bias[4] = { + -1.45955f, + -2.08949f, + -1.24813f, + -1.55368f, +}; + +static float av1_tx_type_nn_4x8_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_4x8_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x8_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_4x8_ver_layer0_weights, // weights + av1_tx_type_nn_4x8_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x8_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x8_ver_layer1_weights, + av1_tx_type_nn_4x8_ver_layer1_bias, + NONE, + av1_tx_type_nn_4x8_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x8_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +/******************************************************************************/ + +// Tx type model for 8x4 block. 
+static float av1_tx_type_nn_8x4_hor_layer0_weights[128] = { + -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f, + 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f, + -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f, + -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f, + -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f, + 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f, + 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f, + -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f, + -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f, + 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f, + 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f, + -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f, + -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f, + 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f, + 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f, + 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f, + -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f, + -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f, + -1.85523f, 0.92532f, +}; + +static float av1_tx_type_nn_8x4_hor_layer0_bias[16] = { + 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f, + -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f, + -0.28958f, -0.32869f, -0.01704f, 0.68171f, +}; + +static float av1_tx_type_nn_8x4_hor_layer1_weights[64] = { + -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f, + -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f, + 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f, + -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f, + 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f, + -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f, + -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f, + 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f, + 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f, + -1.10654f, +}; + +static float av1_tx_type_nn_8x4_hor_layer1_bias[4] = { + -0.92861f, + -1.45151f, + -1.33588f, + -4.33853f, +}; + +static float av1_tx_type_nn_8x4_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x4_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x4_hor_layer0_weights, // weights + av1_tx_type_nn_8x4_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x4_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x4_hor_layer1_weights, + av1_tx_type_nn_8x4_hor_layer1_bias, + NONE, + av1_tx_type_nn_8x4_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x4_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_8x4_ver_layer0_weights[32] = { + -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f, + -1.94208f, 
-2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f, + -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f, + -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f, + 1.66212f, 1.70826f, 1.55182f, 0.12230f, +}; + +static float av1_tx_type_nn_8x4_ver_layer0_bias[8] = { + 0.10943f, 2.09789f, 2.16578f, 0.15766f, + -0.42461f, 0.00000f, 1.22090f, -1.28717f, +}; + +static float av1_tx_type_nn_8x4_ver_layer1_weights[32] = { + 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f, + 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f, + 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f, + -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f, + -1.15005f, -0.39311f, 1.51236f, -1.68973f, +}; + +static float av1_tx_type_nn_8x4_ver_layer1_bias[4] = { + 1.81013f, + 1.10517f, + 2.90059f, + 0.95391f, +}; + +static float av1_tx_type_nn_8x4_ver_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_8x4_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x4_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_8x4_ver_layer0_weights, // weights + av1_tx_type_nn_8x4_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x4_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x4_ver_layer1_weights, + av1_tx_type_nn_8x4_ver_layer1_bias, + NONE, + av1_tx_type_nn_8x4_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x4_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 8x8 block. 
+static float av1_tx_type_nn_8x8_hor_layer0_weights[128] = { + -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f, + -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f, + 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f, + 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f, + -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f, + -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f, + -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f, + 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f, + 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f, + -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f, + 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f, + -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f, + 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f, + 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f, + 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f, + 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f, + 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f, + 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f, + -0.99892f, 1.09823f, +}; + +static float av1_tx_type_nn_8x8_hor_layer0_bias[16] = { + -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f, + -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f, + -0.26319f, 2.65579f, -1.30137f, -0.01487f, +}; + +static float av1_tx_type_nn_8x8_hor_layer1_weights[64] = { + -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f, + -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f, + 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f, + 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f, + 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f, + -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f, + 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f, + 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f, + 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f, + 0.06161f, +}; + +static float av1_tx_type_nn_8x8_hor_layer1_bias[4] = { + 1.70385f, + 1.82373f, + 1.78496f, + 1.80826f, +}; + +static float av1_tx_type_nn_8x8_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x8_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x8_hor_layer0_weights, // weights + av1_tx_type_nn_8x8_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x8_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x8_hor_layer1_weights, + av1_tx_type_nn_8x8_hor_layer1_bias, + NONE, + av1_tx_type_nn_8x8_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x8_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_8x8_ver_layer0_weights[128] = { + -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f, + 2.09681f, 
-0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f, + -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f, + -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f, + 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f, + 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f, + 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f, + -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f, + -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f, + 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f, + 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f, + -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f, + 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f, + 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f, + -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f, + 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f, + -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f, + -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f, + -1.29848f, 0.39308f, +}; + +static float av1_tx_type_nn_8x8_ver_layer0_bias[16] = { + -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f, + 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f, + 0.83015f, 0.06024f, 1.17180f, 0.65122f, +}; + +static float av1_tx_type_nn_8x8_ver_layer1_weights[64] = { + -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f, + 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f, + 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f, + 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f, + 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f, + 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f, + 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f, + 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f, + -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f, + -0.41305f, +}; + +static float av1_tx_type_nn_8x8_ver_layer1_bias[4] = { + 2.14067f, + 2.76699f, + 2.04233f, + 1.34803f, +}; + +static float av1_tx_type_nn_8x8_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x8_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x8_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x8_ver_layer0_weights, // weights + av1_tx_type_nn_8x8_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x8_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x8_ver_layer1_weights, + av1_tx_type_nn_8x8_ver_layer1_bias, + NONE, + av1_tx_type_nn_8x8_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x8_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 8x16 block. 
+static float av1_tx_type_nn_8x16_hor_layer0_weights[128] = { + -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f, + 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f, + -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f, + 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f, + -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f, + 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f, + -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f, + 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f, + -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f, + -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f, + 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f, + 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f, + -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f, + 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f, + -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f, + 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f, + 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f, + -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f, + -0.28136f, 0.42556f, +}; + +static float av1_tx_type_nn_8x16_hor_layer0_bias[16] = { + 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f, + -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f, + 1.81560f, -1.02643f, -0.81690f, 0.08302f, +}; + +static float av1_tx_type_nn_8x16_hor_layer1_weights[64] = { + 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f, + -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f, + 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f, + -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f, + 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f, + 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f, + 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f, + 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f, + 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f, + -1.31243f, +}; + +static float av1_tx_type_nn_8x16_hor_layer1_bias[4] = { + 0.83359f, + 1.06875f, + 1.77645f, + 1.49570f, +}; + +static float av1_tx_type_nn_8x16_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x16_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x16_hor_layer0_weights, // weights + av1_tx_type_nn_8x16_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x16_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x16_hor_layer1_weights, + av1_tx_type_nn_8x16_hor_layer1_bias, + NONE, + av1_tx_type_nn_8x16_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x16_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_8x16_ver_layer0_weights[128] = { + 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f, + -0.05725f, -0.05659f, 
0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f, + -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f, + 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f, + -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f, + 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f, + 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f, + 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f, + -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f, + -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f, + 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f, + 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f, + -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f, + -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f, + -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f, + -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f, + -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f, + 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f, + -0.12236f, 0.16075f, +}; + +static float av1_tx_type_nn_8x16_ver_layer0_bias[16] = { + -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f, + -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f, + 0.57598f, 0.99819f, 0.75175f, 0.17044f, +}; + +static float av1_tx_type_nn_8x16_ver_layer1_weights[64] = { + -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f, + 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f, + -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f, + 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f, + -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f, + -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f, + -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f, + 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f, + 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f, + 2.20547f, +}; + +static float av1_tx_type_nn_8x16_ver_layer1_bias[4] = { + -0.44080f, + -1.67455f, + -1.46332f, + -6.13206f, +}; + +static float av1_tx_type_nn_8x16_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_8x16_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_8x16_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_8x16_ver_layer0_weights, // weights + av1_tx_type_nn_8x16_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_8x16_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_8x16_ver_layer1_weights, + av1_tx_type_nn_8x16_ver_layer1_bias, + NONE, + av1_tx_type_nn_8x16_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_8x16_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 16x8 block. 
+static float av1_tx_type_nn_16x8_hor_layer0_weights[128] = { + 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f, + -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f, + -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f, + 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f, + 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f, + 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f, + 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f, + -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f, + -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f, + -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f, + 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f, + -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f, + -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f, + -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f, + 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f, + -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f, + -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f, + 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f, + -0.36570f, -0.50757f, +}; + +static float av1_tx_type_nn_16x8_hor_layer0_bias[16] = { + -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f, + 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f, + -0.12329f, 0.08986f, 1.08117f, -0.00220f, +}; + +static float av1_tx_type_nn_16x8_hor_layer1_weights[64] = { + 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f, + 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f, + -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f, + -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f, + -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f, + -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f, + 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f, + 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f, + 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f, + -0.23347f, +}; + +static float av1_tx_type_nn_16x8_hor_layer1_bias[4] = { + 3.57175f, + 2.42612f, + 3.31259f, + 2.08287f, +}; + +static float av1_tx_type_nn_16x8_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x8_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x8_hor_layer0_weights, // weights + av1_tx_type_nn_16x8_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x8_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x8_hor_layer1_weights, + av1_tx_type_nn_16x8_hor_layer1_bias, + NONE, + av1_tx_type_nn_16x8_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x8_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_16x8_ver_layer0_weights[128] = { + 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f, + 
0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f, + -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f, + 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f, + 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f, + -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f, + 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f, + -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f, + 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f, + 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f, + 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f, + -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f, + -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f, + -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f, + 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f, + 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f, + -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f, + -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f, + -0.81945f, -0.41647f, +}; + +static float av1_tx_type_nn_16x8_ver_layer0_bias[16] = { + 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f, + 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f, + -0.04510f, 0.48000f, -0.09354f, -0.42422f, +}; + +static float av1_tx_type_nn_16x8_ver_layer1_weights[64] = { + 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f, + -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f, + 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f, + -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f, + -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f, + 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f, + 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f, + -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f, + 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f, + -0.00873f, +}; + +static float av1_tx_type_nn_16x8_ver_layer1_bias[4] = { + 3.34981f, + 3.74710f, + 1.38339f, + 0.45176f, +}; + +static float av1_tx_type_nn_16x8_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x8_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x8_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x8_ver_layer0_weights, // weights + av1_tx_type_nn_16x8_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x8_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x8_ver_layer1_weights, + av1_tx_type_nn_16x8_ver_layer1_bias, + NONE, + av1_tx_type_nn_16x8_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x8_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 16x16 block. 
+static float av1_tx_type_nn_16x16_layer0_weights[128] = { + 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f, + 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f, + -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f, + -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f, + 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f, + 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f, + 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f, + 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f, + -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f, + 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f, + 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f, + 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f, + -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f, + 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f, + 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f, + -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f, + -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f, + 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f, + 0.50355f, 0.08592f, +}; + +static float av1_tx_type_nn_16x16_layer0_bias[16] = { + -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f, + -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f, + -0.14062f, -0.42120f, 0.94573f, -0.09287f, +}; + +static float av1_tx_type_nn_16x16_layer1_weights[64] = { + -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f, + 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f, + 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f, + 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f, + 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f, + 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f, + -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f, + 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f, + -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f, + 1.08829f, +}; + +static float av1_tx_type_nn_16x16_layer1_bias[4] = { + 0.81986f, + 1.26865f, + 0.11118f, + 2.48404f, +}; + +static float av1_tx_type_nn_16x16_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x16_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x16 = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x16_layer0_weights, // weights + av1_tx_type_nn_16x16_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x16_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x16_layer1_weights, + av1_tx_type_nn_16x16_layer1_bias, + NONE, + av1_tx_type_nn_16x16_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x16_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 4x16 block. 
+static float av1_tx_type_nn_4x16_hor_layer0_weights[32] = { + 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f, + 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f, + 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f, + 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f, + -1.74563f, -0.88830f, -1.77603f, 2.15935f, +}; + +static float av1_tx_type_nn_4x16_hor_layer0_bias[8] = { + -0.36435f, -2.22731f, -0.00837f, -1.34546f, + 0.62806f, -0.20675f, 4.91940f, -0.56079f, +}; + +static float av1_tx_type_nn_4x16_hor_layer1_weights[32] = { + -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f, + -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f, + 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f, + 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f, + 1.28413f, -0.30326f, 2.45329f, -0.83335f, +}; + +static float av1_tx_type_nn_4x16_hor_layer1_bias[4] = { + 2.33198f, + 3.36245f, + 1.62603f, + 2.91056f, +}; + +static float av1_tx_type_nn_4x16_hor_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_4x16_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_4x16_hor_layer0_weights, // weights + av1_tx_type_nn_4x16_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x16_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x16_hor_layer1_weights, + av1_tx_type_nn_4x16_hor_layer1_bias, + NONE, + av1_tx_type_nn_4x16_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x16_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_4x16_ver_layer0_weights[128] = { + 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f, + 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f, + -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f, + -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f, + -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f, + -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f, + 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f, + 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f, + 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f, + -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f, + -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f, + 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f, + 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f, + 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f, + 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f, + -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f, + 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f, + 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f, + -0.27975f, -0.01149f, +}; + +static float av1_tx_type_nn_4x16_ver_layer0_bias[16] = { + -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f, + -0.55822f, 1.04845f, -0.17662f, 
-1.25345f, -0.11927f, 0.49845f, + -0.32530f, 0.73483f, 0.08322f, -0.23890f, +}; + +static float av1_tx_type_nn_4x16_ver_layer1_weights[64] = { + 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f, + -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f, + 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f, + -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f, + 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f, + -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f, + 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f, + 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f, + -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f, + -0.56513f, +}; + +static float av1_tx_type_nn_4x16_ver_layer1_bias[4] = { + 4.60896f, + 4.53551f, + 4.53124f, + 4.27435f, +}; + +static float av1_tx_type_nn_4x16_ver_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_4x16_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_4x16_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_4x16_ver_layer0_weights, // weights + av1_tx_type_nn_4x16_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_4x16_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_4x16_ver_layer1_weights, + av1_tx_type_nn_4x16_ver_layer1_bias, + NONE, + av1_tx_type_nn_4x16_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_4x16_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Tx type model for 16x4 block. 
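+// The V2 configs end in SOFTMAX_CROSS_ENTROPY, so the four logits left in a
+// *_layer1_out buffer become tx type probabilities through a softmax. A
+// hedged sketch (max-subtracted for numerical stability; expf() comes from
+// <math.h>; this is not the library routine):
+static void softmax4_sketch(const float *logits, float *prob) {
+  float max_val = logits[0];
+  for (int i = 1; i < 4; ++i)
+    if (logits[i] > max_val) max_val = logits[i];
+  float sum = 0.0f;
+  for (int i = 0; i < 4; ++i) {
+    prob[i] = expf(logits[i] - max_val);
+    sum += prob[i];
+  }
+  for (int i = 0; i < 4; ++i) prob[i] /= sum;
+}
+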
+static float av1_tx_type_nn_16x4_hor_layer0_weights[128] = { + 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f, + 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f, + -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f, + -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f, + -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f, + -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f, + 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f, + 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f, + 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f, + -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f, + 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f, + -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f, + 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f, + -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f, + -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f, + -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f, + 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f, + 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f, + 0.19055f, -1.56413f, +}; + +static float av1_tx_type_nn_16x4_hor_layer0_bias[16] = { + -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f, + 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f, + 1.14048f, 0.33308f, -1.10886f, 0.41184f, +}; + +static float av1_tx_type_nn_16x4_hor_layer1_weights[64] = { + -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f, + 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f, + -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f, + -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f, + 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f, + -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f, + -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f, + 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f, + 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f, + -0.43819f, +}; + +static float av1_tx_type_nn_16x4_hor_layer1_bias[4] = { + 2.32575f, + 2.75703f, + 1.12304f, + 2.15567f, +}; + +static float av1_tx_type_nn_16x4_hor_layer0_out[16] = { 0 }; +static float av1_tx_type_nn_16x4_hor_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_hor = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 8, // num_inputs + 16, // num_outputs + av1_tx_type_nn_16x4_hor_layer0_weights, // weights + av1_tx_type_nn_16x4_hor_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x4_hor_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 16, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x4_hor_layer1_weights, + av1_tx_type_nn_16x4_hor_layer1_bias, + NONE, + av1_tx_type_nn_16x4_hor_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x4_hor_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; + +static float av1_tx_type_nn_16x4_ver_layer0_weights[32] = { + 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f, + 0.46375f, 1.47951f, 
1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f, + -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f, + -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f, + -0.17967f, -0.96622f, 0.42635f, -1.04784f, +}; + +static float av1_tx_type_nn_16x4_ver_layer0_bias[8] = { + -0.52088f, 0.52844f, -1.03655f, -0.30974f, + 2.59952f, -1.93604f, 0.00000f, 2.51787f, +}; + +static float av1_tx_type_nn_16x4_ver_layer1_weights[32] = { + 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f, + 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f, + 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f, + -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f, + 1.26814f, -1.93873f, -0.00768f, 1.58309f, +}; + +static float av1_tx_type_nn_16x4_ver_layer1_bias[4] = { + 2.34713f, + 1.68667f, + 1.25488f, + 1.69812f, +}; + +static float av1_tx_type_nn_16x4_ver_layer0_out[8] = { 0 }; +static float av1_tx_type_nn_16x4_ver_layer1_out[4] = { 0 }; + +static NN_CONFIG_V2 av1_tx_type_nnconfig_16x4_ver = { + 1, // num_hidden_layers + { + // fc layer setting + { + // layer 0 + 4, // num_inputs + 8, // num_outputs + av1_tx_type_nn_16x4_ver_layer0_weights, // weights + av1_tx_type_nn_16x4_ver_layer0_bias, // bias + RELU, // activation + av1_tx_type_nn_16x4_ver_layer0_out, // output + NULL, + NULL, + NULL, + }, + { + 8, // num_inputs (!!same as num_outputs of last layer) + 4, + av1_tx_type_nn_16x4_ver_layer1_weights, + av1_tx_type_nn_16x4_ver_layer1_bias, + NONE, + av1_tx_type_nn_16x4_ver_layer1_out, + NULL, + NULL, + NULL, + }, + }, + 4, // num_outputs + av1_tx_type_nn_16x4_ver_layer1_out, // logits (!!same as last layer output) + SOFTMAX_CROSS_ENTROPY, +}; +/******************************************************************************/ + +// Map tx_size to its corresponding neural net model for tx type prediction. 
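+// The two maps below are indexed directly by the TX_SIZE enum (the per-entry
+// comments follow that order), so a lookup is simply
+//   NN_CONFIG_V2 *model = av1_tx_type_nnconfig_map_hor[tx_size];
+// A NULL entry means no model was trained for that size, so NN-based tx type
+// pruning is not applied there.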
+static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_hor[] = { + &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform + &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform + &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform + &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform + &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform + &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; + +static NN_CONFIG_V2 *av1_tx_type_nnconfig_map_ver[] = { + &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform + &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform + &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform + &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform + &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform + &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; +#else +/******************************CONFIG_NN***************************************/ +// Tx type model for 4x4 block. +static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = { + -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f, + 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f, + -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f, + 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f, + 1.35792f, 0.27733f, 0.88660f, -0.68304f, +}; + +static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = { + 1.38742f, 0.59540f, -1.37622f, 1.92114f, + 0.00000f, -0.38998f, -0.32726f, -0.15650f, +}; + +static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = { + 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f, + -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f, + -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f, + 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f, + -0.26782f, -0.65416f, -0.10648f, 0.05568f, +}; + +static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = { + 4.07177f, + 3.26961f, + 0.58083f, + 1.21199f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x4_hor_layer0, + av1_tx_type_nn_weights_4x4_hor_layer1 }, + { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = { + -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f, + 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f, + 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f, + 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f, + -0.06589f, -0.28142f, -0.33118f, 1.72227f, +}; + 
+static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = { + -0.33685f, 0.22025f, 0.28140f, 0.56138f, + 0.93489f, -1.77048f, 1.34989f, -0.93747f, +}; + +static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = { + -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f, + 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f, + -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f, + -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f, + -0.86315f, -0.53336f, 0.30320f, -1.32331f, +}; + +static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = { + -1.31519f, + -3.26321f, + 1.71794f, + -1.90778f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x4_ver_layer0, + av1_tx_type_nn_weights_4x4_ver_layer1 }, + { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 4x8 block. +static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = { + 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f, + 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f, + -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f, + -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f, + -1.35896f, -1.17121f, 1.68866f, 0.10357f, +}; + +static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = { + 2.93391f, 0.66831f, -0.21419f, 0.00000f, + -0.72878f, 0.15127f, -1.46755f, 0.16658f, +}; + +static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = { + -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f, + -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f, + 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f, + 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f, + -0.50191f, 0.18219f, 1.83664f, -0.75276f, +}; + +static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = { + -1.17455f, + -2.26089f, + -1.79863f, + -2.26333f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x8_hor_layer0, + av1_tx_type_nn_weights_4x8_hor_layer1 }, + { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = { + -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f, + -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f, + -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f, + 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f, + 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f, + 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f, + -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f, + -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f, + 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f, + -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f, + -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f, + -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f, + 0.42090f, 
0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f, + 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f, + -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f, + -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f, + 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f, + -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f, + -0.21958f, 0.05970f, +}; + +static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = { + 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f, + 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f, + 0.08288f, 0.18195f, -0.79890f, 0.10047f, +}; + +static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = { + -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f, + -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f, + -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f, + -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f, + 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f, + 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f, + -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f, + -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f, + -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f, + -1.01848f, +}; + +static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = { + -1.45955f, + -2.08949f, + -1.24813f, + -1.55368f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x8_ver_layer0, + av1_tx_type_nn_weights_4x8_ver_layer1 }, + { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 8x4 block. 
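+// In this CONFIG_NN branch the models use the flat NN_CONFIG layout: one
+// weights/bias pointer pair per layer plus num_hidden_nodes[]. The tables
+// are consumed by the encoder's generic NN predictor; the sketch below
+// mirrors that computation for the single-hidden-layer configs in this file,
+// under the assumption of a row-major [node][input] weight layout (RELU on
+// the hidden layer, linear output logits):
+static void nn_predict_2layer_sketch(const float *input, const NN_CONFIG *cfg,
+                                     float *output) {
+  float hidden[128];  // assumed upper bound on hidden nodes for these models
+  const int nh = cfg->num_hidden_nodes[0];
+  for (int i = 0; i < nh; ++i) {
+    float acc = cfg->bias[0][i];
+    for (int j = 0; j < cfg->num_inputs; ++j)
+      acc += cfg->weights[0][i * cfg->num_inputs + j] * input[j];
+    hidden[i] = acc > 0.0f ? acc : 0.0f;
+  }
+  for (int i = 0; i < cfg->num_outputs; ++i) {
+    float acc = cfg->bias[1][i];
+    for (int j = 0; j < nh; ++j)
+      acc += cfg->weights[1][i * nh + j] * hidden[j];
+    output[i] = acc;
+  }
+}
+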
+static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = { + -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f, + 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f, + -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f, + -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f, + -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f, + 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f, + 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f, + -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f, + -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f, + 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f, + 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f, + -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f, + -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f, + 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f, + 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f, + 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f, + -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f, + -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f, + -1.85523f, 0.92532f, +}; + +static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = { + 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f, + -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f, + -0.28958f, -0.32869f, -0.01704f, 0.68171f, +}; + +static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = { + -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f, + -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f, + 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f, + -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f, + 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f, + -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f, + -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f, + 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f, + 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f, + -1.10654f, +}; + +static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = { + -0.92861f, + -1.45151f, + -1.33588f, + -4.33853f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x4_hor_layer0, + av1_tx_type_nn_weights_8x4_hor_layer1 }, + { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = { + -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f, + -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f, + -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f, + -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f, + 1.66212f, 1.70826f, 1.55182f, 0.12230f, +}; + +static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = { + 0.10943f, 2.09789f, 2.16578f, 0.15766f, + -0.42461f, 0.00000f, 1.22090f, -1.28717f, +}; + +static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = { + 
1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f, + 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f, + 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f, + -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f, + -1.15005f, -0.39311f, 1.51236f, -1.68973f, +}; + +static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = { + 1.81013f, + 1.10517f, + 2.90059f, + 0.95391f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x4_ver_layer0, + av1_tx_type_nn_weights_8x4_ver_layer1 }, + { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 8x8 block. +static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = { + -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f, + -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f, + 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f, + 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f, + -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f, + -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f, + -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f, + 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f, + 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f, + -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f, + 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f, + -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f, + 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f, + 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f, + 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f, + 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f, + 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f, + 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f, + -0.99892f, 1.09823f, +}; + +static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = { + -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f, + -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f, + -0.26319f, 2.65579f, -1.30137f, -0.01487f, +}; + +static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = { + -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f, + -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f, + 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f, + 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f, + 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f, + -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f, + 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f, + 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f, + 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f, + 0.06161f, +}; + +static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = { + 1.70385f, + 1.82373f, + 1.78496f, + 1.80826f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = { + 8, // 
num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x8_hor_layer0, + av1_tx_type_nn_weights_8x8_hor_layer1 }, + { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = { + -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f, + 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f, + -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f, + -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f, + 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f, + 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f, + 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f, + -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f, + -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f, + 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f, + 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f, + -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f, + 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f, + 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f, + -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f, + 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f, + -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f, + -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f, + -1.29848f, 0.39308f, +}; + +static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = { + -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f, + 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f, + 0.83015f, 0.06024f, 1.17180f, 0.65122f, +}; + +static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = { + -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f, + 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f, + 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f, + 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f, + 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f, + 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f, + 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f, + 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f, + -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f, + -0.41305f, +}; + +static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = { + 2.14067f, + 2.76699f, + 2.04233f, + 1.34803f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x8_ver_layer0, + av1_tx_type_nn_weights_8x8_ver_layer1 }, + { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 8x16 block. 
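+// Each hor/ver pair scores the four 1D transform kernels along its own
+// direction. Assuming the outputs correspond to the AV1 1D kernels (DCT,
+// ADST, FLIPADST, IDTX) and the two directions are treated as independent,
+// 2D tx type scores can be formed as a product of the per-direction
+// probabilities; a hedged sketch of that combination:
+static void combine_hv_probs_sketch(const float *prob_hor,
+                                    const float *prob_ver, float score[16]) {
+  for (int v = 0; v < 4; ++v)
+    for (int h = 0; h < 4; ++h)
+      score[v * 4 + h] = prob_ver[v] * prob_hor[h];
+}
+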
+static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = { + -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f, + 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f, + -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f, + 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f, + -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f, + 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f, + -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f, + 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f, + -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f, + -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f, + 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f, + 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f, + -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f, + 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f, + -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f, + 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f, + 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f, + -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f, + -0.28136f, 0.42556f, +}; + +static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = { + 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f, + -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f, + 1.81560f, -1.02643f, -0.81690f, 0.08302f, +}; + +static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = { + 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f, + -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f, + 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f, + -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f, + 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f, + 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f, + 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f, + 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f, + 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f, + -1.31243f, +}; + +static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = { + 0.83359f, + 1.06875f, + 1.77645f, + 1.49570f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x16_hor_layer0, + av1_tx_type_nn_weights_8x16_hor_layer1 }, + { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = { + 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f, + -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f, + -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f, + 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f, + -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f, + 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f, + 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f, + 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 
0.17726f, + -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f, + -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f, + 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f, + 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f, + -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f, + -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f, + -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f, + -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f, + -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f, + 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f, + -0.12236f, 0.16075f, +}; + +static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = { + -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f, + -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f, + 0.57598f, 0.99819f, 0.75175f, 0.17044f, +}; + +static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = { + -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f, + 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f, + -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f, + 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f, + -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f, + -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f, + -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f, + 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f, + 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f, + 2.20547f, +}; + +static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = { + -0.44080f, + -1.67455f, + -1.46332f, + -6.13206f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_8x16_ver_layer0, + av1_tx_type_nn_weights_8x16_ver_layer1 }, + { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 16x8 block. 
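+// Size bookkeeping for these tables: a layer0 weight array holds
+// num_inputs * num_hidden_nodes entries (8 * 16 = 128 for the larger models,
+// 4 * 8 = 32 for the 4-input ones) and a layer1 array holds
+// num_hidden_nodes * num_outputs entries (16 * 4 = 64 or 8 * 4 = 32).
+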
+static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = { + 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f, + -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f, + -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f, + 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f, + 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f, + 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f, + 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f, + -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f, + -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f, + -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f, + 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f, + -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f, + -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f, + -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f, + 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f, + -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f, + -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f, + 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f, + -0.36570f, -0.50757f, +}; + +static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = { + -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f, + 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f, + -0.12329f, 0.08986f, 1.08117f, -0.00220f, +}; + +static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = { + 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f, + 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f, + -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f, + -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f, + -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f, + -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f, + 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f, + 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f, + 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f, + -0.23347f, +}; + +static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = { + 3.57175f, + 2.42612f, + 3.31259f, + 2.08287f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x8_hor_layer0, + av1_tx_type_nn_weights_16x8_hor_layer1 }, + { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = { + 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f, + 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f, + -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f, + 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f, + 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f, + -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f, + 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f, + -0.62756f, -0.22502f, -0.17215f, 0.01062f, 
0.27049f, -0.10748f, 0.30945f, + 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f, + 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f, + 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f, + -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f, + -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f, + -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f, + 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f, + 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f, + -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f, + -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f, + -0.81945f, -0.41647f, +}; + +static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = { + 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f, + 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f, + -0.04510f, 0.48000f, -0.09354f, -0.42422f, +}; + +static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = { + 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f, + -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f, + 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f, + -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f, + -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f, + 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f, + 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f, + -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f, + 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f, + -0.00873f, +}; + +static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = { + 3.34981f, + 3.74710f, + 1.38339f, + 0.45176f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x8_ver_layer0, + av1_tx_type_nn_weights_16x8_ver_layer1 }, + { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 16x16 block. 
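+// The square 16x16 block gets a single model rather than hor/ver variants;
+// both direction maps reference this one config.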
+static const float av1_tx_type_nn_weights_16x16_layer0[128] = { + 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f, + 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f, + -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f, + -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f, + 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f, + 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f, + 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f, + 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f, + -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f, + 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f, + 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f, + 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f, + -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f, + 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f, + 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f, + -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f, + -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f, + 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f, + 0.50355f, 0.08592f, +}; + +static const float av1_tx_type_nn_bias_16x16_layer0[16] = { + -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f, + -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f, + -0.14062f, -0.42120f, 0.94573f, -0.09287f, +}; + +static const float av1_tx_type_nn_weights_16x16_layer1[64] = { + -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f, + 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f, + 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f, + 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f, + 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f, + 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f, + -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f, + 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f, + -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f, + 1.08829f, +}; + +static const float av1_tx_type_nn_bias_16x16_layer1[4] = { + 0.81986f, + 1.26865f, + 0.11118f, + 2.48404f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x16 = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_type_nn_weights_16x16_layer0, + av1_tx_type_nn_weights_16x16_layer1, + }, + { + av1_tx_type_nn_bias_16x16_layer0, + av1_tx_type_nn_bias_16x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx type model for 4x16 block. 
+static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = { + 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f, + 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f, + 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f, + 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f, + -1.74563f, -0.88830f, -1.77603f, 2.15935f, +}; + +static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = { + -0.36435f, -2.22731f, -0.00837f, -1.34546f, + 0.62806f, -0.20675f, 4.91940f, -0.56079f, +}; + +static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = { + -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f, + -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f, + 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f, + 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f, + 1.28413f, -0.30326f, 2.45329f, -0.83335f, +}; + +static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = { + 2.33198f, + 3.36245f, + 1.62603f, + 2.91056f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x16_hor_layer0, + av1_tx_type_nn_weights_4x16_hor_layer1 }, + { av1_tx_type_nn_bias_4x16_hor_layer0, av1_tx_type_nn_bias_4x16_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = { + 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f, + 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f, + -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f, + -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f, + -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f, + -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f, + 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f, + 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f, + 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f, + -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f, + -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f, + 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f, + 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f, + 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f, + 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f, + -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f, + 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f, + 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f, + -0.27975f, -0.01149f, +}; + +static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = { + -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f, + -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f, + -0.32530f, 0.73483f, 0.08322f, -0.23890f, +}; + +static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = { + 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f, + -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f, + 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f, + -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f, + 0.39077f, -0.75448f, 0.31698f, 
-0.76187f, 0.97765f, 0.57052f, 0.55825f, + -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f, + 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f, + 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f, + -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f, + -0.56513f, +}; + +static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = { + 4.60896f, + 4.53551f, + 4.53124f, + 4.27435f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = { + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_4x16_ver_layer0, + av1_tx_type_nn_weights_4x16_ver_layer1 }, + { av1_tx_type_nn_bias_4x16_ver_layer0, av1_tx_type_nn_bias_4x16_ver_layer1 } +}; +/******************************************************************************/ + +// Tx type model for 16x4 block. +static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = { + 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f, + 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f, + -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f, + -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f, + -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f, + -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f, + 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f, + 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f, + 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f, + -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f, + 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f, + -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f, + 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f, + -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f, + -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f, + -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f, + 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f, + 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f, + 0.19055f, -1.56413f, +}; + +static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = { + -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f, + 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f, + 1.14048f, 0.33308f, -1.10886f, 0.41184f, +}; + +static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = { + -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f, + 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f, + -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f, + -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f, + 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f, + -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f, + -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f, + 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f, + 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f, + -0.43819f, +}; + +static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = { + 2.32575f, + 2.75703f, + 1.12304f, + 2.15567f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = 
{ + 8, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x4_hor_layer0, + av1_tx_type_nn_weights_16x4_hor_layer1 }, + { av1_tx_type_nn_bias_16x4_hor_layer0, av1_tx_type_nn_bias_16x4_hor_layer1 } +}; + +static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = { + 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f, + 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f, + -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f, + -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f, + -0.17967f, -0.96622f, 0.42635f, -1.04784f, +}; + +static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = { + -0.52088f, 0.52844f, -1.03655f, -0.30974f, + 2.59952f, -1.93604f, 0.00000f, 2.51787f, +}; + +static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = { + 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f, + 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f, + 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f, + -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f, + 1.26814f, -1.93873f, -0.00768f, 1.58309f, +}; + +static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = { + 2.34713f, + 1.68667f, + 1.25488f, + 1.69812f, +}; + +static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = { + 4, // num_inputs + 4, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { av1_tx_type_nn_weights_16x4_ver_layer0, + av1_tx_type_nn_weights_16x4_ver_layer1 }, + { av1_tx_type_nn_bias_16x4_ver_layer0, av1_tx_type_nn_bias_16x4_ver_layer1 } +}; +/******************************************************************************/ + +// Map tx_size to its corresponding neural net model for tx type prediction. +static const NN_CONFIG *av1_tx_type_nnconfig_map_hor[] = { + &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform + &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform + &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform + &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform + &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform + &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; + +static const NN_CONFIG *av1_tx_type_nnconfig_map_ver[] = { + &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform + &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform + &av1_tx_type_nnconfig_16x16, // 16x16 transform + NULL, // 32x32 transform + NULL, // 64x64 transform + &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform + &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform + &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform + &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform + NULL, // 16x32 transform + NULL, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform + &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform + NULL, // 8x32 transform + NULL, // 32x8 transform + NULL, // 16x64 transform + NULL, // 64x16 transform +}; +#endif // CONFIG_NN_V2 + +// Tx split model for 4x8 block. 
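+// Unlike the four-output tx type models above, each tx split model below
+// produces a single logit scoring whether the block should be split into
+// smaller transform units. A hedged decision sketch (thresholding the raw
+// logit at zero, which is equivalent to sigmoid(logit) > 0.5; the encoder's
+// actual thresholds may differ):
+static int tx_split_decision_sketch(float logit) {
+  return logit > 0.0f;  // 1: try splitting, 0: keep the current tx size
+}
+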
+static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = { + 0.068650f, -0.732073f, -0.040361f, 0.322550f, -0.021123f, 0.212518f, + -0.350546f, 0.435987f, -0.111756f, -0.401568f, 0.069548f, -0.313000f, + 0.073918f, -0.373805f, -0.775810f, -0.124753f, 0.181094f, -0.602641f, + -0.026219f, -0.350112f, 0.020599f, -0.311752f, -0.476482f, -0.669465f, + -0.310921f, 0.348869f, -0.115984f, 0.154250f, 0.200485f, -0.016689f, + 0.020392f, 0.413810f, 0.634064f, -0.627530f, 0.399178f, -0.012284f, + 0.472030f, 0.091087f, -0.706100f, -0.447944f, -0.274226f, 0.445656f, + 0.309339f, 0.505522f, 0.038496f, -0.152809f, 0.408684f, -0.068151f, + 0.271612f, 0.353233f, -0.150365f, 0.075212f, -0.035096f, 0.346615f, + 0.124382f, 0.477072f, 0.216288f, 0.070548f, -0.106362f, 0.681613f, + -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f, + 0.063009f, -0.123053f, 0.104875f, -0.137581f, -0.282933f, -0.003624f, + -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f, + -0.670248f, -0.353762f, 0.181109f, 0.289715f, -0.071206f, 0.261141f, + 0.052796f, -0.114554f, -0.139214f, -0.261380f, 0.075984f, -0.647925f, + -0.099528f, -0.677814f, 0.015712f, -0.389385f, -0.095622f, -0.165117f, + -0.109454f, -0.175240f, -0.393914f, 0.212330f, 0.037822f, 0.248280f, + 0.180197f, 0.110493f, -0.525727f, -0.092329f, -0.524029f, -0.407364f, + -0.542373f, -0.435626f, -0.912194f, 0.062794f, 0.160433f, 0.741485f, + -0.103659f, -0.119327f, -0.055275f, 0.334358f, 0.014713f, 0.046327f, + 0.831114f, -0.576682f, 0.354369f, -0.082088f, 0.452331f, 0.039730f, + -0.792429f, -0.385862f, +}; + +static const float av1_tx_split_nn_bias_4x8_layer0[16] = { + 0.238621f, 2.186830f, 1.383035f, -0.867139f, 1.257119f, -0.351571f, + -0.240650f, -0.971692f, 2.744843f, 1.116991f, 0.139062f, -0.165332f, + 0.262171f, -1.598153f, -1.427340f, -1.602306f, +}; + +static const float av1_tx_split_nn_weights_4x8_layer1[16] = { + -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f, + -0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f, 0.278987f, + 0.085082f, 0.614986f, 0.847904f, 0.637578f, +}; + +static const float av1_tx_split_nn_bias_4x8_layer1[1] = { + 0.20586078f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_4x8 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_4x8_layer0, + av1_tx_split_nn_weights_4x8_layer1, + }, + { + av1_tx_split_nn_bias_4x8_layer0, + av1_tx_split_nn_bias_4x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x8 block. 
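+// Feature counts differ per split model: the 4x8 model above feeds 8 inputs
+// into 16 hidden nodes, while the 8x8 model below feeds 12 inputs into 12
+// hidden nodes (hence its 144 = 12 * 12 layer0 weights).
+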
+static const float av1_tx_split_nn_weights_8x8_layer0[144] = { + 0.177983f, -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f, + -0.098202f, -0.279510f, 0.001054f, -0.119319f, -1.835282f, -0.581507f, + -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f, + -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f, + 0.015331f, -0.341818f, 0.145549f, -0.348362f, 0.147647f, -0.323400f, + 0.047558f, -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f, + 0.447740f, 0.782381f, -0.179164f, -0.584675f, -0.052645f, 0.038656f, + -0.096783f, 0.038342f, -0.170762f, -0.405844f, -0.552665f, -0.509866f, + 0.757204f, -1.296465f, 0.631015f, 0.009265f, 0.646192f, 0.044523f, + 0.653161f, 0.033820f, 0.849639f, -0.068555f, -1.036085f, -0.511652f, + 0.104693f, -1.458690f, 0.286051f, -0.089800f, 0.381564f, -0.302640f, + 0.304465f, -0.268706f, 0.432603f, -0.117914f, -2.070031f, -0.565696f, + -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f, + -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f, + 0.101894f, -0.221847f, 0.018412f, -0.423887f, -0.266684f, -0.444930f, + -0.196237f, 0.106638f, -0.065834f, -0.538401f, -0.280772f, -0.620348f, + 1.089957f, -0.799928f, 0.504112f, -0.165763f, 0.578741f, -0.172653f, + 0.547316f, -0.143484f, 0.717220f, -0.297190f, -1.237854f, -0.074819f, + -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f, + -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f, + 0.231249f, -1.693073f, -0.035899f, 0.380845f, -0.058476f, 0.409405f, + -0.066679f, 0.406731f, -0.068501f, 0.396748f, 0.639462f, 0.150834f, + -0.418659f, -1.421931f, 0.101889f, 0.083573f, 0.129746f, 0.134460f, + 0.081185f, 0.127420f, 0.083664f, 0.051096f, 1.361688f, 0.386093f, +}; + +static const float av1_tx_split_nn_bias_8x8_layer0[12] = { + 4.280443f, 2.218902f, -0.256953f, 3.161431f, 2.082548f, 2.506052f, + 2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f, +}; + +static const float av1_tx_split_nn_weights_8x8_layer1[12] = { + 1.178833f, -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f, + -0.766968f, -0.356663f, 0.450146f, 0.509370f, -0.356604f, -0.443506f, +}; + +static const float av1_tx_split_nn_bias_8x8_layer1[1] = { + -0.156294f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x8 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 12, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x8_layer0, + av1_tx_split_nn_weights_8x8_layer1, + }, + { + av1_tx_split_nn_bias_8x8_layer0, + av1_tx_split_nn_bias_8x8_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x16 block. 
+static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = { + 0.374660f, 0.218905f, -0.139779f, 0.212141f, 0.056517f, 0.051114f, + 0.042860f, -0.273258f, -0.340809f, 0.138983f, -0.216996f, -0.241519f, + -0.123244f, 0.078577f, -0.472273f, -0.194201f, 0.125056f, 0.239761f, + -0.332782f, 0.174782f, -0.211400f, -0.129795f, 0.062195f, 0.113176f, + -0.008869f, 0.140764f, 0.059833f, 0.163826f, 0.359293f, -0.109797f, + -0.022091f, -0.059536f, -0.188226f, 0.179709f, 0.031386f, 0.164790f, + 0.214364f, 0.198555f, 0.152262f, -0.242980f, 0.319367f, -0.136902f, + 0.046524f, -0.043591f, 0.342178f, -0.011757f, -0.014286f, 0.072871f, + -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f, + -0.120865f, -0.160042f, 0.240028f, 0.112902f, -0.141587f, -0.703012f, + -0.136591f, 0.318993f, -0.154417f, -0.054668f, 0.192870f, 0.176166f, + -0.029965f, 0.266942f, -0.178384f, 0.038680f, 0.134403f, -0.002426f, + 0.534825f, -0.070923f, 0.413281f, 0.418148f, 0.093729f, 0.016454f, + 0.305358f, -0.040512f, 0.069904f, -0.227588f, -0.362220f, -0.031604f, + -0.394901f, 0.071506f, -0.342833f, -0.142550f, -0.164005f, 0.182600f, + 0.213062f, 0.076805f, 0.278758f, 0.125613f, -0.035552f, 0.040971f, + 0.182785f, -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f, + 0.114657f, 0.047121f, 0.195902f, 0.264759f, 0.017799f, 0.210230f, + 0.150749f, -0.142142f, 0.182494f, -0.142415f, -0.259782f, -0.114830f, + -0.198826f, 0.000061f, -0.375668f, -0.276656f, -0.373202f, 0.210298f, + 0.422680f, 0.066960f, 0.351106f, -0.209034f, 0.367195f, -0.110274f, + 0.115573f, -0.066642f, -0.389673f, -0.260447f, 0.056949f, -0.180425f, + 0.069922f, -0.153506f, -0.097053f, -0.111757f, 0.094069f, 0.144837f, + -0.052984f, -0.506681f, -0.034474f, 0.279057f, -0.105025f, 0.006656f, + -0.125017f, -0.114096f, 0.103153f, -0.117402f, -0.359472f, 0.072534f, + 0.110291f, 0.003088f, -0.456897f, 0.038331f, -0.322298f, 0.113942f, + -0.119916f, -0.194392f, 0.093167f, 0.193459f, 0.074671f, 0.033602f, + 0.004440f, -0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f, + 0.319160f, -0.066218f, 0.291246f, 0.181292f, 0.089914f, 0.025273f, + 0.303128f, 0.019063f, 0.078545f, -0.396919f, 0.014065f, -0.122121f, + 0.037107f, -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f, + 0.102970f, -0.225040f, 0.061059f, -0.258188f, -0.469871f, -0.099607f, + -0.061524f, -0.213700f, 0.070237f, -0.289134f, -0.238225f, 0.256403f, + -0.119344f, 0.067782f, -0.398983f, -0.123975f, -0.200205f, -0.047038f, + 0.026569f, 0.031037f, 0.094302f, -0.101239f, 0.433307f, -0.303612f, + 0.088537f, -0.164436f, 0.202471f, -0.048592f, -0.251904f, 0.122577f, + -0.309874f, -0.263405f, -0.292503f, 0.216589f, 0.035378f, 0.136599f, + -0.145844f, -0.018211f, 0.174084f, -0.449941f, -0.001428f, 0.064134f, + 0.039652f, 0.111083f, -0.246076f, -0.204733f, 0.056559f, -0.000123f, + 0.104049f, 0.138512f, -0.128309f, 0.087855f, 0.232784f, 0.247138f, + 0.162766f, 0.154829f, 0.313605f, -0.164115f, -0.050844f, 0.156549f, + 0.185279f, -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f, + -0.203399f, -0.096831f, -0.127867f, 0.310674f, -0.008181f, 0.004078f, + -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f, + 0.114268f, -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f, + -0.352853f, -0.224001f, -0.156330f, 0.215436f, 0.171846f, 0.291849f, + 0.108832f, 0.046991f, -0.127801f, 0.032485f, 0.141493f, 0.123319f, + -0.057250f, 0.315346f, -0.061317f, -0.465086f, -0.130179f, -0.217841f, + -0.239089f, -0.073251f, -0.327718f, 0.054905f, -0.283169f, 
-0.028900f, + 0.071450f, 0.270072f, 0.248891f, 0.088052f, 0.253319f, 0.122808f, + 0.175490f, -0.147805f, 0.089169f, -0.045457f, -0.330788f, 0.099791f, + -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f, + 0.162554f, -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f, + -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f, + -0.182036f, 0.176772f, -0.070823f, 0.216054f, -0.211533f, -0.232992f, + 0.279346f, 0.117984f, 0.236674f, 0.126625f, -0.046220f, 0.044919f, + 0.278492f, 0.083944f, 0.180512f, 0.217994f, 0.401170f, -0.064417f, + 0.011636f, -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f, + -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f, -0.312849f, + -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f, + -0.354389f, 0.169464f, 0.094151f, -0.217122f, -0.456397f, 0.211478f, + 0.219232f, -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f, + -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f, -0.144181f, + 0.335028f, 0.176439f, 0.105980f, 0.169390f, 0.155615f, -0.040618f, + -0.176029f, 0.155569f, -0.184833f, -0.171099f, -0.178663f, -0.032051f, + -0.434334f, 0.092238f, -0.263103f, 0.061804f, -0.172957f, 0.005962f, + -0.100176f, 0.125898f, 0.048092f, -0.088141f, 0.247196f, -0.221601f, + -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f, + 0.403401f, -0.046200f, 0.322259f, 0.219678f, 0.109850f, 0.051837f, + 0.196861f, -0.019118f, 0.248818f, -0.137567f, 0.127862f, 0.052293f, + 0.298726f, 0.275788f, 0.015344f, 0.058714f, 0.283691f, -0.053794f, + -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f, + -0.252396f, -0.069017f, 0.034803f, -0.003388f, -0.262577f, 0.062115f, + -0.298393f, 0.215415f, -0.153615f, 0.289902f, 0.085886f, -0.504290f, + 0.077178f, 0.150861f, -0.228848f, -0.261020f, 0.198204f, 0.162113f, + 0.346418f, -0.286950f, 0.354756f, -0.226419f, 0.024720f, 0.208037f, + 0.107286f, -0.110849f, 0.104415f, -0.207725f, 0.063932f, -0.037748f, + -0.167037f, -0.068282f, 0.320815f, -0.051884f, 0.099989f, -0.078388f, + 0.127071f, 0.046675f, -0.336571f, -0.273080f, 0.264694f, -0.007352f, + -0.093828f, 0.094773f, -0.144434f, 0.091795f, -0.031615f, 0.056914f, + 0.064673f, -0.136669f, 0.344734f, 0.225926f, 0.283451f, -0.068354f, + 0.030572f, 0.180784f, -0.378047f, -0.092962f, -0.083291f, 0.038970f, + 0.052094f, -0.017932f, 0.216302f, -0.184396f, 0.079888f, 0.210406f, + -0.020627f, 0.244744f, 0.336972f, -0.182914f, -0.220976f, -0.304225f, + -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f, + -0.408768f, 0.184693f, +}; + +static const float av1_tx_split_nn_bias_8x16_layer0[64] = { + -0.274107f, 0.445751f, 0.234359f, 0.291593f, 0.163298f, 0.183707f, + -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f, -0.354974f, + 0.000000f, -0.254630f, 0.220149f, 0.371104f, 0.789759f, 0.270300f, + 0.195126f, -0.206958f, 0.917708f, -0.256232f, 1.131933f, 1.178944f, + 0.461270f, 0.246169f, -0.818614f, -0.111986f, 0.759355f, 0.154889f, + 0.470299f, -1.025250f, 0.678678f, 0.959346f, -0.164105f, 0.544079f, + -0.448733f, 0.649221f, -0.536672f, 0.962758f, -0.256427f, 0.808664f, + -0.118694f, 0.684873f, -0.015635f, -0.046469f, 0.075481f, 0.412647f, + 0.454456f, -0.107169f, 0.775235f, -0.261629f, -1.194849f, 0.010093f, + -0.231289f, 0.658286f, -0.769320f, 0.564545f, 0.482962f, -0.131378f, + -0.255844f, -0.078400f, 0.476752f, 0.643001f, +}; + +static const float av1_tx_split_nn_weights_8x16_layer1[64] = { + -0.145065f, -0.145101f, 0.174786f, 0.196692f, 0.102025f, 
-0.087735f, + 0.386353f, -0.660539f, -0.183940f, 0.490045f, -0.276404f, -0.145669f, + 0.209846f, -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f, + -0.108545f, -0.261181f, 1.435606f, -0.176621f, -1.158548f, 2.035680f, + 0.218069f, -0.138629f, 0.305958f, -0.277194f, -0.602468f, 0.203873f, + 0.120720f, 0.216095f, -0.434502f, -0.579746f, -0.239450f, 0.755529f, + 0.545643f, 0.232091f, 0.330169f, 0.988136f, -0.070465f, -0.345584f, + -0.162455f, -0.617064f, 0.123881f, -0.201098f, 0.222756f, 0.112932f, + 0.048647f, -0.147890f, 0.394584f, -0.262148f, 0.280564f, -0.195432f, + -0.047515f, 1.133410f, 0.255415f, -0.299032f, -0.397807f, -0.153246f, + -0.256734f, 0.177370f, 0.213522f, -0.530158f, +}; + +static const float av1_tx_split_nn_bias_8x16_layer1[1] = { + 0.14910713f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x16 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 64, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x16_layer0, + av1_tx_split_nn_weights_8x16_layer1, + }, + { + av1_tx_split_nn_bias_8x16_layer0, + av1_tx_split_nn_bias_8x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 16x16 block. +static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = { + -0.177215f, -0.297166f, 0.299924f, 0.207878f, 0.216871f, 0.173264f, + 0.295464f, 0.048395f, 0.154731f, 0.305880f, 0.056787f, -0.166617f, + 0.115653f, -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f, + -0.024940f, -0.007055f, 0.001392f, 0.021678f, -1.594600f, -0.099593f, + 0.332930f, 0.103574f, 0.158249f, 0.182601f, 0.332665f, 0.226207f, + -0.139566f, 0.185531f, 0.099074f, -0.185654f, -0.203121f, -0.285678f, + -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f, + -0.066150f, -0.099058f, -0.458879f, 0.127544f, 0.338314f, -0.161350f, + 0.030091f, -0.075528f, 0.004320f, 0.353690f, -0.013480f, -0.420402f, + -0.004659f, -0.329401f, -0.001745f, 0.227384f, -0.055183f, 0.121405f, + 0.160340f, 0.143603f, -0.221813f, 0.079107f, -0.657639f, -0.084348f, + -0.303414f, 0.046774f, -0.367679f, 0.060005f, 0.168645f, 0.084421f, + -0.133625f, 0.301375f, 0.079412f, -0.419303f, 0.017235f, 0.068637f, + 0.018384f, -0.428325f, -0.019753f, 0.149444f, -0.474836f, -0.287162f, + 0.198083f, 0.028292f, -0.299092f, -0.005849f, -0.256245f, 0.233277f, + -0.217561f, -0.264003f, 0.269411f, 0.207032f, -0.339411f, -0.198431f, + -0.028521f, 0.158076f, 0.177116f, 0.345702f, -0.145132f, 0.064623f, + -0.090867f, 0.288816f, -0.263198f, -0.071028f, -0.044546f, 0.380017f, + -0.014100f, -0.271192f, -0.318559f, 0.129015f, -0.050314f, -0.093355f, + -0.578498f, 0.099090f, -0.133080f, -0.029975f, -0.059828f, -0.157765f, + -0.321153f, -0.343671f, -0.242959f, 0.128304f, 0.017170f, 0.072787f, + -0.475838f, -0.003806f, -0.068615f, 0.150556f, -0.159903f, -0.416513f, + 0.218794f, -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f, + -0.077329f, -0.089747f, -0.096526f, 0.537952f, 0.134725f, -0.006469f, + -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f, -0.021712f, + -0.513992f, 0.259135f, -0.319808f, 0.077811f, 0.104613f, 0.370571f, + 0.185244f, 0.065530f, -0.091098f, -0.573741f, 0.111934f, 0.437417f, + -0.123691f, 0.220641f, -0.024783f, -0.149460f, -0.354185f, -0.134127f, + 0.038015f, -0.380596f, 0.250980f, 0.142208f, 0.135170f, -0.131129f, + -0.357556f, -0.530945f, 0.159672f, -0.147025f, -0.377829f, -0.504508f, + -0.492870f, 0.020753f, 0.142818f, 0.025172f, 0.086140f, 0.091283f, + 0.087491f, -0.186415f, 0.177785f, 
-0.195121f, -1.191148f, -0.477102f, + 0.023371f, 0.227004f, -0.023502f, -0.242913f, -0.074398f, -0.153480f, + 0.162900f, 0.415509f, -0.162565f, -0.131709f, -0.258852f, -0.252027f, + -0.080845f, -0.330274f, 0.021874f, 0.232398f, 0.069277f, 0.220567f, + -0.024237f, -0.366771f, 0.081673f, -0.429906f, -0.302170f, 0.061045f, + 0.352777f, -0.230376f, 0.408153f, 0.064758f, 0.142051f, 0.007219f, + 0.622878f, 0.212577f, 0.036489f, 0.081150f, -0.284767f, 0.107763f, + -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f, + -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f, 0.057565f, + 0.414265f, -0.159155f, 0.221456f, 0.146314f, 0.265776f, -0.006516f, + 0.473978f, -0.186431f, 0.288672f, -0.060437f, 0.083380f, -0.205641f, + 0.360016f, 0.222041f, 0.420011f, 0.024579f, 0.377546f, 0.250380f, + -0.069900f, 0.296743f, 0.073532f, -0.243225f, -0.374987f, -0.387288f, + -0.237255f, -0.287013f, 0.417831f, -0.252988f, -0.257652f, -0.066775f, + -0.253926f, 0.057841f, 0.346133f, -0.157797f, -0.406028f, -0.286893f, + 0.274507f, -0.452561f, 0.143381f, -0.097755f, 0.021242f, 0.034561f, + 0.044115f, 0.004065f, 0.066729f, 0.043558f, 0.102991f, -0.477574f, +}; + +static const float av1_tx_split_nn_bias_16x16_layer0[24] = { + -0.479033f, 1.467402f, -0.366291f, 0.372511f, 0.715322f, -0.605500f, + 0.176848f, 0.032318f, 0.237429f, -0.046047f, 0.452082f, 0.451805f, + -0.822845f, 0.636762f, -0.057350f, 1.163978f, 0.728287f, 0.603654f, + -0.245519f, -0.893569f, -1.428185f, 0.808870f, -0.076159f, 1.231976f, +}; + +static const float av1_tx_split_nn_weights_16x16_layer1[24] = { + -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f, -0.170504f, + -0.538432f, 0.033893f, 0.149842f, 0.404140f, -0.377812f, 0.338838f, + -0.176091f, 0.249844f, -0.362533f, 1.412460f, 0.196862f, 0.278194f, + -0.140444f, 0.297746f, 0.172533f, 0.116470f, -0.151656f, -0.603250f, +}; + +static const float av1_tx_split_nn_bias_16x16_layer1[1] = { + 0.184803f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x16 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 24, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x16_layer0, + av1_tx_split_nn_weights_16x16_layer1, + }, + { + av1_tx_split_nn_bias_16x16_layer0, + av1_tx_split_nn_bias_16x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 32x32 block. 
+static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = { + -0.439303f, 0.004813f, -0.365052f, -0.116868f, -0.356716f, -0.196537f, + -0.196770f, -0.076096f, 0.357004f, -0.044909f, -0.112910f, -0.129081f, + 0.156725f, -0.386346f, 0.038971f, 0.160696f, 0.204923f, -0.384333f, + -0.319546f, 0.028179f, -0.250524f, -0.289669f, -0.284138f, -0.258963f, + -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f, 0.141414f, + 0.303016f, 0.098066f, 0.482455f, 0.036069f, -0.166279f, 0.210119f, + -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f, + -0.306403f, 0.026318f, -0.277296f, 0.092684f, -0.033584f, -0.018371f, + -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f, + 0.361851f, -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f, + -0.097051f, 0.259172f, 0.016432f, 0.259358f, 0.145059f, 0.037196f, + 0.091581f, -0.219644f, 0.140384f, -0.446837f, -0.234531f, 0.149508f, + -0.083429f, 0.186189f, -0.099890f, -0.111277f, 0.495214f, 0.085053f, + -0.266613f, -0.051366f, 0.148593f, 0.111875f, 0.077787f, -0.371653f, + -0.146157f, -0.229235f, 0.076203f, 0.488975f, 0.096771f, -0.009483f, + 0.192985f, 0.246273f, -0.192671f, -0.557890f, -0.292650f, -0.088907f, + -0.106892f, -0.329659f, 0.012105f, -0.359326f, 0.170723f, -0.004357f, + 0.171593f, -0.478768f, -0.236016f, -0.035077f, 0.133731f, 0.137962f, + -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f, + -0.649359f, 0.127605f, 0.097930f, 0.182775f, -0.313324f, 0.053349f, + 0.204203f, -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f, + -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f, + 0.632147f, 0.221825f, 0.268394f, -0.096357f, 0.442545f, -0.007117f, + -0.036125f, 0.000525f, 0.088092f, -0.203653f, 0.086925f, 0.439141f, + 0.329889f, -0.370050f, -0.194306f, -0.207430f, 0.132779f, -0.217614f, + -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f, + -0.007300f, 0.062257f, -0.347865f, -0.296767f, -0.359123f, 0.230459f, + -0.189117f, -0.087622f, -0.561091f, 0.184182f, -0.044980f, 0.012643f, + 0.241672f, 0.050272f, -0.204851f, -0.159285f, -0.064081f, -0.118666f, + -0.269471f, 0.231668f, 0.135749f, -0.131162f, 0.062760f, 0.100949f, + 0.074967f, -0.056918f, 0.251707f, 0.034098f, 0.341290f, -0.105027f, + 0.313246f, -0.092679f, -0.014632f, -0.390967f, 0.136881f, -0.241554f, + 0.097674f, 0.110832f, -0.390245f, 0.017654f, -0.506222f, 0.065252f, + 0.244834f, -0.171352f, -0.331702f, 0.111043f, 0.125217f, -0.058116f, + -0.382595f, -0.052545f, 0.114261f, -0.493617f, 0.243984f, -0.171053f, + 0.165009f, -0.063020f, 0.096502f, 0.341339f, -0.013443f, 0.056372f, + 0.339284f, 0.398376f, 0.389409f, 0.257252f, 0.517368f, 0.078856f, + 0.087716f, -0.171092f, 0.227461f, 0.125307f, -0.054423f, -0.143161f, + 0.224041f, -0.086477f, -0.092548f, 0.072392f, -0.061608f, 0.258347f, + 0.147033f, -0.478244f, -0.204869f, 0.038552f, -0.144563f, 0.224087f, + -0.296705f, 0.153889f, -0.064624f, 0.085265f, -0.103826f, 0.127971f, + 0.019965f, 0.111937f, -0.074187f, -0.029518f, -0.127305f, -0.012210f, + 0.042714f, 0.070052f, -0.202360f, 0.348144f, -0.132097f, -0.209585f, + -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f, -0.013468f, + -0.406090f, -0.144936f, 0.208620f, 0.343445f, -0.059639f, 0.114857f, + -0.069431f, -0.218725f, 0.190575f, -0.368101f, 0.030030f, 0.062815f, + -0.239369f, -0.537852f, 0.022487f, 0.023038f, 0.190788f, 0.040123f, + -0.004304f, 0.060749f, -0.108929f, 0.136796f, -0.542875f, -0.227074f, + -0.182244f, 0.082559f, 0.019149f, 0.178854f, 0.120284f, 
0.009070f, + 0.068268f, -0.544822f, 0.120536f, 0.354028f, -0.119890f, -0.122055f, + -0.405335f, 0.122341f, -0.304412f, 0.062405f, -0.302568f, -0.276505f, + -0.120915f, -0.221841f, 0.282007f, -0.253971f, 0.059517f, -0.144976f, + 0.149391f, -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f, + 0.017485f, 0.021038f, -0.023728f, -0.192181f, -0.103996f, 0.092873f, + -0.114365f, -0.397732f, -0.065421f, 0.053084f, 0.035201f, 0.053019f, + -0.105377f, -0.039500f, 0.131904f, -0.123911f, -0.390328f, -0.125198f, + -0.000126f, 0.014864f, -0.220187f, 0.084056f, -0.492155f, -0.164979f, + 0.133592f, 0.121519f, -0.240813f, 0.186680f, 0.118673f, 0.235006f, + -0.239894f, -0.185759f, -0.336992f, 0.209620f, -0.298845f, 0.127803f, + -0.083992f, 0.194340f, -0.245378f, 0.212308f, 0.142512f, -0.163324f, + 0.383495f, 0.291065f, 0.286620f, -0.239957f, 0.225127f, -0.174424f, + 0.297231f, -0.045434f, 0.156444f, -0.184273f, -0.204567f, 0.202551f, + 0.370019f, -0.073910f, 0.344897f, 0.063100f, 0.338547f, -0.099145f, + 0.391863f, -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f, +}; + +static const float av1_tx_split_nn_bias_32x32_layer0[32] = { + 0.143343f, -0.021982f, -0.314939f, 0.170867f, -0.081248f, 0.125758f, + -0.355762f, 0.279798f, 1.027712f, -0.434660f, 1.072005f, 0.668893f, + -0.031216f, -0.528650f, 0.328349f, 0.543645f, -0.188810f, 0.221110f, + -1.638637f, 0.058045f, -1.731105f, -0.444284f, 0.513693f, 0.890025f, + 0.160288f, 0.393312f, 0.332856f, -0.080767f, 0.299822f, 0.235876f, + 0.254942f, -0.017796f, +}; + +static const float av1_tx_split_nn_weights_32x32_layer1[32] = { + -0.090326f, -0.267553f, -0.026071f, 0.100912f, 0.279137f, 0.079064f, + -0.074885f, 0.053804f, 0.736810f, -0.031693f, -0.970514f, 0.174069f, + 0.095940f, -0.065047f, 0.052911f, 0.176728f, -0.058274f, 0.148364f, + -0.162210f, 0.093875f, -0.367663f, 0.020876f, 0.137280f, -1.099116f, + 0.146854f, 0.075590f, 0.228534f, 0.141993f, 0.072143f, 0.101421f, + -0.068547f, -0.154148f, +}; + +static const float av1_tx_split_nn_bias_32x32_layer1[1] = { + 0.316622f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_32x32 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_32x32_layer0, + av1_tx_split_nn_weights_32x32_layer1, + }, + { + av1_tx_split_nn_bias_32x32_layer0, + av1_tx_split_nn_bias_32x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 64x64 block. 
+static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = { + -0.006828f, 0.149944f, -0.017614f, -0.044599f, -0.024517f, 0.507698f, + 0.001039f, 0.037164f, 0.015091f, -0.306620f, -0.162047f, -0.369440f, + 0.396310f, 0.087121f, 0.208609f, -0.083068f, 0.493774f, 0.217682f, + 0.377393f, 0.172879f, 0.397422f, 0.078919f, 0.741350f, 0.064169f, + -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f, + -0.436596f, -0.007551f, -0.396721f, 0.153570f, -0.190838f, -0.071869f, + 0.048799f, -0.301301f, -0.005015f, 0.500480f, -0.030622f, -0.559095f, + -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f, -0.411323f, + -0.005366f, -0.069496f, 0.019990f, 0.327931f, -0.002516f, 0.393190f, + 0.001759f, 0.035093f, -0.030302f, -0.528984f, 0.174781f, 0.241462f, + -0.415427f, -0.164502f, 0.143065f, -0.122595f, 0.082049f, -0.143346f, + 0.055642f, -0.124701f, 0.004050f, -0.216235f, -2.681730f, 0.101658f, + 0.381239f, 0.465936f, 0.331154f, 0.301708f, -0.360171f, 0.054886f, + -0.118658f, 0.287921f, 0.277859f, 0.203784f, 0.247809f, 0.656924f, + -0.354628f, 0.315081f, 0.105108f, -0.510179f, 0.059267f, 0.061386f, + 0.076423f, 0.347119f, 0.100134f, 0.028402f, -0.118621f, -0.238689f, + 0.080141f, -0.138863f, 0.009009f, -0.100526f, -0.138875f, 0.066992f, + 0.005949f, 0.564336f, 0.046994f, 0.004655f, 0.366047f, 0.014695f, + -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f, -0.020925f, + -0.227236f, -0.068141f, 0.282009f, 0.040192f, -0.267100f, 0.229228f, + 0.133861f, 0.338706f, -0.030178f, -0.040919f, -0.026343f, -0.330338f, + -0.066931f, -0.110580f, -0.072056f, 0.599457f, -0.020738f, 0.169200f, + 0.836240f, -0.157548f, 0.386273f, 0.002404f, 0.329410f, -0.007020f, + 0.351705f, -0.041259f, 0.388861f, 0.003899f, 0.582627f, 0.023572f, + 0.409912f, -0.158472f, 0.536383f, 0.525093f, 0.604247f, 0.439159f, + 0.692832f, 0.046272f, 0.590367f, -0.082166f, 0.262357f, 0.478671f, + 0.031935f, 0.042675f, 0.120002f, 0.398616f, -0.078967f, 0.227986f, + -0.044679f, 0.151061f, -0.085564f, 0.220205f, -0.265606f, -0.203623f, + 0.204719f, -0.125922f, 0.038544f, -0.269379f, 0.025866f, 0.109967f, + 0.019064f, -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f, + 0.278496f, 0.018620f, 0.209971f, 0.296250f, 0.142850f, 0.288689f, + 0.137084f, 0.130517f, 0.128171f, -0.155396f, -0.008449f, -0.099845f, + 0.173455f, -0.059909f, -0.147318f, 0.102851f, -0.251389f, -0.001448f, + 0.103907f, 0.297273f, -0.027846f, 0.028260f, -0.382601f, 0.346695f, + -0.601641f, 0.162366f, -0.477495f, -0.042731f, -0.387871f, -0.051791f, + -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f, 0.003008f, + 0.099917f, -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f, + -0.050644f, 0.020041f, -0.132912f, -0.061578f, -3.083691f, -0.014961f, + -0.129115f, -0.710559f, 0.157213f, -0.844037f, -0.121991f, -0.943386f, + -0.231269f, -0.003462f, 0.331478f, -0.132703f, -1.285993f, -0.120957f, + -0.373755f, -0.322609f, 0.309059f, -0.131523f, -0.118334f, -0.063805f, + -0.104251f, 0.012166f, -0.094699f, -0.283753f, 0.128168f, -0.526929f, + -0.050331f, 0.186153f, 0.005913f, -0.221236f, 0.036363f, 0.160909f, + -0.001342f, -0.382749f, 0.037820f, 0.281689f, -0.024275f, 0.028854f, + 0.318291f, 0.318526f, 0.035778f, 0.034031f, 0.189663f, -0.293367f, + 0.082022f, 0.127923f, 0.078866f, -0.081361f, -0.268117f, 0.246675f, + 0.248605f, -0.215479f, -0.073084f, 0.496140f, -0.067327f, 0.396237f, + -0.120739f, 0.033752f, -0.044120f, -0.218941f, -0.028078f, 0.195132f, + -0.040400f, 0.281604f, -0.100471f, 0.415207f, -0.258503f, -0.429749f, + 
0.150569f, -0.010859f, 0.136448f, 0.026589f, 0.148466f, 0.110764f, + 0.380967f, 0.009177f, 0.103075f, 0.116417f, 0.226273f, -0.327746f, + 0.169346f, 0.284553f, -0.094986f, 0.312745f, -0.147840f, 0.025062f, + -0.494482f, 0.112388f, -0.213962f, 0.107050f, -0.433371f, -0.096276f, + -0.244835f, -0.003518f, -0.459148f, -0.145080f, 0.017150f, 0.042846f, + -0.237479f, 0.104746f, 0.158677f, 0.358937f, 0.099921f, 0.277109f, + 0.012410f, -0.062897f, 0.116130f, 0.255309f, 0.341628f, 0.145002f, + -0.429344f, -0.016433f, -0.068985f, 0.285194f, -0.286719f, -0.018298f, + -0.179369f, -0.194655f, -0.165380f, 0.026071f, -0.428268f, -0.379929f, + -0.727543f, 0.179610f, -0.963979f, -0.042026f, -0.616202f, 0.133401f, + -0.784966f, 0.061205f, -0.713357f, 0.129795f, 0.120512f, -0.339545f, + 0.353557f, 0.114906f, -0.329813f, -0.209987f, 0.085410f, 0.214313f, + -0.122082f, 0.335770f, -0.020937f, 0.202456f, 0.289023f, -0.421186f, + 0.337905f, 0.407663f, 0.132771f, 0.071734f, 0.213914f, 0.128595f, + 0.302659f, -0.209501f, 0.217756f, 0.253079f, -0.089505f, -0.205614f, +}; + +static const float av1_tx_split_nn_bias_64x64_layer0[32] = { + 0.296914f, -1.826816f, 0.346130f, 0.969520f, -0.528154f, 1.175862f, + -0.075985f, -0.097323f, -0.233059f, 0.004846f, 0.401279f, -2.272435f, + 0.086257f, 0.414162f, -0.194786f, -0.233887f, -0.113215f, -2.453546f, + 0.861214f, 0.298361f, 0.267397f, -0.158557f, -0.119911f, -0.098134f, + -0.339263f, 0.385871f, -0.678123f, 0.263218f, 0.251611f, -1.155773f, + -0.365437f, 0.229255f, +}; + +static const float av1_tx_split_nn_weights_64x64_layer1[32] = { + 0.502104f, -0.708023f, 0.419648f, 1.583418f, 0.419355f, -1.462981f, + -0.439623f, 0.405691f, 0.823257f, 0.061654f, 0.750875f, 0.775031f, + -0.387909f, 0.447385f, 0.284690f, 0.353262f, -0.224347f, 0.832864f, + -1.708491f, -1.042447f, -0.272829f, 0.540640f, 0.310509f, 0.723745f, + 0.245592f, -0.218417f, -0.597987f, -0.362301f, 0.702217f, -0.692614f, + 0.207812f, 0.513560f, +}; + +static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f }; + +static const NN_CONFIG av1_tx_split_nnconfig_64x64 = { + 12, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_64x64_layer0, + av1_tx_split_nn_weights_64x64_layer1, + }, + { + av1_tx_split_nn_bias_64x64_layer0, + av1_tx_split_nn_bias_64x64_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 4x16 block. 
+static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = { + -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f, + -2.128968f, -0.655518f, 0.432180f, 0.879752f, -0.222211f, 0.061615f, + -0.230969f, 0.569496f, 1.424188f, 0.598063f, -0.436005f, -0.737606f, + -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f, + -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f, -0.331752f, + -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f, + -0.636060f, 0.183271f, -0.610212f, 0.345895f, -1.100906f, -1.605713f, + 0.111888f, -0.140937f, 0.063013f, -0.013315f, -0.273472f, -0.255870f, + 1.200328f, 0.274002f, 1.005776f, 0.322392f, 1.222373f, 0.158227f, + 0.408810f, 0.145022f, 0.139842f, -1.249412f, 0.286672f, -0.635699f, + 0.312562f, -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f, + -0.132199f, -0.863055f, 0.217579f, -1.161425f, -0.302087f, -1.357271f, + -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f, + -2.057684f, -0.228755f, 0.606278f, 0.101198f, -0.314847f, -1.303255f, + -0.294964f, 1.301923f, 0.041712f, 0.077593f, -1.152746f, 0.495315f, + -0.751566f, 0.230249f, -0.840661f, 0.100731f, 1.346269f, 0.649898f, + -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f, + -0.354072f, 0.068292f, -0.234168f, 0.277503f, 0.179134f, 0.907420f, + 0.354626f, -0.627210f, 0.905779f, 0.512612f, 0.161190f, -0.843177f, + 0.014953f, -0.354983f, 0.011116f, -0.429598f, -1.017138f, -0.211432f, + 0.941840f, -0.281747f, 0.957776f, -0.541914f, 1.041880f, -0.433580f, + -1.416451f, -0.166467f, +}; + +static const float av1_tx_split_nn_bias_4x16_layer0[16] = { + 3.086118f, -3.235095f, 4.830956f, -0.165706f, 0.955031f, 4.055783f, + -0.311489f, 4.660205f, -0.576277f, -0.248111f, -0.790519f, -1.686412f, + -1.191704f, -3.800073f, 4.121552f, -1.399397f, +}; + +static const float av1_tx_split_nn_weights_4x16_layer1[16] = { + -0.758677f, 0.388776f, 0.439906f, 0.011390f, -0.084319f, -0.667969f, + -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f, -0.549682f, + 0.462109f, 0.343315f, 1.092593f, 0.483152f, +}; + +static const float av1_tx_split_nn_bias_4x16_layer1[1] = { + 0.8205083f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_4x16 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 16, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_4x16_layer0, + av1_tx_split_nn_weights_4x16_layer1, + }, + { + av1_tx_split_nn_bias_4x16_layer0, + av1_tx_split_nn_bias_4x16_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 16x32 block. 
+static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = { + 0.180713f, 0.033211f, 0.607561f, 0.138642f, 0.637204f, -0.000940f, + 0.012630f, 0.358109f, 0.022238f, 0.190418f, 0.079088f, 0.065925f, + 0.038242f, 0.162380f, -0.122728f, 0.379382f, -0.303283f, -0.327550f, + 0.029120f, -0.284553f, 0.269588f, -0.309805f, -0.241036f, -0.161103f, + -0.304887f, 0.239843f, -0.149146f, 0.311234f, -0.073640f, -0.132718f, + 0.178901f, 0.474712f, 0.020280f, 0.063685f, -0.609170f, -0.013658f, + -0.338074f, 0.250429f, 0.082978f, -0.186315f, -0.788959f, 0.039859f, + -0.426461f, -0.001524f, -0.447211f, 0.378102f, 0.315617f, 0.017428f, + 0.745494f, -0.219024f, 0.512836f, 0.200522f, 0.680449f, 0.313686f, + -0.412569f, -0.132927f, 0.631120f, 0.042735f, 0.336153f, 0.044772f, + 0.432606f, 0.175681f, -0.634411f, -0.073509f, -0.040643f, -0.559260f, + -0.104034f, -0.570495f, -0.247365f, 0.063256f, -0.582021f, -0.492585f, + -0.194955f, -0.207934f, -0.506627f, 0.021743f, -0.416518f, 0.320876f, + 0.115889f, 0.149399f, -0.229376f, 0.095505f, 0.115191f, -0.471921f, + 0.113068f, 0.343684f, -0.036831f, 0.021240f, 0.295112f, 0.031166f, + 0.448201f, -0.132241f, 0.164032f, 0.355572f, 0.072154f, 0.017335f, + -0.046113f, 0.178719f, -0.026881f, -0.242590f, 0.055073f, -0.012958f, + 0.077904f, 0.351356f, 0.107655f, 0.260568f, -0.080052f, -0.197553f, + 0.085763f, 0.263416f, -0.327741f, 0.158855f, 0.056899f, -0.162121f, + 0.339518f, -0.571204f, 0.264966f, -0.252214f, -0.202560f, -0.134213f, + -0.330188f, 0.009470f, -0.468376f, -0.065240f, -0.307957f, 0.116479f, + -0.222238f, -0.458716f, 0.186493f, -0.391415f, 0.118649f, -0.104653f, + -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f, + -0.598358f, 0.164947f, -0.119694f, -0.058520f, 0.203829f, -0.267404f, + -0.048202f, -0.600006f, 0.181594f, -0.731805f, 0.146417f, -0.687148f, + -1.210525f, -0.450101f, -0.620635f, 0.208825f, -0.611357f, 0.112202f, + -0.309468f, -0.323545f, 0.357770f, 0.308061f, 0.553199f, 0.049012f, + 0.530093f, -0.208597f, 0.607882f, -0.058120f, -0.527634f, 0.018136f, + 0.060753f, 0.118894f, 0.175649f, 0.014731f, 0.428318f, -0.106465f, + -0.119077f, 0.080179f, 0.524997f, 0.368286f, 0.528286f, 0.213659f, + 0.639286f, 0.195079f, -0.049815f, -0.092008f, -0.302958f, 0.298149f, + -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f, 0.325622f, + -0.115293f, 0.155188f, 0.047225f, 0.231050f, -0.167447f, 0.349754f, + 0.295544f, -0.319466f, 0.095144f, 0.174612f, -0.194652f, 0.305915f, + -0.239008f, -0.037453f, 0.280696f, 0.125850f, 0.749196f, -0.101919f, + 0.791808f, -0.236811f, 0.064157f, 0.032865f, -0.225911f, 0.350384f, + 0.723183f, -0.103992f, 0.483085f, -0.123992f, 0.602138f, 0.023895f, + -0.692601f, -0.118387f, 0.162527f, 0.145178f, -0.184702f, -0.017753f, + -0.159436f, 0.124105f, -0.131067f, 0.310275f, 0.151499f, 0.138924f, + 0.537459f, 0.263212f, 0.615896f, 0.281255f, 0.021293f, -0.473459f, + 0.210145f, -0.056682f, 0.063658f, 0.377254f, -0.314410f, -0.183487f, + 0.300384f, 0.328471f, 0.164694f, -0.159272f, -0.160942f, -0.502861f, + -0.129147f, 0.045916f, -0.606865f, -0.101378f, +}; + +static const float av1_tx_split_nn_bias_16x32_layer0[32] = { + 0.051664f, -0.212487f, -0.077596f, -0.818467f, 0.638475f, -0.759937f, + 0.157198f, 0.989640f, 1.586035f, 0.431144f, 0.041605f, 0.543085f, + 0.498379f, 0.320504f, 0.134233f, 0.670979f, -0.105562f, -1.574879f, + 1.261812f, -0.287530f, -1.610592f, 0.730899f, -0.894240f, -0.657790f, + 0.270806f, -0.181708f, 0.298578f, 0.817240f, -0.221508f, -0.201771f, + -0.294389f, 1.456413f, +}; + +static const 
float av1_tx_split_nn_weights_16x32_layer1[32] = { + 1.208914f, 0.324728f, 0.383352f, -0.874321f, 0.172565f, -0.580927f, + -0.432927f, 0.433698f, -0.801935f, 0.672028f, 0.563493f, 0.260077f, + -0.200557f, -0.121638f, 0.530735f, -0.525196f, 0.281799f, 0.624204f, + -0.662775f, -0.230887f, 0.980989f, 0.223437f, -0.790591f, 0.600724f, + -0.273445f, 0.427635f, -0.501641f, -0.878390f, 0.234731f, -0.172550f, + 0.418904f, 1.792187f, +}; + +static const float av1_tx_split_nn_bias_16x32_layer1[1] = { + -0.29233751f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_16x32 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_16x32_layer0, + av1_tx_split_nn_weights_16x32_layer1, + }, + { + av1_tx_split_nn_bias_16x32_layer0, + av1_tx_split_nn_bias_16x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 32x64 block. +static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = { + 0.031614f, -0.110926f, 0.052418f, -0.702506f, 0.045708f, 0.238329f, + -0.021806f, -0.208128f, 0.509745f, -0.293891f, 0.277788f, 0.113937f, + 0.741576f, 0.062848f, 0.351878f, 0.212532f, 0.385842f, 0.081517f, + 0.398502f, -0.015156f, 0.242616f, 0.214619f, -0.182678f, -0.170546f, + 0.110605f, -0.236749f, -0.023831f, -0.285243f, 0.147156f, -0.257639f, + 0.341355f, -0.571641f, -0.721797f, 0.139588f, -0.518494f, -0.206526f, + -0.570560f, -0.184295f, 0.110271f, 0.210292f, -0.109132f, -0.001080f, + 0.129251f, -0.204230f, -0.396312f, -0.183024f, 0.421243f, -0.013154f, + 0.222627f, 0.169826f, 0.226037f, 0.218153f, -0.343528f, 0.274906f, + -0.156632f, 0.250261f, -0.484020f, 0.019909f, -0.349575f, -0.286643f, + -0.507396f, 0.202446f, -0.154110f, -0.292644f, 0.122666f, 0.306963f, + 0.424895f, 0.005579f, 0.494094f, -0.079551f, 0.473740f, 0.352414f, + -0.356917f, 0.264331f, -0.554487f, 0.119978f, 0.012291f, -0.141641f, + -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f, -0.118501f, + 0.305151f, -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f, + -0.177066f, -0.055114f, 0.229698f, -0.199523f, 0.054278f, 0.365020f, + -0.060586f, -0.300618f, 0.157563f, -0.064338f, -0.005711f, -0.176991f, + -0.424502f, -0.111914f, 0.092608f, 0.126621f, 0.078547f, 0.148008f, + 0.024221f, 0.124599f, 0.001343f, 0.059402f, 0.453753f, 0.047102f, + 0.242544f, 0.055735f, -0.067451f, -0.170061f, -0.170469f, -0.232173f, + 0.214908f, 0.248889f, 0.544348f, -0.084566f, 0.402478f, 0.298031f, + 0.099038f, -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f, + -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f, + 0.100219f, 0.293934f, 0.099271f, -0.036320f, 0.356626f, -0.261445f, + 0.879544f, 0.000878f, 0.532920f, -0.093918f, 0.508867f, -0.040215f, + -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f, 0.352989f, + -0.058831f, -0.164588f, 0.039890f, 0.122861f, 0.222508f, 0.061217f, + 0.466487f, 0.022666f, 0.423777f, -0.002200f, -0.656835f, -0.099760f, + -0.520606f, 0.303204f, -0.563620f, -0.160922f, -0.243203f, 0.313354f, + -0.336516f, -0.206764f, -0.236040f, 0.325899f, -0.418748f, 0.163205f, + -0.476242f, -0.121928f, 0.139178f, -0.157193f, -0.531766f, -0.180202f, + -0.485254f, 0.187703f, -0.440072f, 0.137854f, 0.029139f, 0.109530f, + -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f, + -0.304542f, 0.005123f, 0.413995f, 0.314639f, 0.342648f, -0.293264f, + 0.358135f, -0.180425f, -0.369530f, -0.048413f, 0.498366f, 0.121875f, + 0.270948f, -0.187966f, 
0.342503f, 0.174420f, -0.352105f, 0.088080f, + 0.008277f, 0.020275f, -0.002381f, 0.504389f, -0.018832f, -0.366047f, + -0.090947f, -0.168150f, 0.016184f, -0.328914f, 0.089579f, -0.017349f, + 0.005844f, -0.005010f, -1.857514f, -0.282426f, 0.010177f, -0.214727f, + -0.182529f, 0.156943f, -0.162032f, -0.472654f, 0.069432f, 0.016901f, + -0.767905f, 0.137129f, -0.411463f, 0.049056f, -0.431657f, -0.037641f, + 0.785500f, 0.046225f, 0.195831f, 0.245204f, 0.368614f, 0.212261f, + 0.440626f, -0.158048f, -0.461031f, -0.146280f, +}; + +static const float av1_tx_split_nn_bias_32x64_layer0[32] = { + 0.490777f, -1.894238f, 0.621333f, -0.076756f, 0.286298f, 0.286375f, + -0.126431f, -0.350034f, -1.017572f, 0.620125f, 0.408128f, 0.238756f, + -0.060728f, 0.210912f, 0.043124f, 0.445649f, 0.907025f, 0.360272f, + 1.083101f, -0.068952f, 1.062348f, 0.396354f, 0.280075f, 0.501732f, + 0.328422f, 0.066241f, 0.474697f, 0.126313f, 0.741206f, 0.314796f, + 0.552712f, 0.299410f, +}; + +static const float av1_tx_split_nn_weights_32x64_layer1[32] = { + 1.033823f, 0.603439f, 0.304591f, -0.279940f, -0.780909f, -0.132801f, + 0.154059f, 0.662014f, -0.718368f, 0.198733f, 0.039766f, -0.208516f, + -0.104909f, -0.394209f, 0.081617f, 0.365041f, -0.874960f, -0.063315f, + -1.189897f, 0.337225f, 0.410893f, 0.307519f, 0.221323f, 0.233895f, + 0.469536f, 0.438557f, 0.280144f, 0.422423f, -1.394513f, 0.781900f, + 0.352981f, 0.111265f, +}; + +static const float av1_tx_split_nn_bias_32x64_layer1[1] = { + -0.18160765f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_32x64 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_32x64_layer0, + av1_tx_split_nn_weights_32x64_layer1, + }, + { + av1_tx_split_nn_bias_32x64_layer0, + av1_tx_split_nn_bias_32x64_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 8x32 block. 
+static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = { + -0.687846f, 0.121404f, -0.372905f, 0.126770f, -0.103298f, -0.101650f, + -0.148490f, -0.271740f, 0.682915f, -0.079765f, 0.634347f, -0.151503f, + 0.287692f, -0.079072f, -0.236948f, 0.065064f, 0.713383f, 0.397123f, + 0.553621f, 0.368529f, 0.767663f, -0.046601f, -0.392402f, -0.294822f, + -0.292325f, -0.010573f, -0.837945f, 0.050113f, -0.811360f, 0.199162f, + 0.150832f, 0.011602f, 0.369694f, -0.225876f, 0.234113f, -0.269808f, + 0.303805f, -0.190281f, -0.451136f, 0.209755f, -0.308894f, 0.326956f, + 0.313591f, 0.089923f, -0.095754f, 0.390981f, 0.467366f, 0.169670f, + 0.853322f, 0.054055f, 0.830319f, -0.121918f, 0.262019f, -0.093526f, + 0.385558f, 0.419174f, 0.040198f, -0.347030f, -0.450492f, -0.106764f, + 0.487502f, -0.204188f, 0.430374f, -0.116388f, 0.236407f, -0.157376f, + 0.732294f, -0.651387f, 0.347446f, 0.342575f, 0.048406f, 0.187657f, + 0.434899f, -0.447782f, 0.032728f, -0.071168f, -0.255327f, 0.104174f, + 0.095689f, -0.431743f, 0.725694f, 0.031797f, 0.523171f, 0.061801f, + 0.469804f, -0.071068f, -0.059024f, -0.211937f, 0.392134f, -0.321490f, + 0.366060f, -0.427798f, 0.166771f, 0.299652f, 0.044660f, 0.205142f, + 0.039133f, -0.051835f, -0.465475f, 0.216976f, -0.341156f, 0.095358f, + 0.230807f, 0.201674f, 0.279266f, -0.713534f, -0.091690f, -0.569708f, + -0.119001f, 0.252160f, -1.544578f, -0.284477f, 0.555348f, 0.226471f, + 0.347690f, 0.034365f, 0.770835f, -0.241859f, -0.130241f, 0.292936f, + 0.396622f, -0.417916f, 0.492224f, 0.125517f, 0.344824f, 0.232172f, + -0.432106f, -0.278745f, 0.035069f, -0.307247f, -0.120760f, 0.170950f, + 0.433601f, 0.044286f, 0.141463f, -0.041382f, 0.529346f, 0.010868f, + -0.323674f, 0.185205f, 0.623459f, 0.232842f, -0.406693f, -0.142944f, + 0.222988f, 0.343634f, 0.065401f, 0.002621f, 0.805335f, -0.426926f, + 0.279181f, 0.131364f, 0.192339f, -0.402391f, 0.544120f, -0.060618f, + 0.467780f, 0.165224f, -0.373131f, 0.002427f, 0.688064f, 0.322317f, + 0.259713f, 0.130583f, 0.185032f, -0.189111f, -0.067821f, 0.010875f, + 0.644724f, -0.179291f, 0.463222f, 0.155230f, 0.721384f, -0.046019f, + 0.438501f, 0.440027f, -0.462090f, -0.002039f, -0.468026f, -0.008890f, + -0.328530f, 0.370102f, 0.482531f, 0.043471f, -0.469732f, -0.532663f, + 0.122081f, -0.379659f, 0.037219f, -0.519913f, -0.128975f, -0.404365f, +}; + +static const float av1_tx_split_nn_bias_8x32_layer0[24] = { + -1.198965f, 0.395204f, -0.408627f, -0.021654f, -0.658355f, 0.154525f, + -0.288354f, 1.207574f, 0.411608f, 0.964678f, -1.176893f, 1.059006f, + -0.472969f, 2.087975f, 1.065536f, 0.595569f, 0.197907f, -0.349938f, + 1.013651f, -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f, +}; + +static const float av1_tx_split_nn_weights_8x32_layer1[24] = { + 0.815787f, -0.393465f, -0.483427f, -0.565592f, 0.493494f, 0.430229f, + -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f, 0.649146f, + -0.487383f, 1.844503f, 0.480324f, -0.982705f, -0.501446f, -0.220584f, + 0.334299f, 0.802238f, 0.805838f, -0.487848f, 0.300772f, -1.232857f, +}; + +static const float av1_tx_split_nn_bias_8x32_layer1[1] = { + 0.13435879f, +}; + +static const NN_CONFIG av1_tx_split_nnconfig_8x32 = { + 8, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 24, + }, // num_hidden_nodes + { + av1_tx_split_nn_weights_8x32_layer0, + av1_tx_split_nn_weights_8x32_layer1, + }, + { + av1_tx_split_nn_bias_8x32_layer0, + av1_tx_split_nn_bias_8x32_layer1, + }, +}; +/******************************************************************************/ + +// Tx split model for 
16x64 block.
+static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = {
+  -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f,
+  -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f,
+  -0.454709f, -0.059461f, 0.210313f, -0.155683f, 0.192968f, -0.127804f,
+  0.471996f, 0.253377f, 0.472625f, 0.485322f, 0.150560f, 0.164868f,
+  -0.475587f, 0.447559f, -0.455759f, -0.306665f, -0.194866f, -0.283716f,
+  -0.243897f, 0.293020f, -0.308298f, -0.191904f, -0.468568f, 0.014053f,
+  -0.618848f, 0.096273f, -0.444586f, 0.347750f, -0.280643f, -0.062872f,
+  0.118661f, 0.540099f, 0.104141f, -0.279300f, -0.098721f, -0.173427f,
+  -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f,
+  -0.523103f, 0.093620f, -0.930396f, -0.431997f, -1.163297f, 0.190384f,
+  -0.422581f, -0.005354f, 0.450552f, 0.369210f, 0.562484f, 0.679922f,
+  0.282099f, -0.039075f, 0.404196f, 0.006371f, 0.069679f, -0.196160f,
+  -0.213675f, 0.275187f, -0.104235f, -0.193090f, 0.003116f, -0.252454f,
+  -0.094591f, 0.210439f, -0.137070f, 0.145043f, 0.024558f, 0.121718f,
+  0.010138f, 0.301651f, -0.377990f, 0.444414f, 0.001845f, -0.095334f,
+  0.550259f, 0.087603f, 0.792492f, -0.044584f, 0.641706f, -0.328458f,
+  -0.447791f, 0.135376f, 0.356385f, 0.135748f, 0.310370f, 0.293757f,
+  -0.062000f, -0.056368f, 0.343930f, 0.312039f, 0.370763f, 0.452381f,
+  -0.023630f, -0.185909f, 0.422277f, -0.006306f, 0.045166f, 0.423359f,
+  -0.157735f, -0.084901f, 0.219527f, -0.209510f, 0.575057f, 0.249276f,
+  0.069267f, 0.233898f, -0.229392f, 0.117197f, -0.038551f, 0.293976f,
+  0.101996f, 0.120878f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer0[16] = {
+  1.036995f, 0.160249f, 0.100264f, 0.694881f, 0.694677f, 0.128379f,
+  -0.843405f, -0.405515f, 0.104139f, 0.182980f, -0.025472f, 0.901067f,
+  -0.299866f, -0.103079f, -0.190352f, -0.048121f,
+};
+
+static const float av1_tx_split_nn_weights_16x64_layer1[16] = {
+  -1.778868f, 0.174690f, 0.211991f, 0.712138f, 0.589352f, 0.466652f,
+  1.029146f, -0.490044f, 0.483015f, 0.600215f, -0.577776f, -0.755546f,
+  0.348337f, -0.205082f, 0.347129f, -0.322277f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer1[1] = {
+  0.04230947f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x64 = {
+  8,  // num_inputs
+  1,  // num_outputs
+  1,  // num_hidden_layers
+  {
+      16,
+  },  // num_hidden_nodes
+  {
+      av1_tx_split_nn_weights_16x64_layer0,
+      av1_tx_split_nn_weights_16x64_layer1,
+  },
+  {
+      av1_tx_split_nn_bias_16x64_layer0,
+      av1_tx_split_nn_bias_16x64_layer1,
+  },
+};
+/******************************************************************************/
+
+// Map block size to its corresponding neural net model for tx split
+// prediction.
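+//
+// All of the models above share one shape: a single fully connected hidden
+// layer with a ReLU activation, followed by one linear output node. As a
+// minimal sketch of how such a config is evaluated (the encoder's actual
+// entry point is av1_nn_predict(); `in`, `hidden` and `out` are hypothetical
+// names used only for this illustration):
+//
+//   for (int i = 0; i < num_hidden_nodes; ++i) {
+//     float v = bias_layer0[i];
+//     for (int j = 0; j < num_inputs; ++j)
+//       v += weights_layer0[i * num_inputs + j] * in[j];
+//     hidden[i] = v > 0.0f ? v : 0.0f;  // ReLU on the hidden layer
+//   }
+//   float out = bias_layer1[0];
+//   for (int i = 0; i < num_hidden_nodes; ++i)
+//     out += weights_layer1[i] * hidden[i];
+//
+// The caller thresholds `out` to decide whether to split. Note that transpose
+// pairs (e.g. TX_4X8 and TX_8X4) share one model in the map below.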
+static const NN_CONFIG *av1_tx_split_nnconfig_map[TX_SIZES_ALL] = { + NULL, // TX_4X4, + &av1_tx_split_nnconfig_8x8, // TX_8X8, + &av1_tx_split_nnconfig_16x16, // TX_16X16, + &av1_tx_split_nnconfig_32x32, // TX_32X32, + &av1_tx_split_nnconfig_64x64, // TX_64X64, + &av1_tx_split_nnconfig_4x8, // TX_4X8, + &av1_tx_split_nnconfig_4x8, // TX_8X4, + &av1_tx_split_nnconfig_8x16, // TX_8X16, + &av1_tx_split_nnconfig_8x16, // TX_16X8, + &av1_tx_split_nnconfig_16x32, // TX_16X32, + &av1_tx_split_nnconfig_16x32, // TX_32X16, + &av1_tx_split_nnconfig_32x64, // TX_32X64, + &av1_tx_split_nnconfig_32x64, // TX_64X32, + &av1_tx_split_nnconfig_4x16, // TX_4X16, + &av1_tx_split_nnconfig_4x16, // TX_16X4, + &av1_tx_split_nnconfig_8x32, // TX_8X32, + &av1_tx_split_nnconfig_8x32, // TX_32X8, + &av1_tx_split_nnconfig_16x64, // TX_16X64, + &av1_tx_split_nnconfig_16x64, // TX_64X16, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_ diff --git a/libs/libaom/src/av1/encoder/tx_search.c b/libs/libaom/src/av1/encoder/tx_search.c new file mode 100644 index 000000000..65b9a2472 --- /dev/null +++ b/libs/libaom/src/av1/encoder/tx_search.c @@ -0,0 +1,3602 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/cfl.h" +#include "av1/common/reconintra.h" +#include "av1/encoder/encodetxb.h" +#include "av1/encoder/hybrid_fwd_txfm.h" +#include "av1/common/idct.h" +#include "av1/encoder/model_rd.h" +#include "av1/encoder/random.h" +#include "av1/encoder/rdopt_utils.h" +#include "av1/encoder/tx_prune_model_weights.h" +#include "av1/encoder/tx_search.h" + +struct rdcost_block_args { + const AV1_COMP *cpi; + MACROBLOCK *x; + ENTROPY_CONTEXT t_above[MAX_MIB_SIZE]; + ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]; + RD_STATS rd_stats; + int64_t current_rd; + int64_t best_rd; + int exit_early; + int incomplete_exit; + int use_fast_coef_costing; + FAST_TX_SEARCH_MODE ftxs_mode; + int skip_trellis; +}; + +typedef struct { + int64_t rd; + int txb_entropy_ctx; + TX_TYPE tx_type; +} TxCandidateInfo; + +typedef struct { + int leaf; + int8_t children[4]; +} RD_RECORD_IDX_NODE; + +// origin_threshold * 128 / 100 +static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = { + { + 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68, + }, + { + 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68, + }, + { + 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74, + }, +}; + +// lookup table for predict_skip_flag +// int max_tx_size = max_txsize_rect_lookup[bsize]; +// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16) +// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16); +static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = { + TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4, + TX_8X8, TX_8X8, TX_16X16, TX_16X16, +}; + +static int 
find_tx_size_rd_info(TXB_RD_RECORD *cur_record, + const uint32_t hash) { + // Linear search through the circular buffer to find matching hash. + for (int i = cur_record->index_start - 1; i >= 0; i--) { + if (cur_record->hash_vals[i] == hash) return i; + } + for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) { + if (cur_record->hash_vals[i] == hash) return i; + } + int index; + // If not found - add new RD info into the buffer and return its index + if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) { + index = (cur_record->index_start + cur_record->num) % + TX_SIZE_RD_RECORD_BUFFER_LEN; + cur_record->num++; + } else { + index = cur_record->index_start; + cur_record->index_start = + (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN; + } + + cur_record->hash_vals[index] = hash; + av1_zero(cur_record->tx_rd_info[index]); + return index; +} + +static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = { + { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = { + { 0, { 1, 2, -1, -1 } }, + { 1, { 0, 0, 0, 0 } }, + { 1, { 0, 0, 0, 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = { + { 0, { 1, 2, -1, -1 } }, + { 1, { 0 } }, + { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = { + { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, 5, 6 } }, + { 0, { 7, 8, 9, 10 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, 7, 8 } }, + { 0, { 5, 6, 9, 10 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = { + { 0, { 1, 2, 3, 4 } }, { 0, { 5, 6, 9, 10 } }, { 0, { 7, 8, 11, 12 } }, + { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = { + { 0, { 2, 3, 4, 5 } }, { 0, { 6, 7, 8, 9 } }, + { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } }, + { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } }, + { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } }, + { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = { + { 0, { 2, 3, 6, 7 } }, { 0, { 4, 5, 8, 9 } }, + { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } }, + { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } }, + { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } }, + { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = { + { 0, { 4, 5, 8, 9 } }, { 0, { 6, 7, 10, 11 } }, + { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } }, + { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } }, + { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } }, + { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } }, + { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } }, + { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } }, + { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } }, + { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } }, + { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = { + { 0, { 1, -1, 2, -1 } }, + { 0, { 3, 4, -1, -1 } }, + { 0, { 5, 6, -1, -1 } }, +}; + +static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = { + { 0, { 1, 2, -1, -1 } }, + { 0, { 3, 4, -1, -1 } }, + { 0, { 5, 6, -1, -1 } }, +}; + +static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = { + NULL, // BLOCK_4X4 + NULL, // BLOCK_4X8 
+ NULL, // BLOCK_8X4 + rd_record_tree_8x8, // BLOCK_8X8 + rd_record_tree_8x16, // BLOCK_8X16 + rd_record_tree_16x8, // BLOCK_16X8 + rd_record_tree_16x16, // BLOCK_16X16 + rd_record_tree_1_2, // BLOCK_16X32 + rd_record_tree_2_1, // BLOCK_32X16 + rd_record_tree_sqr, // BLOCK_32X32 + rd_record_tree_1_2, // BLOCK_32X64 + rd_record_tree_2_1, // BLOCK_64X32 + rd_record_tree_sqr, // BLOCK_64X64 + rd_record_tree_64x128, // BLOCK_64X128 + rd_record_tree_128x64, // BLOCK_128X64 + rd_record_tree_128x128, // BLOCK_128X128 + NULL, // BLOCK_4X16 + NULL, // BLOCK_16X4 + rd_record_tree_1_4, // BLOCK_8X32 + rd_record_tree_4_1, // BLOCK_32X8 + rd_record_tree_1_4, // BLOCK_16X64 + rd_record_tree_4_1, // BLOCK_64X16 +}; + +static const int rd_record_tree_size[BLOCK_SIZES_ALL] = { + 0, // BLOCK_4X4 + 0, // BLOCK_4X8 + 0, // BLOCK_8X4 + sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X8 + sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X16 + sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X8 + sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X16 + sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X32 + sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X16 + sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X32 + sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X64 + sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X32 + sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X64 + sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X128 + sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X64 + sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X128 + 0, // BLOCK_4X16 + 0, // BLOCK_16X4 + sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X32 + sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X8 + sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X64 + sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X16 +}; + +static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree, + BLOCK_SIZE bsize) { + const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize]; + const int size = rd_record_tree_size[bsize]; + for (int i = 0; i < size; ++i) { + if (rd_record[i].leaf) { + av1_zero(tree[i].children); + } else { + for (int j = 0; j < 4; ++j) { + const int8_t idx = rd_record[i].children[j]; + tree[i].children[j] = idx > 0 ? &tree[idx] : NULL; + } + } + } +} + +// Go through all TX blocks that could be used in TX size search, compute +// residual hash values for them and find matching RD info that stores previous +// RD search results for these TX blocks. The idea is to prevent repeated +// rate/distortion computations that happen because of the combination of +// partition and TX size search. The resulting RD info records are returned in +// the form of a quadtree for easier access in actual TX size search. 
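+// For example, a 32x32 block is hashed once as a single 32x32 transform at
+// depth 0, then as four 16x16 transforms at depth 1 and sixteen 8x8
+// transforms at depth 2; rectangular sub-transforms get dummy (NULL) entries
+// because residue hashing is only done for square TX sizes.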
+static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, + TXB_RD_INFO_NODE *dst_rd_info) { + TXB_RD_RECORD *rd_records_table[4] = { x->txb_rd_record_8X8, + x->txb_rd_record_16X16, + x->txb_rd_record_32X32, + x->txb_rd_record_64X64 }; + const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize]; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + + // Hashing is performed only for square TX sizes larger than TX_4X4 + if (max_square_tx_size < TX_8X8) return 0; + const int diff_stride = bw; + const struct macroblock_plane *const p = &x->plane[0]; + const int16_t *diff = &p->src_diff[0]; + init_rd_record_tree(dst_rd_info, bsize); + // Coordinates of the top-left corner of current block within the superblock + // measured in pixels: + const int mi_row = x->e_mbd.mi_row; + const int mi_col = x->e_mbd.mi_col; + const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2; + const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2; + int cur_rd_info_idx = 0; + int cur_tx_depth = 0; + TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize]; + while (cur_tx_depth <= MAX_VARTX_DEPTH) { + const int cur_tx_bw = tx_size_wide[cur_tx_size]; + const int cur_tx_bh = tx_size_high[cur_tx_size]; + if (cur_tx_bw < 8 || cur_tx_bh < 8) break; + const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size]; + const int tx_size_idx = cur_tx_size - TX_8X8; + for (int row = 0; row < bh; row += cur_tx_bh) { + for (int col = 0; col < bw; col += cur_tx_bw) { + if (cur_tx_bw != cur_tx_bh) { + // Use dummy nodes for all rectangular transforms within the + // TX size search tree. + dst_rd_info[cur_rd_info_idx].rd_info_array = NULL; + } else { + // Get spatial location of this TX block within the superblock + // (measured in cur_tx_bsize units). + const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh; + const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw; + + int16_t hash_data[MAX_SB_SQUARE]; + int16_t *cur_hash_row = hash_data; + const int16_t *cur_diff_row = diff + row * diff_stride + col; + for (int i = 0; i < cur_tx_bh; i++) { + memcpy(cur_hash_row, cur_diff_row, sizeof(*hash_data) * cur_tx_bw); + cur_hash_row += cur_tx_bw; + cur_diff_row += diff_stride; + } + const int hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, + (uint8_t *)hash_data, + 2 * cur_tx_bw * cur_tx_bh); + // Find corresponding RD info based on the hash value. 
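+          // Illustrative arithmetic (assuming 128x128 superblocks, i.e.
+          // MAX_MIB_SIZE == 32): for TX_8X8, tx_size_idx is 0, so each row
+          // of the record table holds MAX_MIB_SIZE >> 1 == 16 entries, one
+          // per 8-pixel column of the superblock.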
+ const int record_idx = + row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb; + TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx]; + int idx = find_tx_size_rd_info(records, hash); + dst_rd_info[cur_rd_info_idx].rd_info_array = + &records->tx_rd_info[idx]; + } + ++cur_rd_info_idx; + } + } + cur_tx_size = next_tx_size; + ++cur_tx_depth; + } + return 1; +} + +static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + const int16_t *diff = x->plane[0].src_diff; + const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, + (uint8_t *)diff, 2 * rows * cols); + return (hash << 5) + bsize; +} + +static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record, + const int64_t ref_best_rd, + const uint32_t hash) { + int32_t match_index = -1; + if (ref_best_rd != INT64_MAX) { + for (int i = 0; i < mb_rd_record->num; ++i) { + const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN; + // If there is a match in the tx_rd_record, fetch the RD decision and + // terminate early. + if (mb_rd_record->tx_rd_info[index].hash_value == hash) { + match_index = index; + break; + } + } + } + return match_index; +} + +static AOM_INLINE void fetch_tx_rd_info(int n4, + const MB_RD_INFO *const tx_rd_info, + RD_STATS *const rd_stats, + MACROBLOCK *const x) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + mbmi->tx_size = tx_rd_info->tx_size; + memcpy(x->blk_skip, tx_rd_info->blk_skip, + sizeof(tx_rd_info->blk_skip[0]) * n4); + av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size); + av1_copy_array(xd->tx_type_map, tx_rd_info->tx_type_map, n4); + *rd_stats = tx_rd_info->rd_stats; +} + +// Compute the pixel domain distortion from diff on all visible 4x4s in the +// transform block. +static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize, + unsigned int *block_mse_q8) { + int visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, + NULL, &visible_cols, &visible_rows); + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *diff = x->plane[plane].src_diff; + + diff += ((blk_row * diff_stride + blk_col) << MI_SIZE_LOG2); + uint64_t sse = + aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); + if (block_mse_q8 != NULL) { + if (visible_cols > 0 && visible_rows > 0) + *block_mse_q8 = + (unsigned int)((256 * sse) / (visible_cols * visible_rows)); + else + *block_mse_q8 = UINT_MAX; + } + return sse; +} + +// Uses simple features on top of DCT coefficients to quickly predict +// whether optimal RD decision is to skip encoding the residual. +// The sse value is stored in dist. +static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, + int reduced_tx_set) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const MACROBLOCKD *xd = &x->e_mbd; + const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); + + *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL); + + const int64_t mse = *dist / bw / bh; + // Normalized quantizer takes the transform upscaling factor (8 for tx size + // smaller than 32) into account. 
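+  // Worked example (illustrative values, not from the source): if dc_q were
+  // 1024, then normalized_dc_q = 1024 >> 3 = 128 and
+  // mse_thresh = 128 * 128 / 8 = 2048, i.e. a block whose mean squared error
+  // per pixel exceeds 2048 is predicted not to skip.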
+  const int16_t normalized_dc_q = dc_q >> 3;
+  const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
+  // For a faster early-skip decision, compare dist against the threshold so
+  // that the quality risk of the skip=1 decision is lower. Otherwise, use
+  // mse, since the fwd_txfm coefficient checks below will take care of
+  // quality.
+  // TODO(any): Use dist to return 0 when predict_skip_level is 1
+  int64_t pred_err = (x->predict_skip_level >= 2) ? *dist : mse;
+  // Predict not to skip when the error is larger than the threshold.
+  if (pred_err > mse_thresh) return 0;
+  // Otherwise, return skip for aggressive early termination.
+  else if (x->predict_skip_level >= 2)
+    return 1;
+
+  const int max_tx_size = max_predict_sf_tx_size[bsize];
+  const int tx_h = tx_size_high[max_tx_size];
+  const int tx_w = tx_size_wide[max_tx_size];
+  DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
+  TxfmParam param;
+  param.tx_type = DCT_DCT;
+  param.tx_size = max_tx_size;
+  param.bd = xd->bd;
+  param.is_hbd = is_cur_buf_hbd(xd);
+  param.lossless = 0;
+  param.tx_set_type = av1_get_ext_tx_set_type(
+      param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+  const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
+  const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
+  const int16_t *src_diff = x->plane[0].src_diff;
+  const int n_coeff = tx_w * tx_h;
+  const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+  const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
+  const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
+  for (int row = 0; row < bh; row += tx_h) {
+    for (int col = 0; col < bw; col += tx_w) {
+      av1_fwd_txfm(src_diff + col, coefs, bw, &param);
+      // Operating on TX domain, not pixels; we want the QTX quantizers.
+      const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
+      if (dc_coef >= dc_thresh) return 0;
+      for (int i = 1; i < n_coeff; ++i) {
+        const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
+        if (ac_coef >= ac_thresh) return 0;
+      }
+    }
+    src_diff += tx_h * bw;
+  }
+  return 1;
+}
+
+// Used to set proper context for early termination with skip = 1.
+static AOM_INLINE void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats,
+                                     int bsize, int64_t dist) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int n4 = bsize_to_num_blk(bsize);
+  const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+  memset(xd->tx_type_map, DCT_DCT, sizeof(xd->tx_type_map[0]) * n4);
+  memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
+  mbmi->tx_size = tx_size;
+  for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
+  rd_stats->skip = 1;
+  if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+  rd_stats->dist = rd_stats->sse = (dist << 4);
+  // Though the decision here is to mark the block as skip based on luma
+  // stats, the block may still become non-skip after the chroma RD search.
+  // In addition, the intermediate non-skip costs calculated by the caller
+  // would be incorrect if the rate were set to zero (i.e., if zero_blk_rate
+  // were not accounted for). Hence an intermediate rate is populated here to
+  // code the luma tx blocks as skip; the caller then sets the final rate
+  // according to the final RD decision (i.e., skip vs. non-skip). The rate
+  // populated here corresponds to coding all the tx blocks in the current
+  // block with zero_blk_rate (based on the largest possible tx size). E.g.,
+  // for a 128x128 block, the rate would be 4 * zero_blk_rate, where
+  // zero_blk_rate corresponds to coding one 64x64 tx block as 'all zeros'.
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl);
+  ENTROPY_CONTEXT *ta = ctxa;
+  ENTROPY_CONTEXT *tl = ctxl;
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  TXB_CTX txb_ctx;
+  get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx);
+  const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+                                .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+  rd_stats->rate = zero_blk_rate *
+                   (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) *
+                   (block_size_high[bsize] >> tx_size_high_log2[tx_size]);
+}
+
+static AOM_INLINE void save_tx_rd_info(int n4, uint32_t hash,
+                                       const MACROBLOCK *const x,
+                                       const RD_STATS *const rd_stats,
+                                       MB_RD_RECORD *tx_rd_record) {
+  int index;
+  if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
+    index =
+        (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
+    ++tx_rd_record->num;
+  } else {
+    index = tx_rd_record->index_start;
+    tx_rd_record->index_start =
+        (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+  }
+  MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index];
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  tx_rd_info->hash_value = hash;
+  tx_rd_info->tx_size = mbmi->tx_size;
+  memcpy(tx_rd_info->blk_skip, x->blk_skip,
+         sizeof(tx_rd_info->blk_skip[0]) * n4);
+  av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size);
+  av1_copy_array(tx_rd_info->tx_type_map, xd->tx_type_map, n4);
+  tx_rd_info->rd_stats = *rd_stats;
+}
+
+static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
+                                 const SPEED_FEATURES *sf,
+                                 int tx_size_search_method) {
+  if (tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH;
+
+  if (sf->tx_sf.tx_size_search_lgr_block) {
+    if (mi_width > mi_size_wide[BLOCK_64X64] ||
+        mi_height > mi_size_high[BLOCK_64X64])
+      return MAX_VARTX_DEPTH;
+  }
+
+  if (is_inter) {
+    return (mi_height != mi_width)
+               ? sf->tx_sf.inter_tx_size_search_init_depth_rect
+               : sf->tx_sf.inter_tx_size_search_init_depth_sqr;
+  } else {
+    return (mi_height != mi_width)
+               ?
sf->tx_sf.intra_tx_size_search_init_depth_rect + : sf->tx_sf.intra_tx_size_search_init_depth_sqr; + } +} + +static AOM_INLINE void select_tx_block( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd, + int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode, + TXB_RD_INFO_NODE *rd_info_node); + +// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values +// 0: Do not collect any RD stats +// 1: Collect RD stats for transform units +// 2: Collect RD stats for partition units +#if CONFIG_COLLECT_RD_STATS + +static AOM_INLINE void get_energy_distribution_fine( + const AV1_COMP *cpi, BLOCK_SIZE bsize, const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, int need_4th, double *hordist, + double *verdist) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) { + // Special cases: calculate 'esq' values manually, as we don't have 'vf' + // functions for the 16 (very small) sub-blocks of this block. + const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3; + const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3; + assert(bw <= 32); + assert(bh <= 32); + assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15); + if (cpi->common.seq_params.use_highbitdepth) { + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); + for (int i = 0; i < bh; ++i) + for (int j = 0; j < bw; ++j) { + const int index = (j >> w_shift) + ((i >> h_shift) << 2); + esq[index] += + (src16[j + i * src_stride] - dst16[j + i * dst_stride]) * + (src16[j + i * src_stride] - dst16[j + i * dst_stride]); + } + } else { + for (int i = 0; i < bh; ++i) + for (int j = 0; j < bw; ++j) { + const int index = (j >> w_shift) + ((i >> h_shift) << 2); + esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) * + (src[j + i * src_stride] - dst[j + i * dst_stride]); + } + } + } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks. + const int f_index = + (bsize < BLOCK_SIZES) ? 
bsize - BLOCK_16X16 : bsize - BLOCK_8X16; + assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL); + const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index; + assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]); + assert(block_size_high[bsize] == 4 * block_size_high[subsize]); + cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]); + cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + &esq[1]); + cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + &esq[2]); + cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[3]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]); + cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + &esq[5]); + cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + &esq[6]); + cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[7]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]); + cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + &esq[9]); + cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + &esq[10]); + cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[11]); + src += bh / 4 * src_stride; + dst += bh / 4 * dst_stride; + + cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]); + cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride, + &esq[13]); + cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride, + &esq[14]); + cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4, + dst_stride, &esq[15]); + } + + double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] + + esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] + + esq[12] + esq[13] + esq[14] + esq[15]; + if (total > 0) { + const double e_recip = 1.0 / total; + hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip; + hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip; + hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip; + if (need_4th) { + hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip; + } + verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip; + verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip; + verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip; + if (need_4th) { + verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip; + } + } else { + hordist[0] = verdist[0] = 0.25; + hordist[1] = verdist[1] = 0.25; + hordist[2] = verdist[2] = 0.25; + if (need_4th) { + hordist[3] = verdist[3] = 0.25; + } + } +} + +static double get_sse_norm(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int err = diff[j * stride + i]; + sum += err * err; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static double get_sad_norm(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += abs(diff[j * stride + i]); + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static AOM_INLINE void get_2x2_normalized_sses_and_sads( + const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const 
uint8_t *const src, + int src_stride, const uint8_t *const dst, int dst_stride, + const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr, + double *const sad_norm_arr) { + const BLOCK_SIZE tx_bsize_half = + get_partition_subsize(tx_bsize, PARTITION_SPLIT); + if (tx_bsize_half == BLOCK_INVALID) { // manually calculate stats + const int half_width = block_size_wide[tx_bsize] / 2; + const int half_height = block_size_high[tx_bsize] / 2; + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 2; ++col) { + const int16_t *const this_src_diff = + src_diff + row * half_height * diff_stride + col * half_width; + if (sse_norm_arr) { + sse_norm_arr[row * 2 + col] = + get_sse_norm(this_src_diff, diff_stride, half_width, half_height); + } + if (sad_norm_arr) { + sad_norm_arr[row * 2 + col] = + get_sad_norm(this_src_diff, diff_stride, half_width, half_height); + } + } + } + } else { // use function pointers to calculate stats + const int half_width = block_size_wide[tx_bsize_half]; + const int half_height = block_size_high[tx_bsize_half]; + const int num_samples_half = half_width * half_height; + for (int row = 0; row < 2; ++row) { + for (int col = 0; col < 2; ++col) { + const uint8_t *const this_src = + src + row * half_height * src_stride + col * half_width; + const uint8_t *const this_dst = + dst + row * half_height * dst_stride + col * half_width; + + if (sse_norm_arr) { + unsigned int this_sse; + cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst, + dst_stride, &this_sse); + sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half; + } + + if (sad_norm_arr) { + const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf( + this_src, src_stride, this_dst, dst_stride); + sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half; + } + } + } + } +} + +#if CONFIG_COLLECT_RD_STATS == 1 +static double get_mean(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += diff[j * stride + i]; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} +static AOM_INLINE void PrintTransformUnitStats( + const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats, + int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + TX_TYPE tx_type, int64_t rd) { + if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; + + // Generate small sample to restrict output size. + static unsigned int seed = 21743; + if (lcg_rand16(&seed) % 256 > 0) return; + + const char output_file[] = "tu_stats.txt"; + FILE *fout = fopen(output_file, "a"); + if (!fout) return; + + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const MACROBLOCKD *const xd = &x->e_mbd; + const int plane = 0; + struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int txw = tx_size_wide[tx_size]; + const int txh = tx_size_high[tx_size]; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; + const int q_step = p->dequant_QTX[1] >> dequant_shift; + const int num_samples = txw * txh; + + const double rate_norm = (double)rd_stats->rate / num_samples; + const double dist_norm = (double)rd_stats->dist / num_samples; + + fprintf(fout, "%g %g", rate_norm, dist_norm); + + const int src_stride = p->src.stride; + const uint8_t *const src = + &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2]; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = + &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + unsigned int sse; + cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + const double sse_norm = (double)sse / num_samples; + + const unsigned int sad = + cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride); + const double sad_norm = (double)sad / num_samples; + + fprintf(fout, " %g %g", sse_norm, sad_norm); + + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *const src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2]; + + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sse_norm_arr[i]); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sad_norm_arr[i]); + } + + const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type]; + const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type]; + + fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size], + tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col); + + int model_rate; + int64_t model_dist; + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples, + &model_rate, &model_dist); + const double model_rate_norm = (double)model_rate / num_samples; + const double model_dist_norm = (double)model_dist / num_samples; + fprintf(fout, " %g %g", model_rate_norm, model_dist_norm); + + const double mean = get_mean(src_diff, diff_stride, txw, txh); + float hor_corr, vert_corr; + av1_get_horver_correlation_full(src_diff, diff_stride, txw, txh, &hor_corr, + &vert_corr); + fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); + + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride, + 1, hdist, vdist); + fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], + hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + + fprintf(fout, " %d %" PRId64, x->rdmult, rd); + + fprintf(fout, "\n"); + fclose(fout); +} +#endif // CONFIG_COLLECT_RD_STATS == 1 + +#if CONFIG_COLLECT_RD_STATS >= 2 +static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { + const AV1_COMMON *cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const MACROBLOCKD *xd = &x->e_mbd; + const MB_MODE_INFO *mbmi = xd->mi[0]; + int64_t total_sse = 0; + for (int plane = 0; plane < num_planes; ++plane) { + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x, + pd->subsampling_y); + unsigned int sse; + + if (x->skip_chroma_rd && plane) continue; + + cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + &sse); + total_sse += sse; + } + total_sse <<= 4; + return total_sse; +} + +static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize, + int64_t sse, int *est_residue_cost, + int64_t 
*est_dist) { + aom_clear_system_state(); + const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; + if (md->ready) { + if (sse < md->dist_mean) { + *est_residue_cost = 0; + *est_dist = sse; + } else { + *est_dist = (int64_t)round(md->dist_mean); + const double est_ld = md->a * sse + md->b; + // Clamp estimated rate cost by INT_MAX / 2. + // TODO(angiebird@google.com): find better solution than clamping. + if (fabs(est_ld) < 1e-2) { + *est_residue_cost = INT_MAX / 2; + } else { + double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld); + if (est_residue_cost_dbl < 0) { + *est_residue_cost = 0; + } else { + *est_residue_cost = + (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2); + } + } + if (*est_residue_cost <= 0) { + *est_residue_cost = 0; + *est_dist = sse; + } + } + return 1; + } + return 0; +} + +static double get_highbd_diff_mean(const uint8_t *src8, int src_stride, + const uint8_t *dst8, int dst_stride, int w, + int h) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int diff = src[j * src_stride + i] - dst[j * dst_stride + i]; + sum += diff; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static double get_diff_mean(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int diff = src[j * src_stride + i] - dst[j * dst_stride + i]; + sum += diff; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static AOM_INLINE void PrintPredictionUnitStats(const AV1_COMP *const cpi, + const TileDataEnc *tile_data, + MACROBLOCK *x, + const RD_STATS *const rd_stats, + BLOCK_SIZE plane_bsize) { + if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; + + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1 && + (tile_data == NULL || + !tile_data->inter_mode_rd_models[plane_bsize].ready)) + return; + (void)tile_data; + // Generate small sample to restrict output size. + static unsigned int seed = 95014; + + if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) != + 1) + return; + + const char output_file[] = "pu_stats.txt"; + FILE *fout = fopen(output_file, "a"); + if (!fout) return; + + MACROBLOCKD *const xd = &x->e_mbd; + const int plane = 0; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *pd = &xd->plane[plane]; + const int diff_stride = block_size_wide[plane_bsize]; + int bw, bh; + get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, + &bh); + const int num_samples = bw * bh; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; + const int q_step = p->dequant_QTX[1] >> dequant_shift; + const int shift = (xd->bd - 8); + + const double rate_norm = (double)rd_stats->rate / num_samples; + const double dist_norm = (double)rd_stats->dist / num_samples; + const double rdcost_norm = + (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples; + + fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm); + + const int src_stride = p->src.stride; + const uint8_t *const src = p->src.buf; + const int dst_stride = pd->dst.stride; + const uint8_t *const dst = pd->dst.buf; + const int16_t *const src_diff = p->src_diff; + + int64_t sse = calculate_sse(xd, p, pd, bw, bh); + const double sse_norm = (double)sse / num_samples; + + const unsigned int sad = + cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride); + const double sad_norm = + (double)sad / (1 << num_pels_log2_lookup[plane_bsize]); + + fprintf(fout, " %g %g", sse_norm, sad_norm); + + double sse_norm_arr[4], sad_norm_arr[4]; + get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, + dst_stride, src_diff, diff_stride, + sse_norm_arr, sad_norm_arr); + if (shift) { + for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift)); + for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sse_norm_arr[i]); + } + for (int i = 0; i < 4; ++i) { + fprintf(fout, " %g", sad_norm_arr[i]); + } + + fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh); + + int model_rate; + int64_t model_dist; + model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples, + &model_rate, &model_dist); + const double model_rdcost_norm = + (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples; + const double model_rate_norm = (double)model_rate / num_samples; + const double model_dist_norm = (double)model_dist / num_samples; + fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm, + model_rdcost_norm); + + double mean; + if (is_cur_buf_hbd(xd)) { + mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + bw, bh); + } + mean /= (1 << shift); + float hor_corr, vert_corr; + av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr, + &vert_corr); + fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr); + + double hdist[4] = { 0 }, vdist[4] = { 0 }; + get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst, + dst_stride, 1, hdist, vdist); + fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], + hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + + if (cpi->sf.inter_sf.inter_mode_rd_model_estimation == 1) { + assert(tile_data->inter_mode_rd_models[plane_bsize].ready); + const int64_t overall_sse = get_sse(cpi, x); + int est_residue_cost = 0; + int64_t est_dist = 0; + get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost, + &est_dist); + const double est_residue_cost_norm = (double)est_residue_cost / num_samples; + const double est_dist_norm = (double)est_dist / num_samples; + const double est_rdcost_norm = + (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples; + fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm, + est_rdcost_norm); + } + + fprintf(fout, "\n"); + fclose(fout); +} +#endif // CONFIG_COLLECT_RD_STATS >= 2 +#endif // CONFIG_COLLECT_RD_STATS + +static AOM_INLINE void inverse_transform_block_facade(MACROBLOCKD *xd, + 
int plane, int block, + int blk_row, int blk_col, + int eob, + int reduced_tx_set) { + if (!eob) return; + + struct macroblockd_plane *const pd = &xd->plane[plane]; + tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block); + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE tx_size = av1_get_tx_size(plane, xd); + const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, + tx_size, reduced_tx_set); + const int dst_stride = pd->dst.stride; + uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2]; + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, + dst_stride, eob, reduced_tx_set); +} + +static INLINE void recon_intra(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, int skip_trellis, + TX_TYPE best_tx_type, int do_quant, + int *rate_cost, uint16_t best_eob) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + if (!is_inter && best_eob && + (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] || + blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) { + // if the quantized coefficients are stored in the dqcoeff buffer, we don't + // need to do transform and quantization again. + if (do_quant) { + TxfmParam txfm_param_intra; + QUANT_PARAM quant_param_intra; + av1_setup_xform(cm, x, tx_size, best_tx_type, &txfm_param_intra); + av1_setup_quant(tx_size, !skip_trellis, + skip_trellis + ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B + : AV1_XFORM_QUANT_FP) + : AV1_XFORM_QUANT_FP, + cpi->oxcf.quant_b_adapt, &quant_param_intra); + av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, best_tx_type, + &quant_param_intra); + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, + &txfm_param_intra, &quant_param_intra); + if (quant_param_intra.use_optimize_b) { + av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, + cpi->sf.rd_sf.trellis_eob_fast, rate_cost); + } + } + + inverse_transform_block_facade(xd, plane, block, blk_row, blk_col, + x->plane[plane].eobs[block], + cm->features.reduced_tx_set_used); + + // This may happen because of hash collision. The eob stored in the hash + // table is non-zero, but the real eob is zero. We need to make sure tx_type + // is DCT_DCT in this case. 
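+  // (Sketch of the collision case: two different residue blocks can map to
+  // the same CRC32C key, so the fetched eob belongs to another block; when
+  // the true eob is 0, the stored tx_type is reset to DCT_DCT below via
+  // update_txk_array() so the tx_type map stays consistent with an all-zero
+  // block.)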
+ if (plane == 0 && x->plane[plane].eobs[block] == 0 && + best_tx_type != DCT_DCT) { + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } + } +} + +static unsigned pixel_dist_visible_only( + const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src, + const int src_stride, const uint8_t *dst, const int dst_stride, + const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows, + int visible_cols) { + unsigned sse; + + if (txb_rows == visible_rows && txb_cols == visible_cols) { + cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse); + return sse; + } + +#if CONFIG_AV1_HIGHBITDEPTH + const MACROBLOCKD *xd = &x->e_mbd; + if (is_cur_buf_hbd(xd)) { + uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride, + visible_cols, visible_rows); + return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2); + } +#else + (void)x; +#endif + sse = aom_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols, + visible_rows); + return sse; +} + +// Compute the pixel domain distortion from src and dst on all visible 4x4s in +// the +// transform block. +static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, + int plane, const uint8_t *src, const int src_stride, + const uint8_t *dst, const int dst_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { + int txb_rows, txb_cols, visible_rows, visible_cols; + const MACROBLOCKD *xd = &x->e_mbd; + + get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, + &txb_cols, &txb_rows, &visible_cols, &visible_rows); + assert(visible_rows > 0); + assert(visible_cols > 0); + + unsigned sse = pixel_dist_visible_only(cpi, x, src, src_stride, dst, + dst_stride, tx_bsize, txb_rows, + txb_cols, visible_rows, visible_cols); + + return sse; +} + +static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, + int plane, BLOCK_SIZE plane_bsize, + int block, int blk_row, int blk_col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const uint16_t eob = p->eobs[block]; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const int bsw = block_size_wide[tx_bsize]; + const int bsh = block_size_high[tx_bsize]; + const int src_stride = x->plane[plane].src.stride; + const int dst_stride = xd->plane[plane].dst.stride; + // Scale the transform block index to pixel unit. 
+ const int src_idx = (blk_row * src_stride + blk_col) << MI_SIZE_LOG2; + const int dst_idx = (blk_row * dst_stride + blk_col) << MI_SIZE_LOG2; + const uint8_t *src = &x->plane[plane].src.buf[src_idx]; + const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx]; + const tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block); + + assert(cpi != NULL); + assert(tx_size_wide_log2[0] == tx_size_high_log2[0]); + + uint8_t *recon; + DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); + +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) { + recon = CONVERT_TO_BYTEPTR(recon16); + av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride, + CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, + bsh, NULL, NULL, 0, 0, NULL, xd->bd); + } else { + recon = (uint8_t *)recon16; + av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL, + NULL, 0, 0, NULL); + } +#else + recon = (uint8_t *)recon16; + av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL, + NULL, 0, 0, NULL); +#endif + + const PLANE_TYPE plane_type = get_plane_type(plane); + TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size, + cpi->common.features.reduced_tx_set_used); + av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon, + MAX_TX_SIZE, eob, + cpi->common.features.reduced_tx_set_used); + + return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE, + blk_row, blk_col, plane_bsize, tx_bsize); +} + +static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size) { + int16_t tmp_data[64 * 64]; + const int diff_stride = block_size_wide[plane_bsize]; + const int16_t *diff = x->plane[plane].src_diff; + const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col; + const int txb_w = tx_size_wide[tx_size]; + const int txb_h = tx_size_high[tx_size]; + uint8_t *hash_data = (uint8_t *)cur_diff_row; + if (txb_w != diff_stride) { + int16_t *cur_hash_row = tmp_data; + for (int i = 0; i < txb_h; i++) { + memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w); + cur_hash_row += txb_w; + cur_diff_row += diff_stride; + } + hash_data = (uint8_t *)tmp_data; + } + CRC32C *crc = &x->mb_rd_record.crc_calculator; + const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h); + return (hash << 5) + tx_size; +} + +// pruning thresholds for prune_txk_type and prune_txk_type_separ +static const int prune_factors[5] = { 200, 200, 120, 80, 40 }; // scale 1000 +static const int mul_factors[5] = { 80, 80, 70, 50, 30 }; // scale 100 + +static INLINE int is_intra_hash_match(const AV1_COMP *cpi, MACROBLOCK *x, + int plane, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, + TXB_RD_INFO **intra_txb_rd_info, + const int tx_type_map_idx, + uint16_t *cur_joint_ctx) { + MACROBLOCKD *xd = &x->e_mbd; + assert(cpi->sf.tx_sf.use_intra_txb_hash && + frame_is_intra_only(&cpi->common) && !is_inter_block(xd->mi[0]) && + plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size]); + const uint32_t intra_hash = + get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size); + const int intra_hash_idx = + find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash); + *intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx]; + *cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx; + if ((*intra_txb_rd_info)->entropy_context == *cur_joint_ctx && + 
x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) { + xd->tx_type_map[tx_type_map_idx] = (*intra_txb_rd_info)->tx_type; + const TX_TYPE ref_tx_type = + av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size, + cpi->common.features.reduced_tx_set_used); + return (ref_tx_type == (*intra_txb_rd_info)->tx_type); + } + return 0; +} + +// R-D costs are sorted in ascending order. +static INLINE void sort_rd(int64_t rds[], int txk[], int len) { + int i, j, k; + + for (i = 1; i <= len - 1; ++i) { + for (j = 0; j < i; ++j) { + if (rds[j] > rds[i]) { + int64_t temprd; + int tempi; + + temprd = rds[i]; + tempi = txk[i]; + + for (k = i; k > j; k--) { + rds[k] = rds[k - 1]; + txk[k] = txk[k - 1]; + } + + rds[j] = temprd; + txk[j] = tempi; + break; + } + } + } +} + +static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, int64_t *out_dist, + int64_t *out_sse) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + // Transform domain distortion computation is more efficient as it does + // not involve an inverse transform, but it is less accurate. + const int buffer_length = av1_get_max_eob(tx_size); + int64_t this_sse; + // TX-domain results need to shift down to Q2/D10 to match pixel + // domain distortion values which are in Q2^2 + int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2; + const int block_offset = BLOCK_OFFSET(block); + tran_low_t *const coeff = p->coeff + block_offset; + tran_low_t *const dqcoeff = pd->dqcoeff + block_offset; +#if CONFIG_AV1_HIGHBITDEPTH + if (is_cur_buf_hbd(xd)) + *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, + xd->bd); + else + *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse); +#else + *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse); +#endif + *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift); + *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift); +} + +uint16_t prune_txk_type_separ(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, int *txk_map, + int16_t allowed_tx_mask, int prune_factor, + const TXB_CTX *const txb_ctx, + int reduced_tx_set_used, int64_t ref_best_rd, + int num_sel) { + const AV1_COMMON *cm = &cpi->common; + + int idx; + + int64_t rds_v[4]; + int64_t rds_h[4]; + int idx_v[4] = { 0, 1, 2, 3 }; + int idx_h[4] = { 0, 1, 2, 3 }; + int skip_v[4] = { 0 }; + int skip_h[4] = { 0 }; + const int idx_map[16] = { + DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, + ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, + FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, + H_DCT, H_ADST, H_FLIPADST, IDTX + }; + + const int sel_pattern_v[16] = { + 0, 0, 1, 1, 0, 2, 1, 2, 2, 0, 3, 1, 3, 2, 3, 3 + }; + const int sel_pattern_h[16] = { + 0, 1, 0, 1, 2, 0, 2, 1, 2, 3, 0, 3, 1, 3, 2, 3 + }; + + QUANT_PARAM quant_param; + TxfmParam txfm_param; + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt, + &quant_param); + int tx_type; + // to ensure we can try ones even outside of ext_tx_set of current block + // this function should only be called for size < 16 + assert(txsize_sqr_up_map[tx_size] <= TX_16X16); + txfm_param.tx_set_type = EXT_TX_SET_ALL16; + + int rate_cost = 0; + int64_t dist = 0, sse = 0; + // evaluate horizontal with vertical DCT + for (idx = 0; idx < 4; ++idx) { + tx_type = 
idx_map[idx]; + txfm_param.tx_type = tx_type; + + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse); + + rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used, 0); + + rds_h[idx] = RDCOST(x->rdmult, rate_cost, dist); + + if ((rds_h[idx] - (rds_h[idx] >> 2)) > ref_best_rd) { + skip_h[idx] = 1; + } + } + sort_rd(rds_h, idx_h, 4); + for (idx = 1; idx < 4; idx++) { + if (rds_h[idx] > rds_h[0] * 1.2) skip_h[idx_h[idx]] = 1; + } + + if (skip_h[idx_h[0]]) return (uint16_t)0xFFFF; + + // evaluate vertical with the best horizontal chosen + rds_v[0] = rds_h[0]; + int start_v = 1, end_v = 4; + const int *idx_map_v = idx_map + idx_h[0]; + + for (idx = start_v; idx < end_v; ++idx) { + tx_type = idx_map_v[idx_v[idx] * 4]; + txfm_param.tx_type = tx_type; + + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + + dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse); + + rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used, 0); + + rds_v[idx] = RDCOST(x->rdmult, rate_cost, dist); + + if ((rds_v[idx] - (rds_v[idx] >> 2)) > ref_best_rd) { + skip_v[idx] = 1; + } + } + sort_rd(rds_v, idx_v, 4); + for (idx = 1; idx < 4; idx++) { + if (rds_v[idx] > rds_v[0] * 1.2) skip_v[idx_v[idx]] = 1; + } + + // combine rd_h and rd_v to prune tx candidates + int i_v, i_h; + int64_t rds[16]; + int num_cand = 0, last = TX_TYPES - 1; + + for (int i = 0; i < 16; i++) { + i_v = sel_pattern_v[i]; + i_h = sel_pattern_h[i]; + tx_type = idx_map[idx_v[i_v] * 4 + idx_h[i_h]]; + if (!(allowed_tx_mask & (1 << tx_type)) || skip_h[idx_h[i_h]] || + skip_v[idx_v[i_v]]) { + txk_map[last] = tx_type; + last--; + } else { + txk_map[num_cand] = tx_type; + rds[num_cand] = rds_v[i_v] + rds_h[i_h]; + if (rds[num_cand] == 0) rds[num_cand] = 1; + num_cand++; + } + } + sort_rd(rds, txk_map, num_cand); + + uint16_t prune = (uint16_t)(~(1 << txk_map[0])); + num_sel = AOMMIN(num_sel, num_cand); + + for (int i = 1; i < num_sel; i++) { + int64_t factor = 1800 * (rds[i] - rds[0]) / (rds[0]); + if (factor < (int64_t)prune_factor) + prune &= ~(1 << txk_map[i]); + else + break; + } + return prune; +} + +uint16_t prune_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, TX_SIZE tx_size, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, int *txk_map, + uint16_t allowed_tx_mask, int prune_factor, + const TXB_CTX *const txb_ctx, int reduced_tx_set_used) { + const AV1_COMMON *cm = &cpi->common; + int tx_type; + + int64_t rds[TX_TYPES]; + + int num_cand = 0; + int last = TX_TYPES - 1; + + TxfmParam txfm_param; + QUANT_PARAM quant_param; + av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param); + av1_setup_quant(tx_size, 1, AV1_XFORM_QUANT_B, cpi->oxcf.quant_b_adapt, + &quant_param); + + for (int idx = 0; idx < TX_TYPES; idx++) { + tx_type = idx; + int rate_cost = 0; + int64_t dist = 0, sse = 0; + if (!(allowed_tx_mask & (1 << tx_type))) { + txk_map[last] = tx_type; + last--; + continue; + } + txfm_param.tx_type = tx_type; + + // do txfm and quantization + av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param, + &quant_param); + // estimate rate cost + rate_cost = av1_cost_coeffs_txb_laplacian(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used, 0); + // tx domain dist + dist_block_tx_domain(x, plane, block, tx_size, &dist, &sse); + + txk_map[num_cand] = 
tx_type;
+    rds[num_cand] = RDCOST(x->rdmult, rate_cost, dist);
+    if (rds[num_cand] == 0) rds[num_cand] = 1;
+    num_cand++;
+  }
+
+  if (num_cand == 0) return (uint16_t)0xFFFF;
+
+  sort_rd(rds, txk_map, num_cand);
+  uint16_t prune = (uint16_t)(~(1 << txk_map[0]));
+
+  // 0 < prune_factor <= 1000 controls aggressiveness
+  int64_t factor = 0;
+  for (int idx = 1; idx < num_cand; idx++) {
+    factor = 1000 * (rds[idx] - rds[0]) / rds[0];
+    if (factor < (int64_t)prune_factor)
+      prune &= ~(1 << txk_map[idx]);
+    else
+      break;
+  }
+  return prune;
+}
+
+// These thresholds were calibrated to provide a certain number of TX types
+// pruned by the model on average, i.e., selecting a threshold with index i
+// will lead to pruning i+1 TX types on average.
+static const float *prune_2D_adaptive_thresholds[] = {
+  // TX_4X4
+  (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
+             0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
+             0.09778f, 0.11780f },
+  // TX_8X8
+  (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
+             0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
+             0.10803f, 0.14124f },
+  // TX_16X16
+  (float[]){ 0.01404f, 0.02000f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
+             0.06897f, 0.07629f, 0.08875f, 0.11169f },
+  // TX_32X32
+  NULL,
+  // TX_64X64
+  NULL,
+  // TX_4X8
+  (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
+             0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
+             0.10168f, 0.12585f },
+  // TX_8X4
+  (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
+             0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
+             0.10583f, 0.13123f },
+  // TX_8X16
+  (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
+             0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
+             0.10730f, 0.14221f },
+  // TX_16X8
+  (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
+             0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
+             0.10339f, 0.13464f },
+  // TX_16X32
+  NULL,
+  // TX_32X16
+  NULL,
+  // TX_32X64
+  NULL,
+  // TX_64X32
+  NULL,
+  // TX_4X16
+  (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
+             0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
+             0.10242f, 0.12878f },
+  // TX_16X4
+  (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
+             0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
+             0.10217f, 0.12610f },
+  // TX_8X32
+  NULL,
+  // TX_32X8
+  NULL,
+  // TX_16X64
+  NULL,
+  // TX_64X16
+  NULL,
+};
+
+// Probabilities are sorted in descending order.
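+// (A simple insertion sort over at most TX_TYPES == 16 entries; e.g. prob
+// { 0.1f, 0.5f, 0.2f } with txk { 0, 1, 2 } comes out as { 0.5f, 0.2f, 0.1f }
+// and { 1, 2, 0 }.)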
+static INLINE void sort_probability(float prob[], int txk[], int len) { + int i, j, k; + + for (i = 1; i <= len - 1; ++i) { + for (j = 0; j < i; ++j) { + if (prob[j] < prob[i]) { + float temp; + int tempi; + + temp = prob[i]; + tempi = txk[i]; + + for (k = i; k > j; k--) { + prob[k] = prob[k - 1]; + txk[k] = txk[k - 1]; + } + + prob[j] = temp; + txk[j] = tempi; + break; + } + } + } +} + +static INLINE float get_adaptive_thresholds(TX_SIZE tx_size, + TxSetType tx_set_type, + TX_TYPE_PRUNE_MODE prune_mode) { + const int prune_aggr_table[4][2] = { { 4, 1 }, { 6, 3 }, { 9, 6 }, { 9, 6 } }; + int pruning_aggressiveness = 0; + if (tx_set_type == EXT_TX_SET_ALL16) + pruning_aggressiveness = + prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][0]; + else if (tx_set_type == EXT_TX_SET_DTT9_IDTX_1DDCT) + pruning_aggressiveness = + prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][1]; + + return prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness]; +} + +static AOM_INLINE void get_energy_distribution_finer(const int16_t *diff, + int stride, int bw, int bh, + float *hordist, + float *verdist) { + // First compute downscaled block energy values (esq); downscale factors + // are defined by w_shift and h_shift. + unsigned int esq[256]; + const int w_shift = bw <= 8 ? 0 : 1; + const int h_shift = bh <= 8 ? 0 : 1; + const int esq_w = bw >> w_shift; + const int esq_h = bh >> h_shift; + const int esq_sz = esq_w * esq_h; + int i, j; + memset(esq, 0, esq_sz * sizeof(esq[0])); + if (w_shift) { + for (i = 0; i < bh; i++) { + unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; + const int16_t *cur_diff_row = diff + i * stride; + for (j = 0; j < bw; j += 2) { + cur_esq_row[j >> 1] += (cur_diff_row[j] * cur_diff_row[j] + + cur_diff_row[j + 1] * cur_diff_row[j + 1]); + } + } + } else { + for (i = 0; i < bh; i++) { + unsigned int *cur_esq_row = esq + (i >> h_shift) * esq_w; + const int16_t *cur_diff_row = diff + i * stride; + for (j = 0; j < bw; j++) { + cur_esq_row[j] += cur_diff_row[j] * cur_diff_row[j]; + } + } + } + + uint64_t total = 0; + for (i = 0; i < esq_sz; i++) total += esq[i]; + + // Output hordist and verdist arrays are normalized 1D projections of esq + if (total == 0) { + float hor_val = 1.0f / esq_w; + for (j = 0; j < esq_w - 1; j++) hordist[j] = hor_val; + float ver_val = 1.0f / esq_h; + for (i = 0; i < esq_h - 1; i++) verdist[i] = ver_val; + return; + } + + const float e_recip = 1.0f / (float)total; + memset(hordist, 0, (esq_w - 1) * sizeof(hordist[0])); + memset(verdist, 0, (esq_h - 1) * sizeof(verdist[0])); + const unsigned int *cur_esq_row; + for (i = 0; i < esq_h - 1; i++) { + cur_esq_row = esq + i * esq_w; + for (j = 0; j < esq_w - 1; j++) { + hordist[j] += (float)cur_esq_row[j]; + verdist[i] += (float)cur_esq_row[j]; + } + verdist[i] += (float)cur_esq_row[j]; + } + cur_esq_row = esq + i * esq_w; + for (j = 0; j < esq_w - 1; j++) hordist[j] += (float)cur_esq_row[j]; + + for (j = 0; j < esq_w - 1; j++) hordist[j] *= e_recip; + for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip; +} + +static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size, + int blk_row, int blk_col, TxSetType tx_set_type, + TX_TYPE_PRUNE_MODE prune_mode, int *txk_map, + uint16_t *allowed_tx_mask) { + int tx_type_table_2D[16] = { + DCT_DCT, DCT_ADST, DCT_FLIPADST, V_DCT, + ADST_DCT, ADST_ADST, ADST_FLIPADST, V_ADST, + FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST, + H_DCT, H_ADST, H_FLIPADST, IDTX + }; + if (tx_set_type != EXT_TX_SET_ALL16 && + tx_set_type != 
EXT_TX_SET_DTT9_IDTX_1DDCT) + return; +#if CONFIG_NN_V2 + NN_CONFIG_V2 *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; + NN_CONFIG_V2 *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; +#else + const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size]; + const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size]; +#endif + if (!nn_config_hor || !nn_config_ver) return; // Model not established yet. + + aom_clear_system_state(); + float hfeatures[16], vfeatures[16]; + float hscores[4], vscores[4]; + float scores_2D_raw[16]; + float scores_2D[16]; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + const int hfeatures_num = bw <= 8 ? bw : bw / 2; + const int vfeatures_num = bh <= 8 ? bh : bh / 2; + assert(hfeatures_num <= 16); + assert(vfeatures_num <= 16); + + const struct macroblock_plane *const p = &x->plane[0]; + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures, + vfeatures); + av1_get_horver_correlation_full(diff, diff_stride, bw, bh, + &hfeatures[hfeatures_num - 1], + &vfeatures[vfeatures_num - 1]); + aom_clear_system_state(); +#if CONFIG_NN_V2 + av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores); + av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores); +#else + av1_nn_predict(hfeatures, nn_config_hor, 1, hscores); + av1_nn_predict(vfeatures, nn_config_ver, 1, vscores); +#endif + aom_clear_system_state(); + + for (int i = 0; i < 4; i++) { + float *cur_scores_2D = scores_2D_raw + i * 4; + cur_scores_2D[0] = vscores[i] * hscores[0]; + cur_scores_2D[1] = vscores[i] * hscores[1]; + cur_scores_2D[2] = vscores[i] * hscores[2]; + cur_scores_2D[3] = vscores[i] * hscores[3]; + } + + av1_nn_softmax(scores_2D_raw, scores_2D, 16); + + const float score_thresh = + get_adaptive_thresholds(tx_size, tx_set_type, prune_mode); + + // Always keep the TX type with the highest score, prune all others with + // score below score_thresh. 
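+  // (Illustration with made-up numbers: given score_thresh == 0.02 and
+  // softmax scores { 0.60f, 0.25f, 0.015f, ... }, the third type is pruned
+  // unless it happens to carry the maximum score among the allowed types.)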
+ int max_score_i = 0; + float max_score = 0.0f; + uint16_t allow_bitmask = 0; + float sum_score = 0.0; + // Calculate sum of allowed tx type score and Populate allow bit mask based + // on score_thresh and allowed_tx_mask + for (int tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) { + int allow_tx_type = *allowed_tx_mask & (1 << tx_type_table_2D[tx_idx]); + if (scores_2D[tx_idx] > max_score && allow_tx_type) { + max_score = scores_2D[tx_idx]; + max_score_i = tx_idx; + } + if (scores_2D[tx_idx] >= score_thresh && allow_tx_type) { + // Set allow mask based on score_thresh + allow_bitmask |= (1 << tx_type_table_2D[tx_idx]); + + // Accumulate score of allowed tx type + sum_score += scores_2D[tx_idx]; + } + } + if (!((allow_bitmask >> max_score_i) & 0x01)) { + // Set allow mask based on tx type with max score + allow_bitmask |= (1 << tx_type_table_2D[max_score_i]); + sum_score += scores_2D[max_score_i]; + } + // Sort tx type probability of all types + sort_probability(scores_2D, tx_type_table_2D, TX_TYPES); + + // Enable more pruning based on tx type probability and number of allowed tx + // types + if (prune_mode == PRUNE_2D_AGGRESSIVE) { + float temp_score = 0.0; + float score_ratio = 0.0; + int tx_idx, tx_count = 0; + const float inv_sum_score = 100 / sum_score; + // Get allowed tx types based on sorted probability score and tx count + for (tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) { + // Skip the tx type which has more than 30% of cumulative + // probability and allowed tx type count is more than 2 + if (score_ratio > 30.0 && tx_count >= 2) break; + + // Calculate cumulative probability of allowed tx types + if (allow_bitmask & (1 << tx_type_table_2D[tx_idx])) { + // Calculate cumulative probability + temp_score += scores_2D[tx_idx]; + + // Calculate percentage of cumulative probability of allowed tx type + score_ratio = temp_score * inv_sum_score; + tx_count++; + } + } + // Set remaining tx types as pruned + for (; tx_idx < TX_TYPES; tx_idx++) + allow_bitmask &= ~(1 << tx_type_table_2D[tx_idx]); + } + memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D)); + *allowed_tx_mask = allow_bitmask; +} + +static float get_dev(float mean, double x2_sum, int num) { + const float e_x2 = (float)(x2_sum / num); + const float diff = e_x2 - mean * mean; + const float dev = (diff > 0) ? sqrtf(diff) : 0; + return dev; +} + +// Feature used by the model to predict tx split: the mean and standard +// deviation values of the block and sub-blocks. +static AOM_INLINE void get_mean_dev_features(const int16_t *data, int stride, + int bw, int bh, float *feature) { + const int16_t *const data_ptr = &data[0]; + const int subh = (bh >= bw) ? (bh >> 1) : bh; + const int subw = (bw >= bh) ? (bw >> 1) : bw; + const int num = bw * bh; + const int sub_num = subw * subh; + int feature_idx = 2; + int total_x_sum = 0; + int64_t total_x2_sum = 0; + int blk_idx = 0; + double mean2_sum = 0.0f; + float dev_sum = 0.0f; + + for (int row = 0; row < bh; row += subh) { + for (int col = 0; col < bw; col += subw) { + int x_sum; + int64_t x2_sum; + // TODO(any): Write a SIMD version. Clear registers. 
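+      // aom_get_blk_sse_sum() computes the sub-block's residual sum (x_sum)
+      // and sum of squares (x2_sum); the mean and deviation features then
+      // follow as E[x] = x_sum / sub_num and sqrt(E[x^2] - E[x]^2) in
+      // get_dev().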
+ aom_get_blk_sse_sum(data_ptr + row * stride + col, stride, subw, subh, + &x_sum, &x2_sum); + total_x_sum += x_sum; + total_x2_sum += x2_sum; + + aom_clear_system_state(); + const float mean = (float)x_sum / sub_num; + const float dev = get_dev(mean, (double)x2_sum, sub_num); + feature[feature_idx++] = mean; + feature[feature_idx++] = dev; + mean2_sum += (double)(mean * mean); + dev_sum += dev; + blk_idx++; + } + } + + const float lvl0_mean = (float)total_x_sum / num; + feature[0] = lvl0_mean; + feature[1] = get_dev(lvl0_mean, (double)total_x2_sum, num); + + if (blk_idx > 1) { + // Deviation of means. + feature[feature_idx++] = get_dev(lvl0_mean, mean2_sum, blk_idx); + // Mean of deviations. + feature[feature_idx++] = dev_sum / blk_idx; + } +} + +static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row, + int blk_col, TX_SIZE tx_size) { + const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size]; + if (!nn_config) return -1; + + const int diff_stride = block_size_wide[bsize]; + const int16_t *diff = + x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col; + const int bw = tx_size_wide[tx_size]; + const int bh = tx_size_high[tx_size]; + aom_clear_system_state(); + + float features[64] = { 0.0f }; + get_mean_dev_features(diff, diff_stride, bw, bh, features); + + float score = 0.0f; + av1_nn_predict(features, nn_config, 1, &score); + aom_clear_system_state(); + + int int_score = (int)(score * 10000); + return clamp(int_score, -80000, 80000); +} + +static INLINE uint16_t +get_tx_mask(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, + int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode, + int64_t ref_best_rd, TX_TYPE *allowed_txk_types, int *txk_map) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY; + // if txk_allowed = TX_TYPES, >1 tx types are allowed, else, if txk_allowed < + // TX_TYPES, only that specific tx type is allowed. + TX_TYPE txk_allowed = TX_TYPES; + + if ((!is_inter && x->use_default_intra_tx_type) || + (is_inter && x->use_default_inter_tx_type)) { + txk_allowed = + get_default_tx_type(0, xd, tx_size, cpi->is_screen_content_type); + } else if (x->rd_model == LOW_TXFM_RD) { + if (plane == 0) txk_allowed = DCT_DCT; + } + + const TxSetType tx_set_type = av1_get_ext_tx_set_type( + tx_size, is_inter, cm->features.reduced_tx_set_used); + + TX_TYPE uv_tx_type = DCT_DCT; + if (plane) { + // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y + uv_tx_type = txk_allowed = + av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size, + cm->features.reduced_tx_set_used); + } + PREDICTION_MODE intra_dir = + mbmi->filter_intra_mode_info.use_filter_intra + ? fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode] + : mbmi->mode; + uint16_t ext_tx_used_flag = + cpi->sf.tx_sf.tx_type_search.use_reduced_intra_txset && + tx_set_type == EXT_TX_SET_DTT4_IDTX_1DDCT + ? 
av1_reduced_intra_tx_used_flag[intra_dir] + : av1_ext_tx_used_flag[tx_set_type]; + if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 || + ext_tx_used_flag == 0x0001 || + (is_inter && cpi->oxcf.use_inter_dct_only) || + (!is_inter && cpi->oxcf.use_intra_dct_only)) { + txk_allowed = DCT_DCT; + } + + if (cpi->oxcf.enable_flip_idtx == 0) ext_tx_used_flag &= DCT_ADST_TX_MASK; + + uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip. + if (txk_allowed < TX_TYPES) { + allowed_tx_mask = 1 << txk_allowed; + allowed_tx_mask &= ext_tx_used_flag; + } else if (fast_tx_search) { + allowed_tx_mask = 0x0c01; // V_DCT, H_DCT, DCT_DCT + allowed_tx_mask &= ext_tx_used_flag; + } else { + assert(plane == 0); + allowed_tx_mask = ext_tx_used_flag; + int num_allowed = 0; + const FRAME_UPDATE_TYPE update_type = get_frame_update_type(&cpi->gf_group); + const int *tx_type_probs = + cpi->frame_probs.tx_type_probs[update_type][tx_size]; + int i; + + if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) { + static const int thresh_arr[2][7] = { { 10, 15, 15, 10, 15, 15, 15 }, + { 10, 17, 17, 10, 17, 17, 17 } }; + const int thresh = + thresh_arr[cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats - 1] + [update_type]; + uint16_t prune = 0; + int max_prob = -1; + int max_idx = 0; + for (i = 0; i < TX_TYPES; i++) { + if (tx_type_probs[i] > max_prob && (allowed_tx_mask & (1 << i))) { + max_prob = tx_type_probs[i]; + max_idx = i; + } + if (tx_type_probs[i] < thresh) prune |= (1 << i); + } + if ((prune >> max_idx) & 0x01) prune &= ~(1 << max_idx); + allowed_tx_mask &= (~prune); + } + for (i = 0; i < TX_TYPES; i++) { + if (allowed_tx_mask & (1 << i)) num_allowed++; + } + assert(num_allowed > 0); + + if (num_allowed > 2 && cpi->sf.tx_sf.tx_type_search.prune_tx_type_est_rd) { + int pf = prune_factors[x->prune_mode]; + int mf = mul_factors[x->prune_mode]; + if (num_allowed <= 7) { + const uint16_t prune = + prune_txk_type(cpi, x, plane, block, tx_size, blk_row, blk_col, + plane_bsize, txk_map, allowed_tx_mask, pf, txb_ctx, + cm->features.reduced_tx_set_used); + allowed_tx_mask &= (~prune); + } else { + const int num_sel = (num_allowed * mf + 50) / 100; + const uint16_t prune = prune_txk_type_separ( + cpi, x, plane, block, tx_size, blk_row, blk_col, plane_bsize, + txk_map, allowed_tx_mask, pf, txb_ctx, + cm->features.reduced_tx_set_used, ref_best_rd, num_sel); + + allowed_tx_mask &= (~prune); + } + } else { + assert(num_allowed > 0); + int allowed_tx_count = (x->prune_mode == PRUNE_2D_AGGRESSIVE) ? 1 : 5; + // !fast_tx_search && txk_end != txk_start && plane == 0 + if (x->prune_mode >= PRUNE_2D_ACCURATE && is_inter && + num_allowed > allowed_tx_count) { + prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type, + x->prune_mode, txk_map, &allowed_tx_mask); + } + } + } + + // Need to have at least one transform type allowed. + if (allowed_tx_mask == 0) { + txk_allowed = (plane ? 
uv_tx_type : DCT_DCT); + allowed_tx_mask = (1 << txk_allowed); + } + + assert(IMPLIES(txk_allowed < TX_TYPES, allowed_tx_mask == 1 << txk_allowed)); + *allowed_txk_types = txk_allowed; + return allowed_tx_mask; +} + +#if CONFIG_RD_DEBUG +static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane, + TX_SIZE tx_size, int blk_row, + int blk_col, int txb_coeff_cost) { + (void)blk_row; + (void)blk_col; + (void)tx_size; + rd_stats->txb_coeff_cost[plane] += txb_coeff_cost; + + { + const int txb_h = tx_size_high_unit[tx_size]; + const int txb_w = tx_size_wide_unit[tx_size]; + int idx, idy; + for (idy = 0; idy < txb_h; ++idy) + for (idx = 0; idx < txb_w; ++idx) + rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0; + + rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost; + } + assert(blk_row < TXB_COEFF_COST_MAP_SIZE); + assert(blk_col < TXB_COEFF_COST_MAP_SIZE); +} +#endif + +static INLINE int cost_coeffs(MACROBLOCK *x, int plane, int block, + TX_SIZE tx_size, const TX_TYPE tx_type, + const TXB_CTX *const txb_ctx, + int use_fast_coef_costing, + int reduced_tx_set_used) { +#if TXCOEFF_COST_TIMER + struct aom_usec_timer timer; + aom_usec_timer_start(&timer); +#endif + (void)use_fast_coef_costing; + const int cost = av1_cost_coeffs_txb(x, plane, block, tx_size, tx_type, + txb_ctx, reduced_tx_set_used); +#if TXCOEFF_COST_TIMER + AV1_COMMON *tmp_cm = (AV1_COMMON *)&cpi->common; + aom_usec_timer_mark(&timer); + const int64_t elapsed_time = aom_usec_timer_elapsed(&timer); + tmp_cm->txcoeff_cost_timer += elapsed_time; + ++tmp_cm->txcoeff_cost_count; +#endif + return cost; +} + +// Search for the best transform type for a given transform block. +// This function can be used for both inter and intra, both luma and chroma. +static void search_tx_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, + int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + const TXB_CTX *const txb_ctx, + FAST_TX_SEARCH_MODE ftxs_mode, + int use_fast_coef_costing, int skip_trellis, + int64_t ref_best_rd, RD_STATS *best_rd_stats) { + const AV1_COMMON *cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + MB_MODE_INFO *mbmi = xd->mi[0]; + int64_t best_rd = INT64_MAX; + uint16_t best_eob = 0; + TX_TYPE best_tx_type = DCT_DCT; + int rate_cost = 0; + // The buffer used to swap dqcoeff in macroblockd_plane so we can keep dqcoeff + // of the best tx_type + DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]); + tran_low_t *orig_dqcoeff = pd->dqcoeff; + tran_low_t *best_dqcoeff = this_dqcoeff; + const int tx_type_map_idx = + plane ? 0 : blk_row * xd->tx_type_map_stride + blk_col; + av1_invalid_rd_stats(best_rd_stats); + + skip_trellis |= !is_trellis_used(cpi->optimize_seg_arr[xd->mi[0]->segment_id], + DRY_RUN_NORMAL); + + // Hashing based speed feature for intra block. If the hash of the residue + // is found in the hash table, use the previous RD search results stored in + // the table and terminate early. 
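+  // Roughly: when is_intra_hash_match() below finds a stored record, its
+  // rate/dist/sse/eob/tx_type are copied straight into best_rd_stats and the
+  // per-tx-type search loop further down is skipped entirely.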
+  TXB_RD_INFO *intra_txb_rd_info = NULL;
+  uint16_t cur_joint_ctx = 0;
+  const int is_inter = is_inter_block(mbmi);
+  const int use_intra_txb_hash =
+      cpi->sf.tx_sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
+      !is_inter && plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size];
+  if (use_intra_txb_hash) {
+    const int mi_row = xd->mi_row;
+    const int mi_col = xd->mi_col;
+    const int within_border =
+        mi_row >= xd->tile.mi_row_start &&
+        (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
+        mi_col >= xd->tile.mi_col_start &&
+        (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
+    if (within_border &&
+        is_intra_hash_match(cpi, x, plane, blk_row, blk_col, plane_bsize,
+                            tx_size, txb_ctx, &intra_txb_rd_info,
+                            tx_type_map_idx, &cur_joint_ctx)) {
+      best_rd_stats->rate = intra_txb_rd_info->rate;
+      best_rd_stats->dist = intra_txb_rd_info->dist;
+      best_rd_stats->sse = intra_txb_rd_info->sse;
+      best_rd_stats->skip = intra_txb_rd_info->eob == 0;
+      x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
+      x->plane[plane].txb_entropy_ctx[block] =
+          intra_txb_rd_info->txb_entropy_ctx;
+      best_eob = intra_txb_rd_info->eob;
+      best_tx_type = intra_txb_rd_info->tx_type;
+      skip_trellis |= !intra_txb_rd_info->perform_block_coeff_opt;
+      update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
+      recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                  txb_ctx, skip_trellis, best_tx_type, 1, &rate_cost, best_eob);
+      pd->dqcoeff = orig_dqcoeff;
+      return;
+    }
+  }
+
+  uint8_t best_txb_ctx = 0;
+  // txk_allowed == TX_TYPES: more than one tx type is allowed.
+  // txk_allowed < TX_TYPES: only that specific tx type is allowed.
+  TX_TYPE txk_allowed = TX_TYPES;
+  int txk_map[TX_TYPES] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+  };
+  // Bit mask to indicate which transform types are allowed in the RD search.
+  const uint16_t allowed_tx_mask =
+      get_tx_mask(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                  txb_ctx, ftxs_mode, ref_best_rd, &txk_allowed, txk_map);
+
+  unsigned int block_mse_q8;
+  int64_t block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
+                                      txsize_to_bsize[tx_size], &block_mse_q8);
+  assert(block_mse_q8 != UINT_MAX);
+  if (is_cur_buf_hbd(xd)) {
+    block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+    block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
+  }
+  block_sse *= 16;
+  const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
+  const int qstep = x->plane[plane].dequant_QTX[1] >> dequant_shift;
+  // Use an mse / qstep^2 based threshold to decide whether to run R-D
+  // optimization of coeffs. For smaller residuals, coeff optimization
+  // tends to help; for larger residuals, it may not be effective.
+  // TODO(any): Experiment with variance and mean based thresholds
+  const int perform_block_coeff_opt =
+      ((uint64_t)block_mse_q8 <=
+       (uint64_t)x->coeff_opt_dist_threshold * qstep * qstep);
+  skip_trellis |= !perform_block_coeff_opt;
+
+  // Flag to indicate whether distortion should be calculated in the transform
+  // domain while iterating through the transform type candidates.
+  // Transform domain distortion is accurate for higher residuals.
+  // TODO(any): Experiment with variance and mean based thresholds
+  int use_transform_domain_distortion =
+      (x->use_transform_domain_distortion > 0) &&
+      (block_mse_q8 >= x->tx_domain_dist_threshold) &&
+      // Any 64-pt transform only preserves half the coefficients.
+      // Therefore transform domain distortion is not valid for these
+      // transform sizes.
+      txsize_sqr_up_map[tx_size] != TX_64X64;
+  // Flag to indicate whether an extra calculation of distortion in the pixel
+  // domain should be performed at the end, after the best transform type has
+  // been decided.
+  int calc_pixel_domain_distortion_final =
+      x->use_transform_domain_distortion == 1 &&
+      use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD;
+  if (calc_pixel_domain_distortion_final &&
+      (txk_allowed < TX_TYPES || allowed_tx_mask == 0x0001))
+    calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
+
+  const uint16_t *eobs_ptr = x->plane[plane].eobs;
+
+  TxfmParam txfm_param;
+  QUANT_PARAM quant_param;
+  av1_setup_xform(cm, x, tx_size, DCT_DCT, &txfm_param);
+  av1_setup_quant(tx_size, !skip_trellis,
+                  skip_trellis ? (USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B
+                                                         : AV1_XFORM_QUANT_FP)
+                               : AV1_XFORM_QUANT_FP,
+                  cpi->oxcf.quant_b_adapt, &quant_param);
+
+  // Iterate through all transform type candidates.
+  for (int idx = 0; idx < TX_TYPES; ++idx) {
+    const TX_TYPE tx_type = (TX_TYPE)txk_map[idx];
+    if (!(allowed_tx_mask & (1 << tx_type))) continue;
+    txfm_param.tx_type = tx_type;
+    if (av1_use_qmatrix(&cm->quant_params, xd, mbmi->segment_id)) {
+      av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                        &quant_param);
+    }
+    if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type;
+    RD_STATS this_rd_stats;
+    av1_invalid_rd_stats(&this_rd_stats);
+
+    av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
+                    &quant_param);
+
+    // Calculate rate cost of quantized coefficients.
+    if (quant_param.use_optimize_b) {
+      if (cpi->sf.rd_sf.optimize_b_precheck && best_rd < INT64_MAX &&
+          eobs_ptr[block] >= 4) {
+        // Calculate distortion quickly in transform domain.
+        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                             &this_rd_stats.sse);
+
+        const int64_t best_rd_ = AOMMIN(best_rd, ref_best_rd);
+        const int64_t dist_cost_estimate =
+            RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse));
+        if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue;
+      }
+      av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
+                     cpi->sf.rd_sf.trellis_eob_fast, &rate_cost);
+    } else {
+      rate_cost =
+          cost_coeffs(x, plane, block, tx_size, tx_type, txb_ctx,
+                      use_fast_coef_costing, cm->features.reduced_tx_set_used);
+    }
+
+    // If the rd cost based on coeff rate alone is already more than best_rd,
+    // terminate early.
+    if (RDCOST(x->rdmult, rate_cost, 0) > best_rd) continue;
+
+    // Calculate distortion.
+    if (eobs_ptr[block] == 0) {
+      // When eob is 0, pixel domain distortion is more efficient and accurate.
+      this_rd_stats.dist = this_rd_stats.sse = block_sse;
+    } else if (use_transform_domain_distortion) {
+      dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                           &this_rd_stats.sse);
+    } else {
+      int64_t sse_diff = INT64_MAX;
+      // The high_energy threshold assumes that every pixel within a txfm block
+      // has a residue energy of at least 25% of the maximum, i.e. 128 * 128
+      // for 8 bit; the threshold is then scaled based on input bit depth.
+      const int64_t high_energy_thresh =
+          ((int64_t)128 * 128 * tx_size_2d[tx_size]) << ((xd->bd - 8) * 2);
+      const int is_high_energy = (block_sse >= high_energy_thresh);
+      if (tx_size == TX_64X64 || is_high_energy) {
+        // Because 3 out of 4 quadrants of transform coefficients are forced to
+        // zero, the inverse transform has a tendency to overflow. 
sse_diff + // is effectively the energy of those 3 quadrants, here we use it + // to decide if we should do pixel domain distortion. If the energy + // is mostly in first quadrant, then it is unlikely that we have + // overflow issue in inverse transform. + dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist, + &this_rd_stats.sse); + sse_diff = block_sse - this_rd_stats.sse; + } + if (tx_size != TX_64X64 || !is_high_energy || + (sse_diff * 2) < this_rd_stats.sse) { + const int64_t tx_domain_dist = this_rd_stats.dist; + this_rd_stats.dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + // For high energy blocks, occasionally, the pixel domain distortion + // can be artificially low due to clamping at reconstruction stage + // even when inverse transform output is hugely different from the + // actual residue. + if (is_high_energy && this_rd_stats.dist < tx_domain_dist) + this_rd_stats.dist = tx_domain_dist; + } else { + assert(sse_diff < INT64_MAX); + this_rd_stats.dist += sse_diff; + } + this_rd_stats.sse = block_sse; + } + + this_rd_stats.rate = rate_cost; + + const int64_t rd = + RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + + if (rd < best_rd) { + best_rd = rd; + *best_rd_stats = this_rd_stats; + best_tx_type = tx_type; + best_txb_ctx = x->plane[plane].txb_entropy_ctx[block]; + best_eob = x->plane[plane].eobs[block]; + // Swap dqcoeff buffers + tran_low_t *const tmp_dqcoeff = best_dqcoeff; + best_dqcoeff = pd->dqcoeff; + pd->dqcoeff = tmp_dqcoeff; + } + +#if CONFIG_COLLECT_RD_STATS == 1 + if (plane == 0) { + PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col, + plane_bsize, tx_size, tx_type, rd); + } +#endif // CONFIG_COLLECT_RD_STATS == 1 + +#if COLLECT_TX_SIZE_DATA + // Generate small sample to restrict output size. + static unsigned int seed = 21743; + if (lcg_rand16(&seed) % 200 == 0) { + FILE *fp = NULL; + + if (within_border) { + fp = fopen(av1_tx_size_data_output_file, "a"); + } + + if (fp) { + // Transform info and RD + const int txb_w = tx_size_wide[tx_size]; + const int txb_h = tx_size_high[tx_size]; + + // Residue signal. + const int diff_stride = block_size_wide[plane_bsize]; + struct macroblock_plane *const p = &x->plane[plane]; + const int16_t *src_diff = + &p->src_diff[(blk_row * diff_stride + blk_col) * 4]; + + for (int r = 0; r < txb_h; ++r) { + for (int c = 0; c < txb_w; ++c) { + fprintf(fp, "%d,", src_diff[c]); + } + src_diff += diff_stride; + } + + fprintf(fp, "%d,%d,%d,%" PRId64, txb_w, txb_h, tx_type, rd); + fprintf(fp, "\n"); + fclose(fp); + } + } +#endif // COLLECT_TX_SIZE_DATA + + // If the current best RD cost is much worse than the reference RD cost, + // terminate early. + if (cpi->sf.tx_sf.adaptive_txb_search_level) { + if ((best_rd - (best_rd >> cpi->sf.tx_sf.adaptive_txb_search_level)) > + ref_best_rd) { + break; + } + } + + // Terminate transform type search if the block has been quantized to + // all zero. + if (cpi->sf.tx_sf.tx_type_search.skip_tx_search && !best_eob) break; + } + + assert(best_rd != INT64_MAX); + + best_rd_stats->skip = best_eob == 0; + if (plane == 0) update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type); + x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx; + x->plane[plane].eobs[block] = best_eob; + + // Point dqcoeff to the quantized coefficients corresponding to the best + // transform type, then we can skip transform and quantization, e.g. in the + // final pixel domain distortion calculation and recon_intra(). 
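+  // Each time a tx_type wins inside the loop above, pd->dqcoeff and
+  // best_dqcoeff are swapped, so best_dqcoeff always aliases the buffer that
+  // holds the winning coefficients. The assignment below makes pd->dqcoeff
+  // point at that buffer; orig_dqcoeff is restored before returning.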
+  pd->dqcoeff = best_dqcoeff;
+
+  if (calc_pixel_domain_distortion_final && best_eob) {
+    best_rd_stats->dist = dist_block_px_domain(
+        cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+    best_rd_stats->sse = block_sse;
+  }
+
+  if (intra_txb_rd_info != NULL) {
+    intra_txb_rd_info->valid = 1;
+    intra_txb_rd_info->entropy_context = cur_joint_ctx;
+    intra_txb_rd_info->rate = best_rd_stats->rate;
+    intra_txb_rd_info->dist = best_rd_stats->dist;
+    intra_txb_rd_info->sse = best_rd_stats->sse;
+    intra_txb_rd_info->eob = best_eob;
+    intra_txb_rd_info->txb_entropy_ctx = best_txb_ctx;
+    intra_txb_rd_info->perform_block_coeff_opt = perform_block_coeff_opt;
+    if (plane == 0) intra_txb_rd_info->tx_type = best_tx_type;
+  }
+
+  // Intra mode needs decoded pixels such that the next transform block
+  // can use them for prediction.
+  recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+              txb_ctx, skip_trellis, best_tx_type, 0, &rate_cost, best_eob);
+  pd->dqcoeff = orig_dqcoeff;
+}
+
+// Pick the transform type for a luma transform block of tx_size. Note that
+// this function is used only for inter-predicted blocks.
+static AOM_INLINE void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x,
+                                  TX_SIZE tx_size, int blk_row, int blk_col,
+                                  int block, int plane_bsize, TXB_CTX *txb_ctx,
+                                  RD_STATS *rd_stats,
+                                  FAST_TX_SEARCH_MODE ftxs_mode,
+                                  int64_t ref_rdcost,
+                                  TXB_RD_INFO *rd_info_array) {
+  const struct macroblock_plane *const p = &x->plane[0];
+  const uint16_t cur_joint_ctx =
+      (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
+  MACROBLOCKD *xd = &x->e_mbd;
+  assert(is_inter_block(xd->mi[0]));
+  const int tx_type_map_idx = blk_row * xd->tx_type_map_stride + blk_col;
+  // Look up the RD cost and terminate early if we have already processed
+  // exactly the same residue with exactly the same entropy context.
+  if (rd_info_array != NULL && rd_info_array->valid &&
+      rd_info_array->entropy_context == cur_joint_ctx) {
+    xd->tx_type_map[tx_type_map_idx] = rd_info_array->tx_type;
+    const TX_TYPE ref_tx_type =
+        av1_get_tx_type(&x->e_mbd, get_plane_type(0), blk_row, blk_col, tx_size,
+                        cpi->common.features.reduced_tx_set_used);
+    if (ref_tx_type == rd_info_array->tx_type) {
+      rd_stats->rate += rd_info_array->rate;
+      rd_stats->dist += rd_info_array->dist;
+      rd_stats->sse += rd_info_array->sse;
+      rd_stats->skip &= rd_info_array->eob == 0;
+      p->eobs[block] = rd_info_array->eob;
+      p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx;
+      return;
+    }
+  }
+
+  RD_STATS this_rd_stats;
+  const int skip_trellis = 0;
+  search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size,
+                 txb_ctx, ftxs_mode, 0, skip_trellis, ref_rdcost,
+                 &this_rd_stats);
+
+  av1_merge_rd_stats(rd_stats, &this_rd_stats);
+
+  // Save the RD results for possible reuse in the future.
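+  // A saved record is only reused later when cur_joint_ctx, i.e.
+  // (dc_sign_ctx << 8) + txb_skip_ctx, matches and the stored tx_type is
+  // still consistent with av1_get_tx_type(); see the lookup above.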
+ if (rd_info_array != NULL) { + rd_info_array->valid = 1; + rd_info_array->entropy_context = cur_joint_ctx; + rd_info_array->rate = this_rd_stats.rate; + rd_info_array->dist = this_rd_stats.dist; + rd_info_array->sse = this_rd_stats.sse; + rd_info_array->eob = p->eobs[block]; + rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block]; + rd_info_array->tx_type = xd->tx_type_map[tx_type_map_idx]; + } +} + +static AOM_INLINE void try_tx_block_no_split( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, + const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl, + int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node, + TxCandidateInfo *no_split) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblock_plane *const p = &x->plane[0]; + const int bw = mi_size_wide[plane_bsize]; + const ENTROPY_CONTEXT *const pta = ta + blk_col; + const ENTROPY_CONTEXT *const ptl = tl + blk_row; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx); + const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->zero_rate = zero_blk_rate; + const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); + mbmi->inter_tx_size[index] = tx_size; + tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx, + rd_stats, ftxs_mode, ref_best_rd, + rd_info_node != NULL ? rd_info_node->rd_info_array : NULL); + assert(rd_stats->rate < INT_MAX); + + const int pick_skip = !xd->lossless[mbmi->segment_id] && + (rd_stats->skip == 1 || + RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse)); + if (pick_skip) { +#if CONFIG_RD_DEBUG + update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col, + zero_blk_rate - rd_stats->rate); +#endif // CONFIG_RD_DEBUG + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + p->eobs[block] = 0; + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } + rd_stats->skip = pick_skip; + set_blk_skip(x, 0, blk_row * bw + blk_col, pick_skip); + + if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) + rd_stats->rate += x->txfm_partition_cost[txfm_partition_ctx][0]; + + no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + no_split->txb_entropy_ctx = p->txb_entropy_ctx[block]; + no_split->tx_type = + xd->tx_type_map[blk_row * xd->tx_type_map_stride + blk_col]; +} + +static AOM_INLINE void try_tx_block_split( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd, + FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node, + RD_STATS *split_rd_stats) { + assert(tx_size < TX_SIZES_ALL); + MACROBLOCKD *const xd = &x->e_mbd; + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); + const int txb_width = tx_size_wide_unit[tx_size]; + const int txb_height = tx_size_high_unit[tx_size]; + // Transform size after splitting current block. 
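+  // E.g. splitting TX_16X16 gives sub_txs == TX_8X8 and four sub-blocks;
+  // rectangular sizes may split into two instead, hence nblks is computed
+  // from the width/height ratios below.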
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size]; + const int sub_txb_width = tx_size_wide_unit[sub_txs]; + const int sub_txb_height = tx_size_high_unit[sub_txs]; + const int sub_step = sub_txb_width * sub_txb_height; + const int nblks = (txb_height / sub_txb_height) * (txb_width / sub_txb_width); + assert(nblks > 0); + av1_init_rd_stats(split_rd_stats); + split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1]; + + for (int r = 0, blk_idx = 0; r < txb_height; r += sub_txb_height) { + for (int c = 0; c < txb_width; c += sub_txb_width, ++blk_idx) { + assert(blk_idx < 4); + const int offsetr = blk_row + r; + const int offsetc = blk_col + c; + if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; + + RD_STATS this_rd_stats; + int this_cost_valid = 1; + select_tx_block( + cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta, + tl, tx_above, tx_left, &this_rd_stats, no_split_rd / nblks, + ref_best_rd - split_rd_stats->rdcost, &this_cost_valid, ftxs_mode, + (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL); + if (!this_cost_valid) { + split_rd_stats->rdcost = INT64_MAX; + return; + } + av1_merge_rd_stats(split_rd_stats, &this_rd_stats); + split_rd_stats->rdcost = + RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); + if (split_rd_stats->rdcost > ref_best_rd) { + split_rd_stats->rdcost = INT64_MAX; + return; + } + block += sub_step; + } + } +} + +// Search for the best transform partition(recursive)/type for a given +// inter-predicted luma block. The obtained transform selection will be saved +// in xd->mi[0], the corresponding RD stats will be saved in rd_stats. +static AOM_INLINE void select_tx_block( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, + ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, + RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd, + int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode, + TXB_RD_INFO_NODE *rd_info_node) { + assert(tx_size < TX_SIZES_ALL); + av1_init_rd_stats(rd_stats); + if (ref_best_rd < 0) { + *is_cost_valid = 0; + return; + } + + MACROBLOCKD *const xd = &x->e_mbd; + assert(blk_row < max_block_high(xd, plane_bsize, 0) && + blk_col < max_block_wide(xd, plane_bsize, 0)); + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->sb_type, tx_size); + struct macroblock_plane *const p = &x->plane[0]; + + const int try_no_split = + cpi->oxcf.enable_tx64 || txsize_sqr_up_map[tx_size] != TX_64X64; + int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH; + TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES }; + + // Try using current block as a single transform block without split. + if (try_no_split) { + try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth, + plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd, + ftxs_mode, rd_info_node, &no_split); + + // Speed features for early termination. + const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level; + if (search_level) { + if ((no_split.rd - (no_split.rd >> (1 + search_level))) > ref_best_rd) { + *is_cost_valid = 0; + return; + } + if (no_split.rd - (no_split.rd >> (2 + search_level)) > prev_level_rd) { + try_split = 0; + } + } + if (cpi->sf.tx_sf.txb_split_cap) { + if (p->eobs[block] == 0) try_split = 0; + } + } + + // ML based speed feature to skip searching for split transform blocks. 
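+  // ml_predict_tx_split() returns a score clamped to [-80000, 80000]; a
+  // score below -ml_tx_split_thresh is taken as "split unlikely to help",
+  // and the recursive split search below is skipped.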
+ if (x->e_mbd.bd == 8 && try_split && + !(ref_best_rd == INT64_MAX && no_split.rd == INT64_MAX)) { + const int threshold = cpi->sf.tx_sf.tx_type_search.ml_tx_split_thresh; + if (threshold >= 0) { + const int split_score = + ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size); + if (split_score < -threshold) try_split = 0; + } + } + + RD_STATS split_rd_stats; + split_rd_stats.rdcost = INT64_MAX; + // Try splitting current block into smaller transform blocks. + if (try_split) { + try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth, + plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd, + AOMMIN(no_split.rd, ref_best_rd), ftxs_mode, + rd_info_node, &split_rd_stats); + } + + if (no_split.rd < split_rd_stats.rdcost) { + ENTROPY_CONTEXT *pta = ta + blk_col; + ENTROPY_CONTEXT *ptl = tl + blk_row; + p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx; + av1_set_txb_context(x, 0, block, tx_size, pta, ptl); + txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size, + tx_size); + for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) { + for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) { + const int index = + av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx); + mbmi->inter_tx_size[index] = tx_size; + } + } + mbmi->tx_size = tx_size; + update_txk_array(xd, blk_row, blk_col, tx_size, no_split.tx_type); + const int bw = mi_size_wide[plane_bsize]; + set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip); + } else { + *rd_stats = split_rd_stats; + if (split_rd_stats.rdcost == INT64_MAX) *is_cost_valid = 0; + } +} + +static AOM_INLINE void choose_largest_tx_size(const AV1_COMP *const cpi, + MACROBLOCK *x, RD_STATS *rd_stats, + int64_t ref_best_rd, + BLOCK_SIZE bs) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + mbmi->tx_size = tx_size_from_tx_mode(bs, x->tx_mode_search_type); + + // If tx64 is not enabled, we need to go down to the next available size + if (!cpi->oxcf.enable_tx64) { + static const TX_SIZE tx_size_max_32[TX_SIZES_ALL] = { + TX_4X4, // 4x4 transform + TX_8X8, // 8x8 transform + TX_16X16, // 16x16 transform + TX_32X32, // 32x32 transform + TX_32X32, // 64x64 transform + TX_4X8, // 4x8 transform + TX_8X4, // 8x4 transform + TX_8X16, // 8x16 transform + TX_16X8, // 16x8 transform + TX_16X32, // 16x32 transform + TX_32X16, // 32x16 transform + TX_32X32, // 32x64 transform + TX_32X32, // 64x32 transform + TX_4X16, // 4x16 transform + TX_16X4, // 16x4 transform + TX_8X32, // 8x32 transform + TX_32X8, // 32x8 transform + TX_16X32, // 16x64 transform + TX_32X16, // 64x16 transform + }; + + mbmi->tx_size = tx_size_max_32[mbmi->tx_size]; + } + + const int skip_ctx = av1_get_skip_context(xd); + const int no_skip_flag_rate = x->skip_cost[skip_ctx][0]; + const int skip_flag_rate = x->skip_cost[skip_ctx][1]; + // Skip RDcost is used only for Inter blocks + const int64_t skip_rd = + is_inter_block(mbmi) ? 
RDCOST(x->rdmult, skip_flag_rate, 0) : INT64_MAX; + const int64_t no_skip_rd = RDCOST(x->rdmult, no_skip_flag_rate, 0); + const int skip_trellis = 0; + av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, + AOMMIN(no_skip_rd, skip_rd), AOM_PLANE_Y, bs, + mbmi->tx_size, cpi->sf.rd_sf.use_fast_coef_costing, + FTXS_NONE, skip_trellis); +} + +static AOM_INLINE void choose_smallest_tx_size(const AV1_COMP *const cpi, + MACROBLOCK *x, + RD_STATS *rd_stats, + int64_t ref_best_rd, + BLOCK_SIZE bs) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + + mbmi->tx_size = TX_4X4; + // TODO(any) : Pass this_rd based on skip/non-skip cost + const int skip_trellis = 0; + av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size, + cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, + skip_trellis); +} + +// Search for the best uniform transform size and type for current coding block. +static AOM_INLINE void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, + MACROBLOCK *x, + RD_STATS *rd_stats, + int64_t ref_best_rd, + BLOCK_SIZE bs) { + av1_invalid_rd_stats(rd_stats); + + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs]; + const int tx_select = x->tx_mode_search_type == TX_MODE_SELECT; + int start_tx; + // The split depth can be at most MAX_TX_DEPTH, so the init_depth controls + // how many times of splitting is allowed during the RD search. + int init_depth; + + if (tx_select) { + start_tx = max_rect_tx_size; + init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs], + is_inter_block(mbmi), &cpi->sf, + x->tx_size_search_method); + } else { + const TX_SIZE chosen_tx_size = + tx_size_from_tx_mode(bs, x->tx_mode_search_type); + start_tx = chosen_tx_size; + init_depth = MAX_TX_DEPTH; + } + + const int skip_trellis = 0; + uint8_t best_txk_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE]; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + TX_SIZE best_tx_size = max_rect_tx_size; + int64_t best_rd = INT64_MAX; + const int num_blks = bsize_to_num_blk(bs); + x->rd_model = FULL_TXFM_RD; + int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX }; + for (int tx_size = start_tx, depth = init_depth; depth <= MAX_TX_DEPTH; + depth++, tx_size = sub_tx_size_map[tx_size]) { + if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) { + continue; + } + + RD_STATS this_rd_stats; + rd[depth] = av1_uniform_txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, + tx_size, FTXS_NONE, skip_trellis); + if (rd[depth] < best_rd) { + av1_copy_array(best_blk_skip, x->blk_skip, num_blks); + av1_copy_array(best_txk_type_map, xd->tx_type_map, num_blks); + best_tx_size = tx_size; + best_rd = rd[depth]; + *rd_stats = this_rd_stats; + } + if (tx_size == TX_4X4) break; + // If we are searching three depths, prune the smallest size depending + // on rd results for the first two depths for low contrast blocks. + if (depth > init_depth && depth != MAX_TX_DEPTH && + x->source_variance < 256) { + if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break; + } + } + + if (rd_stats->rate != INT_MAX) { + mbmi->tx_size = best_tx_size; + av1_copy_array(xd->tx_type_map, best_txk_type_map, num_blks); + av1_copy_array(x->blk_skip, best_blk_skip, num_blks); + } +} + +// Search for the best transform type for the given transform block in the +// given plane/channel, and calculate the corresponding RD cost. 
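+// This is the per-block callback passed to
+// av1_foreach_transformed_block_in_plane() by av1_txfm_rd_in_plane(); the
+// accumulated RD state travels in struct rdcost_block_args.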
+static AOM_INLINE void block_rd_txfm(int plane, int block, int blk_row, + int blk_col, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, void *arg) { + struct rdcost_block_args *args = arg; + if (args->exit_early) { + args->incomplete_exit = 1; + return; + } + + MACROBLOCK *const x = args->x; + MACROBLOCKD *const xd = &x->e_mbd; + const int is_inter = is_inter_block(xd->mi[0]); + const AV1_COMP *cpi = args->cpi; + ENTROPY_CONTEXT *a = args->t_above + blk_col; + ENTROPY_CONTEXT *l = args->t_left + blk_row; + const AV1_COMMON *cm = &cpi->common; + RD_STATS this_rd_stats; + av1_init_rd_stats(&this_rd_stats); + + if (!is_inter) { + av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); + av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); + } + + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); + search_tx_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, + &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing, + args->skip_trellis, args->best_rd - args->current_rd, + &this_rd_stats); + + if (plane == AOM_PLANE_Y && xd->cfl.store_y) { + assert(!is_inter || plane_bsize < BLOCK_8X8); + cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); + } + +#if CONFIG_RD_DEBUG + update_txb_coeff_cost(&this_rd_stats, plane, tx_size, blk_row, blk_col, + this_rd_stats.rate); +#endif // CONFIG_RD_DEBUG + av1_set_txb_context(x, plane, block, tx_size, a, l); + + const int blk_idx = + blk_row * (block_size_wide[plane_bsize] >> MI_SIZE_LOG2) + blk_col; + if (plane == 0) + set_blk_skip(x, plane, blk_idx, x->plane[plane].eobs[block] == 0); + else + set_blk_skip(x, plane, blk_idx, 0); + + int64_t rd; + if (is_inter) { + const int64_t no_skip_rd = + RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + const int64_t skip_rd = RDCOST(x->rdmult, 0, this_rd_stats.sse); + rd = AOMMIN(no_skip_rd, skip_rd); + this_rd_stats.skip &= !x->plane[plane].eobs[block]; + } else { + // Signal non-skip for Intra blocks + rd = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + this_rd_stats.skip = 0; + } + + av1_merge_rd_stats(&args->rd_stats, &this_rd_stats); + + args->current_rd += rd; + if (args->current_rd > args->best_rd) args->exit_early = 1; +} + +// Search for the best transform type and return the transform coefficients RD +// cost of current luma coding block with the given uniform transform size. +int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs, TX_SIZE tx_size, + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) { + assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs))); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + const int tx_select = x->tx_mode_search_type == TX_MODE_SELECT && + block_signals_txsize(mbmi->sb_type); + int tx_size_rate = 0; + if (tx_select) { + const int ctx = txfm_partition_context( + xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size); + tx_size_rate = is_inter ? x->txfm_partition_cost[ctx][0] + : tx_size_cost(x, bs, tx_size); + } + const int skip_ctx = av1_get_skip_context(xd); + const int no_skip_flag_rate = x->skip_cost[skip_ctx][0]; + const int skip_flag_rate = x->skip_cost[skip_ctx][1]; + const int64_t skip_rd = + is_inter ? 
RDCOST(x->rdmult, skip_flag_rate, 0) : INT64_MAX; + const int64_t no_this_rd = + RDCOST(x->rdmult, no_skip_flag_rate + tx_size_rate, 0); + + mbmi->tx_size = tx_size; + av1_txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, + AOMMIN(no_this_rd, skip_rd), AOM_PLANE_Y, bs, tx_size, + cpi->sf.rd_sf.use_fast_coef_costing, ftxs_mode, + skip_trellis); + if (rd_stats->rate == INT_MAX) return INT64_MAX; + + int64_t rd; + // rdstats->rate should include all the rate except skip/non-skip cost as the + // same is accounted in the caller functions after rd evaluation of all + // planes. However the decisions should be done after considering the + // skip/non-skip header cost + if (rd_stats->skip && is_inter) { + rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse); + } else { + // Intra blocks are always signalled as non-skip + rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_rate + tx_size_rate, + rd_stats->dist); + rd_stats->rate += tx_size_rate; + } + // Check if forcing the block to skip transform leads to smaller RD cost. + if (is_inter && !rd_stats->skip && !xd->lossless[mbmi->segment_id]) { + int64_t temp_skip_rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse); + if (temp_skip_rd <= rd) { + rd = temp_skip_rd; + rd_stats->rate = 0; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + } + } + + return rd; +} + +// Search for the best transform type for a luma inter-predicted block, given +// the transform block partitions. +// This function is used only when some speed features are enabled. +static AOM_INLINE void tx_block_yrd( + const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block, + TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int depth, + ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx, + TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, int64_t ref_best_rd, + RD_STATS *rd_stats, FAST_TX_SEARCH_MODE ftxs_mode) { + assert(tx_size < TX_SIZES_ALL); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(is_inter_block(mbmi)); + const int max_blocks_high = max_block_high(xd, plane_bsize, 0); + const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); + + if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; + + const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index( + plane_bsize, blk_row, blk_col)]; + const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row, + mbmi->sb_type, tx_size); + + av1_init_rd_stats(rd_stats); + if (tx_size == plane_tx_size) { + ENTROPY_CONTEXT *ta = above_ctx + blk_col; + ENTROPY_CONTEXT *tl = left_ctx + blk_row; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx); + + const int zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(0)] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->zero_rate = zero_blk_rate; + tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx, + rd_stats, ftxs_mode, ref_best_rd, NULL); + const int mi_width = mi_size_wide[plane_bsize]; + if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= + RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || + rd_stats->skip == 1) { + rd_stats->rate = zero_blk_rate; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1); + x->plane[0].eobs[block] = 0; + x->plane[0].txb_entropy_ctx[block] = 0; + update_txk_array(xd, blk_row, blk_col, tx_size, DCT_DCT); + } else { + rd_stats->skip = 0; + set_blk_skip(x, 0, blk_row * mi_width + blk_col, 
0);
+    }
+    if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+      rd_stats->rate += x->txfm_partition_cost[ctx][0];
+    av1_set_txb_context(x, 0, block, tx_size, ta, tl);
+    txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+                          tx_size);
+  } else {
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int txb_width = tx_size_wide_unit[sub_txs];
+    const int txb_height = tx_size_high_unit[sub_txs];
+    const int step = txb_height * txb_width;
+    RD_STATS pn_rd_stats;
+    int64_t this_rd = 0;
+    assert(txb_width > 0 && txb_height > 0);
+
+    for (int row = 0; row < tx_size_high_unit[tx_size]; row += txb_height) {
+      for (int col = 0; col < tx_size_wide_unit[tx_size]; col += txb_width) {
+        const int offsetr = blk_row + row;
+        const int offsetc = blk_col + col;
+        if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+        av1_init_rd_stats(&pn_rd_stats);
+        tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize,
+                     depth + 1, above_ctx, left_ctx, tx_above, tx_left,
+                     ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
+        if (pn_rd_stats.rate == INT_MAX) {
+          av1_invalid_rd_stats(rd_stats);
+          return;
+        }
+        av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+        this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist);
+        block += step;
+      }
+    }
+
+    if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+      rd_stats->rate += x->txfm_partition_cost[ctx][1];
+  }
+}
+
+// Search for tx type with tx sizes already decided for an inter-predicted luma
+// partition block. It's used only when some speed features are enabled.
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+                           RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                           int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) {
+  if (ref_best_rd < 0) {
+    av1_invalid_rd_stats(rd_stats);
+    return 0;
+  }
+
+  av1_init_rd_stats(rd_stats);
+
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[0];
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+  const int bh = tx_size_high_unit[max_tx_size];
+  const int bw = tx_size_wide_unit[max_tx_size];
+  const int step = bw * bh;
+  const int init_depth = get_search_init_depth(mi_width, mi_height, 1, &cpi->sf,
+                                               x->tx_size_search_method);
+  ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+  ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+  TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+  av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+  memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+  memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+
+  int64_t this_rd = 0;
+  for (int idy = 0, block = 0; idy < mi_height; idy += bh) {
+    for (int idx = 0; idx < mi_width; idx += bw) {
+      RD_STATS pn_rd_stats;
+      av1_init_rd_stats(&pn_rd_stats);
+      tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, bsize, init_depth,
+                   ctxa, ctxl, tx_above, tx_left, ref_best_rd - this_rd,
+                   &pn_rd_stats, ftxs_mode);
+      if (pn_rd_stats.rate == INT_MAX) {
+        av1_invalid_rd_stats(rd_stats);
+        return 0;
+      }
+      av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+      this_rd +=
+          AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
+                 RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+      block += step;
+    }
+  }
+
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int no_skip_flag_rate = x->skip_cost[skip_ctx][0];
+  const int 
skip_flag_rate = x->skip_cost[skip_ctx][1]; + const int64_t skip_rd = RDCOST(x->rdmult, skip_flag_rate, rd_stats->sse); + this_rd = + RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_rate, rd_stats->dist); + if (skip_rd < this_rd) { + this_rd = skip_rd; + rd_stats->rate = 0; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + } + + const int is_cost_valid = this_rd > ref_best_rd; + if (!is_cost_valid) { + // reset cost value + av1_invalid_rd_stats(rd_stats); + } + return is_cost_valid; +} + +// Search for the best transform size and type for current inter-predicted +// luma block with recursive transform block partitioning. The obtained +// transform selection will be saved in xd->mi[0], the corresponding RD stats +// will be saved in rd_stats. The returned value is the corresponding RD cost. +static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd, + TXB_RD_INFO_NODE *rd_info_tree) { + MACROBLOCKD *const xd = &x->e_mbd; + assert(is_inter_block(xd->mi[0])); + assert(bsize < BLOCK_SIZES_ALL); + const int fast_tx_search = x->tx_size_search_method > USE_FULL_RD; + int64_t rd_thresh = ref_best_rd; + if (fast_tx_search && rd_thresh < INT64_MAX) { + if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3); + } + assert(rd_thresh > 0); + const FAST_TX_SEARCH_MODE ftxs_mode = + fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE; + const struct macroblockd_plane *const pd = &xd->plane[0]; + assert(bsize < BLOCK_SIZES_ALL); + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; + av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); + memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); + memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); + const int init_depth = get_search_init_depth(mi_width, mi_height, 1, &cpi->sf, + x->tx_size_search_method); + const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize]; + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + const int step = bw * bh; + const int skip_ctx = av1_get_skip_context(xd); + const int no_skip_flag_cost = x->skip_cost[skip_ctx][0]; + const int skip_flag_cost = x->skip_cost[skip_ctx][1]; + int64_t skip_rd = RDCOST(x->rdmult, skip_flag_cost, 0); + int64_t no_skip_rd = RDCOST(x->rdmult, no_skip_flag_cost, 0); + int block = 0; + + av1_init_rd_stats(rd_stats); + for (int idy = 0; idy < max_block_high(xd, bsize, 0); idy += bh) { + for (int idx = 0; idx < max_block_wide(xd, bsize, 0); idx += bw) { + const int64_t best_rd_sofar = + (rd_thresh == INT64_MAX) + ? INT64_MAX + : (rd_thresh - (AOMMIN(skip_rd, no_skip_rd))); + int is_cost_valid = 1; + RD_STATS pn_rd_stats; + // Search for the best transform block size and type for the sub-block. 
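+      // best_rd_sofar computed above leaves headroom for the cheaper of the
+      // two skip-flag headers, i.e. rd_thresh - min(skip_rd, no_skip_rd).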
+ select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize, + ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX, + best_rd_sofar, &is_cost_valid, ftxs_mode, rd_info_tree); + if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return INT64_MAX; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + skip_rd = RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse); + no_skip_rd = + RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_cost, rd_stats->dist); + block += step; + if (rd_info_tree != NULL) rd_info_tree += 1; + } + } + + if (rd_stats->rate == INT_MAX) return INT64_MAX; + + rd_stats->skip = (skip_rd <= no_skip_rd); + + // If fast_tx_search is true, only DCT and 1D DCT were tested in + // select_inter_block_yrd() above. Do a better search for tx type with + // tx sizes already decided. + if (fast_tx_search && cpi->sf.tx_sf.refine_fast_tx_search_results) { + if (!inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE)) + return INT64_MAX; + } + + int64_t final_rd; + if (rd_stats->skip) { + final_rd = RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse); + } else { + final_rd = + RDCOST(x->rdmult, rd_stats->rate + no_skip_flag_cost, rd_stats->dist); + if (!xd->lossless[xd->mi[0]->segment_id]) { + final_rd = + AOMMIN(final_rd, RDCOST(x->rdmult, skip_flag_cost, rd_stats->sse)); + } + } + + return final_rd; +} + +// Return 1 to terminate transform search early. The decision is made based on +// the comparison with the reference RD cost and the model-estimated RD cost. +static AOM_INLINE int model_based_tx_search_prune(const AV1_COMP *cpi, + MACROBLOCK *x, + BLOCK_SIZE bsize, + int64_t ref_best_rd) { + const int level = cpi->sf.tx_sf.model_based_prune_tx_search_level; + assert(level >= 0 && level <= 2); + int model_rate; + int64_t model_dist; + int model_skip; + MACROBLOCKD *const xd = &x->e_mbd; + model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE]( + cpi, bsize, x, xd, 0, 0, &model_rate, &model_dist, &model_skip, NULL, + NULL, NULL, NULL); + if (model_skip) return 0; + const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist); + // TODO(debargha, urvang): Improve the model and make the check below + // tighter. + static const int prune_factor_by8[] = { 3, 5 }; + const int factor = prune_factor_by8[level - 1]; + return ((model_rd * factor) >> 3) > ref_best_rd; +} + +// Search for best transform size and type for luma inter blocks. The transform +// block partitioning can be recursive resulting in non-uniform transform sizes. +// The best transform size and type, if found, will be saved in the MB_MODE_INFO +// structure, and the corresponding RD stats will be saved in rd_stats. +void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + assert(is_inter_block(xd->mi[0])); + + av1_invalid_rd_stats(rd_stats); + + // If modeled RD cost is a lot worse than the best so far, terminate early. + if (cpi->sf.tx_sf.model_based_prune_tx_search_level && + ref_best_rd != INT64_MAX) { + if (model_based_tx_search_prune(cpi, x, bsize, ref_best_rd)) return; + } + + // Hashing based speed feature. If the hash of the prediction residue block is + // found in the hash table, use previous search results and terminate early. 
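+  // Sketch of the reuse path, using the helpers called below:
+  //   hash = get_block_residue_hash(x, bsize);
+  //   match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+  //   if (match_index != -1) fetch_tx_rd_info(...) and return early.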
+ uint32_t hash = 0; + MB_RD_RECORD *mb_rd_record = NULL; + const int mi_row = x->e_mbd.mi_row; + const int mi_col = x->e_mbd.mi_col; + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end); + const int is_mb_rd_hash_enabled = + (within_border && cpi->sf.rd_sf.use_mb_rd_hash); + const int n4 = bsize_to_num_blk(bsize); + if (is_mb_rd_hash_enabled) { + hash = get_block_residue_hash(x, bsize); + mb_rd_record = &x->mb_rd_record; + const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); + if (match_index != -1) { + MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index]; + fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x); + return; + } + } + + // If we predict that skip is the optimal RD decision - set the respective + // context and terminate early. + int64_t dist; + if (x->predict_skip_level && + predict_skip_flag(x, bsize, &dist, + cpi->common.features.reduced_tx_set_used)) { + set_skip_flag(x, rd_stats, bsize, dist); + // Save the RD search results into tx_rd_record. + if (is_mb_rd_hash_enabled) + save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record); + return; + } +#if CONFIG_SPEED_STATS + ++x->tx_search_count; +#endif // CONFIG_SPEED_STATS + + // Pre-compute residue hashes (transform block level) and find existing or + // add new RD records to store and reuse rate and distortion values to speed + // up TX size/type search. + TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64]; + int found_rd_info = 0; + if (ref_best_rd != INT64_MAX && within_border && + cpi->sf.tx_sf.use_inter_txb_hash) { + found_rd_info = find_tx_size_rd_records(x, bsize, matched_rd_info); + } + + const int64_t rd = + select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd, + found_rd_info ? matched_rd_info : NULL); + + if (rd == INT64_MAX) { + // We should always find at least one candidate unless ref_best_rd is less + // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type + // might have failed to find something better) + assert(ref_best_rd != INT64_MAX); + av1_invalid_rd_stats(rd_stats); + return; + } + + // Save the RD search results into tx_rd_record. + if (is_mb_rd_hash_enabled) { + assert(mb_rd_record != NULL); + save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record); + } +} + +// Search for the best transform size and type for current coding block, with +// the assumption that all the transform blocks have a uniform size (VP9 style). +// The selected transform size and type will be saved in the MB_MODE_INFO +// structure; the corresponding RD stats will be saved in rd_stats. +// This function may be used for both intra and inter predicted blocks. +void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bs, + int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(bs == mbmi->sb_type); + const int is_inter = is_inter_block(mbmi); + const int mi_row = xd->mi_row; + const int mi_col = xd->mi_col; + + av1_init_rd_stats(rd_stats); + + // Hashing based speed feature for inter blocks. If the hash of the residue + // block is found in the table, use previously saved search results and + // terminate early. 
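+  // As in the recursive search above, reuse is attempted only for blocks
+  // lying fully inside the current tile (the within_border check below).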
+ uint32_t hash = 0; + MB_RD_RECORD *mb_rd_record = NULL; + const int num_blks = bsize_to_num_blk(bs); + if (is_inter && cpi->sf.rd_sf.use_mb_rd_hash) { + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[bs] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end); + if (within_border) { + hash = get_block_residue_hash(x, bs); + mb_rd_record = &x->mb_rd_record; + const int match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); + if (match_index != -1) { + MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index]; + fetch_tx_rd_info(num_blks, tx_rd_info, rd_stats, x); + return; + } + } + } + + // If we predict that skip is the optimal RD decision - set the respective + // context and terminate early. + int64_t dist; + if (x->predict_skip_level && is_inter && !xd->lossless[mbmi->segment_id] && + predict_skip_flag(x, bs, &dist, + cpi->common.features.reduced_tx_set_used)) { + // Populate rdstats as per skip decision + set_skip_flag(x, rd_stats, bs, dist); + // Save the RD search results into tx_rd_record. + if (mb_rd_record) { + save_tx_rd_info(num_blks, hash, x, rd_stats, mb_rd_record); + } + return; + } + + if (xd->lossless[mbmi->segment_id]) { + // Lossless mode can only pick the smallest (4x4) transform size. + choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); + } else if (x->tx_size_search_method == USE_LARGESTALL) { + choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); + } else { + choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs); + } + + // Save the RD search results into tx_rd_record for possible reuse in future. + if (mb_rd_record) { + save_tx_rd_info(num_blks, hash, x, rd_stats, mb_rd_record); + } +} + +// Calculate the transform coefficient RD cost for the given chroma coding block +// Return value 0: early termination triggered, no valid rd cost available; +// 1: rd cost values are valid. 
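+// Both chroma planes (U and V) are evaluated with the same transform size,
+// av1_get_tx_size(AOM_PLANE_U, xd), and their costs are accumulated into
+// rd_stats.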
+int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int64_t ref_best_rd) { + av1_init_rd_stats(rd_stats); + if (ref_best_rd < 0) return 0; + if (!x->e_mbd.is_chroma_ref) return 1; + + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U]; + const int is_inter = is_inter_block(mbmi); + int64_t this_rd = 0, skip_rd = 0; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + if (is_inter) { + for (int plane = 1; plane < MAX_MB_PLANE; ++plane) + av1_subtract_plane(x, plane_bsize, plane); + } + + const int skip_trellis = 0; + const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); + int is_cost_valid = 1; + for (int plane = 1; plane < MAX_MB_PLANE; ++plane) { + RD_STATS this_rd_stats; + int64_t chroma_ref_best_rd = ref_best_rd; + // For inter blocks, refined ref_best_rd is used for early exit + // For intra blocks, even though current rd crosses ref_best_rd, early + // exit is not recommended as current rd is used for gating subsequent + // modes as well (say, for angular modes) + // TODO(any): Extend the early exit mechanism for intra modes as well + if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && is_inter && + chroma_ref_best_rd != INT64_MAX) + chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_rd); + av1_txfm_rd_in_plane(x, cpi, &this_rd_stats, chroma_ref_best_rd, 0, plane, + plane_bsize, uv_tx_size, + cpi->sf.rd_sf.use_fast_coef_costing, FTXS_NONE, + skip_trellis); + if (this_rd_stats.rate == INT_MAX) { + is_cost_valid = 0; + break; + } + av1_merge_rd_stats(rd_stats, &this_rd_stats); + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse); + if (AOMMIN(this_rd, skip_rd) > ref_best_rd) { + is_cost_valid = 0; + break; + } + } + + if (!is_cost_valid) { + // reset cost value + av1_invalid_rd_stats(rd_stats); + } + + return is_cost_valid; +} + +// Search for the best transform type and calculate the transform coefficients +// RD cost of the current coding block with the specified (uniform) transform +// size and channel. The RD results will be saved in rd_stats. +void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, + RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t current_rd, int plane, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int use_fast_coef_costing, + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) { + assert(IMPLIES(plane == 0, x->e_mbd.mi[0]->tx_size == tx_size)); + + if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) { + av1_invalid_rd_stats(rd_stats); + return; + } + + if (current_rd > ref_best_rd) { + av1_invalid_rd_stats(rd_stats); + return; + } + + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + struct rdcost_block_args args; + av1_zero(args); + args.x = x; + args.cpi = cpi; + args.best_rd = ref_best_rd; + args.current_rd = current_rd; + args.use_fast_coef_costing = use_fast_coef_costing; + args.ftxs_mode = ftxs_mode; + args.skip_trellis = skip_trellis; + av1_init_rd_stats(&args.rd_stats); + + av1_get_entropy_contexts(plane_bsize, pd, args.t_above, args.t_left); + av1_foreach_transformed_block_in_plane(xd, plane_bsize, plane, block_rd_txfm, + &args); + + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_inter = is_inter_block(mbmi); + const int invalid_rd = is_inter ? 
args.incomplete_exit : args.exit_early; + + if (invalid_rd) { + av1_invalid_rd_stats(rd_stats); + } else { + *rd_stats = args.rd_stats; + } +} + +// This function combines y and uv planes' transform search processes together +// for inter-predicted blocks (including IntraBC), when the prediction is +// already generated. It first does subtraction to obtain the prediction error. +// Then it calls +// av1_pick_recursive_tx_size_type_yrd/av1_pick_uniform_tx_size_type_yrd and +// av1_txfm_uvrd sequentially and handles the early terminations +// happening in those functions. At the end, it computes the +// rd_stats/_y/_uv accordingly. +int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd) { + MACROBLOCKD *const xd = &x->e_mbd; + const int skip_ctx = av1_get_skip_context(xd); + const int skip_flag_cost[2] = { x->skip_cost[skip_ctx][0], + x->skip_cost[skip_ctx][1] }; + const int64_t min_header_rate = + mode_rate + AOMMIN(skip_flag_cost[0], skip_flag_cost[1]); + // Account for minimum skip and non_skip rd. + // Eventually either one of them will be added to mode_rate + const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0); + if (min_header_rd_possible > ref_best_rd) { + av1_invalid_rd_stats(rd_stats_y); + return 0; + } + + const AV1_COMMON *cm = &cpi->common; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0); + const int64_t rd_thresh = + ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd; + av1_init_rd_stats(rd_stats); + av1_init_rd_stats(rd_stats_y); + rd_stats->rate = mode_rate; + + // cost and distortion + av1_subtract_plane(x, bsize, 0); + if (x->tx_mode_search_type == TX_MODE_SELECT && + !xd->lossless[mbmi->segment_id]) { + av1_pick_recursive_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); +#if CONFIG_COLLECT_RD_STATS == 2 + PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize); +#endif // CONFIG_COLLECT_RD_STATS == 2 + } else { + av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + for (int i = 0; i < xd->height * xd->width; ++i) + set_blk_skip(x, 0, i, rd_stats_y->skip); + } + + if (rd_stats_y->rate == INT_MAX) return 0; + + av1_merge_rd_stats(rd_stats, rd_stats_y); + + const int64_t non_skip_rdcosty = + RDCOST(x->rdmult, rd_stats->rate + skip_flag_cost[0], rd_stats->dist); + const int64_t skip_rdcosty = + RDCOST(x->rdmult, mode_rate + skip_flag_cost[1], rd_stats->sse); + const int64_t min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty); + if (min_rdcosty > ref_best_rd) { + const int64_t tokenonly_rdy = + AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist), + RDCOST(x->rdmult, 0, rd_stats_y->sse)); + // Invalidate rd_stats_y to skip the rest of the motion modes search + if (tokenonly_rdy - + (tokenonly_rdy >> cpi->sf.inter_sf.prune_motion_mode_level) > + rd_thresh) { + av1_invalid_rd_stats(rd_stats_y); + } + return 0; + } + + av1_init_rd_stats(rd_stats_uv); + const int num_planes = av1_num_planes(cm); + if (num_planes > 1) { + int64_t ref_best_chroma_rd = ref_best_rd; + // Calculate best rd cost possible for chroma + if (cpi->sf.inter_sf.perform_best_rd_based_gating_for_chroma && + (ref_best_chroma_rd != INT64_MAX)) { + ref_best_chroma_rd = + (ref_best_chroma_rd - AOMMIN(non_skip_rdcosty, skip_rdcosty)); + } + const int is_cost_valid_uv = + av1_txfm_uvrd(cpi, x, 
rd_stats_uv, bsize, ref_best_chroma_rd); + if (!is_cost_valid_uv) return 0; + av1_merge_rd_stats(rd_stats, rd_stats_uv); + } + + int choose_skip = rd_stats->skip; + if (!choose_skip && !xd->lossless[mbmi->segment_id]) { + const int64_t rdcost_no_skip = RDCOST( + x->rdmult, rd_stats_y->rate + rd_stats_uv->rate + skip_flag_cost[0], + rd_stats->dist); + const int64_t rdcost_skip = + RDCOST(x->rdmult, skip_flag_cost[1], rd_stats->sse); + if (rdcost_no_skip >= rdcost_skip) choose_skip = 1; + } + if (choose_skip) { + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + rd_stats->rate = mode_rate + skip_flag_cost[1]; + rd_stats->dist = rd_stats->sse; + rd_stats_y->dist = rd_stats_y->sse; + rd_stats_uv->dist = rd_stats_uv->sse; + mbmi->skip = 1; + if (rd_stats->skip) { + const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + if (tmprd > ref_best_rd) return 0; + } + } else { + rd_stats->rate += skip_flag_cost[0]; + mbmi->skip = 0; + } + + return 1; +} diff --git a/libs/libaom/src/av1/encoder/tx_search.h b/libs/libaom/src/av1/encoder/tx_search.h new file mode 100644 index 000000000..82d56719d --- /dev/null +++ b/libs/libaom/src/av1/encoder/tx_search.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ +#define AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ + +#include "av1/common/pred_common.h" +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Set this macro as 1 to collect data about tx size selection. 
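+// (When enabled, the collected statistics are written to the file named by
+// av1_tx_size_data_output_file below.)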
+#define COLLECT_TX_SIZE_DATA 0 + +#if COLLECT_TX_SIZE_DATA +static const char av1_tx_size_data_output_file[] = "tx_size_data.txt"; +#endif + +enum { + FTXS_NONE = 0, + FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0, + FTXS_DISABLE_TRELLIS_OPT = 1 << 1, + FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 +} UENUM1BYTE(FAST_TX_SEARCH_MODE); + +static AOM_INLINE int tx_size_cost(const MACROBLOCK *const x, BLOCK_SIZE bsize, + TX_SIZE tx_size) { + assert(bsize == x->e_mbd.mi[0]->sb_type); + if (x->tx_mode_search_type != TX_MODE_SELECT || !block_signals_txsize(bsize)) + return 0; + + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(tx_size, bsize); + const MACROBLOCKD *const xd = &x->e_mbd; + const int tx_size_ctx = get_tx_size_context(xd); + return x->tx_size_cost[tx_size_cat][tx_size_ctx][depth]; +} + +int64_t av1_uniform_txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, int64_t ref_best_rd, + BLOCK_SIZE bs, TX_SIZE tx_size, + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis); + +void av1_pick_recursive_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int64_t ref_best_rd); + +void av1_pick_uniform_tx_size_type_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bs, + int64_t ref_best_rd); + +int av1_txfm_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, + BLOCK_SIZE bsize, int64_t ref_best_rd); + +void av1_txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, + RD_STATS *rd_stats, int64_t ref_best_rd, + int64_t this_rd, int plane, BLOCK_SIZE plane_bsize, + TX_SIZE tx_size, int use_fast_coef_costing, + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis); + +int av1_txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int mode_rate, int64_t ref_best_rd); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TRANSFORM_SEARCH_H_ diff --git a/libs/libaom/src/av1/encoder/use_flat_gop_model_params.h b/libs/libaom/src/av1/encoder/use_flat_gop_model_params.h new file mode 100644 index 000000000..cf0776644 --- /dev/null +++ b/libs/libaom/src/av1/encoder/use_flat_gop_model_params.h @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_ +#define AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "av1/encoder/ml.h" + +// A binary classifier that returns true (score > 0) if it is better to use a +// flat GOP structure, rather than a GOP structure that uses ALT-REFs and +// internal ARFs. 
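+//
+// For reference, a sketch of how the tables below yield a score, assuming the
+// standard fully-connected forward pass implemented by av1_nn_predict() in
+// av1/encoder/ml.c (weights stored as weights[node * num_inputs + input],
+// ReLU on the hidden layer, no activation on the output layer):
+//
+//   hidden[n] = max(0.f, biases_layer0[n] +
+//                   sum_f weights_layer0[n * NUM_FEATURES + f] * feature[f])
+//   score = biases_layer1[0] + sum_n weights_layer1[n] * hidden[n]
+//
+// A flat GOP structure is preferred when score > 0.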
+ +#define NUM_FEATURES 21 +#define NUM_HIDDEN_LAYERS 1 +#define NUM_HIDDEN_NODES_LAYER0 48 +#define NUM_LABELS 1 + +static const float + av1_use_flat_gop_nn_weights_layer0[NUM_FEATURES * + NUM_HIDDEN_NODES_LAYER0] = { + 0.3801f, -2.1832f, 1.7469f, 2.0130f, 2.1264f, -0.7293f, -0.2814f, + 0.0692f, -4.6589f, -1.4591f, 0.3023f, -0.4310f, -0.1911f, -0.8284f, + -1.3322f, -0.4621f, -0.1148f, -0.3531f, -0.0794f, -0.3114f, -0.1664f, + -0.1615f, 0.2913f, -0.0394f, -0.0620f, 0.1845f, 0.0204f, -0.2124f, + -0.1233f, -0.1685f, 0.1215f, -0.2372f, -0.2865f, -0.1976f, 0.2137f, + -0.1318f, -0.0324f, 0.0415f, -0.1172f, 0.1077f, -0.1135f, -0.2462f, + -0.0743f, -0.1584f, -0.3267f, -0.0566f, -0.1615f, -0.3931f, -0.5200f, + -0.1786f, -0.1811f, -0.2812f, -0.1986f, -0.4393f, -0.3941f, -0.2500f, + -0.2029f, -0.4605f, -0.4973f, -0.2238f, -0.2599f, -0.1951f, -0.2034f, + -0.3186f, -0.1368f, -0.5076f, -0.4718f, -0.1815f, -0.3338f, -0.0550f, + -0.3920f, -0.5328f, -0.1658f, -0.2194f, -0.2867f, -0.0916f, -0.1678f, + -0.1760f, -0.5055f, -0.2322f, -0.4668f, -0.0121f, -0.3903f, -0.2721f, + -0.1306f, 0.1199f, 0.2894f, 0.1098f, -0.0155f, -0.0844f, 0.0421f, + -0.2364f, -0.1073f, -0.0878f, -0.2146f, -0.1713f, -0.2283f, 0.0342f, + 0.0394f, -0.2808f, -0.0048f, 0.2640f, -0.1371f, 0.1709f, 0.0155f, + -0.3614f, -0.1843f, -0.3215f, -0.3121f, -0.2609f, -0.0254f, -0.2474f, + -0.4674f, -0.3674f, -0.2076f, 0.0149f, -0.3304f, -0.2678f, -0.0465f, + -0.1326f, -0.4504f, -0.5101f, -0.1280f, -0.0416f, -0.4296f, -0.4568f, + -0.6762f, -2.8105f, 0.7249f, 1.4288f, 1.3731f, 0.3034f, 0.1841f, + -0.0912f, -0.1508f, 1.2637f, -0.2009f, 0.3236f, -0.2500f, -0.0736f, + 0.8655f, -0.2599f, 0.1150f, -0.0368f, -0.1122f, -0.7650f, -0.2004f, + -0.0891f, -0.3832f, -0.2576f, -0.3532f, -0.1735f, -0.4018f, -0.0265f, + -0.2988f, 0.2555f, -0.1041f, -0.3391f, -0.5316f, -0.0171f, -0.3232f, + -0.0565f, -0.3359f, -0.1842f, -0.0582f, 0.0073f, -0.0278f, -0.5517f, + 0.0892f, -0.1354f, 0.0548f, -0.0401f, -0.1697f, 0.0432f, 0.0832f, + -0.3538f, 0.2602f, -0.0066f, -0.2130f, -0.3085f, 0.0025f, 0.2464f, + -0.0103f, -0.3082f, -0.1136f, -0.2359f, -0.3421f, 0.1335f, -0.3016f, + -1.0355f, -1.0572f, -0.3316f, -0.1235f, -0.3730f, -0.1751f, -0.1921f, + 0.0031f, -0.6297f, -0.5179f, 0.1082f, -0.3130f, -0.1120f, -0.5430f, + -0.1782f, 0.0534f, -0.1052f, 0.1471f, -0.7156f, -0.5453f, -0.5437f, + 1.8709f, 1.9696f, -1.0343f, -0.3150f, -0.8399f, -0.0052f, -0.1123f, + -0.1059f, 0.6755f, 1.2593f, -0.2512f, -0.2053f, 0.0835f, 0.3261f, + -0.0172f, 0.1230f, -0.3687f, 0.1993f, 0.9390f, -0.0165f, 0.6856f, + -0.4372f, -0.4041f, -0.2869f, -0.3871f, -0.3587f, -0.2418f, 0.0518f, + 0.0110f, -1.4713f, -0.1307f, -0.3246f, -0.5091f, -0.4652f, -0.4288f, + -0.0763f, -0.1755f, 0.0662f, -0.3026f, -0.4462f, -0.4123f, -0.2891f, + -0.2251f, -0.4925f, -0.3820f, -0.1840f, -0.2878f, -0.1973f, -0.1010f, + -0.1622f, -0.3108f, -0.5292f, -0.1017f, -0.0607f, -0.2426f, -0.6406f, + -0.3834f, -0.2313f, -0.2433f, -0.1773f, -0.1581f, -0.3295f, -0.3799f, + -0.4447f, -0.2389f, -0.4231f, -0.1498f, -0.0181f, -0.4429f, -0.3515f, + 0.0425f, -0.5280f, -0.3462f, -0.3659f, 0.0153f, -0.1002f, -0.5057f, + -0.2134f, -0.2859f, -0.1988f, -0.4758f, 0.0967f, -0.4784f, 0.1868f, + -0.4387f, -1.3376f, -0.4452f, 0.3837f, 0.1698f, -0.7076f, -0.4320f, + 0.0382f, -1.8053f, -0.6589f, 0.1406f, -0.4340f, 0.0641f, -0.2558f, + -0.4496f, -0.5003f, -0.6241f, -0.2217f, -0.8312f, -0.6793f, -0.3563f, + 0.5153f, -0.7851f, 1.0570f, 0.9702f, 0.5238f, -0.6932f, -0.4443f, + 0.0407f, -3.0961f, -0.8461f, 0.0562f, -0.0642f, 0.2471f, -0.5911f, + -0.7715f, -0.1574f, 
-0.0375f, -0.1951f, -0.3097f, -0.2040f, 0.0128f, + -0.0918f, -0.0698f, -0.0970f, -0.2946f, -0.1723f, -0.2569f, -0.4382f, + -0.5174f, -0.2058f, -0.2973f, -0.0858f, -0.2526f, -0.2648f, -0.2339f, + -0.3474f, 0.0607f, 0.0272f, -0.3142f, -0.1306f, -0.4938f, -0.1894f, + -0.0551f, -0.1061f, -0.1613f, -0.1942f, 0.0590f, -0.2009f, -0.1286f, + -0.2035f, -0.0393f, -0.0650f, -0.1110f, 0.0123f, -0.1122f, -0.0246f, + -0.2042f, 0.0411f, -0.2771f, -0.0189f, 0.0927f, 0.0286f, -0.1559f, + -0.3217f, -0.1039f, 0.1471f, 0.2489f, 0.2085f, -0.4199f, -0.2404f, + 0.0358f, -0.7567f, -0.2413f, -0.3437f, -0.2433f, -0.3687f, -0.1194f, + -0.4289f, -0.1138f, -0.0721f, -0.3461f, -0.0244f, -0.3530f, -0.2842f, + -0.3823f, -0.1238f, -0.5475f, -0.2688f, -0.0073f, 0.0491f, -0.4500f, + 0.0201f, 0.0303f, -0.2160f, -0.4219f, -0.4831f, -0.4593f, -0.2304f, + -0.2082f, -0.0367f, -0.5226f, -0.0082f, -0.1867f, -0.1812f, -0.2753f, + 2.6650f, 1.9698f, -2.9425f, 1.2119f, 1.5000f, 0.3356f, 0.3905f, + -0.2006f, -1.4038f, -1.0917f, 0.1423f, -0.3528f, 0.0888f, 0.5802f, + 1.0977f, 0.1083f, -0.0693f, -0.0784f, 0.4247f, 0.4108f, 0.4970f, + -0.7290f, -0.1659f, -0.0517f, 0.0776f, -0.0550f, -0.2374f, -0.4245f, + -0.0165f, -0.6804f, -0.3211f, -0.3101f, -0.1883f, -0.0786f, -0.3971f, + -0.4130f, -0.0606f, 0.1432f, -0.0518f, -0.4179f, -0.4949f, -0.3451f, + -0.7559f, -4.0792f, 1.5526f, 0.2824f, 0.6086f, -0.2148f, 0.0959f, + 0.0506f, -5.5176f, -3.9702f, 0.1597f, -0.1760f, -0.0627f, 0.1657f, + -1.2996f, -0.2899f, -0.0600f, -0.0531f, -1.5160f, -0.4837f, -1.6961f, + -0.1134f, -0.1838f, -0.3071f, -0.4215f, -0.4184f, 0.0192f, -0.2128f, + -0.3094f, -0.2607f, -0.4855f, -0.1881f, 0.0258f, -0.5085f, -0.3630f, + -0.4824f, -0.3762f, -0.3324f, -0.1134f, -0.3350f, 0.0217f, -0.2803f, + -0.5669f, -0.5674f, -0.5441f, -0.5965f, -0.3062f, -0.4666f, -0.4079f, + -0.0065f, -0.7566f, -0.3437f, -0.2474f, -0.2360f, -0.5683f, -0.3853f, + -0.6670f, -0.4158f, -0.2831f, -0.3327f, -0.7419f, -0.6481f, -0.4004f, + -0.4025f, -0.6405f, -0.4265f, -0.0167f, 0.3195f, -0.0822f, -0.4350f, + -0.0032f, -1.0448f, -0.4407f, 0.0488f, 0.0776f, -0.3828f, -0.3380f, + -0.2983f, -0.2220f, -0.4105f, -0.2312f, -0.4166f, -0.3258f, -0.1424f, + -0.6588f, -0.9433f, 0.3402f, 0.5800f, 0.6368f, -0.4298f, -0.5743f, + 0.0822f, -1.0843f, -0.1645f, -0.1990f, 0.0255f, -0.1039f, -0.3673f, + 0.4367f, -0.5491f, -0.0932f, -0.0323f, -0.2405f, -0.2922f, -0.4019f, + -0.4936f, -1.2338f, 0.4681f, 0.7454f, 0.8181f, -0.3680f, -0.1613f, + -0.0008f, -1.3326f, -0.0667f, 0.1569f, -0.0978f, -0.3229f, -0.4222f, + 0.0330f, 0.1064f, -0.1325f, 0.0121f, -0.3976f, -0.2254f, -0.3942f, + -0.4771f, -0.1887f, 0.1020f, 0.3331f, 0.3098f, -0.1256f, -0.4736f, + 0.0295f, -0.3919f, -0.0931f, -0.2484f, -0.4629f, -0.2800f, -0.2851f, + -0.2243f, -0.3958f, -0.3053f, -0.6585f, -0.1159f, -0.2330f, -0.1989f, + 0.2273f, 0.1963f, 0.0283f, 0.0198f, -0.1298f, -0.0627f, -0.2753f, + -0.1552f, 0.2734f, -0.0551f, -0.2927f, -0.3772f, -0.4522f, -0.0786f, + 0.0079f, 0.1664f, -0.0228f, -0.2908f, -0.1714f, 0.1223f, -0.0680f, + -0.5048f, -0.0852f, -0.4653f, -0.5142f, -0.1818f, -0.1659f, 0.0678f, + -0.1296f, 0.0295f, -0.3487f, -0.1224f, -0.2690f, -0.3217f, -0.1957f, + -0.3196f, -0.4530f, -0.1746f, -0.2307f, -0.0504f, -0.0131f, -0.4613f, + -0.1476f, -0.5596f, -0.3829f, -0.4302f, -0.2910f, -0.2182f, -0.0811f, + -0.3967f, -0.3912f, -0.0371f, -0.1109f, -0.0793f, -0.2063f, -0.0060f, + -0.0236f, -0.4098f, -0.0276f, -0.3352f, -0.1888f, -0.2439f, -0.3748f, + 0.0371f, 0.8460f, -0.5547f, -1.2680f, -1.1623f, -0.1740f, -0.4815f, + -0.0294f, 4.4764f, 0.3716f, -0.2826f, -0.0549f, 
-0.2937f, 0.0632f, + 0.0686f, -0.4681f, -0.2555f, -0.2427f, -0.2261f, -0.1567f, -0.5199f, + -0.4079f, -0.0801f, -0.2075f, -0.3956f, -0.0307f, -0.3150f, -0.3490f, + -0.0379f, 0.3060f, -0.1775f, -0.1651f, 0.0677f, -0.1947f, 0.0032f, + -0.2014f, -0.1575f, -0.1289f, -0.0250f, -0.0762f, -0.2324f, -0.2895f, + -0.4531f, -0.4601f, -0.1718f, -0.3139f, -0.4350f, 0.0346f, -0.0891f, + -0.1581f, 0.2123f, -0.1074f, 0.0221f, 0.0951f, 0.1161f, 0.0245f, + -0.0701f, -0.1677f, -0.4170f, -0.2214f, -0.3419f, -0.4873f, -0.0701f, + -0.0613f, -0.1031f, 0.0141f, -0.1299f, -0.3953f, -0.2182f, -0.2679f, + -0.0141f, 0.3392f, -0.0722f, -0.2390f, 0.1638f, -0.1596f, -0.1527f, + -0.3581f, -0.4037f, -0.0736f, 0.0397f, -0.1288f, -0.1362f, -0.0249f, + -0.5099f, -0.4040f, -0.1893f, -0.0298f, -0.1332f, -0.1693f, -0.3301f, + -0.1058f, -0.1414f, -0.5737f, -0.2342f, -0.2560f, -0.3834f, -0.0917f, + -0.1334f, -0.5077f, -0.3666f, -0.2515f, -0.4824f, -0.4714f, -0.5723f, + -0.1361f, -0.5244f, -0.2468f, 0.0237f, -0.1862f, -0.3124f, -0.0183f, + -0.4662f, -0.4444f, -0.5400f, -0.1730f, -0.0123f, -0.2134f, -0.1024f, + -0.0172f, -0.4430f, -0.1403f, -0.0751f, -0.2403f, -0.2100f, -0.0678f, + 2.4232f, 1.9825f, 0.1260f, 1.9972f, 2.8061f, 0.3916f, 0.1842f, + -0.2603f, -1.6092f, -1.6037f, 0.1475f, 0.0516f, -0.2593f, 0.0359f, + -0.1802f, 0.0159f, -0.0529f, -0.0983f, 0.7638f, 0.5529f, 0.9662f, + -0.4049f, -0.6372f, 0.4907f, 0.7360f, 0.9271f, -0.6879f, -0.1067f, + 0.0323f, -1.8447f, 0.2176f, -0.1047f, -0.0048f, -0.1031f, -0.7931f, + -0.3059f, -0.4595f, -0.1287f, -0.4031f, 0.1441f, -0.6651f, 0.2530f, + -0.4572f, -0.0614f, 0.0345f, -0.0008f, 0.0333f, -0.3431f, 0.0538f, + -0.2691f, 0.2930f, -0.0820f, -0.0979f, -0.0307f, 0.1713f, 0.0783f, + -0.4337f, -0.2702f, -0.1677f, -0.1719f, -0.4669f, -0.2847f, -0.4495f, + -0.3692f, -0.2641f, -0.2833f, -0.1168f, -0.0523f, -0.2368f, -0.4922f, + -0.3453f, -0.4452f, -0.5212f, 0.0412f, -0.3310f, -0.2656f, -0.4903f, + -0.3854f, -0.1009f, -0.1038f, -0.2350f, -0.4430f, -0.5097f, -0.1755f, + 0.0110f, -0.0712f, -0.0662f, -0.4493f, -0.2111f, -0.3402f, -0.3100f, + -0.2525f, -0.1856f, -0.2689f, -0.4288f, -0.3912f, -0.0754f, -0.5191f, + -0.0747f, -0.0626f, -0.4821f, -0.2014f, -0.3124f, -0.4858f, -0.1896f, + 1.0673f, -0.8529f, 13.7564f, 18.7299f, 19.0062f, -1.1047f, -0.8654f, + 0.1089f, -1.2958f, -0.7793f, 0.0780f, -0.1679f, 0.0054f, -1.2451f, + -0.1287f, 0.0082f, -0.2960f, -0.0442f, 2.3817f, 0.4716f, 1.3862f, + -0.0782f, -0.1871f, -0.2596f, 0.0093f, 0.1451f, -0.1124f, -0.2315f, + -0.2677f, -0.1086f, 0.2216f, 0.2928f, 0.0391f, 0.0372f, -0.2551f, + 0.0552f, -0.1876f, -0.2361f, -0.1889f, -0.0279f, 0.1204f, 0.2016f, + -0.5787f, -0.5830f, 0.0530f, -0.1452f, -0.4899f, -0.2937f, 0.1430f, + -0.2752f, -0.2320f, -0.1908f, -0.5538f, -0.0858f, -0.1378f, -0.1505f, + -0.3908f, -0.4732f, -0.3018f, 0.0244f, -0.2392f, -0.2833f, -0.3997f, + -0.4495f, -0.2570f, -0.3189f, -0.1534f, -0.1040f, -0.5497f, -0.3524f, + -0.2053f, 0.2415f, -0.5027f, 0.0288f, -0.1904f, -0.2183f, -0.1062f, + -0.3560f, 0.0165f, -0.4601f, -0.2144f, -0.0439f, -0.4913f, -0.3160f, + -0.1641f, 0.1010f, -0.1044f, -0.4064f, -0.3580f, -0.4015f, 0.1010f, + -0.1973f, 0.6392f, -0.5177f, -0.0472f, -0.1526f, 0.1533f, -0.0819f, + -0.0252f, -0.0783f, 0.1301f, 0.0158f, -0.2003f, -0.4700f, -0.2329f, + }; + +static const float + av1_use_flat_gop_nn_biases_layer0[NUM_HIDDEN_NODES_LAYER0] = { + -1.113218f, 0.f, -0.268537f, -0.268537f, 0.f, -0.268534f, + -0.40681f, -0.268537f, -0.061835f, -0.614956f, 0.984277f, -0.280228f, + -0.354716f, -0.202312f, -0.772829f, -0.464005f, -0.230795f, 0.f, + 
-0.124187f, -0.265949f, 0.325168f, -0.359008f, -2.455546f, -0.229222f, + -0.692233f, -0.29401f, -0.632682f, -0.479061f, -0.166094f, 0.077291f, + -0.235293f, -0.268537f, 0.167899f, -0.141991f, -0.210089f, -0.177294f, + -0.325401f, -0.268537f, 0.323627f, -0.156593f, -0.218451f, -0.230792f, + -0.268537f, 0.833177f, 0.f, -0.353177f, -0.260953f, -0.209537f, + }; + +static const float + av1_use_flat_gop_nn_weights_layer1[NUM_HIDDEN_NODES_LAYER0 * NUM_LABELS] = { + -0.024695f, 0.146668f, -0.02723f, 0.034577f, -0.255426f, 0.22402f, + -0.112595f, -0.131262f, 0.091164f, -0.045294f, 0.028304f, -0.051683f, + 0.310497f, -0.077786f, -0.047873f, -0.057205f, -0.065119f, 0.227417f, + -0.051126f, -0.137241f, 0.035742f, -0.058992f, -0.021466f, 0.107947f, + -0.077183f, -0.04144f, 0.003568f, -0.027656f, 0.038196f, 0.19684f, + -0.128401f, 0.149629f, 0.024526f, 0.037376f, 0.090752f, -0.061666f, + -0.15743f, 0.057773f, -0.010582f, 0.120997f, 0.060368f, 0.210028f, + -0.192244f, -0.064764f, -0.237655f, 0.1852f, -0.084281f, -0.010434f, + }; + +static const float av1_use_flat_gop_nn_biases_layer1[NUM_LABELS] = { + -0.672434f, +}; + +static const NN_CONFIG av1_use_flat_gop_nn_config = { + NUM_FEATURES, + NUM_LABELS, + NUM_HIDDEN_LAYERS, + { + NUM_HIDDEN_NODES_LAYER0, + }, + { + av1_use_flat_gop_nn_weights_layer0, + av1_use_flat_gop_nn_weights_layer1, + }, + { + av1_use_flat_gop_nn_biases_layer0, + av1_use_flat_gop_nn_biases_layer1, + }, +}; + +#undef NUM_FEATURES +#undef NUM_HIDDEN_LAYERS +#undef NUM_HIDDEN_NODES_LAYER0 +#undef NUM_LABELS + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_ diff --git a/libs/libaom/src/av1/encoder/var_based_part.c b/libs/libaom/src/av1/encoder/var_based_part.c new file mode 100644 index 000000000..e3cb1fa8f --- /dev/null +++ b/libs/libaom/src/av1/encoder/var_based_part.c @@ -0,0 +1,1006 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/reconinter_enc.h"
+
+extern const uint8_t AV1_VAR_OFFS[];
+
+typedef struct {
+  VPVariance *part_variances;
+  VPartVar *split[4];
+} variance_node;
+
+static AOM_INLINE void tree_to_node(void *data, BLOCK_SIZE bsize,
+                                    variance_node *node) {
+  int i;
+  node->part_variances = NULL;
+  switch (bsize) {
+    case BLOCK_128X128: {
+      VP128x128 *vt = (VP128x128 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_64X64: {
+      VP64x64 *vt = (VP64x64 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_32X32: {
+      VP32x32 *vt = (VP32x32 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_16X16: {
+      VP16x16 *vt = (VP16x16 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    case BLOCK_8X8: {
+      VP8x8 *vt = (VP8x8 *)data;
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++)
+        node->split[i] = &vt->split[i].part_variances.none;
+      break;
+    }
+    default: {
+      VP4x4 *vt = (VP4x4 *)data;
+      assert(bsize == BLOCK_4X4);
+      node->part_variances = &vt->part_variances;
+      for (i = 0; i < 4; i++) node->split[i] = &vt->split[i];
+      break;
+    }
+  }
+}
+
+// Set variance values given sum square error, sum error, count.
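+// For reference, get_variance() below turns these raw sums into
+//   variance = 256 * (sum_square_error - sum_error^2 / 2^log2_count)
+//              / 2^log2_count
+// i.e. the population variance of the 2^log2_count accumulated samples,
+// scaled by 256 to retain precision in integer arithmetic.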
+static AOM_INLINE void fill_variance(uint32_t s2, int32_t s, int c, + VPartVar *v) { + v->sum_square_error = s2; + v->sum_error = s; + v->log2_count = c; +} + +static AOM_INLINE void get_variance(VPartVar *v) { + v->variance = + (int)(256 * (v->sum_square_error - + (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> + v->log2_count)) >> + v->log2_count); +} + +static AOM_INLINE void sum_2_variances(const VPartVar *a, const VPartVar *b, + VPartVar *r) { + assert(a->log2_count == b->log2_count); + fill_variance(a->sum_square_error + b->sum_square_error, + a->sum_error + b->sum_error, a->log2_count + 1, r); +} + +static AOM_INLINE void fill_variance_tree(void *data, BLOCK_SIZE bsize) { + variance_node node; + memset(&node, 0, sizeof(node)); + tree_to_node(data, bsize, &node); + sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]); + sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]); + sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]); + sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]); + sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1], + &node.part_variances->none); +} + +static AOM_INLINE void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x, + MACROBLOCKD *const xd, int mi_row, + int mi_col, BLOCK_SIZE bsize) { + if (cpi->common.mi_params.mi_cols > mi_col && + cpi->common.mi_params.mi_rows > mi_row) { + set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd, + mi_row, mi_col); + xd->mi[0]->sb_type = bsize; + } +} + +static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x, + MACROBLOCKD *const xd, + const TileInfo *const tile, void *data, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int64_t threshold, BLOCK_SIZE bsize_min, + int force_split) { + AV1_COMMON *const cm = &cpi->common; + variance_node vt; + const int block_width = mi_size_wide[bsize]; + const int block_height = mi_size_high[bsize]; + + assert(block_height == block_width); + tree_to_node(data, bsize, &vt); + + if (force_split == 1) return 0; + + // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if + // variance is below threshold, otherwise split will be selected. + // No check for vert/horiz split as too few samples for variance. + if (bsize == bsize_min) { + // Variance already computed to set the force_split. + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); + if (mi_col + block_width <= tile->mi_col_end && + mi_row + block_height <= tile->mi_row_end && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + return 1; + } + return 0; + } else if (bsize > bsize_min) { + // Variance already computed to set the force_split. + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); + // For key frame: take split for bsize above 32X32 or very high variance. + if (frame_is_intra_only(cm) && + (bsize > BLOCK_32X32 || + vt.part_variances->none.variance > (threshold << 4))) { + return 0; + } + // If variance is low, take the bsize (no split). + if (mi_col + block_width <= tile->mi_col_end && + mi_row + block_height <= tile->mi_row_end && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + return 1; + } + // Check vertical split. 
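+    // Reached only when the no-split test above failed: PARTITION_VERT is
+    // taken if both halves fall below the variance threshold and the subsize
+    // maps to a valid chroma block size.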
+ if (mi_row + block_height <= tile->mi_row_end && + mi_col + block_width / 2 <= tile->mi_col_end) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT); + get_variance(&vt.part_variances->vert[0]); + get_variance(&vt.part_variances->vert[1]); + if (vt.part_variances->vert[0].variance < threshold && + vt.part_variances->vert[1].variance < threshold && + get_plane_block_size(subsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y) < BLOCK_INVALID) { + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize); + return 1; + } + } + // Check horizontal split. + if (mi_col + block_width <= tile->mi_col_end && + mi_row + block_height / 2 <= tile->mi_row_end) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); + get_variance(&vt.part_variances->horz[0]); + get_variance(&vt.part_variances->horz[1]); + if (vt.part_variances->horz[0].variance < threshold && + vt.part_variances->horz[1].variance < threshold && + get_plane_block_size(subsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y) < BLOCK_INVALID) { + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize); + return 1; + } + } + return 0; + } + return 0; +} + +static AOM_INLINE void fill_variance_8x8avg(const uint8_t *s, int sp, + const uint8_t *d, int dp, + int x16_idx, int y16_idx, + VP16x16 *vst, +#if CONFIG_AV1_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high, + int is_key_frame) { + int k; + for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + unsigned int sse = 0; + int sum = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { + int s_avg; + int d_avg = 128; +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + s_avg = aom_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp); + if (!is_key_frame) + d_avg = aom_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp); + } else { + s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp); + if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp); + } +#else + s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp); + if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp); +#endif + sum = s_avg - d_avg; + sse = sum * sum; + } + fill_variance(sse, sum, 0, &vst->split[k].part_variances.none); + } +} + +static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d, + int dp, int x16_idx, int y16_idx, +#if CONFIG_AV1_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high) { + int k; + int minmax_max = 0; + int minmax_min = 255; + // Loop over the 4 8x8 subblocks. 
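+  // For each in-bounds subblock, track the dynamic range (max - min) of the
+  // source/prediction difference; the return value is the spread between the
+  // largest and smallest of these subblock ranges.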
+ for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + int min = 0; + int max = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + aom_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp, + d + y8_idx * dp + x8_idx, dp, &min, &max); + } else { + aom_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, + dp, &min, &max); + } +#else + aom_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp, + &min, &max); +#endif + if ((max - min) > minmax_max) minmax_max = (max - min); + if ((max - min) < minmax_min) minmax_min = (max - min); + } + } + return (minmax_max - minmax_min); +} + +static AOM_INLINE void fill_variance_4x4avg(const uint8_t *s, int sp, + const uint8_t *d, int dp, + int x8_idx, int y8_idx, VP8x8 *vst, +#if CONFIG_AV1_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, int pixels_high, + int is_key_frame) { + int k; + for (k = 0; k < 4; k++) { + int x4_idx = x8_idx + ((k & 1) << 2); + int y4_idx = y8_idx + ((k >> 1) << 2); + unsigned int sse = 0; + int sum = 0; + if (x4_idx < pixels_wide && y4_idx < pixels_high) { + int s_avg; + int d_avg = 128; +#if CONFIG_AV1_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + s_avg = aom_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) + d_avg = aom_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp); + } else { + s_avg = aom_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) d_avg = aom_avg_4x4(d + y4_idx * dp + x4_idx, dp); + } +#else + s_avg = aom_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) d_avg = aom_avg_4x4(d + y4_idx * dp + x4_idx, dp); +#endif + + sum = s_avg - d_avg; + sse = sum * sum; + } + fill_variance(sse, sum, 0, &vst->split[k].part_variances.none); + } +} + +// TODO(kyslov) Bring back threshold adjustment based on content state +static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, + int width, int height, + int content_state) { + (void)width; + (void)height; + (void)content_state; + if (speed >= 8) { + return (5 * threshold_base) >> 2; + } + return threshold_base; +} + +// Set the variance split thresholds for following the block sizes: +// 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32, +// 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to 4x4 partition) is +// currently only used on key frame. +static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], + int q, int content_state) { + AV1_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + const int threshold_multiplier = is_key_frame ? 40 : 1; + int64_t threshold_base = + (int64_t)(threshold_multiplier * + cpi->enc_quant_dequant_params.dequants.y_dequant_QTX[q][1]); + + if (is_key_frame) { + thresholds[0] = threshold_base; + thresholds[1] = threshold_base; + thresholds[2] = threshold_base >> 2; + thresholds[3] = threshold_base >> 2; + thresholds[4] = threshold_base << 2; + } else { + // Increase base variance threshold based on content_state/sum_diff level. 
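+    // (scale_part_thresh_sumdiff() above currently scales the base threshold
+    // by 5/4 at speed >= 8 and leaves it unchanged otherwise.)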
+ threshold_base = scale_part_thresh_sumdiff( + threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state); + + thresholds[0] = threshold_base >> 1; + thresholds[1] = threshold_base; + thresholds[3] = threshold_base << cpi->oxcf.speed; + if (cm->width >= 1280 && cm->height >= 720) + thresholds[3] = thresholds[3] << 1; + if (cm->width * cm->height <= 352 * 288) { + int last_qindex = cpi->rc.last_q[INTER_FRAME]; + if (last_qindex >= QINDEX_HIGH_THR) { + threshold_base = (5 * threshold_base) >> 1; + thresholds[1] = threshold_base >> 3; + thresholds[2] = threshold_base << 2; + thresholds[3] = threshold_base << 5; + } else if (last_qindex < QINDEX_LOW_THR) { + thresholds[1] = threshold_base >> 3; + thresholds[2] = threshold_base >> 1; + thresholds[3] = threshold_base << 3; + } else { + int64_t qi_diff_low = last_qindex - QINDEX_LOW_THR; + int64_t qi_diff_high = QINDEX_HIGH_THR - last_qindex; + int64_t threshold_diff = QINDEX_HIGH_THR - QINDEX_LOW_THR; + int64_t threshold_base_high = (5 * threshold_base) >> 1; + + threshold_diff = threshold_diff > 0 ? threshold_diff : 1; + threshold_base = (qi_diff_low * threshold_base_high + + qi_diff_high * threshold_base) / + threshold_diff; + thresholds[1] = threshold_base >> 3; + thresholds[2] = ((qi_diff_low * threshold_base) + + qi_diff_high * (threshold_base >> 1)) / + threshold_diff; + thresholds[3] = ((qi_diff_low * (threshold_base << 5)) + + qi_diff_high * (threshold_base << 3)) / + threshold_diff; + } + } else if (cm->width < 1280 && cm->height < 720) { + thresholds[2] = (5 * threshold_base) >> 2; + } else if (cm->width < 1920 && cm->height < 1080) { + thresholds[2] = threshold_base << 1; + } else { + thresholds[2] = (5 * threshold_base) >> 1; + } + } +} + +// Set temporal variance low flag for superblock 64x64. +// Only first 25 in the array are used in this case. +static AOM_INLINE void set_low_temp_var_flag_64x64( + CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd, + VP64x64 *vt, const int64_t thresholds[], int mi_col, int mi_row) { + if (xd->mi[0]->sb_type == BLOCK_64X64) { + if ((vt->part_variances).none.variance < (thresholds[0] >> 1)) + x->variance_low[0] = 1; + } else if (xd->mi[0]->sb_type == BLOCK_64X32) { + for (int i = 0; i < 2; i++) { + if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2)) + x->variance_low[i + 1] = 1; + } + } else if (xd->mi[0]->sb_type == BLOCK_32X64) { + for (int i = 0; i < 2; i++) { + if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2)) + x->variance_low[i + 3] = 1; + } + } else { + static const int idx[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } }; + for (int i = 0; i < 4; i++) { + const int idx_str = + mi_params->mi_stride * (mi_row + idx[i][0]) + mi_col + idx[i][1]; + MB_MODE_INFO **this_mi = mi_params->mi_grid_base + idx_str; + + if (mi_params->mi_cols <= mi_col + idx[i][1] || + mi_params->mi_rows <= mi_row + idx[i][0]) + continue; + + if (*this_mi == NULL) continue; + + if ((*this_mi)->sb_type == BLOCK_32X32) { + int64_t threshold_32x32 = (5 * thresholds[1]) >> 3; + if (vt->split[i].part_variances.none.variance < threshold_32x32) + x->variance_low[i + 5] = 1; + } else { + // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block + // inside. 
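+        // (One low-variance flag per 16x16 sub-block, gated on the much
+        // stricter cutoff thresholds[2] >> 8.)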
+ if ((*this_mi)->sb_type == BLOCK_16X16 || + (*this_mi)->sb_type == BLOCK_32X16 || + (*this_mi)->sb_type == BLOCK_16X32) { + for (int j = 0; j < 4; j++) { + if (vt->split[i].split[j].part_variances.none.variance < + (thresholds[2] >> 8)) + x->variance_low[(i << 2) + j + 9] = 1; + } + } + } + } + } +} + +static AOM_INLINE void set_low_temp_var_flag_128x128( + CommonModeInfoParams *mi_params, MACROBLOCK *x, MACROBLOCKD *xd, + VP128x128 *vt, const int64_t thresholds[], int mi_col, int mi_row) { + if (xd->mi[0]->sb_type == BLOCK_128X128) { + if (vt->part_variances.none.variance < (thresholds[0] >> 1)) + x->variance_low[0] = 1; + } else if (xd->mi[0]->sb_type == BLOCK_128X64) { + for (int i = 0; i < 2; i++) { + if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2)) + x->variance_low[i + 1] = 1; + } + } else if (xd->mi[0]->sb_type == BLOCK_64X128) { + for (int i = 0; i < 2; i++) { + if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2)) + x->variance_low[i + 3] = 1; + } + } else { + static const int idx64[4][2] = { + { 0, 0 }, { 0, 16 }, { 16, 0 }, { 16, 16 } + }; + static const int idx32[4][2] = { { 0, 0 }, { 0, 8 }, { 8, 0 }, { 8, 8 } }; + for (int i = 0; i < 4; i++) { + const int idx_str = + mi_params->mi_stride * (mi_row + idx64[i][0]) + mi_col + idx64[i][1]; + MB_MODE_INFO **mi_64 = mi_params->mi_grid_base + idx_str; + if (*mi_64 == NULL) continue; + if (mi_params->mi_cols <= mi_col + idx64[i][1] || + mi_params->mi_rows <= mi_row + idx64[i][0]) + continue; + const int64_t threshold_64x64 = (5 * thresholds[1]) >> 3; + if ((*mi_64)->sb_type == BLOCK_64X64) { + if (vt->split[i].part_variances.none.variance < threshold_64x64) + x->variance_low[5 + i] = 1; + } else if ((*mi_64)->sb_type == BLOCK_64X32) { + for (int j = 0; j < 2; j++) + if (vt->split[i].part_variances.horz[j].variance < + (threshold_64x64 >> 1)) + x->variance_low[9 + (i << 1) + j] = 1; + } else if ((*mi_64)->sb_type == BLOCK_32X64) { + for (int j = 0; j < 2; j++) + if (vt->split[i].part_variances.vert[j].variance < + (threshold_64x64 >> 1)) + x->variance_low[17 + (i << 1) + j] = 1; + } else { + for (int k = 0; k < 4; k++) { + const int idx_str1 = mi_params->mi_stride * idx32[k][0] + idx32[k][1]; + MB_MODE_INFO **mi_32 = mi_params->mi_grid_base + idx_str + idx_str1; + if (*mi_32 == NULL) continue; + + if (mi_params->mi_cols <= mi_col + idx64[i][1] + idx32[k][1] || + mi_params->mi_rows <= mi_row + idx64[i][0] + idx32[k][0]) + continue; + const int64_t threshold_32x32 = (5 * thresholds[2]) >> 3; + if ((*mi_32)->sb_type == BLOCK_32X32) { + if (vt->split[i].split[k].part_variances.none.variance < + threshold_32x32) + x->variance_low[25 + (i << 2) + k] = 1; + } else { + // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block + // inside. + if ((*mi_32)->sb_type == BLOCK_16X16 || + (*mi_32)->sb_type == BLOCK_32X16 || + (*mi_32)->sb_type == BLOCK_16X32) { + for (int j = 0; j < 4; j++) { + if (vt->split[i] + .split[k] + .split[j] + .part_variances.none.variance < (thresholds[3] >> 8)) + x->variance_low[41 + (i << 4) + (k << 2) + j] = 1; + } + } + } + } + } + } + } +} + +static AOM_INLINE void set_low_temp_var_flag( + AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, VP128x128 *vt, + int64_t thresholds[], MV_REFERENCE_FRAME ref_frame_partition, int mi_col, + int mi_row) { + AV1_COMMON *const cm = &cpi->common; + const int mv_thr = cm->width > 640 ? 8 : 4; + // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected and + // int_pro mv is small. 
If the temporal variance is small set the flag + // variance_low for the block. The variance threshold can be adjusted, the + // higher the more aggressive. + if (ref_frame_partition == LAST_FRAME && + (cpi->sf.rt_sf.short_circuit_low_temp_var == 1 || + (cpi->sf.rt_sf.estimate_motion_for_var_based_partition && + xd->mi[0]->mv[0].as_mv.col < mv_thr && + xd->mi[0]->mv[0].as_mv.col > -mv_thr && + xd->mi[0]->mv[0].as_mv.row < mv_thr && + xd->mi[0]->mv[0].as_mv.row > -mv_thr))) { + const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + if (is_small_sb) + set_low_temp_var_flag_64x64(&cm->mi_params, x, xd, &(vt->split[0]), + thresholds, mi_col, mi_row); + else + set_low_temp_var_flag_128x128(&cm->mi_params, x, xd, vt, thresholds, + mi_col, mi_row); + } +} + +void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q, + int content_state) { + SPEED_FEATURES *const sf = &cpi->sf; + if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) { + return; + } else { + set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_state); + // The threshold below is not changed locally. + cpi->vbp_info.threshold_minmax = 15 + (q >> 3); + } +} + +static AOM_INLINE void chroma_check(AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, unsigned int y_sad, + int is_key_frame) { + int i; + MACROBLOCKD *xd = &x->e_mbd; + + if (is_key_frame || cpi->oxcf.monochrome) return; + + for (i = 1; i <= 2; ++i) { + unsigned int uv_sad = UINT_MAX; + struct macroblock_plane *p = &x->plane[i]; + struct macroblockd_plane *pd = &xd->plane[i]; + const BLOCK_SIZE bs = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + + if (bs != BLOCK_INVALID) + uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride); + + x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2); + } +} + +// This function chooses partitioning based on the variance between source and +// reconstructed last, where variance is computed for down-sampled inputs. +// TODO(kyslov): lot of things. Bring back noise estimation, brush up partition +// selection and most of all - retune the thresholds +int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, + ThreadData *td, MACROBLOCK *x, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds; + + int i, j, k, m; + VP128x128 *vt; + VP16x16 *vt2 = NULL; + unsigned char force_split[85]; + int avg_32x32; + int max_var_32x32[4]; + int min_var_32x32[4]; + int var_32x32; + int var_64x64; + int min_var_64x64 = INT_MAX; + int max_var_64x64 = 0; + int avg_16x16[4][4]; + int maxvar_16x16[4][4]; + int minvar_16x16[4][4]; + int64_t threshold_4x4avg; + int content_state = 0; + uint8_t *s; + const uint8_t *d; + int sp; + int dp; + // TODO(kyslov) Bring back compute_minmax_variance with content type detection + int compute_minmax_variance = 0; + int is_key_frame = frame_is_intra_only(cm); + int pixels_wide = 128, pixels_high = 128; + assert(cm->seq_params.sb_size == BLOCK_64X64 || + cm->seq_params.sb_size == BLOCK_128X128); + const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + const int num_64x64_blocks = is_small_sb ? 1 : 4; + + unsigned int y_sad = UINT_MAX; + unsigned int y_sad_g = UINT_MAX; + BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; + + // Ref frame used in partitioning. 
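+  // Defaults to LAST_FRAME; switched to GOLDEN_FRAME below when the golden
+  // reference yields a clearly lower source SAD.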
+ MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME; + + CHECK_MEM_ERROR(cm, vt, aom_malloc(sizeof(*vt))); + + vt->split = td->vt64x64; + + int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1], + vbp_thresholds[2], vbp_thresholds[3], + vbp_thresholds[4] }; + + const int low_res = (cm->width <= 352 && cm->height <= 288); + int variance4x4downsample[64]; + int segment_id; + const int num_planes = av1_num_planes(cm); + + segment_id = xd->mi[0]->segment_id; + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + cyclic_refresh_segment_id_boosted(segment_id) && + cpi->sf.rt_sf.use_nonrd_pick_mode) { + int q = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex); + set_vbp_thresholds(cpi, thresholds, q, content_state); + } else { + set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex, + content_state); + } + + if (is_small_sb) { + pixels_wide = 64; + pixels_high = 64; + } + + // For non keyframes, disable 4x4 average for low resolution when speed = 8 + threshold_4x4avg = INT64_MAX; + + if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); + if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); + + s = x->plane[0].src.buf; + sp = x->plane[0].src.stride; + + // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks, + // 5-20 for the 16x16 blocks. + force_split[0] = 0; + memset(x->variance_low, 0, sizeof(x->variance_low)); + + if (!is_key_frame) { + // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it + // is!! + MB_MODE_INFO *mi = xd->mi[0]; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); + assert(yv12 != NULL); + const YV12_BUFFER_CONFIG *yv12_g = NULL; + + // For non-SVC GOLDEN is another temporal reference. Check if it should be + // used as reference for partitioning. + if (!cpi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) && + cpi->sf.rt_sf.use_nonrd_pick_mode) { + yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + if (yv12_g && yv12_g != yv12) { + av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + get_ref_scale_factors(cm, GOLDEN_FRAME), + num_planes); + y_sad_g = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); + } + } + + av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + get_ref_scale_factors(cm, LAST_FRAME), num_planes); + mi->ref_frame[0] = LAST_FRAME; + mi->ref_frame[1] = NONE_FRAME; + mi->sb_type = cm->seq_params.sb_size; + mi->mv[0].as_int = 0; + mi->interp_filters = av1_broadcast_interp_filter(BILINEAR); + if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) { + if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) { + const MV dummy_mv = { 0, 0 }; + y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size, + mi_row, mi_col, &dummy_mv); + } + } + if (y_sad == UINT_MAX) { + y_sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } + + // Pick the ref frame for partitioning, use golden frame only if its + // lower sad. 
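+    // (The 0.9 weighting below means GOLDEN's SAD must be at least ~10%
+    // lower than LAST's before it is picked.)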
+ if (y_sad_g < 0.9 * y_sad) { + av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes); + mi->ref_frame[0] = GOLDEN_FRAME; + mi->mv[0].as_int = 0; + y_sad = y_sad_g; + ref_frame_partition = GOLDEN_FRAME; + x->nonrd_prune_ref_frame_search = 0; + } else { + x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + ref_frame_partition = LAST_FRAME; + x->nonrd_prune_ref_frame_search = + cpi->sf.rt_sf.nonrd_prune_ref_frame_search; + } + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, + cm->seq_params.sb_size, AOM_PLANE_Y, + AOM_PLANE_Y); + + d = xd->plane[0].dst.buf; + dp = xd->plane[0].dst.stride; + } else { + d = AV1_VAR_OFFS; + dp = 0; + } + + if (low_res && threshold_4x4avg < INT64_MAX) + CHECK_MEM_ERROR(cm, vt2, aom_malloc(sizeof(*vt2))); + // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances + // for splits. + for (m = 0; m < num_64x64_blocks; m++) { + const int x64_idx = ((m & 1) << 6); + const int y64_idx = ((m >> 1) << 6); + const int m2 = m << 2; + force_split[m + 1] = 0; + max_var_32x32[m] = 0; + min_var_32x32[m] = INT_MAX; + for (i = 0; i < 4; i++) { + const int x32_idx = x64_idx + ((i & 1) << 5); + const int y32_idx = y64_idx + ((i >> 1) << 5); + const int i2 = (m2 + i) << 2; + force_split[5 + m2 + i] = 0; + avg_16x16[m][i] = 0; + maxvar_16x16[m][i] = 0; + minvar_16x16[m][i] = INT_MAX; + for (j = 0; j < 4; j++) { + const int x16_idx = x32_idx + ((j & 1) << 4); + const int y16_idx = y32_idx + ((j >> 1) << 4); + const int split_index = 21 + i2 + j; + VP16x16 *vst = &vt->split[m].split[i].split[j]; + force_split[split_index] = 0; + variance4x4downsample[i2 + j] = 0; + if (!is_key_frame) { + fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst, +#if CONFIG_AV1_HIGHBITDEPTH + xd->cur_buf->flags, +#endif + pixels_wide, pixels_high, is_key_frame); + fill_variance_tree(&vt->split[m].split[i].split[j], BLOCK_16X16); + get_variance(&vt->split[m].split[i].split[j].part_variances.none); + avg_16x16[m][i] += + vt->split[m].split[i].split[j].part_variances.none.variance; + if (vt->split[m].split[i].split[j].part_variances.none.variance < + minvar_16x16[m][i]) + minvar_16x16[m][i] = + vt->split[m].split[i].split[j].part_variances.none.variance; + if (vt->split[m].split[i].split[j].part_variances.none.variance > + maxvar_16x16[m][i]) + maxvar_16x16[m][i] = + vt->split[m].split[i].split[j].part_variances.none.variance; + if (vt->split[m].split[i].split[j].part_variances.none.variance > + thresholds[3]) { + // 16X16 variance is above threshold for split, so force split to + // 8x8 for this 16x16 block (this also forces splits for upper + // levels). + force_split[split_index] = 1; + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } else if (compute_minmax_variance && + vt->split[m] + .split[i] + .split[j] + .part_variances.none.variance > thresholds[2] && + !cyclic_refresh_segment_id_boosted(segment_id)) { + // We have some nominal amount of 16x16 variance (based on average), + // compute the minmax over the 8x8 sub-blocks, and if above + // threshold, force split to 8x8 block for this 16x16 block. 
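+          // (The spread is compared against threshold_minmax, set from the
+          // quantizer as 15 + (q >> 3) in
+          // av1_set_variance_partition_thresholds().)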
+ int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx, +#if CONFIG_AV1_HIGHBITDEPTH + xd->cur_buf->flags, +#endif + pixels_wide, pixels_high); + int thresh_minmax = (int)cpi->vbp_info.threshold_minmax; + if (minmax > thresh_minmax) { + force_split[split_index] = 1; + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } + } + } + if (is_key_frame) { + force_split[split_index] = 0; + // Go down to 4x4 down-sampling for variance. + variance4x4downsample[i2 + j] = 1; + for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + VP8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k]; + fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2, +#if CONFIG_AV1_HIGHBITDEPTH + xd->cur_buf->flags, +#endif + pixels_wide, pixels_high, is_key_frame); + } + } + } + } + } + + // Fill the rest of the variance tree by summing split partition values. + for (m = 0; m < num_64x64_blocks; ++m) { + avg_32x32 = 0; + const int m2 = m << 2; + for (i = 0; i < 4; i++) { + const int i2 = (m2 + i) << 2; + for (j = 0; j < 4; j++) { + const int split_index = 21 + i2 + j; + if (variance4x4downsample[i2 + j] == 1) { + VP16x16 *vtemp = + (!is_key_frame) ? &vt2[i2 + j] : &vt->split[m].split[i].split[j]; + for (k = 0; k < 4; k++) + fill_variance_tree(&vtemp->split[k], BLOCK_8X8); + fill_variance_tree(vtemp, BLOCK_16X16); + // If variance of this 16x16 block is above the threshold, force block + // to split. This also forces a split on the upper levels. + get_variance(&vtemp->part_variances.none); + if (vtemp->part_variances.none.variance > thresholds[3]) { + force_split[split_index] = 1; + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } + } + } + fill_variance_tree(&vt->split[m].split[i], BLOCK_32X32); + // If variance of this 32x32 block is above the threshold, or if its above + // (some threshold of) the average variance over the sub-16x16 blocks, + // then force this block to split. This also forces a split on the upper + // (64x64) level. + if (!force_split[5 + m2 + i]) { + get_variance(&vt->split[m].split[i].part_variances.none); + var_32x32 = vt->split[m].split[i].part_variances.none.variance; + max_var_32x32[m] = AOMMAX(var_32x32, max_var_32x32[m]); + min_var_32x32[m] = AOMMIN(var_32x32, min_var_32x32[m]); + if (vt->split[m].split[i].part_variances.none.variance > + thresholds[2] || + (!is_key_frame && + vt->split[m].split[i].part_variances.none.variance > + (thresholds[2] >> 1) && + vt->split[m].split[i].part_variances.none.variance > + (avg_16x16[m][i] >> 1))) { + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } else if (!is_key_frame && cm->height <= 360 && + (maxvar_16x16[m][i] - minvar_16x16[m][i]) > + (thresholds[2] >> 1) && + maxvar_16x16[m][i] > thresholds[2]) { + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } + avg_32x32 += var_32x32; + } + } + if (!force_split[1 + m]) { + fill_variance_tree(&vt->split[m], BLOCK_64X64); + get_variance(&vt->split[m].part_variances.none); + var_64x64 = vt->split[m].part_variances.none.variance; + max_var_64x64 = AOMMAX(var_64x64, max_var_64x64); + min_var_64x64 = AOMMIN(var_64x64, min_var_64x64); + // If variance of this 64x64 block is above (some threshold of) the + // average variance over the sub-32x32 blocks, then force this block to + // split. Only checking this for noise level >= medium for now. 
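+      // (Concretely: split when the spread of the four 32x32 variances
+      // exceeds 3/8 of thresholds[1] and their maximum exceeds
+      // thresholds[1] / 2.)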
+ + if (!is_key_frame && + (max_var_32x32[m] - min_var_32x32[m]) > 3 * (thresholds[1] >> 3) && + max_var_32x32[m] > thresholds[1] >> 1) + force_split[1 + m] = 1; + } + if (is_small_sb) force_split[0] = 1; + } + + if (!force_split[0]) { + fill_variance_tree(vt, BLOCK_128X128); + get_variance(&vt->part_variances.none); + if (!is_key_frame && + (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) && + max_var_64x64 > thresholds[0] >> 1) + force_split[0] = 1; + } + + if (mi_col + 32 > tile->mi_col_end || mi_row + 32 > tile->mi_row_end || + !set_vt_partitioning(cpi, x, xd, tile, vt, BLOCK_128X128, mi_row, mi_col, + thresholds[0], BLOCK_16X16, force_split[0])) { + for (m = 0; m < num_64x64_blocks; ++m) { + const int x64_idx = ((m & 1) << 4); + const int y64_idx = ((m >> 1) << 4); + const int m2 = m << 2; + + // Now go through the entire structure, splitting every block size until + // we get to one that's got a variance lower than our threshold. + if (!set_vt_partitioning(cpi, x, xd, tile, &vt->split[m], BLOCK_64X64, + mi_row + y64_idx, mi_col + x64_idx, + thresholds[1], BLOCK_16X16, + force_split[1 + m])) { + for (i = 0; i < 4; ++i) { + const int x32_idx = ((i & 1) << 3); + const int y32_idx = ((i >> 1) << 3); + const int i2 = (m2 + i) << 2; + if (!set_vt_partitioning(cpi, x, xd, tile, &vt->split[m].split[i], + BLOCK_32X32, (mi_row + y64_idx + y32_idx), + (mi_col + x64_idx + x32_idx), thresholds[2], + BLOCK_16X16, force_split[5 + m2 + i])) { + for (j = 0; j < 4; ++j) { + const int x16_idx = ((j & 1) << 2); + const int y16_idx = ((j >> 1) << 2); + const int split_index = 21 + i2 + j; + // For inter frames: if variance4x4downsample[] == 1 for this + // 16x16 block, then the variance is based on 4x4 down-sampling, + // so use vt2 in set_vt_partioning(), otherwise use vt. + VP16x16 *vtemp = + (!is_key_frame && variance4x4downsample[i2 + j] == 1) + ? &vt2[i2 + j] + : &vt->split[m].split[i].split[j]; + if (!set_vt_partitioning(cpi, x, xd, tile, vtemp, BLOCK_16X16, + mi_row + y64_idx + y32_idx + y16_idx, + mi_col + x64_idx + x32_idx + x16_idx, + thresholds[3], BLOCK_8X8, + force_split[split_index])) { + for (k = 0; k < 4; ++k) { + const int x8_idx = (k & 1) << 1; + const int y8_idx = (k >> 1) << 1; + set_block_size( + cpi, x, xd, + (mi_row + y64_idx + y32_idx + y16_idx + y8_idx), + (mi_col + x64_idx + x32_idx + x16_idx + x8_idx), + BLOCK_8X8); + } + } + } + } + } + } + } + } + + if (cpi->sf.rt_sf.short_circuit_low_temp_var) { + set_low_temp_var_flag(cpi, x, xd, vt, thresholds, ref_frame_partition, + mi_col, mi_row); + } + chroma_check(cpi, x, bsize, y_sad, is_key_frame); + + if (vt2) aom_free(vt2); + if (vt) aom_free(vt); + return 0; +} diff --git a/libs/libaom/src/av1/encoder/var_based_part.h b/libs/libaom/src/av1/encoder/var_based_part.h new file mode 100644 index 000000000..a80e25c32 --- /dev/null +++ b/libs/libaom/src/av1/encoder/var_based_part.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_AV1_ENCODER_VAR_BASED_PART_H_
+#define AOM_AV1_ENCODER_VAR_BASED_PART_H_
+
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define QINDEX_LOW_THR \
+  200  // Use low qindex variance partition thresholds when qindex is below this
+       // threshold
+#define QINDEX_HIGH_THR \
+  220  // Use high qindex variance partition thresholds when qindex is above
+       // this threshold
+
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+                                           int content_state);
+
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                      ThreadData *td, MACROBLOCK *x, int mi_row,
+                                      int mi_col);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_VAR_BASED_PART_H_
diff --git a/libs/libaom/src/av1/encoder/wedge_utils.c b/libs/libaom/src/av1/encoder/wedge_utils.c
new file mode 100644
index 000000000..40670178d
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/wedge_utils.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * Computes SSE of a compound predictor constructed from 2 fundamental
+ * predictors p0 and p1 using blending with mask.
+ *
+ * r1:  Residuals of p1.
+ *      (source - p1)
+ * d:   Difference of p1 and p0.
+ *      (p1 - p0)
+ * m:   The blending mask
+ * N:   Number of pixels
+ *
+ * 'r1', 'd', and 'm' are contiguous.
+ *
+ * Computes:
+ *  Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
+ *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
+ *    where r0 is (source - p0), and r1 is (source - p1), which in turn
+ *    is equivalent to:
+ *  Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
+ *    which is the SSE of the residuals of the compound predictor scaled up by
+ *    MAX_MASK_VALUE**2.
+ *
+ * Note that we clamp the partial term in the loop to 16 bits signed. This is
+ * to facilitate equivalent SIMD implementation. It should have no effect if
+ * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always
+ * holds for 8 bit input, and on real input, it should hold practically always,
+ * as residuals are expected to be small.
+ */
+uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
+                                        const uint8_t *m, int N) {
+  uint64_t csse = 0;
+  int i;
+
+  for (i = 0; i < N; i++) {
+    int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+    t = clamp(t, INT16_MIN, INT16_MAX);
+    csse += t * t;
+  }
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * Choose the mask sign for a compound predictor.
+ *
+ * ds:    Difference of the squares of the residuals.
+ *        r0**2 - r1**2
+ * m:     The blending mask
+ * N:     Number of pixels
+ * limit: Pre-computed threshold value.
+ * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) + * + * 'ds' and 'm' are contiguous. + * + * Returns true if the negated mask has lower SSE compared to the positive + * mask. Computation is based on: + * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2) + * > + * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2) + * + * which can be simplified to: + * + * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) + * + * The right hand side does not depend on the mask, and needs to be passed as + * the 'limit' parameter. + * + * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left + * hand side is simply a scalar product between an int16_t and uint8_t vector. + * + * Note that for efficiency, ds is stored on 16 bits. Real input residuals + * being small, this should not cause a noticeable issue. + */ +int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + int64_t acc = 0; + + do { + acc += *ds++ * *m++; + } while (--N); + + return acc > limit; +} + +/** + * Compute the element-wise difference of the squares of 2 arrays. + * + * d: Difference of the squares of the inputs: a**2 - b**2 + * a: First input array + * b: Second input array + * N: Number of elements + * + * 'd', 'a', and 'b' are contiguous. + * + * The result is saturated to signed 16 bits. + */ +void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + int i; + + for (i = 0; i < N; i++) + d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX); +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c new file mode 100644 index 000000000..62eaa3074 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm1d_sse4.c @@ -0,0 +1,1417 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/encoder/x86/av1_txfm1d_sse4.h" + +void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit, + const int stride) { + __m128i buf0[32]; + __m128i buf1[32]; + const int32_t *cospi; + + int startidx = 0 * stride; + int endidx = 31 * stride; + // stage 0 + // stage 1 + buf1[0] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[31] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[1] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[30] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[2] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[29] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[3] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[28] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[4] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[27] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[5] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[26] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[6] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[25] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[7] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[24] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[8] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[23] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[9] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[22] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[10] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[21] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[11] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[20] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[12] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[19] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[13] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[18] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[14] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[17] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += stride; + endidx -= stride; + buf1[15] = _mm_add_epi32(input[startidx], input[endidx]); + buf1[16] = _mm_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]); + buf0[5] = _mm_add_epi32(buf1[5], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]); + buf0[7] = 
_mm_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + cospi = cospi_arr(cos_bit); + buf1[0] = _mm_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm_add_epi32(buf0[2], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]); + buf1[18] = _mm_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[27]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[7] = buf1[7]; + buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], + cos_bit); 
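+  // Note (assumption, see av1_txfm1d_sse4.h): btf_32_sse4_1_type0 and
+  // btf_32_sse4_1_type1 are the two weight layouts of the paired-cosine
+  // rotation butterfly; both round their results to cos_bit precision.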
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3], + cos_bit); + buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = _mm_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm_add_epi32(buf0[17], buf0[18]); + buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm_add_epi32(buf0[30], buf0[29]); + + // stage 6 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], + cos_bit); + btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm_add_epi32(buf1[12], buf1[13]); + buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + cospi = cospi_arr(cos_bit); + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15], + cos_bit); + btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], + buf1[14], cos_bit); + btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); + buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm_add_epi32(buf0[19], buf0[18]); + buf1[20] = _mm_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm_add_epi32(buf0[23], 
buf0[22]); + buf1[24] = _mm_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]); + buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm_add_epi32(buf0[31], buf0[30]); + + // stage 8 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], + buf0[31], cos_bit); + btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + + startidx = 0 * stride; + endidx = 31 * stride; + // stage 9 + output[startidx] = buf0[0]; + output[endidx] = buf0[31]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[16]; + output[endidx] = buf0[15]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[8]; + output[endidx] = buf0[23]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[24]; + output[endidx] = buf0[7]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[4]; + output[endidx] = buf0[27]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[20]; + output[endidx] = buf0[11]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[12]; + output[endidx] = buf0[19]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[28]; + output[endidx] = buf0[3]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[2]; + output[endidx] = buf0[29]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[18]; + output[endidx] = buf0[13]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[10]; + output[endidx] = buf0[21]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[26]; + output[endidx] = buf0[5]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[6]; + output[endidx] = buf0[25]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[22]; + output[endidx] = buf0[9]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[14]; + output[endidx] = buf0[17]; + startidx += stride; + endidx -= stride; + output[startidx] = buf0[30]; + output[endidx] = buf0[1]; +} + +void av1_fadst4_sse4_1(const __m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 4; + const int num_per_128 = 4; + const int32_t *cospi; + __m128i buf0[4]; + __m128i buf1[4]; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + // 
stage 0; + int32_t stage_idx = 0; + int j; + for (j = 0; j < 4; ++j) { + buf0[j] = input[j * col_num + col]; + } + + // stage 1 + stage_idx++; + buf1[0] = buf0[3]; + buf1[1] = buf0[0]; + buf1[2] = buf0[1]; + buf1[3] = buf0[2]; + + // stage 2 + stage_idx++; + + cospi = cospi_arr(cos_bit); + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], + cos_bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 3 + stage_idx++; + buf1[0] = _mm_add_epi32(buf0[0], buf0[2]); + buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]); + buf1[1] = _mm_add_epi32(buf0[1], buf0[3]); + buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]); + + // stage 4 + stage_idx++; + + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], cos_bit); + + // stage 5 + stage_idx++; + buf1[0] = buf0[0]; + buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]); + buf1[2] = buf0[3]; + buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]); + + for (j = 0; j < 4; ++j) { + output[j * col_num + col] = buf1[j]; + } + } +} + +void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit, + const int instride, const int outstride) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]); + __m128i cospi_p32 = _mm_set1_epi32(cospi[32]); + __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]); + __m128i cospi_p48 = _mm_set1_epi32(cospi[48]); + __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]); + __m128i cospi_p16 = _mm_set1_epi32(cospi[16]); + __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]); + __m128i cospi_p56 = _mm_set1_epi32(cospi[56]); + __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]); + __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]); + __m128i cospi_p24 = _mm_set1_epi32(cospi[24]); + __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]); + __m128i cospi_p08 = _mm_set1_epi32(cospi[8]); + __m128i cospi_p40 = _mm_set1_epi32(cospi[40]); + __m128i cospi_p60 = _mm_set1_epi32(cospi[60]); + __m128i cospi_p04 = _mm_set1_epi32(cospi[4]); + __m128i cospi_p28 = _mm_set1_epi32(cospi[28]); + __m128i cospi_p36 = _mm_set1_epi32(cospi[36]); + __m128i cospi_p44 = _mm_set1_epi32(cospi[44]); + __m128i cospi_p20 = _mm_set1_epi32(cospi[20]); + __m128i cospi_p12 = _mm_set1_epi32(cospi[12]); + __m128i cospi_p52 = _mm_set1_epi32(cospi[52]); + __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]); + __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]); + __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]); + __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]); + __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]); + __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]); + __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]); + __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]); + __m128i cospi_p62 = _mm_set1_epi32(cospi[62]); + __m128i cospi_p02 = _mm_set1_epi32(cospi[2]); + __m128i cospi_p30 = _mm_set1_epi32(cospi[30]); + __m128i cospi_p34 = _mm_set1_epi32(cospi[34]); + __m128i cospi_p46 = _mm_set1_epi32(cospi[46]); + __m128i cospi_p18 = _mm_set1_epi32(cospi[18]); + __m128i cospi_p14 = _mm_set1_epi32(cospi[14]); + __m128i cospi_p50 = _mm_set1_epi32(cospi[50]); + __m128i cospi_p54 = _mm_set1_epi32(cospi[54]); + __m128i cospi_p10 = _mm_set1_epi32(cospi[10]); + __m128i cospi_p22 = _mm_set1_epi32(cospi[22]); + __m128i cospi_p42 = _mm_set1_epi32(cospi[42]); + __m128i cospi_p38 = _mm_set1_epi32(cospi[38]); + __m128i cospi_p26 = _mm_set1_epi32(cospi[26]); + __m128i 
cospi_p06 = _mm_set1_epi32(cospi[6]); + __m128i cospi_p58 = _mm_set1_epi32(cospi[58]); + __m128i cospi_p63 = _mm_set1_epi32(cospi[63]); + __m128i cospi_p01 = _mm_set1_epi32(cospi[1]); + __m128i cospi_p31 = _mm_set1_epi32(cospi[31]); + __m128i cospi_p33 = _mm_set1_epi32(cospi[33]); + __m128i cospi_p47 = _mm_set1_epi32(cospi[47]); + __m128i cospi_p17 = _mm_set1_epi32(cospi[17]); + __m128i cospi_p15 = _mm_set1_epi32(cospi[15]); + __m128i cospi_p49 = _mm_set1_epi32(cospi[49]); + __m128i cospi_p55 = _mm_set1_epi32(cospi[55]); + __m128i cospi_p09 = _mm_set1_epi32(cospi[9]); + __m128i cospi_p23 = _mm_set1_epi32(cospi[23]); + __m128i cospi_p41 = _mm_set1_epi32(cospi[41]); + __m128i cospi_p39 = _mm_set1_epi32(cospi[39]); + __m128i cospi_p25 = _mm_set1_epi32(cospi[25]); + __m128i cospi_p07 = _mm_set1_epi32(cospi[7]); + __m128i cospi_p57 = _mm_set1_epi32(cospi[57]); + __m128i cospi_p59 = _mm_set1_epi32(cospi[59]); + __m128i cospi_p05 = _mm_set1_epi32(cospi[5]); + __m128i cospi_p27 = _mm_set1_epi32(cospi[27]); + __m128i cospi_p37 = _mm_set1_epi32(cospi[37]); + __m128i cospi_p43 = _mm_set1_epi32(cospi[43]); + __m128i cospi_p21 = _mm_set1_epi32(cospi[21]); + __m128i cospi_p11 = _mm_set1_epi32(cospi[11]); + __m128i cospi_p53 = _mm_set1_epi32(cospi[53]); + __m128i cospi_p51 = _mm_set1_epi32(cospi[51]); + __m128i cospi_p13 = _mm_set1_epi32(cospi[13]); + __m128i cospi_p19 = _mm_set1_epi32(cospi[19]); + __m128i cospi_p45 = _mm_set1_epi32(cospi[45]); + __m128i cospi_p35 = _mm_set1_epi32(cospi[35]); + __m128i cospi_p29 = _mm_set1_epi32(cospi[29]); + __m128i cospi_p03 = _mm_set1_epi32(cospi[3]); + __m128i cospi_p61 = _mm_set1_epi32(cospi[61]); + + int startidx = 0 * instride; + int endidx = 63 * instride; + // stage 1 + __m128i x1[64]; + x1[0] = _mm_add_epi32(input[startidx], input[endidx]); + x1[63] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[1] = _mm_add_epi32(input[startidx], input[endidx]); + x1[62] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[2] = _mm_add_epi32(input[startidx], input[endidx]); + x1[61] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[3] = _mm_add_epi32(input[startidx], input[endidx]); + x1[60] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[4] = _mm_add_epi32(input[startidx], input[endidx]); + x1[59] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[5] = _mm_add_epi32(input[startidx], input[endidx]); + x1[58] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[6] = _mm_add_epi32(input[startidx], input[endidx]); + x1[57] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[7] = _mm_add_epi32(input[startidx], input[endidx]); + x1[56] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[8] = _mm_add_epi32(input[startidx], input[endidx]); + x1[55] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[9] = _mm_add_epi32(input[startidx], input[endidx]); + x1[54] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[10] = _mm_add_epi32(input[startidx], input[endidx]); + x1[53] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[11] = 
_mm_add_epi32(input[startidx], input[endidx]); + x1[52] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[12] = _mm_add_epi32(input[startidx], input[endidx]); + x1[51] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[13] = _mm_add_epi32(input[startidx], input[endidx]); + x1[50] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[14] = _mm_add_epi32(input[startidx], input[endidx]); + x1[49] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[15] = _mm_add_epi32(input[startidx], input[endidx]); + x1[48] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[16] = _mm_add_epi32(input[startidx], input[endidx]); + x1[47] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[17] = _mm_add_epi32(input[startidx], input[endidx]); + x1[46] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[18] = _mm_add_epi32(input[startidx], input[endidx]); + x1[45] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[19] = _mm_add_epi32(input[startidx], input[endidx]); + x1[44] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[20] = _mm_add_epi32(input[startidx], input[endidx]); + x1[43] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[21] = _mm_add_epi32(input[startidx], input[endidx]); + x1[42] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[22] = _mm_add_epi32(input[startidx], input[endidx]); + x1[41] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[23] = _mm_add_epi32(input[startidx], input[endidx]); + x1[40] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[24] = _mm_add_epi32(input[startidx], input[endidx]); + x1[39] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[25] = _mm_add_epi32(input[startidx], input[endidx]); + x1[38] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[26] = _mm_add_epi32(input[startidx], input[endidx]); + x1[37] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[27] = _mm_add_epi32(input[startidx], input[endidx]); + x1[36] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[28] = _mm_add_epi32(input[startidx], input[endidx]); + x1[35] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[29] = _mm_add_epi32(input[startidx], input[endidx]); + x1[34] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[30] = _mm_add_epi32(input[startidx], input[endidx]); + x1[33] = _mm_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[31] = _mm_add_epi32(input[startidx], input[endidx]); + x1[32] = _mm_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_add_epi32(x1[0], x1[31]); + x2[31] = _mm_sub_epi32(x1[0], x1[31]); + x2[1] = _mm_add_epi32(x1[1], x1[30]); + x2[30] = _mm_sub_epi32(x1[1], x1[30]); + x2[2] = 
_mm_add_epi32(x1[2], x1[29]); + x2[29] = _mm_sub_epi32(x1[2], x1[29]); + x2[3] = _mm_add_epi32(x1[3], x1[28]); + x2[28] = _mm_sub_epi32(x1[3], x1[28]); + x2[4] = _mm_add_epi32(x1[4], x1[27]); + x2[27] = _mm_sub_epi32(x1[4], x1[27]); + x2[5] = _mm_add_epi32(x1[5], x1[26]); + x2[26] = _mm_sub_epi32(x1[5], x1[26]); + x2[6] = _mm_add_epi32(x1[6], x1[25]); + x2[25] = _mm_sub_epi32(x1[6], x1[25]); + x2[7] = _mm_add_epi32(x1[7], x1[24]); + x2[24] = _mm_sub_epi32(x1[7], x1[24]); + x2[8] = _mm_add_epi32(x1[8], x1[23]); + x2[23] = _mm_sub_epi32(x1[8], x1[23]); + x2[9] = _mm_add_epi32(x1[9], x1[22]); + x2[22] = _mm_sub_epi32(x1[9], x1[22]); + x2[10] = _mm_add_epi32(x1[10], x1[21]); + x2[21] = _mm_sub_epi32(x1[10], x1[21]); + x2[11] = _mm_add_epi32(x1[11], x1[20]); + x2[20] = _mm_sub_epi32(x1[11], x1[20]); + x2[12] = _mm_add_epi32(x1[12], x1[19]); + x2[19] = _mm_sub_epi32(x1[12], x1[19]); + x2[13] = _mm_add_epi32(x1[13], x1[18]); + x2[18] = _mm_sub_epi32(x1[13], x1[18]); + x2[14] = _mm_add_epi32(x1[14], x1[17]); + x2[17] = _mm_sub_epi32(x1[14], x1[17]); + x2[15] = _mm_add_epi32(x1[15], x1[16]); + x2[16] = _mm_sub_epi32(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48], + __rounding, cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_add_epi32(x2[0], x2[15]); + x3[15] = _mm_sub_epi32(x2[0], x2[15]); + x3[1] = _mm_add_epi32(x2[1], x2[14]); + x3[14] = _mm_sub_epi32(x2[1], x2[14]); + x3[2] = _mm_add_epi32(x2[2], x2[13]); + x3[13] = _mm_sub_epi32(x2[2], x2[13]); + x3[3] = _mm_add_epi32(x2[3], x2[12]); + x3[12] = _mm_sub_epi32(x2[3], x2[12]); + x3[4] = _mm_add_epi32(x2[4], x2[11]); + x3[11] = _mm_sub_epi32(x2[4], x2[11]); + x3[5] = _mm_add_epi32(x2[5], x2[10]); + x3[10] = _mm_sub_epi32(x2[5], x2[10]); + x3[6] = _mm_add_epi32(x2[6], x2[9]); + x3[9] = _mm_sub_epi32(x2[6], x2[9]); + x3[7] = _mm_add_epi32(x2[7], x2[8]); + x3[8] = _mm_sub_epi32(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24], + __rounding, cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_add_epi32(x2[32], 
x2[47]); + x3[47] = _mm_sub_epi32(x2[32], x2[47]); + x3[33] = _mm_add_epi32(x2[33], x2[46]); + x3[46] = _mm_sub_epi32(x2[33], x2[46]); + x3[34] = _mm_add_epi32(x2[34], x2[45]); + x3[45] = _mm_sub_epi32(x2[34], x2[45]); + x3[35] = _mm_add_epi32(x2[35], x2[44]); + x3[44] = _mm_sub_epi32(x2[35], x2[44]); + x3[36] = _mm_add_epi32(x2[36], x2[43]); + x3[43] = _mm_sub_epi32(x2[36], x2[43]); + x3[37] = _mm_add_epi32(x2[37], x2[42]); + x3[42] = _mm_sub_epi32(x2[37], x2[42]); + x3[38] = _mm_add_epi32(x2[38], x2[41]); + x3[41] = _mm_sub_epi32(x2[38], x2[41]); + x3[39] = _mm_add_epi32(x2[39], x2[40]); + x3[40] = _mm_sub_epi32(x2[39], x2[40]); + x3[48] = _mm_sub_epi32(x2[63], x2[48]); + x3[63] = _mm_add_epi32(x2[63], x2[48]); + x3[49] = _mm_sub_epi32(x2[62], x2[49]); + x3[62] = _mm_add_epi32(x2[62], x2[49]); + x3[50] = _mm_sub_epi32(x2[61], x2[50]); + x3[61] = _mm_add_epi32(x2[61], x2[50]); + x3[51] = _mm_sub_epi32(x2[60], x2[51]); + x3[60] = _mm_add_epi32(x2[60], x2[51]); + x3[52] = _mm_sub_epi32(x2[59], x2[52]); + x3[59] = _mm_add_epi32(x2[59], x2[52]); + x3[53] = _mm_sub_epi32(x2[58], x2[53]); + x3[58] = _mm_add_epi32(x2[58], x2[53]); + x3[54] = _mm_sub_epi32(x2[57], x2[54]); + x3[57] = _mm_add_epi32(x2[57], x2[54]); + x3[55] = _mm_sub_epi32(x2[56], x2[55]); + x3[56] = _mm_add_epi32(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_add_epi32(x3[0], x3[7]); + x4[7] = _mm_sub_epi32(x3[0], x3[7]); + x4[1] = _mm_add_epi32(x3[1], x3[6]); + x4[6] = _mm_sub_epi32(x3[1], x3[6]); + x4[2] = _mm_add_epi32(x3[2], x3[5]); + x4[5] = _mm_sub_epi32(x3[2], x3[5]); + x4[3] = _mm_add_epi32(x3[3], x3[4]); + x4[4] = _mm_sub_epi32(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12], + __rounding, cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_add_epi32(x3[16], x3[23]); + x4[23] = _mm_sub_epi32(x3[16], x3[23]); + x4[17] = _mm_add_epi32(x3[17], x3[22]); + x4[22] = _mm_sub_epi32(x3[17], x3[22]); + x4[18] = _mm_add_epi32(x3[18], x3[21]); + x4[21] = _mm_sub_epi32(x3[18], x3[21]); + x4[19] = _mm_add_epi32(x3[19], x3[20]); + x4[20] = _mm_sub_epi32(x3[19], x3[20]); + x4[24] = _mm_sub_epi32(x3[31], x3[24]); + x4[31] = _mm_add_epi32(x3[31], x3[24]); + x4[25] = _mm_sub_epi32(x3[30], x3[25]); + x4[30] = _mm_add_epi32(x3[30], x3[25]); + x4[26] = _mm_sub_epi32(x3[29], x3[26]); + x4[29] = _mm_add_epi32(x3[29], x3[26]); + x4[27] = _mm_sub_epi32(x3[28], x3[27]); + x4[28] = _mm_add_epi32(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52], + __rounding, cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; 
+ x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_add_epi32(x4[0], x4[3]); + x5[3] = _mm_sub_epi32(x4[0], x4[3]); + x5[1] = _mm_add_epi32(x4[1], x4[2]); + x5[2] = _mm_sub_epi32(x4[1], x4[2]); + x5[4] = x4[4]; + btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6], + __rounding, cos_bit); + x5[7] = x4[7]; + x5[8] = _mm_add_epi32(x4[8], x4[11]); + x5[11] = _mm_sub_epi32(x4[8], x4[11]); + x5[9] = _mm_add_epi32(x4[9], x4[10]); + x5[10] = _mm_sub_epi32(x4[9], x4[10]); + x5[12] = _mm_sub_epi32(x4[15], x4[12]); + x5[15] = _mm_add_epi32(x4[15], x4[12]); + x5[13] = _mm_sub_epi32(x4[14], x4[13]); + x5[14] = _mm_add_epi32(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26], + __rounding, cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_add_epi32(x4[32], x4[39]); + x5[39] = _mm_sub_epi32(x4[32], x4[39]); + x5[33] = _mm_add_epi32(x4[33], x4[38]); + x5[38] = _mm_sub_epi32(x4[33], x4[38]); + x5[34] = _mm_add_epi32(x4[34], x4[37]); + x5[37] = _mm_sub_epi32(x4[34], x4[37]); + x5[35] = _mm_add_epi32(x4[35], x4[36]); + x5[36] = _mm_sub_epi32(x4[35], x4[36]); + x5[40] = _mm_sub_epi32(x4[47], x4[40]); + x5[47] = _mm_add_epi32(x4[47], x4[40]); + x5[41] = _mm_sub_epi32(x4[46], x4[41]); + x5[46] = _mm_add_epi32(x4[46], x4[41]); + x5[42] = _mm_sub_epi32(x4[45], x4[42]); + x5[45] = _mm_add_epi32(x4[45], x4[42]); + x5[43] = _mm_sub_epi32(x4[44], x4[43]); + x5[44] = _mm_add_epi32(x4[44], x4[43]); + x5[48] = _mm_add_epi32(x4[48], x4[55]); + x5[55] = _mm_sub_epi32(x4[48], x4[55]); + x5[49] = _mm_add_epi32(x4[49], x4[54]); + x5[54] = _mm_sub_epi32(x4[49], x4[54]); + x5[50] = _mm_add_epi32(x4[50], x4[53]); + x5[53] = _mm_sub_epi32(x4[50], x4[53]); + x5[51] = _mm_add_epi32(x4[51], x4[52]); + x5[52] = _mm_sub_epi32(x4[51], x4[52]); + x5[56] = _mm_sub_epi32(x4[63], x4[56]); + x5[63] = _mm_add_epi32(x4[63], x4[56]); + x5[57] = _mm_sub_epi32(x4[62], x4[57]); + x5[62] = _mm_add_epi32(x4[62], x4[57]); + x5[58] = _mm_sub_epi32(x4[61], x4[58]); + x5[61] = _mm_add_epi32(x4[61], x4[58]); + x5[59] = _mm_sub_epi32(x4[60], x4[59]); + x5[60] = _mm_add_epi32(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3], + __rounding, cos_bit); + x6[4] = _mm_add_epi32(x5[4], x5[5]); + x6[5] = _mm_sub_epi32(x5[4], x5[5]); + x6[6] = _mm_sub_epi32(x5[7], x5[6]); + x6[7] = _mm_add_epi32(x5[7], x5[6]); + x6[8] = x5[8]; + btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13], + __rounding, cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_add_epi32(x5[16], x5[19]); + x6[19] = _mm_sub_epi32(x5[16], x5[19]); + x6[17] = _mm_add_epi32(x5[17], x5[18]); + 
x6[18] = _mm_sub_epi32(x5[17], x5[18]); + x6[20] = _mm_sub_epi32(x5[23], x5[20]); + x6[23] = _mm_add_epi32(x5[23], x5[20]); + x6[21] = _mm_sub_epi32(x5[22], x5[21]); + x6[22] = _mm_add_epi32(x5[22], x5[21]); + x6[24] = _mm_add_epi32(x5[24], x5[27]); + x6[27] = _mm_sub_epi32(x5[24], x5[27]); + x6[25] = _mm_add_epi32(x5[25], x5[26]); + x6[26] = _mm_sub_epi32(x5[25], x5[26]); + x6[28] = _mm_sub_epi32(x5[31], x5[28]); + x6[31] = _mm_add_epi32(x5[31], x5[28]); + x6[29] = _mm_sub_epi32(x5[30], x5[29]); + x6[30] = _mm_add_epi32(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58], + __rounding, cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50], + __rounding, cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6], + __rounding, cos_bit); + x7[8] = _mm_add_epi32(x6[8], x6[9]); + x7[9] = _mm_sub_epi32(x6[8], x6[9]); + x7[10] = _mm_sub_epi32(x6[11], x6[10]); + x7[11] = _mm_add_epi32(x6[11], x6[10]); + x7[12] = _mm_add_epi32(x6[12], x6[13]); + x7[13] = _mm_sub_epi32(x6[12], x6[13]); + x7[14] = _mm_sub_epi32(x6[15], x6[14]); + x7[15] = _mm_add_epi32(x6[15], x6[14]); + x7[16] = x6[16]; + btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29], + __rounding, cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25], + __rounding, cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_add_epi32(x6[32], x6[35]); + x7[35] = _mm_sub_epi32(x6[32], x6[35]); + x7[33] = _mm_add_epi32(x6[33], x6[34]); + x7[34] = _mm_sub_epi32(x6[33], x6[34]); + x7[36] = _mm_sub_epi32(x6[39], x6[36]); + x7[39] = _mm_add_epi32(x6[39], x6[36]); + x7[37] = _mm_sub_epi32(x6[38], x6[37]); + x7[38] = _mm_add_epi32(x6[38], x6[37]); + x7[40] = _mm_add_epi32(x6[40], x6[43]); + x7[43] = _mm_sub_epi32(x6[40], x6[43]); + x7[41] = _mm_add_epi32(x6[41], x6[42]); + x7[42] = _mm_sub_epi32(x6[41], x6[42]); + x7[44] = _mm_sub_epi32(x6[47], x6[44]); + x7[47] = _mm_add_epi32(x6[47], x6[44]); + x7[45] = _mm_sub_epi32(x6[46], x6[45]); + x7[46] = _mm_add_epi32(x6[46], x6[45]); + 
x7[48] = _mm_add_epi32(x6[48], x6[51]); + x7[51] = _mm_sub_epi32(x6[48], x6[51]); + x7[49] = _mm_add_epi32(x6[49], x6[50]); + x7[50] = _mm_sub_epi32(x6[49], x6[50]); + x7[52] = _mm_sub_epi32(x6[55], x6[52]); + x7[55] = _mm_add_epi32(x6[55], x6[52]); + x7[53] = _mm_sub_epi32(x6[54], x6[53]); + x7[54] = _mm_add_epi32(x6[54], x6[53]); + x7[56] = _mm_add_epi32(x6[56], x6[59]); + x7[59] = _mm_sub_epi32(x6[56], x6[59]); + x7[57] = _mm_add_epi32(x6[57], x6[58]); + x7[58] = _mm_sub_epi32(x6[57], x6[58]); + x7[60] = _mm_sub_epi32(x6[63], x6[60]); + x7[63] = _mm_add_epi32(x6[63], x6[60]); + x7[61] = _mm_sub_epi32(x6[62], x6[61]); + x7[62] = _mm_add_epi32(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12], + __rounding, cos_bit); + x8[16] = _mm_add_epi32(x7[16], x7[17]); + x8[17] = _mm_sub_epi32(x7[16], x7[17]); + x8[18] = _mm_sub_epi32(x7[19], x7[18]); + x8[19] = _mm_add_epi32(x7[19], x7[18]); + x8[20] = _mm_add_epi32(x7[20], x7[21]); + x8[21] = _mm_sub_epi32(x7[20], x7[21]); + x8[22] = _mm_sub_epi32(x7[23], x7[22]); + x8[23] = _mm_add_epi32(x7[23], x7[22]); + x8[24] = _mm_add_epi32(x7[24], x7[25]); + x8[25] = _mm_sub_epi32(x7[24], x7[25]); + x8[26] = _mm_sub_epi32(x7[27], x7[26]); + x8[27] = _mm_add_epi32(x7[27], x7[26]); + x8[28] = _mm_add_epi32(x7[28], x7[29]); + x8[29] = _mm_sub_epi32(x7[28], x7[29]); + x8[30] = _mm_sub_epi32(x7[31], x7[30]); + x8[31] = _mm_add_epi32(x7[31], x7[30]); + x8[32] = x7[32]; + btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], + __rounding, cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], + __rounding, cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], + __rounding, cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], + __rounding, cos_bit); + btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], + __rounding, cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], 
x9[30], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25], + __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24], + __rounding, cos_bit); + x9[32] = _mm_add_epi32(x8[32], x8[33]); + x9[33] = _mm_sub_epi32(x8[32], x8[33]); + x9[34] = _mm_sub_epi32(x8[35], x8[34]); + x9[35] = _mm_add_epi32(x8[35], x8[34]); + x9[36] = _mm_add_epi32(x8[36], x8[37]); + x9[37] = _mm_sub_epi32(x8[36], x8[37]); + x9[38] = _mm_sub_epi32(x8[39], x8[38]); + x9[39] = _mm_add_epi32(x8[39], x8[38]); + x9[40] = _mm_add_epi32(x8[40], x8[41]); + x9[41] = _mm_sub_epi32(x8[40], x8[41]); + x9[42] = _mm_sub_epi32(x8[43], x8[42]); + x9[43] = _mm_add_epi32(x8[43], x8[42]); + x9[44] = _mm_add_epi32(x8[44], x8[45]); + x9[45] = _mm_sub_epi32(x8[44], x8[45]); + x9[46] = _mm_sub_epi32(x8[47], x8[46]); + x9[47] = _mm_add_epi32(x8[47], x8[46]); + x9[48] = _mm_add_epi32(x8[48], x8[49]); + x9[49] = _mm_sub_epi32(x8[48], x8[49]); + x9[50] = _mm_sub_epi32(x8[51], x8[50]); + x9[51] = _mm_add_epi32(x8[51], x8[50]); + x9[52] = _mm_add_epi32(x8[52], x8[53]); + x9[53] = _mm_sub_epi32(x8[52], x8[53]); + x9[54] = _mm_sub_epi32(x8[55], x8[54]); + x9[55] = _mm_add_epi32(x8[55], x8[54]); + x9[56] = _mm_add_epi32(x8[56], x8[57]); + x9[57] = _mm_sub_epi32(x8[56], x8[57]); + x9[58] = _mm_sub_epi32(x8[59], x8[58]); + x9[59] = _mm_add_epi32(x8[59], x8[58]); + x9[60] = _mm_add_epi32(x8[60], x8[61]); + x9[61] = _mm_sub_epi32(x8[60], x8[61]); + x9[62] = _mm_sub_epi32(x8[63], x8[62]); + x9[63] = _mm_add_epi32(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32], + x10[63], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33], + x10[62], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34], + x10[61], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35], + x10[60], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36], + x10[59], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37], + x10[58], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38], + x10[57], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39], + x10[56], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, 
x9[40], x9[55], x10[40], + x10[55], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41], + x10[54], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42], + x10[53], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43], + x10[52], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44], + x10[51], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45], + x10[50], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46], + x10[49], __rounding, cos_bit); + btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47], + x10[48], __rounding, cos_bit); + + startidx = 0 * outstride; + endidx = 63 * outstride; + // stage 11 + output[startidx] = x10[0]; + output[endidx] = x10[63]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[32]; + output[endidx] = x10[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[16]; + output[endidx] = x10[47]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[48]; + output[endidx] = x10[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[8]; + output[endidx] = x10[55]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[40]; + output[endidx] = x10[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[24]; + output[endidx] = x10[39]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[56]; + output[endidx] = x10[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[4]; + output[endidx] = x10[59]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[36]; + output[endidx] = x10[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[20]; + output[endidx] = x10[43]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[52]; + output[endidx] = x10[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[12]; + output[endidx] = x10[51]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[44]; + output[endidx] = x10[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[28]; + output[endidx] = x10[35]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[60]; + output[endidx] = x10[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[2]; + output[endidx] = x10[61]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[34]; + output[endidx] = x10[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[18]; + output[endidx] = x10[45]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[50]; + output[endidx] = x10[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[10]; + output[endidx] = x10[53]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[42]; + output[endidx] = x10[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[26]; + output[endidx] = x10[37]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[58]; + output[endidx] = x10[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[6]; + output[endidx] = x10[57]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[38]; + 
output[endidx] = x10[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[22]; + output[endidx] = x10[41]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[54]; + output[endidx] = x10[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[14]; + output[endidx] = x10[49]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[46]; + output[endidx] = x10[17]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[30]; + output[endidx] = x10[33]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x10[62]; + output[endidx] = x10[1]; +} + +void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit, + const int col_num) { + (void)cos_bit; + for (int i = 0; i < 32; i++) { + output[i * col_num] = _mm_slli_epi32(input[i * col_num], 2); + } +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c new file mode 100644 index 000000000..634d50bb2 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_avx2.c @@ -0,0 +1,2814 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/x86/av1_fwd_txfm_avx2.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m256i x1[16]; + 
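+  // As the helper name suggests, each btf_16_adds_subs_out_avx2(&a, &b, u, v)
+  // call below performs one 16-lane butterfly: a = u + v and b = u - v, using
+  // saturating int16 adds/subs.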
btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + + // stage 4 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + + // stage 6 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + + // stage 7 + output[0] = x1[0]; + output[1] = x1[8]; + output[2] = x1[4]; + output[3] = x1[12]; + output[4] = x1[2]; + output[5] = x1[10]; + output[6] = x1[6]; + output[7] = x1[14]; + output[8] = x1[1]; + output[9] = x1[9]; + output[10] = x1[5]; + output[11] = x1[13]; + output[12] = x1[3]; + output[13] = x1[11]; + output[14] = x1[7]; + output[15] = x1[15]; +} + +static INLINE void fdct16x32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); 
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); + __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); + __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); + __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); + __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); + __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]); + __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); + __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); + __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); + __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); + __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); + __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); + __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); + __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); + __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); + __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m256i x1[32]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]); + btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]); + btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]); + btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]); + btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]); + btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]); + btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]); + btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]); + btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[15]); + btf_16_adds_subs_avx2(&x1[1], &x1[14]); + btf_16_adds_subs_avx2(&x1[2], &x1[13]); + btf_16_adds_subs_avx2(&x1[3], &x1[12]); + btf_16_adds_subs_avx2(&x1[4], &x1[11]); + btf_16_adds_subs_avx2(&x1[5], &x1[10]); + btf_16_adds_subs_avx2(&x1[6], &x1[9]); + btf_16_adds_subs_avx2(&x1[7], &x1[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, 
cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[23]); + btf_16_adds_subs_avx2(&x1[17], &x1[22]); + btf_16_adds_subs_avx2(&x1[18], &x1[21]); + btf_16_adds_subs_avx2(&x1[19], &x1[20]); + btf_16_adds_subs_avx2(&x1[31], &x1[24]); + btf_16_adds_subs_avx2(&x1[30], &x1[25]); + btf_16_adds_subs_avx2(&x1[29], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[27]); + + // stage 4 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); + + // stage 5 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[19]); + btf_16_adds_subs_avx2(&x1[17], &x1[18]); + btf_16_adds_subs_avx2(&x1[23], &x1[20]); + btf_16_adds_subs_avx2(&x1[22], &x1[21]); + btf_16_adds_subs_avx2(&x1[24], &x1[27]); + btf_16_adds_subs_avx2(&x1[25], &x1[26]); + btf_16_adds_subs_avx2(&x1[31], &x1[28]); + btf_16_adds_subs_avx2(&x1[30], &x1[29]); + + // stage 6 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); + + // stage 7 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[17]); + btf_16_adds_subs_avx2(&x1[19], &x1[18]); + btf_16_adds_subs_avx2(&x1[20], &x1[21]); + btf_16_adds_subs_avx2(&x1[23], &x1[22]); + btf_16_adds_subs_avx2(&x1[24], &x1[25]); + btf_16_adds_subs_avx2(&x1[27], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[29]); + btf_16_adds_subs_avx2(&x1[31], &x1[30]); + + // stage 8 + btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); + 
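+ // These stage-8 rotations rely on btf_16_w16_avx2(w0, w1, &a, &b, _r, cos_bit); with
+ // w = pair_set_w16_epi16(c0, c1) it is assumed to compute, per 16-bit lane pair,
+ // a' = (c0*a + c1*b + _r) >> cos_bit (and likewise b' from w1), where
+ // _r = 1 << (cos_bit - 1) provides round-to-nearest before the arithmetic shift.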
btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); + + // stage 9 + output[0] = x1[0]; + output[1] = x1[16]; + output[2] = x1[8]; + output[3] = x1[24]; + output[4] = x1[4]; + output[5] = x1[20]; + output[6] = x1[12]; + output[7] = x1[28]; + output[8] = x1[2]; + output[9] = x1[18]; + output[10] = x1[10]; + output[11] = x1[26]; + output[12] = x1[6]; + output[13] = x1[22]; + output[14] = x1[14]; + output[15] = x1[30]; + output[16] = x1[1]; + output[17] = x1[17]; + output[18] = x1[9]; + output[19] = x1[25]; + output[20] = x1[5]; + output[21] = x1[21]; + output[22] = x1[13]; + output[23] = x1[29]; + output[24] = x1[3]; + output[25] = x1[19]; + output[26] = x1[11]; + output[27] = x1[27]; + output[28] = x1[7]; + output[29] = x1[23]; + output[30] = x1[15]; + output[31] = x1[31]; +} + +static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]); + __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]); + __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]); + __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]); + __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]); + __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]); + __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]); + __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]); + __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]); + __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]); + __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]); + __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]); + __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]); + __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]); + __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]); + __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]); + __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]); + __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]); + __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], 
cospi[46]); + __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]); + __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]); + __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]); + __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]); + __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]); + __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]); + __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]); + __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]); + __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]); + __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]); + __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]); + __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]); + __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]); + __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]); + __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]); + __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]); + __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]); + __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]); + __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]); + __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]); + __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]); + __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]); + __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]); + __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]); + __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]); + __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]); + __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]); + __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]); + __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]); + __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]); + __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]); + __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]); + __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]); + __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]); + __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]); + __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]); + __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]); + __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]); + __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]); + __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]); + __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]); + __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m256i x1[64]; + btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]); + btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]); + btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]); + btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]); + btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]); + btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]); + btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]); + btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]); + btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]); + btf_16_adds_subs_out_avx2(&x1[9], 
&x1[54], input[9], input[54]); + btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]); + btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]); + btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]); + btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]); + btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]); + btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]); + btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]); + btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]); + btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]); + btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]); + btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]); + btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]); + btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]); + btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]); + btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]); + btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]); + btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]); + btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]); + btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]); + btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]); + btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]); + btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]); + + // stage 2 + btf_16_adds_subs_avx2(&x1[0], &x1[31]); + btf_16_adds_subs_avx2(&x1[1], &x1[30]); + btf_16_adds_subs_avx2(&x1[2], &x1[29]); + btf_16_adds_subs_avx2(&x1[3], &x1[28]); + btf_16_adds_subs_avx2(&x1[4], &x1[27]); + btf_16_adds_subs_avx2(&x1[5], &x1[26]); + btf_16_adds_subs_avx2(&x1[6], &x1[25]); + btf_16_adds_subs_avx2(&x1[7], &x1[24]); + btf_16_adds_subs_avx2(&x1[8], &x1[23]); + btf_16_adds_subs_avx2(&x1[9], &x1[22]); + btf_16_adds_subs_avx2(&x1[10], &x1[21]); + btf_16_adds_subs_avx2(&x1[11], &x1[20]); + btf_16_adds_subs_avx2(&x1[12], &x1[19]); + btf_16_adds_subs_avx2(&x1[13], &x1[18]); + btf_16_adds_subs_avx2(&x1[14], &x1[17]); + btf_16_adds_subs_avx2(&x1[15], &x1[16]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[15]); + btf_16_adds_subs_avx2(&x1[1], &x1[14]); + btf_16_adds_subs_avx2(&x1[2], &x1[13]); + btf_16_adds_subs_avx2(&x1[3], &x1[12]); + btf_16_adds_subs_avx2(&x1[4], &x1[11]); + btf_16_adds_subs_avx2(&x1[5], &x1[10]); + btf_16_adds_subs_avx2(&x1[6], &x1[9]); + btf_16_adds_subs_avx2(&x1[7], &x1[8]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit); + 
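+ // btf_16_adds_subs_avx2(&a, &b) is assumed to be the in-place add/sub butterfly
+ // (a, b) <- (a + b, a - b) using saturating 16-bit adds/subs, so overflowing
+ // intermediates clamp rather than wrap.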
btf_16_adds_subs_avx2(&x1[32], &x1[47]); + btf_16_adds_subs_avx2(&x1[33], &x1[46]); + btf_16_adds_subs_avx2(&x1[34], &x1[45]); + btf_16_adds_subs_avx2(&x1[35], &x1[44]); + btf_16_adds_subs_avx2(&x1[36], &x1[43]); + btf_16_adds_subs_avx2(&x1[37], &x1[42]); + btf_16_adds_subs_avx2(&x1[38], &x1[41]); + btf_16_adds_subs_avx2(&x1[39], &x1[40]); + btf_16_adds_subs_avx2(&x1[63], &x1[48]); + btf_16_adds_subs_avx2(&x1[62], &x1[49]); + btf_16_adds_subs_avx2(&x1[61], &x1[50]); + btf_16_adds_subs_avx2(&x1[60], &x1[51]); + btf_16_adds_subs_avx2(&x1[59], &x1[52]); + btf_16_adds_subs_avx2(&x1[58], &x1[53]); + btf_16_adds_subs_avx2(&x1[57], &x1[54]); + btf_16_adds_subs_avx2(&x1[56], &x1[55]); + + // stage 4 + btf_16_adds_subs_avx2(&x1[0], &x1[7]); + btf_16_adds_subs_avx2(&x1[1], &x1[6]); + btf_16_adds_subs_avx2(&x1[2], &x1[5]); + btf_16_adds_subs_avx2(&x1[3], &x1[4]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[23]); + btf_16_adds_subs_avx2(&x1[17], &x1[22]); + btf_16_adds_subs_avx2(&x1[18], &x1[21]); + btf_16_adds_subs_avx2(&x1[19], &x1[20]); + btf_16_adds_subs_avx2(&x1[31], &x1[24]); + btf_16_adds_subs_avx2(&x1[30], &x1[25]); + btf_16_adds_subs_avx2(&x1[29], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[27]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit); + + // stage 5 + btf_16_adds_subs_avx2(&x1[0], &x1[3]); + btf_16_adds_subs_avx2(&x1[1], &x1[2]); + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[11]); + btf_16_adds_subs_avx2(&x1[9], &x1[10]); + btf_16_adds_subs_avx2(&x1[15], &x1[12]); + btf_16_adds_subs_avx2(&x1[14], &x1[13]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[39]); + btf_16_adds_subs_avx2(&x1[33], &x1[38]); + btf_16_adds_subs_avx2(&x1[34], &x1[37]); + btf_16_adds_subs_avx2(&x1[35], &x1[36]); + btf_16_adds_subs_avx2(&x1[47], &x1[40]); + btf_16_adds_subs_avx2(&x1[46], &x1[41]); + btf_16_adds_subs_avx2(&x1[45], &x1[42]); + btf_16_adds_subs_avx2(&x1[44], &x1[43]); + btf_16_adds_subs_avx2(&x1[48], &x1[55]); + btf_16_adds_subs_avx2(&x1[49], &x1[54]); + btf_16_adds_subs_avx2(&x1[50], &x1[53]); + btf_16_adds_subs_avx2(&x1[51], &x1[52]); + btf_16_adds_subs_avx2(&x1[63], &x1[56]); + btf_16_adds_subs_avx2(&x1[62], &x1[57]); + btf_16_adds_subs_avx2(&x1[61], &x1[58]); + btf_16_adds_subs_avx2(&x1[60], &x1[59]); + + // stage 6 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit); + 
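+ // Decimation-in-frequency structure: the low half x1[0..15] follows the same ladder
+ // as the 16-point DCT above, while x1[16..63] carry the butterfly differences
+ // through their own rotation/add-sub stages before the final reorder.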
btf_16_adds_subs_avx2(&x1[4], &x1[5]); + btf_16_adds_subs_avx2(&x1[7], &x1[6]); + btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[19]); + btf_16_adds_subs_avx2(&x1[17], &x1[18]); + btf_16_adds_subs_avx2(&x1[23], &x1[20]); + btf_16_adds_subs_avx2(&x1[22], &x1[21]); + btf_16_adds_subs_avx2(&x1[24], &x1[27]); + btf_16_adds_subs_avx2(&x1[25], &x1[26]); + btf_16_adds_subs_avx2(&x1[31], &x1[28]); + btf_16_adds_subs_avx2(&x1[30], &x1[29]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit); + + // stage 7 + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[8], &x1[9]); + btf_16_adds_subs_avx2(&x1[11], &x1[10]); + btf_16_adds_subs_avx2(&x1[12], &x1[13]); + btf_16_adds_subs_avx2(&x1[15], &x1[14]); + btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[35]); + btf_16_adds_subs_avx2(&x1[33], &x1[34]); + btf_16_adds_subs_avx2(&x1[39], &x1[36]); + btf_16_adds_subs_avx2(&x1[38], &x1[37]); + btf_16_adds_subs_avx2(&x1[40], &x1[43]); + btf_16_adds_subs_avx2(&x1[41], &x1[42]); + btf_16_adds_subs_avx2(&x1[47], &x1[44]); + btf_16_adds_subs_avx2(&x1[46], &x1[45]); + btf_16_adds_subs_avx2(&x1[48], &x1[51]); + btf_16_adds_subs_avx2(&x1[49], &x1[50]); + btf_16_adds_subs_avx2(&x1[55], &x1[52]); + btf_16_adds_subs_avx2(&x1[54], &x1[53]); + btf_16_adds_subs_avx2(&x1[56], &x1[59]); + btf_16_adds_subs_avx2(&x1[57], &x1[58]); + btf_16_adds_subs_avx2(&x1[63], &x1[60]); + btf_16_adds_subs_avx2(&x1[62], &x1[61]); + + // stage 8 + btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit); + btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit); + btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[16], &x1[17]); + btf_16_adds_subs_avx2(&x1[19], &x1[18]); + btf_16_adds_subs_avx2(&x1[20], &x1[21]); + btf_16_adds_subs_avx2(&x1[23], &x1[22]); + btf_16_adds_subs_avx2(&x1[24], &x1[25]); + btf_16_adds_subs_avx2(&x1[27], &x1[26]); + btf_16_adds_subs_avx2(&x1[28], &x1[29]); + btf_16_adds_subs_avx2(&x1[31], &x1[30]); + btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit); + btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], 
_r, cos_bit); + btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit); + + // stage 9 + btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit); + btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit); + btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit); + btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit); + btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit); + btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit); + btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit); + btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit); + btf_16_adds_subs_avx2(&x1[32], &x1[33]); + btf_16_adds_subs_avx2(&x1[35], &x1[34]); + btf_16_adds_subs_avx2(&x1[36], &x1[37]); + btf_16_adds_subs_avx2(&x1[39], &x1[38]); + btf_16_adds_subs_avx2(&x1[40], &x1[41]); + btf_16_adds_subs_avx2(&x1[43], &x1[42]); + btf_16_adds_subs_avx2(&x1[44], &x1[45]); + btf_16_adds_subs_avx2(&x1[47], &x1[46]); + btf_16_adds_subs_avx2(&x1[48], &x1[49]); + btf_16_adds_subs_avx2(&x1[51], &x1[50]); + btf_16_adds_subs_avx2(&x1[52], &x1[53]); + btf_16_adds_subs_avx2(&x1[55], &x1[54]); + btf_16_adds_subs_avx2(&x1[56], &x1[57]); + btf_16_adds_subs_avx2(&x1[59], &x1[58]); + btf_16_adds_subs_avx2(&x1[60], &x1[61]); + btf_16_adds_subs_avx2(&x1[63], &x1[62]); + + // stage 10 + btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit); + btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit); + btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit); + btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit); + btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit); + btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit); + btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit); + btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit); + btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit); + btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit); + btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit); + btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit); + btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit); + btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit); + btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit); + btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit); + + // stage 11 + output[0] = x1[0]; + output[1] = x1[32]; + output[2] = x1[16]; + output[3] = x1[48]; + output[4] = x1[8]; + output[5] = x1[40]; + output[6] = x1[24]; + output[7] = x1[56]; + output[8] = x1[4]; + output[9] = x1[36]; + output[10] = x1[20]; + output[11] = x1[52]; + output[12] = x1[12]; + output[13] = x1[44]; + output[14] = x1[28]; + output[15] = x1[60]; + output[16] = x1[2]; + output[17] = x1[34]; + output[18] = x1[18]; + output[19] = x1[50]; + output[20] = x1[10]; + output[21] = x1[42]; + output[22] = x1[26]; + output[23] = x1[58]; + output[24] = x1[6]; + 
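+ // (Store order is the 6-bit bit-reversal permutation, output[k] = x1[bitrev6(k)];
+ // e.g. output[25] = x1[38] below, since bitrev6(011001b) = 100110b = 38.)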
output[25] = x1[38]; + output[26] = x1[22]; + output[27] = x1[54]; + output[28] = x1[14]; + output[29] = x1[46]; + output[30] = x1[30]; + output[31] = x1[62]; + output[32] = x1[1]; + output[33] = x1[33]; + output[34] = x1[17]; + output[35] = x1[49]; + output[36] = x1[9]; + output[37] = x1[41]; + output[38] = x1[25]; + output[39] = x1[57]; + output[40] = x1[5]; + output[41] = x1[37]; + output[42] = x1[21]; + output[43] = x1[53]; + output[44] = x1[13]; + output[45] = x1[45]; + output[46] = x1[29]; + output[47] = x1[61]; + output[48] = x1[3]; + output[49] = x1[35]; + output[50] = x1[19]; + output[51] = x1[51]; + output[52] = x1[11]; + output[53] = x1[43]; + output[54] = x1[27]; + output[55] = x1[59]; + output[56] = x1[7]; + output[57] = x1[39]; + output[58] = x1[23]; + output[59] = x1[55]; + output[60] = x1[15]; + output[61] = x1[47]; + output[62] = x1[31]; + output[63] = x1[63]; +} + +static INLINE void fdct32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + __m256i x1[32]; + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + // stage 0 + // stage 1 + btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]); + btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]); + btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]); + btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]); + btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]); + btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]); + btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]); + btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]); + btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]); + btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]); + btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]); + btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]); + btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]); + btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]); + btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]); + btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]); + + // stage 2 + btf_32_add_sub_avx2(&x1[0], &x1[15]); + btf_32_add_sub_avx2(&x1[1], &x1[14]); + btf_32_add_sub_avx2(&x1[2], &x1[13]); + btf_32_add_sub_avx2(&x1[3], &x1[12]); + btf_32_add_sub_avx2(&x1[4], &x1[11]); + btf_32_add_sub_avx2(&x1[5], &x1[10]); + btf_32_add_sub_avx2(&x1[6], &x1[9]); + btf_32_add_sub_avx2(&x1[7], &x1[8]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit); + + // stage 3 + btf_32_add_sub_avx2(&x1[0], &x1[7]); + btf_32_add_sub_avx2(&x1[1], &x1[6]); + btf_32_add_sub_avx2(&x1[2], &x1[5]); + btf_32_add_sub_avx2(&x1[3], &x1[4]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[23]); + btf_32_add_sub_avx2(&x1[17], &x1[22]); + btf_32_add_sub_avx2(&x1[18], &x1[21]); + btf_32_add_sub_avx2(&x1[19], &x1[20]); + btf_32_add_sub_avx2(&x1[31], &x1[24]); + btf_32_add_sub_avx2(&x1[30], &x1[25]); + btf_32_add_sub_avx2(&x1[29], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[27]); + + // stage 4 + btf_32_add_sub_avx2(&x1[0], &x1[3]); + btf_32_add_sub_avx2(&x1[1], 
&x1[2]); + btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[11]); + btf_32_add_sub_avx2(&x1[9], &x1[10]); + btf_32_add_sub_avx2(&x1[15], &x1[12]); + btf_32_add_sub_avx2(&x1[14], &x1[13]); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit); + + // stage 5 + btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit); + btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit); + btf_32_add_sub_avx2(&x1[4], &x1[5]); + btf_32_add_sub_avx2(&x1[7], &x1[6]); + btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[19]); + btf_32_add_sub_avx2(&x1[17], &x1[18]); + btf_32_add_sub_avx2(&x1[23], &x1[20]); + btf_32_add_sub_avx2(&x1[22], &x1[21]); + btf_32_add_sub_avx2(&x1[24], &x1[27]); + btf_32_add_sub_avx2(&x1[25], &x1[26]); + btf_32_add_sub_avx2(&x1[31], &x1[28]); + btf_32_add_sub_avx2(&x1[30], &x1[29]); + + // stage 6 + btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit); + btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[9]); + btf_32_add_sub_avx2(&x1[11], &x1[10]); + btf_32_add_sub_avx2(&x1[12], &x1[13]); + btf_32_add_sub_avx2(&x1[15], &x1[14]); + btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit); + + // stage 7 + btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit); + btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[17]); + btf_32_add_sub_avx2(&x1[19], &x1[18]); + btf_32_add_sub_avx2(&x1[20], &x1[21]); + btf_32_add_sub_avx2(&x1[23], &x1[22]); + btf_32_add_sub_avx2(&x1[24], &x1[25]); + btf_32_add_sub_avx2(&x1[27], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[29]); + btf_32_add_sub_avx2(&x1[31], &x1[30]); + + // stage 8 + btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit); + btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit); + + // stage 9 + output[0] = x1[0]; + output[1] = x1[16]; + output[2] = x1[8]; + output[3] = x1[24]; + output[4] = x1[4]; + output[5] = x1[20]; + output[6] = x1[12]; + output[7] = x1[28]; + output[8] = x1[2]; + output[9] = x1[18]; + output[10] = x1[10]; + output[11] = x1[26]; + output[12] = x1[6]; + output[13] = x1[22]; + output[14] = x1[14]; + output[15] = x1[30]; + output[16] = x1[1]; + output[17] = x1[17]; + output[18] = 
x1[9]; + output[19] = x1[25]; + output[20] = x1[5]; + output[21] = x1[21]; + output[22] = x1[13]; + output[23] = x1[29]; + output[24] = x1[3]; + output[25] = x1[19]; + output[26] = x1[11]; + output[27] = x1[27]; + output[28] = x1[7]; + output[29] = x1[23]; + output[30] = x1[15]; + output[31] = x1[31]; +} + +static INLINE void fdct64_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); + __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); + __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); + __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); + __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); + __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); + __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); + __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); + __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); + __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); + __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); + __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); + __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); + __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); + __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); + __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); + __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); + __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]); + __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); + __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); + __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); + __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); + __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); + __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]); + __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); + __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); + __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); + __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); + __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); + __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); + __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); + __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); + __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); + __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); + __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); + __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); + __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]); + __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); + __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); + __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); + __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); + __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); + __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); + __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); + __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); + __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); + __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); + __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); + __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); + __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); + __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); + __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); + __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); + __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); + __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); + __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); + __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); + __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); + __m256i cospi_p39 = 
_mm256_set1_epi32(cospi[39]); + __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); + __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); + __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); + __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); + __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); + __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); + __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); + __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); + __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); + __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); + __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); + __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); + __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); + __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); + __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); + __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); + __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]); + __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); + __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); + + // stage 1 + __m256i x1[64]; + btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]); + btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]); + btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]); + btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]); + btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]); + btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]); + btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]); + btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]); + btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]); + btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]); + btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]); + btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]); + btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]); + btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]); + btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]); + btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]); + btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]); + btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]); + btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]); + btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]); + btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]); + btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]); + btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]); + btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]); + btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]); + btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]); + btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]); + btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]); + btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]); + btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]); + btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]); + btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]); + + // stage 2 + btf_32_add_sub_avx2(&x1[0], &x1[31]); + btf_32_add_sub_avx2(&x1[1], &x1[30]); + btf_32_add_sub_avx2(&x1[2], &x1[29]); + btf_32_add_sub_avx2(&x1[3], &x1[28]); + btf_32_add_sub_avx2(&x1[4], &x1[27]); + btf_32_add_sub_avx2(&x1[5], &x1[26]); + btf_32_add_sub_avx2(&x1[6], &x1[25]); + btf_32_add_sub_avx2(&x1[7], &x1[24]); + btf_32_add_sub_avx2(&x1[8], &x1[23]); + 
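+ // 32-bit variant of the 64-point ladder above: 8 lanes of int32 per register.
+ // btf_32_add_sub_avx2(&a, &b) is assumed to be the plain (non-saturating) epi32
+ // add/sub butterfly, matching this path's wider intermediate precision.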
btf_32_add_sub_avx2(&x1[9], &x1[22]); + btf_32_add_sub_avx2(&x1[10], &x1[21]); + btf_32_add_sub_avx2(&x1[11], &x1[20]); + btf_32_add_sub_avx2(&x1[12], &x1[19]); + btf_32_add_sub_avx2(&x1[13], &x1[18]); + btf_32_add_sub_avx2(&x1[14], &x1[17]); + btf_32_add_sub_avx2(&x1[15], &x1[16]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit); + + // stage 3 + btf_32_add_sub_avx2(&x1[0], &x1[15]); + btf_32_add_sub_avx2(&x1[1], &x1[14]); + btf_32_add_sub_avx2(&x1[2], &x1[13]); + btf_32_add_sub_avx2(&x1[3], &x1[12]); + btf_32_add_sub_avx2(&x1[4], &x1[11]); + btf_32_add_sub_avx2(&x1[5], &x1[10]); + btf_32_add_sub_avx2(&x1[6], &x1[9]); + btf_32_add_sub_avx2(&x1[7], &x1[8]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[47]); + btf_32_add_sub_avx2(&x1[33], &x1[46]); + btf_32_add_sub_avx2(&x1[34], &x1[45]); + btf_32_add_sub_avx2(&x1[35], &x1[44]); + btf_32_add_sub_avx2(&x1[36], &x1[43]); + btf_32_add_sub_avx2(&x1[37], &x1[42]); + btf_32_add_sub_avx2(&x1[38], &x1[41]); + btf_32_add_sub_avx2(&x1[39], &x1[40]); + btf_32_add_sub_avx2(&x1[63], &x1[48]); + btf_32_add_sub_avx2(&x1[62], &x1[49]); + btf_32_add_sub_avx2(&x1[61], &x1[50]); + btf_32_add_sub_avx2(&x1[60], &x1[51]); + btf_32_add_sub_avx2(&x1[59], &x1[52]); + btf_32_add_sub_avx2(&x1[58], &x1[53]); + btf_32_add_sub_avx2(&x1[57], &x1[54]); + btf_32_add_sub_avx2(&x1[56], &x1[55]); + + // stage 4 + btf_32_add_sub_avx2(&x1[0], &x1[7]); + btf_32_add_sub_avx2(&x1[1], &x1[6]); + btf_32_add_sub_avx2(&x1[2], &x1[5]); + btf_32_add_sub_avx2(&x1[3], &x1[4]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[23]); + btf_32_add_sub_avx2(&x1[17], &x1[22]); + btf_32_add_sub_avx2(&x1[18], &x1[21]); + btf_32_add_sub_avx2(&x1[19], &x1[20]); + btf_32_add_sub_avx2(&x1[31], &x1[24]); + btf_32_add_sub_avx2(&x1[30], &x1[25]); + btf_32_add_sub_avx2(&x1[29], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[27]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit); + + // stage 5 + btf_32_add_sub_avx2(&x1[0], &x1[3]); + 
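+ // btf_32_avx2_type0_new / btf_32_avx2_type1_new are assumed to be the two sign
+ // conventions of the 32-bit lane rotation (coefficients pre-broadcast with
+ // _mm256_set1_epi32), rounded by _r and shifted by cos_bit like the 16-bit helpers.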
btf_32_add_sub_avx2(&x1[1], &x1[2]); + btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[11]); + btf_32_add_sub_avx2(&x1[9], &x1[10]); + btf_32_add_sub_avx2(&x1[15], &x1[12]); + btf_32_add_sub_avx2(&x1[14], &x1[13]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[39]); + btf_32_add_sub_avx2(&x1[33], &x1[38]); + btf_32_add_sub_avx2(&x1[34], &x1[37]); + btf_32_add_sub_avx2(&x1[35], &x1[36]); + btf_32_add_sub_avx2(&x1[47], &x1[40]); + btf_32_add_sub_avx2(&x1[46], &x1[41]); + btf_32_add_sub_avx2(&x1[45], &x1[42]); + btf_32_add_sub_avx2(&x1[44], &x1[43]); + btf_32_add_sub_avx2(&x1[48], &x1[55]); + btf_32_add_sub_avx2(&x1[49], &x1[54]); + btf_32_add_sub_avx2(&x1[50], &x1[53]); + btf_32_add_sub_avx2(&x1[51], &x1[52]); + btf_32_add_sub_avx2(&x1[63], &x1[56]); + btf_32_add_sub_avx2(&x1[62], &x1[57]); + btf_32_add_sub_avx2(&x1[61], &x1[58]); + btf_32_add_sub_avx2(&x1[60], &x1[59]); + + // stage 6 + btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit); + btf_32_add_sub_avx2(&x1[4], &x1[5]); + btf_32_add_sub_avx2(&x1[7], &x1[6]); + btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[19]); + btf_32_add_sub_avx2(&x1[17], &x1[18]); + btf_32_add_sub_avx2(&x1[23], &x1[20]); + btf_32_add_sub_avx2(&x1[22], &x1[21]); + btf_32_add_sub_avx2(&x1[24], &x1[27]); + btf_32_add_sub_avx2(&x1[25], &x1[26]); + btf_32_add_sub_avx2(&x1[31], &x1[28]); + btf_32_add_sub_avx2(&x1[30], &x1[29]); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit); + + // stage 7 + btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit); + btf_32_add_sub_avx2(&x1[8], &x1[9]); + btf_32_add_sub_avx2(&x1[11], &x1[10]); + btf_32_add_sub_avx2(&x1[12], &x1[13]); + btf_32_add_sub_avx2(&x1[15], &x1[14]); + btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[35]); + btf_32_add_sub_avx2(&x1[33], &x1[34]); + btf_32_add_sub_avx2(&x1[39], &x1[36]); + btf_32_add_sub_avx2(&x1[38], &x1[37]); + btf_32_add_sub_avx2(&x1[40], &x1[43]); + btf_32_add_sub_avx2(&x1[41], &x1[42]); + btf_32_add_sub_avx2(&x1[47], &x1[44]); + btf_32_add_sub_avx2(&x1[46], 
&x1[45]); + btf_32_add_sub_avx2(&x1[48], &x1[51]); + btf_32_add_sub_avx2(&x1[49], &x1[50]); + btf_32_add_sub_avx2(&x1[55], &x1[52]); + btf_32_add_sub_avx2(&x1[54], &x1[53]); + btf_32_add_sub_avx2(&x1[56], &x1[59]); + btf_32_add_sub_avx2(&x1[57], &x1[58]); + btf_32_add_sub_avx2(&x1[63], &x1[60]); + btf_32_add_sub_avx2(&x1[62], &x1[61]); + + // stage 8 + btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit); + btf_32_add_sub_avx2(&x1[16], &x1[17]); + btf_32_add_sub_avx2(&x1[19], &x1[18]); + btf_32_add_sub_avx2(&x1[20], &x1[21]); + btf_32_add_sub_avx2(&x1[23], &x1[22]); + btf_32_add_sub_avx2(&x1[24], &x1[25]); + btf_32_add_sub_avx2(&x1[27], &x1[26]); + btf_32_add_sub_avx2(&x1[28], &x1[29]); + btf_32_add_sub_avx2(&x1[31], &x1[30]); + btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit); + + // stage 9 + btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit); + btf_32_add_sub_avx2(&x1[32], &x1[33]); + btf_32_add_sub_avx2(&x1[35], &x1[34]); + btf_32_add_sub_avx2(&x1[36], &x1[37]); + btf_32_add_sub_avx2(&x1[39], &x1[38]); + btf_32_add_sub_avx2(&x1[40], &x1[41]); + btf_32_add_sub_avx2(&x1[43], &x1[42]); + btf_32_add_sub_avx2(&x1[44], &x1[45]); + btf_32_add_sub_avx2(&x1[47], &x1[46]); + btf_32_add_sub_avx2(&x1[48], &x1[49]); + btf_32_add_sub_avx2(&x1[51], &x1[50]); + btf_32_add_sub_avx2(&x1[52], &x1[53]); + btf_32_add_sub_avx2(&x1[55], &x1[54]); + btf_32_add_sub_avx2(&x1[56], &x1[57]); + btf_32_add_sub_avx2(&x1[59], &x1[58]); + btf_32_add_sub_avx2(&x1[60], &x1[61]); + btf_32_add_sub_avx2(&x1[63], &x1[62]); + + // stage 10 + btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit); + 
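+ // Stage 10 finishes the sixteen odd rotations on x1[32..63]; after the
+ // bit-reversed stage-11 reorder these become the odd-indexed outputs 1, 3, ..., 63.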
btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit); + btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit); + + // stage 11 + output[0] = x1[0]; + output[1] = x1[32]; + output[2] = x1[16]; + output[3] = x1[48]; + output[4] = x1[8]; + output[5] = x1[40]; + output[6] = x1[24]; + output[7] = x1[56]; + output[8] = x1[4]; + output[9] = x1[36]; + output[10] = x1[20]; + output[11] = x1[52]; + output[12] = x1[12]; + output[13] = x1[44]; + output[14] = x1[28]; + output[15] = x1[60]; + output[16] = x1[2]; + output[17] = x1[34]; + output[18] = x1[18]; + output[19] = x1[50]; + output[20] = x1[10]; + output[21] = x1[42]; + output[22] = x1[26]; + output[23] = x1[58]; + output[24] = x1[6]; + output[25] = x1[38]; + output[26] = x1[22]; + output[27] = x1[54]; + output[28] = x1[14]; + output[29] = x1[46]; + output[30] = x1[30]; + output[31] = x1[62]; + output[32] = x1[1]; + output[33] = x1[33]; + output[34] = x1[17]; + output[35] = x1[49]; + output[36] = x1[9]; + output[37] = x1[41]; + output[38] = x1[25]; + output[39] = x1[57]; + output[40] = x1[5]; + output[41] = x1[37]; + output[42] = x1[21]; + output[43] = x1[53]; + output[44] = x1[13]; + output[45] = x1[45]; + output[46] = x1[29]; + output[47] = x1[61]; + output[48] = x1[3]; + output[49] = x1[35]; + output[50] = x1[19]; + output[51] = x1[51]; + output[52] = x1[11]; + output[53] = x1[43]; + output[54] = x1[27]; + output[55] = x1[59]; + output[56] = x1[7]; + output[57] = x1[39]; + output[58] = x1[23]; + output[59] = x1[55]; + output[60] = x1[15]; + output[61] = x1[47]; + output[62] = x1[31]; + output[63] = x1[63]; +} + +static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]); + __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]); + __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]); + __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]); + __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]); + __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]); + __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]); + __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]); + __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]); + __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]); + __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]); + __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]); + __m256i cospi_p26_p38 = 
pair_set_w16_epi16(cospi[26], cospi[38]); + __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]); + __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]); + __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]); + __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]); + __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]); + __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]); + __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]); + __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]); + __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m256i x1[16]; + x1[0] = input[0]; + x1[1] = _mm256_subs_epi16(__zero, input[15]); + x1[2] = _mm256_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm256_subs_epi16(__zero, input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm256_subs_epi16(__zero, input[11]); + x1[8] = _mm256_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm256_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm256_subs_epi16(__zero, input[13]); + x1[14] = _mm256_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit); + + // stage 3 + btf_16_adds_subs_avx2(&x1[0], &x1[2]); + btf_16_adds_subs_avx2(&x1[1], &x1[3]); + btf_16_adds_subs_avx2(&x1[4], &x1[6]); + btf_16_adds_subs_avx2(&x1[5], &x1[7]); + btf_16_adds_subs_avx2(&x1[8], &x1[10]); + btf_16_adds_subs_avx2(&x1[9], &x1[11]); + btf_16_adds_subs_avx2(&x1[12], &x1[14]); + btf_16_adds_subs_avx2(&x1[13], &x1[15]); + + // stage 4 + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit); + + // stage 5 + btf_16_adds_subs_avx2(&x1[0], &x1[4]); + btf_16_adds_subs_avx2(&x1[1], &x1[5]); + btf_16_adds_subs_avx2(&x1[2], &x1[6]); + btf_16_adds_subs_avx2(&x1[3], &x1[7]); + btf_16_adds_subs_avx2(&x1[8], &x1[12]); + btf_16_adds_subs_avx2(&x1[9], &x1[13]); + btf_16_adds_subs_avx2(&x1[10], &x1[14]); + btf_16_adds_subs_avx2(&x1[11], &x1[15]); + + // stage 6 + btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit); + + // stage 7 + btf_16_adds_subs_avx2(&x1[0], &x1[8]); + btf_16_adds_subs_avx2(&x1[1], &x1[9]); + btf_16_adds_subs_avx2(&x1[2], &x1[10]); + btf_16_adds_subs_avx2(&x1[3], &x1[11]); + btf_16_adds_subs_avx2(&x1[4], &x1[12]); + btf_16_adds_subs_avx2(&x1[5], &x1[13]); + btf_16_adds_subs_avx2(&x1[6], &x1[14]); + btf_16_adds_subs_avx2(&x1[7], &x1[15]); + + // stage 8 + btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit); + btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit); + btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit); + 
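+ // fadst16 lattice: the zero-minus-input loads in stage 1 implement the ADST's
+ // sign-flipped input permutation, and the stage-9 reorder below interleaves the
+ // results back into coefficient order.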
btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit); + btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit); + btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit); + btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit); + btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit); + + // stage 9 + output[0] = x1[1]; + output[1] = x1[14]; + output[2] = x1[3]; + output[3] = x1[12]; + output[4] = x1[5]; + output[5] = x1[10]; + output[6] = x1[7]; + output[7] = x1[8]; + output[8] = x1[9]; + output[9] = x1[6]; + output[10] = x1[11]; + output[11] = x1[4]; + output[12] = x1[13]; + output[13] = x1[2]; + output[14] = x1[15]; + output[15] = x1[0]; +} + +static INLINE void fidentity16x16_new_avx2(const __m256i *input, + __m256i *output, int8_t cos_bit) { + (void)cos_bit; + const __m256i one = _mm256_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one); + const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one); + const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); + output[i] = _mm256_packs_epi32(b_lo, b_hi); + } +} + +static INLINE void fidentity16x32_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + for (int i = 0; i < 32; ++i) { + output[i] = _mm256_slli_epi16(input[i], 2); + } +} + +static INLINE void transpose_32_8x8_avx2(int stride, const __m256i *inputA, + __m256i *output) { + __m256i temp0 = _mm256_unpacklo_epi32(inputA[0], inputA[2]); + __m256i temp1 = _mm256_unpackhi_epi32(inputA[0], inputA[2]); + __m256i temp2 = _mm256_unpacklo_epi32(inputA[1], inputA[3]); + __m256i temp3 = _mm256_unpackhi_epi32(inputA[1], inputA[3]); + __m256i temp4 = _mm256_unpacklo_epi32(inputA[4], inputA[6]); + __m256i temp5 = _mm256_unpackhi_epi32(inputA[4], inputA[6]); + __m256i temp6 = _mm256_unpacklo_epi32(inputA[5], inputA[7]); + __m256i temp7 = _mm256_unpackhi_epi32(inputA[5], inputA[7]); + + __m256i t0 = _mm256_unpacklo_epi32(temp0, temp2); + __m256i t1 = _mm256_unpackhi_epi32(temp0, temp2); + __m256i t2 = _mm256_unpacklo_epi32(temp1, temp3); + __m256i t3 = _mm256_unpackhi_epi32(temp1, temp3); + __m256i t4 = _mm256_unpacklo_epi32(temp4, temp6); + __m256i t5 = _mm256_unpackhi_epi32(temp4, temp6); + __m256i t6 = _mm256_unpacklo_epi32(temp5, temp7); + __m256i t7 = _mm256_unpackhi_epi32(temp5, temp7); + + output[0 * stride] = _mm256_permute2x128_si256(t0, t4, 0x20); + output[1 * stride] = _mm256_permute2x128_si256(t1, t5, 0x20); + output[2 * stride] = _mm256_permute2x128_si256(t2, t6, 0x20); + output[3 * stride] = _mm256_permute2x128_si256(t3, t7, 0x20); + output[4 * stride] = _mm256_permute2x128_si256(t0, t4, 0x31); + output[5 * stride] = _mm256_permute2x128_si256(t1, t5, 0x31); + output[6 * stride] = _mm256_permute2x128_si256(t2, t6, 0x31); + output[7 * stride] = _mm256_permute2x128_si256(t3, t7, 0x31); +} + +// Store 8 16 bit values. Sign extend the values. 
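+// (w16 variant: each of the out_size rows stores 16 values as two sign-extended
+// 8-lane halves; the store_rect_* helpers below additionally scale by NewSqrt2,
+// i.e. sqrt(2) in Q12, to renormalize rectangular transforms.)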
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + int32_t *out, + const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out), + _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i]))); + _mm256_store_si256( + (__m256i *)(out + 8), + _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1))); + out += stride; + } +} + +static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a, + int32_t *const b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_reoder = _mm256_permute4x64_epi64(a, 0xd8); + const __m256i a_lo = _mm256_unpacklo_epi16(a_reoder, one); + const __m256i a_hi = _mm256_unpackhi_epi16(a_reoder, one); + const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); + _mm256_store_si256((__m256i *)b, b_lo); + _mm256_store_si256((__m256i *)(b + 8), b_hi); +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2( + const __m256i *const in, int32_t *const out, const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_avx2(in[i], out + i * stride); + } +} + +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + +static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = { + fdct16x32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity16x32_avx2, // IDTX + fdct16x32_avx2, // V_DCT + fidentity16x32_avx2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = { + fdct16x32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity16x32_avx2, // IDTX + fidentity16x32_avx2, // V_DCT + fdct16x32_avx2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = { + fdct16x16_new_avx2, // DCT_DCT + fadst16x16_new_avx2, // ADST_DCT + fdct16x16_new_avx2, // DCT_ADST + fadst16x16_new_avx2, // ADST_ADST + fadst16x16_new_avx2, // FLIPADST_DCT + fdct16x16_new_avx2, // DCT_FLIPADST + fadst16x16_new_avx2, // FLIPADST_FLIPADST + fadst16x16_new_avx2, // ADST_FLIPADST + fadst16x16_new_avx2, // FLIPADST_ADST + fidentity16x16_new_avx2, // IDTX + fdct16x16_new_avx2, // V_DCT + fidentity16x16_new_avx2, // H_DCT + fadst16x16_new_avx2, // V_ADST + fidentity16x16_new_avx2, // H_ADST + fadst16x16_new_avx2, // V_FLIPADST + fidentity16x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = { + fdct16x16_new_avx2, // DCT_DCT + fdct16x16_new_avx2, // ADST_DCT + fadst16x16_new_avx2, // DCT_ADST + fadst16x16_new_avx2, // ADST_ADST + fdct16x16_new_avx2, // FLIPADST_DCT + fadst16x16_new_avx2, // DCT_FLIPADST + fadst16x16_new_avx2, // FLIPADST_FLIPADST + fadst16x16_new_avx2, // ADST_FLIPADST + fadst16x16_new_avx2, // FLIPADST_ADST + fidentity16x16_new_avx2, // IDTX + fidentity16x16_new_avx2, // V_DCT + fdct16x16_new_avx2, // H_DCT + fidentity16x16_new_avx2, // V_ADST + fadst16x16_new_avx2, // H_ADST + fidentity16x16_new_avx2, // V_FLIPADST + fadst16x16_new_avx2 // H_FLIPADST +}; + +static void 
lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_16X16; + __m256i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int32_t i = 0; + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); + + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width, 16); +} + +static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_32X32; + __m256i buf0[32], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, + height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i); + transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i); + } + + for (int i = 0; i < 2; i++) { + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, width, + 16); + transpose_16bit_16x16_avx2(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16 * width * i + 16, + width, 16); + } +} + +static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m256i buf0[64], 
buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(2, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[64]; + __m256i bufB[64]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + fdct64_new_avx2(bufA, bufA, cos_bit_row); + fdct64_new_avx2(bufB, bufB, cos_bit_row); + av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]); + av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 16 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m256i *out = (__m256i *)(output8 + 8 * j); + transpose_32_8x8_avx2(4, bufA + 8 * j, out); + transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4); + } + } +} + +static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_16X32; + __m256i buf0[32], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1); + transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16); + + for (int i = 0; i < 2; i++) { + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * width * i, + width, 16); + } +} + +static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m256i buf0[32], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const int cos_bit_col = 
av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 16; + const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, + height); + } else { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + } + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i); + } + + __m256i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_avx2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + transpose_16bit_16x16_avx2(buf, buf); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, width, 16); + + transpose_16bit_16x16_avx2(buf + 16, buf + 16); + store_rect_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16, width, 16); +} + +static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m256i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type]; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[64]; + __m256i bufB[64]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + fdct64_new_avx2(bufA, bufA, cos_bit_row); + fdct64_new_avx2(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2); + + int32_t *output8 = output + 16 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m256i *out = (__m256i *)(output8 + 8 * j); + transpose_32_8x8_avx2(4, bufA + 8 * j, out); + transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4); + } + } +} + +static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m256i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = 
av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(2, height_div16); ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(2, height_div16); i++) { + __m256i bufA[32]; + __m256i bufB[32]; + __m128i *buf = (__m128i *)(buf1 + width * i); + for (int j = 0; j < width; ++j) { + bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]); + bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]); + } + fdct32_avx2(bufA, bufA, cos_bit_row); + fdct32_avx2(bufB, bufB, cos_bit_row); + av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2); + + int32_t *output8 = output + 16 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m256i *out = (__m256i *)(output8 + 8 * j); + transpose_32_8x8_avx2(4, bufA + 8 * j, out); + transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4); + } + } +} + +static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m256i buf0[64], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x64_new_avx2; + const transform_1d_avx2 row_txfm = fdct16x16_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < height_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div16); i++) { + __m256i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + int32_t *output16 = output + 16 * width * i; + for (int j = 0; j < width_div16; ++j) { + __m256i *buf16 = buf + 16 * j; + transpose_16bit_16x16_avx2(buf16, buf16); + store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, width, 16); + } + } + // Zero out the bottom 16x32 area. 
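+ // (AV1 retains only the lowest 32 of the 64 column-frequency rows for + // 64-point transforms, so rows 32..63 of the 16-wide int32 output are cleared.)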
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m256i buf0[64], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct16x16_new_avx2; + const transform_1d_avx2 row_txfm = fdct16x64_new_avx2; + const int width_div16 = (width >> 4); + const int height_div16 = (height >> 4); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height); + round_shift_16bit_w16_avx2(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit_w16_avx2(buf0, height, shift[1]); + for (int j = 0; j < height_div16; ++j) { + transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i); + } + } + + for (int i = 0; i < height_div16; i++) { + __m256i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit_w16_avx2(buf, width, shift[2]); + int32_t *output16 = output + 16 * 32 * i; + for (int j = 0; j < 2; ++j) { + __m256i *buf16 = buf + 16 * j; + transpose_16bit_16x16_avx2(buf16, buf16); + store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, 32, 16); + } + } +} + +static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0, + __m256i *in1, __m128i *out0, __m128i *out1, + __m128i *out2, __m128i *out3, + const __m256i *__rounding, int8_t *cos_bit) { + __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); + __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); + __m256i u0 = _mm256_madd_epi16(t0, *w0); + __m256i u1 = _mm256_madd_epi16(t1, *w0); + __m256i v0 = _mm256_madd_epi16(t0, *w1); + __m256i v1 = _mm256_madd_epi16(t1, *w1); + + __m256i a0 = _mm256_add_epi32(u0, *__rounding); + __m256i a1 = _mm256_add_epi32(u1, *__rounding); + __m256i b0 = _mm256_add_epi32(v0, *__rounding); + __m256i b1 = _mm256_add_epi32(v1, *__rounding); + + __m256i c0 = _mm256_srai_epi32(a0, *cos_bit); + __m256i c1 = _mm256_srai_epi32(a1, *cos_bit); + __m256i d0 = _mm256_srai_epi32(b0, *cos_bit); + __m256i d1 = _mm256_srai_epi32(b1, *cos_bit); + + __m256i temp0 = _mm256_packs_epi32(c0, c1); + __m256i temp1 = _mm256_packs_epi32(d0, d1); + + *out0 = _mm256_castsi256_si128(temp0); + *out1 = _mm256_castsi256_si128(temp1); + *out2 = _mm256_extracti128_si256(temp0, 0x01); + *out3 = _mm256_extracti128_si256(temp1, 0x01); +} + +static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], 
cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m256i x1[8]; + x1[0] = _mm256_adds_epi16(input[0], input[7]); + x1[7] = _mm256_subs_epi16(input[0], input[7]); + x1[1] = _mm256_adds_epi16(input[1], input[6]); + x1[6] = _mm256_subs_epi16(input[1], input[6]); + x1[2] = _mm256_adds_epi16(input[2], input[5]); + x1[5] = _mm256_subs_epi16(input[2], input[5]); + x1[3] = _mm256_adds_epi16(input[3], input[4]); + x1[4] = _mm256_subs_epi16(input[3], input[4]); + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_adds_epi16(x1[0], x1[3]); + x2[3] = _mm256_subs_epi16(x1[0], x1[3]); + x2[1] = _mm256_adds_epi16(x1[1], x1[2]); + x2[2] = _mm256_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding, + cos_bit); + x2[5] = x1[5]; + x2[6] = x1[6]; + x2[7] = x1[7]; + + // stage 3 + __m256i x3[8]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding, + cos_bit); + x3[0] = x2[0]; + x3[1] = x2[1]; + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding, + cos_bit); + x3[2] = x2[2]; + x3[3] = x2[3]; + x3[4] = _mm256_adds_epi16(x2[4], x2[5]); + x3[5] = _mm256_subs_epi16(x2[4], x2[5]); + x3[6] = _mm256_subs_epi16(x2[7], x2[6]); + x3[7] = _mm256_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding, + cos_bit); + x4[4] = x3[4]; + x4[7] = x3[7]; + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding, + cos_bit); + x4[5] = x3[5]; + x4[6] = x3[6]; + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m256i x1[8]; + x1[0] = input[0]; + x1[1] = _mm256_subs_epi16(__zero, input[7]); + x1[2] = _mm256_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm256_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm256_subs_epi16(__zero, input[5]); + + // stage 2 + __m256i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding, + cos_bit); + x2[2] = x1[2]; + x2[3] = x1[3]; + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_w16_avx2(cospi_p32_p32, 
cospi_p32_m32, &x1[6], &x1[7], __rounding, + cos_bit); + x2[6] = x1[6]; + x2[7] = x1[7]; + + // stage 3 + __m256i x3[8]; + x3[0] = _mm256_adds_epi16(x2[0], x2[2]); + x3[2] = _mm256_subs_epi16(x2[0], x2[2]); + x3[1] = _mm256_adds_epi16(x2[1], x2[3]); + x3[3] = _mm256_subs_epi16(x2[1], x2[3]); + x3[4] = _mm256_adds_epi16(x2[4], x2[6]); + x3[6] = _mm256_subs_epi16(x2[4], x2[6]); + x3[5] = _mm256_adds_epi16(x2[5], x2[7]); + x3[7] = _mm256_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding, + cos_bit); + x4[4] = x3[4]; + x4[5] = x3[5]; + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding, + cos_bit); + x4[6] = x3[6]; + x4[7] = x3[7]; + + // stage 5 + __m256i x5[8]; + x5[0] = _mm256_adds_epi16(x4[0], x4[4]); + x5[4] = _mm256_subs_epi16(x4[0], x4[4]); + x5[1] = _mm256_adds_epi16(x4[1], x4[5]); + x5[5] = _mm256_subs_epi16(x4[1], x4[5]); + x5[2] = _mm256_adds_epi16(x4[2], x4[6]); + x5[6] = _mm256_subs_epi16(x4[2], x4[6]); + x5[3] = _mm256_adds_epi16(x4[3], x4[7]); + x5[7] = _mm256_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m256i x6[8]; + btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding, + cos_bit); + x6[0] = x5[0]; + x6[1] = x5[1]; + btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding, + cos_bit); + x6[2] = x5[2]; + x6[3] = x5[3]; + btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding, + cos_bit); + x6[4] = x5[4]; + x6[5] = x5[5]; + btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding, + cos_bit); + x6[6] = x5[6]; + x6[7] = x5[7]; + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + + output[0] = _mm256_adds_epi16(input[0], input[0]); + output[1] = _mm256_adds_epi16(input[1], input[1]); + output[2] = _mm256_adds_epi16(input[2], input[2]); + output[3] = _mm256_adds_epi16(input[3], input[3]); + output[4] = _mm256_adds_epi16(input[4], input[4]); + output[5] = _mm256_adds_epi16(input[5], input[5]); + output[6] = _mm256_adds_epi16(input[6], input[6]); + output[7] = _mm256_adds_epi16(input[7], input[7]); +} + +static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + __m128i temp0, temp1, temp2, temp3; + __m256i in0, in1; + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + 
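// The cospi_arr[] entries built below stack two different 128-bit weight + // pairs in one 256-bit register, letting a single btf_16_avx2 call apply two + // distinct butterflies at once (one per 128-bit lane). +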
__m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + + __m256i cospi_arr[12]; + + cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32), + cospi_m32_p32, 0x1); + cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p48_p16, 0x1); + cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_m16_p48, 0x1); + cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48), + cospi_m48_m16, 0x1); + cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16), + cospi_m16_p48, 0x1); + cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08), + cospi_p24_p40, 0x1); + cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56), + cospi_m40_p24, 0x1); + cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04), + cospi_p28_p36, 0x1); + cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60), + cospi_m36_p28, 0x1); + cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20), + cospi_p12_p52, 0x1); + cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44), + cospi_m52_p12, 0x1); + + __m256i x[8]; + x[0] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1); + x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14], + 0x1); + x[2] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1); + x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12], + 0x1); + x[4] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1); + x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11], + 0x1); + x[6] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1); + x[7] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1); + + // stage 1 + __m256i x1[8]; + x1[0] = _mm256_adds_epi16(x[0], x[1]); + x1[7] = _mm256_subs_epi16(x[0], x[1]); + x1[1] = _mm256_adds_epi16(x[2], x[3]); + x1[6] = _mm256_subs_epi16(x[2], x[3]); + x1[2] = _mm256_adds_epi16(x[4], x[5]); + x1[5] = _mm256_subs_epi16(x[4], x[5]); + x1[3] = _mm256_adds_epi16(x[6], x[7]); + x1[4] = _mm256_subs_epi16(x[6], x[7]); + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_adds_epi16(x1[0], x1[3]); + x2[7] = _mm256_subs_epi16(x1[0], x1[3]); + x2[1] = _mm256_adds_epi16(x1[1], x1[2]); + x2[6] = _mm256_subs_epi16(x1[1], x1[2]); + x2[2] = x1[4]; + x2[3] = x1[7]; + btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1, + &temp2, &temp3, &__rounding_256, &cos_bit); + x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1); + x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1); + + // stage 3 + __m256i x3[8]; + x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e); + x3[0] = _mm256_adds_epi16(x2[0], x2[1]); + x3[1] = _mm256_subs_epi16(x2[0], x2[1]); + x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]), + 
_mm256_extractf128_si256(x2[7], 0x01), temp0, temp1); + x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1); + x3[3] = _mm256_adds_epi16(x2[2], x2[4]); + x3[4] = _mm256_subs_epi16(x2[2], x2[4]); + x3[5] = _mm256_adds_epi16(x2[3], x2[5]); + x3[6] = _mm256_subs_epi16(x2[3], x2[5]); + + // stage 4 + __m256i x4[8]; + x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0); + x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21); + btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0], + &output[8], &output[4], &output[12], &__rounding_256, &cos_bit); + x4[2] = _mm256_adds_epi16(x3[2], x3[7]); + x4[3] = _mm256_subs_epi16(x3[2], x3[7]); + x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20); + x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20); + in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31); + in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31); + btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + + x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1); + x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1); + + // stage 5 + __m256i x5[4]; + in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31); + in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20); + btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14], + &output[10], &output[6], &__rounding_256, &cos_bit); + x5[0] = _mm256_adds_epi16(x4[4], x4[6]); + x5[1] = _mm256_subs_epi16(x4[4], x4[6]); + x5[2] = _mm256_adds_epi16(x4[5], x4[7]); + x5[3] = _mm256_subs_epi16(x4[5], x4[7]); + + // stage 6 + in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20); + in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31); + btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15], + &output[9], &output[7], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31); + in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20); + btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5], + &output[11], &output[13], &output[3], &__rounding_256, &cos_bit); +} + +static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); + __m256i in0, in1; + __m128i temp0, temp1, temp2, temp3; + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + 
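// As in fdct8x16_new_avx2 above, pairs of 8-lane rows are packed into one + // 256-bit register (see the x[] loads below), halving the butterfly count. +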
__m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + __m256i cospi_arr[20]; + + cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_p32_m32, 0x1); + cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_p32_m32, 0x1); + cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48), + cospi_m48_p16, 0x1); + cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16), + cospi_p16_p48, 0x1); + cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48), + cospi_m48_p16, 0x1); + cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16), + cospi_p16_p48, 0x1); + cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56), + cospi_p40_p24, 0x1); + cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08), + cospi_p24_m40, 0x1); + cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08), + cospi_m24_p40, 0x1); + cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56), + cospi_p40_p24, 0x1); + cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62), + cospi_p10_p54, 0x1); + cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02), + cospi_p54_m10, 0x1); + cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46), + cospi_p26_p38, 0x1); + cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18), + cospi_p38_m26, 0x1); + cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30), + cospi_p42_p22, 0x1); + cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34), + cospi_p22_m42, 0x1); + cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14), + cospi_p58_p06, 0x1); + cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50), + cospi_p06_m58, 0x1); + + __m256i x[8]; + x[0] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1); + x[1] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1); + x[2] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1); + x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14], + 0x1); + x[4] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1); + x[5] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1); + x[6] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1); + x[7] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1); + + // stage 1 + __m256i x1[8]; + x1[0] = x[0]; + x1[1] = _mm256_subs_epi16(__zero, x[7]); + x1[2] = x[2]; + 
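// (These _mm256_subs_epi16(__zero, v) ops are saturating negations: stage 1 + // applies the ADST input permutation together with its sign flips.) +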
x1[3] = _mm256_subs_epi16(__zero, x[5]); + x1[4] = _mm256_subs_epi16(__zero, x[4]); + x1[5] = x[3]; + x1[6] = _mm256_subs_epi16(__zero, x[6]); + x1[7] = x[1]; + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0); + x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0); + x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0); + x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0); + in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0); + in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0); + btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21); + in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21); + btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 3 + __m256i x3[8]; + x3[0] = _mm256_adds_epi16(x2[0], x2[1]); + x3[1] = _mm256_subs_epi16(x2[0], x2[1]); + x3[2] = _mm256_adds_epi16(x2[3], x2[2]); + x3[3] = _mm256_subs_epi16(x2[3], x2[2]); + x3[4] = _mm256_adds_epi16(x2[4], x2[5]); + x3[5] = _mm256_subs_epi16(x2[4], x2[5]); + x3[6] = _mm256_adds_epi16(x2[7], x2[6]); + x3[7] = _mm256_subs_epi16(x2[7], x2[6]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[4] = x3[4]; + x4[5] = x3[5]; + in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20); + in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31); + btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20); + in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31); + btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 5 + __m256i x5[8]; + x5[0] = _mm256_adds_epi16(x4[0], x4[2]); + x5[1] = _mm256_subs_epi16(x4[0], x4[2]); + x5[2] = _mm256_adds_epi16(x4[1], x4[3]); + x5[3] = _mm256_subs_epi16(x4[1], x4[3]); + x5[4] = _mm256_adds_epi16(x4[4], x4[6]); + x5[5] = _mm256_subs_epi16(x4[4], x4[6]); + x5[6] = _mm256_adds_epi16(x4[5], x4[7]); + x5[7] = _mm256_subs_epi16(x4[5], x4[7]); + + // stage 6 + __m256i x6[8]; + x6[0] = x5[0]; + x6[1] = x5[2]; + x6[2] = x5[1]; + x6[3] = x5[3]; + in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20); + in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31); + btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20); + in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31); + btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1, + &temp2, &temp3, &__rounding_256, &cos_bit); + x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 
0x1); + x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 7 + __m256i x7[8]; + x7[0] = _mm256_adds_epi16(x6[0], x6[4]); + x7[1] = _mm256_subs_epi16(x6[0], x6[4]); + x7[2] = _mm256_adds_epi16(x6[1], x6[5]); + x7[3] = _mm256_subs_epi16(x6[1], x6[5]); + x7[4] = _mm256_adds_epi16(x6[2], x6[6]); + x7[5] = _mm256_subs_epi16(x6[2], x6[6]); + x7[6] = _mm256_adds_epi16(x6[3], x6[7]); + x7[7] = _mm256_subs_epi16(x6[3], x6[7]); + + // stage 8 + in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20); + in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31); + btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15], + &output[0], &output[13], &output[2], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20); + in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31); + btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11], + &output[4], &output[9], &output[6], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20); + in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31); + btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7], + &output[8], &output[5], &output[10], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20); + in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31); + btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3], + &output[12], &output[1], &output[14], &__rounding_256, &cos_bit); +} + +static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const __m256i one = _mm256_set1_epi16(1); + __m256i temp; + for (int i = 0; i < 16; i += 2) { + temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]), + input[i + 1], 0x1); + const __m256i a_lo = _mm256_unpacklo_epi16(temp, one); + const __m256i a_hi = _mm256_unpackhi_epi16(temp, one); + const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); + temp = _mm256_packs_epi32(b_lo, b_hi); + output[i] = _mm256_castsi256_si128(temp); + output[i + 1] = _mm256_extractf128_si256(temp, 0x1); + } +} + +static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = { + fdct8x8_new_avx2, // DCT_DCT + fdct8x8_new_avx2, // ADST_DCT + fadst8x8_new_avx2, // DCT_ADST + fadst8x8_new_avx2, // ADST_ADST + fdct8x8_new_avx2, // FLIPADST_DCT + fadst8x8_new_avx2, // DCT_FLIPADST + fadst8x8_new_avx2, // FLIPADST_FLIPADST + fadst8x8_new_avx2, // ADST_FLIPADST + fadst8x8_new_avx2, // FLIPADST_ADST + fidentity8x8_new_avx2, // IDTX + fidentity8x8_new_avx2, // V_DCT + fdct8x8_new_avx2, // H_DCT + fidentity8x8_new_avx2, // V_ADST + fadst8x8_new_avx2, // H_ADST + fidentity8x8_new_avx2, // V_FLIPADST + fadst8x8_new_avx2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_avx2, // DCT_DCT + fadst8x16_new_avx2, // ADST_DCT + fdct8x16_new_avx2, // DCT_ADST + fadst8x16_new_avx2, // ADST_ADST + fadst8x16_new_avx2, // FLIPADST_DCT + fdct8x16_new_avx2, // DCT_FLIPADST + fadst8x16_new_avx2, // FLIPADST_FLIPADST + fadst8x16_new_avx2, // ADST_FLIPADST + fadst8x16_new_avx2, // FLIPADST_ADST + fidentity8x16_new_avx2, // IDTX + fdct8x16_new_avx2, // V_DCT + fidentity8x16_new_avx2, // H_DCT + fadst8x16_new_avx2, // V_ADST + fidentity8x16_new_avx2, // H_ADST + fadst8x16_new_avx2, // V_FLIPADST + fidentity8x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = { + fdct8x8_new_avx2, // DCT_DCT + fadst8x8_new_avx2, // ADST_DCT + 
fdct8x8_new_avx2, // DCT_ADST + fadst8x8_new_avx2, // ADST_ADST + fadst8x8_new_avx2, // FLIPADST_DCT + fdct8x8_new_avx2, // DCT_FLIPADST + fadst8x8_new_avx2, // FLIPADST_FLIPADST + fadst8x8_new_avx2, // ADST_FLIPADST + fadst8x8_new_avx2, // FLIPADST_ADST + fidentity8x8_new_avx2, // IDTX + fdct8x8_new_avx2, // V_DCT + fidentity8x8_new_avx2, // H_DCT + fadst8x8_new_avx2, // V_ADST + fidentity8x8_new_avx2, // H_ADST + fadst8x8_new_avx2, // V_FLIPADST + fidentity8x8_new_avx2, // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = { + fdct8x16_new_avx2, // DCT_DCT + fdct8x16_new_avx2, // ADST_DCT + fadst8x16_new_avx2, // DCT_ADST + fadst8x16_new_avx2, // ADST_ADST + fdct8x16_new_avx2, // FLIPADST_DCT + fadst8x16_new_avx2, // DCT_FLIPADST + fadst8x16_new_avx2, // FLIPADST_FLIPADST + fadst8x16_new_avx2, // ADST_FLIPADST + fadst8x16_new_avx2, // FLIPADST_ADST + fidentity8x16_new_avx2, // IDTX + fidentity8x16_new_avx2, // V_DCT + fdct8x16_new_avx2, // H_DCT + fidentity8x16_new_avx2, // V_ADST + fadst8x16_new_avx2, // H_ADST + fidentity8x16_new_avx2, // V_FLIPADST + fadst8x16_new_avx2 // H_FLIPADST +}; + +static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + __m256i buf2[8]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + + __m128i *bufl, *bufu; + if (lr_flip) { + bufl = buf0; + bufu = buf0 + 8; + flip_buf_sse2(buf1 + width * 0, bufl, width); + flip_buf_sse2(buf1 + width * 1, bufu, width); + } else { + bufl = buf1 + width * 0; + bufu = buf1 + width * 1; + } + pack_reg(bufl, bufu, buf2); + row_txfm(buf2, buf2, cos_bit_row); + round_shift_16bit_w16_avx2(buf2, width, shift[2]); + transpose_16bit_16x8_avx2(buf2, buf2); + store_rect_buffer_16bit_to_32bit_w8_avx2(buf2, output, width, 8); +} + +static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + __m256i buf2[8]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 8; + const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height); + load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], 
height); + } else { + load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height); + load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height); + } + pack_reg(buf0, &buf0[8], buf2); + round_shift_16bit_w16_avx2(buf2, height, shift[0]); + col_txfm(buf2, buf2, cos_bit_col); + round_shift_16bit_w16_avx2(buf2, height, shift[1]); + transpose_16bit_16x8_avx2(buf2, buf2); + extract_reg(buf2, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform + lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + lowbd_fwd_txfm2d_8x16_avx2, // 8x16 transform + lowbd_fwd_txfm2d_16x8_avx2, // 16x8 transform + lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform + lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform + lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform + lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) { + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + } else { + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); + } +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c new file mode 100644 index 000000000..0bc3fbc2d --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm2d_sse4.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/av1_rtcd.h" + +#include "av1/common/enums.h" +#include "av1/common/av1_txfm.h" +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/common/x86/highbd_txfm_utility_sse4.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_txfm1d_sse4.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +static INLINE void int16_array_with_stride_to_int32_array_without_stride( + const int16_t *input, int stride, int32_t *output, int txfm1d_size) { + int r, c; + for (r = 0; r < txfm1d_size; r++) { + for (c = 0; c < txfm1d_size; c++) { + output[r * txfm1d_size + c] = (int32_t)input[r * stride + c]; + } + } +} + +typedef void (*TxfmFuncSSE2)(__m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range); + +static void fdct32_sse4_1(__m128i *input, __m128i *output, const int8_t cos_bit, + const int8_t *stage_range) { + const int txfm_size = 32; + const int num_per_128 = 4; + int col_num = txfm_size / num_per_128; + int col; + (void)stage_range; + for (col = 0; col < col_num; col++) { + av1_fdct32_sse4_1((input + col), (output + col), cos_bit, col_num); + } +} + +static void fdct64_new_sse4_1(__m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + const int txfm_size = 64; + const int num_per_128 = 4; + int col_num = txfm_size / num_per_128; + (void)stage_range; + for (int col = 0; col < col_num; col++) { + av1_fdct64_sse4_1((input + col), (output + col), cos_bit, col_num, col_num); + } +} +static void idtx32x32_sse4_1(__m128i *input, __m128i *output, + const int8_t cos_bit, const int8_t *stage_range) { + (void)stage_range; + + for (int i = 0; i < 8; i++) { + av1_idtx32_sse4_1(&input[i * 32], &output[i * 32], cos_bit, 1); + } +} + +static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { + switch (txfm_type) { + case TXFM_TYPE_DCT32: return fdct32_sse4_1; break; + case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break; + case TXFM_TYPE_IDENTITY32: return idtx32x32_sse4_1; break; + default: assert(0); + } + return NULL; +} + +static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, + const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + // TODO(sarahparker) This does not currently support rectangular transforms + // and will break without splitting txfm_size out into row and col size. + // Rectangular transforms use c code only, so it should be ok for now. + // It will be corrected when there are sse implementations for rectangular + // transforms. 
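+ // The assert below reflects this: TX_SIZES counts only the square sizes, + // the rectangular ones lying beyond it in TX_SIZES_ALL.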
+ assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t *stage_range_row = cfg->stage_range_row; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row); + + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, + txfm_size); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); + av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); + transpose_32(txfm_size, buf_128, out_128); +} + +static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input, + int32_t *output, const int stride, + const TXFM_2D_FLIP_CFG *cfg, + int32_t *txfm_buf) { + assert(cfg->tx_size < TX_SIZES); + const int txfm_size = tx_size_wide[cfg->tx_size]; + const int8_t *shift = cfg->shift; + const int8_t *stage_range_col = cfg->stage_range_col; + const int8_t cos_bit_col = cfg->cos_bit_col; + const int8_t cos_bit_row = cfg->cos_bit_row; + const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col); + __m128i *buf_128 = (__m128i *)txfm_buf; + __m128i *out_128 = (__m128i *)output; + + const int num_per_128 = 4; + int txfm2d_size_128 = txfm_size * txfm_size / num_per_128; + int col_num = txfm_size / num_per_128; + + int16_array_with_stride_to_int32_array_without_stride(input, stride, output, + txfm_size); + /*col wise transform*/ + txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); + av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); + transpose_32(txfm_size, out_128, buf_128); + + /*row wise transform*/ + for (int col = 0; col < (col_num >> 1); col++) { + av1_fdct64_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row, col_num, + (col_num >> 1)); + } + + txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1); + av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); + transpose_8nx8n(buf_128, out_128, 32, 32); +} + +void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg); + (void)bd; + fwd_txfm2d_sse4_1(input, output, stride, &cfg, txfm_buf); +} + +void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]); + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg); + (void)bd; + fwd_txfm2d_64x64_sse4_1(input, output, stride, &cfg, txfm_buf); +} + +static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA, + const __m128i *inputB, __m128i *output) { + __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]); + __m128i temp1 = _mm_unpackhi_epi32(inputA[0], inputA[2]); + __m128i temp2 = _mm_unpacklo_epi32(inputA[1], inputA[3]); + 
__m128i temp3 = _mm_unpackhi_epi32(inputA[1], inputA[3]); + + output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); + + temp0 = _mm_unpacklo_epi32(inputB[0], inputB[2]); + temp1 = _mm_unpackhi_epi32(inputB[0], inputB[2]); + temp2 = _mm_unpacklo_epi32(inputB[1], inputB[3]); + temp3 = _mm_unpackhi_epi32(inputB[1], inputB[3]); + + output[4 * stride] = _mm_unpacklo_epi32(temp0, temp2); + output[5 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[6 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[7 * stride] = _mm_unpackhi_epi32(temp1, temp3); +} + +static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m128i buf0[64], buf1[512]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1); + av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]); + av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + const TX_SIZE tx_size = TX_64X32; + __m128i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + 
transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + assert(tx_type == DCT_DCT); + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[64]; + __m128i bufB[64]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct64_sse4_1(bufA, bufA, cos_bit_row, 1, 1); + av1_fdct64_sse4_1(bufB, bufB, cos_bit_row, 1, 1); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_32X64; + __m128i buf0[64], buf1[256]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < AOMMIN(4, height_div8); ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i bufA[32]; + __m128i bufB[32]; + __m128i *buf = buf1 + width * i; + for (int j = 0; j < width; ++j) { + bufA[j] = _mm_cvtepi16_epi32(buf[j]); + bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j])); + } + av1_fdct32_sse4_1(bufA, bufA, cos_bit_row, 1); + av1_fdct32_sse4_1(bufB, bufB, cos_bit_row, 1); + av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2); + av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2); + + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < (32 / 4); ++j) { + __m128i *out = (__m128i *)(output8 + 4 * j); + transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out); + } + } +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform + lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 
8x32 transform
+  av1_lowbd_fwd_txfm2d_32x8_sse2,   // 32x8 transform
+  av1_lowbd_fwd_txfm2d_16x64_sse2,  // 16x64 transform
+  av1_lowbd_fwd_txfm2d_64x16_sse2,  // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff,
+                               int diff_stride, TxfmParam *txfm_param) {
+  FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
+  // Fall back to C for lossless 4x4 (Walsh-Hadamard) and for any size
+  // without a SIMD kernel in the table above.
+  if ((fwd_txfm2d_func == NULL) ||
+      (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) {
+    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+  } else {
+    fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
+                    txfm_param->bd);
+  }
+}
diff --git a/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_avx2.h b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_avx2.h
new file mode 100644
index 000000000..aaad76e5a
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#include <immintrin.h>
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  __m256i _in0 = *in0;
+  __m256i _in1 = *in1;
+  const __m256i ww0 = _mm256_set1_epi32(w0);
+  const __m256i ww1 = _mm256_set1_epi32(w1);
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+  __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+  temp0 = _mm256_add_epi32(temp0, _r);
+  *in0 = _mm256_srai_epi32(temp0, cos_bit);
+  const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+  const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+  __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0);
+  temp1 = _mm256_add_epi32(temp1, _r);
+  *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  __m256i _in0 = *in0;
+  __m256i _in1 = *in1;
+  const __m256i ww0 = _mm256_set1_epi32(w0);
+  const __m256i ww1 = _mm256_set1_epi32(w1);
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
+  __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1);
+  temp0 = _mm256_add_epi32(temp0, _r);
+  *in0 = _mm256_srai_epi32(temp0, cos_bit);
+  const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1);
+  const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0);
+  __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1);
+  temp1 = _mm256_add_epi32(temp1, _r);
+  *in1 = _mm256_srai_epi32(temp1, cos_bit);
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1,
+                                         __m256i *in0, __m256i *in1,
+                                         const __m256i _r,
+                                         const int32_t cos_bit) {
+  __m256i _in0 = *in0;
+  __m256i _in1 = *in1;
+  const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0);
+  const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1);
__m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in0_w1, in1_w0); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1, + __m256i *in0, __m256i *in1, + const __m256i _r, + const int32_t cos_bit) { + __m256i _in0 = *in0; + __m256i _in1 = *in1; + const __m256i in0_w0 = _mm256_mullo_epi32(_in0, ww0); + const __m256i in1_w1 = _mm256_mullo_epi32(_in1, ww1); + __m256i temp0 = _mm256_add_epi32(in0_w0, in1_w1); + temp0 = _mm256_add_epi32(temp0, _r); + *in0 = _mm256_srai_epi32(temp0, cos_bit); + const __m256i in0_w1 = _mm256_mullo_epi32(_in0, ww1); + const __m256i in1_w0 = _mm256_mullo_epi32(_in1, ww0); + __m256i temp1 = _mm256_sub_epi32(in1_w0, in0_w1); + temp1 = _mm256_add_epi32(temp1, _r); + *in1 = _mm256_srai_epi32(temp1, cos_bit); +} + +#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_ diff --git a/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.c b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.c new file mode 100644 index 000000000..694e6131c --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.c @@ -0,0 +1,2891 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/x86/av1_txfm_sse2.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "av1/encoder/x86/av1_fwd_txfm_sse2.h" + +// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible). 
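+//
+// All of the 16-bit kernels below share one butterfly idiom: a pair of
+// cosine weights is packed into adjacent 16-bit lanes with pair_set_epi16(),
+// the two inputs are interleaved with _mm_unpacklo_epi16(), and
+// _mm_madd_epi16() then yields in0 * w0 + in1 * w1 in each 32-bit lane;
+// adding the 1 << (cos_bit - 1) rounding term and shifting right by cos_bit
+// finishes one rotation output, as fdct4x4_new_sse2() below shows.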
+ +static void fdct4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + __m128i u[4], v[4]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[3], input[2]); + + v[0] = _mm_add_epi16(u[0], u[1]); + v[1] = _mm_sub_epi16(u[0], u[1]); + + u[0] = _mm_madd_epi16(v[0], cospi_p32_p32); // 0 + u[1] = _mm_madd_epi16(v[0], cospi_p32_m32); // 2 + u[2] = _mm_madd_epi16(v[1], cospi_p16_p48); // 1 + u[3] = _mm_madd_epi16(v[1], cospi_p48_m16); // 3 + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[3], __rounding); + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[1]); + output[1] = _mm_packs_epi32(u[2], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fdct8x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + + // stage 1 + __m128i x1[4]; + x1[0] = _mm_adds_epi16(input[0], input[3]); + x1[3] = _mm_subs_epi16(input[0], input[3]); + x1[1] = _mm_adds_epi16(input[1], input[2]); + x1[2] = _mm_subs_epi16(input[1], input[2]); + + // stage 2 + __m128i x2[4]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]); + + // stage 3 + output[0] = x2[0]; + output[1] = x2[2]; + output[2] = x2[1]; + output[3] = x2[3]; +} + +static void fdct4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + 
__m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_w4_sse2(&cospi_m32_p32, &cospi_p32_p32, __rounding, cos_bit, &x1[5], + &x1[6], &x2[5], &x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x2[0], + &x2[1], &x3[0], &x3[1]); + btf_16_w4_sse2(&cospi_p48_p16, &cospi_m16_p48, __rounding, cos_bit, &x2[2], + &x2[3], &x3[2], &x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p56_p08, &cospi_m08_p56, __rounding, cos_bit, &x3[4], + &x3[7], &x4[4], &x4[7]); + btf_16_w4_sse2(&cospi_p24_p40, &cospi_m40_p24, __rounding, cos_bit, &x3[5], + &x3[6], &x4[5], &x4[6]); + + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static void fdct8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m128i x1[8]; + x1[0] = _mm_adds_epi16(input[0], input[7]); + x1[7] = _mm_subs_epi16(input[0], input[7]); + x1[1] = _mm_adds_epi16(input[1], input[6]); + x1[6] = _mm_subs_epi16(input[1], input[6]); + x1[2] = _mm_adds_epi16(input[2], input[5]); + x1[5] = _mm_subs_epi16(input[2], input[5]); + x1[3] = _mm_adds_epi16(input[3], input[4]); + x1[4] = _mm_subs_epi16(input[3], input[4]); + + // stage 2 + __m128i x2[8]; + x2[0] = _mm_adds_epi16(x1[0], x1[3]); + x2[3] = _mm_subs_epi16(x1[0], x1[3]); + x2[1] = _mm_adds_epi16(x1[1], x1[2]); + x2[2] = _mm_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]); + x2[7] = x1[7]; + + // stage 3 + __m128i x3[8]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[5]); + x3[5] = _mm_subs_epi16(x2[4], x2[5]); + x3[6] = _mm_subs_epi16(x2[7], x2[6]); + x3[7] = _mm_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], x4[4], x4[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], x4[5], x4[6]); + + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static void fdct8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { 
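+  // 16-point DCT: stages 1-6 alternate add/sub butterflies with cosine
+  // rotations (btf_16_sse2), and stage 7 gathers the bit-reversed
+  // intermediates (x6[0], x6[8], x6[4], x6[12], ...) back into natural
+  // frequency order.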
+ const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + + // stage 1 + __m128i x1[16]; + x1[0] = _mm_adds_epi16(input[0], input[15]); + x1[15] = _mm_subs_epi16(input[0], input[15]); + x1[1] = _mm_adds_epi16(input[1], input[14]); + x1[14] = _mm_subs_epi16(input[1], input[14]); + x1[2] = _mm_adds_epi16(input[2], input[13]); + x1[13] = _mm_subs_epi16(input[2], input[13]); + x1[3] = _mm_adds_epi16(input[3], input[12]); + x1[12] = _mm_subs_epi16(input[3], input[12]); + x1[4] = _mm_adds_epi16(input[4], input[11]); + x1[11] = _mm_subs_epi16(input[4], input[11]); + x1[5] = _mm_adds_epi16(input[5], input[10]); + x1[10] = _mm_subs_epi16(input[5], input[10]); + x1[6] = _mm_adds_epi16(input[6], input[9]); + x1[9] = _mm_subs_epi16(input[6], input[9]); + x1[7] = _mm_adds_epi16(input[7], input[8]); + x1[8] = _mm_subs_epi16(input[7], input[8]); + + // stage 2 + __m128i x2[16]; + x2[0] = _mm_adds_epi16(x1[0], x1[7]); + x2[7] = _mm_subs_epi16(x1[0], x1[7]); + x2[1] = _mm_adds_epi16(x1[1], x1[6]); + x2[6] = _mm_subs_epi16(x1[1], x1[6]); + x2[2] = _mm_adds_epi16(x1[2], x1[5]); + x2[5] = _mm_subs_epi16(x1[2], x1[5]); + x2[3] = _mm_adds_epi16(x1[3], x1[4]); + x2[4] = _mm_subs_epi16(x1[3], x1[4]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]); + x2[14] = x1[14]; + x2[15] = x1[15]; + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[3]); + x3[3] = _mm_subs_epi16(x2[0], x2[3]); + x3[1] = _mm_adds_epi16(x2[1], x2[2]); + x3[2] = _mm_subs_epi16(x2[1], x2[2]); + x3[4] = x2[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]); + x3[7] = x2[7]; + x3[8] = _mm_adds_epi16(x2[8], x2[11]); + x3[11] = _mm_subs_epi16(x2[8], x2[11]); + x3[9] = _mm_adds_epi16(x2[9], x2[10]); + x3[10] = _mm_subs_epi16(x2[9], x2[10]); + x3[12] = _mm_subs_epi16(x2[15], x2[12]); + x3[15] = _mm_adds_epi16(x2[15], x2[12]); + x3[13] = _mm_subs_epi16(x2[14], x2[13]); + x3[14] = _mm_adds_epi16(x2[14], x2[13]); + + // stage 4 + __m128i x4[16]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]); + x4[4] = _mm_adds_epi16(x3[4], x3[5]); + x4[5] = _mm_subs_epi16(x3[4], x3[5]); + x4[6] = 
_mm_subs_epi16(x3[7], x3[6]); + x4[7] = _mm_adds_epi16(x3[7], x3[6]); + x4[8] = x3[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]); + x4[11] = x3[11]; + x4[12] = x3[12]; + x4[15] = x3[15]; + + // stage 5 + __m128i x5[16]; + x5[0] = x4[0]; + x5[1] = x4[1]; + x5[2] = x4[2]; + x5[3] = x4[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]); + x5[8] = _mm_adds_epi16(x4[8], x4[9]); + x5[9] = _mm_subs_epi16(x4[8], x4[9]); + x5[10] = _mm_subs_epi16(x4[11], x4[10]); + x5[11] = _mm_adds_epi16(x4[11], x4[10]); + x5[12] = _mm_adds_epi16(x4[12], x4[13]); + x5[13] = _mm_subs_epi16(x4[12], x4[13]); + x5[14] = _mm_subs_epi16(x4[15], x4[14]); + x5[15] = _mm_adds_epi16(x4[15], x4[14]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]); + + // stage 7 + output[0] = x6[0]; + output[1] = x6[8]; + output[2] = x6[4]; + output[3] = x6[12]; + output[4] = x6[2]; + output[5] = x6[10]; + output[6] = x6[6]; + output[7] = x6[14]; + output[8] = x6[1]; + output[9] = x6[9]; + output[10] = x6[5]; + output[11] = x6[13]; + output[12] = x6[3]; + output[13] = x6[11]; + output[14] = x6[7]; + output[15] = x6[15]; +} + +void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 
= pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + + // stage 1 + __m128i x1[32]; + x1[0] = _mm_adds_epi16(input[0], input[31]); + x1[31] = _mm_subs_epi16(input[0], input[31]); + x1[1] = _mm_adds_epi16(input[1], input[30]); + x1[30] = _mm_subs_epi16(input[1], input[30]); + x1[2] = _mm_adds_epi16(input[2], input[29]); + x1[29] = _mm_subs_epi16(input[2], input[29]); + x1[3] = _mm_adds_epi16(input[3], input[28]); + x1[28] = _mm_subs_epi16(input[3], input[28]); + x1[4] = _mm_adds_epi16(input[4], input[27]); + x1[27] = _mm_subs_epi16(input[4], input[27]); + x1[5] = _mm_adds_epi16(input[5], input[26]); + x1[26] = _mm_subs_epi16(input[5], input[26]); + x1[6] = _mm_adds_epi16(input[6], input[25]); + x1[25] = _mm_subs_epi16(input[6], input[25]); + x1[7] = _mm_adds_epi16(input[7], input[24]); + x1[24] = _mm_subs_epi16(input[7], input[24]); + x1[8] = _mm_adds_epi16(input[8], input[23]); + x1[23] = _mm_subs_epi16(input[8], input[23]); + x1[9] = _mm_adds_epi16(input[9], input[22]); + x1[22] = _mm_subs_epi16(input[9], input[22]); + x1[10] = _mm_adds_epi16(input[10], input[21]); + x1[21] = _mm_subs_epi16(input[10], input[21]); + x1[11] = _mm_adds_epi16(input[11], input[20]); + x1[20] = _mm_subs_epi16(input[11], input[20]); + x1[12] = _mm_adds_epi16(input[12], input[19]); + x1[19] = _mm_subs_epi16(input[12], input[19]); + x1[13] = _mm_adds_epi16(input[13], input[18]); + x1[18] = _mm_subs_epi16(input[13], input[18]); + x1[14] = _mm_adds_epi16(input[14], input[17]); + x1[17] = _mm_subs_epi16(input[14], input[17]); + x1[15] = _mm_adds_epi16(input[15], input[16]); + x1[16] = _mm_subs_epi16(input[15], input[16]); + + // stage 2 + __m128i x2[32]; + x2[0] = _mm_adds_epi16(x1[0], x1[15]); + x2[15] = _mm_subs_epi16(x1[0], x1[15]); + x2[1] = _mm_adds_epi16(x1[1], x1[14]); + x2[14] = _mm_subs_epi16(x1[1], x1[14]); + x2[2] = _mm_adds_epi16(x1[2], x1[13]); + x2[13] = _mm_subs_epi16(x1[2], x1[13]); + x2[3] = _mm_adds_epi16(x1[3], x1[12]); + x2[12] = _mm_subs_epi16(x1[3], x1[12]); + x2[4] = _mm_adds_epi16(x1[4], x1[11]); + x2[11] = _mm_subs_epi16(x1[4], x1[11]); + x2[5] = _mm_adds_epi16(x1[5], x1[10]); + x2[10] = _mm_subs_epi16(x1[5], x1[10]); + x2[6] = _mm_adds_epi16(x1[6], x1[9]); + x2[9] = _mm_subs_epi16(x1[6], x1[9]); + x2[7] = _mm_adds_epi16(x1[7], x1[8]); + x2[8] = _mm_subs_epi16(x1[7], x1[8]); + x2[16] = x1[16]; + x2[17] = x1[17]; + x2[18] = x1[18]; + x2[19] = x1[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]); + x2[28] = x1[28]; + x2[29] = x1[29]; + x2[30] = x1[30]; + x2[31] = x1[31]; + + // stage 3 + __m128i x3[32]; + x3[0] = _mm_adds_epi16(x2[0], x2[7]); + x3[7] = _mm_subs_epi16(x2[0], x2[7]); + x3[1] = _mm_adds_epi16(x2[1], 
x2[6]); + x3[6] = _mm_subs_epi16(x2[1], x2[6]); + x3[2] = _mm_adds_epi16(x2[2], x2[5]); + x3[5] = _mm_subs_epi16(x2[2], x2[5]); + x3[3] = _mm_adds_epi16(x2[3], x2[4]); + x3[4] = _mm_subs_epi16(x2[3], x2[4]); + x3[8] = x2[8]; + x3[9] = x2[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]); + x3[14] = x2[14]; + x3[15] = x2[15]; + x3[16] = _mm_adds_epi16(x2[16], x2[23]); + x3[23] = _mm_subs_epi16(x2[16], x2[23]); + x3[17] = _mm_adds_epi16(x2[17], x2[22]); + x3[22] = _mm_subs_epi16(x2[17], x2[22]); + x3[18] = _mm_adds_epi16(x2[18], x2[21]); + x3[21] = _mm_subs_epi16(x2[18], x2[21]); + x3[19] = _mm_adds_epi16(x2[19], x2[20]); + x3[20] = _mm_subs_epi16(x2[19], x2[20]); + x3[24] = _mm_subs_epi16(x2[31], x2[24]); + x3[31] = _mm_adds_epi16(x2[31], x2[24]); + x3[25] = _mm_subs_epi16(x2[30], x2[25]); + x3[30] = _mm_adds_epi16(x2[30], x2[25]); + x3[26] = _mm_subs_epi16(x2[29], x2[26]); + x3[29] = _mm_adds_epi16(x2[29], x2[26]); + x3[27] = _mm_subs_epi16(x2[28], x2[27]); + x3[28] = _mm_adds_epi16(x2[28], x2[27]); + + // stage 4 + __m128i x4[32]; + x4[0] = _mm_adds_epi16(x3[0], x3[3]); + x4[3] = _mm_subs_epi16(x3[0], x3[3]); + x4[1] = _mm_adds_epi16(x3[1], x3[2]); + x4[2] = _mm_subs_epi16(x3[1], x3[2]); + x4[4] = x3[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]); + x4[7] = x3[7]; + x4[8] = _mm_adds_epi16(x3[8], x3[11]); + x4[11] = _mm_subs_epi16(x3[8], x3[11]); + x4[9] = _mm_adds_epi16(x3[9], x3[10]); + x4[10] = _mm_subs_epi16(x3[9], x3[10]); + x4[12] = _mm_subs_epi16(x3[15], x3[12]); + x4[15] = _mm_adds_epi16(x3[15], x3[12]); + x4[13] = _mm_subs_epi16(x3[14], x3[13]); + x4[14] = _mm_adds_epi16(x3[14], x3[13]); + x4[16] = x3[16]; + x4[17] = x3[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]); + x4[22] = x3[22]; + x4[23] = x3[23]; + x4[24] = x3[24]; + x4[25] = x3[25]; + x4[30] = x3[30]; + x4[31] = x3[31]; + + // stage 5 + __m128i x5[32]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]); + x5[4] = _mm_adds_epi16(x4[4], x4[5]); + x5[5] = _mm_subs_epi16(x4[4], x4[5]); + x5[6] = _mm_subs_epi16(x4[7], x4[6]); + x5[7] = _mm_adds_epi16(x4[7], x4[6]); + x5[8] = x4[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]); + x5[11] = x4[11]; + x5[12] = x4[12]; + x5[15] = x4[15]; + x5[16] = _mm_adds_epi16(x4[16], x4[19]); + x5[19] = _mm_subs_epi16(x4[16], x4[19]); + x5[17] = _mm_adds_epi16(x4[17], x4[18]); + x5[18] = _mm_subs_epi16(x4[17], x4[18]); + x5[20] = _mm_subs_epi16(x4[23], x4[20]); + x5[23] = _mm_adds_epi16(x4[23], x4[20]); + x5[21] = _mm_subs_epi16(x4[22], x4[21]); + x5[22] = _mm_adds_epi16(x4[22], x4[21]); + x5[24] = _mm_adds_epi16(x4[24], x4[27]); + x5[27] = _mm_subs_epi16(x4[24], x4[27]); + x5[25] = _mm_adds_epi16(x4[25], x4[26]); + x5[26] = _mm_subs_epi16(x4[25], x4[26]); + x5[28] = _mm_subs_epi16(x4[31], x4[28]); + x5[31] = _mm_adds_epi16(x4[31], x4[28]); + x5[29] = _mm_subs_epi16(x4[30], x4[29]); + x5[30] = _mm_adds_epi16(x4[30], x4[29]); + + // stage 6 + __m128i x6[32]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = 
x5[2]; + x6[3] = x5[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]); + x6[8] = _mm_adds_epi16(x5[8], x5[9]); + x6[9] = _mm_subs_epi16(x5[8], x5[9]); + x6[10] = _mm_subs_epi16(x5[11], x5[10]); + x6[11] = _mm_adds_epi16(x5[11], x5[10]); + x6[12] = _mm_adds_epi16(x5[12], x5[13]); + x6[13] = _mm_subs_epi16(x5[12], x5[13]); + x6[14] = _mm_subs_epi16(x5[15], x5[14]); + x6[15] = _mm_adds_epi16(x5[15], x5[14]); + x6[16] = x5[16]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]); + x6[19] = x5[19]; + x6[20] = x5[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]); + x6[23] = x5[23]; + x6[24] = x5[24]; + x6[27] = x5[27]; + x6[28] = x5[28]; + x6[31] = x5[31]; + + // stage 7 + __m128i x7[32]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + x7[4] = x6[4]; + x7[5] = x6[5]; + x7[6] = x6[6]; + x7[7] = x6[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]); + x7[16] = _mm_adds_epi16(x6[16], x6[17]); + x7[17] = _mm_subs_epi16(x6[16], x6[17]); + x7[18] = _mm_subs_epi16(x6[19], x6[18]); + x7[19] = _mm_adds_epi16(x6[19], x6[18]); + x7[20] = _mm_adds_epi16(x6[20], x6[21]); + x7[21] = _mm_subs_epi16(x6[20], x6[21]); + x7[22] = _mm_subs_epi16(x6[23], x6[22]); + x7[23] = _mm_adds_epi16(x6[23], x6[22]); + x7[24] = _mm_adds_epi16(x6[24], x6[25]); + x7[25] = _mm_subs_epi16(x6[24], x6[25]); + x7[26] = _mm_subs_epi16(x6[27], x6[26]); + x7[27] = _mm_adds_epi16(x6[27], x6[26]); + x7[28] = _mm_adds_epi16(x6[28], x6[29]); + x7[29] = _mm_subs_epi16(x6[28], x6[29]); + x7[30] = _mm_subs_epi16(x6[31], x6[30]); + x7[31] = _mm_adds_epi16(x6[31], x6[30]); + + // stage 8 + __m128i x8[32]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + x8[8] = x7[8]; + x8[9] = x7[9]; + x8[10] = x7[10]; + x8[11] = x7[11]; + x8[12] = x7[12]; + x8[13] = x7[13]; + x8[14] = x7[14]; + x8[15] = x7[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]); + + // stage 9 + output[0] = x8[0]; + output[1] = x8[16]; + output[2] = x8[8]; + output[3] = x8[24]; + output[4] = x8[4]; + output[5] = x8[20]; + output[6] = x8[12]; + output[7] = x8[28]; + output[8] = x8[2]; + output[9] = x8[18]; + output[10] = x8[10]; + output[11] = x8[26]; + output[12] = x8[6]; + output[13] = x8[22]; + output[14] = x8[14]; + output[15] = x8[30]; + output[16] = x8[1]; + output[17] = x8[17]; + output[18] = x8[9]; + output[19] = x8[25]; + output[20] = x8[5]; + output[21] 
= x8[21]; + output[22] = x8[13]; + output[23] = x8[29]; + output[24] = x8[3]; + output[25] = x8[19]; + output[26] = x8[11]; + output[27] = x8[27]; + output[28] = x8[7]; + output[29] = x8[23]; + output[30] = x8[15]; + output[31] = x8[31]; +} + +void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]); + __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]); + __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]); + __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]); + __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]); + __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]); + __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]); + __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]); + __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]); + __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]); + __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]); + __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]); + __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]); + __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]); + __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]); + __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]); + __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]); + __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]); + __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]); + __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]); + __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]); + __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]); + __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]); + __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]); + __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]); + __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]); + __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]); + __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]); + __m128i 
cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]); + __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]); + __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]); + __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]); + __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]); + __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]); + __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]); + __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]); + __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]); + __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]); + __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]); + __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]); + __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]); + __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]); + __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]); + __m128i cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]); + __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]); + __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]); + __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]); + __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]); + __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]); + __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]); + __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]); + __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]); + + // stage 1 + __m128i x1[64]; + x1[0] = _mm_adds_epi16(input[0], input[63]); + x1[63] = _mm_subs_epi16(input[0], input[63]); + x1[1] = _mm_adds_epi16(input[1], input[62]); + x1[62] = _mm_subs_epi16(input[1], input[62]); + x1[2] = _mm_adds_epi16(input[2], input[61]); + x1[61] = _mm_subs_epi16(input[2], input[61]); + x1[3] = _mm_adds_epi16(input[3], input[60]); + x1[60] = _mm_subs_epi16(input[3], input[60]); + x1[4] = _mm_adds_epi16(input[4], input[59]); + x1[59] = _mm_subs_epi16(input[4], input[59]); + x1[5] = _mm_adds_epi16(input[5], input[58]); + x1[58] = _mm_subs_epi16(input[5], input[58]); + x1[6] = _mm_adds_epi16(input[6], input[57]); + x1[57] = _mm_subs_epi16(input[6], input[57]); + x1[7] = _mm_adds_epi16(input[7], input[56]); + x1[56] = _mm_subs_epi16(input[7], input[56]); + x1[8] = _mm_adds_epi16(input[8], input[55]); + x1[55] = _mm_subs_epi16(input[8], input[55]); + x1[9] = _mm_adds_epi16(input[9], input[54]); + x1[54] = _mm_subs_epi16(input[9], input[54]); + x1[10] = _mm_adds_epi16(input[10], input[53]); + x1[53] = _mm_subs_epi16(input[10], input[53]); + x1[11] = _mm_adds_epi16(input[11], input[52]); + x1[52] = _mm_subs_epi16(input[11], input[52]); + x1[12] = _mm_adds_epi16(input[12], input[51]); + x1[51] = _mm_subs_epi16(input[12], input[51]); + x1[13] = _mm_adds_epi16(input[13], input[50]); + x1[50] = _mm_subs_epi16(input[13], input[50]); + x1[14] = _mm_adds_epi16(input[14], input[49]); + x1[49] = _mm_subs_epi16(input[14], input[49]); + x1[15] = _mm_adds_epi16(input[15], input[48]); + x1[48] = _mm_subs_epi16(input[15], input[48]); + x1[16] = _mm_adds_epi16(input[16], input[47]); + x1[47] = _mm_subs_epi16(input[16], input[47]); + x1[17] = _mm_adds_epi16(input[17], input[46]); + x1[46] = _mm_subs_epi16(input[17], input[46]); + x1[18] = _mm_adds_epi16(input[18], input[45]); + x1[45] = _mm_subs_epi16(input[18], input[45]); + x1[19] = _mm_adds_epi16(input[19], input[44]); + x1[44] = _mm_subs_epi16(input[19], input[44]); + x1[20] = _mm_adds_epi16(input[20], input[43]); + x1[43] = 
_mm_subs_epi16(input[20], input[43]); + x1[21] = _mm_adds_epi16(input[21], input[42]); + x1[42] = _mm_subs_epi16(input[21], input[42]); + x1[22] = _mm_adds_epi16(input[22], input[41]); + x1[41] = _mm_subs_epi16(input[22], input[41]); + x1[23] = _mm_adds_epi16(input[23], input[40]); + x1[40] = _mm_subs_epi16(input[23], input[40]); + x1[24] = _mm_adds_epi16(input[24], input[39]); + x1[39] = _mm_subs_epi16(input[24], input[39]); + x1[25] = _mm_adds_epi16(input[25], input[38]); + x1[38] = _mm_subs_epi16(input[25], input[38]); + x1[26] = _mm_adds_epi16(input[26], input[37]); + x1[37] = _mm_subs_epi16(input[26], input[37]); + x1[27] = _mm_adds_epi16(input[27], input[36]); + x1[36] = _mm_subs_epi16(input[27], input[36]); + x1[28] = _mm_adds_epi16(input[28], input[35]); + x1[35] = _mm_subs_epi16(input[28], input[35]); + x1[29] = _mm_adds_epi16(input[29], input[34]); + x1[34] = _mm_subs_epi16(input[29], input[34]); + x1[30] = _mm_adds_epi16(input[30], input[33]); + x1[33] = _mm_subs_epi16(input[30], input[33]); + x1[31] = _mm_adds_epi16(input[31], input[32]); + x1[32] = _mm_subs_epi16(input[31], input[32]); + + // stage 2 + __m128i x2[64]; + x2[0] = _mm_adds_epi16(x1[0], x1[31]); + x2[31] = _mm_subs_epi16(x1[0], x1[31]); + x2[1] = _mm_adds_epi16(x1[1], x1[30]); + x2[30] = _mm_subs_epi16(x1[1], x1[30]); + x2[2] = _mm_adds_epi16(x1[2], x1[29]); + x2[29] = _mm_subs_epi16(x1[2], x1[29]); + x2[3] = _mm_adds_epi16(x1[3], x1[28]); + x2[28] = _mm_subs_epi16(x1[3], x1[28]); + x2[4] = _mm_adds_epi16(x1[4], x1[27]); + x2[27] = _mm_subs_epi16(x1[4], x1[27]); + x2[5] = _mm_adds_epi16(x1[5], x1[26]); + x2[26] = _mm_subs_epi16(x1[5], x1[26]); + x2[6] = _mm_adds_epi16(x1[6], x1[25]); + x2[25] = _mm_subs_epi16(x1[6], x1[25]); + x2[7] = _mm_adds_epi16(x1[7], x1[24]); + x2[24] = _mm_subs_epi16(x1[7], x1[24]); + x2[8] = _mm_adds_epi16(x1[8], x1[23]); + x2[23] = _mm_subs_epi16(x1[8], x1[23]); + x2[9] = _mm_adds_epi16(x1[9], x1[22]); + x2[22] = _mm_subs_epi16(x1[9], x1[22]); + x2[10] = _mm_adds_epi16(x1[10], x1[21]); + x2[21] = _mm_subs_epi16(x1[10], x1[21]); + x2[11] = _mm_adds_epi16(x1[11], x1[20]); + x2[20] = _mm_subs_epi16(x1[11], x1[20]); + x2[12] = _mm_adds_epi16(x1[12], x1[19]); + x2[19] = _mm_subs_epi16(x1[12], x1[19]); + x2[13] = _mm_adds_epi16(x1[13], x1[18]); + x2[18] = _mm_subs_epi16(x1[13], x1[18]); + x2[14] = _mm_adds_epi16(x1[14], x1[17]); + x2[17] = _mm_subs_epi16(x1[14], x1[17]); + x2[15] = _mm_adds_epi16(x1[15], x1[16]); + x2[16] = _mm_subs_epi16(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; + + // stage 3 + __m128i x3[64]; + x3[0] = _mm_adds_epi16(x2[0], x2[15]); + x3[15] = _mm_subs_epi16(x2[0], x2[15]); + x3[1] = 
_mm_adds_epi16(x2[1], x2[14]); + x3[14] = _mm_subs_epi16(x2[1], x2[14]); + x3[2] = _mm_adds_epi16(x2[2], x2[13]); + x3[13] = _mm_subs_epi16(x2[2], x2[13]); + x3[3] = _mm_adds_epi16(x2[3], x2[12]); + x3[12] = _mm_subs_epi16(x2[3], x2[12]); + x3[4] = _mm_adds_epi16(x2[4], x2[11]); + x3[11] = _mm_subs_epi16(x2[4], x2[11]); + x3[5] = _mm_adds_epi16(x2[5], x2[10]); + x3[10] = _mm_subs_epi16(x2[5], x2[10]); + x3[6] = _mm_adds_epi16(x2[6], x2[9]); + x3[9] = _mm_subs_epi16(x2[6], x2[9]); + x3[7] = _mm_adds_epi16(x2[7], x2[8]); + x3[8] = _mm_subs_epi16(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm_adds_epi16(x2[32], x2[47]); + x3[47] = _mm_subs_epi16(x2[32], x2[47]); + x3[33] = _mm_adds_epi16(x2[33], x2[46]); + x3[46] = _mm_subs_epi16(x2[33], x2[46]); + x3[34] = _mm_adds_epi16(x2[34], x2[45]); + x3[45] = _mm_subs_epi16(x2[34], x2[45]); + x3[35] = _mm_adds_epi16(x2[35], x2[44]); + x3[44] = _mm_subs_epi16(x2[35], x2[44]); + x3[36] = _mm_adds_epi16(x2[36], x2[43]); + x3[43] = _mm_subs_epi16(x2[36], x2[43]); + x3[37] = _mm_adds_epi16(x2[37], x2[42]); + x3[42] = _mm_subs_epi16(x2[37], x2[42]); + x3[38] = _mm_adds_epi16(x2[38], x2[41]); + x3[41] = _mm_subs_epi16(x2[38], x2[41]); + x3[39] = _mm_adds_epi16(x2[39], x2[40]); + x3[40] = _mm_subs_epi16(x2[39], x2[40]); + x3[48] = _mm_subs_epi16(x2[63], x2[48]); + x3[63] = _mm_adds_epi16(x2[63], x2[48]); + x3[49] = _mm_subs_epi16(x2[62], x2[49]); + x3[62] = _mm_adds_epi16(x2[62], x2[49]); + x3[50] = _mm_subs_epi16(x2[61], x2[50]); + x3[61] = _mm_adds_epi16(x2[61], x2[50]); + x3[51] = _mm_subs_epi16(x2[60], x2[51]); + x3[60] = _mm_adds_epi16(x2[60], x2[51]); + x3[52] = _mm_subs_epi16(x2[59], x2[52]); + x3[59] = _mm_adds_epi16(x2[59], x2[52]); + x3[53] = _mm_subs_epi16(x2[58], x2[53]); + x3[58] = _mm_adds_epi16(x2[58], x2[53]); + x3[54] = _mm_subs_epi16(x2[57], x2[54]); + x3[57] = _mm_adds_epi16(x2[57], x2[54]); + x3[55] = _mm_subs_epi16(x2[56], x2[55]); + x3[56] = _mm_adds_epi16(x2[56], x2[55]); + + // stage 4 + __m128i x4[64]; + x4[0] = _mm_adds_epi16(x3[0], x3[7]); + x4[7] = _mm_subs_epi16(x3[0], x3[7]); + x4[1] = _mm_adds_epi16(x3[1], x3[6]); + x4[6] = _mm_subs_epi16(x3[1], x3[6]); + x4[2] = _mm_adds_epi16(x3[2], x3[5]); + x4[5] = _mm_subs_epi16(x3[2], x3[5]); + x4[3] = _mm_adds_epi16(x3[3], x3[4]); + x4[4] = _mm_subs_epi16(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm_adds_epi16(x3[16], x3[23]); + x4[23] = _mm_subs_epi16(x3[16], x3[23]); + x4[17] = _mm_adds_epi16(x3[17], x3[22]); + x4[22] = _mm_subs_epi16(x3[17], x3[22]); + x4[18] = _mm_adds_epi16(x3[18], x3[21]); + x4[21] = _mm_subs_epi16(x3[18], x3[21]); + x4[19] = _mm_adds_epi16(x3[19], x3[20]); + x4[20] = _mm_subs_epi16(x3[19], x3[20]); + x4[24] = _mm_subs_epi16(x3[31], x3[24]); + x4[31] = _mm_adds_epi16(x3[31], x3[24]); + x4[25] = _mm_subs_epi16(x3[30], x3[25]); + x4[30] = _mm_adds_epi16(x3[30], x3[25]); + x4[26] = _mm_subs_epi16(x3[29], x3[26]); + x4[29] = 
_mm_adds_epi16(x3[29], x3[26]); + x4[27] = _mm_subs_epi16(x3[28], x3[27]); + x4[28] = _mm_adds_epi16(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; + + // stage 5 + __m128i x5[64]; + x5[0] = _mm_adds_epi16(x4[0], x4[3]); + x5[3] = _mm_subs_epi16(x4[0], x4[3]); + x5[1] = _mm_adds_epi16(x4[1], x4[2]); + x5[2] = _mm_subs_epi16(x4[1], x4[2]); + x5[4] = x4[4]; + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]); + x5[7] = x4[7]; + x5[8] = _mm_adds_epi16(x4[8], x4[11]); + x5[11] = _mm_subs_epi16(x4[8], x4[11]); + x5[9] = _mm_adds_epi16(x4[9], x4[10]); + x5[10] = _mm_subs_epi16(x4[9], x4[10]); + x5[12] = _mm_subs_epi16(x4[15], x4[12]); + x5[15] = _mm_adds_epi16(x4[15], x4[12]); + x5[13] = _mm_subs_epi16(x4[14], x4[13]); + x5[14] = _mm_adds_epi16(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]); + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm_adds_epi16(x4[32], x4[39]); + x5[39] = _mm_subs_epi16(x4[32], x4[39]); + x5[33] = _mm_adds_epi16(x4[33], x4[38]); + x5[38] = _mm_subs_epi16(x4[33], x4[38]); + x5[34] = _mm_adds_epi16(x4[34], x4[37]); + x5[37] = _mm_subs_epi16(x4[34], x4[37]); + x5[35] = _mm_adds_epi16(x4[35], x4[36]); + x5[36] = _mm_subs_epi16(x4[35], x4[36]); + x5[40] = _mm_subs_epi16(x4[47], x4[40]); + x5[47] = _mm_adds_epi16(x4[47], x4[40]); + x5[41] = _mm_subs_epi16(x4[46], x4[41]); + x5[46] = _mm_adds_epi16(x4[46], x4[41]); + x5[42] = _mm_subs_epi16(x4[45], x4[42]); + x5[45] = _mm_adds_epi16(x4[45], x4[42]); + x5[43] = _mm_subs_epi16(x4[44], x4[43]); + x5[44] = _mm_adds_epi16(x4[44], x4[43]); + x5[48] = _mm_adds_epi16(x4[48], x4[55]); + x5[55] = _mm_subs_epi16(x4[48], x4[55]); + x5[49] = _mm_adds_epi16(x4[49], x4[54]); + x5[54] = _mm_subs_epi16(x4[49], x4[54]); + x5[50] = _mm_adds_epi16(x4[50], x4[53]); + x5[53] = _mm_subs_epi16(x4[50], x4[53]); + x5[51] = _mm_adds_epi16(x4[51], x4[52]); + x5[52] = _mm_subs_epi16(x4[51], x4[52]); + x5[56] = _mm_subs_epi16(x4[63], x4[56]); + x5[63] = _mm_adds_epi16(x4[63], x4[56]); + x5[57] = _mm_subs_epi16(x4[62], x4[57]); + x5[62] = _mm_adds_epi16(x4[62], x4[57]); + x5[58] = _mm_subs_epi16(x4[61], x4[58]); + x5[61] = _mm_adds_epi16(x4[61], x4[58]); + x5[59] = _mm_subs_epi16(x4[60], x4[59]); + x5[60] = _mm_adds_epi16(x4[60], x4[59]); + + // stage 6 + __m128i x6[64]; + 
btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]); + btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]); + x6[4] = _mm_adds_epi16(x5[4], x5[5]); + x6[5] = _mm_subs_epi16(x5[4], x5[5]); + x6[6] = _mm_subs_epi16(x5[7], x5[6]); + x6[7] = _mm_adds_epi16(x5[7], x5[6]); + x6[8] = x5[8]; + btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]); + btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm_adds_epi16(x5[16], x5[19]); + x6[19] = _mm_subs_epi16(x5[16], x5[19]); + x6[17] = _mm_adds_epi16(x5[17], x5[18]); + x6[18] = _mm_subs_epi16(x5[17], x5[18]); + x6[20] = _mm_subs_epi16(x5[23], x5[20]); + x6[23] = _mm_adds_epi16(x5[23], x5[20]); + x6[21] = _mm_subs_epi16(x5[22], x5[21]); + x6[22] = _mm_adds_epi16(x5[22], x5[21]); + x6[24] = _mm_adds_epi16(x5[24], x5[27]); + x6[27] = _mm_subs_epi16(x5[24], x5[27]); + x6[25] = _mm_adds_epi16(x5[25], x5[26]); + x6[26] = _mm_subs_epi16(x5[25], x5[26]); + x6[28] = _mm_subs_epi16(x5[31], x5[28]); + x6[31] = _mm_adds_epi16(x5[31], x5[28]); + x6[29] = _mm_subs_epi16(x5[30], x5[29]); + x6[30] = _mm_adds_epi16(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]); + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]); + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; + + // stage 7 + __m128i x7[64]; + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]); + btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]); + x7[8] = _mm_adds_epi16(x6[8], x6[9]); + x7[9] = _mm_subs_epi16(x6[8], x6[9]); + x7[10] = _mm_subs_epi16(x6[11], x6[10]); + x7[11] = _mm_adds_epi16(x6[11], x6[10]); + x7[12] = _mm_adds_epi16(x6[12], x6[13]); + x7[13] = _mm_subs_epi16(x6[12], x6[13]); + x7[14] = _mm_subs_epi16(x6[15], x6[14]); + x7[15] = _mm_adds_epi16(x6[15], x6[14]); + x7[16] = x6[16]; + btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]); + btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]); + btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm_adds_epi16(x6[32], x6[35]); + x7[35] = _mm_subs_epi16(x6[32], x6[35]); + x7[33] = _mm_adds_epi16(x6[33], x6[34]); + x7[34] = _mm_subs_epi16(x6[33], x6[34]); + x7[36] = _mm_subs_epi16(x6[39], x6[36]); + x7[39] = _mm_adds_epi16(x6[39], x6[36]); + x7[37] = _mm_subs_epi16(x6[38], x6[37]); + x7[38] = _mm_adds_epi16(x6[38], x6[37]); + x7[40] = 
_mm_adds_epi16(x6[40], x6[43]); + x7[43] = _mm_subs_epi16(x6[40], x6[43]); + x7[41] = _mm_adds_epi16(x6[41], x6[42]); + x7[42] = _mm_subs_epi16(x6[41], x6[42]); + x7[44] = _mm_subs_epi16(x6[47], x6[44]); + x7[47] = _mm_adds_epi16(x6[47], x6[44]); + x7[45] = _mm_subs_epi16(x6[46], x6[45]); + x7[46] = _mm_adds_epi16(x6[46], x6[45]); + x7[48] = _mm_adds_epi16(x6[48], x6[51]); + x7[51] = _mm_subs_epi16(x6[48], x6[51]); + x7[49] = _mm_adds_epi16(x6[49], x6[50]); + x7[50] = _mm_subs_epi16(x6[49], x6[50]); + x7[52] = _mm_subs_epi16(x6[55], x6[52]); + x7[55] = _mm_adds_epi16(x6[55], x6[52]); + x7[53] = _mm_subs_epi16(x6[54], x6[53]); + x7[54] = _mm_adds_epi16(x6[54], x6[53]); + x7[56] = _mm_adds_epi16(x6[56], x6[59]); + x7[59] = _mm_subs_epi16(x6[56], x6[59]); + x7[57] = _mm_adds_epi16(x6[57], x6[58]); + x7[58] = _mm_subs_epi16(x6[57], x6[58]); + x7[60] = _mm_subs_epi16(x6[63], x6[60]); + x7[63] = _mm_adds_epi16(x6[63], x6[60]); + x7[61] = _mm_subs_epi16(x6[62], x6[61]); + x7[62] = _mm_adds_epi16(x6[62], x6[61]); + + // stage 8 + __m128i x8[64]; + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]); + btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]); + btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]); + btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]); + x8[16] = _mm_adds_epi16(x7[16], x7[17]); + x8[17] = _mm_subs_epi16(x7[16], x7[17]); + x8[18] = _mm_subs_epi16(x7[19], x7[18]); + x8[19] = _mm_adds_epi16(x7[19], x7[18]); + x8[20] = _mm_adds_epi16(x7[20], x7[21]); + x8[21] = _mm_subs_epi16(x7[20], x7[21]); + x8[22] = _mm_subs_epi16(x7[23], x7[22]); + x8[23] = _mm_adds_epi16(x7[23], x7[22]); + x8[24] = _mm_adds_epi16(x7[24], x7[25]); + x8[25] = _mm_subs_epi16(x7[24], x7[25]); + x8[26] = _mm_subs_epi16(x7[27], x7[26]); + x8[27] = _mm_adds_epi16(x7[27], x7[26]); + x8[28] = _mm_adds_epi16(x7[28], x7[29]); + x8[29] = _mm_subs_epi16(x7[28], x7[29]); + x8[30] = _mm_subs_epi16(x7[31], x7[30]); + x8[31] = _mm_adds_epi16(x7[31], x7[30]); + x8[32] = x7[32]; + btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]); + btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]); + btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]); + btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]); + btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; + + // stage 9 + __m128i x9[64]; + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]); + btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], 
x8[30], x9[17], x9[30]); + btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]); + btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]); + btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]); + btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]); + btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]); + btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]); + x9[32] = _mm_adds_epi16(x8[32], x8[33]); + x9[33] = _mm_subs_epi16(x8[32], x8[33]); + x9[34] = _mm_subs_epi16(x8[35], x8[34]); + x9[35] = _mm_adds_epi16(x8[35], x8[34]); + x9[36] = _mm_adds_epi16(x8[36], x8[37]); + x9[37] = _mm_subs_epi16(x8[36], x8[37]); + x9[38] = _mm_subs_epi16(x8[39], x8[38]); + x9[39] = _mm_adds_epi16(x8[39], x8[38]); + x9[40] = _mm_adds_epi16(x8[40], x8[41]); + x9[41] = _mm_subs_epi16(x8[40], x8[41]); + x9[42] = _mm_subs_epi16(x8[43], x8[42]); + x9[43] = _mm_adds_epi16(x8[43], x8[42]); + x9[44] = _mm_adds_epi16(x8[44], x8[45]); + x9[45] = _mm_subs_epi16(x8[44], x8[45]); + x9[46] = _mm_subs_epi16(x8[47], x8[46]); + x9[47] = _mm_adds_epi16(x8[47], x8[46]); + x9[48] = _mm_adds_epi16(x8[48], x8[49]); + x9[49] = _mm_subs_epi16(x8[48], x8[49]); + x9[50] = _mm_subs_epi16(x8[51], x8[50]); + x9[51] = _mm_adds_epi16(x8[51], x8[50]); + x9[52] = _mm_adds_epi16(x8[52], x8[53]); + x9[53] = _mm_subs_epi16(x8[52], x8[53]); + x9[54] = _mm_subs_epi16(x8[55], x8[54]); + x9[55] = _mm_adds_epi16(x8[55], x8[54]); + x9[56] = _mm_adds_epi16(x8[56], x8[57]); + x9[57] = _mm_subs_epi16(x8[56], x8[57]); + x9[58] = _mm_subs_epi16(x8[59], x8[58]); + x9[59] = _mm_adds_epi16(x8[59], x8[58]); + x9[60] = _mm_adds_epi16(x8[60], x8[61]); + x9[61] = _mm_subs_epi16(x8[60], x8[61]); + x9[62] = _mm_subs_epi16(x8[63], x8[62]); + x9[63] = _mm_adds_epi16(x8[63], x8[62]); + + // stage 10 + __m128i x10[64]; + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]); + btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]); + btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]); + btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]); + btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]); + btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]); + btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]); + btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]); + btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]); + btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]); + btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]); + btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]); + btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]); + 
btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]); + btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]); + btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]); + + // stage 11 + output[0] = x10[0]; + output[1] = x10[32]; + output[2] = x10[16]; + output[3] = x10[48]; + output[4] = x10[8]; + output[5] = x10[40]; + output[6] = x10[24]; + output[7] = x10[56]; + output[8] = x10[4]; + output[9] = x10[36]; + output[10] = x10[20]; + output[11] = x10[52]; + output[12] = x10[12]; + output[13] = x10[44]; + output[14] = x10[28]; + output[15] = x10[60]; + output[16] = x10[2]; + output[17] = x10[34]; + output[18] = x10[18]; + output[19] = x10[50]; + output[20] = x10[10]; + output[21] = x10[42]; + output[22] = x10[26]; + output[23] = x10[58]; + output[24] = x10[6]; + output[25] = x10[38]; + output[26] = x10[22]; + output[27] = x10[54]; + output[28] = x10[14]; + output[29] = x10[46]; + output[30] = x10[30]; + output[31] = x10[62]; + output[32] = x10[1]; + output[33] = x10[33]; + output[34] = x10[17]; + output[35] = x10[49]; + output[36] = x10[9]; + output[37] = x10[41]; + output[38] = x10[25]; + output[39] = x10[57]; + output[40] = x10[5]; + output[41] = x10[37]; + output[42] = x10[21]; + output[43] = x10[53]; + output[44] = x10[13]; + output[45] = x10[45]; + output[46] = x10[29]; + output[47] = x10[61]; + output[48] = x10[3]; + output[49] = x10[35]; + output[50] = x10[19]; + output[51] = x10[51]; + output[52] = x10[11]; + output[53] = x10[43]; + output[54] = x10[27]; + output[55] = x10[59]; + output[56] = x10[7]; + output[57] = x10[39]; + output[58] = x10[23]; + output[59] = x10[55]; + output[60] = x10[15]; + output[61] = x10[47]; + output[62] = x10[31]; + output[63] = x10[63]; +} + +static void fadst4x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_set1_epi16(0); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u[8], v[8]; + + u[0] = _mm_unpacklo_epi16(input[0], input[1]); + u[1] = _mm_unpacklo_epi16(input[2], input[3]); + u[2] = _mm_unpacklo_epi16(in7, __zero); + u[3] = _mm_unpacklo_epi16(input[2], __zero); + u[4] = _mm_unpacklo_epi16(input[3], __zero); + + v[0] = _mm_madd_epi16(u[0], sinpi_p01_p02); // s0 + s2 + v[1] = _mm_madd_epi16(u[1], sinpi_p03_p04); // s4 + s5 + v[2] = _mm_madd_epi16(u[2], sinpi_p03_p03); // x1 + v[3] = _mm_madd_epi16(u[0], sinpi_p04_m01); // s1 - s3 + v[4] = _mm_madd_epi16(u[1], sinpi_m03_p02); // -s4 + s6 + v[5] = _mm_madd_epi16(u[3], sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], sinpi_p03_p03); + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_sub_epi32(v[2], v[6]); + u[2] = _mm_add_epi32(v[3], v[4]); + u[3] = _mm_sub_epi32(u[2], u[0]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_sub_epi32(u[4], v[5]); + u[6] = _mm_add_epi32(u[3], u[5]); + + v[0] = _mm_add_epi32(u[0], __rounding); + v[1] = _mm_add_epi32(u[1], __rounding); + v[2] = _mm_add_epi32(u[2], __rounding); + v[3] = _mm_add_epi32(u[6], __rounding); + + u[0] = _mm_srai_epi32(v[0], cos_bit); + u[1] = _mm_srai_epi32(v[1], 
cos_bit); + u[2] = _mm_srai_epi32(v[2], cos_bit); + u[3] = _mm_srai_epi32(v[3], cos_bit); + + output[0] = _mm_packs_epi32(u[0], u[2]); + output[1] = _mm_packs_epi32(u[1], u[3]); + output[2] = _mm_srli_si128(output[0], 8); + output[3] = _mm_srli_si128(output[1], 8); +} + +static void fadst4x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2], + &x1[3], &x2[2], &x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6], + &x1[7], &x2[6], &x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4], + &x3[5], &x4[4], &x4[5]); + btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6], + &x3[7], &x4[6], &x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m128i x6[8]; + btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0], + &x5[1], &x6[0], &x6[1]); + btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2], + &x5[3], &x6[2], &x6[3]); + btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4], + &x5[5], &x6[4], &x6[5]); + btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6], + &x5[7], &x6[6], &x6[7]); + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = 
x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static void fadst8x4_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *sinpi = sinpi_arr(cos_bit); + const __m128i sinpi_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]); + const __m128i sinpi_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]); + const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]); + const __m128i sinpi_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]); + const __m128i sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]); + const __m128i __zero = _mm_set1_epi16(0); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + const __m128i in7 = _mm_add_epi16(input[0], input[1]); + __m128i u_lo[8], u_hi[8], v_lo[8], v_hi[8]; + + u_lo[0] = _mm_unpacklo_epi16(input[0], input[1]); + u_hi[0] = _mm_unpackhi_epi16(input[0], input[1]); + u_lo[1] = _mm_unpacklo_epi16(input[2], input[3]); + u_hi[1] = _mm_unpackhi_epi16(input[2], input[3]); + u_lo[2] = _mm_unpacklo_epi16(in7, __zero); + u_hi[2] = _mm_unpackhi_epi16(in7, __zero); + u_lo[3] = _mm_unpacklo_epi16(input[2], __zero); + u_hi[3] = _mm_unpackhi_epi16(input[2], __zero); + u_lo[4] = _mm_unpacklo_epi16(input[3], __zero); + u_hi[4] = _mm_unpackhi_epi16(input[3], __zero); + + v_lo[0] = _mm_madd_epi16(u_lo[0], sinpi_p01_p02); // s0 + s2 + v_hi[0] = _mm_madd_epi16(u_hi[0], sinpi_p01_p02); // s0 + s2 + v_lo[1] = _mm_madd_epi16(u_lo[1], sinpi_p03_p04); // s4 + s5 + v_hi[1] = _mm_madd_epi16(u_hi[1], sinpi_p03_p04); // s4 + s5 + v_lo[2] = _mm_madd_epi16(u_lo[2], sinpi_p03_p03); // x1 + v_hi[2] = _mm_madd_epi16(u_hi[2], sinpi_p03_p03); // x1 + v_lo[3] = _mm_madd_epi16(u_lo[0], sinpi_p04_m01); // s1 - s3 + v_hi[3] = _mm_madd_epi16(u_hi[0], sinpi_p04_m01); // s1 - s3 + v_lo[4] = _mm_madd_epi16(u_lo[1], sinpi_m03_p02); // -s4 + s6 + v_hi[4] = _mm_madd_epi16(u_hi[1], sinpi_m03_p02); // -s4 + s6 + v_lo[5] = _mm_madd_epi16(u_lo[3], sinpi_p03_p03); // s4 + v_hi[5] = _mm_madd_epi16(u_hi[3], sinpi_p03_p03); // s4 + v_lo[6] = _mm_madd_epi16(u_lo[4], sinpi_p03_p03); + v_hi[6] = _mm_madd_epi16(u_hi[4], sinpi_p03_p03); + + u_lo[0] = _mm_add_epi32(v_lo[0], v_lo[1]); + u_hi[0] = _mm_add_epi32(v_hi[0], v_hi[1]); + u_lo[1] = _mm_sub_epi32(v_lo[2], v_lo[6]); + u_hi[1] = _mm_sub_epi32(v_hi[2], v_hi[6]); + u_lo[2] = _mm_add_epi32(v_lo[3], v_lo[4]); + u_hi[2] = _mm_add_epi32(v_hi[3], v_hi[4]); + u_lo[3] = _mm_sub_epi32(u_lo[2], u_lo[0]); + u_hi[3] = _mm_sub_epi32(u_hi[2], u_hi[0]); + u_lo[4] = _mm_slli_epi32(v_lo[5], 2); + u_hi[4] = _mm_slli_epi32(v_hi[5], 2); + u_lo[5] = _mm_sub_epi32(u_lo[4], v_lo[5]); + u_hi[5] = _mm_sub_epi32(u_hi[4], v_hi[5]); + u_lo[6] = _mm_add_epi32(u_lo[3], u_lo[5]); + u_hi[6] = _mm_add_epi32(u_hi[3], u_hi[5]); + + v_lo[0] = _mm_add_epi32(u_lo[0], __rounding); + v_hi[0] = _mm_add_epi32(u_hi[0], __rounding); + v_lo[1] = _mm_add_epi32(u_lo[1], __rounding); + v_hi[1] = _mm_add_epi32(u_hi[1], __rounding); + v_lo[2] = _mm_add_epi32(u_lo[2], __rounding); + v_hi[2] = _mm_add_epi32(u_hi[2], __rounding); + v_lo[3] = _mm_add_epi32(u_lo[6], __rounding); + v_hi[3] = _mm_add_epi32(u_hi[6], __rounding); + + u_lo[0] = _mm_srai_epi32(v_lo[0], cos_bit); + u_hi[0] = _mm_srai_epi32(v_hi[0], cos_bit); + u_lo[1] = _mm_srai_epi32(v_lo[1], cos_bit); + u_hi[1] = _mm_srai_epi32(v_hi[1], cos_bit); + u_lo[2] = _mm_srai_epi32(v_lo[2], cos_bit); + u_hi[2] = _mm_srai_epi32(v_hi[2], cos_bit); + u_lo[3] = _mm_srai_epi32(v_lo[3], cos_bit); + u_hi[3] = _mm_srai_epi32(v_hi[3], cos_bit); + + output[0] = _mm_packs_epi32(u_lo[0], u_hi[0]); + output[1] = _mm_packs_epi32(u_lo[1], 
u_hi[1]); + output[2] = _mm_packs_epi32(u_lo[2], u_hi[2]); + output[3] = _mm_packs_epi32(u_lo[3], u_hi[3]); +} + +static void fadst8x8_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]); + __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]); + __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]); + __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]); + __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]); + __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]); + __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]); + __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m128i x1[8]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[7]); + x1[2] = _mm_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm_subs_epi16(__zero, input[5]); + + // stage 2 + __m128i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + + // stage 3 + __m128i x3[8]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m128i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + + // stage 5 + __m128i x5[8]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m128i x6[8]; + btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x5[0], x5[1], x6[0], x6[1]); + btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x5[2], x5[3], x6[2], x6[3]); + btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x5[4], x5[5], x6[4], x6[5]); + btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x5[6], x5[7], x6[6], x6[7]); + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static void fadst8x16_new_sse2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m128i __zero = _mm_setzero_si128(); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = 
pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + // stage 1 + __m128i x1[16]; + x1[0] = input[0]; + x1[1] = _mm_subs_epi16(__zero, input[15]); + x1[2] = _mm_subs_epi16(__zero, input[7]); + x1[3] = input[8]; + x1[4] = _mm_subs_epi16(__zero, input[3]); + x1[5] = input[12]; + x1[6] = input[4]; + x1[7] = _mm_subs_epi16(__zero, input[11]); + x1[8] = _mm_subs_epi16(__zero, input[1]); + x1[9] = input[14]; + x1[10] = input[6]; + x1[11] = _mm_subs_epi16(__zero, input[9]); + x1[12] = input[2]; + x1[13] = _mm_subs_epi16(__zero, input[13]); + x1[14] = _mm_subs_epi16(__zero, input[5]); + x1[15] = input[10]; + + // stage 2 + __m128i x2[16]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]); + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]); + x2[8] = x1[8]; + x2[9] = x1[9]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]); + x2[12] = x1[12]; + x2[13] = x1[13]; + btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]); + + // stage 3 + __m128i x3[16]; + x3[0] = _mm_adds_epi16(x2[0], x2[2]); + x3[2] = _mm_subs_epi16(x2[0], x2[2]); + x3[1] = _mm_adds_epi16(x2[1], x2[3]); + x3[3] = _mm_subs_epi16(x2[1], x2[3]); + x3[4] = _mm_adds_epi16(x2[4], x2[6]); + x3[6] = _mm_subs_epi16(x2[4], x2[6]); + x3[5] = _mm_adds_epi16(x2[5], x2[7]); + x3[7] = _mm_subs_epi16(x2[5], x2[7]); + x3[8] = _mm_adds_epi16(x2[8], x2[10]); + x3[10] = _mm_subs_epi16(x2[8], x2[10]); + x3[9] = _mm_adds_epi16(x2[9], x2[11]); + x3[11] = _mm_subs_epi16(x2[9], x2[11]); + x3[12] = _mm_adds_epi16(x2[12], x2[14]); + x3[14] = _mm_subs_epi16(x2[12], x2[14]); + x3[13] = _mm_adds_epi16(x2[13], x2[15]); + x3[15] = _mm_subs_epi16(x2[13], x2[15]); + + // stage 4 + __m128i x4[16]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]); + 
btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]); + x4[8] = x3[8]; + x4[9] = x3[9]; + x4[10] = x3[10]; + x4[11] = x3[11]; + btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]); + btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]); + + // stage 5 + __m128i x5[16]; + x5[0] = _mm_adds_epi16(x4[0], x4[4]); + x5[4] = _mm_subs_epi16(x4[0], x4[4]); + x5[1] = _mm_adds_epi16(x4[1], x4[5]); + x5[5] = _mm_subs_epi16(x4[1], x4[5]); + x5[2] = _mm_adds_epi16(x4[2], x4[6]); + x5[6] = _mm_subs_epi16(x4[2], x4[6]); + x5[3] = _mm_adds_epi16(x4[3], x4[7]); + x5[7] = _mm_subs_epi16(x4[3], x4[7]); + x5[8] = _mm_adds_epi16(x4[8], x4[12]); + x5[12] = _mm_subs_epi16(x4[8], x4[12]); + x5[9] = _mm_adds_epi16(x4[9], x4[13]); + x5[13] = _mm_subs_epi16(x4[9], x4[13]); + x5[10] = _mm_adds_epi16(x4[10], x4[14]); + x5[14] = _mm_subs_epi16(x4[10], x4[14]); + x5[11] = _mm_adds_epi16(x4[11], x4[15]); + x5[15] = _mm_subs_epi16(x4[11], x4[15]); + + // stage 6 + __m128i x6[16]; + x6[0] = x5[0]; + x6[1] = x5[1]; + x6[2] = x5[2]; + x6[3] = x5[3]; + x6[4] = x5[4]; + x6[5] = x5[5]; + x6[6] = x5[6]; + x6[7] = x5[7]; + btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]); + btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]); + btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]); + btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]); + + // stage 7 + __m128i x7[16]; + x7[0] = _mm_adds_epi16(x6[0], x6[8]); + x7[8] = _mm_subs_epi16(x6[0], x6[8]); + x7[1] = _mm_adds_epi16(x6[1], x6[9]); + x7[9] = _mm_subs_epi16(x6[1], x6[9]); + x7[2] = _mm_adds_epi16(x6[2], x6[10]); + x7[10] = _mm_subs_epi16(x6[2], x6[10]); + x7[3] = _mm_adds_epi16(x6[3], x6[11]); + x7[11] = _mm_subs_epi16(x6[3], x6[11]); + x7[4] = _mm_adds_epi16(x6[4], x6[12]); + x7[12] = _mm_subs_epi16(x6[4], x6[12]); + x7[5] = _mm_adds_epi16(x6[5], x6[13]); + x7[13] = _mm_subs_epi16(x6[5], x6[13]); + x7[6] = _mm_adds_epi16(x6[6], x6[14]); + x7[14] = _mm_subs_epi16(x6[6], x6[14]); + x7[7] = _mm_adds_epi16(x6[7], x6[15]); + x7[15] = _mm_subs_epi16(x6[7], x6[15]); + + // stage 8 + __m128i x8[16]; + btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]); + btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]); + btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]); + btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]); + btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]); + btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]); + btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]); + btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]); + + // stage 9 + output[0] = x8[1]; + output[1] = x8[14]; + output[2] = x8[3]; + output[3] = x8[12]; + output[4] = x8[5]; + output[5] = x8[10]; + output[6] = x8[7]; + output[7] = x8[8]; + output[8] = x8[9]; + output[9] = x8[6]; + output[10] = x8[11]; + output[11] = x8[4]; + output[12] = x8[13]; + output[13] = x8[2]; + output[14] = x8[15]; + output[15] = x8[0]; +} + +static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fadst4x4_new_sse2, // ADST_DCT + fdct4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fadst4x4_new_sse2, // FLIPADST_DCT + fdct4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + 
fidentity4x4_new_sse2, // IDTX + fdct4x4_new_sse2, // V_DCT + fidentity4x4_new_sse2, // H_DCT + fadst4x4_new_sse2, // V_ADST + fidentity4x4_new_sse2, // H_ADST + fadst4x4_new_sse2, // V_FLIPADST + fidentity4x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = { + fdct4x4_new_sse2, // DCT_DCT + fdct4x4_new_sse2, // ADST_DCT + fadst4x4_new_sse2, // DCT_ADST + fadst4x4_new_sse2, // ADST_ADST + fdct4x4_new_sse2, // FLIPADST_DCT + fadst4x4_new_sse2, // DCT_FLIPADST + fadst4x4_new_sse2, // FLIPADST_FLIPADST + fadst4x4_new_sse2, // ADST_FLIPADST + fadst4x4_new_sse2, // FLIPADST_ADST + fidentity4x4_new_sse2, // IDTX + fidentity4x4_new_sse2, // V_DCT + fdct4x4_new_sse2, // H_DCT + fidentity4x4_new_sse2, // V_ADST + fadst4x4_new_sse2, // H_ADST + fidentity4x4_new_sse2, // V_FLIPADST + fadst4x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fadst4x8_new_sse2, // ADST_DCT + fdct4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fadst4x8_new_sse2, // FLIPADST_DCT + fdct4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct4x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst4x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst4x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fdct8x4_new_sse2, // ADST_DCT + fadst8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fdct8x4_new_sse2, // FLIPADST_DCT + fadst8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fidentity8x4_new_sse2, // V_DCT + fdct8x4_new_sse2, // H_DCT + fidentity8x4_new_sse2, // V_ADST + fadst8x4_new_sse2, // H_ADST + fidentity8x4_new_sse2, // V_FLIPADST + fadst8x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = { + fdct8x4_new_sse2, // DCT_DCT + fadst8x4_new_sse2, // ADST_DCT + fdct8x4_new_sse2, // DCT_ADST + fadst8x4_new_sse2, // ADST_ADST + fadst8x4_new_sse2, // FLIPADST_DCT + fdct8x4_new_sse2, // DCT_FLIPADST + fadst8x4_new_sse2, // FLIPADST_FLIPADST + fadst8x4_new_sse2, // ADST_FLIPADST + fadst8x4_new_sse2, // FLIPADST_ADST + fidentity8x4_new_sse2, // IDTX + fdct8x4_new_sse2, // V_DCT + fidentity8x4_new_sse2, // H_DCT + fadst8x4_new_sse2, // V_ADST + fidentity8x4_new_sse2, // H_ADST + fadst8x4_new_sse2, // V_FLIPADST + fidentity8x4_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = { + fdct4x8_new_sse2, // DCT_DCT + fdct4x8_new_sse2, // ADST_DCT + fadst4x8_new_sse2, // DCT_ADST + fadst4x8_new_sse2, // ADST_ADST + fdct4x8_new_sse2, // FLIPADST_DCT + fadst4x8_new_sse2, // DCT_FLIPADST + fadst4x8_new_sse2, // FLIPADST_FLIPADST + fadst4x8_new_sse2, // ADST_FLIPADST + fadst4x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct4x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst4x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst4x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fadst8x8_new_sse2, // ADST_DCT + fdct8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + 
fadst8x8_new_sse2, // FLIPADST_DCT + fdct8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fdct8x8_new_sse2, // V_DCT + fidentity8x8_new_sse2, // H_DCT + fadst8x8_new_sse2, // V_ADST + fidentity8x8_new_sse2, // H_ADST + fadst8x8_new_sse2, // V_FLIPADST + fidentity8x8_new_sse2, // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = { + fdct8x8_new_sse2, // DCT_DCT + fdct8x8_new_sse2, // ADST_DCT + fadst8x8_new_sse2, // DCT_ADST + fadst8x8_new_sse2, // ADST_ADST + fdct8x8_new_sse2, // FLIPADST_DCT + fadst8x8_new_sse2, // DCT_FLIPADST + fadst8x8_new_sse2, // FLIPADST_FLIPADST + fadst8x8_new_sse2, // ADST_FLIPADST + fadst8x8_new_sse2, // FLIPADST_ADST + fidentity8x8_new_sse2, // IDTX + fidentity8x8_new_sse2, // V_DCT + fdct8x8_new_sse2, // H_DCT + fidentity8x8_new_sse2, // V_ADST + fadst8x8_new_sse2, // H_ADST + fidentity8x8_new_sse2, // V_FLIPADST + fadst8x8_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_sse2, // DCT_DCT + fadst8x16_new_sse2, // ADST_DCT + fdct8x16_new_sse2, // DCT_ADST + fadst8x16_new_sse2, // ADST_ADST + fadst8x16_new_sse2, // FLIPADST_DCT + fdct8x16_new_sse2, // DCT_FLIPADST + fadst8x16_new_sse2, // FLIPADST_FLIPADST + fadst8x16_new_sse2, // ADST_FLIPADST + fadst8x16_new_sse2, // FLIPADST_ADST + fidentity8x16_new_sse2, // IDTX + fdct8x16_new_sse2, // V_DCT + fidentity8x16_new_sse2, // H_DCT + fadst8x16_new_sse2, // V_ADST + fidentity8x16_new_sse2, // H_ADST + fadst8x16_new_sse2, // V_FLIPADST + fidentity8x16_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_sse2, // DCT_DCT + fdct8x16_new_sse2, // ADST_DCT + fadst8x16_new_sse2, // DCT_ADST + fadst8x16_new_sse2, // ADST_ADST + fdct8x16_new_sse2, // FLIPADST_DCT + fadst8x16_new_sse2, // DCT_FLIPADST + fadst8x16_new_sse2, // FLIPADST_FLIPADST + fadst8x16_new_sse2, // ADST_FLIPADST + fadst8x16_new_sse2, // FLIPADST_ADST + fidentity8x16_new_sse2, // IDTX + fidentity8x16_new_sse2, // V_DCT + fdct8x16_new_sse2, // H_DCT + fidentity8x16_new_sse2, // V_ADST + fadst8x16_new_sse2, // H_ADST + fidentity8x16_new_sse2, // V_FLIPADST + fadst8x16_new_sse2 // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = { + av1_fdct8x32_new_sse2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + fidentity8x32_new_sse2, // IDTX + fidentity8x32_new_sse2, // V_DCT + av1_fdct8x32_new_sse2, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[4], buf1[4], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4]; + const int txw_idx = get_txw_idx(TX_4X4); + const int txh_idx = get_txh_idx(TX_4X4); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm4x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, 
stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x4(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_4x4(buf, buf); + store_buffer_16bit_to_32bit_w4(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)stride; + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8]; + const int txw_idx = get_txw_idx(TX_4X8); + const int txh_idx = get_txh_idx(TX_4X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x4(buf, buf); + store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16]; + const int txw_idx = get_txw_idx(TX_4X16); + const int txh_idx = get_txh_idx(TX_4X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 4; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit_w4(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_4x8(buf0, buf1); + transpose_16bit_4x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + 8 * i, buf, width); + } else { + buf = buf1 + 8 * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x4(buf, buf); + store_buffer_16bit_to_32bit_w4(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4]; + const int txw_idx = get_txw_idx(TX_8X4); + const int txh_idx = get_txh_idx(TX_8X4); + const int 
cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm4x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[8], buf1[8], *buf; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + else + load_buffer_16bit_to_16bit(input, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output, width, height); +} + +void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + 
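// The store_rect_* helpers fold in the extra sqrt(2) scaling (multiply by + // NewSqrt2, round-shift by NewSqrt2Bits) that AV1 applies to 2:1 + // rectangular transform sizes such as this 8x16. + 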
store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32]; + const int txw_idx = get_txw_idx(TX_8X32); + const int txh_idx = get_txh_idx(TX_8X32); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + transpose_16bit_8x8(buf0 + 16, buf1 + 16); + transpose_16bit_8x8(buf0 + 24, buf1 + 24); + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + } +} + +void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4]; + const int txw_idx = get_txw_idx(TX_16X4); + const int txh_idx = get_txh_idx(TX_16X4); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 4; + const transform_1d_sse2 col_txfm = col_txfm8x4_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x4(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_4x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_4x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + +void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 8; + const transform_1d_sse2 col_txfm = 
col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 8 * i); + } + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + +void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + 8); + } +} + +void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32]; + const int txw_idx = get_txw_idx(TX_16X32); + const int txh_idx = get_txh_idx(TX_16X32); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x16_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + 
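// 32-point column kernels exist only for DCT and identity (see + // col_txfm8x32_arr); other tx_types take the av1_fwd_txfm2d_16x32_c + // fallback below. + 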
col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, + width, 8); + } + } else { + av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8]; + const int txw_idx = get_txw_idx(TX_32X8); + const int txh_idx = get_txh_idx(TX_32X8); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 8; + const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + } + + for (int i = 0; i < 1; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + height); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + height); + transpose_16bit_8x8(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, height); + transpose_16bit_8x8(buf + 24, buf + 24); + store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, height); + } + } else { + av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; 
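+ // get_flip_cfg() reports whether this tx_type needs the input loaded + // bottom-up (ud_flip) or the transposed rows mirrored via flip_buf_sse2 + // (lr_flip) to realize the FLIPADST variants.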
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i); + } + + for (int i = 0; i < 2; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, + 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, + width, 8); + transpose_16bit_8x8(buf + 16, buf + 16); + store_rect_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, 8); + transpose_16bit_8x8(buf + 24, buf + 24); + store_rect_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, 8); + } + } else { + av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[32], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X32]; + const int txw_idx = get_txw_idx(TX_32X32); + const int txh_idx = get_txh_idx(TX_32X32); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 32; + const int height = 32; + const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type]; + + if (col_txfm != NULL && row_txfm != NULL) { + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 4; i++) { + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0 + 0 * 8, buf1 + 0 * width + 8 * i); + transpose_16bit_8x8(buf0 + 1 * 8, buf1 + 1 * width + 8 * i); + transpose_16bit_8x8(buf0 + 2 * 8, buf1 + 2 * width + 8 * i); + transpose_16bit_8x8(buf0 + 3 * 8, buf1 + 3 * width + 8 * i); + } + + for (int i = 0; i < 4; i++) { + __m128i *buf; + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1 + width * i, buf, width); + } else { + buf = buf1 + width * i; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_buffer_16bit_to_32bit_w8(buf, output + 8 * width * i, width, 8); + transpose_16bit_8x8(buf + 8, buf + 8); + store_buffer_16bit_to_32bit_w8(buf + 8, output + 8 * width * i + 8, width, + 8); + transpose_16bit_8x8(buf + 16, buf + 16); + store_buffer_16bit_to_32bit_w8(buf + 16, output + 8 * width * i + 16, + width, 8); + transpose_16bit_8x8(buf + 24, buf + 24); + store_buffer_16bit_to_32bit_w8(buf + 24, output + 8 * width * i + 24, + width, 8); + } + } else { + av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd); + } +} + +void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output, + 
int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X16; + __m128i buf0[64], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = fdct8x16_new_sse2; + const transform_1d_sse2 row_txfm = av1_fdct8x64_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < height_div8; i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + int32_t *output8 = output + 8 * 32 * i; + for (int j = 0; j < 4; ++j) { + __m128i *buf8 = buf + 8 * j; + transpose_16bit_8x8(buf8, buf8); + store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, 32, 8); + } + } +} + +void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_16X64; + __m128i buf0[64], buf1[128]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_sse2 col_txfm = av1_fdct8x64_new_sse2; + const transform_1d_sse2 row_txfm = fdct8x16_new_sse2; + const int width_div8 = (width >> 3); + const int height_div8 = (height >> 3); + + for (int i = 0; i < width_div8; i++) { + load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height); + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + for (int j = 0; j < height_div8; ++j) { + transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i); + } + } + + for (int i = 0; i < AOMMIN(4, height_div8); i++) { + __m128i *buf = buf1 + width * i; + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + int32_t *output8 = output + 8 * width * i; + for (int j = 0; j < width_div8; ++j) { + __m128i *buf8 = buf + 8 * j; + transpose_16bit_8x8(buf8, buf8); + store_buffer_16bit_to_32bit_w8(buf8, output8 + 8 * j, width, 8); + } + } + // Zero out the bottom 16x32 area. 
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output)); +} + +static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { + av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform + av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform + av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform + av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform + NULL, // 64x64 transform + av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform + av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform + av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform + av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform + av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform + NULL, // 32x64 transform + NULL, // 64x32 transform + av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform + av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform + av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform + av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform + av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform + av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform +}; + +void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param) { + FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size]; + + if ((fwd_txfm2d_func == NULL) || + (txfm_param->lossless && txfm_param->tx_size == TX_4X4)) + av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); + else + fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, + txfm_param->bd); +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.h b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.h new file mode 100644 index 000000000..a0e32f538 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_fwd_txfm_sse2.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct8x32_new_sse2(const __m128i *input, __m128i *output,
+                           int8_t cos_bit);
+void av1_fdct8x64_new_sse2(const __m128i *input, __m128i *output,
+                           int8_t cos_bit);
+
+static INLINE void fidentity4x4_new_sse2(const __m128i *const input,
+                                         __m128i *const output,
+                                         const int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 4; ++i) {
+    const __m128i a = _mm_unpacklo_epi16(input[i], one);
+    const __m128i b = scale_round_sse2(a, NewSqrt2);
+    output[i] = _mm_packs_epi32(b, b);
+  }
+}
+
+static INLINE void fidentity8x4_new_sse2(const __m128i *const input,
+                                         __m128i *const output,
+                                         const int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 4; ++i) {
+    const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+    const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+    const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
+    const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
+    output[i] = _mm_packs_epi32(b_lo, b_hi);
+  }
+}
+
+static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output,
+                                         int8_t cos_bit) {
+  (void)cos_bit;
+
+  output[0] = _mm_adds_epi16(input[0], input[0]);
+  output[1] = _mm_adds_epi16(input[1], input[1]);
+  output[2] = _mm_adds_epi16(input[2], input[2]);
+  output[3] = _mm_adds_epi16(input[3], input[3]);
+  output[4] = _mm_adds_epi16(input[4], input[4]);
+  output[5] = _mm_adds_epi16(input[5], input[5]);
+  output[6] = _mm_adds_epi16(input[6], input[6]);
+  output[7] = _mm_adds_epi16(input[7], input[7]);
+}
+
+static INLINE void fidentity8x16_new_sse2(const __m128i *input,
+                                          __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i one = _mm_set1_epi16(1);
+
+  for (int i = 0; i < 16; ++i) {
+    const __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+    const __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+    const __m128i b_lo = scale_round_sse2(a_lo, 2 * NewSqrt2);
+    const __m128i b_hi = scale_round_sse2(a_hi, 2 * NewSqrt2);
+    output[i] = _mm_packs_epi32(b_lo, b_hi);
+  }
+}
+
+static INLINE void fidentity8x32_new_sse2(const __m128i *input,
+                                          __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  for (int i = 0; i < 32; ++i) {
+    output[i] = _mm_slli_epi16(input[i], 2);
+  }
+}
+
+static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = {
+  av1_fdct8x32_new_sse2,   // DCT_DCT
+  NULL,                    // ADST_DCT
+  NULL,                    // DCT_ADST
+  NULL,                    // ADST_ADST
+  NULL,                    // FLIPADST_DCT
+  NULL,                    // DCT_FLIPADST
+  NULL,                    // FLIPADST_FLIPADST
+  NULL,                    // ADST_FLIPADST
+  NULL,                    // FLIPADST_ADST
+  fidentity8x32_new_sse2,  // IDTX
+  av1_fdct8x32_new_sse2,   // V_DCT
+  fidentity8x32_new_sse2,  // H_DCT
+  NULL,                    // V_ADST
+  NULL,                    // H_ADST
+  NULL,                    // V_FLIPADST
+  NULL                     // H_FLIPADST
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
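[Note: the fidentity*_new_sse2 kernels above implement the identity-"transform" stages, which only rescale so that the 2-D column/row pipeline keeps a uniform norm; the per-size gains are sqrt(N/2) — sqrt(2), 2, 2*sqrt(2), 4 for N = 4, 8, 16, 32. A scalar sketch of the 4-point case, assuming the Q12 fixed-point constants NewSqrt2 = 5793 and NewSqrt2Bits = 12 from av1/common/av1_txfm.h (the helper name below is illustrative, not part of the patch):

  static int16_t fidentity_scale_scalar(int16_t x) {
    // round(x * sqrt(2)) via the Q12 constant 5793 ~= sqrt(2) * 4096; the
    // SSE2 version performs the same multiply-add-and-shift, then saturates
    // to 16 bits through _mm_packs_epi32.
    return (int16_t)((x * 5793 + (1 << 11)) >> 12);
  }
]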
diff --git a/libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_avx2.c b/libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_avx2.c
new file mode 100644
index 000000000..b58911fcb
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i dc = _mm_unpacklo_epi16(*p, zero);
+  const __m128i ac = _mm_unpackhi_epi16(*p, zero);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+  qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+  qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+  qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *qp) {
+  __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  if (log_scale) {
+    const __m128i round_scale = _mm_set1_epi16(1 << (15 - log_scale));
+    round = _mm_mulhrs_epi16(round, round_scale);
+  }
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+  init_one_qp(&round, &qp[0]);
+  init_one_qp(&quant, &qp[1]);
+  init_one_qp(&dequant, &qp[2]);
+}
+
+static INLINE void quantize(const __m256i *qp, __m256i *c,
+                            const int16_t *iscan_ptr, int log_scale,
+                            tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                            __m256i *eob) {
+  const __m256i abs_coeff = _mm256_abs_epi32(*c);
+  __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);
+
+  __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
+  __m256i q_hi = _mm256_srli_epi64(q, 32);
+  const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
+  q_hi = _mm256_mul_epi32(q_hi, qp_hi);
+  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
+  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
+  q_hi = _mm256_slli_epi64(q_hi, 32);
+  q = _mm256_or_si256(q_lo, q_hi);
+  const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
+  const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
+  q = _mm256_andnot_si256(mask, q);
+
+  __m256i dq = _mm256_mullo_epi32(q, qp[2]);
+  dq = _mm256_srai_epi32(dq, log_scale);
+  q = _mm256_sign_epi32(q, *c);
+  dq = _mm256_sign_epi32(dq, *c);
+
+  _mm256_storeu_si256((__m256i *)qcoeff, q);
+  _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+  const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
+  const __m128i zr = _mm_setzero_si128();
+  const __m128i lo = _mm_unpacklo_epi16(isc, zr);
+  const __m128i hi = _mm_unpackhi_epi16(isc, zr);
+  const __m256i iscan =
+      _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+
+  const __m256i zero = _mm256_setzero_si256();
+  const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
+  const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
+  __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
+  cur_eob = _mm256_and_si256(cur_eob, nz);
+  *eob = _mm256_max_epi32(cur_eob, *eob);
+}
+
+void av1_highbd_quantize_fp_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t
*dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, int log_scale) { + (void)scan; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 8; + __m256i qp[3], coeff; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); + } +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c b/libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c new file mode 100644 index 000000000..40b3b460b --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_highbd_quantize_sse4.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Coefficient quantization phase 1
+// param[0-3] : round/quant/dequant/dequant-threshold constants
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+                                         const int shift, const int scale,
+                                         __m128i *qcoeff, __m128i *dquan,
+                                         __m128i *sign) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi32(1);
+
+  *sign = _mm_cmplt_epi32(*coeff, zero);
+  *sign = _mm_or_si128(*sign, one);
+  *coeff = _mm_abs_epi32(*coeff);
+
+  qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+  qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
+  qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
+
+  qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+  qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+  dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+  dquan[0] = _mm_srli_epi64(dquan[0], scale);
+  const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+  qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
+}
+
+// Coefficient quantization phase 2
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+                                         const __m128i *sign,
+                                         const __m128i *param, const int shift,
+                                         const int scale, tran_low_t *qAddr,
+                                         tran_low_t *dqAddr) {
+  __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
+  __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
+
+  qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+  qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+  dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+  dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+  // combine L&H
+  qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
+  qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
+
+  qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+  qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+
+  dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
+  dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
+
+  dquan[0] = _mm_and_si128(dquan[0], mask0H);
+  dquan[1] = _mm_and_si128(dquan[1], mask0L);
+
+  qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
+  dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
+
+  qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
+  dquan[0] = _mm_sign_epi32(dquan[0], *sign);
+  qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
+  dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
+  _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
+  _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
+}
+
+static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
+                            __m128i *eob) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i mask, iscanIdx;
+  const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
+  const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
+  __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
+  __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
+
+  nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
+  nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
+
+  mask = _mm_packs_epi32(nz_flag0, nz_flag1);
+  iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
+  iscanIdx = _mm_sub_epi16(iscanIdx, mask);
+  iscanIdx = _mm_and_si128(iscanIdx, mask);
+  *eob = _mm_max_epi16(*eob, iscanIdx);
+}
+
+static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
+  __m128i eob_shuffled;
+  uint16_t eobValue;
+  eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
+  *eob = _mm_max_epi16(*eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
+  *eob = _mm_max_epi16(*eob, eob_shuffled);
+  eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
+  *eob = _mm_max_epi16(*eob, eob_shuffled);
+  eobValue = _mm_extract_epi16(*eob, 0);
+  return
eobValue; +} + +void av1_highbd_quantize_fp_sse4_1( + const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, int log_scale) { + __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign; + __m128i eob = _mm_setzero_si128(); + const tran_low_t *src = coeff_ptr; + tran_low_t *quanAddr = qcoeff_ptr; + tran_low_t *dquanAddr = dqcoeff_ptr; + const int shift = 16 - log_scale; + const int coeff_stride = 4; + const int quan_stride = coeff_stride; + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)scan; + + memset(quanAddr, 0, count * sizeof(quanAddr[0])); + memset(dquanAddr, 0, count * sizeof(dquanAddr[0])); + + coeff[0] = _mm_loadu_si128((__m128i const *)src); + const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale); + const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); + + qparam[0] = _mm_set_epi32(round1, round1, round1, round0); + qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]); + qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]); + qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1], + dequant_ptr[0]); + + // DC and first 3 AC + quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + + // update round/quan/dquan for AC + qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]); + qparam[1] = xx_set1_64_from_32i(quant_ptr[1]); + qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]); + qparam[3] = _mm_set1_epi32(dequant_ptr[1]); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr, dquanAddr); + + // next 4 AC + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); + quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale, + quanAddr + quan_stride, dquanAddr + quan_stride); + + find_eob(quanAddr, iscan, &eob); + + count -= 8; + + // loop for the rest of AC + while (count > 0) { + src += coeff_stride << 1; + quanAddr += quan_stride << 1; + dquanAddr += quan_stride << 1; + iscan += quan_stride << 1; + + coeff[0] = _mm_loadu_si128((__m128i const *)src); + coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride)); + + quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, + log_scale, quanAddr, dquanAddr); + + quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant, + &coeff_sign); + quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, + log_scale, quanAddr + quan_stride, + dquanAddr + quan_stride); + + find_eob(quanAddr, iscan, &eob); + + count -= 8; + } + *eob_ptr = get_accumulated_eob(&eob); +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_quantize_avx2.c b/libs/libaom/src/av1/encoder/x86/av1_quantize_avx2.c new file mode 100644 index 000000000..f5f7ee115 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_quantize_avx2.c @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) {
+  if (sizeof(tran_low_t) == 4) {
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
+    *c = _mm256_packs_epi32(x0, x1);
+    *c = _mm256_permute4x64_epi64(*c, 0xD8);
+  } else {
+    *c = _mm256_loadu_si256((const __m256i *)coeff);
+  }
+}
+
+static INLINE void write_zero(tran_low_t *qcoeff) {
+  const __m256i zero = _mm256_setzero_si256();
+  if (sizeof(tran_low_t) == 4) {
+    _mm256_storeu_si256((__m256i *)qcoeff, zero);
+    _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
+  } else {
+    _mm256_storeu_si256((__m256i *)qcoeff, zero);
+  }
+}
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i ac = _mm_unpackhi_epi64(*p, *p);
+  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(*p), ac, 1);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *thr, __m256i *qp) {
+  __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+  if (log_scale > 0) {
+    const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
+    round = _mm_add_epi16(round, rnd);
+    round = _mm_srai_epi16(round, log_scale);
+  }
+
+  init_one_qp(&round, &qp[0]);
+  init_one_qp(&quant, &qp[1]);
+
+  if (log_scale == 1) {
+    qp[1] = _mm256_slli_epi16(qp[1], log_scale);
+  }
+
+  init_one_qp(&dequant, &qp[2]);
+  *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
+  qp[0] = _mm256_permute2x128_si256(qp[0], qp[0], 0x11);
+  qp[1] = _mm256_permute2x128_si256(qp[1], qp[1], 0x11);
+  qp[2] = _mm256_permute2x128_si256(qp[2], qp[2], 0x11);
+  *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+#define store_quan(q, addr)                               \
+  do {                                                    \
+    __m256i sign_bits = _mm256_srai_epi16(q, 15);         \
+    __m256i y0 = _mm256_unpacklo_epi16(q, sign_bits);     \
+    __m256i y1 = _mm256_unpackhi_epi16(q, sign_bits);     \
+    __m256i x0 = _mm256_permute2x128_si256(y0, y1, 0x20); \
+    __m256i x1 = _mm256_permute2x128_si256(y0, y1, 0x31); \
+    _mm256_storeu_si256((__m256i *)addr, x0);             \
+    _mm256_storeu_si256((__m256i *)addr + 1, x1);         \
+  } while (0)
+
+#define store_two_quan(q, addr1, dq, addr2)      \
+  do {                                           \
+    if (sizeof(tran_low_t) == 4) {               \
+      store_quan(q, addr1);                      \
+      store_quan(dq, addr2);                     \
+    } else {                                     \
+      _mm256_storeu_si256((__m256i *)addr1, q);  \
+      _mm256_storeu_si256((__m256i *)addr2, dq); \
+    }                                            \
+  } while (0)
+
+static INLINE uint16_t quant_gather_eob(__m256i eob) {
+  const __m128i eob_lo = _mm256_castsi256_si128(eob);
+  const __m128i eob_hi = _mm256_extractf128_si256(eob, 1);
+  __m128i eob_s = _mm_max_epi16(eob_lo, eob_hi);
+  eob_s = _mm_subs_epu16(_mm_set1_epi16(INT16_MAX), eob_s);
+  eob_s = _mm_minpos_epu16(eob_s);
+  return INT16_MAX - _mm_extract_epi16(eob_s, 0);
+}
+
+static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
+                            const int16_t
*iscan_ptr, tran_low_t *qcoeff, + tran_low_t *dqcoeff, __m256i *eob) { + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); + q = _mm256_mulhi_epi16(q, qp[1]); + q = _mm256_sign_epi16(q, *c); + const __m256i dq = _mm256_mullo_epi16(q, qp[2]); + + store_two_quan(q, qcoeff, dq, dqcoeff); + const __m256i zero = _mm256_setzero_si256(); + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero); + const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero); + __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff); + cur_eob = _mm256_and_si256(cur_eob, nzero_coeff); + *eob = _mm256_max_epi16(*eob, cur_eob); + } else { + write_zero(qcoeff); + write_zero(dqcoeff); + } +} + +static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr, + __m256i *coeff256) { + const __m256i iscan = _mm256_loadu_si256(iscan_ptr); + const __m256i zero256 = _mm256_setzero_si256(); + const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256); + const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256); + // Add one to convert from indices to counts + const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0); + return _mm256_and_si256(iscan_plus_one, nzero_coeff0); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} + +static INLINE void store_zero_tran_low(int16_t *a) { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)(a), zero); +} + +void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan) { + __m128i eob; + __m256i round256, quant256, dequant256; + __m256i eob256, thr256; + + coeff_ptr += n_coeffs; + scan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + { + __m256i coeff256; + + // Setup global values + { + const __m128i round = _mm_load_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr); + round256 = _mm256_castsi128_si256(round); + round256 = _mm256_permute4x64_epi64(round256, 0x54); + + quant256 = _mm256_castsi128_si256(quant); + quant256 = _mm256_permute4x64_epi64(quant256, 0x54); + + dequant256 = _mm256_castsi128_si256(dequant); + dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); + } + + { + __m256i qcoeff256; + __m256i qtmp256; + coeff256 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs)); + qcoeff256 = _mm256_abs_epi16(coeff256); + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff256); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + _mm256_storeu_si256((__m256i 
*)(dqcoeff_ptr + n_coeffs), coeff256); + } + + eob256 = scan_eob_256((const __m256i *)(scan + n_coeffs), &coeff256); + n_coeffs += 8 * 2; + } + + // remove dc constants + dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); + quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); + round256 = _mm256_permute2x128_si256(round256, round256, 0x31); + + thr256 = _mm256_srai_epi16(dequant256, 1); + + // AC only loop + while (n_coeffs < 0) { + __m256i coeff256 = + _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs)); + __m256i qcoeff256 = _mm256_abs_epi16(coeff256); + int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256)); + + if (nzflag) { + __m256i qtmp256; + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff256); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256); + eob256 = _mm256_max_epi16( + eob256, scan_eob_256((const __m256i *)(scan + n_coeffs), &coeff256)); + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + } + n_coeffs += 8 * 2; + } + + eob = _mm_max_epi16(_mm256_castsi256_si128(eob256), + _mm256_extracti128_si256(eob256, 1)); + + *eob_ptr = accumulate_eob(eob); +} + +void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 16; + + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 0; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); + + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); + quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} + +static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp, + __m256i *c, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); + q = _mm256_mulhi_epu16(q, qp[1]); + + __m256i dq = _mm256_mullo_epi16(q, qp[2]); + dq = _mm256_srli_epi16(dq, 1); + + q = _mm256_sign_epi16(q, *c); + dq = _mm256_sign_epi16(dq, *c); + + store_two_quan(q, qcoeff, dq, dqcoeff); + const __m256i zero = _mm256_setzero_si256(); + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero); + const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero); + __m256i cur_eob 
= _mm256_sub_epi16(iscan, nzero_coeff); + cur_eob = _mm256_and_si256(cur_eob, nzero_coeff); + *eob = _mm256_max_epi16(*eob, cur_eob); + } else { + write_zero(qcoeff); + write_zero(dqcoeff); + } +} + +void av1_quantize_fp_32x32_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 16; + + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 1; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + + update_qp(log_scale, &thr, qp); + + while (n_coeffs > 0) { + read_coeff(coeff_ptr, &coeff); + quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan_ptr += step; + n_coeffs -= step; + } + *eob_ptr = quant_gather_eob(eob); +} + +static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp, + __m256i *c, const int16_t *iscan_ptr, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + __m256i *eob) { + const __m256i abs_coeff = _mm256_abs_epi16(*c); + __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr); + mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr)); + const int nzflag = _mm256_movemask_epi8(mask); + + if (nzflag) { + __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]); + __m256i qh = _mm256_mulhi_epi16(q, qp[1]); + __m256i ql = _mm256_mullo_epi16(q, qp[1]); + qh = _mm256_slli_epi16(qh, 2); + ql = _mm256_srli_epi16(ql, 14); + q = _mm256_or_si256(qh, ql); + const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14); + const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2); + __m256i dq = _mm256_or_si256(dqh, dql); + + q = _mm256_sign_epi16(q, *c); + dq = _mm256_sign_epi16(dq, *c); + + store_two_quan(q, qcoeff, dq, dqcoeff); + const __m256i zero = _mm256_setzero_si256(); + const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr); + const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero); + const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero); + __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff); + cur_eob = _mm256_and_si256(cur_eob, nzero_coeff); + *eob = _mm256_max_epi16(*eob, cur_eob); + } else { + write_zero(qcoeff); + write_zero(dqcoeff); + } +} + +void av1_quantize_fp_64x64_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + const unsigned int step = 16; + + __m256i qp[3]; + __m256i coeff, thr; + const int log_scale = 2; + + init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp); + read_coeff(coeff_ptr, &coeff); + + __m256i eob = _mm256_setzero_si256(); + quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + 
qcoeff_ptr += step;
+  dqcoeff_ptr += step;
+  iscan_ptr += step;
+  n_coeffs -= step;
+
+  update_qp(log_scale, &thr, qp);
+
+  while (n_coeffs > 0) {
+    read_coeff(coeff_ptr, &coeff);
+    quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+    n_coeffs -= step;
+  }
+  *eob_ptr = quant_gather_eob(eob);
+}
diff --git a/libs/libaom/src/av1/encoder/x86/av1_quantize_sse2.c b/libs/libaom/src/av1/encoder/x86/av1_quantize_sse2.c
new file mode 100644
index 000000000..5497c7eb7
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/av1_quantize_sse2.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m128i *c0, __m128i *c1) {
+  const tran_low_t *addr = coeff + offset;
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i x0 = _mm_load_si128((const __m128i *)addr);
+    const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
+    const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
+    const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
+    *c0 = _mm_packs_epi32(x0, x1);
+    *c1 = _mm_packs_epi32(x2, x3);
+  } else {
+    *c0 = _mm_load_si128((const __m128i *)addr);
+    *c1 = _mm_load_si128((const __m128i *)addr + 1);
+  }
+}
+
+static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
+                                tran_low_t *qcoeff, intptr_t offset) {
+  tran_low_t *addr = qcoeff + offset;
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i zero = _mm_setzero_si128();
+    __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
+    __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
+    __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
+    _mm_store_si128((__m128i *)addr, y0);
+    _mm_store_si128((__m128i *)addr + 1, y1);
+
+    sign_bits = _mm_cmplt_epi16(*qc1, zero);
+    y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
+    y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
+    _mm_store_si128((__m128i *)addr + 2, y0);
+    _mm_store_si128((__m128i *)addr + 3, y1);
+  } else {
+    _mm_store_si128((__m128i *)addr, *qc0);
+    _mm_store_si128((__m128i *)addr + 1, *qc1);
+  }
+}
+
+static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
+  const __m128i zero = _mm_setzero_si128();
+  tran_low_t *addr = qcoeff + offset;
+  if (sizeof(tran_low_t) == 4) {
+    _mm_store_si128((__m128i *)addr, zero);
+    _mm_store_si128((__m128i *)addr + 1, zero);
+    _mm_store_si128((__m128i *)addr + 2, zero);
+    _mm_store_si128((__m128i *)addr + 3, zero);
+  } else {
+    _mm_store_si128((__m128i *)addr, zero);
+    _mm_store_si128((__m128i *)addr + 1, zero);
+  }
+}
+
+static INLINE void quantize(const int16_t *iscan_ptr,
+                            const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const __m128i *round0, const __m128i *round1,
+                            const __m128i *quant0, const __m128i *quant1,
+                            const __m128i *dequant0, const __m128i *dequant1,
+                            const __m128i *thr0, const __m128i *thr1,
+                            __m128i *eob) {
+  __m128i
coeff0, coeff1; + // Do DC and first 15 AC + read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1); + + // Poor man's sign extract + const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15); + const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15); + __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0), + _mm_cmpeq_epi16(qcoeff0, *thr0)); + const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1), + _mm_cmpeq_epi16(qcoeff1, *thr1)); + const int nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1); + + if (nzflag) { + qcoeff0 = _mm_adds_epi16(qcoeff0, *round0); + qcoeff1 = _mm_adds_epi16(qcoeff1, *round1); + const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0); + const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs); + + coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0); + coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1); + + write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs); + + const __m128i zero = _mm_setzero_si128(); + // Scan for eob + const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + const __m128i iscan0 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + const __m128i iscan1 = + _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0); + const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1); + const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0); + const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1); + const __m128i eob2 = _mm_max_epi16(eob0, eob1); + *eob = _mm_max_epi16(*eob, eob2); + } else { + write_zero(qcoeff_ptr, n_coeffs); + write_zero(dqcoeff_ptr, n_coeffs); + } +} + +void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + (void)scan_ptr; + (void)zbin_ptr; + (void)quant_shift_ptr; + + coeff_ptr += n_coeffs; + iscan_ptr += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr); + const __m128i round1 = _mm_unpackhi_epi64(round0, round0); + const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0); + const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr); + const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0); + const __m128i thr0 = _mm_srai_epi16(dequant0, 1); + const __m128i thr1 = _mm_srai_epi16(dequant1, 1); + __m128i eob = _mm_setzero_si128(); + + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0, + &round1, &quant0, 
&quant1, &dequant0, &dequant1, &thr0, &thr1, &eob); + + n_coeffs += 8 * 2; + + // AC only loop + while (n_coeffs < 0) { + quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1, + &round1, &quant1, &quant1, &dequant1, &dequant1, &thr1, &thr1, + &eob); + n_coeffs += 8 * 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); + } +} diff --git a/libs/libaom/src/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/libs/libaom/src/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm new file mode 100644 index 000000000..ad4ae274e --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm @@ -0,0 +1,204 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +%macro QUANTIZE_FP 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + cmp dword skipm, 0 + jne .blank + + ; actual quantize loop - setup pointers, rounders, etc. 
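+  ; (Setup below: m1 = round, m2 = quant, m3 = dequant, with the DC constant
+  ; in the low half of each register; punpckhqdq later broadcasts the AC
+  ; constants. For the fp_32x32 variant, round becomes (round + 1) >> 1 and
+  ; quant is doubled -- the same log_scale == 1 adjustment the C code makes.)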
+ movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, fp_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m1, m5 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [r2q] ; m3 = dequant + mov r3, qcoeffmp + mov r4, dqcoeffmp + mov r5, iscanmp +%ifidn %1, fp_32x32 + psllw m2, 1 +%endif + pxor m5, m5 ; m5 = dedicated zero + + lea coeffq, [ coeffq+ncoeffq*2] + lea r5q, [ r5q+ncoeffq*2] + lea r3q, [ r3q+ncoeffq*2] + lea r4q, [r4q+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [r3q+ncoeffq*2+ 0], m8 + mova [r3q+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; r4[i] = r3[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; r4[i] = r3[i] * q +%ifidn %1, fp_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 + psrlw m0, m3, 2 +%else + psrlw m0, m3, 1 +%endif + mova [r4q+ncoeffq*2+ 0], m8 + mova [r4q+ncoeffq*2+16], m13 + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz .accumulate_eob + +.ac_only_loop: + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + + pcmpgtw m7, m6, m0 + pcmpgtw m12, m11, m0 + pmovmskb r6d, m7 + pmovmskb r2d, m12 + + or r6, r2 + jz .skip_iter + + pcmpeqw m7, m7 + + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + mova [r3q+ncoeffq*2+ 0], m14 + mova [r3q+ncoeffq*2+16], m13 +%ifidn %1, fp_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; r4[i] = r3[i] * q + pmullw m13, m3 ; r4[i] = r3[i] * q +%ifidn %1, fp_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + mova [r4q+ncoeffq*2+ 0], m14 + mova [r4q+ncoeffq*2+16], m13 + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m7 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + + jmp .accumulate_eob +.skip_iter: + mova [r3q+ncoeffq*2+ 0], m5 + mova [r3q+ncoeffq*2+16], m5 + mova [r4q+ncoeffq*2+ 0], m5 + mova [r4q+ncoeffq*2+16], m5 + add ncoeffq, mmsize + jl .ac_only_loop + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw 
m8, m7 + pextrw r6, m8, 0 + mov [r2], r6 + RET + + ; skip-block, i.e. just write all zeroes +.blank: + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + + lea r0q, [r0q+ncoeffq*2] + lea r2q, [r2q+ncoeffq*2] + neg ncoeffq + pxor m7, m7 +.blank_loop: + mova [r0q+ncoeffq*2+ 0], m7 + mova [r0q+ncoeffq*2+16], m7 + mova [r2q+ncoeffq*2+ 0], m7 + mova [r2q+ncoeffq*2+16], m7 + add ncoeffq, mmsize + jl .blank_loop + mov word [r3q], 0 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FP fp, 7 +QUANTIZE_FP fp_32x32, 7 diff --git a/libs/libaom/src/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/libs/libaom/src/av1/encoder/x86/av1_ssim_opt_x86_64.asm new file mode 100644 index 000000000..faa2a232a --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_ssim_opt_x86_64.asm @@ -0,0 +1,222 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddd xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddd xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddd xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro + +SECTION .text + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. 
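+; Scalar equivalent of the kernel below (an orientation sketch, not part of
+; the patch): for every pixel pair (s, r) in the window it accumulates
+;   *sum_s += s;          *sum_r += r;
+;   *sum_sq_s += s * s;   *sum_sq_r += r * r;
+;   *sum_sxr += s * r;
+; which is exactly what TABULATE_SSIM does for each batch of eight pixels.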
+global sym(av1_ssim_parms_16x16_sse2) PRIVATE +sym(av1_ssim_parms_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +.NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +global sym(av1_ssim_parms_8x8_sse2) PRIVATE +sym(av1_ssim_parms_8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +.NextRow: + + ;grab source and reference pixels + movq xmm3, [rsi] + movq xmm4, [rdi] + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/libs/libaom/src/av1/encoder/x86/av1_txfm1d_sse4.h b/libs/libaom/src/av1/encoder/x86/av1_txfm1d_sse4.h new file mode 100644 index 000000000..7a0f32898 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/av1_txfm1d_sse4.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct8_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct16_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+                       const int stride);
+void av1_fdct64_sse4_1(__m128i *input, __m128i *output, int8_t cos_bit,
+                       const int instride, const int outstride);
+void av1_fadst4_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst8_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst16_sse4_1(const __m128i *input, __m128i *output,
+                        const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idct4_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct8_sse4_1(const __m128i *input, __m128i *output,
+                      const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct16_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct32_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct64_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst8_sse4_1(const __m128i *input, __m128i *output,
+                       const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst16_sse4_1(const __m128i *input, __m128i *output,
+                        const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idtx32_sse4_1(__m128i *input, __m128i *output, int cos_bit,
+                       const int col_num);
+
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+                                    __m128i *output) {
+  __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+  __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+  output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+  output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+  output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+  output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// The entire input block can be represented as a grid of 4x4 blocks;
+// each 4x4 block can be represented by 4 vertical __m128i vectors.
+// We first transpose each 4x4 block internally,
+// then transpose the grid.
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+                                __m128i *output) {
+  const int num_per_128 = 4;
+  const int row_size = txfm_size;
+  const int col_size = txfm_size / num_per_128;
+  int r, c;
+
+  // transpose each 4x4 block internally
+  for (r = 0; r < row_size; r += 4) {
+    for (c = 0; c < col_size; c++) {
+      transpose_32_4x4(col_size, &input[r * col_size + c],
+                       &output[c * 4 * col_size + r / 4]);
+    }
+  }
+}
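+
+/* Usage sketch (illustrative, not part of the patch): a 32-bit 8x8 tile
+   occupies txfm_size * txfm_size / 4 = 16 __m128i registers, row-major with
+   col_size = 2 registers per row, and is transposed between two buffers:
+
+     __m128i in[16], out[16];
+     // ... fill `in` row-major, four lanes per register ...
+     transpose_32(8, in, out);  // `out` now holds the transposed tile
+*/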
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+  do {                                                         \
+    const __m128i ww0 = _mm_set1_epi32(w0);                    \
+    const __m128i ww1 = _mm_set1_epi32(w1);                    \
+    const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0);          \
+    const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1);          \
+    out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
+    out0 = av1_round_shift_32_sse4_1(out0, bit);               \
+    const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1);          \
+    const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0);          \
+    out1 = _mm_sub_epi32(in0_w1, in1_w0);                      \
+    out1 = av1_round_shift_32_sse4_1(out1, bit);               \
+  } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+  do {                                                         \
+    btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit);    \
+  } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+  do {                                                                  \
+    const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0);                   \
+    const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1);                   \
+    out0 = _mm_add_epi32(in0_w0, in1_w1);                               \
+    out0 = _mm_add_epi32(out0, r);                                      \
+    out0 = _mm_srai_epi32(out0, bit);                                   \
+    const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1);                   \
+    const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0);                   \
+    out1 = _mm_sub_epi32(in0_w1, in1_w0);                               \
+    out1 = _mm_add_epi32(out1, r);                                      \
+    out1 = _mm_srai_epi32(out1, bit);                                   \
+  } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+  do {                                                                  \
+    btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit);    \
+  } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
diff --git a/libs/libaom/src/av1/encoder/x86/corner_match_avx2.c b/libs/libaom/src/av1/encoder/x86/corner_match_avx2.c
new file mode 100644
index 000000000..8d7eb3f03
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/corner_match_avx2.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <immintrin.h>
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "av1/encoder/corner_match.h"
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255,
+                                   255, 255, 255, 255, 255, 0, 0, 0 };
+#if MATCH_SZ != 13
+#error "Need to change byte_mask in corner_match_avx2.c if MATCH_SZ != 13"
+#endif
+
+/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the
+correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/
+double av1_compute_cross_correlation_avx2(unsigned char *im1, int stride1,
+                                          int x1, int y1, unsigned char *im2,
+                                          int stride2, int x2, int y2) {
+  int i, stride1_i = 0, stride2_i = 0;
+  __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1;
+  const __m128i mask = _mm_load_si128((__m128i *)byte_mask);
+  const __m256i zero = _mm256_setzero_si256();
+  __m128i v1, v2;
+
+  sum_vec = zero;
+  sumsq2_vec = zero;
+  cross_vec = zero;
+
+  im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);
+  im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+
+  for (i = 0; i < MATCH_SZ; ++i) {
+    v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[stride1_i]), mask);
+    v1_1 = _mm256_cvtepu8_epi16(v1);
+    v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[stride2_i]), mask);
+    v2_1 = _mm256_cvtepu8_epi16(v2);
+
+    v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1);
+    sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1));
+
+    sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero));
+    cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1));
+    stride1_i += stride1;
+    stride2_i += stride2;
+  }
+  __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8);
+  sum_vec = _mm256_add_epi32(sum_vec, sum_vec1);
+  int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec));
+  int sum2_acc = _mm256_extract_epi32(sum_vec, 4);
+
+  __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec);
+  __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec);
+  temp1 = _mm256_add_epi32(unp_low, unp_hig);
+
+  __m128i low_sumsq = _mm256_castsi256_si128(temp1);
+  low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1));
+  low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32));
+  int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq);
+  int cross_acc = _mm_extract_epi32(low_sumsq, 2);
+
+  int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc;
+  int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc;
+  aom_clear_system_state();
+  return cov / sqrt((double)var2);
+}
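[Note: both corner-match kernels compute the same statistic. A scalar sketch, with N = MATCH_SZ * MATCH_SZ (the function name below is hypothetical, not part of the patch):

  double compute_cross_correlation_scalar(const unsigned char *a, int as,
                                          const unsigned char *b, int bs) {
    int sum1 = 0, sum2 = 0, sumsq2 = 0, cross = 0;
    for (int i = 0; i < MATCH_SZ; ++i) {
      for (int j = 0; j < MATCH_SZ; ++j) {
        sum1 += a[i * as + j];
        sum2 += b[i * bs + j];
        sumsq2 += b[i * bs + j] * b[i * bs + j];
        cross += a[i * as + j] * b[i * bs + j];
      }
    }
    const int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;  // N^2 * Var(b)
    const int cov = cross * MATCH_SZ_SQ - sum1 * sum2;    // N^2 * Cov(a, b)
    return cov / sqrt((double)var2);
  }
]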
+double av1_compute_cross_correlation_avx2(unsigned char *im1, int stride1,
+                                          int x1, int y1, unsigned char *im2,
+                                          int stride2, int x2, int y2) {
+  int i, stride1_i = 0, stride2_i = 0;
+  __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1;
+  const __m128i mask = _mm_load_si128((__m128i *)byte_mask);
+  const __m256i zero = _mm256_setzero_si256();
+  __m128i v1, v2;
+
+  sum_vec = zero;
+  sumsq2_vec = zero;
+  cross_vec = zero;
+
+  im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);
+  im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+
+  for (i = 0; i < MATCH_SZ; ++i) {
+    v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[stride1_i]), mask);
+    v1_1 = _mm256_cvtepu8_epi16(v1);
+    v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[stride2_i]), mask);
+    v2_1 = _mm256_cvtepu8_epi16(v2);
+
+    v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1);
+    sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1));
+
+    sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero));
+    cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1));
+    stride1_i += stride1;
+    stride2_i += stride2;
+  }
+  __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8);
+  sum_vec = _mm256_add_epi32(sum_vec, sum_vec1);
+  int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec));
+  int sum2_acc = _mm256_extract_epi32(sum_vec, 4);
+
+  __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec);
+  __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec);
+  temp1 = _mm256_add_epi32(unp_low, unp_hig);
+
+  __m128i low_sumsq = _mm256_castsi256_si128(temp1);
+  low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1));
+  low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32));
+  int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq);
+  int cross_acc = _mm_extract_epi32(low_sumsq, 2);
+
+  int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc;
+  int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc;
+  aom_clear_system_state();
+  return cov / sqrt((double)var2);
+}
diff --git a/libs/libaom/src/av1/encoder/x86/corner_match_sse4.c b/libs/libaom/src/av1/encoder/x86/corner_match_sse4.c
new file mode 100644
index 000000000..5c9ca207e
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/corner_match_sse4.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "av1/encoder/corner_match.h"
+
+DECLARE_ALIGNED(16, static const uint8_t,
+                byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255,
+                                   255, 255, 255, 255, 255, 0, 0, 0 };
+#if MATCH_SZ != 13
+#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13"
+#endif
+
+/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the
+   correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+   of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/ +double av1_compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, + int x1, int y1, unsigned char *im2, + int stride2, int x2, int y2) { + int i; + // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0, + // 2) + __m128i sum1_vec = _mm_setzero_si128(); + __m128i sum2_vec = _mm_setzero_si128(); + // 4 32-bit partial sums of squares + __m128i sumsq2_vec = _mm_setzero_si128(); + __m128i cross_vec = _mm_setzero_si128(); + + const __m128i mask = _mm_load_si128((__m128i *)byte_mask); + const __m128i zero = _mm_setzero_si128(); + + im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); + im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); + + for (i = 0; i < MATCH_SZ; ++i) { + const __m128i v1 = + _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[i * stride1]), mask); + const __m128i v2 = + _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[i * stride2]), mask); + + // Using the 'sad' intrinsic here is a bit faster than adding + // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit + // conversion step later, for a net speedup of ~10% + sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero)); + sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero)); + + const __m128i v1_l = _mm_cvtepu8_epi16(v1); + const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8)); + const __m128i v2_l = _mm_cvtepu8_epi16(v2); + const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8)); + + sumsq2_vec = _mm_add_epi32( + sumsq2_vec, + _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r))); + cross_vec = _mm_add_epi32( + cross_vec, + _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r))); + } + + // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec, + // cross_vec) + // as holding 4 32-bit elements each, which we want to sum horizontally. + // We do this by transposing and then summing vertically. + __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec); + __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec); + __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec); + __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec); + + __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2); + __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2); + __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3); + __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3); + + __m128i res = + _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7)); + + int sum1 = _mm_extract_epi32(res, 0); + int sum2 = _mm_extract_epi32(res, 1); + int sumsq2 = _mm_extract_epi32(res, 2); + int cross = _mm_extract_epi32(res, 3); + + int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; + int cov = cross * MATCH_SZ_SQ - sum1 * sum2; + aom_clear_system_state(); + return cov / sqrt((double)var2); +} diff --git a/libs/libaom/src/av1/encoder/x86/dct_sse2.asm b/libs/libaom/src/av1/encoder/x86/dct_sse2.asm new file mode 100644 index 000000000..b18554818 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/dct_sse2.asm @@ -0,0 +1,82 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+; + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro TRANSFORM_COLS 0 + paddw m0, m1 + movq m4, m0 + psubw m3, m2 + psubw m4, m3 + psraw m4, 1 + movq m5, m4 + psubw m5, m1 ;b1 + psubw m4, m2 ;c1 + psubw m0, m4 + paddw m3, m5 + ; m0 a0 + SWAP 1, 4 ; m1 c1 + SWAP 2, 3 ; m2 d1 + SWAP 3, 5 ; m3 b1 +%endmacro + +%macro TRANSPOSE_4X4 0 + ; 00 01 02 03 + ; 10 11 12 13 + ; 20 21 22 23 + ; 30 31 32 33 + punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13 + punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33 + mova m1, m0 + punpckldq m0, m2 ; 00 10 20 30 01 11 21 31 + punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33 +%endmacro + +INIT_XMM sse2 +cglobal fwht4x4, 3, 4, 8, input, output, stride + lea r3q, [inputq + strideq*4] + movq m0, [inputq] ;a1 + movq m1, [inputq + strideq*2] ;b1 + movq m2, [r3q] ;c1 + movq m3, [r3q + strideq*2] ;d1 + + TRANSFORM_COLS + TRANSPOSE_4X4 + SWAP 1, 2 + psrldq m1, m0, 8 + psrldq m3, m2, 8 + TRANSFORM_COLS + TRANSPOSE_4X4 + + psllw m0, 2 + psllw m1, 2 + + ; sign extension + mova m2, m0 + mova m3, m1 + punpcklwd m0, m0 + punpcklwd m1, m1 + punpckhwd m2, m2 + punpckhwd m3, m3 + psrad m0, 16 + psrad m1, 16 + psrad m2, 16 + psrad m3, 16 + mova [outputq], m0 + mova [outputq + 16], m2 + mova [outputq + 32], m1 + mova [outputq + 48], m3 + + RET diff --git a/libs/libaom/src/av1/encoder/x86/encodetxb_avx2.c b/libs/libaom/src/av1/encoder/x86/encodetxb_avx2.c new file mode 100644 index 000000000..30a412909 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/encodetxb_avx2.c @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+#include <immintrin.h>  /* AVX2 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
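+/* Editorial note, scalar sketch of the output layout produced below
+   (exposition only, details approximated from the kernel itself): each of
+   the `height` rows holds the saturated absolute coefficient values
+   followed by TX_PAD_HOR zero bytes, with TX_PAD_BOTTOM zeroed rows of
+   padding after the block:
+     for (int r = 0; r < height; r++) {
+       for (int c = 0; c < width; c++)
+         levels[r * stride + c] =
+             (uint8_t)(abs(coeff[r * width + c]) > 127
+                           ? 127
+                           : abs(coeff[r * width + c]));
+       memset(levels + r * stride + width, 0, TX_PAD_HOR);
+     }
+*/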
+void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
+                              const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+  const __m256i y_zeros = _mm256_setzero_si256();
+
+  const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+  uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride;
+  uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
+
+  do {
+    yy_storeu_256(bottom_buf, y_zeros);
+    bottom_buf += 32;
+  } while (bottom_buf < bottom_buf_end);
+
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (width == 4) {
+    do {
+      const __m256i c0 = yy_loadu_256(cf);
+      const __m256i c1 = yy_loadu_256(cf + 8);
+      const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
+      const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);
+      const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
+      const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);
+      yy_storeu_256(ls, res);
+      ls += 32;
+      cf += 16;
+      i += 4;
+    } while (i < height);
+  } else if (width == 8) {
+    do {
+      const __m256i coeffA = yy_loadu_256(cf);
+      const __m256i coeffB = yy_loadu_256(cf + 8);
+      const __m256i coeffC = yy_loadu_256(cf + 16);
+      const __m256i coeffD = yy_loadu_256(cf + 24);
+      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+      const __m256i absAB = _mm256_abs_epi16(coeffAB);
+      const __m256i absCD = _mm256_abs_epi16(coeffCD);
+      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+      const __m128i res0 = _mm256_castsi256_si128(res);
+      const __m128i res1 = _mm256_extracti128_si256(res, 1);
+      xx_storel_64(ls, res0);
+      *(int32_t *)(ls + width) = 0;
+      xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
+      *(int32_t *)(ls + width + stride) = 0;
+      xx_storel_64(ls + stride * 2, res1);
+      *(int32_t *)(ls + width + stride * 2) = 0;
+      xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
+      *(int32_t *)(ls + width + stride * 3) = 0;
+      cf += 32;
+      ls += stride << 2;
+      i += 4;
+    } while (i < height);
+  } else if (width == 16) {
+    do {
+      const __m256i coeffA = yy_loadu_256(cf);
+      const __m256i coeffB = yy_loadu_256(cf + 8);
+      const __m256i coeffC = yy_loadu_256(cf + 16);
+      const __m256i coeffD = yy_loadu_256(cf + 24);
+      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+      const __m256i absAB = _mm256_abs_epi16(coeffAB);
+      const __m256i absCD = _mm256_abs_epi16(coeffCD);
+      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+      xx_storeu_128(ls, _mm256_castsi256_si128(res));
+      xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
+      cf += 32;
+      *(int32_t *)(ls + width) = 0;
+      *(int32_t *)(ls + stride + width) = 0;
+      ls += stride << 1;
+      i += 2;
+    } while (i < height);
+  } else {
+    do {
+      const __m256i coeffA = yy_loadu_256(cf);
+      const __m256i coeffB = yy_loadu_256(cf + 8);
+      const __m256i coeffC = yy_loadu_256(cf + 16);
+      const __m256i coeffD = yy_loadu_256(cf + 24);
+      const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+      const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+      const __m256i absAB = _mm256_abs_epi16(coeffAB);
+      const __m256i absCD = _mm256_abs_epi16(coeffCD);
+      const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+      const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+      const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+      yy_storeu_256(ls, res);
+      cf += 32;
+      *(int32_t *)(ls + width) = 0;
+      ls += stride;
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/libs/libaom/src/av1/encoder/x86/encodetxb_sse2.c b/libs/libaom/src/av1/encoder/x86/encodetxb_sse2.c
new file mode 100644
index 000000000..394befb7b
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/encodetxb_sse2.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+
+static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
+                                          const int stride,
+                                          const ptrdiff_t *const offsets,
+                                          __m128i *const level) {
+  level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
+  level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
+  level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
+  level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
+  level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
+                                          const int stride,
+                                          const ptrdiff_t *const offsets,
+                                          __m128i *const level) {
+  level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
+  level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
+  level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
+  level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
+  level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
+                                           const int stride,
+                                           const ptrdiff_t *const offsets,
+                                           __m128i *const level) {
+  level[0] = _mm_loadu_si128((__m128i *)(src + 1));
+  level[1] = _mm_loadu_si128((__m128i *)(src + stride));
+  level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
+  level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
+  level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
+}
+
+static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
+  const __m128i const_3 = _mm_set1_epi8(3);
+  const __m128i const_4 = _mm_set1_epi8(4);
+  __m128i count;
+
+  count = _mm_min_epu8(level[0], const_3);
+  level[1] = _mm_min_epu8(level[1], const_3);
+  level[2] = _mm_min_epu8(level[2], const_3);
+  level[3] = _mm_min_epu8(level[3], const_3);
+  level[4] = _mm_min_epu8(level[4], const_3);
+  count = _mm_add_epi8(count, level[1]);
+  count = _mm_add_epi8(count, level[2]);
+  count = _mm_add_epi8(count, level[3]);
+  count = _mm_add_epi8(count, level[4]);
+  count = _mm_avg_epu8(count,
_mm_setzero_si128()); + count = _mm_min_epu8(count, const_4); + return count; +} + +static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *const coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(21); + __m128i pos_to_offset = + (height == 4) + ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21) + : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, + 21, 21); + __m128i count; + __m128i level[5]; + int8_t *cc = coeff_contexts; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + cc += 16; + row -= 4; + } while (row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 4 * stride; + coeff_contexts += 16; + row -= 4; + } while (row); +} + +static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 4 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(height % 4)); + + do { + load_levels_4x4x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 4 * stride; + coeff_contexts += 16; + row -= 4; + } while (row); +} + +static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + int8_t *cc = coeff_contexts; + int row = height; + __m128i count; + __m128i level[5]; + __m128i pos_to_offset[3]; + + assert(!(height % 2)); + + if (height == 
8) { + pos_to_offset[0] = + _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } else if (height < 8) { + pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, + 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, + 21, 21, 21, 21, 21); + } else { + pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11); + pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, + 21, 21, 21, 21, 21); + } + pos_to_offset[2] = _mm_set1_epi8(21); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += 2 * stride; + cc += 16; + row -= 2; + } while (row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int row = height; + __m128i count; + __m128i level[5]; + + assert(!(height % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 2 * stride; + coeff_contexts += 16; + row -= 2; + } while (row); +} + +static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels, + const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = 8 + TX_PAD_HOR; + const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5); + int row = height; + __m128i count; + __m128i level[5]; + + assert(!(height % 2)); + + do { + load_levels_8x2x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 2 * stride; + coeff_contexts += 16; + row -= 2; + } while (row); +} + +static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels, + const int real_width, + const int real_height, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = width + TX_PAD_HOR; + int8_t *cc = 
coeff_contexts; + int row = height; + __m128i pos_to_offset[5]; + __m128i pos_to_offset_large[3]; + __m128i count; + __m128i level[5]; + + assert(!(width % 16)); + + pos_to_offset_large[2] = _mm_set1_epi8(21); + if (real_width == real_height) { + pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] = + pos_to_offset_large[2]; + } else if (real_width > real_height) { + pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8( + 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21); + pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2]; + } else { // real_width < real_height + pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8( + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11); + pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21); + pos_to_offset[4] = pos_to_offset_large[2]; + pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11); + } + + do { + int w = width; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)cc, count); + levels += 16; + cc += 16; + w -= 16; + pos_to_offset[0] = pos_to_offset_large[0]; + } while (w); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + pos_to_offset[2] = pos_to_offset[3]; + pos_to_offset[3] = pos_to_offset[4]; + pos_to_offset_large[0] = pos_to_offset_large[1]; + pos_to_offset_large[1] = pos_to_offset_large[2]; + levels += TX_PAD_HOR; + } while (--row); + + coeff_contexts[0] = 0; +} + +static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = width + TX_PAD_HOR; + const __m128i pos_to_offset_large = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(width % 16)); + + do { + __m128i pos_to_offset = + _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, 
SIG_COEF_CONTEXTS_2D + 10, + SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10); + int w = width; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset); + _mm_store_si128((__m128i *)coeff_contexts, count); + pos_to_offset = pos_to_offset_large; + levels += 16; + coeff_contexts += 16; + w -= 16; + } while (w); + + levels += TX_PAD_HOR; + } while (--row); +} + +static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels, + const int width, const int height, + const ptrdiff_t *const offsets, + int8_t *coeff_contexts) { + const int stride = width + TX_PAD_HOR; + __m128i pos_to_offset[3]; + __m128i count; + __m128i level[5]; + int row = height; + + assert(!(width % 16)); + + pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0); + pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5); + pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10); + + do { + int w = width; + + do { + load_levels_16x1x5_sse2(levels, stride, offsets, level); + count = get_coeff_contexts_kernel_sse2(level); + count = _mm_add_epi8(count, pos_to_offset[0]); + _mm_store_si128((__m128i *)coeff_contexts, count); + levels += 16; + coeff_contexts += 16; + w -= 16; + } while (w); + + pos_to_offset[0] = pos_to_offset[1]; + pos_to_offset[1] = pos_to_offset[2]; + levels += TX_PAD_HOR; + } while (--row); +} + +// Note: levels[] must be in the range [0, 127], inclusive. +void av1_get_nz_map_contexts_sse2(const uint8_t *const levels, + const int16_t *const scan, const uint16_t eob, + const TX_SIZE tx_size, + const TX_CLASS tx_class, + int8_t *const coeff_contexts) { + const int last_idx = eob - 1; + if (!last_idx) { + coeff_contexts[0] = 0; + return; + } + + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const int width = get_txb_wide(tx_size); + const int height = get_txb_high(tx_size); + const int stride = width + TX_PAD_HOR; + ptrdiff_t offsets[3]; + + /* coeff_contexts must be 16 byte aligned. 
*/ + assert(!((intptr_t)coeff_contexts & 0xf)); + + if (tx_class == TX_CLASS_2D) { + offsets[0] = 0 * stride + 2; + offsets[1] = 1 * stride + 1; + offsets[2] = 2 * stride + 0; + + if (width == 4) { + get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts); + } else if (width == 8) { + get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts); + } else if (width == 16) { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height, + offsets, coeff_contexts); + } + } else if (tx_class == TX_CLASS_HORIZ) { + offsets[0] = 2; + offsets[1] = 3; + offsets[2] = 4; + if (width == 4) { + get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts); + } else if (width == 8) { + get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_hor(levels, width, height, offsets, + coeff_contexts); + } + } else { // TX_CLASS_VERT + offsets[0] = 2 * stride; + offsets[1] = 3 * stride; + offsets[2] = 4 * stride; + if (width == 4) { + get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts); + } else if (width == 8) { + get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts); + } else { + get_16n_coeff_contexts_ver(levels, width, height, offsets, + coeff_contexts); + } + } + + const int bwl = get_txb_bwl(tx_size); + const int pos = scan[last_idx]; + if (last_idx <= (height << bwl) / 8) + coeff_contexts[pos] = 1; + else if (last_idx <= (height << bwl) / 4) + coeff_contexts[pos] = 2; + else + coeff_contexts[pos] = 3; +} diff --git a/libs/libaom/src/av1/encoder/x86/encodetxb_sse4.c b/libs/libaom/src/av1/encoder/x86/encodetxb_sse4.c new file mode 100644 index 000000000..aeb57f2cd --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/encodetxb_sse4.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
+                                const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+  const __m128i zeros = _mm_setzero_si128();
+
+  const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+  uint8_t *bottom_buf = levels + stride * height;
+  uint8_t *bottom_buf_end = bottom_buf + bottom_len;
+  do {
+    _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
+    bottom_buf += 16;
+  } while (bottom_buf < bottom_buf_end);
+
+  int i = 0;
+  uint8_t *ls = levels;
+  const tran_low_t *cf = coeff;
+  if (width == 4) {
+    do {
+      const __m128i coeffA = xx_loadu_128(cf);
+      const __m128i coeffB = xx_loadu_128(cf + 4);
+      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+      const __m128i absAB = _mm_abs_epi16(coeffAB);
+      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+      const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
+      xx_storeu_128(ls, lsAB);
+      ls += (stride << 1);
+      cf += (width << 1);
+      i += 2;
+    } while (i < height);
+  } else if (width == 8) {
+    do {
+      const __m128i coeffA = xx_loadu_128(cf);
+      const __m128i coeffB = xx_loadu_128(cf + 4);
+      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+      const __m128i absAB = _mm_abs_epi16(coeffAB);
+      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+      xx_storeu_128(ls, absAB8);
+      ls += stride;
+      cf += width;
+      i += 1;
+    } while (i < height);
+  } else {
+    do {
+      int j = 0;
+      do {
+        const __m128i coeffA = xx_loadu_128(cf);
+        const __m128i coeffB = xx_loadu_128(cf + 4);
+        const __m128i coeffC = xx_loadu_128(cf + 8);
+        const __m128i coeffD = xx_loadu_128(cf + 12);
+        const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+        const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
+        const __m128i absAB = _mm_abs_epi16(coeffAB);
+        const __m128i absCD = _mm_abs_epi16(coeffCD);
+        const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
+        xx_storeu_128(ls + j, absABCD);
+        j += 16;
+        cf += 16;
+      } while (j < width);
+      *(int32_t *)(ls + width) = 0;
+      ls += stride;
+      i += 1;
+    } while (i < height);
+  }
+}
diff --git a/libs/libaom/src/av1/encoder/x86/error_intrin_avx2.c b/libs/libaom/src/av1/encoder/x86/error_intrin_avx2.c
new file mode 100644
index 000000000..12dda3ad0
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/error_intrin_avx2.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>  // AVX2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m256i *c) {
+  const tran_low_t *addr = coeff + offset;
+
+  if (sizeof(tran_low_t) == 4) {
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+    const __m256i y = _mm256_packs_epi32(x0, x1);
+    *c = _mm256_permute4x64_epi64(y, 0xD8);
+  } else {
+    *c = _mm256_loadu_si256((const __m256i *)addr);
+  }
+}
+
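+/* Editorial note, scalar view of the reduction vectorized below (exposition
+   only, not part of the upstream file):
+     int64_t sse = 0;
+     for (intptr_t i = 0; i < block_size; ++i) {
+       const int d = dqcoeff[i] - coeff[i];
+       sse += d * d;
+     }
+*/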
+int64_t av1_block_error_lp_avx2(const int16_t *coeff, const int16_t *dqcoeff,
+                                intptr_t block_size) {
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i sse_256 = zero;
+  __m256i sse_hi;
+  __m128i sse_128;
+  int64_t sse;
+
+  if (block_size == 16) {
+    // Load 16 elements for coeff and dqcoeff.
+    const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
+    const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
+    // dqcoeff - coeff
+    const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+    // madd (dqcoeff - coeff)
+    const __m256i error_lo = _mm256_madd_epi16(diff, diff);
+    // Save the higher 64 bit of each 128 bit lane.
+    const __m256i error_hi = _mm256_srli_si256(error_lo, 8);
+    // Add the higher 64 bit to the low 64 bit.
+    const __m256i error = _mm256_add_epi32(error_lo, error_hi);
+    // Expand each double word in the lower 64 bits to quad word.
+    sse_256 = _mm256_unpacklo_epi32(error, zero);
+  } else {
+    for (int i = 0; i < block_size; i += 16) {
+      // Load 16 elements for coeff and dqcoeff.
+      const __m256i _coeff = _mm256_loadu_si256((const __m256i *)coeff);
+      const __m256i _dqcoeff = _mm256_loadu_si256((const __m256i *)dqcoeff);
+      const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+      const __m256i error = _mm256_madd_epi16(diff, diff);
+      // Expand each double word of madd (dqcoeff - coeff) to quad word.
+      const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero);
+      const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero);
+      // Add each quad word of madd (dqcoeff - coeff).
+      sse_256 = _mm256_add_epi64(sse_256, exp_error_lo);
+      sse_256 = _mm256_add_epi64(sse_256, exp_error_hi);
+      coeff += 16;
+      dqcoeff += 16;
+    }
+  }
+  // Save the higher 64 bit of each 128 bit lane.
+  sse_hi = _mm256_srli_si256(sse_256, 8);
+  // Add the higher 64 bit to the low 64 bit.
+  sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+
+  // Add each 64 bit from each of the 128 bit lane of the 256 bit.
+  sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+                          _mm256_extractf128_si256(sse_256, 1));
+
+  // Store the results.
+ _mm_storel_epi64((__m128i *)&sse, sse_128); + return sse; +} + +int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg; + __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi; + __m256i sse_reg_64hi, ssz_reg_64hi; + __m128i sse_reg128, ssz_reg128; + int64_t sse; + int i; + const __m256i zero_reg = _mm256_setzero_si256(); + + // init sse and ssz registerd to zero + sse_reg = _mm256_setzero_si256(); + ssz_reg = _mm256_setzero_si256(); + + for (i = 0; i < block_size; i += 16) { + // load 32 bytes from coeff and dqcoeff + read_coeff(coeff, i, &coeff_reg); + read_coeff(dqcoeff, i, &dqcoeff_reg); + // dqcoeff - coeff + dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg); + // madd (dqcoeff - coeff) + dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg); + // madd coeff + coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg); + // expand each double word of madd (dqcoeff - coeff) to quad word + exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg); + exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg); + // expand each double word of madd (coeff) to quad word + exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg); + exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg); + // add each quad word of madd (dqcoeff - coeff) and madd (coeff) + sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo); + ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo); + sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi); + ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi); + } + // save the higher 64 bit of each 128 bit lane + sse_reg_64hi = _mm256_srli_si256(sse_reg, 8); + ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8); + // add the higher 64 bit to the low 64 bit + sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi); + ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi); + + // add each 64 bit from each of the 128 bit lane of the 256 bit + sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg), + _mm256_extractf128_si256(sse_reg, 1)); + + ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg), + _mm256_extractf128_si256(ssz_reg, 1)); + + // store the results + _mm_storel_epi64((__m128i *)(&sse), sse_reg128); + + _mm_storel_epi64((__m128i *)(ssz), ssz_reg128); + _mm256_zeroupper(); + return sse; +} diff --git a/libs/libaom/src/av1/encoder/x86/error_sse2.asm b/libs/libaom/src/av1/encoder/x86/error_sse2.asm new file mode 100644 index 000000000..f4b496897 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/error_sse2.asm @@ -0,0 +1,88 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +; Increment %1 by sizeof() tran_low_t * %2. +%macro INCREMENT_ELEMENTS_TRAN_LOW 2 + lea %1, [%1 + %2 * 4] +%endmacro + +; Load %2 + %3 into m%1. +; %3 is the offset in elements, not bytes. +; If tran_low_t is 16 bits (low bit depth configuration) then load the value +; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack +; the values down to 16 bits. 
+%macro LOAD_TRAN_LOW 3 + mova m%1, [%2 + (%3) * 4] + packssdw m%1, [%2 + (%3) * 4 + 16] +%endmacro + +%define private_prefix av1 + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, +; int64_t *ssz) + +INIT_XMM sse2 +cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz + pxor m4, m4 ; sse accumulator + pxor m6, m6 ; ssz accumulator + pxor m5, m5 ; dedicated zero register +.loop: + LOAD_TRAN_LOW 2, uqcq, 0 + LOAD_TRAN_LOW 0, dqcq, 0 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 + INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 + INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 + sub sizeq, 16 + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + ; the sum of 2 31bit integers will fit in a 32bit unsigned integer + paddd m0, m1 + paddd m2, m3 + ; accumulate in 64bit + punpckldq m7, m0, m5 + punpckhdq m0, m5 + paddq m4, m7 + punpckldq m7, m2, m5 + paddq m4, m0 + punpckhdq m2, m5 + paddq m6, m7 + paddq m6, m2 + jg .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + movhlps m7, m6 + paddq m4, m5 + paddq m6, m7 +%if ARCH_X86_64 + movq rax, m4 + movq [sszq], m6 +%else + mov eax, sszm + pshufd m5, m4, 0x1 + movq [eax], m6 + movd eax, m4 + movd edx, m5 +%endif + RET diff --git a/libs/libaom/src/av1/encoder/x86/hash_sse42.c b/libs/libaom/src/av1/encoder/x86/hash_sse42.c new file mode 100644 index 000000000..65fa46311 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/hash_sse42.c @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+// Byte-boundary alignment issues
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+
+#define CALC_CRC(op, crc, type, buf, len) \
+  while ((len) >= sizeof(type)) {         \
+    (crc) = op((crc), *(type *)(buf));    \
+    (len) -= sizeof(type);                \
+    buf += sizeof(type);                  \
+  }
+
+/**
+ * Calculates 32-bit CRC for the input buffer
+ * polynomial is 0x11EDC6F41
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
+                                     size_t len) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  uint32_t crc = 0xFFFFFFFF;
+
+  // Align the input to the word boundary
+  for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) {
+    crc = _mm_crc32_u8(crc, *buf);
+  }
+
+#ifdef __x86_64__
+  uint64_t crc64 = crc;
+  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len);
+  crc = (uint32_t)crc64;
+#endif
+  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
+  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
+  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);
+  return (crc ^= 0xFFFFFFFF);
+}
diff --git a/libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_avx2.c
new file mode 100644
index 000000000..ee3714d8a
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_avx2.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include "aom/aom_integer.h"
+#include "av1/common/common.h"
+
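+/* Editorial note, scalar sketch of the high bit-depth error computed below
+   (exposition only); the scalar fallback in the SSE2 version of this file
+   computes the same sums:
+     for (i = 0; i < block_size; ++i) {
+       const int64_t diff = coeff[i] - dqcoeff[i];
+       error += diff * diff;
+       sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+     }
+     error = (error + rounding) >> shift;  // shift == 2 * (bps - 8)
+     sqcoeff = (sqcoeff + rounding) >> shift;
+*/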
+int64_t av1_highbd_block_error_avx2(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  int i;
+  int64_t temp1[8];
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  for (i = 0; i < block_size; i += 16) {
+    __m256i mm256_coeff = _mm256_loadu_si256((__m256i *)(coeff + i));
+    __m256i mm256_coeff2 = _mm256_loadu_si256((__m256i *)(coeff + i + 8));
+    __m256i mm256_dqcoeff = _mm256_loadu_si256((__m256i *)(dqcoeff + i));
+    __m256i mm256_dqcoeff2 = _mm256_loadu_si256((__m256i *)(dqcoeff + i + 8));
+
+    __m256i diff1 = _mm256_sub_epi32(mm256_coeff, mm256_dqcoeff);
+    __m256i diff2 = _mm256_sub_epi32(mm256_coeff2, mm256_dqcoeff2);
+    __m256i diff1h = _mm256_srli_epi64(diff1, 32);
+    __m256i diff2h = _mm256_srli_epi64(diff2, 32);
+    __m256i res = _mm256_mul_epi32(diff1, diff1);
+    __m256i res1 = _mm256_mul_epi32(diff1h, diff1h);
+    __m256i res2 = _mm256_mul_epi32(diff2, diff2);
+    __m256i res3 = _mm256_mul_epi32(diff2h, diff2h);
+    __m256i res_diff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+                                        _mm256_add_epi64(res2, res3));
+    __m256i mm256_coeffh = _mm256_srli_epi64(mm256_coeff, 32);
+    __m256i mm256_coeffh2 = _mm256_srli_epi64(mm256_coeff2, 32);
+    res = _mm256_mul_epi32(mm256_coeff, mm256_coeff);
+    res1 = _mm256_mul_epi32(mm256_coeffh, mm256_coeffh);
+    res2 = _mm256_mul_epi32(mm256_coeff2, mm256_coeff2);
+    res3 = _mm256_mul_epi32(mm256_coeffh2, mm256_coeffh2);
+    __m256i res_sqcoeff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+                                           _mm256_add_epi64(res2, res3));
+    _mm256_storeu_si256((__m256i *)temp1, res_diff);
+    _mm256_storeu_si256((__m256i *)temp1 + 1, res_sqcoeff);
+
+    error += temp1[0] + temp1[1] + temp1[2] + temp1[3];
+    sqcoeff += temp1[4] + temp1[5] + temp1[6] + temp1[7];
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;
+  return error;
+}
diff --git a/libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_sse2.c
new file mode 100644
index 000000000..4579e4e4a
--- /dev/null
+++ b/libs/libaom/src/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "av1/common/common.h"
+
+int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff,
+                                    const tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  int i, j, test;
+  uint32_t temp[4];
+  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+  int64_t error = 0, sqcoeff = 0;
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ?
1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i += 8) { + // Load the data into xmm registers + __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i)); + __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4)); + __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i)); + __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4)); + // Check if any values require more than 15 bit + max = _mm_set1_epi32(0x3fff); + min = _mm_set1_epi32(0xffffc000); + cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), + _mm_cmplt_epi32(mm_coeff, min)); + cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), + _mm_cmplt_epi32(mm_coeff2, min)); + cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max), + _mm_cmplt_epi32(mm_dqcoeff, min)); + cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max), + _mm_cmplt_epi32(mm_dqcoeff2, min)); + test = _mm_movemask_epi8( + _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3))); + + if (!test) { + __m128i mm_diff, error_sse2, sqcoeff_sse2; + mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2); + mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2); + mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff); + error_sse2 = _mm_madd_epi16(mm_diff, mm_diff); + sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff); + _mm_storeu_si128((__m128i *)temp, error_sse2); + error = error + temp[0] + temp[1] + temp[2] + temp[3]; + _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2); + sqcoeff += temp[0] + temp[1] + temp[2] + temp[3]; + } else { + for (j = 0; j < 8; j++) { + const int64_t diff = coeff[i + j] - dqcoeff[i + j]; + error += diff * diff; + sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j]; + } + } + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} diff --git a/libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_avx2.c new file mode 100644 index 000000000..a81378cfe --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_avx2.c @@ -0,0 +1,3167 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <assert.h>
+#include <immintrin.h> /*AVX2*/
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
+                                        int stride, int flipud, int fliplr,
+                                        int shift) {
+  __m128i out1[8];
+  if (!flipud) {
+    out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    out1[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    out1[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    out1[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    out1[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    out1[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    out1[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    out1[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+  } else {
+    out1[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    out1[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    out1[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    out1[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    out1[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    out1[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    out1[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    out1[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  }
+  if (!fliplr) {
+    out[0] = _mm256_cvtepi16_epi32(out1[0]);
+    out[1] = _mm256_cvtepi16_epi32(out1[1]);
+    out[2] = _mm256_cvtepi16_epi32(out1[2]);
+    out[3] = _mm256_cvtepi16_epi32(out1[3]);
+    out[4] = _mm256_cvtepi16_epi32(out1[4]);
+    out[5] = _mm256_cvtepi16_epi32(out1[5]);
+    out[6] = _mm256_cvtepi16_epi32(out1[6]);
+    out[7] = _mm256_cvtepi16_epi32(out1[7]);
+
+  } else {
+    out[0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[0]));
+    out[1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[1]));
+    out[2] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[2]));
+    out[3] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[3]));
+    out[4] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[4]));
+    out[5] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[5]));
+    out[6] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[6]));
+    out[7] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[7]));
+  }
+  out[0] = _mm256_slli_epi32(out[0], shift);
+  out[1] = _mm256_slli_epi32(out[1], shift);
+  out[2] = _mm256_slli_epi32(out[2], shift);
+  out[3] = _mm256_slli_epi32(out[3], shift);
+  out[4] = _mm256_slli_epi32(out[4], shift);
+  out[5] = _mm256_slli_epi32(out[5], shift);
+  out[6] = _mm256_slli_epi32(out[6], shift);
+  out[7] = _mm256_slli_epi32(out[7], shift);
+}
+static INLINE void col_txfm_8x8_rounding(__m256i *in, int shift) {
+  const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+
+  in[0] = _mm256_add_epi32(in[0], rounding);
+  in[1] = _mm256_add_epi32(in[1], rounding);
+  in[2] = _mm256_add_epi32(in[2], rounding);
+  in[3] = _mm256_add_epi32(in[3], rounding);
+  in[4] = _mm256_add_epi32(in[4], rounding);
+  in[5] = _mm256_add_epi32(in[5], rounding);
+  in[6] = _mm256_add_epi32(in[6], rounding);
+  in[7] = _mm256_add_epi32(in[7], rounding);
+
+  in[0] = _mm256_srai_epi32(in[0], shift);
+  in[1] = _mm256_srai_epi32(in[1], shift);
+  in[2] = _mm256_srai_epi32(in[2], shift);
+  in[3] = _mm256_srai_epi32(in[3], shift);
+  in[4] = _mm256_srai_epi32(in[4], shift);
+  in[5] =
_mm256_srai_epi32(in[5], shift); + in[6] = _mm256_srai_epi32(in[6], shift); + in[7] = _mm256_srai_epi32(in[7], shift); +} +static INLINE void load_buffer_8x16_avx2(const int16_t *input, __m256i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift); +} +static INLINE void load_buffer_16xn_avx2(const int16_t *input, __m256i *out, + int stride, int height, int outstride, + int flipud, int fliplr) { + __m256i out1[64]; + if (!flipud) { + for (int i = 0; i < height; i++) { + out1[i] = _mm256_loadu_si256((const __m256i *)(input + i * stride)); + } + } else { + for (int i = 0; i < height; i++) { + out1[(height - 1) - i] = + _mm256_loadu_si256((const __m256i *)(input + i * stride)); + } + } + if (!fliplr) { + for (int i = 0; i < height; i++) { + out[i * outstride] = + _mm256_cvtepi16_epi32(_mm256_castsi256_si128(out1[i])); + out[i * outstride + 1] = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(out1[i], 1)); + } + } else { + for (int i = 0; i < height; i++) { + out[i * outstride + 1] = _mm256_cvtepi16_epi32( + mm_reverse_epi16(_mm256_castsi256_si128(out1[i]))); + out[i * outstride + 0] = _mm256_cvtepi16_epi32( + mm_reverse_epi16(_mm256_extractf128_si256(out1[i], 1))); + } + } +} + +static void fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out, + const int instride, + const int outstride) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[0 * instride], in[1 * instride]); + u1 = _mm256_unpackhi_epi32(in[0 * instride], in[1 * instride]); + + u2 = _mm256_unpacklo_epi32(in[2 * instride], in[3 * instride]); + u3 = _mm256_unpackhi_epi32(in[2 * instride], in[3 * instride]); + + u4 = _mm256_unpacklo_epi32(in[4 * instride], in[5 * instride]); + u5 = _mm256_unpackhi_epi32(in[4 * instride], in[5 * instride]); + + u6 = _mm256_unpacklo_epi32(in[6 * instride], in[7 * instride]); + u7 = _mm256_unpackhi_epi32(in[6 * instride], in[7 * instride]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); +} +static INLINE void round_shift_32_8xn_avx2(__m256i *in, int size, int bit, + int stride) { + if (bit < 0) { + bit = -bit; + __m256i round = _mm256_set1_epi32(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[stride * i] = _mm256_add_epi32(in[stride * i], round); + in[stride * i] = _mm256_srai_epi32(in[stride * i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[stride * i] = _mm256_slli_epi32(in[stride * i], bit); + } + } +} +static INLINE void store_buffer_avx2(const 
__m256i *const in, int32_t *out, + const int stride, const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out), in[i]); + out += stride; + } +} +static INLINE void fwd_txfm_transpose_16x16_avx2(const __m256i *in, + __m256i *out) { + fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2); + fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2); + fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2); + fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2); +} + +static INLINE __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *w1, const __m256i *n1, + const __m256i *rounding, int bit) { + __m256i x, y; + + x = _mm256_mullo_epi32(*w0, *n0); + y = _mm256_mullo_epi32(*w1, *n1); + x = _mm256_add_epi32(x, y); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} +#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + const __m256i ww0 = _mm256_set1_epi32(w0); \ + const __m256i ww1 = _mm256_set1_epi32(w1); \ + const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \ + const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \ + out0 = _mm256_add_epi32(in0_w0, in1_w1); \ + round_shift_32_8xn_avx2(&out0, 1, -bit, 1); \ + const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \ + const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \ + out1 = _mm256_sub_epi32(in0_w1, in1_w0); \ + round_shift_32_8xn_avx2(&out1, 1, -bit, 1); \ + } while (0) + +#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \ + const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \ + out0 = _mm256_add_epi32(in0_w0, in1_w1); \ + out0 = _mm256_add_epi32(out0, r); \ + out0 = _mm256_srai_epi32(out0, bit); \ + const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \ + const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \ + out1 = _mm256_sub_epi32(in0_w1, in1_w0); \ + out1 = _mm256_add_epi32(out1, r); \ + out1 = _mm256_srai_epi32(out1, bit); \ + } while (0) + +typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, + const int8_t cos_bit, int instride, + int outstride); +static void fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + __m256i u[8], v[8]; + for (int col = 0; col < col_num; ++col) { + u[0] = _mm256_add_epi32(in[0 * col_num + col], in[7 * col_num + col]); + v[7] = _mm256_sub_epi32(in[0 * col_num + col], in[7 * col_num + col]); + u[1] = _mm256_add_epi32(in[1 * col_num + col], in[6 * col_num + col]); + u[6] = _mm256_sub_epi32(in[1 * col_num + col], in[6 * col_num + col]); + u[2] = _mm256_add_epi32(in[2 * col_num + col], in[5 * col_num + col]); + u[5] = _mm256_sub_epi32(in[2 * col_num + col], in[5 * col_num + col]); + u[3] = _mm256_add_epi32(in[3 * col_num + col], in[4 * col_num + col]); + v[4] = _mm256_sub_epi32(in[3 * col_num + col], in[4 * col_num + col]); + v[0] = _mm256_add_epi32(u[0], u[3]); + v[3] = _mm256_sub_epi32(u[0], u[3]); + v[1] = 
_mm256_add_epi32(u[1], u[2]);
+    v[2] = _mm256_sub_epi32(u[1], u[2]);
+
+    v[5] = _mm256_mullo_epi32(u[5], cospim32);
+    v[6] = _mm256_mullo_epi32(u[6], cospi32);
+    v[5] = _mm256_add_epi32(v[5], v[6]);
+    v[5] = _mm256_add_epi32(v[5], rnding);
+    v[5] = _mm256_srai_epi32(v[5], bit);
+
+    u[0] = _mm256_mullo_epi32(u[5], cospi32);
+    v[6] = _mm256_mullo_epi32(u[6], cospim32);
+    v[6] = _mm256_sub_epi32(u[0], v[6]);
+    v[6] = _mm256_add_epi32(v[6], rnding);
+    v[6] = _mm256_srai_epi32(v[6], bit);
+
+    // stage 3
+    // type 0
+    v[0] = _mm256_mullo_epi32(v[0], cospi32);
+    v[1] = _mm256_mullo_epi32(v[1], cospi32);
+    u[0] = _mm256_add_epi32(v[0], v[1]);
+    u[0] = _mm256_add_epi32(u[0], rnding);
+    u[0] = _mm256_srai_epi32(u[0], bit);
+
+    u[1] = _mm256_sub_epi32(v[0], v[1]);
+    u[1] = _mm256_add_epi32(u[1], rnding);
+    u[1] = _mm256_srai_epi32(u[1], bit);
+
+    // type 1
+    v[0] = _mm256_mullo_epi32(v[2], cospi48);
+    v[1] = _mm256_mullo_epi32(v[3], cospi16);
+    u[2] = _mm256_add_epi32(v[0], v[1]);
+    u[2] = _mm256_add_epi32(u[2], rnding);
+    u[2] = _mm256_srai_epi32(u[2], bit);
+
+    v[0] = _mm256_mullo_epi32(v[2], cospi16);
+    v[1] = _mm256_mullo_epi32(v[3], cospi48);
+    u[3] = _mm256_sub_epi32(v[1], v[0]);
+    u[3] = _mm256_add_epi32(u[3], rnding);
+    u[3] = _mm256_srai_epi32(u[3], bit);
+
+    u[4] = _mm256_add_epi32(v[4], v[5]);
+    u[5] = _mm256_sub_epi32(v[4], v[5]);
+    u[6] = _mm256_sub_epi32(v[7], v[6]);
+    u[7] = _mm256_add_epi32(v[7], v[6]);
+
+    // stage 4
+    // stage 5
+    v[0] = _mm256_mullo_epi32(u[4], cospi56);
+    v[1] = _mm256_mullo_epi32(u[7], cospi8);
+    v[0] = _mm256_add_epi32(v[0], v[1]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    out[1 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[4]
+
+    v[0] = _mm256_mullo_epi32(u[4], cospi8);
+    v[1] = _mm256_mullo_epi32(u[7], cospi56);
+    v[0] = _mm256_sub_epi32(v[1], v[0]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    out[7 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[7]
+
+    v[0] = _mm256_mullo_epi32(u[5], cospi24);
+    v[1] = _mm256_mullo_epi32(u[6], cospi40);
+    v[0] = _mm256_add_epi32(v[0], v[1]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    out[5 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[5]
+
+    v[0] = _mm256_mullo_epi32(u[5], cospi40);
+    v[1] = _mm256_mullo_epi32(u[6], cospi24);
+    v[0] = _mm256_sub_epi32(v[1], v[0]);
+    v[0] = _mm256_add_epi32(v[0], rnding);
+    out[3 * outstride + col] = _mm256_srai_epi32(v[0], bit);  // buf0[6]
+
+    out[0 * outstride + col] = u[0];  // buf0[0]
+    out[4 * outstride + col] = u[1];  // buf0[1]
+    out[2 * outstride + col] = u[2];  // buf0[2]
+    out[6 * outstride + col] = u[3];  // buf0[3]
+  }
+}
+static void fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                        const int col_num, const int outstride) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+  const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+  const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+  const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+  const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+  const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+  const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+  const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+  const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+  const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+  const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+  const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+  const __m256i cospim36 =
_mm256_set1_epi32(-cospi[36]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + __m256i x, y; + for (int col = 0; col < col_num; ++col) { + u0 = in[0 * col_num + col]; + u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]); + u2 = _mm256_sub_epi32(zero, in[3 * col_num + col]); + u3 = in[4 * col_num + col]; + u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]); + u5 = in[6 * col_num + col]; + u6 = in[2 * col_num + col]; + u7 = _mm256_sub_epi32(zero, in[5 * col_num + col]); + + // stage 2 + v0 = u0; + v1 = u1; + + x = _mm256_mullo_epi32(u2, cospi32); + y = _mm256_mullo_epi32(u3, cospi32); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + v3 = _mm256_sub_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + v4 = u4; + v5 = u5; + + x = _mm256_mullo_epi32(u6, cospi32); + y = _mm256_mullo_epi32(u7, cospi32); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + v7 = _mm256_sub_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 3 + u0 = _mm256_add_epi32(v0, v2); + u1 = _mm256_add_epi32(v1, v3); + u2 = _mm256_sub_epi32(v0, v2); + u3 = _mm256_sub_epi32(v1, v3); + u4 = _mm256_add_epi32(v4, v6); + u5 = _mm256_add_epi32(v5, v7); + u6 = _mm256_sub_epi32(v4, v6); + u7 = _mm256_sub_epi32(v5, v7); + + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm256_mullo_epi32(u4, cospi16); + y = _mm256_mullo_epi32(u5, cospi48); + v4 = _mm256_add_epi32(x, y); + v4 = _mm256_add_epi32(v4, rnding); + v4 = _mm256_srai_epi32(v4, bit); + + x = _mm256_mullo_epi32(u4, cospi48); + y = _mm256_mullo_epi32(u5, cospim16); + v5 = _mm256_add_epi32(x, y); + v5 = _mm256_add_epi32(v5, rnding); + v5 = _mm256_srai_epi32(v5, bit); + + x = _mm256_mullo_epi32(u6, cospim48); + y = _mm256_mullo_epi32(u7, cospi16); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + x = _mm256_mullo_epi32(u6, cospi16); + y = _mm256_mullo_epi32(u7, cospi48); + v7 = _mm256_add_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 5 + u0 = _mm256_add_epi32(v0, v4); + u1 = _mm256_add_epi32(v1, v5); + u2 = _mm256_add_epi32(v2, v6); + u3 = _mm256_add_epi32(v3, v7); + u4 = _mm256_sub_epi32(v0, v4); + u5 = _mm256_sub_epi32(v1, v5); + u6 = _mm256_sub_epi32(v2, v6); + u7 = _mm256_sub_epi32(v3, v7); + + // stage 6 + x = _mm256_mullo_epi32(u0, cospi4); + y = _mm256_mullo_epi32(u1, cospi60); + v0 = _mm256_add_epi32(x, y); + v0 = _mm256_add_epi32(v0, rnding); + v0 = _mm256_srai_epi32(v0, bit); + + x = _mm256_mullo_epi32(u0, cospi60); + y = _mm256_mullo_epi32(u1, cospim4); + v1 = _mm256_add_epi32(x, y); + v1 = _mm256_add_epi32(v1, rnding); + v1 = _mm256_srai_epi32(v1, bit); + + x = _mm256_mullo_epi32(u2, cospi20); + y = _mm256_mullo_epi32(u3, cospi44); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + x = _mm256_mullo_epi32(u2, cospi44); + y = _mm256_mullo_epi32(u3, cospim20); + v3 = _mm256_add_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + x = _mm256_mullo_epi32(u4, 
cospi36);
+    y = _mm256_mullo_epi32(u5, cospi28);
+    v4 = _mm256_add_epi32(x, y);
+    v4 = _mm256_add_epi32(v4, rnding);
+    v4 = _mm256_srai_epi32(v4, bit);
+
+    x = _mm256_mullo_epi32(u4, cospi28);
+    y = _mm256_mullo_epi32(u5, cospim36);
+    v5 = _mm256_add_epi32(x, y);
+    v5 = _mm256_add_epi32(v5, rnding);
+    v5 = _mm256_srai_epi32(v5, bit);
+
+    x = _mm256_mullo_epi32(u6, cospi52);
+    y = _mm256_mullo_epi32(u7, cospi12);
+    v6 = _mm256_add_epi32(x, y);
+    v6 = _mm256_add_epi32(v6, rnding);
+    v6 = _mm256_srai_epi32(v6, bit);
+
+    x = _mm256_mullo_epi32(u6, cospi12);
+    y = _mm256_mullo_epi32(u7, cospim52);
+    v7 = _mm256_add_epi32(x, y);
+    v7 = _mm256_add_epi32(v7, rnding);
+    v7 = _mm256_srai_epi32(v7, bit);
+
+    // stage 7
+    out[0 * outstride + col] = v1;
+    out[1 * outstride + col] = v6;
+    out[2 * outstride + col] = v3;
+    out[3 * outstride + col] = v4;
+    out[4 * outstride + col] = v5;
+    out[5 * outstride + col] = v2;
+    out[6 * outstride + col] = v7;
+    out[7 * outstride + col] = v0;
+  }
+}
+static void idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, int col_num,
+                       int outstride) {
+  (void)bit;
+  (void)outstride;
+  int num_iters = 8 * col_num;
+  for (int i = 0; i < num_iters; i += 8) {
+    out[i] = _mm256_add_epi32(in[i], in[i]);
+    out[i + 1] = _mm256_add_epi32(in[i + 1], in[i + 1]);
+    out[i + 2] = _mm256_add_epi32(in[i + 2], in[i + 2]);
+    out[i + 3] = _mm256_add_epi32(in[i + 3], in[i + 3]);
+    out[i + 4] = _mm256_add_epi32(in[i + 4], in[i + 4]);
+    out[i + 5] = _mm256_add_epi32(in[i + 5], in[i + 5]);
+    out[i + 6] = _mm256_add_epi32(in[i + 6], in[i + 6]);
+    out[i + 7] = _mm256_add_epi32(in[i + 7], in[i + 7]);
+  }
+}
+void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride,
+                             TX_TYPE tx_type, int bd) {
+  __m256i in[8], out[8];
+  const TX_SIZE tx_size = TX_8X8;
+  const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int width = tx_size_wide[tx_size];
+  const int width_div8 = (width >> 3);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case ADST_DCT:
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case DCT_ADST:
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+                 width_div8);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+                  width_div8);
+      fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+      store_buffer_avx2(in, coeff, 8, 8);
+      break;
+    case ADST_ADST:
+      load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+      fadst8_avx2(in, out,
av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case FLIPADST_DCT: + load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fdct8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case DCT_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case ADST_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case FLIPADST_ADST: + load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case IDTX: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case V_DCT: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case H_DCT: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 
width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fdct8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case V_ADST: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case H_ADST: + load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case V_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + idtx8_avx2(out, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + case H_FLIPADST: + load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + idtx8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + fadst8_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + store_buffer_avx2(in, coeff, 8, 8); + break; + default: assert(0); + } + (void)bd; +} + +static void fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + __m256i u[16], v[16], x; + int col; + + // Calculate the column 0, 1, 2, 3 + for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[1] = _mm256_add_epi32(in[1 * col_num + 
col], in[14 * col_num + col]); + u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); + u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]); + + // stage 2 + v[0] = _mm256_add_epi32(u[0], u[7]); + v[7] = _mm256_sub_epi32(u[0], u[7]); + v[1] = _mm256_add_epi32(u[1], u[6]); + v[6] = _mm256_sub_epi32(u[1], u[6]); + v[2] = _mm256_add_epi32(u[2], u[5]); + v[5] = _mm256_sub_epi32(u[2], u[5]); + v[3] = _mm256_add_epi32(u[3], u[4]); + v[4] = _mm256_sub_epi32(u[3], u[4]); + v[8] = u[8]; + v[9] = u[9]; + + v[10] = _mm256_mullo_epi32(u[10], cospim32); + x = _mm256_mullo_epi32(u[13], cospi32); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[13], cospim32); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = _mm256_mullo_epi32(u[11], cospim32); + x = _mm256_mullo_epi32(u[12], cospi32); + v[11] = _mm256_add_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[11], cospi32); + x = _mm256_mullo_epi32(u[12], cospim32); + v[12] = _mm256_sub_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + v[14] = u[14]; + v[15] = u[15]; + + // stage 3 + u[0] = _mm256_add_epi32(v[0], v[3]); + u[3] = _mm256_sub_epi32(v[0], v[3]); + u[1] = _mm256_add_epi32(v[1], v[2]); + u[2] = _mm256_sub_epi32(v[1], v[2]); + u[4] = v[4]; + + u[5] = _mm256_mullo_epi32(v[5], cospim32); + x = _mm256_mullo_epi32(v[6], cospi32); + u[5] = _mm256_add_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[5], cospi32); + x = _mm256_mullo_epi32(v[6], cospim32); + u[6] = _mm256_sub_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = v[7]; + u[8] = _mm256_add_epi32(v[8], v[11]); + u[11] = _mm256_sub_epi32(v[8], v[11]); + u[9] = _mm256_add_epi32(v[9], v[10]); + u[10] = _mm256_sub_epi32(v[9], v[10]); + u[12] = _mm256_sub_epi32(v[15], v[12]); + u[15] = _mm256_add_epi32(v[15], v[12]); + u[13] = _mm256_sub_epi32(v[14], v[13]); + u[14] = _mm256_add_epi32(v[14], v[13]); + + // stage 4 + u[0] = _mm256_mullo_epi32(u[0], cospi32); + u[1] = _mm256_mullo_epi32(u[1], cospi32); + v[0] = _mm256_add_epi32(u[0], u[1]); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_sub_epi32(u[0], u[1]); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = 
_mm256_mullo_epi32(u[2], cospi48); + x = _mm256_mullo_epi32(u[3], cospi16); + v[2] = _mm256_add_epi32(v[2], x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_mullo_epi32(u[2], cospi16); + x = _mm256_mullo_epi32(u[3], cospi48); + v[3] = _mm256_sub_epi32(x, v[3]); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = _mm256_add_epi32(u[4], u[5]); + v[5] = _mm256_sub_epi32(u[4], u[5]); + v[6] = _mm256_sub_epi32(u[7], u[6]); + v[7] = _mm256_add_epi32(u[7], u[6]); + v[8] = u[8]; + + v[9] = _mm256_mullo_epi32(u[9], cospim16); + x = _mm256_mullo_epi32(u[14], cospi48); + v[9] = _mm256_add_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[14] = _mm256_mullo_epi32(u[9], cospi48); + x = _mm256_mullo_epi32(u[14], cospim16); + v[14] = _mm256_sub_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospim48); + x = _mm256_mullo_epi32(u[13], cospim16); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospim16); + x = _mm256_mullo_epi32(u[13], cospim48); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = u[11]; + v[12] = u[12]; + v[15] = u[15]; + + // stage 5 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm256_mullo_epi32(v[4], cospi56); + x = _mm256_mullo_epi32(v[7], cospi8); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[7] = _mm256_mullo_epi32(v[4], cospi8); + x = _mm256_mullo_epi32(v[7], cospi56); + u[7] = _mm256_sub_epi32(x, u[7]); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + u[5] = _mm256_mullo_epi32(v[5], cospi24); + x = _mm256_mullo_epi32(v[6], cospi40); + u[5] = _mm256_add_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[5], cospi40); + x = _mm256_mullo_epi32(v[6], cospi24); + u[6] = _mm256_sub_epi32(x, u[6]); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[8] = _mm256_add_epi32(v[8], v[9]); + u[9] = _mm256_sub_epi32(v[8], v[9]); + u[10] = _mm256_sub_epi32(v[11], v[10]); + u[11] = _mm256_add_epi32(v[11], v[10]); + u[12] = _mm256_add_epi32(v[12], v[13]); + u[13] = _mm256_sub_epi32(v[12], v[13]); + u[14] = _mm256_sub_epi32(v[15], v[14]); + u[15] = _mm256_add_epi32(v[15], v[14]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm256_mullo_epi32(u[8], cospi60); + x = _mm256_mullo_epi32(u[15], cospi4); + v[8] = _mm256_add_epi32(v[8], x); + v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[15] = _mm256_mullo_epi32(u[8], cospi4); + x = _mm256_mullo_epi32(u[15], cospi60); + v[15] = _mm256_sub_epi32(x, v[15]); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + v[9] = _mm256_mullo_epi32(u[9], cospi28); + x = _mm256_mullo_epi32(u[14], cospi36); + v[9] = _mm256_add_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[14] = _mm256_mullo_epi32(u[9], cospi36); + x = _mm256_mullo_epi32(u[14], cospi28); + v[14] = _mm256_sub_epi32(x, 
v[14]); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospi44); + x = _mm256_mullo_epi32(u[13], cospi20); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospi20); + x = _mm256_mullo_epi32(u[13], cospi44); + v[13] = _mm256_sub_epi32(x, v[13]); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = _mm256_mullo_epi32(u[11], cospi12); + x = _mm256_mullo_epi32(u[12], cospi52); + v[11] = _mm256_add_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[11], cospi52); + x = _mm256_mullo_epi32(u[12], cospi12); + v[12] = _mm256_sub_epi32(x, v[12]); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + out[0 * outstride + col] = v[0]; + out[1 * outstride + col] = v[8]; + out[2 * outstride + col] = v[4]; + out[3 * outstride + col] = v[12]; + out[4 * outstride + col] = v[2]; + out[5 * outstride + col] = v[10]; + out[6 * outstride + col] = v[6]; + out[7 * outstride + col] = v[14]; + out[8 * outstride + col] = v[1]; + out[9 * outstride + col] = v[9]; + out[10 * outstride + col] = v[5]; + out[11 * outstride + col] = v[13]; + out[12 * outstride + col] = v[3]; + out[13 * outstride + col] = v[11]; + out[14 * outstride + col] = v[7]; + out[15 * outstride + col] = v[15]; + } +} +static void fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int num_cols, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const 
__m256i cospim50 = _mm256_set1_epi32(-cospi[50]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + + __m256i u[16], v[16], x, y; + int col; + + for (col = 0; col < num_cols; ++col) { + // stage 0 + // stage 1 + u[0] = in[0 * num_cols + col]; + u[1] = _mm256_sub_epi32(zero, in[15 * num_cols + col]); + u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]); + u[3] = in[8 * num_cols + col]; + u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]); + u[5] = in[12 * num_cols + col]; + u[6] = in[4 * num_cols + col]; + u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]); + u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]); + u[9] = in[14 * num_cols + col]; + u[10] = in[6 * num_cols + col]; + u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]); + u[12] = in[2 * num_cols + col]; + u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]); + u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]); + u[15] = in[10 * num_cols + col]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + + x = _mm256_mullo_epi32(u[2], cospi32); + y = _mm256_mullo_epi32(u[3], cospi32); + v[2] = _mm256_add_epi32(x, y); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(x, y); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + x = _mm256_mullo_epi32(u[6], cospi32); + y = _mm256_mullo_epi32(u[7], cospi32); + v[6] = _mm256_add_epi32(x, y); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(x, y); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[11], cospi32); + v[10] = _mm256_add_epi32(x, y); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(x, y); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + x = _mm256_mullo_epi32(u[14], cospi32); + y = _mm256_mullo_epi32(u[15], cospi32); + v[14] = _mm256_add_epi32(x, y); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(x, y); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 3 + u[0] = _mm256_add_epi32(v[0], v[2]); + u[1] = _mm256_add_epi32(v[1], v[3]); + u[2] = _mm256_sub_epi32(v[0], v[2]); + u[3] = _mm256_sub_epi32(v[1], v[3]); + u[4] = _mm256_add_epi32(v[4], v[6]); + u[5] = _mm256_add_epi32(v[5], v[7]); + u[6] = _mm256_sub_epi32(v[4], v[6]); + u[7] = _mm256_sub_epi32(v[5], v[7]); + u[8] = _mm256_add_epi32(v[8], v[10]); + u[9] = _mm256_add_epi32(v[9], v[11]); + u[10] = _mm256_sub_epi32(v[8], v[10]); + u[11] = _mm256_sub_epi32(v[9], v[11]); + u[12] = _mm256_add_epi32(v[12], v[14]); + u[13] = _mm256_add_epi32(v[13], v[15]); + u[14] = _mm256_sub_epi32(v[12], v[14]); + u[15] = _mm256_sub_epi32(v[13], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] 
= av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = + av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = + av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); + + // stage 5 + u[0] = _mm256_add_epi32(v[0], v[4]); + u[1] = _mm256_add_epi32(v[1], v[5]); + u[2] = _mm256_add_epi32(v[2], v[6]); + u[3] = _mm256_add_epi32(v[3], v[7]); + u[4] = _mm256_sub_epi32(v[0], v[4]); + u[5] = _mm256_sub_epi32(v[1], v[5]); + u[6] = _mm256_sub_epi32(v[2], v[6]); + u[7] = _mm256_sub_epi32(v[3], v[7]); + u[8] = _mm256_add_epi32(v[8], v[12]); + u[9] = _mm256_add_epi32(v[9], v[13]); + u[10] = _mm256_add_epi32(v[10], v[14]); + u[11] = _mm256_add_epi32(v[11], v[15]); + u[12] = _mm256_sub_epi32(v[8], v[12]); + u[13] = _mm256_sub_epi32(v[9], v[13]); + u[14] = _mm256_sub_epi32(v[10], v[14]); + u[15] = _mm256_sub_epi32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = + av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = + av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); + + // stage 7 + u[0] = _mm256_add_epi32(v[0], v[8]); + u[1] = _mm256_add_epi32(v[1], v[9]); + u[2] = _mm256_add_epi32(v[2], v[10]); + u[3] = _mm256_add_epi32(v[3], v[11]); + u[4] = _mm256_add_epi32(v[4], v[12]); + u[5] = _mm256_add_epi32(v[5], v[13]); + u[6] = _mm256_add_epi32(v[6], v[14]); + u[7] = _mm256_add_epi32(v[7], v[15]); + u[8] = _mm256_sub_epi32(v[0], v[8]); + u[9] = _mm256_sub_epi32(v[1], v[9]); + u[10] = _mm256_sub_epi32(v[2], v[10]); + u[11] = _mm256_sub_epi32(v[3], v[11]); + u[12] = _mm256_sub_epi32(v[4], v[12]); + u[13] = _mm256_sub_epi32(v[5], v[13]); + u[14] = _mm256_sub_epi32(v[6], v[14]); + u[15] = _mm256_sub_epi32(v[7], v[15]); + + // stage 8 + v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = + av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + 
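// Note: av1_half_btf_avx2(&w0, &n0, &w1, &n1, &rnding, bit) evaluates
+    // (w0 * n0 + w1 * n1 + rnding) >> bit per 32-bit lane (see its
+    // definition above), so each pair of calls in a stage is one rounded
+    // butterfly rotation; cospi[k] from cospi_arr() is approximately
+    // 2^bit * cos(k * PI / 128).
+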
v[13] = + av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); + + // stage 9 + out[0 * outstride + col] = v[1]; + out[1 * outstride + col] = v[14]; + out[2 * outstride + col] = v[3]; + out[3 * outstride + col] = v[12]; + out[4 * outstride + col] = v[5]; + out[5 * outstride + col] = v[10]; + out[6 * outstride + col] = v[7]; + out[7 * outstride + col] = v[8]; + out[8 * outstride + col] = v[9]; + out[9 * outstride + col] = v[6]; + out[10 * outstride + col] = v[11]; + out[11 * outstride + col] = v[4]; + out[12 * outstride + col] = v[13]; + out[13 * outstride + col] = v[2]; + out[14 * outstride + col] = v[15]; + out[15 * outstride + col] = v[0]; + } +} +static void idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit, + int col_num, const int outstride) { + (void)bit; + (void)outstride; + __m256i fact = _mm256_set1_epi32(2 * NewSqrt2); + __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m256i a_low; + + int num_iters = 16 * col_num; + for (int i = 0; i < num_iters; i++) { + a_low = _mm256_mullo_epi32(in[i], fact); + a_low = _mm256_add_epi32(a_low, offset); + out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits); + } +} +static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16_avx2, // DCT_DCT + fadst16_avx2, // ADST_DCT + fdct16_avx2, // DCT_ADST + fadst16_avx2, // ADST_ADST + fadst16_avx2, // FLIPADST_DCT + fdct16_avx2, // DCT_FLIPADST + fadst16_avx2, // FLIPADST_FLIPADST + fadst16_avx2, // ADST_FLIPADST + fadst16_avx2, // FLIPADST_ADST + idtx16_avx2, // IDTX + fdct16_avx2, // V_DCT + idtx16_avx2, // H_DCT + fadst16_avx2, // V_ADST + idtx16_avx2, // H_ADST + fadst16_avx2, // V_FLIPADST + idtx16_avx2 // H_FLIPADST +}; +static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8_avx2, // DCT_DCT + fdct8_avx2, // ADST_DCT + fadst8_avx2, // DCT_ADST + fadst8_avx2, // ADST_ADST + fdct8_avx2, // FLIPADST_DCT + fadst8_avx2, // DCT_FLIPADST + fadst8_avx2, // FLIPADST_FLIPADST + fadst8_avx2, // ADST_FLIPADST + fadst8_avx2, // FLIPADST_ADST + idtx8_avx2, // IDTX + idtx8_avx2, // V_DCT + fdct8_avx2, // H_DCT + idtx8_avx2, // V_ADST + fadst8_avx2, // H_ADST + idtx8_avx2, // V_FLIPADST + fadst8_avx2 // H_FLIPADST +}; +void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[16], out[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_highbd_txfm8x8_arr[tx_type]; + const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, out, bit, 1, 1); + col_txfm_8x8_rounding(out, -shift[1]); + col_txfm_8x8_rounding(&out[8], -shift[1]); + fwd_txfm_transpose_8x8_avx2(out, in, 1, 2); + fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2); + row_txfm(in, out, bit, 2, 2); + fwd_txfm_transpose_8x8_avx2(out, in, 2, 1); + fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1); + av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2); + store_buffer_avx2(in, coeff, 8, 16); + (void)bd; +} +static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8_avx2, // 
DCT_DCT + fadst8_avx2, // ADST_DCT + fdct8_avx2, // DCT_ADST + fadst8_avx2, // ADST_ADST + fadst8_avx2, // FLIPADST_DCT + fdct8_avx2, // DCT_FLIPADST + fadst8_avx2, // FLIPADST_FLIPADST + fadst8_avx2, // ADST_FLIPADST + fadst8_avx2, // FLIPADST_ADST + idtx8_avx2, // IDTX + fdct8_avx2, // V_DCT + idtx8_avx2, // H_DCT + fadst8_avx2, // V_ADST + idtx8_avx2, // H_ADST + fadst8_avx2, // V_FLIPADST + idtx8_avx2 // H_FLIPADST +}; +static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16_avx2, // DCT_DCT + fdct16_avx2, // ADST_DCT + fadst16_avx2, // DCT_ADST + fadst16_avx2, // ADST_ADST + fdct16_avx2, // FLIPADST_DCT + fadst16_avx2, // DCT_FLIPADST + fadst16_avx2, // FLIPADST_FLIPADST + fadst16_avx2, // ADST_FLIPADST + fadst16_avx2, // FLIPADST_ADST + idtx16_avx2, // IDTX + idtx16_avx2, // V_DCT + fdct16_avx2, // H_DCT + idtx16_avx2, // V_ADST + fadst16_avx2, // H_ADST + idtx16_avx2, // V_FLIPADST + fadst16_avx2 // H_FLIPADST +}; +void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[16], out[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + const int8_t bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip); + round_shift_32_8xn_avx2(in, 16, shift[0], 1); + col_txfm(in, out, bit, 2, 2); + round_shift_32_8xn_avx2(out, 16, shift[1], 1); + fwd_txfm_transpose_8x8_avx2(out, in, 2, 1); + fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1); + row_txfm(in, out, bit, 1, 1); + fwd_txfm_transpose_8x8_avx2(out, in, 1, 2); + fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2); + av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2); + store_buffer_avx2(in, coeff, 8, 16); + (void)bd; +} +void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[32], out[32]; + const TX_SIZE tx_size = TX_16X16; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const int width_div8 = (width >> 3); + const int width_div16 = (width >> 4); + const int size = (height << 1); + switch (tx_type) { + case DCT_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case ADST_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + 
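// Every case in this switch runs the same pipeline: load (with optional
+      // flips), scale by shift[0], column transform at cos_bit_col, scale by
+      // shift[1], transpose, row transform at cos_bit_row, and the transpose
+      // below to restore natural order before the store; only the cases whose
+      // row pass is idtx16_avx2 (IDTX, V_DCT, V_ADST, V_FLIPADST) skip the
+      // two transposes.
+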
fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case DCT_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case ADST_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case FLIPADST_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case DCT_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case FLIPADST_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case ADST_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case FLIPADST_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + 
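// FLIPADST needs no dedicated kernel here: the FLIPADST_* / *_FLIPADST
+      // cases run the regular fadst16_avx2 and realize the flip entirely in
+      // load_buffer_16xn_avx2, whose flipud/fliplr flags reverse the row
+      // order and/or each row's 16 samples (mm_reverse_epi16 plus a 128-bit
+      // lane swap) before the input is widened to 32 bits.
+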
fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case IDTX: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 32); + break; + case V_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fdct16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 32); + break; + case H_DCT: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fdct16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case V_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 32); + break; + case H_ADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + case V_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + fadst16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + idtx16_avx2(out, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + store_buffer_avx2(in, coeff, 8, 32); + break; + case H_FLIPADST: + load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + idtx16_avx2(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + fwd_txfm_transpose_16x16_avx2(out, in); + fadst16_avx2(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + fwd_txfm_transpose_16x16_avx2(out, in); + store_buffer_avx2(in, coeff, 8, 32); + break; + default: assert(0); + } + (void)bd; +} +static 
INLINE void fdct32_avx2(__m256i *input, __m256i *output, + const int8_t cos_bit, const int instride, + const int outstride) { + __m256i buf0[32]; + __m256i buf1[32]; + const int32_t *cospi; + int startidx = 0 * instride; + int endidx = 31 * instride; + // stage 0 + // stage 1 + buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[4] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]); + buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]); + buf0[5] = _mm256_add_epi32(buf1[5], 
buf1[10]); + buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]); + buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + cospi = cospi_arr(cos_bit); + buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]); + buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]); + buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]); + buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]); + buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[7] = buf1[7]; + buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + buf0[22] = 
buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + cospi = cospi_arr(cos_bit); + btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], + cos_bit); + btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3], + cos_bit); + buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14], + cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]); + buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]); + + // stage 6 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7], + cos_bit); + btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6], + cos_bit); + buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]); + buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + cospi = cospi_arr(cos_bit); + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15], + cos_bit); + btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14], + cos_bit); + btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10], + buf1[13], cos_bit); + btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11], + buf1[12], cos_bit); + buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]); + 
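/*
 * For reference: each btf_32_avx2_type0(w0, w1, in0, in1, out0, out1,
 * cos_bit) call in these stages is a fixed-point butterfly rotation: every
 * output is a weighted sum or difference of the two inputs, rounded and
 * shifted back down by cos_bit. In the equal-weight form that
 * fdct4x4_sse4_1() later in this patch expands inline, the pair reduces to
 *
 *   const int32_t r = 1 << (cos_bit - 1);
 *   out0 = (int32_t)(((int64_t)cospi32 * (in0 + in1) + r) >> cos_bit);
 *   out1 = (int32_t)(((int64_t)cospi32 * (in0 - in1) + r) >> cos_bit);
 *
 * while unequal (w0, w1) pairs such as (-cospi[16], cospi[48]) realize the
 * corresponding cos/sin rotations of the DCT flow graph.
 */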
buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]); + buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]); + buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]); + buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]); + + // stage 8 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31], + cos_bit); + btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17], + buf0[30], cos_bit); + btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18], + buf0[29], cos_bit); + btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19], + buf0[28], cos_bit); + btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22], + buf0[25], cos_bit); + btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24], + cos_bit); + + startidx = 0 * outstride; + endidx = 31 * outstride; + // stage 9 + output[startidx] = buf0[0]; + output[endidx] = buf0[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[16]; + output[endidx] = buf0[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[8]; + output[endidx] = buf0[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[24]; + output[endidx] = buf0[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[4]; + output[endidx] = buf0[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[20]; + output[endidx] = buf0[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[12]; + output[endidx] = buf0[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[28]; + output[endidx] = buf0[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[2]; + output[endidx] = buf0[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[18]; + output[endidx] = buf0[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[10]; + output[endidx] = buf0[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[26]; + output[endidx] = buf0[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[6]; + output[endidx] = buf0[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[22]; + output[endidx] = buf0[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[14]; + output[endidx] = buf0[17]; + startidx += 
outstride; + endidx -= outstride; + output[startidx] = buf0[30]; + output[endidx] = buf0[1]; +} +static INLINE void idtx32x32_avx2(__m256i *input, __m256i *output, + const int8_t cos_bit, int instride, + int outstride) { + (void)cos_bit; + for (int i = 0; i < 32; i += 8) { + output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2); + output[(i + 1) * outstride] = + _mm256_slli_epi32(input[(i + 1) * instride], 2); + output[(i + 2) * outstride] = + _mm256_slli_epi32(input[(i + 2) * instride], 2); + output[(i + 3) * outstride] = + _mm256_slli_epi32(input[(i + 3) * instride], 2); + output[(i + 4) * outstride] = + _mm256_slli_epi32(input[(i + 4) * instride], 2); + output[(i + 5) * outstride] = + _mm256_slli_epi32(input[(i + 5) * instride], 2); + output[(i + 6) * outstride] = + _mm256_slli_epi32(input[(i + 6) * instride], 2); + output[(i + 7) * outstride] = + _mm256_slli_epi32(input[(i + 7) * instride], 2); + } +} +static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = { + fdct32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x32_avx2, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; +static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = { + fdct32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x32_avx2, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; +void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m256i buf0[128], buf1[128]; + const int tx_size = TX_32X32; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type]; + int r, c; + const int width_div16 = (width >> 4); + const int width_div8 = (width >> 3); + + for (int i = 0; i < width_div16; i++) { + load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height, + width_div8, 0, 0); + round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8); + col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8, + width_div8); + col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8, + width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8); + } + + for (r = 0; r < height; r += 8) { + for (c = 0; c < width_div8; c++) { + fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c], + &buf1[c * 8 * width_div8 + (r >> 3)], + width_div8, width_div8); + } + } + + for (int i = 0; i < width_div16; i++) { + row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, width_div8, + width_div8); + row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], 
cos_bit_row, width_div8, + width_div8); + round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8); + round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2], width_div8); + } + + for (r = 0; r < height; r += 8) { + for (c = 0; c < width_div8; c++) { + fwd_txfm_transpose_8x8_avx2(&buf1[r * width_div8 + c], + &buf0[c * 8 * width_div8 + (r >> 3)], + width_div8, width_div8); + } + } + + store_buffer_avx2(buf0, output, 8, 128); +} +static INLINE void fdct64_stage2_avx2(__m256i *x1, __m256i *x2, + __m256i *cospi_m32, __m256i *cospi_p32, + const __m256i *__rounding, + int8_t cos_bit) { + x2[0] = _mm256_add_epi32(x1[0], x1[31]); + x2[31] = _mm256_sub_epi32(x1[0], x1[31]); + x2[1] = _mm256_add_epi32(x1[1], x1[30]); + x2[30] = _mm256_sub_epi32(x1[1], x1[30]); + x2[2] = _mm256_add_epi32(x1[2], x1[29]); + x2[29] = _mm256_sub_epi32(x1[2], x1[29]); + x2[3] = _mm256_add_epi32(x1[3], x1[28]); + x2[28] = _mm256_sub_epi32(x1[3], x1[28]); + x2[4] = _mm256_add_epi32(x1[4], x1[27]); + x2[27] = _mm256_sub_epi32(x1[4], x1[27]); + x2[5] = _mm256_add_epi32(x1[5], x1[26]); + x2[26] = _mm256_sub_epi32(x1[5], x1[26]); + x2[6] = _mm256_add_epi32(x1[6], x1[25]); + x2[25] = _mm256_sub_epi32(x1[6], x1[25]); + x2[7] = _mm256_add_epi32(x1[7], x1[24]); + x2[24] = _mm256_sub_epi32(x1[7], x1[24]); + x2[8] = _mm256_add_epi32(x1[8], x1[23]); + x2[23] = _mm256_sub_epi32(x1[8], x1[23]); + x2[9] = _mm256_add_epi32(x1[9], x1[22]); + x2[22] = _mm256_sub_epi32(x1[9], x1[22]); + x2[10] = _mm256_add_epi32(x1[10], x1[21]); + x2[21] = _mm256_sub_epi32(x1[10], x1[21]); + x2[11] = _mm256_add_epi32(x1[11], x1[20]); + x2[20] = _mm256_sub_epi32(x1[11], x1[20]); + x2[12] = _mm256_add_epi32(x1[12], x1[19]); + x2[19] = _mm256_sub_epi32(x1[12], x1[19]); + x2[13] = _mm256_add_epi32(x1[13], x1[18]); + x2[18] = _mm256_sub_epi32(x1[13], x1[18]); + x2[14] = _mm256_add_epi32(x1[14], x1[17]); + x2[17] = _mm256_sub_epi32(x1[14], x1[17]); + x2[15] = _mm256_add_epi32(x1[15], x1[16]); + x2[16] = _mm256_sub_epi32(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[40], x1[55], x2[40], x2[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[41], x1[54], x2[41], x2[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[42], x1[53], x2[42], x2[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[43], x1[52], x2[43], x2[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[44], x1[51], x2[44], x2[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[45], x1[50], x2[45], x2[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[46], x1[49], x2[46], x2[49], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[47], x1[48], x2[47], x2[48], + *__rounding, cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; +} +static INLINE void fdct64_stage3_avx2(__m256i *x2, __m256i *x3, + __m256i *cospi_m32, __m256i *cospi_p32, + const __m256i *__rounding, + int8_t cos_bit) { + x3[0] = _mm256_add_epi32(x2[0], x2[15]); + x3[15] = _mm256_sub_epi32(x2[0], x2[15]); + x3[1] = _mm256_add_epi32(x2[1], x2[14]); + x3[14] = _mm256_sub_epi32(x2[1], x2[14]); + x3[2] = _mm256_add_epi32(x2[2], x2[13]); + x3[13] = 
_mm256_sub_epi32(x2[2], x2[13]); + x3[3] = _mm256_add_epi32(x2[3], x2[12]); + x3[12] = _mm256_sub_epi32(x2[3], x2[12]); + x3[4] = _mm256_add_epi32(x2[4], x2[11]); + x3[11] = _mm256_sub_epi32(x2[4], x2[11]); + x3[5] = _mm256_add_epi32(x2[5], x2[10]); + x3[10] = _mm256_sub_epi32(x2[5], x2[10]); + x3[6] = _mm256_add_epi32(x2[6], x2[9]); + x3[9] = _mm256_sub_epi32(x2[6], x2[9]); + x3[7] = _mm256_add_epi32(x2[7], x2[8]); + x3[8] = _mm256_sub_epi32(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[20], x2[27], x3[20], x3[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[21], x2[26], x3[21], x3[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[22], x2[25], x3[22], x3[25], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[23], x2[24], x3[23], x3[24], + *__rounding, cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm256_add_epi32(x2[32], x2[47]); + x3[47] = _mm256_sub_epi32(x2[32], x2[47]); + x3[33] = _mm256_add_epi32(x2[33], x2[46]); + x3[46] = _mm256_sub_epi32(x2[33], x2[46]); + x3[34] = _mm256_add_epi32(x2[34], x2[45]); + x3[45] = _mm256_sub_epi32(x2[34], x2[45]); + x3[35] = _mm256_add_epi32(x2[35], x2[44]); + x3[44] = _mm256_sub_epi32(x2[35], x2[44]); + x3[36] = _mm256_add_epi32(x2[36], x2[43]); + x3[43] = _mm256_sub_epi32(x2[36], x2[43]); + x3[37] = _mm256_add_epi32(x2[37], x2[42]); + x3[42] = _mm256_sub_epi32(x2[37], x2[42]); + x3[38] = _mm256_add_epi32(x2[38], x2[41]); + x3[41] = _mm256_sub_epi32(x2[38], x2[41]); + x3[39] = _mm256_add_epi32(x2[39], x2[40]); + x3[40] = _mm256_sub_epi32(x2[39], x2[40]); + x3[48] = _mm256_sub_epi32(x2[63], x2[48]); + x3[63] = _mm256_add_epi32(x2[63], x2[48]); + x3[49] = _mm256_sub_epi32(x2[62], x2[49]); + x3[62] = _mm256_add_epi32(x2[62], x2[49]); + x3[50] = _mm256_sub_epi32(x2[61], x2[50]); + x3[61] = _mm256_add_epi32(x2[61], x2[50]); + x3[51] = _mm256_sub_epi32(x2[60], x2[51]); + x3[60] = _mm256_add_epi32(x2[60], x2[51]); + x3[52] = _mm256_sub_epi32(x2[59], x2[52]); + x3[59] = _mm256_add_epi32(x2[59], x2[52]); + x3[53] = _mm256_sub_epi32(x2[58], x2[53]); + x3[58] = _mm256_add_epi32(x2[58], x2[53]); + x3[54] = _mm256_sub_epi32(x2[57], x2[54]); + x3[57] = _mm256_add_epi32(x2[57], x2[54]); + x3[55] = _mm256_sub_epi32(x2[56], x2[55]); + x3[56] = _mm256_add_epi32(x2[56], x2[55]); +} +static INLINE void fdct64_stage4_avx2(__m256i *x3, __m256i *x4, + __m256i *cospi_m32, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, + __m256i *cospi_m48, + const __m256i *__rounding, + int8_t cos_bit) { + x4[0] = _mm256_add_epi32(x3[0], x3[7]); + x4[7] = _mm256_sub_epi32(x3[0], x3[7]); + x4[1] = _mm256_add_epi32(x3[1], x3[6]); + x4[6] = _mm256_sub_epi32(x3[1], x3[6]); + x4[2] = _mm256_add_epi32(x3[2], x3[5]); + x4[5] = _mm256_sub_epi32(x3[2], x3[5]); + x4[3] = _mm256_add_epi32(x3[3], x3[4]); + x4[4] = _mm256_sub_epi32(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[10], x3[13], x4[10], x4[13], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[11], x3[12], x4[11], x4[12], + *__rounding, cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm256_add_epi32(x3[16], x3[23]); + x4[23] = _mm256_sub_epi32(x3[16], x3[23]); + x4[17] = _mm256_add_epi32(x3[17], x3[22]); + x4[22] = _mm256_sub_epi32(x3[17], x3[22]); + x4[18] = _mm256_add_epi32(x3[18], x3[21]); + x4[21] = 
_mm256_sub_epi32(x3[18], x3[21]); + x4[19] = _mm256_add_epi32(x3[19], x3[20]); + x4[20] = _mm256_sub_epi32(x3[19], x3[20]); + x4[24] = _mm256_sub_epi32(x3[31], x3[24]); + x4[31] = _mm256_add_epi32(x3[31], x3[24]); + x4[25] = _mm256_sub_epi32(x3[30], x3[25]); + x4[30] = _mm256_add_epi32(x3[30], x3[25]); + x4[26] = _mm256_sub_epi32(x3[29], x3[26]); + x4[29] = _mm256_add_epi32(x3[29], x3[26]); + x4[27] = _mm256_sub_epi32(x3[28], x3[27]); + x4[28] = _mm256_add_epi32(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[36], x3[59], x4[36], x4[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[37], x3[58], x4[37], x4[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[38], x3[57], x4[38], x4[57], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[39], x3[56], x4[39], x4[56], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[40], x3[55], x4[40], x4[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[41], x3[54], x4[41], x4[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[42], x3[53], x4[42], x4[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[43], x3[52], x4[43], x4[52], + *__rounding, cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; +} +static INLINE void fdct64_stage5_avx2(__m256i *x4, __m256i *x5, + __m256i *cospi_m32, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, + __m256i *cospi_m48, + const __m256i *__rounding, + int8_t cos_bit) { + x5[0] = _mm256_add_epi32(x4[0], x4[3]); + x5[3] = _mm256_sub_epi32(x4[0], x4[3]); + x5[1] = _mm256_add_epi32(x4[1], x4[2]); + x5[2] = _mm256_sub_epi32(x4[1], x4[2]); + x5[4] = x4[4]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6], + *__rounding, cos_bit); + x5[7] = x4[7]; + x5[8] = _mm256_add_epi32(x4[8], x4[11]); + x5[11] = _mm256_sub_epi32(x4[8], x4[11]); + x5[9] = _mm256_add_epi32(x4[9], x4[10]); + x5[10] = _mm256_sub_epi32(x4[9], x4[10]); + x5[12] = _mm256_sub_epi32(x4[15], x4[12]); + x5[15] = _mm256_add_epi32(x4[15], x4[12]); + x5[13] = _mm256_sub_epi32(x4[14], x4[13]); + x5[14] = _mm256_add_epi32(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[18], x4[29], x5[18], x5[29], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[19], x4[28], x5[19], x5[28], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[20], x4[27], x5[20], x5[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[21], x4[26], x5[21], x5[26], + *__rounding, cos_bit); + x5[22] = x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm256_add_epi32(x4[32], x4[39]); + x5[39] = _mm256_sub_epi32(x4[32], x4[39]); + x5[33] = _mm256_add_epi32(x4[33], x4[38]); + x5[38] = _mm256_sub_epi32(x4[33], x4[38]); + x5[34] = _mm256_add_epi32(x4[34], x4[37]); + x5[37] = _mm256_sub_epi32(x4[34], x4[37]); + x5[35] = _mm256_add_epi32(x4[35], x4[36]); + x5[36] = _mm256_sub_epi32(x4[35], x4[36]); + x5[40] = _mm256_sub_epi32(x4[47], x4[40]); + x5[47] = _mm256_add_epi32(x4[47], x4[40]); + x5[41] = 
_mm256_sub_epi32(x4[46], x4[41]); + x5[46] = _mm256_add_epi32(x4[46], x4[41]); + x5[42] = _mm256_sub_epi32(x4[45], x4[42]); + x5[45] = _mm256_add_epi32(x4[45], x4[42]); + x5[43] = _mm256_sub_epi32(x4[44], x4[43]); + x5[44] = _mm256_add_epi32(x4[44], x4[43]); + x5[48] = _mm256_add_epi32(x4[48], x4[55]); + x5[55] = _mm256_sub_epi32(x4[48], x4[55]); + x5[49] = _mm256_add_epi32(x4[49], x4[54]); + x5[54] = _mm256_sub_epi32(x4[49], x4[54]); + x5[50] = _mm256_add_epi32(x4[50], x4[53]); + x5[53] = _mm256_sub_epi32(x4[50], x4[53]); + x5[51] = _mm256_add_epi32(x4[51], x4[52]); + x5[52] = _mm256_sub_epi32(x4[51], x4[52]); + x5[56] = _mm256_sub_epi32(x4[63], x4[56]); + x5[63] = _mm256_add_epi32(x4[63], x4[56]); + x5[57] = _mm256_sub_epi32(x4[62], x4[57]); + x5[62] = _mm256_add_epi32(x4[62], x4[57]); + x5[58] = _mm256_sub_epi32(x4[61], x4[58]); + x5[61] = _mm256_add_epi32(x4[61], x4[58]); + x5[59] = _mm256_sub_epi32(x4[60], x4[59]); + x5[60] = _mm256_add_epi32(x4[60], x4[59]); +} +static INLINE void fdct64_stage6_avx2( + __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, + __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56, + __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24, + const __m256i *__rounding, int8_t cos_bit) { + btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3], + *__rounding, cos_bit); + x6[4] = _mm256_add_epi32(x5[4], x5[5]); + x6[5] = _mm256_sub_epi32(x5[4], x5[5]); + x6[6] = _mm256_sub_epi32(x5[7], x5[6]); + x6[7] = _mm256_add_epi32(x5[7], x5[6]); + x6[8] = x5[8]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10], x6[13], + *__rounding, cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm256_add_epi32(x5[16], x5[19]); + x6[19] = _mm256_sub_epi32(x5[16], x5[19]); + x6[17] = _mm256_add_epi32(x5[17], x5[18]); + x6[18] = _mm256_sub_epi32(x5[17], x5[18]); + x6[20] = _mm256_sub_epi32(x5[23], x5[20]); + x6[23] = _mm256_add_epi32(x5[23], x5[20]); + x6[21] = _mm256_sub_epi32(x5[22], x5[21]); + x6[22] = _mm256_add_epi32(x5[22], x5[21]); + x6[24] = _mm256_add_epi32(x5[24], x5[27]); + x6[27] = _mm256_sub_epi32(x5[24], x5[27]); + x6[25] = _mm256_add_epi32(x5[25], x5[26]); + x6[26] = _mm256_sub_epi32(x5[25], x5[26]); + x6[28] = _mm256_sub_epi32(x5[31], x5[28]); + x6[31] = _mm256_add_epi32(x5[31], x5[28]); + x6[29] = _mm256_sub_epi32(x5[30], x5[29]); + x6[30] = _mm256_add_epi32(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[34], x5[61], x6[34], x6[61], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[35], x5[60], x6[35], x6[60], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[36], x5[59], x6[36], x6[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[37], x5[58], x6[37], x6[58], + *__rounding, cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[42], x5[53], x6[42], x6[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[43], x5[52], x6[43], x6[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[44], x5[51], x6[44], x6[51], + *__rounding, cos_bit); + 
btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[45], x5[50], x6[45], x6[50], + *__rounding, cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; +} +static INLINE void fdct64_stage7_avx2(__m256i *x6, __m256i *x7, + __m256i *cospi_p08, __m256i *cospi_p56, + __m256i *cospi_p40, __m256i *cospi_p24, + __m256i *cospi_m08, __m256i *cospi_m56, + __m256i *cospi_m40, __m256i *cospi_m24, + const __m256i *__rounding, + int8_t cos_bit) { + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6], + *__rounding, cos_bit); + x7[8] = _mm256_add_epi32(x6[8], x6[9]); + x7[9] = _mm256_sub_epi32(x6[8], x6[9]); + x7[10] = _mm256_sub_epi32(x6[11], x6[10]); + x7[11] = _mm256_add_epi32(x6[11], x6[10]); + x7[12] = _mm256_add_epi32(x6[12], x6[13]); + x7[13] = _mm256_sub_epi32(x6[12], x6[13]); + x7[14] = _mm256_sub_epi32(x6[15], x6[14]); + x7[15] = _mm256_add_epi32(x6[15], x6[14]); + x7[16] = x6[16]; + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17], x7[30], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18], x7[29], + *__rounding, cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21], x7[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22], x7[25], + *__rounding, cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm256_add_epi32(x6[32], x6[35]); + x7[35] = _mm256_sub_epi32(x6[32], x6[35]); + x7[33] = _mm256_add_epi32(x6[33], x6[34]); + x7[34] = _mm256_sub_epi32(x6[33], x6[34]); + x7[36] = _mm256_sub_epi32(x6[39], x6[36]); + x7[39] = _mm256_add_epi32(x6[39], x6[36]); + x7[37] = _mm256_sub_epi32(x6[38], x6[37]); + x7[38] = _mm256_add_epi32(x6[38], x6[37]); + x7[40] = _mm256_add_epi32(x6[40], x6[43]); + x7[43] = _mm256_sub_epi32(x6[40], x6[43]); + x7[41] = _mm256_add_epi32(x6[41], x6[42]); + x7[42] = _mm256_sub_epi32(x6[41], x6[42]); + x7[44] = _mm256_sub_epi32(x6[47], x6[44]); + x7[47] = _mm256_add_epi32(x6[47], x6[44]); + x7[45] = _mm256_sub_epi32(x6[46], x6[45]); + x7[46] = _mm256_add_epi32(x6[46], x6[45]); + x7[48] = _mm256_add_epi32(x6[48], x6[51]); + x7[51] = _mm256_sub_epi32(x6[48], x6[51]); + x7[49] = _mm256_add_epi32(x6[49], x6[50]); + x7[50] = _mm256_sub_epi32(x6[49], x6[50]); + x7[52] = _mm256_sub_epi32(x6[55], x6[52]); + x7[55] = _mm256_add_epi32(x6[55], x6[52]); + x7[53] = _mm256_sub_epi32(x6[54], x6[53]); + x7[54] = _mm256_add_epi32(x6[54], x6[53]); + x7[56] = _mm256_add_epi32(x6[56], x6[59]); + x7[59] = _mm256_sub_epi32(x6[56], x6[59]); + x7[57] = _mm256_add_epi32(x6[57], x6[58]); + x7[58] = _mm256_sub_epi32(x6[57], x6[58]); + x7[60] = _mm256_sub_epi32(x6[63], x6[60]); + x7[63] = _mm256_add_epi32(x6[63], x6[60]); + x7[61] = _mm256_sub_epi32(x6[62], x6[61]); + x7[62] = _mm256_add_epi32(x6[62], x6[61]); +} +static INLINE void fdct64_stage8_avx2(__m256i *x7, __m256i *x8, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); + __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); + __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); + __m256i cospi_p36 = 
_mm256_set1_epi32(cospi[36]); + __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); + __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); + __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); + __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); + __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); + __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]); + __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); + __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); + __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); + __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); + __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); + __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); + + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + + btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12], + *__rounding, cos_bit); + x8[16] = _mm256_add_epi32(x7[16], x7[17]); + x8[17] = _mm256_sub_epi32(x7[16], x7[17]); + x8[18] = _mm256_sub_epi32(x7[19], x7[18]); + x8[19] = _mm256_add_epi32(x7[19], x7[18]); + x8[20] = _mm256_add_epi32(x7[20], x7[21]); + x8[21] = _mm256_sub_epi32(x7[20], x7[21]); + x8[22] = _mm256_sub_epi32(x7[23], x7[22]); + x8[23] = _mm256_add_epi32(x7[23], x7[22]); + x8[24] = _mm256_add_epi32(x7[24], x7[25]); + x8[25] = _mm256_sub_epi32(x7[24], x7[25]); + x8[26] = _mm256_sub_epi32(x7[27], x7[26]); + x8[27] = _mm256_add_epi32(x7[27], x7[26]); + x8[28] = _mm256_add_epi32(x7[28], x7[29]); + x8[29] = _mm256_sub_epi32(x7[28], x7[29]); + x8[30] = _mm256_sub_epi32(x7[31], x7[30]); + x8[31] = _mm256_add_epi32(x7[31], x7[30]); + x8[32] = x7[32]; + btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], + *__rounding, cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], + *__rounding, cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], + *__rounding, cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], + *__rounding, cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; +} +static INLINE void fdct64_stage9_avx2(__m256i *x8, __m256i *x9, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); + __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); + __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); + __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); + __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); + __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); + __m256i cospi_p14 = 
_mm256_set1_epi32(cospi[14]); + __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); + __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); + __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); + __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); + __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); + __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); + __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); + __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); + __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); + + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = x8[14]; + x9[15] = x8[15]; + btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24], + *__rounding, cos_bit); + x9[32] = _mm256_add_epi32(x8[32], x8[33]); + x9[33] = _mm256_sub_epi32(x8[32], x8[33]); + x9[34] = _mm256_sub_epi32(x8[35], x8[34]); + x9[35] = _mm256_add_epi32(x8[35], x8[34]); + x9[36] = _mm256_add_epi32(x8[36], x8[37]); + x9[37] = _mm256_sub_epi32(x8[36], x8[37]); + x9[38] = _mm256_sub_epi32(x8[39], x8[38]); + x9[39] = _mm256_add_epi32(x8[39], x8[38]); + x9[40] = _mm256_add_epi32(x8[40], x8[41]); + x9[41] = _mm256_sub_epi32(x8[40], x8[41]); + x9[42] = _mm256_sub_epi32(x8[43], x8[42]); + x9[43] = _mm256_add_epi32(x8[43], x8[42]); + x9[44] = _mm256_add_epi32(x8[44], x8[45]); + x9[45] = _mm256_sub_epi32(x8[44], x8[45]); + x9[46] = _mm256_sub_epi32(x8[47], x8[46]); + x9[47] = _mm256_add_epi32(x8[47], x8[46]); + x9[48] = _mm256_add_epi32(x8[48], x8[49]); + x9[49] = _mm256_sub_epi32(x8[48], x8[49]); + x9[50] = _mm256_sub_epi32(x8[51], x8[50]); + x9[51] = _mm256_add_epi32(x8[51], x8[50]); + x9[52] = _mm256_add_epi32(x8[52], x8[53]); + x9[53] = _mm256_sub_epi32(x8[52], x8[53]); + x9[54] = _mm256_sub_epi32(x8[55], x8[54]); + x9[55] = _mm256_add_epi32(x8[55], x8[54]); + x9[56] = _mm256_add_epi32(x8[56], x8[57]); + x9[57] = _mm256_sub_epi32(x8[56], x8[57]); + x9[58] = _mm256_sub_epi32(x8[59], x8[58]); + x9[59] = _mm256_add_epi32(x8[59], x8[58]); + x9[60] = _mm256_add_epi32(x8[60], x8[61]); + x9[61] = _mm256_sub_epi32(x8[60], x8[61]); + x9[62] = _mm256_sub_epi32(x8[63], x8[62]); + x9[63] = _mm256_add_epi32(x8[63], x8[62]); +} +static INLINE void fdct64_stage10_avx2(__m256i *x9, __m256i *x10, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); + __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); + __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); + __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); + __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); + __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); + __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); + 
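/*
 * All of these broadcast constants come from cospi_arr(cos_bit), libaom's
 * fixed-point cosine table. Assuming the av1_txfm.h definition, each entry
 * is approximately
 *
 *   cospi[j] = (int32_t)round(cos(j * M_PI / 128) * (1 << cos_bit));
 *
 * e.g. cospi[32] is cos(PI/4) scaled, which is why the same value serves as
 * both weights of the DC butterfly in stage 6.
 */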
__m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); + __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); + __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); + __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); + __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); + __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]); + __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); + __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); + __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); + __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); + __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); + __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); + __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); + __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); + __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); + __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); + __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); + __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); + __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); + __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); + __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); + __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); + __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]); + __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); + __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); + + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], 
x10[47], x10[48], + *__rounding, cos_bit); +} +static void fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit, + const int instride, const int outstride) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); + __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); + __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); + __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); + __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); + __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); + __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); + __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); + __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); + __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); + __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); + __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); + __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); + __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); + + int startidx = 0 * instride; + int endidx = 63 * instride; + // stage 1 + __m256i x1[64]; + x1[0] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[63] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[1] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[62] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[2] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[61] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[3] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[60] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[4] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[59] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[5] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[58] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[6] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[57] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[7] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[56] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[8] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[55] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[9] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[54] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[10] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[53] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[11] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[52] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[12] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[51] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[13] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[50] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[14] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[49] = _mm256_sub_epi32(input[startidx], 
input[endidx]); + startidx += instride; + endidx -= instride; + x1[15] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[48] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[16] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[47] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[17] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[46] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[18] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[45] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[19] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[44] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[20] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[43] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[21] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[42] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[22] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[41] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[23] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[40] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[24] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[39] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[25] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[38] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[26] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[37] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[27] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[36] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[28] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[35] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[29] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[34] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[30] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[33] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[31] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[32] = _mm256_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + __m256i x2[64]; + fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit); + // stage 3 + fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit); + // stage 4 + fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &__rounding, cos_bit); + // stage 5 + fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &__rounding, cos_bit); + // stage 6 + fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56, &cospi_m40, + &cospi_p24, &cospi_m24, &__rounding, cos_bit); + // stage 7 + 
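/*
 * Note the ping-pong pattern in the stage calls around this point: x1 and
 * x2 alternate as source and destination on every stage, so the whole
 * 64-point column lives in two scratch arrays instead of one per stage.
 * Sketched with generic names:
 *
 *   __m256i a[64], b[64];
 *   stage2(a, b);
 *   stage3(b, a);
 *   stage4(a, b);  // ... each stage consumes the previous one's output
 */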
fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24, + &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24, + &__rounding, cos_bit); + // stage 8 + fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit); + // stage 9 + fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit); + // stage 10 + fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit); + + startidx = 0 * outstride; + endidx = 63 * outstride; + + // stage 11 + output[startidx] = x2[0]; + output[endidx] = x2[63]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[32]; + output[endidx] = x2[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[16]; + output[endidx] = x2[47]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[48]; + output[endidx] = x2[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[8]; + output[endidx] = x2[55]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[40]; + output[endidx] = x2[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[24]; + output[endidx] = x2[39]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[56]; + output[endidx] = x2[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[4]; + output[endidx] = x2[59]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[36]; + output[endidx] = x2[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[20]; + output[endidx] = x2[43]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[52]; + output[endidx] = x2[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[12]; + output[endidx] = x2[51]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[44]; + output[endidx] = x2[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[28]; + output[endidx] = x2[35]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[60]; + output[endidx] = x2[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[2]; + output[endidx] = x2[61]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[34]; + output[endidx] = x2[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[18]; + output[endidx] = x2[45]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[50]; + output[endidx] = x2[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[10]; + output[endidx] = x2[53]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[42]; + output[endidx] = x2[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[26]; + output[endidx] = x2[37]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[58]; + output[endidx] = x2[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[6]; + output[endidx] = x2[57]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[38]; + output[endidx] = x2[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[22]; + output[endidx] = x2[41]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[54]; + output[endidx] = x2[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[14]; + output[endidx] = x2[49]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[46]; + output[endidx] = x2[17]; + startidx += outstride; + endidx -= outstride; + 
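/*
 * The write-out order above (0, 32, 16, 48, 8, 40, 24, 56, ...) is the
 * 6-bit bit-reversal permutation: output row r receives x2[bitrev6(r)].
 * The fdct32_avx2() epilogue earlier does the same with 5 bits. A scalar
 * equivalent of this whole unrolled sequence:
 *
 *   for (int r = 0; r < 64; ++r) {
 *     int v = 0;  // reverse the low 6 bits of r
 *     for (int b = 0; b < 6; ++b) v |= ((r >> b) & 1) << (5 - b);
 *     output[r * outstride] = x2[v];
 *   }
 */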
output[startidx] = x2[30]; + output[endidx] = x2[33]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[62]; + output[endidx] = x2[1]; +} +void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m256i buf0[512], buf1[512]; + const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = fdct64_avx2; + const transform_1d_avx2 row_txfm = fdct64_avx2; + const int width_div16 = (width >> 4); + const int width_div8 = (width >> 3); + int r, c; + for (int i = 0; i < width_div16; i++) { + load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height, + width_div8, 0, 0); + round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], width_div8); + col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8); + col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8, + width_div8); + round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], width_div8); + } + + for (r = 0; r < height; r += 8) { + for (c = 0; c < width_div8; c++) { + fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c], + &buf1[c * 8 * width_div8 + (r >> 3)], + width_div8, width_div8); + } + } + + for (int i = 0; i < 2; i++) { + row_txfm(&buf1[i << 1], &buf0[i << 1], cos_bit_row, width_div8, + width_div16); + row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8, + width_div16); + round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2], + width_div16); + round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2], + width_div16); + } + + for (r = 0; r < (height >> 1); r += 8) { + for (c = 0; c < width_div16; c++) { + fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div16 + c], + &buf1[c * 8 * width_div16 + (r >> 3)], + width_div16, width_div16); + } + } + store_buffer_avx2(buf1, output, 8, 128); +} diff --git a/libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_sse4.c new file mode 100644 index 000000000..73afc5d03 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/highbd_fwd_txfm_sse4.c @@ -0,0 +1,2604 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride, int flipud, int fliplr,
+                                   int shift) {
+  if (!flipud) {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  } else {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  }
+
+  if (fliplr) {
+    in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+    in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+    in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+    in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+  }
+
+  in[0] = _mm_cvtepi16_epi32(in[0]);
+  in[1] = _mm_cvtepi16_epi32(in[1]);
+  in[2] = _mm_cvtepi16_epi32(in[2]);
+  in[3] = _mm_cvtepi16_epi32(in[3]);
+
+  in[0] = _mm_slli_epi32(in[0], shift);
+  in[1] = _mm_slli_epi32(in[1], shift);
+  in[2] = _mm_slli_epi32(in[2], shift);
+  in[3] = _mm_slli_epi32(in[3], shift);
+}
+
+// We only use stage-2 bit;
+// shift[0] is used in load_buffer_4x4()
+// shift[1] is used in txfm_func_col()
+// shift[2] is used in txfm_func_row()
+static void fdct4x4_sse4_1(__m128i *in, __m128i *out, int bit,
+                           const int num_col) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i s0, s1, s2, s3;
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+
+  int endidx = 3 * num_col;
+  s0 = _mm_add_epi32(in[0], in[endidx]);
+  s3 = _mm_sub_epi32(in[0], in[endidx]);
+  endidx -= num_col;
+  s1 = _mm_add_epi32(in[num_col], in[endidx]);
+  s2 = _mm_sub_epi32(in[num_col], in[endidx]);
+
+  // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
+  u0 = _mm_mullo_epi32(s0, cospi32);
+  u1 = _mm_mullo_epi32(s1, cospi32);
+  u2 = _mm_add_epi32(u0, u1);
+  v0 = _mm_sub_epi32(u0, u1);
+
+  u3 = _mm_add_epi32(u2, rnding);
+  v1 = _mm_add_epi32(v0, rnding);
+
+  u0 = _mm_srai_epi32(u3, bit);
+  u2 = _mm_srai_epi32(v1, bit);
+
+  // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
+  v0 = _mm_mullo_epi32(s2, cospi48);
+  v1 = _mm_mullo_epi32(s3, cospi16);
+  v2 = _mm_add_epi32(v0, v1);
+
+  v3 = _mm_add_epi32(v2, rnding);
+  u1 = _mm_srai_epi32(v3, bit);
+
+  v0 = _mm_mullo_epi32(s2, cospi16);
+  v1 = _mm_mullo_epi32(s3, cospi48);
+  v2 = _mm_sub_epi32(v1, v0);
+
+  v3 = _mm_add_epi32(v2, rnding);
+  u3 = _mm_srai_epi32(v3, bit);
+
+  // Note: shift[1] and shift[2] are zeros
+
+  // Transpose 4x4 32-bit
+  v0 = _mm_unpacklo_epi32(u0, u1);
+  v1 = _mm_unpackhi_epi32(u0, u1);
+  v2 = _mm_unpacklo_epi32(u2, u3);
+  v3 = _mm_unpackhi_epi32(u2, u3);
+
+  out[0] = _mm_unpacklo_epi64(v0, v2);
+  out[1] = _mm_unpackhi_epi64(v0, v2);
+  out[2] = _mm_unpacklo_epi64(v1, v3);
+  out[3] = _mm_unpackhi_epi64(v1, v3);
+}
+
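The SSE4.1 kernel above processes four 4-sample columns at once; lane for lane it computes the following scalar 4-point DCT-II in the same fixed-point convention. The routine below is a reference sketch written for this annotation (fdct4_scalar_ref is not part of the patch):

static void fdct4_scalar_ref(const int32_t *in, int32_t *out, int bit) {
  // Reference sketch (not from libaom): one column of fdct4x4_sse4_1().
  const int32_t *cospi = cospi_arr(bit);
  const int32_t rnding = 1 << (bit - 1);
  // stage 1: sum/difference butterflies, mirroring s0..s3 above
  const int64_t s0 = in[0] + in[3];
  const int64_t s1 = in[1] + in[2];
  const int64_t s2 = in[1] - in[2];
  const int64_t s3 = in[0] - in[3];
  // stage 2: the same rotations the vector code expands inline
  out[0] = (int32_t)((cospi[32] * (s0 + s1) + rnding) >> bit);
  out[2] = (int32_t)((cospi[32] * (s0 - s1) + rnding) >> bit);
  out[1] = (int32_t)((cospi[48] * s2 + cospi[16] * s3 + rnding) >> bit);
  out[3] = (int32_t)((cospi[48] * s3 - cospi[16] * s2 + rnding) >> bit);
}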
+static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
+  _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+  _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+  _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+  _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+}
+
+static void fadst4x4_sse4_1(__m128i *in, __m128i *out, int bit,
+                            const int num_col) {
+  const int32_t *sinpi = sinpi_arr(bit);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+  __m128i t;
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i x0, x1, x2, x3;
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+
+  int idx = 0 * num_col;
+  s0 = _mm_mullo_epi32(in[idx], sinpi1);
+  s1 = _mm_mullo_epi32(in[idx], sinpi4);
+  t = _mm_add_epi32(in[idx], in[idx + num_col]);
+  idx += num_col;
+  s2 = _mm_mullo_epi32(in[idx], sinpi2);
+  s3 = _mm_mullo_epi32(in[idx], sinpi1);
+  idx += num_col;
+  s4 = _mm_mullo_epi32(in[idx], sinpi3);
+  idx += num_col;
+  s5 = _mm_mullo_epi32(in[idx], sinpi4);
+  s6 = _mm_mullo_epi32(in[idx], sinpi2);
+  s7 = _mm_sub_epi32(t, in[idx]);
+
+  t = _mm_add_epi32(s0, s2);
+  x0 = _mm_add_epi32(t, s5);
+  x1 = _mm_mullo_epi32(s7, sinpi3);
+  t = _mm_sub_epi32(s1, s3);
+  x2 = _mm_add_epi32(t, s6);
+  x3 = s4;
+
+  s0 = _mm_add_epi32(x0, x3);
+  s1 = x1;
+  s2 = _mm_sub_epi32(x2, x3);
+  t = _mm_sub_epi32(x2, x0);
+  s3 = _mm_add_epi32(t, x3);
+
+  u0 = _mm_add_epi32(s0, rnding);
+  u0 = _mm_srai_epi32(u0, bit);
+
+  u1 = _mm_add_epi32(s1, rnding);
+  u1 = _mm_srai_epi32(u1, bit);
+
+  u2 = _mm_add_epi32(s2, rnding);
+  u2 = _mm_srai_epi32(u2, bit);
+
+  u3 = _mm_add_epi32(s3, rnding);
+  u3 = _mm_srai_epi32(u3, bit);
+
+  v0 = _mm_unpacklo_epi32(u0, u1);
+  v1 = _mm_unpackhi_epi32(u0, u1);
+  v2 = _mm_unpacklo_epi32(u2, u3);
+  v3 = _mm_unpackhi_epi32(u2, u3);
+
+  out[0] = _mm_unpacklo_epi64(v0, v2);
+  out[1] = _mm_unpackhi_epi64(v0, v2);
+  out[2] = _mm_unpacklo_epi64(v1, v3);
+  out[3] = _mm_unpackhi_epi64(v1, v3);
+}
+static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+  (void)bit;
+  __m128i fact = _mm_set1_epi32(NewSqrt2);
+  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+  __m128i a_low;
+  __m128i v[4];
+
+  for (int i = 0; i < 4; i++) {
+    a_low = _mm_mullo_epi32(in[i * col_num], fact);
+    a_low = _mm_add_epi32(a_low, offset);
+    out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
+  }
+
+  // Transpose for 4x4
+  v[0] = _mm_unpacklo_epi32(out[0], out[1]);
+  v[1] = _mm_unpackhi_epi32(out[0], out[1]);
+  v[2] = _mm_unpacklo_epi32(out[2], out[3]);
+  v[3] = _mm_unpackhi_epi32(out[2], out[3]);
+
+  out[0] = _mm_unpacklo_epi64(v[0], v[2]);
+  out[1] = _mm_unpackhi_epi64(v[0], v[2]);
+  out[2] = _mm_unpacklo_epi64(v[1], v[3]);
+  out[3] = _mm_unpackhi_epi64(v[1], v[3]);
+}
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
+                               int input_stride, TX_TYPE tx_type, int bd) {
+  __m128i in[4];
+  const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0,
shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case DCT_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case ADST_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_DCT: + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case DCT_FLIPADST: + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_FLIPADST: + load_buffer_4x4(input, in, input_stride, 1, 1, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case ADST_FLIPADST: + load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case FLIPADST_ADST: + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case IDTX: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case V_DCT: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case H_DCT: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + fdct4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case V_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case H_ADST: + load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case V_FLIPADST: + load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + case H_FLIPADST: + load_buffer_4x4(input, in, 
input_stride, 0, 1, shift[0]); + idtx4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + fadst4x4_sse4_1(in, in, av1_fwd_cos_bit_row[txw_idx][txh_idx], 1); + write_buffer_4x4(in, coeff); + break; + default: assert(0); + } + (void)bd; +} + +static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in, + int stride, int flipud, int fliplr, + int shift) { + __m128i u; + if (!flipud) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + } else { + in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + } + + if (fliplr) { + in[0] = mm_reverse_epi16(in[0]); + in[1] = mm_reverse_epi16(in[1]); + in[2] = mm_reverse_epi16(in[2]); + in[3] = mm_reverse_epi16(in[3]); + in[4] = mm_reverse_epi16(in[4]); + in[5] = mm_reverse_epi16(in[5]); + in[6] = mm_reverse_epi16(in[6]); + in[7] = mm_reverse_epi16(in[7]); + } + + u = _mm_unpackhi_epi64(in[4], in[4]); + in[8] = _mm_cvtepi16_epi32(in[4]); + in[9] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[5], in[5]); + in[10] = _mm_cvtepi16_epi32(in[5]); + in[11] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[6], in[6]); + in[12] = _mm_cvtepi16_epi32(in[6]); + in[13] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[7], in[7]); + in[14] = _mm_cvtepi16_epi32(in[7]); + in[15] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[3], in[3]); + in[6] = _mm_cvtepi16_epi32(in[3]); + in[7] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[2], in[2]); + in[4] = _mm_cvtepi16_epi32(in[2]); + in[5] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[1], in[1]); + in[2] = _mm_cvtepi16_epi32(in[1]); + in[3] = _mm_cvtepi16_epi32(u); + + u = _mm_unpackhi_epi64(in[0], in[0]); + in[0] = _mm_cvtepi16_epi32(in[0]); + in[1] = _mm_cvtepi16_epi32(u); + + in[0] = _mm_slli_epi32(in[0], shift); + in[1] = _mm_slli_epi32(in[1], shift); + in[2] = _mm_slli_epi32(in[2], shift); + in[3] = _mm_slli_epi32(in[3], shift); + in[4] = _mm_slli_epi32(in[4], shift); + in[5] = _mm_slli_epi32(in[5], shift); + in[6] = _mm_slli_epi32(in[6], shift); + in[7] = _mm_slli_epi32(in[7], shift); + + in[8] = _mm_slli_epi32(in[8], shift); + in[9] = _mm_slli_epi32(in[9], shift); + in[10] = _mm_slli_epi32(in[10], shift); + in[11] = _mm_slli_epi32(in[11], shift); + in[12] = _mm_slli_epi32(in[12], shift); + in[13] = _mm_slli_epi32(in[13], shift); + in[14] = _mm_slli_epi32(in[14], shift); + in[15] = _mm_slli_epi32(in[15], shift); +} + +static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) { + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + + in[0] = _mm_add_epi32(in[0], rounding); + in[1] = _mm_add_epi32(in[1], rounding); + in[2] = 
_mm_add_epi32(in[2], rounding); + in[3] = _mm_add_epi32(in[3], rounding); + in[4] = _mm_add_epi32(in[4], rounding); + in[5] = _mm_add_epi32(in[5], rounding); + in[6] = _mm_add_epi32(in[6], rounding); + in[7] = _mm_add_epi32(in[7], rounding); + in[8] = _mm_add_epi32(in[8], rounding); + in[9] = _mm_add_epi32(in[9], rounding); + in[10] = _mm_add_epi32(in[10], rounding); + in[11] = _mm_add_epi32(in[11], rounding); + in[12] = _mm_add_epi32(in[12], rounding); + in[13] = _mm_add_epi32(in[13], rounding); + in[14] = _mm_add_epi32(in[14], rounding); + in[15] = _mm_add_epi32(in[15], rounding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + in[4] = _mm_srai_epi32(in[4], shift); + in[5] = _mm_srai_epi32(in[5], shift); + in[6] = _mm_srai_epi32(in[6], shift); + in[7] = _mm_srai_epi32(in[7], shift); + in[8] = _mm_srai_epi32(in[8], shift); + in[9] = _mm_srai_epi32(in[9], shift); + in[10] = _mm_srai_epi32(in[10], shift); + in[11] = _mm_srai_epi32(in[11], shift); + in[12] = _mm_srai_epi32(in[12], shift); + in[13] = _mm_srai_epi32(in[13], shift); + in[14] = _mm_srai_epi32(in[14], shift); + in[15] = _mm_srai_epi32(in[15], shift); +} + +static INLINE void col_txfm_4x8_rounding(__m128i *in, int shift) { + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + + in[0] = _mm_add_epi32(in[0], rounding); + in[1] = _mm_add_epi32(in[1], rounding); + in[2] = _mm_add_epi32(in[2], rounding); + in[3] = _mm_add_epi32(in[3], rounding); + in[4] = _mm_add_epi32(in[4], rounding); + in[5] = _mm_add_epi32(in[5], rounding); + in[6] = _mm_add_epi32(in[6], rounding); + in[7] = _mm_add_epi32(in[7], rounding); + + in[0] = _mm_srai_epi32(in[0], shift); + in[1] = _mm_srai_epi32(in[1], shift); + in[2] = _mm_srai_epi32(in[2], shift); + in[3] = _mm_srai_epi32(in[3], shift); + in[4] = _mm_srai_epi32(in[4], shift); + in[5] = _mm_srai_epi32(in[5], shift); + in[6] = _mm_srai_epi32(in[6], shift); + in[7] = _mm_srai_epi32(in[7], shift); +} + +static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) { + _mm_store_si128((__m128i *)(output + 0 * 4), res[0]); + _mm_store_si128((__m128i *)(output + 1 * 4), res[1]); + _mm_store_si128((__m128i *)(output + 2 * 4), res[2]); + _mm_store_si128((__m128i *)(output + 3 * 4), res[3]); + + _mm_store_si128((__m128i *)(output + 4 * 4), res[4]); + _mm_store_si128((__m128i *)(output + 5 * 4), res[5]); + _mm_store_si128((__m128i *)(output + 6 * 4), res[6]); + _mm_store_si128((__m128i *)(output + 7 * 4), res[7]); + + _mm_store_si128((__m128i *)(output + 8 * 4), res[8]); + _mm_store_si128((__m128i *)(output + 9 * 4), res[9]); + _mm_store_si128((__m128i *)(output + 10 * 4), res[10]); + _mm_store_si128((__m128i *)(output + 11 * 4), res[11]); + + _mm_store_si128((__m128i *)(output + 12 * 4), res[12]); + _mm_store_si128((__m128i *)(output + 13 * 4), res[13]); + _mm_store_si128((__m128i *)(output + 14 * 4), res[14]); + _mm_store_si128((__m128i *)(output + 15 * 4), res[15]); +} + +static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output, + const int stride) { + _mm_storeu_si128((__m128i *)(output), res[0]); + _mm_storeu_si128((__m128i *)(output + 4), res[1]); + _mm_storeu_si128((__m128i *)(output + stride), res[2]); + _mm_storeu_si128((__m128i *)(output + stride + 4), res[3]); + + _mm_storeu_si128((__m128i *)(output + (stride * 2)), res[4]); + _mm_storeu_si128((__m128i *)(output + (stride * 2) + 4), res[5]); + _mm_storeu_si128((__m128i *)(output + (stride * 3)), res[6]); 
+ _mm_storeu_si128((__m128i *)(output + (stride * 3) + 4), res[7]); + + _mm_storeu_si128((__m128i *)(output + (stride * 4)), res[8]); + _mm_storeu_si128((__m128i *)(output + (stride * 4) + 4), res[9]); + _mm_storeu_si128((__m128i *)(output + (stride * 5)), res[10]); + _mm_storeu_si128((__m128i *)(output + (stride * 5) + 4), res[11]); + + _mm_storeu_si128((__m128i *)(output + (stride * 6)), res[12]); + _mm_storeu_si128((__m128i *)(output + (stride * 6) + 4), res[13]); + _mm_storeu_si128((__m128i *)(output + (stride * 7)), res[14]); + _mm_storeu_si128((__m128i *)(output + (stride * 7) + 4), res[15]); +} + +static void fdct4x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[8], v[8]; + + int startidx = 0 * col_num; + int endidx = 7 * col_num; + // Even 8 points 0, 2, ..., 14 + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[startidx], in[endidx]); + v[7] = _mm_sub_epi32(in[startidx], in[endidx]); // v[7] + startidx += col_num; + endidx -= col_num; + u[1] = _mm_add_epi32(in[startidx], in[endidx]); + u[6] = _mm_sub_epi32(in[startidx], in[endidx]); + startidx += col_num; + endidx -= col_num; + u[2] = _mm_add_epi32(in[startidx], in[endidx]); + u[5] = _mm_sub_epi32(in[startidx], in[endidx]); + startidx += col_num; + endidx -= col_num; + u[3] = _mm_add_epi32(in[startidx], in[endidx]); + v[4] = _mm_sub_epi32(in[startidx], in[endidx]); // v[4] + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[3]); + v[3] = _mm_sub_epi32(u[0], u[3]); + v[1] = _mm_add_epi32(u[1], u[2]); + v[2] = _mm_sub_epi32(u[1], u[2]); + + v[5] = _mm_mullo_epi32(u[5], cospim32); + v[6] = _mm_mullo_epi32(u[6], cospi32); + v[5] = _mm_add_epi32(v[5], v[6]); + v[5] = _mm_add_epi32(v[5], rnding); + v[5] = _mm_srai_epi32(v[5], bit); + + u[0] = _mm_mullo_epi32(u[5], cospi32); + v[6] = _mm_mullo_epi32(u[6], cospim32); + v[6] = _mm_sub_epi32(u[0], v[6]); + v[6] = _mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + // stage 3 + // type 0 + v[0] = _mm_mullo_epi32(v[0], cospi32); + v[1] = _mm_mullo_epi32(v[1], cospi32); + u[0] = _mm_add_epi32(v[0], v[1]); + u[0] = _mm_add_epi32(u[0], rnding); + u[0] = _mm_srai_epi32(u[0], bit); + + u[1] = _mm_sub_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(u[1], rnding); + u[1] = _mm_srai_epi32(u[1], bit); + + // type 1 + v[0] = _mm_mullo_epi32(v[2], cospi48); + v[1] = _mm_mullo_epi32(v[3], cospi16); + u[2] = _mm_add_epi32(v[0], v[1]); + u[2] = _mm_add_epi32(u[2], rnding); + u[2] = _mm_srai_epi32(u[2], bit); + + v[0] = _mm_mullo_epi32(v[2], cospi16); + v[1] = _mm_mullo_epi32(v[3], cospi48); + u[3] = _mm_sub_epi32(v[1], v[0]); + u[3] = _mm_add_epi32(u[3], rnding); + u[3] = _mm_srai_epi32(u[3], bit); + + u[4] = _mm_add_epi32(v[4], v[5]); + u[5] = _mm_sub_epi32(v[4], v[5]); + u[6] = _mm_sub_epi32(v[7], v[6]); + u[7] = _mm_add_epi32(v[7], v[6]); + + // stage 4 + // stage 5 + v[0] = _mm_mullo_epi32(u[4], cospi56); + v[1] = _mm_mullo_epi32(u[7], cospi8); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[1 * col_num] = 
_mm_srai_epi32(v[0], bit); // buf0[4] + + v[0] = _mm_mullo_epi32(u[4], cospi8); + v[1] = _mm_mullo_epi32(u[7], cospi56); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[7 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[7] + + v[0] = _mm_mullo_epi32(u[5], cospi24); + v[1] = _mm_mullo_epi32(u[6], cospi40); + v[0] = _mm_add_epi32(v[0], v[1]); + v[0] = _mm_add_epi32(v[0], rnding); + out[5 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[5] + + v[0] = _mm_mullo_epi32(u[5], cospi40); + v[1] = _mm_mullo_epi32(u[6], cospi24); + v[0] = _mm_sub_epi32(v[1], v[0]); + v[0] = _mm_add_epi32(v[0], rnding); + out[3 * col_num] = _mm_srai_epi32(v[0], bit); // buf0[6] + + out[0 * col_num] = u[0]; // buf0[0] + out[4 * col_num] = u[1]; // buf0[1] + out[2 * col_num] = u[2]; // buf0[2] + out[6 * col_num] = u[3]; // buf0[3] +} + +static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + fdct4x8_sse4_1(in, out, bit, col_num); + fdct4x8_sse4_1(in + 1, out + 1, bit, col_num); +} + +static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospim4 = _mm_set1_epi32(-cospi[4]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospim20 = _mm_set1_epi32(-cospi[20]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospim36 = _mm_set1_epi32(-cospi[36]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i cospim52 = _mm_set1_epi32(-cospi[52]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x, y; + int col; + + // Note: + // Even column: 0, 2, ..., 14 + // Odd column: 1, 3, ..., 15 + // one even column plus one odd column constructs one row (8 coeffs) + // total we have 8 rows (8x8). 
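+ // Each 2x2 butterfly below is evaluated in fixed point as + // (x * c0 +/- y * c1 + (1 << (bit - 1))) >> bit: a 32-bit multiply + // against the cospi table, a rounding add, then an arithmetic shift.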
+ for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u0 = in[col_num * 0 + col]; + u1 = _mm_sub_epi32(zero, in[col_num * 7 + col]); + u2 = _mm_sub_epi32(zero, in[col_num * 3 + col]); + u3 = in[col_num * 4 + col]; + u4 = _mm_sub_epi32(zero, in[col_num * 1 + col]); + u5 = in[col_num * 6 + col]; + u6 = in[col_num * 2 + col]; + u7 = _mm_sub_epi32(zero, in[col_num * 5 + col]); + + // stage 2 + v0 = u0; + v1 = u1; + + x = _mm_mullo_epi32(u2, cospi32); + y = _mm_mullo_epi32(u3, cospi32); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + v3 = _mm_sub_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + v4 = u4; + v5 = u5; + + x = _mm_mullo_epi32(u6, cospi32); + y = _mm_mullo_epi32(u7, cospi32); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + v7 = _mm_sub_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); + + // stage 3 + u0 = _mm_add_epi32(v0, v2); + u1 = _mm_add_epi32(v1, v3); + u2 = _mm_sub_epi32(v0, v2); + u3 = _mm_sub_epi32(v1, v3); + u4 = _mm_add_epi32(v4, v6); + u5 = _mm_add_epi32(v5, v7); + u6 = _mm_sub_epi32(v4, v6); + u7 = _mm_sub_epi32(v5, v7); + + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm_mullo_epi32(u4, cospi16); + y = _mm_mullo_epi32(u5, cospi48); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi48); + y = _mm_mullo_epi32(u5, cospim16); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospim48); + y = _mm_mullo_epi32(u7, cospi16); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi16); + y = _mm_mullo_epi32(u7, cospi48); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 = _mm_srai_epi32(v7, bit); + + // stage 5 + u0 = _mm_add_epi32(v0, v4); + u1 = _mm_add_epi32(v1, v5); + u2 = _mm_add_epi32(v2, v6); + u3 = _mm_add_epi32(v3, v7); + u4 = _mm_sub_epi32(v0, v4); + u5 = _mm_sub_epi32(v1, v5); + u6 = _mm_sub_epi32(v2, v6); + u7 = _mm_sub_epi32(v3, v7); + + // stage 6 + x = _mm_mullo_epi32(u0, cospi4); + y = _mm_mullo_epi32(u1, cospi60); + v0 = _mm_add_epi32(x, y); + v0 = _mm_add_epi32(v0, rnding); + v0 = _mm_srai_epi32(v0, bit); + + x = _mm_mullo_epi32(u0, cospi60); + y = _mm_mullo_epi32(u1, cospim4); + v1 = _mm_add_epi32(x, y); + v1 = _mm_add_epi32(v1, rnding); + v1 = _mm_srai_epi32(v1, bit); + + x = _mm_mullo_epi32(u2, cospi20); + y = _mm_mullo_epi32(u3, cospi44); + v2 = _mm_add_epi32(x, y); + v2 = _mm_add_epi32(v2, rnding); + v2 = _mm_srai_epi32(v2, bit); + + x = _mm_mullo_epi32(u2, cospi44); + y = _mm_mullo_epi32(u3, cospim20); + v3 = _mm_add_epi32(x, y); + v3 = _mm_add_epi32(v3, rnding); + v3 = _mm_srai_epi32(v3, bit); + + x = _mm_mullo_epi32(u4, cospi36); + y = _mm_mullo_epi32(u5, cospi28); + v4 = _mm_add_epi32(x, y); + v4 = _mm_add_epi32(v4, rnding); + v4 = _mm_srai_epi32(v4, bit); + + x = _mm_mullo_epi32(u4, cospi28); + y = _mm_mullo_epi32(u5, cospim36); + v5 = _mm_add_epi32(x, y); + v5 = _mm_add_epi32(v5, rnding); + v5 = _mm_srai_epi32(v5, bit); + + x = _mm_mullo_epi32(u6, cospi52); + y = _mm_mullo_epi32(u7, cospi12); + v6 = _mm_add_epi32(x, y); + v6 = _mm_add_epi32(v6, rnding); + v6 = _mm_srai_epi32(v6, bit); + + x = _mm_mullo_epi32(u6, cospi12); + y = _mm_mullo_epi32(u7, cospim52); + v7 = _mm_add_epi32(x, y); + v7 = _mm_add_epi32(v7, rnding); + v7 
= _mm_srai_epi32(v7, bit); + + // stage 7 + out[col_num * 0 + col] = v1; + out[col_num * 1 + col] = v6; + out[col_num * 2 + col] = v3; + out[col_num * 3 + col] = v4; + out[col_num * 4 + col] = v5; + out[col_num * 5 + col] = v2; + out[col_num * 6 + col] = v7; + out[col_num * 7 + col] = v0; + } +} +static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + + for (int i = 0; i < col_num; i += 1) { + out[0 + 8 * i] = _mm_add_epi32(in[0 + 8 * i], in[0 + 8 * i]); + out[1 + 8 * i] = _mm_add_epi32(in[1 + 8 * i], in[1 + 8 * i]); + out[2 + 8 * i] = _mm_add_epi32(in[2 + 8 * i], in[2 + 8 * i]); + out[3 + 8 * i] = _mm_add_epi32(in[3 + 8 * i], in[3 + 8 * i]); + out[4 + 8 * i] = _mm_add_epi32(in[4 + 8 * i], in[4 + 8 * i]); + out[5 + 8 * i] = _mm_add_epi32(in[5 + 8 * i], in[5 + 8 * i]); + out[6 + 8 * i] = _mm_add_epi32(in[6 + 8 * i], in[6 + 8 * i]); + out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]); + } +} +static void idtx32x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + (void)col_num; + for (int j = 0; j < 2; j++) { + out[j + 8 * 0] = _mm_add_epi32(in[j + 8 * 0], in[j + 8 * 0]); + out[j + 8 * 1] = _mm_add_epi32(in[j + 8 * 1], in[j + 8 * 1]); + out[j + 8 * 2] = _mm_add_epi32(in[j + 8 * 2], in[j + 8 * 2]); + out[j + 8 * 3] = _mm_add_epi32(in[j + 8 * 3], in[j + 8 * 3]); + out[j + 8 * 4] = _mm_add_epi32(in[j + 8 * 4], in[j + 8 * 4]); + out[j + 8 * 5] = _mm_add_epi32(in[j + 8 * 5], in[j + 8 * 5]); + out[j + 8 * 6] = _mm_add_epi32(in[j + 8 * 6], in[j + 8 * 6]); + out[j + 8 * 7] = _mm_add_epi32(in[j + 8 * 7], in[j + 8 * 7]); + } +} +void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m128i in[16], out[16]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8]; + const int txw_idx = get_txw_idx(TX_8X8); + const int txh_idx = get_txh_idx(TX_8X8); + + switch (tx_type) { + case DCT_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case ADST_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case DCT_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case ADST_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case FLIPADST_DCT: + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, 
av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case DCT_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case FLIPADST_FLIPADST: + load_buffer_8x8(input, in, stride, 1, 1, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case ADST_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case FLIPADST_ADST: + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case IDTX: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case V_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case H_DCT: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fdct8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case V_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case H_ADST: + load_buffer_8x8(input, in, stride, 0, 0, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + case V_FLIPADST: + load_buffer_8x8(input, in, stride, 1, 0, shift[0]); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + 
case H_FLIPADST: + load_buffer_8x8(input, in, stride, 0, 1, shift[0]); + idtx8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + col_txfm_8x8_rounding(out, -shift[1]); + transpose_8x8(out, in); + fadst8x8_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], 2); + transpose_8x8(out, in); + write_buffer_8x8(in, coeff); + break; + default: assert(0); + } + (void)bd; +} + +// Hybrid Transform 16x16 + +static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) { + int row_index = 0; + int dst_index = 0; + int src_index = 0; + + // row 0, 1, .., 7 + do { + out[dst_index] = in[src_index]; + out[dst_index + 1] = in[src_index + 1]; + out[dst_index + 2] = in[src_index + 16]; + out[dst_index + 3] = in[src_index + 17]; + dst_index += 4; + src_index += 2; + row_index += 1; + } while (row_index < 8); + + // row 8, 9, ..., 15 + src_index += 16; + do { + out[dst_index] = in[src_index]; + out[dst_index + 1] = in[src_index + 1]; + out[dst_index + 2] = in[src_index + 16]; + out[dst_index + 3] = in[src_index + 17]; + dst_index += 4; + src_index += 2; + row_index += 1; + } while (row_index < 16); +} + +static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + __m128i in[64]; + // Load 4 8x8 blocks + const int16_t *topL = input; + const int16_t *topR = input + 8; + const int16_t *botL = input + 8 * stride; + const int16_t *botR = input + 8 * stride + 8; + + const int16_t *tmp; + + if (flipud) { + // Swap left columns + tmp = topL; + topL = botL; + botL = tmp; + // Swap right columns + tmp = topR; + topR = botR; + botR = tmp; + } + + if (fliplr) { + // Swap top rows + tmp = topL; + topL = topR; + topR = tmp; + // Swap bottom rows + tmp = botL; + botL = botR; + botR = tmp; + } + + // load first 8 columns + load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift); + load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift); + + // load second 8 columns + load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift); + load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift); + + convert_8x8_to_16x16(in, out); +} + +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + + load_buffer_8x8(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x8(botL, out + 16, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_8x4(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *topR = input + 4; + + const int16_t *tmp; + + if (fliplr) { + tmp = topL; + topL = topR; + topR = tmp; + } + + load_buffer_4x4(topL, out, stride, flipud, fliplr, shift); + load_buffer_4x4(topR, out + 4, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_16x4(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *topR = input + 8; + + const int16_t *tmp; + + if (fliplr) { + tmp = topL; + topL = topR; + topR = tmp; + } + + load_buffer_8x4(topL, out, stride, flipud, fliplr, shift); + load_buffer_8x4(topR, out + 8, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_4x8(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const 
int16_t *botL = input + 4 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + + load_buffer_4x4(topL, out, stride, flipud, fliplr, shift); + load_buffer_4x4(botL, out + 4, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_4x16(const int16_t *input, __m128i *out, + const int stride, const int flipud, + const int fliplr, const int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + load_buffer_4x8(topL, out, stride, flipud, fliplr, shift); + load_buffer_4x8(botL, out + 8, stride, flipud, fliplr, shift); +} + +static INLINE void load_buffer_32x8n(const int16_t *input, __m128i *out, + int stride, int flipud, int fliplr, + int shift, const int height) { + const int16_t *in = input; + __m128i *output = out; + for (int col = 0; col < height; col++) { + in = input + col * stride; + output = out + col * 8; + load_buffer_4x4(in, output, 4, flipud, fliplr, shift); + load_buffer_4x4((in + 16), (output + 4), 4, flipud, fliplr, shift); + } +} + +static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit, + const int col_num) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospim32 = _mm_set1_epi32(-cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi60 = _mm_set1_epi32(cospi[60]); + const __m128i cospi4 = _mm_set1_epi32(cospi[4]); + const __m128i cospi28 = _mm_set1_epi32(cospi[28]); + const __m128i cospi36 = _mm_set1_epi32(cospi[36]); + const __m128i cospi44 = _mm_set1_epi32(cospi[44]); + const __m128i cospi20 = _mm_set1_epi32(cospi[20]); + const __m128i cospi12 = _mm_set1_epi32(cospi[12]); + const __m128i cospi52 = _mm_set1_epi32(cospi[52]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + __m128i u[16], v[16], x; + int col; + + // Calculate the column 0, 1, 2, 3 + for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); + u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + 
col]); + + // stage 2 + v[0] = _mm_add_epi32(u[0], u[7]); + v[7] = _mm_sub_epi32(u[0], u[7]); + v[1] = _mm_add_epi32(u[1], u[6]); + v[6] = _mm_sub_epi32(u[1], u[6]); + v[2] = _mm_add_epi32(u[2], u[5]); + v[5] = _mm_sub_epi32(u[2], u[5]); + v[3] = _mm_add_epi32(u[3], u[4]); + v[4] = _mm_sub_epi32(u[3], u[4]); + v[8] = u[8]; + v[9] = u[9]; + + v[10] = _mm_mullo_epi32(u[10], cospim32); + x = _mm_mullo_epi32(u[13], cospi32); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospi32); + x = _mm_mullo_epi32(u[13], cospim32); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = _mm_mullo_epi32(u[11], cospim32); + x = _mm_mullo_epi32(u[12], cospi32); + v[11] = _mm_add_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[11], cospi32); + x = _mm_mullo_epi32(u[12], cospim32); + v[12] = _mm_sub_epi32(v[12], x); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + v[14] = u[14]; + v[15] = u[15]; + + // stage 3 + u[0] = _mm_add_epi32(v[0], v[3]); + u[3] = _mm_sub_epi32(v[0], v[3]); + u[1] = _mm_add_epi32(v[1], v[2]); + u[2] = _mm_sub_epi32(v[1], v[2]); + u[4] = v[4]; + + u[5] = _mm_mullo_epi32(v[5], cospim32); + x = _mm_mullo_epi32(v[6], cospi32); + u[5] = _mm_add_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[5], cospi32); + x = _mm_mullo_epi32(v[6], cospim32); + u[6] = _mm_sub_epi32(u[6], x); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[7] = v[7]; + u[8] = _mm_add_epi32(v[8], v[11]); + u[11] = _mm_sub_epi32(v[8], v[11]); + u[9] = _mm_add_epi32(v[9], v[10]); + u[10] = _mm_sub_epi32(v[9], v[10]); + u[12] = _mm_sub_epi32(v[15], v[12]); + u[15] = _mm_add_epi32(v[15], v[12]); + u[13] = _mm_sub_epi32(v[14], v[13]); + u[14] = _mm_add_epi32(v[14], v[13]); + + // stage 4 + u[0] = _mm_mullo_epi32(u[0], cospi32); + u[1] = _mm_mullo_epi32(u[1], cospi32); + v[0] = _mm_add_epi32(u[0], u[1]); + v[0] = _mm_add_epi32(v[0], rnding); + v[0] = _mm_srai_epi32(v[0], bit); + + v[1] = _mm_sub_epi32(u[0], u[1]); + v[1] = _mm_add_epi32(v[1], rnding); + v[1] = _mm_srai_epi32(v[1], bit); + + v[2] = _mm_mullo_epi32(u[2], cospi48); + x = _mm_mullo_epi32(u[3], cospi16); + v[2] = _mm_add_epi32(v[2], x); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_mullo_epi32(u[2], cospi16); + x = _mm_mullo_epi32(u[3], cospi48); + v[3] = _mm_sub_epi32(x, v[3]); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = _mm_add_epi32(u[4], u[5]); + v[5] = _mm_sub_epi32(u[4], u[5]); + v[6] = _mm_sub_epi32(u[7], u[6]); + v[7] = _mm_add_epi32(u[7], u[6]); + v[8] = u[8]; + + v[9] = _mm_mullo_epi32(u[9], cospim16); + x = _mm_mullo_epi32(u[14], cospi48); + v[9] = _mm_add_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[14] = _mm_mullo_epi32(u[9], cospi48); + x = _mm_mullo_epi32(u[14], cospim16); + v[14] = _mm_sub_epi32(v[14], x); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[10] = _mm_mullo_epi32(u[10], cospim48); + x = _mm_mullo_epi32(u[13], cospim16); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospim16); + x = 
_mm_mullo_epi32(u[13], cospim48); + v[13] = _mm_sub_epi32(v[13], x); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = u[11]; + v[12] = u[12]; + v[15] = u[15]; + + // stage 5 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm_mullo_epi32(v[4], cospi56); + x = _mm_mullo_epi32(v[7], cospi8); + u[4] = _mm_add_epi32(u[4], x); + u[4] = _mm_add_epi32(u[4], rnding); + u[4] = _mm_srai_epi32(u[4], bit); + + u[7] = _mm_mullo_epi32(v[4], cospi8); + x = _mm_mullo_epi32(v[7], cospi56); + u[7] = _mm_sub_epi32(x, u[7]); + u[7] = _mm_add_epi32(u[7], rnding); + u[7] = _mm_srai_epi32(u[7], bit); + + u[5] = _mm_mullo_epi32(v[5], cospi24); + x = _mm_mullo_epi32(v[6], cospi40); + u[5] = _mm_add_epi32(u[5], x); + u[5] = _mm_add_epi32(u[5], rnding); + u[5] = _mm_srai_epi32(u[5], bit); + + u[6] = _mm_mullo_epi32(v[5], cospi40); + x = _mm_mullo_epi32(v[6], cospi24); + u[6] = _mm_sub_epi32(x, u[6]); + u[6] = _mm_add_epi32(u[6], rnding); + u[6] = _mm_srai_epi32(u[6], bit); + + u[8] = _mm_add_epi32(v[8], v[9]); + u[9] = _mm_sub_epi32(v[8], v[9]); + u[10] = _mm_sub_epi32(v[11], v[10]); + u[11] = _mm_add_epi32(v[11], v[10]); + u[12] = _mm_add_epi32(v[12], v[13]); + u[13] = _mm_sub_epi32(v[12], v[13]); + u[14] = _mm_sub_epi32(v[15], v[14]); + u[15] = _mm_add_epi32(v[15], v[14]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm_mullo_epi32(u[8], cospi60); + x = _mm_mullo_epi32(u[15], cospi4); + v[8] = _mm_add_epi32(v[8], x); + v[8] = _mm_add_epi32(v[8], rnding); + v[8] = _mm_srai_epi32(v[8], bit); + + v[15] = _mm_mullo_epi32(u[8], cospi4); + x = _mm_mullo_epi32(u[15], cospi60); + v[15] = _mm_sub_epi32(x, v[15]); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + v[9] = _mm_mullo_epi32(u[9], cospi28); + x = _mm_mullo_epi32(u[14], cospi36); + v[9] = _mm_add_epi32(v[9], x); + v[9] = _mm_add_epi32(v[9], rnding); + v[9] = _mm_srai_epi32(v[9], bit); + + v[14] = _mm_mullo_epi32(u[9], cospi36); + x = _mm_mullo_epi32(u[14], cospi28); + v[14] = _mm_sub_epi32(x, v[14]); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[10] = _mm_mullo_epi32(u[10], cospi44); + x = _mm_mullo_epi32(u[13], cospi20); + v[10] = _mm_add_epi32(v[10], x); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[13] = _mm_mullo_epi32(u[10], cospi20); + x = _mm_mullo_epi32(u[13], cospi44); + v[13] = _mm_sub_epi32(x, v[13]); + v[13] = _mm_add_epi32(v[13], rnding); + v[13] = _mm_srai_epi32(v[13], bit); + + v[11] = _mm_mullo_epi32(u[11], cospi12); + x = _mm_mullo_epi32(u[12], cospi52); + v[11] = _mm_add_epi32(v[11], x); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = _mm_mullo_epi32(u[11], cospi52); + x = _mm_mullo_epi32(u[12], cospi12); + v[12] = _mm_sub_epi32(x, v[12]); + v[12] = _mm_add_epi32(v[12], rnding); + v[12] = _mm_srai_epi32(v[12], bit); + + out[0 * col_num + col] = v[0]; + out[1 * col_num + col] = v[8]; + out[2 * col_num + col] = v[4]; + out[3 * col_num + col] = v[12]; + out[4 * col_num + col] = v[2]; + out[5 * col_num + col] = v[10]; + out[6 * col_num + col] = v[6]; + out[7 * col_num + col] = v[14]; + out[8 * col_num + col] = v[1]; + out[9 * col_num + col] = v[9]; + out[10 * col_num + col] = v[5]; + out[11 * col_num + col] = v[13]; + out[12 * col_num + col] = v[3]; + out[13 * col_num + col] = v[11]; + out[14 * col_num + col] = v[7]; + out[15 * col_num + 
col] = v[15]; + } +} + +static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, + const int num_cols) { + const int32_t *cospi = cospi_arr(bit); + const __m128i cospi32 = _mm_set1_epi32(cospi[32]); + const __m128i cospi48 = _mm_set1_epi32(cospi[48]); + const __m128i cospi16 = _mm_set1_epi32(cospi[16]); + const __m128i cospim16 = _mm_set1_epi32(-cospi[16]); + const __m128i cospim48 = _mm_set1_epi32(-cospi[48]); + const __m128i cospi8 = _mm_set1_epi32(cospi[8]); + const __m128i cospi56 = _mm_set1_epi32(cospi[56]); + const __m128i cospim56 = _mm_set1_epi32(-cospi[56]); + const __m128i cospim8 = _mm_set1_epi32(-cospi[8]); + const __m128i cospi24 = _mm_set1_epi32(cospi[24]); + const __m128i cospim24 = _mm_set1_epi32(-cospi[24]); + const __m128i cospim40 = _mm_set1_epi32(-cospi[40]); + const __m128i cospi40 = _mm_set1_epi32(cospi[40]); + const __m128i cospi2 = _mm_set1_epi32(cospi[2]); + const __m128i cospi62 = _mm_set1_epi32(cospi[62]); + const __m128i cospim2 = _mm_set1_epi32(-cospi[2]); + const __m128i cospi10 = _mm_set1_epi32(cospi[10]); + const __m128i cospi54 = _mm_set1_epi32(cospi[54]); + const __m128i cospim10 = _mm_set1_epi32(-cospi[10]); + const __m128i cospi18 = _mm_set1_epi32(cospi[18]); + const __m128i cospi46 = _mm_set1_epi32(cospi[46]); + const __m128i cospim18 = _mm_set1_epi32(-cospi[18]); + const __m128i cospi26 = _mm_set1_epi32(cospi[26]); + const __m128i cospi38 = _mm_set1_epi32(cospi[38]); + const __m128i cospim26 = _mm_set1_epi32(-cospi[26]); + const __m128i cospi34 = _mm_set1_epi32(cospi[34]); + const __m128i cospi30 = _mm_set1_epi32(cospi[30]); + const __m128i cospim34 = _mm_set1_epi32(-cospi[34]); + const __m128i cospi42 = _mm_set1_epi32(cospi[42]); + const __m128i cospi22 = _mm_set1_epi32(cospi[22]); + const __m128i cospim42 = _mm_set1_epi32(-cospi[42]); + const __m128i cospi50 = _mm_set1_epi32(cospi[50]); + const __m128i cospi14 = _mm_set1_epi32(cospi[14]); + const __m128i cospim50 = _mm_set1_epi32(-cospi[50]); + const __m128i cospi58 = _mm_set1_epi32(cospi[58]); + const __m128i cospi6 = _mm_set1_epi32(cospi[6]); + const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); + const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); + const __m128i zero = _mm_setzero_si128(); + + __m128i u[16], v[16], x, y; + int col; + + for (col = 0; col < num_cols; ++col) { + // stage 0 + // stage 1 + u[0] = in[0 * num_cols + col]; + u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]); + u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]); + u[3] = in[8 * num_cols + col]; + u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]); + u[5] = in[12 * num_cols + col]; + u[6] = in[4 * num_cols + col]; + u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]); + u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]); + u[9] = in[14 * num_cols + col]; + u[10] = in[6 * num_cols + col]; + u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]); + u[12] = in[2 * num_cols + col]; + u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]); + u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]); + u[15] = in[10 * num_cols + col]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + + x = _mm_mullo_epi32(u[2], cospi32); + y = _mm_mullo_epi32(u[3], cospi32); + v[2] = _mm_add_epi32(x, y); + v[2] = _mm_add_epi32(v[2], rnding); + v[2] = _mm_srai_epi32(v[2], bit); + + v[3] = _mm_sub_epi32(x, y); + v[3] = _mm_add_epi32(v[3], rnding); + v[3] = _mm_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + x = _mm_mullo_epi32(u[6], cospi32); + y = _mm_mullo_epi32(u[7], cospi32); + v[6] = _mm_add_epi32(x, y); + v[6] = 
_mm_add_epi32(v[6], rnding); + v[6] = _mm_srai_epi32(v[6], bit); + + v[7] = _mm_sub_epi32(x, y); + v[7] = _mm_add_epi32(v[7], rnding); + v[7] = _mm_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + x = _mm_mullo_epi32(u[10], cospi32); + y = _mm_mullo_epi32(u[11], cospi32); + v[10] = _mm_add_epi32(x, y); + v[10] = _mm_add_epi32(v[10], rnding); + v[10] = _mm_srai_epi32(v[10], bit); + + v[11] = _mm_sub_epi32(x, y); + v[11] = _mm_add_epi32(v[11], rnding); + v[11] = _mm_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + x = _mm_mullo_epi32(u[14], cospi32); + y = _mm_mullo_epi32(u[15], cospi32); + v[14] = _mm_add_epi32(x, y); + v[14] = _mm_add_epi32(v[14], rnding); + v[14] = _mm_srai_epi32(v[14], bit); + + v[15] = _mm_sub_epi32(x, y); + v[15] = _mm_add_epi32(v[15], rnding); + v[15] = _mm_srai_epi32(v[15], bit); + + // stage 3 + u[0] = _mm_add_epi32(v[0], v[2]); + u[1] = _mm_add_epi32(v[1], v[3]); + u[2] = _mm_sub_epi32(v[0], v[2]); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[4] = _mm_add_epi32(v[4], v[6]); + u[5] = _mm_add_epi32(v[5], v[7]); + u[6] = _mm_sub_epi32(v[4], v[6]); + u[7] = _mm_sub_epi32(v[5], v[7]); + u[8] = _mm_add_epi32(v[8], v[10]); + u[9] = _mm_add_epi32(v[9], v[11]); + u[10] = _mm_sub_epi32(v[8], v[10]); + u[11] = _mm_sub_epi32(v[9], v[11]); + u[12] = _mm_add_epi32(v[12], v[14]); + u[13] = _mm_add_epi32(v[13], v[15]); + u[14] = _mm_sub_epi32(v[12], v[14]); + u[15] = _mm_sub_epi32(v[13], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); + + // stage 5 + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); + + 
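// half_btf_sse4_1(&c0, &x, &c1, &y, &rnding, bit), declared in + // highbd_txfm_utility_sse4.h, returns (c0 * x + c1 * y + rnding) >> bit; + // stages 4, 6 and 8 here are built entirely from that rounded butterfly. + 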
// stage 7 + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + // stage 8 + v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); + + // stage 9 + out[0 * num_cols + col] = v[1]; + out[1 * num_cols + col] = v[14]; + out[2 * num_cols + col] = v[3]; + out[3 * num_cols + col] = v[12]; + out[4 * num_cols + col] = v[5]; + out[5 * num_cols + col] = v[10]; + out[6 * num_cols + col] = v[7]; + out[7 * num_cols + col] = v[8]; + out[8 * num_cols + col] = v[9]; + out[9 * num_cols + col] = v[6]; + out[10 * num_cols + col] = v[11]; + out[11 * num_cols + col] = v[4]; + out[12 * num_cols + col] = v[13]; + out[13 * num_cols + col] = v[2]; + out[14 * num_cols + col] = v[15]; + out[15 * num_cols + col] = v[0]; + } +} + +static void col_txfm_16x16_rounding(__m128i *in, int shift) { + // Note: + // We split 16x16 rounding into 4 sections of 8x8 rounding, + // instead of 4 columns + col_txfm_8x8_rounding(&in[0], shift); + col_txfm_8x8_rounding(&in[16], shift); + col_txfm_8x8_rounding(&in[32], shift); + col_txfm_8x8_rounding(&in[48], shift); +} + +static void col_txfm_8x16_rounding(__m128i *in, int shift) { + col_txfm_8x8_rounding(&in[0], shift); + col_txfm_8x8_rounding(&in[16], shift); +} + +static void write_buffer_16x16(const __m128i *in, int32_t *output) { + const int size_8x8 = 16 * 4; + write_buffer_8x8(&in[0], output); + output += size_8x8; + write_buffer_8x8(&in[16], output); + output += size_8x8; + write_buffer_8x8(&in[32], output); + output += size_8x8; + write_buffer_8x8(&in[48], output); +} +static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) { + (void)bit; + __m128i fact = _mm_set1_epi32(2 * NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a_low; + + int num_iters = 16 * col_num; + for (int i = 0; i < num_iters; i++) { + a_low = _mm_mullo_epi32(in[i], fact); + a_low = 
_mm_add_epi32(a_low, offset); + out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits); + } +} +void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64], out[64]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X16]; + const int txw_idx = get_txw_idx(TX_16X16); + const int txh_idx = get_txh_idx(TX_16X16); + const int col_num = 4; + switch (tx_type) { + case DCT_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case ADST_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case DCT_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case ADST_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case FLIPADST_DCT: + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case DCT_FLIPADST: + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case FLIPADST_FLIPADST: + load_buffer_16x16(input, in, stride, 1, 1, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case ADST_FLIPADST: + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case FLIPADST_ADST: + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, 
av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case IDTX: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case V_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case H_DCT: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fdct16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case V_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case H_ADST: + load_buffer_16x16(input, in, stride, 0, 0, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case V_FLIPADST: + load_buffer_16x16(input, in, stride, 1, 0, shift[0]); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], + col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + case H_FLIPADST: + load_buffer_16x16(input, in, stride, 0, 1, shift[0]); + idtx16x16_sse4_1(in, out, av1_fwd_cos_bit_col[txw_idx][txh_idx], col_num); + col_txfm_16x16_rounding(out, -shift[1]); + transpose_16x16(out, in); + fadst16x16_sse4_1(in, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], + col_num); + transpose_16x16(out, in); + write_buffer_16x16(in, coeff); + break; + default: assert(0); + } + (void)bd; +} + +static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) { + for (int i = 0; i < size; i += 2) in[30 - i] = out[i]; + for (int i = 1; i < size; i += 2) in[size - i] = out[i]; +} + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + fadst8x8_sse4_1, // ADST_DCT + fdct8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fadst8x8_sse4_1, // FLIPADST_DCT + fdct8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + fdct8x8_sse4_1, // V_DCT + idtx8x8_sse4_1, // H_DCT + 
fadst8x8_sse4_1, // V_ADST + idtx8x8_sse4_1, // H_ADST + fadst8x8_sse4_1, // V_FLIPADST + idtx8x8_sse4_1 // H_FLIPADST +}; +static const fwd_transform_1d_sse4_1 row_highbd_txfm32x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x8_sse4_1, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL, // H_FLIPADST +}; +static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = { + fdct4x8_sse4_1, // DCT_DCT + fadst8x8_sse4_1, // ADST_DCT + fdct4x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fadst8x8_sse4_1, // FLIPADST_DCT + fdct4x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + fdct4x8_sse4_1, // V_DCT + idtx8x8_sse4_1, // H_DCT + fadst8x8_sse4_1, // V_ADST + idtx8x8_sse4_1, // H_ADST + fadst8x8_sse4_1, // V_FLIPADST + idtx8x8_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + fdct16x16_sse4_1, // ADST_DCT + fadst16x16_sse4_1, // DCT_ADST + fadst16x16_sse4_1, // ADST_ADST + fdct16x16_sse4_1, // FLIPADST_DCT + fadst16x16_sse4_1, // DCT_FLIPADST + fadst16x16_sse4_1, // FLIPADST_FLIPADST + fadst16x16_sse4_1, // ADST_FLIPADST + fadst16x16_sse4_1, // FLIPADST_ADST + idtx16x16_sse4_1, // IDTX + idtx16x16_sse4_1, // V_DCT + fdct16x16_sse4_1, // H_DCT + idtx16x16_sse4_1, // V_ADST + fadst16x16_sse4_1, // H_ADST + idtx16x16_sse4_1, // V_FLIPADST + fadst16x16_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + fadst16x16_sse4_1, // ADST_DCT + fdct16x16_sse4_1, // DCT_ADST + fadst16x16_sse4_1, // ADST_ADST + fadst16x16_sse4_1, // FLIPADST_DCT + fdct16x16_sse4_1, // DCT_FLIPADST + fadst16x16_sse4_1, // FLIPADST_FLIPADST + fadst16x16_sse4_1, // ADST_FLIPADST + fadst16x16_sse4_1, // FLIPADST_ADST + idtx16x16_sse4_1, // IDTX + fdct16x16_sse4_1, // V_DCT + idtx16x16_sse4_1, // H_DCT + fadst16x16_sse4_1, // V_ADST + idtx16x16_sse4_1, // H_ADST + fadst16x16_sse4_1, // V_FLIPADST + idtx16x16_sse4_1 // H_FLIPADST +}; +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = { + fdct8x8_sse4_1, // DCT_DCT + fdct8x8_sse4_1, // ADST_DCT + fadst8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fdct8x8_sse4_1, // FLIPADST_DCT + fadst8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + idtx8x8_sse4_1, // V_DCT + fdct8x8_sse4_1, // H_DCT + idtx8x8_sse4_1, // V_ADST + fadst8x8_sse4_1, // H_ADST + idtx8x8_sse4_1, // V_FLIPADST + fadst8x8_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = { + fdct4x8_sse4_1, // DCT_DCT + fdct4x8_sse4_1, // ADST_DCT + fadst8x8_sse4_1, // DCT_ADST + fadst8x8_sse4_1, // ADST_ADST + fdct4x8_sse4_1, // FLIPADST_DCT + fadst8x8_sse4_1, // DCT_FLIPADST + fadst8x8_sse4_1, // FLIPADST_FLIPADST + fadst8x8_sse4_1, // ADST_FLIPADST + fadst8x8_sse4_1, // FLIPADST_ADST + idtx8x8_sse4_1, // IDTX + idtx8x8_sse4_1, // V_DCT + fdct4x8_sse4_1, // H_DCT + idtx8x8_sse4_1, // V_ADST + fadst8x8_sse4_1, // H_ADST + idtx8x8_sse4_1, // V_FLIPADST + fadst8x8_sse4_1 // H_FLIPADST +}; + 
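+// These *_arr tables map each of the 16 TX_TYPE values to the 1-D kernel used for one pass of a 2-D transform: the col_* tables are indexed by the vertical stage, the row_* tables by the horizontal stage, and NULL marks combinations that have no SSE4.1 path for that block size. + // As a rough sketch (mirroring av1_fwd_txfm2d_16x8_sse4_1 below; flips, rounding shifts and transposes omitted): + // const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + // const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + // col_txfm(in, in, av1_fwd_cos_bit_col[txw_idx][txh_idx], num_col); + // row_txfm(out, out, av1_fwd_cos_bit_row[txw_idx][txh_idx], num_col); + 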
+static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = { + fdct4x4_sse4_1, // DCT_DCT + fdct4x4_sse4_1, // ADST_DCT + fadst4x4_sse4_1, // DCT_ADST + fadst4x4_sse4_1, // ADST_ADST + fdct4x4_sse4_1, // FLIPADST_DCT + fadst4x4_sse4_1, // DCT_FLIPADST + fadst4x4_sse4_1, // FLIPADST_FLIPADST + fadst4x4_sse4_1, // ADST_FLIPADST + fadst4x4_sse4_1, // FLIPADST_ADST + idtx4x4_sse4_1, // IDTX + idtx4x4_sse4_1, // V_DCT + fdct4x4_sse4_1, // H_DCT + idtx4x4_sse4_1, // V_ADST + fadst4x4_sse4_1, // H_ADST + idtx4x4_sse4_1, // V_FLIPADST + fadst4x4_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = { + fdct4x4_sse4_1, // DCT_DCT + fadst4x4_sse4_1, // ADST_DCT + fdct4x4_sse4_1, // DCT_ADST + fadst4x4_sse4_1, // ADST_ADST + fadst4x4_sse4_1, // FLIPADST_DCT + fdct4x4_sse4_1, // DCT_FLIPADST + fadst4x4_sse4_1, // FLIPADST_FLIPADST + fadst4x4_sse4_1, // ADST_FLIPADST + fadst4x4_sse4_1, // FLIPADST_ADST + idtx4x4_sse4_1, // IDTX + fdct4x4_sse4_1, // V_DCT + idtx4x4_sse4_1, // H_DCT + fadst4x4_sse4_1, // V_ADST + idtx4x4_sse4_1, // H_ADST + fadst4x4_sse4_1, // V_FLIPADST + idtx4x4_sse4_1 // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = { + av1_fdct32_sse4_1, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + av1_idtx32_sse4_1, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +static const fwd_transform_1d_sse4_1 row_highbd_txfm8x32_arr[TX_TYPES] = { + fdct16x16_sse4_1, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx16x16_sse4_1, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; + +void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[32], out[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 2; i++) { + load_buffer_8x8(input + i * 8, in, stride, ud_flip, 0, shift[0]); + col_txfm(in, in, bit, 2); + col_txfm_8x8_rounding(in, -shift[1]); + transpose_8x8(in, out + i * 16); + } + + if (lr_flip) { + flip_buf_sse4_1(in, out, 32); + row_txfm(in, out, bit, 2); + } else { + row_txfm(out, out, bit, 2); + } + + for (int i = 0; i < 2; i++) { + transpose_8x8(out + i * 16, in); + av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2); + write_buffer_16x8(in, coeff + i * 8, 16); + } + + (void)bd; +} + +void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[32], out[32]; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const fwd_transform_1d_sse4_1 
row_txfm = row_highbd_txfm8x8_arr[tx_type]; + int bit = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, in, bit, 2); + col_txfm_8x16_rounding(in, -shift[1]); + transpose_8x8(in, out); + transpose_8x8(in + 16, out + 16); + + for (int i = 0; i < 2; i++) { + row_txfm(out + i * 16, out, bit, 2); + transpose_8x8(out, in); + av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2); + write_buffer_8x8(in, coeff + i * 64); + } + + (void)bd; +} + +void av1_fwd_txfm2d_4x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[16]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X16]; + const int txw_idx = get_txw_idx(TX_4X16); + const int txh_idx = get_txh_idx(TX_4X16); + const int txfm_size_col = tx_size_wide[TX_4X16]; + const int txfm_size_row = tx_size_high[TX_4X16]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // col transform + load_buffer_4x16(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, outcoeff128, bitcol, 1); + col_txfm_8x8_rounding(outcoeff128, -shift[1]); + transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < txfm_size_col; i++) { + row_txfm(in + i, outcoeff128 + i * txfm_size_col, bitrow, txfm_size_col); + } + (void)bd; +} + +void av1_fwd_txfm2d_16x4_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[16]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X4]; + const int txw_idx = get_txw_idx(TX_16X4); + const int txh_idx = get_txh_idx(TX_16X4); + const int txfm_size_col = tx_size_wide[TX_16X4]; + const int txfm_size_row = tx_size_high[TX_16X4]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + // col transform + load_buffer_16x4(input, in, stride, ud_flip, lr_flip, shift[0]); + + for (int i = 0; i < txfm_size_row; i++) { + col_txfm(in + i * txfm_size_row, outcoeff128 + i * txfm_size_row, bitcol, + 1); + } + col_txfm_8x8_rounding(outcoeff128, -shift[1]); + + // row transform + row_txfm(outcoeff128, in, bitrow, 1); + transpose_8nx8n(in, outcoeff128, txfm_size_row, txfm_size_col); + (void)bd; +} + +void av1_fwd_txfm2d_16x32_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[128]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X32]; + const int txw_idx = get_txw_idx(TX_16X32); + const int txh_idx = get_txh_idx(TX_16X32); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x32_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + // column transform + load_buffer_16x16(input, in, 
stride, 0, 0, shift[0]); + load_buffer_16x16(input + 16 * stride, in + 64, stride, 0, 0, shift[0]); + + for (int i = 0; i < 4; i++) { + col_txfm((in + i), (in + i), bitcol, 4); + } + col_txfm_16x16_rounding(&in[0], -shift[1]); + col_txfm_16x16_rounding(&in[64], -shift[1]); + transpose_8nx8n(in, outcoef128, 16, 32); + + // row transform + row_txfm(outcoef128, in, bitrow, 8); + transpose_8nx8n(in, outcoef128, 32, 16); + av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 128, -shift[2], + NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + (void)tx_type; + __m128i in[512]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X64]; + const int txw_idx = get_txw_idx(TX_32X64); + const int txh_idx = get_txh_idx(TX_32X64); + const int txfm_size_col = tx_size_wide[TX_32X64]; + const int txfm_size_row = tx_size_high[TX_32X64]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int num_row = txfm_size_row >> 2; + const int num_col = txfm_size_col >> 2; + + // column transform + load_buffer_32x8n(input, in, stride, 0, 0, shift[0], txfm_size_row); + for (int i = 0; i < num_col; i++) { + av1_fdct64_sse4_1((in + i), (in + i), bitcol, num_col, num_col); + } + for (int i = 0; i < num_col; i++) { + col_txfm_16x16_rounding((in + i * txfm_size_row), -shift[1]); + } + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < num_row; i++) { + av1_fdct32_sse4_1((outcoef128 + i), (in + i), bitrow, num_row); + } + transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col); + av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 512, -shift[2], + NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + (void)tx_type; + __m128i in[512]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X32]; + const int txw_idx = get_txw_idx(TX_64X32); + const int txh_idx = get_txh_idx(TX_64X32); + const int txfm_size_col = tx_size_wide[TX_64X32]; + const int txfm_size_row = tx_size_high[TX_64X32]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const int num_row = txfm_size_row >> 2; + const int num_col = txfm_size_col >> 2; + + // column transform + for (int i = 0; i < 32; i++) { + load_buffer_4x4(input + 0 + i * stride, in + 0 + i * 16, 4, 0, 0, shift[0]); + load_buffer_4x4(input + 16 + i * stride, in + 4 + i * 16, 4, 0, 0, + shift[0]); + load_buffer_4x4(input + 32 + i * stride, in + 8 + i * 16, 4, 0, 0, + shift[0]); + load_buffer_4x4(input + 48 + i * stride, in + 12 + i * 16, 4, 0, 0, + shift[0]); + } + + for (int i = 0; i < num_col; i++) { + av1_fdct32_sse4_1((in + i), (in + i), bitcol, num_col); + } + + for (int i = 0; i < num_row; i++) { + col_txfm_16x16_rounding((in + i * txfm_size_col), -shift[1]); + } + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < num_row; i++) { + av1_fdct64_sse4_1((outcoef128 + i), (in + i), bitrow, num_row, num_row); + } + transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col >> 1); + av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 512 >> 1, + -shift[2], NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_32x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE 
tx_type, int bd) { + __m128i in[128]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16]; + const int txw_idx = get_txw_idx(TX_32X16); + const int txh_idx = get_txh_idx(TX_32X16); + const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm8x32_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + // column transform + load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 16); + col_txfm(in, in, bitcol, 8); + col_txfm_16x16_rounding(&in[0], -shift[1]); + col_txfm_16x16_rounding(&in[64], -shift[1]); + transpose_8nx8n(in, outcoef128, 32, 16); + + // row transform + for (int i = 0; i < 4; i++) { + row_txfm((outcoef128 + i), (in + i), bitrow, 4); + } + transpose_8nx8n(in, outcoef128, 16, 32); + av1_round_shift_rect_array_32_sse4_1(outcoef128, outcoef128, 128, -shift[2], + NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_8x32_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X32]; + const int txw_idx = get_txw_idx(TX_8X32); + const int txh_idx = get_txh_idx(TX_8X32); + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x32_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm32x8_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + const int txfm_size_col = tx_size_wide[TX_8X32]; + const int txfm_size_row = tx_size_high[TX_8X32]; + const int num_col = txfm_size_col >> 2; + + // column transform + load_buffer_8x16(input, in, stride, 0, 0, shift[0]); + load_buffer_8x16(input + (txfm_size_row >> 1) * stride, in + txfm_size_row, + stride, 0, 0, shift[0]); + + for (int i = 0; i < num_col; i++) { + col_txfm((in + i), (in + i), bitcol, num_col); + } + col_txfm_16x16_rounding(in, -shift[1]); + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < txfm_size_col; i += 2) { + row_txfm((outcoef128 + i), (in + i), bitrow, txfm_size_col); + } + transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col); + (void)bd; +} + +void av1_fwd_txfm2d_32x8_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[64]; + __m128i *outcoef128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X8]; + const int txw_idx = get_txw_idx(TX_32X8); + const int txh_idx = get_txh_idx(TX_32X8); + const fwd_transform_1d_sse4_1 col_txfm = row_highbd_txfm32x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = col_highbd_txfm8x32_arr[tx_type]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + + const int txfm_size_col = tx_size_wide[TX_32X8]; + const int txfm_size_row = tx_size_high[TX_32X8]; + const int num_col = txfm_size_row >> 2; + + // column transform + load_buffer_32x8n(input, in, stride, 0, 0, shift[0], 8); + for (int i = 0; i < txfm_size_row; i += 2) { + col_txfm((in + i), (in + i), bitcol, txfm_size_row); + } + + col_txfm_16x16_rounding(&in[0], -shift[1]); + transpose_8nx8n(in, outcoef128, txfm_size_col, txfm_size_row); + + // row transform + for (int i = 0; i < num_col; i++) { + row_txfm((outcoef128 + i), (in + i), bitrow, num_col); + } + transpose_8nx8n(in, outcoef128, txfm_size_row, txfm_size_col); + (void)bd; +} + 
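+// The 2-D kernels above and below share roughly the same outline: load_buffer_*() reads the residue block, applies the up/down and left/right flips required by the FLIPADST types and pre-shifts by shift[0]; the column pass runs and is rounded by -shift[1]; transpose_8x8()/transpose_8nx8n() reorient the data so the row pass can reuse the same column-oriented 1-D kernels; a final transpose writes the coefficients out. + // Block sizes whose width:height ratio is 2:1 or 1:2 carry an irrational sqrt(2) factor in their normalization, which is why those kernels end with av1_round_shift_rect_array_32_sse4_1(..., NewSqrt2): on top of the -shift[2] rounding it multiplies by NewSqrt2 / 2^NewSqrt2Bits (5793 / 4096, roughly 1.41). + 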
+void av1_fwd_txfm2d_4x8_sse4_1(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m128i in[8]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_4X8]; + const int txw_idx = get_txw_idx(TX_4X8); + const int txh_idx = get_txh_idx(TX_4X8); + const int txfm_size_col = tx_size_wide[TX_4X8]; + const int txfm_size_row = tx_size_high[TX_4X8]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x8_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x4_arr[tx_type]; + + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + load_buffer_4x8(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, in, bitcol, 1); + col_txfm_4x8_rounding(in, -shift[1]); + transpose_8nx8n(in, outcoeff128, txfm_size_col, txfm_size_row); + + for (int i = 0; i < 2; i++) { + row_txfm(outcoeff128 + i, in + i * txfm_size_col, bitrow, 2); + } + av1_round_shift_rect_array_32_sse4_1(in, outcoeff128, txfm_size_row, + -shift[2], NewSqrt2); + (void)bd; +} + +void av1_fwd_txfm2d_8x4_sse4_1(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m128i in[8]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X4]; + const int txw_idx = get_txw_idx(TX_8X4); + const int txh_idx = get_txh_idx(TX_8X4); + const int txfm_size_col = tx_size_wide[TX_8X4]; + const int txfm_size_row = tx_size_high[TX_8X4]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm4x4_arr[tx_type]; + const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm4x8_arr[tx_type]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // col transform + load_buffer_8x4(input, in, stride, ud_flip, lr_flip, shift[0]); + for (int i = 0; i < 2; i++) { + col_txfm(in + i * txfm_size_row, in + i * txfm_size_row, bitcol, 1); + } + col_txfm_4x8_rounding(in, -shift[1]); + + // row transform + row_txfm(in, outcoeff128, bitrow, 1); + av1_round_shift_rect_array_32_sse4_1(outcoeff128, in, txfm_size_col, + -shift[2], NewSqrt2); + transpose_8nx8n(in, outcoeff128, txfm_size_row, txfm_size_col); + (void)bd; +} + +void av1_fwd_txfm2d_16x64_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[256]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X64]; + const int txw_idx = get_txw_idx(TX_16X64); + const int txh_idx = get_txh_idx(TX_16X64); + const int txfm_size_col = tx_size_wide[TX_16X64]; + const int txfm_size_row = tx_size_high[TX_16X64]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + const int num_col = txfm_size_col >> 2; + // col transform + for (int i = 0; i < txfm_size_row; i += num_col) { + load_buffer_4x4(input + (i + 0) * stride, in + (i + 0) * num_col, num_col, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + (i + 1) * stride, in + (i + 1) * num_col, num_col, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + (i + 2) * stride, in + (i + 2) * num_col, num_col, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + (i + 3) * stride, in + (i + 3) * num_col, num_col, + ud_flip, lr_flip, shift[0]); + } + + for (int i = 0; i < 
num_col; i++) { + av1_fdct64_sse4_1(in + i, outcoeff128 + i, bitcol, num_col, num_col); + } + + col_txfm_16x16_rounding(outcoeff128, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]); + + transpose_8nx8n(outcoeff128, in, txfm_size_col, 32); + fdct16x16_sse4_1(in, in, bitrow, 8); + transpose_8nx8n(in, outcoeff128, 32, txfm_size_col); + memset(coeff + txfm_size_col * 32, 0, txfm_size_col * 32 * sizeof(*coeff)); + (void)bd; +} + +void av1_fwd_txfm2d_64x16_sse4_1(const int16_t *input, int32_t *coeff, + int stride, TX_TYPE tx_type, int bd) { + __m128i in[256]; + __m128i *outcoeff128 = (__m128i *)coeff; + const int8_t *shift = av1_fwd_txfm_shift_ls[TX_64X16]; + const int txw_idx = get_txw_idx(TX_64X16); + const int txh_idx = get_txh_idx(TX_64X16); + const int txfm_size_col = tx_size_wide[TX_64X16]; + const int txfm_size_row = tx_size_high[TX_64X16]; + int bitcol = av1_fwd_cos_bit_col[txw_idx][txh_idx]; + int bitrow = av1_fwd_cos_bit_row[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + // col transform + for (int i = 0; i < txfm_size_row; i++) { + load_buffer_4x4(input + 0 + i * stride, in + 0 + i * txfm_size_row, 4, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + 16 + i * stride, in + 4 + i * txfm_size_row, 4, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + 32 + i * stride, in + 8 + i * txfm_size_row, 4, + ud_flip, lr_flip, shift[0]); + load_buffer_4x4(input + 48 + i * stride, in + 12 + i * txfm_size_row, 4, + ud_flip, lr_flip, shift[0]); + } + + fdct16x16_sse4_1(in, outcoeff128, bitcol, txfm_size_row); + col_txfm_16x16_rounding(outcoeff128, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 64, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 128, -shift[1]); + col_txfm_16x16_rounding(outcoeff128 + 192, -shift[1]); + + transpose_8nx8n(outcoeff128, in, txfm_size_col, txfm_size_row); + for (int i = 0; i < 4; i++) { + av1_fdct64_sse4_1(in + i, in + i, bitrow, 4, 4); + } + transpose_8nx8n(in, outcoeff128, txfm_size_row, 32); + (void)bd; +} diff --git a/libs/libaom/src/av1/encoder/x86/ml_sse3.c b/libs/libaom/src/av1/encoder/x86/ml_sse3.c new file mode 100644 index 000000000..89b1e6a05 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/ml_sse3.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdbool.h> +#include <assert.h> +#include <pmmintrin.h> + +#include "config/av1_rtcd.h" +#include "av1/encoder/ml.h" + +// In order to avoid the high latency of swapping between FPU and SIMD + // operations, we keep the result in a 128-bit register even though we only + // care about a single value. 
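+// For reference, each nn_propagate_<M>to<N>() helper below is a vectorized form of (roughly) this scalar accumulation, propagating M inputs into one of N output nodes (the caller seeds the output with the bias): + // float total = 0.0f; + // for (int in = 0; in < M; ++in) total += inputs[in] * weights[in]; // one row of the weight matrix + // *output += total; + // The _mm_hadd_ps() cascades fold the per-lane partial products down to that single sum without leaving the SSE registers. + 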
+static void nn_propagate_8to1(const float *const inputs, + const float *const weights, + __m128 *const output) { + const __m128 inputs_h = _mm_loadu_ps(&inputs[4]); + const __m128 inputs_l = _mm_loadu_ps(inputs); + + const __m128 weights_h = _mm_loadu_ps(&weights[4]); + const __m128 weights_l = _mm_loadu_ps(weights); + + const __m128 mul_h = _mm_mul_ps(inputs_h, weights_h); + const __m128 mul_l = _mm_mul_ps(inputs_l, weights_l); + // [7 6 5 4] [3 2 1 0] (weight and input indices) + + const __m128 vadd = _mm_add_ps(mul_l, mul_h); + // [7+3 6+2 5+1 4+0] + const __m128 hadd1 = _mm_hadd_ps(vadd, vadd); + // [7+6+3+2 5+4+1+0 7+6+3+2 5+4+1+0] + const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); + // [7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0 7+6+5+4+3+2+1+0] + *output = _mm_add_ps(*output, hadd2); +} + +static void nn_propagate_4to1(const float *const inputs, + const float *const weights, + __m128 *const output) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + const __m128 weights128 = _mm_loadu_ps(weights); + + const __m128 mul = _mm_mul_ps(inputs128, weights128); + // [3 2 1 0] (weight and input indices) + + const __m128 hadd1 = _mm_hadd_ps(mul, mul); + // [3+2 1+0 3+2 1+0] + const __m128 hadd2 = _mm_hadd_ps(hadd1, hadd1); + // [3+2+1+0 3+2+1+0 3+2+1+0 3+2+1+0] + *output = _mm_add_ps(*output, hadd2); +} + +static void nn_propagate_4to4(const float *const inputs, + const float *const weights, __m128 *const outputs, + const int num_inputs) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + __m128 hadd[2]; + for (int i = 0; i < 2; i++) { // For each pair of outputs + const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]); + const __m128 mul0 = _mm_mul_ps(weight0, inputs128); + const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]); + const __m128 mul1 = _mm_mul_ps(weight1, inputs128); + hadd[i] = _mm_hadd_ps(mul0, mul1); + } + // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices) + // hadd[1] = [15+14 13+12 11+10 9+8] + + const __m128 hh = _mm_hadd_ps(hadd[0], hadd[1]); + // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0] + + *outputs = _mm_add_ps(*outputs, hh); +} + +static void nn_propagate_4to8(const float *const inputs, + const float *const weights, __m128 *const out_h, + __m128 *const out_l, const int num_inputs) { + const __m128 inputs128 = _mm_loadu_ps(inputs); + + __m128 hadd[4]; + for (int i = 0; i < 4; i++) { // For each pair of outputs + const __m128 weight0 = _mm_loadu_ps(&weights[2 * i * num_inputs]); + const __m128 weight1 = _mm_loadu_ps(&weights[(2 * i + 1) * num_inputs]); + const __m128 mul0 = _mm_mul_ps(inputs128, weight0); + const __m128 mul1 = _mm_mul_ps(inputs128, weight1); + hadd[i] = _mm_hadd_ps(mul0, mul1); + } + // hadd[0] = [7+6 5+4 3+2 1+0] (weight indices) + // hadd[1] = [15+14 13+12 11+10 9+8] + // hadd[2] = [23+22 21+20 19+18 17+16] + // hadd[3] = [31+30 29+28 27+26 25+24] + + const __m128 hh0 = _mm_hadd_ps(hadd[0], hadd[1]); + // [15+14+13+12 11+10+9+8 7+6+5+4 3+2+1+0] + const __m128 hh1 = _mm_hadd_ps(hadd[2], hadd[3]); + // [31+30+29+28 27+26+25+24 23+22+21+20 19+18+17+16] + + *out_h = _mm_add_ps(*out_h, hh1); + *out_l = _mm_add_ps(*out_l, hh0); +} + +static void nn_propagate_8to4(const float *const inputs, + const float *const weights, __m128 *const outputs, + const int num_inputs) { + const __m128 inputs_h = _mm_loadu_ps(inputs + 4); + const __m128 inputs_l = _mm_loadu_ps(inputs); + // [7 6 5 4] [3 2 1 0] (input indices) + + __m128 add[4]; + for (int i = 0; i < 4; i++) { // For each output: + const __m128 weight_h = 
_mm_loadu_ps(&weights[i * num_inputs + 4]); + const __m128 weight_l = _mm_loadu_ps(&weights[i * num_inputs]); + const __m128 mul_h = _mm_mul_ps(inputs_h, weight_h); + const __m128 mul_l = _mm_mul_ps(inputs_l, weight_l); + add[i] = _mm_add_ps(mul_l, mul_h); + } + // add[0] = [7+3 6+2 5+1 4+0] + // add[1] = [15+11 14+10 13+9 12+8] + // add[2] = [23+19 22+18 21+17 20+16] + // add[3] = [31+27 30+26 29+25 28+24] + + const __m128 hadd_h = _mm_hadd_ps(add[2], add[3]); + // [31+30+27+26 29+28+25+24 23+22+19+18 21+20+17+16] + const __m128 hadd_l = _mm_hadd_ps(add[0], add[1]); + // [15+14+11+10 13+12+9+8 7+6+3+2 5+4+1+0] + + const __m128 haddhadd = _mm_hadd_ps(hadd_l, hadd_h); + // [31+30+29+28+27+26+25+24 23+22+21+20+19+18+17+16 + // 15+14+13+12+11+10+9+8 7+6+5+4+3+2+1+0] + + *outputs = _mm_add_ps(*outputs, haddhadd); +} + +static void nn_activate8(__m128 *out_h, __m128 *out_l) { + const __m128 zero = _mm_setzero_ps(); + *out_h = _mm_max_ps(*out_h, zero); + *out_l = _mm_max_ps(*out_l, zero); +} + +static void nn_activate4(__m128 *x) { *x = _mm_max_ps(*x, _mm_setzero_ps()); } + +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +void av1_nn_predict_sse3(const float *input_nodes, + const NN_CONFIG *const nn_config, int reduce_prec, + float *const output) { + float buf[2][NN_MAX_NODES_PER_LAYER]; + int buf_index = 0; + int num_inputs = nn_config->num_inputs; + + // Hidden layers, except the final iteration is the output layer. + for (int layer = 0; layer <= nn_config->num_hidden_layers; layer++) { + const float *layer_weights = nn_config->weights[layer]; + const float *layer_bias = nn_config->bias[layer]; + bool output_layer = (layer == nn_config->num_hidden_layers); + float *const output_nodes = output_layer ? output : &buf[buf_index][0]; + const int num_outputs = output_layer ? 
nn_config->num_outputs + : nn_config->num_hidden_nodes[layer]; + + if (num_inputs % 4 == 0 && num_outputs % 8 == 0) { + for (int out = 0; out < num_outputs; out += 8) { + __m128 out_h = _mm_loadu_ps(&layer_bias[out + 4]); + __m128 out_l = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + nn_propagate_4to8(&input_nodes[in], + &layer_weights[out * num_inputs + in], &out_h, + &out_l, num_inputs); + } + if (!output_layer) nn_activate8(&out_h, &out_l); + _mm_storeu_ps(&output_nodes[out + 4], out_h); + _mm_storeu_ps(&output_nodes[out], out_l); + } + } else if (num_inputs % 8 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + __m128 outputs = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 8) { + nn_propagate_8to4(&input_nodes[in], + &layer_weights[out * num_inputs + in], &outputs, + num_inputs); + } + if (!output_layer) nn_activate4(&outputs); + _mm_storeu_ps(&output_nodes[out], outputs); + } + } else if (num_inputs % 4 == 0 && num_outputs % 4 == 0) { + for (int out = 0; out < num_outputs; out += 4) { + __m128 outputs = _mm_loadu_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + nn_propagate_4to4(&input_nodes[in], + &layer_weights[out * num_inputs + in], &outputs, + num_inputs); + } + if (!output_layer) nn_activate4(&outputs); + _mm_storeu_ps(&output_nodes[out], outputs); + } + } else if (num_inputs % 8 == 0) { + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 8) { + nn_propagate_8to1(&input_nodes[in], + &layer_weights[out * num_inputs + in], &total); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } else if (num_inputs % 4 == 0) { + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in = 0; in < num_inputs; in += 4) { + nn_propagate_4to1(&input_nodes[in], + &layer_weights[out * num_inputs + in], &total); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } else { + // Use SSE instructions for scalar operations to avoid the latency of + // swapping between SIMD and FPU modes. + for (int out = 0; out < num_outputs; out++) { + __m128 total = _mm_load1_ps(&layer_bias[out]); + for (int in_node = 0; in_node < num_inputs; in_node++) { + __m128 input = _mm_load1_ps(&input_nodes[in_node]); + __m128 weight = + _mm_load1_ps(&layer_weights[num_inputs * out + in_node]); + total = _mm_add_ps(total, _mm_mul_ps(input, weight)); + } + if (!output_layer) nn_activate4(&total); + output_nodes[out] = _mm_cvtss_f32(total); + } + } + input_nodes = output_nodes; + num_inputs = num_outputs; + buf_index = 1 - buf_index; + } + if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs); +} diff --git a/libs/libaom/src/av1/encoder/x86/pickrst_avx2.c b/libs/libaom/src/av1/encoder/x86/pickrst_avx2.c new file mode 100644 index 000000000..f8703a23c --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/pickrst_avx2.c @@ -0,0 +1,1084 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> // AVX2 +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/transpose_sse2.h" + +#include "config/av1_rtcd.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" + +static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src, + const __m128i *shuffle, const __m256i *kl) { + const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle); + const __m256i d0 = _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(s)); + const __m256i dst0 = yy_load_256(dst); + const __m256i r0 = _mm256_add_epi32(dst0, d0); + yy_store_256(dst, r0); +} + +static INLINE void acc_stat_win7_one_line_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN], + int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN; + for (j = h_start; j < h_end; j += 2) { + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint8_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m256i kl = + _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win7_opt_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } }; + + DECLARE_ALIGNED(32, int32_t, + H_int32[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } }; + int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win7_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += 
M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2; ++k) { + for (l = 0; l < WIENER_WIN * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = + M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + (int64_t)avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd, + const __m256i *shuffle, + const __m256i *dgd_ijkl) { + // Load two 128-bit chunks from dgd + const __m256i s0 = _mm256_inserti128_si256( + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)dgd)), + _mm_loadu_si128((__m128i *)(dgd + 4)), 1); + // s0 = [11 10 9 8 7 6 5 4] [7 6 5 4 3 2 1 0] as u16 (values are dgd indices) + // The weird order is so the shuffle stays within 128-bit lanes + + // Shuffle 16x u16 values within lanes according to the mask: + // [0 1 1 2 2 3 3 4] [0 1 1 2 2 3 3 4] + // (Actually we shuffle u8 values as there's no 16-bit shuffle) + const __m256i s1 = _mm256_shuffle_epi8(s0, *shuffle); + // s1 = [8 7 7 6 6 5 5 4] [4 3 3 2 2 1 1 0] as u16 (values are dgd indices) + + // Multiply 16x 16-bit integers in dgd_ijkl and s1, resulting in 16x 32-bit + // integers then horizontally add pairs of these integers resulting in 8x + // 32-bit integers + const __m256i d0 = _mm256_madd_epi16(*dgd_ijkl, s1); + // d0 = [a b c d] [e f g h] as u32 + + // Take the lower-half of d0, extend to u64, add it on to dst (H) + const __m256i d0l = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 0)); + // d0l = [a b] [c d] as u64 + const __m256i dst0 = yy_load_256(dst); + yy_store_256(dst, _mm256_add_epi64(d0l, dst0)); + + // Take the upper-half of d0, extend to u64, add it on to dst (H) + const __m256i d0h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 1)); + // d0h = [e f] [g h] as u64 + const __m256i dst1 = yy_load_256(dst + 4); + yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1)); +} + +static INLINE void acc_stat_highbd_win7_one_line_avx2( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m256i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN], + int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN; + for (j = h_start; j < h_end; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd_ijkl combined as a u32, + // then broadcast to 8x u32 slots of a 256 + const __m256i dgd_ijkl = + _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l))); + // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16 + + acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + 
acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win7_opt_avx2( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } }; + DECLARE_ALIGNED(32, int64_t, H_int[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win7_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +static INLINE void acc_stat_highbd_win5_one_line_avx2( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m256i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN_CHROMA; + for (j = h_start; j < h_end; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd_ijkl combined as a u32, + // then broadcast to 8x 
u32 slots of a 256 + const __m256i dgd_ijkl = + _mm256_set1_epi32(*((uint32_t *)(dgd_ijk + l))); + // dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16 + + acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win5_opt_avx2( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + DECLARE_ALIGNED( + 32, int64_t, + H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win5_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int64, H_int64); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int64[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + if (wiener_win == WIENER_WIN) { + compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else { + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, bit_depth); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void acc_stat_win5_one_line_avx2( + const 
uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN_CHROMA; + for (j = h_start; j < h_end; j += 2) { + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint8_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m256i kl = + _mm256_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win5_opt_avx2( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + DECLARE_ALIGNED( + 32, int32_t, + H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } }; + int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win5_one_line_avx2( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2_CHROMA; ++k) { + for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = + M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + (int64_t)avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int 
h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else { + av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } +} + +static INLINE __m256i pair_set_epi16(int a, int b) { + return _mm256_set1_epi32( + (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +int64_t av1_lowbd_pixel_proj_error_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); + __m256i sum64 = _mm256_setzero_si256(); + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt0_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt0 + j), + yy_loadu_256(flt0 + j + 8)), + 0xd8); + const __m256i flt1_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt1 + j), + yy_loadu_256(flt1 + j + 8)), + 0xd8); + const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS); + const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0); + const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0); + const __m256i v0 = _mm256_madd_epi16( + xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m256i v1 = _mm256_madd_epi16( + xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m256i xq_coeff = + pair_set_epi16(xq_active, (-xq_active * (1 << SGRPROJ_RST_BITS))); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i flt_16b = _mm256_permute4x64_epi64( + _mm256_packs_epi32(yy_loadu_256(flt + j), + yy_loadu_256(flt + j + 8)), + 0xd8); + const __m256i v0 = + _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0)); + const __m256i v1 = + _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0)); + const __m256i vr0 = + _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift); + const __m256i vr1 = + _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift); + const __m256i e0 = _mm256_sub_epi16( + _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0); + const __m256i err0 = _mm256_madd_epi16(e0, e0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_active * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum64_0); + sum64 = _mm256_add_epi64(sum64, sum64_1); + } + } else { + __m256i sum32 = _mm256_setzero_si256(); + for (i = 0; i < height; ++i) { + for (j = 0; j <= width - 16; j += 16) { + const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j)); + const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j)); + const __m256i diff0 = _mm256_sub_epi16(d0, s0); + const __m256i err0 = _mm256_madd_epi16(diff0, diff0); + sum32 = _mm256_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + const __m256i sum64_0 = + _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32)); + const __m256i sum64_1 = + _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64_0, sum64_1); + } + int64_t sum[4]; + yy_storeu_256(sum, sum64); + err += sum[0] + sum[1] + sum[2] + sum[3]; + return err; +} + +// When params->r[0] > 0 and params->r[1] > 0. In this case all elements of +// C and H need to be computed. 
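+// Equivalently, with d = dat << SGRPROJ_RST_BITS, f1 = flt0 - d, f2 = flt1 - d and s = (src << SGRPROJ_RST_BITS) - d, the loops below accumulate the normal equations of the least-squares fit s ~= x0 * f1 + x1 * f2 (a summary of the code that follows): + // H[0][0] = sum(f1 * f1) / n, H[0][1] = H[1][0] = sum(f1 * f2) / n, H[1][1] = sum(f2 * f2) / n, C[0] = sum(f1 * s) / n, C[1] = sum(f2 * s) / n. + // Because _mm256_mul_epi32() only multiplies the even 32-bit lanes, each product is formed twice (once as-is, once with both operands shifted right by 32 bits) and accumulated into 64-bit lanes. 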
+static AOM_INLINE void calc_proj_params_r0_r1_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m256i h00, h01, h11, c0, c1; + const __m256i zero = _mm256_setzero_si256(); + h01 = h11 = c0 = c1 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); + __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f1 = _mm256_sub_epi32(f1, d); + f2 = _mm256_sub_epi32(f2, d); + + const __m256i h00_even = _mm256_mul_epi32(f1, f1); + const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f1, 32)); + h00 = _mm256_add_epi64(h00, h00_even); + h00 = _mm256_add_epi64(h00, h00_odd); + + const __m256i h01_even = _mm256_mul_epi32(f1, f2); + const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f2, 32)); + h01 = _mm256_add_epi64(h01, h01_even); + h01 = _mm256_add_epi64(h01, h01_odd); + + const __m256i h11_even = _mm256_mul_epi32(f2, f2); + const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), + _mm256_srli_epi64(f2, 32)); + h11 = _mm256_add_epi64(h11, h11_even); + h11 = _mm256_add_epi64(h11, h11_odd); + + const __m256i c0_even = _mm256_mul_epi32(f1, s); + const __m256i c0_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); + c0 = _mm256_add_epi64(c0, c0_even); + c0 = _mm256_add_epi64(c0, c0_odd); + + const __m256i c1_even = _mm256_mul_epi32(f2, s); + const __m256i c1_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); + c1 = _mm256_add_epi64(c1, c1_even); + c1 = _mm256_add_epi64(c1, c1_odd); + } + } + + __m256i c_low = _mm256_unpacklo_epi64(c0, c1); + const __m256i c_high = _mm256_unpackhi_epi64(c0, c1); + c_low = _mm256_add_epi64(c_low, c_high); + const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1), + _mm256_castsi256_si128(c_low)); + + __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01); + const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01); + h0x_low = _mm256_add_epi64(h0x_low, h0x_high); + const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1), + _mm256_castsi256_si128(h0x_low)); + + // Using the symmetric properties of H, calculations of H[1][0] are not + // needed. + __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11); + const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11); + h1x_low = _mm256_add_epi64(h1x_low, h1x_high); + const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1), + _mm256_castsi256_si128(h1x_low)); + + xx_storeu_128(C, c_128bit); + xx_storeu_128(H[0], h0x_128bit); + xx_storeu_128(H[1], h1x_128bit); + + H[0][0] /= size; + H[0][1] /= size; + H[1][1] /= size; + + // Since H is a symmetric matrix + H[1][0] = H[0][1]; + C[0] /= size; + C[1] /= size; +} + +// When only params->r[0] > 0. 
In this case only H[0][0] and C[0] are +// non-zero and need to be computed. +static AOM_INLINE void calc_proj_params_r0_avx2(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt0, + int flt0_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m256i h00, c0; + const __m256i zero = _mm256_setzero_si256(); + c0 = h00 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f1 = _mm256_sub_epi32(f1, d); + + const __m256i h00_even = _mm256_mul_epi32(f1, f1); + const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), + _mm256_srli_epi64(f1, 32)); + h00 = _mm256_add_epi64(h00, h00_even); + h00 = _mm256_add_epi64(h00, h00_odd); + + const __m256i c0_even = _mm256_mul_epi32(f1, s); + const __m256i c0_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32)); + c0 = _mm256_add_epi64(c0, c0_even); + c0 = _mm256_add_epi64(c0, c0_odd); + } + } + const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1), + _mm256_castsi256_si128(h00)); + const __m128i h00_val = + _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8)); + + const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1), + _mm256_castsi256_si128(c0)); + const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8)); + + const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero)); + const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero)); + + xx_storeu_128(C, c); + xx_storeu_128(H[0], h0x); + + H[0][0] /= size; + C[0] /= size; +} + +// When only params->r[1] > 0. In this case only H[1][1] and C[1] are +// non-zero and need to be computed. 
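+// Under the same scalar reading as the r0/r1 case above, this reduces to
+//   H[1][1] += (int64_t)f2 * f2;  C[1] += (int64_t)f2 * s;  // f2 = flt1[k] - u
+// per pixel; C[0] and H[1][0] are written as zeros via the unpack with a
+// zero register below.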
+static AOM_INLINE void calc_proj_params_r1_avx2(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, + int dat_stride, int32_t *flt1, + int flt1_stride, + int64_t H[2][2], int64_t C[2]) { + const int size = width * height; + const uint8_t *src = src8; + const uint8_t *dat = dat8; + __m256i h11, c1; + const __m256i zero = _mm256_setzero_si256(); + c1 = h11 = zero; + + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; j += 8) { + const __m256i u_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j))); + const __m256i s_load = _mm256_cvtepu8_epi32( + _mm_loadl_epi64((__m128i *)(src + i * src_stride + j))); + __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j)); + __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS); + __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS); + s = _mm256_sub_epi32(s, d); + f2 = _mm256_sub_epi32(f2, d); + + const __m256i h11_even = _mm256_mul_epi32(f2, f2); + const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), + _mm256_srli_epi64(f2, 32)); + h11 = _mm256_add_epi64(h11, h11_even); + h11 = _mm256_add_epi64(h11, h11_odd); + + const __m256i c1_even = _mm256_mul_epi32(f2, s); + const __m256i c1_odd = + _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32)); + c1 = _mm256_add_epi64(c1, c1_even); + c1 = _mm256_add_epi64(c1, c1_odd); + } + } + + const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1), + _mm256_castsi256_si128(h11)); + const __m128i h11_val = + _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8)); + + const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1), + _mm256_castsi256_si128(c1)); + const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8)); + + const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val); + const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val); + + xx_storeu_128(C, c); + xx_storeu_128(H[1], h1x); + + H[1][1] /= size; + C[1] /= size; +} + +// AVX2 variant of av1_calc_proj_params_c. 
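+// Context, stated as an assumption rather than upstream documentation: H
+// plays the role of the Gram matrix of the two basis signals f1, f2 and C
+// their correlation with the target s, so the projection coefficients xq
+// used by av1_lowbd_pixel_proj_error_avx2 above come from solving the 2x2
+// normal equations H x = C downstream, essentially
+//   det = H[0][0] * H[1][1] - H[0][1] * H[1][0];
+//   x0  = (H[1][1] * C[0] - H[0][1] * C[1]) / det;
+//   x1  = (H[0][0] * C[1] - H[1][0] * C[0]) / det;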
+void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height, + int src_stride, const uint8_t *dat8, + int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int64_t H[2][2], + int64_t C[2], const sgr_params_type *params) { + if ((params->r[0] > 0) && (params->r[1] > 0)) { + calc_proj_params_r0_r1_avx2(src8, width, height, src_stride, dat8, + dat_stride, flt0, flt0_stride, flt1, + flt1_stride, H, C); + } else if (params->r[0] > 0) { + calc_proj_params_r0_avx2(src8, width, height, src_stride, dat8, dat_stride, + flt0, flt0_stride, H, C); + } else if (params->r[1] > 0) { + calc_proj_params_r1_avx2(src8, width, height, src_stride, dat8, dat_stride, + flt1, flt1_stride, H, C); + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_pixel_proj_error_avx2( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); + __m256i sum64 = _mm256_setzero_si256(); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled + const __m256i xq0 = _mm256_set1_epi32(xq[0]); + const __m256i xq1 = _mm256_set1_epi32(xq[1]); + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { // Process 16 pixels at a time + // Load 16 pixels each from source image and corrupted image + const __m256i s0 = yy_loadu_256(src + j); + const __m256i d0 = yy_loadu_256(dat + j); + // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 (indices) + + // Shift-up each pixel to match filtered image scaling + const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS); + + // Split u0 into two halves and pad each from u16 to i32 + const __m256i u0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(u0)); + const __m256i u0h = + _mm256_cvtepu16_epi32(_mm256_extracti128_si256(u0, 1)); + // u0h, u0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32 + + // Load 16 pixels from each filtered image + const __m256i flt0l = yy_loadu_256(flt0 + j); + const __m256i flt0h = yy_loadu_256(flt0 + j + 8); + const __m256i flt1l = yy_loadu_256(flt1 + j); + const __m256i flt1h = yy_loadu_256(flt1 + j + 8); + // flt?l, flt?h = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32 + + // Subtract shifted corrupt image from each filtered image + const __m256i flt0l_subu = _mm256_sub_epi32(flt0l, u0l); + const __m256i flt0h_subu = _mm256_sub_epi32(flt0h, u0h); + const __m256i flt1l_subu = _mm256_sub_epi32(flt1l, u0l); + const __m256i flt1h_subu = _mm256_sub_epi32(flt1h, u0h); + + // Multiply basis vectors by appropriate coefficients + const __m256i v0l = _mm256_mullo_epi32(flt0l_subu, xq0); + const __m256i v0h = _mm256_mullo_epi32(flt0h_subu, xq0); + const __m256i v1l = _mm256_mullo_epi32(flt1l_subu, xq1); + const __m256i v1h = _mm256_mullo_epi32(flt1h_subu, xq1); + + // Add together the contributions from the two basis vectors + const __m256i vl = _mm256_add_epi32(v0l, v1l); + const __m256i vh = _mm256_add_epi32(v0h, v1h); + + // Right-shift v with appropriate rounding + const __m256i vrl = + _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift); + const __m256i vrh = + _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift); + // 
vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] + + // Saturate each i32 to an i16 then combine both halves + // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes + const __m256i vr = + _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8); + // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] + // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] + + // Add twin-subspace-sgr-filter to corrupt image then subtract source + const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m256i err0 = _mm256_madd_epi16(e0, e0); + + sum32 = _mm256_add_epi32(sum32, err0); + } + + const __m256i sum32l = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); + sum64 = _mm256_add_epi64(sum64, sum32l); + const __m256i sum32h = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 16) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled + const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m256i xq_active = _mm256_set1_epi32(xq_on); + const __m256i xq_inactive = + _mm256_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 16; j += 16) { + // Load 16 pixels from source image + const __m256i s0 = yy_loadu_256(src + j); + // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 + + // Load 16 pixels from corrupted image and pad each u16 to i32 + const __m256i d0 = yy_loadu_256(dat + j); + const __m256i d0h = + _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d0, 1)); + const __m256i d0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d0)); + // d0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 + // d0h, d0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 + + // Load 16 pixels from the filtered image + const __m256i flth = yy_loadu_256(flt + j + 8); + const __m256i fltl = yy_loadu_256(flt + j); + // flth, fltl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 + + const __m256i flth_xq = _mm256_mullo_epi32(flth, xq_active); + const __m256i fltl_xq = _mm256_mullo_epi32(fltl, xq_active); + const __m256i d0h_xq = _mm256_mullo_epi32(d0h, xq_inactive); + const __m256i d0l_xq = _mm256_mullo_epi32(d0l, xq_inactive); + + const __m256i vh = _mm256_add_epi32(flth_xq, d0h_xq); + const __m256i vl = _mm256_add_epi32(fltl_xq, d0l_xq); + + // Shift this down with appropriate rounding + const __m256i vrh = + _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift); + const __m256i vrl = + _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift); + // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32 + + // Saturate each i32 to an i16 then combine both halves + // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes + const __m256i vr = + _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8); + // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] as u16 + 
// vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 + + // Subtract twin-subspace-sgr filtered from source image to get error + const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m256i err0 = _mm256_madd_epi16(e0, e0); + + sum32 = _mm256_add_epi32(sum32, err0); + } + + const __m256i sum32l = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); + sum64 = _mm256_add_epi64(sum64, sum32l); + const __m256i sum32h = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 16) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_on * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + } + } else { // Neither filter is enabled + for (i = 0; i < height; ++i) { + __m256i sum32 = _mm256_setzero_si256(); + for (j = 0; j <= width - 32; j += 32) { + // Load 2x16 u16 from source image + const __m256i s0l = yy_loadu_256(src + j); + const __m256i s0h = yy_loadu_256(src + j + 16); + + // Load 2x16 u16 from corrupted image + const __m256i d0l = yy_loadu_256(dat + j); + const __m256i d0h = yy_loadu_256(dat + j + 16); + + // Subtract corrupted image from source image + const __m256i diffl = _mm256_sub_epi16(d0l, s0l); + const __m256i diffh = _mm256_sub_epi16(d0h, s0h); + + // Square error and add adjacent values + const __m256i err0l = _mm256_madd_epi16(diffl, diffl); + const __m256i err0h = _mm256_madd_epi16(diffh, diffh); + + sum32 = _mm256_add_epi32(sum32, err0l); + sum32 = _mm256_add_epi32(sum32, err0h); + } + + const __m256i sum32l = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32)); + sum64 = _mm256_add_epi64(sum64, sum32l); + const __m256i sum32h = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1)); + sum64 = _mm256_add_epi64(sum64, sum32h); + + // Process remaining pixels (modulo 16) + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + } + + // Sum the four 64-bit lanes of sum64 into err + int64_t sum[4]; + yy_storeu_256(sum, sum64); + err += sum[0] + sum[1] + sum[2] + sum[3]; + return err; +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/av1/encoder/x86/pickrst_sse4.c b/libs/libaom/src/av1/encoder/x86/pickrst_sse4.c new file mode 100644 index 000000000..a2f65a50c --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/pickrst_sse4.c @@ -0,0 +1,833 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <assert.h> +#include <emmintrin.h> +#include "aom_dsp/x86/synonyms.h" + +#include "config/av1_rtcd.h" +#include "av1/common/restoration.h" +#include "av1/encoder/pickrst.h" + +static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src, + const __m128i *shuffle, const __m128i *kl) { + const __m128i s = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle); + const __m128i d0 = _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(s)); + const __m128i d1 = + _mm_madd_epi16(*kl, _mm_cvtepu8_epi16(_mm_srli_si128(s, 8))); + const __m128i dst0 = xx_loadu_128(dst); + const __m128i dst1 = xx_loadu_128(dst + 4); + const __m128i r0 = _mm_add_epi32(dst0, d0); + const __m128i r1 = _mm_add_epi32(dst1, d1); + xx_storeu_128(dst, r0); + xx_storeu_128(dst + 4, r1); +} + +static INLINE void acc_stat_win7_one_line_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN], + int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + const int wiener_win = 7; + int j, k, l; + for (j = h_start; j < h_end; j += 2) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m128i kl = + _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win7_opt_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint8_t avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win7_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN2;
++k) { + for (l = 0; l < WIENER_WIN * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = + M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + (int64_t)avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void acc_stat_highbd_sse41(int64_t *dst, const uint16_t *dgd, + const __m128i *shuffle, + const __m128i *dgd_ijkl) { + // Load 256 bits from dgd in two chunks + const __m128i s0l = xx_loadu_128(dgd); + const __m128i s0h = xx_loadu_128(dgd + 4); + // s0l = [7 6 5 4 3 2 1 0] as u16 values (dgd indices) + // s0h = [11 10 9 8 7 6 5 4] as u16 values (dgd indices) + // (Slightly strange order so we can apply the same shuffle to both halves) + + // Shuffle the u16 values in each half (actually using 8-bit shuffle mask) + const __m128i s1l = _mm_shuffle_epi8(s0l, *shuffle); + const __m128i s1h = _mm_shuffle_epi8(s0h, *shuffle); + // s1l = [4 3 3 2 2 1 1 0] as u16 values (dgd indices) + // s1h = [8 7 7 6 6 5 5 4] as u16 values (dgd indices) + + // Multiply s1 by dgd_ijkl resulting in 8x u32 values + // Horizontally add pairs of u32 resulting in 4x u32 + const __m128i dl = _mm_madd_epi16(*dgd_ijkl, s1l); + const __m128i dh = _mm_madd_epi16(*dgd_ijkl, s1h); + // dl = [d c b a] as u32 values + // dh = [h g f e] as u32 values + + // Add these 8x u32 results on to dst in four parts + const __m128i dll = _mm_cvtepu32_epi64(dl); + const __m128i dlh = _mm_cvtepu32_epi64(_mm_srli_si128(dl, 8)); + const __m128i dhl = _mm_cvtepu32_epi64(dh); + const __m128i dhh = _mm_cvtepu32_epi64(_mm_srli_si128(dh, 8)); + // dll = [b a] as u64 values, etc. 
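+  // (Added note, not upstream: the widening to u64 before accumulation
+  // matters because this high-bit-depth path adds straight into int64_t H;
+  // 12-bit sample products summed over a whole restoration unit could
+  // overflow a 32-bit accumulator, unlike the 8-bit path which stages sums
+  // in int32_t and flushes them to int64_t periodically.)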
+ + const __m128i rll = _mm_add_epi64(xx_loadu_128(dst), dll); + xx_storeu_128(dst, rll); + const __m128i rlh = _mm_add_epi64(xx_loadu_128(dst + 2), dlh); + xx_storeu_128(dst + 2, rlh); + const __m128i rhl = _mm_add_epi64(xx_loadu_128(dst + 4), dhl); + xx_storeu_128(dst + 4, rhl); + const __m128i rhh = _mm_add_epi64(xx_loadu_128(dst + 6), dhh); + xx_storeu_128(dst + 6, rhh); +} + +static INLINE void acc_stat_highbd_win7_one_line_sse4_1( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN], + int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN; + for (j = h_start; j < h_end; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd as a single u32 + // Then broadcast to 4x u32 slots of a 128 + const __m128i dgd_ijkl = _mm_set1_epi32(*((uint32_t *)(dgd_ijk + l))); + // dgd_ijkl = [y x y x y x y x] as u16 + + acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win7_opt_sse4_1( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int64_t H_int[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + // Load just half of the 256-bit shuffle control used for the AVX2 version + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win7_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); + } + } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; 
+ + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +static INLINE void acc_stat_highbd_win5_one_line_sse4_1( + const uint16_t *dgd, const uint16_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + int j, k, l; + const int wiener_win = WIENER_WIN_CHROMA; + for (j = h_start; j < h_end; j += 2) { + const uint16_t X1 = src[j]; + const uint16_t X2 = src[j + 1]; + *sumX += X1 + X2; + const uint16_t *dgd_ij = dgd + j; + for (k = 0; k < wiener_win; k++) { + const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int64_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint16_t D1 = dgd_ijk[l]; + const uint16_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + // Load two u16 values from dgd as a single u32 + // then broadcast to 4x u32 slots of a 128 + const __m128i dgd_ijkl = _mm_set1_epi32(*((uint32_t *)(dgd_ijk + l))); + // dgd_ijkl = [y x y x y x y x] as u16 + + acc_stat_highbd_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, + &dgd_ijkl); + acc_stat_highbd_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, + &dgd_ijkl); + } + } + } +} + +static INLINE void compute_stats_highbd_win5_opt_sse4_1( + const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const uint16_t avg = + find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + // Load just half of the 256-bit shuffle control used for the AVX2 version + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_highbd_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_highbd_win5_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int, H_int); + } 
+ } + + uint8_t bit_depth_divider = 1; + if (bit_depth == AOM_BITS_12) + bit_depth_divider = 16; + else if (bit_depth == AOM_BITS_10) + bit_depth_divider = 4; + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = (M_int[k][l] + + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) / + bit_depth_divider; + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = + (H_int_[n * 8 + m] + + (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) / + bit_depth_divider; + } + } + } + } +} + +void av1_compute_stats_highbd_sse4_1(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, + int h_end, int v_start, int v_end, + int dgd_stride, int src_stride, int64_t *M, + int64_t *H, aom_bit_depth_t bit_depth) { + if (wiener_win == WIENER_WIN) { + compute_stats_highbd_win7_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_highbd_win5_opt_sse4_1(dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, + bit_depth); + } else { + av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start, + v_end, dgd_stride, src_stride, M, H, bit_depth); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +static INLINE void acc_stat_win5_one_line_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, + int dgd_stride, const __m128i *shuffle, int32_t *sumX, + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA], + int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) { + const int wiener_win = WIENER_WIN_CHROMA; + int j, k, l; + for (j = h_start; j < h_end; j += 2) { + const uint8_t *dgd_ij = dgd + j; + const uint8_t X1 = src[j]; + const uint8_t X2 = src[j + 1]; + *sumX += X1 + X2; + for (k = 0; k < wiener_win; k++) { + const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride; + for (l = 0; l < wiener_win; l++) { + int32_t *H_ = &H_int[(l * wiener_win + k)][0]; + const uint8_t D1 = dgd_ijk[l]; + const uint8_t D2 = dgd_ijk[l + 1]; + sumY[k][l] += D1 + D2; + M_int[k][l] += D1 * X1 + D2 * X2; + + const __m128i kl = + _mm_cvtepu8_epi16(_mm_set1_epi16(*((uint16_t *)(dgd_ijk + l)))); + acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl); + acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl); + } + } + } +} + +static INLINE void compute_stats_win5_opt_sse4_1( + const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start, + int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H) { + int i, j, k, l, m, n; + const int wiener_win = WIENER_WIN_CHROMA; + const int pixel_count = (h_end - h_start) * (v_end - v_start); + const int wiener_win2 = wiener_win * wiener_win; + const int wiener_halfwin = (wiener_win >> 1); + const uint8_t avg = + find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride); + + int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t 
H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } }; + int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } }; + int32_t sumX = 0; + const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin; + + const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data); + for (j = v_start; j < v_end; j += 64) { + const int vert_end = AOMMIN(64, v_end - j) + j; + for (i = j; i < vert_end; i++) { + acc_stat_win5_one_line_sse4_1( + dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end, + dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32); + } + for (k = 0; k < wiener_win; ++k) { + for (l = 0; l < wiener_win; ++l) { + M_int64[k][l] += M_int32[k][l]; + M_int32[k][l] = 0; + } + } + for (k = 0; k < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++k) { + for (l = 0; l < WIENER_WIN_CHROMA * 8; ++l) { + H_int64[k][l] += H_int32[k][l]; + H_int32[k][l] = 0; + } + } + } + + const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count; + for (k = 0; k < wiener_win; k++) { + for (l = 0; l < wiener_win; l++) { + const int32_t idx0 = l * wiener_win + k; + M[idx0] = + M_int64[k][l] + (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l])); + int64_t *H_ = H + idx0 * wiener_win2; + int64_t *H_int_ = &H_int64[idx0][0]; + for (m = 0; m < wiener_win; m++) { + for (n = 0; n < wiener_win; n++) { + H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum - + (int64_t)avg * (sumY[k][l] + sumY[n][m]); + } + } + } + } +} +void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd, + const uint8_t *src, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else if (wiener_win == WIENER_WIN_CHROMA) { + compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } else { + av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end, + dgd_stride, src_stride, M, H); + } +} + +static INLINE __m128i pair_set_epi16(int a, int b) { + return _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +int64_t av1_lowbd_pixel_proj_error_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + __m128i sum64 = _mm_setzero_si128(); + const uint8_t *src = src8; + const uint8_t *dat = dat8; + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { + __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]); + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); + const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); + const __m128i flt0_16b = + _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4)); + const __m128i flt1_16b = + _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4)); + const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS); + const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0); + const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0); + const __m128i v0 = _mm_madd_epi16( + 
xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m128i v1 = _mm_madd_epi16( + xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u)); + const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); + const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); + const __m128i e0 = + _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); + const __m128i err0 = _mm_madd_epi16(e0, e0); + sum32 = _mm_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum64_0); + sum64 = _mm_add_epi64(sum64, sum64_1); + } + } else if (params->r[0] > 0 || params->r[1] > 0) { + const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m128i xq_coeff = + pair_set_epi16(xq_active, -(xq_active << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j)); + const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j)); + const __m128i flt_16b = + _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4)); + const __m128i v0 = + _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0)); + const __m128i v1 = + _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0)); + const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift); + const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift); + const __m128i e0 = + _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0); + const __m128i err0 = _mm_madd_epi16(e0, e0); + sum32 = _mm_add_epi32(sum32, err0); + } + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_active * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + const __m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum64_0); + sum64 = _mm_add_epi64(sum64, sum64_1); + } + } else { + __m128i sum32 = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j <= width - 16; j += 16) { + const __m128i d = xx_loadu_128(dat + j); + const __m128i s = xx_loadu_128(src + j); + const __m128i d0 = _mm_cvtepu8_epi16(d); + const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8)); + const __m128i s0 = _mm_cvtepu8_epi16(s); + const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8)); + const __m128i diff0 = _mm_sub_epi16(d0, s0); + const __m128i diff1 = _mm_sub_epi16(d1, s1); + const __m128i err0 = _mm_madd_epi16(diff0, diff0); + const __m128i err1 = _mm_madd_epi16(diff1, diff1); + sum32 = _mm_add_epi32(sum32, err0); + sum32 = _mm_add_epi32(sum32, err1); + } + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + const 
__m128i sum64_0 = _mm_cvtepi32_epi64(sum32); + const __m128i sum64_1 = _mm_cvtepi32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64_0, sum64_1); + } + int64_t sum[2]; + xx_storeu_128(sum, sum64); + err += sum[0] + sum[1]; + return err; +} + +#if CONFIG_AV1_HIGHBITDEPTH +int64_t av1_highbd_pixel_proj_error_sse4_1( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + int i, j, k; + const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; + const __m128i rounding = _mm_set1_epi32(1 << (shift - 1)); + __m128i sum64 = _mm_setzero_si128(); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int64_t err = 0; + if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled + const __m128i xq0 = _mm_set1_epi32(xq[0]); + const __m128i xq1 = _mm_set1_epi32(xq[1]); + + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + // Load 8x pixels from source image + const __m128i s0 = xx_loadu_128(src + j); + // s0 = [7 6 5 4 3 2 1 0] as i16 (indices of src[]) + + // Load 8x pixels from corrupted image + const __m128i d0 = xx_loadu_128(dat + j); + // d0 = [7 6 5 4 3 2 1 0] as i16 (indices of dat[]) + + // Shift each pixel value up by SGRPROJ_RST_BITS + const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS); + + // Split u0 into two halves and pad each from u16 to i32 + const __m128i u0l = _mm_cvtepu16_epi32(u0); + const __m128i u0h = _mm_cvtepu16_epi32(_mm_srli_si128(u0, 8)); + // u0h = [7 6 5 4] as i32, u0l = [3 2 1 0] as i32, all dat[] indices + + // Load 8 pixels from first and second filtered images + const __m128i flt0l = xx_loadu_128(flt0 + j); + const __m128i flt0h = xx_loadu_128(flt0 + j + 4); + const __m128i flt1l = xx_loadu_128(flt1 + j); + const __m128i flt1h = xx_loadu_128(flt1 + j + 4); + // flt0 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt0+j) + // flt1 = [7 6 5 4] [3 2 1 0] as i32 (indices of flt1+j) + + // Subtract shifted corrupt image from each filtered image + // This gives our two basis vectors for the projection + const __m128i flt0l_subu = _mm_sub_epi32(flt0l, u0l); + const __m128i flt0h_subu = _mm_sub_epi32(flt0h, u0h); + const __m128i flt1l_subu = _mm_sub_epi32(flt1l, u0l); + const __m128i flt1h_subu = _mm_sub_epi32(flt1h, u0h); + // flt?h_subu = [ f[7]-u[7] f[6]-u[6] f[5]-u[5] f[4]-u[4] ] as i32 + // flt?l_subu = [ f[3]-u[3] f[2]-u[2] f[1]-u[1] f[0]-u[0] ] as i32 + + // Multiply each basis vector by the corresponding coefficient + const __m128i v0l = _mm_mullo_epi32(flt0l_subu, xq0); + const __m128i v0h = _mm_mullo_epi32(flt0h_subu, xq0); + const __m128i v1l = _mm_mullo_epi32(flt1l_subu, xq1); + const __m128i v1h = _mm_mullo_epi32(flt1h_subu, xq1); + + // Add together the contribution from each scaled basis vector + const __m128i vl = _mm_add_epi32(v0l, v1l); + const __m128i vh = _mm_add_epi32(v0h, v1h); + + // Right-shift v with appropriate rounding + const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift); + const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift); + + // Saturate each i32 value to i16 and combine lower and upper halves + const __m128i vr = _mm_packs_epi32(vrl, vrh); + + // Add twin-subspace-sgr-filter to corrupt image then subtract source + const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0); + + // Calculate squared error and add 
adjacent values + const __m128i err0 = _mm_madd_epi16(e0, e0); + + sum32 = _mm_add_epi32(sum32, err0); + } + + const __m128i sum32l = _mm_cvtepu32_epi64(sum32); + sum64 = _mm_add_epi64(sum64, sum32l); + const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 8) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } + } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled + const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1]; + const __m128i xq_active = _mm_set1_epi32(xq_on); + const __m128i xq_inactive = + _mm_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS)); + const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; + for (i = 0; i < height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 8; j += 8) { + // Load 8x pixels from source image + const __m128i s0 = xx_loadu_128(src + j); + // s0 = [7 6 5 4 3 2 1 0] as u16 (indices of src[]) + + // Load 8x pixels from corrupted image and pad each u16 to i32 + const __m128i d0 = xx_loadu_128(dat + j); + const __m128i d0h = _mm_cvtepu16_epi32(_mm_srli_si128(d0, 8)); + const __m128i d0l = _mm_cvtepu16_epi32(d0); + // d0h, d0l = [7 6 5 4], [3 2 1 0] as u32 (indices of dat[]) + + // Load 8 pixels from the filtered image + const __m128i flth = xx_loadu_128(flt + j + 4); + const __m128i fltl = xx_loadu_128(flt + j); + // flth, fltl = [7 6 5 4], [3 2 1 0] as i32 (indices of flt+j) + + const __m128i flth_xq = _mm_mullo_epi32(flth, xq_active); + const __m128i fltl_xq = _mm_mullo_epi32(fltl, xq_active); + const __m128i d0h_xq = _mm_mullo_epi32(d0h, xq_inactive); + const __m128i d0l_xq = _mm_mullo_epi32(d0l, xq_inactive); + + const __m128i vh = _mm_add_epi32(flth_xq, d0h_xq); + const __m128i vl = _mm_add_epi32(fltl_xq, d0l_xq); + // vh = [ xq0(f[7]-d[7]) xq0(f[6]-d[6]) xq0(f[5]-d[5]) xq0(f[4]-d[4]) ] + // vl = [ xq0(f[3]-d[3]) xq0(f[2]-d[2]) xq0(f[1]-d[1]) xq0(f[0]-d[0]) ] + + // Shift this down with appropriate rounding + const __m128i vrh = _mm_srai_epi32(_mm_add_epi32(vh, rounding), shift); + const __m128i vrl = _mm_srai_epi32(_mm_add_epi32(vl, rounding), shift); + + // Saturate vr0 and vr1 from i32 to i16 then pack together + const __m128i vr = _mm_packs_epi32(vrl, vrh); + + // Subtract twin-subspace-sgr filtered from source image to get error + const __m128i e0 = _mm_sub_epi16(_mm_add_epi16(vr, d0), s0); + + // Calculate squared error and add adjacent values + const __m128i err0 = _mm_madd_epi16(e0, e0); + + sum32 = _mm_add_epi32(sum32, err0); + } + + const __m128i sum32l = _mm_cvtepu32_epi64(sum32); + sum64 = _mm_add_epi64(sum64, sum32l); + const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum32h); + + // Process remaining pixels in this row (modulo 8) + for (k = j; k < width; ++k) { + const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); + int32_t v = xq_on * (flt[k] - u); + const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + flt += flt_stride; + } + } else { // Neither filter is enabled + for (i = 0; i < 
height; ++i) { + __m128i sum32 = _mm_setzero_si128(); + for (j = 0; j <= width - 16; j += 16) { + // Load 2x8 u16 from source image + const __m128i s0 = xx_loadu_128(src + j); + const __m128i s1 = xx_loadu_128(src + j + 8); + // Load 2x8 u16 from corrupted image + const __m128i d0 = xx_loadu_128(dat + j); + const __m128i d1 = xx_loadu_128(dat + j + 8); + + // Subtract corrupted image from source image + const __m128i diff0 = _mm_sub_epi16(d0, s0); + const __m128i diff1 = _mm_sub_epi16(d1, s1); + + // Square error and add adjacent values + const __m128i err0 = _mm_madd_epi16(diff0, diff0); + const __m128i err1 = _mm_madd_epi16(diff1, diff1); + + sum32 = _mm_add_epi32(sum32, err0); + sum32 = _mm_add_epi32(sum32, err1); + } + + const __m128i sum32l = _mm_cvtepu32_epi64(sum32); + sum64 = _mm_add_epi64(sum64, sum32l); + const __m128i sum32h = _mm_cvtepu32_epi64(_mm_srli_si128(sum32, 8)); + sum64 = _mm_add_epi64(sum64, sum32h); + + // Process remaining pixels (modulo 8) + for (k = j; k < width; ++k) { + const int32_t e = (int32_t)(dat[k]) - src[k]; + err += ((int64_t)e * e); + } + dat += dat_stride; + src += src_stride; + } + } + + // Sum the two 64-bit lanes of sum64 into err + int64_t sum[2]; + xx_storeu_128(sum, sum64); + err += sum[0] + sum[1]; + return err; +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/av1/encoder/x86/rdopt_avx2.c b/libs/libaom/src/av1/encoder/x86/rdopt_avx2.c new file mode 100644 index 000000000..f588badc7 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/rdopt_avx2.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <immintrin.h> +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_ports/system_state.h" + +#include "config/av1_rtcd.h" +#include "av1/encoder/rdopt.h" + +// Process horizontal and vertical correlations in a 4x4 block of pixels. +// We actually use the 4x4 pixels to calculate correlations corresponding to +// the top-left 3x3 pixels, so this function must be called with 1x1 overlap, +// moving the window along/down by 3 pixels at a time.
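+// Added context (a sketch of the statistic, matching the float arithmetic at
+// the end of av1_get_horver_correlation_full_avx2 below): hcorr is the
+// Pearson correlation over horizontally adjacent residual pairs (x, y), and
+// vcorr the same over vertically adjacent pairs (x, z):
+//   hcorr = (xy_sum - x_sum * y_sum / n) /
+//           sqrt((x2_sum - x_sum^2 / n) * (y2_sum - y_sum^2 / n))
+// with n = height * (width - 1). The SIMD above only accelerates the raw
+// sums; the final normalization stays in scalar float code.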
+INLINE static void horver_correlation_4x4(const int16_t *diff, int stride, + __m256i *xy_sum_32, + __m256i *xz_sum_32, __m256i *x_sum_32, + __m256i *x2_sum_32) { + // Pixels in this 4x4 [ a b c d ] + // are referred to as: [ e f g h ] + // [ i j k l ] + // [ m n o p ] + + const __m256i pixels = _mm256_set_epi64x( + *(uint64_t *)&diff[0 * stride], *(uint64_t *)&diff[1 * stride], + *(uint64_t *)&diff[2 * stride], *(uint64_t *)&diff[3 * stride]); + // pixels = [d c b a h g f e] [l k j i p o n m] as i16 + + const __m256i slli = _mm256_slli_epi64(pixels, 16); + // slli = [c b a 0 g f e 0] [k j i 0 o n m 0] as i16 + + const __m256i madd_xy = _mm256_madd_epi16(pixels, slli); + // madd_xy = [bc+cd ab fg+gh ef] [jk+kl ij no+op mn] as i32 + *xy_sum_32 = _mm256_add_epi32(*xy_sum_32, madd_xy); + + // Permute control [3 2] [1 0] => [2 1] [0 0], 0b10010000 = 0x90 + const __m256i perm = _mm256_permute4x64_epi64(slli, 0x90); + // perm = [g f e 0 k j i 0] [o n m 0 o n m 0] as i16 + + const __m256i madd_xz = _mm256_madd_epi16(slli, perm); + // madd_xz = [cg+bf ae gk+fj ei] [ko+jn im oo+nn mm] as i32 + *xz_sum_32 = _mm256_add_epi32(*xz_sum_32, madd_xz); + + // Sum every element in slli (and then also their squares) + const __m256i madd1_slli = _mm256_madd_epi16(slli, _mm256_set1_epi16(1)); + // madd1_slli = [c+b a g+f e] [k+j i o+n m] as i32 + *x_sum_32 = _mm256_add_epi32(*x_sum_32, madd1_slli); + + const __m256i madd_slli = _mm256_madd_epi16(slli, slli); + // madd_slli = [cc+bb aa gg+ff ee] [kk+jj ii oo+nn mm] as i32 + *x2_sum_32 = _mm256_add_epi32(*x2_sum_32, madd_slli); +} + +void av1_get_horver_correlation_full_avx2(const int16_t *diff, int stride, + int width, int height, float *hcorr, + float *vcorr) { + // The following notation is used: + // x - current pixel + // y - right neighbour pixel + // z - below neighbour pixel + // w - down-right neighbour pixel + int64_t xy_sum = 0, xz_sum = 0; + int64_t x_sum = 0, x2_sum = 0; + + // Process horizontal and vertical correlations through the body in 4x4 + // blocks. 
This excludes the final row and column and possibly one extra + // column, depending on how 3 divides into width and height. + int32_t xy_xz_tmp[8] = { 0 }, x_x2_tmp[8] = { 0 }; + __m256i xy_sum_32 = _mm256_setzero_si256(); + __m256i xz_sum_32 = _mm256_setzero_si256(); + __m256i x_sum_32 = _mm256_setzero_si256(); + __m256i x2_sum_32 = _mm256_setzero_si256(); + for (int i = 0; i <= height - 4; i += 3) { + for (int j = 0; j <= width - 4; j += 3) { + horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32, + &xz_sum_32, &x_sum_32, &x2_sum_32); + } + const __m256i hadd_xy_xz = _mm256_hadd_epi32(xy_sum_32, xz_sum_32); + // hadd_xy_xz = [ae+bf+cg ei+fj+gk ab+bc+cd ef+fg+gh] + // [im+jn+ko mm+nn+oo ij+jk+kl mn+no+op] as i32 + yy_storeu_256(xy_xz_tmp, hadd_xy_xz); + xy_sum += (int64_t)xy_xz_tmp[5] + xy_xz_tmp[4] + xy_xz_tmp[1]; + xz_sum += (int64_t)xy_xz_tmp[7] + xy_xz_tmp[6] + xy_xz_tmp[3]; + + const __m256i hadd_x_x2 = _mm256_hadd_epi32(x_sum_32, x2_sum_32); + // hadd_x_x2 = [aa+bb+cc ee+ff+gg a+b+c e+f+g] + // [ii+jj+kk mm+nn+oo i+j+k m+n+o] as i32 + yy_storeu_256(x_x2_tmp, hadd_x_x2); + x_sum += (int64_t)x_x2_tmp[5] + x_x2_tmp[4] + x_x2_tmp[1]; + x2_sum += (int64_t)x_x2_tmp[7] + x_x2_tmp[6] + x_x2_tmp[3]; + + xy_sum_32 = _mm256_setzero_si256(); + xz_sum_32 = _mm256_setzero_si256(); + x_sum_32 = _mm256_setzero_si256(); + x2_sum_32 = _mm256_setzero_si256(); + } + + // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols + int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0; + + // Do we have 2 rows remaining or just the one? Note that width and height + // are powers of 2, so each modulo 3 must be 1 or 2. + if (height % 3 == 1) { // Just horiz corrs on the final row + const int16_t x0 = diff[(height - 1) * stride]; + x_sum += x0; + x_finalrow += x0; + x2_sum += x0 * x0; + x2_finalrow += x0 * x0; + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 1) * stride + j]; + const int16_t y = diff[(height - 1) * stride + j + 1]; + xy_sum += x * y; + x_sum += y; + x2_sum += y * y; + x_finalrow += y; + x2_finalrow += y * y; + } + } else { // Two rows remaining to do + const int16_t x0 = diff[(height - 2) * stride]; + const int16_t z0 = diff[(height - 1) * stride]; + x_sum += x0 + z0; + x2_sum += x0 * x0 + z0 * z0; + x_finalrow += z0; + x2_finalrow += z0 * z0; + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 2) * stride + j]; + const int16_t y = diff[(height - 2) * stride + j + 1]; + const int16_t z = diff[(height - 1) * stride + j]; + const int16_t w = diff[(height - 1) * stride + j + 1]; + + // Horizontal and vertical correlations for the penultimate row: + xy_sum += x * y; + xz_sum += x * z; + + // Now just horizontal correlations for the final row: + xy_sum += z * w; + + x_sum += y + w; + x2_sum += y * y + w * w; + x_finalrow += w; + x2_finalrow += w * w; + } + } + + // Do we have 2 columns remaining or just the one? + if (width % 3 == 1) { // Just vert corrs on the final col + const int16_t x0 = diff[width - 1]; + x_sum += x0; + x_finalcol += x0; + x2_sum += x0 * x0; + x2_finalcol += x0 * x0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 1]; + xz_sum += x * z; + x_finalcol += z; + x2_finalcol += z * z; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ?
2 : 3)) { + x_sum += z; + x2_sum += z * z; + } + } + } else { // Two cols remaining + const int16_t x0 = diff[width - 2]; + const int16_t y0 = diff[width - 1]; + x_sum += x0 + y0; + x2_sum += x0 * x0 + y0 * y0; + x_finalcol += y0; + x2_finalcol += y0 * y0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 2]; + const int16_t y = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 2]; + const int16_t w = diff[(i + 1) * stride + width - 1]; + + // Horizontal and vertical correlations for the penultimate col: + // Skip these on the last iteration of this loop if we also had two + // rows remaining, otherwise the final horizontal and vertical correlations + // get erroneously processed twice + if (i < height - 2 || height % 3 == 1) { + xy_sum += x * y; + xz_sum += x * z; + } + + x_finalcol += w; + x2_finalcol += w * w; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 2 : 3)) { + x_sum += z + w; + x2_sum += z * z + w * w; + } + + // Now just vertical correlations for the final column: + xz_sum += y * w; + } + } + + // Calculate the simple sums and squared-sums + int64_t x_firstrow = 0, x_firstcol = 0; + int64_t x2_firstrow = 0, x2_firstcol = 0; + + for (int j = 0; j < width; ++j) { + x_firstrow += diff[j]; + x2_firstrow += diff[j] * diff[j]; + } + for (int i = 0; i < height; ++i) { + x_firstcol += diff[i * stride]; + x2_firstcol += diff[i * stride] * diff[i * stride]; + } + + int64_t xhor_sum = x_sum - x_finalcol; + int64_t xver_sum = x_sum - x_finalrow; + int64_t y_sum = x_sum - x_firstcol; + int64_t z_sum = x_sum - x_firstrow; + int64_t x2hor_sum = x2_sum - x2_finalcol; + int64_t x2ver_sum = x2_sum - x2_finalrow; + int64_t y2_sum = x2_sum - x2_firstcol; + int64_t z2_sum = x2_sum - x2_firstrow; + + aom_clear_system_state(); + + const float num_hor = (float)(height * (width - 1)); + const float num_ver = (float)((height - 1) * width); + + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + + if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } else { + *hcorr = 1.0; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } else { + *vcorr = 1.0; + } +} diff --git a/libs/libaom/src/av1/encoder/x86/rdopt_sse4.c b/libs/libaom/src/av1/encoder/x86/rdopt_sse4.c new file mode 100644 index 000000000..67d94b4ca --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/rdopt_sse4.c @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <assert.h> +#include <emmintrin.h> +#include "aom_dsp/x86/synonyms.h" +#include "aom_ports/system_state.h" + +#include "config/av1_rtcd.h" +#include "av1/encoder/rdopt.h" + +// Process horizontal and vertical correlations in a 4x4 block of pixels. +// We actually use the 4x4 pixels to calculate correlations corresponding to +// the top-left 3x3 pixels, so this function must be called with 1x1 overlap, +// moving the window along/down by 3 pixels at a time. +INLINE static void horver_correlation_4x4(const int16_t *diff, int stride, + __m128i *xy_sum_32, + __m128i *xz_sum_32, __m128i *x_sum_32, + __m128i *x2_sum_32) { + // Pixels in this 4x4 [ a b c d ] + // are referred to as: [ e f g h ] + // [ i j k l ] + // [ m n o p ] + + const __m128i pixelsa = _mm_set_epi64x(*(uint64_t *)&diff[0 * stride], + *(uint64_t *)&diff[2 * stride]); + const __m128i pixelsb = _mm_set_epi64x(*(uint64_t *)&diff[1 * stride], + *(uint64_t *)&diff[3 * stride]); + // pixelsa = [d c b a l k j i] as i16 + // pixelsb = [h g f e p o n m] as i16 + + const __m128i slli_a = _mm_slli_epi64(pixelsa, 16); + const __m128i slli_b = _mm_slli_epi64(pixelsb, 16); + // slli_a = [c b a 0 k j i 0] as i16 + // slli_b = [g f e 0 o n m 0] as i16 + + const __m128i xy_madd_a = _mm_madd_epi16(pixelsa, slli_a); + const __m128i xy_madd_b = _mm_madd_epi16(pixelsb, slli_b); + // xy_madd_a = [bc+cd ab jk+kl ij] as i32 + // xy_madd_b = [fg+gh ef no+op mn] as i32 + + const __m128i xy32 = _mm_hadd_epi32(xy_madd_b, xy_madd_a); + // xy32 = [ab+bc+cd ij+jk+kl ef+fg+gh mn+no+op] as i32 + *xy_sum_32 = _mm_add_epi32(*xy_sum_32, xy32); + + const __m128i xz_madd_a = _mm_madd_epi16(slli_a, slli_b); + // xz_madd_a = [bf+cg ae jn+ko im] i32 + + const __m128i swap_b = _mm_srli_si128(slli_b, 8); + // swap_b = [0 0 0 0 g f e 0] as i16 + const __m128i xz_madd_b = _mm_madd_epi16(slli_a, swap_b); + // xz_madd_b = [0 0 gk+fj ei] i32 + + const __m128i xz32 = _mm_hadd_epi32(xz_madd_b, xz_madd_a); + // xz32 = [ae+bf+cg im+jn+ko 0 ei+fj+gk] i32 + *xz_sum_32 = _mm_add_epi32(*xz_sum_32, xz32); + + // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k + // (sum up every element in slli_a and swap_b) + const __m128i sum_slli_a = _mm_hadd_epi16(slli_a, slli_a); + const __m128i sum_slli_a32 = _mm_cvtepi16_epi32(sum_slli_a); + // sum_slli_a32 = [c+b a k+j i] as i32 + const __m128i swap_b32 = _mm_cvtepi16_epi32(swap_b); + // swap_b32 = [g f e 0] as i32 + *x_sum_32 = _mm_add_epi32(*x_sum_32, sum_slli_a32); + *x_sum_32 = _mm_add_epi32(*x_sum_32, swap_b32); + // sum = [c+b+g a+f k+j+e i] as i32 + + // Also sum their squares + const __m128i slli_a_2 = _mm_madd_epi16(slli_a, slli_a); + const __m128i swap_b_2 = _mm_madd_epi16(swap_b, swap_b); + // slli_a_2 = [c2+b2 a2 k2+j2 i2] + // swap_b_2 = [0 0 g2+f2 e2] + const __m128i sum2 = _mm_hadd_epi32(slli_a_2, swap_b_2); + // sum2 = [0 g2+f2+e2 c2+b2+a2 k2+j2+i2] + *x2_sum_32 = _mm_add_epi32(*x2_sum_32, sum2); +} + +void av1_get_horver_correlation_full_sse4_1(const int16_t *diff, int stride, + int width, int height, float *hcorr, + float *vcorr) { + // The following notation is used: + // x - current pixel + // y - right neighbour pixel + // z - below neighbour pixel + // w - down-right neighbour pixel + int64_t xy_sum = 0, xz_sum = 0; + int64_t x_sum = 0, x2_sum = 0; + + // Process horizontal and vertical correlations through the body in 4x4 + // blocks.
This excludes the final row and column and possibly one extra + // column depending how 3 divides into width and height + int32_t xy_tmp[4] = { 0 }, xz_tmp[4] = { 0 }; + int32_t x_tmp[4] = { 0 }, x2_tmp[4] = { 0 }; + __m128i xy_sum_32 = _mm_setzero_si128(); + __m128i xz_sum_32 = _mm_setzero_si128(); + __m128i x_sum_32 = _mm_setzero_si128(); + __m128i x2_sum_32 = _mm_setzero_si128(); + for (int i = 0; i <= height - 4; i += 3) { + for (int j = 0; j <= width - 4; j += 3) { + horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32, + &xz_sum_32, &x_sum_32, &x2_sum_32); + } + xx_storeu_128(xy_tmp, xy_sum_32); + xx_storeu_128(xz_tmp, xz_sum_32); + xx_storeu_128(x_tmp, x_sum_32); + xx_storeu_128(x2_tmp, x2_sum_32); + xy_sum += (int64_t)xy_tmp[3] + xy_tmp[2] + xy_tmp[1]; + xz_sum += (int64_t)xz_tmp[3] + xz_tmp[2] + xz_tmp[0]; + x_sum += (int64_t)x_tmp[3] + x_tmp[2] + x_tmp[1] + x_tmp[0]; + x2_sum += (int64_t)x2_tmp[2] + x2_tmp[1] + x2_tmp[0]; + xy_sum_32 = _mm_setzero_si128(); + xz_sum_32 = _mm_setzero_si128(); + x_sum_32 = _mm_setzero_si128(); + x2_sum_32 = _mm_setzero_si128(); + } + + // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols + int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0; + + // Do we have 2 rows remaining or just the one? Note that width and height + // are powers of 2, so each modulo 3 must be 1 or 2. + if (height % 3 == 1) { // Just horiz corrs on the final row + const int16_t x0 = diff[(height - 1) * stride]; + x_sum += x0; + x_finalrow += x0; + x2_sum += x0 * x0; + x2_finalrow += x0 * x0; + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 1) * stride + j]; + const int16_t y = diff[(height - 1) * stride + j + 1]; + xy_sum += x * y; + x_sum += y; + x2_sum += y * y; + x_finalrow += y; + x2_finalrow += y * y; + } + } else { // Two rows remaining to do + const int16_t x0 = diff[(height - 2) * stride]; + const int16_t z0 = diff[(height - 1) * stride]; + x_sum += x0 + z0; + x2_sum += x0 * x0 + z0 * z0; + x_finalrow += z0; + x2_finalrow += z0 * z0; + for (int j = 0; j < width - 1; ++j) { + const int16_t x = diff[(height - 2) * stride + j]; + const int16_t y = diff[(height - 2) * stride + j + 1]; + const int16_t z = diff[(height - 1) * stride + j]; + const int16_t w = diff[(height - 1) * stride + j + 1]; + + // Horizontal and vertical correlations for the penultimate row: + xy_sum += x * y; + xz_sum += x * z; + + // Now just horizontal correlations for the final row: + xy_sum += z * w; + + x_sum += y + w; + x2_sum += y * y + w * w; + x_finalrow += w; + x2_finalrow += w * w; + } + } + + // Do we have 2 columns remaining or just the one? + if (width % 3 == 1) { // Just vert corrs on the final col + const int16_t x0 = diff[width - 1]; + x_sum += x0; + x_finalcol += x0; + x2_sum += x0 * x0; + x2_finalcol += x0 * x0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 1]; + xz_sum += x * z; + x_finalcol += z; + x2_finalcol += z * z; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 
2 : 3)) { + x_sum += z; + x2_sum += z * z; + } + } + } else { // Two cols remaining + const int16_t x0 = diff[width - 2]; + const int16_t y0 = diff[width - 1]; + x_sum += x0 + y0; + x2_sum += x0 * x0 + y0 * y0; + x_finalcol += y0; + x2_finalcol += y0 * y0; + for (int i = 0; i < height - 1; ++i) { + const int16_t x = diff[i * stride + width - 2]; + const int16_t y = diff[i * stride + width - 1]; + const int16_t z = diff[(i + 1) * stride + width - 2]; + const int16_t w = diff[(i + 1) * stride + width - 1]; + + // Horizontal and vertical correlations for the penultimate col: + // Skip these on the last iteration of this loop if we also had two + // rows remaining, otherwise the final horizontal and vertical correlation + // get erroneously processed twice + if (i < height - 2 || height % 3 == 1) { + xy_sum += x * y; + xz_sum += x * z; + } + + x_finalcol += w; + x2_finalcol += w * w; + // So the bottom-right elements don't get counted twice: + if (i < height - (height % 3 == 1 ? 2 : 3)) { + x_sum += z + w; + x2_sum += z * z + w * w; + } + + // Now just vertical correlations for the final column: + xz_sum += y * w; + } + } + + // Calculate the simple sums and squared-sums + int64_t x_firstrow = 0, x_firstcol = 0; + int64_t x2_firstrow = 0, x2_firstcol = 0; + + for (int j = 0; j < width; ++j) { + x_firstrow += diff[j]; + x2_firstrow += diff[j] * diff[j]; + } + for (int i = 0; i < height; ++i) { + x_firstcol += diff[i * stride]; + x2_firstcol += diff[i * stride] * diff[i * stride]; + } + + int64_t xhor_sum = x_sum - x_finalcol; + int64_t xver_sum = x_sum - x_finalrow; + int64_t y_sum = x_sum - x_firstcol; + int64_t z_sum = x_sum - x_firstrow; + int64_t x2hor_sum = x2_sum - x2_finalcol; + int64_t x2ver_sum = x2_sum - x2_finalrow; + int64_t y2_sum = x2_sum - x2_firstcol; + int64_t z2_sum = x2_sum - x2_firstrow; + + aom_clear_system_state(); + + const float num_hor = (float)(height * (width - 1)); + const float num_ver = (float)((height - 1) * width); + + const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor; + const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver; + + const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor; + const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver; + + const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor; + const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver; + + if (xhor_var_n > 0 && y_var_n > 0) { + *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n); + *hcorr = *hcorr < 0 ? 0 : *hcorr; + } else { + *hcorr = 1.0; + } + if (xver_var_n > 0 && z_var_n > 0) { + *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n); + *vcorr = *vcorr < 0 ? 0 : *vcorr; + } else { + *vcorr = 1.0; + } +} diff --git a/libs/libaom/src/av1/encoder/x86/temporal_filter_avx2.c b/libs/libaom/src/av1/encoder/x86/temporal_filter_avx2.c new file mode 100644 index 000000000..847f7283c --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/temporal_filter_avx2.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <immintrin.h> + +#include "config/av1_rtcd.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" + +#define SSE_STRIDE (BW + 2) + +DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask[4][8]) = { + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0 }, + { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0 }, + { 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 }, + { 0, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } +}; + +DECLARE_ALIGNED(32, static const uint8_t, shufflemask_16b[2][16]) = { + { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 11, 10, 11 } +}; + +static AOM_FORCE_INLINE void get_squared_error_16x16_avx2( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + uint16_t *frame_sse, const unsigned int sse_stride) { + (void)block_width; + const uint8_t *src1 = frame1; + const uint8_t *src2 = frame2; + uint16_t *dst = frame_sse; + for (int i = 0; i < block_height; i++) { + __m128i vf1_128, vf2_128; + __m256i vf1, vf2, vdiff1, vsqdiff1; + + vf1_128 = _mm_loadu_si128((__m128i *)(src1)); + vf2_128 = _mm_loadu_si128((__m128i *)(src2)); + vf1 = _mm256_cvtepu8_epi16(vf1_128); + vf2 = _mm256_cvtepu8_epi16(vf2_128); + vdiff1 = _mm256_sub_epi16(vf1, vf2); + vsqdiff1 = _mm256_mullo_epi16(vdiff1, vdiff1); + + _mm256_storeu_si256((__m256i *)(dst), vsqdiff1); + // Zero the uninitialized memory to avoid uninitialized loads later + *(uint32_t *)(dst + 16) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + src1 += stride, src2 += stride2; + dst += sse_stride; + } +} + +static AOM_FORCE_INLINE void get_squared_error_32x32_avx2( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + uint16_t *frame_sse, const unsigned int sse_stride) { + (void)block_width; + const uint8_t *src1 = frame1; + const uint8_t *src2 = frame2; + uint16_t *dst = frame_sse; + for (int i = 0; i < block_height; i++) { + __m256i vsrc1, vsrc2, vmin, vmax, vdiff, vdiff1, vdiff2, vres1, vres2; + + vsrc1 = _mm256_loadu_si256((__m256i *)src1); + vsrc2 = _mm256_loadu_si256((__m256i *)src2); + vmax = _mm256_max_epu8(vsrc1, vsrc2); + vmin = _mm256_min_epu8(vsrc1, vsrc2); + vdiff = _mm256_subs_epu8(vmax, vmin); + + __m128i vtmp1 = _mm256_castsi256_si128(vdiff); + __m128i vtmp2 = _mm256_extracti128_si256(vdiff, 1); + vdiff1 = _mm256_cvtepu8_epi16(vtmp1); + vdiff2 = _mm256_cvtepu8_epi16(vtmp2); + + vres1 = _mm256_mullo_epi16(vdiff1, vdiff1); + vres2 = _mm256_mullo_epi16(vdiff2, vdiff2); + _mm256_storeu_si256((__m256i *)(dst), vres1); + _mm256_storeu_si256((__m256i *)(dst + 16), vres2); + // Zero the uninitialized memory to avoid uninitialized loads later + *(uint32_t *)(dst + 32) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + src1 += stride; + src2 += stride2; + dst += sse_stride; + } +} + +static AOM_FORCE_INLINE __m256i xx_load_and_pad(uint16_t *src, int col, + int block_width) { + __m128i v128tmp = _mm_loadu_si128((__m128i *)(src)); + if (col == 0) { + // For the first column, replicate the first element twice to the left + v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[0]); + } + if (col == block_width - 4) { + // For the last column, replicate the last element twice to the right + v128tmp = _mm_shuffle_epi8(v128tmp, *(__m128i *)shufflemask_16b[1]); + } +
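+  // The eight u16 values are widened to u32 on return so that the 5x5
+  // window sums accumulated from these rows later cannot overflow 16 bits
+  // (25 * 255^2 exceeds UINT16_MAX).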
return _mm256_cvtepu16_epi32(v128tmp); +} + +static AOM_FORCE_INLINE int32_t xx_mask_and_hadd(__m256i vsum, int i) { + // Mask the required 5 values inside the vector + __m256i vtmp = _mm256_and_si256(vsum, *(__m256i *)sse_bytemask[i]); + __m128i v128a, v128b; + // Extract 256b as two 128b registers A and B + v128a = _mm256_castsi256_si128(vtmp); + v128b = _mm256_extracti128_si256(vtmp, 1); + // A = [A0+B0, A1+B1, A2+B2, A3+B3] + v128a = _mm_add_epi32(v128a, v128b); + // B = [A2+B2, A3+B3, 0, 0] + v128b = _mm_srli_si128(v128a, 8); + // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] + v128a = _mm_add_epi32(v128a, v128b); + // B = [A1+B1+A3+B3, 0, 0, 0] + v128b = _mm_srli_si128(v128a, 4); + // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] + v128a = _mm_add_epi32(v128a, v128b); + return _mm_extract_epi32(v128a, 0); +} + +static void apply_temporal_filter_planewise( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + const double sigma, const int decay_control, const int use_subblock, + const int block_mse, const int *subblock_mses, const int q_factor, + unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error, + uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) { + assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5); + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL); + + uint32_t acc_5x5_sse[BH][BW]; + const double h = decay_control * (0.7 + log(sigma + 1.0)); + const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1); + uint16_t *frame_sse = + (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error; + + if (block_width == 32) { + get_squared_error_32x32_avx2(frame1, stride, frame2, stride2, block_width, + block_height, frame_sse, SSE_STRIDE); + } else { + get_squared_error_16x16_avx2(frame1, stride, frame2, stride2, block_width, + block_height, frame_sse, SSE_STRIDE); + } + + __m256i vsrc[5]; + + // Traverse 4 columns at a time + // First and last columns will require padding + for (int col = 0; col < block_width; col += 4) { + uint16_t *src = (col) ? frame_sse + col - 2 : frame_sse; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + vsrc[i] = xx_load_and_pad(src, col, block_width); + src += SSE_STRIDE; + } + + // Copy first row to first 2 vectors + vsrc[0] = vsrc[2]; + vsrc[1] = vsrc[2]; + + for (int row = 0; row < block_height; row++) { + __m256i vsum = _mm256_setzero_si256(); + + // Add 5 consecutive rows + for (int i = 0; i < 5; i++) { + vsum = _mm256_add_epi32(vsum, vsrc[i]); + } + + // Push all elements by one element to the top + for (int i = 0; i < 4; i++) { + vsrc[i] = vsrc[i + 1]; + } + + // Load next row to the last element + if (row <= block_height - 4) { + vsrc[4] = xx_load_and_pad(src, col, block_width); + src += SSE_STRIDE; + } else { + vsrc[4] = vsrc[3]; + } + + // Accumulate the sum horizontally + for (int i = 0; i < 4; i++) { + acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum, i); + } + } + } + + for (int i = 0, k = 0; i < block_height; i++) { + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + + int diff_sse = acc_5x5_sse[i][j]; + int num_ref_pixels = + TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH; + + // Filter U-plane and V-plane using Y-plane. 
This is because motion + // search is only done on Y-plane, so the information from Y-plane will + // be more accurate. + if (plane != PLANE_TYPE_Y) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj; // X-coord on Y-plane. + diff_sse += luma_sq_error[yy * SSE_STRIDE + xx]; + ++num_ref_pixels; + } + } + } + + const double window_error = (double)(diff_sse) / num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = + (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse); + + const double scaled_diff = + AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0); + const int adjusted_weight = + (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE); + + count[k] += adjusted_weight; + accumulator[k] += adjusted_weight * pixel_value; + } + } +} + +void av1_apply_temporal_filter_planewise_avx2( + const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const int use_subblock, + const int block_mse, const int *subblock_mses, const int q_factor, + const uint8_t *pred, uint32_t *accum, uint16_t *count) { + const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH; + if (is_high_bitdepth) { + assert(0 && "Only support low bit-depth with avx2!"); + } + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + + const int frame_height = ref_frame->heights[0] << mbd->plane[0].subsampling_y; + const int decay_control = frame_height >= 720 ? 4 : 3; + + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int mb_pels = mb_height * mb_width; + uint16_t luma_sq_error[SSE_STRIDE * BH]; + uint16_t *chroma_sq_error = + (num_planes > 0) + ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t)) + : NULL; + + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = ref_frame->strides[plane == 0 ? 0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint8_t *ref = ref_frame->buffers[plane] + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y; + + apply_temporal_filter_planewise( + ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h, + noise_levels[plane], decay_control, use_subblock, block_mse, + subblock_mses, q_factor, accum + mb_pels * plane, + count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane, + ss_x_shift, ss_y_shift); + } + if (chroma_sq_error != NULL) aom_free(chroma_sq_error); +} diff --git a/libs/libaom/src/av1/encoder/x86/temporal_filter_constants.h b/libs/libaom/src/av1/encoder/x86/temporal_filter_constants.h new file mode 100644 index 000000000..7cd61d75e --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/temporal_filter_constants.h @@ -0,0 +1,407 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#define AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ + +// Division using multiplication and shifting. The C implementation does: +// modifier *= 3; +// modifier /= index; +// where 'modifier' is a set of summed values and 'index' is the number of +// summed values. +// +// This equation works out to (m * 3) / i which reduces to: +// m * 3/4 +// m * 1/2 +// m * 1/3 +// +// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): +// m * C / 65536 +// we can choose a constant C that replicates the division. +// +// m * 49152 / 65536 = m * 3/4 +// m * 32768 / 65536 = m * 1/2 +// m * 21846 / 65536 = m * 0.3333 +// +// These are loaded using an instruction expecting int16_t values but are used +// with _mm_mulhi_epu16(), which treats them as unsigned. +#define NEIGHBOR_CONSTANT_4 (int16_t)49152 +#define NEIGHBOR_CONSTANT_5 (int16_t)39322 +#define NEIGHBOR_CONSTANT_6 (int16_t)32768 +#define NEIGHBOR_CONSTANT_7 (int16_t)28087 +#define NEIGHBOR_CONSTANT_8 (int16_t)24576 +#define NEIGHBOR_CONSTANT_9 (int16_t)21846 +#define NEIGHBOR_CONSTANT_10 (int16_t)19661 +#define NEIGHBOR_CONSTANT_11 (int16_t)17874 +#define NEIGHBOR_CONSTANT_13 (int16_t)15124 + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, +
NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + 
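// ---------------------------------------------------------------------------
// Illustrative only -- the following standalone sketch is not part of the
// upstream file. It demonstrates the multiply-and-shift division documented
// above: mulhi_u16() mirrors what _mm_mulhi_epu16() computes per 16-bit lane,
// and the constants are NEIGHBOR_CONSTANT_4/6/9 treated as unsigned. The
// names mulhi_u16 and m are hypothetical, used only for this sketch.
// ---------------------------------------------------------------------------
#include <stdint.h>
#include <stdio.h>

// Per-lane equivalent of _mm_mulhi_epu16: the high 16 bits of an unsigned
// 16x16-bit product, i.e. (m * c) >> 16.
static uint16_t mulhi_u16(uint16_t m, uint16_t c) {
  return (uint16_t)(((uint32_t)m * c) >> 16);
}

int main(void) {
  const uint16_t m = 1000;  // an arbitrary summed 'modifier' value
  // index 4: exact (m * 3) / 4 vs. m * NEIGHBOR_CONSTANT_4 / 65536
  printf("%u %u\n", (unsigned)(m * 3 / 4), (unsigned)mulhi_u16(m, 49152));
  // index 6: exact (m * 3) / 6 vs. m * NEIGHBOR_CONSTANT_6 / 65536
  printf("%u %u\n", (unsigned)(m * 3 / 6), (unsigned)mulhi_u16(m, 32768));
  // index 9: exact (m * 3) / 9 vs. m * NEIGHBOR_CONSTANT_9 / 65536
  printf("%u %u\n", (unsigned)(m * 3 / 9), (unsigned)mulhi_u16(m, 21846));
  return 0;  // prints: 750 750, 500 500, 333 333
}
// ---------------------------------------------------------------------------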
+DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4 +}; + +#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U +#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U +#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U +#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U +#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U +#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U +#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U +#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U +#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, 
HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const 
HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4 + }; + +#define DIST_STRIDE ((BW) + 2) +#endif // AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ diff --git a/libs/libaom/src/av1/encoder/x86/temporal_filter_sse2.c b/libs/libaom/src/av1/encoder/x86/temporal_filter_sse2.c new file mode 100644 index 000000000..1722fac86 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/temporal_filter_sse2.c @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <emmintrin.h> + +#include "config/av1_rtcd.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" + +// For the squared error buffer, keep a padding of 4 samples +#define SSE_STRIDE (BW + 4) + +DECLARE_ALIGNED(32, static const uint32_t, sse_bytemask_2x4[4][2][4]) = { + { { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 } }, + { { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 } }, + { { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 } }, + { { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF } } +}; + +static void get_squared_error(const uint8_t *frame1, const unsigned int stride, + const uint8_t *frame2, const unsigned int stride2, + const int block_width, const int block_height, + uint16_t *frame_sse, + const unsigned int dst_stride) { + const uint8_t *src1 = frame1; + const uint8_t *src2 = frame2; + uint16_t *dst = frame_sse; + + for (int i = 0; i < block_height; i++) { + for (int j = 0; j < block_width; j += 16) { + // Zero the uninitialized memory to avoid uninitialized loads later + *(uint32_t *)(dst) = _mm_cvtsi128_si32(_mm_setzero_si128()); + + __m128i vsrc1 = _mm_loadu_si128((__m128i *)(src1 + j)); + __m128i vsrc2 = _mm_loadu_si128((__m128i *)(src2 + j)); + + __m128i vmax = _mm_max_epu8(vsrc1, vsrc2); + __m128i vmin = _mm_min_epu8(vsrc1, vsrc2); + __m128i vdiff = _mm_subs_epu8(vmax, vmin); + + __m128i vzero = _mm_setzero_si128(); + __m128i vdiff1 = _mm_unpacklo_epi8(vdiff, vzero); + __m128i vdiff2 = _mm_unpackhi_epi8(vdiff, vzero); + + __m128i vres1 = _mm_mullo_epi16(vdiff1, vdiff1); + __m128i vres2 = _mm_mullo_epi16(vdiff2, vdiff2); + + _mm_storeu_si128((__m128i *)(dst + j + 2), vres1); + _mm_storeu_si128((__m128i *)(dst + j + 10), vres2); + } + + // Zero the uninitialized memory to avoid uninitialized loads later + *(uint32_t *)(dst + block_width + 2) = + _mm_cvtsi128_si32(_mm_setzero_si128()); + + src1 += stride; + src2 += stride2; + dst += dst_stride; + } +} + +static void xx_load_and_pad(uint16_t *src, __m128i *dstvec, int col, + int block_width) { + __m128i vtmp = _mm_loadu_si128((__m128i *)src); + __m128i vzero = _mm_setzero_si128(); + __m128i vtmp1 = _mm_unpacklo_epi16(vtmp, vzero); + __m128i vtmp2 = _mm_unpackhi_epi16(vtmp, vzero); + // For the first column, replicate the first element twice to the left + dstvec[0] = (col) ? vtmp1 : _mm_shuffle_epi32(vtmp1, 0xEA); + // For the last column, replicate the last element twice to the right + dstvec[1] = (col < block_width - 4) ?
vtmp2 : _mm_shuffle_epi32(vtmp2, 0x54); +} + +static int32_t xx_mask_and_hadd(__m128i vsum1, __m128i vsum2, int i) { + __m128i veca, vecb; + // Mask and obtain the required 5 values inside the vector + veca = _mm_and_si128(vsum1, *(__m128i *)sse_bytemask_2x4[i][0]); + vecb = _mm_and_si128(vsum2, *(__m128i *)sse_bytemask_2x4[i][1]); + // A = [A0+B0, A1+B1, A2+B2, A3+B3] + veca = _mm_add_epi32(veca, vecb); + // B = [A2+B2, A3+B3, 0, 0] + vecb = _mm_srli_si128(veca, 8); + // A = [A0+B0+A2+B2, A1+B1+A3+B3, X, X] + veca = _mm_add_epi32(veca, vecb); + // B = [A1+B1+A3+B3, 0, 0, 0] + vecb = _mm_srli_si128(veca, 4); + // A = [A0+B0+A2+B2+A1+B1+A3+B3, X, X, X] + veca = _mm_add_epi32(veca, vecb); + return _mm_cvtsi128_si32(veca); +} + +static void apply_temporal_filter_planewise( + const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2, + const unsigned int stride2, const int block_width, const int block_height, + const double sigma, const int decay_control, const int use_subblock, + const int block_mse, const int *subblock_mses, const int q_factor, + unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error, + uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) { + assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5); + assert(((block_width == 16) || (block_width == 32)) && + ((block_height == 16) || (block_height == 32))); + if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL); + + uint32_t acc_5x5_sse[BH][BW]; + const double h = decay_control * (0.7 + log(sigma + 1.0)); + const double q = AOMMIN((double)(q_factor * q_factor) / 256.0, 1); + uint16_t *frame_sse = + (plane == PLANE_TYPE_Y) ? luma_sq_error : chroma_sq_error; + + get_squared_error(frame1, stride, frame2, stride2, block_width, block_height, + frame_sse, SSE_STRIDE); + + __m128i vsrc[5][2]; + + // Traverse 4 columns at a time + // First and last columns will require padding + for (int col = 0; col < block_width; col += 4) { + uint16_t *src = frame_sse + col; + + // Load and pad(for first and last col) 3 rows from the top + for (int i = 2; i < 5; i++) { + xx_load_and_pad(src, vsrc[i], col, block_width); + src += SSE_STRIDE; + } + + // Padding for top 2 rows + vsrc[0][0] = vsrc[2][0]; + vsrc[0][1] = vsrc[2][1]; + vsrc[1][0] = vsrc[2][0]; + vsrc[1][1] = vsrc[2][1]; + + for (int row = 0; row < block_height; row++) { + __m128i vsum1 = _mm_setzero_si128(); + __m128i vsum2 = _mm_setzero_si128(); + + // Add 5 consecutive rows + for (int i = 0; i < 5; i++) { + vsum1 = _mm_add_epi32(vsrc[i][0], vsum1); + vsum2 = _mm_add_epi32(vsrc[i][1], vsum2); + } + + // Push all elements by one element to the top + for (int i = 0; i < 4; i++) { + vsrc[i][0] = vsrc[i + 1][0]; + vsrc[i][1] = vsrc[i + 1][1]; + } + + if (row <= block_height - 4) { + // Load next row + xx_load_and_pad(src, vsrc[4], col, block_width); + src += SSE_STRIDE; + } else { + // Padding for bottom 2 rows + vsrc[4][0] = vsrc[3][0]; + vsrc[4][1] = vsrc[3][1]; + } + + // Accumulate the sum horizontally + for (int i = 0; i < 4; i++) { + acc_5x5_sse[row][col + i] = xx_mask_and_hadd(vsum1, vsum2, i); + } + } + } + + for (int i = 0, k = 0; i < block_height; i++) { + for (int j = 0; j < block_width; j++, k++) { + const int pixel_value = frame2[i * stride2 + j]; + + int diff_sse = acc_5x5_sse[i][j]; + int num_ref_pixels = + TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH; + + // Filter U-plane and V-plane using Y-plane. 
This is because motion + // search is only done on Y-plane, so the information from Y-plane will + // be more accurate. + if (plane != PLANE_TYPE_Y) { + for (int ii = 0; ii < (1 << ss_y_shift); ++ii) { + for (int jj = 0; jj < (1 << ss_x_shift); ++jj) { + const int yy = (i << ss_y_shift) + ii; // Y-coord on Y-plane. + const int xx = (j << ss_x_shift) + jj + 2; // X-coord on Y-plane. + const int ww = SSE_STRIDE; // Stride of Y-plane. + diff_sse += luma_sq_error[yy * ww + xx]; + ++num_ref_pixels; + } + } + } + + const double window_error = (double)(diff_sse) / num_ref_pixels; + const int subblock_idx = + (i >= block_height / 2) * 2 + (j >= block_width / 2); + const double block_error = + (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse); + + const double scaled_diff = + AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0); + const int adjusted_weight = + (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE); + + count[k] += adjusted_weight; + accumulator[k] += adjusted_weight * pixel_value; + } + } +} + +void av1_apply_temporal_filter_planewise_sse2( + const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const double *noise_levels, const int use_subblock, + const int block_mse, const int *subblock_mses, const int q_factor, + const uint8_t *pred, uint32_t *accum, uint16_t *count) { + const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH; + if (is_high_bitdepth) { + assert(0 && "Only support low bit-depth with sse2!"); + } + assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE); + + const int frame_height = ref_frame->heights[0] << mbd->plane[0].subsampling_y; + const int decay_control = frame_height >= 720 ? 4 : 3; + + const int mb_height = block_size_high[block_size]; + const int mb_width = block_size_wide[block_size]; + const int mb_pels = mb_height * mb_width; + uint16_t luma_sq_error[SSE_STRIDE * BH]; + uint16_t *chroma_sq_error = + (num_planes > 0) + ? (uint16_t *)aom_malloc(SSE_STRIDE * BH * sizeof(uint16_t)) + : NULL; + + for (int plane = 0; plane < num_planes; ++plane) { + const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y; + const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x; + const uint32_t frame_stride = ref_frame->strides[plane == 0 ? 0 : 1]; + const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w; + + const uint8_t *ref = ref_frame->buffers[plane] + frame_offset; + const int ss_x_shift = + mbd->plane[plane].subsampling_x - mbd->plane[0].subsampling_x; + const int ss_y_shift = + mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y; + + apply_temporal_filter_planewise( + ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h, + noise_levels[plane], decay_control, use_subblock, block_mse, + subblock_mses, q_factor, accum + mb_pels * plane, + count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane, + ss_x_shift, ss_y_shift); + } + if (chroma_sq_error != NULL) aom_free(chroma_sq_error); +} diff --git a/libs/libaom/src/av1/encoder/x86/temporal_filter_sse4.c b/libs/libaom/src/av1/encoder/x86/temporal_filter_sse4.c new file mode 100644 index 000000000..e3f9f5f27 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/temporal_filter_sse4.c @@ -0,0 +1,2044 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "config/av1_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" +#include "av1/encoder/x86/temporal_filter_constants.h" + +////////////////////////// +// Low bit-depth Begins // +////////////////////////// + +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. +static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a); + const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + + __m128i dist_first; + + dist_first = _mm_sub_epi16(a_first, b_first); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + + _mm_storeu_si128((__m128i *)dst, dist_first); +} + +static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero); + + __m128i dist_first, dist_second; + + dist_first = _mm_sub_epi16(a_first, b_first); + dist_second = _mm_sub_epi16(a_second, b_second); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + dist_second = _mm_mullo_epi16(dist_second, dist_second); + + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 8), dist_second); +} + +static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first, + __m128i *reg_second) { + read_dist_8(dist, reg_first); + read_dist_8(dist + 8, reg_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static __m128i average_8(__m128i sum, const __m128i *mul_constants, + const int strength, const int rounding, + const int weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift.
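+  // (so 'strength' is placed in the low 32 bits of the count register below,
+  // and every 16-bit lane is shifted right by the same amount)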
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, *mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +static __m128i average_4_4(__m128i sum, const __m128i *mul_constants, + const int strength, const int rounding, + const int weight_0, const int weight_1) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = + _mm_setr_epi16(weight_0, weight_0, weight_0, weight_0, weight_1, weight_1, + weight_1, weight_1); + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, *mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +static INLINE void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, + const __m128i *mul_constants_0, + const __m128i *mul_constants_1, + const int strength, const int rounding, + const int weight) { + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + __m128i input_0, input_1; + + input_0 = _mm_mulhi_epu16(*sum_0_u16, *mul_constants_0); + input_0 = _mm_adds_epu16(input_0, rounding_u16); + + input_1 = _mm_mulhi_epu16(*sum_1_u16, *mul_constants_1); + input_1 = _mm_adds_epu16(input_1, rounding_u16); + + input_0 = _mm_srl_epi16(input_0, strength_u128); + input_1 = _mm_srl_epi16(input_1, strength_u128); + + input_0 = _mm_min_epu16(input_0, sixteen); + input_1 = _mm_min_epu16(input_1, sixteen); + input_0 = _mm_sub_epi16(sixteen, input_0); + input_1 = _mm_sub_epi16(sixteen, input_1); + + *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16); + *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16); +} + +// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' 
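+// Per lane, the operation is (illustrative scalar sketch, not upstream text):
+//   count[i]       = sat_add_u16(count[i], sum_u16[i]);  // _mm_adds_epu16
+//   accumulator[i] += (uint16_t)(sum_u16[i] * pred[i]);  // low 16 bits of the
+//                                                        // product, widened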
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, + uint16_t *count, uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8); + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void accumulate_and_store_16(const __m128i sum_0_u16, + const __m128i sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), + count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8)); + __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8), + pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero); + __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32; + __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16); + _mm_storeu_si128((__m128i *)count, count_0_u16); + + count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16); + _mm_storeu_si128((__m128i *)(count + 8), count_1_u16); + + pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16); + pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero); + pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16); + pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8)); + accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32); + accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); + _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32); + _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); +} + +// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int. +static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)y_dist); + dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1)); + + *sum = _mm_adds_epu16(dist_reg, dist_left); + *sum = _mm_adds_epu16(*sum, dist_right); +} + +// Read in 16 pixels from y_dist. 
For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and +// the rest in sum_second. +static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first, + __m128i *sum_second) { + get_sum_8(y_dist, sum_first); + get_sum_8(y_dist + 8, sum_second); +} + +// Read in a row of chroma values corresponds to a row of 16 luma values. +static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, + const uint16_t *v_dist, + __m128i *u_first, __m128i *u_second, + __m128i *v_first, + __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 16 entries from chroma. + read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + __m128i u_reg, v_reg; + + read_dist_8(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi16(u_reg, u_reg); + *u_second = _mm_unpackhi_epi16(u_reg, u_reg); + + read_dist_8(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi16(v_reg, v_reg); + *v_second = _mm_unpackhi_epi16(v_reg, v_reg); + } +} + +// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit +// int in dst. +static INLINE void hadd_epu16(__m128i *src, __m128i *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i shift_right = _mm_srli_si128(*src, 2); + + const __m128i odd = _mm_blend_epi16(shift_right, zero, 170); + const __m128i even = _mm_blend_epi16(*src, zero, 170); + + *dst = _mm_add_epi32(even, odd); +} + +// Add a row of luma distortion to 8 corresponding chroma mods. +static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + __m128i *u_mod, + __m128i *v_mod) { + __m128i y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + __m128i y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = _mm_adds_epu16(y_reg, y_tmp); + } + } else { + __m128i y_first, y_second; + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + __m128i y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = _mm_adds_epu16(y_first, y_tmp_0); + y_second = _mm_adds_epu16(y_second, y_tmp_1); + } + + hadd_epu16(&y_first, &y_first); + hadd_epu16(&y_second, &y_second); + + y_reg = _mm_packus_epi32(y_first, y_second); + } + + *u_mod = _mm_adds_epu16(*u_mod, y_reg); + *v_mod = _mm_adds_epu16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. 
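+// Each output row sums a 3x3 window of luma distortions (3-tap row sums from
+// get_sum_16() rotated through sum_row_1/2/3), adds the co-located chroma
+// distortions, and then average_8()/average_16() turn that total into the
+// per-pixel weight which accumulate_and_store_16() applies.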
+static void apply_temporal_filter_luma_16( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, + uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist, + const uint16_t *v_dist, const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 0); + assert(strength <= 6); + + assert(block_width == 16); + + (void)block_width; + + // First row + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + if (blk_fw) { + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); + sum_row_second = + average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); + } else { + average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, + strength, rounding, weight); + } + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first); + 
sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + if (blk_fw) { + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); + sum_row_second = + average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); + } else { + average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, + strength, rounding, weight); + } + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + if (blk_fw) { + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); + sum_row_second = + average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); + } else { + average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, + strength, rounding, weight); + } + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. 
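+//
+// For reference, a scalar model of the per-pixel update that the SSE4.1
+// routines in this file vectorize. `num_summed` stands in for the per-lane
+// counts baked into the *_NEIGHBORS tables and `filter_weight` for the
+// top/bottom/subblock weight in effect; this is an illustrative sketch, not
+// part of the tuned implementation.
+static INLINE void scalar_filter_pixel_sketch(uint32_t dist_sum,
+                                              int num_summed, int strength,
+                                              int filter_weight, uint8_t pred,
+                                              uint16_t *count,
+                                              uint32_t *accum) {
+  const int rounding = (1 << strength) >> 1;
+  // The neighbor tables fold the "modifier * 3 / num_summed" step into a
+  // single fixed-point multiply; here it is written out directly.
+  int mod = (int)(((uint64_t)dist_sum * 3) / num_summed);
+  mod = (mod + rounding) >> strength;
+  if (mod > 16) mod = 16;
+  mod = (16 - mod) * filter_weight;
+  // The SIMD code saturates the 16-bit count add; omitted here for brevity.
+  *count += (uint16_t)mod;
+  *accum += (uint32_t)mod * pred;
+}
+
+// The dispatcher below applies the same update 16 luma pixels at a time,
+// walking the block in left / middle / right column groups.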
+static void apply_temporal_filter_luma(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+    uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist,
+    const uint16_t *u_dist, const uint16_t *v_dist) {
+  unsigned int blk_col = 0, uv_blk_col = 0;
+  const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
+  const unsigned int mid_width = block_width >> 1,
+                     last_width = block_width - blk_col_step;
+  int top_weight = blk_fw[0],
+      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+  const int16_t *const *neighbors_first;
+  const int16_t *const *neighbors_second;
+
+  if (block_width == 16) {
+    // Special Case: The block width is 16 and we are operating on a row of 16
+    // chroma pixels. In this case, we can't use the usual left-middle-right
+    // pattern. We also don't support splitting now.
+    neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+    neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+    if (use_whole_blk) {
+      apply_temporal_filter_luma_16(
+          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
+          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+          v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+          bottom_weight, NULL);
+    } else {
+      apply_temporal_filter_luma_16(
+          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
+          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+          v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw);
+    }
+
+    return;
+  }
+
+  // Left
+  neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+  neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  apply_temporal_filter_luma_16(
+      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+      neighbors_second, top_weight, bottom_weight, NULL);
+
+  blk_col += blk_col_step;
+  uv_blk_col += uv_blk_col_step;
+
+  // Middle First
+  neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+  for (; blk_col < mid_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    apply_temporal_filter_luma_16(
+        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
+        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+        bottom_weight, NULL);
+  }
+
+  if (!use_whole_blk) {
+    top_weight = blk_fw[1];
+    bottom_weight = blk_fw[3];
+  }
+
+  // Middle Second
+  for (; blk_col < last_width;
+       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+    
apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, + ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } + + // Right + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_chroma_8( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int uv_block_width, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul; + + __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3; + __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3; + + __m128i u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul = _mm_loadu_si128((const __m128i *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + if (blk_fw) { + u_sum_row = + average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + v_sum_row = + average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + } else { + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); + } + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the 
last one + mul = _mm_loadu_si128((const __m128i *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + if (blk_fw) { + u_sum_row = average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], + blk_fw[1]); + v_sum_row = average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], + blk_fw[1]); + } else { + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); + } + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = _mm_loadu_si128((const __m128i *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + if (blk_fw) { + u_sum_row = + average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + v_sum_row = + average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + } else { + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); + } + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. 
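+//
+// A note on the *_COLUMN_NEIGHBORS tables selected below (an informal
+// summary; the tables themselves are defined earlier in this file): each
+// output pixel's modifier sums a 3x3 window of distortions, so edge lanes sum
+// fewer values than interior lanes (4 in corners, 6 on edges, 9 inside, plus
+// the cross-plane terms). The (ss_x, ss_y) pair changes how many luma values
+// feed each chroma lane, hence the NO_SS / SINGLE_SS / DOUBLE_SS variants.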
+static void apply_temporal_filter_chroma( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } else { + apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + 
uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +static void apply_temporal_filter_yuv( + const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int strength, const int use_subblock, + const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int use_whole_blk = !use_subblock; + const int *blk_fw = subblock_filter_weights; + + // Block information (Y-plane). + const unsigned int block_height = block_size_high[block_size]; + const unsigned int block_width = block_size_wide[block_size]; + const int mb_pels = block_height * block_width; + const int y_src_stride = ref_frame->y_stride; + const int y_pre_stride = block_width; + const int mb_y_src_offset = + mb_row * block_height * ref_frame->y_stride + mb_col * block_width; + + // Block information (UV-plane). 
+  const int ss_y = mbd->plane[1].subsampling_y;
+  const int ss_x = mbd->plane[1].subsampling_x;
+  const unsigned int uv_height = block_height >> ss_y;
+  const unsigned int uv_width = block_width >> ss_x;
+  const int uv_src_stride = ref_frame->uv_stride;
+  const int uv_pre_stride = block_width >> ss_x;
+  const int mb_uv_src_offset =
+      mb_row * uv_height * ref_frame->uv_stride + mb_col * uv_width;
+
+  const uint8_t *y_src = ref_frame->y_buffer + mb_y_src_offset;
+  const uint8_t *u_src = ref_frame->u_buffer + mb_uv_src_offset;
+  const uint8_t *v_src = ref_frame->v_buffer + mb_uv_src_offset;
+  const uint8_t *y_pre = pred;
+  const uint8_t *u_pre = pred + mb_pels;
+  const uint8_t *v_pre = pred + mb_pels * 2;
+  uint32_t *y_accum = accum;
+  uint32_t *u_accum = accum + mb_pels;
+  uint32_t *v_accum = accum + mb_pels * 2;
+  uint16_t *y_count = count;
+  uint16_t *u_count = count + mb_pels;
+  uint16_t *v_count = count + mb_pels * 2;
+
+  const unsigned int chroma_height = block_height >> ss_y,
+                     chroma_width = block_width >> ss_x;
+
+  DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+  const int *blk_fw_ptr = blk_fw;
+
+  uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+           *v_dist_ptr = v_dist + 1;
+  const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+  const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+  // Loop variables
+  unsigned int row, blk_col;
+
+  assert(block_width <= BW && "block width too large");
+  assert(block_height <= BH && "block height too large");
+  assert(block_width % 16 == 0 && "block width must be multiple of 16");
+  assert(block_height % 2 == 0 && "block height must be even");
+  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+         "invalid chroma subsampling");
+  assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be non-negative");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be non-negative");
+  assert(blk_fw[0] <= 2 && "subblock filter weight must not exceed 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must not exceed 2");
+
+  // Precompute the squared differences.
+  for (row = 0; row < block_height; row++) {
+    for (blk_col = 0; blk_col < block_width; blk_col += 16) {
+      store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+                    y_dist_ptr + blk_col);
+    }
+    y_src_ptr += y_src_stride;
+    y_pre_ptr += y_pre_stride;
+    y_dist_ptr += DIST_STRIDE;
+  }
+
+  for (row = 0; row < chroma_height; row++) {
+    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+      store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+                   u_dist_ptr + blk_col);
+      store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+                   v_dist_ptr + blk_col);
+    }
+
+    u_src_ptr += uv_src_stride;
+    u_pre_ptr += uv_pre_stride;
+    u_dist_ptr += DIST_STRIDE;
+    v_src_ptr += uv_src_stride;
+    v_pre_ptr += uv_pre_stride;
+    v_dist_ptr += DIST_STRIDE;
+  }
+
+  y_dist_ptr = y_dist + 1;
+  u_dist_ptr = u_dist + 1;
+  v_dist_ptr = v_dist + 1;
+
+  apply_temporal_filter_luma(y_src, y_src_stride, y_pre, y_pre_stride, u_src,
+                             v_src, uv_src_stride, u_pre, v_pre, uv_pre_stride,
+                             block_width, block_height, ss_x, ss_y, strength,
+                             blk_fw_ptr, use_whole_blk, y_accum, y_count,
+                             y_dist_ptr, u_dist_ptr, v_dist_ptr);
+
+  
apply_temporal_filter_chroma(
+      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
+      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+      strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
+      y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
+
+////////////////////////
+// Low bit-depth Ends //
+////////////////////////
+
+///////////////////////////
+// High bit-depth Begins //
+///////////////////////////
+
+// Compute (a-b)**2 for 8 16-bit pixels.
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+                                       uint32_t *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+  const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
+  const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
+  const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
+  const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
+
+  __m128i dist_first, dist_second;
+
+  dist_first = _mm_sub_epi32(a_first, b_first);
+  dist_second = _mm_sub_epi32(a_second, b_second);
+  dist_first = _mm_mullo_epi32(dist_first, dist_first);
+  dist_second = _mm_mullo_epi32(dist_second, dist_second);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+  _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
+}
+
+// Sum up three neighboring distortions for the pixels
+static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
+  __m128i dist_reg, dist_left, dist_right;
+
+  dist_reg = _mm_loadu_si128((const __m128i *)dist);
+  dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
+  dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
+
+  *sum = _mm_add_epi32(dist_reg, dist_left);
+  *sum = _mm_add_epi32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
+                                    __m128i *sum_second) {
+  highbd_get_sum_4(dist, sum_first);
+  highbd_get_sum_4(dist + 4, sum_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values, plus
+// however many values from the y/uv planes are added in).
+//
+// Add in the rounding factor and shift, clamp to 16, subtract from 16, and
+// multiply by the weight.
+static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
+                                    const __m128i *mul_constants,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  // _mm_srl_epi32 uses the lower 64 bit value for the shift.
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u32 = _mm_set1_epi32(rounding); + const __m128i weight_u32 = _mm_set1_epi32(weight); + const __m128i sixteen = _mm_set1_epi32(16); + const __m128i zero = _mm_setzero_si128(); + + // modifier * 3 / index; + const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero); + const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero); + const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero); + const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero); + + const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo); + const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32); + const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi); + const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32); + + // Now we have + // mul_lo: 00 a1 00 a0 + // mul_hi: 00 a3 00 a2 + // Unpack as 64 bit words to get even and odd elements + // unpack_lo: 00 a2 00 a0 + // unpack_hi: 00 a3 00 a1 + // Then we can shift and OR the results to get everything in 32-bits + const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4); + const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift); + + // Round + *output = _mm_add_epi32(mul, rounding_u32); + *output = _mm_srl_epi32(*output, strength_u128); + + // Multiply with the weight + *output = _mm_min_epu32(*output, sixteen); + *output = _mm_sub_epi32(sixteen, *output); + *output = _mm_mullo_epi32(*output, weight_u32); +} + +static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1, + const __m128i *sum_0_u32, + const __m128i *sum_1_u32, + const __m128i *mul_constants_0, + const __m128i *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' 
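+// A scalar model of this update for one pixel i (sketch only; the SIMD code
+// below uses a saturating 16-bit add for the count):
+//   count[i] += mod[i];
+//   accum[i] += (uint32_t)mod[i] * pred[i];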
+static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32, + const __m128i sum_second_u32, + const uint16_t *pred, + uint16_t *count, + uint32_t *accumulator) { + // Cast down to 16-bit ints + const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32); + const __m128i zero = _mm_setzero_si128(); + + __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first, + __m128i *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first, + __m128i *u_second, __m128i *v_first, __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. 
+    highbd_read_dist_8(u_dist, u_first, u_second);
+    highbd_read_dist_8(v_dist, v_first, v_second);
+  } else {  // ss_x == 1
+    // Otherwise, we only need to load 4 entries.
+    __m128i u_reg, v_reg;
+
+    highbd_read_dist_4(u_dist, &u_reg);
+
+    *u_first = _mm_unpacklo_epi32(u_reg, u_reg);
+    *u_second = _mm_unpackhi_epi32(u_reg, u_reg);
+
+    highbd_read_dist_4(v_dist, &v_reg);
+
+    *v_first = _mm_unpacklo_epi32(v_reg, v_reg);
+    *v_second = _mm_unpackhi_epi32(v_reg, v_reg);
+  }
+}
+
+static void highbd_apply_temporal_filter_luma_8(
+    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
+    uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist,
+    const uint32_t *v_dist, const uint32_t *const *neighbors_first,
+    const uint32_t *const *neighbors_second, int top_weight,
+    int bottom_weight) {
+  const int rounding = (1 << strength) >> 1;
+  int weight = top_weight;
+
+  __m128i mul_first, mul_second;
+
+  __m128i sum_row_1_first, sum_row_1_second;
+  __m128i sum_row_2_first, sum_row_2_second;
+  __m128i sum_row_3_first, sum_row_3_second;
+
+  __m128i u_first, u_second;
+  __m128i v_first, v_second;
+
+  __m128i sum_row_first;
+  __m128i sum_row_second;
+
+  // Loop variables
+  unsigned int h;
+
+  assert(strength >= 0 && strength <= 14 &&
+         "invalid adjusted temporal filter strength");
+  assert(block_width == 8);
+
+  (void)block_width;
+
+  // First row
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+  // Add luma values
+  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+  // We don't need to saturate here because the maximum value is
+  // UINT12_MAX ** 2 * 9 ~= 2**24 * 9 < 2**28 < INT32_MAX.
+  sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first);
+  sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second);
+
+  // Add chroma values
+  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+                                &v_first, &v_second);
+
+  // Max value here is 2**24 * (9 + 2), so no saturation is needed.
+  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+
+  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+  // Get modifier and store result
+  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
+                   weight);
+
+  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+                                y_accum);
+
+  y_src += y_src_stride;
+  y_pre += y_pre_stride;
+  y_count += y_pre_stride;
+  y_accum += y_pre_stride;
+  y_dist += DIST_STRIDE;
+
+  u_src += uv_src_stride;
+  u_pre += uv_pre_stride;
+  u_dist += DIST_STRIDE;
+  v_src += uv_src_stride;
+  v_pre += uv_pre_stride;
+  v_dist += DIST_STRIDE;
+
+  // Then all the rows except the last one
+  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
+  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
+
+  for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to the bottom half
+    if (!use_whole_blk && h == block_height / 2) {
+      weight = bottom_weight;
+    }
+    
// Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. 
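+//
+// Note: unlike the low bit-depth path, which processes 16 luma pixels per
+// call, the high bit-depth path works 8 at a time: the distortions here are
+// 32-bit (squares of up to 12-bit samples), so a pair of __m128i registers
+// only covers 8 lanes.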
+static void highbd_apply_temporal_filter_luma( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. 
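+// With ss_x == 1 there are 16 luma sums; they are folded pairwise with
+// _mm_hadd_epi32 below so that each of the 8 chroma lanes receives the two
+// luma values it covers. With ss_y == 1 the row below is accumulated first.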
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst, + __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) { + __m128i y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst); + y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + __m128i y_fst, y_snd; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_fst = _mm_hadd_epi32(y_fst, y_snd); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_snd = _mm_hadd_epi32(y_fst, y_snd); + } + + *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst); + *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd); + *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst); + *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void highbd_apply_temporal_filter_chroma_8( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int uv_block_width, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_fst, mul_snd; + + __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + __m128i u_sum_row_fst, v_sum_row_fst; + __m128i u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst); + 
v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[1]); + mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + 
highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. 
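+//
+// This mirrors the low bit-depth chroma dispatcher above, except that each
+// 8-wide chroma row needs a pair of neighbor tables (first and second four
+// lanes) because the 32-bit sums span two registers.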
+static void highbd_apply_temporal_filter_chroma( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, + top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // 
Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, + top_weight, bottom_weight, NULL); +} + +static void highbd_apply_temporal_filter_yuv( + const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int strength, const int use_subblock, + const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int use_whole_blk = !use_subblock; + const int *blk_fw = subblock_filter_weights; + + // Block information (Y-plane). + const unsigned int block_height = block_size_high[block_size]; + const unsigned int block_width = block_size_wide[block_size]; + const int mb_pels = block_height * block_width; + const int y_src_stride = ref_frame->y_stride; + const int y_pre_stride = block_width; + const int mb_y_src_offset = + mb_row * block_height * ref_frame->y_stride + mb_col * block_width; + + // Block information (UV-plane). 
+ const int ss_y = mbd->plane[1].subsampling_y; + const int ss_x = mbd->plane[1].subsampling_x; + const unsigned int uv_height = block_height >> ss_y; + const unsigned int uv_width = block_width >> ss_x; + const int uv_src_stride = ref_frame->uv_stride; + const int uv_pre_stride = block_width >> ss_x; + const int mb_uv_src_offset = + mb_row * uv_height * ref_frame->uv_stride + mb_col * uv_width; + + const uint8_t *y_src = ref_frame->y_buffer + mb_y_src_offset; + const uint8_t *u_src = ref_frame->u_buffer + mb_uv_src_offset; + const uint8_t *v_src = ref_frame->v_buffer + mb_uv_src_offset; + const uint8_t *y_pre = pred; + const uint8_t *u_pre = pred + mb_pels; + const uint8_t *v_pre = pred + mb_pels * 2; + uint32_t *y_accum = accum; + uint32_t *u_accum = accum + mb_pels; + uint32_t *v_accum = accum + mb_pels * 2; + uint16_t *y_count = count; + uint16_t *u_count = count + mb_pels; + uint16_t *v_count = count + mb_pels * 2; + + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = CONVERT_TO_SHORTPTR(y_src), + *u_src_ptr = CONVERT_TO_SHORTPTR(u_src), + *v_src_ptr = CONVERT_TO_SHORTPTR(v_src); + const uint16_t *y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre), + *u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre), + *v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre); + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_src_ptr = CONVERT_TO_SHORTPTR(y_src), + u_src_ptr = CONVERT_TO_SHORTPTR(u_src), + v_src_ptr = CONVERT_TO_SHORTPTR(v_src); + y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre), + u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre), + v_pre_ptr =
CONVERT_TO_SHORTPTR(v_pre); + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + highbd_apply_temporal_filter_luma( + y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr, + uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, y_accum, + y_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); + + highbd_apply_temporal_filter_chroma( + y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr, + uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum, + u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); +} + +///////////////////////// +// High bit-depth Ends // +///////////////////////// + +void av1_apply_temporal_filter_yuv_sse4_1( + const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd, + const BLOCK_SIZE block_size, const int mb_row, const int mb_col, + const int num_planes, const int strength, const int use_subblock, + const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, + uint16_t *count) { + const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH; + // TODO(any): Need to support when `num_planes != 3`, like C implementation. + assert(num_planes == 3); + (void)num_planes; + if (is_high_bitdepth) { + highbd_apply_temporal_filter_yuv( + ref_frame, mbd, block_size, mb_row, mb_col, strength, use_subblock, + subblock_filter_weights, pred, accum, count); + } else { + apply_temporal_filter_yuv(ref_frame, mbd, block_size, mb_row, mb_col, + strength, use_subblock, subblock_filter_weights, + pred, accum, count); + } +} diff --git a/libs/libaom/src/av1/encoder/x86/wedge_utils_avx2.c b/libs/libaom/src/av1/encoder/x86/wedge_utils_avx2.c new file mode 100644 index 000000000..c06bad8f7 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/wedge_utils_avx2.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <immintrin.h> +#include <stdint.h> + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom/aom_integer.h" + +#include "av1/common/reconinter.h" + +#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) + +/** + * See av1_wedge_sse_from_residuals_c + */ +uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d, + const uint8_t *m, int N) { + int n = -N; + + uint64_t csse; + + const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE); + const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff); + + __m256i v_acc0_q = _mm256_setzero_si256(); + + assert(N % 64 == 0); + + r1 += N; + d += N; + m += N; + + do { + const __m256i v_r0_w = _mm256_lddqu_si256((__m256i *)(r1 + n)); + const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(d + n)); + const __m128i v_m01_b = _mm_lddqu_si128((__m128i *)(m + n)); + + const __m256i v_rd0l_w = _mm256_unpacklo_epi16(v_d0_w, v_r0_w); + const __m256i v_rd0h_w = _mm256_unpackhi_epi16(v_d0_w, v_r0_w); + const __m256i v_m0_w = _mm256_cvtepu8_epi16(v_m01_b); + + const __m256i v_m0l_w = _mm256_unpacklo_epi16(v_m0_w, v_mask_max_w); + const __m256i v_m0h_w = _mm256_unpackhi_epi16(v_m0_w, v_mask_max_w); + + const __m256i v_t0l_d = _mm256_madd_epi16(v_rd0l_w, v_m0l_w); + const __m256i v_t0h_d = _mm256_madd_epi16(v_rd0h_w, v_m0h_w); + + const __m256i v_t0_w = _mm256_packs_epi32(v_t0l_d, v_t0h_d); + + const __m256i v_sq0_d = _mm256_madd_epi16(v_t0_w, v_t0_w); + + const __m256i v_sum0_q = _mm256_add_epi64( + _mm256_and_si256(v_sq0_d, v_zext_q), _mm256_srli_epi64(v_sq0_d, 32)); + + v_acc0_q = _mm256_add_epi64(v_acc0_q, v_sum0_q); + + n += 16; + } while (n); + + v_acc0_q = _mm256_add_epi64(v_acc0_q, _mm256_srli_si256(v_acc0_q, 8)); + __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc0_q); + __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc0_q, 1); + v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1); +#if ARCH_X86_64 + csse = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0); +#else + xx_storel_64(&csse, v_acc_q_0); +#endif + + return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); +} + +/** + * See av1_wedge_sign_from_residuals_c + */ +int8_t av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + int64_t acc; + __m256i v_acc0_d = _mm256_setzero_si256(); + + // Input size limited to 8192 by the use of 32 bit accumulators and m + // being between [0, 64]. Overflow might happen at larger sizes, + // though it is practically impossible on real video input.
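+ // Concretely: |ds[i]| <= 2^15 and m[i] <= 64 = 2^6, so each product has + // magnitude at most 2^21. A madd lane sums two products and v_p0123_d + // folds four madd results, i.e. at most 8 * 2^21 = 2^24 per 32-bit lane + // per iteration; with N < 8192 the loop below runs at most 127 times, + // keeping every lane under 127 * 2^24 < 2^31.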
+ assert(N < 8192); + assert(N % 64 == 0); + + do { + const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m)); + const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32)); + + const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds)); + const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16)); + const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32)); + const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48)); + + const __m256i v_m0_w = + _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b)); + const __m256i v_m1_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1)); + const __m256i v_m2_w = + _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b)); + const __m256i v_m3_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1)); + + const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w); + const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w); + const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w); + const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w); + + const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d); + const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d); + + const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d); + + v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d); + + ds += 64; + m += 64; + + N -= 64; + } while (N); + + __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31); + v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d), + _mm256_unpackhi_epi32(v_acc0_d, v_sign_d)); + + __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8)); + + __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q); + __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1); + v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1); + +#if ARCH_X86_64 + acc = (uint64_t)_mm_extract_epi64(v_acc_q_0, 0); +#else + xx_storel_64(&acc, v_acc_q_0); +#endif + + return acc > limit; +} + +/** + * av1_wedge_compute_delta_squares_c + */ +void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + const __m256i v_neg_w = _mm256_set1_epi32(0xffff0001); + + assert(N % 64 == 0); + + do { + const __m256i v_a0_w = _mm256_lddqu_si256((__m256i *)(a)); + const __m256i v_b0_w = _mm256_lddqu_si256((__m256i *)(b)); + const __m256i v_a1_w = _mm256_lddqu_si256((__m256i *)(a + 16)); + const __m256i v_b1_w = _mm256_lddqu_si256((__m256i *)(b + 16)); + const __m256i v_a2_w = _mm256_lddqu_si256((__m256i *)(a + 32)); + const __m256i v_b2_w = _mm256_lddqu_si256((__m256i *)(b + 32)); + const __m256i v_a3_w = _mm256_lddqu_si256((__m256i *)(a + 48)); + const __m256i v_b3_w = _mm256_lddqu_si256((__m256i *)(b + 48)); + + const __m256i v_ab0l_w = _mm256_unpacklo_epi16(v_a0_w, v_b0_w); + const __m256i v_ab0h_w = _mm256_unpackhi_epi16(v_a0_w, v_b0_w); + const __m256i v_ab1l_w = _mm256_unpacklo_epi16(v_a1_w, v_b1_w); + const __m256i v_ab1h_w = _mm256_unpackhi_epi16(v_a1_w, v_b1_w); + const __m256i v_ab2l_w = _mm256_unpacklo_epi16(v_a2_w, v_b2_w); + const __m256i v_ab2h_w = _mm256_unpackhi_epi16(v_a2_w, v_b2_w); + const __m256i v_ab3l_w = _mm256_unpacklo_epi16(v_a3_w, v_b3_w); + const __m256i v_ab3h_w = _mm256_unpackhi_epi16(v_a3_w, v_b3_w); + + // Negate top word of pairs + const __m256i v_abl0n_w = _mm256_sign_epi16(v_ab0l_w, v_neg_w); + const __m256i v_abh0n_w = _mm256_sign_epi16(v_ab0h_w, v_neg_w); + const __m256i v_abl1n_w = _mm256_sign_epi16(v_ab1l_w, v_neg_w); + const __m256i v_abh1n_w = _mm256_sign_epi16(v_ab1h_w, v_neg_w); + const __m256i v_abl2n_w = _mm256_sign_epi16(v_ab2l_w, v_neg_w); + const 
__m256i v_abh2n_w = _mm256_sign_epi16(v_ab2h_w, v_neg_w); + const __m256i v_abl3n_w = _mm256_sign_epi16(v_ab3l_w, v_neg_w); + const __m256i v_abh3n_w = _mm256_sign_epi16(v_ab3h_w, v_neg_w); + + const __m256i v_r0l_w = _mm256_madd_epi16(v_ab0l_w, v_abl0n_w); + const __m256i v_r0h_w = _mm256_madd_epi16(v_ab0h_w, v_abh0n_w); + const __m256i v_r1l_w = _mm256_madd_epi16(v_ab1l_w, v_abl1n_w); + const __m256i v_r1h_w = _mm256_madd_epi16(v_ab1h_w, v_abh1n_w); + const __m256i v_r2l_w = _mm256_madd_epi16(v_ab2l_w, v_abl2n_w); + const __m256i v_r2h_w = _mm256_madd_epi16(v_ab2h_w, v_abh2n_w); + const __m256i v_r3l_w = _mm256_madd_epi16(v_ab3l_w, v_abl3n_w); + const __m256i v_r3h_w = _mm256_madd_epi16(v_ab3h_w, v_abh3n_w); + + const __m256i v_r0_w = _mm256_packs_epi32(v_r0l_w, v_r0h_w); + const __m256i v_r1_w = _mm256_packs_epi32(v_r1l_w, v_r1h_w); + const __m256i v_r2_w = _mm256_packs_epi32(v_r2l_w, v_r2h_w); + const __m256i v_r3_w = _mm256_packs_epi32(v_r3l_w, v_r3h_w); + + _mm256_store_si256((__m256i *)(d), v_r0_w); + _mm256_store_si256((__m256i *)(d + 16), v_r1_w); + _mm256_store_si256((__m256i *)(d + 32), v_r2_w); + _mm256_store_si256((__m256i *)(d + 48), v_r3_w); + + a += 64; + b += 64; + d += 64; + N -= 64; + } while (N); +} diff --git a/libs/libaom/src/av1/encoder/x86/wedge_utils_sse2.c b/libs/libaom/src/av1/encoder/x86/wedge_utils_sse2.c new file mode 100644 index 000000000..f3f4b8a75 --- /dev/null +++ b/libs/libaom/src/av1/encoder/x86/wedge_utils_sse2.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <emmintrin.h> + +#include "aom_dsp/x86/synonyms.h" + +#include "aom/aom_integer.h" + +#include "av1/common/reconinter.h" + +#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) + +/** + * See av1_wedge_sse_from_residuals_c + */ +uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, + const uint8_t *m, int N) { + int n = -N; + int n8 = n + 8; + + uint64_t csse; + + const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE); + const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff); + + __m128i v_acc0_q = _mm_setzero_si128(); + + assert(N % 64 == 0); + + r1 += N; + d += N; + m += N; + + do { + const __m128i v_r0_w = xx_load_128(r1 + n); + const __m128i v_r1_w = xx_load_128(r1 + n8); + const __m128i v_d0_w = xx_load_128(d + n); + const __m128i v_d1_w = xx_load_128(d + n8); + const __m128i v_m01_b = xx_load_128(m + n); + + const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w); + const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w); + const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w); + const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w); + const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); + + const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w); + const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w); + const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w); + const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w); + + const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w); + const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w); + const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w); + const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w); + + const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d); + const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d); + + const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w); + const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w); + + const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q), + _mm_srli_epi64(v_sq0_d, 32)); + const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q), + _mm_srli_epi64(v_sq1_d, 32)); + + v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q); + v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q); + + n8 += 16; + n += 16; + } while (n); + + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); + +#if ARCH_X86_64 + csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q); +#else + xx_storel_64(&csse, v_acc0_q); +#endif + + return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); +} + +/** + * See av1_wedge_sign_from_residuals_c + */ +int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + int64_t acc; + + __m128i v_sign_d; + __m128i v_acc0_d = _mm_setzero_si128(); + __m128i v_acc1_d = _mm_setzero_si128(); + __m128i v_acc_q; + + // Input size limited to 8192 by the use of 32 bit accumulators and m + // being between [0, 64]. Overflow might happen at larger sizes, + // though it is practically impossible on real video input.
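+ // Scalar sketch of the loop below (reference only; same mask-weighted + // sum as the C version): + //   int64_t acc = 0; + //   for (int i = 0; i < N; ++i) acc += (int64_t)ds[i] * m[i]; + //   return acc > limit;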
+ assert(N < 8192); + assert(N % 64 == 0); + + do { + const __m128i v_m01_b = xx_load_128(m); + const __m128i v_m23_b = xx_load_128(m + 16); + const __m128i v_m45_b = xx_load_128(m + 32); + const __m128i v_m67_b = xx_load_128(m + 48); + + const __m128i v_d0_w = xx_load_128(ds); + const __m128i v_d1_w = xx_load_128(ds + 8); + const __m128i v_d2_w = xx_load_128(ds + 16); + const __m128i v_d3_w = xx_load_128(ds + 24); + const __m128i v_d4_w = xx_load_128(ds + 32); + const __m128i v_d5_w = xx_load_128(ds + 40); + const __m128i v_d6_w = xx_load_128(ds + 48); + const __m128i v_d7_w = xx_load_128(ds + 56); + + const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); + const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128()); + const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128()); + const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128()); + const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128()); + const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128()); + const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128()); + + const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w); + const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w); + const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w); + const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w); + const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w); + const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w); + const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w); + const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w); + + const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d); + const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d); + const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d); + const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d); + + const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d); + const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d); + + v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d); + v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d); + + ds += 64; + m += 64; + + N -= 64; + } while (N); + + v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128()); + v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d), + _mm_unpackhi_epi32(v_acc0_d, v_sign_d)); + + v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128()); + v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d), + _mm_unpackhi_epi32(v_acc1_d, v_sign_d)); + + v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + +#if ARCH_X86_64 + acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q); +#else + xx_storel_64(&acc, v_acc_q); +#endif + + return acc > limit; +} + +// Negate under mask +static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) { + return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w); +} + +/** + * av1_wedge_compute_delta_squares_c + */ +void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a, + const int16_t *b, int N) { + const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0, + (short)0xffff, 0, (short)0xffff, 0); + + assert(N % 64 == 0); + + do { + const __m128i v_a0_w = xx_load_128(a); + const __m128i v_b0_w = xx_load_128(b); + const __m128i v_a1_w = xx_load_128(a + 8); + const __m128i v_b1_w = xx_load_128(b + 8); + const __m128i v_a2_w = xx_load_128(a + 16); + const __m128i v_b2_w = xx_load_128(b + 16); + const __m128i v_a3_w = xx_load_128(a + 24); + const __m128i 
v_b3_w = xx_load_128(b + 24); + + const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w); + const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w); + const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w); + const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w); + const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w); + const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w); + const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w); + const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w); + + // Negate top word of pairs + const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w); + const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w); + const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w); + const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w); + const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w); + const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w); + const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w); + const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w); + + const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w); + const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w); + const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w); + const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w); + const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w); + const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w); + const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w); + const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w); + + const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w); + const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w); + const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w); + const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w); + + xx_store_128(d, v_r0_w); + xx_store_128(d + 8, v_r1_w); + xx_store_128(d + 16, v_r2_w); + xx_store_128(d + 24, v_r3_w); + + a += 32; + b += 32; + d += 32; + N -= 32; + } while (N); +} diff --git a/libs/libaom/src/av1/exports_com b/libs/libaom/src/av1/exports_com new file mode 100644 index 000000000..5c8e0e09d --- /dev/null +++ b/libs/libaom/src/av1/exports_com @@ -0,0 +1,2 @@ +text aom_read_obu_header_and_size +text av1_resize_frame420 diff --git a/libs/libaom/src/av1/exports_dec b/libs/libaom/src/av1/exports_dec new file mode 100644 index 000000000..daabf6766 --- /dev/null +++ b/libs/libaom/src/av1/exports_dec @@ -0,0 +1,3 @@ +data aom_codec_av1_dx_algo +text aom_codec_av1_dx +text av1_add_film_grain diff --git a/libs/libaom/src/av1/exports_enc b/libs/libaom/src/av1/exports_enc new file mode 100644 index 000000000..dc4a9eae7 --- /dev/null +++ b/libs/libaom/src/av1/exports_enc @@ -0,0 +1,2 @@ +data aom_codec_av1_cx_algo +text aom_codec_av1_cx diff --git a/libs/libaom/src/av1/exports_ident b/libs/libaom/src/av1/exports_ident new file mode 100644 index 000000000..b523a679d --- /dev/null +++ b/libs/libaom/src/av1/exports_ident @@ -0,0 +1,2 @@ +text ifd_init +text ifd_inspect diff --git a/libs/libaom/src/av1/exports_test b/libs/libaom/src/av1/exports_test new file mode 100644 index 000000000..dab377575 --- /dev/null +++ b/libs/libaom/src/av1/exports_test @@ -0,0 +1,2 @@ +text av1_get_fwd_txfm_cfg +text av1_rtcd diff --git a/libs/libaom/src/build/.gitattributes b/libs/libaom/src/build/.gitattributes new file mode 100644 index 000000000..03db79bc0 --- /dev/null +++ b/libs/libaom/src/build/.gitattributes @@ -0,0 +1,2 @@ +*-vs8/*.rules -crlf +*-msvs/*.rules -crlf diff --git a/libs/libaom/src/build/cmake/aom_config.c.template 
b/libs/libaom/src/build/cmake/aom_config.c.template new file mode 100644 index 000000000..62f0a10ab --- /dev/null +++ b/libs/libaom/src/build/cmake/aom_config.c.template @@ -0,0 +1,13 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "aom/aom_codec.h" +static const char* const cfg = "${AOM_CMAKE_CONFIG}"; +const char *aom_codec_build_config(void) {return cfg;} diff --git a/libs/libaom/src/build/cmake/aom_config_defaults.cmake b/libs/libaom/src/build/cmake/aom_config_defaults.cmake new file mode 100644 index 000000000..f9e70eb24 --- /dev/null +++ b/libs/libaom/src/build/cmake/aom_config_defaults.cmake @@ -0,0 +1,193 @@ +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. + +include("${AOM_ROOT}/build/cmake/util.cmake") + +# This file sets default values for libaom configuration variables. All libaom +# config variables are added to the CMake variable cache via the macros provided +# in util.cmake. + +# +# The variables in this section of the file are detected at configuration time, +# but can be overridden via the use of CONFIG_* and ENABLE_* values also defined +# in this file. +# + +set_aom_detect_var(INLINE "" "Sets INLINE value for current target.") + +# CPUs. +set_aom_detect_var(ARCH_ARM 0 "Enables ARM architecture.") +set_aom_detect_var(ARCH_MIPS 0 "Enables MIPS architecture.") +set_aom_detect_var(ARCH_PPC 0 "Enables PPC architecture.") +set_aom_detect_var(ARCH_X86 0 "Enables X86 architecture.") +set_aom_detect_var(ARCH_X86_64 0 "Enables X86_64 architecture.") + +# ARM feature flags. +set_aom_detect_var(HAVE_NEON 0 "Enables NEON intrinsics optimizations.") + +# MIPS feature flags. +set_aom_detect_var(HAVE_DSPR2 0 "Enables DSPR2 optimizations.") +set_aom_detect_var(HAVE_MIPS32 0 "Enables MIPS32 optimizations.") +set_aom_detect_var(HAVE_MIPS64 0 "Enables MIPS64 optimizations. ") +set_aom_detect_var(HAVE_MSA 0 "Enables MSA optimizations.") + +# PPC feature flags. +set_aom_detect_var(HAVE_VSX 0 "Enables VSX optimizations.") + +# x86/x86_64 feature flags. +set_aom_detect_var(HAVE_AVX 0 "Enables AVX optimizations.") +set_aom_detect_var(HAVE_AVX2 0 "Enables AVX2 optimizations.") +set_aom_detect_var(HAVE_MMX 0 "Enables MMX optimizations. 
") +set_aom_detect_var(HAVE_SSE 0 "Enables SSE optimizations.") +set_aom_detect_var(HAVE_SSE2 0 "Enables SSE2 optimizations.") +set_aom_detect_var(HAVE_SSE3 0 "Enables SSE3 optimizations.") +set_aom_detect_var(HAVE_SSE4_1 0 "Enables SSE 4.1 optimizations.") +set_aom_detect_var(HAVE_SSE4_2 0 "Enables SSE 4.2 optimizations.") +set_aom_detect_var(HAVE_SSSE3 0 "Enables SSSE3 optimizations.") + +# Flags describing the build environment. +set_aom_detect_var(HAVE_FEXCEPT 0 + "Internal flag, GNU fenv.h present for target.") +set_aom_detect_var(HAVE_PTHREAD_H 0 "Internal flag, target pthread support.") +set_aom_detect_var(HAVE_UNISTD_H 0 + "Internal flag, unistd.h present for target.") +set_aom_detect_var(HAVE_WXWIDGETS 0 "WxWidgets present.") + +# +# Variables in this section can be set from the CMake command line or from +# within the CMake GUI. The variables control libaom features. +# + +# Build configuration flags. +set_aom_config_var(AOM_RTCD_FLAGS "" + "Arguments to pass to rtcd.pl. Separate with ';'") +set_aom_config_var(CONFIG_AV1_DECODER 1 "Enable AV1 decoder.") +set_aom_config_var(CONFIG_AV1_ENCODER 1 "Enable AV1 encoder.") +set_aom_config_var(CONFIG_BIG_ENDIAN 0 "Internal flag.") +set_aom_config_var(CONFIG_GCC 0 "Building with GCC (detect).") +set_aom_config_var(CONFIG_GCOV 0 "Enable gcov support.") +set_aom_config_var(CONFIG_GPROF 0 "Enable gprof support.") +set_aom_config_var(CONFIG_LIBYUV 1 "Enables libyuv scaling/conversion support.") + +set_aom_config_var(CONFIG_MULTITHREAD 1 "Multithread support.") +set_aom_config_var(CONFIG_OS_SUPPORT 0 "Internal flag.") +set_aom_config_var(CONFIG_PIC 0 "Build with PIC enabled.") +set_aom_config_var(CONFIG_RUNTIME_CPU_DETECT 1 "Runtime CPU detection support.") +set_aom_config_var(CONFIG_SHARED 0 "Build shared libs.") +set_aom_config_var(CONFIG_WEBM_IO 1 "Enables WebM support.") + +# Debugging flags. +set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0 "Bitstream debugging flag.") +set_aom_config_var(CONFIG_DEBUG 0 "Debug build flag.") +set_aom_config_var(CONFIG_MISMATCH_DEBUG 0 "Mismatch debugging flag.") + +# AV1 feature flags. +set_aom_config_var(CONFIG_ACCOUNTING 0 "Enables bit accounting.") +set_aom_config_var(CONFIG_ANALYZER 0 "Enables bit stream analyzer.") +set_aom_config_var(CONFIG_COEFFICIENT_RANGE_CHECKING 0 + "Coefficient range check.") +set_aom_config_var(CONFIG_DENOISE 1 + "Denoise/noise modeling support in encoder.") +set_aom_config_var(CONFIG_INSPECTION 0 "Enables bitstream inspection.") +set_aom_config_var(CONFIG_INTERNAL_STATS 0 "Enables internal encoder stats.") +set_aom_config_var(FORCE_HIGHBITDEPTH_DECODING 0 + "Force high bitdepth decoding pipeline on 8-bit input.") +mark_as_advanced(FORCE_HIGHBITDEPTH_DECODING) +set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2 + "Max profile to support decoding.") +set_aom_config_var(CONFIG_NORMAL_TILE_MODE 0 "Only enables normal tile mode.") +set_aom_config_var(CONFIG_SIZE_LIMIT 0 "Limit max decode width/height.") +set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 "Spatial resampling.") +set_aom_config_var(DECODE_HEIGHT_LIMIT 0 "Set limit for decode height.") +set_aom_config_var(DECODE_WIDTH_LIMIT 0 "Set limit for decode width.") +set_aom_config_var(CONFIG_TUNE_VMAF 0 "Enable encoding tuning for VMAF.") + +# AV1 experiment flags. 
+set_aom_config_var(CONFIG_SPEED_STATS 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_DIST_8X8 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_ENTROPY_STATS 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_RD_DEBUG 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_SHARP_SETTINGS 0 "AV1 experiment flag.") +set_aom_config_var(CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1 + "Disable full_pixel_motion_search_based_split on BLOCK_8X8.") +set_aom_config_var(CONFIG_COLLECT_PARTITION_STATS 0 + "Collect stats on partition decisions.") +set_aom_config_var(CONFIG_COLLECT_COMPONENT_TIMING 0 + "Collect encoding component timing information.") +set_aom_config_var(CONFIG_LPF_MASK 0 + "Enable the use of loop filter bitmasks for optimizations.") +set_aom_config_var(CONFIG_HTB_TRELLIS 0 + "Enable the use of hash table for trellis optimizations.") +set_aom_config_var(CONFIG_REALTIME_ONLY 0 + "Build for RTC-only to reduce binary size.") +set_aom_config_var(CONFIG_AV1_HIGHBITDEPTH 1 + "Build with high bitdepth support.") +set_aom_config_var(CONFIG_NN_V2 0 "Fully-connected neural nets ver.2.") +set_aom_config_var(CONFIG_SUPERRES_IN_RECODE 1 + "Enable encoding both full-res and superres in recode loop " + "when SUPERRES_AUTO mode is used.") +# +# Variables in this section control optional features of the build system. +# +set_aom_option_var(ENABLE_CCACHE "Enable ccache support." OFF) +set_aom_option_var(ENABLE_DECODE_PERF_TESTS "Enables decoder performance tests" + OFF) +set_aom_option_var(ENABLE_DISTCC "Enable distcc support." OFF) +set_aom_option_var(ENABLE_DOCS + "Enable documentation generation (doxygen required)." ON) +set_aom_option_var(ENABLE_ENCODE_PERF_TESTS "Enables encoder performance tests" + OFF) +set_aom_option_var(ENABLE_EXAMPLES "Enables build of example code." ON) +set_aom_option_var(ENABLE_GOMA "Enable goma support." OFF) +set_aom_option_var( + ENABLE_IDE_TEST_HOSTING + "Enables running tests within IDEs like Visual Studio and Xcode." OFF) +set_aom_option_var(ENABLE_NASM "Use nasm instead of yasm for x86 assembly." OFF) +set_aom_option_var(ENABLE_TESTDATA "Enables unit test data download targets." + ON) +set_aom_option_var(ENABLE_TESTS "Enables unit tests." ON) +set_aom_option_var(ENABLE_TOOLS "Enable applications in tools sub directory." + ON) +set_aom_option_var(ENABLE_WERROR "Converts warnings to errors at compile time." + OFF) + +# ARM assembly/intrinsics flags. +set_aom_option_var(ENABLE_NEON "Enables NEON optimizations on ARM targets." ON) + +# MIPS assembly/intrinsics flags. +set_aom_option_var(ENABLE_DSPR2 "Enables DSPR2 optimizations on MIPS targets." + OFF) +set_aom_option_var(ENABLE_MSA "Enables MSA optimizations on MIPS targets." OFF) + +# VSX intrinsics flags. +set_aom_option_var(ENABLE_VSX "Enables VSX optimizations on PowerPC targets." + ON) + +# x86/x86_64 assembly/intrinsics flags. +set_aom_option_var(ENABLE_MMX "Enables MMX optimizations on x86/x86_64 targets." + ON) +set_aom_option_var(ENABLE_SSE "Enables SSE optimizations on x86/x86_64 targets." + ON) +set_aom_option_var(ENABLE_SSE2 + "Enables SSE2 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_SSE3 + "Enables SSE3 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_SSSE3 + "Enables SSSE3 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_SSE4_1 + "Enables SSE4_1 optimizations on x86/x86_64 targets."
ON) +set_aom_option_var(ENABLE_SSE4_2 + "Enables SSE4_2 optimizations on x86/x86_64 targets." ON) +set_aom_option_var(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets." + ON) +set_aom_option_var(ENABLE_AVX2 + "Enables AVX2 optimizations on x86/x86_64 targets." ON) diff --git a/libs/libaom/src/build/cmake/aom_configure.cmake b/libs/libaom/src/build/cmake/aom_configure.cmake new file mode 100644 index 000000000..224a46587 --- /dev/null +++ b/libs/libaom/src/build/cmake/aom_configure.cmake @@ -0,0 +1,399 @@ +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ +set(AOM_BUILD_CMAKE_AOM_CONFIGURE_CMAKE_ 1) + +include(FindGit) +include(FindPerl) +include(FindThreads) + +include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake") +include("${AOM_ROOT}/build/cmake/aom_experiment_deps.cmake") +include("${AOM_ROOT}/build/cmake/aom_optimization.cmake") +include("${AOM_ROOT}/build/cmake/compiler_flags.cmake") +include("${AOM_ROOT}/build/cmake/compiler_tests.cmake") +include("${AOM_ROOT}/build/cmake/util.cmake") + +if(DEFINED CONFIG_LOWBITDEPTH) + message(WARNING "CONFIG_LOWBITDEPTH has been removed. \ + Use -DFORCE_HIGHBITDEPTH_DECODING=1 instead of -DCONFIG_LOWBITDEPTH=0 \ + and -DFORCE_HIGHBITDEPTH_DECODING=0 instead of -DCONFIG_LOWBITDEPTH=1.") + if(NOT CONFIG_LOWBITDEPTH) + set(FORCE_HIGHBITDEPTH_DECODING + 1 + CACHE STRING "${cmake_cmdline_helpstring}" FORCE) + endif() +endif() + +if(FORCE_HIGHBITDEPTH_DECODING AND NOT CONFIG_AV1_HIGHBITDEPTH) + change_config_and_warn(CONFIG_AV1_HIGHBITDEPTH 1 + "FORCE_HIGHBITDEPTH_DECODING") +endif() + +# Generate the user config settings. +list(APPEND aom_build_vars ${AOM_CONFIG_VARS} ${AOM_OPTION_VARS}) +foreach(cache_var ${aom_build_vars}) + get_property(cache_var_helpstring CACHE ${cache_var} PROPERTY HELPSTRING) + if("${cache_var_helpstring}" STREQUAL "${cmake_cmdline_helpstring}") + set(AOM_CMAKE_CONFIG "${AOM_CMAKE_CONFIG} -D${cache_var}=${${cache_var}}") + endif() +endforeach() +string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG) + +# Detect target CPU. 
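+# (For example, "aarch64" maps to "arm64", while "amd64"/"x86_64" map to +# "x86" or "x86_64" depending on pointer size; unrecognized processors fall +# back to the "generic" target.)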
+if(NOT AOM_TARGET_CPU) + string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" cpu_lowercase) + if("${cpu_lowercase}" STREQUAL "amd64" + OR "${cpu_lowercase}" STREQUAL "x86_64") + if(${CMAKE_SIZEOF_VOID_P} EQUAL 4) + set(AOM_TARGET_CPU "x86") + elseif(${CMAKE_SIZEOF_VOID_P} EQUAL 8) + set(AOM_TARGET_CPU "x86_64") + else() + message( + FATAL_ERROR "--- Unexpected pointer size (${CMAKE_SIZEOF_VOID_P}) for\n" + " CMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}\n" + " CMAKE_SYSTEM_PROCESSOR=${CMAKE_SYSTEM_PROCESSOR}\n" + " CMAKE_GENERATOR=${CMAKE_GENERATOR}\n") + endif() + elseif("${cpu_lowercase}" STREQUAL "i386" + OR "${cpu_lowercase}" STREQUAL "x86") + set(AOM_TARGET_CPU "x86") + elseif("${cpu_lowercase}" MATCHES "^arm" + OR "${cpu_lowercase}" MATCHES "^mips") + set(AOM_TARGET_CPU "${cpu_lowercase}") + elseif("${cpu_lowercase}" MATCHES "aarch64") + set(AOM_TARGET_CPU "arm64") + elseif("${cpu_lowercase}" MATCHES "^ppc") + set(AOM_TARGET_CPU "ppc") + else() + message(WARNING "The architecture ${CMAKE_SYSTEM_PROCESSOR} is not " + "supported, falling back to the generic target") + set(AOM_TARGET_CPU "generic") + endif() +endif() + +if(CMAKE_TOOLCHAIN_FILE) # Add toolchain file to config string. + if(IS_ABSOLUTE "${CMAKE_TOOLCHAIN_FILE}") + file(RELATIVE_PATH toolchain_path "${AOM_CONFIG_DIR}" + "${CMAKE_TOOLCHAIN_FILE}") + else() + set(toolchain_path "${CMAKE_TOOLCHAIN_FILE}") + endif() + set(toolchain_string "-DCMAKE_TOOLCHAIN_FILE=\\\"${toolchain_path}\\\"") + set(AOM_CMAKE_CONFIG "${toolchain_string} ${AOM_CMAKE_CONFIG}") +else() + + # Add detected CPU to the config string. + set(AOM_CMAKE_CONFIG "-DAOM_TARGET_CPU=${AOM_TARGET_CPU} ${AOM_CMAKE_CONFIG}") +endif() +set(AOM_CMAKE_CONFIG "-G \\\"${CMAKE_GENERATOR}\\\" ${AOM_CMAKE_CONFIG}") +file(RELATIVE_PATH source_path "${AOM_CONFIG_DIR}" "${AOM_ROOT}") +set(AOM_CMAKE_CONFIG "cmake ${source_path} ${AOM_CMAKE_CONFIG}") +string(STRIP "${AOM_CMAKE_CONFIG}" AOM_CMAKE_CONFIG) + +message("--- aom_configure: Detected CPU: ${AOM_TARGET_CPU}") +set(AOM_TARGET_SYSTEM ${CMAKE_SYSTEM_NAME}) + +if("${CMAKE_BUILD_TYPE}" MATCHES "Deb") + set(CONFIG_DEBUG 1) +endif() + +if(BUILD_SHARED_LIBS) + set(CONFIG_PIC 1) + set(CONFIG_SHARED 1) +endif() + +if(NOT MSVC) + if(CONFIG_PIC) + + # TODO(tomfinegan): clang needs -pie in CMAKE_EXE_LINKER_FLAGS for this to + # work. + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + if("${AOM_TARGET_SYSTEM}" STREQUAL "Linux" + AND "${AOM_TARGET_CPU}" MATCHES "^armv[78]") + set(AOM_AS_FLAGS ${AOM_AS_FLAGS} --defsym PIC=1) + else() + set(AOM_AS_FLAGS ${AOM_AS_FLAGS} -DPIC) + endif() + endif() +endif() + +if("${AOM_TARGET_CPU}" STREQUAL "x86" OR "${AOM_TARGET_CPU}" STREQUAL "x86_64") + find_program(AS_EXECUTABLE yasm $ENV{YASM_PATH}) + if(NOT AS_EXECUTABLE OR ENABLE_NASM) + unset(AS_EXECUTABLE CACHE) + find_program(AS_EXECUTABLE nasm $ENV{NASM_PATH}) + if(AS_EXECUTABLE) + test_nasm() + endif() + endif() + + if(NOT AS_EXECUTABLE) + message( + FATAL_ERROR + "Unable to find assembler. Install 'yasm' or 'nasm.' 
" + "To build without optimizations, add -DAOM_TARGET_CPU=generic to " + "your cmake command line.") + endif() + get_asm_obj_format("objformat") + set(AOM_AS_FLAGS -f ${objformat} ${AOM_AS_FLAGS}) + string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS) +elseif("${AOM_TARGET_CPU}" MATCHES "arm") + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + set(AS_EXECUTABLE as) + set(AOM_AS_FLAGS -arch ${AOM_TARGET_CPU} -isysroot ${CMAKE_OSX_SYSROOT}) + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + if(NOT AS_EXECUTABLE) + set(AS_EXECUTABLE ${CMAKE_C_COMPILER} -c -mimplicit-it=always) + endif() + else() + if(NOT AS_EXECUTABLE) + set(AS_EXECUTABLE as) + endif() + endif() + find_program(as_executable_found ${AS_EXECUTABLE}) + if(NOT as_executable_found) + message( + FATAL_ERROR + "Unable to find assembler and optimizations are enabled." + "Searched for ${AS_EXECUTABLE}. Install it, add it to your path, or " + "set the assembler directly by adding -DAS_EXECUTABLE= " + "to your CMake command line." + "To build without optimizations, add -DAOM_TARGET_CPU=generic to your " + "cmake command line.") + endif() + string(STRIP "${AOM_AS_FLAGS}" AOM_AS_FLAGS) +endif() + +if(CONFIG_ANALYZER) + include(FindwxWidgets) + find_package(wxWidgets REQUIRED adv base core) + include(${wxWidgets_USE_FILE}) +endif() + +if(NOT MSVC AND CMAKE_C_COMPILER_ID MATCHES "GNU\|Clang") + set(CONFIG_GCC 1) +endif() + +if(CONFIG_GCOV) + message("--- Testing for CONFIG_GCOV support.") + require_linker_flag("-fprofile-arcs -ftest-coverage") + require_compiler_flag("-fprofile-arcs -ftest-coverage" YES) +endif() + +if(CONFIG_GPROF) + message("--- Testing for CONFIG_GPROF support.") + require_compiler_flag("-pg" YES) +endif() + +if("${AOM_TARGET_SYSTEM}" MATCHES "Darwin\|Linux\|Windows\|Android") + set(CONFIG_OS_SUPPORT 1) +endif() + +# The default _WIN32_WINNT value in MinGW is 0x0502 (Windows XP with SP2). Set +# it to 0x0601 (Windows 7). +if("${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + add_compiler_flag_if_supported("-D_WIN32_WINNT=0x0601") +endif() + +# +# Fix CONFIG_* dependencies. This must be done before including cpu.cmake to +# ensure RTCD_CONFIG_* are properly set. +fix_experiment_configs() + +# Test compiler support. +aom_get_inline("INLINE") + +# Don't just check for pthread.h, but use the result of the full pthreads +# including a linking check in FindThreads above. 
+set(HAVE_PTHREAD_H ${CMAKE_USE_PTHREADS_INIT}) +aom_check_source_compiles("unistd_check" "#include <unistd.h>" HAVE_UNISTD_H) + +if(NOT MSVC) + aom_push_var(CMAKE_REQUIRED_LIBRARIES "m") + aom_check_c_compiles("fenv_check" "#define _GNU_SOURCE + #include <fenv.h> + void unused(void) { + (void)unused; + (void)feenableexcept(FE_DIVBYZERO | FE_INVALID); + }" HAVE_FEXCEPT) + aom_pop_var(CMAKE_REQUIRED_LIBRARIES) +endif() + +include("${AOM_ROOT}/build/cmake/cpu.cmake") + +if(ENABLE_CCACHE) + set_compiler_launcher(ENABLE_CCACHE ccache) +endif() + +if(ENABLE_DISTCC) + set_compiler_launcher(ENABLE_DISTCC distcc) +endif() + +if(ENABLE_GOMA) + set_compiler_launcher(ENABLE_GOMA gomacc) +endif() + +if(NOT CONFIG_AV1_DECODER AND NOT CONFIG_AV1_ENCODER) + message(FATAL_ERROR "Decoder and encoder disabled, nothing to build.") +endif() + +if(DECODE_HEIGHT_LIMIT OR DECODE_WIDTH_LIMIT) + change_config_and_warn(CONFIG_SIZE_LIMIT 1 + "DECODE_HEIGHT_LIMIT and DECODE_WIDTH_LIMIT") +endif() + +if(CONFIG_SIZE_LIMIT) + if(NOT DECODE_HEIGHT_LIMIT OR NOT DECODE_WIDTH_LIMIT) + message(FATAL_ERROR "When setting CONFIG_SIZE_LIMIT, DECODE_HEIGHT_LIMIT " + "and DECODE_WIDTH_LIMIT must be set.") + endif() +endif() + +# Test compiler flags. +if(MSVC) + add_compiler_flag_if_supported("/W3") + + # Disable MSVC warnings that suggest making code non-portable. + add_compiler_flag_if_supported("/wd4996") + if(ENABLE_WERROR) + add_compiler_flag_if_supported("/WX") + endif() +else() + require_c_flag("-std=c99" YES) + require_cxx_flag_nomsvc("-std=c++11" YES) + add_compiler_flag_if_supported("-Wall") + add_compiler_flag_if_supported("-Wdisabled-optimization") + add_compiler_flag_if_supported("-Wextra") + add_compiler_flag_if_supported("-Wfloat-conversion") + add_c_flag_if_supported("-Wimplicit-function-declaration") + add_compiler_flag_if_supported("-Wlogical-op") + add_compiler_flag_if_supported("-Wpointer-arith") + add_compiler_flag_if_supported("-Wshorten-64-to-32") + add_compiler_flag_if_supported("-Wsign-compare") + add_compiler_flag_if_supported("-Wstring-conversion") + add_compiler_flag_if_supported("-Wtype-limits") + add_compiler_flag_if_supported("-Wuninitialized") + add_compiler_flag_if_supported("-Wunused") + add_compiler_flag_if_supported("-Wvla") + + if(CMAKE_C_COMPILER_ID MATCHES "GNU" + AND "${SANITIZE}" MATCHES "address|undefined") + + # This combination has more stack overhead, so we account for it by + # providing higher stack limit than usual. + add_c_flag_if_supported("-Wstack-usage=170000") + add_cxx_flag_if_supported("-Wstack-usage=270000") + elseif(CONFIG_RD_DEBUG) # Another case where higher stack usage is expected. + add_c_flag_if_supported("-Wstack-usage=117000") + add_cxx_flag_if_supported("-Wstack-usage=240000") + else() + add_c_flag_if_supported("-Wstack-usage=100000") + add_cxx_flag_if_supported("-Wstack-usage=240000") + endif() + + # Add -Wshadow only for C files to avoid massive gtest warning spam. + add_c_flag_if_supported("-Wshadow") + + # Add -Wundef only for C files to avoid massive gtest warning spam.
+ add_c_flag_if_supported("-Wundef") + + # Quiet gcc 6 vs 7 abi warnings: + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 + if("${AOM_TARGET_CPU}" MATCHES "arm") + add_cxx_flag_if_supported("-Wno-psabi") + endif() + + if(ENABLE_WERROR) + add_compiler_flag_if_supported("-Werror") + endif() + + if("${CMAKE_BUILD_TYPE}" MATCHES "Rel") + add_compiler_flag_if_supported("-U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0") + endif() + add_compiler_flag_if_supported("-D_LARGEFILE_SOURCE") + add_compiler_flag_if_supported("-D_FILE_OFFSET_BITS=64") +endif() + +set(AOM_LIB_LINK_TYPE PUBLIC) +if(EMSCRIPTEN) + + # Avoid CMake generation time errors resulting from collisions with the form + # of target_link_libraries() used by Emscripten.cmake. + unset(AOM_LIB_LINK_TYPE) +endif() + +# Generate aom_config templates. +set(aom_config_asm_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake") +set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake") +execute_process( + COMMAND ${CMAKE_COMMAND} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} -P + "${AOM_ROOT}/build/cmake/generate_aom_config_templates.cmake") + +# Generate aom_config.{asm,h}. +configure_file("${aom_config_asm_template}" + "${AOM_CONFIG_DIR}/config/aom_config.asm") +configure_file("${aom_config_h_template}" + "${AOM_CONFIG_DIR}/config/aom_config.h") + +# Read the current git hash. +find_package(Git) +if(NOT GIT_FOUND) + message("--- Git missing, version will be read from CHANGELOG.") +endif() + +configure_file("${AOM_ROOT}/build/cmake/aom_config.c.template" + "${AOM_CONFIG_DIR}/config/aom_config.c") + +# Find Perl and generate the RTCD sources. +find_package(Perl) +if(NOT PERL_FOUND) + message(FATAL_ERROR "Perl is required to build libaom.") +endif() + +set(AOM_RTCD_CONFIG_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd_defs.pl" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.pl" + "${AOM_ROOT}/av1/common/av1_rtcd_defs.pl") +set(AOM_RTCD_HEADER_FILE_LIST "${AOM_CONFIG_DIR}/config/aom_dsp_rtcd.h" + "${AOM_CONFIG_DIR}/config/aom_scale_rtcd.h" + "${AOM_CONFIG_DIR}/config/av1_rtcd.h") +set(AOM_RTCD_SOURCE_FILE_LIST "${AOM_ROOT}/aom_dsp/aom_dsp_rtcd.c" + "${AOM_ROOT}/aom_scale/aom_scale_rtcd.c" + "${AOM_ROOT}/av1/common/av1_rtcd.c") +set(AOM_RTCD_SYMBOL_LIST aom_dsp_rtcd aom_scale_rtcd av1_rtcd) +list(LENGTH AOM_RTCD_SYMBOL_LIST AOM_RTCD_CUSTOM_COMMAND_COUNT) +math(EXPR AOM_RTCD_CUSTOM_COMMAND_COUNT "${AOM_RTCD_CUSTOM_COMMAND_COUNT} - 1") + +foreach(NUM RANGE ${AOM_RTCD_CUSTOM_COMMAND_COUNT}) + list(GET AOM_RTCD_CONFIG_FILE_LIST ${NUM} AOM_RTCD_CONFIG_FILE) + list(GET AOM_RTCD_HEADER_FILE_LIST ${NUM} AOM_RTCD_HEADER_FILE) + list(GET AOM_RTCD_SOURCE_FILE_LIST ${NUM} AOM_RTCD_SOURCE_FILE) + list(GET AOM_RTCD_SYMBOL_LIST ${NUM} AOM_RTCD_SYMBOL) + execute_process( + COMMAND + ${PERL_EXECUTABLE} "${AOM_ROOT}/build/cmake/rtcd.pl" + --arch=${AOM_TARGET_CPU} + --sym=${AOM_RTCD_SYMBOL} ${AOM_RTCD_FLAGS} + --config=${AOM_CONFIG_DIR}/config/aom_config.h ${AOM_RTCD_CONFIG_FILE} + OUTPUT_FILE ${AOM_RTCD_HEADER_FILE}) +endforeach() + +# Generate aom_version.h. 
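+# version.cmake prefers the git hash when GIT_EXECUTABLE is available and +# falls back to the version recorded in CHANGELOG otherwise (see the +# warning above).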
+execute_process(COMMAND ${CMAKE_COMMAND} + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} + -DAOM_ROOT=${AOM_ROOT} + -DGIT_EXECUTABLE=${GIT_EXECUTABLE} + -DPERL_EXECUTABLE=${PERL_EXECUTABLE} -P + "${AOM_ROOT}/build/cmake/version.cmake") diff --git a/libs/libaom/src/build/cmake/aom_experiment_deps.cmake b/libs/libaom/src/build/cmake/aom_experiment_deps.cmake new file mode 100644 index 000000000..2e3615791 --- /dev/null +++ b/libs/libaom/src/build/cmake/aom_experiment_deps.cmake @@ -0,0 +1,28 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_ +set(AOM_BUILD_CMAKE_AOM_EXPERIMENT_DEPS_CMAKE_ 1) + +# Adjusts CONFIG_* CMake variables to address conflicts between active AV1 +# experiments. +macro(fix_experiment_configs) + + if(CONFIG_ANALYZER) + change_config_and_warn(CONFIG_INSPECTION 1 CONFIG_ANALYZER) + endif() + + if(CONFIG_DIST_8X8 AND CONFIG_MULTITHREAD) + change_config_and_warn(CONFIG_DIST_8X8 0 CONFIG_MULTITHREAD) + endif() + +endmacro() diff --git a/libs/libaom/src/build/cmake/aom_install.cmake b/libs/libaom/src/build/cmake/aom_install.cmake new file mode 100644 index 000000000..cd40fe424 --- /dev/null +++ b/libs/libaom/src/build/cmake/aom_install.cmake @@ -0,0 +1,96 @@ +# +# Copyright (c) 2018, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom.h" + "${AOM_ROOT}/aom/aom_codec.h" "${AOM_ROOT}/aom/aom_frame_buffer.h" + "${AOM_ROOT}/aom/aom_image.h" "${AOM_ROOT}/aom/aom_integer.h" + "${AOM_ROOT}/aom/aom.h") + +if(CONFIG_AV1_DECODER) + list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aom_decoder.h" + "${AOM_ROOT}/aom/aomdx.h") +endif() + +if(CONFIG_AV1_ENCODER) + list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aomcx.h" + "${AOM_ROOT}/aom/aom_encoder.h") +endif() + +# Generate aom.pc and setup dependencies to ensure it is created when necessary. +# Note: aom.pc generation uses GNUInstallDirs: +# https://cmake.org/cmake/help/latest/module/GNUInstallDirs.html +macro(setup_aom_install_targets) + if(NOT (MSVC OR XCODE)) + include("GNUInstallDirs") + set(AOM_PKG_CONFIG_FILE "${AOM_CONFIG_DIR}/aom.pc") + + # Create a dummy library target for creating aom.pc. + create_dummy_source_file(aom_pc c AOM_PKG_CONFIG_SOURCES) + add_library(aom_pc ${AOM_PKG_CONFIG_SOURCES}) + + # Setup a rule to generate aom.pc. 
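+ # The command re-runs CMake in script mode so pkg_config.cmake sees the + # final install directories and threading configuration at build time.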
+ add_custom_command( + OUTPUT "${AOM_PKG_CONFIG_FILE}" + COMMAND ${CMAKE_COMMAND} ARGS + -DAOM_CONFIG_DIR=${AOM_CONFIG_DIR} -DAOM_ROOT=${AOM_ROOT} + -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} + -DCMAKE_INSTALL_BINDIR=${CMAKE_INSTALL_BINDIR} + -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR} + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR} + -DCMAKE_PROJECT_NAME=${CMAKE_PROJECT_NAME} + -DCONFIG_MULTITHREAD=${CONFIG_MULTITHREAD} + -DHAVE_PTHREAD_H=${HAVE_PTHREAD_H} -P + "${AOM_ROOT}/build/cmake/pkg_config.cmake" + COMMENT "Writing aom.pc" + VERBATIM) + + # Explicitly add a dependency on the pkg-config file to ensure it's built. + get_property(aom_pc_sources TARGET aom_pc PROPERTY SOURCES) + set_source_files_properties(${aom_pc_sources} OBJECT_DEPENDS + "${AOM_PKG_CONFIG_FILE}") + + # Our pkg-config file carries version information: add a dependency on the + # version rule. + add_dependencies(aom_pc aom_version) + + if(CONFIG_AV1_DECODER) + if(ENABLE_EXAMPLES) + list(APPEND AOM_INSTALL_BINS aomdec) + endif() + endif() + + if(CONFIG_AV1_ENCODER) + if(ENABLE_EXAMPLES) + list(APPEND AOM_INSTALL_BINS aomenc) + endif() + endif() + + if(BUILD_SHARED_LIBS) + set(AOM_INSTALL_LIBS aom aom_static) + else() + set(AOM_INSTALL_LIBS aom) + endif() + + # Setup the install rules. + install( + FILES ${AOM_INSTALL_INCS} + DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/aom") + install( + FILES "${AOM_PKG_CONFIG_FILE}" + DESTINATION "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/pkgconfig") + install(TARGETS ${AOM_INSTALL_LIBS} DESTINATION + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") + + if(ENABLE_EXAMPLES) + install(TARGETS ${AOM_INSTALL_BINS} DESTINATION + "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}") + endif() + endif() +endmacro() diff --git a/libs/libaom/src/build/cmake/aom_optimization.cmake b/libs/libaom/src/build/cmake/aom_optimization.cmake new file mode 100644 index 000000000..d8b258f1e --- /dev/null +++ b/libs/libaom/src/build/cmake/aom_optimization.cmake @@ -0,0 +1,240 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_ +set(AOM_BUILD_CMAKE_AOM_OPTIMIZATION_CMAKE_ 1) + +include("${AOM_ROOT}/build/cmake/util.cmake") + +# Translate $flag to one which MSVC understands, and write the new flag to the +# variable named by $translated_flag (or unset it, when MSVC needs no flag). +function(get_msvc_intrinsic_flag flag translated_flag) + if("${flag}" STREQUAL "-mavx") + set(${translated_flag} "/arch:AVX" PARENT_SCOPE) + elseif("${flag}" STREQUAL "-mavx2") + set(${translated_flag} "/arch:AVX2" PARENT_SCOPE) + else() + + # MSVC does not need flags for intrinsics flavors other than AVX/AVX2. + unset(${translated_flag} PARENT_SCOPE) + endif() +endfunction() + +# Adds an object library target. Terminates generation if $flag is not supported +# by the current compiler. $flag is the intrinsics flag required by the current +# compiler, and is added to the compile flags for all sources in $sources. 
+# $opt_name is used to name the target. $target_to_update is made dependent upon +# the created target. +# +# Note: this function always updates the aom, and aom_static targets because +# OBJECT libraries have rules that disallow the direct addition of .o files to +# them as dependencies. Static and shared libraries do not have this limitation. +function(add_intrinsics_object_library flag opt_name target_to_update sources) + if("${${sources}}" STREQUAL "") + return() + endif() + set(target_name ${target_to_update}_${opt_name}_intrinsics) + add_library(${target_name} OBJECT ${${sources}}) + + if(MSVC) + get_msvc_intrinsic_flag(${flag} "flag") + endif() + + if("${flag}" STREQUAL "-mavx2") + unset(FLAG_SUPPORTED) + check_c_compiler_flag("-mno-avx256-split-unaligned-load" FLAG_SUPPORTED) + if(${FLAG_SUPPORTED}) + set(flag "${flag} -mno-avx256-split-unaligned-load") + endif() + + unset(FLAG_SUPPORTED) + check_c_compiler_flag("-mno-avx256-split-unaligned-store" FLAG_SUPPORTED) + if(${FLAG_SUPPORTED}) + set(flag "${flag} -mno-avx256-split-unaligned-store") + endif() + endif() + + if(flag) + separate_arguments(flag) + target_compile_options(${target_name} PUBLIC ${flag}) + endif() + + target_sources(aom PRIVATE $<TARGET_OBJECTS:${target_name}>) + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE $<TARGET_OBJECTS:${target_name}>) + endif() + + # Add the new lib target to the global list of aom library targets. + list(APPEND AOM_LIB_TARGETS ${target_name}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction() + +# Adds sources in list named by $sources to $target and adds $flag to the +# compile flags for each source file. +function(add_intrinsics_source_to_target flag target sources) + target_sources(${target} PRIVATE ${${sources}}) + if(MSVC) + get_msvc_intrinsic_flag(${flag} "flag") + endif() + if(flag) + foreach(source ${${sources}}) + set_property(SOURCE ${source} APPEND PROPERTY COMPILE_FLAGS ${flag}) + endforeach() + endif() +endfunction() + +# Writes object format for the current target to the var named by $out_format, +# or terminates the build when the object format for the current target is +# unknown. +function(get_asm_obj_format out_format) + if("${AOM_TARGET_CPU}" STREQUAL "x86_64") + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + set(objformat "macho64") + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + set(objformat "win64") + else() + set(objformat "elf64") + endif() + elseif("${AOM_TARGET_CPU}" STREQUAL "x86") + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + set(objformat "macho32") + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + set(objformat "win32") + else() + set(objformat "elf32") + endif() + else() + message( + FATAL_ERROR "Unknown obj format: ${AOM_TARGET_CPU}-${AOM_TARGET_SYSTEM}") + endif() + + set(${out_format} ${objformat} PARENT_SCOPE) +endfunction() + +# Adds library target named $lib_name for ASM files in variable named by +# $asm_sources. Builds an output directory path from $lib_name. Links $lib_name +# into the aom library target(s). Generates a dummy C file with a dummy function +# to ensure that all cmake generators can determine the linker language, and +# that build tools don't complain that an object exposes no symbols.
+function(add_asm_library lib_name asm_sources) + if("${${asm_sources}}" STREQUAL "") + return() + endif() + set(asm_lib_obj_dir "${AOM_CONFIG_DIR}/asm_objects/${lib_name}") + if(NOT EXISTS "${asm_lib_obj_dir}") + file(MAKE_DIRECTORY "${asm_lib_obj_dir}") + endif() + + # TODO(tomfinegan): If cmake ever allows addition of .o files to OBJECT lib + # targets, make this OBJECT instead of STATIC to hide the target from + # consumers of the AOM cmake build. + add_library(${lib_name} STATIC ${${asm_sources}}) + + foreach(asm_source ${${asm_sources}}) + get_filename_component(asm_source_name "${asm_source}" NAME) + set(asm_object "${asm_lib_obj_dir}/${asm_source_name}.o") + add_custom_command(OUTPUT "${asm_object}" + COMMAND ${AS_EXECUTABLE} ARGS ${AOM_AS_FLAGS} + -I${AOM_ROOT}/ -I${AOM_CONFIG_DIR}/ -o + "${asm_object}" "${asm_source}" + DEPENDS "${asm_source}" + COMMENT "Building ASM object ${asm_object}" + WORKING_DIRECTORY "${AOM_CONFIG_DIR}" + VERBATIM) + target_sources(aom PRIVATE "${asm_object}") + if(BUILD_SHARED_LIBS) + target_sources(aom_static PRIVATE "${asm_object}") + endif() + endforeach() + + # The above created a target containing only ASM sources. Cmake needs help + # here to determine the linker language. Add a dummy C file to force the + # linker language to C. We don't bother with setting the LINKER_LANGUAGE + # property on the library target because not all generators obey it (looking + # at you, xcode generator). + add_dummy_source_file_to_target("${lib_name}" "c") + + # Add the new lib target to the global list of aom library targets. + list(APPEND AOM_LIB_TARGETS ${lib_name}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction() + +# Terminates generation if nasm found in PATH does not meet requirements. +# Currently checks only for presence of required object formats and support for +# the -Ox argument (multipass optimization). +function(test_nasm) + execute_process(COMMAND ${AS_EXECUTABLE} -hf OUTPUT_VARIABLE nasm_helptext) + + if(NOT "${nasm_helptext}" MATCHES "-Ox") + message( + FATAL_ERROR "Unsupported nasm: multipass optimization not supported.") + endif() + + if("${AOM_TARGET_CPU}" STREQUAL "x86") + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + if(NOT "${nasm_helptext}" MATCHES "macho32") + message( + FATAL_ERROR "Unsupported nasm: macho32 object format not supported.") + endif() + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + if(NOT "${nasm_helptext}" MATCHES "win32") + message( + FATAL_ERROR "Unsupported nasm: win32 object format not supported.") + endif() + else() + if(NOT "${nasm_helptext}" MATCHES "elf32") + message( + FATAL_ERROR "Unsupported nasm: elf32 object format not supported.") + endif() + endif() + else() + if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + if(NOT "${nasm_helptext}" MATCHES "macho64") + message( + FATAL_ERROR "Unsupported nasm: macho64 object format not supported.") + endif() + elseif("${AOM_TARGET_SYSTEM}" STREQUAL "MSYS" + OR "${AOM_TARGET_SYSTEM}" STREQUAL "Windows") + if(NOT "${nasm_helptext}" MATCHES "win64") + message( + FATAL_ERROR "Unsupported nasm: win64 object format not supported.") + endif() + else() + if(NOT "${nasm_helptext}" MATCHES "elf64") + message( + FATAL_ERROR "Unsupported nasm: elf64 object format not supported.") + endif() + endif() + endif() +endfunction() + +# Adds build command for generation of rtcd C source files using +# build/cmake/rtcd.pl. 
$config is the input perl file, $output is the output C +# include file, $source is the C source file, and $symbol is used for the symbol +# argument passed to rtcd.pl. +function(add_rtcd_build_step config output source symbol) + add_custom_command( + OUTPUT ${output} + COMMAND ${PERL_EXECUTABLE} ARGS "${AOM_ROOT}/build/cmake/rtcd.pl" + --arch=${AOM_TARGET_CPU} + --sym=${symbol} ${AOM_RTCD_FLAGS} + --config=${AOM_CONFIG_DIR}/config/aom_config.h ${config} > ${output} + DEPENDS ${config} + COMMENT "Generating ${output}" + WORKING_DIRECTORY ${AOM_CONFIG_DIR} + VERBATIM) + set_property(SOURCE ${source} PROPERTY OBJECT_DEPENDS ${output}) + set_property(SOURCE ${output} PROPERTY GENERATED) +endfunction() diff --git a/libs/libaom/src/build/cmake/compiler_flags.cmake b/libs/libaom/src/build/cmake/compiler_flags.cmake new file mode 100644 index 000000000..24484bcad --- /dev/null +++ b/libs/libaom/src/build/cmake/compiler_flags.cmake @@ -0,0 +1,373 @@ +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_ +set(AOM_BUILD_CMAKE_COMPILER_FLAGS_CMAKE_ 1) + +include(CheckCCompilerFlag) +include(CheckCXXCompilerFlag) +include("${AOM_ROOT}/build/cmake/compiler_tests.cmake") + +# Strings used to cache flags. +set(AOM_C_FLAGS) +set(AOM_CXX_FLAGS) +set(AOM_EXE_LINKER_FLAGS) +set(AOM_FAILED_C_FLAGS) +set(AOM_FAILED_CXX_FLAGS) + +# Sets variable named by $out_is_present to YES in the caller's scope when $flag +# is found in the string variable named by $flag_cache. Sets the var to NO +# otherwise. +function(is_flag_present flag_cache flag out_is_present) + string(FIND "${${flag_cache}}" "${flag}" flag_pos) + if(${flag_pos} EQUAL -1) + set(${out_is_present} NO PARENT_SCOPE) + else() + set(${out_is_present} YES PARENT_SCOPE) + endif() +endfunction() + +# Appends $flag to $flags. Ignores scope via use of FORCE with set() call. +function(append_flag flags flag) + string(FIND "${${flags}}" "${flag}" found) + if(${found} EQUAL -1) + set(${flags} "${${flags}} ${flag}" CACHE STRING "" FORCE) + endif() +endfunction() + +# Checks C compiler for support of $c_flag. Adds $c_flag to all +# $CMAKE_C_FLAGS_s stored in AOM_C_CONFIGS when the compile test passes. +# Caches $c_flag in $AOM_C_FLAGS or $AOM_FAILED_C_FLAGS depending on test +# outcome. 
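+#
+# A hypothetical probe: add_c_flag_if_supported("-Wshadow"). On success the
+# flag is appended to every configuration named in AOM_C_CONFIGS; on failure
+# it is recorded in AOM_FAILED_C_FLAGS so the same flag is never retested.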
+function(add_c_flag_if_supported c_flag) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok) + is_flag_present(AOM_FAILED_C_FLAGS "${c_flag}" flag_failed) + if(${flag_ok} OR ${flag_failed}) + return() + endif() + + unset(C_FLAG_SUPPORTED CACHE) + message("Checking C compiler flag support for: " ${c_flag}) + check_c_compiler_flag("${c_flag}" C_FLAG_SUPPORTED) + + if(${C_FLAG_SUPPORTED}) + append_flag(AOM_C_FLAGS "${c_flag}") + foreach(config ${AOM_C_CONFIGS}) + unset(C_FLAG_FOUND) + append_flag("${config}" "${c_flag}") + endforeach() + else() + append_flag(AOM_FAILED_C_FLAGS "${c_flag}") + endif() +endfunction() + +# Checks C++ compiler for support of $cxx_flag. Adds $cxx_flag to all +# $CMAKE_CXX_FLAGS_s stored in AOM_CXX_CONFIGS when the compile test +# passes. Caches $cxx_flag in $AOM_CXX_FLAGS or $AOM_FAILED_CXX_FLAGS depending +# on test outcome. +function(add_cxx_flag_if_supported cxx_flag) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok) + is_flag_present(AOM_FAILED_CXX_FLAGS "${cxx_flag}" flag_failed) + if(${flag_ok} OR ${flag_failed}) + return() + endif() + + unset(CXX_FLAG_SUPPORTED CACHE) + message("Checking C++ compiler flag support for: " ${cxx_flag}) + check_cxx_compiler_flag("${cxx_flag}" CXX_FLAG_SUPPORTED) + + if(${CXX_FLAG_SUPPORTED}) + append_flag(AOM_CXX_FLAGS "${cxx_flag}") + foreach(config ${AOM_CXX_CONFIGS}) + unset(CXX_FLAG_FOUND) + append_flag("${config}" "${cxx_flag}") + endforeach() + else() + append_flag(AOM_FAILED_CXX_FLAGS "${cxx_flag}") + endif() +endfunction() + +# Convenience method for adding a flag to both the C and C++ compiler command +# lines. +function(add_compiler_flag_if_supported flag) + add_c_flag_if_supported(${flag}) + add_cxx_flag_if_supported(${flag}) +endfunction() + +# Checks C compiler for support of $c_flag and terminates generation when +# support is not present. +function(require_c_flag c_flag update_c_flags) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + is_flag_present(AOM_C_FLAGS "${c_flag}" flag_ok) + if(${flag_ok}) + return() + endif() + + if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") + aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}") + endif() + + unset(HAVE_C_FLAG CACHE) + message("Checking C compiler flag support for: " ${c_flag}) + check_c_compiler_flag("${c_flag}" HAVE_C_FLAG) + if(NOT HAVE_C_FLAG) + message( + FATAL_ERROR "${PROJECT_NAME} requires support for C flag: ${c_flag}.") + endif() + + if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") + aom_pop_var(CMAKE_EXE_LINKER_FLAGS) + endif() + + append_flag(AOM_C_FLAGS "${c_flag}") + if(update_c_flags) + foreach(config ${AOM_C_CONFIGS}) + set(${config} "${${config}} ${c_flag}" CACHE STRING "" FORCE) + endforeach() + endif() +endfunction() + +# Checks CXX compiler for support of $cxx_flag and terminates generation when +# support is not present. 
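+#
+# Example (hypothetical): require_cxx_flag("-std=c++11" YES) stops generation
+# on compilers lacking the flag; passing YES also folds the flag into each
+# config named in AOM_CXX_CONFIGS.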
+function(require_cxx_flag cxx_flag update_cxx_flags) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + is_flag_present(AOM_CXX_FLAGS "${cxx_flag}" flag_ok) + if(${flag_ok}) + return() + endif() + + if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") + aom_push_var(CMAKE_EXE_LINKER_FLAGS "${AOM_EXE_LINKER_FLAGS}") + endif() + + unset(HAVE_CXX_FLAG CACHE) + message("Checking C compiler flag support for: " ${cxx_flag}) + check_cxx_compiler_flag("${cxx_flag}" HAVE_CXX_FLAG) + if(NOT HAVE_CXX_FLAG) + message( + FATAL_ERROR "${PROJECT_NAME} requires support for C flag: ${cxx_flag}.") + endif() + + if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") + aom_pop_var(CMAKE_EXE_LINKER_FLAGS) + endif() + + append_flag(AOM_CXX_FLAGS "${cxx_flag}") + if(update_cxx_flags) + foreach(config ${AOM_CXX_CONFIGS}) + set(${config} "${${config}} ${cxx_flag}" CACHE STRING "" FORCE) + endforeach() + endif() +endfunction() + +# Checks for support of $flag by both the C and CXX compilers. Terminates +# generation when support is not present in both compilers. +function(require_compiler_flag flag update_cmake_flags) + require_c_flag(${flag} ${update_cmake_flags}) + require_cxx_flag(${flag} ${update_cmake_flags}) +endfunction() + +# Checks only non-MSVC targets for support of $c_flag and terminates generation +# when support is not present. +function(require_c_flag_nomsvc c_flag update_c_flags) + if(NOT MSVC) + require_c_flag(${c_flag} ${update_c_flags}) + endif() +endfunction() + +# Checks only non-MSVC targets for support of $cxx_flag and terminates +# generation when support is not present. +function(require_cxx_flag_nomsvc cxx_flag update_cxx_flags) + if(NOT MSVC) + require_cxx_flag(${cxx_flag} ${update_cxx_flags}) + endif() +endfunction() + +# Checks only non-MSVC targets for support of $flag by both the C and CXX +# compilers. Terminates generation when support is not present in both +# compilers. +function(require_compiler_flag_nomsvc flag update_cmake_flags) + require_c_flag_nomsvc(${flag} ${update_cmake_flags}) + require_cxx_flag_nomsvc(${flag} ${update_cmake_flags}) +endfunction() + +# Adds $preproc_def to C compiler command line (as -D$preproc_def) if not +# already present. +function(add_c_preproc_definition preproc_def) + set(preproc_def "-D${preproc_def}") + is_flag_present(AOM_C_FLAGS "${preproc_def}" flag_cached) + if(${flag_cached}) + return() + endif() + + foreach(config ${AOM_C_CONFIGS}) + set(${config} "${${config}} ${preproc_def}" CACHE STRING "" FORCE) + endforeach() +endfunction() + +# Adds $preproc_def to CXX compiler command line (as -D$preproc_def) if not +# already present. +function(add_cxx_preproc_definition preproc_def) + set(preproc_def "-D${preproc_def}") + is_flag_present(AOM_CXX_FLAGS "${preproc_def}" flag_cached) + if(${flag_cached}) + return() + endif() + + foreach(config ${AOM_CXX_CONFIGS}) + set(${config} "${${config}} ${preproc_def}" CACHE STRING "" FORCE) + endforeach() +endfunction() + +# Adds $preproc_def to C and CXX compiler command line (as -D$preproc_def) if +# not already present. +function(add_preproc_definition preproc_def) + add_c_preproc_definition(${preproc_def}) + add_cxx_preproc_definition(${preproc_def}) +endfunction() + +# Adds $flag to assembler command line. +function(append_as_flag flag) + is_flag_present(AOM_AS_FLAGS "${flag}" flag_cached) + if(${flag_cached}) + return() + endif() + append_flag(AOM_AS_FLAGS "${flag}") +endfunction() + +# Adds $flag to the C compiler command line. 
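+#
+# No support probe is run here, unlike add_c_flag_if_supported(); e.g. a
+# hypothetical append_c_flag("-fno-strict-aliasing") is added to every
+# config named in AOM_C_CONFIGS unconditionally.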
+function(append_c_flag flag) + is_flag_present(AOM_C_FLAGS "${flag}" flag_cached) + if(${flag_cached}) + return() + endif() + + foreach(config ${AOM_C_CONFIGS}) + append_flag(${config} "${flag}") + endforeach() +endfunction() + +# Adds $flag to the CXX compiler command line. +function(append_cxx_flag flag) + is_flag_present(AOM_CXX_FLAGS "${flag}" flag_cached) + if(${flag_cached}) + return() + endif() + + foreach(config ${AOM_CXX_CONFIGS}) + append_flag(${config} "${flag}") + endforeach() +endfunction() + +# Adds $flag to the C and CXX compiler command lines. +function(append_compiler_flag flag) + append_c_flag(${flag}) + append_cxx_flag(${flag}) +endfunction() + +# Adds $flag to the executable linker command line when not present. +function(append_exe_linker_flag flag) + is_flag_present(AOM_EXE_LINKER_FLAGS "${flag}" flag_cached) + if(${flag_cached}) + return() + endif() + + append_flag(AOM_EXE_LINKER_FLAGS "${flag}") + foreach(config ${AOM_EXE_LINKER_CONFIGS}) + append_flag(${config} "${flag}") + endforeach() +endfunction() + +# Adds $flag to the link flags for $target. +function(append_link_flag_to_target target flag) + unset(target_link_flags) + get_target_property(target_link_flags ${target} LINK_FLAGS) + + if(target_link_flags) + is_flag_present(target_link_flags "${flag}" flag_found) + if(${flag_found}) + return() + endif() + set(target_link_flags "${target_link_flags} ${flag}") + else() + set(target_link_flags "${flag}") + endif() + + set_target_properties(${target} PROPERTIES LINK_FLAGS ${target_link_flags}) +endfunction() + +# Adds $flag to executable linker flags, and makes sure C/CXX builds still work. +function(require_linker_flag flag) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + append_exe_linker_flag(${flag}) + + unset(c_passed) + aom_check_c_compiles("LINKER_FLAG_C_TEST(${flag})" "" c_passed) + unset(cxx_passed) + aom_check_cxx_compiles("LINKER_FLAG_CXX_TEST(${flag})" "" cxx_passed) + + if(NOT c_passed OR NOT cxx_passed) + message(FATAL_ERROR "Linker flag test for ${flag} failed.") + endif() +endfunction() + +# Appends flags in $AOM_EXTRA__FLAGS variables to the flags used at build +# time. +function(set_user_flags) + + # Linker flags are handled first because some C/CXX flags require that a + # linker flag is present at link time. + if(AOM_EXTRA_EXE_LINKER_FLAGS) + is_flag_present(AOM_EXE_LINKER_FLAGS "${AOM_EXTRA_EXE_LINKER_FLAGS}" + extra_present) + if(NOT ${extra_present}) + require_linker_flag("${AOM_EXTRA_EXE_LINKER_FLAGS}") + endif() + endif() + if(AOM_EXTRA_AS_FLAGS) + + # TODO(tomfinegan): assembler flag testing would be a good thing to have. + is_flag_present(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}" extra_present) + if(NOT ${extra_present}) + append_flag(AOM_AS_FLAGS "${AOM_EXTRA_AS_FLAGS}") + endif() + endif() + if(AOM_EXTRA_C_FLAGS) + is_flag_present(AOM_C_FLAGS "${AOM_EXTRA_C_FLAGS}" extra_present) + if(NOT ${extra_present}) + require_c_flag("${AOM_EXTRA_C_FLAGS}" YES) + endif() + endif() + if(AOM_EXTRA_CXX_FLAGS) + is_flag_present(AOM_CXX_FLAGS "${AOM_EXTRA_CXX_FLAGS}" extra_present) + if(NOT ${extra_present}) + require_cxx_flag("${AOM_EXTRA_CXX_FLAGS}" YES) + endif() + endif() +endfunction() diff --git a/libs/libaom/src/build/cmake/compiler_tests.cmake b/libs/libaom/src/build/cmake/compiler_tests.cmake new file mode 100644 index 000000000..040283225 --- /dev/null +++ b/libs/libaom/src/build/cmake/compiler_tests.cmake @@ -0,0 +1,179 @@ +# +# Copyright (c) 2016, Alliance for Open Media. 
All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_ +set(AOM_BUILD_CMAKE_COMPILER_TESTS_CMAKE_ 1) + +include(CheckCSourceCompiles) +include(CheckCXXSourceCompiles) + +# CMake passes command line flags like this: +# +# * $compiler $lang_flags $lang_flags_config ... +# +# To ensure the flags tested here and elsewhere are obeyed a list of active +# build configuration types is built, and flags are applied to the flag strings +# for each configuration currently active for C and CXX builds as determined by +# reading $CMAKE_CONFIGURATION_TYPES and $CMAKE_BUILD_TYPE. When +# $CMAKE_CONFIGURATION_TYPES is non-empty a multi- configuration generator is in +# use: currently this includes MSVC and Xcode. For other generators +# $CMAKE_BUILD_TYPE is used. For both cases AOM__CONFIGS is populated with +# CMake string variable names that contain flags for the currently available +# configuration(s). +unset(AOM_C_CONFIGS) +unset(AOM_CXX_CONFIGS) +list(LENGTH CMAKE_CONFIGURATION_TYPES num_configs) +if(${num_configs} GREATER 0) + foreach(config ${CMAKE_CONFIGURATION_TYPES}) + string(TOUPPER ${config} config) + list(APPEND AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}") + list(APPEND AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}") + list(APPEND AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}") + endforeach() +else() + string(TOUPPER ${CMAKE_BUILD_TYPE} config) + set(AOM_C_CONFIGS "CMAKE_C_FLAGS_${config}") + set(AOM_CXX_CONFIGS "CMAKE_CXX_FLAGS_${config}") + set(AOM_EXE_LINKER_CONFIGS "CMAKE_EXE_LINKER_FLAGS_${config}") +endif() + +# The basic main() function used in all compile tests. +set(AOM_C_MAIN "\nint main(void) { return 0; }") +set(AOM_CXX_MAIN "\nint main() { return 0; }") + +# Strings containing the names of passed and failed tests. +set(AOM_C_PASSED_TESTS) +set(AOM_C_FAILED_TESTS) +set(AOM_CXX_PASSED_TESTS) +set(AOM_CXX_FAILED_TESTS) + +function(aom_push_var var new_value) + set(SAVED_${var} ${${var}} PARENT_SCOPE) + set(${var} "${${var}} ${new_value}" PARENT_SCOPE) +endfunction() + +function(aom_pop_var var) + set(var ${SAVED_${var}} PARENT_SCOPE) + unset(SAVED_${var} PARENT_SCOPE) +endfunction() + +# Confirms $test_source compiles and stores $test_name in one of +# $AOM_C_PASSED_TESTS or $AOM_C_FAILED_TESTS depending on out come. When the +# test passes $result_var is set to 1. When it fails $result_var is unset. The +# test is not run if the test name is found in either of the passed or failed +# test variables. 
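+#
+# A minimal sketch of a call (test name, source, and result variable are
+# hypothetical):
+#   aom_check_c_compiles("int128_test" "__int128 x;" HAVE_INT128)
+# The function appends AOM_C_MAIN to the snippet before compiling it.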
+function(aom_check_c_compiles test_name test_source result_var) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + unset(C_TEST_PASSED CACHE) + unset(C_TEST_FAILED CACHE) + string(FIND "${AOM_C_PASSED_TESTS}" "${test_name}" C_TEST_PASSED) + string(FIND "${AOM_C_FAILED_TESTS}" "${test_name}" C_TEST_FAILED) + if(${C_TEST_PASSED} EQUAL -1 AND ${C_TEST_FAILED} EQUAL -1) + unset(C_TEST_COMPILED CACHE) + message("Running C compiler test: ${test_name}") + check_c_source_compiles("${test_source} ${AOM_C_MAIN}" C_TEST_COMPILED) + set(${result_var} ${C_TEST_COMPILED} PARENT_SCOPE) + + if(C_TEST_COMPILED) + set(AOM_C_PASSED_TESTS + "${AOM_C_PASSED_TESTS} ${test_name}" + CACHE STRING "" FORCE) + else() + set(AOM_C_FAILED_TESTS + "${AOM_C_FAILED_TESTS} ${test_name}" + CACHE STRING "" FORCE) + message("C Compiler test ${test_name} failed.") + endif() + elseif(NOT ${C_TEST_PASSED} EQUAL -1) + set(${result_var} 1 PARENT_SCOPE) + else() # ${C_TEST_FAILED} NOT EQUAL -1 + unset(${result_var} PARENT_SCOPE) + endif() +endfunction() + +# Confirms $test_source compiles and stores $test_name in one of +# $AOM_CXX_PASSED_TESTS or $AOM_CXX_FAILED_TESTS depending on out come. When the +# test passes $result_var is set to 1. When it fails $result_var is unset. The +# test is not run if the test name is found in either of the passed or failed +# test variables. +function(aom_check_cxx_compiles test_name test_source result_var) + if(DEBUG_CMAKE_DISABLE_COMPILER_TESTS) + return() + endif() + + unset(CXX_TEST_PASSED CACHE) + unset(CXX_TEST_FAILED CACHE) + string(FIND "${AOM_CXX_PASSED_TESTS}" "${test_name}" CXX_TEST_PASSED) + string(FIND "${AOM_CXX_FAILED_TESTS}" "${test_name}" CXX_TEST_FAILED) + if(${CXX_TEST_PASSED} EQUAL -1 AND ${CXX_TEST_FAILED} EQUAL -1) + unset(CXX_TEST_COMPILED CACHE) + message("Running CXX compiler test: ${test_name}") + check_cxx_source_compiles("${test_source} ${AOM_CXX_MAIN}" + CXX_TEST_COMPILED) + set(${result_var} ${CXX_TEST_COMPILED} PARENT_SCOPE) + + if(CXX_TEST_COMPILED) + set(AOM_CXX_PASSED_TESTS + "${AOM_CXX_PASSED_TESTS} ${test_name}" + CACHE STRING "" FORCE) + else() + set(AOM_CXX_FAILED_TESTS + "${AOM_CXX_FAILED_TESTS} ${test_name}" + CACHE STRING "" FORCE) + message("CXX Compiler test ${test_name} failed.") + endif() + elseif(NOT ${CXX_TEST_PASSED} EQUAL -1) + set(${result_var} 1 PARENT_SCOPE) + else() # ${CXX_TEST_FAILED} NOT EQUAL -1 + unset(${result_var} PARENT_SCOPE) + endif() +endfunction() + +# Convenience function that confirms $test_source compiles as C and C++. +# $result_var is set to 1 when both tests are successful, and 0 when one or both +# tests fail. Note: This function is intended to be used to write to result +# variables that are expanded via configure_file(). $result_var is set to 1 or 0 +# to allow direct usage of the value in generated source files. +function(aom_check_source_compiles test_name test_source result_var) + unset(C_PASSED) + unset(CXX_PASSED) + aom_check_c_compiles(${test_name} ${test_source} C_PASSED) + aom_check_cxx_compiles(${test_name} ${test_source} CXX_PASSED) + if(C_PASSED AND CXX_PASSED) + set(${result_var} 1 PARENT_SCOPE) + else() + set(${result_var} 0 PARENT_SCOPE) + endif() +endfunction() + +# When inline support is detected for the current compiler the supported +# inlining keyword is written to $result in caller scope. 
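+#
+# A sketch of the expected use: aom_get_inline("INLINE") leaves INLINE set
+# to "inline" or "__inline" in the caller's scope, or unset when neither
+# keyword compiles.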
+function(aom_get_inline result) + aom_check_source_compiles("inline_check_1" + "static inline void function(void) {}" + HAVE_INLINE_1) + if(HAVE_INLINE_1 EQUAL 1) + set(${result} "inline" PARENT_SCOPE) + return() + endif() + + # Check __inline. + aom_check_source_compiles("inline_check_2" + "static __inline void function(void) {}" + HAVE_INLINE_2) + if(HAVE_INLINE_2 EQUAL 1) + set(${result} "__inline" PARENT_SCOPE) + endif() +endfunction() diff --git a/libs/libaom/src/build/cmake/cpu.cmake b/libs/libaom/src/build/cmake/cpu.cmake new file mode 100644 index 000000000..ef2d7552b --- /dev/null +++ b/libs/libaom/src/build/cmake/cpu.cmake @@ -0,0 +1,82 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# + +if("${AOM_TARGET_CPU}" MATCHES "^arm") + set(ARCH_ARM 1) + set(RTCD_ARCH_ARM "yes") + + if(ENABLE_NEON) + set(HAVE_NEON 1) + set(RTCD_HAVE_NEON "yes") + else() + set(HAVE_NEON 0) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon) + endif() +elseif("${AOM_TARGET_CPU}" MATCHES "^mips") + set(ARCH_MIPS 1) + set(RTCD_ARCH_MIPS "yes") + + if("${AOM_TARGET_CPU}" STREQUAL "mips32") + set(HAVE_MIPS32 1) + set(RTCD_HAVE_MIPS32 "yes") + elseif("${AOM_TARGET_CPU}" STREQUAL "mips64") + set(HAVE_MIPS64 1) + set(RTCD_HAVE_MIPS64 "yes") + endif() + + # HAVE_DSPR2 is set by mips toolchain files. + if(ENABLE_DSPR2 AND HAVE_DSPR2) + set(RTCD_HAVE_DSPR2 "yes") + else() + set(HAVE_DSPR2 0) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-dspr2) + endif() + + # HAVE_MSA is set by mips toolchain files. + if(ENABLE_MSA AND HAVE_MSA) + set(RTCD_HAVE_MSA "yes") + else() + set(HAVE_MSA 0) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-msa) + endif() +elseif("${AOM_TARGET_CPU}" MATCHES "ppc") + set(ARCH_PPC 1) + set(RTCD_ARCH_PPC "yes") + + if(ENABLE_VSX) + set(HAVE_VSX 1) + set(RTCD_HAVE_VSX "yes") + else() + set(HAVE_VSX 0) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-vsx) + endif() +elseif("${AOM_TARGET_CPU}" MATCHES "^x86") + if("${AOM_TARGET_CPU}" STREQUAL "x86") + set(ARCH_X86 1) + set(RTCD_ARCH_X86 "yes") + elseif("${AOM_TARGET_CPU}" STREQUAL "x86_64") + set(ARCH_X86_64 1) + set(RTCD_ARCH_X86_64 "yes") + endif() + + set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;AVX;AVX2") + foreach(flavor ${X86_FLAVORS}) + if(ENABLE_${flavor} AND NOT disable_remaining_flavors) + set(HAVE_${flavor} 1) + set(RTCD_HAVE_${flavor} "yes") + else() + set(disable_remaining_flavors 1) + set(HAVE_${flavor} 0) + string(TOLOWER ${flavor} flavor) + set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-${flavor}) + endif() + endforeach() +endif() diff --git a/libs/libaom/src/build/cmake/dist.cmake b/libs/libaom/src/build/cmake/dist.cmake new file mode 100644 index 000000000..5b9fc95d4 --- /dev/null +++ b/libs/libaom/src/build/cmake/dist.cmake @@ -0,0 +1,64 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +cmake_minimum_required(VERSION 3.5) + +# Converts spaces in $in_string to semicolons and writes the output to +# $out_string. In CMake's eyes this converts the input string to a list. +function(listify_string in_string out_string) + string(REPLACE " " ";" ${out_string} ${in_string}) + set(${out_string} "${${out_string}}" PARENT_SCOPE) +endfunction() + +set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_DIST_DIR" "AOM_DIST_INCLUDES" + "AOM_DIST_LIBS" "ENABLE_DOCS") + +foreach(arg ${REQUIRED_ARGS}) + if("${${arg}}" STREQUAL "") + message(FATAL_ERROR "${arg} must not be empty.") + endif() +endforeach() + +if(ENABLE_DOCS) + file(INSTALL "${AOM_CONFIG_DIR}/docs" DESTINATION "${AOM_DIST_DIR}") +endif() + +if(AOM_DIST_EXAMPLES) + listify_string("${AOM_DIST_EXAMPLES}" "AOM_DIST_EXAMPLES") + foreach(example ${AOM_DIST_EXAMPLES}) + if(NOT "${example}" MATCHES "aomdec\|aomenc") + file(INSTALL "${example}" DESTINATION "${AOM_DIST_DIR}/bin/examples") + endif() + endforeach() +endif() + +if(AOM_DIST_TOOLS) + listify_string("${AOM_DIST_TOOLS}" "AOM_DIST_TOOLS") + foreach(tool ${AOM_DIST_TOOLS}) + file(INSTALL "${tool}" DESTINATION "${AOM_DIST_DIR}/bin/tools") + endforeach() +endif() + +if(AOM_DIST_APPS) + listify_string("${AOM_DIST_APPS}" "AOM_DIST_APPS") + foreach(app ${AOM_DIST_APPS}) + file(INSTALL "${app}" DESTINATION "${AOM_DIST_DIR}/bin") + endforeach() +endif() + +listify_string("${AOM_DIST_INCLUDES}" "AOM_DIST_INCLUDES") +foreach(inc ${AOM_DIST_INCLUDES}) + file(INSTALL "${inc}" DESTINATION "${AOM_DIST_DIR}/include/aom") +endforeach() + +listify_string("${AOM_DIST_LIBS}" "AOM_DIST_LIBS") +foreach(lib ${AOM_DIST_LIBS}) + file(INSTALL "${lib}" DESTINATION "${AOM_DIST_DIR}/lib") +endforeach() diff --git a/libs/libaom/src/build/cmake/exports.cmake b/libs/libaom/src/build/cmake/exports.cmake new file mode 100644 index 000000000..fa7842c9d --- /dev/null +++ b/libs/libaom/src/build/cmake/exports.cmake @@ -0,0 +1,74 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_EXPORTS_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_EXPORTS_CMAKE_ +set(AOM_BUILD_CMAKE_EXPORTS_CMAKE_ 1) + +include("${AOM_ROOT}/build/cmake/exports_sources.cmake") + +# Creates the custom target which handles generation of the symbol export lists. 
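+# It must be called after the aom library target exists, because it wires the
+# target in via add_dependencies(aom generate_exports). A sketch of the
+# expected call site (the guard shown is hypothetical):
+#   if(BUILD_SHARED_LIBS)
+#     setup_exports_target()
+#   endif()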
+function(setup_exports_target)
+  if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin")
+    set(symbol_file_ext "syms")
+  elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND MSVC)
+    set(symbol_file_ext "def")
+  else()
+    set(symbol_file_ext "ver")
+  endif()
+
+  set(aom_sym_file "${AOM_CONFIG_DIR}/libaom.${symbol_file_ext}")
+
+  add_custom_target(generate_exports
+                    COMMAND ${CMAKE_COMMAND}
+                            -DAOM_ROOT="${AOM_ROOT}"
+                            -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}"
+                            -DAOM_TARGET_SYSTEM=${AOM_TARGET_SYSTEM}
+                            -DAOM_SYM_FILE="${aom_sym_file}"
+                            -DAOM_MSVC=${MSVC}
+                            -DAOM_XCODE=${XCODE}
+                            -DCONFIG_NAME=$<CONFIG>
+                            -DCONFIG_AV1_DECODER=${CONFIG_AV1_DECODER}
+                            -DCONFIG_AV1_ENCODER=${CONFIG_AV1_ENCODER}
+                            -DCONFIG_INSPECTION=${CONFIG_INSPECTION}
+                            -DENABLE_TESTS=${ENABLE_TESTS}
+                            -P
+                            "${AOM_ROOT}/build/cmake/generate_exports.cmake"
+                    SOURCES ${AOM_EXPORTS_SOURCES}
+                    DEPENDS ${AOM_EXPORTS_SOURCES})
+
+  # Make libaom depend on the exports file, and set flags to pick it up when
+  # creating the dylib.
+  add_dependencies(aom generate_exports)
+
+  if(APPLE)
+    set_property(TARGET aom
+                 APPEND_STRING
+                 PROPERTY LINK_FLAGS "-exported_symbols_list ${aom_sym_file}")
+  elseif(WIN32)
+    if(NOT MSVC)
+      set_property(TARGET aom
+                   APPEND_STRING
+                   PROPERTY LINK_FLAGS "-Wl,--version-script ${aom_sym_file}")
+    else()
+      set_property(TARGET aom
+                   APPEND_STRING
+                   PROPERTY LINK_FLAGS "/DEF:${aom_sym_file}")
+    endif()
+
+    # TODO(tomfinegan): Sort out the import lib situation and flags for MSVC.
+
+  else()
+    set_property(TARGET aom
+                 APPEND_STRING
+                 PROPERTY LINK_FLAGS "-Wl,--version-script,${aom_sym_file}")
+  endif()
+endfunction()
diff --git a/libs/libaom/src/build/cmake/exports_sources.cmake b/libs/libaom/src/build/cmake/exports_sources.cmake
new file mode 100644
index 000000000..46bf001d8
--- /dev/null
+++ b/libs/libaom/src/build/cmake/exports_sources.cmake
@@ -0,0 +1,35 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_)
+  return()
+endif() # AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_
+set(AOM_BUILD_CMAKE_EXPORTS_SOURCES_CMAKE_ 1)
+
+list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_com"
+            "${AOM_ROOT}/av1/exports_com")
+
+if(CONFIG_AV1_DECODER)
+  list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_dec"
+              "${AOM_ROOT}/av1/exports_dec")
+  if(CONFIG_INSPECTION)
+    list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/av1/exports_ident")
+  endif()
+endif()
+
+if(CONFIG_AV1_ENCODER)
+  list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_enc"
+              "${AOM_ROOT}/av1/exports_enc")
+endif()
+
+if(ENABLE_TESTS)
+  list(APPEND AOM_EXPORTS_SOURCES "${AOM_ROOT}/aom/exports_test"
+              "${AOM_ROOT}/av1/exports_test")
+endif()
diff --git a/libs/libaom/src/build/cmake/generate_aom_config_templates.cmake b/libs/libaom/src/build/cmake/generate_aom_config_templates.cmake
new file mode 100644
index 000000000..529daaf02
--- /dev/null
+++ b/libs/libaom/src/build/cmake/generate_aom_config_templates.cmake
@@ -0,0 +1,92 @@
+#
+# Copyright (c) 2017, Alliance for Open Media.
All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +cmake_minimum_required(VERSION 3.5) + +string(TIMESTAMP year "%Y") +set(asm_file_header_block "\; +\; Copyright (c) ${year}, Alliance for Open Media. All rights reserved +\; +\; This source code is subject to the terms of the BSD 2 Clause License and +\; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +\; was not distributed with this source code in the LICENSE file, you can +\; obtain it at www.aomedia.org/license/software. If the Alliance for Open +\; Media Patent License 1.0 was not distributed with this source code in the +\; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +\; +") +set(h_file_header_block "/* + * Copyright (c) ${year}, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +\#ifndef AOM_CONFIG_H_ +\#define AOM_CONFIG_H_ +") +set(cmake_file_header_block "## +## Copyright (c) ${year}, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +") + +# Terminates cmake execution when $var_name is an empty string, or the variable +# name it contains does not expand to an existing directory. 
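+#
+# For example, check_directory_var(AOM_ROOT), as invoked just below, aborts
+# with a fatal error when AOM_ROOT is empty or does not name an existing
+# path.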
+function(check_directory_var var_name) + if("${var_name}" STREQUAL "") + message(FATAL_ERROR "The CMake variable ${var_name} must be defined.") + endif() + + if(NOT EXISTS "${${var_name}}") + message(FATAL_ERROR "${${var_name}} (${var_name}) missing.") + endif() +endfunction() + +check_directory_var(AOM_CONFIG_DIR) +check_directory_var(AOM_ROOT) + +set(AOM_DEFAULTS "${AOM_ROOT}/build/cmake/aom_config_defaults.cmake") +if(NOT EXISTS "${AOM_DEFAULTS}") + message( + FATAL_ERROR "Configuration default values file (${AOM_DEFAULTS}) missing.") +endif() + +include("${AOM_ROOT}/build/cmake/aom_config_defaults.cmake") +list(APPEND aom_build_vars ${AOM_DETECT_VARS} ${AOM_CONFIG_VARS}) +list(SORT aom_build_vars) + +set(aom_config_h_template "${AOM_CONFIG_DIR}/config/aom_config.h.cmake") +file(WRITE "${aom_config_h_template}" ${h_file_header_block}) +foreach(aom_var ${aom_build_vars}) + if(NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS") + file(APPEND "${aom_config_h_template}" + "\#define ${aom_var} \${${aom_var}}\n") + endif() +endforeach() +file(APPEND "${aom_config_h_template}" "\#endif // AOM_CONFIG_H_") + +set(aom_asm_config_template "${AOM_CONFIG_DIR}/config/aom_config.asm.cmake") +file(WRITE "${aom_asm_config_template}" ${asm_file_header_block}) +foreach(aom_var ${aom_build_vars}) + if(NOT "${aom_var}" STREQUAL "INLINE" + AND NOT "${aom_var}" STREQUAL "AOM_RTCD_FLAGS") + file(APPEND "${aom_asm_config_template}" "${aom_var} equ \${${aom_var}}\n") + endif() +endforeach() diff --git a/libs/libaom/src/build/cmake/generate_exports.cmake b/libs/libaom/src/build/cmake/generate_exports.cmake new file mode 100644 index 000000000..f1d15a0fa --- /dev/null +++ b/libs/libaom/src/build/cmake/generate_exports.cmake @@ -0,0 +1,66 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
+# +cmake_minimum_required(VERSION 3.5) + +set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "AOM_TARGET_SYSTEM" "AOM_SYM_FILE" + "CONFIG_AV1_DECODER" "CONFIG_AV1_ENCODER") + +foreach(arg ${REQUIRED_ARGS}) + if("${${arg}}" STREQUAL "") + message(FATAL_ERROR "${arg} must not be empty.") + endif() +endforeach() + +include("${AOM_ROOT}/build/cmake/exports_sources.cmake") + +if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + set(symbol_prefix "_") +elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC) + file(WRITE "${AOM_SYM_FILE}" "LIBRARY aom\n" "EXPORTS\n") +else() + set(symbol_suffix ";") +endif() + +set(aom_sym_file "${AOM_SYM_FILE}") + +if("${AOM_TARGET_SYSTEM}" STREQUAL "Darwin") + file(REMOVE "${aom_sym_file}") +elseif("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC) + file(WRITE "${aom_sym_file}" "LIBRARY aom\n" "EXPORTS\n") +else() + file(WRITE "${aom_sym_file}" "{\nglobal:\n") +endif() + +foreach(export_file ${AOM_EXPORTS_SOURCES}) + file(STRINGS "${export_file}" exported_file_data) + set(exported_symbols "${exported_symbols} ${exported_file_data};") + string(STRIP "${exported_symbols}" exported_symbols) +endforeach() + +foreach(exported_symbol ${exported_symbols}) + string(STRIP "${exported_symbol}" exported_symbol) + if("${AOM_TARGET_SYSTEM}" MATCHES "Windows\|MSYS" AND AOM_MSVC) + string(SUBSTRING ${exported_symbol} 0 4 export_type) + string(COMPARE EQUAL "${export_type}" "data" is_data) + if(is_data) + set(symbol_suffix " DATA") + else() + set(symbol_suffix "") + endif() + endif() + string(REGEX REPLACE "text \|data " "" "exported_symbol" "${exported_symbol}") + set(exported_symbol " ${symbol_prefix}${exported_symbol}${symbol_suffix}") + file(APPEND "${aom_sym_file}" "${exported_symbol}\n") +endforeach() + +if("${aom_sym_file}" MATCHES "ver$") + file(APPEND "${aom_sym_file}" " \nlocal:\n *;\n};") +endif() diff --git a/libs/libaom/src/build/cmake/ios-Info.plist b/libs/libaom/src/build/cmake/ios-Info.plist new file mode 100644 index 000000000..300e3e310 --- /dev/null +++ b/libs/libaom/src/build/cmake/ios-Info.plist @@ -0,0 +1,37 @@ + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + AOM + CFBundleIdentifier + org.webmproject.AOM + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + AOM + CFBundlePackageType + FMWK + CFBundleShortVersionString + ${VERSION} + CFBundleSignature + ???? + CFBundleSupportedPlatforms + + iPhoneOS + + CFBundleVersion + ${VERSION} + MinimumOSVersion + ${IOS_VERSION_MIN} + UIDeviceFamily + + 1 + 2 + + AOMFullVersion + ${FULLVERSION} + + diff --git a/libs/libaom/src/build/cmake/iosbuild.sh b/libs/libaom/src/build/cmake/iosbuild.sh new file mode 100644 index 000000000..167ece200 --- /dev/null +++ b/libs/libaom/src/build/cmake/iosbuild.sh @@ -0,0 +1,384 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This script generates 'AOM.framework'. An iOS app can encode and decode AVx +## video by including 'AOM.framework'. +## +## Run iosbuild.sh to create 'AOM.framework' in the current directory. 
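+##
+## Typical invocations, e.g. from the root of a libaom checkout (both flags
+## are parsed near the bottom of this script):
+##   $ sh build/cmake/iosbuild.sh
+##   $ sh build/cmake/iosbuild.sh --enable-shared --verbose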
+## +set -e +devnull='> /dev/null 2>&1' + +BUILD_ROOT="_iosbuild" +CONFIGURE_ARGS="--disable-docs + --disable-examples + --disable-libyuv + --disable-unit-tests" +DIST_DIR="_dist" +FRAMEWORK_DIR="AOM.framework" +FRAMEWORK_LIB="AOM.framework/AOM" +HEADER_DIR="${FRAMEWORK_DIR}/Headers/aom" +SCRIPT_DIR=$(dirname "$0") +LIBAOM_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd) +LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo) +ORIG_PWD="$(pwd)" +ARM_TARGETS="arm64-darwin-gcc + armv7-darwin-gcc + armv7s-darwin-gcc" +SIM_TARGETS="x86-iphonesimulator-gcc + x86_64-iphonesimulator-gcc" +OSX_TARGETS="x86-darwin16-gcc + x86_64-darwin16-gcc" +TARGETS="${ARM_TARGETS} ${SIM_TARGETS}" + +# Configures for the target specified by $1, and invokes make with the dist +# target using $ as the distribution output directory. +build_target() { + local target="$1" + local old_pwd="$(pwd)" + local target_specific_flags="" + + vlog "***Building target: ${target}***" + + case "${target}" in + x86-*) + target_specific_flags="--enable-pic" + vlog "Enabled PIC for ${target}" + ;; + esac + + mkdir "${target}" + cd "${target}" + # TODO(tomfinegan@google.com): switch to cmake. + eval "${LIBAOM_SOURCE_DIR}/configure" --target="${target}" \ + ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${target_specific_flags} \ + ${devnull} + export DIST_DIR + eval make dist ${devnull} + cd "${old_pwd}" + + vlog "***Done building target: ${target}***" +} + +# Returns the preprocessor symbol for the target specified by $1. +target_to_preproc_symbol() { + target="$1" + case "${target}" in + arm64-*) + echo "__aarch64__" + ;; + armv7-*) + echo "__ARM_ARCH_7A__" + ;; + armv7s-*) + echo "__ARM_ARCH_7S__" + ;; + x86-*) + echo "__i386__" + ;; + x86_64-*) + echo "__x86_64__" + ;; + *) + echo "#error ${target} unknown/unsupported" + return 1 + ;; + esac +} + +# Create a aom_config.h shim that, based on preprocessor settings for the +# current target CPU, includes the real aom_config.h for the current target. +# $1 is the list of targets. +create_aom_framework_config_shim() { + local targets="$1" + local config_file="${HEADER_DIR}/aom_config.h" + local preproc_symbol="" + local target="" + local include_guard="AOM_FRAMEWORK_HEADERS_AOM_AOM_CONFIG_H_" + + local file_header="/* + * Copyright (c) $(date +%Y), Alliance for Open Media. All rights reserved. + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* GENERATED FILE: DO NOT EDIT! */ + +#ifndef ${include_guard} +#define ${include_guard} + +#if defined" + + printf "%s" "${file_header}" > "${config_file}" + for target in ${targets}; do + preproc_symbol=$(target_to_preproc_symbol "${target}") + printf " ${preproc_symbol}\n" >> "${config_file}" + printf "#define AOM_FRAMEWORK_TARGET \"${target}\"\n" >> "${config_file}" + printf "#include \"AOM/aom/${target}/aom_config.h\"\n" >> "${config_file}" + printf "#elif defined" >> "${config_file}" + mkdir "${HEADER_DIR}/${target}" + cp -p "${BUILD_ROOT}/${target}/aom_config.h" "${HEADER_DIR}/${target}" + done + + # Consume the last line of output from the loop: We don't want it. 
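+  # Note: the empty string argument after -i is required by the BSD sed that
+  # ships with macOS; GNU sed would take -i without an argument instead.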
+ sed -i '' -e '$d' "${config_file}" + + printf "#endif\n\n" >> "${config_file}" + printf "#endif // ${include_guard}" >> "${config_file}" +} + +# Verifies that $FRAMEWORK_LIB fat library contains requested builds. +verify_framework_targets() { + local requested_cpus="" + local cpu="" + + # Extract CPU from full target name. + for target; do + cpu="${target%%-*}" + if [ "${cpu}" = "x86" ]; then + # lipo -info outputs i386 for libaom x86 targets. + cpu="i386" + fi + requested_cpus="${requested_cpus}${cpu} " + done + + # Get target CPUs present in framework library. + local targets_built=$(${LIPO} -info ${FRAMEWORK_LIB}) + + # $LIPO -info outputs a string like the following: + # Architectures in the fat file: $FRAMEWORK_LIB + # Capture only the architecture strings. + targets_built=${targets_built##*: } + + # Sort CPU strings to make the next step a simple string compare. + local actual=$(echo ${targets_built} | tr " " "\n" | sort | tr "\n" " ") + local requested=$(echo ${requested_cpus} | tr " " "\n" | sort | tr "\n" " ") + + vlog "Requested ${FRAMEWORK_LIB} CPUs: ${requested}" + vlog "Actual ${FRAMEWORK_LIB} CPUs: ${actual}" + + if [ "${requested}" != "${actual}" ]; then + elog "Actual ${FRAMEWORK_LIB} targets do not match requested target list." + elog " Requested target CPUs: ${requested}" + elog " Actual target CPUs: ${actual}" + return 1 + fi +} + +# Configures and builds each target specified by $1, and then builds +# AOM.framework. +build_framework() { + local lib_list="" + local targets="$1" + local target="" + local target_dist_dir="" + + # Clean up from previous build(s). + rm -rf "${BUILD_ROOT}" "${FRAMEWORK_DIR}" + + # Create output dirs. + mkdir -p "${BUILD_ROOT}" + mkdir -p "${HEADER_DIR}" + + cd "${BUILD_ROOT}" + + for target in ${targets}; do + build_target "${target}" + target_dist_dir="${BUILD_ROOT}/${target}/${DIST_DIR}" + if [ "${ENABLE_SHARED}" = "yes" ]; then + local suffix="dylib" + else + local suffix="a" + fi + lib_list="${lib_list} ${target_dist_dir}/lib/libaom.${suffix}" + done + + cd "${ORIG_PWD}" + + # The basic libaom API includes are all the same; just grab the most recent + # set. + cp -p "${target_dist_dir}"/include/aom/* "${HEADER_DIR}" + + # Build the fat library. + ${LIPO} -create ${lib_list} -output ${FRAMEWORK_DIR}/AOM + + # Create the aom_config.h shim that allows usage of aom_config.h from + # within AOM.framework. + create_aom_framework_config_shim "${targets}" + + # Copy in aom_version.h. + cp -p "${BUILD_ROOT}/${target}/aom_version.h" "${HEADER_DIR}" + + if [ "${ENABLE_SHARED}" = "yes" ]; then + # Adjust the dylib's name so dynamic linking in apps works as expected. + install_name_tool -id '@rpath/AOM.framework/AOM' ${FRAMEWORK_DIR}/AOM + + # Copy in Info.plist. + cat "${SCRIPT_DIR}/ios-Info.plist" \ + | sed "s/\${FULLVERSION}/${FULLVERSION}/g" \ + | sed "s/\${VERSION}/${VERSION}/g" \ + | sed "s/\${IOS_VERSION_MIN}/${IOS_VERSION_MIN}/g" \ + > "${FRAMEWORK_DIR}/Info.plist" + fi + + # Confirm AOM.framework/AOM contains the targets requested. + verify_framework_targets ${targets} + + vlog "Created fat library ${FRAMEWORK_LIB} containing:" + for lib in ${lib_list}; do + vlog " $(echo ${lib} | awk -F / '{print $2, $NF}')" + done +} + +# Trap function. Cleans up the subtree used to build all targets contained in +# $TARGETS. +cleanup() { + local res=$? 
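+  # BUILD_ROOT is a relative path, so return to the directory the build
+  # started from before removing it below.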
+ cd "${ORIG_PWD}" + + if [ $res -ne 0 ]; then + elog "build exited with error ($res)" + fi + + if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then + rm -rf "${BUILD_ROOT}" + fi +} + +print_list() { + local indent="$1" + shift + local list="$@" + for entry in ${list}; do + echo "${indent}${entry}" + done +} + +iosbuild_usage() { +cat << EOF + Usage: ${0##*/} [arguments] + --help: Display this message and exit. + --enable-shared: Build a dynamic framework for use on iOS 8 or later. + --extra-configure-args : Extra args to pass when configuring libaom. + --macosx: Uses darwin16 targets instead of iphonesimulator targets for x86 + and x86_64. Allows linking to framework when builds target MacOSX + instead of iOS. + --preserve-build-output: Do not delete the build directory. + --show-build-output: Show output from each library build. + --targets : Override default target list. Defaults: +$(print_list " " ${TARGETS}) + --test-link: Confirms all targets can be linked. Functionally identical to + passing --enable-examples via --extra-configure-args. + --verbose: Output information about the environment and each stage of the + build. +EOF +} + +elog() { + echo "${0##*/} failed because: $@" 1>&2 +} + +vlog() { + if [ "${VERBOSE}" = "yes" ]; then + echo "$@" + fi +} + +trap cleanup EXIT + +# Parse the command line. +while [ -n "$1" ]; do + case "$1" in + --extra-configure-args) + EXTRA_CONFIGURE_ARGS="$2" + shift + ;; + --help) + iosbuild_usage + exit + ;; + --enable-shared) + ENABLE_SHARED=yes + ;; + --preserve-build-output) + PRESERVE_BUILD_OUTPUT=yes + ;; + --show-build-output) + devnull= + ;; + --test-link) + EXTRA_CONFIGURE_ARGS="${EXTRA_CONFIGURE_ARGS} --enable-examples" + ;; + --targets) + TARGETS="$2" + shift + ;; + --macosx) + TARGETS="${ARM_TARGETS} ${OSX_TARGETS}" + ;; + --verbose) + VERBOSE=yes + ;; + *) + iosbuild_usage + exit 1 + ;; + esac + shift +done + +if [ "${ENABLE_SHARED}" = "yes" ]; then + CONFIGURE_ARGS="--enable-shared ${CONFIGURE_ARGS}" +fi + +FULLVERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBAOM_SOURCE_DIR}") +VERSION=$(echo "${FULLVERSION}" | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/') + +if [ "$ENABLE_SHARED" = "yes" ]; then + IOS_VERSION_OPTIONS="--enable-shared" + IOS_VERSION_MIN="8.0" +else + IOS_VERSION_OPTIONS="" + IOS_VERSION_MIN="6.0" +fi + +if [ "${VERBOSE}" = "yes" ]; then +cat << EOF + BUILD_ROOT=${BUILD_ROOT} + DIST_DIR=${DIST_DIR} + CONFIGURE_ARGS=${CONFIGURE_ARGS} + EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS} + FRAMEWORK_DIR=${FRAMEWORK_DIR} + FRAMEWORK_LIB=${FRAMEWORK_LIB} + HEADER_DIR=${HEADER_DIR} + LIBAOM_SOURCE_DIR=${LIBAOM_SOURCE_DIR} + LIPO=${LIPO} + MAKEFLAGS=${MAKEFLAGS} + ORIG_PWD=${ORIG_PWD} + PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT} + TARGETS="$(print_list "" ${TARGETS})" + ENABLE_SHARED=${ENABLE_SHARED} + OSX_TARGETS="${OSX_TARGETS}" + SIM_TARGETS="${SIM_TARGETS}" + SCRIPT_DIR="${SCRIPT_DIR}" + FULLVERSION="${FULLVERSION}" + VERSION="${VERSION}" + IOS_VERSION_MIN="${IOS_VERSION_MIN}" +EOF +fi + +build_framework "${TARGETS}" +echo "Successfully built '${FRAMEWORK_DIR}' for:" +print_list "" ${TARGETS} diff --git a/libs/libaom/src/build/cmake/msvc_runtime.cmake b/libs/libaom/src/build/cmake/msvc_runtime.cmake new file mode 100644 index 000000000..9e4cbea43 --- /dev/null +++ b/libs/libaom/src/build/cmake/msvc_runtime.cmake @@ -0,0 +1,37 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_) + return() +endif() # AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_ +set(AOM_BUILD_CMAKE_MSVC_RUNTIME_CMAKE_ 1) + +if(MSVC) + + # CMake defaults to producing code linked to the DLL MSVC runtime. That will + # not work with googletest, and isn't what we want anyway. + if(NOT "${MSVC_RUNTIME}" STREQUAL "dll") + foreach(flag_var + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL + CMAKE_C_FLAGS_RELWITHDEBINFO + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) + endif() +endif() diff --git a/libs/libaom/src/build/cmake/pkg_config.cmake b/libs/libaom/src/build/cmake/pkg_config.cmake new file mode 100644 index 000000000..c3914d79e --- /dev/null +++ b/libs/libaom/src/build/cmake/pkg_config.cmake @@ -0,0 +1,62 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +cmake_minimum_required(VERSION 3.5) + +set(REQUIRED_ARGS "AOM_ROOT" "AOM_CONFIG_DIR" "CMAKE_INSTALL_PREFIX" + "CMAKE_INSTALL_BINDIR" "CMAKE_INSTALL_INCLUDEDIR" + "CMAKE_INSTALL_LIBDIR" "CMAKE_PROJECT_NAME" + "CONFIG_MULTITHREAD" "HAVE_PTHREAD_H") + +foreach(arg ${REQUIRED_ARGS}) + if("${${arg}}" STREQUAL "") + message(FATAL_ERROR "${arg} must not be empty.") + endif() +endforeach() + +include("${AOM_ROOT}/build/cmake/util.cmake") + +extract_version_string("${AOM_CONFIG_DIR}/config/aom_version.h" aom_version) + +# Create a version string suitable for comparison using the RPM version compare +# algorithm: strip out everything after the number. +string(FIND "${aom_version}" "-" dash_pos) +if(${dash_pos} EQUAL -1) + set(package_version "${aom_version}") +else() + string(SUBSTRING "${aom_version}" 0 ${dash_pos} package_version) +endif() + +# Write pkg-config info. 
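+# For a default Unix configuration the file written below comes out roughly
+# like this (the version shown is only an illustration, and Libs.private
+# drops -lpthread when multithreading is off):
+#   prefix=/usr/local
+#   exec_prefix=${prefix}
+#   includedir=${prefix}/include
+#   libdir=${exec_prefix}/lib
+#
+#   Name: aom
+#   Description: Alliance for Open Media AV1 codec library v3.1.2.
+#   Version: 3.1.2
+#   Requires:
+#   Conflicts:
+#   Libs: -L${libdir} -laom
+#   Libs.private: -lm -lpthread
+#   Cflags: -I${includedir}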
+set(prefix "${CMAKE_INSTALL_PREFIX}")
+set(bindir "${CMAKE_INSTALL_BINDIR}")
+set(includedir "${CMAKE_INSTALL_INCLUDEDIR}")
+set(libdir "${CMAKE_INSTALL_LIBDIR}")
+set(pkgconfig_file "${AOM_CONFIG_DIR}/aom.pc")
+string(TOLOWER ${CMAKE_PROJECT_NAME} pkg_name)
+file(WRITE "${pkgconfig_file}" "# libaom pkg-config.\n")
+file(APPEND "${pkgconfig_file}" "prefix=${prefix}\n")
+file(APPEND "${pkgconfig_file}" "exec_prefix=\${prefix}\n")
+file(APPEND "${pkgconfig_file}" "includedir=\${prefix}/${includedir}\n")
+file(APPEND "${pkgconfig_file}" "libdir=\${exec_prefix}/${libdir}\n\n")
+file(APPEND "${pkgconfig_file}" "Name: ${pkg_name}\n")
+file(
+  APPEND "${pkgconfig_file}"
+  "Description: Alliance for Open Media AV1 codec library v${aom_version}.\n")
+file(APPEND "${pkgconfig_file}" "Version: ${package_version}\n")
+file(APPEND "${pkgconfig_file}" "Requires:\n")
+file(APPEND "${pkgconfig_file}" "Conflicts:\n")
+file(APPEND "${pkgconfig_file}" "Libs: -L\${libdir} -l${pkg_name}\n")
+if(CONFIG_MULTITHREAD AND HAVE_PTHREAD_H)
+  file(APPEND "${pkgconfig_file}" "Libs.private: -lm -lpthread\n")
+else()
+  file(APPEND "${pkgconfig_file}" "Libs.private: -lm\n")
+endif()
+file(APPEND "${pkgconfig_file}" "Cflags: -I\${includedir}\n")
diff --git a/libs/libaom/src/build/cmake/rtcd.pl b/libs/libaom/src/build/cmake/rtcd.pl
new file mode 100644
index 000000000..dafccdca9
--- /dev/null
+++ b/libs/libaom/src/build/cmake/rtcd.pl
@@ -0,0 +1,467 @@
+#!/usr/bin/env perl
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+no strict 'refs';
+use warnings;
+use Getopt::Long;
+Getopt::Long::Configure("auto_help") if $Getopt::Long::VERSION > 2.32;
+
+my %ALL_FUNCS = ();
+my @ALL_ARCHS;
+my @ALL_FORWARD_DECLS;
+my @REQUIRES;
+
+my %opts = ();
+my %disabled = ();
+my %required = ();
+
+my @argv;
+foreach (@ARGV) {
+  $disabled{$1} = 1, next if /--disable-(.*)/;
+  $required{$1} = 1, next if /--require-(.*)/;
+  push @argv, $_;
+}
+
+# NB: use GetOptions() instead of GetOptionsFromArray() for compatibility.
+@ARGV = @argv;
+GetOptions(
+  \%opts,
+  'arch=s',
+  'sym=s',
+  'config=s',
+);
+
+foreach my $opt (qw/arch config/) {
+  if (!defined($opts{$opt})) {
+    warn "--$opt is required!\n";
+    Getopt::Long::HelpMessage('-exit' => 1);
+  }
+}
+
+foreach my $defs_file (@ARGV) {
+  if (!-f $defs_file) {
+    warn "$defs_file: $!\n";
+    Getopt::Long::HelpMessage('-exit' => 1);
+  }
+}
+
+open CONFIG_FILE, $opts{config} or
+  die "Error opening config file '$opts{config}': $!\n";
+
+my %config = ();
+while (<CONFIG_FILE>) {
+  next if !/^#define\s+(?:CONFIG_|HAVE_)/;
+  chomp;
+  my @line_components = split /\s/;
+  scalar @line_components > 2 or
+    die "Invalid input passed to rtcd.pl via $opts{config}.";
+  # $line_components[0] = #define
+  # $line_components[1] = flag name (CONFIG_SOMETHING or HAVE_SOMETHING)
+  # $line_components[2] = flag value (0 or 1)
+  $config{$line_components[1]} = "$line_components[2]" eq "1" ? "yes" : "";
+}
+close CONFIG_FILE;
+
+#
+# Routines for the RTCD DSL to call
+#
+sub aom_config($) {
+  return (defined $config{$_[0]}) ?
+  return (defined $config{$_[0]}) ? $config{$_[0]} : "";
+}
+
+sub specialize {
+  if (@_ <= 1) {
+    die "'specialize' must be called with a function name and at least one ",
+        "architecture ('C' is implied): \n@_\n";
+  }
+  my $fn=$_[0];
+  shift;
+  foreach my $opt (@_) {
+    eval "\$${fn}_${opt}=${fn}_${opt}";
+  }
+}
+
+sub add_proto {
+  my $fn = splice(@_, -2, 1);
+  $ALL_FUNCS{$fn} = \@_;
+  specialize $fn, "c";
+}
+
+sub require {
+  foreach my $fn (keys %ALL_FUNCS) {
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+
+      # if we already have a default, then we can disable it, as we know
+      # we can do better.
+      my $best = eval "\$${fn}_default";
+      if ($best) {
+        my $best_ofn = eval "\$${best}";
+        if ($best_ofn && "$best_ofn" ne "$ofn") {
+          eval "\$${best}_link = 'false'";
+        }
+      }
+      eval "\$${fn}_default=${fn}_${opt}";
+      eval "\$${fn}_${opt}_link='true'";
+    }
+  }
+}
+
+sub forward_decls {
+  push @ALL_FORWARD_DECLS, @_;
+}
+
+#
+# Include the user's directives
+#
+foreach my $f (@ARGV) {
+  open FILE, "<", $f or die "cannot open $f: $!\n";
+  my $contents = join('', <FILE>);
+  close FILE;
+  eval $contents or warn "eval failed: $@\n";
+}
+
+#
+# Process the directives according to the command line
+#
+sub process_forward_decls() {
+  foreach (@ALL_FORWARD_DECLS) {
+    $_->();
+  }
+}
+
+sub determine_indirection {
+  aom_config("CONFIG_RUNTIME_CPU_DETECT") eq "yes" or &require(@ALL_ARCHS);
+  foreach my $fn (keys %ALL_FUNCS) {
+    my $n = "";
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+      my $link = eval "\$${fn}_${opt}_link";
+      next if $link && $link eq "false";
+      $n .= "x";
+    }
+    if ($n eq "x") {
+      eval "\$${fn}_indirect = 'false'";
+    } else {
+      eval "\$${fn}_indirect = 'true'";
+    }
+  }
+}
+
+sub declare_function_pointers {
+  foreach my $fn (sort keys %ALL_FUNCS) {
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    foreach my $opt (@_) {
+      my $ofn = eval "\$${fn}_${opt}";
+      next if !$ofn;
+      print "$rtyp ${ofn}($args);\n";
+    }
+    if (eval "\$${fn}_indirect" eq "false") {
+      print "#define ${fn} ${dfn}\n";
+    } else {
+      print "RTCD_EXTERN $rtyp (*${fn})($args);\n";
+    }
+    print "\n";
+  }
+}
+
+sub set_function_pointers {
+  foreach my $fn (sort keys %ALL_FUNCS) {
+    my @val = @{$ALL_FUNCS{$fn}};
+    my $args = pop @val;
+    my $rtyp = "@val";
+    my $dfn = eval "\$${fn}_default";
+    $dfn = eval "\$${dfn}";
+    if (eval "\$${fn}_indirect" eq "true") {
+      print "  $fn = $dfn;\n";
+      foreach my $opt (@_) {
+        my $ofn = eval "\$${fn}_${opt}";
+        next if !$ofn;
+        next if "$ofn" eq "$dfn";
+        my $link = eval "\$${fn}_${opt}_link";
+        next if $link && $link eq "false";
+        my $cond = eval "\$have_${opt}";
+        print "  if (${cond}) $fn = $ofn;\n"
+      }
+    }
+  }
+}
+
+sub filter {
+  my @filtered;
+  foreach (@_) { push @filtered, $_ unless $disabled{$_}; }
+  return @filtered;
+}
+
+#
+# Helper functions for generating the arch specific RTCD files
+#
+sub common_top() {
+  my $include_guard = uc($opts{sym})."_H_";
+  print < \$version_data,
+  'version_filename=s' => \$version_filename) or
+  die("Invalid arg(s): $!");
+
+if (!defined $version_data || length($version_data) == 0 ||
+    !defined $version_filename || length($version_filename) == 0) {
+  die("--version_data and --version_filename are required.");
+}
+
+# Determine if $version_data is a filename or a git tag/description.
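+# For example (illustrative values only): a bare tag "v2.0.0" yields
+# $version_string "2.0.0" with an empty VERSION_EXTRA, while a full git
+# description such as "v2.0.0-140-gf60644c" splits into version "2.0.0"
+# plus VERSION_EXTRA "140-gf60644c".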
+my $version_string; +chomp($version_data); +if (-r $version_data) { + # $version_data is the path to the CHANGELOG. Parse the most recent version. + my $changelog_filename = $version_data; + open(my $changelog_file, '<', $changelog_filename) or + die("Unable to open CHANGELOG @ $changelog_filename: $!."); + + while (my $line = <$changelog_file>) { + my @split_line = split(" ", $line, 3); + next if @split_line < 2; + $version_string = $split_line[1]; + last if substr($version_string, 0, 1) eq "v"; + } + close($changelog_file); +} else { + # $version_data is either a tag name or a full git description, one of: + # tagName OR tagName-commitsSinceTag-shortCommitHash + # In either case we want the first element of the array returned by split. + $version_string = (split("-", $version_data))[0]; + $git_desc = $version_data; +} + +if (substr($version_string, 0, 1) eq "v") { + $version_string = substr($version_string, 1); +} + +my @version_components = split('\.', $version_string, 4); +my $version_major = $version_components[0]; +my $version_minor = $version_components[1]; +my $version_patch = $version_components[2]; + +my $version_extra = ""; +if (length($git_desc) > 0) { + my @git_desc_components = split('-', $git_desc, 2); + $version_extra = $git_desc_components[1]; +} + +open(my $version_file, '>', $version_filename) or + die("Cannot open $version_filename: $!"); + +my $version_packed = "((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))"; +my $year = (localtime)[5] + 1900; +my $lic_block = << "EOF"; +/* + * Copyright (c) $year, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +EOF + +select $version_file; +if (length($git_desc)) { + print << "EOF"; +$lic_block +#define VERSION_MAJOR $version_major +#define VERSION_MINOR $version_minor +#define VERSION_PATCH $version_patch +#define VERSION_EXTRA \"$version_extra\" +#define VERSION_PACKED \\ + $version_packed +#define VERSION_STRING_NOSP \"$git_desc\" +#define VERSION_STRING \" $git_desc\" +EOF +} else { + print << "EOF"; +$lic_block +#define VERSION_MAJOR $version_major +#define VERSION_MINOR $version_minor +#define VERSION_PATCH $version_patch +#define VERSION_EXTRA \"$version_extra\" +#define VERSION_PACKED \\ + $version_packed +#define VERSION_STRING_NOSP \"v$version_string\" +#define VERSION_STRING \" v$version_string\" +EOF +} +close($version_file); diff --git a/libs/libaom/src/codereview.settings b/libs/libaom/src/codereview.settings new file mode 100644 index 000000000..185e9344c --- /dev/null +++ b/libs/libaom/src/codereview.settings @@ -0,0 +1,4 @@ +# This file is used by git cl to get repository specific information. +GERRIT_HOST: True +CODE_REVIEW_SERVER: aomedia-review.googlesource.com +GERRIT_SQUASH_UPLOADS: False diff --git a/libs/libaom/src/common/args.c b/libs/libaom/src/common/args.c new file mode 100644 index 000000000..ec2a86353 --- /dev/null +++ b/libs/libaom/src/common/args.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/args.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/msvc.h"
+#include "aom/aom_codec.h"
+
+#if defined(__GNUC__) && __GNUC__
+extern void die(const char *fmt, ...) __attribute__((noreturn));
+#else
+extern void die(const char *fmt, ...);
+#endif
+
+struct arg arg_init(char **argv) {
+  struct arg a;
+
+  a.argv = argv;
+  a.argv_step = 1;
+  a.name = NULL;
+  a.val = NULL;
+  a.def = NULL;
+  return a;
+}
+
+char *ignore_front_spaces(const char *str) {
+  while (str[0] == ' ' || str[0] == '\t') ++str;
+  return (char *)str;
+}
+
+void ignore_end_spaces(char *str) {
+  char *end = str + strlen(str);
+  while (end > str && (end[0] == ' ' || end[0] == '\t' || end[0] == '\n' ||
+                       end[0] == '\r' || end[0] == '\0'))
+    --end;
+  if (end >= str) end[1] = '\0';
+}
+
+static const char kSbSizeWarningString[] =
+    "super_block_size has to be 64 or 128.";
+static const char kMinpartWarningString[] =
+    "min_partition_size has to be smaller or equal to max_partition_size.";
+static const char kMaxpartWarningString[] =
+    "max_partition_size has to be smaller or equal to super_block_size.";
+
+int parse_cfg(const char *file, cfg_options_t *config) {
+  char line[1024 * 10];
+  FILE *f = fopen(file, "r");
+  if (!f) return 1;
+
+#define GET_PARAMS(field) \
+  if (strcmp(left, #field) == 0) { \
+    config->field = atoi(right); \
+    continue; \
+  }
+
+  while (fgets(line, sizeof(line) - 1, f)) {
+    char *actual_line = ignore_front_spaces(line);
+    char *left, *right, *comment;
+    size_t length = strlen(actual_line);
+
+    if (length == 0 || actual_line[0] == '#') continue;
+    right = strchr(actual_line, '=');
+    if (right == NULL) continue;
+    right[0] = '\0';
+
+    left = ignore_front_spaces(actual_line);
+    right = ignore_front_spaces(right + 1);
+
+    comment = strchr(right, '#');
+    if (comment != NULL) comment[0] = '\0';
+
+    ignore_end_spaces(left);
+    ignore_end_spaces(right);
+
+    GET_PARAMS(super_block_size);
+    GET_PARAMS(max_partition_size);
+    GET_PARAMS(min_partition_size);
+    GET_PARAMS(disable_ab_partition_type);
+    GET_PARAMS(disable_rect_partition_type);
+    GET_PARAMS(disable_1to4_partition_type);
+    GET_PARAMS(disable_flip_idtx);
+    GET_PARAMS(disable_cdef);
+    GET_PARAMS(disable_lr);
+    GET_PARAMS(disable_obmc);
+    GET_PARAMS(disable_warp_motion);
+    GET_PARAMS(disable_global_motion);
+    GET_PARAMS(disable_dist_wtd_comp);
+    GET_PARAMS(disable_diff_wtd_comp);
+    GET_PARAMS(disable_inter_intra_comp);
+    GET_PARAMS(disable_masked_comp);
+    GET_PARAMS(disable_one_sided_comp);
+    GET_PARAMS(disable_palette);
+    GET_PARAMS(disable_intrabc);
+    GET_PARAMS(disable_cfl);
+    GET_PARAMS(disable_smooth_intra);
+    GET_PARAMS(disable_filter_intra);
+    GET_PARAMS(disable_dual_filter);
+    GET_PARAMS(disable_intra_angle_delta);
+    GET_PARAMS(disable_intra_edge_filter);
+    GET_PARAMS(disable_tx_64x64);
+    GET_PARAMS(disable_smooth_inter_intra);
+    GET_PARAMS(disable_inter_inter_wedge);
+    GET_PARAMS(disable_inter_intra_wedge);
+    GET_PARAMS(disable_paeth_intra);
+    GET_PARAMS(disable_trellis_quant);
+    GET_PARAMS(disable_ref_frame_mv);
+
GET_PARAMS(reduced_reference_set); + GET_PARAMS(reduced_tx_type_set); + + fprintf(stderr, "\nInvalid parameter: %s", left); + exit(-1); + } + + if (config->super_block_size != 128 && config->super_block_size != 64) { + fprintf(stderr, "\n%s", kSbSizeWarningString); + exit(-1); + } + if (config->min_partition_size > config->max_partition_size) { + fprintf(stderr, "\n%s", kMinpartWarningString); + exit(-1); + } + if (config->max_partition_size > config->super_block_size) { + fprintf(stderr, "\n%s", kMaxpartWarningString); + exit(-1); + } + + fclose(f); + config->init_by_cfg_file = 1; + + return 0; +} + +int arg_match(struct arg *arg_, const struct arg_def *def, char **argv) { + struct arg arg; + + if (!argv[0] || argv[0][0] != '-') return 0; + + arg = arg_init(argv); + + if (def->short_name && strlen(arg.argv[0]) == strlen(def->short_name) + 1 && + !strcmp(arg.argv[0] + 1, def->short_name)) { + arg.name = arg.argv[0] + 1; + arg.val = def->has_val ? arg.argv[1] : NULL; + arg.argv_step = def->has_val ? 2 : 1; + } else if (def->long_name) { + const size_t name_len = strlen(def->long_name); + + if (strlen(arg.argv[0]) >= name_len + 2 && arg.argv[0][1] == '-' && + !strncmp(arg.argv[0] + 2, def->long_name, name_len) && + (arg.argv[0][name_len + 2] == '=' || + arg.argv[0][name_len + 2] == '\0')) { + arg.name = arg.argv[0] + 2; + arg.val = arg.name[name_len] == '=' ? arg.name + name_len + 1 : NULL; + arg.argv_step = 1; + } + } + + if (arg.name && !arg.val && def->has_val) + die("Error: option %s requires argument.\n", arg.name); + + if (arg.name && arg.val && !def->has_val) + die("Error: option %s requires no argument.\n", arg.name); + + if (arg.name && (arg.val || !def->has_val)) { + arg.def = def; + *arg_ = arg; + return 1; + } + + return 0; +} + +const char *arg_next(struct arg *arg) { + if (arg->argv[0]) arg->argv += arg->argv_step; + + return *arg->argv; +} + +char **argv_dup(int argc, const char **argv) { + char **new_argv = malloc((argc + 1) * sizeof(*argv)); + + memcpy(new_argv, argv, argc * sizeof(*argv)); + new_argv[argc] = NULL; + return new_argv; +} + +void arg_show_usage(FILE *fp, const struct arg_def *const *defs) { + char option_text[40] = { 0 }; + + for (; *defs; defs++) { + const struct arg_def *def = *defs; + char *short_val = def->has_val ? " " : ""; + char *long_val = def->has_val ? "=" : ""; + + if (def->short_name && def->long_name) { + char *comma = def->has_val ? "," : ", "; + + snprintf(option_text, 37, "-%s%s%s --%s%6s", def->short_name, short_val, + comma, def->long_name, long_val); + } else if (def->short_name) + snprintf(option_text, 37, "-%s%s", def->short_name, short_val); + else if (def->long_name) + snprintf(option_text, 37, " --%s%s", def->long_name, long_val); + + fprintf(fp, " %-37s\t%s\n", option_text, def->desc); + + if (def->enums) { + const struct arg_enum_list *listptr; + + fprintf(fp, " %-37s\t ", ""); + + for (listptr = def->enums; listptr->name; listptr++) + fprintf(fp, "%s%s", listptr->name, listptr[1].name ? 
", " : "\n"); + } + } +} + +unsigned int arg_parse_uint(const struct arg *arg) { + char *endptr; + const unsigned long rawval = strtoul(arg->val, &endptr, 10); // NOLINT + + if (arg->val[0] != '\0' && endptr[0] == '\0') { + if (rawval <= UINT_MAX) return (unsigned int)rawval; + + die("Option %s: Value %lu out of range for unsigned int\n", arg->name, + rawval); + } + + die("Option %s: Invalid character '%c'\n", arg->name, *endptr); + return 0; +} + +int arg_parse_int(const struct arg *arg) { + char *endptr; + const long rawval = strtol(arg->val, &endptr, 10); // NOLINT + + if (arg->val[0] != '\0' && endptr[0] == '\0') { + if (rawval >= INT_MIN && rawval <= INT_MAX) return (int)rawval; + + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } + + die("Option %s: Invalid character '%c'\n", arg->name, *endptr); + return 0; +} + +struct aom_rational arg_parse_rational(const struct arg *arg) { + long int rawval; + char *endptr; + struct aom_rational rat; + + /* parse numerator */ + rawval = strtol(arg->val, &endptr, 10); + + if (arg->val[0] != '\0' && endptr[0] == '/') { + if (rawval >= INT_MIN && rawval <= INT_MAX) + rat.num = (int)rawval; + else + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } else + die("Option %s: Expected / at '%c'\n", arg->name, *endptr); + + /* parse denominator */ + rawval = strtol(endptr + 1, &endptr, 10); + + if (arg->val[0] != '\0' && endptr[0] == '\0') { + if (rawval >= INT_MIN && rawval <= INT_MAX) + rat.den = (int)rawval; + else + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } else + die("Option %s: Invalid character '%c'\n", arg->name, *endptr); + + return rat; +} + +int arg_parse_enum(const struct arg *arg) { + const struct arg_enum_list *listptr; + long int rawval; + char *endptr; + + /* First see if the value can be parsed as a raw value */ + rawval = strtol(arg->val, &endptr, 10); + if (arg->val[0] != '\0' && endptr[0] == '\0') { + /* Got a raw value, make sure it's valid */ + for (listptr = arg->def->enums; listptr->name; listptr++) + if (listptr->val == rawval) return (int)rawval; + } + + /* Next see if it can be parsed as a string */ + for (listptr = arg->def->enums; listptr->name; listptr++) + if (!strcmp(arg->val, listptr->name)) return listptr->val; + + die("Option %s: Invalid value '%s'\n", arg->name, arg->val); + return 0; +} + +int arg_parse_enum_or_int(const struct arg *arg) { + if (arg->def->enums) return arg_parse_enum(arg); + return arg_parse_int(arg); +} + +// parse a comma separated list of at most n integers +// return the number of elements in the list +int arg_parse_list(const struct arg *arg, int *list, int n) { + const char *ptr = arg->val; + char *endptr; + int i = 0; + + while (ptr[0] != '\0') { + int32_t rawval = (int32_t)strtol(ptr, &endptr, 10); + if (rawval < INT_MIN || rawval > INT_MAX) { + die("Option %s: Value %ld out of range for signed int\n", arg->name, + rawval); + } else if (i >= n) { + die("Option %s: List has more than %d entries\n", arg->name, n); + } else if (*endptr == ',') { + endptr++; + } else if (*endptr != '\0') { + die("Option %s: Bad list separator '%c'\n", arg->name, *endptr); + } + list[i++] = (int)rawval; + ptr = endptr; + } + return i; +} diff --git a/libs/libaom/src/common/args.h b/libs/libaom/src/common/args.h new file mode 100644 index 000000000..286f7dd1a --- /dev/null +++ b/libs/libaom/src/common/args.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_COMMON_ARGS_H_
+#define AOM_COMMON_ARGS_H_
+#include <stdio.h>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct arg {
+  char **argv;
+  const char *name;
+  const char *val;
+  unsigned int argv_step;
+  const struct arg_def *def;
+};
+
+struct arg_enum_list {
+  const char *name;
+  int val;
+};
+#define ARG_ENUM_LIST_END \
+  { 0 }
+
+typedef struct arg_def {
+  const char *short_name;
+  const char *long_name;
+  int has_val;
+  const char *desc;
+  const struct arg_enum_list *enums;
+} arg_def_t;
+#define ARG_DEF(s, l, v, d) \
+  { s, l, v, d, NULL }
+#define ARG_DEF_ENUM(s, l, v, d, e) \
+  { s, l, v, d, e }
+#define ARG_DEF_LIST_END \
+  { 0 }
+
+struct arg arg_init(char **argv);
+int arg_match(struct arg *arg_, const struct arg_def *def, char **argv);
+char *ignore_front_spaces(const char *str);
+void ignore_end_spaces(char *str);
+int parse_cfg(const char *file, cfg_options_t *config);
+const char *arg_next(struct arg *arg);
+void arg_show_usage(FILE *fp, const struct arg_def *const *defs);
+char **argv_dup(int argc, const char **argv);
+
+unsigned int arg_parse_uint(const struct arg *arg);
+int arg_parse_int(const struct arg *arg);
+struct aom_rational arg_parse_rational(const struct arg *arg);
+int arg_parse_enum(const struct arg *arg);
+int arg_parse_enum_or_int(const struct arg *arg);
+int arg_parse_list(const struct arg *arg, int *list, int n);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_COMMON_ARGS_H_
diff --git a/libs/libaom/src/common/av1_config.c b/libs/libaom/src/common/av1_config.c
new file mode 100644
index 000000000..9f5b02015
--- /dev/null
+++ b/libs/libaom/src/common/av1_config.c
@@ -0,0 +1,511 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdio.h>
+#include <string.h>
+
+#include "aom/aom_image.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "av1/common/obu_util.h"
+#include "common/av1_config.h"
+#include "config/aom_config.h"
+
+// Helper macros to reduce verbosity required to check for read errors.
+//
+// Note that when using these macros, even single line if statements should use
+// curly braces to avoid unexpected behavior because all but the
+// AV1C_POP_ERROR_HANDLER_DATA() macro consist of multiple statements.
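+//
+// An illustrative example of the hazard (not part of the original source):
+//
+//   if (condition) AV1C_READ_BIT_OR_RETURN_ERROR(flag);
+//
+// expands to a declaration plus a separate do/while statement, so only the
+// declaration would be guarded by the if; wrapping such uses in braces keeps
+// the whole expansion inside the conditional.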
+#define AV1C_READ_BIT_OR_RETURN_ERROR(field) \ + int field = 0; \ + do { \ + field = aom_rb_read_bit(reader); \ + if (result == -1) { \ + fprintf(stderr, \ + "av1c: Error reading bit for " #field ", value=%d result=%d.\n", \ + field, result); \ + return -1; \ + } \ + } while (0) + +#define AV1C_READ_BITS_OR_RETURN_ERROR(field, length) \ + int field = 0; \ + do { \ + field = aom_rb_read_literal(reader, (length)); \ + if (result == -1) { \ + fprintf(stderr, \ + "av1c: Could not read bits for " #field \ + ", value=%d result=%d.\n", \ + field, result); \ + return -1; \ + } \ + } while (0) + +// Helper macros for setting/restoring the error handler data in +// aom_read_bit_buffer. +#define AV1C_PUSH_ERROR_HANDLER_DATA(new_data) \ + void *original_error_handler_data = NULL; \ + do { \ + original_error_handler_data = reader->error_handler_data; \ + reader->error_handler_data = &new_data; \ + } while (0) + +#define AV1C_POP_ERROR_HANDLER_DATA() \ + do { \ + reader->error_handler_data = original_error_handler_data; \ + } while (0) + +static const size_t kAv1cSize = 4; + +static void bitreader_error_handler(void *data) { + int *error_val = (int *)data; + *error_val = -1; +} + +// Parse the AV1 timing_info() structure: +// timing_info( ) { +// num_units_in_display_tick f(32) +// time_scale f(32) +// equal_picture_interval f(1) +// if (equal_picture_interval) +// num_ticks_per_picture_minus_1 uvlc() +// } +static int parse_timing_info(struct aom_read_bit_buffer *reader) { + int result = 0; + AV1C_PUSH_ERROR_HANDLER_DATA(result); + + AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_display_tick, 32); + AV1C_READ_BITS_OR_RETURN_ERROR(time_scale, 32); + + AV1C_READ_BIT_OR_RETURN_ERROR(equal_picture_interval); + if (equal_picture_interval) { + uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(reader); + if (result == -1) { + fprintf(stderr, + "av1c: Could not read bits for " + "num_ticks_per_picture_minus_1, value=%u.\n", + num_ticks_per_picture_minus_1); + return result; + } + } + + AV1C_POP_ERROR_HANDLER_DATA(); + return result; +} + +// Parse the AV1 decoder_model_info() structure: +// decoder_model_info( ) { +// buffer_delay_length_minus_1 f(5) +// num_units_in_decoding_tick f(32) +// buffer_removal_time_length_minus_1 f(5) +// frame_presentation_time_length_minus_1 f(5) +// } +// +// Returns -1 upon failure, or the value of buffer_delay_length_minus_1 + 1. 
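+// Since buffer_delay_length_minus_1 is a 5-bit field (0..31), a successful
+// return value always lies in 1..32 and cannot collide with the -1 error
+// value.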
+static int parse_decoder_model_info(struct aom_read_bit_buffer *reader) {
+  int result = 0;
+  AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+  AV1C_READ_BITS_OR_RETURN_ERROR(buffer_delay_length_minus_1, 5);
+  AV1C_READ_BITS_OR_RETURN_ERROR(num_units_in_decoding_tick, 32);
+  AV1C_READ_BITS_OR_RETURN_ERROR(buffer_removal_time_length_minus_1, 5);
+  AV1C_READ_BITS_OR_RETURN_ERROR(frame_presentation_time_length_minus_1, 5);
+
+  AV1C_POP_ERROR_HANDLER_DATA();
+  return buffer_delay_length_minus_1 + 1;
+}
+
+// Parse the AV1 operating_parameters_info() structure:
+//   operating_parameters_info( op ) {
+//     n = buffer_delay_length_minus_1 + 1
+//     decoder_buffer_delay[ op ] f(n)
+//     encoder_buffer_delay[ op ] f(n)
+//     low_delay_mode_flag[ op ] f(1)
+//   }
+static int parse_operating_parameters_info(struct aom_read_bit_buffer *reader,
+                                           int buffer_delay_length_minus_1) {
+  int result = 0;
+  AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+  const int buffer_delay_length = buffer_delay_length_minus_1 + 1;
+  AV1C_READ_BITS_OR_RETURN_ERROR(decoder_buffer_delay, buffer_delay_length);
+  AV1C_READ_BITS_OR_RETURN_ERROR(encoder_buffer_delay, buffer_delay_length);
+  AV1C_READ_BIT_OR_RETURN_ERROR(low_delay_mode_flag);
+
+  AV1C_POP_ERROR_HANDLER_DATA();
+  return result;
+}
+
+// Parse the AV1 color_config() structure. See:
+// https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=44
+static int parse_color_config(struct aom_read_bit_buffer *reader,
+                              Av1Config *config) {
+  int result = 0;
+  AV1C_PUSH_ERROR_HANDLER_DATA(result);
+
+  AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth);
+  config->high_bitdepth = high_bitdepth;
+
+  int bit_depth = 0;
+  if (config->seq_profile == 2 && config->high_bitdepth) {
+    AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit);
+    config->twelve_bit = twelve_bit;
+    bit_depth = config->twelve_bit ? 12 : 10;
+  } else {
+    bit_depth = config->high_bitdepth ?
10 : 8; + } + + if (config->seq_profile != 1) { + AV1C_READ_BIT_OR_RETURN_ERROR(mono_chrome); + config->monochrome = mono_chrome; + } + + int color_primaries = AOM_CICP_CP_UNSPECIFIED; + int transfer_characteristics = AOM_CICP_TC_UNSPECIFIED; + int matrix_coefficients = AOM_CICP_MC_UNSPECIFIED; + + AV1C_READ_BIT_OR_RETURN_ERROR(color_description_present_flag); + if (color_description_present_flag) { + AV1C_READ_BITS_OR_RETURN_ERROR(color_primaries_val, 8); + color_primaries = color_primaries_val; + AV1C_READ_BITS_OR_RETURN_ERROR(transfer_characteristics_val, 8); + transfer_characteristics = transfer_characteristics_val; + AV1C_READ_BITS_OR_RETURN_ERROR(matrix_coefficients_val, 8); + matrix_coefficients = matrix_coefficients_val; + } + + if (config->monochrome) { + AV1C_READ_BIT_OR_RETURN_ERROR(color_range); + config->chroma_subsampling_x = 1; + config->chroma_subsampling_y = 1; + } else if (color_primaries == AOM_CICP_CP_BT_709 && + transfer_characteristics == AOM_CICP_TC_SRGB && + matrix_coefficients == AOM_CICP_MC_IDENTITY) { + config->chroma_subsampling_x = 0; + config->chroma_subsampling_y = 0; + } else { + AV1C_READ_BIT_OR_RETURN_ERROR(color_range); + if (config->seq_profile == 0) { + config->chroma_subsampling_x = 1; + config->chroma_subsampling_y = 1; + } else if (config->seq_profile == 1) { + config->chroma_subsampling_x = 0; + config->chroma_subsampling_y = 0; + } else { + if (bit_depth == 12) { + AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_x); + config->chroma_subsampling_x = subsampling_x; + if (subsampling_x) { + AV1C_READ_BIT_OR_RETURN_ERROR(subsampling_y); + config->chroma_subsampling_y = subsampling_y; + } else { + config->chroma_subsampling_y = 0; + } + } else { + config->chroma_subsampling_x = 1; + config->chroma_subsampling_y = 0; + } + } + + if (config->chroma_subsampling_x && config->chroma_subsampling_y) { + AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2); + config->chroma_sample_position = chroma_sample_position; + } + } + + if (!config->monochrome) { + AV1C_READ_BIT_OR_RETURN_ERROR(separate_uv_delta_q); + } + + AV1C_POP_ERROR_HANDLER_DATA(); + return result; +} + +// Parse AV1 Sequence Header OBU. See: +// https://aomediacodec.github.io/av1-spec/av1-spec.pdf#page=41 +static int parse_sequence_header(const uint8_t *const buffer, size_t length, + Av1Config *config) { + int result = 0; + // The reader instance is local to this function, but a pointer to the + // reader instance is used within this function and throughout this file to + // allow use of the helper macros that reduce parse error checking verbosity. 
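+  // The initializer below fills, in order: the buffer start, the buffer end,
+  // the current bit offset, the error-handler data pointer (the local
+  // 'result' flag), and the error callback (field order per struct
+  // aom_read_bit_buffer in aom_dsp/bitreader_buffer.h).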
+ struct aom_read_bit_buffer reader_instance = { buffer, buffer + length, 0, + &result, + bitreader_error_handler }; + struct aom_read_bit_buffer *reader = &reader_instance; + + AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3); + config->seq_profile = seq_profile; + AV1C_READ_BIT_OR_RETURN_ERROR(still_picture); + AV1C_READ_BIT_OR_RETURN_ERROR(reduced_still_picture_header); + if (reduced_still_picture_header) { + config->initial_presentation_delay_present = 0; + AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5); + config->seq_level_idx_0 = seq_level_idx_0; + config->seq_tier_0 = 0; + } else { + int has_decoder_model = 0; + int buffer_delay_length = 0; + + AV1C_READ_BIT_OR_RETURN_ERROR(timing_info_present_flag); + if (timing_info_present_flag) { + if (parse_timing_info(reader) != 0) return -1; + + AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_info_present_flag); + if (decoder_model_info_present_flag && + (buffer_delay_length = parse_decoder_model_info(reader)) == -1) { + return -1; + } + has_decoder_model = 1; + } + + AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present); + config->initial_presentation_delay_present = + initial_presentation_delay_present; + + AV1C_READ_BITS_OR_RETURN_ERROR(operating_points_cnt_minus_1, 5); + const int num_operating_points = operating_points_cnt_minus_1 + 1; + + for (int op_index = 0; op_index < num_operating_points; ++op_index) { + AV1C_READ_BITS_OR_RETURN_ERROR(operating_point_idc, 12); + AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx, 5); + + int seq_tier = 0; + if (seq_level_idx > 7) { + AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_this_op); + seq_tier = seq_tier_this_op; + } + + if (has_decoder_model) { + AV1C_READ_BIT_OR_RETURN_ERROR(decoder_model_present_for_op); + if (decoder_model_present_for_op) { + if (parse_operating_parameters_info(reader, buffer_delay_length) == + -1) { + return -1; + } + } + } + + if (config->initial_presentation_delay_present) { + // Skip the initial presentation delay bits if present since this + // function has no access to the data required to properly set the + // field. + AV1C_READ_BIT_OR_RETURN_ERROR( + initial_presentation_delay_present_for_this_op); + if (initial_presentation_delay_present_for_this_op) { + AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_1, 4); + } + } + + if (op_index == 0) { + // Av1Config needs only the values from the first operating point. 
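+      // For example, even when operating_points_cnt_minus_1 is nonzero, the
+      // level and tier stored below come from op_index 0 only; later
+      // operating points are parsed solely to keep the bit reader in sync.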
+ config->seq_level_idx_0 = seq_level_idx; + config->seq_tier_0 = seq_tier; + config->initial_presentation_delay_present = 0; + config->initial_presentation_delay_minus_one = 0; + } + } + } + + AV1C_READ_BITS_OR_RETURN_ERROR(frame_width_bits_minus_1, 4); + AV1C_READ_BITS_OR_RETURN_ERROR(frame_height_bits_minus_1, 4); + AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_width_minus_1, + frame_width_bits_minus_1 + 1); + AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_height_minus_1, + frame_height_bits_minus_1 + 1); + + uint8_t frame_id_numbers_present = 0; + if (!reduced_still_picture_header) { + AV1C_READ_BIT_OR_RETURN_ERROR(frame_id_numbers_present_flag); + frame_id_numbers_present = frame_id_numbers_present_flag; + } + + if (frame_id_numbers_present) { + AV1C_READ_BITS_OR_RETURN_ERROR(delta_frame_id_length_minus_2, 4); + AV1C_READ_BITS_OR_RETURN_ERROR(additional_frame_id_length_minus_1, 3); + } + + AV1C_READ_BIT_OR_RETURN_ERROR(use_128x128_superblock); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_filter_intra); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_intra_edge_filter); + + if (!reduced_still_picture_header) { + AV1C_READ_BIT_OR_RETURN_ERROR(enable_interintra_compound); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_masked_compound); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_warped_motion); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_dual_filter); + + AV1C_READ_BIT_OR_RETURN_ERROR(enable_order_hint); + if (enable_order_hint) { + AV1C_READ_BIT_OR_RETURN_ERROR(enable_dist_wtd_comp); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_ref_frame_mvs); + } + + const int SELECT_SCREEN_CONTENT_TOOLS = 2; + int seq_force_screen_content_tools = SELECT_SCREEN_CONTENT_TOOLS; + AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_screen_content_tools); + if (!seq_choose_screen_content_tools) { + AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_screen_content_tools_val); + seq_force_screen_content_tools = seq_force_screen_content_tools_val; + } + + if (seq_force_screen_content_tools > 0) { + AV1C_READ_BIT_OR_RETURN_ERROR(seq_choose_integer_mv); + + if (!seq_choose_integer_mv) { + AV1C_READ_BIT_OR_RETURN_ERROR(seq_force_integer_mv); + } + } + + if (enable_order_hint) { + AV1C_READ_BITS_OR_RETURN_ERROR(order_hint_bits_minus_1, 3); + } + } + + AV1C_READ_BIT_OR_RETURN_ERROR(enable_superres); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_cdef); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_restoration); + + if (parse_color_config(reader, config) != 0) { + fprintf(stderr, "av1c: color_config() parse failed.\n"); + return -1; + } + + AV1C_READ_BIT_OR_RETURN_ERROR(film_grain_params_present); + return 0; +} + +int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb, + Av1Config *config) { + if (!buffer || length == 0 || !config) { + return -1; + } + + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + + size_t sequence_header_length = 0; + size_t obu_header_length = 0; + if (aom_read_obu_header_and_size(buffer, length, is_annexb, &obu_header, + &sequence_header_length, + &obu_header_length) != AOM_CODEC_OK || + obu_header.type != OBU_SEQUENCE_HEADER || + sequence_header_length + obu_header_length > length) { + return -1; + } + + memset(config, 0, sizeof(*config)); + config->marker = 1; + config->version = 1; + return parse_sequence_header(buffer + obu_header_length, + sequence_header_length, config); +} + +int read_av1config(const uint8_t *buffer, size_t buffer_length, + size_t *bytes_read, Av1Config *config) { + if (!buffer || buffer_length < kAv1cSize || !bytes_read || !config) return -1; + + *bytes_read = 0; + + int result = 0; + struct 
aom_read_bit_buffer reader_instance = { buffer, buffer + buffer_length, + 0, &result, + bitreader_error_handler }; + struct aom_read_bit_buffer *reader = &reader_instance; + + memset(config, 0, sizeof(*config)); + + AV1C_READ_BIT_OR_RETURN_ERROR(marker); + config->marker = marker; + + AV1C_READ_BITS_OR_RETURN_ERROR(version, 7); + config->version = version; + + AV1C_READ_BITS_OR_RETURN_ERROR(seq_profile, 3); + config->seq_profile = seq_profile; + + AV1C_READ_BITS_OR_RETURN_ERROR(seq_level_idx_0, 5); + config->seq_level_idx_0 = seq_level_idx_0; + + AV1C_READ_BIT_OR_RETURN_ERROR(seq_tier_0); + config->seq_tier_0 = seq_tier_0; + + AV1C_READ_BIT_OR_RETURN_ERROR(high_bitdepth); + config->high_bitdepth = high_bitdepth; + + AV1C_READ_BIT_OR_RETURN_ERROR(twelve_bit); + config->twelve_bit = twelve_bit; + + AV1C_READ_BIT_OR_RETURN_ERROR(monochrome); + config->monochrome = monochrome; + + AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_x); + config->chroma_subsampling_x = chroma_subsampling_x; + + AV1C_READ_BIT_OR_RETURN_ERROR(chroma_subsampling_y); + config->chroma_subsampling_y = chroma_subsampling_y; + + AV1C_READ_BITS_OR_RETURN_ERROR(chroma_sample_position, 2); + config->chroma_sample_position = chroma_sample_position; + + AV1C_READ_BITS_OR_RETURN_ERROR(reserved, 3); + + AV1C_READ_BIT_OR_RETURN_ERROR(initial_presentation_delay_present); + config->initial_presentation_delay_present = + initial_presentation_delay_present; + + AV1C_READ_BITS_OR_RETURN_ERROR(initial_presentation_delay_minus_one, 4); + config->initial_presentation_delay_minus_one = + initial_presentation_delay_minus_one; + + *bytes_read = aom_rb_bytes_read(reader); + + return 0; +} + +int write_av1config(const Av1Config *config, size_t capacity, + size_t *bytes_written, uint8_t *buffer) { + if (!config || !buffer || capacity < kAv1cSize || !bytes_written) return -1; + + *bytes_written = 0; + memset(buffer, 0, kAv1cSize); + + struct aom_write_bit_buffer writer = { buffer, 0 }; + + aom_wb_write_bit(&writer, config->marker); + aom_wb_write_literal(&writer, config->version, 7); + aom_wb_write_literal(&writer, config->seq_profile, 3); + aom_wb_write_literal(&writer, config->seq_level_idx_0, 5); + aom_wb_write_bit(&writer, config->seq_tier_0); + aom_wb_write_bit(&writer, config->high_bitdepth); + aom_wb_write_bit(&writer, config->twelve_bit); + aom_wb_write_bit(&writer, config->monochrome); + aom_wb_write_bit(&writer, config->chroma_subsampling_x); + aom_wb_write_bit(&writer, config->chroma_subsampling_y); + aom_wb_write_literal(&writer, config->chroma_sample_position, 2); + aom_wb_write_literal(&writer, 0, 3); // reserved + aom_wb_write_bit(&writer, config->initial_presentation_delay_present); + + if (config->initial_presentation_delay_present) { + aom_wb_write_literal(&writer, config->initial_presentation_delay_minus_one, + 4); + } else { + aom_wb_write_literal(&writer, 0, 4); // reserved + } + + *bytes_written = aom_wb_bytes_written(&writer); + return 0; +} + +#undef AV1C_READ_BIT_OR_RETURN_ERROR +#undef AV1C_READ_BITS_OR_RETURN_ERROR +#undef AV1C_PUSH_ERROR_HANDLER_DATA +#undef AV1C_POP_ERROR_HANDLER_DATA diff --git a/libs/libaom/src/common/av1_config.h b/libs/libaom/src/common/av1_config.h new file mode 100644 index 000000000..a15bedb30 --- /dev/null +++ b/libs/libaom/src/common/av1_config.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_AV1_CONFIG_H_
+#define AOM_COMMON_AV1_CONFIG_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Struct representing ISOBMFF/Matroska AV1 config. See:
+// https://aomediacodec.github.io/av1-isobmff/#av1codecconfigurationbox-syntax
+//
+// The AV1 config has the following format:
+//
+//   unsigned int (1) marker = 1;
+//   unsigned int (7) version = 1;
+//   unsigned int (3) seq_profile;
+//   unsigned int (5) seq_level_idx_0;
+//   unsigned int (1) seq_tier_0;
+//   unsigned int (1) high_bitdepth;
+//   unsigned int (1) twelve_bit;
+//   unsigned int (1) monochrome;
+//   unsigned int (1) chroma_subsampling_x;
+//   unsigned int (1) chroma_subsampling_y;
+//   unsigned int (2) chroma_sample_position;
+//   unsigned int (3) reserved = 0;
+//
+//   unsigned int (1) initial_presentation_delay_present;
+//   if (initial_presentation_delay_present) {
+//     unsigned int (4) initial_presentation_delay_minus_one;
+//   } else {
+//     unsigned int (4) reserved = 0;
+//   }
+//
+//   unsigned int (8)[] configOBUs;
+//
+// Note: get_av1config_from_obu() does not currently store 'configOBUs' data, so
+// the field is omitted.
+typedef struct _Av1Config {
+  uint8_t marker;
+  uint8_t version;
+  uint8_t seq_profile;
+  uint8_t seq_level_idx_0;
+  uint8_t seq_tier_0;
+  uint8_t high_bitdepth;
+  uint8_t twelve_bit;
+  uint8_t monochrome;
+  uint8_t chroma_subsampling_x;
+  uint8_t chroma_subsampling_y;
+  uint8_t chroma_sample_position;
+  uint8_t initial_presentation_delay_present;
+  uint8_t initial_presentation_delay_minus_one;
+} Av1Config;
+
+// Attempts to parse a Sequence Header OBU and set the parameters of 'config'.
+// Returns 0 upon success, and -1 upon failure. 'buffer' can contain multiple
+// OBUs, but the Sequence Header OBU must be the first OBU within the buffer.
+int get_av1config_from_obu(const uint8_t *buffer, size_t length, int is_annexb,
+                           Av1Config *config);
+
+// Attempts to parse an AV1 config from 'buffer'. Returns 0 upon success.
+// Returns -1 when 'buffer_length' is less than 4, when passed NULL pointers, or
+// when parsing of 'buffer' fails.
+int read_av1config(const uint8_t *buffer, size_t buffer_length,
+                   size_t *bytes_read, Av1Config *config);
+
+// Writes 'config' to 'buffer'. Returns 0 upon successful write to 'buffer'.
+// Returns -1 when passed NULL pointers or when 'capacity' is insufficient.
+int write_av1config(const Av1Config *config, size_t capacity,
+                    size_t *bytes_written, uint8_t *buffer);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif  // AOM_COMMON_AV1_CONFIG_H_
diff --git a/libs/libaom/src/common/ivfdec.c b/libs/libaom/src/common/ivfdec.c
new file mode 100644
index 000000000..80d73b04c
--- /dev/null
+++ b/libs/libaom/src/common/ivfdec.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/ivfdec.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_ports/mem_ops.h"
+#include "aom_ports/sanitizer.h"
+
+static const char *IVF_SIGNATURE = "DKIF";
+
+static void fix_framerate(int *num, int *den) {
+  if (*den <= 0 || *den >= 1000000000 || *num <= 0 || *num >= 1000) {
+    // framerate seems to be invalid, just default to 30fps.
+    *num = 30;
+    *den = 1;
+  }
+}
+
+int file_is_ivf(struct AvxInputContext *input_ctx) {
+  char raw_hdr[32];
+  int is_ivf = 0;
+
+  if (fread(raw_hdr, 1, 32, input_ctx->file) == 32) {
+    if (memcmp(IVF_SIGNATURE, raw_hdr, 4) == 0) {
+      is_ivf = 1;
+
+      if (mem_get_le16(raw_hdr + 4) != 0) {
+        fprintf(stderr,
+                "Error: Unrecognized IVF version! This file may not"
+                " decode properly.");
+      }
+
+      input_ctx->fourcc = mem_get_le32(raw_hdr + 8);
+      input_ctx->width = mem_get_le16(raw_hdr + 12);
+      input_ctx->height = mem_get_le16(raw_hdr + 14);
+      input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16);
+      input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20);
+      fix_framerate(&input_ctx->framerate.numerator,
+                    &input_ctx->framerate.denominator);
+    }
+  }
+
+  if (!is_ivf) {
+    rewind(input_ctx->file);
+    input_ctx->detect.buf_read = 0;
+  } else {
+    input_ctx->detect.position = 4;
+  }
+  return is_ivf;
+}
+
+int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
+                   size_t *buffer_size, aom_codec_pts_t *pts) {
+  char raw_header[IVF_FRAME_HDR_SZ] = { 0 };
+  size_t frame_size = 0;
+
+  if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
+    if (!feof(infile)) warn("Failed to read frame size");
+  } else {
+    frame_size = mem_get_le32(raw_header);
+
+    if (frame_size > 256 * 1024 * 1024) {
+      warn("Read invalid frame size (%u)", (unsigned int)frame_size);
+      frame_size = 0;
+    }
+
+    if (frame_size > *buffer_size) {
+      uint8_t *new_buffer = (uint8_t *)realloc(*buffer, 2 * frame_size);
+
+      if (new_buffer) {
+        *buffer = new_buffer;
+        *buffer_size = 2 * frame_size;
+      } else {
+        warn("Failed to allocate compressed data buffer");
+        frame_size = 0;
+      }
+    }
+
+    if (pts) {
+      *pts = mem_get_le32(&raw_header[4]);
+      *pts += ((aom_codec_pts_t)mem_get_le32(&raw_header[8]) << 32);
+    }
+  }
+
+  if (!feof(infile)) {
+    ASAN_UNPOISON_MEMORY_REGION(*buffer, *buffer_size);
+    if (fread(*buffer, 1, frame_size, infile) != frame_size) {
+      warn("Failed to read full frame");
+      return 1;
+    }
+
+    ASAN_POISON_MEMORY_REGION(*buffer + frame_size, *buffer_size - frame_size);
+    *bytes_read = frame_size;
+    return 0;
+  }
+
+  return 1;
+}
diff --git a/libs/libaom/src/common/ivfdec.h b/libs/libaom/src/common/ivfdec.h
new file mode 100644
index 000000000..dbc77331f
--- /dev/null
+++ b/libs/libaom/src/common/ivfdec.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ +#ifndef AOM_COMMON_IVFDEC_H_ +#define AOM_COMMON_IVFDEC_H_ + +#include "aom/aom_codec.h" +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int file_is_ivf(struct AvxInputContext *input); +int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size, aom_codec_pts_t *pts); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // AOM_COMMON_IVFDEC_H_ diff --git a/libs/libaom/src/common/ivfenc.c b/libs/libaom/src/common/ivfenc.c new file mode 100644 index 000000000..64715f4d7 --- /dev/null +++ b/libs/libaom/src/common/ivfenc.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "common/ivfenc.h" + +#include "aom/aom_encoder.h" +#include "aom_ports/mem_ops.h" + +void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg, + unsigned int fourcc, int frame_cnt) { + char header[32]; + + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header + 4, 0); // version + mem_put_le16(header + 6, 32); // header size + mem_put_le32(header + 8, fourcc); // fourcc + mem_put_le16(header + 12, cfg->g_w); // width + mem_put_le16(header + 14, cfg->g_h); // height + mem_put_le32(header + 16, cfg->g_timebase.den); // rate + mem_put_le32(header + 20, cfg->g_timebase.num); // scale + mem_put_le32(header + 24, frame_cnt); // length + mem_put_le32(header + 28, 0); // unused + + fwrite(header, 1, 32, outfile); +} + +void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size) { + char header[12]; + + mem_put_le32(header, (int)frame_size); + mem_put_le32(header + 4, (int)(pts & 0xFFFFFFFF)); + mem_put_le32(header + 8, (int)(pts >> 32)); + fwrite(header, 1, 12, outfile); +} + +void ivf_write_frame_size(FILE *outfile, size_t frame_size) { + char header[4]; + + mem_put_le32(header, (int)frame_size); + fwrite(header, 1, 4, outfile); +} diff --git a/libs/libaom/src/common/ivfenc.h b/libs/libaom/src/common/ivfenc.h new file mode 100644 index 000000000..8f6d947d4 --- /dev/null +++ b/libs/libaom/src/common/ivfenc.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#ifndef AOM_COMMON_IVFENC_H_
+#define AOM_COMMON_IVFENC_H_
+
+#include "common/tools_common.h"
+
+struct aom_codec_enc_cfg;
+struct aom_codec_cx_pkt;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ivf_write_file_header(FILE *outfile, const struct aom_codec_enc_cfg *cfg,
+                           uint32_t fourcc, int frame_cnt);
+
+void ivf_write_frame_header(FILE *outfile, int64_t pts, size_t frame_size);
+
+void ivf_write_frame_size(FILE *outfile, size_t frame_size);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif  // AOM_COMMON_IVFENC_H_
diff --git a/libs/libaom/src/common/md5_utils.c b/libs/libaom/src/common/md5_utils.c
new file mode 100644
index 000000000..b69e1cc72
--- /dev/null
+++ b/libs/libaom/src/common/md5_utils.c
@@ -0,0 +1,249 @@
+/*
+ * This code implements the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h' header
+ * definitions
+ * - Ian Jackson .
+ * Still in the public domain.
+ */
+
+#include <string.h> /* for memcpy() */
+
+#include "common/md5_utils.h"
+
+static void byteSwap(UWORD32 *buf, unsigned words) {
+  md5byte *p;
+
+  /* Only swap bytes for big endian machines */
+  int i = 1;
+
+  if (*(char *)&i == 1) return;
+
+  p = (md5byte *)buf;
+
+  do {
+    *buf++ = (UWORD32)((unsigned)p[3] << 8 | p[2]) << 16 |
+             ((unsigned)p[1] << 8 | p[0]);
+    p += 4;
+  } while (--words);
+}
+
+/*
+ * Start MD5 accumulation. Set bit count to 0 and buffer to mysterious
+ * initialization constants.
+ */
+void MD5Init(struct MD5Context *ctx) {
+  ctx->buf[0] = 0x67452301;
+  ctx->buf[1] = 0xefcdab89;
+  ctx->buf[2] = 0x98badcfe;
+  ctx->buf[3] = 0x10325476;
+
+  ctx->bytes[0] = 0;
+  ctx->bytes[1] = 0;
+}
+
+/*
+ * Update context to reflect the concatenation of another buffer full
+ * of bytes.
+ */
+void MD5Update(struct MD5Context *ctx, md5byte const *buf, unsigned len) {
+  UWORD32 t;
+
+  /* Update byte count */
+
+  t = ctx->bytes[0];
+
+  if ((ctx->bytes[0] = t + len) < t)
+    ctx->bytes[1]++; /* Carry from low to high */
+
+  t = 64 - (t & 0x3f); /* Space available in ctx->in (at least 1) */
+
+  if (t > len) {
+    memcpy((md5byte *)ctx->in + 64 - t, buf, len);
+    return;
+  }
+
+  /* First chunk is an odd size */
+  memcpy((md5byte *)ctx->in + 64 - t, buf, t);
+  byteSwap(ctx->in, 16);
+  MD5Transform(ctx->buf, ctx->in);
+  buf += t;
+  len -= t;
+
+  /* Process data in 64-byte chunks */
+  while (len >= 64) {
+    memcpy(ctx->in, buf, 64);
+    byteSwap(ctx->in, 16);
+    MD5Transform(ctx->buf, ctx->in);
+    buf += 64;
+    len -= 64;
+  }
+
+  /* Handle any remaining bytes of data.
*/ + memcpy(ctx->in, buf, len); +} + +/* + * Final wrapup - pad to 64-byte boundary with the bit pattern + * 1 0* (64-bit count of bits processed, MSB-first) + */ +void MD5Final(md5byte digest[16], struct MD5Context *ctx) { + int count = ctx->bytes[0] & 0x3f; /* Number of bytes in ctx->in */ + md5byte *p = (md5byte *)ctx->in + count; + + /* Set the first char of padding to 0x80. There is always room. */ + *p++ = 0x80; + + /* Bytes of padding needed to make 56 bytes (-8..55) */ + count = 56 - 1 - count; + + if (count < 0) { /* Padding forces an extra block */ + memset(p, 0, count + 8); + byteSwap(ctx->in, 16); + MD5Transform(ctx->buf, ctx->in); + p = (md5byte *)ctx->in; + count = 56; + } + + memset(p, 0, count); + byteSwap(ctx->in, 14); + + /* Append length in bits and transform */ + ctx->in[14] = ctx->bytes[0] << 3; + ctx->in[15] = ctx->bytes[1] << 3 | ctx->bytes[0] >> 29; + MD5Transform(ctx->buf, ctx->in); + + byteSwap(ctx->buf, 4); + memcpy(digest, ctx->buf, 16); + memset(ctx, 0, sizeof(*ctx)); /* In case it's sensitive */ +} + +#ifndef ASM_MD5 + +/* The four core functions - F1 is optimized somewhat */ + +/* #define F1(x, y, z) (x & y | ~x & z) */ +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +/* This is the central step in the MD5 algorithm. */ +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) + +#if defined(__clang__) && defined(__has_attribute) +#if __has_attribute(no_sanitize) +#define AOM_NO_UNSIGNED_OVERFLOW_CHECK \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +#endif +#endif + +#ifndef AOM_NO_UNSIGNED_OVERFLOW_CHECK +#define AOM_NO_UNSIGNED_OVERFLOW_CHECK +#endif + +/* + * The core of the MD5 algorithm, this alters an existing MD5 hash to + * reflect the addition of 16 longwords of new data. MD5Update blocks + * the data and converts bytes into longwords for this routine. 
+ */ +AOM_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], + UWORD32 const in[16]) { + register UWORD32 a, b, c, d; + + a = buf[0]; + b = buf[1]; + c = buf[2]; + d = buf[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +#undef 
AOM_NO_UNSIGNED_OVERFLOW_CHECK
+
+#endif
diff --git a/libs/libaom/src/common/md5_utils.h b/libs/libaom/src/common/md5_utils.h
new file mode 100644
index 000000000..144fa3ad2
--- /dev/null
+++ b/libs/libaom/src/common/md5_utils.h
@@ -0,0 +1,49 @@
+/*
+ * This is the header file for the MD5 message-digest algorithm.
+ * The algorithm is due to Ron Rivest. This code was
+ * written by Colin Plumb in 1993, no copyright is claimed.
+ * This code is in the public domain; do with it what you wish.
+ *
+ * Equivalent code is available from RSA Data Security, Inc.
+ * This code has been tested against that, and is equivalent,
+ * except that you don't need to include two pages of legalese
+ * with every copy.
+ *
+ * To compute the message digest of a chunk of bytes, declare an
+ * MD5Context structure, pass it to MD5Init, call MD5Update as
+ * needed on buffers full of bytes, and then call MD5Final, which
+ * will fill a supplied 16-byte array with the digest.
+ *
+ * Changed so as no longer to depend on Colin Plumb's `usual.h'
+ * header definitions
+ * - Ian Jackson .
+ * Still in the public domain.
+ */
+
+#ifndef AOM_COMMON_MD5_UTILS_H_
+#define AOM_COMMON_MD5_UTILS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define md5byte unsigned char
+#define UWORD32 unsigned int
+
+typedef struct MD5Context MD5Context;
+struct MD5Context {
+  UWORD32 buf[4];
+  UWORD32 bytes[2];
+  UWORD32 in[16];
+};
+
+void MD5Init(struct MD5Context *context);
+void MD5Update(struct MD5Context *context, md5byte const *buf, unsigned len);
+void MD5Final(unsigned char digest[16], struct MD5Context *context);
+void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_COMMON_MD5_UTILS_H_
diff --git a/libs/libaom/src/common/obudec.c b/libs/libaom/src/common/obudec.c
new file mode 100644
index 000000000..650f9973b
--- /dev/null
+++ b/libs/libaom/src/common/obudec.c
@@ -0,0 +1,486 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/obudec.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem_ops.h"
+#include "av1/common/common.h"
+#include "av1/common/obu_util.h"
+
+#define OBU_BUFFER_SIZE (500 * 1024)
+
+#define OBU_HEADER_SIZE 1
+#define OBU_EXTENSION_SIZE 1
+#define OBU_MAX_LENGTH_FIELD_SIZE 8
+
+#define OBU_MAX_HEADER_SIZE \
+  (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + 2 * OBU_MAX_LENGTH_FIELD_SIZE)
+
+#define OBU_DETECTION_SIZE \
+  (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE + 4 * OBU_MAX_LENGTH_FIELD_SIZE)
+
+// Reads unsigned LEB128 integer and returns 0 upon successful read and decode.
+// Stores raw bytes in 'value_buffer', length of the number in 'value_length',
+// and decoded value in 'value'.
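+//
+// LEB128 is little-endian, 7 value bits per byte, with the top bit of each
+// byte set when another byte follows. As a worked example, the byte sequence
+// 0xE5 0x8E 0x26 decodes to 0x65 | (0x0E << 7) | (0x26 << 14) = 624485.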
+static int obudec_read_leb128(FILE *f, uint8_t *value_buffer, + size_t *value_length, uint64_t *value) { + if (!f || !value_buffer || !value_length || !value) return -1; + size_t len; + for (len = 0; len < OBU_MAX_LENGTH_FIELD_SIZE; ++len) { + const size_t num_read = fread(&value_buffer[len], 1, 1, f); + if (num_read == 0) { + if (len == 0 && feof(f)) { + *value_length = 0; + return 0; + } + // Ran out of data before completing read of value. + return -1; + } + if ((value_buffer[len] >> 7) == 0) { + ++len; + *value_length = len; + break; + } + } + + return aom_uleb_decode(value_buffer, len, value, NULL); +} + +// Reads OBU header from 'f'. The 'buffer_capacity' passed in must be large +// enough to store an OBU header with extension (2 bytes). Raw OBU data is +// written to 'obu_data', parsed OBU header values are written to 'obu_header', +// and total bytes read from file are written to 'bytes_read'. Returns 0 for +// success, and non-zero on failure. When end of file is reached, the return +// value is 0 and the 'bytes_read' value is set to 0. +static int obudec_read_obu_header(FILE *f, size_t buffer_capacity, + int is_annexb, uint8_t *obu_data, + ObuHeader *obu_header, size_t *bytes_read) { + if (!f || buffer_capacity < (OBU_HEADER_SIZE + OBU_EXTENSION_SIZE) || + !obu_data || !obu_header || !bytes_read) { + return -1; + } + *bytes_read = fread(obu_data, 1, 1, f); + + if (feof(f) && *bytes_read == 0) { + return 0; + } else if (*bytes_read != 1) { + fprintf(stderr, "obudec: Failure reading OBU header.\n"); + return -1; + } + + const int has_extension = (obu_data[0] >> 2) & 0x1; + if (has_extension) { + if (fread(&obu_data[1], 1, 1, f) != 1) { + fprintf(stderr, "obudec: Failure reading OBU extension."); + return -1; + } + ++*bytes_read; + } + + size_t obu_bytes_parsed = 0; + const aom_codec_err_t parse_result = aom_read_obu_header( + obu_data, *bytes_read, &obu_bytes_parsed, obu_header, is_annexb); + if (parse_result != AOM_CODEC_OK || *bytes_read != obu_bytes_parsed) { + fprintf(stderr, "obudec: Error parsing OBU header.\n"); + return -1; + } + + return 0; +} + +// Reads OBU payload from 'f' and returns 0 for success when all payload bytes +// are read from the file. Payload data is written to 'obu_data', and actual +// bytes read added to 'bytes_read'. 
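+// (Note: unlike the header reader above, this helper adds to 'bytes_read'
+// rather than overwriting it, so a caller can keep one running total across
+// the header and payload reads.)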
+static int obudec_read_obu_payload(FILE *f, size_t payload_length, + uint8_t *obu_data, size_t *bytes_read) { + if (!f || payload_length == 0 || !obu_data || !bytes_read) return -1; + + if (fread(obu_data, 1, payload_length, f) != payload_length) { + fprintf(stderr, "obudec: Failure reading OBU payload.\n"); + return -1; + } + + *bytes_read += payload_length; + return 0; +} + +static int obudec_read_obu_header_and_size(FILE *f, size_t buffer_capacity, + int is_annexb, uint8_t *buffer, + size_t *bytes_read, + size_t *payload_length, + ObuHeader *obu_header) { + const size_t kMinimumBufferSize = OBU_MAX_HEADER_SIZE; + if (!f || !buffer || !bytes_read || !payload_length || !obu_header || + buffer_capacity < kMinimumBufferSize) { + return -1; + } + + size_t leb128_length_obu = 0; + size_t leb128_length_payload = 0; + uint64_t obu_size = 0; + if (is_annexb) { + if (obudec_read_leb128(f, &buffer[0], &leb128_length_obu, &obu_size) != 0) { + fprintf(stderr, "obudec: Failure reading OBU size length.\n"); + return -1; + } else if (leb128_length_obu == 0) { + *payload_length = 0; + return 0; + } + if (obu_size > UINT32_MAX) { + fprintf(stderr, "obudec: OBU payload length too large.\n"); + return -1; + } + } + + size_t header_size = 0; + if (obudec_read_obu_header(f, buffer_capacity - leb128_length_obu, is_annexb, + buffer + leb128_length_obu, obu_header, + &header_size) != 0) { + return -1; + } else if (header_size == 0) { + *payload_length = 0; + return 0; + } + + if (!obu_header->has_size_field) { + assert(is_annexb); + if (obu_size < header_size) { + fprintf(stderr, "obudec: OBU size is too small.\n"); + return -1; + } + *payload_length = (size_t)obu_size - header_size; + } else { + uint64_t u64_payload_length = 0; + if (obudec_read_leb128(f, &buffer[leb128_length_obu + header_size], + &leb128_length_payload, &u64_payload_length) != 0) { + fprintf(stderr, "obudec: Failure reading OBU payload length.\n"); + return -1; + } + if (u64_payload_length > UINT32_MAX) { + fprintf(stderr, "obudec: OBU payload length too large.\n"); + return -1; + } + + *payload_length = (size_t)u64_payload_length; + } + + *bytes_read = leb128_length_obu + header_size + leb128_length_payload; + return 0; +} + +static int obudec_grow_buffer(size_t growth_amount, uint8_t **obu_buffer, + size_t *obu_buffer_capacity) { + if (!*obu_buffer || !obu_buffer_capacity || growth_amount == 0) { + return -1; + } + + const size_t capacity = *obu_buffer_capacity; + if (SIZE_MAX - growth_amount < capacity) { + fprintf(stderr, "obudec: cannot grow buffer, capacity will roll over.\n"); + return -1; + } + + const size_t new_capacity = capacity + growth_amount; + +#if defined AOM_MAX_ALLOCABLE_MEMORY + if (new_capacity > AOM_MAX_ALLOCABLE_MEMORY) { + fprintf(stderr, "obudec: OBU size exceeds max alloc size.\n"); + return -1; + } +#endif + + uint8_t *new_buffer = (uint8_t *)realloc(*obu_buffer, new_capacity); + if (!new_buffer) { + fprintf(stderr, "obudec: Failed to allocate compressed data buffer.\n"); + return -1; + } + + *obu_buffer = new_buffer; + *obu_buffer_capacity = new_capacity; + return 0; +} + +static int obudec_read_one_obu(FILE *f, uint8_t **obu_buffer, + size_t obu_bytes_buffered, + size_t *obu_buffer_capacity, size_t *obu_length, + ObuHeader *obu_header, int is_annexb) { + if (!f || !(*obu_buffer) || !obu_buffer_capacity || !obu_length || + !obu_header) { + return -1; + } + + size_t bytes_read = 0; + size_t obu_payload_length = 0; + size_t available_buffer_capacity = *obu_buffer_capacity - obu_bytes_buffered; + + if 
(available_buffer_capacity < OBU_MAX_HEADER_SIZE) { + if (obudec_grow_buffer(AOMMAX(*obu_buffer_capacity, OBU_MAX_HEADER_SIZE), + obu_buffer, obu_buffer_capacity) != 0) { + *obu_length = bytes_read; + return -1; + } + available_buffer_capacity += + AOMMAX(*obu_buffer_capacity, OBU_MAX_HEADER_SIZE); + } + + const int status = obudec_read_obu_header_and_size( + f, available_buffer_capacity, is_annexb, *obu_buffer + obu_bytes_buffered, + &bytes_read, &obu_payload_length, obu_header); + if (status < 0) return status; + + if (obu_payload_length > SIZE_MAX - bytes_read) return -1; + + if (obu_payload_length > 256 * 1024 * 1024) { + fprintf(stderr, "obudec: Read invalid OBU size (%u)\n", + (unsigned int)obu_payload_length); + *obu_length = bytes_read + obu_payload_length; + return -1; + } + + if (bytes_read + obu_payload_length > available_buffer_capacity && + obudec_grow_buffer(AOMMAX(*obu_buffer_capacity, obu_payload_length), + obu_buffer, obu_buffer_capacity) != 0) { + *obu_length = bytes_read + obu_payload_length; + return -1; + } + + if (obu_payload_length > 0 && + obudec_read_obu_payload(f, obu_payload_length, + *obu_buffer + obu_bytes_buffered + bytes_read, + &bytes_read) != 0) { + return -1; + } + + *obu_length = bytes_read; + return 0; +} + +int file_is_obu(struct ObuDecInputContext *obu_ctx) { + if (!obu_ctx || !obu_ctx->avx_ctx) return 0; + + struct AvxInputContext *avx_ctx = obu_ctx->avx_ctx; + uint8_t detect_buf[OBU_DETECTION_SIZE] = { 0 }; + const int is_annexb = obu_ctx->is_annexb; + FILE *f = avx_ctx->file; + size_t payload_length = 0; + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + size_t length_of_unit_size = 0; + size_t annexb_header_length = 0; + uint64_t unit_size = 0; + + if (is_annexb) { + // read the size of first temporal unit + if (obudec_read_leb128(f, &detect_buf[0], &length_of_unit_size, + &unit_size) != 0) { + fprintf(stderr, "obudec: Failure reading temporal unit header\n"); + return 0; + } + + // read the size of first frame unit + if (obudec_read_leb128(f, &detect_buf[length_of_unit_size], + &annexb_header_length, &unit_size) != 0) { + fprintf(stderr, "obudec: Failure reading frame unit header\n"); + return 0; + } + annexb_header_length += length_of_unit_size; + } + + size_t bytes_read = 0; + if (obudec_read_obu_header_and_size( + f, OBU_DETECTION_SIZE - annexb_header_length, is_annexb, + &detect_buf[annexb_header_length], &bytes_read, &payload_length, + &obu_header) != 0) { + fprintf(stderr, "obudec: Failure reading first OBU.\n"); + rewind(f); + return 0; + } + + if (is_annexb) { + bytes_read += annexb_header_length; + } + + if (obu_header.type != OBU_TEMPORAL_DELIMITER && + obu_header.type != OBU_SEQUENCE_HEADER) { + return 0; + } + + if (obu_header.has_size_field) { + if (obu_header.type == OBU_TEMPORAL_DELIMITER && payload_length != 0) { + fprintf( + stderr, + "obudec: Invalid OBU_TEMPORAL_DELIMITER payload length (non-zero)."); + rewind(f); + return 0; + } + } else if (!is_annexb) { + fprintf(stderr, "obudec: OBU size fields required, cannot decode input.\n"); + rewind(f); + return 0; + } + + // Appears that input is valid Section 5 AV1 stream. + obu_ctx->buffer = (uint8_t *)malloc(OBU_BUFFER_SIZE); + if (!obu_ctx->buffer) { + fprintf(stderr, "Out of memory.\n"); + rewind(f); + return 0; + } + obu_ctx->buffer_capacity = OBU_BUFFER_SIZE; + + memcpy(obu_ctx->buffer, &detect_buf[0], bytes_read); + obu_ctx->bytes_buffered = bytes_read; + // If the first OBU is a SEQUENCE_HEADER, then it will have a payload. 
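+  // (A temporal delimiter, by contrast, was rejected above if it declared a
+  // non-empty payload.)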
+ // We need to read this in so that our buffer only contains complete OBUs. + if (payload_length > 0) { + if (payload_length > (obu_ctx->buffer_capacity - bytes_read)) { + fprintf(stderr, "obudec: First OBU's payload is too large\n"); + rewind(f); + return 0; + } + + size_t payload_bytes = 0; + const int status = obudec_read_obu_payload( + f, payload_length, &obu_ctx->buffer[bytes_read], &payload_bytes); + if (status < 0) { + rewind(f); + return 0; + } + obu_ctx->bytes_buffered += payload_bytes; + } + return 1; +} + +int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx, + uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size) { + FILE *f = obu_ctx->avx_ctx->file; + if (!f) return -1; + + *buffer_size = 0; + *bytes_read = 0; + + if (feof(f)) { + return 1; + } + + size_t tu_size; + size_t obu_size = 0; + size_t length_of_temporal_unit_size = 0; + uint8_t tuheader[OBU_MAX_LENGTH_FIELD_SIZE] = { 0 }; + + if (obu_ctx->is_annexb) { + uint64_t size = 0; + + if (obu_ctx->bytes_buffered == 0) { + if (obudec_read_leb128(f, &tuheader[0], &length_of_temporal_unit_size, + &size) != 0) { + fprintf(stderr, "obudec: Failure reading temporal unit header\n"); + return -1; + } + if (size == 0 && feof(f)) { + return 1; + } + } else { + // temporal unit size was already stored in buffer + if (aom_uleb_decode(obu_ctx->buffer, obu_ctx->bytes_buffered, &size, + &length_of_temporal_unit_size) != 0) { + fprintf(stderr, "obudec: Failure reading temporal unit header\n"); + return -1; + } + } + + if (size > UINT32_MAX || size + length_of_temporal_unit_size > UINT32_MAX) { + fprintf(stderr, "obudec: TU too large.\n"); + return -1; + } + + size += length_of_temporal_unit_size; + tu_size = (size_t)size; + } else { + while (1) { + ObuHeader obu_header; + memset(&obu_header, 0, sizeof(obu_header)); + + if (obudec_read_one_obu(f, &obu_ctx->buffer, obu_ctx->bytes_buffered, + &obu_ctx->buffer_capacity, &obu_size, &obu_header, + 0) != 0) { + fprintf(stderr, "obudec: read_one_obu failed in TU loop\n"); + return -1; + } + + if (obu_header.type == OBU_TEMPORAL_DELIMITER || obu_size == 0) { + tu_size = obu_ctx->bytes_buffered; + break; + } else { + obu_ctx->bytes_buffered += obu_size; + } + } + } + +#if defined AOM_MAX_ALLOCABLE_MEMORY + if (tu_size > AOM_MAX_ALLOCABLE_MEMORY) { + fprintf(stderr, "obudec: Temporal Unit size exceeds max alloc size.\n"); + return -1; + } +#endif + if (tu_size > 0) { + uint8_t *new_buffer = (uint8_t *)realloc(*buffer, tu_size); + if (!new_buffer) { + free(*buffer); + fprintf(stderr, "obudec: Out of memory.\n"); + return -1; + } + *buffer = new_buffer; + } + *bytes_read = tu_size; + *buffer_size = tu_size; + + if (!obu_ctx->is_annexb) { + memcpy(*buffer, obu_ctx->buffer, tu_size); + + // At this point, (obu_ctx->buffer + obu_ctx->bytes_buffered + obu_size) + // points to the end of the buffer. 
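+  // The trailing obu_size bytes are the first OBU of the next temporal unit
+  // (its temporal delimiter), already consumed from the file; move them to
+  // the front of the scratch buffer so the next call starts with them.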
+ memmove(obu_ctx->buffer, obu_ctx->buffer + obu_ctx->bytes_buffered, + obu_size); + obu_ctx->bytes_buffered = obu_size; + } else { + if (!feof(f)) { + size_t data_size; + size_t offset; + if (!obu_ctx->bytes_buffered) { + data_size = tu_size - length_of_temporal_unit_size; + memcpy(*buffer, &tuheader[0], length_of_temporal_unit_size); + offset = length_of_temporal_unit_size; + } else { + const size_t copy_size = AOMMIN(obu_ctx->bytes_buffered, tu_size); + memcpy(*buffer, obu_ctx->buffer, copy_size); + offset = copy_size; + data_size = tu_size - copy_size; + obu_ctx->bytes_buffered -= copy_size; + } + + if (fread(*buffer + offset, 1, data_size, f) != data_size) { + fprintf(stderr, "obudec: Failed to read full temporal unit\n"); + return -1; + } + } + } + return 0; +} + +void obudec_free(struct ObuDecInputContext *obu_ctx) { free(obu_ctx->buffer); } diff --git a/libs/libaom/src/common/obudec.h b/libs/libaom/src/common/obudec.h new file mode 100644 index 000000000..b2adb1e3d --- /dev/null +++ b/libs/libaom/src/common/obudec.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_OBUDEC_H_ +#define AOM_COMMON_OBUDEC_H_ + +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct ObuDecInputContext { + struct AvxInputContext *avx_ctx; + uint8_t *buffer; + size_t buffer_capacity; + size_t bytes_buffered; + int is_annexb; +}; + +// Returns 1 when file data starts (if Annex B stream, after reading the +// size of the OBU) with what appears to be a Temporal Delimiter +// OBU as defined by Section 5 of the AV1 bitstream specification. +int file_is_obu(struct ObuDecInputContext *obu_ctx); + +// Reads one Temporal Unit from the input file. Returns 0 when a TU is +// successfully read, 1 when end of file is reached, and less than 0 when an +// error occurs. Stores TU data in 'buffer'. Reallocs buffer to match TU size, +// returns buffer capacity via 'buffer_size', and returns size of buffered data +// via 'bytes_read'. +int obudec_read_temporal_unit(struct ObuDecInputContext *obu_ctx, + uint8_t **buffer, size_t *bytes_read, + size_t *buffer_size); + +void obudec_free(struct ObuDecInputContext *obu_ctx); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // AOM_COMMON_OBUDEC_H_ diff --git a/libs/libaom/src/common/rawenc.c b/libs/libaom/src/common/rawenc.c new file mode 100644 index 000000000..b72132c2e --- /dev/null +++ b/libs/libaom/src/common/rawenc.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdbool.h> +#include "common/rawenc.h" + +#define BATCH_SIZE 8 +// When writing greyscale color, batch 8 writes for low bit-depth, 4 writes +// for high bit-depth. +static const uint8_t batched[BATCH_SIZE] = { 128, 128, 128, 128, + 128, 128, 128, 128 }; +static const uint8_t batched_hbd[BATCH_SIZE] = { + 0, 128, 0, 128, 0, 128, 0, 128 +}; + +// Interface to writing to either a file or MD5Context. Takes a pointer to +// either the file or MD5Context, the buffer, the size of each element, and +// number of elements to write. Note that size and nmemb (last two args) must +// be unsigned int, as the interface to MD5Update requires that. +typedef void (*WRITER)(void *, const uint8_t *, unsigned int, unsigned int); + +static void write_file(void *fp, const uint8_t *buffer, unsigned int size, + unsigned int nmemb) { + fwrite(buffer, size, nmemb, (FILE *)fp); +} + +static void write_md5(void *md5, const uint8_t *buffer, unsigned int size, + unsigned int nmemb) { + MD5Update((MD5Context *)md5, buffer, size * nmemb); +} + +// Writes out n greyscale values. +static void write_greyscale(const bool high_bitdepth, int n, WRITER writer_func, + void *file_or_md5) { + const uint8_t *b = batched; + if (high_bitdepth) { + b = batched_hbd; + } + const int num_batched_writes = + high_bitdepth ? n / (BATCH_SIZE / 2) : n / BATCH_SIZE; + for (int i = 0; i < num_batched_writes; ++i) { + writer_func(file_or_md5, b, sizeof(uint8_t), BATCH_SIZE); + } + const int remaining = high_bitdepth ? n % (BATCH_SIZE / 2) : n % BATCH_SIZE; + for (int i = 0; i < remaining; ++i) { + if (high_bitdepth) { + writer_func(file_or_md5, batched_hbd, sizeof(uint8_t), 2); + } else { + writer_func(file_or_md5, batched, sizeof(uint8_t), 1); + } + } +} + +// Encapsulates the logic for writing raw data to either an image file or +// to an MD5 context. +static void raw_write_image_file_or_md5(const aom_image_t *img, + const int *planes, const int num_planes, + void *file_or_md5, WRITER writer_func) { + const bool high_bitdepth = img->fmt & AOM_IMG_FMT_HIGHBITDEPTH; + const int bytes_per_sample = high_bitdepth ? 2 : 1; + for (int i = 0; i < num_planes; ++i) { + const int plane = planes[i]; + const int w = aom_img_plane_width(img, plane); + const int h = aom_img_plane_height(img, plane); + // If we're on a color plane and the output is monochrome, write a greyscale + // value. Since there are only YUV planes, compare against Y. + if (img->monochrome && plane != AOM_PLANE_Y) { + write_greyscale(high_bitdepth, w * h, writer_func, file_or_md5); + continue; + } + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + for (int y = 0; y < h; ++y) { + writer_func(file_or_md5, buf, bytes_per_sample, w); + buf += stride; + } + } +} + +void raw_write_image_file(const aom_image_t *img, const int *planes, + const int num_planes, FILE *file) { + raw_write_image_file_or_md5(img, planes, num_planes, file, write_file); +} + +void raw_update_image_md5(const aom_image_t *img, const int *planes, + const int num_planes, MD5Context *md5) { + raw_write_image_file_or_md5(img, planes, num_planes, md5, write_md5); +} diff --git a/libs/libaom/src/common/rawenc.h b/libs/libaom/src/common/rawenc.h new file mode 100644 index 000000000..cf5e00e6f --- /dev/null +++ b/libs/libaom/src/common/rawenc.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_RAWENC_H_ +#define AOM_COMMON_RAWENC_H_ + +#include "aom/aom_decoder.h" +#include "common/md5_utils.h" +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void raw_write_image_file(const aom_image_t *img, const int *planes, + const int num_planes, FILE *file); +void raw_update_image_md5(const aom_image_t *img, const int *planes, + const int num_planes, MD5Context *md5); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_RAWENC_H_ diff --git a/libs/libaom/src/common/tools_common.c b/libs/libaom/src/common/tools_common.c new file mode 100644 index 000000000..51c1c52a1 --- /dev/null +++ b/libs/libaom/src/common/tools_common.c @@ -0,0 +1,508 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "common/tools_common.h" + +#include <math.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#if CONFIG_AV1_ENCODER +#include "aom/aomcx.h" +#endif + +#if CONFIG_AV1_DECODER +#include "aom/aomdx.h" +#endif + +#if defined(_WIN32) || defined(__OS2__) +#include <io.h> +#include <fcntl.h> + +#ifdef __OS2__ +#define _setmode setmode +#define _fileno fileno +#define _O_BINARY O_BINARY +#endif +#endif + +#define LOG_ERROR(label) \ + do { \ + const char *l = label; \ + va_list ap; \ + va_start(ap, fmt); \ + if (l) fprintf(stderr, "%s: ", l); \ + vfprintf(stderr, fmt, ap); \ + fprintf(stderr, "\n"); \ + va_end(ap); \ + } while (0) + +FILE *set_binary_mode(FILE *stream) { + (void)stream; +#if defined(_WIN32) || defined(__OS2__) + _setmode(_fileno(stream), _O_BINARY); +#endif + return stream; +} + +void die(const char *fmt, ...) { + LOG_ERROR(NULL); + usage_exit(); +} + +void fatal(const char *fmt, ...) { + LOG_ERROR("Fatal"); + exit(EXIT_FAILURE); +} + +void warn(const char *fmt, ...) { LOG_ERROR("Warning"); } + +void die_codec(aom_codec_ctx_t *ctx, const char *s) { + const char *detail = aom_codec_error_detail(ctx); + + printf("%s: %s\n", s, aom_codec_error(ctx)); + if (detail) printf(" %s\n", detail); + exit(EXIT_FAILURE); +} + +int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame) { + FILE *f = input_ctx->file; + struct FileTypeDetectionBuffer *detect = &input_ctx->detect; + int plane = 0; + int shortread = 0; + const int bytespp = (yuv_frame->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; + + for (plane = 0; plane < 3; ++plane) { + uint8_t *ptr; + const int w = aom_img_plane_width(yuv_frame, plane); + const int h = aom_img_plane_height(yuv_frame, plane); + int r; + + /* Determine the correct plane based on the image format. The for-loop + * always counts in Y,U,V order, but this may not match the order of + * the data on disk. + */ + switch (plane) { + case 1: + ptr = + yuv_frame->planes[yuv_frame->fmt == AOM_IMG_FMT_YV12 ?
AOM_PLANE_V + : AOM_PLANE_U]; + break; + case 2: + ptr = + yuv_frame->planes[yuv_frame->fmt == AOM_IMG_FMT_YV12 ? AOM_PLANE_U + : AOM_PLANE_V]; + break; + default: ptr = yuv_frame->planes[plane]; + } + + for (r = 0; r < h; ++r) { + size_t needed = w * bytespp; + size_t buf_position = 0; + const size_t left = detect->buf_read - detect->position; + if (left > 0) { + const size_t more = (left < needed) ? left : needed; + memcpy(ptr, detect->buf + detect->position, more); + buf_position = more; + needed -= more; + detect->position += more; + } + if (needed > 0) { + shortread |= (fread(ptr + buf_position, 1, needed, f) < needed); + } + + ptr += yuv_frame->stride[plane]; + } + } + + return shortread; +} + +#if CONFIG_AV1_ENCODER +static const AvxInterface aom_encoders[] = { + { "av1", AV1_FOURCC, &aom_codec_av1_cx }, +}; + +int get_aom_encoder_count(void) { + return sizeof(aom_encoders) / sizeof(aom_encoders[0]); +} + +const AvxInterface *get_aom_encoder_by_index(int i) { return &aom_encoders[i]; } + +const AvxInterface *get_aom_encoder_by_name(const char *name) { + int i; + + for (i = 0; i < get_aom_encoder_count(); ++i) { + const AvxInterface *encoder = get_aom_encoder_by_index(i); + if (strcmp(encoder->name, name) == 0) return encoder; + } + + return NULL; +} + +// large scale tile encoding +static const AvxInterface aom_lst_encoder = { "av1", LST_FOURCC, + &aom_codec_av1_cx }; +const AvxInterface *get_aom_lst_encoder(void) { return &aom_lst_encoder; } +#endif // CONFIG_AV1_ENCODER + +#if CONFIG_AV1_DECODER +static const AvxInterface aom_decoders[] = { + { "av1", AV1_FOURCC, &aom_codec_av1_dx }, +}; + +int get_aom_decoder_count(void) { + return sizeof(aom_decoders) / sizeof(aom_decoders[0]); +} + +const AvxInterface *get_aom_decoder_by_index(int i) { return &aom_decoders[i]; } + +const AvxInterface *get_aom_decoder_by_name(const char *name) { + int i; + + for (i = 0; i < get_aom_decoder_count(); ++i) { + const AvxInterface *const decoder = get_aom_decoder_by_index(i); + if (strcmp(decoder->name, name) == 0) return decoder; + } + + return NULL; +} + +const AvxInterface *get_aom_decoder_by_fourcc(uint32_t fourcc) { + int i; + + for (i = 0; i < get_aom_decoder_count(); ++i) { + const AvxInterface *const decoder = get_aom_decoder_by_index(i); + if (decoder->fourcc == fourcc) return decoder; + } + + return NULL; +} +#endif // CONFIG_AV1_DECODER + +void aom_img_write(const aom_image_t *img, FILE *file) { + int plane; + + for (plane = 0; plane < 3; ++plane) { + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = aom_img_plane_width(img, plane) * + ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + const int h = aom_img_plane_height(img, plane); + int y; + + for (y = 0; y < h; ++y) { + fwrite(buf, 1, w, file); + buf += stride; + } + } +} + +int aom_img_read(aom_image_t *img, FILE *file) { + int plane; + + for (plane = 0; plane < 3; ++plane) { + unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = aom_img_plane_width(img, plane) * + ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1); + const int h = aom_img_plane_height(img, plane); + int y; + + for (y = 0; y < h; ++y) { + if (fread(buf, 1, w, file) != (size_t)w) return 0; + buf += stride; + } + } + + return 1; +} + +// TODO(dkovalev) change sse_to_psnr signature: double -> int64_t +double sse_to_psnr(double samples, double peak, double sse) { + static const double kMaxPSNR = 100.0; + + if (sse > 0.0) { + const double psnr = 10.0 * log10(samples * peak * peak / sse); + return psnr > kMaxPSNR ? kMaxPSNR : psnr; + } else { + return kMaxPSNR; + } +} + +// TODO(debargha): Consolidate the functions below into a separate file. +static void highbd_img_upshift(aom_image_t *dst, const aom_image_t *src, + int input_shift) { + // Note the offset is 1 less than half. + const int offset = input_shift > 0 ? (1 << (input_shift - 1)) - 1 : 0; + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || + input_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case AOM_IMG_FMT_I42016: + case AOM_IMG_FMT_I42216: + case AOM_IMG_FMT_I44416: break; + default: fatal("Unsupported image conversion"); break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint16_t *p_dst = + (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); + for (x = 0; x < w; x++) *p_dst++ = (*p_src++ << input_shift) + offset; + } + } +} + +static void lowbd_img_upshift(aom_image_t *dst, const aom_image_t *src, + int input_shift) { + // Note the offset is 1 less than half. + const int offset = input_shift > 0 ? 
(1 << (input_shift - 1)) - 1 : 0; + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + dst->fmt != src->fmt + AOM_IMG_FMT_HIGHBITDEPTH || input_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case AOM_IMG_FMT_YV12: + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I444: break; + default: fatal("Unsupported image conversion"); break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint8_t *p_src = src->planes[plane] + y * src->stride[plane]; + uint16_t *p_dst = + (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); + for (x = 0; x < w; x++) { + *p_dst++ = (*p_src++ << input_shift) + offset; + } + } + } +} + +void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, + int input_shift) { + if (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + highbd_img_upshift(dst, src, input_shift); + } else { + lowbd_img_upshift(dst, src, input_shift); + } +} + +void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src) { + int plane; + if (dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH != src->fmt || dst->d_w != src->d_w || + dst->d_h != src->d_h || dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift) { + fatal("Unsupported image conversion"); + } + switch (dst->fmt) { + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I444: break; + default: fatal("Unsupported image conversion"); break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; + for (x = 0; x < w; x++) { + *p_dst++ = (uint8_t)(*p_src++); + } + } + } +} + +static void highbd_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift) { + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || dst->fmt != src->fmt || + down_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (src->fmt) { + case AOM_IMG_FMT_I42016: + case AOM_IMG_FMT_I42216: + case AOM_IMG_FMT_I44416: break; + default: fatal("Unsupported image conversion"); break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint16_t *p_dst = + (uint16_t *)(dst->planes[plane] + y * dst->stride[plane]); + for (x = 0; x < w; x++) *p_dst++ = *p_src++ >> down_shift; + } + } +} + +static void lowbd_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift) { + int plane; + if (dst->d_w != src->d_w || dst->d_h != src->d_h || + dst->x_chroma_shift != src->x_chroma_shift || + dst->y_chroma_shift != src->y_chroma_shift || + src->fmt != dst->fmt + AOM_IMG_FMT_HIGHBITDEPTH || 
down_shift < 0) { + fatal("Unsupported image conversion"); + } + switch (dst->fmt) { + case AOM_IMG_FMT_I420: + case AOM_IMG_FMT_I422: + case AOM_IMG_FMT_I444: break; + default: fatal("Unsupported image conversion"); break; + } + for (plane = 0; plane < 3; plane++) { + int w = src->d_w; + int h = src->d_h; + int x, y; + if (plane) { + w = (w + src->x_chroma_shift) >> src->x_chroma_shift; + h = (h + src->y_chroma_shift) >> src->y_chroma_shift; + } + for (y = 0; y < h; y++) { + const uint16_t *p_src = + (const uint16_t *)(src->planes[plane] + y * src->stride[plane]); + uint8_t *p_dst = dst->planes[plane] + y * dst->stride[plane]; + for (x = 0; x < w; x++) { + *p_dst++ = *p_src++ >> down_shift; + } + } + } +} + +void aom_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift) { + if (dst->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + highbd_img_downshift(dst, src, down_shift); + } else { + lowbd_img_downshift(dst, src, down_shift); + } +} + +static int img_shifted_realloc_required(const aom_image_t *img, + const aom_image_t *shifted, + aom_img_fmt_t required_fmt) { + return img->d_w != shifted->d_w || img->d_h != shifted->d_h || + required_fmt != shifted->fmt; +} + +void aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr, + aom_image_t **img_shifted_ptr) { + aom_image_t *img = *img_ptr; + aom_image_t *img_shifted = *img_shifted_ptr; + + const aom_img_fmt_t shifted_fmt = output_bit_depth == 8 + ? img->fmt & ~AOM_IMG_FMT_HIGHBITDEPTH + : img->fmt | AOM_IMG_FMT_HIGHBITDEPTH; + + if (shifted_fmt != img->fmt || output_bit_depth != img->bit_depth) { + if (img_shifted && + img_shifted_realloc_required(img, img_shifted, shifted_fmt)) { + aom_img_free(img_shifted); + img_shifted = NULL; + } + if (img_shifted) { + img_shifted->monochrome = img->monochrome; + } + if (!img_shifted) { + img_shifted = aom_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16); + img_shifted->bit_depth = output_bit_depth; + img_shifted->monochrome = img->monochrome; + img_shifted->csp = img->csp; + } + if (output_bit_depth > img->bit_depth) { + aom_img_upshift(img_shifted, img, output_bit_depth - img->bit_depth); + } else { + aom_img_downshift(img_shifted, img, img->bit_depth - output_bit_depth); + } + *img_shifted_ptr = img_shifted; + *img_ptr = img_shifted; + } +} + +// Related to I420, NV12 format has one luma "luminance" plane Y and one plane +// with U and V values interleaved. +void aom_img_write_nv12(const aom_image_t *img, FILE *file) { + // Y plane + const unsigned char *buf = img->planes[0]; + int stride = img->stride[0]; + int w = aom_img_plane_width(img, 0) * + ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); + int h = aom_img_plane_height(img, 0); + int x, y; + + for (y = 0; y < h; ++y) { + fwrite(buf, 1, w, file); + buf += stride; + } + + // Interleaved U and V plane + const unsigned char *ubuf = img->planes[1]; + const unsigned char *vbuf = img->planes[2]; + const size_t size = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1; + stride = img->stride[1]; + w = aom_img_plane_width(img, 1); + h = aom_img_plane_height(img, 1); + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + fwrite(ubuf, size, 1, file); + fwrite(vbuf, size, 1, file); + ubuf += size; + vbuf += size; + } + ubuf += (stride - w * size); + vbuf += (stride - w * size); + } +} diff --git a/libs/libaom/src/common/tools_common.h b/libs/libaom/src/common/tools_common.h new file mode 100644 index 000000000..1ed004521 --- /dev/null +++ b/libs/libaom/src/common/tools_common.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_TOOLS_COMMON_H_ +#define AOM_COMMON_TOOLS_COMMON_H_ + +#include <stdio.h> + +#include "config/aom_config.h" + +#include "aom/aom_codec.h" +#include "aom/aom_image.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_ports/msvc.h" + +#if CONFIG_AV1_ENCODER +#include "common/y4minput.h" +#endif + +#if defined(_MSC_VER) +/* MSVS uses _f{seek,tell}i64. */ +#define fseeko _fseeki64 +#define ftello _ftelli64 +typedef int64_t FileOffset; +#elif defined(_WIN32) +#include <sys/types.h> /* NOLINT*/ +/* MinGW uses f{seek,tell}o64 for large files. */ +#define fseeko fseeko64 +#define ftello ftello64 +typedef off64_t FileOffset; +#elif CONFIG_OS_SUPPORT +#include <sys/types.h> /* NOLINT*/ typedef off_t FileOffset; +/* Use 32-bit file operations in WebM file format when building ARM + * executables (.axf) with RVCT. */ +#else +#define fseeko fseek +#define ftello ftell +typedef long FileOffset; /* NOLINT */ +#endif /* CONFIG_OS_SUPPORT */ + +#if CONFIG_OS_SUPPORT +#if defined(_MSC_VER) +#include <io.h> /* NOLINT */ +#define isatty _isatty +#define fileno _fileno +#else +#include <unistd.h> /* NOLINT */ +#endif /* _MSC_VER */ +#endif /* CONFIG_OS_SUPPORT */ + +#define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo) + +#ifndef PATH_MAX +#define PATH_MAX 512 +#endif + +#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ +#define IVF_FILE_HDR_SZ 32 + +#define RAW_FRAME_HDR_SZ sizeof(uint32_t) + +#define AV1_FOURCC 0x31305641 + +enum VideoFileType { + FILE_TYPE_OBU, + FILE_TYPE_RAW, + FILE_TYPE_IVF, + FILE_TYPE_Y4M, + FILE_TYPE_WEBM +}; + +// Used in lightfield example. +enum { + YUV1D, // 1D tile output for conformance test. + YUV, // Tile output in YUV format. + NV12, // Tile output in NV12 format. +} UENUM1BYTE(OUTPUT_FORMAT); + +// The fourcc for large_scale_tile encoding is "LSTC".
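+// (Fourccs are stored little-endian: the bytes 0x4c 0x53 0x54 0x43 of the
+// constant below spell "LSTC" in memory, just as AV1_FOURCC above spells
+// "AV01".)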
+#define LST_FOURCC 0x4354534c + +struct FileTypeDetectionBuffer { + char buf[4]; + size_t buf_read; + size_t position; +}; + +struct AvxRational { + int numerator; + int denominator; +}; + +struct AvxInputContext { + const char *filename; + FILE *file; + int64_t length; + struct FileTypeDetectionBuffer detect; + enum VideoFileType file_type; + uint32_t width; + uint32_t height; + struct AvxRational pixel_aspect_ratio; + aom_img_fmt_t fmt; + aom_bit_depth_t bit_depth; + int only_i420; + uint32_t fourcc; + struct AvxRational framerate; +#if CONFIG_AV1_ENCODER + y4m_input y4m; +#endif +}; + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__GNUC__) +#define AOM_NO_RETURN __attribute__((noreturn)) +#else +#define AOM_NO_RETURN +#endif + +/* Sets a stdio stream into binary mode */ +FILE *set_binary_mode(FILE *stream); + +void die(const char *fmt, ...) AOM_NO_RETURN; +void fatal(const char *fmt, ...) AOM_NO_RETURN; +void warn(const char *fmt, ...); + +void die_codec(aom_codec_ctx_t *ctx, const char *s) AOM_NO_RETURN; + +/* The tool including this file must define usage_exit() */ +void usage_exit(void) AOM_NO_RETURN; + +#undef AOM_NO_RETURN + +int read_yuv_frame(struct AvxInputContext *input_ctx, aom_image_t *yuv_frame); + +/////////////////////////////////////////////////////////////////////////////// +// A description of the interfaces used to access the AOM codecs +/////////////////////////////////////////////////////////////////////////////// +// +// There are three levels of interfaces used to access the AOM codec: the +// AvxInterface, the aom_codec_iface, and the aom_codec_ctx. Each of these +// is described in detail here. +// +// +// 1. AvxInterface +// (Related files: common/tools_common.c, common/tools_common.h) +// +// The high-level interface to the AVx encoders / decoders. Each AvxInterface +// contains the name of the codec (e.g., "av1"), the four character code +// associated with it, and a function pointer to the actual interface (see the +// documentation on aom_codec_iface_t for more info). This API +// is meant for lookup / iteration over all known codecs. +// +// For the encoder, call get_aom_encoder_by_name(...) if you know the name +// (e.g., "av1"); to iterate over all known encoders, use +// get_aom_encoder_count() and get_aom_encoder_by_index(i). To get the +// encoder specifically for large scale tile encoding, use +// get_aom_lst_encoder(). +// +// For the decoder, similar functions are available. There is also a +// get_aom_decoder_by_fourcc(fourcc) to get the decoder based on the four +// character codes. +// +// The main purpose of the AvxInterface is to get a reference to the +// aom_codec_iface_t, pointed to by its codec_interface variable. +// +// +// 2. aom_codec_iface_t +// (Related files: aom/aom_codec.h, aom/src/aom_codec.c, +// aom/internal/aom_codec_internal.h, av1/av1_cx_iface.c, +// av1/av1_dx_iface.c) +// +// Used to initialize the codec context, which contains the configuration +// for modifying the encoder/decoder during run-time. See the documentation of +// aom/aom_codec.h for more details. For the most part, users will call the +// helper functions listed there, such as aom_codec_iface_name, +// aom_codec_get_caps, etc., to interact with it. +// +// The main purpose of the aom_codec_iface_t is to provide a way to generate +// a default codec config, find out what capabilities the implementation has, +// and create an aom_codec_ctx_t (which is actually used to interact with the +// codec).
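+//
+// As a quick sketch (using the lookup helpers declared later in this
+// header), reaching a decoder interface looks like:
+//   const AvxInterface *decoder = get_aom_decoder_by_name("av1");
+//   aom_codec_iface_t *iface = decoder->codec_interface();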
+// +// Note that the implementations of the aom_codec_iface_t are located in +// av1/av1_cx_iface.c and av1/av1_dx_iface.c +// +// +// 3. aom_codec_ctx_t +// (Related files: aom/aom_codec.h, av1/av1_cx_iface.c, av1/av1_dx_iface.c, +// aom/aomcx.h, aom/aomdx.h, aom/src/aom_encoder.c, aom/src/aom_decoder.c) +// +// The actual interface between user code and the codec. It stores the name +// of the codec, a pointer back to the aom_codec_iface_t that initialized it, +// initialization flags, a config for either encoder or the decoder, and a +// pointer to internal data. +// +// The codec is configured / queried through calls to aom_codec_control, +// which takes a control code (listed in aomcx.h and aomdx.h) and a parameter. +// In the case of "getter" control codes, the parameter is modified to have +// the requested value; in the case of "setter" control codes, the codec's +// configuration is changed based on the parameter. Note that an aom_codec_err_t +// is returned, which indicates if the operation was successful or not. +// +// Note that for the encoder, the aom_codec_alg_priv_t points to +// the aom_codec_alg_priv structure in av1/av1_cx_iface.c, and for the decoder, +// the struct in av1/av1_dx_iface.c. Variables such as AV1_COMP cpi are stored +// here and also used in the core algorithm. +// +// At the end, aom_codec_destroy should be called for each initialized +// aom_codec_ctx_t. + +typedef struct AvxInterface { + const char *const name; + const uint32_t fourcc; + // Pointer to a function of zero arguments that returns an aom_codec_iface_t + // pointer. E.g.: + // aom_codec_iface_t *codec = interface->codec_interface(); + aom_codec_iface_t *(*const codec_interface)(); +} AvxInterface; + +int get_aom_encoder_count(void); +// Lookup the interface by index -- it must be the case that +// i < get_aom_encoder_count() +const AvxInterface *get_aom_encoder_by_index(int i); +// Lookup the interface by name -- returns NULL if no match. +const AvxInterface *get_aom_encoder_by_name(const char *name); +const AvxInterface *get_aom_lst_encoder(void); + +int get_aom_decoder_count(void); +const AvxInterface *get_aom_decoder_by_index(int i); +const AvxInterface *get_aom_decoder_by_name(const char *name); +// Lookup the interface by the fourcc -- returns NULL if no match. +const AvxInterface *get_aom_decoder_by_fourcc(uint32_t fourcc); + +void aom_img_write(const aom_image_t *img, FILE *file); +int aom_img_read(aom_image_t *img, FILE *file); + +double sse_to_psnr(double samples, double peak, double mse); +void aom_img_upshift(aom_image_t *dst, const aom_image_t *src, int input_shift); +void aom_img_downshift(aom_image_t *dst, const aom_image_t *src, + int down_shift); +void aom_shift_img(unsigned int output_bit_depth, aom_image_t **img_ptr, + aom_image_t **img_shifted_ptr); +void aom_img_truncate_16_to_8(aom_image_t *dst, const aom_image_t *src); + +// Output in NV12 format. +void aom_img_write_nv12(const aom_image_t *img, FILE *file); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // AOM_COMMON_TOOLS_COMMON_H_ diff --git a/libs/libaom/src/common/video_common.h b/libs/libaom/src/common/video_common.h new file mode 100644 index 000000000..bf95031be --- /dev/null +++ b/libs/libaom/src/common/video_common.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_VIDEO_COMMON_H_ +#define AOM_COMMON_VIDEO_COMMON_H_ + +#include "common/tools_common.h" + +typedef struct { + uint32_t codec_fourcc; + int frame_width; + int frame_height; + struct AvxRational time_base; + unsigned int is_annexb; +} AvxVideoInfo; + +#endif // AOM_COMMON_VIDEO_COMMON_H_ diff --git a/libs/libaom/src/common/video_reader.c b/libs/libaom/src/common/video_reader.c new file mode 100644 index 000000000..7b021bc40 --- /dev/null +++ b/libs/libaom/src/common/video_reader.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#include "aom_ports/mem_ops.h" +#include "common/ivfdec.h" +#include "common/obudec.h" +#include "common/tools_common.h" +#include "common/video_reader.h" +#include "common/webmdec.h" + +struct AvxVideoReaderStruct { + AvxVideoInfo info; + struct AvxInputContext input_ctx; + struct ObuDecInputContext obu_ctx; + struct WebmInputContext webm_ctx; + uint8_t *buffer; + size_t buffer_size; + size_t frame_size; + aom_codec_pts_t pts; +}; + +AvxVideoReader *aom_video_reader_open(const char *filename) { + AvxVideoReader *reader = NULL; + FILE *const file = fopen(filename, "rb"); + if (!file) return NULL; // Can't open file + + reader = (AvxVideoReader *)calloc(1, sizeof(*reader)); + if (!reader) { + fclose(file); + return NULL; // Can't allocate AvxVideoReader + } + + reader->input_ctx.filename = filename; + reader->input_ctx.file = file; + reader->obu_ctx.avx_ctx = &reader->input_ctx; + reader->obu_ctx.is_annexb = 1; + + if (file_is_ivf(&reader->input_ctx)) { + reader->input_ctx.file_type = FILE_TYPE_IVF; + reader->info.codec_fourcc = reader->input_ctx.fourcc; + reader->info.frame_width = reader->input_ctx.width; + reader->info.frame_height = reader->input_ctx.height; +#if CONFIG_WEBM_IO + } else if (file_is_webm(&reader->webm_ctx, &reader->input_ctx)) { + reader->input_ctx.file_type = FILE_TYPE_WEBM; + reader->info.codec_fourcc = reader->input_ctx.fourcc; + reader->info.frame_width = reader->input_ctx.width; + reader->info.frame_height = reader->input_ctx.height; +#endif + } else if (file_is_obu(&reader->obu_ctx)) { + reader->input_ctx.file_type = FILE_TYPE_OBU; + // assume AV1 + reader->info.codec_fourcc = AV1_FOURCC; + reader->info.is_annexb = reader->obu_ctx.is_annexb; + } else { + fclose(file); + free(reader); + return NULL; // Unknown file type + } + + return reader; +} + +void aom_video_reader_close(AvxVideoReader *reader) { + if (reader) { + fclose(reader->input_ctx.file); + if (reader->input_ctx.file_type == FILE_TYPE_OBU) { + obudec_free(&reader->obu_ctx); + } + free(reader->buffer); + free(reader); + } +} + +int aom_video_reader_read_frame(AvxVideoReader *reader) { + if
(reader->input_ctx.file_type == FILE_TYPE_IVF) { + return !ivf_read_frame(reader->input_ctx.file, &reader->buffer, + &reader->frame_size, &reader->buffer_size, + &reader->pts); + } else if (reader->input_ctx.file_type == FILE_TYPE_OBU) { + return !obudec_read_temporal_unit(&reader->obu_ctx, &reader->buffer, + &reader->frame_size, + &reader->buffer_size); +#if CONFIG_WEBM_IO + } else if (reader->input_ctx.file_type == FILE_TYPE_WEBM) { + return !webm_read_frame(&reader->webm_ctx, &reader->buffer, + &reader->frame_size, &reader->buffer_size); +#endif + } else { + assert(0); + return 0; + } +} + +const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader, + size_t *size) { + if (size) *size = reader->frame_size; + + return reader->buffer; +} + +int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader) { + return (int64_t)reader->pts; +} + +FILE *aom_video_reader_get_file(AvxVideoReader *reader) { + return reader->input_ctx.file; +} + +const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) { + return &reader->info; +} + +void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc) { + reader->info.codec_fourcc = fourcc; +} diff --git a/libs/libaom/src/common/video_reader.h b/libs/libaom/src/common/video_reader.h new file mode 100644 index 000000000..9ab439e8a --- /dev/null +++ b/libs/libaom/src/common/video_reader.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_VIDEO_READER_H_ +#define AOM_COMMON_VIDEO_READER_H_ + +#include "common/video_common.h" + +// The following code is work in progress. It is going to support transparent +// reading of input files. Right now only IVF format is supported for +// simplicity. The main goal the API is to be simple and easy to use in example +// code and in aomenc/aomdec later. All low-level details like memory +// buffer management are hidden from API users. +struct AvxVideoReaderStruct; +typedef struct AvxVideoReaderStruct AvxVideoReader; + +#ifdef __cplusplus +extern "C" { +#endif + +// Opens the input file for reading and inspects it to determine file type. +// Returns an opaque AvxVideoReader* upon success, or NULL upon failure. +// Right now only IVF format is supported. +AvxVideoReader *aom_video_reader_open(const char *filename); + +// Frees all resources associated with AvxVideoReader* returned from +// aom_video_reader_open() call. +void aom_video_reader_close(AvxVideoReader *reader); + +// Reads frame from the file and stores it in internal buffer. +int aom_video_reader_read_frame(AvxVideoReader *reader); + +// Returns the pointer to memory buffer with frame data read by last call to +// aom_video_reader_read_frame(). +const uint8_t *aom_video_reader_get_frame(AvxVideoReader *reader, size_t *size); + +// Returns the pts of the frame. +int64_t aom_video_reader_get_frame_pts(AvxVideoReader *reader); +// Return the reader file. +FILE *aom_video_reader_get_file(AvxVideoReader *reader); + +// Fills AvxVideoInfo with information from opened video file. 
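+// (The pointer returned below refers to storage owned by the reader and
+// remains valid until aom_video_reader_close().)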
+const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader); + +// Set fourcc. +void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_VIDEO_READER_H_ diff --git a/libs/libaom/src/common/video_writer.c b/libs/libaom/src/common/video_writer.c new file mode 100644 index 000000000..1d4328ae1 --- /dev/null +++ b/libs/libaom/src/common/video_writer.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "common/video_writer.h" + +#include <stdlib.h> + +#include "aom/aom_encoder.h" +#include "common/ivfenc.h" + +struct AvxVideoWriterStruct { + AvxVideoInfo info; + FILE *file; + int frame_count; +}; + +static void write_header(FILE *file, const AvxVideoInfo *info, + int frame_count) { + struct aom_codec_enc_cfg cfg; + cfg.g_w = info->frame_width; + cfg.g_h = info->frame_height; + cfg.g_timebase.num = info->time_base.numerator; + cfg.g_timebase.den = info->time_base.denominator; + + ivf_write_file_header(file, &cfg, info->codec_fourcc, frame_count); +} + +AvxVideoWriter *aom_video_writer_open(const char *filename, + AvxContainer container, + const AvxVideoInfo *info) { + if (container == kContainerIVF) { + AvxVideoWriter *writer = NULL; + FILE *const file = fopen(filename, "wb"); + if (!file) return NULL; + + writer = malloc(sizeof(*writer)); + if (!writer) { + fclose(file); + return NULL; + } + writer->frame_count = 0; + writer->info = *info; + writer->file = file; + + write_header(writer->file, info, 0); + + return writer; + } + + return NULL; +} + +void aom_video_writer_close(AvxVideoWriter *writer) { + if (writer) { + // Rewriting frame header with real frame count + rewind(writer->file); + write_header(writer->file, &writer->info, writer->frame_count); + + fclose(writer->file); + free(writer); + } +} + +int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer, + size_t size, int64_t pts) { + ivf_write_frame_header(writer->file, pts, size); + if (fwrite(buffer, 1, size, writer->file) != size) return 0; + + ++writer->frame_count; + + return 1; +} + +void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc) { + writer->info.codec_fourcc = fourcc; +} diff --git a/libs/libaom/src/common/video_writer.h b/libs/libaom/src/common/video_writer.h new file mode 100644 index 000000000..8712d47a5 --- /dev/null +++ b/libs/libaom/src/common/video_writer.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#ifndef AOM_COMMON_VIDEO_WRITER_H_ +#define AOM_COMMON_VIDEO_WRITER_H_ + +#include "common/video_common.h" + +enum { kContainerIVF } UENUM1BYTE(AvxContainer); + +struct AvxVideoWriterStruct; +typedef struct AvxVideoWriterStruct AvxVideoWriter; + +#ifdef __cplusplus +extern "C" { +#endif + +// Finds and opens writer for specified container format. +// Returns an opaque AvxVideoWriter* upon success, or NULL upon failure. +// Right now only IVF format is supported. +AvxVideoWriter *aom_video_writer_open(const char *filename, + AvxContainer container, + const AvxVideoInfo *info); + +// Frees all resources associated with AvxVideoWriter* returned from +// aom_video_writer_open() call. +void aom_video_writer_close(AvxVideoWriter *writer); + +// Writes frame bytes to the file. +int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer, + size_t size, int64_t pts); +// Set fourcc. +void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_VIDEO_WRITER_H_ diff --git a/libs/libaom/src/common/warnings.c b/libs/libaom/src/common/warnings.c new file mode 100644 index 000000000..2facee252 --- /dev/null +++ b/libs/libaom/src/common/warnings.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "common/warnings.h" + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_encoder.h" +#include "apps/aomenc.h" +#include "common/tools_common.h" + +static const char quantizer_warning_string[] = + "Bad quantizer values. Quantizer values should not be equal, and should " + "differ by at least 8."; + +struct WarningListNode { + const char *warning_string; + struct WarningListNode *next_warning; +}; + +struct WarningList { + struct WarningListNode *warning_node; +}; + +static void add_warning(const char *warning_string, + struct WarningList *warning_list) { + struct WarningListNode **node = &warning_list->warning_node; + + struct WarningListNode *new_node = malloc(sizeof(*new_node)); + if (new_node == NULL) { + fatal("Unable to allocate warning node."); + } + + new_node->warning_string = warning_string; + new_node->next_warning = NULL; + + while (*node != NULL) node = &(*node)->next_warning; + + *node = new_node; +} + +static void free_warning_list(struct WarningList *warning_list) { + while (warning_list->warning_node != NULL) { + struct WarningListNode *const node = warning_list->warning_node; + warning_list->warning_node = node->next_warning; + free(node); + } +} + +static int continue_prompt(int num_warnings) { + int c; + fprintf(stderr, + "%d encoder configuration warning(s). Continue?
(y to continue) ", + num_warnings); + c = getchar(); + return c == 'y'; +} + +static void check_quantizer(int min_q, int max_q, + struct WarningList *warning_list) { + const int lossless = min_q == 0 && max_q == 0; + if (!lossless && (min_q == max_q || abs(max_q - min_q) < 8)) + add_warning(quantizer_warning_string, warning_list); +} + +void check_encoder_config(int disable_prompt, + const struct AvxEncoderConfig *global_config, + const struct aom_codec_enc_cfg *stream_config) { + int num_warnings = 0; + struct WarningListNode *warning = NULL; + struct WarningList warning_list = { 0 }; + (void)global_config; + check_quantizer(stream_config->rc_min_quantizer, + stream_config->rc_max_quantizer, &warning_list); + /* Count and print warnings. */ + for (warning = warning_list.warning_node; warning != NULL; + warning = warning->next_warning, ++num_warnings) { + warn(warning->warning_string); + } + + free_warning_list(&warning_list); + + if (num_warnings) { + if (!disable_prompt && !continue_prompt(num_warnings)) exit(EXIT_FAILURE); + } +} diff --git a/libs/libaom/src/common/warnings.h b/libs/libaom/src/common/warnings.h new file mode 100644 index 000000000..36f1fe070 --- /dev/null +++ b/libs/libaom/src/common/warnings.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_WARNINGS_H_ +#define AOM_COMMON_WARNINGS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct aom_codec_enc_cfg; +struct AvxEncoderConfig; + +/* + * Checks config for improperly used settings. Warns user upon encountering + * settings that will lead to poor output quality. Prompts user to continue + * when warnings are issued. + */ +void check_encoder_config(int disable_prompt, + const struct AvxEncoderConfig *global_config, + const struct aom_codec_enc_cfg *stream_config); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_WARNINGS_H_ diff --git a/libs/libaom/src/common/webmdec.cc b/libs/libaom/src/common/webmdec.cc new file mode 100644 index 000000000..33bda5902 --- /dev/null +++ b/libs/libaom/src/common/webmdec.cc @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include "common/webmdec.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+
+#include "third_party/libwebm/mkvparser/mkvparser.h"
+#include "third_party/libwebm/mkvparser/mkvreader.h"
+
+namespace {
+
+void reset(struct WebmInputContext *const webm_ctx) {
+  if (webm_ctx->reader != NULL) {
+    mkvparser::MkvReader *const reader =
+        reinterpret_cast<mkvparser::MkvReader *>(webm_ctx->reader);
+    delete reader;
+  }
+  if (webm_ctx->segment != NULL) {
+    mkvparser::Segment *const segment =
+        reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
+    delete segment;
+  }
+  if (webm_ctx->buffer != NULL) {
+    delete[] webm_ctx->buffer;
+  }
+  webm_ctx->reader = NULL;
+  webm_ctx->segment = NULL;
+  webm_ctx->buffer = NULL;
+  webm_ctx->cluster = NULL;
+  webm_ctx->block_entry = NULL;
+  webm_ctx->block = NULL;
+  webm_ctx->block_frame_index = 0;
+  webm_ctx->video_track_index = 0;
+  webm_ctx->timestamp_ns = 0;
+  webm_ctx->is_key_frame = false;
+}
+
+void get_first_cluster(struct WebmInputContext *const webm_ctx) {
+  mkvparser::Segment *const segment =
+      reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
+  const mkvparser::Cluster *const cluster = segment->GetFirst();
+  webm_ctx->cluster = cluster;
+}
+
+void rewind_and_reset(struct WebmInputContext *const webm_ctx,
+                      struct AvxInputContext *const aom_ctx) {
+  rewind(aom_ctx->file);
+  reset(webm_ctx);
+}
+
+}  // namespace
+
+int file_is_webm(struct WebmInputContext *webm_ctx,
+                 struct AvxInputContext *aom_ctx) {
+  mkvparser::MkvReader *const reader = new mkvparser::MkvReader(aom_ctx->file);
+  webm_ctx->reader = reader;
+  webm_ctx->reached_eos = 0;
+
+  mkvparser::EBMLHeader header;
+  long long pos = 0;
+  if (header.Parse(reader, pos) < 0) {
+    rewind_and_reset(webm_ctx, aom_ctx);
+    return 0;
+  }
+
+  mkvparser::Segment *segment;
+  if (mkvparser::Segment::CreateInstance(reader, pos, segment)) {
+    rewind_and_reset(webm_ctx, aom_ctx);
+    return 0;
+  }
+  webm_ctx->segment = segment;
+  if (segment->Load() < 0) {
+    rewind_and_reset(webm_ctx, aom_ctx);
+    return 0;
+  }
+
+  const mkvparser::Tracks *const tracks = segment->GetTracks();
+  const mkvparser::VideoTrack *video_track = NULL;
+  for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) {
+    const mkvparser::Track *const track = tracks->GetTrackByIndex(i);
+    if (track->GetType() == mkvparser::Track::kVideo) {
+      video_track = static_cast<const mkvparser::VideoTrack *>(track);
+      webm_ctx->video_track_index = static_cast<int>(track->GetNumber());
+      break;
+    }
+  }
+
+  if (video_track == NULL || video_track->GetCodecId() == NULL) {
+    rewind_and_reset(webm_ctx, aom_ctx);
+    return 0;
+  }
+
+  if (!strncmp(video_track->GetCodecId(), "V_AV1", 5)) {
+    aom_ctx->fourcc = AV1_FOURCC;
+  } else {
+    rewind_and_reset(webm_ctx, aom_ctx);
+    return 0;
+  }
+
+  aom_ctx->framerate.denominator = 0;
+  aom_ctx->framerate.numerator = 0;
+  aom_ctx->width = static_cast<uint32_t>(video_track->GetWidth());
+  aom_ctx->height = static_cast<uint32_t>(video_track->GetHeight());
+
+  get_first_cluster(webm_ctx);
+
+  return 1;
+}
+
+int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer,
+                    size_t *bytes_read, size_t *buffer_size) {
+  assert(webm_ctx->buffer == *buffer);
+  // This check is needed for frame parallel decoding, in which case this
+  // function could be called even after it has reached end of input stream.
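+  // Returning 1 here reports end-of-stream again, matching the return-value
+  // contract documented in webmdec.h; it is not an error code.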
+  if (webm_ctx->reached_eos) {
+    return 1;
+  }
+  mkvparser::Segment *const segment =
+      reinterpret_cast<mkvparser::Segment *>(webm_ctx->segment);
+  const mkvparser::Cluster *cluster =
+      reinterpret_cast<const mkvparser::Cluster *>(webm_ctx->cluster);
+  const mkvparser::Block *block =
+      reinterpret_cast<const mkvparser::Block *>(webm_ctx->block);
+  const mkvparser::BlockEntry *block_entry =
+      reinterpret_cast<const mkvparser::BlockEntry *>(webm_ctx->block_entry);
+  bool block_entry_eos = false;
+  do {
+    long status = 0;
+    bool get_new_block = false;
+    if (block_entry == NULL && !block_entry_eos) {
+      status = cluster->GetFirst(block_entry);
+      get_new_block = true;
+    } else if (block_entry_eos || block_entry->EOS()) {
+      cluster = segment->GetNext(cluster);
+      if (cluster == NULL || cluster->EOS()) {
+        *bytes_read = 0;
+        webm_ctx->reached_eos = 1;
+        return 1;
+      }
+      status = cluster->GetFirst(block_entry);
+      block_entry_eos = false;
+      get_new_block = true;
+    } else if (block == NULL ||
+               webm_ctx->block_frame_index == block->GetFrameCount() ||
+               block->GetTrackNumber() != webm_ctx->video_track_index) {
+      status = cluster->GetNext(block_entry, block_entry);
+      if (block_entry == NULL || block_entry->EOS()) {
+        block_entry_eos = true;
+        continue;
+      }
+      get_new_block = true;
+    }
+    if (status || block_entry == NULL) {
+      return -1;
+    }
+    if (get_new_block) {
+      block = block_entry->GetBlock();
+      if (block == NULL) return -1;
+      webm_ctx->block_frame_index = 0;
+    }
+  } while (block_entry_eos ||
+           block->GetTrackNumber() != webm_ctx->video_track_index);
+
+  webm_ctx->cluster = cluster;
+  webm_ctx->block_entry = block_entry;
+  webm_ctx->block = block;
+
+  const mkvparser::Block::Frame &frame =
+      block->GetFrame(webm_ctx->block_frame_index);
+  ++webm_ctx->block_frame_index;
+  if (frame.len > static_cast<long>(*buffer_size)) {
+    delete[] * buffer;
+    *buffer = new uint8_t[frame.len];
+    webm_ctx->buffer = *buffer;
+    if (*buffer == NULL) {
+      return -1;
+    }
+    *buffer_size = frame.len;
+  }
+  *bytes_read = frame.len;
+  webm_ctx->timestamp_ns = block->GetTime(cluster);
+  webm_ctx->is_key_frame = block->IsKey();
+
+  mkvparser::MkvReader *const reader =
+      reinterpret_cast<mkvparser::MkvReader *>(webm_ctx->reader);
+  return frame.Read(reader, *buffer) ? -1 : 0;
+}
+
+// Calculate the greatest common divisor between two numbers.
+static int gcd(int a, int b) {
+  int remainder;
+  while (b > 0) {
+    remainder = a % b;
+    a = b;
+    b = remainder;
+  }
+  return a;
+}
+
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+                         struct AvxInputContext *aom_ctx) {
+  uint32_t i = 0;
+  uint8_t *buffer = NULL;
+  size_t buffer_size = 0;
+  size_t bytes_read = 0;
+  assert(webm_ctx->buffer == NULL);
+  while (webm_ctx->timestamp_ns < 1000000000 && i < 50) {
+    if (webm_read_frame(webm_ctx, &buffer, &bytes_read, &buffer_size)) {
+      break;
+    }
+    ++i;
+  }
+  aom_ctx->framerate.numerator = (i - 1) * 1000000;
+  aom_ctx->framerate.denominator =
+      static_cast<int>(webm_ctx->timestamp_ns / 1000);
+  // Fraction might be represented in large numbers, like 49000000/980000
+  // for 50fps. Simplify as much as possible.
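+  // For instance, gcd(49000000, 980000) is 980000, so that example reduces
+  // to exactly 50/1.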
+ int g = gcd(aom_ctx->framerate.numerator, aom_ctx->framerate.denominator); + if (g != 0) { + aom_ctx->framerate.numerator /= g; + aom_ctx->framerate.denominator /= g; + } + + delete[] buffer; + webm_ctx->buffer = NULL; + + get_first_cluster(webm_ctx); + webm_ctx->block = NULL; + webm_ctx->block_entry = NULL; + webm_ctx->block_frame_index = 0; + webm_ctx->timestamp_ns = 0; + webm_ctx->reached_eos = 0; + + return 0; +} + +void webm_free(struct WebmInputContext *webm_ctx) { reset(webm_ctx); } diff --git a/libs/libaom/src/common/webmdec.h b/libs/libaom/src/common/webmdec.h new file mode 100644 index 000000000..5ac75cb30 --- /dev/null +++ b/libs/libaom/src/common/webmdec.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_COMMON_WEBMDEC_H_ +#define AOM_COMMON_WEBMDEC_H_ + +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AvxInputContext; + +struct WebmInputContext { + void *reader; + void *segment; + uint8_t *buffer; + const void *cluster; + const void *block_entry; + const void *block; + int block_frame_index; + int video_track_index; + uint64_t timestamp_ns; + int is_key_frame; + int reached_eos; +}; + +// Checks if the input is a WebM file. If so, initializes WebMInputContext so +// that webm_read_frame can be called to retrieve a video frame. +// Returns 1 on success and 0 on failure or input is not WebM file. +// TODO(vigneshv): Refactor this function into two smaller functions specific +// to their task. +int file_is_webm(struct WebmInputContext *webm_ctx, + struct AvxInputContext *aom_ctx); + +// Reads a WebM Video Frame. Memory for the buffer is created, owned and managed +// by this function. For the first call, |buffer| should be NULL and +// |*buffer_size| should be 0. Once all the frames are read and used, +// webm_free() should be called, otherwise there will be a leak. +// Parameters: +// webm_ctx - WebmInputContext object +// buffer - pointer where the frame data will be filled. +// bytes_read - pointer to bytes read. +// buffer_size - pointer to buffer size. +// Return values: +// 0 - Success +// 1 - End of Stream +// -1 - Error +int webm_read_frame(struct WebmInputContext *webm_ctx, uint8_t **buffer, + size_t *bytes_read, size_t *buffer_size); + +// Guesses the frame rate of the input file based on the container timestamps. +int webm_guess_framerate(struct WebmInputContext *webm_ctx, + struct AvxInputContext *aom_ctx); + +// Resets the WebMInputContext. +void webm_free(struct WebmInputContext *webm_ctx); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_WEBMDEC_H_ diff --git a/libs/libaom/src/common/webmenc.cc b/libs/libaom/src/common/webmenc.cc new file mode 100644 index 000000000..6ae7df646 --- /dev/null +++ b/libs/libaom/src/common/webmenc.cc @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "common/webmenc.h"
+
+#include <stdio.h>
+
+#include <string>
+
+#include "common/av1_config.h"
+#include "third_party/libwebm/mkvmuxer/mkvmuxer.h"
+#include "third_party/libwebm/mkvmuxer/mkvmuxerutil.h"
+#include "third_party/libwebm/mkvmuxer/mkvwriter.h"
+
+namespace {
+const uint64_t kDebugTrackUid = 0xDEADBEEF;
+const int kVideoTrackNumber = 1;
+}  // namespace
+
+int write_webm_file_header(struct WebmOutputContext *webm_ctx,
+                           aom_codec_ctx_t *encoder_ctx,
+                           const aom_codec_enc_cfg_t *cfg,
+                           stereo_format_t stereo_fmt, unsigned int fourcc,
+                           const struct AvxRational *par) {
+  mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(webm_ctx->stream);
+  mkvmuxer::Segment *const segment = new mkvmuxer::Segment();
+  if (!writer || !segment) {
+    fprintf(stderr, "webmenc> mkvmuxer objects alloc failed, out of memory?\n");
+    return -1;
+  }
+
+  bool ok = segment->Init(writer);
+  if (!ok) {
+    fprintf(stderr, "webmenc> mkvmuxer Init failed.\n");
+    return -1;
+  }
+
+  segment->set_mode(mkvmuxer::Segment::kFile);
+  segment->OutputCues(true);
+
+  mkvmuxer::SegmentInfo *const info = segment->GetSegmentInfo();
+  if (!info) {
+    fprintf(stderr, "webmenc> Cannot retrieve Segment Info.\n");
+    return -1;
+  }
+
+  const uint64_t kTimecodeScale = 1000000;
+  info->set_timecode_scale(kTimecodeScale);
+  std::string version = "aomenc";
+  if (!webm_ctx->debug) {
+    version.append(std::string(" ") + aom_codec_version_str());
+  }
+  info->set_writing_app(version.c_str());
+
+  const uint64_t video_track_id =
+      segment->AddVideoTrack(static_cast<int>(cfg->g_w),
+                             static_cast<int>(cfg->g_h), kVideoTrackNumber);
+  mkvmuxer::VideoTrack *const video_track = static_cast<mkvmuxer::VideoTrack *>(
+      segment->GetTrackByNumber(video_track_id));
+
+  if (!video_track) {
+    fprintf(stderr, "webmenc> Video track creation failed.\n");
+    return -1;
+  }
+
+  ok = false;
+  aom_fixed_buf_t *obu_sequence_header =
+      aom_codec_get_global_headers(encoder_ctx);
+  if (obu_sequence_header) {
+    Av1Config av1_config;
+    if (get_av1config_from_obu(
+            reinterpret_cast<const uint8_t *>(obu_sequence_header->buf),
+            obu_sequence_header->sz, false, &av1_config) == 0) {
+      uint8_t av1_config_buffer[4] = { 0 };
+      size_t bytes_written = 0;
+      if (write_av1config(&av1_config, sizeof(av1_config_buffer),
+                          &bytes_written, av1_config_buffer) == 0) {
+        ok = video_track->SetCodecPrivate(av1_config_buffer,
+                                          sizeof(av1_config_buffer));
+      }
+    }
+    free(obu_sequence_header->buf);
+    free(obu_sequence_header);
+  }
+  if (!ok) {
+    fprintf(stderr, "webmenc> Unable to set AV1 config.\n");
+    return -1;
+  }
+
+  ok = video_track->SetStereoMode(stereo_fmt);
+  if (!ok) {
+    fprintf(stderr, "webmenc> Unable to set stereo mode.\n");
+    return -1;
+  }
+
+  if (fourcc != AV1_FOURCC) {
+    fprintf(stderr, "webmenc> Unsupported codec (unknown 4 CC).\n");
+    return -1;
+  }
+  video_track->set_codec_id("V_AV1");
+
+  if (par->numerator > 1 || par->denominator > 1) {
+    // TODO(fgalligan): Add support of DisplayUnit, Display Aspect Ratio type
+    // to WebM format.
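+    // For example, a 1280x720 stream with a 4:3 pixel aspect ratio yields
+    // display_width = (1280 * 4) / 3 + 0.5, truncated to 1707.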
+    const uint64_t display_width = static_cast<uint64_t>(
+        ((cfg->g_w * par->numerator * 1.0) / par->denominator) + .5);
+    video_track->set_display_width(display_width);
+    video_track->set_display_height(cfg->g_h);
+  }
+
+  if (webm_ctx->debug) {
+    video_track->set_uid(kDebugTrackUid);
+  }
+
+  webm_ctx->writer = writer;
+  webm_ctx->segment = segment;
+
+  return 0;
+}
+
+int write_webm_block(struct WebmOutputContext *webm_ctx,
+                     const aom_codec_enc_cfg_t *cfg,
+                     const aom_codec_cx_pkt_t *pkt) {
+  if (!webm_ctx->segment) {
+    fprintf(stderr, "webmenc> segment is NULL.\n");
+    return -1;
+  }
+  mkvmuxer::Segment *const segment =
+      reinterpret_cast<mkvmuxer::Segment *>(webm_ctx->segment);
+  int64_t pts_ns = pkt->data.frame.pts * 1000000000ll * cfg->g_timebase.num /
+                   cfg->g_timebase.den;
+  if (pts_ns <= webm_ctx->last_pts_ns) pts_ns = webm_ctx->last_pts_ns + 1000000;
+  webm_ctx->last_pts_ns = pts_ns;
+
+  if (!segment->AddFrame(static_cast<const uint8_t *>(pkt->data.frame.buf),
+                         pkt->data.frame.sz, kVideoTrackNumber, pts_ns,
+                         pkt->data.frame.flags & AOM_FRAME_IS_KEY)) {
+    fprintf(stderr, "webmenc> AddFrame failed.\n");
+    return -1;
+  }
+  return 0;
+}
+
+int write_webm_file_footer(struct WebmOutputContext *webm_ctx) {
+  if (!webm_ctx->writer || !webm_ctx->segment) {
+    fprintf(stderr, "webmenc> segment or writer NULL.\n");
+    return -1;
+  }
+  mkvmuxer::MkvWriter *const writer =
+      reinterpret_cast<mkvmuxer::MkvWriter *>(webm_ctx->writer);
+  mkvmuxer::Segment *const segment =
+      reinterpret_cast<mkvmuxer::Segment *>(webm_ctx->segment);
+  const bool ok = segment->Finalize();
+  delete segment;
+  delete writer;
+  webm_ctx->writer = NULL;
+  webm_ctx->segment = NULL;
+
+  if (!ok) {
+    fprintf(stderr, "webmenc> Segment::Finalize failed.\n");
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/libs/libaom/src/common/webmenc.h b/libs/libaom/src/common/webmenc.h
new file mode 100644
index 000000000..a4aa992b0
--- /dev/null
+++ b/libs/libaom/src/common/webmenc.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_COMMON_WEBMENC_H_
+#define AOM_COMMON_WEBMENC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "tools_common.h"
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct WebmOutputContext {
+  int debug;
+  FILE *stream;
+  int64_t last_pts_ns;
+  void *writer;
+  void *segment;
+};
+
+/* Stereo 3D packed frame format */
+enum {
+  STEREO_FORMAT_MONO = 0,
+  STEREO_FORMAT_LEFT_RIGHT = 1,
+  STEREO_FORMAT_BOTTOM_TOP = 2,
+  STEREO_FORMAT_TOP_BOTTOM = 3,
+  STEREO_FORMAT_RIGHT_LEFT = 11
+} UENUM1BYTE(stereo_format_t);
+
+// The following functions wrap libwebm's mkvmuxer. All functions return 0 upon
+// success, or -1 upon failure.
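+//
+// A rough (hypothetical) call sequence, with error handling elided; the
+// variable names and initial values below are illustrative, not part of
+// this header:
+//
+//   struct WebmOutputContext webm_ctx = { 0 };
+//   webm_ctx.stream = out_file;
+//   webm_ctx.last_pts_ns = -1;
+//   write_webm_file_header(&webm_ctx, &codec, &cfg, STEREO_FORMAT_MONO,
+//                          AV1_FOURCC, &par);
+//   // ...for each AOM_CODEC_CX_FRAME_PKT produced by the encoder:
+//   write_webm_block(&webm_ctx, &cfg, pkt);
+//   // ...after the last packet:
+//   write_webm_file_footer(&webm_ctx);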
+
+int write_webm_file_header(struct WebmOutputContext *webm_ctx,
+                           aom_codec_ctx_t *encoder_ctx,
+                           const aom_codec_enc_cfg_t *cfg,
+                           stereo_format_t stereo_fmt, unsigned int fourcc,
+                           const struct AvxRational *par);
+
+int write_webm_block(struct WebmOutputContext *webm_ctx,
+                     const aom_codec_enc_cfg_t *cfg,
+                     const aom_codec_cx_pkt_t *pkt);
+
+int write_webm_file_footer(struct WebmOutputContext *webm_ctx);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_COMMON_WEBMENC_H_
diff --git a/libs/libaom/src/common/y4menc.c b/libs/libaom/src/common/y4menc.c
new file mode 100644
index 000000000..e3f5d5b38
--- /dev/null
+++ b/libs/libaom/src/common/y4menc.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "common/rawenc.h"
+#include "common/y4menc.h"
+
+// Returns the Y4M name associated with the monochrome colorspace.
+static const char *monochrome_colorspace(unsigned int bit_depth) {
+  switch (bit_depth) {
+    case 8: return "Cmono";
+    case 9: return "Cmono9";
+    case 10: return "Cmono10";
+    case 12: return "Cmono12";
+    case 16: return "Cmono16";
+    default: assert(0); return NULL;
+  }
+}
+
+// Return the Y4M name of the 8-bit colorspace, given the chroma position and
+// image format.
+const char *colorspace8(aom_chroma_sample_position_t csp, aom_img_fmt_t fmt) {
+  switch (fmt) {
+    case AOM_IMG_FMT_I444: return "C444";
+    case AOM_IMG_FMT_I422: return "C422";
+    default:
+      if (csp == AOM_CSP_VERTICAL) {
+        return "C420mpeg2 XYSCSS=420MPEG2";
+      } else if (csp == AOM_CSP_COLOCATED) {
+        // Note that Y4M does not have a dedicated header for colocated chroma,
+        // and that FFMPEG interprets C420 as C420jpeg.
+        return "C420";
+      } else {
+        return "C420jpeg";
+      }
+  }
+}
+
+// Return the Y4M name of the colorspace, given the bit depth and image format.
+static const char *colorspace(unsigned int bit_depth,
+                              aom_chroma_sample_position_t csp,
+                              aom_img_fmt_t fmt) {
+  switch (bit_depth) {
+    case 8: return colorspace8(csp, fmt);
+    case 9:
+      return fmt == AOM_IMG_FMT_I44416
+                 ? "C444p9 XYSCSS=444P9"
+                 : fmt == AOM_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9"
+                                             : "C420p9 XYSCSS=420P9";
+    case 10:
+      return fmt == AOM_IMG_FMT_I44416
+                 ? "C444p10 XYSCSS=444P10"
+                 : fmt == AOM_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10"
+                                             : "C420p10 XYSCSS=420P10";
+    case 12:
+      return fmt == AOM_IMG_FMT_I44416
+                 ? "C444p12 XYSCSS=444P12"
+                 : fmt == AOM_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12"
+                                             : "C420p12 XYSCSS=420P12";
+    case 14:
+      return fmt == AOM_IMG_FMT_I44416
+                 ? "C444p14 XYSCSS=444P14"
+                 : fmt == AOM_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14"
+                                             : "C420p14 XYSCSS=420P14";
+    case 16:
+      return fmt == AOM_IMG_FMT_I44416
+                 ? "C444p16 XYSCSS=444P16"
+                 : fmt == AOM_IMG_FMT_I42216 ?
"C422p16 XYSCSS=422P16" + : "C420p16 XYSCSS=420P16"; + default: assert(0); return NULL; + } +} + +int y4m_write_file_header(char *buf, size_t len, int width, int height, + const struct AvxRational *framerate, int monochrome, + aom_chroma_sample_position_t csp, aom_img_fmt_t fmt, + unsigned int bit_depth) { + const char *color = monochrome ? monochrome_colorspace(bit_depth) + : colorspace(bit_depth, csp, fmt); + return snprintf(buf, len, "YUV4MPEG2 W%u H%u F%u:%u I%c %s\n", width, height, + framerate->numerator, framerate->denominator, 'p', color); +} + +int y4m_write_frame_header(char *buf, size_t len) { + return snprintf(buf, len, "FRAME\n"); +} + +void y4m_write_image_file(const aom_image_t *img, const int *planes, + FILE *file) { + int num_planes = img->monochrome ? 1 : 3; + raw_write_image_file(img, planes, num_planes, file); +} + +void y4m_update_image_md5(const aom_image_t *img, const int *planes, + MD5Context *md5) { + int num_planes = img->monochrome ? 1 : 3; + raw_update_image_md5(img, planes, num_planes, md5); +} diff --git a/libs/libaom/src/common/y4menc.h b/libs/libaom/src/common/y4menc.h new file mode 100644 index 000000000..f6d5fd86b --- /dev/null +++ b/libs/libaom/src/common/y4menc.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_Y4MENC_H_ +#define AOM_COMMON_Y4MENC_H_ + +#include "aom/aom_decoder.h" +#include "common/md5_utils.h" +#include "common/tools_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define Y4M_BUFFER_SIZE 128 + +int y4m_write_file_header(char *buf, size_t len, int width, int height, + const struct AvxRational *framerate, int monochrome, + aom_chroma_sample_position_t csp, aom_img_fmt_t fmt, + unsigned int bit_depth); +int y4m_write_frame_header(char *buf, size_t len); +void y4m_write_image_file(const aom_image_t *img, const int *planes, + FILE *file); +void y4m_update_image_md5(const aom_image_t *img, const int *planes, + MD5Context *md5); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_Y4MENC_H_ diff --git a/libs/libaom/src/common/y4minput.c b/libs/libaom/src/common/y4minput.c new file mode 100644 index 000000000..f3dfaafc6 --- /dev/null +++ b/libs/libaom/src/common/y4minput.c @@ -0,0 +1,1153 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * Based on code from the OggTheora software codec source code, + * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors. 
+ */
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/msvc.h"
+#include "y4minput.h"
+
+// Reads 'size' bytes from 'file' into 'buf' with some fault tolerance.
+// Returns true on success.
+static int file_read(void *buf, size_t size, FILE *file) {
+  const int kMaxRetries = 5;
+  int retry_count = 0;
+  int file_error;
+  size_t len = 0;
+  do {
+    const size_t n = fread((uint8_t *)buf + len, 1, size - len, file);
+    len += n;
+    file_error = ferror(file);
+    if (file_error) {
+      if (errno == EINTR || errno == EAGAIN) {
+        clearerr(file);
+        continue;
+      } else {
+        fprintf(stderr, "Error reading file: %u of %u bytes read, %d: %s\n",
                (uint32_t)len, (uint32_t)size, errno, strerror(errno));
+        return 0;
+      }
+    }
+  } while (!feof(file) && len < size && ++retry_count < kMaxRetries);
+
+  if (!feof(file) && len != size) {
+    fprintf(stderr,
+            "Error reading file: %u of %u bytes read,"
+            " error: %d, retries: %d, %d: %s\n",
+            (uint32_t)len, (uint32_t)size, file_error, retry_count, errno,
+            strerror(errno));
+  }
+  return len == size;
+}
+
+static int y4m_parse_tags(y4m_input *_y4m, char *_tags) {
+  int got_w;
+  int got_h;
+  int got_fps;
+  int got_interlace;
+  int got_par;
+  int got_chroma;
+  char *p;
+  char *q;
+  got_w = got_h = got_fps = got_interlace = got_par = got_chroma = 0;
+  for (p = _tags;; p = q) {
+    /*Skip any leading spaces.*/
+    while (*p == ' ') p++;
+    /*If that's all we have, stop.*/
+    if (p[0] == '\0') break;
+    /*Find the end of this tag.*/
+    for (q = p + 1; *q != '\0' && *q != ' '; q++) {
+    }
+    /*Process the tag.*/
+    switch (p[0]) {
+      case 'W': {
+        if (sscanf(p + 1, "%d", &_y4m->pic_w) != 1) return -1;
+        got_w = 1;
+      } break;
+      case 'H': {
+        if (sscanf(p + 1, "%d", &_y4m->pic_h) != 1) return -1;
+        got_h = 1;
+      } break;
+      case 'F': {
+        if (sscanf(p + 1, "%d:%d", &_y4m->fps_n, &_y4m->fps_d) != 2) {
+          return -1;
+        }
+        got_fps = 1;
+      } break;
+      case 'I': {
+        _y4m->interlace = p[1];
+        got_interlace = 1;
+      } break;
+      case 'A': {
+        if (sscanf(p + 1, "%d:%d", &_y4m->par_n, &_y4m->par_d) != 2) {
+          return -1;
+        }
+        got_par = 1;
+      } break;
+      case 'C': {
+        if (q - p > 16) return -1;
+        memcpy(_y4m->chroma_type, p + 1, q - p - 1);
+        _y4m->chroma_type[q - p - 1] = '\0';
+        got_chroma = 1;
+      } break;
+        /*Ignore unknown tags.*/
+    }
+  }
+  if (!got_w || !got_h || !got_fps) return -1;
+  if (!got_interlace) _y4m->interlace = '?';
+  if (!got_par) _y4m->par_n = _y4m->par_d = 0;
+  /*Chroma-type is not specified in older files, e.g., those generated by
+     mplayer.*/
+  if (!got_chroma)
+    snprintf(_y4m->chroma_type, sizeof(_y4m->chroma_type), "420");
+  return 0;
+}
+
+/*All anti-aliasing filters in the following conversion functions are based on
+  one of two window functions:
+  The 6-tap Lanczos window (for down-sampling and shifts):
+   sinc(\pi*t)*sinc(\pi*t/3), |t|<3  (sinc(t)==sin(t)/t)
+   0,                         |t|>=3
+  The 4-tap Mitchell window (for up-sampling):
+   7|t|^3-12|t|^2+16/3,             |t|<1
+   -(7/3)|t|^3+12|t|^2-20|t|+32/3,  |t|<2
+   0,                               |t|>=2
+  The number of taps is intentionally kept small to reduce computational
+   overhead and limit ringing.
+
+  The taps from these filters are scaled so that their sum is 1, and the
+   result is scaled by 128 and rounded to integers to create a filter whose
+   intermediate values fit inside 16 bits.
+  Coefficients are rounded in such a way as to ensure their sum is still 128,
+   which is usually equivalent to normal rounding.
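+  For example, the quarter-pel shift filter used below, [4 -17 114 35 -9 1],
+   sums to exactly 128, so the final add of 64 and right-shift by 7 divides
+   the scale back out with correct rounding.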
+ + Conversions which require both horizontal and vertical filtering could + have these steps pipelined, for less memory consumption and better cache + performance, but we do them separately for simplicity.*/ +#define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a)) +#define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a)) +#define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c))) + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 420mpeg2 chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + BR | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + BR | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to shift the site locations one quarter pixel (at + the chroma plane's resolution) to the right. + The 4:2:2 modes look exactly the same, except there are twice as many chroma + lines, and they are vertically co-sited with the luma samples in both the + mpeg2 and jpeg cases (thus requiring no vertical resampling).*/ +static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, + const unsigned char *_src, int _c_w, + int _c_h) { + int y; + int x; + for (y = 0; y < _c_h; y++) { + /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos + window.*/ + for (x = 0; x < OC_MINI(_c_w, 2); x++) { + _dst[x] = (unsigned char)OC_CLAMPI( + 0, + (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] + + 64) >> + 7, + 255); + } + for (; x < _c_w - 3; x++) { + _dst[x] = (unsigned char)OC_CLAMPI( + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> + 7, + 255); + } + for (; x < _c_w; x++) { + _dst[x] = (unsigned char)OC_CLAMPI( + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> + 7, + 255); + } + _dst += _c_w; + _src += _c_w; + } +} + +/*Handles both 422 and 420mpeg2 to 422jpeg and 420jpeg, respectively.*/ +static void y4m_convert_42xmpeg2_42xjpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + int c_w; + int c_h; + int c_sz; + int pli; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + for (pli = 1; pli < 3; pli++) { + y4m_42xmpeg2_42xjpeg_helper(_dst, _aux, c_w, c_h); + _dst += c_sz; + _aux += c_sz; + } +} + +/*This format is only used for interlaced content, but is included for + completeness. 
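+  (420paldv is the chroma siting produced by PAL DV sources.)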
+ + 420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 420paldv chroma samples are sited like: + YR------Y-------YR------Y------- + | | | | + | | | | + | | | | + YB------Y-------YB------Y------- + | | | | + | | | | + | | | | + YR------Y-------YR------Y------- + | | | | + | | | | + | | | | + YB------Y-------YB------Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to shift the site locations one quarter pixel (at + the chroma plane's resolution) to the right. + Then we use another filter to move the C_r location down one quarter pixel, + and the C_b location up one quarter pixel.*/ +static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int pli; + int y; + int x; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + 1) / 2; + c_h = (_y4m->pic_h + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + c_sz = c_w * c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*First do the horizontal re-sampling. + This is the same as the mpeg2 case, except that after the horizontal + case, we need to apply a second vertical filter.*/ + y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h); + _aux += c_sz; + switch (pli) { + case 1: { + /*Slide C_b up a quarter-pel. + This is the same filter used above, but in the other order.*/ + for (x = 0; x < c_w; x++) { + for (y = 0; y < OC_MINI(c_h, 3); y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + + 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h - 2; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[(c_h - 1) * c_w] + 64) >> + 7, + 255); + } + _dst++; + tmp++; + } + _dst += c_sz - c_w; + tmp -= c_w; + } break; + case 2: { + /*Slide C_r down a quarter-pel. 
+ This is the same as the horizontal filter.*/ + for (x = 0; x < c_w; x++) { + for (y = 0; y < OC_MINI(c_h, 2); y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[0] - 17 * tmp[OC_MAXI(y - 1, 0) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + + tmp[OC_MINI(y + 3, c_h - 1) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h - 3; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - + 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> + 7, + 255); + } + for (; y < c_h; y++) { + _dst[y * c_w] = (unsigned char)OC_CLAMPI( + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] - + 9 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + tmp[(c_h - 1) * c_w] + + 64) >> + 7, + 255); + } + _dst++; + tmp++; + } + } break; + } + /*For actual interlaced material, this would have to be done separately on + each field, and the shift amounts would be different. + C_r moves down 1/8, C_b up 3/8 in the top field, and C_r moves down 3/8, + C_b up 1/8 in the bottom field. + The corresponding filters would be: + Down 1/8 (reverse order for up): [3 -11 125 15 -4 0]/128 + Down 3/8 (reverse order for up): [4 -19 98 56 -13 2]/128*/ + } +} + +/*Perform vertical filtering to reduce a single plane from 4:2:2 to 4:2:0. + This is used as a helper by several conversion routines.*/ +static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst, + const unsigned char *_src, int _c_w, + int _c_h) { + int y; + int x; + /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ + for (x = 0; x < _c_w; x++) { + for (y = 0; y < OC_MINI(_c_h, 2); y += 2) { + _dst[(y >> 1) * _c_w] = + OC_CLAMPI(0, + (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - + 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> + 7, + 255); + } + for (; y < _c_h - 3; y += 2) { + _dst[(y >> 1) * _c_w] = + OC_CLAMPI(0, + (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> + 7, + 255); + } + for (; y < _c_h; y += 2) { + _dst[(y >> 1) * _c_w] = OC_CLAMPI( + 0, + (3 * (_src[(y - 2) * _c_w] + _src[(_c_h - 1) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[OC_MINI(y + 2, _c_h - 1) * _c_w]) + + 78 * (_src[y * _c_w] + _src[OC_MINI(y + 1, _c_h - 1) * _c_w]) + + 64) >> + 7, + 255); + } + _src++; + _dst++; + } +} + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 422jpeg chroma samples are sited like: + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + Y---BR--Y-------Y---BR--Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to decimate the chroma planes by two in the + vertical direction.*/ +static void y4m_convert_422jpeg_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int pli; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * 
_y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = dst_c_w * dst_c_h; + for (pli = 1; pli < 3; pli++) { + y4m_422jpeg_420jpeg_helper(_dst, _aux, c_w, c_h); + _aux += c_sz; + _dst += dst_c_sz; + } +} + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 422 chroma samples are sited like: + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + YBR-----Y-------YBR-----Y------- + | | | | + | | | | + | | | | + + We use a resampling filter to shift the original site locations one quarter + pixel (at the original chroma resolution) to the right. + Then we use a second resampling filter to decimate the chroma planes by two + in the vertical direction.*/ +static void y4m_convert_422_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int dst_c_h; + int dst_c_sz; + int pli; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = c_w * dst_c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*In reality, the horizontal and vertical steps could be pipelined, for + less memory consumption and better cache performance, but we do them + separately for simplicity.*/ + /*First do horizontal filtering (convert to 422jpeg)*/ + y4m_42xmpeg2_42xjpeg_helper(tmp, _aux, c_w, c_h); + /*Now do the vertical filtering.*/ + y4m_422jpeg_420jpeg_helper(_dst, tmp, c_w, c_h); + _aux += c_sz; + _dst += dst_c_sz; + } +} + +/*420jpeg chroma samples are sited like: + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | BR | | BR | + | | | | + Y-------Y-------Y-------Y------- + | | | | + | | | | + | | | | + + 411 chroma samples are sited like: + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + YBR-----Y-------Y-------Y------- + | | | | + | | | | + | | | | + + We use a filter to resample at site locations one eighth pixel (at the source + chroma plane's horizontal resolution) and five eighths of a pixel to the + right. 
+ Then we use another filter to decimate the planes by 2 in the vertical + direction.*/ +static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int tmp_sz; + int pli; + int y; + int x; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = dst_c_w * dst_c_h; + tmp_sz = dst_c_w * c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*In reality, the horizontal and vertical steps could be pipelined, for + less memory consumption and better cache performance, but we do them + separately for simplicity.*/ + /*First do horizontal filtering (convert to 422jpeg)*/ + for (y = 0; y < c_h; y++) { + /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a + 4-tap Mitchell window.*/ + for (x = 0; x < OC_MINI(c_w, 1); x++) { + tmp[x << 1] = (unsigned char)OC_CLAMPI( + 0, + (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - + _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, + 255); + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - + 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, + 255); + } + for (; x < c_w - 2; x++) { + tmp[x << 1] = + (unsigned char)OC_CLAMPI(0, + (_aux[x - 1] + 110 * _aux[x] + + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> + 7, + 255); + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - + 5 * _aux[x + 2] + 64) >> + 7, + 255); + } + for (; x < c_w; x++) { + tmp[x << 1] = (unsigned char)OC_CLAMPI( + 0, + (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - + _aux[c_w - 1] + 64) >> + 7, + 255); + if ((x << 1 | 1) < dst_c_w) { + tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + + 86 * _aux[OC_MINI(x + 1, c_w - 1)] - 5 * _aux[c_w - 1] + 64) >> + 7, + 255); + } + } + tmp += dst_c_w; + _aux += c_w; + } + tmp -= tmp_sz; + /*Now do the vertical filtering.*/ + y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h); + _dst += dst_c_sz; + } +} + +/*Convert 444 to 420jpeg.*/ +static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + unsigned char *tmp; + int c_w; + int c_h; + int c_sz; + int dst_c_w; + int dst_c_h; + int dst_c_sz; + int tmp_sz; + int pli; + int y; + int x; + /*Skip past the luma data.*/ + _dst += _y4m->pic_w * _y4m->pic_h; + /*Compute the size of each chroma plane.*/ + c_w = (_y4m->pic_w + _y4m->src_c_dec_h - 1) / _y4m->src_c_dec_h; + c_h = _y4m->pic_h; + dst_c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h; + dst_c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v; + c_sz = c_w * c_h; + dst_c_sz = dst_c_w * dst_c_h; + tmp_sz = dst_c_w * c_h; + tmp = _aux + 2 * c_sz; + for (pli = 1; pli < 3; pli++) { + /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ + for (y = 0; y < c_h; y++) { + for (x = 0; x < OC_MINI(c_w, 2); x += 2) { + tmp[x >> 1] = OC_CLAMPI(0, + (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - + 17 * _aux[OC_MINI(2, c_w - 1)] + + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> + 7, + 255); + } + for (; x < c_w - 3; x += 2) { + tmp[x >> 1] = OC_CLAMPI(0, + (3 * 
(_aux[x - 2] + _aux[x + 3]) - + 17 * (_aux[x - 1] + _aux[x + 2]) + + 78 * (_aux[x] + _aux[x + 1]) + 64) >> + 7, + 255); + } + for (; x < c_w; x += 2) { + tmp[x >> 1] = + OC_CLAMPI(0, + (3 * (_aux[x - 2] + _aux[c_w - 1]) - + 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + + 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> + 7, + 255); + } + tmp += dst_c_w; + _aux += c_w; + } + tmp -= tmp_sz; + /*Now do the vertical filtering.*/ + y4m_422jpeg_420jpeg_helper(_dst, tmp, dst_c_w, c_h); + _dst += dst_c_sz; + } +} + +/*The image is padded with empty chroma components at 4:2:0.*/ +static void y4m_convert_mono_420jpeg(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + int c_sz; + (void)_aux; + _dst += _y4m->pic_w * _y4m->pic_h; + c_sz = ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * + ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); + memset(_dst, 128, c_sz * 2); +} + +/*No conversion function needed.*/ +static void y4m_convert_null(y4m_input *_y4m, unsigned char *_dst, + unsigned char *_aux) { + (void)_y4m; + (void)_dst; + (void)_aux; +} + +int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, + aom_chroma_sample_position_t csp, int only_420) { + char buffer[80] = { 0 }; + int ret; + int i; + /*Read until newline, or 80 cols, whichever happens first.*/ + for (i = 0; i < 79; i++) { + if (_nskip > 0) { + buffer[i] = *_skip++; + _nskip--; + } else { + if (!file_read(buffer + i, 1, _fin)) return -1; + } + if (buffer[i] == '\n') break; + } + /*We skipped too much header data.*/ + if (_nskip > 0) return -1; + if (i == 79) { + fprintf(stderr, "Error parsing header; not a YUV2MPEG2 file?\n"); + return -1; + } + buffer[i] = '\0'; + if (memcmp(buffer, "YUV4MPEG", 8)) { + fprintf(stderr, "Incomplete magic for YUV4MPEG file.\n"); + return -1; + } + if (buffer[8] != '2') { + fprintf(stderr, "Incorrect YUV input file version; YUV4MPEG2 required.\n"); + } + ret = y4m_parse_tags(_y4m, buffer + 5); + if (ret < 0) { + fprintf(stderr, "Error parsing YUV4MPEG2 header.\n"); + return ret; + } + if (_y4m->interlace == '?') { + fprintf(stderr, + "Warning: Input video interlacing format unknown; " + "assuming progressive scan.\n"); + } else if (_y4m->interlace != 'p') { + fprintf(stderr, + "Input video is interlaced; " + "Only progressive scan handled.\n"); + return -1; + } + /* Only support vertical chroma sample position if the input format is + * already 420mpeg2. Colocated is not supported in Y4M. + */ + if (csp == AOM_CSP_VERTICAL && strcmp(_y4m->chroma_type, "420mpeg2") != 0) { + fprintf(stderr, + "Vertical chroma sample position only supported " + "for 420mpeg2 input\n"); + return -1; + } + if (csp == AOM_CSP_COLOCATED) { + fprintf(stderr, "Colocated chroma sample position not supported in Y4M\n"); + return -1; + } + _y4m->aom_fmt = AOM_IMG_FMT_I420; + _y4m->bps = 12; + _y4m->bit_depth = 8; + if (strcmp(_y4m->chroma_type, "420") == 0 || + strcmp(_y4m->chroma_type, "420jpeg") == 0) { + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = + _y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + /* Natively supported: no conversion required. 
*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + } else if (strcmp(_y4m->chroma_type, "420p10") == 0) { + _y4m->src_c_dec_h = 2; + _y4m->dst_c_dec_h = 2; + _y4m->src_c_dec_v = 2; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = + 2 * (_y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); + /* Natively supported: no conversion required. */ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + _y4m->bit_depth = 10; + _y4m->bps = 15; + _y4m->aom_fmt = AOM_IMG_FMT_I42016; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 420p10 to 420jpeg\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "420p12") == 0) { + _y4m->src_c_dec_h = 2; + _y4m->dst_c_dec_h = 2; + _y4m->src_c_dec_v = 2; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = + 2 * (_y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2)); + /* Natively supported: no conversion required. */ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + _y4m->bit_depth = 12; + _y4m->bps = 18; + _y4m->aom_fmt = AOM_IMG_FMT_I42016; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 420p12 to 420jpeg\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "420mpeg2") == 0) { + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first.*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + _y4m->convert = y4m_convert_null; + if (csp != AOM_CSP_VERTICAL) { + _y4m->convert = y4m_convert_42xmpeg2_42xjpeg; + snprintf(_y4m->chroma_type, sizeof(_y4m->chroma_type), "420"); + } + } else if (strcmp(_y4m->chroma_type, "420paldv") == 0) { + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = _y4m->src_c_dec_v = + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + _y4m->aux_buf_sz = 3 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + _y4m->aux_buf_read_sz = + 2 * ((_y4m->pic_w + 1) / 2) * ((_y4m->pic_h + 1) / 2); + _y4m->convert = y4m_convert_42xpaldv_42xjpeg; + } else if (strcmp(_y4m->chroma_type, "422jpeg") == 0) { + _y4m->src_c_dec_h = _y4m->dst_c_dec_h = 2; + _y4m->src_c_dec_v = 1; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first.*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->convert = y4m_convert_422jpeg_420jpeg; + } else if (strcmp(_y4m->chroma_type, "422") == 0) { + _y4m->src_c_dec_h = 2; + _y4m->src_c_dec_v = 1; + if (only_420) { + _y4m->dst_c_dec_h = 2; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first. 
+ We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->aux_buf_sz = + _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->convert = y4m_convert_422_420jpeg; + } else { + _y4m->aom_fmt = AOM_IMG_FMT_I422; + _y4m->bps = 16; + _y4m->dst_c_dec_h = _y4m->src_c_dec_h; + _y4m->dst_c_dec_v = _y4m->src_c_dec_v; + _y4m->dst_buf_read_sz = + _y4m->pic_w * _y4m->pic_h + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + /*Natively supported: no conversion required.*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + } + } else if (strcmp(_y4m->chroma_type, "422p10") == 0) { + _y4m->src_c_dec_h = 2; + _y4m->src_c_dec_v = 1; + _y4m->aom_fmt = AOM_IMG_FMT_I42216; + _y4m->bps = 20; + _y4m->bit_depth = 10; + _y4m->dst_c_dec_h = _y4m->src_c_dec_h; + _y4m->dst_c_dec_v = _y4m->src_c_dec_v; + _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h); + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 422p10 to 420jpeg\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "422p12") == 0) { + _y4m->src_c_dec_h = 2; + _y4m->src_c_dec_v = 1; + _y4m->aom_fmt = AOM_IMG_FMT_I42216; + _y4m->bps = 24; + _y4m->bit_depth = 12; + _y4m->dst_c_dec_h = _y4m->src_c_dec_h; + _y4m->dst_c_dec_v = _y4m->src_c_dec_v; + _y4m->dst_buf_read_sz = 2 * (_y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + 1) / 2) * _y4m->pic_h); + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 422p12 to 420jpeg\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "411") == 0) { + _y4m->src_c_dec_h = 4; + _y4m->dst_c_dec_h = 2; + _y4m->src_c_dec_v = 1; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + _y4m->aux_buf_read_sz = 2 * ((_y4m->pic_w + 3) / 4) * _y4m->pic_h; + _y4m->aux_buf_sz = + _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->convert = y4m_convert_411_420jpeg; + } else if (strcmp(_y4m->chroma_type, "444") == 0) { + _y4m->src_c_dec_h = 1; + _y4m->src_c_dec_v = 1; + if (only_420) { + _y4m->dst_c_dec_h = 2; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first. 
+ We need to make two filter passes, so we need some extra space in the + aux buffer.*/ + _y4m->aux_buf_read_sz = 2 * _y4m->pic_w * _y4m->pic_h; + _y4m->aux_buf_sz = + _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; + _y4m->convert = y4m_convert_444_420jpeg; + } else { + _y4m->aom_fmt = AOM_IMG_FMT_I444; + _y4m->bps = 24; + _y4m->dst_c_dec_h = _y4m->src_c_dec_h; + _y4m->dst_c_dec_v = _y4m->src_c_dec_v; + _y4m->dst_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h; + /*Natively supported: no conversion required.*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + } + } else if (strcmp(_y4m->chroma_type, "444p10") == 0) { + _y4m->src_c_dec_h = 1; + _y4m->src_c_dec_v = 1; + _y4m->aom_fmt = AOM_IMG_FMT_I44416; + _y4m->bps = 30; + _y4m->bit_depth = 10; + _y4m->dst_c_dec_h = _y4m->src_c_dec_h; + _y4m->dst_c_dec_v = _y4m->src_c_dec_v; + _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h; + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 444p10 to 420jpeg\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "444p12") == 0) { + _y4m->src_c_dec_h = 1; + _y4m->src_c_dec_v = 1; + _y4m->aom_fmt = AOM_IMG_FMT_I44416; + _y4m->bps = 36; + _y4m->bit_depth = 12; + _y4m->dst_c_dec_h = _y4m->src_c_dec_h; + _y4m->dst_c_dec_v = _y4m->src_c_dec_v; + _y4m->dst_buf_read_sz = 2 * 3 * _y4m->pic_w * _y4m->pic_h; + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_null; + if (only_420) { + fprintf(stderr, "Unsupported conversion from 444p12 to 420jpeg\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "444alpha") == 0) { + _y4m->src_c_dec_h = 1; + _y4m->src_c_dec_v = 1; + if (only_420) { + _y4m->dst_c_dec_h = 2; + _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*Chroma filter required: read into the aux buf first. + We need to make two filter passes, so we need some extra space in the + aux buffer. + The extra plane also gets read into the aux buf. 
+ It will be discarded.*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 3 * _y4m->pic_w * _y4m->pic_h; + _y4m->convert = y4m_convert_444_420jpeg; + } else { + fprintf(stderr, "Unsupported format: 444A\n"); + return -1; + } + } else if (strcmp(_y4m->chroma_type, "mono") == 0) { + _y4m->src_c_dec_h = _y4m->src_c_dec_v = 0; + _y4m->dst_c_dec_h = _y4m->dst_c_dec_v = 2; + _y4m->dst_buf_read_sz = _y4m->pic_w * _y4m->pic_h; + /*No extra space required, but we need to clear the chroma planes.*/ + _y4m->aux_buf_sz = _y4m->aux_buf_read_sz = 0; + _y4m->convert = y4m_convert_mono_420jpeg; + } else { + fprintf(stderr, "Unknown chroma sampling type: %s\n", _y4m->chroma_type); + return -1; + } + /*The size of the final frame buffers is always computed from the + destination chroma decimation type.*/ + _y4m->dst_buf_sz = + _y4m->pic_w * _y4m->pic_h + + 2 * ((_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h) * + ((_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v); + if (_y4m->bit_depth == 8) + _y4m->dst_buf = (unsigned char *)malloc(_y4m->dst_buf_sz); + else + _y4m->dst_buf = (unsigned char *)malloc(2 * _y4m->dst_buf_sz); + + if (_y4m->aux_buf_sz > 0) + _y4m->aux_buf = (unsigned char *)malloc(_y4m->aux_buf_sz); + return 0; +} + +void y4m_input_close(y4m_input *_y4m) { + free(_y4m->dst_buf); + free(_y4m->aux_buf); +} + +int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *_img) { + char frame[6]; + int pic_sz; + int c_w; + int c_h; + int c_sz; + int bytes_per_sample = _y4m->bit_depth > 8 ? 2 : 1; + /*Read and skip the frame header.*/ + if (!file_read(frame, 6, _fin)) return 0; + if (memcmp(frame, "FRAME", 5)) { + fprintf(stderr, "Loss of framing in Y4M input data\n"); + return -1; + } + if (frame[5] != '\n') { + char c; + int j; + for (j = 0; j < 79 && file_read(&c, 1, _fin) && c != '\n'; j++) { + } + if (j == 79) { + fprintf(stderr, "Error parsing Y4M frame header\n"); + return -1; + } + } + /*Read the frame data that needs no conversion.*/ + if (!file_read(_y4m->dst_buf, _y4m->dst_buf_read_sz, _fin)) { + fprintf(stderr, "Error reading Y4M frame data.\n"); + return -1; + } + /*Read the frame data that does need conversion.*/ + if (!file_read(_y4m->aux_buf, _y4m->aux_buf_read_sz, _fin)) { + fprintf(stderr, "Error reading Y4M frame data.\n"); + return -1; + } + /*Now convert the just read frame.*/ + (*_y4m->convert)(_y4m, _y4m->dst_buf, _y4m->aux_buf); + /*Fill in the frame buffer pointers. 
+    We don't use aom_img_wrap() because it forces padding for odd picture
+    sizes, which would require a separate fread call for every row.*/
+  memset(_img, 0, sizeof(*_img));
+  /*Y4M has the planes in Y'CbCr order, which libaom calls Y, U, and V.*/
+  _img->fmt = _y4m->aom_fmt;
+  _img->w = _img->d_w = _y4m->pic_w;
+  _img->h = _img->d_h = _y4m->pic_h;
+  _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
+  _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
+  _img->bps = _y4m->bps;
+
+  /*Set up the buffer pointers.*/
+  pic_sz = _y4m->pic_w * _y4m->pic_h * bytes_per_sample;
+  c_w = (_y4m->pic_w + _y4m->dst_c_dec_h - 1) / _y4m->dst_c_dec_h;
+  c_w *= bytes_per_sample;
+  c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
+  c_sz = c_w * c_h;
+  _img->stride[AOM_PLANE_Y] = _y4m->pic_w * bytes_per_sample;
+  _img->stride[AOM_PLANE_U] = _img->stride[AOM_PLANE_V] = c_w;
+  _img->planes[AOM_PLANE_Y] = _y4m->dst_buf;
+  _img->planes[AOM_PLANE_U] = _y4m->dst_buf + pic_sz;
+  _img->planes[AOM_PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+  return 1;
+}
diff --git a/libs/libaom/src/common/y4minput.h b/libs/libaom/src/common/y4minput.h
new file mode 100644
index 000000000..f6c5a3d3a
--- /dev/null
+++ b/libs/libaom/src/common/y4minput.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * Based on code from the OggTheora software codec source code,
+ * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors.
+ */
+
+#ifndef AOM_COMMON_Y4MINPUT_H_
+#define AOM_COMMON_Y4MINPUT_H_
+
+#include <stdio.h>
+#include "aom/aom_image.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct y4m_input y4m_input;
+
+/*The function used to perform chroma conversion.*/
+typedef void (*y4m_convert_func)(y4m_input *_y4m, unsigned char *_dst,
+                                 unsigned char *_src);
+
+struct y4m_input {
+  int pic_w;
+  int pic_h;
+  int fps_n;
+  int fps_d;
+  int par_n;
+  int par_d;
+  char interlace;
+  int src_c_dec_h;
+  int src_c_dec_v;
+  int dst_c_dec_h;
+  int dst_c_dec_v;
+  char chroma_type[16];
+  /*The size of each converted frame buffer.*/
+  size_t dst_buf_sz;
+  /*The amount to read directly into the converted frame buffer.*/
+  size_t dst_buf_read_sz;
+  /*The size of the auxiliary buffer.*/
+  size_t aux_buf_sz;
+  /*The amount to read into the auxiliary buffer.*/
+  size_t aux_buf_read_sz;
+  y4m_convert_func convert;
+  unsigned char *dst_buf;
+  unsigned char *aux_buf;
+  enum aom_img_fmt aom_fmt;
+  int bps;
+  unsigned int bit_depth;
+};
+
+/**
+ * Open the input file, treating it as Y4M. y4m_input is filled in after
+ * reading it. Note that chroma-sample-position should only be set for 420
+ * input, and the input chroma is shifted if necessary. The code does not
+ * support the conversion from co-located to vertical.
+ */ +int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, + aom_chroma_sample_position_t csp, int only_420); +void y4m_input_close(y4m_input *_y4m); +int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, aom_image_t *img); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_Y4MINPUT_H_ diff --git a/libs/libaom/src/docs.cmake b/libs/libaom/src/docs.cmake new file mode 100644 index 000000000..28ca5c026 --- /dev/null +++ b/libs/libaom/src/docs.cmake @@ -0,0 +1,257 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_DOCS_CMAKE_) + return() +endif() # AOM_DOCS_CMAKE_ +set(AOM_DOCS_CMAKE_ 1) + +cmake_minimum_required(VERSION 3.5) + +set(AOM_DOXYFILE "${AOM_CONFIG_DIR}/doxyfile") +set(AOM_DOXYGEN_CONFIG_TEMPLATE "libs.doxy_template") +set(AOM_DOXYGEN_OUTPUT_DIR "${AOM_CONFIG_DIR}/dox") +set(AOM_DOXYGEN_SECTIONS "av1") + +set(AOM_DOXYGEN_SOURCES "${AOM_ROOT}/aom/aom.h" "${AOM_ROOT}/aom/aom_codec.h" + "${AOM_ROOT}/aom/aom_decoder.h" + "${AOM_ROOT}/aom/aom_encoder.h" + "${AOM_ROOT}/aom/aom_frame_buffer.h" + "${AOM_ROOT}/aom/aom_image.h" + "${AOM_ROOT}/aom/aom_integer.h" + "${AOM_ROOT}/keywords.dox" "${AOM_ROOT}/mainpage.dox" + "${AOM_ROOT}/usage.dox") + +if(CONFIG_AV1_DECODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/apps/aomdec.c" + "${AOM_ROOT}/examples/decode_to_md5.c" + "${AOM_ROOT}/examples/decode_with_drops.c" + "${AOM_ROOT}/examples/simple_decoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Full featured decoder." + "Frame by frame MD5 checksum." + "Drops frames while decoding." + "Simplified decoder loop.") + + set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_decoder decoder") + + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomdx.h" + "${AOM_ROOT}/usage_dx.dox") + + if(CONFIG_ANALYZER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/analyzer.cc") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Bitstream analyzer.") + endif() + + if(CONFIG_INSPECTION) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/inspect.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Bitstream inspector.") + endif() +endif() + +if(CONFIG_AV1_ENCODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/apps/aomenc.c" + "${AOM_ROOT}/examples/lossless_encoder.c" + "${AOM_ROOT}/examples/set_maps.c" + "${AOM_ROOT}/examples/simple_encoder.c" + "${AOM_ROOT}/examples/twopass_encoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Full featured encoder." + "Simplified lossless encoder." + "Set active and ROI maps." + "Simplified encoder loop." 
+ "Two-pass encoder loop.") + + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/scalable_encoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Scalable encoder loop.") + + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/svc_encoder_rtc.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Layered encoder for RTC.") + + set(AOM_DOXYGEN_SECTIONS ${AOM_DOXYGEN_SECTIONS} "av1_encoder encoder") + + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${AOM_ROOT}/aom/aomcx.h" + "${AOM_ROOT}/usage_cx.dox") +endif() + +if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/aom_cx_set_ref.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Set encoder reference frame.") +endif() + +if(CONFIG_AV1_ENCODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/lightfield_encoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Lightfield encoder example.") +endif() + +if(CONFIG_AV1_DECODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES + ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/lightfield_tile_list_decoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Lightfield tile list decoder example.") +endif() + +if(CONFIG_AV1_DECODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/lightfield_decoder.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Lightfield decoder example.") +endif() + +if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) + set(AOM_DOXYGEN_EXAMPLE_SOURCES + ${AOM_DOXYGEN_EXAMPLE_SOURCES} + "${AOM_ROOT}/examples/lightfield_bitstream_parsing.c") + + set(AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS} + "Lightfield bitstream parsing example.") +endif() + +# Iterates over list named by $list_name and appends each item to $AOM_DOXYFILE +# as values assigned to $var_name with no line breaks between list items. +# Appends a new line after the entire config variable is expanded. +function(write_cmake_list_to_doxygen_config_var var_name list_name) + unset(output_string) + foreach(list_item ${${list_name}}) + set(output_string "${output_string} ${list_item} ") + endforeach() + string(STRIP "${output_string}" output_string) + file(APPEND "${AOM_DOXYFILE}" "${var_name} += ${output_string}\n") +endfunction() + +function(get_name file_path name_var) + get_filename_component(file_basename ${file_path} NAME) + get_filename_component(${name_var} ${file_basename} NAME_WE) + set(${name_var} ${${name_var}} PARENT_SCOPE) +endfunction() + +function(setup_documentation_targets) + + # Sanity check: the lengths of these lists must match. + list(LENGTH AOM_DOXYGEN_EXAMPLE_SOURCES num_sources) + list(LENGTH AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS num_descs) + if(NOT ${num_sources} EQUAL ${num_descs}) + message(FATAL_ERROR "Unqeual example and description totals.") + endif() + + # Take the list of examples and produce example_basename.dox for each file in + # the list. 
+ file(MAKE_DIRECTORY "${AOM_DOXYGEN_OUTPUT_DIR}") + foreach(example_file ${AOM_DOXYGEN_EXAMPLE_SOURCES}) + unset(example_basename) + get_name("${example_file}" "example_name") + set(example_dox "${AOM_DOXYGEN_OUTPUT_DIR}/${example_name}.dox") + set(dox_string "/*!\\page example_${example_name} ${example_name}\n") + set(dox_string "${dox_string} \\includelineno ${example_file}\n*/\n") + file(WRITE "${example_dox}" ${dox_string}) + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} "${example_dox}") + endforeach() + + # Generate samples.dox, an index page that refers to the example_basename.dox + # files that were just created. + set(samples_header " +/*!\\page samples Sample Code +This SDK includes a number of sample applications. Each sample documents a +feature of the SDK in both prose and the associated C code. The following +samples are included: +") + + set(utils_desc " +In addition, the SDK contains a number of utilities. Since these utilities are +built upon the concepts described in the sample code listed above, they are not +documented in pieces like the samples are. Their source is included here for +reference. The following utilities are included: +") + + # Write the description for the samples section. + set(samples_dox "${AOM_CONFIG_DIR}/samples.dox") + file(WRITE "${samples_dox}" "${samples_header}\n") + + # Iterate over $AOM_DOXYGEN_EXAMPLE_SOURCES and + # $AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS and massage example names as required by + # AV1's doxygen setup. + math(EXPR max_example_index "${num_sources} - 1") + foreach(NUM RANGE ${max_example_index}) + list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${NUM} ex_name) + get_name("${ex_name}" "ex_name") + + # AV1's doxygen lists aomdec and aomenc as utils apart from the examples. + # Save the indexes for another pass. + if("${ex_name}" MATCHES "aomdec\|aomenc") + set(util_indexes "${util_indexes}" "${NUM}") + continue() + endif() + list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${NUM} ex_desc) + file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n") + endforeach() + + # Write the description and index for the utils. + file(APPEND "${samples_dox}" "${utils_desc}\n") + foreach(util_index ${util_indexes}) + list(GET AOM_DOXYGEN_EXAMPLE_SOURCES ${util_index} ex_name) + get_name("${ex_name}" "ex_name") + list(GET AOM_DOXYGEN_EXAMPLE_DESCRIPTIONS ${util_index} ex_desc) + file(APPEND "${samples_dox}" " - \\subpage example_${ex_name} ${ex_desc}\n") + endforeach() + file(APPEND "${samples_dox}" "*/") + + # Add $samples_dox to the doxygen inputs. + get_filename_component(samples_dox ${samples_dox} NAME) + set(AOM_DOXYGEN_SOURCES ${AOM_DOXYGEN_SOURCES} ${samples_dox}) + + # Generate libaom's doxyfile. + file(WRITE "${AOM_DOXYFILE}" "##\n## GENERATED FILE. DO NOT EDIT\n##\n") + file(READ "${AOM_ROOT}/${AOM_DOXYGEN_CONFIG_TEMPLATE}" doxygen_template_data) + file(APPEND "${AOM_DOXYFILE}" ${doxygen_template_data}) + file(APPEND "${AOM_DOXYFILE}" + "EXAMPLE_PATH += ${AOM_ROOT} ${AOM_ROOT}/examples\n") + file(APPEND "${AOM_DOXYFILE}" + "INCLUDE_PATH += ${AOM_CONFIG_DIR} ${AOM_ROOT}\n") + file(APPEND "${AOM_DOXYFILE}" + "STRIP_FROM_PATH += ${AOM_ROOT} ${AOM_CONFIG_DIR}\n") + write_cmake_list_to_doxygen_config_var("INPUT" "AOM_DOXYGEN_SOURCES") + write_cmake_list_to_doxygen_config_var("ENABLED_SECTIONS" + "AOM_DOXYGEN_SECTIONS") + + # Add the doxygen generation rule. 
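+  # (Per its DEPENDS list, the "docs" target re-runs Doxygen whenever the
+  # generated doxyfile, the doxyfile template, or any listed source changes.)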
+  add_custom_target(docs ALL
+                    COMMAND "${DOXYGEN_EXECUTABLE}" "${AOM_DOXYFILE}"
+                    DEPENDS "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES}
+                            ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+                            "${AOM_DOXYGEN_CONFIG_TEMPLATE}"
+                    SOURCES "${AOM_DOXYFILE}" ${AOM_DOXYGEN_SOURCES}
+                            ${AOM_DOXYGEN_EXAMPLE_SOURCES}
+                            "${AOM_DOXYGEN_CONFIG_TEMPLATE}")
+endfunction()
diff --git a/libs/libaom/src/examples/analyzer.cc b/libs/libaom/src/examples/analyzer.cc
new file mode 100644
index 000000000..35988211e
--- /dev/null
+++ b/libs/libaom/src/examples/analyzer.cc
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <wx/aboutdlg.h>
+#include <wx/cmdline.h>
+#include <wx/dcbuffer.h>
+#include <wx/wx.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/decoder/accounting.h"
+#include "av1/decoder/inspection.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+#define OD_SIGNMASK(a) (-((a) < 0))
+#define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
+#define OD_DIV_ROUND(x, y) (((x) + OD_FLIPSIGNI((y) >> 1, x)) / (y))
+
+enum {
+  OD_LUMA_MASK = 1 << 0,
+  OD_CB_MASK = 1 << 1,
+  OD_CR_MASK = 1 << 2,
+  OD_ALL_MASK = OD_LUMA_MASK | OD_CB_MASK | OD_CR_MASK
+};
+
+class AV1Decoder {
+ private:
+  FILE *input;
+  wxString path;
+
+  AvxVideoReader *reader;
+  const AvxVideoInfo *info;
+  const AvxInterface *decoder;
+
+  insp_frame_data frame_data;
+
+  aom_codec_ctx_t codec;
+  bool show_padding;
+
+ public:
+  aom_image_t *image;
+  int frame;
+
+  int plane_mask;
+
+  AV1Decoder();
+  ~AV1Decoder();
+
+  bool open(const wxString &path);
+  void close();
+  bool step();
+
+  int getWidthPadding() const;
+  int getHeightPadding() const;
+  void togglePadding();
+  int getWidth() const;
+  int getHeight() const;
+
+  bool getAccountingStruct(Accounting **acct);
+  bool setInspectionCallback();
+
+  static void inspect(void *decoder, void *data);
+};
+
+AV1Decoder::AV1Decoder()
+    : reader(NULL), info(NULL), decoder(NULL), show_padding(false), image(NULL),
+      frame(0) {}
+
+AV1Decoder::~AV1Decoder() {}
+
+void AV1Decoder::togglePadding() { show_padding = !show_padding; }
+
+bool AV1Decoder::open(const wxString &path) {
+  reader = aom_video_reader_open(path.mb_str());
+  if (!reader) {
+    fprintf(stderr, "Failed to open %s for reading.", path.mb_str().data());
+    return false;
+  }
+  this->path = path;
+  info = aom_video_reader_get_info(reader);
+  decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
+  if (!decoder) {
+    fprintf(stderr, "Unknown input codec.");
+    return false;
+  }
+  printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
+  if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) {
+    fprintf(stderr, "Failed to initialize decoder.");
+    return false;
+  }
+  ifd_init(&frame_data, info->frame_width, info->frame_height);
+  setInspectionCallback();
+  return true;
+}
+
+void AV1Decoder::close() {}
+
+bool AV1Decoder::step() {
+  if (aom_video_reader_read_frame(reader)) {
+    size_t frame_size;
+    const unsigned char *frame_data;
+    frame_data = aom_video_reader_get_frame(reader, &frame_size);
+    if (aom_codec_decode(&codec, frame_data, frame_size, NULL)) {
+      fprintf(stderr, "Failed to decode frame.");
+      return false;
+    } else {
+      aom_codec_iter_t iter = NULL;
+      image = aom_codec_get_frame(&codec, &iter);
+      if (image != NULL) {
+        frame++;
+        return true;
+      }
+      return false;
+    }
+  }
+  return false;
+}
+
+int AV1Decoder::getWidth() const {
+  return info->frame_width + 2 * getWidthPadding();
+}
+
+int AV1Decoder::getWidthPadding() const {
+  return show_padding ? AOMMAX(info->frame_width + 16,
+                               ALIGN_POWER_OF_TWO(info->frame_width, 6)) -
+                            info->frame_width
+                      : 0;
+}
+
+int AV1Decoder::getHeight() const {
+  return info->frame_height + 2 * getHeightPadding();
+}
+
+int AV1Decoder::getHeightPadding() const {
+  return show_padding ? AOMMAX(info->frame_height + 16,
+                               ALIGN_POWER_OF_TWO(info->frame_height, 6)) -
+                            info->frame_height
+                      : 0;
+}
+
+bool AV1Decoder::getAccountingStruct(Accounting **accounting) {
+  return aom_codec_control(&codec, AV1_GET_ACCOUNTING, accounting) ==
+         AOM_CODEC_OK;
+}
+
+bool AV1Decoder::setInspectionCallback() {
+  aom_inspect_init ii;
+  ii.inspect_cb = AV1Decoder::inspect;
+  ii.inspect_ctx = (void *)this;
+  return aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &ii) ==
+         AOM_CODEC_OK;
+}
+
+void AV1Decoder::inspect(void *pbi, void *data) {
+  AV1Decoder *decoder = (AV1Decoder *)data;
+  ifd_inspect(&decoder->frame_data, pbi, 0);
+}
+
+#define MIN_ZOOM (1)
+#define MAX_ZOOM (4)
+
+class AnalyzerPanel : public wxPanel {
+  DECLARE_EVENT_TABLE()
+
+ private:
+  AV1Decoder decoder;
+  const wxString path;
+
+  int zoom;
+  unsigned char *pixels;
+
+  const bool bit_accounting;
+  double *bpp_q3;
+
+  int plane_mask;
+
+  // The display size is the decode size, scaled by the zoom.
+  int getDisplayWidth() const;
+  int getDisplayHeight() const;
+
+  bool updateDisplaySize();
+
+  void computeBitsPerPixel();
+
+ public:
+  AnalyzerPanel(wxWindow *parent, const wxString &path,
+                const bool bit_accounting);
+  ~AnalyzerPanel();
+
+  bool open(const wxString &path);
+  void close();
+  void render();
+  void togglePadding();
+  bool nextFrame();
+  void refresh();
+
+  int getZoom() const;
+  bool setZoom(int zoom);
+
+  void setShowPlane(bool show_plane, int mask);
+
+  void onPaint(wxPaintEvent &event);  // NOLINT
+};
+
+BEGIN_EVENT_TABLE(AnalyzerPanel, wxPanel)
+EVT_PAINT(AnalyzerPanel::onPaint)
+END_EVENT_TABLE()
+
+AnalyzerPanel::AnalyzerPanel(wxWindow *parent, const wxString &path,
+                             const bool bit_accounting)
+    : wxPanel(parent), path(path), zoom(0), pixels(NULL),
+      bit_accounting(bit_accounting), bpp_q3(NULL), plane_mask(OD_ALL_MASK) {}
+
+AnalyzerPanel::~AnalyzerPanel() { close(); }
+
+void AnalyzerPanel::setShowPlane(bool show_plane, int mask) {
+  if (show_plane) {
+    plane_mask |= mask;
+  } else {
+    plane_mask &= ~mask;
+  }
+}
+
+void AnalyzerPanel::render() {
+  aom_image_t *img = decoder.image;
+  const int hbd = !!(img->fmt & AOM_IMG_FMT_HIGHBITDEPTH);
+  int y_stride = img->stride[0] >> hbd;
+  int cb_stride = img->stride[1] >> hbd;
+  int cr_stride = img->stride[2] >> hbd;
+  int p_stride = 3 * getDisplayWidth();
+  unsigned char *y_row = img->planes[0];
+  unsigned char *cb_row = img->planes[1];
+  unsigned char *cr_row = img->planes[2];
+  uint16_t *y_row16 = reinterpret_cast<uint16_t *>(y_row);
+  uint16_t *cb_row16 = reinterpret_cast<uint16_t *>(cb_row);
+  uint16_t *cr_row16 = reinterpret_cast<uint16_t *>(cr_row);
+  unsigned char *p_row = pixels;
+  int y_width_padding = decoder.getWidthPadding();
+  int cb_width_padding = y_width_padding >> 1;
+  int cr_width_padding = y_width_padding >> 1;
+  int y_height_padding =
decoder.getHeightPadding(); + int cb_height_padding = y_height_padding >> 1; + int cr_height_padding = y_height_padding >> 1; + for (int j = 0; j < decoder.getHeight(); j++) { + unsigned char *y = y_row - y_stride * y_height_padding; + unsigned char *cb = cb_row - cb_stride * cb_height_padding; + unsigned char *cr = cr_row - cr_stride * cr_height_padding; + uint16_t *y16 = y_row16 - y_stride * y_height_padding; + uint16_t *cb16 = cb_row16 - cb_stride * cb_height_padding; + uint16_t *cr16 = cr_row16 - cr_stride * cr_height_padding; + unsigned char *p = p_row; + for (int i = 0; i < decoder.getWidth(); i++) { + int64_t yval; + int64_t cbval; + int64_t crval; + int pmask; + unsigned rval; + unsigned gval; + unsigned bval; + if (hbd) { + yval = *(y16 - y_width_padding); + cbval = *(cb16 - cb_width_padding); + crval = *(cr16 - cr_width_padding); + } else { + yval = *(y - y_width_padding); + cbval = *(cb - cb_width_padding); + crval = *(cr - cr_width_padding); + } + pmask = plane_mask; + if (pmask & OD_LUMA_MASK) { + yval -= 16; + } else { + yval = 128; + } + cbval = ((pmask & OD_CB_MASK) >> 1) * (cbval - 128); + crval = ((pmask & OD_CR_MASK) >> 2) * (crval - 128); + /*This is intentionally slow and very accurate.*/ + rval = OD_CLAMPI( + 0, + (int32_t)OD_DIV_ROUND( + 2916394880000LL * yval + 4490222169144LL * crval, 9745792000LL), + 65535); + gval = OD_CLAMPI(0, + (int32_t)OD_DIV_ROUND(2916394880000LL * yval - + 534117096223LL * cbval - + 1334761232047LL * crval, + 9745792000LL), + 65535); + bval = OD_CLAMPI( + 0, + (int32_t)OD_DIV_ROUND( + 2916394880000LL * yval + 5290866304968LL * cbval, 9745792000LL), + 65535); + unsigned char *px_row = p; + for (int v = 0; v < zoom; v++) { + unsigned char *px = px_row; + for (int u = 0; u < zoom; u++) { + *(px + 0) = (unsigned char)(rval >> 8); + *(px + 1) = (unsigned char)(gval >> 8); + *(px + 2) = (unsigned char)(bval >> 8); + px += 3; + } + px_row += p_stride; + } + if (hbd) { + int dc = ((y16 - y_row16) & 1) | (1 - img->x_chroma_shift); + y16++; + cb16 += dc; + cr16 += dc; + } else { + int dc = ((y - y_row) & 1) | (1 - img->x_chroma_shift); + y++; + cb += dc; + cr += dc; + } + p += zoom * 3; + } + int dc = -((j & 1) | (1 - img->y_chroma_shift)); + if (hbd) { + y_row16 += y_stride; + cb_row16 += dc & cb_stride; + cr_row16 += dc & cr_stride; + } else { + y_row += y_stride; + cb_row += dc & cb_stride; + cr_row += dc & cr_stride; + } + p_row += zoom * p_stride; + } +} + +void AnalyzerPanel::computeBitsPerPixel() { + Accounting *acct; + double bpp_total; + int totals_q3[MAX_SYMBOL_TYPES] = { 0 }; + int sym_count[MAX_SYMBOL_TYPES] = { 0 }; + decoder.getAccountingStruct(&acct); + for (int j = 0; j < decoder.getHeight(); j++) { + for (int i = 0; i < decoder.getWidth(); i++) { + bpp_q3[j * decoder.getWidth() + i] = 0.0; + } + } + bpp_total = 0; + for (int i = 0; i < acct->syms.num_syms; i++) { + AccountingSymbol *s; + s = &acct->syms.syms[i]; + totals_q3[s->id] += s->bits; + sym_count[s->id] += s->samples; + } + printf("=== Frame: %-3i ===\n", decoder.frame - 1); + for (int i = 0; i < acct->syms.dictionary.num_strs; i++) { + if (totals_q3[i]) { + printf("%30s = %10.3f (%f bit/symbol)\n", acct->syms.dictionary.strs[i], + (float)totals_q3[i] / 8, (float)totals_q3[i] / 8 / sym_count[i]); + } + } + printf("\n"); +} + +void AnalyzerPanel::togglePadding() { + decoder.togglePadding(); + updateDisplaySize(); +} + +bool AnalyzerPanel::nextFrame() { + if (decoder.step()) { + refresh(); + return true; + } + return false; +} + +void AnalyzerPanel::refresh() { + if 
(bit_accounting) { + computeBitsPerPixel(); + } + render(); +} + +int AnalyzerPanel::getDisplayWidth() const { return zoom * decoder.getWidth(); } + +int AnalyzerPanel::getDisplayHeight() const { + return zoom * decoder.getHeight(); +} + +bool AnalyzerPanel::updateDisplaySize() { + unsigned char *p = (unsigned char *)malloc( + sizeof(*p) * 3 * getDisplayWidth() * getDisplayHeight()); + if (p == NULL) { + return false; + } + free(pixels); + pixels = p; + SetSize(getDisplayWidth(), getDisplayHeight()); + return true; +} + +bool AnalyzerPanel::open(const wxString &path) { + if (!decoder.open(path)) { + return false; + } + if (!setZoom(MIN_ZOOM)) { + return false; + } + if (bit_accounting) { + bpp_q3 = (double *)malloc(sizeof(*bpp_q3) * decoder.getWidth() * + decoder.getHeight()); + if (bpp_q3 == NULL) { + fprintf(stderr, "Could not allocate memory for bit accounting\n"); + close(); + return false; + } + } + if (!nextFrame()) { + close(); + return false; + } + SetFocus(); + return true; +} + +void AnalyzerPanel::close() { + decoder.close(); + free(pixels); + pixels = NULL; + free(bpp_q3); + bpp_q3 = NULL; +} + +int AnalyzerPanel::getZoom() const { return zoom; } + +bool AnalyzerPanel::setZoom(int z) { + if (z <= MAX_ZOOM && z >= MIN_ZOOM && zoom != z) { + int old_zoom = zoom; + zoom = z; + if (!updateDisplaySize()) { + zoom = old_zoom; + return false; + } + return true; + } + return false; +} + +void AnalyzerPanel::onPaint(wxPaintEvent &) { + wxBitmap bmp(wxImage(getDisplayWidth(), getDisplayHeight(), pixels, true)); + wxBufferedPaintDC dc(this, bmp); +} + +class AnalyzerFrame : public wxFrame { + DECLARE_EVENT_TABLE() + + private: + AnalyzerPanel *panel; + const bool bit_accounting; + + wxMenu *fileMenu; + wxMenu *viewMenu; + wxMenu *playbackMenu; + + public: + AnalyzerFrame(const bool bit_accounting); // NOLINT + + void onOpen(wxCommandEvent &event); // NOLINT + void onClose(wxCommandEvent &event); // NOLINT + void onQuit(wxCommandEvent &event); // NOLINT + + void onTogglePadding(wxCommandEvent &event); // NOLINT + void onZoomIn(wxCommandEvent &event); // NOLINT + void onZoomOut(wxCommandEvent &event); // NOLINT + void onActualSize(wxCommandEvent &event); // NOLINT + + void onToggleViewMenuCheckBox(wxCommandEvent &event); // NOLINT + void onResetAndToggleViewMenuCheckBox(wxCommandEvent &event); // NOLINT + + void onNextFrame(wxCommandEvent &event); // NOLINT + void onGotoFrame(wxCommandEvent &event); // NOLINT + void onRestart(wxCommandEvent &event); // NOLINT + + void onAbout(wxCommandEvent &event); // NOLINT + + bool open(const wxString &path); + bool setZoom(int zoom); + void updateViewMenu(); +}; + +enum { + wxID_NEXT_FRAME = 6000, + wxID_SHOW_Y, + wxID_SHOW_U, + wxID_SHOW_V, + wxID_GOTO_FRAME, + wxID_RESTART, + wxID_ACTUAL_SIZE, + wxID_PADDING +}; + +BEGIN_EVENT_TABLE(AnalyzerFrame, wxFrame) +EVT_MENU(wxID_OPEN, AnalyzerFrame::onOpen) +EVT_MENU(wxID_CLOSE, AnalyzerFrame::onClose) +EVT_MENU(wxID_EXIT, AnalyzerFrame::onQuit) +EVT_MENU(wxID_PADDING, AnalyzerFrame::onTogglePadding) +EVT_MENU(wxID_ZOOM_IN, AnalyzerFrame::onZoomIn) +EVT_MENU(wxID_ZOOM_OUT, AnalyzerFrame::onZoomOut) +EVT_MENU(wxID_ACTUAL_SIZE, AnalyzerFrame::onActualSize) +EVT_MENU(wxID_SHOW_Y, AnalyzerFrame::onResetAndToggleViewMenuCheckBox) +EVT_MENU(wxID_SHOW_U, AnalyzerFrame::onResetAndToggleViewMenuCheckBox) +EVT_MENU(wxID_SHOW_V, AnalyzerFrame::onResetAndToggleViewMenuCheckBox) +EVT_MENU(wxID_NEXT_FRAME, AnalyzerFrame::onNextFrame) +EVT_MENU(wxID_GOTO_FRAME, AnalyzerFrame::onGotoFrame) +EVT_MENU(wxID_RESTART, 
AnalyzerFrame::onRestart) +EVT_MENU(wxID_ABOUT, AnalyzerFrame::onAbout) +END_EVENT_TABLE() + +AnalyzerFrame::AnalyzerFrame(const bool bit_accounting) + : wxFrame(NULL, wxID_ANY, _("AV1 Stream Analyzer"), wxDefaultPosition, + wxDefaultSize, wxDEFAULT_FRAME_STYLE), + panel(NULL), bit_accounting(bit_accounting) { + wxMenuBar *mb = new wxMenuBar(); + + fileMenu = new wxMenu(); + fileMenu->Append(wxID_OPEN, _("&Open...\tCtrl-O"), _("Open AV1 file")); + fileMenu->Append(wxID_CLOSE, _("&Close\tCtrl-W"), _("Close AV1 file")); + fileMenu->Enable(wxID_CLOSE, false); + fileMenu->Append(wxID_EXIT, _("E&xit\tCtrl-Q"), _("Quit this program")); + mb->Append(fileMenu, _("&File")); + + wxAcceleratorEntry entries[2]; + entries[0].Set(wxACCEL_CTRL, (int)'=', wxID_ZOOM_IN); + entries[1].Set(wxACCEL_CTRL | wxACCEL_SHIFT, (int)'-', wxID_ZOOM_OUT); + wxAcceleratorTable accel(2, entries); + this->SetAcceleratorTable(accel); + + viewMenu = new wxMenu(); + +viewMenu->Append(wxID_PADDING, _("Toggle padding\tCtrl-p"), + _("Show padding")); + viewMenu->Append(wxID_ZOOM_IN, _("Zoom-In\tCtrl-+"), _("Double image size")); + viewMenu->Append(wxID_ZOOM_OUT, _("Zoom-Out\tCtrl--"), _("Half image size")); + viewMenu->Append(wxID_ACTUAL_SIZE, _("Actual size\tCtrl-0"), + _("Actual size of the frame")); + viewMenu->AppendSeparator(); + viewMenu->AppendCheckItem(wxID_SHOW_Y, _("&Y plane\tCtrl-Y"), + _("Show Y plane")); + viewMenu->AppendCheckItem(wxID_SHOW_U, _("&U plane\tCtrl-U"), + _("Show U plane")); + viewMenu->AppendCheckItem(wxID_SHOW_V, _("&V plane\tCtrl-V"), + _("Show V plane")); + mb->Append(viewMenu, _("&View")); + + playbackMenu = new wxMenu(); + playbackMenu->Append(wxID_NEXT_FRAME, _("Next frame\tCtrl-."), + _("Go to next frame")); + /*playbackMenu->Append(wxID_RESTART, _("&Restart\tCtrl-R"), + _("Set video to frame 0")); + playbackMenu->Append(wxID_GOTO_FRAME, _("Jump to Frame\tCtrl-J"), + _("Go to frame number"));*/ + mb->Append(playbackMenu, _("&Playback")); + + wxMenu *helpMenu = new wxMenu(); + helpMenu->Append(wxID_ABOUT, _("&About...\tF1"), _("Show about dialog")); + mb->Append(helpMenu, _("&Help")); + + SetMenuBar(mb); + + CreateStatusBar(1); +} + +void AnalyzerFrame::onOpen(wxCommandEvent &WXUNUSED(event)) { + wxFileDialog openFileDialog(this, _("Open file"), wxEmptyString, + wxEmptyString, _("AV1 files (*.ivf)|*.ivf"), + wxFD_OPEN | wxFD_FILE_MUST_EXIST); + if (openFileDialog.ShowModal() != wxID_CANCEL) { + open(openFileDialog.GetPath()); + } +} + +void AnalyzerFrame::onClose(wxCommandEvent &WXUNUSED(event)) {} + +void AnalyzerFrame::onQuit(wxCommandEvent &WXUNUSED(event)) { Close(true); } + +void AnalyzerFrame::onTogglePadding(wxCommandEvent &WXUNUSED(event)) { + panel->togglePadding(); + SetClientSize(panel->GetSize()); + panel->render(); + panel->Refresh(); +} + +void AnalyzerFrame::onZoomIn(wxCommandEvent &WXUNUSED(event)) { + setZoom(panel->getZoom() + 1); +} + +void AnalyzerFrame::onZoomOut(wxCommandEvent &WXUNUSED(event)) { + setZoom(panel->getZoom() - 1); +} + +void AnalyzerFrame::onActualSize(wxCommandEvent &WXUNUSED(event)) { + setZoom(MIN_ZOOM); +} + +void AnalyzerFrame::onToggleViewMenuCheckBox(wxCommandEvent &event) { // NOLINT + GetMenuBar()->Check(event.GetId(), event.IsChecked()); + updateViewMenu(); +} + +void AnalyzerFrame::onResetAndToggleViewMenuCheckBox( + wxCommandEvent &event) { // NOLINT + int id = event.GetId(); + if (id != wxID_SHOW_Y && id != wxID_SHOW_U && id != wxID_SHOW_V) { + GetMenuBar()->Check(wxID_SHOW_Y, true); + GetMenuBar()->Check(wxID_SHOW_U, true); + 
GetMenuBar()->Check(wxID_SHOW_V, true); + } + onToggleViewMenuCheckBox(event); +} + +void AnalyzerFrame::onNextFrame(wxCommandEvent &WXUNUSED(event)) { + panel->nextFrame(); + panel->Refresh(false); +} + +void AnalyzerFrame::onGotoFrame(wxCommandEvent &WXUNUSED(event)) {} + +void AnalyzerFrame::onRestart(wxCommandEvent &WXUNUSED(event)) {} + +void AnalyzerFrame::onAbout(wxCommandEvent &WXUNUSED(event)) { + wxAboutDialogInfo info; + info.SetName(_("AV1 Bitstream Analyzer")); + info.SetVersion(_("0.1-beta")); + info.SetDescription( + _("This program implements a bitstream analyzer for AV1")); + info.SetCopyright( + wxT("(C) 2017 Alliance for Open Media ")); + wxAboutBox(info); +} + +bool AnalyzerFrame::open(const wxString &path) { + panel = new AnalyzerPanel(this, path, bit_accounting); + if (panel->open(path)) { + SetClientSize(panel->GetSize()); + return true; + } else { + delete panel; + return false; + } +} + +bool AnalyzerFrame::setZoom(int zoom) { + if (panel->setZoom(zoom)) { + GetMenuBar()->Enable(wxID_ACTUAL_SIZE, zoom != MIN_ZOOM); + GetMenuBar()->Enable(wxID_ZOOM_IN, zoom != MAX_ZOOM); + GetMenuBar()->Enable(wxID_ZOOM_OUT, zoom != MIN_ZOOM); + SetClientSize(panel->GetSize()); + panel->render(); + panel->Refresh(); + return true; + } + return false; +} + +void AnalyzerFrame::updateViewMenu() { + panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_Y), OD_LUMA_MASK); + panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_U), OD_CB_MASK); + panel->setShowPlane(GetMenuBar()->IsChecked(wxID_SHOW_V), OD_CR_MASK); + SetClientSize(panel->GetSize()); + panel->render(); + panel->Refresh(false); +} + +class Analyzer : public wxApp { + private: + AnalyzerFrame *frame; + + public: + void OnInitCmdLine(wxCmdLineParser &parser); // NOLINT + bool OnCmdLineParsed(wxCmdLineParser &parser); // NOLINT +}; + +static const wxCmdLineEntryDesc CMD_LINE_DESC[] = { + { wxCMD_LINE_SWITCH, _("h"), _("help"), _("Display this help and exit."), + wxCMD_LINE_VAL_NONE, wxCMD_LINE_OPTION_HELP }, + { wxCMD_LINE_SWITCH, _("a"), _("bit-accounting"), _("Enable bit accounting"), + wxCMD_LINE_VAL_NONE, wxCMD_LINE_PARAM_OPTIONAL }, + { wxCMD_LINE_PARAM, NULL, NULL, _("input.ivf"), wxCMD_LINE_VAL_STRING, + wxCMD_LINE_PARAM_OPTIONAL }, + { wxCMD_LINE_NONE } +}; + +void Analyzer::OnInitCmdLine(wxCmdLineParser &parser) { // NOLINT + parser.SetDesc(CMD_LINE_DESC); + parser.SetSwitchChars(_("-")); +} + +bool Analyzer::OnCmdLineParsed(wxCmdLineParser &parser) { // NOLINT + bool bit_accounting = parser.Found(_("a")); + if (bit_accounting && !CONFIG_ACCOUNTING) { + fprintf(stderr, + "Bit accounting support not found. " + "Recompile with:\n./cmake -DCONFIG_ACCOUNTING=1\n"); + return false; + } + frame = new AnalyzerFrame(parser.Found(_("a"))); + frame->Show(); + if (parser.GetParamCount() > 0) { + return frame->open(parser.GetParam(0)); + } + return true; +} + +void usage_exit(void) { + fprintf(stderr, "uhh\n"); + exit(EXIT_FAILURE); +} + +IMPLEMENT_APP(Analyzer) diff --git a/libs/libaom/src/examples/aom_cx_set_ref.c b/libs/libaom/src/examples/aom_cx_set_ref.c new file mode 100644 index 000000000..2f4f6586f --- /dev/null +++ b/libs/libaom/src/examples/aom_cx_set_ref.c @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// AV1 Set Reference Frame +// ============================ +// +// This is an example demonstrating how to overwrite the AV1 encoder's +// internal reference frame. In the sample we set the last frame to the +// current frame. This technique could be used to bounce between two cameras. +// +// The decoder would also have to set the reference frame to the same value +// on the same frame, or the video will become corrupt. The 'test_decode' +// variable is set to 1 in this example that tests if the encoder and decoder +// results are matching. +// +// Usage +// ----- +// This example encodes a raw video. And the last argument passed in specifies +// the frame number to update the reference frame on. For example, run +// examples/aom_cx_set_ref av1 352 288 in.yuv out.ivf 4 30 +// The parameter is parsed as follows: +// +// +// Extra Variables +// --------------- +// This example maintains the frame number passed on the command line +// in the `update_frame_num` variable. +// +// +// Configuration +// ------------- +// +// The reference frame is updated on the frame specified on the command +// line. +// +// Observing The Effects +// --------------------- +// The encoder and decoder results should be matching when the same reference +// frame setting operation is done in both encoder and decoder. Otherwise, +// the encoder/decoder mismatch would be seen. + +#include +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "aom_scale/yv12config.h" +#include "common/tools_common.h" +#include "common/video_writer.h" +#include "examples/encoder_util.h" + +static const char *exec_name; + +void usage_exit() { + fprintf(stderr, + "Usage: %s " + " \n", + exec_name); + exit(EXIT_FAILURE); +} + +static void testing_decode(aom_codec_ctx_t *encoder, aom_codec_ctx_t *decoder, + unsigned int frame_out, int *mismatch_seen) { + aom_image_t enc_img, dec_img; + + if (*mismatch_seen) return; + + /* Get the internal reference frame */ + if (aom_codec_control(encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img)) + die_codec(encoder, "Failed to get encoder reference frame"); + if (aom_codec_control(decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img)) + die_codec(decoder, "Failed to get decoder reference frame"); + + if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t enc_hbd_img; + aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img); + enc_img = enc_hbd_img; + } + if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_image_t dec_hbd_img; + aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, dec_img.d_h, 16); + aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img); + dec_img = dec_hbd_img; + } + } + + if (!aom_compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; + if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + aom_find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + aom_find_mismatch(&enc_img, &dec_img, y, u, v); + } + + printf( + "Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, 
%d] {%d/%d}," + " V[%d, %d] {%d/%d}", + frame_out, y[0], y[1], y[2], y[3], u[0], u[1], u[2], u[3], v[0], v[1], + v[2], v[3]); + *mismatch_seen = 1; + } + + aom_img_free(&enc_img); + aom_img_free(&dec_img); +} + +static int encode_frame(aom_codec_ctx_t *ecodec, aom_image_t *img, + unsigned int frame_in, AvxVideoWriter *writer, + int test_decode, aom_codec_ctx_t *dcodec, + unsigned int *frame_out, int *mismatch_seen, + aom_image_t *ext_ref) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + int got_data; + const aom_codec_err_t res = aom_codec_encode(ecodec, img, frame_in, 1, 0); + if (res != AOM_CODEC_OK) die_codec(ecodec, "Failed to encode frame"); + + got_data = 0; + + while ((pkt = aom_codec_get_cx_data(ecodec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + + ++*frame_out; + + if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(ecodec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + fflush(stdout); + got_data = 1; + + // Decode 1 frame. + if (test_decode) { + if (aom_codec_decode(dcodec, pkt->data.frame.buf, + (unsigned int)pkt->data.frame.sz, NULL)) + die_codec(dcodec, "Failed to decode frame."); + + // Copy out first decoded frame, and use it as reference later. + if (*frame_out == 1 && ext_ref != NULL) + if (aom_codec_control(dcodec, AV1_COPY_NEW_FRAME_IMAGE, ext_ref)) + die_codec(dcodec, "Failed to get decoder new frame"); + } + } + } + + // Mismatch checking + if (got_data && test_decode) { + testing_decode(ecodec, dcodec, *frame_out, mismatch_seen); + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + // Encoder + aom_codec_ctx_t ecodec; + aom_codec_enc_cfg_t cfg; + unsigned int frame_in = 0; + aom_image_t raw; + aom_image_t raw_shift; + aom_image_t ext_ref; + aom_codec_err_t res; + AvxVideoInfo info; + AvxVideoWriter *writer = NULL; + const AvxInterface *encoder = NULL; + int flags = 0; + int allocated_raw_shift = 0; + aom_img_fmt_t raw_fmt = AOM_IMG_FMT_I420; + aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420; + + // Test encoder/decoder mismatch. + int test_decode = 1; + // Decoder + aom_codec_ctx_t dcodec; + unsigned int frame_out = 0; + + // The frame number to set reference frame on + unsigned int update_frame_num = 0; + int mismatch_seen = 0; + + const int fps = 30; + const int bitrate = 500; + + const char *codec_arg = NULL; + const char *width_arg = NULL; + const char *height_arg = NULL; + const char *infile_arg = NULL; + const char *outfile_arg = NULL; + const char *update_frame_num_arg = NULL; + unsigned int limit = 0; + exec_name = argv[0]; + + // Clear explicitly, as simply assigning "{ 0 }" generates + // "missing-field-initializers" warning in some compilers. 
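+  // For instance, with -Wmissing-field-initializers enabled a compiler may
+  // flag
+  //   aom_codec_enc_cfg_t cfg = { 0 };
+  // while the memset() calls below zero the structs without any warning.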
+  memset(&ecodec, 0, sizeof(ecodec));
+  memset(&cfg, 0, sizeof(cfg));
+  memset(&info, 0, sizeof(info));
+
+  if (argc < 7) die("Invalid number of arguments");
+
+  codec_arg = argv[1];
+  width_arg = argv[2];
+  height_arg = argv[3];
+  infile_arg = argv[4];
+  outfile_arg = argv[5];
+  update_frame_num_arg = argv[6];
+
+  encoder = get_aom_encoder_by_name(codec_arg);
+  if (!encoder) die("Unsupported codec.");
+
+  update_frame_num = (unsigned int)strtoul(update_frame_num_arg, NULL, 0);
+  // In AV1, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are
+  // allocated while calling aom_codec_encode(), thus, setting the reference
+  // for the 1st frame isn't supported.
+  if (update_frame_num <= 1) {
+    die("Couldn't parse frame number '%s'\n", update_frame_num_arg);
+  }
+
+  if (argc > 7) {
+    limit = (unsigned int)strtoul(argv[7], NULL, 0);
+    if (update_frame_num > limit)
+      die("Update frame number couldn't be larger than limit\n");
+  }
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = (int)strtol(width_arg, NULL, 0);
+  info.frame_height = (int)strtol(height_arg, NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 || info.frame_height <= 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  // In this test, the bit depth of input video is 8-bit, and the input format
+  // is AOM_IMG_FMT_I420.
+  if (!aom_img_alloc(&raw, raw_fmt, info.frame_width, info.frame_height, 32)) {
+    die("Failed to allocate image.");
+  }
+
+  if (FORCE_HIGHBITDEPTH_DECODING) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  // Allocate memory with the border so that it can be used as a reference.
+  if (!aom_img_alloc_with_border(&ext_ref, ref_fmt, info.frame_width,
+                                 info.frame_height, 32, 8,
+                                 AOM_BORDER_IN_PIXELS)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res) die_codec(&ecodec, "Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = bitrate;
+  cfg.g_lag_in_frames = 3;
+  cfg.g_bit_depth = AOM_BITS_8;
+
+  flags |= (cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING)
+               ? AOM_CODEC_USE_HIGHBITDEPTH
+               : 0;
+
+  writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info);
+  if (!writer) die("Failed to open %s for writing.", outfile_arg);
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading.", infile_arg);
+
+  if (aom_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, flags))
+    die_codec(&ecodec, "Failed to initialize encoder");
+
+  // Disable alt_ref.
+  if (aom_codec_control(&ecodec, AOME_SET_ENABLEAUTOALTREF, 0))
+    die_codec(&ecodec, "Failed to set enable auto alt ref");
+
+  if (test_decode) {
+    const AvxInterface *decoder = get_aom_decoder_by_name(codec_arg);
+    if (aom_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0))
+      die_codec(&dcodec, "Failed to initialize decoder.");
+  }
+
+  // Encode frames.
+  while (aom_img_read(&raw, infile)) {
+    if (limit && frame_in >= limit) break;
+    aom_image_t *frame_to_encode;
+
+    if (FORCE_HIGHBITDEPTH_DECODING) {
+      // Need to allocate a larger buffer to use the hbd internal path.
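+      // aom_img_upshift() below copies the 8-bit samples into the 16-bit
+      // buffer; a shift of 0 preserves the sample values unscaled.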
+      int input_shift = 0;
+      if (!allocated_raw_shift) {
+        aom_img_alloc(&raw_shift, raw_fmt | AOM_IMG_FMT_HIGHBITDEPTH,
+                      info.frame_width, info.frame_height, 32);
+        allocated_raw_shift = 1;
+      }
+      aom_img_upshift(&raw_shift, &raw, input_shift);
+      frame_to_encode = &raw_shift;
+    } else {
+      frame_to_encode = &raw;
+    }
+
+    if (update_frame_num > 1 && frame_out + 1 == update_frame_num) {
+      av1_ref_frame_t ref;
+      ref.idx = 0;
+      ref.use_external_ref = 0;
+      ref.img = ext_ref;
+      // Set reference frame in encoder.
+      if (aom_codec_control(&ecodec, AV1_SET_REFERENCE, &ref))
+        die_codec(&ecodec, "Failed to set encoder reference frame");
+      printf(" <SET_REF>");
+
+      // If set_reference in decoder is commented out, the enc/dec mismatch
+      // would be seen.
+      if (test_decode) {
+        ref.use_external_ref = 1;
+        if (aom_codec_control(&dcodec, AV1_SET_REFERENCE, &ref))
+          die_codec(&dcodec, "Failed to set decoder reference frame");
+      }
+    }
+
+    encode_frame(&ecodec, frame_to_encode, frame_in, writer, test_decode,
+                 &dcodec, &frame_out, &mismatch_seen, &ext_ref);
+    frame_in++;
+    if (mismatch_seen) break;
+  }
+
+  // Flush encoder.
+  if (!mismatch_seen)
+    while (encode_frame(&ecodec, NULL, frame_in, writer, test_decode, &dcodec,
+                        &frame_out, &mismatch_seen, NULL)) {
+    }
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_out);
+
+  if (test_decode) {
+    if (!mismatch_seen)
+      printf("Encoder/decoder results are matching.\n");
+    else
+      printf("Encoder/decoder results are NOT matching.\n");
+  }
+
+  if (test_decode)
+    if (aom_codec_destroy(&dcodec))
+      die_codec(&dcodec, "Failed to destroy decoder");
+
+  if (allocated_raw_shift) aom_img_free(&raw_shift);
+  aom_img_free(&ext_ref);
+  aom_img_free(&raw);
+  if (aom_codec_destroy(&ecodec))
+    die_codec(&ecodec, "Failed to destroy encoder.");
+
+  aom_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/examples/av1_dec_fuzzer.cc b/libs/libaom/src/examples/av1_dec_fuzzer.cc
new file mode 100644
index 000000000..1cddc8cc1
--- /dev/null
+++ b/libs/libaom/src/examples/av1_dec_fuzzer.cc
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*
+ * See build_av1_dec_fuzzer.sh for building instructions.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <algorithm>
+#include <memory>
+#include "config/aom_config.h"
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_ports/mem_ops.h"
+
+#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */
+#define IVF_FILE_HDR_SZ 32
+
+extern "C" void usage_exit(void) { exit(EXIT_FAILURE); }
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  if (size <= IVF_FILE_HDR_SZ) {
+    return 0;
+  }
+
+  const aom_codec_iface_t *codec_interface = aom_codec_av1_dx();
+  aom_codec_ctx_t codec;
+  // Set thread count in the range [1, 64].
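+  // Masking with 0x3f keeps the byte's low six bits (0..63), so the fuzzed
+  // input below deterministically selects a thread count between 1 and 64.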
+ const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1; + aom_codec_dec_cfg_t cfg = { threads, 0, 0, !FORCE_HIGHBITDEPTH_DECODING }; + if (aom_codec_dec_init(&codec, codec_interface, &cfg, 0)) { + return 0; + } + + data += IVF_FILE_HDR_SZ; + size -= IVF_FILE_HDR_SZ; + + while (size > IVF_FRAME_HDR_SZ) { + size_t frame_size = mem_get_le32(data); + size -= IVF_FRAME_HDR_SZ; + data += IVF_FRAME_HDR_SZ; + frame_size = std::min(size, frame_size); + + const aom_codec_err_t err = + aom_codec_decode(&codec, data, frame_size, nullptr); + static_cast(err); + aom_codec_iter_t iter = nullptr; + aom_image_t *img = nullptr; + while ((img = aom_codec_get_frame(&codec, &iter)) != nullptr) { + } + data += frame_size; + size -= frame_size; + } + aom_codec_destroy(&codec); + return 0; +} diff --git a/libs/libaom/src/examples/build_av1_dec_fuzzer.sh b/libs/libaom/src/examples/build_av1_dec_fuzzer.sh new file mode 100644 index 000000000..0dcb254da --- /dev/null +++ b/libs/libaom/src/examples/build_av1_dec_fuzzer.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# +# Copyright (c) 2019, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and +# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +# was not distributed with this source code in the LICENSE file, you can +# obtain it at www.aomedia.org/license/software. If the Alliance for Open +# Media Patent License 1.0 was not distributed with this source code in the +# PATENTS file, you can obtain it at www.aomedia.org/license/patent. +# +############################################################################### +# Fuzzer for libaom decoder. +# ========================== +# Requirements +# --------------------- +# Clang6.0 or above (must support -fsanitize=fuzzer -fsanitize=fuzzer-no-link) +# +# References: +# --------------------- +# http://llvm.org/docs/LibFuzzer.html +# https://github.com/google/oss-fuzz +# +# Steps to build / run +# --------------------- + +set -eu + +# Have a copy of AOM and a build directory ready. +if [[ $# -ne 2 ]]; then + echo "Pass in the AOM source tree as first argument, and a build directory " + echo "as the second argument. The AOM source tree can be obtained via: " + echo " git clone https://aomedia.googlesource.com/aom" + exit 2 +fi +if [[ -z "$CC" ]]; then + echo "Set the CC environment variable to point to your C compiler." + exit 2 +fi +if [[ -z "$CXX" ]]; then + echo "Set the CXX environment variable to point to your C++ compiler." + exit 2 +fi + +AOM_DIR=$1 +BUILD_DIR=$2 +# Run CMake with address sanitizer enabled and build the codec. +# Enable DO_RANGE_CHECK_CLAMP to suppress the noise of integer overflows +# in the transform functions. Also set memory limits. +EXTRA_C_FLAGS='-DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824' +cd "${BUILD_DIR}" +cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \ + -DCONFIG_SCALABILITY=0 -DFORCE_HIGHBITDEPTH_DECODING=0 \ + -DCONFIG_AV1_ENCODER=0 -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 \ + -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \ + -DAOM_EXTRA_C_FLAGS="${EXTRA_C_FLAGS}" \ + -DAOM_EXTRA_CXX_FLAGS="${EXTRA_C_FLAGS}" -DSANITIZE=fuzzer-no-link,address + +# Build the codec. 
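+# (Note: -fsanitize=fuzzer-no-link instruments the library for libFuzzer
+# without linking the fuzzing engine's main(); only the final fuzzer binary
+# built below links it via -fsanitize=fuzzer.)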
+make -j$(nproc)
+
+# Build the av1 fuzzer
+$CXX -std=c++11 -DDECODER=av1 -I${AOM_DIR} -I${BUILD_DIR} \
+  -fsanitize=fuzzer,address -Wl,--start-group \
+  ${AOM_DIR}/examples/av1_dec_fuzzer.cc -o ${BUILD_DIR}/av1_dec_fuzzer \
+  ${BUILD_DIR}/libaom.a -Wl,--end-group
+
+echo "Fuzzer built at ${BUILD_DIR}/av1_dec_fuzzer."
+echo "Create a corpus directory, copy IVF files in there, and run:"
+echo "  av1_dec_fuzzer CORPUS_DIR"
diff --git a/libs/libaom/src/examples/decode_to_md5.c b/libs/libaom/src/examples/decode_to_md5.c
new file mode 100644
index 000000000..bc127b78d
--- /dev/null
+++ b/libs/libaom/src/examples/decode_to_md5.c
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Frame-by-frame MD5 Checksum
+// ===========================
+//
+// This example builds upon the simple decoder loop to show how checksums
+// of the decoded output can be generated. These are used for validating
+// decoder implementations against the reference implementation, for example.
+//
+// MD5 algorithm
+// -------------
+// The Message-Digest 5 (MD5) is a well known hash function. We have provided
+// an implementation derived from the RSA Data Security, Inc. MD5
+// Message-Digest Algorithm for your use. Our implementation only changes the
+// interface of this reference code. You must include the `md5_utils.h` header
+// for access to these functions.
+//
+// Processing The Decoded Data
+// ---------------------------
+// Each row of the image is passed to the MD5 accumulator. First the Y plane
+// is processed, then U, then V. It is important to honor the image's `stride`
+// values.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "common/md5_utils.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static void get_image_md5(const aom_image_t *img, unsigned char digest[16]) {
+  int plane, y;
+  MD5Context md5;
+
+  MD5Init(&md5);
+
+  for (plane = 0; plane < 3; ++plane) {
+    const unsigned char *buf = img->planes[plane];
+    const int stride = img->stride[plane];
+    const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;
+    const int h = plane ?
(img->d_h + 1) >> 1 : img->d_h; + + for (y = 0; y < h; ++y) { + MD5Update(&md5, buf, w); + buf += stride; + } + } + + MD5Final(digest, &md5); +} + +static void print_md5(FILE *stream, unsigned char digest[16]) { + int i; + + for (i = 0; i < 16; ++i) fprintf(stream, "%02x", digest[i]); +} + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + aom_codec_ctx_t codec; + AvxVideoReader *reader = NULL; + const AvxVideoInfo *info = NULL; + const AvxInterface *decoder = NULL; + + exec_name = argv[0]; + + if (argc != 3) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + info = aom_video_reader_get_info(reader); + + decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder"); + + while (aom_video_reader_read_frame(reader)) { + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + size_t frame_size = 0; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame"); + + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + unsigned char digest[16]; + + get_image_md5(img, digest); + print_md5(outfile, digest); + fprintf(outfile, " img-%dx%d-%04d.i420\n", img->d_w, img->d_h, + ++frame_cnt); + } + } + + printf("Processed %d frames.\n", frame_cnt); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_reader_close(reader); + + fclose(outfile); + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/decode_with_drops.c b/libs/libaom/src/examples/decode_with_drops.c new file mode 100644 index 000000000..214401958 --- /dev/null +++ b/libs/libaom/src/examples/decode_with_drops.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Decode With Drops Example +// ========================= +// +// This is an example utility which drops a series of frames, as specified +// on the command line. This is useful for observing the error recovery +// features of the codec. +// +// Usage +// ----- +// This example adds a single argument to the `simple_decoder` example, +// which specifies the range or pattern of frames to drop. The parameter is +// parsed as follows: +// +// Dropping A Range Of Frames +// -------------------------- +// To drop a range of frames, specify the starting frame and the ending +// frame to drop, separated by a dash. The following command will drop +// frames 5 through 10 (base 1). 
+// +// $ ./decode_with_drops in.ivf out.i420 5-10 +// +// +// Dropping A Pattern Of Frames +// ---------------------------- +// To drop a pattern of frames, specify the number of frames to drop and +// the number of frames after which to repeat the pattern, separated by +// a forward-slash. The following command will drop 3 of 7 frames. +// Specifically, it will decode 4 frames, then drop 3 frames, and then +// repeat. +// +// $ ./decode_with_drops in.ivf out.i420 3/7 +// +// +// Extra Variables +// --------------- +// This example maintains the pattern passed on the command line in the +// `n`, `m`, and `is_range` variables: +// +// +// Making The Drop Decision +// ------------------------ +// The example decides whether to drop the frame based on the current +// frame number, immediately before decoding the frame. + +#include +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + aom_codec_ctx_t codec; + const AvxInterface *decoder = NULL; + AvxVideoReader *reader = NULL; + const AvxVideoInfo *info = NULL; + int n = 0; + int m = 0; + int is_range = 0; + char *nptr = NULL; + + exec_name = argv[0]; + + if (argc != 4) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + n = (int)strtol(argv[3], &nptr, 0); + m = (int)strtol(nptr + 1, NULL, 0); + is_range = (*nptr == '-'); + if (!n || !m || (*nptr != '-' && *nptr != '/')) + die("Couldn't parse pattern %s.\n", argv[3]); + + info = aom_video_reader_get_info(reader); + + decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + while (aom_video_reader_read_frame(reader)) { + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + size_t frame_size = 0; + int skip; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + ++frame_cnt; + + skip = (is_range && frame_cnt >= n && frame_cnt <= m) || + (!is_range && m - (frame_cnt - 1) % m <= n); + + if (!skip) { + putc('.', stdout); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) + aom_img_write(img, outfile); + } else { + putc('X', stdout); + } + + fflush(stdout); + } + + printf("Processed %d frames.\n", frame_cnt); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", + info->frame_width, info->frame_height, argv[2]); + + aom_video_reader_close(reader); + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/encoder_util.c b/libs/libaom/src/examples/encoder_util.c new file mode 100644 index 000000000..e43b37250 --- /dev/null +++ b/libs/libaom/src/examples/encoder_util.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Utility functions used by encoder binaries. + +#include "examples/encoder_util.h" + +#include +#include + +#include "aom/aom_integer.h" + +#define mmin(a, b) ((a) < (b) ? (a) : (b)) + +static void find_mismatch_plane(const aom_image_t *const img1, + const aom_image_t *const img2, int plane, + int use_highbitdepth, int loc[4]) { + const unsigned char *const p1 = img1->planes[plane]; + const int p1_stride = img1->stride[plane] >> use_highbitdepth; + const unsigned char *const p2 = img2->planes[plane]; + const int p2_stride = img2->stride[plane] >> use_highbitdepth; + const uint32_t bsize = 64; + const int is_y_plane = (plane == AOM_PLANE_Y); + const uint32_t bsizex = is_y_plane ? bsize : bsize >> img1->x_chroma_shift; + const uint32_t bsizey = is_y_plane ? bsize : bsize >> img1->y_chroma_shift; + const uint32_t c_w = + is_y_plane ? img1->d_w + : (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + is_y_plane ? img1->d_h + : (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + assert(img1->d_w == img2->d_w && img1->d_h == img2->d_h); + assert(img1->x_chroma_shift == img2->x_chroma_shift && + img1->y_chroma_shift == img2->y_chroma_shift); + loc[0] = loc[1] = loc[2] = loc[3] = -1; + if (img1->monochrome && img2->monochrome && plane) return; + int match = 1; + uint32_t i, j; + for (i = 0; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + const int si = + is_y_plane ? mmin(i + bsizey, c_h) - i : mmin(i + bsizey, c_h - i); + const int sj = + is_y_plane ? mmin(j + bsizex, c_w) - j : mmin(j + bsizex, c_w - j); + int k, l; + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + const int row = i + k; + const int col = j + l; + const int offset1 = row * p1_stride + col; + const int offset2 = row * p2_stride + col; + const int val1 = use_highbitdepth + ? p1[2 * offset1] | (p1[2 * offset1 + 1] << 8) + : p1[offset1]; + const int val2 = use_highbitdepth + ? 
p2[2 * offset2] | (p2[2 * offset2 + 1] << 8) + : p2[offset2]; + if (val1 != val2) { + loc[0] = row; + loc[1] = col; + loc[2] = val1; + loc[3] = val2; + match = 0; + break; + } + } + } + } + } +} + +static void find_mismatch_helper(const aom_image_t *const img1, + const aom_image_t *const img2, + int use_highbitdepth, int yloc[4], int uloc[4], + int vloc[4]) { + find_mismatch_plane(img1, img2, AOM_PLANE_Y, use_highbitdepth, yloc); + find_mismatch_plane(img1, img2, AOM_PLANE_U, use_highbitdepth, uloc); + find_mismatch_plane(img1, img2, AOM_PLANE_V, use_highbitdepth, vloc); +} + +void aom_find_mismatch_high(const aom_image_t *const img1, + const aom_image_t *const img2, int yloc[4], + int uloc[4], int vloc[4]) { + find_mismatch_helper(img1, img2, 1, yloc, uloc, vloc); +} + +void aom_find_mismatch(const aom_image_t *const img1, + const aom_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]) { + find_mismatch_helper(img1, img2, 0, yloc, uloc, vloc); +} + +int aom_compare_img(const aom_image_t *const img1, + const aom_image_t *const img2) { + assert(img1->cp == img2->cp); + assert(img1->tc == img2->tc); + assert(img1->mc == img2->mc); + assert(img1->monochrome == img2->monochrome); + + int num_planes = img1->monochrome ? 1 : 3; + + uint32_t l_w = img1->d_w; + uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + int match = 1; + + match &= (img1->fmt == img2->fmt); + match &= (img1->d_w == img2->d_w); + match &= (img1->d_h == img2->d_h); + if (img1->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + l_w *= 2; + c_w *= 2; + } + + for (int plane = 0; plane < num_planes; ++plane) { + uint32_t height = plane ? c_h : img1->d_h; + uint32_t width = plane ? c_w : l_w; + + for (uint32_t i = 0; i < height; ++i) { + match &= + (memcmp(img1->planes[plane] + i * img1->stride[plane], + img2->planes[plane] + i * img2->stride[plane], width) == 0); + } + } + + return match; +} diff --git a/libs/libaom/src/examples/encoder_util.h b/libs/libaom/src/examples/encoder_util.h new file mode 100644 index 000000000..a6bb3fb48 --- /dev/null +++ b/libs/libaom/src/examples/encoder_util.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Utility functions used by encoder binaries. + +#ifndef AOM_EXAMPLES_ENCODER_UTIL_H_ +#define AOM_EXAMPLES_ENCODER_UTIL_H_ + +#include "aom/aom_image.h" + +// Returns mismatch location (?loc[0],?loc[1]) and the values at that location +// in img1 (?loc[2]) and img2 (?loc[3]). +void aom_find_mismatch_high(const aom_image_t *const img1, + const aom_image_t *const img2, int yloc[4], + int uloc[4], int vloc[4]); + +void aom_find_mismatch(const aom_image_t *const img1, + const aom_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]); + +// Returns 1 if the two images match. 
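As a usage sketch (illustrative only; check_frames is a hypothetical name, not part of this patch), the mismatch helpers above and the comparison routine declared just below might be combined in an encoder test like this:

#include <stdio.h>

#include "aom/aom_image.h"
#include "examples/encoder_util.h"

// Hypothetical test helper: returns 1 on a bit-exact match; otherwise
// reports the first differing Y-plane sample (yloc[0] stays -1 when the
// mismatch is confined to a chroma plane).
static int check_frames(const aom_image_t *enc, const aom_image_t *dec) {
  if (aom_compare_img(enc, dec)) return 1;
  int yloc[4], uloc[4], vloc[4];
  aom_find_mismatch(enc, dec, yloc, uloc, vloc);
  // Each loc[] holds { row, col, value in img1, value in img2 }.
  fprintf(stderr, "Y mismatch at (%d,%d): %d vs %d\n", yloc[0], yloc[1],
          yloc[2], yloc[3]);
  return 0;
}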
+int aom_compare_img(const aom_image_t *const img1, + const aom_image_t *const img2); + +#endif // AOM_EXAMPLES_ENCODER_UTIL_H_ diff --git a/libs/libaom/src/examples/inspect.c b/libs/libaom/src/examples/inspect.c new file mode 100644 index 000000000..526bdc16c --- /dev/null +++ b/libs/libaom/src/examples/inspect.c @@ -0,0 +1,958 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Inspect Decoder +// ================ +// +// This is a simple decoder loop that writes JSON stats to stdout. This tool +// can also be compiled with Emscripten and used as a library. + +#include +#include +#include + +#ifdef __EMSCRIPTEN__ +#include +#else +#define EMSCRIPTEN_KEEPALIVE +#endif + +#include "config/aom_config.h" + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "av1/common/av1_common_int.h" + +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif + +#include "av1/decoder/inspection.h" +#include "common/args.h" +#include "common/tools_common.h" +#include "common/video_common.h" +#include "common/video_reader.h" + +// Max JSON buffer size. +const int MAX_BUFFER = 1024 * 1024 * 256; + +typedef enum { + ACCOUNTING_LAYER = 1, + BLOCK_SIZE_LAYER = 1 << 1, + TRANSFORM_SIZE_LAYER = 1 << 2, + TRANSFORM_TYPE_LAYER = 1 << 3, + MODE_LAYER = 1 << 4, + SKIP_LAYER = 1 << 5, + FILTER_LAYER = 1 << 6, + CDEF_LAYER = 1 << 7, + REFERENCE_FRAME_LAYER = 1 << 8, + MOTION_VECTORS_LAYER = 1 << 9, + UV_MODE_LAYER = 1 << 10, + CFL_LAYER = 1 << 11, + DUAL_FILTER_LAYER = 1 << 12, + Q_INDEX_LAYER = 1 << 13, + SEGMENT_ID_LAYER = 1 << 14, + MOTION_MODE_LAYER = 1 << 15, + COMPOUND_TYPE_LAYER = 1 << 16, + INTRABC_LAYER = 1 << 17, + PALETTE_LAYER = 1 << 18, + UV_PALETTE_LAYER = 1 << 19, + ALL_LAYERS = (1 << 20) - 1 +} LayerType; + +static LayerType layers = 0; + +static int stop_after = 0; +static int compress = 0; + +static const arg_def_t limit_arg = + ARG_DEF(NULL, "limit", 1, "Stop decoding after n frames"); +static const arg_def_t dump_all_arg = ARG_DEF("A", "all", 0, "Dump All"); +static const arg_def_t compress_arg = + ARG_DEF("x", "compress", 0, "Compress JSON using RLE"); +static const arg_def_t dump_accounting_arg = + ARG_DEF("a", "accounting", 0, "Dump Accounting"); +static const arg_def_t dump_block_size_arg = + ARG_DEF("bs", "blockSize", 0, "Dump Block Size"); +static const arg_def_t dump_motion_vectors_arg = + ARG_DEF("mv", "motionVectors", 0, "Dump Motion Vectors"); +static const arg_def_t dump_transform_size_arg = + ARG_DEF("ts", "transformSize", 0, "Dump Transform Size"); +static const arg_def_t dump_transform_type_arg = + ARG_DEF("tt", "transformType", 0, "Dump Transform Type"); +static const arg_def_t dump_mode_arg = ARG_DEF("m", "mode", 0, "Dump Mode"); +static const arg_def_t dump_motion_mode_arg = + ARG_DEF("mm", "motion_mode", 0, "Dump Motion Modes"); +static const arg_def_t dump_compound_type_arg = + ARG_DEF("ct", "compound_type", 0, "Dump Compound Types"); +static const arg_def_t dump_uv_mode_arg = + ARG_DEF("uvm", "uv_mode", 0, "Dump UV Intra Prediction Modes"); +static const arg_def_t 
dump_skip_arg = ARG_DEF("s", "skip", 0, "Dump Skip"); +static const arg_def_t dump_filter_arg = + ARG_DEF("f", "filter", 0, "Dump Filter"); +static const arg_def_t dump_cdef_arg = ARG_DEF("c", "cdef", 0, "Dump CDEF"); +static const arg_def_t dump_cfl_arg = + ARG_DEF("cfl", "chroma_from_luma", 0, "Dump Chroma from Luma Alphas"); +static const arg_def_t dump_dual_filter_type_arg = + ARG_DEF("df", "dualFilterType", 0, "Dump Dual Filter Type"); +static const arg_def_t dump_reference_frame_arg = + ARG_DEF("r", "referenceFrame", 0, "Dump Reference Frame"); +static const arg_def_t dump_delta_q_arg = + ARG_DEF("dq", "delta_q", 0, "Dump QIndex"); +static const arg_def_t dump_seg_id_arg = + ARG_DEF("si", "seg_id", 0, "Dump Segment ID"); +static const arg_def_t dump_intrabc_arg = + ARG_DEF("ibc", "intrabc", 0, "Dump If IntraBC Is Used"); +static const arg_def_t dump_palette_arg = + ARG_DEF("plt", "palette", 0, "Dump Palette Size"); +static const arg_def_t dump_uv_palette_arg = + ARG_DEF("uvp", "uv_palette", 0, "Dump UV Palette Size"); +static const arg_def_t usage_arg = ARG_DEF("h", "help", 0, "Help"); +static const arg_def_t skip_non_transform_arg = ARG_DEF( + "snt", "skip_non_transform", 1, "Skip is counted as a non transform."); +static const arg_def_t combined_arg = + ARG_DEF("comb", "combined", 1, "combinining parameters into one output."); + +int combined_parm_list[15]; +int combined_parm_count = 0; + +static const arg_def_t *main_args[] = { &limit_arg, + &dump_all_arg, + &compress_arg, +#if CONFIG_ACCOUNTING + &dump_accounting_arg, +#endif + &dump_block_size_arg, + &dump_transform_size_arg, + &dump_transform_type_arg, + &dump_mode_arg, + &dump_uv_mode_arg, + &dump_motion_mode_arg, + &dump_compound_type_arg, + &dump_skip_arg, + &dump_filter_arg, + &dump_cdef_arg, + &dump_dual_filter_type_arg, + &dump_cfl_arg, + &dump_reference_frame_arg, + &dump_motion_vectors_arg, + &dump_delta_q_arg, + &dump_seg_id_arg, + &dump_intrabc_arg, + &dump_palette_arg, + &dump_uv_palette_arg, + &usage_arg, + &skip_non_transform_arg, + &combined_arg, + NULL }; +#define ENUM(name) \ + { #name, name } +#define LAST_ENUM \ + { NULL, 0 } +typedef struct map_entry { + const char *name; + int value; +} map_entry; + +const map_entry refs_map[] = { + ENUM(INTRA_FRAME), ENUM(LAST_FRAME), ENUM(LAST2_FRAME), + ENUM(LAST3_FRAME), ENUM(GOLDEN_FRAME), ENUM(BWDREF_FRAME), + ENUM(ALTREF2_FRAME), ENUM(ALTREF_FRAME), LAST_ENUM +}; + +const map_entry block_size_map[] = { + ENUM(BLOCK_4X4), ENUM(BLOCK_4X8), ENUM(BLOCK_8X4), + ENUM(BLOCK_8X8), ENUM(BLOCK_8X16), ENUM(BLOCK_16X8), + ENUM(BLOCK_16X16), ENUM(BLOCK_16X32), ENUM(BLOCK_32X16), + ENUM(BLOCK_32X32), ENUM(BLOCK_32X64), ENUM(BLOCK_64X32), + ENUM(BLOCK_64X64), ENUM(BLOCK_64X128), ENUM(BLOCK_128X64), + ENUM(BLOCK_128X128), ENUM(BLOCK_4X16), ENUM(BLOCK_16X4), + ENUM(BLOCK_8X32), ENUM(BLOCK_32X8), ENUM(BLOCK_16X64), + ENUM(BLOCK_64X16), LAST_ENUM +}; + +#define TX_SKIP -1 + +const map_entry tx_size_map[] = { + ENUM(TX_4X4), ENUM(TX_8X8), ENUM(TX_16X16), ENUM(TX_32X32), + ENUM(TX_64X64), ENUM(TX_4X8), ENUM(TX_8X4), ENUM(TX_8X16), + ENUM(TX_16X8), ENUM(TX_16X32), ENUM(TX_32X16), ENUM(TX_32X64), + ENUM(TX_64X32), ENUM(TX_4X16), ENUM(TX_16X4), ENUM(TX_8X32), + ENUM(TX_32X8), ENUM(TX_16X64), ENUM(TX_64X16), LAST_ENUM +}; + +const map_entry tx_type_map[] = { ENUM(DCT_DCT), + ENUM(ADST_DCT), + ENUM(DCT_ADST), + ENUM(ADST_ADST), + ENUM(FLIPADST_DCT), + ENUM(DCT_FLIPADST), + ENUM(FLIPADST_FLIPADST), + ENUM(ADST_FLIPADST), + ENUM(FLIPADST_ADST), + ENUM(IDTX), + ENUM(V_DCT), + ENUM(H_DCT), + 
ENUM(V_ADST), + ENUM(H_ADST), + ENUM(V_FLIPADST), + ENUM(H_FLIPADST), + LAST_ENUM }; +const map_entry dual_filter_map[] = { ENUM(REG_REG), ENUM(REG_SMOOTH), + ENUM(REG_SHARP), ENUM(SMOOTH_REG), + ENUM(SMOOTH_SMOOTH), ENUM(SMOOTH_SHARP), + ENUM(SHARP_REG), ENUM(SHARP_SMOOTH), + ENUM(SHARP_SHARP), LAST_ENUM }; + +const map_entry prediction_mode_map[] = { + ENUM(DC_PRED), ENUM(V_PRED), ENUM(H_PRED), + ENUM(D45_PRED), ENUM(D135_PRED), ENUM(D113_PRED), + ENUM(D157_PRED), ENUM(D203_PRED), ENUM(D67_PRED), + ENUM(SMOOTH_PRED), ENUM(SMOOTH_V_PRED), ENUM(SMOOTH_H_PRED), + ENUM(PAETH_PRED), ENUM(NEARESTMV), ENUM(NEARMV), + ENUM(GLOBALMV), ENUM(NEWMV), ENUM(NEAREST_NEARESTMV), + ENUM(NEAR_NEARMV), ENUM(NEAREST_NEWMV), ENUM(NEW_NEARESTMV), + ENUM(NEAR_NEWMV), ENUM(NEW_NEARMV), ENUM(GLOBAL_GLOBALMV), + ENUM(NEW_NEWMV), ENUM(INTRA_INVALID), LAST_ENUM +}; + +const map_entry motion_mode_map[] = { ENUM(SIMPLE_TRANSLATION), + ENUM(OBMC_CAUSAL), // 2-sided OBMC + ENUM(WARPED_CAUSAL), // 2-sided WARPED + LAST_ENUM }; + +const map_entry compound_type_map[] = { ENUM(COMPOUND_AVERAGE), + ENUM(COMPOUND_WEDGE), + ENUM(COMPOUND_DIFFWTD), LAST_ENUM }; + +const map_entry uv_prediction_mode_map[] = { + ENUM(UV_DC_PRED), ENUM(UV_V_PRED), + ENUM(UV_H_PRED), ENUM(UV_D45_PRED), + ENUM(UV_D135_PRED), ENUM(UV_D113_PRED), + ENUM(UV_D157_PRED), ENUM(UV_D203_PRED), + ENUM(UV_D67_PRED), ENUM(UV_SMOOTH_PRED), + ENUM(UV_SMOOTH_V_PRED), ENUM(UV_SMOOTH_H_PRED), + ENUM(UV_PAETH_PRED), ENUM(UV_CFL_PRED), + ENUM(UV_MODE_INVALID), LAST_ENUM +}; +#define NO_SKIP 0 +#define SKIP 1 + +const map_entry skip_map[] = { ENUM(SKIP), ENUM(NO_SKIP), LAST_ENUM }; + +const map_entry intrabc_map[] = { { "INTRABC", 1 }, + { "NO_INTRABC", 0 }, + LAST_ENUM }; + +const map_entry palette_map[] = { + { "ZERO_COLORS", 0 }, { "TWO_COLORS", 2 }, { "THREE_COLORS", 3 }, + { "FOUR_COLORS", 4 }, { "FIVE_COLORS", 5 }, { "SIX_COLORS", 6 }, + { "SEVEN_COLORS", 7 }, { "EIGHT_COLORS", 8 }, LAST_ENUM +}; + +const map_entry config_map[] = { ENUM(MI_SIZE), LAST_ENUM }; + +static const char *exec_name; + +struct parm_offset { + char parm[60]; + char offset; +}; +struct parm_offset parm_offsets[] = { + { "blockSize", offsetof(insp_mi_data, sb_type) }, + { "transformSize", offsetof(insp_mi_data, tx_size) }, + { "transformType", offsetof(insp_mi_data, tx_type) }, + { "dualFilterType", offsetof(insp_mi_data, dual_filter_type) }, + { "mode", offsetof(insp_mi_data, mode) }, + { "uv_mode", offsetof(insp_mi_data, uv_mode) }, + { "motion_mode", offsetof(insp_mi_data, motion_mode) }, + { "compound_type", offsetof(insp_mi_data, compound_type) }, + { "referenceFrame", offsetof(insp_mi_data, ref_frame) }, + { "skip", offsetof(insp_mi_data, skip) }, +}; +int parm_count = sizeof(parm_offsets) / sizeof(parm_offsets[0]); + +int convert_to_indices(char *str, int *indices, int maxCount, int *count) { + *count = 0; + do { + char *comma = strchr(str, ','); + int length = (comma ? 
(int)(comma - str) : (int)strlen(str)); + int i; + for (i = 0; i < parm_count; ++i) { + if (!strncmp(str, parm_offsets[i].parm, length)) { + break; + } + } + if (i == parm_count) return 0; + indices[(*count)++] = i; + if (*count > maxCount) return 0; + str += length + 1; + } while (strlen(str) > 0); + return 1; +} + +insp_frame_data frame_data; +int frame_count = 0; +int decoded_frame_count = 0; +aom_codec_ctx_t codec; +AvxVideoReader *reader = NULL; +const AvxVideoInfo *info = NULL; +aom_image_t *img = NULL; + +void on_frame_decoded_dump(char *json) { +#ifdef __EMSCRIPTEN__ + EM_ASM_({ Module.on_frame_decoded_json($0); }, json); +#else + printf("%s", json); +#endif +} + +// Writing out the JSON buffer using snprintf is very slow, especially when +// compiled with emscripten, these functions speed things up quite a bit. +int put_str(char *buffer, const char *str) { + int i; + for (i = 0; str[i] != '\0'; i++) { + buffer[i] = str[i]; + } + return i; +} + +int put_str_with_escape(char *buffer, const char *str) { + int i; + int j = 0; + for (i = 0; str[i] != '\0'; i++) { + if (str[i] < ' ') { + continue; + } else if (str[i] == '"' || str[i] == '\\') { + buffer[j++] = '\\'; + } + buffer[j++] = str[i]; + } + return j; +} + +int put_num(char *buffer, char prefix, int num, char suffix) { + int i = 0; + char *buf = buffer; + int is_neg = 0; + if (prefix) { + buf[i++] = prefix; + } + if (num == 0) { + buf[i++] = '0'; + } else { + if (num < 0) { + num = -num; + is_neg = 1; + } + int s = i; + while (num != 0) { + buf[i++] = '0' + (num % 10); + num = num / 10; + } + if (is_neg) { + buf[i++] = '-'; + } + int e = i - 1; + while (s < e) { + int t = buf[s]; + buf[s] = buf[e]; + buf[e] = t; + s++; + e--; + } + } + if (suffix) { + buf[i++] = suffix; + } + return i; +} + +int put_map(char *buffer, const map_entry *map) { + char *buf = buffer; + const map_entry *entry = map; + while (entry->name != NULL) { + *(buf++) = '"'; + buf += put_str(buf, entry->name); + *(buf++) = '"'; + buf += put_num(buf, ':', entry->value, 0); + entry++; + if (entry->name != NULL) { + *(buf++) = ','; + } + } + return (int)(buf - buffer); +} + +int put_reference_frame(char *buffer) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, t; + buf += put_str(buf, " \"referenceFrameMap\": {"); + buf += put_map(buf, refs_map); + buf += put_str(buf, "},\n"); + buf += put_str(buf, " \"referenceFrame\": ["); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; + buf += put_num(buf, '[', mi->ref_frame[0], 0); + buf += put_num(buf, ',', mi->ref_frame[1], ']'); + if (compress) { // RLE + for (t = c + 1; t < mi_cols; ++t) { + insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t]; + if (mi->ref_frame[0] != next_mi->ref_frame[0] || + mi->ref_frame[1] != next_mi->ref_frame[1]) { + break; + } + } + if (t - c > 1) { + *(buf++) = ','; + buf += put_num(buf, '[', t - c - 1, ']'); + c = t - 1; + } + } + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + +int put_motion_vectors(char *buffer) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, t; + buf += put_str(buf, " \"motionVectors\": ["); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = 
&frame_data.mi_grid[r * mi_cols + c]; + buf += put_num(buf, '[', mi->mv[0].col, 0); + buf += put_num(buf, ',', mi->mv[0].row, 0); + buf += put_num(buf, ',', mi->mv[1].col, 0); + buf += put_num(buf, ',', mi->mv[1].row, ']'); + if (compress) { // RLE + for (t = c + 1; t < mi_cols; ++t) { + insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t]; + if (mi->mv[0].col != next_mi->mv[0].col || + mi->mv[0].row != next_mi->mv[0].row || + mi->mv[1].col != next_mi->mv[1].col || + mi->mv[1].row != next_mi->mv[1].row) { + break; + } + } + if (t - c > 1) { + *(buf++) = ','; + buf += put_num(buf, '[', t - c - 1, ']'); + c = t - 1; + } + } + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + +int put_combined(char *buffer) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, p; + buf += put_str(buf, " \""); + for (p = 0; p < combined_parm_count; ++p) { + if (p) buf += put_str(buf, "&"); + buf += put_str(buf, parm_offsets[combined_parm_list[p]].parm); + } + buf += put_str(buf, "\": ["); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; + *(buf++) = '['; + for (p = 0; p < combined_parm_count; ++p) { + if (p) *(buf++) = ','; + int16_t *v = (int16_t *)(((int8_t *)mi) + + parm_offsets[combined_parm_list[p]].offset); + buf += put_num(buf, 0, v[0], 0); + } + *(buf++) = ']'; + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + +int put_block_info(char *buffer, const map_entry *map, const char *name, + size_t offset, int len) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, t, i; + if (compress && len == 1) { + die("Can't encode scalars as arrays when RLE compression is enabled."); + return -1; + } + if (map) { + buf += snprintf(buf, MAX_BUFFER, " \"%sMap\": {", name); + buf += put_map(buf, map); + buf += put_str(buf, "},\n"); + } + buf += snprintf(buf, MAX_BUFFER, " \"%s\": [", name); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; + int16_t *v = (int16_t *)(((int8_t *)mi) + offset); + if (len == 0) { + buf += put_num(buf, 0, v[0], 0); + } else { + buf += put_str(buf, "["); + for (i = 0; i < len; i++) { + buf += put_num(buf, 0, v[i], 0); + if (i < len - 1) { + buf += put_str(buf, ","); + } + } + buf += put_str(buf, "]"); + } + if (compress) { // RLE + for (t = c + 1; t < mi_cols; ++t) { + insp_mi_data *next_mi = &frame_data.mi_grid[r * mi_cols + t]; + int16_t *nv = (int16_t *)(((int8_t *)next_mi) + offset); + int same = 0; + if (len == 0) { + same = v[0] == nv[0]; + } else { + for (i = 0; i < len; i++) { + same = v[i] == nv[i]; + if (!same) { + break; + } + } + } + if (!same) { + break; + } + } + if (t - c > 1) { + *(buf++) = ','; + buf += put_num(buf, '[', t - c - 1, ']'); + c = t - 1; + } + } + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + +#if CONFIG_ACCOUNTING +int put_accounting(char *buffer) { + char *buf = buffer; + int i; + const Accounting *accounting = frame_data.accounting; + if (accounting == NULL) { + printf("XXX\n"); + 
return 0; + } + const int num_syms = accounting->syms.num_syms; + const int num_strs = accounting->syms.dictionary.num_strs; + buf += put_str(buf, " \"symbolsMap\": ["); + for (i = 0; i < num_strs; i++) { + buf += snprintf(buf, MAX_BUFFER, "\"%s\"", + accounting->syms.dictionary.strs[i]); + if (i < num_strs - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + buf += put_str(buf, " \"symbols\": [\n "); + AccountingSymbolContext context; + context.x = -2; + context.y = -2; + AccountingSymbol *sym; + for (i = 0; i < num_syms; i++) { + sym = &accounting->syms.syms[i]; + if (memcmp(&context, &sym->context, sizeof(AccountingSymbolContext)) != 0) { + buf += put_num(buf, '[', sym->context.x, 0); + buf += put_num(buf, ',', sym->context.y, ']'); + } else { + buf += put_num(buf, '[', sym->id, 0); + buf += put_num(buf, ',', sym->bits, 0); + buf += put_num(buf, ',', sym->samples, ']'); + } + context = sym->context; + if (i < num_syms - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} +#endif + +int skip_non_transform = 0; + +void inspect(void *pbi, void *data) { + /* Fetch frame data. */ + ifd_inspect(&frame_data, pbi, skip_non_transform); + + // Show existing frames just show a reference buffer we've already decoded. + // There's no information to show. + if (frame_data.show_existing_frame) return; + + (void)data; + // We allocate enough space and hope we don't write out of bounds. Totally + // unsafe but this speeds things up, especially when compiled to Javascript. + char *buffer = aom_malloc(MAX_BUFFER); + char *buf = buffer; + buf += put_str(buf, "{\n"); + if (layers & BLOCK_SIZE_LAYER) { + buf += put_block_info(buf, block_size_map, "blockSize", + offsetof(insp_mi_data, sb_type), 0); + } + if (layers & TRANSFORM_SIZE_LAYER) { + buf += put_block_info(buf, tx_size_map, "transformSize", + offsetof(insp_mi_data, tx_size), 0); + } + if (layers & TRANSFORM_TYPE_LAYER) { + buf += put_block_info(buf, tx_type_map, "transformType", + offsetof(insp_mi_data, tx_type), 0); + } + if (layers & DUAL_FILTER_LAYER) { + buf += put_block_info(buf, dual_filter_map, "dualFilterType", + offsetof(insp_mi_data, dual_filter_type), 0); + } + if (layers & MODE_LAYER) { + buf += put_block_info(buf, prediction_mode_map, "mode", + offsetof(insp_mi_data, mode), 0); + } + if (layers & UV_MODE_LAYER) { + buf += put_block_info(buf, uv_prediction_mode_map, "uv_mode", + offsetof(insp_mi_data, uv_mode), 0); + } + if (layers & MOTION_MODE_LAYER) { + buf += put_block_info(buf, motion_mode_map, "motion_mode", + offsetof(insp_mi_data, motion_mode), 0); + } + if (layers & COMPOUND_TYPE_LAYER) { + buf += put_block_info(buf, compound_type_map, "compound_type", + offsetof(insp_mi_data, compound_type), 0); + } + if (layers & SKIP_LAYER) { + buf += + put_block_info(buf, skip_map, "skip", offsetof(insp_mi_data, skip), 0); + } + if (layers & FILTER_LAYER) { + buf += + put_block_info(buf, NULL, "filter", offsetof(insp_mi_data, filter), 2); + } + if (layers & CDEF_LAYER) { + buf += put_block_info(buf, NULL, "cdef_level", + offsetof(insp_mi_data, cdef_level), 0); + buf += put_block_info(buf, NULL, "cdef_strength", + offsetof(insp_mi_data, cdef_strength), 0); + } + if (layers & CFL_LAYER) { + buf += put_block_info(buf, NULL, "cfl_alpha_idx", + offsetof(insp_mi_data, cfl_alpha_idx), 0); + buf += put_block_info(buf, NULL, "cfl_alpha_sign", + offsetof(insp_mi_data, cfl_alpha_sign), 0); + } + if (layers & Q_INDEX_LAYER) { + buf += put_block_info(buf, NULL, "delta_q", + offsetof(insp_mi_data, current_qindex), 0); 
+ } + if (layers & SEGMENT_ID_LAYER) { + buf += put_block_info(buf, NULL, "seg_id", + offsetof(insp_mi_data, segment_id), 0); + } + if (layers & MOTION_VECTORS_LAYER) { + buf += put_motion_vectors(buf); + } + if (layers & INTRABC_LAYER) { + buf += put_block_info(buf, intrabc_map, "intrabc", + offsetof(insp_mi_data, intrabc), 0); + } + if (layers & PALETTE_LAYER) { + buf += put_block_info(buf, palette_map, "palette", + offsetof(insp_mi_data, palette), 0); + } + if (layers & UV_PALETTE_LAYER) { + buf += put_block_info(buf, palette_map, "uv_palette", + offsetof(insp_mi_data, uv_palette), 0); + } + if (combined_parm_count > 0) buf += put_combined(buf); + if (layers & REFERENCE_FRAME_LAYER) { + buf += put_block_info(buf, refs_map, "referenceFrame", + offsetof(insp_mi_data, ref_frame), 2); + } +#if CONFIG_ACCOUNTING + if (layers & ACCOUNTING_LAYER) { + buf += put_accounting(buf); + } +#endif + buf += + snprintf(buf, MAX_BUFFER, " \"frame\": %d,\n", frame_data.frame_number); + buf += snprintf(buf, MAX_BUFFER, " \"showFrame\": %d,\n", + frame_data.show_frame); + buf += snprintf(buf, MAX_BUFFER, " \"frameType\": %d,\n", + frame_data.frame_type); + buf += snprintf(buf, MAX_BUFFER, " \"baseQIndex\": %d,\n", + frame_data.base_qindex); + buf += snprintf(buf, MAX_BUFFER, " \"tileCols\": %d,\n", + frame_data.tile_mi_cols); + buf += snprintf(buf, MAX_BUFFER, " \"tileRows\": %d,\n", + frame_data.tile_mi_rows); + buf += snprintf(buf, MAX_BUFFER, " \"deltaQPresentFlag\": %d,\n", + frame_data.delta_q_present_flag); + buf += snprintf(buf, MAX_BUFFER, " \"deltaQRes\": %d,\n", + frame_data.delta_q_res); + buf += put_str(buf, " \"config\": {"); + buf += put_map(buf, config_map); + buf += put_str(buf, "},\n"); + buf += put_str(buf, " \"configString\": \""); + buf += put_str_with_escape(buf, aom_codec_build_config()); + buf += put_str(buf, "\"\n"); + decoded_frame_count++; + buf += put_str(buf, "},\n"); + *(buf++) = 0; + on_frame_decoded_dump(buffer); + aom_free(buffer); +} + +void ifd_init_cb() { + aom_inspect_init ii; + ii.inspect_cb = inspect; + ii.inspect_ctx = NULL; + aom_codec_control(&codec, AV1_SET_INSPECTION_CALLBACK, &ii); +} + +EMSCRIPTEN_KEEPALIVE +int open_file(char *file) { + if (file == NULL) { + // The JS analyzer puts the .ivf file at this location. + file = "/tmp/input.ivf"; + } + reader = aom_video_reader_open(file); + if (!reader) die("Failed to open %s for reading.", file); + info = aom_video_reader_get_info(reader); + const AvxInterface *decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + fprintf(stderr, "Using %s\n", + aom_codec_iface_name(decoder->codec_interface())); + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + ifd_init(&frame_data, info->frame_width, info->frame_height); + ifd_init_cb(); + return EXIT_SUCCESS; +} + +Av1DecodeReturn adr; +int have_frame = 0; +const unsigned char *frame; +const unsigned char *end_frame; +size_t frame_size = 0; + +EMSCRIPTEN_KEEPALIVE +int read_frame() { + img = NULL; + + // This loop skips over any frames that are show_existing_frames, as + // there is nothing to analyze. 
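The do/while loop that follows implements that skip. As a standalone sketch of the buffer-advance contract the loop relies on (illustrative only, assuming the AV1-specific convention used in this file: aom_codec_decode() fills an Av1DecodeReturn whose buf member points at the first byte it did not consume):

#include "aom/aom_decoder.h"
#include "aom/aomdx.h"

// Illustrative: decode every frame packed into one temporal-unit buffer.
static void drain_temporal_unit(aom_codec_ctx_t *ctx, const unsigned char *buf,
                                size_t sz) {
  const unsigned char *p = buf;
  const unsigned char *end = buf + sz;
  while (p < end) {
    Av1DecodeReturn ret;
    if (aom_codec_decode(ctx, p, (unsigned int)(end - p), &ret)) break;
    p = ret.buf;  // the decoder consumed the bytes in [p, ret.buf)
  }
}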
+ do { + if (!have_frame) { + if (!aom_video_reader_read_frame(reader)) return EXIT_FAILURE; + frame = aom_video_reader_get_frame(reader, &frame_size); + + have_frame = 1; + end_frame = frame + frame_size; + } + + if (aom_codec_decode(&codec, frame, (unsigned int)frame_size, &adr) != + AOM_CODEC_OK) { + die_codec(&codec, "Failed to decode frame."); + } + + frame = adr.buf; + if (frame == end_frame) have_frame = 0; + } while (adr.show_existing); + + int got_any_frames = 0; + aom_image_t *frame_img; + struct av1_ref_frame ref_dec; + ref_dec.idx = adr.idx; + + // ref_dec.idx is the index to the reference buffer idx to AV1_GET_REFERENCE + // if its -1 the decoder didn't update any reference buffer and the only + // way to see the frame is aom_codec_get_frame. + if (ref_dec.idx == -1) { + aom_codec_iter_t iter = NULL; + img = frame_img = aom_codec_get_frame(&codec, &iter); + ++frame_count; + got_any_frames = 1; + } else if (!aom_codec_control(&codec, AV1_GET_REFERENCE, &ref_dec)) { + img = frame_img = &ref_dec.img; + ++frame_count; + got_any_frames = 1; + } + if (!got_any_frames) { + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + +EMSCRIPTEN_KEEPALIVE +const char *get_aom_codec_build_config() { return aom_codec_build_config(); } + +EMSCRIPTEN_KEEPALIVE +int get_bit_depth() { return img->bit_depth; } + +EMSCRIPTEN_KEEPALIVE +int get_bits_per_sample() { return img->bps; } + +EMSCRIPTEN_KEEPALIVE +int get_image_format() { return img->fmt; } + +EMSCRIPTEN_KEEPALIVE +unsigned char *get_plane(int plane) { return img->planes[plane]; } + +EMSCRIPTEN_KEEPALIVE +int get_plane_stride(int plane) { return img->stride[plane]; } + +EMSCRIPTEN_KEEPALIVE +int get_plane_width(int plane) { return aom_img_plane_width(img, plane); } + +EMSCRIPTEN_KEEPALIVE +int get_plane_height(int plane) { return aom_img_plane_height(img, plane); } + +EMSCRIPTEN_KEEPALIVE +int get_frame_width() { return info->frame_width; } + +EMSCRIPTEN_KEEPALIVE +int get_frame_height() { return info->frame_height; } + +static void parse_args(char **argv) { + char **argi, **argj; + struct arg arg; + (void)dump_accounting_arg; + (void)dump_cdef_arg; + for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { + arg.argv_step = 1; + if (arg_match(&arg, &dump_block_size_arg, argi)) layers |= BLOCK_SIZE_LAYER; +#if CONFIG_ACCOUNTING + else if (arg_match(&arg, &dump_accounting_arg, argi)) + layers |= ACCOUNTING_LAYER; +#endif + else if (arg_match(&arg, &dump_transform_size_arg, argi)) + layers |= TRANSFORM_SIZE_LAYER; + else if (arg_match(&arg, &dump_transform_type_arg, argi)) + layers |= TRANSFORM_TYPE_LAYER; + else if (arg_match(&arg, &dump_mode_arg, argi)) + layers |= MODE_LAYER; + else if (arg_match(&arg, &dump_uv_mode_arg, argi)) + layers |= UV_MODE_LAYER; + else if (arg_match(&arg, &dump_motion_mode_arg, argi)) + layers |= MOTION_MODE_LAYER; + else if (arg_match(&arg, &dump_compound_type_arg, argi)) + layers |= COMPOUND_TYPE_LAYER; + else if (arg_match(&arg, &dump_skip_arg, argi)) + layers |= SKIP_LAYER; + else if (arg_match(&arg, &dump_filter_arg, argi)) + layers |= FILTER_LAYER; + else if (arg_match(&arg, &dump_cdef_arg, argi)) + layers |= CDEF_LAYER; + else if (arg_match(&arg, &dump_cfl_arg, argi)) + layers |= CFL_LAYER; + else if (arg_match(&arg, &dump_reference_frame_arg, argi)) + layers |= REFERENCE_FRAME_LAYER; + else if (arg_match(&arg, &dump_motion_vectors_arg, argi)) + layers |= MOTION_VECTORS_LAYER; + else if (arg_match(&arg, &dump_dual_filter_type_arg, argi)) + layers |= DUAL_FILTER_LAYER; + else if (arg_match(&arg, 
&dump_delta_q_arg, argi)) + layers |= Q_INDEX_LAYER; + else if (arg_match(&arg, &dump_seg_id_arg, argi)) + layers |= SEGMENT_ID_LAYER; + else if (arg_match(&arg, &dump_intrabc_arg, argi)) + layers |= INTRABC_LAYER; + else if (arg_match(&arg, &dump_palette_arg, argi)) + layers |= PALETTE_LAYER; + else if (arg_match(&arg, &dump_uv_palette_arg, argi)) + layers |= UV_PALETTE_LAYER; + else if (arg_match(&arg, &dump_all_arg, argi)) + layers |= ALL_LAYERS; + else if (arg_match(&arg, &compress_arg, argi)) + compress = 1; + else if (arg_match(&arg, &usage_arg, argi)) + usage_exit(); + else if (arg_match(&arg, &limit_arg, argi)) + stop_after = arg_parse_uint(&arg); + else if (arg_match(&arg, &skip_non_transform_arg, argi)) + skip_non_transform = arg_parse_uint(&arg); + else if (arg_match(&arg, &combined_arg, argi)) + convert_to_indices( + (char *)arg.val, combined_parm_list, + sizeof(combined_parm_list) / sizeof(combined_parm_list[0]), + &combined_parm_count); + else + argj++; + } +} + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s src_filename \n", exec_name); + fprintf(stderr, "\nOptions:\n"); + arg_show_usage(stderr, main_args); + exit(EXIT_FAILURE); +} + +EMSCRIPTEN_KEEPALIVE +int main(int argc, char **argv) { + exec_name = argv[0]; + parse_args(argv); + if (argc >= 2) { + open_file(argv[1]); + printf("[\n"); + while (1) { + if (stop_after && (decoded_frame_count >= stop_after)) break; + if (read_frame()) break; + } + printf("null\n"); + printf("]"); + } else { + usage_exit(); + } +} + +EMSCRIPTEN_KEEPALIVE +void quit() { + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + aom_video_reader_close(reader); +} + +EMSCRIPTEN_KEEPALIVE +void set_layers(LayerType v) { layers = v; } + +EMSCRIPTEN_KEEPALIVE +void set_compress(int v) { compress = v; } diff --git a/libs/libaom/src/examples/lightfield_bitstream_parsing.c b/libs/libaom/src/examples/lightfield_bitstream_parsing.c new file mode 100644 index 000000000..ffcbcb9cb --- /dev/null +++ b/libs/libaom/src/examples/lightfield_bitstream_parsing.c @@ -0,0 +1,414 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Lightfield Bitstream Parsing +// ============================ +// +// This is a lightfield bitstream parsing example. It takes an input file +// containing the whole compressed lightfield bitstream(ivf file) and a text +// file containing a stream of tiles to decode and then constructs and outputs +// a new bitstream that can be decoded by an AV1 decoder. The output bitstream +// contains reference frames(i.e. anchor frames), camera frame header, and +// tile list OBUs. num_references is the number of anchor frames coded at the +// beginning of the light field file. After running the lightfield encoder, +// run lightfield bitstream parsing: +// examples/lightfield_bitstream_parsing vase10x10.ivf vase_tile_list.ivf 4 +// tile_list.txt +// +// The tile_list.txt is expected to be of the form: +// Frame +// +// +// ... 
+// Frame +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aom_encoder.h" +#include "aom/aom_integer.h" +#include "aom/aomdx.h" +#include "aom_dsp/bitwriter_buffer.h" +#include "common/tools_common.h" +#include "common/video_reader.h" +#include "common/video_writer.h" + +#define MAX_TILES 512 + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s \n", + exec_name); + exit(EXIT_FAILURE); +} + +#define ALIGN_POWER_OF_TWO(value, n) \ + (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) + +const int output_frame_width = 512; +const int output_frame_height = 512; + +// Spec: +// typedef struct { +// uint8_t anchor_frame_idx; +// uint8_t tile_row; +// uint8_t tile_col; +// uint16_t coded_tile_data_size_minus_1; +// uint8_t *coded_tile_data; +// } TILE_LIST_ENTRY; + +// Tile list entry provided by the application +typedef struct { + int image_idx; + int reference_idx; + int tile_col; + int tile_row; +} TILE_LIST_INFO; + +static int get_image_bps(aom_img_fmt_t fmt) { + switch (fmt) { + case AOM_IMG_FMT_I420: return 12; + case AOM_IMG_FMT_I422: return 16; + case AOM_IMG_FMT_I444: return 24; + case AOM_IMG_FMT_I42016: return 24; + case AOM_IMG_FMT_I42216: return 32; + case AOM_IMG_FMT_I44416: return 48; + default: die("Invalid image format"); + } + return 0; +} + +void process_tile_list(const TILE_LIST_INFO *tiles, int num_tiles, + aom_codec_pts_t tl_pts, unsigned char **frames, + const size_t *frame_sizes, aom_codec_ctx_t *codec, + unsigned char *tl_buf, AvxVideoWriter *writer, + uint8_t output_frame_width_in_tiles_minus_1, + uint8_t output_frame_height_in_tiles_minus_1) { + unsigned char *tl = tl_buf; + struct aom_write_bit_buffer wb = { tl, 0 }; + unsigned char *saved_obu_size_loc = NULL; + uint32_t tile_list_obu_header_size = 0; + uint32_t tile_list_obu_size = 0; + int num_tiles_minus_1 = num_tiles - 1; + int i; + + // Write the tile list OBU header that is 1 byte long. + aom_wb_write_literal(&wb, 0, 1); // forbidden bit. + aom_wb_write_literal(&wb, 8, 4); // tile list OBU: "1000" + aom_wb_write_literal(&wb, 0, 1); // obu_extension = 0 + aom_wb_write_literal(&wb, 1, 1); // obu_has_size_field + aom_wb_write_literal(&wb, 0, 1); // reserved + tl++; + tile_list_obu_header_size++; + + // Write the OBU size using a fixed length_field_size of 4 bytes. + saved_obu_size_loc = tl; + // aom_wb_write_unsigned_literal(&wb, data, bits) requires that bits <= 32. + aom_wb_write_unsigned_literal(&wb, 0, 32); + tl += 4; + tile_list_obu_header_size += 4; + + // write_tile_list_obu() + aom_wb_write_literal(&wb, output_frame_width_in_tiles_minus_1, 8); + aom_wb_write_literal(&wb, output_frame_height_in_tiles_minus_1, 8); + aom_wb_write_literal(&wb, num_tiles_minus_1, 16); + tl += 4; + tile_list_obu_size += 4; + + // Write each tile's data + for (i = 0; i <= num_tiles_minus_1; i++) { + aom_tile_data tile_data = { 0, NULL, 0 }; + + int image_idx = tiles[i].image_idx; + int ref_idx = tiles[i].reference_idx; + int tc = tiles[i].tile_col; + int tr = tiles[i].tile_row; + + // Reset bit writer to the right location. 
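An aside on the header byte just assembled: the five fields are written most-significant-bit first, so the whole byte can be computed in one expression (make_obu_header_byte is an illustrative helper, not an aom API):

#include <stdint.h>

// forbidden(1) | obu_type(4) | extension(1) | has_size_field(1) | reserved(1)
static uint8_t make_obu_header_byte(int obu_type, int has_size_field) {
  return (uint8_t)(((obu_type & 0xF) << 3) | ((has_size_field & 1) << 1));
}
// make_obu_header_byte(8, 1) == 0x42, i.e. a tile list OBU with a size
// field, matching the five aom_wb_write_literal() calls above.

With that in mind, the bit writer is repositioned for each tile as the comment above says: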
+ wb.bit_buffer = tl; + wb.bit_offset = 0; + + size_t frame_size = frame_sizes[image_idx]; + const unsigned char *frame = frames[image_idx]; + + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr); + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_COL, tc); + + aom_codec_err_t aom_status = + aom_codec_decode(codec, frame, frame_size, NULL); + if (aom_status) die_codec(codec, "Failed to decode tile."); + + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_DATA, &tile_data); + + // Copy over tile info. + // uint8_t anchor_frame_idx; + // uint8_t tile_row; + // uint8_t tile_col; + // uint16_t coded_tile_data_size_minus_1; + // uint8_t *coded_tile_data; + uint32_t tile_info_bytes = 5; + aom_wb_write_literal(&wb, ref_idx, 8); + aom_wb_write_literal(&wb, tr, 8); + aom_wb_write_literal(&wb, tc, 8); + aom_wb_write_literal(&wb, (int)tile_data.coded_tile_data_size - 1, 16); + tl += tile_info_bytes; + + memcpy(tl, (uint8_t *)tile_data.coded_tile_data, + tile_data.coded_tile_data_size); + tl += tile_data.coded_tile_data_size; + + tile_list_obu_size += + tile_info_bytes + (uint32_t)tile_data.coded_tile_data_size; + } + + // Write tile list OBU size. + size_t bytes_written = 0; + if (aom_uleb_encode_fixed_size(tile_list_obu_size, 4, 4, saved_obu_size_loc, + &bytes_written)) + die_codec(codec, "Failed to encode the tile list obu size."); + + // Copy the tile list. + if (!aom_video_writer_write_frame( + writer, tl_buf, tile_list_obu_header_size + tile_list_obu_size, + tl_pts)) + die_codec(codec, "Failed to copy compressed tile list."); +} + +int main(int argc, char **argv) { + aom_codec_ctx_t codec; + AvxVideoReader *reader = NULL; + AvxVideoWriter *writer = NULL; + const AvxInterface *decoder = NULL; + const AvxVideoInfo *info = NULL; + int num_references; + int i; + aom_codec_pts_t pts; + const char *tile_list_file = NULL; + + exec_name = argv[0]; + if (argc != 5) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + num_references = (int)strtol(argv[3], NULL, 0); + info = aom_video_reader_get_info(reader); + + aom_video_reader_set_fourcc(reader, AV1_FOURCC); + + // The writer to write out ivf file in tile list OBU, which can be decoded by + // AV1 decoder. + writer = aom_video_writer_open(argv[2], kContainerIVF, info); + if (!writer) die("Failed to open %s for writing", argv[2]); + + tile_list_file = argv[4]; + + decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + // Decode anchor frames. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0); + + printf("Reading %d reference images.\n", num_references); + for (i = 0; i < num_references; ++i) { + aom_video_reader_read_frame(reader); + + size_t frame_size = 0; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + pts = (aom_codec_pts_t)aom_video_reader_get_frame_pts(reader); + + // Copy references bitstream directly. + if (!aom_video_writer_write_frame(writer, frame, frame_size, pts)) + die_codec(&codec, "Failed to copy compressed anchor frame."); + + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + } + + // Decode camera frames. 
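One more note on process_tile_list() before the camera frames are handled: the OBU payload length is unknown until every tile has been appended, so a fixed-width LEB128 slot is reserved at saved_obu_size_loc and overwritten at the end. A minimal sketch of that pattern (patch_obu_size is a hypothetical name):

#include <stddef.h>

#include "aom/aom_integer.h"

// Encode payload_size into exactly 4 LEB128 bytes at size_slot, padding
// with continuation bits; returns 0 on success, -1 if the value cannot fit.
static int patch_obu_size(uint8_t *size_slot, uint32_t payload_size) {
  size_t written = 0;
  if (aom_uleb_encode_fixed_size(payload_size, 4, 4, size_slot, &written))
    return -1;
  return 0;  // written == 4 here
}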
+ AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_EXT_TILE_DEBUG, 1); + + FILE *infile = aom_video_reader_get_file(reader); + // Record the offset of the first camera image. + const FileOffset camera_frame_pos = ftello(infile); + + printf("Loading compressed frames into memory.\n"); + + // Count the frames in the lightfield. + int num_frames = 0; + while (aom_video_reader_read_frame(reader)) { + ++num_frames; + } + if (num_frames < 1) die("Input light field has no frames."); + + // Read all of the lightfield frames into memory. + unsigned char **frames = + (unsigned char **)malloc(num_frames * sizeof(unsigned char *)); + size_t *frame_sizes = (size_t *)malloc(num_frames * sizeof(size_t)); + // Seek to the first camera image. + fseeko(infile, camera_frame_pos, SEEK_SET); + for (int f = 0; f < num_frames; ++f) { + aom_video_reader_read_frame(reader); + size_t frame_size = 0; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + frames[f] = (unsigned char *)malloc(frame_size * sizeof(unsigned char)); + memcpy(frames[f], frame, frame_size); + frame_sizes[f] = frame_size; + } + printf("Read %d frames.\n", num_frames); + + // Copy first camera frame for getting camera frame header. This is done + // only once. + { + size_t frame_size = frame_sizes[0]; + const unsigned char *frame = frames[0]; + pts = num_references; + aom_tile_data frame_header_info = { 0, NULL, 0 }; + + // Need to decode frame header to get camera frame header info. So, here + // decoding 1 tile is enough. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_DECODE_TILE_ROW, 0); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_DECODE_TILE_COL, 0); + + aom_codec_err_t aom_status = + aom_codec_decode(&codec, frame, frame_size, NULL); + if (aom_status) die_codec(&codec, "Failed to decode tile."); + + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_HEADER_INFO, + &frame_header_info); + + size_t obu_size_offset = + (uint8_t *)frame_header_info.coded_tile_data - frame; + size_t length_field_size = frame_header_info.coded_tile_data_size; + // Remove ext-tile tile info. + uint32_t frame_header_size = (uint32_t)frame_header_info.extra_size - 1; + size_t bytes_to_copy = + obu_size_offset + length_field_size + frame_header_size; + + unsigned char *frame_hdr_buf = (unsigned char *)malloc(bytes_to_copy); + if (frame_hdr_buf == NULL) + die_codec(&codec, "Failed to allocate frame header buffer."); + + memcpy(frame_hdr_buf, frame, bytes_to_copy); + + // Update frame header OBU size. + size_t bytes_written = 0; + if (aom_uleb_encode_fixed_size( + frame_header_size, length_field_size, length_field_size, + frame_hdr_buf + obu_size_offset, &bytes_written)) + die_codec(&codec, "Failed to encode the tile list obu size."); + + // Copy camera frame header bitstream. + if (!aom_video_writer_write_frame(writer, frame_hdr_buf, bytes_to_copy, + pts)) + die_codec(&codec, "Failed to copy compressed camera frame header."); + free(frame_hdr_buf); + } + + // Read out the image format. + aom_img_fmt_t ref_fmt = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) + die_codec(&codec, "Failed to get the image format"); + const int bps = get_image_bps(ref_fmt); + if (!bps) die_codec(&codec, "Invalid image format."); + // read out the tile size. 
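A note on the encoding before the read below: AV1D_GET_TILE_SIZE reports both tile dimensions packed into one 32-bit value, which the code splits exactly as in this sketch; ALIGN_POWER_OF_TWO(value, 5) then rounds each dimension up to a multiple of 32 when sizing the tile-list buffer (unpack_tile_size is an illustrative name):

// Width travels in the high 16 bits, height in the low 16 bits.
static void unpack_tile_size(unsigned int tile_size, unsigned int *w,
                             unsigned int *h) {
  *w = tile_size >> 16;
  *h = tile_size & 0xFFFF;
}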
+ unsigned int tile_size = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_TILE_SIZE, &tile_size)) + die_codec(&codec, "Failed to get the tile size"); + const unsigned int tile_width = tile_size >> 16; + const unsigned int tile_height = tile_size & 65535; + // Allocate a buffer to store tile list bitstream. + const size_t data_sz = MAX_TILES * ALIGN_POWER_OF_TWO(tile_width, 5) * + ALIGN_POWER_OF_TWO(tile_height, 5) * bps / 8; + + unsigned char *tl_buf = (unsigned char *)malloc(data_sz); + if (tl_buf == NULL) die_codec(&codec, "Failed to allocate tile list buffer."); + + aom_codec_pts_t tl_pts = num_references; + const uint8_t output_frame_width_in_tiles_minus_1 = + output_frame_width / tile_width - 1; + const uint8_t output_frame_height_in_tiles_minus_1 = + output_frame_height / tile_height - 1; + + printf("Reading tile list from file.\n"); + char line[1024]; + FILE *tile_list_fptr = fopen(tile_list_file, "r"); + if (!tile_list_fptr) die_codec(&codec, "Failed to open tile list file."); + int num_tiles = 0; + TILE_LIST_INFO tiles[MAX_TILES]; + while ((fgets(line, 1024, tile_list_fptr)) != NULL) { + if (line[0] == 'F' || num_tiles >= MAX_TILES) { + // Flush existing tile list and start another, either because we hit a + // new render frame or because we've hit our max number of tiles per list. + if (num_tiles > 0) { + process_tile_list(tiles, num_tiles, tl_pts, frames, frame_sizes, &codec, + tl_buf, writer, output_frame_width_in_tiles_minus_1, + output_frame_height_in_tiles_minus_1); + ++tl_pts; + } + num_tiles = 0; + } + if (line[0] == 'F') { + continue; + } + if (sscanf(line, "%d %d %d %d", &tiles[num_tiles].image_idx, + &tiles[num_tiles].reference_idx, &tiles[num_tiles].tile_col, + &tiles[num_tiles].tile_row) == 4) { + if (tiles[num_tiles].image_idx >= num_frames) { + die("Tile list image_idx out of bounds: %d >= %d.", + tiles[num_tiles].image_idx, num_frames); + } + if (tiles[num_tiles].reference_idx >= num_references) { + die("Tile list reference_idx out of bounds: %d >= %d.", + tiles[num_tiles].reference_idx, num_references); + } + ++num_tiles; + } + } + if (num_tiles > 0) { + // Flush out the last tile list. + process_tile_list(tiles, num_tiles, tl_pts, frames, frame_sizes, &codec, + tl_buf, writer, output_frame_width_in_tiles_minus_1, + output_frame_height_in_tiles_minus_1); + ++tl_pts; + } + + const int num_tile_lists = (int)(tl_pts - pts); + printf("Finished processing tile lists. Num tile lists: %d.\n", + num_tile_lists); + free(tl_buf); + for (int f = 0; f < num_frames; ++f) { + free(frames[f]); + } + free(frame_sizes); + free(frames); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + aom_video_writer_close(writer); + aom_video_reader_close(reader); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/lightfield_decoder.c b/libs/libaom/src/examples/lightfield_decoder.c new file mode 100644 index 000000000..a292e9c75 --- /dev/null +++ b/libs/libaom/src/examples/lightfield_decoder.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +// Lightfield Decoder +// ================== +// +// This is an example of a simple lightfield decoder. It builds upon the +// simple_decoder.c example. It takes an input file containing the compressed +// data (in ivf format), treating it as a lightfield instead of a video; and a +// text file with a list of tiles to decode. There is an optional parameter +// allowing to choose the output format, and the supported formats are +// YUV1D(default), YUV, and NV12. +// After running the lightfield encoder, run lightfield decoder to decode a +// batch of tiles: +// examples/lightfield_decoder vase10x10.ivf vase_reference.yuv 4 tile_list.txt +// 0(optional) +// The tile_list.txt is expected to be of the form: +// Frame +// +// +// ... +// Frame +#include +#include + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "aom_scale/yv12config.h" +#include "av1/common/enums.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s \n", + exec_name); + exit(EXIT_FAILURE); +} + +// Output frame size +const int output_frame_width = 512; +const int output_frame_height = 512; + +static void aom_img_copy_tile(const aom_image_t *src, const aom_image_t *dst, + int dst_row_offset, int dst_col_offset) { + const int shift = (src->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0; + int plane; + + for (plane = 0; plane < 3; ++plane) { + const unsigned char *src_buf = src->planes[plane]; + const int src_stride = src->stride[plane]; + unsigned char *dst_buf = dst->planes[plane]; + const int dst_stride = dst->stride[plane]; + const int roffset = + (plane > 0) ? dst_row_offset >> dst->y_chroma_shift : dst_row_offset; + const int coffset = + (plane > 0) ? dst_col_offset >> dst->x_chroma_shift : dst_col_offset; + + // col offset needs to be adjusted for HBD. + dst_buf += roffset * dst_stride + (coffset << shift); + + const int w = (aom_img_plane_width(src, plane) << shift); + const int h = aom_img_plane_height(src, plane); + int y; + + for (y = 0; y < h; ++y) { + memcpy(dst_buf, src_buf, w); + src_buf += src_stride; + dst_buf += dst_stride; + } + } +} + +void decode_tile(aom_codec_ctx_t *codec, const unsigned char *frame, + size_t frame_size, int tr, int tc, int ref_idx, + aom_image_t *reference_images, aom_image_t *output, + int *tile_idx, unsigned int *output_bit_depth, + aom_image_t **img_ptr, int output_format) { + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_TILE_MODE, 1); + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_EXT_TILE_DEBUG, 1); + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_ROW, tr); + AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_DECODE_TILE_COL, tc); + + av1_ref_frame_t ref; + ref.idx = 0; + ref.use_external_ref = 1; + ref.img = reference_images[ref_idx]; + if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1_SET_REFERENCE, &ref)) { + die_codec(codec, "Failed to set reference frame."); + } + + aom_codec_err_t aom_status = aom_codec_decode(codec, frame, frame_size, NULL); + if (aom_status) die_codec(codec, "Failed to decode tile."); + + aom_codec_iter_t iter = NULL; + aom_image_t *img = aom_codec_get_frame(codec, &iter); + if (!img) die_codec(codec, "Failed to get frame."); + *img_ptr = img; + + // aom_img_alloc() sets bit_depth as follows: + // output->bit_depth = (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8; + // Use img->bit_depth(read from bitstream), so that aom_shift_img() + // works as expected. 
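A short illustration of the comment above (not part of the patch): allocation can only infer storage depth from the format flag, so a 10-bit stream still reports 16 there, and only img->bit_depth carries the value aom_shift_img() needs.

#include "aom/aom_image.h"

// What aom_img_alloc() alone can know: storage depth, not coded depth.
static unsigned int alloc_time_bit_depth(aom_img_fmt_t fmt) {
  return (fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
}
// For a 10-bit stream: alloc_time_bit_depth() == 16, img->bit_depth == 10.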
+ output->bit_depth = img->bit_depth; + *output_bit_depth = img->bit_depth; + + if (output_format != YUV1D) { + // read out the tile size. + unsigned int tile_size = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_SIZE, &tile_size)) + die_codec(codec, "Failed to get the tile size"); + const unsigned int tile_width = tile_size >> 16; + const unsigned int tile_height = tile_size & 65535; + const uint32_t output_frame_width_in_tiles = + output_frame_width / tile_width; + + // Copy the tile to the output frame. + const int row_offset = + (*tile_idx / output_frame_width_in_tiles) * tile_height; + const int col_offset = + (*tile_idx % output_frame_width_in_tiles) * tile_width; + + aom_img_copy_tile(img, output, row_offset, col_offset); + (*tile_idx)++; + } +} + +static void img_write_to_file(const aom_image_t *img, FILE *file, + int output_format) { + if (output_format == YUV) + aom_img_write(img, file); + else if (output_format == NV12) + aom_img_write_nv12(img, file); + else + die("Invalid output format"); +} + +int main(int argc, char **argv) { + FILE *outfile = NULL; + aom_codec_ctx_t codec; + AvxVideoReader *reader = NULL; + const AvxInterface *decoder = NULL; + const AvxVideoInfo *info = NULL; + int num_references; + aom_img_fmt_t ref_fmt = 0; + aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; + aom_image_t output; + aom_image_t *output_shifted = NULL; + size_t frame_size = 0; + const unsigned char *frame = NULL; + int i, j; + const char *tile_list_file = NULL; + int output_format = YUV1D; + exec_name = argv[0]; + + if (argc < 5) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + num_references = (int)strtol(argv[3], NULL, 0); + tile_list_file = argv[4]; + + if (argc > 5) output_format = (int)strtol(argv[5], NULL, 0); + if (output_format < YUV1D || output_format > NV12) + die("Output format out of range [0, 2]"); + + info = aom_video_reader_get_info(reader); + + if (info->codec_fourcc == LST_FOURCC) + decoder = get_aom_decoder_by_fourcc(AV1_FOURCC); + else + die("Unknown input codec."); + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB, + info->is_annexb)) { + die("Failed to set annex b status"); + } + + // Decode anchor frames. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0); + for (i = 0; i < num_references; ++i) { + aom_video_reader_read_frame(reader); + frame = aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + + if (i == 0) { + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) + die_codec(&codec, "Failed to get the image format"); + + int frame_res[2]; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_SIZE, frame_res)) + die_codec(&codec, "Failed to get the image frame size"); + + // Allocate memory to store decoded references. Allocate memory with the + // border so that it can be used as a reference. 
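An aside on decode_tile() above: when compositing into a single output frame, the running tile index is mapped to a pixel origin in raster order. A standalone sketch of that arithmetic (tile_origin is an illustrative name):

// Map the n-th decoded tile to its top-left pixel in the output frame.
static void tile_origin(int tile_idx, int tile_w, int tile_h,
                        int frame_w_in_tiles, int *row_off, int *col_off) {
  *row_off = (tile_idx / frame_w_in_tiles) * tile_h;
  *col_off = (tile_idx % frame_w_in_tiles) * tile_w;
}

Back in the patch, the anchor references themselves are allocated with a border so the decoder can use them directly: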
+ for (j = 0; j < num_references; j++) { + unsigned int border = AOM_DEC_BORDER_IN_PIXELS; + if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt, + frame_res[0], frame_res[1], 32, 8, + border)) { + die("Failed to allocate references."); + } + } + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_COPY_NEW_FRAME_IMAGE, + &reference_images[i])) + die_codec(&codec, "Failed to copy decoded reference frame"); + + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + char name[1024]; + snprintf(name, sizeof(name), "ref_%d.yuv", i); + printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h); + FILE *ref_file = fopen(name, "wb"); + aom_img_write(img, ref_file); + fclose(ref_file); + } + } + + FILE *infile = aom_video_reader_get_file(reader); + // Record the offset of the first camera image. + const FileOffset camera_frame_pos = ftello(infile); + + printf("Loading compressed frames into memory.\n"); + + // Count the frames in the lightfield. + int num_frames = 0; + while (aom_video_reader_read_frame(reader)) { + ++num_frames; + } + if (num_frames < 1) die("Input light field has no frames."); + + // Read all of the lightfield frames into memory. + unsigned char **frames = + (unsigned char **)malloc(num_frames * sizeof(unsigned char *)); + size_t *frame_sizes = (size_t *)malloc(num_frames * sizeof(size_t)); + // Seek to the first camera image. + fseeko(infile, camera_frame_pos, SEEK_SET); + for (int f = 0; f < num_frames; ++f) { + aom_video_reader_read_frame(reader); + frame = aom_video_reader_get_frame(reader, &frame_size); + frames[f] = (unsigned char *)malloc(frame_size * sizeof(unsigned char)); + memcpy(frames[f], frame, frame_size); + frame_sizes[f] = frame_size; + } + printf("Read %d frames.\n", num_frames); + + if (output_format != YUV1D) { + // Allocate the output frame. + aom_img_fmt_t out_fmt = ref_fmt; + if (FORCE_HIGHBITDEPTH_DECODING) out_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + if (!aom_img_alloc(&output, out_fmt, output_frame_width, + output_frame_height, 32)) + die("Failed to allocate output image."); + } + + printf("Decoding tile list from file.\n"); + char line[1024]; + FILE *tile_list_fptr = fopen(tile_list_file, "r"); + if (!tile_list_fptr) die_codec(&codec, "Failed to open tile list file."); + int tile_list_cnt = 0; + int tile_list_writes = 0; + int tile_idx = 0; + aom_image_t *out = NULL; + unsigned int output_bit_depth = 0; + + while ((fgets(line, 1024, tile_list_fptr)) != NULL) { + if (line[0] == 'F') { + if (output_format != YUV1D) { + // Write out the tile list. + if (tile_list_cnt) { + out = &output; + if (output_bit_depth != 0) + aom_shift_img(output_bit_depth, &out, &output_shifted); + img_write_to_file(out, outfile, output_format); + tile_list_writes++; + } + + tile_list_cnt++; + tile_idx = 0; + // Then memset the frame. 
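+        // Clearing the composite output keeps tiles from the previous tile
+        // list from leaking into the next one, since the same buffer is
+        // reused for every "Frame" entry in the tile list file.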
+ memset(output.img_data, 0, output.sz); + } + continue; + } + + int image_idx, ref_idx, tc, tr; + sscanf(line, "%d %d %d %d", &image_idx, &ref_idx, &tc, &tr); + if (image_idx >= num_frames) { + die("Tile list image_idx out of bounds: %d >= %d.", image_idx, + num_frames); + } + if (ref_idx >= num_references) { + die("Tile list ref_idx out of bounds: %d >= %d.", ref_idx, + num_references); + } + frame = frames[image_idx]; + frame_size = frame_sizes[image_idx]; + + aom_image_t *img = NULL; + decode_tile(&codec, frame, frame_size, tr, tc, ref_idx, reference_images, + &output, &tile_idx, &output_bit_depth, &img, output_format); + if (output_format == YUV1D) { + out = img; + if (output_bit_depth != 0) + aom_shift_img(output_bit_depth, &out, &output_shifted); + aom_img_write(out, outfile); + } + } + + if (output_format != YUV1D) { + // Write out the last tile list. + if (tile_list_writes < tile_list_cnt) { + out = &output; + if (output_bit_depth != 0) + aom_shift_img(output_bit_depth, &out, &output_shifted); + img_write_to_file(out, outfile, output_format); + } + } + + if (output_shifted) aom_img_free(output_shifted); + if (output_format != YUV1D) aom_img_free(&output); + for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]); + for (int f = 0; f < num_frames; ++f) { + free(frames[f]); + } + free(frame_sizes); + free(frames); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + aom_video_reader_close(reader); + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/lightfield_encoder.c b/libs/libaom/src/examples/lightfield_encoder.c new file mode 100644 index 000000000..e80fe24f6 --- /dev/null +++ b/libs/libaom/src/examples/lightfield_encoder.c @@ -0,0 +1,522 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Lightfield Encoder +// ================== +// +// This is an example of a simple lightfield encoder. It builds upon the +// twopass_encoder.c example. It takes an input file in YV12 format, +// treating it as a planar lightfield instead of a video. The img_width +// and img_height arguments are the dimensions of the lightfield images, +// while the lf_width and lf_height arguments are the number of +// lightfield images in each dimension. The lf_blocksize determines the +// number of reference images used for MCP. For example, 5 means that there +// is a reference image for every 5x5 lightfield image block. All images +// within a block will use the center image in that block as the reference +// image for MCP. +// Run "make test" to download lightfield test data: vase10x10.yuv. +// Run lightfield encoder to encode whole lightfield: +// examples/lightfield_encoder 1024 1024 vase10x10.yuv vase10x10.ivf 10 10 5 + +// Note: In bitstream.c and encoder.c, define EXT_TILE_DEBUG as 1 will print +// out the uncompressed header and the frame contexts, which can be used to +// test the bit exactness of the headers and the frame contexts for large scale +// tile coded frames. 
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s <img_width> <img_height> <infile> <outfile> "
+          "<lf_width> <lf_height> <lf_blocksize>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int img_size_bytes(aom_image_t *img) {
+  int image_size_bytes = 0;
+  int plane;
+  for (plane = 0; plane < 3; ++plane) {
+    const int w = aom_img_plane_width(img, plane) *
+                  ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
+    const int h = aom_img_plane_height(img, plane);
+    image_size_bytes += w * h;
+  }
+  return image_size_bytes;
+}
+
+static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                           aom_codec_pts_t pts, unsigned int duration,
+                           aom_enc_frame_flags_t flags,
+                           aom_fixed_buf_t *stats) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+  if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to get frame stats.");
+
+  while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == AOM_CODEC_STATS_PKT) {
+      const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
+      const size_t pkt_size = pkt->data.twopass_stats.sz;
+      stats->buf = realloc(stats->buf, stats->sz + pkt_size);
+      memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
+      stats->sz += pkt_size;
+    }
+  }
+
+  return got_pkts;
+}
+
+static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                        aom_codec_pts_t pts, unsigned int duration,
+                        aom_enc_frame_flags_t flags, AvxVideoWriter *writer) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+  if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to encode frame.");
+
+  while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+    if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+
+      if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts))
+        die_codec(ctx, "Failed to write compressed frame.");
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
+    }
+  }
+
+  return got_pkts;
+}
+
+static void get_raw_image(aom_image_t **frame_to_encode, aom_image_t *raw,
+                          aom_image_t *raw_shift) {
+  if (FORCE_HIGHBITDEPTH_DECODING) {
+    // Need to allocate larger buffer to use hbd internal.
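+    // aom_img_upshift() copies the 8-bit samples from |raw| into the 16-bit
+    // buffer |raw_shift|; an input_shift of 0 widens the storage without
+    // changing the sample values.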
+    int input_shift = 0;
+    aom_img_upshift(raw_shift, raw, input_shift);
+    *frame_to_encode = raw_shift;
+  } else {
+    *frame_to_encode = raw;
+  }
+}
+
+static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
+                             const AvxInterface *encoder,
+                             const aom_codec_enc_cfg_t *cfg, int lf_width,
+                             int lf_height, int lf_blocksize, int flags,
+                             aom_image_t *raw_shift) {
+  aom_codec_ctx_t codec;
+  int frame_count = 0;
+  int image_size_bytes = img_size_bytes(raw);
+  int u_blocks, v_blocks;
+  int bu, bv;
+  aom_fixed_buf_t stats = { NULL, 0 };
+  aom_image_t *frame_to_encode;
+
+  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, flags))
+    die_codec(&codec, "Failed to initialize encoder");
+  if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0))
+    die_codec(&codec, "Failed to turn off auto altref");
+  if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0))
+    die_codec(&codec, "Failed to set frame parallel decoding");
+
+  // How many reference images we need to encode.
+  u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize;
+  v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize;
+
+  printf("\n First pass: ");
+
+  for (bv = 0; bv < v_blocks; ++bv) {
+    for (bu = 0; bu < u_blocks; ++bu) {
+      const int block_u_min = bu * lf_blocksize;
+      const int block_v_min = bv * lf_blocksize;
+      int block_u_end = (bu + 1) * lf_blocksize;
+      int block_v_end = (bv + 1) * lf_blocksize;
+      int u_block_size, v_block_size;
+      int block_ref_u, block_ref_v;
+
+      block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
+      block_v_end = block_v_end < lf_height ? block_v_end : lf_height;
+      u_block_size = block_u_end - block_u_min;
+      v_block_size = block_v_end - block_v_min;
+      block_ref_u = block_u_min + u_block_size / 2;
+      block_ref_v = block_v_min + v_block_size / 2;
+
+      printf("A%d, ", (block_ref_u + block_ref_v * lf_width));
+      fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes,
+            SEEK_SET);
+      aom_img_read(raw, infile);
+      get_raw_image(&frame_to_encode, raw, raw_shift);
+
+      // Reference frames can be encoded without tiles.
+      ++frame_count;
+      get_frame_stats(&codec, frame_to_encode, frame_count, 1,
+                      AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                          AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+                          AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 |
+                          AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                          AOM_EFLAG_NO_UPD_ARF,
+                      &stats);
+    }
+  }
+
+  if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1))
+    die_codec(&codec, "Failed to set frame parallel decoding");
+
+  for (bv = 0; bv < v_blocks; ++bv) {
+    for (bu = 0; bu < u_blocks; ++bu) {
+      const int block_u_min = bu * lf_blocksize;
+      const int block_v_min = bv * lf_blocksize;
+      int block_u_end = (bu + 1) * lf_blocksize;
+      int block_v_end = (bv + 1) * lf_blocksize;
+      int u, v;
+      block_u_end = block_u_end < lf_width ? block_u_end : lf_width;
+      block_v_end = block_v_end < lf_height ?
block_v_end : lf_height; + for (v = block_v_min; v < block_v_end; ++v) { + for (u = block_u_min; u < block_u_end; ++u) { + printf("C%d, ", (u + v * lf_width)); + fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET); + aom_img_read(raw, infile); + get_raw_image(&frame_to_encode, raw, raw_shift); + + ++frame_count; + get_frame_stats(&codec, frame_to_encode, frame_count, 1, + AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY, + &stats); + } + } + } + } + // Flush encoder. + // No ARF, this should not be needed. + while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) { + } + + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + printf("\nFirst pass complete. Processed %d frames.\n", frame_count); + + return stats; +} + +static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name, + const AvxInterface *encoder, aom_codec_enc_cfg_t *cfg, + int lf_width, int lf_height, int lf_blocksize, int flags, + aom_image_t *raw_shift) { + AvxVideoInfo info = { encoder->fourcc, + cfg->g_w, + cfg->g_h, + { cfg->g_timebase.num, cfg->g_timebase.den }, + 0 }; + AvxVideoWriter *writer = NULL; + aom_codec_ctx_t codec; + int frame_count = 0; + int image_size_bytes = img_size_bytes(raw); + int bu, bv; + int u_blocks, v_blocks; + aom_image_t *frame_to_encode; + aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; + int reference_image_num = 0; + int i; + + writer = aom_video_writer_open(outfile_name, kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing", outfile_name); + + if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, flags)) + die_codec(&codec, "Failed to initialize encoder"); + if (aom_codec_control(&codec, AOME_SET_ENABLEAUTOALTREF, 0)) + die_codec(&codec, "Failed to turn off auto altref"); + if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 0)) + die_codec(&codec, "Failed to set frame parallel decoding"); + if (aom_codec_control(&codec, AV1E_ENABLE_EXT_TILE_DEBUG, 1)) + die_codec(&codec, "Failed to enable encoder ext_tile debug"); + if (aom_codec_control(&codec, AOME_SET_CPUUSED, 1)) + die_codec(&codec, "Failed to set cpu-used"); + + // Note: The superblock is a sequence parameter and has to be the same for 1 + // sequence. In lightfield application, must choose the superblock size(either + // 64x64 or 128x128) before the encoding starts. Otherwise, the default is + // AOM_SUPERBLOCK_SIZE_DYNAMIC, and the superblock size will be set to 64x64 + // internally. + if (aom_codec_control(&codec, AV1E_SET_SUPERBLOCK_SIZE, + AOM_SUPERBLOCK_SIZE_64X64)) + die_codec(&codec, "Failed to set SB size"); + + u_blocks = (lf_width + lf_blocksize - 1) / lf_blocksize; + v_blocks = (lf_height + lf_blocksize - 1) / lf_blocksize; + + reference_image_num = u_blocks * v_blocks; + // Set the max gf group length so the references are guaranteed to be in + // a different gf group than any of the regular frames. This avoids using + // both vbr and constant quality mode in a single group. The number of + // references now cannot surpass 17 because of the enforced MAX_GF_INTERVAL of + // 16. If it is necessary to exceed this reference frame limit, one will have + // to do some additional handling to ensure references are in separate gf + // groups from the regular frames. 
+ if (aom_codec_control(&codec, AV1E_SET_MAX_GF_INTERVAL, + reference_image_num - 1)) + die_codec(&codec, "Failed to set max gf interval"); + aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420; + if (FORCE_HIGHBITDEPTH_DECODING) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + // Allocate memory with the border so that it can be used as a reference. + int border_in_pixels = + (codec.config.enc->rc_resize_mode || codec.config.enc->rc_superres_mode) + ? AOM_BORDER_IN_PIXELS + : AOM_ENC_NO_SCALE_BORDER; + for (i = 0; i < reference_image_num; i++) { + if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, cfg->g_w, + cfg->g_h, 32, 8, border_in_pixels)) { + die("Failed to allocate image."); + } + } + + printf("\n Second pass: "); + + // Encode reference images first. + printf("Encoding Reference Images\n"); + for (bv = 0; bv < v_blocks; ++bv) { + for (bu = 0; bu < u_blocks; ++bu) { + const int block_u_min = bu * lf_blocksize; + const int block_v_min = bv * lf_blocksize; + int block_u_end = (bu + 1) * lf_blocksize; + int block_v_end = (bv + 1) * lf_blocksize; + int u_block_size, v_block_size; + int block_ref_u, block_ref_v; + + block_u_end = block_u_end < lf_width ? block_u_end : lf_width; + block_v_end = block_v_end < lf_height ? block_v_end : lf_height; + u_block_size = block_u_end - block_u_min; + v_block_size = block_v_end - block_v_min; + block_ref_u = block_u_min + u_block_size / 2; + block_ref_v = block_v_min + v_block_size / 2; + + printf("A%d, ", (block_ref_u + block_ref_v * lf_width)); + fseek(infile, (block_ref_u + block_ref_v * lf_width) * image_size_bytes, + SEEK_SET); + aom_img_read(raw, infile); + + get_raw_image(&frame_to_encode, raw, raw_shift); + + // Reference frames may be encoded without tiles. + ++frame_count; + printf("Encoding reference image %d of %d\n", bv * u_blocks + bu, + u_blocks * v_blocks); + encode_frame(&codec, frame_to_encode, frame_count, 1, + AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY, + writer); + + if (aom_codec_control(&codec, AV1_COPY_NEW_FRAME_IMAGE, + &reference_images[frame_count - 1])) + die_codec(&codec, "Failed to copy decoder reference frame"); + } + } + + cfg->large_scale_tile = 1; + // Fixed q encoding for camera frames. + cfg->rc_end_usage = AOM_Q; + if (aom_codec_enc_config_set(&codec, cfg)) + die_codec(&codec, "Failed to configure encoder"); + + // The fixed q value used in encoding. + if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 36)) + die_codec(&codec, "Failed to set cq level"); + if (aom_codec_control(&codec, AV1E_SET_FRAME_PARALLEL_DECODING, 1)) + die_codec(&codec, "Failed to set frame parallel decoding"); + if (aom_codec_control(&codec, AV1E_SET_SINGLE_TILE_DECODING, 1)) + die_codec(&codec, "Failed to turn on single tile decoding"); + // Set tile_columns and tile_rows to MAX values, which guarantees the tile + // size of 64 x 64 pixels(i.e. 1 SB) for <= 4k resolution. 
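+  // Both controls take log2 values, so 6 requests 2^6 = 64 tile columns and
+  // 64 tile rows.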
+ if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 6)) + die_codec(&codec, "Failed to set tile width"); + if (aom_codec_control(&codec, AV1E_SET_TILE_ROWS, 6)) + die_codec(&codec, "Failed to set tile height"); + + for (bv = 0; bv < v_blocks; ++bv) { + for (bu = 0; bu < u_blocks; ++bu) { + const int block_u_min = bu * lf_blocksize; + const int block_v_min = bv * lf_blocksize; + int block_u_end = (bu + 1) * lf_blocksize; + int block_v_end = (bv + 1) * lf_blocksize; + int u, v; + block_u_end = block_u_end < lf_width ? block_u_end : lf_width; + block_v_end = block_v_end < lf_height ? block_v_end : lf_height; + for (v = block_v_min; v < block_v_end; ++v) { + for (u = block_u_min; u < block_u_end; ++u) { + av1_ref_frame_t ref; + ref.idx = 0; + ref.use_external_ref = 1; + ref.img = reference_images[bv * u_blocks + bu]; + if (aom_codec_control(&codec, AV1_SET_REFERENCE, &ref)) + die_codec(&codec, "Failed to set reference frame"); + + printf("C%d, ", (u + v * lf_width)); + fseek(infile, (u + v * lf_width) * image_size_bytes, SEEK_SET); + aom_img_read(raw, infile); + get_raw_image(&frame_to_encode, raw, raw_shift); + + ++frame_count; + printf("Encoding image %d of %d\n", + frame_count - (u_blocks * v_blocks), lf_width * lf_height); + encode_frame(&codec, frame_to_encode, frame_count, 1, + AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY, + writer); + } + } + } + } + + // Flush encoder. + // No ARF, this should not be needed. + while (encode_frame(&codec, NULL, -1, 1, 0, writer)) { + } + + for (i = 0; i < reference_image_num; i++) aom_img_free(&reference_images[i]); + + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + // Modify large_scale_file fourcc. + if (cfg->large_scale_tile == 1) + aom_video_writer_set_fourcc(writer, LST_FOURCC); + aom_video_writer_close(writer); + + printf("\nSecond pass complete. Processed %d frames.\n", frame_count); +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + int w, h; + // The number of lightfield images in the u and v dimensions. + int lf_width, lf_height; + // Defines how many images refer to the same reference image for MCP. + // lf_blocksize X lf_blocksize images will all use the reference image + // in the middle of the block of images. + int lf_blocksize; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + aom_image_t raw; + aom_image_t raw_shift; + aom_codec_err_t res; + aom_fixed_buf_t stats; + int flags = 0; + + const AvxInterface *encoder = NULL; + const int fps = 30; + const int bitrate = 200; // kbit/s + const char *const width_arg = argv[1]; + const char *const height_arg = argv[2]; + const char *const infile_arg = argv[3]; + const char *const outfile_arg = argv[4]; + const char *const lf_width_arg = argv[5]; + const char *const lf_height_arg = argv[6]; + const char *lf_blocksize_arg = argv[7]; + exec_name = argv[0]; + + if (argc < 8) die("Invalid number of arguments"); + + encoder = get_aom_encoder_by_name("av1"); + if (!encoder) die("Unsupported codec."); + + w = (int)strtol(width_arg, NULL, 0); + h = (int)strtol(height_arg, NULL, 0); + lf_width = (int)strtol(lf_width_arg, NULL, 0); + lf_height = (int)strtol(lf_height_arg, NULL, 0); + lf_blocksize = (int)strtol(lf_blocksize_arg, NULL, 0); + lf_blocksize = lf_blocksize < lf_width ? lf_blocksize : lf_width; + lf_blocksize = lf_blocksize < lf_height ? 
                     lf_blocksize : lf_height;
+
+  if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
+    die("Invalid frame size: %dx%d", w, h);
+  if (lf_width <= 0 || lf_height <= 0)
+    die("Invalid lf_width and/or lf_height: %dx%d", lf_width, lf_height);
+  if (lf_blocksize <= 0) die("Invalid lf_blocksize: %d", lf_blocksize);
+
+  if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 32)) {
+    die("Failed to allocate image.");
+  }
+  if (FORCE_HIGHBITDEPTH_DECODING) {
+    // Need to allocate larger buffer to use hbd internal.
+    aom_img_alloc(&raw_shift, AOM_IMG_FMT_I420 | AOM_IMG_FMT_HIGHBITDEPTH, w,
+                  h, 32);
+  }
+
+  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+  // Configuration
+  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res) die_codec(&codec, "Failed to get default codec config.");
+
+  cfg.g_w = w;
+  cfg.g_h = h;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = fps;
+  cfg.rc_target_bitrate = bitrate;
+  cfg.g_error_resilient = 0;  // This is required.
+  cfg.g_lag_in_frames = 0;    // need to set this since default is 19.
+  cfg.kf_mode = AOM_KF_DISABLED;
+  cfg.large_scale_tile = 0;  // Only set it to 1 for camera frame encoding.
+  cfg.g_bit_depth = AOM_BITS_8;
+  flags |= (cfg.g_bit_depth > AOM_BITS_8 || FORCE_HIGHBITDEPTH_DECODING)
+               ? AOM_CODEC_USE_HIGHBITDEPTH
+               : 0;
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading", infile_arg);
+
+  // Pass 0
+  cfg.g_pass = AOM_RC_FIRST_PASS;
+  stats = pass0(&raw, infile, encoder, &cfg, lf_width, lf_height, lf_blocksize,
+                flags, &raw_shift);
+
+  // Pass 1
+  rewind(infile);
+  cfg.g_pass = AOM_RC_LAST_PASS;
+  cfg.rc_twopass_stats_in = stats;
+  pass1(&raw, infile, outfile_arg, encoder, &cfg, lf_width, lf_height,
+        lf_blocksize, flags, &raw_shift);
+  free(stats.buf);
+
+  if (FORCE_HIGHBITDEPTH_DECODING) aom_img_free(&raw_shift);
+  aom_img_free(&raw);
+  fclose(infile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/examples/lightfield_tile_list_decoder.c b/libs/libaom/src/examples/lightfield_tile_list_decoder.c
new file mode 100644
index 000000000..3b928df2c
--- /dev/null
+++ b/libs/libaom/src/examples/lightfield_tile_list_decoder.c
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Lightfield Tile List Decoder
+// ============================
+//
+// This is a lightfield tile list decoder example. It takes an input file that
+// contains the anchor frames that are references of the coded tiles, the
+// camera frame header, and tile list OBUs that include the tile information
+// and the compressed tile data. This input file is reconstructed from the
+// encoded lightfield ivf file, and is decodable by an AV1 decoder.
+// num_references is the number of anchor frames coded at the beginning of the
+// light field file. num_tile_lists is the number of tile lists that need to
+// be decoded. There is an optional parameter allowing you to choose the
+// output format, and the supported formats are YUV1D(default), YUV, and NV12.
+// Run lightfield tile list decoder to decode an AV1 tile list file:
+// examples/lightfield_tile_list_decoder vase_tile_list.ivf vase_tile_list.yuv
+// 4 2 0(optional)
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/enums.h"
+#include "common/tools_common.h"
+#include "common/video_reader.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s <infile> <outfile> <num_references> <num_tile_lists> "
+          "<output format(optional)>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static void write_tile_yuv1d(aom_codec_ctx_t *codec, const aom_image_t *img,
+                             FILE *file) {
+  // read out the tile size.
+  unsigned int tile_size = 0;
+  if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_SIZE, &tile_size))
+    die_codec(codec, "Failed to get the tile size");
+  const unsigned int tile_width = tile_size >> 16;
+  const unsigned int tile_height = tile_size & 65535;
+  const uint32_t output_frame_width_in_tiles = img->d_w / tile_width;
+
+  unsigned int tile_count = 0;
+  if (AOM_CODEC_CONTROL_TYPECHECKED(codec, AV1D_GET_TILE_COUNT, &tile_count))
+    die_codec(codec, "Failed to get the tile count");
+
+  // Write tile to file.
+  const int shift = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
+  unsigned int tile_idx;
+
+  for (tile_idx = 0; tile_idx < tile_count; ++tile_idx) {
+    const int row_offset =
+        (tile_idx / output_frame_width_in_tiles) * tile_height;
+    const int col_offset =
+        (tile_idx % output_frame_width_in_tiles) * tile_width;
+    int plane;
+
+    for (plane = 0; plane < 3; ++plane) {
+      const unsigned char *buf = img->planes[plane];
+      const int stride = img->stride[plane];
+      const int roffset =
+          (plane > 0) ? row_offset >> img->y_chroma_shift : row_offset;
+      const int coffset =
+          (plane > 0) ? col_offset >> img->x_chroma_shift : col_offset;
+      const int w = (plane > 0)
+                        ? ((tile_width >> img->x_chroma_shift) << shift)
+                        : (tile_width << shift);
+      const int h =
+          (plane > 0) ? (tile_height >> img->y_chroma_shift) : tile_height;
+      int y;
+
+      // col offset needs to be adjusted for HBD.
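+      // Strides are in bytes while |coffset| counts samples, so for high
+      // bit depth (2 bytes per sample) the column offset is doubled via
+      // |shift|.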
+ buf += roffset * stride + (coffset << shift); + + for (y = 0; y < h; ++y) { + fwrite(buf, 1, w, file); + buf += stride; + } + } + } +} + +int main(int argc, char **argv) { + FILE *outfile = NULL; + aom_codec_ctx_t codec; + AvxVideoReader *reader = NULL; + const AvxInterface *decoder = NULL; + const AvxVideoInfo *info = NULL; + int num_references; + int num_tile_lists; + aom_image_t reference_images[MAX_EXTERNAL_REFERENCES]; + size_t frame_size = 0; + const unsigned char *frame = NULL; + int output_format = YUV1D; + int i, j, n; + + exec_name = argv[0]; + + if (argc < 5) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + num_references = (int)strtol(argv[3], NULL, 0); + num_tile_lists = (int)strtol(argv[4], NULL, 0); + + if (argc > 5) output_format = (int)strtol(argv[5], NULL, 0); + if (output_format < YUV1D || output_format > NV12) + die("Output format out of range [0, 2]"); + + info = aom_video_reader_get_info(reader); + + decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB, + info->is_annexb)) { + die("Failed to set annex b status"); + } + + // Decode anchor frames. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 0); + for (i = 0; i < num_references; ++i) { + aom_video_reader_read_frame(reader); + frame = aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + + if (i == 0) { + aom_img_fmt_t ref_fmt = 0; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_IMG_FORMAT, &ref_fmt)) + die_codec(&codec, "Failed to get the image format"); + + int frame_res[2]; + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_GET_FRAME_SIZE, frame_res)) + die_codec(&codec, "Failed to get the image frame size"); + + // Allocate memory to store decoded references. Allocate memory with the + // border so that it can be used as a reference. + for (j = 0; j < num_references; j++) { + unsigned int border = AOM_DEC_BORDER_IN_PIXELS; + if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt, + frame_res[0], frame_res[1], 32, 8, + border)) { + die("Failed to allocate references."); + } + } + } + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_COPY_NEW_FRAME_IMAGE, + &reference_images[i])) + die_codec(&codec, "Failed to copy decoded reference frame"); + + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + char name[1024]; + snprintf(name, sizeof(name), "ref_%d.yuv", i); + printf("writing ref image to %s, %d, %d\n", name, img->d_w, img->d_h); + FILE *ref_file = fopen(name, "wb"); + aom_img_write(img, ref_file); + fclose(ref_file); + } + } + + // Decode the lightfield. + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, 1); + + // Set external references. + av1_ext_ref_frame_t set_ext_ref = { &reference_images[0], num_references }; + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_EXT_REF_PTR, &set_ext_ref); + // Must decode the camera frame header first. 
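+  // The camera frame header carries the frame and tile parameters that the
+  // following tile list OBUs rely on, so it is decoded once before any tile
+  // list.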
+  aom_video_reader_read_frame(reader);
+  frame = aom_video_reader_get_frame(reader, &frame_size);
+  if (aom_codec_decode(&codec, frame, frame_size, NULL))
+    die_codec(&codec, "Failed to decode the frame.");
+  // Decode tile lists one by one.
+  for (n = 0; n < num_tile_lists; n++) {
+    aom_video_reader_read_frame(reader);
+    frame = aom_video_reader_get_frame(reader, &frame_size);
+
+    if (aom_codec_decode(&codec, frame, frame_size, NULL))
+      die_codec(&codec, "Failed to decode the tile list.");
+    aom_codec_iter_t iter = NULL;
+    aom_image_t *img = aom_codec_get_frame(&codec, &iter);
+    if (!img) die_codec(&codec, "Failed to get frame.");
+
+    if (output_format == YUV1D)
+      // write the tile to the output file in 1D format.
+      write_tile_yuv1d(&codec, img, outfile);
+    else if (output_format == YUV)
+      aom_img_write(img, outfile);
+    else
+      // NV12 output format
+      aom_img_write_nv12(img, outfile);
+  }
+
+  for (i = 0; i < num_references; i++) aom_img_free(&reference_images[i]);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+  aom_video_reader_close(reader);
+  fclose(outfile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/examples/lossless_encoder.c b/libs/libaom/src/examples/lossless_encoder.c
new file mode 100644
index 000000000..e0253d2b3
--- /dev/null
+++ b/libs/libaom/src/examples/lossless_encoder.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom/aomcx.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "lossless_encoder: Example demonstrating lossless "
+          "encoding feature. Supports raw input only.\n");
+  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img,
+                        int frame_index, int flags, AvxVideoWriter *writer) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res =
+      aom_codec_encode(codec, img, frame_index, 1, flags);
+  if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame");
+
+  while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+      if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts)) {
+        die_codec(codec, "Failed to write compressed frame");
+      }
+      printf(keyframe ?
"K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + int frame_count = 0; + aom_image_t raw; + aom_codec_err_t res; + AvxVideoInfo info; + AvxVideoWriter *writer = NULL; + const AvxInterface *encoder = NULL; + const int fps = 30; + + exec_name = argv[0]; + + // Clear explicitly, as simply assigning "{ 0 }" generates + // "missing-field-initializers" warning in some compilers. + memset(&info, 0, sizeof(info)); + + if (argc < 5) die("Invalid number of arguments"); + + encoder = get_aom_encoder_by_name("av1"); + if (!encoder) die("Unsupported codec."); + + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(argv[1], NULL, 0); + info.frame_height = (int)strtol(argv[2], NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface())); + + res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + + writer = aom_video_writer_open(argv[4], kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", argv[4]); + + if (!(infile = fopen(argv[3], "rb"))) + die("Failed to open %s for reading.", argv[3]); + + if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) + die_codec(&codec, "Failed to initialize encoder"); + + if (AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1E_SET_LOSSLESS, 1)) + die_codec(&codec, "Failed to use lossless mode"); + + // Encode frames. + while (aom_img_read(&raw, infile)) { + encode_frame(&codec, &raw, frame_count++, 0, writer); + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 0, writer)) { + } + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + aom_img_free(&raw); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/noise_model.c b/libs/libaom/src/examples/noise_model.c new file mode 100644 index 000000000..d07443f9d --- /dev/null +++ b/libs/libaom/src/examples/noise_model.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief This is an sample binary to create noise params from input video. 
+ *
+ * To allow for external denoising applications, this sample binary illustrates
+ * how to create a film grain table (film grain params as a function of time)
+ * from an input video and its corresponding denoised source.
+ *
+ * The --output-grain-table file can be passed as input to the encoder (in
+ * aomenc this is done through the "--film-grain-table" parameter).
+ *
+ * As an example, where the input source is an 854x480 yuv420p 8-bit video
+ * named "input.854_480.yuv" you would use steps similar to the following:
+ *
+ * # Run your denoiser (e.g, using hqdn3d filter):
+ * ffmpeg -vcodec rawvideo -video_size 854x480 -i input.854_480.yuv \
+ *    -vf hqdn3d=5:5:5:5 -vcodec rawvideo -an -f rawvideo \
+ *    denoised.854_480.yuv
+ *
+ * # Model the noise between the denoised version and original source:
+ * ./examples/noise_model --fps=25/1 --width=854 --height=480 --i420 \
+ *    --input-denoised=denoised.854_480.yuv --input=original.854_480.yuv \
+ *    --output-grain-table=film_grain.tbl
+ *
+ * # Encode with your favorite settings (including the grain table):
+ * aomenc --limit=100 --cpu-used=4 --input-bit-depth=8 \
+ *    --i420 -w 854 -h 480 --end-usage=q --cq-level=25 --lag-in-frames=25 \
+ *    --auto-alt-ref=2 --bit-depth=8 --film-grain-table=film_grain.tbl \
+ *    -o denoised_with_grain_params.ivf denoised.854_480.yuv
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#if CONFIG_AV1_DECODER
+#include "aom_dsp/grain_synthesis.h"
+#endif
+
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_mem/aom_mem.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
          "Usage: %s --input=<input> --input-denoised=<denoised> "
+          "--output-grain-table=<outfile> "
+          "See comments in noise_model.c for more information.\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static const arg_def_t help =
+    ARG_DEF(NULL, "help", 0, "Show usage options and exit");
+static const arg_def_t width_arg =
+    ARG_DEF("w", "width", 1, "Input width (if rawvideo)");
+static const arg_def_t height_arg =
+    ARG_DEF("h", "height", 1, "Input height (if rawvideo)");
+static const arg_def_t skip_frames_arg =
+    ARG_DEF("s", "skip-frames", 1, "Number of frames to skip (default = 1)");
+static const arg_def_t fps_arg = ARG_DEF(NULL, "fps", 1, "Frame rate");
+static const arg_def_t input_arg = ARG_DEF("-i", "input", 1, "Input filename");
+static const arg_def_t output_grain_table_arg =
+    ARG_DEF("n", "output-grain-table", 1, "Output noise file");
+static const arg_def_t input_denoised_arg =
+    ARG_DEF("d", "input-denoised", 1, "Input denoised filename (YUV) only");
+static const arg_def_t flat_block_finder_arg =
+    ARG_DEF("b", "flat-block-finder", 1, "Run the flat block finder");
+static const arg_def_t block_size_arg =
+    ARG_DEF("b", "block-size", 1, "Block size");
+static const arg_def_t bit_depth_arg =
+    ARG_DEF(NULL, "bit-depth", 1, "Bit depth of input");
+static const arg_def_t use_i420 =
+    ARG_DEF(NULL, "i420", 0, "Input file (and denoised) is I420 (default)");
+static const arg_def_t use_i422 =
+    ARG_DEF(NULL, "i422", 0, "Input file (and denoised) is I422");
+static const arg_def_t use_i444 =
+    ARG_DEF(NULL, "i444", 0, "Input file (and denoised) is I444");
+static const arg_def_t debug_file_arg =
+    ARG_DEF(NULL, "debug-file", 1, "File to output debug info");
+
+typedef struct {
+  int width;
+  int height;
+  struct
aom_rational fps; + const char *input; + const char *input_denoised; + const char *output_grain_table; + int img_fmt; + int block_size; + int bit_depth; + int run_flat_block_finder; + int force_flat_psd; + int skip_frames; + const char *debug_file; +} noise_model_args_t; + +static void parse_args(noise_model_args_t *noise_args, int *argc, char **argv) { + struct arg arg; + static const arg_def_t *main_args[] = { &help, + &input_arg, + &fps_arg, + &width_arg, + &height_arg, + &block_size_arg, + &output_grain_table_arg, + &input_denoised_arg, + &use_i420, + &use_i422, + &use_i444, + &debug_file_arg, + NULL }; + for (int argi = *argc + 1; *argv; argi++, argv++) { + if (arg_match(&arg, &help, argv)) { + fprintf(stdout, "\nOptions:\n"); + arg_show_usage(stdout, main_args); + exit(0); + } else if (arg_match(&arg, &width_arg, argv)) { + noise_args->width = atoi(arg.val); + } else if (arg_match(&arg, &height_arg, argv)) { + noise_args->height = atoi(arg.val); + } else if (arg_match(&arg, &input_arg, argv)) { + noise_args->input = arg.val; + } else if (arg_match(&arg, &input_denoised_arg, argv)) { + noise_args->input_denoised = arg.val; + } else if (arg_match(&arg, &output_grain_table_arg, argv)) { + noise_args->output_grain_table = arg.val; + } else if (arg_match(&arg, &block_size_arg, argv)) { + noise_args->block_size = atoi(arg.val); + } else if (arg_match(&arg, &bit_depth_arg, argv)) { + noise_args->bit_depth = atoi(arg.val); + } else if (arg_match(&arg, &flat_block_finder_arg, argv)) { + noise_args->run_flat_block_finder = atoi(arg.val); + } else if (arg_match(&arg, &fps_arg, argv)) { + noise_args->fps = arg_parse_rational(&arg); + } else if (arg_match(&arg, &use_i420, argv)) { + noise_args->img_fmt = AOM_IMG_FMT_I420; + } else if (arg_match(&arg, &use_i422, argv)) { + noise_args->img_fmt = AOM_IMG_FMT_I422; + } else if (arg_match(&arg, &use_i444, argv)) { + noise_args->img_fmt = AOM_IMG_FMT_I444; + } else if (arg_match(&arg, &skip_frames_arg, argv)) { + noise_args->skip_frames = atoi(arg.val); + } else if (arg_match(&arg, &debug_file_arg, argv)) { + noise_args->debug_file = arg.val; + } else { + fprintf(stdout, "Unknown arg: %s\n\nUsage:\n", *argv); + arg_show_usage(stdout, main_args); + exit(0); + } + } + if (noise_args->bit_depth > 8) { + noise_args->img_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; + } +} + +#if CONFIG_AV1_DECODER +static void print_variance_y(FILE *debug_file, aom_image_t *raw, + aom_image_t *denoised, const uint8_t *flat_blocks, + int block_size, aom_film_grain_t *grain) { + aom_image_t renoised; + grain->apply_grain = 1; + grain->random_seed = 7391; + grain->bit_depth = raw->bit_depth; + aom_img_alloc(&renoised, raw->fmt, raw->w, raw->h, 1); + + if (av1_add_film_grain(grain, denoised, &renoised)) { + fprintf(stderr, "Internal failure in av1_add_film_grain().\n"); + aom_img_free(&renoised); + return; + } + + const int num_blocks_w = (raw->w + block_size - 1) / block_size; + const int num_blocks_h = (raw->h + block_size - 1) / block_size; + fprintf(debug_file, "x = ["); + for (int by = 0; by < num_blocks_h; by++) { + for (int bx = 0; bx < num_blocks_w; bx++) { + double block_mean = 0; + double noise_std = 0, noise_mean = 0; + double renoise_std = 0, renoise_mean = 0; + for (int yi = 0; yi < block_size; ++yi) { + const int y = by * block_size + yi; + for (int xi = 0; xi < block_size; ++xi) { + const int x = bx * block_size + xi; + const double noise_v = (raw->planes[0][y * raw->stride[0] + x] - + denoised->planes[0][y * raw->stride[0] + x]); + noise_mean += noise_v; + noise_std += 
noise_v * noise_v; + + block_mean += raw->planes[0][y * raw->stride[0] + x]; + + const double renoise_v = + (renoised.planes[0][y * raw->stride[0] + x] - + denoised->planes[0][y * raw->stride[0] + x]); + renoise_mean += renoise_v; + renoise_std += renoise_v * renoise_v; + } + } + int n = (block_size * block_size); + block_mean /= n; + noise_mean /= n; + renoise_mean /= n; + noise_std = sqrt(noise_std / n - noise_mean * noise_mean); + renoise_std = sqrt(renoise_std / n - renoise_mean * renoise_mean); + fprintf(debug_file, "%d %3.2lf %3.2lf %3.2lf ", + flat_blocks[by * num_blocks_w + bx], block_mean, noise_std, + renoise_std); + } + fprintf(debug_file, "\n"); + } + fprintf(debug_file, "];\n"); + + if (raw->fmt & AOM_IMG_FMT_HIGHBITDEPTH) { + fprintf(stderr, + "Detailed debug info not supported for high bit" + "depth formats\n"); + } else { + fprintf(debug_file, "figure(2); clf;\n"); + fprintf(debug_file, + "scatter(x(:, 2:4:end), x(:, 3:4:end), 'r'); hold on;\n"); + fprintf(debug_file, "scatter(x(:, 2:4:end), x(:, 4:4:end), 'b');\n"); + fprintf(debug_file, + "plot(linspace(0, 255, length(noise_strength_0)), " + "noise_strength_0, 'b');\n"); + fprintf(debug_file, + "title('Scatter plot of intensity vs noise strength');\n"); + fprintf(debug_file, + "legend('Actual', 'Estimated', 'Estimated strength');\n"); + fprintf(debug_file, "figure(3); clf;\n"); + fprintf(debug_file, "scatter(x(:, 3:4:end), x(:, 4:4:end), 'k');\n"); + fprintf(debug_file, "title('Actual vs Estimated');\n"); + fprintf(debug_file, "pause(3);\n"); + } + aom_img_free(&renoised); +} +#endif + +static void print_debug_info(FILE *debug_file, aom_image_t *raw, + aom_image_t *denoised, uint8_t *flat_blocks, + int block_size, aom_noise_model_t *noise_model) { + (void)raw; + (void)denoised; + (void)flat_blocks; + (void)block_size; + fprintf(debug_file, "figure(3); clf;\n"); + fprintf(debug_file, "figure(2); clf;\n"); + fprintf(debug_file, "figure(1); clf;\n"); + for (int c = 0; c < 3; ++c) { + fprintf(debug_file, "noise_strength_%d = [\n", c); + const aom_equation_system_t *eqns = + &noise_model->combined_state[c].strength_solver.eqns; + for (int k = 0; k < eqns->n; ++k) { + fprintf(debug_file, "%lf ", eqns->x[k]); + } + fprintf(debug_file, "];\n"); + fprintf(debug_file, "plot(noise_strength_%d); hold on;\n", c); + } + fprintf(debug_file, "legend('Y', 'cb', 'cr');\n"); + fprintf(debug_file, "title('Noise strength function');\n"); + +#if CONFIG_AV1_DECODER + aom_film_grain_t grain; + aom_noise_model_get_grain_parameters(noise_model, &grain); + print_variance_y(debug_file, raw, denoised, flat_blocks, block_size, &grain); +#endif + fflush(debug_file); +} + +int main(int argc, char *argv[]) { + noise_model_args_t args = { 0, 0, { 25, 1 }, 0, 0, 0, AOM_IMG_FMT_I420, + 32, 8, 1, 0, 1, NULL }; + aom_image_t raw, denoised; + FILE *infile = NULL; + AvxVideoInfo info; + + memset(&info, 0, sizeof(info)); + + exec_name = argv[0]; + parse_args(&args, &argc, argv + 1); + + info.frame_width = args.width; + info.frame_height = args.height; + info.time_base.numerator = args.fps.den; + info.time_base.denominator = args.fps.num; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + if (!aom_img_alloc(&raw, args.img_fmt, info.frame_width, info.frame_height, + 1)) { + die("Failed to allocate image."); + } + if (!aom_img_alloc(&denoised, args.img_fmt, info.frame_width, + info.frame_height, 1)) { + die("Failed 
to allocate image."); + } + infile = fopen(args.input, "rb"); + if (!infile) { + die("Failed to open input file:", args.input); + } + fprintf(stderr, "Bit depth: %d stride:%d\n", args.bit_depth, raw.stride[0]); + + const int high_bd = args.bit_depth > 8; + const int block_size = args.block_size; + aom_flat_block_finder_t block_finder; + aom_flat_block_finder_init(&block_finder, block_size, args.bit_depth, + high_bd); + + const int num_blocks_w = (info.frame_width + block_size - 1) / block_size; + const int num_blocks_h = (info.frame_height + block_size - 1) / block_size; + uint8_t *flat_blocks = (uint8_t *)aom_malloc(num_blocks_w * num_blocks_h); + // Sets the random seed on the first entry in the output table + int16_t random_seed = 7391; + aom_noise_model_t noise_model; + aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, args.bit_depth, + high_bd }; + aom_noise_model_init(&noise_model, params); + + FILE *denoised_file = 0; + if (args.input_denoised) { + denoised_file = fopen(args.input_denoised, "rb"); + if (!denoised_file) + die("Unable to open input_denoised: %s", args.input_denoised); + } else { + die("--input-denoised file must be specified"); + } + FILE *debug_file = 0; + if (args.debug_file) { + debug_file = fopen(args.debug_file, "w"); + } + aom_film_grain_table_t grain_table = { 0, 0 }; + + int64_t prev_timestamp = 0; + int frame_count = 0; + while (aom_img_read(&raw, infile)) { + if (args.input_denoised) { + if (!aom_img_read(&denoised, denoised_file)) { + die("Unable to read input denoised file"); + } + } + if (frame_count % args.skip_frames == 0) { + int num_flat_blocks = num_blocks_w * num_blocks_h; + memset(flat_blocks, 1, num_flat_blocks); + if (args.run_flat_block_finder) { + memset(flat_blocks, 0, num_flat_blocks); + num_flat_blocks = aom_flat_block_finder_run( + &block_finder, raw.planes[0], info.frame_width, info.frame_height, + info.frame_width, flat_blocks); + fprintf(stdout, "Num flat blocks %d\n", num_flat_blocks); + } + + const uint8_t *planes[3] = { raw.planes[0], raw.planes[1], + raw.planes[2] }; + uint8_t *denoised_planes[3] = { denoised.planes[0], denoised.planes[1], + denoised.planes[2] }; + int strides[3] = { raw.stride[0] >> high_bd, raw.stride[1] >> high_bd, + raw.stride[2] >> high_bd }; + int chroma_sub[3] = { raw.x_chroma_shift, raw.y_chroma_shift, 0 }; + + fprintf(stdout, "Updating noise model...\n"); + aom_noise_status_t status = aom_noise_model_update( + &noise_model, (const uint8_t *const *)planes, + (const uint8_t *const *)denoised_planes, info.frame_width, + info.frame_height, strides, chroma_sub, flat_blocks, block_size); + + int64_t cur_timestamp = + frame_count * 10000000ULL * args.fps.den / args.fps.num; + if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) { + fprintf(stdout, + "Noise type is different, updating parameters for time " + "[ %" PRId64 ", %" PRId64 ")\n", + prev_timestamp, cur_timestamp); + aom_film_grain_t grain; + aom_noise_model_get_grain_parameters(&noise_model, &grain); + grain.random_seed = random_seed; + random_seed = 0; + aom_film_grain_table_append(&grain_table, prev_timestamp, cur_timestamp, + &grain); + aom_noise_model_save_latest(&noise_model); + prev_timestamp = cur_timestamp; + } + if (debug_file) { + print_debug_info(debug_file, &raw, &denoised, flat_blocks, block_size, + &noise_model); + } + fprintf(stdout, "Done noise model update, status = %d\n", status); + } + frame_count++; + } + + aom_film_grain_t grain; + aom_noise_model_get_grain_parameters(&noise_model, &grain); + grain.random_seed = 
      random_seed;
+  aom_film_grain_table_append(&grain_table, prev_timestamp, INT64_MAX, &grain);
+  if (args.output_grain_table) {
+    struct aom_internal_error_info error_info;
+    if (AOM_CODEC_OK != aom_film_grain_table_write(&grain_table,
+                                                   args.output_grain_table,
+                                                   &error_info)) {
+      die("Unable to write output film grain table");
+    }
+  }
+  aom_film_grain_table_free(&grain_table);
+
+  if (infile) fclose(infile);
+  if (denoised_file) fclose(denoised_file);
+  if (debug_file) fclose(debug_file);
+  aom_img_free(&raw);
+  aom_img_free(&denoised);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/examples/resize_util.c b/libs/libaom/src/examples/resize_util.c
new file mode 100644
index 000000000..5692c2062
--- /dev/null
+++ b/libs/libaom/src/examples/resize_util.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "av1/common/resize.h"
+#include "common/tools_common.h"
+
+static const char *exec_name = NULL;
+
+static void usage() {
+  printf("Usage:\n");
+  printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
+         exec_name);
+  printf("<output_yuv> [<frames>]\n");
+}
+
+void usage_exit(void) {
+  usage();
+  exit(EXIT_FAILURE);
+}
+
+static int parse_dim(char *v, int *width, int *height) {
+  char *x = strchr(v, 'x');
+  if (x == NULL) x = strchr(v, 'X');
+  if (x == NULL) return 0;
+  *width = atoi(v);
+  *height = atoi(&x[1]);
+  if (*width <= 0 || *height <= 0)
+    return 0;
+  else
+    return 1;
+}
+
+int main(int argc, char *argv[]) {
+  char *fin, *fout;
+  FILE *fpin, *fpout;
+  uint8_t *inbuf, *outbuf;
+  uint8_t *inbuf_u, *outbuf_u;
+  uint8_t *inbuf_v, *outbuf_v;
+  int f, frames;
+  int width, height, target_width, target_height;
+
+  exec_name = argv[0];
+
+  if (argc < 5) {
+    printf("Incorrect parameters:\n");
+    usage();
+    return 1;
+  }
+
+  fin = argv[1];
+  fout = argv[4];
+  if (!parse_dim(argv[2], &width, &height)) {
+    printf("Incorrect parameters: %s\n", argv[2]);
+    usage();
+    return 1;
+  }
+  if (!parse_dim(argv[3], &target_width, &target_height)) {
+    printf("Incorrect parameters: %s\n", argv[3]);
+    usage();
+    return 1;
+  }
+
+  fpin = fopen(fin, "rb");
+  if (fpin == NULL) {
+    printf("Can't open file %s to read\n", fin);
+    usage();
+    return 1;
+  }
+  fpout = fopen(fout, "wb");
+  if (fpout == NULL) {
+    fclose(fpin);
+    printf("Can't open file %s to write\n", fout);
+    usage();
+    return 1;
+  }
+  if (argc >= 6)
+    frames = atoi(argv[5]);
+  else
+    frames = INT_MAX;
+
+  printf("Input size: %dx%d\n", width, height);
+  printf("Target size: %dx%d, Frames: ", target_width, target_height);
+  if (frames == INT_MAX)
+    printf("All\n");
+  else
+    printf("%d\n", frames);
+
+  inbuf = (uint8_t *)malloc(width * height * 3 / 2);
+  outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2);
+  inbuf_u = inbuf + width * height;
+  inbuf_v = inbuf_u + width * height / 4;
+  outbuf_u = outbuf + target_width * target_height;
+  outbuf_v = outbuf_u + target_width * target_height / 4;
+  f = 0;
+  while (f < frames) {
+    if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) break;
+    av1_resize_frame420(inbuf, width,
inbuf_u, inbuf_v, width / 2, height, + width, outbuf, target_width, outbuf_u, outbuf_v, + target_width / 2, target_height, target_width); + fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout); + f++; + } + printf("%d frames processed\n", f); + fclose(fpin); + fclose(fpout); + + free(inbuf); + free(outbuf); + return 0; +} diff --git a/libs/libaom/src/examples/scalable_decoder.c b/libs/libaom/src/examples/scalable_decoder.c new file mode 100644 index 000000000..c22924223 --- /dev/null +++ b/libs/libaom/src/examples/scalable_decoder.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Scalable Decoder +// ============== +// +// This is an example of a scalable decoder loop. It takes a 2-spatial-layer +// input file +// containing the compressed data (in OBU format), passes it through the +// decoder, and writes the decompressed frames to disk. The base layer and +// enhancement layers are stored as separate files, out_lyr0.yuv and +// out_lyr1.yuv, respectively. +// +// Standard Includes +// ----------------- +// For decoders, you only have to include `aom_decoder.h` and then any +// header files for the specific codecs you use. In this case, we're using +// av1. +// +// Initializing The Codec +// ---------------------- +// The libaom decoder is initialized by the call to aom_codec_dec_init(). +// Determining the codec interface to use is handled by AvxVideoReader and the +// functions prefixed with aom_video_reader_. Discussion of those functions is +// beyond the scope of this example, but the main gist is to open the input file +// and parse just enough of it to determine if it's a AVx file and which AVx +// codec is contained within the file. +// Note the NULL pointer passed to aom_codec_dec_init(). We do that in this +// example because we want the algorithm to determine the stream configuration +// (width/height) and allocate memory automatically. +// +// Decoding A Frame +// ---------------- +// Once the frame has been read into memory, it is decoded using the +// `aom_codec_decode` function. The call takes a pointer to the data +// (`frame`) and the length of the data (`frame_size`). No application data +// is associated with the frame in this example, so the `user_priv` +// parameter is NULL. The `deadline` parameter is left at zero for this +// example. This parameter is generally only used when doing adaptive post +// processing. +// +// Codecs may produce a variable number of output frames for every call to +// `aom_codec_decode`. These frames are retrieved by the +// `aom_codec_get_frame` iterator function. The iterator variable `iter` is +// initialized to NULL each time `aom_codec_decode` is called. +// `aom_codec_get_frame` is called in a loop, returning a pointer to a +// decoded image or NULL to indicate the end of list. +// +// Processing The Decoded Data +// --------------------------- +// In this example, we simply write the encoded data to disk. It is +// important to honor the image's `stride` values. 
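+// Each row of a plane is `stride` bytes apart in memory, which may be wider
+// than the visible image, so a plane is written row by row, e.g.:
+//   for (y = 0; y < h; ++y)
+//     fwrite(buf + y * img->stride[plane], 1, w, file);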
+// +// Cleanup +// ------- +// The `aom_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exceptions, aom_codec functions return an enumerated error status, +// with the value `0` indicating success. + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "common/obudec.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +static const char *exec_name; + +#define MAX_LAYERS 5 + +void usage_exit(void) { + fprintf(stderr, "Usage: %s <infile>\n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile[MAX_LAYERS]; + char filename[80]; + aom_codec_ctx_t codec; + const AvxInterface *decoder = NULL; + FILE *inputfile = NULL; + uint8_t *buf = NULL; + size_t bytes_in_buffer = 0; + size_t buffer_size = 0; + struct AvxInputContext aom_input_ctx; + struct ObuDecInputContext obu_ctx = { &aom_input_ctx, NULL, 0, 0, 0 }; + aom_codec_stream_info_t si; + uint8_t tmpbuf[32]; + unsigned int i; + + exec_name = argv[0]; + + if (argc != 2) die("Invalid number of arguments."); + + if (!(inputfile = fopen(argv[1], "rb"))) + die("Failed to open %s for read.", argv[1]); + obu_ctx.avx_ctx->file = inputfile; + obu_ctx.avx_ctx->filename = argv[1]; + + decoder = get_aom_decoder_by_index(0); + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + if (aom_codec_control(&codec, AV1D_SET_OUTPUT_ALL_LAYERS, 1)) { + die_codec(&codec, "Failed to set output_all_layers control."); + } + + // peek at the sequence header OBU to get the number of spatial layers + const size_t ret = fread(tmpbuf, 1, 32, inputfile); + if (ret != 32) die_codec(&codec, "Input is not a valid obu file"); + si.is_annexb = 0; + if (aom_codec_peek_stream_info(decoder->codec_interface(), tmpbuf, 32, &si)) { + die_codec(&codec, "Input is not a valid obu file"); + } + fseek(inputfile, -32, SEEK_CUR); + + if (!file_is_obu(&obu_ctx)) + die_codec(&codec, "Input is not a valid obu file"); + + // open base layer output yuv file + snprintf(filename, sizeof(filename), "out_lyr%d.yuv", 0); + if (!(outfile[0] = fopen(filename, "wb"))) + die("Failed to open output for writing."); + + // open any enhancement layer output yuv files + for (i = 1; i < si.number_spatial_layers; i++) { + snprintf(filename, sizeof(filename), "out_lyr%d.yuv", i); + if (!(outfile[i] = fopen(filename, "wb"))) + die("Failed to open output for writing."); + } + + while (!obudec_read_temporal_unit(&obu_ctx, &buf, &bytes_in_buffer, + &buffer_size)) { + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + if (aom_codec_decode(&codec, buf, bytes_in_buffer, NULL)) + die_codec(&codec, "Failed to decode frame."); + + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + aom_image_t *img_shifted = + aom_img_alloc(NULL, AOM_IMG_FMT_I420, img->d_w, img->d_h, 16); + img_shifted->bit_depth = 8; + aom_img_downshift(img_shifted, img, + img->bit_depth - img_shifted->bit_depth); + if (img->spatial_id == 0) { + printf("Writing base layer 0 %d\n", frame_cnt); + aom_img_write(img_shifted, outfile[0]); + } else if (img->spatial_id <= (int)(si.number_spatial_layers - 1)) { + printf("Writing enhancement layer %d %d\n", img->spatial_id, frame_cnt); +
aom_img_write(img_shifted, outfile[img->spatial_id]); + } else { + die_codec(&codec, "Invalid bitstream. Layer id exceeds layer count"); + } + if (img->spatial_id == (int)(si.number_spatial_layers - 1)) ++frame_cnt; + aom_img_free(img_shifted); // free the per-frame downshift copy + } + } + + printf("Processed %d frames.\n", frame_cnt); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + + for (i = 0; i < si.number_spatial_layers; i++) fclose(outfile[i]); + + fclose(inputfile); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/scalable_encoder.c b/libs/libaom/src/examples/scalable_encoder.c new file mode 100644 index 000000000..7af03e29f --- /dev/null +++ b/libs/libaom/src/examples/scalable_encoder.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Scalable Encoder +// ============== +// +// This is an example of a scalable encoder loop. It takes two input files in +// YV12 format, passes them through the encoder, and writes the compressed +// frames to disk in OBU format. +// +// Getting The Default Configuration +// --------------------------------- +// Encoders have the notion of "usage profiles." For example, an encoder +// may want to publish default configurations for both a video +// conferencing application and a best quality offline encoder. These +// obviously have very different default settings. Consult the +// documentation for your codec to see if it provides any default +// configurations. All codecs provide a default configuration, number 0, +// which is valid for material in the vicinity of QCIF/QVGA. +// +// Updating The Configuration +// --------------------------------- +// Almost all applications will want to update the default configuration +// with settings specific to their usage. Here we set the width and height +// of the video file to that specified on the command line. We also scale +// the default bitrate based on the ratio between the default resolution +// and the resolution specified on the command line. +// +// Encoding A Frame +// ---------------- +// The frame is read as a continuous block (size = width * height * 3 / 2) +// from the input file. If a frame was read (the input file has not hit +// EOF) then the frame is passed to the encoder. Otherwise, a NULL +// is passed, indicating the End-Of-Stream condition to the encoder. The +// `frame_cnt` is reused as the presentation time stamp (PTS) and each +// frame is shown for one frame-time in duration. The flags parameter is +// unused in this example. + +// Forced Keyframes +// ---------------- +// Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the +// flags passed to `aom_codec_encode()`. In this example, we force a +// keyframe every <keyframe-interval> frames. Note, the output stream can +// contain additional keyframes beyond those that have been forced using the +// AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the +// encoder. +// +// Processing The Encoded Data +// --------------------------- +// Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data +// for this frame.
We simply write the raw compressed data to disk (this example emits OBUs rather than an IVF container). +// +// Cleanup +// ------- +// The `aom_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exceptions, aom_codec functions return an enumerated error status, +// with the value `0` indicating success. + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "av1/common/enums.h" +#include "common/tools_common.h" +#include "common/video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s <codec> <width> <height> <infile0> <infile1> " + "<outfile> <frames to encode>\n" + "See comments in scalable_encoder.c for more information.\n", + exec_name); + exit(EXIT_FAILURE); +} + +static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, + int frame_index, int flags, FILE *outfile) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + const aom_codec_err_t res = + aom_codec_encode(codec, img, frame_index, 1, flags); + if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + if (fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile) != + pkt->data.frame.sz) { + die_codec(codec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + printf(" %6d\n", (int)pkt->data.frame.sz); + fflush(stdout); + } + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile0 = NULL; + FILE *infile1 = NULL; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + int frame_count = 0; + aom_image_t raw0, raw1; + aom_codec_err_t res; + AvxVideoInfo info; + const AvxInterface *encoder = NULL; + const int fps = 30; + const int bitrate = 200; + int keyframe_interval = 0; + int max_frames = 0; + int frames_encoded = 0; + const char *codec_arg = NULL; + const char *width_arg = NULL; + const char *height_arg = NULL; + const char *infile0_arg = NULL; + const char *infile1_arg = NULL; + const char *outfile_arg = NULL; + // const char *keyframe_interval_arg = NULL; + FILE *outfile = NULL; + + exec_name = argv[0]; + + // Clear explicitly, as simply assigning "{ 0 }" generates + // "missing-field-initializers" warning in some compilers.
+ memset(&info, 0, sizeof(info)); + + if (argc != 8) die("Invalid number of arguments"); + + codec_arg = argv[1]; + width_arg = argv[2]; + height_arg = argv[3]; + infile0_arg = argv[4]; + infile1_arg = argv[5]; + outfile_arg = argv[6]; + max_frames = (int)strtol(argv[7], NULL, 0); + + encoder = get_aom_encoder_by_name(codec_arg); + if (!encoder) die("Unsupported codec."); + + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(width_arg, NULL, 0); + info.frame_height = (int)strtol(height_arg, NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!aom_img_alloc(&raw0, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image for layer 0."); + } + if (!aom_img_alloc(&raw1, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image for layer 1."); + } + + // keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0); + keyframe_interval = 100; + if (keyframe_interval < 0) die("Invalid keyframe interval value."); + + printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface())); + + res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = bitrate; + cfg.g_error_resilient = 0; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = AOM_Q; + cfg.save_as_annexb = 0; + + outfile = fopen(outfile_arg, "wb"); + if (!outfile) die("Failed to open %s for writing.", outfile_arg); + + if (!(infile0 = fopen(infile0_arg, "rb"))) + die("Failed to open %s for reading.", infile0_arg); + if (!(infile1 = fopen(infile1_arg, "rb"))) + die("Failed to open %s for reading.", infile1_arg); + + if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) + die_codec(&codec, "Failed to initialize encoder"); + if (aom_codec_control(&codec, AOME_SET_CPUUSED, 8)) + die_codec(&codec, "Failed to set cpu to 8"); + + if (aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS, 2)) + die_codec(&codec, "Failed to set tile columns to 2"); + if (aom_codec_control(&codec, AV1E_SET_NUM_TG, 3)) + die_codec(&codec, "Failed to set num of tile groups to 3"); + + if (aom_codec_control(&codec, AOME_SET_NUMBER_SPATIAL_LAYERS, 2)) + die_codec(&codec, "Failed to set number of spatial layers to 2"); + + // Encode frames.
+ while (aom_img_read(&raw0, infile0)) { + int flags = 0; + + // configure and encode base layer + + if (keyframe_interval > 0 && frames_encoded % keyframe_interval == 0) + flags |= AOM_EFLAG_FORCE_KF; + else + // use previous base layer (LAST) as sole reference + // save this frame as LAST to be used as reference by enhancement layer + // and next base layer + flags |= AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | + AOM_EFLAG_NO_UPD_ENTROPY; + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + if (aom_codec_enc_config_set(&codec, &cfg)) + die_codec(&codec, "Failed to set enc cfg for layer 0"); + if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 0)) + die_codec(&codec, "Failed to set layer id to 0"); + if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 62)) + die_codec(&codec, "Failed to set cq level"); + encode_frame(&codec, &raw0, frame_count++, flags, outfile); + + // configure and encode enhancement layer + + // use LAST (base layer) as sole reference + flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2 | AOM_EFLAG_NO_UPD_LAST | + AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | + AOM_EFLAG_NO_UPD_ENTROPY; + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + aom_img_read(&raw1, infile1); + if (aom_codec_enc_config_set(&codec, &cfg)) + die_codec(&codec, "Failed to set enc cfg for layer 1"); + if (aom_codec_control(&codec, AOME_SET_SPATIAL_LAYER_ID, 1)) + die_codec(&codec, "Failed to set layer id to 1"); + if (aom_codec_control(&codec, AOME_SET_CQ_LEVEL, 10)) + die_codec(&codec, "Failed to set cq level"); + encode_frame(&codec, &raw1, frame_count++, flags, outfile); + + frames_encoded++; + + if (max_frames > 0 && frames_encoded >= max_frames) break; + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 0, outfile)) continue; + + printf("\n"); + fclose(infile0); + fclose(infile1); + printf("Processed %d frames.\n", frame_count / 2); + + aom_img_free(&raw0); + aom_img_free(&raw1); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/set_maps.c b/libs/libaom/src/examples/set_maps.c new file mode 100644 index 000000000..9aeb96e43 --- /dev/null +++ b/libs/libaom/src/examples/set_maps.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// AOM Set Active and ROI Maps +// =========================== +// +// This is an example demonstrating how to control the AOM encoder's +// ROI and Active maps. +// +// ROI (Region of Interest) maps are a way for the application to assign +// each macroblock in the image to a region, and then set quantizer and +// filtering parameters for that region.
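// Both kinds of map are indexed in raster order at macroblock (16x16)
// granularity, so a WxH frame needs ((W + 15) / 16) * ((H + 15) / 16)
// entries. A minimal sketch of that geometry, using the active-map control
// this example exercises below and a hypothetical helper that keeps only a
// centered rectangle active:
static void set_centered_active_rect(const aom_codec_enc_cfg_t *cfg,
                                     aom_codec_ctx_t *codec) {
  aom_active_map_t map = { 0, 0, 0 };
  map.rows = (cfg->g_h + 15) / 16;  // map height in macroblocks
  map.cols = (cfg->g_w + 15) / 16;  // map width in macroblocks
  map.active_map = (uint8_t *)calloc(map.rows * map.cols, 1);
  if (!map.active_map) die("Failed to allocate active map");
  // Mark the middle half of the frame active; everything else is skipped.
  for (unsigned int r = map.rows / 4; r < 3 * map.rows / 4; ++r)
    for (unsigned int c = map.cols / 4; c < 3 * map.cols / 4; ++c)
      map.active_map[r * map.cols + c] = 1;
  if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map))
    die_codec(codec, "Failed to set active map");
  free(map.active_map);
}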
+// +// Active maps are a way for the application to specify on a +// macroblock-by-macroblock basis whether there is any activity in that +// macroblock. +// +// +// Configuration +// ------------- +// This example only exercises the active map (no ROI map is actually set). +// An active map marking every other macroblock active is set on frame 5. If +// the width of the image in macroblocks is even, then the output will appear +// to have distinct columns, where one column will have motion and the next +// will not. +// +// The active map is cleared on frame 11. +// +// Observing The Effects +// --------------------- +// Use the `simple_decoder` example to decode this sample, and observe +// the change in the image at frames 5 and 11. + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "common/tools_common.h" +#include "common/video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile>\n", + exec_name); + exit(EXIT_FAILURE); +} + +static void set_active_map(const aom_codec_enc_cfg_t *cfg, + aom_codec_ctx_t *codec) { + unsigned int i; + aom_active_map_t map = { 0, 0, 0 }; + + map.rows = (cfg->g_h + 15) / 16; + map.cols = (cfg->g_w + 15) / 16; + + map.active_map = (uint8_t *)malloc(map.rows * map.cols); + for (i = 0; i < map.rows * map.cols; ++i) map.active_map[i] = i % 2; + + if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map)) + die_codec(codec, "Failed to set active map"); + + free(map.active_map); +} + +static void unset_active_map(const aom_codec_enc_cfg_t *cfg, + aom_codec_ctx_t *codec) { + aom_active_map_t map = { 0, 0, 0 }; + + map.rows = (cfg->g_h + 15) / 16; + map.cols = (cfg->g_w + 15) / 16; + map.active_map = NULL; + + if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map)) + die_codec(codec, "Failed to set active map"); +} + +static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, + int frame_index, AvxVideoWriter *writer) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + const aom_codec_err_t res = aom_codec_encode(codec, img, frame_index, 1, 0); + if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(codec, "Failed to write compressed frame"); + } + + printf(keyframe ?
"K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +int main(int argc, char **argv) { + FILE *infile = NULL; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + int frame_count = 0; + const int limit = 15; + aom_image_t raw; + aom_codec_err_t res; + AvxVideoInfo info; + AvxVideoWriter *writer = NULL; + const AvxInterface *encoder = NULL; + const int fps = 2; // TODO(dkovalev) add command line argument + const double bits_per_pixel_per_frame = 0.067; + + exec_name = argv[0]; + if (argc != 6) die("Invalid number of arguments"); + + memset(&info, 0, sizeof(info)); + + encoder = get_aom_encoder_by_name(argv[1]); + if (encoder == NULL) { + die("Unsupported codec."); + } + assert(encoder != NULL); + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(argv[2], NULL, 0); + info.frame_height = (int)strtol(argv[3], NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface())); + + res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = + (unsigned int)(bits_per_pixel_per_frame * cfg.g_w * cfg.g_h * fps / 1000); + cfg.g_lag_in_frames = 0; + + writer = aom_video_writer_open(argv[5], kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", argv[5]); + + if (!(infile = fopen(argv[4], "rb"))) + die("Failed to open %s for reading.", argv[4]); + + if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) + die_codec(&codec, "Failed to initialize encoder"); + + // Encode frames. + while (aom_img_read(&raw, infile) && frame_count < limit) { + ++frame_count; + + if (frame_count == 5) { + set_active_map(&cfg, &codec); + } else if (frame_count == 11) { + unset_active_map(&cfg, &codec); + } + + encode_frame(&codec, &raw, frame_count, writer); + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, writer)) { + } + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + aom_img_free(&raw); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/simple_decoder.c b/libs/libaom/src/examples/simple_decoder.c new file mode 100644 index 000000000..d098d1e0b --- /dev/null +++ b/libs/libaom/src/examples/simple_decoder.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +// Simple Decoder +// ============== +// +// This is an example of a simple decoder loop. It takes an input file +// containing the compressed data (in IVF format), passes it through the +// decoder, and writes the decompressed frames to disk. Other decoder +// examples build upon this one. +// +// The details of the IVF format have been elided from this example for +// simplicity of presentation, as IVF files will not generally be used by +// your application. In general, an IVF file consists of a file header, +// followed by a variable number of frames. Each frame consists of a frame +// header followed by a variable length payload. The length of the payload +// is specified in the first four bytes of the frame header. The payload is +// the raw compressed data. +// +// Standard Includes +// ----------------- +// For decoders, you only have to include `aom_decoder.h` and then any +// header files for the specific codecs you use. In this case, we're using +// aom. +// +// Initializing The Codec +// ---------------------- +// The libaom decoder is initialized by the call to aom_codec_dec_init(). +// Determining the codec interface to use is handled by AvxVideoReader and the +// functions prefixed with aom_video_reader_. Discussion of those functions is +// beyond the scope of this example, but the main gist is to open the input file +// and parse just enough of it to determine if it's an AVx file and which AVx +// codec is contained within the file. +// Note the NULL pointer passed to aom_codec_dec_init(). We do that in this +// example because we want the algorithm to determine the stream configuration +// (width/height) and allocate memory automatically. +// +// Decoding A Frame +// ---------------- +// Once the frame has been read into memory, it is decoded using the +// `aom_codec_decode` function. The call takes a pointer to the data +// (`frame`) and the length of the data (`frame_size`). No application data +// is associated with the frame in this example, so the `user_priv` +// parameter is NULL. +// +// Codecs may produce a variable number of output frames for every call to +// `aom_codec_decode`. These frames are retrieved by the +// `aom_codec_get_frame` iterator function. The iterator variable `iter` is +// initialized to NULL each time `aom_codec_decode` is called. +// `aom_codec_get_frame` is called in a loop, returning a pointer to a +// decoded image or NULL to indicate the end of the list. +// +// Processing The Decoded Data +// --------------------------- +// In this example, we simply write the decoded data to disk. It is +// important to honor the image's `stride` values. +// +// Cleanup +// ------- +// The `aom_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exceptions, aom_codec functions return an enumerated error status, +// with the value `0` indicating success.
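// The "descriptive message" mentioned above comes from the codec context
// itself. A minimal sketch of what a die_codec()-style reporter can look
// like, using only public aom/aom_codec.h calls (common/tools_common.c
// implements the real helper):
static void report_codec_error(aom_codec_ctx_t *ctx, const char *msg) {
  // aom_codec_error() returns a short string for the last failure;
  // aom_codec_error_detail() adds more context when the codec provides it.
  const char *detail = aom_codec_error_detail(ctx);
  fprintf(stderr, "%s: %s\n", msg, aom_codec_error(ctx));
  if (detail) fprintf(stderr, "    %s\n", detail);
}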
+ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_decoder.h" +#include "common/tools_common.h" +#include "common/video_reader.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, "Usage: %s <infile> <outfile>\n", exec_name); + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) { + int frame_cnt = 0; + FILE *outfile = NULL; + aom_codec_ctx_t codec; + AvxVideoReader *reader = NULL; + const AvxInterface *decoder = NULL; + const AvxVideoInfo *info = NULL; + + exec_name = argv[0]; + + if (argc != 3) die("Invalid number of arguments."); + + reader = aom_video_reader_open(argv[1]); + if (!reader) die("Failed to open %s for reading.", argv[1]); + + if (!(outfile = fopen(argv[2], "wb"))) + die("Failed to open %s for writing.", argv[2]); + + info = aom_video_reader_get_info(reader); + + decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); + if (!decoder) die("Unknown input codec."); + + printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); + + if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) + die_codec(&codec, "Failed to initialize decoder."); + + while (aom_video_reader_read_frame(reader)) { + aom_codec_iter_t iter = NULL; + aom_image_t *img = NULL; + size_t frame_size = 0; + const unsigned char *frame = + aom_video_reader_get_frame(reader, &frame_size); + if (aom_codec_decode(&codec, frame, frame_size, NULL)) + die_codec(&codec, "Failed to decode frame."); + + while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) { + aom_img_write(img, outfile); + ++frame_cnt; + } + } + + printf("Processed %d frames.\n", frame_cnt); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + + printf("Play: ffplay -f rawvideo -pix_fmt yuv420p -s %dx%d %s\n", + info->frame_width, info->frame_height, argv[2]); + + aom_video_reader_close(reader); + + fclose(outfile); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/simple_encoder.c b/libs/libaom/src/examples/simple_encoder.c new file mode 100644 index 000000000..01a37cf0c --- /dev/null +++ b/libs/libaom/src/examples/simple_encoder.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Simple Encoder +// ============== +// +// This is an example of a simple encoder loop. It takes an input file in +// YV12 format, passes it through the encoder, and writes the compressed +// frames to disk in IVF format. Other encoder examples build upon this +// one. +// +// The details of the IVF format have been elided from this example for +// simplicity of presentation, as IVF files will not generally be used by +// your application. In general, an IVF file consists of a file header, +// followed by a variable number of frames. Each frame consists of a frame +// header followed by a variable length payload. The length of the payload +// is specified in the first four bytes of the frame header. The payload is +// the raw compressed data.
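// Concretely, the per-frame IVF header written by the AvxVideoWriter used
// below is 12 bytes: a 32-bit little-endian payload size followed by a
// 64-bit little-endian presentation timestamp. A minimal sketch of emitting
// one by hand (a hypothetical helper, not used by this example):
static void write_ivf_frame_header(FILE *file, size_t frame_size,
                                   int64_t pts) {
  unsigned char header[12];
  for (int i = 0; i < 4; ++i)  // 4-byte little-endian payload length
    header[i] = (unsigned char)(frame_size >> (8 * i));
  for (int i = 0; i < 8; ++i)  // 8-byte little-endian PTS
    header[4 + i] = (unsigned char)((uint64_t)pts >> (8 * i));
  fwrite(header, 1, 12, file);
}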
+// +// Standard Includes +// ----------------- +// For encoders, you only have to include `aom_encoder.h` and then any +// header files for the specific codecs you use. In this case, we're using +// aom. +// +// Getting The Default Configuration +// --------------------------------- +// Encoders have the notion of "usage profiles." For example, an encoder +// may want to publish default configurations for both a video +// conferencing application and a best quality offline encoder. These +// obviously have very different default settings. Consult the +// documentation for your codec to see if it provides any default +// configurations. All codecs provide a default configuration, number 0, +// which is valid for material in the vicinity of QCIF/QVGA. +// +// Updating The Configuration +// --------------------------------- +// Almost all applications will want to update the default configuration +// with settings specific to their usage. Here we set the width and height +// of the video file to that specified on the command line. We also scale +// the default bitrate based on the ratio between the default resolution +// and the resolution specified on the command line. +// +// Initializing The Codec +// ---------------------- +// The encoder is initialized by the following code. +// +// Encoding A Frame +// ---------------- +// The frame is read as a continuous block (size width * height * 3 / 2) +// from the input file. If a frame was read (the input file has not hit +// EOF) then the frame is passed to the encoder. Otherwise, a NULL +// is passed, indicating the End-Of-Stream condition to the encoder. The +// `frame_cnt` is reused as the presentation time stamp (PTS) and each +// frame is shown for one frame-time in duration. The flags parameter is +// unused in this example. + +// Forced Keyframes +// ---------------- +// Keyframes can be forced by setting the AOM_EFLAG_FORCE_KF bit of the +// flags passed to `aom_codec_encode()`. In this example, we force a +// keyframe every <keyframe-interval> frames. Note, the output stream can +// contain additional keyframes beyond those that have been forced using the +// AOM_EFLAG_FORCE_KF flag because of automatic keyframe placement by the +// encoder. +// +// Processing The Encoded Data +// --------------------------- +// Each packet of type `AOM_CODEC_CX_FRAME_PKT` contains the encoded data +// for this frame. We write an IVF frame header, followed by the raw data. +// +// Cleanup +// ------- +// The `aom_codec_destroy` call frees any memory allocated by the codec. +// +// Error Handling +// -------------- +// This example does not special case any error return codes. If there was +// an error, a descriptive message is printed and the program exits. With +// few exceptions, aom_codec functions return an enumerated error status, +// with the value `0` indicating success. +// +// Error Resiliency Features +// ------------------------- +// Error resiliency is controlled by the g_error_resilient member of the +// configuration structure. Use the `decode_with_drops` example to decode with +// frames 5-10 dropped. Compare the output for a file encoded with this example +// versus one encoded with the `simple_encoder` example.
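// The bitrate scaling described under "Updating The Configuration" is a
// small calculation once the defaults are in hand. A minimal sketch,
// assuming w and h hold the command-line dimensions (the code below uses a
// fixed 200 kbps target instead):
static void scale_default_bitrate(aom_codec_enc_cfg_t *cfg, unsigned int w,
                                  unsigned int h) {
  // Scale rc_target_bitrate by the pixel-count ratio before overwriting the
  // default g_w/g_h that the ratio is computed against.
  cfg->rc_target_bitrate = (unsigned int)(
      cfg->rc_target_bitrate * ((double)w * h) / ((double)cfg->g_w * cfg->g_h));
  cfg->g_w = w;
  cfg->g_h = h;
}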
+ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_encoder.h" +#include "common/tools_common.h" +#include "common/video_writer.h" + +static const char *exec_name; + +void usage_exit(void) { + fprintf(stderr, + "Usage: %s <codec> <width> <height> <infile> <outfile> " + "<keyframe-interval> <error-resilient> <frames to encode>\n" + "See comments in simple_encoder.c for more information.\n", + exec_name); + exit(EXIT_FAILURE); +} + +static int encode_frame(aom_codec_ctx_t *codec, aom_image_t *img, + int frame_index, int flags, AvxVideoWriter *writer) { + int got_pkts = 0; + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt = NULL; + const aom_codec_err_t res = + aom_codec_encode(codec, img, frame_index, 1, flags); + if (res != AOM_CODEC_OK) die_codec(codec, "Failed to encode frame"); + + while ((pkt = aom_codec_get_cx_data(codec, &iter)) != NULL) { + got_pkts = 1; + + if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) { + const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0; + if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf, + pkt->data.frame.sz, + pkt->data.frame.pts)) { + die_codec(codec, "Failed to write compressed frame"); + } + printf(keyframe ? "K" : "."); + fflush(stdout); + } + } + + return got_pkts; +} + +// TODO(tomfinegan): Improve command line parsing and add args for bitrate/fps. +int main(int argc, char **argv) { + FILE *infile = NULL; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + int frame_count = 0; + aom_image_t raw; + aom_codec_err_t res; + AvxVideoInfo info; + AvxVideoWriter *writer = NULL; + const AvxInterface *encoder = NULL; + const int fps = 30; + const int bitrate = 200; + int keyframe_interval = 0; + int max_frames = 0; + int frames_encoded = 0; + const char *codec_arg = NULL; + const char *width_arg = NULL; + const char *height_arg = NULL; + const char *infile_arg = NULL; + const char *outfile_arg = NULL; + const char *keyframe_interval_arg = NULL; + + exec_name = argv[0]; + + // Clear explicitly, as simply assigning "{ 0 }" generates + // "missing-field-initializers" warning in some compilers.
+ memset(&info, 0, sizeof(info)); + + if (argc != 9) die("Invalid number of arguments"); + + codec_arg = argv[1]; + width_arg = argv[2]; + height_arg = argv[3]; + infile_arg = argv[4]; + outfile_arg = argv[5]; + keyframe_interval_arg = argv[6]; + max_frames = (int)strtol(argv[8], NULL, 0); + + encoder = get_aom_encoder_by_name(codec_arg); + if (!encoder) die("Unsupported codec."); + + info.codec_fourcc = encoder->fourcc; + info.frame_width = (int)strtol(width_arg, NULL, 0); + info.frame_height = (int)strtol(height_arg, NULL, 0); + info.time_base.numerator = 1; + info.time_base.denominator = fps; + + if (info.frame_width <= 0 || info.frame_height <= 0 || + (info.frame_width % 2) != 0 || (info.frame_height % 2) != 0) { + die("Invalid frame size: %dx%d", info.frame_width, info.frame_height); + } + + if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, info.frame_width, + info.frame_height, 1)) { + die("Failed to allocate image."); + } + + keyframe_interval = (int)strtol(keyframe_interval_arg, NULL, 0); + if (keyframe_interval < 0) die("Invalid keyframe interval value."); + + printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface())); + + res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0); + if (res) die_codec(&codec, "Failed to get default codec config."); + + cfg.g_w = info.frame_width; + cfg.g_h = info.frame_height; + cfg.g_timebase.num = info.time_base.numerator; + cfg.g_timebase.den = info.time_base.denominator; + cfg.rc_target_bitrate = bitrate; + cfg.g_error_resilient = (aom_codec_er_flags_t)strtoul(argv[7], NULL, 0); + + writer = aom_video_writer_open(outfile_arg, kContainerIVF, &info); + if (!writer) die("Failed to open %s for writing.", outfile_arg); + + if (!(infile = fopen(infile_arg, "rb"))) + die("Failed to open %s for reading.", infile_arg); + + if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0)) + die_codec(&codec, "Failed to initialize encoder"); + + // Encode frames. + while (aom_img_read(&raw, infile)) { + int flags = 0; + if (keyframe_interval > 0 && frame_count % keyframe_interval == 0) + flags |= AOM_EFLAG_FORCE_KF; + encode_frame(&codec, &raw, frame_count++, flags, writer); + frames_encoded++; + if (max_frames > 0 && frames_encoded >= max_frames) break; + } + + // Flush encoder. + while (encode_frame(&codec, NULL, -1, 0, writer)) continue; + + printf("\n"); + fclose(infile); + printf("Processed %d frames.\n", frame_count); + + aom_img_free(&raw); + if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + aom_video_writer_close(writer); + + return EXIT_SUCCESS; +} diff --git a/libs/libaom/src/examples/svc_encoder_rtc.c b/libs/libaom/src/examples/svc_encoder_rtc.c new file mode 100644 index 000000000..1316c6c1e --- /dev/null +++ b/libs/libaom/src/examples/svc_encoder_rtc.c @@ -0,0 +1,907 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// This is an example demonstrating how to implement a multi-layer AOM +// encoding scheme for RTC video applications. 
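// A warm-up for set_layer_pattern() below: in the simplest 2-temporal-layer
// pattern, even superframes are base layer (TL0) and refresh LAST, odd
// superframes are the enhancement layer (TL1) and refresh nothing, so any
// TL1 frame can be dropped to halve the frame rate without breaking decode.
// A minimal sketch of just that decision, using the same control structs
// configured later in this file:
static void two_layer_pattern(int superframe_cnt, aom_svc_layer_id_t *layer_id,
                              aom_svc_ref_frame_config_t *ref_frame_config) {
  layer_id->temporal_layer_id = (superframe_cnt % 2) ? 1 : 0;
  // TL0 refreshes reference slot 0 (LAST); TL1 leaves every slot untouched.
  ref_frame_config->refresh[0] = (layer_id->temporal_layer_id == 0);
}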
+ +#include <assert.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "aom/aom_encoder.h" +#include "aom/aomcx.h" +#include "av1/common/enums.h" +#include "common/tools_common.h" +#include "common/video_writer.h" +#include "aom_ports/aom_timer.h" + +#define zero(Dest) memset(&(Dest), 0, sizeof(Dest)); + +static const char *exec_name; + +void usage_exit(void) { exit(EXIT_FAILURE); } + +static int mode_to_num_temporal_layers[10] = { 1, 2, 3, 3, 2, 1, 1, 3, 3, 3 }; +static int mode_to_num_spatial_layers[10] = { 1, 1, 1, 1, 1, 2, 3, 3, 3, 3 }; +static int mode_to_num_layers[10] = { 1, 2, 3, 3, 2, 2, 3, 9, 9, 9 }; + +// For rate control encoding stats. +struct RateControlMetrics { + // Number of input frames per layer. + int layer_input_frames[AOM_MAX_TS_LAYERS]; + // Number of encoded non-key frames per layer. + int layer_enc_frames[AOM_MAX_TS_LAYERS]; + // Framerate per layer (cumulative). + double layer_framerate[AOM_MAX_TS_LAYERS]; + // Target average frame size per layer (per-frame-bandwidth per layer). + double layer_pfb[AOM_MAX_LAYERS]; + // Actual average frame size per layer. + double layer_avg_frame_size[AOM_MAX_LAYERS]; + // Average rate mismatch per layer (|target - actual| / target). + double layer_avg_rate_mismatch[AOM_MAX_LAYERS]; + // Actual encoding bitrate per layer (cumulative across temporal layers). + double layer_encoding_bitrate[AOM_MAX_LAYERS]; + // Average of the short-time encoder actual bitrate. + // TODO(marpan): Should we add these short-time stats for each layer? + double avg_st_encoding_bitrate; + // Variance of the short-time encoder actual bitrate. + double variance_st_encoding_bitrate; + // Window (number of frames) for computing short-time encoding bitrate. + int window_size; + // Number of window measurements. + int window_count; + int layer_target_bitrate[AOM_MAX_LAYERS]; +}; + +static int read_frame(struct AvxInputContext *input_ctx, aom_image_t *img) { + FILE *f = input_ctx->file; + y4m_input *y4m = &input_ctx->y4m; + int shortread = 0; + + if (input_ctx->file_type == FILE_TYPE_Y4M) { + if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; + } else { + shortread = read_yuv_frame(input_ctx, img); + } + + return !shortread; +} + +static int file_is_y4m(const char detect[4]) { + if (memcmp(detect, "YUV4", 4) == 0) { + return 1; + } + return 0; +} + +static int fourcc_is_ivf(const char detect[4]) { + if (memcmp(detect, "DKIF", 4) == 0) { + return 1; + } + return 0; +} + +static void close_input_file(struct AvxInputContext *input) { + fclose(input->file); + if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); +} + +static void open_input_file(struct AvxInputContext *input, + aom_chroma_sample_position_t csp) { + /* Parse certain options from the input file, if possible */ + input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") + : set_binary_mode(stdin); + + if (!input->file) fatal("Failed to open input file"); + + if (!fseeko(input->file, 0, SEEK_END)) { + /* Input file is seekable. Figure out how long it is, so we can get + * progress info. + */ + input->length = ftello(input->file); + rewind(input->file); + } + + /* Default to 1:1 pixel aspect ratio. */ + input->pixel_aspect_ratio.numerator = 1; + input->pixel_aspect_ratio.denominator = 1; + + /* For RAW input sources, these bytes will be applied on the first frame + * in read_frame().
+ */ + input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); + input->detect.position = 0; + + if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { + if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, csp, + input->only_i420) >= 0) { + input->file_type = FILE_TYPE_Y4M; + input->width = input->y4m.pic_w; + input->height = input->y4m.pic_h; + input->pixel_aspect_ratio.numerator = input->y4m.par_n; + input->pixel_aspect_ratio.denominator = input->y4m.par_d; + input->framerate.numerator = input->y4m.fps_n; + input->framerate.denominator = input->y4m.fps_d; + input->fmt = input->y4m.aom_fmt; + input->bit_depth = input->y4m.bit_depth; + } else { + fatal("Unsupported Y4M stream."); + } + } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { + fatal("IVF is not supported as input."); + } else { + input->file_type = FILE_TYPE_RAW; + } +} + +// Note: these rate control metrics assume only 1 key frame in the +// sequence (i.e., first frame only). So for temporal pattern# 7 +// (which has key frame for every frame on base layer), the metrics +// computation will be off/wrong. +// TODO(marpan): Update these metrics to account for multiple key frames +// in the stream. +static void set_rate_control_metrics(struct RateControlMetrics *rc, + double framerate, + unsigned int ss_number_layers, + unsigned int ts_number_layers) { + int ts_rate_decimator[AOM_MAX_TS_LAYERS] = { 1 }; + ts_rate_decimator[0] = 1; + if (ts_number_layers == 2) { + ts_rate_decimator[0] = 2; + ts_rate_decimator[1] = 1; + } + if (ts_number_layers == 3) { + ts_rate_decimator[0] = 4; + ts_rate_decimator[1] = 2; + ts_rate_decimator[2] = 1; + } + // Set the layer (cumulative) framerate and the target layer (non-cumulative) + // per-frame-bandwidth, for the rate control encoding stats below. + for (unsigned int sl = 0; sl < ss_number_layers; ++sl) { + unsigned int i = sl * ts_number_layers; + rc->layer_framerate[0] = framerate / ts_rate_decimator[0]; + rc->layer_pfb[i] = + 1000.0 * rc->layer_target_bitrate[i] / rc->layer_framerate[0]; + for (unsigned int tl = 0; tl < ts_number_layers; ++tl) { + i = sl * ts_number_layers + tl; + if (tl > 0) { + rc->layer_framerate[tl] = framerate / ts_rate_decimator[tl]; + rc->layer_pfb[i] = + 1000.0 * + (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) / + (rc->layer_framerate[tl] - rc->layer_framerate[tl - 1]); + } + rc->layer_input_frames[tl] = 0; + rc->layer_enc_frames[tl] = 0; + rc->layer_encoding_bitrate[i] = 0.0; + rc->layer_avg_frame_size[i] = 0.0; + rc->layer_avg_rate_mismatch[i] = 0.0; + } + } + rc->window_count = 0; + rc->window_size = 15; + rc->avg_st_encoding_bitrate = 0.0; + rc->variance_st_encoding_bitrate = 0.0; +} + +static void printout_rate_control_summary(struct RateControlMetrics *rc, + int frame_cnt, + unsigned int ss_number_layers, + unsigned int ts_number_layers) { + int tot_num_frames = 0; + double perc_fluctuation = 0.0; + printf("Total number of processed frames: %d\n\n", frame_cnt - 1); + printf("Rate control layer stats for %d layer(s):\n\n", ts_number_layers); + for (unsigned int sl = 0; sl < ss_number_layers; ++sl) { + tot_num_frames = 0; + for (unsigned int tl = 0; tl < ts_number_layers; ++tl) { + unsigned int i = sl * ts_number_layers + tl; + const int num_dropped = + tl > 0 ? 
rc->layer_input_frames[tl] - rc->layer_enc_frames[tl] + : rc->layer_input_frames[tl] - rc->layer_enc_frames[tl] - 1; + tot_num_frames += rc->layer_input_frames[tl]; + rc->layer_encoding_bitrate[i] = 0.001 * rc->layer_framerate[tl] * + rc->layer_encoding_bitrate[i] / + tot_num_frames; + rc->layer_avg_frame_size[i] = + rc->layer_avg_frame_size[i] / rc->layer_enc_frames[tl]; + rc->layer_avg_rate_mismatch[i] = + 100.0 * rc->layer_avg_rate_mismatch[i] / rc->layer_enc_frames[tl]; + printf("For layer#: %d %d \n", sl, tl); + printf("Bitrate (target vs actual): %d %f\n", rc->layer_target_bitrate[i], + rc->layer_encoding_bitrate[i]); + printf("Average frame size (target vs actual): %f %f\n", rc->layer_pfb[i], + rc->layer_avg_frame_size[i]); + printf("Average rate_mismatch: %f\n", rc->layer_avg_rate_mismatch[i]); + printf( + "Number of input frames, encoded (non-key) frames, " + "and perc dropped frames: %d %d %f\n", + rc->layer_input_frames[tl], rc->layer_enc_frames[tl], + 100.0 * num_dropped / rc->layer_input_frames[tl]); + printf("\n"); + } + } + rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count; + rc->variance_st_encoding_bitrate = + rc->variance_st_encoding_bitrate / rc->window_count - + (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate); + perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) / + rc->avg_st_encoding_bitrate; + printf("Short-time stats, for window of %d frames:\n", rc->window_size); + printf("Average, rms-variance, and percent-fluct: %f %f %f\n", + rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate), + perc_fluctuation); + if (frame_cnt - 1 != tot_num_frames) + die("Error: Number of input frames not equal to output!\n"); +} + +// Layer pattern configuration. +static int set_layer_pattern(int layering_mode, int superframe_cnt, + aom_svc_layer_id_t *layer_id, + aom_svc_ref_frame_config_t *ref_frame_config, + int *use_svc_control, int spatial_layer_id, + int is_key_frame, int ksvc_mode) { + int i; + int shift = (layering_mode == 7) ? 2 : 0; + *use_svc_control = 1; + layer_id->spatial_layer_id = spatial_layer_id; + // Set the reference map buffer idx for the 7 references: + // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3), + // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = i; + for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->reference[i] = 0; + for (i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0; + // Note: these layered patterns only use LAST and GF for prediction in + // non-rd mode (speed >= 7). + int layer_flags = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_ARF | AOM_EFLAG_NO_REF_BWD | + AOM_EFLAG_NO_REF_ARF2; + if (ksvc_mode) { + // Same pattern as case 8. + layering_mode = 8; + if (!is_key_frame) + // No inter-layer prediction on inter-frames. + layer_flags |= AOM_EFLAG_NO_REF_GF; + } + switch (layering_mode) { + case 0: + // 1-layer: update LAST on every frame, reference LAST and GF. + layer_id->temporal_layer_id = 0; + ref_frame_config->refresh[0] = 1; + break; + case 1: + // 2-temporal layer. + // 1 3 5 + // 0 2 4 + if (superframe_cnt % 2 == 0) { + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, reference LAST and GF. + ref_frame_config->refresh[0] = 1; + } else { + layer_id->temporal_layer_id = 1; + // No updates on layer 1, only reference LAST (TL0).
+ layer_flags |= AOM_EFLAG_NO_REF_GF; + } + break; + case 2: + // 3-temporal layer: + // 1 3 5 7 + // 2 6 + // 0 4 8 + if (superframe_cnt % 4 == 0) { + // Base layer. + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, reference LAST and GF. + ref_frame_config->refresh[0] = 1; + } else if ((superframe_cnt - 1) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // First top layer: no updates, only reference LAST (TL0). + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if ((superframe_cnt - 2) % 4 == 0) { + layer_id->temporal_layer_id = 1; + // Middle layer (TL1): update LAST2, only reference LAST (TL0). + ref_frame_config->refresh[1] = 1; + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if ((superframe_cnt - 3) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // Second top layer: no updates, only reference LAST. + // Set buffer idx for LAST to slot 1, since that was the slot + // updated in previous frame. So LAST is TL1 frame. + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[1] = 0; + layer_flags |= AOM_EFLAG_NO_REF_GF; + } + break; + case 3: + // 3-temporal layer: but middle layer updates GF, so 2nd TL2 will + // only reference GF (not LAST). Other frames only reference LAST. + // 1 3 5 7 + // 2 6 + // 0 4 8 + if (superframe_cnt % 4 == 0) { + // Base layer. + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, only reference LAST. + ref_frame_config->refresh[0] = 1; + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if ((superframe_cnt - 1) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // First top layer: no updates, only reference LAST (TL0). + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if ((superframe_cnt - 2) % 4 == 0) { + layer_id->temporal_layer_id = 1; + // Middle layer (TL1): update GF, only reference LAST (TL0). + ref_frame_config->refresh[3] = 1; + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if ((superframe_cnt - 3) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // Second top layer: no updates, only reference GF. + layer_flags |= AOM_EFLAG_NO_REF_LAST; + } + break; + case 4: + // 2-temporal layer with the old update flags, not with the new + // SVC control. + *use_svc_control = 0; + // 1 3 5 + // 0 2 4 + if (superframe_cnt % 2 == 0) { + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, reference LAST and GF. + layer_flags |= AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF; + } else { + layer_id->temporal_layer_id = 1; + // No updates on layer 1, only reference LAST (TL0). + layer_flags |= AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_REF_GF; + } + break; + case 5: + // 2 spatial layers, 1 temporal. + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. + ref_frame_config->refresh[0] = 1; + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1 + // and GOLDEN to slot 0. Update slot 1 (LAST). + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[3] = 0; + ref_frame_config->refresh[1] = 1; + } + break; + case 6: + // 3 spatial layers, 1 temporal. + // Note for this case, we set the buffer idx for all references to be + // either LAST or GOLDEN, which are always valid references, since the + // decoder will check that each of the 7 references has a valid scale in + // valid_ref_frame_size(). + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. Set all buffer_idx to 0.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->refresh[0] = 1; + layer_flags |= AOM_EFLAG_NO_REF_GF; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1 + // and GOLDEN (and all other refs) to slot 0. + // Update slot 1 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->refresh[1] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2 + // and GOLDEN (and all other refs) to slot 1. + // Update slot 2 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[0] = 2; + ref_frame_config->refresh[2] = 1; + } + break; + case 7: + // 3 spatial and 3 temporal layers. + // Same as case 8 but with overlap in the buffer slot updates. + // (shift = 2). The slots 3 and 4 updated by first TL2 are + // reused for update in TL1 superframe. + // Note for this case, frame order hint must be disabled for + // lower resolutions (operating points > 0) to be decodable. + case 8: + // 3 spatial and 3 temporal layers. + // No overlap in buffer updates between TL2 and TL1. + // TL2 updates slot 3 and 4, TL1 updates 5, 6, 7. + // Set the references via the svc_ref_frame_config control. + layer_flags = 0; + // Always reference LAST. + ref_frame_config->reference[0] = 1; + if (superframe_cnt % 4 == 0) { + // Base temporal layer. + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. + // Set all buffer_idx to 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->refresh[0] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 0. + // Update slot 1 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->refresh[1] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 1. + // Update slot 2 (LAST). + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[0] = 2; + ref_frame_config->refresh[2] = 1; + } + } else if ((superframe_cnt - 1) % 4 == 0) { + // First top temporal enhancement layer. + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST (slot 0). + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to slot 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 3. + // Set LAST2 to slot 4 and update slot 4. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 3; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[1] = 4; + ref_frame_config->refresh[4] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 4. + // No update.
+ for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 4; + ref_frame_config->ref_idx[0] = 2; + } + } else if ((superframe_cnt - 2) % 4 == 0) { + // Middle temporal enhancement layer. + layer_id->temporal_layer_id = 1; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST. + // Set all buffer_idx to 0. + // Set GOLDEN to slot 5 and update slot 5. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[3] = 5 - shift; + ref_frame_config->refresh[5 - shift] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 5. + // Set LAST3 to slot 6 and update slot 6. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 5 - shift; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[2] = 6 - shift; + ref_frame_config->refresh[6 - shift] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 6. + // Set LAST3 to slot 7 and update slot 7. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 6 - shift; + ref_frame_config->ref_idx[0] = 2; + ref_frame_config->ref_idx[2] = 7 - shift; + ref_frame_config->refresh[7 - shift] = 1; + } + } else if ((superframe_cnt - 3) % 4 == 0) { + // Second top temporal enhancement layer. + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { + // Set LAST to slot 5 and reference LAST. + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to 0. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 5 - shift; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, + // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 6 - shift; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->ref_idx[1] = 4; + ref_frame_config->refresh[4] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7, + // GOLDEN to slot 4. No update. + for (i = 0; i < INTER_REFS_PER_FRAME; i++) + ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 7 - shift; + ref_frame_config->ref_idx[3] = 4; + } + } + if (layer_id->spatial_layer_id > 0) + ref_frame_config->reference[3] = 1; // Reference GOLDEN. + break; + default: assert(0); die("Error: Unsupported temporal layering mode!\n"); + } + return layer_flags; +} + +int main(int argc, char **argv) { + AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL }; + aom_codec_ctx_t codec; + aom_codec_enc_cfg_t cfg; + int frame_cnt = 0; + aom_image_t raw; + aom_codec_err_t res; + unsigned int width; + unsigned int height; + uint32_t error_resilient = 0; + int speed; + int frame_avail; + int got_data = 0; + int flags = 0; + unsigned i; + int pts = 0; // PTS starts at 0. + int frame_duration = 1; // 1 timebase tick per frame.
+  int layering_mode = 0;
+  aom_svc_layer_id_t layer_id;
+  aom_svc_params_t svc_params;
+  aom_svc_ref_frame_config_t ref_frame_config;
+  const AvxInterface *encoder = NULL;
+  struct AvxInputContext input_ctx;
+  struct RateControlMetrics rc;
+  int64_t cx_time = 0;
+  const int min_args_base = 13;
+  const int min_args = min_args_base;
+  double sum_bitrate = 0.0;
+  double sum_bitrate2 = 0.0;
+  double framerate = 30.0;
+  int use_svc_control = 1;
+  zero(rc.layer_target_bitrate);
+  memset(&layer_id, 0, sizeof(aom_svc_layer_id_t));
+  memset(&input_ctx, 0, sizeof(input_ctx));
+  memset(&svc_params, 0, sizeof(svc_params));
+
+  // Flag to test dynamic scaling of source frames for single
+  // spatial stream, using the scaling_mode control.
+  const int test_dynamic_scaling_single_layer = 0;
+
+  /* Setup default input stream settings */
+  input_ctx.framerate.numerator = 30;
+  input_ctx.framerate.denominator = 1;
+  input_ctx.only_i420 = 1;
+  input_ctx.bit_depth = 0;
+  unsigned int ts_number_layers = 1;
+  unsigned int ss_number_layers = 1;
+  exec_name = argv[0];
+  // Check usage and arguments.
+  if (argc < min_args) {
+    die("Usage: %s <infile> <outfile> <codec_type(av1)> <width> <height> "
+        "<timebase_num> <timebase_den> <speed> <frame_drop_threshold> "
+        "<error_resilient> <threads> <mode> "
+        "<Rate_0> ... <Rate_nlayers-1>\n",
+        argv[0]);
+  }
+
+  encoder = get_aom_encoder_by_name(argv[3]);
+
+  width = (unsigned int)strtoul(argv[4], NULL, 0);
+  height = (unsigned int)strtoul(argv[5], NULL, 0);
+  if (width < 16 || width % 2 || height < 16 || height % 2) {
+    die("Invalid resolution: %d x %d", width, height);
+  }
+
+  layering_mode = (int)strtol(argv[12], NULL, 0);
+  if (layering_mode < 0 || layering_mode > 13) {
+    die("Invalid layering mode (0..13): %s", argv[12]);
+  }
+
+  if (argc != min_args + mode_to_num_layers[layering_mode]) {
+    die("Invalid number of arguments");
+  }
+
+  ts_number_layers = mode_to_num_temporal_layers[layering_mode];
+  ss_number_layers = mode_to_num_spatial_layers[layering_mode];
+
+  input_ctx.filename = argv[1];
+  open_input_file(&input_ctx, 0);
+
+  // Y4M reader has its own allocation.
+  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+    if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, width, height, 32)) {
+      die("Failed to allocate image: %d x %d", width, height);
+    }
+  }
+
+  // Populate encoder configuration.
+  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res) {
+    printf("Failed to get config: %s\n", aom_codec_err_to_string(res));
+    return EXIT_FAILURE;
+  }
+
+  // Update the default configuration with our settings.
+  cfg.g_w = width;
+  cfg.g_h = height;
+
+  // Timebase format, e.g. for 30fps: numerator = 1, denominator = 30.
+  cfg.g_timebase.num = (int)strtol(argv[6], NULL, 0);
+  cfg.g_timebase.den = (int)strtol(argv[7], NULL, 0);
+
+  speed = (int)strtol(argv[8], NULL, 0);
+  if (speed < 0 || speed > 8) {
+    die("Invalid speed setting: must be between 0 and 8");
+  }
+
+  for (i = min_args_base;
+       (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) {
+    rc.layer_target_bitrate[i - 13] = (int)strtol(argv[i], NULL, 0);
+    svc_params.layer_target_bitrate[i - 13] = rc.layer_target_bitrate[i - 13];
+  }
+
+  cfg.rc_target_bitrate =
+      svc_params.layer_target_bitrate[ss_number_layers * ts_number_layers - 1];
+
+  svc_params.framerate_factor[0] = 1;
+  if (ts_number_layers == 2) {
+    svc_params.framerate_factor[0] = 2;
+    svc_params.framerate_factor[1] = 1;
+  } else if (ts_number_layers == 3) {
+    svc_params.framerate_factor[0] = 4;
+    svc_params.framerate_factor[1] = 2;
+    svc_params.framerate_factor[2] = 1;
+  }
+
+  // Real time parameters.
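+  // Note: the block below configures low-latency CBR rate control. Per the
+  // aom_encoder.h documentation, the rc_buf_* sizes are expressed in
+  // milliseconds of buffering, the quantizer is constrained to [2, 52],
+  // and a nonzero rc_dropframe_thresh (ninth command line argument)
+  // allows the encoder to drop frames to stay on rate.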
+  cfg.g_usage = AOM_USAGE_REALTIME;
+
+  cfg.rc_dropframe_thresh = (unsigned int)strtoul(argv[9], NULL, 0);
+  cfg.rc_end_usage = AOM_CBR;
+  cfg.rc_min_quantizer = 2;
+  cfg.rc_max_quantizer = 52;
+  cfg.rc_undershoot_pct = 50;
+  cfg.rc_overshoot_pct = 50;
+  cfg.rc_buf_initial_sz = 600;
+  cfg.rc_buf_optimal_sz = 600;
+  cfg.rc_buf_sz = 1000;
+
+  // Number of threads (command line argument).
+  cfg.g_threads = (unsigned int)strtoul(argv[11], NULL, 0);
+
+  error_resilient = (uint32_t)strtoul(argv[10], NULL, 0);
+  if (error_resilient != 0 && error_resilient != 1) {
+    die("Invalid value for error resilient (0, 1): %d.", error_resilient);
+  }
+  // Set error resilient mode (command line argument).
+  cfg.g_error_resilient = error_resilient;
+  cfg.g_lag_in_frames = 0;
+  cfg.kf_mode = AOM_KF_AUTO;
+
+  // Effectively disable automatic keyframe placement by forcing a fixed,
+  // very long keyframe interval.
+  cfg.kf_min_dist = cfg.kf_max_dist = 3000;
+
+  framerate = cfg.g_timebase.den / cfg.g_timebase.num;
+  set_rate_control_metrics(&rc, framerate, ss_number_layers, ts_number_layers);
+
+  if (input_ctx.file_type == FILE_TYPE_Y4M) {
+    if (input_ctx.width != cfg.g_w || input_ctx.height != cfg.g_h) {
+      die("Incorrect width or height: %d x %d", cfg.g_w, cfg.g_h);
+    }
+    if (input_ctx.framerate.numerator != cfg.g_timebase.den ||
+        input_ctx.framerate.denominator != cfg.g_timebase.num) {
+      die("Incorrect framerate: numerator %d denominator %d",
+          cfg.g_timebase.num, cfg.g_timebase.den);
+    }
+  }
+
+  // Open an output file for each stream.
+  for (unsigned int sl = 0; sl < ss_number_layers; ++sl) {
+    for (unsigned tl = 0; tl < ts_number_layers; ++tl) {
+      i = sl * ts_number_layers + tl;
+      char file_name[PATH_MAX];
+      AvxVideoInfo info;
+      info.codec_fourcc = encoder->fourcc;
+      info.frame_width = cfg.g_w;
+      info.frame_height = cfg.g_h;
+      info.time_base.numerator = cfg.g_timebase.num;
+      info.time_base.denominator = cfg.g_timebase.den;
+
+      snprintf(file_name, sizeof(file_name), "%s_%d.av1", argv[2], i);
+      outfile[i] = aom_video_writer_open(file_name, kContainerIVF, &info);
+      if (!outfile[i]) die("Failed to open %s for writing", file_name);
+      assert(outfile[i] != NULL);
+    }
+  }
+
+  // Initialize codec.
+  if (aom_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  aom_codec_control(&codec, AOME_SET_CPUUSED, speed);
+  aom_codec_control(&codec, AV1E_SET_AQ_MODE, 3);
+  aom_codec_control(&codec, AV1E_SET_GF_CBR_BOOST_PCT, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_CDEF, 1);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_ORDER_HINT, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_TPL_MODEL, 0);
+  aom_codec_control(&codec, AV1E_SET_DELTAQ_MODE, 0);
+
+  svc_params.number_spatial_layers = ss_number_layers;
+  svc_params.number_temporal_layers = ts_number_layers;
+  for (i = 0; i < ss_number_layers * ts_number_layers; ++i) {
+    svc_params.max_quantizers[i] = cfg.rc_max_quantizer;
+    svc_params.min_quantizers[i] = cfg.rc_min_quantizer;
+  }
+  for (i = 0; i < ss_number_layers; ++i) {
+    svc_params.scaling_factor_num[i] = 1;
+    svc_params.scaling_factor_den[i] = 1;
+  }
+  if (ss_number_layers == 2) {
+    svc_params.scaling_factor_num[0] = 1;
+    svc_params.scaling_factor_den[0] = 2;
+  } else if (ss_number_layers == 3) {
+    svc_params.scaling_factor_num[0] = 1;
+    svc_params.scaling_factor_den[0] = 4;
+    svc_params.scaling_factor_num[1] = 1;
+    svc_params.scaling_factor_den[1] = 2;
+  }
+
+  aom_codec_control(&codec, AV1E_SET_SVC_PARAMS, &svc_params);
+
+  // This controls the maximum target size of the key frame.
+ // For generating smaller key frames, use a smaller max_intra_size_pct + // value, like 100 or 200. + { + const int max_intra_size_pct = 300; + aom_codec_control(&codec, AOME_SET_MAX_INTRA_BITRATE_PCT, + max_intra_size_pct); + } + + frame_avail = 1; + while (frame_avail || got_data) { + struct aom_usec_timer timer; + frame_avail = read_frame(&input_ctx, &raw); + int is_key_frame = (frame_cnt % cfg.kf_max_dist) == 0; + // Loop over spatial layers. + for (unsigned int slx = 0; slx < ss_number_layers; slx++) { + aom_codec_iter_t iter = NULL; + const aom_codec_cx_pkt_t *pkt; + int layer = 0; + + // Set the reference/update flags, layer_id, and reference_map + // buffer index. + flags = set_layer_pattern(layering_mode, frame_cnt, &layer_id, + &ref_frame_config, &use_svc_control, slx, + is_key_frame, (layering_mode == 9)); + aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id); + if (use_svc_control) + aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG, + &ref_frame_config); + + layer = slx * ts_number_layers + layer_id.temporal_layer_id; + if (frame_avail && slx == 0) ++rc.layer_input_frames[layer]; + + if (test_dynamic_scaling_single_layer) { + if (frame_cnt >= 200 && frame_cnt <= 400) { + // Scale source down by 2x2. + struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO }; + aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode); + } else { + // Source back up to original resolution (no scaling). + struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL }; + aom_codec_control(&codec, AOME_SET_SCALEMODE, &mode); + } + } + + // Do the layer encode. + aom_usec_timer_start(&timer); + if (aom_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags)) + die_codec(&codec, "Failed to encode frame"); + aom_usec_timer_mark(&timer); + cx_time += aom_usec_timer_elapsed(&timer); + + got_data = 0; + while ((pkt = aom_codec_get_cx_data(&codec, &iter))) { + got_data = 1; + switch (pkt->kind) { + case AOM_CODEC_CX_FRAME_PKT: + for (unsigned int sl = layer_id.spatial_layer_id; + sl < ss_number_layers; ++sl) { + for (unsigned tl = layer_id.temporal_layer_id; + tl < ts_number_layers; ++tl) { + unsigned int j = sl * ts_number_layers + tl; + aom_video_writer_write_frame(outfile[j], pkt->data.frame.buf, + pkt->data.frame.sz, pts); + if (sl == (unsigned int)layer_id.spatial_layer_id) + rc.layer_encoding_bitrate[j] += 8.0 * pkt->data.frame.sz; + // Keep count of rate control stats per layer (for non-key). + if (tl == (unsigned int)layer_id.temporal_layer_id && + sl == (unsigned int)layer_id.spatial_layer_id && + !(pkt->data.frame.flags & AOM_FRAME_IS_KEY)) { + rc.layer_avg_frame_size[j] += 8.0 * pkt->data.frame.sz; + rc.layer_avg_rate_mismatch[j] += + fabs(8.0 * pkt->data.frame.sz - rc.layer_pfb[j]) / + rc.layer_pfb[j]; + if (slx == 0) ++rc.layer_enc_frames[tl]; + } + } + } + + // Update for short-time encoding bitrate states, for moving window + // of size rc->window, shifted by rc->window / 2. + // Ignore first window segment, due to key frame. + // For spatial layers: only do this for top/highest SL. + if (frame_cnt > rc.window_size && slx == ss_number_layers - 1) { + sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate; + rc.window_size = (rc.window_size <= 0) ? 1 : rc.window_size; + if (frame_cnt % rc.window_size == 0) { + rc.window_count += 1; + rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size; + rc.variance_st_encoding_bitrate += + (sum_bitrate / rc.window_size) * + (sum_bitrate / rc.window_size); + sum_bitrate = 0.0; + } + } + // Second shifted window. 
+          if (frame_cnt > rc.window_size + rc.window_size / 2 &&
+              slx == ss_number_layers - 1) {
+            sum_bitrate2 += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+            if (frame_cnt > 2 * rc.window_size &&
+                frame_cnt % rc.window_size == 0) {
+              rc.window_count += 1;
+              rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
+              rc.variance_st_encoding_bitrate +=
+                  (sum_bitrate2 / rc.window_size) *
+                  (sum_bitrate2 / rc.window_size);
+              sum_bitrate2 = 0.0;
+            }
+          }
+          break;
+        default: break;
+      }
+    }
+  }  // loop over spatial layers
+    ++frame_cnt;
+    pts += frame_duration;
+  }
+  close_input_file(&input_ctx);
+  printout_rate_control_summary(&rc, frame_cnt, ss_number_layers,
+                                ts_number_layers);
+  printf("\n");
+  printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f\n",
+         frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000),
+         1000000 * (double)frame_cnt / (double)cx_time);
+
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
+
+  // Try to rewrite the output file headers with the actual frame count.
+  for (i = 0; i < ss_number_layers * ts_number_layers; ++i)
+    aom_video_writer_close(outfile[i]);
+
+  if (input_ctx.file_type != FILE_TYPE_Y4M) {
+    aom_img_free(&raw);
+  }
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/examples/twopass_encoder.c b/libs/libaom/src/examples/twopass_encoder.c
new file mode 100644
index 000000000..a03bc6cc2
--- /dev/null
+++ b/libs/libaom/src/examples/twopass_encoder.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Two Pass Encoder
+// ================
+//
+// This is an example of a two pass encoder loop. It takes an input file in
+// YV12 format, passes it through the encoder twice, and writes the compressed
+// frames to disk in IVF format. It builds upon the simple_encoder example.
+//
+// Twopass Variables
+// -----------------
+// Twopass mode needs to track the current pass number and the buffer of
+// statistics packets.
+//
+// Updating The Configuration
+// --------------------------
+// In two pass mode, the configuration has to be updated on each pass. The
+// statistics buffer is passed on the last pass.
+//
+// Encoding A Frame
+// ----------------
+// Encoding a frame in two pass mode is identical to the simple encoder
+// example.
+//
+// Processing Statistics Packets
+// -----------------------------
+// Each packet of type `AOM_CODEC_STATS_PKT` contains the first pass
+// statistics for a frame and is appended to the stats buffer. On the last
+// pass, packets of type `AOM_CODEC_CX_FRAME_PKT` contain the encoded data
+// for a frame; we write an IVF frame header, followed by the raw data.
+//
+// Pass Progress Reporting
+// -----------------------
+// It's sometimes helpful to see when each pass completes.
+//
+// Clean-up
+// --------
+// Destruction of the encoder instance must be done on each pass. The
+// raw image should be destroyed at the end as usual.
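+//
+// Flow Summary
+// ------------
+// As a rough sketch (using only the calls that appear in main() below),
+// the overall two pass flow is:
+//
+//   cfg.g_pass = AOM_RC_FIRST_PASS;
+//   stats = pass0(&raw, infile, encoder, &cfg, limit);  // gather stats
+//   rewind(infile);
+//   cfg.g_pass = AOM_RC_LAST_PASS;
+//   cfg.rc_twopass_stats_in = stats;  // feed first pass stats back in
+//   pass1(&raw, infile, outfile_arg, encoder, &cfg, limit);
+//   free(stats.buf);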
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_encoder.h"
+#include "common/tools_common.h"
+#include "common/video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s <codec> <width> <height> <infile> <outfile> "
+          "[limit]\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int get_frame_stats(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                           aom_codec_pts_t pts, unsigned int duration,
+                           aom_enc_frame_flags_t flags,
+                           aom_fixed_buf_t *stats) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+  if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to get frame stats.");
+
+  while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == AOM_CODEC_STATS_PKT) {
+      const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
+      const size_t pkt_size = pkt->data.twopass_stats.sz;
+      stats->buf = realloc(stats->buf, stats->sz + pkt_size);
+      memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
+      stats->sz += pkt_size;
+    }
+  }
+
+  return got_pkts;
+}
+
+static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *img,
+                        aom_codec_pts_t pts, unsigned int duration,
+                        aom_enc_frame_flags_t flags, AvxVideoWriter *writer) {
+  int got_pkts = 0;
+  aom_codec_iter_t iter = NULL;
+  const aom_codec_cx_pkt_t *pkt = NULL;
+  const aom_codec_err_t res = aom_codec_encode(ctx, img, pts, duration, flags);
+  if (res != AOM_CODEC_OK) die_codec(ctx, "Failed to encode frame.");
+
+  while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
+    got_pkts = 1;
+    if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & AOM_FRAME_IS_KEY) != 0;
+
+      if (!aom_video_writer_write_frame(writer, pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts))
+        die_codec(ctx, "Failed to write compressed frame.");
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
+    }
+  }
+
+  return got_pkts;
+}
+
+static aom_fixed_buf_t pass0(aom_image_t *raw, FILE *infile,
+                             const AvxInterface *encoder,
+                             const aom_codec_enc_cfg_t *cfg, int limit) {
+  aom_codec_ctx_t codec;
+  int frame_count = 0;
+  aom_fixed_buf_t stats = { NULL, 0 };
+
+  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Calculate frame statistics.
+  while (aom_img_read(raw, infile) && frame_count < limit) {
+    ++frame_count;
+    get_frame_stats(&codec, raw, frame_count, 1, 0, &stats);
+  }
+
+  // Flush encoder.
+  while (get_frame_stats(&codec, NULL, frame_count, 1, 0, &stats)) {
+  }
+
+  printf("Pass 0 complete. Processed %d frames.\n", frame_count);
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  return stats;
+}
+
+static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
+                  const AvxInterface *encoder, const aom_codec_enc_cfg_t *cfg,
+                  int limit) {
+  AvxVideoInfo info = { encoder->fourcc,
+                        cfg->g_w,
+                        cfg->g_h,
+                        { cfg->g_timebase.num, cfg->g_timebase.den },
+                        0 };
+  AvxVideoWriter *writer = NULL;
+  aom_codec_ctx_t codec;
+  int frame_count = 0;
+
+  writer = aom_video_writer_open(outfile_name, kContainerIVF, &info);
+  if (!writer) die("Failed to open %s for writing", outfile_name);
+
+  if (aom_codec_enc_init(&codec, encoder->codec_interface(), cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  // Encode frames.
+  while (aom_img_read(raw, infile) && frame_count < limit) {
+    ++frame_count;
+    encode_frame(&codec, raw, frame_count, 1, 0, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 1, 0, writer)) {
+  }
+
+  printf("\n");
+
+  if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  aom_video_writer_close(writer);
+
+  printf("Pass 1 complete. Processed %d frames.\n", frame_count);
+}
+
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  int w, h;
+  aom_codec_ctx_t codec;
+  aom_codec_enc_cfg_t cfg;
+  aom_image_t raw;
+  aom_codec_err_t res;
+  aom_fixed_buf_t stats;
+
+  const AvxInterface *encoder = NULL;
+  const int fps = 30;       // TODO(dkovalev) add command line argument
+  const int bitrate = 200;  // kbit/s TODO(dkovalev) add command line argument
+  const char *const codec_arg = argv[1];
+  const char *const width_arg = argv[2];
+  const char *const height_arg = argv[3];
+  const char *const infile_arg = argv[4];
+  const char *const outfile_arg = argv[5];
+  int limit = 0;
+  exec_name = argv[0];
+
+  if (argc < 6) die("Invalid number of arguments");
+
+  if (argc > 6) limit = (int)strtol(argv[6], NULL, 0);
+
+  if (limit == 0) limit = 100;
+
+  encoder = get_aom_encoder_by_name(codec_arg);
+  if (!encoder) die("Unsupported codec.");
+
+  w = (int)strtol(width_arg, NULL, 0);
+  h = (int)strtol(height_arg, NULL, 0);
+
+  if (w <= 0 || h <= 0 || (w % 2) != 0 || (h % 2) != 0)
+    die("Invalid frame size: %dx%d", w, h);
+
+  if (!aom_img_alloc(&raw, AOM_IMG_FMT_I420, w, h, 1))
+    die("Failed to allocate image: %dx%d", w, h);
+
+  printf("Using %s\n", aom_codec_iface_name(encoder->codec_interface()));
+
+  // Configuration
+  res = aom_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res) die_codec(&codec, "Failed to get default codec config.");
+
+  cfg.g_w = w;
+  cfg.g_h = h;
+  cfg.g_timebase.num = 1;
+  cfg.g_timebase.den = fps;
+  cfg.rc_target_bitrate = bitrate;
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading", infile_arg);
+
+  // Pass 0
+  cfg.g_pass = AOM_RC_FIRST_PASS;
+  stats = pass0(&raw, infile, encoder, &cfg, limit);
+
+  // Pass 1
+  rewind(infile);
+  cfg.g_pass = AOM_RC_LAST_PASS;
+  cfg.rc_twopass_stats_in = stats;
+  pass1(&raw, infile, outfile_arg, encoder, &cfg, limit);
+  free(stats.buf);
+
+  aom_img_free(&raw);
+  fclose(infile);
+
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/keywords.dox b/libs/libaom/src/keywords.dox
new file mode 100644
index 000000000..56f536890
--- /dev/null
+++ b/libs/libaom/src/keywords.dox
@@ -0,0 +1,51 @@
+/*!\page rfc2119 RFC2119 Keywords
+
+  The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL
+  NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and
+  "OPTIONAL" in this document are to be interpreted as described in
+  RFC 2119.
+
+Specifically, the following definitions are used:
+
+\section MUST
+\anchor REQUIRED
+\anchor SHALL
+  This word, or the terms "REQUIRED" or "SHALL", mean that the
+  definition is an absolute requirement of the specification.
+
+\section MUSTNOT MUST NOT
+\anchor SHALLNOT
+  This phrase, or the phrase "SHALL NOT", mean that the
+  definition is an absolute prohibition of the specification.
+
+\section SHOULD
+\anchor RECOMMENDED
+  This word, or the adjective "RECOMMENDED", mean that there
+  may exist valid reasons in particular circumstances to ignore a
+  particular item, but the full implications must be understood and
+  carefully weighed before choosing a different course.
+ +\section SHOULDNOT SHOULD NOT +\anchor NOTRECOMMENDED + This phrase, or the phrase "NOT RECOMMENDED" mean that + there may exist valid reasons in particular circumstances when the + particular behavior is acceptable or even useful, but the full + implications should be understood and the case carefully weighed + before implementing any behavior described with this label. + +\section MAY +\anchor OPTIONAL + This word, or the adjective "OPTIONAL", mean that an item is + truly optional. One vendor may choose to include the item because a + particular marketplace requires it or because the vendor feels that + it enhances the product while another vendor may omit the same item. + An implementation which does not include a particular option \ref MUST be + prepared to interoperate with another implementation which does + include the option, though perhaps with reduced functionality. In the + same vein an implementation which does include a particular option + \ref MUST be prepared to interoperate with another implementation which + does not include the option (except, of course, for the feature the + option provides.) + + +*/ diff --git a/libs/libaom/src/libs.doxy_template b/libs/libaom/src/libs.doxy_template new file mode 100644 index 000000000..c522e21d3 --- /dev/null +++ b/libs/libaom/src/libs.doxy_template @@ -0,0 +1,1260 @@ +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## + +# Doxyfile 1.5.4 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project +# +# All text after a hash (#) is considered a comment and will be ignored +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" ") + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file that +# follow. The default is UTF-8 which is also the encoding used for all text before +# the first occurrence of this tag. Doxygen uses libiconv (or the iconv built into +# libc) for the transcoding. See http://www.gnu.org/software/libiconv for the list of +# possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = "AOMedia Codec SDK" + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. 
+ +OUTPUT_DIRECTORY = docs + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Finnish, French, German, Greek, Hungarian, +# Italian, Japanese, Japanese-en (Japanese with English messages), Korean, +# Korean-en, Lithuanian, Norwegian, Polish, Portuguese, Romanian, Russian, +# Serbian, Slovak, Slovene, Spanish, Swedish, and Ukrainian. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to java_doc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. 
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a java_doc-style
+# comment as the brief description. If set to NO, the java_doc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 4
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for Java.
+# For instance, namespaces will be presented as packages, qualified scopes
+# will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want to
+# include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string) vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct (or union) is
+# documented as struct with the name of the typedef. So
+# typedef struct type_s {} type_t, will appear in the documentation as a struct
+# with name type_t. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named type_s. This can typically
+# be useful for C code where the coding convention is that all structs are
+# typedef'ed and only the typedef is referenced, never the struct's name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be extracted
+# and appear in the documentation as a namespace called 'anonymous_namespace{file}',
+# where file will be replaced with the base name of the file that contains the anonymous
+# namespace. By default anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = NO
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from the
+# version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = YES
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT =
+
+# This tag can be used to specify the character encoding of the source files that
+# doxygen parses. Internally doxygen uses the UTF-8 encoding, which is also the default
+# input encoding. Doxygen uses libiconv (or the iconv built into libc) for the transcoding.
+# See http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the output.
+# The symbol name can be a fully qualified name, a word, or if the wildcard * is used,
+# a substring. Examples: ANamespace, AClass, AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output. If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO. If you have enabled CALL_GRAPH or CALLER_GRAPH
+# then you must also enable this option. If you don't then doxygen will produce
+# a warning and turn it on anyway
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES (the default)
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES (the default)
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = YES
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code. Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+ +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compressed HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. For this to work a browser that supports +# java_script and DHTML is required (for instance Mozilla 1.0+, Firefox +# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). + +HTML_DYNAMIC_SECTIONS = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. + +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [1..20]) +# that doxygen will group on one line in the generated HTML documentation. + +ENUM_VALUES_PER_LINE = 4 + +# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be +# generated containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# java_script, DHTML, CSS and frames is required (for instance Mozilla 1.0+, +# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are +# probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. 
+
+TREEVIEW_WIDTH = 250
+
+#---------------------------------------------------------------------------
+# configuration options related to the la_te_x output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the la_te_x docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the la_te_x command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for la_te_x. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# la_te_x documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = YES
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = letter
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of la_te_x
+# packages that should be included in the la_te_x output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal la_te_x header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the la_te_x that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode.
+# command to the generated la_te_x files. This will instruct la_te_x to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader. This is useful
+# if you want to understand what is going on. On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS = *.h
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#   TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#   TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
+# powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to
+# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to
+# specify the directory where the mscgen tool resides. If left empty the tool is assumed to
+# be found in the default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = NO
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will
+# generate a call dependency graph for every global function or class method.
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH, SOURCE_BROWSER and HAVE_DOT tags are set to YES then doxygen will
+# generate a caller dependency graph for every global function or class method.
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will show a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the number
+# of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
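+# (Illustration only: a larger cap such as DOT_GRAPH_MAX_NODES = 200 would
+# allow bigger graphs at the cost of readability and dot run time; the value
+# below is the one this configuration actually uses.)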
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, which results in a white background.
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+
+DOT_TRANSPARENT = YES
+
+# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine
+#---------------------------------------------------------------------------
+
+# The SEARCHENGINE tag specifies whether or not a search engine should be
+# used. If set to NO the values of all tags below this one will be ignored.
+
+SEARCHENGINE = NO
diff --git a/libs/libaom/src/mainpage.dox b/libs/libaom/src/mainpage.dox
new file mode 100644
index 000000000..03a299ae1
--- /dev/null
+++ b/libs/libaom/src/mainpage.dox
@@ -0,0 +1,52 @@
+/*!\mainpage AOMedia Codec SDK
+
+  \section main_contents Page Contents
+  - \ref main_intro
+  - \ref main_startpoints
+  - \ref main_support
+
+  \section main_intro Introduction
+  Welcome to the AOMedia Codec SDK. This SDK allows you to integrate your
+  applications with the AOM and AV1 video codecs.
+
+  This distribution of the AOMedia Codec SDK includes the following support:
+
+  \if aom_encoder
+  - \ref aom_encoder
+  \endif
+  \if aom_decoder
+  - \ref aom_decoder
+  \endif
+
+
+  \section main_startpoints Starting Points
+  - Consult the \ref changelog for a complete list of improvements in this
+    release.
+  - \ref readme contains instructions on compiling the sample applications.
+  - Read the \ref usage "usage" for a narrative on codec usage.
+  - Read the \ref samples "sample code" for examples of how to interact with the
+    codec.
+  - \ref codec reference
+  \if encoder
+  - \ref encoder reference
+  \endif
+  \if decoder
+  - \ref decoder reference
+  \endif
+
+  \section main_support Support Options & FAQ
+  The AOMedia project is an open source project supported by its community. For
+  questions about this SDK, please mail the apps-devel@webmproject.org list.
+  To contribute, see http://www.webmproject.org/code/contribute and mail
+  codec-devel@webmproject.org.
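+
+  \section main_example Illustrative Decode Sketch
+  As a quick orientation, the basic decode call sequence looks roughly like
+  the fragment below (error handling omitted; frame_data and frame_size are
+  placeholders, and the \ref decoder reference remains the authoritative
+  description of the API):
+  \code
+  aom_codec_ctx_t codec;
+  aom_codec_dec_init(&codec, aom_codec_av1_dx(), NULL, 0);
+  aom_codec_decode(&codec, frame_data, frame_size, NULL);
+  aom_image_t *img;
+  aom_codec_iter_t iter = NULL;
+  while ((img = aom_codec_get_frame(&codec, &iter)) != NULL) {
+    /* consume the decoded frame in img */
+  }
+  aom_codec_destroy(&codec);
+  \endcode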
+*/
+
+/*!\page changelog CHANGELOG
+  \verbinclude CHANGELOG
+*/
+
+/*!\page readme README.md
+  \include README.md
+*/
+
+/*!\defgroup codecs Supported Codecs */
diff --git a/libs/libaom/src/stats/aomstats.c b/libs/libaom/src/stats/aomstats.c
new file mode 100644
index 000000000..4a15adf02
--- /dev/null
+++ b/libs/libaom/src/stats/aomstats.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "stats/aomstats.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "common/tools_common.h"
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
+  int res;
+  stats->pass = pass;
+
+  if (pass == 0) {
+    stats->file = fopen(fpf, "wb");
+    stats->buf.sz = 0;
+    stats->buf.buf = NULL;
+    res = (stats->file != NULL);
+  } else {
+    size_t nbytes;
+
+    stats->file = fopen(fpf, "rb");
+
+    if (stats->file == NULL) fatal("First-pass stats file does not exist!");
+
+    if (fseek(stats->file, 0, SEEK_END))
+      fatal("First-pass stats file must be seekable!");
+
+    stats->buf.sz = stats->buf_alloc_sz = ftell(stats->file);
+    rewind(stats->file);
+
+    stats->buf.buf = malloc(stats->buf_alloc_sz);
+
+    if (!stats->buf.buf)
+      fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
+            (unsigned long)stats->buf_alloc_sz);
+
+    nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
+    res = (nbytes == stats->buf.sz);
+  }
+
+  return res;
+}
+
+int stats_open_mem(stats_io_t *stats, int pass) {
+  int res;
+  stats->pass = pass;
+
+  if (!pass) {
+    stats->buf.sz = 0;
+    stats->buf_alloc_sz = 64 * 1024;
+    stats->buf.buf = malloc(stats->buf_alloc_sz);
+  }
+
+  stats->buf_ptr = stats->buf.buf;
+  res = (stats->buf.buf != NULL);
+  return res;
+}
+
+void stats_close(stats_io_t *stats, int last_pass) {
+  if (stats->file) {
+    if (stats->pass == last_pass) {
+      free(stats->buf.buf);
+    }
+
+    fclose(stats->file);
+    stats->file = NULL;
+  } else {
+    if (stats->pass == last_pass) free(stats->buf.buf);
+  }
+}
+
+void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
+  if (stats->file) {
+    (void)fwrite(pkt, 1, len, stats->file);
+  } else {
+    if (stats->buf.sz + len > stats->buf_alloc_sz) {
+      size_t new_sz = stats->buf_alloc_sz + 64 * 1024;
+      char *new_ptr = realloc(stats->buf.buf, new_sz);
+
+      if (new_ptr) {
+        stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
+        stats->buf.buf = new_ptr;
+        stats->buf_alloc_sz = new_sz;
+      } else {
+        fatal("Failed to realloc firstpass stats buffer.");
+      }
+    }
+
+    memcpy(stats->buf_ptr, pkt, len);
+    stats->buf.sz += len;
+    stats->buf_ptr += len;
+  }
+}
+
+aom_fixed_buf_t stats_get(stats_io_t *stats) { return stats->buf; }
diff --git a/libs/libaom/src/stats/aomstats.h b/libs/libaom/src/stats/aomstats.h
new file mode 100644
index 000000000..b9c71871a
--- /dev/null
+++ b/libs/libaom/src/stats/aomstats.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_STATS_AOMSTATS_H_
+#define AOM_STATS_AOMSTATS_H_
+
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This structure is used to abstract the different ways of handling
+ * first pass statistics
+ */
+typedef struct {
+  aom_fixed_buf_t buf;
+  int pass;
+  FILE *file;
+  char *buf_ptr;
+  size_t buf_alloc_sz;
+} stats_io_t;
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass);
+int stats_open_mem(stats_io_t *stats, int pass);
+void stats_close(stats_io_t *stats, int last_pass);
+void stats_write(stats_io_t *stats, const void *pkt, size_t len);
+aom_fixed_buf_t stats_get(stats_io_t *stats);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_STATS_AOMSTATS_H_
diff --git a/libs/libaom/src/stats/rate_hist.c b/libs/libaom/src/stats/rate_hist.c
new file mode 100644
index 000000000..71eb78b72
--- /dev/null
+++ b/libs/libaom/src/stats/rate_hist.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "stats/rate_hist.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define RATE_BINS 100
+#define HIST_BAR_MAX 40
+
+struct hist_bucket {
+  int low;
+  int high;
+  int count;
+};
+
+struct rate_hist {
+  int64_t *pts;
+  int *sz;
+  int samples;
+  int frames;
+  struct hist_bucket bucket[RATE_BINS];
+  int total;
+};
+
+struct rate_hist *init_rate_histogram(const aom_codec_enc_cfg_t *cfg,
+                                      const aom_rational_t *fps) {
+  int i;
+  struct rate_hist *hist = malloc(sizeof(*hist));
+
+  // Determine the number of samples in the buffer.
Use the file's framerate + // to determine the number of frames in rc_buf_sz milliseconds, with an + // adjustment (5/4) to account for alt-refs + hist->samples = cfg->rc_buf_sz * 5 / 4 * fps->num / fps->den / 1000; + + // prevent division by zero + if (hist->samples == 0) hist->samples = 1; + + hist->frames = 0; + hist->total = 0; + + hist->pts = calloc(hist->samples, sizeof(*hist->pts)); + hist->sz = calloc(hist->samples, sizeof(*hist->sz)); + for (i = 0; i < RATE_BINS; i++) { + hist->bucket[i].low = INT_MAX; + hist->bucket[i].high = 0; + hist->bucket[i].count = 0; + } + + return hist; +} + +void destroy_rate_histogram(struct rate_hist *hist) { + if (hist) { + free(hist->pts); + free(hist->sz); + free(hist); + } +} + +void update_rate_histogram(struct rate_hist *hist, + const aom_codec_enc_cfg_t *cfg, + const aom_codec_cx_pkt_t *pkt) { + int i; + int64_t then = 0; + int64_t avg_bitrate = 0; + int64_t sum_sz = 0; + const int64_t now = pkt->data.frame.pts * 1000 * + (uint64_t)cfg->g_timebase.num / + (uint64_t)cfg->g_timebase.den; + + int idx = hist->frames++ % hist->samples; + hist->pts[idx] = now; + hist->sz[idx] = (int)pkt->data.frame.sz; + + if (now < cfg->rc_buf_initial_sz) return; + + if (!cfg->rc_target_bitrate) return; + + then = now; + + /* Sum the size over the past rc_buf_sz ms */ + for (i = hist->frames; i > 0 && hist->frames - i < hist->samples; i--) { + const int i_idx = (i - 1) % hist->samples; + + then = hist->pts[i_idx]; + if (now - then > cfg->rc_buf_sz) break; + sum_sz += hist->sz[i_idx]; + } + + if (now == then) return; + + avg_bitrate = sum_sz * 8 * 1000 / (now - then); + idx = (int)(avg_bitrate * (RATE_BINS / 2) / (cfg->rc_target_bitrate * 1000)); + if (idx < 0) idx = 0; + if (idx > RATE_BINS - 1) idx = RATE_BINS - 1; + if (hist->bucket[idx].low > avg_bitrate) + hist->bucket[idx].low = (int)avg_bitrate; + if (hist->bucket[idx].high < avg_bitrate) + hist->bucket[idx].high = (int)avg_bitrate; + hist->bucket[idx].count++; + hist->total++; +} + +static int merge_hist_buckets(struct hist_bucket *bucket, int max_buckets, + int *num_buckets) { + int small_bucket = 0, merge_bucket = INT_MAX, big_bucket = 0; + int buckets = *num_buckets; + int i; + + /* Find the extrema for this list of buckets */ + big_bucket = small_bucket = 0; + for (i = 0; i < buckets; i++) { + if (bucket[i].count < bucket[small_bucket].count) small_bucket = i; + if (bucket[i].count > bucket[big_bucket].count) big_bucket = i; + } + + /* If we have too many buckets, merge the smallest with an adjacent + * bucket. + */ + while (buckets > max_buckets) { + int last_bucket = buckets - 1; + + /* merge the small bucket with an adjacent one. 
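Preferring the neighbor with the smaller count keeps the merged bucket as small as possible, so the shape of the histogram is distorted least.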
*/ + if (small_bucket == 0) + merge_bucket = 1; + else if (small_bucket == last_bucket) + merge_bucket = last_bucket - 1; + else if (bucket[small_bucket - 1].count < bucket[small_bucket + 1].count) + merge_bucket = small_bucket - 1; + else + merge_bucket = small_bucket + 1; + + assert(abs(merge_bucket - small_bucket) <= 1); + assert(small_bucket < buckets); + assert(big_bucket < buckets); + assert(merge_bucket < buckets); + + if (merge_bucket < small_bucket) { + bucket[merge_bucket].high = bucket[small_bucket].high; + bucket[merge_bucket].count += bucket[small_bucket].count; + } else { + bucket[small_bucket].high = bucket[merge_bucket].high; + bucket[small_bucket].count += bucket[merge_bucket].count; + merge_bucket = small_bucket; + } + + assert(bucket[merge_bucket].low != bucket[merge_bucket].high); + + buckets--; + + /* Remove the merge_bucket from the list, and find the new small + * and big buckets while we're at it + */ + big_bucket = small_bucket = 0; + for (i = 0; i < buckets; i++) { + if (i > merge_bucket) bucket[i] = bucket[i + 1]; + + if (bucket[i].count < bucket[small_bucket].count) small_bucket = i; + if (bucket[i].count > bucket[big_bucket].count) big_bucket = i; + } + } + + *num_buckets = buckets; + return bucket[big_bucket].count; +} + +static void show_histogram(const struct hist_bucket *bucket, int buckets, + int total, int scale) { + const char *pat1, *pat2; + int i; + + switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) { + case 1: + case 2: + pat1 = "%4d %2s: "; + pat2 = "%4d-%2d: "; + break; + case 3: + pat1 = "%5d %3s: "; + pat2 = "%5d-%3d: "; + break; + case 4: + pat1 = "%6d %4s: "; + pat2 = "%6d-%4d: "; + break; + case 5: + pat1 = "%7d %5s: "; + pat2 = "%7d-%5d: "; + break; + case 6: + pat1 = "%8d %6s: "; + pat2 = "%8d-%6d: "; + break; + case 7: + pat1 = "%9d %7s: "; + pat2 = "%9d-%7d: "; + break; + default: + pat1 = "%12d %10s: "; + pat2 = "%12d-%10d: "; + break; + } + + for (i = 0; i < buckets; i++) { + int len; + int j; + float pct; + + pct = (float)(100.0 * bucket[i].count / total); + len = HIST_BAR_MAX * bucket[i].count / scale; + if (len < 1) len = 1; + assert(len <= HIST_BAR_MAX); + + if (bucket[i].low == bucket[i].high) + fprintf(stderr, pat1, bucket[i].low, ""); + else + fprintf(stderr, pat2, bucket[i].low, bucket[i].high); + + for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? 
"=" : " "); + fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct); + } +} + +void show_q_histogram(const int counts[64], int max_buckets) { + struct hist_bucket bucket[64]; + int buckets = 0; + int total = 0; + int scale; + int i; + + for (i = 0; i < 64; i++) { + if (counts[i]) { + bucket[buckets].low = bucket[buckets].high = i; + bucket[buckets].count = counts[i]; + buckets++; + total += counts[i]; + } + } + + fprintf(stderr, "\nQuantizer Selection:\n"); + scale = merge_hist_buckets(bucket, max_buckets, &buckets); + show_histogram(bucket, buckets, total, scale); +} + +void show_rate_histogram(struct rate_hist *hist, const aom_codec_enc_cfg_t *cfg, + int max_buckets) { + int i, scale; + int buckets = 0; + + for (i = 0; i < RATE_BINS; i++) { + if (hist->bucket[i].low == INT_MAX) continue; + hist->bucket[buckets++] = hist->bucket[i]; + } + + fprintf(stderr, "\nRate (over %dms window):\n", cfg->rc_buf_sz); + scale = merge_hist_buckets(hist->bucket, max_buckets, &buckets); + show_histogram(hist->bucket, buckets, hist->total, scale); +} diff --git a/libs/libaom/src/stats/rate_hist.h b/libs/libaom/src/stats/rate_hist.h new file mode 100644 index 000000000..55b8c5d43 --- /dev/null +++ b/libs/libaom/src/stats/rate_hist.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_STATS_RATE_HIST_H_ +#define AOM_STATS_RATE_HIST_H_ + +#include "aom/aom_encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct rate_hist; + +struct rate_hist *init_rate_histogram(const aom_codec_enc_cfg_t *cfg, + const aom_rational_t *fps); + +void destroy_rate_histogram(struct rate_hist *hist); + +void update_rate_histogram(struct rate_hist *hist, + const aom_codec_enc_cfg_t *cfg, + const aom_codec_cx_pkt_t *pkt); + +void show_q_histogram(const int counts[64], int max_buckets); + +void show_rate_histogram(struct rate_hist *hist, const aom_codec_enc_cfg_t *cfg, + int max_buckets); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_STATS_RATE_HIST_H_ diff --git a/libs/libaom/src/test/accounting_test.cc b/libs/libaom/src/test/accounting_test.cc new file mode 100644 index 000000000..8b5c8af13 --- /dev/null +++ b/libs/libaom/src/test/accounting_test.cc @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitwriter.h"
+
+using libaom_test::ACMRandom;
+
+TEST(AV1, TestAccounting) {
+  const int kBufferSize = 10000;
+  const int kSymbols = 1024;
+  aom_writer bw;
+  uint8_t bw_buffer[kBufferSize];
+  aom_start_encode(&bw, bw_buffer);
+  for (int i = 0; i < kSymbols; i++) {
+    aom_write(&bw, 0, 32);
+    aom_write(&bw, 0, 32);
+    aom_write(&bw, 0, 32);
+  }
+  aom_stop_encode(&bw);
+  aom_reader br;
+  aom_reader_init(&br, bw_buffer, bw.pos);
+
+  Accounting accounting;
+  aom_accounting_init(&accounting);
+  br.accounting = &accounting;
+  for (int i = 0; i < kSymbols; i++) {
+    aom_read(&br, 32, "A");
+  }
+  // Consecutive symbols that are the same are coalesced.
+  GTEST_ASSERT_EQ(accounting.syms.num_syms, 1);
+  GTEST_ASSERT_EQ(accounting.syms.syms[0].samples, (unsigned int)kSymbols);
+
+  aom_accounting_reset(&accounting);
+  GTEST_ASSERT_EQ(accounting.syms.num_syms, 0);
+
+  // Should record 2 * kSymbols accounting symbols.
+  aom_reader_init(&br, bw_buffer, bw.pos);
+  br.accounting = &accounting;
+  for (int i = 0; i < kSymbols; i++) {
+    aom_read(&br, 32, "A");
+    aom_read(&br, 32, "B");
+    aom_read(&br, 32, "B");
+  }
+  GTEST_ASSERT_EQ(accounting.syms.num_syms, kSymbols * 2);
+  uint32_t tell_frac = aom_reader_tell_frac(&br);
+  for (int i = 0; i < accounting.syms.num_syms; i++) {
+    tell_frac -= accounting.syms.syms[i].bits;
+  }
+  GTEST_ASSERT_EQ(tell_frac, 0U);
+
+  GTEST_ASSERT_EQ(aom_accounting_dictionary_lookup(&accounting, "A"),
+                  aom_accounting_dictionary_lookup(&accounting, "A"));
+
+  // Check for collisions. The current aom_accounting_hash function returns
+  // the same hash code for AB and BA.
+  GTEST_ASSERT_NE(aom_accounting_dictionary_lookup(&accounting, "AB"),
+                  aom_accounting_dictionary_lookup(&accounting, "BA"));
+}
diff --git a/libs/libaom/src/test/acm_random.h b/libs/libaom/src/test/acm_random.h
new file mode 100644
index 000000000..8b1d51aef
--- /dev/null
+++ b/libs/libaom/src/test/acm_random.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_ACM_RANDOM_H_
+#define AOM_TEST_ACM_RANDOM_H_
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aom_integer.h"
+
+namespace libaom_test {
+
+class ACMRandom {
+ public:
+  ACMRandom() : random_(DeterministicSeed()) {}
+
+  explicit ACMRandom(int seed) : random_(seed) {}
+
+  void Reset(int seed) { random_.Reseed(seed); }
+
+  // Generates a random 31-bit unsigned integer from [0, 2^31).
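+  // (testing::internal::Random::kMaxRange is 1u << 31, so the Generate()
+  // call below already spans the full 31-bit range.)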
+  uint32_t Rand31(void) {
+    return random_.Generate(testing::internal::Random::kMaxRange);
+  }
+
+  uint16_t Rand16(void) {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    return (value >> 15) & 0xffff;
+  }
+
+  int16_t Rand15Signed(void) {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    return (value >> 17) & 0xffff;
+  }
+
+  uint16_t Rand12(void) {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    // There's a bit more entropy in the upper bits of this implementation.
+    return (value >> 19) & 0xfff;
+  }
+
+  int16_t Rand9Signed(void) {
+    // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
+    const uint32_t value = random_.Generate(512);
+    return static_cast<int16_t>(value) - 256;
+  }
+
+  uint8_t Rand8(void) {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    // There's a bit more entropy in the upper bits of this implementation.
+    return (value >> 23) & 0xff;
+  }
+
+  uint8_t Rand8Extremes(void) {
+    // Returns a random value near 0 or near 255, to better exercise
+    // saturation behavior.
+    const uint8_t r = Rand8();
+    return static_cast<uint8_t>((r < 128) ? r << 4 : r >> 4);
+  }
+
+  int PseudoUniform(int range) { return random_.Generate(range); }
+
+  int operator()(int n) { return PseudoUniform(n); }
+
+  static int DeterministicSeed(void) { return 0xbaba; }
+
+ private:
+  testing::internal::Random random_;
+};
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_ACM_RANDOM_H_
diff --git a/libs/libaom/src/test/active_map_test.cc b/libs/libaom/src/test/active_map_test.cc
new file mode 100644
index 000000000..0f8a7329e
--- /dev/null
+++ b/libs/libaom/src/test/active_map_test.cc
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class ActiveMapTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  static const int kWidth = 208;
+  static const int kHeight = 144;
+
+  ActiveMapTest() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~ActiveMapTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    cpu_used_ = GET_PARAM(2);
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+    } else if (video->frame() == 3) {
+      aom_active_map_t map = aom_active_map_t();
+      /* clang-format off */
+      uint8_t active_map[9 * 13] = {
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
+        0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
+        0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
+        0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
+        1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
+      };
+      /* clang-format on */
+      map.cols = (kWidth + 15) / 16;
+      map.rows = (kHeight + 15) / 16;
+      ASSERT_EQ(map.cols, 13u);
+      ASSERT_EQ(map.rows, 9u);
+      map.active_map = active_map;
+      encoder->Control(AOME_SET_ACTIVEMAP, &map);
+    } else if (video->frame() == 15) {
+      aom_active_map_t map = aom_active_map_t();
+      map.cols = (kWidth + 15) / 16;
+      map.rows = (kHeight + 15) / 16;
+      map.active_map = NULL;
+      encoder->Control(AOME_SET_ACTIVEMAP, &map);
+    }
+  }
+
+  void DoTest() {
+    // Validate that this non multiple of 64 wide clip encodes
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_target_bitrate = 400;
+    cfg_.rc_resize_mode = 0;
+    cfg_.g_pass = AOM_RC_ONE_PASS;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.kf_max_dist = 90000;
+    ::libaom_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 30,
+                                         1, 0, 20);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  int cpu_used_;
+};
+
+TEST_P(ActiveMapTest, Test) { DoTest(); }
+
+class ActiveMapTestLarge : public ActiveMapTest {};
+
+TEST_P(ActiveMapTestLarge, Test) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(ActiveMapTestLarge,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(0, 5));
+
+AV1_INSTANTIATE_TEST_CASE(ActiveMapTest,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(5, 9));
+
+}  // namespace
diff --git a/libs/libaom/src/test/altref_test.cc b/libs/libaom/src/test/altref_test.cc
new file mode 100644
index 000000000..43df39fb6
--- /dev/null
+++ b/libs/libaom/src/test/altref_test.cc
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+namespace {
+
+class AltRefForcedKeyTestLarge
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AltRefForcedKeyTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {}
+  virtual ~AltRefForcedKeyTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_threads = 0;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+#if CONFIG_AV1_ENCODER
+      // override test default for tile columns if necessary.
+      if (GET_PARAM(0) == &libaom_test::kAV1) {
+        encoder->Control(AV1E_SET_TILE_COLUMNS, 6);
+      }
+#endif
+    }
+    frame_flags_ =
+        (video->frame() == forced_kf_frame_num_) ? AOM_EFLAG_FORCE_KF : 0;
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    if (frame_num_ == forced_kf_frame_num_) {
+      ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY,
+                static_cast<aom_codec_frame_flags_t>(AOM_FRAME_IS_KEY))
+          << "Frame #" << frame_num_ << " isn't a keyframe!";
+    }
+    ++frame_num_;
+  }
+
+  ::libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  unsigned int forced_kf_frame_num_;
+  unsigned int frame_num_;
+};
+
+TEST_P(AltRefForcedKeyTestLarge, Frame1IsKey) {
+  const aom_rational timebase = { 1, 30 };
+  const int lag_values[] = { 3, 15, 25, -1 };
+
+  forced_kf_frame_num_ = 1;
+  for (int i = 0; lag_values[i] != -1; ++i) {
+    frame_num_ = 0;
+    cfg_.g_lag_in_frames = lag_values[i];
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       timebase.den, timebase.num, 0, 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+}
+
+TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) {
+  const aom_rational timebase = { 1, 30 };
+  const int lag_values[] = { 3, 15, 25, -1 };
+
+  for (int i = 0; lag_values[i] != -1; ++i) {
+    frame_num_ = 0;
+    forced_kf_frame_num_ = lag_values[i] - 1;
+    cfg_.g_lag_in_frames = lag_values[i];
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       timebase.den, timebase.num, 0, 30);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+}
+
+AV1_INSTANTIATE_TEST_CASE(AltRefForcedKeyTestLarge,
+                          ::testing::Values(::libaom_test::kOnePassGood),
+                          ::testing::Values(2, 5));
+
+}  // namespace
diff --git a/libs/libaom/src/test/aom_integer_test.cc b/libs/libaom/src/test/aom_integer_test.cc
new file mode 100644
index 000000000..d5dfad946
--- /dev/null
+++ b/libs/libaom/src/test/aom_integer_test.cc
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "aom/aom_integer.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { +const uint64_t kMaximumLeb128CodedSize = 8; +const uint8_t kLeb128PadByte = 0x80; // Binary: 10000000 +const uint64_t kMaximumLeb128Value = UINT32_MAX; +const uint32_t kSizeTestNumValues = 6; +const uint32_t kSizeTestExpectedSizes[kSizeTestNumValues] = { + 1, 1, 2, 3, 4, 5 +}; +const uint64_t kSizeTestInputs[kSizeTestNumValues] = { 0, 0x7f, + 0x3fff, 0x1fffff, + 0xffffff, 0x10000000 }; + +const uint8_t kOutOfRangeLeb128Value[5] = { 0x80, 0x80, 0x80, 0x80, + 0x10 }; // UINT32_MAX + 1 +} // namespace + +TEST(AomLeb128, DecodeTest) { + const size_t num_leb128_bytes = 3; + const uint8_t leb128_bytes[num_leb128_bytes] = { 0xE5, 0x8E, 0x26 }; + const uint64_t expected_value = 0x98765; // 624485 + const size_t expected_length = 3; + uint64_t value = ~0ULL; // make sure value is cleared by the function + size_t length; + ASSERT_EQ( + aom_uleb_decode(&leb128_bytes[0], num_leb128_bytes, &value, &length), 0); + ASSERT_EQ(expected_value, value); + ASSERT_EQ(expected_length, length); + + // Make sure the decoder stops on the last marked LEB128 byte. + aom_uleb_decode(&leb128_bytes[0], num_leb128_bytes + 1, &value, &length); + ASSERT_EQ(expected_value, value); + ASSERT_EQ(expected_length, length); +} + +TEST(AomLeb128, EncodeTest) { + const uint32_t test_value = 0x98765; // 624485 + const uint8_t expected_bytes[3] = { 0xE5, 0x8E, 0x26 }; + const size_t kWriteBufferSize = 4; + uint8_t write_buffer[kWriteBufferSize] = { 0 }; + size_t bytes_written = 0; + ASSERT_EQ(aom_uleb_encode(test_value, kWriteBufferSize, &write_buffer[0], + &bytes_written), + 0); + ASSERT_EQ(bytes_written, 3u); + for (size_t i = 0; i < bytes_written; ++i) { + ASSERT_EQ(write_buffer[i], expected_bytes[i]); + } +} + +TEST(AomLeb128, EncodeDecodeTest) { + const uint32_t value = 0x98765; // 624485 + const size_t kWriteBufferSize = 4; + uint8_t write_buffer[kWriteBufferSize] = { 0 }; + size_t bytes_written = 0; + ASSERT_EQ(aom_uleb_encode(value, kWriteBufferSize, &write_buffer[0], + &bytes_written), + 0); + ASSERT_EQ(bytes_written, 3u); + uint64_t decoded_value; + size_t decoded_length; + aom_uleb_decode(&write_buffer[0], bytes_written, &decoded_value, + &decoded_length); + ASSERT_EQ(value, decoded_value); + ASSERT_EQ(bytes_written, decoded_length); +} + +TEST(AomLeb128, FixedSizeEncodeTest) { + const uint32_t test_value = 0x123; + const uint8_t expected_bytes[4] = { 0xa3, 0x82, 0x80, 0x00 }; + const size_t kWriteBufferSize = 4; + uint8_t write_buffer[kWriteBufferSize] = { 0 }; + size_t bytes_written = 0; + ASSERT_EQ(0, aom_uleb_encode_fixed_size(test_value, kWriteBufferSize, + kWriteBufferSize, &write_buffer[0], + &bytes_written)); + ASSERT_EQ(kWriteBufferSize, bytes_written); + for (size_t i = 0; i < bytes_written; ++i) { + ASSERT_EQ(write_buffer[i], expected_bytes[i]); + } +} + +TEST(AomLeb128, FixedSizeEncodeDecodeTest) { + const uint32_t value = 0x1; + const size_t kWriteBufferSize = 4; + uint8_t write_buffer[kWriteBufferSize] = { 0 }; + size_t bytes_written = 0; + ASSERT_EQ( + aom_uleb_encode_fixed_size(value, kWriteBufferSize, kWriteBufferSize, + &write_buffer[0], &bytes_written), + 0); + ASSERT_EQ(bytes_written, 4u); + uint64_t decoded_value; + size_t decoded_length; + aom_uleb_decode(&write_buffer[0], bytes_written, &decoded_value, + &decoded_length); + ASSERT_EQ(value, decoded_value); + ASSERT_EQ(bytes_written, decoded_length); +} + +TEST(AomLeb128, SizeTest) { + for (size_t i = 0; i < kSizeTestNumValues; 
++i) { + ASSERT_EQ(kSizeTestExpectedSizes[i], + aom_uleb_size_in_bytes(kSizeTestInputs[i])); + } +} + +TEST(AomLeb128, DecodeFailTest) { + // Input buffer containing what would be a valid 9 byte LEB128 encoded + // unsigned integer. + const uint8_t kAllPadBytesBuffer[kMaximumLeb128CodedSize + 1] = { + kLeb128PadByte, kLeb128PadByte, kLeb128PadByte, + kLeb128PadByte, kLeb128PadByte, kLeb128PadByte, + kLeb128PadByte, kLeb128PadByte, 0 + }; + uint64_t decoded_value; + + // Test that decode fails when result would be valid 9 byte integer. + ASSERT_EQ(aom_uleb_decode(&kAllPadBytesBuffer[0], kMaximumLeb128CodedSize + 1, + &decoded_value, NULL), + -1); + + // Test that encoded value missing terminator byte within available buffer + // range causes decode error. + ASSERT_EQ(aom_uleb_decode(&kAllPadBytesBuffer[0], kMaximumLeb128CodedSize, + &decoded_value, NULL), + -1); + + // Test that LEB128 input that decodes to a value larger than 32-bits fails. + size_t value_size = 0; + ASSERT_EQ(aom_uleb_decode(&kOutOfRangeLeb128Value[0], + sizeof(kOutOfRangeLeb128Value), &decoded_value, + &value_size), + -1); +} + +TEST(AomLeb128, EncodeFailTest) { + const size_t kWriteBufferSize = 4; + const uint32_t kValidTestValue = 1; + uint8_t write_buffer[kWriteBufferSize] = { 0 }; + size_t coded_size = 0; + ASSERT_EQ( + aom_uleb_encode(kValidTestValue, kWriteBufferSize, NULL, &coded_size), + -1); + ASSERT_EQ(aom_uleb_encode(kValidTestValue, kWriteBufferSize, &write_buffer[0], + NULL), + -1); + + const uint32_t kValueOutOfRangeForBuffer = 0xFFFFFFFF; + ASSERT_EQ(aom_uleb_encode(kValueOutOfRangeForBuffer, kWriteBufferSize, + &write_buffer[0], &coded_size), + -1); + + const uint64_t kValueOutOfRange = kMaximumLeb128Value + 1; + ASSERT_EQ(aom_uleb_encode(kValueOutOfRange, kWriteBufferSize, + &write_buffer[0], &coded_size), + -1); + + const size_t kPadSizeOutOfRange = 5; + ASSERT_EQ(aom_uleb_encode_fixed_size(kValidTestValue, kWriteBufferSize, + kPadSizeOutOfRange, &write_buffer[0], + &coded_size), + -1); +} diff --git a/libs/libaom/src/test/aomcx_set_ref.sh b/libs/libaom/src/test/aomcx_set_ref.sh new file mode 100644 index 000000000..f51b73c58 --- /dev/null +++ b/libs/libaom/src/test/aomcx_set_ref.sh @@ -0,0 +1,58 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom aom_cx_set_ref example. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to aom_cx_set_ref_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +aom_cx_set_ref_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi +} + +# Runs aom_cx_set_ref and updates the reference frame before encoding frame 90. +# $1 is the codec name, which aom_cx_set_ref does not support at present: It's +# currently used only to name the output file. +# TODO(tomfinegan): Pass the codec param once the example is updated to support +# AV1. 
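+# Illustrative call (the test below invokes it the same way):
+#   aom_set_ref av1
+# which encodes ${YUV_RAW_INPUT} and writes
+# ${AOM_TEST_OUTPUT_DIR}/aom_cx_set_ref_av1.ivf.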
+aom_set_ref() {
+  local encoder="${LIBAOM_BIN_PATH}/aom_cx_set_ref${AOM_TEST_EXE_SUFFIX}"
+  local codec="$1"
+  local output_file="${AOM_TEST_OUTPUT_DIR}/aom_cx_set_ref_${codec}.ivf"
+  local ref_frame_num=4
+  local limit=10
+  if [ ! -x "${encoder}" ]; then
+    elog "${encoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+      "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \
+      "${ref_frame_num}" "${limit}" ${devnull}
+
+  [ -e "${output_file}" ] || return 1
+}
+
+aom_cx_set_ref_av1() {
+  if [ "$(av1_encode_available)" = "yes" ]; then
+    aom_set_ref av1 || return 1
+  fi
+}
+
+aom_cx_set_ref_tests="aom_cx_set_ref_av1"
+
+run_tests aom_cx_set_ref_verify_environment "${aom_cx_set_ref_tests}"
+
diff --git a/libs/libaom/src/test/aomdec.sh b/libs/libaom/src/test/aomdec.sh
new file mode 100644
index 000000000..927142287
--- /dev/null
+++ b/libs/libaom/src/test/aomdec.sh
@@ -0,0 +1,147 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file tests aomdec. To add new tests to this file, do the following:
+##   1. Write a shell function (this is your test).
+##   2. Add the function to aomdec_tests (on a new line).
+##
+. $(dirname $0)/tools_common.sh
+
+# Environment check: Make sure input is available.
+aomdec_verify_environment() {
+  if [ "$(av1_encode_available)" != "yes" ] ; then
+    if [ ! -e "${AV1_IVF_FILE}" ] || \
+       [ ! -e "${AV1_OBU_ANNEXB_FILE}" ] || \
+       [ ! -e "${AV1_OBU_SEC5_FILE}" ] || \
+       [ ! -e "${AV1_WEBM_FILE}" ]; then
+      elog "Libaom test data must exist before running this test script when " \
+           " encoding is disabled. "
+      return 1
+    fi
+  fi
+  if [ -z "$(aom_tool_path aomdec)" ]; then
+    elog "aomdec not found. It must exist in LIBAOM_BIN_PATH or its parent."
+    return 1
+  fi
+}
+
+# Wrapper function for running aomdec with pipe input. Requires that
+# LIBAOM_BIN_PATH points to the directory containing aomdec. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to aomdec.
+aomdec_pipe() {
+  local input="$1"
+  shift
+  if [ ! -e "${input}" ]; then
+    elog "Input file ($input) missing in aomdec_pipe()"
+    return 1
+  fi
+  cat "${input}" | aomdec - "$@" ${devnull}
+}
+
+
+# Wrapper function for running aomdec. Requires that LIBAOM_BIN_PATH points to
+# the directory containing aomdec. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to aomdec.
+aomdec() {
+  local decoder="$(aom_tool_path aomdec)"
+  local input="$1"
+  shift
+  eval "${AOM_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
+}
+
+aomdec_can_decode_av1() {
+  if [ "$(av1_decode_available)" = "yes" ]; then
+    echo yes
+  fi
+}
+
+aomdec_av1_ivf() {
+  if [ "$(aomdec_can_decode_av1)" = "yes" ]; then
+    local file="${AV1_IVF_FILE}"
+    if [ !
-e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" --ivf + fi + aomdec "${AV1_IVF_FILE}" --summary --noblit + fi +} + +aomdec_av1_ivf_error_resilient() { + if [ "$(aomdec_can_decode_av1)" = "yes" ]; then + local file="av1.error-resilient.ivf" + if [ ! -e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" --ivf --error-resilient=1 + fi + aomdec "${file}" --summary --noblit + fi +} + +aomdec_av1_ivf_multithread() { + if [ "$(aomdec_can_decode_av1)" = "yes" ]; then + local file="${AV1_IVF_FILE}" + if [ ! -e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" --ivf + fi + for threads in 2 3 4 5 6 7 8; do + aomdec "${file}" --summary --noblit --threads=$threads + done + fi +} + +aomdec_aom_ivf_pipe_input() { + if [ "$(aomdec_can_decode_av1)" = "yes" ]; then + local file="${AV1_IVF_FILE}" + if [ ! -e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" --ivf + fi + aomdec_pipe "${AV1_IVF_FILE}" --summary --noblit + fi +} + +aomdec_av1_obu_annexb() { + if [ "$(aomdec_can_decode_av1)" = "yes" ]; then + local file="${AV1_OBU_ANNEXB_FILE}" + if [ ! -e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" --obu --annexb=1 + fi + aomdec "${file}" --summary --noblit --annexb + fi +} + +aomdec_av1_obu_section5() { + if [ "$(aomdec_can_decode_av1)" = "yes" ]; then + local file="${AV1_OBU_SEC5_FILE}" + if [ ! -e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" --obu + fi + aomdec "${file}" --summary --noblit + fi +} + +aomdec_av1_webm() { + if [ "$(aomdec_can_decode_av1)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local file="${AV1_WEBM_FILE}" + if [ ! -e "${file}" ]; then + encode_yuv_raw_input_av1 "${file}" + fi + aomdec "${AV1_WEBM_FILE}" --summary --noblit + fi +} + +aomdec_tests="aomdec_av1_ivf + aomdec_av1_ivf_error_resilient + aomdec_av1_ivf_multithread + aomdec_aom_ivf_pipe_input + aomdec_av1_obu_annexb + aomdec_av1_obu_section5 + aomdec_av1_webm" + +run_tests aomdec_verify_environment "${aomdec_tests}" diff --git a/libs/libaom/src/test/aomenc.sh b/libs/libaom/src/test/aomenc.sh new file mode 100644 index 000000000..b030397a3 --- /dev/null +++ b/libs/libaom/src/test/aomenc.sh @@ -0,0 +1,269 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests aomenc using hantro_collage_w352h288.yuv as input. To add +## new tests to this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to aomenc_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available. +aomenc_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi + if [ "$(aomenc_can_encode_av1)" = "yes" ]; then + if [ ! -e "${Y4M_NOSQ_PAR_INPUT}" ]; then + elog "The file ${Y4M_NOSQ_PAR_INPUT##*/} must exist in" + elog "LIBAOM_TEST_DATA_PATH." + return 1 + fi + fi + if [ -z "$(aom_tool_path aomenc)" ]; then + elog "aomenc not found. It must exist in LIBAOM_BIN_PATH or its parent." 
+ return 1 + fi +} + +aomenc_can_encode_av1() { + if [ "$(av1_encode_available)" = "yes" ]; then + echo yes + fi +} + +# Utilities that echo aomenc input file parameters. +y4m_input_non_square_par() { + echo "${Y4M_NOSQ_PAR_INPUT}" +} + +y4m_input_720p() { + echo "${Y4M_720P_INPUT}" +} + +# Wrapper function for running aomenc with pipe input. Requires that +# LIBAOM_BIN_PATH points to the directory containing aomenc. $1 is used as the +# input file path and shifted away. All remaining parameters are passed through +# to aomenc. +aomenc_pipe() { + local encoder="$(aom_tool_path aomenc)" + local input="$1" + shift + cat "${input}" | eval "${AOM_TEST_PREFIX}" "${encoder}" - \ + --test-decode=fatal \ + "$@" ${devnull} +} + +# Wrapper function for running aomenc. Requires that LIBAOM_BIN_PATH points to +# the directory containing aomenc. $1 is used as the input file path and +# shifted away. All remaining parameters are passed through to aomenc. +aomenc() { + local encoder="$(aom_tool_path aomenc)" + local input="$1" + shift + eval "${AOM_TEST_PREFIX}" "${encoder}" "${input}" \ + --test-decode=fatal \ + "$@" ${devnull} +} + +aomenc_av1_ivf() { + if [ "$(aomenc_can_encode_av1)" = "yes" ]; then + local output="${AV1_IVF_FILE}" + if [ -e "${AV1_IVF_FILE}" ]; then + output="${AOM_TEST_OUTPUT_DIR}/av1_test.ivf" + fi + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --ivf \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_obu_annexb() { + if [ "$(aomenc_can_encode_av1)" = "yes" ]; then + local output="${AV1_OBU_ANNEXB_FILE}" + if [ -e "${AV1_OBU_ANNEXB_FILE}" ]; then + output="${AOM_TEST_OUTPUT_DIR}/av1_test.annexb.obu" + fi + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --obu \ + --annexb=1 \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_obu_section5() { + if [ "$(aomenc_can_encode_av1)" = "yes" ]; then + local output="${AV1_OBU_SEC5_FILE}" + if [ -e "${AV1_OBU_SEC5_FILE}" ]; then + output="${AOM_TEST_OUTPUT_DIR}/av1_test.section5.obu" + fi + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --obu \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_webm() { + if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${AV1_WEBM_FILE}" + if [ -e "${AV1_WEBM_FILE}" ]; then + output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm" + fi + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_webm_1pass() { + if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${AOM_TEST_OUTPUT_DIR}/av1_test.webm" + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --passes=1 \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_ivf_lossless() { + if [ "$(aomenc_can_encode_av1)" = "yes" ]; then + local output="${AOM_TEST_OUTPUT_DIR}/av1_lossless.ivf" + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --ivf \ + --output="${output}" \ + --lossless=1 + + if [ !
-e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_ivf_minq0_maxq0() { + if [ "$(aomenc_can_encode_av1)" = "yes" ]; then + local output="${AOM_TEST_OUTPUT_DIR}/av1_lossless_minq0_maxq0.ivf" + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --ivf \ + --output="${output}" \ + --min-q=0 \ + --max-q=0 + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_webm_lag5_frames10() { + if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local lag_total_frames=10 + local lag_frames=5 + local output="${AOM_TEST_OUTPUT_DIR}/av1_lag5_frames10.webm" + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --limit=${lag_total_frames} \ + --lag-in-frames=${lag_frames} \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +# TODO(fgalligan): Test that DisplayWidth is different than video width. +aomenc_av1_webm_non_square_par() { + if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local output="${AOM_TEST_OUTPUT_DIR}/av1_non_square_par.webm" + aomenc $(y4m_input_non_square_par) \ + $(aomenc_encode_test_fast_params) \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +aomenc_av1_webm_cdf_update_mode() { + if [ "$(aomenc_can_encode_av1)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + for mode in 0 1 2; do + local output="${AOM_TEST_OUTPUT_DIR}/cdf_mode_${mode}.webm" + aomenc $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --cdf-update-mode=${mode} \ + --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + done + fi +} + +aomenc_tests="aomenc_av1_ivf + aomenc_av1_obu_annexb + aomenc_av1_obu_section5 + aomenc_av1_webm + aomenc_av1_webm_1pass + aomenc_av1_ivf_lossless + aomenc_av1_ivf_minq0_maxq0 + aomenc_av1_webm_lag5_frames10 + aomenc_av1_webm_non_square_par + aomenc_av1_webm_cdf_update_mode" + +run_tests aomenc_verify_environment "${aomenc_tests}" diff --git a/libs/libaom/src/test/aq_segment_test.cc b/libs/libaom/src/test/aq_segment_test.cc new file mode 100644 index 000000000..83bfdb670 --- /dev/null +++ b/libs/libaom/src/test/aq_segment_test.cc @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "config/aom_config.h" + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +class AqSegmentTest + : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int, int>, + public ::libaom_test::EncoderTest { + protected: + AqSegmentTest() : EncoderTest(GET_PARAM(0)) {} + virtual ~AqSegmentTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + aq_mode_ = 0; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); + encoder->Control(AV1E_SET_AQ_MODE, aq_mode_); + encoder->Control(AV1E_SET_DELTAQ_MODE, deltaq_mode_); + encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100); + } + } + + void DoTest(int aq_mode) { + aq_mode_ = aq_mode; + deltaq_mode_ = 0; + cfg_.kf_max_dist = 12; + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 6; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_target_bitrate = 300; + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 15); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + int set_cpu_used_; + int aq_mode_; + int deltaq_mode_; +}; + +// Validate that this AQ segmentation mode (1-variance_aq, 2-complexity_aq, +// 3-cyclic_refresh_aq) encodes and decodes without a mismatch. +TEST_P(AqSegmentTest, TestNoMisMatch) { DoTest(GET_PARAM(3)); } + +class AqSegmentTestLarge : public AqSegmentTest {}; + +TEST_P(AqSegmentTestLarge, TestNoMisMatch) { DoTest(GET_PARAM(3)); } + +// Validate that this delta q mode encodes and decodes without a mismatch. +TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) { + cfg_.rc_end_usage = AOM_CQ; + aq_mode_ = 0; + deltaq_mode_ = 2; + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 15); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +AV1_INSTANTIATE_TEST_CASE(AqSegmentTest, + ::testing::Values(::libaom_test::kRealTime, + ::libaom_test::kOnePassGood), + ::testing::Range(5, 9), ::testing::Range(0, 4)); +AV1_INSTANTIATE_TEST_CASE(AqSegmentTestLarge, + ::testing::Values(::libaom_test::kRealTime, + ::libaom_test::kOnePassGood), + ::testing::Range(3, 5), ::testing::Range(0, 4)); +} // namespace diff --git a/libs/libaom/src/test/arf_freq_test.cc b/libs/libaom/src/test/arf_freq_test.cc new file mode 100644 index 000000000..0780cd712 --- /dev/null +++ b/libs/libaom/src/test/arf_freq_test.cc @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <memory> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" +#include "av1/encoder/ratectrl.h" + +namespace { + +const unsigned int kFrames = 100; +const int kBitrate = 500; + +#define ARF_NOT_SEEN 1000001 +#define ARF_SEEN_ONCE 1000000 + +typedef struct { + const char *filename; + unsigned int width; + unsigned int height; + unsigned int framerate_num; + unsigned int framerate_den; + unsigned int input_bit_depth; + aom_img_fmt fmt; + aom_bit_depth_t bit_depth; + unsigned int profile; +} TestVideoParam; + +typedef struct { + libaom_test::TestMode mode; + int cpu_used; +} TestEncodeParam; + +const TestVideoParam kTestVectors[] = { + // artificially increase framerate to trigger default check + { "hantro_collage_w352h288.yuv", 352, 288, 5000, 1, 8, AOM_IMG_FMT_I420, + AOM_BITS_8, 0 }, + { "hantro_collage_w352h288.yuv", 352, 288, 30, 1, 8, AOM_IMG_FMT_I420, + AOM_BITS_8, 0 }, + { "rush_hour_444.y4m", 352, 288, 30, 1, 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 }, + // Add list of profile 2/3 test videos here ... +}; + +const TestEncodeParam kEncodeVectors[] = { + { ::libaom_test::kOnePassGood, 2 }, { ::libaom_test::kOnePassGood, 5 }, + { ::libaom_test::kTwoPassGood, 1 }, { ::libaom_test::kTwoPassGood, 2 }, + { ::libaom_test::kTwoPassGood, 5 }, { ::libaom_test::kRealTime, 5 }, +}; + +const int kMinArfVectors[] = { + // NOTE: 0 refers to the default built-in logic in: + // av1_rc_get_default_min_gf_interval(...) + 0, 4, 8, 12, 15 +}; + +int is_extension_y4m(const char *filename) { + const char *dot = strrchr(filename, '.'); + if (!dot || dot == filename) + return 0; + else + return !strcmp(dot, ".y4m"); +} + +class ArfFreqTestLarge + : public ::libaom_test::CodecTestWith3Params<TestVideoParam, TestEncodeParam, int>, + public ::libaom_test::EncoderTest { + protected: + ArfFreqTestLarge() + : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), + test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {} + + virtual ~ArfFreqTestLarge() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(test_encode_param_.mode); + if (test_encode_param_.mode != ::libaom_test::kRealTime) { + cfg_.g_lag_in_frames = 25; + cfg_.rc_end_usage = AOM_VBR; + } else { + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = AOM_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + } + } + + virtual void BeginPassHook(unsigned int) { + min_run_ = ARF_NOT_SEEN; + run_of_visible_frames_ = 0; + } + + int GetNumFramesInPkt(const aom_codec_cx_pkt_t *pkt) { + const uint8_t *buffer = reinterpret_cast<const uint8_t *>(pkt->data.frame.buf); + const uint8_t marker = buffer[pkt->data.frame.sz - 1]; + const int mag = ((marker >> 3) & 3) + 1; + int frames = (marker & 0x7) + 1; + const unsigned int index_sz = 2 + mag * frames; + // Check for superframe or not. + // Assume superframe has only one visible frame, the rest being + // invisible. If superframe index is not found, then there is only + // one frame.
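+ // In a superframe index, the top three bits of the marker byte are + // 0b110, bits 4-3 hold mag - 1 (the byte count of each frame size field, + // as computed above), and bits 2-0 hold frames - 1. The index is + // bracketed by a copy of the marker byte at each end, which is why the + // check below looks for the same marker at both data.frame.sz - 1 and + // data.frame.sz - index_sz.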
+ if (!((marker & 0xe0) == 0xc0 && pkt->data.frame.sz >= index_sz && + buffer[pkt->data.frame.sz - index_sz] == marker)) { + frames = 1; + } + return frames; + } + + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return; + const int frames = GetNumFramesInPkt(pkt); + if (frames == 1) { + run_of_visible_frames_++; + } else if (frames == 2) { + if (min_run_ == ARF_NOT_SEEN) { + min_run_ = ARF_SEEN_ONCE; + } else if (min_run_ == ARF_SEEN_ONCE || + run_of_visible_frames_ < min_run_) { + min_run_ = run_of_visible_frames_; + } + run_of_visible_frames_ = 1; + } else { + min_run_ = 0; + run_of_visible_frames_ = 1; + } + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AV1E_SET_TILE_COLUMNS, 4); + encoder->Control(AOME_SET_CPUUSED, test_encode_param_.cpu_used); + encoder->Control(AV1E_SET_MIN_GF_INTERVAL, min_arf_requested_); + if (test_encode_param_.mode != ::libaom_test::kRealTime) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + } + } + + int GetMinVisibleRun() const { return min_run_; } + + int GetMinArfDistanceRequested() const { + if (min_arf_requested_) + return min_arf_requested_; + else + return av1_rc_get_default_min_gf_interval( + test_video_param_.width, test_video_param_.height, + (double)test_video_param_.framerate_num / + test_video_param_.framerate_den); + } + + TestVideoParam test_video_param_; + TestEncodeParam test_encode_param_; + + private: + int min_arf_requested_; + int min_run_; + int run_of_visible_frames_; +}; + +TEST_P(ArfFreqTestLarge, MinArfFreqTest) { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = AOM_CODEC_USE_PSNR; + if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr<libaom_test::VideoSource> video; + if (is_extension_y4m(test_video_param_.filename)) { + video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + } else { + video.reset(new libaom_test::YUVVideoSource( + test_video_param_.filename, test_video_param_.fmt, + test_video_param_.width, test_video_param_.height, + test_video_param_.framerate_num, test_video_param_.framerate_den, 0, + kFrames)); + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const int min_run = GetMinVisibleRun(); + const int min_arf_dist_requested = GetMinArfDistanceRequested(); + if (min_run != ARF_NOT_SEEN && min_run != ARF_SEEN_ONCE) { + const int min_arf_dist = min_run + 1; + EXPECT_GE(min_arf_dist, min_arf_dist_requested); + } +} + +#if CONFIG_AV1_ENCODER +// TODO(angiebird): 25-29 fail in high bitdepth mode. +// TODO(zoeliu): This ArfFreqTest does not work with BWDREF_FRAME, as +// BWDREF_FRAME is also a non-show frame, and the minimum run between two +// consecutive BWDREF_FRAME's may vary between 1 and any arbitrary positive +// number as long as it does not exceed the gf_group interval.
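+// Note: the DISABLED_ prefix on the instantiation below keeps these tests +// compiled but skipped by default; gtest runs them only when invoked with +// --gtest_also_run_disabled_tests.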
+INSTANTIATE_TEST_SUITE_P( + DISABLED_AV1, ArfFreqTestLarge, + ::testing::Combine( + ::testing::Values( + static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)), + ::testing::ValuesIn(kTestVectors), ::testing::ValuesIn(kEncodeVectors), + ::testing::ValuesIn(kMinArfVectors))); +#endif // CONFIG_AV1_ENCODER +} // namespace diff --git a/libs/libaom/src/test/av1_common_int_test.cc b/libs/libaom/src/test/av1_common_int_test.cc new file mode 100644 index 000000000..dde2542e3 --- /dev/null +++ b/libs/libaom/src/test/av1_common_int_test.cc @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "av1/common/av1_common_int.h" + +TEST(AV1CommonInt, TestGetTxSize) { + for (int t = TX_4X4; t < TX_SIZES_ALL; t++) { + TX_SIZE t2 = get_tx_size(tx_size_wide[t], tx_size_high[t]); + GTEST_ASSERT_EQ(tx_size_wide[t], tx_size_wide[t2]); + GTEST_ASSERT_EQ(tx_size_high[t], tx_size_high[t2]); + } +} diff --git a/libs/libaom/src/test/av1_config_test.cc b/libs/libaom/src/test/av1_config_test.cc new file mode 100644 index 000000000..fca980f06 --- /dev/null +++ b/libs/libaom/src/test/av1_config_test.cc @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <string.h> + +#include "common/av1_config.h" +#include "test/util.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +// +// Input buffers containing exactly one Sequence Header OBU. +// +// Each buffer is named according to the OBU storage format (Annex-B vs Low +// Overhead Bitstream Format) and the type of Sequence Header OBU ("Full" +// Sequence Header OBUs vs Sequence Header OBUs with the +// reduced_still_image_flag set). +// +const uint8_t kAnnexBFullSequenceHeaderObu[] = { 0x0c, 0x08, 0x00, 0x00, 0x00, + 0x04, 0x45, 0x7e, 0x3e, 0xff, + 0xfc, 0xc0, 0x20 }; +const uint8_t kAnnexBReducedStillImageSequenceHeaderObu[] = { + 0x08, 0x08, 0x18, 0x22, 0x2b, 0xf1, 0xfe, 0xc0, 0x20 +}; + +const uint8_t kLobfFullSequenceHeaderObu[] = { 0x0a, 0x0b, 0x00, 0x00, 0x00, + 0x04, 0x45, 0x7e, 0x3e, 0xff, + 0xfc, 0xc0, 0x20 }; + +const uint8_t kLobfReducedStillImageSequenceHeaderObu[] = { 0x0a, 0x07, 0x18, + 0x22, 0x2b, 0xf1, + 0xfe, 0xc0, 0x20 }; + +const uint8_t kAv1cAllZero[] = { 0, 0, 0, 0 }; + +// The size of AV1 config when no configOBUs are present at the end of the +// configuration structure.
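+// (For reference: those four bytes follow the av1C record layout from the +// AV1-ISOBMFF binding, roughly marker(1)/version(7), then +// seq_profile(3)/seq_level_idx_0(5), then the tier, bit-depth, monochrome and +// chroma subsampling flags, then the initial_presentation_delay fields -- the +// same fields VerifyAv1c() checks below.)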
+const size_t kAv1cNoConfigObusSize = 4; + +bool VerifyAv1c(const uint8_t *const obu_buffer, size_t obu_buffer_length, + bool is_annexb) { + Av1Config av1_config; + memset(&av1_config, 0, sizeof(av1_config)); + bool parse_ok = get_av1config_from_obu(obu_buffer, obu_buffer_length, + is_annexb, &av1_config) == 0; + if (parse_ok) { + EXPECT_EQ(1, av1_config.marker); + EXPECT_EQ(1, av1_config.version); + EXPECT_EQ(0, av1_config.seq_profile); + EXPECT_EQ(0, av1_config.seq_level_idx_0); + EXPECT_EQ(0, av1_config.seq_tier_0); + EXPECT_EQ(0, av1_config.high_bitdepth); + EXPECT_EQ(0, av1_config.twelve_bit); + EXPECT_EQ(0, av1_config.monochrome); + EXPECT_EQ(1, av1_config.chroma_subsampling_x); + EXPECT_EQ(1, av1_config.chroma_subsampling_y); + EXPECT_EQ(0, av1_config.chroma_sample_position); + EXPECT_EQ(0, av1_config.initial_presentation_delay_present); + EXPECT_EQ(0, av1_config.initial_presentation_delay_minus_one); + } + return parse_ok && ::testing::Test::HasFailure() == false; +} + +TEST(Av1Config, ObuInvalidInputs) { + Av1Config av1_config; + memset(&av1_config, 0, sizeof(av1_config)); + ASSERT_EQ(-1, get_av1config_from_obu(NULL, 0, 0, NULL)); + ASSERT_EQ(-1, + get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0], 0, 0, NULL)); + ASSERT_EQ( + -1, get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0], + sizeof(kLobfFullSequenceHeaderObu), 0, NULL)); + ASSERT_EQ(-1, get_av1config_from_obu(NULL, sizeof(kLobfFullSequenceHeaderObu), + 0, NULL)); + ASSERT_EQ(-1, get_av1config_from_obu(&kLobfFullSequenceHeaderObu[0], 0, 0, + &av1_config)); +} + +TEST(Av1Config, ReadInvalidInputs) { + Av1Config av1_config; + memset(&av1_config, 0, sizeof(av1_config)); + size_t bytes_read = 0; + ASSERT_EQ(-1, read_av1config(NULL, 0, NULL, NULL)); + ASSERT_EQ(-1, read_av1config(NULL, 4, NULL, NULL)); + ASSERT_EQ(-1, read_av1config(&kAv1cAllZero[0], 0, NULL, NULL)); + ASSERT_EQ(-1, read_av1config(&kAv1cAllZero[0], 4, &bytes_read, NULL)); + ASSERT_EQ(-1, read_av1config(NULL, 4, &bytes_read, &av1_config)); +} + +TEST(Av1Config, WriteInvalidInputs) { + Av1Config av1_config; + memset(&av1_config, 0, sizeof(av1_config)); + size_t bytes_written = 0; + uint8_t av1c_buffer[4] = { 0 }; + ASSERT_EQ(-1, write_av1config(NULL, 0, NULL, NULL)); + ASSERT_EQ(-1, write_av1config(&av1_config, 0, NULL, NULL)); + ASSERT_EQ(-1, write_av1config(&av1_config, 0, &bytes_written, NULL)); + + ASSERT_EQ(-1, + write_av1config(&av1_config, 0, &bytes_written, &av1c_buffer[0])); + ASSERT_EQ(-1, write_av1config(&av1_config, 4, &bytes_written, NULL)); +} + +TEST(Av1Config, GetAv1ConfigFromLobfObu) { + // Test parsing of a Sequence Header OBU with the reduced_still_picture_header + // unset-- aka a full Sequence Header OBU. + ASSERT_TRUE(VerifyAv1c(kLobfFullSequenceHeaderObu, + sizeof(kLobfFullSequenceHeaderObu), false)); + + // Test parsing of a reduced still image Sequence Header OBU. + ASSERT_TRUE(VerifyAv1c(kLobfReducedStillImageSequenceHeaderObu, + sizeof(kLobfReducedStillImageSequenceHeaderObu), + false)); +} + +TEST(Av1Config, GetAv1ConfigFromAnnexBObu) { + // Test parsing of a Sequence Header OBU with the reduced_still_picture_header + // unset-- aka a full Sequence Header OBU. + ASSERT_TRUE(VerifyAv1c(kAnnexBFullSequenceHeaderObu, + sizeof(kAnnexBFullSequenceHeaderObu), true)); + + // Test parsing of a reduced still image Sequence Header OBU. 
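+ // (Annex B storage wraps each OBU in explicit length fields instead of + // relying on per-OBU size flags, which is why is_annexb is true for these + // buffers.)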
+ ASSERT_TRUE(VerifyAv1c(kAnnexBReducedStillImageSequenceHeaderObu, + sizeof(kAnnexBReducedStillImageSequenceHeaderObu), + true)); +} + +TEST(Av1Config, ReadWriteConfig) { + Av1Config av1_config; + memset(&av1_config, 0, sizeof(av1_config)); + + // Test writing out the AV1 config. + size_t bytes_written = 0; + uint8_t av1c_buffer[4] = { 0 }; + ASSERT_EQ(0, write_av1config(&av1_config, sizeof(av1c_buffer), &bytes_written, + &av1c_buffer[0])); + ASSERT_EQ(kAv1cNoConfigObusSize, bytes_written); + for (size_t i = 0; i < kAv1cNoConfigObusSize; ++i) { + ASSERT_EQ(kAv1cAllZero[i], av1c_buffer[i]) + << "Mismatch in output Av1Config at offset=" << i; + } + + // Test reading the AV1 config. + size_t bytes_read = 0; + ASSERT_EQ(0, read_av1config(&kAv1cAllZero[0], sizeof(kAv1cAllZero), + &bytes_read, &av1_config)); + ASSERT_EQ(kAv1cNoConfigObusSize, bytes_read); + ASSERT_EQ(0, write_av1config(&av1_config, sizeof(av1c_buffer), &bytes_written, + &av1c_buffer[0])); + for (size_t i = 0; i < kAv1cNoConfigObusSize; ++i) { + ASSERT_EQ(kAv1cAllZero[i], av1c_buffer[i]) + << "Mismatch in output Av1Config at offset=" << i; + } +} + +} // namespace diff --git a/libs/libaom/src/test/av1_convolve_2d_test.cc b/libs/libaom/src/test/av1_convolve_2d_test.cc new file mode 100644 index 000000000..50a58f06d --- /dev/null +++ b/libs/libaom/src/test/av1_convolve_2d_test.cc @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/av1_convolve_2d_test_util.h" + +using libaom_test::ACMRandom; +using libaom_test::AV1Convolve2D::AV1Convolve2DSrTest; +using libaom_test::AV1Convolve2D::AV1JntConvolve2DTest; +#if CONFIG_AV1_HIGHBITDEPTH +using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DSrTest; +using libaom_test::AV1HighbdConvolve2D::AV1HighbdJntConvolve2DTest; +#endif +using std::make_tuple; +using std::tuple; + +namespace { + +TEST_P(AV1Convolve2DSrTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } + +TEST_P(AV1Convolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } + +INSTANTIATE_TEST_SUITE_P( + C_COPY, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_copy_sr_c, 0, 0)); +INSTANTIATE_TEST_SUITE_P( + C_X, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_c, 1, 0)); +INSTANTIATE_TEST_SUITE_P( + C_Y, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_c, 0, 1)); +INSTANTIATE_TEST_SUITE_P( + C, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_c, 1, 1)); +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_convolve_2d_copy_sr_sse2, 0, 0)); +INSTANTIATE_TEST_SUITE_P( + SSE2_X, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_sse2, 1, 0)); +INSTANTIATE_TEST_SUITE_P( + SSE2_Y, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_sse2, 0, 1)); +INSTANTIATE_TEST_SUITE_P( + SSE2, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_sse2, 1, 1)); +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_convolve_2d_copy_sr_avx2, 0, 0)); +INSTANTIATE_TEST_SUITE_P( + AVX2_X, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_avx2, 1, 0)); + +INSTANTIATE_TEST_SUITE_P( + AVX2_Y, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_avx2, 0, 1)); + +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_avx2, 1, 1)); +#endif // HAVE_AVX2 +#endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON_X, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_x_sr_neon, 1, 0)); + +INSTANTIATE_TEST_SUITE_P( + NEON_Y, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_y_sr_neon, 0, 1)); + +INSTANTIATE_TEST_SUITE_P( + NEON, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams(av1_convolve_2d_sr_neon, 1, 1)); + +INSTANTIATE_TEST_SUITE_P(NEON_COPY, AV1Convolve2DSrTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_convolve_2d_copy_sr_neon, 0, 0)); +#endif // HAVE_NEON + +TEST_P(AV1JntConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } +TEST_P(AV1JntConvolve2DTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } + +INSTANTIATE_TEST_SUITE_P(C_COPY, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_copy_c, 0, 0)); + +INSTANTIATE_TEST_SUITE_P( + C_X, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_x_c, 1, 0)); + +INSTANTIATE_TEST_SUITE_P( + C_Y, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_y_c, 0, 1)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2_COPY,
AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_copy_sse2, 0, 0)); +INSTANTIATE_TEST_SUITE_P(SSE2, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_sse2, 1, 1)); + +INSTANTIATE_TEST_SUITE_P(SSE2_X, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_x_sse2, 1, 0)); + +INSTANTIATE_TEST_SUITE_P(SSE2_Y, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_y_sse2, 0, 1)); + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_ssse3, 1, 1)); + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_copy_avx2, 0, 0)); +INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_x_avx2, 1, 0)); + +INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_y_avx2, 0, 1)); + +INSTANTIATE_TEST_SUITE_P(AVX2, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_avx2, 1, 1)); +#endif // HAVE_AVX2 +#endif // HAVE_SSSE3 +#endif // HAVE_SSE2 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON_COPY, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_copy_neon, 0, 0)); + +INSTANTIATE_TEST_SUITE_P(NEON, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_neon, 1, 1)); +INSTANTIATE_TEST_SUITE_P(NEON_X, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_x_neon, 1, 0)); + +INSTANTIATE_TEST_SUITE_P(NEON_Y, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_y_neon, 0, 1)); +#endif // HAVE_NEON + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(AV1HighbdConvolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); } +TEST_P(AV1HighbdConvolve2DSrTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(1)); +} + +INSTANTIATE_TEST_SUITE_P(C_X, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_x_sr_c, 1, 0)); + +INSTANTIATE_TEST_SUITE_P(C_Y, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_y_sr_c, 0, 1)); + +INSTANTIATE_TEST_SUITE_P(C_COPY, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_2d_copy_sr_c, 0, 0)); +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2_COPY, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_2d_copy_sr_sse2, 0, 0)); +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_2d_sr_ssse3, 1, 1)); +INSTANTIATE_TEST_SUITE_P(SSSE3_X, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_x_sr_ssse3, 1, 0)); +INSTANTIATE_TEST_SUITE_P(SSSE3_Y, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_y_sr_ssse3, 0, 1)); +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_2d_sr_avx2, 1, 1)); +INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_x_sr_avx2, 1, 
0)); +INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_y_sr_avx2, 0, 1)); +INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1HighbdConvolve2DSrTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_convolve_2d_copy_sr_avx2, 0, 0)); +#endif // HAVE_AVX2 +#endif // HAVE_SSSE3 +#endif // HAVE_SSE2 +TEST_P(AV1HighbdJntConvolve2DTest, CheckOutput) { + RunCheckOutput(GET_PARAM(1)); +} + +TEST_P(AV1HighbdJntConvolve2DTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(1)); +} + +INSTANTIATE_TEST_SUITE_P(C_X, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_x_c, 1, 0)); + +INSTANTIATE_TEST_SUITE_P(C_Y, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_y_c, 0, 1)); + +INSTANTIATE_TEST_SUITE_P(C_COPY, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_2d_copy_c, 0, 0)); +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1_COPY, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_2d_copy_sse4_1, 0, + 0)); +INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_2d_sse4_1, 1, 1)); +INSTANTIATE_TEST_SUITE_P(SSE4_1_X, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_x_sse4_1, 1, 0)); +INSTANTIATE_TEST_SUITE_P(SSE4_1_Y, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_y_sse4_1, 0, 1)); +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2_COPY, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_2d_copy_avx2, 0, 0)); +INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_2d_avx2, 1, 1)); +INSTANTIATE_TEST_SUITE_P(AVX2_X, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_x_avx2, 1, 0)); +INSTANTIATE_TEST_SUITE_P(AVX2_Y, AV1HighbdJntConvolve2DTest, + libaom_test::AV1HighbdConvolve2D::BuildParams( + av1_highbd_dist_wtd_convolve_y_avx2, 0, 1)); +#endif // HAVE_AVX2 +#endif // HAVE_SSE4_1 +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/av1_convolve_2d_test_util.cc b/libs/libaom/src/test/av1_convolve_2d_test_util.cc new file mode 100644 index 000000000..6f103d3f6 --- /dev/null +++ b/libs/libaom/src/test/av1_convolve_2d_test_util.cc @@ -0,0 +1,708 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "test/av1_convolve_2d_test_util.h" + +#include "aom_ports/aom_timer.h" +#include "av1/common/common_data.h" +#include "av1/common/convolve.h" + +using std::make_tuple; +using std::tuple; + +namespace libaom_test { + +const int kMaxSize = 128 + 32; // padding +namespace AV1Convolve2D { + +::testing::internal::ParamGenerator<Convolve2DParam> BuildParams( + convolve_2d_func filter, int has_subx, int has_suby) { + return ::testing::Combine(::testing::Values(filter), + ::testing::Values(has_subx), + ::testing::Values(has_suby), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} + +AV1Convolve2DSrTest::~AV1Convolve2DSrTest() {} +void AV1Convolve2DSrTest::SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); +} + +void AV1Convolve2DSrTest::TearDown() { libaom_test::ClearSystemState(); } + +void AV1Convolve2DSrTest::RunCheckOutput(convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int has_subx = GET_PARAM(1); + const int has_suby = GET_PARAM(2); + const int block_idx = GET_PARAM(3); + int hfilter, vfilter, subx, suby; + uint8_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, uint8_t, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, output2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8(); + for (int i = 0; i < MAX_SB_SQUARE; ++i) + output[i] = output2[i] = static_cast<uint8_t>(rnd_.Rand31()); + + // Make sure that sizes 2xN and Nx2 are also tested for chroma. + const int num_sizes = + (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2 + : 1; + for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma + const int out_w = block_size_wide[block_idx] >> shift; + const int out_h = block_size_high[block_idx] >> shift; + for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) { + for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; + ++vfilter) { + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + for (int do_average = 0; do_average < 1; ++do_average) { + ConvolveParams conv_params1 = + get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8); + ConvolveParams conv_params2 = + get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8); + + const int subx_range = has_subx ? 16 : 1; + const int suby_range = has_suby ?
16 : 1; + for (subx = 0; subx < subx_range; ++subx) { + for (suby = 0; suby < suby_range; ++suby) { + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_convolve_2d_sr_c(input + offset_r * w + offset_c, w, output, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params1); + test_impl(input + offset_r * w + offset_c, w, output2, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2); + + if (memcmp(output, output2, sizeof(output))) { + for (int i = 0; i < MAX_SB_SIZE; ++i) { + for (int j = 0; j < MAX_SB_SIZE; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output[idx], output2[idx]) + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx + << ")"; + } + } + } + } + } + } + } + } + } +} + +void AV1Convolve2DSrTest::RunSpeedTest(convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int has_subx = GET_PARAM(1); + const int has_suby = GET_PARAM(2); + const int block_idx = GET_PARAM(3); + + uint8_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, uint8_t, output[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8(); + + int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR; + int subx = 0, suby = 0; + + const int do_average = 0; + ConvolveParams conv_params2 = + get_conv_params_no_round(do_average, 0, NULL, 0, 0, 8); + + // Make sure that sizes 2xN and Nx2 are also tested for chroma. + const int num_sizes = + (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 
2 + : 1; + for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma + const int out_w = block_size_wide[block_idx] >> shift; + const int out_h = block_size_high[block_idx] >> shift; + const int num_loops = 1000000000 / (out_w + out_h); + + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + test_impl(input, w, output, MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w, + out_h, 1000.0 * elapsed_time / num_loops); + } +} + +AV1JntConvolve2DTest::~AV1JntConvolve2DTest() {} +void AV1JntConvolve2DTest::SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); +} + +void AV1JntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); } + +void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int has_subx = GET_PARAM(1); + const int has_suby = GET_PARAM(2); + const int block_idx = GET_PARAM(3); + int hfilter, vfilter, subx, suby; + uint8_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, CONV_BUF_TYPE, output1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, CONV_BUF_TYPE, output2[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, output8_1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, output8_2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8(); + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + output1[i] = output2[i] = rnd_.Rand16(); + output8_1[i] = output8_2[i] = rnd_.Rand8(); + } + + const int out_w = block_size_wide[block_idx]; + const int out_h = block_size_high[block_idx]; + for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) { + for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) { + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + for (int do_average = 0; do_average <= 1; ++do_average) { + ConvolveParams conv_params1 = + get_conv_params_no_round(do_average, 0, output1, MAX_SB_SIZE, 1, 8); + ConvolveParams conv_params2 = + get_conv_params_no_round(do_average, 0, output2, MAX_SB_SIZE, 1, 8); + + // Test special case where dist_wtd_comp_avg is not used + conv_params1.use_dist_wtd_comp_avg = 0; + conv_params2.use_dist_wtd_comp_avg = 0; + + const int subx_range = has_subx ? 16 : 1; + const int suby_range = has_suby ?
16 : 1; + for (subx = 0; subx < subx_range; ++subx) { + for (suby = 0; suby < suby_range; ++suby) { + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w, + output8_1, MAX_SB_SIZE, out_w, out_h, + filter_params_x, filter_params_y, subx, + suby, &conv_params1); + test_impl(input + offset_r * w + offset_c, w, output8_2, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2); + + for (int i = 0; i < out_h; ++i) { + for (int j = 0; j < out_w; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output1[idx], output2[idx]) + << "Mismatch at unit tests for av1_dist_wtd_convolve_2d\n" + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx << ")"; + } + } + + if (memcmp(output8_1, output8_2, sizeof(output8_1))) { + for (int i = 0; i < MAX_SB_SIZE; ++i) { + for (int j = 0; j < MAX_SB_SIZE; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output8_1[idx], output8_2[idx]) + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx + << ")"; + } + } + } + } + } + + // Test different combination of fwd and bck offset weights + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 4; ++l) { + conv_params1.use_dist_wtd_comp_avg = 1; + conv_params2.use_dist_wtd_comp_avg = 1; + conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0]; + conv_params1.bck_offset = quant_dist_lookup_table[k][l][1]; + conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0]; + conv_params2.bck_offset = quant_dist_lookup_table[k][l][1]; + + for (subx = 0; subx < subx_range; ++subx) { + for (suby = 0; suby < suby_range; ++suby) { + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w, + output8_1, MAX_SB_SIZE, out_w, out_h, + filter_params_x, filter_params_y, + subx, suby, &conv_params1); + test_impl(input + offset_r * w + offset_c, w, output8_2, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2); + + for (int i = 0; i < out_h; ++i) { + for (int j = 0; j < out_w; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output1[idx], output2[idx]) + << "Mismatch at unit tests for " + "av1_dist_wtd_convolve_2d\n" + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx + << ")"; + } + } + if (memcmp(output8_1, output8_2, sizeof(output8_1))) { + for (int i = 0; i < MAX_SB_SIZE; ++i) { + for (int j = 0; j < MAX_SB_SIZE; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output8_1[idx], output8_2[idx]) + << out_w << "x" << out_h + << " Pixel mismatch at index " << idx << " = (" << i + << ", " << j << "), sub pixel offset = (" << suby + << ", " << subx << ")"; + } + } + } + } + } + } + } + } + } + } +} + +void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int has_subx = GET_PARAM(1); + const int has_suby = GET_PARAM(2); + const int block_idx = GET_PARAM(3); + + int subx = 0, suby = 0; + uint8_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, 
CONV_BUF_TYPE, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, output8[MAX_SB_SQUARE]); + int hfilter = EIGHTTAP_REGULAR, vfilter = EIGHTTAP_REGULAR; + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8(); + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + output[i] = rnd_.Rand16(); + output8[i] = rnd_.Rand8(); + } + + const int out_w = block_size_wide[block_idx]; + const int out_h = block_size_high[block_idx]; + const int num_loops = 1000000000 / (out_w + out_h); + const int do_average = 0; + + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + + ConvolveParams conv_params = + get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, 8); + + conv_params.use_dist_wtd_comp_avg = 0; + + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + test_impl(input + offset_r * w + offset_c, w, output8, MAX_SB_SIZE, out_w, + out_h, filter_params_x, filter_params_y, subx, suby, + &conv_params); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w, + out_h, 1000.0 * elapsed_time / num_loops); +} +} // namespace AV1Convolve2D + +#if CONFIG_AV1_HIGHBITDEPTH +namespace AV1HighbdConvolve2D { +::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams( + highbd_convolve_2d_func filter, int has_subx, int has_suby) { + return ::testing::Combine( + ::testing::Range(8, 13, 2), ::testing::Values(filter), + ::testing::Values(has_subx), ::testing::Values(has_suby), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} + +AV1HighbdConvolve2DSrTest::~AV1HighbdConvolve2DSrTest() {} +void AV1HighbdConvolve2DSrTest::SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); +} + +void AV1HighbdConvolve2DSrTest::TearDown() { libaom_test::ClearSystemState(); } + +void AV1HighbdConvolve2DSrTest::RunSpeedTest( + highbd_convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int bd = GET_PARAM(0); + const int has_subx = GET_PARAM(2); + const int has_suby = GET_PARAM(3); + const int block_idx = GET_PARAM(4); + int hfilter, vfilter, subx, suby; + uint16_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, uint16_t, output[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) + input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + + hfilter = EIGHTTAP_REGULAR; + vfilter = EIGHTTAP_REGULAR; + int do_average = 0; + + const int offset_r = 3; + const int offset_c = 3; + subx = 0; + suby = 0; + + ConvolveParams conv_params = + get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd); + + // Make sure that sizes 2xN and Nx2 are also tested for chroma. + const int num_sizes = + (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ?
2 + : 1; + + for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma + const int out_w = block_size_wide[block_idx] >> shift; + const int out_h = block_size_high[block_idx] >> shift; + const int num_loops = 1000000000 / (out_w + out_h); + + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < num_loops; ++i) + test_impl(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE, out_w, + out_h, filter_params_x, filter_params_y, subx, suby, + &conv_params, bd); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("%d,%d convolve %3dx%-3d: %7.2f us\n", has_subx, has_suby, out_w, + out_h, 1000.0 * elapsed_time / num_loops); + } +} + +void AV1HighbdConvolve2DSrTest::RunCheckOutput( + highbd_convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int bd = GET_PARAM(0); + const int has_subx = GET_PARAM(2); + const int has_suby = GET_PARAM(3); + const int block_idx = GET_PARAM(4); + int hfilter, vfilter, subx, suby; + uint16_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, uint16_t, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, output2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) + input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + for (int i = 0; i < MAX_SB_SQUARE; ++i) + output[i] = output2[i] = static_cast<uint16_t>(rnd_.Rand31()); + + // Make sure that sizes 2xN and Nx2 are also tested for chroma. + const int num_sizes = + (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4) ? 2 + : 1; + for (int shift = 0; shift < num_sizes; ++shift) { // luma and chroma + const int out_w = block_size_wide[block_idx] >> shift; + const int out_h = block_size_high[block_idx] >> shift; + for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) { + for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; + ++vfilter) { + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + for (int do_average = 0; do_average < 1; ++do_average) { + ConvolveParams conv_params1 = + get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd); + ConvolveParams conv_params2 = + get_conv_params_no_round(do_average, 0, NULL, 0, 0, bd); + + const int subx_range = has_subx ? 16 : 1; + const int suby_range = has_suby ?
16 : 1; + for (subx = 0; subx < subx_range; ++subx) { + for (suby = 0; suby < suby_range; ++suby) { + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_highbd_convolve_2d_sr_c(input + offset_r * w + offset_c, w, + output, MAX_SB_SIZE, out_w, out_h, + filter_params_x, filter_params_y, + subx, suby, &conv_params1, bd); + test_impl(input + offset_r * w + offset_c, w, output2, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2, bd); + + if (memcmp(output, output2, sizeof(output))) { + for (int i = 0; i < MAX_SB_SIZE; ++i) { + for (int j = 0; j < MAX_SB_SIZE; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output[idx], output2[idx]) + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx + << ")"; + } + } + } + } + } + } + } + } + } +} + +AV1HighbdJntConvolve2DTest::~AV1HighbdJntConvolve2DTest() {} +void AV1HighbdJntConvolve2DTest::SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); +} + +void AV1HighbdJntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); } + +void AV1HighbdJntConvolve2DTest::RunSpeedTest( + highbd_convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int bd = GET_PARAM(0); + const int block_idx = GET_PARAM(4); + int hfilter, vfilter, subx, suby; + uint16_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, output16[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) + input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + for (int i = 0; i < MAX_SB_SQUARE; ++i) output[i] = rnd_.Rand16(); + hfilter = EIGHTTAP_REGULAR; + vfilter = EIGHTTAP_REGULAR; + int do_average = 0; + const int out_w = block_size_wide[block_idx]; + const int out_h = block_size_high[block_idx]; + + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + + ConvolveParams conv_params = + get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, bd); + + // Test special case where dist_wtd_comp_avg is not used + conv_params.use_dist_wtd_comp_avg = 0; + + subx = 0; + suby = 0; + // Use fixed offsets into the source block for the speed test. + const int offset_r = 3; + const int offset_c = 3; + + const int num_loops = 1000000000 / (out_w + out_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < num_loops; ++i) + test_impl(input + offset_r * w + offset_c, w, output16, MAX_SB_SIZE, out_w, + out_h, filter_params_x, filter_params_y, subx, suby, &conv_params, + bd); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("convolve %3dx%-3d: %7.2f us\n", out_w, out_h, + 1000.0 * elapsed_time / num_loops); +} + +void AV1HighbdJntConvolve2DTest::RunCheckOutput( + highbd_convolve_2d_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int bd = GET_PARAM(0); + const int has_subx = GET_PARAM(2); + const int has_suby = GET_PARAM(3); + const int block_idx = GET_PARAM(4); + int hfilter, vfilter, subx, suby; + uint16_t input[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(32, CONV_BUF_TYPE, output1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, CONV_BUF_TYPE,
output2[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, output16_1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, output16_2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) + input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + output1[i] = output2[i] = rnd_.Rand16(); + output16_1[i] = output16_2[i] = rnd_.Rand16(); + } + + const int out_w = block_size_wide[block_idx]; + const int out_h = block_size_high[block_idx]; + for (hfilter = EIGHTTAP_REGULAR; hfilter < INTERP_FILTERS_ALL; ++hfilter) { + for (vfilter = EIGHTTAP_REGULAR; vfilter < INTERP_FILTERS_ALL; ++vfilter) { + const InterpFilterParams *filter_params_x = + av1_get_interp_filter_params_with_block_size((InterpFilter)hfilter, + out_w); + const InterpFilterParams *filter_params_y = + av1_get_interp_filter_params_with_block_size((InterpFilter)vfilter, + out_h); + for (int do_average = 0; do_average <= 1; ++do_average) { + ConvolveParams conv_params1 = get_conv_params_no_round( + do_average, 0, output1, MAX_SB_SIZE, 1, bd); + ConvolveParams conv_params2 = get_conv_params_no_round( + do_average, 0, output2, MAX_SB_SIZE, 1, bd); + + // Test special case where dist_wtd_comp_avg is not used + conv_params1.use_dist_wtd_comp_avg = 0; + conv_params2.use_dist_wtd_comp_avg = 0; + + const int subx_range = has_subx ? 16 : 1; + const int suby_range = has_suby ? 16 : 1; + for (subx = 0; subx < subx_range; ++subx) { + for (suby = 0; suby < suby_range; ++suby) { + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_highbd_dist_wtd_convolve_2d_c( + input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE, + out_w, out_h, filter_params_x, filter_params_y, subx, suby, + &conv_params1, bd); + test_impl(input + offset_r * w + offset_c, w, output16_2, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2, bd); + + for (int i = 0; i < out_h; ++i) { + for (int j = 0; j < out_w; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output1[idx], output2[idx]) + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx << ")"; + } + } + + if (memcmp(output16_1, output16_2, sizeof(output16_1))) { + for (int i = 0; i < MAX_SB_SIZE; ++i) { + for (int j = 0; j < MAX_SB_SIZE; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output16_1[idx], output16_2[idx]) + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx + << ")"; + } + } + } + } + } + + // Test different combination of fwd and bck offset weights + for (int k = 0; k < 2; ++k) { + for (int l = 0; l < 4; ++l) { + conv_params1.use_dist_wtd_comp_avg = 1; + conv_params2.use_dist_wtd_comp_avg = 1; + conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0]; + conv_params1.bck_offset = quant_dist_lookup_table[k][l][1]; + conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0]; + conv_params2.bck_offset = quant_dist_lookup_table[k][l][1]; + + const int subx_range = has_subx ? 16 : 1; + const int suby_range = has_suby ? 
16 : 1; + for (subx = 0; subx < subx_range; ++subx) { + for (suby = 0; suby < suby_range; ++suby) { + // Choose random locations within the source block + const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_highbd_dist_wtd_convolve_2d_c( + input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE, + out_w, out_h, filter_params_x, filter_params_y, subx, suby, + &conv_params1, bd); + test_impl(input + offset_r * w + offset_c, w, output16_2, + MAX_SB_SIZE, out_w, out_h, filter_params_x, + filter_params_y, subx, suby, &conv_params2, bd); + + for (int i = 0; i < out_h; ++i) { + for (int j = 0; j < out_w; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output1[idx], output2[idx]) + << out_w << "x" << out_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << suby << ", " << subx + << ")"; + } + } + + if (memcmp(output16_1, output16_2, sizeof(output16_1))) { + for (int i = 0; i < MAX_SB_SIZE; ++i) { + for (int j = 0; j < MAX_SB_SIZE; ++j) { + int idx = i * MAX_SB_SIZE + j; + ASSERT_EQ(output16_1[idx], output16_2[idx]) + << out_w << "x" << out_h + << " Pixel mismatch at index " << idx << " = (" << i + << ", " << j << "), sub pixel offset = (" << suby + << ", " << subx << ")"; + } + } + } + } + } + } + } + } + } + } +} +} // namespace AV1HighbdConvolve2D +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace libaom_test diff --git a/libs/libaom/src/test/av1_convolve_2d_test_util.h b/libs/libaom/src/test/av1_convolve_2d_test_util.h new file mode 100644 index 000000000..3c19cfed3 --- /dev/null +++ b/libs/libaom/src/test/av1_convolve_2d_test_util.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_ +#define AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_ + +#include <tuple> + +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/util.h" + +#include "test/clear_system_state.h" +#include "test/register_state_check.h" + +namespace libaom_test { + +namespace AV1Convolve2D { + +typedef void (*convolve_2d_func)(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int subpel_y_qn, + ConvolveParams *conv_params); + +typedef std::tuple<convolve_2d_func, int, int, BLOCK_SIZE> Convolve2DParam; + +::testing::internal::ParamGenerator<Convolve2DParam> BuildParams( + convolve_2d_func filter, int subx_exist, int suby_exist); + +class AV1Convolve2DSrTest : public ::testing::TestWithParam<Convolve2DParam> { + public: + virtual ~AV1Convolve2DSrTest(); + virtual void SetUp(); + + virtual void TearDown(); + + protected: + void RunCheckOutput(convolve_2d_func test_impl); + void RunSpeedTest(convolve_2d_func test_impl); + + libaom_test::ACMRandom rnd_; +}; + +class AV1JntConvolve2DTest : public ::testing::TestWithParam<Convolve2DParam> { + public: + virtual ~AV1JntConvolve2DTest(); + virtual void SetUp(); + + virtual void TearDown(); + + protected: + void RunCheckOutput(convolve_2d_func test_impl); + void RunSpeedTest(convolve_2d_func test_impl); + + libaom_test::ACMRandom rnd_; +}; +} // namespace AV1Convolve2D + +#if CONFIG_AV1_HIGHBITDEPTH +namespace AV1HighbdConvolve2D { +typedef void (*highbd_convolve_2d_func)( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd); + +typedef std::tuple<int, highbd_convolve_2d_func, int, int, BLOCK_SIZE> + HighbdConvolve2DParam; + +::testing::internal::ParamGenerator<HighbdConvolve2DParam> BuildParams( + highbd_convolve_2d_func filter, int subx_exist, int suby_exist); + +class AV1HighbdConvolve2DSrTest + : public ::testing::TestWithParam<HighbdConvolve2DParam> { + public: + virtual ~AV1HighbdConvolve2DSrTest(); + virtual void SetUp(); + + virtual void TearDown(); + + protected: + void RunCheckOutput(highbd_convolve_2d_func test_impl); + void RunSpeedTest(highbd_convolve_2d_func test_impl); + + libaom_test::ACMRandom rnd_; +}; + +class AV1HighbdJntConvolve2DTest + : public ::testing::TestWithParam<HighbdConvolve2DParam> { + public: + virtual ~AV1HighbdJntConvolve2DTest(); + virtual void SetUp(); + + virtual void TearDown(); + + protected: + void RunCheckOutput(highbd_convolve_2d_func test_impl); + void RunSpeedTest(highbd_convolve_2d_func test_impl); + + libaom_test::ACMRandom rnd_; +}; +} // namespace AV1HighbdConvolve2D +#endif // CONFIG_AV1_HIGHBITDEPTH + +} // namespace libaom_test + +#endif // AOM_TEST_AV1_CONVOLVE_2D_TEST_UTIL_H_ diff --git a/libs/libaom/src/test/av1_convolve_scale_test.cc b/libs/libaom/src/test/av1_convolve_scale_test.cc new file mode 100644 index 000000000..ffd0bab33 --- /dev/null +++ b/libs/libaom/src/test/av1_convolve_scale_test.cc @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tuple> +#include <vector> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_timer.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +#include "av1/common/common_data.h" + +namespace { +const int kTestIters = 10; +const int kPerfIters = 1000; + +const int kVPad = 32; +const int kHPad = 32; +const int kXStepQn = 16; +const int kYStepQn = 20; + +using libaom_test::ACMRandom; +using std::make_tuple; +using std::tuple; + +enum NTaps { EIGHT_TAP, TEN_TAP, TWELVE_TAP }; +int NTapsToInt(NTaps ntaps) { return 8 + static_cast<int>(ntaps) * 2; } + +// A 16-bit filter with a configurable number of taps. +class TestFilter { + public: + void set(NTaps ntaps, bool backwards); + + InterpFilterParams params_; + + private: + std::vector<int16_t> coeffs_; +}; + +void TestFilter::set(NTaps ntaps, bool backwards) { + const int n = NTapsToInt(ntaps); + assert(n >= 8 && n <= 12); + + // The filter has n * SUBPEL_SHIFTS proper elements and an extra 8 bogus + // elements at the end so that convolutions can read off the end safely. + coeffs_.resize(n * SUBPEL_SHIFTS + 8); + + // The coefficients are pretty much arbitrary, but convolutions shouldn't + // over or underflow. For the first filter (subpels = 0), we use an + // increasing or decreasing ramp (depending on the backwards parameter). We + // don't want any zero coefficients, so we make it have an x-intercept at -1 + // or n. To ensure absence of under/overflow, we normalise the area under the + // ramp to be I = 1 << FILTER_BITS (so that convolving a constant function + // gives the identity). + // + // When increasing, the function has the form: + // + // f(x) = A * (x + 1) + // + // Summing f(x) over x = 0, ..., n - 1 gives A * n * (n + 1) / 2; setting + // that equal to I and rearranging for A gives A = 2 * I / (n * (n + 1)). If + // the filter is reversed, we have the same A but with formula + // + // g(x) = A * (n - x) + const int I = 1 << FILTER_BITS; + const float A = 2.f * I / (n * (n + 1.f)); + for (int i = 0; i < n; ++i) { + coeffs_[i] = static_cast<int16_t>(A * (backwards ? (n - i) : (i + 1))); + } + + // For the other filters, make them slightly different by swapping two + // columns. Filter k will have the columns (k % n) and (7 * k) % n swapped. + const size_t filter_size = sizeof(coeffs_[0]) * n; + int16_t *const filter0 = &coeffs_[0]; + for (int k = 1; k < SUBPEL_SHIFTS; ++k) { + int16_t *filterk = &coeffs_[k * n]; + memcpy(filterk, filter0, filter_size); + + const int idx0 = k % n; + const int idx1 = (7 * k) % n; + + const int16_t tmp = filterk[idx0]; + filterk[idx0] = filterk[idx1]; + filterk[idx1] = tmp; + } + + // Finally, write some rubbish at the end to make sure we don't use it. + for (int i = 0; i < 8; ++i) coeffs_[n * SUBPEL_SHIFTS + i] = 123 + i; + + // Fill in params + params_.filter_ptr = &coeffs_[0]; + params_.taps = n; + // These are ignored by the functions being tested. Set them to whatever. + params_.subpel_shifts = SUBPEL_SHIFTS; + params_.interp_filter = EIGHTTAP_REGULAR; +} + +template <typename SrcPixel> +class TestImage { + public: + TestImage(int w, int h, int bd) : w_(w), h_(h), bd_(bd) { + assert(bd < 16); + assert(bd <= 8 * static_cast<int>(sizeof(SrcPixel))); + + // Pad width by 2*kHPad and then round up to the next multiple of 16 + // to get src_stride_. 
Add another 16 for dst_stride_ (to make sure + // something goes wrong if we use the wrong one) + src_stride_ = (w_ + 2 * kHPad + 15) & ~15; + dst_stride_ = src_stride_ + 16; + + // Allocate image data + src_data_.resize(2 * src_block_size()); + dst_data_.resize(2 * dst_block_size()); + dst_16_data_.resize(2 * dst_block_size()); + } + + void Initialize(ACMRandom *rnd); + void Check() const; + + int src_stride() const { return src_stride_; } + int dst_stride() const { return dst_stride_; } + + int src_block_size() const { return (h_ + 2 * kVPad) * src_stride(); } + int dst_block_size() const { return (h_ + 2 * kVPad) * dst_stride(); } + + const SrcPixel *GetSrcData(bool ref, bool borders) const { + const SrcPixel *block = &src_data_[ref ? 0 : src_block_size()]; + return borders ? block : block + kHPad + src_stride_ * kVPad; + } + + SrcPixel *GetDstData(bool ref, bool borders) { + SrcPixel *block = &dst_data_[ref ? 0 : dst_block_size()]; + return borders ? block : block + kHPad + dst_stride_ * kVPad; + } + + CONV_BUF_TYPE *GetDst16Data(bool ref, bool borders) { + CONV_BUF_TYPE *block = &dst_16_data_[ref ? 0 : dst_block_size()]; + return borders ? block : block + kHPad + dst_stride_ * kVPad; + } + + private: + int w_, h_, bd_; + int src_stride_, dst_stride_; + + std::vector<SrcPixel> src_data_; + std::vector<SrcPixel> dst_data_; + std::vector<CONV_BUF_TYPE> dst_16_data_; +}; + +template <typename Pixel> +void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) { + if (!trash) { + memset(data, 0, sizeof(*data) * num_pixels); + return; + } + const Pixel mask = (1 << bd) - 1; + for (int i = 0; i < num_pixels; ++i) data[i] = rnd->Rand16() & mask; +} + +template <typename Pixel> +void PrepBuffers(ACMRandom *rnd, int w, int h, int stride, int bd, + bool trash_edges, Pixel *data) { + assert(rnd); + const Pixel mask = (1 << bd) - 1; + + // Fill in the first buffer with random data + // Top border + FillEdge(rnd, stride * kVPad, bd, trash_edges, data); + for (int r = 0; r < h; ++r) { + Pixel *row_data = data + (kVPad + r) * stride; + // Left border, contents, right border + FillEdge(rnd, kHPad, bd, trash_edges, row_data); + for (int c = 0; c < w; ++c) row_data[kHPad + c] = rnd->Rand16() & mask; + FillEdge(rnd, kHPad, bd, trash_edges, row_data + kHPad + w); + } + // Bottom border + FillEdge(rnd, stride * kVPad, bd, trash_edges, data + stride * (kVPad + h)); + + const int bpp = sizeof(*data); + const int block_elts = stride * (h + 2 * kVPad); + const int block_size = bpp * block_elts; + + // Now copy that to the second buffer + memcpy(data + block_elts, data, block_size); +} + +template <typename SrcPixel> +void TestImage<SrcPixel>::Initialize(ACMRandom *rnd) { + PrepBuffers(rnd, w_, h_, src_stride_, bd_, false, &src_data_[0]); + PrepBuffers(rnd, w_, h_, dst_stride_, bd_, true, &dst_data_[0]); + PrepBuffers(rnd, w_, h_, dst_stride_, bd_, true, &dst_16_data_[0]); +} + +template <typename SrcPixel> +void TestImage<SrcPixel>::Check() const { + // If memcmp returns 0, there's nothing to do. 
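+ // (Illustrative note, inferred from the buffer layout above rather than + // taken from the original source: Initialize() lays the reference and test + // copies of each buffer out back to back, so one memcmp over the whole + // block proves equality on the fast path, roughly: + // if (!memcmp(ref, tst, sizeof(*ref) * n)) return; // all pixels match + // Only on a mismatch do the per-pixel loops below run; they also walk the + // padded borders, so a stray out-of-bounds write by the SIMD path is + // reported with its row/col instead of being silently accepted.)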
+ const int num_pixels = dst_block_size(); + const SrcPixel *ref_dst = &dst_data_[0]; + const SrcPixel *tst_dst = &dst_data_[num_pixels]; + + const CONV_BUF_TYPE *ref_16_dst = &dst_16_data_[0]; + const CONV_BUF_TYPE *tst_16_dst = &dst_16_data_[num_pixels]; + + if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) { + if (0 == memcmp(ref_16_dst, tst_16_dst, sizeof(*ref_16_dst) * num_pixels)) + return; + } + // Otherwise, iterate through the buffer looking for differences (including + // the edges) + const int stride = dst_stride_; + for (int r = 0; r < h_ + 2 * kVPad; ++r) { + for (int c = 0; c < w_ + 2 * kHPad; ++c) { + const int32_t ref_value = ref_dst[r * stride + c]; + const int32_t tst_value = tst_dst[r * stride + c]; + + EXPECT_EQ(tst_value, ref_value) + << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad); + } + } + + for (int r = 0; r < h_ + 2 * kVPad; ++r) { + for (int c = 0; c < w_ + 2 * kHPad; ++c) { + const int32_t ref_value = ref_16_dst[r * stride + c]; + const int32_t tst_value = tst_16_dst[r * stride + c]; + + EXPECT_EQ(tst_value, ref_value) + << "Error in 16 bit buffer " + << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad); + } + } +} + +typedef tuple<int, int> BlockDimension; + +struct BaseParams { + BaseParams(BlockDimension dims, NTaps ntaps_x, NTaps ntaps_y, bool avg) + : dims(dims), ntaps_x(ntaps_x), ntaps_y(ntaps_y), avg(avg) {} + + BlockDimension dims; + NTaps ntaps_x, ntaps_y; + bool avg; +}; + +template <typename SrcPixel> +class ConvolveScaleTestBase : public ::testing::Test { + public: + ConvolveScaleTestBase() : image_(NULL) {} + virtual ~ConvolveScaleTestBase() { delete image_; } + virtual void TearDown() { libaom_test::ClearSystemState(); } + + // Implemented by subclasses (SetUp depends on the parameters passed + // in and RunOne depends on the function to be tested. 
These can't + // be templated for low/high bit depths because they have different + // numbers of parameters) + virtual void SetUp() = 0; + virtual void RunOne(bool ref) = 0; + + protected: + void SetParams(const BaseParams ¶ms, int bd) { + width_ = std::get<0>(params.dims); + height_ = std::get<1>(params.dims); + ntaps_x_ = params.ntaps_x; + ntaps_y_ = params.ntaps_y; + bd_ = bd; + avg_ = params.avg; + + filter_x_.set(ntaps_x_, false); + filter_y_.set(ntaps_y_, true); + convolve_params_ = + get_conv_params_no_round(avg_ != false, 0, NULL, 0, 1, bd); + + delete image_; + image_ = new TestImage(width_, height_, bd_); + } + + void SetConvParamOffset(int i, int j, int is_compound, int do_average, + int use_dist_wtd_comp_avg) { + if (i == -1 && j == -1) { + convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg; + convolve_params_.is_compound = is_compound; + convolve_params_.do_average = do_average; + } else { + convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg; + convolve_params_.fwd_offset = quant_dist_lookup_table[i][j][0]; + convolve_params_.bck_offset = quant_dist_lookup_table[i][j][1]; + convolve_params_.is_compound = is_compound; + convolve_params_.do_average = do_average; + } + } + + void Run() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int i = 0; i < kTestIters; ++i) { + int is_compound = 0; + SetConvParamOffset(-1, -1, is_compound, 0, 0); + Prep(&rnd); + RunOne(true); + RunOne(false); + image_->Check(); + + is_compound = 1; + for (int do_average = 0; do_average < 2; do_average++) { + for (int use_dist_wtd_comp_avg = 0; use_dist_wtd_comp_avg < 2; + use_dist_wtd_comp_avg++) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 4; ++k) { + SetConvParamOffset(j, k, is_compound, do_average, + use_dist_wtd_comp_avg); + Prep(&rnd); + RunOne(true); + RunOne(false); + image_->Check(); + } + } + } + } + } + } + + void SpeedTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + Prep(&rnd); + + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < kPerfIters; ++i) RunOne(true); + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (int i = 0; i < kPerfIters; ++i) RunOne(false); + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "[ ] C time = " << ref_time / 1000 + << " ms, SIMD time = " << tst_time / 1000 << " ms\n"; + + EXPECT_GT(ref_time, tst_time) + << "Error: CDEFSpeedTest, SIMD slower than C.\n" + << "C time: " << ref_time << " us\n" + << "SIMD time: " << tst_time << " us\n"; + } + + static int RandomSubpel(ACMRandom *rnd) { + const uint8_t subpel_mode = rnd->Rand8(); + if ((subpel_mode & 7) == 0) { + return 0; + } else if ((subpel_mode & 7) == 1) { + return SCALE_SUBPEL_SHIFTS - 1; + } else { + return 1 + rnd->PseudoUniform(SCALE_SUBPEL_SHIFTS - 2); + } + } + + void Prep(ACMRandom *rnd) { + assert(rnd); + + // Choose subpel_x_ and subpel_y_. 
They should be less than + // SCALE_SUBPEL_SHIFTS; we also want to add extra weight to "interesting" + // values: 0 and SCALE_SUBPEL_SHIFTS - 1 + subpel_x_ = RandomSubpel(rnd); + subpel_y_ = RandomSubpel(rnd); + + image_->Initialize(rnd); + } + + int width_, height_, bd_; + NTaps ntaps_x_, ntaps_y_; + bool avg_; + int subpel_x_, subpel_y_; + TestFilter filter_x_, filter_y_; + TestImage<SrcPixel> *image_; + ConvolveParams convolve_params_; +}; + +typedef tuple<int, int> BlockDimension; + +typedef void (*LowbdConvolveFunc)(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params); + +// Test parameter list: +// <tst_fun, dims, ntaps_x, ntaps_y, avg> +typedef tuple<LowbdConvolveFunc, BlockDimension, NTaps, NTaps, bool> + LowBDParams; + +class LowBDConvolveScaleTest + : public ConvolveScaleTestBase<uint8_t>, + public ::testing::WithParamInterface<LowBDParams> { + public: + virtual ~LowBDConvolveScaleTest() {} + + void SetUp() { + tst_fun_ = GET_PARAM(0); + + const BlockDimension &block = GET_PARAM(1); + const NTaps ntaps_x = GET_PARAM(2); + const NTaps ntaps_y = GET_PARAM(3); + const int bd = 8; + const bool avg = GET_PARAM(4); + + SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd); + } + + void RunOne(bool ref) { + const uint8_t *src = image_->GetSrcData(ref, false); + uint8_t *dst = image_->GetDstData(ref, false); + convolve_params_.dst = image_->GetDst16Data(ref, false); + const int src_stride = image_->src_stride(); + const int dst_stride = image_->dst_stride(); + if (ref) { + av1_convolve_2d_scale_c(src, src_stride, dst, dst_stride, width_, height_, + &filter_x_.params_, &filter_y_.params_, subpel_x_, + kXStepQn, subpel_y_, kYStepQn, &convolve_params_); + } else { + tst_fun_(src, src_stride, dst, dst_stride, width_, height_, + &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn, + subpel_y_, kYStepQn, &convolve_params_); + } + } + + private: + LowbdConvolveFunc tst_fun_; +}; + +const BlockDimension kBlockDim[] = { + make_tuple(2, 2), make_tuple(2, 4), make_tuple(4, 4), + make_tuple(4, 8), make_tuple(8, 4), make_tuple(8, 8), + make_tuple(8, 16), make_tuple(16, 8), make_tuple(16, 16), + make_tuple(16, 32), make_tuple(32, 16), make_tuple(32, 32), + make_tuple(32, 64), make_tuple(64, 32), make_tuple(64, 64), + make_tuple(64, 128), make_tuple(128, 64), make_tuple(128, 128), +}; + +const NTaps kNTaps[] = { EIGHT_TAP }; + +TEST_P(LowBDConvolveScaleTest, Check) { Run(); } +TEST_P(LowBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); } + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, LowBDConvolveScaleTest, + ::testing::Combine(::testing::Values(av1_convolve_2d_scale_sse4_1), + ::testing::ValuesIn(kBlockDim), + ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), + ::testing::Bool())); + +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*HighbdConvolveFunc)(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, const int x_step_qn, + const int subpel_y_qn, const int y_step_qn, + ConvolveParams *conv_params, int bd); + +// Test parameter list: +// <tst_fun, dims, ntaps_x, ntaps_y, avg, bd> +typedef tuple<HighbdConvolveFunc, BlockDimension, NTaps, NTaps, bool, int> + HighBDParams; + +class HighBDConvolveScaleTest + : public ConvolveScaleTestBase<uint16_t>, + public ::testing::WithParamInterface<HighBDParams> { + public: + virtual ~HighBDConvolveScaleTest() {} + + void SetUp() { + tst_fun_ = GET_PARAM(0); + + const BlockDimension &block = GET_PARAM(1); + const NTaps ntaps_x = 
GET_PARAM(2); + const NTaps ntaps_y = GET_PARAM(3); + const bool avg = GET_PARAM(4); + const int bd = GET_PARAM(5); + + SetParams(BaseParams(block, ntaps_x, ntaps_y, avg), bd); + } + + void RunOne(bool ref) { + const uint16_t *src = image_->GetSrcData(ref, false); + uint16_t *dst = image_->GetDstData(ref, false); + convolve_params_.dst = image_->GetDst16Data(ref, false); + const int src_stride = image_->src_stride(); + const int dst_stride = image_->dst_stride(); + + if (ref) { + av1_highbd_convolve_2d_scale_c( + src, src_stride, dst, dst_stride, width_, height_, &filter_x_.params_, + &filter_y_.params_, subpel_x_, kXStepQn, subpel_y_, kYStepQn, + &convolve_params_, bd_); + } else { + tst_fun_(src, src_stride, dst, dst_stride, width_, height_, + &filter_x_.params_, &filter_y_.params_, subpel_x_, kXStepQn, + subpel_y_, kYStepQn, &convolve_params_, bd_); + } + } + + private: + HighbdConvolveFunc tst_fun_; +}; + +const int kBDs[] = { 8, 10, 12 }; + +TEST_P(HighBDConvolveScaleTest, Check) { Run(); } +TEST_P(HighBDConvolveScaleTest, DISABLED_Speed) { SpeedTest(); } + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, HighBDConvolveScaleTest, + ::testing::Combine(::testing::Values(av1_highbd_convolve_2d_scale_sse4_1), + ::testing::ValuesIn(kBlockDim), + ::testing::ValuesIn(kNTaps), ::testing::ValuesIn(kNTaps), + ::testing::Bool(), ::testing::ValuesIn(kBDs))); +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/av1_encoder_parms_get_to_decoder.cc b/libs/libaom/src/test/av1_encoder_parms_get_to_decoder.cc new file mode 100644 index 000000000..76b82f58f --- /dev/null +++ b/libs/libaom/src/test/av1_encoder_parms_get_to_decoder.cc @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" + +#include "aom/aom_decoder.h" +#include "av1/decoder/decoder.h" + +namespace { + +const int kMaxPsnr = 100; + +struct ParamPassingTestVideo { + const char *name; + uint32_t width; + uint32_t height; + uint32_t bitrate; + int frames; +}; + +const ParamPassingTestVideo kAV1ParamPassingTestVector = { + "niklas_1280_720_30.y4m", 1280, 720, 600, 3 +}; + +struct EncodeParameters { + int32_t lossless; + aom_color_primaries_t color_primaries; + aom_transfer_characteristics_t transfer_characteristics; + aom_matrix_coefficients_t matrix_coefficients; + aom_color_range_t color_range; + aom_chroma_sample_position_t chroma_sample_position; + int32_t render_size[2]; +}; + +const EncodeParameters kAV1EncodeParameterSet[] = { + { 1, + AOM_CICP_CP_BT_709, + AOM_CICP_TC_BT_709, + AOM_CICP_MC_BT_709, + AOM_CR_STUDIO_RANGE, + AOM_CSP_UNKNOWN, + { 0, 0 } }, + { 0, + AOM_CICP_CP_BT_470_M, + AOM_CICP_TC_BT_470_M, + AOM_CICP_MC_BT_470_B_G, + AOM_CR_FULL_RANGE, + AOM_CSP_VERTICAL, + { 0, 0 } }, + { 1, + AOM_CICP_CP_BT_601, + AOM_CICP_TC_BT_601, + AOM_CICP_MC_BT_601, + AOM_CR_STUDIO_RANGE, + AOM_CSP_COLOCATED, + { 0, 0 } }, + { 0, + AOM_CICP_CP_BT_2020, + AOM_CICP_TC_BT_2020_10_BIT, + AOM_CICP_MC_BT_2020_NCL, + AOM_CR_FULL_RANGE, + AOM_CSP_RESERVED, + { 640, 480 } }, +}; + +class AVxEncoderParmsGetToDecoder + : public ::libaom_test::EncoderTest, + public ::libaom_test::CodecTestWithParam { + protected: + AVxEncoderParmsGetToDecoder() + : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {} + + virtual ~AVxEncoderParmsGetToDecoder() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(::libaom_test::kTwoPassGood); + cfg_.g_lag_in_frames = 25; + test_video_ = kAV1ParamPassingTestVector; + cfg_.rc_target_bitrate = test_video_.bitrate; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_COLOR_PRIMARIES, encode_parms.color_primaries); + encoder->Control(AV1E_SET_TRANSFER_CHARACTERISTICS, + encode_parms.transfer_characteristics); + encoder->Control(AV1E_SET_MATRIX_COEFFICIENTS, + encode_parms.matrix_coefficients); + encoder->Control(AV1E_SET_COLOR_RANGE, encode_parms.color_range); + encoder->Control(AV1E_SET_CHROMA_SAMPLE_POSITION, + encode_parms.chroma_sample_position); + encoder->Control(AV1E_SET_LOSSLESS, encode_parms.lossless); + if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) { + encoder->Control(AV1E_SET_RENDER_SIZE, encode_parms.render_size); + } + } + } + + virtual void DecompressedFrameHook(const aom_image_t &img, + aom_codec_pts_t pts) { + (void)pts; + if (encode_parms.render_size[0] > 0 && encode_parms.render_size[1] > 0) { + EXPECT_EQ(encode_parms.render_size[0], (int)img.r_w); + EXPECT_EQ(encode_parms.render_size[1], (int)img.r_h); + } + EXPECT_EQ(encode_parms.color_primaries, img.cp); + EXPECT_EQ(encode_parms.transfer_characteristics, img.tc); + EXPECT_EQ(encode_parms.matrix_coefficients, img.mc); + EXPECT_EQ(encode_parms.color_range, img.range); + EXPECT_EQ(encode_parms.chroma_sample_position, img.csp); + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + if (encode_parms.lossless) { + EXPECT_EQ(kMaxPsnr, pkt->data.psnr.psnr[0]); + } + } + + virtual bool HandleDecodeResult(const aom_codec_err_t res_dec, + 
libaom_test::Decoder *decoder) { + EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError(); + return AOM_CODEC_OK == res_dec; + } + + ParamPassingTestVideo test_video_; + + private: + EncodeParameters encode_parms; +}; + +TEST_P(AVxEncoderParmsGetToDecoder, BitstreamParms) { + init_flags_ = AOM_CODEC_USE_PSNR; + + std::unique_ptr<libaom_test::Y4mVideoSource> video( + new libaom_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames)); + ASSERT_TRUE(video.get() != NULL); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +AV1_INSTANTIATE_TEST_CASE(AVxEncoderParmsGetToDecoder, + ::testing::ValuesIn(kAV1EncodeParameterSet)); +} // namespace diff --git a/libs/libaom/src/test/av1_ext_tile_test.cc b/libs/libaom/src/test/av1_ext_tile_test.cc new file mode 100644 index 000000000..424d2f065 --- /dev/null +++ b/libs/libaom/src/test/av1_ext_tile_test.cc @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <string.h> +#include <string> +#include <vector> +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/md5_helper.h" +#include "test/util.h" + +namespace { +// The number of frames to be encoded/decoded +const int kLimit = 8; +// Skip 1 frame to check the frame decoding independence. +const int kSkip = 5; +const int kTileSize = 1; +const int kTIleSizeInPixels = (kTileSize << 6); +// Fake width and height so that they can be multiples of the tile size. +const int kImgWidth = 704; +const int kImgHeight = 576; + +// This test tests large scale tile coding case. Non-large-scale tile coding +// is tested by the tile_independence test. +class AV1ExtTileTest + : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, + public ::libaom_test::EncoderTest { + protected: + AV1ExtTileTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + set_cpu_used_(GET_PARAM(2)) { + init_flags_ = AOM_CODEC_USE_PSNR; + aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t(); + cfg.w = kImgWidth; + cfg.h = kImgHeight; + cfg.allow_lowbitdepth = 1; + + decoder_ = codec_->CreateDecoder(cfg, 0); + decoder_->Control(AV1_SET_TILE_MODE, 1); + decoder_->Control(AV1D_EXT_TILE_DEBUG, 1); + decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1); + decoder_->Control(AV1_SET_DECODE_TILE_COL, -1); + + // Allocate buffer to store tile image. 
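+ // (Note, inferred from FramePktHook() and TestRoundTrip() below: this + // image is reassembled tile by tile from single-tile decodes of the last + // frame, and its MD5 is pushed into tile_md5_ for comparison against md5_, + // the digests of the whole-frame decodes; the final argument of + // aom_img_alloc() is the requested buffer alignment in bytes.)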
+ aom_img_alloc(&tile_img_, AOM_IMG_FMT_I420, kImgWidth, kImgHeight, 32); + + md5_.clear(); + tile_md5_.clear(); + } + + virtual ~AV1ExtTileTest() { + aom_img_free(&tile_img_); + delete decoder_; + } + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = AOM_VBR; + cfg_.g_error_resilient = 1; + + cfg_.rc_max_quantizer = 56; + cfg_.rc_min_quantizer = 0; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + // Encode setting + encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0); + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + + // TODO(yunqingwang): test single_tile_decoding = 0. + encoder->Control(AV1E_SET_SINGLE_TILE_DECODING, 1); + // Always use 64x64 max partition. + encoder->Control(AV1E_SET_SUPERBLOCK_SIZE, AOM_SUPERBLOCK_SIZE_64X64); + // Set tile_columns and tile_rows to MAX values, which guarantees the tile + // size of 64 x 64 pixels(i.e. 1 SB) for <= 4k resolution. + encoder->Control(AV1E_SET_TILE_COLUMNS, 6); + encoder->Control(AV1E_SET_TILE_ROWS, 6); + } + + if (video->frame() == 1) { + frame_flags_ = + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF; + } + } + + virtual void DecompressedFrameHook(const aom_image_t &img, + aom_codec_pts_t pts) { + // Skip 1 already decoded frame to be consistent with the decoder in this + // test. + if (pts == (aom_codec_pts_t)kSkip) return; + + // Calculate MD5 as the reference. + ::libaom_test::MD5 md5_res; + md5_res.Add(&img); + md5_.push_back(md5_res.Get()); + } + + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + // Skip decoding 1 frame. + if (pkt->data.frame.pts == (aom_codec_pts_t)kSkip) return; + + bool IsLastFrame = (pkt->data.frame.pts == (aom_codec_pts_t)(kLimit - 1)); + + // Decode the first (kLimit - 1) frames as whole frame, and decode the last + // frame in single tiles. + for (int r = 0; r < kImgHeight / kTIleSizeInPixels; ++r) { + for (int c = 0; c < kImgWidth / kTIleSizeInPixels; ++c) { + if (!IsLastFrame) { + decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1); + decoder_->Control(AV1_SET_DECODE_TILE_COL, -1); + } else { + decoder_->Control(AV1_SET_DECODE_TILE_ROW, r); + decoder_->Control(AV1_SET_DECODE_TILE_COL, c); + } + + const aom_codec_err_t res = decoder_->DecodeFrame( + reinterpret_cast(pkt->data.frame.buf), + pkt->data.frame.sz); + if (res != AOM_CODEC_OK) { + abort_ = true; + ASSERT_EQ(AOM_CODEC_OK, res); + } + const aom_image_t *img = decoder_->GetDxData().Next(); + + if (!IsLastFrame) { + if (img) { + ::libaom_test::MD5 md5_res; + md5_res.Add(img); + tile_md5_.push_back(md5_res.Get()); + } + break; + } + + const int kMaxMBPlane = 3; + for (int plane = 0; plane < kMaxMBPlane; ++plane) { + const int shift = (plane == 0) ? 
0 : 1; + int tile_height = kTIleSizeInPixels >> shift; + int tile_width = kTIleSizeInPixels >> shift; + + for (int tr = 0; tr < tile_height; ++tr) { + memcpy(tile_img_.planes[plane] + + tile_img_.stride[plane] * (r * tile_height + tr) + + c * tile_width, + img->planes[plane] + img->stride[plane] * tr, tile_width); + } + } + } + + if (!IsLastFrame) break; + } + + if (IsLastFrame) { + ::libaom_test::MD5 md5_res; + md5_res.Add(&tile_img_); + tile_md5_.push_back(md5_res.Get()); + } + } + + void TestRoundTrip() { + ::libaom_test::I420VideoSource video( + "hantro_collage_w352h288.yuv", kImgWidth, kImgHeight, 30, 1, 0, kLimit); + cfg_.rc_target_bitrate = 500; + cfg_.g_error_resilient = AOM_ERROR_RESILIENT_DEFAULT; + cfg_.large_scale_tile = 1; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 1; + + // Tile encoding + init_flags_ = AOM_CODEC_USE_PSNR; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Compare to check if two vectors are equal. + ASSERT_EQ(md5_, tile_md5_); + } + + ::libaom_test::TestMode encoding_mode_; + int set_cpu_used_; + ::libaom_test::Decoder *decoder_; + aom_image_t tile_img_; + std::vector md5_; + std::vector tile_md5_; +}; + +TEST_P(AV1ExtTileTest, DecoderResultTest) { TestRoundTrip(); } + +AV1_INSTANTIATE_TEST_CASE( + // Now only test 2-pass mode. + AV1ExtTileTest, ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::Range(1, 4)); + +class AV1ExtTileTestLarge : public AV1ExtTileTest {}; + +TEST_P(AV1ExtTileTestLarge, DecoderResultTest) { TestRoundTrip(); } + +AV1_INSTANTIATE_TEST_CASE( + // Now only test 2-pass mode. + AV1ExtTileTestLarge, ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::Range(0, 1)); +} // namespace diff --git a/libs/libaom/src/test/av1_fwd_txfm1d_test.cc b/libs/libaom/src/test/av1_fwd_txfm1d_test.cc new file mode 100644 index 000000000..abc46ed5a --- /dev/null +++ b/libs/libaom/src/test/av1_fwd_txfm1d_test.cc @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "av1/encoder/av1_fwd_txfm1d.h" +#include "test/av1_txfm_test.h" + +using libaom_test::ACMRandom; +using libaom_test::input_base; +using libaom_test::reference_hybrid_1d; +using libaom_test::TYPE_ADST; +using libaom_test::TYPE_DCT; +using libaom_test::TYPE_IDTX; +using libaom_test::TYPE_TXFM; + +namespace { +const int txfm_type_num = 3; +const TYPE_TXFM txfm_type_ls[txfm_type_num] = { TYPE_DCT, TYPE_ADST, + TYPE_IDTX }; + +const int txfm_size_num = 5; + +const int txfm_size_ls[] = { 4, 8, 16, 32, 64 }; + +const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = { + { av1_fdct4, av1_fadst4, av1_fidentity4_c }, + { av1_fdct8, av1_fadst8, av1_fidentity8_c }, + { av1_fdct16, av1_fadst16, av1_fidentity16_c }, + { av1_fdct32, NULL, av1_fidentity32_c }, + { av1_fdct64, NULL, NULL }, +}; + +// the maximum stage number of fwd/inv 1d dct/adst txfm is 12 +const int8_t cos_bit = 14; +const int8_t range_bit[12] = { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 }; + +TEST(av1_fwd_txfm1d, round_shift) { + EXPECT_EQ(round_shift(7, 1), 4); + EXPECT_EQ(round_shift(-7, 1), -3); + + EXPECT_EQ(round_shift(7, 2), 2); + EXPECT_EQ(round_shift(-7, 2), -2); + + EXPECT_EQ(round_shift(8, 2), 2); + EXPECT_EQ(round_shift(-8, 2), -2); +} + +TEST(av1_fwd_txfm1d, av1_cospi_arr_data) { + for (int i = 0; i < 7; i++) { + for (int j = 0; j < 64; j++) { + EXPECT_EQ(av1_cospi_arr_data[i][j], + (int32_t)round(cos(PI * j / 128) * (1 << (cos_bit_min + i)))); + } + } +} + +TEST(av1_fwd_txfm1d, accuracy) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int si = 0; si < txfm_size_num; ++si) { + int txfm_size = txfm_size_ls[si]; + int32_t *input = new int32_t[txfm_size]; + int32_t *output = new int32_t[txfm_size]; + double *ref_input = new double[txfm_size]; + double *ref_output = new double[txfm_size]; + + for (int ti = 0; ti < txfm_type_num; ++ti) { + TYPE_TXFM txfm_type = txfm_type_ls[ti]; + TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[si][ti]; + int max_error = 7; + + const int count_test_block = 5000; + if (fwd_txfm_func != NULL) { + for (int ti = 0; ti < count_test_block; ++ti) { + for (int ni = 0; ni < txfm_size; ++ni) { + input[ni] = rnd.Rand16() % input_base - rnd.Rand16() % input_base; + ref_input[ni] = static_cast(input[ni]); + } + + fwd_txfm_func(input, output, cos_bit, range_bit); + reference_hybrid_1d(ref_input, ref_output, txfm_size, txfm_type); + + for (int ni = 0; ni < txfm_size; ++ni) { + ASSERT_LE( + abs(output[ni] - static_cast(round(ref_output[ni]))), + max_error) + << "tx size = " << txfm_size << ", tx type = " << txfm_type; + } + } + } + } + + delete[] input; + delete[] output; + delete[] ref_input; + delete[] ref_output; + } +} +} // namespace diff --git a/libs/libaom/src/test/av1_fwd_txfm2d_test.cc b/libs/libaom/src/test/av1_fwd_txfm2d_test.cc new file mode 100644 index 000000000..dd6066576 --- /dev/null +++ b/libs/libaom/src/test/av1_fwd_txfm2d_test.cc @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <tuple> +#include <vector> + +#include "config/av1_rtcd.h" + +#include "test/acm_random.h" +#include "test/util.h" +#include "test/av1_txfm_test.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/hybrid_fwd_txfm.h" + +using libaom_test::ACMRandom; +using libaom_test::bd; +using libaom_test::compute_avg_abs_error; +using libaom_test::input_base; +using libaom_test::TYPE_TXFM; + +using std::vector; + +namespace { +// tx_type_, tx_size_, max_error_, max_avg_error_ +typedef std::tuple<TX_TYPE, TX_SIZE, double, double> AV1FwdTxfm2dParam; + +class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> { + public: + virtual void SetUp() { + tx_type_ = GET_PARAM(0); + tx_size_ = GET_PARAM(1); + max_error_ = GET_PARAM(2); + max_avg_error_ = GET_PARAM(3); + count_ = 500; + TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg; + av1_get_fwd_txfm_cfg(tx_type_, tx_size_, &fwd_txfm_flip_cfg); + amplify_factor_ = libaom_test::get_amplification_factor(tx_type_, tx_size_); + tx_width_ = tx_size_wide[fwd_txfm_flip_cfg.tx_size]; + tx_height_ = tx_size_high[fwd_txfm_flip_cfg.tx_size]; + ud_flip_ = fwd_txfm_flip_cfg.ud_flip; + lr_flip_ = fwd_txfm_flip_cfg.lr_flip; + + fwd_txfm_ = libaom_test::fwd_txfm_func_ls[tx_size_]; + txfm2d_size_ = tx_width_ * tx_height_; + input_ = reinterpret_cast<int16_t *>( + aom_memalign(16, sizeof(input_[0]) * txfm2d_size_)); + output_ = reinterpret_cast<int32_t *>( + aom_memalign(16, sizeof(output_[0]) * txfm2d_size_)); + ref_input_ = reinterpret_cast<double *>( + aom_memalign(16, sizeof(ref_input_[0]) * txfm2d_size_)); + ref_output_ = reinterpret_cast<double *>( + aom_memalign(16, sizeof(ref_output_[0]) * txfm2d_size_)); + } + + void RunFwdAccuracyCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + double avg_abs_error = 0; + for (int ci = 0; ci < count_; ci++) { + for (int ni = 0; ni < txfm2d_size_; ++ni) { + input_[ni] = rnd.Rand16() % input_base; + ref_input_[ni] = static_cast<double>(input_[ni]); + output_[ni] = 0; + ref_output_[ni] = 0; + } + + fwd_txfm_(input_, output_, tx_width_, tx_type_, bd); + + if (lr_flip_ && ud_flip_) { + libaom_test::fliplrud(ref_input_, tx_width_, tx_height_, tx_width_); + } else if (lr_flip_) { + libaom_test::fliplr(ref_input_, tx_width_, tx_height_, tx_width_); + } else if (ud_flip_) { + libaom_test::flipud(ref_input_, tx_width_, tx_height_, tx_width_); + } + + libaom_test::reference_hybrid_2d(ref_input_, ref_output_, tx_type_, + tx_size_); + + double actual_max_error = 0; + for (int ni = 0; ni < txfm2d_size_; ++ni) { + ref_output_[ni] = round(ref_output_[ni]); + const double this_error = + fabs(output_[ni] - ref_output_[ni]) / amplify_factor_; + actual_max_error = AOMMAX(actual_max_error, this_error); + } + EXPECT_GE(max_error_, actual_max_error) + << "tx_size = " << tx_size_ << ", tx_type = " << tx_type_; + if (actual_max_error > max_error_) { // exit early. 
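+ // (The EXPECT_GE above already records the failure; this break only stops + // accumulating avg_abs_error for a transform that has exceeded its + // per-coefficient error budget, so the remaining iterations are skipped.)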
+ break; + } + + avg_abs_error += compute_avg_abs_error( + output_, ref_output_, txfm2d_size_); + } + + avg_abs_error /= amplify_factor_; + avg_abs_error /= count_; + EXPECT_GE(max_avg_error_, avg_abs_error) + << "tx_size = " << tx_size_ << ", tx_type = " << tx_type_; + } + + virtual void TearDown() { + aom_free(input_); + aom_free(output_); + aom_free(ref_input_); + aom_free(ref_output_); + } + + private: + double max_error_; + double max_avg_error_; + int count_; + double amplify_factor_; + TX_TYPE tx_type_; + TX_SIZE tx_size_; + int tx_width_; + int tx_height_; + int txfm2d_size_; + FwdTxfm2dFunc fwd_txfm_; + int16_t *input_; + int32_t *output_; + double *ref_input_; + double *ref_output_; + int ud_flip_; // flip upside down + int lr_flip_; // flip left to right +}; + +static double avg_error_ls[TX_SIZES_ALL] = { + 0.5, // 4x4 transform + 0.5, // 8x8 transform + 1.2, // 16x16 transform + 6.1, // 32x32 transform + 3.4, // 64x64 transform + 0.57, // 4x8 transform + 0.68, // 8x4 transform + 0.92, // 8x16 transform + 1.1, // 16x8 transform + 4.1, // 16x32 transform + 6, // 32x16 transform + 3.5, // 32x64 transform + 5.7, // 64x32 transform + 0.6, // 4x16 transform + 0.9, // 16x4 transform + 1.2, // 8x32 transform + 1.7, // 32x8 transform + 2.0, // 16x64 transform + 4.7, // 64x16 transform +}; + +static double max_error_ls[TX_SIZES_ALL] = { + 3, // 4x4 transform + 5, // 8x8 transform + 11, // 16x16 transform + 70, // 32x32 transform + 64, // 64x64 transform + 3.9, // 4x8 transform + 4.3, // 8x4 transform + 12, // 8x16 transform + 12, // 16x8 transform + 32, // 16x32 transform + 46, // 32x16 transform + 136, // 32x64 transform + 136, // 64x32 transform + 5, // 4x16 transform + 6, // 16x4 transform + 21, // 8x32 transform + 13, // 32x8 transform + 30, // 16x64 transform + 36, // 64x16 transform +}; + +vector GetTxfm2dParamList() { + vector param_list; + for (int s = 0; s < TX_SIZES; ++s) { + const double max_error = max_error_ls[s]; + const double avg_error = avg_error_ls[s]; + for (int t = 0; t < TX_TYPES; ++t) { + const TX_TYPE tx_type = static_cast(t); + const TX_SIZE tx_size = static_cast(s); + if (libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) { + param_list.push_back( + AV1FwdTxfm2dParam(tx_type, tx_size, max_error, avg_error)); + } + } + } + return param_list; +} + +INSTANTIATE_TEST_SUITE_P(C, AV1FwdTxfm2d, + ::testing::ValuesIn(GetTxfm2dParamList())); + +TEST_P(AV1FwdTxfm2d, RunFwdAccuracyCheck) { RunFwdAccuracyCheck(); } + +TEST(AV1FwdTxfm2d, CfgTest) { + for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) { + int bd = libaom_test::bd_arr[bd_idx]; + int8_t low_range = libaom_test::low_range_arr[bd_idx]; + int8_t high_range = libaom_test::high_range_arr[bd_idx]; + for (int tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) { + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid(static_cast(tx_size), + static_cast(tx_type)) == + false) { + continue; + } + TXFM_2D_FLIP_CFG cfg; + av1_get_fwd_txfm_cfg(static_cast(tx_type), + static_cast(tx_size), &cfg); + int8_t stage_range_col[MAX_TXFM_STAGE_NUM]; + int8_t stage_range_row[MAX_TXFM_STAGE_NUM]; + av1_gen_fwd_stage_range(stage_range_col, stage_range_row, &cfg, bd); + libaom_test::txfm_stage_range_check(stage_range_col, cfg.stage_num_col, + cfg.cos_bit_col, low_range, + high_range); + libaom_test::txfm_stage_range_check(stage_range_row, cfg.stage_num_row, + cfg.cos_bit_row, low_range, + high_range); + } + } + } +} + +typedef void (*lowbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff, + int 
diff_stride, TxfmParam *txfm_param); + +void AV1FwdTxfm2dMatchTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) { + const int bd = 8; + TxfmParam param; + memset(¶m, 0, sizeof(param)); + const int rows = tx_size_high[tx_size]; + const int cols = tx_size_wide[tx_size]; + // printf("%d x %d\n", cols, rows); + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid( + tx_size, static_cast(tx_type)) == false) { + continue; + } + + FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size]; + if (ref_func != NULL) { + DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 }; + DECLARE_ALIGNED(32, int32_t, output[64 * 64]); + DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]); + int input_stride = 64; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int cnt = 0; cnt < 500; ++cnt) { + if (cnt == 0) { + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = (1 << bd) - 1; + } + } + } else { + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = rnd.Rand16() % (1 << bd); + } + } + } + param.tx_type = (TX_TYPE)tx_type; + param.tx_size = (TX_SIZE)tx_size; + param.tx_set_type = EXT_TX_SET_ALL16; + param.bd = bd; + ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd); + target_func(input, output, input_stride, ¶m); + const int check_rows = AOMMIN(32, rows); + const int check_cols = AOMMIN(32, rows * cols / check_rows); + for (int r = 0; r < check_rows; ++r) { + for (int c = 0; c < check_cols; ++c) { + ASSERT_EQ(ref_output[r * check_cols + c], + output[r * check_cols + c]) + << "[" << r << "," << c << "] cnt:" << cnt + << " tx_size: " << tx_size << " tx_type: " << tx_type; + } + } + } + } + } +} + +void AV1FwdTxfm2dSpeedTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) { + TxfmParam param; + memset(¶m, 0, sizeof(param)); + const int rows = tx_size_high[tx_size]; + const int cols = tx_size_wide[tx_size]; + const int num_loops = 1000000 / (rows * cols); + + for (int i = 0; i < 2; ++i) { + const int bd = 8; + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid( + tx_size, static_cast(tx_type)) == false) { + continue; + } + + FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size]; + if (ref_func != NULL) { + DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 }; + DECLARE_ALIGNED(32, int32_t, output[64 * 64]); + DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]); + int input_stride = 64; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = rnd.Rand16() % (1 << bd); + } + } + + param.tx_type = (TX_TYPE)tx_type; + param.tx_size = (TX_SIZE)tx_size; + param.tx_set_type = EXT_TX_SET_ALL16; + param.bd = bd; + + aom_usec_timer ref_timer, test_timer; + + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_loops; ++i) { + ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_loops; ++i) { + target_func(input, output, input_stride, ¶m); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast(aom_usec_timer_elapsed(&test_timer)); + + printf( + "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t " + "gain=%d \n", + tx_size, tx_type, elapsed_time_c, elapsed_time_simd, + 
(elapsed_time_c / elapsed_time_simd)); + } + } + } +} + +typedef std::tuple LbdFwdTxfm2dParam; + +class AV1FwdTxfm2dTest : public ::testing::TestWithParam {}; + +TEST_P(AV1FwdTxfm2dTest, match) { + AV1FwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1)); +} +TEST_P(AV1FwdTxfm2dTest, DISABLED_Speed) { + AV1FwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1)); +} +using ::testing::Combine; +using ::testing::Values; +using ::testing::ValuesIn; + +#if HAVE_SSE2 +static TX_SIZE fwd_txfm_for_sse2[] = { + TX_4X4, + TX_8X8, + TX_16X16, + TX_32X32, + // TX_64X64, + TX_4X8, + TX_8X4, + TX_8X16, + TX_16X8, + TX_16X32, + TX_32X16, + // TX_32X64, + // TX_64X32, + TX_4X16, + TX_16X4, + TX_8X32, + TX_32X8, + TX_16X64, + TX_64X16, +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, AV1FwdTxfm2dTest, + Combine(ValuesIn(fwd_txfm_for_sse2), + Values(av1_lowbd_fwd_txfm_sse2))); +#endif // HAVE_SSE2 + +#if HAVE_SSE4_1 +static TX_SIZE fwd_txfm_for_sse41[] = { + TX_4X4, + TX_64X64, + TX_32X64, + TX_64X32, +}; + +INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1FwdTxfm2dTest, + Combine(ValuesIn(fwd_txfm_for_sse41), + Values(av1_lowbd_fwd_txfm_sse4_1))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +static TX_SIZE fwd_txfm_for_avx2[] = { + TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4, + TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16, + TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, AV1FwdTxfm2dTest, + Combine(ValuesIn(fwd_txfm_for_avx2), + Values(av1_lowbd_fwd_txfm_avx2))); +#endif // HAVE_AVX2 + +typedef void (*Highbd_fwd_txfm_func)(const int16_t *src_diff, tran_low_t *coeff, + int diff_stride, TxfmParam *txfm_param); + +void AV1HighbdFwdTxfm2dMatchTest(TX_SIZE tx_size, + Highbd_fwd_txfm_func target_func) { + const int bd_ar[2] = { 10, 12 }; + TxfmParam param; + memset(¶m, 0, sizeof(param)); + const int rows = tx_size_high[tx_size]; + const int cols = tx_size_wide[tx_size]; + for (int i = 0; i < 2; ++i) { + const int bd = bd_ar[i]; + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid( + tx_size, static_cast(tx_type)) == false) { + continue; + } + + FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size]; + if (ref_func != NULL) { + DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 }; + DECLARE_ALIGNED(32, int32_t, output[64 * 64]); + DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]); + int input_stride = 64; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int cnt = 0; cnt < 500; ++cnt) { + if (cnt == 0) { + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = (1 << bd) - 1; + } + } + } else { + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = rnd.Rand16() % (1 << bd); + } + } + } + param.tx_type = (TX_TYPE)tx_type; + param.tx_size = (TX_SIZE)tx_size; + param.tx_set_type = EXT_TX_SET_ALL16; + param.bd = bd; + + ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd); + target_func(input, output, input_stride, ¶m); + const int check_rows = AOMMIN(32, rows); + const int check_cols = AOMMIN(32, rows * cols / check_rows); + for (int r = 0; r < check_rows; ++r) { + for (int c = 0; c < check_cols; ++c) { + ASSERT_EQ(ref_output[r * check_cols + c], + output[r * check_cols + c]) + << "[" << r << "," << c << "] cnt:" << cnt + << " tx_size: " << tx_size << " tx_type: " << tx_type; + } + } + } + } + } + } +} + +void AV1HighbdFwdTxfm2dSpeedTest(TX_SIZE tx_size, + Highbd_fwd_txfm_func target_func) { + const int bd_ar[2] = { 10, 12 }; + 
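// (Illustrative note: num_loops below is scaled as 1000000 / (rows * cols), + // so every transform size performs roughly the same total pixel work and + // the printed C-vs-SIMD timings stay comparable across sizes; bd_ar holds + // the two high bit depths exercised, 10 and 12.) +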
TxfmParam param; + memset(¶m, 0, sizeof(param)); + const int rows = tx_size_high[tx_size]; + const int cols = tx_size_wide[tx_size]; + const int num_loops = 1000000 / (rows * cols); + + for (int i = 0; i < 2; ++i) { + const int bd = bd_ar[i]; + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid( + tx_size, static_cast(tx_type)) == false) { + continue; + } + + FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size]; + if (ref_func != NULL) { + DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 }; + DECLARE_ALIGNED(32, int32_t, output[64 * 64]); + DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]); + int input_stride = 64; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = rnd.Rand16() % (1 << bd); + } + } + + param.tx_type = (TX_TYPE)tx_type; + param.tx_size = (TX_SIZE)tx_size; + param.tx_set_type = EXT_TX_SET_ALL16; + param.bd = bd; + + aom_usec_timer ref_timer, test_timer; + + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_loops; ++i) { + ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_loops; ++i) { + target_func(input, output, input_stride, ¶m); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast(aom_usec_timer_elapsed(&test_timer)); + + printf( + "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t " + "gain=%d \n", + tx_size, tx_type, elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } + } + } +} + +typedef std::tuple HighbdFwdTxfm2dParam; + +class AV1HighbdFwdTxfm2dTest + : public ::testing::TestWithParam {}; + +TEST_P(AV1HighbdFwdTxfm2dTest, match) { + AV1HighbdFwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1)); +} + +TEST_P(AV1HighbdFwdTxfm2dTest, DISABLED_Speed) { + AV1HighbdFwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1)); +} + +using ::testing::Combine; +using ::testing::Values; +using ::testing::ValuesIn; + +#if HAVE_SSE4_1 +static TX_SIZE Highbd_fwd_txfm_for_sse4_1[] = { + TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, TX_4X8, TX_8X4, + TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16, + TX_16X4, TX_8X32, TX_32X8, TX_16X64, TX_64X16, +}; + +INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdFwdTxfm2dTest, + Combine(ValuesIn(Highbd_fwd_txfm_for_sse4_1), + Values(av1_highbd_fwd_txfm))); +#endif // HAVE_SSE4_1 +#if HAVE_AVX2 +static TX_SIZE Highbd_fwd_txfm_for_avx2[] = { TX_8X8, TX_16X16, TX_32X32, + TX_64X64, TX_8X16, TX_16X8 }; + +INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdFwdTxfm2dTest, + Combine(ValuesIn(Highbd_fwd_txfm_for_avx2), + Values(av1_highbd_fwd_txfm))); +#endif // HAVE_AVX2 +} // namespace diff --git a/libs/libaom/src/test/av1_highbd_iht_test.cc b/libs/libaom/src/test/av1_highbd_iht_test.cc new file mode 100644 index 000000000..8fea500db --- /dev/null +++ b/libs/libaom/src/test/av1_highbd_iht_test.cc @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" + +#include "test/acm_random.h" +#include "test/av1_txfm_test.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "av1/common/enums.h" +#include "av1/common/scan.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +namespace { + +using libaom_test::ACMRandom; +using std::tuple; + +typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride, + TX_TYPE tx_type, int bd); + +typedef void (*IHbdHtFunc)(const int32_t *coeff, uint16_t *output, int stride, + TX_TYPE tx_type, int bd); +static const char *tx_type_name[] = { + "DCT_DCT", + "ADST_DCT", + "DCT_ADST", + "ADST_ADST", + "FLIPADST_DCT", + "DCT_FLIPADST", + "FLIPADST_FLIPADST", + "ADST_FLIPADST", + "FLIPADST_ADST", + "IDTX", + "V_DCT", + "H_DCT", + "V_ADST", + "H_ADST", + "V_FLIPADST", + "H_FLIPADST", +}; +// Test parameter argument list: +// <txfm_ref_, inv_txfm_, inv_txfm_ref_, num_coeffs_, tx_type_, bit_depth_> +typedef tuple<HbdHtFunc, IHbdHtFunc, IHbdHtFunc, int, TX_TYPE, int> IHbdHtParam; + +class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> { + public: + virtual ~AV1HighbdInvHTNxN() {} + + virtual void SetUp() { + txfm_ref_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + inv_txfm_ref_ = GET_PARAM(2); + num_coeffs_ = GET_PARAM(3); + tx_type_ = GET_PARAM(4); + bit_depth_ = GET_PARAM(5); + + input_ = reinterpret_cast<int16_t *>( + aom_memalign(16, sizeof(input_[0]) * num_coeffs_)); + + // Note: + // Inverse transform input buffer is 32-byte aligned + // Refer to /av1/encoder/context_tree.c, function, + // void alloc_mode_context(). + coeffs_ = reinterpret_cast<int32_t *>( + aom_memalign(32, sizeof(coeffs_[0]) * num_coeffs_)); + output_ = reinterpret_cast<uint16_t *>( + aom_memalign(32, sizeof(output_[0]) * num_coeffs_)); + output_ref_ = reinterpret_cast<uint16_t *>( + aom_memalign(32, sizeof(output_ref_[0]) * num_coeffs_)); + } + + virtual void TearDown() { + aom_free(input_); + aom_free(coeffs_); + aom_free(output_); + aom_free(output_ref_); + libaom_test::ClearSystemState(); + } + + protected: + void RunBitexactCheck(); + + private: + int GetStride() const { + if (16 == num_coeffs_) { + return 4; + } else if (64 == num_coeffs_) { + return 8; + } else if (256 == num_coeffs_) { + return 16; + } else if (1024 == num_coeffs_) { + return 32; + } else if (4096 == num_coeffs_) { + return 64; + } else { + return 0; + } + } + + HbdHtFunc txfm_ref_; + IHbdHtFunc inv_txfm_; + IHbdHtFunc inv_txfm_ref_; + int num_coeffs_; + TX_TYPE tx_type_; + int bit_depth_; + + int16_t *input_; + int32_t *coeffs_; + uint16_t *output_; + uint16_t *output_ref_; +}; + +void AV1HighbdInvHTNxN::RunBitexactCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int stride = GetStride(); + const int num_tests = 20000; + const uint16_t mask = (1 << bit_depth_) - 1; + + for (int i = 0; i < num_tests; ++i) { + for (int j = 0; j < num_coeffs_; ++j) { + input_[j] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask); + output_ref_[j] = rnd.Rand16() & mask; + output_[j] = output_ref_[j]; + } + + txfm_ref_(input_, coeffs_, stride, tx_type_, bit_depth_); + inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_); + ASM_REGISTER_STATE_CHECK( + inv_txfm_(coeffs_, output_, stride, tx_type_, bit_depth_)); + + for (int j = 0; j < num_coeffs_; ++j) { + EXPECT_EQ(output_ref_[j], output_[j]) + << "Not bit-exact result at index: " << j << " At test 
block: " << i; + } + } +} + +TEST_P(AV1HighbdInvHTNxN, InvTransResultCheck) { RunBitexactCheck(); } + +using std::make_tuple; + +#if HAVE_SSE4_1 +#define PARAM_LIST_4X4 \ + &av1_fwd_txfm2d_4x4_c, &av1_inv_txfm2d_add_4x4_sse4_1, \ + &av1_inv_txfm2d_add_4x4_c, 16 + +const IHbdHtParam kArrayIhtParam[] = { + // 4x4 + make_tuple(PARAM_LIST_4X4, DCT_DCT, 10), + make_tuple(PARAM_LIST_4X4, DCT_DCT, 12), + make_tuple(PARAM_LIST_4X4, ADST_DCT, 10), + make_tuple(PARAM_LIST_4X4, ADST_DCT, 12), + make_tuple(PARAM_LIST_4X4, DCT_ADST, 10), + make_tuple(PARAM_LIST_4X4, DCT_ADST, 12), + make_tuple(PARAM_LIST_4X4, ADST_ADST, 10), + make_tuple(PARAM_LIST_4X4, ADST_ADST, 12), + make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 10), + make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 12), + make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 10), + make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 12), + make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 10), + make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 12), + make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 10), + make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 12), + make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 10), + make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12), +}; + +INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdInvHTNxN, + ::testing::ValuesIn(kArrayIhtParam)); +#endif // HAVE_SSE4_1 + +typedef void (*HighbdInvTxfm2dFunc)(const int32_t *input, uint8_t *output, + int stride, const TxfmParam *txfm_param); + +typedef std::tuple AV1HighbdInvTxfm2dParam; +class AV1HighbdInvTxfm2d + : public ::testing::TestWithParam { + public: + virtual void SetUp() { target_func_ = GET_PARAM(0); } + void RunAV1InvTxfm2dTest(TX_TYPE tx_type, TX_SIZE tx_size, int run_times, + int bit_depth, int gt_int16 = 0); + + private: + HighbdInvTxfm2dFunc target_func_; +}; + +void AV1HighbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type_, TX_SIZE tx_size_, + int run_times, int bit_depth_, + int gt_int16) { + FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size_]; + TxfmParam txfm_param; + const int BLK_WIDTH = 64; + const int BLK_SIZE = BLK_WIDTH * BLK_WIDTH; + DECLARE_ALIGNED(16, int16_t, input[BLK_SIZE]) = { 0 }; + DECLARE_ALIGNED(32, int32_t, inv_input[BLK_SIZE]) = { 0 }; + DECLARE_ALIGNED(32, uint16_t, output[BLK_SIZE]) = { 0 }; + DECLARE_ALIGNED(32, uint16_t, ref_output[BLK_SIZE]) = { 0 }; + int stride = BLK_WIDTH; + int rows = tx_size_high[tx_size_]; + int cols = tx_size_wide[tx_size_]; + const int rows_nonezero = AOMMIN(32, rows); + const int cols_nonezero = AOMMIN(32, cols); + const uint16_t mask = (1 << bit_depth_) - 1; + run_times /= (rows * cols); + run_times = AOMMAX(1, run_times); + const SCAN_ORDER *scan_order = get_default_scan(tx_size_, tx_type_); + const int16_t *scan = scan_order->scan; + const int16_t eobmax = rows_nonezero * cols_nonezero; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + int randTimes = run_times == 1 ? 
+      run_times == 1 ? (eobmax) : 1;
+
+  txfm_param.tx_type = tx_type_;
+  txfm_param.tx_size = tx_size_;
+  txfm_param.lossless = 0;
+  txfm_param.bd = bit_depth_;
+  txfm_param.is_hbd = 1;
+  txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+
+  for (int cnt = 0; cnt < randTimes; ++cnt) {
+    for (int r = 0; r < BLK_WIDTH; ++r) {
+      for (int c = 0; c < BLK_WIDTH; ++c) {
+        input[r * cols + c] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask);
+        output[r * stride + c] = rnd.Rand16() & mask;
+
+        ref_output[r * stride + c] = output[r * stride + c];
+      }
+    }
+    fwd_func_(input, inv_input, stride, tx_type_, bit_depth_);
+
+    // produce eob input by setting high freq coeffs to zero
+    const int eob = AOMMIN(cnt + 1, eobmax);
+    for (int i = eob; i < eobmax; i++) {
+      inv_input[scan[i]] = 0;
+    }
+    txfm_param.eob = eob;
+    if (gt_int16) {
+      const uint16_t inv_input_mask =
+          static_cast<uint16_t>((1 << (bit_depth_ + 7)) - 1);
+      for (int i = 0; i < eob; i++) {
+        inv_input[scan[i]] = (rnd.Rand31() & inv_input_mask);
+      }
+    }
+
+    aom_usec_timer ref_timer, test_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_highbd_inv_txfm_add_c(inv_input, CONVERT_TO_BYTEPTR(ref_output),
+                                stride, &txfm_param);
+    }
+    aom_usec_timer_mark(&ref_timer);
+    const int elapsed_time_c =
+        static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+    aom_usec_timer_start(&test_timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(inv_input, CONVERT_TO_BYTEPTR(output), stride, &txfm_param);
+    }
+    aom_usec_timer_mark(&test_timer);
+    const int elapsed_time_simd =
+        static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+    if (run_times > 10) {
+      printf(
+          "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t "
+          "gain=%d \n",
+          tx_size_, tx_type_, elapsed_time_c, elapsed_time_simd,
+          (elapsed_time_c / elapsed_time_simd));
+    } else {
+      for (int r = 0; r < rows; ++r) {
+        for (int c = 0; c < cols; ++c) {
+          ASSERT_EQ(ref_output[r * stride + c], output[r * stride + c])
+              << "[" << r << "," << c << "] " << cnt
+              << " tx_size: " << static_cast<int>(tx_size_)
+              << " bit_depth_: " << bit_depth_
+              << " tx_type: " << tx_type_name[tx_type_] << " eob " << eob;
+        }
+      }
+    }
+  }
+}
+
+TEST_P(AV1HighbdInvTxfm2d, match) {
+  int bitdepth_ar[3] = { 8, 10, 12 };
+  for (int k = 0; k < 3; ++k) {
+    int bd = bitdepth_ar[k];
+    for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+      for (int i = 0; i < (int)TX_TYPES; ++i) {
+        if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
+                                           static_cast<TX_TYPE>(i))) {
+          RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+                              1, bd);
+        }
+      }
+    }
+  }
+}
+
+TEST_P(AV1HighbdInvTxfm2d, gt_int16) {
+  int bitdepth_ar[3] = { 8, 10, 12 };
+  static const TX_TYPE types[] = {
+    DCT_DCT, ADST_DCT, FLIPADST_DCT, IDTX, V_DCT, H_DCT, H_ADST, H_FLIPADST
+  };
+  for (int k = 0; k < 3; ++k) {
+    int bd = bitdepth_ar[k];
+    for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+      const TX_SIZE sz = static_cast<TX_SIZE>(j);
+      for (uint8_t i = 0; i < sizeof(types) / sizeof(TX_TYPE); ++i) {
+        const TX_TYPE tp = types[i];
+        if (libaom_test::IsTxSizeTypeValid(sz, tp)) {
+          RunAV1InvTxfm2dTest(tp, sz, 1, bd, 1);
+        }
+      }
+    }
+  }
+}
+
+TEST_P(AV1HighbdInvTxfm2d, DISABLED_Speed) {
+  int bitdepth_ar[2] = { 10, 12 };
+  for (int k = 0; k < 2; ++k) {
+    int bd = bitdepth_ar[k];
+    for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+      for (int i = 0; i < (int)TX_TYPES; ++i) {
+        if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(j),
+                                           static_cast<TX_TYPE>(i))) {
+          RunAV1InvTxfm2dTest(static_cast<TX_TYPE>(i), static_cast<TX_SIZE>(j),
+                              1000000, bd);
+        }
+      }
+    }
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdInvTxfm2d,
+                         ::testing::Values(av1_highbd_inv_txfm_add_sse4_1));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdInvTxfm2d,
+                         ::testing::Values(av1_highbd_inv_txfm_add_avx2));
+#endif
+}  // namespace
diff --git a/libs/libaom/src/test/av1_horz_only_frame_superres_test.cc b/libs/libaom/src/test/av1_horz_only_frame_superres_test.cc
new file mode 100644
index 000000000..115fc84c0
--- /dev/null
+++ b/libs/libaom/src/test/av1_horz_only_frame_superres_test.cc
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+const int kTestIters = 10;
+const int kPerfIters = 1000;
+
+const int kVPad = 32;
+const int kHPad = 32;
+
+using libaom_test::ACMRandom;
+using std::make_tuple;
+using std::tuple;
+
+template <typename Pixel>
+class TestImage {
+ public:
+  TestImage(int w_src, int h, int superres_denom, int x0, int bd)
+      : w_src_(w_src), h_(h), superres_denom_(superres_denom), x0_(x0),
+        bd_(bd) {
+    assert(bd < 16);
+    assert(bd <= 8 * static_cast<int>(sizeof(Pixel)));
+    assert(9 <= superres_denom && superres_denom <= 16);
+    assert(SCALE_NUMERATOR == 8);
+    assert(0 <= x0_ && x0_ <= RS_SCALE_SUBPEL_MASK);
+
+    w_dst_ = w_src_;
+    av1_calculate_unscaled_superres_size(&w_dst_, NULL, superres_denom);
+
+    src_stride_ = ALIGN_POWER_OF_TWO(w_src_ + 2 * kHPad, 4);
+    dst_stride_ = ALIGN_POWER_OF_TWO(w_dst_ + 2 * kHPad, 4);
+
+    // Allocate image data
+    src_data_.resize(2 * src_block_size());
+    dst_data_.resize(2 * dst_block_size());
+  }
+
+  void Initialize(ACMRandom *rnd);
+  void Check() const;
+
+  int src_stride() const { return src_stride_; }
+  int dst_stride() const { return dst_stride_; }
+
+  int src_block_size() const { return (h_ + 2 * kVPad) * src_stride(); }
+  int dst_block_size() const { return (h_ + 2 * kVPad) * dst_stride(); }
+
+  int src_width() const { return w_src_; }
+  int dst_width() const { return w_dst_; }
+  int height() const { return h_; }
+  int x0() const { return x0_; }
+
+  const Pixel *GetSrcData(bool ref, bool borders) const {
+    const Pixel *block = &src_data_[ref ? 0 : src_block_size()];
+    return borders ? block : block + kHPad + src_stride_ * kVPad;
+  }
+
+  Pixel *GetDstData(bool ref, bool borders) {
+    Pixel *block = &dst_data_[ref ? 0 : dst_block_size()];
+    return borders ? block : block + kHPad + dst_stride_ * kVPad;
+  }
+
+ private:
+  int w_src_, w_dst_, h_, superres_denom_, x0_, bd_;
+  int src_stride_, dst_stride_;
+
+  std::vector<Pixel> src_data_;
+  std::vector<Pixel> dst_data_;
+};
+
+template <typename Pixel>
+void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) {
+  if (!trash) {
+    memset(data, 0, sizeof(*data) * num_pixels);
+    return;
+  }
+  const Pixel mask = (1 << bd) - 1;
+  for (int i = 0; i < num_pixels; ++i) data[i] = rnd->Rand16() & mask;
+}
+
+template <typename Pixel>
+void PrepBuffers(ACMRandom *rnd, int w, int h, int stride, int bd,
+                 bool trash_edges, Pixel *data) {
+  assert(rnd);
+  const Pixel mask = (1 << bd) - 1;
+
+  // Fill in the first buffer with random data
+  // Top border
+  FillEdge(rnd, stride * kVPad, bd, trash_edges, data);
+  for (int r = 0; r < h; ++r) {
+    Pixel *row_data = data + (kVPad + r) * stride;
+    // Left border, contents, right border
+    FillEdge(rnd, kHPad, bd, trash_edges, row_data);
+    for (int c = 0; c < w; ++c) row_data[kHPad + c] = rnd->Rand16() & mask;
+    FillEdge(rnd, kHPad, bd, trash_edges, row_data + kHPad + w);
+  }
+  // Bottom border
+  FillEdge(rnd, stride * kVPad, bd, trash_edges, data + stride * (kVPad + h));
+
+  const int bpp = sizeof(*data);
+  const int block_elts = stride * (h + 2 * kVPad);
+  const int block_size = bpp * block_elts;
+
+  // Now copy that to the second buffer
+  memcpy(data + block_elts, data, block_size);
+}
+
+template <typename Pixel>
+void TestImage<Pixel>::Initialize(ACMRandom *rnd) {
+  PrepBuffers(rnd, w_src_, h_, src_stride_, bd_, false, &src_data_[0]);
+  PrepBuffers(rnd, w_dst_, h_, dst_stride_, bd_, true, &dst_data_[0]);
+}
+
+template <typename Pixel>
+void TestImage<Pixel>::Check() const {
+  const int num_pixels = dst_block_size();
+  const Pixel *ref_dst = &dst_data_[0];
+  const Pixel *tst_dst = &dst_data_[num_pixels];
+
+  // If memcmp returns 0, there's nothing to do.
+  if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) return;
+
+  // Otherwise, iterate through the buffer looking for differences, *ignoring
+  // the edges*
+  const int stride = dst_stride_;
+  for (int r = kVPad; r < h_ + kVPad; ++r) {
+    for (int c = kHPad; c < w_dst_ + kHPad; ++c) {
+      const int32_t ref_value = ref_dst[r * stride + c];
+      const int32_t tst_value = tst_dst[r * stride + c];
+
+      EXPECT_EQ(tst_value, ref_value)
+          << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad)
+          << ", superres_denom: " << superres_denom_ << ", height: " << h_
+          << ", src_width: " << w_src_ << ", dst_width: " << w_dst_
+          << ", x0: " << x0_;
+    }
+  }
+}
+
+template <typename Pixel>
+class ConvolveHorizRSTestBase : public ::testing::Test {
+ public:
+  ConvolveHorizRSTestBase() : image_(NULL) {}
+  virtual ~ConvolveHorizRSTestBase() {}
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+  // Implemented by subclasses (SetUp depends on the parameters passed
+  // in and RunOne depends on the function to be tested. These can't
+  // be templated for low/high bit depths because they have different
+  // numbers of parameters)
+  virtual void SetUp() = 0;
+  virtual void RunOne(bool ref) = 0;
+
+ protected:
+  void SetBitDepth(int bd) { bd_ = bd; }
+
+  void CorrectnessTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    for (int i = 0; i < kTestIters; ++i) {
+      for (int superres_denom = 9; superres_denom <= 16; superres_denom++) {
+        // Get a random height between 512 and 767
+        int height = rnd.Rand8() + 512;
+
+        // Get a random src width between 128 and 383
+        int width_src = rnd.Rand8() + 128;
+
+        // x0 is normally calculated by get_upscale_convolve_x0 in
+        // av1/common/resize.c.
+        // However, this test should work for any value of x0 between 0 and
+        // RS_SCALE_SUBPEL_MASK (inclusive), so we choose one at random.
+        int x0 = rnd.Rand16() % (RS_SCALE_SUBPEL_MASK + 1);
+
+        image_ =
+            new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+
+        Prep(&rnd);
+        RunOne(true);
+        RunOne(false);
+        image_->Check();
+
+        delete image_;
+      }
+    }
+  }
+
+  void SpeedTest() {
+    // Pick some specific parameters to test
+    int height = 767;
+    int width_src = 129;
+    int superres_denom = 13;
+    int x0 = RS_SCALE_SUBPEL_MASK >> 1;
+
+    image_ = new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    Prep(&rnd);
+
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < kPerfIters; ++i) RunOne(true);
+    aom_usec_timer_mark(&ref_timer);
+    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+    aom_usec_timer tst_timer;
+    aom_usec_timer_start(&tst_timer);
+    for (int i = 0; i < kPerfIters; ++i) RunOne(false);
+    aom_usec_timer_mark(&tst_timer);
+    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+    std::cout << "[          ] C time = " << ref_time / 1000
+              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+    EXPECT_GT(ref_time, tst_time)
+        << "Error: ConvolveHorizRSTest (Speed Test), SIMD slower than C.\n"
+        << "C time: " << ref_time << " us\n"
+        << "SIMD time: " << tst_time << " us\n";
+  }
+
+  void Prep(ACMRandom *rnd) {
+    assert(rnd);
+    image_->Initialize(rnd);
+  }
+
+  int bd_;
+  TestImage<Pixel> *image_;
+};
+
+typedef void (*LowBDConvolveHorizRsFunc)(const uint8_t *src, int src_stride,
+                                         uint8_t *dst, int dst_stride, int w,
+                                         int h, const int16_t *x_filters,
+                                         const int x0_qn, const int x_step_qn);
+
+// Test parameter list:
+//   <tst_fun_>
+typedef tuple<LowBDConvolveHorizRsFunc> LowBDParams;
+
+class LowBDConvolveHorizRSTest
+    : public ConvolveHorizRSTestBase<uint8_t>,
+      public ::testing::WithParamInterface<LowBDParams> {
+ public:
+  virtual ~LowBDConvolveHorizRSTest() {}
+
+  void SetUp() {
+    tst_fun_ = GET_PARAM(0);
+    const int bd = 8;
+    SetBitDepth(bd);
+  }
+
+  void RunOne(bool ref) {
+    const uint8_t *src = image_->GetSrcData(ref, false);
+    uint8_t *dst = image_->GetDstData(ref, false);
+    const int src_stride = image_->src_stride();
+    const int dst_stride = image_->dst_stride();
+    const int width_src = image_->src_width();
+    const int width_dst = image_->dst_width();
+    const int height = image_->height();
+    const int x0_qn = image_->x0();
+
+    const int32_t x_step_qn =
+        av1_get_upscale_convolve_step(width_src, width_dst);
+
+    if (ref) {
+      av1_convolve_horiz_rs_c(src, src_stride, dst, dst_stride, width_dst,
+                              height, &av1_resize_filter_normative[0][0], x0_qn,
+                              x_step_qn);
+    } else {
+      tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
+               &av1_resize_filter_normative[0][0], x0_qn, x_step_qn);
+    }
+  }
+
+ private:
+  LowBDConvolveHorizRsFunc tst_fun_;
+};
+
+TEST_P(LowBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(LowBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, LowBDConvolveHorizRSTest,
+                         ::testing::Values(av1_convolve_horiz_rs_sse4_1));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef void (*HighBDConvolveHorizRsFunc)(const uint16_t *src, int src_stride,
+                                          uint16_t *dst, int dst_stride, int w,
+                                          int h, const int16_t *x_filters,
+                                          const int x0_qn, const int x_step_qn,
+                                          int bd);
+
+// Test parameter list:
+//   <tst_fun_, bd>
+typedef tuple<HighBDConvolveHorizRsFunc, int> HighBDParams;
+
+class HighBDConvolveHorizRSTest
+    : public ConvolveHorizRSTestBase<uint16_t>,
+      public ::testing::WithParamInterface<HighBDParams> {
+ public:
+  virtual ~HighBDConvolveHorizRSTest() {}
+
+  void SetUp() {
+    tst_fun_ = GET_PARAM(0);
+    const int bd = GET_PARAM(1);
+    SetBitDepth(bd);
+  }
+
+  void RunOne(bool ref) {
+    const uint16_t *src = image_->GetSrcData(ref, false);
+    uint16_t *dst = image_->GetDstData(ref, false);
+    const int src_stride = image_->src_stride();
+    const int dst_stride = image_->dst_stride();
+    const int width_src = image_->src_width();
+    const int width_dst = image_->dst_width();
+    const int height = image_->height();
+    const int x0_qn = image_->x0();
+
+    const int32_t x_step_qn =
+        av1_get_upscale_convolve_step(width_src, width_dst);
+
+    if (ref) {
+      av1_highbd_convolve_horiz_rs_c(
+          src, src_stride, dst, dst_stride, width_dst, height,
+          &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd_);
+    } else {
+      tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
+               &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd_);
+    }
+  }
+
+ private:
+  HighBDConvolveHorizRsFunc tst_fun_;
+};
+
+const int kBDs[] = { 8, 10, 12 };
+
+TEST_P(HighBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(HighBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, HighBDConvolveHorizRSTest,
+    ::testing::Combine(::testing::Values(av1_highbd_convolve_horiz_rs_sse4_1),
+                       ::testing::ValuesIn(kBDs)));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+}  // namespace
diff --git a/libs/libaom/src/test/av1_inv_txfm1d_test.cc b/libs/libaom/src/test/av1_inv_txfm1d_test.cc
new file mode 100644
index 000000000..01d4a4d7f
--- /dev/null
+++ b/libs/libaom/src/test/av1_inv_txfm1d_test.cc
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "test/av1_txfm_test.h"
+#include "test/util.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+
+typedef TX_SIZE TxSize;
+
+using libaom_test::ACMRandom;
+using libaom_test::input_base;
+
+namespace {
+const int txfm_type_num = 2;
+const int txfm_size_ls[] = { 4, 8, 16, 32, 64 };
+
+const TxfmFunc fwd_txfm_func_ls[][txfm_type_num] = {
+  { av1_fdct4, av1_fadst4 },   { av1_fdct8, av1_fadst8 },
+  { av1_fdct16, av1_fadst16 }, { av1_fdct32, NULL },
+  { av1_fdct64, NULL },
+};
+
+const TxfmFunc inv_txfm_func_ls[][txfm_type_num] = {
+  { av1_idct4, av1_iadst4 },   { av1_idct8, av1_iadst8 },
+  { av1_idct16, av1_iadst16 }, { av1_idct32, NULL },
+  { av1_idct64, NULL },
+};
+
+// the maximum stage number of fwd/inv 1d dct/adst txfm is 12
+const int8_t cos_bit = 13;
+const int8_t range_bit[12] = { 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 };
+
+void reference_idct_1d_int(const int32_t *in, int32_t *out, int size) {
+  double input[64];
+  for (int i = 0; i < size; ++i) input[i] = in[i];
+
+  double output[64];
+  libaom_test::reference_idct_1d(input, output, size);
+
+  for (int i = 0; i < size; ++i) {
+    ASSERT_GE(output[i], INT32_MIN);
+    ASSERT_LE(output[i], INT32_MAX);
+    out[i] = static_cast<int32_t>(round(output[i]));
+  }
+}
+
+void random_matrix(int32_t *dst, int len, ACMRandom *rnd) {
+  const int bits = 16;
+  const int maxVal = (1 << (bits - 1)) - 1;
+  const int minVal = -(1 << (bits - 1));
+  for (int i = 0; i < len; ++i) {
+    if (rnd->Rand8() % 10)
+      dst[i] = minVal + rnd->Rand16() % (1 << bits);
+    else
+      dst[i] = rnd->Rand8() % 2 ? minVal : maxVal;
+  }
+}
+
+TEST(av1_inv_txfm1d, InvAccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 20000;
+  const int max_error[] = { 6, 10, 19, 31, 40 };
+  ASSERT_EQ(NELEMENTS(max_error), TX_SIZES);
+  ASSERT_EQ(NELEMENTS(inv_txfm_func_ls), TX_SIZES);
+  for (int k = 0; k < count_test_block; ++k) {
+    // choose a random transform to test
+    const TxSize tx_size = static_cast<TxSize>(rnd.Rand8() % TX_SIZES);
+    const int tx_size_pix = txfm_size_ls[tx_size];
+    const TxfmFunc inv_txfm_func = inv_txfm_func_ls[tx_size][0];
+
+    int32_t input[64];
+    random_matrix(input, tx_size_pix, &rnd);
+
+    // 64x64 transform assumes last 32 values are zero.
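+    // (The bitstream carries at most the lowest 32 frequency coefficients of
+    // a 64-point transform; the upper half is always zero, so the inverse
+    // transform is exercised here under the same assumption.)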
+    memset(input + 32, 0, 32 * sizeof(input[0]));
+
+    int32_t ref_output[64];
+    memset(ref_output, 0, sizeof(ref_output));
+    reference_idct_1d_int(input, ref_output, tx_size_pix);
+
+    int32_t output[64];
+    memset(output, 0, sizeof(output));
+    inv_txfm_func(input, output, cos_bit, range_bit);
+
+    for (int i = 0; i < tx_size_pix; ++i) {
+      EXPECT_LE(abs(output[i] - ref_output[i]), max_error[tx_size])
+          << "tx_size = " << tx_size << ", i = " << i
+          << ", output[i] = " << output[i]
+          << ", ref_output[i] = " << ref_output[i];
+    }
+  }
+}
+
+static INLINE int get_max_bit(int x) {
+  int max_bit = -1;
+  while (x) {
+    x = x >> 1;
+    max_bit++;
+  }
+  return max_bit;
+}
+
+TEST(av1_inv_txfm1d, get_max_bit) {
+  int max_bit = get_max_bit(8);
+  EXPECT_EQ(max_bit, 3);
+}
+
+TEST(av1_inv_txfm1d, round_trip) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int si = 0; si < NELEMENTS(fwd_txfm_func_ls); ++si) {
+    int txfm_size = txfm_size_ls[si];
+
+    for (int ti = 0; ti < txfm_type_num; ++ti) {
+      TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[si][ti];
+      TxfmFunc inv_txfm_func = inv_txfm_func_ls[si][ti];
+      int max_error = 2;
+
+      if (!fwd_txfm_func) continue;
+
+      const int count_test_block = 5000;
+      for (int ci = 0; ci < count_test_block; ++ci) {
+        int32_t input[64];
+        int32_t output[64];
+        int32_t round_trip_output[64];
+
+        ASSERT_LE(txfm_size, NELEMENTS(input));
+
+        for (int ni = 0; ni < txfm_size; ++ni) {
+          input[ni] = rnd.Rand16() % input_base - rnd.Rand16() % input_base;
+        }
+
+        fwd_txfm_func(input, output, cos_bit, range_bit);
+        inv_txfm_func(output, round_trip_output, cos_bit, range_bit);
+
+        for (int ni = 0; ni < txfm_size; ++ni) {
+          int node_err =
+              abs(input[ni] - round_shift(round_trip_output[ni],
+                                          get_max_bit(txfm_size) - 1));
+          EXPECT_LE(node_err, max_error);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
diff --git a/libs/libaom/src/test/av1_inv_txfm2d_test.cc b/libs/libaom/src/test/av1_inv_txfm2d_test.cc
new file mode 100644
index 000000000..eacdf85d4
--- /dev/null
+++ b/libs/libaom/src/test/av1_inv_txfm2d_test.cc
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tuple>
+#include <vector>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/scan.h"
+#include "test/acm_random.h"
+#include "test/av1_txfm_test.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::bd;
+using libaom_test::compute_avg_abs_error;
+using libaom_test::input_base;
+using libaom_test::InvTxfm2dFunc;
+using libaom_test::LbdInvTxfm2dFunc;
+
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+
+using std::vector;
+
+typedef TX_TYPE TxType;
+typedef TX_SIZE TxSize;
+
+namespace {
+
+static const char *tx_type_name[] = {
+  "DCT_DCT",
+  "ADST_DCT",
+  "DCT_ADST",
+  "ADST_ADST",
+  "FLIPADST_DCT",
+  "DCT_FLIPADST",
+  "FLIPADST_FLIPADST",
+  "ADST_FLIPADST",
+  "FLIPADST_ADST",
+  "IDTX",
+  "V_DCT",
+  "H_DCT",
+  "V_ADST",
+  "H_ADST",
+  "V_FLIPADST",
+  "H_FLIPADST",
+};
+
+// AV1InvTxfm2dParam argument list:
+//   tx_type_, tx_size_, max_error_, max_avg_error_
+typedef std::tuple<TxType, TxSize, int, double> AV1InvTxfm2dParam;
+
+class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
+ public:
+  virtual void SetUp() {
+    tx_type_ = GET_PARAM(0);
+    tx_size_ = GET_PARAM(1);
+    max_error_ = GET_PARAM(2);
+    max_avg_error_ = GET_PARAM(3);
+  }
+
+  void RunRoundtripCheck() {
+    int tx_w = tx_size_wide[tx_size_];
+    int tx_h = tx_size_high[tx_size_];
+    int txfm2d_size = tx_w * tx_h;
+    const FwdTxfm2dFunc fwd_txfm_func = libaom_test::fwd_txfm_func_ls[tx_size_];
+    const InvTxfm2dFunc inv_txfm_func = libaom_test::inv_txfm_func_ls[tx_size_];
+    double avg_abs_error = 0;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+    const int count = 500;
+
+    for (int ci = 0; ci < count; ci++) {
+      DECLARE_ALIGNED(16, int16_t, input[64 * 64]) = { 0 };
+      ASSERT_LE(txfm2d_size, NELEMENTS(input));
+
+      for (int ni = 0; ni < txfm2d_size; ++ni) {
+        if (ci == 0) {
+          int extreme_input = input_base - 1;
+          input[ni] = extreme_input;  // extreme case
+        } else {
+          input[ni] = rnd.Rand16() % input_base;
+        }
+      }
+
+      DECLARE_ALIGNED(16, uint16_t, expected[64 * 64]) = { 0 };
+      ASSERT_LE(txfm2d_size, NELEMENTS(expected));
+      if (TxfmUsesApproximation()) {
+        // Compare reference forward HT + inverse HT vs forward HT + inverse HT.
+        double ref_input[64 * 64];
+        ASSERT_LE(txfm2d_size, NELEMENTS(ref_input));
+        for (int ni = 0; ni < txfm2d_size; ++ni) {
+          ref_input[ni] = input[ni];
+        }
+        double ref_coeffs[64 * 64] = { 0 };
+        ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs));
+        ASSERT_EQ(tx_type_, static_cast<TxType>(DCT_DCT));
+        libaom_test::reference_hybrid_2d(ref_input, ref_coeffs, tx_type_,
+                                         tx_size_);
+        DECLARE_ALIGNED(16, int32_t, ref_coeffs_int[64 * 64]) = { 0 };
+        ASSERT_LE(txfm2d_size, NELEMENTS(ref_coeffs_int));
+        for (int ni = 0; ni < txfm2d_size; ++ni) {
+          ref_coeffs_int[ni] = (int32_t)round(ref_coeffs[ni]);
+        }
+        inv_txfm_func(ref_coeffs_int, expected, tx_w, tx_type_, bd);
+      } else {
+        // Compare original input vs forward HT + inverse HT.
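+        // (Smaller transforms round-trip almost exactly, so the check is made
+        // directly against the original input; 64-point transforms discard
+        // half their coefficients and are handled by the approximation branch
+        // above.)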
+        for (int ni = 0; ni < txfm2d_size; ++ni) {
+          expected[ni] = input[ni];
+        }
+      }
+
+      DECLARE_ALIGNED(16, int32_t, coeffs[64 * 64]) = { 0 };
+      ASSERT_LE(txfm2d_size, NELEMENTS(coeffs));
+      fwd_txfm_func(input, coeffs, tx_w, tx_type_, bd);
+
+      DECLARE_ALIGNED(16, uint16_t, actual[64 * 64]) = { 0 };
+      ASSERT_LE(txfm2d_size, NELEMENTS(actual));
+      inv_txfm_func(coeffs, actual, tx_w, tx_type_, bd);
+
+      double actual_max_error = 0;
+      for (int ni = 0; ni < txfm2d_size; ++ni) {
+        const double this_error = abs(expected[ni] - actual[ni]);
+        actual_max_error = AOMMAX(actual_max_error, this_error);
+      }
+      EXPECT_GE(max_error_, actual_max_error)
+          << " tx_w: " << tx_w << " tx_h " << tx_h << " tx_type: " << tx_type_;
+      if (actual_max_error > max_error_) {  // exit early.
+        break;
+      }
+      avg_abs_error += compute_avg_abs_error<uint16_t, uint16_t>(
+          expected, actual, txfm2d_size);
+    }
+
+    avg_abs_error /= count;
+    EXPECT_GE(max_avg_error_, avg_abs_error)
+        << " tx_w: " << tx_w << " tx_h " << tx_h << " tx_type: " << tx_type_;
+  }
+
+ private:
+  bool TxfmUsesApproximation() {
+    if (tx_size_wide[tx_size_] == 64 || tx_size_high[tx_size_] == 64) {
+      return true;
+    }
+    return false;
+  }
+
+  int max_error_;
+  double max_avg_error_;
+  TxType tx_type_;
+  TxSize tx_size_;
+};
+
+static int max_error_ls[TX_SIZES_ALL] = {
+  2,  // 4x4 transform
+  2,  // 8x8 transform
+  2,  // 16x16 transform
+  4,  // 32x32 transform
+  3,  // 64x64 transform
+  2,  // 4x8 transform
+  2,  // 8x4 transform
+  2,  // 8x16 transform
+  2,  // 16x8 transform
+  3,  // 16x32 transform
+  3,  // 32x16 transform
+  5,  // 32x64 transform
+  5,  // 64x32 transform
+  2,  // 4x16 transform
+  2,  // 16x4 transform
+  2,  // 8x32 transform
+  2,  // 32x8 transform
+  3,  // 16x64 transform
+  3,  // 64x16 transform
+};
+
+static double avg_error_ls[TX_SIZES_ALL] = {
+  0.002,  // 4x4 transform
+  0.05,   // 8x8 transform
+  0.07,   // 16x16 transform
+  0.4,    // 32x32 transform
+  0.3,    // 64x64 transform
+  0.02,   // 4x8 transform
+  0.02,   // 8x4 transform
+  0.04,   // 8x16 transform
+  0.07,   // 16x8 transform
+  0.4,    // 16x32 transform
+  0.5,    // 32x16 transform
+  0.38,   // 32x64 transform
+  0.39,   // 64x32 transform
+  0.2,    // 4x16 transform
+  0.2,    // 16x4 transform
+  0.2,    // 8x32 transform
+  0.2,    // 32x8 transform
+  0.38,   // 16x64 transform
+  0.38,   // 64x16 transform
+};
+
+vector<AV1InvTxfm2dParam> GetInvTxfm2dParamList() {
+  vector<AV1InvTxfm2dParam> param_list;
+  for (int s = 0; s < TX_SIZES; ++s) {
+    const int max_error = max_error_ls[s];
+    const double avg_error = avg_error_ls[s];
+    for (int t = 0; t < TX_TYPES; ++t) {
+      const TxType tx_type = static_cast<TxType>(t);
+      const TxSize tx_size = static_cast<TxSize>(s);
+      if (libaom_test::IsTxSizeTypeValid(tx_size, tx_type)) {
+        param_list.push_back(
+            AV1InvTxfm2dParam(tx_type, tx_size, max_error, avg_error));
+      }
+    }
+  }
+  return param_list;
+}
+
+INSTANTIATE_TEST_SUITE_P(C, AV1InvTxfm2d,
+                         ::testing::ValuesIn(GetInvTxfm2dParamList()));
+
+TEST_P(AV1InvTxfm2d, RunRoundtripCheck) { RunRoundtripCheck(); }
+
+TEST(AV1InvTxfm2d, CfgTest) {
+  for (int bd_idx = 0; bd_idx < BD_NUM; ++bd_idx) {
+    int bd = libaom_test::bd_arr[bd_idx];
+    int8_t low_range = libaom_test::low_range_arr[bd_idx];
+    int8_t high_range = libaom_test::high_range_arr[bd_idx];
+    for (int tx_size = 0; tx_size < TX_SIZES_ALL; ++tx_size) {
+      for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+        if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(tx_size),
+                                           static_cast<TxType>(tx_type)) ==
+            false) {
+          continue;
+        }
+        TXFM_2D_FLIP_CFG cfg;
+        av1_get_inv_txfm_cfg(static_cast<TxType>(tx_type),
+                             static_cast<TxSize>(tx_size), &cfg);
+        int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+        int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+        av1_gen_inv_stage_range(stage_range_col, stage_range_row, &cfg,
+                                static_cast<TxSize>(tx_size), bd);
+        libaom_test::txfm_stage_range_check(stage_range_col, cfg.stage_num_col,
+                                            cfg.cos_bit_col, low_range,
+                                            high_range);
+        libaom_test::txfm_stage_range_check(stage_range_row, cfg.stage_num_row,
+                                            cfg.cos_bit_row, low_range,
+                                            high_range);
+      }
+    }
+  }
+}
+
+typedef std::tuple<LbdInvTxfm2dFunc> AV1LbdInvTxfm2dParam;
+class AV1LbdInvTxfm2d : public ::testing::TestWithParam<AV1LbdInvTxfm2dParam> {
+ public:
+  virtual void SetUp() { target_func_ = GET_PARAM(0); }
+  void RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size, int run_times,
+                           int gt_int16 = 0);
+
+ private:
+  LbdInvTxfm2dFunc target_func_;
+};
+
+void AV1LbdInvTxfm2d::RunAV1InvTxfm2dTest(TxType tx_type, TxSize tx_size,
+                                          int run_times, int gt_int16) {
+  FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size];
+  InvTxfm2dFunc ref_func_ = libaom_test::inv_txfm_func_ls[tx_size];
+  if (fwd_func_ == NULL || ref_func_ == NULL || target_func_ == NULL) {
+    return;
+  }
+  const int bd = 8;
+  const int BLK_WIDTH = 64;
+  const int BLK_SIZE = BLK_WIDTH * BLK_WIDTH;
+  DECLARE_ALIGNED(16, int16_t, input[BLK_SIZE]) = { 0 };
+  DECLARE_ALIGNED(32, int32_t, inv_input[BLK_SIZE]) = { 0 };
+  DECLARE_ALIGNED(16, uint8_t, output[BLK_SIZE]) = { 0 };
+  DECLARE_ALIGNED(16, uint16_t, ref_output[BLK_SIZE]) = { 0 };
+  int stride = BLK_WIDTH;
+  int rows = tx_size_high[tx_size];
+  int cols = tx_size_wide[tx_size];
+  const int rows_nonezero = AOMMIN(32, rows);
+  const int cols_nonezero = AOMMIN(32, cols);
+  run_times /= (rows * cols);
+  run_times = AOMMAX(1, run_times);
+  const SCAN_ORDER *scan_order = get_default_scan(tx_size, tx_type);
+  const int16_t *scan = scan_order->scan;
+  const int16_t eobmax = rows_nonezero * cols_nonezero;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int randTimes = run_times == 1 ? (eobmax + 500) : 1;
+
+  for (int cnt = 0; cnt < randTimes; ++cnt) {
+    const int16_t max_in = (1 << (bd)) - 1;
+    for (int r = 0; r < BLK_WIDTH; ++r) {
+      for (int c = 0; c < BLK_WIDTH; ++c) {
+        input[r * cols + c] = (cnt == 0) ? max_in : rnd.Rand8Extremes();
+        output[r * stride + c] = (cnt == 0) ? 128 : rnd.Rand8();
+        ref_output[r * stride + c] = output[r * stride + c];
+      }
+    }
+    fwd_func_(input, inv_input, stride, tx_type, bd);
+
+    // produce eob input by setting high freq coeffs to zero
+    const int eob = AOMMIN(cnt + 1, eobmax);
+    for (int i = eob; i < eobmax; i++) {
+      inv_input[scan[i]] = 0;
+    }
+    if (gt_int16) {
+      inv_input[scan[eob - 1]] = ((int32_t)INT16_MAX * 100 / 141);
+    }
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      ref_func_(inv_input, ref_output, stride, tx_type, bd);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(inv_input, output, stride, tx_type, tx_size, eob);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      printf("txfm[%d] %3dx%-3d:%7.2f/%7.2fns", tx_type, cols, rows, time1,
+             time2);
+      printf("(%3.2f)\n", time1 / time2);
+    }
+    for (int r = 0; r < rows; ++r) {
+      for (int c = 0; c < cols; ++c) {
+        uint8_t ref_value = static_cast<uint8_t>(ref_output[r * stride + c]);
+        if (ref_value != output[r * stride + c]) {
+          printf(" ");
+        }
+        ASSERT_EQ(ref_value, output[r * stride + c])
+            << "[" << r << "," << c << "] " << cnt
+            << " tx_size: " << static_cast<int>(tx_size)
+            << " tx_type: " << tx_type_name[tx_type] << " eob " << eob;
+      }
+    }
+  }
+}
+
+TEST_P(AV1LbdInvTxfm2d, match) {
+  for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+    for (int i = 0; i < (int)TX_TYPES; ++i) {
+      if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(j),
+                                         static_cast<TxType>(i))) {
+        RunAV1InvTxfm2dTest(static_cast<TxType>(i), static_cast<TxSize>(j), 1);
+      }
+    }
+  }
+}
+
+TEST_P(AV1LbdInvTxfm2d, gt_int16) {
+  static const TxType types[] = { DCT_DCT, ADST_DCT, FLIPADST_DCT, IDTX,
+                                  V_DCT,   H_DCT,    H_ADST,       H_FLIPADST };
+  for (int j = 0; j < (int)(TX_SIZES_ALL); ++j) {
+    const TxSize sz = static_cast<TxSize>(j);
+    for (uint8_t i = 0; i < sizeof(types) / sizeof(types[0]); ++i) {
+      const TxType tp = types[i];
+      if (libaom_test::IsTxSizeTypeValid(sz, tp)) {
+        RunAV1InvTxfm2dTest(tp, sz, 1, 1);
+      }
+    }
+  }
+}
+
+TEST_P(AV1LbdInvTxfm2d, DISABLED_Speed) {
+  for (int j = 1; j < (int)(TX_SIZES_ALL); ++j) {
+    for (int i = 0; i < (int)TX_TYPES; ++i) {
+      if (libaom_test::IsTxSizeTypeValid(static_cast<TxSize>(j),
+                                         static_cast<TxType>(i))) {
+        RunAV1InvTxfm2dTest(static_cast<TxType>(i), static_cast<TxSize>(j),
+                            10000000);
+      }
+    }
+  }
+}
+
+#if HAVE_SSSE3
+#if defined(_MSC_VER) || defined(__SSSE3__)
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+INSTANTIATE_TEST_SUITE_P(SSSE3, AV1LbdInvTxfm2d,
+                         ::testing::Values(av1_lowbd_inv_txfm2d_add_ssse3));
+#endif  // _MSC_VER || __SSSE3__
+#endif  // HAVE_SSSE3
+
+#if HAVE_AVX2
+extern "C" void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input,
+                                              uint8_t *output, int stride,
+                                              TxType tx_type, TxSize tx_size,
+                                              int eob);
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1LbdInvTxfm2d,
+                         ::testing::Values(av1_lowbd_inv_txfm2d_add_avx2));
+#endif  // HAVE_AVX2
+
+// TODO(yunqing): Re-enable this unit test for NEON version after the functions
+// are fixed.
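+// (As with the AVX2 variant above, the NEON kernel is forward-declared here
+// rather than pulled in from a header; presumably these low-bitdepth helpers
+// are not exposed through a public header, hence the extern "C" prototypes.)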
+#if HAVE_NEON
+extern "C" void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input,
+                                              uint8_t *output, int stride,
+                                              TX_TYPE tx_type, TX_SIZE tx_size,
+                                              int eob);
+
+INSTANTIATE_TEST_SUITE_P(NEON, AV1LbdInvTxfm2d,
+                         ::testing::Values(av1_lowbd_inv_txfm2d_add_neon));
+#endif  // HAVE_NEON
+
+}  // namespace
diff --git a/libs/libaom/src/test/av1_nn_predict_test.cc b/libs/libaom/src/test/av1_nn_predict_test.cc
new file mode 100644
index 000000000..c03cba8c5
--- /dev/null
+++ b/libs/libaom/src/test/av1_nn_predict_test.cc
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/ml.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+#include "test/util.h"
+#include "test/register_state_check.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+
+namespace {
+typedef void (*NnPredict_Func)(const float *const input_nodes,
+                               const NN_CONFIG *const nn_config,
+                               int reduce_prec, float *const output);
+
+typedef std::tuple<NnPredict_Func> NnPredictTestParam;
+
+const float epsilon = 1e-3f;  // Error threshold for functional equivalence
+
+class NnPredictTest : public ::testing::TestWithParam<NnPredictTestParam> {
+ public:
+  virtual void SetUp() {
+    const int MAX_NODES2 = NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
+    // Allocate two massive buffers on the heap for edge weights and node bias
+    // Then set-up the double-dimension arrays pointing into the big buffers
+    weights_buf = (float *)aom_malloc(MAX_NODES2 * (NN_MAX_HIDDEN_LAYERS + 1) *
+                                      sizeof(*weights_buf));
+    bias_buf =
+        (float *)aom_malloc(NN_MAX_NODES_PER_LAYER *
+                            (NN_MAX_HIDDEN_LAYERS + 1) * sizeof(*bias_buf));
+    ASSERT_NE(weights_buf, nullptr);
+    ASSERT_NE(bias_buf, nullptr);
+    for (int i = 0; i < NN_MAX_HIDDEN_LAYERS + 1; i++) {
+      weights[i] = &weights_buf[i * MAX_NODES2];
+      bias[i] = &bias_buf[i * NN_MAX_NODES_PER_LAYER];
+    }
+    target_func_ = GET_PARAM(0);
+  }
+  virtual void TearDown() {
+    aom_free(weights_buf);
+    aom_free(bias_buf);
+  }
+  void RunNnPredictTest(const NN_CONFIG *const shape);
+  void RunNnPredictSpeedTest(const NN_CONFIG *const shape, const int run_times);
+  void RunNnPredictTest_all(const NN_CONFIG *const shapes,
+                            const int num_shapes);
+  void RunNnPredictSpeedTest_all(const NN_CONFIG *const shapes,
+                                 const int num_shapes, const int run_times);
+
+ private:
+  NnPredict_Func target_func_;
+  libaom_test::ACMRandom rng_;
+  float *weights[NN_MAX_HIDDEN_LAYERS + 1] = { 0 };
+  float *bias[NN_MAX_HIDDEN_LAYERS + 1] = { 0 };
+  float *weights_buf = nullptr, *bias_buf = nullptr;
+};
+
+void NnPredictTest::RunNnPredictTest(const NN_CONFIG *const shape) {
+  libaom_test::ClearSystemState();
+  float inputs[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 };
+
+  NN_CONFIG nn_config;
+  memcpy(&nn_config, shape, sizeof(nn_config));
+
+  char shape_str[32] = { 0 };
+  snprintf(shape_str, sizeof(shape_str), "%d", shape->num_inputs);
+  for (int layer = 0; layer < shape->num_hidden_layers; layer++)
+    snprintf(&shape_str[strlen(shape_str)],
+             sizeof(shape_str) - strlen(shape_str), "x%d",
+             shape->num_hidden_nodes[layer]);
+  snprintf(&shape_str[strlen(shape_str)], sizeof(shape_str) - strlen(shape_str),
+           "x%d", shape->num_outputs);
+
+  for (int i = 0; i < NN_MAX_HIDDEN_LAYERS + 1; i++) {
+    nn_config.weights[i] = weights[i];
+    nn_config.bias[i] = bias[i];
+  }
+
+  for (int iter = 0; iter < 10000 && !HasFatalFailure(); ++iter) {
+    for (int node = 0; node < shape->num_inputs; node++) {
+      inputs[node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+    }
+    for (int layer = 0; layer < shape->num_hidden_layers; layer++) {
+      for (int node = 0; node < NN_MAX_NODES_PER_LAYER; node++) {
+        bias[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+      }
+      for (int node = 0; node < NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
+           node++) {
+        weights[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+      }
+    }
+    // Now the outputs:
+    int layer = shape->num_hidden_layers;
+    for (int node = 0; node < NN_MAX_NODES_PER_LAYER; node++) {
+      bias[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+    }
+    for (int node = 0; node < NN_MAX_NODES_PER_LAYER * NN_MAX_NODES_PER_LAYER;
+         node++) {
+      weights[layer][node] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 31);
+    }
+
+    av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
+    target_func_(inputs, &nn_config, 0, outputs_test);
+    libaom_test::ClearSystemState();
+
+    for (int node = 0; node < shape->num_outputs; node++) {
+      if (outputs_ref[node] < epsilon) {
+        ASSERT_LE(outputs_test[node], epsilon)
+            << "Reference output was near-zero, test output was not ("
+            << shape_str << ")";
+      } else {
+        const float error = outputs_ref[node] - outputs_test[node];
+        const float relative_error = fabsf(error / outputs_ref[node]);
+        ASSERT_LE(relative_error, epsilon)
+            << "Excessive relative error between reference and test ("
+            << shape_str << ")";
+      }
+    }
+  }
+}
+
+void NnPredictTest::RunNnPredictSpeedTest(const NN_CONFIG *const shape,
+                                          const int run_times) {
+  libaom_test::ClearSystemState();
+  float inputs[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 };
+  float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 };
+
+  NN_CONFIG nn_config;
+  memcpy(&nn_config, shape, sizeof(nn_config));
+
+  for (int i = 0; i < NN_MAX_HIDDEN_LAYERS; i++) {
+    nn_config.weights[i] = weights[i];
+    nn_config.bias[i] = bias[i];
+  }
+  // Don't bother actually changing the values for inputs/weights/bias: it
+  // shouldn't make any difference for a speed test.
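+  // (The evaluation performs the same multiply-accumulate work regardless of
+  // the buffer contents, so zero-filled inputs time the same code path.)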
+
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
+  }
+  aom_usec_timer_mark(&timer);
+  const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    target_func_(inputs, &nn_config, 0, outputs_test);
+  }
+  aom_usec_timer_mark(&timer);
+  libaom_test::ClearSystemState();
+  const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+  printf("%d", shape->num_inputs);
+  for (int layer = 0; layer < shape->num_hidden_layers; layer++)
+    printf("x%d", shape->num_hidden_nodes[layer]);
+  printf("x%d: ", shape->num_outputs);
+  printf("%7.2f/%7.2fns (%3.2f)\n", time1, time2, time1 / time2);
+}
+
+// This is all the neural network shapes observed executed in a few different
+// runs of the encoder. It also conveniently covers all the kernels
+// implemented.
+static const NN_CONFIG shapes[] = {
+  { 10, 16, 1, { 64 }, { 0 }, { 0 } }, { 12, 1, 1, { 12 }, { 0 }, { 0 } },
+  { 12, 1, 1, { 24 }, { 0 }, { 0 } },  { 12, 1, 1, { 32 }, { 0 }, { 0 } },
+  { 18, 4, 1, { 24 }, { 0 }, { 0 } },  { 18, 4, 1, { 32 }, { 0 }, { 0 } },
+  { 4, 1, 1, { 16 }, { 0 }, { 0 } },   { 8, 1, 1, { 16 }, { 0 }, { 0 } },
+  { 8, 4, 1, { 16 }, { 0 }, { 0 } },   { 8, 1, 1, { 24 }, { 0 }, { 0 } },
+  { 8, 1, 1, { 32 }, { 0 }, { 0 } },   { 8, 1, 1, { 64 }, { 0 }, { 0 } },
+  { 9, 3, 1, { 32 }, { 0 }, { 0 } },   { 4, 4, 1, { 8 }, { 0 }, { 0 } },
+};
+
+void NnPredictTest::RunNnPredictTest_all(const NN_CONFIG *const shapes,
+                                         const int num_shapes) {
+  for (int i = 0; i < num_shapes; i++) RunNnPredictTest(&shapes[i]);
+}
+
+void NnPredictTest::RunNnPredictSpeedTest_all(const NN_CONFIG *const shapes,
+                                              const int num_shapes,
+                                              const int run_times) {
+  for (int i = 0; i < num_shapes; i++)
+    NnPredictTest::RunNnPredictSpeedTest(&shapes[i], run_times);
+}
+
+TEST_P(NnPredictTest, RandomValues) {
+  RunNnPredictTest_all(shapes, sizeof(shapes) / sizeof(*shapes));
+}
+
+TEST_P(NnPredictTest, DISABLED_Speed) {
+  RunNnPredictSpeedTest_all(shapes, sizeof(shapes) / sizeof(*shapes), 10000000);
+}
+
+#if HAVE_SSE3
+INSTANTIATE_TEST_SUITE_P(SSE3, NnPredictTest,
+                         ::testing::Values(av1_nn_predict_sse3));
+#endif
+
+}  // namespace
diff --git a/libs/libaom/src/test/av1_quantize_test.cc b/libs/libaom/src/test/av1_quantize_test.cc
new file mode 100644
index 000000000..39a3c33d8
--- /dev/null
+++ b/libs/libaom/src/test/av1_quantize_test.cc
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdlib.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "av1/common/scan.h"
+
+namespace {
+
+typedef void (*QuantizeFpFunc)(
+    const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, int log_scale);
+
+struct QuantizeFuncParams {
+  QuantizeFuncParams(QuantizeFpFunc qF = NULL, QuantizeFpFunc qRefF = NULL,
+                     int count = 16)
+      : qFunc(qF), qFuncRef(qRefF), coeffCount(count) {}
+  QuantizeFpFunc qFunc;
+  QuantizeFpFunc qFuncRef;
+  int coeffCount;
+};
+
+using libaom_test::ACMRandom;
+
+const int numTests = 1000;
+const int maxSize = 1024;
+const int roundFactorRange = 127;
+const int dequantRange = 32768;
+const int coeffRange = (1 << 20) - 1;
+
+class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
+ public:
+  void RunQuantizeTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
+    DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
+    uint16_t eob;
+    uint16_t ref_eob;
+    int err_count_total = 0;
+    int first_failure = -1;
+    int count = params_.coeffCount;
+    const TX_SIZE txSize = getTxSize(count);
+    int log_scale = (txSize == TX_32X32);
+    QuantizeFpFunc quanFunc = params_.qFunc;
+    QuantizeFpFunc quanFuncRef = params_.qFuncRef;
+
+    const SCAN_ORDER scanOrder = av1_default_scan_orders[txSize];
+    for (int i = 0; i < numTests; i++) {
+      int err_count = 0;
+      ref_eob = eob = UINT16_MAX;
+      for (int j = 0; j < count; j++) {
+        coeff_ptr[j] = rnd(coeffRange);
+      }
+
+      for (int j = 0; j < 2; j++) {
+        zbin_ptr[j] = rnd.Rand16();
+        quant_shift_ptr[j] = rnd.Rand16();
+        // int16_t positive
+        dequant_ptr[j] = abs(rnd(dequantRange));
+        quant_ptr[j] = static_cast<int16_t>((1 << 16) / dequant_ptr[j]);
+        round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
+      }
+      for (int j = 2; j < 8; ++j) {
+        zbin_ptr[j] = zbin_ptr[1];
+        quant_shift_ptr[j] = quant_shift_ptr[1];
+        dequant_ptr[j] = dequant_ptr[1];
+        quant_ptr[j] = quant_ptr[1];
+        round_ptr[j] = round_ptr[1];
+      }
+      quanFuncRef(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                  quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
+                  &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
+
+      ASM_REGISTER_STATE_CHECK(
+          quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                   quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
+                   scanOrder.scan, scanOrder.iscan, log_scale));
+
+      for (int j = 0; j < count; ++j) {
+        err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
+                     (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+        ASSERT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j])
+            << "qcoeff error: i = " << i << " j = " << j << "\n";
+        EXPECT_EQ(ref_dqcoeff_ptr[j], dqcoeff_ptr[j])
+            << "dqcoeff error: i = " << i << " j = " << j << "\n";
+      }
+      EXPECT_EQ(ref_eob, eob) << "eob error: "
+                              << "i = " << i << "\n";
+      err_count += (ref_eob != eob);
+      if (err_count && !err_count_total) {
+        first_failure = i;
+      }
+      err_count_total += err_count;
+    }
+    EXPECT_EQ(0, err_count_total)
+        << "Error: Quantization Test, C output doesn't match SIMD output. "
+        << "First failed at test case " << first_failure;
+  }
+
+  void RunEobTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, zbin_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, round_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_ptr[8]);
+    DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[8]);
+    DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+    DECLARE_ALIGNED(16, int16_t, dequant_ptr[8]);
+    uint16_t eob;
+    uint16_t ref_eob;
+    int count = params_.coeffCount;
+    const TX_SIZE txSize = getTxSize(count);
+    int log_scale = (txSize == TX_32X32);
+    QuantizeFpFunc quanFunc = params_.qFunc;
+    QuantizeFpFunc quanFuncRef = params_.qFuncRef;
+    const SCAN_ORDER scanOrder = av1_default_scan_orders[txSize];
+
+    for (int i = 0; i < numTests; i++) {
+      ref_eob = eob = UINT16_MAX;
+      for (int j = 0; j < count; j++) {
+        coeff_ptr[j] = 0;
+      }
+
+      coeff_ptr[rnd(count)] = rnd(coeffRange);
+      coeff_ptr[rnd(count)] = rnd(coeffRange);
+      coeff_ptr[rnd(count)] = rnd(coeffRange);
+
+      for (int j = 0; j < 2; j++) {
+        zbin_ptr[j] = rnd.Rand16();
+        quant_shift_ptr[j] = rnd.Rand16();
+        // int16_t positive
+        dequant_ptr[j] = abs(rnd(dequantRange));
+        quant_ptr[j] = (1 << 16) / dequant_ptr[j];
+        round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
+      }
+      for (int j = 2; j < 8; ++j) {
+        zbin_ptr[j] = zbin_ptr[1];
+        quant_shift_ptr[j] = quant_shift_ptr[1];
+        dequant_ptr[j] = dequant_ptr[1];
+        quant_ptr[j] = quant_ptr[1];
+        round_ptr[j] = round_ptr[1];
+      }
+
+      quanFuncRef(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                  quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
+                  &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
+
+      ASM_REGISTER_STATE_CHECK(
+          quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                   quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
+                   scanOrder.scan, scanOrder.iscan, log_scale));
+      EXPECT_EQ(ref_eob, eob) << "eob error: "
+                              << "i = " << i << "\n";
+    }
+  }
+
+  virtual void SetUp() { params_ = GetParam(); }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+  virtual ~AV1QuantizeTest() {}
+
+ private:
+  TX_SIZE getTxSize(int count) {
+    switch (count) {
+      case 16: return TX_4X4;
+      case 64: return TX_8X8;
+      case 256: return TX_16X16;
+      case 1024: return TX_32X32;
+      default: return TX_4X4;
+    }
+  }
+
+  QuantizeFuncParams params_;
+};
+
+TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); }
+TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
+
+#if HAVE_SSE4_1
+const QuantizeFuncParams qfps[4] = {
+  QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+                     16),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+                     64),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+                     256),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
+                     1024),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1QuantizeTest, ::testing::ValuesIn(qfps));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const QuantizeFuncParams qfps_avx2[4] = {
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     16),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     64),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     256),
+  QuantizeFuncParams(&av1_highbd_quantize_fp_avx2, &av1_highbd_quantize_fp_c,
+                     1024),
};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, AV1QuantizeTest, ::testing::ValuesIn(qfps_avx2));
+#endif  // HAVE_AVX2
+
+}  // namespace
diff --git a/libs/libaom/src/test/av1_round_shift_array_test.cc b/libs/libaom/src/test/av1_round_shift_array_test.cc
new file mode 100644
index 000000000..993fa9f19
--- /dev/null
+++ b/libs/libaom/src/test/av1_round_shift_array_test.cc
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1CompRoundShift {
+
+typedef void (*comp_round_shift_array_func)(int32_t *arr, int size, int bit);
+
+#if HAVE_SSE4_1 || HAVE_NEON
+const int kValidBitCheck[] = {
+  -4, -3, -2, -1, 0, 1, 2, 3, 4,
+};
+#endif  // HAVE_SSE4_1 || HAVE_NEON
+
+typedef std::tuple<comp_round_shift_array_func, BLOCK_SIZE, int>
+    CompRoundShiftParam;
+
+class AV1CompRoundShiftTest
+    : public ::testing::TestWithParam<CompRoundShiftParam> {
+ public:
+  ~AV1CompRoundShiftTest();
+
+  void SetUp() { rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); }
+  void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunCheckOutput(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
+                      int bit);
+  void RunSpeedTest(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
+                    int bit);
+
+  libaom_test::ACMRandom rnd_;
+};
+
+AV1CompRoundShiftTest::~AV1CompRoundShiftTest() { ; }
+
+void AV1CompRoundShiftTest::RunCheckOutput(
+    comp_round_shift_array_func test_impl, BLOCK_SIZE bsize, int bit) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int blk_wd = 64;
+  DECLARE_ALIGNED(32, int32_t, pred_[blk_wd]);
+  DECLARE_ALIGNED(32, int32_t, ref_buffer_[blk_wd]);
+  for (int i = 0; i < (blk_wd); ++i) {
+    ref_buffer_[i] = pred_[i] = rnd_.Rand31() / 16;
+  }
+  av1_round_shift_array_c(ref_buffer_, w, bit);
+  test_impl(pred_, w, bit);
+  for (int x = 0; x < w; ++x) {
+    ASSERT_EQ(ref_buffer_[x], pred_[x]) << w << "x" << h << "mismatch @"
+                                        << "(" << x << ")";
+  }
+}
+
+void AV1CompRoundShiftTest::RunSpeedTest(comp_round_shift_array_func test_impl,
+                                         BLOCK_SIZE bsize, int bit) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int blk_wd = 64;
+  DECLARE_ALIGNED(32, int32_t, ref_buffer_[blk_wd]);
+  for (int i = 0; i < (blk_wd); ++i) {
+    ref_buffer_[i] = rnd_.Rand31();
+  }
+
+  const int num_loops = 1000000000 / (w + h);
+  comp_round_shift_array_func funcs[2] = { av1_round_shift_array_c,
+                                           test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    comp_round_shift_array_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(ref_buffer_, w, bit);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  printf("av1_round_shift_array %3dx%-3d: bit : %d %7.2f/%7.2fns", w, h, bit,
+         elapsed_time[0], elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1CompRoundShiftTest, CheckOutput) {
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+TEST_P(AV1CompRoundShiftTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1CompRoundShiftTest,
+    ::testing::Combine(::testing::Values(&av1_round_shift_array_sse4_1),
+                       ::testing::ValuesIn(txsize_to_bsize),
+                       ::testing::ValuesIn(kValidBitCheck)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AV1CompRoundShiftTest,
+    ::testing::Combine(::testing::Values(&av1_round_shift_array_neon),
+                       ::testing::ValuesIn(txsize_to_bsize),
+                       ::testing::ValuesIn(kValidBitCheck)));
+#endif
+
+};  // namespace AV1CompRoundShift
diff --git a/libs/libaom/src/test/av1_txfm_test.cc b/libs/libaom/src/test/av1_txfm_test.cc
new file mode 100644
index 000000000..aedd45d13
--- /dev/null
+++ b/libs/libaom/src/test/av1_txfm_test.cc
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include "test/av1_txfm_test.h" + +namespace libaom_test { + +int get_txfm1d_size(TX_SIZE tx_size) { return tx_size_wide[tx_size]; } + +void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1) { + switch (txfm2d_type) { + case DCT_DCT: + *type0 = TYPE_DCT; + *type1 = TYPE_DCT; + break; + case ADST_DCT: + *type0 = TYPE_ADST; + *type1 = TYPE_DCT; + break; + case DCT_ADST: + *type0 = TYPE_DCT; + *type1 = TYPE_ADST; + break; + case ADST_ADST: + *type0 = TYPE_ADST; + *type1 = TYPE_ADST; + break; + case FLIPADST_DCT: + *type0 = TYPE_ADST; + *type1 = TYPE_DCT; + break; + case DCT_FLIPADST: + *type0 = TYPE_DCT; + *type1 = TYPE_ADST; + break; + case FLIPADST_FLIPADST: + *type0 = TYPE_ADST; + *type1 = TYPE_ADST; + break; + case ADST_FLIPADST: + *type0 = TYPE_ADST; + *type1 = TYPE_ADST; + break; + case FLIPADST_ADST: + *type0 = TYPE_ADST; + *type1 = TYPE_ADST; + break; + case IDTX: + *type0 = TYPE_IDTX; + *type1 = TYPE_IDTX; + break; + case H_DCT: + *type0 = TYPE_IDTX; + *type1 = TYPE_DCT; + break; + case V_DCT: + *type0 = TYPE_DCT; + *type1 = TYPE_IDTX; + break; + case H_ADST: + *type0 = TYPE_IDTX; + *type1 = TYPE_ADST; + break; + case V_ADST: + *type0 = TYPE_ADST; + *type1 = TYPE_IDTX; + break; + case H_FLIPADST: + *type0 = TYPE_IDTX; + *type1 = TYPE_ADST; + break; + case V_FLIPADST: + *type0 = TYPE_ADST; + *type1 = TYPE_IDTX; + break; + default: + *type0 = TYPE_DCT; + *type1 = TYPE_DCT; + assert(0); + break; + } +} + +double Sqrt2 = pow(2, 0.5); +double invSqrt2 = 1 / pow(2, 0.5); + +double dct_matrix(double n, double k, int size) { + return cos(PI * (2 * n + 1) * k / (2 * size)); +} + +void reference_dct_1d(const double *in, double *out, int size) { + for (int k = 0; k < size; ++k) { + out[k] = 0; + for (int n = 0; n < size; ++n) { + out[k] += in[n] * dct_matrix(n, k, size); + } + if (k == 0) out[k] = out[k] * invSqrt2; + } +} + +void reference_idct_1d(const double *in, double *out, int size) { + for (int k = 0; k < size; ++k) { + out[k] = 0; + for (int n = 0; n < size; ++n) { + if (n == 0) + out[k] += invSqrt2 * in[n] * dct_matrix(k, n, size); + else + out[k] += in[n] * dct_matrix(k, n, size); + } + } +} + +// TODO(any): Copied from the old 'fadst4' (same as the new 'av1_fadst4' +// function). Should be replaced by a proper reference function that takes +// 'double' input & output. +static void fadst4_new(const tran_low_t *input, tran_low_t *output) { + tran_high_t x0, x1, x2, x3; + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_4_9 * x0; + s2 = sinpi_2_9 * x1; + s3 = sinpi_1_9 * x1; + s4 = sinpi_3_9 * x2; + s5 = sinpi_4_9 * x3; + s6 = sinpi_2_9 * x3; + s7 = x0 + x1 - x3; + + x0 = s0 + s2 + s5; + x1 = sinpi_3_9 * s7; + x2 = s1 - s3 + s6; + x3 = s4; + + s0 = x0 + x3; + s1 = x1; + s2 = x2 - x3; + s3 = x2 - x0 + x3; + + // 1-D transform scaling factor is sqrt(2). + output[0] = (tran_low_t)fdct_round_shift(s0); + output[1] = (tran_low_t)fdct_round_shift(s1); + output[2] = (tran_low_t)fdct_round_shift(s2); + output[3] = (tran_low_t)fdct_round_shift(s3); +} + +void reference_adst_1d(const double *in, double *out, int size) { + if (size == 4) { // Special case. 
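+    // The AV1 4-point ADST is built from the sinpi(k/9) butterfly rather
+    // than the plain sine matrix computed by the general loop below, so this
+    // path rounds the inputs to integers and reuses fadst4_new() from above.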
+ tran_low_t int_input[4]; + for (int i = 0; i < 4; ++i) { + int_input[i] = static_cast(round(in[i])); + } + tran_low_t int_output[4]; + fadst4_new(int_input, int_output); + for (int i = 0; i < 4; ++i) { + out[i] = int_output[i]; + } + return; + } + + for (int k = 0; k < size; ++k) { + out[k] = 0; + for (int n = 0; n < size; ++n) { + out[k] += in[n] * sin(PI * (2 * n + 1) * (2 * k + 1) / (4 * size)); + } + } +} + +void reference_idtx_1d(const double *in, double *out, int size) { + double scale = 0; + if (size == 4) + scale = Sqrt2; + else if (size == 8) + scale = 2; + else if (size == 16) + scale = 2 * Sqrt2; + else if (size == 32) + scale = 4; + else if (size == 64) + scale = 4 * Sqrt2; + for (int k = 0; k < size; ++k) { + out[k] = in[k] * scale; + } +} + +void reference_hybrid_1d(double *in, double *out, int size, int type) { + if (type == TYPE_DCT) + reference_dct_1d(in, out, size); + else if (type == TYPE_ADST) + reference_adst_1d(in, out, size); + else + reference_idtx_1d(in, out, size); +} + +double get_amplification_factor(TX_TYPE tx_type, TX_SIZE tx_size) { + TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg; + av1_get_fwd_txfm_cfg(tx_type, tx_size, &fwd_txfm_flip_cfg); + const int tx_width = tx_size_wide[fwd_txfm_flip_cfg.tx_size]; + const int tx_height = tx_size_high[fwd_txfm_flip_cfg.tx_size]; + const int8_t *shift = fwd_txfm_flip_cfg.shift; + const int amplify_bit = shift[0] + shift[1] + shift[2]; + double amplify_factor = + amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit)); + + // For rectangular transforms, we need to multiply by an extra factor. + const int rect_type = get_rect_tx_log_ratio(tx_width, tx_height); + if (abs(rect_type) == 1) { + amplify_factor *= pow(2, 0.5); + } + return amplify_factor; +} + +void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type, + TX_SIZE tx_size) { + // Get transform type and size of each dimension. + TYPE_TXFM type0; + TYPE_TXFM type1; + get_txfm1d_type(tx_type, &type0, &type1); + const int tx_width = tx_size_wide[tx_size]; + const int tx_height = tx_size_high[tx_size]; + + double *const temp_in = new double[AOMMAX(tx_width, tx_height)]; + double *const temp_out = new double[AOMMAX(tx_width, tx_height)]; + double *const out_interm = new double[tx_width * tx_height]; + const int stride = tx_width; + + // Transform columns. + for (int c = 0; c < tx_width; ++c) { + for (int r = 0; r < tx_height; ++r) { + temp_in[r] = in[r * stride + c]; + } + reference_hybrid_1d(temp_in, temp_out, tx_height, type0); + for (int r = 0; r < tx_height; ++r) { + out_interm[r * stride + c] = temp_out[r]; + } + } + + // Transform rows. + for (int r = 0; r < tx_height; ++r) { + reference_hybrid_1d(out_interm + r * stride, out + r * stride, tx_width, + type1); + } + + delete[] temp_in; + delete[] temp_out; + delete[] out_interm; + + // These transforms use an approximate 2D DCT transform, by only keeping the + // top-left quarter of the coefficients, and repacking them in the first + // quarter indices. + // TODO(urvang): Refactor this code. + if (tx_width == 64 && tx_height == 64) { // tx_size == TX_64X64 + // Zero out top-right 32x32 area. + for (int row = 0; row < 32; ++row) { + memset(out + row * 64 + 32, 0, 32 * sizeof(*out)); + } + // Zero out the bottom 64x32 area. + memset(out + 32 * 64, 0, 32 * 64 * sizeof(*out)); + // Re-pack non-zero coeffs in the first 32x32 indices. 
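+    // (Row 0 is already in place: out + 0 * 32 aliases out + 0 * 64, which
+    // is why the copy loop below starts at row 1.)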
+    for (int row = 1; row < 32; ++row) {
+      memcpy(out + row * 32, out + row * 64, 32 * sizeof(*out));
+    }
+  } else if (tx_width == 32 && tx_height == 64) {  // tx_size == TX_32X64
+    // Zero out the bottom 32x32 area.
+    memset(out + 32 * 32, 0, 32 * 32 * sizeof(*out));
+    // Note: no repacking needed here.
+  } else if (tx_width == 64 && tx_height == 32) {  // tx_size == TX_64X32
+    // Zero out right 32x32 area.
+    for (int row = 0; row < 32; ++row) {
+      memset(out + row * 64 + 32, 0, 32 * sizeof(*out));
+    }
+    // Re-pack non-zero coeffs in the first 32x32 indices.
+    for (int row = 1; row < 32; ++row) {
+      memcpy(out + row * 32, out + row * 64, 32 * sizeof(*out));
+    }
+  } else if (tx_width == 16 && tx_height == 64) {  // tx_size == TX_16X64
+    // Zero out the bottom 16x32 area.
+    memset(out + 16 * 32, 0, 16 * 32 * sizeof(*out));
+    // Note: no repacking needed here.
+  } else if (tx_width == 64 && tx_height == 16) {  // tx_size == TX_64X16
+    // Zero out right 32x16 area.
+    for (int row = 0; row < 16; ++row) {
+      memset(out + row * 64 + 32, 0, 32 * sizeof(*out));
+    }
+    // Re-pack non-zero coeffs in the first 32x16 indices.
+    for (int row = 1; row < 16; ++row) {
+      memcpy(out + row * 32, out + row * 64, 32 * sizeof(*out));
+    }
+  }
+
+  // Apply appropriate scale.
+  const double amplify_factor = get_amplification_factor(tx_type, tx_size);
+  for (int c = 0; c < tx_width; ++c) {
+    for (int r = 0; r < tx_height; ++r) {
+      out[r * stride + c] *= amplify_factor;
+    }
+  }
+}
+
+template <typename Type>
+void fliplr(Type *dest, int width, int height, int stride) {
+  for (int r = 0; r < height; ++r) {
+    for (int c = 0; c < width / 2; ++c) {
+      const Type tmp = dest[r * stride + c];
+      dest[r * stride + c] = dest[r * stride + width - 1 - c];
+      dest[r * stride + width - 1 - c] = tmp;
+    }
+  }
+}
+
+template <typename Type>
+void flipud(Type *dest, int width, int height, int stride) {
+  for (int c = 0; c < width; ++c) {
+    for (int r = 0; r < height / 2; ++r) {
+      const Type tmp = dest[r * stride + c];
+      dest[r * stride + c] = dest[(height - 1 - r) * stride + c];
+      dest[(height - 1 - r) * stride + c] = tmp;
+    }
+  }
+}
+
+template <typename Type>
+void fliplrud(Type *dest, int width, int height, int stride) {
+  for (int r = 0; r < height / 2; ++r) {
+    for (int c = 0; c < width; ++c) {
+      const Type tmp = dest[r * stride + c];
+      dest[r * stride + c] = dest[(height - 1 - r) * stride + width - 1 - c];
+      dest[(height - 1 - r) * stride + width - 1 - c] = tmp;
+    }
+  }
+}
+
+template void fliplr<double>(double *dest, int width, int height, int stride);
+template void flipud<double>(double *dest, int width, int height, int stride);
+template void fliplrud<double>(double *dest, int width, int height,
+                               int stride);
+
+int bd_arr[BD_NUM] = { 8, 10, 12 };
+
+int8_t low_range_arr[BD_NUM] = { 18, 32, 32 };
+int8_t high_range_arr[BD_NUM] = { 32, 32, 32 };
+
+void txfm_stage_range_check(const int8_t *stage_range, int stage_num,
+                            int8_t cos_bit, int low_range, int high_range) {
+  for (int i = 0; i < stage_num; ++i) {
+    EXPECT_LE(stage_range[i], low_range);
+    ASSERT_LE(stage_range[i] + cos_bit, high_range) << "stage = " << i;
+  }
+  for (int i = 0; i < stage_num - 1; ++i) {
+    // make sure there is no overflow while doing half_btf()
+    ASSERT_LE(stage_range[i + 1] + cos_bit, high_range) << "stage = " << i;
+  }
+}
+}  // namespace libaom_test
diff --git a/libs/libaom/src/test/av1_txfm_test.h b/libs/libaom/src/test/av1_txfm_test.h
new file mode 100644
index 000000000..5a56d28f1
--- /dev/null
+++ b/libs/libaom/src/test/av1_txfm_test.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_AV1_TXFM_TEST_H_
+#define AOM_TEST_AV1_TXFM_TEST_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+namespace libaom_test {
+enum {
+  TYPE_DCT = 0,
+  TYPE_ADST,
+  TYPE_IDTX,
+  TYPE_IDCT,
+  TYPE_IADST,
+  TYPE_LAST
+} UENUM1BYTE(TYPE_TXFM);
+
+int get_txfm1d_size(TX_SIZE tx_size);
+
+void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1);
+
+void reference_dct_1d(const double *in, double *out, int size);
+void reference_idct_1d(const double *in, double *out, int size);
+
+void reference_adst_1d(const double *in, double *out, int size);
+
+void reference_hybrid_1d(double *in, double *out, int size, int type);
+
+double get_amplification_factor(TX_TYPE tx_type, TX_SIZE tx_size);
+
+void reference_hybrid_2d(double *in, double *out, TX_TYPE tx_type,
+                         TX_SIZE tx_size);
+template <typename Type1, typename Type2>
+static double compute_avg_abs_error(const Type1 *a, const Type2 *b,
+                                    const int size) {
+  double error = 0;
+  for (int i = 0; i < size; i++) {
+    error += fabs(static_cast<double>(a[i]) - static_cast<double>(b[i]));
+  }
+  error = error / size;
+  return error;
+}
+
+template <typename Type>
+void fliplr(Type *dest, int width, int height, int stride);
+
+template <typename Type>
+void flipud(Type *dest, int width, int height, int stride);
+
+template <typename Type>
+void fliplrud(Type *dest, int width, int height, int stride);
+
+typedef void (*TxfmFunc)(const int32_t *in, int32_t *out, const int8_t cos_bit,
+                         const int8_t *range_bit);
+
+typedef void (*InvTxfm2dFunc)(const int32_t *, uint16_t *, int, TX_TYPE, int);
+typedef void (*LbdInvTxfm2dFunc)(const int32_t *, uint8_t *, int, TX_TYPE,
+                                 TX_SIZE, int);
+
+static const int bd = 10;
+static const int input_base = (1 << bd);
+
+static INLINE bool IsTxSizeTypeValid(TX_SIZE tx_size, TX_TYPE tx_type) {
+  const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
+  TxSetType tx_set_type;
+  if (tx_size_sqr_up > TX_32X32) {
+    tx_set_type = EXT_TX_SET_DCTONLY;
+  } else if (tx_size_sqr_up == TX_32X32) {
+    tx_set_type = EXT_TX_SET_DCT_IDTX;
+  } else {
+    tx_set_type = EXT_TX_SET_ALL16;
+  }
+  return av1_ext_tx_used[tx_set_type][tx_type] != 0;
+}
+
+#if CONFIG_AV1_ENCODER
+
+static const FwdTxfm2dFunc fwd_txfm_func_ls[TX_SIZES_ALL] = {
+  av1_fwd_txfm2d_4x4_c,   av1_fwd_txfm2d_8x8_c,   av1_fwd_txfm2d_16x16_c,
+  av1_fwd_txfm2d_32x32_c, av1_fwd_txfm2d_64x64_c, av1_fwd_txfm2d_4x8_c,
+  av1_fwd_txfm2d_8x4_c,   av1_fwd_txfm2d_8x16_c,  av1_fwd_txfm2d_16x8_c,
+  av1_fwd_txfm2d_16x32_c, av1_fwd_txfm2d_32x16_c, av1_fwd_txfm2d_32x64_c,
+  av1_fwd_txfm2d_64x32_c, av1_fwd_txfm2d_4x16_c,  av1_fwd_txfm2d_16x4_c,
+  av1_fwd_txfm2d_8x32_c,  av1_fwd_txfm2d_32x8_c,  av1_fwd_txfm2d_16x64_c,
+  av1_fwd_txfm2d_64x16_c,
+};
+#endif
+
+static const InvTxfm2dFunc inv_txfm_func_ls[TX_SIZES_ALL] = {
+  av1_inv_txfm2d_add_4x4_c,   av1_inv_txfm2d_add_8x8_c,
+  av1_inv_txfm2d_add_16x16_c,
av1_inv_txfm2d_add_32x32_c, + av1_inv_txfm2d_add_64x64_c, av1_inv_txfm2d_add_4x8_c, + av1_inv_txfm2d_add_8x4_c, av1_inv_txfm2d_add_8x16_c, + av1_inv_txfm2d_add_16x8_c, av1_inv_txfm2d_add_16x32_c, + av1_inv_txfm2d_add_32x16_c, av1_inv_txfm2d_add_32x64_c, + av1_inv_txfm2d_add_64x32_c, av1_inv_txfm2d_add_4x16_c, + av1_inv_txfm2d_add_16x4_c, av1_inv_txfm2d_add_8x32_c, + av1_inv_txfm2d_add_32x8_c, av1_inv_txfm2d_add_16x64_c, + av1_inv_txfm2d_add_64x16_c, +}; + +#define BD_NUM 3 + +extern int bd_arr[]; +extern int8_t low_range_arr[]; +extern int8_t high_range_arr[]; + +void txfm_stage_range_check(const int8_t *stage_range, int stage_num, + const int8_t cos_bit, int low_range, + int high_range); +} // namespace libaom_test +#endif // AOM_TEST_AV1_TXFM_TEST_H_ diff --git a/libs/libaom/src/test/av1_wedge_utils_test.cc b/libs/libaom/src/test/av1_wedge_utils_test.cc new file mode 100644 index 000000000..f9dc838ff --- /dev/null +++ b/libs/libaom/src/test/av1_wedge_utils_test.cc @@ -0,0 +1,391 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" + +#include "av1/common/enums.h" + +#include "test/acm_random.h" +#include "test/function_equivalence_test.h" +#include "test/register_state_check.h" + +#define WEDGE_WEIGHT_BITS 6 +#define MAX_MASK_VALUE (1 << (WEDGE_WEIGHT_BITS)) + +using libaom_test::ACMRandom; +using libaom_test::FunctionEquivalenceTest; + +namespace { + +static const int16_t kInt13Max = (1 << 12) - 1; + +////////////////////////////////////////////////////////////////////////////// +// av1_wedge_sse_from_residuals - functionality +////////////////////////////////////////////////////////////////////////////// + +class WedgeUtilsSSEFuncTest : public testing::Test { + protected: + WedgeUtilsSSEFuncTest() : rng_(ACMRandom::DeterministicSeed()) {} + + static const int kIterations = 1000; + + ACMRandom rng_; +}; + +static void equiv_blend_residuals(int16_t *r, const int16_t *r0, + const int16_t *r1, const uint8_t *m, int N) { + for (int i = 0; i < N; i++) { + const int32_t m0 = m[i]; + const int32_t m1 = MAX_MASK_VALUE - m0; + const int16_t R = m0 * r0[i] + m1 * r1[i]; + // Note that this rounding is designed to match the result + // you would get when actually blending the 2 predictors and computing + // the residuals. 
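+    // Concretely: with p = ROUND_POWER_OF_TWO(m0 * p0 + m1 * p1,
+    // WEDGE_WEIGHT_BITS) and r0 = s - p0, r1 = s - p1, the blended residual
+    // s - p reduces to ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS),
+    // hence the "- 1" below.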
+ r[i] = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS); + } +} + +static uint64_t equiv_sse_from_residuals(const int16_t *r0, const int16_t *r1, + const uint8_t *m, int N) { + uint64_t acc = 0; + for (int i = 0; i < N; i++) { + const int32_t m0 = m[i]; + const int32_t m1 = MAX_MASK_VALUE - m0; + const int16_t R = m0 * r0[i] + m1 * r1[i]; + const int32_t r = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS); + acc += r * r; + } + return acc; +} + +TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingEquiv) { + DECLARE_ALIGNED(32, uint8_t, s[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, p0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, p1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, p[MAX_SB_SQUARE]); + + DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r_ref[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r_tst[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + s[i] = rng_.Rand8(); + m[i] = rng_(MAX_MASK_VALUE + 1); + } + + const int w = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3); + const int h = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3); + const int N = w * h; + + for (int j = 0; j < N; j++) { + p0[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX); + p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX); + } + + aom_blend_a64_mask(p, w, p0, w, p1, w, m, w, w, h, 0, 0); + + aom_subtract_block(h, w, r0, w, s, w, p0, w); + aom_subtract_block(h, w, r1, w, s, w, p1, w); + + aom_subtract_block(h, w, r_ref, w, s, w, p, w); + equiv_blend_residuals(r_tst, r0, r1, m, N); + + for (int i = 0; i < N; ++i) ASSERT_EQ(r_ref[i], r_tst[i]); + + uint64_t ref_sse = aom_sum_squares_i16(r_ref, N); + uint64_t tst_sse = equiv_sse_from_residuals(r0, r1, m, N); + + ASSERT_EQ(ref_sse, tst_sse); + } +} + +static uint64_t sse_from_residuals(const int16_t *r0, const int16_t *r1, + const uint8_t *m, int N) { + uint64_t acc = 0; + for (int i = 0; i < N; i++) { + const int32_t m0 = m[i]; + const int32_t m1 = MAX_MASK_VALUE - m0; + const int32_t r = m0 * r0[i] + m1 * r1[i]; + acc += r * r; + } + return ROUND_POWER_OF_TWO(acc, 2 * WEDGE_WEIGHT_BITS); +} + +TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingMethod) { + DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r1[i] = rng_(2 * INT8_MAX - 2 * INT8_MIN + 1) + 2 * INT8_MIN; + d[i] = rng_(2 * INT8_MAX - 2 * INT8_MIN + 1) + 2 * INT8_MIN; + m[i] = rng_(MAX_MASK_VALUE + 1); + } + + const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1); + + for (int i = 0; i < N; i++) r0[i] = r1[i] + d[i]; + + const uint64_t ref_res = sse_from_residuals(r0, r1, m, N); + const uint64_t tst_res = av1_wedge_sse_from_residuals(r1, d, m, N); + + ASSERT_EQ(ref_res, tst_res); + } +} + +////////////////////////////////////////////////////////////////////////////// +// av1_wedge_sse_from_residuals - optimizations +////////////////////////////////////////////////////////////////////////////// + +typedef uint64_t (*FSSE)(const int16_t *r1, const int16_t *d, const uint8_t *m, + int N); +typedef libaom_test::FuncParam TestFuncsFSSE; + +class WedgeUtilsSSEOptTest : public FunctionEquivalenceTest { + protected: + static const int kIterations = 
10000; +}; + +TEST_P(WedgeUtilsSSEOptTest, RandomValues) { + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r1[i] = rng_(2 * kInt13Max + 1) - kInt13Max; + d[i] = rng_(2 * kInt13Max + 1) - kInt13Max; + m[i] = rng_(MAX_MASK_VALUE + 1); + } + + const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1); + + const uint64_t ref_res = params_.ref_func(r1, d, m, N); + uint64_t tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N)); + + ASSERT_EQ(ref_res, tst_res); + } +} + +TEST_P(WedgeUtilsSSEOptTest, ExtremeValues) { + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + if (rng_(2)) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) r1[i] = kInt13Max; + } else { + for (int i = 0; i < MAX_SB_SQUARE; ++i) r1[i] = -kInt13Max; + } + + if (rng_(2)) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) d[i] = kInt13Max; + } else { + for (int i = 0; i < MAX_SB_SQUARE; ++i) d[i] = -kInt13Max; + } + + for (int i = 0; i < MAX_SB_SQUARE; ++i) m[i] = MAX_MASK_VALUE; + + const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1); + + const uint64_t ref_res = params_.ref_func(r1, d, m, N); + uint64_t tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N)); + + ASSERT_EQ(ref_res, tst_res); + } +} + +////////////////////////////////////////////////////////////////////////////// +// av1_wedge_sign_from_residuals +////////////////////////////////////////////////////////////////////////////// + +typedef int8_t (*FSign)(const int16_t *ds, const uint8_t *m, int N, + int64_t limit); +typedef libaom_test::FuncParam TestFuncsFSign; + +class WedgeUtilsSignOptTest : public FunctionEquivalenceTest { + protected: + static const int kIterations = 10000; + static const int kMaxSize = 8196; // Size limited by SIMD implementation. 
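+  // The tests below always draw N as a multiple of 64 in [64, kMaxSize),
+  // the granularity at which the optimized wedge kernels are exercised.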
+}; + +TEST_P(WedgeUtilsSignOptTest, RandomValues) { + DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r0[i] = rng_(2 * kInt13Max + 1) - kInt13Max; + r1[i] = rng_(2 * kInt13Max + 1) - kInt13Max; + m[i] = rng_(MAX_MASK_VALUE + 1); + } + + const int maxN = AOMMIN(kMaxSize, MAX_SB_SQUARE); + const int N = 64 * (rng_(maxN / 64 - 1) + 1); + + int64_t limit; + limit = (int64_t)aom_sum_squares_i16(r0, N); + limit -= (int64_t)aom_sum_squares_i16(r1, N); + limit *= (1 << WEDGE_WEIGHT_BITS) / 2; + + for (int i = 0; i < N; i++) + ds[i] = clamp(r0[i] * r0[i] - r1[i] * r1[i], INT16_MIN, INT16_MAX); + + const int ref_res = params_.ref_func(ds, m, N, limit); + int tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit)); + + ASSERT_EQ(ref_res, tst_res); + } +} + +TEST_P(WedgeUtilsSignOptTest, ExtremeValues) { + DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + switch (rng_(4)) { + case 0: + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r0[i] = 0; + r1[i] = kInt13Max; + } + break; + case 1: + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r0[i] = kInt13Max; + r1[i] = 0; + } + break; + case 2: + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r0[i] = 0; + r1[i] = -kInt13Max; + } + break; + default: + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + r0[i] = -kInt13Max; + r1[i] = 0; + } + break; + } + + for (int i = 0; i < MAX_SB_SQUARE; ++i) m[i] = MAX_MASK_VALUE; + + const int maxN = AOMMIN(kMaxSize, MAX_SB_SQUARE); + const int N = 64 * (rng_(maxN / 64 - 1) + 1); + + int64_t limit; + limit = (int64_t)aom_sum_squares_i16(r0, N); + limit -= (int64_t)aom_sum_squares_i16(r1, N); + limit *= (1 << WEDGE_WEIGHT_BITS) / 2; + + for (int i = 0; i < N; i++) + ds[i] = clamp(r0[i] * r0[i] - r1[i] * r1[i], INT16_MIN, INT16_MAX); + + const int ref_res = params_.ref_func(ds, m, N, limit); + int tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit)); + + ASSERT_EQ(ref_res, tst_res); + } +} + +////////////////////////////////////////////////////////////////////////////// +// av1_wedge_compute_delta_squares +////////////////////////////////////////////////////////////////////////////// + +typedef void (*FDS)(int16_t *d, const int16_t *a, const int16_t *b, int N); +typedef libaom_test::FuncParam TestFuncsFDS; + +class WedgeUtilsDeltaSquaresOptTest : public FunctionEquivalenceTest { + protected: + static const int kIterations = 10000; +}; + +TEST_P(WedgeUtilsDeltaSquaresOptTest, RandomValues) { + DECLARE_ALIGNED(32, int16_t, a[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, b[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d_ref[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int16_t, d_tst[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + a[i] = rng_.Rand16(); + b[i] = rng_(2 * INT16_MAX + 1) - INT16_MAX; + } + + const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1); + + memset(&d_ref, INT16_MAX, sizeof(d_ref)); + memset(&d_tst, INT16_MAX, sizeof(d_tst)); + + params_.ref_func(d_ref, a, b, N); + 
ASM_REGISTER_STATE_CHECK(params_.tst_func(d_tst, a, b, N)); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) ASSERT_EQ(d_ref[i], d_tst[i]); + } +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, WedgeUtilsSSEOptTest, + ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c, + av1_wedge_sse_from_residuals_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, WedgeUtilsSignOptTest, + ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c, + av1_wedge_sign_from_residuals_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, WedgeUtilsDeltaSquaresOptTest, + ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_c, + av1_wedge_compute_delta_squares_sse2))); +#endif // HAVE_SSE2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, WedgeUtilsSSEOptTest, + ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_sse2, + av1_wedge_sse_from_residuals_avx2))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, WedgeUtilsSignOptTest, + ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_sse2, + av1_wedge_sign_from_residuals_avx2))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, WedgeUtilsDeltaSquaresOptTest, + ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_sse2, + av1_wedge_compute_delta_squares_avx2))); +#endif // HAVE_AVX2 + +} // namespace diff --git a/libs/libaom/src/test/avg_test.cc b/libs/libaom/src/test/avg_test.cc new file mode 100644 index 000000000..1742aec5f --- /dev/null +++ b/libs/libaom/src/test/avg_test.cc @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +namespace { + +using libaom_test::ACMRandom; + +template +class AverageTestBase : public ::testing::Test { + public: + AverageTestBase(int width, int height) + : width_(width), height_(height), source_data_(NULL), source_stride_(0), + bit_depth_(8) {} + + virtual void TearDown() { + aom_free(source_data_); + source_data_ = NULL; + libaom_test::ClearSystemState(); + } + + protected: + // Handle blocks up to 4 blocks 64x64 with stride up to 128 + static const int kDataAlignment = 16; + static const int kDataBlockSize = 64 * 128; + + virtual void SetUp() { + source_data_ = static_cast( + aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); + ASSERT_TRUE(source_data_ != NULL); + source_stride_ = (width_ + 31) & ~31; + bit_depth_ = 8; + rnd_.Reset(ACMRandom::DeterministicSeed()); + } + + // Sum Pixels + static unsigned int ReferenceAverage8x8(const Pixel *source, int pitch) { + unsigned int average = 0; + for (int h = 0; h < 8; ++h) { + for (int w = 0; w < 8; ++w) average += source[h * pitch + w]; + } + return (average + 32) >> 6; + } + + static unsigned int ReferenceAverage4x4(const Pixel *source, int pitch) { + unsigned int average = 0; + for (int h = 0; h < 4; ++h) { + for (int w = 0; w < 4; ++w) average += source[h * pitch + w]; + } + return (average + 8) >> 4; + } + + void FillConstant(Pixel fill_constant) { + for (int i = 0; i < width_ * height_; ++i) { + source_data_[i] = fill_constant; + } + } + + void FillRandom() { + for (int i = 0; i < width_ * height_; ++i) { + source_data_[i] = rnd_.Rand16() & ((1 << bit_depth_) - 1); + } + } + + int width_, height_; + Pixel *source_data_; + int source_stride_; + int bit_depth_; + + ACMRandom rnd_; +}; +typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch); + +// Arguments: width, height, pitch, block size, avg function. +typedef std::tuple AvgFunc; + +class AverageTest : public AverageTestBase, + public ::testing::WithParamInterface { + public: + AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {} + + protected: + void CheckAverages() { + const int block_size = GET_PARAM(3); + unsigned int expected = 0; + if (block_size == 8) { + expected = + ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_); + } else if (block_size == 4) { + expected = + ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_); + } + + unsigned int actual; + ASM_REGISTER_STATE_CHECK( + actual = GET_PARAM(4)(source_data_ + GET_PARAM(2), source_stride_)); + + EXPECT_EQ(expected, actual); + } +}; + +TEST_P(AverageTest, MinValue) { + FillConstant(0); + CheckAverages(); +} + +TEST_P(AverageTest, MaxValue) { + FillConstant(255); + CheckAverages(); +} + +TEST_P(AverageTest, Random) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + for (int i = 0; i < 1000; i++) { + FillRandom(); + CheckAverages(); + } +} + +typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref, + const int ref_stride, const int height); + +// Params: height, asm function, c function. 
+typedef std::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+
+class IntProRowTest : public AverageTestBase<uint8_t>,
+                      public ::testing::WithParamInterface<IntProRowParam> {
+ public:
+  IntProRowTest()
+      : AverageTestBase(16, GET_PARAM(0)), hbuf_asm_(NULL), hbuf_c_(NULL) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  virtual void SetUp() {
+    source_data_ = static_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
+    ASSERT_TRUE(source_data_ != NULL);
+
+    hbuf_asm_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+    hbuf_c_ = static_cast<int16_t *>(
+        aom_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+  }
+
+  virtual void TearDown() {
+    aom_free(source_data_);
+    source_data_ = NULL;
+    aom_free(hbuf_c_);
+    hbuf_c_ = NULL;
+    aom_free(hbuf_asm_);
+    hbuf_asm_ = NULL;
+  }
+
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+        << "Output mismatch";
+  }
+
+ private:
+  IntProRowFunc asm_func_;
+  IntProRowFunc c_func_;
+  int16_t *hbuf_asm_;
+  int16_t *hbuf_c_;
+};
+
+typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+
+// Params: width, asm function, c function.
+typedef std::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+
+class IntProColTest : public AverageTestBase<uint8_t>,
+                      public ::testing::WithParamInterface<IntProColParam> {
+ public:
+  IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
+    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
+    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+  }
+
+ private:
+  IntProColFunc asm_func_;
+  IntProColFunc c_func_;
+  int16_t sum_asm_;
+  int16_t sum_c_;
+};
+
+TEST_P(IntProRowTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+using std::make_tuple;
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AverageTest,
+    ::testing::Values(make_tuple(16, 16, 1, 8, &aom_avg_8x8_c),
+                      make_tuple(16, 16, 1, 4, &aom_avg_4x4_c)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AverageTest,
+    ::testing::Values(make_tuple(16, 16, 0, 8, &aom_avg_8x8_sse2),
+                      make_tuple(16, 16, 5, 8, &aom_avg_8x8_sse2),
+                      make_tuple(32, 32, 15, 8, &aom_avg_8x8_sse2),
+                      make_tuple(16, 16, 0, 4, &aom_avg_4x4_sse2),
+                      make_tuple(16, 16, 5, 4, &aom_avg_4x4_sse2),
+                      make_tuple(32, 32, 15, 4, &aom_avg_4x4_sse2)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, IntProRowTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+                      make_tuple(32, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+                      make_tuple(64, &aom_int_pro_row_sse2, &aom_int_pro_row_c),
+                      make_tuple(128, &aom_int_pro_row_sse2,
+                                 &aom_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, IntProColTest,
+    ::testing::Values(make_tuple(16, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+                      make_tuple(32, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+                      make_tuple(64, &aom_int_pro_col_sse2, &aom_int_pro_col_c),
+                      make_tuple(128, &aom_int_pro_col_sse2,
+
&aom_int_pro_col_c))); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AverageTest, + ::testing::Values(make_tuple(16, 16, 0, 8, &aom_avg_8x8_neon), + make_tuple(16, 16, 5, 8, &aom_avg_8x8_neon), + make_tuple(32, 32, 15, 8, &aom_avg_8x8_neon), + make_tuple(16, 16, 0, 4, &aom_avg_4x4_neon), + make_tuple(16, 16, 5, 4, &aom_avg_4x4_neon), + make_tuple(32, 32, 15, 4, &aom_avg_4x4_neon))); +#endif + +} // namespace diff --git a/libs/libaom/src/test/best_encode.sh b/libs/libaom/src/test/best_encode.sh new file mode 100644 index 000000000..fe31a01cb --- /dev/null +++ b/libs/libaom/src/test/best_encode.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and +# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +# was not distributed with this source code in the LICENSE file, you can +# obtain it at www.aomedia.org/license/software. If the Alliance for Open +# Media Patent License 1.0 was not distributed with this source code in the +# PATENTS file, you can obtain it at www.aomedia.org/license/patent. +# +# Author: jimbankoski@google.com (Jim Bankoski) + +if [[ $# -ne 2 ]]; then + echo "Encodes a file using best known settings (slow!)" + echo " Usage: be [FILE] [BITRATE]" + echo " Example: be akiyo_cif.y4m 200" + exit +fi + +f=$1 # file is first parameter +b=$2 # bitrate is second parameter + +if [[ -e $f.fpf ]]; then + # First-pass file found, do second pass only + aomenc \ + $f \ + -o $f-$b.av1.webm \ + -p 2 \ + --pass=2 \ + --fpf=$f.fpf \ + --best \ + --cpu-used=0 \ + --target-bitrate=$b \ + --auto-alt-ref=1 \ + -v \ + --minsection-pct=0 \ + --maxsection-pct=800 \ + --lag-in-frames=25 \ + --kf-min-dist=0 \ + --kf-max-dist=99999 \ + --static-thresh=0 \ + --min-q=0 \ + --max-q=63 \ + --drop-frame=0 \ + --bias-pct=50 \ + --minsection-pct=0 \ + --maxsection-pct=800 \ + --psnr \ + --arnr-maxframes=7 \ + --arnr-strength=3 \ + --arnr-type=3 +else + # No first-pass file found, do 2-pass encode + aomenc \ + $f \ + -o $f-$b.av1.webm \ + -p 2 \ + --pass=1 \ + --fpf=$f.fpf \ + --best \ + --cpu-used=0 \ + --target-bitrate=$b \ + --auto-alt-ref=1 \ + -v \ + --minsection-pct=0 \ + --maxsection-pct=800 \ + --lag-in-frames=25 \ + --kf-min-dist=0 \ + --kf-max-dist=99999 \ + --static-thresh=0 \ + --min-q=0 \ + --max-q=63 \ + --drop-frame=0 + + aomenc \ + $f \ + -o $f-$b.av1.webm \ + -p 2 \ + --pass=2 \ + --fpf=$f.fpf \ + --best \ + --cpu-used=0 \ + --target-bitrate=$b \ + --auto-alt-ref=1 \ + -v \ + --minsection-pct=0 \ + --maxsection-pct=800 \ + --lag-in-frames=25 \ + --kf-min-dist=0 \ + --kf-max-dist=99999 \ + --static-thresh=0 \ + --min-q=0 \ + --max-q=63 \ + --drop-frame=0 \ + --bias-pct=50 \ + --minsection-pct=0 \ + --maxsection-pct=800 \ + --psnr \ + --arnr-maxframes=7 \ + --arnr-strength=3 \ + --arnr-type=3 +fi diff --git a/libs/libaom/src/test/binary_codes_test.cc b/libs/libaom/src/test/binary_codes_test.cc new file mode 100644 index 000000000..45660cf85 --- /dev/null +++ b/libs/libaom/src/test/binary_codes_test.cc @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#include "test/acm_random.h" +#include "aom/aom_integer.h" +#include "aom_dsp/bitreader.h" +#include "aom_dsp/bitwriter.h" +#include "aom_dsp/binary_codes_reader.h" +#include "aom_dsp/binary_codes_writer.h" + +#define ACCT_STR __func__ + +using libaom_test::ACMRandom; + +namespace { + +// Test for Finite subexponential code with reference +TEST(AV1, TestPrimitiveRefsubexpfin) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int kBufferSize = 65536; + aom_writer bw; + uint8_t bw_buffer[kBufferSize]; + const uint16_t kRanges = 8; + const uint16_t kSubexpParams = 6; + const uint16_t kReferences = 8; + const uint16_t kValues = 16; + uint16_t enc_values[kRanges][kSubexpParams][kReferences][kValues][4]; + const uint16_t range_vals[kRanges] = { 1, 13, 64, 120, 230, 420, 1100, 8000 }; + aom_start_encode(&bw, bw_buffer); + for (int n = 0; n < kRanges; ++n) { + const uint16_t range = range_vals[n]; + for (int k = 0; k < kSubexpParams; ++k) { + for (int r = 0; r < kReferences; ++r) { + const uint16_t ref = rnd(range); + for (int v = 0; v < kValues; ++v) { + const uint16_t value = rnd(range); + enc_values[n][k][r][v][0] = range; + enc_values[n][k][r][v][1] = k; + enc_values[n][k][r][v][2] = ref; + enc_values[n][k][r][v][3] = value; + aom_write_primitive_refsubexpfin(&bw, range, k, ref, value); + } + } + } + } + aom_stop_encode(&bw); + aom_reader br; + aom_reader_init(&br, bw_buffer, bw.pos); + GTEST_ASSERT_GE(aom_reader_tell(&br), 0u); + GTEST_ASSERT_LE(aom_reader_tell(&br), 1u); + for (int n = 0; n < kRanges; ++n) { + for (int k = 0; k < kSubexpParams; ++k) { + for (int r = 0; r < kReferences; ++r) { + for (int v = 0; v < kValues; ++v) { + const uint16_t range = enc_values[n][k][r][v][0]; + assert(k == enc_values[n][k][r][v][1]); + const uint16_t ref = enc_values[n][k][r][v][2]; + const uint16_t value = + aom_read_primitive_refsubexpfin(&br, range, k, ref, ACCT_STR); + GTEST_ASSERT_EQ(value, enc_values[n][k][r][v][3]); + } + } + } + } +} +// TODO(debargha): Adds tests for other primitives +} // namespace diff --git a/libs/libaom/src/test/blend_a64_mask_1d_test.cc b/libs/libaom/src/test/blend_a64_mask_1d_test.cc new file mode 100644 index 000000000..1b6350c79 --- /dev/null +++ b/libs/libaom/src/test/blend_a64_mask_1d_test.cc @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/register_state_check.h" +#include "test/function_equivalence_test.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" + +#include "av1/common/enums.h" + +#include "aom_dsp/blend.h" + +using libaom_test::FunctionEquivalenceTest; + +namespace { + +template +class BlendA64Mask1DTest : public FunctionEquivalenceTest { + public: + static const int kIterations = 10000; + static const int kMaxWidth = MAX_SB_SIZE * 5; // * 5 to cover longer strides + static const int kMaxHeight = MAX_SB_SIZE; + static const int kBufSize = kMaxWidth * kMaxHeight; + static const int kMaxMaskWidth = 2 * MAX_SB_SIZE; + static const int kMaxMaskSize = kMaxMaskWidth; + + virtual ~BlendA64Mask1DTest() {} + + virtual void Execute(const T *p_src0, const T *p_src1) = 0; + + void Common() { + w_ = 2 << this->rng_(MAX_SB_SIZE_LOG2); + h_ = 2 << this->rng_(MAX_SB_SIZE_LOG2); + + dst_offset_ = this->rng_(33); + dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; + + src0_offset_ = this->rng_(33); + src0_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; + + src1_offset_ = this->rng_(33); + src1_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; + + T *p_src0; + T *p_src1; + + switch (this->rng_(3)) { + case 0: // Separate sources + p_src0 = src0_; + p_src1 = src1_; + break; + case 1: // src0 == dst + p_src0 = dst_tst_; + src0_stride_ = dst_stride_; + src0_offset_ = dst_offset_; + p_src1 = src1_; + break; + case 2: // src1 == dst + p_src0 = src0_; + p_src1 = dst_tst_; + src1_stride_ = dst_stride_; + src1_offset_ = dst_offset_; + break; + default: FAIL(); + } + + Execute(p_src0, p_src1); + + for (int r = 0; r < h_; ++r) { + for (int c = 0; c < w_; ++c) { + ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c], + dst_tst_[dst_offset_ + r * dst_stride_ + c]); + } + } + } + + T dst_ref_[kBufSize]; + T dst_tst_[kBufSize]; + uint32_t dst_stride_; + uint32_t dst_offset_; + + T src0_[kBufSize]; + uint32_t src0_stride_; + uint32_t src0_offset_; + + T src1_[kBufSize]; + uint32_t src1_stride_; + uint32_t src1_offset_; + + uint8_t mask_[kMaxMaskSize]; + + int w_; + int h_; +}; + +////////////////////////////////////////////////////////////////////////////// +// 8 bit version +////////////////////////////////////////////////////////////////////////////// + +typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, + uint32_t src1_stride, const uint8_t *mask, int w, int h); +typedef libaom_test::FuncParam TestFuncs; + +class BlendA64Mask1DTest8B : public BlendA64Mask1DTest { + protected: + void Execute(const uint8_t *p_src0, const uint8_t *p_src1) { + params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_, + src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, + w_, h_); + ASM_REGISTER_STATE_CHECK(params_.tst_func( + dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_, + src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, w_, h_)); + } +}; + +TEST_P(BlendA64Mask1DTest8B, RandomValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_.Rand8(); + dst_tst_[i] = rng_.Rand8(); + + src0_[i] = rng_.Rand8(); + src1_[i] = rng_.Rand8(); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + Common(); + } +} + 
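+// For reference, the operation under test is (per aom_dsp/blend.h):
+//   AOM_BLEND_A64(m, v0, v1) =
+//       ROUND_POWER_OF_TWO(m * v0 + (AOM_BLEND_A64_MAX_ALPHA - m) * v1,
+//                          AOM_BLEND_A64_ROUND_BITS)
+// with AOM_BLEND_A64_MAX_ALPHA == 64 and AOM_BLEND_A64_ROUND_BITS == 6;
+// RandomValues above draws masks over the full [0, 64] range, while
+// ExtremeValues below pins pixels and masks near their upper limits.
+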
+TEST_P(BlendA64Mask1DTest8B, ExtremeValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_(2) + 254; + dst_tst_[i] = rng_(2) + 254; + src0_[i] = rng_(2) + 254; + src1_[i] = rng_(2) + 254; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1; + + Common(); + } +} + +static void blend_a64_hmask_ref(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize] + [BlendA64Mask1DTest8B::kMaxMaskSize]; + + for (int row = 0; row < h; ++row) + for (int col = 0; col < w; ++col) mask2d[row][col] = mask[col]; + + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, w, h, + 0, 0); +} + +static void blend_a64_vmask_ref(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h) { + uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize] + [BlendA64Mask1DTest8B::kMaxMaskSize]; + + for (int row = 0; row < h; ++row) + for (int col = 0; col < w; ++col) mask2d[row][col] = mask[row]; + + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize, w, h, + 0, 0); +} + +INSTANTIATE_TEST_SUITE_P( + C, BlendA64Mask1DTest8B, + ::testing::Values(TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_c), + TestFuncs(blend_a64_vmask_ref, aom_blend_a64_vmask_c))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, BlendA64Mask1DTest8B, + ::testing::Values( + TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_sse4_1), + TestFuncs(blend_a64_vmask_ref, aom_blend_a64_vmask_sse4_1))); +#endif // HAVE_SSE4_1 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, BlendA64Mask1DTest8B, + ::testing::Values(TestFuncs(blend_a64_hmask_ref, aom_blend_a64_hmask_neon), + TestFuncs(blend_a64_vmask_ref, + aom_blend_a64_vmask_neon))); +#endif // HAVE_NEON + +////////////////////////////////////////////////////////////////////////////// +// High bit-depth version +////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, + uint32_t src1_stride, const uint8_t *mask, int w, int h, + int bd); +typedef libaom_test::FuncParam TestFuncsHBD; + +class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest { + protected: + void Execute(const uint16_t *p_src0, const uint16_t *p_src1) { + params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, + mask_, w_, h_, bit_depth_); + ASM_REGISTER_STATE_CHECK(params_.tst_func( + CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, mask_, w_, h_, + bit_depth_)); + } + + int bit_depth_; +}; + +TEST_P(BlendA64Mask1DTestHBD, RandomValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + switch (rng_(3)) { + case 0: bit_depth_ = 8; break; + case 1: bit_depth_ = 10; break; + default: bit_depth_ = 12; break; + } + + const int hi = 1 << bit_depth_; + 
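+    // Draw every source and destination pixel uniformly from the full
+    // [0, 1 << bit_depth_) range for the chosen bit depth.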
+ for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_(hi); + dst_tst_[i] = rng_(hi); + src0_[i] = rng_(hi); + src1_[i] = rng_(hi); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + Common(); + } +} + +TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) { + for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) { + switch (rng_(3)) { + case 0: bit_depth_ = 8; break; + case 1: bit_depth_ = 10; break; + default: bit_depth_ = 12; break; + } + + const int hi = 1 << bit_depth_; + const int lo = hi - 2; + + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_(hi - lo) + lo; + dst_tst_[i] = rng_(hi - lo) + lo; + src0_[i] = rng_(hi - lo) + lo; + src1_[i] = rng_(hi - lo) + lo; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1; + + Common(); + } +} + +static void highbd_blend_a64_hmask_ref( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize] + [BlendA64Mask1DTestHBD::kMaxMaskSize]; + + for (int row = 0; row < h; ++row) + for (int col = 0; col < w; ++col) mask2d[row][col] = mask[col]; + + aom_highbd_blend_a64_mask_c( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask2d[0][0], + BlendA64Mask1DTestHBD::kMaxMaskSize, w, h, 0, 0, bd); +} + +static void highbd_blend_a64_vmask_ref( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int w, int h, int bd) { + uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize] + [BlendA64Mask1DTestHBD::kMaxMaskSize]; + + for (int row = 0; row < h; ++row) + for (int col = 0; col < w; ++col) mask2d[row][col] = mask[row]; + + aom_highbd_blend_a64_mask_c( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask2d[0][0], + BlendA64Mask1DTestHBD::kMaxMaskSize, w, h, 0, 0, bd); +} + +INSTANTIATE_TEST_SUITE_P( + C, BlendA64Mask1DTestHBD, + ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref, + aom_highbd_blend_a64_hmask_c), + TestFuncsHBD(highbd_blend_a64_vmask_ref, + aom_highbd_blend_a64_vmask_c))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, BlendA64Mask1DTestHBD, + ::testing::Values(TestFuncsHBD(highbd_blend_a64_hmask_ref, + aom_highbd_blend_a64_hmask_sse4_1), + TestFuncsHBD(highbd_blend_a64_vmask_ref, + aom_highbd_blend_a64_vmask_sse4_1))); +#endif // HAVE_SSE4_1 +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/blend_a64_mask_test.cc b/libs/libaom/src/test/blend_a64_mask_test.cc new file mode 100644 index 000000000..5c2c291fd --- /dev/null +++ b/libs/libaom/src/test/blend_a64_mask_test.cc @@ -0,0 +1,620 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/register_state_check.h" +#include "test/function_equivalence_test.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_integer.h" + +#include "av1/common/enums.h" + +#include "aom_dsp/blend.h" + +using libaom_test::FunctionEquivalenceTest; + +namespace { + +template +class BlendA64MaskTest : public FunctionEquivalenceTest { + protected: + static const int kIterations = 10000; + static const int kMaxWidth = MAX_SB_SIZE * 5; // * 5 to cover longer strides + static const int kMaxHeight = MAX_SB_SIZE; + static const int kBufSize = kMaxWidth * kMaxHeight; + static const int kMaxMaskWidth = 2 * MAX_SB_SIZE; + static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth; + + virtual ~BlendA64MaskTest() {} + + virtual void Execute(const SrcPixel *p_src0, const SrcPixel *p_src1, + int run_times) = 0; + + template + void GetSources(Pixel **src0, Pixel **src1, Pixel * /*dst*/, int run_times) { + if (run_times > 1) { + *src0 = src0_; + *src1 = src1_; + return; + } + switch (this->rng_(3)) { + case 0: // Separate sources + *src0 = src0_; + *src1 = src1_; + break; + case 1: // src0 == dst + *src0 = dst_tst_; + src0_stride_ = dst_stride_; + src0_offset_ = dst_offset_; + *src1 = src1_; + break; + case 2: // src1 == dst + *src0 = src0_; + *src1 = dst_tst_; + src1_stride_ = dst_stride_; + src1_offset_ = dst_offset_; + break; + default: FAIL(); + } + } + + void GetSources(uint16_t **src0, uint16_t **src1, uint8_t * /*dst*/, + int /*run_times*/) { + *src0 = src0_; + *src1 = src1_; + } + + uint8_t Rand1() { return this->rng_.Rand8() & 1; } + + void RunOneTest(int block_size, int subx, int suby, int run_times) { + w_ = block_size_wide[block_size]; + h_ = block_size_high[block_size]; + run_times = run_times > 1 ? run_times / w_ : 1; + ASSERT_GT(run_times, 0); + subx_ = subx; + suby_ = suby; + + dst_offset_ = this->rng_(33); + dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; + + src0_offset_ = this->rng_(33); + src0_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; + + src1_offset_ = this->rng_(33); + src1_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_; + + mask_stride_ = + this->rng_(kMaxWidth + 1 - w_ * (subx_ ? 2 : 1)) + w_ * (subx_ ? 
2 : 1); + + SrcPixel *p_src0; + SrcPixel *p_src1; + + p_src0 = src0_; + p_src1 = src1_; + + GetSources(&p_src0, &p_src1, &dst_ref_[0], run_times); + + Execute(p_src0, p_src1, run_times); + + for (int r = 0; r < h_; ++r) { + for (int c = 0; c < w_; ++c) { + ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c], + dst_tst_[dst_offset_ + r * dst_stride_ + c]) + << w_ << "x" << h_ << " subx " << subx_ << " suby " << suby_ + << " r: " << r << " c: " << c; + } + } + } + + void RunTest(int block_size, int run_times) { + subx_ = Rand1(); + suby_ = Rand1(); + RunOneTest(block_size, subx_, suby_, run_times); + } + + DstPixel dst_ref_[kBufSize]; + DstPixel dst_tst_[kBufSize]; + uint32_t dst_stride_; + uint32_t dst_offset_; + + SrcPixel src0_[kBufSize]; + uint32_t src0_stride_; + uint32_t src0_offset_; + + SrcPixel src1_[kBufSize]; + uint32_t src1_stride_; + uint32_t src1_offset_; + + uint8_t mask_[kMaxMaskSize]; + size_t mask_stride_; + + int w_; + int h_; + + int suby_; + int subx_; +}; + +////////////////////////////////////////////////////////////////////////////// +// 8 bit version +////////////////////////////////////////////////////////////////////////////// + +typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int w, int h, int subx, int suby); +typedef libaom_test::FuncParam TestFuncs; + +class BlendA64MaskTest8B : public BlendA64MaskTest { + protected: + void Execute(const uint8_t *p_src0, const uint8_t *p_src1, int run_times) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.tst_func(dst_tst_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast(aom_usec_timer_elapsed(&timer)); + if (run_times > 1) { + printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_, + time1, time2); + printf("(%3.2f)\n", time1 / time2); + } + } +}; + +TEST_P(BlendA64MaskTest8B, RandomValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_.Rand8(); + dst_tst_[i] = rng_.Rand8(); + + src0_[i] = rng_.Rand8(); + src1_[i] = rng_.Rand8(); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunTest(bsize, 1); + } +} + +TEST_P(BlendA64MaskTest8B, ExtremeValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_(2) + 254; + dst_tst_[i] = rng_(2) + 254; + src0_[i] = rng_(2) + 254; + src1_[i] = rng_(2) + 254; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1; + + RunTest(bsize, 1); + } +} +TEST_P(BlendA64MaskTest8B, DISABLED_Speed) { + const int kRunTimes = 10000000; + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + for (int i = 0; i < 
kBufSize; ++i) { + dst_ref_[i] = rng_.Rand8(); + dst_tst_[i] = rng_.Rand8(); + + src0_[i] = rng_.Rand8(); + src1_[i] = rng_.Rand8(); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunOneTest(bsize, 1, 1, kRunTimes); + RunOneTest(bsize, 1, 0, kRunTimes); + RunOneTest(bsize, 0, 1, kRunTimes); + RunOneTest(bsize, 0, 0, kRunTimes); + } +} +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1, BlendA64MaskTest8B, + ::testing::Values(TestFuncs( + aom_blend_a64_mask_c, aom_blend_a64_mask_sse4_1))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, BlendA64MaskTest8B, + ::testing::Values(TestFuncs(aom_blend_a64_mask_sse4_1, + aom_blend_a64_mask_avx2))); +#endif // HAVE_AVX2 + +////////////////////////////////////////////////////////////////////////////// +// 8 bit _d16 version +////////////////////////////////////////////////////////////////////////////// + +typedef void (*F8B_D16)(uint8_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int w, int h, int subx, int suby, + ConvolveParams *conv_params); +typedef libaom_test::FuncParam TestFuncs_d16; + +class BlendA64MaskTest8B_d16 + : public BlendA64MaskTest { + protected: + // max number of bits used by the source + static const int kSrcMaxBitsMask = 0x3fff; + + void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) { + ConvolveParams conv_params; + conv_params.round_0 = ROUND0_BITS; + conv_params.round_1 = COMPOUND_ROUND1_BITS; + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.tst_func(dst_tst_ + dst_offset_, dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast(aom_usec_timer_elapsed(&timer)); + if (run_times > 1) { + printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_, + time1, time2); + printf("(%3.2f)\n", time1 / time2); + } + } +}; + +TEST_P(BlendA64MaskTest8B_d16, RandomValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_.Rand8(); + dst_tst_[i] = rng_.Rand8(); + + src0_[i] = rng_.Rand16() & kSrcMaxBitsMask; + src1_[i] = rng_.Rand16() & kSrcMaxBitsMask; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunTest(bsize, 1); + } +} + +TEST_P(BlendA64MaskTest8B_d16, ExtremeValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = 255; + dst_tst_[i] = 255; + + src0_[i] = kSrcMaxBitsMask; + src1_[i] = kSrcMaxBitsMask; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = AOM_BLEND_A64_MAX_ALPHA - 1; + + RunTest(bsize, 1); + } +} + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, BlendA64MaskTest8B_d16, + 
::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c, + aom_lowbd_blend_a64_d16_mask_sse4_1))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, BlendA64MaskTest8B_d16, + ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c, + aom_lowbd_blend_a64_d16_mask_avx2))); +#endif // HAVE_AVX2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, BlendA64MaskTest8B_d16, + ::testing::Values(TestFuncs_d16(aom_lowbd_blend_a64_d16_mask_c, + aom_lowbd_blend_a64_d16_mask_neon))); +#endif // HAVE_NEON + +////////////////////////////////////////////////////////////////////////////// +// High bit-depth version +////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int w, int h, int subx, int suby, + int bd); +typedef libaom_test::FuncParam TestFuncsHBD; + +class BlendA64MaskTestHBD : public BlendA64MaskTest { + protected: + void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, + mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_, + CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_, + CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, + mask_, kMaxMaskWidth, w_, h_, subx_, suby_, bit_depth_); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast(aom_usec_timer_elapsed(&timer)); + if (run_times > 1) { + printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_, + time1, time2); + printf("(%3.2f)\n", time1 / time2); + } + } + + int bit_depth_; +}; + +TEST_P(BlendA64MaskTestHBD, RandomValues) { + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + switch (rng_(3)) { + case 0: bit_depth_ = 8; break; + case 1: bit_depth_ = 10; break; + default: bit_depth_ = 12; break; + } + + const int hi = 1 << bit_depth_; + + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_(hi); + dst_tst_[i] = rng_(hi); + src0_[i] = rng_(hi); + src1_[i] = rng_(hi); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunTest(bsize, 1); + } +} + +TEST_P(BlendA64MaskTestHBD, ExtremeValues) { + for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + switch (rng_(3)) { + case 0: bit_depth_ = 8; break; + case 1: bit_depth_ = 10; break; + default: bit_depth_ = 12; break; + } + + const int hi = 1 << bit_depth_; + const int lo = hi - 2; + + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_(hi - lo) + lo; + dst_tst_[i] = rng_(hi - lo) + lo; + src0_[i] = rng_(hi - lo) + lo; + src1_[i] = rng_(hi - lo) + lo; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(2) + AOM_BLEND_A64_MAX_ALPHA - 1; + + RunTest(bsize, 1); + } +} + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, 
BlendA64MaskTestHBD, + ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c, + aom_highbd_blend_a64_mask_sse4_1))); +#endif // HAVE_SSE4_1 + +////////////////////////////////////////////////////////////////////////////// +// HBD _d16 version +////////////////////////////////////////////////////////////////////////////// + +typedef void (*FHBD_D16)(uint8_t *dst, uint32_t dst_stride, + const CONV_BUF_TYPE *src0, uint32_t src0_stride, + const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subx, int suby, ConvolveParams *conv_params, + const int bd); +typedef libaom_test::FuncParam TestFuncsHBD_d16; + +class BlendA64MaskTestHBD_d16 + : public BlendA64MaskTest { + protected: + // max number of bits used by the source + static const int kSrcMaxBitsMask = (1 << 14) - 1; + static const int kSrcMaxBitsMaskHBD = (1 << 16) - 1; + + void Execute(const uint16_t *p_src0, const uint16_t *p_src1, int run_times) { + ASSERT_GT(run_times, 0) << "Cannot run 0 iterations of the test."; + ConvolveParams conv_params; + conv_params.round_0 = (bit_depth_ == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; + conv_params.round_1 = COMPOUND_ROUND1_BITS; + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_, + p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params, + bit_depth_); + } + if (params_.tst_func) { + aom_usec_timer_mark(&timer); + const double time1 = static_cast(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + params_.tst_func(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), + dst_stride_, p_src0 + src0_offset_, src0_stride_, + p_src1 + src1_offset_, src1_stride_, mask_, + kMaxMaskWidth, w_, h_, subx_, suby_, &conv_params, + bit_depth_); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast(aom_usec_timer_elapsed(&timer)); + if (run_times > 1) { + printf("%3dx%-3d subx %d suby %d :%7.2f/%7.2fns", w_, h_, subx_, suby_, + time1, time2); + printf("(%3.2f)\n", time1 / time2); + } + } + } + + int bit_depth_; + int src_max_bits_mask_; +}; + +TEST_P(BlendA64MaskTestHBD_d16, RandomValues) { + if (params_.tst_func == NULL) return; + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + int bsize = rng_.Rand8() % BLOCK_SIZES_ALL; + switch (rng_(3)) { + case 0: bit_depth_ = 8; break; + case 1: bit_depth_ = 10; break; + default: bit_depth_ = 12; break; + } + src_max_bits_mask_ = + (bit_depth_ == 8) ? kSrcMaxBitsMask : kSrcMaxBitsMaskHBD; + + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_.Rand8(); + dst_tst_[i] = rng_.Rand8(); + + src0_[i] = rng_.Rand16() & src_max_bits_mask_; + src1_[i] = rng_.Rand16() & src_max_bits_mask_; + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunTest(bsize, 1); + } +} +// TODO (Scott LaVarnway), fix this test +TEST_P(BlendA64MaskTestHBD_d16, DISABLED_SaturatedValues) { + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) { + src_max_bits_mask_ = + (bit_depth_ == 8) ? 
kSrcMaxBitsMask : kSrcMaxBitsMaskHBD; + + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = 0; + dst_tst_[i] = (1 << bit_depth_) - 1; + + src0_[i] = src_max_bits_mask_; + src1_[i] = src_max_bits_mask_; + } + + for (int i = 0; i < kMaxMaskSize; ++i) mask_[i] = AOM_BLEND_A64_MAX_ALPHA; + + RunTest(bsize, 1); + } + } +} +TEST_P(BlendA64MaskTestHBD_d16, DISABLED_Speed) { + const int kRunTimes = 10000000; + for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { + for (bit_depth_ = 8; bit_depth_ <= 12; bit_depth_ += 2) { + for (int i = 0; i < kBufSize; ++i) { + dst_ref_[i] = rng_.Rand12() % (1 << bit_depth_); + dst_tst_[i] = rng_.Rand12() % (1 << bit_depth_); + + src0_[i] = rng_.Rand16(); + src1_[i] = rng_.Rand16(); + } + + for (int i = 0; i < kMaxMaskSize; ++i) + mask_[i] = rng_(AOM_BLEND_A64_MAX_ALPHA + 1); + + RunOneTest(bsize, 1, 1, kRunTimes); + RunOneTest(bsize, 0, 0, kRunTimes); + } + } +} + +INSTANTIATE_TEST_SUITE_P( + C, BlendA64MaskTestHBD_d16, + ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c, NULL))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, BlendA64MaskTestHBD_d16, + ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c, + aom_highbd_blend_a64_d16_mask_sse4_1))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, BlendA64MaskTestHBD_d16, + ::testing::Values(TestFuncsHBD_d16(aom_highbd_blend_a64_d16_mask_c, + aom_highbd_blend_a64_d16_mask_avx2))); +#endif // HAVE_AVX2 + +// TODO(slavarnway): Enable the following in the avx2 commit. (56501) +#if 0 +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, BlendA64MaskTestHBD, + ::testing::Values(TestFuncsHBD(aom_highbd_blend_a64_mask_c, + aom_highbd_blend_a64_mask_avx2))); +#endif // HAVE_AVX2 +#endif +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/blockd_test.cc b/libs/libaom/src/test/blockd_test.cc new file mode 100644 index 000000000..17e696863 --- /dev/null +++ b/libs/libaom/src/test/blockd_test.cc @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/blockd.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +// Verify the optimized implementation of get_partition_subsize() produces the +// same results as the Partition_Subsize lookup table in the spec. +TEST(BlockdTest, GetPartitionSubsize) { + // The Partition_Subsize table in the spec (Section 9.3. Conversion tables). 
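+  // [Editorial note, not part of the original libaom source] The rows of the
+  // table below follow the ten PARTITION_* enum values (PARTITION_NONE
+  // through PARTITION_VERT_4) and the columns follow BLOCK_SIZES_ALL, so a
+  // lookup reads as kPartitionSubsize[partition][bsize]; e.g. the
+  // PARTITION_SPLIT row maps BLOCK_64X64 to BLOCK_32X32, and an entry is
+  // BLOCK_INVALID wherever the spec defines no subsize.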
+ /* clang-format off */ + static const BLOCK_SIZE kPartitionSubsize[10][BLOCK_SIZES_ALL] = { + { + BLOCK_4X4, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X128, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X8, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + }, { + BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X32, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X64, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, + BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID + } + }; + /* clang-format on */ + + for 
(int partition = 0; partition < 10; partition++) {
+    for (int bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; bsize++) {
+      EXPECT_EQ(kPartitionSubsize[partition][bsize],
+                get_partition_subsize(static_cast<BLOCK_SIZE>(bsize),
+                                      static_cast<PARTITION_TYPE>(partition)));
+    }
+  }
+}
diff --git a/libs/libaom/src/test/boolcoder_test.cc b/libs/libaom/src/test/boolcoder_test.cc
new file mode 100644
index 000000000..680ec1877
--- /dev/null
+++ b/libs/libaom/src/test/boolcoder_test.cc
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include
+#include
+#include
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitwriter.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int num_tests = 10;
+}  // namespace
+
+TEST(AV1, TestBitIO) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int n = 0; n < num_tests; ++n) {
+    for (int method = 0; method <= 7; ++method) {  // generate various
+                                                   // probability patterns
+      const int kBitsToTest = 1000;
+      uint8_t probas[kBitsToTest];
+
+      for (int i = 0; i < kBitsToTest; ++i) {
+        const int parity = i & 1;
+        /* clang-format off */
+        probas[i] =
+            (method == 0) ? 0 : (method == 1) ? 255 :
+            (method == 2) ? 128 :
+            (method == 3) ? rnd.Rand8() :
+            (method == 4) ? (parity ? 0 : 255) :
+            // alternate between low and high proba:
+            (method == 5) ? (parity ? rnd(128) : 255 - rnd(128)) :
+            (method == 6) ?
+                (parity ? rnd(64) : 255 - rnd(64)) :
+                (parity ? rnd(32) : 255 - rnd(32));
+        /* clang-format on */
+      }
+      for (int bit_method = 0; bit_method <= 3; ++bit_method) {
+        const int random_seed = 6432;
+        const int kBufferSize = 10000;
+        ACMRandom bit_rnd(random_seed);
+        aom_writer bw;
+        uint8_t bw_buffer[kBufferSize];
+        aom_start_encode(&bw, bw_buffer);
+
+        int bit = (bit_method == 0) ? 0 : (bit_method == 1) ? 1 : 0;
+        for (int i = 0; i < kBitsToTest; ++i) {
+          if (bit_method == 2) {
+            bit = (i & 1);
+          } else if (bit_method == 3) {
+            bit = bit_rnd(2);
+          }
+          aom_write(&bw, bit, static_cast<int>(probas[i]));
+        }
+
+        aom_stop_encode(&bw);
+
+        aom_reader br;
+        aom_reader_init(&br, bw_buffer, bw.pos);
+        bit_rnd.Reset(random_seed);
+        for (int i = 0; i < kBitsToTest; ++i) {
+          if (bit_method == 2) {
+            bit = (i & 1);
+          } else if (bit_method == 3) {
+            bit = bit_rnd(2);
+          }
+          GTEST_ASSERT_EQ(aom_read(&br, probas[i], NULL), bit)
+              << "pos: " << i << " / " << kBitsToTest
+              << " bit_method: " << bit_method << " method: " << method;
+        }
+      }
+    }
+  }
+}
+
+#define FRAC_DIFF_TOTAL_ERROR 0.18
+
+TEST(AV1, TestTell) {
+  const int kBufferSize = 10000;
+  aom_writer bw;
+  uint8_t bw_buffer[kBufferSize];
+  const int kSymbols = 1024;
+  // Coders are noisier at low probabilities, so we start at p = 4.
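+  // [Editorial note, not part of the original libaom source] What the loop
+  // below verifies: aom_reader_tell() reports whole bits consumed, while
+  // aom_reader_tell_frac() reports 1/8-bit units, so
+  // tell == (tell_frac + 7) >> 3 must always hold. An ideal coder spends
+  // about -log2(p / 256) bits per zero symbol written with probability p;
+  // e.g. at p = 128 each symbol should advance tell_frac by roughly 8 units
+  // (one full bit).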
+  for (int p = 4; p < 256; p++) {
+    double probability = p / 256.;
+    aom_start_encode(&bw, bw_buffer);
+    for (int i = 0; i < kSymbols; i++) {
+      aom_write(&bw, 0, p);
+    }
+    aom_stop_encode(&bw);
+    aom_reader br;
+    aom_reader_init(&br, bw_buffer, bw.pos);
+    uint32_t last_tell = aom_reader_tell(&br);
+    uint32_t last_tell_frac = aom_reader_tell_frac(&br);
+    double frac_diff_total = 0;
+    GTEST_ASSERT_GE(aom_reader_tell(&br), 0u);
+    GTEST_ASSERT_LE(aom_reader_tell(&br), 1u);
+    ASSERT_FALSE(aom_reader_has_overflowed(&br));
+    for (int i = 0; i < kSymbols; i++) {
+      aom_read(&br, p, NULL);
+      uint32_t tell = aom_reader_tell(&br);
+      uint32_t tell_frac = aom_reader_tell_frac(&br);
+      GTEST_ASSERT_GE(tell, last_tell)
+          << "tell: " << tell << ", last_tell: " << last_tell;
+      GTEST_ASSERT_GE(tell_frac, last_tell_frac)
+          << "tell_frac: " << tell_frac
+          << ", last_tell_frac: " << last_tell_frac;
+      // Frac tell should round up to tell.
+      GTEST_ASSERT_EQ(tell, (tell_frac + 7) >> 3);
+      last_tell = tell;
+      frac_diff_total +=
+          fabs(((tell_frac - last_tell_frac) / 8.0) + log2(probability));
+      last_tell_frac = tell_frac;
+    }
+    const uint32_t expected = (uint32_t)(-kSymbols * log2(probability));
+    // Last tell should be close to the expected value.
+    GTEST_ASSERT_LE(last_tell, expected + 20) << " last_tell: " << last_tell;
+    // The average frac_diff error should be pretty small.
+    GTEST_ASSERT_LE(frac_diff_total / kSymbols, FRAC_DIFF_TOTAL_ERROR)
+        << " frac_diff_total: " << frac_diff_total;
+    ASSERT_FALSE(aom_reader_has_overflowed(&br));
+  }
+}
+
+TEST(AV1, TestHasOverflowed) {
+  const int kBufferSize = 10000;
+  aom_writer bw;
+  uint8_t bw_buffer[kBufferSize];
+  const int kSymbols = 1024;
+  // Coders are noisier at low probabilities, so we start at p = 4.
+  for (int p = 4; p < 256; p++) {
+    aom_start_encode(&bw, bw_buffer);
+    for (int i = 0; i < kSymbols; i++) {
+      aom_write(&bw, 1, p);
+    }
+    aom_stop_encode(&bw);
+    aom_reader br;
+    aom_reader_init(&br, bw_buffer, bw.pos);
+    ASSERT_FALSE(aom_reader_has_overflowed(&br));
+    for (int i = 0; i < kSymbols; i++) {
+      GTEST_ASSERT_EQ(aom_read(&br, p, NULL), 1);
+      ASSERT_FALSE(aom_reader_has_overflowed(&br));
+    }
+    // In the worst case, the encoder uses just a tiny fraction of the last
+    // byte in the buffer. So to guarantee that aom_reader_has_overflowed()
+    // returns true, we have to consume very nearly 8 additional bits of
+    // data. In the worst case, one of the bits in that byte will be 1, and
+    // the rest will be zero. Once we are past that 1 bit, when the
+    // probability of reading a zero symbol from aom_read() is high, each
+    // additional symbol read consumes very little additional data (in the
+    // case that p == 255, approximately -log_2(255/256) ~= 0.0056 bits). In
+    // that case it would take around 178 calls to consume more than 8 bits.
+    // That is only an upper bound; in practice we are not guaranteed to hit
+    // the worst case and can get away with 174 calls.
+    for (int i = 0; i < 174; i++) {
+      aom_read(&br, p, NULL);
+    }
+    ASSERT_TRUE(aom_reader_has_overflowed(&br));
+  }
+}
diff --git a/libs/libaom/src/test/borders_test.cc b/libs/libaom/src/test/borders_test.cc
new file mode 100644
index 000000000..31eacab12
--- /dev/null
+++ b/libs/libaom/src/test/borders_test.cc
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include
+#include
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class BordersTestLarge
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  BordersTestLarge() : EncoderTest(GET_PARAM(0)) {}
+  virtual ~BordersTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, 1);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    if (pkt->data.frame.flags & AOM_FRAME_IS_KEY) {
+    }
+  }
+};
+
+TEST_P(BordersTestLarge, TestEncodeHighBitrate) {
+  // Validate that this clip, whose width is not a multiple of 64, encodes
+  // and decodes without a mismatch when passing in a very low max q. This
+  // pushes the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.rc_max_quantizer = 10;
+
+  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       10);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+TEST_P(BordersTestLarge, TestLowBitrate) {
+  // Validate that this clip encodes and decodes without a mismatch
+  // when passing in a very high min q. This pushes the encoder to producing
+  // lots of small partitions, which should test the other condition.
+
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.rc_min_quantizer = 40;
+
+  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       10);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+AV1_INSTANTIATE_TEST_CASE(BordersTestLarge,
+                          ::testing::Values(::libaom_test::kTwoPassGood));
+}  // namespace
diff --git a/libs/libaom/src/test/cdef_test.cc b/libs/libaom/src/test/cdef_test.cc
new file mode 100644
index 000000000..a2ec1e31e
--- /dev/null
+++ b/libs/libaom/src/test/cdef_test.cc
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_timer.h" +#include "av1/common/cdef_block.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +using libaom_test::ACMRandom; + +namespace { + +typedef std::tuple + cdef_dir_param_t; + +class CDEFBlockTest : public ::testing::TestWithParam { + public: + virtual ~CDEFBlockTest() {} + virtual void SetUp() { + cdef = GET_PARAM(0); + ref_cdef = GET_PARAM(1); + bsize = GET_PARAM(2); + boundary = GET_PARAM(3); + depth = GET_PARAM(4); + } + + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + int bsize; + int boundary; + int depth; + cdef_filter_block_func cdef; + cdef_filter_block_func ref_cdef; +}; + +typedef CDEFBlockTest CDEFSpeedTest; + +void test_cdef(int bsize, int iterations, cdef_filter_block_func cdef, + cdef_filter_block_func ref_cdef, int boundary, int depth) { + const int size = 8; + const int ysize = size + 2 * CDEF_VBORDER; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, uint16_t, s[ysize * CDEF_BSTRIDE]); + DECLARE_ALIGNED(16, static uint16_t, d[size * size]); + DECLARE_ALIGNED(16, static uint16_t, ref_d[size * size]); + memset(ref_d, 0, sizeof(ref_d)); + memset(d, 0, sizeof(d)); + + int error = 0, pristrength = 0, secstrength, dir; + int pridamping, secdamping, bits, level, count, + errdepth = 0, errpristrength = 0, errsecstrength = 0, errboundary = 0, + errpridamping = 0, errsecdamping = 0; + unsigned int pos = 0; + + const unsigned int max_pos = size * size >> static_cast(depth == 8); + for (pridamping = 3 + depth - 8; pridamping < 7 - 3 * !!boundary + depth - 8; + pridamping++) { + for (secdamping = 3 + depth - 8; + secdamping < 7 - 3 * !!boundary + depth - 8; secdamping++) { + for (count = 0; count < iterations; count++) { + for (level = 0; level < (1 << depth) && !error; + level += (2 + 6 * !!boundary) << (depth - 8)) { + for (bits = 1; bits <= depth && !error; bits += 1 + 3 * !!boundary) { + for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++) + s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0, + (1 << depth) - 1); + if (boundary) { + if (boundary & 1) { // Left + for (int i = 0; i < ysize; i++) + for (int j = 0; j < CDEF_HBORDER; j++) + s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE; + } + if (boundary & 2) { // Right + for (int i = 0; i < ysize; i++) + for (int j = CDEF_HBORDER + size; j < CDEF_BSTRIDE; j++) + s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE; + } + if (boundary & 4) { // Above + for (int i = 0; i < CDEF_VBORDER; i++) + for (int j = 0; j < CDEF_BSTRIDE; j++) + s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE; + } + if (boundary & 8) { // Below + for (int i = CDEF_VBORDER + size; i < ysize; i++) + for (int j = 0; j < CDEF_BSTRIDE; j++) + s[i * CDEF_BSTRIDE + j] = CDEF_VERY_LARGE; + } + } + for (dir = 0; dir < 8; dir++) { + for (pristrength = 0; pristrength <= 19 << (depth - 8) && !error; + pristrength += (1 + 4 * !!boundary) << (depth - 8)) { + if (pristrength == 16) pristrength = 19; + for (secstrength = 0; secstrength <= 4 << (depth - 8) && !error; + secstrength += 1 << (depth - 8)) { + if (secstrength == 3 << (depth - 8)) continue; + ref_cdef(depth == 8 ? 
(uint8_t *)ref_d : 0, ref_d, size, + s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE, + pristrength, secstrength, dir, pridamping, + secdamping, bsize, depth - 8); + // If cdef and ref_cdef are the same, we're just testing + // speed + if (cdef != ref_cdef) + ASM_REGISTER_STATE_CHECK( + cdef(depth == 8 ? (uint8_t *)d : 0, d, size, + s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE, + pristrength, secstrength, dir, pridamping, + secdamping, bsize, depth - 8)); + if (ref_cdef != cdef) { + for (pos = 0; pos < max_pos && !error; pos++) { + error = ref_d[pos] != d[pos]; + errdepth = depth; + errpristrength = pristrength; + errsecstrength = secstrength; + errboundary = boundary; + errpridamping = pridamping; + errsecdamping = secdamping; + } + } + } + } + } + } + } + } + } + } + + pos--; + EXPECT_EQ(0, error) << "Error: CDEFBlockTest, SIMD and C mismatch." + << std::endl + << "First error at " << pos % size << "," << pos / size + << " (" << (int16_t)ref_d[pos] << " : " << (int16_t)d[pos] + << ") " << std::endl + << "pristrength: " << errpristrength << std::endl + << "pridamping: " << errpridamping << std::endl + << "secstrength: " << errsecstrength << std::endl + << "secdamping: " << errsecdamping << std::endl + << "depth: " << errdepth << std::endl + << "size: " << bsize << std::endl + << "boundary: " << errboundary << std::endl + << std::endl; +} + +void test_cdef_speed(int bsize, int iterations, cdef_filter_block_func cdef, + cdef_filter_block_func ref_cdef, int boundary, int depth) { + aom_usec_timer ref_timer; + aom_usec_timer timer; + + aom_usec_timer_start(&ref_timer); + test_cdef(bsize, iterations, ref_cdef, ref_cdef, boundary, depth); + aom_usec_timer_mark(&ref_timer); + int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer_start(&timer); + test_cdef(bsize, iterations, cdef, cdef, boundary, depth); + aom_usec_timer_mark(&timer); + int elapsed_time = (int)aom_usec_timer_elapsed(&timer); + + EXPECT_GT(ref_elapsed_time, elapsed_time) + << "Error: CDEFSpeedTest, SIMD slower than C." 
<< std::endl + << "C time: " << ref_elapsed_time << " us" << std::endl + << "SIMD time: " << elapsed_time << " us" << std::endl; +} + +typedef int (*find_dir_t)(const uint16_t *img, int stride, int32_t *var, + int coeff_shift); + +typedef std::tuple find_dir_param_t; + +class CDEFFindDirTest : public ::testing::TestWithParam { + public: + virtual ~CDEFFindDirTest() {} + virtual void SetUp() { + finddir = GET_PARAM(0); + ref_finddir = GET_PARAM(1); + } + + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + find_dir_t finddir; + find_dir_t ref_finddir; +}; + +typedef CDEFFindDirTest CDEFFindDirSpeedTest; + +void test_finddir(int (*finddir)(const uint16_t *img, int stride, int32_t *var, + int coeff_shift), + int (*ref_finddir)(const uint16_t *img, int stride, + int32_t *var, int coeff_shift)) { + const int size = 8; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, uint16_t, s[size * size]); + + int error = 0; + int depth, bits, level, count, errdepth = 0; + int ref_res = 0, res = 0; + int32_t ref_var = 0, var = 0; + + for (depth = 8; depth <= 12 && !error; depth += 2) { + for (count = 0; count < 512 && !error; count++) { + for (level = 0; level < (1 << depth) && !error; + level += 1 << (depth - 8)) { + for (bits = 1; bits <= depth && !error; bits++) { + for (unsigned int i = 0; i < sizeof(s) / sizeof(*s); i++) + s[i] = clamp((rnd.Rand16() & ((1 << bits) - 1)) + level, 0, + (1 << depth) - 1); + for (int c = 0; c < 1 + 9 * (finddir == ref_finddir); c++) + ref_res = ref_finddir(s, size, &ref_var, depth - 8); + if (finddir != ref_finddir) + ASM_REGISTER_STATE_CHECK(res = finddir(s, size, &var, depth - 8)); + if (ref_finddir != finddir) { + if (res != ref_res || var != ref_var) error = 1; + errdepth = depth; + } + } + } + } + } + + EXPECT_EQ(0, error) << "Error: CDEFFindDirTest, SIMD and C mismatch." + << std::endl + << "return: " << res << " : " << ref_res << std::endl + << "var: " << var << " : " << ref_var << std::endl + << "depth: " << errdepth << std::endl + << std::endl; +} + +void test_finddir_speed(int (*finddir)(const uint16_t *img, int stride, + int32_t *var, int coeff_shift), + int (*ref_finddir)(const uint16_t *img, int stride, + int32_t *var, int coeff_shift)) { + aom_usec_timer ref_timer; + aom_usec_timer timer; + + aom_usec_timer_start(&ref_timer); + test_finddir(ref_finddir, ref_finddir); + aom_usec_timer_mark(&ref_timer); + int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer_start(&timer); + test_finddir(finddir, finddir); + aom_usec_timer_mark(&timer); + int elapsed_time = (int)aom_usec_timer_elapsed(&timer); + + EXPECT_GT(ref_elapsed_time, elapsed_time) + << "Error: CDEFFindDirSpeedTest, SIMD slower than C." << std::endl + << "C time: " << ref_elapsed_time << " us" << std::endl + << "SIMD time: " << elapsed_time << " us" << std::endl; +} + +TEST_P(CDEFBlockTest, TestSIMDNoMismatch) { + test_cdef(bsize, 1, cdef, ref_cdef, boundary, depth); +} + +TEST_P(CDEFSpeedTest, DISABLED_TestSpeed) { + test_cdef_speed(bsize, 4, cdef, ref_cdef, boundary, depth); +} + +TEST_P(CDEFFindDirTest, TestSIMDNoMismatch) { + test_finddir(finddir, ref_finddir); +} + +TEST_P(CDEFFindDirSpeedTest, DISABLED_TestSpeed) { + test_finddir_speed(finddir, ref_finddir); +} + +using std::make_tuple; + +// VS compiling for 32 bit targets does not support vector types in +// structs as arguments, which makes the v256 type of the intrinsics +// hard to support, so optimizations for this target are disabled. 
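+// [Editorial sketch, not part of the original libaom source; the helper name
+// is hypothetical] In the instantiations that follow,
+// ::testing::Range(0, 16) enumerates the 4-bit boundary mask that
+// test_cdef() pads with CDEF_VERY_LARGE, and ::testing::Range(8, 13, 2)
+// covers bit depths 8, 10 and 12. A minimal decoder for the mask, mirroring
+// the padding branches in test_cdef():
+inline void decode_cdef_boundary_sketch(int boundary, bool *left, bool *right,
+                                        bool *above, bool *below) {
+  *left = (boundary & 1) != 0;   // pad columns left of the block
+  *right = (boundary & 2) != 0;  // pad columns right of the block
+  *above = (boundary & 4) != 0;  // pad rows above the block
+  *below = (boundary & 8) != 0;  // pad rows below the block
+}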
+#if defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__) +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, CDEFBlockTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_sse2), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirTest, + ::testing::Values(make_tuple(&cdef_find_dir_sse2, + &cdef_find_dir_c))); +#endif +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, CDEFBlockTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_ssse3), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirTest, + ::testing::Values(make_tuple(&cdef_find_dir_ssse3, + &cdef_find_dir_c))); +#endif + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, CDEFBlockTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_sse4_1), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(SSE4_1, CDEFFindDirTest, + ::testing::Values(make_tuple(&cdef_find_dir_sse4_1, + &cdef_find_dir_c))); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, CDEFBlockTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_avx2), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirTest, + ::testing::Values(make_tuple(&cdef_find_dir_avx2, + &cdef_find_dir_c))); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, CDEFBlockTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_neon), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirTest, + ::testing::Values(make_tuple(&cdef_find_dir_neon, + &cdef_find_dir_c))); +#endif + +// Test speed for all supported architectures +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, CDEFSpeedTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_sse2), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirSpeedTest, + ::testing::Values(make_tuple(&cdef_find_dir_sse2, + &cdef_find_dir_c))); +#endif + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, CDEFSpeedTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_ssse3), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(SSSE3, CDEFFindDirSpeedTest, + ::testing::Values(make_tuple(&cdef_find_dir_ssse3, + &cdef_find_dir_c))); +#endif + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, CDEFSpeedTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_sse4_1), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(SSE4_1, CDEFFindDirSpeedTest, + 
::testing::Values(make_tuple(&cdef_find_dir_sse4_1, + &cdef_find_dir_c))); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, CDEFSpeedTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_avx2), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(AVX2, CDEFFindDirSpeedTest, + ::testing::Values(make_tuple(&cdef_find_dir_avx2, + &cdef_find_dir_c))); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, CDEFSpeedTest, + ::testing::Combine(::testing::Values(&cdef_filter_block_neon), + ::testing::Values(&cdef_filter_block_c), + ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8), + ::testing::Range(0, 16), ::testing::Range(8, 13, 2))); +INSTANTIATE_TEST_SUITE_P(NEON, CDEFFindDirSpeedTest, + ::testing::Values(make_tuple(&cdef_find_dir_neon, + &cdef_find_dir_c))); +#endif + +#endif // defined(_WIN64) || !defined(_MSC_VER) +} // namespace diff --git a/libs/libaom/src/test/cfl_test.cc b/libs/libaom/src/test/cfl_test.cc new file mode 100644 index 000000000..d2973159c --- /dev/null +++ b/libs/libaom/src/test/cfl_test.cc @@ -0,0 +1,585 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_timer.h" +#include "test/util.h" +#include "test/acm_random.h" + +using std::make_tuple; + +using libaom_test::ACMRandom; + +#define NUM_ITERATIONS (100) +#define NUM_ITERATIONS_SPEED (INT16_MAX) + +#define ALL_CFL_TX_SIZES(function) \ + make_tuple(static_cast(TX_4X4), &function), \ + make_tuple(static_cast(TX_4X8), &function), \ + make_tuple(static_cast(TX_4X16), &function), \ + make_tuple(static_cast(TX_8X4), &function), \ + make_tuple(static_cast(TX_8X8), &function), \ + make_tuple(static_cast(TX_8X16), &function), \ + make_tuple(static_cast(TX_8X32), &function), \ + make_tuple(static_cast(TX_16X4), &function), \ + make_tuple(static_cast(TX_16X8), &function), \ + make_tuple(static_cast(TX_16X16), &function), \ + make_tuple(static_cast(TX_16X32), &function), \ + make_tuple(static_cast(TX_32X8), &function), \ + make_tuple(static_cast(TX_32X16), &function), \ + make_tuple(static_cast(TX_32X32), &function) + +#define ALL_CFL_TX_SIZES_SUBSAMPLE(fun420, fun422, fun444) \ + make_tuple(static_cast(TX_4X4), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_4X8), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_4X16), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_8X4), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_8X8), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_8X16), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_8X32), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_16X4), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_16X8), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_16X16), &fun420, &fun422, &fun444), \ + 
make_tuple(static_cast(TX_16X32), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_32X8), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_32X16), &fun420, &fun422, &fun444), \ + make_tuple(static_cast(TX_32X32), &fun420, &fun422, &fun444) + +namespace { + +template +static void assert_eq(const A *a, const A *b, int width, int height) { + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + ASSERT_EQ(a[j * CFL_BUF_LINE + i], b[j * CFL_BUF_LINE + i]); + } + } +} + +static void assertFaster(int ref_elapsed_time, int elapsed_time) { + EXPECT_GT(ref_elapsed_time, elapsed_time) + << "Error: CFLSubtractSpeedTest, SIMD slower than C." << std::endl + << "C time: " << ref_elapsed_time << " us" << std::endl + << "SIMD time: " << elapsed_time << " us" << std::endl; +} + +static void printSpeed(int ref_elapsed_time, int elapsed_time, int width, + int height) { + std::cout.precision(2); + std::cout << "[ ] " << width << "x" << height + << ": C time = " << ref_elapsed_time + << " us, SIMD time = " << elapsed_time << " us" + << " (~" << ref_elapsed_time / (double)elapsed_time << "x) " + << std::endl; +} + +class CFLTest { + public: + virtual ~CFLTest() {} + void init(TX_SIZE tx) { + tx_size = tx; + width = tx_size_wide[tx_size]; + height = tx_size_high[tx_size]; + rnd.Reset(ACMRandom::DeterministicSeed()); + } + + protected: + TX_SIZE tx_size; + int width; + int height; + ACMRandom rnd; +}; + +template +class CFLTestWithData : public CFLTest { + public: + virtual ~CFLTestWithData() {} + + protected: + I data[CFL_BUF_SQUARE]; + I data_ref[CFL_BUF_SQUARE]; + void randData(I (ACMRandom::*random)()) { + for (int j = 0; j < this->height; j++) { + for (int i = 0; i < this->width; i++) { + const I d = (this->rnd.*random)(); + data[j * CFL_BUF_LINE + i] = d; + data_ref[j * CFL_BUF_LINE + i] = d; + } + } + } +}; + +template +class CFLTestWithAlignedData : public CFLTest { + public: + CFLTestWithAlignedData() { + chroma_pels_ref = + reinterpret_cast(aom_memalign(32, sizeof(I) * CFL_BUF_SQUARE)); + chroma_pels = + reinterpret_cast(aom_memalign(32, sizeof(I) * CFL_BUF_SQUARE)); + sub_luma_pels_ref = reinterpret_cast( + aom_memalign(32, sizeof(int16_t) * CFL_BUF_SQUARE)); + sub_luma_pels = reinterpret_cast( + aom_memalign(32, sizeof(int16_t) * CFL_BUF_SQUARE)); + memset(chroma_pels_ref, 0, sizeof(I) * CFL_BUF_SQUARE); + memset(chroma_pels, 0, sizeof(I) * CFL_BUF_SQUARE); + memset(sub_luma_pels_ref, 0, sizeof(int16_t) * CFL_BUF_SQUARE); + memset(sub_luma_pels, 0, sizeof(int16_t) * CFL_BUF_SQUARE); + } + ~CFLTestWithAlignedData() { + aom_free(chroma_pels_ref); + aom_free(sub_luma_pels_ref); + aom_free(chroma_pels); + aom_free(sub_luma_pels); + } + + protected: + I *chroma_pels_ref; + I *chroma_pels; + int16_t *sub_luma_pels_ref; + int16_t *sub_luma_pels; + int alpha_q3; + I dc; + void randData(int bd) { + alpha_q3 = this->rnd(33) - 16; + dc = this->rnd(1 << bd); + for (int j = 0; j < this->height; j++) { + for (int i = 0; i < this->width; i++) { + chroma_pels[j * CFL_BUF_LINE + i] = dc; + chroma_pels_ref[j * CFL_BUF_LINE + i] = dc; + sub_luma_pels_ref[j * CFL_BUF_LINE + i] = + sub_luma_pels[j * CFL_BUF_LINE + i] = this->rnd(1 << (bd + 3)); + } + } + } +}; + +typedef cfl_subtract_average_fn (*sub_avg_fn)(TX_SIZE tx_size); +typedef std::tuple sub_avg_param; +class CFLSubAvgTest : public ::testing::TestWithParam, + public CFLTestWithData { + public: + virtual void SetUp() { + CFLTest::init(std::get<0>(this->GetParam())); + sub_avg = std::get<1>(this->GetParam())(tx_size); + 
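+    // [Editorial note, not part of the original libaom source] The fetched
+    // kernel subtracts the block average from every Q3 luma sample,
+    // producing the mean-removed "AC" buffer that CfL prediction later
+    // scales by alpha; the _c variant fetched on the next line is the
+    // reference against which the SIMD output is compared.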
sub_avg_ref = cfl_get_subtract_average_fn_c(tx_size); + } + virtual ~CFLSubAvgTest() {} + + protected: + cfl_subtract_average_fn sub_avg; + cfl_subtract_average_fn sub_avg_ref; +}; + +TEST_P(CFLSubAvgTest, SubAvgTest) { + for (int it = 0; it < NUM_ITERATIONS; it++) { + randData(&ACMRandom::Rand15Signed); + sub_avg((uint16_t *)data, data); + sub_avg_ref((uint16_t *)data_ref, data_ref); + assert_eq(data, data_ref, width, height); + } +} + +TEST_P(CFLSubAvgTest, DISABLED_SubAvgSpeedTest) { + aom_usec_timer ref_timer; + aom_usec_timer timer; + randData(&ACMRandom::Rand15Signed); + aom_usec_timer_start(&ref_timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + sub_avg_ref((uint16_t *)data_ref, data_ref); + } + aom_usec_timer_mark(&ref_timer); + int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer); + aom_usec_timer_start(&timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + sub_avg((uint16_t *)data, data); + } + aom_usec_timer_mark(&timer); + int elapsed_time = (int)aom_usec_timer_elapsed(&timer); + printSpeed(ref_elapsed_time, elapsed_time, width, height); + assertFaster(ref_elapsed_time, elapsed_time); +} + +template +class CFLSubsampleTest : public ::testing::TestWithParam, + public CFLTestWithData { + public: + virtual void SetUp() { + CFLTest::init(std::get<0>(this->GetParam())); + fun_420 = std::get<1>(this->GetParam())(this->tx_size); + fun_422 = std::get<2>(this->GetParam())(this->tx_size); + fun_444 = std::get<3>(this->GetParam())(this->tx_size); + } + + protected: + T fun_420; + T fun_422; + T fun_444; + T fun_420_ref; + T fun_422_ref; + T fun_444_ref; + + void subsampleTest(T fun, T fun_ref, int sub_width, int sub_height, + I (ACMRandom::*random)()) { + uint16_t sub_luma_pels[CFL_BUF_SQUARE]; + uint16_t sub_luma_pels_ref[CFL_BUF_SQUARE]; + + for (int it = 0; it < NUM_ITERATIONS; it++) { + CFLTestWithData::randData(random); + fun(this->data, CFL_BUF_LINE, sub_luma_pels); + fun_ref(this->data_ref, CFL_BUF_LINE, sub_luma_pels_ref); + assert_eq(sub_luma_pels, sub_luma_pels_ref, sub_width, + sub_height); + } + } + + void subsampleSpeedTest(T fun, T fun_ref, I (ACMRandom::*random)()) { + uint16_t sub_luma_pels[CFL_BUF_SQUARE]; + uint16_t sub_luma_pels_ref[CFL_BUF_SQUARE]; + aom_usec_timer ref_timer; + aom_usec_timer timer; + + CFLTestWithData::randData(random); + aom_usec_timer_start(&ref_timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + fun_ref(this->data_ref, CFL_BUF_LINE, sub_luma_pels); + } + aom_usec_timer_mark(&ref_timer); + int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer); + aom_usec_timer_start(&timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + fun(this->data, CFL_BUF_LINE, sub_luma_pels_ref); + } + aom_usec_timer_mark(&timer); + int elapsed_time = (int)aom_usec_timer_elapsed(&timer); + printSpeed(ref_elapsed_time, elapsed_time, this->width, this->height); + assertFaster(ref_elapsed_time, elapsed_time); + } +}; + +typedef cfl_subsample_lbd_fn (*get_subsample_lbd_fn)(TX_SIZE tx_size); +typedef std::tuple + subsample_lbd_param; +class CFLSubsampleLBDTest + : public CFLSubsampleTest { + public: + virtual ~CFLSubsampleLBDTest() {} + virtual void SetUp() { + CFLSubsampleTest::SetUp(); + fun_420_ref = cfl_get_luma_subsampling_420_lbd_c(tx_size); + fun_422_ref = cfl_get_luma_subsampling_422_lbd_c(tx_size); + fun_444_ref = cfl_get_luma_subsampling_444_lbd_c(tx_size); + } +}; + +TEST_P(CFLSubsampleLBDTest, SubsampleLBD420Test) { + subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1, + &ACMRandom::Rand8); +} + 
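+// [Editorial sketch, not part of the original libaom source; names are
+// hypothetical] A minimal C model of one 4:2:0 output sample under the Q3
+// convention the subsampling kernels above are assumed to share: all three
+// subsamplers emit 8x the local luma average, so 420/422/444 use one common
+// scale.
+inline uint16_t cfl_420_q3_sketch(const uint8_t *luma, int stride, int i,
+                                  int j) {
+  const int top = luma[(j * 2) * stride + (i * 2)] +
+                  luma[(j * 2) * stride + (i * 2) + 1];
+  const int bot = luma[(j * 2 + 1) * stride + (i * 2)] +
+                  luma[(j * 2 + 1) * stride + (i * 2) + 1];
+  // Sum of 4 pels, doubled: 4 * average * 2 == average << 3 (Q3).
+  return (uint16_t)((top + bot) << 1);
+}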
+TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD420SpeedTest) { + subsampleSpeedTest(fun_420, fun_420_ref, &ACMRandom::Rand8); +} + +TEST_P(CFLSubsampleLBDTest, SubsampleLBD422Test) { + subsampleTest(fun_422, fun_422_ref, width >> 1, height, &ACMRandom::Rand8); +} + +TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD422SpeedTest) { + subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand8); +} + +TEST_P(CFLSubsampleLBDTest, SubsampleLBD444Test) { + subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand8); +} + +TEST_P(CFLSubsampleLBDTest, DISABLED_SubsampleLBD444SpeedTest) { + subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand8); +} + +#if CONFIG_AV1_HIGHBITDEPTH +typedef cfl_subsample_hbd_fn (*get_subsample_hbd_fn)(TX_SIZE tx_size); +typedef std::tuple + subsample_hbd_param; +class CFLSubsampleHBDTest + : public CFLSubsampleTest { + public: + virtual ~CFLSubsampleHBDTest() {} + virtual void SetUp() { + CFLSubsampleTest::SetUp(); + fun_420_ref = cfl_get_luma_subsampling_420_hbd_c(tx_size); + fun_422_ref = cfl_get_luma_subsampling_422_hbd_c(tx_size); + fun_444_ref = cfl_get_luma_subsampling_444_hbd_c(tx_size); + } +}; + +TEST_P(CFLSubsampleHBDTest, SubsampleHBD420Test) { + subsampleTest(fun_420, fun_420_ref, width >> 1, height >> 1, + &ACMRandom::Rand12); +} + +TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD420SpeedTest) { + subsampleSpeedTest(fun_420, fun_420_ref, &ACMRandom::Rand12); +} + +TEST_P(CFLSubsampleHBDTest, SubsampleHBD422Test) { + subsampleTest(fun_422, fun_422_ref, width >> 1, height, &ACMRandom::Rand12); +} + +TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD422SpeedTest) { + subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand12); +} + +TEST_P(CFLSubsampleHBDTest, SubsampleHBD444Test) { + subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand12); +} + +TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD444SpeedTest) { + subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand12); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size); +typedef std::tuple predict_param; +class CFLPredictTest : public ::testing::TestWithParam, + public CFLTestWithAlignedData { + public: + virtual void SetUp() { + CFLTest::init(std::get<0>(this->GetParam())); + predict = std::get<1>(this->GetParam())(tx_size); + predict_ref = cfl_get_predict_lbd_fn_c(tx_size); + } + virtual ~CFLPredictTest() {} + + protected: + cfl_predict_lbd_fn predict; + cfl_predict_lbd_fn predict_ref; +}; + +TEST_P(CFLPredictTest, PredictTest) { + for (int it = 0; it < NUM_ITERATIONS; it++) { + randData(8); + predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3); + predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3); + assert_eq(chroma_pels, chroma_pels_ref, width, height); + } +} +TEST_P(CFLPredictTest, DISABLED_PredictSpeedTest) { + aom_usec_timer ref_timer; + aom_usec_timer timer; + randData(8); + aom_usec_timer_start(&ref_timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3); + } + aom_usec_timer_mark(&ref_timer); + int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer_start(&timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3); + } + aom_usec_timer_mark(&timer); + int elapsed_time = (int)aom_usec_timer_elapsed(&timer); + printSpeed(ref_elapsed_time, elapsed_time, width, height); + assertFaster(ref_elapsed_time, elapsed_time); +} + 
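+// [Editorial sketch, not part of the original libaom source; names are
+// hypothetical] Per-pixel model of the lowbd prediction verified above,
+// assuming the usual CfL fixed-point layout: alpha (Q3) times mean-removed
+// luma (Q3) gives a Q6 value, which is rounded back to Q0 and added to the
+// DC prediction, then clipped to the 8-bit range.
+inline uint8_t cfl_predict_pixel_sketch(int alpha_q3, int16_t luma_ac_q3,
+                                        uint8_t dc) {
+  const int scaled_q6 = alpha_q3 * luma_ac_q3;
+  // Round-to-nearest signed shift by 6 (Q6 -> Q0).
+  const int scaled_q0 =
+      (scaled_q6 >= 0) ? (scaled_q6 + 32) >> 6 : -((-scaled_q6 + 32) >> 6);
+  const int v = dc + scaled_q0;
+  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+}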
+#if CONFIG_AV1_HIGHBITDEPTH +typedef cfl_predict_hbd_fn (*get_predict_fn_hbd)(TX_SIZE tx_size); +typedef std::tuple predict_param_hbd; +class CFLPredictHBDTest : public ::testing::TestWithParam, + public CFLTestWithAlignedData { + public: + virtual void SetUp() { + CFLTest::init(std::get<0>(this->GetParam())); + predict = std::get<1>(this->GetParam())(tx_size); + predict_ref = cfl_get_predict_hbd_fn_c(tx_size); + } + virtual ~CFLPredictHBDTest() {} + + protected: + cfl_predict_hbd_fn predict; + cfl_predict_hbd_fn predict_ref; +}; + +TEST_P(CFLPredictHBDTest, PredictHBDTest) { + int bd = 12; + for (int it = 0; it < NUM_ITERATIONS; it++) { + randData(bd); + predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3, bd); + predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3, bd); + assert_eq(chroma_pels, chroma_pels_ref, width, height); + } +} +TEST_P(CFLPredictHBDTest, DISABLED_PredictHBDSpeedTest) { + aom_usec_timer ref_timer; + aom_usec_timer timer; + const int bd = 12; + randData(bd); + aom_usec_timer_start(&ref_timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + predict_ref(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, alpha_q3, bd); + } + aom_usec_timer_mark(&ref_timer); + int ref_elapsed_time = (int)aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer_start(&timer); + for (int k = 0; k < NUM_ITERATIONS_SPEED; k++) { + predict(sub_luma_pels, chroma_pels, CFL_BUF_LINE, alpha_q3, bd); + } + aom_usec_timer_mark(&timer); + int elapsed_time = (int)aom_usec_timer_elapsed(&timer); + printSpeed(ref_elapsed_time, elapsed_time, width, height); + assertFaster(ref_elapsed_time, elapsed_time); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if HAVE_SSE2 +const sub_avg_param sub_avg_sizes_sse2[] = { ALL_CFL_TX_SIZES( + cfl_get_subtract_average_fn_sse2) }; + +INSTANTIATE_TEST_SUITE_P(SSE2, CFLSubAvgTest, + ::testing::ValuesIn(sub_avg_sizes_sse2)); + +#endif + +#if HAVE_SSSE3 +const subsample_lbd_param subsample_lbd_sizes_ssse3[] = { + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_ssse3, + cfl_get_luma_subsampling_422_lbd_ssse3, + cfl_get_luma_subsampling_444_lbd_ssse3) +}; + +const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES( + cfl_get_predict_lbd_fn_ssse3) }; + +INSTANTIATE_TEST_SUITE_P(SSSE3, CFLSubsampleLBDTest, + ::testing::ValuesIn(subsample_lbd_sizes_ssse3)); + +INSTANTIATE_TEST_SUITE_P(SSSE3, CFLPredictTest, + ::testing::ValuesIn(predict_sizes_ssse3)); + +#if CONFIG_AV1_HIGHBITDEPTH +const subsample_hbd_param subsample_hbd_sizes_ssse3[] = { + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_ssse3, + cfl_get_luma_subsampling_422_hbd_ssse3, + cfl_get_luma_subsampling_444_hbd_ssse3) +}; + +const predict_param_hbd predict_sizes_hbd_ssse3[] = { ALL_CFL_TX_SIZES( + cfl_get_predict_hbd_fn_ssse3) }; + +INSTANTIATE_TEST_SUITE_P(SSSE3, CFLSubsampleHBDTest, + ::testing::ValuesIn(subsample_hbd_sizes_ssse3)); + +INSTANTIATE_TEST_SUITE_P(SSSE3, CFLPredictHBDTest, + ::testing::ValuesIn(predict_sizes_hbd_ssse3)); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_SSSE3 + +#if HAVE_AVX2 +const sub_avg_param sub_avg_sizes_avx2[] = { ALL_CFL_TX_SIZES( + cfl_get_subtract_average_fn_avx2) }; + +const subsample_lbd_param subsample_lbd_sizes_avx2[] = { + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_avx2, + cfl_get_luma_subsampling_422_lbd_avx2, + cfl_get_luma_subsampling_444_lbd_avx2) +}; + +const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES( + cfl_get_predict_lbd_fn_avx2) }; + +INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubAvgTest, 
+ ::testing::ValuesIn(sub_avg_sizes_avx2)); + +INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubsampleLBDTest, + ::testing::ValuesIn(subsample_lbd_sizes_avx2)); + +INSTANTIATE_TEST_SUITE_P(AVX2, CFLPredictTest, + ::testing::ValuesIn(predict_sizes_avx2)); + +#if CONFIG_AV1_HIGHBITDEPTH +const subsample_hbd_param subsample_hbd_sizes_avx2[] = { + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_avx2, + cfl_get_luma_subsampling_422_hbd_avx2, + cfl_get_luma_subsampling_444_hbd_avx2) +}; + +const predict_param_hbd predict_sizes_hbd_avx2[] = { ALL_CFL_TX_SIZES( + cfl_get_predict_hbd_fn_avx2) }; + +INSTANTIATE_TEST_SUITE_P(AVX2, CFLSubsampleHBDTest, + ::testing::ValuesIn(subsample_hbd_sizes_avx2)); + +INSTANTIATE_TEST_SUITE_P(AVX2, CFLPredictHBDTest, + ::testing::ValuesIn(predict_sizes_hbd_avx2)); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_AVX2 + +#if HAVE_NEON +const sub_avg_param sub_avg_sizes_neon[] = { ALL_CFL_TX_SIZES( + cfl_get_subtract_average_fn_neon) }; + +const predict_param predict_sizes_neon[] = { ALL_CFL_TX_SIZES( + cfl_get_predict_lbd_fn_neon) }; + +const subsample_lbd_param subsample_lbd_sizes_neon[] = { + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_lbd_neon, + cfl_get_luma_subsampling_422_lbd_neon, + cfl_get_luma_subsampling_444_lbd_neon) +}; + +INSTANTIATE_TEST_SUITE_P(NEON, CFLSubAvgTest, + ::testing::ValuesIn(sub_avg_sizes_neon)); + +INSTANTIATE_TEST_SUITE_P(NEON, CFLSubsampleLBDTest, + ::testing::ValuesIn(subsample_lbd_sizes_neon)); + +INSTANTIATE_TEST_SUITE_P(NEON, CFLPredictTest, + ::testing::ValuesIn(predict_sizes_neon)); + +#if CONFIG_AV1_HIGHBITDEPTH +const subsample_hbd_param subsample_hbd_sizes_neon[] = { + ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_neon, + cfl_get_luma_subsampling_422_hbd_neon, + cfl_get_luma_subsampling_444_hbd_neon) +}; + +const predict_param_hbd predict_sizes_hbd_neon[] = { ALL_CFL_TX_SIZES( + cfl_get_predict_hbd_fn_neon) }; + +INSTANTIATE_TEST_SUITE_P(NEON, CFLSubsampleHBDTest, + ::testing::ValuesIn(subsample_hbd_sizes_neon)); + +INSTANTIATE_TEST_SUITE_P(NEON, CFLPredictHBDTest, + ::testing::ValuesIn(predict_sizes_hbd_neon)); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_VSX +const sub_avg_param sub_avg_sizes_vsx[] = { ALL_CFL_TX_SIZES( + cfl_get_subtract_average_fn_vsx) }; + +INSTANTIATE_TEST_SUITE_P(VSX, CFLSubAvgTest, + ::testing::ValuesIn(sub_avg_sizes_vsx)); +#endif +} // namespace diff --git a/libs/libaom/src/test/clear_system_state.h b/libs/libaom/src/test/clear_system_state.h new file mode 100644 index 000000000..d38ff5dd5 --- /dev/null +++ b/libs/libaom/src/test/clear_system_state.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_TEST_CLEAR_SYSTEM_STATE_H_ +#define AOM_TEST_CLEAR_SYSTEM_STATE_H_ + +#include "config/aom_config.h" + +#if ARCH_X86 || ARCH_X86_64 +#include "aom_ports/x86.h" +#endif + +namespace libaom_test { + +// Reset system to a known state. This function should be used for all non-API +// test cases. 
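+// A typical SIMD test fixture calls it from its TearDown() hook so that +// x87/MMX state cannot leak between test cases, e.g.: +// void TearDown() override { libaom_test::ClearSystemState(); }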
+inline void ClearSystemState() { +#if ARCH_X86 || ARCH_X86_64 + aom_reset_mmx_state(); +#endif +} + +} // namespace libaom_test +#endif // AOM_TEST_CLEAR_SYSTEM_STATE_H_ diff --git a/libs/libaom/src/test/cnn_test.cc b/libs/libaom/src/test/cnn_test.cc new file mode 100644 index 000000000..4410493d3 --- /dev/null +++ b/libs/libaom/src/test/cnn_test.cc @@ -0,0 +1,2496 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> +#include <stdio.h> +#include <string.h> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" + +#include "av1/encoder/cnn.h" + +#define SQR(x) ((x) * (x)) + +// Best possible pixelwise guaranteed precision given each float has at most +// 3 specified decimals. +#define PIXELWISE_FLOAT_TOL 1E-2 + +#define MSE_FLOAT_TOL 1E-6 +#define MSE_INT_TOL 0 + +namespace { + +class CNNTest : public ::testing::Test { + protected: + static void RunCNNTest(int image_width, int image_height, const float *input, + const float *expected, const CNN_CONFIG *cnn_config, + int in_stride, CNN_THREAD_DATA *thread_data, + double tolerance) { + int out_width, out_height, out_channels; + av1_find_cnn_output_size(image_width, image_height, cnn_config, &out_width, + &out_height, &out_channels); + + const int out_size = out_width * out_height; + const int out_stride = out_width; + + float *output_ = + (float *)aom_malloc(sizeof(*output_) * out_size * out_channels); + float *output[CNN_MAX_CHANNELS] = { nullptr }; + for (int channel = 0; channel < out_channels; ++channel) { + output[channel] = output_ + (channel * out_size); + } + const int num_outputs = 1; + const int output_chs[1] = { out_channels }; + const int output_strides[1] = { out_stride }; + CNN_MULTI_OUT output_struct = { num_outputs, output_chs, output_strides, + output }; + + RunMultiOutCNNTest(&input, image_width, image_height, in_stride, cnn_config, + thread_data, &output_struct, &expected, tolerance); + + aom_free(output_); + } + + static void RunMultiOutCNNTest(const float **input, int image_width, + int image_height, int in_stride, + const CNN_CONFIG *cnn_config, + CNN_THREAD_DATA *thread_data, + CNN_MULTI_OUT *output, const float **expected, + double tolerance) { + const int num_outputs = output->num_outputs; + const int *output_chs = output->output_channels; + + int *out_widths = (int *)aom_calloc(num_outputs, sizeof(*out_widths)); + int *out_heights = (int *)aom_calloc(num_outputs, sizeof(*out_heights)); + int *not_used = (int *)aom_calloc(num_outputs, sizeof(*not_used)); + + av1_find_cnn_output_size(image_width, image_height, cnn_config, out_widths, + out_heights, not_used); + av1_cnn_predict(input, image_width, image_height, in_stride, cnn_config, + thread_data, output); + + int channel_offset = 0; + for (int output_idx = 0; output_idx < num_outputs; output_idx++) { + const float *expected_out = expected[output_idx]; + const int curr_output_chs = output_chs[output_idx]; + const int out_size = out_widths[output_idx] * out_heights[output_idx]; + + double mse = 0; + int expected_ite = 0; + for (int channel = 0;
channel < curr_output_chs; ++channel) { + const float *buf_out = output->output_buffer[channel_offset]; + + for (int i = 0; i < out_size; ++i) { + EXPECT_NEAR(expected_out[expected_ite], buf_out[i], + PIXELWISE_FLOAT_TOL) + << " output " << output_idx << " channel " << channel << " pixel " + << expected_ite % out_size << ": " << expected_out[expected_ite] + << "/" << buf_out[i] << std::endl; + mse += SQR(expected_out[expected_ite] - buf_out[i]); + expected_ite++; + } + + channel_offset++; + } + mse /= (out_size * curr_output_chs); + EXPECT_LE(mse, tolerance) << " output " << output_idx << std::endl; + } + + aom_free(out_widths); + aom_free(out_heights); + aom_free(not_used); + } + + static void AssignLayerWeightsBiases(CNN_CONFIG *cnn_config, float *weights, + float *bias) { + size_t weight_offset = 0; + size_t bias_offset = 0; + for (int layer = 0; layer < cnn_config->num_layers; ++layer) { + CNN_LAYER_CONFIG *layer_config = &cnn_config->layer_config[layer]; + layer_config->weights = weights + weight_offset; + layer_config->bias = bias + bias_offset; + weight_offset += layer_config->filter_width * + layer_config->filter_height * layer_config->in_channels * + layer_config->out_channels; + bias_offset += layer_config->out_channels; + + ASSERT_NE(layer_config->weights, nullptr); + ASSERT_NE(layer_config->bias, nullptr); + } + } +}; + +} // namespace + +TEST_F(CNNTest, TestMultilayerConvolution) { + int image_height = 16; + int image_width = 16; + int filter_height = 5; + int filter_width = 4; + + float input[] = { + -3, 1, -3, 2, -2, -2, 2, -2, 1, -2, -3, 1, 2, 2, 2, -2, 0, 1, -1, + -3, -1, -1, 1, 0, -3, 1, 0, -1, 1, 0, 0, -3, -3, -3, 0, 2, 1, -1, + 2, 0, 1, -3, -1, 2, 2, 1, -2, 0, -1, 0, -2, -2, -1, 1, 0, 0, 0, + -2, -2, -2, 1, 1, -2, 1, 1, -2, -2, 1, -2, -1, -2, -3, 2, -3, -1, 1, + 0, -2, -2, -2, 1, -2, -2, -1, -1, 2, 2, 2, -1, 1, -3, -3, 0, 2, 0, + 2, 1, -3, -3, 1, 2, 2, 1, -2, -3, 0, -3, 0, -3, -2, 0, 1, 1, 0, + -3, 2, -1, 2, 1, 0, 1, -2, 1, -1, -1, 2, 0, -2, -3, 1, 1, -2, -1, + -3, -3, -1, 0, -3, -2, 0, 0, 1, 0, -3, -2, -1, 1, 0, 2, 1, 0, -3, + -2, -3, -3, -1, 0, -2, 2, -1, -3, 0, -1, -1, 2, 0, -3, -2, -1, 0, 0, + 1, -2, 1, 2, 1, 2, 2, -3, 2, -1, 0, 0, -1, 0, 2, 2, -1, 2, -2, + 1, 1, -3, -3, 1, -1, -1, -2, 2, -2, -2, 2, -1, -3, 2, -3, 1, -1, -1, + -3, 1, -1, 1, 0, -3, -3, 1, -3, -3, 0, 2, 2, -2, -1, 2, 0, 2, 1, + -1, -3, 0, 0, -1, -1, 1, 0, 2, 0, -3, 2, 1, 0, 1, -3, 2, -3, -3, + -1, -3, -3, 2, 0, 2, -2, 1, -1, + }; + + float weights[] = { + -2, 2, -2, 2, -1, -3, 2, 2, 0, 0, -3, -1, -2, -3, 1, -1, 0, 0, 0, + 2, -2, 2, -2, -3, 1, 1, 1, -3, -1, 0, 1, 2, -2, 0, -1, -3, -1, -2, + 2, -3, -3, 1, -2, -3, 0, 2, 1, -3, -3, -1, -3, -2, -1, -3, -1, -3, -2, + -1, -3, -1, -2, -2, -3, 2, 0, -3, 0, -3, -3, 1, -3, -1, 0, -1, 1, 1, + -1, 1, -2, 0, 2, 0, -3, 1, -1, -1, 2, 0, 1, -3, -3, 1, 2, -3, -3, + 1, -3, 2, 0, -3, 1, 2, 2, -2, -1, -2, 1, 1, 0, -2, -2, 1, 2, -1, + -3, 1, -2, 2, -3, -2, -3, 2, 1, 0, -2, 0, 1, -3, 2, -2, -2, 0, 2, + -3, 2, 0, 0, 1, -2, 1, 1, -2, -1, -2, 1, -2, 0, -2, -2, 0, -1, -1, + -3, -3, -3, 1, -3, -2, 2, -1, 2, 0, 2, -2, 2, -2, 1, -3, -3, -1, 0, + 2, 2, 1, -1, -3, -1, -3, 2, 1, -2, 0, -3, -1, -3, -1, 2, 1, 0, 2, + -1, 1, 0, 1, 2, -1, -2, 2, 1, -3, -1, -3, 0, 1, -2, 0, -2, -3, 0, + -2, 2, 2, 0, 0, 2, -3, 2, -3, -2, 1, 2, -3, -3, -1, -3, 0, -3, -3, + -2, -2, -2, 0, 0, 1, 0, 0, -1, 0, 0, -3, 0, -3, -1, -2, 1, -2, -1, + 2, -2, 0, 0, 1, 0, -2, -1, 0, -3, 1, 0, -1, -3, 1, -1, 1, -1, -3, + 1, 0, 1, 1, -1, 2, 2, 0, 0, 1, -3, 2, -2, -2, -3, -2, -1, -2, 2, + 0, 2, -2, -3, -1, -3, 2, 2, -1, 2, 
2, -1, 0, -3, 1, + }; + + float bias[] = { + 1, -1, 0, 1, 1, 1, -2, + }; + + float expected_same[] = { + -1125, 2926, 6406, 631, -1244, 97, -1454, 2526, 1065, 3292, 3464, + 2553, -330, 532, 1038, 1182, -402, 3758, 3392, 9854, 4365, 1408, + 4736, 3134, 3838, 2409, 3221, 4350, 6750, 4045, 815, 1188, 2959, + 9802, 9590, 4572, 5740, 4253, 1701, 7974, 7012, 6854, 7093, 3907, + 4539, 3886, 4267, 3505, 465, 7824, 9219, 10026, 7968, 957, 2295, + 5594, 10811, 9641, 5950, 10043, 8783, 3132, 1421, 1110, 4108, 13929, + 10660, -84, -61, 3932, -180, 6811, 13393, 15147, 15640, 9337, 6961, + 3808, 1604, 1398, 1047, 6739, 10144, 6517, 4698, 2678, 7389, 2595, + 5248, 12075, 11272, 13951, 8820, 1090, 2199, 2206, 2788, 12116, 6683, + 2612, -291, 3183, 9414, 12316, 14524, 12333, 13208, 7832, 4664, 4657, + 3534, 1298, -666, 4250, 7707, 9103, 5760, 688, 9571, 15782, 14203, + 14878, 17339, 14684, 8690, 5671, 875, 1429, 1531, 6173, 2984, 5558, + 2996, 7928, 6733, 16117, 15262, 12757, 7980, 3923, 4795, 5973, 2051, + 455, -1922, 1816, 5906, 3321, 10908, 10910, 7377, 12204, 12809, 11195, + 7451, 6666, 74, -1645, -35, -391, 3813, 7324, 892, 1656, 6095, + 12193, 14648, 12156, 14663, 10251, 10325, 7821, 3925, 323, 697, 442, + 1324, 4669, 7002, 5485, 5171, 5086, 10582, 11053, 9709, 11353, 8543, + 5256, 2873, 235, -628, 1496, 1878, -867, 3420, 6865, 5937, 10182, + 13277, 10069, 10789, 5998, 624, -2082, 4417, 1258, -1080, -819, -1430, + 1033, 5220, 6335, 8471, 8980, 11908, 14430, 12584, 8404, 1576, -803, + 985, 1481, 1367, -193, 873, 3684, 2288, 6676, 9477, 11155, 9602, + 9707, 10507, 4739, 3174, -575, -178, 3002, 1710, 423, -477, 554, + 3088, 2029, 5113, 5000, 3771, 6090, 5365, 1185, 2855, 399, -312, + -1577, 176, 955, + }; + + float expected_replicate[] = { + 13768, 13528, 12999, 6906, 4618, 4043, 2611, 9955, 6685, 4776, 2753, + 1036, 3063, 4544, 5183, 7349, 12451, 12501, 9131, 12753, 8908, 4058, + 6299, 7542, 7115, 3307, 3360, 3543, 9754, 7808, 5991, 9019, 14320, + 14919, 12492, 6871, 7373, 3336, 2085, 10604, 9377, 6882, 5009, 3103, + 6220, 6278, 7588, 10196, 11045, 11563, 11842, 11911, 8279, 2030, 1858, + 6368, 12123, 9909, 6347, 10345, 9365, 4038, 1673, 3051, 16492, 16649, + 12276, 408, -301, 4122, -654, 7864, 14038, 15279, 15315, 9744, 8243, + 5298, 746, 380, 9824, 9124, 10895, 6640, 4712, 2669, 6980, 2759, + 5385, 12345, 11336, 13129, 8600, 2370, 3682, 5219, 12407, 13123, 6784, + 2612, -291, 3183, 9414, 12316, 14524, 12333, 13397, 7543, 3916, 4153, + 4477, 4314, 7983, 8418, 9163, 9103, 5760, 688, 9571, 15782, 14203, + 14878, 17718, 14570, 7940, 6642, 5094, 7133, 9964, 10219, 3224, 5558, + 2996, 7928, 6733, 16117, 15262, 12757, 7958, 4401, 5187, 5476, 5529, + 6055, 2206, 3909, 6015, 3321, 10908, 10910, 7377, 12204, 12809, 11195, + 6967, 6840, 481, -1600, 274, 1, 10373, 8514, 1123, 2117, 6758, + 12736, 16223, 13585, 15988, 11771, 10600, 7918, 4156, 2840, 3111, 3287, + 6359, 7652, 8813, 6530, 6967, 7789, 13671, 13990, 13247, 13241, 9836, + 5251, 3024, 2313, 1834, 4187, 2637, -1312, 2139, 7378, 7665, 11933, + 15591, 15314, 15678, 9531, 2820, -1516, 3400, 1314, 22, 363, -2896, + -898, 5906, 7308, 10650, 12975, 16978, 20370, 18817, 12381, 4118, -861, + -137, 236, 1802, 1632, -350, 2334, 3400, 8680, 14064, 18216, 18675, + 21765, 22871, 11491, 4937, -1555, -11, 1669, 2392, 3265, -5254, -217, + 5001, 8063, 13444, 18884, 19706, 22794, 21064, 9545, 6689, -7, 289, + -2021, 504, 2347, + }; + + float expected_valid[] = { + 2612, -291, 3183, 9414, 12316, 14524, 12333, 9103, 5760, 688, + 9571, 15782, 14203, 14878, 5558, 2996, 7928, 
6733, 16117, 15262, + 12757, 3321, 10908, 10910, 7377, 12204, 12809, 11195, + }; + + CNN_CONFIG cnn_config = { 3, + 0, + 0, + 0, + 0, + { + { + 1, + filter_width, + filter_height, + 3, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + 3, + filter_width, + filter_height, + 3, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + 3, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + }, + } }; + + // Weights and biases need to be specified separately because + // of the offset. + AssignLayerWeightsBiases(&cnn_config, weights, bias); + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected_same, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + for (int i = 0; i < cnn_config.num_layers; ++i) { + cnn_config.layer_config[i].pad = PADDING_SAME_REPLICATE; + } + + RunCNNTest(image_width, image_height, input, expected_replicate, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + for (int i = 0; i < cnn_config.num_layers; ++i) { + cnn_config.layer_config[i].pad = PADDING_VALID; + } + + RunCNNTest(image_width, image_height, input, expected_valid, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestRELUSingleLayer) { + int image_width = 8; + int image_height = 8; + int filter_height = 5; + int filter_width = 4; + float input[] = { + 0, -2, -3, 1, -1, 2, -2, 1, -3, -1, 0, 1, -2, -3, -2, -2, + 1, -3, 2, -3, -1, -1, 2, 0, -2, -3, 0, -2, -3, 1, -1, -1, + 2, -2, 0, -2, -3, -3, 1, 1, -1, 1, 0, 1, -3, 0, 2, 2, + 0, -3, 1, -3, 2, -2, 1, -1, -1, -2, -3, -2, -1, -3, -2, -1, + }; + float expected_same[] = { + 9, 0, 1, 1, 0, 3, 0, 19, 0, 12, 10, 0, 0, 0, 5, 0, + 0, 18, 21, 7, 19, 4, 3, 0, 0, 9, 16, 0, 11, 16, 0, 11, + 12, 2, 0, 11, 0, 16, 6, 0, 8, 22, 13, 10, 12, 0, 0, 0, + 0, 1, 2, 12, 29, 6, 10, 0, 13, 0, 0, 5, 8, 10, 0, 0, + }; + float expected_replicate[] = { + 18, 17, 12, 2, 0, 0, 5, 11, 0, 17, 22, 6, 0, 0, 17, 0, + 0, 18, 21, 7, 19, 4, 3, 5, 3, 9, 16, 0, 11, 16, 0, 3, + 3, 2, 0, 11, 0, 16, 6, 0, 17, 22, 13, 10, 12, 0, 0, 0, + 0, 4, 1, 10, 30, 7, 10, 0, 23, 8, 0, 13, 15, 19, 8, 10, + }; + float expected_valid[] = { + 18, 21, 7, 19, 4, 9, 16, 0, 11, 16, 2, 0, 11, 0, 16, 22, 13, 10, 12, 0, + }; + float weights[] = { + -2, -3, 1, 2, 2, -2, -3, 0, -3, 2, 2, -3, -3, -2, 0, 1, 2, 0, -1, -1, + }; + float bias[] = { -3 }; + + CNN_CONFIG cnn_config = { 1, + 0, + 0, + 0, + 0, + { { + 1, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + RELU, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected_same, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].pad = PADDING_SAME_REPLICATE; + + RunCNNTest(image_width, image_height, input, expected_replicate, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].pad = PADDING_VALID; + + RunCNNTest(image_width, image_height, input, expected_valid, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestVaryingStridesVaryingDimImages) { + float weights[] = { + 1, -5, -3, -4, -1, 1, 2, -3, 2, 2, -1, 1, -5, 1, 1, + -3, -5, 3, 1, 4, -2, -5, -2, -3, -5, 0, 
-1, -5, 2, -2, + -2, 1, -2, -4, 1, 3, -2, 2, 0, -3, 2, -3, -2, -3, + }; + float bias[] = { 2 }; + + CNN_CONFIG cnn_config = { 1, + 0, + 0, + 0, + 0, + { + { + 1, + 4, + 11, + 1, + 7, + 6, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + }, + } }; + + int image_height = 24; + int image_width = 17; + float input[] = { + -1, -3, 4, 4, -5, 4, 3, -5, -1, -3, 4, -4, 2, -3, 3, -5, 2, -1, -5, + 1, -1, 3, 1, -3, -3, 4, 0, 2, -3, -5, -5, -4, 0, -5, -2, -3, -1, -2, + 2, -5, 4, 4, 0, -4, -3, 1, -3, -5, -4, -4, 1, -2, -3, 3, -3, -3, -1, + -5, -5, -2, 3, 1, -1, -5, -5, 1, -4, -2, -1, -2, -4, -4, 2, -2, 2, 1, + -2, -4, -1, 1, -2, -5, 3, -2, -1, -1, -5, -3, 1, -2, -2, -3, -1, -2, -4, + -2, 1, -4, -1, 4, 3, -4, 0, 4, 2, 2, 4, -3, -5, 2, 2, 1, -1, -4, + -2, 1, 3, 2, 0, 4, -1, -3, 2, 1, -4, 2, 2, -4, -2, 0, -2, -1, 4, + 4, 2, 3, -4, 2, -4, -5, 4, -1, -3, -1, 0, -4, 1, 3, -1, -3, -5, 3, + -2, -4, 1, 2, -2, -3, -3, -5, 1, -3, -1, 0, -1, 3, -4, -1, -5, -5, 1, + 0, 0, -2, -2, 2, -2, 0, 0, 2, 0, -3, 0, -1, -4, -4, -1, 3, -4, -4, + -1, 0, -5, -3, -2, 4, -3, -4, -4, 0, -5, 1, -2, -3, -3, -4, 4, 3, 4, + 3, 3, -1, 3, 1, -3, -2, 3, 3, 0, 2, -4, -3, 2, 2, 0, -2, 4, -2, + 2, -2, -1, -4, -2, 2, -4, 3, -1, 4, 1, 1, 4, -1, -4, -4, 1, 1, -2, + 4, -1, 3, 2, -3, 4, 3, 1, 4, 0, -4, 2, 0, 2, 4, -2, -2, 4, 2, + -1, -2, 1, -3, 2, 3, -5, -3, 4, 4, 2, -5, -4, -5, -2, -4, 2, 0, 2, + -5, 4, -4, -2, -5, 2, 1, 0, 4, 1, -2, -3, -4, -3, -4, 3, 3, 2, 0, + -3, 1, -5, 4, 0, 4, -1, 3, -5, -5, -2, -1, -1, 4, 3, 3, 4, 3, -4, + 4, -3, -3, -1, -4, -1, -4, -1, -2, 4, -2, -4, 4, 4, -3, -4, -1, 1, 2, + -1, -2, -2, 3, 2, 2, -3, 0, -1, 0, 3, 2, -5, 0, -4, 0, 0, 2, -4, + -1, -1, 0, -2, 0, 1, 0, 0, 4, -5, -1, -5, 2, -1, 0, 2, -1, 1, 3, + -3, -5, -2, -3, 4, -2, -2, -1, -3, -4, -1, -2, -4, 1, 4, -3, -2, -1, 3, + -3, -2, 3, 2, 1, -4, -3, -5, 1, + }; + float expected_1[] = { + 41, -26, 5, 76, 13, 83, -21, 53, -54, -14, 21, 121, + }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected_1, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].skip_width = 6; + cnn_config.layer_config[0].skip_height = 7; + + float expected_2[] = { + 21, -50, 41, 20, 72, 127, -21, 103, 62, -37, 83, -3, + }; + RunCNNTest(image_width, image_height, input, expected_2, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].skip_width = 3; + cnn_config.layer_config[0].skip_height = 10; + + float expected_3[] = { + -26, -21, -35, 69, 49, 4, -51, -43, -56, + -41, 15, -44, 40, -62, 63, 38, 27, 47, + }; + RunCNNTest(image_width, image_height, input, expected_3, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].skip_width = 10; + cnn_config.layer_config[0].skip_height = 3; + + float expected_4[] = { + 21, 49, 28, 87, 50, 40, 102, 81, 58, 85, 51, 66, 36, 19, -37, -45, + }; + + RunCNNTest(image_width, image_height, input, expected_4, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestMaxPool) { + int image_width = 8; + int image_height = 8; + int stride = 3; + float input[] = { + 1, -4, -4, 8, 0, 7, -5, -2, 8, 2, 2, 8, 5, -1, -1, 9, + -3, 0, -2, 0, 6, 3, -4, 8, 7, 8, 7, -1, 4, -1, 0, 2, + -5, -2, 8, 5, 5, 4, 2, 7, 4, 6, 2, 8, 8, -4, -3, -4, + -3, -1, 2, 3, 3, 6, -5, 8, 9, 5, 0, -2, -1, 6, 5, 7, + }; + + float expected[] = { + 49, 58, 70, 68, 68, 70, 48, 57, 88, + }; + + float weights[] = { + 3, 1, 3, 4, -1, 5, -2, 1, -4, + }; + + float bias[] 
= { + -3, + }; + + CNN_CONFIG cnn_config = { 1, + 0, + 0, + 0, + 0, + { { + 1, + 3, + 3, + 1, + stride, + stride, + 1, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestDeconvolveNonActivationSingleLayerSingleKernel) { + int image_width = 4; + int image_height = 7; + float input[] = { + 9, 6, 181, 9, 218, 30, 80, 108, 68, 216, 70, 128, 179, 228, + 33, 212, 34, 14, 48, 27, 230, 23, 202, 113, 80, 56, 122, 112, + }; + + float expected_1_same[] = { + 15, -30, 36, -525, 377, -193, 558, 531, 6, -24, -15, 124, + 166, -561, -356, -754, -3, -3, -3, -3, -3, -3, -3, -3, + 433, -311, 711, 381, 247, -317, 453, 129, 215, -627, -409, -885, + 17, -255, -55, -647, -3, -3, -3, -3, -3, -3, -3, -3, + 133, -719, 633, -225, 785, 191, 463, 79, 65, 9, 77, -853, + -365, -949, -15, -667, -3, -3, -3, -3, -3, -3, -3, -3, + 355, -866, 990, 207, 747, 12, 520, -116, 176, -312, -133, -1370, + -426, -802, 143, -771, -3, -3, -3, -3, -3, -3, -3, -3, + 65, -79, 127, -59, 135, -90, 195, 114, 31, -91, -57, -133, + 17, -176, -72, -276, -3, -3, -3, -3, -3, -3, -3, -3, + 457, -302, 733, 58, 470, -475, 829, 490, 227, -670, -440, -790, + 153, -588, -294, -1150, -3, -3, -3, -3, -3, -3, -3, -3, + 157, -251, 349, -185, 409, -293, 587, 251, 77, -187, -107, -369, + 7, -481, -135, -827, -3, -3, -3, -3, -3, -3, -3, -3, + }; + float expected_1_valid[] = { + -30, 15, -30, 36, -525, 377, -193, 558, 531, 24, 24, 6, + 6, -24, -15, 124, 166, -561, -356, -754, -21, -39, -3, -3, + -3, -3, -3, -3, -3, -3, -3, -3, -3, -657, 433, -311, + 711, 381, 247, -317, 453, 129, 321, 321, 215, 215, -627, -409, + -885, 17, -255, -55, -647, -219, -435, -3, -3, -3, -3, -3, + -3, -3, -3, -3, -3, -3, -207, 133, -719, 633, -225, 785, + 191, 463, 79, 381, 381, 65, 65, 9, 77, -853, -365, -949, + -15, -667, -259, -515, -3, -3, -3, -3, -3, -3, -3, -3, + -3, -3, -3, -540, 355, -866, 990, 207, 747, 12, 520, -116, + 633, 633, 176, 176, -312, -133, -1370, -426, -802, 143, -771, -427, + -851, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, + -105, 65, -79, 127, -59, 135, -90, 195, 114, 78, 78, 31, + 31, -91, -57, -133, 17, -176, -72, -276, -57, -111, -3, -3, + -3, -3, -3, -3, -3, -3, -3, -3, -3, -693, 457, -302, + 733, 58, 470, -475, 829, 490, 336, 336, 227, 227, -670, -440, + -790, 153, -588, -294, -1150, -229, -455, -3, -3, -3, -3, -3, + -3, -3, -3, -3, -3, -3, -243, 157, -251, 349, -185, 409, + -293, 587, 251, 333, 333, 77, 77, -187, -107, -369, 7, -481, + -135, -827, -227, -451, + }; + float weights_1[] = { -3, 2, -1, 3, 3, 1, 1, -3, -2, -4 }; + float bias_1[] = { -3 }; + + CNN_CONFIG cnn_config = { 1, + 0, + 0, + 0, + 0, + { { + 1, + 5, + 2, + 1, + 2, + 3, + 0, + weights_1, + bias_1, + PADDING_SAME_ZERO, + NONE, + 1, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected_1_same, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + // Change padding to valid + cnn_config.layer_config[0].pad = PADDING_VALID; + + RunCNNTest(image_width, image_height, input, expected_1_valid, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + float expected_12_same[] = { + 15, -12, 6, 36, -9, -528, 377, -184, 513, 558, -12, 24, + 6, -30, -15, -33, -21, 166, 154, -546, -356, -718, -30, -21, + 433, -221, 561, 711, -33, 
-153, 247, -83, -87, 453, -111, 321, + 215, -657, -409, -845, -93, 17, -43, -243, -55, -215, -327, -219, + 133, -71, -447, 633, -219, 435, 785, -73, -177, 463, -131, 381, + 65, -207, 77, -59, -651, -365, -797, -213, -15, -155, -387, -259, + 355, -182, -150, 990, -231, 582, 747, -36, -540, 520, -215, 633, + 176, -540, -133, -491, -687, -426, -882, -102, 143, 77, -639, -427, + 65, -37, 57, 127, -17, -105, 135, -51, 60, 195, -30, 78, + 31, -105, -57, -125, -45, 17, -11, -147, -72, -168, -84, -57, + 457, -233, 618, 733, -26, -540, 470, -205, 264, 829, -116, 336, + 227, -693, -440, -900, -72, 153, 107, -609, -294, -698, -342, -229, + 157, -83, 69, 349, -59, -201, 409, -125, 27, 587, -115, 333, + 77, -243, -107, -267, -171, 7, -105, -369, -135, -379, -339, -227, + }; + float expected_12_valid[] = { + -30, 15, -12, 6, 36, -9, -528, 377, -184, 513, 558, -12, + 24, 24, 6, 6, -30, -15, -33, -21, 166, 154, -546, -356, + -718, -30, -21, -39, -657, 433, -221, 561, 711, -33, -153, 247, + -83, -87, 453, -111, 321, 321, 215, 215, -657, -409, -845, -93, + 17, -43, -243, -55, -215, -327, -219, -435, -207, 133, -71, -447, + 633, -219, 435, 785, -73, -177, 463, -131, 381, 381, 65, 65, + -207, 77, -59, -651, -365, -797, -213, -15, -155, -387, -259, -515, + -540, 355, -182, -150, 990, -231, 582, 747, -36, -540, 520, -215, + 633, 633, 176, 176, -540, -133, -491, -687, -426, -882, -102, 143, + 77, -639, -427, -851, -105, 65, -37, 57, 127, -17, -105, 135, + -51, 60, 195, -30, 78, 78, 31, 31, -105, -57, -125, -45, + 17, -11, -147, -72, -168, -84, -57, -111, -693, 457, -233, 618, + 733, -26, -540, 470, -205, 264, 829, -116, 336, 336, 227, 227, + -693, -440, -900, -72, 153, 107, -609, -294, -698, -342, -229, -455, + -243, 157, -83, 69, 349, -59, -201, 409, -125, 27, 587, -115, + 333, 333, 77, 77, -243, -107, -267, -171, 7, -105, -369, -135, + -379, -339, -227, -451, + }; + + // Change skip_width, skip_height to {2, 3} + cnn_config.layer_config[0].skip_width = 3; + cnn_config.layer_config[0].skip_height = 2; + // Set padding to same + cnn_config.layer_config[0].pad = PADDING_SAME_ZERO; + + RunCNNTest(image_width, image_height, input, expected_12_same, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + // Change padding to valid + cnn_config.layer_config[0].pad = PADDING_VALID; + RunCNNTest(image_width, image_height, input, expected_12_valid, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].filter_width = 4; + cnn_config.layer_config[0].filter_height = 3; + float weights_2[] = { -1, -3, -1, -3, 0, 2, -2, 4, 3, 0, 1, 4 }; + float bias_2[] = { -4 }; + cnn_config.layer_config[0].weights = weights_2; + cnn_config.layer_config[0].bias = bias_2; + + cnn_config.layer_config[0].skip_width = 5; + cnn_config.layer_config[0].skip_height = 2; + float expected_2_same[] = { + -13, -31, -13, -31, -4, -10, -22, -10, -22, -4, -185, -547, + -185, -547, -4, -13, -31, -13, -31, -4, -4, 14, -22, 32, + -4, -4, 8, -16, 20, -4, -4, 358, -366, 720, -4, -4, + 14, -22, 32, -4, -195, -658, -213, -622, -4, -16, -94, -28, + -70, -4, 459, -244, 97, 480, -4, -85, -328, -103, -292, -4, + -4, 432, -440, 868, -4, -4, 56, -64, 116, -4, -4, 156, + -164, 316, -4, -4, 212, -220, 428, -4, 582, -208, 146, 664, + -4, -130, -652, -190, -532, -4, 166, -214, 6, 106, -4, 192, + -388, -24, 44, -4, -4, 132, -140, 268, -4, -4, 428, -436, + 860, -4, -4, 136, -144, 276, -4, -4, 252, -260, 508, -4, + 21, -541, -115, -269, -4, 416, -688, -16, 176, -4, 173, -103, + 33, 177, -4, 168, -640, -88, -128, -4, -4, 354, -362, 712, + 
-4, -4, 452, -460, 908, -4, -4, 62, -70, 128, -4, -4, + 420, -428, 844, -4, 499, -106, 141, 610, -4, 666, -46, 210, + 866, -4, 47, -148, -19, -16, -4, 605, -85, 181, 763, -4, + -4, 64, -72, 132, -4, -4, 24, -32, 52, -4, -4, 92, + -100, 188, -4, -4, 50, -58, 104, -4, -132, -694, -200, -558, + -4, 15, -73, -13, -17, -4, -62, -610, -158, -418, -4, -36, + -343, -90, -235, -4, -4, 456, -464, 916, -4, -4, 42, -50, + 88, -4, -4, 400, -408, 804, -4, -4, 222, -230, 448, -4, + 606, -244, 146, 676, -4, 9, -172, -37, -80, -4, 480, -370, + 76, 438, -4, 223, -340, -3, 112, -4, -4, 156, -164, 316, + -4, -4, 108, -116, 220, -4, -4, 240, -248, 484, -4, -4, + 220, -228, 444, -4, + }; + float expected_2_valid[] = { + -13, -31, -13, -31, -4, -10, -22, -10, -22, -4, -185, -547, + -185, -547, -4, -13, -31, -13, -31, -4, 14, -22, 32, -4, + -4, 8, -16, 20, -4, -4, 358, -366, 720, -4, -4, 14, + -22, 32, -195, -658, -213, -622, -4, -16, -94, -28, -70, -4, + 459, -244, 97, 480, -4, -85, -328, -103, -292, -4, 432, -440, + 868, -4, -4, 56, -64, 116, -4, -4, 156, -164, 316, -4, + -4, 212, -220, 428, 582, -208, 146, 664, -4, -130, -652, -190, + -532, -4, 166, -214, 6, 106, -4, 192, -388, -24, 44, -4, + 132, -140, 268, -4, -4, 428, -436, 860, -4, -4, 136, -144, + 276, -4, -4, 252, -260, 508, 21, -541, -115, -269, -4, 416, + -688, -16, 176, -4, 173, -103, 33, 177, -4, 168, -640, -88, + -128, -4, 354, -362, 712, -4, -4, 452, -460, 908, -4, -4, + 62, -70, 128, -4, -4, 420, -428, 844, 499, -106, 141, 610, + -4, 666, -46, 210, 866, -4, 47, -148, -19, -16, -4, 605, + -85, 181, 763, -4, 64, -72, 132, -4, -4, 24, -32, 52, + -4, -4, 92, -100, 188, -4, -4, 50, -58, 104, -132, -694, + -200, -558, -4, 15, -73, -13, -17, -4, -62, -610, -158, -418, + -4, -36, -343, -90, -235, -4, 456, -464, 916, -4, -4, 42, + -50, 88, -4, -4, 400, -408, 804, -4, -4, 222, -230, 448, + 606, -244, 146, 676, -4, 9, -172, -37, -80, -4, 480, -370, + 76, 438, -4, 223, -340, -3, 112, -4, 156, -164, 316, -4, + -4, 108, -116, 220, -4, -4, 240, -248, 484, -4, -4, 220, + -228, 444, 236, -4, 76, 316, -4, 164, -4, 52, 220, -4, + 362, -4, 118, 484, -4, 332, -4, 108, 444, + }; + // Set padding to same + cnn_config.layer_config[0].pad = PADDING_SAME_ZERO; + + RunCNNTest(image_width, image_height, input, expected_2_same, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].pad = PADDING_VALID; + + RunCNNTest(image_width, image_height, input, expected_2_valid, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].skip_width = 2; + cnn_config.layer_config[0].skip_height = 5; + float expected_21_same[] = { + -31, -19, -49, -191, -565, -194, -574, -13, 14, -22, 44, -16, + 382, -366, 738, -22, -4, 23, 32, 545, 20, 204, 720, 5, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -658, -252, -748, -114, -334, -192, -568, -112, + 432, -440, 928, -64, 276, -164, 532, -220, -4, 304, 868, 266, + 116, 400, 316, 104, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -208, -288, -856, -290, + -862, -202, -598, -132, 132, -140, 700, -436, 1000, -144, 532, -260, + -4, 712, 268, 422, 860, 450, 276, 124, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -541, -411, -1225, -265, -787, -249, -739, -216, 354, -362, 1168, -460, + 974, -70, 552, -428, -4, 859, 712, 323, 908, 665, 128, 208, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -106, -52, -148, -66, -190, -79, -229, -31, + 64, -72, 160, -32, 148, -100, 242, -58, -4, 72, 132, 154, + 52, 125, 188, 
23, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -694, -257, -763, -229, + -679, -319, -949, -117, 456, -464, 962, -50, 492, -408, 1030, -230, + -4, 295, 916, 625, 88, 537, 804, 109, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -244, -140, -412, -182, -538, -238, -706, -116, 156, -164, 428, -116, + 464, -248, 708, -228, -4, 244, 316, 418, 220, 454, 484, 108, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, + }; + float expected_21_valid[] = { + -13, -31, -19, -49, -191, -565, -194, -574, -13, -31, -4, 14, + -22, 44, -16, 382, -366, 738, -22, 32, 23, -4, 23, 32, + 545, 20, 204, 720, 5, 32, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -222, -658, -252, -748, -114, -334, -192, -568, -112, -328, + -4, 432, -440, 928, -64, 276, -164, 532, -220, 428, 650, -4, + 304, 868, 266, 116, 400, 316, 104, 428, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -72, -208, -288, -856, -290, -862, -202, -598, + -132, -388, -4, 132, -140, 700, -436, 1000, -144, 532, -260, 508, + 200, -4, 712, 268, 422, 860, 450, 276, 124, 508, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -183, -541, -411, -1225, -265, -787, + -249, -739, -216, -640, -4, 354, -362, 1168, -460, 974, -70, 552, + -428, 844, 533, -4, 859, 712, 323, 908, 665, 128, 208, 844, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -38, -106, -52, -148, + -66, -190, -79, -229, -31, -85, -4, 64, -72, 160, -32, 148, + -100, 242, -58, 104, 98, -4, 72, 132, 154, 52, 125, 188, + 23, 104, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -234, -694, + -257, -763, -229, -679, -319, -949, -117, -343, -4, 456, -464, 962, + -50, 492, -408, 1030, -230, 448, 686, -4, 295, 916, 625, 88, + 537, 804, 109, 448, -4, -4, -4, -4, -4, -4, -4, -4, + -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, + -84, -244, -140, -412, -182, -538, -238, -706, -116, -340, -4, 156, + -164, 428, -116, 464, -248, 708, -228, 444, 236, -4, 244, 316, + 418, 220, 454, 484, 108, 444, + }; + + cnn_config.layer_config[0].pad = PADDING_SAME_ZERO; + + RunCNNTest(image_width, image_height, input, expected_21_same, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); + + cnn_config.layer_config[0].pad = PADDING_VALID; + + RunCNNTest(image_width, image_height, input, expected_21_valid, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestLargeKernelsAndStrides) { + float input_10x11[] = { + 4, 4, 2, 4, 2, -5, -2, 3, -1, 0, 0, 1, 2, 0, -5, -2, -5, 1, -3, + -1, 4, -3, 2, -2, 1, 0, 1, -3, -3, -4, -2, -2, 1, -4, -1, 4, 1, -4, + -4, -4, 3, 2, -5, 3, -5, 1, 2, -4, 1, -1, 3, 4, -2, 3, -3, 3, 0, + 2, -4, -5, -5, -2, -1, -2, 1, 1, 1, -2, 4, -5, 4, -1, -1, 2, 3, -4, + 2, 2, 3, 0, 0, 1, 0, 3, 2, 3, 1, -2, 3, -4, 3, 2, 4, -2, 0, + 4, -4, 1, -3, -3, -3, -5, 1, -3, -5, 0, 4, -1, -3, 2, + }; + + float weights_10x11[] = { + -3, 4, -4, -3, -5, 1, -2, 3, 1, -4, -4, 0, -1, 0, 3, 1, -3, -2, 0, + -1, 1, 3, -4, -4, -3, -3, -2, 4, 3, -5, 4, 2, -3, 4, -2, -1, 2, -1, + -5, 0, -3, 0, 3, -5, -5, 3, -4, -1, -5, 3, 4, 0, 4, -5, 2, -1, 2, + -1, -1, -1, -5, 0, -4, 3, -1, 1, 1, -1, 3, 2, -5, -4, 0, -4, 4, -5, + -3, 4, -5, 2, -5, -4, -4, -1, 3, 3, 0, 2, -4, 1, -2, 1, 1, 0, 3, + -2, 0, 1, 2, 4, -3, -1, -5, -5, 2, -4, 1, 1, 2, -4, -2, -2, 2, 1, + 3, 4, -5, 1, -1, -3, -3, -1, -2, -5, 1, -1, 0, 1, 4, 4, 0, 0, 4, + -3, -1, -5, -3, 0, 1, 1, 1, -5, 3, 4, 3, -5, 3, -2, -2, 0, -4, 
0, + 0, -2, 1, -4, -1, 0, -5, -2, -2, -5, -3, -3, 1, 1, -3, 2, 4, 2, 4, + -4, -3, 3, 1, 1, 3, -4, 4, -2, -3, -3, -3, -3, -4, -2, 3, -5, 2, 4, + -1, -4, -4, 4, -2, -1, 3, -3, -4, -4, -2, 4, 1, 0, 2, -1, 4, -3, 1, + 4, -3, 4, 4, 0, -4, 3, -2, -3, 2, 3, -1, -3, 2, 1, 4, -2, -3, 1, + 4, -2, 2, -2, -5, -2, 1, 4, -1, -4, 4, -5, 2, -5, -4, -1, -2, 3, 1, + 2, 1, -5, 1, -5, -4, -1, -2, 2, -2, -4, -3, -2, -2, 4, -1, 2, 2, -4, + 2, -2, 4, -4, -2, -2, 1, -1, 1, 1, 1, -4, -5, -2, 3, -4, -1, 3, -2, + 3, 2, -5, -4, 0, 3, -2, -4, -5, 3, -2, -4, 2, -2, 1, -4, 0, 2, -5, + 1, -4, -1, -1, 4, -5, -4, 0, -5, -4, -3, -5, -4, 0, 2, 0, -4, 2, -2, + 1, 1, -3, 2, 0, -4, 0, -4, 1, 0, -5, -1, -1, -1, -5, 4, 2, 2, -4, + 3, -2, -2, 2, -3, -2, -1, 2, -4, -5, 2, -2, -4, -5, -5, -1, 2, -1, 0, + -5, -2, -2, -5, 0, 1, -1, -5, 0, 3, 2, 3, 0, -3, -2, 0, -5, -1, -2, + 2, -4, -1, 2, 2, -5, 2, -4, 0, 3, -3, 1, 0, 0, 1, -5, -3, 1, -1, + 0, -4, -3, 2, -4, -4, 4, -1, 0, 1, 2, -4, -5, 4, -2, 1, -4, -4, -3, + -1, -1, 1, -1, -4, -1, -4, -3, 2, -1, -2, -4, 1, 1, 0, -2, 0, -4, 3, + -3, 0, -4, -1, -4, 2, -1, -2, -5, -1, -2, -3, 3, -1, 0, -3, 0, 1, -5, + 1, -5, 0, 1, + }; + + float bias_10x11[] = { 3 }; + + float expected_10x11[] = { + 118, + }; + + CNN_CONFIG cnn_config = { 1, + 0, + 0, + 0, + 0, + { { + 1, + 23, + 20, + 1, + 15, + 20, + 0, + weights_10x11, + bias_10x11, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + int image_height = 10; + int image_width = 11; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input_10x11, expected_10x11, + &cnn_config, image_width, &thread_data, MSE_INT_TOL); + + float input_11x10[] = { + -2, -2, 3, -5, -1, -3, 1, 3, 2, 1, 1, -5, 4, 1, 3, -5, 3, -3, -5, + 0, -1, -3, -3, 1, 1, -5, -1, -5, -5, -3, 0, 1, -3, -1, -3, -3, 0, 3, + 4, -4, -1, 3, -3, -1, -3, 1, -3, -2, -1, -4, -3, 2, -4, 1, -4, -1, -3, + -5, -1, 2, 3, 0, 2, 2, -5, 4, 1, 2, -1, -4, 4, -4, -4, 0, -1, 1, + -1, 1, -3, -3, -2, 1, 2, 4, 4, 4, -3, -3, 0, 1, 0, 1, 4, 1, 3, + 4, -3, -2, -4, 4, 2, 0, 3, 4, -1, 2, -2, 1, -3, -2, + }; + + float weights_11x10[] = { + 4, -1, 1, -1, 2, 4, 3, 3, -4, 3, -5, 1, -1, -1, -2, -2, 0, 2, -3, + -2, 3, -5, -1, 0, -1, -2, -2, -1, 2, 4, 3, 1, 0, 0, -3, 3, -4, -1, + -5, 4, -2, -2, 1, 2, -1, -3, 1, 2, -5, 1, -3, 3, 3, 0, -4, -4, -5, + -3, -4, -4, 4, -2, 4, 4, -2, 2, -5, -1, -2, -5, -1, 4, -3, 3, -2, 0, + -4, -3, 0, -1, -2, 4, 2, 0, -2, -5, -4, 1, 4, -4, -2, 2, -2, 1, 1, + -4, 1, -4, -4, -2, 4, 2, -1, -5, -5, 1, -3, -3, 3, -3, -5, -3, 4, -1, + -1, -3, 0, -4, 3, -1, 0, -2, 0, -5, -2, -5, 2, 0, -5, 2, 3, -2, 2, + 4, -1, 1, -3, 2, 3, 2, 0, -5, -4, -5, 2, 1, 1, -1, -2, 3, 4, 2, + -2, 4, -2, 3, 1, -4, -3, -1, 4, 4, -3, -5, -2, 2, 0, 3, -2, 3, -1, + -4, 0, -2, 0, 3, 4, -2, -3, -2, 0, 3, 4, 2, -4, 0, 1, 2, 2, -1, + -1, 4, 1, 4, -2, -1, -1, -5, 1, -3, 3, 3, -1, -4, 3, -5, 0, 0, -1, + -4, -1, -2, 4, -2, 3, 3, -3, 1, -1, 2, -1, 4, 4, -2, -2, 4, -2, 0, + 3, -3, -5, -1, -2, 4, -4, 2, -4, 0, -2, 3, -3, 2, 2, -2, -5, -1, 4, + 3, -2, -1, 3, 3, -1, 3, 0, -3, 0, 4, 2, 0, -1, 4, 1, 1, 2, 1, + 3, 1, 1, 1, -3, -5, -4, 4, -4, 2, 0, 0, -4, 1, 4, -5, 4, 4, 0, + 1, 0, -2, -4, -4, -3, 0, 1, -5, 4, 0, -3, -2, -4, 2, 4, 1, -5, 1, + -4, 1, 0, -3, -3, 0, 2, -5, 4, 3, -2, -5, 3, 1, -1, 0, 3, -2, -2, + 3, -2, -5, 4, 1, -2, 2, -1, 0, 4, 0, -5, 3, -2, 1, 2, 1, -5, -3, + -2, -5, 4, -4, 0, 3, 2, -1, -4, -1, 2, 1, -2, 3, -1, -4, 2, 0, -3, + 1, -1, 2, -5, -4, -1, -5, 1, 4, 3, 4, 2, -3, 1, -5, -1, 3, 0, -1, + -4, 3, 4, -5, 4, 4, -3, 2, -3, -1, -3, -5, -3, 2, -3, -2, 1, 
1, 0, + -5, 3, 2, 1, -5, 1, 1, 1, 3, 4, -4, -1, -2, 0, -5, -3, -5, -2, -4, + 3, 3, 3, 4, 0, -4, -1, -5, 0, -3, 1, 4, 4, -4, 4, -5, -5, -1, -2, + -5, 3, -4, 4, 3, 0, -3, 2, -2, 0, 0, 4, 4, 0, -2, 1, -1, -3, 2, + -1, 1, -3, -5, + }; + + float bias_11x10[] = { + -5, + }; + + float expected_11x10[] = { + 36, -84, 95, 45, 18, 46, 77, -54, -99, -149, 66, 49, 161, 11, + 39, 61, -66, 61, 4, -3, 34, -44, -23, 31, 64, 29, 47, 72, + -27, -27, 121, -3, 100, 1, 30, -78, -12, -89, -59, 8, -16, 112, + 91, -102, -26, -4, 30, 54, 4, -84, -24, -58, 27, -53, -33, 5, + 53, -26, 63, 50, -103, -130, -23, 6, -104, -207, 73, 23, 77, 132, + 38, 32, -130, -44, -60, 7, 27, 176, 45, -32, -2, 99, -97, 63, + 69, 126, 47, 63, 136, -57, 5, 16, -40, -157, 8, 38, -44, -10, + 91, 7, 122, 140, 30, -105, 4, -1, 113, 64, 180, 141, + }; + + cnn_config.layer_config[0].weights = weights_11x10; + cnn_config.layer_config[0].bias = bias_11x10; + cnn_config.layer_config[0].filter_width = 20; + cnn_config.layer_config[0].filter_height = 23; + cnn_config.layer_config[0].skip_width = 1; + cnn_config.layer_config[0].skip_height = 1; + image_height = 11; + image_width = 10; + + RunCNNTest(image_width, image_height, input_11x10, expected_11x10, + &cnn_config, image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestSoftsignSingleLayer) { + int image_width = 8; + int image_height = 8; + int filter_height = 5; + int filter_width = 4; + float input[] = { + -0.5220f, 0.8410f, -0.8990f, -0.0090f, 0.6710f, -0.9470f, -0.8240f, + -0.0870f, 0.5380f, 0.4750f, 0.570f, -0.3760f, -0.6960f, -0.5940f, + -0.3830f, 0.080f, -0.0980f, -0.4940f, -0.4030f, 0.9460f, -0.6020f, + 0.4220f, 0.6190f, 0.6640f, -0.9210f, -0.1470f, -0.2480f, -0.1120f, + -0.580f, -0.0650f, 0.3330f, 0.9860f, -0.7430f, 0.7610f, 0.4840f, + 0.1030f, 0.9570f, 0.6120f, -0.5240f, -0.1220f, -0.5850f, -0.270f, + 0.7840f, -0.9790f, 0.7290f, -0.30f, -0.6460f, 0.0780f, 0.4750f, + -0.0510f, 0.4550f, 0.3850f, -0.7230f, 0.4460f, -0.6260f, -0.810f, + 0.8720f, -0.2120f, -0.580f, -0.9510f, -0.8430f, -0.1340f, -0.0850f, + 0.9190f, + }; + float expected_same[] = { + 0.430f, 0.660f, 0.5510f, -0.610f, 0.450f, -0.1610f, 0.0520f, 0.3240f, + 0.6820f, 0.3820f, 0.6360f, 0.7480f, 0.3080f, 0.090f, 0.3910f, 0.1730f, + 0.340f, 0.6660f, -0.4990f, 0.4280f, 0.1540f, 0.120f, 0.4670f, 0.6150f, + -0.3880f, 0.7590f, 0.4190f, 0.7350f, 0.5310f, -0.5160f, -0.1760f, 0.6790f, + -0.6780f, 0.5470f, 0.5750f, -0.6420f, 0.7210f, -0.4620f, 0.5430f, 0.770f, + -0.1990f, 0.3950f, 0.7860f, -0.4380f, 0.7540f, 0.2640f, -0.6430f, 0.4510f, + -0.1260f, 0.1590f, -0.2110f, -0.0560f, 0.6570f, 0.680f, 0.5870f, 0.4720f, + 0.4040f, 0.3630f, 0.670f, 0.2360f, 0.410f, 0.6980f, -0.5350f, 0.3940f, + }; + float expected_replicate[] = { + 0.540f, 0.7230f, -0.3530f, -0.2130f, 0.7440f, -0.4470f, -0.6260f, + -0.2050f, 0.7230f, 0.4630f, 0.5920f, 0.7440f, 0.6080f, 0.3130f, + -0.5670f, -0.4720f, 0.5480f, 0.6660f, -0.4990f, 0.4280f, 0.1540f, + 0.120f, 0.3390f, 0.6090f, 0.4160f, 0.7590f, 0.4190f, 0.7350f, + 0.5310f, -0.5160f, -0.490f, 0.4450f, -0.610f, 0.5470f, 0.5750f, + -0.6420f, 0.7210f, -0.4620f, 0.3150f, 0.7370f, -0.5820f, 0.3950f, + 0.7860f, -0.4380f, 0.7540f, 0.2640f, -0.7430f, -0.5340f, -0.6270f, + 0.4430f, 0.4730f, 0.4570f, 0.7450f, 0.630f, 0.2620f, 0.3140f, + -0.1840f, 0.1810f, 0.7210f, 0.2760f, 0.6430f, 0.6720f, -0.4390f, + 0.2040f, + }; + float expected_valid[] = { + 0.6660f, -0.4990f, 0.4280f, 0.1540f, 0.120f, 0.7590f, 0.4190f, + 0.7350f, 0.5310f, -0.5160f, 0.5470f, 0.5750f, -0.6420f, 0.7210f, + -0.4620f, 0.3950f, 0.7860f, -0.4380f, 0.7540f, 
0.2640f, + }; + float weights[] = { + 0.6210f, 0.3710f, -0.2770f, -0.7230f, -0.2450f, 0.6770f, 0.3080f, + -0.9880f, -0.080f, 0.7190f, -0.6760f, -0.0170f, -0.8970f, 0.8260f, + 0.7390f, -0.4550f, -0.4260f, -0.6330f, 0.0880f, -0.9390f, + }; + float bias[] = { + 0.750f, + }; + + CNN_CONFIG cnn_config = { 1, + 0, + 0, + 0, + 0, + { { + 1, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + SOFTSIGN, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected_same, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); + + cnn_config.layer_config[0].pad = PADDING_SAME_REPLICATE; + + RunCNNTest(image_width, image_height, input, expected_replicate, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); + + cnn_config.layer_config[0].pad = PADDING_VALID; + + RunCNNTest(image_width, image_height, input, expected_valid, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); +} + +TEST_F(CNNTest, TestBranchTensorAdd) { + int filter_width = 2; + int filter_height = 3; + + int image_width = 4; + int image_height = 4; + + float input[] = { + -3, -2, -2, 0, -1, 3, 2, -2, 1, 3, 4, 0, 2, -5, -4, 0, + }; + + float weights[] = { + -3, -1, 4, -1, -3, 3, 3, 0, 2, 0, 3, 2, 4, 4, 4, -5, 1, -4, + 2, -4, 1, -3, 0, 4, -5, 4, 0, -4, -3, -1, 0, 0, -2, 0, 0, 2, + -5, -1, 1, -3, 3, 4, 3, 0, 1, -1, 1, 1, 2, 4, -2, -5, 2, -2, + 3, -2, 4, -1, 0, 2, 3, 2, -2, -1, -3, 1, 3, 4, -1, -3, 0, -4, + 4, 2, -3, -3, -1, 0, 1, 0, 3, 3, -3, 0, 3, 2, -5, -3, 4, -5, + 3, -1, -1, -3, 0, 1, -1, -4, 2, 4, -1, 4, -1, 1, 3, 4, 4, 4, + 0, -1, -3, -3, -3, -3, 2, -3, -2, 2, 3, -3, + }; + + float bias[] = { + 3, 4, -1, -1, 2, 1, -2, 1, 4, 1, 3, + }; + + float expected[] = { + -11502, -4101, -3424, 668, -17950, -5470, -5504, 626, + 4835, 446, 1779, -3483, 3679, -4214, 4578, -105, + }; + + int channels = 2; + + CNN_CONFIG cnn_config = { 6, + 0, + 0, + 0, + 0, + { { + 1, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_INPUT, + BRANCH_NOC, + { + 0x02, + 0, + 0x00, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 1, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 1, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_ADD, + { + 0x00, + 0, + 0x02, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + // Weights and biases need to be specified separately because + // of the offset. 
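+ // AssignLayerWeightsBiases() slices the flat arrays above per layer: each + // layer consumes filter_width * filter_height * in_channels * out_channels + // weights and out_channels biases, starting where the previous layer ended.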
+ AssignLayerWeightsBiases(&cnn_config, weights, bias); + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestBranchTensorConcatenation) { + int filter_width = 2; + int filter_height = 3; + + int image_width = 4; + int image_height = 4; + + float input[] = { + -3, -2, -2, 0, -1, 3, 2, -2, 1, 3, 4, 0, 2, -5, -4, 0, + }; + + float weights[] = { + 3, 0, 2, 0, 2, 3, 1, -3, 1, -5, -3, 0, -4, 4, 0, -5, 0, -5, -1, + -2, -5, 0, -3, 2, -4, 2, 0, 2, -1, 0, -4, 3, 0, 0, -1, -5, 2, -1, + 4, -4, -2, -3, -3, 3, 4, -2, -1, -4, -1, 4, 4, -1, 4, 3, -4, 2, -2, + -4, -3, -2, 3, -3, -5, -1, 3, -2, 4, 1, -4, -3, -5, -5, -3, 4, -2, -2, + -1, -5, -5, 0, -1, -2, -3, 3, -4, -5, 2, -3, 1, 0, -5, 2, 2, -2, 0, + 2, 2, -2, 4, 2, 2, 0, 1, -5, -3, 0, 2, -2, 1, 2, -5, 2, 3, 3, + -1, 3, 0, -3, 3, -4, -4, 3, 3, -4, -2, 2, -2, 2, -2, -1, 3, 0, + }; + + float bias[] = { + -3, -5, 4, -4, -3, -2, 0, 3, -4, 4, -3, + }; + + float expected[] = { + -33533, -32087, -6741, -2124, 39979, 41453, 14034, 689, + -22611, -42203, -14882, -239, 15781, 15963, 9524, 837, + }; + + int channels = 2; + + CNN_CONFIG cnn_config = { 6, + 0, + 0, + 0, + 0, + { { + 1, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_INPUT, + BRANCH_NOC, + { + 0x02, + 0, + 0x00, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 1, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 1, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_CAT, + { + 0x00, + 0, + 0x02, + }, + {}, + -1, + }, + { + channels + channels, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + } } }; + + // Weights and biases need to be specified separately because + // of the offset. + AssignLayerWeightsBiases(&cnn_config, weights, bias); + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +// TODO(logangw): Add test to test all combinations of branch_copy_type. 
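+ +// The three-element initializers such as { 0x06, 0, 0x00 } below fill in a +// layer's branch config. The first and last entries are bitmasks over branch +// indices (bit k selects branch k): 0x06 routes a copy of the tensor to +// branches 1 and 2, while a mask like 0x0C names branches 2 and 3 as the ones +// whose outputs are combined back into the current layer.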
+ +TEST_F(CNNTest, TestBranchCombinations) { + int filter_width = 2; + int filter_height = 3; + + int image_width = 4; + int image_height = 4; + + float input[] = { + 3, 2, -5, -4, 4, -2, -4, -3, 4, 2, -3, 2, -3, 1, -5, -1, + }; + + float weights[] = { + 2, 3, 0, 4, 4, 3, 1, 0, 1, -5, 4, -3, 3, 0, 4, -1, -1, -5, + 2, 1, -3, -5, 3, -1, -3, -2, 0, -2, 3, 0, -2, -4, -2, -2, 2, -5, + 4, -5, 0, 1, -5, -4, -3, -4, 2, -2, 1, 0, 3, -2, -4, 3, 4, -4, + -1, -1, -3, -2, -2, -1, 2, 0, 2, -1, 2, -4, -4, -1, 2, 0, 3, -2, + -2, 3, -3, 4, -2, 4, 3, 4, 1, 0, -2, -3, -5, 1, -3, 2, 0, -2, + -2, -1, -1, -5, -2, -3, -1, 3, 3, 4, 4, 0, 2, 1, 3, -3, 2, -5, + -5, 1, -5, -1, 3, 3, 2, -4, -1, 3, -4, -2, -5, -2, 1, 3, 2, 2, + -5, -2, -3, -1, -2, -4, -1, -2, 2, 1, -4, -4, 2, 0, 2, 0, 2, -3, + -2, -4, 4, 0, 1, -3, -5, 4, -1, 2, 3, -5, -1, 0, 4, -1, -1, 3, + -1, -3, 3, 1, 4, 3, 4, 3, -4, -5, -1, 3, 3, -4, 3, 1, 3, -5, + 3, 4, -5, 4, 2, -1, -5, 2, 1, 0, 4, 0, -3, 2, 0, 2, -2, 1, + -1, -2, -1, -5, 4, 3, 3, -2, 2, 4, -5, -5, -3, -2, 4, 0, -4, 1, + }; + + float bias[] = { + -1, 4, 0, 2, 2, -2, 0, -4, -5, -1, 1, -2, 3, 0, 4, -2, 1, 0, 0, + }; + + float expected[] = { + 149496, 15553, -24193, -20956, 134094, 86432, -68283, -6366, + -53031, 133739, 67407, -13539, -53205, -58635, -20033, 1979, + }; + + int channels = 2; + + CNN_CONFIG cnn_config = { 10, + 0, + 0, + 0, + 0, + { + { + 1, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_INPUT, + BRANCH_NOC, + { + 0x06, + 0, + 0x00, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 2, + BRANCH_OUTPUT, + BRANCH_NOC, + { + 0x08, + 0, + 0x00, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 3, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 2, + BRANCH_NO_COPY, + BRANCH_ADD, + { + 0x00, + 0, + 0x08, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 2, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 1, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 1, + BRANCH_NO_COPY, + BRANCH_ADD, + { + 0x00, + 0, + 0x0C, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + channels, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_ADD, + { + 0x00, + 0, + 0x02, + }, + {}, + -1, + }, + { + channels, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + }, + } }; + + // Weights and biases need to be specified separately because + // of the offset. 
+ AssignLayerWeightsBiases(&cnn_config, weights, bias); + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestSplittingTensors) { + int filter_width = 2; + int filter_height = 3; + + int image_width = 4; + int image_height = 4; + + float input[] = { + -1, -1, 2, 1, 3, 2, 4, -3, -4, -2, 2, -3, 1, -3, 4, -2, + }; + + float weights[] = { + -4, 1, 0, 2, 3, 4, 4, -4, -5, -3, 2, 2, -4, -3, 3, 2, + 4, -4, -3, -4, -4, 1, -3, -5, -3, 4, 2, -2, 2, -1, -4, -1, + -2, -3, 1, 1, 0, -5, -1, 3, 3, -5, -3, 0, -3, 1, -3, -1, + 1, -3, -2, -2, 4, -2, 0, 1, 2, 2, -4, 2, 4, 0, -5, -2, + 4, 4, -5, 1, 0, 2, -2, -5, -5, -3, -5, -5, 4, -3, 0, 0, + -4, -4, 0, -5, -4, 0, 0, -3, -5, -3, -1, 2, -1, 4, -1, 2, + }; + + float bias[] = { + -4, -2, -3, -3, 3, 1, -2, + }; + + float expected[] = { + 530, -762, 1469, 777, 849, -771, -1698, 600, + -658, -1821, 98, -668, -1798, 30, 887, -971, + }; + + CNN_CONFIG cnn_config = { 3, + 0, + 0, + 0, + 0, + { + { + 1, + filter_width, + filter_height, + 4, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_OUTPUT, + BRANCH_NOC, + { + 0x02, + 2, + 0x00, + }, + {}, + -1, + }, + { + 4, + filter_width, + filter_height, + 2, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_CAT, + { + 0x00, + 0, + 0x02, + }, + {}, + -1, + }, + { + 4, + filter_width, + filter_height, + 1, + 1, + 1, + 0, + nullptr, + nullptr, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + }, + } }; + + // Weights and biases need to be specified separately because + // of the offset. + AssignLayerWeightsBiases(&cnn_config, weights, bias); + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_INT_TOL); +} + +TEST_F(CNNTest, TestOutputChannelsCount) { + int filter_width = 1; + int filter_height = 1; + + int image_width = 2; + int image_height = 2; + + float input[] = { 0, 0, 0, 0 }; + + float weights[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + + float bias[] = { 0, 0, 0, 0, 0, 0 }; + + float expected[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; + + CNN_CONFIG cnn_config = { 3, + 0, + 0, + 0, + 0, + { + { + 1, + filter_width, + filter_height, + 2, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_INPUT, + BRANCH_NOC, + { + 0x06, + 0, + 0x00, + }, + {}, + -1, + }, + { + 1, + filter_width, + filter_height, + 2, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 2, + BRANCH_NO_COPY, + BRANCH_CAT, + { + 0x00, + 0, + 0x03, + }, + {}, + -1, + }, + { + 2, + filter_width, + filter_height, + 2, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_CAT, + { + 0x00, + 0, + 0x04, + }, + {}, + 0, + }, + } }; + + // Weights and biases need to be specified separately because + // of the offset. 
+ AssignLayerWeightsBiases(&cnn_config, weights, bias); + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); +} + +TEST_F(CNNTest, TestBatchNorm) { + int image_width = 28; + int image_height = 28; + int filter_height = 7; + int filter_width = 7; + float input[] = { + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0117647f, 0.0705882f, 0.0705882f, 0.0705882f, + 0.494118f, 0.533333f, 0.686275f, 0.101961f, 0.65098f, 1.0f, + 0.968627f, 0.498039f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.117647f, 0.141176f, 0.368627f, 0.603922f, + 0.666667f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, + 0.882353f, 0.67451f, 0.992157f, 0.94902f, 0.764706f, 0.25098f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.192157f, + 0.933333f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, + 0.992157f, 0.992157f, 0.992157f, 0.984314f, 0.364706f, 0.321569f, + 0.321569f, 0.219608f, 0.152941f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0705882f, 0.858824f, 0.992157f, + 0.992157f, 0.992157f, 0.992157f, 0.992157f, 0.776471f, 0.713725f, + 0.968627f, 0.945098f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.313725f, 0.611765f, 0.419608f, 0.992157f, + 0.992157f, 0.803922f, 0.0431373f, 0.0f, 0.168627f, 0.603922f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.054902f, 0.00392157f, 0.603922f, 0.992157f, 0.352941f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.545098f, 0.992157f, 0.745098f, 0.00784314f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0431373f, + 0.745098f, 0.992157f, 0.27451f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.137255f, 0.945098f, + 0.882353f, 0.627451f, 0.423529f, 0.00392157f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.317647f, 0.941176f, 0.992157f, + 0.992157f, 0.466667f, 0.0980392f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 
0.0f, 0.0f, + 0.0f, 0.0f, 0.176471f, 0.729412f, 0.992157f, 0.992157f, + 0.588235f, 0.105882f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0627451f, 0.364706f, 0.988235f, 0.992157f, 0.733333f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.976471f, 0.992157f, 0.976471f, 0.25098f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.180392f, 0.509804f, 0.717647f, 0.992157f, + 0.992157f, 0.811765f, 0.00784314f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.152941f, 0.580392f, + 0.898039f, 0.992157f, 0.992157f, 0.992157f, 0.980392f, 0.713725f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0941176f, 0.447059f, 0.866667f, 0.992157f, 0.992157f, 0.992157f, + 0.992157f, 0.788235f, 0.305882f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0901961f, 0.258824f, 0.835294f, 0.992157f, + 0.992157f, 0.992157f, 0.992157f, 0.776471f, 0.317647f, 0.00784314f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0705882f, 0.670588f, + 0.858824f, 0.992157f, 0.992157f, 0.992157f, 0.992157f, 0.764706f, + 0.313725f, 0.0352941f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.215686f, 0.67451f, 0.886275f, 0.992157f, 0.992157f, 0.992157f, + 0.992157f, 0.956863f, 0.521569f, 0.0431373f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.533333f, 0.992157f, + 0.992157f, 0.992157f, 0.831373f, 0.529412f, 0.517647f, 0.0627451f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f + }; + float expected[] = { + -0.836424f, -0.857365f, -1.62739f, -1.62739f, -0.836424f, 5.40742f, + 0.920853f, -0.692567f, -0.836424f, -0.534405f, -1.62739f, -0.836424f, + 1.32602f, 1.36312f, 0.112766f, -0.836424f, -0.192962f, 1.56975f, + 2.45777f, 0.944414f, -0.192962f, -1.5519f, -1.5519f, -0.554006f, + -0.192962f, 1.4231f, -1.5519f, -0.192962f, 1.3661f, -1.5519f, + -1.5519f, -0.192962f, -0.843708f, -0.359025f, -0.843708f, -0.843708f, + -0.843708f, 4.53065f, 0.0429584f, -0.796804f, -0.843708f, 0.3473f, + -0.843708f, -0.843708f, -0.114439f, 3.14817f, 0.0811934f, -0.843708f + }; + float kernel[] = { + 0.119643f, -0.237864f, 0.0462892f, 0.0502297f, -0.0134528f, + 0.146347f, 0.153133f, 0.0513307f, 0.0752369f, 0.0135557f, + -0.111434f, 0.0941854f, 0.0788362f, 0.0299412f, 0.111762f, + 0.144066f, 0.00431504f, -0.0177954f, 0.0738092f, -0.0344215f, + 0.0832582f, 0.053989f, -0.112691f, 0.0962145f, 0.0186525f, + 
-0.00660205f, -0.111962f, -0.126801f, -0.231625f, 0.17309f, + 0.0748875f, -0.179569f, -0.00513812f, -0.156579f, -0.147322f, + 0.184168f, 0.189308f, -0.200359f, -0.0156733f, 0.140649f, + 0.0858496f, -0.0263217f, -0.0740749f, -0.112563f, 0.107528f, + 0.0609729f, -0.221625f, 0.0769944f, -0.00900815f, -0.00136441f, + -0.0236521f, -0.0418025f, -0.00286299f, 0.12241f, 0.0964093f, + -0.0150897f, 0.0532171f, 0.0625916f, 0.116939f, 0.118024f, + 0.161918f, -0.00909767f, 0.100897f, -0.054563f, -0.175179f, + -0.0687892f, 0.00734235f, 0.109833f, -0.113776f, 0.0595405f, + -0.170255f, 0.0124815f, -0.0363301f, -0.0127038f, 0.0445554f, + -0.0729894f, 0.107428f, -0.0341417f, 0.132619f, 0.00984557f, + -0.00443654f, 0.202929f, 0.0945134f, 0.0148725f, 0.00998574f, + -0.0226449f, 0.0478197f, -0.0793442f, 0.0707599f, -0.084225f, + 0.0865795f, 0.071104f, -0.047894f, 0.0838322f, 0.0635493f, + -0.00370265f, -0.157247f, -0.0289622f, -0.0590963f, 0.13207f, + 0.00468011f, -0.0345372f, 0.217939f, 0.18861f, -0.0290393f, + -0.0440664f, 0.0126197f, -0.129132f, -0.124943f, 0.0968156f, + -0.0853643f, -0.182305f, 0.00461618f, -0.147095f, -0.230282f, + 0.00856019f, 0.0278893f, -0.0300229f, 0.0417871f, 0.0804717f, + -0.0768571f, -0.0397085f, -0.0601096f, 0.100901f, -0.0184926f, + 0.0350673f, 0.0971094f, -0.0171837f, -0.289644f, -0.0899041f, + 0.08998f, -0.160319f, -0.0195103f, 0.0392167f, -0.137864f, + -0.0136294f, 0.0330886f, -0.0409244f, -0.092533f, -0.0427934f, + -0.191144f, -0.0969461f, 0.112035f, 0.138611f, 0.128717f, + 0.191184f, 0.197462f + }; + float bias[] = { 0.186703f, 0.204358f, -0.0230452f }; + + float bn_gamma[] = { 1.32173f, 1.26171f, 1.21966f }; + float bn_beta[] = { -0.232595f, -0.222652f, -0.232209f }; + float bn_mean[] = { 0.329233f, 0.199894f, 0.12389f }; + float bn_std[] = { 0.311986f, 0.189737f, 0.247104f }; + + CNN_BATCHNORM_PARAMS bn_params = { + bn_gamma, + bn_beta, + bn_mean, + bn_std, + }; + + CNN_CONFIG cnn_config = { + 1, + 0, + 0, + 0, + 0, + { + { + 1, + filter_width, + filter_height, + 3, + 7, + 7, + 0, + kernel, + bias, + PADDING_VALID, + RELU, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + bn_params, + 0, + }, + }, + }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); +} + +TEST_F(CNNTest, TestMultithreading) { + int image_height = 2; + int image_width = 2; + int filter_height = 3; + int filter_width = 3; + + float input[] = { + -2, + 4, + 1, + 0, + }; + + float weights[] = { + -4, 2, -2, 0, -4, 4, -3, -3, -3, -1, 1, 0, -5, -3, 0, -5, 0, 0, + -1, 0, 2, -5, 0, 1, 4, 2, 1, 0, -2, -1, -5, -3, 2, -2, 1, -5, + }; + + float bias[] = { + -4, + -3, + -2, + 3, + }; + + float expected[] = { + 2, 10, -8, -17, -24, 5, -15, 6, -5, -5, 7, -10, 4, 13, 9, -14, + }; + + CNN_CONFIG cnn_config = { + 1, + 0, + 0, + 0, + 0, + { + { + 1, + filter_width, + filter_height, + 4, + 1, + 1, + 0, + weights, + bias, + PADDING_SAME_ZERO, + NONE, + 0, + 0, + BRANCH_NO_COPY, + BRANCH_NOC, + {}, + {}, + 0, + }, + }, + }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); + + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + AVxWorker workers[4]; + + for (int i = 0; i < 4; ++i) { + winterface->init(&workers[i]); + } + + thread_data = { 4, workers }; + + RunCNNTest(image_width, image_height, input, expected, &cnn_config, + image_width, &thread_data, MSE_FLOAT_TOL); + + for (int 
i = 0; i < 4; ++i) { + winterface->end(&workers[i]); + } +} + +TEST_F(CNNTest, TestMultiOutput) { + const int image_dim = 8; + const int image_ch = 3; + const int filter_dim = 2; + const int stride = 2; + const int num_filters = 2; + + const float input_[] = { + 1.7537929121f, 0.134331551012f, 0.123580039877f, 0.957731845246f, + 0.391006834217f, 1.00699352042f, -0.778177955829f, -0.814166433059f, + -0.656374394915f, 0.321967305228f, -2.19455719176f, 0.708035038966f, + 0.409148822266f, -0.318254408902f, 0.152450211189f, -0.250210793369f, + 0.826811563186f, 1.6804156584f, 0.273626975978f, 0.437936241887f, + -0.329935520167f, -0.288761611645f, 0.156937008304f, 0.271054157295f, + -0.0224828854332f, 1.70110336895f, -0.989066699309f, 1.30863131729f, + -0.165813705702f, 0.00380178619265f, -0.0837342367587f, 0.760954783156f, + -0.413610373524f, 1.17968204175f, 0.720295719536f, 0.308718974472f, + -1.10091337671f, 0.693160033687f, -0.0202862320697f, 1.0221927503f, + -1.24521801881f, -0.478501952308f, -1.71648619442f, -0.182571723636f, + 0.339292649504f, 2.0806519131f, 0.967974033444f, 0.175248672328f, + 0.0658124561472f, 0.795504169496f, 0.750592557361f, -1.46631013249f, + -1.79052846838f, -1.03672179515f, -0.841985521653f, 1.20995011489f, + 0.140859718215f, -0.651552622661f, 0.451065110806f, 1.1189443693f, + 0.100213260593f, -0.834076868118f, -1.28734321611f, 1.22064420095f, + -0.364143084361f, 0.750961509335f, -0.888689074553f, -0.8253547106f, + -1.21800999027f, -0.966670603566f, 1.37384014741f, 0.47281264834f, + -0.420416235531f, 0.520163906493f, 0.501296589423f, 1.53418976951f, + 0.715234751485f, 0.644551588907f, 0.0763504863375f, -0.0018541943723f, + 0.322853189656f, -0.795099723224f, -0.125177096675f, 1.4476577471f, + -0.585888410088f, -1.44391754955f, -0.610543221933f, -0.221859179799f, + 0.252060200774f, -0.86287169623f, -0.0350246229157f, 1.0932311997f, + 0.899464648842f, -0.468806951704f, -0.300861137168f, 1.15776414206f, + 1.03268544738f, -0.171579585622f, -0.179136557119f, -0.354091003368f, + -0.612298249394f, -1.20237379258f, 1.54604109659f, 0.130664370287f, + 0.885225111868f, 1.0362799581f, 0.980561720868f, -0.619379186999f, + -1.33818929924f, -0.237233737961f, -1.89335425073f, 0.567821011321f, + 0.862420368465f, -1.37380916821f, 0.352190056666f, 0.611261516274f, + 0.393237747152f, 0.894686247967f, 0.190405182149f, 0.264872662911f, + -0.0657009133797f, 0.0580512653493f, -0.401825294366f, 0.4106081318f, + 0.49484512188f, -0.0751103149442f, -1.43243736382f, 1.79855656009f, + -1.1075351975f, 0.000354882733011f, -0.950716438608f, 1.27129831688f, + 1.00495189838f, 0.110358656713f, 1.08315032822f, -0.972676676218f, + -0.0757668962831f, 1.88932045165f, -0.0672638136275f, 0.425913010161f, + -0.781540372017f, 0.976000248609f, 0.687218504122f, 1.31374513445f, + -0.932658930672f, -1.25339468479f, 0.422071294078f, -0.24189927912f, + 0.216906604642f, -1.88720997548f, 1.99252872889f, 0.353943735777f, + 0.737434784132f, -1.17848645017f, 1.70424254896f, 0.775297112968f, + -0.516392797501f, 0.398130609129f, 0.737248101457f, 0.166282500886f, + 1.24699015468f, 0.47116183125f, 1.19091180182f, -0.372695424578f, + 0.219773209389f, -0.829467838962f, -0.52533122724f, 1.98707754595f, + 0.553692606972f, -0.933228902369f, 1.55427751643f, -1.08813399144f, + -0.325686682094f, 0.205091443796f, -1.70381666435f, 0.466465327942f, + 1.73126863447f, -0.939133672634f, 1.48318077459f, -0.599414038168f, + -1.1583078687f, 0.518116190201f, 0.133571482458f, 0.84958342672f, + 1.02205000597f, -0.0772082009087f, -1.69567503859f, 
1.4697939436f, + 1.67813743122f, -0.627911582938f, 0.131380509137f, -1.35717850726f, + }; + const float *input[3] = { input_, &input_[image_dim * image_dim], + &input_[2 * image_dim * image_dim] }; + + const float bias[] = { 0.0f, 0.0f }; + + const float weights_1[] = { + -0.489547413618f, 0.141916424749f, -0.279286485585f, -0.115322211094f, + 0.299572786936f, 0.205289980785f, -0.536254480088f, -0.253626313744f, + -0.422883815849f, -0.169702966298f, -0.540104704793f, 0.495319646763f, + 0.298799079422f, -0.10054550901f, -0.306085047056f, 0.171061886165f, + -0.108058703878f, -0.410734629888f, -0.0640674673049f, -0.386524840979f, + -0.157203423678f, -0.362138920529f, -0.216206085209f, 0.147502517971f, + }; + + const float weights_2[] = { + 0.207580604357f, 0.480821146263f, -0.29111909562f, 0.47422567493f, + 0.206892553253f, -0.235067084092f, 0.354516800602f, -0.212399370252f, + -0.419071343731f, -0.050350731631f, -0.0516457320279f, -0.0359310500731f, + 0.567044864811f, -0.060341127522f, 0.0501464839637f, -0.437785677916f, + }; + + const float weights_3[] = { + -0.0690452401448f, -0.356657338763f, -0.219464031809f, 0.551288365843f, + 0.181372090853f, -0.00245268542109f, 0.409000696276f, -0.593209108763f, + 0.587352566749f, -0.243720660227f, 0.266232713887f, -0.00439285245097f, + 0.252883228305f, 0.152646192631f, 0.0918944932026f, 0.398853715057f, + }; + + const float weights_4[] = { + 0.207560791573f, 0.194201350401f, 0.227802322443f, 0.206533663345f, + 0.0557331066805f, 0.0224159800424f, -0.143939197467f, -0.27703361602f, + 0.130643888389f, -0.269456557461f, 0.186242862864f, -0.162879944774f, + -0.145503996718f, -0.0768822987581f, -0.203127976359f, -0.238119922873f, + -0.258806479994f, 0.0357957680385f, -0.1027606976f, -0.287920082345f, + 0.189047820993f, 0.250711538481f, -0.272815714175f, -0.0431449742024f, + 0.207261230996f, -0.0396472677451f, 0.131236557412f, 0.174291832499f, + -0.251515885765f, -0.107164007499f, 0.185824534748f, -0.00561585838161f, + 0.273393799578f, -0.139563699075f, -0.263922456031f, -0.118859844081f, + 0.109230982597f, -0.170170294794f, 0.0123025648515f, -0.0839368964355f, + -0.0774058234297f, 0.255847138286f, -0.208430879637f, 0.279170114319f, + -0.272890330712f, -0.217725903006f, -0.295923275459f, -0.17008723953f, + -0.284281803405f, 0.281406323629f, 0.266910044663f, -0.209963914338f, + 0.271980962964f, 0.142013581699f, -0.143896509026f, -0.290509242975f, + -0.305768180935f, 0.196902832117f, -0.090424189662f, -0.147460802346f, + 0.217722016651f, 0.12353848977f, -0.169177363577f, -0.0454230918512f, + }; + + const float expected_0[] = { + -2.04858441055f, -2.12883075791f, -0.045177363807f, 0.763949675768f, + -0.544361512821f, -1.58123168032f, 1.89319847039f, 0.16859080901f, + -1.16023321135f, -0.396988107751f, 1.76637090744f, -1.40434786514f, + 0.908227575669f, 0.817064817605f, 0.215631134908f, -0.848605613428f, + -0.106756747018f, 0.0193027166685f, 0.801345615113f, -0.395407237598f, + -1.79983795658f, -1.73054496242f, 0.0584392594454f, -0.388786095569f, + -0.237269619354f, 0.000843578271263f, -1.24043512104f, 0.487839445893f, + -0.394259726605f, 0.559632843424f, -0.527224052291f, -1.53792340282f, + }; + + const float expected_1[] = { + 0.0f, 0.0f, 0.0f, 0.0f, 0.4057888292f, 0.325309571755f, + 0.0f, 1.22013465602f, + }; + + const float expected_2[] = { + 0.156119444687f, + 0.517385299817f, + }; + + const float expected_3[] = { + 0.224177852984f, + 0.503384419034f, + 0.156119444687f, + 0.517385299817f, + }; + + const float *expected[] = { expected_0, expected_1, 
expected_2, expected_3 }; + + CNN_CONFIG cnn_config = { + 4, // num_layers + 0, // is_residue + 0, // ext_width + 0, // ext_height + 0, // strict_bounds + { + // layer_config + { + image_ch, // in_channels + filter_dim, // filter_width + filter_dim, // filter_height + num_filters, // out_channels + stride, // skip_width + stride, // skip_height + 0, // max_pool + weights_1, // weights + bias, // bias + PADDING_SAME_ZERO, // pad + NONE, // activation + 0, // deconvolve + 0, // branch + BRANCH_OUTPUT, // branch_copy_type + BRANCH_NOC, // branch_combine_type + { 2, 0, 0 }, // branch_config + {}, // bn_params + 0, // output_num + }, + { + num_filters, // in_channels + filter_dim, // filter_width + filter_dim, // filter_height + num_filters, // out_channels + stride, // skip_width + stride, // skip_height + 0, // max_pool + weights_2, // weights + bias, // bias + PADDING_SAME_ZERO, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + {}, // branch_config + {}, // bn_params + 1, // output_num + }, + { + num_filters, // in_channels + filter_dim, // filter_width + filter_dim, // filter_height + num_filters, // out_channels + stride, // skip_width + stride, // skip_height + 0, // max_pool + weights_3, // weights + bias, // bias + PADDING_SAME_ZERO, // pad + RELU, // activation + 0, // deconvolve + 0, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_NOC, // branch_combine_type + {}, // branch_config + {}, // bn_params + 2, // output_num + }, + { + num_filters, // in_channels + 2 * filter_dim, // filter_width + 2 * filter_dim, // filter_height + num_filters, // out_channels + 2 * stride, // skip_width + 2 * stride, // skip_height + 0, // max_pool + weights_4, // weights + bias, // bias + PADDING_VALID, // pad + RELU, // activation + 0, // deconvolve + 1, // branch + BRANCH_NO_COPY, // branch_copy_type + BRANCH_CAT, // branch_combine_type + { 0, 0, 1 }, // branch_config + {}, // bn_params + 3, // output_num + }, + }, + }; + + CNN_THREAD_DATA thread_data = { 1, NULL }; + + const int num_outputs = 4; + const int output_chs[4] = { filter_dim, filter_dim, filter_dim, + 2 * filter_dim }; + const int output_dims[4] = { 4, 2, 1, 1 }; + const int output_sizes[4] = { + output_chs[0] * output_dims[0] * output_dims[0], + output_chs[1] * output_dims[1] * output_dims[1], + output_chs[2] * output_dims[2] * output_dims[2], + output_chs[3] * output_dims[3] * output_dims[3], + }; + float *const output_ = (float *)aom_malloc( + sizeof(*output_) * + (output_sizes[0] + output_sizes[1] + output_sizes[2] + output_sizes[3])); + float *output[CNN_MAX_CHANNELS] = { nullptr }; + int ch_ite = 0; + float *output_ite = output_; + for (int output_idx = 0; output_idx < num_outputs; output_idx++) { + for (int channel = 0; channel < output_chs[output_idx]; ++channel) { + output[ch_ite++] = output_ite; + output_ite += output_dims[output_idx] * output_dims[output_idx]; + } + } + CNN_MULTI_OUT output_struct = { num_outputs, output_chs, output_dims, + output }; + + RunMultiOutCNNTest(input, image_dim, image_dim, image_dim, &cnn_config, + &thread_data, &output_struct, expected, MSE_FLOAT_TOL); + + aom_free(output_); +} diff --git a/libs/libaom/src/test/codec_factory.h b/libs/libaom/src/test/codec_factory.h new file mode 100644 index 000000000..801b8948f --- /dev/null +++ b/libs/libaom/src/test/codec_factory.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_CODEC_FACTORY_H_
+#define AOM_TEST_CODEC_FACTORY_H_
+
+#include <tuple>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_decoder.h"
+#include "aom/aom_encoder.h"
+#if CONFIG_AV1_ENCODER
+#include "aom/aomcx.h"
+#endif
+#if CONFIG_AV1_DECODER
+#include "aom/aomdx.h"
+#endif
+
+#include "test/decode_test_driver.h"
+#include "test/encode_test_driver.h"
+namespace libaom_test {
+
+const int kCodecFactoryParam = 0;
+
+class CodecFactory {
+ public:
+  CodecFactory() {}
+
+  virtual ~CodecFactory() {}
+
+  virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const = 0;
+
+  virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg,
+                                 const aom_codec_flags_t flags) const = 0;
+
+  virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
+                                 const aom_codec_flags_t init_flags,
+                                 TwopassStatsStore *stats) const = 0;
+
+  virtual aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
+                                               unsigned int usage) const = 0;
+};
+
+/* Provide CodecTestWithParams classes for a variable number of parameters
+ * to avoid having to include a pointer to the CodecFactory in every test
+ * definition.
+ */
+template <class T1>
+class CodecTestWithParam
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1> > {};
+
+template <class T1, class T2>
+class CodecTestWith2Params
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2> > {};
+
+template <class T1, class T2, class T3>
+class CodecTestWith3Params
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2, T3> > {};
+
+template <class T1, class T2, class T3, class T4>
+class CodecTestWith4Params
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2, T3, T4> > {};
+
+template <class T1, class T2, class T3, class T4, class T5>
+class CodecTestWith5Params
+    : public ::testing::TestWithParam<
+          std::tuple<const libaom_test::CodecFactory *, T1, T2, T3, T4, T5> > {
+};
+
+/*
+ * AV1 Codec Definitions
+ */
+class AV1Decoder : public Decoder {
+ public:
+  explicit AV1Decoder(aom_codec_dec_cfg_t cfg) : Decoder(cfg) {}
+
+  AV1Decoder(aom_codec_dec_cfg_t cfg, const aom_codec_flags_t flag)
+      : Decoder(cfg, flag) {}
+
+ protected:
+  virtual aom_codec_iface_t *CodecInterface() const {
+#if CONFIG_AV1_DECODER
+    return aom_codec_av1_dx();
+#else
+    return NULL;
+#endif
+  }
+};
+
+class AV1Encoder : public Encoder {
+ public:
+  AV1Encoder(aom_codec_enc_cfg_t cfg, const aom_codec_flags_t init_flags,
+             TwopassStatsStore *stats)
+      : Encoder(cfg, init_flags, stats) {}
+
+ protected:
+  virtual aom_codec_iface_t *CodecInterface() const {
+#if CONFIG_AV1_ENCODER
+    return aom_codec_av1_cx();
+#else
+    return NULL;
+#endif
+  }
+};
+
+class AV1CodecFactory : public CodecFactory {
+ public:
+  AV1CodecFactory() : CodecFactory() {}
+
+  virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg) const {
+    return CreateDecoder(cfg, 0);
+  }
+
+  virtual Decoder *CreateDecoder(aom_codec_dec_cfg_t cfg,
+                                 const aom_codec_flags_t flags) const {
+#if CONFIG_AV1_DECODER
+    return new AV1Decoder(cfg, flags);
+#else
+    (void)cfg;
+    (void)flags;
+    return NULL;
+#endif
+  }
+
+  virtual Encoder *CreateEncoder(aom_codec_enc_cfg_t cfg,
+                                 const aom_codec_flags_t init_flags,
+                                 TwopassStatsStore *stats) const {
+#if CONFIG_AV1_ENCODER
+    return new AV1Encoder(cfg, init_flags, stats);
+#else
+    (void)cfg;
+    (void)init_flags;
+    (void)stats;
+    return NULL;
+#endif
+  }
+
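+  // Each override here degrades gracefully when the corresponding codec is
+  // compiled out: it returns NULL (or AOM_CODEC_INCAPABLE below) instead of
+  // referencing a missing codec interface, so the same tests build in
+  // encoder-only and decoder-only configurations.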
+  virtual aom_codec_err_t DefaultEncoderConfig(aom_codec_enc_cfg_t *cfg,
+                                               unsigned int usage) const {
+#if CONFIG_AV1_ENCODER
+    return aom_codec_enc_config_default(aom_codec_av1_cx(), cfg, usage);
+#else
+    (void)cfg;
+    (void)usage;
+    return AOM_CODEC_INCAPABLE;
+#endif
+  }
+};
+
+const libaom_test::AV1CodecFactory kAV1;
+
+#define AV1_INSTANTIATE_TEST_CASE(test, ...)                                \
+  INSTANTIATE_TEST_SUITE_P(                                                 \
+      AV1, test,                                                            \
+      ::testing::Combine(                                                   \
+          ::testing::Values(static_cast<const libaom_test::CodecFactory *>( \
+              &libaom_test::kAV1)),                                         \
+          __VA_ARGS__))
+
+}  // namespace libaom_test
+#endif  // AOM_TEST_CODEC_FACTORY_H_
diff --git a/libs/libaom/src/test/coding_path_sync.cc b/libs/libaom/src/test/coding_path_sync.cc
new file mode 100644
index 000000000..4c613dc03
--- /dev/null
+++ b/libs/libaom/src/test/coding_path_sync.cc
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+
+#include "config/aom_config.h"
+
+#include "aom/aomcx.h"
+#include "aom/aomdx.h"
+#include "aom/aom_encoder.h"
+#include "aom/aom_decoder.h"
+
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+using libaom_test::ACMRandom;
+namespace {
+
+class CompressedSource {
+ public:
+  explicit CompressedSource(int seed) : rnd_(seed), frame_count_(0) {
+    aom_codec_iface_t *algo = aom_codec_av1_cx();
+
+    aom_codec_enc_cfg_t cfg;
+    aom_codec_enc_config_default(algo, &cfg, 0);
+
+    // force the quantizer, to reduce the sensitivity on encoding choices.
+    // e.g., we don't want this test to break when the rate control is
+    // modified.
+    {
+      const int max_q = cfg.rc_max_quantizer;
+      const int min_q = cfg.rc_min_quantizer;
+      const int q = rnd_.PseudoUniform(max_q - min_q + 1) + min_q;
+
+      cfg.rc_end_usage = AOM_Q;
+      cfg.rc_max_quantizer = q;
+      cfg.rc_min_quantizer = q;
+    }
+
+    // choose the picture size
+    {
+      width_ = rnd_.PseudoUniform(kWidth - 8) + 8;
+      height_ = rnd_.PseudoUniform(kHeight - 8) + 8;
+    }
+
+    // choose the chroma subsampling
+    {
+      const aom_img_fmt_t fmts[] = {
+        AOM_IMG_FMT_I420,
+        AOM_IMG_FMT_I422,
+        AOM_IMG_FMT_I444,
+      };
+
+      format_ = fmts[rnd_.PseudoUniform(NELEMENTS(fmts))];
+    }
+
+    cfg.g_w = width_;
+    cfg.g_h = height_;
+    cfg.g_lag_in_frames = 0;
+    if (format_ == AOM_IMG_FMT_I420)
+      cfg.g_profile = 0;
+    else if (format_ == AOM_IMG_FMT_I444)
+      cfg.g_profile = 1;
+    else if (format_ == AOM_IMG_FMT_I422)
+      cfg.g_profile = 2;
+
+    aom_codec_enc_init(&enc_, algo, &cfg, 0);
+  }
+
+  ~CompressedSource() { aom_codec_destroy(&enc_); }
+
+  const aom_codec_cx_pkt_t *ReadFrame() {
+    uint8_t buf[kWidth * kHeight * 3] = { 0 };
+
+    // render a regular pattern
+    const int period = rnd_.Rand8() % 32 + 1;
+    const int phase = rnd_.Rand8() % period;
+
+    const int val_a = rnd_.Rand8();
+    const int val_b = rnd_.Rand8();
+
+    for (int i = 0; i < (int)sizeof buf; ++i)
+      buf[i] = (i + phase) % period < period / 2 ?
val_a : val_b; + + aom_image_t img; + aom_img_wrap(&img, format_, width_, height_, 0, buf); + aom_codec_encode(&enc_, &img, frame_count_++, 1, 0); + + aom_codec_iter_t iter = NULL; + + const aom_codec_cx_pkt_t *pkt = NULL; + + do { + pkt = aom_codec_get_cx_data(&enc_, &iter); + } while (pkt && pkt->kind != AOM_CODEC_CX_FRAME_PKT); + + return pkt; + } + + private: + static const int kWidth = 128; + static const int kHeight = 128; + + ACMRandom rnd_; + aom_img_fmt_t format_; + aom_codec_ctx_t enc_; + int frame_count_; + int width_, height_; +}; + +// lowers an aom_image_t to a easily comparable/printable form +std::vector Serialize(const aom_image_t *img) { + std::vector bytes; + bytes.reserve(img->d_w * img->d_h * 3); + for (int plane = 0; plane < 3; ++plane) { + const int w = aom_img_plane_width(img, plane); + const int h = aom_img_plane_height(img, plane); + + for (int r = 0; r < h; ++r) { + for (int c = 0; c < w; ++c) { + unsigned char *row = img->planes[plane] + r * img->stride[plane]; + if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) + bytes.push_back(row[c * 2]); + else + bytes.push_back(row[c]); + } + } + } + + return bytes; +} + +class Decoder { + public: + explicit Decoder(int allowLowbitdepth) { + aom_codec_iface_t *algo = aom_codec_av1_dx(); + + aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t(); + cfg.allow_lowbitdepth = allowLowbitdepth; + + aom_codec_dec_init(&dec_, algo, &cfg, 0); + } + + ~Decoder() { aom_codec_destroy(&dec_); } + + std::vector decode(const aom_codec_cx_pkt_t *pkt) { + aom_codec_decode(&dec_, static_cast(pkt->data.frame.buf), + pkt->data.frame.sz, NULL); + + aom_codec_iter_t iter = NULL; + return Serialize(aom_codec_get_frame(&dec_, &iter)); + } + + private: + aom_codec_ctx_t dec_; +}; + +// Try to reveal a mismatch between LBD and HBD coding paths. +TEST(CodingPathSync, SearchForHbdLbdMismatch) { + const int count_tests = 10; + for (int i = 0; i < count_tests; ++i) { + Decoder dec_hbd(0); + Decoder dec_lbd(1); + + CompressedSource enc(i); + + for (int k = 0; k < 3; ++k) { + const aom_codec_cx_pkt_t *frame = enc.ReadFrame(); + + std::vector lbd_yuv = dec_lbd.decode(frame); + std::vector hbd_yuv = dec_hbd.decode(frame); + + ASSERT_EQ(lbd_yuv, hbd_yuv); + } + } +} + +TEST(CodingPathSyncLarge, SearchForHbdLbdMismatchLarge) { + const int count_tests = 100; + const int seed = 1234; + for (int i = 0; i < count_tests; ++i) { + Decoder dec_hbd(0); + Decoder dec_lbd(1); + + CompressedSource enc(seed + i); + + for (int k = 0; k < 5; ++k) { + const aom_codec_cx_pkt_t *frame = enc.ReadFrame(); + + std::vector lbd_yuv = dec_lbd.decode(frame); + std::vector hbd_yuv = dec_hbd.decode(frame); + + ASSERT_EQ(lbd_yuv, hbd_yuv); + } + } +} + +} // namespace diff --git a/libs/libaom/src/test/comp_avg_pred_test.cc b/libs/libaom/src/test/comp_avg_pred_test.cc new file mode 100644 index 000000000..ac625a79d --- /dev/null +++ b/libs/libaom/src/test/comp_avg_pred_test.cc @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "test/comp_avg_pred_test.h" + +using libaom_test::ACMRandom; +using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest; +using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest; +#if CONFIG_AV1_HIGHBITDEPTH +using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest; +using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest; +#endif +using std::make_tuple; +using std::tuple; + +namespace { + +TEST_P(AV1DISTWTDCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } + +TEST_P(AV1DISTWTDCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGTest, + libaom_test::AV1DISTWTDCOMPAVG::BuildParams( + aom_dist_wtd_comp_avg_pred_ssse3)); +#endif + +TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(0)); +} + +TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) { + RunCheckOutput(GET_PARAM(0)); +} + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest, + libaom_test::AV1DISTWTDCOMPAVG::BuildParams( + aom_dist_wtd_comp_avg_upsampled_pred_ssse3)); +#endif + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(AV1HighBDDISTWTDCOMPAVGTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(1)); +} + +TEST_P(AV1HighBDDISTWTDCOMPAVGTest, CheckOutput) { + RunCheckOutput(GET_PARAM(1)); +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGTest, + libaom_test::AV1DISTWTDCOMPAVG::BuildParams( + aom_highbd_dist_wtd_comp_avg_pred_sse2, 1)); +#endif + +TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(1)); +} + +TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) { + RunCheckOutput(GET_PARAM(1)); +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, + libaom_test::AV1DISTWTDCOMPAVG::BuildParams( + aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2)); +#endif +#endif // CONFIG_AV1_HIGHBITDEPTH + +} // namespace diff --git a/libs/libaom/src/test/comp_avg_pred_test.h b/libs/libaom/src/test/comp_avg_pred_test.h new file mode 100644 index 000000000..7f73312c4 --- /dev/null +++ b/libs/libaom/src/test/comp_avg_pred_test.h @@ -0,0 +1,569 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_TEST_COMP_AVG_PRED_TEST_H_ +#define AOM_TEST_COMP_AVG_PRED_TEST_H_ + +#include + +#include "config/aom_dsp_rtcd.h" + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/util.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "av1/common/common_data.h" +#include "aom_ports/aom_timer.h" + +namespace libaom_test { +const int kMaxSize = 128 + 32; // padding + +namespace AV1DISTWTDCOMPAVG { + +typedef void (*distwtdcompavg_func)(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param); + +typedef void (*distwtdcompavgupsampled_func)( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search); + +typedef std::tuple DISTWTDCOMPAVGParam; + +typedef std::tuple + DISTWTDCOMPAVGUPSAMPLEDParam; + +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*highbddistwtdcompavgupsampled_func)( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, + int subpel_search); + +typedef std::tuple + HighbdDISTWTDCOMPAVGUPSAMPLEDParam; + +typedef std::tuple + HighbdDISTWTDCOMPAVGParam; + +::testing::internal::ParamGenerator BuildParams( + distwtdcompavg_func filter, int is_hbd) { + (void)is_hbd; + return ::testing::Combine(::testing::Range(8, 13, 2), + ::testing::Values(filter), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} + +::testing::internal::ParamGenerator +BuildParams(highbddistwtdcompavgupsampled_func filter) { + return ::testing::Combine(::testing::Range(8, 13, 2), + ::testing::Values(filter), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +::testing::internal::ParamGenerator BuildParams( + distwtdcompavg_func filter) { + return ::testing::Combine(::testing::Values(filter), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} + +::testing::internal::ParamGenerator BuildParams( + distwtdcompavgupsampled_func filter) { + return ::testing::Combine(::testing::Values(filter), + ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); +} + +class AV1DISTWTDCOMPAVGTest + : public ::testing::TestWithParam { + public: + ~AV1DISTWTDCOMPAVGTest() {} + void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } + void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunCheckOutput(distwtdcompavg_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(1); + + uint8_t pred8[kMaxSize * kMaxSize]; + uint8_t ref8[kMaxSize * kMaxSize]; + uint8_t output[kMaxSize * kMaxSize]; + uint8_t output2[kMaxSize * kMaxSize]; + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand8(); + ref8[i * w + j] = rnd_.Rand8(); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + for (int ii = 0; ii < 2; ii++) { + for (int jj = 0; jj < 4; jj++) { + dist_wtd_comp_params.fwd_offset = 
quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + + const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); + aom_dist_wtd_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c, + in_w, in_h, ref8 + offset_r * w + offset_c, + in_w, &dist_wtd_comp_params); + test_impl(output2, pred8 + offset_r * w + offset_c, in_w, in_h, + ref8 + offset_r * w + offset_c, in_w, &dist_wtd_comp_params); + + for (int i = 0; i < in_h; ++i) { + for (int j = 0; j < in_w; ++j) { + int idx = i * in_w + j; + ASSERT_EQ(output[idx], output2[idx]) + << "Mismatch at unit tests for AV1DISTWTDCOMPAVGTest\n" + << in_w << "x" << in_h << " Pixel mismatch at index " << idx + << " = (" << i << ", " << j << ")"; + } + } + } + } + } + void RunSpeedTest(distwtdcompavg_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(1); + + uint8_t pred8[kMaxSize * kMaxSize]; + uint8_t ref8[kMaxSize * kMaxSize]; + uint8_t output[kMaxSize * kMaxSize]; + uint8_t output2[kMaxSize * kMaxSize]; + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand8(); + ref8[i * w + j] = rnd_.Rand8(); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + + const int num_loops = 1000000000 / (in_w + in_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + aom_dist_wtd_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w, + &dist_wtd_comp_params); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast(aom_usec_timer_elapsed(&timer)); + printf("distwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(output2, pred8, in_w, in_h, ref8, in_w, &dist_wtd_comp_params); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast(aom_usec_timer_elapsed(&timer1)); + printf("distwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time1 / num_loops); + } + + libaom_test::ACMRandom rnd_; +}; // class AV1DISTWTDCOMPAVGTest + +class AV1DISTWTDCOMPAVGUPSAMPLEDTest + : public ::testing::TestWithParam { + public: + ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() {} + void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } + void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunCheckOutput(distwtdcompavgupsampled_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(1); + + uint8_t pred8[kMaxSize * kMaxSize]; + uint8_t ref8[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand8(); + ref8[i * w + j] = rnd_.Rand8(); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + int sub_x_q3, sub_y_q3; + int subpel_search; + for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + ++subpel_search) 
{ + for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) { + for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { + for (int ii = 0; ii < 2; ii++) { + for (int jj = 0; jj < 4; jj++) { + dist_wtd_comp_params.fwd_offset = + quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.bck_offset = + quant_dist_lookup_table[ii][jj][1]; + + const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); + + aom_dist_wtd_comp_avg_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, output, + pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3, + sub_y_q3, ref8 + offset_r * w + offset_c, in_w, + &dist_wtd_comp_params, subpel_search); + test_impl(NULL, NULL, 0, 0, NULL, output2, + pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3, + sub_y_q3, ref8 + offset_r * w + offset_c, in_w, + &dist_wtd_comp_params, subpel_search); + + for (int i = 0; i < in_h; ++i) { + for (int j = 0; j < in_w; ++j) { + int idx = i * in_w + j; + ASSERT_EQ(output[idx], output2[idx]) + << "Mismatch at unit tests for " + "AV1DISTWTDCOMPAVGUPSAMPLEDTest\n" + << in_w << "x" << in_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << sub_y_q3 << ", " + << sub_x_q3 << ")"; + } + } + } + } + } + } + } + } + void RunSpeedTest(distwtdcompavgupsampled_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(1); + + uint8_t pred8[kMaxSize * kMaxSize]; + uint8_t ref8[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(16, uint8_t, output[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, output2[MAX_SB_SQUARE]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand8(); + ref8[i * w + j] = rnd_.Rand8(); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + + int sub_x_q3 = 0; + int sub_y_q3 = 0; + + const int num_loops = 1000000000 / (in_w + in_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. 
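+    // Benchmarking approach: the C reference and the SIMD candidate below
+    // each run the same iteration count (num_loops is scaled by block size,
+    // so total work stays roughly constant across sizes), and each
+    // aom_usec_timer total is divided by the loop count to report an average
+    // per-call cost.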
+ + for (int i = 0; i < num_loops; ++i) + aom_dist_wtd_comp_avg_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, output, pred8, in_w, in_h, sub_x_q3, sub_y_q3, + ref8, in_w, &dist_wtd_comp_params, subpel_search); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast(aom_usec_timer_elapsed(&timer)); + printf("distwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(NULL, NULL, 0, 0, NULL, output2, pred8, in_w, in_h, sub_x_q3, + sub_y_q3, ref8, in_w, &dist_wtd_comp_params, subpel_search); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast(aom_usec_timer_elapsed(&timer1)); + printf("distwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time1 / num_loops); + } + + libaom_test::ACMRandom rnd_; +}; // class AV1DISTWTDCOMPAVGUPSAMPLEDTest + +#if CONFIG_AV1_HIGHBITDEPTH +class AV1HighBDDISTWTDCOMPAVGTest + : public ::testing::TestWithParam { + public: + ~AV1HighBDDISTWTDCOMPAVGTest() {} + void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } + + void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunCheckOutput(distwtdcompavg_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + uint16_t pred8[kMaxSize * kMaxSize]; + uint16_t ref8[kMaxSize * kMaxSize]; + uint16_t output[kMaxSize * kMaxSize]; + uint16_t output2[kMaxSize * kMaxSize]; + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + for (int ii = 0; ii < 2; ii++) { + for (int jj = 0; jj < 4; jj++) { + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + + const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); + aom_highbd_dist_wtd_comp_avg_pred_c( + CONVERT_TO_BYTEPTR(output), + CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h, + CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, + &dist_wtd_comp_params); + test_impl(CONVERT_TO_BYTEPTR(output2), + CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, + in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, + in_w, &dist_wtd_comp_params); + + for (int i = 0; i < in_h; ++i) { + for (int j = 0; j < in_w; ++j) { + int idx = i * in_w + j; + ASSERT_EQ(output[idx], output2[idx]) + << "Mismatch at unit tests for AV1HighBDDISTWTDCOMPAVGTest\n" + << in_w << "x" << in_h << " Pixel mismatch at index " << idx + << " = (" << i << ", " << j << ")"; + } + } + } + } + } + void RunSpeedTest(distwtdcompavg_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + uint16_t pred8[kMaxSize * kMaxSize]; + uint16_t ref8[kMaxSize * kMaxSize]; + uint16_t output[kMaxSize * kMaxSize]; + uint16_t output2[kMaxSize * kMaxSize]; + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + } + const int in_w = 
block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + + const int num_loops = 1000000000 / (in_w + in_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + aom_highbd_dist_wtd_comp_avg_pred_c( + CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8), in_w, in_h, + CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast(aom_usec_timer_elapsed(&timer)); + printf("highbddistwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8), in_w, + in_h, CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast(aom_usec_timer_elapsed(&timer1)); + printf("highbddistwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h, + 1000.0 * elapsed_time1 / num_loops); + } + + libaom_test::ACMRandom rnd_; +}; // class AV1HighBDDISTWTDCOMPAVGTest + +class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest + : public ::testing::TestWithParam { + public: + ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() {} + void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } + void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunCheckOutput(highbddistwtdcompavgupsampled_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + uint16_t pred8[kMaxSize * kMaxSize]; + uint16_t ref8[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(16, uint16_t, output[kMaxSize * kMaxSize]); + DECLARE_ALIGNED(16, uint16_t, output2[kMaxSize * kMaxSize]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + int sub_x_q3, sub_y_q3; + int subpel_search; + for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + ++subpel_search) { + for (sub_x_q3 = 0; sub_x_q3 < 8; ++sub_x_q3) { + for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { + for (int ii = 0; ii < 2; ii++) { + for (int jj = 0; jj < 4; jj++) { + dist_wtd_comp_params.fwd_offset = + quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.bck_offset = + quant_dist_lookup_table[ii][jj][1]; + + const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); + const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); + + aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output), + CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, + in_h, sub_x_q3, sub_y_q3, + CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd, + &dist_wtd_comp_params, subpel_search); + test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2), + CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, + in_w, in_h, sub_x_q3, sub_y_q3, + CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, + in_w, bd, &dist_wtd_comp_params, subpel_search); + + for (int i = 0; 
i < in_h; ++i) { + for (int j = 0; j < in_w; ++j) { + int idx = i * in_w + j; + ASSERT_EQ(output[idx], output2[idx]) + << "Mismatch at unit tests for " + "AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest\n" + << in_w << "x" << in_h << " Pixel mismatch at index " + << idx << " = (" << i << ", " << j + << "), sub pixel offset = (" << sub_y_q3 << ", " + << sub_x_q3 << ")"; + } + } + } + } + } + } + } + } + void RunSpeedTest(highbddistwtdcompavgupsampled_func test_impl) { + const int w = kMaxSize, h = kMaxSize; + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + uint16_t pred8[kMaxSize * kMaxSize]; + uint16_t ref8[kMaxSize * kMaxSize]; + DECLARE_ALIGNED(16, uint16_t, output[kMaxSize * kMaxSize]); + DECLARE_ALIGNED(16, uint16_t, output2[kMaxSize * kMaxSize]); + + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + pred8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + ref8[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + } + const int in_w = block_size_wide[block_idx]; + const int in_h = block_size_high[block_idx]; + + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; + + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + int sub_x_q3 = 0; + int sub_y_q3 = 0; + const int num_loops = 1000000000 / (in_w + in_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. + for (int i = 0; i < num_loops; ++i) + aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output), + CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3, + CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params, + subpel_search); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast(aom_usec_timer_elapsed(&timer)); + printf("highbddistwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, + in_h, 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2), + CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3, + CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params, + subpel_search); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast(aom_usec_timer_elapsed(&timer1)); + printf("highbddistwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, + in_h, 1000.0 * elapsed_time1 / num_loops); + } + + libaom_test::ACMRandom rnd_; +}; // class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest +#endif // CONFIG_AV1_HIGHBITDEPTH + +} // namespace AV1DISTWTDCOMPAVG +} // namespace libaom_test + +#endif // AOM_TEST_COMP_AVG_PRED_TEST_H_ diff --git a/libs/libaom/src/test/comp_mask_variance_test.cc b/libs/libaom/src/test/comp_mask_variance_test.cc new file mode 100644 index 000000000..b666306a3 --- /dev/null +++ b/libs/libaom/src/test/comp_mask_variance_test.cc @@ -0,0 +1,577 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+#include <tuple>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/variance.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/reconinter.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1CompMaskVariance {
+typedef void (*comp_mask_pred_func)(uint8_t *comp_pred, const uint8_t *pred,
+                                    int width, int height, const uint8_t *ref,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask);
+
+#if HAVE_SSSE3 || HAVE_SSE2 || HAVE_AVX2
+const BLOCK_SIZE kValidBlockSize[] = {
+  BLOCK_8X8,   BLOCK_8X16,  BLOCK_8X32,  BLOCK_16X8, BLOCK_16X16,
+  BLOCK_16X32, BLOCK_32X8,  BLOCK_32X16, BLOCK_32X32,
+};
+#endif
+typedef std::tuple<comp_mask_pred_func, BLOCK_SIZE> CompMaskPredParam;
+
+class AV1CompMaskVarianceTest
+    : public ::testing::TestWithParam<CompMaskPredParam> {
+ public:
+  ~AV1CompMaskVarianceTest();
+  void SetUp();
+
+  void TearDown();
+
+ protected:
+  void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv);
+  void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize);
+  bool CheckResult(int width, int height) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int idx = y * width + x;
+        if (comp_pred1_[idx] != comp_pred2_[idx]) {
+          printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, y, x);
+          printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]);
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  libaom_test::ACMRandom rnd_;
+  uint8_t *comp_pred1_;
+  uint8_t *comp_pred2_;
+  uint8_t *pred_;
+  uint8_t *ref_buffer_;
+  uint8_t *ref_;
+};
+
+AV1CompMaskVarianceTest::~AV1CompMaskVarianceTest() { ; }
+
+void AV1CompMaskVarianceTest::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  av1_init_wedge_masks();
+  comp_pred1_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  comp_pred2_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  pred_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE);
+  ref_buffer_ = (uint8_t *)aom_memalign(16, MAX_SB_SQUARE + (8 * MAX_SB_SIZE));
+  ref_ = ref_buffer_ + (8 * MAX_SB_SIZE);
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    pred_[i] = rnd_.Rand8();
+  }
+  for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) {
+    ref_buffer_[i] = rnd_.Rand8();
+  }
+}
+
+void AV1CompMaskVarianceTest::TearDown() {
+  aom_free(comp_pred1_);
+  aom_free(comp_pred2_);
+  aom_free(pred_);
+  aom_free(ref_buffer_);
+  libaom_test::ClearSystemState();
+}
+
+void AV1CompMaskVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
+                                             BLOCK_SIZE bsize, int inv) {
+  const int w = block_size_wide[bsize];
+  const int h = block_size_high[bsize];
+  const int wedge_types = get_wedge_types_lookup(bsize);
+  for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+    const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize);
+
+    aom_comp_mask_pred_c(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w,
+                         inv);
+    test_impl(comp_pred2_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, inv);
+
+    ASSERT_EQ(CheckResult(w, h), true)
+        << " wedge " << wedge_index << " inv " << inv;
+  }
+}
+
+void
AV1CompMaskVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl, + BLOCK_SIZE bsize) { + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int wedge_types = get_wedge_types_lookup(bsize); + int wedge_index = wedge_types / 2; + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + const int num_loops = 1000000000 / (w + h); + + comp_mask_pred_func funcs[2] = { aom_comp_mask_pred_c, test_impl }; + double elapsed_time[2] = { 0 }; + for (int i = 0; i < 2; ++i) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + comp_mask_pred_func func = funcs[i]; + for (int j = 0; j < num_loops; ++j) { + func(comp_pred1_, pred_, w, h, ref_, MAX_SB_SIZE, mask, w, 0); + } + aom_usec_timer_mark(&timer); + double time = static_cast(aom_usec_timer_elapsed(&timer)); + elapsed_time[i] = 1000.0 * time / num_loops; + } + printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0], + elapsed_time[1]); + printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]); +} + +TEST_P(AV1CompMaskVarianceTest, CheckOutput) { + // inv = 0, 1 + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0); + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1); +} + +TEST_P(AV1CompMaskVarianceTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(0), GET_PARAM(1)); +} + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, AV1CompMaskVarianceTest, + ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3), + ::testing::ValuesIn(kValidBlockSize))); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1CompMaskVarianceTest, + ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2), + ::testing::ValuesIn(kValidBlockSize))); +#endif + +#ifndef aom_comp_mask_pred +// can't run this test if aom_comp_mask_pred is defined to aom_comp_mask_pred_c +class AV1CompMaskUpVarianceTest : public AV1CompMaskVarianceTest { + public: + ~AV1CompMaskUpVarianceTest(); + + protected: + void RunCheckOutput(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv); + void RunSpeedTest(comp_mask_pred_func test_impl, BLOCK_SIZE bsize, + int havSub); +}; + +AV1CompMaskUpVarianceTest::~AV1CompMaskUpVarianceTest() { ; } + +void AV1CompMaskUpVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl, + BLOCK_SIZE bsize, int inv) { + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int wedge_types = get_wedge_types_lookup(bsize); + int subpel_search; + for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; + ++subpel_search) { + // loop through subx and suby + for (int sub = 0; sub < 8 * 8; ++sub) { + int subx = sub & 0x7; + int suby = (sub >> 3); + for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + // ref + aom_comp_mask_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, comp_pred1_, pred_, w, h, subx, suby, ref_, + MAX_SB_SIZE, mask, w, inv, subpel_search); + + aom_comp_mask_pred = test_impl; // test + aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred2_, pred_, + w, h, subx, suby, ref_, MAX_SB_SIZE, mask, + w, inv, subpel_search); + ASSERT_EQ(CheckResult(w, h), true) + << " wedge " << wedge_index << " inv " << inv << "sub (" << subx + << "," << suby << ")"; + } + } + } +} + +void AV1CompMaskUpVarianceTest::RunSpeedTest(comp_mask_pred_func test_impl, + BLOCK_SIZE bsize, int havSub) { + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int subx = havSub ? 3 : 0; + const int suby = havSub ? 
4 : 0; + const int wedge_types = get_wedge_types_lookup(bsize); + int wedge_index = wedge_types / 2; + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + const int num_loops = 1000000000 / (w + h); + comp_mask_pred_func funcs[2] = { &aom_comp_mask_pred_c, test_impl }; + double elapsed_time[2] = { 0 }; + int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. + for (int i = 0; i < 2; ++i) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + aom_comp_mask_pred = funcs[i]; + for (int j = 0; j < num_loops; ++j) { + aom_comp_mask_upsampled_pred(NULL, NULL, 0, 0, NULL, comp_pred1_, pred_, + w, h, subx, suby, ref_, MAX_SB_SIZE, mask, w, + 0, subpel_search); + } + aom_usec_timer_mark(&timer); + double time = static_cast(aom_usec_timer_elapsed(&timer)); + elapsed_time[i] = 1000.0 * time / num_loops; + } + printf("CompMaskUp[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, elapsed_time[0], + elapsed_time[1]); + printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]); +} + +TEST_P(AV1CompMaskUpVarianceTest, CheckOutput) { + // inv mask = 0, 1 + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0); + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1); +} + +TEST_P(AV1CompMaskUpVarianceTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 1); +} + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, AV1CompMaskUpVarianceTest, + ::testing::Combine(::testing::Values(&aom_comp_mask_pred_ssse3), + ::testing::ValuesIn(kValidBlockSize))); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1CompMaskUpVarianceTest, + ::testing::Combine(::testing::Values(&aom_comp_mask_pred_avx2), + ::testing::ValuesIn(kValidBlockSize))); +#endif + +#endif // ifndef aom_comp_mask_pred + +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*highbd_comp_mask_pred_func)(uint8_t *comp_pred8, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask); + +typedef std::tuple + HighbdCompMaskPredParam; + +class AV1HighbdCompMaskVarianceTest + : public ::testing::TestWithParam { + public: + ~AV1HighbdCompMaskVarianceTest(); + void SetUp(); + + void TearDown(); + + protected: + void RunCheckOutput(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, + int inv); + void RunSpeedTest(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize); + bool CheckResult(int width, int height) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + const int idx = y * width + x; + if (comp_pred1_[idx] != comp_pred2_[idx]) { + printf("%dx%d mismatch @%d(%d,%d) ", width, height, idx, y, x); + printf("%d != %d ", comp_pred1_[idx], comp_pred2_[idx]); + return false; + } + } + } + return true; + } + + libaom_test::ACMRandom rnd_; + uint16_t *comp_pred1_; + uint16_t *comp_pred2_; + uint16_t *pred_; + uint16_t *ref_buffer_; + uint16_t *ref_; +}; + +AV1HighbdCompMaskVarianceTest::~AV1HighbdCompMaskVarianceTest() { ; } + +void AV1HighbdCompMaskVarianceTest::SetUp() { + rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); + av1_init_wedge_masks(); + + comp_pred1_ = + (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred1_)); + comp_pred2_ = + (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*comp_pred2_)); + pred_ = (uint16_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*pred_)); + ref_buffer_ = (uint16_t *)aom_memalign( + 16, (MAX_SB_SQUARE + (8 * MAX_SB_SIZE)) * sizeof(*ref_buffer_)); + ref_ = ref_buffer_ + (8 * MAX_SB_SIZE); +} + +void AV1HighbdCompMaskVarianceTest::TearDown() { + 
aom_free(comp_pred1_); + aom_free(comp_pred2_); + aom_free(pred_); + aom_free(ref_buffer_); + libaom_test::ClearSystemState(); +} + +void AV1HighbdCompMaskVarianceTest::RunCheckOutput( + highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) { + int bd_ = GET_PARAM(2); + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int wedge_types = get_wedge_types_lookup(bsize); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) { + ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + + for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + aom_highbd_comp_mask_pred_c( + CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv); + + test_impl(CONVERT_TO_BYTEPTR(comp_pred2_), CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, inv); + + ASSERT_EQ(CheckResult(w, h), true) + << " wedge " << wedge_index << " inv " << inv; + } +} + +void AV1HighbdCompMaskVarianceTest::RunSpeedTest( + highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize) { + int bd_ = GET_PARAM(2); + + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int wedge_types = get_wedge_types_lookup(bsize); + int wedge_index = wedge_types / 2; + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) { + ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + const int num_loops = 1000000000 / (w + h); + + highbd_comp_mask_pred_func funcs[2] = { aom_highbd_comp_mask_pred_c, + test_impl }; + double elapsed_time[2] = { 0 }; + for (int i = 0; i < 2; ++i) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + highbd_comp_mask_pred_func func = funcs[i]; + for (int j = 0; j < num_loops; ++j) { + func(CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, mask, w, 0); + } + aom_usec_timer_mark(&timer); + double time = static_cast(aom_usec_timer_elapsed(&timer)); + elapsed_time[i] = 1000.0 * time / num_loops; + } + printf("compMask %3dx%-3d: %7.2f/%7.2fns", w, h, elapsed_time[0], + elapsed_time[1]); + printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]); +} + +TEST_P(AV1HighbdCompMaskVarianceTest, CheckOutput) { + // inv = 0, 1 + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0); + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1); +} + +TEST_P(AV1HighbdCompMaskVarianceTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(0), GET_PARAM(1)); +} + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1HighbdCompMaskVarianceTest, + ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2), + ::testing::ValuesIn(kValidBlockSize), + ::testing::Range(8, 13, 2))); +#endif + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AV1HighbdCompMaskVarianceTest, + ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_sse2), + ::testing::ValuesIn(kValidBlockSize), + ::testing::Range(8, 13, 2))); +#endif + +#ifndef aom_highbd_comp_mask_pred +// can't run this test if aom_highbd_comp_mask_pred is defined to +// aom_highbd_comp_mask_pred_c +class AV1HighbdCompMaskUpVarianceTest : public AV1HighbdCompMaskVarianceTest { + public: + 
~AV1HighbdCompMaskUpVarianceTest(); + + protected: + void RunCheckOutput(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, + int inv); + void RunSpeedTest(highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, + int havSub); +}; + +AV1HighbdCompMaskUpVarianceTest::~AV1HighbdCompMaskUpVarianceTest() { ; } + +void AV1HighbdCompMaskUpVarianceTest::RunCheckOutput( + highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int inv) { + (void)test_impl; + int bd_ = GET_PARAM(2); + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int wedge_types = get_wedge_types_lookup(bsize); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) { + ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + + int subpel_search; + for (subpel_search = 1; subpel_search <= 2; ++subpel_search) { + // loop through subx and suby + for (int sub = 0; sub < 8 * 8; ++sub) { + int subx = sub & 0x7; + int suby = (sub >> 3); + for (int wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { + const uint8_t *mask = + av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + // ref + aom_highbd_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred1_), w, h, subx, + suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, bd_, subpel_search); + + aom_highbd_comp_mask_pred_c( + CONVERT_TO_BYTEPTR(comp_pred1_), CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(comp_pred1_), w, mask, w, inv); + + // test + aom_highbd_upsampled_pred( + NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred2_), w, h, subx, + suby, CONVERT_TO_BYTEPTR(ref_), MAX_SB_SIZE, bd_, subpel_search); + + aom_highbd_comp_mask_pred( + CONVERT_TO_BYTEPTR(comp_pred2_), CONVERT_TO_BYTEPTR(pred_), w, h, + CONVERT_TO_BYTEPTR(comp_pred2_), w, mask, w, inv); + + ASSERT_EQ(CheckResult(w, h), true) + << " wedge " << wedge_index << " inv " << inv << "sub (" << subx + << "," << suby << ")"; + } + } + } +} + +void AV1HighbdCompMaskUpVarianceTest::RunSpeedTest( + highbd_comp_mask_pred_func test_impl, BLOCK_SIZE bsize, int havSub) { + int bd_ = GET_PARAM(2); + const int w = block_size_wide[bsize]; + const int h = block_size_high[bsize]; + const int subx = havSub ? 3 : 0; + const int suby = havSub ? 4 : 0; + const int wedge_types = get_wedge_types_lookup(bsize); + int wedge_index = wedge_types / 2; + const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, 1, bsize); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pred_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + for (int i = 0; i < MAX_SB_SQUARE + (8 * MAX_SB_SIZE); ++i) { + ref_buffer_[i] = rnd_.Rand16() & ((1 << bd_) - 1); + } + + const int num_loops = 1000000000 / (w + h); + highbd_comp_mask_pred_func funcs[2] = { &aom_highbd_comp_mask_pred_c, + test_impl }; + double elapsed_time[2] = { 0 }; + for (int i = 0; i < 2; ++i) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + aom_highbd_comp_mask_pred = funcs[i]; + int subpel_search = 2; // set to 1 to test 4-tap filter. 
+ for (int j = 0; j < num_loops; ++j) { + aom_highbd_comp_mask_upsampled_pred( + NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(comp_pred1_), + CONVERT_TO_BYTEPTR(pred_), w, h, subx, suby, CONVERT_TO_BYTEPTR(ref_), + MAX_SB_SIZE, mask, w, 0, bd_, subpel_search); + } + aom_usec_timer_mark(&timer); + double time = static_cast(aom_usec_timer_elapsed(&timer)); + elapsed_time[i] = 1000.0 * time / num_loops; + } + printf("CompMaskUp[%d] %3dx%-3d:%7.2f/%7.2fns", havSub, w, h, elapsed_time[0], + elapsed_time[1]); + printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]); +} + +TEST_P(AV1HighbdCompMaskUpVarianceTest, CheckOutput) { + // inv mask = 0, 1 + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 0); + RunCheckOutput(GET_PARAM(0), GET_PARAM(1), 1); +} + +TEST_P(AV1HighbdCompMaskUpVarianceTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(0), GET_PARAM(1), 1); +} + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1HighbdCompMaskUpVarianceTest, + ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_avx2), + ::testing::ValuesIn(kValidBlockSize), + ::testing::Range(8, 13, 2))); +#endif + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P( + SSE2, AV1HighbdCompMaskUpVarianceTest, + ::testing::Combine(::testing::Values(&aom_highbd_comp_mask_pred_sse2), + ::testing::ValuesIn(kValidBlockSize), + ::testing::Range(8, 13, 2))); +#endif + +#endif // ifndef aom_highbd_comp_mask_pred +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace AV1CompMaskVariance diff --git a/libs/libaom/src/test/convolve_round_test.cc b/libs/libaom/src/test/convolve_round_test.cc new file mode 100644 index 000000000..4f17b5472 --- /dev/null +++ b/libs/libaom/src/test/convolve_round_test.cc @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <assert.h>
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+#define CONVOLVE_ROUNDING_PARAM                                            \
+  const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, \
+      int h, int bits
+
+typedef void (*ConvolveRoundFunc)(CONVOLVE_ROUNDING_PARAM);
+
+typedef void (*ConvolveRoundFuncHbd)(CONVOLVE_ROUNDING_PARAM, int bd);
+
+template <ConvolveRoundFuncHbd fn>
+void highbd_convolve_rounding_8(CONVOLVE_ROUNDING_PARAM) {
+  const int bd = 8;
+  fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
+}
+
+template <ConvolveRoundFuncHbd fn>
+void highbd_convolve_rounding_10(CONVOLVE_ROUNDING_PARAM) {
+  const int bd = 10;
+  fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
+}
+
+template <ConvolveRoundFuncHbd fn>
+void highbd_convolve_rounding_12(CONVOLVE_ROUNDING_PARAM) {
+  const int bd = 12;
+  fn(src, src_stride, dst, dst_stride, w, h, bits, bd);
+}
+
+typedef enum { LOWBITDEPTH_TEST, HIGHBITDEPTH_TEST } DataPathType;
+
+using std::tuple;
+
+typedef tuple<ConvolveRoundFunc, ConvolveRoundFunc, DataPathType>
+    ConvolveRoundParam;
+
+const int kTestNum = 5000;
+
+class ConvolveRoundTest : public ::testing::TestWithParam<ConvolveRoundParam> {
+ protected:
+  ConvolveRoundTest()
+      : func_ref_(GET_PARAM(0)), func_(GET_PARAM(1)), data_path_(GET_PARAM(2)) {
+  }
+  virtual ~ConvolveRoundTest() {}
+
+  virtual void SetUp() {
+    const size_t block_size = 128 * 128;
+    src_ = reinterpret_cast<int32_t *>(
+        aom_memalign(16, block_size * sizeof(*src_)));
+    dst_ref_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size * sizeof(*dst_ref_)));
+    dst_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(16, block_size * sizeof(*dst_)));
+  }
+
+  virtual void TearDown() {
+    aom_free(src_);
+    aom_free(dst_ref_);
+    aom_free(dst_);
+  }
+
+  void ConvolveRoundingRun() {
+    int test_num = 0;
+    const int src_stride = 128;
+    const int dst_stride = 128;
+    int bits = 13;
+    uint8_t *dst = 0;
+    uint8_t *dst_ref = 0;
+
+    if (data_path_ == LOWBITDEPTH_TEST) {
+      dst = reinterpret_cast<uint8_t *>(dst_);
+      dst_ref = reinterpret_cast<uint8_t *>(dst_ref_);
+    } else if (data_path_ == HIGHBITDEPTH_TEST) {
+      dst = CONVERT_TO_BYTEPTR(dst_);
+      dst_ref = CONVERT_TO_BYTEPTR(dst_ref_);
+    } else {
+      assert(0);
+    }
+
+    while (test_num < kTestNum) {
+      int block_size = test_num % BLOCK_SIZES_ALL;
+      int w = block_size_wide[block_size];
+      int h = block_size_high[block_size];
+
+      if (test_num % 2 == 0)
+        bits -= 1;
+      else
+        bits += 1;
+
+      GenerateBufferWithRandom(src_, src_stride, bits, w, h);
+
+      func_ref_(src_, src_stride, dst_ref, dst_stride, w, h, bits);
+      ASM_REGISTER_STATE_CHECK(
+          func_(src_, src_stride, dst, dst_stride, w, h, bits));
+
+      if (data_path_ == LOWBITDEPTH_TEST) {
+        for (int r = 0; r < h; ++r) {
+          for (int c = 0; c < w; ++c) {
+            ASSERT_EQ(dst_ref[r * dst_stride + c], dst[r * dst_stride + c])
+                << "Mismatch at r: " << r << " c: " << c << " w: " << w
+                << " h: " << h << " test: " << test_num;
+          }
+        }
+      } else {
+        for (int r = 0; r < h; ++r) {
+          for (int c = 0; c < w; ++c) {
+            ASSERT_EQ(dst_ref_[r * dst_stride + c], dst_[r * dst_stride + c])
+                << "Mismatch at r: " << r << " c: " << c << " w: " << w
+                << " h: " << h << " test: " << test_num;
+          }
+        }
+      }
+
+      test_num++;
+    }
+  }
+
+  void GenerateBufferWithRandom(int32_t *src, int src_stride, int bits, int w,
+                                int h) {
+    int32_t number;
+    for (int r = 0; r < h; ++r) {
+      for (int c = 0; c < w; ++c) {
+        number = static_cast<int32_t>(rand_.Rand31());
+        number %= 1 << (bits + 9);
+        src[r * src_stride + c] = number;
+      }
+    }
+  }
+
+  ACMRandom rand_;
+  int32_t *src_;
+  uint16_t *dst_ref_;
+  uint16_t *dst_;
+
+  ConvolveRoundFunc func_ref_;
+  ConvolveRoundFunc func_;
+  DataPathType data_path_;
+};
+
+TEST_P(ConvolveRoundTest, BitExactCheck) { ConvolveRoundingRun(); }
+
+using std::make_tuple;
+#if HAVE_AVX2
+const ConvolveRoundParam kConvRndParamArray[] = {
+  make_tuple(&av1_convolve_rounding_c, &av1_convolve_rounding_avx2,
+             LOWBITDEPTH_TEST),
+  make_tuple(&highbd_convolve_rounding_8<av1_highbd_convolve_rounding_c>,
+             &highbd_convolve_rounding_8<av1_highbd_convolve_rounding_avx2>,
+             HIGHBITDEPTH_TEST),
+  make_tuple(&highbd_convolve_rounding_10<av1_highbd_convolve_rounding_c>,
+             &highbd_convolve_rounding_10<av1_highbd_convolve_rounding_avx2>,
+             HIGHBITDEPTH_TEST),
+  make_tuple(&highbd_convolve_rounding_12<av1_highbd_convolve_rounding_c>,
+             &highbd_convolve_rounding_12<av1_highbd_convolve_rounding_avx2>,
+             HIGHBITDEPTH_TEST)
+};
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveRoundTest,
+                         ::testing::ValuesIn(kConvRndParamArray));
+#endif  // HAVE_AVX2
+}  // namespace
diff --git a/libs/libaom/src/test/convolve_test.cc b/libs/libaom/src/test/convolve_test.cc
new file mode 100644
index 000000000..0b1eea16a
--- /dev/null
+++ b/libs/libaom/src/test/convolve_test.cc
@@ -0,0 +1,885 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/filter.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+static const unsigned int kMaxDimension = MAX_SB_SIZE;
+
+typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int filter_x_stride,
+                             const int16_t *filter_y, int filter_y_stride,
+                             int w, int h);
+
+struct ConvolveFunctions {
+  ConvolveFunctions(ConvolveFunc copy, ConvolveFunc h8, ConvolveFunc v8, int bd)
+      : copy_(copy), h8_(h8), v8_(v8), use_highbd_(bd) {}
+
+  ConvolveFunc copy_;
+  ConvolveFunc h8_;
+  ConvolveFunc v8_;
+  int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
+};
+
+typedef std::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
+
+#define ALL_SIZES_64(convolve_fn)                                         \
+  make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn),         \
+      make_tuple(4, 8, &convolve_fn), make_tuple(8, 8, &convolve_fn),     \
+      make_tuple(16, 8, &convolve_fn), make_tuple(8, 16, &convolve_fn),   \
+      make_tuple(16, 16, &convolve_fn), make_tuple(32, 16, &convolve_fn), \
+      make_tuple(16, 32, &convolve_fn), make_tuple(32, 32, &convolve_fn), \
+      make_tuple(64, 32, &convolve_fn), make_tuple(32, 64, &convolve_fn), \
+      make_tuple(64, 64, &convolve_fn)
+
+#define ALL_SIZES(convolve_fn)                                          \
+  make_tuple(128, 64, &convolve_fn), make_tuple(64, 128, &convolve_fn), \
+      make_tuple(128, 128, &convolve_fn), ALL_SIZES_64(convolve_fn)
+
+// Reference 8-tap subpixel filter, slightly modified to fit into this test.
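+// As a hedged sketch of the arithmetic this reference filter performs (the
+// names `acc` and `k` below are expository only, not part of the test), one
+// output pixel of an 8-tap pass is computed as:
+//
+//   int acc = AV1_FILTER_WEIGHT >> 1;                   // rounding term (64)
+//   for (int k = 0; k < 8; ++k) acc += src[k] * filter[k];
+//   uint8_t out = clip_pixel(acc >> AV1_FILTER_SHIFT);  // normalize by 128
+//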
+#define AV1_FILTER_WEIGHT 128
+#define AV1_FILTER_SHIFT 7
+uint8_t clip_pixel(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }
+
+void filter_block2d_8_c(const uint8_t *src_ptr, unsigned int src_stride,
+                        const int16_t *HFilter, const int16_t *VFilter,
+                        uint8_t *dst_ptr, unsigned int dst_stride,
+                        unsigned int output_width, unsigned int output_height) {
+  // Between passes, we use an intermediate buffer whose height is extended to
+  // have enough horizontally filtered values as input for the vertical pass.
+  // This buffer is allocated to be big enough for the largest block type we
+  // support.
+  const int kInterp_Extend = 4;
+  const unsigned int intermediate_height =
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
+  unsigned int i, j;
+
+  assert(intermediate_height > 7);
+
+  // Size of intermediate_buffer is max_intermediate_height * filter_max_width,
+  // where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
+  //                                 + kInterp_Extend
+  //                               = 3 + 16 + 4
+  //                               = 23
+  // and filter_max_width = 16
+  //
+  uint8_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension];
+  const int intermediate_next_stride =
+      1 - static_cast<int>(intermediate_height * output_width);
+
+  // Horizontal pass (src -> transposed intermediate).
+  uint8_t *output_ptr = intermediate_buffer;
+  const int src_next_row_stride = src_stride - output_width;
+  src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+  for (i = 0; i < intermediate_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      // Apply filter...
+      const int temp = (src_ptr[0] * HFilter[0]) + (src_ptr[1] * HFilter[1]) +
+                       (src_ptr[2] * HFilter[2]) + (src_ptr[3] * HFilter[3]) +
+                       (src_ptr[4] * HFilter[4]) + (src_ptr[5] * HFilter[5]) +
+                       (src_ptr[6] * HFilter[6]) + (src_ptr[7] * HFilter[7]) +
+                       (AV1_FILTER_WEIGHT >> 1);  // Rounding
+
+      // Normalize back to 0-255...
+      *output_ptr = clip_pixel(temp >> AV1_FILTER_SHIFT);
+      ++src_ptr;
+      output_ptr += intermediate_height;
+    }
+    src_ptr += src_next_row_stride;
+    output_ptr += intermediate_next_stride;
+  }
+
+  // Vertical pass (transposed intermediate -> dst).
+  src_ptr = intermediate_buffer;
+  const int dst_next_row_stride = dst_stride - output_width;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      // Apply filter...
+      const int temp = (src_ptr[0] * VFilter[0]) + (src_ptr[1] * VFilter[1]) +
+                       (src_ptr[2] * VFilter[2]) + (src_ptr[3] * VFilter[3]) +
+                       (src_ptr[4] * VFilter[4]) + (src_ptr[5] * VFilter[5]) +
+                       (src_ptr[6] * VFilter[6]) + (src_ptr[7] * VFilter[7]) +
+                       (AV1_FILTER_WEIGHT >> 1);  // Rounding
+
+      // Normalize back to 0-255...
+      *dst_ptr++ = clip_pixel(temp >> AV1_FILTER_SHIFT);
+      src_ptr += intermediate_height;
+    }
+    src_ptr += intermediate_next_stride;
+    dst_ptr += dst_next_row_stride;
+  }
+}
+
+void block2d_average_c(uint8_t *src, unsigned int src_stride,
+                       uint8_t *output_ptr, unsigned int output_stride,
+                       unsigned int output_width, unsigned int output_height) {
+  unsigned int i, j;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
+    }
+    output_ptr += output_stride;
+  }
+}
+
+void filter_average_block2d_8_c(const uint8_t *src_ptr,
+                                const unsigned int src_stride,
+                                const int16_t *HFilter, const int16_t *VFilter,
+                                uint8_t *dst_ptr, unsigned int dst_stride,
+                                unsigned int output_width,
+                                unsigned int output_height) {
+  uint8_t tmp[kMaxDimension * kMaxDimension];
+
+  assert(output_width <= kMaxDimension);
+  assert(output_height <= kMaxDimension);
+  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, kMaxDimension,
+                     output_width, output_height);
+  block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride, output_width,
+                    output_height);
+}
+
+void highbd_filter_block2d_8_c(const uint16_t *src_ptr,
+                               const unsigned int src_stride,
+                               const int16_t *HFilter, const int16_t *VFilter,
+                               uint16_t *dst_ptr, unsigned int dst_stride,
+                               unsigned int output_width,
+                               unsigned int output_height, int bd) {
+  // Between passes, we use an intermediate buffer whose height is extended to
+  // have enough horizontally filtered values as input for the vertical pass.
+  // This buffer is allocated to be big enough for the largest block type we
+  // support.
+  const int kInterp_Extend = 4;
+  const unsigned int intermediate_height =
+      (kInterp_Extend - 1) + output_height + kInterp_Extend;
+
+  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
+   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
+   *                                 + kInterp_Extend
+   *                               = 3 + 16 + 4
+   *                               = 23
+   * and filter_max_width = 16
+   */
+  uint16_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension] = { 0 };
+  const int intermediate_next_stride =
+      1 - static_cast<int>(intermediate_height * output_width);
+
+  // Horizontal pass (src -> transposed intermediate).
+  {
+    uint16_t *output_ptr = intermediate_buffer;
+    const int src_next_row_stride = src_stride - output_width;
+    unsigned int i, j;
+    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+    for (i = 0; i < intermediate_height; ++i) {
+      for (j = 0; j < output_width; ++j) {
+        // Apply filter...
+        const int temp = (src_ptr[0] * HFilter[0]) + (src_ptr[1] * HFilter[1]) +
+                         (src_ptr[2] * HFilter[2]) + (src_ptr[3] * HFilter[3]) +
+                         (src_ptr[4] * HFilter[4]) + (src_ptr[5] * HFilter[5]) +
+                         (src_ptr[6] * HFilter[6]) + (src_ptr[7] * HFilter[7]) +
+                         (AV1_FILTER_WEIGHT >> 1);  // Rounding
+
+        // Normalize back to 0-255...
+        *output_ptr = clip_pixel_highbd(temp >> AV1_FILTER_SHIFT, bd);
+        ++src_ptr;
+        output_ptr += intermediate_height;
+      }
+      src_ptr += src_next_row_stride;
+      output_ptr += intermediate_next_stride;
+    }
+  }
+
+  // Vertical pass (transposed intermediate -> dst).
+  {
+    const uint16_t *interm_ptr = intermediate_buffer;
+    const int dst_next_row_stride = dst_stride - output_width;
+    unsigned int i, j;
+    for (i = 0; i < output_height; ++i) {
+      for (j = 0; j < output_width; ++j) {
+        // Apply filter...
+        const int temp =
+            (interm_ptr[0] * VFilter[0]) + (interm_ptr[1] * VFilter[1]) +
+            (interm_ptr[2] * VFilter[2]) + (interm_ptr[3] * VFilter[3]) +
+            (interm_ptr[4] * VFilter[4]) + (interm_ptr[5] * VFilter[5]) +
+            (interm_ptr[6] * VFilter[6]) + (interm_ptr[7] * VFilter[7]) +
+            (AV1_FILTER_WEIGHT >> 1);  // Rounding
+
+        // Normalize back to 0-255...
+        *dst_ptr++ = clip_pixel_highbd(temp >> AV1_FILTER_SHIFT, bd);
+        interm_ptr += intermediate_height;
+      }
+      interm_ptr += intermediate_next_stride;
+      dst_ptr += dst_next_row_stride;
+    }
+  }
+}
+
+void highbd_block2d_average_c(uint16_t *src, unsigned int src_stride,
+                              uint16_t *output_ptr, unsigned int output_stride,
+                              unsigned int output_width,
+                              unsigned int output_height) {
+  unsigned int i, j;
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
+    }
+    output_ptr += output_stride;
+  }
+}
+
+void highbd_filter_average_block2d_8_c(
+    const uint16_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
+    const int16_t *VFilter, uint16_t *dst_ptr, unsigned int dst_stride,
+    unsigned int output_width, unsigned int output_height, int bd) {
+  uint16_t tmp[kMaxDimension * kMaxDimension];
+
+  assert(output_width <= kMaxDimension);
+  assert(output_height <= kMaxDimension);
+  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp,
+                            kMaxDimension, output_width, output_height, bd);
+  highbd_block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
+                           output_width, output_height);
+}
+
+class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
+ public:
+  static void SetUpTestCase() {
+    // Force input_ to be unaligned, output to be 16 byte aligned.
+    input_ = reinterpret_cast<uint8_t *>(
+                 aom_memalign(kDataAlignment, kInputBufferSize + 1)) +
+             1;
+    ref8_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kOutputStride * kMaxDimension));
+    output_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kOutputBufferSize));
+    output_ref_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kOutputBufferSize));
+    input16_ = reinterpret_cast<uint16_t *>(aom_memalign(
+                   kDataAlignment, (kInputBufferSize + 1) * sizeof(uint16_t))) +
+               1;
+    ref16_ = reinterpret_cast<uint16_t *>(aom_memalign(
+        kDataAlignment, kOutputStride * kMaxDimension * sizeof(uint16_t)));
+    output16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+    output16_ref_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+  static void TearDownTestCase() {
+    aom_free(input_ - 1);
+    input_ = NULL;
+    aom_free(ref8_);
+    ref8_ = NULL;
+    aom_free(output_);
+    output_ = NULL;
+    aom_free(output_ref_);
+    output_ref_ = NULL;
+    aom_free(input16_ - 1);
+    input16_ = NULL;
+    aom_free(ref16_);
+    ref16_ = NULL;
+    aom_free(output16_);
+    output16_ = NULL;
+    aom_free(output16_ref_);
+    output16_ref_ = NULL;
+  }
+
+ protected:
+  static const int kDataAlignment = 16;
+  static const int kOuterBlockSize = 4 * kMaxDimension;
+  static const int kInputStride = kOuterBlockSize;
+  static const int kOutputStride = kOuterBlockSize;
+  static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
+  static const int kOutputBufferSize = kOuterBlockSize * kOuterBlockSize;
+
+  int Width() const { return GET_PARAM(0); }
+  int Height() const { return GET_PARAM(1); }
+  int BorderLeft() const {
+    const int center = (kOuterBlockSize - Width()) / 2;
+    return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);
+  }
+  int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }
+
+  bool IsIndexInBorder(int i) {
+    return (i < BorderTop() * kOuterBlockSize ||
+            i >= (BorderTop() + Height()) * kOuterBlockSize ||
+            i % kOuterBlockSize < BorderLeft() ||
+            i % kOuterBlockSize >= (BorderLeft() + Width()));
+  }
+
+  virtual void SetUp() {
+    UUT_ = GET_PARAM(2);
+    if (UUT_->use_highbd_ != 0)
+      mask_ = (1 << UUT_->use_highbd_) - 1;
+    else
+      mask_ = 255;
+    /* Set up guard blocks for an inner block centered in the outer block */
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i)) {
+        output_[i] = 255;
+        output16_[i] = mask_;
+      } else {
+        output_[i] = 0;
+        output16_[i] = 0;
+      }
+    }
+
+    ::libaom_test::ACMRandom prng;
+    for (int i = 0; i < kInputBufferSize; ++i) {
+      if (i & 1) {
+        input_[i] = 255;
+        input16_[i] = mask_;
+      } else {
+        input_[i] = prng.Rand8Extremes();
+        input16_[i] = prng.Rand16() & mask_;
+      }
+    }
+  }
+
+  void SetConstantInput(int value) {
+    memset(input_, value, kInputBufferSize);
+    aom_memset16(input16_, value, kInputBufferSize);
+  }
+
+  void CopyOutputToRef() {
+    memcpy(output_ref_, output_, kOutputBufferSize);
+    // Copy 16-bit pixel values. The effective number of bytes is double.
+    memcpy(output16_ref_, output16_, sizeof(output16_[0]) * kOutputBufferSize);
+  }
+
+  void CheckGuardBlocks() {
+    for (int i = 0; i < kOutputBufferSize; ++i) {
+      if (IsIndexInBorder(i)) {
+        EXPECT_EQ(255, output_[i]);
+      }
+    }
+  }
+
+  uint8_t *input() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+    if (UUT_->use_highbd_ == 0) {
+      return input_ + offset;
+    } else {
+      return CONVERT_TO_BYTEPTR(input16_) + offset;
+    }
+  }
+
+  uint8_t *output() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+    if (UUT_->use_highbd_ == 0) {
+      return output_ + offset;
+    } else {
+      return CONVERT_TO_BYTEPTR(output16_) + offset;
+    }
+  }
+
+  uint8_t *output_ref() const {
+    const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
+    if (UUT_->use_highbd_ == 0) {
+      return output_ref_ + offset;
+    } else {
+      return CONVERT_TO_BYTEPTR(output16_ref_) + offset;
+    }
+  }
+
+  uint16_t lookup(uint8_t *list, int index) const {
+    if (UUT_->use_highbd_ == 0) {
+      return list[index];
+    } else {
+      return CONVERT_TO_SHORTPTR(list)[index];
+    }
+  }
+
+  void assign_val(uint8_t *list, int index, uint16_t val) const {
+    if (UUT_->use_highbd_ == 0) {
+      list[index] = (uint8_t)val;
+    } else {
+      CONVERT_TO_SHORTPTR(list)[index] = val;
+    }
+  }
+
+  void wrapper_filter_average_block2d_8_c(
+      const uint8_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
+      const int16_t *VFilter, uint8_t *dst_ptr, unsigned int dst_stride,
+      unsigned int output_width, unsigned int output_height) {
+    if (UUT_->use_highbd_ == 0) {
+      filter_average_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr,
+                                 dst_stride, output_width, output_height);
+    } else {
+      highbd_filter_average_block2d_8_c(
+          CONVERT_TO_SHORTPTR(src_ptr), src_stride, HFilter, VFilter,
+          CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, output_width, output_height,
+          UUT_->use_highbd_);
+    }
+  }
+
+  void wrapper_filter_block2d_8_c(
+      const uint8_t *src_ptr, unsigned int src_stride, const int16_t *HFilter,
+      const int16_t *VFilter, uint8_t *dst_ptr, unsigned int dst_stride,
+      unsigned int output_width, unsigned int output_height) {
+    if (UUT_->use_highbd_ == 0) {
+      filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, dst_ptr,
+                         dst_stride, output_width, output_height);
+    } else {
+      highbd_filter_block2d_8_c(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+                                HFilter, VFilter, CONVERT_TO_SHORTPTR(dst_ptr),
+                                dst_stride, output_width, output_height,
+                                UUT_->use_highbd_);
+    }
+  }
+
+  const ConvolveFunctions *UUT_;
+  static uint8_t *input_;
+  static uint8_t *ref8_;
+  static uint8_t *output_;
+  static uint8_t *output_ref_;
+  static uint16_t *input16_;
+  static uint16_t *ref16_;
+  static uint16_t *output16_;
+  static uint16_t *output16_ref_;
+  int mask_;
+};
+
+uint8_t *ConvolveTest::input_ = NULL;
+uint8_t *ConvolveTest::ref8_ = NULL;
+uint8_t *ConvolveTest::output_ = NULL;
+uint8_t *ConvolveTest::output_ref_ = NULL;
+uint16_t *ConvolveTest::input16_ = NULL;
+uint16_t *ConvolveTest::ref16_ = NULL;
+uint16_t *ConvolveTest::output16_ = NULL;
+uint16_t *ConvolveTest::output16_ref_ = NULL;
+
+TEST_P(ConvolveTest, GuardBlocks) { CheckGuardBlocks(); }
+
+TEST_P(ConvolveTest, Copy) {
+  uint8_t *const in = input();
+  uint8_t *const out = output();
+
+  ASM_REGISTER_STATE_CHECK(UUT_->copy_(in, kInputStride, out, kOutputStride,
+                                       NULL, 0, NULL, 0, Width(), Height()));
+
+  CheckGuardBlocks();
+
+  for (int y = 0; y < Height(); ++y)
+    for (int x = 0; x < Width(); ++x)
+      ASSERT_EQ(lookup(out, y * kOutputStride + x),
+                lookup(in, y * kInputStride + x))
+          << "(" << x << "," << y << ")";
+}
+
+const int kNumFilterBanks = SWITCHABLE_FILTERS;
+const int kNumFilters = 16;
+
+TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
+  int subpel_search;
+  for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+       ++subpel_search) {
+    for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+      const InterpFilter filter = (InterpFilter)filter_bank;
+      const InterpKernel *filters =
+          (const InterpKernel *)av1_get_interp_filter_kernel(filter,
+                                                             subpel_search);
+      for (int i = 0; i < kNumFilters; i++) {
+        const int p0 = filters[i][0] + filters[i][1];
+        const int p1 = filters[i][2] + filters[i][3];
+        const int p2 = filters[i][4] + filters[i][5];
+        const int p3 = filters[i][6] + filters[i][7];
+        EXPECT_LE(p0, 128);
+        EXPECT_LE(p1, 128);
+        EXPECT_LE(p2, 128);
+        EXPECT_LE(p3, 128);
+        EXPECT_LE(p0 + p3, 128);
+        EXPECT_LE(p0 + p3 + p1, 128);
+        EXPECT_LE(p0 + p3 + p1 + p2, 128);
+        EXPECT_EQ(p0 + p1 + p2 + p3, 128);
+      }
+    }
+  }
+}
+
+const int16_t kInvalidFilter[8] = { 0 };
+
+TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {
+  uint8_t *const in = input();
+  uint8_t *const out = output();
+  uint8_t *ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8_;
+  } else {
+    ref = CONVERT_TO_BYTEPTR(ref16_);
+  }
+  int subpel_search;
+  for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+       ++subpel_search) {
+    for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+      const InterpFilter filter = (InterpFilter)filter_bank;
+      const InterpKernel *filters =
+          (const InterpKernel *)av1_get_interp_filter_kernel(filter,
+                                                             subpel_search);
+      for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+        for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+          wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x],
+                                     filters[filter_y], ref, kOutputStride,
+                                     Width(), Height());
+
+          if (filter_x && filter_y)
+            continue;
+          else if (filter_y)
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter,
+                          16, filters[filter_y], 16, Width(), Height()));
+          else if (filter_x)
+            ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+                in, kInputStride, out, kOutputStride, filters[filter_x], 16,
+                kInvalidFilter, 16, Width(), Height()));
+          else
+            ASM_REGISTER_STATE_CHECK(UUT_->copy_(
+                in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
+                kInvalidFilter, 0, Width(), Height()));
+
+          CheckGuardBlocks();
+
+          for (int y = 0; y < Height(); ++y)
+            for (int x = 0; x < Width(); ++x)
+              ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                        lookup(out, y * kOutputStride + x))
+                  << "mismatch at (" << x << "," << y << "), "
+                  << "filters (" << filter_bank << "," << filter_x << ","
+                  << filter_y << ")";
+        }
+      }
+    }
+  }
+}
+
+TEST_P(ConvolveTest, FilterExtremes) {
+  uint8_t *const in = input();
+  uint8_t *const out = output();
+  uint8_t *ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8_;
+  } else {
+    ref = CONVERT_TO_BYTEPTR(ref16_);
+  }
+
+  // Populate ref and out with some random data
+  ::libaom_test::ACMRandom prng;
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      uint16_t r;
+      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+        r = prng.Rand8Extremes();
+      } else {
+        r = prng.Rand16() & mask_;
+      }
+      assign_val(out, y * kOutputStride + x, r);
+      assign_val(ref, y * kOutputStride + x, r);
+    }
+  }
+
+  for (int axis = 0; axis < 2; axis++) {
+    int seed_val = 0;
+    while (seed_val < 256) {
+      for (int y = 0; y < 8; ++y) {
+        for (int x = 0; x < 8; ++x) {
+          assign_val(in, y * kOutputStride + x - SUBPEL_TAPS / 2 + 1,
+                     ((seed_val >> (axis ? y : x)) & 1) * mask_);
+          if (axis) seed_val++;
+        }
+        if (axis)
+          seed_val -= 8;
+        else
+          seed_val++;
+      }
+      if (axis) seed_val += 8;
+      int subpel_search;
+      for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
+           ++subpel_search) {
+        for (int filter_bank = 0; filter_bank < kNumFilterBanks;
+             ++filter_bank) {
+          const InterpFilter filter = (InterpFilter)filter_bank;
+          const InterpKernel *filters =
+              (const InterpKernel *)av1_get_interp_filter_kernel(filter,
+                                                                 subpel_search);
+          for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+            for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+              wrapper_filter_block2d_8_c(in, kInputStride, filters[filter_x],
+                                         filters[filter_y], ref, kOutputStride,
+                                         Width(), Height());
+              if (filter_x && filter_y)
+                continue;
+              else if (filter_y)
+                ASM_REGISTER_STATE_CHECK(UUT_->v8_(
+                    in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
+                    filters[filter_y], 16, Width(), Height()));
+              else if (filter_x)
+                ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+                    in, kInputStride, out, kOutputStride, filters[filter_x], 16,
+                    kInvalidFilter, 16, Width(), Height()));
+              else
+                ASM_REGISTER_STATE_CHECK(UUT_->copy_(
+                    in, kInputStride, out, kOutputStride, kInvalidFilter, 0,
+                    kInvalidFilter, 0, Width(), Height()));
+
+              for (int y = 0; y < Height(); ++y)
+                for (int x = 0; x < Width(); ++x)
+                  ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                            lookup(out, y * kOutputStride + x))
+                      << "mismatch at (" << x << "," << y << "), "
+                      << "filters (" << filter_bank << "," << filter_x << ","
+                      << filter_y << ")";
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
+  const uint8_t *const in = input();
+  uint8_t *const out = output();
+  const int kNumTests = 5000000;
+  const int width = Width();
+  const int height = Height();
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&timer);
+  for (int n = 0; n < kNumTests; ++n) {
+    UUT_->copy_(in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, width,
+                height);
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("convolve_copy_%dx%d_%d: %d us\n", width, height,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
+TEST_P(ConvolveTest, DISABLED_Speed) {
+  uint8_t *const in = input();
+  uint8_t *const out = output();
+  uint8_t *ref;
+  if (UUT_->use_highbd_ == 0) {
+    ref = ref8_;
+  } else {
+    ref = CONVERT_TO_BYTEPTR(ref16_);
+  }
+
+  // Populate ref and out with some random data
+  ::libaom_test::ACMRandom prng;
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      uint16_t r;
+      if (UUT_->use_highbd_ == 0 || UUT_->use_highbd_ == 8) {
+        r = prng.Rand8Extremes();
+      } else {
+        r = prng.Rand16() & mask_;
+      }
+      assign_val(out, y * kOutputStride + x, r);
+      assign_val(ref, y * kOutputStride + x, r);
+    }
+  }
+
+  const InterpFilter filter = (InterpFilter)1;
+  const InterpKernel *filters =
+      (const InterpKernel *)av1_get_interp_filter_kernel(filter, USE_8_TAPS);
+  wrapper_filter_average_block2d_8_c(in, kInputStride, filters[1], filters[1],
+                                     out, kOutputStride, Width(), Height());
+
+  aom_usec_timer timer;
+  int tests_num = 1000;
+
+  aom_usec_timer_start(&timer);
+  while (tests_num > 0) {
+    for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
+      const InterpFilter filter = (InterpFilter)filter_bank;
+      const InterpKernel *filters =
+          (const InterpKernel *)av1_get_interp_filter_kernel(filter,
+                                                             USE_8_TAPS);
+      for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
+        for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
+          if (filter_x && filter_y) continue;
+          if (filter_y)
+            ASM_REGISTER_STATE_CHECK(
+                UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter,
+                          16, filters[filter_y], 16, Width(), Height()));
+          else if (filter_x)
+            ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+                in, kInputStride, out, kOutputStride, filters[filter_x], 16,
+                kInvalidFilter, 16, Width(), Height()));
+        }
+      }
+    }
+    tests_num--;
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time =
+      static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+  printf("%dx%d (bitdepth %d) time: %5d ms\n", Width(), Height(),
+         UUT_->use_highbd_, elapsed_time);
+}
+
+using std::make_tuple;
+
+// WRAP macro is only used for high bitdepth build.
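+// As an illustration (hand-expanded here for exposition; the preprocessor
+// emits the equivalent), WRAP(convolve_copy_c, 10) defines a function
+// wrap_convolve_copy_c_10() with the low-bitdepth ConvolveFunc signature that
+// simply forwards to aom_highbd_convolve_copy_c() with bd fixed at 10, so the
+// high-bitdepth kernels can be driven by the same ConvolveTest harness.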
+#if CONFIG_AV1_HIGHBITDEPTH
+#define WRAP(func, bd)                                                       \
+  static void wrap_##func##_##bd(                                            \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
+      ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride,    \
+      const int16_t *filter_y, int filter_y_stride, int w, int h) {          \
+    aom_highbd_##func(src, src_stride, dst, dst_stride, filter_x,            \
+                      filter_x_stride, filter_y, filter_y_stride, w, h, bd); \
+  }
+#if HAVE_SSE2 && ARCH_X86_64
+WRAP(convolve_copy_sse2, 8)
+WRAP(convolve_copy_sse2, 10)
+WRAP(convolve_copy_sse2, 12)
+WRAP(convolve8_horiz_sse2, 8)
+WRAP(convolve8_vert_sse2, 8)
+WRAP(convolve8_horiz_sse2, 10)
+WRAP(convolve8_vert_sse2, 10)
+WRAP(convolve8_horiz_sse2, 12)
+WRAP(convolve8_vert_sse2, 12)
+#endif  // HAVE_SSE2 && ARCH_X86_64
+
+WRAP(convolve_copy_c, 8)
+WRAP(convolve8_horiz_c, 8)
+WRAP(convolve8_vert_c, 8)
+WRAP(convolve_copy_c, 10)
+WRAP(convolve8_horiz_c, 10)
+WRAP(convolve8_vert_c, 10)
+WRAP(convolve_copy_c, 12)
+WRAP(convolve8_horiz_c, 12)
+WRAP(convolve8_vert_c, 12)
+
+#if HAVE_AVX2
+WRAP(convolve_copy_avx2, 8)
+WRAP(convolve8_horiz_avx2, 8)
+WRAP(convolve8_vert_avx2, 8)
+
+WRAP(convolve_copy_avx2, 10)
+WRAP(convolve8_horiz_avx2, 10)
+WRAP(convolve8_vert_avx2, 10)
+
+WRAP(convolve_copy_avx2, 12)
+WRAP(convolve8_horiz_avx2, 12)
+WRAP(convolve8_vert_avx2, 12)
+#endif  // HAVE_AVX2
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#undef WRAP
+
+#if CONFIG_AV1_HIGHBITDEPTH
+const ConvolveFunctions wrap_convolve8_c(wrap_convolve_copy_c_8,
+                                         wrap_convolve8_horiz_c_8,
+                                         wrap_convolve8_vert_c_8, 8);
+const ConvolveFunctions wrap_convolve10_c(wrap_convolve_copy_c_10,
+                                          wrap_convolve8_horiz_c_10,
+                                          wrap_convolve8_vert_c_10, 10);
+const ConvolveFunctions wrap_convolve12_c(wrap_convolve_copy_c_12,
+                                          wrap_convolve8_horiz_c_12,
+                                          wrap_convolve8_vert_c_12, 12);
+const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(wrap_convolve8_c),
+                                           ALL_SIZES(wrap_convolve10_c),
+                                           ALL_SIZES(wrap_convolve12_c) };
+#else
+const ConvolveFunctions convolve8_c(aom_convolve_copy_c, aom_convolve8_horiz_c,
+                                    aom_convolve8_vert_c, 0);
+const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) };
+#endif
+
+INSTANTIATE_TEST_SUITE_P(C, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_c));
+
+#if HAVE_SSE2 && ARCH_X86_64
+#if CONFIG_AV1_HIGHBITDEPTH
+const ConvolveFunctions wrap_convolve8_sse2(wrap_convolve_copy_sse2_8,
+                                            wrap_convolve8_horiz_sse2_8,
+                                            wrap_convolve8_vert_sse2_8, 8);
+const ConvolveFunctions wrap_convolve10_sse2(wrap_convolve_copy_sse2_10,
+                                             wrap_convolve8_horiz_sse2_10,
+                                             wrap_convolve8_vert_sse2_10, 10);
+const ConvolveFunctions wrap_convolve12_sse2(wrap_convolve_copy_sse2_12,
+                                             wrap_convolve8_horiz_sse2_12,
+                                             wrap_convolve8_vert_sse2_12, 12);
+const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(wrap_convolve8_sse2),
+                                              ALL_SIZES(wrap_convolve10_sse2),
+                                              ALL_SIZES(wrap_convolve12_sse2) };
+#else
+const ConvolveFunctions convolve8_sse2(aom_convolve_copy_c,
+                                       aom_convolve8_horiz_sse2,
+                                       aom_convolve8_vert_sse2, 0);
+const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) };
+#endif
+INSTANTIATE_TEST_SUITE_P(SSE2, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_sse2));
+#endif
+
+#if HAVE_SSSE3
+const ConvolveFunctions convolve8_ssse3(aom_convolve_copy_c,
+                                        aom_convolve8_horiz_ssse3,
+                                        aom_convolve8_vert_ssse3, 0);
+
+const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
+INSTANTIATE_TEST_SUITE_P(SSSE3, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve8_ssse3));
+#endif
+
+#if HAVE_AVX2
+#if CONFIG_AV1_HIGHBITDEPTH
+const ConvolveFunctions wrap_convolve8_avx2(wrap_convolve_copy_avx2_8,
+                                            wrap_convolve8_horiz_avx2_8,
+                                            wrap_convolve8_vert_avx2_8, 8);
+const ConvolveFunctions wrap_convolve10_avx2(wrap_convolve_copy_avx2_10,
+                                             wrap_convolve8_horiz_avx2_10,
+                                             wrap_convolve8_vert_avx2_10, 10);
+const ConvolveFunctions wrap_convolve12_avx2(wrap_convolve_copy_avx2_12,
+                                             wrap_convolve8_horiz_avx2_12,
+                                             wrap_convolve8_vert_avx2_12, 12);
+const ConvolveParam kArray_Convolve8_avx2[] = {
+  ALL_SIZES_64(wrap_convolve8_avx2), ALL_SIZES_64(wrap_convolve10_avx2),
+  ALL_SIZES_64(wrap_convolve12_avx2)
+};
+#else
+const ConvolveFunctions convolve8_avx2(aom_convolve_copy_c,
+                                       aom_convolve8_horiz_avx2,
+                                       aom_convolve8_vert_avx2, 0);
+const ConvolveParam kArray_Convolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
+#endif
+
+INSTANTIATE_TEST_SUITE_P(AVX2, ConvolveTest,
+                         ::testing::ValuesIn(kArray_Convolve8_avx2));
+#endif  // HAVE_AVX2
+
+}  // namespace
diff --git a/libs/libaom/src/test/corner_match_test.cc b/libs/libaom/src/test/corner_match_test.cc
new file mode 100644
index 000000000..c685dca80
--- /dev/null
+++ b/libs/libaom/src/test/corner_match_test.cc
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+
+#include "av1/encoder/corner_match.h"
+
+namespace test_libaom {
+
+namespace AV1CornerMatch {
+
+using libaom_test::ACMRandom;
+
+typedef double (*ComputeCrossCorrFunc)(unsigned char *im1, int stride1, int x1,
+                                       int y1, unsigned char *im2, int stride2,
+                                       int x2, int y2);
+
+using std::make_tuple;
+using std::tuple;
+typedef tuple<int, ComputeCrossCorrFunc> CornerMatchParam;
+
+class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
+ public:
+  virtual ~AV1CornerMatchTest();
+  virtual void SetUp();
+
+  virtual void TearDown();
+
+ protected:
+  void RunCheckOutput(int run_times);
+  ComputeCrossCorrFunc target_func;
+
+  libaom_test::ACMRandom rnd_;
+};
+
+AV1CornerMatchTest::~AV1CornerMatchTest() {}
+void AV1CornerMatchTest::SetUp() {
+  rnd_.Reset(ACMRandom::DeterministicSeed());
+  target_func = GET_PARAM(1);
+}
+void AV1CornerMatchTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1CornerMatchTest::RunCheckOutput(int run_times) {
+  const int w = 128, h = 128;
+  const int num_iters = 10000;
+  int i, j;
+  aom_usec_timer ref_timer, test_timer;
+
+  uint8_t *input1 = new uint8_t[w * h];
+  uint8_t *input2 = new uint8_t[w * h];
+
+  // Test the two extreme cases:
+  // i) Random data, should have correlation close to 0
+  // ii) Linearly related data + noise, should have correlation close to 1
+  int mode = GET_PARAM(0);
+  if (mode == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        input1[i * w + j] = rnd_.Rand8();
+        input2[i * w + j] = rnd_.Rand8();
+      }
+  } else if (mode == 1) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        int v = rnd_.Rand8();
+        input1[i * w + j] = v;
+        input2[i * w + j] = (v / 2) + (rnd_.Rand8() & 15);
+      }
+  }
+
+  for (i = 0; i < num_iters; ++i) {
+    int x1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2);
+    int y1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2);
+    int x2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2);
+    int y2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2);
+
+    double res_c =
+        av1_compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
+    double res_simd = target_func(input1, w, x1, y1, input2, w, x2, y2);
+
+    if (run_times > 1) {
+      aom_usec_timer_start(&ref_timer);
+      for (j = 0; j < run_times; j++) {
+        av1_compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
+      }
+      aom_usec_timer_mark(&ref_timer);
+      const int elapsed_time_c =
+          static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+      aom_usec_timer_start(&test_timer);
+      for (j = 0; j < run_times; j++) {
+        target_func(input1, w, x1, y1, input2, w, x2, y2);
+      }
+      aom_usec_timer_mark(&test_timer);
+      const int elapsed_time_simd =
+          static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+      printf(
+          "c_time=%d \t simd_time=%d \t "
+          "gain=%d\n",
+          elapsed_time_c, elapsed_time_simd,
+          (elapsed_time_c / elapsed_time_simd));
+    } else {
+      ASSERT_EQ(res_simd, res_c);
+    }
+  }
+  delete[] input1;
+  delete[] input2;
+}
+
+TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(1); }
+TEST_P(AV1CornerMatchTest, DISABLED_Speed) { RunCheckOutput(100000); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, AV1CornerMatchTest,
+    ::testing::Values(make_tuple(0, &av1_compute_cross_correlation_sse4_1),
+                      make_tuple(1, &av1_compute_cross_correlation_sse4_1)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1CornerMatchTest,
+    ::testing::Values(make_tuple(0, &av1_compute_cross_correlation_avx2),
+                      make_tuple(1, &av1_compute_cross_correlation_avx2)));
+#endif
+}  // namespace AV1CornerMatch
+
+}  // namespace test_libaom
diff --git a/libs/libaom/src/test/cpu_speed_test.cc b/libs/libaom/src/test/cpu_speed_test.cc
new file mode 100644
index 000000000..2a164974b
--- /dev/null
+++ b/libs/libaom/src/test/cpu_speed_test.cc
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+const int kMaxPSNR = 100;
+
+class CpuSpeedTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  CpuSpeedTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR),
+        tune_content_(AOM_CONTENT_DEFAULT) {}
+  virtual ~CpuSpeedTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libaom_test::kRealTime) {
+      cfg_.g_lag_in_frames = 25;
+      cfg_.rc_end_usage = AOM_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = AOM_CBR;
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) { min_psnr_ = kMaxPSNR; }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_TUNE_CONTENT, tune_content_);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      }
+    }
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0];
+  }
+
+  void TestQ0();
+  void TestScreencastQ0();
+  void TestTuneScreen();
+  void TestEncodeHighBitrate();
+  void TestLowBitrate();
+
+  ::libaom_test::TestMode encoding_mode_;
+  int set_cpu_used_;
+  double min_psnr_;
+  int tune_content_;
+};
+
+void CpuSpeedTest::TestQ0() {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q. This pushes
+  // the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 400;
+  cfg_.rc_max_quantizer = 0;
+  cfg_.rc_min_quantizer = 0;
+
+  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       10);
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(min_psnr_, kMaxPSNR);
+}
+
+void CpuSpeedTest::TestScreencastQ0() {
+  ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 3);
+  cfg_.g_timebase = video.timebase();
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 400;
+  cfg_.rc_max_quantizer = 0;
+  cfg_.rc_min_quantizer = 0;
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  EXPECT_GE(min_psnr_, kMaxPSNR);
+}
+
+void CpuSpeedTest::TestTuneScreen() {
+  ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 3);
+  cfg_.g_timebase = video.timebase();
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.rc_max_quantizer = 63;
+  cfg_.rc_min_quantizer = 0;
+  tune_content_ = AOM_CONTENT_SCREEN;
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+void CpuSpeedTest::TestEncodeHighBitrate() {
+  // Validate that this non multiple of 64 wide clip encodes and decodes
+  // without a mismatch when passing in a very low max q. This pushes
+  // the encoder to producing lots of big partitions which will likely
+  // extend into the border and test the border condition.
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 12000;
+  cfg_.rc_max_quantizer = 10;
+  cfg_.rc_min_quantizer = 0;
+
+  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       10);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+void CpuSpeedTest::TestLowBitrate() {
+  // Validate that this clip encodes and decodes without a mismatch
+  // when passing in a very high min q. This pushes the encoder to producing
+  // lots of small partitions which will test the other condition.
+  cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.rc_min_quantizer = 40;
+
+  ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+                                       10);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(CpuSpeedTest, TestQ0) { TestQ0(); }
+TEST_P(CpuSpeedTest, TestScreencastQ0) { TestScreencastQ0(); }
+TEST_P(CpuSpeedTest, TestTuneScreen) { TestTuneScreen(); }
+TEST_P(CpuSpeedTest, TestEncodeHighBitrate) { TestEncodeHighBitrate(); }
+TEST_P(CpuSpeedTest, TestLowBitrate) { TestLowBitrate(); }
+
+class CpuSpeedTestLarge : public CpuSpeedTest {};
+
+TEST_P(CpuSpeedTestLarge, TestQ0) { TestQ0(); }
+TEST_P(CpuSpeedTestLarge, TestScreencastQ0) { TestScreencastQ0(); }
+TEST_P(CpuSpeedTestLarge, TestTuneScreen) { TestTuneScreen(); }
+TEST_P(CpuSpeedTestLarge, TestEncodeHighBitrate) { TestEncodeHighBitrate(); }
+TEST_P(CpuSpeedTestLarge, TestLowBitrate) { TestLowBitrate(); }
+
+AV1_INSTANTIATE_TEST_CASE(CpuSpeedTest,
+                          ::testing::Values(::libaom_test::kTwoPassGood,
+                                            ::libaom_test::kOnePassGood),
+                          ::testing::Range(1, 3));
+AV1_INSTANTIATE_TEST_CASE(CpuSpeedTestLarge,
+                          ::testing::Values(::libaom_test::kTwoPassGood,
+                                            ::libaom_test::kOnePassGood),
+                          ::testing::Range(0, 1));
+}  // namespace
diff --git a/libs/libaom/src/test/datarate_test.cc b/libs/libaom/src/test/datarate_test.cc
new file mode 100644
index 000000000..053c05571
--- /dev/null
+++ b/libs/libaom/src/test/datarate_test.cc
@@ -0,0 +1,373 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/datarate_test.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom/aom_codec.h"
+
+namespace datarate_test {
+namespace {
+
+// Params: test mode, speed, aq mode and index for bitrate array.
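+// For reference: GET_PARAM(1) supplies the test mode passed to SetMode(),
+// GET_PARAM(2) the cpu-used speed, GET_PARAM(3) the aq mode, and GET_PARAM(4)
+// the index into each test's local bitrate_array (GET_PARAM(0) is the codec).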
+// Params: test mode, speed, aq mode and index for bitrate array.
+class DatarateTestLarge
+    : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+                                                 unsigned int, int>,
+      public DatarateTest {
+ public:
+  DatarateTestLarge() : DatarateTest(GET_PARAM(0)) {
+    set_cpu_used_ = GET_PARAM(2);
+    aq_mode_ = GET_PARAM(3);
+  }
+
+ protected:
+  virtual ~DatarateTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    ResetModel();
+  }
+
+  virtual void BasicRateTargetingVBRTest() {
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.g_error_resilient = 0;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_lag_in_frames = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+    const int bitrate_array[2] = { 400, 800 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.7)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.3)
+        << " The datarate for the file is greater than target by too much!";
+  }
+
+  virtual void BasicRateTargetingCBRTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 140);
+    const int bitrate_array[2] = { 150, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+        << " The datarate for the file is greater than target by too much!";
+  }
+
+  virtual void BasicRateTargetingCBRPeriodicKeyFrameTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    // Periodic keyframe
+    cfg_.kf_max_dist = 50;
+
+    ::libaom_test::I420VideoSource video("pixel_capture_w320h240.yuv", 320,
+                                         240, 30, 1, 0, 310);
+    const int bitrate_array[2] = { 150, 550 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+        << " The datarate for the file is greater than target by too much!";
+  }
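The CBR tests above size the decoder buffer in milliseconds of the target rate (rc_buf_initial_sz / rc_buf_optimal_sz / rc_buf_sz = 500 / 500 / 1000), and datarate_test.h tracks a matching leaky-bucket model that must never underrun. A sketch of one generic bucket update, under our own naming (the harness inlines its version in FramePktHook):

    #include <cstddef>
    #include <cstdint>

    // Illustrative leaky-bucket step: the channel feeds in target-rate bits
    // for the frame's duration, and the frame drains its own coded size out.
    int64_t LeakyBucketStep(int64_t bits_in_buffer, double frame_secs,
                            int target_kbps, size_t frame_bytes) {
      bits_in_buffer += static_cast<int64_t>(frame_secs * target_kbps * 1000);
      bits_in_buffer -= static_cast<int64_t>(frame_bytes) * 8;
      return bits_in_buffer;  // CBR rate control keeps this non-negative
    }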
+
+  virtual void BasicRateTargetingAQModeOnOffCBRTest() {
+    if (GET_PARAM(4) > 0) return;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_pass = AOM_RC_ONE_PASS;
+    cfg_.g_usage = AOM_USAGE_REALTIME;
+    cfg_.kf_mode = AOM_KF_DISABLED;
+
+    ::libaom_test::I420VideoSource video("pixel_capture_w320h240.yuv", 320,
+                                         240, 30, 1, 0, 310);
+    const int bitrate_array[1] = { 60 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.15)
+        << " The datarate for the file is greater than target by too much!";
+  }
+
+  virtual void BasicRateTargeting444CBRTest() {
+    ::libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+
+    cfg_.g_profile = 1;
+    cfg_.g_timebase = video.timebase();
+
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 1;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+
+    const int bitrate_array[2] = { 250, 650 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 0.85)
+        << " The datarate for the file exceeds the target by too much!";
+    ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+              effective_datarate_ * 1.15)
+        << " The datarate for the file missed the target!"
+        << cfg_.rc_target_bitrate << " " << effective_datarate_;
+  }
+};
+
+// Params: test mode, speed, aq mode.
+class DatarateTestFrameDropLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 unsigned int>,
+      public DatarateTest {
+ public:
+  DatarateTestFrameDropLarge() : DatarateTest(GET_PARAM(0)) {
+    set_cpu_used_ = GET_PARAM(2);
+    aq_mode_ = GET_PARAM(3);
+  }
+
+ protected:
+  virtual ~DatarateTestFrameDropLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    ResetModel();
+  }
+
+  virtual void ChangingDropFrameThreshTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_undershoot_pct = 20;
+    cfg_.rc_overshoot_pct = 20;
+    cfg_.rc_dropframe_thresh = 10;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 50;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.rc_target_bitrate = 200;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    // TODO(marpan): Investigate datarate target failures with a smaller
+    // keyframe interval (128).
+    cfg_.kf_max_dist = 9999;
+
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 100);
+
+    const int kDropFrameThreshTestStep = 30;
+    aom_codec_pts_t last_drop = 140;
+    int last_num_drops = 0;
+    for (int i = 40; i < 100; i += kDropFrameThreshTestStep) {
+      cfg_.rc_dropframe_thresh = i;
+      ResetModel();
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.85)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.17)
+          << " The datarate for the file is greater than target by too much!";
+      if (last_drop > 0) {
+        ASSERT_LE(first_drop_, last_drop)
+            << " The first dropped frame for drop_thresh " << i
+            << " > first dropped frame for drop_thresh "
+            << i - kDropFrameThreshTestStep;
+      }
+      ASSERT_GE(num_drops_, last_num_drops * 0.7)
+          << " The number of dropped frames for drop_thresh " << i
+          << " < number of dropped frames for drop_thresh "
+          << i - kDropFrameThreshTestStep;
+      last_drop = first_drop_;
+      last_num_drops = num_drops_;
+    }
+  }
+};
+
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestLarge, BasicRateTargetingVBR) {
+  BasicRateTargetingVBRTest();
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestLarge, BasicRateTargetingCBR) {
+  BasicRateTargetingCBRTest();
+}
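ChangingDropFrameThreshTest above re-encodes the same clip at increasing rc_dropframe_thresh values and verifies two monotonicity properties across runs. A condensed sketch of the invariant, with our own variable names:

    #include <cassert>
    #include <cstdint>

    // Hypothetical check mirroring the loop above: as the drop threshold
    // rises, the first drop must come no later than before, and the drop
    // count must not shrink by more than the 30% slack the test allows.
    void CheckDropStats(int64_t prev_first_drop, int64_t cur_first_drop,
                        int prev_num_drops, int cur_num_drops) {
      assert(cur_first_drop <= prev_first_drop);
      assert(cur_num_drops >= prev_num_drops * 0.7);
    }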
+
+// Check basic rate targeting for periodic key frame.
+TEST_P(DatarateTestLarge, PeriodicKeyFrameCBR) {
+  BasicRateTargetingCBRPeriodicKeyFrameTest();
+}
+
+// Check basic rate targeting for CBR with 4:4:4 input.
+TEST_P(DatarateTestLarge, BasicRateTargeting444CBR) {
+  BasicRateTargeting444CBRTest();
+}
+
+// Check that (1) the first dropped frame gets earlier and earlier
+// as the drop frame threshold is increased, and (2) that the total number of
+// frame drops does not decrease as we increase frame drop threshold.
+// Use a lower qp-max to force some frame drops.
+TEST_P(DatarateTestFrameDropLarge, ChangingDropFrameThresh) {
+  ChangingDropFrameThreshTest();
+}
+
+TEST_P(DatarateTestLarge, BasicRateTargetingAQModeOnOffCBR) {
+  BasicRateTargetingAQModeOnOffCBRTest();
+}
+
+class DatarateTestRealtime : public DatarateTestLarge {};
+
+class DatarateTestFrameDropRealtime : public DatarateTestFrameDropLarge {};
+
+// Params: test mode, aq mode.
+class DatarateTestSpeedChangeRealtime
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+                                                 unsigned int>,
+      public DatarateTest {
+ public:
+  DatarateTestSpeedChangeRealtime() : DatarateTest(GET_PARAM(0)) {
+    aq_mode_ = GET_PARAM(2);
+    speed_change_test_ = true;
+  }
+
+ protected:
+  virtual ~DatarateTestSpeedChangeRealtime() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    ResetModel();
+  }
+
+  virtual void ChangingSpeedTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_undershoot_pct = 20;
+    cfg_.rc_overshoot_pct = 20;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 50;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.rc_target_bitrate = 200;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    // TODO(marpan): Investigate datarate target failures with a smaller
+    // keyframe interval (128).
+    cfg_.kf_max_dist = 9999;
+    cfg_.rc_dropframe_thresh = 0;
+    ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+                                         288, 30, 1, 0, 100);
+
+    ResetModel();
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.83)
+        << " The datarate for the file is lower than target by too much!";
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.20)
+        << " The datarate for the file is greater than target by too much!";
+  }
+};
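When speed_change_test_ is set, the PreEncodeFrameHook in datarate_test.h re-issues AOME_SET_CPUUSED at frames 0/30/60/90 (speed 8, 7, 6, then back to 7). Outside the harness, the same mid-stream switch is just a codec control call between frames; a minimal sketch under that assumption (encoder setup elided, helper name ours):

    #include "aom/aom_encoder.h"
    #include "aom/aomcx.h"

    // Illustrative only: change encoder speed between frames via the public
    // control API, matching the schedule the speed-change test uses.
    void MaybeSwitchSpeed(aom_codec_ctx_t *ctx, unsigned int frame_index) {
      if (frame_index == 0) aom_codec_control(ctx, AOME_SET_CPUUSED, 8);
      if (frame_index == 30) aom_codec_control(ctx, AOME_SET_CPUUSED, 7);
      if (frame_index == 60) aom_codec_control(ctx, AOME_SET_CPUUSED, 6);
      if (frame_index == 90) aom_codec_control(ctx, AOME_SET_CPUUSED, 7);
    }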
+
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestRealtime, BasicRateTargetingVBR) {
+  BasicRateTargetingVBRTest();
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestRealtime, BasicRateTargetingCBR) {
+  BasicRateTargetingCBRTest();
+}
+
+// Check basic rate targeting for periodic key frame.
+TEST_P(DatarateTestRealtime, PeriodicKeyFrameCBR) {
+  BasicRateTargetingCBRPeriodicKeyFrameTest();
+}
+
+// Check basic rate targeting for CBR with 4:4:4 input.
+TEST_P(DatarateTestRealtime, BasicRateTargeting444CBR) {
+  BasicRateTargeting444CBRTest();
+}
+
+// Check that (1) the first dropped frame gets earlier and earlier
+// as the drop frame threshold is increased, and (2) that the total number of
+// frame drops does not decrease as we increase frame drop threshold.
+// Use a lower qp-max to force some frame drops.
+TEST_P(DatarateTestFrameDropRealtime, ChangingDropFrameThresh) {
+  ChangingDropFrameThreshTest();
+}
+
+TEST_P(DatarateTestSpeedChangeRealtime, ChangingSpeedTest) {
+  ChangingSpeedTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestLarge,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(5, 7), ::testing::Values(0, 3),
+                          ::testing::Values(0, 1));
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestFrameDropLarge,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(5, 7), ::testing::Values(0, 3));
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestRealtime,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(7, 9), ::testing::Values(0, 3),
+                          ::testing::Values(0, 1));
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestFrameDropRealtime,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Range(7, 9), ::testing::Values(0, 3));
+
+AV1_INSTANTIATE_TEST_CASE(DatarateTestSpeedChangeRealtime,
+                          ::testing::Values(::libaom_test::kRealTime),
+                          ::testing::Values(0, 3));
+
+} // namespace
+} // namespace datarate_test
diff --git a/libs/libaom/src/test/datarate_test.h b/libs/libaom/src/test/datarate_test.h
new file mode 100644
index 000000000..3c1573119
--- /dev/null
+++ b/libs/libaom/src/test/datarate_test.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom/aom_codec.h"
+
+namespace datarate_test {
+namespace {
+class DatarateTest : public ::libaom_test::EncoderTest {
+ public:
+  explicit DatarateTest(const ::libaom_test::CodecFactory *codec)
+      : EncoderTest(codec), set_cpu_used_(0), aq_mode_(0),
+        speed_change_test_(false) {}
+
+ protected:
+  virtual ~DatarateTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    ResetModel();
+  }
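ResetModel() below seeds bits_in_buffer_model_ as rc_target_bitrate * rc_buf_initial_sz, which needs no scaling because the units cancel: kilobits per second times milliseconds is exactly bits. A worked example under those units (names ours):

    // 150 kbit/s target with a 500 ms initial buffer:
    //   (kbit/s) * ms = (1000 bits / 1000 ms) * ms = bits
    static const int64_t kExampleInitialBufferBits =
        150 /* kbit/s */ * 500 /* ms */;  // == 75000 bits, no rescaling needed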
+
+  virtual void ResetModel() {
+    last_pts_ = 0;
+    bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
+    frame_number_ = 0;
+    tot_frame_number_ = 0;
+    first_drop_ = 0;
+    num_drops_ = 0;
+    // Denoiser is off by default.
+    denoiser_on_ = 0;
+    bits_total_ = 0;
+    denoiser_offon_test_ = 0;
+    denoiser_offon_period_ = -1;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 0);
+      if (cfg_.g_usage == AOM_USAGE_REALTIME) {
+        encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+        encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
+        encoder->Control(AV1E_SET_ENABLE_CDEF, 1);
+        encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
+        encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
+        encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2);
+      }
+    }
+
+    if (speed_change_test_) {
+      if (video->frame() == 0) {
+        encoder->Control(AOME_SET_CPUUSED, 8);
+      }
+      if (video->frame() == 30) {
+        encoder->Control(AOME_SET_CPUUSED, 7);
+      }
+      if (video->frame() == 60) {
+        encoder->Control(AOME_SET_CPUUSED, 6);
+      }
+      if (video->frame() == 90) {
+        encoder->Control(AOME_SET_CPUUSED, 7);
+      }
+    }
+
+    if (denoiser_offon_test_) {
+      ASSERT_GT(denoiser_offon_period_, 0)
+          << "denoiser_offon_period_ is not positive.";
+      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
+        // Flip denoiser_on_ periodically.
+        denoiser_on_ ^= 1;
+      }
+    }
+
+    encoder->Control(AV1E_SET_NOISE_SENSITIVITY, denoiser_on_);
+
+    const aom_rational_t tb = video->timebase();
+    timebase_ = static_cast<double>(tb.num) / tb.den;
+    duration_ = 0;
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    // Time since last timestamp = duration.
+    aom_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
+
+    if (duration > 1) {
+      // If first drop is not set and we have a drop, set it to this time.
+      if (!first_drop_) first_drop_ = last_pts_ + 1;
+      // Update the number of frame drops.
+      num_drops_ += static_cast<int>(duration - 1);
+      // Update counter for total number of frames (#frames input to encoder).
+      // Needed for setting the proper layer_id below.
+      tot_frame_number_ += static_cast<int>(duration - 1);
+    }
+
+    // Add to the buffer the bits we'd expect from a constant bitrate server.
+    bits_in_buffer_model_ += static_cast<int64_t>(
+        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
+
+    // Buffer should not go negative.
+    ASSERT_GE(bits_in_buffer_model_, 0)
+        << "Buffer Underrun at frame " << pkt->data.frame.pts;
+
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+
+    // Update the total encoded bits.
+    bits_total_ += frame_size_in_bits;
+
+    // Update the most recent pts.
+    last_pts_ = pkt->data.frame.pts;
+    ++frame_number_;
+    ++tot_frame_number_;
+  }
+
+  virtual void EndPassHook(void) {
+    duration_ = (last_pts_ + 1) * timebase_;
+    // Effective file datarate:
+    effective_datarate_ = (bits_total_ / 1000.0) / duration_;
+  }
+
+  aom_codec_pts_t last_pts_;
+  double timebase_;
+  int frame_number_;      // Counter for number of non-dropped/encoded frames.
+  int tot_frame_number_;  // Counter for total number of input frames.
+ int64_t bits_total_; + double duration_; + double effective_datarate_; + int set_cpu_used_; + int64_t bits_in_buffer_model_; + aom_codec_pts_t first_drop_; + int num_drops_; + int denoiser_on_; + int denoiser_offon_test_; + int denoiser_offon_period_; + unsigned int aq_mode_; + bool speed_change_test_; +}; + +} // namespace +} // namespace datarate_test diff --git a/libs/libaom/src/test/decode_api_test.cc b/libs/libaom/src/test/decode_api_test.cc new file mode 100644 index 000000000..910640df7 --- /dev/null +++ b/libs/libaom/src/test/decode_api_test.cc @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#include "test/util.h" +#include "aom/aomdx.h" +#include "aom/aom_decoder.h" + +namespace { + +TEST(DecodeAPI, InvalidParams) { + static const aom_codec_iface_t *kCodecs[] = { +#if CONFIG_AV1_DECODER + aom_codec_av1_dx(), +#endif + }; + uint8_t buf[1] = { 0 }; + aom_codec_ctx_t dec; + + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_dec_init(NULL, NULL, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_dec_init(&dec, NULL, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(NULL, NULL, 0, NULL)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(NULL, buf, 0, NULL)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_decode(NULL, buf, sizeof(buf), NULL)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_decode(NULL, NULL, sizeof(buf), NULL)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(NULL)); + EXPECT_TRUE(aom_codec_error(NULL) != NULL); + + for (const aom_codec_iface_t *iface : kCodecs) { + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_dec_init(NULL, iface, NULL, 0)); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_dec_init(&dec, iface, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_decode(&dec, NULL, sizeof(buf), NULL)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_decode(&dec, buf, 0, NULL)); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&dec)); + } +} + +} // namespace diff --git a/libs/libaom/src/test/decode_multithreaded_test.cc b/libs/libaom/src/test/decode_multithreaded_test.cc new file mode 100644 index 000000000..92253ede8 --- /dev/null +++ b/libs/libaom/src/test/decode_multithreaded_test.cc @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "aom_mem/aom_mem.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/md5_helper.h" +#include "test/util.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +static const int kNumMultiThreadDecoders = 3; + +class AV1DecodeMultiThreadedTest + : public ::libaom_test::CodecTestWith5Params, + public ::libaom_test::EncoderTest { + protected: + AV1DecodeMultiThreadedTest() + : EncoderTest(GET_PARAM(0)), md5_single_thread_(), md5_multi_thread_(), + n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)), + n_tile_groups_(GET_PARAM(3)), set_cpu_used_(GET_PARAM(4)), + row_mt_(GET_PARAM(5)) { + init_flags_ = AOM_CODEC_USE_PSNR; + aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t(); + cfg.w = 704; + cfg.h = 576; + cfg.threads = 1; + cfg.allow_lowbitdepth = 1; + single_thread_dec_ = codec_->CreateDecoder(cfg, 0); + + // Test cfg.threads == powers of 2. + for (int i = 0; i < kNumMultiThreadDecoders; ++i) { + cfg.threads <<= 1; + multi_thread_dec_[i] = codec_->CreateDecoder(cfg, 0); + multi_thread_dec_[i]->Control(AV1D_SET_ROW_MT, row_mt_); + } + + if (single_thread_dec_->IsAV1()) { + single_thread_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); + single_thread_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1); + single_thread_dec_->Control(AV1_SET_DECODE_TILE_COL, -1); + } + for (int i = 0; i < kNumMultiThreadDecoders; ++i) { + if (multi_thread_dec_[i]->IsAV1()) { + multi_thread_dec_[i]->Control(AV1D_EXT_TILE_DEBUG, 1); + multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_ROW, -1); + multi_thread_dec_[i]->Control(AV1_SET_DECODE_TILE_COL, -1); + } + } + } + + virtual ~AV1DecodeMultiThreadedTest() { + delete single_thread_dec_; + for (int i = 0; i < kNumMultiThreadDecoders; ++i) + delete multi_thread_dec_[i]; + } + + virtual void SetUp() { + InitializeConfig(); + SetMode(libaom_test::kTwoPassGood); + } + + virtual void PreEncodeFrameHook(libaom_test::VideoSource *video, + libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_); + encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_); + encoder->Control(AV1E_SET_NUM_TG, n_tile_groups_); + encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); + } + } + + void UpdateMD5(::libaom_test::Decoder *dec, const aom_codec_cx_pkt_t *pkt, + ::libaom_test::MD5 *md5) { + const aom_codec_err_t res = dec->DecodeFrame( + reinterpret_cast(pkt->data.frame.buf), pkt->data.frame.sz); + if (res != AOM_CODEC_OK) { + abort_ = true; + ASSERT_EQ(AOM_CODEC_OK, res); + } + const aom_image_t *img = dec->GetDxData().Next(); + md5->Add(img); + } + + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + UpdateMD5(single_thread_dec_, pkt, &md5_single_thread_); + + for (int i = 0; i < kNumMultiThreadDecoders; ++i) + UpdateMD5(multi_thread_dec_[i], pkt, &md5_multi_thread_[i]); + } + + void DoTest() { + const aom_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = 500; + cfg_.g_lag_in_frames = 12; + cfg_.rc_end_usage = AOM_VBR; + + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576, + timebase.den, timebase.num, 0, 5); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + const char *md5_single_thread_str = md5_single_thread_.Get(); + + for (int i = 0; i < kNumMultiThreadDecoders; ++i) { + const char *md5_multi_thread_str = md5_multi_thread_[i].Get(); + ASSERT_STREQ(md5_single_thread_str, md5_multi_thread_str); + } + } + + 
::libaom_test::MD5 md5_single_thread_; + ::libaom_test::MD5 md5_multi_thread_[kNumMultiThreadDecoders]; + ::libaom_test::Decoder *single_thread_dec_; + ::libaom_test::Decoder *multi_thread_dec_[kNumMultiThreadDecoders]; + + private: + int n_tile_cols_; + int n_tile_rows_; + int n_tile_groups_; + int set_cpu_used_; + int row_mt_; +}; + +// run an encode and do the decode both in single thread +// and multi thread. Ensure that the MD5 of the output in both cases +// is identical. If so, the test passes. +TEST_P(AV1DecodeMultiThreadedTest, MD5Match) { + cfg_.large_scale_tile = 0; + single_thread_dec_->Control(AV1_SET_TILE_MODE, 0); + for (int i = 0; i < kNumMultiThreadDecoders; ++i) + multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 0); + DoTest(); +} + +class AV1DecodeMultiThreadedTestLarge : public AV1DecodeMultiThreadedTest {}; + +TEST_P(AV1DecodeMultiThreadedTestLarge, MD5Match) { + cfg_.large_scale_tile = 0; + single_thread_dec_->Control(AV1_SET_TILE_MODE, 0); + for (int i = 0; i < kNumMultiThreadDecoders; ++i) + multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 0); + DoTest(); +} + +// TODO(ranjit): More tests have to be added using pre-generated MD5. +AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTest, ::testing::Values(1, 2), + ::testing::Values(1, 2), ::testing::Values(1), + ::testing::Values(3), ::testing::Values(0, 1)); +AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedTestLarge, + ::testing::Values(0, 1, 2, 6), + ::testing::Values(0, 1, 2, 6), + ::testing::Values(1, 4), ::testing::Values(0), + ::testing::Values(0, 1)); + +class AV1DecodeMultiThreadedLSTestLarge + : public AV1DecodeMultiThreadedTestLarge {}; + +TEST_P(AV1DecodeMultiThreadedLSTestLarge, MD5Match) { + cfg_.large_scale_tile = 1; + single_thread_dec_->Control(AV1_SET_TILE_MODE, 1); + for (int i = 0; i < kNumMultiThreadDecoders; ++i) + multi_thread_dec_[i]->Control(AV1_SET_TILE_MODE, 1); + DoTest(); +} + +AV1_INSTANTIATE_TEST_CASE(AV1DecodeMultiThreadedLSTestLarge, + ::testing::Values(6), ::testing::Values(6), + ::testing::Values(1), ::testing::Values(0, 3), + ::testing::Values(0, 1)); + +} // namespace diff --git a/libs/libaom/src/test/decode_perf_test.cc b/libs/libaom/src/test/decode_perf_test.cc new file mode 100644 index 000000000..691337cd6 --- /dev/null +++ b/libs/libaom/src/test/decode_perf_test.cc @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_version.h" + +#include "aom_ports/aom_timer.h" +#include "common/ivfenc.h" +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/ivf_video_source.h" +#include "test/md5_helper.h" +#include "test/util.h" +#include "test/webm_video_source.h" + +using std::make_tuple; + +namespace { + +#define VIDEO_NAME 0 +#define THREADS 1 + +const double kUsecsInSec = 1000000.0; +const char kNewEncodeOutputFile[] = "new_encode.ivf"; + +/* + DecodePerfTest takes a tuple of filename + number of threads to decode with + */ +typedef std::tuple DecodePerfParam; + +// TODO(jimbankoski): Add actual test vectors here when available. +// const DecodePerfParam kAV1DecodePerfVectors[] = {}; + +/* + In order to reflect real world performance as much as possible, Perf tests + *DO NOT* do any correctness checks. Please run them alongside correctness + tests to ensure proper codec integrity. Furthermore, in this test we + deliberately limit the amount of system calls we make to avoid OS + preemption. + + TODO(joshualitt) create a more detailed perf measurement test to collect + power/temp/min max frame decode times/etc + */ + +class DecodePerfTest : public ::testing::TestWithParam {}; + +TEST_P(DecodePerfTest, PerfTest) { + const char *const video_name = GET_PARAM(VIDEO_NAME); + const unsigned threads = GET_PARAM(THREADS); + + libaom_test::WebMVideoSource video(video_name); + video.Init(); + + aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t(); + cfg.threads = threads; + cfg.allow_lowbitdepth = 1; + libaom_test::AV1Decoder decoder(cfg, 0); + + aom_usec_timer t; + aom_usec_timer_start(&t); + + for (video.Begin(); video.cxdata() != NULL; video.Next()) { + decoder.DecodeFrame(video.cxdata(), video.frame_size()); + } + + aom_usec_timer_mark(&t); + const double elapsed_secs = double(aom_usec_timer_elapsed(&t)) / kUsecsInSec; + const unsigned frames = video.frame_number(); + const double fps = double(frames) / elapsed_secs; + + printf("{\n"); + printf("\t\"type\" : \"decode_perf_test\",\n"); + printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"videoName\" : \"%s\",\n", video_name); + printf("\t\"threadCount\" : %u,\n", threads); + printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs); + printf("\t\"totalFrames\" : %u,\n", frames); + printf("\t\"framesPerSecond\" : %f\n", fps); + printf("}\n"); +} + +// TODO(jimbankoski): Enabled when we have actual AV1 Decode vectors. 
+// INSTANTIATE_TEST_SUITE_P(AV1, DecodePerfTest, +// ::testing::ValuesIn(kAV1DecodePerfVectors)); + +class AV1NewEncodeDecodePerfTest + : public ::libaom_test::CodecTestWithParam, + public ::libaom_test::EncoderTest { + protected: + AV1NewEncodeDecodePerfTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0), + outfile_(0), out_frames_(0) {} + + virtual ~AV1NewEncodeDecodePerfTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.g_lag_in_frames = 25; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_end_usage = AOM_VBR; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, speed_); + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AV1E_SET_TILE_COLUMNS, 2); + } + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + const char *const env = getenv("LIBAOM_TEST_DATA_PATH"); + const std::string data_path(env ? env : "."); + const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile; + outfile_ = fopen(path_to_source.c_str(), "wb"); + ASSERT_TRUE(outfile_ != NULL); + } + + virtual void EndPassHook() { + if (outfile_ != NULL) { + if (!fseek(outfile_, 0, SEEK_SET)) + ivf_write_file_header(outfile_, &cfg_, AV1_FOURCC, out_frames_); + fclose(outfile_); + outfile_ = NULL; + } + } + + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + ++out_frames_; + + // Write initial file header if first frame. + if (pkt->data.frame.pts == 0) + ivf_write_file_header(outfile_, &cfg_, AV1_FOURCC, out_frames_); + + // Write frame header and data. + ivf_write_frame_header(outfile_, out_frames_, pkt->data.frame.sz); + ASSERT_EQ(fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_), + pkt->data.frame.sz); + } + + virtual bool DoDecode() const { return false; } + + void set_speed(unsigned int speed) { speed_ = speed; } + + private: + libaom_test::TestMode encoding_mode_; + uint32_t speed_; + FILE *outfile_; + uint32_t out_frames_; +}; + +struct EncodePerfTestVideo { + EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_, + uint32_t bitrate_, int frames_) + : name(name_), width(width_), height(height_), bitrate(bitrate_), + frames(frames_) {} + const char *name; + uint32_t width; + uint32_t height; + uint32_t bitrate; + int frames; +}; + +const EncodePerfTestVideo kAV1EncodePerfTestVectors[] = { + EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), +}; + +TEST_P(AV1NewEncodeDecodePerfTest, PerfTest) { + SetUp(); + + // TODO(JBB): Make this work by going through the set of given files. 
+ const int i = 0; + const aom_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = kAV1EncodePerfTestVectors[i].bitrate; + + init_flags_ = AOM_CODEC_USE_PSNR; + + const char *video_name = kAV1EncodePerfTestVectors[i].name; + libaom_test::I420VideoSource video( + video_name, kAV1EncodePerfTestVectors[i].width, + kAV1EncodePerfTestVectors[i].height, timebase.den, timebase.num, 0, + kAV1EncodePerfTestVectors[i].frames); + set_speed(2); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + const uint32_t threads = 4; + + libaom_test::IVFVideoSource decode_video(kNewEncodeOutputFile); + decode_video.Init(); + + aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t(); + cfg.threads = threads; + cfg.allow_lowbitdepth = 1; + libaom_test::AV1Decoder decoder(cfg, 0); + + aom_usec_timer t; + aom_usec_timer_start(&t); + + for (decode_video.Begin(); decode_video.cxdata() != NULL; + decode_video.Next()) { + decoder.DecodeFrame(decode_video.cxdata(), decode_video.frame_size()); + } + + aom_usec_timer_mark(&t); + const double elapsed_secs = + static_cast(aom_usec_timer_elapsed(&t)) / kUsecsInSec; + const unsigned decode_frames = decode_video.frame_number(); + const double fps = static_cast(decode_frames) / elapsed_secs; + + printf("{\n"); + printf("\t\"type\" : \"decode_perf_test\",\n"); + printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"videoName\" : \"%s\",\n", kNewEncodeOutputFile); + printf("\t\"threadCount\" : %u,\n", threads); + printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs); + printf("\t\"totalFrames\" : %u,\n", decode_frames); + printf("\t\"framesPerSecond\" : %f\n", fps); + printf("}\n"); +} + +AV1_INSTANTIATE_TEST_CASE(AV1NewEncodeDecodePerfTest, + ::testing::Values(::libaom_test::kTwoPassGood)); +} // namespace diff --git a/libs/libaom/src/test/decode_test_driver.cc b/libs/libaom/src/test/decode_test_driver.cc new file mode 100644 index 000000000..70de0cff6 --- /dev/null +++ b/libs/libaom/src/test/decode_test_driver.cc @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/register_state_check.h" +#include "test/video_source.h" + +namespace libaom_test { + +const char kAV1Name[] = "AOMedia Project AV1 Decoder"; + +aom_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size, + aom_codec_stream_info_t *stream_info) { + return aom_codec_peek_stream_info(CodecInterface(), cxdata, size, + stream_info); +} + +aom_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size) { + return DecodeFrame(cxdata, size, NULL); +} + +aom_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, size_t size, + void *user_priv) { + aom_codec_err_t res_dec; + InitOnce(); + API_REGISTER_STATE_CHECK( + res_dec = aom_codec_decode(&decoder_, cxdata, size, user_priv)); + return res_dec; +} + +bool Decoder::IsAV1() const { + const char *codec_name = GetDecoderName(); + return strncmp(kAV1Name, codec_name, sizeof(kAV1Name) - 1) == 0; +} + +void DecoderTest::HandlePeekResult(Decoder *const /*decoder*/, + CompressedVideoSource * /*video*/, + const aom_codec_err_t res_peek) { + /* The Av1 implementation of PeekStream returns an error only if the + * data passed to it isn't a valid Av1 chunk. */ + ASSERT_EQ(AOM_CODEC_OK, res_peek) + << "Peek return failed: " << aom_codec_err_to_string(res_peek); +} + +void DecoderTest::RunLoop(CompressedVideoSource *video, + const aom_codec_dec_cfg_t &dec_cfg) { + Decoder *const decoder = codec_->CreateDecoder(dec_cfg, flags_); + ASSERT_TRUE(decoder != NULL); + bool end_of_file = false; + bool peeked_stream = false; + + // Decode frames. + for (video->Begin(); !::testing::Test::HasFailure() && !end_of_file; + video->Next()) { + PreDecodeFrameHook(*video, decoder); + + aom_codec_stream_info_t stream_info; + stream_info.is_annexb = 0; + + if (video->cxdata() != NULL) { + if (!peeked_stream) { + // TODO(yaowu): PeekStream returns error for non-sequence_header_obu, + // therefore should only be tried once per sequence, this shall be fixed + // once PeekStream is updated to properly operate on other obus. + const aom_codec_err_t res_peek = decoder->PeekStream( + video->cxdata(), video->frame_size(), &stream_info); + HandlePeekResult(decoder, video, res_peek); + ASSERT_FALSE(::testing::Test::HasFailure()); + peeked_stream = true; + } + + aom_codec_err_t res_dec = + decoder->DecodeFrame(video->cxdata(), video->frame_size()); + if (!HandleDecodeResult(res_dec, *video, decoder)) break; + } else { + // Signal end of the file to the decoder. 
+ const aom_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0); + ASSERT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError(); + end_of_file = true; + } + + DxDataIterator dec_iter = decoder->GetDxData(); + const aom_image_t *img = NULL; + + // Get decompressed data + while (!::testing::Test::HasFailure() && (img = dec_iter.Next())) + DecompressedFrameHook(*img, video->frame_number()); + } + delete decoder; +} + +void DecoderTest::RunLoop(CompressedVideoSource *video) { + aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t(); + RunLoop(video, dec_cfg); +} + +void DecoderTest::set_cfg(const aom_codec_dec_cfg_t &dec_cfg) { + memcpy(&cfg_, &dec_cfg, sizeof(cfg_)); +} + +void DecoderTest::set_flags(const aom_codec_flags_t flags) { flags_ = flags; } + +} // namespace libaom_test diff --git a/libs/libaom/src/test/decode_test_driver.h b/libs/libaom/src/test/decode_test_driver.h new file mode 100644 index 000000000..64722f43a --- /dev/null +++ b/libs/libaom/src/test/decode_test_driver.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TEST_DECODE_TEST_DRIVER_H_ +#define AOM_TEST_DECODE_TEST_DRIVER_H_ +#include +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#include "aom/aom_decoder.h" + +namespace libaom_test { + +class CodecFactory; +class CompressedVideoSource; + +// Provides an object to handle decoding output +class DxDataIterator { + public: + explicit DxDataIterator(aom_codec_ctx_t *decoder) + : decoder_(decoder), iter_(NULL) {} + + const aom_image_t *Next() { return aom_codec_get_frame(decoder_, &iter_); } + + private: + aom_codec_ctx_t *decoder_; + aom_codec_iter_t iter_; +}; + +// Provides a simplified interface to manage one video decoding. +// Similar to Encoder class, the exact services should be added +// as more tests are added. 
+class Decoder { + public: + explicit Decoder(aom_codec_dec_cfg_t cfg) + : cfg_(cfg), flags_(0), init_done_(false) { + memset(&decoder_, 0, sizeof(decoder_)); + } + + Decoder(aom_codec_dec_cfg_t cfg, const aom_codec_flags_t flag) + : cfg_(cfg), flags_(flag), init_done_(false) { + memset(&decoder_, 0, sizeof(decoder_)); + } + + virtual ~Decoder() { aom_codec_destroy(&decoder_); } + + aom_codec_err_t PeekStream(const uint8_t *cxdata, size_t size, + aom_codec_stream_info_t *stream_info); + + aom_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size); + + aom_codec_err_t DecodeFrame(const uint8_t *cxdata, size_t size, + void *user_priv); + + DxDataIterator GetDxData() { return DxDataIterator(&decoder_); } + + void Control(int ctrl_id, int arg) { Control(ctrl_id, arg, AOM_CODEC_OK); } + + void Control(int ctrl_id, const void *arg) { + InitOnce(); + const aom_codec_err_t res = aom_codec_control(&decoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << DecodeError(); + } + + void Control(int ctrl_id, int arg, aom_codec_err_t expected_value) { + InitOnce(); + const aom_codec_err_t res = aom_codec_control(&decoder_, ctrl_id, arg); + ASSERT_EQ(expected_value, res) << DecodeError(); + } + + const char *DecodeError() { + const char *detail = aom_codec_error_detail(&decoder_); + return detail ? detail : aom_codec_error(&decoder_); + } + + // Passes the external frame buffer information to libaom. + aom_codec_err_t SetFrameBufferFunctions( + aom_get_frame_buffer_cb_fn_t cb_get, + aom_release_frame_buffer_cb_fn_t cb_release, void *user_priv) { + InitOnce(); + return aom_codec_set_frame_buffer_functions(&decoder_, cb_get, cb_release, + user_priv); + } + + const char *GetDecoderName() const { + return aom_codec_iface_name(CodecInterface()); + } + + bool IsAV1() const; + + aom_codec_ctx_t *GetDecoder() { return &decoder_; } + + protected: + virtual aom_codec_iface_t *CodecInterface() const = 0; + + void InitOnce() { + if (!init_done_) { + const aom_codec_err_t res = + aom_codec_dec_init(&decoder_, CodecInterface(), &cfg_, flags_); + ASSERT_EQ(AOM_CODEC_OK, res) << DecodeError(); + init_done_ = true; + } + } + + aom_codec_ctx_t decoder_; + aom_codec_dec_cfg_t cfg_; + aom_codec_flags_t flags_; + bool init_done_; +}; + +// Common test functionality for all Decoder tests. +class DecoderTest { + public: + // Main decoding loop + virtual void RunLoop(CompressedVideoSource *video); + virtual void RunLoop(CompressedVideoSource *video, + const aom_codec_dec_cfg_t &dec_cfg); + + virtual void set_cfg(const aom_codec_dec_cfg_t &dec_cfg); + virtual void set_flags(const aom_codec_flags_t flags); + + // Hook to be called before decompressing every frame. + virtual void PreDecodeFrameHook(const CompressedVideoSource & /*video*/, + Decoder * /*decoder*/) {} + + // Hook to be called to handle decode result. Return true to continue. + virtual bool HandleDecodeResult(const aom_codec_err_t res_dec, + const CompressedVideoSource & /*video*/, + Decoder *decoder) { + EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError(); + return AOM_CODEC_OK == res_dec; + } + + // Hook to be called on every decompressed frame. 
+ virtual void DecompressedFrameHook(const aom_image_t & /*img*/, + const unsigned int /*frame_number*/) {} + + // Hook to be called on peek result + virtual void HandlePeekResult(Decoder *const decoder, + CompressedVideoSource *video, + const aom_codec_err_t res_peek); + + protected: + explicit DecoderTest(const CodecFactory *codec) + : codec_(codec), cfg_(), flags_(0) {} + + virtual ~DecoderTest() {} + + const CodecFactory *codec_; + aom_codec_dec_cfg_t cfg_; + aom_codec_flags_t flags_; +}; + +} // namespace libaom_test + +#endif // AOM_TEST_DECODE_TEST_DRIVER_H_ diff --git a/libs/libaom/src/test/decode_to_md5.sh b/libs/libaom/src/test/decode_to_md5.sh new file mode 100644 index 000000000..2edd1cb52 --- /dev/null +++ b/libs/libaom/src/test/decode_to_md5.sh @@ -0,0 +1,77 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom decode_to_md5 example. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to decode_to_md5_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available: +# $AV1_IVF_FILE is required. +decode_to_md5_verify_environment() { + if [ "$(av1_encode_available)" != "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then + return 1 + fi +} + +# Runs decode_to_md5 on $1 and captures the md5 sum for the final frame. $2 is +# interpreted as codec name and used solely to name the output file. $3 is the +# expected md5 sum: It must match that of the final frame. +decode_to_md5() { + local decoder="$(aom_tool_path decode_to_md5)" + local input_file="$1" + local codec="$2" + local expected_md5="$3" + local output_file="${AOM_TEST_OUTPUT_DIR}/decode_to_md5_${codec}" + + if [ ! -x "${decoder}" ]; then + elog "${decoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ + ${devnull} + + [ -e "${output_file}" ] || return 1 + + local md5_last_frame="$(tail -n1 "${output_file}" | awk '{print $1}')" + local actual_md5="$(echo "${md5_last_frame}" | awk '{print $1}')" + if [ "${actual_md5}" = "${expected_md5}" ]; then + return 0 + else + elog "MD5 mismatch:" + elog "Expected: ${expected_md5}" + elog "Actual: ${actual_md5}" + return 1 + fi +} + +DISABLED_decode_to_md5_av1() { + # expected MD5 sum for the last frame. + local expected_md5="567dd6d4b7a7170edddbf58bbcc3aff1" + local file="${AV1_IVF_FILE}" + + # TODO(urvang): Check in the encoded file (like libvpx does) to avoid + # encoding every time. + if [ "$(av1_decode_available)" = "yes" ]; then + if [ ! -e "${AV1_IVF_FILE}" ]; then + file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf" + encode_yuv_raw_input_av1 "${file}" --ivf + fi + decode_to_md5 "${file}" "av1" "${expected_md5}" + fi +} + +# TODO(tomfinegan): Enable when the bitstream stabilizes. 
+decode_to_md5_tests="DISABLED_decode_to_md5_av1" + +run_tests decode_to_md5_verify_environment "${decode_to_md5_tests}" diff --git a/libs/libaom/src/test/decode_with_drops.sh b/libs/libaom/src/test/decode_with_drops.sh new file mode 100644 index 000000000..155ee9207 --- /dev/null +++ b/libs/libaom/src/test/decode_with_drops.sh @@ -0,0 +1,68 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom decode_with_drops example. To add new tests to +## this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to decode_with_drops_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available: +# $AV1_IVF_FILE is required. +decode_with_drops_verify_environment() { + if [ "$(av1_encode_available)" != "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then + return 1 + fi +} + +# Runs decode_with_drops on $1, $2 is interpreted as codec name and used solely +# to name the output file. $3 is the drop mode, and is passed directly to +# decode_with_drops. +decode_with_drops() { + local decoder="$(aom_tool_path decode_with_drops)" + local input_file="$1" + local codec="$2" + local output_file="${AOM_TEST_OUTPUT_DIR}/decode_with_drops_${codec}" + local drop_mode="$3" + + if [ ! -x "${decoder}" ]; then + elog "${decoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ + "${drop_mode}" ${devnull} + + [ -e "${output_file}" ] || return 1 +} + + +# Decodes $AV1_IVF_FILE while dropping frames, twice: once in sequence mode, +# and once in pattern mode. +DISABLED_decode_with_drops_av1() { + if [ "$(av1_decode_available)" = "yes" ]; then + local file="${AV1_IVF_FILE}" + if [ ! -e "${AV1_IVF_FILE}" ]; then + file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf" + encode_yuv_raw_input_av1 "${file}" --ivf + fi + # Drop frames 3 and 4. + decode_with_drops "${file}" "av1" "3-4" + + # Test pattern mode: Drop 3 of every 4 frames. + decode_with_drops "${file}" "av1" "3/4" + fi +} + +# TODO(yaowu): Disable this test as trailing_bit check is expected to fail +decode_with_drops_tests="DISABLED_decode_with_drops_av1" + +run_tests decode_with_drops_verify_environment "${decode_with_drops_tests}" diff --git a/libs/libaom/src/test/divu_small_test.cc b/libs/libaom/src/test/divu_small_test.cc new file mode 100644 index 000000000..f4d0846cf --- /dev/null +++ b/libs/libaom/src/test/divu_small_test.cc @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/acm_random.h" +#include "av1/common/odintrin.h" + +using libaom_test::ACMRandom; + +TEST(DivuSmallTest, TestDIVUuptoMAX) { + for (int d = 1; d <= OD_DIVU_DMAX; d++) { + for (uint32_t x = 1; x <= 1000000; x++) { + GTEST_ASSERT_EQ(x / d, OD_DIVU_SMALL(x, d)) + << "x=" << x << " d=" << d << " x/d=" << (x / d) + << " != " << OD_DIVU_SMALL(x, d); + } + } +} + +TEST(DivuSmallTest, TestDIVUrandI31) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int d = 1; d < OD_DIVU_DMAX; d++) { + for (int i = 0; i < 1000000; i++) { + uint32_t x = rnd.Rand31(); + GTEST_ASSERT_EQ(x / d, OD_DIVU_SMALL(x, d)) + << "x=" << x << " d=" << d << " x/d=" << (x / d) + << " != " << OD_DIVU_SMALL(x, d); + } + } +} diff --git a/libs/libaom/src/test/dr_prediction_test.cc b/libs/libaom/src/test/dr_prediction_test.cc new file mode 100644 index 000000000..e8865c02a --- /dev/null +++ b/libs/libaom/src/test/dr_prediction_test.cc @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" +#include "av1/common/blockd.h" +#include "av1/common/pred_common.h" +#include "av1/common/reconintra.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +namespace { + +const int kZ1Start = 0; +const int kZ2Start = 90; +const int kZ3Start = 180; + +const TX_SIZE kTxSize[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_64X64, + TX_4X8, TX_8X4, TX_8X16, TX_16X8, TX_16X32, + TX_32X16, TX_32X64, TX_64X32, TX_4X16, TX_16X4, + TX_8X32, TX_32X8, TX_16X64, TX_64X16 }; + +const char *const kTxSizeStrings[] = { + "TX_4X4", "TX_8X8", "TX_16X16", "TX_32X32", "TX_64X64", + "TX_4X8", "TX_8X4", "TX_8X16", "TX_16X8", "TX_16X32", + "TX_32X16", "TX_32X64", "TX_64X32", "TX_4X16", "TX_16X4", + "TX_8X32", "TX_32X8", "TX_16X64", "TX_64X16" +}; + +using libaom_test::ACMRandom; + +typedef void (*DrPred_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_above, int upsample_left, int dx, + int dy, int bd); + +typedef void (*DrPred)(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, int dy, + int bd); + +typedef void (*Z1_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy); +template +void z1_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy, int bd) { + (void)bd; + (void)upsample_left; + fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy); +} + +typedef void (*Z2_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const 
uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, int dy); +template +void z2_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy, int bd) { + (void)bd; + (void)upsample_left; + fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy); +} + +typedef void (*Z3_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy); +template +void z3_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy, int bd) { + (void)bd; + (void)upsample_above; + fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy); +} + +typedef void (*Z1_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_above, int dx, int dy, int bd); +template +void z1_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_above, int upsample_left, int dx, int dy, + int bd) { + (void)bd; + (void)upsample_left; + fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy, bd); +} + +typedef void (*Z2_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_above, int upsample_left, int dx, int dy, + int bd); +template +void z2_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_above, int upsample_left, int dx, int dy, + int bd) { + (void)bd; + fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy, + bd); +} + +typedef void (*Z3_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_left, int dx, int dy, int bd); +template +void z3_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh, + const uint16_t *above, const uint16_t *left, + int upsample_above, int upsample_left, int dx, int dy, + int bd) { + (void)bd; + (void)upsample_above; + fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy, bd); +} + +template +struct DrPredFunc { + DrPredFunc(FuncType pred = NULL, FuncType tst = NULL, int bit_depth_value = 0, + int start_angle_value = 0) + : ref_fn(pred), tst_fn(tst), bit_depth(bit_depth_value), + start_angle(start_angle_value) {} + + FuncType ref_fn; + FuncType tst_fn; + int bit_depth; + int start_angle; +}; + +template +class DrPredTest : public ::testing::TestWithParam > { + protected: + static const int kMaxNumTests = 10000; + static const int kIterations = 10; + static const int kDstStride = 64; + static const int kDstSize = kDstStride * kDstStride; + static const int kOffset = 16; + static const int kBufSize = ((2 * MAX_TX_SIZE) << 1) + 16; + + DrPredTest() + : enable_upsample_(0), upsample_above_(0), upsample_left_(0), bw_(0), + bh_(0), dx_(1), dy_(1), bd_(8), txsize_(TX_4X4) { + params_ = this->GetParam(); + start_angle_ = params_.start_angle; + stop_angle_ = start_angle_ + 90; + + dst_ref_ = &dst_ref_data_[0]; + dst_tst_ = &dst_tst_data_[0]; + dst_stride_ = kDstStride; + above_ = &above_data_[kOffset]; + left_ = &left_data_[kOffset]; + + for (int i = 0; i < kBufSize; ++i) { + above_data_[i] = rng_.Rand8(); + left_data_[i] = rng_.Rand8(); + } + + for (int i = 0; i < kDstSize; ++i) { + dst_ref_[i] = 0; + dst_tst_[i] = 0; + } + } + + virtual 
~DrPredTest() {} + + void Predict(bool speedtest, int tx) { + const int kNumTests = speedtest ? kMaxNumTests : 1; + aom_usec_timer timer; + int tst_time = 0; + + bd_ = params_.bit_depth; + + aom_usec_timer_start(&timer); + for (int k = 0; k < kNumTests; ++k) { + params_.ref_fn(dst_ref_, dst_stride_, bw_, bh_, above_, left_, + upsample_above_, upsample_left_, dx_, dy_, bd_); + } + aom_usec_timer_mark(&timer); + const int ref_time = static_cast(aom_usec_timer_elapsed(&timer)); + + if (params_.tst_fn) { + aom_usec_timer_start(&timer); + for (int k = 0; k < kNumTests; ++k) { + ASM_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_, + above_, left_, upsample_above_, + upsample_left_, dx_, dy_, bd_)); + } + aom_usec_timer_mark(&timer); + tst_time = static_cast(aom_usec_timer_elapsed(&timer)); + } else { + for (int i = 0; i < kDstSize; ++i) { + dst_ref_[i] = dst_tst_[i]; + } + } + + OutputTimes(kNumTests, ref_time, tst_time, tx); + } + + void RunTest(bool speedtest, bool needsaturation, int p_angle) { + bd_ = params_.bit_depth; + + if (needsaturation) { + for (int i = 0; i < kBufSize; ++i) { + above_data_[i] = left_data_[i] = (1 << bd_) - 1; + } + } + for (int tx = 0; tx < TX_SIZES_ALL; ++tx) { + if (params_.tst_fn == NULL) { + for (int i = 0; i < kDstSize; ++i) { + dst_tst_[i] = (1 << bd_) - 1; + dst_ref_[i] = (1 << bd_) - 1; + } + } else { + for (int i = 0; i < kDstSize; ++i) { + dst_ref_[i] = 0; + dst_tst_[i] = 0; + } + } + + bw_ = tx_size_wide[kTxSize[tx]]; + bh_ = tx_size_high[kTxSize[tx]]; + + if (enable_upsample_) { + upsample_above_ = + av1_use_intra_edge_upsample(bw_, bh_, p_angle - 90, 0); + upsample_left_ = + av1_use_intra_edge_upsample(bw_, bh_, p_angle - 180, 0); + } else { + upsample_above_ = upsample_left_ = 0; + } + + Predict(speedtest, tx); + + for (int r = 0; r < bh_; ++r) { + for (int c = 0; c < bw_; ++c) { + ASSERT_EQ(dst_ref_[r * dst_stride_ + c], + dst_tst_[r * dst_stride_ + c]) + << bw_ << "x" << bh_ << " r: " << r << " c: " << c + << " dx: " << dx_ << " dy: " << dy_ + << " upsample_above: " << upsample_above_ + << " upsample_left: " << upsample_left_; + } + } + } + } + + void OutputTimes(int num_tests, int ref_time, int tst_time, int tx) { + if (num_tests > 1) { + if (params_.tst_fn) { + const float x = static_cast(ref_time) / tst_time; + printf("\t[%8s] :: ref time %6d, tst time %6d %3.2f\n", + kTxSizeStrings[tx], ref_time, tst_time, x); + } else { + printf("\t[%8s] :: ref time %6d\n", kTxSizeStrings[tx], ref_time); + } + } + } + + Pixel dst_ref_data_[kDstSize]; + Pixel dst_tst_data_[kDstSize]; + + Pixel left_data_[kBufSize]; + Pixel dummy_data_[kBufSize]; + Pixel above_data_[kBufSize]; + + Pixel *dst_ref_; + Pixel *dst_tst_; + Pixel *above_; + Pixel *left_; + int dst_stride_; + + int enable_upsample_; + int upsample_above_; + int upsample_left_; + int bw_; + int bh_; + int dx_; + int dy_; + int bd_; + TX_SIZE txsize_; + + int start_angle_; + int stop_angle_; + + ACMRandom rng_; + + DrPredFunc params_; +}; + +class LowbdDrPredTest : public DrPredTest {}; + +TEST_P(LowbdDrPredTest, SaturatedValues) { + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int angle = start_angle_; angle < stop_angle_; ++angle) { + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + if (dx_ && dy_) RunTest(false, true, angle); + } + } +} + +using std::make_tuple; + +INSTANTIATE_TEST_SUITE_P( + C, LowbdDrPredTest, + ::testing::Values(DrPredFunc(&z1_wrapper, + NULL, AOM_BITS_8, kZ1Start), + DrPredFunc(&z2_wrapper, + NULL, AOM_BITS_8, kZ2Start), + 
DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>, + NULL, AOM_BITS_8, kZ3Start))); + +#if CONFIG_AV1_HIGHBITDEPTH +class HighbdDrPredTest : public DrPredTest<uint16_t, DrPred_Hbd> {}; + +TEST_P(HighbdDrPredTest, SaturatedValues) { + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int angle = start_angle_; angle < stop_angle_; ++angle) { + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + if (dx_ && dy_) RunTest(false, true, angle); + } + } +} + +INSTANTIATE_TEST_SUITE_P( + C, HighbdDrPredTest, + ::testing::Values( + DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + NULL, AOM_BITS_8, kZ1Start), + DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + NULL, AOM_BITS_10, kZ1Start), + DrPredFunc<DrPred_Hbd>(&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + NULL, AOM_BITS_12, kZ1Start), + DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + NULL, AOM_BITS_8, kZ2Start), + DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + NULL, AOM_BITS_10, kZ2Start), + DrPredFunc<DrPred_Hbd>(&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + NULL, AOM_BITS_12, kZ2Start), + DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + NULL, AOM_BITS_8, kZ3Start), + DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + NULL, AOM_BITS_10, kZ3Start), + DrPredFunc<DrPred_Hbd>(&z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + NULL, AOM_BITS_12, kZ3Start))); +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, LowbdDrPredTest, + ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>, + &z1_wrapper<av1_dr_prediction_z1_avx2>, + AOM_BITS_8, kZ1Start), + DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>, + &z2_wrapper<av1_dr_prediction_z2_avx2>, + AOM_BITS_8, kZ2Start), + DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>, + &z3_wrapper<av1_dr_prediction_z3_avx2>, + AOM_BITS_8, kZ3Start))); + +TEST_P(LowbdDrPredTest, DISABLED_Speed) { + const int angles[] = { 3, 45, 87 }; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int i = 0; i < 3; ++i) { + const int angle = angles[i] + start_angle_; + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n", + enable_upsample_, angle); + if (dx_ && dy_) RunTest(true, false, angle); + } + } +} + +TEST_P(LowbdDrPredTest, OperationCheck) { + if (params_.tst_fn == NULL) return; + // const int angles[] = { 3, 45, 81, 87, 93, 100, 145, 187, 199, 260 }; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int angle = start_angle_; angle < stop_angle_; ++angle) { + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + if (dx_ && dy_) RunTest(false, false, angle); + } + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + AVX2, HighbdDrPredTest, + ::testing::Values(DrPredFunc<DrPred_Hbd>( + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>, + AOM_BITS_8, kZ1Start), + DrPredFunc<DrPred_Hbd>( + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>, + AOM_BITS_10, kZ1Start), + DrPredFunc<DrPred_Hbd>( + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, + &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>, + AOM_BITS_12, kZ1Start), + DrPredFunc<DrPred_Hbd>( + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>, + AOM_BITS_8, kZ2Start), + DrPredFunc<DrPred_Hbd>( + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>, + AOM_BITS_10, kZ2Start), + DrPredFunc<DrPred_Hbd>( + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, + &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>, + AOM_BITS_12, kZ2Start), + DrPredFunc<DrPred_Hbd>( + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_avx2>, + AOM_BITS_8, kZ3Start), + DrPredFunc<DrPred_Hbd>( + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_avx2>, + AOM_BITS_10, kZ3Start), + DrPredFunc<DrPred_Hbd>( + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_c>, + &z3_wrapper_hbd<av1_highbd_dr_prediction_z3_avx2>, + AOM_BITS_12, kZ3Start))); + +TEST_P(HighbdDrPredTest, DISABLED_Speed) { + const int angles[] = { 3, 45, 87 }; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int i = 0; i < 3; ++i) { + int angle = angles[i] + start_angle_; + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n", + enable_upsample_, angle); + if (dx_ && dy_) RunTest(true, false, angle); + } + } +} + +TEST_P(HighbdDrPredTest, OperationCheck) { + if (params_.tst_fn == NULL) return; + // const
int angles[] = { 3, 45, 81, 87, 93, 100, 145, 187, 199, 260 }; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int angle = start_angle_; angle < stop_angle_; angle++) { + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + if (dx_ && dy_) RunTest(false, false, angle); + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_AVX2 + +} // namespace diff --git a/libs/libaom/src/test/dump_obu.sh b/libs/libaom/src/test/dump_obu.sh new file mode 100644 index 000000000..da44dd7e6 --- /dev/null +++ b/libs/libaom/src/test/dump_obu.sh @@ -0,0 +1,70 @@ +#!/bin/sh +## Copyright (c) 2018, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom dump_obu tool. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to dump_obu_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +readonly dump_obu_test_file="${AOM_TEST_OUTPUT_DIR}/av1_obu_test.ivf" + +dump_obu_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + elog "The file ${YUV_RAW_INPUT##*/} must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi + if [ "$(dump_obu_available)" = "yes" ]; then + if [ -z "$(aom_tool_path dump_obu)" ]; then + elog "dump_obu not found in LIBAOM_BIN_PATH, its parent, or child tools/." + fi + fi +} + +dump_obu_available() { + if [ "$(av1_decode_available)" = "yes" ] && \ + [ "$(av1_encode_available)" = "yes" ]; then + echo yes + fi +} + +aomenc_available() { + if [ -x "$(aom_tool_path aomenc)" ]; then + echo yes + fi +} + +encode_test_file() { + if [ "$(aomenc_available)" = "yes" ]; then + local encoder="$(aom_tool_path aomenc)" + + eval "${encoder}" \ + $(aomenc_encode_test_fast_params) \ + $(yuv_raw_input) \ + --ivf \ + --output=${dump_obu_test_file} \ + ${devnull} + + if [ ! -e "${dump_obu_test_file}" ]; then + elog "dump_obu test input encode failed." + return 1 + fi + fi +} + +dump_obu() { + encode_test_file + eval $(aom_tool_path dump_obu) "${dump_obu_test_file}" ${devnull} +} + +dump_obu_tests="dump_obu" + +run_tests dump_obu_verify_environment "${dump_obu_tests}" diff --git a/libs/libaom/src/test/ec_test.cc b/libs/libaom/src/test/ec_test.cc new file mode 100644 index 000000000..853abcbc5 --- /dev/null +++ b/libs/libaom/src/test/ec_test.cc @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include + +#include "aom_dsp/entenc.h" +#include "aom_dsp/entdec.h" + +TEST(EC_TEST, random_ec_test) { + od_ec_enc enc; + od_ec_dec dec; + int sz; + int i; + int ret; + unsigned int seed; + unsigned char *ptr; + uint32_t ptr_sz; + char *seed_str; + ret = 0; + seed_str = getenv("EC_TEST_SEED"); + if (seed_str) { + seed = atoi(seed_str); + } else { + seed = 0xdaa1a; + } + srand(seed); + od_ec_enc_init(&enc, 1); + /*Test compatibility between multiple different encode/decode routines.*/ + for (i = 0; i < 409600; i++) { + unsigned *fz; + unsigned *fts; + unsigned *data; + unsigned *tell; + unsigned *enc_method; + int j; + sz = rand() / ((RAND_MAX >> (rand() % 9U)) + 1U); + fz = (unsigned *)malloc(sz * sizeof(*fz)); + fts = (unsigned *)malloc(sz * sizeof(*fts)); + data = (unsigned *)malloc(sz * sizeof(*data)); + tell = (unsigned *)malloc((sz + 1) * sizeof(*tell)); + enc_method = (unsigned *)malloc(sz * sizeof(*enc_method)); + od_ec_enc_reset(&enc); + tell[0] = od_ec_enc_tell_frac(&enc); + for (j = 0; j < sz; j++) { + data[j] = rand() / ((RAND_MAX >> 1) + 1); + + fts[j] = CDF_PROB_BITS; + fz[j] = (rand() % (CDF_PROB_TOP - 2)) >> (CDF_PROB_BITS - fts[j]); + fz[j] = OD_MAXI(fz[j], 1); + enc_method[j] = 3 + (rand() & 1); + switch (enc_method[j]) { + case 3: { + od_ec_encode_bool_q15(&enc, data[j], + OD_ICDF(fz[j] << (CDF_PROB_BITS - fts[j]))); + break; + } + case 4: { + uint16_t cdf[2]; + cdf[0] = OD_ICDF(fz[j]); + cdf[1] = OD_ICDF(1U << fts[j]); + od_ec_encode_cdf_q15(&enc, data[j], cdf, 2); + break; + } + } + + tell[j + 1] = od_ec_enc_tell_frac(&enc); + } + ptr = od_ec_enc_done(&enc, &ptr_sz); + EXPECT_GE(((od_ec_enc_tell(&enc) + 7U) >> 3), ptr_sz) + << "od_ec_enc_tell() lied: " + "there's " + << ptr_sz << " bytes instead of " << ((od_ec_enc_tell(&enc) + 7) >> 3) + << " (Random seed: " << seed << ")\n"; + od_ec_dec_init(&dec, ptr, ptr_sz); + EXPECT_EQ(od_ec_dec_tell_frac(&dec), tell[0]) + << "od_ec_dec_tell() mismatch between encoder and decoder " + "at symbol 0: " + << (unsigned)od_ec_dec_tell_frac(&dec) << " instead of " << tell[0] + << " (Random seed: " << seed << ").\n"; + for (j = 0; j < sz; j++) { + int dec_method; + unsigned int sym = data[j] + 1; // Initialize sym to an invalid value. 
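+ // The decode method below is intentionally allowed to differ from
+ // enc_method[j]: when CDF_SHIFT == 0 the bool_q15 and cdf_q15 entry
+ // points are expected to be bitstream-compatible, so drawing the decode
+ // method at random exercises exactly the cross-routine compatibility
+ // this test is about (see the comment at the top of the loop).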
+ + if (CDF_SHIFT == 0) { + dec_method = 3 + (rand() & 1); + } else { + dec_method = enc_method[j]; + } + switch (dec_method) { + case 3: { + sym = od_ec_decode_bool_q15( + &dec, OD_ICDF(fz[j] << (CDF_PROB_BITS - fts[j]))); + break; + } + case 4: { + uint16_t cdf[2]; + cdf[0] = OD_ICDF(fz[j]); + cdf[1] = OD_ICDF(1U << fts[j]); + sym = od_ec_decode_cdf_q15(&dec, cdf, 2); + break; + } + } + + EXPECT_EQ(sym, data[j]) + << "Decoded " << sym << " instead of " << data[j] + << " with fz=" << fz[j] << " and ftb=" << fts[j] << "at position " + << j << " of " << sz << " (Random seed: " << seed << ").\n" + << "Encoding method: " << enc_method[j] + << " decoding method: " << dec_method << "\n"; + EXPECT_EQ(od_ec_dec_tell_frac(&dec), tell[j + 1]) + << "od_ec_dec_tell() mismatch between encoder and " + "decoder at symbol " + << j + 1 << ": " << (unsigned)od_ec_dec_tell_frac(&dec) + << " instead of " << tell[j + 1] << " (Random seed: " << seed + << ").\n"; + } + free(enc_method); + free(tell); + free(data); + free(fts); + free(fz); + } + od_ec_enc_reset(&enc); + if (CDF_SHIFT == 0) { + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384)); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384)); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384)); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384)); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(24576)); + od_ec_enc_patch_initial_bits(&enc, 3, 2); + EXPECT_FALSE(enc.error) << "od_ec_enc_patch_initial_bits() failed.\n"; + od_ec_enc_patch_initial_bits(&enc, 0, 5); + EXPECT_TRUE(enc.error) + << "od_ec_enc_patch_initial_bits() didn't fail when it should have.\n"; + od_ec_enc_reset(&enc); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384)); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(16384)); + od_ec_encode_bool_q15(&enc, 1, OD_ICDF(32256)); + od_ec_encode_bool_q15(&enc, 0, OD_ICDF(24576)); + od_ec_enc_patch_initial_bits(&enc, 0, 2); + EXPECT_FALSE(enc.error) << "od_ec_enc_patch_initial_bits() failed.\n"; + ptr = od_ec_enc_done(&enc, &ptr_sz); + EXPECT_EQ(ptr_sz, 2u); + EXPECT_EQ(ptr[0], 63) + << "Got " << ptr[0] + << " when expecting 63 for od_ec_enc_patch_initial_bits().\n"; + } + od_ec_enc_clear(&enc); + EXPECT_EQ(ret, 0); +} diff --git a/libs/libaom/src/test/edge_detect_test.cc b/libs/libaom/src/test/edge_detect_test.cc new file mode 100644 index 000000000..33fbbc0bb --- /dev/null +++ b/libs/libaom/src/test/edge_detect_test.cc @@ -0,0 +1,409 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include "aom_mem/aom_mem.h" +#include "av1/encoder/rdopt.h" +#include "test/util.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +using std::get; +using std::tuple; + +static int get_pix(uint8_t *buf, int i, bool high_bd) { + if (high_bd) { + return *CONVERT_TO_SHORTPTR(buf + i); + } else { + return buf[i]; + } +} + +/** Get the (i, j) value from the input; if i or j is outside of the width + * or height, the nearest pixel value is returned. 
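+ * For example, with w = 8 and h = 4, a request for (i, j) = (-2, 5)
+ * clamps to (0, 3) and returns buf[24], the first pixel of the last
+ * row; this replication is what pads the image borders below.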
+ */ +static int get_nearest_pix(const int *buf, int w, int h, int i, int j) { + int offset = AOMMAX(AOMMIN(i, w - 1), 0) + w * AOMMAX(AOMMIN(j, h - 1), 0); + return buf[offset]; +} + +/** Given the image data, creates a new image with padded values, so an + * 8-tap filter can be convolved. The padded value is the same as the closest + * value in the image. Returns a pointer to the start of the image in the + * padded data. Must be freed with free_pad_8tap. The output will be either + * 8-bit or 16-bit, depending on the high bit-depth (high_bd) field. + */ +static uint8_t *pad_8tap_convolve(const int *data, int w, int h, bool high_bd) { + // SIMD optimizations require the width to be a multiple of 8 and the height + // to be multiples of 4. + assert(w % 8 == 0); + assert(h % 4 == 0); + // For an 8-tap filter, we need to pad with 3 lines on top and on the left, + // and 4 lines on the right and bottom, for 7 extra lines. + const int pad_w = w + 7; + const int pad_h = h + 7; + + uint8_t *dst; + if (high_bd) { + dst = + CONVERT_TO_BYTEPTR(aom_memalign(32, sizeof(uint16_t) * pad_w * pad_h)); + } else { + dst = (uint8_t *)aom_memalign(32, sizeof(uint8_t) * pad_w * pad_h); + } + if (dst == nullptr) { + EXPECT_NE(dst, nullptr); + return nullptr; + } + + for (int j = 0; j < pad_h; ++j) { + for (int i = 0; i < pad_w; ++i) { + const int v = get_nearest_pix(data, w, h, i - 3, j - 3); + if (high_bd) { + *CONVERT_TO_SHORTPTR(dst + i + j * pad_w) = v; + } else { + dst[i + j * pad_w] = static_cast<uint8_t>(v); + } + } + } + return dst + (w + 7) * 3 + 3; +} + +static int stride_8tap(int width) { return width + 7; } + +static void free_pad_8tap(uint8_t *padded, int width, bool high_bd) { + if (high_bd) { + aom_free(CONVERT_TO_SHORTPTR(padded - (width + 7) * 3 - 3)); + } else { + aom_free(padded - (width + 7) * 3 - 3); + } +} + +struct Pad8TapConvolveDeleter { + Pad8TapConvolveDeleter(const int width, const bool high_bd) + : width(width), high_bd(high_bd) {} + void operator()(uint8_t *p) { + if (p != nullptr) { + free_pad_8tap(p, width, high_bd); + } + } + const int width; + const bool high_bd; +}; + +static uint8_t *malloc_bd(int num_entries, bool high_bd) { + const int bytes_per_entry = high_bd ? sizeof(uint16_t) : sizeof(uint8_t); + + uint8_t *buf = (uint8_t *)aom_memalign(32, bytes_per_entry * num_entries); + if (high_bd) { + return CONVERT_TO_BYTEPTR(buf); + } else { + return buf; + } +} + +static void free_bd(uint8_t *p, bool high_bd) { + if (high_bd) { + aom_free(CONVERT_TO_SHORTPTR(p)); + } else { + aom_free(p); + } +} + +struct MallocBdDeleter { + explicit MallocBdDeleter(const bool high_bd) : high_bd(high_bd) {} + void operator()(uint8_t *p) { free_bd(p, high_bd); } + const bool high_bd; +}; + +class EdgeDetectBrightnessTest : + // Parameters are (brightness, width, height, high bit depth representation, + // bit depth). + public ::testing::TestWithParam<tuple<int, int, int, bool, int> > { + protected: + void SetUp() override { + // Allocate a (width by height) array of luma values in orig_. + // padded_ will be filled by the pad() call, which adds a border around + // the orig_. The output_ array has enough space for the computation. + const int brightness = GET_PARAM(0); + const int width = GET_PARAM(1); + const int height = GET_PARAM(2); + const bool high_bd = GET_PARAM(3); + + // Create the padded image of uniform brightness.
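+ // A sketch of the geometry set up by pad_8tap_convolve() above: the
+ // padded buffer has stride (width + 7), with three replicated rows and
+ // columns on the top/left and four on the bottom/right, so every pixel
+ // of the original image has full 8-tap filter support.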
std::unique_ptr<int[]> orig(new int[width * height]); + ASSERT_NE(orig, nullptr); + for (int i = 0; i < width * height; ++i) { + orig[i] = brightness; + } + input_ = pad_8tap_convolve(orig.get(), width, height, high_bd); + ASSERT_NE(input_, nullptr); + output_ = malloc_bd(width * height, high_bd); + ASSERT_NE(output_, nullptr); + } + + void TearDown() override { + const int width = GET_PARAM(1); + const bool high_bd = GET_PARAM(3); + free_pad_8tap(input_, width, high_bd); + free_bd(output_, high_bd); + } + + // Skip the tests where brightness exceeds the bit-depth; we run into this + // issue because of gtest's limitation on valid combinations of test + // parameters. Also skip the tests where bit depth is greater than 8, but + // high bit depth representation is not set. + bool should_skip() const { + const int brightness = GET_PARAM(0); + const int bd = GET_PARAM(4); + if (brightness >= (1 << bd)) { + return true; + } + const bool high_bd = GET_PARAM(3); + if (bd > 8 && !high_bd) { + return true; + } + return false; + } + + uint8_t *input_; + uint8_t *output_; +}; + +TEST_P(EdgeDetectBrightnessTest, BlurUniformBrightness) { + // Some combination of parameters are non-sensical, due to limitations + // of the testing framework. Ignore these. + if (should_skip()) { + return; + } + + // For varying levels of brightness, the algorithm should + // produce the same output. + const int brightness = GET_PARAM(0); + const int width = GET_PARAM(1); + const int height = GET_PARAM(2); + const bool high_bd = GET_PARAM(3); + const int bd = GET_PARAM(4); + + av1_gaussian_blur(input_, stride_8tap(width), width, height, output_, high_bd, + bd); + for (int i = 0; i < width * height; ++i) { + ASSERT_EQ(brightness, get_pix(output_, i, high_bd)); + } +} + +// No edges on a uniformly bright image. +TEST_P(EdgeDetectBrightnessTest, DetectUniformBrightness) { + if (should_skip()) { + return; + } + const int width = GET_PARAM(1); + const int height = GET_PARAM(2); + const bool high_bd = GET_PARAM(3); + const int bd = GET_PARAM(4); + + ASSERT_EQ( + 0, av1_edge_exists(input_, stride_8tap(width), width, height, high_bd, bd) + .magnitude); +} + +#if CONFIG_AV1_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(ImageBrightnessTests, EdgeDetectBrightnessTest, + ::testing::Combine( + // Brightness + ::testing::Values(0, 1, 2, 127, 128, 129, 254, 255, + 256, 511, 512, 1023, 1024, 2048, + 4095), + // Width + ::testing::Values(8, 16, 32), + // Height + ::testing::Values(4, 8, 12, 32), + // High bit depth representation + ::testing::Bool(), + // Bit depth + ::testing::Values(8, 10, 12))); +#else +INSTANTIATE_TEST_SUITE_P(ImageBrightnessTests, EdgeDetectBrightnessTest, + ::testing::Combine( + // Brightness + ::testing::Values(0, 1, 2, 127, 128, 129, 254, 255, + 256, 511, 512, 1023, 1024, 2048, + 4095), + // Width + ::testing::Values(8, 16, 32), + // Height + ::testing::Values(4, 8, 12, 32), + // High bit depth representation + ::testing::Values(false), + // Bit depth + ::testing::Values(8))); +#endif + +class EdgeDetectImageTest : + // Parameters are (width, height, high bit depth representation, bit depth). + public ::testing::TestWithParam<tuple<int, int, bool, int> > { + protected: + // Skip the tests where bit depth is greater than 8, but high bit depth + // representation is not set (limitation of testing framework). + bool should_skip() const { + const bool high_bd = GET_PARAM(2); + const int bd = GET_PARAM(3); + return bd > 8 && !high_bd; + } +}; + +// Generate images with black on one side and white on the other.
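+// Each row is 0 in its left half and (1 << bd) - 1 in its right half, so
+// av1_edge_exists() should report a strong vertical edge; the assertions
+// below pin the magnitude to [556, 560] across all width/height/bit-depth
+// combinations.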
+TEST_P(EdgeDetectImageTest, BlackWhite) { + // Some combination of parameters are non-sensical, due to limitations + // of the testing framework. Ignore these. + if (should_skip()) { + return; + } + + const int width = GET_PARAM(0); + const int height = GET_PARAM(1); + const bool high_bd = GET_PARAM(2); + const int bd = GET_PARAM(3); + + const int white = (1 << bd) - 1; + std::unique_ptr<int[]> orig(new int[width * height]); + for (int j = 0; j < height; ++j) { + for (int i = 0; i < width; ++i) { + if (i < width / 2) { + orig[i + j * width] = 0; + } else { + orig[i + j * width] = white; + } + } + } + + std::unique_ptr<uint8_t[], Pad8TapConvolveDeleter> padded( + pad_8tap_convolve(orig.get(), width, height, high_bd), + Pad8TapConvolveDeleter(width, high_bd)); + ASSERT_NE(padded, nullptr); + // Value should be between 556 and 560. + ASSERT_LE(556, av1_edge_exists(padded.get(), stride_8tap(width), width, + height, high_bd, bd) + .magnitude); + ASSERT_GE(560, av1_edge_exists(padded.get(), stride_8tap(width), width, + height, high_bd, bd) + .magnitude); +} + +// Hardcoded blur tests. +static const int luma[32] = { 241, 147, 7, 90, 184, 103, 28, 186, + 2, 248, 49, 242, 114, 146, 127, 22, + 121, 228, 167, 108, 158, 174, 41, 168, + 214, 99, 184, 109, 114, 247, 117, 119 }; +static const uint8_t expected[] = { 161, 138, 119, 118, 123, 118, 113, 122, + 143, 140, 134, 133, 134, 126, 116, 114, + 147, 149, 145, 142, 143, 138, 126, 118, + 164, 156, 148, 144, 148, 148, 138, 126 }; + +static void hardcoded_blur_test_aux(const bool high_bd) { + const int w = 8; + const int h = 4; + for (int bd = 8; bd <= 12; bd += 2) { + // Skip the tests where bit depth is greater than 8, but high bit depth + // representation is not set. + if (bd > 8 && !high_bd) { + break; + } + std::unique_ptr<uint8_t[], MallocBdDeleter> output( + malloc_bd(w * h, high_bd), MallocBdDeleter(high_bd)); + ASSERT_NE(output, nullptr); + std::unique_ptr<uint8_t[], Pad8TapConvolveDeleter> padded( + pad_8tap_convolve(luma, w, h, high_bd), + Pad8TapConvolveDeleter(w, high_bd)); + ASSERT_NE(padded, nullptr); + av1_gaussian_blur(padded.get(), stride_8tap(w), w, h, output.get(), high_bd, + bd); + for (int i = 0; i < w * h; ++i) { + ASSERT_EQ(expected[i], get_pix(output.get(), i, high_bd)); + } + + // If we multiply the inputs by a constant factor, the output should not + // vary more than 0.5 * factor. + for (int c = 2; c < (1 << (bd - 8)); ++c) { + int scaled_luma[32]; + for (int i = 0; i < 32; ++i) { + scaled_luma[i] = luma[i] * c; + } + padded.reset(pad_8tap_convolve(scaled_luma, w, h, high_bd)); + ASSERT_NE(padded, nullptr); + av1_gaussian_blur(padded.get(), stride_8tap(w), w, h, output.get(), + high_bd, bd); + for (int i = 0; i < w * h; ++i) { + ASSERT_GE(c / 2, + abs(expected[i] * c - get_pix(output.get(), i, high_bd))); + } + } + } +} + +TEST(EdgeDetectImageTest, HardcodedBlurTest) { + hardcoded_blur_test_aux(false); +#if CONFIG_AV1_HIGHBITDEPTH + hardcoded_blur_test_aux(true); +#endif +} + +TEST(EdgeDetectImageTest, SobelTest) { + // Randomly generated 3x3. Compute Sobel for middle value. + const uint8_t buf[9] = { 241, 147, 7, 90, 184, 103, 28, 186, 2 }; + const int stride = 3; + bool high_bd = false; + sobel_xy result = av1_sobel(buf, stride, 1, 1, high_bd); + ASSERT_EQ(234, result.x); + ASSERT_EQ(140, result.y); + +#if CONFIG_AV1_HIGHBITDEPTH + // Verify it works for 8-bit values in a high bit-depth buffer.
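+ // (Hand check, assuming the usual 3x3 Sobel taps: Gx = (241 - 7) +
+ // 2 * (90 - 103) + (28 - 2) = 234 and Gy = (241 + 2 * 147 + 7) -
+ // (28 + 2 * 186 + 2) = 140, matching the assertions above and below.)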
+ const uint16_t buf8_16[9] = { 241, 147, 7, 90, 184, 103, 28, 186, 2 }; + high_bd = true; + result = av1_sobel(CONVERT_TO_BYTEPTR(buf8_16), stride, 1, 1, high_bd); + ASSERT_EQ(234, result.x); + ASSERT_EQ(140, result.y); + + // Verify it works for high bit-depth values as well. + const uint16_t buf16[9] = { 241, 147, 7, 90, 184, 2003, 1028, 186, 2 }; + result = av1_sobel(CONVERT_TO_BYTEPTR(buf16), stride, 1, 1, high_bd); + ASSERT_EQ(-2566, result.x); + ASSERT_EQ(-860, result.y); +#endif +} + +#if CONFIG_AV1_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(EdgeDetectImages, EdgeDetectImageTest, + ::testing::Combine( + // Width + ::testing::Values(8, 16, 32), + // Height + ::testing::Values(4, 8, 12, 32), + // High bit depth representation + ::testing::Bool(), + // Bit depth + ::testing::Values(8, 10, 12))); +#else +INSTANTIATE_TEST_SUITE_P(EdgeDetectImages, EdgeDetectImageTest, + ::testing::Combine( + // Width + ::testing::Values(8, 16, 32), + // Height + ::testing::Values(4, 8, 12, 32), + // High bit depth representation + ::testing::Values(false), + // Bit depth + ::testing::Values(8))); +#endif +} // namespace diff --git a/libs/libaom/src/test/encode_api_test.cc b/libs/libaom/src/test/encode_api_test.cc new file mode 100644 index 000000000..25bdb5c3f --- /dev/null +++ b/libs/libaom/src/test/encode_api_test.cc @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#include "test/util.h" +#include "aom/aomcx.h" +#include "aom/aom_encoder.h" + +namespace { + +TEST(EncodeAPI, InvalidParams) { + static const aom_codec_iface_t *kCodecs[] = { +#if CONFIG_AV1_ENCODER + aom_codec_av1_cx(), +#endif + }; + uint8_t buf[1] = { 0 }; + aom_image_t img; + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + + EXPECT_EQ(&img, aom_img_wrap(&img, AOM_IMG_FMT_I420, 1, 1, 1, buf)); + + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(NULL, NULL, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, NULL, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_encode(NULL, NULL, 0, 0, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_encode(NULL, &img, 0, 0, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_destroy(NULL)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_enc_config_default(NULL, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_enc_config_default(NULL, &cfg, 0)); + EXPECT_TRUE(aom_codec_error(NULL) != NULL); + + for (const aom_codec_iface_t *iface : kCodecs) { + SCOPED_TRACE(aom_codec_iface_name(iface)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_enc_init(NULL, iface, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_enc_init(&enc, iface, NULL, 0)); + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_enc_config_default(iface, &cfg, 2)); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0)); + + EXPECT_EQ(NULL, aom_codec_get_global_headers(NULL)); + + aom_fixed_buf_t *glob_headers = aom_codec_get_global_headers(&enc); + EXPECT_TRUE(glob_headers->buf != NULL); + if (glob_headers) { + free(glob_headers->buf); + free(glob_headers); + } + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0)); + + EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); + } +} + +} // namespace diff --git a/libs/libaom/src/test/encode_perf_test.cc b/libs/libaom/src/test/encode_perf_test.cc new file mode 100644 index 000000000..390a6e0e6 --- /dev/null +++ b/libs/libaom/src/test/encode_perf_test.cc @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_version.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "aom_ports/aom_timer.h" + +namespace { + +const int kMaxPsnr = 100; +const double kUsecsInSec = 1000000.0; + +struct EncodePerfTestVideo { + EncodePerfTestVideo(const char *name_, uint32_t width_, uint32_t height_, + uint32_t bitrate_, int frames_) + : name(name_), width(width_), height(height_), bitrate(bitrate_), + frames(frames_) {} + const char *name; + uint32_t width; + uint32_t height; + uint32_t bitrate; + int frames; +}; + +const EncodePerfTestVideo kAV1EncodePerfTestVectors[] = { + EncodePerfTestVideo("desktop_640_360_30.yuv", 640, 360, 200, 2484), + EncodePerfTestVideo("kirland_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("macmarcomoving_640_480_30.yuv", 640, 480, 200, 987), + EncodePerfTestVideo("macmarcostationary_640_480_30.yuv", 640, 480, 200, 718), + EncodePerfTestVideo("niklas_640_480_30.yuv", 640, 480, 200, 471), + EncodePerfTestVideo("tacomanarrows_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("tacomasmallcameramovement_640_480_30.yuv", 640, 480, 200, + 300), + EncodePerfTestVideo("thaloundeskmtg_640_480_30.yuv", 640, 480, 200, 300), + EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), +}; + +const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8 }; +const int kEncodePerfTestThreads[] = { 1, 2, 4 }; + +class AV1EncodePerfTest + : public ::libaom_test::CodecTestWithParam, + public ::libaom_test::EncoderTest { + protected: + AV1EncodePerfTest() + : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0), + encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {} + + virtual ~AV1EncodePerfTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + + cfg_.g_lag_in_frames = 0; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_error_resilient = 1; + cfg_.g_threads = threads_; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + const int log2_tile_columns = 3; + encoder->Control(AOME_SET_CPUUSED, speed_); + encoder->Control(AV1E_SET_TILE_COLUMNS, log2_tile_columns); + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0); + } + } + + virtual void BeginPassHook(unsigned int /*pass*/) { + min_psnr_ = kMaxPsnr; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + if (pkt->data.psnr.psnr[0] < min_psnr_) { + min_psnr_ = pkt->data.psnr.psnr[0]; + } + } + + // for performance reasons don't decode + virtual bool DoDecode() { return 0; } + + double min_psnr() const { return min_psnr_; } + + void set_speed(unsigned int speed) { speed_ = speed; } + + void set_threads(unsigned int threads) { threads_ = threads; } + + private: + double min_psnr_; + unsigned int nframes_; + libaom_test::TestMode encoding_mode_; + unsigned speed_; + unsigned int threads_; +}; + +TEST_P(AV1EncodePerfTest, PerfTest) { + for (const EncodePerfTestVideo &test_video : kAV1EncodePerfTestVectors) { + for (int speed : 
kEncodePerfTestSpeeds) { + for (int threads : kEncodePerfTestThreads) { + if (test_video.width < 512 && threads > 1) + continue; + else if (test_video.width < 1024 && threads > 2) + continue; + + set_threads(threads); + SetUp(); + + const aom_rational timebase = { 33333333, 1000000000 }; + cfg_.g_timebase = timebase; + cfg_.rc_target_bitrate = test_video.bitrate; + + init_flags_ = AOM_CODEC_USE_PSNR; + + const unsigned frames = test_video.frames; + const char *video_name = test_video.name; + libaom_test::I420VideoSource video(video_name, test_video.width, + test_video.height, timebase.den, + timebase.num, 0, test_video.frames); + set_speed(speed); + + aom_usec_timer t; + aom_usec_timer_start(&t); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + aom_usec_timer_mark(&t); + const double elapsed_secs = aom_usec_timer_elapsed(&t) / kUsecsInSec; + const double fps = frames / elapsed_secs; + const double minimum_psnr = min_psnr(); + std::string display_name(video_name); + if (threads > 1) { + char thread_count[32]; + snprintf(thread_count, sizeof(thread_count), "_t-%d", threads); + display_name += thread_count; + } + + printf("{\n"); + printf("\t\"type\" : \"encode_perf_test\",\n"); + printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP); + printf("\t\"videoName\" : \"%s\",\n", display_name.c_str()); + printf("\t\"encodeTimeSecs\" : %f,\n", elapsed_secs); + printf("\t\"totalFrames\" : %u,\n", frames); + printf("\t\"framesPerSecond\" : %f,\n", fps); + printf("\t\"minPsnr\" : %f,\n", minimum_psnr); + printf("\t\"speed\" : %d,\n", speed); + printf("\t\"threads\" : %d\n", threads); + printf("}\n"); + } + } + } +} + +AV1_INSTANTIATE_TEST_CASE(AV1EncodePerfTest, + ::testing::Values(::libaom_test::kRealTime)); +} // namespace diff --git a/libs/libaom/src/test/encode_test_driver.cc b/libs/libaom/src/test/encode_test_driver.cc new file mode 100644 index 000000000..01f8d501a --- /dev/null +++ b/libs/libaom/src/test/encode_test_driver.cc @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/encode_test_driver.h" +#include "test/register_state_check.h" +#include "test/video_source.h" + +namespace libaom_test { +void Encoder::InitEncoder(VideoSource *video) { + aom_codec_err_t res; + const aom_image_t *img = video->img(); + + if (video->img() && !encoder_.priv) { + cfg_.g_w = img->d_w; + cfg_.g_h = img->d_h; + cfg_.g_timebase = video->timebase(); + cfg_.rc_twopass_stats_in = stats_->buf(); + + res = aom_codec_enc_init(&encoder_, CodecInterface(), &cfg_, init_flags_); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } +} + +void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) { + if (video->img()) + EncodeFrameInternal(*video, frame_flags); + else + Flush(); + + // Handle twopass stats + CxDataIterator iter = GetCxData(); + + while (const aom_codec_cx_pkt_t *pkt = iter.Next()) { + if (pkt->kind != AOM_CODEC_STATS_PKT) continue; + + stats_->Append(*pkt); + } +} + +void Encoder::EncodeFrameInternal(const VideoSource &video, + const unsigned long frame_flags) { + aom_codec_err_t res; + const aom_image_t *img = video.img(); + + // Handle frame resizing + if (cfg_.g_w != img->d_w || cfg_.g_h != img->d_h) { + cfg_.g_w = img->d_w; + cfg_.g_h = img->d_h; + res = aom_codec_enc_config_set(&encoder_, &cfg_); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + + // Encode the frame + API_REGISTER_STATE_CHECK(res = + aom_codec_encode(&encoder_, img, video.pts(), + video.duration(), frame_flags)); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); +} + +void Encoder::Flush() { + const aom_codec_err_t res = aom_codec_encode(&encoder_, NULL, 0, 0, 0); + if (!encoder_.priv) + ASSERT_EQ(AOM_CODEC_ERROR, res) << EncoderError(); + else + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); +} + +void EncoderTest::InitializeConfig() { + const aom_codec_err_t res = codec_->DefaultEncoderConfig(&cfg_, 0); + ASSERT_EQ(AOM_CODEC_OK, res); +} + +void EncoderTest::SetMode(TestMode mode) { + switch (mode) { + case kOnePassGood: + case kTwoPassGood: break; + case kRealTime: { + cfg_.g_lag_in_frames = 0; + cfg_.g_usage = AOM_USAGE_REALTIME; + break; + } + default: ASSERT_TRUE(false) << "Unexpected mode " << mode; + } + mode_ = mode; + if (mode == kTwoPassGood) + passes_ = 2; + else + passes_ = 1; +} + +static bool compare_plane(const uint8_t *const buf1, int stride1, + const uint8_t *const buf2, int stride2, int w, int h, + int *const mismatch_row, int *const mismatch_col, + int *const mismatch_pix1, int *const mismatch_pix2) { + int r, c; + + for (r = 0; r < h; ++r) { + for (c = 0; c < w; ++c) { + const int pix1 = buf1[r * stride1 + c]; + const int pix2 = buf2[r * stride2 + c]; + + if (pix1 != pix2) { + if (mismatch_row != NULL) *mismatch_row = r; + if (mismatch_col != NULL) *mismatch_col = c; + if (mismatch_pix1 != NULL) *mismatch_pix1 = pix1; + if (mismatch_pix2 != NULL) *mismatch_pix2 = pix2; + return false; + } + } + } + + return true; +} + +// The function should return "true" most of the time, therefore no early +// break-out is implemented within the match checking process. 
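+// Usage sketch (hypothetical caller; RunLoop below is the real one):
+//   int row, col, plane, pix1, pix2;
+//   if (!compare_img(img_enc, img_dec, &row, &col, &plane, &pix1, &pix2)) {
+//     // The first differing sample is at (row, col) of `plane`.
+//   }
+// Any of the out-pointers may be NULL if that detail is not needed.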
+static bool compare_img(const aom_image_t *img1, const aom_image_t *img2, + int *const mismatch_row, int *const mismatch_col, + int *const mismatch_plane, int *const mismatch_pix1, + int *const mismatch_pix2) { + if (img1->fmt != img2->fmt || img1->cp != img2->cp || img1->tc != img2->tc || + img1->mc != img2->mc || img1->d_w != img2->d_w || + img1->d_h != img2->d_h || img1->monochrome != img2->monochrome) { + if (mismatch_row != NULL) *mismatch_row = -1; + if (mismatch_col != NULL) *mismatch_col = -1; + return false; + } + + const int num_planes = img1->monochrome ? 1 : 3; + for (int plane = 0; plane < num_planes; plane++) { + if (!compare_plane(img1->planes[plane], img1->stride[plane], + img2->planes[plane], img2->stride[plane], + aom_img_plane_width(img1, plane), + aom_img_plane_height(img1, plane), mismatch_row, + mismatch_col, mismatch_pix1, mismatch_pix2)) { + if (mismatch_plane != NULL) *mismatch_plane = plane; + return false; + } + } + + return true; +} + +void EncoderTest::MismatchHook(const aom_image_t *img_enc, + const aom_image_t *img_dec) { + int mismatch_row = 0; + int mismatch_col = 0; + int mismatch_plane = 0; + int mismatch_pix_enc = 0; + int mismatch_pix_dec = 0; + + ASSERT_FALSE(compare_img(img_enc, img_dec, &mismatch_row, &mismatch_col, + &mismatch_plane, &mismatch_pix_enc, + &mismatch_pix_dec)); + + GTEST_FAIL() << "Encode/Decode mismatch found:" << std::endl + << " pixel value enc/dec: " << mismatch_pix_enc << "/" + << mismatch_pix_dec << std::endl + << " plane: " << mismatch_plane << std::endl + << " row/col: " << mismatch_row << "/" + << mismatch_col << std::endl; +} + +void EncoderTest::RunLoop(VideoSource *video) { + aom_codec_dec_cfg_t dec_cfg = aom_codec_dec_cfg_t(); + dec_cfg.allow_lowbitdepth = 1; + + stats_.Reset(); + + ASSERT_TRUE(passes_ == 1 || passes_ == 2); + for (unsigned int pass = 0; pass < passes_; pass++) { + last_pts_ = 0; + + if (passes_ == 1) + cfg_.g_pass = AOM_RC_ONE_PASS; + else if (pass == 0) + cfg_.g_pass = AOM_RC_FIRST_PASS; + else + cfg_.g_pass = AOM_RC_LAST_PASS; + + BeginPassHook(pass); + std::unique_ptr<Encoder> encoder( + codec_->CreateEncoder(cfg_, init_flags_, &stats_)); + ASSERT_TRUE(encoder.get() != NULL); + + ASSERT_NO_FATAL_FAILURE(video->Begin()); + encoder->InitEncoder(video); + + if (mode_ == kRealTime) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0); + } + + ASSERT_FALSE(::testing::Test::HasFatalFailure()); + + std::unique_ptr<Decoder> decoder( + codec_->CreateDecoder(dec_cfg, 0 /* flags */)); +#if CONFIG_AV1_DECODER + if (decoder->IsAV1()) { + // Set dec_cfg.tile_row = -1 and dec_cfg.tile_col = -1 so that the whole + // frame is decoded.
+ decoder->Control(AV1_SET_TILE_MODE, cfg_.large_scale_tile); + decoder->Control(AV1D_EXT_TILE_DEBUG, 1); + decoder->Control(AV1_SET_DECODE_TILE_ROW, -1); + decoder->Control(AV1_SET_DECODE_TILE_COL, -1); + } +#endif + + number_spatial_layers_ = GetNumSpatialLayers(); + + bool again; + for (again = true; again; video->Next()) { + again = (video->img() != NULL); + + for (int sl = 0; sl < number_spatial_layers_; sl++) { + PreEncodeFrameHook(video); + PreEncodeFrameHook(video, encoder.get()); + encoder->EncodeFrame(video, frame_flags_); + + CxDataIterator iter = encoder->GetCxData(); + + bool has_cxdata = false; + bool has_dxdata = false; + while (const aom_codec_cx_pkt_t *pkt = iter.Next()) { + pkt = MutateEncoderOutputHook(pkt); + again = true; + switch (pkt->kind) { + case AOM_CODEC_CX_FRAME_PKT: + has_cxdata = true; + if (decoder.get() != NULL && DoDecode()) { + aom_codec_err_t res_dec; + if (DoDecodeInvisible()) { + res_dec = decoder->DecodeFrame( + (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz); + } else { + res_dec = decoder->DecodeFrame( + (const uint8_t *)pkt->data.frame.buf + + (pkt->data.frame.sz - pkt->data.frame.vis_frame_size), + pkt->data.frame.vis_frame_size); + } + + if (!HandleDecodeResult(res_dec, decoder.get())) break; + + has_dxdata = true; + } + ASSERT_GE(pkt->data.frame.pts, last_pts_); + if (sl == number_spatial_layers_) last_pts_ = pkt->data.frame.pts; + FramePktHook(pkt); + break; + + case AOM_CODEC_PSNR_PKT: PSNRPktHook(pkt); break; + + default: break; + } + } + + if (has_dxdata && has_cxdata) { + const aom_image_t *img_enc = encoder->GetPreviewFrame(); + DxDataIterator dec_iter = decoder->GetDxData(); + const aom_image_t *img_dec = dec_iter.Next(); + if (img_enc && img_dec) { + const bool res = + compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, NULL); + if (!res) { // Mismatch + MismatchHook(img_enc, img_dec); + } + } + if (img_dec) DecompressedFrameHook(*img_dec, video->pts()); + } + if (!Continue()) break; + } // Loop over spatial layers + } + + EndPassHook(); + + if (!Continue()) break; + } +} + +} // namespace libaom_test diff --git a/libs/libaom/src/test/encode_test_driver.h b/libs/libaom/src/test/encode_test_driver.h new file mode 100644 index 000000000..6319a5220 --- /dev/null +++ b/libs/libaom/src/test/encode_test_driver.h @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#ifndef AOM_TEST_ENCODE_TEST_DRIVER_H_ +#define AOM_TEST_ENCODE_TEST_DRIVER_H_ + +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#if CONFIG_AV1_ENCODER +#include "aom/aomcx.h" +#endif +#include "aom/aom_encoder.h" + +namespace libaom_test { + +class CodecFactory; +class VideoSource; + +enum TestMode { kRealTime, kOnePassGood, kTwoPassGood }; +#define ALL_TEST_MODES \ + ::testing::Values(::libaom_test::kRealTime, ::libaom_test::kOnePassGood, \ + ::libaom_test::kTwoPassGood) + +#define ONE_PASS_TEST_MODES \ + ::testing::Values(::libaom_test::kRealTime, ::libaom_test::kOnePassGood) + +#define TWO_PASS_TEST_MODES ::testing::Values(::libaom_test::kTwoPassGood) + +#define NONREALTIME_TEST_MODES \ + ::testing::Values(::libaom_test::kOnePassGood, ::libaom_test::kTwoPassGood) + +// Provides an object to handle the libaom get_cx_data() iteration pattern +class CxDataIterator { + public: + explicit CxDataIterator(aom_codec_ctx_t *encoder) + : encoder_(encoder), iter_(NULL) {} + + const aom_codec_cx_pkt_t *Next() { + return aom_codec_get_cx_data(encoder_, &iter_); + } + + private: + aom_codec_ctx_t *encoder_; + aom_codec_iter_t iter_; +}; + +// Implements an in-memory store for libaom twopass statistics +class TwopassStatsStore { + public: + void Append(const aom_codec_cx_pkt_t &pkt) { + buffer_.append(reinterpret_cast(pkt.data.twopass_stats.buf), + pkt.data.twopass_stats.sz); + } + + aom_fixed_buf_t buf() { + const aom_fixed_buf_t buf = { &buffer_[0], buffer_.size() }; + return buf; + } + + void Reset() { buffer_.clear(); } + + protected: + std::string buffer_; +}; + +// Provides a simplified interface to manage one video encoding pass, given +// a configuration and video source. +// +// TODO(jkoleszar): The exact services it provides and the appropriate +// level of abstraction will be fleshed out as more tests are written. +class Encoder { + public: + Encoder(aom_codec_enc_cfg_t cfg, const aom_codec_flags_t init_flags, + TwopassStatsStore *stats) + : cfg_(cfg), init_flags_(init_flags), stats_(stats) { + memset(&encoder_, 0, sizeof(encoder_)); + } + + virtual ~Encoder() { aom_codec_destroy(&encoder_); } + + CxDataIterator GetCxData() { return CxDataIterator(&encoder_); } + + void InitEncoder(VideoSource *video); + + const aom_image_t *GetPreviewFrame() { + return aom_codec_get_preview_frame(&encoder_); + } + // This is a thin wrapper around aom_codec_encode(), so refer to + // aom_encoder.h for its semantics. 
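+ // A minimal driving loop (sketch; RunLoop in encode_test_driver.cc is
+ // the authoritative version):
+ //   video->Begin();
+ //   encoder->InitEncoder(video);
+ //   while (video->img() != NULL) {
+ //     encoder->EncodeFrame(video);
+ //     video->Next();
+ //   }
+ //   encoder->EncodeFrame(video);  // A NULL image flushes the encoder.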
+ void EncodeFrame(VideoSource *video, const unsigned long frame_flags); + + // Convenience wrapper for EncodeFrame() + void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); } + + void Control(int ctrl_id, int arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, int *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct aom_scaling_mode *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct aom_svc_layer_id *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct aom_svc_ref_frame_config *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct aom_svc_params *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } + +#if CONFIG_AV1_ENCODER + void Control(int ctrl_id, aom_active_map_t *arg) { + const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + } +#endif + + void Config(const aom_codec_enc_cfg_t *cfg) { + const aom_codec_err_t res = aom_codec_enc_config_set(&encoder_, cfg); + ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError(); + cfg_ = *cfg; + } + + protected: + virtual aom_codec_iface_t *CodecInterface() const = 0; + + const char *EncoderError() { + const char *detail = aom_codec_error_detail(&encoder_); + return detail ? detail : aom_codec_error(&encoder_); + } + + // Encode an image + void EncodeFrameInternal(const VideoSource &video, + const unsigned long frame_flags); + + // Flush the encoder on EOS + void Flush(); + + aom_codec_ctx_t encoder_; + aom_codec_enc_cfg_t cfg_; + aom_codec_flags_t init_flags_; + TwopassStatsStore *stats_; +}; + +// Common test functionality for all Encoder tests. +// +// This class is a mixin which provides the main loop common to all +// encoder tests. It provides hooks which can be overridden by subclasses +// to implement each test's specific behavior, while centralizing the bulk +// of the boilerplate. Note that it doesn't inherit the gtest testing +// classes directly, so that tests can be parameterized differently. +class EncoderTest { + protected: + explicit EncoderTest(const CodecFactory *codec) + : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0), + last_pts_(0), mode_(kRealTime), number_spatial_layers_(1) { + // Default to 1 thread. + cfg_.g_threads = 1; + } + + virtual ~EncoderTest() {} + + // Initialize the cfg_ member with the default configuration. + void InitializeConfig(); + + // Map the TestMode enum to the passes_ variables. + void SetMode(TestMode mode); + + // Set encoder flag. + void set_init_flags(aom_codec_flags_t flag) { init_flags_ = flag; } + + // Main loop + virtual void RunLoop(VideoSource *video); + + // Hook to be called at the beginning of a pass. + virtual void BeginPassHook(unsigned int /*pass*/) {} + + // Hook to be called at the end of a pass. + virtual void EndPassHook() {} + + // Hook to be called before encoding a frame. 
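+ // Subclasses typically override the two-argument form to push per-frame
+ // controls, e.g. (mirroring AV1EncodePerfTest above):
+ //   if (video->frame() == 0) encoder->Control(AOME_SET_CPUUSED, speed_);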
+ virtual void PreEncodeFrameHook(VideoSource * /*video*/) {} + virtual void PreEncodeFrameHook(VideoSource * /*video*/, + Encoder * /*encoder*/) {} + + // Hook to be called on every compressed data packet. + virtual void FramePktHook(const aom_codec_cx_pkt_t * /*pkt*/) {} + + // Hook to be called on every PSNR packet. + virtual void PSNRPktHook(const aom_codec_cx_pkt_t * /*pkt*/) {} + + // Hook to determine whether the encode loop should continue. + virtual bool Continue() const { + return !(::testing::Test::HasFatalFailure() || abort_); + } + + // Hook to determine whether to decode frame after encoding + virtual bool DoDecode() const { return true; } + + // Hook to determine whether to decode invisible frames after encoding + virtual bool DoDecodeInvisible() const { return true; } + + // Hook to handle encode/decode mismatch + virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2); + + // Hook to be called on every decompressed frame. + virtual void DecompressedFrameHook(const aom_image_t & /*img*/, + aom_codec_pts_t /*pts*/) {} + + // Hook to be called to handle decode result. Return true to continue. + virtual bool HandleDecodeResult(const aom_codec_err_t res_dec, + Decoder *decoder) { + EXPECT_EQ(AOM_CODEC_OK, res_dec) << decoder->DecodeError(); + return AOM_CODEC_OK == res_dec; + } + + virtual int GetNumSpatialLayers() { return 1; } + + // Hook that can modify the encoder's output data + virtual const aom_codec_cx_pkt_t *MutateEncoderOutputHook( + const aom_codec_cx_pkt_t *pkt) { + return pkt; + } + + const CodecFactory *codec_; + bool abort_; + aom_codec_enc_cfg_t cfg_; + unsigned int passes_; + TwopassStatsStore stats_; + aom_codec_flags_t init_flags_; + unsigned long frame_flags_; + aom_codec_pts_t last_pts_; + TestMode mode_; + int number_spatial_layers_; +}; + +} // namespace libaom_test + +#endif // AOM_TEST_ENCODE_TEST_DRIVER_H_ diff --git a/libs/libaom/src/test/encodetxb_test.cc b/libs/libaom/src/test/encodetxb_test.cc new file mode 100644 index 000000000..385d3f1a8 --- /dev/null +++ b/libs/libaom/src/test/encodetxb_test.cc @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/idct.h" +#include "av1/common/scan.h" +#include "av1/common/txb_common.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +namespace { +using libaom_test::ACMRandom; + +typedef void (*GetNzMapContextsFunc)(const uint8_t *const levels, + const int16_t *const scan, + const uint16_t eob, const TX_SIZE tx_size, + const TX_CLASS tx_class, + int8_t *const coeff_contexts); + +class EncodeTxbTest : public ::testing::TestWithParam<GetNzMapContextsFunc> { + public: + EncodeTxbTest() : get_nz_map_contexts_func_(GetParam()) {} + + virtual ~EncodeTxbTest() {} + + virtual void SetUp() { + coeff_contexts_ref_ = reinterpret_cast<int8_t *>( + aom_memalign(16, sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE)); + ASSERT_TRUE(coeff_contexts_ref_ != NULL); + coeff_contexts_ = reinterpret_cast<int8_t *>( + aom_memalign(16, sizeof(*coeff_contexts_) * MAX_TX_SQUARE)); + ASSERT_TRUE(coeff_contexts_ != NULL); + } + + virtual void TearDown() { + aom_free(coeff_contexts_ref_); + aom_free(coeff_contexts_); + libaom_test::ClearSystemState(); + } + + void GetNzMapContextsRun() { + const int kNumTests = 10; + int result = 0; + + for (int is_inter = 0; is_inter < 2; ++is_inter) { + for (int tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) { + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) { + const int bwl = get_txb_bwl((TX_SIZE)tx_size); + const int width = get_txb_wide((TX_SIZE)tx_size); + const int height = get_txb_high((TX_SIZE)tx_size); + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const int16_t *const scan = av1_scan_orders[tx_size][tx_type].scan; + + levels_ = set_levels(levels_buf_, width); + for (int i = 0; i < kNumTests && !result; ++i) { + for (int eob = 1; eob <= width * height && !result; ++eob) { + InitDataWithEob(scan, bwl, eob); + + av1_get_nz_map_contexts_c(levels_, scan, eob, (TX_SIZE)tx_size, + tx_class, coeff_contexts_ref_); + get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size, + tx_class, coeff_contexts_); + + result = Compare(scan, eob); + + EXPECT_EQ(result, 0) + << " tx_class " << tx_class << " width " << real_width + << " height " << real_height << " eob " << eob; + } + } + } + } + } + } + + void SpeedTestGetNzMapContextsRun() { + const int kNumTests = 2000000000; + aom_usec_timer timer; + + printf("Note: Only test the largest possible eob case!\n"); + for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) { + const int bwl = get_txb_bwl((TX_SIZE)tx_size); + const int width = get_txb_wide((TX_SIZE)tx_size); + const int height = get_txb_high((TX_SIZE)tx_size); + const int real_width = tx_size_wide[tx_size]; + const int real_height = tx_size_high[tx_size]; + const TX_TYPE tx_type = DCT_DCT; + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const int16_t *const scan = av1_scan_orders[tx_size][tx_type].scan; + const int eob = width * height; + const int numTests = kNumTests / (width * height); + + levels_ = set_levels(levels_buf_, width); + InitDataWithEob(scan, bwl, eob); + + aom_usec_timer_start(&timer); + for (int i = 0; i < numTests; ++i) { + get_nz_map_contexts_func_(levels_, scan, eob, (TX_SIZE)tx_size,
+ tx_class, coeff_contexts_); + } + aom_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("get_nz_map_contexts_%2dx%2d: %7.1f ms\n", real_width, real_height, + elapsed_time / 1000.0); + } + } + + private: + void InitDataWithEob(const int16_t *const scan, const int bwl, + const int eob) { + memset(levels_buf_, 0, sizeof(levels_buf_)); + memset(coeff_contexts_, 0, sizeof(*coeff_contexts_) * MAX_TX_SQUARE); + + for (int c = 0; c < eob; ++c) { + levels_[get_padded_idx(scan[c], bwl)] = + static_cast<uint8_t>(clamp(rnd_.Rand8(), 0, INT8_MAX)); + coeff_contexts_[scan[c]] = static_cast<int8_t>(rnd_.Rand16() >> 1); + } + + memcpy(coeff_contexts_ref_, coeff_contexts_, + sizeof(*coeff_contexts_) * MAX_TX_SQUARE); + } + + bool Compare(const int16_t *const scan, const int eob) const { + bool result = false; + if (memcmp(coeff_contexts_, coeff_contexts_ref_, + sizeof(*coeff_contexts_ref_) * MAX_TX_SQUARE)) { + for (int i = 0; i < eob; i++) { + const int pos = scan[i]; + if (coeff_contexts_ref_[pos] != coeff_contexts_[pos]) { + printf("coeff_contexts_[%d] diff:%6d (ref),%6d (opt)\n", pos, + coeff_contexts_ref_[pos], coeff_contexts_[pos]); + result = true; + break; + } + } + } + return result; + } + + GetNzMapContextsFunc get_nz_map_contexts_func_; + ACMRandom rnd_; + uint8_t levels_buf_[TX_PAD_2D]; + uint8_t *levels_; + int8_t *coeff_contexts_ref_; + int8_t *coeff_contexts_; +}; + +TEST_P(EncodeTxbTest, GetNzMapContexts) { GetNzMapContextsRun(); } + +TEST_P(EncodeTxbTest, DISABLED_SpeedTestGetNzMapContexts) { + SpeedTestGetNzMapContextsRun(); +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, EncodeTxbTest, + ::testing::Values(av1_get_nz_map_contexts_sse2)); +#endif + +typedef void (*av1_txb_init_levels_func)(const tran_low_t *const coeff, + const int width, const int height, + uint8_t *const levels); + +typedef std::tuple<av1_txb_init_levels_func, int> TxbInitLevelParam; + +class EncodeTxbInitLevelTest + : public ::testing::TestWithParam<TxbInitLevelParam> { + public: + virtual ~EncodeTxbInitLevelTest() {} + virtual void TearDown() { libaom_test::ClearSystemState(); } + void RunTest(av1_txb_init_levels_func test_func, int tx_size, int is_speed); +}; + +void EncodeTxbInitLevelTest::RunTest(av1_txb_init_levels_func test_func, + int tx_size, int is_speed) { + const int width = get_txb_wide((TX_SIZE)tx_size); + const int height = get_txb_high((TX_SIZE)tx_size); + tran_low_t coeff[MAX_TX_SQUARE]; + + uint8_t levels_buf[2][TX_PAD_2D]; + uint8_t *const levels0 = set_levels(levels_buf[0], width); + uint8_t *const levels1 = set_levels(levels_buf[1], width); + + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int i = 0; i < width * height; i++) { + coeff[i] = rnd.Rand15Signed() + rnd.Rand15Signed(); + } + for (int i = 0; i < TX_PAD_2D; i++) { + levels_buf[0][i] = rnd.Rand8(); + levels_buf[1][i] = rnd.Rand8(); + } + const int run_times = is_speed ?
+  const int run_times = is_speed ? (width * height) * 10000 : 1;
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    av1_txb_init_levels_c(coeff, width, height, levels0);
+  }
+  const double t1 = get_time_mark(&timer);
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_times; ++i) {
+    test_func(coeff, width, height, levels1);
+  }
+  const double t2 = get_time_mark(&timer);
+  if (is_speed) {
+    printf("init %3dx%-3d:%7.2f/%7.2fns", width, height, t1, t2);
+    printf("(%3.2f)\n", t1 / t2);
+  }
+  const int stride = width + TX_PAD_HOR;
+  for (int r = 0; r < height + TX_PAD_VER; ++r) {
+    for (int c = 0; c < stride; ++c) {
+      ASSERT_EQ(levels_buf[0][c + r * stride], levels_buf[1][c + r * stride])
+          << "[" << r << "," << c << "] " << run_times << width << "x"
+          << height;
+    }
+  }
+}
+
+TEST_P(EncodeTxbInitLevelTest, match) {
+  RunTest(GET_PARAM(0), GET_PARAM(1), 0);
+}
+
+TEST_P(EncodeTxbInitLevelTest, DISABLED_Speed) {
+  RunTest(GET_PARAM(0), GET_PARAM(1), 1);
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, EncodeTxbInitLevelTest,
+    ::testing::Combine(::testing::Values(&av1_txb_init_levels_sse4_1),
+                       ::testing::Range(0, static_cast<int>(TX_SIZES_ALL),
+                                        1)));
+#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, EncodeTxbInitLevelTest,
+    ::testing::Combine(::testing::Values(&av1_txb_init_levels_avx2),
+                       ::testing::Range(0, static_cast<int>(TX_SIZES_ALL),
+                                        1)));
+#endif
+}  // namespace
diff --git a/libs/libaom/src/test/end_to_end_test.cc b/libs/libaom/src/test/end_to_end_test.cc
new file mode 100644
index 000000000..162a7c743
--- /dev/null
+++ b/libs/libaom/src/test/end_to_end_test.cc
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstring>
+#include <memory>
+#include <ostream>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+const unsigned int kWidth = 160;
+const unsigned int kHeight = 90;
+const unsigned int kFramerate = 50;
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+// List of psnr thresholds for speed settings 0-7 and 5 encoding modes
+const double kPsnrThreshold[][5] = {
+// Note:
+// AV1 HBD average PSNR is slightly lower than AV1.
+// We make two cases here to enable the testing and
+// guard picture quality.
+#if CONFIG_AV1_ENCODER
+  { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 31.0, 36.0, 36.0, 36.0, 36.0 },
+  { 31.0, 35.0, 35.0, 35.0, 35.0 }, { 31.0, 34.0, 34.0, 34.0, 34.0 },
+  { 31.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
+  { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
+#else
+  { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 },
+  { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 },
+  { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
+  { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
+#endif  // CONFIG_AV1_ENCODER
+};
+
+typedef struct {
+  const char *filename;
+  unsigned int input_bit_depth;
+  aom_img_fmt fmt;
+  aom_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " input_bit_depth:" << test_arg.input_bit_depth
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << " }";
+}
+
+const TestVideoParam kTestVectors[] = {
+  { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+  { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
+  { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+#if CONFIG_AV1_HIGHBITDEPTH
+  { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 },
+  { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 },
+  { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 },
+  { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 },
+#endif
+};
+
+// Encoding modes tested
+const libaom_test::TestMode kEncodingModeVectors[] = {
+  ::libaom_test::kTwoPassGood,
+  ::libaom_test::kOnePassGood,
+  ::libaom_test::kRealTime,
+};
+
+// Speed settings tested
+const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6 };
+
+int is_extension_y4m(const char *filename) {
+  const char *dot = strrchr(filename, '.');
+  if (!dot || dot == filename)
+    return 0;
+  else
+    return !strcmp(dot, ".y4m");
+}
+
+class EndToEndTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 TestVideoParam, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  EndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(2)),
+        cpu_used_(GET_PARAM(3)), psnr_(0.0), nframes_(0),
+        encoding_mode_(GET_PARAM(1)) {}
+
+  virtual ~EndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libaom_test::kRealTime) {
+      cfg_.g_lag_in_frames = 5;
+      cfg_.rc_end_usage = AOM_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = AOM_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      // Test the screen coding tools at cpu_used = 1 in two-pass good mode.
+      if (cpu_used_ == 1 && encoding_mode_ == ::libaom_test::kTwoPassGood)
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      else
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      }
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetPsnrThreshold() {
+    return kPsnrThreshold[cpu_used_][encoding_mode_];
+  }
+
+  void DoTest() {
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    std::unique_ptr<libaom_test::VideoSource> video;
+    if (is_extension_y4m(test_video_param_.filename)) {
+      video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename,
+                                                  0, kFrames));
+    } else {
+      video.reset(new libaom_test::YUVVideoSource(
+          test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
+          kFramerate, 1, 0, kFrames));
+    }
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold())
+        << "cpu used = " << cpu_used_
+        << ", encoding mode = " << encoding_mode_;
+  }
+
+  TestVideoParam test_video_param_;
+  int cpu_used_;
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  libaom_test::TestMode encoding_mode_;
+};
+
+class EndToEndTestLarge : public EndToEndTest {};
+
+TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); }
+
+TEST_P(EndToEndTest, EndtoEndPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(EndToEndTestLarge,
+                          ::testing::ValuesIn(kEncodingModeVectors),
+                          ::testing::ValuesIn(kTestVectors),
+                          ::testing::ValuesIn(kCpuUsedVectors));
+
+AV1_INSTANTIATE_TEST_CASE(EndToEndTest,
+                          ::testing::Values(kEncodingModeVectors[0]),
+                          ::testing::Values(kTestVectors[2]),  // 444
+                          ::testing::Values(kCpuUsedVectors[2]));
+}  // namespace
diff --git a/libs/libaom/src/test/error_block_test.cc b/libs/libaom/src/test/error_block_test.cc
new file mode 100644
index 000000000..462661e61
--- /dev/null
+++ b/libs/libaom/src/test/error_block_test.cc
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int kNumIterations = 1000;
+
+typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
+                                  const tran_low_t *dqcoeff,
+                                  intptr_t block_size, int64_t *ssz, int bps);
+
+typedef int64_t (*ErrorBlockFunc8Bits)(const tran_low_t *coeff,
+                                       const tran_low_t *dqcoeff,
+                                       intptr_t block_size, int64_t *ssz);
+
+typedef std::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>
+    ErrorBlockParam;
+
+template <ErrorBlockFunc8Bits fn>
+int64_t BlockError8BitWrapper(const tran_low_t *coeff,
+                              const tran_low_t *dqcoeff, intptr_t block_size,
+                              int64_t *ssz, int bps) {
+  EXPECT_EQ(bps, 8);
+  return fn(coeff, dqcoeff, block_size, ssz);
+}
+
+class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
+ public:
+  virtual ~ErrorBlockTest() {}
+  virtual void SetUp() {
+    error_block_op_ = GET_PARAM(0);
+    ref_error_block_op_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  aom_bit_depth_t bit_depth_;
+  ErrorBlockFunc error_block_op_;
+  ErrorBlockFunc ref_error_block_op_;
+};
+
+TEST_P(ErrorBlockTest, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+  int err_count_total = 0;
+  int first_failure = -1;
+  intptr_t block_size;
+  int64_t ssz;
+  int64_t ret;
+  int64_t ref_ssz;
+  int64_t ref_ret;
+  const int msb = bit_depth_ + 8 - 1;
+  for (int i = 0; i < kNumIterations; ++i) {
+    int err_count = 0;
+    block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
+    for (int j = 0; j < block_size; j++) {
+      // coeff and dqcoeff will always have at least the same sign, and this
+      // can be used for optimization, so generate test input precisely.
+      if (rnd(2)) {
+        // Positive number
+        coeff[j] = rnd(1 << msb);
+        dqcoeff[j] = rnd(1 << msb);
+      } else {
+        // Negative number
+        coeff[j] = -rnd(1 << msb);
+        dqcoeff[j] = -rnd(1 << msb);
+      }
+    }
+    ref_ret =
+        ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(
+        ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
+    err_count += (ref_ret != ret) | (ref_ssz != ssz);
+    if (err_count && !err_count_total) {
+      first_failure = i;
+    }
+    err_count_total += err_count;
+  }
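+  // Failures are aggregated into one check to keep the log short;
+  // first_failure pinpoints the earliest mismatching iteration.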
" + << "First failed at test case " << first_failure; +} + +TEST_P(ErrorBlockTest, ExtremeValues) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); + int err_count_total = 0; + int first_failure = -1; + intptr_t block_size; + int64_t ssz; + int64_t ret; + int64_t ref_ssz; + int64_t ref_ret; + const int msb = bit_depth_ + 8 - 1; + int max_val = ((1 << msb) - 1); + for (int i = 0; i < kNumIterations; ++i) { + int err_count = 0; + int k = (i / 9) % 9; + + // Change the maximum coeff value, to test different bit boundaries + if (k == 8 && (i % 9) == 0) { + max_val >>= 1; + } + block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64 + for (int j = 0; j < block_size; j++) { + if (k < 4) { + // Test at positive maximum values + coeff[j] = k % 2 ? max_val : 0; + dqcoeff[j] = (k >> 1) % 2 ? max_val : 0; + } else if (k < 8) { + // Test at negative maximum values + coeff[j] = k % 2 ? -max_val : 0; + dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0; + } else { + if (rnd(2)) { + // Positive number + coeff[j] = rnd(1 << 14); + dqcoeff[j] = rnd(1 << 14); + } else { + // Negative number + coeff[j] = -rnd(1 << 14); + dqcoeff[j] = -rnd(1 << 14); + } + } + } + ref_ret = + ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_); + ASM_REGISTER_STATE_CHECK( + ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_)); + err_count += (ref_ret != ret) | (ref_ssz != ssz); + if (err_count && !err_count_total) { + first_failure = i; + } + err_count_total += err_count; + } + EXPECT_EQ(0, err_count_total) + << "Error: Error Block Test, C output doesn't match optimized output. " + << "First failed at test case " << first_failure; +} + +TEST_P(ErrorBlockTest, DISABLED_Speed) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); + intptr_t block_size; + int64_t ssz; + int num_iters = 100000; + int64_t ref_ssz; + int k; + const int msb = bit_depth_ + 8 - 1; + for (int i = 0; i < 9; ++i) { + block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64 + for (k = 0; k < 9; k++) { + for (int j = 0; j < block_size; j++) { + if (k < 5) { + if (rnd(2)) { + // Positive number + coeff[j] = rnd(1 << msb); + dqcoeff[j] = rnd(1 << msb); + } else { + // Negative number + coeff[j] = -rnd(1 << msb); + dqcoeff[j] = -rnd(1 << msb); + } + } else { + if (rnd(2)) { + // Positive number + coeff[j] = rnd(1 << 14); + dqcoeff[j] = rnd(1 << 14); + } else { + // Negative number + coeff[j] = -rnd(1 << 14); + dqcoeff[j] = -rnd(1 << 14); + } + } + } + aom_usec_timer ref_timer, test_timer; + + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_iters; ++i) { + ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_iters; ++i) { + error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_); + } + aom_usec_timer_mark(&test_timer); + + const int elapsed_time_simd = + static_cast(aom_usec_timer_elapsed(&test_timer)); + + printf( + " c_time=%d \t simd_time=%d \t " + "gain=%d \n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } + } +} + +using std::make_tuple; + +#if (HAVE_SSE2) +const ErrorBlockParam kErrorBlockTestParamsSse2[] = { +#if CONFIG_AV1_HIGHBITDEPTH + 
+const ErrorBlockParam kErrorBlockTestParamsSse2[] = {
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
+             AOM_BITS_10),
+  make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
+             AOM_BITS_12),
+  make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c,
+             AOM_BITS_8),
+#endif
+  make_tuple(&BlockError8BitWrapper<av1_block_error_sse2>,
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, ErrorBlockTest,
+                         ::testing::ValuesIn(kErrorBlockTestParamsSse2));
+#endif  // HAVE_SSE2
+
+#if (HAVE_AVX2)
+const ErrorBlockParam kErrorBlockTestParamsAvx2[] = {
+#if CONFIG_AV1_HIGHBITDEPTH
+  make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
+             AOM_BITS_10),
+  make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
+             AOM_BITS_12),
+  make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c,
+             AOM_BITS_8),
+#endif
+  make_tuple(&BlockError8BitWrapper<av1_block_error_avx2>,
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, ErrorBlockTest,
+                         ::testing::ValuesIn(kErrorBlockTestParamsAvx2));
+#endif  // HAVE_AVX2
+
+#if (HAVE_MSA)
+INSTANTIATE_TEST_SUITE_P(
+    MSA, ErrorBlockTest,
+    ::testing::Values(make_tuple(&BlockError8BitWrapper<av1_block_error_msa>,
+                                 &BlockError8BitWrapper<av1_block_error_c>,
+                                 AOM_BITS_8)));
+#endif  // HAVE_MSA
+
+#if (HAVE_NEON)
+INSTANTIATE_TEST_SUITE_P(
+    NEON, ErrorBlockTest,
+    ::testing::Values(make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
+                                 &BlockError8BitWrapper<av1_block_error_c>,
+                                 AOM_BITS_8)));
+#endif  // HAVE_NEON
+}  // namespace
diff --git a/libs/libaom/src/test/error_resilience_test.cc b/libs/libaom/src/test/error_resilience_test.cc
new file mode 100644
index 000000000..1d52bb24a
--- /dev/null
+++ b/libs/libaom/src/test/error_resilience_test.cc
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kMaxErrorFrames = 12;
+const int kMaxInvisibleErrorFrames = 12;
+const int kMaxDroppableFrames = 12;
+const int kMaxErrorResilientFrames = 12;
+const int kMaxNoMFMVFrames = 12;
+const int kMaxPrimRefNoneFrames = 12;
+const int kMaxSFrames = 12;
+const int kCpuUsed = 1;
+
+class ErrorResilienceTestLarge
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ErrorResilienceTestLarge()
+      : EncoderTest(GET_PARAM(0)), psnr_(0.0), nframes_(0),
+        mismatch_psnr_(0.0), mismatch_nframes_(0),
+        encoding_mode_(GET_PARAM(1)), allow_mismatch_(0) {
+    Reset();
+  }
+
+  virtual ~ErrorResilienceTestLarge() {}
+
+  void Reset() {
+    error_nframes_ = 0;
+    invisible_error_nframes_ = 0;
+    droppable_nframes_ = 0;
+    error_resilient_nframes_ = 0;
+    nomfmv_nframes_ = 0;
+    prim_ref_none_nframes_ = 0;
+    s_nframes_ = 0;
+  }
+
+  void SetupEncoder(int bitrate, int lag) {
+    const aom_rational timebase = { 33333333, 1000000000 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_target_bitrate = bitrate;
+    cfg_.kf_mode = AOM_KF_DISABLED;
+    cfg_.g_lag_in_frames = lag;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+    decoded_nframes_ = 0;
+    mismatch_psnr_ = 0.0;
+    mismatch_nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                                  libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+    frame_flags_ &=
+        ~(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+          AOM_EFLAG_NO_REF_FRAME_MVS | AOM_EFLAG_ERROR_RESILIENT |
+          AOM_EFLAG_SET_S_FRAME | AOM_EFLAG_SET_PRIMARY_REF_NONE);
+    if (droppable_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < droppable_nframes_; ++i) {
+        if (droppable_frames_[i] == video->frame()) {
+          std::cout << "             Encoding droppable frame: "
+                    << droppable_frames_[i] << "\n";
+          frame_flags_ |= (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
+                           AOM_EFLAG_NO_UPD_ARF);
+          break;
+        }
+      }
+    }
+
+    if (error_resilient_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < error_resilient_nframes_; ++i) {
+        if (error_resilient_frames_[i] == video->frame()) {
+          std::cout << "             Encoding error_resilient frame: "
+                    << error_resilient_frames_[i] << "\n";
+          frame_flags_ |= AOM_EFLAG_ERROR_RESILIENT;
+          break;
+        }
+      }
+    }
+
+    if (nomfmv_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < nomfmv_nframes_; ++i) {
+        if (nomfmv_frames_[i] == video->frame()) {
+          std::cout << "             Encoding no mfmv frame: "
+                    << nomfmv_frames_[i] << "\n";
+          frame_flags_ |= AOM_EFLAG_NO_REF_FRAME_MVS;
+          break;
+        }
+      }
+    }
+
+    if (prim_ref_none_nframes_ > 0 &&
+        (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
+      for (unsigned int i = 0; i < prim_ref_none_nframes_; ++i) {
+        if (prim_ref_none_frames_[i] == video->frame()) {
+          std::cout << "             Encoding no PRIMARY_REF_NONE frame: "
+                    << prim_ref_none_frames_[i] << "\n";
+          frame_flags_ |= AOM_EFLAG_SET_PRIMARY_REF_NONE;
+          break;
+        }
+      }
+    }
+
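+    // With S_FRAME_MODE set to 0, S-frames should only be produced on the
+    // frames explicitly flagged below.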
"\n"; + frame_flags_ |= AOM_EFLAG_SET_PRIMARY_REF_NONE; + break; + } + } + } + + encoder->Control(AV1E_SET_S_FRAME_MODE, 0); + if (s_nframes_ > 0 && + (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) { + for (unsigned int i = 0; i < s_nframes_; ++i) { + if (s_frames_[i] == video->frame()) { + std::cout << " Encoding S frame: " << s_frames_[i] + << "\n"; + frame_flags_ |= AOM_EFLAG_SET_S_FRAME; + break; + } + } + } + } + + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + // Check that the encode frame flags are correctly reflected + // in the output frame flags. + const int encode_flags = pkt->data.frame.flags >> 16; + if ((encode_flags & (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF)) == + (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) { + ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_DROPPABLE, + static_cast(AOM_FRAME_IS_DROPPABLE)); + } + if (encode_flags & AOM_EFLAG_SET_S_FRAME) { + ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_SWITCH, + static_cast(AOM_FRAME_IS_SWITCH)); + } + if (encode_flags & AOM_EFLAG_ERROR_RESILIENT) { + ASSERT_EQ( + pkt->data.frame.flags & AOM_FRAME_IS_ERROR_RESILIENT, + static_cast(AOM_FRAME_IS_ERROR_RESILIENT)); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetAverageMismatchPsnr() const { + if (mismatch_nframes_) return mismatch_psnr_ / mismatch_nframes_; + return 0.0; + } + + virtual bool DoDecode() const { + if (error_nframes_ > 0 && + (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) { + for (unsigned int i = 0; i < error_nframes_; ++i) { + if (error_frames_[i] == nframes_ - 1) { + std::cout << " Skipping decoding frame: " + << error_frames_[i] << "\n"; + return 0; + } + } + } + return 1; + } + + virtual bool DoDecodeInvisible() const { + if (invisible_error_nframes_ > 0 && + (cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) { + for (unsigned int i = 0; i < invisible_error_nframes_; ++i) { + if (invisible_error_frames_[i] == nframes_ - 1) { + std::cout << " Skipping decoding all invisible frames in " + "frame pkt: " + << invisible_error_frames_[i] << "\n"; + return 0; + } + } + } + return 1; + } + + virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) { + if (allow_mismatch_) { + double mismatch_psnr = compute_psnr(img1, img2); + mismatch_psnr_ += mismatch_psnr; + ++mismatch_nframes_; + // std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n"; + } else { + ::libaom_test::EncoderTest::MismatchHook(img1, img2); + } + } + + virtual void DecompressedFrameHook(const aom_image_t &img, + aom_codec_pts_t pts) { + (void)img; + (void)pts; + ++decoded_nframes_; + } + + void SetErrorFrames(int num, unsigned int *list) { + if (num > kMaxErrorFrames) + num = kMaxErrorFrames; + else if (num < 0) + num = 0; + error_nframes_ = num; + for (unsigned int i = 0; i < error_nframes_; ++i) + error_frames_[i] = list[i]; + } + + void SetInvisibleErrorFrames(int num, unsigned int *list) { + if (num > kMaxInvisibleErrorFrames) + num = kMaxInvisibleErrorFrames; + else if (num < 0) + num = 0; + invisible_error_nframes_ = num; + for (unsigned int i = 0; i < invisible_error_nframes_; ++i) + invisible_error_frames_[i] = list[i]; + } + + void SetDroppableFrames(int num, unsigned int *list) { + if (num > kMaxDroppableFrames) + num = kMaxDroppableFrames; + else if (num < 0) + num = 0; + droppable_nframes_ = num; + for (unsigned int i = 0; i < droppable_nframes_; ++i) + 
+
+  void SetErrorResilientFrames(int num, unsigned int *list) {
+    if (num > kMaxErrorResilientFrames)
+      num = kMaxErrorResilientFrames;
+    else if (num < 0)
+      num = 0;
+    error_resilient_nframes_ = num;
+    for (unsigned int i = 0; i < error_resilient_nframes_; ++i)
+      error_resilient_frames_[i] = list[i];
+  }
+
+  void SetNoMFMVFrames(int num, unsigned int *list) {
+    if (num > kMaxNoMFMVFrames)
+      num = kMaxNoMFMVFrames;
+    else if (num < 0)
+      num = 0;
+    nomfmv_nframes_ = num;
+    for (unsigned int i = 0; i < nomfmv_nframes_; ++i)
+      nomfmv_frames_[i] = list[i];
+  }
+
+  void SetPrimaryRefNoneFrames(int num, unsigned int *list) {
+    if (num > kMaxPrimRefNoneFrames)
+      num = kMaxPrimRefNoneFrames;
+    else if (num < 0)
+      num = 0;
+    prim_ref_none_nframes_ = num;
+    for (unsigned int i = 0; i < prim_ref_none_nframes_; ++i)
+      prim_ref_none_frames_[i] = list[i];
+  }
+
+  void SetSFrames(int num, unsigned int *list) {
+    if (num > kMaxSFrames)
+      num = kMaxSFrames;
+    else if (num < 0)
+      num = 0;
+    s_nframes_ = num;
+    for (unsigned int i = 0; i < s_nframes_; ++i) s_frames_[i] = list[i];
+  }
+
+  unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+  unsigned int GetEncodedFrames() { return nframes_; }
+  unsigned int GetDecodedFrames() { return decoded_nframes_; }
+
+  void SetAllowMismatch(int allow) { allow_mismatch_ = allow; }
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  unsigned int decoded_nframes_;
+  unsigned int error_nframes_;
+  unsigned int invisible_error_nframes_;
+  unsigned int droppable_nframes_;
+  unsigned int error_resilient_nframes_;
+  unsigned int nomfmv_nframes_;
+  unsigned int prim_ref_none_nframes_;
+  unsigned int s_nframes_;
+  double mismatch_psnr_;
+  unsigned int mismatch_nframes_;
+  unsigned int error_frames_[kMaxErrorFrames];
+  unsigned int invisible_error_frames_[kMaxInvisibleErrorFrames];
+  unsigned int droppable_frames_[kMaxDroppableFrames];
+  unsigned int error_resilient_frames_[kMaxErrorResilientFrames];
+  unsigned int nomfmv_frames_[kMaxNoMFMVFrames];
+  unsigned int prim_ref_none_frames_[kMaxPrimRefNoneFrames];
+  unsigned int s_frames_[kMaxSFrames];
+  libaom_test::TestMode encoding_mode_;
+  int allow_mismatch_;
+};
+
+TEST_P(ErrorResilienceTestLarge, OnVersusOff) {
+  SetupEncoder(2000, 10);
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 12);
+
+  // Global error resilient mode OFF.
+  cfg_.g_error_resilient = 0;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_resilience_off = GetAveragePsnr();
+  EXPECT_GT(psnr_resilience_off, 25.0);
+
+  Reset();
+  // Error resilient mode ON for certain frames
+  unsigned int num_error_resilient_frames = 5;
+  unsigned int error_resilient_frame_list[] = { 3, 5, 6, 9, 11 };
+  SetErrorResilientFrames(num_error_resilient_frames,
+                          error_resilient_frame_list);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_resilience_on = GetAveragePsnr();
+  EXPECT_GT(psnr_resilience_on, 25.0);
+
+  // Test that turning on error resilient mode hurts by 10% at most.
+  if (psnr_resilience_off > 0.0) {
+    const double psnr_ratio = psnr_resilience_on / psnr_resilience_off;
+    EXPECT_GE(psnr_ratio, 0.9);
+    EXPECT_LE(psnr_ratio, 1.1);
+  }
+}
+
+// Check for successful decoding and no encoder/decoder mismatch
+// if we lose (i.e., drop before decoding) a set of droppable
+// frames (i.e., frames that don't update any reference buffers).
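+// Since droppable frames leave all reference buffers untouched, every frame
+// that is actually decoded should still reconstruct bit-exactly.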
+TEST_P(ErrorResilienceTestLarge, DropFramesWithoutRecovery) {
+  SetupEncoder(500, 10);
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 20);
+
+  // Set an arbitrary set of error frames same as droppable frames.
+  unsigned int num_droppable_frames = 3;
+  unsigned int droppable_frame_list[] = { 5, 11, 13 };
+  SetDroppableFrames(num_droppable_frames, droppable_frame_list);
+  SetErrorFrames(num_droppable_frames, droppable_frame_list);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Test that no mismatches have been found
+  std::cout << "             Encoded frames: " << GetEncodedFrames() << "\n";
+  std::cout << "             Decoded frames: " << GetDecodedFrames() << "\n";
+  std::cout << "             Mismatch frames: " << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_droppable_frames);
+}
+
+// Check for ParseAbility property of an error-resilient frame.
+// Encode a frame in error-resilient mode (E-frame), and disallow all
+// subsequent frames from using MFMV. If frames are dropped before the
+// E frame, all frames starting from the E frame should be parse-able.
+TEST_P(ErrorResilienceTestLarge, ParseAbilityTest) {
+  SetupEncoder(500, 10);
+
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 15);
+
+  SetAllowMismatch(1);
+
+  // Note that an E-frame cannot be forced on a frame that is a
+  // show_existing_frame, or a frame that comes directly after an invisible
+  // frame. Currently, this will cause an assertion failure.
+  // Set an arbitrary error resilient (E) frame
+  unsigned int num_error_resilient_frames = 1;
+  unsigned int error_resilient_frame_list[] = { 8 };
+  SetErrorResilientFrames(num_error_resilient_frames,
+                          error_resilient_frame_list);
+  // Ensure that any invisible frames before the E frame are dropped
+  SetInvisibleErrorFrames(num_error_resilient_frames,
+                          error_resilient_frame_list);
+  // Set all frames after the error resilient frame to not allow MFMV
+  unsigned int num_post_error_resilient_frames = 6;
+  unsigned int post_error_resilient_frame_list[] = { 9, 10, 11, 12, 13, 14 };
+  SetNoMFMVFrames(num_post_error_resilient_frames,
+                  post_error_resilient_frame_list);
+
+  // Set a few frames before the E frame that are lost (not decoded)
+  unsigned int num_error_frames = 5;
+  unsigned int error_frame_list[] = { 3, 4, 5, 6, 7 };
+  SetErrorFrames(num_error_frames, error_frame_list);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  std::cout << "             Encoded frames: " << GetEncodedFrames() << "\n";
+  std::cout << "             Decoded frames: " << GetDecodedFrames() << "\n";
+  std::cout << "             Mismatch frames: " << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_error_frames);
+  // All frames following the E-frame and the E-frame are expected to have
+  // mismatches, but still be parse-able.
+  EXPECT_LE(GetMismatchFrames(), num_post_error_resilient_frames + 1);
+}
+
+// Check for ParseAbility property of an S frame.
+// Encode an S-frame. If frames are dropped before the S-frame, all frames
+// starting from the S frame should be parse-able.
+TEST_P(ErrorResilienceTestLarge, SFrameTest) {
+  SetupEncoder(500, 10);
+
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                     0, 15);
+
+  SetAllowMismatch(1);
+
+  // Note that an S-frame cannot be forced on a frame that is a
+  // show_existing_frame. This issue still needs to be addressed.
+  // Set an arbitrary S-frame
+  unsigned int num_s_frames = 1;
+  unsigned int s_frame_list[] = { 6 };
+  SetSFrames(num_s_frames, s_frame_list);
+  // Ensure that any invisible frames before the S frame are dropped
+  SetInvisibleErrorFrames(num_s_frames, s_frame_list);
+
+  // Set a few frames before the S frame that are lost (not decoded)
+  unsigned int num_error_frames = 4;
+  unsigned int error_frame_list[] = { 2, 3, 4, 5 };
+  SetErrorFrames(num_error_frames, error_frame_list);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  std::cout << "             Encoded frames: " << GetEncodedFrames() << "\n";
+  std::cout << "             Decoded frames: " << GetDecodedFrames() << "\n";
+  std::cout << "             Mismatch frames: " << GetMismatchFrames() << "\n";
+  EXPECT_EQ(GetEncodedFrames() - GetDecodedFrames(), num_error_frames);
+  // All frames following the S-frame and the S-frame are expected to have
+  // mismatches, but still be parse-able.
+  EXPECT_LE(GetMismatchFrames(), GetEncodedFrames() - s_frame_list[0]);
+}
+
+AV1_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, NONREALTIME_TEST_MODES);
+}  // namespace
diff --git a/libs/libaom/src/test/ethread_test.cc b/libs/libaom/src/test/ethread_test.cc
new file mode 100644
index 000000000..306cc2f3a
--- /dev/null
+++ b/libs/libaom/src/test/ethread_test.cc
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+class AVxEncoderThreadTest
+    : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int,
+                                                 int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AVxEncoderThreadTest()
+      : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
+        encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)),
+        tile_cols_(GET_PARAM(3)), tile_rows_(GET_PARAM(4)),
+        row_mt_(GET_PARAM(5)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = 1280;
+    cfg.h = 720;
+    cfg.allow_lowbitdepth = 1;
+    decoder_ = codec_->CreateDecoder(cfg, 0);
+    if (decoder_->IsAV1()) {
+      decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+    }
+
+    size_enc_.clear();
+    md5_dec_.clear();
+    md5_enc_.clear();
+  }
+  virtual ~AVxEncoderThreadTest() { delete decoder_; }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+
+    if (encoding_mode_ != ::libaom_test::kRealTime) {
+      cfg_.g_lag_in_frames = 5;
+      cfg_.rc_end_usage = AOM_VBR;
+      cfg_.rc_2pass_vbr_minsection_pct = 5;
+      cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = AOM_CBR;
+      cfg_.g_error_resilient = 1;
+    }
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    encoder_initialized_ = false;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
+                                  ::libaom_test::Encoder *encoder) {
+    if (!encoder_initialized_) {
+      SetTileSize(encoder);
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+        encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
+      } else {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
+        encoder->Control(AV1E_SET_AQ_MODE, 3);
+      }
+      encoder_initialized_ = true;
+    }
+  }
+
+  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+    encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+    encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    size_enc_.push_back(pkt->data.frame.sz);
+
+    ::libaom_test::MD5 md5_enc;
+    md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+                pkt->data.frame.sz);
+    md5_enc_.push_back(md5_enc.Get());
+
+    const aom_codec_err_t res = decoder_->DecodeFrame(
+        reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+    const aom_image_t *img = decoder_->GetDxData().Next();
+
+    if (img) {
+      ::libaom_test::MD5 md5_res;
+      md5_res.Add(img);
+      md5_dec_.push_back(md5_res.Get());
+    }
+  }
+
+  void DoTest() {
+    ::libaom_test::YUVVideoSource video(
+        "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 15, 21);
+    cfg_.rc_target_bitrate = 1000;
+
+    if (row_mt_ == 0) {
+      // Encode using single thread.
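+      // This pass produces the reference output that the multi-threaded
+      // passes below must match bit-exactly in both size and MD5.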
+      cfg_.g_threads = 1;
+      init_flags_ = AOM_CODEC_USE_PSNR;
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      std::vector<size_t> single_thr_size_enc;
+      std::vector<std::string> single_thr_md5_enc;
+      std::vector<std::string> single_thr_md5_dec;
+      single_thr_size_enc = size_enc_;
+      single_thr_md5_enc = md5_enc_;
+      single_thr_md5_dec = md5_dec_;
+      size_enc_.clear();
+      md5_enc_.clear();
+      md5_dec_.clear();
+
+      // Encode using multiple threads.
+      cfg_.g_threads = 4;
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      std::vector<size_t> multi_thr_size_enc;
+      std::vector<std::string> multi_thr_md5_enc;
+      std::vector<std::string> multi_thr_md5_dec;
+      multi_thr_size_enc = size_enc_;
+      multi_thr_md5_enc = md5_enc_;
+      multi_thr_md5_dec = md5_dec_;
+      size_enc_.clear();
+      md5_enc_.clear();
+      md5_dec_.clear();
+
+      // Check that the vectors are equal.
+      ASSERT_EQ(single_thr_size_enc, multi_thr_size_enc);
+      ASSERT_EQ(single_thr_md5_enc, multi_thr_md5_enc);
+      ASSERT_EQ(single_thr_md5_dec, multi_thr_md5_dec);
+    } else if (row_mt_ == 1) {
+      // Encode using multiple threads row-mt enabled.
+      cfg_.g_threads = 2;
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      std::vector<size_t> multi_thr2_row_mt_size_enc;
+      std::vector<std::string> multi_thr2_row_mt_md5_enc;
+      std::vector<std::string> multi_thr2_row_mt_md5_dec;
+      multi_thr2_row_mt_size_enc = size_enc_;
+      multi_thr2_row_mt_md5_enc = md5_enc_;
+      multi_thr2_row_mt_md5_dec = md5_dec_;
+      size_enc_.clear();
+      md5_enc_.clear();
+      md5_dec_.clear();
+
+      // Disable threads=3 test for now to reduce the time so that the nightly
+      // test would not time out.
+      // cfg_.g_threads = 3;
+      // ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      // std::vector<size_t> multi_thr3_row_mt_size_enc;
+      // std::vector<std::string> multi_thr3_row_mt_md5_enc;
+      // std::vector<std::string> multi_thr3_row_mt_md5_dec;
+      // multi_thr3_row_mt_size_enc = size_enc_;
+      // multi_thr3_row_mt_md5_enc = md5_enc_;
+      // multi_thr3_row_mt_md5_dec = md5_dec_;
+      // size_enc_.clear();
+      // md5_enc_.clear();
+      // md5_dec_.clear();
+      // Check that the vectors are equal.
+      // ASSERT_EQ(multi_thr3_row_mt_size_enc, multi_thr2_row_mt_size_enc);
+      // ASSERT_EQ(multi_thr3_row_mt_md5_enc, multi_thr2_row_mt_md5_enc);
+      // ASSERT_EQ(multi_thr3_row_mt_md5_dec, multi_thr2_row_mt_md5_dec);
+
+      cfg_.g_threads = 4;
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      std::vector<size_t> multi_thr4_row_mt_size_enc;
+      std::vector<std::string> multi_thr4_row_mt_md5_enc;
+      std::vector<std::string> multi_thr4_row_mt_md5_dec;
+      multi_thr4_row_mt_size_enc = size_enc_;
+      multi_thr4_row_mt_md5_enc = md5_enc_;
+      multi_thr4_row_mt_md5_dec = md5_dec_;
+      size_enc_.clear();
+      md5_enc_.clear();
+      md5_dec_.clear();
+
+      // Check that the vectors are equal.
+      ASSERT_EQ(multi_thr4_row_mt_size_enc, multi_thr2_row_mt_size_enc);
+      ASSERT_EQ(multi_thr4_row_mt_md5_enc, multi_thr2_row_mt_md5_enc);
+      ASSERT_EQ(multi_thr4_row_mt_md5_dec, multi_thr2_row_mt_md5_dec);
+    }
+  }
+
+  bool encoder_initialized_;
+  ::libaom_test::TestMode encoding_mode_;
+  int set_cpu_used_;
+  int tile_cols_;
+  int tile_rows_;
+  int row_mt_;
+  ::libaom_test::Decoder *decoder_;
+  std::vector<size_t> size_enc_;
+  std::vector<std::string> md5_enc_;
+  std::vector<std::string> md5_dec_;
+};
+
+TEST_P(AVxEncoderThreadTest, EncoderResultTest) {
+  cfg_.large_scale_tile = 0;
+  decoder_->Control(AV1_SET_TILE_MODE, 0);
+  DoTest();
+}
+
+class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {};
+
+TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) {
+  cfg_.large_scale_tile = 0;
+  decoder_->Control(AV1_SET_TILE_MODE, 0);
+  DoTest();
+}
+
+// For AV1, only test speed 0 to 3.
+// Here test cpu_used 2 and 3
+AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTest,
+                          ::testing::Values(::libaom_test::kTwoPassGood),
+                          ::testing::Range(2, 4), ::testing::Values(0, 2),
+                          ::testing::Values(0, 1), ::testing::Values(0, 1));
+
+// Test cpu_used 0 and 1.
+AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTestLarge,
+                          ::testing::Values(::libaom_test::kTwoPassGood,
+                                            ::libaom_test::kOnePassGood),
+                          ::testing::Range(0, 2),
+                          ::testing::Values(0, 1, 2, 6),
+                          ::testing::Values(0, 1, 2, 6),
+                          ::testing::Values(0, 1));
+
+class AVxEncoderThreadLSTest : public AVxEncoderThreadTest {
+  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+    encoder->Control(AV1E_SET_TILE_COLUMNS, tile_cols_);
+    encoder->Control(AV1E_SET_TILE_ROWS, tile_rows_);
+  }
+};
+
+TEST_P(AVxEncoderThreadLSTest, EncoderResultTest) {
+  cfg_.large_scale_tile = 1;
+  decoder_->Control(AV1_SET_TILE_MODE, 1);
+  decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
+  DoTest();
+}
+
+class AVxEncoderThreadLSTestLarge : public AVxEncoderThreadLSTest {};
+
+TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) {
+  cfg_.large_scale_tile = 1;
+  decoder_->Control(AV1_SET_TILE_MODE, 1);
+  decoder_->Control(AV1D_EXT_TILE_DEBUG, 1);
+  DoTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadLSTestLarge,
+                          ::testing::Values(::libaom_test::kTwoPassGood,
+                                            ::libaom_test::kOnePassGood),
+                          ::testing::Range(0, 4), ::testing::Values(0, 6),
+                          ::testing::Values(0, 6), ::testing::Values(0, 1));
+}  // namespace
diff --git a/libs/libaom/src/test/examples.sh b/libs/libaom/src/test/examples.sh
new file mode 100644
index 000000000..2cdb89dd0
--- /dev/null
+++ b/libs/libaom/src/test/examples.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+## This file runs all of the tests for the libaom examples.
+##
+. $(dirname $0)/tools_common.sh
+
+example_tests=$(ls -r $(dirname $0)/*.sh)
+
+# List of script names to exclude.
+exclude_list="best_encode examples run_encodes tools_common"
+
+# Filter out the scripts in $exclude_list.
+for word in ${exclude_list}; do
+  example_tests=$(filter_strings "${example_tests}" "${word}" exclude)
+done
+
+for test in ${example_tests}; do
+  # Source each test script so that exporting variables can be avoided.
+  AOM_TEST_NAME="$(basename ${test%.*})"
+  . "${test}"
+done
diff --git a/libs/libaom/src/test/external_frame_buffer_test.cc b/libs/libaom/src/test/external_frame_buffer_test.cc
new file mode 100644
index 000000000..1d726a4f1
--- /dev/null
+++ b/libs/libaom/src/test/external_frame_buffer_test.cc
@@ -0,0 +1,540 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <memory>
+#include <string>
+#include "common/tools_common.h"
+#include "config/aom_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+const int kVideoNameParam = 1;
+
+struct ExternalFrameBuffer {
+  uint8_t *data;
+  size_t size;
+  int in_use;
+};
+
+// Class to manipulate a list of external frame buffers.
+class ExternalFrameBufferList {
+ public:
+  ExternalFrameBufferList()
+      : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(NULL) {}
+
+  virtual ~ExternalFrameBufferList() {
+    for (int i = 0; i < num_buffers_; ++i) {
+      delete[] ext_fb_list_[i].data;
+    }
+    delete[] ext_fb_list_;
+  }
+
+  // Creates the list to hold the external buffers. Returns true on success.
+  bool CreateBufferList(int num_buffers) {
+    if (num_buffers < 0) return false;
+
+    num_buffers_ = num_buffers;
+    ext_fb_list_ = new ExternalFrameBuffer[num_buffers_];
+    EXPECT_TRUE(ext_fb_list_ != NULL);
+    memset(ext_fb_list_, 0, sizeof(ext_fb_list_[0]) * num_buffers_);
+    return true;
+  }
+
+  // Searches the frame buffer list for a free frame buffer. Makes sure
+  // that the frame buffer is at least |min_size| in bytes. Marks that the
+  // frame buffer is in use by libaom. Finally sets |fb| to point to the
+  // external frame buffer. Returns < 0 on an error.
+  int GetFreeFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
+    EXPECT_TRUE(fb != NULL);
+    const int idx = FindFreeBufferIndex();
+    if (idx == num_buffers_) return -1;
+
+    if (ext_fb_list_[idx].size < min_size) {
+      delete[] ext_fb_list_[idx].data;
+      ext_fb_list_[idx].data = new uint8_t[min_size];
+      memset(ext_fb_list_[idx].data, 0, min_size);
+      ext_fb_list_[idx].size = min_size;
+    }
+
+    SetFrameBuffer(idx, fb);
+
+    num_used_buffers_++;
+    return 0;
+  }
+
+  // Test function that will not allocate any data for the frame buffer.
+  // Returns < 0 on an error.
+  int GetZeroFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
+    EXPECT_TRUE(fb != NULL);
+    const int idx = FindFreeBufferIndex();
+    if (idx == num_buffers_) return -1;
+
+    if (ext_fb_list_[idx].size < min_size) {
+      delete[] ext_fb_list_[idx].data;
+      ext_fb_list_[idx].data = NULL;
+      ext_fb_list_[idx].size = min_size;
+    }
+
+    SetFrameBuffer(idx, fb);
+    return 0;
+  }
+
+  // Marks the external frame buffer that |fb| is pointing to as free.
+  // Returns < 0 on an error.
+  int ReturnFrameBuffer(aom_codec_frame_buffer_t *fb) {
+    if (fb == NULL) {
+      EXPECT_TRUE(fb != NULL);
+      return -1;
+    }
+    ExternalFrameBuffer *const ext_fb =
+        reinterpret_cast<ExternalFrameBuffer *>(fb->priv);
+    if (ext_fb == NULL) {
+      EXPECT_TRUE(ext_fb != NULL);
+      return -1;
+    }
+    EXPECT_EQ(1, ext_fb->in_use);
+    ext_fb->in_use = 0;
+    num_used_buffers_--;
+    return 0;
+  }
+
+  // Checks that the aom_image_t data is contained within the external frame
+  // buffer private data passed back in the aom_image_t.
+  void CheckImageFrameBuffer(const aom_image_t *img) {
+    const struct ExternalFrameBuffer *const ext_fb =
+        reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
+
+    ASSERT_TRUE(img->planes[0] >= ext_fb->data &&
+                img->planes[0] < (ext_fb->data + ext_fb->size));
+  }
+
+  int num_used_buffers() const { return num_used_buffers_; }
+
+ private:
+  // Returns the index of the first free frame buffer. Returns |num_buffers_|
+  // if there are no free frame buffers.
+  int FindFreeBufferIndex() {
+    int i;
+    // Find a free frame buffer.
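+    // A linear scan is fine here; the buffer lists used by these tests are
+    // small.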
+    for (i = 0; i < num_buffers_; ++i) {
+      if (!ext_fb_list_[i].in_use) break;
+    }
+    return i;
+  }
+
+  // Sets |fb| to an external frame buffer. idx is the index into the frame
+  // buffer list.
+  void SetFrameBuffer(int idx, aom_codec_frame_buffer_t *fb) {
+    ASSERT_TRUE(fb != NULL);
+    fb->data = ext_fb_list_[idx].data;
+    fb->size = ext_fb_list_[idx].size;
+    ASSERT_EQ(0, ext_fb_list_[idx].in_use);
+    ext_fb_list_[idx].in_use = 1;
+    fb->priv = &ext_fb_list_[idx];
+  }
+
+  int num_buffers_;
+  int num_used_buffers_;
+  ExternalFrameBuffer *ext_fb_list_;
+};
+
+#if CONFIG_WEBM_IO
+
+// Callback used by libaom to request the application to return a frame
+// buffer of at least |min_size| in bytes.
+int get_aom_frame_buffer(void *user_priv, size_t min_size,
+                         aom_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+  return fb_list->GetFreeFrameBuffer(min_size, fb);
+}
+
+// Callback used by libaom to tell the application that |fb| is not needed
+// anymore.
+int release_aom_frame_buffer(void *user_priv, aom_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+  return fb_list->ReturnFrameBuffer(fb);
+}
+
+// Callback will not allocate data for frame buffer.
+int get_aom_zero_frame_buffer(void *user_priv, size_t min_size,
+                              aom_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+  return fb_list->GetZeroFrameBuffer(min_size, fb);
+}
+
+// Callback will allocate one less byte than |min_size|.
+int get_aom_one_less_byte_frame_buffer(void *user_priv, size_t min_size,
+                                       aom_codec_frame_buffer_t *fb) {
+  ExternalFrameBufferList *const fb_list =
+      reinterpret_cast<ExternalFrameBufferList *>(user_priv);
+  return fb_list->GetFreeFrameBuffer(min_size - 1, fb);
+}
+
+// Callback will not release the external frame buffer.
+int do_not_release_aom_frame_buffer(void *user_priv,
+                                    aom_codec_frame_buffer_t *fb) {
+  (void)user_priv;
+  (void)fb;
+  return 0;
+}
+
+#endif  // CONFIG_WEBM_IO
+
+// Class for testing passing in external frame buffers to libaom.
+class ExternalFrameBufferMD5Test
+    : public ::libaom_test::DecoderTest,
+      public ::libaom_test::CodecTestWithParam<const char *> {
+ protected:
+  ExternalFrameBufferMD5Test()
+      : DecoderTest(GET_PARAM(::libaom_test::kCodecFactoryParam)),
+        md5_file_(NULL), num_buffers_(0) {}
+
+  virtual ~ExternalFrameBufferMD5Test() {
+    if (md5_file_ != NULL) fclose(md5_file_);
+  }
+
+  virtual void PreDecodeFrameHook(
+      const libaom_test::CompressedVideoSource &video,
+      libaom_test::Decoder *decoder) {
+    if (num_buffers_ > 0 && video.frame_number() == 0) {
+      // Have libaom use frame buffers we create.
+      ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+      ASSERT_EQ(AOM_CODEC_OK,
+                decoder->SetFrameBufferFunctions(GetAV1FrameBuffer,
+                                                 ReleaseAV1FrameBuffer, this));
+    }
+  }
+
+  void OpenMD5File(const std::string &md5_file_name_) {
+    md5_file_ = libaom_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_TRUE(md5_file_ != NULL)
+        << "Md5 file open failed. Filename: " << md5_file_name_;
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     const unsigned int frame_number) {
+    ASSERT_TRUE(md5_file_ != NULL);
+    char expected_md5[33];
+    char junk[128];
+
+    // Read correct md5 checksums.
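+    // Lines are in md5sum format: the checksum first, then a second token
+    // (the file name), which is read into |junk| and ignored.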
+    const int res = fscanf(md5_file_, "%s %s", expected_md5, junk);
+    ASSERT_NE(EOF, res) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    ::libaom_test::MD5 md5_res;
+#if FORCE_HIGHBITDEPTH_DECODING
+    const aom_img_fmt_t shifted_fmt =
+        (aom_img_fmt)(img.fmt & ~AOM_IMG_FMT_HIGHBITDEPTH);
+    if (img.bit_depth == 8 && shifted_fmt != img.fmt) {
+      aom_image_t *img_shifted =
+          aom_img_alloc(NULL, shifted_fmt, img.d_w, img.d_h, 16);
+      img_shifted->bit_depth = img.bit_depth;
+      img_shifted->monochrome = img.monochrome;
+      aom_img_downshift(img_shifted, &img, 0);
+      md5_res.Add(img_shifted);
+      aom_img_free(img_shifted);
+    } else {
+#endif
+      md5_res.Add(&img);
+#if FORCE_HIGHBITDEPTH_DECODING
+    }
+#endif
+    const char *const actual_md5 = md5_res.Get();
+
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5)
+        << "Md5 checksums don't match: frame number = " << frame_number;
+
+    const struct ExternalFrameBuffer *const ext_fb =
+        reinterpret_cast<ExternalFrameBuffer *>(img.fb_priv);
+
+    ASSERT_TRUE(img.planes[0] >= ext_fb->data &&
+                img.planes[0] < (ext_fb->data + ext_fb->size));
+  }
+
+  // Callback to get a free external frame buffer. Return value < 0 is an
+  // error.
+  static int GetAV1FrameBuffer(void *user_priv, size_t min_size,
+                               aom_codec_frame_buffer_t *fb) {
+    ExternalFrameBufferMD5Test *const md5Test =
+        reinterpret_cast<ExternalFrameBufferMD5Test *>(user_priv);
+    return md5Test->fb_list_.GetFreeFrameBuffer(min_size, fb);
+  }
+
+  // Callback to release an external frame buffer. Return value < 0 is an
+  // error.
+  static int ReleaseAV1FrameBuffer(void *user_priv,
+                                   aom_codec_frame_buffer_t *fb) {
+    ExternalFrameBufferMD5Test *const md5Test =
+        reinterpret_cast<ExternalFrameBufferMD5Test *>(user_priv);
+    return md5Test->fb_list_.ReturnFrameBuffer(fb);
+  }
+
+  void set_num_buffers(int num_buffers) { num_buffers_ = num_buffers; }
+  int num_buffers() const { return num_buffers_; }
+
+ private:
+  FILE *md5_file_;
+  int num_buffers_;
+  ExternalFrameBufferList fb_list_;
+};
+
+#if CONFIG_WEBM_IO
+const char kAV1TestFile[] = "av1-1-b8-03-sizeup.mkv";
+const char kAV1NonRefTestFile[] = "av1-1-b8-01-size-226x226.ivf";
+
+// Class for testing passing in external frame buffers to libaom.
+class ExternalFrameBufferTest : public ::testing::Test {
+ protected:
+  ExternalFrameBufferTest() : video_(NULL), decoder_(NULL), num_buffers_(0) {}
+
+  virtual void SetUp() {
+    video_ = new libaom_test::WebMVideoSource(kAV1TestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+    decoder_ = new libaom_test::AV1Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+  }
+
+  virtual void TearDown() {
+    delete decoder_;
+    decoder_ = NULL;
+    delete video_;
+    video_ = NULL;
+  }
+
+  // Passes the external frame buffer information to libaom.
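+  // fb_list_ is handed to the decoder as the user_priv pointer that is
+  // passed back on every get/release callback.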
+  aom_codec_err_t SetFrameBufferFunctions(
+      int num_buffers, aom_get_frame_buffer_cb_fn_t cb_get,
+      aom_release_frame_buffer_cb_fn_t cb_release) {
+    if (num_buffers > 0) {
+      num_buffers_ = num_buffers;
+      EXPECT_TRUE(fb_list_.CreateBufferList(num_buffers_));
+    }
+
+    return decoder_->SetFrameBufferFunctions(cb_get, cb_release, &fb_list_);
+  }
+
+  aom_codec_err_t DecodeOneFrame() {
+    const aom_codec_err_t res =
+        decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+    CheckDecodedFrames();
+    if (res == AOM_CODEC_OK) video_->Next();
+    return res;
+  }
+
+  aom_codec_err_t DecodeRemainingFrames() {
+    for (; video_->cxdata() != NULL; video_->Next()) {
+      const aom_codec_err_t res =
+          decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+      if (res != AOM_CODEC_OK) return res;
+      CheckDecodedFrames();
+    }
+    return AOM_CODEC_OK;
+  }
+
+ protected:
+  void CheckDecodedFrames() {
+    libaom_test::DxDataIterator dec_iter = decoder_->GetDxData();
+    const aom_image_t *img = NULL;
+
+    // Get decompressed data
+    while ((img = dec_iter.Next()) != NULL) {
+      fb_list_.CheckImageFrameBuffer(img);
+    }
+  }
+
+  libaom_test::CompressedVideoSource *video_;
+  libaom_test::AV1Decoder *decoder_;
+  int num_buffers_;
+  ExternalFrameBufferList fb_list_;
+};
+
+class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
+ protected:
+  virtual void SetUp() {
+    video_ = new libaom_test::IVFVideoSource(kAV1NonRefTestFile);
+    ASSERT_TRUE(video_ != NULL);
+    video_->Init();
+    video_->Begin();
+
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+    decoder_ = new libaom_test::AV1Decoder(cfg, 0);
+    ASSERT_TRUE(decoder_ != NULL);
+  }
+
+  virtual void CheckFrameBufferRelease() {
+    TearDown();
+    ASSERT_EQ(0, fb_list_.num_used_buffers());
+  }
+};
+#endif  // CONFIG_WEBM_IO
+
+// This test runs through the set of test vectors, and decodes them.
+// Libaom will call into the application to allocate a frame buffer when
+// needed. The md5 checksums are computed for each frame in the video file.
+// If the md5 checksums match the correct md5 data, the test passes;
+// otherwise it fails.
+TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
+  const std::string filename = GET_PARAM(kVideoNameParam);
+  aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+
+  // Number of buffers equals #AOM_MAXIMUM_REF_BUFFERS +
+  // #AOM_MAXIMUM_WORK_BUFFERS + four jitter buffers.
+  const int jitter_buffers = 4;
+  const int num_buffers =
+      AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS + jitter_buffers;
+  set_num_buffers(num_buffers);
+
+  // Open compressed video file.
+  std::unique_ptr<libaom_test::CompressedVideoSource> video;
+  if (filename.substr(filename.length() - 3, 3) == "ivf") {
+    video.reset(new libaom_test::IVFVideoSource(filename));
+  } else {
+#if CONFIG_WEBM_IO
+    video.reset(new libaom_test::WebMVideoSource(filename));
+#else
+    fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+            filename.c_str());
+    return;
+#endif
+  }
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+
+  // Construct md5 file name.
+  const std::string md5_filename = filename + ".md5";
+  OpenMD5File(md5_filename);
+
+  // Set decode config.
+  cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+  set_cfg(cfg);
+
+  // Decode frame, and check the md5 matching.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
+}
+
+#if CONFIG_WEBM_IO
+TEST_F(ExternalFrameBufferTest, MinFrameBuffers) {
+  // Minimum number of external frame buffers for AV1 is
+  // #AOM_MAXIMUM_REF_BUFFERS + #AOM_MAXIMUM_WORK_BUFFERS.
+ const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, + release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames()); +} + +TEST_F(ExternalFrameBufferTest, EightJitterBuffers) { + // Number of buffers equals #AOM_MAXIMUM_REF_BUFFERS + + // #AOM_MAXIMUM_WORK_BUFFERS + eight jitter buffers. + const int jitter_buffers = 8; + const int num_buffers = + AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS + jitter_buffers; + ASSERT_EQ(AOM_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, + release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames()); +} + +TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) { + // Minimum number of external frame buffers for AV1 is + // #AOM_MAXIMUM_REF_BUFFERS + #AOM_MAXIMUM_WORK_BUFFERS. Most files will + // only use 5 frame buffers at one time. + const int num_buffers = 2; + ASSERT_EQ(AOM_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, + release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame()); + // Only run this on long clips. Decoding a very short clip will return + // AOM_CODEC_OK even with only 2 buffers. + ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames()); +} + +TEST_F(ExternalFrameBufferTest, NoRelease) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, + do_not_release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame()); + ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames()); +} + +TEST_F(ExternalFrameBufferTest, NullRealloc) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_aom_zero_frame_buffer, + release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeOneFrame()); +} + +TEST_F(ExternalFrameBufferTest, ReallocOneLessByte) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_OK, SetFrameBufferFunctions( + num_buffers, get_aom_one_less_byte_frame_buffer, + release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeOneFrame()); +} + +TEST_F(ExternalFrameBufferTest, NullGetFunction) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ( + AOM_CODEC_INVALID_PARAM, + SetFrameBufferFunctions(num_buffers, NULL, release_aom_frame_buffer)); +} + +TEST_F(ExternalFrameBufferTest, NullReleaseFunction) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_INVALID_PARAM, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, NULL)); +} + +TEST_F(ExternalFrameBufferTest, SetAfterDecode) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame()); + ASSERT_EQ(AOM_CODEC_ERROR, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, + release_aom_frame_buffer)); +} + +TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { + const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(AOM_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer, + release_aom_frame_buffer)); + ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames()); + CheckFrameBufferRelease(); +} +#endif // CONFIG_WEBM_IO + +AV1_INSTANTIATE_TEST_CASE( + ExternalFrameBufferMD5Test, + 
::testing::ValuesIn(libaom_test::kAV1TestVectors,
+                        libaom_test::kAV1TestVectors +
+                            libaom_test::kNumAV1TestVectors));
+}  // namespace
diff --git a/libs/libaom/src/test/fdct4x4_test.cc b/libs/libaom/src/test/fdct4x4_test.cc
new file mode 100644
index 000000000..6600f2c46
--- /dev/null
+++ b/libs/libaom/src/test/fdct4x4_test.cc
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+template <typename OutputType>
+using FdctFunc = void (*)(const int16_t *in, OutputType *out, int stride);
+
+template <typename OutputType>
+using FhtFunc = void (*)(const int16_t *in, OutputType *out, int stride,
+                         TxfmParam *txfm_param);
+
+template <typename OutputType>
+using Fdct4x4Param = std::tuple<FdctFunc<OutputType>, FhtFunc<OutputType>,
+                                aom_bit_depth_t, int>;
+
+#if HAVE_NEON || HAVE_SSE2
+void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                 TxfmParam * /*txfm_param*/) {
+  aom_fdct4x4_c(in, out, stride);
+}
+
+void fdct4x4_lp_ref(const int16_t *in, int16_t *out, int stride,
+                    TxfmParam * /*txfm_param*/) {
+  aom_fdct4x4_lp_c(in, out, stride);
+}
+#endif
+
+template <typename OutputType>
+class Trans4x4FDCT : public libaom_test::TransformTestBase<OutputType>,
+                     public ::testing::TestWithParam<Fdct4x4Param<OutputType>> {
+ public:
+  virtual ~Trans4x4FDCT() {}
+
+  using TxfmBaseOutType = libaom_test::TransformTestBase<OutputType>;
+  virtual void SetUp() {
+    fwd_txfm_ = std::get<0>(this->GetParam());
+    TxfmBaseOutType::pitch_ = 4;
+    TxfmBaseOutType::height_ = 4;
+    TxfmBaseOutType::fwd_txfm_ref = std::get<1>(this->GetParam());
+    TxfmBaseOutType::bit_depth_ = std::get<2>(this->GetParam());
+    TxfmBaseOutType::mask_ = (1 << TxfmBaseOutType::bit_depth_) - 1;
+    TxfmBaseOutType::num_coeffs_ = std::get<3>(this->GetParam());
+  }
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, OutputType *out, int stride) {
+    fwd_txfm_(in, out, stride);
+  }
+
+  void RunInvTxfm(const OutputType *out, uint8_t *dst, int stride) {
+    (void)out;
+    (void)dst;
+    (void)stride;
+  }
+
+  FdctFunc<OutputType> fwd_txfm_;
+};
+
+using Trans4x4FDCTTranLow = Trans4x4FDCT<tran_low_t>;
+TEST_P(Trans4x4FDCTTranLow, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(Trans4x4FDCTTranLow, MemCheck) { RunMemCheck(); }
+
+using Trans4x4FDCTInt16 = Trans4x4FDCT<int16_t>;
+TEST_P(Trans4x4FDCTInt16, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(Trans4x4FDCTInt16, MemCheck) { RunMemCheck(); }
+
+using std::make_tuple;
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(NEON, Trans4x4FDCTTranLow,
+                         ::testing::Values(make_tuple(&aom_fdct4x4_neon,
+                                                      &fdct4x4_ref, AOM_BITS_8,
+                                                      16)));
+
+INSTANTIATE_TEST_SUITE_P(NEON, Trans4x4FDCTInt16,
+                         ::testing::Values(make_tuple(&aom_fdct4x4_lp_neon,
+                                                      &fdct4x4_lp_ref,
+                                                      AOM_BITS_8, 16)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(SSE2, Trans4x4FDCTTranLow,
+                         ::testing::Values(make_tuple(&aom_fdct4x4_sse2,
+                                                      &fdct4x4_ref, AOM_BITS_8,
+                                                      16)));
+
+INSTANTIATE_TEST_SUITE_P(SSE2, Trans4x4FDCTInt16,
+                         ::testing::Values(make_tuple(&aom_fdct4x4_lp_sse2,
+                                                      &fdct4x4_lp_ref,
+                                                      AOM_BITS_8, 16)));
+#endif
+}  // namespace
diff --git a/libs/libaom/src/test/fft_test.cc b/libs/libaom/src/test/fft_test.cc
new file mode 100644
index 000000000..d23aa012c
--- /dev/null
+++ b/libs/libaom/src/test/fft_test.cc
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <algorithm>
+#include <complex>
+#include <ostream>
+#include <vector>
+
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+typedef void (*tform_fun_t)(const float *input, float *temp, float *output);
+
+// Simple 1D FFT implementation
+template <typename InputType>
+void fft(const InputType *data, std::complex<float> *result, int n) {
+  if (n == 1) {
+    result[0] = data[0];
+    return;
+  }
+  std::vector<InputType> temp(n);
+  for (int k = 0; k < n / 2; ++k) {
+    temp[k] = data[2 * k];
+    temp[n / 2 + k] = data[2 * k + 1];
+  }
+  fft(&temp[0], result, n / 2);
+  fft(&temp[n / 2], result + n / 2, n / 2);
+  for (int k = 0; k < n / 2; ++k) {
+    std::complex<float> w = std::complex<float>((float)cos(2. * PI * k / n),
+                                                (float)-sin(2. * PI * k / n));
+    std::complex<float> a = result[k];
+    std::complex<float> b = result[n / 2 + k];
+    result[k] = a + w * b;
+    result[n / 2 + k] = a - w * b;
+  }
+}
+
+void transpose(std::vector<std::complex<float> > *data, int n) {
+  for (int y = 0; y < n; ++y) {
+    for (int x = y + 1; x < n; ++x) {
+      std::swap((*data)[y * n + x], (*data)[x * n + y]);
+    }
+  }
+}
+
+// Simple 2D FFT implementation
+template <typename InputType>
+std::vector<std::complex<float> > fft2d(const InputType *input, int n) {
+  std::vector<std::complex<float> > rowfft(n * n);
+  std::vector<std::complex<float> > result(n * n);
+  for (int y = 0; y < n; ++y) {
+    fft(input + y * n, &rowfft[y * n], n);
+  }
+  transpose(&rowfft, n);
+  for (int y = 0; y < n; ++y) {
+    fft(&rowfft[y * n], &result[y * n], n);
+  }
+  transpose(&result, n);
+  return result;
+}
+
+struct FFTTestArg {
+  int n;
+  void (*fft)(const float *input, float *temp, float *output);
+  FFTTestArg(int n_in, tform_fun_t fft_in) : n(n_in), fft(fft_in) {}
+};
+
+std::ostream &operator<<(std::ostream &os, const FFTTestArg &test_arg) {
+  return os << "fft_arg { n:" << test_arg.n << " fft:" << test_arg.fft << " }";
+}
+
+class FFT2DTest : public ::testing::TestWithParam<FFTTestArg> {
+ protected:
+  void SetUp() {
+    int n = GetParam().n;
+    input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n);
+    temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n);
+    output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n * 2);
+    memset(input_, 0, sizeof(*input_) * n * n);
+    memset(temp_, 0, sizeof(*temp_) * n * n);
+    memset(output_, 0, sizeof(*output_) * n * n * 2);
+  }
+  void TearDown() {
+    aom_free(input_);
+    aom_free(temp_);
+    aom_free(output_);
+  }
+  float *input_;
+  float *temp_;
+  float *output_;
+};
+
+TEST_P(FFT2DTest, Correct) {
+  int n = GetParam().n;
+  for (int i = 0; i < n * n; ++i) {
+    input_[i] = 1;
+    std::vector<std::complex<float> > expected = fft2d(&input_[0], n);
+    GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+    for (int y = 0; y < n; ++y) {
+      for (int x = 0; x < (n / 2) + 1; ++x) {
+        EXPECT_NEAR(expected[y * n + x].real(), output_[2 * (y * n + x)], 1e-5);
+        EXPECT_NEAR(expected[y * n + x].imag(), output_[2 * (y * n + x) + 1],
+                    1e-5);
+      }
+    }
+    input_[i] = 0;
+  }
+}
+
+TEST_P(FFT2DTest, Benchmark) {
+  int n = GetParam().n;
+  float sum = 0;
+  for (int i = 0; i < 1000 * (64 - n); ++i) {
+    input_[i % (n * n)] = 1;
+    GetParam().fft(&input_[0], &temp_[0], &output_[0]);
+    sum += output_[0];
+    input_[i % (n * n)] = 0;
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(C, FFT2DTest,
+                         ::testing::Values(FFTTestArg(2, aom_fft2x2_float_c),
+                                           FFTTestArg(4, aom_fft4x4_float_c),
+                                           FFTTestArg(8, aom_fft8x8_float_c),
+                                           FFTTestArg(16, aom_fft16x16_float_c),
+                                           FFTTestArg(32,
+                                                      aom_fft32x32_float_c)));
+#if ARCH_X86 || ARCH_X86_64
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, FFT2DTest,
+    ::testing::Values(FFTTestArg(4, aom_fft4x4_float_sse2),
+                      FFTTestArg(8, aom_fft8x8_float_sse2),
+                      FFTTestArg(16, aom_fft16x16_float_sse2),
+                      FFTTestArg(32, aom_fft32x32_float_sse2)));
+#endif  // HAVE_SSE2
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, FFT2DTest,
+    ::testing::Values(FFTTestArg(8, aom_fft8x8_float_avx2),
+                      FFTTestArg(16, aom_fft16x16_float_avx2),
+                      FFTTestArg(32, aom_fft32x32_float_avx2)));
+#endif  // HAVE_AVX2
+#endif  // ARCH_X86 || ARCH_X86_64
+
+struct IFFTTestArg {
+  int n;
+  tform_fun_t ifft;
+  IFFTTestArg(int n_in, tform_fun_t ifft_in) : n(n_in), ifft(ifft_in) {}
+};
+
+std::ostream &operator<<(std::ostream &os, const IFFTTestArg &test_arg) {
+  return os << "ifft_arg { n:" << test_arg.n << " fft:" << test_arg.ifft
+            << " }";
+}
+
+class IFFT2DTest : public ::testing::TestWithParam<IFFTTestArg> {
+ protected:
+  void SetUp() {
+    int n = GetParam().n;
+    input_ = (float *)aom_memalign(32, sizeof(*input_) * n * n * 2);
+    temp_ = (float *)aom_memalign(32, sizeof(*temp_) * n * n * 2);
+    output_ = (float *)aom_memalign(32, sizeof(*output_) * n * n);
+    memset(input_, 0, sizeof(*input_) * n * n * 2);
+    memset(temp_, 0, sizeof(*temp_) * n * n * 2);
+    memset(output_, 0, sizeof(*output_) * n * n);
+  }
+  void TearDown() {
+    aom_free(input_);
+    aom_free(temp_);
+    aom_free(output_);
+  }
+  float *input_;
+  float *temp_;
+  float *output_;
+};
+
+TEST_P(IFFT2DTest, Correctness) {
+  int n = GetParam().n;
+  ASSERT_GE(n, 2);
+  std::vector<float> expected(n * n);
+  std::vector<float> actual(n * n);
+  // Do forward transform then invert to make sure we get back expected
+  for (int y = 0; y < n; ++y) {
+    for (int x = 0; x < n; ++x) {
+      expected[y * n + x] = 1;
+      std::vector<std::complex<float> > input_c = fft2d(&expected[0], n);
+      for (int i = 0; i < n * n; ++i) {
+        input_[2 * i + 0] = input_c[i].real();
+        input_[2 * i + 1] = input_c[i].imag();
+      }
+      GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+
+      for (int yy = 0; yy < n; ++yy) {
+        for (int xx = 0; xx < n; ++xx) {
+          EXPECT_NEAR(expected[yy * n + xx], output_[yy * n + xx] / (n * n),
+                      1e-5);
+        }
+      }
+      expected[y * n + x] = 0;
+    }
+  }
+}
+
+TEST_P(IFFT2DTest, Benchmark) {
+  int n = GetParam().n;
+  float sum = 0;
+  for (int i = 0; i < 1000 * (64 - n); ++i) {
+    input_[i % (n * n)] = 1;
+    GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
+    sum += output_[0];
+    input_[i % (n * n)] = 0;
+  }
+}
+INSTANTIATE_TEST_SUITE_P(
+    C, IFFT2DTest,
+    ::testing::Values(IFFTTestArg(2, aom_ifft2x2_float_c),
+                      IFFTTestArg(4, aom_ifft4x4_float_c),
+                      IFFTTestArg(8, aom_ifft8x8_float_c),
+                      IFFTTestArg(16, aom_ifft16x16_float_c),
+                      IFFTTestArg(32, aom_ifft32x32_float_c)));
+#if ARCH_X86 || ARCH_X86_64
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, IFFT2DTest,
+    ::testing::Values(IFFTTestArg(4, aom_ifft4x4_float_sse2),
+                      IFFTTestArg(8, aom_ifft8x8_float_sse2),
+                      IFFTTestArg(16, aom_ifft16x16_float_sse2),
+                      IFFTTestArg(32, aom_ifft32x32_float_sse2)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, IFFT2DTest,
+    ::testing::Values(IFFTTestArg(8, aom_ifft8x8_float_avx2),
+                      IFFTTestArg(16, aom_ifft16x16_float_avx2),
+                      IFFTTestArg(32, aom_ifft32x32_float_avx2)));
+#endif  // HAVE_AVX2
+#endif  // ARCH_X86 || ARCH_X86_64
+
+}  // namespace
diff --git a/libs/libaom/src/test/film_grain_table_test.cc b/libs/libaom/src/test/film_grain_table_test.cc
new file mode 100644
index 000000000..524d67d7b
--- /dev/null
+++ b/libs/libaom/src/test/film_grain_table_test.cc
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "aom_dsp/grain_table.h" +#include "aom/internal/aom_codec_internal.h" +#include "av1/encoder/grain_test_vectors.h" +#include "test/video_source.h" + +void grain_equal(const aom_film_grain_t *expected, + const aom_film_grain_t *actual) { + EXPECT_EQ(expected->apply_grain, actual->apply_grain); + EXPECT_EQ(expected->update_parameters, actual->update_parameters); + if (!expected->update_parameters) return; + EXPECT_EQ(expected->num_y_points, actual->num_y_points); + EXPECT_EQ(expected->num_cb_points, actual->num_cb_points); + EXPECT_EQ(expected->num_cr_points, actual->num_cr_points); + EXPECT_EQ(0, memcmp(expected->scaling_points_y, actual->scaling_points_y, + expected->num_y_points * + sizeof(expected->scaling_points_y[0]))); + EXPECT_EQ(0, memcmp(expected->scaling_points_cb, actual->scaling_points_cb, + expected->num_cb_points * + sizeof(expected->scaling_points_cb[0]))); + EXPECT_EQ(0, memcmp(expected->scaling_points_cr, actual->scaling_points_cr, + expected->num_cr_points * + sizeof(expected->scaling_points_cr[0]))); + EXPECT_EQ(expected->scaling_shift, actual->scaling_shift); + EXPECT_EQ(expected->ar_coeff_lag, actual->ar_coeff_lag); + EXPECT_EQ(expected->ar_coeff_shift, actual->ar_coeff_shift); + + const int num_pos_luma = + 2 * expected->ar_coeff_lag * (expected->ar_coeff_lag + 1); + const int num_pos_chroma = num_pos_luma; + EXPECT_EQ(0, memcmp(expected->ar_coeffs_y, actual->ar_coeffs_y, + sizeof(expected->ar_coeffs_y[0]) * num_pos_luma)); + if (actual->num_cb_points || actual->chroma_scaling_from_luma) { + EXPECT_EQ(0, memcmp(expected->ar_coeffs_cb, actual->ar_coeffs_cb, + sizeof(expected->ar_coeffs_cb[0]) * num_pos_chroma)); + } + if (actual->num_cr_points || actual->chroma_scaling_from_luma) { + EXPECT_EQ(0, memcmp(expected->ar_coeffs_cr, actual->ar_coeffs_cr, + sizeof(expected->ar_coeffs_cr[0]) * num_pos_chroma)); + } + EXPECT_EQ(expected->overlap_flag, actual->overlap_flag); + EXPECT_EQ(expected->chroma_scaling_from_luma, + actual->chroma_scaling_from_luma); + EXPECT_EQ(expected->grain_scale_shift, actual->grain_scale_shift); + // EXPECT_EQ(expected->random_seed, actual->random_seed); + + // clip_to_restricted and bit_depth aren't written + if (expected->num_cb_points) { + EXPECT_EQ(expected->cb_mult, actual->cb_mult); + EXPECT_EQ(expected->cb_luma_mult, actual->cb_luma_mult); + EXPECT_EQ(expected->cb_offset, actual->cb_offset); + } + if (expected->num_cr_points) { + EXPECT_EQ(expected->cr_mult, actual->cr_mult); + EXPECT_EQ(expected->cr_luma_mult, actual->cr_luma_mult); + EXPECT_EQ(expected->cr_offset, actual->cr_offset); + } +} + +TEST(FilmGrainTableTest, AddAndLookupSingleSegment) { + aom_film_grain_table_t table; + memset(&table, 0, sizeof(table)); + + aom_film_grain_t grain; + EXPECT_FALSE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain)); + + aom_film_grain_table_append(&table, 1000, 2000, film_grain_test_vectors + 0); + EXPECT_FALSE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain)); + EXPECT_FALSE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain)); + + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain)); + + grain.bit_depth = film_grain_test_vectors[0].bit_depth; + EXPECT_EQ(0, memcmp(&grain, film_grain_test_vectors + 0, sizeof(table))); + + // Extend the existing segment + aom_film_grain_table_append(&table, 2000, 3000, film_grain_test_vectors + 0); + EXPECT_EQ(0, table.head->next); + + // Lookup and remove and check 
that the entry is no longer there + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain)); + EXPECT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain)); + + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, true, &grain)); + EXPECT_FALSE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain)); + + EXPECT_EQ(0, table.head); + EXPECT_EQ(0, table.tail); + aom_film_grain_table_free(&table); +} + +TEST(FilmGrainTableTest, SplitSingleSegment) { + aom_film_grain_table_t table; + aom_film_grain_t grain; + memset(&table, 0, sizeof(table)); + + aom_film_grain_table_append(&table, 0, 1000, film_grain_test_vectors + 0); + + // Test lookup and remove that adjusts start time + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 0, 100, true, &grain)); + EXPECT_EQ(NULL, table.head->next); + EXPECT_EQ(100, table.head->start_time); + + // Test lookup and remove that adjusts end time + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 900, 1000, true, &grain)); + EXPECT_EQ(NULL, table.head->next); + EXPECT_EQ(100, table.head->start_time); + EXPECT_EQ(900, table.head->end_time); + + // Test lookup and remove that splits the first entry + EXPECT_TRUE(aom_film_grain_table_lookup(&table, 400, 600, true, &grain)); + EXPECT_EQ(100, table.head->start_time); + EXPECT_EQ(400, table.head->end_time); + + ASSERT_NE((void *)NULL, table.head->next); + EXPECT_EQ(table.tail, table.head->next); + EXPECT_EQ(600, table.head->next->start_time); + EXPECT_EQ(900, table.head->next->end_time); + + aom_film_grain_table_free(&table); +} + +TEST(FilmGrainTableTest, AddAndLookupMultipleSegments) { + aom_film_grain_table_t table; + memset(&table, 0, sizeof(table)); + + aom_film_grain_t grain; + const int kNumTestVectors = + sizeof(film_grain_test_vectors) / sizeof(film_grain_test_vectors[0]); + for (int i = 0; i < kNumTestVectors; ++i) { + aom_film_grain_table_append(&table, i * 1000, (i + 1) * 1000, + film_grain_test_vectors + i); + } + + for (int i = kNumTestVectors - 1; i >= 0; --i) { + EXPECT_TRUE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000, + true, &grain)); + grain_equal(film_grain_test_vectors + i, &grain); + EXPECT_FALSE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000, + true, &grain)); + } + + // Verify that all the data has been removed + for (int i = 0; i < kNumTestVectors; ++i) { + EXPECT_FALSE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000, + true, &grain)); + } + aom_film_grain_table_free(&table); +} + +class FilmGrainTableIOTest : public ::testing::Test { + protected: + void SetUp() { memset(&error_, 0, sizeof(error_)); } + struct aom_internal_error_info error_; +}; + +TEST_F(FilmGrainTableIOTest, ReadMissingFile) { + aom_film_grain_table_t table; + memset(&table, 0, sizeof(table)); + ASSERT_EQ(AOM_CODEC_ERROR, aom_film_grain_table_read( + &table, "/path/to/missing/file", &error_)); +} + +TEST_F(FilmGrainTableIOTest, ReadTruncatedFile) { + aom_film_grain_table_t table; + memset(&table, 0, sizeof(table)); + + std::string grain_file; + FILE *file = libaom_test::GetTempOutFile(&grain_file); + fwrite("deadbeef", 8, 1, file); + fclose(file); + ASSERT_EQ(AOM_CODEC_ERROR, + aom_film_grain_table_read(&table, grain_file.c_str(), &error_)); + EXPECT_EQ(0, remove(grain_file.c_str())); +} + +TEST_F(FilmGrainTableIOTest, RoundTripReadWrite) { + aom_film_grain_table_t table; + memset(&table, 0, sizeof(table)); + + aom_film_grain_t expected_grain[16]; + const int kNumTestVectors = + sizeof(film_grain_test_vectors) / 
sizeof(film_grain_test_vectors[0]); + for (int i = 0; i < kNumTestVectors; ++i) { + expected_grain[i] = film_grain_test_vectors[i]; + expected_grain[i].random_seed = i; + expected_grain[i].update_parameters = i % 2; + expected_grain[i].apply_grain = (i + 1) % 2; + expected_grain[i].bit_depth = 0; + aom_film_grain_table_append(&table, i * 1000, (i + 1) * 1000, + expected_grain + i); + } + std::string grain_file; + fclose(libaom_test::GetTempOutFile(&grain_file)); + ASSERT_EQ(AOM_CODEC_OK, + aom_film_grain_table_write(&table, grain_file.c_str(), &error_)); + aom_film_grain_table_free(&table); + + memset(&table, 0, sizeof(table)); + ASSERT_EQ(AOM_CODEC_OK, + aom_film_grain_table_read(&table, grain_file.c_str(), &error_)); + for (int i = 0; i < kNumTestVectors; ++i) { + aom_film_grain_t grain; + EXPECT_TRUE(aom_film_grain_table_lookup(&table, i * 1000, (i + 1) * 1000, + true, &grain)); + grain_equal(expected_grain + i, &grain); + } + aom_film_grain_table_free(&table); + EXPECT_EQ(0, remove(grain_file.c_str())); +} + +TEST_F(FilmGrainTableIOTest, RoundTripSplit) { + std::string grain_file; + fclose(libaom_test::GetTempOutFile(&grain_file)); + + aom_film_grain_table_t table; + memset(&table, 0, sizeof(table)); + + aom_film_grain_t grain = film_grain_test_vectors[0]; + aom_film_grain_table_append(&table, 0, 3000, &grain); + ASSERT_TRUE(aom_film_grain_table_lookup(&table, 1000, 2000, true, &grain)); + ASSERT_TRUE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain)); + EXPECT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain)); + ASSERT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain)); + ASSERT_EQ(AOM_CODEC_OK, + aom_film_grain_table_write(&table, grain_file.c_str(), &error_)); + aom_film_grain_table_free(&table); + + memset(&table, 0, sizeof(table)); + ASSERT_EQ(AOM_CODEC_OK, + aom_film_grain_table_read(&table, grain_file.c_str(), &error_)); + ASSERT_TRUE(aom_film_grain_table_lookup(&table, 0, 1000, false, &grain)); + ASSERT_FALSE(aom_film_grain_table_lookup(&table, 1000, 2000, false, &grain)); + ASSERT_TRUE(aom_film_grain_table_lookup(&table, 2000, 3000, false, &grain)); + aom_film_grain_table_free(&table); + + EXPECT_EQ(0, remove(grain_file.c_str())); +} diff --git a/libs/libaom/src/test/filterintra_test.cc b/libs/libaom/src/test/filterintra_test.cc new file mode 100644 index 000000000..284353c69 --- /dev/null +++ b/libs/libaom/src/test/filterintra_test.cc @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "av1/common/enums.h" + +namespace { + +using libaom_test::ACMRandom; +using std::tuple; + +typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, + const uint8_t *above, const uint8_t *left, int mode); + +// Note: +// Test parameter list: +// Reference predictor, optimized predictor, prediction mode, tx size +// +typedef tuple PredFuncMode; +typedef tuple PredParams; + +const int MaxTxSize = 32; + +const int MaxTestNum = 100; + +class AV1FilterIntraPredTest : public ::testing::TestWithParam { + public: + virtual ~AV1FilterIntraPredTest() {} + virtual void SetUp() { + PredFuncMode funcMode = GET_PARAM(0); + predFuncRef_ = std::get<0>(funcMode); + predFunc_ = std::get<1>(funcMode); + mode_ = std::get<2>(funcMode); + txSize_ = GET_PARAM(1); + + alloc_ = new uint8_t[2 * MaxTxSize + 1]; + predRef_ = new uint8_t[MaxTxSize * MaxTxSize]; + pred_ = new uint8_t[MaxTxSize * MaxTxSize]; + } + + virtual void TearDown() { + delete[] alloc_; + delete[] predRef_; + delete[] pred_; + libaom_test::ClearSystemState(); + } + + protected: + void RunTest() const { + int tstIndex = 0; + int stride = tx_size_wide[txSize_]; + uint8_t *left = alloc_; + uint8_t *above = alloc_ + MaxTxSize; + while (tstIndex < MaxTestNum) { + PrepareBuffer(); + predFuncRef_(predRef_, stride, txSize_, &above[1], left, mode_); + ASM_REGISTER_STATE_CHECK( + predFunc_(pred_, stride, txSize_, &above[1], left, mode_)); + DiffPred(tstIndex); + tstIndex += 1; + } + } + + private: + void PrepareBuffer() const { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + int i = 0; + while (i < (2 * MaxTxSize + 1)) { + alloc_[i] = rnd.Rand8(); + i++; + } + } + + void DiffPred(int testNum) const { + int i = 0; + while (i < tx_size_wide[txSize_] * tx_size_high[txSize_]) { + EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " " + << "Tx size: " << tx_size_wide[txSize_] + << "x" << tx_size_high[txSize_] << " " + << "Test number: " << testNum; + i++; + } + } + + Predictor predFunc_; + Predictor predFuncRef_; + int mode_; + TX_SIZE txSize_; + uint8_t *alloc_; + uint8_t *pred_; + uint8_t *predRef_; +}; + +TEST_P(AV1FilterIntraPredTest, BitExactCheck) { RunTest(); } + +using std::make_tuple; + +const PredFuncMode kPredFuncMdArray[] = { + make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1, + FILTER_DC_PRED), + make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1, + FILTER_V_PRED), + make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1, + FILTER_H_PRED), + make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1, + FILTER_D157_PRED), + make_tuple(&av1_filter_intra_predictor_c, &av1_filter_intra_predictor_sse4_1, + FILTER_PAETH_PRED), +}; + +const TX_SIZE kTxSize[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_4X8, + TX_8X4, TX_8X16, TX_16X8, TX_16X32, TX_32X16, + TX_4X16, TX_16X4, TX_8X32, TX_32X8 }; + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AV1FilterIntraPredTest, + ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray), + ::testing::ValuesIn(kTxSize))); +} // namespace diff --git a/libs/libaom/src/test/frame_error_test.cc b/libs/libaom/src/test/frame_error_test.cc new file mode 100644 index 000000000..6d74a68f2 --- /dev/null +++ b/libs/libaom/src/test/frame_error_test.cc @@ 
-0,0 +1,164 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+typedef int64_t (*frame_error_func)(const uint8_t *const ref, int stride,
+                                    const uint8_t *const dst, int p_width,
+                                    int p_height, int p_stride);
+#if HAVE_AVX2 || HAVE_SSE2
+const int kBlockWidth[] = {
+  832, 834, 640, 1280, 1920,
+};
+const int kBlockHeight[] = {
+  480, 482, 360, 720, 1080,
+};
+#endif
+typedef std::tuple<frame_error_func, int, int> FrameErrorParam;
+
+class AV1FrameErrorTest : public ::testing::TestWithParam<FrameErrorParam> {
+ public:
+  virtual ~AV1FrameErrorTest() {}
+  virtual void SetUp() {
+    rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  }
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  void RandomValues(frame_error_func test_impl, int width, int height);
+  void ExtremeValues(frame_error_func test_impl, int width, int height);
+  void RunSpeedTest(frame_error_func test_impl, int width, int height);
+  libaom_test::ACMRandom rnd_;
+};
+
+void AV1FrameErrorTest::RandomValues(frame_error_func test_impl, int width,
+                                     int height) {
+  const int stride = (((width * 3) / 2) + 15) & ~15;
+  const int max_blk_size = stride * height;
+  uint8_t *const dst =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
+  uint8_t *const ref =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
+  ASSERT_TRUE(dst != NULL);
+  ASSERT_TRUE(ref != NULL);
+  for (int i = 0; i < max_blk_size; ++i) {
+    dst[i] = rnd_.Rand8();
+    ref[i] = rnd_.Rand8();
+  }
+  const int64_t ref_error =
+      av1_calc_frame_error_c(ref, stride, dst, width, height, stride);
+  const int64_t test_error =
+      test_impl(ref, stride, dst, width, height, stride);
+  ASSERT_EQ(test_error, ref_error) << width << "x" << height;
+  aom_free(dst);
+  aom_free(ref);
+}
+
+void AV1FrameErrorTest::ExtremeValues(frame_error_func test_impl, int width,
+                                      int height) {
+  const int stride = (((width * 3) / 2) + 15) & ~15;
+  const int max_blk_size = stride * height;
+  uint8_t *const dst =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
+  uint8_t *const ref =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
+  ASSERT_TRUE(dst != NULL);
+  ASSERT_TRUE(ref != NULL);
+  for (int r = 0; r < 2; r++) {
+    if (r == 0) {
+      memset(dst, 0, max_blk_size);
+      memset(ref, 255, max_blk_size);
+    } else if (r == 1) {
+      memset(dst, 255, max_blk_size);
+      memset(ref, 0, max_blk_size);
+    }
+    const int64_t ref_error =
+        av1_calc_frame_error_c(ref, stride, dst, width, height, stride);
+    const int64_t test_error =
+        test_impl(ref, stride, dst, width, height, stride);
+    ASSERT_EQ(test_error, ref_error) << width << "x" << height;
+  }
+  aom_free(dst);
+  aom_free(ref);
+}
+
+void AV1FrameErrorTest::RunSpeedTest(frame_error_func test_impl, int width,
+                                     int height) {
+  const int stride = (((width * 3) / 2) + 15) & ~15;
+  const int max_blk_size = stride * height;
+  uint8_t *const dst =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*dst)));
+  uint8_t *const ref =
+      static_cast<uint8_t *>(aom_memalign(16, max_blk_size * sizeof(*ref)));
+  ASSERT_TRUE(dst != NULL);
+  ASSERT_TRUE(ref != NULL);
+  for (int i = 0; i < max_blk_size; ++i) {
+    dst[i] = ref[i] = rnd_.Rand8();
+  }
+  const int num_loops = 10000000 / (width + height);
+  frame_error_func funcs[2] = { av1_calc_frame_error_c, test_impl };
+  double elapsed_time[2] = { 0 };
+  for (int i = 0; i < 2; ++i) {
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    frame_error_func func = funcs[i];
+    for (int j = 0; j < num_loops; ++j) {
+      func(ref, stride, dst, width, height, stride);
+    }
+    aom_usec_timer_mark(&timer);
+    double time = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    elapsed_time[i] = 1000.0 * time / num_loops;
+  }
+  aom_free(dst);
+  aom_free(ref);
+  printf("av1_calc_frame_error %3dx%-3d: %7.2f/%7.2fns", width, height,
+         elapsed_time[0], elapsed_time[1]);
+  printf("(%3.2f)\n", elapsed_time[0] / elapsed_time[1]);
+}
+
+TEST_P(AV1FrameErrorTest, CheckOutput) {
+  RandomValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+  ExtremeValues(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+TEST_P(AV1FrameErrorTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(0), GET_PARAM(1), GET_PARAM(2));
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AV1FrameErrorTest,
+    ::testing::Combine(::testing::Values(&av1_calc_frame_error_sse2),
+                       ::testing::ValuesIn(kBlockWidth),
+                       ::testing::ValuesIn(kBlockHeight)));
+#endif
+
+#if HAVE_AVX2
INSTANTIATE_TEST_SUITE_P(
+    AVX2, AV1FrameErrorTest,
+    ::testing::Combine(::testing::Values(&av1_calc_frame_error_avx2),
+                       ::testing::ValuesIn(kBlockWidth),
+                       ::testing::ValuesIn(kBlockHeight)));
+#endif
+}  // namespace
diff --git a/libs/libaom/src/test/frame_size_tests.cc b/libs/libaom/src/test/frame_size_tests.cc
new file mode 100644
index 000000000..1546012a3
--- /dev/null
+++ b/libs/libaom/src/test/frame_size_tests.cc
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/video_source.h" + +namespace { + +class AV1FrameSizeTests : public ::testing::Test, + public ::libaom_test::EncoderTest { + protected: + AV1FrameSizeTests() + : EncoderTest(&::libaom_test::kAV1), expected_res_(AOM_CODEC_OK) {} + virtual ~AV1FrameSizeTests() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(::libaom_test::kRealTime); + } + + virtual bool HandleDecodeResult(const aom_codec_err_t res_dec, + libaom_test::Decoder *decoder) { + EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError(); + return !::testing::Test::HasFailure(); + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, 7); + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + } + + int expected_res_; +}; + +#if CONFIG_SIZE_LIMIT +TEST_F(AV1FrameSizeTests, TestInvalidSizes) { + ::libaom_test::RandomVideoSource video; + + video.SetSize(DECODE_WIDTH_LIMIT + 16, DECODE_HEIGHT_LIMIT + 16); + video.set_limit(2); + expected_res_ = AOM_CODEC_CORRUPT_FRAME; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_F(AV1FrameSizeTests, LargeValidSizes) { + ::libaom_test::RandomVideoSource video; + + video.SetSize(DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT); + video.set_limit(2); + expected_res_ = AOM_CODEC_OK; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +#endif + +TEST_F(AV1FrameSizeTests, OneByOneVideo) { + ::libaom_test::RandomVideoSource video; + + video.SetSize(1, 1); + video.set_limit(2); + expected_res_ = AOM_CODEC_OK; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} +#undef ONE_BY_ONE_VIDEO_NAME +} // namespace diff --git a/libs/libaom/src/test/function_equivalence_test.h b/libs/libaom/src/test/function_equivalence_test.h new file mode 100644 index 000000000..a299c48d4 --- /dev/null +++ b/libs/libaom/src/test/function_equivalence_test.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_ +#define AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/util.h" + +using libaom_test::ACMRandom; + +namespace libaom_test { +// Base class for tests that compare 2 implementations of the same function +// for equivalence. The template parameter should be pointer to a function +// that is being tested. +// +// The test takes a 3-parameters encapsulating struct 'FuncParam', containing: +// - Pointer to reference function +// - Pointer to tested function +// - Integer bit depth (default to 0). +// +// These values are then accessible in the tests as member of params_: +// params_.ref_func, params_.tst_func, and params_.bit_depth. 
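+//
+// A typical instantiation might look like this (names here are illustrative,
+// not part of this header):
+//   INSTANTIATE_TEST_SUITE_P(SSE2, MySadEquivalenceTest,
+//                            ::testing::Values(FuncParam<SadFn>(
+//                                &my_sad_c, &my_sad_sse2)));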
+// + +template +struct FuncParam { + FuncParam(T ref = NULL, T tst = NULL, int bit_depth = 0) + : ref_func(ref), tst_func(tst), bit_depth(bit_depth) {} + T ref_func; + T tst_func; + int bit_depth; +}; + +template +std::ostream &operator<<(std::ostream &os, const FuncParam &p) { + return os << "bit_depth:" << p.bit_depth + << " function:" << reinterpret_cast(p.ref_func) + << " function:" << reinterpret_cast(p.tst_func); +} + +template +class FunctionEquivalenceTest : public ::testing::TestWithParam > { + public: + FunctionEquivalenceTest() : rng_(ACMRandom::DeterministicSeed()) {} + + virtual ~FunctionEquivalenceTest() {} + + virtual void SetUp() { params_ = this->GetParam(); } + + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + ACMRandom rng_; + FuncParam params_; +}; + +} // namespace libaom_test +#endif // AOM_TEST_FUNCTION_EQUIVALENCE_TEST_H_ diff --git a/libs/libaom/src/test/fwd_kf_test.cc b/libs/libaom/src/test/fwd_kf_test.cc new file mode 100644 index 000000000..50c2f36d8 --- /dev/null +++ b/libs/libaom/src/test/fwd_kf_test.cc @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +typedef struct { + const int max_kf_dist; + const double psnr_thresh; +} FwdKfTestParam; + +const FwdKfTestParam kTestParams[] = { + { 4, 33.4 }, { 6, 32.9 }, { 8, 32.6 }, + { 12, 32.4 }, { 16, 32.3 }, { 18, 32.1 } +}; + +std::ostream &operator<<(std::ostream &os, const FwdKfTestParam &test_arg) { + return os << "FwdKfTestParam { max_kf_dist:" << test_arg.max_kf_dist + << " psnr_thresh:" << test_arg.psnr_thresh << " }"; +} + +class ForwardKeyTest + : public ::libaom_test::CodecTestWith2Params, + public ::libaom_test::EncoderTest { + protected: + ForwardKeyTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + kf_max_dist_param_(GET_PARAM(2)) {} + virtual ~ForwardKeyTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + const aom_rational timebase = { 1, 30 }; + cfg_.g_timebase = timebase; + cpu_used_ = 2; + kf_max_dist_ = kf_max_dist_param_.max_kf_dist; + psnr_threshold_ = kf_max_dist_param_.psnr_thresh; + cfg_.rc_end_usage = AOM_VBR; + cfg_.rc_target_bitrate = 200; + cfg_.g_lag_in_frames = 10; + cfg_.fwd_kf_enabled = 1; + cfg_.kf_max_dist = kf_max_dist_; + cfg_.g_threads = 0; + init_flags_ = AOM_CODEC_USE_PSNR; + } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + if (encoding_mode_ != ::libaom_test::kRealTime) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + 
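+        // Alt-ref temporal filtering: up to a 7-frame ARNR window at
+        // filter strength 5.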
encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrThreshold() { return psnr_threshold_; } + + ::libaom_test::TestMode encoding_mode_; + const FwdKfTestParam kf_max_dist_param_; + double psnr_threshold_; + int kf_max_dist_; + int cpu_used_; + int nframes_; + double psnr_; +}; + +TEST_P(ForwardKeyTest, ForwardKeyEncodeTest) { + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + cfg_.g_timebase.den, cfg_.g_timebase.num, + 0, 20); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(sarahparker) Add functionality to assert the minimum number of + // keyframes were placed. + EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold()) + << "kf max dist = " << kf_max_dist_; +} + +AV1_INSTANTIATE_TEST_CASE(ForwardKeyTest, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kTestParams)); +} // namespace diff --git a/libs/libaom/src/test/fwht4x4_test.cc b/libs/libaom/src/test/fwht4x4_test.cc new file mode 100644 index 000000000..d2f77b8d4 --- /dev/null +++ b/libs/libaom/src/test/fwht4x4_test.cc @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" +#include "config/aom_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/transform_test_base.h" +#include "test/util.h" +#include "av1/common/entropy.h" +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +using libaom_test::ACMRandom; + +namespace { +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); + +using libaom_test::FhtFunc; + +typedef std::tuple + Dct4x4Param; + +void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride, + TxfmParam * /*txfm_param*/) { + av1_fwht4x4_c(in, out, stride); +} + +void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { + av1_highbd_iwht4x4_16_add_c(in, out, stride, 10); +} + +void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { + av1_highbd_iwht4x4_16_add_c(in, out, stride, 12); +} + +class Trans4x4WHT : public libaom_test::TransformTestBase, + public ::testing::TestWithParam { + public: + virtual ~Trans4x4WHT() {} + + virtual void SetUp() { + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + pitch_ = 4; + height_ = 4; + fwd_txfm_ref = fwht4x4_ref; + bit_depth_ = GET_PARAM(3); + mask_ = (1 << bit_depth_) - 1; + num_coeffs_ = GET_PARAM(4); + } + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { + fwd_txfm_(in, out, stride); + } + void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { + 
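+    // Unlike the forward-only fdct4x4 fixture, this one wires up a real
+    // inverse (iwht) so the InvAccuracyCheck round-trip below is meaningful.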
inv_txfm_(out, dst, stride); + } + + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; +}; + +TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0, 0.00001); } + +TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); } + +TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } +using std::make_tuple; + +INSTANTIATE_TEST_SUITE_P( + C, Trans4x4WHT, + ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, DCT_DCT, + AOM_BITS_10, 16), + make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, DCT_DCT, + AOM_BITS_12, 16))); +} // namespace diff --git a/libs/libaom/src/test/gf_pyr_height_test.cc b/libs/libaom/src/test/gf_pyr_height_test.cc new file mode 100644 index 000000000..b1ade67a6 --- /dev/null +++ b/libs/libaom/src/test/gf_pyr_height_test.cc @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +static const struct GFPyrHeightTestParam { + int gf_min_pyr_height; + int gf_max_pyr_height; + double psnr_thresh; +} kTestParams[] = { + // gf_min_pyr_height = 0 + { 0, 0, 33.40 }, + { 0, 1, 34.00 }, + { 0, 2, 34.00 }, + { 0, 3, 34.20 }, + { 0, 4, 34.30 }, + { 0, 5, 34.40 }, + // gf_min_pyr_height = 1 + { 1, 1, 34.00 }, + { 1, 2, 34.00 }, + { 1, 3, 34.20 }, + { 1, 4, 34.30 }, + { 1, 5, 34.40 }, + // gf_min_pyr_height = 2 + { 2, 2, 34.00 }, + { 2, 3, 34.20 }, + { 2, 4, 34.30 }, + { 2, 5, 34.40 }, + // gf_min_pyr_height = 3 + { 3, 3, 34.20 }, + { 3, 4, 34.30 }, + { 3, 5, 34.40 }, + // gf_min_pyr_height = 4 + { 4, 4, 34.30 }, + { 4, 5, 34.40 }, + // gf_min_pyr_height = 5 + { 5, 5, 34.40 }, +}; + +// Compiler may decide to add some padding to the struct above for alignment, +// which the gtest may try to print (on error for example). This would cause +// valgrind to complain that the padding is uninitialized. To avoid that, we +// provide our own function to print the struct. +// This also makes '--gtest_list_tests' output more understandable. +std::ostream &operator<<(std::ostream &os, const GFPyrHeightTestParam &p) { + os << "GFPyrHeightTestParam { " + << "gf_min_pyr_height = " << p.gf_min_pyr_height << ", " + << "gf_max_pyr_height = " << p.gf_max_pyr_height << ", " + << "psnr_thresh = " << p.psnr_thresh << " }"; + return os; +} + +// Params: encoding mode, rate control mode and GFPyrHeightTestParam object. 
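+// GET_PARAM(0) is the codec factory supplied by AV1_INSTANTIATE_TEST_CASE;
+// the three parameters listed above arrive as GET_PARAM(1)..GET_PARAM(3).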
+class GFPyrHeightTest + : public ::libaom_test::CodecTestWith3Params< + libaom_test::TestMode, aom_rc_mode, GFPyrHeightTestParam>, + public ::libaom_test::EncoderTest { + protected: + GFPyrHeightTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + rc_mode_(GET_PARAM(2)) { + gf_min_pyr_height_ = GET_PARAM(3).gf_min_pyr_height; + gf_max_pyr_height_ = GET_PARAM(3).gf_max_pyr_height; + psnr_threshold_ = GET_PARAM(3).psnr_thresh; + } + virtual ~GFPyrHeightTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + const aom_rational timebase = { 1, 30 }; + cfg_.g_timebase = timebase; + cpu_used_ = 4; + cfg_.rc_end_usage = rc_mode_; + if (rc_mode_ == AOM_VBR) { + cfg_.rc_target_bitrate = 200; + } + cfg_.g_lag_in_frames = 19; + cfg_.g_threads = 0; + init_flags_ = AOM_CODEC_USE_PSNR; + } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + if (rc_mode_ == AOM_Q) { + encoder->Control(AOME_SET_CQ_LEVEL, 32); + } + if (encoding_mode_ != ::libaom_test::kRealTime) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + encoder->Control(AV1E_SET_GF_MIN_PYRAMID_HEIGHT, gf_min_pyr_height_); + encoder->Control(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, gf_max_pyr_height_); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrThreshold() { return psnr_threshold_; } + + ::libaom_test::TestMode encoding_mode_; + aom_rc_mode rc_mode_; + double psnr_threshold_; + int gf_min_pyr_height_; + int gf_max_pyr_height_; + int cpu_used_; + int nframes_; + double psnr_; +}; + +TEST_P(GFPyrHeightTest, EncodeAndVerifyPSNR) { + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + cfg_.g_timebase.den, cfg_.g_timebase.num, + 0, 32); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold()) + << "GF Min Pyramid Height = " << gf_min_pyr_height_ << ", " + << "GF Max Pyramid Height = " << gf_max_pyr_height_; +} + +AV1_INSTANTIATE_TEST_CASE(GFPyrHeightTest, NONREALTIME_TEST_MODES, + ::testing::Values(AOM_Q, AOM_VBR), + ::testing::ValuesIn(kTestParams)); +} // namespace diff --git a/libs/libaom/src/test/gviz_api.py b/libs/libaom/src/test/gviz_api.py new file mode 100644 index 000000000..d3a443dab --- /dev/null +++ b/libs/libaom/src/test/gviz_api.py @@ -0,0 +1,1087 @@ +#!/usr/bin/python +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and +# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +# was not distributed with this source code in the LICENSE file, you can +# obtain it at www.aomedia.org/license/software. If the Alliance for Open +# Media Patent License 1.0 was not distributed with this source code in the +# PATENTS file, you can obtain it at www.aomedia.org/license/patent. +# + +"""Converts Python data into data for Google Visualization API clients. + +This library can be used to create a google.visualization.DataTable usable by +visualizations built on the Google Visualization API. 
Output formats are raw +JSON, JSON response, JavaScript, CSV, and HTML table. + +See http://code.google.com/apis/visualization/ for documentation on the +Google Visualization API. +""" + +__author__ = "Amit Weinstein, Misha Seltzer, Jacob Baskin" + +import cgi +import cStringIO +import csv +import datetime +try: + import json +except ImportError: + import simplejson as json +import types + + +class DataTableException(Exception): + """The general exception object thrown by DataTable.""" + pass + + +class DataTableJSONEncoder(json.JSONEncoder): + """JSON encoder that handles date/time/datetime objects correctly.""" + + def __init__(self): + json.JSONEncoder.__init__(self, + separators=(",", ":"), + ensure_ascii=False) + + def default(self, o): + if isinstance(o, datetime.datetime): + if o.microsecond == 0: + # If the time doesn't have ms-resolution, leave it out to keep + # things smaller. + return "Date(%d,%d,%d,%d,%d,%d)" % ( + o.year, o.month - 1, o.day, o.hour, o.minute, o.second) + else: + return "Date(%d,%d,%d,%d,%d,%d,%d)" % ( + o.year, o.month - 1, o.day, o.hour, o.minute, o.second, + o.microsecond / 1000) + elif isinstance(o, datetime.date): + return "Date(%d,%d,%d)" % (o.year, o.month - 1, o.day) + elif isinstance(o, datetime.time): + return [o.hour, o.minute, o.second] + else: + return super(DataTableJSONEncoder, self).default(o) + + +class DataTable(object): + """Wraps the data to convert to a Google Visualization API DataTable. + + Create this object, populate it with data, then call one of the ToJS... + methods to return a string representation of the data in the format described. + + You can clear all data from the object to reuse it, but you cannot clear + individual cells, rows, or columns. You also cannot modify the table schema + specified in the class constructor. + + You can add new data one or more rows at a time. All data added to an + instantiated DataTable must conform to the schema passed in to __init__(). + + You can reorder the columns in the output table, and also specify row sorting + order by column. The default column order is according to the original + table_description parameter. Default row sort order is ascending, by column + 1 values. For a dictionary, we sort the keys for order. + + The data and the table_description are closely tied, as described here: + + The table schema is defined in the class constructor's table_description + parameter. The user defines each column using a tuple of + (id[, type[, label[, custom_properties]]]). The default value for type is + string, label is the same as ID if not specified, and custom properties is + an empty dictionary if not specified. + + table_description is a dictionary or list, containing one or more column + descriptor tuples, nested dictionaries, and lists. Each dictionary key, list + element, or dictionary element must eventually be defined as + a column description tuple. Here's an example of a dictionary where the key + is a tuple, and the value is a list of two tuples: + {('a', 'number'): [('b', 'number'), ('c', 'string')]} + + This flexibility in data entry enables you to build and manipulate your data + in a Python structure that makes sense for your program. + + Add data to the table using the same nested design as the table's + table_description, replacing column descriptor tuples with cell data, and + each row is an element in the top level collection. 
This will be a bit + clearer after you look at the following examples showing the + table_description, matching data, and the resulting table: + + Columns as list of tuples [col1, col2, col3] + table_description: [('a', 'number'), ('b', 'string')] + AppendData( [[1, 'z'], [2, 'w'], [4, 'o'], [5, 'k']] ) + Table: + a b <--- these are column ids/labels + 1 z + 2 w + 4 o + 5 k + + Dictionary of columns, where key is a column, and value is a list of + columns {col1: [col2, col3]} + table_description: {('a', 'number'): [('b', 'number'), ('c', 'string')]} + AppendData( data: {1: [2, 'z'], 3: [4, 'w']} + Table: + a b c + 1 2 z + 3 4 w + + Dictionary where key is a column, and the value is itself a dictionary of + columns {col1: {col2, col3}} + table_description: {('a', 'number'): {'b': 'number', 'c': 'string'}} + AppendData( data: {1: {'b': 2, 'c': 'z'}, 3: {'b': 4, 'c': 'w'}} + Table: + a b c + 1 2 z + 3 4 w + """ + + def __init__(self, table_description, data=None, custom_properties=None): + """Initialize the data table from a table schema and (optionally) data. + + See the class documentation for more information on table schema and data + values. + + Args: + table_description: A table schema, following one of the formats described + in TableDescriptionParser(). Schemas describe the + column names, data types, and labels. See + TableDescriptionParser() for acceptable formats. + data: Optional. If given, fills the table with the given data. The data + structure must be consistent with schema in table_description. See + the class documentation for more information on acceptable data. You + can add data later by calling AppendData(). + custom_properties: Optional. A dictionary from string to string that + goes into the table's custom properties. This can be + later changed by changing self.custom_properties. + + Raises: + DataTableException: Raised if the data and the description did not match, + or did not use the supported formats. + """ + self.__columns = self.TableDescriptionParser(table_description) + self.__data = [] + self.custom_properties = {} + if custom_properties is not None: + self.custom_properties = custom_properties + if data: + self.LoadData(data) + + @staticmethod + def CoerceValue(value, value_type): + """Coerces a single value into the type expected for its column. + + Internal helper method. + + Args: + value: The value which should be converted + value_type: One of "string", "number", "boolean", "date", "datetime" or + "timeofday". + + Returns: + An item of the Python type appropriate to the given value_type. Strings + are also converted to Unicode using UTF-8 encoding if necessary. + If a tuple is given, it should be in one of the following forms: + - (value, formatted value) + - (value, formatted value, custom properties) + where the formatted value is a string, and custom properties is a + dictionary of the custom properties for this cell. + To specify custom properties without specifying formatted value, one can + pass None as the formatted value. + One can also have a null-valued cell with formatted value and/or custom + properties by specifying None for the value. + This method ignores the custom properties except for checking that it is a + dictionary. The custom properties are handled in the ToJSon and ToJSCode + methods. + The real type of the given value is not strictly checked. For example, + any type can be used for string - as we simply take its str( ) and for + boolean value we just check "if value". 
+ Examples: + CoerceValue(None, "string") returns None + CoerceValue((5, "5$"), "number") returns (5, "5$") + CoerceValue(100, "string") returns "100" + CoerceValue(0, "boolean") returns False + + Raises: + DataTableException: The value and type did not match in a not-recoverable + way, for example given value 'abc' for type 'number'. + """ + if isinstance(value, tuple): + # In case of a tuple, we run the same function on the value itself and + # add the formatted value. + if (len(value) not in [2, 3] or + (len(value) == 3 and not isinstance(value[2], dict))): + raise DataTableException("Wrong format for value and formatting - %s." % + str(value)) + if not isinstance(value[1], types.StringTypes + (types.NoneType,)): + raise DataTableException("Formatted value is not string, given %s." % + type(value[1])) + js_value = DataTable.CoerceValue(value[0], value_type) + return (js_value,) + value[1:] + + t_value = type(value) + if value is None: + return value + if value_type == "boolean": + return bool(value) + + elif value_type == "number": + if isinstance(value, (int, long, float)): + return value + raise DataTableException("Wrong type %s when expected number" % t_value) + + elif value_type == "string": + if isinstance(value, unicode): + return value + else: + return str(value).decode("utf-8") + + elif value_type == "date": + if isinstance(value, datetime.datetime): + return datetime.date(value.year, value.month, value.day) + elif isinstance(value, datetime.date): + return value + else: + raise DataTableException("Wrong type %s when expected date" % t_value) + + elif value_type == "timeofday": + if isinstance(value, datetime.datetime): + return datetime.time(value.hour, value.minute, value.second) + elif isinstance(value, datetime.time): + return value + else: + raise DataTableException("Wrong type %s when expected time" % t_value) + + elif value_type == "datetime": + if isinstance(value, datetime.datetime): + return value + else: + raise DataTableException("Wrong type %s when expected datetime" % + t_value) + # If we got here, it means the given value_type was not one of the + # supported types. + raise DataTableException("Unsupported type %s" % value_type) + + @staticmethod + def EscapeForJSCode(encoder, value): + if value is None: + return "null" + elif isinstance(value, datetime.datetime): + if value.microsecond == 0: + # If it's not ms-resolution, leave that out to save space. + return "new Date(%d,%d,%d,%d,%d,%d)" % (value.year, + value.month - 1, # To match JS + value.day, + value.hour, + value.minute, + value.second) + else: + return "new Date(%d,%d,%d,%d,%d,%d,%d)" % (value.year, + value.month - 1, # match JS + value.day, + value.hour, + value.minute, + value.second, + value.microsecond / 1000) + elif isinstance(value, datetime.date): + return "new Date(%d,%d,%d)" % (value.year, value.month - 1, value.day) + else: + return encoder.encode(value) + + @staticmethod + def ToString(value): + if value is None: + return "(empty)" + elif isinstance(value, (datetime.datetime, + datetime.date, + datetime.time)): + return str(value) + elif isinstance(value, unicode): + return value + elif isinstance(value, bool): + return str(value).lower() + else: + return str(value).decode("utf-8") + + @staticmethod + def ColumnTypeParser(description): + """Parses a single column description. Internal helper method. 
+ + Args: + description: a column description in the possible formats: + 'id' + ('id',) + ('id', 'type') + ('id', 'type', 'label') + ('id', 'type', 'label', {'custom_prop1': 'custom_val1'}) + Returns: + Dictionary with the following keys: id, label, type, and + custom_properties where: + - If label not given, it equals the id. + - If type not given, string is used by default. + - If custom properties are not given, an empty dictionary is used by + default. + + Raises: + DataTableException: The column description did not match the RE, or + unsupported type was passed. + """ + if not description: + raise DataTableException("Description error: empty description given") + + if not isinstance(description, (types.StringTypes, tuple)): + raise DataTableException("Description error: expected either string or " + "tuple, got %s." % type(description)) + + if isinstance(description, types.StringTypes): + description = (description,) + + # According to the tuple's length, we fill the keys + # We verify everything is of type string + for elem in description[:3]: + if not isinstance(elem, types.StringTypes): + raise DataTableException("Description error: expected tuple of " + "strings, current element of type %s." % + type(elem)) + desc_dict = {"id": description[0], + "label": description[0], + "type": "string", + "custom_properties": {}} + if len(description) > 1: + desc_dict["type"] = description[1].lower() + if len(description) > 2: + desc_dict["label"] = description[2] + if len(description) > 3: + if not isinstance(description[3], dict): + raise DataTableException("Description error: expected custom " + "properties of type dict, current element " + "of type %s." % type(description[3])) + desc_dict["custom_properties"] = description[3] + if len(description) > 4: + raise DataTableException("Description error: tuple of length > 4") + if desc_dict["type"] not in ["string", "number", "boolean", + "date", "datetime", "timeofday"]: + raise DataTableException( + "Description error: unsupported type '%s'" % desc_dict["type"]) + return desc_dict + + @staticmethod + def TableDescriptionParser(table_description, depth=0): + """Parses the table_description object for internal use. + + Parses the user-submitted table description into an internal format used + by the Python DataTable class. Returns the flat list of parsed columns. + + Args: + table_description: A description of the table which should comply + with one of the formats described below. + depth: Optional. The depth of the first level in the current description. + Used by recursive calls to this function. + + Returns: + List of columns, where each column represented by a dictionary with the + keys: id, label, type, depth, container which means the following: + - id: the id of the column + - name: The name of the column + - type: The datatype of the elements in this column. Allowed types are + described in ColumnTypeParser(). + - depth: The depth of this column in the table description + - container: 'dict', 'iter' or 'scalar' for parsing the format easily. + - custom_properties: The custom properties for this column. + The returned description is flattened regardless of how it was given. + + Raises: + DataTableException: Error in a column description or in the description + structure. 
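As a run-time cross-check of the flattening described above, the parsed result can be inspected through the read-only `columns` property defined later in this class (an editor's sketch; the schema is illustrative):

```python
# Hedged sketch: a nested description is flattened into a single column list.
table = DataTable({("a", "number"): [("b", "number"), ("c", "string")]})
for col in table.columns:
    # Expected per the Examples below: 'a' at depth 0 (container 'dict'),
    # then 'b' and 'c' at depth 1 (container 'iter').
    print col["id"], col["type"], col["container"], col["depth"]
```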
+
+    Examples:
+      A column description can be of the following forms:
+       'id'
+       ('id',)
+       ('id', 'type')
+       ('id', 'type', 'label')
+       ('id', 'type', 'label', {'custom_prop1': 'custom_val1'})
+      or as a dictionary:
+       'id': 'type'
+       'id': ('type',)
+       'id': ('type', 'label')
+       'id': ('type', 'label', {'custom_prop1': 'custom_val1'})
+      If the type is not specified, we treat it as string.
+      If no specific label is given, the label is simply the id.
+      If no custom properties are given, we use an empty dictionary.
+
+      input: [('a', 'date'), ('b', 'timeofday', 'b', {'foo': 'bar'})]
+      output: [{'id': 'a', 'label': 'a', 'type': 'date',
+                'depth': 0, 'container': 'iter', 'custom_properties': {}},
+               {'id': 'b', 'label': 'b', 'type': 'timeofday',
+                'depth': 0, 'container': 'iter',
+                'custom_properties': {'foo': 'bar'}}]
+
+      input: {'a': [('b', 'number'), ('c', 'string', 'column c')]}
+      output: [{'id': 'a', 'label': 'a', 'type': 'string',
+                'depth': 0, 'container': 'dict', 'custom_properties': {}},
+               {'id': 'b', 'label': 'b', 'type': 'number',
+                'depth': 1, 'container': 'iter', 'custom_properties': {}},
+               {'id': 'c', 'label': 'column c', 'type': 'string',
+                'depth': 1, 'container': 'iter', 'custom_properties': {}}]
+
+      input: {('a', 'number', 'column a'): { 'b': 'number', 'c': 'string'}}
+      output: [{'id': 'a', 'label': 'column a', 'type': 'number',
+                'depth': 0, 'container': 'dict', 'custom_properties': {}},
+               {'id': 'b', 'label': 'b', 'type': 'number',
+                'depth': 1, 'container': 'dict', 'custom_properties': {}},
+               {'id': 'c', 'label': 'c', 'type': 'string',
+                'depth': 1, 'container': 'dict', 'custom_properties': {}}]
+
+      input: { ('w', 'string', 'word'): ('c', 'number', 'count') }
+      output: [{'id': 'w', 'label': 'word', 'type': 'string',
+                'depth': 0, 'container': 'dict', 'custom_properties': {}},
+               {'id': 'c', 'label': 'count', 'type': 'number',
+                'depth': 1, 'container': 'scalar', 'custom_properties': {}}]
+
+      input: {'a': ('number', 'column a'), 'b': ('string', 'column b')}
+      output: [{'id': 'a', 'label': 'column a', 'type': 'number', 'depth': 0,
+                'container': 'dict', 'custom_properties': {}},
+               {'id': 'b', 'label': 'column b', 'type': 'string', 'depth': 0,
+                'container': 'dict', 'custom_properties': {}}]
+
+    NOTE: there might be ambiguity in the case of a dictionary representation
+    of a single column. For example, the following description can be parsed
+    in 2 different ways: {'a': ('b', 'c')} can be thought of as a single column
+    with the id 'a', of type 'b' and the label 'c', or as 2 columns: one named
+    'a', and the other named 'b' of type 'c'. We choose the first option by
+    default, and in case the second option is the right one, it is possible to
+    make the key into a tuple (i.e. {('a',): ('b', 'c')}) or add more info
+    into the tuple, thus making it look like this: {'a': ('b', 'c', 'b', {})}
+    -- second 'b' is the label, and {} is the custom properties field.
+    """
+    # For the recursion step, we check for a scalar object (string or tuple)
+    if isinstance(table_description, (types.StringTypes, tuple)):
+      parsed_col = DataTable.ColumnTypeParser(table_description)
+      parsed_col["depth"] = depth
+      parsed_col["container"] = "scalar"
+      return [parsed_col]
+
+    # Since it is not scalar, table_description must be iterable.
+    if not hasattr(table_description, "__iter__"):
+      raise DataTableException("Expected an iterable object, got %s" %
+                               type(table_description))
+    if not isinstance(table_description, dict):
+      # We expect a non-dictionary iterable item.
+      columns = []
+      for desc in table_description:
+        parsed_col = DataTable.ColumnTypeParser(desc)
+        parsed_col["depth"] = depth
+        parsed_col["container"] = "iter"
+        columns.append(parsed_col)
+      if not columns:
+        raise DataTableException("Description iterable objects should not"
+                                 " be empty.")
+      return columns
+    # The other case is a dictionary
+    if not table_description:
+      raise DataTableException("Empty dictionaries are not allowed inside"
+                               " description")
+
+    # To differentiate between the two cases of more levels below or this
+    # being the innermost dictionary, we consider the number of keys (more
+    # than one key is an indication of the innermost dictionary) and the type
+    # of the key and value in case of only 1 key (if the type of key is string
+    # and the type of the value is a tuple of 0-3 items, we assume this is the
+    # innermost dictionary).
+    # NOTE: this way of differentiating might create ambiguity. See docs.
+    if (len(table_description) != 1 or
+        (isinstance(table_description.keys()[0], types.StringTypes) and
+         isinstance(table_description.values()[0], tuple) and
+         len(table_description.values()[0]) < 4)):
+      # This is the innermost dictionary. Parsing types.
+      columns = []
+      # We sort the items, equivalent to sorting the keys since they are unique
+      for key, value in sorted(table_description.items()):
+        # We parse the column type as (key, type) or (key, type, label) using
+        # ColumnTypeParser.
+        if isinstance(value, tuple):
+          parsed_col = DataTable.ColumnTypeParser((key,) + value)
+        else:
+          parsed_col = DataTable.ColumnTypeParser((key, value))
+        parsed_col["depth"] = depth
+        parsed_col["container"] = "dict"
+        columns.append(parsed_col)
+      return columns
+    # This is an outer dictionary, must have at most one key.
+    parsed_col = DataTable.ColumnTypeParser(table_description.keys()[0])
+    parsed_col["depth"] = depth
+    parsed_col["container"] = "dict"
+    return ([parsed_col] +
+            DataTable.TableDescriptionParser(table_description.values()[0],
+                                             depth=depth + 1))
+
+  @property
+  def columns(self):
+    """Returns the parsed table description."""
+    return self.__columns
+
+  def NumberOfRows(self):
+    """Returns the number of rows in the current data stored in the table."""
+    return len(self.__data)
+
+  def SetRowsCustomProperties(self, rows, custom_properties):
+    """Sets the custom properties for given row(s).
+
+    Can accept a single row or an iterable of rows.
+    Sets the given custom properties for all specified rows.
+
+    Args:
+      rows: The row, or rows, to set the custom properties for.
+      custom_properties: A string to string dictionary of custom properties to
+                         set for all rows.
+    """
+    if not hasattr(rows, "__iter__"):
+      rows = [rows]
+    for row in rows:
+      self.__data[row] = (self.__data[row][0], custom_properties)
+
+  def LoadData(self, data, custom_properties=None):
+    """Loads new rows to the data table, clearing existing rows.
+
+    May also set the custom_properties for the added rows. The given custom
+    properties dictionary specifies the dictionary that will be used for *all*
+    given rows.
+
+    Args:
+      data: The rows that the table will contain.
+      custom_properties: A dictionary of string to string to set as the custom
+                         properties for all rows.
+    """
+    self.__data = []
+    self.AppendData(data, custom_properties)
+
+  def AppendData(self, data, custom_properties=None):
+    """Appends new data to the table.
+
+    Data is appended in rows. Data must comply with
+    the table schema passed in to __init__(). See CoerceValue() for a list
+    of acceptable data types.
See the class documentation for more information + and examples of schema and data values. + + Args: + data: The row to add to the table. The data must conform to the table + description format. + custom_properties: A dictionary of string to string, representing the + custom properties to add to all the rows. + + Raises: + DataTableException: The data structure does not match the description. + """ + # If the maximal depth is 0, we simply iterate over the data table + # lines and insert them using _InnerAppendData. Otherwise, we simply + # let the _InnerAppendData handle all the levels. + if not self.__columns[-1]["depth"]: + for row in data: + self._InnerAppendData(({}, custom_properties), row, 0) + else: + self._InnerAppendData(({}, custom_properties), data, 0) + + def _InnerAppendData(self, prev_col_values, data, col_index): + """Inner function to assist LoadData.""" + # We first check that col_index has not exceeded the columns size + if col_index >= len(self.__columns): + raise DataTableException("The data does not match description, too deep") + + # Dealing with the scalar case, the data is the last value. + if self.__columns[col_index]["container"] == "scalar": + prev_col_values[0][self.__columns[col_index]["id"]] = data + self.__data.append(prev_col_values) + return + + if self.__columns[col_index]["container"] == "iter": + if not hasattr(data, "__iter__") or isinstance(data, dict): + raise DataTableException("Expected iterable object, got %s" % + type(data)) + # We only need to insert the rest of the columns + # If there are less items than expected, we only add what there is. + for value in data: + if col_index >= len(self.__columns): + raise DataTableException("Too many elements given in data") + prev_col_values[0][self.__columns[col_index]["id"]] = value + col_index += 1 + self.__data.append(prev_col_values) + return + + # We know the current level is a dictionary, we verify the type. + if not isinstance(data, dict): + raise DataTableException("Expected dictionary at current level, got %s" % + type(data)) + # We check if this is the last level + if self.__columns[col_index]["depth"] == self.__columns[-1]["depth"]: + # We need to add the keys in the dictionary as they are + for col in self.__columns[col_index:]: + if col["id"] in data: + prev_col_values[0][col["id"]] = data[col["id"]] + self.__data.append(prev_col_values) + return + + # We have a dictionary in an inner depth level. + if not data.keys(): + # In case this is an empty dictionary, we add a record with the columns + # filled only until this point. + self.__data.append(prev_col_values) + else: + for key in sorted(data): + col_values = dict(prev_col_values[0]) + col_values[self.__columns[col_index]["id"]] = key + self._InnerAppendData((col_values, prev_col_values[1]), + data[key], col_index + 1) + + def _PreparedData(self, order_by=()): + """Prepares the data for enumeration - sorting it by order_by. + + Args: + order_by: Optional. Specifies the name of the column(s) to sort by, and + (optionally) which direction to sort in. Default sort direction + is asc. Following formats are accepted: + "string_col_name" -- For a single key in default (asc) order. + ("string_col_name", "asc|desc") -- For a single key. + [("col_1","asc|desc"), ("col_2","asc|desc")] -- For more than + one column, an array of tuples of (col_name, "asc|desc"). + + Returns: + The data sorted by the keys given. 
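For concreteness, the accepted order_by spellings look like this (an editor's sketch; `table` and the column ids are assumed):

```python
# Hedged sketch of equivalent order_by forms, shown via ToJSon(), which
# forwards order_by to _PreparedData().
table.ToJSon(order_by="a")                            # single key, ascending
table.ToJSon(order_by=("a", "desc"))                  # single key, descending
table.ToJSon(order_by=[("a", "desc"), ("b", "asc")])  # multiple keys
```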
+ + Raises: + DataTableException: Sort direction not in 'asc' or 'desc' + """ + if not order_by: + return self.__data + + proper_sort_keys = [] + if isinstance(order_by, types.StringTypes) or ( + isinstance(order_by, tuple) and len(order_by) == 2 and + order_by[1].lower() in ["asc", "desc"]): + order_by = (order_by,) + for key in order_by: + if isinstance(key, types.StringTypes): + proper_sort_keys.append((key, 1)) + elif (isinstance(key, (list, tuple)) and len(key) == 2 and + key[1].lower() in ("asc", "desc")): + proper_sort_keys.append((key[0], key[1].lower() == "asc" and 1 or -1)) + else: + raise DataTableException("Expected tuple with second value: " + "'asc' or 'desc'") + + def SortCmpFunc(row1, row2): + """cmp function for sorted. Compares by keys and 'asc'/'desc' keywords.""" + for key, asc_mult in proper_sort_keys: + cmp_result = asc_mult * cmp(row1[0].get(key), row2[0].get(key)) + if cmp_result: + return cmp_result + return 0 + + return sorted(self.__data, cmp=SortCmpFunc) + + def ToJSCode(self, name, columns_order=None, order_by=()): + """Writes the data table as a JS code string. + + This method writes a string of JS code that can be run to + generate a DataTable with the specified data. Typically used for debugging + only. + + Args: + name: The name of the table. The name would be used as the DataTable's + variable name in the created JS code. + columns_order: Optional. Specifies the order of columns in the + output table. Specify a list of all column IDs in the order + in which you want the table created. + Note that you must list all column IDs in this parameter, + if you use it. + order_by: Optional. Specifies the name of the column(s) to sort by. + Passed as is to _PreparedData. + + Returns: + A string of JS code that, when run, generates a DataTable with the given + name and the data stored in the DataTable object. + Example result: + "var tab1 = new google.visualization.DataTable(); + tab1.addColumn("string", "a", "a"); + tab1.addColumn("number", "b", "b"); + tab1.addColumn("boolean", "c", "c"); + tab1.addRows(10); + tab1.setCell(0, 0, "a"); + tab1.setCell(0, 1, 1, null, {"foo": "bar"}); + tab1.setCell(0, 2, true); + ... + tab1.setCell(9, 0, "c"); + tab1.setCell(9, 1, 3, "3$"); + tab1.setCell(9, 2, false);" + + Raises: + DataTableException: The data does not match the type. 
+ """ + + encoder = DataTableJSONEncoder() + + if columns_order is None: + columns_order = [col["id"] for col in self.__columns] + col_dict = dict([(col["id"], col) for col in self.__columns]) + + # We first create the table with the given name + jscode = "var %s = new google.visualization.DataTable();\n" % name + if self.custom_properties: + jscode += "%s.setTableProperties(%s);\n" % ( + name, encoder.encode(self.custom_properties)) + + # We add the columns to the table + for i, col in enumerate(columns_order): + jscode += "%s.addColumn(%s, %s, %s);\n" % ( + name, + encoder.encode(col_dict[col]["type"]), + encoder.encode(col_dict[col]["label"]), + encoder.encode(col_dict[col]["id"])) + if col_dict[col]["custom_properties"]: + jscode += "%s.setColumnProperties(%d, %s);\n" % ( + name, i, encoder.encode(col_dict[col]["custom_properties"])) + jscode += "%s.addRows(%d);\n" % (name, len(self.__data)) + + # We now go over the data and add each row + for (i, (row, cp)) in enumerate(self._PreparedData(order_by)): + # We add all the elements of this row by their order + for (j, col) in enumerate(columns_order): + if col not in row or row[col] is None: + continue + value = self.CoerceValue(row[col], col_dict[col]["type"]) + if isinstance(value, tuple): + cell_cp = "" + if len(value) == 3: + cell_cp = ", %s" % encoder.encode(row[col][2]) + # We have a formatted value or custom property as well + jscode += ("%s.setCell(%d, %d, %s, %s%s);\n" % + (name, i, j, + self.EscapeForJSCode(encoder, value[0]), + self.EscapeForJSCode(encoder, value[1]), cell_cp)) + else: + jscode += "%s.setCell(%d, %d, %s);\n" % ( + name, i, j, self.EscapeForJSCode(encoder, value)) + if cp: + jscode += "%s.setRowProperties(%d, %s);\n" % ( + name, i, encoder.encode(cp)) + return jscode + + def ToHtml(self, columns_order=None, order_by=()): + """Writes the data table as an HTML table code string. + + Args: + columns_order: Optional. Specifies the order of columns in the + output table. Specify a list of all column IDs in the order + in which you want the table created. + Note that you must list all column IDs in this parameter, + if you use it. + order_by: Optional. Specifies the name of the column(s) to sort by. + Passed as is to _PreparedData. + + Returns: + An HTML table code string. + Example result (the result is without the newlines): + + + + + + +
abc
1"z"2
"3$""w"
+ + Raises: + DataTableException: The data does not match the type. + """ + table_template = "%s
" + columns_template = "%s" + rows_template = "%s" + row_template = "%s" + header_cell_template = "%s" + cell_template = "%s" + + if columns_order is None: + columns_order = [col["id"] for col in self.__columns] + col_dict = dict([(col["id"], col) for col in self.__columns]) + + columns_list = [] + for col in columns_order: + columns_list.append(header_cell_template % + cgi.escape(col_dict[col]["label"])) + columns_html = columns_template % "".join(columns_list) + + rows_list = [] + # We now go over the data and add each row + for row, unused_cp in self._PreparedData(order_by): + cells_list = [] + # We add all the elements of this row by their order + for col in columns_order: + # For empty string we want empty quotes (""). + value = "" + if col in row and row[col] is not None: + value = self.CoerceValue(row[col], col_dict[col]["type"]) + if isinstance(value, tuple): + # We have a formatted value and we're going to use it + cells_list.append(cell_template % cgi.escape(self.ToString(value[1]))) + else: + cells_list.append(cell_template % cgi.escape(self.ToString(value))) + rows_list.append(row_template % "".join(cells_list)) + rows_html = rows_template % "".join(rows_list) + + return table_template % (columns_html + rows_html) + + def ToCsv(self, columns_order=None, order_by=(), separator=","): + """Writes the data table as a CSV string. + + Output is encoded in UTF-8 because the Python "csv" module can't handle + Unicode properly according to its documentation. + + Args: + columns_order: Optional. Specifies the order of columns in the + output table. Specify a list of all column IDs in the order + in which you want the table created. + Note that you must list all column IDs in this parameter, + if you use it. + order_by: Optional. Specifies the name of the column(s) to sort by. + Passed as is to _PreparedData. + separator: Optional. The separator to use between the values. + + Returns: + A CSV string representing the table. + Example result: + 'a','b','c' + 1,'z',2 + 3,'w','' + + Raises: + DataTableException: The data does not match the type. + """ + + csv_buffer = cStringIO.StringIO() + writer = csv.writer(csv_buffer, delimiter=separator) + + if columns_order is None: + columns_order = [col["id"] for col in self.__columns] + col_dict = dict([(col["id"], col) for col in self.__columns]) + + writer.writerow([col_dict[col]["label"].encode("utf-8") + for col in columns_order]) + + # We now go over the data and add each row + for row, unused_cp in self._PreparedData(order_by): + cells_list = [] + # We add all the elements of this row by their order + for col in columns_order: + value = "" + if col in row and row[col] is not None: + value = self.CoerceValue(row[col], col_dict[col]["type"]) + if isinstance(value, tuple): + # We have a formatted value. Using it only for date/time types. + if col_dict[col]["type"] in ["date", "datetime", "timeofday"]: + cells_list.append(self.ToString(value[1]).encode("utf-8")) + else: + cells_list.append(self.ToString(value[0]).encode("utf-8")) + else: + cells_list.append(self.ToString(value).encode("utf-8")) + writer.writerow(cells_list) + return csv_buffer.getvalue() + + def ToTsvExcel(self, columns_order=None, order_by=()): + """Returns a file in tab-separated-format readable by MS Excel. + + Returns a file in UTF-16 little endian encoding, with tabs separating the + values. + + Args: + columns_order: Delegated to ToCsv. + order_by: Delegated to ToCsv. + + Returns: + A tab-separated little endian UTF16 file representing the table. 
+ """ + return (self.ToCsv(columns_order, order_by, separator="\t") + .decode("utf-8").encode("UTF-16LE")) + + def _ToJSonObj(self, columns_order=None, order_by=()): + """Returns an object suitable to be converted to JSON. + + Args: + columns_order: Optional. A list of all column IDs in the order in which + you want them created in the output table. If specified, + all column IDs must be present. + order_by: Optional. Specifies the name of the column(s) to sort by. + Passed as is to _PreparedData(). + + Returns: + A dictionary object for use by ToJSon or ToJSonResponse. + """ + if columns_order is None: + columns_order = [col["id"] for col in self.__columns] + col_dict = dict([(col["id"], col) for col in self.__columns]) + + # Creating the column JSON objects + col_objs = [] + for col_id in columns_order: + col_obj = {"id": col_dict[col_id]["id"], + "label": col_dict[col_id]["label"], + "type": col_dict[col_id]["type"]} + if col_dict[col_id]["custom_properties"]: + col_obj["p"] = col_dict[col_id]["custom_properties"] + col_objs.append(col_obj) + + # Creating the rows jsons + row_objs = [] + for row, cp in self._PreparedData(order_by): + cell_objs = [] + for col in columns_order: + value = self.CoerceValue(row.get(col, None), col_dict[col]["type"]) + if value is None: + cell_obj = None + elif isinstance(value, tuple): + cell_obj = {"v": value[0]} + if len(value) > 1 and value[1] is not None: + cell_obj["f"] = value[1] + if len(value) == 3: + cell_obj["p"] = value[2] + else: + cell_obj = {"v": value} + cell_objs.append(cell_obj) + row_obj = {"c": cell_objs} + if cp: + row_obj["p"] = cp + row_objs.append(row_obj) + + json_obj = {"cols": col_objs, "rows": row_objs} + if self.custom_properties: + json_obj["p"] = self.custom_properties + + return json_obj + + def ToJSon(self, columns_order=None, order_by=()): + """Returns a string that can be used in a JS DataTable constructor. + + This method writes a JSON string that can be passed directly into a Google + Visualization API DataTable constructor. Use this output if you are + hosting the visualization HTML on your site, and want to code the data + table in Python. Pass this string into the + google.visualization.DataTable constructor, e.g,: + ... on my page that hosts my visualization ... + google.setOnLoadCallback(drawTable); + function drawTable() { + var data = new google.visualization.DataTable(_my_JSon_string, 0.6); + myTable.draw(data); + } + + Args: + columns_order: Optional. Specifies the order of columns in the + output table. Specify a list of all column IDs in the order + in which you want the table created. + Note that you must list all column IDs in this parameter, + if you use it. + order_by: Optional. Specifies the name of the column(s) to sort by. + Passed as is to _PreparedData(). + + Returns: + A JSon constructor string to generate a JS DataTable with the data + stored in the DataTable object. + Example result (the result is without the newlines): + {cols: [{id:"a",label:"a",type:"number"}, + {id:"b",label:"b",type:"string"}, + {id:"c",label:"c",type:"number"}], + rows: [{c:[{v:1},{v:"z"},{v:2}]}, c:{[{v:3,f:"3$"},{v:"w"},{v:null}]}], + p: {'foo': 'bar'}} + + Raises: + DataTableException: The data does not match the type. 
+ """ + + encoder = DataTableJSONEncoder() + return encoder.encode( + self._ToJSonObj(columns_order, order_by)).encode("utf-8") + + def ToJSonResponse(self, columns_order=None, order_by=(), req_id=0, + response_handler="google.visualization.Query.setResponse"): + """Writes a table as a JSON response that can be returned as-is to a client. + + This method writes a JSON response to return to a client in response to a + Google Visualization API query. This string can be processed by the calling + page, and is used to deliver a data table to a visualization hosted on + a different page. + + Args: + columns_order: Optional. Passed straight to self.ToJSon(). + order_by: Optional. Passed straight to self.ToJSon(). + req_id: Optional. The response id, as retrieved by the request. + response_handler: Optional. The response handler, as retrieved by the + request. + + Returns: + A JSON response string to be received by JS the visualization Query + object. This response would be translated into a DataTable on the + client side. + Example result (newlines added for readability): + google.visualization.Query.setResponse({ + 'version':'0.6', 'reqId':'0', 'status':'OK', + 'table': {cols: [...], rows: [...]}}); + + Note: The URL returning this string can be used as a data source by Google + Visualization Gadgets or from JS code. + """ + + response_obj = { + "version": "0.6", + "reqId": str(req_id), + "table": self._ToJSonObj(columns_order, order_by), + "status": "ok" + } + encoder = DataTableJSONEncoder() + return "%s(%s);" % (response_handler, + encoder.encode(response_obj).encode("utf-8")) + + def ToResponse(self, columns_order=None, order_by=(), tqx=""): + """Writes the right response according to the request string passed in tqx. + + This method parses the tqx request string (format of which is defined in + the documentation for implementing a data source of Google Visualization), + and returns the right response according to the request. + It parses out the "out" parameter of tqx, calls the relevant response + (ToJSonResponse() for "json", ToCsv() for "csv", ToHtml() for "html", + ToTsvExcel() for "tsv-excel") and passes the response function the rest of + the relevant request keys. + + Args: + columns_order: Optional. Passed as is to the relevant response function. + order_by: Optional. Passed as is to the relevant response function. + tqx: Optional. The request string as received by HTTP GET. Should be in + the format "key1:value1;key2:value2...". All keys have a default + value, so an empty string will just do the default (which is calling + ToJSonResponse() with no extra parameters). + + Returns: + A response string, as returned by the relevant response function. + + Raises: + DataTableException: One of the parameters passed in tqx is not supported. + """ + tqx_dict = {} + if tqx: + tqx_dict = dict(opt.split(":") for opt in tqx.split(";")) + if tqx_dict.get("version", "0.6") != "0.6": + raise DataTableException( + "Version (%s) passed by request is not supported." 
+          % tqx_dict["version"])
+
+    if tqx_dict.get("out", "json") == "json":
+      response_handler = tqx_dict.get("responseHandler",
+                                      "google.visualization.Query.setResponse")
+      return self.ToJSonResponse(columns_order, order_by,
+                                 req_id=tqx_dict.get("reqId", 0),
+                                 response_handler=response_handler)
+    elif tqx_dict["out"] == "html":
+      return self.ToHtml(columns_order, order_by)
+    elif tqx_dict["out"] == "csv":
+      return self.ToCsv(columns_order, order_by)
+    elif tqx_dict["out"] == "tsv-excel":
+      return self.ToTsvExcel(columns_order, order_by)
+    else:
+      raise DataTableException(
+          "'out' parameter: '%s' is not supported" % tqx_dict["out"])
diff --git a/libs/libaom/src/test/hadamard_test.cc b/libs/libaom/src/test/hadamard_test.cc
new file mode 100644
index 000000000..7903259e7
--- /dev/null
+++ b/libs/libaom/src/test/hadamard_test.cc
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <algorithm>
+#include <ostream>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+
+typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride,
+                             tran_low_t *b);
+
+void HadamardLoop(const tran_low_t *a, tran_low_t *out) {
+  tran_low_t b[8];
+  for (int i = 0; i < 8; i += 2) {
+    b[i + 0] = a[i * 8] + a[(i + 1) * 8];
+    b[i + 1] = a[i * 8] - a[(i + 1) * 8];
+  }
+  tran_low_t c[8];
+  for (int i = 0; i < 8; i += 4) {
+    c[i + 0] = b[i + 0] + b[i + 2];
+    c[i + 1] = b[i + 1] + b[i + 3];
+    c[i + 2] = b[i + 0] - b[i + 2];
+    c[i + 3] = b[i + 1] - b[i + 3];
+  }
+  out[0] = c[0] + c[4];
+  out[7] = c[1] + c[5];
+  out[3] = c[2] + c[6];
+  out[4] = c[3] + c[7];
+  out[2] = c[0] - c[4];
+  out[6] = c[1] - c[5];
+  out[1] = c[2] - c[6];
+  out[5] = c[3] - c[7];
+}
+
+void ReferenceHadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) {
+  tran_low_t input[64];
+  tran_low_t buf[64];
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      input[i * 8 + j] = static_cast<tran_low_t>(a[i * a_stride + j]);
+    }
+  }
+  for (int i = 0; i < 8; ++i) HadamardLoop(input + i, buf + i * 8);
+  for (int i = 0; i < 8; ++i) HadamardLoop(buf + i, b + i * 8);
+}
+
+void ReferenceHadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
+  /* The source is a 16x16 block. The destination is rearranged to 8x32.
+   * Input is 9 bit. */
+  ReferenceHadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
+  ReferenceHadamard8x8(a + 8 + 0 * a_stride, a_stride, b + 64);
+  ReferenceHadamard8x8(a + 0 + 8 * a_stride, a_stride, b + 128);
+  ReferenceHadamard8x8(a + 8 + 8 * a_stride, a_stride, b + 192);
+
+  /* Overlay the 8x8 blocks and combine. */
+  for (int i = 0; i < 64; ++i) {
+    /* 8x8 steps the range up to 15 bits. */
+    const tran_low_t a0 = b[0];
+    const tran_low_t a1 = b[64];
+    const tran_low_t a2 = b[128];
+    const tran_low_t a3 = b[192];
+
+    /* Prevent the result from escaping int16_t. */
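+    /* The final 2x2 butterfly below recombines the four quadrant
+     * transforms; halving each intermediate first keeps the stored
+     * 16-bit outputs from overflowing. */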
+    const tran_low_t b0 = (a0 + a1) >> 1;
+    const tran_low_t b1 = (a0 - a1) >> 1;
+    const tran_low_t b2 = (a2 + a3) >> 1;
+    const tran_low_t b3 = (a2 - a3) >> 1;
+
+    /* Store a 16 bit value. */
+    b[0] = b0 + b2;
+    b[64] = b1 + b3;
+    b[128] = b0 - b2;
+    b[192] = b1 - b3;
+
+    ++b;
+  }
+}
+
+void ReferenceHadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) {
+  ReferenceHadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0);
+  ReferenceHadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256);
+  ReferenceHadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512);
+  ReferenceHadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768);
+
+  for (int i = 0; i < 256; ++i) {
+    const tran_low_t a0 = b[0];
+    const tran_low_t a1 = b[256];
+    const tran_low_t a2 = b[512];
+    const tran_low_t a3 = b[768];
+
+    const tran_low_t b0 = (a0 + a1) >> 2;
+    const tran_low_t b1 = (a0 - a1) >> 2;
+    const tran_low_t b2 = (a2 + a3) >> 2;
+    const tran_low_t b3 = (a2 - a3) >> 2;
+
+    b[0] = b0 + b2;
+    b[256] = b1 + b3;
+    b[512] = b0 - b2;
+    b[768] = b1 - b3;
+
+    ++b;
+  }
+}
+
+struct HadamardFuncWithSize {
+  HadamardFuncWithSize(HadamardFunc f, int s) : func(f), block_size(s) {}
+  HadamardFunc func;
+  int block_size;
+};
+
+std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) {
+  return os << "block size: " << hfs.block_size;
+}
+
+class HadamardTestBase
+    : public ::testing::TestWithParam<HadamardFuncWithSize> {
+ public:
+  virtual void SetUp() {
+    h_func_ = GetParam().func;
+    bwh_ = GetParam().block_size;
+    block_size_ = bwh_ * bwh_;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  virtual int16_t Rand() = 0;
+
+  void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b,
+                         int bwh) {
+    if (bwh == 32)
+      ReferenceHadamard32x32(a, a_stride, b);
+    else if (bwh == 16)
+      ReferenceHadamard16x16(a, a_stride, b);
+    else
+      ReferenceHadamard8x8(a, a_stride, b);
+  }
+
+  void CompareReferenceRandom() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(a, 0, sizeof(a));
+    memset(b, 0, sizeof(b));
+
+    tran_low_t b_ref[kMaxBlockSize];
+    memset(b_ref, 0, sizeof(b_ref));
+
+    for (int i = 0; i < block_size_; ++i) a[i] = Rand();
+
+    ReferenceHadamard(a, bwh_, b_ref, bwh_);
+    ASM_REGISTER_STATE_CHECK(h_func_(a, bwh_, b));
+
+    // The order of the output is not important. Sort before checking.
+    std::sort(b, b + block_size_);
+    std::sort(b_ref, b_ref + block_size_);
+    EXPECT_EQ(memcmp(b, b_ref, sizeof(b)), 0);
+  }
+
+  void VaryStride() {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
+    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    memset(a, 0, sizeof(a));
+    for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand();
+
+    tran_low_t b_ref[kMaxBlockSize];
+    for (int i = 8; i < 64; i += 8) {
+      memset(b, 0, sizeof(b));
+      memset(b_ref, 0, sizeof(b_ref));
+
+      ReferenceHadamard(a, i, b_ref, bwh_);
+      ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+      // The order of the output is not important. Sort before checking.
+      std::sort(b, b + block_size_);
+      std::sort(b_ref, b_ref + block_size_);
+      EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+    }
+  }
+
+  void SpeedTest(int times) {
+    const int kMaxBlockSize = 32 * 32;
+    DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, tran_low_t, output[kMaxBlockSize]);
+    memset(input, 1, sizeof(input));
+    memset(output, 0, sizeof(output));
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < times; ++i) {
+      h_func_(input, bwh_, output);
+    }
+    aom_usec_timer_mark(&timer);
+
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    printf("Hadamard%dx%d[%12d runs]: %d us\n", bwh_, bwh_, times,
+           elapsed_time);
+  }
+
+  ACMRandom rnd_;
+
+ private:
+  int bwh_;
+  int block_size_;
+  HadamardFunc h_func_;
+};
+
+class HadamardLowbdTest : public HadamardTestBase {
+ public:
+  virtual int16_t Rand() { return rnd_.Rand9Signed(); }
+};
+
+TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); }
+
+TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_c, 8),
+                      HadamardFuncWithSize(&aom_hadamard_16x16_c, 16),
+                      HadamardFuncWithSize(&aom_hadamard_32x32_c, 32)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_sse2, 8),
+                      HadamardFuncWithSize(&aom_hadamard_16x16_sse2, 16),
+                      HadamardFuncWithSize(&aom_hadamard_32x32_sse2, 32)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_16x16_avx2, 16),
+                      HadamardFuncWithSize(&aom_hadamard_32x32_avx2, 32)));
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_neon, 8),
+                      HadamardFuncWithSize(&aom_hadamard_16x16_neon, 16)));
+#endif  // HAVE_NEON
+
+}  // namespace
diff --git a/libs/libaom/src/test/hash_test.cc b/libs/libaom/src/test/hash_test.cc
new file mode 100644
index 000000000..eb964ac5f
--- /dev/null
+++ b/libs/libaom/src/test/hash_test.cc
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_ports/aom_timer.h" +#include "av1/encoder/hash.h" +#include "test/acm_random.h" +#include "test/util.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +typedef uint32_t (*get_crc32c_value_func)(void *calculator, uint8_t *p, + size_t length); + +typedef std::tuple HashParam; + +class AV1Crc32cHashTest : public ::testing::TestWithParam { + public: + ~AV1Crc32cHashTest(); + void SetUp(); + + void TearDown(); + + protected: + void RunCheckOutput(get_crc32c_value_func test_impl); + void RunSpeedTest(get_crc32c_value_func test_impl); + + void RunZeroTest(get_crc32c_value_func test_impl); + + libaom_test::ACMRandom rnd_; + CRC32C calc_; + uint8_t *buffer_; + int bsize_; + size_t length_; +}; + +AV1Crc32cHashTest::~AV1Crc32cHashTest() { ; } + +void AV1Crc32cHashTest::SetUp() { + rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); + av1_crc32c_calculator_init(&calc_); + + bsize_ = GET_PARAM(1); + length_ = bsize_ * bsize_ * sizeof(uint16_t); + buffer_ = new uint8_t[length_]; + ASSERT_TRUE(buffer_ != NULL); + for (size_t i = 0; i < length_; ++i) { + buffer_[i] = rnd_.Rand8(); + } +} + +void AV1Crc32cHashTest::TearDown() { delete[] buffer_; } + +void AV1Crc32cHashTest::RunCheckOutput(get_crc32c_value_func test_impl) { + get_crc32c_value_func ref_impl = av1_get_crc32c_value_c; + // for the same buffer crc should be the same + uint32_t crc0 = test_impl(&calc_, buffer_, length_); + uint32_t crc1 = test_impl(&calc_, buffer_, length_); + uint32_t crc2 = ref_impl(&calc_, buffer_, length_); + ASSERT_EQ(crc0, crc1); + ASSERT_EQ(crc0, crc2); // should equal to software version + // modify buffer + buffer_[0] += 1; + uint32_t crc3 = test_impl(&calc_, buffer_, length_); + uint32_t crc4 = ref_impl(&calc_, buffer_, length_); + ASSERT_NE(crc0, crc3); // crc shoud not equal to previous one + ASSERT_EQ(crc3, crc4); +} + +void AV1Crc32cHashTest::RunSpeedTest(get_crc32c_value_func test_impl) { + get_crc32c_value_func impls[] = { av1_get_crc32c_value_c, test_impl }; + const int repeat = 10000000 / (bsize_ + bsize_); + + aom_usec_timer timer; + double time[2]; + for (int i = 0; i < 2; ++i) { + aom_usec_timer_start(&timer); + for (int j = 0; j < repeat; ++j) { + impls[i](&calc_, buffer_, length_); + } + aom_usec_timer_mark(&timer); + time[i] = static_cast(aom_usec_timer_elapsed(&timer)); + } + printf("hash %3dx%-3d:%7.2f/%7.2fus", bsize_, bsize_, time[0], time[1]); + printf("(%3.2f)\n", time[0] / time[1]); +} + +void AV1Crc32cHashTest::RunZeroTest(get_crc32c_value_func test_impl) { + uint8_t buffer0[1024] = { 0 }; + // for buffer with different size the crc should not be the same + const uint32_t crc0 = test_impl(&calc_, buffer0, 32); + const uint32_t crc1 = test_impl(&calc_, buffer0, 128); + const uint32_t crc2 = test_impl(&calc_, buffer0, 1024); + ASSERT_NE(crc0, crc1); + ASSERT_NE(crc0, crc2); + ASSERT_NE(crc1, crc2); +} + +TEST_P(AV1Crc32cHashTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } + +TEST_P(AV1Crc32cHashTest, CheckZero) { RunZeroTest(GET_PARAM(0)); } + +TEST_P(AV1Crc32cHashTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } + +const int kValidBlockSize[] = { 64, 32, 8, 4 }; + +INSTANTIATE_TEST_SUITE_P( + C, AV1Crc32cHashTest, + ::testing::Combine(::testing::Values(&av1_get_crc32c_value_c), + ::testing::ValuesIn(kValidBlockSize))); + +#if HAVE_SSE4_2 +INSTANTIATE_TEST_SUITE_P( + SSE4_2, AV1Crc32cHashTest, + 
+    ::testing::Combine(::testing::Values(&av1_get_crc32c_value_sse4_2),
+                       ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+}  // namespace
diff --git a/libs/libaom/src/test/hbd_metrics_test.cc b/libs/libaom/src/test/hbd_metrics_test.cc
new file mode 100644
index 000000000..5b03beee7
--- /dev/null
+++ b/libs/libaom/src/test/hbd_metrics_test.cc
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_dsp/ssim.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/msvc.h"
+#include "aom_scale/yv12config.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+
+typedef double (*LBDMetricFunc)(const YV12_BUFFER_CONFIG *source,
+                                const YV12_BUFFER_CONFIG *dest);
+typedef double (*HBDMetricFunc)(const YV12_BUFFER_CONFIG *source,
+                                const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                                uint32_t bd);
+
+double compute_hbd_psnr(const YV12_BUFFER_CONFIG *source,
+                        const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                        uint32_t bd) {
+  PSNR_STATS psnr;
+  aom_calc_highbd_psnr(source, dest, &psnr, bd, in_bd);
+  return psnr.psnr[0];
+}
+
+double compute_psnr(const YV12_BUFFER_CONFIG *source,
+                    const YV12_BUFFER_CONFIG *dest) {
+  PSNR_STATS psnr;
+  aom_calc_psnr(source, dest, &psnr);
+  return psnr.psnr[0];
+}
+
+double compute_hbd_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                           const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                           uint32_t bd) {
+  double tempy, tempu, tempv;
+  return aom_psnrhvs(source, dest, &tempy, &tempu, &tempv, bd, in_bd);
+}
+
+double compute_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                       const YV12_BUFFER_CONFIG *dest) {
+  double tempy, tempu, tempv;
+  return aom_psnrhvs(source, dest, &tempy, &tempu, &tempv, 8, 8);
+}
+
+double compute_hbd_fastssim(const YV12_BUFFER_CONFIG *source,
+                            const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                            uint32_t bd) {
+  double tempy, tempu, tempv;
+  return aom_calc_fastssim(source, dest, &tempy, &tempu, &tempv, bd, in_bd);
+}
+
+double compute_fastssim(const YV12_BUFFER_CONFIG *source,
+                        const YV12_BUFFER_CONFIG *dest) {
+  double tempy, tempu, tempv;
+  return aom_calc_fastssim(source, dest, &tempy, &tempu, &tempv, 8, 8);
+}
+
+double compute_hbd_aomssim(const YV12_BUFFER_CONFIG *source,
+                           const YV12_BUFFER_CONFIG *dest, uint32_t in_bd,
+                           uint32_t bd) {
+  double ssim, weight;
+  ssim = aom_highbd_calc_ssim(source, dest, &weight, bd, in_bd);
+  return 100 * pow(ssim / weight, 8.0);
+}
+
+double compute_aomssim(const YV12_BUFFER_CONFIG *source,
+                       const YV12_BUFFER_CONFIG *dest) {
+  double ssim, weight;
+  ssim = aom_calc_ssim(source, dest, &weight);
+  return 100 * pow(ssim / weight, 8.0);
+}
+
+class HBDMetricsTestBase {
+ public:
+  virtual ~HBDMetricsTestBase() {}
+
+ protected:
+  void RunAccuracyCheck() {
+    const int width = 1920;
+    const int height = 1080;
+    size_t i = 0;
+    const uint8_t kPixFiller = 128;
+    YV12_BUFFER_CONFIG lbd_src, lbd_dst;
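+    // The high-bit-depth pair below holds the same pixels as the 8-bit pair,
+    // shifted up to bit_depth_ bits in the fill loop further down, so both
+    // metric paths are fed identical content.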
+    YV12_BUFFER_CONFIG hbd_src, hbd_dst;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    double lbd_db, hbd_db;
+
+    memset(&lbd_src, 0, sizeof(lbd_src));
+    memset(&lbd_dst, 0, sizeof(lbd_dst));
+    memset(&hbd_src, 0, sizeof(hbd_src));
+    memset(&hbd_dst, 0, sizeof(hbd_dst));
+
+    aom_alloc_frame_buffer(&lbd_src, width, height, 1, 1, 0, 32, 16);
+    aom_alloc_frame_buffer(&lbd_dst, width, height, 1, 1, 0, 32, 16);
+    aom_alloc_frame_buffer(&hbd_src, width, height, 1, 1, 1, 32, 16);
+    aom_alloc_frame_buffer(&hbd_dst, width, height, 1, 1, 1, 32, 16);
+
+    memset(lbd_src.buffer_alloc, kPixFiller, lbd_src.buffer_alloc_sz);
+    while (i < lbd_src.buffer_alloc_sz) {
+      uint16_t spel, dpel;
+      spel = lbd_src.buffer_alloc[i];
+      // Create some distortion for dst buffer.
+      dpel = rnd.Rand8();
+      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+      ((uint16_t *)(hbd_src.buffer_alloc))[i] = spel << (bit_depth_ - 8);
+      ((uint16_t *)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
+      i++;
+    }
+
+    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+    i = 0;
+    while (i < lbd_src.buffer_alloc_sz) {
+      uint16_t dpel;
+      // Create some small distortion for dst buffer.
+      dpel = 120 + (rnd.Rand8() >> 4);
+      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+      ((uint16_t *)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
+      i++;
+    }
+
+    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+    i = 0;
+    while (i < lbd_src.buffer_alloc_sz) {
+      uint16_t dpel;
+      // Create some small distortion for dst buffer.
+      dpel = 126 + (rnd.Rand8() >> 6);
+      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+      ((uint16_t *)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
+      i++;
+    }
+
+    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+    aom_free_frame_buffer(&lbd_src);
+    aom_free_frame_buffer(&lbd_dst);
+    aom_free_frame_buffer(&hbd_src);
+    aom_free_frame_buffer(&hbd_dst);
+  }
+
+  int input_bit_depth_;
+  int bit_depth_;
+  double threshold_;
+  LBDMetricFunc lbd_metric_;
+  HBDMetricFunc hbd_metric_;
+};
+
+typedef std::tuple<LBDMetricFunc, HBDMetricFunc, int, int, double>
+    MetricTestTParam;
+class HBDMetricsTest : public HBDMetricsTestBase,
+                       public ::testing::TestWithParam<MetricTestTParam> {
+ public:
+  virtual void SetUp() {
+    lbd_metric_ = GET_PARAM(0);
+    hbd_metric_ = GET_PARAM(1);
+    input_bit_depth_ = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
+    threshold_ = GET_PARAM(4);
+  }
+  virtual void TearDown() {}
+};
+
+TEST_P(HBDMetricsTest, RunAccuracyCheck) { RunAccuracyCheck(); }
+
+// Allow small variation due to floating point operations.
+static const double kSsim_thresh = 0.001;
+// Allow some additional errors accumulated in floating point operations.
+static const double kFSsim_thresh = 0.03;
+// Allow some extra variation due to rounding error accumulated in dct.
+static const double kPhvs_thresh = 0.3; + +INSTANTIATE_TEST_SUITE_P( + AOMSSIM, HBDMetricsTest, + ::testing::Values(MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim, + 8, 10, kSsim_thresh), + MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim, + 10, 10, kPhvs_thresh), + MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim, + 8, 12, kSsim_thresh), + MetricTestTParam(&compute_aomssim, &compute_hbd_aomssim, + 12, 12, kPhvs_thresh))); +INSTANTIATE_TEST_SUITE_P( + FASTSSIM, HBDMetricsTest, + ::testing::Values(MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim, + 8, 10, kFSsim_thresh), + MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim, + 10, 10, kFSsim_thresh), + MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim, + 8, 12, kFSsim_thresh), + MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim, + 12, 12, kFSsim_thresh))); +INSTANTIATE_TEST_SUITE_P( + PSNRHVS, HBDMetricsTest, + ::testing::Values(MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, + 8, 10, kPhvs_thresh), + MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, + 10, 10, kPhvs_thresh), + MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, + 8, 12, kPhvs_thresh), + MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, + 12, 12, kPhvs_thresh))); +INSTANTIATE_TEST_SUITE_P( + PSNR, HBDMetricsTest, + ::testing::Values( + MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 10, kPhvs_thresh), + MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 10, 10, + kPhvs_thresh), + MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 12, kPhvs_thresh), + MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 12, 12, + kPhvs_thresh))); +} // namespace diff --git a/libs/libaom/src/test/hiprec_convolve_test.cc b/libs/libaom/src/test/hiprec_convolve_test.cc new file mode 100644 index 000000000..59d28e883 --- /dev/null +++ b/libs/libaom/src/test/hiprec_convolve_test.cc @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/hiprec_convolve_test_util.h" + +using libaom_test::ACMRandom; +#if CONFIG_AV1_HIGHBITDEPTH +using libaom_test::AV1HighbdHiprecConvolve::AV1HighbdHiprecConvolveTest; +#endif +using libaom_test::AV1HiprecConvolve::AV1HiprecConvolveTest; +using std::make_tuple; +using std::tuple; + +namespace { + +TEST_P(AV1HiprecConvolveTest, CheckOutput) { RunCheckOutput(GET_PARAM(3)); } +TEST_P(AV1HiprecConvolveTest, DISABLED_SpeedTest) { + RunSpeedTest(GET_PARAM(3)); +} +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, AV1HiprecConvolveTest, + libaom_test::AV1HiprecConvolve::BuildParams( + av1_wiener_convolve_add_src_sse2)); +#endif +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, AV1HiprecConvolveTest, + libaom_test::AV1HiprecConvolve::BuildParams( + av1_wiener_convolve_add_src_avx2)); +#endif +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AV1HiprecConvolveTest, + libaom_test::AV1HiprecConvolve::BuildParams( + av1_wiener_convolve_add_src_neon)); +#endif + +#if CONFIG_AV1_HIGHBITDEPTH +#if HAVE_SSSE3 || HAVE_AVX2 +TEST_P(AV1HighbdHiprecConvolveTest, CheckOutput) { + RunCheckOutput(GET_PARAM(4)); +} +TEST_P(AV1HighbdHiprecConvolveTest, DISABLED_SpeedTest) { + RunSpeedTest(GET_PARAM(4)); +} +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P(SSSE3, AV1HighbdHiprecConvolveTest, + libaom_test::AV1HighbdHiprecConvolve::BuildParams( + av1_highbd_wiener_convolve_add_src_ssse3)); +#endif +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, AV1HighbdHiprecConvolveTest, + libaom_test::AV1HighbdHiprecConvolve::BuildParams( + av1_highbd_wiener_convolve_add_src_avx2)); +#endif +#endif +#endif // CONFIG_AV1_HIGHBITDEPTH + +} // namespace diff --git a/libs/libaom/src/test/hiprec_convolve_test_util.cc b/libs/libaom/src/test/hiprec_convolve_test_util.cc new file mode 100644 index 000000000..956af7fc8 --- /dev/null +++ b/libs/libaom/src/test/hiprec_convolve_test_util.cc @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include "test/hiprec_convolve_test_util.h"
+
+#include "av1/common/restoration.h"
+
+using std::make_tuple;
+using std::tuple;
+
+namespace libaom_test {
+
+// Generate a random pair of filter kernels, using the ranges
+// of possible values from the loop-restoration experiment
+static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
+                             InterpKernel vkernel, int kernel_type = 2) {
+  if (kernel_type == 0) {
+    // Low possible values for filter coefficients
+    hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MINV;
+    hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MINV;
+    hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MINV;
+    hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+    hkernel[7] = vkernel[7] = 0;
+  } else if (kernel_type == 1) {
+    // Max possible values for filter coefficients
+    hkernel[0] = hkernel[6] = vkernel[0] = vkernel[6] = WIENER_FILT_TAP0_MAXV;
+    hkernel[1] = hkernel[5] = vkernel[1] = vkernel[5] = WIENER_FILT_TAP1_MAXV;
+    hkernel[2] = hkernel[4] = vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MAXV;
+    hkernel[3] = vkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+    hkernel[7] = vkernel[7] = 0;
+  } else {
+    // Randomly generated values for filter coefficients
+    hkernel[0] = hkernel[6] =
+        WIENER_FILT_TAP0_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 1 - WIENER_FILT_TAP0_MINV);
+    hkernel[1] = hkernel[5] =
+        WIENER_FILT_TAP1_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 1 - WIENER_FILT_TAP1_MINV);
+    hkernel[2] = hkernel[4] =
+        WIENER_FILT_TAP2_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
+    hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+    hkernel[7] = 0;
+
+    vkernel[0] = vkernel[6] =
+        WIENER_FILT_TAP0_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP0_MAXV + 2 - WIENER_FILT_TAP0_MINV);
+    vkernel[1] = vkernel[5] =
+        WIENER_FILT_TAP1_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP1_MAXV + 2 - WIENER_FILT_TAP1_MINV);
+    vkernel[2] = vkernel[4] =
+        WIENER_FILT_TAP2_MINV +
+        rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 2 - WIENER_FILT_TAP2_MINV);
+    vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
+    vkernel[7] = 0;
+  }
+}
+
+namespace AV1HiprecConvolve {
+
+::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
+    hiprec_convolve_func filter) {
+  const HiprecConvolveParam params[] = {
+    make_tuple(8, 8, 50000, filter),   make_tuple(8, 4, 50000, filter),
+    make_tuple(64, 24, 1000, filter),  make_tuple(64, 64, 1000, filter),
+    make_tuple(64, 56, 1000, filter),  make_tuple(32, 8, 10000, filter),
+    make_tuple(32, 28, 10000, filter), make_tuple(32, 32, 10000, filter),
+    make_tuple(16, 34, 10000, filter), make_tuple(32, 34, 10000, filter),
+    make_tuple(64, 34, 1000, filter),  make_tuple(8, 17, 10000, filter),
+    make_tuple(16, 17, 10000, filter), make_tuple(32, 17, 10000, filter)
+  };
+  return ::testing::ValuesIn(params);
+}
+
+AV1HiprecConvolveTest::~AV1HiprecConvolveTest() {}
+void AV1HiprecConvolveTest::SetUp() {
+  rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1HiprecConvolveTest::TearDown() { libaom_test::ClearSystemState(); }
+
+void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
+  const int w = 128, h = 128;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2);
+  int i, j, k, m;
+  const ConvolveParams conv_params = get_conv_params_wiener(8);
+
+  uint8_t *input_ = new uint8_t[h * w];
+  uint8_t *input = input_;
+
+  // The AVX2 convolve functions always write rows with widths that are
+  // multiples of 16. So to avoid a buffer overflow, we may need to pad
+  // rows to a multiple of 16.
+  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+  uint8_t *output = new uint8_t[output_n];
+  uint8_t *output2 = new uint8_t[output_n];
+
+  // Generate random filter kernels
+  DECLARE_ALIGNED(16, InterpKernel, hkernel);
+  DECLARE_ALIGNED(16, InterpKernel, vkernel);
+
+  for (int kernel_type = 0; kernel_type < 3; kernel_type++) {
+    generate_kernels(&rnd_, hkernel, vkernel, kernel_type);
+    for (i = 0; i < num_iters; ++i) {
+      for (k = 0; k < h; ++k)
+        for (m = 0; m < w; ++m) input[k * w + m] = rnd_.Rand8();
+      // Choose random locations within the source block
+      int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
+      int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
+      av1_wiener_convolve_add_src_c(input + offset_r * w + offset_c, w, output,
+                                    out_w, hkernel, 16, vkernel, 16, out_w,
+                                    out_h, &conv_params);
+      test_impl(input + offset_r * w + offset_c, w, output2, out_w, hkernel,
+                16, vkernel, 16, out_w, out_h, &conv_params);
+
+      for (j = 0; j < out_w * out_h; ++j)
+        ASSERT_EQ(output[j], output2[j])
+            << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", "
+            << (j / out_w) << ") on iteration " << i;
+    }
+  }
+  delete[] input_;
+  delete[] output;
+  delete[] output2;
+}
+
+void AV1HiprecConvolveTest::RunSpeedTest(hiprec_convolve_func test_impl) {
+  const int w = 128, h = 128;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2) / 500;
+  int i, j, k;
+  const ConvolveParams conv_params = get_conv_params_wiener(8);
+
+  uint8_t *input_ = new uint8_t[h * w];
+  uint8_t *input = input_;
+
+  // The AVX2 convolve functions always write rows with widths that are
+  // multiples of 16. So to avoid a buffer overflow, we may need to pad
+  // rows to a multiple of 16.
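+  // ALIGN_POWER_OF_TWO(out_w, 4) rounds out_w up to the next multiple of
+  // 1 << 4 = 16, e.g. a 24-wide block gets 32 samples of row storage.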
+  int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h;
+  uint8_t *output = new uint8_t[output_n];
+  uint8_t *output2 = new uint8_t[output_n];
+
+  // Generate random filter kernels
+  DECLARE_ALIGNED(16, InterpKernel, hkernel);
+  DECLARE_ALIGNED(16, InterpKernel, vkernel);
+
+  generate_kernels(&rnd_, hkernel, vkernel);
+
+  for (i = 0; i < h; ++i)
+    for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand8();
+
+  aom_usec_timer ref_timer;
+  aom_usec_timer_start(&ref_timer);
+  for (i = 0; i < num_iters; ++i) {
+    for (j = 3; j < h - out_h - 4; j++) {
+      for (k = 3; k < w - out_w - 4; k++) {
+        av1_wiener_convolve_add_src_c(input + j * w + k, w, output, out_w,
+                                      hkernel, 16, vkernel, 16, out_w, out_h,
+                                      &conv_params);
+      }
+    }
+  }
+  aom_usec_timer_mark(&ref_timer);
+  const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+  aom_usec_timer tst_timer;
+  aom_usec_timer_start(&tst_timer);
+  for (i = 0; i < num_iters; ++i) {
+    for (j = 3; j < h - out_h - 4; j++) {
+      for (k = 3; k < w - out_w - 4; k++) {
+        test_impl(input + j * w + k, w, output2, out_w, hkernel, 16, vkernel,
+                  16, out_w, out_h, &conv_params);
+      }
+    }
+  }
+  aom_usec_timer_mark(&tst_timer);
+  const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+  std::cout << "[          ] C time = " << ref_time / 1000
+            << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+  EXPECT_GT(ref_time, tst_time)
+      << "Error: AV1HiprecConvolveTest.SpeedTest, SIMD slower than C.\n"
+      << "C time: " << ref_time << " us\n"
+      << "SIMD time: " << tst_time << " us\n";
+
+  delete[] input_;
+  delete[] output;
+  delete[] output2;
+}
+}  // namespace AV1HiprecConvolve
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace AV1HighbdHiprecConvolve {
+
+::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
+    highbd_hiprec_convolve_func filter) {
+  const HighbdHiprecConvolveParam params[] = {
+    make_tuple(8, 8, 50000, 8, filter),   make_tuple(64, 64, 1000, 8, filter),
+    make_tuple(32, 8, 10000, 8, filter),  make_tuple(8, 8, 50000, 10, filter),
+    make_tuple(64, 64, 1000, 10, filter), make_tuple(32, 8, 10000, 10, filter),
+    make_tuple(8, 8, 50000, 12, filter),  make_tuple(64, 64, 1000, 12, filter),
+    make_tuple(32, 8, 10000, 12, filter),
+  };
+  return ::testing::ValuesIn(params);
+}
+
+AV1HighbdHiprecConvolveTest::~AV1HighbdHiprecConvolveTest() {}
+void AV1HighbdHiprecConvolveTest::SetUp() {
+  rnd_.Reset(ACMRandom::DeterministicSeed());
+}
+
+void AV1HighbdHiprecConvolveTest::TearDown() {
+  libaom_test::ClearSystemState();
+}
+
+void AV1HighbdHiprecConvolveTest::RunCheckOutput(
+    highbd_hiprec_convolve_func test_impl) {
+  const int w = 128, h = 128;
+  const int out_w = GET_PARAM(0), out_h = GET_PARAM(1);
+  const int num_iters = GET_PARAM(2);
+  const int bd = GET_PARAM(3);
+  int i, j;
+  const ConvolveParams conv_params = get_conv_params_wiener(bd);
+
+  uint16_t *input = new uint16_t[h * w];
+
+  // The AVX2 convolve functions always write rows with widths that are
+  // multiples of 16. So to avoid a buffer overflow, we may need to pad
+  // rows to a multiple of 16.
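+  // The buffers below hold 16-bit samples, so output_n counts samples (not
+  // bytes); the multiple-of-16 row padding applies per sample row as well.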
+ int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h; + uint16_t *output = new uint16_t[output_n]; + uint16_t *output2 = new uint16_t[output_n]; + + // Generate random filter kernels + DECLARE_ALIGNED(16, InterpKernel, hkernel); + DECLARE_ALIGNED(16, InterpKernel, vkernel); + + for (i = 0; i < h; ++i) + for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + + uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input); + uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output); + uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2); + for (int kernel_type = 0; kernel_type < 3; kernel_type++) { + generate_kernels(&rnd_, hkernel, vkernel, kernel_type); + for (i = 0; i < num_iters; ++i) { + // Choose random locations within the source block + int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); + int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); + av1_highbd_wiener_convolve_add_src_c( + input_ptr + offset_r * w + offset_c, w, output_ptr, out_w, hkernel, + 16, vkernel, 16, out_w, out_h, &conv_params, bd); + test_impl(input_ptr + offset_r * w + offset_c, w, output2_ptr, out_w, + hkernel, 16, vkernel, 16, out_w, out_h, &conv_params, bd); + + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(output[j], output2[j]) + << "Pixel mismatch at index " << j << " = (" << (j % out_w) << ", " + << (j / out_w) << ") on iteration " << i; + } + } + delete[] input; + delete[] output; + delete[] output2; +} + +void AV1HighbdHiprecConvolveTest::RunSpeedTest( + highbd_hiprec_convolve_func test_impl) { + const int w = 128, h = 128; + const int out_w = GET_PARAM(0), out_h = GET_PARAM(1); + const int num_iters = GET_PARAM(2) / 500; + const int bd = GET_PARAM(3); + int i, j, k; + const ConvolveParams conv_params = get_conv_params_wiener(bd); + + uint16_t *input = new uint16_t[h * w]; + + // The AVX2 convolve functions always write rows with widths that are + // multiples of 16. So to avoid a buffer overflow, we may need to pad + // rows to a multiple of 16. 
+ int output_n = ALIGN_POWER_OF_TWO(out_w, 4) * out_h; + uint16_t *output = new uint16_t[output_n]; + uint16_t *output2 = new uint16_t[output_n]; + + // Generate random filter kernels + DECLARE_ALIGNED(16, InterpKernel, hkernel); + DECLARE_ALIGNED(16, InterpKernel, vkernel); + + generate_kernels(&rnd_, hkernel, vkernel); + + for (i = 0; i < h; ++i) + for (j = 0; j < w; ++j) input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1); + + uint8_t *input_ptr = CONVERT_TO_BYTEPTR(input); + uint8_t *output_ptr = CONVERT_TO_BYTEPTR(output); + uint8_t *output2_ptr = CONVERT_TO_BYTEPTR(output2); + + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (i = 0; i < num_iters; ++i) { + for (j = 3; j < h - out_h - 4; j++) { + for (k = 3; k < w - out_w - 4; k++) { + av1_highbd_wiener_convolve_add_src_c( + input_ptr + j * w + k, w, output_ptr, out_w, hkernel, 16, vkernel, + 16, out_w, out_h, &conv_params, bd); + } + } + } + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (i = 0; i < num_iters; ++i) { + for (j = 3; j < h - out_h - 4; j++) { + for (k = 3; k < w - out_w - 4; k++) { + test_impl(input_ptr + j * w + k, w, output2_ptr, out_w, hkernel, 16, + vkernel, 16, out_w, out_h, &conv_params, bd); + } + } + } + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "[ ] C time = " << ref_time / 1000 + << " ms, SIMD time = " << tst_time / 1000 << " ms\n"; + + EXPECT_GT(ref_time, tst_time) + << "Error: AV1HighbdHiprecConvolveTest.SpeedTest, SIMD slower than C.\n" + << "C time: " << ref_time << " us\n" + << "SIMD time: " << tst_time << " us\n"; + + delete[] input; + delete[] output; + delete[] output2; +} +} // namespace AV1HighbdHiprecConvolve +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace libaom_test diff --git a/libs/libaom/src/test/hiprec_convolve_test_util.h b/libs/libaom/src/test/hiprec_convolve_test_util.h new file mode 100644 index 000000000..6b6da4ee8 --- /dev/null +++ b/libs/libaom/src/test/hiprec_convolve_test_util.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#ifndef AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+#define AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
+
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom_ports/aom_timer.h"
+#include "av1/common/convolve.h"
+#include "av1/common/mv.h"
+
+namespace libaom_test {
+
+namespace AV1HiprecConvolve {
+
+typedef void (*hiprec_convolve_func)(const uint8_t *src, ptrdiff_t src_stride,
+                                     uint8_t *dst, ptrdiff_t dst_stride,
+                                     const int16_t *filter_x, int x_step_q4,
+                                     const int16_t *filter_y, int y_step_q4,
+                                     int w, int h,
+                                     const ConvolveParams *conv_params);
+
+typedef std::tuple<int, int, int, hiprec_convolve_func> HiprecConvolveParam;
+
+::testing::internal::ParamGenerator<HiprecConvolveParam> BuildParams(
+    hiprec_convolve_func filter);
+
+class AV1HiprecConvolveTest
+    : public ::testing::TestWithParam<HiprecConvolveParam> {
+ public:
+  virtual ~AV1HiprecConvolveTest();
+  virtual void SetUp();
+
+  virtual void TearDown();
+
+ protected:
+  void RunCheckOutput(hiprec_convolve_func test_impl);
+  void RunSpeedTest(hiprec_convolve_func test_impl);
+
+  libaom_test::ACMRandom rnd_;
+};
+
+}  // namespace AV1HiprecConvolve
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace AV1HighbdHiprecConvolve {
+typedef void (*highbd_hiprec_convolve_func)(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h,
+    const ConvolveParams *conv_params, int bps);
+
+typedef std::tuple<int, int, int, int, highbd_hiprec_convolve_func>
+    HighbdHiprecConvolveParam;
+
+::testing::internal::ParamGenerator<HighbdHiprecConvolveParam> BuildParams(
+    highbd_hiprec_convolve_func filter);
+
+class AV1HighbdHiprecConvolveTest
+    : public ::testing::TestWithParam<HighbdHiprecConvolveParam> {
+ public:
+  virtual ~AV1HighbdHiprecConvolveTest();
+  virtual void SetUp();
+
+  virtual void TearDown();
+
+ protected:
+  void RunCheckOutput(highbd_hiprec_convolve_func test_impl);
+  void RunSpeedTest(highbd_hiprec_convolve_func test_impl);
+
+  libaom_test::ACMRandom rnd_;
+};
+
+}  // namespace AV1HighbdHiprecConvolve
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_HIPREC_CONVOLVE_TEST_UTIL_H_
diff --git a/libs/libaom/src/test/horver_correlation_test.cc b/libs/libaom/src/test/horver_correlation_test.cc
new file mode 100644
index 000000000..ccb8eddd0
--- /dev/null
+++ b/libs/libaom/src/test/horver_correlation_test.cc
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*HorverFunc)(const int16_t *diff, int stride, int w, int h,
+                           float *hcorr, float *vcorr);
+
+typedef std::tuple<HorverFunc> HorverTestParam;
+
+class HorverTest : public ::testing::TestWithParam<HorverTestParam> {
+ public:
+  virtual void SetUp() {
+    data_buf_ = (int16_t *)aom_malloc(MAX_SB_SQUARE * sizeof(int16_t));
+    ASSERT_NE(data_buf_, nullptr);
+    target_func_ = GET_PARAM(0);
+  }
+  virtual void TearDown() { aom_free(data_buf_); }
+  void RunHorverTest(void);
+  void RunHorverTest_ExtremeValues(void);
+  void RunHorverSpeedTest(int run_times);
+
+ private:
+  HorverFunc target_func_;
+  ACMRandom rng_;
+  int16_t *data_buf_;
+};
+
+void HorverTest::RunHorverTest(void) {
+  for (int block_size = 0; block_size < BLOCK_SIZES_ALL; block_size++) {
+    const int w = block_size_wide[block_size];
+    const int h = block_size_high[block_size];
+    for (int iter = 0; iter < 1000 && !HasFatalFailure(); ++iter) {
+      float hcorr_ref = 0.0, vcorr_ref = 0.0;
+      float hcorr_test = 0.0, vcorr_test = 0.0;
+
+      for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+        data_buf_[i] = (rng_.Rand16() % (1 << 12)) - (1 << 11);
+      }
+
+      av1_get_horver_correlation_full_c(data_buf_, MAX_SB_SIZE, w, h,
+                                        &hcorr_ref, &vcorr_ref);
+
+      target_func_(data_buf_, MAX_SB_SIZE, w, h, &hcorr_test, &vcorr_test);
+
+      ASSERT_LE(fabs(hcorr_ref - hcorr_test), 1e-6)
+          << "hcorr incorrect (" << w << "x" << h << ")";
+      ASSERT_LE(fabs(vcorr_ref - vcorr_test), 1e-6)
+          << "vcorr incorrect (" << w << "x" << h << ")";
+    }
+    // printf("(%3dx%-3d) passed\n", w, h);
+  }
+}
+
+void HorverTest::RunHorverSpeedTest(int run_times) {
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    data_buf_[i] = rng_.Rand16() % (1 << 12);
+  }
+
+  for (int block_size = 0; block_size < BLOCK_SIZES_ALL; block_size++) {
+    const int w = block_size_wide[block_size];
+    const int h = block_size_high[block_size];
+    float hcorr_ref = 0.0, vcorr_ref = 0.0;
+    float hcorr_test = 0.0, vcorr_test = 0.0;
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_get_horver_correlation_full_c(data_buf_, MAX_SB_SIZE, w, h,
+                                        &hcorr_ref, &vcorr_ref);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(data_buf_, MAX_SB_SIZE, w, h, &hcorr_test, &vcorr_test);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    printf("%3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", w, h, time1, time2,
+           time1 / time2);
+  }
+}
+
+void HorverTest::RunHorverTest_ExtremeValues(void) {
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+    // Most of get_horver_test is squaring and summing, so simply saturating
+    // the whole buffer is most likely to cause an overflow.
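+    // (1 << 12) - 1 = 4095, the largest 12-bit sample value.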
+    data_buf_[i] = (1 << 12) - 1;
+  }
+
+  for (int block_size = 0; block_size < BLOCK_SIZES_ALL; block_size++) {
+    const int w = block_size_wide[block_size];
+    const int h = block_size_high[block_size];
+    float hcorr_ref = 0.0, vcorr_ref = 0.0;
+    float hcorr_test = 0.0, vcorr_test = 0.0;
+
+    av1_get_horver_correlation_full_c(data_buf_, MAX_SB_SIZE, w, h, &hcorr_ref,
+                                      &vcorr_ref);
+    target_func_(data_buf_, MAX_SB_SIZE, w, h, &hcorr_test, &vcorr_test);
+
+    ASSERT_LE(fabs(hcorr_ref - hcorr_test), 1e-6) << "hcorr incorrect";
+    ASSERT_LE(fabs(vcorr_ref - vcorr_test), 1e-6) << "vcorr incorrect";
+  }
+}
+
+TEST_P(HorverTest, RandomValues) { RunHorverTest(); }
+
+TEST_P(HorverTest, ExtremeValues) { RunHorverTest_ExtremeValues(); }
+
+TEST_P(HorverTest, DISABLED_Speed) { RunHorverSpeedTest(100000); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, HorverTest,
+    ::testing::Values(av1_get_horver_correlation_full_sse4_1));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, HorverTest, ::testing::Values(av1_get_horver_correlation_full_avx2));
+#endif  // HAVE_AVX2
+
+}  // namespace
diff --git a/libs/libaom/src/test/horz_superres_test.cc b/libs/libaom/src/test/horz_superres_test.cc
new file mode 100644
index 000000000..938b0b15a
--- /dev/null
+++ b/libs/libaom/src/test/horz_superres_test.cc
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+#include <ostream>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "av1/encoder/encoder.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+using std::make_tuple;
+using std::tuple;
+
+/* TESTING PARAMETERS */
+
+const int kBitrate = 40;
+
+typedef struct {
+  const char *filename;
+  aom_img_fmt fmt;
+  aom_bit_depth_t bit_depth;
+  unsigned int profile;
+  unsigned int limit;
+  unsigned int screen_content;
+  double psnr_threshold;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << " limit:" << test_arg.limit
+            << " screen_content:" << test_arg.screen_content
+            << " psnr_threshold:" << test_arg.psnr_threshold << " }";
+}
+
+const TestVideoParam kTestVideoVectors[] = {
+  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.5 },
+#if CONFIG_AV1_HIGHBITDEPTH
+  { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0,
+    28.0 },
+#endif
+  { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 20.0 },
+  // Image coding (single frame).
+  { "niklas_1280_720_30.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 1, 0, 32.0 },
+};
+
+// Modes with extra params have their own tests.
+const SUPERRES_MODE kSuperresModesWithoutParams[] = { SUPERRES_RANDOM,
+                                                      SUPERRES_AUTO };
+
+// Superres denominators and superres kf denominators to be tested
+typedef tuple<int, int> SuperresDenominatorPair;
+const SuperresDenominatorPair kSuperresDenominators[] = {
+  make_tuple(16, 9),  make_tuple(13, 11), make_tuple(9, 9),
+  make_tuple(13, 13), make_tuple(11, 16), make_tuple(8, 16),
+  make_tuple(16, 8),  make_tuple(8, 8),   make_tuple(9, 14),
+};
+
+// Superres q thresholds and superres kf q thresholds to be tested
+typedef tuple<int, int> SuperresQThresholdPair;
+const SuperresQThresholdPair kSuperresQThresholds[] = {
+  make_tuple(63, 63), make_tuple(63, 41), make_tuple(17, 63),
+  make_tuple(41, 11), make_tuple(1, 37),  make_tuple(11, 11),
+  make_tuple(1, 1),   make_tuple(17, 29), make_tuple(29, 11),
+};
+
+/* END (TESTING PARAMETERS) */
+
+// Test parameter list:
+// <[needed for EncoderTest], test_video_param_, superres_mode_>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam, SUPERRES_MODE>
+    HorzSuperresTestParam;
+
+class HorzSuperresEndToEndTest
+    : public ::testing::TestWithParam<HorzSuperresTestParam>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  HorzSuperresEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) {}
+
+  virtual ~HorzSuperresEndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kTwoPassGood);
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_Q;
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    // Set superres parameters
+    cfg_.rc_superres_mode = superres_mode_;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    frame_count_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    frame_count_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+
+      // Set cpu-used = 8 for speed
+      encoder->Control(AOME_SET_CPUUSED, 8);
+
+      // Test screen coding tools
+      if (test_video_param_.screen_content)
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      else
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (frame_count_) return psnr_ / frame_count_;
+    return 0.0;
+  }
+
+  void DoTest() {
+    std::unique_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                test_video_param_.limit));
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, test_video_param_.psnr_threshold)
+        << "superres_mode_ = " << superres_mode_;
+
+    EXPECT_EQ(test_video_param_.limit, frame_count_)
+        << "superres_mode_ = " << superres_mode_;
+  }
+
+  TestVideoParam test_video_param_;
+  SUPERRES_MODE superres_mode_;
+
+ private:
+  double psnr_;
+  unsigned int frame_count_;
+};
+
+TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); }
+
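+// AV1_INSTANTIATE_TEST_CASE instantiates the suite over the cross product of
+// the AV1 codec factory and the parameter lists passed below.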
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresEndToEndTest,
+                          ::testing::ValuesIn(kTestVideoVectors),
+                          ::testing::ValuesIn(kSuperresModesWithoutParams));
+
+// Test parameter list:
+// <[needed for EncoderTest], test_video_param_, tuple(superres_denom_,
+// superres_kf_denom_)>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
+              SuperresDenominatorPair>
+    HorzSuperresFixedTestParam;
+
+class HorzSuperresFixedEndToEndTest
+    : public ::testing::TestWithParam<HorzSuperresFixedTestParam>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  HorzSuperresFixedEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        superres_mode_(SUPERRES_FIXED), psnr_(0.0), frame_count_(0) {
+    SuperresDenominatorPair denoms = GET_PARAM(2);
+    superres_denom_ = std::get<0>(denoms);
+    superres_kf_denom_ = std::get<1>(denoms);
+  }
+
+  virtual ~HorzSuperresFixedEndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kTwoPassGood);
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    // Set superres parameters
+    cfg_.rc_superres_mode = superres_mode_;
+    cfg_.rc_superres_denominator = superres_denom_;
+    cfg_.rc_superres_kf_denominator = superres_kf_denom_;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    frame_count_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    frame_count_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+
+      // Set cpu-used = 8 for speed
+      encoder->Control(AOME_SET_CPUUSED, 8);
+
+      // Test screen coding tools
+      if (test_video_param_.screen_content)
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      else
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (frame_count_) return psnr_ / frame_count_;
+    return 0.0;
+  }
+
+  void DoTest() {
+    std::unique_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                test_video_param_.limit));
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, test_video_param_.psnr_threshold)
+        << "superres_mode_ = " << superres_mode_
+        << ", superres_denom_ = " << superres_denom_
+        << ", superres_kf_denom_ = " << superres_kf_denom_;
+
+    EXPECT_EQ(test_video_param_.limit, frame_count_)
+        << "superres_mode_ = " << superres_mode_
+        << ", superres_denom_ = " << superres_denom_
+        << ", superres_kf_denom_ = " << superres_kf_denom_;
+  }
+
+  TestVideoParam test_video_param_;
+  SUPERRES_MODE superres_mode_;
+  int superres_denom_;
+  int superres_kf_denom_;
+
+ private:
+  double psnr_;
+  unsigned int frame_count_;
+};
+
+TEST_P(HorzSuperresFixedEndToEndTest, HorzSuperresFixedTestParam) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresFixedEndToEndTest,
+                          ::testing::ValuesIn(kTestVideoVectors),
+                          ::testing::ValuesIn(kSuperresDenominators));
+
+// Test parameter list:
+// <[needed for EncoderTest], test_video_param_,
+// tuple(superres_qthresh_,superres_kf_qthresh_)>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
+              SuperresQThresholdPair>
+    HorzSuperresQThreshTestParam;
+
+class HorzSuperresQThreshEndToEndTest
+    : public ::testing::TestWithParam<HorzSuperresQThreshTestParam>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  HorzSuperresQThreshEndToEndTest()
+      : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+        superres_mode_(SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) {
+    SuperresQThresholdPair qthresholds = GET_PARAM(2);
+    superres_qthresh_ = std::get<0>(qthresholds);
+    superres_kf_qthresh_ = std::get<1>(qthresholds);
+  }
+
+  virtual ~HorzSuperresQThreshEndToEndTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kTwoPassGood);
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_target_bitrate = kBitrate;
+    cfg_.g_error_resilient = 0;
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    // Set superres parameters
+    cfg_.rc_superres_mode = superres_mode_;
+    cfg_.rc_superres_qthresh = superres_qthresh_;
+    cfg_.rc_superres_kf_qthresh = superres_kf_qthresh_;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    frame_count_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    frame_count_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 0);
+
+      // Set cpu-used = 8 for speed
+      encoder->Control(AOME_SET_CPUUSED, 8);
+
+      // Test screen coding tools
+      if (test_video_param_.screen_content)
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+      else
+        encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (frame_count_) return psnr_ / frame_count_;
+    return 0.0;
+  }
+
+  void DoTest() {
+    std::unique_ptr<libaom_test::VideoSource> video;
+    video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                test_video_param_.limit));
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, test_video_param_.psnr_threshold)
+        << "superres_mode_ = " << superres_mode_
+        << ", superres_qthresh_ = " << superres_qthresh_
+        << ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
+
+    EXPECT_EQ(test_video_param_.limit, frame_count_)
+        << "superres_mode_ = " << superres_mode_
+        << ", superres_qthresh_ = " << superres_qthresh_
+        << ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
+  }
+
+  TestVideoParam test_video_param_;
+  SUPERRES_MODE superres_mode_;
+  int superres_qthresh_;
+  int superres_kf_qthresh_;
+
+ private:
+  double psnr_;
+  unsigned int frame_count_;
+};
+
+TEST_P(HorzSuperresQThreshEndToEndTest, HorzSuperresQThreshEndToEndPSNRTest) {
+  DoTest();
+}
+
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresQThreshEndToEndTest,
+                          ::testing::ValuesIn(kTestVideoVectors),
+                          ::testing::ValuesIn(kSuperresQThresholds));
+
+}  // namespace
diff --git a/libs/libaom/src/test/i420_video_source.h b/libs/libaom/src/test/i420_video_source.h
new file mode 100644 index 000000000..233e7152b --- /dev/null +++ b/libs/libaom/src/test/i420_video_source.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_TEST_I420_VIDEO_SOURCE_H_ +#define AOM_TEST_I420_VIDEO_SOURCE_H_ +#include +#include +#include + +#include "test/yuv_video_source.h" + +namespace libaom_test { + +// This class extends VideoSource to allow parsing of raw yv12 +// so that we can do actual file encodes. +class I420VideoSource : public YUVVideoSource { + public: + I420VideoSource(const std::string &file_name, unsigned int width, + unsigned int height, int rate_numerator, int rate_denominator, + unsigned int start, int limit) + : YUVVideoSource(file_name, AOM_IMG_FMT_I420, width, height, + rate_numerator, rate_denominator, start, limit) {} +}; + +} // namespace libaom_test + +#endif // AOM_TEST_I420_VIDEO_SOURCE_H_ diff --git a/libs/libaom/src/test/intra_edge_test.cc b/libs/libaom/src/test/intra_edge_test.cc new file mode 100644 index 000000000..f7702c952 --- /dev/null +++ b/libs/libaom/src/test/intra_edge_test.cc @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include
+#include
+#include
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+#include "test/function_equivalence_test.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+template <typename F, typename T>
+class UpsampleTest : public FunctionEquivalenceTest<F> {
+ protected:
+  static const int kIterations = 1000000;
+  static const int kMinEdge = 4;
+  static const int kMaxEdge = 24;
+  static const int kBufSize = 2 * 64 + 32;
+  static const int kOffset = 16;
+
+  virtual ~UpsampleTest() {}
+
+  virtual void Execute(T *edge_tst) = 0;
+
+  void Common() {
+    edge_ref_ = &edge_ref_data_[kOffset];
+    edge_tst_ = &edge_tst_data_[kOffset];
+
+    Execute(edge_tst_);
+
+    const int max_idx = (size_ - 1) * 2;
+    for (int r = -2; r <= max_idx; ++r) {
+      ASSERT_EQ(edge_ref_[r], edge_tst_[r]);
+    }
+  }
+
+  T edge_ref_data_[kBufSize];
+  T edge_tst_data_[kBufSize];
+
+  T *edge_ref_;
+  T *edge_tst_;
+
+  int size_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*UP8B)(uint8_t *p, int size);
+typedef libaom_test::FuncParam<UP8B> TestFuncs;
+
+class UpsampleTest8B : public UpsampleTest<UP8B, uint8_t> {
+ protected:
+  void Execute(uint8_t *edge_tst) {
+    params_.ref_func(edge_ref_, size_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_));
+  }
+};
+
+TEST_P(UpsampleTest8B, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    size_ = 4 * (this->rng_(4) + 1);
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_.Rand8();
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = edge_ref_data_[i];
+    }
+
+    // Extend final sample
+    while (i < kBufSize) {
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+      i++;
+    }
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, UpsampleTest8B,
+    ::testing::Values(TestFuncs(av1_upsample_intra_edge_c,
+                                av1_upsample_intra_edge_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*UPHB)(uint16_t *p, int size, int bd);
+typedef libaom_test::FuncParam<UPHB> TestFuncsHBD;
+
+class UpsampleTestHB : public UpsampleTest<UPHB, uint16_t> {
+ protected:
+  void Execute(uint16_t *edge_tst) {
+    params_.ref_func(edge_ref_, size_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, bit_depth_));
+  }
+  int bit_depth_;
+};
+
+TEST_P(UpsampleTestHB, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+      case 0: bit_depth_ = 8; break;
+      case 1: bit_depth_ = 10; break;
+      default: bit_depth_ = 12; break;
+    }
+    const int hi = 1 << bit_depth_;
+
+    size_ = 4 * (this->rng_(4) + 1);
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_(hi);
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+    }
+
+    // Extend final sample
+    while (i < kBufSize) {
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+      i++;
+    }
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, UpsampleTestHB,
+    ::testing::Values(TestFuncsHBD(av1_upsample_intra_edge_high_c,
+                                   av1_upsample_intra_edge_high_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+template <typename F, typename T>
+class FilterEdgeTest : public FunctionEquivalenceTest<F> {
+ protected:
+  static const int kIterations = 1000000;
+  static const int kMaxEdge = 2 * 64;
+  static const int kBufSize = kMaxEdge + 32;
+  static const int kOffset = 15;
+
+  virtual ~FilterEdgeTest() {}
+
+  virtual void Execute(T *edge_tst) = 0;
+
+  void Common() {
+    edge_ref_ = &edge_ref_data_[kOffset];
+    edge_tst_ = &edge_tst_data_[kOffset];
+
+    Execute(edge_tst_);
+
+    for (int r = 0; r < size_; ++r) {
+      ASSERT_EQ(edge_ref_[r], edge_tst_[r]);
+    }
+  }
+
+  T edge_ref_data_[kBufSize];
+  T edge_tst_data_[kBufSize];
+
+  T *edge_ref_;
+  T *edge_tst_;
+
+  int size_;
+  int strength_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FE8B)(uint8_t *p, int size, int strength);
+typedef libaom_test::FuncParam<FE8B> FilterEdgeTestFuncs;
+
+class FilterEdgeTest8B : public FilterEdgeTest<FE8B, uint8_t> {
+ protected:
+  void Execute(uint8_t *edge_tst) {
+    params_.ref_func(edge_ref_, size_, strength_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+  }
+};
+
+TEST_P(FilterEdgeTest8B, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    strength_ = this->rng_(4);
+    size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_.Rand8();
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+    }
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, FilterEdgeTest8B,
+    ::testing::Values(FilterEdgeTestFuncs(av1_filter_intra_edge_c,
+                                          av1_filter_intra_edge_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FEHB)(uint16_t *p, int size, int strength);
+typedef libaom_test::FuncParam<FEHB> FilterEdgeTestFuncsHBD;
+
+class FilterEdgeTestHB : public FilterEdgeTest<FEHB, uint16_t> {
+ protected:
+  void Execute(uint16_t *edge_tst) {
+    params_.ref_func(edge_ref_, size_, strength_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+  }
+  int bit_depth_;
+};
+
+TEST_P(FilterEdgeTestHB, RandomValues) {
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+      case 0: bit_depth_ = 8; break;
+      case 1: bit_depth_ = 10; break;
+      default: bit_depth_ = 12; break;
+    }
+    const int hi = 1 << bit_depth_;
+    strength_ = this->rng_(4);
+    size_ = 4 * (this->rng_(128 / 4) + 1) + 1;
+
+    int i, pix = 0;
+    for (i = 0; i < kOffset + size_; ++i) {
+      pix = rng_(hi);
+      edge_ref_data_[i] = pix;
+      edge_tst_data_[i] = pix;
+    }
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, FilterEdgeTestHB,
+                         ::testing::Values(FilterEdgeTestFuncsHBD(
+                             av1_filter_intra_edge_high_c,
+                             av1_filter_intra_edge_high_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+// Speed tests
+
+TEST_P(UpsampleTest8B, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_.Rand8();
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
+  }
+}
+
+TEST_P(UpsampleTestHB, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  bit_depth_ = 12;
+  const int hi = 1 << bit_depth_;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_(hi);
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, bit_depth_));
+  }
+}
+
+TEST_P(FilterEdgeTest8B, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  strength_ = 1;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_.Rand8();
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+    // iterate over filter strengths (1,2,3)
+    strength_ = (strength_ == 3) ? 1 : strength_ + 1;
+  }
+}
+
+TEST_P(FilterEdgeTestHB, DISABLED_Speed) {
+  const int test_count = 10000000;
+  size_ = kMaxEdge;
+  strength_ = 1;
+  bit_depth_ = 12;
+  const int hi = 1 << bit_depth_;
+  for (int i = 0; i < kOffset + size_; ++i) {
+    edge_tst_data_[i] = rng_(hi);
+  }
+  edge_tst_ = &edge_tst_data_[kOffset];
+  for (int iter = 0; iter < test_count; ++iter) {
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+    // iterate over filter strengths (1,2,3)
+    strength_ = (strength_ == 3) ? 1 : strength_ + 1;
+  }
+}
+
+}  // namespace
diff --git a/libs/libaom/src/test/intrabc_test.cc b/libs/libaom/src/test/intrabc_test.cc
new file mode 100644
index 000000000..b57eb6fab
--- /dev/null
+++ b/libs/libaom/src/test/intrabc_test.cc
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/tile_common.h"
+
+namespace {
+TEST(IntrabcTest, DvValidation) {
+  struct DvTestCase {
+    MV dv;
+    int mi_row_offset;
+    int mi_col_offset;
+    BLOCK_SIZE bsize;
+    bool valid;
+  };
+  const int kSubPelScale = 8;
+  const int kTileMaxMibWidth = 8;
+  const DvTestCase kDvCases[] = {
+    { { 0, 0 }, 0, 0, BLOCK_128X128, false },
+    { { 0, 0 }, 0, 0, BLOCK_64X64, false },
+    { { 0, 0 }, 0, 0, BLOCK_32X32, false },
+    { { 0, 0 }, 0, 0, BLOCK_16X16, false },
+    { { 0, 0 }, 0, 0, BLOCK_8X8, false },
+    { { 0, 0 }, 0, 0, BLOCK_4X4, false },
+    { { -MAX_SB_SIZE * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_16X16,
+      true },
+    { { 0, -MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_16X16,
+      false },
+    { { -MAX_SB_SIZE * kSubPelScale, 0 },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_16X16,
+      true },
+    { { MAX_SB_SIZE * kSubPelScale, 0 },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_16X16,
+      false },
+    { { 0, MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_16X16,
+      false },
+    { { -32 * kSubPelScale, -32 * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_32X32,
+      true },
+    { { -32 * kSubPelScale, -32 * kSubPelScale },
+      32 / MI_SIZE,
+      32 / MI_SIZE,
+      BLOCK_32X32,
+      false },
+    { { -32 * kSubPelScale - kSubPelScale / 2, -32 * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_32X32,
+      false },
+    { { -33 * kSubPelScale, -32 * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_32X32,
+      true },
+    { { -32 * kSubPelScale, -32 * kSubPelScale - kSubPelScale / 2 },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_32X32,
+      false },
+    { { -32 * kSubPelScale, -33 * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_32X32,
+      true },
+    { { -MAX_SB_SIZE * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      true },
+    { { -(MAX_SB_SIZE + 1) * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+    { { -MAX_SB_SIZE * kSubPelScale, -(MAX_SB_SIZE + 1) * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+    { { -(MAX_SB_SIZE - 1) * kSubPelScale, -MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+    { { -MAX_SB_SIZE * kSubPelScale, -(MAX_SB_SIZE - 1) * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      true },
+    { { -(MAX_SB_SIZE - 1) * kSubPelScale, -(MAX_SB_SIZE - 1) * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+    { { -MAX_SB_SIZE * kSubPelScale, MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+    { { -MAX_SB_SIZE * kSubPelScale,
+        (kTileMaxMibWidth - 2) * MAX_SB_SIZE * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+    { { -MAX_SB_SIZE * kSubPelScale,
+        ((kTileMaxMibWidth - 2) * MAX_SB_SIZE + 1) * kSubPelScale },
+      MAX_SB_SIZE / MI_SIZE,
+      MAX_SB_SIZE / MI_SIZE,
+      BLOCK_LARGEST,
+      false },
+  };
+
+  MACROBLOCKD xd;
+  memset(&xd, 0, sizeof(xd));
+  xd.tile.mi_row_start = 8 * MAX_MIB_SIZE;
+  xd.tile.mi_row_end = 16 * MAX_MIB_SIZE;
+  xd.tile.mi_col_start = 24 * MAX_MIB_SIZE;
+  xd.tile.mi_col_end = xd.tile.mi_col_start + kTileMaxMibWidth * MAX_MIB_SIZE;
+  xd.plane[1].subsampling_x = 1;
+  xd.plane[1].subsampling_y = 1;
+  xd.plane[2].subsampling_x = 1;
+  xd.plane[2].subsampling_y = 1;
+
+  AV1_COMMON cm;
+  memset(&cm, 0, sizeof(cm));
+
+  for (const DvTestCase &dv_case : kDvCases) {
+    const int mi_row = xd.tile.mi_row_start + dv_case.mi_row_offset;
+    const int mi_col = xd.tile.mi_col_start + dv_case.mi_col_offset;
+    xd.is_chroma_ref = is_chroma_reference(mi_row, mi_col, dv_case.bsize,
+                                           xd.plane[1].subsampling_x,
+                                           xd.plane[1].subsampling_y);
+    EXPECT_EQ(static_cast<int>(dv_case.valid),
+              av1_is_dv_valid(dv_case.dv, &cm, &xd, mi_row, mi_col,
+                              dv_case.bsize, MAX_MIB_SIZE_LOG2));
+  }
+}
+}  // namespace
diff --git a/libs/libaom/src/test/intrapred_test.cc b/libs/libaom/src/test/intrapred_test.cc
new file mode 100644
index 000000000..779cf9a5d
--- /dev/null
+++ b/libs/libaom/src/test/intrapred_test.cc
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/pred_common.h"
+#include "aom_mem/aom_mem.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+
+const int count_test_block = 100000;
+
+typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride,
+                                const uint16_t *above, const uint16_t *left,
+                                int bps);
+typedef void (*IntraPred)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+                          const uint8_t *left);
+
+}  // namespace
+
+// NOTE: Under gcc version 7.3.0 (Debian 7.3.0-5), if this template is in the
+// anonymous namespace, then we get a strange compiler warning in
+// the begin() and end() methods of the ParamGenerator template class in
+// gtest/internal/gtest-param-util.h:
+//   warning: ‘<anonymous>’ is used uninitialized in this function
+// As a workaround, put this template outside the anonymous namespace.
+// See bug aomedia:2003.
+template <typename FuncType>
+struct IntraPredFunc {
+  IntraPredFunc(FuncType pred = NULL, FuncType ref = NULL,
+                int block_width_value = 0, int block_height_value = 0,
+                int bit_depth_value = 0)
+      : pred_fn(pred), ref_fn(ref), block_width(block_width_value),
+        block_height(block_height_value), bit_depth(bit_depth_value) {}
+
+  FuncType pred_fn;
+  FuncType ref_fn;
+  int block_width;
+  int block_height;
+  int bit_depth;
+};
+
+namespace {
+
+template <typename FuncType, typename Pixel>
+class AV1IntraPredTest
+    : public ::testing::TestWithParam<IntraPredFunc<FuncType> > {
+ public:
+  void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst,
+               Pixel *ref_dst) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int block_width = params_.block_width;
+    const int block_height = params_.block_height;
+    above_row_ = above_data + 16;
+    left_col_ = left_col;
+    dst_ = dst;
+    ref_dst_ = ref_dst;
+    int error_count = 0;
+    for (int i = 0; i < count_test_block; ++i) {
+      // Fill edges with random data, try first with saturated values.
+      for (int x = -1; x <= block_width * 2; x++) {
+        if (i == 0) {
+          above_row_[x] = mask_;
+        } else {
+          above_row_[x] = rnd.Rand16() & mask_;
+        }
+      }
+      for (int y = 0; y < block_height; y++) {
+        if (i == 0) {
+          left_col_[y] = mask_;
+        } else {
+          left_col_[y] = rnd.Rand16() & mask_;
+        }
+      }
+      Predict();
+      CheckPrediction(i, &error_count);
+    }
+    ASSERT_EQ(0, error_count);
+  }
+
+ protected:
+  virtual void SetUp() {
+    params_ = this->GetParam();
+    stride_ = params_.block_width * 3;
+    mask_ = (1 << params_.bit_depth) - 1;
+  }
+
+  virtual void Predict() = 0;
+
+  void CheckPrediction(int test_case_number, int *error_count) const {
+    // For each pixel ensure that the calculated value is the same as
+    // reference.
+    const int block_width = params_.block_width;
+    const int block_height = params_.block_height;
+    for (int y = 0; y < block_height; y++) {
+      for (int x = 0; x < block_width; x++) {
+        *error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_];
+        if (*error_count == 1) {
+          ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_])
+              << " Failed on Test Case Number " << test_case_number
+              << " location: x = " << x << " y = " << y;
+        }
+      }
+    }
+  }
+
+  Pixel *above_row_;
+  Pixel *left_col_;
+  Pixel *dst_;
+  Pixel *ref_dst_;
+  ptrdiff_t stride_;
+  int mask_;
+
+  IntraPredFunc<FuncType> params_;
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+class HighbdIntraPredTest
+    : public AV1IntraPredTest<HighbdIntraPred, uint16_t> {
+ protected:
+  void Predict() {
+    const int bit_depth = params_.bit_depth;
+    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
+    ASM_REGISTER_STATE_CHECK(
+        params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
+  }
+};
+#endif
+
+class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
+ protected:
+  void Predict() {
+    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
+    ASM_REGISTER_STATE_CHECK(
+        params_.pred_fn(dst_, stride_, above_row_, left_col_));
+  }
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Suppress an uninitialized warning. Once there are implementations to test
+// then this can be restored.
+TEST_P(HighbdIntraPredTest, Bitexact) {
+  // max block size is 64
+  DECLARE_ALIGNED(16, uint16_t, left_col[2 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[2 * 64 + 64]);
+  DECLARE_ALIGNED(16, uint16_t, dst[3 * 64 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, ref_dst[3 * 64 * 64]);
+  av1_zero(left_col);
+  av1_zero(above_data);
+  RunTest(left_col, above_data, dst, ref_dst);
+}
+#endif
+
+// Same issue as above but for arm.
+#if !HAVE_NEON
+TEST_P(LowbdIntraPredTest, Bitexact) {
+  // max block size is 32
+  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]);
+  DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]);
+  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]);
+  av1_zero(left_col);
+  av1_zero(above_data);
+  RunTest(left_col, above_data, dst, ref_dst);
+}
+#endif  // !HAVE_NEON
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// -----------------------------------------------------------------------------
+// High Bit Depth Tests
+#define highbd_entry(type, width, height, opt, bd)                          \
+  IntraPredFunc<HighbdIntraPred>(                                           \
+      &aom_highbd_##type##_predictor_##width##x##height##_##opt,            \
+      &aom_highbd_##type##_predictor_##width##x##height##_c, width, height, \
+      bd)
+
+#if 0
+#define highbd_intrapred(type, opt, bd)                                       \
+  highbd_entry(type, 4, 4, opt, bd), highbd_entry(type, 4, 8, opt, bd),       \
+      highbd_entry(type, 8, 4, opt, bd), highbd_entry(type, 8, 8, opt, bd),   \
+      highbd_entry(type, 8, 16, opt, bd), highbd_entry(type, 16, 8, opt, bd), \
+      highbd_entry(type, 16, 16, opt, bd),                                    \
+      highbd_entry(type, 16, 32, opt, bd),                                    \
+      highbd_entry(type, 32, 16, opt, bd), highbd_entry(type, 32, 32, opt, bd)
+#endif
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+// ---------------------------------------------------------------------------
+// Low Bit Depth Tests
+
+#define lowbd_entry(type, width, height, opt)                               \
+  IntraPredFunc<IntraPred>(                                                 \
+      &aom_##type##_predictor_##width##x##height##_##opt,                   \
+      &aom_##type##_predictor_##width##x##height##_c, width, height, 8)
+
+#define lowbd_intrapred(type, opt)                                          \
+  lowbd_entry(type, 4, 4, opt), lowbd_entry(type, 4, 8, opt),               \
+      lowbd_entry(type, 8, 4, opt), lowbd_entry(type, 8, 8, opt),           \
+      lowbd_entry(type, 8, 16, opt), lowbd_entry(type, 16, 8, opt),         \
+      lowbd_entry(type, 16, 16, opt), lowbd_entry(type, 16, 32, opt),       \
+      lowbd_entry(type, 32, 16, opt), lowbd_entry(type, 32, 32, opt)
+
+#if HAVE_SSE2
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVector[] = {
+  lowbd_intrapred(dc, sse2),      lowbd_intrapred(dc_top, sse2),
+  lowbd_intrapred(dc_left, sse2), lowbd_intrapred(dc_128, sse2),
+  lowbd_intrapred(v, sse2),       lowbd_intrapred(h, sse2),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVector));
+
+#endif  // HAVE_SSE2
+
+#if HAVE_SSSE3
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorSsse3[] = {
+  lowbd_intrapred(paeth, ssse3),
+  lowbd_intrapred(smooth, ssse3),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVectorSsse3));
+
+#endif  // HAVE_SSSE3
+
+#if HAVE_AVX2
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorAvx2[] = {
+  lowbd_entry(dc, 32, 32, avx2),      lowbd_entry(dc_top, 32, 32, avx2),
+  lowbd_entry(dc_left, 32, 32, avx2), lowbd_entry(dc_128, 32, 32, avx2),
+  lowbd_entry(v, 32, 32, avx2),       lowbd_entry(h, 32, 32, avx2),
+  lowbd_entry(dc, 32, 16, avx2),      lowbd_entry(dc_top, 32, 16, avx2),
+  lowbd_entry(dc_left, 32, 16, avx2), lowbd_entry(dc_128, 32, 16, avx2),
+  lowbd_entry(v, 32, 16, avx2),       lowbd_entry(paeth, 16, 8, avx2),
+  lowbd_entry(paeth, 16, 16, avx2),   lowbd_entry(paeth, 16, 32, avx2),
+  lowbd_entry(paeth, 32, 16, avx2),   lowbd_entry(paeth, 32, 32, avx2),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVectorAvx2));
+
+#endif  // HAVE_AVX2
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#if HAVE_NEON
+const IntraPredFunc<HighbdIntraPred> HighbdIntraPredTestVectorNeon[] = {
+  highbd_entry(dc, 4, 4, neon, 8),   highbd_entry(dc, 8, 8, neon, 8),
+  highbd_entry(dc, 16, 16, neon, 8), highbd_entry(dc, 32, 32, neon, 8),
+  highbd_entry(dc, 64, 64, neon, 8),
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, HighbdIntraPredTest,
+                         ::testing::ValuesIn(HighbdIntraPredTestVectorNeon));
+
+#endif  // HAVE_NEON
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+}  // namespace
diff --git a/libs/libaom/src/test/invalid_file_test.cc b/libs/libaom/src/test/invalid_file_test.cc
new file mode 100644
index 000000000..dd0956d0c
--- /dev/null
+++ b/libs/libaom/src/test/invalid_file_test.cc
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <ostream>
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/ivf_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+
+namespace {
+
+struct DecodeParam {
+  int threads;
+  const char *filename;
+  const char *res_filename;  // If NULL, the result filename is
+                             // filename + ".res".
+};
+
+// Constructs result file name.
+std::string GetResFilename(const DecodeParam &param) {
+  if (param.res_filename != NULL) return param.res_filename;
+  const std::string filename = param.filename;
+  return filename + ".res";
+}
+
+std::ostream &operator<<(std::ostream &os, const DecodeParam &dp) {
+  return os << "threads: " << dp.threads << " file: " << dp.filename
+            << " result file: " << GetResFilename(dp);
+}
+
+class InvalidFileTest : public ::libaom_test::DecoderTest,
+                        public ::libaom_test::CodecTestWithParam<DecodeParam> {
+ protected:
+  InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(NULL) {}
+
+  virtual ~InvalidFileTest() {
+    if (res_file_ != NULL) fclose(res_file_);
+  }
+
+  void OpenResFile(const std::string &res_file_name) {
+    res_file_ = libaom_test::OpenTestDataFile(res_file_name);
+    ASSERT_TRUE(res_file_ != NULL)
+        << "Result file open failed. Filename: " << res_file_name;
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     const unsigned int /*frame_number*/) {
+    EXPECT_NE(img.fb_priv, nullptr);
+  }
+
+  virtual bool HandleDecodeResult(
+      const aom_codec_err_t res_dec,
+      const libaom_test::CompressedVideoSource &video,
+      libaom_test::Decoder *decoder) {
+    EXPECT_TRUE(res_file_ != NULL);
+    int expected_res_dec = -1;
+
+    // Read integer result.
+    const int res = fscanf(res_file_, "%d", &expected_res_dec);
+    EXPECT_NE(res, EOF) << "Read result data failed";
+
+    if (expected_res_dec != -1) {
+      // Check results match.
+      const DecodeParam input = GET_PARAM(1);
+      if (input.threads > 1) {
+        // The serial decode check is too strict for tile-threaded decoding as
+        // there is no guarantee on the decode order nor which specific error
+        // will take precedence. Currently a tile-level error is not forwarded
+        // so the frame will simply be marked corrupt.
+        EXPECT_TRUE(res_dec == expected_res_dec ||
+                    res_dec == AOM_CODEC_CORRUPT_FRAME)
+            << "Results don't match: frame number = " << video.frame_number()
+            << ". (" << decoder->DecodeError()
+            << "). Expected: " << expected_res_dec << " or "
+            << AOM_CODEC_CORRUPT_FRAME;
+      } else {
+        EXPECT_EQ(expected_res_dec, res_dec)
+            << "Results don't match: frame number = " << video.frame_number()
+            << ". (" << decoder->DecodeError() << ")";
+      }
+    }
+
+    return !HasFailure();
+  }
+
+  virtual void HandlePeekResult(libaom_test::Decoder *const /*decoder*/,
+                                libaom_test::CompressedVideoSource * /*video*/,
+                                const aom_codec_err_t /*res_peek*/) {}
+
+  void RunTest() {
+    const DecodeParam input = GET_PARAM(1);
+    aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
+    cfg.threads = input.threads;
+    const std::string filename = input.filename;
+    libaom_test::IVFVideoSource decode_video(filename);
+    decode_video.Init();
+
+    // The result file holds a list of expected integer results, one for each
+    // decoded frame. Any result that doesn't match the file's list will
+    // cause a test failure.
+    const std::string res_filename = GetResFilename(input);
+    OpenResFile(res_filename);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&decode_video, cfg));
+  }
+
+ private:
+  FILE *res_file_;
+};
+
+TEST_P(InvalidFileTest, ReturnCode) { RunTest(); }
+
+// If res_filename (the third field) is NULL, then the result filename is
+// filename + ".res" by default. Set res_filename to a string if the result
+// filename differs from the default.
+const DecodeParam kAV1InvalidFileTests[] = {
+  // { threads, filename, res_filename }
+  { 1, "invalid-bug-1814.ivf", NULL },
+  { 1, "invalid-chromium-906381.ivf", NULL },
+  { 1, "invalid-google-142530197.ivf", NULL },
+  { 1, "invalid-google-142530197-1.ivf", NULL },
+  { 4, "invalid-oss-fuzz-9463.ivf", "invalid-oss-fuzz-9463.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-9720.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10389.ivf", "invalid-oss-fuzz-10389.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-11523.ivf", "invalid-oss-fuzz-11523.ivf.res.2" },
+  { 4, "invalid-oss-fuzz-15363.ivf", NULL },
+  { 1, "invalid-oss-fuzz-16437.ivf", NULL },
+#if CONFIG_AV1_HIGHBITDEPTH
+  // These test vectors contain 10-bit or 12-bit video.
+  { 1, "invalid-oss-fuzz-9288.ivf", NULL },
+  { 1, "invalid-oss-fuzz-9482.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10061.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10227.ivf", NULL },
+  { 4, "invalid-oss-fuzz-10555.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10705.ivf", NULL },
+  { 1, "invalid-oss-fuzz-10723.ivf", "invalid-oss-fuzz-10723.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-10779.ivf", NULL },
+  { 1, "invalid-oss-fuzz-11477.ivf", NULL },
+  { 1, "invalid-oss-fuzz-11479.ivf", "invalid-oss-fuzz-11479.ivf.res.2" },
+#endif
+};
+
+AV1_INSTANTIATE_TEST_CASE(InvalidFileTest,
+                          ::testing::ValuesIn(kAV1InvalidFileTests));
+
+}  // namespace
diff --git a/libs/libaom/src/test/ivf_video_source.h b/libs/libaom/src/test/ivf_video_source.h
new file mode 100644
index 000000000..ff2841445
--- /dev/null
+++ b/libs/libaom/src/test/ivf_video_source.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_IVF_VIDEO_SOURCE_H_
+#define AOM_TEST_IVF_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <new>
+#include <string>
+
+#include "aom_ports/sanitizer.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+const unsigned int kCodeBufferSize = 256 * 1024 * 1024;
+const unsigned int kIvfFileHdrSize = 32;
+const unsigned int kIvfFrameHdrSize = 12;
+
+// Reads a little-endian 32-bit value from |mem|.
+static unsigned int MemGetLe32(const uint8_t *mem) {
+  return (mem[3] << 24) | (mem[2] << 16) | (mem[1] << 8) | (mem[0]);
+}
+
+// This class extends VideoSource to allow parsing of ivf files,
+// so that we can do actual file decodes.
+class IVFVideoSource : public CompressedVideoSource {
+ public:
+  explicit IVFVideoSource(const std::string &file_name)
+      : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL),
+        frame_sz_(0), frame_(0), end_of_file_(false) {}
+
+  virtual ~IVFVideoSource() {
+    delete[] compressed_frame_buf_;
+
+    if (input_file_) fclose(input_file_);
+  }
+
+  virtual void Init() {
+    // Allocate a buffer to read the compressed video frames into.
+    compressed_frame_buf_ = new uint8_t[kCodeBufferSize];
+    ASSERT_TRUE(compressed_frame_buf_ != NULL)
+        << "Allocate frame buffer failed";
+    ASAN_POISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
+  }
+
+  virtual void Begin() {
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL)
+        << "Input file open failed. Filename: " << file_name_;
+
+    // Read the file header.
+    uint8_t file_hdr[kIvfFileHdrSize];
+    ASSERT_EQ(kIvfFileHdrSize, fread(file_hdr, 1, kIvfFileHdrSize, input_file_))
+        << "File header read failed.";
+    // Check the "DKIF" magic that identifies an IVF file.
+    ASSERT_TRUE(file_hdr[0] == 'D' && file_hdr[1] == 'K' &&
+                file_hdr[2] == 'I' && file_hdr[3] == 'F')
+        << "Input is not an IVF file.";
+
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    uint8_t frame_hdr[kIvfFrameHdrSize];
+    // Check the frame header and read a frame from input_file.
+    if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) !=
+        kIvfFrameHdrSize) {
+      end_of_file_ = true;
+    } else {
+      end_of_file_ = false;
+
+      frame_sz_ = MemGetLe32(frame_hdr);
+      ASSERT_LE(frame_sz_, kCodeBufferSize)
+          << "Frame is too big for allocated code buffer";
+      ASAN_UNPOISON_MEMORY_REGION(compressed_frame_buf_, kCodeBufferSize);
+      ASSERT_EQ(frame_sz_,
+                fread(compressed_frame_buf_, 1, frame_sz_, input_file_))
+          << "Failed to read complete frame";
+      ASAN_POISON_MEMORY_REGION(compressed_frame_buf_ + frame_sz_,
+                                kCodeBufferSize - frame_sz_);
+    }
+  }
+
+  virtual const uint8_t *cxdata() const {
+    return end_of_file_ ? NULL : compressed_frame_buf_;
+  }
+  virtual size_t frame_size() const { return frame_sz_; }
+  virtual unsigned int frame_number() const { return frame_; }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  uint8_t *compressed_frame_buf_;
+  size_t frame_sz_;
+  unsigned int frame_;
+  bool end_of_file_;
+};
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_IVF_VIDEO_SOURCE_H_
diff --git a/libs/libaom/src/test/level_test.cc b/libs/libaom/src/test/level_test.cc
new file mode 100644
index 000000000..a9613c5f7
--- /dev/null
+++ b/libs/libaom/src/test/level_test.cc
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+const int kLevelMin = 0;
+const int kLevelMax = 31;
+const int kLevelKeepStats = 24;
+// Speed settings tested.
+static const int kCpuUsedVectors[] = {
+  1,
+  2,
+  3,
+  4,
+};
+
+class LevelTest
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  LevelTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), target_level_(31) {}
+
+  virtual ~LevelTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libaom_test::kRealTime) {
+      cfg_.g_lag_in_frames = 5;
+      cfg_.rc_end_usage = AOM_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = AOM_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level_);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      }
+    }
+
+    encoder->Control(AV1E_GET_SEQ_LEVEL_IDX, level_);
+    ASSERT_LE(level_[0], kLevelMax);
+    ASSERT_GE(level_[0], kLevelMin);
+  }
+
+  libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int target_level_;
+  int level_[32];
+};
+
+TEST_P(LevelTest, TestTargetLevelApi) {
+  static const aom_codec_iface_t *codec = &aom_codec_av1_cx_algo;
+  aom_codec_ctx_t enc;
+  aom_codec_enc_cfg_t cfg;
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(codec, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, codec, &cfg, 0));
+  for (int operating_point = 0; operating_point <= 32; ++operating_point) {
+    for (int level = 0; level <= 32; ++level) {
+      const int target_level = operating_point * 100 + level;
+      // Valid values are the level indices defined by the AV1 spec (0, 1, 4,
+      // 5, 8, 9 and 12-19, i.e. levels 2.0-2.1, 3.0-3.1, 4.0-4.1, 5.0-5.3 and
+      // 6.0-6.3), plus 24 (keep level stats only) and 31 (no level
+      // constraint); target levels for operating points beyond 31 are
+      // ignored.
+      if ((level <= 24 && level != 2 && level != 3 && level != 6 &&
+           level != 7 && level != 10 && level != 11 && level != 20 &&
+           level != 21 && level != 22 && level != 23) ||
+          level == 31 || operating_point > 31) {
+        EXPECT_EQ(AOM_CODEC_OK,
+                  AOM_CODEC_CONTROL_TYPECHECKED(
+                      &enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level));
+      } else {
+        EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+                  AOM_CODEC_CONTROL_TYPECHECKED(
+                      &enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level));
+      }
+    }
+  }
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST_P(LevelTest, TestTargetLevel19) {
+  std::unique_ptr<libaom_test::Y4mVideoSource> video;
+  video.reset(new libaom_test::Y4mVideoSource("park_joy_90p_8_420.y4m", 0, 10));
+  ASSERT_TRUE(video.get() != NULL);
+  // Level index 19 corresponds to level 6.3.
+  target_level_ = 19;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+TEST_P(LevelTest, TestLevelMonitoringLowBitrate) {
+  // To save run time, we only test speed 4.
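+  // level_[0] is the level reported for operating point 0; level index 0
+  // corresponds to level 2.0 and index 1 to level 2.1.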
+ if (cpu_used_ == 4) { + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 40); + target_level_ = kLevelKeepStats; + cfg_.rc_target_bitrate = 1000; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_EQ(level_[0], 0); + } +} + +TEST_P(LevelTest, TestLevelMonitoringHighBitrate) { + // To save run time, we only test speed 4. + if (cpu_used_ == 4) { + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 40); + target_level_ = kLevelKeepStats; + cfg_.rc_target_bitrate = 4000; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_EQ(level_[0], 1); + } +} + +TEST_P(LevelTest, TestTargetLevel0) { + // To save run time, we only test speed 4. + if (cpu_used_ == 4) { + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 50); + const int target_level = 0; + target_level_ = target_level; + cfg_.rc_target_bitrate = 4000; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_EQ(level_[0], target_level); + } +} + +AV1_INSTANTIATE_TEST_CASE(LevelTest, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kCpuUsedVectors)); +} // namespace diff --git a/libs/libaom/src/test/lightfield_test.sh b/libs/libaom/src/test/lightfield_test.sh new file mode 100644 index 000000000..3de88af87 --- /dev/null +++ b/libs/libaom/src/test/lightfield_test.sh @@ -0,0 +1,115 @@ +#!/bin/sh +## Copyright (c) 2018, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the lightfield example. +## +. $(dirname $0)/tools_common.sh + +# Environment check: $infile is required. +lightfield_test_verify_environment() { + local infile="${LIBAOM_TEST_DATA_PATH}/vase10x10.yuv" + if [ ! -e "${infile}" ]; then + echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi +} + +# Run the lightfield example +lightfield_test() { + local img_width=1024 + local img_height=1024 + local lf_width=10 + local lf_height=10 + local lf_blocksize=5 + local num_references=4 + local num_tile_lists=2 + + # Encode the lightfield. + local encoder="${LIBAOM_BIN_PATH}/lightfield_encoder${AOM_TEST_EXE_SUFFIX}" + local yuv_file="${LIBAOM_TEST_DATA_PATH}/vase10x10.yuv" + local lf_file="${AOM_TEST_OUTPUT_DIR}/vase10x10.ivf" + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${encoder}" "${img_width}" "${img_height}" \ + "${yuv_file}" "${lf_file}" "${lf_width}" \ + "${lf_height}" "${lf_blocksize}" ${devnull} + + [ -e "${lf_file}" ] || return 1 + + # Check to ensure all camera frames have the identical frame header. If not identical, this test fails. + for i in ./fh*; do + diff ./fh004 $i > /dev/null + if [ $? -eq 1 ]; then + return 1 + fi + done + + # Check to ensure all camera frames use the identical frame context. If not identical, this test fails. + for i in ./fc*; do + diff ./fc004 $i > /dev/null + if [ $? -eq 1 ]; then + return 1 + fi + done + + # Parse lightfield bitstream to construct and output a new bitstream that can + # be decoded by an AV1 decoder. 
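+  # The parser takes the lightfield file, the tile-list output file, the
+  # number of reference frames, and a text file describing the tile lists to
+  # extract.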
+  local bs_decoder="${LIBAOM_BIN_PATH}/lightfield_bitstream_parsing${AOM_TEST_EXE_SUFFIX}"
+  local tl_file="${AOM_TEST_OUTPUT_DIR}/vase_tile_list.ivf"
+  local tl_text_file="${LIBAOM_TEST_DATA_PATH}/vase10x10_tiles.txt"
+  if [ ! -x "${bs_decoder}" ]; then
+    elog "${bs_decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${bs_decoder}" "${lf_file}" "${tl_file}" \
+      "${num_references}" "${tl_text_file}" ${devnull}
+
+  [ -e "${tl_file}" ] || return 1
+
+  # Run the lightfield tile list decoder.
+  local tl_decoder="${LIBAOM_BIN_PATH}/lightfield_tile_list_decoder${AOM_TEST_EXE_SUFFIX}"
+  local tl_outfile="${AOM_TEST_OUTPUT_DIR}/vase_tile_list.yuv"
+  if [ ! -x "${tl_decoder}" ]; then
+    elog "${tl_decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${tl_decoder}" "${tl_file}" "${tl_outfile}" \
+      "${num_references}" "${num_tile_lists}" ${devnull}
+
+  [ -e "${tl_outfile}" ] || return 1
+
+  # Run the reference lightfield decoder.
+  local ref_decoder="${LIBAOM_BIN_PATH}/lightfield_decoder${AOM_TEST_EXE_SUFFIX}"
+  local tl_reffile="${AOM_TEST_OUTPUT_DIR}/vase_reference.yuv"
+  if [ ! -x "${ref_decoder}" ]; then
+    elog "${ref_decoder} does not exist or is not executable."
+    return 1
+  fi
+
+  eval "${AOM_TEST_PREFIX}" "${ref_decoder}" "${lf_file}" "${tl_reffile}" \
+      "${num_references}" "${tl_text_file}" ${devnull}
+
+  [ -e "${tl_reffile}" ] || return 1
+
+  # The test fails if tl_outfile and tl_reffile are not identical.
+  diff ${tl_outfile} ${tl_reffile} > /dev/null
+  if [ $? -eq 1 ]; then
+    return 1
+  fi
+}
+
+lightfield_test_tests="lightfield_test"
+
+run_tests lightfield_test_verify_environment "${lightfield_test_tests}"
diff --git a/libs/libaom/src/test/log2_test.cc b/libs/libaom/src/test/log2_test.cc
new file mode 100644
index 000000000..d7840c68b
--- /dev/null
+++ b/libs/libaom/src/test/log2_test.cc
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/bitops.h"
+#include "av1/common/entropymode.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+TEST(Log2Test, GetMsb) {
+  // Test small numbers exhaustively.
+  for (unsigned int n = 1; n < 10000; n++) {
+    EXPECT_EQ(get_msb(n), static_cast<int>(floor(log2(n))));
+  }
+
+  // Test every power of 2 and the two adjacent numbers.
+  for (int exponent = 2; exponent < 32; exponent++) {
+    const unsigned int power_of_2 = 1U << exponent;
+    EXPECT_EQ(get_msb(power_of_2 - 1), exponent - 1);
+    EXPECT_EQ(get_msb(power_of_2), exponent);
+    EXPECT_EQ(get_msb(power_of_2 + 1), exponent);
+  }
+}
+
+TEST(Log2Test, Av1CeilLog2) {
+  // Test small numbers exhaustively.
+  EXPECT_EQ(av1_ceil_log2(0), 0);
+  for (int n = 1; n < 10000; n++) {
+    EXPECT_EQ(av1_ceil_log2(n), static_cast<int>(ceil(log2(n))));
+  }
+
+  // Test every power of 2 and the two adjacent numbers.
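+  // For example, av1_ceil_log2(7) == 3, av1_ceil_log2(8) == 3, and
+  // av1_ceil_log2(9) == 4.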
+  for (int exponent = 2; exponent < 31; exponent++) {
+    const int power_of_2 = 1 << exponent;
+    EXPECT_EQ(av1_ceil_log2(power_of_2 - 1), exponent);
+    EXPECT_EQ(av1_ceil_log2(power_of_2), exponent);
+    // The current implementation of av1_ceil_log2 only works up to 2^30.
+    if (exponent < 30) {
+      EXPECT_EQ(av1_ceil_log2(power_of_2 + 1), exponent + 1);
+    }
+  }
+}
diff --git a/libs/libaom/src/test/lossless_test.cc b/libs/libaom/src/test/lossless_test.cc
new file mode 100644
index 000000000..71ae5e72b
--- /dev/null
+++ b/libs/libaom/src/test/lossless_test.cc
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+
+const int kMaxPsnr = 100;
+
+class LosslessTestLarge
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  LosslessTestLarge()
+      : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0),
+        encoding_mode_(GET_PARAM(1)) {}
+
+  virtual ~LosslessTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      // Only call Control if the quantizer is > 0, to verify that using the
+      // quantizer alone will activate lossless mode.
+      if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) {
+        encoder->Control(AV1E_SET_LOSSLESS, 1);
+      }
+    }
+  }
+
+  virtual void BeginPassHook(unsigned int /*pass*/) {
+    psnr_ = kMaxPsnr;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0];
+  }
+
+  double GetMinPsnr() const { return psnr_; }
+
+ private:
+  double psnr_;
+  unsigned int nframes_;
+  libaom_test::TestMode encoding_mode_;
+};
+
+TEST_P(LosslessTestLarge, TestLossLessEncoding) {
+  const aom_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 0;
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  // The dimensions were intentionally changed for better testing coverage.
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 5);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_lossless = GetMinPsnr();
+  EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
+TEST_P(LosslessTestLarge, TestLossLessEncoding444) {
+  libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 5);
+
+  cfg_.g_profile = 1;
+  cfg_.g_timebase = video.timebase();
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_lag_in_frames = 25;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 0;
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double 
psnr_lossless = GetMinPsnr();
+  EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
+TEST_P(LosslessTestLarge, TestLossLessEncodingCtrl) {
+  const aom_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  cfg_.rc_target_bitrate = 2000;
+  cfg_.g_lag_in_frames = 25;
+  // Intentionally set Q > 0, to make sure the control can be used to
+  // activate lossless mode.
+  cfg_.rc_min_quantizer = 10;
+  cfg_.rc_max_quantizer = 20;
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 5);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_lossless = GetMinPsnr();
+  EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
+AV1_INSTANTIATE_TEST_CASE(LosslessTestLarge,
+                          ::testing::Values(::libaom_test::kOnePassGood,
+                                            ::libaom_test::kTwoPassGood));
+}  // namespace
diff --git a/libs/libaom/src/test/lpf_test.cc b/libs/libaom/src/test/lpf_test.cc
new file mode 100644
index 000000000..e8eeceb7c
--- /dev/null
+++ b/libs/libaom/src/test/lpf_test.cc
@@ -0,0 +1,645 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropy.h"
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+// Horizontally and vertically we need a 32x32 block: 8 coefficients preceding
+// the filtered section, 16 coefficients within it, and 8 coefficients
+// following it.
+const int kNumCoeffs = 1024;
+
+const int number_of_iterations = 10000;
+
+const int kSpeedTestNum = 500000;
+
+#define LOOP_PARAM \
+  int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh
+#define DUAL_LOOP_PARAM                                                      \
+  int p, const uint8_t *blimit0, const uint8_t *limit0,                     \
+      const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, \
+      const uint8_t *thresh1
+
+typedef void (*loop_op_t)(uint8_t *s, LOOP_PARAM);
+typedef void (*dual_loop_op_t)(uint8_t *s, DUAL_LOOP_PARAM);
+typedef void (*hbdloop_op_t)(uint16_t *s, LOOP_PARAM, int bd);
+typedef void (*hbddual_loop_op_t)(uint16_t *s, DUAL_LOOP_PARAM, int bd);
+
+typedef std::tuple<hbdloop_op_t, hbdloop_op_t, int> hbdloop_param_t;
+typedef std::tuple<hbddual_loop_op_t, hbddual_loop_op_t, int>
+    hbddual_loop_param_t;
+typedef std::tuple<loop_op_t, loop_op_t, int> loop_param_t;
+typedef std::tuple<dual_loop_op_t, dual_loop_op_t, int> dual_loop_param_t;
+
+template <typename Pixel_t>
+void InitInput(Pixel_t *s, Pixel_t *ref_s, ACMRandom *rnd, const uint8_t limit,
+               const int mask, const int32_t p, const int i) {
+  uint16_t tmp_s[kNumCoeffs];
+
+  for (int j = 0; j < kNumCoeffs;) {
+    const uint8_t val = rnd->Rand8();
+    if (val & 0x80) {  // 50% chance to choose a new value.
+      tmp_s[j] = rnd->Rand16();
+      j++;
+    } else {  // 50% chance to repeat previous value in row X times.
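+      // Here X = (val & 0x1f) + 1, i.e. between 1 and 32 repetitions.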
+      int k = 0;
+      while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+        if (j < 1) {
+          tmp_s[j] = rnd->Rand16();
+        } else if (val & 0x20) {  // Increment by a value within the limit.
+          tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] + (limit - 1));
+        } else {  // Decrement by a value within the limit.
+          tmp_s[j] = static_cast<uint16_t>(tmp_s[j - 1] - (limit - 1));
+        }
+        j++;
+      }
+    }
+  }
+
+  for (int j = 0; j < kNumCoeffs;) {
+    const uint8_t val = rnd->Rand8();
+    if (val & 0x80) {
+      j++;
+    } else {  // 50% chance to repeat previous value in column X times.
+      int k = 0;
+      while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) {
+        if (j < 1) {
+          tmp_s[j] = rnd->Rand16();
+        } else if (val & 0x20) {  // Increment by a value within the limit.
+          tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
+              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1));
+        } else {  // Decrement by a value within the limit.
+          tmp_s[(j % 32) * 32 + j / 32] = static_cast<uint16_t>(
+              tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1));
+        }
+        j++;
+      }
+    }
+  }
+
+  for (int j = 0; j < kNumCoeffs; j++) {
+    if (i % 2) {
+      s[j] = tmp_s[j] & mask;
+    } else {
+      s[j] = tmp_s[p * (j % p) + j / p] & mask;
+    }
+    ref_s[j] = s[j];
+  }
+}
+
+uint8_t GetOuterThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->PseudoUniform(3 * MAX_LOOP_FILTER + 5));
+}
+
+uint8_t GetInnerThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->PseudoUniform(MAX_LOOP_FILTER + 1));
+}
+
+uint8_t GetHevThresh(ACMRandom *rnd) {
+  return static_cast<uint8_t>(rnd->PseudoUniform(MAX_LOOP_FILTER + 1) >> 4);
+}
+
+template <typename func_type_t, typename params_t>
+class LoopTestParam : public ::testing::TestWithParam<params_t> {
+ public:
+  virtual ~LoopTestParam() {}
+  virtual void SetUp() {
+    loopfilter_op_ = std::get<0>(this->GetParam());
+    ref_loopfilter_op_ = std::get<1>(this->GetParam());
+    bit_depth_ = std::get<2>(this->GetParam());
+    mask_ = (1 << bit_depth_) - 1;
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  int bit_depth_;
+  int mask_;
+  func_type_t loopfilter_op_;
+  func_type_t ref_loopfilter_op_;
+};
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void call_filter(uint16_t *s, LOOP_PARAM, int bd, hbdloop_op_t op) {
+  op(s, p, blimit, limit, thresh, bd);
+}
+void call_dualfilter(uint16_t *s, DUAL_LOOP_PARAM, int bd,
+                     hbddual_loop_op_t op) {
+  op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd);
+}
+#endif
+void call_filter(uint8_t *s, LOOP_PARAM, int bd, loop_op_t op) {
+  (void)bd;
+  op(s, p, blimit, limit, thresh);
+}
+void call_dualfilter(uint8_t *s, DUAL_LOOP_PARAM, int bd, dual_loop_op_t op) {
+  (void)bd;
+  op(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef LoopTestParam<hbdloop_op_t, hbdloop_param_t> Loop8Test6Param_hbd;
+typedef LoopTestParam<hbddual_loop_op_t, hbddual_loop_param_t>
+    Loop8Test9Param_hbd;
+#endif
+typedef LoopTestParam<loop_op_t, loop_param_t> Loop8Test6Param_lbd;
+typedef LoopTestParam<dual_loop_op_t, dual_loop_param_t> Loop8Test9Param_lbd;
+
+#define OPCHECK(a, b)                                                       \
+  ACMRandom rnd(ACMRandom::DeterministicSeed());                            \
+  const int count_test_block = number_of_iterations;                        \
+  const int32_t p = kNumCoeffs / 32;                                        \
+  DECLARE_ALIGNED(b, a, s[kNumCoeffs]);                                     \
+  DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]);                                 \
+  int err_count_total = 0;                                                  \
+  int first_failure = -1;                                                   \
+  for (int i = 0; i < count_test_block; ++i) {                              \
+    int err_count = 0;                                                      \
+    uint8_t tmp = GetOuterThresh(&rnd);                                     \
+    DECLARE_ALIGNED(16, const uint8_t,                                      \
+                    blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \
+                                    tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \
+    tmp = GetInnerThresh(&rnd);                                             \
+    DECLARE_ALIGNED(16, const uint8_t,                                      \
+                    limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp,  \
+                                   tmp, tmp, tmp, tmp, tmp, tmp, tmp, 
tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + InitInput(s, ref_s, &rnd, *limit, mask_, p, i); \ + call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_, \ + ref_loopfilter_op_); \ + ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit, \ + thresh, bit_depth_, loopfilter_op_)); \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + err_count += ref_s[j] != s[j]; \ + } \ + if (err_count && !err_count_total) { \ + first_failure = i; \ + } \ + err_count_total += err_count; \ + } \ + EXPECT_EQ(0, err_count_total) \ + << "Error: Loop8Test6Param, C output doesn't match SIMD " \ + "loopfilter output. " \ + << "First failed at test case " << first_failure; + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(Loop8Test6Param_hbd, OperationCheck) { OPCHECK(uint16_t, 16); } +#endif +TEST_P(Loop8Test6Param_lbd, OperationCheck) { OPCHECK(uint8_t, 8); } + +#define VALCHECK(a, b) \ + ACMRandom rnd(ACMRandom::DeterministicSeed()); \ + const int count_test_block = number_of_iterations; \ + DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \ + DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \ + int err_count_total = 0; \ + int first_failure = -1; \ + for (int i = 0; i < count_test_block; ++i) { \ + int err_count = 0; \ + uint8_t tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + int32_t p = kNumCoeffs / 32; \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + s[j] = rnd.Rand16() & mask_; \ + ref_s[j] = s[j]; \ + } \ + call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_, \ + ref_loopfilter_op_); \ + ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit, \ + thresh, bit_depth_, loopfilter_op_)); \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + err_count += ref_s[j] != s[j]; \ + } \ + if (err_count && !err_count_total) { \ + first_failure = i; \ + } \ + err_count_total += err_count; \ + } \ + EXPECT_EQ(0, err_count_total) \ + << "Error: Loop8Test6Param, C output doesn't match SIMD " \ + "loopfilter output. 
" \ + << "First failed at test case " << first_failure; + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(Loop8Test6Param_hbd, ValueCheck) { VALCHECK(uint16_t, 16); } +#endif +TEST_P(Loop8Test6Param_lbd, ValueCheck) { VALCHECK(uint8_t, 8); } + +#define SPEEDCHECK(a, b) \ + ACMRandom rnd(ACMRandom::DeterministicSeed()); \ + const int count_test_block = kSpeedTestNum; \ + const int32_t bd = bit_depth_; \ + DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \ + uint8_t tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + int32_t p = kNumCoeffs / 32; \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + s[j] = rnd.Rand16() & mask_; \ + } \ + for (int i = 0; i < count_test_block; ++i) { \ + call_filter(s + 8 + p * 8, p, blimit, limit, thresh, bd, loopfilter_op_); \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(Loop8Test6Param_hbd, DISABLED_Speed) { SPEEDCHECK(uint16_t, 16); } +#endif +TEST_P(Loop8Test6Param_lbd, DISABLED_Speed) { SPEEDCHECK(uint8_t, 8); } + +#define OPCHECKd(a, b) \ + ACMRandom rnd(ACMRandom::DeterministicSeed()); \ + const int count_test_block = number_of_iterations; \ + DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \ + DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \ + int err_count_total = 0; \ + int first_failure = -1; \ + for (int i = 0; i < count_test_block; ++i) { \ + int err_count = 0; \ + uint8_t tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + int32_t p = kNumCoeffs / 32; \ + const uint8_t limit = *limit0 < *limit1 ? 
*limit0 : *limit1; \ + InitInput(s, ref_s, &rnd, limit, mask_, p, i); \ + call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \ + limit1, thresh1, bit_depth_, ref_loopfilter_op_); \ + ASM_REGISTER_STATE_CHECK( \ + call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \ + limit1, thresh1, bit_depth_, loopfilter_op_)); \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + err_count += ref_s[j] != s[j]; \ + } \ + if (err_count && !err_count_total) { \ + first_failure = i; \ + } \ + err_count_total += err_count; \ + } \ + EXPECT_EQ(0, err_count_total) \ + << "Error: Loop8Test9Param, C output doesn't match SIMD " \ + "loopfilter output. " \ + << "First failed at test case " << first_failure; + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(Loop8Test9Param_hbd, OperationCheck) { OPCHECKd(uint16_t, 16); } +#endif +TEST_P(Loop8Test9Param_lbd, OperationCheck) { OPCHECKd(uint8_t, 8); } + +#define VALCHECKd(a, b) \ + ACMRandom rnd(ACMRandom::DeterministicSeed()); \ + const int count_test_block = number_of_iterations; \ + DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \ + DECLARE_ALIGNED(b, a, ref_s[kNumCoeffs]); \ + int err_count_total = 0; \ + int first_failure = -1; \ + for (int i = 0; i < count_test_block; ++i) { \ + int err_count = 0; \ + uint8_t tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + int32_t p = kNumCoeffs / 32; \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + s[j] = rnd.Rand16() & mask_; \ + ref_s[j] = s[j]; \ + } \ + call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \ + limit1, thresh1, bit_depth_, ref_loopfilter_op_); \ + ASM_REGISTER_STATE_CHECK( \ + call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \ + limit1, thresh1, bit_depth_, loopfilter_op_)); \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + err_count += ref_s[j] != s[j]; \ + } \ + if (err_count && !err_count_total) { \ + first_failure = i; \ + } \ + err_count_total += err_count; \ + } \ + EXPECT_EQ(0, err_count_total) \ + << "Error: Loop8Test9Param, C output doesn't match SIMD " \ + "loopfilter output. 
" \ + << "First failed at test case " << first_failure; + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(Loop8Test9Param_hbd, ValueCheck) { VALCHECKd(uint16_t, 16); } +#endif +TEST_P(Loop8Test9Param_lbd, ValueCheck) { VALCHECKd(uint8_t, 8); } + +#define SPEEDCHECKd(a, b) \ + ACMRandom rnd(ACMRandom::DeterministicSeed()); \ + const int count_test_block = kSpeedTestNum; \ + DECLARE_ALIGNED(b, a, s[kNumCoeffs]); \ + uint8_t tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetOuterThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetInnerThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + tmp = GetHevThresh(&rnd); \ + DECLARE_ALIGNED(16, const uint8_t, \ + thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, \ + tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; \ + int32_t p = kNumCoeffs / 32; \ + for (int j = 0; j < kNumCoeffs; ++j) { \ + s[j] = rnd.Rand16() & mask_; \ + } \ + for (int i = 0; i < count_test_block; ++i) { \ + call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1, \ + limit1, thresh1, bit_depth_, loopfilter_op_); \ + } + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(Loop8Test9Param_hbd, DISABLED_Speed) { SPEEDCHECKd(uint16_t, 16); } +#endif +TEST_P(Loop8Test9Param_lbd, DISABLED_Speed) { SPEEDCHECKd(uint8_t, 8); } + +using std::make_tuple; + +#if HAVE_SSE2 +#if CONFIG_AV1_HIGHBITDEPTH +const hbdloop_param_t kHbdLoop8Test6[] = { + make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c, + 8), + make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c, + 8), + make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c, + 8), + make_tuple(&aom_highbd_lpf_horizontal_14_sse2, + &aom_highbd_lpf_horizontal_14_c, 8), + make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 8), + make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 8), + + make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c, + 8), + make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c, + 10), + make_tuple(&aom_highbd_lpf_vertical_4_sse2, &aom_highbd_lpf_vertical_4_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c, + 10), + make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c, + 10), + make_tuple(&aom_highbd_lpf_horizontal_14_sse2, + &aom_highbd_lpf_horizontal_14_c, 10), + make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 10), + make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 10), + make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c, + 10), + make_tuple(&aom_highbd_lpf_horizontal_4_sse2, &aom_highbd_lpf_horizontal_4_c, + 12), + make_tuple(&aom_highbd_lpf_vertical_4_sse2, 
&aom_highbd_lpf_vertical_4_c, 12), + make_tuple(&aom_highbd_lpf_horizontal_6_sse2, &aom_highbd_lpf_horizontal_6_c, + 12), + make_tuple(&aom_highbd_lpf_horizontal_8_sse2, &aom_highbd_lpf_horizontal_8_c, + 12), + make_tuple(&aom_highbd_lpf_horizontal_14_sse2, + &aom_highbd_lpf_horizontal_14_c, 12), + make_tuple(&aom_highbd_lpf_vertical_14_sse2, &aom_highbd_lpf_vertical_14_c, + 12), + make_tuple(&aom_highbd_lpf_vertical_6_sse2, &aom_highbd_lpf_vertical_6_c, 12), + make_tuple(&aom_highbd_lpf_vertical_8_sse2, &aom_highbd_lpf_vertical_8_c, 12) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_hbd, + ::testing::ValuesIn(kHbdLoop8Test6)); +#endif // CONFIG_AV1_HIGHBITDEPTH + +const loop_param_t kLoop8Test6[] = { + make_tuple(&aom_lpf_horizontal_4_sse2, &aom_lpf_horizontal_4_c, 8), + make_tuple(&aom_lpf_horizontal_8_sse2, &aom_lpf_horizontal_8_c, 8), + make_tuple(&aom_lpf_horizontal_6_sse2, &aom_lpf_horizontal_6_c, 8), + make_tuple(&aom_lpf_vertical_6_sse2, &aom_lpf_vertical_6_c, 8), + make_tuple(&aom_lpf_horizontal_14_sse2, &aom_lpf_horizontal_14_c, 8), + make_tuple(&aom_lpf_vertical_4_sse2, &aom_lpf_vertical_4_c, 8), + make_tuple(&aom_lpf_vertical_8_sse2, &aom_lpf_vertical_8_c, 8), + make_tuple(&aom_lpf_vertical_14_sse2, &aom_lpf_vertical_14_c, 8), +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test6Param_lbd, + ::testing::ValuesIn(kLoop8Test6)); + +const dual_loop_param_t kLoop8Test9[] = { + make_tuple(&aom_lpf_horizontal_4_dual_sse2, &aom_lpf_horizontal_4_dual_c, 8), + make_tuple(&aom_lpf_vertical_4_dual_sse2, &aom_lpf_vertical_4_dual_c, 8), + make_tuple(&aom_lpf_horizontal_6_dual_sse2, &aom_lpf_horizontal_6_dual_c, 8), + make_tuple(&aom_lpf_vertical_6_dual_sse2, &aom_lpf_vertical_6_dual_c, 8), + make_tuple(&aom_lpf_horizontal_8_dual_sse2, &aom_lpf_horizontal_8_dual_c, 8), + make_tuple(&aom_lpf_vertical_8_dual_sse2, &aom_lpf_vertical_8_dual_c, 8), + make_tuple(&aom_lpf_horizontal_14_dual_sse2, &aom_lpf_horizontal_14_dual_c, + 8), + make_tuple(&aom_lpf_vertical_14_dual_sse2, &aom_lpf_vertical_14_dual_c, 8) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test9Param_lbd, + ::testing::ValuesIn(kLoop8Test9)); + +#endif // HAVE_SSE2 + +#if HAVE_SSE2 && CONFIG_AV1_HIGHBITDEPTH +const hbddual_loop_param_t kHbdLoop8Test9[] = { + make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2, + &aom_highbd_lpf_horizontal_4_dual_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2, + &aom_highbd_lpf_horizontal_6_dual_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2, + &aom_highbd_lpf_horizontal_8_dual_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2, + &aom_highbd_lpf_horizontal_14_dual_c, 8), + make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2, + &aom_highbd_lpf_vertical_4_dual_c, 8), + make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2, + &aom_highbd_lpf_vertical_6_dual_c, 8), + make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2, + &aom_highbd_lpf_vertical_8_dual_c, 8), + make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2, + &aom_highbd_lpf_vertical_14_dual_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2, + &aom_highbd_lpf_horizontal_4_dual_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2, + &aom_highbd_lpf_horizontal_6_dual_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2, + &aom_highbd_lpf_horizontal_8_dual_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2, + &aom_highbd_lpf_horizontal_14_dual_c, 10), + make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2, + &aom_highbd_lpf_vertical_4_dual_c, 10), + make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2, + 
&aom_highbd_lpf_vertical_6_dual_c, 10), + make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2, + &aom_highbd_lpf_vertical_8_dual_c, 10), + make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2, + &aom_highbd_lpf_vertical_14_dual_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_4_dual_sse2, + &aom_highbd_lpf_horizontal_4_dual_c, 12), + make_tuple(&aom_highbd_lpf_horizontal_6_dual_sse2, + &aom_highbd_lpf_horizontal_6_dual_c, 12), + make_tuple(&aom_highbd_lpf_horizontal_8_dual_sse2, + &aom_highbd_lpf_horizontal_8_dual_c, 12), + make_tuple(&aom_highbd_lpf_horizontal_14_dual_sse2, + &aom_highbd_lpf_horizontal_14_dual_c, 12), + make_tuple(&aom_highbd_lpf_vertical_4_dual_sse2, + &aom_highbd_lpf_vertical_4_dual_c, 12), + make_tuple(&aom_highbd_lpf_vertical_6_dual_sse2, + &aom_highbd_lpf_vertical_6_dual_c, 12), + make_tuple(&aom_highbd_lpf_vertical_8_dual_sse2, + &aom_highbd_lpf_vertical_8_dual_c, 12), + make_tuple(&aom_highbd_lpf_vertical_14_dual_sse2, + &aom_highbd_lpf_vertical_14_dual_c, 12), +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, Loop8Test9Param_hbd, + ::testing::ValuesIn(kHbdLoop8Test9)); + +#endif // HAVE_SSE2 && CONFIG_AV1_HIGHBITDEPTH + +#if HAVE_NEON +const loop_param_t kLoop8Test6[] = { + make_tuple(&aom_lpf_vertical_14_neon, &aom_lpf_vertical_14_c, 8), + make_tuple(&aom_lpf_vertical_8_neon, &aom_lpf_vertical_8_c, 8), + make_tuple(&aom_lpf_vertical_6_neon, &aom_lpf_vertical_6_c, 8), + make_tuple(&aom_lpf_vertical_4_neon, &aom_lpf_vertical_4_c, 8), + make_tuple(&aom_lpf_horizontal_14_neon, &aom_lpf_horizontal_14_c, 8), + make_tuple(&aom_lpf_horizontal_8_neon, &aom_lpf_horizontal_8_c, 8), + make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8), + make_tuple(&aom_lpf_horizontal_4_neon, &aom_lpf_horizontal_4_c, 8) +}; + +INSTANTIATE_TEST_SUITE_P(NEON, Loop8Test6Param_lbd, + ::testing::ValuesIn(kLoop8Test6)); +#endif // HAVE_NEON + +#if HAVE_AVX2 && CONFIG_AV1_HIGHBITDEPTH +const hbddual_loop_param_t kHbdLoop8Test9Avx2[] = { + make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2, + &aom_highbd_lpf_horizontal_4_dual_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2, + &aom_highbd_lpf_horizontal_4_dual_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_4_dual_avx2, + &aom_highbd_lpf_horizontal_4_dual_c, 12), + make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2, + &aom_highbd_lpf_horizontal_8_dual_c, 8), + make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2, + &aom_highbd_lpf_horizontal_8_dual_c, 10), + make_tuple(&aom_highbd_lpf_horizontal_8_dual_avx2, + &aom_highbd_lpf_horizontal_8_dual_c, 12), + make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2, + &aom_highbd_lpf_vertical_4_dual_c, 8), + make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2, + &aom_highbd_lpf_vertical_4_dual_c, 10), + make_tuple(&aom_highbd_lpf_vertical_4_dual_avx2, + &aom_highbd_lpf_vertical_4_dual_c, 12), + make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2, + &aom_highbd_lpf_vertical_8_dual_c, 8), + make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2, + &aom_highbd_lpf_vertical_8_dual_c, 10), + make_tuple(&aom_highbd_lpf_vertical_8_dual_avx2, + &aom_highbd_lpf_vertical_8_dual_c, 12), +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, Loop8Test9Param_hbd, + ::testing::ValuesIn(kHbdLoop8Test9Avx2)); +#endif +} // namespace diff --git a/libs/libaom/src/test/masked_sad_test.cc b/libs/libaom/src/test/masked_sad_test.cc new file mode 100644 index 000000000..aa4dd8341 --- /dev/null +++ b/libs/libaom/src/test/masked_sad_test.cc @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 200;
+
+typedef unsigned int (*MaskedSADFunc)(const uint8_t *src, int src_stride,
+                                      const uint8_t *ref, int ref_stride,
+                                      const uint8_t *second_pred,
+                                      const uint8_t *msk, int msk_stride,
+                                      int invert_mask);
+typedef std::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
+
+typedef void (*MaskedSADx4Func)(const uint8_t *src, int src_stride,
+                                const uint8_t *ref[], int ref_stride,
+                                const uint8_t *second_pred, const uint8_t *msk,
+                                int msk_stride, int invert_mask,
+                                unsigned sads[]);
+
+typedef std::tuple<MaskedSADx4Func, MaskedSADx4Func> MaskedSADx4Param;
+
+class MaskedSADTestBase : public ::testing::Test {
+ public:
+  virtual ~MaskedSADTestBase() {}
+  virtual void SetUp() = 0;
+  virtual void runRef(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr[], int ref_stride,
+                      const uint8_t *second_pred, const uint8_t *msk,
+                      int msk_stride, int inv_mask, unsigned sads[],
+                      int times) = 0;
+  virtual void runTest(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t *ref_ptr[], int ref_stride,
+                       const uint8_t *second_pred, const uint8_t *msk,
+                       int msk_stride, int inv_mask, unsigned sads[],
+                       int times) = 0;
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  void runMaskedSADTest(int run_times);
+};
+
+class MaskedSADTest : public MaskedSADTestBase,
+                      public ::testing::WithParamInterface<MaskedSADParam> {
+ public:
+  virtual ~MaskedSADTest() {}
+  virtual void SetUp() {
+    maskedSAD_op_ = GET_PARAM(0);
+    ref_maskedSAD_op_ = GET_PARAM(1);
+  }
+
+  virtual void runRef(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr[], int ref_stride,
+                      const uint8_t *second_pred, const uint8_t *msk,
+                      int msk_stride, int inv_mask, unsigned sads[], int times);
+  virtual void runTest(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t *ref_ptr[], int ref_stride,
+                       const uint8_t *second_pred, const uint8_t *msk,
+                       int msk_stride, int inv_mask, unsigned sads[],
+                       int times);
+
+ protected:
+  MaskedSADFunc maskedSAD_op_;
+  MaskedSADFunc ref_maskedSAD_op_;
+};
+
+class MaskedSADx4Test : public MaskedSADTestBase,
+                        public ::testing::WithParamInterface<MaskedSADx4Param> {
+ public:
+  virtual ~MaskedSADx4Test() {}
+  virtual void SetUp() {
+    maskedSAD_op_ = GET_PARAM(0);
+    ref_maskedSAD_op_ = GET_PARAM(1);
+  }
+  virtual void runRef(const uint8_t *src_ptr, int src_stride,
+                      const uint8_t *ref_ptr[], int ref_stride,
+                      const uint8_t *second_pred, const uint8_t *msk,
+                      int msk_stride, int inv_mask, unsigned sads[], int times);
+  virtual void runTest(const uint8_t *src_ptr, int src_stride,
+                       const uint8_t *ref_ptr[], int ref_stride,
+                       const uint8_t *second_pred, const uint8_t *msk,
+                       int msk_stride, int inv_mask, unsigned sads[],
+                       int times);
+
+ protected:
+  MaskedSADx4Func 
maskedSAD_op_; + MaskedSADx4Func ref_maskedSAD_op_; +}; + +void MaskedSADTest::runRef(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr[], int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, + int msk_stride, int invert_mask, unsigned sads[], + int times) { + for (int repeat = 0; repeat < times; ++repeat) { + sads[0] = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride, + second_pred, msk, msk_stride, invert_mask); + } +} + +void MaskedSADTest::runTest(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr[], int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, + int msk_stride, int invert_mask, unsigned sads[], + int times) { + if (times == 1) { + sads[0] = maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride, + second_pred, msk, msk_stride, invert_mask); + } else { + for (int repeat = 0; repeat < times; ++repeat) { + ASM_REGISTER_STATE_CHECK( + sads[0] = maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride, + second_pred, msk, msk_stride, invert_mask)); + } + } +} + +void MaskedSADx4Test::runRef(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr[], int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, + int msk_stride, int invert_mask, unsigned sads[], + int times) { + for (int repeat = 0; repeat < times; ++repeat) { + ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, + msk, msk_stride, invert_mask, sads); + } +} + +void MaskedSADx4Test::runTest(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr[], int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, + int msk_stride, int invert_mask, unsigned sads[], + int times) { + if (times == 1) { + ASM_REGISTER_STATE_CHECK(maskedSAD_op_(src_ptr, src_stride, ref_ptr, + ref_stride, second_pred, msk, + msk_stride, invert_mask, sads)); + } else { + for (int repeat = 0; repeat < times; ++repeat) { + maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, msk, + msk_stride, invert_mask, sads); + } + } +} + +void MaskedSADTestBase::runMaskedSADTest(int run_times) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const unsigned kBlockSize = MAX_SB_SIZE * MAX_SB_SIZE; + DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE * 4]); + DECLARE_ALIGNED(16, uint8_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + + const uint8_t *refs[] = { ref_ptr, ref_ptr + kBlockSize, + ref_ptr + 2 * kBlockSize, + ref_ptr + 3 * kBlockSize }; + unsigned sads[] = { 0, 0, 0, 0 }; + unsigned sads_ref[] = { 0, 0, 0, 0 }; + int err_count = 0; + int first_failure = -1; + int src_stride = MAX_SB_SIZE; + int ref_stride = MAX_SB_SIZE; + int msk_stride = MAX_SB_SIZE; + const int iters = run_times == 1 ? number_of_iterations : 1; + for (int i = 0; i < iters; ++i) { + for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) { + src_ptr[j] = rnd.Rand8(); + ref_ptr[j] = rnd.Rand8(); + (ref_ptr + kBlockSize)[j] = rnd.Rand8(); + (ref_ptr + 2 * kBlockSize)[j] = rnd.Rand8(); + (ref_ptr + 3 * kBlockSize)[j] = rnd.Rand8(); + second_pred_ptr[j] = rnd.Rand8(); + msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? 
rnd.Rand8() & 0x3f : 64; + assert(msk_ptr[j] <= 64); + } + + for (int invert_mask = 0; invert_mask < 2; ++invert_mask) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + runRef(src_ptr, src_stride, refs, ref_stride, second_pred_ptr, msk_ptr, + msk_stride, invert_mask, sads_ref, run_times); + aom_usec_timer_mark(&timer); + const double time1 = static_cast(aom_usec_timer_elapsed(&timer)); + + aom_usec_timer_start(&timer); + runTest(src_ptr, src_stride, refs, ref_stride, second_pred_ptr, msk_ptr, + msk_stride, invert_mask, sads, run_times); + aom_usec_timer_mark(&timer); + const double time2 = static_cast(aom_usec_timer_elapsed(&timer)); + + if (run_times > 10) { + printf("%7.2f/%7.2fns", time1, time2); + printf("(%3.2f)\n", time1 / time2); + } + if (sads_ref[0] != sads[0] || sads_ref[1] != sads[1] || + sads_ref[2] != sads[2] || sads_ref[3] != sads[3]) { + err_count++; + if (first_failure == -1) first_failure = i; + } + } + } + EXPECT_EQ(0, err_count) << "Error: Masked SAD Test, output doesn't match. " + << "First failed at test case " << first_failure; +} + +TEST_P(MaskedSADTest, OperationCheck) { runMaskedSADTest(1); } + +TEST_P(MaskedSADTest, DISABLED_Speed) { runMaskedSADTest(2000000); } + +TEST_P(MaskedSADx4Test, OperationCheck) { runMaskedSADTest(1); } + +TEST_P(MaskedSADx4Test, DISABLED_Speed) { runMaskedSADTest(2000000); } + +#if CONFIG_AV1_HIGHBITDEPTH +typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + const uint8_t *msk, int msk_stride, + int invert_mask); +typedef std::tuple + HighbdMaskedSADParam; + +class HighbdMaskedSADTest + : public ::testing::TestWithParam { + public: + virtual ~HighbdMaskedSADTest() {} + virtual void SetUp() { + maskedSAD_op_ = GET_PARAM(0); + ref_maskedSAD_op_ = GET_PARAM(1); + } + + virtual void TearDown() { libaom_test::ClearSystemState(); } + void runHighbdMaskedSADTest(int run_times); + + protected: + HighbdMaskedSADFunc maskedSAD_op_; + HighbdMaskedSADFunc ref_maskedSAD_op_; +}; +void HighbdMaskedSADTest::runHighbdMaskedSADTest(int run_times) { + unsigned int ref_ret = 0, ret = 1; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint16_t, second_pred_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]); + uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr); + uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr); + uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr); + int err_count = 0; + int first_failure = -1; + int src_stride = MAX_SB_SIZE; + int ref_stride = MAX_SB_SIZE; + int msk_stride = MAX_SB_SIZE; + const int iters = run_times == 1 ? number_of_iterations : 1; + for (int i = 0; i < iters; ++i) { + for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) { + src_ptr[j] = rnd.Rand16() & 0xfff; + ref_ptr[j] = rnd.Rand16() & 0xfff; + second_pred_ptr[j] = rnd.Rand16() & 0xfff; + msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? 
+
+    for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+      aom_usec_timer timer;
+      aom_usec_timer_start(&timer);
+      for (int repeat = 0; repeat < run_times; ++repeat) {
+        ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+                                    second_pred8_ptr, msk_ptr, msk_stride,
+                                    invert_mask);
+      }
+      aom_usec_timer_mark(&timer);
+      const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      aom_usec_timer_start(&timer);
+      if (run_times == 1) {
+        ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
+                                                     ref8_ptr, ref_stride,
+                                                     second_pred8_ptr, msk_ptr,
+                                                     msk_stride, invert_mask));
+      } else {
+        for (int repeat = 0; repeat < run_times; ++repeat) {
+          ret =
+              maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+                            second_pred8_ptr, msk_ptr, msk_stride, invert_mask);
+        }
+      }
+      aom_usec_timer_mark(&timer);
+      const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+      if (run_times > 10) {
+        printf("%7.2f/%7.2fns", time1, time2);
+        printf("(%3.2f)\n", time1 / time2);
+      }
+      if (ret != ref_ret) {
+        err_count++;
+        if (first_failure == -1) first_failure = i;
+      }
+    }
+  }
+  EXPECT_EQ(0, err_count)
+      << "Error: High BD Masked SAD Test, output doesn't match. "
+      << "First failed at test case " << first_failure;
+}
+
+TEST_P(HighbdMaskedSADTest, OperationCheck) { runHighbdMaskedSADTest(1); }
+
+TEST_P(HighbdMaskedSADTest, DISABLED_Speed) { runHighbdMaskedSADTest(1000000); }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+using std::make_tuple;
+
+#if HAVE_SSSE3
+const MaskedSADParam msad_test[] = {
+  make_tuple(&aom_masked_sad4x4_ssse3, &aom_masked_sad4x4_c),
+  make_tuple(&aom_masked_sad4x8_ssse3, &aom_masked_sad4x8_c),
+  make_tuple(&aom_masked_sad8x4_ssse3, &aom_masked_sad8x4_c),
+  make_tuple(&aom_masked_sad8x8_ssse3, &aom_masked_sad8x8_c),
+  make_tuple(&aom_masked_sad8x16_ssse3, &aom_masked_sad8x16_c),
+  make_tuple(&aom_masked_sad16x8_ssse3, &aom_masked_sad16x8_c),
+  make_tuple(&aom_masked_sad16x16_ssse3, &aom_masked_sad16x16_c),
+  make_tuple(&aom_masked_sad16x32_ssse3, &aom_masked_sad16x32_c),
+  make_tuple(&aom_masked_sad32x16_ssse3, &aom_masked_sad32x16_c),
+  make_tuple(&aom_masked_sad32x32_ssse3, &aom_masked_sad32x32_c),
+  make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c),
+  make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c),
+  make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c),
+  make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
+  make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
+  make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
+  make_tuple(&aom_masked_sad4x16_ssse3, &aom_masked_sad4x16_c),
+  make_tuple(&aom_masked_sad16x4_ssse3, &aom_masked_sad16x4_c),
+  make_tuple(&aom_masked_sad8x32_ssse3, &aom_masked_sad8x32_c),
+  make_tuple(&aom_masked_sad32x8_ssse3, &aom_masked_sad32x8_c),
+  make_tuple(&aom_masked_sad16x64_ssse3, &aom_masked_sad16x64_c),
+  make_tuple(&aom_masked_sad64x16_ssse3, &aom_masked_sad64x16_c),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADTest, ::testing::ValuesIn(msad_test));
+
+const MaskedSADx4Param msadx4_test[] = {
+  make_tuple(&aom_masked_sad4x4x4d_ssse3, &aom_masked_sad4x4x4d_c),
+  make_tuple(&aom_masked_sad4x8x4d_ssse3, &aom_masked_sad4x8x4d_c),
+  make_tuple(&aom_masked_sad8x4x4d_ssse3, &aom_masked_sad8x4x4d_c),
+  make_tuple(&aom_masked_sad8x8x4d_ssse3, &aom_masked_sad8x8x4d_c),
+  make_tuple(&aom_masked_sad8x16x4d_ssse3, &aom_masked_sad8x16x4d_c),
+  make_tuple(&aom_masked_sad16x8x4d_ssse3, &aom_masked_sad16x8x4d_c),
+
make_tuple(&aom_masked_sad16x16x4d_ssse3, &aom_masked_sad16x16x4d_c), + make_tuple(&aom_masked_sad16x32x4d_ssse3, &aom_masked_sad16x32x4d_c), + make_tuple(&aom_masked_sad32x16x4d_ssse3, &aom_masked_sad32x16x4d_c), + make_tuple(&aom_masked_sad32x32x4d_ssse3, &aom_masked_sad32x32x4d_c), + make_tuple(&aom_masked_sad32x64x4d_ssse3, &aom_masked_sad32x64x4d_c), + make_tuple(&aom_masked_sad64x32x4d_ssse3, &aom_masked_sad64x32x4d_c), + make_tuple(&aom_masked_sad64x64x4d_ssse3, &aom_masked_sad64x64x4d_c), + make_tuple(&aom_masked_sad64x128x4d_ssse3, &aom_masked_sad64x128x4d_c), + make_tuple(&aom_masked_sad128x64x4d_ssse3, &aom_masked_sad128x64x4d_c), + make_tuple(&aom_masked_sad128x128x4d_ssse3, &aom_masked_sad128x128x4d_c), + make_tuple(&aom_masked_sad4x16x4d_ssse3, &aom_masked_sad4x16x4d_c), + make_tuple(&aom_masked_sad16x4x4d_ssse3, &aom_masked_sad16x4x4d_c), + make_tuple(&aom_masked_sad8x32x4d_ssse3, &aom_masked_sad8x32x4d_c), + make_tuple(&aom_masked_sad32x8x4d_ssse3, &aom_masked_sad32x8x4d_c), + make_tuple(&aom_masked_sad16x64x4d_ssse3, &aom_masked_sad16x64x4d_c), + make_tuple(&aom_masked_sad64x16x4d_ssse3, &aom_masked_sad64x16x4d_c), +}; + +INSTANTIATE_TEST_SUITE_P(SSSE3, MaskedSADx4Test, + ::testing::ValuesIn(msadx4_test)); + +#if CONFIG_AV1_HIGHBITDEPTH +const HighbdMaskedSADParam hbd_msad_test[] = { + make_tuple(&aom_highbd_masked_sad4x4_ssse3, &aom_highbd_masked_sad4x4_c), + make_tuple(&aom_highbd_masked_sad4x8_ssse3, &aom_highbd_masked_sad4x8_c), + make_tuple(&aom_highbd_masked_sad8x4_ssse3, &aom_highbd_masked_sad8x4_c), + make_tuple(&aom_highbd_masked_sad8x8_ssse3, &aom_highbd_masked_sad8x8_c), + make_tuple(&aom_highbd_masked_sad8x16_ssse3, &aom_highbd_masked_sad8x16_c), + make_tuple(&aom_highbd_masked_sad16x8_ssse3, &aom_highbd_masked_sad16x8_c), + make_tuple(&aom_highbd_masked_sad16x16_ssse3, &aom_highbd_masked_sad16x16_c), + make_tuple(&aom_highbd_masked_sad16x32_ssse3, &aom_highbd_masked_sad16x32_c), + make_tuple(&aom_highbd_masked_sad32x16_ssse3, &aom_highbd_masked_sad32x16_c), + make_tuple(&aom_highbd_masked_sad32x32_ssse3, &aom_highbd_masked_sad32x32_c), + make_tuple(&aom_highbd_masked_sad32x64_ssse3, &aom_highbd_masked_sad32x64_c), + make_tuple(&aom_highbd_masked_sad64x32_ssse3, &aom_highbd_masked_sad64x32_c), + make_tuple(&aom_highbd_masked_sad64x64_ssse3, &aom_highbd_masked_sad64x64_c), + make_tuple(&aom_highbd_masked_sad64x128_ssse3, + &aom_highbd_masked_sad64x128_c), + make_tuple(&aom_highbd_masked_sad128x64_ssse3, + &aom_highbd_masked_sad128x64_c), + make_tuple(&aom_highbd_masked_sad128x128_ssse3, + &aom_highbd_masked_sad128x128_c), + make_tuple(&aom_highbd_masked_sad4x16_ssse3, &aom_highbd_masked_sad4x16_c), + make_tuple(&aom_highbd_masked_sad16x4_ssse3, &aom_highbd_masked_sad16x4_c), + make_tuple(&aom_highbd_masked_sad8x32_ssse3, &aom_highbd_masked_sad8x32_c), + make_tuple(&aom_highbd_masked_sad32x8_ssse3, &aom_highbd_masked_sad32x8_c), + make_tuple(&aom_highbd_masked_sad16x64_ssse3, &aom_highbd_masked_sad16x64_c), + make_tuple(&aom_highbd_masked_sad64x16_ssse3, &aom_highbd_masked_sad64x16_c), +}; + +INSTANTIATE_TEST_SUITE_P(SSSE3, HighbdMaskedSADTest, + ::testing::ValuesIn(hbd_msad_test)); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_SSSE3 + +#if HAVE_AVX2 +const MaskedSADParam msad_avx2_test[] = { + make_tuple(&aom_masked_sad4x4_avx2, &aom_masked_sad4x4_ssse3), + make_tuple(&aom_masked_sad4x8_avx2, &aom_masked_sad4x8_ssse3), + make_tuple(&aom_masked_sad8x4_avx2, &aom_masked_sad8x4_ssse3), + make_tuple(&aom_masked_sad8x8_avx2, &aom_masked_sad8x8_ssse3), + 
make_tuple(&aom_masked_sad8x16_avx2, &aom_masked_sad8x16_ssse3), + make_tuple(&aom_masked_sad16x8_avx2, &aom_masked_sad16x8_ssse3), + make_tuple(&aom_masked_sad16x16_avx2, &aom_masked_sad16x16_ssse3), + make_tuple(&aom_masked_sad16x32_avx2, &aom_masked_sad16x32_ssse3), + make_tuple(&aom_masked_sad32x16_avx2, &aom_masked_sad32x16_ssse3), + make_tuple(&aom_masked_sad32x32_avx2, &aom_masked_sad32x32_ssse3), + make_tuple(&aom_masked_sad32x64_avx2, &aom_masked_sad32x64_ssse3), + make_tuple(&aom_masked_sad64x32_avx2, &aom_masked_sad64x32_ssse3), + make_tuple(&aom_masked_sad64x64_avx2, &aom_masked_sad64x64_ssse3), + make_tuple(&aom_masked_sad64x128_avx2, &aom_masked_sad64x128_ssse3), + make_tuple(&aom_masked_sad128x64_avx2, &aom_masked_sad128x64_ssse3), + make_tuple(&aom_masked_sad128x128_avx2, &aom_masked_sad128x128_ssse3), + make_tuple(&aom_masked_sad4x16_avx2, &aom_masked_sad4x16_ssse3), + make_tuple(&aom_masked_sad16x4_avx2, &aom_masked_sad16x4_ssse3), + make_tuple(&aom_masked_sad8x32_avx2, &aom_masked_sad8x32_ssse3), + make_tuple(&aom_masked_sad32x8_avx2, &aom_masked_sad32x8_ssse3), + make_tuple(&aom_masked_sad16x64_avx2, &aom_masked_sad16x64_ssse3), + make_tuple(&aom_masked_sad64x16_avx2, &aom_masked_sad64x16_ssse3) +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, MaskedSADTest, + ::testing::ValuesIn(msad_avx2_test)); + +#if CONFIG_AV1_HIGHBITDEPTH +const HighbdMaskedSADParam hbd_msad_avx2_test[] = { + make_tuple(&aom_highbd_masked_sad4x4_avx2, &aom_highbd_masked_sad4x4_ssse3), + make_tuple(&aom_highbd_masked_sad4x8_avx2, &aom_highbd_masked_sad4x8_ssse3), + make_tuple(&aom_highbd_masked_sad8x4_avx2, &aom_highbd_masked_sad8x4_ssse3), + make_tuple(&aom_highbd_masked_sad8x8_avx2, &aom_highbd_masked_sad8x8_ssse3), + make_tuple(&aom_highbd_masked_sad8x16_avx2, &aom_highbd_masked_sad8x16_ssse3), + make_tuple(&aom_highbd_masked_sad16x8_avx2, &aom_highbd_masked_sad16x8_ssse3), + make_tuple(&aom_highbd_masked_sad16x16_avx2, + &aom_highbd_masked_sad16x16_ssse3), + make_tuple(&aom_highbd_masked_sad16x32_avx2, + &aom_highbd_masked_sad16x32_ssse3), + make_tuple(&aom_highbd_masked_sad32x16_avx2, + &aom_highbd_masked_sad32x16_ssse3), + make_tuple(&aom_highbd_masked_sad32x32_avx2, + &aom_highbd_masked_sad32x32_ssse3), + make_tuple(&aom_highbd_masked_sad32x64_avx2, + &aom_highbd_masked_sad32x64_ssse3), + make_tuple(&aom_highbd_masked_sad64x32_avx2, + &aom_highbd_masked_sad64x32_ssse3), + make_tuple(&aom_highbd_masked_sad64x64_avx2, + &aom_highbd_masked_sad64x64_ssse3), + make_tuple(&aom_highbd_masked_sad64x128_avx2, + &aom_highbd_masked_sad64x128_ssse3), + make_tuple(&aom_highbd_masked_sad128x64_avx2, + &aom_highbd_masked_sad128x64_ssse3), + make_tuple(&aom_highbd_masked_sad128x128_avx2, + &aom_highbd_masked_sad128x128_ssse3), + make_tuple(&aom_highbd_masked_sad4x16_avx2, &aom_highbd_masked_sad4x16_ssse3), + make_tuple(&aom_highbd_masked_sad16x4_avx2, &aom_highbd_masked_sad16x4_ssse3), + make_tuple(&aom_highbd_masked_sad8x32_avx2, &aom_highbd_masked_sad8x32_ssse3), + make_tuple(&aom_highbd_masked_sad32x8_avx2, &aom_highbd_masked_sad32x8_ssse3), + make_tuple(&aom_highbd_masked_sad16x64_avx2, + &aom_highbd_masked_sad16x64_ssse3), + make_tuple(&aom_highbd_masked_sad64x16_avx2, + &aom_highbd_masked_sad64x16_ssse3) +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, HighbdMaskedSADTest, + ::testing::ValuesIn(hbd_msad_avx2_test)); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_AVX2 + +} // namespace diff --git a/libs/libaom/src/test/masked_variance_test.cc b/libs/libaom/src/test/masked_variance_test.cc new file mode 100644 index 
000000000..bf814cea2
--- /dev/null
+++ b/libs/libaom/src/test/masked_variance_test.cc
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_mem/aom_mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 200;
+
+typedef unsigned int (*MaskedSubPixelVarianceFunc)(
+    const uint8_t *src, int src_stride, int xoffset, int yoffset,
+    const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
+    const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+
+typedef std::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
+    MaskedSubPixelVarianceParam;
+
+class MaskedSubPixelVarianceTest
+    : public ::testing::TestWithParam<MaskedSubPixelVarianceParam> {
+ public:
+  virtual ~MaskedSubPixelVarianceTest() {}
+  virtual void SetUp() {
+    opt_func_ = GET_PARAM(0);
+    ref_func_ = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  MaskedSubPixelVarianceFunc opt_func_;
+  MaskedSubPixelVarianceFunc ref_func_;
+};
+
+TEST_P(MaskedSubPixelVarianceTest, OperationCheck) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  // Note: We pad out the input array to a multiple of 16 bytes wide, so that
+  // consecutive rows keep the 16-byte alignment.
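+  // With MAX_SB_SIZE == 128, the stride used below is 128 + 16 = 144, itself
+  // a multiple of 16, so every row of these 16-byte-aligned buffers starts on
+  // a 16-byte boundary.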
+ DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + DECLARE_ALIGNED(16, uint8_t, + second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + int err_count = 0; + int first_failure = -1; + int src_stride = (MAX_SB_SIZE + 16); + int ref_stride = (MAX_SB_SIZE + 16); + int msk_stride = (MAX_SB_SIZE + 16); + int xoffset; + int yoffset; + + for (int i = 0; i < number_of_iterations; ++i) { + int xoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) }; + int yoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) }; + for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16); j++) { + src_ptr[j] = rnd.Rand8(); + ref_ptr[j] = rnd.Rand8(); + second_pred_ptr[j] = rnd.Rand8(); + msk_ptr[j] = rnd(65); + } + for (int k = 0; k < 3; k++) { + for (int l = 0; l < 3; l++) { + xoffset = xoffsets[k]; + yoffset = yoffsets[l]; + for (int invert_mask = 0; invert_mask < 2; ++invert_mask) { + ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr, + ref_stride, second_pred_ptr, msk_ptr, msk_stride, + invert_mask, &ref_sse); + ASM_REGISTER_STATE_CHECK( + opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset, + ref_ptr, ref_stride, second_pred_ptr, msk_ptr, + msk_stride, invert_mask, &opt_sse)); + + if (opt_ret != ref_ret || opt_sse != ref_sse) { + err_count++; + if (first_failure == -1) first_failure = i; + } + } + } + } + } + + EXPECT_EQ(0, err_count) + << "Error: Masked Sub Pixel Variance Test OperationCheck," + << "C output doesn't match SSSE3 output. " + << "First failed at test case " << first_failure; +} + +TEST_P(MaskedSubPixelVarianceTest, ExtremeValues) { + unsigned int ref_ret, opt_ret; + unsigned int ref_sse, opt_sse; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + DECLARE_ALIGNED(16, uint8_t, + second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)]); + int first_failure_x = -1; + int first_failure_y = -1; + int err_count = 0; + int first_failure = -1; + int src_stride = (MAX_SB_SIZE + 16); + int ref_stride = (MAX_SB_SIZE + 16); + int msk_stride = (MAX_SB_SIZE + 16); + + for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) { + for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) { + for (int i = 0; i < 16; ++i) { + memset(src_ptr, (i & 0x1) ? 255 : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)); + memset(ref_ptr, (i & 0x2) ? 255 : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)); + memset(second_pred_ptr, (i & 0x4) ? 255 : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16)); + memset(msk_ptr, (i & 0x8) ? 
64 : 0,
+               (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 16));
+
+        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+                              ref_stride, second_pred_ptr, msk_ptr, msk_stride,
+                              invert_mask, &ref_sse);
+          ASM_REGISTER_STATE_CHECK(
+              opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
+                                  ref_ptr, ref_stride, second_pred_ptr,
+                                  msk_ptr, msk_stride, invert_mask, &opt_sse));
+
+          if (opt_ret != ref_ret || opt_sse != ref_sse) {
+            err_count++;
+            if (first_failure == -1) {
+              first_failure = i;
+              first_failure_x = xoffset;
+              first_failure_y = yoffset;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
+                          << "C output doesn't match SSSE3 output. "
+                          << "First failed at test case " << first_failure
+                          << " x_offset = " << first_failure_x
+                          << " y_offset = " << first_failure_y;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef std::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
+                   aom_bit_depth_t>
+    HighbdMaskedSubPixelVarianceParam;
+
+class HighbdMaskedSubPixelVarianceTest
+    : public ::testing::TestWithParam<HighbdMaskedSubPixelVarianceParam> {
+ public:
+  virtual ~HighbdMaskedSubPixelVarianceTest() {}
+  virtual void SetUp() {
+    opt_func_ = GET_PARAM(0);
+    ref_func_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  MaskedSubPixelVarianceFunc opt_func_;
+  MaskedSubPixelVarianceFunc ref_func_;
+  aom_bit_depth_t bit_depth_;
+};
+
+TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint16_t,
+                  second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]);
+  uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+  uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr);
+  int err_count = 0;
+  int first_failure = -1;
+  int first_failure_x = -1;
+  int first_failure_y = -1;
+  int src_stride = (MAX_SB_SIZE + 8);
+  int ref_stride = (MAX_SB_SIZE + 8);
+  int msk_stride = (MAX_SB_SIZE + 8);
+  int xoffset, yoffset;
+
+  for (int i = 0; i < number_of_iterations; ++i) {
+    for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8); j++) {
+      src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+      ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+      second_pred_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+      msk_ptr[j] = rnd(65);
+    }
+    for (xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+      for (yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+        for (int invert_mask = 0; invert_mask < 2; ++invert_mask) {
+          ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
+                              ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
+                              invert_mask, &ref_sse);
+          ASM_REGISTER_STATE_CHECK(
+              opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
+                                  ref8_ptr, ref_stride, second_pred8_ptr,
+                                  msk_ptr, msk_stride, invert_mask, &opt_sse));
+
+          if (opt_ret != ref_ret || opt_sse != ref_sse) {
+            err_count++;
+            if (first_failure == -1) {
+              first_failure = i;
+              first_failure_x = xoffset;
+              first_failure_y = yoffset;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, err_count)
+      << "Error: Masked Sub Pixel Variance Test OperationCheck,"
+      << "C output doesn't match SSSE3 output. 
" + << "First failed at test case " << first_failure + << " x_offset = " << first_failure_x << " y_offset = " << first_failure_y; +} + +TEST_P(HighbdMaskedSubPixelVarianceTest, ExtremeValues) { + unsigned int ref_ret, opt_ret; + unsigned int ref_sse, opt_sse; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]); + DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]); + DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]); + DECLARE_ALIGNED(16, uint16_t, + second_pred_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)]); + uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr); + uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr); + uint8_t *second_pred8_ptr = CONVERT_TO_BYTEPTR(second_pred_ptr); + int first_failure_x = -1; + int first_failure_y = -1; + int err_count = 0; + int first_failure = -1; + int src_stride = (MAX_SB_SIZE + 8); + int ref_stride = (MAX_SB_SIZE + 8); + int msk_stride = (MAX_SB_SIZE + 8); + + for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) { + for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) { + for (int i = 0; i < 16; ++i) { + aom_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)); + aom_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)); + aom_memset16(second_pred_ptr, (i & 0x4) ? ((1 << bit_depth_) - 1) : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)); + memset(msk_ptr, (i & 0x8) ? 64 : 0, + (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 8)); + + for (int invert_mask = 0; invert_mask < 2; ++invert_mask) { + ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr, + ref_stride, second_pred8_ptr, msk_ptr, msk_stride, + invert_mask, &ref_sse); + ASM_REGISTER_STATE_CHECK( + opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset, + ref8_ptr, ref_stride, second_pred8_ptr, + msk_ptr, msk_stride, invert_mask, &opt_sse)); + + if (opt_ret != ref_ret || opt_sse != ref_sse) { + err_count++; + if (first_failure == -1) { + first_failure = i; + first_failure_x = xoffset; + first_failure_y = yoffset; + } + } + } + } + } + } + + EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues," + << "C output doesn't match SSSE3 output. 
" + << "First failed at test case " << first_failure + << " x_offset = " << first_failure_x + << " y_offset = " << first_failure_y; +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +using std::make_tuple; + +#if HAVE_SSSE3 + +const MaskedSubPixelVarianceParam sub_pel_var_test[] = { + make_tuple(&aom_masked_sub_pixel_variance128x128_ssse3, + &aom_masked_sub_pixel_variance128x128_c), + make_tuple(&aom_masked_sub_pixel_variance128x64_ssse3, + &aom_masked_sub_pixel_variance128x64_c), + make_tuple(&aom_masked_sub_pixel_variance64x128_ssse3, + &aom_masked_sub_pixel_variance64x128_c), + make_tuple(&aom_masked_sub_pixel_variance64x64_ssse3, + &aom_masked_sub_pixel_variance64x64_c), + make_tuple(&aom_masked_sub_pixel_variance64x32_ssse3, + &aom_masked_sub_pixel_variance64x32_c), + make_tuple(&aom_masked_sub_pixel_variance32x64_ssse3, + &aom_masked_sub_pixel_variance32x64_c), + make_tuple(&aom_masked_sub_pixel_variance32x32_ssse3, + &aom_masked_sub_pixel_variance32x32_c), + make_tuple(&aom_masked_sub_pixel_variance32x16_ssse3, + &aom_masked_sub_pixel_variance32x16_c), + make_tuple(&aom_masked_sub_pixel_variance16x32_ssse3, + &aom_masked_sub_pixel_variance16x32_c), + make_tuple(&aom_masked_sub_pixel_variance16x16_ssse3, + &aom_masked_sub_pixel_variance16x16_c), + make_tuple(&aom_masked_sub_pixel_variance16x8_ssse3, + &aom_masked_sub_pixel_variance16x8_c), + make_tuple(&aom_masked_sub_pixel_variance8x16_ssse3, + &aom_masked_sub_pixel_variance8x16_c), + make_tuple(&aom_masked_sub_pixel_variance8x8_ssse3, + &aom_masked_sub_pixel_variance8x8_c), + make_tuple(&aom_masked_sub_pixel_variance8x4_ssse3, + &aom_masked_sub_pixel_variance8x4_c), + make_tuple(&aom_masked_sub_pixel_variance4x8_ssse3, + &aom_masked_sub_pixel_variance4x8_c), + make_tuple(&aom_masked_sub_pixel_variance4x4_ssse3, + &aom_masked_sub_pixel_variance4x4_c), + + make_tuple(&aom_masked_sub_pixel_variance64x16_ssse3, + &aom_masked_sub_pixel_variance64x16_c), + make_tuple(&aom_masked_sub_pixel_variance16x64_ssse3, + &aom_masked_sub_pixel_variance16x64_c), + make_tuple(&aom_masked_sub_pixel_variance32x8_ssse3, + &aom_masked_sub_pixel_variance32x8_c), + make_tuple(&aom_masked_sub_pixel_variance8x32_ssse3, + &aom_masked_sub_pixel_variance8x32_c), + make_tuple(&aom_masked_sub_pixel_variance16x4_ssse3, + &aom_masked_sub_pixel_variance16x4_c), + make_tuple(&aom_masked_sub_pixel_variance4x16_ssse3, + &aom_masked_sub_pixel_variance4x16_c), +}; + +INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, MaskedSubPixelVarianceTest, + ::testing::ValuesIn(sub_pel_var_test)); + +#if CONFIG_AV1_HIGHBITDEPTH +const HighbdMaskedSubPixelVarianceParam hbd_sub_pel_var_test[] = { + make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x128_ssse3, + &aom_highbd_8_masked_sub_pixel_variance128x128_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance128x64_ssse3, + &aom_highbd_8_masked_sub_pixel_variance128x64_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x128_ssse3, + &aom_highbd_8_masked_sub_pixel_variance64x128_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x64_ssse3, + &aom_highbd_8_masked_sub_pixel_variance64x64_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x32_ssse3, + &aom_highbd_8_masked_sub_pixel_variance64x32_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x64_ssse3, + &aom_highbd_8_masked_sub_pixel_variance32x64_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x32_ssse3, + &aom_highbd_8_masked_sub_pixel_variance32x32_c, AOM_BITS_8), + 
make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x16_ssse3, + &aom_highbd_8_masked_sub_pixel_variance32x16_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x32_ssse3, + &aom_highbd_8_masked_sub_pixel_variance16x32_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x16_ssse3, + &aom_highbd_8_masked_sub_pixel_variance16x16_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x8_ssse3, + &aom_highbd_8_masked_sub_pixel_variance16x8_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x16_ssse3, + &aom_highbd_8_masked_sub_pixel_variance8x16_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x8_ssse3, + &aom_highbd_8_masked_sub_pixel_variance8x8_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x4_ssse3, + &aom_highbd_8_masked_sub_pixel_variance8x4_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x8_ssse3, + &aom_highbd_8_masked_sub_pixel_variance4x8_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x4_ssse3, + &aom_highbd_8_masked_sub_pixel_variance4x4_c, AOM_BITS_8), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x128_ssse3, + &aom_highbd_10_masked_sub_pixel_variance128x128_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x64_ssse3, + &aom_highbd_10_masked_sub_pixel_variance128x64_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x128_ssse3, + &aom_highbd_10_masked_sub_pixel_variance64x128_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x64_ssse3, + &aom_highbd_10_masked_sub_pixel_variance64x64_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x32_ssse3, + &aom_highbd_10_masked_sub_pixel_variance64x32_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x64_ssse3, + &aom_highbd_10_masked_sub_pixel_variance32x64_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x32_ssse3, + &aom_highbd_10_masked_sub_pixel_variance32x32_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x16_ssse3, + &aom_highbd_10_masked_sub_pixel_variance32x16_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x32_ssse3, + &aom_highbd_10_masked_sub_pixel_variance16x32_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x16_ssse3, + &aom_highbd_10_masked_sub_pixel_variance16x16_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x8_ssse3, + &aom_highbd_10_masked_sub_pixel_variance16x8_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x16_ssse3, + &aom_highbd_10_masked_sub_pixel_variance8x16_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x8_ssse3, + &aom_highbd_10_masked_sub_pixel_variance8x8_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x4_ssse3, + &aom_highbd_10_masked_sub_pixel_variance8x4_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x8_ssse3, + &aom_highbd_10_masked_sub_pixel_variance4x8_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x4_ssse3, + &aom_highbd_10_masked_sub_pixel_variance4x4_c, AOM_BITS_10), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x128_ssse3, + &aom_highbd_12_masked_sub_pixel_variance128x128_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x64_ssse3, + &aom_highbd_12_masked_sub_pixel_variance128x64_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x128_ssse3, + 
&aom_highbd_12_masked_sub_pixel_variance64x128_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x64_ssse3, + &aom_highbd_12_masked_sub_pixel_variance64x64_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x32_ssse3, + &aom_highbd_12_masked_sub_pixel_variance64x32_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x64_ssse3, + &aom_highbd_12_masked_sub_pixel_variance32x64_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x32_ssse3, + &aom_highbd_12_masked_sub_pixel_variance32x32_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x16_ssse3, + &aom_highbd_12_masked_sub_pixel_variance32x16_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x32_ssse3, + &aom_highbd_12_masked_sub_pixel_variance16x32_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x16_ssse3, + &aom_highbd_12_masked_sub_pixel_variance16x16_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x8_ssse3, + &aom_highbd_12_masked_sub_pixel_variance16x8_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x16_ssse3, + &aom_highbd_12_masked_sub_pixel_variance8x16_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x8_ssse3, + &aom_highbd_12_masked_sub_pixel_variance8x8_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x4_ssse3, + &aom_highbd_12_masked_sub_pixel_variance8x4_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x8_ssse3, + &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_ssse3, + &aom_highbd_12_masked_sub_pixel_variance4x4_c, AOM_BITS_12), + + make_tuple(&aom_highbd_8_masked_sub_pixel_variance64x16_ssse3, + &aom_highbd_8_masked_sub_pixel_variance64x16_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x64_ssse3, + &aom_highbd_8_masked_sub_pixel_variance16x64_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance32x8_ssse3, + &aom_highbd_8_masked_sub_pixel_variance32x8_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance8x32_ssse3, + &aom_highbd_8_masked_sub_pixel_variance8x32_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance16x4_ssse3, + &aom_highbd_8_masked_sub_pixel_variance16x4_c, AOM_BITS_8), + make_tuple(&aom_highbd_8_masked_sub_pixel_variance4x16_ssse3, + &aom_highbd_8_masked_sub_pixel_variance4x16_c, AOM_BITS_8), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x16_ssse3, + &aom_highbd_10_masked_sub_pixel_variance64x16_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x64_ssse3, + &aom_highbd_10_masked_sub_pixel_variance16x64_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x8_ssse3, + &aom_highbd_10_masked_sub_pixel_variance32x8_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x32_ssse3, + &aom_highbd_10_masked_sub_pixel_variance8x32_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x4_ssse3, + &aom_highbd_10_masked_sub_pixel_variance16x4_c, AOM_BITS_10), + make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x16_ssse3, + &aom_highbd_10_masked_sub_pixel_variance4x16_c, AOM_BITS_10), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x16_ssse3, + &aom_highbd_12_masked_sub_pixel_variance64x16_c, AOM_BITS_12), + make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x64_ssse3, + &aom_highbd_12_masked_sub_pixel_variance16x64_c, AOM_BITS_12), + 
make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x8_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance32x8_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x32_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance8x32_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x4_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance16x4_c, AOM_BITS_12),
+  make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x16_ssse3,
+             &aom_highbd_12_masked_sub_pixel_variance4x16_c, AOM_BITS_12),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
+                         ::testing::ValuesIn(hbd_sub_pel_var_test));
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+#endif  // HAVE_SSSE3
+}  // namespace
diff --git a/libs/libaom/src/test/md5_helper.h b/libs/libaom/src/test/md5_helper.h
new file mode 100644
index 000000000..9443cb262
--- /dev/null
+++ b/libs/libaom/src/test/md5_helper.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_MD5_HELPER_H_
+#define AOM_TEST_MD5_HELPER_H_
+
+#include "aom/aom_decoder.h"
+#include "common/md5_utils.h"
+
+namespace libaom_test {
+class MD5 {
+ public:
+  MD5() { MD5Init(&md5_); }
+
+  void Add(const aom_image_t *img) {
+    for (int plane = 0; plane < 3; ++plane) {
+      const uint8_t *buf = img->planes[plane];
+      // Calculate the width and height to do the md5 check. For the chroma
+      // plane, we never want to round down and thus skip a pixel so if
+      // we are shifting by 1 (chroma_shift) we add 1 before doing the shift.
+      // This works only for chroma_shift of 0 and 1.
+      const int bytes_per_sample =
+          (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
+      const int h =
+          plane ? (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift
+                : img->d_h;
+      const int w =
+          (plane ? (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift
+                 : img->d_w) *
+          bytes_per_sample;
+
+      for (int y = 0; y < h; ++y) {
+        MD5Update(&md5_, buf, w);
+        buf += img->stride[plane];
+      }
+    }
+  }
+
+  void Add(const uint8_t *data, size_t size) {
+    MD5Update(&md5_, data, static_cast<uint32_t>(size));
+  }
+
+  const char *Get(void) {
+    static const char hex[16] = {
+      '0', '1', '2', '3', '4', '5', '6', '7',
+      '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+    };
+    uint8_t tmp[16];
+    MD5Context ctx_tmp = md5_;
+
+    MD5Final(tmp, &ctx_tmp);
+    for (int i = 0; i < 16; i++) {
+      res_[i * 2 + 0] = hex[tmp[i] >> 4];
+      res_[i * 2 + 1] = hex[tmp[i] & 0xf];
+    }
+    res_[32] = 0;
+
+    return res_;
+  }
+
+ protected:
+  char res_[33];
+  MD5Context md5_;
+};
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_MD5_HELPER_H_
diff --git a/libs/libaom/src/test/metadata_test.cc b/libs/libaom/src/test/metadata_test.cc
new file mode 100644
index 000000000..79e08a7a5
--- /dev/null
+++ b/libs/libaom/src/test/metadata_test.cc
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_image.h"
+#include "aom/internal/aom_image_internal.h"
+#include "aom_scale/yv12config.h"
+#include "av1/encoder/bitstream.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+
+namespace {
+const size_t kMetadataPayloadSizeT35 = 24;
+// 0xB5 stands for the itut t35 metadata country code for the United States
+const uint8_t kMetadataPayloadT35[kMetadataPayloadSizeT35] = {
+  0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
+  0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+};
+
+const size_t kMetadataPayloadSizeCll = 4;
+const uint8_t kMetadataPayloadCll[kMetadataPayloadSizeCll] = { 0xB5, 0x01, 0x02,
+                                                               0x03 };
+
+#if CONFIG_AV1_ENCODER
+
+const size_t kMetadataObuSizeT35 = 28;
+const uint8_t kMetadataObuT35[kMetadataObuSizeT35] = {
+  0x2A, 0x1A, 0x02, 0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+  0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
+  0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x80
+};
+const size_t kMetadataObuSizeMdcv = 28;
+const uint8_t kMetadataObuMdcv[kMetadataObuSizeMdcv] = {
+  0x2A, 0x1A, 0x02, 0xB5, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+  0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
+  0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x80
+};
+const size_t kMetadataObuSizeCll = 8;
+const uint8_t kMetadataObuCll[kMetadataObuSizeCll] = { 0x2A, 0x06, 0x01, 0xB5,
+                                                       0x01, 0x02, 0x03, 0x80 };
+
+class MetadataEncodeTest
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  MetadataEncodeTest() : EncoderTest(GET_PARAM(0)) {}
+
+  virtual ~MetadataEncodeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video) {
+    aom_image_t *current_frame = video->img();
+    if (current_frame) {
+      if (current_frame->metadata) aom_img_remove_metadata(current_frame);
+      ASSERT_EQ(aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                                     kMetadataPayloadT35, 0, AOM_MIF_ANY_FRAME),
+                -1);
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35, NULL,
+                               kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME),
+          -1);
+      ASSERT_EQ(aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                                     NULL, 0, AOM_MIF_ANY_FRAME),
+                -1);
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_ITUT_T35,
+                               kMetadataPayloadT35, kMetadataPayloadSizeT35,
+                               AOM_MIF_ANY_FRAME),
+          0);
+
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_HDR_MDCV,
+                               kMetadataPayloadT35, kMetadataPayloadSizeT35,
+                               AOM_MIF_KEY_FRAME),
+          0);
+
+      ASSERT_EQ(
+          aom_img_add_metadata(current_frame, OBU_METADATA_TYPE_HDR_CLL,
+                               kMetadataPayloadCll, kMetadataPayloadSizeCll,
+                               AOM_MIF_KEY_FRAME),
+          0);
+    }
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    if (pkt->kind == AOM_CODEC_CX_FRAME_PKT) {
+      const size_t bitstream_size = pkt->data.frame.sz;
+      const uint8_t *bitstream =
+          static_cast<const uint8_t *>(pkt->data.frame.buf);
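+      // The kMetadataObu* arrays above are complete metadata OBUs: a one-byte
+      // OBU header, a one-byte payload size, the leb128 metadata_type, the
+      // payload bytes, and a trailing-bits byte.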
+      // look for valid metadata in the bitstream
+      bool itut_t35_metadata_found = false;
+      if (bitstream_size >= kMetadataObuSizeT35) {
+        for (size_t i = 0; i <= bitstream_size - kMetadataObuSizeT35; ++i) {
+          if (memcmp(bitstream + i, kMetadataObuT35, kMetadataObuSizeT35) ==
+              0) {
+            itut_t35_metadata_found = true;
+          }
+        }
+      }
+      ASSERT_TRUE(itut_t35_metadata_found);
+
+      // Testing for HDR MDCV metadata
+      bool hdr_mdcv_metadata_found = false;
+      if (bitstream_size >= kMetadataObuSizeMdcv) {
+        for (size_t i = 0; i <= bitstream_size - kMetadataObuSizeMdcv; ++i) {
+          if (memcmp(bitstream + i, kMetadataObuMdcv, kMetadataObuSizeMdcv) ==
+              0) {
+            hdr_mdcv_metadata_found = true;
+          }
+        }
+      }
+      ASSERT_TRUE(hdr_mdcv_metadata_found);
+
+      // Testing for HDR CLL metadata
+      bool hdr_cll_metadata_found = false;
+      if (bitstream_size >= kMetadataObuSizeCll) {
+        for (size_t i = 0; i <= bitstream_size - kMetadataObuSizeCll; ++i) {
+          if (memcmp(bitstream + i, kMetadataObuCll, kMetadataObuSizeCll) ==
+              0) {
+            hdr_cll_metadata_found = true;
+          }
+        }
+      }
+      ASSERT_TRUE(hdr_cll_metadata_found);
+    }
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     aom_codec_pts_t /*pts*/) {
+    ASSERT_TRUE(img.metadata != nullptr);
+
+    ASSERT_EQ(img.metadata->sz, 3u);
+
+    for (size_t i = 0; i < img.metadata->sz - 1; ++i) {
+      ASSERT_EQ(kMetadataPayloadSizeT35, img.metadata->metadata_array[i]->sz);
+      EXPECT_EQ(
+          memcmp(kMetadataPayloadT35, img.metadata->metadata_array[i]->payload,
+                 kMetadataPayloadSizeT35),
+          0);
+    }
+
+    ASSERT_EQ(kMetadataPayloadSizeCll, img.metadata->metadata_array[2]->sz);
+    EXPECT_EQ(
+        memcmp(kMetadataPayloadCll, img.metadata->metadata_array[2]->payload,
+               kMetadataPayloadSizeCll),
+        0);
+  }
+};
+
+TEST_P(MetadataEncodeTest, TestMetadataEncoding) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 5);
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 600;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_undershoot_pct = 50;
+  cfg_.rc_overshoot_pct = 50;
+  cfg_.rc_end_usage = AOM_CBR;
+  cfg_.kf_mode = AOM_KF_AUTO;
+  cfg_.g_lag_in_frames = 1;
+  cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
+  // Enable dropped frames.
+  cfg_.rc_dropframe_thresh = 1;
+  // Disable error_resilience mode.
+  cfg_.g_error_resilient = 0;
+  // Run at low bitrate.
+ cfg_.rc_target_bitrate = 40; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +AV1_INSTANTIATE_TEST_CASE(MetadataEncodeTest, + ::testing::Values(::libaom_test::kOnePassGood)); + +#endif // CONFIG_AV1_ENCODER +} // namespace + +TEST(MetadataTest, MetadataAllocation) { + aom_metadata_t *metadata = + aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35, + kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME); + ASSERT_NE(metadata, nullptr); + aom_img_metadata_free(metadata); +} + +TEST(MetadataTest, MetadataArrayAllocation) { + aom_metadata_array_t *metadata_array = aom_img_metadata_array_alloc(2); + ASSERT_NE(metadata_array, nullptr); + + metadata_array->metadata_array[0] = + aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35, + kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME); + metadata_array->metadata_array[1] = + aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35, + kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME); + + aom_img_metadata_array_free(metadata_array); +} + +TEST(MetadataTest, AddMetadataToImage) { + aom_image_t image; + image.metadata = NULL; + + ASSERT_EQ(aom_img_add_metadata(&image, OBU_METADATA_TYPE_ITUT_T35, + kMetadataPayloadT35, kMetadataPayloadSizeT35, + AOM_MIF_ANY_FRAME), + 0); + aom_img_metadata_array_free(image.metadata); + EXPECT_EQ(aom_img_add_metadata(NULL, OBU_METADATA_TYPE_ITUT_T35, + kMetadataPayloadT35, kMetadataPayloadSizeT35, + AOM_MIF_ANY_FRAME), + -1); +} + +TEST(MetadataTest, RemoveMetadataFromImage) { + aom_image_t image; + image.metadata = NULL; + + ASSERT_EQ(aom_img_add_metadata(&image, OBU_METADATA_TYPE_ITUT_T35, + kMetadataPayloadT35, kMetadataPayloadSizeT35, + AOM_MIF_ANY_FRAME), + 0); + aom_img_remove_metadata(&image); + aom_img_remove_metadata(NULL); +} + +TEST(MetadataTest, CopyMetadataToFrameBuffer) { + YV12_BUFFER_CONFIG yvBuf; + yvBuf.metadata = NULL; + + aom_metadata_array_t *metadata_array = aom_img_metadata_array_alloc(1); + ASSERT_NE(metadata_array, nullptr); + + metadata_array->metadata_array[0] = + aom_img_metadata_alloc(OBU_METADATA_TYPE_ITUT_T35, kMetadataPayloadT35, + kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME); + + // Metadata_array + int status = aom_copy_metadata_to_frame_buffer(&yvBuf, metadata_array); + EXPECT_EQ(status, 0); + status = aom_copy_metadata_to_frame_buffer(NULL, metadata_array); + EXPECT_EQ(status, -1); + aom_img_metadata_array_free(metadata_array); + + // Metadata_array_2 + aom_metadata_array_t *metadata_array_2 = aom_img_metadata_array_alloc(0); + ASSERT_NE(metadata_array_2, nullptr); + status = aom_copy_metadata_to_frame_buffer(&yvBuf, metadata_array_2); + EXPECT_EQ(status, -1); + aom_img_metadata_array_free(metadata_array_2); + + // YV12_BUFFER_CONFIG + status = aom_copy_metadata_to_frame_buffer(&yvBuf, NULL); + EXPECT_EQ(status, -1); + aom_remove_metadata_from_frame_buffer(&yvBuf); + aom_remove_metadata_from_frame_buffer(NULL); +} + +TEST(MetadataTest, GetMetadataFromImage) { + aom_image_t image; + image.metadata = NULL; + + ASSERT_EQ(aom_img_add_metadata(&image, OBU_METADATA_TYPE_ITUT_T35, + kMetadataPayloadT35, kMetadataPayloadSizeT35, + AOM_MIF_ANY_FRAME), + 0); + + EXPECT_TRUE(aom_img_get_metadata(NULL, 0) == NULL); + EXPECT_TRUE(aom_img_get_metadata(&image, 1u) == NULL); + EXPECT_TRUE(aom_img_get_metadata(&image, 10u) == NULL); + + const aom_metadata_t *metadata = aom_img_get_metadata(&image, 0); + ASSERT_TRUE(metadata != NULL); + ASSERT_EQ(metadata->sz, kMetadataPayloadSizeT35); + EXPECT_EQ( + memcmp(kMetadataPayloadT35, metadata->payload, 
kMetadataPayloadSizeT35),
+      0);
+
+  aom_img_metadata_array_free(image.metadata);
+}
+
+TEST(MetadataTest, ReadMetadatasFromImage) {
+  aom_image_t image;
+  image.metadata = NULL;
+
+  uint32_t types[3];
+  types[0] = OBU_METADATA_TYPE_ITUT_T35;
+  types[1] = OBU_METADATA_TYPE_HDR_CLL;
+  types[2] = OBU_METADATA_TYPE_HDR_MDCV;
+
+  ASSERT_EQ(aom_img_add_metadata(&image, types[0], kMetadataPayloadT35,
+                                 kMetadataPayloadSizeT35, AOM_MIF_ANY_FRAME),
+            0);
+  ASSERT_EQ(aom_img_add_metadata(&image, types[1], kMetadataPayloadT35,
+                                 kMetadataPayloadSizeT35, AOM_MIF_KEY_FRAME),
+            0);
+  ASSERT_EQ(aom_img_add_metadata(&image, types[2], kMetadataPayloadT35,
+                                 kMetadataPayloadSizeT35, AOM_MIF_KEY_FRAME),
+            0);
+
+  size_t number_metadata = aom_img_num_metadata(&image);
+  ASSERT_EQ(number_metadata, 3u);
+  for (size_t i = 0; i < number_metadata; ++i) {
+    const aom_metadata_t *metadata = aom_img_get_metadata(&image, i);
+    ASSERT_TRUE(metadata != NULL);
+    ASSERT_EQ(metadata->type, types[i]);
+    ASSERT_EQ(metadata->sz, kMetadataPayloadSizeT35);
+    EXPECT_EQ(
+        memcmp(kMetadataPayloadT35, metadata->payload, kMetadataPayloadSizeT35),
+        0);
+  }
+  aom_img_metadata_array_free(image.metadata);
+}
diff --git a/libs/libaom/src/test/metrics_template.html b/libs/libaom/src/test/metrics_template.html
new file mode 100644
index 000000000..b57c62314
--- /dev/null
+++ b/libs/libaom/src/test/metrics_template.html
@@ -0,0 +1,422 @@
+[422 lines of HTML elided: a "Video Codec Test Results" page presenting codec
+comparison results, with a control for the method of combining points (average
+of bitrate differences, BDSNR, or BDRATE) and an "Indicators" panel; the
+markup itself is not recoverable from this extraction.]
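(For reference, and not taken from the template itself: BDSNR and BDRATE are
the standard Bjøntegaard metrics. BD-rate averages the log-rate gap between
two fitted rate-distortion curves over their shared quality range,

\[
\text{BD-rate} = 10^{\frac{1}{D_h - D_l}\int_{D_l}^{D_h}\left(\log_{10} R_2(D) - \log_{10} R_1(D)\right)dD} - 1,
\]

where \(R_1\) and \(R_2\) are the interpolated rate curves and \([D_l, D_h]\)
is the overlapping PSNR interval; BDSNR instead averages the PSNR gap at equal
rate.)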
diff --git a/libs/libaom/src/test/monochrome_test.cc b/libs/libaom/src/test/monochrome_test.cc
new file mode 100644
index 000000000..ebccba584
--- /dev/null
+++ b/libs/libaom/src/test/monochrome_test.cc
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/video_source.h"
+#include "test/util.h"
+
+namespace {
+
+class MonochromeTest
+    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  MonochromeTest() : EncoderTest(GET_PARAM(0)), frame0_psnr_y_(0.) {}
+
+  virtual ~MonochromeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     aom_codec_pts_t pts) {
+    (void)pts;
+
+    // Get value of top-left corner pixel of U plane
+    int chroma_value = img.planes[AOM_PLANE_U][0];
+
+    bool is_chroma_constant =
+        ComparePlaneToValue(img, AOM_PLANE_U, chroma_value) &&
+        ComparePlaneToValue(img, AOM_PLANE_V, chroma_value);
+
+    // Chroma planes should be constant
+    EXPECT_TRUE(is_chroma_constant);
+
+    // Monochrome flag on image should be set
+    EXPECT_EQ(img.monochrome, 1);
+
+    chroma_value_list_.push_back(chroma_value);
+  }
+
+  // Returns true if all pixels on the plane are equal to value, and returns
+  // false otherwise.
+  bool ComparePlaneToValue(const aom_image_t &img, const int plane,
+                           const int value) {
+    const int w = aom_img_plane_width(&img, plane);
+    const int h = aom_img_plane_height(&img, plane);
+    const uint8_t *const buf = img.planes[plane];
+    const int stride = img.stride[plane];
+
+    for (int r = 0; r < h; ++r) {
+      for (int c = 0; c < w; ++c) {
+        if (buf[r * stride + c] != value) return false;
+      }
+    }
+    return true;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    // Check that the initial Y PSNR value is 'high enough', and check that
+    // subsequent Y PSNR values are 'close' to this initial value.
+    if (frame0_psnr_y_ == 0.) {
+      frame0_psnr_y_ = pkt->data.psnr.psnr[1];
+      EXPECT_GT(frame0_psnr_y_, 29.);
+    }
+    EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_, 2.5);
+  }
+
+  std::vector<int> chroma_value_list_;
+  double frame0_psnr_y_;
+};
+
+TEST_P(MonochromeTest, TestMonochromeEncoding) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 5);
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  cfg_.g_w = 352;
+  cfg_.g_h = 288;
+
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 600;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_min_quantizer = 2;
+  cfg_.rc_max_quantizer = 56;
+  cfg_.rc_undershoot_pct = 50;
+  cfg_.rc_overshoot_pct = 50;
+  cfg_.rc_end_usage = AOM_CBR;
+  cfg_.kf_mode = AOM_KF_AUTO;
+  cfg_.g_lag_in_frames = 1;
+  cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
+  // Enable dropped frames.
+  cfg_.rc_dropframe_thresh = 1;
+  // Disable error_resilience mode.
+  cfg_.g_error_resilient = 0;
+  // Run at low bitrate.
+  cfg_.rc_target_bitrate = 40;
+  // Set monochrome encoding flag.
+  cfg_.monochrome = 1;
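+  // With the monochrome flag set, only the luma plane is coded; every decoded
+  // frame should then report monochrome == 1 and carry the flat chroma planes
+  // that DecompressedFrameHook verifies above.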
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Check that the chroma planes are equal across all frames
+  std::vector<int>::const_iterator iter = chroma_value_list_.begin();
+  int initial_chroma_value = *iter;
+  for (; iter != chroma_value_list_.end(); ++iter) {
+    // Check that all decoded frames have the same constant chroma planes.
+    EXPECT_EQ(*iter, initial_chroma_value);
+  }
+}
+
+AV1_INSTANTIATE_TEST_CASE(MonochromeTest,
+                          ::testing::Values(::libaom_test::kTwoPassGood));
+
+}  // namespace
diff --git a/libs/libaom/src/test/motion_vector_test.cc b/libs/libaom/src/test/motion_vector_test.cc
new file mode 100644
index 000000000..2636c39aa
--- /dev/null
+++ b/libs/libaom/src/test/motion_vector_test.cc
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+#define MAX_EXTREME_MV 1
+#define MIN_EXTREME_MV 2
+
+// Encoding modes
+const libaom_test::TestMode kEncodingModeVectors[] = {
+  ::libaom_test::kTwoPassGood,
+  ::libaom_test::kOnePassGood,
+};
+
+// Encoding speeds
+const int kCpuUsedVectors[] = { 1, 5 };
+
+// MV test modes: 1 - always use maximum MV; 2 - always use minimum MV.
+const int kMVTestModes[] = { MAX_EXTREME_MV, MIN_EXTREME_MV };
+
+class MotionVectorTestLarge
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  MotionVectorTestLarge()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {}
+
+  virtual ~MotionVectorTestLarge() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+    if (encoding_mode_ != ::libaom_test::kRealTime) {
+      cfg_.g_lag_in_frames = 3;
+      cfg_.rc_end_usage = AOM_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = AOM_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
+      if (encoding_mode_ != ::libaom_test::kRealTime) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      }
+    }
+  }
+
+  libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  int mv_test_mode_;
+};
+
+TEST_P(MotionVectorTestLarge, OverallTest) {
+  int width = 3840;
+  int height = 2160;
+
+  // Reduce the test clip's resolution while testing on 32-bit system.
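+  // (A single 3840x2160 I420 frame already takes about 12 MB, and the encoder
+  // keeps several such buffers alive, so a 32-bit address space is easily
+  // exhausted.)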
+  if (sizeof(void *) == 4) {
+    width = 2048;
+    height = 360;
+  }
+
+  cfg_.rc_target_bitrate = 24000;
+  cfg_.g_profile = 0;
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  std::unique_ptr<libaom_test::YUVVideoSource> video;
+  video.reset(new libaom_test::YUVVideoSource(
+      "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, width, height, 30, 1, 0, 3));
+
+  ASSERT_TRUE(video.get() != NULL);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+AV1_INSTANTIATE_TEST_CASE(MotionVectorTestLarge,
+                          ::testing::ValuesIn(kEncodingModeVectors),
+                          ::testing::ValuesIn(kCpuUsedVectors),
+                          ::testing::ValuesIn(kMVTestModes));
+}  // namespace
diff --git a/libs/libaom/src/test/noise_model_test.cc b/libs/libaom/src/test/noise_model_test.cc
new file mode 100644
index 000000000..5b61236f0
--- /dev/null
+++ b/libs/libaom/src/test/noise_model_test.cc
@@ -0,0 +1,1343 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <algorithm>
+#include <vector>
+
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "config/aom_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+// Return normally distributed values with standard deviation of sigma.
+double randn(libaom_test::ACMRandom *random, double sigma) {
+  while (1) {
+    const double u = 2.0 * ((double)random->Rand31() /
+                            testing::internal::Random::kMaxRange) -
+                     1.0;
+    const double v = 2.0 * ((double)random->Rand31() /
+                            testing::internal::Random::kMaxRange) -
+                     1.0;
+    const double s = u * u + v * v;
+    if (s > 0 && s < 1) {
+      return sigma * (u * sqrt(-2.0 * log(s) / s));
+    }
+  }
+  return 0;
+}
+
+// Synthesizes noise using the auto-regressive filter of the given lag,
+// with the provided n coefficients sampled at the given coords.
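+// Concretely: each padded sample ends up as its Gaussian innovation plus
+// sum_{i < n} coeffs[i] * padded[(y + coords[i][1]) * padded_w + (x + coords[i][0])],
+// i.e. an AR(lag) process of the same form the noise model estimates.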
+void noise_synth(libaom_test::ACMRandom *random, int lag, int n,
+                 const int (*coords)[2], const double *coeffs, double *data,
+                 int w, int h) {
+  const int pad_size = 3 * lag;
+  const int padded_w = w + pad_size;
+  const int padded_h = h + pad_size;
+  int x = 0, y = 0;
+  std::vector<double> padded(padded_w * padded_h);
+
+  for (y = 0; y < padded_h; ++y) {
+    for (x = 0; x < padded_w; ++x) {
+      padded[y * padded_w + x] = randn(random, 1.0);
+    }
+  }
+  for (y = lag; y < padded_h; ++y) {
+    for (x = lag; x < padded_w; ++x) {
+      double sum = 0;
+      int i = 0;
+      for (i = 0; i < n; ++i) {
+        const int dx = coords[i][0];
+        const int dy = coords[i][1];
+        sum += padded[(y + dy) * padded_w + (x + dx)] * coeffs[i];
+      }
+      padded[y * padded_w + x] += sum;
+    }
+  }
+  // Copy over the padded rows to the output
+  for (y = 0; y < h; ++y) {
+    memcpy(data + y * w, &padded[0] + y * padded_w, sizeof(*data) * w);
+  }
+}
+
+std::vector<float> get_noise_psd(double *noise, int width, int height,
+                                 int block_size) {
+  float *block =
+      (float *)aom_memalign(32, block_size * block_size * sizeof(block));
+  std::vector<float> psd(block_size * block_size);
+  int num_blocks = 0;
+  struct aom_noise_tx_t *tx = aom_noise_tx_malloc(block_size);
+  for (int y = 0; y <= height - block_size; y += block_size / 2) {
+    for (int x = 0; x <= width - block_size; x += block_size / 2) {
+      for (int yy = 0; yy < block_size; ++yy) {
+        for (int xx = 0; xx < block_size; ++xx) {
+          block[yy * block_size + xx] = (float)noise[(y + yy) * width + x + xx];
+        }
+      }
+      aom_noise_tx_forward(tx, &block[0]);
+      aom_noise_tx_add_energy(tx, &psd[0]);
+      num_blocks++;
+    }
+  }
+  for (int yy = 0; yy < block_size; ++yy) {
+    for (int xx = 0; xx <= block_size / 2; ++xx) {
+      psd[yy * block_size + xx] /= num_blocks;
+    }
+  }
+  // Fill in the data that is missing due to symmetries
+  for (int xx = 1; xx < block_size / 2; ++xx) {
+    psd[(block_size - xx)] = psd[xx];
+  }
+  for (int yy = 1; yy < block_size; ++yy) {
+    for (int xx = 1; xx < block_size / 2; ++xx) {
+      psd[(block_size - yy) * block_size + (block_size - xx)] =
+          psd[yy * block_size + xx];
+    }
+  }
+  aom_noise_tx_free(tx);
+  aom_free(block);
+  return psd;
+}
+
+}  // namespace
+
+TEST(NoiseStrengthSolver, GetCentersTwoBins) {
+  aom_noise_strength_solver_t solver;
+  aom_noise_strength_solver_init(&solver, 2, 8);
+  EXPECT_NEAR(0, aom_noise_strength_solver_get_center(&solver, 0), 1e-5);
+  EXPECT_NEAR(255, aom_noise_strength_solver_get_center(&solver, 1), 1e-5);
+  aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthSolver, GetCentersTwoBins10bit) {
+  aom_noise_strength_solver_t solver;
+  aom_noise_strength_solver_init(&solver, 2, 10);
+  EXPECT_NEAR(0, aom_noise_strength_solver_get_center(&solver, 0), 1e-5);
+  EXPECT_NEAR(1023, aom_noise_strength_solver_get_center(&solver, 1), 1e-5);
+  aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthSolver, GetCenters256Bins) {
+  const int num_bins = 256;
+  aom_noise_strength_solver_t solver;
+  aom_noise_strength_solver_init(&solver, num_bins, 8);
+
+  for (int i = 0; i < 256; ++i) {
+    EXPECT_NEAR(i, aom_noise_strength_solver_get_center(&solver, i), 1e-5);
+  }
+  aom_noise_strength_solver_free(&solver);
+}
+
+// Tests that the noise strength solver returns the identity transform when
+// given identity-like constraints.
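+// ("Identity" meaning the fitted strength function satisfies f(x) ~= x, so
+// after solving, bin i should hold a value near i and the piecewise fit
+// should collapse to the two endpoints (0, 0) and (255, 255).)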
+TEST(NoiseStrengthSolver, ObserveIdentity) {
+  const int num_bins = 256;
+  aom_noise_strength_solver_t solver;
+  EXPECT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
+
+  // We have to add a bit more strength to constraints at the boundary to
+  // overcome any regularization.
+  for (int j = 0; j < 5; ++j) {
+    aom_noise_strength_solver_add_measurement(&solver, 0, 0);
+    aom_noise_strength_solver_add_measurement(&solver, 255, 255);
+  }
+  for (int i = 0; i < 256; ++i) {
+    aom_noise_strength_solver_add_measurement(&solver, i, i);
+  }
+  EXPECT_EQ(1, aom_noise_strength_solver_solve(&solver));
+  for (int i = 2; i < num_bins - 2; ++i) {
+    EXPECT_NEAR(i, solver.eqns.x[i], 0.1);
+  }
+
+  aom_noise_strength_lut_t lut;
+  EXPECT_EQ(1, aom_noise_strength_solver_fit_piecewise(&solver, 2, &lut));
+
+  ASSERT_EQ(2, lut.num_points);
+  EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+  EXPECT_NEAR(0.0, lut.points[0][1], 0.5);
+  EXPECT_NEAR(255.0, lut.points[1][0], 1e-5);
+  EXPECT_NEAR(255.0, lut.points[1][1], 0.5);
+
+  aom_noise_strength_lut_free(&lut);
+  aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthSolver, SimplifiesCurve) {
+  const int num_bins = 256;
+  aom_noise_strength_solver_t solver;
+  EXPECT_EQ(1, aom_noise_strength_solver_init(&solver, num_bins, 8));
+
+  // Create a parabolic input
+  for (int i = 0; i < 256; ++i) {
+    const double x = (i - 127.5) / 63.5;
+    aom_noise_strength_solver_add_measurement(&solver, i, x * x);
+  }
+  EXPECT_EQ(1, aom_noise_strength_solver_solve(&solver));
+
+  // First try to fit an unconstrained lut
+  aom_noise_strength_lut_t lut;
+  EXPECT_EQ(1, aom_noise_strength_solver_fit_piecewise(&solver, -1, &lut));
+  ASSERT_LE(20, lut.num_points);
+  aom_noise_strength_lut_free(&lut);
+
+  // Now constrain the maximum number of points
+  const int kMaxPoints = 9;
+  EXPECT_EQ(1,
+            aom_noise_strength_solver_fit_piecewise(&solver, kMaxPoints, &lut));
+  ASSERT_EQ(kMaxPoints, lut.num_points);
+
+  // Check that the input parabola is still well represented
+  EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+  EXPECT_NEAR(4.0, lut.points[0][1], 0.1);
+  for (int i = 1; i < lut.num_points - 1; ++i) {
+    const double x = (lut.points[i][0] - 128.) / 64.;
+    EXPECT_NEAR(x * x, lut.points[i][1], 0.1);
+  }
+  EXPECT_NEAR(255.0, lut.points[kMaxPoints - 1][0], 1e-5);
+
+  EXPECT_NEAR(4.0, lut.points[kMaxPoints - 1][1], 0.1);
+  aom_noise_strength_lut_free(&lut);
+  aom_noise_strength_solver_free(&solver);
+}
+
+TEST(NoiseStrengthLut, LutEvalSinglePoint) {
+  aom_noise_strength_lut_t lut;
+  ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 1));
+  ASSERT_EQ(1, lut.num_points);
+  lut.points[0][0] = 0;
+  lut.points[0][1] = 1;
+  EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, -1));
+  EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, 0));
+  EXPECT_EQ(1, aom_noise_strength_lut_eval(&lut, 1));
+  aom_noise_strength_lut_free(&lut);
+}
+
+TEST(NoiseStrengthLut, LutEvalMultiPointInterp) {
+  const double kEps = 1e-5;
+  aom_noise_strength_lut_t lut;
+  ASSERT_TRUE(aom_noise_strength_lut_init(&lut, 4));
+  ASSERT_EQ(4, lut.num_points);
+
+  lut.points[0][0] = 0;
+  lut.points[0][1] = 0;
+
+  lut.points[1][0] = 1;
+  lut.points[1][1] = 1;
+
+  lut.points[2][0] = 2;
+  lut.points[2][1] = 1;
+
+  lut.points[3][0] = 100;
+  lut.points[3][1] = 1001;
+
+  // Test lower boundary
+  EXPECT_EQ(0, aom_noise_strength_lut_eval(&lut, -1));
+  EXPECT_EQ(0, aom_noise_strength_lut_eval(&lut, 0));
+
+  // Test first part that should be identity
+  EXPECT_NEAR(0.25, aom_noise_strength_lut_eval(&lut, 0.25), kEps);
+  EXPECT_NEAR(0.75, aom_noise_strength_lut_eval(&lut, 0.75), kEps);
+
+  // This is a constant section (should evaluate to 1)
+  EXPECT_NEAR(1.0, aom_noise_strength_lut_eval(&lut, 1.25), kEps);
+  EXPECT_NEAR(1.0, aom_noise_strength_lut_eval(&lut, 1.75), kEps);
+
+  // Test interpolation between two non-zero y coords.
+  EXPECT_NEAR(1, aom_noise_strength_lut_eval(&lut, 2), kEps);
+  EXPECT_NEAR(251, aom_noise_strength_lut_eval(&lut, 26.5), kEps);
+  EXPECT_NEAR(751, aom_noise_strength_lut_eval(&lut, 75.5), kEps);
+
+  // Test upper boundary
+  EXPECT_EQ(1001, aom_noise_strength_lut_eval(&lut, 100));
+  EXPECT_EQ(1001, aom_noise_strength_lut_eval(&lut, 101));
+
+  aom_noise_strength_lut_free(&lut);
+}
+
+TEST(NoiseModel, InitSuccessWithValidSquareShape) {
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 2, 8, 0 };
+  aom_noise_model_t model;
+
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  const int kNumCoords = 12;
+  const int kCoords[][2] = { { -2, -2 }, { -1, -2 }, { 0, -2 }, { 1, -2 },
+                             { 2, -2 },  { -2, -1 }, { -1, -1 }, { 0, -1 },
+                             { 1, -1 },  { 2, -1 },  { -2, 0 },  { -1, 0 } };
+  EXPECT_EQ(kNumCoords, model.n);
+  for (int i = 0; i < kNumCoords; ++i) {
+    const int *coord = kCoords[i];
+    EXPECT_EQ(coord[0], model.coords[i][0]);
+    EXPECT_EQ(coord[1], model.coords[i][1]);
+  }
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitSuccessWithValidDiamondShape) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_DIAMOND, 2, 8, 0 };
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+  EXPECT_EQ(6, model.n);
+  const int kNumCoords = 6;
+  const int kCoords[][2] = { { 0, -2 }, { -1, -1 }, { 0, -1 },
+                             { 1, -1 }, { -2, 0 },  { -1, 0 } };
+  EXPECT_EQ(kNumCoords, model.n);
+  for (int i = 0; i < kNumCoords; ++i) {
+    const int *coord = kCoords[i];
+    EXPECT_EQ(coord[0], model.coords[i][0]);
+    EXPECT_EQ(coord[1], model.coords[i][1]);
+  }
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitFailsWithTooLargeLag) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 10, 8, 0 };
+  EXPECT_FALSE(aom_noise_model_init(&model, params));
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitFailsWithTooSmallLag) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 0, 8, 0 };
+  EXPECT_FALSE(aom_noise_model_init(&model, params));
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModel, InitFailsWithInvalidShape) {
+  aom_noise_model_t model;
+  aom_noise_model_params_t params = { aom_noise_shape(100), 3, 8, 0 };
+  EXPECT_FALSE(aom_noise_model_init(&model, params));
+  aom_noise_model_free(&model);
+}
+
+// A container template class to hold a data type and extra arguments.
+// All of these args are bundled into one struct so that we can use
+// parameterized tests on combinations of supported data types
+// (uint8_t and uint16_t) and bit depths (8, 10, 12).
+template <typename T, int bit_depth, bool use_highbd>
+struct BitDepthParams {
+  typedef T data_type_t;
+  static const int kBitDepth = bit_depth;
+  static const bool kUseHighBD = use_highbd;
+};
+
+template <typename T>
+class FlatBlockEstimatorTest : public ::testing::Test, public T {
+ public:
+  virtual void SetUp() { random_.Reset(171); }
+  typedef std::vector<typename T::data_type_t> VecType;
+  VecType data_;
+  libaom_test::ACMRandom random_;
+};
+
+TYPED_TEST_SUITE_P(FlatBlockEstimatorTest);
+
+TYPED_TEST_P(FlatBlockEstimatorTest, ExtractBlock) {
+  const int kBlockSize = 16;
+  aom_flat_block_finder_t flat_block_finder;
+  ASSERT_EQ(1, aom_flat_block_finder_init(&flat_block_finder, kBlockSize,
+                                          this->kBitDepth, this->kUseHighBD));
+  const double normalization = flat_block_finder.normalization;
+
+  // Test with an image of more than one block.
+  const int h = 2 * kBlockSize;
+  const int w = 2 * kBlockSize;
+  const int stride = 2 * kBlockSize;
+  this->data_.resize(h * stride, 128);
+
+  // Set up the (0,0) block to be a plane and the (0,1) block to be a
+  // checkerboard
+  const int shift = this->kBitDepth - 8;
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize; ++x) {
+      this->data_[y * stride + x] = (-y + x + 128) << shift;
+      this->data_[y * stride + x + kBlockSize] =
+          ((x % 2 + y % 2) % 2 ? 128 - 20 : 128 + 20) << shift;
+    }
+  }
+  std::vector<double> block(kBlockSize * kBlockSize, 1);
+  std::vector<double> plane(kBlockSize * kBlockSize, 1);
+
+  // The block data should be a constant (zero) and the rest of the plane
+  // trend is covered in the plane data.
+  aom_flat_block_finder_extract_block(&flat_block_finder,
+                                      (uint8_t *)&this->data_[0], w, h, stride,
+                                      0, 0, &plane[0], &block[0]);
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize; ++x) {
+      EXPECT_NEAR(0, block[y * kBlockSize + x], 1e-5);
+      EXPECT_NEAR((double)(this->data_[y * stride + x]) / normalization,
+                  plane[y * kBlockSize + x], 1e-5);
+    }
+  }
+
+  // The plane trend is a constant, and the block is a zero mean checkerboard.
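+  // (The +/-20 checkerboard written above has zero mean, so the fitted plane
+  // should reduce to the DC value mid = 128 << shift while the residual block
+  // keeps the full checkerboard amplitude.)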
+  aom_flat_block_finder_extract_block(&flat_block_finder,
+                                      (uint8_t *)&this->data_[0], w, h, stride,
+                                      kBlockSize, 0, &plane[0], &block[0]);
+  const int mid = 128 << shift;
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize; ++x) {
+      EXPECT_NEAR(((double)this->data_[y * stride + x + kBlockSize] - mid) /
+                      normalization,
+                  block[y * kBlockSize + x], 1e-5);
+      EXPECT_NEAR(mid / normalization, plane[y * kBlockSize + x], 1e-5);
+    }
+  }
+  aom_flat_block_finder_free(&flat_block_finder);
+}
+
+TYPED_TEST_P(FlatBlockEstimatorTest, FindFlatBlocks) {
+  const int kBlockSize = 32;
+  aom_flat_block_finder_t flat_block_finder;
+  ASSERT_EQ(1, aom_flat_block_finder_init(&flat_block_finder, kBlockSize,
+                                          this->kBitDepth, this->kUseHighBD));
+
+  const int num_blocks_w = 8;
+  const int h = kBlockSize;
+  const int w = kBlockSize * num_blocks_w;
+  const int stride = w;
+  this->data_.resize(h * stride, 128);
+  std::vector<uint8_t> flat_blocks(num_blocks_w, 0);
+
+  const int shift = this->kBitDepth - 8;
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize; ++x) {
+      // Block 0 (not flat): constant doesn't have enough variance to qualify
+      this->data_[y * stride + x + 0 * kBlockSize] = 128 << shift;
+
+      // Block 1 (not flat): too high a variance is hard to validate as flat
+      this->data_[y * stride + x + 1 * kBlockSize] =
+          ((uint8_t)(128 + randn(&this->random_, 5))) << shift;
+
+      // Block 2 (flat): slight checkerboard added to constant
+      const int check = (x % 2 + y % 2) % 2 ? -2 : 2;
+      this->data_[y * stride + x + 2 * kBlockSize] = (128 + check) << shift;
+
+      // Block 3 (flat): planar block with checkerboard pattern is also flat
+      this->data_[y * stride + x + 3 * kBlockSize] =
+          (y * 2 - x / 2 + 128 + check) << shift;
+
+      // Block 4 (flat): gaussian random with standard deviation 1.
+      this->data_[y * stride + x + 4 * kBlockSize] =
+          ((uint8_t)(randn(&this->random_, 1) + x + 128.0)) << shift;
+
+      // Block 5 (flat): gaussian random with standard deviation 2.
+      this->data_[y * stride + x + 5 * kBlockSize] =
+          ((uint8_t)(randn(&this->random_, 2) + y + 128.0)) << shift;
+
+      // Block 6 (not flat): too high a directional gradient.
+      const int strong_edge = x > kBlockSize / 2 ? 64 : 0;
+      this->data_[y * stride + x + 6 * kBlockSize] =
+          ((uint8_t)(randn(&this->random_, 1) + strong_edge + 128.0)) << shift;
+
+      // Block 7 (not flat): too high gradient.
+      const int big_check = ((x >> 2) % 2 + (y >> 2) % 2) % 2 ? -16 : 16;
+      this->data_[y * stride + x + 7 * kBlockSize] =
+          ((uint8_t)(randn(&this->random_, 1) + big_check + 128.0)) << shift;
+    }
+  }
+
+  EXPECT_EQ(4, aom_flat_block_finder_run(&flat_block_finder,
+                                         (uint8_t *)&this->data_[0], w, h,
+                                         stride, &flat_blocks[0]));
+
+  // First two blocks are not flat
+  EXPECT_EQ(0, flat_blocks[0]);
+  EXPECT_EQ(0, flat_blocks[1]);
+
+  // Next 4 blocks are flat.
+  EXPECT_EQ(255, flat_blocks[2]);
+  EXPECT_EQ(255, flat_blocks[3]);
+  EXPECT_EQ(255, flat_blocks[4]);
+  EXPECT_EQ(255, flat_blocks[5]);
+
+  // Last 2 are not flat by threshold
+  EXPECT_EQ(0, flat_blocks[6]);
+  EXPECT_EQ(0, flat_blocks[7]);
+
+  // Add the noise from non-flat block 1 to every block.
+  for (int y = 0; y < kBlockSize; ++y) {
+    for (int x = 0; x < kBlockSize * num_blocks_w; ++x) {
+      this->data_[y * stride + x] +=
+          (this->data_[y * stride + x % kBlockSize + kBlockSize] -
+           (128 << shift));
+    }
+  }
+  // Now the scored selection will pick the one that is most likely flat (block
+  // 0)
+  EXPECT_EQ(1, aom_flat_block_finder_run(&flat_block_finder,
+                                         (uint8_t *)&this->data_[0], w, h,
+                                         stride, &flat_blocks[0]));
+  EXPECT_EQ(1, flat_blocks[0]);
+  EXPECT_EQ(0, flat_blocks[1]);
+  EXPECT_EQ(0, flat_blocks[2]);
+  EXPECT_EQ(0, flat_blocks[3]);
+  EXPECT_EQ(0, flat_blocks[4]);
+  EXPECT_EQ(0, flat_blocks[5]);
+  EXPECT_EQ(0, flat_blocks[6]);
+  EXPECT_EQ(0, flat_blocks[7]);
+
+  aom_flat_block_finder_free(&flat_block_finder);
+}
+
+REGISTER_TYPED_TEST_SUITE_P(FlatBlockEstimatorTest, ExtractBlock,
+                            FindFlatBlocks);
+
+typedef ::testing::Types<BitDepthParams<uint8_t, 8, false>,   // lowbd
+                         BitDepthParams<uint16_t, 8, true>,   // lowbd in 16-bit
+                         BitDepthParams<uint16_t, 10, true>,  // highbd data
+                         BitDepthParams<uint16_t, 12, true> >
+    AllBitDepthParams;
+INSTANTIATE_TYPED_TEST_SUITE_P(FlatBlockInstatiation, FlatBlockEstimatorTest,
+                               AllBitDepthParams);
+
+template <typename T>
+class NoiseModelUpdateTest : public ::testing::Test, public T {
+ public:
+  static const int kWidth = 128;
+  static const int kHeight = 128;
+  static const int kBlockSize = 16;
+  static const int kNumBlocksX = kWidth / kBlockSize;
+  static const int kNumBlocksY = kHeight / kBlockSize;
+
+  virtual void SetUp() {
+    const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+                                              T::kBitDepth, T::kUseHighBD };
+    ASSERT_TRUE(aom_noise_model_init(&model_, params));
+
+    random_.Reset(100171);
+
+    data_.resize(kWidth * kHeight * 3);
+    denoised_.resize(kWidth * kHeight * 3);
+    noise_.resize(kWidth * kHeight * 3);
+    renoise_.resize(kWidth * kHeight);
+    flat_blocks_.resize(kNumBlocksX * kNumBlocksY);
+
+    for (int c = 0, offset = 0; c < 3; ++c, offset += kWidth * kHeight) {
+      data_ptr_[c] = &data_[offset];
+      noise_ptr_[c] = &noise_[offset];
+      denoised_ptr_[c] = &denoised_[offset];
+      strides_[c] = kWidth;
+
+      data_ptr_raw_[c] = (uint8_t *)&data_[offset];
+      denoised_ptr_raw_[c] = (uint8_t *)&denoised_[offset];
+    }
+    chroma_sub_[0] = 0;
+    chroma_sub_[1] = 0;
+  }
+
+  int NoiseModelUpdate(int block_size = kBlockSize) {
+    return aom_noise_model_update(&model_, data_ptr_raw_, denoised_ptr_raw_,
+                                  kWidth, kHeight, strides_, chroma_sub_,
+                                  &flat_blocks_[0], block_size);
+  }
+
+  void TearDown() { aom_noise_model_free(&model_); }
+
+ protected:
+  aom_noise_model_t model_;
+  std::vector<typename T::data_type_t> data_;
+  std::vector<typename T::data_type_t> denoised_;
+
+  std::vector<double> noise_;
+  std::vector<double> renoise_;
+  std::vector<uint8_t> flat_blocks_;
+
+  typename T::data_type_t *data_ptr_[3];
+  typename T::data_type_t *denoised_ptr_[3];
+
+  double *noise_ptr_[3];
+  int strides_[3];
+  int chroma_sub_[2];
+  libaom_test::ACMRandom random_;
+
+ private:
+  uint8_t *data_ptr_raw_[3];
+  uint8_t *denoised_ptr_raw_[3];
+};
+
+TYPED_TEST_SUITE_P(NoiseModelUpdateTest);
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks) {
+  EXPECT_EQ(AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
+            this->NoiseModelUpdate());
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForZeroNoiseAllFlat) {
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  this->denoised_.assign(this->denoised_.size(), 128);
+  this->data_.assign(this->denoised_.size(), 128);
+  EXPECT_EQ(AOM_NOISE_STATUS_INTERNAL_ERROR, this->NoiseModelUpdate());
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateFailsBlockSizeTooSmall) {
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  this->denoised_.assign(this->denoised_.size(), 128);
+  this->data_.assign(this->denoised_.size(), 128);
+  EXPECT_EQ(AOM_NOISE_STATUS_INVALID_ARGUMENT,
+            this->NoiseModelUpdate(6 /* block_size=6 is too small */));
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForWhiteRandomNoise) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+
+  const int shift = this->kBitDepth - 8;
+  for (int y = 0; y < kHeight; ++y) {
+    for (int x = 0; x < kWidth; ++x) {
+      this->data_ptr_[0][y * kWidth + x] =
+          int(64 + y + randn(&this->random_, 1)) << shift;
+      this->denoised_ptr_[0][y * kWidth + x] = (64 + y) << shift;
+      // Make the chroma planes completely correlated with the Y plane
+      for (int c = 1; c < 3; ++c) {
+        this->data_ptr_[c][y * kWidth + x] =
+            this->data_ptr_[0][y * kWidth + x];
+        this->denoised_ptr_[c][y * kWidth + x] =
+            this->denoised_ptr_[0][y * kWidth + x];
+      }
+    }
+  }
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  const double kCoeffEps = 0.075;
+  const int n = model.n;
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < n; ++i) {
+      EXPECT_NEAR(0, model.latest_state[c].eqns.x[i], kCoeffEps);
+      EXPECT_NEAR(0, model.combined_state[c].eqns.x[i], kCoeffEps);
+    }
+    // The second and third channels are highly correlated with the first.
+    if (c > 0) {
+      ASSERT_EQ(n + 1, model.latest_state[c].eqns.n);
+      ASSERT_EQ(n + 1, model.combined_state[c].eqns.n);
+
+      EXPECT_NEAR(1, model.latest_state[c].eqns.x[n], kCoeffEps);
+      EXPECT_NEAR(1, model.combined_state[c].eqns.x[n], kCoeffEps);
+    }
+  }
+
+  // The fitted noise strength should be close to the standard deviation
+  // for all intensity bins.
+  const double kStdEps = 0.1;
+  const double normalize = 1 << shift;
+
+  for (int i = 0; i < model.latest_state[0].strength_solver.eqns.n; ++i) {
+    EXPECT_NEAR(1.0,
+                model.latest_state[0].strength_solver.eqns.x[i] / normalize,
+                kStdEps);
+    EXPECT_NEAR(1.0,
+                model.combined_state[0].strength_solver.eqns.x[i] / normalize,
+                kStdEps);
+  }
+
+  aom_noise_strength_lut_t lut;
+  aom_noise_strength_solver_fit_piecewise(
+      &model.latest_state[0].strength_solver, -1, &lut);
+  ASSERT_EQ(2, lut.num_points);
+  EXPECT_NEAR(0.0, lut.points[0][0], 1e-5);
+  EXPECT_NEAR(1.0, lut.points[0][1] / normalize, kStdEps);
+  EXPECT_NEAR((1 << this->kBitDepth) - 1, lut.points[1][0], 1e-5);
+  EXPECT_NEAR(1.0, lut.points[1][1] / normalize, kStdEps);
+  aom_noise_strength_lut_free(&lut);
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForScaledWhiteNoise) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+
+  const double kCoeffEps = 0.055;
+  const double kLowStd = 1;
+  const double kHighStd = 4;
+  const int shift = this->kBitDepth - 8;
+  for (int y = 0; y < kHeight; ++y) {
+    for (int x = 0; x < kWidth; ++x) {
+      for (int c = 0; c < 3; ++c) {
+        // The image data is bimodal:
+        // Bottom half has low intensity and low noise strength
+        // Top half has high intensity and high noise strength
+        const int avg = (y < kHeight / 2) ? 4 : 245;
+        const double std = (y < kHeight / 2) ? kLowStd : kHighStd;
+        this->data_ptr_[c][y * kWidth + x] =
+            ((uint8_t)std::min((int)255,
+                               (int)(2 + avg + randn(&this->random_, std))))
+            << shift;
+        this->denoised_ptr_[c][y * kWidth + x] = (2 + avg) << shift;
+      }
+    }
+  }
+  // Label all blocks as flat for the update
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  const int n = model.n;
+  // The noise is uncorrelated spatially and with the y channel.
+  // All coefficients should be reasonably close to zero.
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < n; ++i) {
+      EXPECT_NEAR(0, model.latest_state[c].eqns.x[i], kCoeffEps);
+      EXPECT_NEAR(0, model.combined_state[c].eqns.x[i], kCoeffEps);
+    }
+    if (c > 0) {
+      ASSERT_EQ(n + 1, model.latest_state[c].eqns.n);
+      ASSERT_EQ(n + 1, model.combined_state[c].eqns.n);
+
+      // The correlation to the y channel should be low (near zero)
+      EXPECT_NEAR(0, model.latest_state[c].eqns.x[n], kCoeffEps);
+      EXPECT_NEAR(0, model.combined_state[c].eqns.x[n], kCoeffEps);
+    }
+  }
+
+  // Noise strength should vary between kLowStd and kHighStd.
+  const double kStdEps = 0.15;
+  // We have to normalize fitted standard deviation based on bit depth.
+  const double normalize = (1 << shift);
+
+  ASSERT_EQ(20, model.latest_state[0].strength_solver.eqns.n);
+  for (int i = 0; i < model.latest_state[0].strength_solver.eqns.n; ++i) {
+    const double a = i / 19.0;
+    const double expected = (kLowStd * (1.0 - a) + kHighStd * a);
+    EXPECT_NEAR(expected,
+                model.latest_state[0].strength_solver.eqns.x[i] / normalize,
+                kStdEps);
+    EXPECT_NEAR(expected,
+                model.combined_state[0].strength_solver.eqns.x[i] / normalize,
+                kStdEps);
+  }
+
+  // If we fit a piecewise linear model, there should be two points:
+  // one near kLowStd at 0, and the other near kHighStd at 255.
+  aom_noise_strength_lut_t lut;
+  aom_noise_strength_solver_fit_piecewise(
+      &model.latest_state[0].strength_solver, 2, &lut);
+  ASSERT_EQ(2, lut.num_points);
+  EXPECT_NEAR(0, lut.points[0][0], 1e-4);
+  EXPECT_NEAR(kLowStd, lut.points[0][1] / normalize, kStdEps);
+  EXPECT_NEAR((1 << this->kBitDepth) - 1, lut.points[1][0], 1e-5);
+  EXPECT_NEAR(kHighStd, lut.points[1][1] / normalize, kStdEps);
+  aom_noise_strength_lut_free(&lut);
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, UpdateSuccessForCorrelatedNoise) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const int kNumCoeffs = 24;
+  const double kStd = 4;
+  const double kStdEps = 0.3;
+  const double kCoeffEps = 0.065;
+  // Use different coefficients for each channel
+  const double kCoeffs[3][24] = {
+    { 0.02884, -0.03356, 0.00633,  0.01757,  0.02849,  -0.04620,
+      0.02833, -0.07178, 0.07076,  -0.11603, -0.10413, -0.16571,
+      0.05158, -0.07969, 0.02640,  -0.07191, 0.02530,  0.41968,
+      0.21450, -0.00702, -0.01401, -0.03676, -0.08713, 0.44196 },
+    { 0.00269, -0.01291, -0.01513, 0.07234,  0.03208,   0.00477,
+      0.00226, -0.00254, 0.03533,  0.12841,  -0.25970,  -0.06336,
+      0.05238, -0.00845, -0.03118, 0.09043,  -0.36558,  0.48903,
+      0.00595, -0.11938, 0.02106,  0.095956, -0.350139, 0.59305 },
+    { -0.00643, -0.01080, -0.01466, 0.06951, 0.03707,  -0.00482,
+      0.00817,  -0.00909, 0.02949,  0.12181, -0.25210, -0.07886,
+      0.06083,  -0.01210, -0.03108, 0.08944, -0.35875, 0.49150,
+      0.00415,  -0.12905, 0.02870,  0.09740, -0.34610, 0.58824 },
+  };
+
+  ASSERT_EQ(model.n, kNumCoeffs);
+  this->chroma_sub_[0] = this->chroma_sub_[1] = 1;
+
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+
+  // Add different noise onto each plane
+  const int shift = this->kBitDepth - 8;
+  for (int c = 0; c < 3; ++c) {
+    noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+                kCoeffs[c], this->noise_ptr_[c], kWidth, kHeight);
+    const int x_shift = c > 0 ? this->chroma_sub_[0] : 0;
+    const int y_shift = c > 0 ? this->chroma_sub_[1] : 0;
+    for (int y = 0; y < (kHeight >> y_shift); ++y) {
+      for (int x = 0; x < (kWidth >> x_shift); ++x) {
+        const uint8_t value = 64 + x / 2 + y / 4;
+        this->data_ptr_[c][y * kWidth + x] =
+            (uint8_t(value + this->noise_ptr_[c][y * kWidth + x] * kStd))
+            << shift;
+        this->denoised_ptr_[c][y * kWidth + x] = value << shift;
+      }
+    }
+  }
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  // For the Y plane, the solved coefficients should be close to the original
+  const int n = model.n;
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < n; ++i) {
+      EXPECT_NEAR(kCoeffs[c][i], model.latest_state[c].eqns.x[i], kCoeffEps);
+      EXPECT_NEAR(kCoeffs[c][i], model.combined_state[c].eqns.x[i], kCoeffEps);
+    }
+    // The chroma planes should be uncorrelated with the luma plane
+    if (c > 0) {
+      EXPECT_NEAR(0, model.latest_state[c].eqns.x[n], kCoeffEps);
+      EXPECT_NEAR(0, model.combined_state[c].eqns.x[n], kCoeffEps);
+    }
+    // Correlation between the coefficient vector and the fitted coefficients
+    // should be close to 1.
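+    // (Normalized cross correlation here is <a, b> / (||a|| * ||b||), so a
+    // value near 1 means the fitted vector matches the ground-truth direction
+    // up to a positive scale factor.)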
+    EXPECT_LT(0.98, aom_normalized_cross_correlation(
+                        model.latest_state[c].eqns.x, kCoeffs[c], kNumCoeffs));
+
+    noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+                model.latest_state[c].eqns.x, &this->renoise_[0], kWidth,
+                kHeight);
+
+    EXPECT_TRUE(aom_noise_data_validate(&this->renoise_[0], kWidth, kHeight));
+  }
+
+  // Check fitted noise strength
+  const double normalize = 1 << shift;
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < model.latest_state[c].strength_solver.eqns.n; ++i) {
+      EXPECT_NEAR(kStd,
+                  model.latest_state[c].strength_solver.eqns.x[i] / normalize,
+                  kStdEps);
+    }
+  }
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest,
+             NoiseStrengthChangeSignalsDifferentNoiseType) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const int kBlockSize = this->kBlockSize;
+  // Create a gradient image with std = 2 uncorrelated noise
+  const double kStd = 2;
+  const int shift = this->kBitDepth - 8;
+
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    const uint8_t val = (i % kWidth) < kWidth / 2 ? 64 : 192;
+    for (int c = 0; c < 3; ++c) {
+      this->noise_ptr_[c][i] = randn(&this->random_, 1);
+      this->data_ptr_[c][i] = ((uint8_t)(this->noise_ptr_[c][i] * kStd + val))
+                              << shift;
+      this->denoised_ptr_[c][i] = val << shift;
+    }
+  }
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  const int kNumBlocks = kWidth * kHeight / kBlockSize / kBlockSize;
+  EXPECT_EQ(kNumBlocks, model.latest_state[0].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.latest_state[1].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.latest_state[2].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.combined_state[0].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.combined_state[1].strength_solver.num_equations);
+  EXPECT_EQ(kNumBlocks, model.combined_state[2].strength_solver.num_equations);
+
+  // Bump up noise by an insignificant amount
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    const uint8_t val = (i % kWidth) < kWidth / 2 ? 64 : 192;
+    this->data_ptr_[0][i] =
+        ((uint8_t)(this->noise_ptr_[0][i] * (kStd + 0.085) + val)) << shift;
+  }
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  const double kARGainTolerance = 0.02;
+  for (int c = 0; c < 3; ++c) {
+    EXPECT_EQ(kNumBlocks, model.latest_state[c].strength_solver.num_equations);
+    EXPECT_EQ(15250, model.latest_state[c].num_observations);
+    EXPECT_NEAR(1, model.latest_state[c].ar_gain, kARGainTolerance);
+
+    EXPECT_EQ(2 * kNumBlocks,
+              model.combined_state[c].strength_solver.num_equations);
+    EXPECT_EQ(2 * 15250, model.combined_state[c].num_observations);
+    EXPECT_NEAR(1, model.combined_state[c].ar_gain, kARGainTolerance);
+  }
+
+  // Bump up the noise strength on half the image for one channel by a
+  // significant amount.
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    const uint8_t val = (i % kWidth) < kWidth / 2 ? 64 : 128;
+    if (i % kWidth < kWidth / 2) {
+      this->data_ptr_[0][i] =
+          ((uint8_t)(randn(&this->random_, kStd + 0.5) + val)) << shift;
+    }
+  }
+  EXPECT_EQ(AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, this->NoiseModelUpdate());
+
+  // Since we didn't update the combined state, it should still be at 2 *
+  // num_blocks
+  EXPECT_EQ(kNumBlocks, model.latest_state[0].strength_solver.num_equations);
+  EXPECT_EQ(2 * kNumBlocks,
+            model.combined_state[0].strength_solver.num_equations);
+
+  // In normal operation, the "latest" estimate can be saved to the "combined"
+  // state for continued updates.
+  aom_noise_model_save_latest(&model);
+  for (int c = 0; c < 3; ++c) {
+    EXPECT_EQ(kNumBlocks, model.latest_state[c].strength_solver.num_equations);
+    EXPECT_EQ(15250, model.latest_state[c].num_observations);
+    EXPECT_NEAR(1, model.latest_state[c].ar_gain, kARGainTolerance);
+
+    EXPECT_EQ(kNumBlocks,
+              model.combined_state[c].strength_solver.num_equations);
+    EXPECT_EQ(15250, model.combined_state[c].num_observations);
+    EXPECT_NEAR(1, model.combined_state[c].ar_gain, kARGainTolerance);
+  }
+}
+
+TYPED_TEST_P(NoiseModelUpdateTest, NoiseCoeffsSignalsDifferentNoiseType) {
+  aom_noise_model_t &model = this->model_;
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const double kCoeffs[2][24] = {
+    { 0.02884, -0.03356, 0.00633,  0.01757,  0.02849,  -0.04620,
+      0.02833, -0.07178, 0.07076,  -0.11603, -0.10413, -0.16571,
+      0.05158, -0.07969, 0.02640,  -0.07191, 0.02530,  0.41968,
+      0.21450, -0.00702, -0.01401, -0.03676, -0.08713, 0.44196 },
+    { 0.00269, -0.01291, -0.01513, 0.07234,  0.03208,   0.00477,
+      0.00226, -0.00254, 0.03533,  0.12841,  -0.25970,  -0.06336,
+      0.05238, -0.00845, -0.03118, 0.09043,  -0.36558,  0.48903,
+      0.00595, -0.11938, 0.02106,  0.095956, -0.350139, 0.59305 }
+  };
+
+  noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+              kCoeffs[0], this->noise_ptr_[0], kWidth, kHeight);
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    this->data_ptr_[0][i] = (uint8_t)(128 + this->noise_ptr_[0][i]);
+  }
+  this->flat_blocks_.assign(this->flat_blocks_.size(), 1);
+  EXPECT_EQ(AOM_NOISE_STATUS_OK, this->NoiseModelUpdate());
+
+  // Now try with the second set of AR coefficients
+  noise_synth(&this->random_, model.params.lag, model.n, model.coords,
+              kCoeffs[1], this->noise_ptr_[0], kWidth, kHeight);
+  for (int i = 0; i < kWidth * kHeight; ++i) {
+    this->data_ptr_[0][i] = (uint8_t)(128 + this->noise_ptr_[0][i]);
+  }
+  EXPECT_EQ(AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, this->NoiseModelUpdate());
+}
+
+REGISTER_TYPED_TEST_SUITE_P(NoiseModelUpdateTest, UpdateFailsNoFlatBlocks,
+                            UpdateSuccessForZeroNoiseAllFlat,
+                            UpdateFailsBlockSizeTooSmall,
+                            UpdateSuccessForWhiteRandomNoise,
+                            UpdateSuccessForScaledWhiteNoise,
+                            UpdateSuccessForCorrelatedNoise,
+                            NoiseStrengthChangeSignalsDifferentNoiseType,
+                            NoiseCoeffsSignalsDifferentNoiseType);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(NoiseModelUpdateTestInstatiation,
+                               NoiseModelUpdateTest, AllBitDepthParams);
+
+TEST(NoiseModelGetGrainParameters, TestLagSize) {
+  aom_film_grain_t film_grain;
+  for (int lag = 1; lag <= 3; ++lag) {
+    aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+    aom_noise_model_t model;
+    EXPECT_TRUE(aom_noise_model_init(&model, params));
+    EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+    EXPECT_EQ(lag, film_grain.ar_coeff_lag);
+    aom_noise_model_free(&model);
+  }
+
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 4, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+  EXPECT_FALSE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModelGetGrainParameters, TestARCoeffShiftBounds) {
+  struct TestCase {
+    double max_input_value;
+    int expected_ar_coeff_shift;
+    int expected_value;
+  };
+  const int lag = 1;
+  const int kNumTestCases = 19;
+  const TestCase test_cases[] = {
+    // Test cases for ar_coeff_shift = 9
+    { 0, 9, 0 },
+    { 0.125, 9, 64 },
+    { -0.125, 9, -64 },
+    { 0.2499, 9, 127 },
+    { -0.25, 9, -128 },
+    // Test cases for ar_coeff_shift = 8
+    { 0.25, 8, 64 },
+    { -0.2501, 8, -64 },
+    { 0.499, 8, 127 },
+    { -0.5, 8, -128 },
+    // Test cases for ar_coeff_shift = 7
+    { 0.5, 7, 64 },
+    { -0.5001, 7, -64 },
+    { 0.999, 7, 127 },
+    { -1, 7, -128 },
+    // Test cases for ar_coeff_shift = 6
+    { 1.0, 6, 64 },
+    { -1.0001, 6, -64 },
+    { 2.0, 6, 127 },
+    { -2.0, 6, -128 },
+    { 4, 6, 127 },
+    { -4, 6, -128 },
+  };
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  for (int i = 0; i < kNumTestCases; ++i) {
+    const TestCase &test_case = test_cases[i];
+    model.combined_state[0].eqns.x[0] = test_case.max_input_value;
+
+    aom_film_grain_t film_grain;
+    EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+    EXPECT_EQ(1, film_grain.ar_coeff_lag);
+    EXPECT_EQ(test_case.expected_ar_coeff_shift, film_grain.ar_coeff_shift);
+    EXPECT_EQ(test_case.expected_value, film_grain.ar_coeffs_y[0]);
+  }
+  aom_noise_model_free(&model);
+}
+
+TEST(NoiseModelGetGrainParameters, TestNoiseStrengthShiftBounds) {
+  struct TestCase {
+    double max_input_value;
+    int expected_scaling_shift;
+    int expected_value;
+  };
+  const int kNumTestCases = 10;
+  const TestCase test_cases[] = {
+    { 0, 11, 0 },      { 1, 11, 64 },     { 2, 11, 128 }, { 3.99, 11, 255 },
+    { 4, 10, 128 },    { 7.99, 10, 255 }, { 8, 9, 128 },  { 16, 8, 128 },
+    { 31.99, 8, 255 }, { 64, 8, 255 },  // clipped
+  };
+  const int lag = 1;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  for (int i = 0; i < kNumTestCases; ++i) {
+    const TestCase &test_case = test_cases[i];
+    aom_equation_system_t &eqns = model.combined_state[0].strength_solver.eqns;
+    // Set the fitted scale parameters to be a constant value.
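+    // A constant strength curve should collapse the piecewise fit to a single
+    // flat segment, which isolates the scaling_shift quantization behavior
+    // exercised by this test.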
+    for (int j = 0; j < eqns.n; ++j) {
+      eqns.x[j] = test_case.max_input_value;
+    }
+    aom_film_grain_t film_grain;
+    EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+    // We expect a single constant segment
+    EXPECT_EQ(test_case.expected_scaling_shift, film_grain.scaling_shift);
+    EXPECT_EQ(test_case.expected_value, film_grain.scaling_points_y[0][1]);
+    EXPECT_EQ(test_case.expected_value, film_grain.scaling_points_y[1][1]);
+  }
+  aom_noise_model_free(&model);
+}
+
+// The AR coefficients are the same inputs used to generate "Test 2" in the
+// test vectors
+TEST(NoiseModelGetGrainParameters, GetGrainParametersReal) {
+  const double kInputCoeffsY[] = { 0.0315,  0.0073,  0.0218,  0.00235, 0.00511,
+                                   -0.0222, 0.0627,  -0.022,  0.05575, -0.1816,
+                                   0.0107,  -0.1966, 0.00065, -0.0809, 0.04934,
+                                   -0.1349, -0.0352, 0.41772, 0.27973, 0.04207,
+                                   -0.0429, -0.1372, 0.06193, 0.52032 };
+  const double kInputCoeffsCB[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5 };
+  const double kInputCoeffsCR[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.5 };
+  const int kExpectedARCoeffsY[] = { 4,  1,   3,  0,   1,  -3,  8,  -3,
+                                     7,  -23, 1,  -25, 0,  -10, 6,  -17,
+                                     -5, 53,  36, 5,   -5, -18, 8,  67 };
+  const int kExpectedARCoeffsCB[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84 };
+  const int kExpectedARCoeffsCR[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -126 };
+  // Scaling function is initialized analytically with a sqrt function.
+  const int kNumScalingPointsY = 12;
+  const int kExpectedScalingPointsY[][2] = {
+    { 0, 0 },     { 13, 44 },   { 27, 62 },   { 40, 76 },
+    { 54, 88 },   { 67, 98 },   { 94, 117 },  { 121, 132 },
+    { 148, 146 }, { 174, 159 }, { 201, 171 }, { 255, 192 },
+  };
+
+  const int lag = 3;
+  aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, lag, 8, 0 };
+  aom_noise_model_t model;
+  EXPECT_TRUE(aom_noise_model_init(&model, params));
+
+  // Setup the AR coeffs
+  memcpy(model.combined_state[0].eqns.x, kInputCoeffsY, sizeof(kInputCoeffsY));
+  memcpy(model.combined_state[1].eqns.x, kInputCoeffsCB,
+         sizeof(kInputCoeffsCB));
+  memcpy(model.combined_state[2].eqns.x, kInputCoeffsCR,
+         sizeof(kInputCoeffsCR));
+  for (int i = 0; i < model.combined_state[0].strength_solver.num_bins; ++i) {
+    const double x =
+        ((double)i) / (model.combined_state[0].strength_solver.num_bins - 1.0);
+    model.combined_state[0].strength_solver.eqns.x[i] = 6 * sqrt(x);
+    model.combined_state[1].strength_solver.eqns.x[i] = 3;
+    model.combined_state[2].strength_solver.eqns.x[i] = 2;
+
+    // Inject some observations into the strength solver, as during film grain
+    // parameter extraction an estimate of the average strength will be used to
+    // adjust correlation.
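+    // (Filling row i of the A matrix with ones below simply marks every bin
+    // as observed; the particular weights are not what this test checks.)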
+    const int n = model.combined_state[0].strength_solver.num_bins;
+    for (int j = 0; j < model.combined_state[0].strength_solver.num_bins;
+         ++j) {
+      model.combined_state[0].strength_solver.eqns.A[i * n + j] = 1;
+      model.combined_state[1].strength_solver.eqns.A[i * n + j] = 1;
+      model.combined_state[2].strength_solver.eqns.A[i * n + j] = 1;
+    }
+  }
+
+  aom_film_grain_t film_grain;
+  EXPECT_TRUE(aom_noise_model_get_grain_parameters(&model, &film_grain));
+  EXPECT_EQ(lag, film_grain.ar_coeff_lag);
+  EXPECT_EQ(3, film_grain.ar_coeff_lag);
+  EXPECT_EQ(7, film_grain.ar_coeff_shift);
+  EXPECT_EQ(10, film_grain.scaling_shift);
+  EXPECT_EQ(kNumScalingPointsY, film_grain.num_y_points);
+  EXPECT_EQ(1, film_grain.update_parameters);
+  EXPECT_EQ(1, film_grain.apply_grain);
+
+  const int kNumARCoeffs = 24;
+  for (int i = 0; i < kNumARCoeffs; ++i) {
+    EXPECT_EQ(kExpectedARCoeffsY[i], film_grain.ar_coeffs_y[i]);
+  }
+  for (int i = 0; i < kNumARCoeffs + 1; ++i) {
+    EXPECT_EQ(kExpectedARCoeffsCB[i], film_grain.ar_coeffs_cb[i]);
+  }
+  for (int i = 0; i < kNumARCoeffs + 1; ++i) {
+    EXPECT_EQ(kExpectedARCoeffsCR[i], film_grain.ar_coeffs_cr[i]);
+  }
+  for (int i = 0; i < kNumScalingPointsY; ++i) {
+    EXPECT_EQ(kExpectedScalingPointsY[i][0], film_grain.scaling_points_y[i][0]);
+    EXPECT_EQ(kExpectedScalingPointsY[i][1], film_grain.scaling_points_y[i][1]);
+  }
+
+  // CB strength should just be a piecewise segment
+  EXPECT_EQ(2, film_grain.num_cb_points);
+  EXPECT_EQ(0, film_grain.scaling_points_cb[0][0]);
+  EXPECT_EQ(255, film_grain.scaling_points_cb[1][0]);
+  EXPECT_EQ(96, film_grain.scaling_points_cb[0][1]);
+  EXPECT_EQ(96, film_grain.scaling_points_cb[1][1]);
+
+  // CR strength should just be a piecewise segment
+  EXPECT_EQ(2, film_grain.num_cr_points);
+  EXPECT_EQ(0, film_grain.scaling_points_cr[0][0]);
+  EXPECT_EQ(255, film_grain.scaling_points_cr[1][0]);
+  EXPECT_EQ(64, film_grain.scaling_points_cr[0][1]);
+  EXPECT_EQ(64, film_grain.scaling_points_cr[1][1]);
+
+  EXPECT_EQ(128, film_grain.cb_mult);
+  EXPECT_EQ(192, film_grain.cb_luma_mult);
+  EXPECT_EQ(256, film_grain.cb_offset);
+  EXPECT_EQ(128, film_grain.cr_mult);
+  EXPECT_EQ(192, film_grain.cr_luma_mult);
+  EXPECT_EQ(256, film_grain.cr_offset);
+  EXPECT_EQ(0, film_grain.chroma_scaling_from_luma);
+  EXPECT_EQ(0, film_grain.grain_scale_shift);
+
+  aom_noise_model_free(&model);
+}
+
+template <typename T>
+class WienerDenoiseTest : public ::testing::Test, public T {
+ public:
+  static void SetUpTestCase() { aom_dsp_rtcd(); }
+
+ protected:
+  void SetUp() {
+    static const float kNoiseLevel = 5.f;
+    static const float kStd = 4.0;
+    static const double kMaxValue = (1 << T::kBitDepth) - 1;
+
+    chroma_sub_[0] = 1;
+    chroma_sub_[1] = 1;
+    stride_[0] = kWidth;
+    stride_[1] = kWidth / 2;
+    stride_[2] = kWidth / 2;
+    for (int k = 0; k < 3; ++k) {
+      data_[k].resize(kWidth * kHeight);
+      denoised_[k].resize(kWidth * kHeight);
+      noise_psd_[k].resize(kBlockSize * kBlockSize);
+    }
+
+    const double kCoeffsY[] = { 0.0406, -0.116, -0.078, -0.152, 0.0033, -0.093,
+                                0.048,  0.404,  0.2353, -0.035, -0.093, 0.441 };
+    const int kCoords[12][2] = {
+      { -2, -2 }, { -1, -2 }, { 0, -2 }, { 1, -2 }, { 2, -2 }, { -2, -1 },
+      { -1, -1 }, { 0, -1 },  { 1, -1 }, { 2, -1 }, { -2, 0 }, { -1, 0 }
+    };
+    const int kLag = 2;
+    const int kLength = 12;
+    libaom_test::ACMRandom random;
+    std::vector<double> noise(kWidth * kHeight);
+    noise_synth(&random, kLag, kLength, kCoords, kCoeffsY, &noise[0], kWidth,
+                kHeight);
+    noise_psd_[0] = get_noise_psd(&noise[0], kWidth, kHeight, kBlockSize);
+    for (int i = 0; i < kBlockSize * kBlockSize; ++i) {
+      noise_psd_[0][i] = (float)(noise_psd_[0][i] * kStd * kStd * kScaleNoise *
+                                 kScaleNoise / (kMaxValue * kMaxValue));
+    }
+
+    float psd_value =
+        aom_noise_psd_get_default_value(kBlockSizeChroma, kNoiseLevel);
+    for (int i = 0; i < kBlockSizeChroma * kBlockSizeChroma; ++i) {
+      noise_psd_[1][i] = psd_value;
+      noise_psd_[2][i] = psd_value;
+    }
+    for (int y = 0; y < kHeight; ++y) {
+      for (int x = 0; x < kWidth; ++x) {
+        data_[0][y * stride_[0] + x] = (typename T::data_type_t)fclamp(
+            (x + noise[y * stride_[0] + x] * kStd) * kScaleNoise, 0, kMaxValue);
+      }
+    }
+
+    for (int c = 1; c < 3; ++c) {
+      for (int y = 0; y < (kHeight >> 1); ++y) {
+        for (int x = 0; x < (kWidth >> 1); ++x) {
+          data_[c][y * stride_[c] + x] = (typename T::data_type_t)fclamp(
+              (x + randn(&random, kStd)) * kScaleNoise, 0, kMaxValue);
+        }
+      }
+    }
+    for (int k = 0; k < 3; ++k) {
+      noise_psd_ptrs_[k] = &noise_psd_[k][0];
+    }
+  }
+  static const int kBlockSize = 32;
+  static const int kBlockSizeChroma = 16;
+  static const int kWidth = 256;
+  static const int kHeight = 256;
+  static const int kScaleNoise = 1 << (T::kBitDepth - 8);
+
+  std::vector<typename T::data_type_t> data_[3];
+  std::vector<typename T::data_type_t> denoised_[3];
+  std::vector<float> noise_psd_[3];
+  int chroma_sub_[2];
+  float *noise_psd_ptrs_[3];
+  int stride_[3];
+};
+
+TYPED_TEST_SUITE_P(WienerDenoiseTest);
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidBlockSize) {
+  const uint8_t *const data_ptrs[3] = {
+    reinterpret_cast<const uint8_t *>(&this->data_[0][0]),
+    reinterpret_cast<const uint8_t *>(&this->data_[1][0]),
+    reinterpret_cast<const uint8_t *>(&this->data_[2][0]),
+  };
+  uint8_t *denoised_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+  };
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_,
+                                     this->chroma_sub_, this->noise_psd_ptrs_,
+                                     18, this->kBitDepth, this->kUseHighBD));
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_,
+                                     this->chroma_sub_, this->noise_psd_ptrs_,
+                                     48, this->kBitDepth, this->kUseHighBD));
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_,
+                                     this->chroma_sub_, this->noise_psd_ptrs_,
+                                     64, this->kBitDepth, this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, InvalidChromaSubsampling) {
+  const uint8_t *const data_ptrs[3] = {
+    reinterpret_cast<const uint8_t *>(&this->data_[0][0]),
+    reinterpret_cast<const uint8_t *>(&this->data_[1][0]),
+    reinterpret_cast<const uint8_t *>(&this->data_[2][0]),
+  };
+  uint8_t *denoised_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+  };
+  int chroma_sub[2] = { 1, 0 };
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_, chroma_sub,
+                                     this->noise_psd_ptrs_, 32, this->kBitDepth,
+                                     this->kUseHighBD));
+
+  chroma_sub[0] = 0;
+  chroma_sub[1] = 1;
+  EXPECT_EQ(0, aom_wiener_denoise_2d(data_ptrs, denoised_ptrs, this->kWidth,
+                                     this->kHeight, this->stride_, chroma_sub,
+                                     this->noise_psd_ptrs_, 32, this->kBitDepth,
+                                     this->kUseHighBD));
+}
+
+TYPED_TEST_P(WienerDenoiseTest, GradientTest) {
+  const int kWidth = this->kWidth;
+  const int kHeight = this->kHeight;
+  const int kBlockSize = this->kBlockSize;
+  const uint8_t *const data_ptrs[3] = {
+    reinterpret_cast<const uint8_t *>(&this->data_[0][0]),
+    reinterpret_cast<const uint8_t *>(&this->data_[1][0]),
+    reinterpret_cast<const uint8_t *>(&this->data_[2][0]),
+  };
+  uint8_t *denoised_ptrs[3] = {
+    reinterpret_cast<uint8_t *>(&this->denoised_[0][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[1][0]),
+    reinterpret_cast<uint8_t *>(&this->denoised_[2][0]),
+  };
+  const int ret = aom_wiener_denoise_2d(
+      data_ptrs, denoised_ptrs, kWidth, kHeight, this->stride_,
+      this->chroma_sub_, this->noise_psd_ptrs_, this->kBlockSize,
+      this->kBitDepth, this->kUseHighBD);
+  EXPECT_EQ(1, ret);
+
+  // Check the noise on the denoised image (from the analytical gradient)
+  // and make sure that it is less than what we added.
+  for (int c = 0; c < 3; ++c) {
+    std::vector<double> measured_noise(kWidth * kHeight);
+
+    double var = 0;
+    const int shift = (c > 0);
+    for (int x = 0; x < (kWidth >> shift); ++x) {
+      for (int y = 0; y < (kHeight >> shift); ++y) {
+        const double diff = this->denoised_[c][y * this->stride_[c] + x] -
+                            x * this->kScaleNoise;
+        var += diff * diff;
+        measured_noise[y * kWidth + x] = diff;
+      }
+    }
+    var /= (kWidth * kHeight);
+    const double std = sqrt(std::max(0.0, var));
+    EXPECT_LE(std, 1.25f * this->kScaleNoise);
+    if (c == 0) {
+      std::vector<float> measured_psd =
+          get_noise_psd(&measured_noise[0], kWidth, kHeight, kBlockSize);
+      std::vector<double> measured_psd_d(kBlockSize * kBlockSize);
+      std::vector<double> noise_psd_d(kBlockSize * kBlockSize);
+      std::copy(measured_psd.begin(), measured_psd.end(),
+                measured_psd_d.begin());
+      std::copy(this->noise_psd_[0].begin(), this->noise_psd_[0].end(),
+                noise_psd_d.begin());
+      EXPECT_LT(
+          aom_normalized_cross_correlation(&measured_psd_d[0], &noise_psd_d[0],
+                                           (int)(noise_psd_d.size())),
+          0.35);
+    }
+  }
+}
+
+REGISTER_TYPED_TEST_SUITE_P(WienerDenoiseTest, InvalidBlockSize,
+                            InvalidChromaSubsampling, GradientTest);
+
+INSTANTIATE_TYPED_TEST_SUITE_P(WienerDenoiseTestInstatiation, WienerDenoiseTest,
+                               AllBitDepthParams);
diff --git a/libs/libaom/src/test/obmc_sad_test.cc b/libs/libaom/src/test/obmc_sad_test.cc
new file mode 100644
index 000000000..6b4382cd7
--- /dev/null
+++ b/libs/libaom/src/test/obmc_sad_test.cc
@@ -0,0 +1,268 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+static const int kIterations = 1000;
+static const int kMaskMax = 64;
+
+typedef unsigned int (*ObmcSadF)(const uint8_t *pre, int pre_stride,
+                                 const int32_t *wsrc, const int32_t *mask);
+typedef libaom_test::FuncParam<ObmcSadF> TestFuncs;
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+class ObmcSadTest : public FunctionEquivalenceTest<ObmcSadF> {};
+
+TEST_P(ObmcSadTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = rng_.Rand8();
+      wsrc[i] = rng_.Rand8() * rng_(kMaskMax * kMaskMax + 1);
+      mask[i] = rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res =
+                                 params_.tst_func(pre, pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+TEST_P(ObmcSadTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = UINT8_MAX;
+      wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
+      mask[i] = kMaskMax * kMaskMax;
+    }
+
+    const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res =
+                                 params_.tst_func(pre, pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+#if HAVE_SSE4_1
+const ObmcSadTest::ParamType sse4_functions[] = {
+  TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_sse4_1),
+  TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_sse4_1),
+  TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_sse4_1),
+  TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_sse4_1),
+  TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_sse4_1),
+  TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_sse4_1),
+  TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_sse4_1),
+  TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_sse4_1),
+  TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_sse4_1),
+  TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_sse4_1),
+  TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_sse4_1),
+  TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_sse4_1),
+  TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_sse4_1),
+  TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_sse4_1),
+  TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_sse4_1),
+  TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_sse4_1),
+
+  TestFuncs(aom_obmc_sad64x16_c, aom_obmc_sad64x16_sse4_1),
+  TestFuncs(aom_obmc_sad16x64_c, aom_obmc_sad16x64_sse4_1),
+  TestFuncs(aom_obmc_sad32x8_c, aom_obmc_sad32x8_sse4_1),
+  TestFuncs(aom_obmc_sad8x32_c, aom_obmc_sad8x32_sse4_1),
+  TestFuncs(aom_obmc_sad16x4_c, aom_obmc_sad16x4_sse4_1),
+  TestFuncs(aom_obmc_sad4x16_c, aom_obmc_sad4x16_sse4_1),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcSadTest,
+                         ::testing::ValuesIn(sse4_functions));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+const ObmcSadTest::ParamType avx2_functions[] = {
+  TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_avx2),
+  TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_avx2),
+  TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_avx2),
+  TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_avx2),
+  TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_avx2),
+  TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_avx2),
+  TestFuncs(aom_obmc_sad32x32_c, aom_obmc_sad32x32_avx2),
+  TestFuncs(aom_obmc_sad32x16_c, aom_obmc_sad32x16_avx2),
+  TestFuncs(aom_obmc_sad16x32_c, aom_obmc_sad16x32_avx2),
+  TestFuncs(aom_obmc_sad16x16_c, aom_obmc_sad16x16_avx2),
+  TestFuncs(aom_obmc_sad16x8_c, aom_obmc_sad16x8_avx2),
+  TestFuncs(aom_obmc_sad8x16_c, aom_obmc_sad8x16_avx2),
+  TestFuncs(aom_obmc_sad8x8_c, aom_obmc_sad8x8_avx2),
+  TestFuncs(aom_obmc_sad8x4_c, aom_obmc_sad8x4_avx2),
+  TestFuncs(aom_obmc_sad4x8_c, aom_obmc_sad4x8_avx2),
+  TestFuncs(aom_obmc_sad4x4_c, aom_obmc_sad4x4_avx2),
+
+  TestFuncs(aom_obmc_sad64x16_c, aom_obmc_sad64x16_avx2),
+  TestFuncs(aom_obmc_sad16x64_c, aom_obmc_sad16x64_avx2),
+  TestFuncs(aom_obmc_sad32x8_c, aom_obmc_sad32x8_avx2),
+  TestFuncs(aom_obmc_sad8x32_c, aom_obmc_sad8x32_avx2),
+  TestFuncs(aom_obmc_sad16x4_c, aom_obmc_sad16x4_avx2),
+  TestFuncs(aom_obmc_sad4x16_c, aom_obmc_sad4x16_avx2),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, ObmcSadTest,
+                         ::testing::ValuesIn(avx2_functions));
+#endif  // HAVE_AVX2
+
+#if CONFIG_AV1_HIGHBITDEPTH
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+class ObmcSadHBDTest : public FunctionEquivalenceTest<ObmcSadF> {};
+
+TEST_P(ObmcSadHBDTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = rng_(1 << 12);
+      wsrc[i] = rng_(1 << 12) * rng_(kMaskMax * kMaskMax + 1);
+      mask[i] = rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    const unsigned int ref_res =
+        params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(
+        tst_res =
+            params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+TEST_P(ObmcSadHBDTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = (1 << 12) - 1;
+      wsrc[i] = ((1 << 12) - 1) * kMaskMax * kMaskMax;
+      mask[i] = kMaskMax * kMaskMax;
+    }
+
+    const unsigned int ref_res =
+        params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(
+        tst_res =
+            params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+#if HAVE_SSE4_1
+ObmcSadHBDTest::ParamType sse4_functions_hbd[] = {
+  TestFuncs(aom_highbd_obmc_sad128x128_c,
+            aom_highbd_obmc_sad128x128_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_sse4_1),
+
+  TestFuncs(aom_highbd_obmc_sad64x16_c, aom_highbd_obmc_sad64x16_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x64_c, aom_highbd_obmc_sad16x64_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad32x8_c, aom_highbd_obmc_sad32x8_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad8x32_c, aom_highbd_obmc_sad8x32_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad16x4_c, aom_highbd_obmc_sad16x4_sse4_1),
+  TestFuncs(aom_highbd_obmc_sad4x16_c, aom_highbd_obmc_sad4x16_sse4_1),
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcSadHBDTest,
+                         ::testing::ValuesIn(sse4_functions_hbd));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+ObmcSadHBDTest::ParamType avx2_functions_hbd[] = {
+  TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_avx2),
+  TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_avx2),
+  TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x32_c, aom_highbd_obmc_sad32x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x16_c, aom_highbd_obmc_sad32x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x32_c, aom_highbd_obmc_sad16x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x16_c, aom_highbd_obmc_sad16x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x8_c, aom_highbd_obmc_sad16x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x16_c, aom_highbd_obmc_sad8x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x8_c, aom_highbd_obmc_sad8x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x4_c, aom_highbd_obmc_sad8x4_avx2),
+  TestFuncs(aom_highbd_obmc_sad4x8_c, aom_highbd_obmc_sad4x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad4x4_c, aom_highbd_obmc_sad4x4_avx2),
+
+  TestFuncs(aom_highbd_obmc_sad64x16_c, aom_highbd_obmc_sad64x16_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x64_c, aom_highbd_obmc_sad16x64_avx2),
+  TestFuncs(aom_highbd_obmc_sad32x8_c, aom_highbd_obmc_sad32x8_avx2),
+  TestFuncs(aom_highbd_obmc_sad8x32_c, aom_highbd_obmc_sad8x32_avx2),
+  TestFuncs(aom_highbd_obmc_sad16x4_c, aom_highbd_obmc_sad16x4_avx2),
+  TestFuncs(aom_highbd_obmc_sad4x16_c, aom_highbd_obmc_sad4x16_avx2),
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, ObmcSadHBDTest,
+                         ::testing::ValuesIn(avx2_functions_hbd));
+#endif  // HAVE_AVX2
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+}  // namespace
diff --git a/libs/libaom/src/test/obmc_variance_test.cc b/libs/libaom/src/test/obmc_variance_test.cc
new file mode 100644
index 000000000..fc281d70b
--- /dev/null
+++ b/libs/libaom/src/test/obmc_variance_test.cc
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+static const int kIterations = 1000;
+static const int kMaskMax = 64;
+
+typedef unsigned int (*ObmcVarF)(const uint8_t *pre, int pre_stride,
+                                 const int32_t *wsrc, const int32_t *mask,
+                                 unsigned int *sse);
+typedef libaom_test::FuncParam<ObmcVarF> TestFuncs;
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+class ObmcVarianceTest : public FunctionEquivalenceTest<ObmcVarF> {};
+
+TEST_P(ObmcVarianceTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = this->rng_.Rand8();
+      wsrc[i] = this->rng_.Rand8() * this->rng_(kMaskMax * kMaskMax + 1);
+      mask[i] = this->rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    unsigned int ref_sse, tst_sse;
+    const unsigned int ref_res =
+        params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(
+        tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
+
+    ASSERT_EQ(ref_res, tst_res);
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
+TEST_P(ObmcVarianceTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = UINT8_MAX;
+      wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
+      mask[i] = kMaskMax * kMaskMax;
+    }
+
+    unsigned int ref_sse, tst_sse;
+    const unsigned int ref_res =
+        params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(
+        tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
+
+    ASSERT_EQ(ref_res, tst_res);
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
+TEST_P(ObmcVarianceTest, DISABLED_Speed) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
+
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) {
MAX_SB_SQUARE; ++i) { + pre[i] = this->rng_.Rand8(); + wsrc[i] = this->rng_.Rand8() * this->rng_(kMaskMax * kMaskMax + 1); + mask[i] = this->rng_(kMaskMax * kMaskMax + 1); + } + + const int num_loops = 1000000; + unsigned int ref_sse, tst_sse; + aom_usec_timer ref_timer, test_timer; + + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_loops; ++i) { + params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_loops; ++i) { + params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf("c_time=%d \t simd_time=%d \t gain=%d \n", elapsed_time_c, + elapsed_time_simd, (elapsed_time_c / elapsed_time_simd)); +} + +#if HAVE_SSE4_1 +const ObmcVarianceTest::ParamType sse4_functions[] = { + TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_sse4_1), + TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_sse4_1), + TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_sse4_1), + TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_sse4_1), + TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_sse4_1), + TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_sse4_1), + TestFuncs(aom_obmc_variance32x32_c, aom_obmc_variance32x32_sse4_1), + TestFuncs(aom_obmc_variance32x16_c, aom_obmc_variance32x16_sse4_1), + TestFuncs(aom_obmc_variance16x32_c, aom_obmc_variance16x32_sse4_1), + TestFuncs(aom_obmc_variance16x16_c, aom_obmc_variance16x16_sse4_1), + TestFuncs(aom_obmc_variance16x8_c, aom_obmc_variance16x8_sse4_1), + TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_sse4_1), + TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_sse4_1), + TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_sse4_1), + TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_sse4_1), + TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_sse4_1), + + TestFuncs(aom_obmc_variance64x16_c, aom_obmc_variance64x16_sse4_1), + TestFuncs(aom_obmc_variance16x64_c, aom_obmc_variance16x64_sse4_1), + TestFuncs(aom_obmc_variance32x8_c, aom_obmc_variance32x8_sse4_1), + TestFuncs(aom_obmc_variance8x32_c, aom_obmc_variance8x32_sse4_1), + TestFuncs(aom_obmc_variance16x4_c, aom_obmc_variance16x4_sse4_1), + TestFuncs(aom_obmc_variance4x16_c, aom_obmc_variance4x16_sse4_1), +}; + +INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcVarianceTest, + ::testing::ValuesIn(sse4_functions)); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +const ObmcVarianceTest::ParamType avx2_functions[] = { + TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_avx2), + TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_avx2), + TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_avx2), + TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_avx2), + TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_avx2), + TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_avx2), + TestFuncs(aom_obmc_variance32x32_c, aom_obmc_variance32x32_avx2), + TestFuncs(aom_obmc_variance32x16_c, aom_obmc_variance32x16_avx2), + TestFuncs(aom_obmc_variance16x32_c, aom_obmc_variance16x32_avx2), + TestFuncs(aom_obmc_variance16x16_c, aom_obmc_variance16x16_avx2), + TestFuncs(aom_obmc_variance16x8_c, aom_obmc_variance16x8_avx2), + TestFuncs(aom_obmc_variance8x16_c, aom_obmc_variance8x16_avx2), + 
TestFuncs(aom_obmc_variance8x8_c, aom_obmc_variance8x8_avx2), + TestFuncs(aom_obmc_variance8x4_c, aom_obmc_variance8x4_avx2), + TestFuncs(aom_obmc_variance4x8_c, aom_obmc_variance4x8_avx2), + TestFuncs(aom_obmc_variance4x4_c, aom_obmc_variance4x4_avx2), + + TestFuncs(aom_obmc_variance64x16_c, aom_obmc_variance64x16_avx2), + TestFuncs(aom_obmc_variance16x64_c, aom_obmc_variance16x64_avx2), + TestFuncs(aom_obmc_variance32x8_c, aom_obmc_variance32x8_avx2), + TestFuncs(aom_obmc_variance8x32_c, aom_obmc_variance8x32_avx2), + TestFuncs(aom_obmc_variance16x4_c, aom_obmc_variance16x4_avx2), + TestFuncs(aom_obmc_variance4x16_c, aom_obmc_variance4x16_avx2), +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, ObmcVarianceTest, + ::testing::ValuesIn(avx2_functions)); +#endif // HAVE_AVX2 + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +class ObmcVarianceHBDTest : public FunctionEquivalenceTest<ObmcVarF> {}; + +TEST_P(ObmcVarianceHBDTest, RandomValues) { + DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + const int pre_stride = this->rng_(MAX_SB_SIZE + 1); + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pre[i] = this->rng_(1 << params_.bit_depth); + wsrc[i] = this->rng_(1 << params_.bit_depth) * + this->rng_(kMaskMax * kMaskMax + 1); + mask[i] = this->rng_(kMaskMax * kMaskMax + 1); + } + + unsigned int ref_sse, tst_sse; + const unsigned int ref_res = params_.ref_func( + CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask, &ref_sse); + unsigned int tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre), + pre_stride, wsrc, mask, + &tst_sse)); + + ASSERT_EQ(ref_res, tst_res); + ASSERT_EQ(ref_sse, tst_sse); + } +} + +TEST_P(ObmcVarianceHBDTest, ExtremeValues) { + DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]); + + for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) { + const int pre_stride = iter; + + for (int i = 0; i < MAX_SB_SQUARE; ++i) { + pre[i] = (1 << params_.bit_depth) - 1; + wsrc[i] = ((1 << params_.bit_depth) - 1) * kMaskMax * kMaskMax; + mask[i] = kMaskMax * kMaskMax; + } + + unsigned int ref_sse, tst_sse; + const unsigned int ref_res = params_.ref_func( + CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask, &ref_sse); + unsigned int tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre), + pre_stride, wsrc, mask, + &tst_sse)); + + ASSERT_EQ(ref_res, tst_res); + ASSERT_EQ(ref_sse, tst_sse); + } +} + +#if HAVE_SSE4_1 +ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = { + TestFuncs(aom_highbd_obmc_variance128x128_c, + aom_highbd_obmc_variance128x128_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance128x64_c, + aom_highbd_obmc_variance128x64_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance64x128_c, + aom_highbd_obmc_variance64x128_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance64x64_c, + aom_highbd_obmc_variance64x64_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance64x32_c, + aom_highbd_obmc_variance64x32_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance32x64_c, + aom_highbd_obmc_variance32x64_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance32x32_c, + aom_highbd_obmc_variance32x32_sse4_1, 8), + 
TestFuncs(aom_highbd_obmc_variance32x16_c, + aom_highbd_obmc_variance32x16_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance16x32_c, + aom_highbd_obmc_variance16x32_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance16x16_c, + aom_highbd_obmc_variance16x16_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance16x8_c, aom_highbd_obmc_variance16x8_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance8x16_c, aom_highbd_obmc_variance8x16_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance8x8_c, aom_highbd_obmc_variance8x8_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance8x4_c, aom_highbd_obmc_variance8x4_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance4x8_c, aom_highbd_obmc_variance4x8_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance4x4_c, aom_highbd_obmc_variance4x4_sse4_1, + 8), + TestFuncs(aom_highbd_10_obmc_variance128x128_c, + aom_highbd_10_obmc_variance128x128_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance128x64_c, + aom_highbd_10_obmc_variance128x64_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance64x128_c, + aom_highbd_10_obmc_variance64x128_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance64x64_c, + aom_highbd_10_obmc_variance64x64_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance64x32_c, + aom_highbd_10_obmc_variance64x32_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance32x64_c, + aom_highbd_10_obmc_variance32x64_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance32x32_c, + aom_highbd_10_obmc_variance32x32_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance32x16_c, + aom_highbd_10_obmc_variance32x16_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance16x32_c, + aom_highbd_10_obmc_variance16x32_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance16x16_c, + aom_highbd_10_obmc_variance16x16_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance16x8_c, + aom_highbd_10_obmc_variance16x8_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance8x16_c, + aom_highbd_10_obmc_variance8x16_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance8x8_c, + aom_highbd_10_obmc_variance8x8_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance8x4_c, + aom_highbd_10_obmc_variance8x4_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance4x8_c, + aom_highbd_10_obmc_variance4x8_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance4x4_c, + aom_highbd_10_obmc_variance4x4_sse4_1, 10), + TestFuncs(aom_highbd_12_obmc_variance128x128_c, + aom_highbd_12_obmc_variance128x128_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance128x64_c, + aom_highbd_12_obmc_variance128x64_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance64x128_c, + aom_highbd_12_obmc_variance64x128_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance64x64_c, + aom_highbd_12_obmc_variance64x64_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance64x32_c, + aom_highbd_12_obmc_variance64x32_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance32x64_c, + aom_highbd_12_obmc_variance32x64_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance32x32_c, + aom_highbd_12_obmc_variance32x32_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance32x16_c, + aom_highbd_12_obmc_variance32x16_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance16x32_c, + aom_highbd_12_obmc_variance16x32_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance16x16_c, + aom_highbd_12_obmc_variance16x16_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance16x8_c, + aom_highbd_12_obmc_variance16x8_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance8x16_c, + aom_highbd_12_obmc_variance8x16_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance8x8_c, + aom_highbd_12_obmc_variance8x8_sse4_1, 12), + 
TestFuncs(aom_highbd_12_obmc_variance8x4_c, + aom_highbd_12_obmc_variance8x4_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance4x8_c, + aom_highbd_12_obmc_variance4x8_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance4x4_c, + aom_highbd_12_obmc_variance4x4_sse4_1, 12), + + TestFuncs(aom_highbd_obmc_variance64x16_c, + aom_highbd_obmc_variance64x16_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance16x64_c, + aom_highbd_obmc_variance16x64_sse4_1, 8), + TestFuncs(aom_highbd_obmc_variance32x8_c, aom_highbd_obmc_variance32x8_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance8x32_c, aom_highbd_obmc_variance8x32_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance16x4_c, aom_highbd_obmc_variance16x4_sse4_1, + 8), + TestFuncs(aom_highbd_obmc_variance4x16_c, aom_highbd_obmc_variance4x16_sse4_1, + 8), + TestFuncs(aom_highbd_10_obmc_variance64x16_c, + aom_highbd_10_obmc_variance64x16_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance16x64_c, + aom_highbd_10_obmc_variance16x64_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance32x8_c, + aom_highbd_10_obmc_variance32x8_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance8x32_c, + aom_highbd_10_obmc_variance8x32_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance16x4_c, + aom_highbd_10_obmc_variance16x4_sse4_1, 10), + TestFuncs(aom_highbd_10_obmc_variance4x16_c, + aom_highbd_10_obmc_variance4x16_sse4_1, 10), + TestFuncs(aom_highbd_12_obmc_variance64x16_c, + aom_highbd_12_obmc_variance64x16_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance16x64_c, + aom_highbd_12_obmc_variance16x64_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance32x8_c, + aom_highbd_12_obmc_variance32x8_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance8x32_c, + aom_highbd_12_obmc_variance8x32_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance16x4_c, + aom_highbd_12_obmc_variance16x4_sse4_1, 12), + TestFuncs(aom_highbd_12_obmc_variance4x16_c, + aom_highbd_12_obmc_variance4x16_sse4_1, 12), +}; + +INSTANTIATE_TEST_SUITE_P(SSE4_1, ObmcVarianceHBDTest, + ::testing::ValuesIn(sse4_functions_hbd)); +#endif // HAVE_SSE4_1 +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/pickrst_test.cc b/libs/libaom/src/test/pickrst_test.cc new file mode 100644 index 000000000..9a2c5bcd4 --- /dev/null +++ b/libs/libaom/src/test/pickrst_test.cc @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/register_state_check.h" +#include "test/acm_random.h" +#include "test/util.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/aom_timer.h" +#include "av1/encoder/pickrst.h" + +#define MAX_DATA_BLOCK 384 + +namespace pickrst_test_lowbd { +static const int kIterations = 100; + +typedef int64_t (*lowbd_pixel_proj_error_func)( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params); + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +typedef std::tuple<lowbd_pixel_proj_error_func> PixelProjErrorTestParam; + +class PixelProjErrorTest + : public ::testing::TestWithParam<PixelProjErrorTestParam> { + public: + virtual void SetUp() { + target_func_ = GET_PARAM(0); + src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*src_))); + ASSERT_NE(src_, nullptr); + dgd_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*dgd_))); + ASSERT_NE(dgd_, nullptr); + flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*flt0_))); + ASSERT_NE(flt0_, nullptr); + flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*flt1_))); + ASSERT_NE(flt1_, nullptr); + } + virtual void TearDown() { + aom_free(src_); + aom_free(dgd_); + aom_free(flt0_); + aom_free(flt1_); + } + void RunPixelProjErrorTest(int32_t run_times); + void RunPixelProjErrorTest_ExtremeValues(); + + private: + lowbd_pixel_proj_error_func target_func_; + libaom_test::ACMRandom rng_; + uint8_t *src_; + uint8_t *dgd_; + int32_t *flt0_; + int32_t *flt1_; +}; + +void PixelProjErrorTest::RunPixelProjErrorTest(int32_t run_times) { + int h_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1; + int v_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1; + const int dgd_stride = MAX_DATA_BLOCK; + const int src_stride = MAX_DATA_BLOCK; + const int flt0_stride = MAX_DATA_BLOCK; + const int flt1_stride = MAX_DATA_BLOCK; + sgr_params_type params; + int xq[2]; + const int iters = run_times == 1 ? kIterations : 4; + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + int64_t err_ref = 0, err_test = 1; + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_[i] = rng_.Rand8(); + src_[i] = rng_.Rand8(); + flt0_[i] = rng_.Rand15Signed(); + flt1_[i] = rng_.Rand15Signed(); + } + xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2); + params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2); + params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2); + params.s[1] = run_times == 1 ?
(rng_.Rand8() % MAX_RADIUS) : (iter / 2); + uint8_t *dgd = dgd_; + uint8_t *src = src_; + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + err_ref = av1_lowbd_pixel_proj_error_c(src, h_end, v_end, src_stride, dgd, + dgd_stride, flt0_, flt0_stride, + flt1_, flt1_stride, xq, &params); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + err_test = + target_func_(src, h_end, v_end, src_stride, dgd, dgd_stride, flt0_, + flt0_stride, flt1_, flt1_stride, xq, &params); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + if (run_times > 10) { + printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0], + params.r[1], h_end, v_end, time1, time2, time1 / time2); + } + ASSERT_EQ(err_ref, err_test); + } +} + +void PixelProjErrorTest::RunPixelProjErrorTest_ExtremeValues() { + const int h_start = 0; + int h_end = 192; + const int v_start = 0; + int v_end = 192; + const int dgd_stride = MAX_DATA_BLOCK; + const int src_stride = MAX_DATA_BLOCK; + const int flt0_stride = MAX_DATA_BLOCK; + const int flt1_stride = MAX_DATA_BLOCK; + sgr_params_type params; + int xq[2]; + const int iters = kIterations; + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + int64_t err_ref = 0, err_test = 1; + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_[i] = 0; + src_[i] = 255; + flt0_[i] = rng_.Rand15Signed(); + flt1_[i] = rng_.Rand15Signed(); + } + xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + params.r[0] = rng_.Rand8() % MAX_RADIUS; + params.r[1] = rng_.Rand8() % MAX_RADIUS; + params.s[0] = rng_.Rand8() % MAX_RADIUS; + params.s[1] = rng_.Rand8() % MAX_RADIUS; + uint8_t *dgd = dgd_; + uint8_t *src = src_; + + err_ref = av1_lowbd_pixel_proj_error_c( + src, h_end - h_start, v_end - v_start, src_stride, dgd, dgd_stride, + flt0_, flt0_stride, flt1_, flt1_stride, xq, &params); + + err_test = target_func_(src, h_end - h_start, v_end - v_start, src_stride, + dgd, dgd_stride, flt0_, flt0_stride, flt1_, + flt1_stride, xq, &params); + + ASSERT_EQ(err_ref, err_test); + } +} + +TEST_P(PixelProjErrorTest, RandomValues) { RunPixelProjErrorTest(1); } + +TEST_P(PixelProjErrorTest, ExtremeValues) { + RunPixelProjErrorTest_ExtremeValues(); +} + +TEST_P(PixelProjErrorTest, DISABLED_Speed) { RunPixelProjErrorTest(200000); } + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1, PixelProjErrorTest, + ::testing::Values(av1_lowbd_pixel_proj_error_sse4_1)); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 + +INSTANTIATE_TEST_SUITE_P(AVX2, PixelProjErrorTest, + ::testing::Values(av1_lowbd_pixel_proj_error_avx2)); +#endif // HAVE_AVX2 + +} // namespace pickrst_test_lowbd + +#if CONFIG_AV1_HIGHBITDEPTH +namespace pickrst_test_highbd { +static const int kIterations = 100; + +typedef int64_t (*highbd_pixel_proj_error_func)( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params); + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +typedef std::tuple<highbd_pixel_proj_error_func> PixelProjErrorTestParam; + +class PixelProjHighbdErrorTest + : public ::testing::TestWithParam<PixelProjErrorTestParam> { + public: + virtual void SetUp() {
+ target_func_ = GET_PARAM(0); + src_ = + (uint16_t *)aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_)); + ASSERT_NE(src_, nullptr); + dgd_ = + (uint16_t *)aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_)); + ASSERT_NE(dgd_, nullptr); + flt0_ = + (int32_t *)aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*flt0_)); + ASSERT_NE(flt0_, nullptr); + flt1_ = + (int32_t *)aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*flt1_)); + ASSERT_NE(flt1_, nullptr); + } + virtual void TearDown() { + aom_free(src_); + aom_free(dgd_); + aom_free(flt0_); + aom_free(flt1_); + } + void RunPixelProjErrorTest(int32_t run_times); + void RunPixelProjErrorTest_ExtremeValues(); + + private: + highbd_pixel_proj_error_func target_func_; + libaom_test::ACMRandom rng_; + uint16_t *src_; + uint16_t *dgd_; + int32_t *flt0_; + int32_t *flt1_; +}; + +void PixelProjHighbdErrorTest::RunPixelProjErrorTest(int32_t run_times) { + int h_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1; + int v_end = run_times != 1 ? 128 : (rng_.Rand16() % MAX_DATA_BLOCK) + 1; + const int dgd_stride = MAX_DATA_BLOCK; + const int src_stride = MAX_DATA_BLOCK; + const int flt0_stride = MAX_DATA_BLOCK; + const int flt1_stride = MAX_DATA_BLOCK; + sgr_params_type params; + int xq[2]; + const int iters = run_times == 1 ? kIterations : 4; + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + int64_t err_ref = 0, err_test = 1; + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_[i] = rng_.Rand16() % (1 << 12); + src_[i] = rng_.Rand16() % (1 << 12); + flt0_[i] = rng_.Rand15Signed(); + flt1_[i] = rng_.Rand15Signed(); + } + xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2); + params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter / 2); + params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2); + params.s[1] = run_times == 1 ? 
(rng_.Rand8() % MAX_RADIUS) : (iter / 2); + uint8_t *dgd8 = CONVERT_TO_BYTEPTR(dgd_); + uint8_t *src8 = CONVERT_TO_BYTEPTR(src_); + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + err_ref = av1_highbd_pixel_proj_error_c( + src8, h_end, v_end, src_stride, dgd8, dgd_stride, flt0_, flt0_stride, + flt1_, flt1_stride, xq, &params); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + err_test = + target_func_(src8, h_end, v_end, src_stride, dgd8, dgd_stride, flt0_, + flt0_stride, flt1_, flt1_stride, xq, &params); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + if (run_times > 10) { + printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0], + params.r[1], h_end, v_end, time1, time2, time1 / time2); + } + ASSERT_EQ(err_ref, err_test); + } +} + +void PixelProjHighbdErrorTest::RunPixelProjErrorTest_ExtremeValues() { + const int h_start = 0; + int h_end = 192; + const int v_start = 0; + int v_end = 192; + const int dgd_stride = MAX_DATA_BLOCK; + const int src_stride = MAX_DATA_BLOCK; + const int flt0_stride = MAX_DATA_BLOCK; + const int flt1_stride = MAX_DATA_BLOCK; + sgr_params_type params; + int xq[2]; + const int iters = kIterations; + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + int64_t err_ref = 0, err_test = 1; + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_[i] = 0; + src_[i] = (1 << 12) - 1; + flt0_[i] = rng_.Rand15Signed(); + flt1_[i] = rng_.Rand15Signed(); + } + xq[0] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + xq[1] = rng_.Rand8() % (1 << SGRPROJ_PRJ_BITS); + params.r[0] = rng_.Rand8() % MAX_RADIUS; + params.r[1] = rng_.Rand8() % MAX_RADIUS; + params.s[0] = rng_.Rand8() % MAX_RADIUS; + params.s[1] = rng_.Rand8() % MAX_RADIUS; + uint8_t *dgd8 = CONVERT_TO_BYTEPTR(dgd_); + uint8_t *src8 = CONVERT_TO_BYTEPTR(src_); + + err_ref = av1_highbd_pixel_proj_error_c( + src8, h_end - h_start, v_end - v_start, src_stride, dgd8, dgd_stride, + flt0_, flt0_stride, flt1_, flt1_stride, xq, &params); + + err_test = target_func_(src8, h_end - h_start, v_end - v_start, src_stride, + dgd8, dgd_stride, flt0_, flt0_stride, flt1_, + flt1_stride, xq, &params); + + ASSERT_EQ(err_ref, err_test); + } +} + +TEST_P(PixelProjHighbdErrorTest, RandomValues) { RunPixelProjErrorTest(1); } + +TEST_P(PixelProjHighbdErrorTest, ExtremeValues) { + RunPixelProjErrorTest_ExtremeValues(); +} + +TEST_P(PixelProjHighbdErrorTest, DISABLED_Speed) { + RunPixelProjErrorTest(200000); +} + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1, PixelProjHighbdErrorTest, + ::testing::Values(av1_highbd_pixel_proj_error_sse4_1)); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 + +INSTANTIATE_TEST_SUITE_P(AVX2, PixelProjHighbdErrorTest, + ::testing::Values(av1_highbd_pixel_proj_error_avx2)); +#endif // HAVE_AVX2 + +} // namespace pickrst_test_highbd + +//////////////////////////////////////////////////////////////////////////////// +// Get_proj_subspace_Test +//////////////////////////////////////////////////////////////////////////////// + +namespace get_proj_subspace_test_lowbd { +static const int kIterations = 100; + +typedef void (*set_get_proj_subspace)(const uint8_t *src8, int width, + int height, int src_stride, + const uint8_t *dat8, int dat_stride, + int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, + int64_t H[2][2], int64_t C[2], + const sgr_params_type *params); + 
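+// For orientation: the H and C checked below are the normal-equation terms
+// of the two-channel least-squares fit used by the self-guided restoration
+// search. A rough reference for the accumulation -- a sketch under
+// assumptions about the exact bit-depth shifts and final normalization,
+// not the library implementation:
+//
+//   for (int i = 0; i < height; ++i) {
+//     for (int j = 0; j < width; ++j) {
+//       const int32_t u = (int32_t)(dgd[i * dgd_stride + j] << SGRPROJ_RST_BITS);
+//       const int32_t s =
+//           (int32_t)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+//       const int32_t f1 = flt0[i * flt0_stride + j] - u;
+//       const int32_t f2 = flt1[i * flt1_stride + j] - u;
+//       H[0][0] += (int64_t)f1 * f1;
+//       H[1][1] += (int64_t)f2 * f2;
+//       H[0][1] += (int64_t)f1 * f2;
+//       C[0] += (int64_t)f1 * s;
+//       C[1] += (int64_t)f2 * s;
+//     }
+//   }
+//   H[1][0] = H[0][1];  // the system is symmetric
+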
+typedef std::tuple<set_get_proj_subspace> GetProjSubspaceTestParam; + +class GetProjSubspaceTest + : public ::testing::TestWithParam<GetProjSubspaceTestParam> { + public: + virtual void SetUp() { + target_func_ = GET_PARAM(0); + src_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*src_))); + ASSERT_NE(src_, nullptr); + dgd_ = (uint8_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*dgd_))); + ASSERT_NE(dgd_, nullptr); + flt0_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*flt0_))); + ASSERT_NE(flt0_, nullptr); + flt1_ = (int32_t *)(aom_malloc(MAX_DATA_BLOCK * MAX_DATA_BLOCK * + sizeof(*flt1_))); + ASSERT_NE(flt1_, nullptr); + } + virtual void TearDown() { + aom_free(src_); + aom_free(dgd_); + aom_free(flt0_); + aom_free(flt1_); + } + void RunGetProjSubspaceTest(int32_t run_times); + void RunGetProjSubspaceTest_ExtremeValues(); + + private: + set_get_proj_subspace target_func_; + libaom_test::ACMRandom rng_; + uint8_t *src_; + uint8_t *dgd_; + int32_t *flt0_; + int32_t *flt1_; +}; + +void GetProjSubspaceTest::RunGetProjSubspaceTest(int32_t run_times) { + int h_end = run_times != 1 + ? 128 + : ((rng_.Rand16() % MAX_DATA_BLOCK) & + 2147483640);  // We test for widths divisible by 8. + int v_end = + run_times != 1 ? 128 : ((rng_.Rand16() % MAX_DATA_BLOCK) & 2147483640); + const int dgd_stride = MAX_DATA_BLOCK; + const int src_stride = MAX_DATA_BLOCK; + const int flt0_stride = MAX_DATA_BLOCK; + const int flt1_stride = MAX_DATA_BLOCK; + sgr_params_type params; + const int iters = run_times == 1 ? kIterations : 4; + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + int64_t C_ref[2] = { 0 }, C_test[2] = { 0 }; + int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } }; + int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } }; + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_[i] = rng_.Rand8(); + src_[i] = rng_.Rand8(); + flt0_[i] = rng_.Rand15Signed(); + flt1_[i] = rng_.Rand15Signed(); + } + + params.r[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1; + params.r[1] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : 1; + params.s[0] = run_times == 1 ? (rng_.Rand8() % MAX_RADIUS) : (iter % 2); + params.s[1] = run_times == 1 ?
(rng_.Rand8() % MAX_RADIUS) : (iter / 2); + uint8_t *dgd = dgd_; + uint8_t *src = src_; + + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + av1_calc_proj_params_c(src, v_end, h_end, src_stride, dgd, dgd_stride, + flt0_, flt0_stride, flt1_, flt1_stride, H_ref, + C_ref, &params); + } + aom_usec_timer_mark(&timer); + const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + target_func_(src, v_end, h_end, src_stride, dgd, dgd_stride, flt0_, + flt0_stride, flt1_, flt1_stride, H_test, C_test, &params); + } + aom_usec_timer_mark(&timer); + const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer)); + if (run_times > 10) { + printf("r0 %d r1 %d %3dx%-3d:%7.2f/%7.2fns (%3.2f)\n", params.r[0], + params.r[1], h_end, v_end, time1, time2, time1 / time2); + } else { + ASSERT_EQ(H_ref[0][0], H_test[0][0]); + ASSERT_EQ(H_ref[0][1], H_test[0][1]); + ASSERT_EQ(H_ref[1][0], H_test[1][0]); + ASSERT_EQ(H_ref[1][1], H_test[1][1]); + ASSERT_EQ(C_ref[0], C_test[0]); + ASSERT_EQ(C_ref[1], C_test[1]); + } + } +} + +void GetProjSubspaceTest::RunGetProjSubspaceTest_ExtremeValues() { + const int h_start = 0; + int h_end = MAX_DATA_BLOCK; + const int v_start = 0; + int v_end = MAX_DATA_BLOCK; + const int dgd_stride = MAX_DATA_BLOCK; + const int src_stride = MAX_DATA_BLOCK; + const int flt0_stride = MAX_DATA_BLOCK; + const int flt1_stride = MAX_DATA_BLOCK; + sgr_params_type params; + const int iters = kIterations; + for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) { + int64_t C_ref[2] = { 0 }, C_test[2] = { 0 }; + int64_t H_ref[2][2] = { { 0, 0 }, { 0, 0 } }; + int64_t H_test[2][2] = { { 0, 0 }, { 0, 0 } }; + for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) { + dgd_[i] = 0; + src_[i] = 255; + flt0_[i] = rng_.Rand15Signed(); + flt1_[i] = rng_.Rand15Signed(); + } + params.r[0] = 1; + params.r[1] = 1; + params.s[0] = rng_.Rand8() % MAX_RADIUS; + params.s[1] = rng_.Rand8() % MAX_RADIUS; + uint8_t *dgd = dgd_; + uint8_t *src = src_; + + av1_calc_proj_params_c(src, h_end - h_start, v_end - v_start, src_stride, + dgd, dgd_stride, flt0_, flt0_stride, flt1_, + flt1_stride, H_ref, C_ref, &params); + + target_func_(src, h_end - h_start, v_end - v_start, src_stride, dgd, + dgd_stride, flt0_, flt0_stride, flt1_, flt1_stride, H_test, + C_test, &params); + + ASSERT_EQ(H_ref[0][0], H_test[0][0]); + ASSERT_EQ(H_ref[0][1], H_test[0][1]); + ASSERT_EQ(H_ref[1][0], H_test[1][0]); + ASSERT_EQ(H_ref[1][1], H_test[1][1]); + ASSERT_EQ(C_ref[0], C_test[0]); + ASSERT_EQ(C_ref[1], C_test[1]); + } +} + +TEST_P(GetProjSubspaceTest, RandomValues) { RunGetProjSubspaceTest(1); } + +TEST_P(GetProjSubspaceTest, ExtremeValues) { + RunGetProjSubspaceTest_ExtremeValues(); +} + +TEST_P(GetProjSubspaceTest, DISABLED_Speed) { RunGetProjSubspaceTest(200000); } + +#if HAVE_AVX2 + +INSTANTIATE_TEST_SUITE_P(AVX2, GetProjSubspaceTest, + ::testing::Values(av1_calc_proj_params_avx2)); +#endif // HAVE_AVX2 + +} // namespace get_proj_subspace_test_lowbd +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/test/qm_test.cc b/libs/libaom/src/test/qm_test.cc new file mode 100644 index 000000000..d1dfbb849 --- /dev/null +++ b/libs/libaom/src/test/qm_test.cc @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0.
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "config/aom_config.h" + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +class QMTest + : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, + public ::libaom_test::EncoderTest { + protected: + QMTest() : EncoderTest(GET_PARAM(0)) {} + virtual ~QMTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); + encoder->Control(AV1E_SET_ENABLE_QM, 1); + encoder->Control(AV1E_SET_QM_MIN, qm_min_); + encoder->Control(AV1E_SET_QM_MAX, qm_max_); + + encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100); + } + } + + void DoTest(int qm_min, int qm_max) { + qm_min_ = qm_min; + qm_max_ = qm_max; + cfg_.kf_max_dist = 12; + cfg_.rc_min_quantizer = 8; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 6; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_target_bitrate = 300; + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 15); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + int set_cpu_used_; + int qm_min_; + int qm_max_; +}; + +// encodes and decodes without a mismatch. +TEST_P(QMTest, TestNoMisMatchQM1) { DoTest(5, 9); } + +// encodes and decodes without a mismatch. +TEST_P(QMTest, TestNoMisMatchQM2) { DoTest(0, 8); } + +// encodes and decodes without a mismatch. +TEST_P(QMTest, TestNoMisMatchQM3) { DoTest(9, 15); } + +AV1_INSTANTIATE_TEST_CASE(QMTest, + ::testing::Values(::libaom_test::kRealTime, + ::libaom_test::kOnePassGood), + ::testing::Range(5, 9)); +} // namespace diff --git a/libs/libaom/src/test/quantize_func_test.cc b/libs/libaom/src/test/quantize_func_test.cc new file mode 100644 index 000000000..b40b38d5a --- /dev/null +++ b/libs/libaom/src/test/quantize_func_test.cc @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom_ports/aom_timer.h" +#include "av1/encoder/encoder.h" +#include "av1/common/scan.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +namespace { +using libaom_test::ACMRandom; + +#define QUAN_PARAM_LIST \ + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, \ + const int16_t *round_ptr, const int16_t *quant_ptr, \ + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, \ + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, \ + const int16_t *scan, const int16_t *iscan + +typedef void (*QuantizeFunc)(QUAN_PARAM_LIST); +typedef void (*QuantizeFuncHbd)(QUAN_PARAM_LIST, int log_scale); + +#define HBD_QUAN_FUNC \ + fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \ + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, log_scale) + +#define LBD_QUAN_FUNC \ + fn(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, \ + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan) + +template <QuantizeFuncHbd fn> +void highbd_quan16x16_wrapper(QUAN_PARAM_LIST) { + const int log_scale = 0; + HBD_QUAN_FUNC; +} + +template <QuantizeFuncHbd fn> +void highbd_quan32x32_wrapper(QUAN_PARAM_LIST) { + const int log_scale = 1; + HBD_QUAN_FUNC; +} + +template <QuantizeFuncHbd fn> +void highbd_quan64x64_wrapper(QUAN_PARAM_LIST) { + const int log_scale = 2; + HBD_QUAN_FUNC; +} + +enum QuantType { TYPE_B, TYPE_DC, TYPE_FP }; + +using std::tuple; +typedef tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, QuantType, aom_bit_depth_t> + QuantizeParam; + +typedef struct { + QUANTS quant; + Dequants dequant; +} QuanTable; + +const int kTestNum = 1000; + +class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> { + protected: + QuantizeTest() + : quant_ref_(GET_PARAM(0)), quant_(GET_PARAM(1)), tx_size_(GET_PARAM(2)), + type_(GET_PARAM(3)), bd_(GET_PARAM(4)) {} + + virtual ~QuantizeTest() {} + + virtual void SetUp() { + qtab_ = reinterpret_cast<QuanTable *>(aom_memalign(32, sizeof(*qtab_))); + const int n_coeffs = coeff_num(); + coeff_ = reinterpret_cast<tran_low_t *>( + aom_memalign(32, 6 * n_coeffs * sizeof(tran_low_t))); + InitQuantizer(); + } + + virtual void TearDown() { + aom_free(qtab_); + qtab_ = NULL; + aom_free(coeff_); + coeff_ = NULL; + libaom_test::ClearSystemState(); + } + + void InitQuantizer() { + av1_build_quantizer(bd_, 0, 0, 0, 0, 0, &qtab_->quant, &qtab_->dequant); + } + + void QuantizeRun(bool is_loop, int q = 0, int test_num = 1) { + tran_low_t *coeff_ptr = coeff_; + const intptr_t n_coeffs = coeff_num(); + + tran_low_t *qcoeff_ref = coeff_ptr + n_coeffs; + tran_low_t *dqcoeff_ref = qcoeff_ref + n_coeffs; + + tran_low_t *qcoeff = dqcoeff_ref + n_coeffs; + tran_low_t *dqcoeff = qcoeff + n_coeffs; + uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs); + + // Testing uses 2-D DCT scan order table + const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT); + + // Testing uses luminance quantization table + const int16_t *zbin = qtab_->quant.y_zbin[q]; + + const int16_t *round = 0; + const int16_t *quant = 0; + if (type_ == TYPE_B) { + round = qtab_->quant.y_round[q]; + quant = qtab_->quant.y_quant[q]; + } else if (type_ == TYPE_FP) { + round = qtab_->quant.y_round_fp[q]; + quant = qtab_->quant.y_quant_fp[q]; + } + + const int16_t *quant_shift = qtab_->quant.y_quant_shift[q]; + const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q]; + + for (int i = 0; i <
test_num; ++i) { + if (is_loop) FillCoeffRandom(); + + memset(qcoeff_ref, 0, 5 * n_coeffs * sizeof(*qcoeff_ref)); + + quant_ref_(coeff_ptr, n_coeffs, zbin, round, quant, quant_shift, + qcoeff_ref, dqcoeff_ref, dequant, &eob[0], sc->scan, + sc->iscan); + + ASM_REGISTER_STATE_CHECK(quant_(coeff_ptr, n_coeffs, zbin, round, quant, + quant_shift, qcoeff, dqcoeff, dequant, + &eob[1], sc->scan, sc->iscan)); + + for (int j = 0; j < n_coeffs; ++j) { + ASSERT_EQ(qcoeff_ref[j], qcoeff[j]) + << "Q mismatch on test: " << i << " at position: " << j + << " Q: " << q << " coeff: " << coeff_ptr[j]; + } + + for (int j = 0; j < n_coeffs; ++j) { + ASSERT_EQ(dqcoeff_ref[j], dqcoeff[j]) + << "Dq mismatch on test: " << i << " at position: " << j + << " Q: " << q << " coeff: " << coeff_ptr[j]; + } + + ASSERT_EQ(eob[0], eob[1]) + << "eobs mismatch on test: " << i << " Q: " << q; + } + } + + void CompareResults(const tran_low_t *buf_ref, const tran_low_t *buf, + int size, const char *text, int q, int number) { + int i; + for (i = 0; i < size; ++i) { + ASSERT_EQ(buf_ref[i], buf[i]) << text << " mismatch on test: " << number + << " at position: " << i << " Q: " << q; + } + } + + int coeff_num() const { return av1_get_max_eob(tx_size_); } + + void FillCoeff(tran_low_t c) { + const int n_coeffs = coeff_num(); + for (int i = 0; i < n_coeffs; ++i) { + coeff_[i] = c; + } + } + + void FillCoeffRandom() { + const int n_coeffs = coeff_num(); + FillCoeffZero(); + int num = rnd_.Rand16() % n_coeffs; + for (int i = 0; i < num; ++i) { + coeff_[i] = GetRandomCoeff(); + } + } + + void FillCoeffRandomRows(int num) { + FillCoeffZero(); + for (int i = 0; i < num; ++i) { + coeff_[i] = GetRandomCoeff(); + } + } + + void FillCoeffZero() { FillCoeff(0); } + + void FillCoeffConstant() { + tran_low_t c = GetRandomCoeff(); + FillCoeff(c); + } + + void FillDcOnly() { + FillCoeffZero(); + coeff_[0] = GetRandomCoeff(); + } + + void FillDcLargeNegative() { + FillCoeffZero(); + // Generate a qcoeff which contains 512/-512 (0x0100/0xFE00) to catch issues + // like BUG=883 where the constant being compared was incorrectly + // initialized. + coeff_[0] = -8191; + } + + tran_low_t GetRandomCoeff() { + tran_low_t coeff; + if (bd_ == AOM_BITS_8) { + coeff = + clamp(static_cast<int16_t>(rnd_.Rand16()), INT16_MIN + 1, INT16_MAX); + } else { + tran_low_t min = -(1 << (7 + bd_)); + tran_low_t max = -min - 1; + coeff = clamp(static_cast<tran_low_t>(rnd_.Rand31()), min, max); + } + return coeff; + } + + ACMRandom rnd_; + QuanTable *qtab_; + tran_low_t *coeff_; + QuantizeFunc quant_ref_; + QuantizeFunc quant_; + TX_SIZE tx_size_; + QuantType type_; + aom_bit_depth_t bd_; +}; + +TEST_P(QuantizeTest, ZeroInput) { + FillCoeffZero(); + QuantizeRun(false); +} + +TEST_P(QuantizeTest, LargeNegativeInput) { + FillDcLargeNegative(); + QuantizeRun(false, 0, 1); +} + +TEST_P(QuantizeTest, DcOnlyInput) { + FillDcOnly(); + QuantizeRun(false, 0, 1); +} + +TEST_P(QuantizeTest, RandomInput) { QuantizeRun(true, 0, kTestNum); } + +TEST_P(QuantizeTest, MultipleQ) { + for (int q = 0; q < QINDEX_RANGE; ++q) { + QuantizeRun(true, q, kTestNum); + } +} + +// Force the coeff to be half the value of the dequant. This exposes a +// mismatch found in av1_quantize_fp_sse2().
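+// (A constant coefficient near half the dequant step lands on the rounding
+// boundary of the quantizer's fixed-point multiply, so a mis-initialized
+// rounding constant in a SIMD port is likely to flip the result here before
+// it shows up anywhere else.)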
+TEST_P(QuantizeTest, CoeffHalfDequant) { + FillCoeff(16); + QuantizeRun(false, 25, 1); +} + +TEST_P(QuantizeTest, DISABLED_Speed) { + tran_low_t *coeff_ptr = coeff_; + const intptr_t n_coeffs = coeff_num(); + + tran_low_t *qcoeff_ref = coeff_ptr + n_coeffs; + tran_low_t *dqcoeff_ref = qcoeff_ref + n_coeffs; + + tran_low_t *qcoeff = dqcoeff_ref + n_coeffs; + tran_low_t *dqcoeff = qcoeff + n_coeffs; + uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs); + + // Testing uses 2-D DCT scan order table + const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT); + + // Testing uses luminance quantization table + const int q = 22; + const int16_t *zbin = qtab_->quant.y_zbin[q]; + const int16_t *round_fp = qtab_->quant.y_round_fp[q]; + const int16_t *quant_fp = qtab_->quant.y_quant_fp[q]; + const int16_t *quant_shift = qtab_->quant.y_quant_shift[q]; + const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q]; + const int kNumTests = 5000000; + aom_usec_timer timer, simd_timer; + int rows = tx_size_high[tx_size_]; + int cols = tx_size_wide[tx_size_]; + rows = AOMMIN(32, rows); + cols = AOMMIN(32, cols); + for (int cnt = 0; cnt <= rows; cnt++) { + FillCoeffRandomRows(cnt * cols); + + aom_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, + qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan); + } + aom_usec_timer_mark(&timer); + + aom_usec_timer_start(&simd_timer); + for (int n = 0; n < kNumTests; ++n) { + quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff, + dqcoeff, dequant, eob, sc->scan, sc->iscan); + } + aom_usec_timer_mark(&simd_timer); + + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + const int simd_elapsed_time = + static_cast<int>(aom_usec_timer_elapsed(&simd_timer)); + printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time, + simd_elapsed_time, (elapsed_time / simd_elapsed_time)); + } +} + +using std::make_tuple; + +#if HAVE_AVX2 +const QuantizeParam kQParamArrayAvx2[] = { + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, + static_cast<TX_SIZE>(TX_4X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, + static_cast<TX_SIZE>(TX_16X4), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, + static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_avx2, + static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, + static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, + static_cast<TX_SIZE>(TX_16X64), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_32x32_c, &av1_quantize_fp_32x32_avx2, + static_cast<TX_SIZE>(TX_64X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_64x64_c, &av1_quantize_fp_64x64_avx2, + static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), + make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_10), + make_tuple(&highbd_quan16x16_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan16x16_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_12), + make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_8), + 
make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_10), + make_tuple(&highbd_quan32x32_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan32x32_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_32X32), TYPE_FP, AOM_BITS_12), + make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_8), + make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_10), + make_tuple(&highbd_quan64x64_wrapper<av1_highbd_quantize_fp_c>, + &highbd_quan64x64_wrapper<av1_highbd_quantize_fp_avx2>, + static_cast<TX_SIZE>(TX_64X64), TYPE_FP, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_adaptive_c, + &aom_highbd_quantize_b_adaptive_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_adaptive_c, + &aom_highbd_quantize_b_adaptive_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_adaptive_c, + &aom_highbd_quantize_b_adaptive_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c, + &aom_highbd_quantize_b_32x32_adaptive_avx2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c, + &aom_highbd_quantize_b_32x32_adaptive_avx2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c, + &aom_highbd_quantize_b_32x32_adaptive_avx2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12), +#endif + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2, + static_cast<TX_SIZE>(TX_8X8), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_avx2, + static_cast<TX_SIZE>(TX_4X4), TYPE_B, AOM_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, QuantizeTest, + ::testing::ValuesIn(kQParamArrayAvx2)); +#endif // HAVE_AVX2 + +#if HAVE_SSE2 +const QuantizeParam kQParamArraySSE2[] = { + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, + static_cast<TX_SIZE>(TX_4X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, + static_cast<TX_SIZE>(TX_16X4), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, + static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_sse2, + static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8), + make_tuple(&aom_quantize_b_c, &aom_quantize_b_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_c, &aom_highbd_quantize_b_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_adaptive_c, + &aom_highbd_quantize_b_adaptive_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_adaptive_c, + &aom_highbd_quantize_b_adaptive_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B,
AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_adaptive_c, + &aom_highbd_quantize_b_adaptive_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c, + &aom_highbd_quantize_b_32x32_adaptive_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c, + &aom_highbd_quantize_b_32x32_adaptive_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_32x32_adaptive_c, + &aom_highbd_quantize_b_32x32_adaptive_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c, + &aom_highbd_quantize_b_64x64_adaptive_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c, + &aom_highbd_quantize_b_64x64_adaptive_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_64x64_adaptive_c, + &aom_highbd_quantize_b_64x64_adaptive_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_12), +#endif + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, + static_cast<TX_SIZE>(TX_8X8), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, + static_cast<TX_SIZE>(TX_4X4), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_adaptive_c, + &aom_quantize_b_32x32_adaptive_sse2, + static_cast<TX_SIZE>(TX_32X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_adaptive_c, + &aom_quantize_b_32x32_adaptive_sse2, + static_cast<TX_SIZE>(TX_16X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_adaptive_c, + &aom_quantize_b_32x32_adaptive_sse2, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_64x64_adaptive_c, + &aom_quantize_b_64x64_adaptive_sse2, + static_cast<TX_SIZE>(TX_32X64), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_64x64_adaptive_c, + &aom_quantize_b_64x64_adaptive_sse2, + static_cast<TX_SIZE>(TX_64X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_64x64_adaptive_c, + &aom_quantize_b_64x64_adaptive_sse2, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, QuantizeTest, + ::testing::ValuesIn(kQParamArraySSE2)); +#endif + +#if HAVE_NEON +const QuantizeParam kQParamArrayNEON[] = { + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon, + static_cast<TX_SIZE>(TX_16X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon, + static_cast<TX_SIZE>(TX_4X16), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon, + static_cast<TX_SIZE>(TX_16X4), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c, &av1_quantize_fp_neon, + static_cast<TX_SIZE>(TX_8X32), TYPE_FP, AOM_BITS_8), + make_tuple(&av1_quantize_fp_c,
&av1_quantize_fp_neon, + static_cast<TX_SIZE>(TX_32X8), TYPE_FP, AOM_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(NEON, QuantizeTest, + ::testing::ValuesIn(kQParamArrayNEON)); +#endif + +#if HAVE_SSSE3 && ARCH_X86_64 +INSTANTIATE_TEST_SUITE_P( + SSSE3, QuantizeTest, + ::testing::Values( + make_tuple(&aom_quantize_b_c, &aom_quantize_b_ssse3, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_ssse3, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_64x64_c, &aom_quantize_b_64x64_ssse3, + static_cast<TX_SIZE>(TX_64X64), TYPE_B, AOM_BITS_8))); + +#endif // HAVE_SSSE3 && ARCH_X86_64 + +#if HAVE_AVX && ARCH_X86_64 +INSTANTIATE_TEST_SUITE_P( + AVX, QuantizeTest, + ::testing::Values( + make_tuple(&aom_quantize_b_c, &aom_quantize_b_avx, + static_cast<TX_SIZE>(TX_16X16), TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_avx, + static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8))); + +#endif // HAVE_AVX && ARCH_X86_64 +} // namespace diff --git a/libs/libaom/src/test/reconinter_test.cc b/libs/libaom/src/test/reconinter_test.cc new file mode 100644 index 000000000..51bec0eab --- /dev/null +++ b/libs/libaom/src/test/reconinter_test.cc @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <tuple> + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_ports/mem.h" +#include "av1/common/scan.h" +#include "av1/common/txb_common.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { +using libaom_test::ACMRandom; + +typedef void (*buildcompdiffwtdmaskd_func)(uint8_t *mask, + DIFFWTD_MASK_TYPE mask_type, + const uint8_t *src0, int src0_stride, + const uint8_t *src1, int src1_stride, + int h, int w); + +typedef std::tuple<BLOCK_SIZE, buildcompdiffwtdmaskd_func> + BuildCompDiffwtdMaskDParam; + +#if HAVE_SSE4_1 +::testing::internal::ParamGenerator<BuildCompDiffwtdMaskDParam> BuildParams( + buildcompdiffwtdmaskd_func filter) { + return ::testing::Combine(::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL), + ::testing::Values(filter)); +} +#endif + +class BuildCompDiffwtdMaskTest + : public ::testing::TestWithParam<BuildCompDiffwtdMaskDParam> { + public: + virtual ~BuildCompDiffwtdMaskTest() {} + + virtual void TearDown() { libaom_test::ClearSystemState(); } + void RunTest(buildcompdiffwtdmaskd_func test_impl, const int is_speed, + const DIFFWTD_MASK_TYPE type); + + private: + ACMRandom rnd_; +}; + +typedef void (*buildcompdiffwtdmaskd16_func)( + uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, + int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, + ConvolveParams *conv_params, int bd); + +typedef std::tuple<int, buildcompdiffwtdmaskd16_func, BLOCK_SIZE> + BuildCompDiffwtdMaskD16Param; + +#if HAVE_SSE4_1 || HAVE_NEON +::testing::internal::ParamGenerator<BuildCompDiffwtdMaskD16Param> BuildParams( + buildcompdiffwtdmaskd16_func filter) { + return ::testing::Combine(::testing::Range(8, 13, 2), + ::testing::Values(filter), + ::testing::Range(BLOCK_4X4,
BLOCK_SIZES_ALL)); +} +#endif +class BuildCompDiffwtdMaskD16Test + : public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> { + public: + ~BuildCompDiffwtdMaskD16Test() {} + virtual void TearDown() { libaom_test::ClearSystemState(); } + void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } + + protected: + void RunCheckOutput(buildcompdiffwtdmaskd16_func test_impl); + void RunSpeedTest(buildcompdiffwtdmaskd16_func test_impl, + DIFFWTD_MASK_TYPE mask_type); + libaom_test::ACMRandom rnd_; +}; // class BuildCompDiffwtdMaskD16Test + +void BuildCompDiffwtdMaskD16Test::RunCheckOutput( + buildcompdiffwtdmaskd16_func test_impl) { + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + const int width = block_size_wide[block_idx]; + const int height = block_size_high[block_idx]; + DECLARE_ALIGNED(16, uint8_t, mask_ref[2 * MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, mask_test[2 * MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]); + + ConvolveParams conv_params = get_conv_params_no_round(0, 0, NULL, 0, 1, bd); + + int in_precision = + bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2; + + for (int i = 0; i < MAX_SB_SQUARE; i++) { + src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1); + src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1); + } + + for (int mask_type = 0; mask_type < DIFFWTD_MASK_TYPES; mask_type++) { + av1_build_compound_diffwtd_mask_d16_c( + mask_ref, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width, + height, width, &conv_params, bd); + + test_impl(mask_test, (DIFFWTD_MASK_TYPE)mask_type, src0, width, src1, width, + height, width, &conv_params, bd); + + for (int r = 0; r < height; ++r) { + for (int c = 0; c < width; ++c) { + ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width]) + << "Mismatch at unit tests for BuildCompDiffwtdMaskD16Test\n" + << " Pixel mismatch at index " + << "[" << r << "," << c << "] " + << " @ " << width << "x" << height << " inv " << mask_type; + } + } + } +} + +void BuildCompDiffwtdMaskD16Test::RunSpeedTest( + buildcompdiffwtdmaskd16_func test_impl, DIFFWTD_MASK_TYPE mask_type) { + const int block_idx = GET_PARAM(2); + const int bd = GET_PARAM(0); + const int width = block_size_wide[block_idx]; + const int height = block_size_high[block_idx]; + DECLARE_ALIGNED(16, uint8_t, mask[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, src0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(32, uint16_t, src1[MAX_SB_SQUARE]); + + ConvolveParams conv_params = get_conv_params_no_round(0, 0, NULL, 0, 1, bd); + + int in_precision = + bd + 2 * FILTER_BITS - conv_params.round_0 - conv_params.round_1 + 2; + + for (int i = 0; i < MAX_SB_SQUARE; i++) { + src0[i] = rnd_.Rand16() & ((1 << in_precision) - 1); + src1[i] = rnd_.Rand16() & ((1 << in_precision) - 1); + } + + const int num_loops = 10000000 / (width + height); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + av1_build_compound_diffwtd_mask_d16_c(mask, mask_type, src0, width, src1, + width, height, width, &conv_params, + bd); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + + for (int i = 0; i < num_loops; ++i) + test_impl(mask, mask_type, src0, width, src1, width, height, width, + &conv_params, bd); + + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); + printf("av1_build_compound_diffwtd_mask_d16 %3dx%-3d: %7.2f
\n", width, + height, elapsed_time / double(elapsed_time1)); +} +#if HAVE_SSE4_1 +void BuildCompDiffwtdMaskTest::RunTest(buildcompdiffwtdmaskd_func test_impl, + const int is_speed, + const DIFFWTD_MASK_TYPE type) { + const int sb_type = GET_PARAM(0); + const int width = block_size_wide[sb_type]; + const int height = block_size_high[sb_type]; + DECLARE_ALIGNED(16, uint8_t, mask_ref[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, mask_test[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, src0[MAX_SB_SQUARE]); + DECLARE_ALIGNED(16, uint8_t, src1[MAX_SB_SQUARE]); + ACMRandom rnd(ACMRandom::DeterministicSeed()); + for (int i = 0; i < width * height; i++) { + src0[i] = rnd.Rand8(); + src1[i] = rnd.Rand8(); + } + const int run_times = is_speed ? (10000000 / (width + height)) : 1; + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + av1_build_compound_diffwtd_mask_c(mask_ref, type, src0, width, src1, width, + height, width); + } + const double t1 = get_time_mark(&timer); + aom_usec_timer_start(&timer); + for (int i = 0; i < run_times; ++i) { + test_impl(mask_test, type, src0, width, src1, width, height, width); + } + const double t2 = get_time_mark(&timer); + if (is_speed) { + printf("mask %d %3dx%-3d:%7.2f/%7.2fns", type, width, height, t1, t2); + printf("(%3.2f)\n", t1 / t2); + } + for (int r = 0; r < height; ++r) { + for (int c = 0; c < width; ++c) { + ASSERT_EQ(mask_ref[c + r * width], mask_test[c + r * width]) + << "[" << r << "," << c << "] " << run_times << " @ " << width << "x" + << height << " inv " << type; + } + } +} + +TEST_P(BuildCompDiffwtdMaskTest, match) { + RunTest(GET_PARAM(1), 0, DIFFWTD_38); + RunTest(GET_PARAM(1), 0, DIFFWTD_38_INV); +} +TEST_P(BuildCompDiffwtdMaskTest, DISABLED_Speed) { + RunTest(GET_PARAM(1), 1, DIFFWTD_38); + RunTest(GET_PARAM(1), 1, DIFFWTD_38_INV); +} +#endif +TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) { + RunCheckOutput(GET_PARAM(1)); +} + +TEST_P(BuildCompDiffwtdMaskD16Test, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(1), DIFFWTD_38); + RunSpeedTest(GET_PARAM(1), DIFFWTD_38_INV); +} + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1, BuildCompDiffwtdMaskTest, + BuildParams(av1_build_compound_diffwtd_mask_sse4_1)); + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, BuildCompDiffwtdMaskD16Test, + BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1)); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskTest, + BuildParams(av1_build_compound_diffwtd_mask_avx2)); + +INSTANTIATE_TEST_SUITE_P(AVX2, BuildCompDiffwtdMaskD16Test, + BuildParams(av1_build_compound_diffwtd_mask_d16_avx2)); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, BuildCompDiffwtdMaskD16Test, + BuildParams(av1_build_compound_diffwtd_mask_d16_neon)); +#endif + +} // namespace diff --git a/libs/libaom/src/test/register_state_check.h b/libs/libaom/src/test/register_state_check.h new file mode 100644 index 000000000..d404621dd --- /dev/null +++ b/libs/libaom/src/test/register_state_check.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TEST_REGISTER_STATE_CHECK_H_ +#define AOM_TEST_REGISTER_STATE_CHECK_H_ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +// ASM_REGISTER_STATE_CHECK(asm_function) +// Minimally validates the environment pre & post function execution. This +// variant should be used with assembly functions which are not expected to +// fully restore the system state. See platform implementations of +// RegisterStateCheck for details. +// +// API_REGISTER_STATE_CHECK(api_function) +// Performs all the checks done by ASM_REGISTER_STATE_CHECK() and any +// additional checks to ensure the environment is in a consistent state pre & +// post function execution. This variant should be used with API functions. +// See platform implementations of RegisterStateCheckXXX for details. +// + +#if defined(_WIN64) && ARCH_X86_64 + +#undef NOMINMAX +#define NOMINMAX +#define WIN32_LEAN_AND_MEAN +#include +#include + +inline bool operator==(const M128A &lhs, const M128A &rhs) { + return (lhs.Low == rhs.Low && lhs.High == rhs.High); +} + +namespace libaom_test { + +// Compares the state of xmm[6-15] at construction with their state at +// destruction. These registers should be preserved by the callee on +// Windows x64. +class RegisterStateCheck { + public: + RegisterStateCheck() { initialized_ = StoreRegisters(&pre_context_); } + ~RegisterStateCheck() { Check(); } + + private: + static bool StoreRegisters(CONTEXT *const context) { + const HANDLE this_thread = GetCurrentThread(); + EXPECT_TRUE(this_thread != NULL); + context->ContextFlags = CONTEXT_FLOATING_POINT; + const bool context_saved = GetThreadContext(this_thread, context) == TRUE; + EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError(); + return context_saved; + } + + // Compares the register state. Returns true if the states match. + void Check() const { + ASSERT_TRUE(initialized_); + CONTEXT post_context; + ASSERT_TRUE(StoreRegisters(&post_context)); + + const M128A *xmm_pre = &pre_context_.Xmm6; + const M128A *xmm_post = &post_context.Xmm6; + for (int i = 6; i <= 15; ++i) { + EXPECT_EQ(*xmm_pre, *xmm_post) << "xmm" << i << " has been modified!"; + ++xmm_pre; + ++xmm_post; + } + } + + bool initialized_; + CONTEXT pre_context_; +}; + +#define ASM_REGISTER_STATE_CHECK(statement) \ + do { \ + libaom_test::RegisterStateCheck reg_check; \ + statement; \ + } while (false) + +} // namespace libaom_test + +#else + +namespace libaom_test { + +class RegisterStateCheck {}; +#define ASM_REGISTER_STATE_CHECK(statement) statement + +} // namespace libaom_test + +#endif // _WIN64 && ARCH_X86_64 + +#if ARCH_X86 || ARCH_X86_64 +#if defined(__GNUC__) + +namespace libaom_test { + +// Checks the FPU tag word pre/post execution to ensure emms has been called. +class RegisterStateCheckMMX { + public: + RegisterStateCheckMMX() { + __asm__ volatile("fstenv %0" : "=rm"(pre_fpu_env_)); + } + ~RegisterStateCheckMMX() { Check(); } + + private: + // Checks the FPU tag word pre/post execution, returning false if not cleared + // to 0xffff. 
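+  // In the 28-byte fstenv image the control, status and tag words occupy
+  // consecutive 32-bit fields, so element [4] of the uint16_t[14] buffer
+  // below is the x87 tag word. A tag word of 0xffff marks all eight
+  // registers empty, which is the state a correct emms leaves behind.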
+ void Check() const { + EXPECT_EQ(0xffff, pre_fpu_env_[4]) + << "FPU was in an inconsistent state prior to call"; + + uint16_t post_fpu_env[14]; + __asm__ volatile("fstenv %0" : "=rm"(post_fpu_env)); + EXPECT_EQ(0xffff, post_fpu_env[4]) + << "FPU was left in an inconsistent state after call"; + } + + uint16_t pre_fpu_env_[14]; +}; + +#define API_REGISTER_STATE_CHECK(statement) \ + do { \ + libaom_test::RegisterStateCheckMMX reg_check; \ + ASM_REGISTER_STATE_CHECK(statement); \ + } while (false) + +} // namespace libaom_test + +#endif // __GNUC__ +#endif // ARCH_X86 || ARCH_X86_64 + +#ifndef API_REGISTER_STATE_CHECK +#define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK +#endif + +#endif // AOM_TEST_REGISTER_STATE_CHECK_H_ diff --git a/libs/libaom/src/test/resize_test.cc b/libs/libaom/src/test/resize_test.cc new file mode 100644 index 000000000..bcf6794d0 --- /dev/null +++ b/libs/libaom/src/test/resize_test.cc @@ -0,0 +1,644 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include "aom_dsp/aom_dsp_common.h" +#include "common/tools_common.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/video_source.h" +#include "test/util.h" + +// Enable(1) or Disable(0) writing of the compressed bitstream. 
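+// When toggled to 1, the hooks below dump the encoded stream as an IVF
+// file: one 32-byte file header, then a 12-byte header (frame size plus
+// 64-bit pts) in front of each frame payload.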
+#define WRITE_COMPRESSED_STREAM 0 + +namespace { + +#if WRITE_COMPRESSED_STREAM +static void mem_put_le16(char *const mem, unsigned int val) { + mem[0] = val; + mem[1] = val >> 8; +} + +static void mem_put_le32(char *const mem, unsigned int val) { + mem[0] = val; + mem[1] = val >> 8; + mem[2] = val >> 16; + mem[3] = val >> 24; +} + +static void write_ivf_file_header(const aom_codec_enc_cfg_t *const cfg, + int frame_cnt, FILE *const outfile) { + char header[32]; + + header[0] = 'D'; + header[1] = 'K'; + header[2] = 'I'; + header[3] = 'F'; + mem_put_le16(header + 4, 0); /* version */ + mem_put_le16(header + 6, 32); /* headersize */ + mem_put_le32(header + 8, AV1_FOURCC); /* fourcc (av1) */ + mem_put_le16(header + 12, cfg->g_w); /* width */ + mem_put_le16(header + 14, cfg->g_h); /* height */ + mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */ + mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */ + mem_put_le32(header + 24, frame_cnt); /* length */ + mem_put_le32(header + 28, 0); /* unused */ + + (void)fwrite(header, 1, 32, outfile); +} + +static void write_ivf_frame_size(FILE *const outfile, const size_t size) { + char header[4]; + mem_put_le32(header, static_cast(size)); + (void)fwrite(header, 1, 4, outfile); +} + +static void write_ivf_frame_header(const aom_codec_cx_pkt_t *const pkt, + FILE *const outfile) { + char header[12]; + aom_codec_pts_t pts; + + if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return; + + pts = pkt->data.frame.pts; + mem_put_le32(header, static_cast(pkt->data.frame.sz)); + mem_put_le32(header + 4, pts & 0xFFFFFFFF); + mem_put_le32(header + 8, pts >> 32); + + (void)fwrite(header, 1, 12, outfile); +} +#endif // WRITE_COMPRESSED_STREAM + +const unsigned int kInitialWidth = 320; +const unsigned int kInitialHeight = 240; + +struct FrameInfo { + FrameInfo(aom_codec_pts_t _pts, unsigned int _w, unsigned int _h) + : pts(_pts), w(_w), h(_h) {} + + aom_codec_pts_t pts; + unsigned int w; + unsigned int h; +}; + +void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w, + unsigned int initial_h, unsigned int *w, + unsigned int *h, int flag_codec) { + if (frame < 10) { + *w = initial_w; + *h = initial_h; + return; + } + if (frame < 20) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 30) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 40) { + *w = initial_w; + *h = initial_h; + return; + } + if (frame < 50) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 60) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 70) { + *w = initial_w; + *h = initial_h; + return; + } + if (frame < 80) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 90) { + *w = initial_w / 2; + *h = initial_h / 2; + return; + } + if (frame < 100) { + *w = initial_w * 3 / 4; + *h = initial_h * 3 / 4; + return; + } + if (frame < 110) { + *w = initial_w; + *h = initial_h; + return; + } + // Go down very low + if (frame < 120) { + *w = initial_w / 4; + *h = initial_h / 4; + return; + } + if (flag_codec == 1) { + // Cases that only works for AV1. + // For AV1: Swap width and height of original. 
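+    // With the 320x240 (kInitialWidth x kInitialHeight) source used by
+    // these tests, frames 120-139 are therefore requested at 240x320.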
+ if (frame < 140) { + *w = initial_h; + *h = initial_w; + return; + } + } + *w = initial_w; + *h = initial_h; +} + +class ResizingVideoSource : public ::libaom_test::DummyVideoSource { + public: + ResizingVideoSource() { + SetSize(kInitialWidth, kInitialHeight); + limit_ = 150; + } + int flag_codec_; + virtual ~ResizingVideoSource() {} + + protected: + virtual void Next() { + ++frame_; + unsigned int width; + unsigned int height; + ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height, + flag_codec_); + SetSize(width, height); + FillFrame(); + } +}; + +class ResizeTest + : public ::libaom_test::CodecTestWithParam, + public ::libaom_test::EncoderTest { + protected: + ResizeTest() : EncoderTest(GET_PARAM(0)) {} + + virtual ~ResizeTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + } + + virtual void DecompressedFrameHook(const aom_image_t &img, + aom_codec_pts_t pts) { + frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); + } + + std::vector frame_info_list_; +}; + +TEST_P(ResizeTest, TestExternalResizeWorks) { + ResizingVideoSource video; + video.flag_codec_ = 0; + cfg_.g_lag_in_frames = 0; + // We use max(kInitialWidth, kInitialHeight) because during the test + // the width and height of the frame are swapped + cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height = + AOMMAX(kInitialWidth, kInitialHeight); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Check we decoded the same number of frames as we attempted to encode + ASSERT_EQ(frame_info_list_.size(), video.limit()); + + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + const unsigned int frame = static_cast(info->pts); + unsigned int expected_w; + unsigned int expected_h; + ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, + &expected_h, 0); + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; + } +} + +const unsigned int kStepDownFrame = 3; +const unsigned int kStepUpFrame = 6; + +class ResizeInternalTestLarge : public ResizeTest { + protected: +#if WRITE_COMPRESSED_STREAM + ResizeInternalTestLarge() + : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {} +#else + ResizeInternalTestLarge() : ResizeTest(), frame0_psnr_(0.0) {} +#endif + + virtual ~ResizeInternalTestLarge() {} + + virtual void BeginPassHook(unsigned int /*pass*/) { +#if WRITE_COMPRESSED_STREAM + outfile_ = fopen("av10-2-05-resize.ivf", "wb"); +#endif + } + + virtual void EndPassHook() { +#if WRITE_COMPRESSED_STREAM + if (outfile_) { + if (!fseek(outfile_, 0, SEEK_SET)) + write_ivf_file_header(&cfg_, out_frames_, outfile_); + fclose(outfile_); + outfile_ = NULL; + } +#endif + } + + virtual void PreEncodeFrameHook(libaom_test::VideoSource *video, + libaom_test::Encoder *encoder) { + if (change_config_) { + int new_q = 60; + if (video->frame() == 0) { + struct aom_scaling_mode mode = { AOME_ONETWO, AOME_ONETWO }; + encoder->Control(AOME_SET_SCALEMODE, &mode); + } + if (video->frame() == 1) { + struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL }; + encoder->Control(AOME_SET_SCALEMODE, &mode); + cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = new_q; + encoder->Config(&cfg_); + } + } else { + if (video->frame() >= kStepDownFrame && video->frame() < kStepUpFrame) { + struct aom_scaling_mode mode = { AOME_FOURFIVE, AOME_THREEFIVE }; + encoder->Control(AOME_SET_SCALEMODE, &mode); + 
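+        // AOME_FOURFIVE / AOME_THREEFIVE request 4/5 horizontal and 3/5
+        // vertical scaling, so the 352x288 clip used below is coded at
+        // 282x173 for these frames (352 * 4 / 5 = 281.6 and
+        // 288 * 3 / 5 = 172.8, rounded), which is exactly what
+        // TestInternalResizeWorks asserts.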
} + if (video->frame() >= kStepUpFrame) { + struct aom_scaling_mode mode = { AOME_NORMAL, AOME_NORMAL }; + encoder->Control(AOME_SET_SCALEMODE, &mode); + } + } + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; + EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 3.0); + } + +#if WRITE_COMPRESSED_STREAM + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + ++out_frames_; + + // Write initial file header if first frame. + if (pkt->data.frame.pts == 0) write_ivf_file_header(&cfg_, 0, outfile_); + + // Write frame header and data. + write_ivf_frame_header(pkt, outfile_); + (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_); + } +#endif + + double frame0_psnr_; + bool change_config_; +#if WRITE_COMPRESSED_STREAM + FILE *outfile_; + unsigned int out_frames_; +#endif +}; + +TEST_P(ResizeInternalTestLarge, TestInternalResizeWorks) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 10); + init_flags_ = AOM_CODEC_USE_PSNR; + change_config_ = false; + + // q picked such that initial keyframe on this clip is ~30dB PSNR + cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48; + + // If the number of frames being encoded is smaller than g_lag_in_frames + // the encoded frame is unavailable using the current API. Comparing + // frames to detect mismatch would then not be possible. Set + // g_lag_in_frames = 0 to get around this. + cfg_.g_lag_in_frames = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + } + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + const aom_codec_pts_t pts = info->pts; + if (pts >= kStepDownFrame && pts < kStepUpFrame) { + ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width"; + ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height"; + } else { + EXPECT_EQ(352U, info->w) << "Frame " << pts << " had unexpected width"; + EXPECT_EQ(288U, info->h) << "Frame " << pts << " had unexpected height"; + } + } +} + +TEST_P(ResizeInternalTestLarge, TestInternalResizeChangeConfig) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 10); + cfg_.g_w = 352; + cfg_.g_h = 288; + change_config_ = true; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +class ResizeRealtimeTest + : public ::libaom_test::CodecTestWith2Params, + public ::libaom_test::EncoderTest { + protected: + ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {} + virtual ~ResizeRealtimeTest() {} + + virtual void PreEncodeFrameHook(libaom_test::VideoSource *video, + libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_AQ_MODE, 3); + encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + } + + if (change_bitrate_ && video->frame() == 120) { + change_bitrate_ = false; + cfg_.rc_target_bitrate = 500; + encoder->Config(&cfg_); + } + } + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + } + + virtual void DecompressedFrameHook(const aom_image_t &img, + aom_codec_pts_t pts) { + frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); + } + + virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) { + double mismatch_psnr = compute_psnr(img1, img2); + mismatch_psnr_ += mismatch_psnr; 
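+    // Every encoder/decoder mismatch adds one frame to this counter;
+    // GetMismatchFrames() is checked against zero by the tests below.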
+ ++mismatch_nframes_; + } + + unsigned int GetMismatchFrames() { return mismatch_nframes_; } + + void DefaultConfig() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_undershoot_pct = 50; + cfg_.rc_overshoot_pct = 50; + cfg_.rc_end_usage = AOM_CBR; + cfg_.kf_mode = AOM_KF_AUTO; + cfg_.g_lag_in_frames = 0; + cfg_.kf_min_dist = cfg_.kf_max_dist = 3000; + // Enable dropped frames. + cfg_.rc_dropframe_thresh = 1; + // Disable error_resilience mode. + cfg_.g_error_resilient = 0; + // Run at low bitrate. + cfg_.rc_target_bitrate = 200; + // We use max(kInitialWidth, kInitialHeight) because during the test + // the width and height of the frame are swapped + cfg_.g_forced_max_frame_width = cfg_.g_forced_max_frame_height = + AOMMAX(kInitialWidth, kInitialHeight); + } + + std::vector frame_info_list_; + int set_cpu_used_; + bool change_bitrate_; + double mismatch_psnr_; + int mismatch_nframes_; +}; + +TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { + ResizingVideoSource video; + video.flag_codec_ = 1; + DefaultConfig(); + change_bitrate_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Check we decoded the same number of frames as we attempted to encode + ASSERT_EQ(frame_info_list_.size(), video.limit()); + + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + const unsigned int frame = static_cast(info->pts); + unsigned int expected_w; + unsigned int expected_h; + ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, + &expected_h, 1); + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; + EXPECT_EQ(static_cast(0), GetMismatchFrames()); + } +} + +// Verify the dynamic resizer behavior for real time, 1 pass CBR mode. +// Run at low bitrate, with resize_allowed = 1, and verify that we get +// one resize down event. +TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDown) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 299); + DefaultConfig(); + cfg_.g_w = 352; + cfg_.g_h = 288; + change_bitrate_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + unsigned int last_w = cfg_.g_w; + unsigned int last_h = cfg_.g_h; + int resize_count = 0; + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + if (info->w != last_w || info->h != last_h) { + // Verify that resize down occurs. + ASSERT_LT(info->w, last_w); + ASSERT_LT(info->h, last_h); + last_w = info->w; + last_h = info->h; + resize_count++; + } + } + +#if CONFIG_AV1_DECODER + // Verify that we get 1 resize down event in this test. + ASSERT_EQ(1, resize_count) << "Resizing should occur."; + EXPECT_EQ(static_cast(0), GetMismatchFrames()); +#else + printf("Warning: AV1 decoder unavailable, unable to check resize count!\n"); +#endif +} + +// Verify the dynamic resizer behavior for real time, 1 pass CBR mode. +// Start at low target bitrate, raise the bitrate in the middle of the clip, +// scaling-up should occur after bitrate changed. 
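+// The resize_count bookkeeping below can be read as the following event
+// counter (a minimal sketch, not compiled into the test; the helper name
+// is illustrative only):
+#if 0
+static void CountResizeEvents(const std::vector<FrameInfo> &frames,
+                              unsigned int initial_w, unsigned int initial_h,
+                              int *downs, int *ups) {
+  unsigned int last_w = initial_w, last_h = initial_h;
+  *downs = *ups = 0;
+  for (size_t i = 0; i < frames.size(); ++i) {
+    if (frames[i].w == last_w && frames[i].h == last_h) continue;
+    // Dimensions changed: classify the event by its direction.
+    if (frames[i].w < last_w && frames[i].h < last_h) ++*downs;
+    if (frames[i].w > last_w && frames[i].h > last_h) ++*ups;
+    last_w = frames[i].w;
+    last_h = frames[i].h;
+  }
+}
+#endif
+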
+TEST_P(ResizeRealtimeTest, DISABLED_TestInternalResizeDownUpChangeBitRate) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 359); + DefaultConfig(); + cfg_.g_w = 352; + cfg_.g_h = 288; + change_bitrate_ = true; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + // Disable dropped frames. + cfg_.rc_dropframe_thresh = 0; + // Starting bitrate low. + cfg_.rc_target_bitrate = 80; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + unsigned int last_w = cfg_.g_w; + unsigned int last_h = cfg_.g_h; + int resize_count = 0; + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + if (info->w != last_w || info->h != last_h) { + resize_count++; + if (resize_count == 1) { + // Verify that resize down occurs. + ASSERT_LT(info->w, last_w); + ASSERT_LT(info->h, last_h); + } else if (resize_count == 2) { + // Verify that resize up occurs. + ASSERT_GT(info->w, last_w); + ASSERT_GT(info->h, last_h); + } + last_w = info->w; + last_h = info->h; + } + } + +#if CONFIG_AV1_DECODER + // Verify that we get 2 resize events in this test. + ASSERT_EQ(resize_count, 2) << "Resizing should occur twice."; + EXPECT_EQ(static_cast(0), GetMismatchFrames()); +#else + printf("Warning: AV1 decoder unavailable, unable to check resize count!\n"); +#endif +} + +class ResizeCspTest : public ResizeTest { + protected: +#if WRITE_COMPRESSED_STREAM + ResizeCspTest() + : ResizeTest(), frame0_psnr_(0.0), outfile_(NULL), out_frames_(0) {} +#else + ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {} +#endif + + virtual ~ResizeCspTest() {} + + virtual void BeginPassHook(unsigned int /*pass*/) { +#if WRITE_COMPRESSED_STREAM + outfile_ = fopen("av11-2-05-cspchape.ivf", "wb"); +#endif + } + + virtual void EndPassHook() { +#if WRITE_COMPRESSED_STREAM + if (outfile_) { + if (!fseek(outfile_, 0, SEEK_SET)) + write_ivf_file_header(&cfg_, out_frames_, outfile_); + fclose(outfile_); + outfile_ = NULL; + } +#endif + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; + EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); + } + +#if WRITE_COMPRESSED_STREAM + virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) { + ++out_frames_; + + // Write initial file header if first frame. + if (pkt->data.frame.pts == 0) write_ivf_file_header(&cfg_, 0, outfile_); + + // Write frame header and data. + write_ivf_frame_header(pkt, outfile_); + (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, outfile_); + } +#endif + + double frame0_psnr_; +#if WRITE_COMPRESSED_STREAM + FILE *outfile_; + unsigned int out_frames_; +#endif +}; + +class ResizingCspVideoSource : public ::libaom_test::DummyVideoSource { + public: + explicit ResizingCspVideoSource(aom_img_fmt_t image_format) { + SetSize(kInitialWidth, kInitialHeight); + SetImageFormat(image_format); + limit_ = 30; + } + + virtual ~ResizingCspVideoSource() {} +}; + +#if (defined(DISABLE_TRELLISQ_SEARCH) && DISABLE_TRELLISQ_SEARCH) +TEST_P(ResizeCspTest, DISABLED_TestResizeCspWorks) { +#else +TEST_P(ResizeCspTest, TestResizeCspWorks) { +#endif + const aom_img_fmt_t image_formats[] = { AOM_IMG_FMT_I420, AOM_IMG_FMT_I444 }; + for (const aom_img_fmt_t &img_format : image_formats) { + ResizingCspVideoSource video(img_format); + init_flags_ = AOM_CODEC_USE_PSNR; + cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48; + cfg_.g_lag_in_frames = 0; + cfg_.g_profile = (img_format == AOM_IMG_FMT_I420) ? 
0 : 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Check we decoded the same number of frames as we attempted to encode + ASSERT_EQ(frame_info_list_.size(), video.limit()); + frame_info_list_.clear(); + } +} + +AV1_INSTANTIATE_TEST_CASE(ResizeTest, + ::testing::Values(::libaom_test::kRealTime)); +AV1_INSTANTIATE_TEST_CASE(ResizeInternalTestLarge, + ::testing::Values(::libaom_test::kOnePassGood)); +AV1_INSTANTIATE_TEST_CASE(ResizeRealtimeTest, + ::testing::Values(::libaom_test::kRealTime), + ::testing::Range(5, 9)); +AV1_INSTANTIATE_TEST_CASE(ResizeCspTest, + ::testing::Values(::libaom_test::kRealTime)); +} // namespace diff --git a/libs/libaom/src/test/rt_end_to_end_test.cc b/libs/libaom/src/test/rt_end_to_end_test.cc new file mode 100644 index 000000000..f14d12474 --- /dev/null +++ b/libs/libaom/src/test/rt_end_to_end_test.cc @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" + +namespace { + +const unsigned int kFrames = 10; +const int kBitrate = 500; + +// List of psnr thresholds for speed settings 6-8 +// keys: video, speed, aq mode. +std::unordered_map>> + kPsnrThreshold = { { "park_joy_90p_8_420.y4m", + { { 5, { { 0, 35.4 }, { 3, 36.4 } } }, + { 6, { { 0, 35.3 }, { 3, 36.2 } } }, + { 7, { { 0, 34.9 }, { 3, 35.8 } } }, + { 8, { { 0, 35.0 }, { 3, 35.8 } } } } }, + { "paris_352_288_30.y4m", + { { 5, { { 0, 36.2 }, { 3, 36.7 } } }, + { 6, { { 0, 36.1 }, { 3, 36.6 } } }, + { 7, { { 0, 35.5 }, { 3, 36.0 } } }, + { 8, { { 0, 36.0 }, { 3, 36.5 } } } } }, + { "niklas_1280_720_30.y4m", + { { 5, { { 0, 34.6 }, { 3, 34.6 } } }, + { 6, { { 0, 34.2 }, { 3, 34.2 } } }, + { 7, { { 0, 33.7 }, { 3, 33.6 } } }, + { 8, { { 0, 33.6 }, { 3, 33.4 } } } } } }; + +typedef struct { + const char *filename; + unsigned int input_bit_depth; + aom_img_fmt fmt; + aom_bit_depth_t bit_depth; + unsigned int profile; +} TestVideoParam; + +std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) { + return os << "TestVideoParam { filename:" << test_arg.filename + << " input_bit_depth:" << test_arg.input_bit_depth + << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth + << " profile:" << test_arg.profile << " }"; +} + +const TestVideoParam kTestVectors[] = { + { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 }, + { "paris_352_288_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 }, + { "niklas_1280_720_30.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 }, +}; + +// Params: test video, speed, aq mode, threads, tile columns. 
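+// For example, kPsnrThreshold["park_joy_90p_8_420.y4m"][7][3] is 35.8, so a
+// speed-7, aq-mode-3 encode of that clip must average more than 35.8 dB
+// PSNR for EndtoEndPSNRTest to pass.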
+class RTEndToEndTest + : public ::libaom_test::CodecTestWith5Params, + public ::libaom_test::EncoderTest { + protected: + RTEndToEndTest() + : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), + cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0), + aq_mode_(GET_PARAM(3)), threads_(GET_PARAM(4)), + tile_columns_(GET_PARAM(5)) {} + + virtual ~RTEndToEndTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(::libaom_test::kRealTime); + + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_threads = threads_; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AV1E_SET_TILE_COLUMNS, tile_columns_); + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT); + encoder->Control(AV1E_SET_AQ_MODE, aq_mode_); + encoder->Control(AV1E_SET_ROW_MT, 1); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrThreshold() { + return kPsnrThreshold[test_video_param_.filename][cpu_used_][aq_mode_]; + } + + void DoTest() { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = AOM_CODEC_USE_PSNR; + if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr video; + video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + ASSERT_TRUE(video.get() != NULL); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, GetPsnrThreshold()) + << "cpu used = " << cpu_used_ << " aq mode = " << aq_mode_; + } + + TestVideoParam test_video_param_; + int cpu_used_; + + private: + double psnr_; + unsigned int nframes_; + unsigned int aq_mode_; + int threads_; + int tile_columns_; +}; + +class RTEndToEndTestThreaded : public RTEndToEndTest {}; + +TEST_P(RTEndToEndTest, EndtoEndPSNRTest) { DoTest(); } + +TEST_P(RTEndToEndTestThreaded, EndtoEndPSNRTest) { DoTest(); } + +AV1_INSTANTIATE_TEST_CASE(RTEndToEndTest, ::testing::ValuesIn(kTestVectors), + ::testing::Range(5, 9), + ::testing::Values(0, 3), + ::testing::Values(1), ::testing::Values(1)); + +AV1_INSTANTIATE_TEST_CASE(RTEndToEndTestThreaded, + ::testing::ValuesIn(kTestVectors), + ::testing::Range(5, 9), + ::testing::Values(0, 3), + ::testing::Range(2, 5), ::testing::Range(2, 5)); +} // namespace diff --git a/libs/libaom/src/test/run_encodes.sh b/libs/libaom/src/test/run_encodes.sh new file mode 100644 index 000000000..2096d8b15 --- /dev/null +++ b/libs/libaom/src/test/run_encodes.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved. +# +# This source code is subject to the terms of the BSD 2 Clause License and +# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +# was not distributed with this source code in the LICENSE file, you can +# obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+# Author: jimbankoski@google.com (Jim Bankoski)
+
+if [[ $# -ne 4 ]]; then
+  echo Encodes all the y4m files in the directory at the bitrates specified by
+  echo the first 3 parameters and stores the results in a subdirectory named by
+  echo the 4th parameter:
+  echo
+  echo Usage: run_encodes.sh start-kbps end-kbps step-kbps output-directory
+  echo Example: run_encodes.sh 200 500 50 baseline
+  exit
+fi
+
+s=$1
+e=$2
+step=$3
+newdir=$4
+
+for i in ./*y4m; do
+  for (( b=$s; b<= $e; b+= $step ))
+  do
+    best_encode.sh $i $b
+  done
+  mv opsnr.stt $i.stt
+done
+
+mkdir $newdir
+mv *.stt $newdir
+mv *.webm $newdir
diff --git a/libs/libaom/src/test/sad_test.cc b/libs/libaom/src/test/sad_test.cc
new file mode 100644
index 000000000..0bdbf3745
--- /dev/null
+++ b/libs/libaom/src/test/sad_test.cc
@@ -0,0 +1,1981 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include <limits.h>
+#include <stdio.h>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "aom/aom_codec.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride);
+typedef std::tuple<int, int, SadMxNFunc, int> SadMxNParam;
+
+typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
+                                  const uint8_t *second_pred);
+typedef std::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
+
+typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
+                                   int width, int height, const uint8_t *ref,
+                                   int ref_stride,
+                                   const DIST_WTD_COMP_PARAMS *jcp_param);
+typedef std::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam;
+
+typedef unsigned int (*DistWtdSadMxhFunc)(const uint8_t *src_ptr,
+                                          int src_stride,
+                                          const uint8_t *ref_ptr,
+                                          int ref_stride, int width,
+                                          int height);
+typedef std::tuple<int, int, DistWtdSadMxhFunc, int> DistWtdSadMxhParam;
+
+typedef uint32_t (*DistWtdSadMxNAvgFunc)(const uint8_t *src_ptr,
+                                         int src_stride,
+                                         const uint8_t *ref_ptr,
+                                         int ref_stride,
+                                         const uint8_t *second_pred,
+                                         const DIST_WTD_COMP_PARAMS *jcp_param);
+typedef std::tuple<int, int, DistWtdSadMxNAvgFunc, int> DistWtdSadMxNAvgParam;
+
+typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride,
+                             const uint8_t *const ref_ptr[], int ref_stride,
+                             uint32_t *sad_array);
+typedef std::tuple<int, int, SadMxNx4Func, int> SadMxNx4Param;
+
+typedef void (*SadMxNx4AvgFunc)(const uint8_t *src_ptr, int src_stride,
+                                const uint8_t *const ref_ptr[], int ref_stride,
+                                const uint8_t *second_pred,
+                                uint32_t *sad_array);
+typedef std::tuple<int, int, SadMxNx4AvgFunc, int> SadMxNx4AvgParam;
+
+using libaom_test::ACMRandom;
+
+namespace {
+class SADTestBase : public ::testing::Test {
+ public:
+  SADTestBase(int width, int height, int bit_depth)
+      : width_(width), height_(height), bd_(bit_depth) {}
+
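+  // All SAD test classes share one set of statically allocated buffers:
+  // kDataBufferSize bytes of reference data (room for the four candidate
+  // blocks the x4 variants compare at once) plus 128x128 scratch blocks
+  // for the second_pred/comp_pred inputs, each aligned to kDataAlignment
+  // so the SIMD implementations can use aligned loads.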
+  static void SetUpTestCase() {
+    source_data8_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize));
+    reference_data8_ = reinterpret_cast<uint8_t *>(
+        aom_memalign(kDataAlignment, kDataBufferSize));
+    second_pred8_ =
+        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    comp_pred8_ =
+        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    comp_pred8_test_ =
+        reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
+    source_data16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, kDataBlockSize * sizeof(uint16_t)));
+    reference_data16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t)));
+    second_pred16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+    comp_pred16_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+    comp_pred16_test_ = reinterpret_cast<uint16_t *>(
+        aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
+  }
+
+  static void TearDownTestCase() {
+    aom_free(source_data8_);
+    source_data8_ = NULL;
+    aom_free(reference_data8_);
+    reference_data8_ = NULL;
+    aom_free(second_pred8_);
+    second_pred8_ = NULL;
+    aom_free(comp_pred8_);
+    comp_pred8_ = NULL;
+    aom_free(comp_pred8_test_);
+    comp_pred8_test_ = NULL;
+    aom_free(source_data16_);
+    source_data16_ = NULL;
+    aom_free(reference_data16_);
+    reference_data16_ = NULL;
+    aom_free(second_pred16_);
+    second_pred16_ = NULL;
+    aom_free(comp_pred16_);
+    comp_pred16_ = NULL;
+    aom_free(comp_pred16_test_);
+    comp_pred16_test_ = NULL;
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  // Handle up to 4 128x128 blocks, with stride up to 256
+  static const int kDataAlignment = 16;
+  static const int kDataBlockSize = 128 * 256;
+  static const int kDataBufferSize = 4 * kDataBlockSize;
+
+  virtual void SetUp() {
+    if (bd_ == -1) {
+      use_high_bit_depth_ = false;
+      bit_depth_ = AOM_BITS_8;
+      source_data_ = source_data8_;
+      reference_data_ = reference_data8_;
+      second_pred_ = second_pred8_;
+      comp_pred_ = comp_pred8_;
+      comp_pred_test_ = comp_pred8_test_;
+    } else {
+      use_high_bit_depth_ = true;
+      bit_depth_ = static_cast<aom_bit_depth_t>(bd_);
+      source_data_ = CONVERT_TO_BYTEPTR(source_data16_);
+      reference_data_ = CONVERT_TO_BYTEPTR(reference_data16_);
+      second_pred_ = CONVERT_TO_BYTEPTR(second_pred16_);
+      comp_pred_ = CONVERT_TO_BYTEPTR(comp_pred16_);
+      comp_pred_test_ = CONVERT_TO_BYTEPTR(comp_pred16_test_);
+    }
+    mask_ = (1 << bit_depth_) - 1;
+    source_stride_ = (width_ + 31) & ~31;
+    reference_stride_ = width_ * 2;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  virtual uint8_t *GetReference(int block_idx) {
+    if (use_high_bit_depth_)
+      return CONVERT_TO_BYTEPTR(CONVERT_TO_SHORTPTR(reference_data_) +
+                                block_idx * kDataBlockSize);
+    return reference_data_ + block_idx * kDataBlockSize;
+  }
+
+  // Sum of Absolute Differences. Given two blocks, calculate the absolute
+  // difference between two pixels in the same relative location; accumulate.
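+  // That is, for a width_ x height_ block:
+  //   sad = sum over 0 <= h < height_, 0 <= w < width_ of
+  //         |source[h * source_stride_ + w] -
+  //          reference[h * reference_stride_ + w]|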
+ unsigned int ReferenceSAD(int block_idx) { + unsigned int sad = 0; + const uint8_t *const reference8 = GetReference(block_idx); + const uint8_t *const source8 = source_data_; + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReference(block_idx)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + sad += abs(source8[h * source_stride_ + w] - + reference8[h * reference_stride_ + w]); + } else { + sad += abs(source16[h * source_stride_ + w] - + reference16[h * reference_stride_ + w]); + } + } + } + return sad; + } + + // Sum of Absolute Differences Average. Given two blocks, and a prediction + // calculate the absolute difference between one pixel and average of the + // corresponding and predicted pixels; accumulate. + unsigned int ReferenceSADavg(int block_idx) { + unsigned int sad = 0; + const uint8_t *const reference8 = GetReference(block_idx); + const uint8_t *const source8 = source_data_; + const uint8_t *const second_pred8 = second_pred_; + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReference(block_idx)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); + const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + const int tmp = second_pred8[h * width_ + w] + + reference8[h * reference_stride_ + w]; + const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1); + sad += abs(source8[h * source_stride_ + w] - comp_pred); + } else { + const int tmp = second_pred16[h * width_ + w] + + reference16[h * reference_stride_ + w]; + const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 1); + sad += abs(source16[h * source_stride_ + w] - comp_pred); + } + } + } + return sad; + } + + void ReferenceDistWtdCompAvg(int block_idx) { + const uint8_t *const reference8 = GetReference(block_idx); + const uint8_t *const second_pred8 = second_pred_; + uint8_t *const comp_pred8 = comp_pred_; + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReference(block_idx)); + const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_); + uint16_t *const comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred_); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + const int tmp = + second_pred8[h * width_ + w] * jcp_param_.bck_offset + + reference8[h * reference_stride_ + w] * jcp_param_.fwd_offset; + comp_pred8[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4); + } else { + const int tmp = + second_pred16[h * width_ + w] * jcp_param_.bck_offset + + reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset; + comp_pred16[h * width_ + w] = ROUND_POWER_OF_TWO(tmp, 4); + } + } + } + } + + unsigned int ReferenceDistWtdSADavg(int block_idx) { + unsigned int sad = 0; + const uint8_t *const reference8 = GetReference(block_idx); + const uint8_t *const source8 = source_data_; + const uint8_t *const second_pred8 = second_pred_; + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReference(block_idx)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); + const uint16_t *const second_pred16 = CONVERT_TO_SHORTPTR(second_pred_); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + const int tmp = + second_pred8[h * width_ + w] * jcp_param_.bck_offset + + reference8[h * reference_stride_ + w] * 
jcp_param_.fwd_offset; + const uint8_t comp_pred = ROUND_POWER_OF_TWO(tmp, 4); + sad += abs(source8[h * source_stride_ + w] - comp_pred); + } else { + const int tmp = + second_pred16[h * width_ + w] * jcp_param_.bck_offset + + reference16[h * reference_stride_ + w] * jcp_param_.fwd_offset; + const uint16_t comp_pred = ROUND_POWER_OF_TWO(tmp, 4); + sad += abs(source16[h * source_stride_ + w] - comp_pred); + } + } + } + return sad; + } + + void FillConstant(uint8_t *data, int stride, uint16_t fill_constant) { + uint8_t *data8 = data; + uint16_t *data16 = CONVERT_TO_SHORTPTR(data); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + data8[h * stride + w] = static_cast(fill_constant); + } else { + data16[h * stride + w] = fill_constant; + } + } + } + } + + void FillRandom(uint8_t *data, int stride) { + uint8_t *data8 = data; + uint16_t *data16 = CONVERT_TO_SHORTPTR(data); + for (int h = 0; h < height_; ++h) { + for (int w = 0; w < width_; ++w) { + if (!use_high_bit_depth_) { + data8[h * stride + w] = rnd_.Rand8(); + } else { + data16[h * stride + w] = rnd_.Rand16() & mask_; + } + } + } + } + + int width_, height_, mask_, bd_; + aom_bit_depth_t bit_depth_; + static uint8_t *source_data_; + static uint8_t *reference_data_; + static uint8_t *second_pred_; + int source_stride_; + bool use_high_bit_depth_; + static uint8_t *source_data8_; + static uint8_t *reference_data8_; + static uint8_t *second_pred8_; + static uint16_t *source_data16_; + static uint16_t *reference_data16_; + static uint16_t *second_pred16_; + int reference_stride_; + static uint8_t *comp_pred_; + static uint8_t *comp_pred8_; + static uint16_t *comp_pred16_; + static uint8_t *comp_pred_test_; + static uint8_t *comp_pred8_test_; + static uint16_t *comp_pred16_test_; + DIST_WTD_COMP_PARAMS jcp_param_; + + ACMRandom rnd_; +}; + +class SADx4Test : public ::testing::WithParamInterface, + public SADTestBase { + public: + SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + + protected: + void SADs(unsigned int *results) { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(GET_PARAM(2)( + source_data_, source_stride_, references, reference_stride_, results)); + } + + void CheckSADs() { + unsigned int reference_sad, exp_sad[4]; + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSAD(block); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } +}; + +class SADx4AvgTest : public ::testing::WithParamInterface, + public SADTestBase { + public: + SADx4AvgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + + protected: + void SADs(unsigned int *results) { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_, + references, reference_stride_, + second_pred_, results)); + } + + void CheckSADs() { + unsigned int reference_sad, exp_sad[4]; + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSADavg(block); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } + + void SpeedSAD() { + int test_count = 200000; + unsigned int exp_sad[4]; + while (test_count > 0) { + SADs(exp_sad); + test_count -= 1; + } + } +}; + +class SADTest : public ::testing::WithParamInterface, + public SADTestBase { + public: + SADTest() : SADTestBase(GET_PARAM(0), 
GET_PARAM(1), GET_PARAM(3)) {} + + protected: + unsigned int SAD(int block_idx) { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_, + reference, reference_stride_)); + return ret; + } + + void CheckSAD() { + const unsigned int reference_sad = ReferenceSAD(0); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void SpeedSAD() { + int test_count = 20000000; + while (test_count > 0) { + SAD(0); + test_count -= 1; + } + } +}; + +class SADavgTest : public ::testing::WithParamInterface, + public SADTestBase { + public: + SADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + + protected: + unsigned int SAD_avg(int block_idx) { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_, + reference, reference_stride_, + second_pred_)); + return ret; + } + + void CheckSAD() { + const unsigned int reference_sad = ReferenceSADavg(0); + const unsigned int exp_sad = SAD_avg(0); + + ASSERT_EQ(reference_sad, exp_sad); + } +}; + +class DistWtdCompAvgTest + : public ::testing::WithParamInterface, + public SADTestBase { + public: + DistWtdCompAvgTest() + : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + + protected: + void dist_wtd_comp_avg(int block_idx) { + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_, + height_, reference, reference_stride_, + &jcp_param_)); + } + + void CheckCompAvg() { + for (int j = 0; j < 2; ++j) { + for (int i = 0; i < 4; ++i) { + jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0]; + jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1]; + + ReferenceDistWtdCompAvg(0); + dist_wtd_comp_avg(0); + + for (int y = 0; y < height_; ++y) + for (int x = 0; x < width_; ++x) + ASSERT_EQ(comp_pred_[y * width_ + x], + comp_pred_test_[y * width_ + x]); + } + } + } +}; + +class DistWtdSADTest : public ::testing::WithParamInterface, + public SADTestBase { + public: + DistWtdSADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + + protected: + unsigned int SAD(int block_idx) { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_, + reference, reference_stride_, + GET_PARAM(0), GET_PARAM(1))); + return ret; + } + + void CheckSAD() { + const unsigned int reference_sad = ReferenceSAD(0); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void SpeedSAD() { + int test_count = 20000000; + while (test_count > 0) { + SAD(0); + test_count -= 1; + } + } +}; + +class DistWtdSADavgTest + : public ::testing::WithParamInterface, + public SADTestBase { + public: + DistWtdSADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + + protected: + unsigned int dist_wtd_SAD_avg(int block_idx) { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_, + reference, reference_stride_, + second_pred_, &jcp_param_)); + return ret; + } + + void CheckSAD() { + for (int j = 0; j < 2; ++j) { + for (int i = 0; i < 4; ++i) { + jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0]; + jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1]; + + const unsigned int reference_sad = 
ReferenceDistWtdSADavg(0); + const unsigned int exp_sad = dist_wtd_SAD_avg(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + } + } +}; + +uint8_t *SADTestBase::source_data_ = NULL; +uint8_t *SADTestBase::reference_data_ = NULL; +uint8_t *SADTestBase::second_pred_ = NULL; +uint8_t *SADTestBase::comp_pred_ = NULL; +uint8_t *SADTestBase::comp_pred_test_ = NULL; +uint8_t *SADTestBase::source_data8_ = NULL; +uint8_t *SADTestBase::reference_data8_ = NULL; +uint8_t *SADTestBase::second_pred8_ = NULL; +uint8_t *SADTestBase::comp_pred8_ = NULL; +uint8_t *SADTestBase::comp_pred8_test_ = NULL; +uint16_t *SADTestBase::source_data16_ = NULL; +uint16_t *SADTestBase::reference_data16_ = NULL; +uint16_t *SADTestBase::second_pred16_ = NULL; +uint16_t *SADTestBase::comp_pred16_ = NULL; +uint16_t *SADTestBase::comp_pred16_test_ = NULL; + +TEST_P(SADTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(SADTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(SADTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + int test_count = 2000; + while (test_count > 0) { + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + test_count -= 1; + } + source_stride_ = tmp_stride; +} + +#define SPEED_TEST (0) +#if SPEED_TEST +TEST_P(SADTest, Speed) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + SpeedSAD(); + source_stride_ = tmp_stride; +} +#endif + +TEST_P(SADavgTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + FillConstant(second_pred_, width_, 0); + CheckSAD(); +} +TEST_P(SADavgTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + FillConstant(second_pred_, width_, 0); + CheckSAD(); +} + +TEST_P(SADavgTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADavgTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
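+  // Shrinking the stride by one makes successive reference rows start at
+  // alternating even/odd addresses, exercising the unaligned-load paths in
+  // the SIMD implementations.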
+ const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADavgTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + int test_count = 2000; + while (test_count > 0) { + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + test_count -= 1; + } + source_stride_ = tmp_stride; +} + +TEST_P(DistWtdCompAvgTest, MaxRef) { + FillConstant(reference_data_, reference_stride_, mask_); + FillConstant(second_pred_, width_, 0); + CheckCompAvg(); +} + +TEST_P(DistWtdCompAvgTest, MaxSecondPred) { + FillConstant(reference_data_, reference_stride_, 0); + FillConstant(second_pred_, width_, mask_); + CheckCompAvg(); +} + +TEST_P(DistWtdCompAvgTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckCompAvg(); + reference_stride_ = tmp_stride; +} + +TEST_P(DistWtdCompAvgTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckCompAvg(); + reference_stride_ = tmp_stride; +} + +TEST_P(DistWtdSADTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(DistWtdSADTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(DistWtdSADTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(DistWtdSADTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
+ const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(DistWtdSADTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + int test_count = 2000; + while (test_count > 0) { + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + test_count -= 1; + } + source_stride_ = tmp_stride; +} + +TEST_P(DistWtdSADavgTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + FillConstant(second_pred_, width_, 0); + CheckSAD(); +} +TEST_P(DistWtdSADavgTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + FillConstant(second_pred_, width_, 0); + CheckSAD(); +} + +TEST_P(DistWtdSADavgTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(DistWtdSADavgTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(DistWtdSADavgTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + int test_count = 2000; + while (test_count > 0) { + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + FillRandom(second_pred_, width_); + CheckSAD(); + test_count -= 1; + } + source_stride_ = tmp_stride; +} + +TEST_P(SADx4Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + CheckSADs(); +} + +TEST_P(SADx4Test, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + CheckSADs(); +} + +TEST_P(SADx4Test, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADx4Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
+ int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADx4Test, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + int test_count = 1000; + while (test_count > 0) { + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + test_count -= 1; + } + source_stride_ = tmp_stride; +} + +TEST_P(SADx4Test, SrcAlignedByWidth) { + uint8_t *tmp_source_data = source_data_; + source_data_ += width_; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_data_ = tmp_source_data; +} + +using std::make_tuple; + +#if SPEED_TEST +TEST_P(SADx4AvgTest, Speed) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + FillRandom(second_pred_, width_); + SpeedSAD(); + reference_stride_ = tmp_stride; +} +#endif + +TEST_P(SADx4AvgTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + FillConstant(second_pred_, width_, 0); + CheckSADs(); +} + +TEST_P(SADx4AvgTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + FillConstant(second_pred_, width_, 0); + CheckSADs(); +} + +TEST_P(SADx4AvgTest, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + FillRandom(second_pred_, width_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADx4AvgTest, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
+ int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + FillRandom(second_pred_, width_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +//------------------------------------------------------------------------------ +// C functions +const SadMxNParam c_tests[] = { + make_tuple(128, 128, &aom_sad128x128_c, -1), + make_tuple(128, 64, &aom_sad128x64_c, -1), + make_tuple(64, 128, &aom_sad64x128_c, -1), + make_tuple(64, 64, &aom_sad64x64_c, -1), + make_tuple(64, 32, &aom_sad64x32_c, -1), + make_tuple(32, 64, &aom_sad32x64_c, -1), + make_tuple(32, 32, &aom_sad32x32_c, -1), + make_tuple(32, 16, &aom_sad32x16_c, -1), + make_tuple(16, 32, &aom_sad16x32_c, -1), + make_tuple(16, 16, &aom_sad16x16_c, -1), + make_tuple(16, 8, &aom_sad16x8_c, -1), + make_tuple(8, 16, &aom_sad8x16_c, -1), + make_tuple(8, 8, &aom_sad8x8_c, -1), + make_tuple(8, 4, &aom_sad8x4_c, -1), + make_tuple(4, 8, &aom_sad4x8_c, -1), + make_tuple(4, 4, &aom_sad4x4_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(128, 128, &aom_highbd_sad128x128_c, 8), + make_tuple(128, 64, &aom_highbd_sad128x64_c, 8), + make_tuple(64, 128, &aom_highbd_sad64x128_c, 8), + make_tuple(64, 64, &aom_highbd_sad64x64_c, 8), + make_tuple(64, 32, &aom_highbd_sad64x32_c, 8), + make_tuple(32, 64, &aom_highbd_sad32x64_c, 8), + make_tuple(32, 32, &aom_highbd_sad32x32_c, 8), + make_tuple(32, 16, &aom_highbd_sad32x16_c, 8), + make_tuple(16, 32, &aom_highbd_sad16x32_c, 8), + make_tuple(16, 16, &aom_highbd_sad16x16_c, 8), + make_tuple(16, 8, &aom_highbd_sad16x8_c, 8), + make_tuple(8, 16, &aom_highbd_sad8x16_c, 8), + make_tuple(8, 8, &aom_highbd_sad8x8_c, 8), + make_tuple(8, 4, &aom_highbd_sad8x4_c, 8), + make_tuple(4, 8, &aom_highbd_sad4x8_c, 8), + make_tuple(4, 4, &aom_highbd_sad4x4_c, 8), + make_tuple(128, 128, &aom_highbd_sad128x128_c, 10), + make_tuple(128, 64, &aom_highbd_sad128x64_c, 10), + make_tuple(64, 128, &aom_highbd_sad64x128_c, 10), + make_tuple(64, 64, &aom_highbd_sad64x64_c, 10), + make_tuple(64, 32, &aom_highbd_sad64x32_c, 10), + make_tuple(32, 64, &aom_highbd_sad32x64_c, 10), + make_tuple(32, 32, &aom_highbd_sad32x32_c, 10), + make_tuple(32, 16, &aom_highbd_sad32x16_c, 10), + make_tuple(16, 32, &aom_highbd_sad16x32_c, 10), + make_tuple(16, 16, &aom_highbd_sad16x16_c, 10), + make_tuple(16, 8, &aom_highbd_sad16x8_c, 10), + make_tuple(8, 16, &aom_highbd_sad8x16_c, 10), + make_tuple(8, 8, &aom_highbd_sad8x8_c, 10), + make_tuple(8, 4, &aom_highbd_sad8x4_c, 10), + make_tuple(4, 8, &aom_highbd_sad4x8_c, 10), + make_tuple(4, 4, &aom_highbd_sad4x4_c, 10), + make_tuple(128, 128, &aom_highbd_sad128x128_c, 12), + make_tuple(128, 64, &aom_highbd_sad128x64_c, 12), + make_tuple(64, 128, &aom_highbd_sad64x128_c, 12), + make_tuple(64, 64, &aom_highbd_sad64x64_c, 12), + make_tuple(64, 32, &aom_highbd_sad64x32_c, 12), + make_tuple(32, 64, &aom_highbd_sad32x64_c, 12), + make_tuple(32, 32, &aom_highbd_sad32x32_c, 12), + make_tuple(32, 16, &aom_highbd_sad32x16_c, 12), + make_tuple(16, 32, &aom_highbd_sad16x32_c, 12), + make_tuple(16, 16, &aom_highbd_sad16x16_c, 12), + make_tuple(16, 8, &aom_highbd_sad16x8_c, 12), + make_tuple(8, 16, &aom_highbd_sad8x16_c, 12), + make_tuple(8, 8, &aom_highbd_sad8x8_c, 12), + make_tuple(8, 4, &aom_highbd_sad8x4_c, 12), + make_tuple(4, 8, &aom_highbd_sad4x8_c, 12), + make_tuple(4, 4, 
&aom_highbd_sad4x4_c, 12), +#endif // CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_sad64x16_c, -1), + make_tuple(16, 64, &aom_sad16x64_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_highbd_sad64x16_c, 8), + make_tuple(16, 64, &aom_highbd_sad16x64_c, 8), + make_tuple(64, 16, &aom_highbd_sad64x16_c, 10), + make_tuple(16, 64, &aom_highbd_sad16x64_c, 10), + make_tuple(64, 16, &aom_highbd_sad64x16_c, 12), + make_tuple(16, 64, &aom_highbd_sad16x64_c, 12), +#endif + make_tuple(32, 8, &aom_sad32x8_c, -1), + make_tuple(8, 32, &aom_sad8x32_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(32, 8, &aom_highbd_sad32x8_c, 8), + make_tuple(8, 32, &aom_highbd_sad8x32_c, 8), + make_tuple(32, 8, &aom_highbd_sad32x8_c, 10), + make_tuple(8, 32, &aom_highbd_sad8x32_c, 10), + make_tuple(32, 8, &aom_highbd_sad32x8_c, 12), + make_tuple(8, 32, &aom_highbd_sad8x32_c, 12), +#endif + make_tuple(16, 4, &aom_sad16x4_c, -1), + make_tuple(4, 16, &aom_sad4x16_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(16, 4, &aom_highbd_sad16x4_c, 8), + make_tuple(4, 16, &aom_highbd_sad4x16_c, 8), + make_tuple(16, 4, &aom_highbd_sad16x4_c, 10), + make_tuple(4, 16, &aom_highbd_sad4x16_c, 10), + make_tuple(16, 4, &aom_highbd_sad16x4_c, 12), + make_tuple(4, 16, &aom_highbd_sad4x16_c, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests)); + +const SadMxNAvgParam avg_c_tests[] = { + make_tuple(128, 128, &aom_sad128x128_avg_c, -1), + make_tuple(128, 64, &aom_sad128x64_avg_c, -1), + make_tuple(64, 128, &aom_sad64x128_avg_c, -1), + make_tuple(64, 64, &aom_sad64x64_avg_c, -1), + make_tuple(64, 32, &aom_sad64x32_avg_c, -1), + make_tuple(32, 64, &aom_sad32x64_avg_c, -1), + make_tuple(32, 32, &aom_sad32x32_avg_c, -1), + make_tuple(32, 16, &aom_sad32x16_avg_c, -1), + make_tuple(16, 32, &aom_sad16x32_avg_c, -1), + make_tuple(16, 16, &aom_sad16x16_avg_c, -1), + make_tuple(16, 8, &aom_sad16x8_avg_c, -1), + make_tuple(8, 16, &aom_sad8x16_avg_c, -1), + make_tuple(8, 8, &aom_sad8x8_avg_c, -1), + make_tuple(8, 4, &aom_sad8x4_avg_c, -1), + make_tuple(4, 8, &aom_sad4x8_avg_c, -1), + make_tuple(4, 4, &aom_sad4x4_avg_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 8), + make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 8), + make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 8), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 8), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 8), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 8), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_c, 8), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_c, 8), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_c, 8), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_c, 8), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_c, 8), + make_tuple(8, 16, &aom_highbd_sad8x16_avg_c, 8), + make_tuple(8, 8, &aom_highbd_sad8x8_avg_c, 8), + make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 8), + make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 8), + make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 8), + make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 10), + make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 10), + make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 10), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 10), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 10), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 10), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_c, 10), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_c, 10), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_c, 10), + make_tuple(16, 16, 
&aom_highbd_sad16x16_avg_c, 10), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_c, 10), + make_tuple(8, 16, &aom_highbd_sad8x16_avg_c, 10), + make_tuple(8, 8, &aom_highbd_sad8x8_avg_c, 10), + make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 10), + make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 10), + make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 10), + make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 12), + make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 12), + make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 12), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 12), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 12), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 12), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_c, 12), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_c, 12), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_c, 12), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_c, 12), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_c, 12), + make_tuple(8, 16, &aom_highbd_sad8x16_avg_c, 12), + make_tuple(8, 8, &aom_highbd_sad8x8_avg_c, 12), + make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 12), + make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 12), + make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 12), +#endif // CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_sad64x16_avg_c, -1), + make_tuple(16, 64, &aom_sad16x64_avg_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 8), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 8), + make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 10), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 10), + make_tuple(64, 16, &aom_highbd_sad64x16_avg_c, 12), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_c, 12), +#endif + make_tuple(32, 8, &aom_sad32x8_avg_c, -1), + make_tuple(8, 32, &aom_sad8x32_avg_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 8), + make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 8), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 10), + make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 10), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_c, 12), + make_tuple(8, 32, &aom_highbd_sad8x32_avg_c, 12), +#endif + make_tuple(16, 4, &aom_sad16x4_avg_c, -1), + make_tuple(4, 16, &aom_sad4x16_avg_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 8), + make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 8), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 10), + make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 10), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_c, 12), + make_tuple(4, 16, &aom_highbd_sad4x16_avg_c, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests)); + +// TODO(chengchen): add highbd tests +const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = { + make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(4, 8, 
&aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1), + + make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_c, -1), +}; + +INSTANTIATE_TEST_SUITE_P(C, DistWtdCompAvgTest, + ::testing::ValuesIn(dist_wtd_comp_avg_c_tests)); + +const DistWtdSadMxNAvgParam dist_wtd_avg_c_tests[] = { + make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_c, -1), + make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_c, -1), + make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_c, -1), + make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_c, -1), + make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_c, -1), + make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_c, -1), + make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_c, -1), + make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_c, -1), + make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_c, -1), + make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_c, -1), + make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_c, -1), + make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_c, -1), + make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_c, -1), + make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_c, -1), + make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_c, -1), + make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_c, -1), + + make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_c, -1), + make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_c, -1), + make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_c, -1), + make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_c, -1), + make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_c, -1), + make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_c, -1), +}; + +INSTANTIATE_TEST_SUITE_P(C, DistWtdSADavgTest, + ::testing::ValuesIn(dist_wtd_avg_c_tests)); + +const SadMxNx4Param x4d_c_tests[] = { + make_tuple(128, 128, &aom_sad128x128x4d_c, -1), + make_tuple(128, 64, &aom_sad128x64x4d_c, -1), + make_tuple(64, 128, &aom_sad64x128x4d_c, -1), + make_tuple(64, 64, &aom_sad64x64x4d_c, -1), + make_tuple(64, 32, &aom_sad64x32x4d_c, -1), + make_tuple(32, 64, &aom_sad32x64x4d_c, -1), + make_tuple(32, 32, &aom_sad32x32x4d_c, -1), + make_tuple(32, 16, &aom_sad32x16x4d_c, -1), + make_tuple(16, 32, &aom_sad16x32x4d_c, -1), + make_tuple(16, 16, &aom_sad16x16x4d_c, -1), + make_tuple(16, 8, &aom_sad16x8x4d_c, -1), + make_tuple(8, 16, &aom_sad8x16x4d_c, -1), + make_tuple(8, 8, &aom_sad8x8x4d_c, -1), + make_tuple(8, 4, &aom_sad8x4x4d_c, -1), + make_tuple(4, 8, &aom_sad4x8x4d_c, -1), + make_tuple(4, 4, &aom_sad4x4x4d_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 8), + make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 8), + make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 8), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 8), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 8), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 8), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_c, 8), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_c, 8), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_c, 8), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_c, 8), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_c, 8), + make_tuple(8, 16, &aom_highbd_sad8x16x4d_c, 8), + make_tuple(8, 8, &aom_highbd_sad8x8x4d_c, 8), + make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 8), + make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 8), + make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 8), + make_tuple(128, 128, 
&aom_highbd_sad128x128x4d_c, 10), + make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 10), + make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 10), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 10), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 10), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 10), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_c, 10), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_c, 10), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_c, 10), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_c, 10), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_c, 10), + make_tuple(8, 16, &aom_highbd_sad8x16x4d_c, 10), + make_tuple(8, 8, &aom_highbd_sad8x8x4d_c, 10), + make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 10), + make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 10), + make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 10), + make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 12), + make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 12), + make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 12), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 12), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 12), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 12), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_c, 12), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_c, 12), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_c, 12), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_c, 12), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_c, 12), + make_tuple(8, 16, &aom_highbd_sad8x16x4d_c, 12), + make_tuple(8, 8, &aom_highbd_sad8x8x4d_c, 12), + make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 12), + make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 12), + make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 12), +#endif + make_tuple(64, 16, &aom_sad64x16x4d_c, -1), + make_tuple(16, 64, &aom_sad16x64x4d_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 8), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 8), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 10), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 10), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_c, 12), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_c, 12), +#endif + make_tuple(32, 8, &aom_sad32x8x4d_c, -1), + make_tuple(8, 32, &aom_sad8x32x4d_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 8), + make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 8), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 10), + make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 10), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_c, 12), + make_tuple(8, 32, &aom_highbd_sad8x32x4d_c, 12), +#endif + make_tuple(16, 4, &aom_sad16x4x4d_c, -1), + make_tuple(4, 16, &aom_sad4x16x4d_c, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 8), + make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 8), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 10), + make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 10), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_c, 12), + make_tuple(4, 16, &aom_highbd_sad4x16x4d_c, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); + +const SadMxNx4AvgParam x4d_avg_c_tests[] = { + make_tuple(128, 128, &aom_sad128x128x4d_avg_c, -1), + make_tuple(128, 64, &aom_sad128x64x4d_avg_c, -1), + make_tuple(64, 128, &aom_sad64x128x4d_avg_c, -1), + make_tuple(64, 64, &aom_sad64x64x4d_avg_c, -1), + make_tuple(64, 32, &aom_sad64x32x4d_avg_c, -1), + make_tuple(32, 64, &aom_sad32x64x4d_avg_c, -1), + make_tuple(32, 32, &aom_sad32x32x4d_avg_c, -1), + make_tuple(32, 16, &aom_sad32x16x4d_avg_c, -1), + make_tuple(16, 32, &aom_sad16x32x4d_avg_c, -1), + 
make_tuple(16, 16, &aom_sad16x16x4d_avg_c, -1), + make_tuple(16, 8, &aom_sad16x8x4d_avg_c, -1), + make_tuple(8, 16, &aom_sad8x16x4d_avg_c, -1), + make_tuple(8, 8, &aom_sad8x8x4d_avg_c, -1), + make_tuple(8, 4, &aom_sad8x4x4d_avg_c, -1), + make_tuple(4, 8, &aom_sad4x8x4d_avg_c, -1), + make_tuple(4, 4, &aom_sad4x4x4d_avg_c, -1), + make_tuple(64, 16, &aom_sad64x16x4d_avg_c, -1), + make_tuple(16, 64, &aom_sad16x64x4d_avg_c, -1), + make_tuple(32, 8, &aom_sad32x8x4d_avg_c, -1), + make_tuple(8, 32, &aom_sad8x32x4d_avg_c, -1), + make_tuple(16, 4, &aom_sad16x4x4d_avg_c, -1), + make_tuple(4, 16, &aom_sad4x16x4d_avg_c, -1), +}; +INSTANTIATE_TEST_SUITE_P(C, SADx4AvgTest, ::testing::ValuesIn(x4d_avg_c_tests)); + +//------------------------------------------------------------------------------ +// ARM functions +#if HAVE_NEON +const SadMxNParam neon_tests[] = { + make_tuple(64, 64, &aom_sad64x64_neon, -1), + make_tuple(32, 32, &aom_sad32x32_neon, -1), + make_tuple(16, 16, &aom_sad16x16_neon, -1), + make_tuple(16, 8, &aom_sad16x8_neon, -1), + make_tuple(8, 16, &aom_sad8x16_neon, -1), + make_tuple(8, 8, &aom_sad8x8_neon, -1), + make_tuple(4, 4, &aom_sad4x4_neon, -1), +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); + +const SadMxNx4Param x4d_neon_tests[] = { + make_tuple(64, 64, &aom_sad64x64x4d_neon, -1), + make_tuple(32, 32, &aom_sad32x32x4d_neon, -1), + make_tuple(16, 16, &aom_sad16x16x4d_neon, -1), +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); +#endif // HAVE_NEON + +//------------------------------------------------------------------------------ +// x86 functions +#if HAVE_SSE2 +const SadMxNParam sse2_tests[] = { + make_tuple(128, 128, &aom_sad128x128_sse2, -1), + make_tuple(128, 64, &aom_sad128x64_sse2, -1), + make_tuple(64, 128, &aom_sad64x128_sse2, -1), + make_tuple(64, 64, &aom_sad64x64_sse2, -1), + make_tuple(64, 32, &aom_sad64x32_sse2, -1), + make_tuple(32, 64, &aom_sad32x64_sse2, -1), + make_tuple(32, 32, &aom_sad32x32_sse2, -1), + make_tuple(32, 16, &aom_sad32x16_sse2, -1), + make_tuple(16, 32, &aom_sad16x32_sse2, -1), + make_tuple(16, 16, &aom_sad16x16_sse2, -1), + make_tuple(16, 8, &aom_sad16x8_sse2, -1), + make_tuple(8, 16, &aom_sad8x16_sse2, -1), + make_tuple(8, 8, &aom_sad8x8_sse2, -1), + make_tuple(8, 4, &aom_sad8x4_sse2, -1), + make_tuple(4, 8, &aom_sad4x8_sse2, -1), + make_tuple(4, 4, &aom_sad4x4_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 8), + make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 8), + make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 8), + make_tuple(32, 32, &aom_highbd_sad32x32_sse2, 8), + make_tuple(32, 16, &aom_highbd_sad32x16_sse2, 8), + make_tuple(16, 32, &aom_highbd_sad16x32_sse2, 8), + make_tuple(16, 16, &aom_highbd_sad16x16_sse2, 8), + make_tuple(16, 8, &aom_highbd_sad16x8_sse2, 8), + make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 8), + make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 8), + make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 8), + make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 8), + make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 8), + make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 10), + make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 10), + make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 10), + make_tuple(32, 32, &aom_highbd_sad32x32_sse2, 10), + make_tuple(32, 16, &aom_highbd_sad32x16_sse2, 10), + make_tuple(16, 32, &aom_highbd_sad16x32_sse2, 10), + make_tuple(16, 16, &aom_highbd_sad16x16_sse2, 10), + make_tuple(16, 8, &aom_highbd_sad16x8_sse2, 10), + make_tuple(8, 16, 
&aom_highbd_sad8x16_sse2, 10), + make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 10), + make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 10), + make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 10), + make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 10), + make_tuple(64, 64, &aom_highbd_sad64x64_sse2, 12), + make_tuple(64, 32, &aom_highbd_sad64x32_sse2, 12), + make_tuple(32, 64, &aom_highbd_sad32x64_sse2, 12), + make_tuple(32, 32, &aom_highbd_sad32x32_sse2, 12), + make_tuple(32, 16, &aom_highbd_sad32x16_sse2, 12), + make_tuple(16, 32, &aom_highbd_sad16x32_sse2, 12), + make_tuple(16, 16, &aom_highbd_sad16x16_sse2, 12), + make_tuple(16, 8, &aom_highbd_sad16x8_sse2, 12), + make_tuple(8, 16, &aom_highbd_sad8x16_sse2, 12), + make_tuple(8, 8, &aom_highbd_sad8x8_sse2, 12), + make_tuple(8, 4, &aom_highbd_sad8x4_sse2, 12), + make_tuple(4, 8, &aom_highbd_sad4x8_sse2, 12), + make_tuple(4, 4, &aom_highbd_sad4x4_sse2, 12), +#endif + make_tuple(64, 16, &aom_sad64x16_sse2, -1), + make_tuple(16, 64, &aom_sad16x64_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 8), + make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 8), + make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 10), + make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 10), + make_tuple(64, 16, &aom_highbd_sad64x16_sse2, 12), + make_tuple(16, 64, &aom_highbd_sad16x64_sse2, 12), +#endif + make_tuple(32, 8, &aom_sad32x8_sse2, -1), + make_tuple(8, 32, &aom_sad8x32_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 8), + make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 8), + make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 10), + make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 10), + make_tuple(32, 8, &aom_highbd_sad32x8_sse2, 12), + make_tuple(8, 32, &aom_highbd_sad8x32_sse2, 12), +#endif + make_tuple(16, 4, &aom_sad16x4_sse2, -1), + make_tuple(4, 16, &aom_sad4x16_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 8), + make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 8), + make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 10), + make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 10), + make_tuple(16, 4, &aom_highbd_sad16x4_sse2, 12), + make_tuple(4, 16, &aom_highbd_sad4x16_sse2, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); + +const SadMxNAvgParam avg_sse2_tests[] = { + make_tuple(128, 128, &aom_sad128x128_avg_sse2, -1), + make_tuple(128, 64, &aom_sad128x64_avg_sse2, -1), + make_tuple(64, 128, &aom_sad64x128_avg_sse2, -1), + make_tuple(64, 64, &aom_sad64x64_avg_sse2, -1), + make_tuple(64, 32, &aom_sad64x32_avg_sse2, -1), + make_tuple(32, 64, &aom_sad32x64_avg_sse2, -1), + make_tuple(32, 32, &aom_sad32x32_avg_sse2, -1), + make_tuple(32, 16, &aom_sad32x16_avg_sse2, -1), + make_tuple(16, 32, &aom_sad16x32_avg_sse2, -1), + make_tuple(16, 16, &aom_sad16x16_avg_sse2, -1), + make_tuple(16, 8, &aom_sad16x8_avg_sse2, -1), + make_tuple(8, 16, &aom_sad8x16_avg_sse2, -1), + make_tuple(8, 8, &aom_sad8x8_avg_sse2, -1), + make_tuple(8, 4, &aom_sad8x4_avg_sse2, -1), + make_tuple(4, 8, &aom_sad4x8_avg_sse2, -1), + make_tuple(4, 4, &aom_sad4x4_avg_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 8), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 8), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 8), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_sse2, 8), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_sse2, 8), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_sse2, 8), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_sse2, 8), + 
make_tuple(16, 8, &aom_highbd_sad16x8_avg_sse2, 8), + make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 8), + make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 8), + make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 8), + make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 8), + make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 8), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 10), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 10), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 10), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_sse2, 10), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_sse2, 10), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_sse2, 10), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_sse2, 10), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_sse2, 10), + make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 10), + make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 10), + make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 10), + make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 10), + make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 10), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_sse2, 12), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_sse2, 12), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_sse2, 12), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_sse2, 12), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_sse2, 12), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_sse2, 12), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_sse2, 12), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_sse2, 12), + make_tuple(8, 16, &aom_highbd_sad8x16_avg_sse2, 12), + make_tuple(8, 8, &aom_highbd_sad8x8_avg_sse2, 12), + make_tuple(8, 4, &aom_highbd_sad8x4_avg_sse2, 12), + make_tuple(4, 8, &aom_highbd_sad4x8_avg_sse2, 12), + make_tuple(4, 4, &aom_highbd_sad4x4_avg_sse2, 12), +#endif + make_tuple(64, 16, &aom_sad64x16_avg_sse2, -1), + make_tuple(16, 64, &aom_sad16x64_avg_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 8), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 8), + make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 10), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 10), + make_tuple(64, 16, &aom_highbd_sad64x16_avg_sse2, 12), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_sse2, 12), +#endif + make_tuple(32, 8, &aom_sad32x8_avg_sse2, -1), + make_tuple(8, 32, &aom_sad8x32_avg_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 8), + make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 8), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 10), + make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 10), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_sse2, 12), + make_tuple(8, 32, &aom_highbd_sad8x32_avg_sse2, 12), +#endif + make_tuple(16, 4, &aom_sad16x4_avg_sse2, -1), + make_tuple(4, 16, &aom_sad4x16_avg_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 8), + make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 8), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 10), + make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 10), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_sse2, 12), + make_tuple(4, 16, &aom_highbd_sad4x16_avg_sse2, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests)); + +const SadMxNx4Param x4d_sse2_tests[] = { + make_tuple(128, 128, &aom_sad128x128x4d_sse2, -1), + make_tuple(128, 64, &aom_sad128x64x4d_sse2, -1), + make_tuple(64, 128, &aom_sad64x128x4d_sse2, -1), + make_tuple(64, 64, &aom_sad64x64x4d_sse2, -1), + make_tuple(64, 32, &aom_sad64x32x4d_sse2, -1), + 
make_tuple(32, 64, &aom_sad32x64x4d_sse2, -1), + make_tuple(32, 32, &aom_sad32x32x4d_sse2, -1), + make_tuple(32, 16, &aom_sad32x16x4d_sse2, -1), + make_tuple(16, 32, &aom_sad16x32x4d_sse2, -1), + make_tuple(16, 16, &aom_sad16x16x4d_sse2, -1), + make_tuple(16, 8, &aom_sad16x8x4d_sse2, -1), + make_tuple(8, 16, &aom_sad8x16x4d_sse2, -1), + make_tuple(8, 8, &aom_sad8x8x4d_sse2, -1), + make_tuple(8, 4, &aom_sad8x4x4d_sse2, -1), + make_tuple(4, 8, &aom_sad4x8x4d_sse2, -1), + make_tuple(4, 4, &aom_sad4x4x4d_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 8), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 8), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 8), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_sse2, 8), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_sse2, 8), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_sse2, 8), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_sse2, 8), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_sse2, 8), + make_tuple(8, 16, &aom_highbd_sad8x16x4d_sse2, 8), + make_tuple(8, 8, &aom_highbd_sad8x8x4d_sse2, 8), + make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 8), + make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 8), + make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 8), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 10), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 10), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 10), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_sse2, 10), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_sse2, 10), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_sse2, 10), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_sse2, 10), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_sse2, 10), + make_tuple(8, 16, &aom_highbd_sad8x16x4d_sse2, 10), + make_tuple(8, 8, &aom_highbd_sad8x8x4d_sse2, 10), + make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 10), + make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 10), + make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 10), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_sse2, 12), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_sse2, 12), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_sse2, 12), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_sse2, 12), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_sse2, 12), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_sse2, 12), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_sse2, 12), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_sse2, 12), + make_tuple(8, 16, &aom_highbd_sad8x16x4d_sse2, 12), + make_tuple(8, 8, &aom_highbd_sad8x8x4d_sse2, 12), + make_tuple(8, 4, &aom_highbd_sad8x4x4d_sse2, 12), + make_tuple(4, 8, &aom_highbd_sad4x8x4d_sse2, 12), + make_tuple(4, 4, &aom_highbd_sad4x4x4d_sse2, 12), +#endif + make_tuple(64, 16, &aom_sad64x16x4d_sse2, -1), + make_tuple(16, 64, &aom_sad16x64x4d_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 8), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 8), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 10), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 10), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_sse2, 12), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_sse2, 12), +#endif + make_tuple(32, 8, &aom_sad32x8x4d_sse2, -1), + make_tuple(8, 32, &aom_sad8x32x4d_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 8), + make_tuple(8, 32, &aom_highbd_sad8x32x4d_sse2, 8), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 10), + make_tuple(8, 32, &aom_highbd_sad8x32x4d_sse2, 10), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_sse2, 12), + make_tuple(8, 32, 
&aom_highbd_sad8x32x4d_sse2, 12), +#endif + make_tuple(16, 4, &aom_sad16x4x4d_sse2, -1), + make_tuple(4, 16, &aom_sad4x16x4d_sse2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 8), + make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 8), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 10), + make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 10), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_sse2, 12), + make_tuple(4, 16, &aom_highbd_sad4x16x4d_sse2, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); + +const SadMxNx4AvgParam x4d_avg_sse2_tests[] = { + make_tuple(128, 128, &aom_sad128x128x4d_avg_sse2, -1), + make_tuple(128, 64, &aom_sad128x64x4d_avg_sse2, -1), + make_tuple(64, 128, &aom_sad64x128x4d_avg_sse2, -1), + make_tuple(64, 64, &aom_sad64x64x4d_avg_sse2, -1), + make_tuple(64, 32, &aom_sad64x32x4d_avg_sse2, -1), + make_tuple(32, 64, &aom_sad32x64x4d_avg_sse2, -1), + make_tuple(32, 32, &aom_sad32x32x4d_avg_sse2, -1), + make_tuple(32, 16, &aom_sad32x16x4d_avg_sse2, -1), + make_tuple(16, 32, &aom_sad16x32x4d_avg_sse2, -1), + make_tuple(16, 16, &aom_sad16x16x4d_avg_sse2, -1), + make_tuple(16, 8, &aom_sad16x8x4d_avg_sse2, -1), + make_tuple(8, 16, &aom_sad8x16x4d_avg_sse2, -1), + make_tuple(8, 8, &aom_sad8x8x4d_avg_sse2, -1), + make_tuple(8, 4, &aom_sad8x4x4d_avg_sse2, -1), + make_tuple(4, 8, &aom_sad4x8x4d_avg_sse2, -1), + make_tuple(4, 4, &aom_sad4x4x4d_avg_sse2, -1), + make_tuple(64, 16, &aom_sad64x16x4d_avg_sse2, -1), + make_tuple(16, 64, &aom_sad16x64x4d_avg_sse2, -1), + make_tuple(32, 8, &aom_sad32x8x4d_avg_sse2, -1), + make_tuple(8, 32, &aom_sad8x32x4d_avg_sse2, -1), + make_tuple(16, 4, &aom_sad16x4x4d_avg_sse2, -1), + make_tuple(4, 16, &aom_sad4x16x4d_avg_sse2, -1), +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADx4AvgTest, + ::testing::ValuesIn(x4d_avg_sse2_tests)); +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +// Note: These are named sse2, but part of ssse3 file and only built and linked +// when ssse3 is enabled. +const DistWtdSadMxhParam dist_wtd_sad_sse2_tests[] = { + make_tuple(4, 4, &aom_sad4xh_sse2, -1), + make_tuple(4, 8, &aom_sad4xh_sse2, -1), + make_tuple(8, 4, &aom_sad8xh_sse2, -1), + make_tuple(8, 8, &aom_sad8xh_sse2, -1), + make_tuple(8, 16, &aom_sad8xh_sse2, -1), + make_tuple(16, 8, &aom_sad16xh_sse2, -1), + make_tuple(16, 16, &aom_sad16xh_sse2, -1), + make_tuple(16, 32, &aom_sad16xh_sse2, -1), + make_tuple(32, 16, &aom_sad32xh_sse2, -1), + make_tuple(32, 32, &aom_sad32xh_sse2, -1), + make_tuple(32, 64, &aom_sad32xh_sse2, -1), + make_tuple(64, 32, &aom_sad64xh_sse2, -1), + make_tuple(64, 64, &aom_sad64xh_sse2, -1), + make_tuple(128, 128, &aom_sad128xh_sse2, -1), + make_tuple(128, 64, &aom_sad128xh_sse2, -1), + make_tuple(64, 128, &aom_sad64xh_sse2, -1), + make_tuple(4, 16, &aom_sad4xh_sse2, -1), + make_tuple(16, 4, &aom_sad16xh_sse2, -1), + make_tuple(8, 32, &aom_sad8xh_sse2, -1), + make_tuple(32, 8, &aom_sad32xh_sse2, -1), + make_tuple(16, 64, &aom_sad16xh_sse2, -1), + make_tuple(64, 16, &aom_sad64xh_sse2, -1), + + make_tuple(16, 64, &aom_sad16xh_sse2, -1), + make_tuple(64, 16, &aom_sad64xh_sse2, -1), + make_tuple(8, 32, &aom_sad8xh_sse2, -1), + make_tuple(32, 8, &aom_sad32xh_sse2, -1), + make_tuple(4, 16, &aom_sad4xh_sse2, -1), + make_tuple(16, 4, &aom_sad16xh_sse2, -1), +}; +INSTANTIATE_TEST_SUITE_P(SSE2, DistWtdSADTest, + ::testing::ValuesIn(dist_wtd_sad_sse2_tests)); + +#endif // HAVE_SSSE3 + +#if HAVE_SSE3 +// Only functions are x3, which do not have tests. 
+#endif // HAVE_SSE3 + +#if HAVE_SSSE3 +const DistWtdCompAvgParam dist_wtd_comp_avg_ssse3_tests[] = { + make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + + make_tuple(64, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(32, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(8, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(4, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), +}; + +INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdCompAvgTest, + ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests)); + +const DistWtdSadMxNAvgParam dist_wtd_avg_ssse3_tests[] = { + make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_ssse3, -1), + make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_ssse3, -1), + make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_ssse3, -1), + make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_ssse3, -1), + make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_ssse3, -1), + make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_ssse3, -1), + make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_ssse3, -1), + make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_ssse3, -1), + make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_ssse3, -1), + make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_ssse3, -1), + make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_ssse3, -1), + make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_ssse3, -1), + make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_ssse3, -1), + make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_ssse3, -1), + make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_ssse3, -1), + make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_ssse3, -1), + + make_tuple(64, 16, &aom_dist_wtd_sad64x16_avg_ssse3, -1), + make_tuple(16, 64, &aom_dist_wtd_sad16x64_avg_ssse3, -1), + make_tuple(32, 8, &aom_dist_wtd_sad32x8_avg_ssse3, -1), + make_tuple(8, 32, &aom_dist_wtd_sad8x32_avg_ssse3, -1), + make_tuple(16, 4, &aom_dist_wtd_sad16x4_avg_ssse3, -1), + make_tuple(4, 16, &aom_dist_wtd_sad4x16_avg_ssse3, -1), +}; +INSTANTIATE_TEST_SUITE_P(SSSE3, DistWtdSADavgTest, + ::testing::ValuesIn(dist_wtd_avg_ssse3_tests)); +#endif // HAVE_SSSE3 + +#if HAVE_SSE4_1 +// Only functions are x8, which do not have tests. 
+#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +const SadMxNParam avx2_tests[] = { + make_tuple(64, 128, &aom_sad64x128_avx2, -1), + make_tuple(128, 64, &aom_sad128x64_avx2, -1), + make_tuple(128, 128, &aom_sad128x128_avx2, -1), + make_tuple(64, 64, &aom_sad64x64_avx2, -1), + make_tuple(64, 32, &aom_sad64x32_avx2, -1), + make_tuple(32, 64, &aom_sad32x64_avx2, -1), + make_tuple(32, 32, &aom_sad32x32_avx2, -1), + make_tuple(32, 16, &aom_sad32x16_avx2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 8), + make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 10), + make_tuple(128, 128, &aom_highbd_sad128x128_avx2, 12), + make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 8), + make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 10), + make_tuple(128, 64, &aom_highbd_sad128x64_avx2, 12), + make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 8), + make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 10), + make_tuple(64, 128, &aom_highbd_sad64x128_avx2, 12), + make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 8), + make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 10), + make_tuple(64, 64, &aom_highbd_sad64x64_avx2, 12), + make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 8), + make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 10), + make_tuple(64, 32, &aom_highbd_sad64x32_avx2, 12), + make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 8), + make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 10), + make_tuple(32, 64, &aom_highbd_sad32x64_avx2, 12), + make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 8), + make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 10), + make_tuple(32, 32, &aom_highbd_sad32x32_avx2, 12), + make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 8), + make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 10), + make_tuple(32, 16, &aom_highbd_sad32x16_avx2, 12), + make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 8), + make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 10), + make_tuple(16, 32, &aom_highbd_sad16x32_avx2, 12), + make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 8), + make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 10), + make_tuple(16, 16, &aom_highbd_sad16x16_avx2, 12), + make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 8), + make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 10), + make_tuple(16, 8, &aom_highbd_sad16x8_avx2, 12), + + make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 8), + make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 10), + make_tuple(64, 16, &aom_highbd_sad64x16_avx2, 12), + make_tuple(16, 64, &aom_highbd_sad16x64_avx2, 8), + make_tuple(16, 64, &aom_highbd_sad16x64_avx2, 10), + make_tuple(16, 64, &aom_highbd_sad16x64_avx2, 12), + make_tuple(32, 8, &aom_highbd_sad32x8_avx2, 8), + make_tuple(32, 8, &aom_highbd_sad32x8_avx2, 10), + make_tuple(32, 8, &aom_highbd_sad32x8_avx2, 12), + make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 8), + make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 10), + make_tuple(16, 4, &aom_highbd_sad16x4_avx2, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); + +const SadMxNAvgParam avg_avx2_tests[] = { + make_tuple(64, 128, &aom_sad64x128_avg_avx2, -1), + make_tuple(128, 64, &aom_sad128x64_avg_avx2, -1), + make_tuple(128, 128, &aom_sad128x128_avg_avx2, -1), + make_tuple(64, 64, &aom_sad64x64_avg_avx2, -1), + make_tuple(64, 32, &aom_sad64x32_avg_avx2, -1), + make_tuple(32, 64, &aom_sad32x64_avg_avx2, -1), + make_tuple(32, 32, &aom_sad32x32_avg_avx2, -1), + make_tuple(32, 16, &aom_sad32x16_avg_avx2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 8), + make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 10), + 
make_tuple(128, 128, &aom_highbd_sad128x128_avg_avx2, 12), + make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 8), + make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 10), + make_tuple(128, 64, &aom_highbd_sad128x64_avg_avx2, 12), + make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 8), + make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 10), + make_tuple(64, 128, &aom_highbd_sad64x128_avg_avx2, 12), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 8), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 10), + make_tuple(64, 64, &aom_highbd_sad64x64_avg_avx2, 12), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 8), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 10), + make_tuple(64, 32, &aom_highbd_sad64x32_avg_avx2, 12), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 8), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 10), + make_tuple(32, 64, &aom_highbd_sad32x64_avg_avx2, 12), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 8), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 10), + make_tuple(32, 32, &aom_highbd_sad32x32_avg_avx2, 12), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 8), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 10), + make_tuple(32, 16, &aom_highbd_sad32x16_avg_avx2, 12), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 8), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 10), + make_tuple(16, 32, &aom_highbd_sad16x32_avg_avx2, 12), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 8), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 10), + make_tuple(16, 16, &aom_highbd_sad16x16_avg_avx2, 12), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 8), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 10), + make_tuple(16, 8, &aom_highbd_sad16x8_avg_avx2, 12), + + make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 8), + make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 10), + make_tuple(64, 16, &aom_highbd_sad64x16_avg_avx2, 12), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_avx2, 8), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_avx2, 10), + make_tuple(16, 64, &aom_highbd_sad16x64_avg_avx2, 12), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_avx2, 8), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_avx2, 10), + make_tuple(32, 8, &aom_highbd_sad32x8_avg_avx2, 12), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 8), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 10), + make_tuple(16, 4, &aom_highbd_sad16x4_avg_avx2, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests)); + +const SadMxNx4Param x4d_avx2_tests[] = { + make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1), + make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1), + make_tuple(32, 16, &aom_sad32x16x4d_avx2, -1), + make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1), + make_tuple(64, 128, &aom_sad64x128x4d_avx2, -1), + make_tuple(64, 64, &aom_sad64x64x4d_avx2, -1), + make_tuple(64, 32, &aom_sad64x32x4d_avx2, -1), + make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1), + make_tuple(128, 128, &aom_sad128x128x4d_avx2, -1), + make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1), +#if CONFIG_AV1_HIGHBITDEPTH + make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 8), + make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 10), + make_tuple(128, 128, &aom_highbd_sad128x128x4d_avx2, 12), + make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 8), + make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 10), + make_tuple(128, 64, &aom_highbd_sad128x64x4d_avx2, 12), + make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 8), + make_tuple(64, 128, 
&aom_highbd_sad64x128x4d_avx2, 10), + make_tuple(64, 128, &aom_highbd_sad64x128x4d_avx2, 12), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 8), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 10), + make_tuple(64, 64, &aom_highbd_sad64x64x4d_avx2, 12), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 8), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 10), + make_tuple(64, 32, &aom_highbd_sad64x32x4d_avx2, 12), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 8), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 10), + make_tuple(32, 64, &aom_highbd_sad32x64x4d_avx2, 12), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 8), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 10), + make_tuple(32, 32, &aom_highbd_sad32x32x4d_avx2, 12), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 8), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 10), + make_tuple(32, 16, &aom_highbd_sad32x16x4d_avx2, 12), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 8), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 10), + make_tuple(16, 32, &aom_highbd_sad16x32x4d_avx2, 12), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 8), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 10), + make_tuple(16, 16, &aom_highbd_sad16x16x4d_avx2, 12), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 8), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 10), + make_tuple(16, 8, &aom_highbd_sad16x8x4d_avx2, 12), + + make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 8), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 10), + make_tuple(16, 64, &aom_highbd_sad16x64x4d_avx2, 12), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_avx2, 8), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_avx2, 10), + make_tuple(64, 16, &aom_highbd_sad64x16x4d_avx2, 12), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_avx2, 8), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_avx2, 10), + make_tuple(32, 8, &aom_highbd_sad32x8x4d_avx2, 12), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 8), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 10), + make_tuple(16, 4, &aom_highbd_sad16x4x4d_avx2, 12), +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); +#endif // HAVE_AVX2 + +//------------------------------------------------------------------------------ +// MIPS functions +#if HAVE_MSA +const SadMxNParam msa_tests[] = { + make_tuple(64, 64, &aom_sad64x64_msa, -1), + make_tuple(64, 32, &aom_sad64x32_msa, -1), + make_tuple(32, 64, &aom_sad32x64_msa, -1), + make_tuple(32, 32, &aom_sad32x32_msa, -1), + make_tuple(32, 16, &aom_sad32x16_msa, -1), + make_tuple(16, 32, &aom_sad16x32_msa, -1), + make_tuple(16, 16, &aom_sad16x16_msa, -1), + make_tuple(16, 8, &aom_sad16x8_msa, -1), + make_tuple(8, 16, &aom_sad8x16_msa, -1), + make_tuple(8, 8, &aom_sad8x8_msa, -1), + make_tuple(8, 4, &aom_sad8x4_msa, -1), + make_tuple(4, 8, &aom_sad4x8_msa, -1), + make_tuple(4, 4, &aom_sad4x4_msa, -1), +}; +INSTANTIATE_TEST_SUITE_P(MSA, SADTest, ::testing::ValuesIn(msa_tests)); + +const SadMxNAvgParam avg_msa_tests[] = { + make_tuple(64, 64, &aom_sad64x64_avg_msa, -1), + make_tuple(64, 32, &aom_sad64x32_avg_msa, -1), + make_tuple(32, 64, &aom_sad32x64_avg_msa, -1), + make_tuple(32, 32, &aom_sad32x32_avg_msa, -1), + make_tuple(32, 16, &aom_sad32x16_avg_msa, -1), + make_tuple(16, 32, &aom_sad16x32_avg_msa, -1), + make_tuple(16, 16, &aom_sad16x16_avg_msa, -1), + make_tuple(16, 8, &aom_sad16x8_avg_msa, -1), + make_tuple(8, 16, &aom_sad8x16_avg_msa, -1), + make_tuple(8, 8, &aom_sad8x8_avg_msa, -1), + make_tuple(8, 4, 
&aom_sad8x4_avg_msa, -1),
+  make_tuple(4, 8, &aom_sad4x8_avg_msa, -1),
+  make_tuple(4, 4, &aom_sad4x4_avg_msa, -1),
+};
+INSTANTIATE_TEST_SUITE_P(MSA, SADavgTest, ::testing::ValuesIn(avg_msa_tests));
+
+const SadMxNx4Param x4d_msa_tests[] = {
+  make_tuple(64, 64, &aom_sad64x64x4d_msa, -1),
+  make_tuple(64, 32, &aom_sad64x32x4d_msa, -1),
+  make_tuple(32, 64, &aom_sad32x64x4d_msa, -1),
+  make_tuple(32, 32, &aom_sad32x32x4d_msa, -1),
+  make_tuple(32, 16, &aom_sad32x16x4d_msa, -1),
+  make_tuple(16, 32, &aom_sad16x32x4d_msa, -1),
+  make_tuple(16, 16, &aom_sad16x16x4d_msa, -1),
+  make_tuple(16, 8, &aom_sad16x8x4d_msa, -1),
+  make_tuple(8, 16, &aom_sad8x16x4d_msa, -1),
+  make_tuple(8, 8, &aom_sad8x8x4d_msa, -1),
+  make_tuple(8, 4, &aom_sad8x4x4d_msa, -1),
+  make_tuple(4, 8, &aom_sad4x8x4d_msa, -1),
+  make_tuple(4, 4, &aom_sad4x4x4d_msa, -1),
+};
+INSTANTIATE_TEST_SUITE_P(MSA, SADx4Test, ::testing::ValuesIn(x4d_msa_tests));
+#endif  // HAVE_MSA
+
+}  // namespace
diff --git a/libs/libaom/src/test/sb_multipass_test.cc b/libs/libaom/src/test/sb_multipass_test.cc
new file mode 100644
index 000000000..0ca76ab85
--- /dev/null
+++ b/libs/libaom/src/test/sb_multipass_test.cc
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <initializer_list>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+class AV1SBMultipassTest
+    : public ::libaom_test::CodecTestWith2Params<int, bool>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  AV1SBMultipassTest()
+      : EncoderTest(GET_PARAM(0)), set_cpu_used_(GET_PARAM(1)),
+        row_mt_(GET_PARAM(2)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = 1280;
+    cfg.h = 720;
+    cfg.allow_lowbitdepth = 1;
+    decoder_ = codec_->CreateDecoder(cfg, 0);
+    if (decoder_->IsAV1()) {
+      decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+    }
+
+    size_enc_.clear();
+    md5_dec_.clear();
+    md5_enc_.clear();
+  }
+  virtual ~AV1SBMultipassTest() { delete decoder_; }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libaom_test::kTwoPassGood);
+
+    cfg_.g_lag_in_frames = 5;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_2pass_vbr_minsection_pct = 5;
+    cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      SetTileSize(encoder);
+      encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, use_multipass_);
+      encoder->Control(AV1E_SET_ROW_MT, row_mt_);
+
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  virtual void SetTileSize(libaom_test::Encoder *encoder) {
+    encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
+    encoder->Control(AV1E_SET_TILE_ROWS, 1);
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    size_enc_.push_back(pkt->data.frame.sz);
+
+    ::libaom_test::MD5 md5_enc;
+    md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+                pkt->data.frame.sz);
+    md5_enc_.push_back(md5_enc.Get());
+
+    const aom_codec_err_t res = decoder_->DecodeFrame(
+        reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+    const aom_image_t *img = decoder_->GetDxData().Next();
+
+    if (img) {
+      ::libaom_test::MD5 md5_res;
+      md5_res.Add(img);
+      md5_dec_.push_back(md5_res.Get());
+    }
+  }
+
+  void DoTest() {
+    ::libaom_test::YUVVideoSource video(
+        "niklas_640_480_30.yuv", AOM_IMG_FMT_I420, 640, 480, 30, 1, 0, 6);
+    cfg_.rc_target_bitrate = 1000;
+
+    // Encode while coding each sb once
+    use_multipass_ = false;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    std::vector<size_t> single_pass_size_enc;
+    std::vector<std::string> single_pass_md5_enc;
+    std::vector<std::string> single_pass_md5_dec;
+    single_pass_size_enc = size_enc_;
+    single_pass_md5_enc = md5_enc_;
+    single_pass_md5_dec = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Encode while coding each sb twice
+    use_multipass_ = true;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    std::vector<size_t> multi_pass_size_enc;
+    std::vector<std::string> multi_pass_md5_enc;
+    std::vector<std::string> multi_pass_md5_dec;
+    multi_pass_size_enc = size_enc_;
+    multi_pass_md5_enc = md5_enc_;
+    multi_pass_md5_dec = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Check that the vectors are equal.
+    ASSERT_EQ(single_pass_size_enc, multi_pass_size_enc);
+    ASSERT_EQ(single_pass_md5_enc, multi_pass_md5_enc);
+    ASSERT_EQ(single_pass_md5_dec, multi_pass_md5_dec);
+  }
+
+  bool use_multipass_;
+  int set_cpu_used_;
+  bool row_mt_;
+  ::libaom_test::Decoder *decoder_;
+  std::vector<size_t> size_enc_;
+  std::vector<std::string> md5_enc_;
+  std::vector<std::string> md5_dec_;
+};
+
+TEST_P(AV1SBMultipassTest, TwoPassMatchTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(AV1SBMultipassTest, ::testing::Range(0, 6),
+                          ::testing::Bool());
+
+}  // namespace
diff --git a/libs/libaom/src/test/scalability_test.cc b/libs/libaom/src/test/scalability_test.cc
new file mode 100644
index 000000000..b39918861
--- /dev/null
+++ b/libs/libaom/src/test/scalability_test.cc
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +const int kCpuUsed = 8; +const int kBaseLayerQp = 55; +const int kEnhancementLayerQp = 20; + +class ScalabilityTest + : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>, + public ::libaom_test::EncoderTest { + protected: + ScalabilityTest() : EncoderTest(GET_PARAM(0)) {} + virtual ~ScalabilityTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + num_spatial_layers_ = 2; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, kCpuUsed); + encoder->Control(AOME_SET_NUMBER_SPATIAL_LAYERS, num_spatial_layers_); + } else if (video->frame() % num_spatial_layers_) { + frame_flags_ = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | + AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_NO_UPD_ENTROPY; + encoder->Control(AOME_SET_SPATIAL_LAYER_ID, 1); + encoder->Control(AOME_SET_CQ_LEVEL, kEnhancementLayerQp); + } else { + frame_flags_ = AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 | + AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF | + AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2 | + AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF | + AOM_EFLAG_NO_UPD_ENTROPY; + encoder->Control(AOME_SET_SPATIAL_LAYER_ID, 0); + encoder->Control(AOME_SET_CQ_LEVEL, kBaseLayerQp); + } + } + + void DoTest(int num_spatial_layers) { + num_spatial_layers_ = num_spatial_layers; + cfg_.rc_end_usage = AOM_Q; + cfg_.g_lag_in_frames = 0; + + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 18); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + int num_spatial_layers_; +}; + +TEST_P(ScalabilityTest, TestNoMismatch2SpatialLayers) { DoTest(2); } + +TEST_P(ScalabilityTest, TestNoMismatch3SpatialLayers) { DoTest(3); } + +AV1_INSTANTIATE_TEST_CASE(ScalabilityTest, + ::testing::Values(::libaom_test::kRealTime)); + +} // namespace diff --git a/libs/libaom/src/test/scan_test.cc b/libs/libaom/src/test/scan_test.cc new file mode 100644 index 000000000..dee2ab5a6 --- /dev/null +++ b/libs/libaom/src/test/scan_test.cc @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "av1/common/scan.h" +#include "av1/common/txb_common.h" +#include "test/av1_txfm_test.h" + +static int scan_test(const int16_t *scan, const int16_t *iscan, int si, int r, + int c, int w) { + if (iscan[r * w + c] != si || scan[si] != r * w + c) { + printf("r %d c %d ref_iscan %d iscan %d ref_scan %d scan %d\n", r, c, si, + iscan[r * w + c], r * w + c, scan[si]); + return 1; + } else { + return 0; + } +} + +int scan_order_test(const SCAN_ORDER *scan_order, int w, int h, + SCAN_MODE mode) { + const int16_t *scan = scan_order->scan; + const int16_t *iscan = scan_order->iscan; + int dim = w + h - 1; + if (mode == SCAN_MODE_ZIG_ZAG) { + int si = 0; + for (int i = 0; i < dim; ++i) { + if (i % 2 == 0) { + for (int c = 0; c < w; ++c) { + int r = i - c; + if (r >= 0 && r < h) { + if (scan_test(scan, iscan, si, r, c, w)) return 1; + ++si; + } + } + } else { + for (int r = 0; r < h; ++r) { + int c = i - r; + if (c >= 0 && c < w) { + if (scan_test(scan, iscan, si, r, c, w)) return 1; + ++si; + } + } + } + } + } else if (mode == SCAN_MODE_COL_DIAG) { + int si = 0; + for (int i = 0; i < dim; ++i) { + for (int c = 0; c < w; ++c) { + int r = i - c; + if (r >= 0 && r < h) { + if (scan_test(scan, iscan, si, r, c, w)) return 1; + ++si; + } + } + } + } else if (mode == SCAN_MODE_ROW_DIAG) { + int si = 0; + for (int i = 0; i < dim; ++i) { + for (int r = 0; r < h; ++r) { + int c = i - r; + if (c >= 0 && c < w) { + if (scan_test(scan, iscan, si, r, c, w)) return 1; + ++si; + } + } + } + } else if (mode == SCAN_MODE_ROW_1D) { + int si = 0; + for (int r = 0; r < h; ++r) { + for (int c = 0; c < w; ++c) { + if (scan_test(scan, iscan, si, r, c, w)) return 1; + ++si; + } + } + } else { + assert(mode == SCAN_MODE_COL_1D); + int si = 0; + for (int c = 0; c < w; ++c) { + for (int r = 0; r < h; ++r) { + if (scan_test(scan, iscan, si, r, c, w)) return 1; + ++si; + } + } + } + return 0; +} + +TEST(Av1ScanTest, Dependency) { + for (int tx_size = TX_4X4; tx_size < TX_SIZES_ALL; ++tx_size) { + const int org_rows = tx_size_high[(TX_SIZE)tx_size]; + const int org_cols = tx_size_wide[(TX_SIZE)tx_size]; + const int rows = get_txb_high((TX_SIZE)tx_size); + const int cols = get_txb_wide((TX_SIZE)tx_size); + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid(static_cast<TX_SIZE>(tx_size), + static_cast<TX_TYPE>(tx_type)) == + false) { + continue; + } + SCAN_MODE scan_mode; + TX_CLASS tx_class = tx_type_to_class[(TX_TYPE)tx_type]; + if (tx_class == TX_CLASS_2D) { + if (rows == cols) { + scan_mode = SCAN_MODE_ZIG_ZAG; + } else if (rows > cols) { + scan_mode = SCAN_MODE_ROW_DIAG; + } else { + scan_mode = SCAN_MODE_COL_DIAG; + } + } else if (tx_class == TX_CLASS_VERT) { + scan_mode = SCAN_MODE_ROW_1D; + } else { + assert(tx_class == TX_CLASS_HORIZ); + scan_mode = SCAN_MODE_COL_1D; + } + const SCAN_ORDER *scan_order = + get_default_scan((TX_SIZE)tx_size, (TX_TYPE)tx_type); + ASSERT_EQ(scan_order_test(scan_order, cols, rows, scan_mode), 0) + << "scan mismatch tx_class " << tx_class << " tx_type " << tx_type + << " tx_w " << org_cols << " tx_h " << org_rows << " scan_mode " + << scan_mode << "\n"; + } + } +} diff --git a/libs/libaom/src/test/segment_binarization_sync.cc b/libs/libaom/src/test/segment_binarization_sync.cc new file mode 100644 index 000000000..bd8cf1141 --- /dev/null +++ b/libs/libaom/src/test/segment_binarization_sync.cc @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/acm_random.h" + +using libaom_test::ACMRandom; + +extern "C" { +int av1_neg_interleave(int x, int ref, int max); +int av1_neg_deinterleave(int diff, int ref, int max); +} + +namespace { + +struct Segment { + int id; + int pred; + int last_id; +}; + +Segment GenerateSegment(int seed) { + static const int MAX_SEGMENTS = 8; + + ACMRandom rnd_(seed); + + Segment segment; + const int last_segid = rnd_.PseudoUniform(MAX_SEGMENTS); + segment.last_id = last_segid; + segment.pred = rnd_.PseudoUniform(MAX_SEGMENTS); + segment.id = rnd_.PseudoUniform(last_segid + 1); + + return segment; +} + +// Try to reveal a mismatch between segment binarization and debinarization +TEST(SegmentBinarizationSync, SearchForBinarizationMismatch) { + const int count_tests = 1000; + const int seed_init = 4321; + + for (int i = 0; i < count_tests; ++i) { + const Segment seg = GenerateSegment(seed_init + i); + + const int max_segid = seg.last_id + 1; + const int seg_diff = av1_neg_interleave(seg.id, seg.pred, max_segid); + const int decoded_segid = + av1_neg_deinterleave(seg_diff, seg.pred, max_segid); + + ASSERT_EQ(decoded_segid, seg.id); + } +} + +} // namespace diff --git a/libs/libaom/src/test/selfguided_filter_test.cc b/libs/libaom/src/test/selfguided_filter_test.cc new file mode 100644 index 000000000..d65cce58a --- /dev/null +++ b/libs/libaom/src/test/selfguided_filter_test.cc @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <ctime> +#include <utility> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" + +#include "aom_ports/aom_timer.h" +#include "av1/common/mv.h" +#include "av1/common/restoration.h" + +namespace { + +using libaom_test::ACMRandom; +using std::make_tuple; +using std::tuple; + +typedef void (*SgrFunc)(const uint8_t *dat8, int width, int height, int stride, + int eps, const int *xqd, uint8_t *dst8, int dst_stride, + int32_t *tmpbuf, int bit_depth, int highbd); + +// Test parameter list: +// <tst_fun_> +typedef tuple<SgrFunc> FilterTestParam; + +class AV1SelfguidedFilterTest + : public ::testing::TestWithParam<FilterTestParam> { + public: + virtual ~AV1SelfguidedFilterTest() {} + virtual void SetUp() {} + + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunSpeedTest() { + tst_fun_ = GET_PARAM(0); + const int pu_width = RESTORATION_PROC_UNIT_SIZE; + const int pu_height = RESTORATION_PROC_UNIT_SIZE; + const int width = 256, height = 256, stride = 288, out_stride = 288; + const int NUM_ITERS = 2000; + int i, j, k; + + uint8_t *input_ = + (uint8_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint8_t)); + uint8_t *output_ = (uint8_t *)aom_memalign( + 32, out_stride * (height + 32) * sizeof(uint8_t)); + int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE); + uint8_t *input = input_ + stride * 16 + 16; + uint8_t *output = output_ + out_stride * 16 + 16; + + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (i = -16; i < height + 16; ++i) + for (j = -16; j < width + 16; ++j) + input[i * stride + j] = rnd.Rand16() & 0xFF; + + int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - + SGRPROJ_PRJ_MIN0), + SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - + SGRPROJ_PRJ_MIN1) }; + // Fix a parameter set, since the speed depends slightly on r. + // Change this to test different combinations of values of r.
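+ // (The eps value indexes the SGRPROJ parameter table, which is what + // determines the radii r used by the filter.)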
+ int eps = 15; + + av1_loop_restoration_precal(); + + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (i = 0; i < NUM_ITERS; ++i) { + for (k = 0; k < height; k += pu_height) + for (j = 0; j < width; j += pu_width) { + int w = AOMMIN(pu_width, width - j); + int h = AOMMIN(pu_height, height - k); + uint8_t *input_p = input + k * stride + j; + uint8_t *output_p = output + k * out_stride + j; + av1_apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd, + output_p, out_stride, tmpbuf, 8, + 0); + } + } + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (i = 0; i < NUM_ITERS; ++i) { + for (k = 0; k < height; k += pu_height) + for (j = 0; j < width; j += pu_width) { + int w = AOMMIN(pu_width, width - j); + int h = AOMMIN(pu_height, height - k); + uint8_t *input_p = input + k * stride + j; + uint8_t *output_p = output + k * out_stride + j; + tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride, + tmpbuf, 8, 0); + } + } + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "[ ] C time = " << ref_time / 1000 + << " ms, SIMD time = " << tst_time / 1000 << " ms\n"; + + EXPECT_GT(ref_time, tst_time) + << "Error: AV1SelfguidedFilterTest.SpeedTest, SIMD slower than C.\n" + << "C time: " << ref_time << " us\n" + << "SIMD time: " << tst_time << " us\n"; + + aom_free(input_); + aom_free(output_); + aom_free(tmpbuf); + } + + void RunCorrectnessTest() { + tst_fun_ = GET_PARAM(0); + const int pu_width = RESTORATION_PROC_UNIT_SIZE; + const int pu_height = RESTORATION_PROC_UNIT_SIZE; + // Set the maximum width/height to test here. We actually test a small + // range of sizes *up to* this size, so that we can check, eg., + // the behaviour on tiles which are not a multiple of 4 wide. 
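+ // With NUM_ITERS == 81, the loop below sweeps a 9x9 grid of sizes, from + // (max_w - 8) x (max_h - 8) up to the full max_w x max_h.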
+ const int max_w = 260, max_h = 260, stride = 672, out_stride = 672; + const int NUM_ITERS = 81; + int i, j, k; + + uint8_t *input_ = + (uint8_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint8_t)); + uint8_t *output_ = (uint8_t *)aom_memalign( + 32, out_stride * (max_h + 32) * sizeof(uint8_t)); + uint8_t *output2_ = (uint8_t *)aom_memalign( + 32, out_stride * (max_h + 32) * sizeof(uint8_t)); + int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE); + + uint8_t *input = input_ + stride * 16 + 16; + uint8_t *output = output_ + out_stride * 16 + 16; + uint8_t *output2 = output2_ + out_stride * 16 + 16; + + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + av1_loop_restoration_precal(); + + for (i = 0; i < NUM_ITERS; ++i) { + for (j = -16; j < max_h + 16; ++j) + for (k = -16; k < max_w + 16; ++k) + input[j * stride + k] = rnd.Rand16() & 0xFF; + + int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - + SGRPROJ_PRJ_MIN0), + SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - + SGRPROJ_PRJ_MIN1) }; + int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS); + + // Test various tile sizes around 256x256 + int test_w = max_w - (i / 9); + int test_h = max_h - (i % 9); + + for (k = 0; k < test_h; k += pu_height) + for (j = 0; j < test_w; j += pu_width) { + int w = AOMMIN(pu_width, test_w - j); + int h = AOMMIN(pu_height, test_h - k); + uint8_t *input_p = input + k * stride + j; + uint8_t *output_p = output + k * out_stride + j; + uint8_t *output2_p = output2 + k * out_stride + j; + tst_fun_(input_p, w, h, stride, eps, xqd, output_p, out_stride, + tmpbuf, 8, 0); + av1_apply_selfguided_restoration_c(input_p, w, h, stride, eps, xqd, + output2_p, out_stride, tmpbuf, 8, + 0); + } + + for (j = 0; j < test_h; ++j) + for (k = 0; k < test_w; ++k) { + ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]); + } + } + + aom_free(input_); + aom_free(output_); + aom_free(output2_); + aom_free(tmpbuf); + } + + private: + SgrFunc tst_fun_; +}; + +TEST_P(AV1SelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); } +TEST_P(AV1SelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); } + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AV1SelfguidedFilterTest, + ::testing::Values(av1_apply_selfguided_restoration_sse4_1)); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1SelfguidedFilterTest, + ::testing::Values(av1_apply_selfguided_restoration_avx2)); +#endif + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AV1SelfguidedFilterTest, + ::testing::Values(av1_apply_selfguided_restoration_neon)); +#endif + +#if CONFIG_AV1_HIGHBITDEPTH +// Test parameter list: +// <tst_fun_, bit_depth> +typedef tuple<SgrFunc, int> HighbdFilterTestParam; + +class AV1HighbdSelfguidedFilterTest + : public ::testing::TestWithParam<HighbdFilterTestParam> { + public: + virtual ~AV1HighbdSelfguidedFilterTest() {} + virtual void SetUp() {} + + virtual void TearDown() { libaom_test::ClearSystemState(); } + + protected: + void RunSpeedTest() { + tst_fun_ = GET_PARAM(0); + const int pu_width = RESTORATION_PROC_UNIT_SIZE; + const int pu_height = RESTORATION_PROC_UNIT_SIZE; + const int width = 256, height = 256, stride = 288, out_stride = 288; + const int NUM_ITERS = 2000; + int i, j, k; + int bit_depth = GET_PARAM(1); + int mask = (1 << bit_depth) - 1; + + uint16_t *input_ = + (uint16_t *)aom_memalign(32, stride * (height + 32) * sizeof(uint16_t)); + uint16_t *output_ = (uint16_t *)aom_memalign( + 32, out_stride * (height + 32) * sizeof(uint16_t)); + int32_t *tmpbuf = (int32_t *)aom_memalign(32,
RESTORATION_TMPBUF_SIZE); + uint16_t *input = input_ + stride * 16 + 16; + uint16_t *output = output_ + out_stride * 16 + 16; + + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (i = -16; i < height + 16; ++i) + for (j = -16; j < width + 16; ++j) + input[i * stride + j] = rnd.Rand16() & mask; + + int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - + SGRPROJ_PRJ_MIN0), + SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - + SGRPROJ_PRJ_MIN1) }; + // Fix a parameter set, since the speed depends slightly on r. + // Change this to test different combinations of values of r. + int eps = 15; + + av1_loop_restoration_precal(); + + aom_usec_timer ref_timer; + aom_usec_timer_start(&ref_timer); + for (i = 0; i < NUM_ITERS; ++i) { + for (k = 0; k < height; k += pu_height) + for (j = 0; j < width; j += pu_width) { + int w = AOMMIN(pu_width, width - j); + int h = AOMMIN(pu_height, height - k); + uint16_t *input_p = input + k * stride + j; + uint16_t *output_p = output + k * out_stride + j; + av1_apply_selfguided_restoration_c( + CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd, + CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, 1); + } + } + aom_usec_timer_mark(&ref_timer); + const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer); + + aom_usec_timer tst_timer; + aom_usec_timer_start(&tst_timer); + for (i = 0; i < NUM_ITERS; ++i) { + for (k = 0; k < height; k += pu_height) + for (j = 0; j < width; j += pu_width) { + int w = AOMMIN(pu_width, width - j); + int h = AOMMIN(pu_height, height - k); + uint16_t *input_p = input + k * stride + j; + uint16_t *output_p = output + k * out_stride + j; + tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd, + CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, + 1); + } + } + aom_usec_timer_mark(&tst_timer); + const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer); + + std::cout << "[ ] C time = " << ref_time / 1000 + << " ms, SIMD time = " << tst_time / 1000 << " ms\n"; + + EXPECT_GT(ref_time, tst_time) + << "Error: AV1HighbdSelfguidedFilterTest.SpeedTest, SIMD slower than " + "C.\n" + << "C time: " << ref_time << " us\n" + << "SIMD time: " << tst_time << " us\n"; + + aom_free(input_); + aom_free(output_); + aom_free(tmpbuf); + } + + void RunCorrectnessTest() { + tst_fun_ = GET_PARAM(0); + const int pu_width = RESTORATION_PROC_UNIT_SIZE; + const int pu_height = RESTORATION_PROC_UNIT_SIZE; + // Set the maximum width/height to test here. We actually test a small + // range of sizes *up to* this size, so that we can check, eg., + // the behaviour on tiles which are not a multiple of 4 wide. 
+ const int max_w = 260, max_h = 260, stride = 672, out_stride = 672; + const int NUM_ITERS = 81; + int i, j, k; + int bit_depth = GET_PARAM(1); + int mask = (1 << bit_depth) - 1; + + uint16_t *input_ = + (uint16_t *)aom_memalign(32, stride * (max_h + 32) * sizeof(uint16_t)); + uint16_t *output_ = (uint16_t *)aom_memalign( + 32, out_stride * (max_h + 32) * sizeof(uint16_t)); + uint16_t *output2_ = (uint16_t *)aom_memalign( + 32, out_stride * (max_h + 32) * sizeof(uint16_t)); + int32_t *tmpbuf = (int32_t *)aom_memalign(32, RESTORATION_TMPBUF_SIZE); + + uint16_t *input = input_ + stride * 16 + 16; + uint16_t *output = output_ + out_stride * 16 + 16; + uint16_t *output2 = output2_ + out_stride * 16 + 16; + + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + av1_loop_restoration_precal(); + + for (i = 0; i < NUM_ITERS; ++i) { + for (j = -16; j < max_h + 16; ++j) + for (k = -16; k < max_w + 16; ++k) + input[j * stride + k] = rnd.Rand16() & mask; + + int xqd[2] = { SGRPROJ_PRJ_MIN0 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX0 + 1 - + SGRPROJ_PRJ_MIN0), + SGRPROJ_PRJ_MIN1 + rnd.PseudoUniform(SGRPROJ_PRJ_MAX1 + 1 - + SGRPROJ_PRJ_MIN1) }; + int eps = rnd.PseudoUniform(1 << SGRPROJ_PARAMS_BITS); + + // Test various tile sizes around 256x256 + int test_w = max_w - (i / 9); + int test_h = max_h - (i % 9); + + for (k = 0; k < test_h; k += pu_height) + for (j = 0; j < test_w; j += pu_width) { + int w = AOMMIN(pu_width, test_w - j); + int h = AOMMIN(pu_height, test_h - k); + uint16_t *input_p = input + k * stride + j; + uint16_t *output_p = output + k * out_stride + j; + uint16_t *output2_p = output2 + k * out_stride + j; + tst_fun_(CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd, + CONVERT_TO_BYTEPTR(output_p), out_stride, tmpbuf, bit_depth, + 1); + av1_apply_selfguided_restoration_c( + CONVERT_TO_BYTEPTR(input_p), w, h, stride, eps, xqd, + CONVERT_TO_BYTEPTR(output2_p), out_stride, tmpbuf, bit_depth, 1); + } + + for (j = 0; j < test_h; ++j) + for (k = 0; k < test_w; ++k) + ASSERT_EQ(output[j * out_stride + k], output2[j * out_stride + k]); + } + + aom_free(input_); + aom_free(output_); + aom_free(output2_); + aom_free(tmpbuf); + } + + private: + SgrFunc tst_fun_; +}; + +TEST_P(AV1HighbdSelfguidedFilterTest, DISABLED_SpeedTest) { RunSpeedTest(); } +TEST_P(AV1HighbdSelfguidedFilterTest, CorrectnessTest) { RunCorrectnessTest(); } + +#if HAVE_SSE4_1 +const int highbd_params_sse4_1[] = { 8, 10, 12 }; +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AV1HighbdSelfguidedFilterTest, + ::testing::Combine( + ::testing::Values(av1_apply_selfguided_restoration_sse4_1), + ::testing::ValuesIn(highbd_params_sse4_1))); +#endif + +#if HAVE_AVX2 +const int highbd_params_avx2[] = { 8, 10, 12 }; +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1HighbdSelfguidedFilterTest, + ::testing::Combine(::testing::Values(av1_apply_selfguided_restoration_avx2), + ::testing::ValuesIn(highbd_params_avx2))); +#endif + +#if HAVE_NEON +const int highbd_params_neon[] = { 8, 10, 12 }; +INSTANTIATE_TEST_SUITE_P( + NEON, AV1HighbdSelfguidedFilterTest, + ::testing::Combine(::testing::Values(av1_apply_selfguided_restoration_neon), + ::testing::ValuesIn(highbd_params_neon))); +#endif +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/set_maps.sh b/libs/libaom/src/test/set_maps.sh new file mode 100644 index 000000000..4f59b06d6 --- /dev/null +++ b/libs/libaom/src/test/set_maps.sh @@ -0,0 +1,52 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. 
All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom set_maps example. To add new tests to this file, +## do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to set_maps_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required, and set_maps must exist in +# $LIBAOM_BIN_PATH. +set_maps_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi + if [ -z "$(aom_tool_path set_maps)" ]; then + elog "set_maps not found. It must exist in LIBAOM_BIN_PATH or its parent." + return 1 + fi +} + +# Runs set_maps using the codec specified by $1. +set_maps() { + local encoder="$(aom_tool_path set_maps)" + local codec="$1" + local output_file="${AOM_TEST_OUTPUT_DIR}/set_maps_${codec}.ivf" + + eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" \ + ${devnull} + + [ -e "${output_file}" ] || return 1 +} + +set_maps_av1() { + if [ "$(av1_encode_available)" = "yes" ]; then + set_maps av1 || return 1 + fi +} + +set_maps_tests="set_maps_av1" + +run_tests set_maps_verify_environment "${set_maps_tests}" diff --git a/libs/libaom/src/test/simd_avx2_test.cc b/libs/libaom/src/test/simd_avx2_test.cc new file mode 100644 index 000000000..8a012bff8 --- /dev/null +++ b/libs/libaom/src/test/simd_avx2_test.cc @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#define ARCH AVX2 +#define ARCH_POSTFIX(name) name##_avx2 +#define SIMD_NAMESPACE simd_test_avx2 +#include "test/simd_impl.h" diff --git a/libs/libaom/src/test/simd_cmp_avx2.cc b/libs/libaom/src/test/simd_cmp_avx2.cc new file mode 100644 index 000000000..cda632bcd --- /dev/null +++ b/libs/libaom/src/test/simd_cmp_avx2.cc @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#define ARCH AVX2 +#define ARCH_POSTFIX(name) name##_avx2 +#define SIMD_NAMESPACE simd_test_avx2 +#include "test/simd_cmp_impl.h" diff --git a/libs/libaom/src/test/simd_cmp_impl.h b/libs/libaom/src/test/simd_cmp_impl.h new file mode 100644 index 000000000..d3eb33619 --- /dev/null +++ b/libs/libaom/src/test/simd_cmp_impl.h @@ -0,0 +1,2171 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <string> + +#include "config/aom_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "aom_dsp/aom_simd.h" +#undef SIMD_INLINE +#define SIMD_INLINE static // Don't enforce inlining +#include "aom_dsp/simd/v256_intrinsics_c.h" + +// Machine tuned code goes into this file. This file is included from +// simd_cmp_sse2.cc, simd_cmp_ssse3.cc etc which define the macros +// ARCH (=neon, sse2, ssse3, etc), SIMD_NAMESPACE and ARCH_POSTFIX(). + +#ifdef _MSC_VER +// Disable "value of intrinsic immediate argument 'value' is out of range +// 'lowerbound - upperbound'" warning. Visual Studio emits this warning though +// the parameters are conditionally checked in e.g., v256_shr_n_byte. Adding a +// mask doesn't always appear to be sufficient. +#pragma warning(disable : 4556) +#endif + +using libaom_test::ACMRandom; + +namespace SIMD_NAMESPACE { + +// Wrap templates around intrinsics using immediate values +template <int shift> +v64 imm_v64_shl_n_byte(v64 a) { + return v64_shl_n_byte(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_byte(v64 a) { + return v64_shr_n_byte(a, shift); +} +template <int shift> +v64 imm_v64_shl_n_8(v64 a) { + return v64_shl_n_8(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_u8(v64 a) { + return v64_shr_n_u8(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_s8(v64 a) { + return v64_shr_n_s8(a, shift); +} +template <int shift> +v64 imm_v64_shl_n_16(v64 a) { + return v64_shl_n_16(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_u16(v64 a) { + return v64_shr_n_u16(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_s16(v64 a) { + return v64_shr_n_s16(a, shift); +} +template <int shift> +v64 imm_v64_shl_n_32(v64 a) { + return v64_shl_n_32(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_u32(v64 a) { + return v64_shr_n_u32(a, shift); +} +template <int shift> +v64 imm_v64_shr_n_s32(v64 a) { + return v64_shr_n_s32(a, shift); +} +template <int shift> +v64 imm_v64_align(v64 a, v64 b) { + return v64_align(a, b, shift); +} + +// Wrap templates around corresponding C implementations of the above +template <int shift> +c_v64 c_imm_v64_shl_n_byte(c_v64 a) { + return c_v64_shl_n_byte(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_byte(c_v64 a) { + return c_v64_shr_n_byte(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shl_n_8(c_v64 a) { + return c_v64_shl_n_8(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_u8(c_v64 a) { + return c_v64_shr_n_u8(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_s8(c_v64 a) { + return c_v64_shr_n_s8(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shl_n_16(c_v64 a) { + return c_v64_shl_n_16(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_u16(c_v64 a) { + return c_v64_shr_n_u16(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_s16(c_v64 a) { + return c_v64_shr_n_s16(a, shift); +} +template <int shift> +c_v64
c_imm_v64_shl_n_32(c_v64 a) { + return c_v64_shl_n_32(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_u32(c_v64 a) { + return c_v64_shr_n_u32(a, shift); +} +template <int shift> +c_v64 c_imm_v64_shr_n_s32(c_v64 a) { + return c_v64_shr_n_s32(a, shift); +} +template <int shift> +c_v64 c_imm_v64_align(c_v64 a, c_v64 b) { + return c_v64_align(a, b, shift); +} + +template <int shift> +v128 imm_v128_shl_n_byte(v128 a) { + return v128_shl_n_byte(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_byte(v128 a) { + return v128_shr_n_byte(a, shift); +} +template <int shift> +v128 imm_v128_shl_n_8(v128 a) { + return v128_shl_n_8(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_u8(v128 a) { + return v128_shr_n_u8(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_s8(v128 a) { + return v128_shr_n_s8(a, shift); +} +template <int shift> +v128 imm_v128_shl_n_16(v128 a) { + return v128_shl_n_16(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_u16(v128 a) { + return v128_shr_n_u16(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_s16(v128 a) { + return v128_shr_n_s16(a, shift); +} +template <int shift> +v128 imm_v128_shl_n_32(v128 a) { + return v128_shl_n_32(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_u32(v128 a) { + return v128_shr_n_u32(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_s32(v128 a) { + return v128_shr_n_s32(a, shift); +} +template <int shift> +v128 imm_v128_shl_n_64(v128 a) { + return v128_shl_n_64(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_u64(v128 a) { + return v128_shr_n_u64(a, shift); +} +template <int shift> +v128 imm_v128_shr_n_s64(v128 a) { + return v128_shr_n_s64(a, shift); +} +template <int shift> +v128 imm_v128_align(v128 a, v128 b) { + return v128_align(a, b, shift); +} + +template <int shift> +c_v128 c_imm_v128_shl_n_byte(c_v128 a) { + return c_v128_shl_n_byte(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_byte(c_v128 a) { + return c_v128_shr_n_byte(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shl_n_8(c_v128 a) { + return c_v128_shl_n_8(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_u8(c_v128 a) { + return c_v128_shr_n_u8(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_s8(c_v128 a) { + return c_v128_shr_n_s8(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shl_n_16(c_v128 a) { + return c_v128_shl_n_16(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_u16(c_v128 a) { + return c_v128_shr_n_u16(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_s16(c_v128 a) { + return c_v128_shr_n_s16(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shl_n_32(c_v128 a) { + return c_v128_shl_n_32(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_u32(c_v128 a) { + return c_v128_shr_n_u32(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_s32(c_v128 a) { + return c_v128_shr_n_s32(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shl_n_64(c_v128 a) { + return c_v128_shl_n_64(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_u64(c_v128 a) { + return c_v128_shr_n_u64(a, shift); +} +template <int shift> +c_v128 c_imm_v128_shr_n_s64(c_v128 a) { + return c_v128_shr_n_s64(a, shift); +} +template <int shift> +c_v128 c_imm_v128_align(c_v128 a, c_v128 b) { + return c_v128_align(a, b, shift); +} + +template <int shift> +v256 imm_v256_shl_n_word(v256 a) { + return v256_shl_n_word(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_word(v256 a) { + return v256_shr_n_word(a, shift); +} +template <int shift> +v256 imm_v256_shl_n_byte(v256 a) { + return v256_shl_n_byte(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_byte(v256 a) { + return v256_shr_n_byte(a, shift); +} +template <int shift> +v256 imm_v256_shl_n_8(v256 a) { + return v256_shl_n_8(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_u8(v256 a) { + return v256_shr_n_u8(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_s8(v256 a) { + return v256_shr_n_s8(a, shift); +} +template <int shift> +v256
imm_v256_shl_n_16(v256 a) { + return v256_shl_n_16(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_u16(v256 a) { + return v256_shr_n_u16(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_s16(v256 a) { + return v256_shr_n_s16(a, shift); +} +template <int shift> +v256 imm_v256_shl_n_32(v256 a) { + return v256_shl_n_32(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_u32(v256 a) { + return v256_shr_n_u32(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_s32(v256 a) { + return v256_shr_n_s32(a, shift); +} +template <int shift> +v256 imm_v256_shl_n_64(v256 a) { + return v256_shl_n_64(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_u64(v256 a) { + return v256_shr_n_u64(a, shift); +} +template <int shift> +v256 imm_v256_shr_n_s64(v256 a) { + return v256_shr_n_s64(a, shift); +} +template <int shift> +v256 imm_v256_align(v256 a, v256 b) { + return v256_align(a, b, shift); +} + +template <int shift> +c_v256 c_imm_v256_shl_n_word(c_v256 a) { + return c_v256_shl_n_word(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_word(c_v256 a) { + return c_v256_shr_n_word(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shl_n_byte(c_v256 a) { + return c_v256_shl_n_byte(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_byte(c_v256 a) { + return c_v256_shr_n_byte(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shl_n_8(c_v256 a) { + return c_v256_shl_n_8(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_u8(c_v256 a) { + return c_v256_shr_n_u8(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_s8(c_v256 a) { + return c_v256_shr_n_s8(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shl_n_16(c_v256 a) { + return c_v256_shl_n_16(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_u16(c_v256 a) { + return c_v256_shr_n_u16(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_s16(c_v256 a) { + return c_v256_shr_n_s16(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shl_n_32(c_v256 a) { + return c_v256_shl_n_32(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_u32(c_v256 a) { + return c_v256_shr_n_u32(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_s32(c_v256 a) { + return c_v256_shr_n_s32(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shl_n_64(c_v256 a) { + return c_v256_shl_n_64(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_u64(c_v256 a) { + return c_v256_shr_n_u64(a, shift); +} +template <int shift> +c_v256 c_imm_v256_shr_n_s64(c_v256 a) { + return c_v256_shr_n_s64(a, shift); +} +template <int shift> +c_v256 c_imm_v256_align(c_v256 a, c_v256 b) { + return c_v256_align(a, b, shift); +} + +// Wrappers around the SAD and SSD functions +uint32_t v64_sad_u8(v64 a, v64 b) { + return v64_sad_u8_sum(::v64_sad_u8(v64_sad_u8_init(), a, b)); +} +uint32_t v64_ssd_u8(v64 a, v64 b) { + return v64_ssd_u8_sum(::v64_ssd_u8(v64_ssd_u8_init(), a, b)); +} + +uint32_t c_v64_sad_u8(c_v64 a, c_v64 b) { + return c_v64_sad_u8_sum(::c_v64_sad_u8(c_v64_sad_u8_init(), a, b)); +} +uint32_t c_v64_ssd_u8(c_v64 a, c_v64 b) { + return c_v64_ssd_u8_sum(::c_v64_ssd_u8(c_v64_ssd_u8_init(), a, b)); +} +uint32_t v128_sad_u8(v128 a, v128 b) { + return v128_sad_u8_sum(::v128_sad_u8(v128_sad_u8_init(), a, b)); +} +uint32_t v128_ssd_u8(v128 a, v128 b) { + return v128_ssd_u8_sum(::v128_ssd_u8(v128_ssd_u8_init(), a, b)); +} +uint32_t c_v128_sad_u8(c_v128 a, c_v128 b) { + return c_v128_sad_u8_sum(::c_v128_sad_u8(c_v128_sad_u8_init(), a, b)); +} +uint32_t c_v128_ssd_u8(c_v128 a, c_v128 b) { + return c_v128_ssd_u8_sum(::c_v128_ssd_u8(c_v128_ssd_u8_init(), a, b)); +} +uint32_t v128_sad_u16(v128 a, v128 b) { + return v128_sad_u16_sum(::v128_sad_u16(v128_sad_u16_init(), a, b)); +} +uint64_t v128_ssd_s16(v128 a, v128 b) { + return v128_ssd_s16_sum(::v128_ssd_s16(v128_ssd_s16_init(), a, b)); +} +uint32_t
c_v128_sad_u16(c_v128 a, c_v128 b) { + return c_v128_sad_u16_sum(::c_v128_sad_u16(c_v128_sad_u16_init(), a, b)); +} +uint64_t c_v128_ssd_s16(c_v128 a, c_v128 b) { + return c_v128_ssd_s16_sum(::c_v128_ssd_s16(c_v128_ssd_s16_init(), a, b)); +} +uint32_t v256_sad_u8(v256 a, v256 b) { + return v256_sad_u8_sum(::v256_sad_u8(v256_sad_u8_init(), a, b)); +} +uint32_t v256_ssd_u8(v256 a, v256 b) { + return v256_ssd_u8_sum(::v256_ssd_u8(v256_ssd_u8_init(), a, b)); +} +uint32_t c_v256_sad_u8(c_v256 a, c_v256 b) { + return c_v256_sad_u8_sum(::c_v256_sad_u8(c_v256_sad_u8_init(), a, b)); +} +uint32_t c_v256_ssd_u8(c_v256 a, c_v256 b) { + return c_v256_ssd_u8_sum(::c_v256_ssd_u8(c_v256_ssd_u8_init(), a, b)); +} +uint32_t v256_sad_u16(v256 a, v256 b) { + return v256_sad_u16_sum(::v256_sad_u16(v256_sad_u16_init(), a, b)); +} +uint64_t v256_ssd_s16(v256 a, v256 b) { + return v256_ssd_s16_sum(::v256_ssd_s16(v256_ssd_s16_init(), a, b)); +} +uint32_t c_v256_sad_u16(c_v256 a, c_v256 b) { + return c_v256_sad_u16_sum(::c_v256_sad_u16(c_v256_sad_u16_init(), a, b)); +} +uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) { + return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b)); +} + +namespace { + +typedef void (*fptr)(); + +typedef struct { + const char *name; + fptr ref; + fptr simd; +} mapping; + +#define MAP(name) \ + { #name, reinterpret_cast < fptr>(c_##name), reinterpret_cast < fptr>(name) } + +const mapping m[] = { MAP(v64_sad_u8), + MAP(v64_ssd_u8), + MAP(v64_add_8), + MAP(v64_add_16), + MAP(v64_sadd_s8), + MAP(v64_sadd_u8), + MAP(v64_sadd_s16), + MAP(v64_add_32), + MAP(v64_sub_8), + MAP(v64_ssub_u8), + MAP(v64_ssub_s8), + MAP(v64_sub_16), + MAP(v64_ssub_s16), + MAP(v64_ssub_u16), + MAP(v64_sub_32), + MAP(v64_ziplo_8), + MAP(v64_ziphi_8), + MAP(v64_ziplo_16), + MAP(v64_ziphi_16), + MAP(v64_ziplo_32), + MAP(v64_ziphi_32), + MAP(v64_pack_s32_u16), + MAP(v64_pack_s32_s16), + MAP(v64_pack_s16_u8), + MAP(v64_pack_s16_s8), + MAP(v64_unziphi_8), + MAP(v64_unziplo_8), + MAP(v64_unziphi_16), + MAP(v64_unziplo_16), + MAP(v64_or), + MAP(v64_xor), + MAP(v64_and), + MAP(v64_andn), + MAP(v64_mullo_s16), + MAP(v64_mulhi_s16), + MAP(v64_mullo_s32), + MAP(v64_madd_s16), + MAP(v64_madd_us8), + MAP(v64_avg_u8), + MAP(v64_rdavg_u8), + MAP(v64_rdavg_u16), + MAP(v64_avg_u16), + MAP(v64_min_u8), + MAP(v64_max_u8), + MAP(v64_min_s8), + MAP(v64_max_s8), + MAP(v64_min_s16), + MAP(v64_max_s16), + MAP(v64_cmpgt_s8), + MAP(v64_cmplt_s8), + MAP(v64_cmpeq_8), + MAP(v64_cmpgt_s16), + MAP(v64_cmplt_s16), + MAP(v64_cmpeq_16), + MAP(v64_shuffle_8), + MAP(imm_v64_align<1>), + MAP(imm_v64_align<2>), + MAP(imm_v64_align<3>), + MAP(imm_v64_align<4>), + MAP(imm_v64_align<5>), + MAP(imm_v64_align<6>), + MAP(imm_v64_align<7>), + MAP(v64_abs_s8), + MAP(v64_abs_s16), + MAP(v64_unpacklo_u8_s16), + MAP(v64_unpackhi_u8_s16), + MAP(v64_unpacklo_s8_s16), + MAP(v64_unpackhi_s8_s16), + MAP(v64_unpacklo_u16_s32), + MAP(v64_unpacklo_s16_s32), + MAP(v64_unpackhi_u16_s32), + MAP(v64_unpackhi_s16_s32), + MAP(imm_v64_shr_n_byte<1>), + MAP(imm_v64_shr_n_byte<2>), + MAP(imm_v64_shr_n_byte<3>), + MAP(imm_v64_shr_n_byte<4>), + MAP(imm_v64_shr_n_byte<5>), + MAP(imm_v64_shr_n_byte<6>), + MAP(imm_v64_shr_n_byte<7>), + MAP(imm_v64_shl_n_byte<1>), + MAP(imm_v64_shl_n_byte<2>), + MAP(imm_v64_shl_n_byte<3>), + MAP(imm_v64_shl_n_byte<4>), + MAP(imm_v64_shl_n_byte<5>), + MAP(imm_v64_shl_n_byte<6>), + MAP(imm_v64_shl_n_byte<7>), + MAP(imm_v64_shl_n_8<1>), + MAP(imm_v64_shl_n_8<2>), + MAP(imm_v64_shl_n_8<3>), + MAP(imm_v64_shl_n_8<4>), + MAP(imm_v64_shl_n_8<5>), + 
MAP(imm_v64_shl_n_8<6>), + MAP(imm_v64_shl_n_8<7>), + MAP(imm_v64_shr_n_u8<1>), + MAP(imm_v64_shr_n_u8<2>), + MAP(imm_v64_shr_n_u8<3>), + MAP(imm_v64_shr_n_u8<4>), + MAP(imm_v64_shr_n_u8<5>), + MAP(imm_v64_shr_n_u8<6>), + MAP(imm_v64_shr_n_u8<7>), + MAP(imm_v64_shr_n_s8<1>), + MAP(imm_v64_shr_n_s8<2>), + MAP(imm_v64_shr_n_s8<3>), + MAP(imm_v64_shr_n_s8<4>), + MAP(imm_v64_shr_n_s8<5>), + MAP(imm_v64_shr_n_s8<6>), + MAP(imm_v64_shr_n_s8<7>), + MAP(imm_v64_shl_n_16<1>), + MAP(imm_v64_shl_n_16<2>), + MAP(imm_v64_shl_n_16<4>), + MAP(imm_v64_shl_n_16<6>), + MAP(imm_v64_shl_n_16<8>), + MAP(imm_v64_shl_n_16<10>), + MAP(imm_v64_shl_n_16<12>), + MAP(imm_v64_shl_n_16<14>), + MAP(imm_v64_shr_n_u16<1>), + MAP(imm_v64_shr_n_u16<2>), + MAP(imm_v64_shr_n_u16<4>), + MAP(imm_v64_shr_n_u16<6>), + MAP(imm_v64_shr_n_u16<8>), + MAP(imm_v64_shr_n_u16<10>), + MAP(imm_v64_shr_n_u16<12>), + MAP(imm_v64_shr_n_u16<14>), + MAP(imm_v64_shr_n_s16<1>), + MAP(imm_v64_shr_n_s16<2>), + MAP(imm_v64_shr_n_s16<4>), + MAP(imm_v64_shr_n_s16<6>), + MAP(imm_v64_shr_n_s16<8>), + MAP(imm_v64_shr_n_s16<10>), + MAP(imm_v64_shr_n_s16<12>), + MAP(imm_v64_shr_n_s16<14>), + MAP(imm_v64_shl_n_32<1>), + MAP(imm_v64_shl_n_32<4>), + MAP(imm_v64_shl_n_32<8>), + MAP(imm_v64_shl_n_32<12>), + MAP(imm_v64_shl_n_32<16>), + MAP(imm_v64_shl_n_32<20>), + MAP(imm_v64_shl_n_32<24>), + MAP(imm_v64_shl_n_32<28>), + MAP(imm_v64_shr_n_u32<1>), + MAP(imm_v64_shr_n_u32<4>), + MAP(imm_v64_shr_n_u32<8>), + MAP(imm_v64_shr_n_u32<12>), + MAP(imm_v64_shr_n_u32<16>), + MAP(imm_v64_shr_n_u32<20>), + MAP(imm_v64_shr_n_u32<24>), + MAP(imm_v64_shr_n_u32<28>), + MAP(imm_v64_shr_n_s32<1>), + MAP(imm_v64_shr_n_s32<4>), + MAP(imm_v64_shr_n_s32<8>), + MAP(imm_v64_shr_n_s32<12>), + MAP(imm_v64_shr_n_s32<16>), + MAP(imm_v64_shr_n_s32<20>), + MAP(imm_v64_shr_n_s32<24>), + MAP(imm_v64_shr_n_s32<28>), + MAP(v64_shl_8), + MAP(v64_shr_u8), + MAP(v64_shr_s8), + MAP(v64_shl_16), + MAP(v64_shr_u16), + MAP(v64_shr_s16), + MAP(v64_shl_32), + MAP(v64_shr_u32), + MAP(v64_shr_s32), + MAP(v64_hadd_u8), + MAP(v64_hadd_s16), + MAP(v64_dotp_s16), + MAP(v64_dotp_su8), + MAP(v64_u64), + MAP(v64_low_u32), + MAP(v64_high_u32), + MAP(v64_low_s32), + MAP(v64_high_s32), + MAP(v64_dup_8), + MAP(v64_dup_16), + MAP(v64_dup_32), + MAP(v64_from_32), + MAP(v64_zero), + MAP(v64_from_16), + MAP(v128_sad_u8), + MAP(v128_ssd_u8), + MAP(v128_sad_u16), + MAP(v128_ssd_s16), + MAP(v128_add_8), + MAP(v128_add_16), + MAP(v128_sadd_s8), + MAP(v128_sadd_u8), + MAP(v128_sadd_s16), + MAP(v128_add_32), + MAP(v128_add_64), + MAP(v128_sub_8), + MAP(v128_ssub_u8), + MAP(v128_ssub_s8), + MAP(v128_sub_16), + MAP(v128_ssub_s16), + MAP(v128_ssub_u16), + MAP(v128_sub_32), + MAP(v128_sub_64), + MAP(v128_ziplo_8), + MAP(v128_ziphi_8), + MAP(v128_ziplo_16), + MAP(v128_ziphi_16), + MAP(v128_ziplo_32), + MAP(v128_ziphi_32), + MAP(v128_ziplo_64), + MAP(v128_ziphi_64), + MAP(v128_unziphi_8), + MAP(v128_unziplo_8), + MAP(v128_unziphi_16), + MAP(v128_unziplo_16), + MAP(v128_unziphi_32), + MAP(v128_unziplo_32), + MAP(v128_pack_s32_u16), + MAP(v128_pack_s32_s16), + MAP(v128_pack_s16_u8), + MAP(v128_pack_s16_s8), + MAP(v128_or), + MAP(v128_xor), + MAP(v128_and), + MAP(v128_andn), + MAP(v128_mullo_s16), + MAP(v128_mulhi_s16), + MAP(v128_mullo_s32), + MAP(v128_madd_s16), + MAP(v128_madd_us8), + MAP(v128_avg_u8), + MAP(v128_rdavg_u8), + MAP(v128_rdavg_u16), + MAP(v128_avg_u16), + MAP(v128_min_u8), + MAP(v128_max_u8), + MAP(v128_min_s8), + MAP(v128_max_s8), + MAP(v128_min_s16), + MAP(v128_max_s16), + MAP(v128_min_s32), + MAP(v128_max_s32), + 
MAP(v128_cmpgt_s8), + MAP(v128_cmplt_s8), + MAP(v128_cmpeq_8), + MAP(v128_cmpgt_s16), + MAP(v128_cmpeq_16), + MAP(v128_cmplt_s16), + MAP(v128_cmpgt_s32), + MAP(v128_cmpeq_32), + MAP(v128_cmplt_s32), + MAP(v128_shuffle_8), + MAP(imm_v128_align<1>), + MAP(imm_v128_align<2>), + MAP(imm_v128_align<3>), + MAP(imm_v128_align<4>), + MAP(imm_v128_align<5>), + MAP(imm_v128_align<6>), + MAP(imm_v128_align<7>), + MAP(imm_v128_align<8>), + MAP(imm_v128_align<9>), + MAP(imm_v128_align<10>), + MAP(imm_v128_align<11>), + MAP(imm_v128_align<12>), + MAP(imm_v128_align<13>), + MAP(imm_v128_align<14>), + MAP(imm_v128_align<15>), + MAP(v128_abs_s8), + MAP(v128_abs_s16), + MAP(v128_padd_u8), + MAP(v128_padd_s16), + MAP(v128_unpacklo_u16_s32), + MAP(v128_unpacklo_s16_s32), + MAP(v128_unpackhi_u16_s32), + MAP(v128_unpackhi_s16_s32), + MAP(imm_v128_shr_n_byte<1>), + MAP(imm_v128_shr_n_byte<2>), + MAP(imm_v128_shr_n_byte<3>), + MAP(imm_v128_shr_n_byte<4>), + MAP(imm_v128_shr_n_byte<5>), + MAP(imm_v128_shr_n_byte<6>), + MAP(imm_v128_shr_n_byte<7>), + MAP(imm_v128_shr_n_byte<8>), + MAP(imm_v128_shr_n_byte<9>), + MAP(imm_v128_shr_n_byte<10>), + MAP(imm_v128_shr_n_byte<11>), + MAP(imm_v128_shr_n_byte<12>), + MAP(imm_v128_shr_n_byte<13>), + MAP(imm_v128_shr_n_byte<14>), + MAP(imm_v128_shr_n_byte<15>), + MAP(imm_v128_shl_n_byte<1>), + MAP(imm_v128_shl_n_byte<2>), + MAP(imm_v128_shl_n_byte<3>), + MAP(imm_v128_shl_n_byte<4>), + MAP(imm_v128_shl_n_byte<5>), + MAP(imm_v128_shl_n_byte<6>), + MAP(imm_v128_shl_n_byte<7>), + MAP(imm_v128_shl_n_byte<8>), + MAP(imm_v128_shl_n_byte<9>), + MAP(imm_v128_shl_n_byte<10>), + MAP(imm_v128_shl_n_byte<11>), + MAP(imm_v128_shl_n_byte<12>), + MAP(imm_v128_shl_n_byte<13>), + MAP(imm_v128_shl_n_byte<14>), + MAP(imm_v128_shl_n_byte<15>), + MAP(imm_v128_shl_n_8<1>), + MAP(imm_v128_shl_n_8<2>), + MAP(imm_v128_shl_n_8<3>), + MAP(imm_v128_shl_n_8<4>), + MAP(imm_v128_shl_n_8<5>), + MAP(imm_v128_shl_n_8<6>), + MAP(imm_v128_shl_n_8<7>), + MAP(imm_v128_shr_n_u8<1>), + MAP(imm_v128_shr_n_u8<2>), + MAP(imm_v128_shr_n_u8<3>), + MAP(imm_v128_shr_n_u8<4>), + MAP(imm_v128_shr_n_u8<5>), + MAP(imm_v128_shr_n_u8<6>), + MAP(imm_v128_shr_n_u8<7>), + MAP(imm_v128_shr_n_s8<1>), + MAP(imm_v128_shr_n_s8<2>), + MAP(imm_v128_shr_n_s8<3>), + MAP(imm_v128_shr_n_s8<4>), + MAP(imm_v128_shr_n_s8<5>), + MAP(imm_v128_shr_n_s8<6>), + MAP(imm_v128_shr_n_s8<7>), + MAP(imm_v128_shl_n_16<1>), + MAP(imm_v128_shl_n_16<2>), + MAP(imm_v128_shl_n_16<4>), + MAP(imm_v128_shl_n_16<6>), + MAP(imm_v128_shl_n_16<8>), + MAP(imm_v128_shl_n_16<10>), + MAP(imm_v128_shl_n_16<12>), + MAP(imm_v128_shl_n_16<14>), + MAP(imm_v128_shr_n_u16<1>), + MAP(imm_v128_shr_n_u16<2>), + MAP(imm_v128_shr_n_u16<4>), + MAP(imm_v128_shr_n_u16<6>), + MAP(imm_v128_shr_n_u16<8>), + MAP(imm_v128_shr_n_u16<10>), + MAP(imm_v128_shr_n_u16<12>), + MAP(imm_v128_shr_n_u16<14>), + MAP(imm_v128_shr_n_s16<1>), + MAP(imm_v128_shr_n_s16<2>), + MAP(imm_v128_shr_n_s16<4>), + MAP(imm_v128_shr_n_s16<6>), + MAP(imm_v128_shr_n_s16<8>), + MAP(imm_v128_shr_n_s16<10>), + MAP(imm_v128_shr_n_s16<12>), + MAP(imm_v128_shr_n_s16<14>), + MAP(imm_v128_shl_n_32<1>), + MAP(imm_v128_shl_n_32<4>), + MAP(imm_v128_shl_n_32<8>), + MAP(imm_v128_shl_n_32<12>), + MAP(imm_v128_shl_n_32<16>), + MAP(imm_v128_shl_n_32<20>), + MAP(imm_v128_shl_n_32<24>), + MAP(imm_v128_shl_n_32<28>), + MAP(imm_v128_shr_n_u32<1>), + MAP(imm_v128_shr_n_u32<4>), + MAP(imm_v128_shr_n_u32<8>), + MAP(imm_v128_shr_n_u32<12>), + MAP(imm_v128_shr_n_u32<16>), + MAP(imm_v128_shr_n_u32<20>), + MAP(imm_v128_shr_n_u32<24>), + 
MAP(imm_v128_shr_n_u32<28>), + MAP(imm_v128_shr_n_s32<1>), + MAP(imm_v128_shr_n_s32<4>), + MAP(imm_v128_shr_n_s32<8>), + MAP(imm_v128_shr_n_s32<12>), + MAP(imm_v128_shr_n_s32<16>), + MAP(imm_v128_shr_n_s32<20>), + MAP(imm_v128_shr_n_s32<24>), + MAP(imm_v128_shr_n_s32<28>), + MAP(imm_v128_shl_n_64<1>), + MAP(imm_v128_shl_n_64<4>), + MAP(imm_v128_shl_n_64<8>), + MAP(imm_v128_shl_n_64<12>), + MAP(imm_v128_shl_n_64<16>), + MAP(imm_v128_shl_n_64<20>), + MAP(imm_v128_shl_n_64<24>), + MAP(imm_v128_shl_n_64<28>), + MAP(imm_v128_shl_n_64<32>), + MAP(imm_v128_shl_n_64<36>), + MAP(imm_v128_shl_n_64<40>), + MAP(imm_v128_shl_n_64<44>), + MAP(imm_v128_shl_n_64<48>), + MAP(imm_v128_shl_n_64<52>), + MAP(imm_v128_shl_n_64<56>), + MAP(imm_v128_shl_n_64<60>), + MAP(imm_v128_shr_n_u64<1>), + MAP(imm_v128_shr_n_u64<4>), + MAP(imm_v128_shr_n_u64<8>), + MAP(imm_v128_shr_n_u64<12>), + MAP(imm_v128_shr_n_u64<16>), + MAP(imm_v128_shr_n_u64<20>), + MAP(imm_v128_shr_n_u64<24>), + MAP(imm_v128_shr_n_u64<28>), + MAP(imm_v128_shr_n_u64<32>), + MAP(imm_v128_shr_n_u64<36>), + MAP(imm_v128_shr_n_u64<40>), + MAP(imm_v128_shr_n_u64<44>), + MAP(imm_v128_shr_n_u64<48>), + MAP(imm_v128_shr_n_u64<52>), + MAP(imm_v128_shr_n_u64<56>), + MAP(imm_v128_shr_n_u64<60>), + MAP(imm_v128_shr_n_s64<1>), + MAP(imm_v128_shr_n_s64<4>), + MAP(imm_v128_shr_n_s64<8>), + MAP(imm_v128_shr_n_s64<12>), + MAP(imm_v128_shr_n_s64<16>), + MAP(imm_v128_shr_n_s64<20>), + MAP(imm_v128_shr_n_s64<24>), + MAP(imm_v128_shr_n_s64<28>), + MAP(imm_v128_shr_n_s64<32>), + MAP(imm_v128_shr_n_s64<36>), + MAP(imm_v128_shr_n_s64<40>), + MAP(imm_v128_shr_n_s64<44>), + MAP(imm_v128_shr_n_s64<48>), + MAP(imm_v128_shr_n_s64<52>), + MAP(imm_v128_shr_n_s64<56>), + MAP(imm_v128_shr_n_s64<60>), + MAP(v128_from_v64), + MAP(v128_zip_8), + MAP(v128_zip_16), + MAP(v128_zip_32), + MAP(v128_mul_s16), + MAP(v128_unpack_u8_s16), + MAP(v128_unpack_s8_s16), + MAP(v128_unpack_u16_s32), + MAP(v128_unpack_s16_s32), + MAP(v128_shl_8), + MAP(v128_shr_u8), + MAP(v128_shr_s8), + MAP(v128_shl_16), + MAP(v128_shr_u16), + MAP(v128_shr_s16), + MAP(v128_shl_32), + MAP(v128_shr_u32), + MAP(v128_shr_s32), + MAP(v128_shl_64), + MAP(v128_shr_u64), + MAP(v128_shr_s64), + MAP(v128_hadd_u8), + MAP(v128_dotp_su8), + MAP(v128_dotp_s16), + MAP(v128_dotp_s32), + MAP(v128_low_u32), + MAP(v128_low_v64), + MAP(v128_high_v64), + MAP(v128_from_64), + MAP(v128_from_32), + MAP(v128_movemask_8), + MAP(v128_zero), + MAP(v128_dup_8), + MAP(v128_dup_16), + MAP(v128_dup_32), + MAP(v128_dup_64), + MAP(v128_unpacklo_u8_s16), + MAP(v128_unpackhi_u8_s16), + MAP(v128_unpacklo_s8_s16), + MAP(v128_unpackhi_s8_s16), + MAP(v128_blend_8), + MAP(u32_load_unaligned), + MAP(u32_store_unaligned), + MAP(v64_load_unaligned), + MAP(v64_store_unaligned), + MAP(v128_load_unaligned), + MAP(v128_store_unaligned), + MAP(v256_sad_u8), + MAP(v256_ssd_u8), + MAP(v256_sad_u16), + MAP(v256_ssd_s16), + MAP(v256_hadd_u8), + MAP(v256_low_u64), + MAP(v256_dotp_su8), + MAP(v256_dotp_s16), + MAP(v256_dotp_s32), + MAP(v256_add_8), + MAP(v256_add_16), + MAP(v256_sadd_s8), + MAP(v256_sadd_u8), + MAP(v256_sadd_s16), + MAP(v256_add_32), + MAP(v256_add_64), + MAP(v256_sub_8), + MAP(v256_ssub_u8), + MAP(v256_ssub_s8), + MAP(v256_sub_16), + MAP(v256_ssub_u16), + MAP(v256_ssub_s16), + MAP(v256_sub_32), + MAP(v256_sub_64), + MAP(v256_ziplo_8), + MAP(v256_ziphi_8), + MAP(v256_ziplo_16), + MAP(v256_ziphi_16), + MAP(v256_ziplo_32), + MAP(v256_ziphi_32), + MAP(v256_ziplo_64), + MAP(v256_ziphi_64), + MAP(v256_unziphi_8), + MAP(v256_unziplo_8), + MAP(v256_unziphi_16), 
+ MAP(v256_unziplo_16), + MAP(v256_unziphi_32), + MAP(v256_unziplo_32), + MAP(v256_unziphi_64), + MAP(v256_unziplo_64), + MAP(v256_pack_s32_u16), + MAP(v256_pack_s32_s16), + MAP(v256_pack_s16_u8), + MAP(v256_pack_s16_s8), + MAP(v256_or), + MAP(v256_xor), + MAP(v256_and), + MAP(v256_andn), + MAP(v256_mullo_s16), + MAP(v256_mulhi_s16), + MAP(v256_mullo_s32), + MAP(v256_madd_s16), + MAP(v256_madd_us8), + MAP(v256_avg_u8), + MAP(v256_rdavg_u8), + MAP(v256_rdavg_u16), + MAP(v256_avg_u16), + MAP(v256_min_u8), + MAP(v256_max_u8), + MAP(v256_min_s8), + MAP(v256_max_s8), + MAP(v256_min_s16), + MAP(v256_max_s16), + MAP(v256_min_s32), + MAP(v256_max_s32), + MAP(v256_cmpgt_s8), + MAP(v256_cmplt_s8), + MAP(v256_cmpeq_8), + MAP(v256_cmpgt_s16), + MAP(v256_cmplt_s16), + MAP(v256_cmpeq_16), + MAP(v256_cmpgt_s32), + MAP(v256_cmplt_s32), + MAP(v256_cmpeq_32), + MAP(v256_shuffle_8), + MAP(v256_pshuffle_8), + MAP(v256_wideshuffle_8), + MAP(imm_v256_align<1>), + MAP(imm_v256_align<2>), + MAP(imm_v256_align<3>), + MAP(imm_v256_align<4>), + MAP(imm_v256_align<5>), + MAP(imm_v256_align<6>), + MAP(imm_v256_align<7>), + MAP(imm_v256_align<8>), + MAP(imm_v256_align<9>), + MAP(imm_v256_align<10>), + MAP(imm_v256_align<11>), + MAP(imm_v256_align<12>), + MAP(imm_v256_align<13>), + MAP(imm_v256_align<14>), + MAP(imm_v256_align<15>), + MAP(imm_v256_align<16>), + MAP(imm_v256_align<17>), + MAP(imm_v256_align<18>), + MAP(imm_v256_align<19>), + MAP(imm_v256_align<20>), + MAP(imm_v256_align<21>), + MAP(imm_v256_align<22>), + MAP(imm_v256_align<23>), + MAP(imm_v256_align<24>), + MAP(imm_v256_align<25>), + MAP(imm_v256_align<26>), + MAP(imm_v256_align<27>), + MAP(imm_v256_align<28>), + MAP(imm_v256_align<29>), + MAP(imm_v256_align<30>), + MAP(imm_v256_align<31>), + MAP(v256_from_v128), + MAP(v256_zip_8), + MAP(v256_zip_16), + MAP(v256_zip_32), + MAP(v256_mul_s16), + MAP(v256_unpack_u8_s16), + MAP(v256_unpack_s8_s16), + MAP(v256_unpack_u16_s32), + MAP(v256_unpack_s16_s32), + MAP(v256_shl_8), + MAP(v256_shr_u8), + MAP(v256_shr_s8), + MAP(v256_shl_16), + MAP(v256_shr_u16), + MAP(v256_shr_s16), + MAP(v256_shl_32), + MAP(v256_shr_u32), + MAP(v256_shr_s32), + MAP(v256_shl_64), + MAP(v256_shr_u64), + MAP(v256_shr_s64), + MAP(v256_abs_s8), + MAP(v256_abs_s16), + MAP(v256_padd_u8), + MAP(v256_padd_s16), + MAP(v256_unpacklo_u16_s32), + MAP(v256_unpacklo_s16_s32), + MAP(v256_unpackhi_u16_s32), + MAP(v256_unpackhi_s16_s32), + MAP(imm_v256_shr_n_word<1>), + MAP(imm_v256_shr_n_word<2>), + MAP(imm_v256_shr_n_word<3>), + MAP(imm_v256_shr_n_word<4>), + MAP(imm_v256_shr_n_word<5>), + MAP(imm_v256_shr_n_word<6>), + MAP(imm_v256_shr_n_word<7>), + MAP(imm_v256_shr_n_word<8>), + MAP(imm_v256_shr_n_word<9>), + MAP(imm_v256_shr_n_word<10>), + MAP(imm_v256_shr_n_word<11>), + MAP(imm_v256_shr_n_word<12>), + MAP(imm_v256_shr_n_word<13>), + MAP(imm_v256_shr_n_word<14>), + MAP(imm_v256_shr_n_word<15>), + MAP(imm_v256_shl_n_word<1>), + MAP(imm_v256_shl_n_word<2>), + MAP(imm_v256_shl_n_word<3>), + MAP(imm_v256_shl_n_word<4>), + MAP(imm_v256_shl_n_word<5>), + MAP(imm_v256_shl_n_word<6>), + MAP(imm_v256_shl_n_word<7>), + MAP(imm_v256_shl_n_word<8>), + MAP(imm_v256_shl_n_word<9>), + MAP(imm_v256_shl_n_word<10>), + MAP(imm_v256_shl_n_word<11>), + MAP(imm_v256_shl_n_word<12>), + MAP(imm_v256_shl_n_word<13>), + MAP(imm_v256_shl_n_word<14>), + MAP(imm_v256_shl_n_word<15>), + MAP(imm_v256_shr_n_byte<1>), + MAP(imm_v256_shr_n_byte<2>), + MAP(imm_v256_shr_n_byte<3>), + MAP(imm_v256_shr_n_byte<4>), + MAP(imm_v256_shr_n_byte<5>), + MAP(imm_v256_shr_n_byte<6>), + 
MAP(imm_v256_shr_n_byte<7>), + MAP(imm_v256_shr_n_byte<8>), + MAP(imm_v256_shr_n_byte<9>), + MAP(imm_v256_shr_n_byte<10>), + MAP(imm_v256_shr_n_byte<11>), + MAP(imm_v256_shr_n_byte<12>), + MAP(imm_v256_shr_n_byte<13>), + MAP(imm_v256_shr_n_byte<14>), + MAP(imm_v256_shr_n_byte<15>), + MAP(imm_v256_shr_n_byte<16>), + MAP(imm_v256_shr_n_byte<17>), + MAP(imm_v256_shr_n_byte<18>), + MAP(imm_v256_shr_n_byte<19>), + MAP(imm_v256_shr_n_byte<20>), + MAP(imm_v256_shr_n_byte<21>), + MAP(imm_v256_shr_n_byte<22>), + MAP(imm_v256_shr_n_byte<23>), + MAP(imm_v256_shr_n_byte<24>), + MAP(imm_v256_shr_n_byte<25>), + MAP(imm_v256_shr_n_byte<26>), + MAP(imm_v256_shr_n_byte<27>), + MAP(imm_v256_shr_n_byte<28>), + MAP(imm_v256_shr_n_byte<29>), + MAP(imm_v256_shr_n_byte<30>), + MAP(imm_v256_shr_n_byte<31>), + MAP(imm_v256_shl_n_byte<1>), + MAP(imm_v256_shl_n_byte<2>), + MAP(imm_v256_shl_n_byte<3>), + MAP(imm_v256_shl_n_byte<4>), + MAP(imm_v256_shl_n_byte<5>), + MAP(imm_v256_shl_n_byte<6>), + MAP(imm_v256_shl_n_byte<7>), + MAP(imm_v256_shl_n_byte<8>), + MAP(imm_v256_shl_n_byte<9>), + MAP(imm_v256_shl_n_byte<10>), + MAP(imm_v256_shl_n_byte<11>), + MAP(imm_v256_shl_n_byte<12>), + MAP(imm_v256_shl_n_byte<13>), + MAP(imm_v256_shl_n_byte<14>), + MAP(imm_v256_shl_n_byte<15>), + MAP(imm_v256_shl_n_byte<16>), + MAP(imm_v256_shl_n_byte<17>), + MAP(imm_v256_shl_n_byte<18>), + MAP(imm_v256_shl_n_byte<19>), + MAP(imm_v256_shl_n_byte<20>), + MAP(imm_v256_shl_n_byte<21>), + MAP(imm_v256_shl_n_byte<22>), + MAP(imm_v256_shl_n_byte<23>), + MAP(imm_v256_shl_n_byte<24>), + MAP(imm_v256_shl_n_byte<25>), + MAP(imm_v256_shl_n_byte<26>), + MAP(imm_v256_shl_n_byte<27>), + MAP(imm_v256_shl_n_byte<28>), + MAP(imm_v256_shl_n_byte<29>), + MAP(imm_v256_shl_n_byte<30>), + MAP(imm_v256_shl_n_byte<31>), + MAP(imm_v256_shl_n_8<1>), + MAP(imm_v256_shl_n_8<2>), + MAP(imm_v256_shl_n_8<3>), + MAP(imm_v256_shl_n_8<4>), + MAP(imm_v256_shl_n_8<5>), + MAP(imm_v256_shl_n_8<6>), + MAP(imm_v256_shl_n_8<7>), + MAP(imm_v256_shr_n_u8<1>), + MAP(imm_v256_shr_n_u8<2>), + MAP(imm_v256_shr_n_u8<3>), + MAP(imm_v256_shr_n_u8<4>), + MAP(imm_v256_shr_n_u8<5>), + MAP(imm_v256_shr_n_u8<6>), + MAP(imm_v256_shr_n_u8<7>), + MAP(imm_v256_shr_n_s8<1>), + MAP(imm_v256_shr_n_s8<2>), + MAP(imm_v256_shr_n_s8<3>), + MAP(imm_v256_shr_n_s8<4>), + MAP(imm_v256_shr_n_s8<5>), + MAP(imm_v256_shr_n_s8<6>), + MAP(imm_v256_shr_n_s8<7>), + MAP(imm_v256_shl_n_16<1>), + MAP(imm_v256_shl_n_16<2>), + MAP(imm_v256_shl_n_16<4>), + MAP(imm_v256_shl_n_16<6>), + MAP(imm_v256_shl_n_16<8>), + MAP(imm_v256_shl_n_16<10>), + MAP(imm_v256_shl_n_16<12>), + MAP(imm_v256_shl_n_16<14>), + MAP(imm_v256_shr_n_u16<1>), + MAP(imm_v256_shr_n_u16<2>), + MAP(imm_v256_shr_n_u16<4>), + MAP(imm_v256_shr_n_u16<6>), + MAP(imm_v256_shr_n_u16<8>), + MAP(imm_v256_shr_n_u16<10>), + MAP(imm_v256_shr_n_u16<12>), + MAP(imm_v256_shr_n_u16<14>), + MAP(imm_v256_shr_n_s16<1>), + MAP(imm_v256_shr_n_s16<2>), + MAP(imm_v256_shr_n_s16<4>), + MAP(imm_v256_shr_n_s16<6>), + MAP(imm_v256_shr_n_s16<8>), + MAP(imm_v256_shr_n_s16<10>), + MAP(imm_v256_shr_n_s16<12>), + MAP(imm_v256_shr_n_s16<14>), + MAP(imm_v256_shl_n_32<1>), + MAP(imm_v256_shl_n_32<4>), + MAP(imm_v256_shl_n_32<8>), + MAP(imm_v256_shl_n_32<12>), + MAP(imm_v256_shl_n_32<16>), + MAP(imm_v256_shl_n_32<20>), + MAP(imm_v256_shl_n_32<24>), + MAP(imm_v256_shl_n_32<28>), + MAP(imm_v256_shr_n_u32<1>), + MAP(imm_v256_shr_n_u32<4>), + MAP(imm_v256_shr_n_u32<8>), + MAP(imm_v256_shr_n_u32<12>), + MAP(imm_v256_shr_n_u32<16>), + MAP(imm_v256_shr_n_u32<20>), + MAP(imm_v256_shr_n_u32<24>), + 
MAP(imm_v256_shr_n_u32<28>), + MAP(imm_v256_shr_n_s32<1>), + MAP(imm_v256_shr_n_s32<4>), + MAP(imm_v256_shr_n_s32<8>), + MAP(imm_v256_shr_n_s32<12>), + MAP(imm_v256_shr_n_s32<16>), + MAP(imm_v256_shr_n_s32<20>), + MAP(imm_v256_shr_n_s32<24>), + MAP(imm_v256_shr_n_s32<28>), + MAP(imm_v256_shl_n_64<1>), + MAP(imm_v256_shl_n_64<4>), + MAP(imm_v256_shl_n_64<8>), + MAP(imm_v256_shl_n_64<12>), + MAP(imm_v256_shl_n_64<16>), + MAP(imm_v256_shl_n_64<20>), + MAP(imm_v256_shl_n_64<24>), + MAP(imm_v256_shl_n_64<28>), + MAP(imm_v256_shl_n_64<32>), + MAP(imm_v256_shl_n_64<36>), + MAP(imm_v256_shl_n_64<40>), + MAP(imm_v256_shl_n_64<44>), + MAP(imm_v256_shl_n_64<48>), + MAP(imm_v256_shl_n_64<52>), + MAP(imm_v256_shl_n_64<56>), + MAP(imm_v256_shl_n_64<60>), + MAP(imm_v256_shr_n_u64<1>), + MAP(imm_v256_shr_n_u64<4>), + MAP(imm_v256_shr_n_u64<8>), + MAP(imm_v256_shr_n_u64<12>), + MAP(imm_v256_shr_n_u64<16>), + MAP(imm_v256_shr_n_u64<20>), + MAP(imm_v256_shr_n_u64<24>), + MAP(imm_v256_shr_n_u64<28>), + MAP(imm_v256_shr_n_u64<32>), + MAP(imm_v256_shr_n_u64<36>), + MAP(imm_v256_shr_n_u64<40>), + MAP(imm_v256_shr_n_u64<44>), + MAP(imm_v256_shr_n_u64<48>), + MAP(imm_v256_shr_n_u64<52>), + MAP(imm_v256_shr_n_u64<56>), + MAP(imm_v256_shr_n_u64<60>), + MAP(imm_v256_shr_n_s64<1>), + MAP(imm_v256_shr_n_s64<4>), + MAP(imm_v256_shr_n_s64<8>), + MAP(imm_v256_shr_n_s64<12>), + MAP(imm_v256_shr_n_s64<16>), + MAP(imm_v256_shr_n_s64<20>), + MAP(imm_v256_shr_n_s64<24>), + MAP(imm_v256_shr_n_s64<28>), + MAP(imm_v256_shr_n_s64<32>), + MAP(imm_v256_shr_n_s64<36>), + MAP(imm_v256_shr_n_s64<40>), + MAP(imm_v256_shr_n_s64<44>), + MAP(imm_v256_shr_n_s64<48>), + MAP(imm_v256_shr_n_s64<52>), + MAP(imm_v256_shr_n_s64<56>), + MAP(imm_v256_shr_n_s64<60>), + MAP(v256_movemask_8), + MAP(v256_zero), + MAP(v256_dup_8), + MAP(v256_dup_16), + MAP(v256_dup_32), + MAP(v256_dup_64), + MAP(v256_low_u32), + MAP(v256_low_v64), + MAP(v256_from_64), + MAP(v256_from_v64), + MAP(v256_ziplo_128), + MAP(v256_ziphi_128), + MAP(v256_unpacklo_u8_s16), + MAP(v256_unpackhi_u8_s16), + MAP(v256_unpacklo_s8_s16), + MAP(v256_unpackhi_s8_s16), + MAP(v256_blend_8), + { NULL, NULL, NULL } }; +#undef MAP + +// Map reference functions to machine tuned functions. Since the +// functions depend on machine tuned types, the non-machine tuned +// instantiations of the test can't refer to these functions directly, +// so we refer to them by name and do the mapping here. +void Map(const char *name, fptr *ref, fptr *simd) { + unsigned int i; + for (i = 0; m[i].name && strcmp(name, m[i].name); i++) { + } + + *ref = m[i].ref; + *simd = m[i].simd; +} + +// Used for printing errors in TestSimd1Arg, TestSimd2Args and TestSimd3Args +std::string Print(const uint8_t *a, int size) { + std::string text = "0x"; + for (int i = 0; i < size; i++) { + const uint8_t c = a[!CONFIG_BIG_ENDIAN ? 
size - 1 - i : i];
+    // Same as snprintf(..., ..., "%02x", c)
+    text += (c >> 4) + '0' + ((c >> 4) > 9) * ('a' - '0' - 10);
+    text += (c & 15) + '0' + ((c & 15) > 9) * ('a' - '0' - 10);
+  }
+
+  return text;
+}
+
+// Used in TestSimd1Arg, TestSimd2Args and TestSimd3Args to restrict argument
+// ranges
+void SetMask(uint8_t *s, int size, uint32_t mask, uint32_t maskwidth) {
+  switch (maskwidth) {
+    case 0: {
+      break;
+    }
+    case 8: {
+      for (int i = 0; i < size; i++) s[i] &= mask;
+      break;
+    }
+    case 16: {
+      uint16_t *t = reinterpret_cast<uint16_t *>(s);
+      assert(!(reinterpret_cast<uintptr_t>(s) & 1));
+      for (int i = 0; i < size / 2; i++) t[i] &= mask;
+      break;
+    }
+    case 32: {
+      uint32_t *t = reinterpret_cast<uint32_t *>(s);
+      assert(!(reinterpret_cast<uintptr_t>(s) & 3));
+      for (int i = 0; i < size / 4; i++) t[i] &= mask;
+      break;
+    }
+    case 64: {
+      uint64_t *t = reinterpret_cast<uint64_t *>(s);
+      assert(!(reinterpret_cast<uintptr_t>(s) & 7));
+      for (int i = 0; i < size / 8; i++) t[i] &= mask;
+      break;
+    }
+    default: {
+      FAIL() << "Unsupported mask width";
+      break;
+    }
+  }
+}
+
+// We need some extra load/store functions
+void u64_store_aligned(void *p, uint64_t a) {
+  v64_store_aligned(p, v64_from_64(a));
+}
+void s32_store_aligned(void *p, int32_t a) {
+  u32_store_aligned(p, static_cast<uint32_t>(a));
+}
+void s64_store_aligned(void *p, int64_t a) {
+  v64_store_aligned(p, v64_from_64(static_cast<uint64_t>(a)));
+}
+
+void c_u64_store_aligned(void *p, uint64_t a) {
+  c_v64_store_aligned(p, c_v64_from_64(a));
+}
+
+void c_s32_store_aligned(void *p, int32_t a) {
+  c_u32_store_aligned(p, static_cast<uint32_t>(a));
+}
+
+void c_s64_store_aligned(void *p, int64_t a) {
+  c_v64_store_aligned(p, c_v64_from_64(static_cast<uint64_t>(a)));
+}
+
+uint64_t u64_load_aligned(const void *p) {
+  return v64_u64(v64_load_aligned(p));
+}
+uint16_t u16_load_aligned(const void *p) {
+  return *(reinterpret_cast<const uint16_t *>(p));
+}
+uint8_t u8_load_aligned(const void *p) {
+  return *(reinterpret_cast<const uint8_t *>(p));
+}
+
+uint64_t c_u64_load_aligned(const void *p) {
+  return c_v64_u64(c_v64_load_aligned(p));
+}
+uint16_t c_u16_load_aligned(const void *p) {
+  return *(reinterpret_cast<const uint16_t *>(p));
+}
+uint8_t c_u8_load_aligned(const void *p) {
+  return *(reinterpret_cast<const uint8_t *>(p));
+}
+
+// CompareSimd1Arg, CompareSimd2Args and CompareSimd3Args compare
+// intrinsics taking 1, 2 or 3 arguments respectively with their
+// corresponding C reference. Ideally, the loads and stores should
+// have gone into the template parameter list, but v64 and v128 could
+// be typedef'ed to the same type (which is the case on x86) and then
+// we can't instantiate both v64 and v128, so the function return and
+// argument types, including the always differing types in the C
+// equivalent, are used instead. The function arguments must be void
+// pointers and then go through a cast to avoid matching errors in the
+// branches eliminated by the typeid tests in the calling function.
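The cast-back rule this comment relies on can be shown in isolation: C++ permits converting a function pointer to a different function-pointer type and back, and only a call through the original type is well defined. A standalone sketch, with illustrative names that are not part of the patch:

    // fptr is the same kind of type-erased pointer the harness passes around.
    #include <cstdio>

    typedef void (*fptr)(void);

    static int add_one(int x) { return x + 1; }

    int main() {
      fptr erased = reinterpret_cast<fptr>(add_one);  // erase the signature
      int (*typed)(int) = reinterpret_cast<int (*)(int)>(erased);  // restore it
      std::printf("%d\n", typed(41));  // prints 42; calling 'erased' itself is UB
      return 0;
    }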
+template <typename Ret, typename Arg, typename CRet, typename CArg>
+int CompareSimd1Arg(fptr store, fptr load, fptr simd, void *d, fptr c_store,
+                    fptr c_load, fptr c_simd, void *ref_d, const void *a) {
+  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
+  Arg (*const my_load)(const void *) = (Arg(*const)(const void *))load;
+  Ret (*const my_simd)(Arg) = (Ret(*const)(Arg))simd;
+  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
+  CArg (*const my_c_load)(const void *) = (CArg(*const)(const void *))c_load;
+  CRet (*const my_c_simd)(CArg) = (CRet(*const)(CArg))c_simd;
+
+  // Call reference and intrinsic
+  my_c_store(ref_d, my_c_simd(my_c_load(a)));
+  my_store(d, my_simd(my_load(a)));
+
+  // Compare results
+  return memcmp(ref_d, d, sizeof(CRet));
+}
+
+template <typename Ret, typename Arg1, typename Arg2, typename CRet,
+          typename CArg1, typename CArg2>
+int CompareSimd2Args(fptr store, fptr load1, fptr load2, fptr simd, void *d,
+                     fptr c_store, fptr c_load1, fptr c_load2, fptr c_simd,
+                     void *ref_d, const void *a, const void *b) {
+  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
+  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
+  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
+  Ret (*const my_simd)(Arg1, Arg2) = (Ret(*const)(Arg1, Arg2))simd;
+  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
+  CArg1 (*const my_c_load1)(const void *) =
+      (CArg1(*const)(const void *))c_load1;
+  CArg2 (*const my_c_load2)(const void *) =
+      (CArg2(*const)(const void *))c_load2;
+  CRet (*const my_c_simd)(CArg1, CArg2) = (CRet(*const)(CArg1, CArg2))c_simd;
+
+  // Call reference and intrinsic
+  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b)));
+  my_store(d, my_simd(my_load1(a), my_load2(b)));
+
+  // Compare results
+  return memcmp(ref_d, d, sizeof(CRet));
+}
+
+template <typename Ret, typename Arg1, typename Arg2, typename Arg3,
+          typename CRet, typename CArg1, typename CArg2, typename CArg3>
+int CompareSimd3Args(fptr store, fptr load1, fptr load2, fptr load3, fptr simd,
+                     void *d, fptr c_store, fptr c_load1, fptr c_load2,
+                     fptr c_load3, fptr c_simd, void *ref_d, const void *a,
+                     const void *b, const void *c) {
+  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
+  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
+  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
+  Arg3 (*const my_load3)(const void *) = (Arg3(*const)(const void *))load3;
+  Ret (*const my_simd)(Arg1, Arg2, Arg3) = (Ret(*const)(Arg1, Arg2, Arg3))simd;
+  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
+  CArg1 (*const my_c_load1)(const void *) =
+      (CArg1(*const)(const void *))c_load1;
+  CArg2 (*const my_c_load2)(const void *) =
+      (CArg2(*const)(const void *))c_load2;
+  CArg3 (*const my_c_load3)(const void *) =
+      (CArg3(*const)(const void *))c_load3;
+  CRet (*const my_c_simd)(CArg1, CArg2, CArg3) =
+      (CRet(*const)(CArg1, CArg2, CArg3))c_simd;
+
+  // Call reference and intrinsic
+  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b), my_c_load3(c)));
+  my_store(d, my_simd(my_load1(a), my_load2(b), my_load3(c)));
+
+  // Compare results
+  return memcmp(ref_d, d, sizeof(CRet));
+}
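What the TestSimd* drivers below do with these comparators is plain differential testing: feed both implementations identical pseudo-random inputs and memcmp the results. A self-contained sketch of that loop shape, assuming a hypothetical ref/fast pair and the GCC/Clang builtin __builtin_popcount; none of these names come from the patch:

    #include <cstdint>
    #include <cstdio>

    static uint32_t ref_popcnt(uint32_t x) {  // simple, obviously-correct reference
      uint32_t n = 0;
      while (x) { n += x & 1; x >>= 1; }
      return n;
    }
    static uint32_t fast_popcnt(uint32_t x) { return __builtin_popcount(x); }

    int main() {
      uint32_t seed = 1;
      for (int i = 0; i < 65536; i++) {
        seed = seed * 1664525u + 1013904223u;  // LCG stand-in for ACMRandom
        if (ref_popcnt(seed) != fast_popcnt(seed)) {
          std::printf("mismatch at 0x%08x\n", seed);
          return 1;
        }
      }
      return 0;
    }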
+} // namespace
+
+template <typename CRet, typename CArg>
+void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                  const char *name) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  fptr ref_simd;
+  fptr simd;
+  int error = 0;
+  DECLARE_ALIGNED(32, uint8_t, s[32]);
+  DECLARE_ALIGNED(32, uint8_t, d[32]);
+  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+  assert(sizeof(CArg) <= 32 && sizeof(CRet) <= 32);
+  memset(ref_d, 0, sizeof(ref_d));
+  memset(d, 0, sizeof(d));
+
+  Map(name, &ref_simd, &simd);
+  if (simd == NULL || ref_simd == NULL) {
+    FAIL() << "Internal error: Unknown intrinsic function " << name;
+  }
+  for (unsigned int count = 0;
+       count < iterations && !error && !testing::Test::HasFailure(); count++) {
+    for (unsigned int c = 0; c < sizeof(CArg); c++) s[c] = rnd.Rand8();
+
+    if (maskwidth) {
+      SetMask(s, sizeof(CArg), mask, maskwidth);
+    }
+
+    if (typeid(CRet) == typeid(c_v64) && typeid(CArg) == typeid(c_v64)) {
+      // V64_V64
+      error = CompareSimd1Arg<v64, v64, CRet, CArg>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg) == typeid(uint8_t)) {
+      // V64_U8
+      error = CompareSimd1Arg<v64, uint8_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg) == typeid(uint16_t)) {
+      // V64_U16
+      error = CompareSimd1Arg<v64, uint16_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg) == typeid(uint32_t)) {
+      // V64_U32
+      error = CompareSimd1Arg<v64, uint32_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg) == typeid(c_v64)) {
+      // U64_V64
+      error = CompareSimd1Arg<uint64_t, v64, CRet, CArg>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(int64_t) &&
+               typeid(CArg) == typeid(c_v64)) {
+      // S64_V64
+      error = CompareSimd1Arg<int64_t, v64, CRet, CArg>(
+          reinterpret_cast<fptr>(s64_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_s64_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(uint32_t) &&
+               typeid(CArg) == typeid(c_v64)) {
+      // U32_V64
+      error = CompareSimd1Arg<uint32_t, v64, CRet, CArg>(
+          reinterpret_cast<fptr>(u32_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u32_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(int32_t) &&
+               typeid(CArg) == typeid(c_v64)) {
+      // S32_V64
+      error = CompareSimd1Arg<int32_t, v64, CRet, CArg>(
+          reinterpret_cast<fptr>(s32_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_s32_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(uint32_t) &&
+               typeid(CArg) == typeid(c_v128)) {
+      // U32_V128
+      error = CompareSimd1Arg<uint32_t, v128, CRet, CArg>(
+          reinterpret_cast<fptr>(u32_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u32_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg) == typeid(c_v128)) {
+      // U64_V128
+      error = CompareSimd1Arg<uint64_t, v128, CRet, CArg>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg) == typeid(c_v256)) {
+      // U64_V256
+      error = CompareSimd1Arg<uint64_t, v256, CRet, CArg>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg) == typeid(c_v128)) {
+      // V64_V128
+      error = CompareSimd1Arg<v64, v128, CRet, CArg>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(c_v128)) {
+      // V128_V128
+      error = CompareSimd1Arg<v128, v128, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(c_v64)) {
+      // V128_V64
+      error = CompareSimd1Arg<v128, v64, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(uint8_t)) {
+      // V128_U8
+      error = CompareSimd1Arg<v128, uint8_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(uint16_t)) {
+      // V128_U16
+      error = CompareSimd1Arg<v128, uint16_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(uint32_t)) {
+      // V128_U32
+      error = CompareSimd1Arg<v128, uint32_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(uint64_t)) {
+      // V128_U64
+      error = CompareSimd1Arg<v128, uint64_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(c_v256)) {
+      // V256_V256
+      error = CompareSimd1Arg<v256, v256, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(c_v128)) {
+      // V256_V128
+      error = CompareSimd1Arg<v256, v128, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(uint8_t)) {
+      // V256_U8
+      error = CompareSimd1Arg<v256, uint8_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(uint16_t)) {
+      // V256_U16
+      error = CompareSimd1Arg<v256, uint16_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(uint32_t)) {
+      // V256_U32
+      error = CompareSimd1Arg<v256, uint32_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(uint64_t)) {
+      // V256_U64
+      error = CompareSimd1Arg<v256, uint64_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(uint32_t) &&
+               typeid(CArg) == typeid(c_v256)) {
+      // U32_V256
+      error = CompareSimd1Arg<uint32_t, v256, CRet, CArg>(
+          reinterpret_cast<fptr>(u32_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u32_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg) == typeid(c_v256)) {
+      // V64_V256
+      error = CompareSimd1Arg<v64, v256, CRet, CArg>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
+    } else {
+      FAIL() << "Internal error: Unknown intrinsic function "
+             << typeid(CRet).name() << " " << name << "(" << typeid(CArg).name()
+             << ")";
+    }
+  }
+
+  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+                      << Print(s, sizeof(CArg)) << ") -> "
+                      << Print(d, sizeof(CRet)) << " (simd), "
+                      << Print(ref_d, sizeof(CRet)) << " (ref)";
+}
+
+template <typename CRet, typename CArg1, typename CArg2>
+void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  fptr ref_simd;
+  fptr simd;
+  int error = 0;
+  DECLARE_ALIGNED(32, uint8_t, s1[32]);
+  DECLARE_ALIGNED(32, uint8_t, s2[32]);
+  DECLARE_ALIGNED(32, uint8_t, d[32]);
+  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CRet) <= 32);
+  memset(ref_d, 0, sizeof(ref_d));
+  memset(d, 0, sizeof(d));
+
+  Map(name, &ref_simd, &simd);
+  if (simd == NULL || ref_simd == NULL) {
+    FAIL() << "Internal error: Unknown intrinsic function " << name;
+  }
+
+  for (unsigned int count = 0;
+       count < iterations && !error && !testing::Test::HasFailure(); count++) {
+    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
+
+    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
+
+    if (maskwidth) SetMask(s2, sizeof(CArg2), mask, maskwidth);
+
+    if (typeid(CRet) == typeid(c_v64) && typeid(CArg1) == typeid(c_v64) &&
+        typeid(CArg2) == typeid(c_v64)) {
+      // V64_V64V64
+      error = CompareSimd2Args<v64, v64, v64, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg1) == typeid(uint32_t) &&
+               typeid(CArg2) == typeid(uint32_t)) {
+      // V64_U32U32
+      error = CompareSimd2Args<v64, uint32_t, uint32_t, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint32_t) &&
+               typeid(CArg1) == typeid(c_v64) &&
+               typeid(CArg2) == typeid(c_v64)) {
+      // U32_V64V64
+      error = CompareSimd2Args<uint32_t, v64, v64, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u32_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u32_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(int64_t) &&
+               typeid(CArg1) == typeid(c_v64) &&
+               typeid(CArg2) == typeid(c_v64)) {
+      // S64_V64V64
+      error = CompareSimd2Args<int64_t, v64, v64, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(s64_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_s64_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v64) &&
+               typeid(CArg1) == typeid(c_v64) &&
+               typeid(CArg2) == typeid(uint32_t)) {
+      // V64_V64U32
+      error = CompareSimd2Args<v64, v64, uint32_t, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v64_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v64_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // V128_V128V128
+      error = CompareSimd2Args<v128, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint32_t) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // U32_V128V128
+      error = CompareSimd2Args<uint32_t, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u32_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u32_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // U64_V128V128
+      error = CompareSimd2Args<uint64_t, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(int64_t) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // S64_V128V128
+      error = CompareSimd2Args<int64_t, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(s64_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_s64_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg1) == typeid(uint64_t) &&
+               typeid(CArg2) == typeid(uint64_t)) {
+      // V128_U64U64
+      error = CompareSimd2Args<v128, uint64_t, uint64_t, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg1) == typeid(c_v64) &&
+               typeid(CArg2) == typeid(c_v64)) {
+      // V128_V64V64
+      error = CompareSimd2Args<v128, v64, v64, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned),
+          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(c_v64_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(uint32_t)) {
+      // V128_V128U32
+      error = CompareSimd2Args<v128, v128, uint32_t, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256)) {
+      // V256_V256V256
+      error = CompareSimd2Args<v256, v256, v256, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256)) {
+      // U64_V256V256
+      error = CompareSimd2Args<uint64_t, v256, v256, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(int64_t) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256)) {
+      // S64_V256V256
+      error = CompareSimd2Args<int64_t, v256, v256, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(s64_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_s64_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint32_t) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256)) {
+      // U32_V256V256
+      error = CompareSimd2Args<uint32_t, v256, v256, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u32_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u32_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // V256_V128V128
+      error = CompareSimd2Args<v256, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(uint32_t)) {
+      // V256_V256U32
+      error = CompareSimd2Args<v256, v256, uint32_t, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_u32_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else {
+      FAIL() << "Internal error: Unknown intrinsic function "
+             << typeid(CRet).name() << " " << name << "("
+             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ")";
+    }
+  }
+
+  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+                      << Print(s1, sizeof(CArg1)) << ", "
+                      << Print(s2, sizeof(CArg2)) << ") -> "
+                      << Print(d, sizeof(CRet)) << " (simd), "
+                      << Print(ref_d, sizeof(CRet)) << " (ref)";
+}
+
+template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  fptr ref_simd;
+  fptr simd;
+  int error = 0;
+  DECLARE_ALIGNED(32, uint8_t, s1[32]);
+  DECLARE_ALIGNED(32, uint8_t, s2[32]);
+  DECLARE_ALIGNED(32, uint8_t, s3[32]);
+  DECLARE_ALIGNED(32, uint8_t, d[32]);
+  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
+  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CArg3) <= 32 &&
+         sizeof(CRet) <= 32);
+  memset(ref_d, 0, sizeof(ref_d));
+  memset(d, 0, sizeof(d));
+
+  Map(name, &ref_simd, &simd);
+  if (simd == NULL || ref_simd == NULL) {
+    FAIL() << "Internal error: Unknown intrinsic function " << name;
+  }
+
+  for (unsigned int count = 0;
+       count < iterations && !error && !testing::Test::HasFailure(); count++) {
+    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
+
+    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
+
+    for (unsigned int c = 0; c < sizeof(CArg3); c++) s3[c] = rnd.Rand8();
+
+    if (maskwidth) SetMask(s3, sizeof(CArg3), mask, maskwidth);
+
+    if (typeid(CRet) == typeid(c_v128) && typeid(CArg1) == typeid(c_v128) &&
+        typeid(CArg2) == typeid(c_v128) && typeid(CArg3) == typeid(c_v128)) {
+      // V128_V128V128V128
+      error =
+          CompareSimd3Args<v128, v128, v128, v128, CRet, CArg1, CArg2, CArg3>(
+              reinterpret_cast<fptr>(v128_store_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+              reinterpret_cast<fptr>(c_v128_store_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256) &&
+               typeid(CArg3) == typeid(c_v256)) {
+      // V256_V256V256V256
+      error =
+          CompareSimd3Args<v256, v256, v256, v256, CRet, CArg1, CArg2, CArg3>(
+              reinterpret_cast<fptr>(v256_store_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+              reinterpret_cast<fptr>(c_v256_store_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+    } else {
+      FAIL() << "Internal error: Unknown intrinsic function "
+             << typeid(CRet).name() << " " << name << "("
+             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ", "
+             << typeid(CArg3).name() << ")";
+    }
+  }
+
+  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+                      << Print(s1, sizeof(CArg1)) << ", "
+                      << Print(s2, sizeof(CArg2)) << ", "
+                      << Print(s3, sizeof(CArg3)) << ") -> "
+                      << Print(d, sizeof(CRet)) << " (simd), "
+                      << Print(ref_d, sizeof(CRet)) << " (ref)";
+}
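The explicit instantiations that follow are needed because the template bodies above live only in this translation unit, while the per-arch test files call them through bare declarations; without a matching "template void TestSimd...<...>(...);" the linker would find no symbol. A miniature of the same split, with hypothetical names (the two halves are shown in one block and also compile as a single translation unit):

    // impl.cc -- definition plus explicit instantiation, emitted into impl.o
    template <typename T> T twice(T v) { return v + v; }
    template int twice<int>(int);  // forces code generation for T = int

    // user.cc -- declaration only; the call resolves against impl.o at link time
    template <typename T> T twice(T);
    int use() { return twice<int>(21); }  // returns 42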
+// Instantiations to make the functions callable from another file
+template void TestSimd1Arg<c_v64, uint8_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v64, uint16_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v64, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v64, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<uint32_t, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<int32_t, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<uint64_t, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<int64_t, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v64, uint32_t, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v64, c_v64, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<int64_t, c_v64, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v64, c_v64, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<uint32_t, c_v64, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v128, uint8_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v128, uint16_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v128, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v128, uint64_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v128, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<uint32_t, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<uint64_t, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v64, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v128, c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v128, uint64_t, uint64_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v128, c_v64, c_v64>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v128, c_v128, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<uint32_t, c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<uint64_t, c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<int64_t, c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v256, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v256, uint8_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v256, uint16_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v256, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v256, uint64_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v256, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<uint32_t, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<uint64_t, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd1Arg<c_v64, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v256, c_v256, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v256, c_v128, c_v128>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<c_v256, c_v256, uint32_t>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<uint32_t, c_v256, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<uint64_t, c_v256, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd2Args<int64_t, c_v256, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+template void TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(uint32_t, uint32_t, uint32_t, const char *);
+
+} // namespace SIMD_NAMESPACE
diff --git a/libs/libaom/src/test/simd_cmp_neon.cc b/libs/libaom/src/test/simd_cmp_neon.cc
new file mode 100644
index 000000000..53c1e2a07
--- /dev/null
+++ b/libs/libaom/src/test/simd_cmp_neon.cc
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if defined(__OPTIMIZE__) && __OPTIMIZE__
+#define ARCH NEON
+#define ARCH_POSTFIX(name) name##_neon
+#define SIMD_NAMESPACE simd_test_neon
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/libs/libaom/src/test/simd_cmp_sse2.cc b/libs/libaom/src/test/simd_cmp_sse2.cc
new file mode 100644
index 000000000..f7827a7fa
--- /dev/null
+++ b/libs/libaom/src/test/simd_cmp_sse2.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+    (!defined(__GNUC__) && !defined(_DEBUG))
+#define ARCH SSE2
+#define ARCH_POSTFIX(name) name##_sse2
+#define SIMD_NAMESPACE simd_test_sse2
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/libs/libaom/src/test/simd_cmp_sse4.cc b/libs/libaom/src/test/simd_cmp_sse4.cc
new file mode 100644
index 000000000..3566764b6
--- /dev/null
+++ b/libs/libaom/src/test/simd_cmp_sse4.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+    (!defined(__GNUC__) && !defined(_DEBUG))
+#define ARCH SSE4_1
+#define ARCH_POSTFIX(name) name##_sse4_1
+#define SIMD_NAMESPACE simd_test_sse4_1
+#include "test/simd_cmp_impl.h"
+#endif
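Each of these per-arch .cc files is a thin wrapper: it selects the flavor through three macros and re-includes the shared implementation, so the same tests land in simd_test_neon, simd_test_sse2, and so on, each with an arch suffix pasted onto the test-case names; the __OPTIMIZE__/_DEBUG guards skip the tests in unoptimized builds, where the C intrinsic emulation is too slow. The token-pasting trick in miniature, with a hypothetical function name:

    #include <cstdio>

    #define ARCH_POSTFIX(name) name##_sse2
    #define SIMD_NAMESPACE simd_test_sse2

    namespace SIMD_NAMESPACE {
    static const char *ARCH_POSTFIX(tag)(void) { return "sse2"; }  // defines tag_sse2
    }

    int main() {
      std::printf("%s\n", simd_test_sse2::tag_sse2());
      return 0;
    }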
diff --git a/libs/libaom/src/test/simd_cmp_ssse3.cc b/libs/libaom/src/test/simd_cmp_ssse3.cc
new file mode 100644
index 000000000..57bf135dd
--- /dev/null
+++ b/libs/libaom/src/test/simd_cmp_ssse3.cc
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \
+    (!defined(__GNUC__) && !defined(_DEBUG))
+#define ARCH SSSE3
+#define ARCH_POSTFIX(name) name##_ssse3
+#define SIMD_NAMESPACE simd_test_ssse3
+#include "test/simd_cmp_impl.h"
+#endif
diff --git a/libs/libaom/src/test/simd_impl.h b/libs/libaom/src/test/simd_impl.h
new file mode 100644
index 000000000..61fda009f
--- /dev/null
+++ b/libs/libaom/src/test/simd_impl.h
@@ -0,0 +1,1143 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#define SIMD_CHECK 1
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "aom_dsp/aom_simd_inline.h"
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+
+namespace SIMD_NAMESPACE {
+
+template <typename param_signature>
+class TestIntrinsic : public ::testing::TestWithParam<param_signature> {
+ public:
+  virtual ~TestIntrinsic() {}
+  virtual void SetUp() {
+    mask = std::get<0>(this->GetParam());
+    maskwidth = std::get<1>(this->GetParam());
+    name = std::get<2>(this->GetParam());
+  }
+
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+  uint32_t mask, maskwidth;
+  const char *name;
+};
+
+// Create one typedef for each function signature
+#define TYPEDEF_SIMD(name)                                             \
+  typedef TestIntrinsic<std::tuple<uint32_t, uint32_t, const char *> > \
+      ARCH_POSTFIX(name)
+
+TYPEDEF_SIMD(V64_U8);
+TYPEDEF_SIMD(V64_U16);
+TYPEDEF_SIMD(V64_U32);
+TYPEDEF_SIMD(V64_V64);
+TYPEDEF_SIMD(U32_V64);
+TYPEDEF_SIMD(S32_V64);
+TYPEDEF_SIMD(U64_V64);
+TYPEDEF_SIMD(S64_V64);
+TYPEDEF_SIMD(V64_U32U32);
+TYPEDEF_SIMD(V64_V64V64);
+TYPEDEF_SIMD(S64_V64V64);
+TYPEDEF_SIMD(V64_V64U32);
+TYPEDEF_SIMD(U32_V64V64);
+TYPEDEF_SIMD(V128_V64);
+TYPEDEF_SIMD(V128_V128);
+TYPEDEF_SIMD(U32_V128);
+TYPEDEF_SIMD(U64_V128);
+TYPEDEF_SIMD(V64_V128);
+TYPEDEF_SIMD(V128_U8);
+TYPEDEF_SIMD(V128_U16);
+TYPEDEF_SIMD(V128_U32);
+TYPEDEF_SIMD(V128_U64);
+TYPEDEF_SIMD(V128_U64U64);
+TYPEDEF_SIMD(V128_V64V64);
+TYPEDEF_SIMD(V128_V128V128);
+TYPEDEF_SIMD(V128_V128V128V128);
+TYPEDEF_SIMD(S64_V128V128);
+TYPEDEF_SIMD(V128_V128U32);
+TYPEDEF_SIMD(U32_V128V128);
+TYPEDEF_SIMD(U64_V128V128);
+TYPEDEF_SIMD(V256_V128);
+TYPEDEF_SIMD(V256_V256);
+TYPEDEF_SIMD(U64_V256);
+TYPEDEF_SIMD(V256_V128V128);
+TYPEDEF_SIMD(V256_V256V256);
+TYPEDEF_SIMD(V256_V256V256V256);
+TYPEDEF_SIMD(U64_V256V256);
+TYPEDEF_SIMD(S64_V256V256);
+TYPEDEF_SIMD(V256_V256U32);
+TYPEDEF_SIMD(U32_V256V256);
+TYPEDEF_SIMD(V256_U8);
+TYPEDEF_SIMD(V256_U16);
+TYPEDEF_SIMD(V256_U32);
+TYPEDEF_SIMD(V256_U64);
+TYPEDEF_SIMD(U32_V256);
+TYPEDEF_SIMD(V64_V256);
+
+// Google Test allows up to 50 tests per case, so split the largest
+typedef ARCH_POSTFIX(V64_V64) ARCH_POSTFIX(V64_V64_Part2);
+typedef ARCH_POSTFIX(V64_V64V64) ARCH_POSTFIX(V64_V64V64_Part2);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part2);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part3);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part4);
+typedef ARCH_POSTFIX(V128_V128V128) ARCH_POSTFIX(V128_V128V128_Part2);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part2);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part3);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part4);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part5);
+typedef ARCH_POSTFIX(V256_V256V256) ARCH_POSTFIX(V256_V256V256_Part2);
+
+// These functions are machine tuned and located elsewhere
+template <typename CRet, typename CArg>
+void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                  const char *name);
+
+template <typename CRet, typename CArg1, typename CArg2>
+void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name);
+
+template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name);
+
+const int kIterations = 65536;
+
+// Add a macro layer since TEST_P will quote the name so we need to
+// expand it first with the prefix.
+#define MY_TEST_P(name, test) TEST_P(name, test)
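The reason for the extra hop: the preprocessor applies # and ## to macro arguments before expanding them, so TEST_P on its own would register the literal token ARCH_POSTFIX(...) as the test name. A compilable sketch of the difference, using hypothetical macros:

    #include <cstdio>

    #define POSTFIX(name) name##_sse2
    #define DIRECT(name) #name           // stringizes the unexpanded argument
    #define INDIRECT(name) DIRECT(name)  // expands first, then stringizes

    int main() {
      std::printf("%s vs %s\n", DIRECT(POSTFIX(V64_U8)), INDIRECT(POSTFIX(V64_U8)));
      // prints: POSTFIX(V64_U8) vs V64_U8_sse2
      return 0;
    }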
+
+MY_TEST_P(ARCH_POSTFIX(V64_U8), TestIntrinsics) {
+  TestSimd1Arg<c_v64, uint8_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_U16), TestIntrinsics) {
+  TestSimd1Arg<c_v64, uint16_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_U32), TestIntrinsics) {
+  TestSimd1Arg<c_v64, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64), TestIntrinsics) {
+  TestSimd1Arg<c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V64), TestIntrinsics) {
+  TestSimd1Arg<uint64_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V64), TestIntrinsics) {
+  TestSimd1Arg<int64_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V64), TestIntrinsics) {
+  TestSimd1Arg<uint32_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S32_V64), TestIntrinsics) {
+  TestSimd1Arg<int32_t, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_U32U32), TestIntrinsics) {
+  TestSimd2Args<c_v64, uint32_t, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64V64), TestIntrinsics) {
+  TestSimd2Args<c_v64, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V64V64), TestIntrinsics) {
+  TestSimd2Args<int64_t, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V64V64), TestIntrinsics) {
+  TestSimd2Args<uint32_t, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64U32), TestIntrinsics) {
+  TestSimd2Args<c_v64, c_v64, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+// Google Test allows up to 50 tests per case, so split the largest
+MY_TEST_P(ARCH_POSTFIX(V64_V64_Part2), TestIntrinsics) {
+  TestSimd1Arg<c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V64V64_Part2), TestIntrinsics) {
+  TestSimd2Args<c_v64, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V128), TestIntrinsics) {
+  TestSimd1Arg<uint32_t, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V128), TestIntrinsics) {
+  TestSimd1Arg<uint64_t, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V128), TestIntrinsics) {
+  TestSimd1Arg<c_v64, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128), TestIntrinsics) {
+  TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U8), TestIntrinsics) {
+  TestSimd1Arg<c_v128, uint8_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U16), TestIntrinsics) {
+  TestSimd1Arg<c_v128, uint16_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U32), TestIntrinsics) {
+  TestSimd1Arg<c_v128, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U64), TestIntrinsics) {
+  TestSimd1Arg<c_v128, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V64), TestIntrinsics) {
+  TestSimd1Arg<c_v128, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128), TestIntrinsics) {
+  TestSimd2Args<c_v128, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128V128), TestIntrinsics) {
+  TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(kIterations, mask, maskwidth,
+                                                name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V128V128), TestIntrinsics) {
+  TestSimd2Args<uint32_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V128V128), TestIntrinsics) {
+  TestSimd2Args<uint64_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V128V128), TestIntrinsics) {
+  TestSimd2Args<int64_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_U64U64), TestIntrinsics) {
+  TestSimd2Args<c_v128, uint64_t, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V64V64), TestIntrinsics) {
+  TestSimd2Args<c_v128, c_v64, c_v64>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128U32), TestIntrinsics) {
+  TestSimd2Args<c_v128, c_v128, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128_Part2), TestIntrinsics) {
+  TestSimd2Args<c_v128, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part2), TestIntrinsics) {
+  TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part3), TestIntrinsics) {
+  TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part4), TestIntrinsics) {
+  TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V256), TestIntrinsics) {
+  TestSimd1Arg<uint64_t, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V128), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256), TestIntrinsics) {
+  TestSimd2Args<c_v256, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256V256), TestIntrinsics) {
+  TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(kIterations, mask, maskwidth,
+                                                name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V128V128), TestIntrinsics) {
+  TestSimd2Args<c_v256, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V256V256), TestIntrinsics) {
+  TestSimd2Args<uint32_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U64_V256V256), TestIntrinsics) {
+  TestSimd2Args<uint64_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(S64_V256V256), TestIntrinsics) {
+  TestSimd2Args<int64_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256_Part2), TestIntrinsics) {
+  TestSimd2Args<c_v256, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256U32), TestIntrinsics) {
+  TestSimd2Args<c_v256, c_v256, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part2), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part3), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part4), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part5), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U8), TestIntrinsics) {
+  TestSimd1Arg<c_v256, uint8_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U16), TestIntrinsics) {
+  TestSimd1Arg<c_v256, uint16_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U32), TestIntrinsics) {
+  TestSimd1Arg<c_v256, uint32_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V256_U64), TestIntrinsics) {
+  TestSimd1Arg<c_v256, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(U32_V256), TestIntrinsics) {
+  TestSimd1Arg<uint32_t, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+MY_TEST_P(ARCH_POSTFIX(V64_V256), TestIntrinsics) {
+  TestSimd1Arg<c_v64, c_v256>(kIterations, mask, maskwidth, name);
+}
+
+// Add a macro layer since INSTANTIATE_TEST_SUITE_P will quote the name
+// so we need to expand it first with the prefix
+#define INSTANTIATE(name, type, ...) \
+  INSTANTIATE_TEST_SUITE_P(name, type, ::testing::Values(__VA_ARGS__))
+
+#define SIMD_TUPLE(name, mask, maskwidth) \
+  std::make_tuple(mask, maskwidth, static_cast<const char *>(#name))
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V64V64), SIMD_TUPLE(v64_sad_u8, 0U, 0U),
+            SIMD_TUPLE(v64_ssd_u8, 0U, 0U));
+
+INSTANTIATE(
+    ARCH, ARCH_POSTFIX(V64_V64V64), SIMD_TUPLE(v64_add_8, 0U, 0U),
+    SIMD_TUPLE(v64_add_16, 0U, 0U), SIMD_TUPLE(v64_sadd_s16, 0U, 0U),
+    SIMD_TUPLE(v64_add_32, 0U, 0U), SIMD_TUPLE(v64_sub_8, 0U, 0U),
+    SIMD_TUPLE(v64_ssub_u8, 0U, 0U), SIMD_TUPLE(v64_ssub_s8, 0U, 0U),
+    SIMD_TUPLE(v64_sub_16, 0U, 0U), SIMD_TUPLE(v64_ssub_s16, 0U, 0U),
+    SIMD_TUPLE(v64_ssub_u16, 0U, 0U), SIMD_TUPLE(v64_sub_32, 0U, 0U),
+    SIMD_TUPLE(v64_ziplo_8, 0U, 0U), SIMD_TUPLE(v64_ziphi_8, 0U, 0U),
+    SIMD_TUPLE(v64_ziplo_16, 0U, 0U), SIMD_TUPLE(v64_ziphi_16, 0U, 0U),
+    SIMD_TUPLE(v64_ziplo_32, 0U, 0U), SIMD_TUPLE(v64_ziphi_32, 0U, 0U),
+    SIMD_TUPLE(v64_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v64_pack_s16_u8, 0U, 0U),
+    SIMD_TUPLE(v64_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v64_unziphi_8, 0U, 0U),
+    SIMD_TUPLE(v64_unziplo_8, 0U, 0U), SIMD_TUPLE(v64_unziphi_16, 0U, 0U),
+    SIMD_TUPLE(v64_unziplo_16, 0U, 0U), SIMD_TUPLE(v64_or, 0U, 0U),
+    SIMD_TUPLE(v64_xor, 0U, 0U), SIMD_TUPLE(v64_and, 0U, 0U),
+    SIMD_TUPLE(v64_andn, 0U, 0U), SIMD_TUPLE(v64_mullo_s16, 0U, 0U),
+    SIMD_TUPLE(v64_mulhi_s16, 0U, 0U), SIMD_TUPLE(v64_mullo_s32, 0U, 0U),
+    SIMD_TUPLE(v64_madd_s16, 0U, 0U), SIMD_TUPLE(v64_madd_us8, 0U, 0U),
+    SIMD_TUPLE(v64_avg_u8, 0U, 0U), SIMD_TUPLE(v64_rdavg_u8, 0U, 0U),
+    SIMD_TUPLE(v64_avg_u16, 0U, 0U), SIMD_TUPLE(v64_min_u8, 0U, 0U),
+    SIMD_TUPLE(v64_max_u8, 0U, 0U), SIMD_TUPLE(v64_min_s8, 0U, 0U),
+    SIMD_TUPLE(v64_max_s8, 0U, 0U), SIMD_TUPLE(v64_min_s16, 0U, 0U),
+    SIMD_TUPLE(v64_max_s16, 0U, 0U), SIMD_TUPLE(v64_cmpgt_s8, 0U, 0U),
+    SIMD_TUPLE(v64_cmplt_s8, 0U, 0U), SIMD_TUPLE(v64_cmpeq_8, 0U, 0U),
+    SIMD_TUPLE(v64_cmpgt_s16, 0U, 0U), SIMD_TUPLE(v64_cmplt_s16, 0U, 0U),
+    SIMD_TUPLE(v64_cmpeq_16, 0U, 0U));
+
+INSTANTIATE(
+    ARCH, ARCH_POSTFIX(V64_V64V64_Part2), SIMD_TUPLE(v64_shuffle_8, 7U, 8U),
+    SIMD_TUPLE(v64_pack_s32_u16, 0U, 0U), SIMD_TUPLE(v64_rdavg_u16, 0U, 0U),
+    SIMD_TUPLE(v64_sadd_s8, 0U, 0U), SIMD_TUPLE(v64_sadd_u8, 0U, 0U),
+    SIMD_TUPLE(imm_v64_align<1>, 0U, 0U), SIMD_TUPLE(imm_v64_align<2>, 0U, 0U),
+    SIMD_TUPLE(imm_v64_align<3>, 0U, 0U), SIMD_TUPLE(imm_v64_align<4>, 0U, 0U),
+    SIMD_TUPLE(imm_v64_align<5>, 0U, 0U), SIMD_TUPLE(imm_v64_align<6>, 0U, 0U),
+    SIMD_TUPLE(imm_v64_align<7>, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64), SIMD_TUPLE(v64_abs_s8, 0U, 0U),
+            SIMD_TUPLE(v64_abs_s16, 0U, 0U),
+            SIMD_TUPLE(v64_unpacklo_u8_s16, 0U, 0U),
+            SIMD_TUPLE(v64_unpackhi_u8_s16, 0U, 0U),
+            SIMD_TUPLE(v64_unpacklo_s8_s16, 0U, 0U),
SIMD_TUPLE(v64_unpackhi_s8_s16, 0U, 0U), + SIMD_TUPLE(v64_unpacklo_u16_s32, 0U, 0U), + SIMD_TUPLE(v64_unpacklo_s16_s32, 0U, 0U), + SIMD_TUPLE(v64_unpackhi_u16_s32, 0U, 0U), + SIMD_TUPLE(v64_unpackhi_s16_s32, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<3>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<5>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_byte<7>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<3>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<5>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_byte<7>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<3>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<5>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_8<7>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<3>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<5>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u8<7>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<3>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<5>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s8<7>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<8>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64_Part2), + SIMD_TUPLE(imm_v64_shl_n_16<10>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<12>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_16<14>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<8>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<10>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<12>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u16<14>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<2>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<6>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<8>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<10>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<12>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s16<14>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<8>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<12>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<16>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<20>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<24>, 0U, 0U), + SIMD_TUPLE(imm_v64_shl_n_32<28>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<8>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<12>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<16>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<20>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<24>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_u32<28>, 0U, 
0U), + SIMD_TUPLE(imm_v64_shr_n_s32<1>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<4>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<8>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<12>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<16>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<20>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<24>, 0U, 0U), + SIMD_TUPLE(imm_v64_shr_n_s32<28>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V64U32), SIMD_TUPLE(v64_shl_8, 7U, 32U), + SIMD_TUPLE(v64_shr_u8, 7U, 32U), SIMD_TUPLE(v64_shr_s8, 7U, 32U), + SIMD_TUPLE(v64_shl_16, 15U, 32U), SIMD_TUPLE(v64_shr_u16, 15U, 32U), + SIMD_TUPLE(v64_shr_s16, 15U, 32U), SIMD_TUPLE(v64_shl_32, 31U, 32U), + SIMD_TUPLE(v64_shr_u32, 31U, 32U), + SIMD_TUPLE(v64_shr_s32, 31U, 32U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V64), SIMD_TUPLE(v64_hadd_u8, 0U, 0U), + SIMD_TUPLE(v64_u64, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V64), SIMD_TUPLE(v64_hadd_s16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V64), SIMD_TUPLE(v64_low_u32, 0U, 0U), + SIMD_TUPLE(v64_high_u32, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(S32_V64), SIMD_TUPLE(v64_low_s32, 0U, 0U), + SIMD_TUPLE(v64_high_s32, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V64V64), SIMD_TUPLE(v64_dotp_s16, 0U, 0U), + SIMD_TUPLE(v64_dotp_su8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U8), SIMD_TUPLE(v64_dup_8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U16), SIMD_TUPLE(v64_dup_16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U32), SIMD_TUPLE(v64_dup_32, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U32U32), SIMD_TUPLE(v64_from_32, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128V128), SIMD_TUPLE(v128_sad_u8, 0U, 0U), + SIMD_TUPLE(v128_ssd_u8, 0U, 0U), SIMD_TUPLE(v128_sad_u16, 0U, 0U)); +INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V128V128), SIMD_TUPLE(v128_ssd_s16, 0U, 0U)); + +INSTANTIATE( + ARCH, ARCH_POSTFIX(V128_V128V128), SIMD_TUPLE(v128_add_8, 0U, 0U), + SIMD_TUPLE(v128_add_16, 0U, 0U), SIMD_TUPLE(v128_sadd_s16, 0U, 0U), + SIMD_TUPLE(v128_add_32, 0U, 0U), SIMD_TUPLE(v128_sub_8, 0U, 0U), + SIMD_TUPLE(v128_ssub_u8, 0U, 0U), SIMD_TUPLE(v128_ssub_s8, 0U, 0U), + SIMD_TUPLE(v128_sub_16, 0U, 0U), SIMD_TUPLE(v128_ssub_s16, 0U, 0U), + SIMD_TUPLE(v128_ssub_u16, 0U, 0U), SIMD_TUPLE(v128_sub_32, 0U, 0U), + SIMD_TUPLE(v128_ziplo_8, 0U, 0U), SIMD_TUPLE(v128_ziphi_8, 0U, 0U), + SIMD_TUPLE(v128_ziplo_16, 0U, 0U), SIMD_TUPLE(v128_ziphi_16, 0U, 0U), + SIMD_TUPLE(v128_ziplo_32, 0U, 0U), SIMD_TUPLE(v128_ziphi_32, 0U, 0U), + SIMD_TUPLE(v128_ziplo_64, 0U, 0U), SIMD_TUPLE(v128_ziphi_64, 0U, 0U), + SIMD_TUPLE(v128_unziphi_8, 0U, 0U), SIMD_TUPLE(v128_unziplo_8, 0U, 0U), + SIMD_TUPLE(v128_unziphi_16, 0U, 0U), SIMD_TUPLE(v128_unziplo_16, 0U, 0U), + SIMD_TUPLE(v128_unziphi_32, 0U, 0U), SIMD_TUPLE(v128_unziplo_32, 0U, 0U), + SIMD_TUPLE(v128_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v128_pack_s16_u8, 0U, 0U), + SIMD_TUPLE(v128_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v128_or, 0U, 0U), + SIMD_TUPLE(v128_xor, 0U, 0U), SIMD_TUPLE(v128_and, 0U, 0U), + SIMD_TUPLE(v128_andn, 0U, 0U), SIMD_TUPLE(v128_mullo_s16, 0U, 0U), + SIMD_TUPLE(v128_mulhi_s16, 0U, 0U), SIMD_TUPLE(v128_mullo_s32, 0U, 0U), + SIMD_TUPLE(v128_madd_s16, 0U, 0U), SIMD_TUPLE(v128_madd_us8, 0U, 0U), + SIMD_TUPLE(v128_avg_u8, 0U, 0U), SIMD_TUPLE(v128_rdavg_u8, 0U, 0U), + SIMD_TUPLE(v128_avg_u16, 0U, 0U), SIMD_TUPLE(v128_min_u8, 0U, 0U), + SIMD_TUPLE(v128_max_u8, 0U, 0U), SIMD_TUPLE(v128_min_s8, 0U, 0U), + SIMD_TUPLE(v128_max_s8, 0U, 0U), SIMD_TUPLE(v128_min_s16, 0U, 0U), + SIMD_TUPLE(v128_max_s16, 0U, 0U), SIMD_TUPLE(v128_cmpgt_s8, 0U, 0U), + SIMD_TUPLE(v128_cmplt_s8, 0U, 
0U), SIMD_TUPLE(v128_cmpeq_8, 0U, 0U), + SIMD_TUPLE(v128_cmpgt_s16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128_Part2), + SIMD_TUPLE(v128_pack_s32_u16, 0U, 0U), + SIMD_TUPLE(v128_rdavg_u16, 0U, 0U), SIMD_TUPLE(v128_add_64, 0U, 0U), + SIMD_TUPLE(v128_sub_64, 0U, 0U), SIMD_TUPLE(v128_sadd_s8, 0U, 0U), + SIMD_TUPLE(v128_sadd_u8, 0U, 0U), SIMD_TUPLE(v128_cmpeq_16, 0U, 0U), + SIMD_TUPLE(v128_cmplt_s16, 0U, 0U), + SIMD_TUPLE(v128_cmplt_s32, 0U, 0U), + SIMD_TUPLE(v128_cmpeq_32, 0U, 0U), + SIMD_TUPLE(v128_cmpgt_s32, 0U, 0U), + SIMD_TUPLE(v128_shuffle_8, 15U, 8U), + SIMD_TUPLE(v128_min_s32, 0U, 0U), SIMD_TUPLE(v128_max_s32, 0U, 0U), + SIMD_TUPLE(imm_v128_align<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<3>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<5>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<7>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<9>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<10>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<11>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<13>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<14>, 0U, 0U), + SIMD_TUPLE(imm_v128_align<15>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128V128), + SIMD_TUPLE(v128_blend_8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128), SIMD_TUPLE(v128_abs_s8, 0U, 0U), + SIMD_TUPLE(v128_abs_s16, 0U, 0U), SIMD_TUPLE(v128_padd_s16, 0U, 0U), + SIMD_TUPLE(v128_unpacklo_u8_s16, 0U, 0U), + SIMD_TUPLE(v128_unpacklo_s8_s16, 0U, 0U), + SIMD_TUPLE(v128_unpacklo_u16_s32, 0U, 0U), + SIMD_TUPLE(v128_unpacklo_s16_s32, 0U, 0U), + SIMD_TUPLE(v128_unpackhi_u8_s16, 0U, 0U), + SIMD_TUPLE(v128_unpackhi_s8_s16, 0U, 0U), + SIMD_TUPLE(v128_unpackhi_u16_s32, 0U, 0U), + SIMD_TUPLE(v128_unpackhi_s16_s32, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<3>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<5>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<7>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<9>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<10>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<11>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<13>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<14>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_byte<15>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<3>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<5>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<7>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<9>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<10>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<11>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<13>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<14>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_byte<15>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<3>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<5>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_8<7>, 0U, 
0U), + SIMD_TUPLE(imm_v128_shr_n_u8<1>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part2), + SIMD_TUPLE(imm_v128_shr_n_u8<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u8<3>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u8<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u8<5>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u8<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u8<7>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<3>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<5>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s8<7>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<10>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_16<14>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<10>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u16<14>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<2>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<6>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<10>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s16<14>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<16>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<20>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<24>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_32<28>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<4>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part3), + SIMD_TUPLE(imm_v128_shr_n_u32<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<16>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<20>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<24>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u32<28>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<16>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<20>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<24>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s32<28>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part4), + SIMD_TUPLE(imm_v128_shl_n_64<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<16>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<20>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<24>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<28>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<32>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<36>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<40>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<44>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<48>, 0U, 0U), + 
SIMD_TUPLE(imm_v128_shl_n_64<52>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<56>, 0U, 0U), + SIMD_TUPLE(imm_v128_shl_n_64<60>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<16>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<20>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<24>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<28>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<32>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<36>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<40>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<44>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<48>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<52>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<56>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_u64<60>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<1>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<4>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<8>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<12>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<16>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<20>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<24>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<28>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<32>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<36>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<40>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<44>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<48>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<52>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<56>, 0U, 0U), + SIMD_TUPLE(imm_v128_shr_n_s64<60>, 0U, 0U), + SIMD_TUPLE(v128_padd_u8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V64V64), SIMD_TUPLE(v128_from_v64, 0U, 0U), + SIMD_TUPLE(v128_zip_8, 0U, 0U), SIMD_TUPLE(v128_zip_16, 0U, 0U), + SIMD_TUPLE(v128_zip_32, 0U, 0U), SIMD_TUPLE(v128_mul_s16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U64U64), SIMD_TUPLE(v128_from_64, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V64), + SIMD_TUPLE(v128_unpack_u8_s16, 0U, 0U), + SIMD_TUPLE(v128_unpack_s8_s16, 0U, 0U), + SIMD_TUPLE(v128_unpack_u16_s32, 0U, 0U), + SIMD_TUPLE(v128_unpack_s16_s32, 0U, 0U)); + +INSTANTIATE( + ARCH, ARCH_POSTFIX(V128_V128U32), SIMD_TUPLE(v128_shl_8, 7U, 32U), + SIMD_TUPLE(v128_shr_u8, 7U, 32U), SIMD_TUPLE(v128_shr_s8, 7U, 32U), + SIMD_TUPLE(v128_shl_16, 15U, 32U), SIMD_TUPLE(v128_shr_u16, 15U, 32U), + SIMD_TUPLE(v128_shr_s16, 15U, 32U), SIMD_TUPLE(v128_shl_32, 31U, 32U), + SIMD_TUPLE(v128_shr_u32, 31U, 32U), SIMD_TUPLE(v128_shr_s32, 31U, 32U), + SIMD_TUPLE(v128_shl_64, 63U, 32U), SIMD_TUPLE(v128_shr_u64, 63U, 32U), + SIMD_TUPLE(v128_shr_s64, 63U, 32U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128), SIMD_TUPLE(v128_low_u32, 0U, 0U), + SIMD_TUPLE(v128_movemask_8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V128), SIMD_TUPLE(v128_hadd_u8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V128), SIMD_TUPLE(v128_low_v64, 0U, 0U), + SIMD_TUPLE(v128_high_v64, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U8), SIMD_TUPLE(v128_dup_8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U16), SIMD_TUPLE(v128_dup_16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U32), SIMD_TUPLE(v128_dup_32, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U64), SIMD_TUPLE(v128_dup_64, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V128V128), SIMD_TUPLE(v128_dotp_s16, 0U, 0U), + SIMD_TUPLE(v128_dotp_s32, 0U, 0U), + SIMD_TUPLE(v128_dotp_su8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256V256), SIMD_TUPLE(v256_sad_u8, 0U, 0U), + SIMD_TUPLE(v256_ssd_u8, 0U, 0U), 
SIMD_TUPLE(v256_sad_u16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256), SIMD_TUPLE(v256_hadd_u8, 0U, 0U), + SIMD_TUPLE(v256_low_u64, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V256V256), SIMD_TUPLE(v256_dotp_s16, 0U, 0U), + SIMD_TUPLE(v256_dotp_s32, 0U, 0U), + SIMD_TUPLE(v256_dotp_su8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256V256), SIMD_TUPLE(v256_ssd_s16, 0U, 0U)); + +INSTANTIATE( + ARCH, ARCH_POSTFIX(V256_V256V256), SIMD_TUPLE(v256_add_8, 0U, 0U), + SIMD_TUPLE(v256_add_16, 0U, 0U), SIMD_TUPLE(v256_sadd_s16, 0U, 0U), + SIMD_TUPLE(v256_add_32, 0U, 0U), SIMD_TUPLE(v256_sub_8, 0U, 0U), + SIMD_TUPLE(v256_ssub_u8, 0U, 0U), SIMD_TUPLE(v256_ssub_s8, 0U, 0U), + SIMD_TUPLE(v256_sub_16, 0U, 0U), SIMD_TUPLE(v256_ssub_s16, 0U, 0U), + SIMD_TUPLE(v256_ssub_u16, 0U, 0U), SIMD_TUPLE(v256_sub_32, 0U, 0U), + SIMD_TUPLE(v256_ziplo_8, 0U, 0U), SIMD_TUPLE(v256_ziphi_8, 0U, 0U), + SIMD_TUPLE(v256_ziplo_16, 0U, 0U), SIMD_TUPLE(v256_ziphi_16, 0U, 0U), + SIMD_TUPLE(v256_ziplo_32, 0U, 0U), SIMD_TUPLE(v256_ziphi_32, 0U, 0U), + SIMD_TUPLE(v256_ziplo_64, 0U, 0U), SIMD_TUPLE(v256_ziphi_64, 0U, 0U), + SIMD_TUPLE(v256_ziplo_128, 0U, 0U), SIMD_TUPLE(v256_ziphi_128, 0U, 0U), + SIMD_TUPLE(v256_unziphi_8, 0U, 0U), SIMD_TUPLE(v256_unziplo_8, 0U, 0U), + SIMD_TUPLE(v256_unziphi_16, 0U, 0U), SIMD_TUPLE(v256_unziplo_16, 0U, 0U), + SIMD_TUPLE(v256_unziphi_32, 0U, 0U), SIMD_TUPLE(v256_unziplo_32, 0U, 0U), + SIMD_TUPLE(v256_pack_s32_s16, 0U, 0U), SIMD_TUPLE(v256_pack_s16_u8, 0U, 0U), + SIMD_TUPLE(v256_pack_s16_s8, 0U, 0U), SIMD_TUPLE(v256_or, 0U, 0U), + SIMD_TUPLE(v256_xor, 0U, 0U), SIMD_TUPLE(v256_and, 0U, 0U), + SIMD_TUPLE(v256_andn, 0U, 0U), SIMD_TUPLE(v256_mullo_s16, 0U, 0U), + SIMD_TUPLE(v256_mulhi_s16, 0U, 0U), SIMD_TUPLE(v256_mullo_s32, 0U, 0U), + SIMD_TUPLE(v256_madd_s16, 0U, 0U), SIMD_TUPLE(v256_madd_us8, 0U, 0U), + SIMD_TUPLE(v256_avg_u8, 0U, 0U), SIMD_TUPLE(v256_rdavg_u8, 0U, 0U), + SIMD_TUPLE(v256_avg_u16, 0U, 0U), SIMD_TUPLE(v256_min_u8, 0U, 0U), + SIMD_TUPLE(v256_max_u8, 0U, 0U), SIMD_TUPLE(v256_min_s8, 0U, 0U), + SIMD_TUPLE(v256_max_s8, 0U, 0U), SIMD_TUPLE(v256_min_s16, 0U, 0U), + SIMD_TUPLE(v256_max_s16, 0U, 0U), SIMD_TUPLE(v256_cmpgt_s8, 0U, 0U), + SIMD_TUPLE(v256_cmplt_s8, 0U, 0U)); + +INSTANTIATE( + ARCH, ARCH_POSTFIX(V256_V256V256_Part2), SIMD_TUPLE(v256_cmpeq_8, 0U, 0U), + SIMD_TUPLE(v256_min_s32, 0U, 0U), SIMD_TUPLE(v256_max_s32, 0U, 0U), + SIMD_TUPLE(v256_add_64, 0U, 0U), SIMD_TUPLE(v256_sub_64, 0U, 0U), + SIMD_TUPLE(v256_cmpgt_s16, 0U, 0U), SIMD_TUPLE(v256_cmplt_s16, 0U, 0U), + SIMD_TUPLE(v256_cmpeq_16, 0U, 0U), SIMD_TUPLE(v256_cmpgt_s32, 0U, 0U), + SIMD_TUPLE(v256_cmplt_s32, 0U, 0U), SIMD_TUPLE(v256_cmpeq_32, 0U, 0U), + SIMD_TUPLE(v256_shuffle_8, 31U, 8U), SIMD_TUPLE(v256_pshuffle_8, 15U, 8U), + SIMD_TUPLE(imm_v256_align<1>, 0U, 0U), SIMD_TUPLE(v256_sadd_s8, 0U, 0U), + SIMD_TUPLE(v256_sadd_u8, 0U, 0U), SIMD_TUPLE(v256_pack_s32_u16, 0U, 0U), + SIMD_TUPLE(v256_rdavg_u16, 0U, 0U), SIMD_TUPLE(imm_v256_align<2>, 0U, 0U), + SIMD_TUPLE(v256_unziphi_64, 0U, 0U), SIMD_TUPLE(v256_unziplo_64, 0U, 0U), + SIMD_TUPLE(imm_v256_align<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<9>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<11>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<13>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<14>, 0U, 0U), + 
SIMD_TUPLE(imm_v256_align<15>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<17>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<18>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<19>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<21>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<22>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<23>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<25>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<26>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<27>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<29>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<30>, 0U, 0U), + SIMD_TUPLE(imm_v256_align<31>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V128V128), + SIMD_TUPLE(v256_from_v128, 0U, 0U), SIMD_TUPLE(v256_zip_8, 0U, 0U), + SIMD_TUPLE(v256_zip_16, 0U, 0U), SIMD_TUPLE(v256_zip_32, 0U, 0U), + SIMD_TUPLE(v256_mul_s16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V128), + SIMD_TUPLE(v256_unpack_u8_s16, 0U, 0U), + SIMD_TUPLE(v256_unpack_s8_s16, 0U, 0U), + SIMD_TUPLE(v256_unpack_u16_s32, 0U, 0U), + SIMD_TUPLE(v256_unpack_s16_s32, 0U, 0U)); + +INSTANTIATE( + ARCH, ARCH_POSTFIX(V256_V256U32), SIMD_TUPLE(v256_shl_8, 7U, 32U), + SIMD_TUPLE(v256_shr_u8, 7U, 32U), SIMD_TUPLE(v256_shr_s8, 7U, 32U), + SIMD_TUPLE(v256_shl_16, 15U, 32U), SIMD_TUPLE(v256_shr_u16, 15U, 32U), + SIMD_TUPLE(v256_shr_s16, 15U, 32U), SIMD_TUPLE(v256_shl_32, 31U, 32U), + SIMD_TUPLE(v256_shr_u32, 31U, 32U), SIMD_TUPLE(v256_shr_s32, 31U, 32U), + SIMD_TUPLE(v256_shl_64, 63U, 32U), SIMD_TUPLE(v256_shr_u64, 63U, 32U), + SIMD_TUPLE(v256_shr_s64, 63U, 32U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256), SIMD_TUPLE(v256_abs_s8, 0U, 0U), + SIMD_TUPLE(v256_abs_s16, 0U, 0U), SIMD_TUPLE(v256_padd_s16, 0U, 0U), + SIMD_TUPLE(v256_unpacklo_u8_s16, 0U, 0U), + SIMD_TUPLE(v256_unpacklo_s8_s16, 0U, 0U), + SIMD_TUPLE(v256_unpacklo_u16_s32, 0U, 0U), + SIMD_TUPLE(v256_unpacklo_s16_s32, 0U, 0U), + SIMD_TUPLE(v256_unpackhi_u8_s16, 0U, 0U), + SIMD_TUPLE(v256_unpackhi_s8_s16, 0U, 0U), + SIMD_TUPLE(v256_unpackhi_u16_s32, 0U, 0U), + SIMD_TUPLE(v256_unpackhi_s16_s32, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<9>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<11>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<13>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<15>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<17>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<18>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<19>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<21>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<22>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<23>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<25>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<26>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<27>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<29>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_byte<30>, 0U, 0U), + 
SIMD_TUPLE(imm_v256_shr_n_byte<31>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<8>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part2), + SIMD_TUPLE(imm_v256_shl_n_byte<9>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<11>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<13>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<15>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<17>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<18>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<19>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<21>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<22>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<23>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<25>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<26>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<27>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<29>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<30>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_byte<31>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_8<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u8<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s8<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<10>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part3), + SIMD_TUPLE(imm_v256_shl_n_16<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_16<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u16<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s16<12>, 0U, 0U), + 
SIMD_TUPLE(imm_v256_shr_n_s16<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_32<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u32<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s32<28>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part4), + SIMD_TUPLE(imm_v256_shl_n_64<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<32>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<36>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<40>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<44>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<48>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<52>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<56>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_64<60>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<32>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<36>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<40>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<44>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<48>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<52>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<56>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_u64<60>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<16>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<20>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<24>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<28>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<32>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<36>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<40>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<44>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<48>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<52>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<56>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_s64<60>, 0U, 0U), + SIMD_TUPLE(v256_padd_u8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part5), + SIMD_TUPLE(imm_v256_shr_n_word<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<2>, 0U, 0U), + 
SIMD_TUPLE(imm_v256_shr_n_word<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<9>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<11>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<13>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shr_n_word<15>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<1>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<2>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<3>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<4>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<5>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<6>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<7>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<8>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<9>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<10>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<11>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<12>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<13>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<14>, 0U, 0U), + SIMD_TUPLE(imm_v256_shl_n_word<15>, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256V256V256), + SIMD_TUPLE(v256_blend_8, 0U, 0U), + SIMD_TUPLE(v256_wideshuffle_8, 63U, 8U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U8), SIMD_TUPLE(v256_dup_8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U16), SIMD_TUPLE(v256_dup_16, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U32), SIMD_TUPLE(v256_dup_32, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U64), SIMD_TUPLE(v256_dup_64, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256), SIMD_TUPLE(v256_low_u32, 0U, 0U), + SIMD_TUPLE(v256_movemask_8, 0U, 0U)); + +INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V256), SIMD_TUPLE(v256_low_v64, 0U, 0U)); + +} // namespace SIMD_NAMESPACE diff --git a/libs/libaom/src/test/simd_neon_test.cc b/libs/libaom/src/test/simd_neon_test.cc new file mode 100644 index 000000000..b67b18895 --- /dev/null +++ b/libs/libaom/src/test/simd_neon_test.cc @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if defined(__OPTIMIZE__) && __OPTIMIZE__ +#define ARCH NEON +#define ARCH_POSTFIX(name) name##_neon +#define SIMD_NAMESPACE simd_test_neon +#include "test/simd_impl.h" +#endif diff --git a/libs/libaom/src/test/simd_sse2_test.cc b/libs/libaom/src/test/simd_sse2_test.cc new file mode 100644 index 000000000..b37a931b3 --- /dev/null +++ b/libs/libaom/src/test/simd_sse2_test.cc @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \ + (!defined(__GNUC__) && !defined(_DEBUG)) +#define ARCH SSE2 +#define ARCH_POSTFIX(name) name##_sse2 +#define SIMD_NAMESPACE simd_test_sse2 +#include "test/simd_impl.h" +#endif diff --git a/libs/libaom/src/test/simd_sse4_test.cc b/libs/libaom/src/test/simd_sse4_test.cc new file mode 100644 index 000000000..b1c9d5cd8 --- /dev/null +++ b/libs/libaom/src/test/simd_sse4_test.cc @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \ + (!defined(__GNUC__) && !defined(_DEBUG)) +#define ARCH SSE4_1 +#define ARCH_POSTFIX(name) name##_sse4_1 +#define SIMD_NAMESPACE simd_test_sse4_1 +#include "test/simd_impl.h" +#endif diff --git a/libs/libaom/src/test/simd_ssse3_test.cc b/libs/libaom/src/test/simd_ssse3_test.cc new file mode 100644 index 000000000..d95c26fb5 --- /dev/null +++ b/libs/libaom/src/test/simd_ssse3_test.cc @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if (defined(__OPTIMIZE__) && __OPTIMIZE__) || \ + (!defined(__GNUC__) && !defined(_DEBUG)) +#define ARCH SSSE3 +#define ARCH_POSTFIX(name) name##_ssse3 +#define SIMD_NAMESPACE simd_test_ssse3 +#include "test/simd_impl.h" +#endif diff --git a/libs/libaom/src/test/simple_decoder.sh b/libs/libaom/src/test/simple_decoder.sh new file mode 100644 index 000000000..5f39ad206 --- /dev/null +++ b/libs/libaom/src/test/simple_decoder.sh @@ -0,0 +1,58 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom simple_decoder example code. To add new tests to +## this file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to simple_decoder_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: Make sure input is available: +simple_decoder_verify_environment() { + if [ ! 
"$(av1_encode_available)" = "yes" ] && [ ! -e "${AV1_IVF_FILE}" ]; then + return 1 + fi +} + +# Runs simple_decoder using $1 as input file. $2 is the codec name, and is used +# solely to name the output file. +simple_decoder() { + local decoder="$(aom_tool_path simple_decoder)" + local input_file="$1" + local codec="$2" + local output_file="${AOM_TEST_OUTPUT_DIR}/simple_decoder_${codec}.raw" + + if [ ! -x "${decoder}" ]; then + elog "${decoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${decoder}" "${input_file}" "${output_file}" \ + ${devnull} + + [ -e "${output_file}" ] || return 1 +} + +simple_decoder_av1() { + if [ "$(av1_decode_available)" = "yes" ]; then + if [ ! -e "${AV1_IVF_FILE}" ]; then + local file="${AOM_TEST_OUTPUT_DIR}/test_encode.ivf" + encode_yuv_raw_input_av1 "${file}" --ivf + simple_decoder "${file}" av1 || return 1 + else + simple_decoder "${AV1_IVF_FILE}" av1 || return 1 + fi + fi +} + +simple_decoder_tests="simple_decoder_av1" + +run_tests simple_decoder_verify_environment "${simple_decoder_tests}" diff --git a/libs/libaom/src/test/simple_encoder.sh b/libs/libaom/src/test/simple_encoder.sh new file mode 100644 index 000000000..5cd6b46a1 --- /dev/null +++ b/libs/libaom/src/test/simple_encoder.sh @@ -0,0 +1,53 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom simple_encoder example. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to simple_encoder_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +simple_encoder_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi +} + +# Runs simple_encoder using the codec specified by $1 with a frame limit of 100. +simple_encoder() { + local encoder="${LIBAOM_BIN_PATH}/simple_encoder${AOM_TEST_EXE_SUFFIX}" + local codec="$1" + local output_file="${AOM_TEST_OUTPUT_DIR}/simple_encoder_${codec}.ivf" + + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." + return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" 9999 0 5 \ + ${devnull} + + [ -e "${output_file}" ] || return 1 +} + + +simple_encoder_av1() { + if [ "$(av1_encode_available)" = "yes" ]; then + simple_encoder av1 || return 1 + fi +} + +simple_encoder_tests="simple_encoder_av1" + +run_tests simple_encoder_verify_environment "${simple_encoder_tests}" diff --git a/libs/libaom/src/test/subtract_test.cc b/libs/libaom/src/test/subtract_test.cc new file mode 100644 index 000000000..4001e8b7a --- /dev/null +++ b/libs/libaom/src/test/subtract_test.cc @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdlib.h> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "av1/common/blockd.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride); + +namespace { + +class AV1SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> { + public: + virtual void TearDown() { libaom_test::ClearSystemState(); } +}; + +using libaom_test::ACMRandom; + +TEST_P(AV1SubtractBlockTest, SimpleSubtract) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + // FIXME(rbultje) split in its own file + for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES; + bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) { + const int block_width = block_size_wide[bsize]; + const int block_height = block_size_high[bsize]; + int16_t *diff = reinterpret_cast<int16_t *>( + aom_memalign(16, sizeof(*diff) * block_width * block_height * 2)); + uint8_t *pred = reinterpret_cast<uint8_t *>( + aom_memalign(16, block_width * block_height * 2)); + uint8_t *src = reinterpret_cast<uint8_t *>( + aom_memalign(16, block_width * block_height * 2)); + + for (int n = 0; n < 100; n++) { + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width * 2; ++c) { + src[r * block_width * 2 + c] = rnd.Rand8(); + pred[r * block_width * 2 + c] = rnd.Rand8(); + } + } + + GetParam()(block_height, block_width, diff, block_width, src, block_width, + pred, block_width); + + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width; ++c) { + EXPECT_EQ(diff[r * block_width + c], + (src[r * block_width + c] - pred[r * block_width + c])) + << "r = " << r << ", c = " << c << ", bs = " << bsize; + } + } + + GetParam()(block_height, block_width, diff, block_width * 2, src, + block_width * 2, pred, block_width * 2); + + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width; ++c) { + EXPECT_EQ( + diff[r * block_width * 2 + c], + (src[r * block_width * 2 + c] - pred[r * block_width * 2 + c])) + << "r = " << r << ", c = " << c << ", bs = " << bsize; + } + } + } + aom_free(diff); + aom_free(pred); + aom_free(src); + } +} + +INSTANTIATE_TEST_SUITE_P(C, AV1SubtractBlockTest, + ::testing::Values(aom_subtract_block_c)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, AV1SubtractBlockTest, + ::testing::Values(aom_subtract_block_sse2)); +#endif +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AV1SubtractBlockTest, + ::testing::Values(aom_subtract_block_neon)); +#endif +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P(MSA, AV1SubtractBlockTest, + ::testing::Values(aom_subtract_block_msa)); +#endif + +#if CONFIG_AV1_HIGHBITDEPTH +typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride, int bd);
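Both subtract typedefs share one contract, which the EXPECT_EQ checks in SimpleSubtract spell out: each output sample is source minus prediction. A minimal scalar sketch of that contract, for orientation only (the name subtract_block_ref is made up; this is not libaom's optimized implementation):

    static void subtract_block_ref(int rows, int cols, int16_t *diff,
                                   ptrdiff_t diff_stride, const uint8_t *src,
                                   ptrdiff_t src_stride, const uint8_t *pred,
                                   ptrdiff_t pred_stride) {
      // diff[r][c] = src[r][c] - pred[r][c]; int16_t holds the full
      // [-255, 255] range of an 8-bit difference.
      for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
          diff[r * diff_stride + c] =
              (int16_t)(src[r * src_stride + c] - pred[r * pred_stride + c]);
    }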
using std::get; +using std::make_tuple; +using std::tuple; + +// <width, height, bit_depth, subtract func> +typedef tuple<int, int, int, HBDSubtractFunc> Params; + +class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> { + public: + virtual void SetUp() { + block_width_ = GET_PARAM(0); + block_height_ = GET_PARAM(1); + bit_depth_ = static_cast<aom_bit_depth_t>(GET_PARAM(2)); + func_ = GET_PARAM(3); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + + const size_t max_width = 128; + const size_t max_block_size = max_width * max_width; + src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>( + aom_memalign(16, max_block_size * sizeof(uint16_t)))); + pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>( + aom_memalign(16, max_block_size * sizeof(uint16_t)))); + diff_ = reinterpret_cast<int16_t *>( + aom_memalign(16, max_block_size * sizeof(int16_t))); + } + + virtual void TearDown() { + aom_free(CONVERT_TO_SHORTPTR(src_)); + aom_free(CONVERT_TO_SHORTPTR(pred_)); + aom_free(diff_); + } + + protected: + void CheckResult(); + void RunForSpeed(); + + private: + ACMRandom rnd_; + int block_height_; + int block_width_; + aom_bit_depth_t bit_depth_; + HBDSubtractFunc func_; + uint8_t *src_; + uint8_t *pred_; + int16_t *diff_; +}; + +void AV1HBDSubtractBlockTest::CheckResult() { + const int test_num = 100; + const size_t max_width = 128; + const int max_block_size = max_width * max_width; + const int mask = (1 << bit_depth_) - 1; + int i, j; + + for (i = 0; i < test_num; ++i) { + for (j = 0; j < max_block_size; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + func_(block_height_, block_width_, diff_, block_width_, src_, block_width_, + pred_, block_width_, bit_depth_); + + for (int r = 0; r < block_height_; ++r) { + for (int c = 0; c < block_width_; ++c) { + EXPECT_EQ(diff_[r * block_width_ + c], + (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] - + CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c])) + << "r = " << r << ", c = " << c << ", test: " << i; + } + } + } +} + +TEST_P(AV1HBDSubtractBlockTest, CheckResult) { CheckResult(); } + +void AV1HBDSubtractBlockTest::RunForSpeed() { + const int test_num = 200000; + const size_t max_width = 128; + const int max_block_size = max_width * max_width; + const int mask = (1 << bit_depth_) - 1; + int i, j; + + for (j = 0; j < max_block_size; ++j) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask; + CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask; + } + + for (i = 0; i < test_num; ++i) { + func_(block_height_, block_width_, diff_, block_width_, src_, block_width_, + pred_, block_width_, bit_depth_); + } +} + +TEST_P(AV1HBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
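The high-bitdepth tests route 16-bit buffers through uint8_t* parameters without casting the data: CONVERT_TO_BYTEPTR and CONVERT_TO_SHORTPTR (from aom_ports/mem.h) encode and decode the pointer value itself, so a tagged pointer round-trips but must never be dereferenced directly. Roughly, as a sketch of the idea rather than a verbatim copy of the macros:

    uint16_t *storage = buf16;                      // real 16-bit samples (buf16 is a placeholder)
    uint8_t *handle = CONVERT_TO_BYTEPTR(storage);  // opaque handle, not dereferenceable
    uint16_t *again = CONVERT_TO_SHORTPTR(handle);  // == storage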
#if HAVE_SSE2 +const Params kAV1HBDSubtractBlock_sse2[] = { + make_tuple(4, 4, 12, &aom_highbd_subtract_block_sse2), + make_tuple(4, 4, 12, &aom_highbd_subtract_block_c), + make_tuple(4, 8, 12, &aom_highbd_subtract_block_sse2), + make_tuple(4, 8, 12, &aom_highbd_subtract_block_c), + make_tuple(8, 4, 12, &aom_highbd_subtract_block_sse2), + make_tuple(8, 4, 12, &aom_highbd_subtract_block_c), + make_tuple(8, 8, 12, &aom_highbd_subtract_block_sse2), + make_tuple(8, 8, 12, &aom_highbd_subtract_block_c), + make_tuple(8, 16, 12, &aom_highbd_subtract_block_sse2), + make_tuple(8, 16, 12, &aom_highbd_subtract_block_c), + make_tuple(16, 8, 12, &aom_highbd_subtract_block_sse2), + make_tuple(16, 8, 12, &aom_highbd_subtract_block_c), + make_tuple(16, 16, 12, &aom_highbd_subtract_block_sse2), + make_tuple(16, 16, 12, &aom_highbd_subtract_block_c), + make_tuple(16, 32, 12, &aom_highbd_subtract_block_sse2), + make_tuple(16, 32, 12, &aom_highbd_subtract_block_c), + make_tuple(32, 16, 12, &aom_highbd_subtract_block_sse2), + make_tuple(32, 16, 12, &aom_highbd_subtract_block_c), + make_tuple(32, 32, 12, &aom_highbd_subtract_block_sse2), + make_tuple(32, 32, 12, &aom_highbd_subtract_block_c), + make_tuple(32, 64, 12, &aom_highbd_subtract_block_sse2), + make_tuple(32, 64, 12, &aom_highbd_subtract_block_c), + make_tuple(64, 32, 12, &aom_highbd_subtract_block_sse2), + make_tuple(64, 32, 12, &aom_highbd_subtract_block_c), + make_tuple(64, 64, 12, &aom_highbd_subtract_block_sse2), + make_tuple(64, 64, 12, &aom_highbd_subtract_block_c), + make_tuple(64, 128, 12, &aom_highbd_subtract_block_sse2), + make_tuple(64, 128, 12, &aom_highbd_subtract_block_c), + make_tuple(128, 64, 12, &aom_highbd_subtract_block_sse2), + make_tuple(128, 64, 12, &aom_highbd_subtract_block_c), + make_tuple(128, 128, 12, &aom_highbd_subtract_block_sse2), + make_tuple(128, 128, 12, &aom_highbd_subtract_block_c) +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, AV1HBDSubtractBlockTest, + ::testing::ValuesIn(kAV1HBDSubtractBlock_sse2)); +#endif // HAVE_SSE2 +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace diff --git a/libs/libaom/src/test/sum_squares_test.cc b/libs/libaom/src/test/sum_squares_test.cc new file mode 100644 index 000000000..8845466b8 --- /dev/null +++ b/libs/libaom/src/test/sum_squares_test.cc @@ -0,0 +1,839 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <cmath> +#include <cstdlib> +#include <string> +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "test/function_equivalence_test.h" + +using libaom_test::ACMRandom; +using libaom_test::FunctionEquivalenceTest; +using ::testing::Combine; +using ::testing::Range; +using ::testing::Values; +using ::testing::ValuesIn; + +namespace { +const int kNumIterations = 10000; + +static const int16_t kInt13Max = (1 << 12) - 1; + +typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int width, + int height); +typedef libaom_test::FuncParam<SSI16Func> TestFuncs; + +class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> { + public: + virtual ~SumSquaresTest() {} + virtual void SetUp() { + params_ = this->GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast<int16_t *>(aom_memalign(16, 256 * 256 * 2)); + ASSERT_TRUE(src_ != NULL); + } + + virtual void TearDown() { + libaom_test::ClearSystemState(); + aom_free(src_); + } + void RunTest(int isRandom); + void RunSpeedTest(); + + void GenRandomData(int width, int height, int stride) { + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = rnd_(2) ? rnd_(limit) : -rnd_(limit); + } + } + } + + void GenExtremeData(int width, int height, int stride) { + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + const int val = rnd_(2) ? limit - 1 : -(limit - 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = val; + } + } + } + + protected: + TestFuncs params_; + int16_t *src_; + ACMRandom rnd_; +};
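The 2-D kernels exercised by this fixture reduce a strided block of int16 residuals to a single sum of squares. A scalar model of the contract, illustrative only (the function name is made up; the signature matches SSI16Func):

    static uint64_t sum_squares_2d_ref(const int16_t *src, int stride,
                                       int width, int height) {
      uint64_t ss = 0;
      for (int r = 0; r < height; ++r)
        for (int c = 0; c < width; ++c) {
          const int64_t v = src[r * stride + c];  // at most 13-bit magnitude
          ss += (uint64_t)(v * v);                // 26-bit terms, safe in 64 bits
        }
      return ss;
    }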
void SumSquaresTest::RunTest(int isRandom) { + int failed = 0; + for (int k = 0; k < kNumIterations; k++) { + const int width = 4 * (rnd_(31) + 1); // Up to 128x128 + const int height = 4 * (rnd_(31) + 1); // Up to 128x128 + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + if (isRandom) { + GenRandomData(width, height, stride); + } else { + GenExtremeData(width, height, stride); + } + const uint64_t res_ref = params_.ref_func(src_, stride, width, height); + uint64_t res_tst; + ASM_REGISTER_STATE_CHECK(res_tst = + params_.tst_func(src_, stride, width, height)); + + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error: Sum Squares Test [" << width << "x" << height + << "] C output does not match optimized output."; + } + } +} + +void SumSquaresTest::RunSpeedTest() { + for (int block = BLOCK_4X4; block < BLOCK_SIZES_ALL; block++) { + const int width = block_size_wide[block]; // Up to 128x128 + const int height = block_size_high[block]; // Up to 128x128 + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + GenExtremeData(width, height, stride); + const int num_loops = 1000000000 / (width + height); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + params_.ref_func(src_, stride, width, height); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("SumSquaresTest C %3dx%-3d: %7.2f ns\n", width, height, + 1000.0 * elapsed_time / num_loops); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + for (int i = 0; i < num_loops; ++i) + params_.tst_func(src_, stride, width, height); + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); + printf("SumSquaresTest Test %3dx%-3d: %7.2f ns\n", width, height, + 1000.0 * elapsed_time1 / num_loops); + } +} + +TEST_P(SumSquaresTest, OperationCheck) { + RunTest(1); // GenRandomData +} + +TEST_P(SumSquaresTest, ExtremeValues) { + RunTest(0); // GenExtremeData +} + +TEST_P(SumSquaresTest, DISABLED_Speed) { RunSpeedTest(); } + +#if HAVE_SSE2 + +INSTANTIATE_TEST_SUITE_P( + SSE2, SumSquaresTest, + ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c, + &aom_sum_squares_2d_i16_sse2))); + +#endif // HAVE_SSE2 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, SumSquaresTest, + ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c, + &aom_sum_squares_2d_i16_avx2))); +#endif // HAVE_AVX2 + +////////////////////////////////////////////////////////////////////////////// +// 1D version +////////////////////////////////////////////////////////////////////////////// + +typedef uint64_t (*F1D)(const int16_t *src, uint32_t N); +typedef libaom_test::FuncParam<F1D> TestFuncs1D; + +class SumSquares1DTest : public FunctionEquivalenceTest<F1D> { + protected: + static const int kIterations = 1000; + static const int kMaxSize = 256; +};
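FunctionEquivalenceTest<F1D> supplies the fixture state used below: params_ pairs a C reference with an optimized candidate, and rng_ drives the inputs. Registering a pair for a new extension takes one instantiation; a sketch with a hypothetical suffix myext (the SSE2 block further down is the real instance of this pattern):

    INSTANTIATE_TEST_SUITE_P(MYEXT, SumSquares1DTest,
                             ::testing::Values(TestFuncs1D(
                                 aom_sum_squares_i16_c,
                                 aom_sum_squares_i16_myext)));  // hypothetical name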
TEST_P(SumSquares1DTest, RandomValues) { + DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + for (int i = 0; i < kMaxSize * kMaxSize; ++i) + src[i] = rng_(kInt13Max * 2 + 1) - kInt13Max; + + const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize + : rng_(kMaxSize) + 1; + + const uint64_t ref_res = params_.ref_func(src, N); + uint64_t tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N)); + + ASSERT_EQ(ref_res, tst_res); + } +} + +TEST_P(SumSquares1DTest, ExtremeValues) { + DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]); + + for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { + if (rng_(2)) { + for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = kInt13Max; + } else { + for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = -kInt13Max; + } + + const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize + : rng_(kMaxSize) + 1; + + const uint64_t ref_res = params_.ref_func(src, N); + uint64_t tst_res; + ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N)); + + ASSERT_EQ(ref_res, tst_res); + } +} + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, SumSquares1DTest, + ::testing::Values(TestFuncs1D( + aom_sum_squares_i16_c, aom_sum_squares_i16_sse2))); + +#endif // HAVE_SSE2 + +typedef int64_t (*sse_func)(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height); +typedef libaom_test::FuncParam<sse_func> TestSSEFuncs; + +typedef std::tuple<TestSSEFuncs, int> SSETestParam; + +class SSETest : public ::testing::TestWithParam<SSETestParam> { + public: + virtual ~SSETest() {} + virtual void SetUp() { + params_ = GET_PARAM(0); + width_ = GET_PARAM(1); + isHbd_ = +#if CONFIG_AV1_HIGHBITDEPTH + params_.ref_func == aom_highbd_sse_c; +#else + 0; +#endif + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2)); + ref_ = reinterpret_cast<uint8_t *>(aom_memalign(32, 256 * 256 * 2)); + ASSERT_TRUE(src_ != NULL); + ASSERT_TRUE(ref_ != NULL); + } + + virtual void TearDown() { + libaom_test::ClearSystemState(); + aom_free(src_); + aom_free(ref_); + } + void RunTest(int isRandom, int width, int height, int run_times); + + void GenRandomData(int width, int height, int stride) { + uint16_t *pSrc = (uint16_t *)src_; + uint16_t *pRef = (uint16_t *)ref_; + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!isHbd_) { + src_[ii * stride + jj] = rnd_.Rand8(); + ref_[ii * stride + jj] = rnd_.Rand8(); + } else { + pSrc[ii * stride + jj] = rnd_(limit); + pRef[ii * stride + jj] = rnd_(limit); + } + } + } + } + + void GenExtremeData(int width, int height, int stride, uint8_t *data, + int16_t val) { + uint16_t *pData = (uint16_t *)data; + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!isHbd_) { + data[ii * stride + jj] = (uint8_t)val; + } else { + pData[ii * stride + jj] = val; + } + } + } + } + + protected: + int isHbd_; + int width_; + TestSSEFuncs params_; + uint8_t *src_; + uint8_t *ref_; + ACMRandom rnd_; +};
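aom_sse and its highbd variant total the squared error between two strided buffers. A minimal scalar model of the contract, illustrative only (the name sse_ref is made up; the signature matches sse_func):

    static int64_t sse_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                           int b_stride, int width, int height) {
      int64_t sse = 0;
      for (int r = 0; r < height; ++r)
        for (int c = 0; c < width; ++c) {
          const int d = a[r * a_stride + c] - b[r * b_stride + c];
          sse += d * d;  // per-pixel squared error
        }
      return sse;
    }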
void SSETest::RunTest(int isRandom, int width, int height, int run_times) { + int failed = 0; + aom_usec_timer ref_timer, test_timer; + for (int k = 0; k < 3; k++) { + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + if (isRandom) { + GenRandomData(width, height, stride); + } else { + const int msb = isHbd_ ? 12 : 8; // Up to 12 bit input + const int limit = (1 << msb) - 1; + if (k == 0) { + GenExtremeData(width, height, stride, src_, 0); + GenExtremeData(width, height, stride, ref_, limit); + } else { + GenExtremeData(width, height, stride, src_, limit); + GenExtremeData(width, height, stride, ref_, 0); + } + } + int64_t res_ref, res_tst; + uint8_t *pSrc = src_; + uint8_t *pRef = ref_; + if (isHbd_) { + pSrc = CONVERT_TO_BYTEPTR(src_); + pRef = CONVERT_TO_BYTEPTR(ref_); + } + res_ref = params_.ref_func(pSrc, stride, pRef, stride, width, height); + res_tst = params_.tst_func(pSrc, stride, pRef, stride, width, height); + if (run_times > 1) { + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(pSrc, stride, pRef, stride, width, height); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(pSrc, stride, pRef, stride, width, height); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%d\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } else { + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error:" << (isHbd_ ? "hbd " : " ") << k << " SSE Test [" + << width << "x" << height + << "] C output does not match optimized output."; + } + } + } +} + +TEST_P(SSETest, OperationCheck) { + for (int height = 4; height <= 128; height += 4) { + RunTest(1, width_, height, 1); // GenRandomData + } +} + +TEST_P(SSETest, ExtremeValues) { + for (int height = 4; height <= 128; height += 4) { + RunTest(0, width_, height, 1); + } +} + +TEST_P(SSETest, DISABLED_Speed) { + for (int height = 4; height <= 128; height += 4) { + RunTest(1, width_, height, 100); + } +} + +#if HAVE_NEON +TestSSEFuncs sse_neon[] = { + TestSSEFuncs(&aom_sse_c, &aom_sse_neon), +#if CONFIG_AV1_HIGHBITDEPTH + TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_neon) +#endif +}; +INSTANTIATE_TEST_SUITE_P(NEON, SSETest, + Combine(ValuesIn(sse_neon), Range(4, 129, 4))); +#endif // HAVE_NEON + +#if HAVE_SSE4_1 +TestSSEFuncs sse_sse4[] = { + TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1), +#if CONFIG_AV1_HIGHBITDEPTH + TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_sse4_1) +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest, + Combine(ValuesIn(sse_sse4), Range(4, 129, 4))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 + +TestSSEFuncs sse_avx2[] = { + TestSSEFuncs(&aom_sse_c, &aom_sse_avx2), +#if CONFIG_AV1_HIGHBITDEPTH + TestSSEFuncs(&aom_highbd_sse_c, &aom_highbd_sse_avx2) +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SSETest, + Combine(ValuesIn(sse_avx2), Range(4, 129, 4))); +#endif // HAVE_AVX2 + +////////////////////////////////////////////////////////////////////////////// +// get_blk sum squares test functions +////////////////////////////////////////////////////////////////////////////// + +typedef void (*sse_sum_func)(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum); +typedef libaom_test::FuncParam<sse_sum_func> TestSSE_SumFuncs; + +typedef std::tuple<TestSSE_SumFuncs, int> SSE_SumTestParam; + +class SSE_Sum_Test : public ::testing::TestWithParam<SSE_SumTestParam> { + public: + virtual ~SSE_Sum_Test() {} + virtual void SetUp() { + params_ = GET_PARAM(0); + width_ = GET_PARAM(1); + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast<int16_t *>(aom_memalign(32, 256 * 256 * 2)); +
ASSERT_TRUE(src_ != NULL); + } + + virtual void TearDown() { + libaom_test::ClearSystemState(); + aom_free(src_); + } + void RunTest(int isRandom, int width, int height, int run_times); + + void GenRandomData(int width, int height, int stride) { + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = rnd_(limit); + } + } + } + + void GenExtremeData(int width, int height, int stride, int16_t *data, + int16_t val) { + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + data[ii * stride + jj] = val; + } + } + } + + protected: + int width_; + TestSSE_SumFuncs params_; + int16_t *src_; + ACMRandom rnd_; +}; + +void SSE_Sum_Test::RunTest(int isRandom, int width, int height, int run_times) { + aom_usec_timer ref_timer, test_timer; + for (int k = 0; k < 3; k++) { + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + if (isRandom) { + GenRandomData(width, height, stride); + } else { + const int msb = 12; // Up to 12 bit input + const int limit = (1 << msb) - 1; + if (k == 0) { + GenExtremeData(width, height, stride, src_, limit); + } else { + GenExtremeData(width, height, stride, src_, -limit); + } + } + int sum_c = 0; + int64_t sse_intr = 0; + int sum_intr = 0; + int64_t sse_c = 0; + + params_.ref_func(src_, stride, width, height, &sum_c, &sse_c); + params_.tst_func(src_, stride, width, height, &sum_intr, &sse_intr); + + if (run_times > 1) { + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(src_, stride, width, height, &sum_c, &sse_c); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(src_, stride, width, height, &sum_intr, &sse_intr); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%f\t width=%d\t height=%d \n", + elapsed_time_c, elapsed_time_simd, + (float)((float)elapsed_time_c / (float)elapsed_time_simd), width, + height); + + } else { + EXPECT_EQ(sum_c, sum_intr) + << "Error:" << k << " SSE Sum Test [" << width << "x" << height + << "] C output does not match optimized output."; + EXPECT_EQ(sse_c, sse_intr) + << "Error:" << k << " SSE Sum Test [" << width << "x" << height + << "] C output does not match optimized output."; + } + } +} + +TEST_P(SSE_Sum_Test, OperationCheck) { + for (int height = 4; height <= 64; height = height * 2) { + RunTest(1, width_, height, 1); // GenRandomData + } +} + +TEST_P(SSE_Sum_Test, ExtremeValues) { + for (int height = 4; height <= 64; height = height * 2) { + RunTest(0, width_, height, 1); + } +} + +TEST_P(SSE_Sum_Test, DISABLED_Speed) { + for (int height = 4; height <= 64; height = height * 2) { + RunTest(1, width_, height, 10000); + } +}
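aom_get_blk_sse_sum produces the block sum and the sum of squares in one pass, which is what the EXPECT_EQ pair above compares across implementations. A scalar model, illustrative only (the name blk_sse_sum_ref is made up; the signature matches sse_sum_func):

    static void blk_sse_sum_ref(const int16_t *data, int stride, int bw, int bh,
                                int *x_sum, int64_t *x2_sum) {
      int sum = 0;
      int64_t sse = 0;
      for (int r = 0; r < bh; ++r)
        for (int c = 0; c < bw; ++c) {
          const int v = data[r * stride + c];
          sum += v;               // first moment
          sse += (int64_t)v * v;  // second moment
        }
      *x_sum = sum;
      *x2_sum = sse;
    }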
65, 4))); +#endif // HAVE_AVX2 + +////////////////////////////////////////////////////////////////////////////// +// 2D Variance test functions +////////////////////////////////////////////////////////////////////////////// + +typedef uint64_t (*Var2DFunc)(uint8_t *src, int stride, int width, int height); +typedef libaom_test::FuncParam TestFuncVar2D; + +const uint16_t test_block_size[2] = { 128, 256 }; + +class Lowbd2dVarTest : public ::testing::TestWithParam { + public: + virtual ~Lowbd2dVarTest() {} + virtual void SetUp() { + params_ = this->GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast( + aom_memalign(16, 512 * 512 * sizeof(uint8_t))); + ASSERT_TRUE(src_ != NULL); + } + + virtual void TearDown() { + libaom_test::ClearSystemState(); + aom_free(src_); + } + void RunTest(int isRandom); + void RunSpeedTest(); + + void GenRandomData(int width, int height, int stride) { + const int msb = 7; // Up to 8 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = rnd_(limit); + } + } + } + + void GenExtremeData(int width, int height, int stride) { + const int msb = 7; // Up to 8 bit input + const int limit = 1 << (msb + 1); + const int val = rnd_(2) ? limit - 1 : 0; + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = val; + } + } + } + + protected: + TestFuncVar2D params_; + uint8_t *src_; + ACMRandom rnd_; +}; + +void Lowbd2dVarTest::RunTest(int isRandom) { + int failed = 0; + for (int k = 0; k < kNumIterations; k++) { + const int width = 4 * (rnd_(63) + 1); // Up to 256x256 + const int height = 4 * (rnd_(63) + 1); // Up to 256x256 + int stride = 4 << rnd_(8); // Up to 512 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(8); + } + if (isRandom) { + GenRandomData(width, height, stride); + } else { + GenExtremeData(width, height, stride); + } + + const uint64_t res_ref = params_.ref_func(src_, stride, width, height); + uint64_t res_tst; + ASM_REGISTER_STATE_CHECK(res_tst = + params_.tst_func(src_, stride, width, height)); + + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error: Sum Squares Test [" << width << "x" << height + << "] C output does not match optimized output."; + } + } +} + +void Lowbd2dVarTest::RunSpeedTest() { + for (int block = 0; block < 2; block++) { + const int width = test_block_size[block]; + const int height = test_block_size[block]; + int stride = 4 << rnd_(8); // Up to 512 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(8); + } + GenExtremeData(width, height, stride); + const int num_loops = 1000000000 / (width + height); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + params_.ref_func(src_, stride, width, height); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast(aom_usec_timer_elapsed(&timer)); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + for (int i = 0; i < num_loops; ++i) + params_.tst_func(src_, stride, width, height); + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast(aom_usec_timer_elapsed(&timer1)); + printf("%3dx%-3d: Scaling = %.2f\n", width, height, + (double)elapsed_time / elapsed_time1); + } +} + +TEST_P(Lowbd2dVarTest, OperationCheck) { + RunTest(1); // GenRandomData +} + +TEST_P(Lowbd2dVarTest, ExtremeValues) { + RunTest(0); // GenExtremeData +} + +TEST_P(Lowbd2dVarTest, 
DISABLED_Speed) { RunSpeedTest(); } + +#if HAVE_SSE2 + +INSTANTIATE_TEST_SUITE_P(SSE2, Lowbd2dVarTest, + ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c, + &aom_var_2d_u8_sse2))); + +#endif // HAVE_SSE2 + +#if HAVE_AVX2 + +INSTANTIATE_TEST_SUITE_P(AVX2, Lowbd2dVarTest, + ::testing::Values(TestFuncVar2D(&aom_var_2d_u8_c, + &aom_var_2d_u8_avx2))); + +#endif // HAVE_SSE2 + +class Highbd2dVarTest : public ::testing::TestWithParam { + public: + virtual ~Highbd2dVarTest() {} + virtual void SetUp() { + params_ = this->GetParam(); + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast( + aom_memalign(16, 512 * 512 * sizeof(uint16_t))); + ASSERT_TRUE(src_ != NULL); + } + + virtual void TearDown() { + libaom_test::ClearSystemState(); + aom_free(src_); + } + void RunTest(int isRandom); + void RunSpeedTest(); + + void GenRandomData(int width, int height, int stride) { + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = rnd_(limit); + } + } + } + + void GenExtremeData(int width, int height, int stride) { + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + const int val = rnd_(2) ? limit - 1 : 0; + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + src_[ii * stride + jj] = val; + } + } + } + + protected: + TestFuncVar2D params_; + uint16_t *src_; + ACMRandom rnd_; +}; + +void Highbd2dVarTest::RunTest(int isRandom) { + int failed = 0; + for (int k = 0; k < kNumIterations; k++) { + const int width = 4 * (rnd_(63) + 1); // Up to 256x256 + const int height = 4 * (rnd_(63) + 1); // Up to 256x256 + int stride = 4 << rnd_(8); // Up to 512 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(8); + } + if (isRandom) { + GenRandomData(width, height, stride); + } else { + GenExtremeData(width, height, stride); + } + + const uint64_t res_ref = + params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height); + uint64_t res_tst; + ASM_REGISTER_STATE_CHECK( + res_tst = + params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height)); + + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error: Sum Squares Test [" << width << "x" << height + << "] C output does not match optimized output."; + } + } +} + +void Highbd2dVarTest::RunSpeedTest() { + for (int block = 0; block < 2; block++) { + const int width = test_block_size[block]; + const int height = test_block_size[block]; + int stride = 4 << rnd_(8); // Up to 512 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(8); + } + GenExtremeData(width, height, stride); + const int num_loops = 1000000000 / (width + height); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast(aom_usec_timer_elapsed(&timer)); + + aom_usec_timer timer1; + aom_usec_timer_start(&timer1); + for (int i = 0; i < num_loops; ++i) + params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height); + aom_usec_timer_mark(&timer1); + const int elapsed_time1 = static_cast(aom_usec_timer_elapsed(&timer1)); + printf("%3dx%-3d: Scaling = %.2f\n", width, height, + (double)elapsed_time / elapsed_time1); + } +} + +TEST_P(Highbd2dVarTest, OperationCheck) { + RunTest(1); // GenRandomData +} + +TEST_P(Highbd2dVarTest, ExtremeValues) { + 
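  // Constant all-zero / all-maximum blocks stress the worst-case
+  // accumulator magnitudes that uniformly random inputs rarely reach.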
+  RunTest(0);  // GenExtremeData
+}
+
+TEST_P(Highbd2dVarTest, DISABLED_Speed) { RunSpeedTest(); }
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, Highbd2dVarTest,
+    ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_sse2)));
+
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, Highbd2dVarTest,
+    ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_avx2)));
+
+#endif  // HAVE_AVX2
+}  // namespace
diff --git a/libs/libaom/src/test/superframe_test.cc b/libs/libaom/src/test/superframe_test.cc
new file mode 100644
index 000000000..024a18b97
--- /dev/null
+++ b/libs/libaom/src/test/superframe_test.cc
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <climits>
+#include <tuple>
+#include <vector>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const int kTestMode = 0;
+const int kTileCols = 1;
+const int kTileRows = 2;
+
+typedef std::tuple<libaom_test::TestMode, int, int> SuperframeTestParam;
+
+class SuperframeTest
+    : public ::libaom_test::CodecTestWithParam<SuperframeTestParam>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  SuperframeTest() : EncoderTest(GET_PARAM(0)), last_sf_pts_(0) {}
+  virtual ~SuperframeTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    const SuperframeTestParam input = GET_PARAM(1);
+    const libaom_test::TestMode mode = std::get<kTestMode>(input);
+    SetMode(mode);
+    sf_count_ = 0;
+    sf_count_max_ = INT_MAX;
+    n_tile_cols_ = std::get<kTileCols>(input);
+    n_tile_rows_ = std::get<kTileRows>(input);
+  }
+
+  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                                  libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_CPUUSED, 2);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
+      encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
+    }
+  }
+
+  virtual const aom_codec_cx_pkt_t *MutateEncoderOutputHook(
+      const aom_codec_cx_pkt_t *pkt) {
+    if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) return pkt;
+
+    const uint8_t *buffer = reinterpret_cast<uint8_t *>(pkt->data.frame.buf);
+    const uint8_t marker = buffer[0];
+    const int frames = (marker & 0x7) + 1;
+    const int mag = ((marker >> 3) & 3) + 1;
+    const unsigned int index_sz = 2 + mag * (frames - 1);
+    if ((marker & 0xe0) == 0xc0 && pkt->data.frame.sz >= index_sz &&
+        buffer[index_sz - 1] == marker) {
+      // frame is a superframe. strip off the index.
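+      // Example: marker = 0xc1 (0b11000001) gives frames = 2, mag = 1 and
+      // index_sz = 2 + 1 * (2 - 1) = 3: the marker byte, one 1-byte frame
+      // size, and a trailing copy of the marker byte.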
+      modified_buf_.resize(pkt->data.frame.sz - index_sz);
+      memcpy(&modified_buf_[0], (uint8_t *)pkt->data.frame.buf + index_sz,
+             pkt->data.frame.sz - index_sz);
+      modified_pkt_ = *pkt;
+      modified_pkt_.data.frame.buf = &modified_buf_[0];
+      modified_pkt_.data.frame.sz -= index_sz;
+
+      sf_count_++;
+      last_sf_pts_ = pkt->data.frame.pts;
+      return &modified_pkt_;
+    }
+
+    // Make sure we do a few frames after the last SF
+    abort_ |=
+        sf_count_ > sf_count_max_ && pkt->data.frame.pts - last_sf_pts_ >= 5;
+    return pkt;
+  }
+
+  int sf_count_;
+  int sf_count_max_;
+  aom_codec_cx_pkt_t modified_pkt_;
+  std::vector<uint8_t> modified_buf_;
+  aom_codec_pts_t last_sf_pts_;
+
+ private:
+  int n_tile_cols_;
+  int n_tile_rows_;
+};
+
+TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
+  sf_count_max_ = 0;  // early exit on successful test.
+  cfg_.g_lag_in_frames = 25;
+  cfg_.large_scale_tile = 1;
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 40);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // NOTE: The use of BWDREF_FRAME will enable the coding of more non-show
+  // frames besides ALTREF_FRAME.
+  EXPECT_GE(sf_count_, 1);
+}
+
+}  // namespace
diff --git a/libs/libaom/src/test/svc_datarate_test.cc b/libs/libaom/src/test/svc_datarate_test.cc
new file mode 100644
index 000000000..28e517ba1
--- /dev/null
+++ b/libs/libaom/src/test/svc_datarate_test.cc
@@ -0,0 +1,609 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/datarate_test.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "aom/aom_codec.h"
+#include "av1/common/enums.h"
+
+namespace datarate_test {
+namespace {
+
+class DatarateTestSVC
+    : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
+                                                 unsigned int, int>,
+      public DatarateTest {
+ public:
+  DatarateTestSVC() : DatarateTest(GET_PARAM(0)) {
+    set_cpu_used_ = GET_PARAM(2);
+    aq_mode_ = GET_PARAM(3);
+  }
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(GET_PARAM(1));
+    ResetModel();
+  }
+
+  virtual int GetNumSpatialLayers() { return number_spatial_layers_; }
+
+  virtual void ResetModel() {
+    DatarateTest::ResetModel();
+    layer_frame_cnt_ = 0;
+    superframe_cnt_ = 0;
+    number_temporal_layers_ = 1;
+    number_spatial_layers_ = 1;
+    for (int i = 0; i < AOM_MAX_LAYERS; i++) {
+      target_layer_bitrate_[i] = 0;
+      effective_datarate_tl[i] = 0.0;
+    }
+    memset(&layer_id_, 0, sizeof(aom_svc_layer_id_t));
+    memset(&svc_params_, 0, sizeof(aom_svc_params_t));
+    memset(&ref_frame_config_, 0, sizeof(aom_svc_ref_frame_config_t));
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    int spatial_layer_id = 0;
+    if (video->frame() == 0) {
+      initialize_svc(number_temporal_layers_, number_spatial_layers_,
+                     &svc_params_);
+      encoder->Control(AV1E_SET_SVC_PARAMS, &svc_params_);
+      encoder->Control(AV1E_SET_ENABLE_ORDER_HINT, 0);
+      encoder->Control(AV1E_SET_ENABLE_TPL_MODEL, 0);
+      encoder->Control(AV1E_SET_DELTAQ_MODE, 0);
+    }
+    if (number_spatial_layers_ == 2) {
+      spatial_layer_id = (layer_frame_cnt_ % 2 == 0) ? 0 : 1;
+    } else if (number_spatial_layers_ == 3) {
+      spatial_layer_id = (layer_frame_cnt_ % 3 == 0)
+                             ? 0
+                             : ((layer_frame_cnt_ - 1) % 3 == 0) ? 1 : 2;
+    }
+    // Set the reference/update flags, layer_id, and reference_map
+    // buffer index.
+    frame_flags_ = set_layer_pattern(video->frame(), &layer_id_,
+                                     &ref_frame_config_, spatial_layer_id);
+    encoder->Control(AV1E_SET_SVC_LAYER_ID, &layer_id_);
+    encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+    layer_frame_cnt_++;
+    DatarateTest::PreEncodeFrameHook(video, encoder);
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
+    // Update the layer cumulative bitrate.
+    for (int i = layer_id_.temporal_layer_id; i < number_temporal_layers_;
+         i++) {
+      int layer = layer_id_.spatial_layer_id * number_temporal_layers_ + i;
+      effective_datarate_tl[layer] += 1.0 * frame_size_in_bits;
+    }
+    if (layer_id_.spatial_layer_id == number_spatial_layers_ - 1) {
+      last_pts_ = pkt->data.frame.pts;
+      superframe_cnt_++;
+    }
+  }
+
+  virtual void EndPassHook(void) {
+    duration_ = ((last_pts_ + 1) * timebase_);
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_;
+         i++) {
+      effective_datarate_tl[i] = (effective_datarate_tl[i] / 1000) / duration_;
+    }
+  }
+
+  // Layer pattern configuration.
+  virtual int set_layer_pattern(int frame_cnt, aom_svc_layer_id_t *layer_id,
+                                aom_svc_ref_frame_config_t *ref_frame_config,
+                                int spatial_layer) {
+    layer_id->spatial_layer_id = spatial_layer;
+    // Set the reference map buffer idx for the 7 references:
+    // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
+    // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
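+    // Illustration: ref_idx[0] = 3 points LAST_FRAME at buffer slot 3,
+    // reference[0] = 1 allows prediction from LAST_FRAME, and
+    // refresh[3] = 1 writes the current reconstruction back into slot 3.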
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++) { + ref_frame_config->ref_idx[i] = i; + ref_frame_config->reference[i] = 0; + } + for (int i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0; + // Set layer_flags to 0 when using ref_frame_config->reference. + int layer_flags = 0; + // Always reference LAST. + ref_frame_config->reference[0] = 1; + if (number_temporal_layers_ == 3 && number_spatial_layers_ == 1) { + // 3-layer: + // 1 3 5 7 + // 2 6 + // 0 4 8 + if (frame_cnt % 4 == 0) { + // Base layer. + layer_id->temporal_layer_id = 0; + // Update LAST on layer 0, reference LAST and GF. + ref_frame_config->refresh[0] = 1; + ref_frame_config->reference[3] = 1; + } else if ((frame_cnt - 1) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // First top layer: no updates, only reference LAST (TL0). + } else if ((frame_cnt - 2) % 4 == 0) { + layer_id->temporal_layer_id = 1; + // Middle layer (TL1): update LAST2, only reference LAST (TL0). + ref_frame_config->refresh[1] = 1; + } else if ((frame_cnt - 3) % 4 == 0) { + layer_id->temporal_layer_id = 2; + // Second top layer: no updates, only reference LAST. + // Set buffer idx for LAST to slot 1, since that was the slot + // updated in previous frame. So LAST is TL1 frame. + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[1] = 0; + } + } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 2) { + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. Keep LAST and GOLDEN in slots 0 and 3. + ref_frame_config->ref_idx[0] = 0; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->refresh[0] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 3 + // and GOLDEN to slot 0. Update slot 3 (LAST). + ref_frame_config->ref_idx[0] = 3; + ref_frame_config->ref_idx[3] = 0; + ref_frame_config->refresh[3] = 1; + } + // Reference GOLDEN. + if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1; + } else if (number_temporal_layers_ == 1 && number_spatial_layers_ == 3) { + // 3 spatial layers, 1 temporal. + // Note for this case , we set the buffer idx for all references to be + // either LAST or GOLDEN, which are always valid references, since decoder + // will check if any of the 7 references is valid scale in + // valid_ref_frame_size(). + layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. Set all other buffer_idx to 0. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->refresh[0] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1 + // and GOLDEN (and all other refs) to slot 0. + // Update slot 1 (LAST). + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->refresh[1] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2 + // and GOLDEN (and all other refs) to slot 1. + // Update slot 2 (LAST). + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[0] = 2; + ref_frame_config->refresh[2] = 1; + } + // Reference GOLDEN. + if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1; + } else if (number_temporal_layers_ == 3 && number_spatial_layers_ == 3) { + // 3 spatial and 3 temporal layer. + if (superframe_cnt_ % 4 == 0) { + // Base temporal layer. 
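+        // Across each group of 4 superframes the branches below produce the
+        // temporal_layer_id sequence 0, 2, 1, 2, the same 3-layer pyramid
+        // diagrammed for the 1-spatial-layer case above.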
+ layer_id->temporal_layer_id = 0; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST, update LAST. + // Set all buffer_idx to 0. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->refresh[0] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 0. + // Update slot 1 (LAST). + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->refresh[1] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 1. + // Update slot 2 (LAST). + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 1; + ref_frame_config->ref_idx[0] = 2; + ref_frame_config->refresh[2] = 1; + } + } else if ((superframe_cnt_ - 1) % 4 == 0) { + // First top temporal enhancement layer. + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST (slot 0). + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to slot 0. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 3. + // Set LAST2 to slot 4 and Update slot 4. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 3; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[1] = 4; + ref_frame_config->refresh[4] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 4. + // No update. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 4; + ref_frame_config->ref_idx[0] = 2; + } + } else if ((superframe_cnt_ - 2) % 4 == 0) { + // Middle temporal enhancement layer. + layer_id->temporal_layer_id = 1; + if (layer_id->spatial_layer_id == 0) { + // Reference LAST. + // Set all buffer_idx to 0. + // Set GOLDEN to slot 5 and update slot 5. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[3] = 5; + ref_frame_config->refresh[5] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 1, + // GOLDEN (and all other refs) to slot 5. + // Set LAST2 to slot 6 and update slot 6. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 5; + ref_frame_config->ref_idx[0] = 1; + ref_frame_config->ref_idx[2] = 6; + ref_frame_config->refresh[6] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 2, + // GOLDEN (and all other refs) to slot 6. + // Set LAST2 to slot 6 and update slot 7. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 6; + ref_frame_config->ref_idx[0] = 2; + ref_frame_config->ref_idx[2] = 7; + ref_frame_config->refresh[7] = 1; + } + } else if ((superframe_cnt_ - 3) % 4 == 0) { + // Second top temporal enhancement layer. + layer_id->temporal_layer_id = 2; + if (layer_id->spatial_layer_id == 0) { + // Set LAST to slot 5 and reference LAST. + // Set GOLDEN to slot 3 and update slot 3. + // Set all other buffer_idx to 0. 
+ for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 5; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->refresh[3] = 1; + } else if (layer_id->spatial_layer_id == 1) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 6, + // GOLDEN to slot 3. Set LAST2 to slot 4 and update slot 4. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 6; + ref_frame_config->ref_idx[3] = 3; + ref_frame_config->ref_idx[1] = 4; + ref_frame_config->refresh[4] = 1; + } else if (layer_id->spatial_layer_id == 2) { + // Reference LAST and GOLDEN. Set buffer_idx for LAST to slot 7, + // GOLDEN to slot 4. No update. + for (int i = 0; i < 7; i++) ref_frame_config->ref_idx[i] = 0; + ref_frame_config->ref_idx[0] = 7; + ref_frame_config->ref_idx[3] = 4; + } + } + // Reference GOLDEN. + if (layer_id->spatial_layer_id > 0) ref_frame_config->reference[3] = 1; + } + return layer_flags; + } + + virtual void initialize_svc(int number_temporal_layers, + int number_spatial_layers, + aom_svc_params *svc_params) { + svc_params->number_spatial_layers = number_spatial_layers; + svc_params->number_temporal_layers = number_temporal_layers; + for (int i = 0; i < number_temporal_layers * number_spatial_layers; ++i) { + svc_params->max_quantizers[i] = 60; + svc_params->min_quantizers[i] = 2; + svc_params->layer_target_bitrate[i] = target_layer_bitrate_[i]; + } + // Do at most 3 spatial or temporal layers here. + svc_params->framerate_factor[0] = 1; + if (number_temporal_layers == 2) { + svc_params->framerate_factor[0] = 2; + svc_params->framerate_factor[1] = 1; + } else if (number_temporal_layers == 3) { + svc_params->framerate_factor[0] = 4; + svc_params->framerate_factor[1] = 2; + svc_params->framerate_factor[2] = 1; + } + svc_params->scaling_factor_num[0] = 1; + svc_params->scaling_factor_den[0] = 1; + if (number_spatial_layers == 2) { + svc_params->scaling_factor_num[0] = 1; + svc_params->scaling_factor_den[0] = 2; + svc_params->scaling_factor_num[1] = 1; + svc_params->scaling_factor_den[1] = 1; + } else if (number_spatial_layers == 3) { + svc_params->scaling_factor_num[0] = 1; + svc_params->scaling_factor_den[0] = 4; + svc_params->scaling_factor_num[1] = 1; + svc_params->scaling_factor_den[1] = 2; + svc_params->scaling_factor_num[2] = 1; + svc_params->scaling_factor_den[2] = 1; + } + } + + virtual void BasicRateTargetingSVC3TL1SLTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 300); + const int bitrate_array[2] = { 200, 550 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 3; + target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100; + target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100; + target_layer_bitrate_[2] = cfg_.rc_target_bitrate; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30) + << " The datarate for the file is greater than target by too much!"; + } + 
} + + virtual void BasicRateTargetingSVC1TL2SLTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 300); + const int bitrate_array[2] = { 300, 600 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 1; + number_spatial_layers_ = 2; + target_layer_bitrate_[0] = 2 * cfg_.rc_target_bitrate / 4; + target_layer_bitrate_[1] = 2 * cfg_.rc_target_bitrate / 4; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35) + << " The datarate for the file is greater than target by too much!"; + } + } + + virtual void BasicRateTargetingSVC1TL3SLTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 300); + const int bitrate_array[2] = { 500, 1000 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 1; + number_spatial_layers_ = 3; + target_layer_bitrate_[0] = 1 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[1] = 3 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[2] = 4 * cfg_.rc_target_bitrate / 8; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38) + << " The datarate for the file is greater than target by too much!"; + } + } + + virtual void BasicRateTargetingSVC3TL3SLTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 300); + const int bitrate_array[2] = { 600, 1200 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 3; + number_spatial_layers_ = 3; + // SL0 + const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100; + target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100; + target_layer_bitrate_[2] = bitrate_sl0; + // SL1 + const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100; + target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100; + target_layer_bitrate_[5] = bitrate_sl1; + // SL2 + const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100; + target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100; + 
target_layer_bitrate_[8] = bitrate_sl2; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.38) + << " The datarate for the file is greater than target by too much!"; + } + } + + virtual void BasicRateTargetingSVC3TL3SLHDTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + + ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + const int bitrate_array[2] = { 600, 1200 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 3; + number_spatial_layers_ = 3; + // SL0 + const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100; + target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100; + target_layer_bitrate_[2] = bitrate_sl0; + // SL1 + const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100; + target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100; + target_layer_bitrate_[5] = bitrate_sl1; + // SL2 + const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100; + target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100; + target_layer_bitrate_[8] = bitrate_sl2; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.4) + << " The datarate for the file is greater than target by too much!"; + } + } + + virtual void BasicRateTargetingSVC3TL3SLKfTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = AOM_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.kf_mode = AOM_KF_AUTO; + cfg_.kf_min_dist = cfg_.kf_max_dist = 100; + + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 300); + const int bitrate_array[2] = { 600, 1200 }; + cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)]; + ResetModel(); + number_temporal_layers_ = 3; + number_spatial_layers_ = 3; + // SL0 + const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100; + target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100; + target_layer_bitrate_[2] = bitrate_sl0; + // SL1 + const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100; + target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100; + target_layer_bitrate_[5] = bitrate_sl1; + // SL2 + const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8; + target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100; + target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100; + target_layer_bitrate_[8] = bitrate_sl2; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) { + 
ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.4) + << " The datarate for the file is greater than target by too much!"; + } + } + + int layer_frame_cnt_; + int superframe_cnt_; + int number_temporal_layers_; + int number_spatial_layers_; + // Allow for up to 3 temporal layers. + int target_layer_bitrate_[AOM_MAX_LAYERS]; + aom_svc_params_t svc_params_; + aom_svc_ref_frame_config_t ref_frame_config_; + aom_svc_layer_id_t layer_id_; + double effective_datarate_tl[AOM_MAX_LAYERS]; +}; + +// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SL) { + BasicRateTargetingSVC3TL1SLTest(); +} + +// Check basic rate targeting for CBR, for 2 spatial layers, 1 temporal. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL2SL) { + BasicRateTargetingSVC1TL2SLTest(); +} + +// Check basic rate targeting for CBR, for 3 spatial layers, 1 temporal. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC1TL3SL) { + BasicRateTargetingSVC1TL3SLTest(); +} + +// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SL) { + BasicRateTargetingSVC3TL3SLTest(); +} + +// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHD) { + BasicRateTargetingSVC3TL3SLHDTest(); +} + +// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers, +// for auto key frame mode with short key frame period. +TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLKf) { + BasicRateTargetingSVC3TL3SLKfTest(); +} + +AV1_INSTANTIATE_TEST_CASE(DatarateTestSVC, + ::testing::Values(::libaom_test::kRealTime), + ::testing::Range(7, 9), + ::testing::Range(0, 4), + ::testing::Values(0, 1)); + +} // namespace +} // namespace datarate_test diff --git a/libs/libaom/src/test/temporal_filter_planewise_test.cc b/libs/libaom/src/test/temporal_filter_planewise_test.cc new file mode 100644 index 000000000..c3f3e9e05 --- /dev/null +++ b/libs/libaom/src/test/temporal_filter_planewise_test.cc @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+using ::testing::Combine;
+using ::testing::Range;
+using ::testing::Values;
+using ::testing::ValuesIn;
+
+#if !CONFIG_REALTIME_ONLY
+namespace {
+
+typedef void (*TemporalFilterPlanewiseFunc)(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const double *noise_level, const int use_subblock,
+    const int block_mse, const int *subblock_mses, const int q_factor,
+    const uint8_t *pred, uint32_t *accum, uint16_t *count);
+typedef libaom_test::FuncParam<TemporalFilterPlanewiseFunc>
+    TemporalFilterPlanewiseFuncParam;
+
+typedef std::tuple<TemporalFilterPlanewiseFuncParam, int>
+    TemporalFilterPlanewiseWithParam;
+
+class TemporalFilterPlanewiseTest
+    : public ::testing::TestWithParam<TemporalFilterPlanewiseWithParam> {
+ public:
+  virtual ~TemporalFilterPlanewiseTest() {}
+  virtual void SetUp() {
+    params_ = GET_PARAM(0);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src1_ = reinterpret_cast<uint8_t *>(aom_memalign(8, 256 * 256));
+    src2_ = reinterpret_cast<uint8_t *>(aom_memalign(8, 256 * 256));
+
+    ASSERT_TRUE(src1_ != NULL);
+    ASSERT_TRUE(src2_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libaom_test::ClearSystemState();
+    aom_free(src1_);
+    aom_free(src2_);
+  }
+  void RunTest(int isRandom, int width, int height, int run_times);
+
+  void GenRandomData(int width, int height, int stride, int stride2) {
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        src1_[ii * stride + jj] = rnd_.Rand8();
+        src2_[ii * stride2 + jj] = rnd_.Rand8();
+      }
+    }
+  }
+
+  void GenExtremeData(int width, int height, int stride, uint8_t *data,
+                      int stride2, uint8_t *data2, uint8_t val) {
+    for (int ii = 0; ii < height; ii++) {
+      for (int jj = 0; jj < width; jj++) {
+        data[ii * stride + jj] = val;
+        data2[ii * stride2 + jj] = (255 - val);
+      }
+    }
+  }
+
+ protected:
+  TemporalFilterPlanewiseFuncParam params_;
+  uint8_t *src1_;
+  uint8_t *src2_;
+  ACMRandom rnd_;
+};
+
+void TemporalFilterPlanewiseTest::RunTest(int isRandom, int width, int height,
+                                          int run_times) {
+  aom_usec_timer ref_timer, test_timer;
+  for (int k = 0; k < 3; k++) {
+    const int stride = width;
+    const int stride2 = width;
+    if (isRandom) {
+      GenRandomData(width, height, stride, stride2);
+    } else {
+      const int msb = 8;  // Up to 8 bit input
+      const int limit = (1 << msb) - 1;
+      if (k == 0) {
+        GenExtremeData(width, height, stride, src1_, stride2, src2_, limit);
+      } else {
+        GenExtremeData(width, height, stride, src1_, stride2, src2_, 0);
+      }
+    }
+    double sigma[1] = { 2.1002103677063437 };
+    DECLARE_ALIGNED(16, unsigned int, accumulator_ref[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_ref[1024 * 3]);
+    memset(accumulator_ref, 0, 1024 * 3 * sizeof(accumulator_ref[0]));
+    memset(count_ref, 0, 1024 * 3 * sizeof(count_ref[0]));
+    DECLARE_ALIGNED(16, unsigned int, accumulator_mod[1024 * 3]);
+    DECLARE_ALIGNED(16, uint16_t, count_mod[1024 * 3]);
+    memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
+    memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
+
+    assert(width == 32 && height == 32);
+    const BLOCK_SIZE block_size =
BLOCK_32X32; + const int use_subblock = 0; + const int block_mse = 20; + const int subblock_mses[4] = { 15, 16, 17, 18 }; + const int q_factor = 12; + const int mb_row = 0; + const int mb_col = 0; + const int num_planes = 1; + YV12_BUFFER_CONFIG *ref_frame = + (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG)); + ref_frame->heights[0] = height; + ref_frame->strides[0] = stride; + DECLARE_ALIGNED(16, uint8_t, src[1024 * 3]); + ref_frame->buffer_alloc = src; + ref_frame->buffers[0] = ref_frame->buffer_alloc; + ref_frame->flags = 0; // Only support low bit-depth test. + memcpy(src, src1_, 1024 * 3 * sizeof(uint8_t)); + + MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD)); + mbd->plane[0].subsampling_y = 0; + mbd->plane[0].subsampling_x = 0; + mbd->bd = 8; + + params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + sigma, use_subblock, block_mse, subblock_mses, q_factor, + src2_, accumulator_ref, count_ref); + params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + sigma, use_subblock, block_mse, subblock_mses, q_factor, + src2_, accumulator_mod, count_mod); + + if (run_times > 1) { + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + sigma, use_subblock, block_mse, subblock_mses, + q_factor, src2_, accumulator_ref, count_ref); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + sigma, use_subblock, block_mse, subblock_mses, + q_factor, src2_, accumulator_mod, count_mod); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast(aom_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%f\t width=%d\t height=%d \n", + elapsed_time_c, elapsed_time_simd, + (float)((float)elapsed_time_c / (float)elapsed_time_simd), width, + height); + + } else { + for (int i = 0, l = 0; i < height; i++) { + for (int j = 0; j < width; j++, l++) { + EXPECT_EQ(accumulator_ref[l], accumulator_mod[l]) + << "Error:" << k << " SSE Sum Test [" << width << "x" << height + << "] C accumulator does not match optimized accumulator."; + EXPECT_EQ(count_ref[l], count_mod[l]) + << "Error:" << k << " SSE Sum Test [" << width << "x" << height + << "] C count does not match optimized count."; + } + } + } + + free(ref_frame); + free(mbd); + } +} + +TEST_P(TemporalFilterPlanewiseTest, OperationCheck) { + for (int height = 32; height <= 32; height = height * 2) { + RunTest(1, height, height, 1); // GenRandomData + } +} + +TEST_P(TemporalFilterPlanewiseTest, ExtremeValues) { + for (int height = 32; height <= 32; height = height * 2) { + RunTest(0, height, height, 1); + } +} + +TEST_P(TemporalFilterPlanewiseTest, DISABLED_Speed) { + for (int height = 32; height <= 32; height = height * 2) { + RunTest(1, height, height, 100000); + } +} + +#if HAVE_AVX2 +TemporalFilterPlanewiseFuncParam temporal_filter_planewise_test_avx2[] = { + TemporalFilterPlanewiseFuncParam(&av1_apply_temporal_filter_planewise_c, + &av1_apply_temporal_filter_planewise_avx2) +}; +INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterPlanewiseTest, + Combine(ValuesIn(temporal_filter_planewise_test_avx2), + Range(64, 65, 4))); +#endif // HAVE_AVX2 + +#if HAVE_SSE2 +TemporalFilterPlanewiseFuncParam temporal_filter_planewise_test_sse2[] = { + 
    TemporalFilterPlanewiseFuncParam(&av1_apply_temporal_filter_planewise_c,
+                                   &av1_apply_temporal_filter_planewise_sse2)
+};
+INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterPlanewiseTest,
+                         Combine(ValuesIn(temporal_filter_planewise_test_sse2),
+                                 Range(64, 65, 4)));
+#endif  // HAVE_SSE2
+
+}  // namespace
+#endif
diff --git a/libs/libaom/src/test/temporal_filter_yuv_test.cc b/libs/libaom/src/test/temporal_filter_yuv_test.cc
new file mode 100644
index 000000000..dc17aaaf7
--- /dev/null
+++ b/libs/libaom/src/test/temporal_filter_yuv_test.cc
@@ -0,0 +1,841 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <ostream>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+
+namespace {
+
+using ::libaom_test::ACMRandom;
+
+const int MAX_WIDTH = 32;
+const int MAX_HEIGHT = 32;
+
+typedef void (*TemporalFilterYUVFunc)(
+    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const int strength, const int use_subblock,
+    const int *blk_fw, const uint8_t *pred, uint32_t *accum, uint16_t *count);
+
+struct TemporalFilterWithBd {
+  TemporalFilterWithBd(TemporalFilterYUVFunc func, int bitdepth)
+      : temporal_filter(func), bd(bitdepth) {}
+
+  TemporalFilterYUVFunc temporal_filter;
+  int bd;
+};
+
+std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) {
+  return os << "Bitdepth: " << tf.bd;
+}
+
+int GetFilterWeight(unsigned int row, unsigned int col,
+                    unsigned int block_height, unsigned int block_width,
+                    const int *const blk_fw, int use_32x32) {
+  if (use_32x32) {
+    return blk_fw[0];
+  }
+
+  return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)];
+}
+
+template <class PixelType>
+int GetModIndex(int sum_dist, int index, int rounding, int strength,
+                int filter_weight) {
+  int mod = sum_dist * 3 / index;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = AOMMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+// Lowbitdepth version
+template <>
+int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength,
+                         int filter_weight) {
+  unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
+                                  39322, 32768, 28087, 24576, 21846,
+                                  19661, 17874, 0,     15124 };
+
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = AOMMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+// Highbitdepth version
+template <>
+int GetModIndex<uint16_t>(int sum_dist, int index, int rounding, int strength,
+                          int filter_weight) {
+  int64_t index_mult[14] = { 0U,          0U,          0U,          0U,
+                             3221225472U, 2576980378U, 2147483648U,
+                             1840700270U, 1610612736U, 1431655766U,
+                             1288490189U, 1171354718U, 0U,          991146300U };
+
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = static_cast<int>((sum_dist * index_mult[index]) >> 32);
+  mod += rounding;
+  mod >>= strength;
+
+  mod = AOMMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+template <class PixelType>
+void SetArray(PixelType *pixel_array, int width, int height, int stride,
+              int val) {
+  for (int row = 0; row < height; row++) {
+    for (int col = 0; col < width; col++) {
+      pixel_array[col] = val;
+    }
+    pixel_array += stride;
+  }
+}
+
+template <class PixelType>
+void SetArray(PixelType *pixel_array, int width, int height, int stride,
+              ACMRandom *rnd, int low_val, int high_val) {
+  EXPECT_LE(low_val, high_val);
+
+  for (int row = 0; row < height; row++) {
+    for (int col = 0; col < width; col++) {
+      const int val =
+          static_cast<int>((*rnd).PseudoUniform(high_val - low_val));
+      pixel_array[col] = low_val + val;
+    }
+    pixel_array += stride;
+  }
+}
+
+template <typename ValueType>
+bool CheckArrayEqual(const ValueType *arr_1, const ValueType *arr_2, int width,
+                     int height, int stride_1, int stride_2) {
+  for (int row = 0; row < height; row++) {
+    for (int col = 0; col < width; col++) {
+      if (arr_1[col] != arr_2[col]) {
+        return false;
+      }
+    }
+    arr_1 += stride_1;
+    arr_2 += stride_2;
+  }
+  return true;
+}
+
+template <typename ValueType>
+void PrintArrayDiff(const ValueType *arr_1, const ValueType *arr_2, int width,
+                    int height, int stride_1, int stride_2) {
+  const ValueType *arr_1_start = arr_1, *arr_2_start = arr_2;
+
+  printf("Array 1:\n");
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      if (arr_1[col] != arr_2[col]) {
+        printf("*%3d", arr_1[col]);
+      } else {
+        printf("%4d", arr_1[col]);
+      }
+    }
+    printf("\n");
+    arr_1 += stride_1;
+    arr_2 += stride_2;
+  }
+
+  arr_1 = arr_1_start;
+  arr_2 = arr_2_start;
+
+  printf("Array 2:\n");
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      if (arr_1[col] != arr_2[col]) {
+        printf("*%3d", arr_2[col]);
+      } else {
+        printf("%4d", arr_2[col]);
+      }
+    }
+    printf("\n");
+    arr_1 += stride_1;
+    arr_2 += stride_2;
+  }
+
+  arr_1 = arr_1_start;
+  arr_2 = arr_2_start;
+  printf("Difference:\n");
+  for (int row = 0; row < height; ++row) {
+    for (int col = 0; col < width; ++col) {
+      printf("%4d", arr_1[col] - arr_2[col]);
+    }
+    printf("\n");
+    arr_1 += stride_1;
+    arr_2 += stride_2;
+  }
+}
+
+template <class PixelType>
+void ApplyReferenceFilter(const PixelType *y_src, const PixelType *y_pre,
+                          const PixelType *u_src, const PixelType *v_src,
+                          const PixelType *u_pre, const PixelType *v_pre,
+                          unsigned int block_width, unsigned int block_height,
+                          int ss_x, int ss_y, int strength,
+                          const int *const blk_fw, int use_32x32,
+                          uint32_t *y_accum, uint16_t *y_count,
+                          uint32_t *u_accum, uint16_t *u_count,
+                          uint32_t *v_accum, uint16_t *v_count) {
+  const int uv_block_width = block_width >> ss_x,
+            uv_block_height = block_height >> ss_y;
+  const int y_src_stride = block_width, y_pre_stride = block_width;
+  const int uv_src_stride = uv_block_width, uv_pre_stride = uv_block_width;
+  const int y_diff_stride = block_width, uv_diff_stride = uv_block_width;
+  const int y_count_stride = block_width, u_count_stride = uv_block_width,
+            v_count_stride = uv_block_width;
+  const int y_accum_stride = block_width, u_accum_stride = uv_block_width,
+            v_accum_stride = uv_block_width;
+
+  int y_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  int u_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+  int v_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+
+  const int rounding = (1 << strength) >> 1;
+
+  // Get the square diffs
+  for (int row = 0; row < (int)block_height; row++) {
+    for (int col = 0; col < (int)block_width; col++) {
+
const int diff = + y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col]; + y_dif[row * y_diff_stride + col] = diff * diff; + } + } + + for (int row = 0; row < (int)uv_block_height; row++) { + for (int col = 0; col < (int)uv_block_width; col++) { + const int u_diff = + u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col]; + const int v_diff = + v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col]; + u_dif[row * uv_diff_stride + col] = u_diff * u_diff; + v_dif[row * uv_diff_stride + col] = v_diff * v_diff; + } + } + + // Apply the filter to luma + for (int row = 0; row < (int)block_height; row++) { + for (int col = 0; col < (int)block_width; col++) { + const int uv_row = row >> ss_y; + const int uv_col = col >> ss_x; + const int filter_weight = GetFilterWeight(row, col, block_height, + block_width, blk_fw, use_32x32); + + // First we get the modifier for the current y pixel + const int y_pixel = y_pre[row * y_pre_stride + col]; + int y_num_used = 0; + int y_mod = 0; + + // Sum the neighboring 3x3 y pixels + for (int row_step = -1; row_step <= 1; row_step++) { + for (int col_step = -1; col_step <= 1; col_step++) { + const int sub_row = row + row_step; + const int sub_col = col + col_step; + + if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 && + sub_col < (int)block_width) { + y_mod += y_dif[sub_row * y_diff_stride + sub_col]; + y_num_used++; + } + } + } + + // Sum the corresponding uv pixels to the current y modifier + // Note we are rounding down instead of rounding to the nearest pixel. + y_mod += u_dif[uv_row * uv_diff_stride + uv_col]; + y_mod += v_dif[uv_row * uv_diff_stride + uv_col]; + + y_num_used += 2; + + // Set the modifier + y_mod = GetModIndex(y_mod, y_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + y_count[row * y_count_stride + col] += y_mod; + y_accum[row * y_accum_stride + col] += y_mod * y_pixel; + } + } + + // Apply the filter to chroma + for (int uv_row = 0; uv_row < (int)uv_block_height; uv_row++) { + for (int uv_col = 0; uv_col < (int)uv_block_width; uv_col++) { + const int y_row = uv_row << ss_y; + const int y_col = uv_col << ss_x; + const int filter_weight = GetFilterWeight( + uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32); + + const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col]; + const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col]; + + int uv_num_used = 0; + int u_mod = 0, v_mod = 0; + + // Sum the neighboring 3x3 chromal pixels to the chroma modifier + for (int row_step = -1; row_step <= 1; row_step++) { + for (int col_step = -1; col_step <= 1; col_step++) { + const int sub_row = uv_row + row_step; + const int sub_col = uv_col + col_step; + + if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 && + sub_col < uv_block_width) { + u_mod += u_dif[sub_row * uv_diff_stride + sub_col]; + v_mod += v_dif[sub_row * uv_diff_stride + sub_col]; + uv_num_used++; + } + } + } + + // Sum all the luma pixels associated with the current luma pixel + for (int row_step = 0; row_step < 1 + ss_y; row_step++) { + for (int col_step = 0; col_step < 1 + ss_x; col_step++) { + const int sub_row = y_row + row_step; + const int sub_col = y_col + col_step; + const int y_diff = y_dif[sub_row * y_diff_stride + sub_col]; + + u_mod += y_diff; + v_mod += y_diff; + uv_num_used++; + } + } + + // Set the modifier + u_mod = GetModIndex(u_mod, uv_num_used, rounding, strength, + filter_weight); + v_mod = GetModIndex(v_mod, uv_num_used, rounding, strength, + filter_weight); + 
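      // Each mod value is a weight in [0, 16 * filter_weight]; the filter
+      // output is later normalized as roughly accum / count, so larger mod
+      // values pull the result toward the predicted pixel.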
+ // Accumulate the result + u_count[uv_row * u_count_stride + uv_col] += u_mod; + u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel; + v_count[uv_row * v_count_stride + uv_col] += v_mod; + v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel; + } + } +} + +class TemporalFilterYUVTest + : public ::testing::TestWithParam { + public: + virtual void SetUp() { + filter_func_ = GetParam().temporal_filter; + bd_ = GetParam().bd; + use_highbd_ = (bd_ != 8); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + saturate_test_ = 0; + num_repeats_ = 10; + + ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12); + } + + protected: + template + void CompareTestWithParam(int width, int height, int ss_x, int ss_y, + int filter_strength, int use_32x32, + const int *filter_weight); + template + void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y, + int filter_strength, int use_32x32, + const int *filter_weight); + template + void ApplyTestFilter(const PixelType *y_src, int y_src_stride, + const PixelType *y_pre, int y_pre_stride, + const PixelType *u_src, const PixelType *v_src, + int uv_src_stride, const PixelType *u_pre, + const PixelType *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, + int use_32x32, uint32_t *y_accum, uint16_t *y_count, + uint32_t *u_accumu, uint16_t *u_count, uint32_t *v_accum, + uint16_t *v_count); + + TemporalFilterYUVFunc filter_func_; + ACMRandom rnd_; + int saturate_test_; + int num_repeats_; + int use_highbd_; + int bd_; +}; + +template <> +void TemporalFilterYUVTest::ApplyTestFilter( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + (void)block_width; + (void)block_height; + (void)y_src_stride; + (void)uv_src_stride; + + assert(block_width == MAX_WIDTH && MAX_WIDTH == 32); + assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32); + const BLOCK_SIZE block_size = BLOCK_32X32; + const int num_planes = 3; + const int mb_pels = MAX_WIDTH * MAX_HEIGHT; + const int mb_row = 0; + const int mb_col = 0; + const int use_subblock = !(use_32x32); + + YV12_BUFFER_CONFIG *ref_frame = + (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG)); + ref_frame->strides[0] = y_pre_stride; + ref_frame->strides[1] = uv_pre_stride; + const int alloc_size = MAX_MB_PLANE * mb_pels; + DECLARE_ALIGNED(16, uint8_t, src[alloc_size]); + ref_frame->buffer_alloc = src; + ref_frame->buffers[0] = ref_frame->buffer_alloc + 0 * mb_pels; + ref_frame->buffers[1] = ref_frame->buffer_alloc + 1 * mb_pels; + ref_frame->buffers[2] = ref_frame->buffer_alloc + 2 * mb_pels; + ref_frame->flags = bd_ > 8 ? 
YV12_FLAG_HIGHBITDEPTH : 0; + + MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD)); + mbd->plane[0].subsampling_y = 0; + mbd->plane[0].subsampling_x = 0; + mbd->plane[1].subsampling_y = ss_y; + mbd->plane[1].subsampling_x = ss_x; + mbd->plane[2].subsampling_y = ss_y; + mbd->plane[2].subsampling_x = ss_x; + + DECLARE_ALIGNED(16, uint8_t, pred[alloc_size]); + DECLARE_ALIGNED(16, uint32_t, accum[alloc_size]); + DECLARE_ALIGNED(16, uint16_t, count[alloc_size]); + memcpy(src + 0 * mb_pels, y_src, mb_pels * sizeof(uint8_t)); + memcpy(src + 1 * mb_pels, u_src, mb_pels * sizeof(uint8_t)); + memcpy(src + 2 * mb_pels, v_src, mb_pels * sizeof(uint8_t)); + memcpy(pred + 0 * mb_pels, y_pre, mb_pels * sizeof(uint8_t)); + memcpy(pred + 1 * mb_pels, u_pre, mb_pels * sizeof(uint8_t)); + memcpy(pred + 2 * mb_pels, v_pre, mb_pels * sizeof(uint8_t)); + memcpy(accum + 0 * mb_pels, y_accum, mb_pels * sizeof(uint32_t)); + memcpy(accum + 1 * mb_pels, u_accum, mb_pels * sizeof(uint32_t)); + memcpy(accum + 2 * mb_pels, v_accum, mb_pels * sizeof(uint32_t)); + memcpy(count + 0 * mb_pels, y_count, mb_pels * sizeof(uint16_t)); + memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t)); + memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t)); + + ASM_REGISTER_STATE_CHECK( + filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + strength, use_subblock, blk_fw, pred, accum, count)); + + memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t)); + memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t)); + memcpy(v_accum, accum + 2 * mb_pels, mb_pels * sizeof(uint32_t)); + memcpy(y_count, count + 0 * mb_pels, mb_pels * sizeof(uint16_t)); + memcpy(u_count, count + 1 * mb_pels, mb_pels * sizeof(uint16_t)); + memcpy(v_count, count + 2 * mb_pels, mb_pels * sizeof(uint16_t)); + + free(ref_frame); + free(mbd); +} + +template <> +void TemporalFilterYUVTest::ApplyTestFilter( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + (void)block_width; + (void)block_height; + (void)y_src_stride; + (void)uv_src_stride; + + assert(block_width == MAX_WIDTH && MAX_WIDTH == 32); + assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32); + const BLOCK_SIZE block_size = BLOCK_32X32; + const int num_planes = 3; + const int mb_pels = MAX_WIDTH * MAX_HEIGHT; + const int mb_row = 0; + const int mb_col = 0; + const int use_subblock = !(use_32x32); + + YV12_BUFFER_CONFIG *ref_frame = + (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG)); + ref_frame->strides[0] = y_pre_stride; + ref_frame->strides[1] = uv_pre_stride; + const int alloc_size = MAX_MB_PLANE * mb_pels; + DECLARE_ALIGNED(16, uint16_t, src16[alloc_size]); + ref_frame->buffer_alloc = CONVERT_TO_BYTEPTR(src16); + ref_frame->buffers[0] = ref_frame->buffer_alloc + 0 * mb_pels; + ref_frame->buffers[1] = ref_frame->buffer_alloc + 1 * mb_pels; + ref_frame->buffers[2] = ref_frame->buffer_alloc + 2 * mb_pels; + ref_frame->flags = bd_ > 8 ? 
ref_frame->flags = bd_ > 8 ? YV12_FLAG_HIGHBITDEPTH : 0; + + MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD)); + mbd->plane[0].subsampling_y = 0; + mbd->plane[0].subsampling_x = 0; + mbd->plane[1].subsampling_y = ss_y; + mbd->plane[1].subsampling_x = ss_x; + mbd->plane[2].subsampling_y = ss_y; + mbd->plane[2].subsampling_x = ss_x; + + DECLARE_ALIGNED(16, uint16_t, pred16[alloc_size]); + DECLARE_ALIGNED(16, uint32_t, accum[alloc_size]); + DECLARE_ALIGNED(16, uint16_t, count[alloc_size]); + memcpy(src16 + 0 * mb_pels, y_src, mb_pels * sizeof(uint16_t)); + memcpy(src16 + 1 * mb_pels, u_src, mb_pels * sizeof(uint16_t)); + memcpy(src16 + 2 * mb_pels, v_src, mb_pels * sizeof(uint16_t)); + memcpy(pred16 + 0 * mb_pels, y_pre, mb_pels * sizeof(uint16_t)); + memcpy(pred16 + 1 * mb_pels, u_pre, mb_pels * sizeof(uint16_t)); + memcpy(pred16 + 2 * mb_pels, v_pre, mb_pels * sizeof(uint16_t)); + memcpy(accum + 0 * mb_pels, y_accum, mb_pels * sizeof(uint32_t)); + memcpy(accum + 1 * mb_pels, u_accum, mb_pels * sizeof(uint32_t)); + memcpy(accum + 2 * mb_pels, v_accum, mb_pels * sizeof(uint32_t)); + memcpy(count + 0 * mb_pels, y_count, mb_pels * sizeof(uint16_t)); + memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t)); + memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t)); + const uint8_t *pred = CONVERT_TO_BYTEPTR(pred16); + + ASM_REGISTER_STATE_CHECK( + filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes, + strength, use_subblock, blk_fw, pred, accum, count)); + + memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t)); + memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t)); + memcpy(v_accum, accum + 2 * mb_pels, mb_pels * sizeof(uint32_t)); + memcpy(y_count, count + 0 * mb_pels, mb_pels * sizeof(uint16_t)); + memcpy(u_count, count + 1 * mb_pels, mb_pels * sizeof(uint16_t)); + memcpy(v_count, count + 2 * mb_pels, mb_pels * sizeof(uint16_t)); + + free(ref_frame); + free(mbd); +} + +template <typename PixelType> +void TemporalFilterYUVTest::CompareTestWithParam(int width, int height, + int ss_x, int ss_y, + int filter_strength, + int use_32x32, + const int *filter_weight) { + const int uv_width = width >> ss_x, uv_height = height >> ss_y; + const int y_stride = width, uv_stride = uv_width; + + DECLARE_ALIGNED(16, PixelType, y_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, PixelType, y_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, y_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, y_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, y_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, y_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + + DECLARE_ALIGNED(16, PixelType, u_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, PixelType, u_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + + DECLARE_ALIGNED(16, PixelType, v_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, PixelType, v_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
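+ // The *_ref planes collect the scalar reference filter's output and the + // *_tst planes the output of the function under test; every iteration + // below compares them element by element.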
+ + for (int repeats = 0; repeats < num_repeats_; repeats++) { + if (saturate_test_) { + const int max_val = (1 << bd_) - 1; + SetArray(y_src, width, height, y_stride, max_val); + SetArray(y_pre, width, height, y_stride, 0); + SetArray(u_src, uv_width, uv_height, uv_stride, max_val); + SetArray(u_pre, uv_width, uv_height, uv_stride, 0); + SetArray(v_src, uv_width, uv_height, uv_stride, max_val); + SetArray(v_pre, uv_width, uv_height, uv_stride, 0); + } else { + const int max_val = 7 << (bd_ - 8); + SetArray(y_src, width, height, y_stride, &rnd_, 0, max_val); + SetArray(y_pre, width, height, y_stride, &rnd_, 0, max_val); + SetArray(u_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val); + SetArray(u_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val); + SetArray(v_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val); + SetArray(v_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val); + } + + ApplyReferenceFilter( + y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y, + filter_strength, filter_weight, use_32x32, y_accum_ref, y_count_ref, + u_accum_ref, u_count_ref, v_accum_ref, v_count_ref); + + ApplyTestFilter(y_src, y_stride, y_pre, y_stride, u_src, v_src, uv_stride, + u_pre, v_pre, uv_stride, width, height, ss_x, ss_y, + filter_strength, filter_weight, use_32x32, y_accum_tst, + y_count_tst, u_accum_tst, u_count_tst, v_accum_tst, + v_count_tst); + + EXPECT_TRUE(CheckArrayEqual(y_accum_tst, y_accum_ref, width, height, + y_stride, y_stride)); + EXPECT_TRUE(CheckArrayEqual(y_count_tst, y_count_ref, width, height, + y_stride, y_stride)); + EXPECT_TRUE(CheckArrayEqual(u_accum_tst, u_accum_ref, uv_width, uv_height, + uv_stride, uv_stride)); + EXPECT_TRUE(CheckArrayEqual(u_count_tst, u_count_ref, uv_width, uv_height, + uv_stride, uv_stride)); + EXPECT_TRUE(CheckArrayEqual(v_accum_tst, v_accum_ref, uv_width, uv_height, + uv_stride, uv_stride)); + EXPECT_TRUE(CheckArrayEqual(v_count_tst, v_count_ref, uv_width, uv_height, + uv_stride, uv_stride)); + + if (HasFailure()) { + if (use_32x32) { + printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y, + filter_strength, *filter_weight); + } else { + printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x, + ss_y, filter_strength, filter_weight[0], filter_weight[1], + filter_weight[2], filter_weight[3]); + } + + PrintArrayDiff(y_accum_ref, y_accum_tst, width, height, y_stride, + y_stride); + PrintArrayDiff(y_count_ref, y_count_tst, width, height, y_stride, + y_stride); + PrintArrayDiff(u_accum_ref, u_accum_tst, uv_width, uv_height, uv_stride, + uv_stride); + PrintArrayDiff(u_count_ref, u_count_tst, uv_width, uv_height, uv_stride, + uv_stride); + PrintArrayDiff(v_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride, + uv_stride); + PrintArrayDiff(v_count_ref, v_count_tst, uv_width, uv_height, uv_stride, + uv_stride); + + return; + } + } +} + +template <typename PixelType> +void TemporalFilterYUVTest::RunTestFilterWithParam(int width, int height, + int ss_x, int ss_y, + int filter_strength, + int use_32x32, + const int *filter_weight) { + PixelType y_src[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + PixelType y_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint16_t y_count[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint32_t y_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + + PixelType u_src[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + PixelType u_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint16_t u_count[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint32_t u_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 }; +
+ PixelType v_src[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + PixelType v_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint16_t v_count[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint32_t v_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + + SetArray(y_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8)); + SetArray(y_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8)); + SetArray(u_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8)); + SetArray(u_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8)); + SetArray(v_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8)); + SetArray(v_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8)); + + for (int repeats = 0; repeats < num_repeats_; repeats++) { + ApplyTestFilter(y_src, MAX_WIDTH, y_pre, MAX_WIDTH, u_src, v_src, MAX_WIDTH, + u_pre, v_pre, MAX_WIDTH, width, height, ss_x, ss_y, + filter_strength, filter_weight, use_32x32, y_accum, y_count, + u_accum, u_count, v_accum, v_count); + } +} + +TEST_P(TemporalFilterYUVTest, Use32x32) { + const int width = 32, height = 32; + const int use_32x32 = 1; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + for (int filter_weight = 0; filter_weight <= 2; filter_weight++) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + &filter_weight); + } else { + CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + &filter_weight); + } + ASSERT_FALSE(HasFailure()); + } + } + } + } +} + +TEST_P(TemporalFilterYUVTest, Use16x16) { + const int width = 32, height = 32; + const int use_32x32 = 0; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) { + // Set up the filter + int filter_weight[4]; + int filter_idx_cp = filter_idx; + for (int idx = 0; idx < 4; idx++) { + filter_weight[idx] = filter_idx_cp % 3; + filter_idx_cp /= 3; + } + + // Test each parameter + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + filter_weight); + } else { + CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } + + ASSERT_FALSE(HasFailure()); + } + } + } + } +} + +TEST_P(TemporalFilterYUVTest, SaturationTest) { + const int width = 32, height = 32; + const int use_32x32 = 1; + const int filter_weight = 1; + saturate_test_ = 1; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + &filter_weight); + } else { + CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + &filter_weight); + } + + ASSERT_FALSE(HasFailure()); + } + } + } +} + +TEST_P(TemporalFilterYUVTest, DISABLED_Speed) { + const int width = 32, height = 32; + num_repeats_ = 1000;
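+ // filter_idx below encodes four base-3 subblock weights, least-significant + // digit first; e.g. filter_idx = 5 decodes to blk_fw = {2, 1, 0, 0} since + // 5 = 2 + 1 * 3.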
+ + for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) { + const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3; + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_idx = 0; filter_idx < num_filter_weights; + filter_idx++) { + // Set up the filter + int filter_weight[4]; + int filter_idx_cp = filter_idx; + for (int idx = 0; idx < 4; idx++) { + filter_weight[idx] = filter_idx_cp % 3; + filter_idx_cp /= 3; + } + + // Test each parameter + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + if (use_highbd_) { + RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } else { + RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } + + aom_usec_timer_mark(&timer); + const int elapsed_time = + static_cast<int>(aom_usec_timer_elapsed(&timer)); + + printf( + "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: " + "%d, Strength: %d, Time: %5d\n", + bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength, + elapsed_time); + } + } + } + } + } +} + +INSTANTIATE_TEST_SUITE_P( + C, TemporalFilterYUVTest, + ::testing::Values( + TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 8), + TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 10), + TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 12))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, TemporalFilterYUVTest, + ::testing::Values( + TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 8), + TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 10), + TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 12))); +#endif // HAVE_SSE4_1 + +} // namespace diff --git a/libs/libaom/src/test/test-data.sha1 b/libs/libaom/src/test/test-data.sha1 new file mode 100644 index 000000000..383ae79c1 --- /dev/null +++ b/libs/libaom/src/test/test-data.sha1 @@ -0,0 +1,559 @@ +d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv +b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv +26b7f64399b84db4b4c9c915d743ec5c2619d4b9 *invalid-bug-1814.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-1814.ivf.res +09aa07e5325b3bb5462182eb30b8ecc914630740 *invalid-chromium-906381.ivf +09d2af8dd22201dd8d48e5dcfcaed281ff9422c7 *invalid-chromium-906381.ivf.res +f7c83c14aa35b928ba8b70f3eaa3b92070be4519 *invalid-google-142530197-1.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-google-142530197-1.ivf.res +703c05720d5d67053bcee44987635cd78af2f971 *invalid-google-142530197.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-google-142530197.ivf.res +fa06784f23751d8c37be94160fb821e855199af4 *invalid-oss-fuzz-10061.ivf +b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10061.ivf.res +c9e06c4c7fb7d69fd635a1f606a5e478d60e99cf *invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf +88e18e61bd2b7457b4c71ebefbdff0029c41cc04 *invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf.res +91a5bedeb4832c1c2900736cc0f644bb63971bbc *invalid-oss-fuzz-10227.ivf +b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10227.ivf.res +b2d0a29a65879436bf483d04865faca7d11cc2ee *invalid-oss-fuzz-10389.ivf +9655e6275888547ecd1f14e20e08ce4891372e76 *invalid-oss-fuzz-10389.ivf.res +e5fe0e8984c42d53d4ff734c3fbfd57d5c5c25cf *invalid-oss-fuzz-10389.ivf.res.2 +11df8e9a068669c678097d460b63609d3da73828 *invalid-oss-fuzz-10555.ivf +b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-10555.ivf.res +cf5945085fe85456a1f74bf4cc7998b88b3f4b62 *invalid-oss-fuzz-10705.ivf
+758671858368ffd2a2c0727898de5661f7cf7d68 *invalid-oss-fuzz-10705.ivf.res +88e29851122cca3f336824f7fa4d9f757f91110c *invalid-oss-fuzz-10723.ivf +1af486cd2cc83ebeddc76ca7a1c512cc0ec568d5 *invalid-oss-fuzz-10723.ivf.res +64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-10723.ivf.res.2 +0784acc8931090ec24eba752d6c27e359e68fe7d *invalid-oss-fuzz-10779.ivf +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-10779.ivf.res +7d37be9357f89a100ced694aee1ca5a6fad35ba9 *invalid-oss-fuzz-11477.ivf +15932651aacfc4622f0910f728f3f95e08e1753d *invalid-oss-fuzz-11477.ivf.res +1674787c38ddf82a2e5c804203f04f56a304e8e0 *invalid-oss-fuzz-11479.ivf +1af486cd2cc83ebeddc76ca7a1c512cc0ec568d5 *invalid-oss-fuzz-11479.ivf.res +64f8a208dec7f1580fbe0371aa15e62bb1262715 *invalid-oss-fuzz-11479.ivf.res.2 +b1a45514f0c59be03c9991cd04882426b9b930fa *invalid-oss-fuzz-11523.ivf +7c44ac1723c14d98bcb888fbf118c959511519ba *invalid-oss-fuzz-11523.ivf.res +3198c7af55a7d50173ce3c369c0cf2d9cdfface6 *invalid-oss-fuzz-11523.ivf.res.2 +cb445173be760c3554f1740ce4d119f57a7be043 *invalid-oss-fuzz-15363.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-15363.ivf.res +5b697360bf0f02de31bae9b8da78e93570958fa4 *invalid-oss-fuzz-16437.ivf +09d2af8dd22201dd8d48e5dcfcaed281ff9422c7 *invalid-oss-fuzz-16437.ivf.res +ccbe4081557eb44820a0e6337c4a094421826b9a *invalid-oss-fuzz-9288.ivf +67c54283fe1a26ccf02cc991e4f9a1eea3ac5e78 *invalid-oss-fuzz-9288.ivf.res +c0960f032484579f967881cc025b71cfd7a79ee1 *invalid-oss-fuzz-9463.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-9463.ivf.res +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-oss-fuzz-9463.ivf.res.2 +f448caf378e250b7eea4fa2d1c3cd7ef4a3211ce *invalid-oss-fuzz-9482.ivf +b055f06b9a95aaa5697fa26497b592a47843a7c8 *invalid-oss-fuzz-9482.ivf.res +a686989de79af89136f631fd630df639c7861851 *invalid-oss-fuzz-9720.ivf +d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-9720.ivf.res +a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m +0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m +ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m +c934da6fb8cc54ee2a8c17c54cf6076dac37ead0 *park_joy_90p_10_440.yuv +614c32ae1eca391e867c70d19974f0d62664dd99 *park_joy_90p_12_420.y4m +c92825f1ea25c5c37855083a69faac6ac4641a9e *park_joy_90p_12_422.y4m +b592189b885b6cc85db55cc98512a197d73d3b34 *park_joy_90p_12_444.y4m +82c1bfcca368c2f22bad7d693d690d5499ecdd11 *park_joy_90p_12_440.yuv +b9e1e90aece2be6e2c90d89e6ab2372d5f8c792d *park_joy_90p_8_420_a10-1.y4m +4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c *park_joy_90p_8_420.y4m +7a193ff7dfeb96ba5f82b2afd7afa9e1fe83d947 *park_joy_90p_8_422.y4m +bdb7856e6bc93599bdda05c2e773a9f22b6c6d03 *park_joy_90p_8_444.y4m +81e1f3843748438b8f2e71db484eb22daf72e939 *park_joy_90p_8_440.yuv +b1f1c3ec79114b9a0651af24ce634afb44a9a419 *rush_hour_444.y4m +eb438c6540eb429f74404eedfa3228d409c57874 *desktop_640_360_30.yuv +89e70ebd22c27d275fe14dc2f1a41841a6d8b9ab *kirland_640_480_30.yuv +33c533192759e5bb4f07abfbac389dc259db4686 *macmarcomoving_640_480_30.yuv +8bfaab121080821b8f03b23467911e59ec59b8fe *macmarcostationary_640_480_30.yuv +70894878d916a599842d9ad0dcd24e10c13e5467 *niklas_640_480_30.yuv +8784b6df2d8cc946195a90ac00540500d2e522e4 *tacomanarrows_640_480_30.yuv +edd86a1f5e62fd9da9a9d46078247759c2638009 *tacomasmallcameramovement_640_480_30.yuv +9a70e8b7d14fba9234d0e51dce876635413ce444 *thaloundeskmtg_640_480_30.yuv +e7d315dbf4f3928779e0dc624311196d44491d32 *niklas_1280_720_30.yuv 
+717da707afcaa1f692ff1946f291054eb75a4f06 *screendata.y4m +9cfc855459e7549fd015c79e8eca512b2f2cb7e3 *niklas_1280_720_30.y4m +5b5763b388b1b52a81bb82b39f7ec25c4bd3d0e1 *desktop_credits.y4m +36ddab9b99eb7545aa0bf362d6f498212d596516 *vase10x10.yuv +c542890ac929749000f7b3883174f2202070d834 *pixel_capture_w320h240.yuv +c2e1ec9936b95254187a359e94aa32a9f3dad1b7 *av1-1-b8-00-quantizer-00.ivf +26cd2a0321d01d9db5f6dace8b43a40cd5b9d58d *av1-1-b8-00-quantizer-00.ivf.md5 +a56dd02c0258d4afea1ee358a22b54e99e39d5e1 *av1-1-b8-00-quantizer-01.ivf +b3d24124d81f1fbb26f5eb0036accb54f3ec69b2 *av1-1-b8-00-quantizer-01.ivf.md5 +3466327cb842a91d69839b11ef930a74f086f4c6 *av1-1-b8-00-quantizer-02.ivf +c111dce946100efeaad34203080eee1d55464df6 *av1-1-b8-00-quantizer-02.ivf.md5 +d3f1f32de5e2c0c19a58bb8ef096108388c6a820 *av1-1-b8-00-quantizer-03.ivf +6265321b31130545b4454982ca93e412a56845b8 *av1-1-b8-00-quantizer-03.ivf.md5 +f37c393ebe73266a5ec8508a2ca33c586ff28e64 *av1-1-b8-00-quantizer-04.ivf +c6e979da71aecc593c0abb40135dd304152b00dd *av1-1-b8-00-quantizer-04.ivf.md5 +ac9c5e93cb19942a9be259d0567ec96c54dcdc7c *av1-1-b8-00-quantizer-05.ivf +49e35a7399568a0e4f015ce323d5a45ea780ca87 *av1-1-b8-00-quantizer-05.ivf.md5 +461142b1b50ae74c6b698d23f5ed3b764eadfb89 *av1-1-b8-00-quantizer-06.ivf +6477ff260624e0f76c94ac872d1e7d5576af4177 *av1-1-b8-00-quantizer-06.ivf.md5 +7f8113cd13d8faaa06fdbaaa50dc328daf037e6d *av1-1-b8-00-quantizer-07.ivf +b26795c6cb408487c20737977cd6b77311772bf7 *av1-1-b8-00-quantizer-07.ivf.md5 +4218f7945a172e1fe4f9e77ec35085a394eda9f4 *av1-1-b8-00-quantizer-08.ivf +ea5d7d501e9a69d805251e4871515d28468d8676 *av1-1-b8-00-quantizer-08.ivf.md5 +837f3bcadfe56cf302db2ebaf9a990446fb35801 *av1-1-b8-00-quantizer-09.ivf +eede995cdac5fd01a411da2e74e86e8394138be1 *av1-1-b8-00-quantizer-09.ivf.md5 +adc229b3780a4968c18ded1bcbe72e3f04643833 *av1-1-b8-00-quantizer-10.ivf +0799b7e54e54ee97bf0e8aad2b75509ce59c7097 *av1-1-b8-00-quantizer-10.ivf.md5 +44bac8247160a8d9a0ab19f890fc89cc9298de1d *av1-1-b8-00-quantizer-11.ivf +cc6b2bf167e114599b242aba574e8c6f1fa2f047 *av1-1-b8-00-quantizer-11.ivf.md5 +ebb3af7dfc15567188bcb617021cdc95ebc560e3 *av1-1-b8-00-quantizer-12.ivf +b716ae29d56cd0c052dbfa1b5dcf850cd0fa8ca7 *av1-1-b8-00-quantizer-12.ivf.md5 +46159641f981a26fb9c374a5ca41e44f0ce0a9f0 *av1-1-b8-00-quantizer-13.ivf +c6db1b8b4a74f83e4a0647e053cea0fc00f6abab *av1-1-b8-00-quantizer-13.ivf.md5 +fadc909d18eb640760fbb075f922fb050e715470 *av1-1-b8-00-quantizer-14.ivf +e36bb6b23273633ba3ef7d28160a7258840a1476 *av1-1-b8-00-quantizer-14.ivf.md5 +8befbd9cc1601dcd36ec6911613855f68e6fd40e *av1-1-b8-00-quantizer-15.ivf +cfc2334b76fb5e7aa9d8607e89d37cbc7716d62e *av1-1-b8-00-quantizer-15.ivf.md5 +ca42e00ae27c6b7f684fe3d2a787d50d2827cb3f *av1-1-b8-00-quantizer-16.ivf +f11278218a7c3c73cfaab2332bab55f06cedcc81 *av1-1-b8-00-quantizer-16.ivf.md5 +05270d365bdc067f9446eda3029a6f41571a5229 *av1-1-b8-00-quantizer-17.ivf +fb6482f35e7ad04bf231ea1806226760abcb3c26 *av1-1-b8-00-quantizer-17.ivf.md5 +617bc72037165efbff478d5a0d342b3c20ffcafd *av1-1-b8-00-quantizer-18.ivf +1ff68d5424f91322123fe0d58f436b8e49cfa99d *av1-1-b8-00-quantizer-18.ivf.md5 +821c3b1ae6054c7a91b2f64428806e57f1157ca6 *av1-1-b8-00-quantizer-19.ivf +f2fd118e786697553d6987f786660a2bb9f00680 *av1-1-b8-00-quantizer-19.ivf.md5 +48bcf17c27d9a4eb73632a68c09f42eff9f9af99 *av1-1-b8-00-quantizer-20.ivf +64d55e4c858414bc2837c9c3e2d5fb6d2208c4b8 *av1-1-b8-00-quantizer-20.ivf.md5 +d61ecdd4f0950bc5c8bae1270b22e711bdd22763 *av1-1-b8-00-quantizer-21.ivf +9d447938596096704fd5f4d41bcdf6fabf9cdfb9 
*av1-1-b8-00-quantizer-21.ivf.md5 +59b4b65d8e56ccdd1bddff26a03e991a63409334 *av1-1-b8-00-quantizer-22.ivf +aa1be0c7c7622d612af85f9bf96a212f6fe5ab56 *av1-1-b8-00-quantizer-22.ivf.md5 +95ed96988eb9916cad956db9b929718769de49f1 *av1-1-b8-00-quantizer-23.ivf +596b8a3aea468996d609624367465c412751f52b *av1-1-b8-00-quantizer-23.ivf.md5 +e6c2dc4ce725003152797b3d7b34d7eb34da50c8 *av1-1-b8-00-quantizer-24.ivf +1cd3d7e8b3813a9e5591b94eaeb72d471780e64a *av1-1-b8-00-quantizer-24.ivf.md5 +6734e353008824e523939d1a18daa3f2ab2d8ec6 *av1-1-b8-00-quantizer-25.ivf +c45cf440a05802c1f9e29472175ed397d130d988 *av1-1-b8-00-quantizer-25.ivf.md5 +3372b1c69fb39811156adcea4f6dba802c0918c2 *av1-1-b8-00-quantizer-26.ivf +b1751d55bb3fb788751fe28fb7434bee153bda68 *av1-1-b8-00-quantizer-26.ivf.md5 +e7ddb19a6e2a798d6a4e7dfdfc10b4df777b60e3 *av1-1-b8-00-quantizer-27.ivf +0e19d6b79cd71de69d03e0455349568af979b170 *av1-1-b8-00-quantizer-27.ivf.md5 +7f1c90a35543d6b673e353b3702baf3aa1caeaa7 *av1-1-b8-00-quantizer-28.ivf +d9a4f9cb88103249a05a7e6aa616bf0c16bf9c95 *av1-1-b8-00-quantizer-28.ivf.md5 +28d741b923011c7fcc50a7318256a638d3110a07 *av1-1-b8-00-quantizer-29.ivf +c68cacf2b2ff2694945a99ad836dcf1ee3961c09 *av1-1-b8-00-quantizer-29.ivf.md5 +9a5d9ea4bc76dd40d04e92f33f45e9c2e120e85d *av1-1-b8-00-quantizer-30.ivf +eb02bb8c16c4c0368ddff83e05e516e84ec9eaf3 *av1-1-b8-00-quantizer-30.ivf.md5 +20193c372f44f522e094c2c05fc7e4aaa0717fa8 *av1-1-b8-00-quantizer-31.ivf +a4c1a4ac332f4911f0d5abbd826ebecfb8432d6c *av1-1-b8-00-quantizer-31.ivf.md5 +9617bbd691f093d259dbc8a642a57a153c1fc00c *av1-1-b8-00-quantizer-32.ivf +73d60a348454b126ea6368ea604954bc23f210ae *av1-1-b8-00-quantizer-32.ivf.md5 +d9aea9d72a686c59b60584d827f60ca1ee8eee26 *av1-1-b8-00-quantizer-33.ivf +fbf64de376a63d2d3051da83b0e4e56579b55c0a *av1-1-b8-00-quantizer-33.ivf.md5 +791aaf067f125e5cf4a247cf06a2e29ab071ec90 *av1-1-b8-00-quantizer-34.ivf +8e2e6efe4c069e54844da19125c4280b95990c69 *av1-1-b8-00-quantizer-34.ivf.md5 +01ba67bba5cbf7c94c65da8f4c9bd6e7db24cf3a *av1-1-b8-00-quantizer-35.ivf +0c5e60704a4a6bd27e67b6fd72ca7d2cf7fff50f *av1-1-b8-00-quantizer-35.ivf.md5 +3e255b4a320c9522dcec539fef770b6920b9a102 *av1-1-b8-00-quantizer-36.ivf +1241aab865fd7b4bae73736cbeec1866ea9c90ec *av1-1-b8-00-quantizer-36.ivf.md5 +44fa6fca109747d8f43f6c6aa46d782e5d476d54 *av1-1-b8-00-quantizer-37.ivf +947f0f887c5ac9149cf85e8114a709d6f410fc32 *av1-1-b8-00-quantizer-37.ivf.md5 +8319ac1ddd6ce3279da5780175dff7a3a5fa1054 *av1-1-b8-00-quantizer-38.ivf +5f571b7f88678eab9e54f162cc9898f14e437770 *av1-1-b8-00-quantizer-38.ivf.md5 +5975e7056e17608593a8c40619b68e6576d373d9 *av1-1-b8-00-quantizer-39.ivf +7c870192d6eb70ce5367147a3d2c6a52e11f7bec *av1-1-b8-00-quantizer-39.ivf.md5 +47da942f1e455f1422fc65f06dd57304541d16ac *av1-1-b8-00-quantizer-40.ivf +6ea7116c9ce3a1641c7060bab2f5e06fd0910d61 *av1-1-b8-00-quantizer-40.ivf.md5 +ab35c15dfde21c2572b14e04dbfd5fac1adae449 *av1-1-b8-00-quantizer-41.ivf +19596f9849653b913186b9d6b7072984ede96177 *av1-1-b8-00-quantizer-41.ivf.md5 +23a5fa6c3d0eaffaf13f6402465f5dd33d8ea7f1 *av1-1-b8-00-quantizer-42.ivf +5a2726f0d1b1799d4f70883f1bfe5c9d976c6cf5 *av1-1-b8-00-quantizer-42.ivf.md5 +86cddfc463d2b186ec5a1aa25c4562c05201e3c3 *av1-1-b8-00-quantizer-43.ivf +674c64ec8487ee774ad09350380fa6ac43815807 *av1-1-b8-00-quantizer-43.ivf.md5 +6894c154eb56c4f3fe44d54fc4f9af468b03d175 *av1-1-b8-00-quantizer-44.ivf +eca679a2781eb894d18b3d578e3aaf4f48019a15 *av1-1-b8-00-quantizer-44.ivf.md5 +0960bf018ada4224b8344519cf091850d50a57bd *av1-1-b8-00-quantizer-45.ivf +291bb43b9e1ab167040b51019daf1ccf94fd1e50 
*av1-1-b8-00-quantizer-45.ivf.md5 +ea644a4732f1a2534332802c2fa5073344f3c356 *av1-1-b8-00-quantizer-46.ivf +4c7915382b1d6d08709c95525b04ab8830f20ca1 *av1-1-b8-00-quantizer-46.ivf.md5 +d1f8832d33234e2c74a2280090850153ea24ea82 *av1-1-b8-00-quantizer-47.ivf +90eb9959e612602934dcc512fe6f54abf0c88d9c *av1-1-b8-00-quantizer-47.ivf.md5 +69c93f760e8b666eb5b98f510e09d90f9230ac9b *av1-1-b8-00-quantizer-48.ivf +931f869e14bd455de9dac2101b383c29e7d6f04c *av1-1-b8-00-quantizer-48.ivf.md5 +8b660c577d95c031d6711c1134b8d115097f8d7e *av1-1-b8-00-quantizer-49.ivf +0e3fe8b49d497050dc1a0eac5f3ad60f5fe068fe *av1-1-b8-00-quantizer-49.ivf.md5 +d40bb21448a6da0fc9b88cbcf76d2f4226573acb *av1-1-b8-00-quantizer-50.ivf +bcd2a9c9a021ba44fc5dc74ae02194fe49ca76a4 *av1-1-b8-00-quantizer-50.ivf.md5 +3b5a1d464aa89b0f1a6ad4f5a03602292b826172 *av1-1-b8-00-quantizer-51.ivf +49bcde0c56cf8b7fbe429336981be22d39025b74 *av1-1-b8-00-quantizer-51.ivf.md5 +38970a02fb38ddb4954fe4240164cb75de5fc744 *av1-1-b8-00-quantizer-52.ivf +fd02b034d79d4be150efb02bd4349edfd0e41311 *av1-1-b8-00-quantizer-52.ivf.md5 +2fde7a7cf3014d5196d011c47de4a144227ed122 *av1-1-b8-00-quantizer-53.ivf +0cb66e6d8fbb29962a69ae1703e22da50db2c92b *av1-1-b8-00-quantizer-53.ivf.md5 +89a69e9b9a601e40cb491ac3a1d32491f2468ac8 *av1-1-b8-00-quantizer-54.ivf +2f8af51acc73c99b5af81db2bdd1883b611ad311 *av1-1-b8-00-quantizer-54.ivf.md5 +31ee4f56fcb0043e95fff7af49e4ef82aafa5543 *av1-1-b8-00-quantizer-55.ivf +04a7104e02bdd0fa38c118202dbbecdbd11ace02 *av1-1-b8-00-quantizer-55.ivf.md5 +f262f0b234006a2652fceb77b1a8711aa53abb54 *av1-1-b8-00-quantizer-56.ivf +bdd54dc25bc5a147c76163af0bced45c56435d79 *av1-1-b8-00-quantizer-56.ivf.md5 +1ef00617091db4b2b839de623bd6b4fb0b2f5f83 *av1-1-b8-00-quantizer-57.ivf +714c65363a87ed5e6e4ad75c79ddb6af57d41fd9 *av1-1-b8-00-quantizer-57.ivf.md5 +43c9b02feccbb3c709d96015f126b7e3d4c24c64 *av1-1-b8-00-quantizer-58.ivf +bae22b8d6377862bff8219470c0d87205d186a68 *av1-1-b8-00-quantizer-58.ivf.md5 +ca5f780abe4c02e48cceb9c804f3625723c359bf *av1-1-b8-00-quantizer-59.ivf +c60a20bbf60b0b0a442ef3f7b682979053909d6e *av1-1-b8-00-quantizer-59.ivf.md5 +1f6f047e9f0e1da22fb514370d92c3c7c66dcf89 *av1-1-b8-00-quantizer-60.ivf +86dc7fa59d363cf1ae4b027a57b119bda893c1c1 *av1-1-b8-00-quantizer-60.ivf.md5 +bcf0c3353568c47a043f2dc34c9abd3fc04eebd4 *av1-1-b8-00-quantizer-61.ivf +66fc4f729c5915aa19939d1b6e28e5b398e747bb *av1-1-b8-00-quantizer-61.ivf.md5 +ac8d3c54451b52cf557ef435d33e7638088d66df *av1-1-b8-00-quantizer-62.ivf +b57f4e1276ead626a3662339a86111ae6fda49d2 *av1-1-b8-00-quantizer-62.ivf.md5 +2a8aa33513d8e01ae9410c4bf5fe1e471b775482 *av1-1-b8-00-quantizer-63.ivf +9f646ec35a168f495e144c64ba7ce9aeb41cd0a2 *av1-1-b8-00-quantizer-63.ivf.md5 +838388fbda4a1d91be81ff62694c3bf13c460d38 *av1-1-b8-01-size-16x16.ivf +4229c1caf8e25eb3073456fb90ceed206753901e *av1-1-b8-01-size-16x16.ivf.md5 +23f4253bf71e02b2e8ead66da4b3de875e879ef2 *av1-1-b8-01-size-18x16.ivf +af125644436d4b6897dade68336cedad663b6610 *av1-1-b8-01-size-18x16.ivf.md5 +94e4a75bd93052f79998e9e08e6b5dd73dc27e50 *av1-1-b8-01-size-32x16.ivf +e7b3fbc5e4b2469838e7ae36512bd3ce0a81040c *av1-1-b8-01-size-32x16.ivf.md5 +f297bde01c05ec5c07ff8118a0280bd36c52b246 *av1-1-b8-01-size-34x16.ivf +f6bbd94d6063c689de3c7cf94afa2c68b969d12c *av1-1-b8-01-size-34x16.ivf.md5 +1e18bdf68bab7e7282aacc77e423bc7d93d04a8e *av1-1-b8-01-size-64x16.ivf +de75732fccfb385294b23c17f0f1a57b455edcf7 *av1-1-b8-01-size-64x16.ivf.md5 +26b1f6ae80b161e971468085778cc1ece502b330 *av1-1-b8-01-size-66x16.ivf +48bd99813557c314d398e6952da78da07c79d416 
*av1-1-b8-01-size-66x16.ivf.md5 +ff213ecf31b982a3a7f009c9739f64e066e1ffe9 *av1-1-b8-01-size-16x18.ivf +86b20a13b1939dc5f678e80491f190d376233d58 *av1-1-b8-01-size-16x18.ivf.md5 +c90bd878c59263a15c6a6f515d1c7e071f141559 *av1-1-b8-01-size-18x18.ivf +6f659036ffcd3dd380cf970cf1a06f7755e0b2de *av1-1-b8-01-size-18x18.ivf.md5 +e16a1411381b34817a4c0d8e5eeaeb8cddcc9c46 *av1-1-b8-01-size-32x18.ivf +fdb1c4ec56f5aa690eadbe897340fee86a06ae2f *av1-1-b8-01-size-32x18.ivf.md5 +fac7052b39bd2d0ae107e0e94050226712c770c2 *av1-1-b8-01-size-34x18.ivf +adb0d5a99228027eaa3b016963df447c9818c447 *av1-1-b8-01-size-34x18.ivf.md5 +b8be5e55d9be42746c2b547d0e26e80b21c9802a *av1-1-b8-01-size-64x18.ivf +8f8f6da34cdf78c5a6551c637e1afe279cc3884e *av1-1-b8-01-size-64x18.ivf.md5 +9e066bdcc2cd789cdf551bd4c9c85c178887b880 *av1-1-b8-01-size-66x18.ivf +e8ec6effa936423ae2eec2b60a3160720d2de912 *av1-1-b8-01-size-66x18.ivf.md5 +6ebe45085cdeebc2acd6da5abd542a59312c0ff4 *av1-1-b8-01-size-16x32.ivf +044695669103dbf158591dce9c649317a177d5f6 *av1-1-b8-01-size-16x32.ivf.md5 +9fabb4f60641b8c7995d1dc451419165d41258ff *av1-1-b8-01-size-18x32.ivf +7263764680dfec864c3fad5df824ab1973489a14 *av1-1-b8-01-size-18x32.ivf.md5 +3f72841a24a13e601d79cf029aa1fdb02970ce0b *av1-1-b8-01-size-32x32.ivf +bbe1ae2888d291ec6bc98cd0784937580c554103 *av1-1-b8-01-size-32x32.ivf.md5 +392131a7c7609acd0dba88fee14f1ed042d23ab1 *av1-1-b8-01-size-34x32.ivf +eea68165ebe9acd28693374bf2266374b9c77786 *av1-1-b8-01-size-34x32.ivf.md5 +78afdd96265811ab9466e906347b57161e5c010d *av1-1-b8-01-size-64x32.ivf +47b317af582700b67f6e77659db1dfaa26c8cde6 *av1-1-b8-01-size-64x32.ivf.md5 +2b4d01f2c9f23044c0d886482c7073bd4d5d37d1 *av1-1-b8-01-size-66x32.ivf +3ad5a58a0ee5086af370b22ab2b5b7592a4f33e7 *av1-1-b8-01-size-66x32.ivf.md5 +78ddae04eb8277ae605bd7017ad7ad27bfc82d39 *av1-1-b8-01-size-16x34.ivf +d0c18e679f1fc51e4f7409831321eed9c4858f6f *av1-1-b8-01-size-16x34.ivf.md5 +38d8ed885f46aead6ec1271d8a5d4aee79b8eb68 *av1-1-b8-01-size-18x34.ivf +097ddbd69b8f54826a35efeb0b8b07ec198bba6b *av1-1-b8-01-size-18x34.ivf.md5 +91a42720bc2e7ba701f4d97b463a098b6707cdbd *av1-1-b8-01-size-32x34.ivf +c590d43d37095bd2e8f8d12c9278477419b72d1a *av1-1-b8-01-size-32x34.ivf.md5 +4cc2a437dba56e8878113d9b390b980522542028 *av1-1-b8-01-size-34x34.ivf +57eeb971f00e64abde25be69dbcb4e3ce5065a57 *av1-1-b8-01-size-34x34.ivf.md5 +b36fee1b6ad69d1206466615d69c05e0a4407939 *av1-1-b8-01-size-64x34.ivf +a78aea0250d0b32657dc0eaf2d8394bc766c0e35 *av1-1-b8-01-size-64x34.ivf.md5 +10e441209262e082e31fef8c15b51579c9e81509 *av1-1-b8-01-size-66x34.ivf +558b46f6ef1662c208012d0b66d1857eeff3244e *av1-1-b8-01-size-66x34.ivf.md5 +dd44aad500c7ca0fc97e3d8f0abed3c83b24c79c *av1-1-b8-01-size-16x64.ivf +a5b64e8063abcf3e4872dc4baf1c32384dc5cf83 *av1-1-b8-01-size-16x64.ivf.md5 +aa849f0d09bcb2ead44719d63043536932d5c9f2 *av1-1-b8-01-size-18x64.ivf +bcdf2dea3590c7031158ffe7b907d9ee35e2fe57 *av1-1-b8-01-size-18x64.ivf.md5 +36e856d30e160ba2fbb00510296202f61afaae49 *av1-1-b8-01-size-32x64.ivf +99299f75b82c40c13f168adf2d124f57044a39a2 *av1-1-b8-01-size-32x64.ivf.md5 +e3e03ec5d38eb25e97e4ec3adc6ed40ecdebd278 *av1-1-b8-01-size-34x64.ivf +84625abf8a200a7d20dd3dd3b277b50b3d62ce32 *av1-1-b8-01-size-34x64.ivf.md5 +7d017daebef2d39ed42a505a8e6103ab0c0988c1 *av1-1-b8-01-size-64x64.ivf +1ff38d5ecba82fb2e6ac3b09c29c9fe74885ac29 *av1-1-b8-01-size-64x64.ivf.md5 +e1b58ba0b462508593399a2ed84db5f1c59ffcd2 *av1-1-b8-01-size-66x64.ivf +a6b2c84c94fe79ab0373d157d1203f8d66de0706 *av1-1-b8-01-size-66x64.ivf.md5 +7b4faa7eb7b73392b62de6613282a98dddc13bb6 
*av1-1-b8-01-size-16x66.ivf +a2dacf2bae3c4ab352af66a9600946d29ab9a6ee *av1-1-b8-01-size-16x66.ivf.md5 +0f97805fa30497d4cf39665150f00dfdea52d862 *av1-1-b8-01-size-18x66.ivf +33d8ea0765953250f998da3fe161f2a8cfca2353 *av1-1-b8-01-size-18x66.ivf.md5 +c8bb00256de973e3b3ee31b924f554336d310cdb *av1-1-b8-01-size-32x66.ivf +6a6588e6edc68ff7739968a9e7cc6d9eaaeed356 *av1-1-b8-01-size-32x66.ivf.md5 +75ec54fec5c36eecde6d0a16e0389a5f7ad8ec22 *av1-1-b8-01-size-34x66.ivf +36101dfa9495c18696c0d7d61f25e748f4de7425 *av1-1-b8-01-size-34x66.ivf.md5 +7e5491716e70f8199156b8843513c935667b281e *av1-1-b8-01-size-64x66.ivf +da38755bb0c9ef56b81617835ddf1340242c6dce *av1-1-b8-01-size-64x66.ivf.md5 +68b47b386f61d67cb5b824a7e6bf87c8b9c2bf7b *av1-1-b8-01-size-66x66.ivf +25974893956ebd92df474325946130c34f880ea7 *av1-1-b8-01-size-66x66.ivf.md5 +9f386d19c87dbfd6ac84a06d2393dd88863ac003 *av1-1-b8-01-size-196x196.ivf +788f77f655f55de3db94dd69870316134c149116 *av1-1-b8-01-size-196x196.ivf.md5 +ed3bb2bb52a9d1786e233ef38142b15b85097875 *av1-1-b8-01-size-198x196.ivf +3bb6b6721ad9b2838b2d07e47b29d6c0117526b1 *av1-1-b8-01-size-198x196.ivf.md5 +49461772caaaa7b824d48f4e9c77a906b0dc02d5 *av1-1-b8-01-size-200x196.ivf +f1cba00c36909c56097c8785df476d42bc91f259 *av1-1-b8-01-size-200x196.ivf.md5 +44a656a22958e26ed169a69deb8f373117224f06 *av1-1-b8-01-size-202x196.ivf +69be876b52fe42811bba52d36d0bcc88d6c25b3f *av1-1-b8-01-size-202x196.ivf.md5 +0a6fe9b478363faedbfd465a75790b4c2661b9ba *av1-1-b8-01-size-208x196.ivf +fc8e95a6860a8a37ccdf1dfe49828502fcf96a08 *av1-1-b8-01-size-208x196.ivf.md5 +8e05b5a20ec95afd92bb615a7daa2e17a7ef55a8 *av1-1-b8-01-size-210x196.ivf +0add512bffbda3300d8f684a53b13b996fe2e46d *av1-1-b8-01-size-210x196.ivf.md5 +a15f12652c6b4d0c30f13a439c941bfc4a431d1a *av1-1-b8-01-size-224x196.ivf +b904b93252175f79e0e2b28896131ce93d5fc925 *av1-1-b8-01-size-224x196.ivf.md5 +1a57b913443b267f4a31a6925c39f5b58022f550 *av1-1-b8-01-size-226x196.ivf +7cf3087de5804763a82d2a798243a66459664772 *av1-1-b8-01-size-226x196.ivf.md5 +2cc28541a2a72e8b45a368f71e70fc294e2de3ab *av1-1-b8-01-size-196x198.ivf +bb736eedb4bd1e39bf9d60435b4b27a12842e112 *av1-1-b8-01-size-196x198.ivf.md5 +c4ebf93fbf3ae52108fd7b39ddef3afae48188ea *av1-1-b8-01-size-198x198.ivf +fa4de6881511728bafa15b5f441a0cfdf683cc75 *av1-1-b8-01-size-198x198.ivf.md5 +55fce983186d454b0eb15527393bb2465ba41c6b *av1-1-b8-01-size-200x198.ivf +1ac8fb1ee622cbc4aa1b83cb46b4731c85efae62 *av1-1-b8-01-size-200x198.ivf.md5 +67d276c67886f0a91a7ee06751a64f95eeb7bc1f *av1-1-b8-01-size-202x198.ivf +1633b62d9e4ea41737c42f70cbde9a5671da0cef *av1-1-b8-01-size-202x198.ivf.md5 +081cb3f29d3956d4d858d9661fd3d62c94b68867 *av1-1-b8-01-size-208x198.ivf +871d1c99167408dd32fa7603a7296c9b99ccda15 *av1-1-b8-01-size-208x198.ivf.md5 +b2d80b42468d5f296ae240cfb1fc0b3dd3d96bbc *av1-1-b8-01-size-210x198.ivf +6a3382656cb17b532a97b1061697f9a878fc58d1 *av1-1-b8-01-size-210x198.ivf.md5 +84d7994fa20fcf6c1d8dbd4c2060c988a6fce831 *av1-1-b8-01-size-224x198.ivf +42ea12e15de81f2e8617b6de7bae76de2da4d648 *av1-1-b8-01-size-224x198.ivf.md5 +c74a9281cf98c597121df6bff0ac5312b887f969 *av1-1-b8-01-size-226x198.ivf +4133aae0001804e2bbc7928fc065517a6dd8b288 *av1-1-b8-01-size-226x198.ivf.md5 +27adbf148c63f807bd617cfd78aeaedb8b0f2304 *av1-1-b8-01-size-196x200.ivf +9253e525e6207ef1ce0839b8f88ea781e9abe41e *av1-1-b8-01-size-196x200.ivf.md5 +21c9ea4d882e48353d3df66fcde0e4746168163f *av1-1-b8-01-size-198x200.ivf +3d5ee59fde9194f0eaff736051cfd1d7b7daeff1 *av1-1-b8-01-size-198x200.ivf.md5 +c27b0b57667910847122a0309c703315e444110f 
*av1-1-b8-01-size-200x200.ivf +7b2a15a17b421ef07e285ca4e8a224f0512c434d *av1-1-b8-01-size-200x200.ivf.md5 +780de549e4163a52590f7c0f488e027a8a4aa053 *av1-1-b8-01-size-202x200.ivf +cb0ec0969522ca60d79a639e9b9509363468ffd0 *av1-1-b8-01-size-202x200.ivf.md5 +2c59821904863e264ae61401cbd494a79bc04f13 *av1-1-b8-01-size-208x200.ivf +9963955966a52b65cdd13465c9fb2ba3b5356755 *av1-1-b8-01-size-208x200.ivf.md5 +ff63121611ea9c0628c7e5af13de5e7786611ca6 *av1-1-b8-01-size-210x200.ivf +2a5993be234e3af2af6d185b2a6f3aaf1979b83a *av1-1-b8-01-size-210x200.ivf.md5 +b8485ada95440d78b51153227231b1aced1a8273 *av1-1-b8-01-size-224x200.ivf +9c3cd32ea6c006a91eb37d69dbeccf878de5d214 *av1-1-b8-01-size-224x200.ivf.md5 +1aa0ce3e3a74f9b600a146e98b05547a0b454c48 *av1-1-b8-01-size-226x200.ivf +e045be96c3af16a9ddc10a9933e8ddfb3319d716 *av1-1-b8-01-size-226x200.ivf.md5 +e92b76480f4339855d998b97182f36b28deadcfa *av1-1-b8-01-size-196x202.ivf +480c707abcd2a650e2160ec397f8348cecb45770 *av1-1-b8-01-size-196x202.ivf.md5 +137b9c0d10a3bdbdf6f97b3e6331f3e8acaf8f91 *av1-1-b8-01-size-198x202.ivf +7429642146d0da55161ab13024a261094ee2ce87 *av1-1-b8-01-size-198x202.ivf.md5 +9cea71c44ad015ac702d675bacca17876e65cb1a *av1-1-b8-01-size-200x202.ivf +76b1ec6c42da55f47e389a561590d1a7c713e495 *av1-1-b8-01-size-200x202.ivf.md5 +26dffdcd0dac9becf68d12e31fcd91eddf1f7154 *av1-1-b8-01-size-202x202.ivf +ddb75e99123fed4ef05d9b85200cefd8985bc84c *av1-1-b8-01-size-202x202.ivf.md5 +04007e83bb66ba547d09f8926ea5bfc7fd9e4b2a *av1-1-b8-01-size-208x202.ivf +5b72eb58db22087ad416c499119f41e718395b52 *av1-1-b8-01-size-208x202.ivf.md5 +721ff7c0ae0e2ed896b5acac230113f1404e769c *av1-1-b8-01-size-210x202.ivf +187d2ef939fc26e1a1c7de65abe8e058d8aae17a *av1-1-b8-01-size-210x202.ivf.md5 +dba41421cc938bcf0234254f96be0325ab66186e *av1-1-b8-01-size-224x202.ivf +58856038c1eb13a7bf0353a30b1affe844cd31b1 *av1-1-b8-01-size-224x202.ivf.md5 +55eba14878d25dcc351ee5e92fa06e559035b409 *av1-1-b8-01-size-226x202.ivf +e295b3d791d40d7c1fff2c40a260078dccaef24a *av1-1-b8-01-size-226x202.ivf.md5 +6c777223990ddfd92040a8526646ed0f39299b0d *av1-1-b8-01-size-196x208.ivf +5210daff766cddaf3945610ee05ff242aef8175a *av1-1-b8-01-size-196x208.ivf.md5 +252831abfb9f4a9a8556c21cc3bf60adfe88210f *av1-1-b8-01-size-198x208.ivf +35ed9601e608a829980cec81e41b7bd3e5f4c2ce *av1-1-b8-01-size-198x208.ivf.md5 +e800ed893a88704a4576d4984957f3664560daa9 *av1-1-b8-01-size-200x208.ivf +82c038f9072a2fcf8d55fb4a474fdd791ba9a290 *av1-1-b8-01-size-200x208.ivf.md5 +9ce7bb932dd99f86da8ff2ab89fa4d3089a78da8 *av1-1-b8-01-size-202x208.ivf +0611bf0179abe3c820a447a2bd3a04c3790f3a87 *av1-1-b8-01-size-202x208.ivf.md5 +e5900d9150c8bebc49776227afd3b0a21f5a6ac6 *av1-1-b8-01-size-208x208.ivf +86d6b9a3840aa0a77938547c905bd6f45d069681 *av1-1-b8-01-size-208x208.ivf.md5 +2758ba5dad16f4a91334f2ed07a4a037201bb873 *av1-1-b8-01-size-210x208.ivf +78453b1fda2ccc6f35e0d762567807757bcddb16 *av1-1-b8-01-size-210x208.ivf.md5 +fff88fb8e833f6b4ad64cb591b219c7cceb7f2d2 *av1-1-b8-01-size-224x208.ivf +87266fc34aaed82cdb98cbc309b221ad52eccd81 *av1-1-b8-01-size-224x208.ivf.md5 +dec839fe64046461015b56cda191835284f42a52 *av1-1-b8-01-size-226x208.ivf +d7a15264fc3fd55d3aec0ccfaa7c434c6d90969f *av1-1-b8-01-size-226x208.ivf.md5 +584782e93ed1cb7797a90fece44becdd1e23bf0d *av1-1-b8-01-size-196x210.ivf +ed76ec841b18a457853e368576967c4768fc2730 *av1-1-b8-01-size-196x210.ivf.md5 +dab625599b9f01398b593e865d9a4a95a029d60f *av1-1-b8-01-size-198x210.ivf +b90e8d96a1f5b329b088b467a11fed2d055d74ca *av1-1-b8-01-size-198x210.ivf.md5 +6774bee17b9e50d2d8630e2e1afc30ded67e662d 
*av1-1-b8-01-size-200x210.ivf +343a86bd54eb3dd5e9902eb62a3d776dcff2f4f3 *av1-1-b8-01-size-200x210.ivf.md5 +0456c3b8e242eeee019ca97d155f81124de62c90 *av1-1-b8-01-size-202x210.ivf +5a6a6428c9858a0d3561db42ceaf981c143fe479 *av1-1-b8-01-size-202x210.ivf.md5 +6a3a8f65bf806b1be7726b983427880f772c9986 *av1-1-b8-01-size-208x210.ivf +5563ea6d8c65887553ff3000addc6418913f1650 *av1-1-b8-01-size-208x210.ivf.md5 +5a8b69489f8e9b917ea7718ad2645101cdbe5644 *av1-1-b8-01-size-210x210.ivf +f4b01604036fa23000d44fbf42097ae1181bcd62 *av1-1-b8-01-size-210x210.ivf.md5 +fb6f5b08a048698cfe324557ee8cd840c4a3f6ce *av1-1-b8-01-size-224x210.ivf +3ce5c404e3ca09c8e994b3043bad42cd555b00c0 *av1-1-b8-01-size-224x210.ivf.md5 +2e9fc8510d2131b2f3c9a93bececac985e4426d2 *av1-1-b8-01-size-226x210.ivf +897c537e259331ca86cdd6e4d2bd343f8538402e *av1-1-b8-01-size-226x210.ivf.md5 +8300512106fce3424eb74b5d4bc0f4f19f7c9af8 *av1-1-b8-01-size-196x224.ivf +43662ea025ea79afe4964fd4d12a77f4aa4e565e *av1-1-b8-01-size-196x224.ivf.md5 +640f8fda7ade8f2850e2275a9f5e233e33a0ba8d *av1-1-b8-01-size-198x224.ivf +9ac690bdbbce47d7b169128b568f955e70076f8c *av1-1-b8-01-size-198x224.ivf.md5 +ce2e9379c72fc924e364d5727605394a1438a211 *av1-1-b8-01-size-200x224.ivf +1ec35a53d88072b96b255202f678178bc7e5bb20 *av1-1-b8-01-size-200x224.ivf.md5 +5d3af7921623deccb578115c8ce207c019f97f50 *av1-1-b8-01-size-202x224.ivf +14eafd55b0cda3a3476cae7ad500dbd5ee899dd5 *av1-1-b8-01-size-202x224.ivf.md5 +6b6d78e466cf94a5ef8dfe252caa0948dd2ec175 *av1-1-b8-01-size-208x224.ivf +e178b0c272dfcfe614c6b49cb28dad11781af0b6 *av1-1-b8-01-size-208x224.ivf.md5 +dd2232b9e18971d7e19650a1e3218aef1010247f *av1-1-b8-01-size-210x224.ivf +40a66198c47820f5fa2d2e389ec0c1191ea4ffcc *av1-1-b8-01-size-210x224.ivf.md5 +9ec028b81a5ea311683328d856f436e6d0b0e6a0 *av1-1-b8-01-size-224x224.ivf +143b9530ce722385db2c2d883daa649ed42b8d40 *av1-1-b8-01-size-224x224.ivf.md5 +bf833947e62935c54e1e727ccb36157f7c1e9e5d *av1-1-b8-01-size-226x224.ivf +ca4f3b44463106e4f0bb54e490c3bd457d7d780b *av1-1-b8-01-size-226x224.ivf.md5 +5525f7e312ec073f480ed5a2be5bdc4f0ce51a09 *av1-1-b8-01-size-196x226.ivf +062d4b240741184458d2d2abd243ed7877631de8 *av1-1-b8-01-size-196x226.ivf.md5 +e6b911142394b94c23191eaa63c9eb41a00f80b0 *av1-1-b8-01-size-198x226.ivf +3b580d903dddf47082f5e055bfb01a4f05c09b7d *av1-1-b8-01-size-198x226.ivf.md5 +70feb5efeb28df25f7d1a661c73bf013c5ada9b4 *av1-1-b8-01-size-200x226.ivf +f0b894e7f787e62f1492be62f3dedeb065062160 *av1-1-b8-01-size-200x226.ivf.md5 +7f9a10831e2389b31497fad50080b4d5452d6e91 *av1-1-b8-01-size-202x226.ivf +45b7194eba9367c8059403c23ca4ae49e988dfaf *av1-1-b8-01-size-202x226.ivf.md5 +967837a2cfbf9aa3131f73aec6a52dcdd82926c7 *av1-1-b8-01-size-208x226.ivf +c8baedb48fd5d4c956aa8d73fd957370f718f047 *av1-1-b8-01-size-208x226.ivf.md5 +9c926226b9f6b015501d8ac1e3f95e8570283a05 *av1-1-b8-01-size-210x226.ivf +57d4837667fd4c5a7aeb908626d701b632852c60 *av1-1-b8-01-size-210x226.ivf.md5 +25a4940922761239809d82c45c2be1c5e4f48785 *av1-1-b8-01-size-224x226.ivf +87ae7e7558241bf3575a333f56fbad4dfdade8ff *av1-1-b8-01-size-224x226.ivf.md5 +40dd208eb525cd90d7c0674cf787097fb909afae *av1-1-b8-01-size-226x226.ivf +34bdef682a4eae0e0a05e4486a968af1df8b220a *av1-1-b8-01-size-226x226.ivf.md5 +9bbe8499796aa588ff02e313fb0d4349940d2fea *av1-1-b10-00-quantizer-00.ivf +36b402eedad2bacee8ac09acce44e2fc356dd80b *av1-1-b10-00-quantizer-00.ivf.md5 +1d5e1d2827624f328020bf123df213bb175577e0 *av1-1-b10-00-quantizer-01.ivf +16c529be5502369e43ce9c6fe99a9709968e3daf *av1-1-b10-00-quantizer-01.ivf.md5 
+39abc20739242a8f05efd4b35d7603c8ad7ff45d *av1-1-b10-00-quantizer-02.ivf +81faa72c3d43b003966fe09ffaae51b07b1059be *av1-1-b10-00-quantizer-02.ivf.md5 +92ebf349b803333a43824a83d997b8cf76f656f9 *av1-1-b10-00-quantizer-03.ivf +5e7556dc998cb8b506a43cc078e30802d7e600e6 *av1-1-b10-00-quantizer-03.ivf.md5 +1c496177c66e49f2e3556af87ec67afb5060170b *av1-1-b10-00-quantizer-04.ivf +560fea4800a44fe19ed8d3e74f425bdbf1fb8abd *av1-1-b10-00-quantizer-04.ivf.md5 +7de864b8475ce0acd0ecb01827f2c9add815352b *av1-1-b10-00-quantizer-05.ivf +1c1aea3db3f54a91866d89fd3b1a0d285ca10310 *av1-1-b10-00-quantizer-05.ivf.md5 +b6501c165619b036d0f7864fd4739973d2d18970 *av1-1-b10-00-quantizer-06.ivf +d758c8eff275651006c41e7dd447cac13b489ad7 *av1-1-b10-00-quantizer-06.ivf.md5 +e4df6f588f156dffaafd9517b64f753cfc9ccf05 *av1-1-b10-00-quantizer-07.ivf +3c577f67dade4537de642fd457ea2b367424f336 *av1-1-b10-00-quantizer-07.ivf.md5 +07e9c4c18abb36c8699c1c12bebcc727f090b525 *av1-1-b10-00-quantizer-08.ivf +4981568ade3170f311cb114fa2689edc4bc35e67 *av1-1-b10-00-quantizer-08.ivf.md5 +2268ecd2899f1b41ae9898925b1d62cfefa30282 *av1-1-b10-00-quantizer-09.ivf +029b03029b65b7c4c208961f0820467ad42fd3d6 *av1-1-b10-00-quantizer-09.ivf.md5 +3d2adaf6441cfa9585dcbf7d19d65bf6992a29a3 *av1-1-b10-00-quantizer-10.ivf +017b7fb4c3ba0747c2d5688d493da33ef993d110 *av1-1-b10-00-quantizer-10.ivf.md5 +006535760bd7dc1cfc95e648b05215954a2e76c2 *av1-1-b10-00-quantizer-11.ivf +c0ae083deb8e820aa49034af4d100944dd977018 *av1-1-b10-00-quantizer-11.ivf.md5 +840e0cbfe1acc8a7a45c823dc55ab44a0b6b553e *av1-1-b10-00-quantizer-12.ivf +49232ea38bdef650c94808f53834f1137cd4bf39 *av1-1-b10-00-quantizer-12.ivf.md5 +04b0e5a7387e07474f51be4b2c3e05211b40f0d0 *av1-1-b10-00-quantizer-13.ivf +a51b5ec4b890df3a64f9f0d866b8c41296c9e081 *av1-1-b10-00-quantizer-13.ivf.md5 +5dc47a140fbcbf08bf91481ee3585e9e067561ab *av1-1-b10-00-quantizer-14.ivf +2625319eef69d6225e6ab6e5ce7790491406cb5d *av1-1-b10-00-quantizer-14.ivf.md5 +f866be86d8d8aa08ded30e42988b0936c1a16064 *av1-1-b10-00-quantizer-15.ivf +03b7c1eefb54d99e30051c7123c0453f04a6579d *av1-1-b10-00-quantizer-15.ivf.md5 +548df2371dfb485419ed9baf28e3f495c64f364a *av1-1-b10-00-quantizer-16.ivf +8a0d6bf1626b05b65c77331305414fe9be54e8c6 *av1-1-b10-00-quantizer-16.ivf.md5 +0077c82f96a2e095a3cb8de9bfa63715e3c9f438 *av1-1-b10-00-quantizer-17.ivf +5d85f77f3087f4b206930722a945c60039262be4 *av1-1-b10-00-quantizer-17.ivf.md5 +1e0f1245ecb4c903b5dc7072d959fc43a7bba381 *av1-1-b10-00-quantizer-18.ivf +06316ae2b45f2359a70cc3855ffd6ab81048b41a *av1-1-b10-00-quantizer-18.ivf.md5 +f197198f7ec058110185fda5297a1a43993654df *av1-1-b10-00-quantizer-19.ivf +bac522c7f234d506c75b5495d74b3fa57c83a4df *av1-1-b10-00-quantizer-19.ivf.md5 +c2f57324d000b349323f37d5ebebde8c2b861f30 *av1-1-b10-00-quantizer-20.ivf +999c6110786cbc25e67792234a5a02f2cb4553c7 *av1-1-b10-00-quantizer-20.ivf.md5 +2ffad9adfd19286fe2166ba877289d201c9a634f *av1-1-b10-00-quantizer-21.ivf +d55713eaa791cfd7bf69b6c26d5032029d9a0f06 *av1-1-b10-00-quantizer-21.ivf.md5 +382528db53328c1a38976f5d9b579eef35d839f4 *av1-1-b10-00-quantizer-22.ivf +cb5bd459e1a90126da9264cff4281515f95755b2 *av1-1-b10-00-quantizer-22.ivf.md5 +b52cc6160fc66f72ad66c198d275a1c73f925022 *av1-1-b10-00-quantizer-23.ivf +c0f9d6659e1f283e9356fd7b4ac9f7cc5544cdc2 *av1-1-b10-00-quantizer-23.ivf.md5 +e11f15e3b63e7606b1122bb3670ee77c09c04840 *av1-1-b10-00-quantizer-24.ivf +e9f141b924440e044270c81a68458fe498599a8e *av1-1-b10-00-quantizer-24.ivf.md5 +fb91793b69824c99b0218788dcea0a74ebd7e84e *av1-1-b10-00-quantizer-25.ivf 
+434e33d609b2683c3cfbcc3a2cdfc26339590fb6 *av1-1-b10-00-quantizer-25.ivf.md5 +d82e38f31cdcf8b43479e6ddaa83373de38f70a2 *av1-1-b10-00-quantizer-26.ivf +183943b851ba383a536f13c83b93f61ac8961ad5 *av1-1-b10-00-quantizer-26.ivf.md5 +6bf5e4e8e0aca699e493b9eb3672d2117494d74d *av1-1-b10-00-quantizer-27.ivf +f0fb7e0a99180828b0e38b2cfe0622eecc2d26b8 *av1-1-b10-00-quantizer-27.ivf.md5 +d5adee2567544c3ae4223b3f3528a770377878d2 *av1-1-b10-00-quantizer-28.ivf +14edf588efc67570e529b0ff8aeb8e7a0c69238b *av1-1-b10-00-quantizer-28.ivf.md5 +e6dcdc106847956035e3f00aabf4470f97e1887e *av1-1-b10-00-quantizer-29.ivf +413c5cb778611c7c1a810b53861b9ab1fb391f17 *av1-1-b10-00-quantizer-29.ivf.md5 +b5e98b3f6b1db04d46bf43064c6ac64f797aff00 *av1-1-b10-00-quantizer-30.ivf +d1a603661d76c28658c7cd2892b408e91d77893e *av1-1-b10-00-quantizer-30.ivf.md5 +80168371d1150e82e3f46bcbbcabba458b835b19 *av1-1-b10-00-quantizer-31.ivf +904ecd033d4af5239c4d5b3f86e51ed5c3c2e3fb *av1-1-b10-00-quantizer-31.ivf.md5 +96291f6ace85980892d135a5b74188cd629c325f *av1-1-b10-00-quantizer-32.ivf +a5ceace390d4a75d48281fe29060c21557e4f5ae *av1-1-b10-00-quantizer-32.ivf.md5 +0f80495de34eae07c4905b72573a315a879390ec *av1-1-b10-00-quantizer-33.ivf +72b8f662973a660412946687dff878b276ae518e *av1-1-b10-00-quantizer-33.ivf.md5 +24905e3be7db320994b7fb8311dfd50a7c9e54da *av1-1-b10-00-quantizer-34.ivf +cea514bb1b7b064c4d31914a2cb266611c278577 *av1-1-b10-00-quantizer-34.ivf.md5 +083012960dd7c17d3b00fa0e807759c98faded8f *av1-1-b10-00-quantizer-35.ivf +de5fdb9e1e581484af1cc7d2dd3c3e84c90cebb2 *av1-1-b10-00-quantizer-35.ivf.md5 +f725f179aeee5b413620c0dd81b007b245c2a7ed *av1-1-b10-00-quantizer-36.ivf +246b1931c04c02df1f168090e2650827cd5dbabd *av1-1-b10-00-quantizer-36.ivf.md5 +f6aa824156e9848f237481889a8103eb6130f31d *av1-1-b10-00-quantizer-37.ivf +a8f78dd15fc2994369a08c2ddddcd0760c62ea5b *av1-1-b10-00-quantizer-37.ivf.md5 +a8dd662338c493aea266b99203e70af25982633f *av1-1-b10-00-quantizer-38.ivf +09f36d998e85d0450060f540e50b075ae1432fc6 *av1-1-b10-00-quantizer-38.ivf.md5 +d97428871720ed658da6ed0e3f7c15da83387e4c *av1-1-b10-00-quantizer-39.ivf +8c5230048909ee8f86f87c116f153cd910d0141f *av1-1-b10-00-quantizer-39.ivf.md5 +86e754e55e9b63c6e0a4fef01761414f8a6b61ca *av1-1-b10-00-quantizer-40.ivf +99a71accf6457264e45ca80d3b1f082ee5acdecc *av1-1-b10-00-quantizer-40.ivf.md5 +9d18b7236506ab7e107c062620b64096ec0cf423 *av1-1-b10-00-quantizer-41.ivf +5771159a9a7c7b66c9e13bb13ec3d53b37860208 *av1-1-b10-00-quantizer-41.ivf.md5 +54b72bc879a80e66613f421e67db62bba1c0041b *av1-1-b10-00-quantizer-42.ivf +bf958236883ee7209ef4cb0b7503b430634a291e *av1-1-b10-00-quantizer-42.ivf.md5 +a06d5321a51d90404dd7085ae511d7df5d5e1e05 *av1-1-b10-00-quantizer-43.ivf +ddb25723d976043d863634b9dc3b5fb84a245803 *av1-1-b10-00-quantizer-43.ivf.md5 +2ea0b64c170d7299dae1c14a8a49349aee8e0d08 *av1-1-b10-00-quantizer-44.ivf +d18bde1b4893792173fa2014665e9364395ad5e9 *av1-1-b10-00-quantizer-44.ivf.md5 +73e506a32d3518e23424f231c7b5323d7a34a3d6 *av1-1-b10-00-quantizer-45.ivf +be6224ebc77a3e5fb9c1645b876007e584a09d89 *av1-1-b10-00-quantizer-45.ivf.md5 +841223871374464194edc739c48dc7cefd1ff255 *av1-1-b10-00-quantizer-46.ivf +4766d616f923496a8dc113c9b7f875f0c0735f9a *av1-1-b10-00-quantizer-46.ivf.md5 +8bbbbea130aaea453f7b826956a5520d10a0eccf *av1-1-b10-00-quantizer-47.ivf +3ea21fac0c492b03d8ec25e4ee0971cd57e5f71a *av1-1-b10-00-quantizer-47.ivf.md5 +3ce83e0f1e1835b9a6c10fe502a16fd3650839e0 *av1-1-b10-00-quantizer-48.ivf +b468de2c09fca5a6b2bb7a20bab4afd8d192c31d *av1-1-b10-00-quantizer-48.ivf.md5 
+f3a757c678aa00f9a9c4c4658d37733fd935925a *av1-1-b10-00-quantizer-49.ivf +f888dc88db576122695d4eb41c486aacd28a2d1d *av1-1-b10-00-quantizer-49.ivf.md5 +a9d78aaef105cc5a95b7ebb54783f37e75673123 *av1-1-b10-00-quantizer-50.ivf +06d0c5e79cc794030c4be022089b1d12c1383f71 *av1-1-b10-00-quantizer-50.ivf.md5 +165c20ee372f83682d094541097e375227353239 *av1-1-b10-00-quantizer-51.ivf +b3d90214b8c6e6f6d9357bb5784d10081325c356 *av1-1-b10-00-quantizer-51.ivf.md5 +5b3ea7a18654d943065f5c176974c3960b56664e *av1-1-b10-00-quantizer-52.ivf +dc61a6e4e2549074130023b14b137fb4fe442ce3 *av1-1-b10-00-quantizer-52.ivf.md5 +74c3b5851b6a94d33b575a689eb8d34592e95d5f *av1-1-b10-00-quantizer-53.ivf +a80e43a0fb2b852426bd941b8d4b8f56690e9bc9 *av1-1-b10-00-quantizer-53.ivf.md5 +d05b8dea2cddd4f0d9e792f42f71afbd29f7811c *av1-1-b10-00-quantizer-54.ivf +432937893321f4bd25fa400b8988c5788cb06ecf *av1-1-b10-00-quantizer-54.ivf.md5 +4eaee0f1970426be0bbeb7d4fccdc7e804e9bea4 *av1-1-b10-00-quantizer-55.ivf +710ab95ce1dcd2540db4477ff4ee6ab771fe0759 *av1-1-b10-00-quantizer-55.ivf.md5 +fe637930c9faa8744cba37effc4cb5510315d1c0 *av1-1-b10-00-quantizer-56.ivf +2f9431b30523fb6a3e4122f22c6c3ff7b96a7987 *av1-1-b10-00-quantizer-56.ivf.md5 +ed54fc7fcec194eef1f50adbbe12a6a36ab6836b *av1-1-b10-00-quantizer-57.ivf +43bccac7800b399210cf15520a83739c23a5d9c7 *av1-1-b10-00-quantizer-57.ivf.md5 +a7b8d628ba3e4c5f37aa6a3d7b82afda73ac89dc *av1-1-b10-00-quantizer-58.ivf +b26638272b787df54f45a46629b852acbcb73e3d *av1-1-b10-00-quantizer-58.ivf.md5 +c077f22ff547fb5ffd020e8dac91d05942fb52df *av1-1-b10-00-quantizer-59.ivf +4efd99cc0891bf345b8cd2ae8e21709d61be497b *av1-1-b10-00-quantizer-59.ivf.md5 +301ab53039d75e1ffa8cc6a0874d9ea94e4a6a0d *av1-1-b10-00-quantizer-60.ivf +4729bd734a6edd2d8d0432a3f66b3d91d565050e *av1-1-b10-00-quantizer-60.ivf.md5 +c78640d3211034df9fcb273bdfc18625819652f2 *av1-1-b10-00-quantizer-61.ivf +3d823eb2b33ccfea68db506626bcbecf49b0f167 *av1-1-b10-00-quantizer-61.ivf.md5 +bf241a449a28773b93e6e529a06dfc28109577e4 *av1-1-b10-00-quantizer-62.ivf +75457d8476f1927f737d089dcf3d0f7f99f3c4fb *av1-1-b10-00-quantizer-62.ivf.md5 +8b6eb3fff2e0db7eac775b08c745250ca591e2d9 *av1-1-b10-00-quantizer-63.ivf +63ea689d025593e5d91760785b8e446d04d4671e *av1-1-b10-00-quantizer-63.ivf.md5 +a9f7ea6312a533cc6426a6145edd190d45813c37 *av1-1-b8-02-allintra.ivf +8fd8f789cfee1069d20f3e2c241f5cad7292239e *av1-1-b8-02-allintra.ivf.md5 +e69e41fee40b408b6eebcc79f266a95f2ee24f9e *av1-1-b8-03-sizedown.mkv +8c528fb3ccda959a29721566e132f730935ca32b *av1-1-b8-03-sizedown.mkv.md5 +1889da5ee1708007e47bb887470ac477e1d7ba01 *av1-1-b8-03-sizeup.mkv +8de81b170635d456602dc8923a8b39c534d01fa8 *av1-1-b8-03-sizeup.mkv.md5 +d3ed7de0aa8c155fe35e0f5f4203240710d31383 *park_joy_90p_8_420_monochrome.y4m +5b3f0907407b809aa66b62cb080feda8c92454ca *park_joy_90p_8_420_vertical_csp.y4m +caf8b6a5f1a5bcb38afae8a54a08c4f4459aafa3 *vase10x10_tiles.txt +e14825f50ff845b8a6932c64cb254007a0b5e3a1 *av1-1-b8-22-svc-L2T1.ivf +0f75f2ac44e61fc83be70c955410fa378e433237 *av1-1-b8-22-svc-L2T1.ivf.md5 +e94687eb0e90179b3800b6d5e11eb7e9bfb34eec *av1-1-b8-22-svc-L1T2.ivf +2bc12b16385ea14323bc79607fb8dfbd7edaf8ef *av1-1-b8-22-svc-L1T2.ivf.md5 +32ef2f14ee9cb11a24a22934f4c065e926e5d236 *av1-1-b8-22-svc-L2T2.ivf +f476a10ff06d750129f8229755d51e17ff141b2a *av1-1-b8-22-svc-L2T2.ivf.md5 +afca5502a489692b0a3c120370b0f43b8fc572a1 *av1-1-b8-04-cdfupdate.ivf +13b9423155a08d5e3a2fd9ae4a973bb046718cdf *av1-1-b8-04-cdfupdate.ivf.md5 +f064290d7fcd3b3de19020e8aec6c43c88d3a505 *av1-1-b8-05-mv.ivf +bff316e63ded5559116bdc2fa4aa97ad7b1a1761 
*av1-1-b8-05-mv.ivf.md5 +b48a717c7c003b8dd23c3c2caed1ac673380fdb3 *av1-1-b8-06-mfmv.ivf +1424e3cb53e00eb56b94f4c725826274212c42b6 *av1-1-b8-06-mfmv.ivf.md5 +f8724ed96272ddbc35776908f2df7cb9955766a9 *paris_352_288_30.y4m +11bb40026103182c23a88133edafca369e5575e2 *av1-1-b8-23-film_grain-50.ivf +c58ccf7ff04711acc559c06f0bfce3c5b14800c3 *av1-1-b8-23-film_grain-50.ivf.md5 +2f883c7e11c21a31f79bd9c809541be90b0c7c4a *av1-1-b10-23-film_grain-50.ivf +83f2094fca597ad38b4fd623b807de1774c53ffb *av1-1-b10-23-film_grain-50.ivf.md5 diff --git a/libs/libaom/src/test/test.cmake b/libs/libaom/src/test/test.cmake new file mode 100644 index 000000000..d4d3b298d --- /dev/null +++ b/libs/libaom/src/test/test.cmake @@ -0,0 +1,471 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_TEST_TEST_CMAKE_) + return() +endif() # AOM_TEST_TEST_CMAKE_ +set(AOM_TEST_TEST_CMAKE_ 1) + +include(FindPythonInterp) +include(ProcessorCount) + +include("${AOM_ROOT}/test/test_data_util.cmake") + +set(AOM_UNIT_TEST_DATA_LIST_FILE "${AOM_ROOT}/test/test-data.sha1") + +list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c" + "${AOM_ROOT}/test/test_libaom.cc") + +list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/acm_random.h" + "${AOM_ROOT}/test/aom_integer_test.cc" + "${AOM_ROOT}/test/av1_config_test.cc" + "${AOM_ROOT}/test/blockd_test.cc" + "${AOM_ROOT}/test/clear_system_state.h" + "${AOM_ROOT}/test/codec_factory.h" + "${AOM_ROOT}/test/decode_test_driver.cc" + "${AOM_ROOT}/test/decode_test_driver.h" + "${AOM_ROOT}/test/function_equivalence_test.h" + "${AOM_ROOT}/test/log2_test.cc" + "${AOM_ROOT}/test/md5_helper.h" + "${AOM_ROOT}/test/metadata_test.cc" + "${AOM_ROOT}/test/register_state_check.h" + "${AOM_ROOT}/test/test_vectors.cc" + "${AOM_ROOT}/test/test_vectors.h" + "${AOM_ROOT}/test/transform_test_base.h" + "${AOM_ROOT}/test/util.h" + "${AOM_ROOT}/test/video_source.h") + +if(CONFIG_INTERNAL_STATS) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/hbd_metrics_test.cc") +endif() + +list(APPEND AOM_UNIT_TEST_DECODER_SOURCES "${AOM_ROOT}/test/decode_api_test.cc" + "${AOM_ROOT}/test/external_frame_buffer_test.cc" + "${AOM_ROOT}/test/invalid_file_test.cc" + "${AOM_ROOT}/test/test_vector_test.cc" + "${AOM_ROOT}/test/ivf_video_source.h") + +list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES + "${AOM_ROOT}/test/active_map_test.cc" + "${AOM_ROOT}/test/altref_test.cc" + "${AOM_ROOT}/test/aq_segment_test.cc" + "${AOM_ROOT}/test/borders_test.cc" + "${AOM_ROOT}/test/cpu_speed_test.cc" + "${AOM_ROOT}/test/datarate_test.cc" + "${AOM_ROOT}/test/datarate_test.h" + "${AOM_ROOT}/test/svc_datarate_test.cc" + "${AOM_ROOT}/test/encode_api_test.cc" + "${AOM_ROOT}/test/encode_test_driver.cc" + "${AOM_ROOT}/test/encode_test_driver.h" + "${AOM_ROOT}/test/end_to_end_test.cc" + "${AOM_ROOT}/test/fwd_kf_test.cc" + "${AOM_ROOT}/test/gf_pyr_height_test.cc" + "${AOM_ROOT}/test/rt_end_to_end_test.cc" + "${AOM_ROOT}/test/error_resilience_test.cc" + "${AOM_ROOT}/test/frame_size_tests.cc" + "${AOM_ROOT}/test/horz_superres_test.cc" + 
"${AOM_ROOT}/test/i420_video_source.h" + "${AOM_ROOT}/test/level_test.cc" + "${AOM_ROOT}/test/lossless_test.cc" + "${AOM_ROOT}/test/monochrome_test.cc" + "${AOM_ROOT}/test/qm_test.cc" + "${AOM_ROOT}/test/resize_test.cc" + "${AOM_ROOT}/test/scalability_test.cc" + "${AOM_ROOT}/test/y4m_test.cc" + "${AOM_ROOT}/test/y4m_video_source.h" + "${AOM_ROOT}/test/yuv_video_source.h" + "${AOM_ROOT}/test/time_stamp_test.cc") + +list(APPEND AOM_DECODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/decode_perf_test.cc") +list(APPEND AOM_ENCODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/encode_perf_test.cc") +list(APPEND AOM_UNIT_TEST_WEBM_SOURCES "${AOM_ROOT}/test/webm_video_source.h") +list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c" + "${AOM_ROOT}/test/test_intra_pred_speed.cc") + +if(NOT BUILD_SHARED_LIBS) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/av1_common_int_test.cc" + "${AOM_ROOT}/test/cdef_test.cc" + "${AOM_ROOT}/test/cfl_test.cc" + "${AOM_ROOT}/test/convolve_test.cc" + "${AOM_ROOT}/test/hiprec_convolve_test.cc" + "${AOM_ROOT}/test/hiprec_convolve_test_util.cc" + "${AOM_ROOT}/test/hiprec_convolve_test_util.h" + "${AOM_ROOT}/test/intrabc_test.cc" + "${AOM_ROOT}/test/intrapred_test.cc" + "${AOM_ROOT}/test/lpf_test.cc" + "${AOM_ROOT}/test/scan_test.cc" + "${AOM_ROOT}/test/selfguided_filter_test.cc" + "${AOM_ROOT}/test/simd_cmp_impl.h" + "${AOM_ROOT}/test/simd_impl.h") + + if(CONFIG_ACCOUNTING) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/accounting_test.cc") + endif() + + if(CONFIG_AV1_DECODER AND CONFIG_AV1_ENCODER) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc" + "${AOM_ROOT}/test/av1_ext_tile_test.cc" + "${AOM_ROOT}/test/binary_codes_test.cc" + "${AOM_ROOT}/test/boolcoder_test.cc" + "${AOM_ROOT}/test/cnn_test.cc" + "${AOM_ROOT}/test/coding_path_sync.cc" + "${AOM_ROOT}/test/decode_multithreaded_test.cc" + "${AOM_ROOT}/test/divu_small_test.cc" + "${AOM_ROOT}/test/dr_prediction_test.cc" + "${AOM_ROOT}/test/ec_test.cc" + "${AOM_ROOT}/test/ethread_test.cc" + "${AOM_ROOT}/test/film_grain_table_test.cc" + "${AOM_ROOT}/test/sb_multipass_test.cc" + "${AOM_ROOT}/test/segment_binarization_sync.cc" + "${AOM_ROOT}/test/superframe_test.cc" + "${AOM_ROOT}/test/tile_independence_test.cc" + "${AOM_ROOT}/test/temporal_filter_planewise_test.cc" + "${AOM_ROOT}/test/temporal_filter_yuv_test.cc") + if(CONFIG_REALTIME_ONLY) + list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/cnn_test.cc" + "${AOM_ROOT}/test/temporal_filter_yuv_test.cc") + endif() + if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/coding_path_sync.cc") + endif() + endif() + + list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON + "${AOM_ROOT}/test/simd_cmp_neon.cc") + if(HAVE_NEON) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/simd_neon_test.cc") + endif() + + list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/test/simd_cmp_sse2.cc") + if(HAVE_SSE2) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/simd_sse2_test.cc") + endif() + + list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/test/simd_cmp_ssse3.cc") + if(HAVE_SSSE3) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/simd_ssse3_test.cc") + endif() + + if(HAVE_SSE4) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/simd_sse4_test.cc") + endif() + + if(HAVE_SSE4_1) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/filterintra_test.cc") + endif() + + 
list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/test/simd_cmp_avx2.cc") + if(HAVE_AVX2) + list(APPEND AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/simd_avx2_test.cc") + endif() + + list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES + "${AOM_ROOT}/test/arf_freq_test.cc" + "${AOM_ROOT}/test/av1_convolve_2d_test.cc" + "${AOM_ROOT}/test/av1_convolve_2d_test_util.cc" + "${AOM_ROOT}/test/av1_convolve_2d_test_util.h" + "${AOM_ROOT}/test/av1_fwd_txfm1d_test.cc" + "${AOM_ROOT}/test/av1_fwd_txfm2d_test.cc" + "${AOM_ROOT}/test/av1_inv_txfm1d_test.cc" + "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc" + "${AOM_ROOT}/test/av1_nn_predict_test.cc" + "${AOM_ROOT}/test/av1_round_shift_array_test.cc" + "${AOM_ROOT}/test/av1_txfm_test.cc" + "${AOM_ROOT}/test/av1_txfm_test.h" + "${AOM_ROOT}/test/av1_wedge_utils_test.cc" + "${AOM_ROOT}/test/avg_test.cc" + "${AOM_ROOT}/test/blend_a64_mask_1d_test.cc" + "${AOM_ROOT}/test/blend_a64_mask_test.cc" + "${AOM_ROOT}/test/comp_avg_pred_test.cc" + "${AOM_ROOT}/test/comp_avg_pred_test.h" + "${AOM_ROOT}/test/comp_mask_variance_test.cc" + "${AOM_ROOT}/test/edge_detect_test.cc" + "${AOM_ROOT}/test/encodetxb_test.cc" + "${AOM_ROOT}/test/error_block_test.cc" + "${AOM_ROOT}/test/fft_test.cc" + "${AOM_ROOT}/test/fwht4x4_test.cc" + "${AOM_ROOT}/test/fdct4x4_test.cc" + "${AOM_ROOT}/test/hadamard_test.cc" + "${AOM_ROOT}/test/horver_correlation_test.cc" + "${AOM_ROOT}/test/masked_sad_test.cc" + "${AOM_ROOT}/test/masked_variance_test.cc" + "${AOM_ROOT}/test/motion_vector_test.cc" + "${AOM_ROOT}/test/noise_model_test.cc" + "${AOM_ROOT}/test/obmc_sad_test.cc" + "${AOM_ROOT}/test/obmc_variance_test.cc" + "${AOM_ROOT}/test/pickrst_test.cc" + "${AOM_ROOT}/test/quantize_func_test.cc" + "${AOM_ROOT}/test/sad_test.cc" + "${AOM_ROOT}/test/subtract_test.cc" + "${AOM_ROOT}/test/reconinter_test.cc" + "${AOM_ROOT}/test/sum_squares_test.cc" + "${AOM_ROOT}/test/variance_test.cc" + "${AOM_ROOT}/test/wiener_test.cc" + "${AOM_ROOT}/test/frame_error_test.cc" + "${AOM_ROOT}/test/warp_filter_test.cc" + "${AOM_ROOT}/test/warp_filter_test_util.cc" + "${AOM_ROOT}/test/warp_filter_test_util.h") + + list(APPEND AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/test/av1_highbd_iht_test.cc" + "${AOM_ROOT}/test/av1_quantize_test.cc" + "${AOM_ROOT}/test/corner_match_test.cc" + "${AOM_ROOT}/test/simd_cmp_sse4.cc") + + if(NOT CONFIG_AV1_HIGHBITDEPTH) + list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/test/av1_quantize_test.cc") + endif() + + if(NOT (HAVE_SSE2 OR HAVE_NEON)) + list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES + "${AOM_ROOT}/test/quantize_func_test.cc") + endif() + + if(HAVE_SSE4_1) + list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES + "${AOM_ROOT}/test/av1_convolve_scale_test.cc" + "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc" + "${AOM_ROOT}/test/intra_edge_test.cc") + + endif() + + if(HAVE_SSE4_2) + list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES "${AOM_ROOT}/test/hash_test.cc") + endif() + +endif() + +if(ENABLE_TESTS) + find_package(PythonInterp) + if(NOT PYTHONINTERP_FOUND) + message( + FATAL_ERROR "--- Unit tests require Python, rerun cmake with " + "-DENABLE_TESTS=0 to avoid this error, or install Python and " + "make sure it's in your PATH.") + endif() + + if(BUILD_SHARED_LIBS AND APPLE) # Silence an RPATH warning. 
+    set(CMAKE_MACOSX_RPATH 1)
+  endif()
+
+  include_directories(
+    "${AOM_ROOT}/third_party/googletest/src/googletest/include")
+
+  include_directories("${AOM_ROOT}/third_party/googletest/src/googletest")
+  add_library(
+    aom_gtest STATIC
+    "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
+  if(MSVC OR WIN32)
+    target_compile_definitions(aom_gtest PRIVATE GTEST_OS_WINDOWS=1)
+  elseif(CONFIG_MULTITHREAD AND CMAKE_USE_PTHREADS_INIT)
+    target_compile_definitions(aom_gtest PRIVATE GTEST_HAS_PTHREAD=1)
+  else()
+    target_compile_definitions(aom_gtest PRIVATE GTEST_HAS_PTHREAD=0)
+  endif()
+endif()
+
+# Setup testdata download targets, test build targets, and test run targets.
+# The libaom and app util targets must exist before this function is called.
+function(setup_aom_test_targets)
+
+  # TODO(tomfinegan): Build speed optimization. $AOM_UNIT_TEST_COMMON_SOURCES
+  # and $AOM_UNIT_TEST_ENCODER_SOURCES are very large. The build of test
+  # targets could be sped up (on multicore build machines) by compiling
+  # sources in each list into separate object library targets, and then
+  # linking them into test_libaom.
+  add_library(test_aom_common OBJECT ${AOM_UNIT_TEST_COMMON_SOURCES})
+  add_dependencies(test_aom_common aom)
+
+  if(CONFIG_AV1_DECODER)
+    add_library(test_aom_decoder OBJECT ${AOM_UNIT_TEST_DECODER_SOURCES})
+    add_dependencies(test_aom_decoder aom)
+  endif()
+
+  if(CONFIG_AV1_ENCODER)
+    add_library(test_aom_encoder OBJECT ${AOM_UNIT_TEST_ENCODER_SOURCES})
+    add_dependencies(test_aom_encoder aom)
+  endif()
+
+  add_executable(test_libaom ${AOM_UNIT_TEST_WRAPPER_SOURCES}
+                 $<TARGET_OBJECTS:aom_common_app_util>
+                 $<TARGET_OBJECTS:test_aom_common>)
+  list(APPEND AOM_APP_TARGETS test_libaom)
+
+  if(CONFIG_AV1_DECODER)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:aom_decoder_app_util>
+                                       $<TARGET_OBJECTS:test_aom_decoder>)
+
+    if(ENABLE_DECODE_PERF_TESTS AND CONFIG_WEBM_IO)
+      target_sources(test_libaom PRIVATE ${AOM_DECODE_PERF_TEST_SOURCES})
+    endif()
+  endif()
+
+  if(CONFIG_AV1_ENCODER)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:aom_encoder_app_util>
+                                       $<TARGET_OBJECTS:test_aom_encoder>)
+
+    if(ENABLE_ENCODE_PERF_TESTS)
+      target_sources(test_libaom PRIVATE ${AOM_ENCODE_PERF_TEST_SOURCES})
+    endif()
+
+    if(NOT BUILD_SHARED_LIBS)
+      add_executable(test_intra_pred_speed
+                     ${AOM_TEST_INTRA_PRED_SPEED_SOURCES}
+                     $<TARGET_OBJECTS:aom_common_app_util>)
+      target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom
+                            aom_gtest)
+      list(APPEND AOM_APP_TARGETS test_intra_pred_speed)
+    endif()
+  endif()
+
+  target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom aom_gtest)
+
+  if(CONFIG_LIBYUV)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:yuv>)
+  endif()
+  if(CONFIG_WEBM_IO)
+    target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:webm>)
+  endif()
+  if(HAVE_SSE2)
+    add_intrinsics_source_to_target("-msse2" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_SSE2")
+  endif()
+  if(HAVE_SSSE3)
+    add_intrinsics_source_to_target("-mssse3" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_SSSE3")
+  endif()
+  if(HAVE_SSE4_1)
+    add_intrinsics_source_to_target("-msse4.1" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_SSE4_1")
+    if(CONFIG_AV1_ENCODER)
+      if(AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1)
+        add_intrinsics_source_to_target("-msse4.1" "test_libaom"
+                                        "AOM_UNIT_TEST_ENCODER_INTRIN_SSE4_1")
+      endif()
+    endif()
+  endif()
+  if(HAVE_AVX2)
+    add_intrinsics_source_to_target("-mavx2" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_AVX2")
+  endif()
+  if(HAVE_NEON)
+    add_intrinsics_source_to_target("${AOM_NEON_INTRIN_FLAG}" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_NEON")
+  endif()
+
+  if(ENABLE_TESTDATA)
+    make_test_data_lists("${AOM_UNIT_TEST_DATA_LIST_FILE}" test_files
+                         test_file_checksums)
+    list(LENGTH test_files num_test_files)
+    list(LENGTH test_file_checksums num_test_file_checksums)
+
+    math(EXPR max_file_index "${num_test_files} - 1")
+    foreach(test_index RANGE ${max_file_index})
+      list(GET test_files ${test_index} test_file)
+      list(GET test_file_checksums ${test_index} test_file_checksum)
+      add_custom_target(
+        testdata_${test_index}
+        COMMAND ${CMAKE_COMMAND}
+                -DAOM_CONFIG_DIR="${AOM_CONFIG_DIR}" -DAOM_ROOT="${AOM_ROOT}"
+                -DAOM_TEST_FILE="${test_file}"
+                -DAOM_TEST_CHECKSUM=${test_file_checksum} -P
+                "${AOM_ROOT}/test/test_data_download_worker.cmake")
+      list(APPEND testdata_targets testdata_${test_index})
+    endforeach()
+
+    # Create a custom build target for running each test data download target.
+    add_custom_target(testdata)
+    add_dependencies(testdata ${testdata_targets})
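
[Editor's note -- annotation, not part of the upstream patch.] Each testdata_<N> target above just re-runs CMake in script mode on test_data_download_worker.cmake, so building the aggregate `testdata` target fetches every file selected from test-data.sha1. A single file can be fetched by hand with a small driver script; in this sketch the two directory paths are placeholders, while the file name and checksum are copied from the test-data.sha1 hunk earlier in this patch:

  # fetch_one.cmake -- illustrative sketch only; run with: cmake -P fetch_one.cmake
  set(AOM_ROOT "/path/to/libs/libaom/src")   # placeholder, adjust locally
  set(AOM_CONFIG_DIR "/path/to/build")       # placeholder, adjust locally
  set(AOM_TEST_FILE "paris_352_288_30.y4m")  # entry taken from test-data.sha1
  set(AOM_TEST_CHECKSUM "f8724ed96272ddbc35776908f2df7cb9955766a9")
  # The worker validates the four variables above, honors
  # $LIBAOM_TEST_DATA_PATH, compares the local SHA1, and downloads only when
  # the file is missing or stale.
  include("${AOM_ROOT}/test/test_data_download_worker.cmake")
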
+
+    # Skip creation of test run targets when generating for Visual Studio and
+    # Xcode unless the user explicitly requests IDE test hosting. This is done
+    # to make build cycles in the IDE tolerable when the IDE command for build
+    # project is used to build AOM. Default behavior in IDEs is to build all
+    # targets, and the test run takes hours.
+    if(((NOT MSVC) AND (NOT XCODE)) OR ENABLE_IDE_TEST_HOSTING)
+
+      # Pick a reasonable number of targets (this controls parallelization).
+      processorcount(num_test_targets)
+      if(num_test_targets EQUAL 0) # Just default to 10 targets when there's no
+                                   # processor count available.
+        set(num_test_targets 10)
+      endif()
+
+      math(EXPR max_shard_index "${num_test_targets} - 1")
+      foreach(shard_index RANGE ${max_shard_index})
+        set(test_name "test_${shard_index}")
+        add_custom_target(${test_name}
+                          COMMAND ${CMAKE_COMMAND}
+                                  -DGTEST_SHARD_INDEX=${shard_index}
+                                  -DGTEST_TOTAL_SHARDS=${num_test_targets}
+                                  -DTEST_LIBAOM=$<TARGET_FILE:test_libaom> -P
+                                  "${AOM_ROOT}/test/test_runner.cmake"
+                          DEPENDS testdata test_libaom)
+        list(APPEND test_targets ${test_name})
+      endforeach()
+      add_custom_target(runtests)
+      add_dependencies(runtests ${test_targets})
+    endif()
+  endif()
+
+  # Collect all variables containing libaom test source files.
+  get_cmake_property(all_cmake_vars VARIABLES)
+  foreach(var ${all_cmake_vars})
+
+    # https://github.com/cheshirekow/cmake_format/issues/34
+    # cmake-format: off
+    if (("${var}" MATCHES "_TEST_" AND NOT
+         "${var}" MATCHES
+         "_DATA_\|_CMAKE_\|INTRA_PRED\|_COMPILED\|_HOSTING\|_PERF_\|CODER_")
+        OR (CONFIG_AV1_ENCODER AND ENABLE_ENCODE_PERF_TESTS AND
+            "${var}" MATCHES "_ENCODE_PERF_TEST_")
+        OR (CONFIG_AV1_DECODER AND ENABLE_DECODE_PERF_TESTS AND
+            "${var}" MATCHES "_DECODE_PERF_TEST_")
+        OR (CONFIG_AV1_ENCODER AND "${var}" MATCHES "_TEST_ENCODER_")
+        OR (CONFIG_AV1_DECODER AND "${var}" MATCHES "_TEST_DECODER_"))
+      list(APPEND aom_test_source_vars ${var})
+    endif()
+    # cmake-format: on
+  endforeach()
+
+  # Libaom_test_srcs.txt generation.
+  set(libaom_test_srcs_txt_file "${AOM_CONFIG_DIR}/libaom_test_srcs.txt")
+  file(WRITE "${libaom_test_srcs_txt_file}"
+       "# This file is generated. DO NOT EDIT.\n")
+
+  # Static source file list first.
+ foreach(aom_test_source_var ${aom_test_source_vars}) + foreach(file ${${aom_test_source_var}}) + if(NOT "${file}" MATCHES "${AOM_CONFIG_DIR}") + string(REPLACE "${AOM_ROOT}/" "" file "${file}") + file(APPEND "${libaom_test_srcs_txt_file}" "${file}\n") + endif() + endforeach() + endforeach() + + set(AOM_APP_TARGETS ${AOM_APP_TARGETS} PARENT_SCOPE) +endfunction() diff --git a/libs/libaom/src/test/test_data_download_worker.cmake b/libs/libaom/src/test/test_data_download_worker.cmake new file mode 100644 index 000000000..a49038888 --- /dev/null +++ b/libs/libaom/src/test/test_data_download_worker.cmake @@ -0,0 +1,46 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +include("${AOM_ROOT}/test/test_data_util.cmake") + +# https://github.com/cheshirekow/cmake_format/issues/34 +# cmake-format: off +if (NOT AOM_ROOT OR NOT AOM_CONFIG_DIR OR NOT AOM_TEST_FILE + OR NOT AOM_TEST_CHECKSUM) + message(FATAL_ERROR + "AOM_ROOT, AOM_CONFIG_DIR, AOM_TEST_FILE and AOM_TEST_CHECKSUM must be + defined.") +endif () +# cmake-format: on + +set(AOM_TEST_DATA_URL "https://storage.googleapis.com/aom-test-data") + +if(NOT AOM_TEST_DATA_PATH) + set(AOM_TEST_DATA_PATH "$ENV{LIBAOM_TEST_DATA_PATH}") +endif() + +if("${AOM_TEST_DATA_PATH}" STREQUAL "") + message( + WARNING "Writing test data to ${AOM_CONFIG_DIR}, set " + "$LIBAOM_TEST_DATA_PATH in your environment to avoid this warning.") + set(AOM_TEST_DATA_PATH "${AOM_CONFIG_DIR}") +endif() + +if(NOT EXISTS "${AOM_TEST_DATA_PATH}") + file(MAKE_DIRECTORY "${AOM_TEST_DATA_PATH}") +endif() + +expand_test_file_paths("AOM_TEST_FILE" "${AOM_TEST_DATA_PATH}" "filepath") +expand_test_file_paths("AOM_TEST_FILE" "${AOM_TEST_DATA_URL}" "url") + +check_file("${filepath}" "${AOM_TEST_CHECKSUM}" "needs_download") +if(needs_download) + download_test_file("${url}" "${AOM_TEST_CHECKSUM}" "${filepath}") +endif() diff --git a/libs/libaom/src/test/test_data_util.cmake b/libs/libaom/src/test/test_data_util.cmake new file mode 100644 index 000000000..050600e13 --- /dev/null +++ b/libs/libaom/src/test/test_data_util.cmake @@ -0,0 +1,650 @@ +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. 
+# + +list(APPEND AOM_TEST_DATA_FILE_NAMES + "hantro_collage_w352h288.yuv" + "hantro_odd.yuv" + "paris_352_288_30.y4m" + "park_joy_90p_10_420.y4m" + "park_joy_90p_10_422.y4m" + "park_joy_90p_10_444.y4m" + "park_joy_90p_12_420.y4m" + "park_joy_90p_12_422.y4m" + "park_joy_90p_12_444.y4m" + "park_joy_90p_8_420_a10-1.y4m" + "park_joy_90p_8_420.y4m" + "park_joy_90p_8_420_monochrome.y4m" + "park_joy_90p_8_420_vertical_csp.y4m" + "park_joy_90p_8_422.y4m" + "park_joy_90p_8_444.y4m" + "pixel_capture_w320h240.yuv" + "desktop_credits.y4m" + "niklas_1280_720_30.y4m" + "rush_hour_444.y4m" + "screendata.y4m" + "niklas_640_480_30.yuv" + "vase10x10.yuv" + "vase10x10_tiles.txt") + +if(ENABLE_DECODE_PERF_TESTS AND CONFIG_AV1_ENCODER) + list(APPEND AOM_TEST_DATA_FILE_NAMES "niklas_1280_720_30.yuv") +endif() + +if(CONFIG_AV1_DECODER) + list(APPEND AOM_TEST_DATA_FILE_NAMES + "av1-1-b8-00-quantizer-00.ivf" + "av1-1-b8-00-quantizer-00.ivf.md5" + "av1-1-b8-00-quantizer-01.ivf" + "av1-1-b8-00-quantizer-01.ivf.md5" + "av1-1-b8-00-quantizer-02.ivf" + "av1-1-b8-00-quantizer-02.ivf.md5" + "av1-1-b8-00-quantizer-03.ivf" + "av1-1-b8-00-quantizer-03.ivf.md5" + "av1-1-b8-00-quantizer-04.ivf" + "av1-1-b8-00-quantizer-04.ivf.md5" + "av1-1-b8-00-quantizer-05.ivf" + "av1-1-b8-00-quantizer-05.ivf.md5" + "av1-1-b8-00-quantizer-06.ivf" + "av1-1-b8-00-quantizer-06.ivf.md5" + "av1-1-b8-00-quantizer-07.ivf" + "av1-1-b8-00-quantizer-07.ivf.md5" + "av1-1-b8-00-quantizer-08.ivf" + "av1-1-b8-00-quantizer-08.ivf.md5" + "av1-1-b8-00-quantizer-09.ivf" + "av1-1-b8-00-quantizer-09.ivf.md5" + "av1-1-b8-00-quantizer-10.ivf" + "av1-1-b8-00-quantizer-10.ivf.md5" + "av1-1-b8-00-quantizer-11.ivf" + "av1-1-b8-00-quantizer-11.ivf.md5" + "av1-1-b8-00-quantizer-12.ivf" + "av1-1-b8-00-quantizer-12.ivf.md5" + "av1-1-b8-00-quantizer-13.ivf" + "av1-1-b8-00-quantizer-13.ivf.md5" + "av1-1-b8-00-quantizer-14.ivf" + "av1-1-b8-00-quantizer-14.ivf.md5" + "av1-1-b8-00-quantizer-15.ivf" + "av1-1-b8-00-quantizer-15.ivf.md5" + "av1-1-b8-00-quantizer-16.ivf" + "av1-1-b8-00-quantizer-16.ivf.md5" + "av1-1-b8-00-quantizer-17.ivf" + "av1-1-b8-00-quantizer-17.ivf.md5" + "av1-1-b8-00-quantizer-18.ivf" + "av1-1-b8-00-quantizer-18.ivf.md5" + "av1-1-b8-00-quantizer-19.ivf" + "av1-1-b8-00-quantizer-19.ivf.md5" + "av1-1-b8-00-quantizer-20.ivf" + "av1-1-b8-00-quantizer-20.ivf.md5" + "av1-1-b8-00-quantizer-21.ivf" + "av1-1-b8-00-quantizer-21.ivf.md5" + "av1-1-b8-00-quantizer-22.ivf" + "av1-1-b8-00-quantizer-22.ivf.md5" + "av1-1-b8-00-quantizer-23.ivf" + "av1-1-b8-00-quantizer-23.ivf.md5" + "av1-1-b8-00-quantizer-24.ivf" + "av1-1-b8-00-quantizer-24.ivf.md5" + "av1-1-b8-00-quantizer-25.ivf" + "av1-1-b8-00-quantizer-25.ivf.md5" + "av1-1-b8-00-quantizer-26.ivf" + "av1-1-b8-00-quantizer-26.ivf.md5" + "av1-1-b8-00-quantizer-27.ivf" + "av1-1-b8-00-quantizer-27.ivf.md5" + "av1-1-b8-00-quantizer-28.ivf" + "av1-1-b8-00-quantizer-28.ivf.md5" + "av1-1-b8-00-quantizer-29.ivf" + "av1-1-b8-00-quantizer-29.ivf.md5" + "av1-1-b8-00-quantizer-30.ivf" + "av1-1-b8-00-quantizer-30.ivf.md5" + "av1-1-b8-00-quantizer-31.ivf" + "av1-1-b8-00-quantizer-31.ivf.md5" + "av1-1-b8-00-quantizer-32.ivf" + "av1-1-b8-00-quantizer-32.ivf.md5" + "av1-1-b8-00-quantizer-33.ivf" + "av1-1-b8-00-quantizer-33.ivf.md5" + "av1-1-b8-00-quantizer-34.ivf" + "av1-1-b8-00-quantizer-34.ivf.md5" + "av1-1-b8-00-quantizer-35.ivf" + "av1-1-b8-00-quantizer-35.ivf.md5" + "av1-1-b8-00-quantizer-36.ivf" + "av1-1-b8-00-quantizer-36.ivf.md5" + "av1-1-b8-00-quantizer-37.ivf" + "av1-1-b8-00-quantizer-37.ivf.md5" + 
"av1-1-b8-00-quantizer-38.ivf" + "av1-1-b8-00-quantizer-38.ivf.md5" + "av1-1-b8-00-quantizer-39.ivf" + "av1-1-b8-00-quantizer-39.ivf.md5" + "av1-1-b8-00-quantizer-40.ivf" + "av1-1-b8-00-quantizer-40.ivf.md5" + "av1-1-b8-00-quantizer-41.ivf" + "av1-1-b8-00-quantizer-41.ivf.md5" + "av1-1-b8-00-quantizer-42.ivf" + "av1-1-b8-00-quantizer-42.ivf.md5" + "av1-1-b8-00-quantizer-43.ivf" + "av1-1-b8-00-quantizer-43.ivf.md5" + "av1-1-b8-00-quantizer-44.ivf" + "av1-1-b8-00-quantizer-44.ivf.md5" + "av1-1-b8-00-quantizer-45.ivf" + "av1-1-b8-00-quantizer-45.ivf.md5" + "av1-1-b8-00-quantizer-46.ivf" + "av1-1-b8-00-quantizer-46.ivf.md5" + "av1-1-b8-00-quantizer-47.ivf" + "av1-1-b8-00-quantizer-47.ivf.md5" + "av1-1-b8-00-quantizer-48.ivf" + "av1-1-b8-00-quantizer-48.ivf.md5" + "av1-1-b8-00-quantizer-49.ivf" + "av1-1-b8-00-quantizer-49.ivf.md5" + "av1-1-b8-00-quantizer-50.ivf" + "av1-1-b8-00-quantizer-50.ivf.md5" + "av1-1-b8-00-quantizer-51.ivf" + "av1-1-b8-00-quantizer-51.ivf.md5" + "av1-1-b8-00-quantizer-52.ivf" + "av1-1-b8-00-quantizer-52.ivf.md5" + "av1-1-b8-00-quantizer-53.ivf" + "av1-1-b8-00-quantizer-53.ivf.md5" + "av1-1-b8-00-quantizer-54.ivf" + "av1-1-b8-00-quantizer-54.ivf.md5" + "av1-1-b8-00-quantizer-55.ivf" + "av1-1-b8-00-quantizer-55.ivf.md5" + "av1-1-b8-00-quantizer-56.ivf" + "av1-1-b8-00-quantizer-56.ivf.md5" + "av1-1-b8-00-quantizer-57.ivf" + "av1-1-b8-00-quantizer-57.ivf.md5" + "av1-1-b8-00-quantizer-58.ivf" + "av1-1-b8-00-quantizer-58.ivf.md5" + "av1-1-b8-00-quantizer-59.ivf" + "av1-1-b8-00-quantizer-59.ivf.md5" + "av1-1-b8-00-quantizer-60.ivf" + "av1-1-b8-00-quantizer-60.ivf.md5" + "av1-1-b8-00-quantizer-61.ivf" + "av1-1-b8-00-quantizer-61.ivf.md5" + "av1-1-b8-00-quantizer-62.ivf" + "av1-1-b8-00-quantizer-62.ivf.md5" + "av1-1-b8-00-quantizer-63.ivf" + "av1-1-b8-00-quantizer-63.ivf.md5" + "av1-1-b10-00-quantizer-00.ivf" + "av1-1-b10-00-quantizer-00.ivf.md5" + "av1-1-b10-00-quantizer-01.ivf" + "av1-1-b10-00-quantizer-01.ivf.md5" + "av1-1-b10-00-quantizer-02.ivf" + "av1-1-b10-00-quantizer-02.ivf.md5" + "av1-1-b10-00-quantizer-03.ivf" + "av1-1-b10-00-quantizer-03.ivf.md5" + "av1-1-b10-00-quantizer-04.ivf" + "av1-1-b10-00-quantizer-04.ivf.md5" + "av1-1-b10-00-quantizer-05.ivf" + "av1-1-b10-00-quantizer-05.ivf.md5" + "av1-1-b10-00-quantizer-06.ivf" + "av1-1-b10-00-quantizer-06.ivf.md5" + "av1-1-b10-00-quantizer-07.ivf" + "av1-1-b10-00-quantizer-07.ivf.md5" + "av1-1-b10-00-quantizer-08.ivf" + "av1-1-b10-00-quantizer-08.ivf.md5" + "av1-1-b10-00-quantizer-09.ivf" + "av1-1-b10-00-quantizer-09.ivf.md5" + "av1-1-b10-00-quantizer-10.ivf" + "av1-1-b10-00-quantizer-10.ivf.md5" + "av1-1-b10-00-quantizer-11.ivf" + "av1-1-b10-00-quantizer-11.ivf.md5" + "av1-1-b10-00-quantizer-12.ivf" + "av1-1-b10-00-quantizer-12.ivf.md5" + "av1-1-b10-00-quantizer-13.ivf" + "av1-1-b10-00-quantizer-13.ivf.md5" + "av1-1-b10-00-quantizer-14.ivf" + "av1-1-b10-00-quantizer-14.ivf.md5" + "av1-1-b10-00-quantizer-15.ivf" + "av1-1-b10-00-quantizer-15.ivf.md5" + "av1-1-b10-00-quantizer-16.ivf" + "av1-1-b10-00-quantizer-16.ivf.md5" + "av1-1-b10-00-quantizer-17.ivf" + "av1-1-b10-00-quantizer-17.ivf.md5" + "av1-1-b10-00-quantizer-18.ivf" + "av1-1-b10-00-quantizer-18.ivf.md5" + "av1-1-b10-00-quantizer-19.ivf" + "av1-1-b10-00-quantizer-19.ivf.md5" + "av1-1-b10-00-quantizer-20.ivf" + "av1-1-b10-00-quantizer-20.ivf.md5" + "av1-1-b10-00-quantizer-21.ivf" + "av1-1-b10-00-quantizer-21.ivf.md5" + "av1-1-b10-00-quantizer-22.ivf" + "av1-1-b10-00-quantizer-22.ivf.md5" + "av1-1-b10-00-quantizer-23.ivf" + "av1-1-b10-00-quantizer-23.ivf.md5" + 
"av1-1-b10-00-quantizer-24.ivf" + "av1-1-b10-00-quantizer-24.ivf.md5" + "av1-1-b10-00-quantizer-25.ivf" + "av1-1-b10-00-quantizer-25.ivf.md5" + "av1-1-b10-00-quantizer-26.ivf" + "av1-1-b10-00-quantizer-26.ivf.md5" + "av1-1-b10-00-quantizer-27.ivf" + "av1-1-b10-00-quantizer-27.ivf.md5" + "av1-1-b10-00-quantizer-28.ivf" + "av1-1-b10-00-quantizer-28.ivf.md5" + "av1-1-b10-00-quantizer-29.ivf" + "av1-1-b10-00-quantizer-29.ivf.md5" + "av1-1-b10-00-quantizer-30.ivf" + "av1-1-b10-00-quantizer-30.ivf.md5" + "av1-1-b10-00-quantizer-31.ivf" + "av1-1-b10-00-quantizer-31.ivf.md5" + "av1-1-b10-00-quantizer-32.ivf" + "av1-1-b10-00-quantizer-32.ivf.md5" + "av1-1-b10-00-quantizer-33.ivf" + "av1-1-b10-00-quantizer-33.ivf.md5" + "av1-1-b10-00-quantizer-34.ivf" + "av1-1-b10-00-quantizer-34.ivf.md5" + "av1-1-b10-00-quantizer-35.ivf" + "av1-1-b10-00-quantizer-35.ivf.md5" + "av1-1-b10-00-quantizer-36.ivf" + "av1-1-b10-00-quantizer-36.ivf.md5" + "av1-1-b10-00-quantizer-37.ivf" + "av1-1-b10-00-quantizer-37.ivf.md5" + "av1-1-b10-00-quantizer-38.ivf" + "av1-1-b10-00-quantizer-38.ivf.md5" + "av1-1-b10-00-quantizer-39.ivf" + "av1-1-b10-00-quantizer-39.ivf.md5" + "av1-1-b10-00-quantizer-40.ivf" + "av1-1-b10-00-quantizer-40.ivf.md5" + "av1-1-b10-00-quantizer-41.ivf" + "av1-1-b10-00-quantizer-41.ivf.md5" + "av1-1-b10-00-quantizer-42.ivf" + "av1-1-b10-00-quantizer-42.ivf.md5" + "av1-1-b10-00-quantizer-43.ivf" + "av1-1-b10-00-quantizer-43.ivf.md5" + "av1-1-b10-00-quantizer-44.ivf" + "av1-1-b10-00-quantizer-44.ivf.md5" + "av1-1-b10-00-quantizer-45.ivf" + "av1-1-b10-00-quantizer-45.ivf.md5" + "av1-1-b10-00-quantizer-46.ivf" + "av1-1-b10-00-quantizer-46.ivf.md5" + "av1-1-b10-00-quantizer-47.ivf" + "av1-1-b10-00-quantizer-47.ivf.md5" + "av1-1-b10-00-quantizer-48.ivf" + "av1-1-b10-00-quantizer-48.ivf.md5" + "av1-1-b10-00-quantizer-49.ivf" + "av1-1-b10-00-quantizer-49.ivf.md5" + "av1-1-b10-00-quantizer-50.ivf" + "av1-1-b10-00-quantizer-50.ivf.md5" + "av1-1-b10-00-quantizer-51.ivf" + "av1-1-b10-00-quantizer-51.ivf.md5" + "av1-1-b10-00-quantizer-52.ivf" + "av1-1-b10-00-quantizer-52.ivf.md5" + "av1-1-b10-00-quantizer-53.ivf" + "av1-1-b10-00-quantizer-53.ivf.md5" + "av1-1-b10-00-quantizer-54.ivf" + "av1-1-b10-00-quantizer-54.ivf.md5" + "av1-1-b10-00-quantizer-55.ivf" + "av1-1-b10-00-quantizer-55.ivf.md5" + "av1-1-b10-00-quantizer-56.ivf" + "av1-1-b10-00-quantizer-56.ivf.md5" + "av1-1-b10-00-quantizer-57.ivf" + "av1-1-b10-00-quantizer-57.ivf.md5" + "av1-1-b10-00-quantizer-58.ivf" + "av1-1-b10-00-quantizer-58.ivf.md5" + "av1-1-b10-00-quantizer-59.ivf" + "av1-1-b10-00-quantizer-59.ivf.md5" + "av1-1-b10-00-quantizer-60.ivf" + "av1-1-b10-00-quantizer-60.ivf.md5" + "av1-1-b10-00-quantizer-61.ivf" + "av1-1-b10-00-quantizer-61.ivf.md5" + "av1-1-b10-00-quantizer-62.ivf" + "av1-1-b10-00-quantizer-62.ivf.md5" + "av1-1-b10-00-quantizer-63.ivf" + "av1-1-b10-00-quantizer-63.ivf.md5" + "av1-1-b10-23-film_grain-50.ivf" + "av1-1-b10-23-film_grain-50.ivf.md5" + "av1-1-b8-01-size-16x16.ivf" + "av1-1-b8-01-size-16x16.ivf.md5" + "av1-1-b8-01-size-16x18.ivf" + "av1-1-b8-01-size-16x18.ivf.md5" + "av1-1-b8-01-size-16x32.ivf" + "av1-1-b8-01-size-16x32.ivf.md5" + "av1-1-b8-01-size-16x34.ivf" + "av1-1-b8-01-size-16x34.ivf.md5" + "av1-1-b8-01-size-16x64.ivf" + "av1-1-b8-01-size-16x64.ivf.md5" + "av1-1-b8-01-size-16x66.ivf" + "av1-1-b8-01-size-16x66.ivf.md5" + "av1-1-b8-01-size-18x16.ivf" + "av1-1-b8-01-size-18x16.ivf.md5" + "av1-1-b8-01-size-18x18.ivf" + "av1-1-b8-01-size-18x18.ivf.md5" + "av1-1-b8-01-size-18x32.ivf" + "av1-1-b8-01-size-18x32.ivf.md5" + 
"av1-1-b8-01-size-18x34.ivf" + "av1-1-b8-01-size-18x34.ivf.md5" + "av1-1-b8-01-size-18x64.ivf" + "av1-1-b8-01-size-18x64.ivf.md5" + "av1-1-b8-01-size-18x66.ivf" + "av1-1-b8-01-size-18x66.ivf.md5" + "av1-1-b8-01-size-196x196.ivf" + "av1-1-b8-01-size-196x196.ivf.md5" + "av1-1-b8-01-size-196x198.ivf" + "av1-1-b8-01-size-196x198.ivf.md5" + "av1-1-b8-01-size-196x200.ivf" + "av1-1-b8-01-size-196x200.ivf.md5" + "av1-1-b8-01-size-196x202.ivf" + "av1-1-b8-01-size-196x202.ivf.md5" + "av1-1-b8-01-size-196x208.ivf" + "av1-1-b8-01-size-196x208.ivf.md5" + "av1-1-b8-01-size-196x210.ivf" + "av1-1-b8-01-size-196x210.ivf.md5" + "av1-1-b8-01-size-196x224.ivf" + "av1-1-b8-01-size-196x224.ivf.md5" + "av1-1-b8-01-size-196x226.ivf" + "av1-1-b8-01-size-196x226.ivf.md5" + "av1-1-b8-01-size-198x196.ivf" + "av1-1-b8-01-size-198x196.ivf.md5" + "av1-1-b8-01-size-198x198.ivf" + "av1-1-b8-01-size-198x198.ivf.md5" + "av1-1-b8-01-size-198x200.ivf" + "av1-1-b8-01-size-198x200.ivf.md5" + "av1-1-b8-01-size-198x202.ivf" + "av1-1-b8-01-size-198x202.ivf.md5" + "av1-1-b8-01-size-198x208.ivf" + "av1-1-b8-01-size-198x208.ivf.md5" + "av1-1-b8-01-size-198x210.ivf" + "av1-1-b8-01-size-198x210.ivf.md5" + "av1-1-b8-01-size-198x224.ivf" + "av1-1-b8-01-size-198x224.ivf.md5" + "av1-1-b8-01-size-198x226.ivf" + "av1-1-b8-01-size-198x226.ivf.md5" + "av1-1-b8-01-size-200x196.ivf" + "av1-1-b8-01-size-200x196.ivf.md5" + "av1-1-b8-01-size-200x198.ivf" + "av1-1-b8-01-size-200x198.ivf.md5" + "av1-1-b8-01-size-200x200.ivf" + "av1-1-b8-01-size-200x200.ivf.md5" + "av1-1-b8-01-size-200x202.ivf" + "av1-1-b8-01-size-200x202.ivf.md5" + "av1-1-b8-01-size-200x208.ivf" + "av1-1-b8-01-size-200x208.ivf.md5" + "av1-1-b8-01-size-200x210.ivf" + "av1-1-b8-01-size-200x210.ivf.md5" + "av1-1-b8-01-size-200x224.ivf" + "av1-1-b8-01-size-200x224.ivf.md5" + "av1-1-b8-01-size-200x226.ivf" + "av1-1-b8-01-size-200x226.ivf.md5" + "av1-1-b8-01-size-202x196.ivf" + "av1-1-b8-01-size-202x196.ivf.md5" + "av1-1-b8-01-size-202x198.ivf" + "av1-1-b8-01-size-202x198.ivf.md5" + "av1-1-b8-01-size-202x200.ivf" + "av1-1-b8-01-size-202x200.ivf.md5" + "av1-1-b8-01-size-202x202.ivf" + "av1-1-b8-01-size-202x202.ivf.md5" + "av1-1-b8-01-size-202x208.ivf" + "av1-1-b8-01-size-202x208.ivf.md5" + "av1-1-b8-01-size-202x210.ivf" + "av1-1-b8-01-size-202x210.ivf.md5" + "av1-1-b8-01-size-202x224.ivf" + "av1-1-b8-01-size-202x224.ivf.md5" + "av1-1-b8-01-size-202x226.ivf" + "av1-1-b8-01-size-202x226.ivf.md5" + "av1-1-b8-01-size-208x196.ivf" + "av1-1-b8-01-size-208x196.ivf.md5" + "av1-1-b8-01-size-208x198.ivf" + "av1-1-b8-01-size-208x198.ivf.md5" + "av1-1-b8-01-size-208x200.ivf" + "av1-1-b8-01-size-208x200.ivf.md5" + "av1-1-b8-01-size-208x202.ivf" + "av1-1-b8-01-size-208x202.ivf.md5" + "av1-1-b8-01-size-208x208.ivf" + "av1-1-b8-01-size-208x208.ivf.md5" + "av1-1-b8-01-size-208x210.ivf" + "av1-1-b8-01-size-208x210.ivf.md5" + "av1-1-b8-01-size-208x224.ivf" + "av1-1-b8-01-size-208x224.ivf.md5" + "av1-1-b8-01-size-208x226.ivf" + "av1-1-b8-01-size-208x226.ivf.md5" + "av1-1-b8-01-size-210x196.ivf" + "av1-1-b8-01-size-210x196.ivf.md5" + "av1-1-b8-01-size-210x198.ivf" + "av1-1-b8-01-size-210x198.ivf.md5" + "av1-1-b8-01-size-210x200.ivf" + "av1-1-b8-01-size-210x200.ivf.md5" + "av1-1-b8-01-size-210x202.ivf" + "av1-1-b8-01-size-210x202.ivf.md5" + "av1-1-b8-01-size-210x208.ivf" + "av1-1-b8-01-size-210x208.ivf.md5" + "av1-1-b8-01-size-210x210.ivf" + "av1-1-b8-01-size-210x210.ivf.md5" + "av1-1-b8-01-size-210x224.ivf" + "av1-1-b8-01-size-210x224.ivf.md5" + "av1-1-b8-01-size-210x226.ivf" + 
"av1-1-b8-01-size-210x226.ivf.md5" + "av1-1-b8-01-size-224x196.ivf" + "av1-1-b8-01-size-224x196.ivf.md5" + "av1-1-b8-01-size-224x198.ivf" + "av1-1-b8-01-size-224x198.ivf.md5" + "av1-1-b8-01-size-224x200.ivf" + "av1-1-b8-01-size-224x200.ivf.md5" + "av1-1-b8-01-size-224x202.ivf" + "av1-1-b8-01-size-224x202.ivf.md5" + "av1-1-b8-01-size-224x208.ivf" + "av1-1-b8-01-size-224x208.ivf.md5" + "av1-1-b8-01-size-224x210.ivf" + "av1-1-b8-01-size-224x210.ivf.md5" + "av1-1-b8-01-size-224x224.ivf" + "av1-1-b8-01-size-224x224.ivf.md5" + "av1-1-b8-01-size-224x226.ivf" + "av1-1-b8-01-size-224x226.ivf.md5" + "av1-1-b8-01-size-226x196.ivf" + "av1-1-b8-01-size-226x196.ivf.md5" + "av1-1-b8-01-size-226x198.ivf" + "av1-1-b8-01-size-226x198.ivf.md5" + "av1-1-b8-01-size-226x200.ivf" + "av1-1-b8-01-size-226x200.ivf.md5" + "av1-1-b8-01-size-226x202.ivf" + "av1-1-b8-01-size-226x202.ivf.md5" + "av1-1-b8-01-size-226x208.ivf" + "av1-1-b8-01-size-226x208.ivf.md5" + "av1-1-b8-01-size-226x210.ivf" + "av1-1-b8-01-size-226x210.ivf.md5" + "av1-1-b8-01-size-226x224.ivf" + "av1-1-b8-01-size-226x224.ivf.md5" + "av1-1-b8-01-size-226x226.ivf" + "av1-1-b8-01-size-226x226.ivf.md5" + "av1-1-b8-01-size-32x16.ivf" + "av1-1-b8-01-size-32x16.ivf.md5" + "av1-1-b8-01-size-32x18.ivf" + "av1-1-b8-01-size-32x18.ivf.md5" + "av1-1-b8-01-size-32x32.ivf" + "av1-1-b8-01-size-32x32.ivf.md5" + "av1-1-b8-01-size-32x34.ivf" + "av1-1-b8-01-size-32x34.ivf.md5" + "av1-1-b8-01-size-32x64.ivf" + "av1-1-b8-01-size-32x64.ivf.md5" + "av1-1-b8-01-size-32x66.ivf" + "av1-1-b8-01-size-32x66.ivf.md5" + "av1-1-b8-01-size-34x16.ivf" + "av1-1-b8-01-size-34x16.ivf.md5" + "av1-1-b8-01-size-34x18.ivf" + "av1-1-b8-01-size-34x18.ivf.md5" + "av1-1-b8-01-size-34x32.ivf" + "av1-1-b8-01-size-34x32.ivf.md5" + "av1-1-b8-01-size-34x34.ivf" + "av1-1-b8-01-size-34x34.ivf.md5" + "av1-1-b8-01-size-34x64.ivf" + "av1-1-b8-01-size-34x64.ivf.md5" + "av1-1-b8-01-size-34x66.ivf" + "av1-1-b8-01-size-34x66.ivf.md5" + "av1-1-b8-01-size-64x16.ivf" + "av1-1-b8-01-size-64x16.ivf.md5" + "av1-1-b8-01-size-64x18.ivf" + "av1-1-b8-01-size-64x18.ivf.md5" + "av1-1-b8-01-size-64x32.ivf" + "av1-1-b8-01-size-64x32.ivf.md5" + "av1-1-b8-01-size-64x34.ivf" + "av1-1-b8-01-size-64x34.ivf.md5" + "av1-1-b8-01-size-64x64.ivf" + "av1-1-b8-01-size-64x64.ivf.md5" + "av1-1-b8-01-size-64x66.ivf" + "av1-1-b8-01-size-64x66.ivf.md5" + "av1-1-b8-01-size-66x16.ivf" + "av1-1-b8-01-size-66x16.ivf.md5" + "av1-1-b8-01-size-66x18.ivf" + "av1-1-b8-01-size-66x18.ivf.md5" + "av1-1-b8-01-size-66x32.ivf" + "av1-1-b8-01-size-66x32.ivf.md5" + "av1-1-b8-01-size-66x34.ivf" + "av1-1-b8-01-size-66x34.ivf.md5" + "av1-1-b8-01-size-66x64.ivf" + "av1-1-b8-01-size-66x64.ivf.md5" + "av1-1-b8-01-size-66x66.ivf" + "av1-1-b8-01-size-66x66.ivf.md5" + "av1-1-b8-02-allintra.ivf" + "av1-1-b8-02-allintra.ivf.md5" + "av1-1-b8-03-sizeup.mkv" + "av1-1-b8-03-sizeup.mkv.md5" + "av1-1-b8-03-sizedown.mkv" + "av1-1-b8-03-sizedown.mkv.md5" + "av1-1-b8-04-cdfupdate.ivf" + "av1-1-b8-04-cdfupdate.ivf.md5" + "av1-1-b8-05-mv.ivf" + "av1-1-b8-05-mv.ivf.md5" + "av1-1-b8-06-mfmv.ivf" + "av1-1-b8-06-mfmv.ivf.md5" + "av1-1-b8-22-svc-L2T1.ivf" + "av1-1-b8-22-svc-L2T1.ivf.md5" + "av1-1-b8-22-svc-L1T2.ivf" + "av1-1-b8-22-svc-L1T2.ivf.md5" + "av1-1-b8-22-svc-L2T2.ivf" + "av1-1-b8-22-svc-L2T2.ivf.md5" + "av1-1-b8-23-film_grain-50.ivf" + "av1-1-b8-23-film_grain-50.ivf.md5" + "invalid-bug-1814.ivf" + "invalid-bug-1814.ivf.res" + "invalid-chromium-906381.ivf" + "invalid-chromium-906381.ivf.res" + "invalid-google-142530197-1.ivf" + "invalid-google-142530197-1.ivf.res" + 
"invalid-google-142530197.ivf" + "invalid-google-142530197.ivf.res" + "invalid-oss-fuzz-10061.ivf" + "invalid-oss-fuzz-10061.ivf.res" + "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf" + "invalid-oss-fuzz-10117-mc-buf-use-highbd.ivf.res" + "invalid-oss-fuzz-10227.ivf" + "invalid-oss-fuzz-10227.ivf.res" + "invalid-oss-fuzz-10389.ivf" + "invalid-oss-fuzz-10389.ivf.res" + "invalid-oss-fuzz-10389.ivf.res.2" + "invalid-oss-fuzz-10555.ivf" + "invalid-oss-fuzz-10555.ivf.res" + "invalid-oss-fuzz-10705.ivf" + "invalid-oss-fuzz-10705.ivf.res" + "invalid-oss-fuzz-10723.ivf" + "invalid-oss-fuzz-10723.ivf.res" + "invalid-oss-fuzz-10723.ivf.res.2" + "invalid-oss-fuzz-10779.ivf" + "invalid-oss-fuzz-10779.ivf.res" + "invalid-oss-fuzz-11477.ivf" + "invalid-oss-fuzz-11477.ivf.res" + "invalid-oss-fuzz-11479.ivf" + "invalid-oss-fuzz-11479.ivf.res" + "invalid-oss-fuzz-11479.ivf.res.2" + "invalid-oss-fuzz-11523.ivf" + "invalid-oss-fuzz-11523.ivf.res" + "invalid-oss-fuzz-11523.ivf.res.2" + "invalid-oss-fuzz-15363.ivf" + "invalid-oss-fuzz-15363.ivf.res" + "invalid-oss-fuzz-16437.ivf" + "invalid-oss-fuzz-16437.ivf.res" + "invalid-oss-fuzz-9288.ivf" + "invalid-oss-fuzz-9288.ivf.res" + "invalid-oss-fuzz-9463.ivf" + "invalid-oss-fuzz-9463.ivf.res" + "invalid-oss-fuzz-9463.ivf.res.2" + "invalid-oss-fuzz-9482.ivf" + "invalid-oss-fuzz-9482.ivf.res" + "invalid-oss-fuzz-9720.ivf" + "invalid-oss-fuzz-9720.ivf.res") +endif() + +if(ENABLE_ENCODE_PERF_TESTS AND CONFIG_AV1_ENCODER) + list(APPEND AOM_TEST_DATA_FILE_NAMES "desktop_640_360_30.yuv" + "kirland_640_480_30.yuv" "macmarcomoving_640_480_30.yuv" + "macmarcostationary_640_480_30.yuv" "niklas_1280_720_30.yuv" + "tacomanarrows_640_480_30.yuv" + "tacomasmallcameramovement_640_480_30.yuv" + "thaloundeskmtg_640_480_30.yuv") +endif() + +# Parses test/test-data.sha1 and writes captured file names and checksums to +# $out_files and $out_checksums as lists. +function(make_test_data_lists test_data_file out_files out_checksums) + if(NOT test_data_file OR NOT EXISTS "${test_data_file}") + message(FATAL_ERROR "Test info file missing or empty (${test_data_file})") + endif() + + # Read $test_data_file into $files_and_checksums. $files_and_checksums becomes + # a list with an entry for each line from $test_data_file. + file(STRINGS "${test_data_file}" files_and_checksums) + + # Iterate over the list of lines and split it into $checksums and $filenames. + foreach(line ${files_and_checksums}) + string(FIND "${line}" " *" delim_pos) + + math(EXPR filename_pos "${delim_pos} + 2") + string(SUBSTRING "${line}" 0 ${delim_pos} checksum) + string(SUBSTRING "${line}" ${filename_pos} -1 filename) + + list(FIND AOM_TEST_DATA_FILE_NAMES ${filename} list_index) + if(NOT ${list_index} EQUAL -1) + + # Include the name and checksum in output only when the file is needed. + set(checksums ${checksums} ${checksum}) + set(filenames ${filenames} ${filename}) + endif() + endforeach() + + list(LENGTH filenames num_files) + list(LENGTH checksums num_checksums) + if(NOT checksums OR NOT filenames OR NOT num_files EQUAL num_checksums) + message(FATAL_ERROR "Parsing of ${test_data_file} failed.") + endif() + + set(${out_checksums} ${checksums} PARENT_SCOPE) + set(${out_files} ${filenames} PARENT_SCOPE) +endfunction() + +# Appends each file name in $test_files to $test_dir and adds the result path to +# $out_path_list. 
+function(expand_test_file_paths test_files test_dir out_path_list)
+  foreach(filename ${${test_files}})
+    set(path_list ${path_list} "${test_dir}/${filename}")
+  endforeach()
+  set(${out_path_list} ${path_list} PARENT_SCOPE)
+endfunction()
+
+function(check_file local_path expected_checksum out_needs_update)
+  if(EXISTS "${local_path}")
+    file(SHA1 "${local_path}" file_checksum)
+  else()
+    set(${out_needs_update} 1 PARENT_SCOPE)
+    return()
+  endif()
+
+  if("${file_checksum}" STREQUAL "${expected_checksum}")
+    unset(${out_needs_update} PARENT_SCOPE)
+  else()
+    set(${out_needs_update} 1 PARENT_SCOPE)
+    return()
+  endif()
+  message("${local_path} up to date.")
+endfunction()
+
+# Downloads data from $file_url, confirms that $file_checksum matches, and
+# writes it to $local_path.
+function(download_test_file file_url file_checksum local_path)
+  message("Downloading ${file_url} ...")
+  file(DOWNLOAD "${file_url}" "${local_path}" SHOW_PROGRESS EXPECTED_HASH
+       SHA1=${file_checksum})
+  message("Download of ${file_url} complete.")
+endfunction()
diff --git a/libs/libaom/src/test/test_intra_pred_speed.cc b/libs/libaom/src/test/test_intra_pred_speed.cc
new file mode 100644
index 000000000..25c50d022
--- /dev/null
+++ b/libs/libaom/src/test/test_intra_pred_speed.cc
@@ -0,0 +1,1467 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Test and time AOM intra-predictor functions
+
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/md5_helper.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/common/common_data.h"
+
+// -----------------------------------------------------------------------------
+
+namespace {
+
+// Note:
+// APPLY_UNIT_TESTS
+// 1: Do unit tests
+// 0: Generate MD5 array as required
+#define APPLY_UNIT_TESTS 1
+
+typedef void (*AvxPredFunc)(uint8_t *dst, ptrdiff_t y_stride,
+                            const uint8_t *above, const uint8_t *left);
+
+const int kBPS = 64;
+const int kTotalPixels = kBPS * kBPS;
+// 4 DC variants, V, H, PAETH, SMOOTH, SMOOTH_V, SMOOTH_H
+const int kNumAv1IntraFuncs = 10;
+
+#if APPLY_UNIT_TESTS
+const char *kAv1IntraPredNames[kNumAv1IntraFuncs] = {
+  "DC_PRED",       "DC_LEFT_PRED",  "DC_TOP_PRED",  "DC_128_PRED",
+  "V_PRED",        "H_PRED",        "PAETH_PRED",   "SMOOTH_PRED",
+  "SMOOTH_V_PRED", "SMOOTH_H_PRED",
+};
+#endif  // APPLY_UNIT_TESTS
+
+template <typename Pixel>
+struct IntraPredTestMem {
+  void Init(int block_width, int block_height, int bd) {
+    ASSERT_LE(block_width, kBPS);
+    ASSERT_LE(block_height, kBPS);
+    // Note: for blocks having width <= 32 and height <= 32, we generate 32x32
+    // random pixels as before to avoid having to recalculate all hashes again.
+    const int block_size_upto_32 = (block_width <= 32) && (block_height <= 32);
+    stride = block_size_upto_32 ? 32 : kBPS;
+    num_pixels = stride * stride;
+    libaom_test::ACMRandom rnd(libaom_test::ACMRandom::DeterministicSeed());
+    above = above_mem + 16;
+    const int mask = (1 << bd) - 1;
+    for (int i = 0; i < num_pixels; ++i) ref_src[i] = rnd.Rand16() & mask;
+    for (int i = 0; i < stride; ++i) left[i] = rnd.Rand16() & mask;
+    for (int i = -1; i < stride; ++i) above[i] = rnd.Rand16() & mask;
+
+    for (int i = stride; i < 2 * stride; ++i) {
+      left[i] = rnd.Rand16() & mask;
+      above[i] = rnd.Rand16() & mask;
+    }
+  }
+
+  DECLARE_ALIGNED(16, Pixel, src[kTotalPixels]);
+  DECLARE_ALIGNED(16, Pixel, ref_src[kTotalPixels]);
+  DECLARE_ALIGNED(16, Pixel, left[2 * kBPS]);
+  Pixel *above;
+  int stride;
+  int num_pixels;
+
+ private:
+  DECLARE_ALIGNED(16, Pixel, above_mem[2 * kBPS + 16]);
+};
+
+// -----------------------------------------------------------------------------
+// Low Bitdepth
+
+typedef IntraPredTestMem<uint8_t> Av1IntraPredTestMem;
+
+static const char *const kTxSizeStrings[TX_SIZES_ALL] = {
+  "4X4",  "8X8",  "16X16", "32X32", "64X64", "4X8",   "8X4",
+  "8X16", "16X8", "16X32", "32X16", "32X64", "64X32", "4X16",
+  "16X4", "8X32", "32X8",  "16X64", "64X16",
+};
+
+void CheckMd5Signature(TX_SIZE tx_size, bool is_hbd,
+                       const char *const signatures[], const void *data,
+                       size_t data_size, int elapsed_time, int idx) {
+  const std::string hbd_str = is_hbd ? "Hbd " : "";
+  const std::string name_str = hbd_str + "Intra" + kTxSizeStrings[tx_size];
+  libaom_test::MD5 md5;
+  md5.Add(reinterpret_cast<const uint8_t *>(data), data_size);
+#if APPLY_UNIT_TESTS
+  printf("Mode %s[%13s]: %5d ms MD5: %s\n", name_str.c_str(),
+         kAv1IntraPredNames[idx], elapsed_time, md5.Get());
+  EXPECT_STREQ(signatures[idx], md5.Get());
+#else
+  (void)signatures;
+  (void)elapsed_time;
+  (void)idx;
+  printf("\"%s\",\n", md5.Get());
+#endif
+}
+
+void TestIntraPred(TX_SIZE tx_size, AvxPredFunc const *pred_funcs,
+                   const char *const signatures[]) {
+  const int block_width = tx_size_wide[tx_size];
+  const int block_height = tx_size_high[tx_size];
+  const int num_pixels_per_test =
+      block_width * block_height * kNumAv1IntraFuncs;
+  const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
+  Av1IntraPredTestMem intra_pred_test_mem;
+  intra_pred_test_mem.Init(block_width, block_height, 8);
+
+  for (int k = 0; k < kNumAv1IntraFuncs; ++k) {
+    if (pred_funcs[k] == NULL) continue;
+    memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+           sizeof(intra_pred_test_mem.src));
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+      pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
+                    intra_pred_test_mem.above, intra_pred_test_mem.left);
+    }
+    libaom_test::ClearSystemState();
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time =
+        static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+    CheckMd5Signature(
+        tx_size, false, signatures, intra_pred_test_mem.src,
+        intra_pred_test_mem.num_pixels * sizeof(*intra_pred_test_mem.src),
+        elapsed_time, k);
+  }
+}
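
[Editor's note -- annotation, not part of the upstream patch.] The 2.e10 constant in TestIntraPred above sizes the timing loop so that every (predictor, transform size) pair processes roughly the same number of pixels, which keeps the reported millisecond figures comparable across block sizes. A worked example using the definitions above:

  // For TX_4X4, with kNumAv1IntraFuncs == 10:
  //   num_pixels_per_test = 4 * 4 * 10 = 160
  //   kNumTests = static_cast<int>(2.e10 / 160) = 125000000 calls per function
  // i.e. about 125000000 * 16 = 2e9 pixels per predictor. For TX_32X32 the
  // loop runs 2.e10 / (32 * 32 * 10) ~= 1953125 times, again ~2e9 pixels.
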
+
+static const char *const kSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
+  {
+      // 4X4
+      "e7ed7353c3383fff942e500e9bfe82fe",
+      "2a4a26fcc6ce005eadc08354d196c8a9",
+      "269d92eff86f315d9c38fe7640d85b15",
+      "ae2960eea9f71ee3dabe08b282ec1773",
+      "6c1abcc44e90148998b51acd11144e9c",
+      "f7bb3186e1ef8a2b326037ff898cad8e",
+      "59fc0e923a08cfac0a493fb38988e2bb",
+      "9ff8bb37d9c830e6ab8ecb0c435d3c91",
+      "de6937fca02354f2874dbc5dbec5d5b3",
+      "723cf948137f7d8c7860d814e55ae67d",
+  },
+  {
+      // 8X8
+      "d8bbae5d6547cfc17e4f5f44c8730e88",
"373bab6d931868d41a601d9d88ce9ac3", + "6fdd5ff4ff79656c14747598ca9e3706", + "d9661c2811d6a73674f40ffb2b841847", + "7c722d10b19ccff0b8c171868e747385", + "f81dd986eb2b50f750d3a7da716b7e27", + "064404361748dd111a890a1470d7f0ea", + "dc29b7e1f78cc8e7525d5ea4c0ab9b78", + "97111eb1bc26bade6272015df829f1ae", + "d19a8a73cc46b807f2c5e817576cc1e1", + }, + { + // 16X16 + "50971c07ce26977d30298538fffec619", + "527a6b9e0dc5b21b98cf276305432bef", + "7eff2868f80ebc2c43a4f367281d80f7", + "67cd60512b54964ef6aff1bd4816d922", + "48371c87dc95c08a33b2048f89cf6468", + "b0acf2872ee411d7530af6d2625a7084", + "93d6b5352b571805ab16a55e1bbed86a", + "03764e4c0aebbc180e4e2c68fb06df2b", + "bb6c74c9076c9f266ab11fb57060d8e6", + "0c5162bc28489756ddb847b5678e6f07", + }, + { + // 32X32 + "a0a618c900e65ae521ccc8af789729f2", + "985aaa7c72b4a6c2fb431d32100cf13a", + "10662d09febc3ca13ee4e700120daeb5", + "b3b01379ba08916ef6b1b35f7d9ad51c", + "9f4261755795af97e34679c333ec7004", + "bc2c9da91ad97ef0d1610fb0a9041657", + "ef1653982b69e1f64bee3759f3e1ec45", + "1a51a675deba2c83282142eb48d3dc3d", + "866c224746dc260cda861a7b1b383fb3", + "cea23799fc3526e1b6a6ff02b42b82af", + }, + { + // 64X64 + "6e1094fa7b50bc813aa2ba29f5df8755", + "afe020786b83b793c2bbd9468097ff6e", + "be91585259bc37bf4dc1651936e90b3e", + "a1650dbcd56e10288c3e269eca37967d", + "9e5c34f3797e0cdd3cd9d4c05b0d8950", + "bc87be7ac899cc6a28f399d7516c49fe", + "9811fd0d2dd515f06122f5d1bd18b784", + "3c140e466f2c2c0d9cb7d2157ab8dc27", + "9543de76c925a8f6adc884cc7f98dc91", + "df1df0376cc944afe7e74e94f53e575a", + }, + { + // 4X8 + "d9fbebdc85f71ab1e18461b2db4a2adc", + "5ccb2a68284bc9714d94b8a06ccadbb2", + "735d059abc2744f3ff3f9590f7191b37", + "d9fbebdc85f71ab1e18461b2db4a2adc", + "6819497c44cd0ace120add83672996ee", + "7e3244f5a2d3edf81c7e962a842b97f9", + "809350f164cd4d1650850bb0f59c3260", + "1b60a394331eeab6927a6f8aaff57040", + "5307de1bd7329ba6b281d2c1b0b457f9", + "24c58a8138339846d95568efb91751db", + }, + { + // 8X4 + "23f9fc11344426c9bee2e06d57dfd628", + "2d71a26d1bae1fb34734de7b42fc5eb7", + "5af9c1b2fd9d5721fad67b67b3f7c816", + "00d71b17be662753813d515f197d145e", + "bef10ec984427e28f4390f43809d10af", + "77773cdfb7ed6bc882ab202a64b0a470", + "2cc48bd66d6b0121b5221d52ccd732af", + "b302155e1c9eeeafe2ba2bf68e807a46", + "561bc8d0e76d5041ebd5168fc6a115e1", + "81d0113fb1d0a9a24ffd6f1987b77948", + }, + { + // 8X16 + "c849de88b24f773dfcdd1d48d1209796", + "6cb807c1897b94866a0f3d3c56ed8695", + "d56db05a8ac7981762f5b877f486c4ef", + "b4bc01eb6e59a40922ad17715cafb04b", + "09d178439534f4062ae687c351f66d64", + "644501399cf73080ac606e5cef7ca09b", + "278076495180e17c065a95ab7278539a", + "9dd7f324816f242be408ffeb0c673732", + "f520c4a20acfa0bea1d253c6f0f040fd", + "85f38df809df2c2d7c8b4a157a65cd44", + }, + { + // 16X8 + "b4cbdbdf10ce13300b4063a3daf99e04", + "3731e1e6202064a9d0604d7c293ecee4", + "6c856188c4256a06452f0d5d70cac436", + "1f2192b4c8c497589484ea7bf9c944e8", + "84011bd4b7f565119d06787840e333a0", + "0e48949f7a6aa36f0d76b5d01f91124a", + "60eff8064634b6c73b10681356baeee9", + "1559aeb081a9c0c71111d6093c2ff9fd", + "c15479b739713773e5cabb748451987b", + "72e33ec12c9b67aea26d8d005fb82de2", + }, + { + // 16X32 + "abe5233d189cdbf79424721571bbaa7b", + "282759f81e3cfb2e2d396fe406b72a8b", + "e2224926c264f6f174cbc3167a233168", + "6814e85c2b33f8c9415d62e80394b47b", + "99cbbb60459c08a3061d72c4e4f6276a", + "1d1567d40b8e816f8c1f71e576fe0f87", + "36fdd371b624a075814d497c4832ec85", + "8ab8da61b727442b6ff692b40d0df018", + "e35a10ad7fdf2327e821504a90f6a6eb", + "1f7211e727dc1de7d6a55d082fbdd821", + }, + { + // 32X16 
+ "d1aeb8d5fdcfd3307922af01a798a4dc", + "b0bcb514ebfbee065faea9d34c12ae75", + "d6a18c63b4e909871c0137ca652fad23", + "fd047f2fc1b8ffb95d0eeef3e8796a45", + "645ab60779ea348fd93c81561c31bab9", + "4409633c9db8dff41ade4292a3a56e7f", + "5e36a11e069b31c2a739f3a9c7b37c24", + "e83b9483d702cfae496991c3c7fa92c0", + "12f6ddf98c7f30a277307f1ea935b030", + "354321d6c32bbdb0739e4fa2acbf41e1", + }, + { + // 32X64 + "0ce332b343934b34cd4417725faa85cb", + "4e2a2cfd8f56f15939bdfc753145b303", + "0f46d124ba9f48cdd5d5290acf786d6d", + "e1e8ed803236367821981500a3d9eebe", + "1d2f8e48e3adb7c448be05d9f66f4954", + "9fb2e176636a5689b26f73ca73fcc512", + "e720ebccae7e25e36f23da53ae5b5d6a", + "86fe4364734169aaa4520d799890d530", + "b1870290764bb1b100d1974e2bd70f1d", + "ce5b238e19d85ef69d85badfab4e63ae", + }, + { + // 64X32 + "a6c5aeb722615089efbca80b02951ceb", + "538424b24bd0830f21788e7238ca762f", + "80c15b303235f9bc2259027bb92dfdc4", + "e48e1ac15e97191a8fda08d62fff343e", + "12604b37875533665078405ef4582e35", + "0048afa17bd3e1632d68b96048836530", + "07a0cfcb56a5eed50c4bd6c26814336b", + "529d8a070de5bc6531fa3ee8f450c233", + "33c50a11c7d78f72434064f634305e95", + "e0ef7f0559c1a50ec5a8c12011b962f7", + }, + { + // 4X16 + "750491056568eb8fe15387b86bdf06b8", + "3a52dae9f599f08cfb3bd1b910dc0e11", + "af79f71e3e03dbeca44e2e13561f70c7", + "ca7dfd7624afc0c06fb5552f44398535", + "b591af115444bf43140c29c269f68fb2", + "483d942ae36e69e62f31eb215331416f", + "f14b58525e81870bc5d95c7ac71a347f", + "371208bb4027d9badb04095d1590bbc4", + "c7049c21b2924d70c7c12784d6b6b796", + "7d87233f4b5b0f12086045e5d7b2d4c2", + }, + { + // 16X4 + "7c6e325a65e77e732b3adbe237e045e4", + "24478f93ffcec47852e004d0fe948464", + "258d042c67d4ba3ecfa667f0adc9aebf", + "b2cd21d06959f159a1f3c4d9768ee7fb", + "b4e1f38157bf8410e7c3da02f687a343", + "869e703729eb0fc0711c254944ff5d5a", + "9638dd77105a640b146a8201ea7a0801", + "919d932c6af8a1cc7486e8ce996dd487", + "e1c9be493b6714c7ae48f30044c43140", + "bf0fe3889d654b2f6eb98c8fc751f9e4", + }, + { + // 8X32 + "8dfac4319fe0bd40013ffb3102da8c72", + "feb46b6dc4e2ca0a09533bfc51d4dcb0", + "850837ec714c37262216527aaf4cbbe9", + "4603c7800fb08361f163daca876e8bda", + "1ff95e7d2debc27b05806fb25abfd624", + "d81b9a51a062b23ca7823804cb7bec22", + "f1d8978158766f46335203608cb807e7", + "f3527096256258c0878d644a9d7d53ca", + "cbde98ac8b009953eb112807ad2ea29e", + "654fb1153415747feae599f538122af5", + }, + { + // 32X8 + "3d4ee16fab374357474f60b845327bc7", + "bc17c5059473a476df4e85f56395ad55", + "3d4ee16fab374357474f60b845327bc7", + "c14b8db34dc2355b84e3735c9ba16c7f", + "a71d25b5d47a92a8b9223c98f18458ee", + "6c1cfe2b1893f4576a80675687cb6426", + "92d11bbef8b85bb48d799bb055de3514", + "bcf81d1db8ae5cc03360467f44f498ec", + "79f8c564163555592e808e145eaf5c60", + "46fff139cef2ef773938bcc8b0e5abb8", + }, + { + // 16X64 + "3b2a053ee8b05a8ac35ad23b0422a151", + "12b0c69595328c465e0b25e0c9e3e9fc", + "f77c544ac8035e01920deae40cee7b07", + "727797ef15ccd8d325476fe8f12006a3", + "f3be77c0fe67eb5d9d515e92bec21eb7", + "f1ece6409e01e9dd98b800d49628247d", + "efd2ec9bfbbd4fd1f6604ea369df1894", + "ec703de918422b9e03197ba0ed60a199", + "739418efb89c07f700895deaa5d0b3e3", + "9943ae1bbeeebfe1d3a92dc39e049d63", + }, + { + // 64X16 + "821b76b1494d4f84d20817840f719a1a", + "69e462c3338a9aaf993c3f7cfbc15649", + "516d8f6eb054d74d150e7b444185b6b9", + "de1b736e9d99129609d6ef3a491507a0", + "fd9b4276e7affe1e0e4ce4f428058994", + "cd82fd361a4767ac29a9f406b480b8f3", + "2792c2f810157a4a6cb13c28529ff779", + "1220442d90c4255ba0969d28b91e93a6", + "c7253e10b45f7f67dfee3256c9b94825", + 
"879792198071c7e0b50b9b5010d8c18f", + }, +}; + +} // namespace + +// Defines a test case for |arch| (e.g., C, SSE2, ...) passing the predictors +// to TestIntraPred. The test name is 'arch.TestIntraPred_tx_size', e.g., +// C.TestIntraPred.0 +#define INTRA_PRED_TEST(arch, tx_size, dc, dc_left, dc_top, dc_128, v, h, \ + paeth, smooth, smooth_v, smooth_h) \ + TEST(arch, DISABLED_##TestIntraPred_##tx_size) { \ + static const AvxPredFunc aom_intra_pred[] = { \ + dc, dc_left, dc_top, dc_128, v, h, paeth, smooth, smooth_v, smooth_h \ + }; \ + TestIntraPred(tx_size, aom_intra_pred, kSignatures[tx_size]); \ + } + +// ----------------------------------------------------------------------------- +// 4x4, 4x8, 4x16 + +INTRA_PRED_TEST(C_1, TX_4X4, aom_dc_predictor_4x4_c, + aom_dc_left_predictor_4x4_c, aom_dc_top_predictor_4x4_c, + aom_dc_128_predictor_4x4_c, aom_v_predictor_4x4_c, + aom_h_predictor_4x4_c, aom_paeth_predictor_4x4_c, + aom_smooth_predictor_4x4_c, aom_smooth_v_predictor_4x4_c, + aom_smooth_h_predictor_4x4_c) + +INTRA_PRED_TEST(C_2, TX_4X8, aom_dc_predictor_4x8_c, + aom_dc_left_predictor_4x8_c, aom_dc_top_predictor_4x8_c, + aom_dc_128_predictor_4x8_c, aom_v_predictor_4x8_c, + aom_h_predictor_4x8_c, aom_paeth_predictor_4x8_c, + aom_smooth_predictor_4x8_c, aom_smooth_v_predictor_4x8_c, + aom_smooth_h_predictor_4x8_c) + +INTRA_PRED_TEST(C_3, TX_4X16, aom_dc_predictor_4x16_c, + aom_dc_left_predictor_4x16_c, aom_dc_top_predictor_4x16_c, + aom_dc_128_predictor_4x16_c, aom_v_predictor_4x16_c, + aom_h_predictor_4x16_c, aom_paeth_predictor_4x16_c, + aom_smooth_predictor_4x16_c, aom_smooth_v_predictor_4x16_c, + aom_smooth_h_predictor_4x16_c) + +#if HAVE_SSE2 +INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_dc_predictor_4x4_sse2, + aom_dc_left_predictor_4x4_sse2, aom_dc_top_predictor_4x4_sse2, + aom_dc_128_predictor_4x4_sse2, aom_v_predictor_4x4_sse2, + aom_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_2, TX_4X8, aom_dc_predictor_4x8_sse2, + aom_dc_left_predictor_4x8_sse2, aom_dc_top_predictor_4x8_sse2, + aom_dc_128_predictor_4x8_sse2, aom_v_predictor_4x8_sse2, + aom_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_3, TX_4X16, aom_dc_predictor_4x16_sse2, + aom_dc_left_predictor_4x16_sse2, aom_dc_top_predictor_4x16_sse2, + aom_dc_128_predictor_4x16_sse2, aom_v_predictor_4x16_sse2, + aom_h_predictor_4x16_sse2, NULL, NULL, NULL, NULL) +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3_1, TX_4X4, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_4x4_ssse3, aom_smooth_predictor_4x4_ssse3, + aom_smooth_v_predictor_4x4_ssse3, + aom_smooth_h_predictor_4x4_ssse3) +INTRA_PRED_TEST(SSSE3_2, TX_4X8, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_4x8_ssse3, aom_smooth_predictor_4x8_ssse3, + aom_smooth_v_predictor_4x8_ssse3, + aom_smooth_h_predictor_4x8_ssse3) +INTRA_PRED_TEST(SSSE3_3, TX_4X16, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_4x16_ssse3, aom_smooth_predictor_4x16_ssse3, + aom_smooth_v_predictor_4x16_ssse3, + aom_smooth_h_predictor_4x16_ssse3) +#endif // HAVE_SSSE3 + +#if HAVE_DSPR2 +INTRA_PRED_TEST(DSPR2, TX_4X4, aom_dc_predictor_4x4_dspr2, NULL, NULL, NULL, + NULL, aom_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL) +#endif // HAVE_DSPR2 + +#if HAVE_NEON +INTRA_PRED_TEST(NEON, TX_4X4, aom_dc_predictor_4x4_neon, + aom_dc_left_predictor_4x4_neon, aom_dc_top_predictor_4x4_neon, + aom_dc_128_predictor_4x4_neon, aom_v_predictor_4x4_neon, + aom_h_predictor_4x4_neon, NULL, NULL, NULL, NULL) +#endif // HAVE_NEON + +#if HAVE_MSA 
+INTRA_PRED_TEST(MSA, TX_4X4, aom_dc_predictor_4x4_msa, + aom_dc_left_predictor_4x4_msa, aom_dc_top_predictor_4x4_msa, + aom_dc_128_predictor_4x4_msa, aom_v_predictor_4x4_msa, + aom_h_predictor_4x4_msa, NULL, NULL, NULL, NULL) +#endif // HAVE_MSA + +// ----------------------------------------------------------------------------- +// 8x8, 8x4, 8x16, 8x32 + +INTRA_PRED_TEST(C_1, TX_8X8, aom_dc_predictor_8x8_c, + aom_dc_left_predictor_8x8_c, aom_dc_top_predictor_8x8_c, + aom_dc_128_predictor_8x8_c, aom_v_predictor_8x8_c, + aom_h_predictor_8x8_c, aom_paeth_predictor_8x8_c, + aom_smooth_predictor_8x8_c, aom_smooth_v_predictor_8x8_c, + aom_smooth_h_predictor_8x8_c) + +INTRA_PRED_TEST(C_2, TX_8X4, aom_dc_predictor_8x4_c, + aom_dc_left_predictor_8x4_c, aom_dc_top_predictor_8x4_c, + aom_dc_128_predictor_8x4_c, aom_v_predictor_8x4_c, + aom_h_predictor_8x4_c, aom_paeth_predictor_8x4_c, + aom_smooth_predictor_8x4_c, aom_smooth_v_predictor_8x4_c, + aom_smooth_h_predictor_8x4_c) + +INTRA_PRED_TEST(C_3, TX_8X16, aom_dc_predictor_8x16_c, + aom_dc_left_predictor_8x16_c, aom_dc_top_predictor_8x16_c, + aom_dc_128_predictor_8x16_c, aom_v_predictor_8x16_c, + aom_h_predictor_8x16_c, aom_paeth_predictor_8x16_c, + aom_smooth_predictor_8x16_c, aom_smooth_v_predictor_8x16_c, + aom_smooth_h_predictor_8x16_c) + +INTRA_PRED_TEST(C_4, TX_8X32, aom_dc_predictor_8x32_c, + aom_dc_left_predictor_8x32_c, aom_dc_top_predictor_8x32_c, + aom_dc_128_predictor_8x32_c, aom_v_predictor_8x32_c, + aom_h_predictor_8x32_c, aom_paeth_predictor_8x32_c, + aom_smooth_predictor_8x32_c, aom_smooth_v_predictor_8x32_c, + aom_smooth_h_predictor_8x32_c) + +#if HAVE_SSE2 +INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_dc_predictor_8x8_sse2, + aom_dc_left_predictor_8x8_sse2, aom_dc_top_predictor_8x8_sse2, + aom_dc_128_predictor_8x8_sse2, aom_v_predictor_8x8_sse2, + aom_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_2, TX_8X4, aom_dc_predictor_8x4_sse2, + aom_dc_left_predictor_8x4_sse2, aom_dc_top_predictor_8x4_sse2, + aom_dc_128_predictor_8x4_sse2, aom_v_predictor_8x4_sse2, + aom_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_3, TX_8X16, aom_dc_predictor_8x16_sse2, + aom_dc_left_predictor_8x16_sse2, aom_dc_top_predictor_8x16_sse2, + aom_dc_128_predictor_8x16_sse2, aom_v_predictor_8x16_sse2, + aom_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_4, TX_8X32, aom_dc_predictor_8x32_sse2, + aom_dc_left_predictor_8x32_sse2, aom_dc_top_predictor_8x32_sse2, + aom_dc_128_predictor_8x32_sse2, aom_v_predictor_8x32_sse2, + aom_h_predictor_8x32_sse2, NULL, NULL, NULL, NULL) +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3_1, TX_8X8, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_8x8_ssse3, aom_smooth_predictor_8x8_ssse3, + aom_smooth_v_predictor_8x8_ssse3, + aom_smooth_h_predictor_8x8_ssse3) +INTRA_PRED_TEST(SSSE3_2, TX_8X4, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_8x4_ssse3, aom_smooth_predictor_8x4_ssse3, + aom_smooth_v_predictor_8x4_ssse3, + aom_smooth_h_predictor_8x4_ssse3) +INTRA_PRED_TEST(SSSE3_3, TX_8X16, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_8x16_ssse3, aom_smooth_predictor_8x16_ssse3, + aom_smooth_v_predictor_8x16_ssse3, + aom_smooth_h_predictor_8x16_ssse3) +INTRA_PRED_TEST(SSSE3_4, TX_8X32, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_8x32_ssse3, aom_smooth_predictor_8x32_ssse3, + aom_smooth_v_predictor_8x32_ssse3, + aom_smooth_h_predictor_8x32_ssse3) +#endif // HAVE_SSSE3 + +#if HAVE_DSPR2 +INTRA_PRED_TEST(DSPR2, TX_8X8, 
aom_dc_predictor_8x8_dspr2, NULL, NULL, NULL, + NULL, aom_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL) +#endif // HAVE_DSPR2 + +#if HAVE_NEON +INTRA_PRED_TEST(NEON, TX_8X8, aom_dc_predictor_8x8_neon, + aom_dc_left_predictor_8x8_neon, aom_dc_top_predictor_8x8_neon, + aom_dc_128_predictor_8x8_neon, aom_v_predictor_8x8_neon, + aom_h_predictor_8x8_neon, NULL, NULL, NULL, NULL) +#endif // HAVE_NEON + +#if HAVE_MSA +INTRA_PRED_TEST(MSA, TX_8X8, aom_dc_predictor_8x8_msa, + aom_dc_left_predictor_8x8_msa, aom_dc_top_predictor_8x8_msa, + aom_dc_128_predictor_8x8_msa, aom_v_predictor_8x8_msa, + aom_h_predictor_8x8_msa, NULL, NULL, NULL, NULL) +#endif // HAVE_MSA + +// ----------------------------------------------------------------------------- +// 16x16, 16x8, 16x32, 16x4, 16x64 + +INTRA_PRED_TEST(C_1, TX_16X16, aom_dc_predictor_16x16_c, + aom_dc_left_predictor_16x16_c, aom_dc_top_predictor_16x16_c, + aom_dc_128_predictor_16x16_c, aom_v_predictor_16x16_c, + aom_h_predictor_16x16_c, aom_paeth_predictor_16x16_c, + aom_smooth_predictor_16x16_c, aom_smooth_v_predictor_16x16_c, + aom_smooth_h_predictor_16x16_c) + +INTRA_PRED_TEST(C_2, TX_16X8, aom_dc_predictor_16x8_c, + aom_dc_left_predictor_16x8_c, aom_dc_top_predictor_16x8_c, + aom_dc_128_predictor_16x8_c, aom_v_predictor_16x8_c, + aom_h_predictor_16x8_c, aom_paeth_predictor_16x8_c, + aom_smooth_predictor_16x8_c, aom_smooth_v_predictor_16x8_c, + aom_smooth_h_predictor_16x8_c) + +INTRA_PRED_TEST(C_3, TX_16X32, aom_dc_predictor_16x32_c, + aom_dc_left_predictor_16x32_c, aom_dc_top_predictor_16x32_c, + aom_dc_128_predictor_16x32_c, aom_v_predictor_16x32_c, + aom_h_predictor_16x32_c, aom_paeth_predictor_16x32_c, + aom_smooth_predictor_16x32_c, aom_smooth_v_predictor_16x32_c, + aom_smooth_h_predictor_16x32_c) + +INTRA_PRED_TEST(C_4, TX_16X4, aom_dc_predictor_16x4_c, + aom_dc_left_predictor_16x4_c, aom_dc_top_predictor_16x4_c, + aom_dc_128_predictor_16x4_c, aom_v_predictor_16x4_c, + aom_h_predictor_16x4_c, aom_paeth_predictor_16x4_c, + aom_smooth_predictor_16x4_c, aom_smooth_v_predictor_16x4_c, + aom_smooth_h_predictor_16x4_c) + +INTRA_PRED_TEST(C_5, TX_16X64, aom_dc_predictor_16x64_c, + aom_dc_left_predictor_16x64_c, aom_dc_top_predictor_16x64_c, + aom_dc_128_predictor_16x64_c, aom_v_predictor_16x64_c, + aom_h_predictor_16x64_c, aom_paeth_predictor_16x64_c, + aom_smooth_predictor_16x64_c, aom_smooth_v_predictor_16x64_c, + aom_smooth_h_predictor_16x64_c) + +#if HAVE_SSE2 +INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_dc_predictor_16x16_sse2, + aom_dc_left_predictor_16x16_sse2, + aom_dc_top_predictor_16x16_sse2, + aom_dc_128_predictor_16x16_sse2, aom_v_predictor_16x16_sse2, + aom_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_2, TX_16X8, aom_dc_predictor_16x8_sse2, + aom_dc_left_predictor_16x8_sse2, aom_dc_top_predictor_16x8_sse2, + aom_dc_128_predictor_16x8_sse2, aom_v_predictor_16x8_sse2, + aom_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_3, TX_16X32, aom_dc_predictor_16x32_sse2, + aom_dc_left_predictor_16x32_sse2, + aom_dc_top_predictor_16x32_sse2, + aom_dc_128_predictor_16x32_sse2, aom_v_predictor_16x32_sse2, + aom_h_predictor_16x32_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_4, TX_16X64, aom_dc_predictor_16x64_sse2, + aom_dc_left_predictor_16x64_sse2, + aom_dc_top_predictor_16x64_sse2, + aom_dc_128_predictor_16x64_sse2, aom_v_predictor_16x64_sse2, + aom_h_predictor_16x64_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_5, TX_16X4, aom_dc_predictor_16x4_sse2, + aom_dc_left_predictor_16x4_sse2, 
aom_dc_top_predictor_16x4_sse2, + aom_dc_128_predictor_16x4_sse2, aom_v_predictor_16x4_sse2, + aom_h_predictor_16x4_sse2, NULL, NULL, NULL, NULL) +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x16_ssse3, + aom_smooth_predictor_16x16_ssse3, + aom_smooth_v_predictor_16x16_ssse3, + aom_smooth_h_predictor_16x16_ssse3) +INTRA_PRED_TEST(SSSE3_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x8_ssse3, aom_smooth_predictor_16x8_ssse3, + aom_smooth_v_predictor_16x8_ssse3, + aom_smooth_h_predictor_16x8_ssse3) +INTRA_PRED_TEST(SSSE3_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x32_ssse3, + aom_smooth_predictor_16x32_ssse3, + aom_smooth_v_predictor_16x32_ssse3, + aom_smooth_h_predictor_16x32_ssse3) +INTRA_PRED_TEST(SSSE3_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x64_ssse3, + aom_smooth_predictor_16x64_ssse3, + aom_smooth_v_predictor_16x64_ssse3, + aom_smooth_h_predictor_16x64_ssse3) +INTRA_PRED_TEST(SSSE3_5, TX_16X4, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x4_ssse3, aom_smooth_predictor_16x4_ssse3, + aom_smooth_v_predictor_16x4_ssse3, + aom_smooth_h_predictor_16x4_ssse3) +#endif // HAVE_SSSE3 + +#if HAVE_AVX2 +INTRA_PRED_TEST(AVX2_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x16_avx2, NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x8_avx2, NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x32_avx2, NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_16x64_avx2, NULL, NULL, NULL) +#endif // HAVE_AVX2 + +#if HAVE_DSPR2 +INTRA_PRED_TEST(DSPR2, TX_16X16, aom_dc_predictor_16x16_dspr2, NULL, NULL, NULL, + NULL, aom_h_predictor_16x16_dspr2, NULL, NULL, NULL, NULL) +#endif // HAVE_DSPR2 + +#if HAVE_NEON +INTRA_PRED_TEST(NEON, TX_16X16, aom_dc_predictor_16x16_neon, + aom_dc_left_predictor_16x16_neon, + aom_dc_top_predictor_16x16_neon, + aom_dc_128_predictor_16x16_neon, aom_v_predictor_16x16_neon, + aom_h_predictor_16x16_neon, NULL, NULL, NULL, NULL) +#endif // HAVE_NEON + +#if HAVE_MSA +INTRA_PRED_TEST(MSA, TX_16X16, aom_dc_predictor_16x16_msa, + aom_dc_left_predictor_16x16_msa, aom_dc_top_predictor_16x16_msa, + aom_dc_128_predictor_16x16_msa, aom_v_predictor_16x16_msa, + aom_h_predictor_16x16_msa, NULL, NULL, NULL, NULL) +#endif // HAVE_MSA + +// ----------------------------------------------------------------------------- +// 32x32, 32x16, 32x64, 32x8 + +INTRA_PRED_TEST(C_1, TX_32X32, aom_dc_predictor_32x32_c, + aom_dc_left_predictor_32x32_c, aom_dc_top_predictor_32x32_c, + aom_dc_128_predictor_32x32_c, aom_v_predictor_32x32_c, + aom_h_predictor_32x32_c, aom_paeth_predictor_32x32_c, + aom_smooth_predictor_32x32_c, aom_smooth_v_predictor_32x32_c, + aom_smooth_h_predictor_32x32_c) + +INTRA_PRED_TEST(C_2, TX_32X16, aom_dc_predictor_32x16_c, + aom_dc_left_predictor_32x16_c, aom_dc_top_predictor_32x16_c, + aom_dc_128_predictor_32x16_c, aom_v_predictor_32x16_c, + aom_h_predictor_32x16_c, aom_paeth_predictor_32x16_c, + aom_smooth_predictor_32x16_c, aom_smooth_v_predictor_32x16_c, + aom_smooth_h_predictor_32x16_c) + +INTRA_PRED_TEST(C_3, TX_32X64, aom_dc_predictor_32x64_c, + aom_dc_left_predictor_32x64_c, aom_dc_top_predictor_32x64_c, + aom_dc_128_predictor_32x64_c, aom_v_predictor_32x64_c, + aom_h_predictor_32x64_c, 
aom_paeth_predictor_32x64_c, + aom_smooth_predictor_32x64_c, aom_smooth_v_predictor_32x64_c, + aom_smooth_h_predictor_32x64_c) + +INTRA_PRED_TEST(C_4, TX_32X8, aom_dc_predictor_32x8_c, + aom_dc_left_predictor_32x8_c, aom_dc_top_predictor_32x8_c, + aom_dc_128_predictor_32x8_c, aom_v_predictor_32x8_c, + aom_h_predictor_32x8_c, aom_paeth_predictor_32x8_c, + aom_smooth_predictor_32x8_c, aom_smooth_v_predictor_32x8_c, + aom_smooth_h_predictor_32x8_c) + +#if HAVE_SSE2 +INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_dc_predictor_32x32_sse2, + aom_dc_left_predictor_32x32_sse2, + aom_dc_top_predictor_32x32_sse2, + aom_dc_128_predictor_32x32_sse2, aom_v_predictor_32x32_sse2, + aom_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_2, TX_32X16, aom_dc_predictor_32x16_sse2, + aom_dc_left_predictor_32x16_sse2, + aom_dc_top_predictor_32x16_sse2, + aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2, + aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_3, TX_32X64, aom_dc_predictor_32x64_sse2, + aom_dc_left_predictor_32x64_sse2, + aom_dc_top_predictor_32x64_sse2, + aom_dc_128_predictor_32x64_sse2, aom_v_predictor_32x64_sse2, + aom_h_predictor_32x64_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_4, TX_32X8, aom_dc_predictor_32x8_sse2, + aom_dc_left_predictor_32x8_sse2, aom_dc_top_predictor_32x8_sse2, + aom_dc_128_predictor_32x8_sse2, aom_v_predictor_32x8_sse2, + aom_h_predictor_32x8_sse2, NULL, NULL, NULL, NULL) +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_32x32_ssse3, + aom_smooth_predictor_32x32_ssse3, + aom_smooth_v_predictor_32x32_ssse3, + aom_smooth_h_predictor_32x32_ssse3) +INTRA_PRED_TEST(SSSE3_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_32x16_ssse3, + aom_smooth_predictor_32x16_ssse3, + aom_smooth_v_predictor_32x16_ssse3, + aom_smooth_h_predictor_32x16_ssse3) +INTRA_PRED_TEST(SSSE3_3, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_32x64_ssse3, + aom_smooth_predictor_32x64_ssse3, + aom_smooth_v_predictor_32x64_ssse3, + aom_smooth_h_predictor_32x64_ssse3) +INTRA_PRED_TEST(SSSE3_4, TX_32X8, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_32x8_ssse3, aom_smooth_predictor_32x8_ssse3, + aom_smooth_v_predictor_32x8_ssse3, + aom_smooth_h_predictor_32x8_ssse3) +#endif // HAVE_SSSE3 + +#if HAVE_AVX2 +INTRA_PRED_TEST(AVX2_1, TX_32X32, aom_dc_predictor_32x32_avx2, + aom_dc_left_predictor_32x32_avx2, + aom_dc_top_predictor_32x32_avx2, + aom_dc_128_predictor_32x32_avx2, aom_v_predictor_32x32_avx2, + aom_h_predictor_32x32_avx2, aom_paeth_predictor_32x32_avx2, + NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_2, TX_32X16, aom_dc_predictor_32x16_avx2, + aom_dc_left_predictor_32x16_avx2, + aom_dc_top_predictor_32x16_avx2, + aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2, + NULL, aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_3, TX_32X64, aom_dc_predictor_32x64_avx2, + aom_dc_left_predictor_32x64_avx2, + aom_dc_top_predictor_32x64_avx2, + aom_dc_128_predictor_32x64_avx2, aom_v_predictor_32x64_avx2, + NULL, aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL) +#endif // HAVE_AVX2 + +#if HAVE_NEON +INTRA_PRED_TEST(NEON, TX_32X32, aom_dc_predictor_32x32_neon, + aom_dc_left_predictor_32x32_neon, + aom_dc_top_predictor_32x32_neon, + aom_dc_128_predictor_32x32_neon, aom_v_predictor_32x32_neon, + aom_h_predictor_32x32_neon, NULL, NULL, NULL, NULL) +#endif // HAVE_NEON + +#if HAVE_MSA +INTRA_PRED_TEST(MSA, 
TX_32X32, aom_dc_predictor_32x32_msa, + aom_dc_left_predictor_32x32_msa, aom_dc_top_predictor_32x32_msa, + aom_dc_128_predictor_32x32_msa, aom_v_predictor_32x32_msa, + aom_h_predictor_32x32_msa, NULL, NULL, NULL, NULL) +#endif // HAVE_MSA + +// ----------------------------------------------------------------------------- +// 64x64, 64x32, 64x16 + +INTRA_PRED_TEST(C_1, TX_64X64, aom_dc_predictor_64x64_c, + aom_dc_left_predictor_64x64_c, aom_dc_top_predictor_64x64_c, + aom_dc_128_predictor_64x64_c, aom_v_predictor_64x64_c, + aom_h_predictor_64x64_c, aom_paeth_predictor_64x64_c, + aom_smooth_predictor_64x64_c, aom_smooth_v_predictor_64x64_c, + aom_smooth_h_predictor_64x64_c) + +INTRA_PRED_TEST(C_2, TX_64X32, aom_dc_predictor_64x32_c, + aom_dc_left_predictor_64x32_c, aom_dc_top_predictor_64x32_c, + aom_dc_128_predictor_64x32_c, aom_v_predictor_64x32_c, + aom_h_predictor_64x32_c, aom_paeth_predictor_64x32_c, + aom_smooth_predictor_64x32_c, aom_smooth_v_predictor_64x32_c, + aom_smooth_h_predictor_64x32_c) + +INTRA_PRED_TEST(C_3, TX_64X16, aom_dc_predictor_64x16_c, + aom_dc_left_predictor_64x16_c, aom_dc_top_predictor_64x16_c, + aom_dc_128_predictor_64x16_c, aom_v_predictor_64x16_c, + aom_h_predictor_64x16_c, aom_paeth_predictor_64x16_c, + aom_smooth_predictor_64x16_c, aom_smooth_v_predictor_64x16_c, + aom_smooth_h_predictor_64x16_c) + +#if HAVE_SSE2 +INTRA_PRED_TEST(SSE2_4, TX_64X64, aom_dc_predictor_64x64_sse2, + aom_dc_left_predictor_64x64_sse2, + aom_dc_top_predictor_64x64_sse2, + aom_dc_128_predictor_64x64_sse2, aom_v_predictor_64x64_sse2, + aom_h_predictor_64x64_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_5, TX_64X32, aom_dc_predictor_64x32_sse2, + aom_dc_left_predictor_64x32_sse2, + aom_dc_top_predictor_64x32_sse2, + aom_dc_128_predictor_64x32_sse2, aom_v_predictor_64x32_sse2, + aom_h_predictor_64x32_sse2, NULL, NULL, NULL, NULL) +INTRA_PRED_TEST(SSE2_6, TX_64X16, aom_dc_predictor_64x16_sse2, + aom_dc_left_predictor_64x16_sse2, + aom_dc_top_predictor_64x16_sse2, + aom_dc_128_predictor_64x16_sse2, aom_v_predictor_64x16_sse2, + aom_h_predictor_64x16_sse2, NULL, NULL, NULL, NULL) +#endif + +#if HAVE_SSSE3 +INTRA_PRED_TEST(SSSE3_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_64x64_ssse3, + aom_smooth_predictor_64x64_ssse3, + aom_smooth_v_predictor_64x64_ssse3, + aom_smooth_h_predictor_64x64_ssse3) +INTRA_PRED_TEST(SSSE3_5, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_64x32_ssse3, + aom_smooth_predictor_64x32_ssse3, + aom_smooth_v_predictor_64x32_ssse3, + aom_smooth_h_predictor_64x32_ssse3) +INTRA_PRED_TEST(SSSE3_6, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL, + aom_paeth_predictor_64x16_ssse3, + aom_smooth_predictor_64x16_ssse3, + aom_smooth_v_predictor_64x16_ssse3, + aom_smooth_h_predictor_64x16_ssse3) +#endif + +#if HAVE_AVX2 +INTRA_PRED_TEST(AVX2_4, TX_64X64, aom_dc_predictor_64x64_avx2, + aom_dc_left_predictor_64x64_avx2, + aom_dc_top_predictor_64x64_avx2, + aom_dc_128_predictor_64x64_avx2, aom_v_predictor_64x64_avx2, + NULL, aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_5, TX_64X32, aom_dc_predictor_64x32_avx2, + aom_dc_left_predictor_64x32_avx2, + aom_dc_top_predictor_64x32_avx2, + aom_dc_128_predictor_64x32_avx2, aom_v_predictor_64x32_avx2, + NULL, aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL) +INTRA_PRED_TEST(AVX2_6, TX_64X16, aom_dc_predictor_64x16_avx2, + aom_dc_left_predictor_64x16_avx2, + aom_dc_top_predictor_64x16_avx2, + aom_dc_128_predictor_64x16_avx2, aom_v_predictor_64x16_avx2, + NULL, 
aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// -----------------------------------------------------------------------------
+// High Bitdepth
+namespace {
+
+typedef void (*AvxHighbdPredFunc)(uint16_t *dst, ptrdiff_t y_stride,
+                                  const uint16_t *above, const uint16_t *left,
+                                  int bd);
+
+typedef IntraPredTestMem<uint16_t> Av1HighbdIntraPredTestMem;
+
+void TestHighbdIntraPred(TX_SIZE tx_size, AvxHighbdPredFunc const *pred_funcs,
+                         const char *const signatures[]) {
+  const int block_width = tx_size_wide[tx_size];
+  const int block_height = tx_size_high[tx_size];
+  const int num_pixels_per_test =
+      block_width * block_height * kNumAv1IntraFuncs;
+  const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
+  Av1HighbdIntraPredTestMem intra_pred_test_mem;
+  const int bd = 12;
+  intra_pred_test_mem.Init(block_width, block_height, bd);
+
+  for (int k = 0; k < kNumAv1IntraFuncs; ++k) {
+    if (pred_funcs[k] == NULL) continue;
+    memcpy(intra_pred_test_mem.src, intra_pred_test_mem.ref_src,
+           sizeof(intra_pred_test_mem.src));
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int num_tests = 0; num_tests < kNumTests; ++num_tests) {
+      pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
+                    intra_pred_test_mem.above, intra_pred_test_mem.left, bd);
+    }
+    libaom_test::ClearSystemState();
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time =
+        static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
+    CheckMd5Signature(
+        tx_size, true, signatures, intra_pred_test_mem.src,
+        intra_pred_test_mem.num_pixels * sizeof(*intra_pred_test_mem.src),
+        elapsed_time, k);
+  }
+}
+
+static const char *const kHighbdSignatures[TX_SIZES_ALL][kNumAv1IntraFuncs] = {
+  {
+    // 4X4
+    "11f74af6c5737df472f3275cbde062fa",
+    "51bea056b6447c93f6eb8f6b7e8f6f71",
+    "27e97f946766331795886f4de04c5594",
+    "53ab15974b049111fb596c5168ec7e3f",
+    "f0b640bb176fbe4584cf3d32a9b0320a",
+    "729783ca909e03afd4b47111c80d967b",
+    "6e30009c45474a22032678b1bd579c8f",
+    "e57cba016d808aa8a35619df2a65f049",
+    "55a6c37f39afcbbf5abca4a985b96459",
+    "a623d45b37dafec1f8a75c4c5218913d",
+  },
+  {
+    // 8X8
+    "03da8829fe94663047fd108c5fcaa71d",
+    "ecdb37b8120a2d3a4c706b016bd1bfd7",
+    "1d4543ed8d2b9368cb96898095fe8a75",
+    "f791c9a67b913cbd82d9da8ecede30e2",
+    "065c70646f4dbaff913282f55a45a441",
+    "51f87123616662ef7c35691497dfd0ba",
+    "85c01ba03df68f9ece7bd3fa0f8980e6",
+    "ad19b7dac092f56df6d054e1f67f21e7",
+    "0edc415b5dd7299f7a34fb9f71d31d78",
+    "2bc8ec19e9f4b77a64b8a0a1f6aec7e7",
+  },
+  {
+    // 16X16
+    "e33cb3f56a878e2fddb1b2fc51cdd275",
+    "c7bff6f04b6052c8ab335d726dbbd52d",
+    "d0b0b47b654a9bcc5c6008110a44589b",
+    "78f5da7b10b2b9ab39f114a33b6254e9",
+    "c78e31d23831abb40d6271a318fdd6f3",
+    "90d1347f4ec9198a0320daecb6ff90b8",
+    "e63ded54ab3d0e8728b6f24d4f01e53f",
+    "35ce21fbe0ea114c089fc3489a78155d",
+    "f277f6ef8e4d717f1f0dfe2706ac197d",
+    "e8014d3f41256976c02e0f1e622ba2b9",
+  },
+  {
+    // 32X32
+    "a3e8056ba7e36628cce4917cd956fedd",
+    "cc7d3024fe8748b512407edee045377e",
+    "2aab0a0f330a1d3e19b8ecb8f06387a3",
+    "a547bc3fb7b06910bf3973122a426661",
+    "26f712514da95042f93d6e8dc8e431dc",
+    "bb08c6e16177081daa3d936538dbc2e3",
+    "84bf83f94a51b33654ca940c6f8bc057",
+    "7168b03fc31bf29596a344d6a35d007c",
+    "b073a70d3672f1282236994f5d12e94b",
+    "c51607aebad5dcb3c1e3b58ef9e5b84e",
+  },
+  {
+    // 64X64
+    "a6baa0d4bfb2269a94c7a38f86a4bccf",
+    "3f1ef5f473a49eba743f17a3324adf9d",
+    "12ac11889ae5f55b7781454efd706a6a",
+    "d9a906c0e692b22e1b4414e71a704b7e",
+    "47d4cadd56f70c11ff8f3e5d8df81161",
+
"de997744cf24c16c5ac2a36b02b351cc", + "23781211ae178ddeb6c4bb97a6bd7d83", + "a79d2e28340ca34b9e37daabbf030f63", + "0372bd3ddfc258750a6ac106b70587f4", + "228ef625d9460cbf6fa253a16a730976", + }, + { + // 4X8 + "22d519b796d59644043466320e4ccd14", + "09513a738c49b3f9542d27f34abbe1d5", + "807ae5e8813443ff01e71be6efacfb69", + "cbfa18d0293430b6e9708b0be1fd2394", + "346c354c34ec7fa780b576db355dab88", + "f97dae85c35359632380b09ca98d611e", + "698ae351d8896d89ed9e4e67b6e53eda", + "dcc197034a9c45a3d8238bf085835f4e", + "7a35e2c42ffdc2efc2d6d1d75a100fc7", + "41ab6cebd4516c87a91b2a593e2c2506", + }, + { + // 8X4 + "d58cd4c4bf3b7bbaa5db5e1a5622ec78", + "6e572c35aa782d00cafcb99e9ea047ea", + "e8c22a3702b416dc9ab974505afbed09", + "aaa4e4762a795aad7ad74de0c662c4e4", + "a19f9101967383c3dcbd516dc317a291", + "9ab8cb91f1a595b9ebe3fe8de58031aa", + "2cf9021d5f1169268699807ee118b65f", + "ee9605fcbd6fb871f1c5cd81a6989327", + "b4871af8316089e3e23522175df7e93f", + "d33301e1c2cb173be46792a22d19881a", + }, + { + // 8X16 + "4562de1d0336610880fdd5685498a9ec", + "16310fa7076394f16fc85c4b149d89c9", + "0e94af88e1dc573b6f0f499cddd1f530", + "dfd245ee20d091c67809160340365aa9", + "d3562504327f70c096c5be23fd8a3747", + "601b853558502acbb5135eadd2da117a", + "3c624345a723a1b2b1bea05a6a08bc99", + "2a9c781de609e0184cc7ab442050f4e5", + "0ddc5035c22252747126b61fc238c74d", + "e43f5d83bab759af69c7b6773fc8f9b2", + }, + { + // 16X8 + "a57d6b5a9bfd30c29591d8717ace9c51", + "f5907ba97ee6c53e339e953fc8d845ee", + "ea3aa727913ce45af06f89dd1808db5f", + "408af4f23e48d14b48ee35ae094fcd18", + "85c41cbcb5d744f7961e8950026fbffe", + "8a4e588a837638887ba671f8d4910485", + "b792d8826b67a21757ea7097cff9e05b", + "f94ce7101bb87fd3bb9312112527dbf4", + "688c6660a6dc6fa61fa1aa38e708c209", + "0cdf641b4f81d69509c92ae0b93ef5ff", + }, + { + // 16X32 + "aee4b3b0e3cc02d48e2c40d77f807927", + "8baef2b2e789f79c8df9d90ad10f34a4", + "038c38ee3c4f090bb8d736eab136aafc", + "1a3de2aaeaffd68a9fd6c7f6557b83f3", + "385c6e0ea29421dd81011a2934641e26", + "6cf96c285d1a2d4787f955dad715b08c", + "2d7f75dcd73b9528c8396279ff09ff3a", + "5a63cd1841e4ed470e4ca5ef845f2281", + "610d899ca945fbead33287d4335a8b32", + "6bafaad81fce37be46730187e78d8b11", + }, + { + // 32X16 + "290b23c9f5a1de7905bfa71a942da29b", + "701e7b82593c66da5052fc4b6afd79ce", + "4da828c5455cd246735a663fbb204989", + "e3fbeaf234efece8dbd752b77226200c", + "4d1d8c969f05155a7e7e84cf7aad021b", + "c22e4877c2c946d5bdc0d542e29e70cf", + "8ac1ce815e7780500f842b0beb0bb980", + "9fee2e2502b507f25bfad30a55b0b610", + "4ced9c212ec6f9956e27f68a91b59fef", + "4a7a0b93f138bb0863e4e465b01ec0b1", + }, + { + // 32X64 + "ad9cfc395a5c5644a21d958c7274ac14", + "f29d6d03c143ddf96fef04c19f2c8333", + "a8bdc852ef704dd4975c61893e8fbc3f", + "7d0bd7dea26226741dbca9a97f27fa74", + "45c27c5cca9a91b6ae8379feb0881c9f", + "8a0b78df1e001b85c874d686eac4aa1b", + "ce9fa75fac54a3f6c0cc3f2083b938f1", + "c0dca10d88762c954af18dc9e3791a39", + "61df229eddfccab913b8fda4bb02f9ac", + "4f4df6bc8d50a5600b573f0e44d70e66", + }, + { + // 64X32 + "db9d82921fd88b24fdff6f849f2f9c87", + "5ecc7fdc52d2f575ad4f2d0e9e6b1e11", + "b4581311a0a73d95dfac7f8f44591032", + "68bd283cfd1a125f6b2ee47cee874d36", + "804179f05c032908a5e36077bb87c994", + "fc5fd041a8ee779015394d0c066ee43c", + "68f5579ccadfe9a1baafb158334a3db2", + "fe237e45e215ab06d79046da9ad71e84", + "9a8a938a6824551bf7d21b8fd1d70ea1", + "eb7332f2017cd96882c76e7136aeaf53", + }, + { + // 4X16 + "7bafa307d507747b8132e7735b7f1c73", + "e58bc2d8213a97d1fea9cfb73d7a9633", + "435f8a8e8bbf14dbf2fe16b2be9e97aa", + "1d0e767b68d84acbfb50b7a04e633836", + 
"5f713bd7b324fe73bb7063e35ee14e5e", + "0dac4e1fa3d59814202715468c01ed56", + "47709d1db4a330c7a8900f450e6fddd1", + "258e0b930bb27db28f05da9cf7d1ee7c", + "36cf030fbae767912593efea045bfff5", + "248d7aceabb7499febae663fae41a920", + }, + { + // 16X4 + "04dde98e632670e393704742c89f9067", + "8c72543f1664651ae1fa08e2ac0adb9b", + "2354a2cdc2773aa2df8ab4010db1be39", + "6300ad3221c26da39b10e0e6d87ee3be", + "8ea30b661c6ba60b28d3167f19e449b8", + "fb6c1e4ff101a371cede63c2955cdb7e", + "a517c06433d6d7927b16a72184a23e92", + "393828be5d62ab6c48668bea5e2f801a", + "b1e510c542013eb9d6fb188dea2ce90a", + "569a8f2fe01679ca216535ecbcdccb62", + }, + { + // 8X32 + "9d541865c185ca7607852852613ac1fc", + "b96be67f08c6b5fa5ebd3411299c2f7c", + "75a2dcf50004b9d188849b048239767e", + "429492ff415c9fd9b050d73b2ad500f8", + "64b3606c1ccd036bd766bd5711392cf4", + "cb59844a0f01660ac955bae3511f1100", + "3e076155b7a70e8828618e3f33b51e3d", + "ed2d1f597ab7c50beff690f737cf9726", + "7909c6a26aaf20c59d996d3e5b5f9c29", + "965798807240c98c6f7cc9b457ed0773", + }, + { + // 32X8 + "36f391aa31619eec1f4d9ee95ea454cc", + "b82648f14eeba2527357cb50bc3223cb", + "7a7b2adf429125e8bee9d1d00a66e13f", + "4198e4d6ba503b7cc2d7e96bb845f661", + "96c160d2ec1be9fe0cdea9682f14d257", + "19a450bcebaa75afb4fc6bd1fd6434af", + "2bd2e35967d43d0ec1c6587a36f204d5", + "49799a99aa4ccfbd989bee92a99422f1", + "955530e99813812a74659edeac3f5475", + "f0316b84e378a19cd11b19a6e40b2914", + }, + { + // 16X64 + "8cba1b70a0bde29e8ef235cedc5faa7d", + "96d00ddc7537bf7f196006591b733b4e", + "cbf69d5d157c9f3355a4757b1d6e3414", + "3ac1f642019493dec1b737d7a3a1b4e5", + "35f9ee300d7fa3c97338e81a6f21dcd4", + "aae335442e77c8ebc280f16ea50ba9c7", + "a6140fdac2278644328be094d88731db", + "2df93621b6ff100f7008432d509f4161", + "c77bf5aee39e7ed4a3dd715f816f452a", + "02109bd63557d90225c32a8f1338258e", + }, + { + // 64X16 + "a5e2f9fb685d5f4a048e9a96affd25a4", + "1348f249690d9eefe09d9ad7ead2c801", + "525da4b187acd81b1ff1116b60461141", + "e99d072de858094c98b01bd4a6772634", + "873bfa9dc24693f19721f7c8d527f7d3", + "0acfc6507bd3468e9679efc127d6e4b9", + "57d03f8d079c7264854e22ac1157cfae", + "6c2c4036f70c7d957a9399b5436c0774", + "42b8e4a97b7f8416c72a5148c031c0b1", + "a38a2c5f79993dfae8530e9e25800893", + }, +}; + +} // namespace + +#define HIGHBD_INTRA_PRED_TEST(arch, tx_size, dc, dc_left, dc_top, dc_128, v, \ + h, paeth, smooth, smooth_v, smooth_h) \ + TEST(arch, DISABLED_##TestHighbdIntraPred_##tx_size) { \ + static const AvxHighbdPredFunc aom_intra_pred[] = { \ + dc, dc_left, dc_top, dc_128, v, h, paeth, smooth, smooth_v, smooth_h \ + }; \ + TestHighbdIntraPred(tx_size, aom_intra_pred, kHighbdSignatures[tx_size]); \ + } + +// ----------------------------------------------------------------------------- +// 4x4, 4x8, 4x16 + +HIGHBD_INTRA_PRED_TEST( + C_1, TX_4X4, aom_highbd_dc_predictor_4x4_c, + aom_highbd_dc_left_predictor_4x4_c, aom_highbd_dc_top_predictor_4x4_c, + aom_highbd_dc_128_predictor_4x4_c, aom_highbd_v_predictor_4x4_c, + aom_highbd_h_predictor_4x4_c, aom_highbd_paeth_predictor_4x4_c, + aom_highbd_smooth_predictor_4x4_c, aom_highbd_smooth_v_predictor_4x4_c, + aom_highbd_smooth_h_predictor_4x4_c) + +HIGHBD_INTRA_PRED_TEST( + C_2, TX_4X8, aom_highbd_dc_predictor_4x8_c, + aom_highbd_dc_left_predictor_4x8_c, aom_highbd_dc_top_predictor_4x8_c, + aom_highbd_dc_128_predictor_4x8_c, aom_highbd_v_predictor_4x8_c, + aom_highbd_h_predictor_4x8_c, aom_highbd_paeth_predictor_4x8_c, + aom_highbd_smooth_predictor_4x8_c, aom_highbd_smooth_v_predictor_4x8_c, + aom_highbd_smooth_h_predictor_4x8_c) + 
+HIGHBD_INTRA_PRED_TEST( + C_3, TX_4X16, aom_highbd_dc_predictor_4x16_c, + aom_highbd_dc_left_predictor_4x16_c, aom_highbd_dc_top_predictor_4x16_c, + aom_highbd_dc_128_predictor_4x16_c, aom_highbd_v_predictor_4x16_c, + aom_highbd_h_predictor_4x16_c, aom_highbd_paeth_predictor_4x16_c, + aom_highbd_smooth_predictor_4x16_c, aom_highbd_smooth_v_predictor_4x16_c, + aom_highbd_smooth_h_predictor_4x16_c) + +#if HAVE_SSE2 +HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_4X4, aom_highbd_dc_predictor_4x4_sse2, + aom_highbd_dc_left_predictor_4x4_sse2, + aom_highbd_dc_top_predictor_4x4_sse2, + aom_highbd_dc_128_predictor_4x4_sse2, + aom_highbd_v_predictor_4x4_sse2, + aom_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL) + +HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_4X8, aom_highbd_dc_predictor_4x8_sse2, + aom_highbd_dc_left_predictor_4x8_sse2, + aom_highbd_dc_top_predictor_4x8_sse2, + aom_highbd_dc_128_predictor_4x8_sse2, + aom_highbd_v_predictor_4x8_sse2, + aom_highbd_h_predictor_4x8_sse2, NULL, NULL, NULL, NULL) +#endif + +// ----------------------------------------------------------------------------- +// 8x8, 8x4, 8x16, 8x32 + +HIGHBD_INTRA_PRED_TEST( + C_1, TX_8X8, aom_highbd_dc_predictor_8x8_c, + aom_highbd_dc_left_predictor_8x8_c, aom_highbd_dc_top_predictor_8x8_c, + aom_highbd_dc_128_predictor_8x8_c, aom_highbd_v_predictor_8x8_c, + aom_highbd_h_predictor_8x8_c, aom_highbd_paeth_predictor_8x8_c, + aom_highbd_smooth_predictor_8x8_c, aom_highbd_smooth_v_predictor_8x8_c, + aom_highbd_smooth_h_predictor_8x8_c) + +HIGHBD_INTRA_PRED_TEST( + C_2, TX_8X4, aom_highbd_dc_predictor_8x4_c, + aom_highbd_dc_left_predictor_8x4_c, aom_highbd_dc_top_predictor_8x4_c, + aom_highbd_dc_128_predictor_8x4_c, aom_highbd_v_predictor_8x4_c, + aom_highbd_h_predictor_8x4_c, aom_highbd_paeth_predictor_8x4_c, + aom_highbd_smooth_predictor_8x4_c, aom_highbd_smooth_v_predictor_8x4_c, + aom_highbd_smooth_h_predictor_8x4_c) + +HIGHBD_INTRA_PRED_TEST( + C_3, TX_8X16, aom_highbd_dc_predictor_8x16_c, + aom_highbd_dc_left_predictor_8x16_c, aom_highbd_dc_top_predictor_8x16_c, + aom_highbd_dc_128_predictor_8x16_c, aom_highbd_v_predictor_8x16_c, + aom_highbd_h_predictor_8x16_c, aom_highbd_paeth_predictor_8x16_c, + aom_highbd_smooth_predictor_8x16_c, aom_highbd_smooth_v_predictor_8x16_c, + aom_highbd_smooth_h_predictor_8x16_c) + +HIGHBD_INTRA_PRED_TEST( + C_4, TX_8X32, aom_highbd_dc_predictor_8x32_c, + aom_highbd_dc_left_predictor_8x32_c, aom_highbd_dc_top_predictor_8x32_c, + aom_highbd_dc_128_predictor_8x32_c, aom_highbd_v_predictor_8x32_c, + aom_highbd_h_predictor_8x32_c, aom_highbd_paeth_predictor_8x32_c, + aom_highbd_smooth_predictor_8x32_c, aom_highbd_smooth_v_predictor_8x32_c, + aom_highbd_smooth_h_predictor_8x32_c) + +#if HAVE_SSE2 +HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_8X8, aom_highbd_dc_predictor_8x8_sse2, + aom_highbd_dc_left_predictor_8x8_sse2, + aom_highbd_dc_top_predictor_8x8_sse2, + aom_highbd_dc_128_predictor_8x8_sse2, + aom_highbd_v_predictor_8x8_sse2, + aom_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_8X4, aom_highbd_dc_predictor_8x4_sse2, + aom_highbd_dc_left_predictor_8x4_sse2, + aom_highbd_dc_top_predictor_8x4_sse2, + aom_highbd_dc_128_predictor_8x4_sse2, + aom_highbd_v_predictor_8x4_sse2, + aom_highbd_h_predictor_8x4_sse2, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSE2_3, TX_8X16, aom_highbd_dc_predictor_8x16_sse2, + aom_highbd_dc_left_predictor_8x16_sse2, + aom_highbd_dc_top_predictor_8x16_sse2, + aom_highbd_dc_128_predictor_8x16_sse2, + aom_highbd_v_predictor_8x16_sse2, + 
aom_highbd_h_predictor_8x16_sse2, NULL, NULL, NULL, NULL) +#endif + +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3, TX_8X8, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL) +#endif + +// ----------------------------------------------------------------------------- +// 16x16, 16x8, 16x32, 16x4, 16x64 + +HIGHBD_INTRA_PRED_TEST( + C_1, TX_16X16, aom_highbd_dc_predictor_16x16_c, + aom_highbd_dc_left_predictor_16x16_c, aom_highbd_dc_top_predictor_16x16_c, + aom_highbd_dc_128_predictor_16x16_c, aom_highbd_v_predictor_16x16_c, + aom_highbd_h_predictor_16x16_c, aom_highbd_paeth_predictor_16x16_c, + aom_highbd_smooth_predictor_16x16_c, aom_highbd_smooth_v_predictor_16x16_c, + aom_highbd_smooth_h_predictor_16x16_c) + +HIGHBD_INTRA_PRED_TEST( + C_2, TX_16X8, aom_highbd_dc_predictor_16x8_c, + aom_highbd_dc_left_predictor_16x8_c, aom_highbd_dc_top_predictor_16x8_c, + aom_highbd_dc_128_predictor_16x8_c, aom_highbd_v_predictor_16x8_c, + aom_highbd_h_predictor_16x8_c, aom_highbd_paeth_predictor_16x8_c, + aom_highbd_smooth_predictor_16x8_c, aom_highbd_smooth_v_predictor_16x8_c, + aom_highbd_smooth_h_predictor_16x8_c) + +HIGHBD_INTRA_PRED_TEST( + C_3, TX_16X32, aom_highbd_dc_predictor_16x32_c, + aom_highbd_dc_left_predictor_16x32_c, aom_highbd_dc_top_predictor_16x32_c, + aom_highbd_dc_128_predictor_16x32_c, aom_highbd_v_predictor_16x32_c, + aom_highbd_h_predictor_16x32_c, aom_highbd_paeth_predictor_16x32_c, + aom_highbd_smooth_predictor_16x32_c, aom_highbd_smooth_v_predictor_16x32_c, + aom_highbd_smooth_h_predictor_16x32_c) + +HIGHBD_INTRA_PRED_TEST( + C_4, TX_16X4, aom_highbd_dc_predictor_16x4_c, + aom_highbd_dc_left_predictor_16x4_c, aom_highbd_dc_top_predictor_16x4_c, + aom_highbd_dc_128_predictor_16x4_c, aom_highbd_v_predictor_16x4_c, + aom_highbd_h_predictor_16x4_c, aom_highbd_paeth_predictor_16x4_c, + aom_highbd_smooth_predictor_16x4_c, aom_highbd_smooth_v_predictor_16x4_c, + aom_highbd_smooth_h_predictor_16x4_c) + +HIGHBD_INTRA_PRED_TEST( + C_5, TX_16X64, aom_highbd_dc_predictor_16x64_c, + aom_highbd_dc_left_predictor_16x64_c, aom_highbd_dc_top_predictor_16x64_c, + aom_highbd_dc_128_predictor_16x64_c, aom_highbd_v_predictor_16x64_c, + aom_highbd_h_predictor_16x64_c, aom_highbd_paeth_predictor_16x64_c, + aom_highbd_smooth_predictor_16x64_c, aom_highbd_smooth_v_predictor_16x64_c, + aom_highbd_smooth_h_predictor_16x64_c) + +#if HAVE_SSE2 +HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_16X16, aom_highbd_dc_predictor_16x16_sse2, + aom_highbd_dc_left_predictor_16x16_sse2, + aom_highbd_dc_top_predictor_16x16_sse2, + aom_highbd_dc_128_predictor_16x16_sse2, + aom_highbd_v_predictor_16x16_sse2, + aom_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL, + NULL) +HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_16X8, aom_highbd_dc_predictor_16x8_sse2, + aom_highbd_dc_left_predictor_16x8_sse2, + aom_highbd_dc_top_predictor_16x8_sse2, + aom_highbd_dc_128_predictor_16x8_sse2, + aom_highbd_v_predictor_16x8_sse2, + aom_highbd_h_predictor_16x8_sse2, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSE2_3, TX_16X32, aom_highbd_dc_predictor_16x32_sse2, + aom_highbd_dc_left_predictor_16x32_sse2, + aom_highbd_dc_top_predictor_16x32_sse2, + aom_highbd_dc_128_predictor_16x32_sse2, + aom_highbd_v_predictor_16x32_sse2, + aom_highbd_h_predictor_16x32_sse2, NULL, NULL, NULL, + NULL) +#endif + +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) +#endif + +#if HAVE_AVX2 +HIGHBD_INTRA_PRED_TEST(AVX2_1, TX_16X16, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) + 
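// Editorial note, not in the original source: an all-NULL row such as the
// AVX2 registrations here still generates the (disabled) speed test, but the
// NULL check in TestHighbdIntraPred() skips every slot, so nothing is
// measured until AVX2 versions of these predictors are wired in.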
+HIGHBD_INTRA_PRED_TEST(AVX2_2, TX_16X8, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) + +HIGHBD_INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) +#endif + +// ----------------------------------------------------------------------------- +// 32x32, 32x16, 32x64, 32x8 + +HIGHBD_INTRA_PRED_TEST( + C_1, TX_32X32, aom_highbd_dc_predictor_32x32_c, + aom_highbd_dc_left_predictor_32x32_c, aom_highbd_dc_top_predictor_32x32_c, + aom_highbd_dc_128_predictor_32x32_c, aom_highbd_v_predictor_32x32_c, + aom_highbd_h_predictor_32x32_c, aom_highbd_paeth_predictor_32x32_c, + aom_highbd_smooth_predictor_32x32_c, aom_highbd_smooth_v_predictor_32x32_c, + aom_highbd_smooth_h_predictor_32x32_c) + +HIGHBD_INTRA_PRED_TEST( + C_2, TX_32X16, aom_highbd_dc_predictor_32x16_c, + aom_highbd_dc_left_predictor_32x16_c, aom_highbd_dc_top_predictor_32x16_c, + aom_highbd_dc_128_predictor_32x16_c, aom_highbd_v_predictor_32x16_c, + aom_highbd_h_predictor_32x16_c, aom_highbd_paeth_predictor_32x16_c, + aom_highbd_smooth_predictor_32x16_c, aom_highbd_smooth_v_predictor_32x16_c, + aom_highbd_smooth_h_predictor_32x16_c) + +HIGHBD_INTRA_PRED_TEST( + C_3, TX_32X64, aom_highbd_dc_predictor_32x64_c, + aom_highbd_dc_left_predictor_32x64_c, aom_highbd_dc_top_predictor_32x64_c, + aom_highbd_dc_128_predictor_32x64_c, aom_highbd_v_predictor_32x64_c, + aom_highbd_h_predictor_32x64_c, aom_highbd_paeth_predictor_32x64_c, + aom_highbd_smooth_predictor_32x64_c, aom_highbd_smooth_v_predictor_32x64_c, + aom_highbd_smooth_h_predictor_32x64_c) + +HIGHBD_INTRA_PRED_TEST( + C_4, TX_32X8, aom_highbd_dc_predictor_32x8_c, + aom_highbd_dc_left_predictor_32x8_c, aom_highbd_dc_top_predictor_32x8_c, + aom_highbd_dc_128_predictor_32x8_c, aom_highbd_v_predictor_32x8_c, + aom_highbd_h_predictor_32x8_c, aom_highbd_paeth_predictor_32x8_c, + aom_highbd_smooth_predictor_32x8_c, aom_highbd_smooth_v_predictor_32x8_c, + aom_highbd_smooth_h_predictor_32x8_c) + +#if HAVE_SSE2 +HIGHBD_INTRA_PRED_TEST(SSE2_1, TX_32X32, aom_highbd_dc_predictor_32x32_sse2, + aom_highbd_dc_left_predictor_32x32_sse2, + aom_highbd_dc_top_predictor_32x32_sse2, + aom_highbd_dc_128_predictor_32x32_sse2, + aom_highbd_v_predictor_32x32_sse2, + aom_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL, + NULL) +HIGHBD_INTRA_PRED_TEST(SSE2_2, TX_32X16, aom_highbd_dc_predictor_32x16_sse2, + aom_highbd_dc_left_predictor_32x16_sse2, + aom_highbd_dc_top_predictor_32x16_sse2, + aom_highbd_dc_128_predictor_32x16_sse2, + aom_highbd_v_predictor_32x16_sse2, + aom_highbd_h_predictor_32x16_sse2, NULL, NULL, NULL, + NULL) +#endif + +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) +#endif + +#if HAVE_AVX2 +HIGHBD_INTRA_PRED_TEST(AVX2_1, TX_32X32, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) + +HIGHBD_INTRA_PRED_TEST(AVX2_2, TX_32X16, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL) +#endif + +// ----------------------------------------------------------------------------- +// 64x64, 64x32, 64x16 + +HIGHBD_INTRA_PRED_TEST( + C_1, TX_64X64, aom_highbd_dc_predictor_64x64_c, + aom_highbd_dc_left_predictor_64x64_c, aom_highbd_dc_top_predictor_64x64_c, + aom_highbd_dc_128_predictor_64x64_c, aom_highbd_v_predictor_64x64_c, + aom_highbd_h_predictor_64x64_c, aom_highbd_paeth_predictor_64x64_c, + aom_highbd_smooth_predictor_64x64_c, aom_highbd_smooth_v_predictor_64x64_c, + aom_highbd_smooth_h_predictor_64x64_c) + +HIGHBD_INTRA_PRED_TEST( + C_2, TX_64X32, 
aom_highbd_dc_predictor_64x32_c,
+    aom_highbd_dc_left_predictor_64x32_c, aom_highbd_dc_top_predictor_64x32_c,
+    aom_highbd_dc_128_predictor_64x32_c, aom_highbd_v_predictor_64x32_c,
+    aom_highbd_h_predictor_64x32_c, aom_highbd_paeth_predictor_64x32_c,
+    aom_highbd_smooth_predictor_64x32_c, aom_highbd_smooth_v_predictor_64x32_c,
+    aom_highbd_smooth_h_predictor_64x32_c)
+
+HIGHBD_INTRA_PRED_TEST(
+    C_3, TX_64X16, aom_highbd_dc_predictor_64x16_c,
+    aom_highbd_dc_left_predictor_64x16_c, aom_highbd_dc_top_predictor_64x16_c,
+    aom_highbd_dc_128_predictor_64x16_c, aom_highbd_v_predictor_64x16_c,
+    aom_highbd_h_predictor_64x16_c, aom_highbd_paeth_predictor_64x16_c,
+    aom_highbd_smooth_predictor_64x16_c, aom_highbd_smooth_v_predictor_64x16_c,
+    aom_highbd_smooth_h_predictor_64x16_c)
+
+// -----------------------------------------------------------------------------
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#include "test/test_libaom.cc"
diff --git a/libs/libaom/src/test/test_libaom.cc b/libs/libaom/src/test/test_libaom.cc
new file mode 100644
index 000000000..b55d76237
--- /dev/null
+++ b/libs/libaom/src/test/test_libaom.cc
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+
+#include <string>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+
+#if ARCH_X86 || ARCH_X86_64
+#include "aom_ports/x86.h"
+#endif
+extern "C" {
+extern void av1_rtcd();
+extern void aom_dsp_rtcd();
+extern void aom_scale_rtcd();
+}
+
+#if ARCH_X86 || ARCH_X86_64
+static void append_negative_gtest_filter(const char *str) {
+  std::string filter = ::testing::FLAGS_gtest_filter;
+  // Negative patterns begin with one '-' followed by a ':' separated list.
+  if (filter.find('-') == std::string::npos) filter += '-';
+  // OPT.* matches TEST() functions
+  // OPT/* matches TEST_P() functions
+  // OPT_* matches tests which have been manually sharded.
+  // We do not match OPT* because of SSE/SSE2 collisions.
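// Worked example (editorial, not in the original source): with str = "SSE2"
// the loop below appends ":SSE2.*:SSE2/*:SSE2_*" to the negative half of
// --gtest_filter, so SSE2-suffixed tests are skipped on CPUs without SSE2
// while plain SSE tests are left alone.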
+  const char *search_terminators = "./_";
+  for (size_t pos = 0; pos < strlen(search_terminators); ++pos) {
+    filter += ":";
+    filter += str;
+    filter += search_terminators[pos];
+    filter += "*";
+  }
+  ::testing::FLAGS_gtest_filter = filter;
+}
+#endif // ARCH_X86 || ARCH_X86_64
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+#if ARCH_X86 || ARCH_X86_64
+  const int simd_caps = x86_simd_caps();
+  if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter("MMX");
+  if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter("SSE");
+  if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter("SSE2");
+  if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter("SSE3");
+  if (!(simd_caps & HAS_SSSE3)) append_negative_gtest_filter("SSSE3");
+  if (!(simd_caps & HAS_SSE4_1)) append_negative_gtest_filter("SSE4_1");
+  if (!(simd_caps & HAS_SSE4_2)) append_negative_gtest_filter("SSE4_2");
+  if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter("AVX");
+  if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter("AVX2");
+#endif // ARCH_X86 || ARCH_X86_64
+
+// Shared library builds don't support whitebox tests that exercise internal
+// symbols.
+#if !CONFIG_SHARED
+  av1_rtcd();
+  aom_dsp_rtcd();
+  aom_scale_rtcd();
+#endif // !CONFIG_SHARED
+
+  return RUN_ALL_TESTS();
+}
diff --git a/libs/libaom/src/test/test_runner.cmake b/libs/libaom/src/test/test_runner.cmake
new file mode 100644
index 000000000..f0648d16b
--- /dev/null
+++ b/libs/libaom/src/test/test_runner.cmake
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(NOT GTEST_TOTAL_SHARDS
+   OR "${GTEST_SHARD_INDEX}" STREQUAL ""
+   OR NOT TEST_LIBAOM)
+  message(
+    FATAL_ERROR
+      "The variables GTEST_SHARD_INDEX, GTEST_TOTAL_SHARDS and TEST_LIBAOM
+      must be defined.")
+endif()
+
+set(ENV{GTEST_SHARD_INDEX} ${GTEST_SHARD_INDEX})
+set(ENV{GTEST_TOTAL_SHARDS} ${GTEST_TOTAL_SHARDS})
+execute_process(COMMAND ${TEST_LIBAOM} RESULT_VARIABLE test_result)
+set(test_message "Test shard ${GTEST_SHARD_INDEX}/${GTEST_TOTAL_SHARDS} result")
+message("${test_message}: ${test_result}")
+
+if(NOT "${test_result}" STREQUAL "0")
+  message(FATAL_ERROR "${test_message}: FAILED, non-zero exit code.")
+endif()
diff --git a/libs/libaom/src/test/test_vector_test.cc b/libs/libaom/src/test/test_vector_test.cc
new file mode 100644
index 000000000..eab92b685
--- /dev/null
+++ b/libs/libaom/src/test/test_vector_test.cc
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "common/tools_common.h"
+#include "config/aom_config.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+
+namespace {
+
+const int kThreads = 0;
+const int kFileName = 1;
+const int kRowMT = 2;
+
+typedef std::tuple<int, const char *, int> DecodeParam;
+
+class TestVectorTest : public ::libaom_test::DecoderTest,
+                       public ::libaom_test::CodecTestWithParam<DecodeParam> {
+ protected:
+  TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) {}
+
+  virtual ~TestVectorTest() {
+    if (md5_file_) fclose(md5_file_);
+  }
+
+  void OpenMD5File(const std::string &md5_file_name_) {
+    md5_file_ = libaom_test::OpenTestDataFile(md5_file_name_);
+    ASSERT_TRUE(md5_file_ != NULL)
+        << "Md5 file open failed. Filename: " << md5_file_name_;
+  }
+
+  virtual void PreDecodeFrameHook(
+      const libaom_test::CompressedVideoSource &video,
+      libaom_test::Decoder *decoder) {
+    if (video.frame_number() == 0) decoder->Control(AV1D_SET_ROW_MT, row_mt_);
+  }
+
+  virtual void DecompressedFrameHook(const aom_image_t &img,
+                                     const unsigned int frame_number) {
+    ASSERT_TRUE(md5_file_ != NULL);
+    char expected_md5[33];
+    char junk[128];
+
+    // Read correct md5 checksums.
+    const int res = fscanf(md5_file_, "%s %s", expected_md5, junk);
+    ASSERT_NE(res, EOF) << "Read md5 data failed";
+    expected_md5[32] = '\0';
+
+    ::libaom_test::MD5 md5_res;
+#if FORCE_HIGHBITDEPTH_DECODING
+    const aom_img_fmt_t shifted_fmt =
+        (aom_img_fmt)(img.fmt & ~AOM_IMG_FMT_HIGHBITDEPTH);
+    if (img.bit_depth == 8 && shifted_fmt != img.fmt) {
+      aom_image_t *img_shifted =
+          aom_img_alloc(NULL, shifted_fmt, img.d_w, img.d_h, 16);
+      img_shifted->bit_depth = img.bit_depth;
+      img_shifted->monochrome = img.monochrome;
+      aom_img_downshift(img_shifted, &img, 0);
+      md5_res.Add(img_shifted);
+      aom_img_free(img_shifted);
+    } else {
+#endif
+      md5_res.Add(&img);
+#if FORCE_HIGHBITDEPTH_DECODING
+    }
+#endif
+
+    const char *actual_md5 = md5_res.Get();
+    // Check md5 match.
+    ASSERT_STREQ(expected_md5, actual_md5)
+        << "Md5 checksums don't match: frame number = " << frame_number;
+  }
+
+  unsigned int row_mt_;
+
+ private:
+  FILE *md5_file_;
+};
+
+// This test runs through the whole set of test vectors, and decodes them.
+// The md5 checksums are computed for each frame in the video file. If md5
+// checksums match the correct md5 data, then the test passes. Otherwise,
+// the test fails.
+TEST_P(TestVectorTest, MD5Match) {
+  const DecodeParam input = GET_PARAM(1);
+  const std::string filename = std::get<kFileName>(input);
+  aom_codec_flags_t flags = 0;
+  aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+  char str[256];
+
+  cfg.threads = std::get<kThreads>(input);
+  row_mt_ = std::get<kRowMT>(input);
+
+  snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d",
+           filename.c_str(), cfg.threads);
+  SCOPED_TRACE(str);
+
+  // Open compressed video file.
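+  // Editorial note, not in the original source: the extension checks below
+  // select the IVF reader for .ivf inputs and the WebM reader for .webm/.mkv,
+  // and the reference checksums are read from a sibling "<vector>.md5" file.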
+  std::unique_ptr<libaom_test::CompressedVideoSource> video;
+  if (filename.substr(filename.length() - 3, 3) == "ivf") {
+    video.reset(new libaom_test::IVFVideoSource(filename));
+  } else if (filename.substr(filename.length() - 4, 4) == "webm" ||
+             filename.substr(filename.length() - 3, 3) == "mkv") {
+#if CONFIG_WEBM_IO
+    video.reset(new libaom_test::WebMVideoSource(filename));
+#else
+    fprintf(stderr, "WebM IO is disabled, skipping test vector %s\n",
+            filename.c_str());
+    return;
+#endif
+  }
+  ASSERT_TRUE(video.get() != NULL);
+  video->Init();
+
+  // Construct md5 file name.
+  const std::string md5_filename = filename + ".md5";
+  OpenMD5File(md5_filename);
+
+  // Set decode config and flags.
+  cfg.allow_lowbitdepth = !FORCE_HIGHBITDEPTH_DECODING;
+  set_cfg(cfg);
+  set_flags(flags);
+
+  // Decode frame, and check the md5 matching.
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
+}
+
+#if CONFIG_AV1_DECODER
+AV1_INSTANTIATE_TEST_CASE(
+    TestVectorTest,
+    ::testing::Combine(::testing::Values(1),  // Single thread.
+                       ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+                                           libaom_test::kAV1TestVectors +
+                                               libaom_test::kNumAV1TestVectors),
+                       ::testing::Values(0)));
+
+// Test AV1 decode with different numbers of threads.
+INSTANTIATE_TEST_SUITE_P(
+    AV1MultiThreaded, TestVectorTest,
+    ::testing::Combine(
+        ::testing::Values(
+            static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
+        ::testing::Combine(
+            ::testing::Range(2, 9),  // With 2 ~ 8 threads.
+            ::testing::ValuesIn(libaom_test::kAV1TestVectors,
+                                libaom_test::kAV1TestVectors +
+                                    libaom_test::kNumAV1TestVectors),
+            ::testing::Range(0, 2))));
+
+#endif  // CONFIG_AV1_DECODER
+
+}  // namespace
diff --git a/libs/libaom/src/test/test_vectors.cc b/libs/libaom/src/test/test_vectors.cc
new file mode 100644
index 000000000..991667a08
--- /dev/null
+++ b/libs/libaom/src/test/test_vectors.cc
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/test_vectors.h"
+
+namespace libaom_test {
+
+#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+
+#if CONFIG_AV1_DECODER
+const char *const kAV1TestVectors[] = { "av1-1-b8-00-quantizer-00.ivf",
+                                        "av1-1-b8-00-quantizer-01.ivf",
+                                        "av1-1-b8-00-quantizer-02.ivf",
+                                        "av1-1-b8-00-quantizer-03.ivf",
+                                        "av1-1-b8-00-quantizer-04.ivf",
+                                        "av1-1-b8-00-quantizer-05.ivf",
+                                        "av1-1-b8-00-quantizer-06.ivf",
+                                        "av1-1-b8-00-quantizer-07.ivf",
+                                        "av1-1-b8-00-quantizer-08.ivf",
+                                        "av1-1-b8-00-quantizer-09.ivf",
+                                        "av1-1-b8-00-quantizer-10.ivf",
+                                        "av1-1-b8-00-quantizer-11.ivf",
+                                        "av1-1-b8-00-quantizer-12.ivf",
+                                        "av1-1-b8-00-quantizer-13.ivf",
+                                        "av1-1-b8-00-quantizer-14.ivf",
+                                        "av1-1-b8-00-quantizer-15.ivf",
+                                        "av1-1-b8-00-quantizer-16.ivf",
+                                        "av1-1-b8-00-quantizer-17.ivf",
+                                        "av1-1-b8-00-quantizer-18.ivf",
+                                        "av1-1-b8-00-quantizer-19.ivf",
+                                        "av1-1-b8-00-quantizer-20.ivf",
+                                        "av1-1-b8-00-quantizer-21.ivf",
+                                        "av1-1-b8-00-quantizer-22.ivf",
+                                        "av1-1-b8-00-quantizer-23.ivf",
+                                        "av1-1-b8-00-quantizer-24.ivf",
+                                        "av1-1-b8-00-quantizer-25.ivf",
+                                        "av1-1-b8-00-quantizer-26.ivf",
+                                        "av1-1-b8-00-quantizer-27.ivf",
+                                        "av1-1-b8-00-quantizer-28.ivf",
+                                        "av1-1-b8-00-quantizer-29.ivf",
+                                        "av1-1-b8-00-quantizer-30.ivf",
+                                        "av1-1-b8-00-quantizer-31.ivf",
+                                        "av1-1-b8-00-quantizer-32.ivf",
+                                        "av1-1-b8-00-quantizer-33.ivf",
+                                        "av1-1-b8-00-quantizer-34.ivf",
+                                        "av1-1-b8-00-quantizer-35.ivf",
+                                        "av1-1-b8-00-quantizer-36.ivf",
+                                        "av1-1-b8-00-quantizer-37.ivf",
+                                        "av1-1-b8-00-quantizer-38.ivf",
+                                        "av1-1-b8-00-quantizer-39.ivf",
+                                        "av1-1-b8-00-quantizer-40.ivf",
+                                        "av1-1-b8-00-quantizer-41.ivf",
+                                        "av1-1-b8-00-quantizer-42.ivf",
+                                        "av1-1-b8-00-quantizer-43.ivf",
+                                        "av1-1-b8-00-quantizer-44.ivf",
+                                        "av1-1-b8-00-quantizer-45.ivf",
+                                        "av1-1-b8-00-quantizer-46.ivf",
+                                        "av1-1-b8-00-quantizer-47.ivf",
+                                        "av1-1-b8-00-quantizer-48.ivf",
+                                        "av1-1-b8-00-quantizer-49.ivf",
+                                        "av1-1-b8-00-quantizer-50.ivf",
+                                        "av1-1-b8-00-quantizer-51.ivf",
+                                        "av1-1-b8-00-quantizer-52.ivf",
+                                        "av1-1-b8-00-quantizer-53.ivf",
+                                        "av1-1-b8-00-quantizer-54.ivf",
+                                        "av1-1-b8-00-quantizer-55.ivf",
+                                        "av1-1-b8-00-quantizer-56.ivf",
+                                        "av1-1-b8-00-quantizer-57.ivf",
+                                        "av1-1-b8-00-quantizer-58.ivf",
+                                        "av1-1-b8-00-quantizer-59.ivf",
+                                        "av1-1-b8-00-quantizer-60.ivf",
+                                        "av1-1-b8-00-quantizer-61.ivf",
+                                        "av1-1-b8-00-quantizer-62.ivf",
+                                        "av1-1-b8-00-quantizer-63.ivf",
+#if CONFIG_AV1_HIGHBITDEPTH
+                                        "av1-1-b10-00-quantizer-00.ivf",
+                                        "av1-1-b10-00-quantizer-01.ivf",
+                                        "av1-1-b10-00-quantizer-02.ivf",
+                                        "av1-1-b10-00-quantizer-03.ivf",
+                                        "av1-1-b10-00-quantizer-04.ivf",
+                                        "av1-1-b10-00-quantizer-05.ivf",
+                                        "av1-1-b10-00-quantizer-06.ivf",
+                                        "av1-1-b10-00-quantizer-07.ivf",
+                                        "av1-1-b10-00-quantizer-08.ivf",
+                                        "av1-1-b10-00-quantizer-09.ivf",
+                                        "av1-1-b10-00-quantizer-10.ivf",
+                                        "av1-1-b10-00-quantizer-11.ivf",
+                                        "av1-1-b10-00-quantizer-12.ivf",
+                                        "av1-1-b10-00-quantizer-13.ivf",
+                                        "av1-1-b10-00-quantizer-14.ivf",
+                                        "av1-1-b10-00-quantizer-15.ivf",
+                                        "av1-1-b10-00-quantizer-16.ivf",
+                                        "av1-1-b10-00-quantizer-17.ivf",
+                                        "av1-1-b10-00-quantizer-18.ivf",
+                                        "av1-1-b10-00-quantizer-19.ivf",
+                                        "av1-1-b10-00-quantizer-20.ivf",
+                                        "av1-1-b10-00-quantizer-21.ivf",
+                                        "av1-1-b10-00-quantizer-22.ivf",
+                                        "av1-1-b10-00-quantizer-23.ivf",
+                                        "av1-1-b10-00-quantizer-24.ivf",
+                                        "av1-1-b10-00-quantizer-25.ivf",
+                                        "av1-1-b10-00-quantizer-26.ivf",
+                                        "av1-1-b10-00-quantizer-27.ivf",
+                                        "av1-1-b10-00-quantizer-28.ivf",
+                                        "av1-1-b10-00-quantizer-29.ivf",
+                                        "av1-1-b10-00-quantizer-30.ivf",
+                                        "av1-1-b10-00-quantizer-31.ivf",
+                                        "av1-1-b10-00-quantizer-32.ivf",
+
"av1-1-b10-00-quantizer-33.ivf", + "av1-1-b10-00-quantizer-34.ivf", + "av1-1-b10-00-quantizer-35.ivf", + "av1-1-b10-00-quantizer-36.ivf", + "av1-1-b10-00-quantizer-37.ivf", + "av1-1-b10-00-quantizer-38.ivf", + "av1-1-b10-00-quantizer-39.ivf", + "av1-1-b10-00-quantizer-40.ivf", + "av1-1-b10-00-quantizer-41.ivf", + "av1-1-b10-00-quantizer-42.ivf", + "av1-1-b10-00-quantizer-43.ivf", + "av1-1-b10-00-quantizer-44.ivf", + "av1-1-b10-00-quantizer-45.ivf", + "av1-1-b10-00-quantizer-46.ivf", + "av1-1-b10-00-quantizer-47.ivf", + "av1-1-b10-00-quantizer-48.ivf", + "av1-1-b10-00-quantizer-49.ivf", + "av1-1-b10-00-quantizer-50.ivf", + "av1-1-b10-00-quantizer-51.ivf", + "av1-1-b10-00-quantizer-52.ivf", + "av1-1-b10-00-quantizer-53.ivf", + "av1-1-b10-00-quantizer-54.ivf", + "av1-1-b10-00-quantizer-55.ivf", + "av1-1-b10-00-quantizer-56.ivf", + "av1-1-b10-00-quantizer-57.ivf", + "av1-1-b10-00-quantizer-58.ivf", + "av1-1-b10-00-quantizer-59.ivf", + "av1-1-b10-00-quantizer-60.ivf", + "av1-1-b10-00-quantizer-61.ivf", + "av1-1-b10-00-quantizer-62.ivf", + "av1-1-b10-00-quantizer-63.ivf", + "av1-1-b10-23-film_grain-50.ivf", +#endif // CONFIG_AV1_HIGHBITDEPTH + "av1-1-b8-01-size-16x16.ivf", + "av1-1-b8-01-size-16x18.ivf", + "av1-1-b8-01-size-16x32.ivf", + "av1-1-b8-01-size-16x34.ivf", + "av1-1-b8-01-size-16x64.ivf", + "av1-1-b8-01-size-16x66.ivf", + "av1-1-b8-01-size-18x16.ivf", + "av1-1-b8-01-size-18x18.ivf", + "av1-1-b8-01-size-18x32.ivf", + "av1-1-b8-01-size-18x34.ivf", + "av1-1-b8-01-size-18x64.ivf", + "av1-1-b8-01-size-18x66.ivf", + "av1-1-b8-01-size-196x196.ivf", + "av1-1-b8-01-size-196x198.ivf", + "av1-1-b8-01-size-196x200.ivf", + "av1-1-b8-01-size-196x202.ivf", + "av1-1-b8-01-size-196x208.ivf", + "av1-1-b8-01-size-196x210.ivf", + "av1-1-b8-01-size-196x224.ivf", + "av1-1-b8-01-size-196x226.ivf", + "av1-1-b8-01-size-198x196.ivf", + "av1-1-b8-01-size-198x198.ivf", + "av1-1-b8-01-size-198x200.ivf", + "av1-1-b8-01-size-198x202.ivf", + "av1-1-b8-01-size-198x208.ivf", + "av1-1-b8-01-size-198x210.ivf", + "av1-1-b8-01-size-198x224.ivf", + "av1-1-b8-01-size-198x226.ivf", + "av1-1-b8-01-size-200x196.ivf", + "av1-1-b8-01-size-200x198.ivf", + "av1-1-b8-01-size-200x200.ivf", + "av1-1-b8-01-size-200x202.ivf", + "av1-1-b8-01-size-200x208.ivf", + "av1-1-b8-01-size-200x210.ivf", + "av1-1-b8-01-size-200x224.ivf", + "av1-1-b8-01-size-200x226.ivf", + "av1-1-b8-01-size-202x196.ivf", + "av1-1-b8-01-size-202x198.ivf", + "av1-1-b8-01-size-202x200.ivf", + "av1-1-b8-01-size-202x202.ivf", + "av1-1-b8-01-size-202x208.ivf", + "av1-1-b8-01-size-202x210.ivf", + "av1-1-b8-01-size-202x224.ivf", + "av1-1-b8-01-size-202x226.ivf", + "av1-1-b8-01-size-208x196.ivf", + "av1-1-b8-01-size-208x198.ivf", + "av1-1-b8-01-size-208x200.ivf", + "av1-1-b8-01-size-208x202.ivf", + "av1-1-b8-01-size-208x208.ivf", + "av1-1-b8-01-size-208x210.ivf", + "av1-1-b8-01-size-208x224.ivf", + "av1-1-b8-01-size-208x226.ivf", + "av1-1-b8-01-size-210x196.ivf", + "av1-1-b8-01-size-210x198.ivf", + "av1-1-b8-01-size-210x200.ivf", + "av1-1-b8-01-size-210x202.ivf", + "av1-1-b8-01-size-210x208.ivf", + "av1-1-b8-01-size-210x210.ivf", + "av1-1-b8-01-size-210x224.ivf", + "av1-1-b8-01-size-210x226.ivf", + "av1-1-b8-01-size-224x196.ivf", + "av1-1-b8-01-size-224x198.ivf", + "av1-1-b8-01-size-224x200.ivf", + "av1-1-b8-01-size-224x202.ivf", + "av1-1-b8-01-size-224x208.ivf", + "av1-1-b8-01-size-224x210.ivf", + "av1-1-b8-01-size-224x224.ivf", + "av1-1-b8-01-size-224x226.ivf", + "av1-1-b8-01-size-226x196.ivf", + "av1-1-b8-01-size-226x198.ivf", + "av1-1-b8-01-size-226x200.ivf", + 
"av1-1-b8-01-size-226x202.ivf", + "av1-1-b8-01-size-226x208.ivf", + "av1-1-b8-01-size-226x210.ivf", + "av1-1-b8-01-size-226x224.ivf", + "av1-1-b8-01-size-226x226.ivf", + "av1-1-b8-01-size-32x16.ivf", + "av1-1-b8-01-size-32x18.ivf", + "av1-1-b8-01-size-32x32.ivf", + "av1-1-b8-01-size-32x34.ivf", + "av1-1-b8-01-size-32x64.ivf", + "av1-1-b8-01-size-32x66.ivf", + "av1-1-b8-01-size-34x16.ivf", + "av1-1-b8-01-size-34x18.ivf", + "av1-1-b8-01-size-34x32.ivf", + "av1-1-b8-01-size-34x34.ivf", + "av1-1-b8-01-size-34x64.ivf", + "av1-1-b8-01-size-34x66.ivf", + "av1-1-b8-01-size-64x16.ivf", + "av1-1-b8-01-size-64x18.ivf", + "av1-1-b8-01-size-64x32.ivf", + "av1-1-b8-01-size-64x34.ivf", + "av1-1-b8-01-size-64x64.ivf", + "av1-1-b8-01-size-64x66.ivf", + "av1-1-b8-01-size-66x16.ivf", + "av1-1-b8-01-size-66x18.ivf", + "av1-1-b8-01-size-66x32.ivf", + "av1-1-b8-01-size-66x34.ivf", + "av1-1-b8-01-size-66x64.ivf", + "av1-1-b8-01-size-66x66.ivf", + "av1-1-b8-02-allintra.ivf", + "av1-1-b8-03-sizedown.mkv", + "av1-1-b8-03-sizeup.mkv", + "av1-1-b8-04-cdfupdate.ivf", + "av1-1-b8-05-mv.ivf", + "av1-1-b8-06-mfmv.ivf", + "av1-1-b8-22-svc-L1T2.ivf", + "av1-1-b8-22-svc-L2T1.ivf", + "av1-1-b8-22-svc-L2T2.ivf", + "av1-1-b8-23-film_grain-50.ivf" }; +const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors); +#endif // CONFIG_AV1_DECODER + +} // namespace libaom_test diff --git a/libs/libaom/src/test/test_vectors.h b/libs/libaom/src/test/test_vectors.h new file mode 100644 index 000000000..be37f6e37 --- /dev/null +++ b/libs/libaom/src/test/test_vectors.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TEST_TEST_VECTORS_H_ +#define AOM_TEST_TEST_VECTORS_H_ + +#include "config/aom_config.h" + +namespace libaom_test { + +#if CONFIG_AV1_DECODER +extern const int kNumAV1TestVectors; +extern const char *const kAV1TestVectors[]; +#endif + +} // namespace libaom_test + +#endif // AOM_TEST_TEST_VECTORS_H_ diff --git a/libs/libaom/src/test/tile_independence_test.cc b/libs/libaom/src/test/tile_independence_test.cc new file mode 100644 index 000000000..4f7c4a475 --- /dev/null +++ b/libs/libaom/src/test/tile_independence_test.cc @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/md5_helper.h"
+#include "aom_mem/aom_mem.h"
+
+namespace {
+class TileIndependenceTest
+    : public ::libaom_test::CodecTestWith3Params<int, int, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  TileIndependenceTest()
+      : EncoderTest(GET_PARAM(0)), md5_fw_order_(), md5_inv_order_(),
+        n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)),
+        n_tile_groups_(GET_PARAM(3)) {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+    aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+    cfg.w = 704;
+    cfg.h = 576;
+    cfg.threads = 1;
+    cfg.allow_lowbitdepth = 1;
+    fw_dec_ = codec_->CreateDecoder(cfg, 0);
+    inv_dec_ = codec_->CreateDecoder(cfg, 0);
+    inv_dec_->Control(AV1_INVERT_TILE_DECODE_ORDER, 1);
+
+    if (fw_dec_->IsAV1() && inv_dec_->IsAV1()) {
+      fw_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      fw_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+      inv_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+      inv_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+    }
+  }
+
+  virtual ~TileIndependenceTest() {
+    delete fw_dec_;
+    delete inv_dec_;
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(libaom_test::kTwoPassGood);
+  }
+
+  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                                  libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
+      encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
+      SetCpuUsed(encoder);
+    } else if (video->frame() == 3) {
+      encoder->Control(AV1E_SET_NUM_TG, n_tile_groups_);
+    }
+  }
+
+  virtual void SetCpuUsed(libaom_test::Encoder *encoder) {
+    static const int kCpuUsed = 3;
+    encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+  }
+
+  void UpdateMD5(::libaom_test::Decoder *dec, const aom_codec_cx_pkt_t *pkt,
+                 ::libaom_test::MD5 *md5) {
+    const aom_codec_err_t res = dec->DecodeFrame(
+        reinterpret_cast<const uint8_t *>(pkt->data.frame.buf),
+        pkt->data.frame.sz);
+    if (res != AOM_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(AOM_CODEC_OK, res);
+    }
+    const aom_image_t *img = dec->GetDxData().Next();
+    md5->Add(img);
+  }
+
+  virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+    UpdateMD5(fw_dec_, pkt, &md5_fw_order_);
+    UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
+  }
+
+  void DoTest() {
+    const aom_rational timebase = { 33333333, 1000000000 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_target_bitrate = 500;
+    cfg_.g_lag_in_frames = 12;
+    cfg_.rc_end_usage = AOM_VBR;
+
+    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576,
+                                       timebase.den, timebase.num, 0, 5);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+    const char *md5_fw_str = md5_fw_order_.Get();
+    const char *md5_inv_str = md5_inv_order_.Get();
+    ASSERT_STREQ(md5_fw_str, md5_inv_str);
+  }
+
+  ::libaom_test::MD5 md5_fw_order_, md5_inv_order_;
+  ::libaom_test::Decoder *fw_dec_, *inv_dec_;
+
+ private:
+  int n_tile_cols_;
+  int n_tile_rows_;
+  int n_tile_groups_;
+};
+
+// Run an encode with 2 or 4 tiles, and do the decode both in normal and
+// inverted tile ordering. Ensure that the MD5 of the output in both cases
+// is identical. If so, tiles are considered independent and the test passes.
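// Editorial note, not in the original source: the 0/1 values used to
// instantiate these tests below are log2 tile counts passed to
// AV1E_SET_TILE_COLUMNS and AV1E_SET_TILE_ROWS, so the encoder runs with
// 1 or 2 tile rows and columns, i.e. up to 4 tiles, as the comment above says.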
+TEST_P(TileIndependenceTest, MD5Match) { + cfg_.large_scale_tile = 0; + fw_dec_->Control(AV1_SET_TILE_MODE, 0); + inv_dec_->Control(AV1_SET_TILE_MODE, 0); + DoTest(); +} + +class TileIndependenceTestLarge : public TileIndependenceTest { + virtual void SetCpuUsed(libaom_test::Encoder *encoder) { + static const int kCpuUsed = 0; + encoder->Control(AOME_SET_CPUUSED, kCpuUsed); + } +}; + +TEST_P(TileIndependenceTestLarge, MD5Match) { + cfg_.large_scale_tile = 0; + fw_dec_->Control(AV1_SET_TILE_MODE, 0); + inv_dec_->Control(AV1_SET_TILE_MODE, 0); + DoTest(); +} + +AV1_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(0, 1), + ::testing::Values(0, 1), ::testing::Values(1, 2, 4)); +AV1_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge, ::testing::Values(0, 1), + ::testing::Values(0, 1), ::testing::Values(1, 2, 4)); + +class TileIndependenceLSTest : public TileIndependenceTest {}; + +TEST_P(TileIndependenceLSTest, MD5Match) { + cfg_.large_scale_tile = 1; + fw_dec_->Control(AV1_SET_TILE_MODE, 1); + fw_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); + inv_dec_->Control(AV1_SET_TILE_MODE, 1); + inv_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); + DoTest(); +} + +class TileIndependenceLSTestLarge : public TileIndependenceTestLarge {}; + +TEST_P(TileIndependenceLSTestLarge, MD5Match) { + cfg_.large_scale_tile = 1; + fw_dec_->Control(AV1_SET_TILE_MODE, 1); + fw_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); + inv_dec_->Control(AV1_SET_TILE_MODE, 1); + inv_dec_->Control(AV1D_EXT_TILE_DEBUG, 1); + DoTest(); +} + +AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTest, ::testing::Values(6), + ::testing::Values(6), ::testing::Values(1)); +AV1_INSTANTIATE_TEST_CASE(TileIndependenceLSTestLarge, ::testing::Values(6), + ::testing::Values(6), ::testing::Values(1)); +} // namespace diff --git a/libs/libaom/src/test/time_stamp_test.cc b/libs/libaom/src/test/time_stamp_test.cc new file mode 100644 index 000000000..679e4da29 --- /dev/null +++ b/libs/libaom/src/test/time_stamp_test.cc @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// Test AOM timestamp handling + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/video_source.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace { + +const int kVideoSourceWidth = 320; +const int kVideoSourceHeight = 240; +const int kFramesToEncode = 3; + +// A video source that exposes functions to set the timebase, framerate and +// starting pts. +class DummyTimebaseVideoSource : public ::libaom_test::DummyVideoSource { + public: + // Parameters num and den set the timebase for the video source. 
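+ // For example, DummyTimebaseVideoSource(1, 1000) (a millisecond timebase) + // with the default 30 fps framerate gives FrameDuration() = + // (1000 / 1) / (30 / 1) = 33.33 timebase units, so pts() returns + // 0, 33, 67, 100, ... for successive frames after rounding.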
+ DummyTimebaseVideoSource(int num, int den) + : framerate_numerator_(30), framerate_denominator_(1), starting_pts_(0) { + SetSize(kVideoSourceWidth, kVideoSourceHeight); + set_limit(kFramesToEncode); + timebase_.num = num; + timebase_.den = den; + } + + void SetFramerate(int numerator, int denominator) { + framerate_numerator_ = numerator; + framerate_denominator_ = denominator; + } + + // Returns one frame's duration in timebase units as a double. + double FrameDuration() const { + return (static_cast<double>(timebase_.den) / timebase_.num) / + (static_cast<double>(framerate_numerator_) / framerate_denominator_); + } + + virtual aom_codec_pts_t pts() const { + return static_cast<aom_codec_pts_t>(frame_ * FrameDuration() + + starting_pts_ + 0.5); + } + + virtual unsigned long duration() const { + return static_cast<unsigned long>(FrameDuration() + 0.5); + } + + virtual aom_rational_t timebase() const { return timebase_; } + + void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; } + + private: + aom_rational_t timebase_; + int framerate_numerator_; + int framerate_denominator_; + int64_t starting_pts_; +}; + +class TimestampTest + : public ::libaom_test::EncoderTest, + public ::libaom_test::CodecTestWithParam<libaom_test::TestMode> { + protected: + TimestampTest() : EncoderTest(GET_PARAM(0)) {} + virtual ~TimestampTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + } +}; + +// Tests encoding in millisecond timebase. +TEST_P(TimestampTest, EncodeFrames) { + DummyTimebaseVideoSource video(1, 1000); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(TimestampTest, TestMicrosecondTimebase) { + // Set the timebase to microseconds. + DummyTimebaseVideoSource video(1, 1000000); + video.set_limit(1); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +TEST_P(TimestampTest, TestAv1Rollover) { + DummyTimebaseVideoSource video(1, 1000); + video.set_starting_pts(922337170351ll); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +AV1_INSTANTIATE_TEST_CASE(TimestampTest, + ::testing::Values(::libaom_test::kTwoPassGood)); + +} // namespace diff --git a/libs/libaom/src/test/tools_common.sh b/libs/libaom/src/test/tools_common.sh new file mode 100644 index 000000000..c08710606 --- /dev/null +++ b/libs/libaom/src/test/tools_common.sh @@ -0,0 +1,477 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file contains shell code shared by test scripts for libaom tools. + +# Use $AOM_TEST_TOOLS_COMMON_SH as a pseudo include guard. +if [ -z "${AOM_TEST_TOOLS_COMMON_SH}" ]; then +AOM_TEST_TOOLS_COMMON_SH=included + +set -e +devnull='> /dev/null 2>&1' +AOM_TEST_PREFIX="" + +elog() { + echo "$@" 1>&2 +} + +vlog() { + if [ "${AOM_TEST_VERBOSE_OUTPUT}" = "yes" ]; then + echo "$@" + fi +} + +# Sets $AOM_TOOL_TEST to the name specified by positional parameter one. +test_begin() { + AOM_TOOL_TEST="${1}" +} + +# Clears the AOM_TOOL_TEST variable after confirming that $AOM_TOOL_TEST matches +# positional parameter one. +test_end() { + if [ "$1" != "${AOM_TOOL_TEST}" ]; then + echo "FAIL completed test mismatch!"
+ echo " completed test: ${1}" + echo " active test: ${AOM_TOOL_TEST}." + return 1 + fi + AOM_TOOL_TEST='' +} + +# Echoes the target configuration being tested. +test_configuration_target() { + aom_config_c="${LIBAOM_CONFIG_PATH}/config/aom_config.c" + # Clean up the cfg pointer line from aom_config.c for easier re-use by + # someone examining a failure in the example tests. + # 1. Run grep on aom_config.c for cfg and limit the results to 1. + # 2. Split the line using ' = ' as separator. + # 3. Abuse sed to consume the leading " and trailing "; from the assignment + # to the cfg pointer. + cmake_config=$(awk -F ' = ' '/cfg/ { print $NF; exit }' "${aom_config_c}" \ + | sed -e s/\"// -e s/\"\;//) + echo cmake generated via command: cmake path/to/aom ${cmake_config} +} + +# Trap function used for failure reports and tool output directory removal. +# When the contents of $AOM_TOOL_TEST do not match the string '', reports +# failure of test stored in $AOM_TOOL_TEST. +cleanup() { + if [ -n "${AOM_TOOL_TEST}" ] && [ "${AOM_TOOL_TEST}" != '' ]; then + echo "FAIL: $AOM_TOOL_TEST" + fi + if [ "${AOM_TEST_PRESERVE_OUTPUT}" = "yes" ]; then + return + fi + if [ -n "${AOM_TEST_OUTPUT_DIR}" ] && [ -d "${AOM_TEST_OUTPUT_DIR}" ]; then + rm -rf "${AOM_TEST_OUTPUT_DIR}" + fi +} + +# Echoes the version string assigned to the VERSION_STRING_NOSP variable defined +# in $LIBAOM_CONFIG_PATH/config/aom_version.h to stdout. +cmake_version() { + aom_version_h="${LIBAOM_CONFIG_PATH}/config/aom_version.h" + + # Find VERSION_STRING_NOSP line, split it with '"' and print the next to last + # field to output the version string to stdout. + aom_version=$(awk -F \" '/VERSION_STRING_NOSP/ {print $(NF-1)}' \ + "${aom_version_h}") + echo "v${aom_version}" +} + +# Echoes current git version as reported by running 'git describe', or the +# version used by the cmake build when git is unavailable. +source_version() { + if git --version > /dev/null 2>&1; then + (cd "$(dirname "${0}")" + git describe) + else + cmake_version + fi +} + +# Echoes warnings to stdout when source version and CMake build generated +# version are out of sync. +check_version_strings() { + cmake_version=$(cmake_version) + source_version=$(source_version) + + if [ "${cmake_version}" != "${source_version}" ]; then + echo "Warning: version has changed since last cmake run." + vlog " cmake version: ${cmake_version} version now: ${source_version}" + fi +} + +# $1 is the name of an environment variable containing a directory name to +# test. +test_env_var_dir() { + local dir=$(eval echo "\${$1}") + if [ ! -d "${dir}" ]; then + elog "'${dir}': No such directory" + elog "The $1 environment variable must be set to a valid directory." + return 1 + fi +} + +# This script requires that the LIBAOM_BIN_PATH, LIBAOM_CONFIG_PATH, and +# LIBAOM_TEST_DATA_PATH variables are in the environment: Confirm that +# the variables are set and that they all evaluate to directory paths. +verify_aom_test_environment() { + test_env_var_dir "LIBAOM_BIN_PATH" \ + && test_env_var_dir "LIBAOM_CONFIG_PATH" \ + && test_env_var_dir "LIBAOM_TEST_DATA_PATH" +} + +# Greps aom_config.h in LIBAOM_CONFIG_PATH for positional parameter one, which +# should be a LIBAOM preprocessor flag. Echoes yes to stdout when the feature +# is available. 
+aom_config_option_enabled() { + aom_config_option="${1}" + aom_config_file="${LIBAOM_CONFIG_PATH}/config/aom_config.h" + config_line=$(grep "${aom_config_option}" "${aom_config_file}") + if echo "${config_line}" | egrep -q '1$'; then + echo yes + fi +} + +# Echoes yes when output of test_configuration_target() contains win32 or win64. +is_windows_target() { + if test_configuration_target \ + | grep -q -e win32 -e win64 > /dev/null 2>&1; then + echo yes + fi +} + +# Echoes path to $1 when it's executable and exists in one of the directories +# included in $tool_paths, or an empty string. Caller is responsible for testing +# the string once the function returns. +aom_tool_path() { + local tool_name="$1" + local root_path="${LIBAOM_BIN_PATH}" + local suffix="${AOM_TEST_EXE_SUFFIX}" + local tool_paths="\ + ${root_path}/${tool_name}${suffix} \ + ${root_path}/../${tool_name}${suffix} \ + ${root_path}/tools/${tool_name}${suffix} \ + ${root_path}/../tools/${tool_name}${suffix}" + + local toolpath="" + + for tool_path in ${tool_paths}; do + if [ -x "${tool_path}" ] && [ -f "${tool_path}" ]; then + echo "${tool_path}" + return 0 + fi + done + + return 1 +} + +# Echoes yes to stdout when the file named by positional parameter one exists +# in LIBAOM_BIN_PATH, and is executable. +aom_tool_available() { + local tool_name="$1" + local tool="${LIBAOM_BIN_PATH}/${tool_name}${AOM_TEST_EXE_SUFFIX}" + [ -x "${tool}" ] && echo yes +} + +# Echoes yes to stdout when aom_config_option_enabled() reports yes for +# CONFIG_AV1_DECODER. +av1_decode_available() { + [ "$(aom_config_option_enabled CONFIG_AV1_DECODER)" = "yes" ] && echo yes +} + +# Echoes yes to stdout when aom_config_option_enabled() reports yes for +# CONFIG_AV1_ENCODER. +av1_encode_available() { + [ "$(aom_config_option_enabled CONFIG_AV1_ENCODER)" = "yes" ] && echo yes +} + +# Echoes "fast" encode params for use with aomenc. +aomenc_encode_test_fast_params() { + echo "--cpu-used=1 + --limit=${AV1_ENCODE_TEST_FRAME_LIMIT} + --lag-in-frames=0 + --test-decode=fatal" +} + +# Echoes yes to stdout when aom_config_option_enabled() reports yes for +# CONFIG_WEBM_IO. +webm_io_available() { + [ "$(aom_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes +} + +# Filters strings from $1 using the filter specified by $2. Filter behavior +# depends on the presence of $3. When $3 is present, strings that match the +# filter are excluded. When $3 is omitted, strings matching the filter are +# included. +# The filtered result is echoed to stdout. +filter_strings() { + strings=${1} + filter=${2} + exclude=${3} + + if [ -n "${exclude}" ]; then + # When positional parameter three exists the caller wants to remove strings. + # Tell grep to invert matches using the -v argument. + exclude='-v' + else + unset exclude + fi + + if [ -n "${filter}" ]; then + for s in ${strings}; do + if echo "${s}" | egrep -q ${exclude} "${filter}" > /dev/null 2>&1; then + filtered_strings="${filtered_strings} ${s}" + fi + done + else + filtered_strings="${strings}" + fi + echo "${filtered_strings}" +} + +# Runs user test functions passed via positional parameters one and two. +# Functions in positional parameter one are treated as environment verification +# functions and are run unconditionally. Functions in positional parameter two +# are run according to the rules specified in aom_test_usage(). 
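+# A minimal caller, modeled on the example test scripts that source this file +# (the my_* names are illustrative only): +# my_test_verify_environment() { [ -e "${YUV_RAW_INPUT}" ] || return 1; } +# my_test() { vlog "hello from my_test"; } +# run_tests my_test_verify_environment "my_test"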
+run_tests() { + local env_tests="verify_aom_test_environment $1" + local tests_to_filter="$2" + local test_name="${AOM_TEST_NAME}" + + if [ -z "${test_name}" ]; then + test_name="$(basename "${0%.*}")" + fi + + if [ "${AOM_TEST_RUN_DISABLED_TESTS}" != "yes" ]; then + # Filter out DISABLED tests. + tests_to_filter=$(filter_strings "${tests_to_filter}" ^DISABLED exclude) + fi + + if [ -n "${AOM_TEST_FILTER}" ]; then + # Remove tests not matching the user's filter. + tests_to_filter=$(filter_strings "${tests_to_filter}" ${AOM_TEST_FILTER}) + fi + + # User requested test listing: Dump test names and return. + if [ "${AOM_TEST_LIST_TESTS}" = "yes" ]; then + for test_name in $tests_to_filter; do + echo ${test_name} + done + return + fi + + # Don't bother with the environment tests if everything else was disabled. + [ -z "${tests_to_filter}" ] && return + + # Combine environment and actual tests. + local tests_to_run="${env_tests} ${tests_to_filter}" + + check_version_strings + + # Run tests. + for test in ${tests_to_run}; do + test_begin "${test}" + vlog " RUN ${test}" + "${test}" + vlog " PASS ${test}" + test_end "${test}" + done + + local tested_config="$(test_configuration_target) @ $(source_version)" + echo "${test_name}: Done, all tests pass for ${tested_config}." +} + +aom_test_usage() { +cat << EOF + Usage: ${0##*/} [arguments] + --bin-path <path to libaom binaries directory> + --config-path <path to libaom config directory> + --filter <filter>: User test filter. Only tests matching filter are run. + --run-disabled-tests: Run disabled tests. + --help: Display this message and exit. + --test-data-path <path to libaom test data directory> + --show-program-output: Shows output from all programs being tested. + --prefix: Allows for a user specified prefix to be inserted before all test + programs. Grants the ability, for example, to run test programs + within valgrind. + --list-tests: List all test names and exit without actually running tests. + --verbose: Verbose output. + + When the --bin-path option is not specified the script attempts to use + \$LIBAOM_BIN_PATH and then the current directory. + + When the --config-path option is not specified the script attempts to use + \$LIBAOM_CONFIG_PATH and then the current directory. + + When the --test-data-path option is not specified the script attempts to use + \$LIBAOM_TEST_DATA_PATH and then the current directory. +EOF +} + +# Returns non-zero (failure) when required environment variables are empty +# strings. +aom_test_check_environment() { + if [ -z "${LIBAOM_BIN_PATH}" ] || \ + [ -z "${LIBAOM_CONFIG_PATH}" ] || \ + [ -z "${LIBAOM_TEST_DATA_PATH}" ]; then + return 1 + fi +} + +# Echoes aomenc command line parameters allowing use of a raw yuv file as +# input to aomenc. +yuv_raw_input() { + echo ""${YUV_RAW_INPUT}" + --width="${YUV_RAW_INPUT_WIDTH}" + --height="${YUV_RAW_INPUT_HEIGHT}"" +} + +# Do a small encode for testing decoders. +encode_yuv_raw_input_av1() { + if [ "$(av1_encode_available)" = "yes" ]; then + local output="$1" + local encoder="$(aom_tool_path aomenc)" + shift + eval "${encoder}" $(yuv_raw_input) \ + $(aomenc_encode_test_fast_params) \ + --output="${output}" \ + $@ \ + ${devnull} + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + fi +} + +# Parse the command line.
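+# (A typical invocation of a script built on this file might look like: +# LIBAOM_TEST_DATA_PATH=/path/to/testdata sh ./twopass_encoder.sh \ +# --bin-path /path/to/build --verbose +# with paths adjusted to the local checkout.)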
+while [ -n "$1" ]; do + case "$1" in + --bin-path) + LIBAOM_BIN_PATH="$2" + shift + ;; + --config-path) + LIBAOM_CONFIG_PATH="$2" + shift + ;; + --filter) + AOM_TEST_FILTER="$2" + shift + ;; + --run-disabled-tests) + AOM_TEST_RUN_DISABLED_TESTS=yes + ;; + --help) + aom_test_usage + exit + ;; + --test-data-path) + LIBAOM_TEST_DATA_PATH="$2" + shift + ;; + --prefix) + AOM_TEST_PREFIX="$2" + shift + ;; + --verbose) + AOM_TEST_VERBOSE_OUTPUT=yes + ;; + --show-program-output) + devnull= + ;; + --list-tests) + AOM_TEST_LIST_TESTS=yes + ;; + *) + aom_test_usage + exit 1 + ;; + esac + shift +done + +# Handle running the tests from a build directory without arguments when running +# the tests on *nix/macosx. +LIBAOM_BIN_PATH="${LIBAOM_BIN_PATH:-.}" +LIBAOM_CONFIG_PATH="${LIBAOM_CONFIG_PATH:-.}" +LIBAOM_TEST_DATA_PATH="${LIBAOM_TEST_DATA_PATH:-.}" + +# Create a temporary directory for output files, and a trap to clean it up. +if [ -n "${TMPDIR}" ]; then + AOM_TEST_TEMP_ROOT="${TMPDIR}" +elif [ -n "${TEMPDIR}" ]; then + AOM_TEST_TEMP_ROOT="${TEMPDIR}" +else + AOM_TEST_TEMP_ROOT=/tmp +fi + +AOM_TEST_OUTPUT_DIR="${AOM_TEST_OUTPUT_DIR:-${AOM_TEST_TEMP_ROOT}/aom_test_$$}" + +if ! mkdir -p "${AOM_TEST_OUTPUT_DIR}" || \ + [ ! -d "${AOM_TEST_OUTPUT_DIR}" ]; then + echo "${0##*/}: Cannot create output directory, giving up." + echo "${0##*/}: AOM_TEST_OUTPUT_DIR=${AOM_TEST_OUTPUT_DIR}" + exit 1 +fi + +AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT:-no} + +if [ "$(is_windows_target)" = "yes" ]; then + AOM_TEST_EXE_SUFFIX=".exe" +fi + +# Variables shared by tests. +AV1_ENCODE_CPU_USED=${AV1_ENCODE_CPU_USED:-1} +AV1_ENCODE_TEST_FRAME_LIMIT=${AV1_ENCODE_TEST_FRAME_LIMIT:-5} +AV1_IVF_FILE="${AV1_IVF_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.ivf}" +AV1_OBU_ANNEXB_FILE="${AV1_OBU_ANNEXB_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.annexb.obu}" +AV1_OBU_SEC5_FILE="${AV1_OBU_SEC5_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.section5.obu}" +AV1_WEBM_FILE="${AV1_WEBM_FILE:-${AOM_TEST_OUTPUT_DIR}/av1.webm}" + +YUV_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" +YUV_RAW_INPUT_WIDTH=352 +YUV_RAW_INPUT_HEIGHT=288 + +Y4M_NOSQ_PAR_INPUT="${LIBAOM_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m" +Y4M_720P_INPUT="${LIBAOM_TEST_DATA_PATH}/niklas_1280_720_30.y4m" + +# Setup a trap function to clean up after tests complete. +trap cleanup EXIT + +vlog "$(basename "${0%.*}") test configuration: + LIBAOM_BIN_PATH=${LIBAOM_BIN_PATH} + LIBAOM_CONFIG_PATH=${LIBAOM_CONFIG_PATH} + LIBAOM_TEST_DATA_PATH=${LIBAOM_TEST_DATA_PATH} + AOM_TEST_EXE_SUFFIX=${AOM_TEST_EXE_SUFFIX} + AOM_TEST_FILTER=${AOM_TEST_FILTER} + AOM_TEST_LIST_TESTS=${AOM_TEST_LIST_TESTS} + AOM_TEST_OUTPUT_DIR=${AOM_TEST_OUTPUT_DIR} + AOM_TEST_PREFIX=${AOM_TEST_PREFIX} + AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT} + AOM_TEST_RUN_DISABLED_TESTS=${AOM_TEST_RUN_DISABLED_TESTS} + AOM_TEST_SHOW_PROGRAM_OUTPUT=${AOM_TEST_SHOW_PROGRAM_OUTPUT} + AOM_TEST_TEMP_ROOT=${AOM_TEST_TEMP_ROOT} + AOM_TEST_VERBOSE_OUTPUT=${AOM_TEST_VERBOSE_OUTPUT} + AV1_ENCODE_CPU_USED=${AV1_ENCODE_CPU_USED} + AV1_ENCODE_TEST_FRAME_LIMIT=${AV1_ENCODE_TEST_FRAME_LIMIT} + AV1_IVF_FILE=${AV1_IVF_FILE} + AV1_OBU_ANNEXB_FILE=${AV1_OBU_ANNEXB_FILE} + AV1_OBU_SEC5_FILE=${AV1_OBU_SEC5_FILE} + AV1_WEBM_FILE=${AV1_WEBM_FILE} + YUV_RAW_INPUT=${YUV_RAW_INPUT} + YUV_RAW_INPUT_WIDTH=${YUV_RAW_INPUT_WIDTH} + YUV_RAW_INPUT_HEIGHT=${YUV_RAW_INPUT_HEIGHT} + Y4M_NOSQ_PAR_INPUT=${Y4M_NOSQ_PAR_INPUT}" + +fi # End $AOM_TEST_TOOLS_COMMON_SH pseudo include guard. 
diff --git a/libs/libaom/src/test/transform_test_base.h b/libs/libaom/src/test/transform_test_base.h new file mode 100644 index 000000000..68f5cc74d --- /dev/null +++ b/libs/libaom/src/test/transform_test_base.h @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TEST_TRANSFORM_TEST_BASE_H_ +#define AOM_TEST_TRANSFORM_TEST_BASE_H_ + +#include "config/aom_config.h" + +#include "aom_mem/aom_mem.h" +#include "aom/aom_codec.h" +#include "aom_dsp/txfm_common.h" + +namespace libaom_test { + +// Note: +// The same constant is defined in av1/common/av1_entropy.h and +// av1/common/entropy.h. The goal is to keep this base class usable for +// transform testing in future codecs, but including either of those headers +// would cause a compile error when unit testing another codec, so the value +// is duplicated here. It should eventually move to a shared aom header file. +const int kDctMaxValue = 16384; + +template <typename OutputType> +using FhtFunc = void (*)(const int16_t *in, OutputType *out, int stride, + TxfmParam *txfm_param); + +template <typename OutputType> +using IhtFunc = void (*)(const tran_low_t *in, uint8_t *out, int stride, + const TxfmParam *txfm_param); + +template <typename OutType> +class TransformTestBase { + public: + virtual ~TransformTestBase() {} + + protected: + virtual void RunFwdTxfm(const int16_t *in, OutType *out, int stride) = 0; + + virtual void RunInvTxfm(const OutType *out, uint8_t *dst, int stride) = 0; + + void RunAccuracyCheck(uint32_t ref_max_error, double ref_avg_error) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + uint32_t max_error = 0; + int64_t total_error = 0; + const int count_test_block = 10000; + + int16_t *test_input_block = reinterpret_cast<int16_t *>( + aom_memalign(16, sizeof(int16_t) * num_coeffs_)); + OutType *test_temp_block = reinterpret_cast<OutType *>( + aom_memalign(16, sizeof(test_temp_block[0]) * num_coeffs_)); + uint8_t *dst = reinterpret_cast<uint8_t *>( + aom_memalign(16, sizeof(uint8_t) * num_coeffs_)); + uint8_t *src = reinterpret_cast<uint8_t *>( + aom_memalign(16, sizeof(uint8_t) * num_coeffs_)); + uint16_t *dst16 = reinterpret_cast<uint16_t *>( + aom_memalign(16, sizeof(uint16_t) * num_coeffs_)); + uint16_t *src16 = reinterpret_cast<uint16_t *>( + aom_memalign(16, sizeof(uint16_t) * num_coeffs_)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-255, 255]. + for (int j = 0; j < num_coeffs_; ++j) { + if (bit_depth_ == AOM_BITS_8) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + test_input_block[j] = src[j] - dst[j]; + } else { + src16[j] = rnd.Rand16() & mask_; + dst16[j] = rnd.Rand16() & mask_; + test_input_block[j] = src16[j] - dst16[j]; + } + } + + ASM_REGISTER_STATE_CHECK( + RunFwdTxfm(test_input_block, test_temp_block, pitch_)); + if (bit_depth_ == AOM_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_)); + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_)); + } + + for (int j = 0; j < num_coeffs_; ++j) { + const int diff = + bit_depth_ == AOM_BITS_8 ?
dst[j] - src[j] : dst16[j] - src16[j]; + const uint32_t error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + + double avg_error = total_error * 1. / count_test_block / num_coeffs_; + + EXPECT_GE(ref_max_error, max_error) + << "Error: FHT/IHT has an individual round trip error > " + << ref_max_error; + + EXPECT_GE(ref_avg_error, avg_error) + << "Error: FHT/IHT has average round trip error > " << ref_avg_error + << " per block"; + + aom_free(test_input_block); + aom_free(test_temp_block); + aom_free(dst); + aom_free(src); + aom_free(dst16); + aom_free(src16); + } + + void RunCoeffCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + + // Use a stride value which is not the width of any transform, to catch + // cases where the transforms use the stride incorrectly. + int stride = 96; + + int16_t *input_block = reinterpret_cast( + aom_memalign(16, sizeof(int16_t) * stride * height_)); + OutType *output_ref_block = reinterpret_cast( + aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_)); + OutType *output_block = reinterpret_cast( + aom_memalign(16, sizeof(output_block[0]) * num_coeffs_)); + + for (int i = 0; i < count_test_block; ++i) { + int j, k; + for (j = 0; j < height_; ++j) { + for (k = 0; k < pitch_; ++k) { + int in_idx = j * stride + k; + int out_idx = j * pitch_ + k; + input_block[in_idx] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); + if (bit_depth_ == AOM_BITS_8) { + output_block[out_idx] = output_ref_block[out_idx] = rnd.Rand8(); + } else { + output_block[out_idx] = output_ref_block[out_idx] = + rnd.Rand16() & mask_; + } + } + } + + fwd_txfm_ref(input_block, output_ref_block, stride, &txfm_param_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, stride)); + + // The minimum quant value is 4. + for (j = 0; j < height_; ++j) { + for (k = 0; k < pitch_; ++k) { + int out_idx = j * pitch_ + k; + ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx]) + << "Error: not bit-exact result at index: " << out_idx + << " at test block: " << i; + } + } + } + aom_free(input_block); + aom_free(output_ref_block); + aom_free(output_block); + } + + void RunInvCoeffCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + + // Use a stride value which is not the width of any transform, to catch + // cases where the transforms use the stride incorrectly. + int stride = 96; + + int16_t *input_block = reinterpret_cast( + aom_memalign(16, sizeof(int16_t) * num_coeffs_)); + OutType *trans_block = reinterpret_cast( + aom_memalign(16, sizeof(trans_block[0]) * num_coeffs_)); + uint8_t *output_block = reinterpret_cast( + aom_memalign(16, sizeof(uint8_t) * stride * height_)); + uint8_t *output_ref_block = reinterpret_cast( + aom_memalign(16, sizeof(uint8_t) * stride * height_)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. 
+ int j, k; + for (j = 0; j < height_; ++j) { + for (k = 0; k < pitch_; ++k) { + int in_idx = j * pitch_ + k; + int out_idx = j * stride + k; + input_block[in_idx] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); + output_ref_block[out_idx] = rnd.Rand16() & mask_; + output_block[out_idx] = output_ref_block[out_idx]; + } + } + + fwd_txfm_ref(input_block, trans_block, pitch_, &txfm_param_); + + inv_txfm_ref(trans_block, output_ref_block, stride, &txfm_param_); + ASM_REGISTER_STATE_CHECK(RunInvTxfm(trans_block, output_block, stride)); + + for (j = 0; j < height_; ++j) { + for (k = 0; k < pitch_; ++k) { + int out_idx = j * stride + k; + ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx]) + << "Error: not bit-exact result at index: " << out_idx + << " j = " << j << " k = " << k << " at test block: " << i; + } + } + } + aom_free(input_block); + aom_free(trans_block); + aom_free(output_ref_block); + aom_free(output_block); + } + + void RunMemCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + + int16_t *input_extreme_block = reinterpret_cast( + aom_memalign(16, sizeof(int16_t) * num_coeffs_)); + OutType *output_ref_block = reinterpret_cast( + aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_)); + OutType *output_block = reinterpret_cast( + aom_memalign(16, sizeof(output_block[0]) * num_coeffs_)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. + for (int j = 0; j < num_coeffs_; ++j) { + input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_; + } + if (i == 0) { + for (int j = 0; j < num_coeffs_; ++j) input_extreme_block[j] = mask_; + } else if (i == 1) { + for (int j = 0; j < num_coeffs_; ++j) input_extreme_block[j] = -mask_; + } + + fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, &txfm_param_); + ASM_REGISTER_STATE_CHECK( + RunFwdTxfm(input_extreme_block, output_block, pitch_)); + + int row_length = FindRowLength(); + // The minimum quant value is 4. + for (int j = 0; j < num_coeffs_; ++j) { + ASSERT_EQ(output_block[j], output_ref_block[j]) + << "Not bit-exact at test index: " << i << ", " + << "j = " << j << std::endl; + EXPECT_GE(row_length * kDctMaxValue << (bit_depth_ - 8), + abs(output_block[j])) + << "Error: NxN FDCT has coefficient larger than N*DCT_MAX_VALUE"; + } + } + aom_free(input_extreme_block); + aom_free(output_ref_block); + aom_free(output_block); + } + + void RunInvAccuracyCheck(int limit) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + + int16_t *in = reinterpret_cast( + aom_memalign(16, sizeof(int16_t) * num_coeffs_)); + OutType *coeff = reinterpret_cast( + aom_memalign(16, sizeof(coeff[0]) * num_coeffs_)); + uint8_t *dst = reinterpret_cast( + aom_memalign(16, sizeof(uint8_t) * num_coeffs_)); + uint8_t *src = reinterpret_cast( + aom_memalign(16, sizeof(uint8_t) * num_coeffs_)); + + uint16_t *dst16 = reinterpret_cast( + aom_memalign(16, sizeof(uint16_t) * num_coeffs_)); + uint16_t *src16 = reinterpret_cast( + aom_memalign(16, sizeof(uint16_t) * num_coeffs_)); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-mask_, mask_]. 
+ for (int j = 0; j < num_coeffs_; ++j) { + if (bit_depth_ == AOM_BITS_8) { + src[j] = rnd.Rand8(); + dst[j] = rnd.Rand8(); + in[j] = src[j] - dst[j]; + } else { + src16[j] = rnd.Rand16() & mask_; + dst16[j] = rnd.Rand16() & mask_; + in[j] = src16[j] - dst16[j]; + } + } + + fwd_txfm_ref(in, coeff, pitch_, &txfm_param_); + + if (bit_depth_ == AOM_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_)); + } + + for (int j = 0; j < num_coeffs_; ++j) { + const int diff = + bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; + const uint32_t error = diff * diff; + ASSERT_GE(static_cast(limit), error) + << "Error: 4x4 IDCT has error " << error << " at index " << j; + } + } + aom_free(in); + aom_free(coeff); + aom_free(dst); + aom_free(src); + aom_free(src16); + aom_free(dst16); + } + + int pitch_; + int height_; + FhtFunc fwd_txfm_ref; + IhtFunc inv_txfm_ref; + aom_bit_depth_t bit_depth_; + int mask_; + int num_coeffs_; + TxfmParam txfm_param_; + + private: + // Assume transform size is 4x4, 8x8, 16x16,... + int FindRowLength() const { + int row = 4; + if (16 == num_coeffs_) { + row = 4; + } else if (64 == num_coeffs_) { + row = 8; + } else if (256 == num_coeffs_) { + row = 16; + } else if (1024 == num_coeffs_) { + row = 32; + } + return row; + } +}; + +} // namespace libaom_test + +#endif // AOM_TEST_TRANSFORM_TEST_BASE_H_ diff --git a/libs/libaom/src/test/twopass_encoder.sh b/libs/libaom/src/test/twopass_encoder.sh new file mode 100644 index 000000000..cca44ced8 --- /dev/null +++ b/libs/libaom/src/test/twopass_encoder.sh @@ -0,0 +1,54 @@ +#!/bin/sh +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +## This file tests the libaom twopass_encoder example. To add new tests to this +## file, do the following: +## 1. Write a shell function (this is your test). +## 2. Add the function to twopass_encoder_tests (on a new line). +## +. $(dirname $0)/tools_common.sh + +# Environment check: $YUV_RAW_INPUT is required. +twopass_encoder_verify_environment() { + if [ ! -e "${YUV_RAW_INPUT}" ]; then + echo "Libaom test data must exist in LIBAOM_TEST_DATA_PATH." + return 1 + fi +} + +# Runs twopass_encoder using the codec specified by $1 with a frame limit of +# 100. +twopass_encoder() { + local encoder="$(aom_tool_path twopass_encoder)" + local codec="$1" + local output_file="${AOM_TEST_OUTPUT_DIR}/twopass_encoder_${codec}.ivf" + local limit=7 + + if [ ! -x "${encoder}" ]; then + elog "${encoder} does not exist or is not executable." 
+ return 1 + fi + + eval "${AOM_TEST_PREFIX}" "${encoder}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${YUV_RAW_INPUT}" "${output_file}" "${limit}" \ + ${devnull} + + [ -e "${output_file}" ] || return 1 +} + +twopass_encoder_av1() { + if [ "$(av1_encode_available)" = "yes" ]; then + twopass_encoder av1 || return 1 + fi +} + +twopass_encoder_tests="twopass_encoder_av1" + +run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" diff --git a/libs/libaom/src/test/util.h b/libs/libaom/src/test/util.h new file mode 100644 index 000000000..aa4b106e4 --- /dev/null +++ b/libs/libaom/src/test/util.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_TEST_UTIL_H_ +#define AOM_TEST_UTIL_H_ + +#include <stdio.h> +#include <math.h> +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "aom/aom_integer.h" +#include "aom/aom_image.h" +#include "aom_ports/aom_timer.h" + +// Macros +#define GET_PARAM(k) std::get<k>(GetParam()) + +inline double compute_psnr(const aom_image_t *img1, const aom_image_t *img2) { + assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) && + (img1->d_h == img2->d_h)); + + const unsigned int width_y = img1->d_w; + const unsigned int height_y = img1->d_h; + unsigned int i, j; + + int64_t sqrerr = 0; + for (i = 0; i < height_y; ++i) + for (j = 0; j < width_y; ++j) { + int64_t d = img1->planes[AOM_PLANE_Y][i * img1->stride[AOM_PLANE_Y] + j] - + img2->planes[AOM_PLANE_Y][i * img2->stride[AOM_PLANE_Y] + j]; + sqrerr += d * d; + } + double mse = static_cast<double>(sqrerr) / (width_y * height_y); + double psnr = 100.0; + if (mse > 0.0) { + psnr = 10 * log10(255.0 * 255.0 / mse); + } + return psnr; +} + +static INLINE double get_time_mark(aom_usec_timer *t) { + aom_usec_timer_mark(t); + return static_cast<double>(aom_usec_timer_elapsed(t)); +} + +#endif // AOM_TEST_UTIL_H_ diff --git a/libs/libaom/src/test/variance_test.cc b/libs/libaom/src/test/variance_test.cc new file mode 100644 index 000000000..1458ece28 --- /dev/null +++ b/libs/libaom/src/test/variance_test.cc @@ -0,0 +1,2410 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ + +#include +#include +#include +#include + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "aom/aom_codec.h" +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" + +namespace { + +typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse); +typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + unsigned int *sse); +typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + uint32_t *sse, + const uint8_t *second_pred); +typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); +typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); +typedef unsigned int (*DistWtdSubpixAvgVarMxNFunc)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, uint32_t *sse, const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param); +typedef uint32_t (*ObmcSubpelVarFunc)(const uint8_t *pre, int pre_stride, + int xoffset, int yoffset, + const int32_t *wsrc, const int32_t *mask, + unsigned int *sse); + +using libaom_test::ACMRandom; + +// Truncate high bit depth results by downshifting (with rounding) by: +// 2 * (bit_depth - 8) for sse +// (bit_depth - 8) for se +static void RoundHighBitDepth(int bit_depth, int64_t *se, uint64_t *sse) { + switch (bit_depth) { + case AOM_BITS_12: + *sse = (*sse + 128) >> 8; + *se = (*se + 8) >> 4; + break; + case AOM_BITS_10: + *sse = (*sse + 8) >> 4; + *se = (*se + 2) >> 2; + break; + case AOM_BITS_8: + default: break; + } +} + +static unsigned int mb_ss_ref(const int16_t *src) { + unsigned int res = 0; + for (int i = 0; i < 256; ++i) { + res += src[i] * src[i]; + } + return res; +} + +/* Note: + * Our codebase calculates the "diff" value in the variance algorithm by + * (src - ref). + */ +static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w, + int l2h, int src_stride, int ref_stride, + uint32_t *sse_ptr, bool use_high_bit_depth_, + aom_bit_depth_t bit_depth) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + int diff; + if (!use_high_bit_depth_) { + diff = src[y * src_stride + x] - ref[y * ref_stride + x]; + se += diff; + sse += diff * diff; + } else { + diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] - + CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x]; + se += diff; + sse += diff * diff; + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast(sse - ((se * se) >> (l2w + l2h))); +} + +/* The subpel reference functions differ from the codec version in one aspect: + * they calculate the bilinear factors directly instead of using a lookup table + * and therefore upshift xoff and yoff by 1. Only every other calculated value + * is used so the codec version shrinks the table to save space. 
+ */ +static uint32_t subpel_variance_ref(const uint8_t *ref, const uint8_t *src, + int l2w, int l2h, int xoff, int yoff, + uint32_t *sse_ptr, bool use_high_bit_depth_, + aom_bit_depth_t bit_depth) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + + xoff <<= 1; + yoff <<= 1; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // Bilinear interpolation at a 16th pel step. + if (!use_high_bit_depth_) { + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = r - src[w * y + x]; + se += diff; + sse += diff * diff; + } else { + uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); + uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const int a1 = ref16[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref16[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref16[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref16[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = r - src16[w * y + x]; + se += diff; + sse += diff * diff; + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast(sse - ((se * se) >> (l2w + l2h))); +} + +static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src, + const uint8_t *second_pred, int l2w, + int l2h, int xoff, int yoff, + uint32_t *sse_ptr, + bool use_high_bit_depth, + aom_bit_depth_t bit_depth) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + + xoff <<= 1; + yoff <<= 1; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + if (!use_high_bit_depth) { + const int a1 = ref[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = + ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x]; + se += diff; + sse += diff * diff; + } else { + const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred); + const int a1 = ref16[(w + 1) * (y + 0) + x + 0]; + const int a2 = ref16[(w + 1) * (y + 0) + x + 1]; + const int b1 = ref16[(w + 1) * (y + 1) + x + 0]; + const int b2 = ref16[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x]; + se += diff; + sse += diff * diff; + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast(sse - ((se * se) >> (l2w + l2h))); +} + +static uint32_t dist_wtd_subpel_avg_variance_ref( + const uint8_t *ref, const uint8_t *src, const uint8_t *second_pred, int l2w, + int l2h, int xoff, int yoff, uint32_t *sse_ptr, bool use_high_bit_depth, + aom_bit_depth_t bit_depth, 
DIST_WTD_COMP_PARAMS *jcp_param) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + + xoff <<= 1; + yoff <<= 1; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // bilinear interpolation at a 16th pel step + if (!use_high_bit_depth) { + const int a1 = ref[(w + 0) * (y + 0) + x + 0]; + const int a2 = ref[(w + 0) * (y + 0) + x + 1]; + const int b1 = ref[(w + 0) * (y + 1) + x + 0]; + const int b2 = ref[(w + 0) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int avg = ROUND_POWER_OF_TWO( + r * jcp_param->fwd_offset + + second_pred[w * y + x] * jcp_param->bck_offset, + DIST_PRECISION_BITS); + const int diff = avg - src[w * y + x]; + + se += diff; + sse += diff * diff; + } else { + const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref); + const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred); + const int a1 = ref16[(w + 0) * (y + 0) + x + 0]; + const int a2 = ref16[(w + 0) * (y + 0) + x + 1]; + const int b1 = ref16[(w + 0) * (y + 1) + x + 0]; + const int b2 = ref16[(w + 0) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int avg = + ROUND_POWER_OF_TWO(r * jcp_param->fwd_offset + + sec16[w * y + x] * jcp_param->bck_offset, + DIST_PRECISION_BITS); + const int diff = avg - src16[w * y + x]; + + se += diff; + sse += diff * diff; + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast(sse - ((se * se) >> (l2w + l2h))); +} + +static uint32_t obmc_subpel_variance_ref(const uint8_t *pre, int l2w, int l2h, + int xoff, int yoff, + const int32_t *wsrc, + const int32_t *mask, uint32_t *sse_ptr, + bool use_high_bit_depth_, + aom_bit_depth_t bit_depth) { + int64_t se = 0; + uint64_t sse = 0; + const int w = 1 << l2w; + const int h = 1 << l2h; + + xoff <<= 1; + yoff <<= 1; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + // Bilinear interpolation at a 16th pel step. 
+ if (!use_high_bit_depth_) { + const int a1 = pre[(w + 1) * (y + 0) + x + 0]; + const int a2 = pre[(w + 1) * (y + 0) + x + 1]; + const int b1 = pre[(w + 1) * (y + 1) + x + 0]; + const int b2 = pre[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = ROUND_POWER_OF_TWO_SIGNED( + wsrc[w * y + x] - r * mask[w * y + x], 12); + se += diff; + sse += diff * diff; + } else { + uint16_t *pre16 = CONVERT_TO_SHORTPTR(pre); + const int a1 = pre16[(w + 1) * (y + 0) + x + 0]; + const int a2 = pre16[(w + 1) * (y + 0) + x + 1]; + const int b1 = pre16[(w + 1) * (y + 1) + x + 0]; + const int b2 = pre16[(w + 1) * (y + 1) + x + 1]; + const int a = a1 + (((a2 - a1) * xoff + 8) >> 4); + const int b = b1 + (((b2 - b1) * xoff + 8) >> 4); + const int r = a + (((b - a) * yoff + 8) >> 4); + const int diff = ROUND_POWER_OF_TWO_SIGNED( + wsrc[w * y + x] - r * mask[w * y + x], 12); + se += diff; + sse += diff * diff; + } + } + } + RoundHighBitDepth(bit_depth, &se, &sse); + *sse_ptr = static_cast(sse); + return static_cast(sse - ((se * se) >> (l2w + l2h))); +} + +//////////////////////////////////////////////////////////////////////////////// + +class SumOfSquaresTest : public ::testing::TestWithParam { + public: + SumOfSquaresTest() : func_(GetParam()) {} + + virtual ~SumOfSquaresTest() { libaom_test::ClearSystemState(); } + + protected: + void ConstTest(); + void RefTest(); + + SumOfSquaresFunction func_; + ACMRandom rnd_; +}; + +void SumOfSquaresTest::ConstTest() { + int16_t mem[256]; + unsigned int res; + for (int v = 0; v < 256; ++v) { + for (int i = 0; i < 256; ++i) { + mem[i] = v; + } + ASM_REGISTER_STATE_CHECK(res = func_(mem)); + EXPECT_EQ(256u * (v * v), res); + } +} + +void SumOfSquaresTest::RefTest() { + int16_t mem[256]; + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 256; ++j) { + mem[j] = rnd_.Rand8() - rnd_.Rand8(); + } + + const unsigned int expected = mb_ss_ref(mem); + unsigned int res; + ASM_REGISTER_STATE_CHECK(res = func_(mem)); + EXPECT_EQ(expected, res); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Encapsulating struct to store the function to test along with +// some testing context. +// Can be used for MSE, SSE, Variance, etc. + +template +struct TestParams { + TestParams(int log2w = 0, int log2h = 0, Func function = NULL, + int bit_depth_value = 0) + : log2width(log2w), log2height(log2h), func(function) { + use_high_bit_depth = (bit_depth_value > 0); + if (use_high_bit_depth) { + bit_depth = static_cast(bit_depth_value); + } else { + bit_depth = AOM_BITS_8; + } + width = 1 << log2width; + height = 1 << log2height; + block_size = width * height; + mask = (1u << bit_depth) - 1; + } + + int log2width, log2height; + int width, height; + int block_size; + Func func; + aom_bit_depth_t bit_depth; + bool use_high_bit_depth; + uint32_t mask; +}; + +template +std::ostream &operator<<(std::ostream &os, const TestParams &p) { + return os << "width/height:" << p.width << "/" << p.height + << " function:" << reinterpret_cast(p.func) + << " bit-depth:" << p.bit_depth; +} + +// Main class for testing a function type +template +class MainTestClass + : public ::testing::TestWithParam > { + public: + virtual void SetUp() { + params_ = this->GetParam(); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + const size_t unit = + use_high_bit_depth() ? 
sizeof(uint16_t) : sizeof(uint8_t); + src_ = reinterpret_cast(aom_memalign(16, block_size() * unit)); + ref_ = new uint8_t[block_size() * unit]; + ASSERT_TRUE(src_ != NULL); + ASSERT_TRUE(ref_ != NULL); + if (use_high_bit_depth()) { + // TODO(skal): remove! + src_ = CONVERT_TO_BYTEPTR(src_); + ref_ = CONVERT_TO_BYTEPTR(ref_); + } + } + + virtual void TearDown() { + if (use_high_bit_depth()) { + // TODO(skal): remove! + src_ = reinterpret_cast(CONVERT_TO_SHORTPTR(src_)); + ref_ = reinterpret_cast(CONVERT_TO_SHORTPTR(ref_)); + } + + aom_free(src_); + delete[] ref_; + src_ = NULL; + ref_ = NULL; + libaom_test::ClearSystemState(); + } + + protected: + // We could sub-class MainTestClass into dedicated class for Variance + // and MSE/SSE, but it involves a lot of 'this->xxx' dereferencing + // to access top class fields xxx. That's cumbersome, so for now we'll just + // implement the testing methods here: + + // Variance tests + void ZeroTest(); + void RefTest(); + void RefStrideTest(); + void OneQuarterTest(); + void SpeedTest(); + + // MSE/SSE tests + void RefTestMse(); + void RefTestSse(); + void MaxTestMse(); + void MaxTestSse(); + + protected: + ACMRandom rnd_; + uint8_t *src_; + uint8_t *ref_; + TestParams params_; + + // some relay helpers + bool use_high_bit_depth() const { return params_.use_high_bit_depth; } + int byte_shift() const { return params_.bit_depth - 8; } + int block_size() const { return params_.block_size; } + int width() const { return params_.width; } + int height() const { return params_.height; } + uint32_t mask() const { return params_.mask; } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Tests related to variance. + +template +void MainTestClass::ZeroTest() { + for (int i = 0; i <= 255; ++i) { + if (!use_high_bit_depth()) { + memset(src_, i, block_size()); + } else { + uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_); + for (int k = 0; k < block_size(); ++k) src16[k] = i << byte_shift(); + } + for (int j = 0; j <= 255; ++j) { + if (!use_high_bit_depth()) { + memset(ref_, j, block_size()); + } else { + uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_); + for (int k = 0; k < block_size(); ++k) ref16[k] = j << byte_shift(); + } + unsigned int sse, var; + ASM_REGISTER_STATE_CHECK( + var = params_.func(src_, width(), ref_, width(), &sse)); + EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j; + } + } +} + +template +void MainTestClass::RefTest() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size(); j++) { + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); + } + } + unsigned int sse1, sse2, var1, var2; + const int stride = width(); + ASM_REGISTER_STATE_CHECK( + var1 = params_.func(src_, stride, ref_, stride, &sse1)); + var2 = + variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, + stride, &sse2, use_high_bit_depth(), params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "Error at test index: " << i; + EXPECT_EQ(var1, var2) << "Error at test index: " << i; + } +} + +template +void MainTestClass::RefStrideTest() { + for (int i = 0; i < 10; ++i) { + const int ref_stride = (i & 1) * width(); + const int src_stride = ((i >> 1) & 1) * width(); + for (int j = 0; j < block_size(); j++) { + const int ref_ind = (j / width()) * ref_stride + j % width(); + const int src_ind = (j / width()) * src_stride + j % width(); + if 
(!use_high_bit_depth()) { + src_[src_ind] = rnd_.Rand8(); + ref_[ref_ind] = rnd_.Rand8(); + } else { + CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask(); + } + } + unsigned int sse1, sse2; + unsigned int var1, var2; + + ASM_REGISTER_STATE_CHECK( + var1 = params_.func(src_, src_stride, ref_, ref_stride, &sse1)); + var2 = variance_ref(src_, ref_, params_.log2width, params_.log2height, + src_stride, ref_stride, &sse2, use_high_bit_depth(), + params_.bit_depth); + EXPECT_EQ(sse1, sse2) << "Error at test index: " << i; + EXPECT_EQ(var1, var2) << "Error at test index: " << i; + } +} + +template +void MainTestClass::OneQuarterTest() { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { + memset(src_, 255, block_size()); + memset(ref_, 255, half); + memset(ref_ + half, 0, half); + } else { + aom_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size()); + aom_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half); + aom_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half); + } + unsigned int sse, var, expected; + ASM_REGISTER_STATE_CHECK( + var = params_.func(src_, width(), ref_, width(), &sse)); + expected = block_size() * 255 * 255 / 4; + EXPECT_EQ(expected, var); +} + +template +void MainTestClass::SpeedTest() { + for (int j = 0; j < block_size(); j++) { + if (!use_high_bit_depth()) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); + } else { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); + } + } + unsigned int sse; + const int stride = width(); + int run_time = 1000000000 / block_size(); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < run_time; ++i) { + params_.func(src_, stride, ref_, stride, &sse); + } + + aom_usec_timer_mark(&timer); + const double elapsed_time = + static_cast(aom_usec_timer_elapsed(&timer)); + printf("Variance %dx%d : %7.2fns\n", width(), height(), elapsed_time); +} + +//////////////////////////////////////////////////////////////////////////////// +// Tests related to MSE / SSE. 
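+// (Reminder of the identity the reference code above relies on: over +// N = width * height pixels, variance = SSE - SE^2 / N, where SE is the sum of +// the signed src/ref differences and SSE the sum of their squares. The MSE and +// SSE tests below exercise only the SSE term, which is why variance_ref is +// called purely for its sse output.)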
+ +template +void MainTestClass::RefTestMse() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size(); ++j) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); + } + unsigned int sse1, sse2; + const int stride = width(); + ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1)); + variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, + stride, &sse2, false, AOM_BITS_8); + EXPECT_EQ(sse1, sse2); + } +} + +template +void MainTestClass::RefTestSse() { + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < block_size(); ++j) { + src_[j] = rnd_.Rand8(); + ref_[j] = rnd_.Rand8(); + } + unsigned int sse2; + unsigned int var1; + const int stride = width(); + ASM_REGISTER_STATE_CHECK(var1 = params_.func(src_, stride, ref_, stride)); + variance_ref(src_, ref_, params_.log2width, params_.log2height, stride, + stride, &sse2, false, AOM_BITS_8); + EXPECT_EQ(var1, sse2); + } +} + +template +void MainTestClass::MaxTestMse() { + memset(src_, 255, block_size()); + memset(ref_, 0, block_size()); + unsigned int sse; + ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse)); + const unsigned int expected = block_size() * 255 * 255; + EXPECT_EQ(expected, sse); +} + +template +void MainTestClass::MaxTestSse() { + memset(src_, 255, block_size()); + memset(ref_, 0, block_size()); + unsigned int var; + ASM_REGISTER_STATE_CHECK(var = params_.func(src_, width(), ref_, width())); + const unsigned int expected = block_size() * 255 * 255; + EXPECT_EQ(expected, var); +} + +//////////////////////////////////////////////////////////////////////////////// + +using std::get; +using std::make_tuple; +using std::tuple; + +template +class SubpelVarianceTest + : public ::testing::TestWithParam > { + public: + virtual void SetUp() { + params_ = this->GetParam(); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + if (!use_high_bit_depth()) { + src_ = reinterpret_cast(aom_memalign(32, block_size())); + sec_ = reinterpret_cast(aom_memalign(32, block_size())); + ref_ = reinterpret_cast( + aom_memalign(32, block_size() + width() + height() + 1)); + } else { + src_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + aom_memalign(32, block_size() * sizeof(uint16_t)))); + sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast( + aom_memalign(32, block_size() * sizeof(uint16_t)))); + ref_ = CONVERT_TO_BYTEPTR(aom_memalign( + 32, (block_size() + width() + height() + 1) * sizeof(uint16_t))); + } + ASSERT_TRUE(src_ != NULL); + ASSERT_TRUE(sec_ != NULL); + ASSERT_TRUE(ref_ != NULL); + } + + virtual void TearDown() { + if (!use_high_bit_depth()) { + aom_free(src_); + aom_free(ref_); + aom_free(sec_); + } else { + aom_free(CONVERT_TO_SHORTPTR(src_)); + aom_free(CONVERT_TO_SHORTPTR(ref_)); + aom_free(CONVERT_TO_SHORTPTR(sec_)); + } + libaom_test::ClearSystemState(); + } + + protected: + void RefTest(); + void ExtremeRefTest(); + void SpeedTest(); + + ACMRandom rnd_; + uint8_t *src_; + uint8_t *ref_; + uint8_t *sec_; + TestParams params_; + DIST_WTD_COMP_PARAMS jcp_param_; + + // some relay helpers + bool use_high_bit_depth() const { return params_.use_high_bit_depth; } + int byte_shift() const { return params_.bit_depth - 8; } + int block_size() const { return params_.block_size; } + int width() const { return params_.width; } + int height() const { return params_.height; } + uint32_t mask() const { return params_.mask; } +}; + +template +void SubpelVarianceTest::RefTest() { + for (int x = 0; x < 8; ++x) { + for (int y = 0; y < 8; ++y) { + if (!use_high_bit_depth()) { + for (int j = 0; j < 
+
+template <typename FunctionType>
+void SubpelVarianceTest<FunctionType>::RefTest() {
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth()) {
+        for (int j = 0; j < block_size(); j++) {
+          src_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+      } else {
+        for (int j = 0; j < block_size(); j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+        }
+      }
+      unsigned int sse1, sse2;
+      unsigned int var1;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+      const unsigned int var2 = subpel_variance_ref(
+          ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+          use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+      EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+    }
+  }
+}
+
+template <typename FunctionType>
+void SubpelVarianceTest<FunctionType>::ExtremeRefTest() {
+  // Compare against reference.
+  // Src: Set the first half of values to 0, the second half to the maximum.
+  // Ref: Set the first half of values to the maximum, the second half to 0.
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      const int half = block_size() / 2;
+      if (!use_high_bit_depth()) {
+        memset(src_, 0, half);
+        memset(src_ + half, 255, half);
+        memset(ref_, 255, half);
+        memset(ref_ + half, 0, half + width() + height() + 1);
+      } else {
+        aom_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half);
+        aom_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
+        aom_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
+        aom_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(),
+                     half + width() + height() + 1);
+      }
+      unsigned int sse1, sse2;
+      unsigned int var1;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+      const unsigned int var2 = subpel_variance_ref(
+          ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+          use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+      EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+    }
+  }
+}
+
+template <typename FunctionType>
+void SubpelVarianceTest<FunctionType>::SpeedTest() {
+  if (!use_high_bit_depth()) {
+    for (int j = 0; j < block_size(); j++) {
+      src_[j] = rnd_.Rand8();
+    }
+    for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+      ref_[j] = rnd_.Rand8();
+    }
+  } else {
+    for (int j = 0; j < block_size(); j++) {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+    }
+    for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+      CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+    }
+  }
+
+  unsigned int sse1, sse2;
+  int run_time = 1000000000 / block_size();
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_time; ++i) {
+    int x = rnd_(8);
+    int y = rnd_(8);
+    params_.func(ref_, width() + 1, x, y, src_, width(), &sse1);
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+  aom_usec_timer timer_c;
+
+  aom_usec_timer_start(&timer_c);
+  for (int i = 0; i < run_time; ++i) {
+    int x = rnd_(8);
+    int y = rnd_(8);
+    subpel_variance_ref(ref_, src_, params_.log2width, params_.log2height, x,
+                        y, &sse2, use_high_bit_depth(), params_.bit_depth);
+  }
+  aom_usec_timer_mark(&timer_c);
+
+  const int elapsed_time_c =
+      static_cast<int>(aom_usec_timer_elapsed(&timer_c));
+
+  printf(
+      "sub_pixel_variance_%dx%d_%d: ref_time=%d us opt_time=%d us gain=%d \n",
+      width(), height(), params_.bit_depth, elapsed_time_c, elapsed_time,
+      elapsed_time_c / elapsed_time);
+}
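+
+// The two RefTest specializations below exercise the compound kernels: the
+// filtered reference is blended with a second predictor (sec_) before the
+// variance is taken. For the plain average the per-pixel blend is
+//   comp = (pred + sec + 1) >> 1;
+// for the distance-weighted variant the weights come from
+// quant_dist_lookup_table through jcp_param_. Sketch only, assuming the
+// usual 4-bit distance-weight precision (fwd_offset + bck_offset == 16):
+//   comp = (pred * fwd_offset + sec * bck_offset + 8) >> 4;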
+
+template <>
+void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth()) {
+        for (int j = 0; j < block_size(); j++) {
+          src_[j] = rnd_.Rand8();
+          sec_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+      } else {
+        for (int j = 0; j < block_size(); j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+        }
+      }
+      uint32_t sse1, sse2;
+      uint32_t var1, var2;
+      ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y,
+                                                   src_, width(), &sse1,
+                                                   sec_));
+      var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width,
+                                     params_.log2height, x, y, &sse2,
+                                     use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+      EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+    }
+  }
+}
+
+template <>
+void SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>::RefTest() {
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth()) {
+        for (int j = 0; j < block_size(); j++) {
+          src_[j] = rnd_.Rand8();
+          sec_[j] = rnd_.Rand8();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          ref_[j] = rnd_.Rand8();
+        }
+      } else {
+        for (int j = 0; j < block_size(); j++) {
+          CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+          CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask();
+        }
+        for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+          CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+        }
+      }
+      for (int x0 = 0; x0 < 2; ++x0) {
+        for (int y0 = 0; y0 < 4; ++y0) {
+          uint32_t sse1, sse2;
+          uint32_t var1, var2;
+          jcp_param_.fwd_offset = quant_dist_lookup_table[x0][y0][0];
+          jcp_param_.bck_offset = quant_dist_lookup_table[x0][y0][1];
+          ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y,
+                                                       src_, width(), &sse1,
+                                                       sec_, &jcp_param_));
+          var2 = dist_wtd_subpel_avg_variance_ref(
+              ref_, src_, sec_, params_.log2width, params_.log2height, x, y,
+              &sse2, use_high_bit_depth(), params_.bit_depth, &jcp_param_);
+          EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
+          EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
+        }
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+static const int kMaskMax = 64;
+
+typedef TestParams<ObmcSubpelVarFunc> ObmcSubpelVarianceParams;
+
+template <typename FunctionType>
+class ObmcVarianceTest
+    : public ::testing::TestWithParam<TestParams<FunctionType> > {
+ public:
+  virtual void SetUp() {
+    params_ = this->GetParam();
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    if (!use_high_bit_depth()) {
+      pre_ = reinterpret_cast<uint8_t *>(
+          aom_memalign(32, block_size() + width() + height() + 1));
+    } else {
+      // The buffer holds uint16_t samples, so the whole padded size must be
+      // scaled by sizeof(uint16_t).
+      pre_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(aom_memalign(
+          32, (block_size() + width() + height() + 1) * sizeof(uint16_t))));
+    }
+    wsrc_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, block_size() * sizeof(uint32_t)));
+    mask_ = reinterpret_cast<int32_t *>(
+        aom_memalign(32, block_size() * sizeof(uint32_t)));
+    ASSERT_TRUE(pre_ != NULL);
+    ASSERT_TRUE(wsrc_ != NULL);
+    ASSERT_TRUE(mask_ != NULL);
+  }
+
+  virtual void TearDown() {
+    if (!use_high_bit_depth()) {
+      aom_free(pre_);
+    } else {
+      aom_free(CONVERT_TO_SHORTPTR(pre_));
+    }
+    aom_free(wsrc_);
+    aom_free(mask_);
+    libaom_test::ClearSystemState();
+  }
+
+ protected:
+  void RefTest();
+  void ExtremeRefTest();
+  void SpeedTest();
+
+  ACMRandom rnd_;
+  uint8_t *pre_;
+  int32_t *wsrc_;
+  int32_t *mask_;
+  TestParams<FunctionType> params_;
+
+  // some relay helpers
+  bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+  int byte_shift() const { return params_.bit_depth - 8; }
+  int block_size() const { return params_.block_size; }
+  int width() const { return params_.width; }
+  int height() const { return params_.height; }
+  uint32_t bd_mask() const { return params_.mask; }
+};
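+
+// OBMC variance takes a pre-weighted source: wsrc_ holds samples already
+// scaled by the blending weight, and mask_ holds weights in
+// [0, kMaskMax * kMaskMax]. The reference helper undoes the scaling with a
+// signed 12-bit rounding shift (12 == 2 * log2(kMaskMax)), conceptually
+//   diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - mask[j] * pred[j], 12);
+// which is why RefTest below draws wsrc_ values as
+//   (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1).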
+
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::RefTest() {
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      if (!use_high_bit_depth())
+        for (int j = 0; j < block_size() + width() + height() + 1; j++)
+          pre_[j] = rnd_.Rand8();
+      else
+        for (int j = 0; j < block_size() + width() + height() + 1; j++)
+          CONVERT_TO_SHORTPTR(pre_)[j] = rnd_.Rand16() & bd_mask();
+      for (int j = 0; j < block_size(); j++) {
+        wsrc_[j] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1);
+        mask_[j] = rnd_(kMaskMax * kMaskMax + 1);
+      }
+
+      uint32_t sse1, sse2;
+      uint32_t var1, var2;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
+      var2 = obmc_subpel_variance_ref(
+          pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
+          &sse2, use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+      EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+    }
+  }
+}
+
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::ExtremeRefTest() {
+  // Pre: Set the first half of values to the maximum, the second half to 0.
+  // Mask: same as above
+  // WSrc: Set the first half of values to 0, the second half to the maximum.
+  for (int x = 0; x < 8; ++x) {
+    for (int y = 0; y < 8; ++y) {
+      const int half = block_size() / 2;
+      if (!use_high_bit_depth()) {
+        memset(pre_, 255, half);
+        memset(pre_ + half, 0, half + width() + height() + 1);
+      } else {
+        aom_memset16(CONVERT_TO_SHORTPTR(pre_), bd_mask(), half);
+        aom_memset16(CONVERT_TO_SHORTPTR(pre_) + half, 0, half);
+      }
+      for (int j = 0; j < half; j++) {
+        wsrc_[j] = bd_mask() * kMaskMax * kMaskMax;
+        mask_[j] = 0;
+      }
+      for (int j = half; j < block_size(); j++) {
+        wsrc_[j] = 0;
+        mask_[j] = kMaskMax * kMaskMax;
+      }
+
+      uint32_t sse1, sse2;
+      uint32_t var1, var2;
+      ASM_REGISTER_STATE_CHECK(
+          var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
+      var2 = obmc_subpel_variance_ref(
+          pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
+          &sse2, use_high_bit_depth(), params_.bit_depth);
+      EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
+      EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
+    }
+  }
+}
+
+template <>
+void ObmcVarianceTest<ObmcSubpelVarFunc>::SpeedTest() {
+  if (!use_high_bit_depth())
+    for (int j = 0; j < block_size() + width() + height() + 1; j++)
+      pre_[j] = rnd_.Rand8();
+  else
+    for (int j = 0; j < block_size() + width() + height() + 1; j++)
+      CONVERT_TO_SHORTPTR(pre_)[j] = rnd_.Rand16() & bd_mask();
+  for (int j = 0; j < block_size(); j++) {
+    wsrc_[j] = (rnd_.Rand16() & bd_mask()) * rnd_(kMaskMax * kMaskMax + 1);
+    mask_[j] = rnd_(kMaskMax * kMaskMax + 1);
+  }
+  unsigned int sse1;
+  const int stride = width() + 1;
+  int run_time = 1000000000 / block_size();
+  aom_usec_timer timer;
+
+  aom_usec_timer_start(&timer);
+  for (int i = 0; i < run_time; ++i) {
+    int x = rnd_(8);
+    int y = rnd_(8);
+    ASM_REGISTER_STATE_CHECK(
+        params_.func(pre_, stride, x, y, wsrc_, mask_, &sse1));
+  }
+  aom_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+  printf("obmc_sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(),
+         params_.bit_depth, elapsed_time);
+}
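+
+// Wiring: each typedef below binds a test fixture to one function-pointer
+// type, and INSTANTIATE_TEST_SUITE_P fans the fixture out over a list of
+// TestParams(log2width, log2height, function[, bit_depth]) values, so a new
+// SIMD kernel is registered with a single extra entry. Sketch only (the
+// suite/function pairing here is hypothetical):
+//   INSTANTIATE_TEST_SUITE_P(NEON, AvxVarianceTest,
+//                            ::testing::Values(VarianceParams(
+//                                6, 6, &aom_variance64x64_neon)));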
+
+typedef MainTestClass<Get4x4SseFunc> AvxSseTest;
+typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
+typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest;
+typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>
+    AvxDistWtdSubpelAvgVarianceTest;
+typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxObmcSubpelVarianceTest;
+
+TEST_P(AvxSseTest, RefSse) { RefTestSse(); }
+TEST_P(AvxSseTest, MaxSse) { MaxTestSse(); }
+TEST_P(AvxMseTest, RefMse) { RefTestMse(); }
+TEST_P(AvxMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(AvxVarianceTest, Zero) { ZeroTest(); }
+TEST_P(AvxVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(AvxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(AvxVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
+TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest,
+                         ::testing::Values(aom_get_mb_ss_c));
+
+typedef TestParams<Get4x4SseFunc> SseParams;
+INSTANTIATE_TEST_SUITE_P(C, AvxSseTest,
+                         ::testing::Values(SseParams(2, 2,
+                                                     &aom_get4x4sse_cs_c)));
+
+typedef TestParams<VarianceMxNFunc> MseParams;
+INSTANTIATE_TEST_SUITE_P(C, AvxMseTest,
+                         ::testing::Values(MseParams(4, 4, &aom_mse16x16_c),
+                                           MseParams(4, 3, &aom_mse16x8_c),
+                                           MseParams(3, 4, &aom_mse8x16_c),
+                                           MseParams(3, 3, &aom_mse8x8_c)));
+
+typedef TestParams<VarianceMxNFunc> VarianceParams;
+INSTANTIATE_TEST_SUITE_P(
+    C, AvxVarianceTest,
+    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_c),
+                      VarianceParams(7, 6, &aom_variance128x64_c),
+                      VarianceParams(6, 7, &aom_variance64x128_c),
+                      VarianceParams(6, 6, &aom_variance64x64_c),
+                      VarianceParams(6, 5, &aom_variance64x32_c),
+                      VarianceParams(5, 6, &aom_variance32x64_c),
+                      VarianceParams(5, 5, &aom_variance32x32_c),
+                      VarianceParams(5, 4, &aom_variance32x16_c),
+                      VarianceParams(4, 5, &aom_variance16x32_c),
+                      VarianceParams(4, 4, &aom_variance16x16_c),
+                      VarianceParams(4, 3, &aom_variance16x8_c),
+                      VarianceParams(3, 4, &aom_variance8x16_c),
+                      VarianceParams(3, 3, &aom_variance8x8_c),
+                      VarianceParams(3, 2, &aom_variance8x4_c),
+                      VarianceParams(2, 3, &aom_variance4x8_c),
+                      VarianceParams(2, 2, &aom_variance4x4_c),
+
+                      VarianceParams(6, 4, &aom_variance64x16_c),
+                      VarianceParams(4, 6, &aom_variance16x64_c),
+                      VarianceParams(5, 3, &aom_variance32x8_c),
+                      VarianceParams(3, 5, &aom_variance8x32_c),
+                      VarianceParams(4, 2, &aom_variance16x4_c),
+                      VarianceParams(2, 4, &aom_variance4x16_c)));
+
+typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
+INSTANTIATE_TEST_SUITE_P(
+    C, AvxSubpelVarianceTest,
+    ::testing::Values(
+        SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_c, 0),
+        SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_c, 0),
+        SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_c, 0),
+        SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_c, 0),
+        SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_c, 0),
+        SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_c, 0),
+        SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_c, 0),
+        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_c, 0),
+        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_c, 0),
+        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_c, 0),
+        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_c, 0),
+        SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_c, 0),
+        SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_c, 0),
+        SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_c, 0),
+        SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_c, 0),
+        SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_c, 0),
+
+        SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_c, 0),
+        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_c, 0),
+        SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_c, 0),
+        SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_c, 0),
+        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_c, 0),
+        SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_c, 0)));
+
+typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
+INSTANTIATE_TEST_SUITE_P(
+    C, AvxSubpelAvgVarianceTest,
+    ::testing::Values(
+        SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_c, 0),
+        SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_c, 0),
+        SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_c, 0),
+        SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_c, 0),
+        SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_c, 0),
+        SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_c, 0),
+        SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_c, 0),
+        SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_c, 0),
+        SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_c, 0),
+        SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_c, 0),
+        SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_c, 0),
+        SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_c, 0),
+        SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_c, 0),
+        SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_c, 0),
+        SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
+        SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0),
+
+        SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_c, 0),
+        SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_c, 0),
+        SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_c, 0),
+        SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_c, 0),
+        SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_c, 0),
+        SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_c, 0)));
+
+typedef TestParams<DistWtdSubpixAvgVarMxNFunc> DistWtdSubpelAvgVarianceParams;
+INSTANTIATE_TEST_SUITE_P(
+    C, AvxDistWtdSubpelAvgVarianceTest,
+    ::testing::Values(DistWtdSubpelAvgVarianceParams(
+                          6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0),
+
+                      DistWtdSubpelAvgVarianceParams(
+                          6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_c, 0),
+                      DistWtdSubpelAvgVarianceParams(
+                          2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_c,
+                          0)));
+
+INSTANTIATE_TEST_SUITE_P(
+    C, AvxObmcSubpelVarianceTest,
+    ::testing::Values(
+        ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_c,
+                                 0),
+        ObmcSubpelVarianceParams(7, 6, &aom_obmc_sub_pixel_variance128x64_c, 0),
+        ObmcSubpelVarianceParams(6, 7, &aom_obmc_sub_pixel_variance64x128_c, 0),
+        ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_c, 0),
+        ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_c, 0),
+        ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_c, 0),
+        ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_c, 0),
+        ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_c, 0),
+        ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_c, 0),
+        ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_c, 0),
+        ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_c, 0),
+        ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_c, 0),
+        ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_c, 0),
+        ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_c, 0),
+        ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_c, 0),
+        ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_c, 0),
+
+        ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_c, 0),
+        ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_c, 0),
+        ObmcSubpelVarianceParams(5, 3, &aom_obmc_sub_pixel_variance32x8_c, 0),
+        ObmcSubpelVarianceParams(3, 5, &aom_obmc_sub_pixel_variance8x32_c, 0),
+        ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_c, 0),
+        ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_c, 0)));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef MainTestClass<VarianceMxNFunc> AvxHBDMseTest;
+typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxHBDSubpelAvgVarianceTest;
+typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxHBDObmcSubpelVarianceTest;
+
+TEST_P(AvxHBDMseTest, RefMse) { RefTestMse(); }
+TEST_P(AvxHBDMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(AvxHBDVarianceTest, Zero) { ZeroTest(); }
+TEST_P(AvxHBDVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(AvxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(AvxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
+TEST_P(AvxHBDSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } +TEST_P(AvxHBDSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); } +TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } + +/* TODO(debargha): This test does not support the highbd version +INSTANTIATE_TEST_SUITE_P( + C, AvxHBDMseTest, + ::testing::Values(make_tuple(4, 4, &aom_highbd_12_mse16x16_c), + make_tuple(4, 4, &aom_highbd_12_mse16x8_c), + make_tuple(4, 4, &aom_highbd_12_mse8x16_c), + make_tuple(4, 4, &aom_highbd_12_mse8x8_c), + make_tuple(4, 4, &aom_highbd_10_mse16x16_c), + make_tuple(4, 4, &aom_highbd_10_mse16x8_c), + make_tuple(4, 4, &aom_highbd_10_mse8x16_c), + make_tuple(4, 4, &aom_highbd_10_mse8x8_c), + make_tuple(4, 4, &aom_highbd_8_mse16x16_c), + make_tuple(4, 4, &aom_highbd_8_mse16x8_c), + make_tuple(4, 4, &aom_highbd_8_mse8x16_c), + make_tuple(4, 4, &aom_highbd_8_mse8x8_c))); +*/ + +const VarianceParams kArrayHBDVariance_c[] = { + VarianceParams(7, 7, &aom_highbd_12_variance128x128_c, 12), + VarianceParams(7, 6, &aom_highbd_12_variance128x64_c, 12), + VarianceParams(6, 7, &aom_highbd_12_variance64x128_c, 12), + VarianceParams(6, 6, &aom_highbd_12_variance64x64_c, 12), + VarianceParams(6, 5, &aom_highbd_12_variance64x32_c, 12), + VarianceParams(5, 6, &aom_highbd_12_variance32x64_c, 12), + VarianceParams(5, 5, &aom_highbd_12_variance32x32_c, 12), + VarianceParams(5, 4, &aom_highbd_12_variance32x16_c, 12), + VarianceParams(4, 5, &aom_highbd_12_variance16x32_c, 12), + VarianceParams(4, 4, &aom_highbd_12_variance16x16_c, 12), + VarianceParams(4, 3, &aom_highbd_12_variance16x8_c, 12), + VarianceParams(3, 4, &aom_highbd_12_variance8x16_c, 12), + VarianceParams(3, 3, &aom_highbd_12_variance8x8_c, 12), + VarianceParams(3, 2, &aom_highbd_12_variance8x4_c, 12), + VarianceParams(2, 3, &aom_highbd_12_variance4x8_c, 12), + VarianceParams(2, 2, &aom_highbd_12_variance4x4_c, 12), + VarianceParams(7, 7, &aom_highbd_10_variance128x128_c, 10), + VarianceParams(7, 6, &aom_highbd_10_variance128x64_c, 10), + VarianceParams(6, 7, &aom_highbd_10_variance64x128_c, 10), + VarianceParams(6, 6, &aom_highbd_10_variance64x64_c, 10), + VarianceParams(6, 5, &aom_highbd_10_variance64x32_c, 10), + VarianceParams(5, 6, &aom_highbd_10_variance32x64_c, 10), + VarianceParams(5, 5, &aom_highbd_10_variance32x32_c, 10), + VarianceParams(5, 4, &aom_highbd_10_variance32x16_c, 10), + VarianceParams(4, 5, &aom_highbd_10_variance16x32_c, 10), + VarianceParams(4, 4, &aom_highbd_10_variance16x16_c, 10), + VarianceParams(4, 3, &aom_highbd_10_variance16x8_c, 10), + VarianceParams(3, 4, &aom_highbd_10_variance8x16_c, 10), + VarianceParams(3, 3, &aom_highbd_10_variance8x8_c, 10), + VarianceParams(3, 2, &aom_highbd_10_variance8x4_c, 10), + VarianceParams(2, 3, &aom_highbd_10_variance4x8_c, 10), + VarianceParams(2, 2, &aom_highbd_10_variance4x4_c, 10), + VarianceParams(7, 7, &aom_highbd_8_variance128x128_c, 8), + VarianceParams(7, 6, &aom_highbd_8_variance128x64_c, 8), + VarianceParams(6, 7, &aom_highbd_8_variance64x128_c, 8), + VarianceParams(6, 6, &aom_highbd_8_variance64x64_c, 8), + VarianceParams(6, 5, &aom_highbd_8_variance64x32_c, 8), + VarianceParams(5, 6, &aom_highbd_8_variance32x64_c, 8), + VarianceParams(5, 5, &aom_highbd_8_variance32x32_c, 8), + VarianceParams(5, 4, &aom_highbd_8_variance32x16_c, 8), + VarianceParams(4, 5, &aom_highbd_8_variance16x32_c, 8), + VarianceParams(4, 4, &aom_highbd_8_variance16x16_c, 8), + VarianceParams(4, 3, &aom_highbd_8_variance16x8_c, 8), + VarianceParams(3, 4, &aom_highbd_8_variance8x16_c, 8), + VarianceParams(3, 
3, &aom_highbd_8_variance8x8_c, 8), + VarianceParams(3, 2, &aom_highbd_8_variance8x4_c, 8), + VarianceParams(2, 3, &aom_highbd_8_variance4x8_c, 8), + VarianceParams(2, 2, &aom_highbd_8_variance4x4_c, 8), + + VarianceParams(6, 4, &aom_highbd_12_variance64x16_c, 12), + VarianceParams(4, 6, &aom_highbd_12_variance16x64_c, 12), + VarianceParams(5, 3, &aom_highbd_12_variance32x8_c, 12), + VarianceParams(3, 5, &aom_highbd_12_variance8x32_c, 12), + VarianceParams(4, 2, &aom_highbd_12_variance16x4_c, 12), + VarianceParams(2, 4, &aom_highbd_12_variance4x16_c, 12), + VarianceParams(6, 4, &aom_highbd_10_variance64x16_c, 10), + VarianceParams(4, 6, &aom_highbd_10_variance16x64_c, 10), + VarianceParams(5, 3, &aom_highbd_10_variance32x8_c, 10), + VarianceParams(3, 5, &aom_highbd_10_variance8x32_c, 10), + VarianceParams(4, 2, &aom_highbd_10_variance16x4_c, 10), + VarianceParams(2, 4, &aom_highbd_10_variance4x16_c, 10), + VarianceParams(6, 4, &aom_highbd_8_variance64x16_c, 8), + VarianceParams(4, 6, &aom_highbd_8_variance16x64_c, 8), + VarianceParams(5, 3, &aom_highbd_8_variance32x8_c, 8), + VarianceParams(3, 5, &aom_highbd_8_variance8x32_c, 8), + VarianceParams(4, 2, &aom_highbd_8_variance16x4_c, 8), + VarianceParams(2, 4, &aom_highbd_8_variance4x16_c, 8), +}; +INSTANTIATE_TEST_SUITE_P(C, AvxHBDVarianceTest, + ::testing::ValuesIn(kArrayHBDVariance_c)); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AvxHBDVarianceTest, + ::testing::Values( + VarianceParams(2, 2, &aom_highbd_8_variance4x4_sse4_1, 8), + VarianceParams(2, 2, &aom_highbd_10_variance4x4_sse4_1, 10), + VarianceParams(2, 2, &aom_highbd_12_variance4x4_sse4_1, 12))); +#endif // HAVE_SSE4_1 + +const SubpelVarianceParams kArrayHBDSubpelVariance_c[] = { + SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_c, 8), + SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_c, 8), + SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_c, 8), + SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_c, 8), + SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_c, 8), + SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_c, 8), + SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_c, 8), + SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_c, 8), + SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_c, 8), + SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_c, 8), + SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_c, 8), + SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_c, 8), + SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_c, 8), + SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_c, 8), + SubpelVarianceParams(2, 3, &aom_highbd_8_sub_pixel_variance4x8_c, 8), + SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_c, 8), + SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_c, 10), + SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_c, 10), + SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_c, 10), + SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_c, 10), + SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_c, 10), + SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_c, 10), + SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_c, 10), + SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_c, 10), + SubpelVarianceParams(4, 5, 
&aom_highbd_10_sub_pixel_variance16x32_c, 10), + SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_c, 10), + SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_c, 10), + SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_c, 10), + SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_c, 10), + SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_c, 10), + SubpelVarianceParams(2, 3, &aom_highbd_10_sub_pixel_variance4x8_c, 10), + SubpelVarianceParams(2, 2, &aom_highbd_10_sub_pixel_variance4x4_c, 10), + SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_c, 12), + SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_c, 12), + SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_c, 12), + SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_c, 12), + SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_c, 12), + SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_c, 12), + SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_c, 12), + SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_c, 12), + SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_c, 12), + SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_c, 12), + SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_c, 12), + SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_c, 12), + SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_c, 12), + SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_c, 12), + SubpelVarianceParams(2, 3, &aom_highbd_12_sub_pixel_variance4x8_c, 12), + SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_c, 12), + + SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_c, 8), + SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_c, 8), + SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_c, 8), + SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_c, 8), + SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_c, 8), + SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_c, 8), + SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_c, 10), + SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_c, 10), + SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_c, 10), + SubpelVarianceParams(3, 5, &aom_highbd_10_sub_pixel_variance8x32_c, 10), + SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_c, 10), + SubpelVarianceParams(2, 4, &aom_highbd_10_sub_pixel_variance4x16_c, 10), + SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_c, 12), + SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_c, 12), + SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_c, 12), + SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_c, 12), + SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_c, 12), + SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_c, 12), +}; +INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelVarianceTest, + ::testing::ValuesIn(kArrayHBDSubpelVariance_c)); + +const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_c[] = { + SubpelAvgVarianceParams(7, 7, &aom_highbd_8_sub_pixel_avg_variance128x128_c, + 8), + SubpelAvgVarianceParams(7, 6, &aom_highbd_8_sub_pixel_avg_variance128x64_c, + 8), + SubpelAvgVarianceParams(6, 7, &aom_highbd_8_sub_pixel_avg_variance64x128_c, + 
8), + SubpelAvgVarianceParams(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_c, 8), + SubpelAvgVarianceParams(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_c, 8), + SubpelAvgVarianceParams(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_c, 8), + SubpelAvgVarianceParams(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_c, 8), + SubpelAvgVarianceParams(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_c, 8), + SubpelAvgVarianceParams(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_c, 8), + SubpelAvgVarianceParams(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_c, 8), + SubpelAvgVarianceParams(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_c, 8), + SubpelAvgVarianceParams(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_c, 8), + SubpelAvgVarianceParams(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_c, 8), + SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_c, 8), + SubpelAvgVarianceParams(2, 3, &aom_highbd_8_sub_pixel_avg_variance4x8_c, 8), + SubpelAvgVarianceParams(2, 2, &aom_highbd_8_sub_pixel_avg_variance4x4_c, 8), + SubpelAvgVarianceParams(7, 7, &aom_highbd_10_sub_pixel_avg_variance128x128_c, + 10), + SubpelAvgVarianceParams(7, 6, &aom_highbd_10_sub_pixel_avg_variance128x64_c, + 10), + SubpelAvgVarianceParams(6, 7, &aom_highbd_10_sub_pixel_avg_variance64x128_c, + 10), + SubpelAvgVarianceParams(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_c, + 10), + SubpelAvgVarianceParams(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_c, + 10), + SubpelAvgVarianceParams(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_c, + 10), + SubpelAvgVarianceParams(5, 5, &aom_highbd_10_sub_pixel_avg_variance32x32_c, + 10), + SubpelAvgVarianceParams(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_c, + 10), + SubpelAvgVarianceParams(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_c, + 10), + SubpelAvgVarianceParams(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_c, + 10), + SubpelAvgVarianceParams(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_c, + 10), + SubpelAvgVarianceParams(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_c, + 10), + SubpelAvgVarianceParams(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_c, 10), + SubpelAvgVarianceParams(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_c, 10), + SubpelAvgVarianceParams(2, 3, &aom_highbd_10_sub_pixel_avg_variance4x8_c, 10), + SubpelAvgVarianceParams(2, 2, &aom_highbd_10_sub_pixel_avg_variance4x4_c, 10), + SubpelAvgVarianceParams(7, 7, &aom_highbd_12_sub_pixel_avg_variance128x128_c, + 12), + SubpelAvgVarianceParams(7, 6, &aom_highbd_12_sub_pixel_avg_variance128x64_c, + 12), + SubpelAvgVarianceParams(6, 7, &aom_highbd_12_sub_pixel_avg_variance64x128_c, + 12), + SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_c, + 12), + SubpelAvgVarianceParams(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_c, + 12), + SubpelAvgVarianceParams(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_c, + 12), + SubpelAvgVarianceParams(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_c, + 12), + SubpelAvgVarianceParams(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_c, + 12), + SubpelAvgVarianceParams(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_c, + 12), + SubpelAvgVarianceParams(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_c, + 12), + SubpelAvgVarianceParams(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_c, + 12), + SubpelAvgVarianceParams(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_c, + 12), + SubpelAvgVarianceParams(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_c, 12), + SubpelAvgVarianceParams(3, 2, 
&aom_highbd_12_sub_pixel_avg_variance8x4_c, 12), + SubpelAvgVarianceParams(2, 3, &aom_highbd_12_sub_pixel_avg_variance4x8_c, 12), + SubpelAvgVarianceParams(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_c, 12), + + SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_c, 8), + SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_c, 8), + SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_c, 8), + SubpelAvgVarianceParams(3, 5, &aom_highbd_8_sub_pixel_avg_variance8x32_c, 8), + SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_c, 8), + SubpelAvgVarianceParams(2, 4, &aom_highbd_8_sub_pixel_avg_variance4x16_c, 8), + SubpelAvgVarianceParams(6, 4, &aom_highbd_10_sub_pixel_avg_variance64x16_c, + 10), + SubpelAvgVarianceParams(4, 6, &aom_highbd_10_sub_pixel_avg_variance16x64_c, + 10), + SubpelAvgVarianceParams(5, 3, &aom_highbd_10_sub_pixel_avg_variance32x8_c, + 10), + SubpelAvgVarianceParams(3, 5, &aom_highbd_10_sub_pixel_avg_variance8x32_c, + 10), + SubpelAvgVarianceParams(4, 2, &aom_highbd_10_sub_pixel_avg_variance16x4_c, + 10), + SubpelAvgVarianceParams(2, 4, &aom_highbd_10_sub_pixel_avg_variance4x16_c, + 10), + SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_c, + 12), + SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_c, + 12), + SubpelAvgVarianceParams(5, 3, &aom_highbd_12_sub_pixel_avg_variance32x8_c, + 12), + SubpelAvgVarianceParams(3, 5, &aom_highbd_12_sub_pixel_avg_variance8x32_c, + 12), + SubpelAvgVarianceParams(4, 2, &aom_highbd_12_sub_pixel_avg_variance16x4_c, + 12), + SubpelAvgVarianceParams(2, 4, &aom_highbd_12_sub_pixel_avg_variance4x16_c, + 12), +}; +INSTANTIATE_TEST_SUITE_P(C, AvxHBDSubpelAvgVarianceTest, + ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c)); + +const ObmcSubpelVarianceParams kArrayHBDObmcSubpelVariance_c[] = { + ObmcSubpelVarianceParams(7, 7, &aom_highbd_obmc_sub_pixel_variance128x128_c, + 8), + ObmcSubpelVarianceParams(7, 6, &aom_highbd_obmc_sub_pixel_variance128x64_c, + 8), + ObmcSubpelVarianceParams(6, 7, &aom_highbd_obmc_sub_pixel_variance64x128_c, + 8), + ObmcSubpelVarianceParams(6, 6, &aom_highbd_obmc_sub_pixel_variance64x64_c, 8), + ObmcSubpelVarianceParams(6, 5, &aom_highbd_obmc_sub_pixel_variance64x32_c, 8), + ObmcSubpelVarianceParams(5, 6, &aom_highbd_obmc_sub_pixel_variance32x64_c, 8), + ObmcSubpelVarianceParams(5, 5, &aom_highbd_obmc_sub_pixel_variance32x32_c, 8), + ObmcSubpelVarianceParams(5, 4, &aom_highbd_obmc_sub_pixel_variance32x16_c, 8), + ObmcSubpelVarianceParams(4, 5, &aom_highbd_obmc_sub_pixel_variance16x32_c, 8), + ObmcSubpelVarianceParams(4, 4, &aom_highbd_obmc_sub_pixel_variance16x16_c, 8), + ObmcSubpelVarianceParams(4, 3, &aom_highbd_obmc_sub_pixel_variance16x8_c, 8), + ObmcSubpelVarianceParams(3, 4, &aom_highbd_obmc_sub_pixel_variance8x16_c, 8), + ObmcSubpelVarianceParams(3, 3, &aom_highbd_obmc_sub_pixel_variance8x8_c, 8), + ObmcSubpelVarianceParams(3, 2, &aom_highbd_obmc_sub_pixel_variance8x4_c, 8), + ObmcSubpelVarianceParams(2, 3, &aom_highbd_obmc_sub_pixel_variance4x8_c, 8), + ObmcSubpelVarianceParams(2, 2, &aom_highbd_obmc_sub_pixel_variance4x4_c, 8), + ObmcSubpelVarianceParams(7, 7, + &aom_highbd_10_obmc_sub_pixel_variance128x128_c, 10), + ObmcSubpelVarianceParams(7, 6, &aom_highbd_10_obmc_sub_pixel_variance128x64_c, + 10), + ObmcSubpelVarianceParams(6, 7, &aom_highbd_10_obmc_sub_pixel_variance64x128_c, + 10), + ObmcSubpelVarianceParams(6, 6, &aom_highbd_10_obmc_sub_pixel_variance64x64_c, + 10), + 
ObmcSubpelVarianceParams(6, 5, &aom_highbd_10_obmc_sub_pixel_variance64x32_c, + 10), + ObmcSubpelVarianceParams(5, 6, &aom_highbd_10_obmc_sub_pixel_variance32x64_c, + 10), + ObmcSubpelVarianceParams(5, 5, &aom_highbd_10_obmc_sub_pixel_variance32x32_c, + 10), + ObmcSubpelVarianceParams(5, 4, &aom_highbd_10_obmc_sub_pixel_variance32x16_c, + 10), + ObmcSubpelVarianceParams(4, 5, &aom_highbd_10_obmc_sub_pixel_variance16x32_c, + 10), + ObmcSubpelVarianceParams(4, 4, &aom_highbd_10_obmc_sub_pixel_variance16x16_c, + 10), + ObmcSubpelVarianceParams(4, 3, &aom_highbd_10_obmc_sub_pixel_variance16x8_c, + 10), + ObmcSubpelVarianceParams(3, 4, &aom_highbd_10_obmc_sub_pixel_variance8x16_c, + 10), + ObmcSubpelVarianceParams(3, 3, &aom_highbd_10_obmc_sub_pixel_variance8x8_c, + 10), + ObmcSubpelVarianceParams(3, 2, &aom_highbd_10_obmc_sub_pixel_variance8x4_c, + 10), + ObmcSubpelVarianceParams(2, 3, &aom_highbd_10_obmc_sub_pixel_variance4x8_c, + 10), + ObmcSubpelVarianceParams(2, 2, &aom_highbd_10_obmc_sub_pixel_variance4x4_c, + 10), + ObmcSubpelVarianceParams(7, 7, + &aom_highbd_12_obmc_sub_pixel_variance128x128_c, 12), + ObmcSubpelVarianceParams(7, 6, &aom_highbd_12_obmc_sub_pixel_variance128x64_c, + 12), + ObmcSubpelVarianceParams(6, 7, &aom_highbd_12_obmc_sub_pixel_variance64x128_c, + 12), + ObmcSubpelVarianceParams(6, 6, &aom_highbd_12_obmc_sub_pixel_variance64x64_c, + 12), + ObmcSubpelVarianceParams(6, 5, &aom_highbd_12_obmc_sub_pixel_variance64x32_c, + 12), + ObmcSubpelVarianceParams(5, 6, &aom_highbd_12_obmc_sub_pixel_variance32x64_c, + 12), + ObmcSubpelVarianceParams(5, 5, &aom_highbd_12_obmc_sub_pixel_variance32x32_c, + 12), + ObmcSubpelVarianceParams(5, 4, &aom_highbd_12_obmc_sub_pixel_variance32x16_c, + 12), + ObmcSubpelVarianceParams(4, 5, &aom_highbd_12_obmc_sub_pixel_variance16x32_c, + 12), + ObmcSubpelVarianceParams(4, 4, &aom_highbd_12_obmc_sub_pixel_variance16x16_c, + 12), + ObmcSubpelVarianceParams(4, 3, &aom_highbd_12_obmc_sub_pixel_variance16x8_c, + 12), + ObmcSubpelVarianceParams(3, 4, &aom_highbd_12_obmc_sub_pixel_variance8x16_c, + 12), + ObmcSubpelVarianceParams(3, 3, &aom_highbd_12_obmc_sub_pixel_variance8x8_c, + 12), + ObmcSubpelVarianceParams(3, 2, &aom_highbd_12_obmc_sub_pixel_variance8x4_c, + 12), + ObmcSubpelVarianceParams(2, 3, &aom_highbd_12_obmc_sub_pixel_variance4x8_c, + 12), + ObmcSubpelVarianceParams(2, 2, &aom_highbd_12_obmc_sub_pixel_variance4x4_c, + 12), + + ObmcSubpelVarianceParams(6, 4, &aom_highbd_obmc_sub_pixel_variance64x16_c, 8), + ObmcSubpelVarianceParams(4, 6, &aom_highbd_obmc_sub_pixel_variance16x64_c, 8), + ObmcSubpelVarianceParams(5, 3, &aom_highbd_obmc_sub_pixel_variance32x8_c, 8), + ObmcSubpelVarianceParams(3, 5, &aom_highbd_obmc_sub_pixel_variance8x32_c, 8), + ObmcSubpelVarianceParams(4, 2, &aom_highbd_obmc_sub_pixel_variance16x4_c, 8), + ObmcSubpelVarianceParams(2, 4, &aom_highbd_obmc_sub_pixel_variance4x16_c, 8), + ObmcSubpelVarianceParams(6, 4, &aom_highbd_10_obmc_sub_pixel_variance64x16_c, + 10), + ObmcSubpelVarianceParams(4, 6, &aom_highbd_10_obmc_sub_pixel_variance16x64_c, + 10), + ObmcSubpelVarianceParams(5, 3, &aom_highbd_10_obmc_sub_pixel_variance32x8_c, + 10), + ObmcSubpelVarianceParams(3, 5, &aom_highbd_10_obmc_sub_pixel_variance8x32_c, + 10), + ObmcSubpelVarianceParams(4, 2, &aom_highbd_10_obmc_sub_pixel_variance16x4_c, + 10), + ObmcSubpelVarianceParams(2, 4, &aom_highbd_10_obmc_sub_pixel_variance4x16_c, + 10), + ObmcSubpelVarianceParams(6, 4, &aom_highbd_12_obmc_sub_pixel_variance64x16_c, + 12), + ObmcSubpelVarianceParams(4, 6, 
&aom_highbd_12_obmc_sub_pixel_variance16x64_c, + 12), + ObmcSubpelVarianceParams(5, 3, &aom_highbd_12_obmc_sub_pixel_variance32x8_c, + 12), + ObmcSubpelVarianceParams(3, 5, &aom_highbd_12_obmc_sub_pixel_variance8x32_c, + 12), + ObmcSubpelVarianceParams(4, 2, &aom_highbd_12_obmc_sub_pixel_variance16x4_c, + 12), + ObmcSubpelVarianceParams(2, 4, &aom_highbd_12_obmc_sub_pixel_variance4x16_c, + 12), +}; +INSTANTIATE_TEST_SUITE_P(C, AvxHBDObmcSubpelVarianceTest, + ::testing::ValuesIn(kArrayHBDObmcSubpelVariance_c)); +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_SUITE_P(SSE2, SumOfSquaresTest, + ::testing::Values(aom_get_mb_ss_sse2)); + +INSTANTIATE_TEST_SUITE_P(SSE2, AvxMseTest, + ::testing::Values(MseParams(4, 4, &aom_mse16x16_sse2), + MseParams(4, 3, &aom_mse16x8_sse2), + MseParams(3, 4, &aom_mse8x16_sse2), + MseParams(3, 3, &aom_mse8x8_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, AvxVarianceTest, + ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_sse2), + VarianceParams(7, 6, &aom_variance128x64_sse2), + VarianceParams(6, 7, &aom_variance64x128_sse2), + VarianceParams(6, 6, &aom_variance64x64_sse2), + VarianceParams(6, 5, &aom_variance64x32_sse2), + VarianceParams(6, 4, &aom_variance64x16_sse2), + VarianceParams(5, 6, &aom_variance32x64_sse2), + VarianceParams(5, 5, &aom_variance32x32_sse2), + VarianceParams(5, 4, &aom_variance32x16_sse2), + VarianceParams(5, 3, &aom_variance32x8_sse2), + VarianceParams(4, 6, &aom_variance16x64_sse2), + VarianceParams(4, 5, &aom_variance16x32_sse2), + VarianceParams(4, 4, &aom_variance16x16_sse2), + VarianceParams(4, 3, &aom_variance16x8_sse2), + VarianceParams(4, 2, &aom_variance16x4_sse2), + VarianceParams(3, 5, &aom_variance8x32_sse2), + VarianceParams(3, 4, &aom_variance8x16_sse2), + VarianceParams(3, 3, &aom_variance8x8_sse2), + VarianceParams(3, 2, &aom_variance8x4_sse2), + VarianceParams(2, 4, &aom_variance4x16_sse2), + VarianceParams(2, 3, &aom_variance4x8_sse2), + VarianceParams(2, 2, &aom_variance4x4_sse2))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, AvxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0), + SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0), + SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_sse2, 0), + SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_sse2, 0), + SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_sse2, 0), + SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_sse2, 0), + SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_sse2, 0), + SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_sse2, 0), + SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_sse2, 0), + SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_sse2, 0), + SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_sse2, 0), + SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_sse2, 0), + SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_sse2, 0), + SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_sse2, 0), + SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_sse2, 0), + SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0), + + SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_sse2, 0), + SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_sse2, 0), + SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_sse2, 0), + SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_sse2, 0), + SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_sse2, 0), + SubpelVarianceParams(2, 4, 
&aom_sub_pixel_variance4x16_sse2, 0))); + +INSTANTIATE_TEST_SUITE_P( + SSE2, AvxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2, + 0), + SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_sse2, + 0), + SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_sse2, + 0), + SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_sse2, 0), + SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_sse2, 0), + SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_sse2, 0), + SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_sse2, 0), + SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_sse2, 0), + SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_sse2, 0), + SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_sse2, 0), + SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_sse2, 0), + SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_sse2, 0), + SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0), + SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0), + SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0), + SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0), + + SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_sse2, 0), + SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_sse2, 0), + SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_sse2, 0), + SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_sse2, 0), + SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_sse2, 0), + SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_sse2, + 0))); + +#if CONFIG_AV1_HIGHBITDEPTH +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AvxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(2, 2, &aom_highbd_8_sub_pixel_variance4x4_sse4_1, + 8), + SubpelVarianceParams(2, 2, &aom_highbd_10_sub_pixel_variance4x4_sse4_1, + 10), + SubpelVarianceParams(2, 2, &aom_highbd_12_sub_pixel_variance4x4_sse4_1, + 12))); + +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AvxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(2, 2, + &aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1, + 8), + SubpelAvgVarianceParams(2, 2, + &aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1, + 10), + SubpelAvgVarianceParams(2, 2, + &aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1, + 12))); +#endif // HAVE_SSE4_1 + +/* TODO(debargha): This test does not support the highbd version +INSTANTIATE_TEST_SUITE_P( + SSE2, AvxHBDMseTest, + ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sse2), + MseParams(4, 3, &aom_highbd_12_mse16x8_sse2), + MseParams(3, 4, &aom_highbd_12_mse8x16_sse2), + MseParams(3, 3, &aom_highbd_12_mse8x8_sse2), + MseParams(4, 4, &aom_highbd_10_mse16x16_sse2), + MseParams(4, 3, &aom_highbd_10_mse16x8_sse2), + MseParams(3, 4, &aom_highbd_10_mse8x16_sse2), + MseParams(3, 3, &aom_highbd_10_mse8x8_sse2), + MseParams(4, 4, &aom_highbd_8_mse16x16_sse2), + MseParams(4, 3, &aom_highbd_8_mse16x8_sse2), + MseParams(3, 4, &aom_highbd_8_mse8x16_sse2), + MseParams(3, 3, &aom_highbd_8_mse8x8_sse2))); +*/ + +const VarianceParams kArrayHBDVariance_sse2[] = { + VarianceParams(7, 7, &aom_highbd_12_variance128x128_sse2, 12), + VarianceParams(7, 6, &aom_highbd_12_variance128x64_sse2, 12), + VarianceParams(6, 7, &aom_highbd_12_variance64x128_sse2, 12), + VarianceParams(6, 6, &aom_highbd_12_variance64x64_sse2, 12), + 
VarianceParams(6, 5, &aom_highbd_12_variance64x32_sse2, 12), + VarianceParams(5, 6, &aom_highbd_12_variance32x64_sse2, 12), + VarianceParams(5, 5, &aom_highbd_12_variance32x32_sse2, 12), + VarianceParams(5, 4, &aom_highbd_12_variance32x16_sse2, 12), + VarianceParams(4, 5, &aom_highbd_12_variance16x32_sse2, 12), + VarianceParams(4, 4, &aom_highbd_12_variance16x16_sse2, 12), + VarianceParams(4, 3, &aom_highbd_12_variance16x8_sse2, 12), + VarianceParams(3, 4, &aom_highbd_12_variance8x16_sse2, 12), + VarianceParams(3, 3, &aom_highbd_12_variance8x8_sse2, 12), + VarianceParams(7, 7, &aom_highbd_10_variance128x128_sse2, 10), + VarianceParams(7, 6, &aom_highbd_10_variance128x64_sse2, 10), + VarianceParams(6, 7, &aom_highbd_10_variance64x128_sse2, 10), + VarianceParams(6, 6, &aom_highbd_10_variance64x64_sse2, 10), + VarianceParams(6, 5, &aom_highbd_10_variance64x32_sse2, 10), + VarianceParams(5, 6, &aom_highbd_10_variance32x64_sse2, 10), + VarianceParams(5, 5, &aom_highbd_10_variance32x32_sse2, 10), + VarianceParams(5, 4, &aom_highbd_10_variance32x16_sse2, 10), + VarianceParams(4, 5, &aom_highbd_10_variance16x32_sse2, 10), + VarianceParams(4, 4, &aom_highbd_10_variance16x16_sse2, 10), + VarianceParams(4, 3, &aom_highbd_10_variance16x8_sse2, 10), + VarianceParams(3, 4, &aom_highbd_10_variance8x16_sse2, 10), + VarianceParams(3, 3, &aom_highbd_10_variance8x8_sse2, 10), + VarianceParams(7, 7, &aom_highbd_8_variance128x128_sse2, 8), + VarianceParams(7, 6, &aom_highbd_8_variance128x64_sse2, 8), + VarianceParams(6, 7, &aom_highbd_8_variance64x128_sse2, 8), + VarianceParams(6, 6, &aom_highbd_8_variance64x64_sse2, 8), + VarianceParams(6, 5, &aom_highbd_8_variance64x32_sse2, 8), + VarianceParams(5, 6, &aom_highbd_8_variance32x64_sse2, 8), + VarianceParams(5, 5, &aom_highbd_8_variance32x32_sse2, 8), + VarianceParams(5, 4, &aom_highbd_8_variance32x16_sse2, 8), + VarianceParams(4, 5, &aom_highbd_8_variance16x32_sse2, 8), + VarianceParams(4, 4, &aom_highbd_8_variance16x16_sse2, 8), + VarianceParams(4, 3, &aom_highbd_8_variance16x8_sse2, 8), + VarianceParams(3, 4, &aom_highbd_8_variance8x16_sse2, 8), + VarianceParams(3, 3, &aom_highbd_8_variance8x8_sse2, 8), + + VarianceParams(6, 4, &aom_highbd_12_variance64x16_sse2, 12), + VarianceParams(4, 6, &aom_highbd_12_variance16x64_sse2, 12), + VarianceParams(5, 3, &aom_highbd_12_variance32x8_sse2, 12), + VarianceParams(3, 5, &aom_highbd_12_variance8x32_sse2, 12), + // VarianceParams(4, 2, &aom_highbd_12_variance16x4_sse2, 12), + // VarianceParams(2, 4, &aom_highbd_12_variance4x16_sse2, 12), + VarianceParams(6, 4, &aom_highbd_10_variance64x16_sse2, 10), + VarianceParams(4, 6, &aom_highbd_10_variance16x64_sse2, 10), + VarianceParams(5, 3, &aom_highbd_10_variance32x8_sse2, 10), + VarianceParams(3, 5, &aom_highbd_10_variance8x32_sse2, 10), + // VarianceParams(4, 2, &aom_highbd_10_variance16x4_sse2, 10), + // VarianceParams(2, 4, &aom_highbd_10_variance4x16_sse2, 10), + VarianceParams(6, 4, &aom_highbd_8_variance64x16_sse2, 8), + VarianceParams(4, 6, &aom_highbd_8_variance16x64_sse2, 8), + VarianceParams(5, 3, &aom_highbd_8_variance32x8_sse2, 8), + VarianceParams(3, 5, &aom_highbd_8_variance8x32_sse2, 8), + // VarianceParams(4, 2, &aom_highbd_8_variance16x4_sse2, 8), + // VarianceParams(2, 4, &aom_highbd_8_variance4x16_sse2, 8), +}; +INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDVarianceTest, + ::testing::ValuesIn(kArrayHBDVariance_sse2)); + +#if HAVE_AVX2 + +const VarianceParams kArrayHBDVariance_avx2[] = { + VarianceParams(7, 7, &aom_highbd_10_variance128x128_avx2, 10), + 
VarianceParams(7, 6, &aom_highbd_10_variance128x64_avx2, 10), + VarianceParams(6, 7, &aom_highbd_10_variance64x128_avx2, 10), + VarianceParams(6, 6, &aom_highbd_10_variance64x64_avx2, 10), + VarianceParams(6, 5, &aom_highbd_10_variance64x32_avx2, 10), + VarianceParams(5, 6, &aom_highbd_10_variance32x64_avx2, 10), + VarianceParams(5, 5, &aom_highbd_10_variance32x32_avx2, 10), + VarianceParams(5, 4, &aom_highbd_10_variance32x16_avx2, 10), + VarianceParams(4, 5, &aom_highbd_10_variance16x32_avx2, 10), + VarianceParams(4, 4, &aom_highbd_10_variance16x16_avx2, 10), + VarianceParams(4, 3, &aom_highbd_10_variance16x8_avx2, 10), + VarianceParams(3, 4, &aom_highbd_10_variance8x16_avx2, 10), + VarianceParams(3, 3, &aom_highbd_10_variance8x8_avx2, 10), +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, AvxHBDVarianceTest, + ::testing::ValuesIn(kArrayHBDVariance_avx2)); +#endif // HAVE_AVX2 + +const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = { + SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_sse2, 12), + SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_sse2, 12), + SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_sse2, 12), + SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_sse2, 12), + SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_sse2, 12), + SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_sse2, 12), + SubpelVarianceParams(5, 5, &aom_highbd_12_sub_pixel_variance32x32_sse2, 12), + SubpelVarianceParams(5, 4, &aom_highbd_12_sub_pixel_variance32x16_sse2, 12), + SubpelVarianceParams(4, 5, &aom_highbd_12_sub_pixel_variance16x32_sse2, 12), + SubpelVarianceParams(4, 4, &aom_highbd_12_sub_pixel_variance16x16_sse2, 12), + SubpelVarianceParams(4, 3, &aom_highbd_12_sub_pixel_variance16x8_sse2, 12), + SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_sse2, 12), + SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_sse2, 12), + SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_sse2, 12), + SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_sse2, 10), + SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_sse2, 10), + SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_sse2, 10), + SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_sse2, 10), + SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_sse2, 10), + SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_sse2, 10), + SubpelVarianceParams(5, 5, &aom_highbd_10_sub_pixel_variance32x32_sse2, 10), + SubpelVarianceParams(5, 4, &aom_highbd_10_sub_pixel_variance32x16_sse2, 10), + SubpelVarianceParams(4, 5, &aom_highbd_10_sub_pixel_variance16x32_sse2, 10), + SubpelVarianceParams(4, 4, &aom_highbd_10_sub_pixel_variance16x16_sse2, 10), + SubpelVarianceParams(4, 3, &aom_highbd_10_sub_pixel_variance16x8_sse2, 10), + SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_sse2, 10), + SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_sse2, 10), + SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_sse2, 10), + SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_sse2, 8), + SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_sse2, 8), + SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_sse2, 8), + SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_sse2, 8), + SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_sse2, 8), + 
SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_sse2, 8), + SubpelVarianceParams(5, 5, &aom_highbd_8_sub_pixel_variance32x32_sse2, 8), + SubpelVarianceParams(5, 4, &aom_highbd_8_sub_pixel_variance32x16_sse2, 8), + SubpelVarianceParams(4, 5, &aom_highbd_8_sub_pixel_variance16x32_sse2, 8), + SubpelVarianceParams(4, 4, &aom_highbd_8_sub_pixel_variance16x16_sse2, 8), + SubpelVarianceParams(4, 3, &aom_highbd_8_sub_pixel_variance16x8_sse2, 8), + SubpelVarianceParams(3, 4, &aom_highbd_8_sub_pixel_variance8x16_sse2, 8), + SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_sse2, 8), + SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_sse2, 8), + + SubpelVarianceParams(6, 4, &aom_highbd_12_sub_pixel_variance64x16_sse2, 12), + SubpelVarianceParams(4, 6, &aom_highbd_12_sub_pixel_variance16x64_sse2, 12), + SubpelVarianceParams(5, 3, &aom_highbd_12_sub_pixel_variance32x8_sse2, 12), + SubpelVarianceParams(3, 5, &aom_highbd_12_sub_pixel_variance8x32_sse2, 12), + SubpelVarianceParams(4, 2, &aom_highbd_12_sub_pixel_variance16x4_sse2, 12), + // SubpelVarianceParams(2, 4, &aom_highbd_12_sub_pixel_variance4x16_sse2, 12), + SubpelVarianceParams(6, 4, &aom_highbd_10_sub_pixel_variance64x16_sse2, 10), + SubpelVarianceParams(4, 6, &aom_highbd_10_sub_pixel_variance16x64_sse2, 10), + SubpelVarianceParams(5, 3, &aom_highbd_10_sub_pixel_variance32x8_sse2, 10), + SubpelVarianceParams(3, 5, &aom_highbd_10_sub_pixel_variance8x32_sse2, 10), + SubpelVarianceParams(4, 2, &aom_highbd_10_sub_pixel_variance16x4_sse2, 10), + // SubpelVarianceParams(2, 4, &aom_highbd_10_sub_pixel_variance4x16_sse2, 10), + SubpelVarianceParams(6, 4, &aom_highbd_8_sub_pixel_variance64x16_sse2, 8), + SubpelVarianceParams(4, 6, &aom_highbd_8_sub_pixel_variance16x64_sse2, 8), + SubpelVarianceParams(5, 3, &aom_highbd_8_sub_pixel_variance32x8_sse2, 8), + SubpelVarianceParams(3, 5, &aom_highbd_8_sub_pixel_variance8x32_sse2, 8), + SubpelVarianceParams(4, 2, &aom_highbd_8_sub_pixel_variance16x4_sse2, 8), + // SubpelVarianceParams(2, 4, &aom_highbd_8_sub_pixel_variance4x16_sse2, 8), +}; +INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelVarianceTest, + ::testing::ValuesIn(kArrayHBDSubpelVariance_sse2)); + +const SubpelAvgVarianceParams kArrayHBDSubpelAvgVariance_sse2[] = { + SubpelAvgVarianceParams(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_sse2, + 12), + SubpelAvgVarianceParams(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_sse2, + 12), + SubpelAvgVarianceParams(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_sse2, + 12), + SubpelAvgVarianceParams(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_sse2, + 12), + SubpelAvgVarianceParams(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_sse2, + 12), + SubpelAvgVarianceParams(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_sse2, + 12), + SubpelAvgVarianceParams(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_sse2, + 12), + SubpelAvgVarianceParams(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_sse2, + 12), + SubpelAvgVarianceParams(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_sse2, + 12), + SubpelAvgVarianceParams(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_sse2, + 12), + SubpelAvgVarianceParams(3, 2, &aom_highbd_12_sub_pixel_avg_variance8x4_sse2, + 12), + SubpelAvgVarianceParams(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_sse2, + 10), + SubpelAvgVarianceParams(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_sse2, + 10), + SubpelAvgVarianceParams(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_sse2, + 10), + SubpelAvgVarianceParams(5, 5, 
&aom_highbd_10_sub_pixel_avg_variance32x32_sse2, + 10), + SubpelAvgVarianceParams(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_sse2, + 10), + SubpelAvgVarianceParams(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_sse2, + 10), + SubpelAvgVarianceParams(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_sse2, + 10), + SubpelAvgVarianceParams(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_sse2, + 10), + SubpelAvgVarianceParams(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_sse2, + 10), + SubpelAvgVarianceParams(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_sse2, + 10), + SubpelAvgVarianceParams(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_sse2, + 10), + SubpelAvgVarianceParams(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_sse2, + 8), + SubpelAvgVarianceParams(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_sse2, + 8), + SubpelAvgVarianceParams(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_sse2, + 8), + SubpelAvgVarianceParams(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_sse2, + 8), + SubpelAvgVarianceParams(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_sse2, + 8), + SubpelAvgVarianceParams(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_sse2, + 8), + SubpelAvgVarianceParams(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_sse2, + 8), + SubpelAvgVarianceParams(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_sse2, + 8), + SubpelAvgVarianceParams(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_sse2, + 8), + SubpelAvgVarianceParams(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_sse2, + 8), + SubpelAvgVarianceParams(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_sse2, + 8), + + SubpelAvgVarianceParams(6, 4, &aom_highbd_12_sub_pixel_avg_variance64x16_sse2, + 12), + SubpelAvgVarianceParams(4, 6, &aom_highbd_12_sub_pixel_avg_variance16x64_sse2, + 12), + SubpelAvgVarianceParams(5, 3, &aom_highbd_12_sub_pixel_avg_variance32x8_sse2, + 12), + SubpelAvgVarianceParams(3, 5, &aom_highbd_12_sub_pixel_avg_variance8x32_sse2, + 12), + SubpelAvgVarianceParams(4, 2, &aom_highbd_12_sub_pixel_avg_variance16x4_sse2, + 12), + // SubpelAvgVarianceParams(2, 4, + // &aom_highbd_12_sub_pixel_avg_variance4x16_sse2, 12), + SubpelAvgVarianceParams(6, 4, &aom_highbd_10_sub_pixel_avg_variance64x16_sse2, + 10), + SubpelAvgVarianceParams(4, 6, &aom_highbd_10_sub_pixel_avg_variance16x64_sse2, + 10), + SubpelAvgVarianceParams(5, 3, &aom_highbd_10_sub_pixel_avg_variance32x8_sse2, + 10), + SubpelAvgVarianceParams(3, 5, &aom_highbd_10_sub_pixel_avg_variance8x32_sse2, + 10), + SubpelAvgVarianceParams(4, 2, &aom_highbd_10_sub_pixel_avg_variance16x4_sse2, + 10), + // SubpelAvgVarianceParams(2, 4, + // &aom_highbd_10_sub_pixel_avg_variance4x16_sse2, 10), + SubpelAvgVarianceParams(6, 4, &aom_highbd_8_sub_pixel_avg_variance64x16_sse2, + 8), + SubpelAvgVarianceParams(4, 6, &aom_highbd_8_sub_pixel_avg_variance16x64_sse2, + 8), + SubpelAvgVarianceParams(5, 3, &aom_highbd_8_sub_pixel_avg_variance32x8_sse2, + 8), + SubpelAvgVarianceParams(3, 5, &aom_highbd_8_sub_pixel_avg_variance8x32_sse2, + 8), + SubpelAvgVarianceParams(4, 2, &aom_highbd_8_sub_pixel_avg_variance16x4_sse2, + 8), + // SubpelAvgVarianceParams(2, 4, + // &aom_highbd_8_sub_pixel_avg_variance4x16_sse2, 8), +}; + +INSTANTIATE_TEST_SUITE_P(SSE2, AvxHBDSubpelAvgVarianceTest, + ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_sse2)); +#endif // HAVE_SSE2 +#endif // CONFIG_AV1_HIGHBITDEPTH + +#if HAVE_SSSE3 +INSTANTIATE_TEST_SUITE_P( + SSSE3, AvxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_ssse3, 0), + 
SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_ssse3, 0), + SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_ssse3, 0), + SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_ssse3, 0), + SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_ssse3, 0), + SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_ssse3, 0), + SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_ssse3, 0), + SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_ssse3, 0), + SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_ssse3, 0), + SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_ssse3, 0), + SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_ssse3, 0), + SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_ssse3, 0), + SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_ssse3, 0), + SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_ssse3, 0), + SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_ssse3, 0), + SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_ssse3, 0), + + SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_ssse3, 0), + SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_ssse3, 0), + SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_ssse3, 0), + SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_ssse3, 0), + SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_ssse3, 0), + SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_ssse3, 0))); + +INSTANTIATE_TEST_SUITE_P( + SSSE3, AvxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_ssse3, + 0), + SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_ssse3, + 0), + SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_ssse3, + 0), + SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_ssse3, + 0), + SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_ssse3, + 0), + SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_ssse3, + 0), + SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_ssse3, 0), + SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_ssse3, 0), + SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_ssse3, 0), + SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_ssse3, 0), + SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_ssse3, 0), + SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_ssse3, 0), + + SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_ssse3, + 0), + SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_ssse3, 0), + SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_ssse3, 0), + SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_ssse3, 0), + SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_ssse3, + 0))); + +INSTANTIATE_TEST_SUITE_P( + SSSE3, AvxDistWtdSubpelAvgVarianceTest, + ::testing::Values( + DistWtdSubpelAvgVarianceParams( + 7, 7, &aom_dist_wtd_sub_pixel_avg_variance128x128_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 7, 6, &aom_dist_wtd_sub_pixel_avg_variance128x64_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 6, 7, 
&aom_dist_wtd_sub_pixel_avg_variance64x128_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0), + + DistWtdSubpelAvgVarianceParams( + 6, 4, &aom_dist_wtd_sub_pixel_avg_variance64x16_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 6, &aom_dist_wtd_sub_pixel_avg_variance16x64_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 5, 3, &aom_dist_wtd_sub_pixel_avg_variance32x8_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 3, 5, &aom_dist_wtd_sub_pixel_avg_variance8x32_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 2, &aom_dist_wtd_sub_pixel_avg_variance16x4_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 2, 4, &aom_dist_wtd_sub_pixel_avg_variance4x16_ssse3, 0))); +#endif // HAVE_SSSE3 + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AvxObmcSubpelVarianceTest, + ::testing::Values( + ObmcSubpelVarianceParams(7, 7, + &aom_obmc_sub_pixel_variance128x128_sse4_1, 0), + ObmcSubpelVarianceParams(7, 6, + &aom_obmc_sub_pixel_variance128x64_sse4_1, 0), + ObmcSubpelVarianceParams(6, 7, + &aom_obmc_sub_pixel_variance64x128_sse4_1, 0), + ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_sse4_1, + 0), + ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_sse4_1, + 0), + ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_sse4_1, + 0), + ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_sse4_1, + 0), + ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_sse4_1, + 0), + ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_sse4_1, + 0), + ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_sse4_1, + 0), + ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_sse4_1, + 0), + ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_sse4_1, + 0), + ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_sse4_1, + 0), + + ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_sse4_1, + 0), + ObmcSubpelVarianceParams(5, 3, &aom_obmc_sub_pixel_variance32x8_sse4_1, + 0), + ObmcSubpelVarianceParams(3, 5, 
&aom_obmc_sub_pixel_variance8x32_sse4_1, + 0), + ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_sse4_1, + 0), + ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_sse4_1, + 0))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, AvxMseTest, + ::testing::Values(MseParams(4, 4, + &aom_mse16x16_avx2))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, AvxVarianceTest, + ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_avx2), + VarianceParams(7, 6, &aom_variance128x64_avx2), + VarianceParams(6, 7, &aom_variance64x128_avx2), + VarianceParams(6, 6, &aom_variance64x64_avx2), + VarianceParams(6, 5, &aom_variance64x32_avx2), + VarianceParams(6, 4, &aom_variance64x16_avx2), + VarianceParams(5, 6, &aom_variance32x64_avx2), + VarianceParams(5, 5, &aom_variance32x32_avx2), + VarianceParams(5, 4, &aom_variance32x16_avx2), + VarianceParams(5, 3, &aom_variance32x8_avx2), + VarianceParams(4, 6, &aom_variance16x64_avx2), + VarianceParams(4, 5, &aom_variance16x32_avx2), + VarianceParams(4, 4, &aom_variance16x16_avx2), + VarianceParams(4, 3, &aom_variance16x8_avx2), + VarianceParams(4, 2, &aom_variance16x4_avx2))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, AvxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_avx2, 0), + SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_avx2, 0), + SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_avx2, 0), + SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_avx2, 0), + SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0), + SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0), + SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0), + SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0), + SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_avx2, 0), + SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_avx2, 0), + SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_avx2, 0), + SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_avx2, 0), + SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_avx2, 0))); + +INSTANTIATE_TEST_SUITE_P( + AVX2, AvxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_avx2, + 0), + SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_avx2, + 0), + SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_avx2, + 0), + SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_avx2, 0), + SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_avx2, 0), + SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_avx2, 0), + SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_avx2, 0), + SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_avx2, + 0))); +#endif // HAVE_AVX2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, AvxSseTest, + ::testing::Values(SseParams(2, 2, + &aom_get4x4sse_cs_neon))); + +INSTANTIATE_TEST_SUITE_P(NEON, AvxMseTest, + ::testing::Values(MseParams(4, 4, + &aom_mse16x16_neon))); + +INSTANTIATE_TEST_SUITE_P( + NEON, AvxVarianceTest, + ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_neon), + VarianceParams(6, 6, &aom_variance64x64_neon), + VarianceParams(6, 5, &aom_variance64x32_neon), + VarianceParams(5, 6, &aom_variance32x64_neon), + VarianceParams(5, 5, &aom_variance32x32_neon), + VarianceParams(4, 4, &aom_variance16x16_neon), + VarianceParams(4, 3, &aom_variance16x8_neon), + VarianceParams(3, 4, &aom_variance8x16_neon), + 
VarianceParams(3, 3, &aom_variance8x8_neon))); + +INSTANTIATE_TEST_SUITE_P( + NEON, AvxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_neon, 0), + SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_neon, 0), + SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_neon, 0), + SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_neon, 0))); +#endif // HAVE_NEON + +#if HAVE_MSA +INSTANTIATE_TEST_SUITE_P(MSA, SumOfSquaresTest, + ::testing::Values(aom_get_mb_ss_msa)); + +INSTANTIATE_TEST_SUITE_P(MSA, AvxSseTest, + ::testing::Values(SseParams(2, 2, + &aom_get4x4sse_cs_msa))); + +INSTANTIATE_TEST_SUITE_P(MSA, AvxMseTest, + ::testing::Values(MseParams(4, 4, &aom_mse16x16_msa), + MseParams(4, 3, &aom_mse16x8_msa), + MseParams(3, 4, &aom_mse8x16_msa), + MseParams(3, 3, &aom_mse8x8_msa))); + +INSTANTIATE_TEST_SUITE_P( + MSA, AvxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_msa), + VarianceParams(6, 5, &aom_variance64x32_msa), + VarianceParams(5, 6, &aom_variance32x64_msa), + VarianceParams(5, 5, &aom_variance32x32_msa), + VarianceParams(5, 4, &aom_variance32x16_msa), + VarianceParams(4, 5, &aom_variance16x32_msa), + VarianceParams(4, 4, &aom_variance16x16_msa), + VarianceParams(4, 3, &aom_variance16x8_msa), + VarianceParams(3, 4, &aom_variance8x16_msa), + VarianceParams(3, 3, &aom_variance8x8_msa), + VarianceParams(3, 2, &aom_variance8x4_msa), + VarianceParams(2, 3, &aom_variance4x8_msa), + VarianceParams(2, 2, &aom_variance4x4_msa))); + +INSTANTIATE_TEST_SUITE_P( + MSA, AvxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_msa, 0), + SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_msa, 0), + SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_msa, 0), + SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_msa, 0), + SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_msa, 0), + SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_msa, 0), + SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_msa, 0), + SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_msa, 0), + SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_msa, 0), + SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_msa, 0), + SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_msa, 0), + SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_msa, 0), + SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_msa, 0))); + +INSTANTIATE_TEST_SUITE_P( + MSA, AvxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &aom_sub_pixel_avg_variance64x64_msa, 0), + SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_msa, 0), + SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_msa, 0), + SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_msa, 0), + SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_msa, 0), + SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_msa, 0), + SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_msa, 0), + SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_msa, 0), + SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_msa, 0), + SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_msa, 0), + SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_msa, 0), + SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_msa, 0), + SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_msa, 0))); +#endif // HAVE_MSA +} // namespace 
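The tables above only wire architecture-specific kernels into a shared, parameterized harness; the first two arguments of each entry are log2(width) and log2(height), so VarianceParams(7, 7, ...) exercises a 128x128 block, and each test essentially checks that the optimized kernel matches the C reference bit-exactly. A minimal sketch of that reference computation for the 8-bit case (illustrative names, not libaom's API):

```cpp
#include <cstdint>

// Sketch of the reference computation a VarianceParams(log2w, log2h, fn)
// entry is compared against; names are illustrative, not libaom's API.
// Block dimensions come from the first two params: w = 1 << log2w,
// h = 1 << log2h.
static uint32_t ReferenceVariance(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  int w, int h, uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int diff = src[y * src_stride + x] - ref[y * ref_stride + x];
      sum += diff;                                  // signed sum of diffs
      sse64 += static_cast<uint64_t>(diff * diff);  // sum of squared diffs
    }
  }
  *sse = static_cast<uint32_t>(sse64);
  // variance = SSE - sum^2 / N, with N = w * h samples.
  return static_cast<uint32_t>(sse64 -
                               static_cast<uint64_t>((sum * sum) / (w * h)));
}
```

The trailing 8/10/12 argument in the high-bit-depth tables plays the same role at higher bit depths, with 16-bit samples in place of uint8_t.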
diff --git a/libs/libaom/src/test/video_source.h b/libs/libaom/src/test/video_source.h new file mode 100644 index 000000000..3c1c5e559 --- /dev/null +++ b/libs/libaom/src/test/video_source.h @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_TEST_VIDEO_SOURCE_H_ +#define AOM_TEST_VIDEO_SOURCE_H_ + +#if defined(_WIN32) +#undef NOMINMAX +#define NOMINMAX +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#endif +#include <cstdio> +#include <cstdlib> +#include <string> +#include "test/acm_random.h" +#include "aom/aom_encoder.h" + +namespace libaom_test { + +// Helper macros to ensure LIBAOM_TEST_DATA_PATH is a quoted string. +// These are undefined right below GetDataPath. +// NOTE: LIBAOM_TEST_DATA_PATH MUST NOT be a quoted string before +// stringification, or GetDataPath will fail at runtime. +#define TO_STRING(S) #S +#define STRINGIFY(S) TO_STRING(S) + +// A simple function to encapsulate cross-platform retrieval of the test data +// path. +static std::string GetDataPath() { + const char *const data_path = getenv("LIBAOM_TEST_DATA_PATH"); + if (data_path == NULL) { +#ifdef LIBAOM_TEST_DATA_PATH + // In some environments, we cannot set environment variables. + // Instead, we set the data path by using a preprocessor symbol + // which can be set from make files. + return STRINGIFY(LIBAOM_TEST_DATA_PATH); +#else + return "."; +#endif + } + return data_path; +} + +// Undefine the stringification macros because they are not used elsewhere. +#undef TO_STRING +#undef STRINGIFY + +inline FILE *OpenTestDataFile(const std::string &file_name) { + const std::string path_to_source = GetDataPath() + "/" + file_name; + return fopen(path_to_source.c_str(), "rb"); +} + +static FILE *GetTempOutFile(std::string *file_name) { + file_name->clear(); +#if defined(_WIN32) + char fname[MAX_PATH]; + char tmppath[MAX_PATH]; + if (GetTempPathA(MAX_PATH, tmppath)) { + // Assume for now that the filename generated is unique per process + if (GetTempFileNameA(tmppath, "lvx", 0, fname)) { + file_name->assign(fname); + return fopen(fname, "wb+"); + } + } + return NULL; +#else + char name_template[] = "/tmp/libaomtest.XXXXXX"; + const int fd = mkstemp(name_template); + *file_name = name_template; + return fdopen(fd, "wb+"); +#endif +} + +class TempOutFile { + public: + TempOutFile() { file_ = GetTempOutFile(&file_name_); } + ~TempOutFile() { + CloseFile(); + if (!file_name_.empty()) { + EXPECT_EQ(0, remove(file_name_.c_str())); + } + } + FILE *file() { return file_; } + const std::string &file_name() { return file_name_; } + + protected: + void CloseFile() { + if (file_) { + fclose(file_); + file_ = NULL; + } + } + FILE *file_; + std::string file_name_; +}; + +// Abstract base class for test video sources, which provide a stream of +// aom_image_t images with associated timestamps and duration. +class VideoSource { + public: + virtual ~VideoSource() {} + + // Prepare the stream for reading, rewind/open as necessary.
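+ // A typical encoder test drives the source roughly like this (a sketch; + // the real loop lives in the encode test driver): + // + // for (video->Begin(); video->img() != NULL; video->Next()) { + // EncodeFrame(video->img(), video->pts()); // EncodeFrame illustrative + // } + //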
+ virtual void Begin() = 0; + + // Advance the cursor to the next frame + virtual void Next() = 0; + + // Get the current video frame, or NULL on End-Of-Stream. + virtual aom_image_t *img() const = 0; + + // Get the presentation timestamp of the current frame. + virtual aom_codec_pts_t pts() const = 0; + + // Get the current frame's duration + virtual unsigned long duration() const = 0; + + // Get the timebase for the stream + virtual aom_rational_t timebase() const = 0; + + // Get the current frame counter, starting at 0. + virtual unsigned int frame() const = 0; + + // Get the current file limit. + virtual unsigned int limit() const = 0; +}; + +class DummyVideoSource : public VideoSource { + public: + DummyVideoSource() + : img_(NULL), limit_(100), width_(80), height_(64), + format_(AOM_IMG_FMT_I420) { + ReallocImage(); + } + + virtual ~DummyVideoSource() { aom_img_free(img_); } + + virtual void Begin() { + frame_ = 0; + FillFrame(); + } + + virtual void Next() { + ++frame_; + FillFrame(); + } + + virtual aom_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; } + + // Models a stream where Timebase = 1/FPS, so pts == frame. + virtual aom_codec_pts_t pts() const { return frame_; } + + virtual unsigned long duration() const { return 1; } + + virtual aom_rational_t timebase() const { + const aom_rational_t t = { 1, 30 }; + return t; + } + + virtual unsigned int frame() const { return frame_; } + + virtual unsigned int limit() const { return limit_; } + + void set_limit(unsigned int limit) { limit_ = limit; } + + void SetSize(unsigned int width, unsigned int height) { + if (width != width_ || height != height_) { + width_ = width; + height_ = height; + ReallocImage(); + } + } + + void SetImageFormat(aom_img_fmt_t format) { + if (format_ != format) { + format_ = format; + ReallocImage(); + } + } + + protected: + virtual void FillFrame() { + if (img_) memset(img_->img_data, 0, raw_sz_); + } + + void ReallocImage() { + aom_img_free(img_); + img_ = aom_img_alloc(NULL, format_, width_, height_, 32); + raw_sz_ = ((img_->w + 31) & ~31) * img_->h * img_->bps / 8; + } + + aom_image_t *img_; + size_t raw_sz_; + unsigned int limit_; + unsigned int frame_; + unsigned int width_; + unsigned int height_; + aom_img_fmt_t format_; +}; + +class RandomVideoSource : public DummyVideoSource { + public: + RandomVideoSource(int seed = ACMRandom::DeterministicSeed()) + : rnd_(seed), seed_(seed) {} + + protected: + // Reset the RNG to get a matching stream for the second pass + virtual void Begin() { + frame_ = 0; + rnd_.Reset(seed_); + FillFrame(); + } + + // 15 frames of noise, followed by 15 static frames. Reset to 0 rather + // than holding previous frames to encourage keyframes to be thrown. + virtual void FillFrame() { + if (img_) { + if (frame_ % 30 < 15) + for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8(); + else + memset(img_->img_data, 0, raw_sz_); + } + } + + ACMRandom rnd_; + int seed_; +}; + +// Abstract base class for test video sources, which provide a stream of +// compressed data to the decoder. +class CompressedVideoSource { + public: + virtual ~CompressedVideoSource() {} + + virtual void Init() = 0; + + // Prepare the stream for reading, rewind/open as necessary.
+ virtual void Begin() = 0; + + // Advance the cursor to the next frame + virtual void Next() = 0; + + virtual const uint8_t *cxdata() const = 0; + + virtual size_t frame_size() const = 0; + + virtual unsigned int frame_number() const = 0; +}; + +} // namespace libaom_test + +#endif // AOM_TEST_VIDEO_SOURCE_H_ diff --git a/libs/libaom/src/test/visual_metrics.py b/libs/libaom/src/test/visual_metrics.py new file mode 100644 index 000000000..9055feb33 --- /dev/null +++ b/libs/libaom/src/test/visual_metrics.py @@ -0,0 +1,466 @@ +#!/usr/bin/python +# +# Copyright (c) 2016, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and +# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +# was not distributed with this source code in the LICENSE file, you can +# obtain it at www.aomedia.org/license/software. If the Alliance for Open +# Media Patent License 1.0 was not distributed with this source code in the +# PATENTS file, you can obtain it at www.aomedia.org/license/patent. +# + +"""Converts video encoding result data from text files to a visualization +data source.""" + +__author__ = "jzern@google.com (James Zern)," +__author__ += "jimbankoski@google.com (Jim Bankoski)" + +import fnmatch +import numpy as np +import scipy as sp +import scipy.interpolate +import os +import re +import string +import sys +import math +import warnings + +import gviz_api + +from os.path import basename +from os.path import splitext + +warnings.simplefilter('ignore', np.RankWarning) +warnings.simplefilter('ignore', RuntimeWarning) + +def bdsnr2(metric_set1, metric_set2): + """ + BJONTEGAARD Bjontegaard metric calculation, adapted. + Bjontegaard's SNR metric allows one to compute the average gain in + decibels between two rate-distortion curves [1]. This is an adaptation of + that method which fixes inconsistencies when the curve fit goes awry: the + curve fit is replaced with a Piecewise Cubic Hermite Interpolating + Polynomial, which is then integrated by evaluating it at small intervals + and applying the trapezoid method. + + metric_set1 - list of tuples ( bitrate, metric ) for first graph + metric_set2 - list of tuples ( bitrate, metric ) for second graph + """ + + if not metric_set1 or not metric_set2: + return 0.0 + + try: + + # pchip_interpolate requires keys sorted by the x axis. Here the x axis + # is the (log) bitrate, so sort by bitrate, the first tuple element. + metric_set1.sort() + metric_set2.sort() + + # Pull the log of the rate and clamped psnr from metric_sets. + log_rate1 = [math.log(x[0]) for x in metric_set1] + metric1 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set1] + log_rate2 = [math.log(x[0]) for x in metric_set2] + metric2 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set2] + + # Integration interval. This metric only works on the area that's + # overlapping. Extrapolation of these things is sketchy, so we avoid it. + min_int = max([min(log_rate1), min(log_rate2)]) + max_int = min([max(log_rate1), max(log_rate2)]) + + # No overlap means no sensible metric possible. + if max_int <= min_int: + return 0.0 + + # Use Piecewise Cubic Hermite Interpolating Polynomial interpolation to + # create 100 new sample points separated by a fixed interval.
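+ # Concretely, letting f1 and f2 be the PCHIP fits of the two curves over + # the overlap [min_int, max_int], the quantity computed below is + # avg_exp_diff = (trapz(f2) - trapz(f1)) / (max_int - min_int), + # i.e. the average vertical gap between the two fitted curves.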
+ lin = np.linspace(min_int, max_int, num=100, retstep=True) + interval = lin[1] + samples = lin[0] + v1 = scipy.interpolate.pchip_interpolate(log_rate1, metric1, samples) + v2 = scipy.interpolate.pchip_interpolate(log_rate2, metric2, samples) + + # Calculate the integral using the trapezoid method on the samples. + int_v1 = np.trapz(v1, dx=interval) + int_v2 = np.trapz(v2, dx=interval) + + # Calculate the average improvement. + avg_exp_diff = (int_v2 - int_v1) / (max_int - min_int) + + except (TypeError, ZeroDivisionError, ValueError, np.RankWarning): + return 0.0 + + return avg_exp_diff + +def bdrate2(metric_set1, metric_set2): + """ + BJONTEGAARD Bjontegaard metric calculation, adapted. + Bjontegaard's metric allows one to compute the average % saving in bitrate + between two rate-distortion curves [1]. This is an adaptation of that + method which fixes inconsistencies when the curve fit goes awry: the curve + fit is replaced with a Piecewise Cubic Hermite Interpolating Polynomial, + which is then integrated by evaluating it at small intervals and applying + the trapezoid method. + + metric_set1 - list of tuples ( bitrate, metric ) for first graph + metric_set2 - list of tuples ( bitrate, metric ) for second graph + """ + + if not metric_set1 or not metric_set2: + return 0.0 + + try: + + # pchip_interpolate requires keys sorted by the x axis. Here the x axis + # is the metric, not the bitrate, so sort by metric. + metric_set1.sort(key=lambda tup: tup[1]) + metric_set2.sort(key=lambda tup: tup[1]) + + # Pull the log of the rate and clamped psnr from metric_sets. + log_rate1 = [math.log(x[0]) for x in metric_set1] + metric1 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set1] + log_rate2 = [math.log(x[0]) for x in metric_set2] + metric2 = [100.0 if x[1] == float('inf') else x[1] for x in metric_set2] + + # Integration interval. This metric only works on the area that's + # overlapping. Extrapolation of these things is sketchy, so we avoid it. + min_int = max([min(metric1), min(metric2)]) + max_int = min([max(metric1), max(metric2)]) + + # No overlap means no sensible metric possible. + if max_int <= min_int: + return 0.0 + + # Use Piecewise Cubic Hermite Interpolating Polynomial interpolation to + # create 100 new sample points separated by a fixed interval. + lin = np.linspace(min_int, max_int, num=100, retstep=True) + interval = lin[1] + samples = lin[0] + v1 = scipy.interpolate.pchip_interpolate(metric1, log_rate1, samples) + v2 = scipy.interpolate.pchip_interpolate(metric2, log_rate2, samples) + + # Calculate the integral using the trapezoid method on the samples. + int_v1 = np.trapz(v1, dx=interval) + int_v2 = np.trapz(v2, dx=interval) + + # Calculate the average improvement. + avg_exp_diff = (int_v2 - int_v1) / (max_int - min_int) + + except (TypeError, ZeroDivisionError, ValueError, np.RankWarning): + return 0.0 + + # Convert to a percentage. + avg_diff = (math.exp(avg_exp_diff) - 1) * 100 + + return avg_diff + + + +def FillForm(string_for_substitution, dictionary_of_vars): + """ + Substitutes each //%%NAME%%// marker in the template string with the value + of the corresponding variable NAME from dictionary_of_vars. + """ + return_string = string_for_substitution + for i in re.findall("//%%(.*)%%//", string_for_substitution): + return_string = re.sub("//%%" + i + "%%//", dictionary_of_vars[i], + return_string) + return return_string + + +def HasMetrics(line): + """ + Returns true if the line looks like a data row. Header lines in the + metrics files produced by aomenc start with a 'B' (for "Bitrate"), while + data rows start with a digit.
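+ + For example, the line "25.911 38.242 38.104 38.258 38.121 75.790 14103" + from the sample metrics file shown under HandleFiles is a data row: its + first token begins with a digit.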
+ """ + # If the first char of the first word on the line is a digit + if len(line) == 0: + return False + if len(line.split()) == 0: + return False + if line.split()[0][0:1].isdigit(): + return True + return False + +def GetMetrics(file_name): + metric_file = open(file_name, "r") + return metric_file.readline().split(); + +def ParseMetricFile(file_name, metric_column): + metric_set1 = set([]) + metric_file = open(file_name, "r") + for line in metric_file: + metrics = string.split(line) + if HasMetrics(line): + if metric_column < len(metrics): + try: + tuple = float(metrics[0]), float(metrics[metric_column]) + except: + tuple = float(metrics[0]), 0 + else: + tuple = float(metrics[0]), 0 + metric_set1.add(tuple) + metric_set1_sorted = sorted(metric_set1) + return metric_set1_sorted + + +def FileBetter(file_name_1, file_name_2, metric_column, method): + """ + Compares two data files and determines which is better and by how + much. Also produces a histogram of how much better, by PSNR. + metric_column is the metric. + """ + # Store and parse our two files into lists of unique tuples. + + # Read the two files, parsing out lines starting with bitrate. + metric_set1_sorted = ParseMetricFile(file_name_1, metric_column) + metric_set2_sorted = ParseMetricFile(file_name_2, metric_column) + + + def GraphBetter(metric_set1_sorted, metric_set2_sorted, base_is_set_2): + """ + Search through the sorted metric file for metrics on either side of + the metric from file 1. Since both lists are sorted we really + should not have to search through the entire range, but these + are small files.""" + total_bitrate_difference_ratio = 0.0 + count = 0 + for bitrate, metric in metric_set1_sorted: + if bitrate == 0: + continue + for i in range(len(metric_set2_sorted) - 1): + s2_bitrate_0, s2_metric_0 = metric_set2_sorted[i] + s2_bitrate_1, s2_metric_1 = metric_set2_sorted[i + 1] + # We have a point on either side of our metric range. + if metric > s2_metric_0 and metric <= s2_metric_1: + + # Calculate a slope. + if s2_metric_1 - s2_metric_0 != 0: + metric_slope = ((s2_bitrate_1 - s2_bitrate_0) / + (s2_metric_1 - s2_metric_0)) + else: + metric_slope = 0 + + estimated_s2_bitrate = (s2_bitrate_0 + (metric - s2_metric_0) * + metric_slope) + + if estimated_s2_bitrate == 0: + continue + # Calculate percentage difference as given by base. + if base_is_set_2 == 0: + bitrate_difference_ratio = ((bitrate - estimated_s2_bitrate) / + bitrate) + else: + bitrate_difference_ratio = ((bitrate - estimated_s2_bitrate) / + estimated_s2_bitrate) + + total_bitrate_difference_ratio += bitrate_difference_ratio + count += 1 + break + + # Calculate the average improvement between graphs. + if count != 0: + avg = total_bitrate_difference_ratio / count + + else: + avg = 0.0 + + return avg + + # Be fair to both graphs by testing all the points in each. + if method == 'avg': + avg_improvement = 50 * ( + GraphBetter(metric_set1_sorted, metric_set2_sorted, 1) - + GraphBetter(metric_set2_sorted, metric_set1_sorted, 0)) + elif method == 'dsnr': + avg_improvement = bdsnr2(metric_set1_sorted, metric_set2_sorted) + else: + avg_improvement = bdrate2(metric_set2_sorted, metric_set1_sorted) + + return avg_improvement + + +def HandleFiles(variables): + """ + This script creates html for displaying metric data produced from data + in a video stats file, as created by the AOM project when enable_psnr + is turned on: + + Usage: visual_metrics.py template.html pattern base_dir sub_dir [ sub_dir2 ..] 
+ + The script parses each metrics file [see below] that matches the + statfile_pattern in the baseline directory and looks for the file that + matches that same file in each of the sub_dirs, and compares the resultant + metrics bitrate, avg psnr, glb psnr, and ssim. " + + It provides a table in which each row is a file in the line directory, + and a column for each subdir, with the cells representing how that clip + compares to baseline for that subdir. A graph is given for each which + compares filesize to that metric. If you click on a point in the graph it + zooms in on that point. + + a SAMPLE metrics file: + + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 25.911 38.242 38.104 38.258 38.121 75.790 14103 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 49.982 41.264 41.129 41.255 41.122 83.993 19817 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 74.967 42.911 42.767 42.899 42.756 87.928 17332 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 100.012 43.983 43.838 43.881 43.738 89.695 25389 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 149.980 45.338 45.203 45.184 45.043 91.591 25438 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 199.852 46.225 46.123 46.113 45.999 92.679 28302 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 249.922 46.864 46.773 46.777 46.673 93.334 27244 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 299.998 47.366 47.281 47.317 47.220 93.844 27137 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 349.769 47.746 47.677 47.722 47.648 94.178 32226 + Bitrate AVGPsnr GLBPsnr AVPsnrP GLPsnrP VPXSSIM Time(us) + 399.773 48.032 47.971 48.013 47.946 94.362 36203 + + sample use: + visual_metrics.py template.html "*stt" aom aom_b aom_c > metrics.html + """ + + # The template file is the html file into which we will write the + # data from the stats file, formatted correctly for the gviz_api. + template_file = open(variables[1], "r") + page_template = template_file.read() + template_file.close() + + # This is the path match pattern for finding stats files amongst + # all the other files it could be. eg: *.stt + file_pattern = variables[2] + + # This is the directory with files that we will use to do the comparison + # against. + baseline_dir = variables[3] + snrs = '' + filestable = {} + + filestable['dsnr'] = '' + filestable['drate'] = '' + filestable['avg'] = '' + + # Dirs is directories after the baseline to compare to the base. + dirs = variables[4:len(variables)] + + # Find the metric files in the baseline directory. + dir_list = sorted(fnmatch.filter(os.listdir(baseline_dir), file_pattern)) + + metrics = GetMetrics(baseline_dir + "/" + dir_list[0]) + + metrics_js = 'metrics = ["' + '", "'.join(metrics) + '"];' + + for column in range(1, len(metrics)): + + for metric in ['avg','dsnr','drate']: + description = {"file": ("string", "File")} + + # Go through each directory and add a column header to our description. + countoverall = {} + sumoverall = {} + + for directory in dirs: + description[directory] = ("number", directory) + countoverall[directory] = 0 + sumoverall[directory] = 0 + + # Data holds the data for the visualization, name given comes from + # gviz_api sample code. + data = [] + for filename in dir_list: + row = {'file': splitext(basename(filename))[0] } + baseline_file_name = baseline_dir + "/" + filename + + # Read the metric file from each of the directories in our list. 
+ for directory in dirs: + metric_file_name = directory + "/" + filename + + # If there is a metric file in the current directory, open it + # and calculate its overall difference between it and the baseline + # directory's metric file. + if os.path.isfile(metric_file_name): + overall = FileBetter(baseline_file_name, metric_file_name, + column, metric) + row[directory] = overall + + sumoverall[directory] += overall + countoverall[directory] += 1 + + data.append(row) + + # Add the overall numbers. + row = {"file": "OVERALL" } + for directory in dirs: + row[directory] = sumoverall[directory] / countoverall[directory] + data.append(row) + + # write the tables out + data_table = gviz_api.DataTable(description) + data_table.LoadData(data) + + filestable[metric] = ( filestable[metric] + "filestable_" + metric + + "[" + str(column) + "]=" + + data_table.ToJSon(columns_order=["file"]+dirs) + "\n" ) + + filestable_avg = filestable['avg'] + filestable_dpsnr = filestable['dsnr'] + filestable_drate = filestable['drate'] + + # Now we collect all the data for all the graphs. First the column + # headers which will be Datarate and then each directory. + columns = ("datarate",baseline_dir) + description = {"datarate":("number", "Datarate")} + for directory in dirs: + description[directory] = ("number", directory) + + description[baseline_dir] = ("number", baseline_dir) + + snrs = snrs + "snrs[" + str(column) + "] = [" + + # Now collect the data for the graphs, file by file. + for filename in dir_list: + + data = [] + + # Collect the file in each directory and store all of its metrics + # in the associated gviz metrics table. + all_dirs = dirs + [baseline_dir] + for directory in all_dirs: + + metric_file_name = directory + "/" + filename + if not os.path.isfile(metric_file_name): + continue + + # Read and parse the metrics file storing it to the data we'll + # use for the gviz_api.Datatable. + metrics = ParseMetricFile(metric_file_name, column) + for bitrate, metric in metrics: + data.append({"datarate": bitrate, directory: metric}) + + data_table = gviz_api.DataTable(description) + data_table.LoadData(data) + snrs = snrs + "'" + data_table.ToJSon( + columns_order=tuple(["datarate",baseline_dir]+dirs)) + "'," + + snrs = snrs + "]\n" + + formatters = "" + for i in range(len(dirs)): + formatters = "%s formatter.format(better, %d);" % (formatters, i+1) + + print FillForm(page_template, vars()) + return + +if len(sys.argv) < 3: + print HandleFiles.__doc__ +else: + HandleFiles(sys.argv) diff --git a/libs/libaom/src/test/warp_filter_test.cc b/libs/libaom/src/test/warp_filter_test.cc new file mode 100644 index 000000000..c5e87f085 --- /dev/null +++ b/libs/libaom/src/test/warp_filter_test.cc @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/warp_filter_test_util.h" +using libaom_test::ACMRandom; +#if CONFIG_AV1_HIGHBITDEPTH +using libaom_test::AV1HighbdWarpFilter::AV1HighbdWarpFilterTest; +#endif +using libaom_test::AV1WarpFilter::AV1WarpFilterTest; +using std::make_tuple; +using std::tuple; + +namespace { + +TEST_P(AV1WarpFilterTest, CheckOutput) { + RunCheckOutput(std::get<3>(GET_PARAM(0))); +} +TEST_P(AV1WarpFilterTest, DISABLED_Speed) { + RunSpeedTest(std::get<3>(GET_PARAM(0))); +} + +INSTANTIATE_TEST_SUITE_P( + C, AV1WarpFilterTest, + libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_c)); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P( + SSE4_1, AV1WarpFilterTest, + libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sse4_1)); + +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(AV1HighbdWarpFilterTest, CheckOutput) { + RunCheckOutput(std::get<4>(GET_PARAM(0))); +} +TEST_P(AV1HighbdWarpFilterTest, DISABLED_Speed) { + RunSpeedTest(std::get<4>(GET_PARAM(0))); +} + +INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdWarpFilterTest, + libaom_test::AV1HighbdWarpFilter::BuildParams( + av1_highbd_warp_affine_sse4_1)); +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P( + AVX2, AV1WarpFilterTest, + libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_avx2)); +#endif // HAVE_AVX2 + +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AV1WarpFilterTest, + libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_neon)); +#endif // HAVE_NEON + +} // namespace diff --git a/libs/libaom/src/test/warp_filter_test_util.cc b/libs/libaom/src/test/warp_filter_test_util.cc new file mode 100644 index 000000000..bcb0c1859 --- /dev/null +++ b/libs/libaom/src/test/warp_filter_test_util.cc @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */ +#include "aom_ports/aom_timer.h" +#include "test/warp_filter_test_util.h" + +using std::make_tuple; +using std::tuple; + +namespace libaom_test { + +int32_t random_warped_param(libaom_test::ACMRandom *rnd, int bits) { + // 1 in 8 chance of generating zero (arbitrarily chosen) + if (((rnd->Rand8()) & 7) == 0) return 0; + // Otherwise, generate uniform values in the range + // [-(1 << bits), -1] U [1, 1 << bits]. + int32_t v = 1 + (rnd->Rand16() & ((1 << bits) - 1)); + if ((rnd->Rand8()) & 1) return -v; + return v; +} + +void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat, + int16_t *alpha, int16_t *beta, int16_t *gamma, + int16_t *delta, const int is_alpha_zero, + const int is_beta_zero, const int is_gamma_zero, + const int is_delta_zero) { + while (1) { + int rnd8 = rnd->Rand8() & 3; + mat[0] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6); + mat[1] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS + 6); + mat[2] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) + + (1 << WARPEDMODEL_PREC_BITS); + mat[3] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3); + + if (rnd8 <= 1) { + // AFFINE + mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3); + mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) + + (1 << WARPEDMODEL_PREC_BITS); + } else if (rnd8 == 2) { + mat[4] = -mat[3]; + mat[5] = mat[2]; + } else { + mat[4] = random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3); + mat[5] = (random_warped_param(rnd, WARPEDMODEL_PREC_BITS - 3)) + + (1 << WARPEDMODEL_PREC_BITS); + if (is_alpha_zero == 1) mat[2] = 1 << WARPEDMODEL_PREC_BITS; + if (is_beta_zero == 1) mat[3] = 0; + if (is_gamma_zero == 1) mat[4] = 0; + if (is_delta_zero == 1) + mat[5] = static_cast<int32_t>( + ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) + + (1 << WARPEDMODEL_PREC_BITS)); + } + + // Calculate the derived parameters and check that they are suitable + // for the warp filter.
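+ // (alpha, beta) describe how the horizontal filter kernel varies across + // the block and (gamma, delta) the vertical one, after normalising the + // model by mat[2]; this mirrors av1_get_shear_params() in + // av1/common/warped_motion.c. Models that fail the shear limits below + // are discarded and regenerated.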
+ assert(mat[2] != 0); + + *alpha = clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX); + *beta = clamp(mat[3], INT16_MIN, INT16_MAX); + *gamma = static_cast<int16_t>(clamp64( + (static_cast<int64_t>(mat[4]) * (1 << WARPEDMODEL_PREC_BITS)) / mat[2], + INT16_MIN, INT16_MAX)); + *delta = static_cast<int16_t>(clamp64( + mat[5] - + ((static_cast<int64_t>(mat[3]) * mat[4] + (mat[2] / 2)) / mat[2]) - + (1 << WARPEDMODEL_PREC_BITS), + INT16_MIN, INT16_MAX)); + + if ((4 * abs(*alpha) + 7 * abs(*beta) >= (1 << WARPEDMODEL_PREC_BITS)) || + (4 * abs(*gamma) + 4 * abs(*delta) >= (1 << WARPEDMODEL_PREC_BITS))) + continue; + + *alpha = ROUND_POWER_OF_TWO_SIGNED(*alpha, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + *beta = ROUND_POWER_OF_TWO_SIGNED(*beta, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + *gamma = ROUND_POWER_OF_TWO_SIGNED(*gamma, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + *delta = ROUND_POWER_OF_TWO_SIGNED(*delta, WARP_PARAM_REDUCE_BITS) * + (1 << WARP_PARAM_REDUCE_BITS); + + // We have a valid model, so finish + return; + } +} + +namespace AV1WarpFilter { +::testing::internal::ParamGenerator<WarpTestParams> BuildParams( + warp_affine_func filter) { + WarpTestParam params[] = { + make_tuple(4, 4, 50000, filter), make_tuple(8, 8, 50000, filter), + make_tuple(64, 64, 1000, filter), make_tuple(4, 16, 20000, filter), + make_tuple(32, 8, 10000, filter), + }; + return ::testing::Combine(::testing::ValuesIn(params), + ::testing::Values(0, 1), ::testing::Values(0, 1), + ::testing::Values(0, 1), ::testing::Values(0, 1)); +} + +AV1WarpFilterTest::~AV1WarpFilterTest() {} +void AV1WarpFilterTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } + +void AV1WarpFilterTest::TearDown() { libaom_test::ClearSystemState(); } + +void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) { + const int w = 128, h = 128; + const int border = 16; + const int stride = w + 2 * border; + WarpTestParam params = GET_PARAM(0); + const int out_w = std::get<0>(params), out_h = std::get<1>(params); + const int is_alpha_zero = GET_PARAM(1); + const int is_beta_zero = GET_PARAM(2); + const int is_gamma_zero = GET_PARAM(3); + const int is_delta_zero = GET_PARAM(4); + int sub_x, sub_y; + const int bd = 8; + + uint8_t *input_ = new uint8_t[h * stride]; + uint8_t *input = input_ + border; + + // The warp functions always write rows with widths that are multiples of 8. + // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
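+ // ((out_w + 7) & ~7) rounds out_w up to the next multiple of 8 by adding + // 7 and clearing the low three bits: for the 4x16 case above, + // (4 + 7) & ~7 = 8, so output_n is 8 * 16 rather than 4 * 16.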
+ int output_n = ((out_w + 7) & ~7) * out_h; + uint8_t *output = new uint8_t[output_n]; + int32_t mat[8]; + int16_t alpha, beta, gamma, delta; + ConvolveParams conv_params = get_conv_params(0, 0, bd); + CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n]; + generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta, + is_alpha_zero, is_beta_zero, is_gamma_zero, + is_delta_zero); + + for (int r = 0; r < h; ++r) + for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8(); + for (int r = 0; r < h; ++r) { + memset(input + r * stride - border, input[r * stride], border); + memset(input + r * stride + w, input[r * stride + (w - 1)], border); + } + + sub_x = 0; + sub_y = 0; + int do_average = 0; + + conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd); + conv_params.use_dist_wtd_comp_avg = 0; + + const int num_loops = 1000000000 / (out_w + out_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + for (int i = 0; i < num_loops; ++i) + test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w, + sub_x, sub_y, &conv_params, alpha, beta, gamma, delta); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("warp %3dx%-3d: %7.2f ns\n", out_w, out_h, + 1000.0 * elapsed_time / num_loops); + + delete[] input_; + delete[] output; + delete[] dsta; +} + +void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) { + const int w = 128, h = 128; + const int border = 16; + const int stride = w + 2 * border; + WarpTestParam params = GET_PARAM(0); + const int is_alpha_zero = GET_PARAM(1); + const int is_beta_zero = GET_PARAM(2); + const int is_gamma_zero = GET_PARAM(3); + const int is_delta_zero = GET_PARAM(4); + const int out_w = std::get<0>(params), out_h = std::get<1>(params); + const int num_iters = std::get<2>(params); + int i, j, sub_x, sub_y; + const int bd = 8; + + // The warp functions always write rows with widths that are multiples of 8. + // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8.
+ int output_n = ((out_w + 7) & ~7) * out_h; + uint8_t *input_ = new uint8_t[h * stride]; + uint8_t *input = input_ + border; + uint8_t *output = new uint8_t[output_n]; + uint8_t *output2 = new uint8_t[output_n]; + int32_t mat[8]; + int16_t alpha, beta, gamma, delta; + ConvolveParams conv_params = get_conv_params(0, 0, bd); + CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n]; + CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n]; + for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand8(); + + for (i = 0; i < num_iters; ++i) { + // Generate an input block and extend its borders horizontally + for (int r = 0; r < h; ++r) + for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand8(); + for (int r = 0; r < h; ++r) { + memset(input + r * stride - border, input[r * stride], border); + memset(input + r * stride + w, input[r * stride + (w - 1)], border); + } + const int use_no_round = rnd_.Rand8() & 1; + for (sub_x = 0; sub_x < 2; ++sub_x) + for (sub_y = 0; sub_y < 2; ++sub_y) { + generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta, + is_alpha_zero, is_beta_zero, is_gamma_zero, + is_delta_zero); + + for (int ii = 0; ii < 2; ++ii) { + for (int jj = 0; jj < 5; ++jj) { + for (int do_average = 0; do_average <= 1; ++do_average) { + if (use_no_round) { + conv_params = + get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd); + } else { + conv_params = get_conv_params(0, 0, bd); + } + if (jj >= 4) { + conv_params.use_dist_wtd_comp_avg = 0; + } else { + conv_params.use_dist_wtd_comp_avg = 1; + conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; + conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + } + av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w, + out_h, out_w, sub_x, sub_y, &conv_params, alpha, + beta, gamma, delta); + if (use_no_round) { + conv_params = + get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd); + } + if (jj >= 4) { + conv_params.use_dist_wtd_comp_avg = 0; + } else { + conv_params.use_dist_wtd_comp_avg = 1; + conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; + conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + } + test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h, + out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma, + delta); + if (use_no_round) { + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(dsta[j], dstb[j]) + << "Pixel mismatch at index " << j << " = (" + << (j % out_w) << ", " << (j / out_w) << ") on iteration " + << i; + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(output[j], output2[j]) + << "Pixel mismatch at index " << j << " = (" + << (j % out_w) << ", " << (j / out_w) << ") on iteration " + << i; + } else { + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(output[j], output2[j]) + << "Pixel mismatch at index " << j << " = (" + << (j % out_w) << ", " << (j / out_w) << ") on iteration " + << i; + } + } + } + } + } + } + delete[] input_; + delete[] output; + delete[] output2; + delete[] dsta; + delete[] dstb; +} +} // namespace AV1WarpFilter + +#if CONFIG_AV1_HIGHBITDEPTH +namespace AV1HighbdWarpFilter { +::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams( + highbd_warp_affine_func filter) { + const HighbdWarpTestParam params[] = { + make_tuple(4, 4, 100, 8, filter), make_tuple(8, 8, 100, 8, filter), + make_tuple(64, 64, 100, 8, filter), make_tuple(4, 16, 100, 8, filter), + make_tuple(32, 8, 100, 8, filter), make_tuple(4, 4, 100, 10, filter), + make_tuple(8, 8, 100, 10, filter), make_tuple(64, 64, 100, 10, filter), + make_tuple(4, 16, 100, 10, filter),
make_tuple(32, 8, 100, 10, filter), + make_tuple(4, 4, 100, 12, filter), make_tuple(8, 8, 100, 12, filter), + make_tuple(64, 64, 100, 12, filter), make_tuple(4, 16, 100, 12, filter), + make_tuple(32, 8, 100, 12, filter), + }; + return ::testing::Combine(::testing::ValuesIn(params), + ::testing::Values(0, 1), ::testing::Values(0, 1), + ::testing::Values(0, 1), ::testing::Values(0, 1)); +} + +AV1HighbdWarpFilterTest::~AV1HighbdWarpFilterTest() {} +void AV1HighbdWarpFilterTest::SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); +} + +void AV1HighbdWarpFilterTest::TearDown() { libaom_test::ClearSystemState(); } + +void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) { + const int w = 128, h = 128; + const int border = 16; + const int stride = w + 2 * border; + HighbdWarpTestParam param = GET_PARAM(0); + const int is_alpha_zero = GET_PARAM(1); + const int is_beta_zero = GET_PARAM(2); + const int is_gamma_zero = GET_PARAM(3); + const int is_delta_zero = GET_PARAM(4); + const int out_w = std::get<0>(param), out_h = std::get<1>(param); + const int bd = std::get<3>(param); + const int mask = (1 << bd) - 1; + int sub_x, sub_y; + + // The warp functions always write rows with widths that are multiples of 8. + // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8. + int output_n = ((out_w + 7) & ~7) * out_h; + uint16_t *input_ = new uint16_t[h * stride]; + uint16_t *input = input_ + border; + uint16_t *output = new uint16_t[output_n]; + int32_t mat[8]; + int16_t alpha, beta, gamma, delta; + ConvolveParams conv_params = get_conv_params(0, 0, bd); + CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n]; + + generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta, + is_alpha_zero, is_beta_zero, is_gamma_zero, + is_delta_zero); + // Generate an input block and extend its borders horizontally + for (int r = 0; r < h; ++r) + for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask; + for (int r = 0; r < h; ++r) { + for (int c = 0; c < border; ++c) { + input[r * stride - border + c] = input[r * stride]; + input[r * stride + w + c] = input[r * stride + (w - 1)]; + } + } + + sub_x = 0; + sub_y = 0; + int do_average = 0; + conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd); + conv_params.use_dist_wtd_comp_avg = 0; + + const int num_loops = 1000000000 / (out_w + out_h); + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + for (int i = 0; i < num_loops; ++i) + test_impl(mat, input, w, h, stride, output, 32, 32, out_w, out_h, out_w, + sub_x, sub_y, bd, &conv_params, alpha, beta, gamma, delta); + + aom_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("highbd warp %3dx%-3d: %7.2f ns\n", out_w, out_h, + 1000.0 * elapsed_time / num_loops); + + delete[] input_; + delete[] output; + delete[] dsta; +} + +void AV1HighbdWarpFilterTest::RunCheckOutput( + highbd_warp_affine_func test_impl) { + const int w = 128, h = 128; + const int border = 16; + const int stride = w + 2 * border; + HighbdWarpTestParam param = GET_PARAM(0); + const int is_alpha_zero = GET_PARAM(1); + const int is_beta_zero = GET_PARAM(2); + const int is_gamma_zero = GET_PARAM(3); + const int is_delta_zero = GET_PARAM(4); + const int out_w = std::get<0>(param), out_h = std::get<1>(param); + const int bd = std::get<3>(param); + const int num_iters = std::get<2>(param); + const int mask = (1 << bd) - 1; + int i, j, sub_x, sub_y; + + // The warp functions always write rows with widths that are multiples
of 8. + // So to avoid a buffer overflow, we may need to pad rows to a multiple of 8. + int output_n = ((out_w + 7) & ~7) * out_h; + uint16_t *input_ = new uint16_t[h * stride]; + uint16_t *input = input_ + border; + uint16_t *output = new uint16_t[output_n]; + uint16_t *output2 = new uint16_t[output_n]; + int32_t mat[8]; + int16_t alpha, beta, gamma, delta; + ConvolveParams conv_params = get_conv_params(0, 0, bd); + CONV_BUF_TYPE *dsta = new CONV_BUF_TYPE[output_n]; + CONV_BUF_TYPE *dstb = new CONV_BUF_TYPE[output_n]; + for (int i = 0; i < output_n; ++i) output[i] = output2[i] = rnd_.Rand16(); + + for (i = 0; i < num_iters; ++i) { + // Generate an input block and extend its borders horizontally + for (int r = 0; r < h; ++r) + for (int c = 0; c < w; ++c) input[r * stride + c] = rnd_.Rand16() & mask; + for (int r = 0; r < h; ++r) { + for (int c = 0; c < border; ++c) { + input[r * stride - border + c] = input[r * stride]; + input[r * stride + w + c] = input[r * stride + (w - 1)]; + } + } + const int use_no_round = rnd_.Rand8() & 1; + for (sub_x = 0; sub_x < 2; ++sub_x) + for (sub_y = 0; sub_y < 2; ++sub_y) { + generate_warped_model(&rnd_, mat, &alpha, &beta, &gamma, &delta, + is_alpha_zero, is_beta_zero, is_gamma_zero, + is_delta_zero); + for (int ii = 0; ii < 2; ++ii) { + for (int jj = 0; jj < 5; ++jj) { + for (int do_average = 0; do_average <= 1; ++do_average) { + if (use_no_round) { + conv_params = + get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd); + } else { + conv_params = get_conv_params(0, 0, bd); + } + if (jj >= 4) { + conv_params.use_dist_wtd_comp_avg = 0; + } else { + conv_params.use_dist_wtd_comp_avg = 1; + conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; + conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + } + + av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32, + out_w, out_h, out_w, sub_x, sub_y, bd, + &conv_params, alpha, beta, gamma, delta); + if (use_no_round) { + // TODO(angiebird): Change this to test_impl once we have SIMD + // implementation + conv_params = + get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd); + } + if (jj >= 4) { + conv_params.use_dist_wtd_comp_avg = 0; + } else { + conv_params.use_dist_wtd_comp_avg = 1; + conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; + conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + } + test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h, + out_w, sub_x, sub_y, bd, &conv_params, alpha, beta, + gamma, delta); + + if (use_no_round) { + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(dsta[j], dstb[j]) + << "Pixel mismatch at index " << j << " = (" + << (j % out_w) << ", " << (j / out_w) << ") on iteration " + << i; + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(output[j], output2[j]) + << "Pixel mismatch at index " << j << " = (" + << (j % out_w) << ", " << (j / out_w) << ") on iteration " + << i; + } else { + for (j = 0; j < out_w * out_h; ++j) + ASSERT_EQ(output[j], output2[j]) + << "Pixel mismatch at index " << j << " = (" + << (j % out_w) << ", " << (j / out_w) << ") on iteration " + << i; + } + } + } + } + } + } + + delete[] input_; + delete[] output; + delete[] output2; + delete[] dsta; + delete[] dstb; +} +} // namespace AV1HighbdWarpFilter +#endif // CONFIG_AV1_HIGHBITDEPTH +} // namespace libaom_test diff --git a/libs/libaom/src/test/warp_filter_test_util.h b/libs/libaom/src/test/warp_filter_test_util.h new file mode 100644 index 000000000..66a6e244b --- /dev/null +++ b/libs/libaom/src/test/warp_filter_test_util.h @@ -0,0 
+1,107 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TEST_WARP_FILTER_TEST_UTIL_H_
+#define AOM_TEST_WARP_FILTER_TEST_UTIL_H_
+
+#include <tuple>
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+
+#include "av1/common/mv.h"
+#include "av1/common/common_data.h"
+
+namespace libaom_test {
+
+void generate_warped_model(libaom_test::ACMRandom *rnd, int32_t *mat,
+                           int16_t *alpha, int16_t *beta, int16_t *gamma,
+                           int16_t *delta, int is_alpha_zero, int is_beta_zero,
+                           int is_gamma_zero, int is_delta_zero);
+
+namespace AV1WarpFilter {
+
+typedef void (*warp_affine_func)(const int32_t *mat, const uint8_t *ref,
+                                 int width, int height, int stride,
+                                 uint8_t *pred, int p_col, int p_row,
+                                 int p_width, int p_height, int p_stride,
+                                 int subsampling_x, int subsampling_y,
+                                 ConvolveParams *conv_params, int16_t alpha,
+                                 int16_t beta, int16_t gamma, int16_t delta);
+
+typedef std::tuple<int, int, int, int, warp_affine_func> WarpTestParam;
+typedef std::tuple<WarpTestParam, int, int, int, int> WarpTestParams;
+
+::testing::internal::ParamGenerator<WarpTestParams> BuildParams(
+    warp_affine_func filter);
+
+class AV1WarpFilterTest : public ::testing::TestWithParam<WarpTestParams> {
+ public:
+  virtual ~AV1WarpFilterTest();
+  virtual void SetUp();
+
+  virtual void TearDown();
+
+ protected:
+  void RunCheckOutput(warp_affine_func test_impl);
+  void RunSpeedTest(warp_affine_func test_impl);
+
+  libaom_test::ACMRandom rnd_;
+};
+
+}  // namespace AV1WarpFilter
+
+#if CONFIG_AV1_HIGHBITDEPTH
+namespace AV1HighbdWarpFilter {
+typedef void (*highbd_warp_affine_func)(const int32_t *mat, const uint16_t *ref,
+                                        int width, int height, int stride,
+                                        uint16_t *pred, int p_col, int p_row,
+                                        int p_width, int p_height, int p_stride,
+                                        int subsampling_x, int subsampling_y,
+                                        int bd, ConvolveParams *conv_params,
+                                        int16_t alpha, int16_t beta,
+                                        int16_t gamma, int16_t delta);
+
+typedef std::tuple<int, int, int, int, highbd_warp_affine_func>
+    HighbdWarpTestParam;
+typedef std::tuple<HighbdWarpTestParam, int, int, int, int>
+    HighbdWarpTestParams;
+
+::testing::internal::ParamGenerator<HighbdWarpTestParams> BuildParams(
+    highbd_warp_affine_func filter);
+
+class AV1HighbdWarpFilterTest
+    : public ::testing::TestWithParam<HighbdWarpTestParams> {
+ public:
+  virtual ~AV1HighbdWarpFilterTest();
+  virtual void SetUp();
+
+  virtual void TearDown();
+
+ protected:
+  void RunCheckOutput(highbd_warp_affine_func test_impl);
+  void RunSpeedTest(highbd_warp_affine_func test_impl);
+
+  libaom_test::ACMRandom rnd_;
+};
+
+}  // namespace AV1HighbdWarpFilter
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_WARP_FILTER_TEST_UTIL_H_
diff --git a/libs/libaom/src/test/webm_video_source.h b/libs/libaom/src/test/webm_video_source.h
new file mode 100644
index 000000000..bb3d11735
--- /dev/null
+++ b/libs/libaom/src/test/webm_video_source.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_WEBM_VIDEO_SOURCE_H_
+#define AOM_TEST_WEBM_VIDEO_SOURCE_H_
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include "common/tools_common.h"
+#include "common/webmdec.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to allow parsing of WebM files,
+// so that we can do actual file decodes.
+class WebMVideoSource : public CompressedVideoSource {
+ public:
+  explicit WebMVideoSource(const std::string &file_name)
+      : file_name_(file_name), aom_ctx_(new AvxInputContext()),
+        webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_sz_(0),
+        frame_number_(0), end_of_file_(false) {}
+
+  virtual ~WebMVideoSource() {
+    if (aom_ctx_->file != NULL) fclose(aom_ctx_->file);
+    webm_free(webm_ctx_);
+    delete aom_ctx_;
+    delete webm_ctx_;
+  }
+
+  virtual void Init() {}
+
+  virtual void Begin() {
+    aom_ctx_->file = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(aom_ctx_->file != NULL)
+        << "Input file open failed. Filename: " << file_name_;
+
+    ASSERT_EQ(file_is_webm(webm_ctx_, aom_ctx_), 1) << "file is not WebM";
+
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_number_;
+    FillFrame();
+  }
+
+  void FillFrame() {
+    ASSERT_TRUE(aom_ctx_->file != NULL);
+    const int status = webm_read_frame(webm_ctx_, &buf_, &frame_sz_, &buf_sz_);
+    ASSERT_GE(status, 0) << "webm_read_frame failed";
+    if (status == 1) {
+      end_of_file_ = true;
+    }
+  }
+
+  void SeekToNextKeyFrame() {
+    ASSERT_TRUE(aom_ctx_->file != NULL);
+    do {
+      const int status =
+          webm_read_frame(webm_ctx_, &buf_, &frame_sz_, &buf_sz_);
+      ASSERT_GE(status, 0) << "webm_read_frame failed";
+      ++frame_number_;
+      if (status == 1) {
+        end_of_file_ = true;
+      }
+    } while (!webm_ctx_->is_key_frame && !end_of_file_);
+  }
+
+  virtual const uint8_t *cxdata() const { return end_of_file_ ? NULL : buf_; }
+  virtual size_t frame_size() const { return frame_sz_; }
+  virtual unsigned int frame_number() const { return frame_number_; }
+
+ protected:
+  std::string file_name_;
+  AvxInputContext *aom_ctx_;
+  WebmInputContext *webm_ctx_;
+  uint8_t *buf_;  // Owned by webm_ctx_ and freed when webm_ctx_ is freed.
+  size_t buf_sz_;
+  size_t frame_sz_;
+  unsigned int frame_number_;
+  bool end_of_file_;
+};
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_WEBM_VIDEO_SOURCE_H_
diff --git a/libs/libaom/src/test/wiener_test.cc b/libs/libaom/src/test/wiener_test.cc
new file mode 100644
index 000000000..81839fd56
--- /dev/null
+++ b/libs/libaom/src/test/wiener_test.cc
@@ -0,0 +1,587 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/register_state_check.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/pickrst.h"
+
+#define MAX_WIENER_BLOCK 384
+#define MAX_DATA_BLOCK (MAX_WIENER_BLOCK + WIENER_WIN)
+
+// 8-bit-depth tests
+namespace wiener_lowbd {
+
+static void compute_stats_win_opt_c(int wiener_win, const uint8_t *dgd,
+                                    const uint8_t *src, int h_start, int h_end,
+                                    int v_start, int v_end, int dgd_stride,
+                                    int src_stride, int64_t *M, int64_t *H) {
+  ASSERT_TRUE(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+  int i, j, k, l, m, n;
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  uint8_t avg = find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  std::vector<std::vector<int64_t> > M_int(wiener_win,
+                                           std::vector<int64_t>(wiener_win, 0));
+  std::vector<std::vector<int64_t> > H_int(
+      wiener_win * wiener_win, std::vector<int64_t>(wiener_win * 8, 0));
+  std::vector<std::vector<int32_t> > sumY(wiener_win,
+                                          std::vector<int32_t>(wiener_win, 0));
+  int32_t sumX = 0;
+  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+  for (i = v_start; i < v_end; i++) {
+    for (j = h_start; j < h_end; j += 2) {
+      const uint8_t X1 = src[i * src_stride + j];
+      const uint8_t X2 = src[i * src_stride + j + 1];
+      sumX += X1 + X2;
+
+      const uint8_t *dgd_ij = dgd_win + i * dgd_stride + j;
+      for (k = 0; k < wiener_win; k++) {
+        for (l = 0; l < wiener_win; l++) {
+          const uint8_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+          int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
+          const uint8_t D1 = dgd_ijkl[0];
+          const uint8_t D2 = dgd_ijkl[1];
+          sumY[k][l] += D1 + D2;
+          M_int[l][k] += D1 * X1 + D2 * X2;
+          for (m = 0; m < wiener_win; m++) {
+            for (n = 0; n < wiener_win; n++) {
+              H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m] +
+                                       D2 * dgd_ij[n + dgd_stride * m + 1];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+  for (k = 0; k < wiener_win; k++) {
+    for (l = 0; l < wiener_win; l++) {
+      M[l * wiener_win + k] =
+          M_int[l][k] + avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]);
+      for (m = 0; m < wiener_win; m++) {
+        for (n = 0; n < wiener_win; n++) {
+          H[(l * wiener_win + k) * wiener_win2 + m * wiener_win + n] =
+              H_int[(l * wiener_win + k)][n * 8 + m] + avg_square_sum -
+              (int64_t)avg * (sumY[k][l] + sumY[n][m]);
+        }
+      }
+    }
+  }
+}
+
+void compute_stats_opt_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+                         int h_start, int h_end, int v_start, int v_end,
+                         int dgd_stride, int src_stride, int64_t *M,
+                         int64_t *H) {
+  if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) {
+    compute_stats_win_opt_c(wiener_win, dgd, src, h_start, h_end, v_start,
+                            v_end, dgd_stride, src_stride, M, H);
+  } else {
+    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                        dgd_stride, src_stride, M, H);
+  }
+}
+
+static const int kIterations = 100;
+typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd,
+                                   const uint8_t *src, int h_start, int h_end,
+                                   int v_start, int v_end, int dgd_stride,
+                                   int src_stride, int64_t *M, int64_t *H);
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+typedef std::tuple<const compute_stats_Func> WienerTestParam;
+
+class WienerTest : public ::testing::TestWithParam<WienerTestParam> {
+ public:
+  virtual void SetUp() {
+    src_buf = (uint8_t *)aom_memalign(
+        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
+    dgd_buf = (uint8_t *)aom_memalign(
+        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf));
+    target_func_ = GET_PARAM(0);
+  }
+  virtual void TearDown() {
+    aom_free(src_buf);
+    aom_free(dgd_buf);
+  }
+  void RunWienerTest(const int32_t wiener_win, int32_t run_times);
+  void RunWienerTest_ExtremeValues(const int32_t wiener_win);
+
+ private:
+  compute_stats_Func target_func_;
+  libaom_test::ACMRandom rng_;
+  uint8_t *src_buf;
+  uint8_t *dgd_buf;
+};
+
+void WienerTest::RunWienerTest(const int32_t wiener_win, int32_t run_times) {
+  const int32_t wiener_halfwin = wiener_win >> 1;
+  const int32_t wiener_win2 = wiener_win * wiener_win;
+  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
+  const int h_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+  int h_end =
+      run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+  const int v_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+  int v_end =
+      run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+  const int dgd_stride = h_end;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int iters = run_times == 1 ? kIterations : 2;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_buf[i] = rng_.Rand8();
+      src_buf[i] = rng_.Rand8();
+    }
+    uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
+    uint8_t *src = src_buf;
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                          dgd_stride, src_stride, M_ref, H_ref);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                   dgd_stride, src_stride, M_test, H_test);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      printf("win %d %3dx%-3d:%7.2f/%7.2fns", wiener_win, h_end, v_end, time1,
+             time2);
+      printf("(%3.2f)\n", time1 / time2);
+    }
+    int failed = 0;
+    for (int i = 0; i < wiener_win2; ++i) {
+      if (M_ref[i] != M_test[i]) {
+        failed = 1;
+        printf("win %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+               wiener_win, iter, i, M_ref[i], M_test[i]);
+        break;
+      }
+    }
+    for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+      if (H_ref[i] != H_test[i]) {
+        failed = 1;
+        printf("win %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+               wiener_win, iter, i, H_ref[i], H_test[i]);
+        break;
+      }
+    }
+    ASSERT_EQ(failed, 0);
+  }
+}
+
+void WienerTest::RunWienerTest_ExtremeValues(const int32_t wiener_win) {
+  const int32_t wiener_halfwin = wiener_win >> 1;
+  const int32_t wiener_win2 = wiener_win * wiener_win;
+  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
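+  // Note: the M_* buffers hold the WIENER_WIN2 cross-correlation terms and
+  // the H_* buffers the WIENER_WIN2 x WIENER_WIN2 autocorrelation matrix
+  // from which the encoder derives the Wiener filter taps; the reference
+  // implementation (av1_compute_stats_c) and the function under test fill
+  // separate buffers that are compared element-wise below.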
+  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
+  const int h_start = 16;
+  const int h_end = MAX_WIENER_BLOCK;
+  const int v_start = 16;
+  const int v_end = MAX_WIENER_BLOCK;
+  const int dgd_stride = h_end;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int iters = 1;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_buf[i] = 255;
+      src_buf[i] = 255;
+    }
+    uint8_t *dgd = dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin;
+    uint8_t *src = src_buf;
+
+    av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                        dgd_stride, src_stride, M_ref, H_ref);
+
+    target_func_(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                 dgd_stride, src_stride, M_test, H_test);
+
+    int failed = 0;
+    for (int i = 0; i < wiener_win2; ++i) {
+      if (M_ref[i] != M_test[i]) {
+        failed = 1;
+        printf("win %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+               wiener_win, iter, i, M_ref[i], M_test[i]);
+        break;
+      }
+    }
+    for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+      if (H_ref[i] != H_test[i]) {
+        failed = 1;
+        printf("win %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 " \n",
+               wiener_win, iter, i, H_ref[i], H_test[i]);
+        break;
+      }
+    }
+    ASSERT_EQ(failed, 0);
+  }
+}
+
+TEST_P(WienerTest, RandomValues) {
+  RunWienerTest(WIENER_WIN, 1);
+  RunWienerTest(WIENER_WIN_CHROMA, 1);
+}
+
+TEST_P(WienerTest, ExtremeValues) {
+  RunWienerTest_ExtremeValues(WIENER_WIN);
+  RunWienerTest_ExtremeValues(WIENER_WIN_CHROMA);
+}
+
+TEST_P(WienerTest, DISABLED_Speed) {
+  RunWienerTest(WIENER_WIN, 200);
+  RunWienerTest(WIENER_WIN_CHROMA, 200);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, WienerTest, ::testing::Values(compute_stats_opt_c));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_SUITE_P(SSE4_1, WienerTest,
+                         ::testing::Values(av1_compute_stats_sse4_1));
+#endif  // HAVE_SSE4_1
+
+#if HAVE_AVX2
+
+INSTANTIATE_TEST_SUITE_P(AVX2, WienerTest,
+                         ::testing::Values(av1_compute_stats_avx2));
+#endif  // HAVE_AVX2
+
+}  // namespace wiener_lowbd
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// High bit-depth tests:
+namespace wiener_highbd {
+
+static void compute_stats_highbd_win_opt_c(int wiener_win, const uint8_t *dgd8,
+                                           const uint8_t *src8, int h_start,
+                                           int h_end, int v_start, int v_end,
+                                           int dgd_stride, int src_stride,
+                                           int64_t *M, int64_t *H,
+                                           aom_bit_depth_t bit_depth) {
+  ASSERT_TRUE(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
+  int i, j, k, l, m, n;
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+  const uint16_t avg =
+      find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  std::vector<std::vector<int64_t> > M_int(wiener_win,
+                                           std::vector<int64_t>(wiener_win, 0));
+  std::vector<std::vector<int64_t> > H_int(
+      wiener_win * wiener_win, std::vector<int64_t>(wiener_win * 8, 0));
+  std::vector<std::vector<int32_t> > sumY(wiener_win,
+                                          std::vector<int32_t>(wiener_win, 0));
+
+  memset(M, 0, sizeof(*M) * wiener_win2);
+  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+
+  int64_t sumX = 0;
+  const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+
+  for (i = v_start; i < v_end; i++) {
+    for (j = h_start; j < h_end; j += 2) {
+      const uint16_t X1 = src[i * src_stride + j];
+      const uint16_t X2 = src[i * src_stride + j + 1];
+      sumX += X1 + X2;
+
+      const uint16_t *dgd_ij = dgd_win + i * dgd_stride + j;
+      for (k = 0; k < wiener_win; k++) {
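+        // For each (k, l) tap position in the Wiener window, accumulate the
+        // pixel sums (sumY), the cross-correlation terms (M_int) and the
+        // autocorrelation terms (H_int); pixels are consumed two columns at
+        // a time (X1/X2 and D1/D2).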
+        for (l = 0; l < wiener_win; l++) {
+          const uint16_t *dgd_ijkl = dgd_ij + k * dgd_stride + l;
+          int64_t *H_int_temp = &H_int[(l * wiener_win + k)][0];
+          const uint16_t D1 = dgd_ijkl[0];
+          const uint16_t D2 = dgd_ijkl[1];
+          sumY[k][l] += D1 + D2;
+          M_int[l][k] += D1 * X1 + D2 * X2;
+          for (m = 0; m < wiener_win; m++) {
+            for (n = 0; n < wiener_win; n++) {
+              H_int_temp[m * 8 + n] += D1 * dgd_ij[n + dgd_stride * m] +
+                                       D2 * dgd_ij[n + dgd_stride * m + 1];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  uint8_t bit_depth_divider = 1;
+  if (bit_depth == AOM_BITS_12)
+    bit_depth_divider = 16;
+  else if (bit_depth == AOM_BITS_10)
+    bit_depth_divider = 4;
+
+  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
+  for (k = 0; k < wiener_win; k++) {
+    for (l = 0; l < wiener_win; l++) {
+      M[l * wiener_win + k] =
+          (M_int[l][k] +
+           (avg_square_sum - (int64_t)avg * (sumX + sumY[k][l]))) /
+          bit_depth_divider;
+      for (m = 0; m < wiener_win; m++) {
+        for (n = 0; n < wiener_win; n++) {
+          H[(l * wiener_win + k) * wiener_win2 + m * wiener_win + n] =
+              (H_int[(l * wiener_win + k)][n * 8 + m] +
+               (avg_square_sum - (int64_t)avg * (sumY[k][l] + sumY[n][m]))) /
+              bit_depth_divider;
+        }
+      }
+    }
+  }
+}
+
+void compute_stats_highbd_opt_c(int wiener_win, const uint8_t *dgd,
+                                const uint8_t *src, int h_start, int h_end,
+                                int v_start, int v_end, int dgd_stride,
+                                int src_stride, int64_t *M, int64_t *H,
+                                aom_bit_depth_t bit_depth) {
+  if (wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA) {
+    compute_stats_highbd_win_opt_c(wiener_win, dgd, src, h_start, h_end,
+                                   v_start, v_end, dgd_stride, src_stride, M, H,
+                                   bit_depth);
+  } else {
+    av1_compute_stats_highbd_c(wiener_win, dgd, src, h_start, h_end, v_start,
+                               v_end, dgd_stride, src_stride, M, H, bit_depth);
+  }
+}
+
+static const int kIterations = 100;
+typedef void (*compute_stats_Func)(int wiener_win, const uint8_t *dgd,
+                                   const uint8_t *src, int h_start, int h_end,
+                                   int v_start, int v_end, int dgd_stride,
+                                   int src_stride, int64_t *M, int64_t *H,
+                                   aom_bit_depth_t bit_depth);
+
+typedef std::tuple<const compute_stats_Func> WienerTestParam;
+
+class WienerTestHighbd : public ::testing::TestWithParam<WienerTestParam> {
+ public:
+  virtual void SetUp() {
+    src_buf = (uint16_t *)aom_memalign(
+        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*src_buf));
+    dgd_buf = (uint16_t *)aom_memalign(
+        32, MAX_DATA_BLOCK * MAX_DATA_BLOCK * sizeof(*dgd_buf));
+    target_func_ = GET_PARAM(0);
+  }
+  virtual void TearDown() {
+    aom_free(src_buf);
+    aom_free(dgd_buf);
+  }
+  void RunWienerTest(const int32_t wiener_win, int32_t run_times,
+                     aom_bit_depth_t bit_depth);
+  void RunWienerTest_ExtremeValues(const int32_t wiener_win,
+                                   aom_bit_depth_t bit_depth);
+
+ private:
+  compute_stats_Func target_func_;
+  libaom_test::ACMRandom rng_;
+  uint16_t *src_buf;
+  uint16_t *dgd_buf;
+};
+
+void WienerTestHighbd::RunWienerTest(const int32_t wiener_win,
+                                     int32_t run_times,
+                                     aom_bit_depth_t bit_depth) {
+  const int32_t wiener_halfwin = wiener_win >> 1;
+  const int32_t wiener_win2 = wiener_win * wiener_win;
+  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
+  const int h_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+  const int h_end =
+      run_times != 1 ? 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+  const int v_start = ((rng_.Rand16() % (MAX_WIENER_BLOCK / 2)) & (~7));
+  const int v_end =
+      run_times != 1 ?
 256 : ((rng_.Rand16() % MAX_WIENER_BLOCK) & (~7)) + 8;
+  const int dgd_stride = h_end;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int iters = run_times == 1 ? kIterations : 2;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_buf[i] = rng_.Rand16() % (1 << bit_depth);
+      src_buf[i] = rng_.Rand16() % (1 << bit_depth);
+    }
+    const uint8_t *dgd8 = CONVERT_TO_BYTEPTR(
+        dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin);
+    const uint8_t *src8 = CONVERT_TO_BYTEPTR(src_buf);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end,
+                                 v_start, v_end, dgd_stride, src_stride, M_ref,
+                                 H_ref, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < run_times; ++i) {
+      target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end,
+                   dgd_stride, src_stride, M_test, H_test, bit_depth);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    if (run_times > 10) {
+      printf("win %d bd %d %3dx%-3d:%7.2f/%7.2fns", wiener_win, bit_depth,
+             h_end, v_end, time1, time2);
+      printf("(%3.2f)\n", time1 / time2);
+    }
+    int failed = 0;
+    for (int i = 0; i < wiener_win2; ++i) {
+      if (M_ref[i] != M_test[i]) {
+        failed = 1;
+        printf("win %d bd %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64
+               " \n",
+               wiener_win, bit_depth, iter, i, M_ref[i], M_test[i]);
+        break;
+      }
+    }
+    for (int i = 0; i < wiener_win2 * wiener_win2; ++i) {
+      if (H_ref[i] != H_test[i]) {
+        failed = 1;
+        printf("win %d bd %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64
+               " \n",
+               wiener_win, bit_depth, iter, i, H_ref[i], H_test[i]);
+        break;
+      }
+    }
+    ASSERT_EQ(failed, 0);
+  }
+}
+
+void WienerTestHighbd::RunWienerTest_ExtremeValues(const int32_t wiener_win,
+                                                   aom_bit_depth_t bit_depth) {
+  const int32_t wiener_halfwin = wiener_win >> 1;
+  const int32_t wiener_win2 = wiener_win * wiener_win;
+  DECLARE_ALIGNED(32, int64_t, M_ref[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_ref[WIENER_WIN2 * WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, M_test[WIENER_WIN2]);
+  DECLARE_ALIGNED(32, int64_t, H_test[WIENER_WIN2 * WIENER_WIN2]);
+  const int h_start = 16;
+  const int h_end = MAX_WIENER_BLOCK;
+  const int v_start = 16;
+  const int v_end = MAX_WIENER_BLOCK;
+  const int dgd_stride = h_end;
+  const int src_stride = MAX_DATA_BLOCK;
+  const int iters = 1;
+  for (int iter = 0; iter < iters && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < MAX_DATA_BLOCK * MAX_DATA_BLOCK; ++i) {
+      dgd_buf[i] = ((uint16_t)1 << bit_depth) - 1;
+      src_buf[i] = ((uint16_t)1 << bit_depth) - 1;
+    }
+    const uint8_t *dgd8 = CONVERT_TO_BYTEPTR(
+        dgd_buf + wiener_halfwin * MAX_DATA_BLOCK + wiener_halfwin);
+    const uint8_t *src8 = CONVERT_TO_BYTEPTR(src_buf);
+
+    av1_compute_stats_highbd_c(wiener_win, dgd8, src8, h_start, h_end, v_start,
+                               v_end, dgd_stride, src_stride, M_ref, H_ref,
+                               bit_depth);
+
+    target_func_(wiener_win, dgd8, src8, h_start, h_end, v_start, v_end,
+                 dgd_stride, src_stride, M_test, H_test, bit_depth);
+
+    int failed = 0;
+    for (int i = 0; i < wiener_win2; ++i) {
+      if (M_ref[i] != M_test[i]) {
+        failed = 1;
+        printf("win %d bd %d M iter %d [%4d] ref %6" PRId64 " test %6" PRId64
+               " \n",
+               wiener_win, bit_depth, iter, i, M_ref[i], M_test[i]);
+        break;
+      }
+    }
+    for (int i = 0; i < wiener_win2
* wiener_win2; ++i) { + if (H_ref[i] != H_test[i]) { + failed = 1; + printf("win %d bd %d H iter %d [%4d] ref %6" PRId64 " test %6" PRId64 + " \n", + wiener_win, bit_depth, iter, i, H_ref[i], H_test[i]); + break; + } + } + ASSERT_EQ(failed, 0); + } +} + +TEST_P(WienerTestHighbd, RandomValues) { + RunWienerTest(WIENER_WIN, 1, AOM_BITS_8); + RunWienerTest(WIENER_WIN_CHROMA, 1, AOM_BITS_8); + RunWienerTest(WIENER_WIN, 1, AOM_BITS_10); + RunWienerTest(WIENER_WIN_CHROMA, 1, AOM_BITS_10); + RunWienerTest(WIENER_WIN, 1, AOM_BITS_12); + RunWienerTest(WIENER_WIN_CHROMA, 1, AOM_BITS_12); +} + +TEST_P(WienerTestHighbd, ExtremeValues) { + RunWienerTest_ExtremeValues(WIENER_WIN, AOM_BITS_8); + RunWienerTest_ExtremeValues(WIENER_WIN_CHROMA, AOM_BITS_8); + RunWienerTest_ExtremeValues(WIENER_WIN, AOM_BITS_10); + RunWienerTest_ExtremeValues(WIENER_WIN_CHROMA, AOM_BITS_10); + RunWienerTest_ExtremeValues(WIENER_WIN, AOM_BITS_12); + RunWienerTest_ExtremeValues(WIENER_WIN_CHROMA, AOM_BITS_12); +} + +TEST_P(WienerTestHighbd, DISABLED_Speed) { + RunWienerTest(WIENER_WIN, 200, AOM_BITS_8); + RunWienerTest(WIENER_WIN_CHROMA, 200, AOM_BITS_8); + RunWienerTest(WIENER_WIN, 200, AOM_BITS_10); + RunWienerTest(WIENER_WIN_CHROMA, 200, AOM_BITS_10); + RunWienerTest(WIENER_WIN, 200, AOM_BITS_12); + RunWienerTest(WIENER_WIN_CHROMA, 200, AOM_BITS_12); +} + +INSTANTIATE_TEST_SUITE_P(C, WienerTestHighbd, + ::testing::Values(compute_stats_highbd_opt_c)); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_SUITE_P(SSE4_1, WienerTestHighbd, + ::testing::Values(av1_compute_stats_highbd_sse4_1)); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, WienerTestHighbd, + ::testing::Values(av1_compute_stats_highbd_avx2)); +#endif // HAVE_AVX2 + +} // namespace wiener_highbd +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/libs/libaom/src/test/y4m_test.cc b/libs/libaom/src/test/y4m_test.cc new file mode 100644 index 000000000..5d795fad9 --- /dev/null +++ b/libs/libaom/src/test/y4m_test.cc @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+
+#include <string>
+
+#include "config/aom_config.h"
+
+#include "common/y4menc.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+using std::string;
+
+static const unsigned int kWidth = 160;
+static const unsigned int kHeight = 90;
+static const unsigned int kFrames = 10;
+
+struct Y4mTestParam {
+  const char *filename;
+  unsigned int bit_depth;
+  aom_img_fmt format;
+  const char *md5raw;
+};
+
+const Y4mTestParam kY4mTestVectors[] = {
+  { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420,
+    "e5406275b9fc6bb3436c31d4a05c1cab" },
+  { "park_joy_90p_8_420_monochrome.y4m", 8, AOM_IMG_FMT_I420,
+    "95ef5bf6218580588be24a5271bb6a7f" },
+  { "park_joy_90p_8_420_vertical_csp.y4m", 8, AOM_IMG_FMT_I420,
+    "f53a40fec15254ac312527339d9c686b" },
+  { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422,
+    "284a47a47133b12884ec3a14e959a0b6" },
+  { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444,
+    "90517ff33843d85de712fd4fe60dbed0" },
+  { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016,
+    "63f21f9f717d8b8631bd2288ee87137b" },
+  { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216,
+    "48ab51fb540aed07f7ff5af130c9b605" },
+  { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416,
+    "067bfd75aa85ff9bae91fa3e0edd1e3e" },
+  { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016,
+    "9e6d8f6508c6e55625f6b697bc461cef" },
+  { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216,
+    "b239c6b301c0b835485be349ca83a7e3" },
+  { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416,
+    "5a6481a550821dab6d0192f5c63845e9" },
+};
+
+static const int PLANES_YUV[] = { AOM_PLANE_Y, AOM_PLANE_U, AOM_PLANE_V };
+
+class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>,
+                           public ::libaom_test::Y4mVideoSource {
+ protected:
+  Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {}
+
+  virtual ~Y4mVideoSourceTest() { CloseSource(); }
+
+  virtual void Init(const std::string &file_name, int limit) {
+    file_name_ = file_name;
+    start_ = 0;
+    limit_ = limit;
+    frame_ = 0;
+    Begin();
+  }
+
+  // Checks y4m header information
+  void HeaderChecks(unsigned int bit_depth, aom_img_fmt_t fmt) {
+    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_EQ(y4m_.pic_w, (int)kWidth);
+    ASSERT_EQ(y4m_.pic_h, (int)kHeight);
+    ASSERT_EQ(img()->d_w, kWidth);
+    ASSERT_EQ(img()->d_h, kHeight);
+    ASSERT_EQ(y4m_.bit_depth, bit_depth);
+    ASSERT_EQ(y4m_.aom_fmt, fmt);
+    if (fmt == AOM_IMG_FMT_I420 || fmt == AOM_IMG_FMT_I42016) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3 / 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 1U);
+    }
+    if (fmt == AOM_IMG_FMT_I422 || fmt == AOM_IMG_FMT_I42216) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 2);
+      ASSERT_EQ(img()->x_chroma_shift, 1U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+    if (fmt == AOM_IMG_FMT_I444 || fmt == AOM_IMG_FMT_I44416) {
+      ASSERT_EQ(y4m_.bps, (int)y4m_.bit_depth * 3);
+      ASSERT_EQ(img()->x_chroma_shift, 0U);
+      ASSERT_EQ(img()->y_chroma_shift, 0U);
+    }
+  }
+
+  // Checks MD5 of the raw frame data
+  void Md5Check(const string &expected_md5) {
+    ASSERT_TRUE(input_file_ != NULL);
+    libaom_test::MD5 md5;
+    for (unsigned int i = start_; i < limit_; i++) {
+      md5.Add(img());
+      Next();
+    }
+    ASSERT_EQ(string(md5.Get()), expected_md5);
+  }
+};
+
+TEST_P(Y4mVideoSourceTest, SourceTest) {
+  const Y4mTestParam t = GetParam();
+  Init(t.filename, kFrames);
+  HeaderChecks(t.bit_depth, t.format);
+  Md5Check(t.md5raw);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, Y4mVideoSourceTest,
+                         ::testing::ValuesIn(kY4mTestVectors));
+
+class Y4mVideoWriteTest : public Y4mVideoSourceTest {
+ protected:
+  Y4mVideoWriteTest() : tmpfile_(NULL) {}
+
+  virtual ~Y4mVideoWriteTest() {
+    delete tmpfile_;
+    input_file_ = NULL;
+  }
+
+  void ReplaceInputFile(FILE *input_file) {
+    CloseSource();
+    frame_ = 0;
+    input_file_ = input_file;
+    rewind(input_file_);
+    ReadSourceToStart();
+  }
+
+  // Writes out a y4m file and then reads it back
+  void WriteY4mAndReadBack() {
+    ASSERT_TRUE(input_file_ != NULL);
+    char buf[Y4M_BUFFER_SIZE] = { 0 };
+    const struct AvxRational framerate = { y4m_.fps_n, y4m_.fps_d };
+    tmpfile_ = new libaom_test::TempOutFile;
+    ASSERT_TRUE(tmpfile_->file() != NULL);
+    y4m_write_file_header(buf, sizeof(buf), kWidth, kHeight, &framerate,
+                          img()->monochrome, img()->csp, y4m_.aom_fmt,
+                          y4m_.bit_depth);
+    fputs(buf, tmpfile_->file());
+    for (unsigned int i = start_; i < limit_; i++) {
+      y4m_write_frame_header(buf, sizeof(buf));
+      fputs(buf, tmpfile_->file());
+      y4m_write_image_file(img(), PLANES_YUV, tmpfile_->file());
+      Next();
+    }
+    ReplaceInputFile(tmpfile_->file());
+  }
+
+  virtual void Init(const std::string &file_name, int limit) {
+    Y4mVideoSourceTest::Init(file_name, limit);
+    WriteY4mAndReadBack();
+  }
+  libaom_test::TempOutFile *tmpfile_;
+};
+
+TEST_P(Y4mVideoWriteTest, WriteTest) {
+  const Y4mTestParam t = GetParam();
+  Init(t.filename, kFrames);
+  HeaderChecks(t.bit_depth, t.format);
+  Md5Check(t.md5raw);
+}
+
+INSTANTIATE_TEST_SUITE_P(C, Y4mVideoWriteTest,
+                         ::testing::ValuesIn(kY4mTestVectors));
+}  // namespace
diff --git a/libs/libaom/src/test/y4m_video_source.h b/libs/libaom/src/test/y4m_video_source.h
new file mode 100644
index 000000000..63f74f567
--- /dev/null
+++ b/libs/libaom/src/test/y4m_video_source.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_Y4M_VIDEO_SOURCE_H_
+#define AOM_TEST_Y4M_VIDEO_SOURCE_H_
+#include <algorithm>
+#include <memory>
+#include <string>
+
+#include "common/y4minput.h"
+#include "test/video_source.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to allow parsing of raw yv12
+// so that we can do actual file encodes.
+class Y4mVideoSource : public VideoSource {
+ public:
+  Y4mVideoSource(const std::string &file_name, unsigned int start, int limit)
+      : file_name_(file_name), input_file_(NULL), img_(new aom_image_t()),
+        start_(start), limit_(limit), frame_(0), framerate_numerator_(0),
+        framerate_denominator_(0), y4m_() {}
+
+  virtual ~Y4mVideoSource() {
+    aom_img_free(img_.get());
+    CloseSource();
+  }
+
+  virtual void OpenSource() {
+    CloseSource();
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL)
+        << "Input file open failed. Filename: " << file_name_;
+  }
+
+  virtual void ReadSourceToStart() {
+    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_FALSE(
+        y4m_input_open(&y4m_, input_file_, NULL, 0, AOM_CSP_UNKNOWN, 0));
+    framerate_numerator_ = y4m_.fps_n;
+    framerate_denominator_ = y4m_.fps_d;
+    frame_ = 0;
+    for (unsigned int i = 0; i < start_; i++) {
+      Next();
+    }
+    FillFrame();
+  }
+
+  virtual void Begin() {
+    OpenSource();
+    ReadSourceToStart();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual aom_image_t *img() const {
+    return (frame_ < limit_) ? img_.get() : NULL;
+  }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual aom_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual aom_rational_t timebase() const {
+    const aom_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  virtual void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    // Read a frame from input_file.
+    y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
+  }
+
+  // Swap buffers with another y4m source. This allows reading a new frame
+  // while keeping the old frame around. A whole Y4mVideoSource is required
+  // and not just an aom_image_t because of how the y4m reader manipulates
+  // aom_image_t internals.
+  void SwapBuffers(Y4mVideoSource *other) {
+    std::swap(other->y4m_.dst_buf, y4m_.dst_buf);
+    aom_image_t *tmp;
+    tmp = other->img_.release();
+    other->img_.reset(img_.release());
+    img_.reset(tmp);
+  }
+
+ protected:
+  void CloseSource() {
+    y4m_input_close(&y4m_);
+    y4m_ = y4m_input();
+    if (input_file_ != NULL) {
+      fclose(input_file_);
+      input_file_ = NULL;
+    }
+  }
+
+  std::string file_name_;
+  FILE *input_file_;
+  std::unique_ptr<aom_image_t> img_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+  y4m_input y4m_;
+};
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_Y4M_VIDEO_SOURCE_H_
diff --git a/libs/libaom/src/test/yuv_video_source.h b/libs/libaom/src/test/yuv_video_source.h
new file mode 100644
index 000000000..774ecc008
--- /dev/null
+++ b/libs/libaom/src/test/yuv_video_source.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_TEST_YUV_VIDEO_SOURCE_H_
+#define AOM_TEST_YUV_VIDEO_SOURCE_H_
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+
+#include "test/video_source.h"
+#include "aom/aom_image.h"
+
+namespace libaom_test {
+
+// This class extends VideoSource to allow parsing of raw YUV
+// formats of various color sampling and bit-depths so that we can
+// do actual file encodes.
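+//
+// A typical instantiation looks like this (illustrative only; the file name
+// and numbers below are hypothetical, not part of this patch):
+//   YUVVideoSource video("clip_640x480.yuv", AOM_IMG_FMT_I420, 640, 480,
+//                        /*rate_numerator=*/30, /*rate_denominator=*/1,
+//                        /*start=*/0, /*limit=*/30);
+//   video.Begin();
+//   while (video.img() != NULL) {
+//     // feed video.img() to the encoder under test
+//     video.Next();
+//   }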
+class YUVVideoSource : public VideoSource {
+ public:
+  YUVVideoSource(const std::string &file_name, aom_img_fmt format,
+                 unsigned int width, unsigned int height, int rate_numerator,
+                 int rate_denominator, unsigned int start, int limit)
+      : file_name_(file_name), input_file_(NULL), img_(NULL), start_(start),
+        limit_(limit), frame_(0), width_(0), height_(0),
+        format_(AOM_IMG_FMT_NONE), framerate_numerator_(rate_numerator),
+        framerate_denominator_(rate_denominator) {
+    // This initializes format_, raw_size_, width_, height_ and allocates img.
+    SetSize(width, height, format);
+  }
+
+  virtual ~YUVVideoSource() {
+    aom_img_free(img_);
+    if (input_file_) fclose(input_file_);
+  }
+
+  virtual void Begin() {
+    if (input_file_) fclose(input_file_);
+    input_file_ = OpenTestDataFile(file_name_);
+    ASSERT_TRUE(input_file_ != NULL)
+        << "Input file open failed. Filename: " << file_name_;
+    if (start_)
+      fseek(input_file_, static_cast<long>(raw_size_) * start_, SEEK_SET);
+
+    frame_ = start_;
+    FillFrame();
+  }
+
+  virtual void Next() {
+    ++frame_;
+    FillFrame();
+  }
+
+  virtual aom_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; }
+
+  // Models a stream where Timebase = 1/FPS, so pts == frame.
+  virtual aom_codec_pts_t pts() const { return frame_; }
+
+  virtual unsigned long duration() const { return 1; }
+
+  virtual aom_rational_t timebase() const {
+    const aom_rational_t t = { framerate_denominator_, framerate_numerator_ };
+    return t;
+  }
+
+  virtual unsigned int frame() const { return frame_; }
+
+  virtual unsigned int limit() const { return limit_; }
+
+  virtual void SetSize(unsigned int width, unsigned int height,
+                       aom_img_fmt format) {
+    if (width != width_ || height != height_ || format != format_) {
+      aom_img_free(img_);
+      img_ = aom_img_alloc(NULL, format, width, height, 1);
+      ASSERT_TRUE(img_ != NULL);
+      width_ = width;
+      height_ = height;
+      format_ = format;
+      switch (format) {
+        case AOM_IMG_FMT_I420: raw_size_ = width * height * 3 / 2; break;
+        case AOM_IMG_FMT_I422: raw_size_ = width * height * 2; break;
+        case AOM_IMG_FMT_I444: raw_size_ = width * height * 3; break;
+        case AOM_IMG_FMT_I42016: raw_size_ = width * height * 3; break;
+        case AOM_IMG_FMT_I42216: raw_size_ = width * height * 4; break;
+        case AOM_IMG_FMT_I44416: raw_size_ = width * height * 6; break;
+        default: ASSERT_TRUE(0);
+      }
+    }
+  }
+
+  virtual void FillFrame() {
+    ASSERT_TRUE(input_file_ != NULL);
+    // Read a frame from input_file.
+    if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) {
+      limit_ = frame_;
+    }
+  }
+
+ protected:
+  std::string file_name_;
+  FILE *input_file_;
+  aom_image_t *img_;
+  size_t raw_size_;
+  unsigned int start_;
+  unsigned int limit_;
+  unsigned int frame_;
+  unsigned int width_;
+  unsigned int height_;
+  aom_img_fmt format_;
+  int framerate_numerator_;
+  int framerate_denominator_;
+};
+
+}  // namespace libaom_test
+
+#endif  // AOM_TEST_YUV_VIDEO_SOURCE_H_
diff --git a/libs/libaom/src/third_party/fastfeat/LICENSE b/libs/libaom/src/third_party/fastfeat/LICENSE
new file mode 100644
index 000000000..f347008d6
--- /dev/null
+++ b/libs/libaom/src/third_party/fastfeat/LICENSE
@@ -0,0 +1,30 @@
+Copyright (c) 2006, 2008 Edward Rosten
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+
+ *Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+ + *Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + *Neither the name of the University of Cambridge nor the names of + its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/libs/libaom/src/third_party/fastfeat/README.libaom b/libs/libaom/src/third_party/fastfeat/README.libaom new file mode 100644 index 000000000..a732b0d93 --- /dev/null +++ b/libs/libaom/src/third_party/fastfeat/README.libaom @@ -0,0 +1,40 @@ +URL: https://github.com/edrosten/fast-C-src +Version: 391d5e939eb1545d24c10533d7de424db8d9c191 +License: BSD +License File: LICENSE + +Description: +Library to compute FAST features with non-maximum suppression. + +The files are valid C and C++ code, and have no special requirements for +compiling, and they do not depend on any libraries. Just compile them along with +the rest of your project. + +To use the functions, #include "fast.h" + +The corner detectors have the following prototype (where X is 9, 10, 11 or 12): + +xy* fastX_detect_nonmax(const unsigned char * data, int xsize, int ysize, int stride, int threshold, int* numcorners) + +Where xy is the following simple struct typedef: + +typedef struct +{ + int x, y; +} xy; + +The image is passed in as a block of data and dimensions, and the list of +corners is returned as an array of xy structs, and an integer (numcorners) +with the number of corners returned. The data can be deallocated with free(). +Nonmaximal suppression is performed on the corners. Note that the stride +is the number of bytes between rows. If your image has no padding, then this +is the same as xsize. + +The detection, scoring and nonmaximal suppression are available as individual +functions. 
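+
+For example, the combined entry point can be invoked as follows (an
+illustrative sketch, not part of the library sources; it assumes an 8-bit
+grayscale buffer "img" with no row padding, so stride == xsize, and a
+hypothetical threshold of 20; the aom_ prefix reflects the local
+modifications listed below):
+
+    int num_corners = 0;
+    xy* corners = aom_fast9_detect_nonmax(img, xsize, ysize, xsize, 20,
+                                          &num_corners);
+    /* corners[0..num_corners-1] now hold the detected positions. */
+    free(corners);
+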
To see how to use the individual functions, see fast.c
+
+Local Modifications:
+Add lines to turn off clang formatting for these files
+Remove Fast 10, 11 and 12
+Convert tabs to spaces
+Prefix global functions with "aom_"
diff --git a/libs/libaom/src/third_party/fastfeat/fast.c b/libs/libaom/src/third_party/fastfeat/fast.c
new file mode 100644
index 000000000..f29ac8f72
--- /dev/null
+++ b/libs/libaom/src/third_party/fastfeat/fast.c
@@ -0,0 +1,22 @@
+// clang-format off
+#include <stdlib.h>
+#include "fast.h"
+
+
+xy* aom_fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
+{
+  xy* corners;
+  int num_corners;
+  int* scores;
+  xy* nonmax;
+
+  corners = aom_fast9_detect(im, xsize, ysize, stride, b, &num_corners);
+  scores = aom_fast9_score(im, stride, corners, num_corners, b);
+  nonmax = aom_nonmax_suppression(corners, scores, num_corners, ret_num_corners);
+
+  free(corners);
+  free(scores);
+
+  return nonmax;
+}
+// clang-format on
diff --git a/libs/libaom/src/third_party/fastfeat/fast.h b/libs/libaom/src/third_party/fastfeat/fast.h
new file mode 100644
index 000000000..a65d5a5d1
--- /dev/null
+++ b/libs/libaom/src/third_party/fastfeat/fast.h
@@ -0,0 +1,20 @@
+// clang-format off
+#ifndef FAST_H
+#define FAST_H
+
+typedef struct { int x, y; } xy;
+typedef unsigned char byte;
+
+int aom_fast9_corner_score(const byte* p, const int pixel[], int bstart);
+
+xy* aom_fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
+
+int* aom_fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b);
+
+xy* aom_fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
+
+xy* aom_nonmax_suppression(const xy* corners, const int* scores, int num_corners, int* ret_num_nonmax);
+
+
+#endif
+// clang-format on
diff --git a/libs/libaom/src/third_party/fastfeat/fast_9.c b/libs/libaom/src/third_party/fastfeat/fast_9.c
new file mode 100644
index 000000000..61c654c47
--- /dev/null
+++ b/libs/libaom/src/third_party/fastfeat/fast_9.c
@@ -0,0 +1,5911 @@
+// clang-format off
+/*This is mechanically generated code*/
+#include <stdlib.h>
+
+typedef struct { int x, y; } xy;
+typedef unsigned char byte;
+
+int aom_fast9_corner_score(const byte* p, const int pixel[], int bstart)
+{
+    int bmin = bstart;
+    int bmax = 255;
+    int b = (bmax + bmin)/2;
+
+    /*Compute the score using binary search*/
+    for(;;)
+    {
+        int cb = *p + b;
+        int c_b= *p - b;
+
+
+        if( p[pixel[0]] > cb)
+         if( p[pixel[1]] > cb)
+          if( p[pixel[2]] > cb)
+           if( p[pixel[3]] > cb)
+            if( p[pixel[4]] > cb)
+             if( p[pixel[5]] > cb)
+              if( p[pixel[6]] > cb)
+               if( p[pixel[7]] > cb)
+                if( p[pixel[8]] > cb)
+                 goto is_a_corner;
+                else
+                 if( p[pixel[15]] > cb)
+                  goto is_a_corner;
+                 else
+                  goto is_not_a_corner;
+               else if( p[pixel[7]] < c_b)
+                if( p[pixel[14]] > cb)
+                 if( p[pixel[15]] > cb)
+                  goto is_a_corner;
+                 else
+                  goto is_not_a_corner;
+                else if( p[pixel[14]] < c_b)
+                 if( p[pixel[8]] < c_b)
+                  if( p[pixel[9]] < c_b)
+                   if( p[pixel[10]] < c_b)
+                    if( p[pixel[11]] < c_b)
+                     if( p[pixel[12]] < c_b)
+                      if( p[pixel[13]] < c_b)
+                       if( p[pixel[15]] < c_b)
+                        goto is_a_corner;
+                       else
+                        goto is_not_a_corner;
+                      else
+                       goto is_not_a_corner;
+                     else
+                      goto is_not_a_corner;
+                    else
+                     goto is_not_a_corner;
+                   else
+                    goto is_not_a_corner;
+                  else
+                   goto is_not_a_corner;
+                 else
+                  goto is_not_a_corner;
+                else
+                 goto is_not_a_corner;
+               else
+                if( p[pixel[14]] > cb)
+                 if( p[pixel[15]] > cb)
+                  goto is_a_corner;
+                 else
+                  goto is_not_a_corner;
+                else
+                 goto is_not_a_corner;
+              else if(
p[pixel[6]] < c_b) + if( p[pixel[15]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[13]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[13]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[5]] < c_b) + if( p[pixel[14]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[12]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[14]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[6]] < c_b) + goto is_a_corner; + else + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < 
c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[12]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[6]] < c_b) + goto is_a_corner; + else + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[4]] < c_b) + if( p[pixel[13]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[11]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[13]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + goto is_a_corner; + else + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( 
p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[11]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + goto is_a_corner; + else + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[3]] < c_b) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[10]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + goto is_a_corner; + else + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < 
c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[10]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + goto is_a_corner; + else + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[2]] < c_b) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] > cb) + if( p[pixel[4]] > cb) 
+ if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[9]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[3]] < c_b) + goto is_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[9]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[3]] < c_b) + goto is_a_corner; + else + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto 
is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[1]] < c_b) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[2]] > cb) + if( p[pixel[3]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[8]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[3]] < c_b) + if( p[pixel[2]] < c_b) + goto is_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto 
is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[2]] > cb) + if( p[pixel[3]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[8]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[3]] < c_b) + if( p[pixel[2]] < c_b) + goto is_a_corner; + else + if( p[pixel[11]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[0]] < c_b) + if( p[pixel[1]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[3]] > cb) + if( p[pixel[2]] > cb) + goto is_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto 
is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[2]] < c_b) + if( p[pixel[3]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[1]] < c_b) + if( p[pixel[2]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[3]] > cb) + goto is_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( 
p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[2]] < c_b) + if( p[pixel[3]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + goto is_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + 
if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[3]] < c_b) + if( p[pixel[4]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + goto is_a_corner; + else + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[13]] < c_b) + if( p[pixel[11]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[4]] < c_b) + if( p[pixel[5]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[6]] > cb) + goto is_a_corner; + else + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[14]] < c_b) + if( p[pixel[12]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( 
p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[6]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[5]] < c_b) + if( p[pixel[6]] > cb) + if( p[pixel[15]] < c_b) + if( p[pixel[13]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[6]] < c_b) + if( p[pixel[7]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[13]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( 
p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[6]] > cb) + goto is_a_corner; + else + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + goto is_a_corner; + else + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + goto is_a_corner; + else + if( p[pixel[13]] > cb) + goto 
is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[9]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[8]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[3]] > cb) + goto is_a_corner; + else + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + 
goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[8]] > cb) + if( p[pixel[7]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[10]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[3]] > cb) + if( p[pixel[2]] > cb) + goto is_a_corner; + else + if( p[pixel[11]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[3]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[2]] < c_b) + if( p[pixel[3]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[7]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[7]] > cb) + 
if( p[pixel[8]] > cb) + if( p[pixel[9]] > cb) + if( p[pixel[6]] > cb) + if( p[pixel[5]] > cb) + if( p[pixel[4]] > cb) + if( p[pixel[3]] > cb) + if( p[pixel[2]] > cb) + if( p[pixel[1]] > cb) + goto is_a_corner; + else + if( p[pixel[10]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] > cb) + if( p[pixel[11]] > cb) + if( p[pixel[12]] > cb) + if( p[pixel[13]] > cb) + if( p[pixel[14]] > cb) + if( p[pixel[15]] > cb) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else if( p[pixel[7]] < c_b) + if( p[pixel[8]] < c_b) + if( p[pixel[9]] < c_b) + if( p[pixel[6]] < c_b) + if( p[pixel[5]] < c_b) + if( p[pixel[4]] < c_b) + if( p[pixel[3]] < c_b) + if( p[pixel[2]] < c_b) + if( p[pixel[1]] < c_b) + goto is_a_corner; + else + if( p[pixel[10]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + if( p[pixel[10]] < c_b) + if( p[pixel[11]] < c_b) + if( p[pixel[12]] < c_b) + if( p[pixel[13]] < c_b) + if( p[pixel[14]] < c_b) + if( p[pixel[15]] < c_b) + goto is_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + else + goto is_not_a_corner; + +is_a_corner: + bmin=b; + goto end_if; + +is_not_a_corner: + bmax=b; + goto end_if; + +end_if: + + if(bmin == bmax - 1 || bmin == bmax) + return bmin; + b = (bmin + bmax) / 2; + } +} + +static void make_offsets(int pixel[], int row_stride) 
+{
+  pixel[0] = 0 + row_stride * 3;
+  pixel[1] = 1 + row_stride * 3;
+  pixel[2] = 2 + row_stride * 2;
+  pixel[3] = 3 + row_stride * 1;
+  pixel[4] = 3 + row_stride * 0;
+  pixel[5] = 3 + row_stride * -1;
+  pixel[6] = 2 + row_stride * -2;
+  pixel[7] = 1 + row_stride * -3;
+  pixel[8] = 0 + row_stride * -3;
+  pixel[9] = -1 + row_stride * -3;
+  pixel[10] = -2 + row_stride * -2;
+  pixel[11] = -3 + row_stride * -1;
+  pixel[12] = -3 + row_stride * 0;
+  pixel[13] = -3 + row_stride * 1;
+  pixel[14] = -2 + row_stride * 2;
+  pixel[15] = -1 + row_stride * 3;
+}
+
+
+
+/* Score each detected corner: aom_fast9_corner_score() above bisects on
+   the threshold, so the returned score is the largest b at which the
+   pixel still passes the FAST-9 segment test. */
+int* aom_fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b)
+{
+  int* scores = (int*)malloc(sizeof(int)* num_corners);
+  int n;
+
+  int pixel[16];
+  make_offsets(pixel, stride);
+
+  for(n=0; n < num_corners; n++)
+    scores[n] = aom_fast9_corner_score(i + corners[n].y*stride + corners[n].x, pixel, b);
+
+  return scores;
+}
+
+
+/* Scan the image, skipping the 3-pixel border the circle needs, and keep
+   every pixel with 9 contiguous circle samples all brighter than cb or
+   all darker than c_b; the exhaustive decision tree below enumerates the
+   ways such an arc can occur. */
+xy* aom_fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
+{
+  int num_corners=0;
+  xy* ret_corners;
+  int rsize=512;
+  int pixel[16];
+  int x, y;
+
+  ret_corners = (xy*)malloc(sizeof(xy)*rsize);
+  make_offsets(pixel, stride);
+
+  for(y=3; y < ysize - 3; y++)
+    for(x=3; x < xsize - 3; x++)
+    {
+      const byte* p = im + y*stride + x;
+
+      int cb = *p + b;
+      int c_b= *p - b;
+      if(p[pixel[0]] > cb) + if(p[pixel[1]] > cb) + if(p[pixel[2]] > cb) + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + if(p[pixel[15]] > cb) + {} + else + continue; + else if(p[pixel[7]] < c_b) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else if(p[pixel[14]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else if(p[pixel[6]] < c_b) + if(p[pixel[15]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else if(p[pixel[13]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else if(p[pixel[13]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[5]] < c_b) +
if(p[pixel[14]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[12]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[14]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[6]] < c_b) + {} + else + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[12]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[6]] < c_b) + {} + else + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[4]] < c_b) + if(p[pixel[13]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[11]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[12]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[13]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + 
if(p[pixel[12]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + {} + else + if(p[pixel[14]] < c_b) + {} + else + continue; + else + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[11]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + {} + else + if(p[pixel[14]] < c_b) + {} + else + continue; + else + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[3]] < c_b) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[10]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + {} + else + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + 
if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[10]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + {} + else + if(p[pixel[13]] < c_b) + {} + else + continue; + else + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[2]] < c_b) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[9]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[3]] < c_b) + {} + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > 
cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[9]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[3]] < c_b) + {} + else + if(p[pixel[12]] < c_b) + {} + else + continue; + else + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[1]] < c_b) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[2]] > cb) + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[8]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[3]] < c_b) + if(p[pixel[2]] < c_b) + {} + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + 
if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[2]] > cb) + if(p[pixel[3]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[8]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[3]] < c_b) + if(p[pixel[2]] < c_b) + {} + else + if(p[pixel[11]] < c_b) + {} + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[0]] < c_b) + if(p[pixel[1]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[3]] > cb) + if(p[pixel[2]] > cb) + {} + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else 
+ continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[2]] < c_b) + if(p[pixel[3]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[1]] < c_b) + if(p[pixel[2]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[3]] > cb) + {} + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else 
if(p[pixel[2]] < c_b) + if(p[pixel[3]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + {} + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[3]] < c_b) + if(p[pixel[4]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + {} + else + if(p[pixel[14]] > cb) + {} + else + continue; + else + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[13]] < c_b) + if(p[pixel[11]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[12]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[4]] < c_b) + 
if(p[pixel[5]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[6]] > cb) + {} + else + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[14]] < c_b) + if(p[pixel[12]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[6]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[5]] < c_b) + if(p[pixel[6]] > cb) + if(p[pixel[15]] < c_b) + if(p[pixel[13]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[6]] < c_b) + if(p[pixel[7]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + if(p[pixel[15]] < c_b) + {} + else + continue; + else + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[13]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + 
continue; + else + continue; + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[6]] > cb) + {} + else + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + {} + else + if(p[pixel[14]] > cb) + {} + else + continue; + else + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + {} + else + if(p[pixel[13]] > cb) + {} + else + continue; + else + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + 
else + continue; + else + continue; + else + if(p[pixel[9]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[3]] > cb) + {} + else + if(p[pixel[12]] > cb) + {} + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[8]] > cb) + if(p[pixel[7]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[10]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[3]] > cb) + if(p[pixel[2]] > cb) + {} + else + if(p[pixel[11]] > cb) + {} + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[3]] < c_b) + if(p[pixel[4]] < c_b) + 
if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[2]] < c_b) + if(p[pixel[3]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[7]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[7]] > cb) + if(p[pixel[8]] > cb) + if(p[pixel[9]] > cb) + if(p[pixel[6]] > cb) + if(p[pixel[5]] > cb) + if(p[pixel[4]] > cb) + if(p[pixel[3]] > cb) + if(p[pixel[2]] > cb) + if(p[pixel[1]] > cb) + {} + else + if(p[pixel[10]] > cb) + {} + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + {} + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] > cb) + if(p[pixel[11]] > cb) + if(p[pixel[12]] > cb) + if(p[pixel[13]] > cb) + if(p[pixel[14]] > cb) + if(p[pixel[15]] > cb) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else if(p[pixel[7]] < c_b) + if(p[pixel[8]] < c_b) + if(p[pixel[9]] < c_b) + if(p[pixel[6]] < c_b) + if(p[pixel[5]] < c_b) + if(p[pixel[4]] < c_b) + if(p[pixel[3]] < c_b) + if(p[pixel[2]] < c_b) + if(p[pixel[1]] < c_b) + {} + else + if(p[pixel[10]] < c_b) + {} + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + {} + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + if(p[pixel[10]] < c_b) + if(p[pixel[11]] < c_b) + if(p[pixel[12]] < c_b) + if(p[pixel[13]] < c_b) + if(p[pixel[14]] < c_b) + if(p[pixel[15]] < c_b) + {} + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + else + continue; + if(num_corners == rsize) + { + rsize*=2; + ret_corners = (xy*)realloc(ret_corners, sizeof(xy)*rsize); + } + ret_corners[num_corners].x = x; + ret_corners[num_corners].y = y; + num_corners++; + + } + + *ret_num_corners = num_corners; + return ret_corners; + +} + +// clang-format on diff --git a/libs/libaom/src/third_party/fastfeat/nonmax.c b/libs/libaom/src/third_party/fastfeat/nonmax.c new file mode 100644 index 000000000..0dbc660cb --- /dev/null +++ b/libs/libaom/src/third_party/fastfeat/nonmax.c @@ -0,0 +1,121 @@ +// clang-format off 
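The new file below implements non-maximal suppression for the FAST corners emitted by the detector above: because corners arrive in raster-scan order, a per-row start index lets each corner be checked against its eight neighbours without building a grid. A minimal sketch of how the three fastfeat stages are typically chained, assuming the aom_-prefixed detect/score entry points declared in fast.h (their exact signatures are an assumption here, following the upstream fast9 API):

    // Sketch only -- not part of the patch. Assumes fast.h declares
    // aom_fast9_detect() and aom_fast9_score() with the upstream fast9
    // signatures; aom_nonmax_suppression() is defined in the file below.
    #include <stdlib.h>
    #include "fast.h"

    static xy *detect_corners(const byte *img, int w, int h, int stride,
                              int threshold, int *n_out) {
      int n_raw = 0;
      xy *raw = aom_fast9_detect(img, w, h, stride, threshold, &n_raw);
      int *scores = aom_fast9_score(img, stride, raw, n_raw, threshold);
      xy *kept = aom_nonmax_suppression(raw, scores, n_raw, n_out);
      free(scores);  // fastfeat returns malloc'd buffers; the caller owns them
      free(raw);
      return kept;   // free() when done
    }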
+#include <stdlib.h> +#include "fast.h" + + +#define Compare(X, Y) ((X)>=(Y)) + +xy* aom_nonmax_suppression(const xy* corners, const int* scores, int num_corners, int* ret_num_nonmax) +{ + int num_nonmax=0; + int last_row; + int* row_start; + int i, j; + xy* ret_nonmax; + const int sz = (int)num_corners; + + /*Point above points (roughly) to the pixel above the one of interest, if there + is a feature there.*/ + int point_above = 0; + int point_below = 0; + + + if(num_corners < 1) + { + *ret_num_nonmax = 0; + return 0; + } + + ret_nonmax = (xy*)malloc(num_corners * sizeof(xy)); + + /* Find where each row begins + (the corners are output in raster scan order). A beginning of -1 signifies + that there are no corners on that row. */ + last_row = corners[num_corners-1].y; + row_start = (int*)malloc((last_row+1)*sizeof(int)); + + for(i=0; i < last_row+1; i++) + row_start[i] = -1; + + { + int prev_row = -1; + for(i=0; i< num_corners; i++) + if(corners[i].y != prev_row) + { + row_start[corners[i].y] = i; + prev_row = corners[i].y; + } + } + + + + for(i=0; i < sz; i++) + { + int score = scores[i]; + xy pos = corners[i]; + + /*Check left */ + if(i > 0) + if(corners[i-1].x == pos.x-1 && corners[i-1].y == pos.y && Compare(scores[i-1], score)) + continue; + + /*Check right*/ + if(i < (sz - 1)) + if(corners[i+1].x == pos.x+1 && corners[i+1].y == pos.y && Compare(scores[i+1], score)) + continue; + + /*Check above (if there is a valid row above)*/ + if(pos.y > 0) + if (row_start[pos.y - 1] != -1) + { + /*Make sure that current point_above is one + row above.*/ + if(corners[point_above].y < pos.y - 1) + point_above = row_start[pos.y-1]; + + /*Make point_above point to the first of the pixels above the current point, + if it exists.*/ + for(; corners[point_above].y < pos.y && corners[point_above].x < pos.x - 1; point_above++) + {} + + + for(j=point_above; corners[j].y < pos.y && corners[j].x <= pos.x + 1; j++) + { + int x = corners[j].x; + if( (x == pos.x - 1 || x == pos.x || x == pos.x+1) && Compare(scores[j], score)) + goto cont; + } + + } + + /*Check below (if there is anything below)*/ + if(pos.y >= 0) + if (pos.y != last_row && row_start[pos.y + 1] != -1 && point_below < sz) /*Nothing below*/ + { + if(corners[point_below].y < pos.y + 1) + point_below = row_start[pos.y+1]; + + /* Make point_below point to one of the pixels below the current point, if it + exists.*/ + for(; point_below < sz && corners[point_below].y == pos.y+1 && corners[point_below].x < pos.x - 1; point_below++) + {} + + for(j=point_below; j < sz && corners[j].y == pos.y+1 && corners[j].x <= pos.x + 1; j++) + { + int x = corners[j].x; + if( (x == pos.x - 1 || x == pos.x || x == pos.x+1) && Compare(scores[j], score)) + goto cont; + } + } + + ret_nonmax[num_nonmax++] = corners[i]; +cont: + ; + } + + free(row_start); + *ret_num_nonmax = num_nonmax; + return ret_nonmax; +} + +// clang-format on diff --git a/libs/libaom/src/third_party/googletest/README.libaom b/libs/libaom/src/third_party/googletest/README.libaom new file mode 100644 index 000000000..9b8a86398 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/README.libaom @@ -0,0 +1,17 @@ +URL: https://github.com/google/googletest +Version: 1.10.x +License: BSD +License File: LICENSE + +Description: +Google's framework for writing C++ tests on a variety of platforms +(Linux, Mac OS X, Windows, Windows CE, Symbian, etc). Based on the +xUnit architecture.
Supports automatic test discovery, a rich set of +assertions, user-defined assertions, death tests, fatal and non-fatal +failures, various options for running the tests, and XML test report +generation. + +Local Modifications: +- Replace everything in: + third_party/googletest/src/googletest/src/ + third_party/googletest/src/googletest/include/ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/CHANGES b/libs/libaom/src/third_party/googletest/src/googletest/CHANGES new file mode 100644 index 000000000..055213242 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/CHANGES @@ -0,0 +1,157 @@ +Changes for 1.7.0: + +* New feature: death tests are supported on OpenBSD and in iOS + simulator now. +* New feature: Google Test now implements a protocol to allow + a test runner to detect that a test program has exited + prematurely and report it as a failure (before it would be + falsely reported as a success if the exit code is 0). +* New feature: Test::RecordProperty() can now be used outside of the + lifespan of a test method, in which case it will be attributed to + the current test case or the test program in the XML report. +* New feature (potentially breaking): --gtest_list_tests now prints + the type parameters and value parameters for each test. +* Improvement: char pointers and char arrays are now escaped properly + in failure messages. +* Improvement: failure summary in XML reports now includes file and + line information. +* Improvement: the <testsuites> XML element now has a timestamp attribute. +* Improvement: When --gtest_filter is specified, XML report now doesn't + contain information about tests that are filtered out. +* Fixed the bug where long --gtest_filter flag values are truncated in + death tests. +* Potentially breaking change: RUN_ALL_TESTS() is now implemented as a + function instead of a macro in order to work better with Clang. +* Compatibility fixes with C++ 11 and various platforms. +* Bug/warning fixes. + +Changes for 1.6.0: + +* New feature: ADD_FAILURE_AT() for reporting a test failure at the + given source location -- useful for writing testing utilities. +* New feature: the universal value printer is moved from Google Mock + to Google Test. +* New feature: type parameters and value parameters are reported in + the XML report now. +* A gtest_disable_pthreads CMake option. +* Colored output works in GNU Screen sessions now. +* Parameters of value-parameterized tests are now printed in the + textual output. +* Failures from ad hoc test assertions run before RUN_ALL_TESTS() are + now correctly reported. +* Arguments of ASSERT_XY and EXPECT_XY no longer need to support << to + ostream. +* More complete handling of exceptions. +* GTEST_ASSERT_XY can be used instead of ASSERT_XY in case the latter + name is already used by another library. +* --gtest_catch_exceptions is now true by default, allowing a test + program to continue after an exception is thrown. +* Value-parameterized test fixtures can now derive from Test and + WithParamInterface separately, easing conversion of legacy tests. +* Death test messages are clearly marked to make them more + distinguishable from other messages. +* Compatibility fixes for Android, Google Native Client, MinGW, HP UX, + PowerPC, Lucid autotools, libCStd, Sun C++, Borland C++ Builder (Code Gear), + IBM XL C++ (Visual Age C++), and C++0x. +* Bug fixes and implementation clean-ups. +* Potentially incompatible changes: disables the harmful 'make install' + command in autotools.
+ +Changes for 1.5.0: + + * New feature: assertions can be safely called in multiple threads + where the pthreads library is available. + * New feature: predicates used inside EXPECT_TRUE() and friends + can now generate custom failure messages. + * New feature: Google Test can now be compiled as a DLL. + * New feature: fused source files are included. + * New feature: prints help when encountering unrecognized Google Test flags. + * Experimental feature: CMake build script (requires CMake 2.6.4+). + * Experimental feature: the Pump script for meta programming. + * double values streamed to an assertion are printed with enough precision + to differentiate any two different values. + * Google Test now works on Solaris and AIX. + * Build and test script improvements. + * Bug fixes and implementation clean-ups. + + Potentially breaking changes: + + * Stopped supporting VC++ 7.1 with exceptions disabled. + * Dropped support for 'make install'. + +Changes for 1.4.0: + + * New feature: the event listener API + * New feature: test shuffling + * New feature: the XML report format is closer to junitreport and can + be parsed by Hudson now. + * New feature: when a test runs under Visual Studio, its failures are + integrated in the IDE. + * New feature: /MD(d) versions of VC++ projects. + * New feature: elapsed time for the tests is printed by default. + * New feature: comes with a TR1 tuple implementation such that Boost + is no longer needed for Combine(). + * New feature: EXPECT_DEATH_IF_SUPPORTED macro and friends. + * New feature: the Xcode project can now produce static gtest + libraries in addition to a framework. + * Compatibility fixes for Solaris, Cygwin, minGW, Windows Mobile, + Symbian, gcc, and C++Builder. + * Bug fixes and implementation clean-ups. + +Changes for 1.3.0: + + * New feature: death tests on Windows, Cygwin, and Mac. + * New feature: ability to use Google Test assertions in other testing + frameworks. + * New feature: ability to run disabled test via + --gtest_also_run_disabled_tests. + * New feature: the --help flag for printing the usage. + * New feature: access to Google Test flag values in user code. + * New feature: a script that packs Google Test into one .h and one + .cc file for easy deployment. + * New feature: support for distributing test functions to multiple + machines (requires support from the test runner). + * Bug fixes and implementation clean-ups. + +Changes for 1.2.1: + + * Compatibility fixes for Linux IA-64 and IBM z/OS. + * Added support for using Boost and other TR1 implementations. + * Changes to the build scripts to support upcoming release of Google C++ + Mocking Framework. + * Added Makefile to the distribution package. + * Improved build instructions in README. + +Changes for 1.2.0: + + * New feature: value-parameterized tests. + * New feature: the ASSERT/EXPECT_(NON)FATAL_FAILURE(_ON_ALL_THREADS) + macros. + * Changed the XML report format to match JUnit/Ant's. + * Added tests to the Xcode project. + * Added scons/SConscript for building with SCons. + * Added src/gtest-all.cc for building Google Test from a single file. + * Fixed compatibility with Solaris and z/OS. + * Enabled running Python tests on systems with python 2.3 installed, + e.g. Mac OS X 10.4. + * Bug fixes. + +Changes for 1.1.0: + + * New feature: type-parameterized tests. + * New feature: exception assertions. + * New feature: printing elapsed time of tests. + * Improved the robustness of death tests. + * Added an Xcode project and samples. 
+ * Adjusted the output format on Windows to be understandable by Visual Studio. + * Minor bug fixes. + +Changes for 1.0.1: + + * Added project files for Visual Studio 7.1. + * Fixed issues with compiling on Mac OS X. + * Fixed issues with compiling on Cygwin. + +Changes for 1.0.0: + + * Initial Open Source release of Google Test diff --git a/libs/libaom/src/third_party/googletest/src/googletest/CMakeLists.txt b/libs/libaom/src/third_party/googletest/src/googletest/CMakeLists.txt new file mode 100644 index 000000000..9ee79408c --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/CMakeLists.txt @@ -0,0 +1,331 @@ +######################################################################## +# CMake build script for Google Test. +# +# To run the tests for Google Test itself on Linux, use 'make test' or +# ctest. You can select which tests to run using 'ctest -R regex'. +# For more options, run 'ctest --help'. + +# When other libraries are using a shared version of runtime libraries, +# Google Test also has to use one. +option( + gtest_force_shared_crt + "Use shared (DLL) run-time lib even when Google Test is built as static lib." + OFF) + +option(gtest_build_tests "Build all of gtest's own tests." OFF) + +option(gtest_build_samples "Build gtest's sample programs." OFF) + +option(gtest_disable_pthreads "Disable uses of pthreads in gtest." OFF) + +option( + gtest_hide_internal_symbols + "Build gtest with internal symbols hidden in shared libraries." + OFF) + +# Defines pre_project_set_up_hermetic_build() and set_up_hermetic_build(). +include(cmake/hermetic_build.cmake OPTIONAL) + +if (COMMAND pre_project_set_up_hermetic_build) + pre_project_set_up_hermetic_build() +endif() + +######################################################################## +# +# Project-wide settings + +# Name of the project. +# +# CMake files in this project can refer to the root source directory +# as ${gtest_SOURCE_DIR} and to the root binary directory as +# ${gtest_BINARY_DIR}. +# Language "C" is required for find_package(Threads). +if (CMAKE_VERSION VERSION_LESS 3.0) + project(gtest CXX C) +else() + cmake_policy(SET CMP0048 NEW) + project(gtest VERSION ${GOOGLETEST_VERSION} LANGUAGES CXX C) +endif() +cmake_minimum_required(VERSION 2.6.4) + +if (POLICY CMP0063) # Visibility + cmake_policy(SET CMP0063 NEW) +endif (POLICY CMP0063) + +if (COMMAND set_up_hermetic_build) + set_up_hermetic_build() +endif() + +# These commands only run if this is the main project +if(CMAKE_PROJECT_NAME STREQUAL "gtest" OR CMAKE_PROJECT_NAME STREQUAL "googletest-distribution") + + # BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to + # make it prominent in the GUI. + option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)." OFF) + +else() + + mark_as_advanced( + gtest_force_shared_crt + gtest_build_tests + gtest_build_samples + gtest_disable_pthreads + gtest_hide_internal_symbols) + +endif() + + +if (gtest_hide_internal_symbols) + set(CMAKE_CXX_VISIBILITY_PRESET hidden) + set(CMAKE_VISIBILITY_INLINES_HIDDEN 1) +endif() + +# Define helper functions and macros used by Google Test. +include(cmake/internal_utils.cmake) + +config_compiler_and_linker() # Defined in internal_utils.cmake. + +# Create the CMake package file descriptors. 
+if (INSTALL_GTEST) + include(CMakePackageConfigHelpers) + set(cmake_package_name GTest) + set(targets_export_name ${cmake_package_name}Targets CACHE INTERNAL "") + set(generated_dir "${CMAKE_CURRENT_BINARY_DIR}/generated" CACHE INTERNAL "") + set(cmake_files_install_dir "${CMAKE_INSTALL_LIBDIR}/cmake/${cmake_package_name}") + set(version_file "${generated_dir}/${cmake_package_name}ConfigVersion.cmake") + write_basic_package_version_file(${version_file} COMPATIBILITY AnyNewerVersion) + install(EXPORT ${targets_export_name} + NAMESPACE ${cmake_package_name}:: + DESTINATION ${cmake_files_install_dir}) + set(config_file "${generated_dir}/${cmake_package_name}Config.cmake") + configure_package_config_file("${gtest_SOURCE_DIR}/cmake/Config.cmake.in" + "${config_file}" INSTALL_DESTINATION ${cmake_files_install_dir}) + install(FILES ${version_file} ${config_file} + DESTINATION ${cmake_files_install_dir}) +endif() + +# Where Google Test's .h files can be found. +set(gtest_build_include_dirs + "${gtest_SOURCE_DIR}/include" + "${gtest_SOURCE_DIR}") +include_directories(${gtest_build_include_dirs}) + +# Summary of tuple support for Microsoft Visual Studio: +# Compiler version(MS) version(cmake) Support +# ---------- ----------- -------------- ----------------------------- +# <= VS 2010 <= 10 <= 1600 Use Google Test's own tuple. +# VS 2012 11 1700 std::tr1::tuple + _VARIADIC_MAX=10 +# VS 2013 12 1800 std::tr1::tuple +# VS 2015 14 1900 std::tuple +# VS 2017 15 >= 1910 std::tuple +if (MSVC AND MSVC_VERSION EQUAL 1700) + add_definitions(/D _VARIADIC_MAX=10) +endif() + +######################################################################## +# +# Defines the gtest & gtest_main libraries. User tests should link +# with one of them. + +# Google Test libraries. We build them using more strict warnings than what +# are used for other targets, to ensure that gtest can be compiled by a user +# aggressive about warnings. +cxx_library(gtest "${cxx_strict}" src/gtest-all.cc) +cxx_library(gtest_main "${cxx_strict}" src/gtest_main.cc) +# If the CMake version supports it, attach header directory information +# to the targets for when we are part of a parent build (i.e. being pulled +# in via add_subdirectory() rather than being a standalone build). +if (DEFINED CMAKE_VERSION AND NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11") + target_include_directories(gtest SYSTEM INTERFACE + "$<BUILD_INTERFACE:${gtest_build_include_dirs}>" + "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>") + target_include_directories(gtest_main SYSTEM INTERFACE + "$<BUILD_INTERFACE:${gtest_build_include_dirs}>" + "$<INSTALL_INTERFACE:$<INSTALL_PREFIX>/${CMAKE_INSTALL_INCLUDEDIR}>") +endif() +target_link_libraries(gtest_main PUBLIC gtest) + +######################################################################## +# +# Install rules +install_project(gtest gtest_main) + +######################################################################## +# +# Samples on how to link user tests with gtest or gtest_main. +# +# They are not built by default. To build them, set the +# gtest_build_samples option to ON. You can do it by running ccmake +# or specifying the -Dgtest_build_samples=ON flag when running cmake.
+ +if (gtest_build_samples) + cxx_executable(sample1_unittest samples gtest_main samples/sample1.cc) + cxx_executable(sample2_unittest samples gtest_main samples/sample2.cc) + cxx_executable(sample3_unittest samples gtest_main) + cxx_executable(sample4_unittest samples gtest_main samples/sample4.cc) + cxx_executable(sample5_unittest samples gtest_main samples/sample1.cc) + cxx_executable(sample6_unittest samples gtest_main) + cxx_executable(sample7_unittest samples gtest_main) + cxx_executable(sample8_unittest samples gtest_main) + cxx_executable(sample9_unittest samples gtest) + cxx_executable(sample10_unittest samples gtest) +endif() + +######################################################################## +# +# Google Test's own tests. +# +# You can skip this section if you aren't interested in testing +# Google Test itself. +# +# The tests are not built by default. To build them, set the +# gtest_build_tests option to ON. You can do it by running ccmake +# or specifying the -Dgtest_build_tests=ON flag when running cmake. + +if (gtest_build_tests) + # This must be set in the root directory for the tests to be run by + # 'make test' or ctest. + enable_testing() + + ############################################################ + # C++ tests built with standard compiler flags. + + cxx_test(googletest-death-test-test gtest_main) + cxx_test(gtest_environment_test gtest) + cxx_test(googletest-filepath-test gtest_main) + cxx_test(googletest-linked-ptr-test gtest_main) + cxx_test(googletest-listener-test gtest_main) + cxx_test(gtest_main_unittest gtest_main) + cxx_test(googletest-message-test gtest_main) + cxx_test(gtest_no_test_unittest gtest) + cxx_test(googletest-options-test gtest_main) + cxx_test(googletest-param-test-test gtest + test/googletest-param-test2-test.cc) + cxx_test(googletest-port-test gtest_main) + cxx_test(gtest_pred_impl_unittest gtest_main) + cxx_test(gtest_premature_exit_test gtest + test/gtest_premature_exit_test.cc) + cxx_test(googletest-printers-test gtest_main) + cxx_test(gtest_prod_test gtest_main + test/production.cc) + cxx_test(gtest_repeat_test gtest) + cxx_test(gtest_sole_header_test gtest_main) + cxx_test(gtest_stress_test gtest) + cxx_test(googletest-test-part-test gtest_main) + cxx_test(gtest_throw_on_failure_ex_test gtest) + cxx_test(gtest-typed-test_test gtest_main + test/gtest-typed-test2_test.cc) + cxx_test(gtest_unittest gtest_main) + cxx_test(gtest-unittest-api_test gtest) + + ############################################################ + # C++ tests built with non-standard compiler flags. + + # MSVC 7.1 does not support STL with exceptions disabled. 
+ if (NOT MSVC OR MSVC_VERSION GREATER 1310) + cxx_library(gtest_no_exception "${cxx_no_exception}" + src/gtest-all.cc) + cxx_library(gtest_main_no_exception "${cxx_no_exception}" + src/gtest-all.cc src/gtest_main.cc) + endif() + cxx_library(gtest_main_no_rtti "${cxx_no_rtti}" + src/gtest-all.cc src/gtest_main.cc) + + cxx_test_with_flags(gtest-death-test_ex_nocatch_test + "${cxx_exception} -DGTEST_ENABLE_CATCH_EXCEPTIONS_=0" + gtest test/googletest-death-test_ex_test.cc) + cxx_test_with_flags(gtest-death-test_ex_catch_test + "${cxx_exception} -DGTEST_ENABLE_CATCH_EXCEPTIONS_=1" + gtest test/googletest-death-test_ex_test.cc) + + cxx_test_with_flags(gtest_no_rtti_unittest "${cxx_no_rtti}" + gtest_main_no_rtti test/gtest_unittest.cc) + + cxx_shared_library(gtest_dll "${cxx_default}" + src/gtest-all.cc src/gtest_main.cc) + + cxx_executable_with_flags(gtest_dll_test_ "${cxx_default}" + gtest_dll test/gtest_all_test.cc) + set_target_properties(gtest_dll_test_ + PROPERTIES + COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1") + + if (NOT MSVC OR MSVC_VERSION LESS 1600) # 1600 is Visual Studio 2010. + # Visual Studio 2010, 2012, and 2013 define symbols in std::tr1 that + # conflict with our own definitions. Therefore using our own tuple does not + # work on those compilers. + cxx_library(gtest_main_use_own_tuple "${cxx_use_own_tuple}" + src/gtest-all.cc src/gtest_main.cc) + + cxx_test_with_flags(googletest-tuple-test "${cxx_use_own_tuple}" + gtest_main_use_own_tuple test/googletest-tuple-test.cc) + + cxx_test_with_flags(gtest_use_own_tuple_test "${cxx_use_own_tuple}" + gtest_main_use_own_tuple + test/googletest-param-test-test.cc test/googletest-param-test2-test.cc) + endif() + + ############################################################ + # Python tests. + + cxx_executable(googletest-break-on-failure-unittest_ test gtest) + py_test(googletest-break-on-failure-unittest) + + # Visual Studio .NET 2003 does not support STL with exceptions disabled. + if (NOT MSVC OR MSVC_VERSION GREATER 1310) # 1310 is Visual Studio .NET 2003 + cxx_executable_with_flags( + googletest-catch-exceptions-no-ex-test_ + "${cxx_no_exception}" + gtest_main_no_exception + test/googletest-catch-exceptions-test_.cc) + endif() + + cxx_executable_with_flags( + googletest-catch-exceptions-ex-test_ + "${cxx_exception}" + gtest_main + test/googletest-catch-exceptions-test_.cc) + py_test(googletest-catch-exceptions-test) + + cxx_executable(googletest-color-test_ test gtest) + py_test(googletest-color-test) + + cxx_executable(googletest-env-var-test_ test gtest) + py_test(googletest-env-var-test) + + cxx_executable(googletest-filter-unittest_ test gtest) + py_test(googletest-filter-unittest) + + cxx_executable(gtest_help_test_ test gtest_main) + py_test(gtest_help_test) + + cxx_executable(googletest-list-tests-unittest_ test gtest) + py_test(googletest-list-tests-unittest) + + cxx_executable(googletest-output-test_ test gtest) + py_test(googletest-output-test --no_stacktrace_support) + + cxx_executable(googletest-shuffle-test_ test gtest) + py_test(googletest-shuffle-test) + + # MSVC 7.1 does not support STL with exceptions disabled. 
+ if (NOT MSVC OR MSVC_VERSION GREATER 1310) + cxx_executable(googletest-throw-on-failure-test_ test gtest_no_exception) + set_target_properties(googletest-throw-on-failure-test_ + PROPERTIES + COMPILE_FLAGS "${cxx_no_exception}") + py_test(googletest-throw-on-failure-test) + endif() + + cxx_executable(googletest-uninitialized-test_ test gtest) + py_test(googletest-uninitialized-test) + + cxx_executable(gtest_xml_outfile1_test_ test gtest_main) + cxx_executable(gtest_xml_outfile2_test_ test gtest_main) + py_test(gtest_xml_outfiles_test) + py_test(googletest-json-outfiles-test) + + cxx_executable(gtest_xml_output_unittest_ test gtest) + py_test(gtest_xml_output_unittest --no_stacktrace_support) + py_test(googletest-json-output-unittest --no_stacktrace_support) +endif() diff --git a/libs/libaom/src/third_party/googletest/src/googletest/CONTRIBUTORS b/libs/libaom/src/third_party/googletest/src/googletest/CONTRIBUTORS new file mode 100644 index 000000000..feae2fc04 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/CONTRIBUTORS @@ -0,0 +1,37 @@ +# This file contains a list of people who've made non-trivial +# contribution to the Google C++ Testing Framework project. People +# who commit code to the project are encouraged to add their names +# here. Please keep the list sorted by first names. + +Ajay Joshi +Balázs Dán +Bharat Mediratta +Chandler Carruth +Chris Prince +Chris Taylor +Dan Egnor +Eric Roman +Hady Zalek +Jeffrey Yasskin +Jói Sigurðsson +Keir Mierle +Keith Ray +Kenton Varda +Manuel Klimek +Markus Heule +Mika Raento +Miklós Fazekas +Pasi Valminen +Patrick Hanna +Patrick Riley +Peter Kaminski +Preston Jackson +Rainer Klaffenboeck +Russ Cox +Russ Rufer +Sean Mcafee +Sigurður Ásgeirsson +Tracy Bialik +Vadim Berman +Vlad Losev +Zhanyong Wan diff --git a/libs/libaom/src/third_party/googletest/src/googletest/LICENSE b/libs/libaom/src/third_party/googletest/src/googletest/LICENSE new file mode 100644 index 000000000..1941a11f8 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/LICENSE @@ -0,0 +1,28 @@ +Copyright 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/libs/libaom/src/third_party/googletest/src/googletest/README.md b/libs/libaom/src/third_party/googletest/src/googletest/README.md new file mode 100644 index 000000000..e30fe8047 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/README.md @@ -0,0 +1,341 @@ +### Generic Build Instructions + +#### Setup + +To build Google Test and your tests that use it, you need to tell your build +system where to find its headers and source files. The exact way to do it +depends on which build system you use, and is usually straightforward. + +#### Build + +Suppose you put Google Test in directory `${GTEST_DIR}`. To build it, create a +library build target (or a project as called by Visual Studio and Xcode) to +compile + + ${GTEST_DIR}/src/gtest-all.cc + +with `${GTEST_DIR}/include` in the system header search path and `${GTEST_DIR}` +in the normal header search path. Assuming a Linux-like system and gcc, +something like the following will do: + + g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \ + -pthread -c ${GTEST_DIR}/src/gtest-all.cc + ar -rv libgtest.a gtest-all.o + +(We need `-pthread` as Google Test uses threads.) + +Next, you should compile your test source file with `${GTEST_DIR}/include` in +the system header search path, and link it with gtest and any other necessary +libraries: + + g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \ + -o your_test + +As an example, the make/ directory contains a Makefile that you can use to build +Google Test on systems where GNU make is available (e.g. Linux, Mac OS X, and +Cygwin). It doesn't try to build Google Test's own tests. Instead, it just +builds the Google Test library and a sample test. You can use it as a starting +point for your own build script. + +If the default settings are correct for your environment, the following commands +should succeed: + + cd ${GTEST_DIR}/make + make + ./sample1_unittest + +If you see errors, try to tweak the contents of `make/Makefile` to make them go +away. There are instructions in `make/Makefile` on how to do it. + +### Using CMake + +Google Test comes with a CMake build script ( +[CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt)) +that can be used on a wide range of platforms ("C" stands for cross-platform.). +If you don't have CMake installed already, you can download it for free from +<http://www.cmake.org/>. + +CMake works by generating native makefiles or build projects that can be used in +the compiler environment of your choice. You can either build Google Test as a +standalone project or it can be incorporated into an existing CMake build for +another project. + +#### Standalone CMake Project + +When building Google Test as a standalone project, the typical workflow starts +with: + + mkdir mybuild # Create a directory to hold the build output. + cd mybuild + cmake ${GTEST_DIR} # Generate native build scripts.
+ +If you want to build Google Test's samples, you should replace the last command +with + + cmake -Dgtest_build_samples=ON ${GTEST_DIR} + +If you are on a \*nix system, you should now see a Makefile in the current +directory. Just type 'make' to build gtest. + +If you use Windows and have Visual Studio installed, a `gtest.sln` file and +several `.vcproj` files will be created. You can then build them using Visual +Studio. + +On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated. + +#### Incorporating Into An Existing CMake Project + +If you want to use gtest in a project which already uses CMake, then a more +robust and flexible approach is to build gtest as part of that project directly. +This is done by making the GoogleTest source code available to the main build +and adding it using CMake's `add_subdirectory()` command. This has the +significant advantage that the same compiler and linker settings are used +between gtest and the rest of your project, so issues associated with using +incompatible libraries (eg debug/release), etc. are avoided. This is +particularly useful on Windows. Making GoogleTest's source code available to the +main build can be done a few different ways: + +* Download the GoogleTest source code manually and place it at a known + location. This is the least flexible approach and can make it more difficult + to use with continuous integration systems, etc. +* Embed the GoogleTest source code as a direct copy in the main project's + source tree. This is often the simplest approach, but is also the hardest to + keep up to date. Some organizations may not permit this method. +* Add GoogleTest as a git submodule or equivalent. This may not always be + possible or appropriate. Git submodules, for example, have their own set of + advantages and drawbacks. +* Use CMake to download GoogleTest as part of the build's configure step. This + is just a little more complex, but doesn't have the limitations of the other + methods. + +The last of the above methods is implemented with a small piece of CMake code in +a separate file (e.g. `CMakeLists.txt.in`) which is copied to the build area and +then invoked as a sub-build _during the CMake stage_. That directory is then +pulled into the main build with `add_subdirectory()`. For example: + +New file `CMakeLists.txt.in`: + + cmake_minimum_required(VERSION 2.8.2) + + project(googletest-download NONE) + + include(ExternalProject) + ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG master + SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src" + BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) + +Existing build's `CMakeLists.txt`: + + # Download and unpack googletest at configure time + configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) + execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) + if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") + endif() + execute_process(COMMAND ${CMAKE_COMMAND} --build . 
+ RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) + if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") + endif() + + # Prevent overriding the parent project's compiler/linker + # settings on Windows + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + + # Add googletest directly to our build. This defines + # the gtest and gtest_main targets. + add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src + ${CMAKE_BINARY_DIR}/googletest-build + EXCLUDE_FROM_ALL) + + # The gtest/gtest_main targets carry header search path + # dependencies automatically when using CMake 2.8.11 or + # later. Otherwise we have to add them here ourselves. + if (CMAKE_VERSION VERSION_LESS 2.8.11) + include_directories("${gtest_SOURCE_DIR}/include") + endif() + + # Now simply link against gtest or gtest_main as needed. Eg + add_executable(example example.cpp) + target_link_libraries(example gtest_main) + add_test(NAME example_test COMMAND example) + +Note that this approach requires CMake 2.8.2 or later due to its use of the +`ExternalProject_Add()` command. The above technique is discussed in more detail +in [this separate article](http://crascit.com/2015/07/25/cmake-gtest/) which +also contains a link to a fully generalized implementation of the technique. + +##### Visual Studio Dynamic vs Static Runtimes + +By default, new Visual Studio projects link the C runtimes dynamically but +Google Test links them statically. This will generate an error that looks +something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch +detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value +'MDd_DynamicDebug' in main.obj + +Google Test already has a CMake option for this: `gtest_force_shared_crt` + +Enabling this option will make gtest link the runtimes dynamically too, and +match the project in which it is included. + +### Legacy Build Scripts + +Before settling on CMake, we have been providing hand-maintained build +projects/scripts for Visual Studio, Xcode, and Autotools. While we continue to +provide them for convenience, they are not actively maintained any more. We +highly recommend that you follow the instructions in the above sections to +integrate Google Test with your existing build system. + +If you still need to use the legacy build scripts, here's how: + +The msvc\ folder contains two solutions with Visual C++ projects. Open the +`gtest.sln` or `gtest-md.sln` file using Visual Studio, and you are ready to +build Google Test the same way you build any Visual Studio project. Files that +have names ending with -md use DLL versions of Microsoft runtime libraries (the +/MD or the /MDd compiler option). Files without that suffix use static versions +of the runtime libraries (the /MT or the /MTd option). Please note that one must +use the same option to compile both gtest and the test code. If you use Visual +Studio 2005 or above, we recommend the -md version as /MD is the default for new +projects in these versions of Visual Studio. + +On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using Xcode. +Build the "gtest" target. The universal binary framework will end up in your +selected build directory (selected in the Xcode "Preferences..." -> "Building" +pane and defaults to xcode/build). Alternatively, at the command line, enter: + + xcodebuild + +This will build the "Release" configuration of gtest.framework in your default +build location. 
See the "xcodebuild" man page for more information about +building different configurations and building in different locations. + +If you wish to use the Google Test Xcode project with Xcode 4.x and above, you +need to either: + +* update the SDK configuration options in xcode/Config/General.xconfig. + Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If + you choose this route you lose the ability to target earlier versions of + MacOS X. +* Install an SDK for an earlier version. This doesn't appear to be supported + by Apple, but has been reported to work + (http://stackoverflow.com/questions/5378518). + +### Tweaking Google Test + +Google Test can be used in diverse environments. The default configuration may +not work (or may not work well) out of the box in some environments. However, +you can easily tweak Google Test by defining control macros on the compiler +command line. Generally, these macros are named like `GTEST_XYZ` and you define +them to either 1 or 0 to enable or disable a certain feature. + +We list the most frequently used macros below. For a complete list, see file +[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/include/gtest/internal/gtest-port.h). + +### Choosing a TR1 Tuple Library + +Some Google Test features require the C++ Technical Report 1 (TR1) tuple +library, which is not yet available with all compilers. The good news is that +Google Test implements a subset of TR1 tuple that's enough for its own need, and +will automatically use this when the compiler doesn't provide TR1 tuple. + +Usually you don't need to care about which tuple library Google Test uses. +However, if your project already uses TR1 tuple, you need to tell Google Test to +use the same TR1 tuple library the rest of your project uses, or the two tuple +implementations will clash. To do that, add + + -DGTEST_USE_OWN_TR1_TUPLE=0 + +to the compiler flags while compiling Google Test and your tests. If you want to +force Google Test to use its own tuple library, just add + + -DGTEST_USE_OWN_TR1_TUPLE=1 + +to the compiler flags instead. + +If you don't want Google Test to use tuple at all, add + + -DGTEST_HAS_TR1_TUPLE=0 + +and all features using tuple will be disabled. + +### Multi-threaded Tests + +Google Test is thread-safe where the pthread library is available. After +`#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` macro to see +whether this is the case (yes if the macro is `#defined` to 1, no if it's +undefined.). + +If Google Test doesn't correctly detect whether pthread is available in your +environment, you can force it with + + -DGTEST_HAS_PTHREAD=1 + +or + + -DGTEST_HAS_PTHREAD=0 + +When Google Test uses pthread, you may need to add flags to your compiler and/or +linker to select the pthread library, or you'll get link errors. If you use the +CMake script or the deprecated Autotools script, this is taken care of for you. +If you use your own build script, you'll need to read your compiler and linker's +manual to figure out what flags to add. + +### As a Shared Library (DLL) + +Google Test is compact, so most users can build and link it as a static library +for the simplicity. You can choose to use Google Test as a shared library (known +as a DLL on Windows) if you prefer. + +To compile *gtest* as a shared library, add + + -DGTEST_CREATE_SHARED_LIBRARY=1 + +to the compiler flags. You'll also need to tell the linker to produce a shared +library instead - consult your linker's manual for how to do it. 
+
+To compile your *tests* that use the gtest shared library, add
+
+    -DGTEST_LINKED_AS_SHARED_LIBRARY=1
+
+to the compiler flags.
+
+Note: while the above steps aren't technically necessary today when using some
+compilers (e.g. GCC), they may become necessary in the future, if we decide to
+improve the speed of loading the library (see
+<http://gcc.gnu.org/wiki/Visibility> for details). Therefore you are
+recommended to always add the above flags when using Google Test as a shared
+library. Otherwise a future release of Google Test may break your build script.
+
+### Avoiding Macro Name Clashes
+
+In C++, macros don't obey namespaces. Therefore two libraries that both define
+a macro of the same name will clash if you `#include` both definitions. In
+case a Google Test macro clashes with another library, you can force Google
+Test to rename its macro to avoid the conflict.
+
+Specifically, if both Google Test and some other code define macro FOO, you
+can add
+
+    -DGTEST_DONT_DEFINE_FOO=1
+
+to the compiler flags to tell Google Test to change the macro's name from `FOO`
+to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For
+example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write
+
+    GTEST_TEST(SomeTest, DoesThis) { ... }
+
+instead of
+
+    TEST(SomeTest, DoesThis) { ... }
+
+in order to define a test.
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/cmake/Config.cmake.in b/libs/libaom/src/third_party/googletest/src/googletest/cmake/Config.cmake.in
new file mode 100644
index 000000000..12be4498b
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/cmake/Config.cmake.in
@@ -0,0 +1,9 @@
+@PACKAGE_INIT@
+include(CMakeFindDependencyMacro)
+if (@GTEST_HAS_PTHREAD@)
+  set(THREADS_PREFER_PTHREAD_FLAG @THREADS_PREFER_PTHREAD_FLAG@)
+  find_dependency(Threads)
+endif()
+
+include("${CMAKE_CURRENT_LIST_DIR}/@targets_export_name@.cmake")
+check_required_components("@project_name@")
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest.pc.in b/libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest.pc.in
new file mode 100644
index 000000000..e7967ad56
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest.pc.in
@@ -0,0 +1,9 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: gtest
+Description: GoogleTest (without main() function)
+Version: @PROJECT_VERSION@
+URL: https://github.com/google/googletest
+Libs: -L${libdir} -lgtest @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ @CMAKE_THREAD_LIBS_INIT@
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest_main.pc.in b/libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest_main.pc.in
new file mode 100644
index 000000000..fe25d9c73
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/cmake/gtest_main.pc.in
@@ -0,0 +1,10 @@
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
+
+Name: gtest_main
+Description: GoogleTest (with main() function)
+Version: @PROJECT_VERSION@
+URL: https://github.com/google/googletest
+Requires: gtest
+Libs: -L${libdir} -lgtest_main @CMAKE_THREAD_LIBS_INIT@
+Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ @CMAKE_THREAD_LIBS_INIT@
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/cmake/internal_utils.cmake b/libs/libaom/src/third_party/googletest/src/googletest/cmake/internal_utils.cmake
new file mode 100644
index 000000000..8c1f9ba99
--- /dev/null
+++
b/libs/libaom/src/third_party/googletest/src/googletest/cmake/internal_utils.cmake @@ -0,0 +1,318 @@ +# Defines functions and macros useful for building Google Test and +# Google Mock. +# +# Note: +# +# - This file will be run twice when building Google Mock (once via +# Google Test's CMakeLists.txt, and once via Google Mock's). +# Therefore it shouldn't have any side effects other than defining +# the functions and macros. +# +# - The functions/macros defined in this file may depend on Google +# Test and Google Mock's option() definitions, and thus must be +# called *after* the options have been defined. + +# Tweaks CMake's default compiler/linker settings to suit Google Test's needs. +# +# This must be a macro(), as inside a function string() can only +# update variables in the function scope. +macro(fix_default_compiler_settings_) + if (MSVC) + # For MSVC, CMake sets certain flags to defaults we want to override. + # This replacement code is taken from sample in the CMake Wiki at + # https://gitlab.kitware.com/cmake/community/wikis/FAQ#dynamic-replace. + foreach (flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if (NOT BUILD_SHARED_LIBS AND NOT gtest_force_shared_crt) + # When Google Test is built as a shared library, it should also use + # shared runtime libraries. Otherwise, it may end up with multiple + # copies of runtime library data in different modules, resulting in + # hard-to-find crashes. When it is built as a static library, it is + # preferable to use CRT as static libraries, as we don't have to rely + # on CRT DLLs being available. CMake always defaults to using shared + # CRT libraries, so we override that default here. + string(REPLACE "/MD" "-MT" ${flag_var} "${${flag_var}}") + endif() + + # We prefer more strict warning checking for building Google Test. + # Replaces /W3 with /W4 in defaults. + string(REPLACE "/W3" "/W4" ${flag_var} "${${flag_var}}") + + # Prevent D9025 warning for targets that have exception handling + # turned off (/EHs-c- flag). Where required, exceptions are explicitly + # re-enabled using the cxx_exception_flags variable. + string(REPLACE "/EHsc" "" ${flag_var} "${${flag_var}}") + endforeach() + endif() +endmacro() + +# Defines the compiler/linker flags used to build Google Test and +# Google Mock. You can tweak these definitions to suit your need. A +# variable's value is empty before it's explicitly assigned to. +macro(config_compiler_and_linker) + # Note: pthreads on MinGW is not supported, even if available + # instead, we use windows threading primitives + unset(GTEST_HAS_PTHREAD) + if (NOT gtest_disable_pthreads AND NOT MINGW) + # Defines CMAKE_USE_PTHREADS_INIT and CMAKE_THREAD_LIBS_INIT. + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads) + if (CMAKE_USE_PTHREADS_INIT) + set(GTEST_HAS_PTHREAD ON) + endif() + endif() + + fix_default_compiler_settings_() + if (MSVC) + # Newlines inside flags variables break CMake's NMake generator. + # TODO(vladl@google.com): Add -RTCs and -RTCu to debug builds. + set(cxx_base_flags "-GS -W4 -WX -wd4251 -wd4275 -nologo -J -Zi") + if (MSVC_VERSION LESS 1400) # 1400 is Visual Studio 2005 + # Suppress spurious warnings MSVC 7.1 sometimes issues. + # Forcing value to bool. + set(cxx_base_flags "${cxx_base_flags} -wd4800") + # Copy constructor and assignment operator could not be generated. + set(cxx_base_flags "${cxx_base_flags} -wd4511 -wd4512") + # Compatibility warnings not applicable to Google Test. 
+ # Resolved overload was found by argument-dependent lookup. + set(cxx_base_flags "${cxx_base_flags} -wd4675") + endif() + if (MSVC_VERSION LESS 1500) # 1500 is Visual Studio 2008 + # Conditional expression is constant. + # When compiling with /W4, we get several instances of C4127 + # (Conditional expression is constant). In our code, we disable that + # warning on a case-by-case basis. However, on Visual Studio 2005, + # the warning fires on std::list. Therefore on that compiler and earlier, + # we disable the warning project-wide. + set(cxx_base_flags "${cxx_base_flags} -wd4127") + endif() + if (NOT (MSVC_VERSION LESS 1700)) # 1700 is Visual Studio 2012. + # Suppress "unreachable code" warning on VS 2012 and later. + # http://stackoverflow.com/questions/3232669 explains the issue. + set(cxx_base_flags "${cxx_base_flags} -wd4702") + endif() + + set(cxx_base_flags "${cxx_base_flags} -D_UNICODE -DUNICODE -DWIN32 -D_WIN32") + set(cxx_base_flags "${cxx_base_flags} -DSTRICT -DWIN32_LEAN_AND_MEAN") + set(cxx_exception_flags "-EHsc -D_HAS_EXCEPTIONS=1") + set(cxx_no_exception_flags "-EHs-c- -D_HAS_EXCEPTIONS=0") + set(cxx_no_rtti_flags "-GR-") + elseif (CMAKE_COMPILER_IS_GNUCXX) + set(cxx_base_flags "-Wall -Wshadow -Werror") + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7.0.0) + set(cxx_base_flags "${cxx_base_flags} -Wno-error=dangling-else") + endif() + set(cxx_exception_flags "-fexceptions") + set(cxx_no_exception_flags "-fno-exceptions") + # Until version 4.3.2, GCC doesn't define a macro to indicate + # whether RTTI is enabled. Therefore we define GTEST_HAS_RTTI + # explicitly. + set(cxx_no_rtti_flags "-fno-rtti -DGTEST_HAS_RTTI=0") + set(cxx_strict_flags + "-Wextra -Wno-unused-parameter -Wno-missing-field-initializers") + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "SunPro") + set(cxx_exception_flags "-features=except") + # Sun Pro doesn't provide macros to indicate whether exceptions and + # RTTI are enabled, so we define GTEST_HAS_* explicitly. + set(cxx_no_exception_flags "-features=no%except -DGTEST_HAS_EXCEPTIONS=0") + set(cxx_no_rtti_flags "-features=no%rtti -DGTEST_HAS_RTTI=0") + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "VisualAge" OR + CMAKE_CXX_COMPILER_ID STREQUAL "XL") + # CMake 2.8 changes Visual Age's compiler ID to "XL". + set(cxx_exception_flags "-qeh") + set(cxx_no_exception_flags "-qnoeh") + # Until version 9.0, Visual Age doesn't define a macro to indicate + # whether RTTI is enabled. Therefore we define GTEST_HAS_RTTI + # explicitly. + set(cxx_no_rtti_flags "-qnortti -DGTEST_HAS_RTTI=0") + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "HP") + set(cxx_base_flags "-AA -mt") + set(cxx_exception_flags "-DGTEST_HAS_EXCEPTIONS=1") + set(cxx_no_exception_flags "+noeh -DGTEST_HAS_EXCEPTIONS=0") + # RTTI can not be disabled in HP aCC compiler. + set(cxx_no_rtti_flags "") + endif() + + # The pthreads library is available and allowed? + if (DEFINED GTEST_HAS_PTHREAD) + set(GTEST_HAS_PTHREAD_MACRO "-DGTEST_HAS_PTHREAD=1") + else() + set(GTEST_HAS_PTHREAD_MACRO "-DGTEST_HAS_PTHREAD=0") + endif() + set(cxx_base_flags "${cxx_base_flags} ${GTEST_HAS_PTHREAD_MACRO}") + + # For building gtest's own tests and samples. + set(cxx_exception "${cxx_base_flags} ${cxx_exception_flags}") + set(cxx_no_exception + "${CMAKE_CXX_FLAGS} ${cxx_base_flags} ${cxx_no_exception_flags}") + set(cxx_default "${cxx_exception}") + set(cxx_no_rtti "${cxx_default} ${cxx_no_rtti_flags}") + set(cxx_use_own_tuple "${cxx_default} -DGTEST_USE_OWN_TR1_TUPLE=1") + + # For building the gtest libraries. 
+  set(cxx_strict "${cxx_default} ${cxx_strict_flags}")
+endmacro()
+
+# Defines the gtest & gtest_main libraries. User tests should link
+# with one of them.
+function(cxx_library_with_type name type cxx_flags)
+  # type can be either STATIC or SHARED to denote a static or shared library.
+  # ARGN refers to additional arguments after 'cxx_flags'.
+  add_library(${name} ${type} ${ARGN})
+  set_target_properties(${name}
+    PROPERTIES
+    COMPILE_FLAGS "${cxx_flags}")
+  # Generate debug library name with a postfix.
+  set_target_properties(${name}
+    PROPERTIES
+    DEBUG_POSTFIX "d")
+  if (BUILD_SHARED_LIBS OR type STREQUAL "SHARED")
+    set_target_properties(${name}
+      PROPERTIES
+      COMPILE_DEFINITIONS "GTEST_CREATE_SHARED_LIBRARY=1")
+    if (NOT "${CMAKE_VERSION}" VERSION_LESS "2.8.11")
+      target_compile_definitions(${name} INTERFACE
+        $<INSTALL_INTERFACE:GTEST_LINKED_AS_SHARED_LIBRARY=1>)
+    endif()
+  endif()
+  if (DEFINED GTEST_HAS_PTHREAD)
+    if ("${CMAKE_VERSION}" VERSION_LESS "3.1.0")
+      set(threads_spec ${CMAKE_THREAD_LIBS_INIT})
+    else()
+      set(threads_spec Threads::Threads)
+    endif()
+    target_link_libraries(${name} PUBLIC ${threads_spec})
+  endif()
+endfunction()
+
+########################################################################
+#
+# Helper functions for creating build targets.
+
+function(cxx_shared_library name cxx_flags)
+  cxx_library_with_type(${name} SHARED "${cxx_flags}" ${ARGN})
+endfunction()
+
+function(cxx_library name cxx_flags)
+  cxx_library_with_type(${name} "" "${cxx_flags}" ${ARGN})
+endfunction()
+
+# cxx_executable_with_flags(name cxx_flags libs srcs...)
+#
+# creates a named C++ executable that depends on the given libraries and
+# is built from the given source files with the given compiler flags.
+function(cxx_executable_with_flags name cxx_flags libs)
+  add_executable(${name} ${ARGN})
+  if (MSVC AND (NOT (MSVC_VERSION LESS 1700)))  # 1700 is Visual Studio 2012.
+    # BigObj required for tests.
+    set(cxx_flags "${cxx_flags} -bigobj")
+  endif()
+  if (cxx_flags)
+    set_target_properties(${name}
+      PROPERTIES
+      COMPILE_FLAGS "${cxx_flags}")
+  endif()
+  if (BUILD_SHARED_LIBS)
+    set_target_properties(${name}
+      PROPERTIES
+      COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1")
+  endif()
+  # To support mixing linking in static and dynamic libraries, link each
+  # library in with an extra call to target_link_libraries.
+  foreach (lib "${libs}")
+    target_link_libraries(${name} ${lib})
+  endforeach()
+endfunction()
+
+# cxx_executable(name dir lib srcs...)
+#
+# creates a named target that depends on the given libs and is built
+# from the given source files. dir/name.cc is implicitly included in
+# the source file list.
+function(cxx_executable name dir libs)
+  cxx_executable_with_flags(
+    ${name} "${cxx_default}" "${libs}" "${dir}/${name}.cc" ${ARGN})
+endfunction()
+
+# Sets PYTHONINTERP_FOUND and PYTHON_EXECUTABLE.
+find_package(PythonInterp)
+
+# cxx_test_with_flags(name cxx_flags libs srcs...)
+#
+# creates a named C++ test that depends on the given libs and is built
+# from the given source files with the given compiler flags.
+function(cxx_test_with_flags name cxx_flags libs)
+  cxx_executable_with_flags(${name} "${cxx_flags}" "${libs}" ${ARGN})
+  add_test(NAME ${name} COMMAND ${name})
+endfunction()
+
+# cxx_test(name libs srcs...)
+#
+# creates a named test target that depends on the given libs and is
+# built from the given source files. Unlike cxx_test_with_flags,
+# test/name.cc is already implicitly included in the source file list.
+function(cxx_test name libs)
+  cxx_test_with_flags("${name}" "${cxx_default}" "${libs}"
+    "test/${name}.cc" ${ARGN})
+endfunction()
+
+# py_test(name)
+#
+# creates a Python test with the given name whose main module is in
+# test/name.py. It does nothing if Python is not installed.
+function(py_test name)
+  if (PYTHONINTERP_FOUND)
+    if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+      if (CMAKE_CONFIGURATION_TYPES)
+        # Multi-configuration build generators as for Visual Studio save
+        # output in a subdirectory of CMAKE_CURRENT_BINARY_DIR (Debug,
+        # Release etc.), so we have to provide it here.
+        add_test(
+          NAME ${name}
+          COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
+            --build_dir=${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG> ${ARGN})
+      else (CMAKE_CONFIGURATION_TYPES)
+        # Single-configuration build generators like Makefile generators
+        # don't have subdirs below CMAKE_CURRENT_BINARY_DIR.
+        add_test(
+          NAME ${name}
+          COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
+            --build_dir=${CMAKE_CURRENT_BINARY_DIR} ${ARGN})
+      endif (CMAKE_CONFIGURATION_TYPES)
+    else (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+      # ${CMAKE_CURRENT_BINARY_DIR} is known at configuration time, so we can
+      # directly bind it from cmake. ${CTEST_CONFIGURATION_TYPE} is known
+      # only at ctest runtime (by calling ctest -c <Configuration>), so
+      # we have to escape $ to delay variable substitution here.
+      add_test(
+        ${name}
+        ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test/${name}.py
+          --build_dir=${CMAKE_CURRENT_BINARY_DIR}/\${CTEST_CONFIGURATION_TYPE} ${ARGN})
+    endif (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} GREATER 3.1)
+  endif(PYTHONINTERP_FOUND)
+endfunction()
+
+# install_project(targets...)
+#
+# Installs the specified targets and configures the associated pkgconfig files.
+function(install_project)
+  if(INSTALL_GTEST)
+    install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/"
+      DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+    # Install the project targets.
+    install(TARGETS ${ARGN}
+      EXPORT ${targets_export_name}
+      RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+      ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
+      LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+    # Configure and install pkgconfig files.
+    foreach(t ${ARGN})
+      set(configured_pc "${generated_dir}/${t}.pc")
+      configure_file("${PROJECT_SOURCE_DIR}/cmake/${t}.pc.in"
+        "${configured_pc}" @ONLY)
+      install(FILES "${configured_pc}"
+        DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+    endforeach()
+  endif()
+endfunction()
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
new file mode 100644
index 000000000..39f0ded1b
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-death-test.h
@@ -0,0 +1,342 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc.
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the public API for death tests. It is +// #included by gtest.h so a user doesn't need to include this +// directly. +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ + +#include "gtest/internal/gtest-death-test-internal.h" + +namespace testing { + +// This flag controls the style of death tests. Valid values are "threadsafe", +// meaning that the death test child process will re-execute the test binary +// from the start, running only a single death test, or "fast", +// meaning that the child process will execute the test logic immediately +// after forking. +GTEST_DECLARE_string_(death_test_style); + +#if GTEST_HAS_DEATH_TEST + +namespace internal { + +// Returns a Boolean value indicating whether the caller is currently +// executing in the context of the death test child process. Tools such as +// Valgrind heap checkers may need this to modify their behavior in death +// tests. IMPORTANT: This is an internal utility. Using it may break the +// implementation of death tests. User code MUST NOT use it. +GTEST_API_ bool InDeathTestChild(); + +} // namespace internal + +// The following macros are useful for writing death tests. + +// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is +// executed: +// +// 1. It generates a warning if there is more than one active +// thread. This is because it's safe to fork() or clone() only +// when there is a single thread. +// +// 2. The parent process clone()s a sub-process and runs the death +// test in it; the sub-process exits with code 0 at the end of the +// death test, if it hasn't exited already. +// +// 3. The parent process waits for the sub-process to terminate. +// +// 4. The parent process checks the exit code and error message of +// the sub-process. 
+//
+// Examples:
+//
+//   ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
+//   for (int i = 0; i < 5; i++) {
+//     EXPECT_DEATH(server.ProcessRequest(i),
+//                  "Invalid request .* in ProcessRequest()")
+//         << "Failed to die on request " << i;
+//   }
+//
+//   ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
+//
+//   bool KilledBySIGHUP(int exit_code) {
+//     return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
+//   }
+//
+//   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
+//
+// On the regular expressions used in death tests:
+//
+//   GOOGLETEST_CM0005 DO NOT DELETE
+//   On POSIX-compliant systems (*nix), we use the <regex.h> library,
+//   which uses the POSIX extended regex syntax.
+//
+//   On other platforms (e.g. Windows or Mac), we only support a simple regex
+//   syntax implemented as part of Google Test. This limited
+//   implementation should be enough most of the time when writing
+//   death tests; though it lacks many features you can find in PCRE
+//   or POSIX extended regex syntax. For example, we don't support
+//   union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and
+//   repetition count ("x{5,7}"), among others.
+//
+//   Below is the syntax that we do support. We chose it to be a
+//   subset of both PCRE and POSIX extended regex, so it's easy to
+//   learn wherever you come from. In the following: 'A' denotes a
+//   literal character, period (.), or a single \\ escape sequence;
+//   'x' and 'y' denote regular expressions; 'm' and 'n' are for
+//   natural numbers.
+//
+//     c     matches any literal character c
+//     \\d   matches any decimal digit
+//     \\D   matches any character that's not a decimal digit
+//     \\f   matches \f
+//     \\n   matches \n
+//     \\r   matches \r
+//     \\s   matches any ASCII whitespace, including \n
+//     \\S   matches any character that's not a whitespace
+//     \\t   matches \t
+//     \\v   matches \v
+//     \\w   matches any letter, _, or decimal digit
+//     \\W   matches any character that \\w doesn't match
+//     \\c   matches any literal character c, which must be a punctuation
+//     .     matches any single character except \n
+//     A?    matches 0 or 1 occurrences of A
+//     A*    matches 0 or many occurrences of A
+//     A+    matches 1 or many occurrences of A
+//     ^     matches the beginning of a string (not that of each line)
+//     $     matches the end of a string (not that of each line)
+//     xy    matches x followed by y
+//
+//   If you accidentally use PCRE or POSIX extended regex features
+//   not implemented by us, you will get a run-time failure. In that
+//   case, please try to rewrite your regular expression within the
+//   above syntax.
+//
+//   This implementation is *not* meant to be as highly tuned or robust
+//   as a compiled regex library, but should perform well enough for a
+//   death test, which already incurs significant overhead by launching
+//   a child process.
+//
+// Known caveats:
+//
+//   A "threadsafe" style death test obtains the path to the test
+//   program from argv[0] and re-executes it in the sub-process. For
+//   simplicity, the current implementation doesn't search the PATH
+//   when launching the sub-process. This means that the user must
+//   invoke the test program via a path that contains at least one
+//   path separator (e.g. path/to/foo_test and
+//   /absolute/path/to/bar_test are fine, but foo_test is not). This
+//   is rarely a problem as people usually don't put the test binary
+//   directory in PATH.
+// + +// Asserts that a given statement causes the program to exit, with an +// integer exit status that satisfies predicate, and emitting error output +// that matches regex. +#define ASSERT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) + +// Like ASSERT_EXIT, but continues on to successive tests in the +// test suite, if any: +#define EXPECT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) + +// Asserts that a given statement causes the program to exit, either by +// explicitly exiting with a nonzero exit code or being killed by a +// signal, and emitting error output that matches regex. +#define ASSERT_DEATH(statement, regex) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Like ASSERT_DEATH, but continues on to successive tests in the +// test suite, if any: +#define EXPECT_DEATH(statement, regex) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: + +// Tests that an exit code describes a normal exit with a given exit code. +class GTEST_API_ ExitedWithCode { + public: + explicit ExitedWithCode(int exit_code); + bool operator()(int exit_status) const; + + private: + // No implementation - assignment is unsupported. + void operator=(const ExitedWithCode &other); + + const int exit_code_; +}; + +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// Tests that an exit code describes an exit due to termination by a +// given signal. +// GOOGLETEST_CM0006 DO NOT DELETE +class GTEST_API_ KilledBySignal { + public: + explicit KilledBySignal(int signum); + bool operator()(int exit_status) const; + + private: + const int signum_; +}; +#endif // !GTEST_OS_WINDOWS + +// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. +// The death testing framework causes this to have interesting semantics, +// since the sideeffects of the call are only visible in opt mode, and not +// in debug mode. +// +// In practice, this can be used to test functions that utilize the +// LOG(DFATAL) macro using the following style: +// +// int DieInDebugOr12(int* sideeffect) { +// if (sideeffect) { +// *sideeffect = 12; +// } +// LOG(DFATAL) << "death"; +// return 12; +// } +// +// TEST(TestSuite, TestDieOr12WorksInDgbAndOpt) { +// int sideeffect = 0; +// // Only asserts in dbg. +// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); +// +// #ifdef NDEBUG +// // opt-mode has sideeffect visible. +// EXPECT_EQ(12, sideeffect); +// #else +// // dbg-mode no visible sideeffect. +// EXPECT_EQ(0, sideeffect); +// #endif +// } +// +// This will assert that DieInDebugReturn12InOpt() crashes in debug +// mode, usually due to a DCHECK or LOG(DFATAL), but returns the +// appropriate fallback value (12 in this case) in opt mode. If you +// need to test that a function has appropriate side-effects in opt +// mode, include assertions against the side-effects. A general +// pattern for this is: +// +// EXPECT_DEBUG_DEATH({ +// // Side-effects here will have an effect after this statement in +// // opt mode, but none in debug mode. 
+// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); +// }, "death"); +// +#ifdef NDEBUG + +#define EXPECT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +#define ASSERT_DEBUG_DEATH(statement, regex) \ + GTEST_EXECUTE_STATEMENT_(statement, regex) + +#else + +#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex) + +#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex) + +#endif // NDEBUG for EXPECT_DEBUG_DEATH +#endif // GTEST_HAS_DEATH_TEST + +// This macro is used for implementing macros such as +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where +// death tests are not supported. Those macros must compile on such systems +// if and only if EXPECT_DEATH and ASSERT_DEATH compile with the same parameters +// on systems that support death tests. This allows one to write such a macro on +// a system that does not support death tests and be sure that it will compile +// on a death-test supporting system. It is exposed publicly so that systems +// that have death-tests with stricter requirements than GTEST_HAS_DEATH_TEST +// can write their own equivalent of EXPECT_DEATH_IF_SUPPORTED and +// ASSERT_DEATH_IF_SUPPORTED. +// +// Parameters: +// statement - A statement that a macro such as EXPECT_DEATH would test +// for program termination. This macro has to make sure this +// statement is compiled but not executed, to ensure that +// EXPECT_DEATH_IF_SUPPORTED compiles with a certain +// parameter if and only if EXPECT_DEATH compiles with it. +// regex - A regex that a macro such as EXPECT_DEATH would use to test +// the output of statement. This parameter has to be +// compiled but not evaluated by this macro, to ensure that +// this macro only accepts expressions that a macro such as +// EXPECT_DEATH would accept. +// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED +// and a return statement for ASSERT_DEATH_IF_SUPPORTED. +// This ensures that ASSERT_DEATH_IF_SUPPORTED will not +// compile inside functions where ASSERT_DEATH doesn't +// compile. +// +// The branch that has an always false condition is used to ensure that +// statement and regex are compiled (and thus syntactically correct) but +// never executed. The unreachable code macro protects the terminator +// statement from generating an 'unreachable code' warning in case +// statement unconditionally returns or throws. The Message constructor at +// the end allows the syntax of streaming additional messages into the +// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. +#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() + +// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and +// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if +// death tests are supported; otherwise they just issue a warning. This is +// useful when you are combining death test assertions with normal test +// assertions in one test. 
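+//
+// For example (a hypothetical test, not part of this header; the test
+// name and tested function are made up for illustration):
+//
+//   TEST(FooDeathTest, DiesOnNullInput) {
+//     // Runs as a real death test where supported; otherwise only
+//     // issues a warning and skips the check.
+//     EXPECT_DEATH_IF_SUPPORTED(ProcessInput(nullptr), "null input");
+//   }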
+#if GTEST_HAS_DEATH_TEST
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+  EXPECT_DEATH(statement, regex)
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+  ASSERT_DEATH(statement, regex)
+#else
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+  GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+  GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return )
+#endif
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
new file mode 100644
index 000000000..20be24f43
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-matchers.h
@@ -0,0 +1,769 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements just enough of the matcher interface to allow
+// EXPECT_DEATH and friends to accept a matcher argument.
+
+// IWYU pragma: private, include "testing/base/public/gunit.h"
+// IWYU pragma: friend third_party/googletest/googlemock/.*
+// IWYU pragma: friend third_party/googletest/googletest/.*
+
+#ifndef GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#define GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "gtest/gtest-printers.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
+// MSVC warning C5046 is new as of VS2017 version 15.8.
+#if defined(_MSC_VER) && _MSC_VER >= 1915
+#define GTEST_MAYBE_5046_ 5046
+#else
+#define GTEST_MAYBE_5046_
+#endif
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+    4251 GTEST_MAYBE_5046_ /* class A needs to have dll-interface to be used by
+                              clients of class B */
+    /* Symbol involving type with internal linkage not defined */)
+
+namespace testing {
+
+// To implement a matcher Foo for type T, define:
+//   1. a class FooMatcherImpl<T> that implements the
+//      MatcherInterface<T> interface, and
+//   2. a factory function that creates a Matcher<T> object from a
+//      FooMatcherImpl<T>*.
+//
+// The two-level delegation design makes it possible to allow a user
+// to write "v" instead of "Eq(v)" where a Matcher is expected, which
+// is impossible if we pass matchers by pointers. It also eases
+// ownership management as Matcher objects can now be copied like
+// plain values.
+
+// MatchResultListener is an abstract class. Its << operator can be
+// used by a matcher to explain why a value matches or doesn't match.
+//
+class MatchResultListener {
+ public:
+  // Creates a listener object with the given underlying ostream. The
+  // listener does not own the ostream, and does not dereference it
+  // in the constructor or destructor.
+  explicit MatchResultListener(::std::ostream *os) : stream_(os) {}
+  virtual ~MatchResultListener() = 0;  // Makes this class abstract.
+
+  // Streams x to the underlying ostream; does nothing if the ostream
+  // is NULL.
+  template <typename T>
+  MatchResultListener &operator<<(const T &x) {
+    if (stream_ != nullptr) *stream_ << x;
+    return *this;
+  }
+
+  // Returns the underlying ostream.
+  ::std::ostream *stream() { return stream_; }
+
+  // Returns true if and only if the listener is interested in an explanation
+  // of the match result. A matcher's MatchAndExplain() method can use
+  // this information to avoid generating the explanation when no one
+  // intends to hear it.
+  bool IsInterested() const { return stream_ != nullptr; }
+
+ private:
+  ::std::ostream *const stream_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener);
+};
+
+inline MatchResultListener::~MatchResultListener() {}
+
+// An instance of a subclass of this knows how to describe itself as a
+// matcher.
+class MatcherDescriberInterface {
+ public:
+  virtual ~MatcherDescriberInterface() {}
+
+  // Describes this matcher to an ostream. The function should print
+  // a verb phrase that describes the property a value matching this
+  // matcher should have. The subject of the verb phrase is the value
+  // being matched. For example, the DescribeTo() method of the Gt(7)
+  // matcher prints "is greater than 7".
+  virtual void DescribeTo(::std::ostream *os) const = 0;
+
+  // Describes the negation of this matcher to an ostream. For
+  // example, if the description of this matcher is "is greater than
+  // 7", the negated description could be "is not greater than 7".
+  // You are not required to override this when implementing
+  // MatcherInterface, but it is highly advised so that your matcher
+  // can produce good error messages.
+  virtual void DescribeNegationTo(::std::ostream *os) const {
+    *os << "not (";
+    DescribeTo(os);
+    *os << ")";
+  }
+};
+
+// The implementation of a matcher.
+template <typename T>
+class MatcherInterface : public MatcherDescriberInterface {
+ public:
+  // Returns true if and only if the matcher matches x; also explains the
+  // match result to 'listener' if necessary (see the next paragraph), in
+  // the form of a non-restrictive relative clause ("which ...",
+  // "whose ...", etc) that describes x. For example, the
+  // MatchAndExplain() method of the Pointee(...) matcher should
+  // generate an explanation like "which points to ...".
+  //
+  // Implementations of MatchAndExplain() should add an explanation of
+  // the match result *if and only if* they can provide additional
+  // information that's not already present (or not obvious) in the
+  // print-out of x and the matcher's description. Whether the match
+  // succeeds is not a factor in deciding whether an explanation is
+  // needed, as sometimes the caller needs to print a failure message
+  // when the match succeeds (e.g. when the matcher is used inside
+  // Not()).
+  //
+  // For example, a "has at least 10 elements" matcher should explain
+  // what the actual element count is, regardless of the match result,
+  // as it is useful information to the reader; on the other hand, an
+  // "is empty" matcher probably only needs to explain what the actual
+  // size is when the match fails, as it's redundant to say that the
+  // size is 0 when the value is already known to be empty.
+  //
+  // You should override this method when defining a new matcher.
+  //
+  // It's the responsibility of the caller (Google Test) to guarantee
+  // that 'listener' is not NULL. This helps to simplify a matcher's
+  // implementation when it doesn't care about the performance, as it
+  // can talk to 'listener' without checking its validity first.
+  // However, in order to implement dummy listeners efficiently,
+  // listener->stream() may be NULL.
+  virtual bool MatchAndExplain(T x, MatchResultListener *listener) const = 0;
+
+  // Inherits these methods from MatcherDescriberInterface:
+  //   virtual void DescribeTo(::std::ostream* os) const = 0;
+  //   virtual void DescribeNegationTo(::std::ostream* os) const;
+};
+
+namespace internal {
+
+// Converts a MatcherInterface<T> to a MatcherInterface<const T &>.
+template <typename T>
+class MatcherInterfaceAdapter : public MatcherInterface<const T &> {
+ public:
+  explicit MatcherInterfaceAdapter(const MatcherInterface<T> *impl)
+      : impl_(impl) {}
+  ~MatcherInterfaceAdapter() override { delete impl_; }
+
+  void DescribeTo(::std::ostream *os) const override { impl_->DescribeTo(os); }
+
+  void DescribeNegationTo(::std::ostream *os) const override {
+    impl_->DescribeNegationTo(os);
+  }
+
+  bool MatchAndExplain(const T &x,
+                       MatchResultListener *listener) const override {
+    return impl_->MatchAndExplain(x, listener);
+  }
+
+ private:
+  const MatcherInterface<T> *const impl_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(MatcherInterfaceAdapter);
+};
+
+struct AnyEq {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a == b;
+  }
+};
+struct AnyNe {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a != b;
+  }
+};
+struct AnyLt {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a < b;
+  }
+};
+struct AnyGt {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a > b;
+  }
+};
+struct AnyLe {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a <= b;
+  }
+};
+struct AnyGe {
+  template <typename A, typename B>
+  bool operator()(const A &a, const B &b) const {
+    return a >= b;
+  }
+};
+
+// A match result listener that ignores the explanation.
+class DummyMatchResultListener : public MatchResultListener {
+ public:
+  DummyMatchResultListener() : MatchResultListener(nullptr) {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener);
+};
+
+// A match result listener that forwards the explanation to a given
+// ostream. The difference between this and MatchResultListener is
+// that the former is concrete.
+class StreamMatchResultListener : public MatchResultListener {
+ public:
+  explicit StreamMatchResultListener(::std::ostream *os)
+      : MatchResultListener(os) {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener);
+};
+
+// An internal class for implementing Matcher<T>, which will derive
+// from it.
+// We put functionalities common to all Matcher<T>
+// specializations here to avoid code duplication.
+template <typename T>
+class MatcherBase {
+ public:
+  // Returns true if and only if the matcher matches x; also explains the
+  // match result to 'listener'.
+  bool MatchAndExplain(const T &x, MatchResultListener *listener) const {
+    return impl_->MatchAndExplain(x, listener);
+  }
+
+  // Returns true if and only if this matcher matches x.
+  bool Matches(const T &x) const {
+    DummyMatchResultListener dummy;
+    return MatchAndExplain(x, &dummy);
+  }
+
+  // Describes this matcher to an ostream.
+  void DescribeTo(::std::ostream *os) const { impl_->DescribeTo(os); }
+
+  // Describes the negation of this matcher to an ostream.
+  void DescribeNegationTo(::std::ostream *os) const {
+    impl_->DescribeNegationTo(os);
+  }
+
+  // Explains why x matches, or doesn't match, the matcher.
+  void ExplainMatchResultTo(const T &x, ::std::ostream *os) const {
+    StreamMatchResultListener listener(os);
+    MatchAndExplain(x, &listener);
+  }
+
+  // Returns the describer for this matcher object; retains ownership
+  // of the describer, which is only guaranteed to be alive when
+  // this matcher object is alive.
+  const MatcherDescriberInterface *GetDescriber() const { return impl_.get(); }
+
+ protected:
+  MatcherBase() {}
+
+  // Constructs a matcher from its implementation.
+  explicit MatcherBase(const MatcherInterface<const T &> *impl)
+      : impl_(impl) {}
+
+  template <typename U>
+  explicit MatcherBase(
+      const MatcherInterface<U> *impl,
+      typename std::enable_if<!std::is_same<U, const U &>::value>::type * =
+          nullptr)
+      : impl_(new internal::MatcherInterfaceAdapter<U>(impl)) {}
+
+  MatcherBase(const MatcherBase &) = default;
+  MatcherBase &operator=(const MatcherBase &) = default;
+  MatcherBase(MatcherBase &&) = default;
+  MatcherBase &operator=(MatcherBase &&) = default;
+
+  virtual ~MatcherBase() {}
+
+ private:
+  std::shared_ptr<const MatcherInterface<const T &>> impl_;
+};
+
+}  // namespace internal
+
+// A Matcher<T> is a copyable and IMMUTABLE (except by assignment)
+// object that can check whether a value of type T matches. The
+// implementation of Matcher<T> is just a std::shared_ptr to const
+// MatcherInterface<T>. Don't inherit from Matcher!
+template <typename T>
+class Matcher : public internal::MatcherBase<T> {
+ public:
+  // Constructs a null matcher. Needed for storing Matcher objects in STL
+  // containers. A default-constructed matcher is not yet initialized. You
+  // cannot use it until a valid value has been assigned to it.
+  explicit Matcher() {}  // NOLINT
+
+  // Constructs a matcher from its implementation.
+  explicit Matcher(const MatcherInterface<const T &> *impl)
+      : internal::MatcherBase<T>(impl) {}
+
+  template <typename U>
+  explicit Matcher(
+      const MatcherInterface<U> *impl,
+      typename std::enable_if<!std::is_same<U, const U &>::value>::type * =
+          nullptr)
+      : internal::MatcherBase<T>(impl) {}
+
+  // Implicit constructor here allows people to write
+  // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes
+  Matcher(T value);  // NOLINT
+};
+
+// The following two specializations allow the user to write str
+// instead of Eq(str) and "foo" instead of Eq("foo") when a std::string
+// matcher is expected.
+template <>
+class GTEST_API_ Matcher<const std::string &>
+    : public internal::MatcherBase<const std::string &> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const std::string &> *impl)
+      : internal::MatcherBase<const std::string &>(impl) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a std::string object.
+  Matcher(const std::string &s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char *s);  // NOLINT
+};
+
+template <>
+class GTEST_API_ Matcher<std::string>
+    : public internal::MatcherBase<std::string> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const std::string &> *impl)
+      : internal::MatcherBase<std::string>(impl) {}
+  explicit Matcher(const MatcherInterface<std::string> *impl)
+      : internal::MatcherBase<std::string>(impl) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a string object.
+  Matcher(const std::string &s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char *s);  // NOLINT
+};
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// The following two specializations allow the user to write str
+// instead of Eq(str) and "foo" instead of Eq("foo") when an absl::string_view
+// matcher is expected.
+template <>
+class GTEST_API_ Matcher<const internal::StringView &>
+    : public internal::MatcherBase<const internal::StringView &> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const internal::StringView &> *impl)
+      : internal::MatcherBase<const internal::StringView &>(impl) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a std::string object.
+  Matcher(const std::string &s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char *s);  // NOLINT
+
+  // Allows the user to pass absl::string_views or std::string_views directly.
+  Matcher(internal::StringView s);  // NOLINT
+};
+
+template <>
+class GTEST_API_ Matcher<internal::StringView>
+    : public internal::MatcherBase<internal::StringView> {
+ public:
+  Matcher() {}
+
+  explicit Matcher(const MatcherInterface<const internal::StringView &> *impl)
+      : internal::MatcherBase<internal::StringView>(impl) {}
+  explicit Matcher(const MatcherInterface<internal::StringView> *impl)
+      : internal::MatcherBase<internal::StringView>(impl) {}
+
+  // Allows the user to write str instead of Eq(str) sometimes, where
+  // str is a std::string object.
+  Matcher(const std::string &s);  // NOLINT
+
+  // Allows the user to write "foo" instead of Eq("foo") sometimes.
+  Matcher(const char *s);  // NOLINT
+
+  // Allows the user to pass absl::string_views or std::string_views directly.
+  Matcher(internal::StringView s);  // NOLINT
+};
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+// Prints a matcher in a human-readable format.
+template <typename T>
+std::ostream &operator<<(std::ostream &os, const Matcher<T> &matcher) {
+  matcher.DescribeTo(&os);
+  return os;
+}
+
+// The PolymorphicMatcher class template makes it easy to implement a
+// polymorphic matcher (i.e. a matcher that can match values of more
+// than one type, e.g. Eq(n) and NotNull()).
+//
+// To define a polymorphic matcher, a user should provide an Impl
+// class that has a DescribeTo() method and a DescribeNegationTo()
+// method, and define a member function (or member function template)
+//
+//   bool MatchAndExplain(const Value& value,
+//                        MatchResultListener* listener) const;
+//
+// See the definition of NotNull() for a complete example.
+template <class Impl>
+class PolymorphicMatcher {
+ public:
+  explicit PolymorphicMatcher(const Impl &an_impl) : impl_(an_impl) {}
+
+  // Returns a mutable reference to the underlying matcher
+  // implementation object.
+  Impl &mutable_impl() { return impl_; }
+
+  // Returns an immutable reference to the underlying matcher
+  // implementation object.
+ const Impl &impl() const { return impl_; } + + template + operator Matcher() const { + return Matcher(new MonomorphicImpl(impl_)); + } + + private: + template + class MonomorphicImpl : public MatcherInterface { + public: + explicit MonomorphicImpl(const Impl &impl) : impl_(impl) {} + + void DescribeTo(::std::ostream *os) const override { impl_.DescribeTo(os); } + + void DescribeNegationTo(::std::ostream *os) const override { + impl_.DescribeNegationTo(os); + } + + bool MatchAndExplain(T x, MatchResultListener *listener) const override { + return impl_.MatchAndExplain(x, listener); + } + + private: + const Impl impl_; + }; + + Impl impl_; +}; + +// Creates a matcher from its implementation. +// DEPRECATED: Especially in the generic code, prefer: +// Matcher(new MyMatcherImpl(...)); +// +// MakeMatcher may create a Matcher that accepts its argument by value, which +// leads to unnecessary copies & lack of support for non-copyable types. +template +inline Matcher MakeMatcher(const MatcherInterface *impl) { + return Matcher(impl); +} + +// Creates a polymorphic matcher from its implementation. This is +// easier to use than the PolymorphicMatcher constructor as it +// doesn't require you to explicitly write the template argument, e.g. +// +// MakePolymorphicMatcher(foo); +// vs +// PolymorphicMatcher(foo); +template +inline PolymorphicMatcher MakePolymorphicMatcher(const Impl &impl) { + return PolymorphicMatcher(impl); +} + +namespace internal { +// Implements a matcher that compares a given value with a +// pre-supplied value using one of the ==, <=, <, etc, operators. The +// two values being compared don't have to have the same type. +// +// The matcher defined here is polymorphic (for example, Eq(5) can be +// used to match an int, a short, a double, etc). Therefore we use +// a template type conversion operator in the implementation. +// +// The following template definition assumes that the Rhs parameter is +// a "bare" type (i.e. neither 'const T' nor 'T&'). 
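+//
+// For example, Eq(5) stores the int 5 and can convert, on demand, to
+// Matcher<int>, Matcher<long>, or Matcher<double>:
+//
+//   Matcher<double> m = Eq(5);  // matches any double equal to 5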
+template <typename D, typename Rhs, typename Op>
+class ComparisonBase {
+ public:
+  explicit ComparisonBase(const Rhs &rhs) : rhs_(rhs) {}
+  template <typename Lhs>
+  operator Matcher<Lhs>() const {
+    return Matcher<Lhs>(new Impl<Lhs>(rhs_));
+  }
+
+ private:
+  template <typename T>
+  static const T &Unwrap(const T &v) { return v; }
+  template <typename T>
+  static const T &Unwrap(std::reference_wrapper<T> v) { return v; }
+
+  template <typename Lhs>
+  class Impl : public MatcherInterface<Lhs> {
+   public:
+    explicit Impl(const Rhs &rhs) : rhs_(rhs) {}
+    bool MatchAndExplain(Lhs lhs,
+                         MatchResultListener * /* listener */) const override {
+      return Op()(lhs, Unwrap(rhs_));
+    }
+    void DescribeTo(::std::ostream *os) const override {
+      *os << D::Desc() << " ";
+      UniversalPrint(Unwrap(rhs_), os);
+    }
+    void DescribeNegationTo(::std::ostream *os) const override {
+      *os << D::NegatedDesc() << " ";
+      UniversalPrint(Unwrap(rhs_), os);
+    }
+
+   private:
+    Rhs rhs_;
+  };
+  Rhs rhs_;
+};
+
+template <typename Rhs>
+class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> {
+ public:
+  explicit EqMatcher(const Rhs &rhs)
+      : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) {}
+  static const char *Desc() { return "is equal to"; }
+  static const char *NegatedDesc() { return "isn't equal to"; }
+};
+template <typename Rhs>
+class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> {
+ public:
+  explicit NeMatcher(const Rhs &rhs)
+      : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) {}
+  static const char *Desc() { return "isn't equal to"; }
+  static const char *NegatedDesc() { return "is equal to"; }
+};
+template <typename Rhs>
+class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> {
+ public:
+  explicit LtMatcher(const Rhs &rhs)
+      : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) {}
+  static const char *Desc() { return "is <"; }
+  static const char *NegatedDesc() { return "isn't <"; }
+};
+template <typename Rhs>
+class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> {
+ public:
+  explicit GtMatcher(const Rhs &rhs)
+      : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) {}
+  static const char *Desc() { return "is >"; }
+  static const char *NegatedDesc() { return "isn't >"; }
+};
+template <typename Rhs>
+class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> {
+ public:
+  explicit LeMatcher(const Rhs &rhs)
+      : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) {}
+  static const char *Desc() { return "is <="; }
+  static const char *NegatedDesc() { return "isn't <="; }
+};
+template <typename Rhs>
+class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> {
+ public:
+  explicit GeMatcher(const Rhs &rhs)
+      : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) {}
+  static const char *Desc() { return "is >="; }
+  static const char *NegatedDesc() { return "isn't >="; }
+};
+
+// Implements polymorphic matchers MatchesRegex(regex) and
+// ContainsRegex(regex), which can be used as a Matcher<T> as long as
+// T can be converted to a string.
+class MatchesRegexMatcher {
+ public:
+  MatchesRegexMatcher(const RE *regex, bool full_match)
+      : regex_(regex), full_match_(full_match) {}
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+  bool MatchAndExplain(const internal::StringView &s,
+                       MatchResultListener *listener) const {
+    return MatchAndExplain(std::string(s), listener);
+  }
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+  // Accepts pointer types, particularly:
+  //   const char*
+  //   char*
+  //   const wchar_t*
+  //   wchar_t*
+  template <typename CharType>
+  bool MatchAndExplain(CharType *s, MatchResultListener *listener) const {
+    return s != nullptr && MatchAndExplain(std::string(s), listener);
+  }
+
+  // Matches anything that can convert to std::string.
+  //
+  // This is a template, not just a plain function with const std::string&,
+  // because absl::string_view has some interfering non-explicit constructors.
+ template + bool MatchAndExplain(const MatcheeStringType &s, + MatchResultListener * /* listener */) const { + const std::string &s2(s); + return full_match_ ? RE::FullMatch(s2, *regex_) + : RE::PartialMatch(s2, *regex_); + } + + void DescribeTo(::std::ostream *os) const { + *os << (full_match_ ? "matches" : "contains") << " regular expression "; + UniversalPrinter::Print(regex_->pattern(), os); + } + + void DescribeNegationTo(::std::ostream *os) const { + *os << "doesn't " << (full_match_ ? "match" : "contain") + << " regular expression "; + UniversalPrinter::Print(regex_->pattern(), os); + } + + private: + const std::shared_ptr regex_; + const bool full_match_; +}; +} // namespace internal + +// Matches a string that fully matches regular expression 'regex'. +// The matcher takes ownership of 'regex'. +inline PolymorphicMatcher MatchesRegex( + const internal::RE *regex) { + return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true)); +} +inline PolymorphicMatcher MatchesRegex( + const std::string ®ex) { + return MatchesRegex(new internal::RE(regex)); +} + +// Matches a string that contains regular expression 'regex'. +// The matcher takes ownership of 'regex'. +inline PolymorphicMatcher ContainsRegex( + const internal::RE *regex) { + return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false)); +} +inline PolymorphicMatcher ContainsRegex( + const std::string ®ex) { + return ContainsRegex(new internal::RE(regex)); +} + +// Creates a polymorphic matcher that matches anything equal to x. +// Note: if the parameter of Eq() were declared as const T&, Eq("foo") +// wouldn't compile. +template +inline internal::EqMatcher Eq(T x) { + return internal::EqMatcher(x); +} + +// Constructs a Matcher from a 'value' of type T. The constructed +// matcher matches any value that's equal to 'value'. +template +Matcher::Matcher(T value) { + *this = Eq(value); +} + +// Creates a monomorphic matcher that matches anything with type Lhs +// and equal to rhs. A user may need to use this instead of Eq(...) +// in order to resolve an overloading ambiguity. +// +// TypedEq(x) is just a convenient short-hand for Matcher(Eq(x)) +// or Matcher(x), but more readable than the latter. +// +// We could define similar monomorphic matchers for other comparison +// operations (e.g. TypedLt, TypedGe, and etc), but decided not to do +// it yet as those are used much less than Eq() in practice. A user +// can always write Matcher(Lt(5)) to be explicit about the type, +// for example. +template +inline Matcher TypedEq(const Rhs &rhs) { + return Eq(rhs); +} + +// Creates a polymorphic matcher that matches anything >= x. +template +inline internal::GeMatcher Ge(Rhs x) { + return internal::GeMatcher(x); +} + +// Creates a polymorphic matcher that matches anything > x. +template +inline internal::GtMatcher Gt(Rhs x) { + return internal::GtMatcher(x); +} + +// Creates a polymorphic matcher that matches anything <= x. +template +inline internal::LeMatcher Le(Rhs x) { + return internal::LeMatcher(x); +} + +// Creates a polymorphic matcher that matches anything < x. +template +inline internal::LtMatcher Lt(Rhs x) { + return internal::LtMatcher(x); +} + +// Creates a polymorphic matcher that matches anything != x. 
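+// For example, Ne(5) can serve as a Matcher<int>, Matcher<long>, or
+// Matcher<double> that matches any value other than 5.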
+template +inline internal::NeMatcher Ne(Rhs x) { + return internal::NeMatcher(x); +} +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 5046 + +#endif // GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-message.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-message.h new file mode 100644 index 000000000..713facae8 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-message.h @@ -0,0 +1,217 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the Message class. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ + +#include +#include +#include + +#include "gtest/internal/gtest-port.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// Ensures that there is at least one operator<< in the global namespace. +// See Message& operator<<(...) below for why. +void operator<<(const testing::internal::Secret &, int); + +namespace testing { + +// The Message class works like an ostream repeater. +// +// Typical usage: +// +// 1. You stream a bunch of values to a Message object. +// It will remember the text in a stringstream. +// 2. Then you stream the Message object to an ostream. +// This causes the text in the Message to be streamed +// to the ostream. 
+// +// For example; +// +// testing::Message foo; +// foo << 1 << " != " << 2; +// std::cout << foo; +// +// will print "1 != 2". +// +// Message is not intended to be inherited from. In particular, its +// destructor is not virtual. +// +// Note that stringstream behaves differently in gcc and in MSVC. You +// can stream a NULL char pointer to it in the former, but not in the +// latter (it causes an access violation if you do). The Message +// class hides this difference by treating a NULL char pointer as +// "(null)". +class GTEST_API_ Message { + private: + // The type of basic IO manipulators (endl, ends, and flush) for + // narrow streams. + typedef std::ostream &(*BasicNarrowIoManip)(std::ostream &); + + public: + // Constructs an empty Message. + Message(); + + // Copy constructor. + Message(const Message &msg) : ss_(new ::std::stringstream) { // NOLINT + *ss_ << msg.GetString(); + } + + // Constructs a Message from a C-string. + explicit Message(const char *str) : ss_(new ::std::stringstream) { + *ss_ << str; + } + + // Streams a non-pointer value to this object. + template + inline Message &operator<<(const T &val) { + // Some libraries overload << for STL containers. These + // overloads are defined in the global namespace instead of ::std. + // + // C++'s symbol lookup rule (i.e. Koenig lookup) says that these + // overloads are visible in either the std namespace or the global + // namespace, but not other namespaces, including the testing + // namespace which Google Test's Message class is in. + // + // To allow STL containers (and other types that has a << operator + // defined in the global namespace) to be used in Google Test + // assertions, testing::Message must access the custom << operator + // from the global namespace. With this using declaration, + // overloads of << defined in the global namespace and those + // visible via Koenig lookup are both exposed in this function. + using ::operator<<; + *ss_ << val; + return *this; + } + + // Streams a pointer value to this object. + // + // This function is an overload of the previous one. When you + // stream a pointer to a Message, this definition will be used as it + // is more specialized. (The C++ Standard, section + // [temp.func.order].) If you stream a non-pointer, then the + // previous definition will be used. + // + // The reason for this overload is that streaming a NULL pointer to + // ostream is undefined behavior. Depending on the compiler, you + // may get "0", "(nil)", "(null)", or an access violation. To + // ensure consistent result across compilers, we always treat NULL + // as "(null)". + template + inline Message &operator<<(T *const &pointer) { // NOLINT + if (pointer == nullptr) { + *ss_ << "(null)"; + } else { + *ss_ << pointer; + } + return *this; + } + + // Since the basic IO manipulators are overloaded for both narrow + // and wide streams, we have to provide this specialized definition + // of operator <<, even though its body is the same as the + // templatized version above. Without this definition, streaming + // endl or other basic IO manipulators to Message will confuse the + // compiler. + Message &operator<<(BasicNarrowIoManip val) { + *ss_ << val; + return *this; + } + + // Instead of 1/0, we want to see true/false for bool values. + Message &operator<<(bool b) { return *this << (b ? "true" : "false"); } + + // These two overloads allow streaming a wide C string to a Message + // using the UTF-8 encoding. 
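+  // For example, (Message() << L"café").GetString() yields the UTF-8
+  // encoding of "café".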
+ Message &operator<<(const wchar_t *wide_c_str); + Message &operator<<(wchar_t *wide_c_str); + +#if GTEST_HAS_STD_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message &operator<<(const ::std::wstring &wstr); +#endif // GTEST_HAS_STD_WSTRING + + // Gets the text streamed to this object so far as an std::string. + // Each '\0' character in the buffer is replaced with "\\0". + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + std::string GetString() const; + + private: + // We'll hold the text streamed to this object here. + const std::unique_ptr< ::std::stringstream> ss_; + + // We declare (but don't implement) this to prevent the compiler + // from implementing the assignment operator. + void operator=(const Message &); +}; + +// Streams a Message to an ostream. +inline std::ostream &operator<<(std::ostream &os, const Message &sb) { + return os << sb.GetString(); +} + +namespace internal { + +// Converts a streamable value to an std::string. A NULL pointer is +// converted to "(null)". When the input value is a ::string, +// ::std::string, ::wstring, or ::std::wstring object, each NUL +// character in it is replaced with "\\0". +template +std::string StreamableToString(const T &streamable) { + return (Message() << streamable).GetString(); +} + +} // namespace internal +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h new file mode 100644 index 000000000..8d01df525 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-param-test.h @@ -0,0 +1,507 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Macros and functions for implementing parameterized tests +// in Google C++ Testing and Mocking Framework (Google Test) +// +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// GOOGLETEST_CM0001 DO NOT DELETE +#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ + +// Value-parameterized tests allow you to test your code with different +// parameters without writing multiple copies of the same test. +// +// Here is how you use value-parameterized tests: + +#if 0 + +// To write value-parameterized tests, first you should define a fixture +// class. It is usually derived from testing::TestWithParam (see below for +// another inheritance scheme that's sometimes useful in more complicated +// class hierarchies), where the type of your parameter values. +// TestWithParam is itself derived from testing::Test. T can be any +// copyable type. If it's a raw pointer, you are responsible for managing the +// lifespan of the pointed values. + +class FooTest : public ::testing::TestWithParam { + // You can implement all the usual class fixture members here. +}; + +// Then, use the TEST_P macro to define as many parameterized tests +// for this fixture as you want. The _P suffix is for "parameterized" +// or "pattern", whichever you prefer to think. + +TEST_P(FooTest, DoesBlah) { + // Inside a test, access the test parameter with the GetParam() method + // of the TestWithParam class: + EXPECT_TRUE(foo.Blah(GetParam())); + ... +} + +TEST_P(FooTest, HasBlahBlah) { + ... +} + +// Finally, you can use INSTANTIATE_TEST_SUITE_P to instantiate the test +// case with any set of parameters you want. Google Test defines a number +// of functions for generating test parameters. They return what we call +// (surprise!) parameter generators. Here is a summary of them, which +// are all in the testing namespace: +// +// +// Range(begin, end [, step]) - Yields values {begin, begin+step, +// begin+step+step, ...}. The values do not +// include end. step defaults to 1. +// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. +// ValuesIn(container) - Yields values from a C-style array, an STL +// ValuesIn(begin,end) container, or an iterator range [begin, end). +// Bool() - Yields sequence {false, true}. +// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product +// for the math savvy) of the values generated +// by the N generators. +// +// For more details, see comments at the definitions of these functions below +// in this file. +// +// The following statement will instantiate tests from the FooTest test suite +// each with parameter values "meeny", "miny", and "moe". + +INSTANTIATE_TEST_SUITE_P(InstantiationName, + FooTest, + Values("meeny", "miny", "moe")); + +// To distinguish different instances of the pattern, (yes, you +// can instantiate it more than once) the first argument to the +// INSTANTIATE_TEST_SUITE_P macro is a prefix that will be added to the +// actual test suite name. Remember to pick unique prefixes for different +// instantiations. The tests from the instantiation above will have +// these names: +// +// * InstantiationName/FooTest.DoesBlah/0 for "meeny" +// * InstantiationName/FooTest.DoesBlah/1 for "miny" +// * InstantiationName/FooTest.DoesBlah/2 for "moe" +// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" +// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" +// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" +// +// You can use these names in --gtest_filter. 
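+//
+// For example, --gtest_filter=InstantiationName/FooTest.DoesBlah/* selects
+// the three DoesBlah runs above, while --gtest_filter=*/FooTest.DoesBlah/1
+// selects just the "miny" one.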
+// +// This statement will instantiate all tests from FooTest again, each +// with parameter values "cat" and "dog": + +const char* pets[] = {"cat", "dog"}; +INSTANTIATE_TEST_SUITE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); + +// The tests from the instantiation above will have these names: +// +// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" +// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" +// +// Please note that INSTANTIATE_TEST_SUITE_P will instantiate all tests +// in the given test suite, whether their definitions come before or +// AFTER the INSTANTIATE_TEST_SUITE_P statement. +// +// Please also note that generator expressions (including parameters to the +// generators) are evaluated in InitGoogleTest(), after main() has started. +// This allows the user on one hand, to adjust generator parameters in order +// to dynamically determine a set of tests to run and on the other hand, +// give the user a chance to inspect the generated tests with Google Test +// reflection API before RUN_ALL_TESTS() is executed. +// +// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc +// for more examples. +// +// In the future, we plan to publish the API for defining new parameter +// generators. But for now this interface remains part of the internal +// implementation and is subject to change. +// +// +// A parameterized test fixture must be derived from testing::Test and from +// testing::WithParamInterface, where T is the type of the parameter +// values. Inheriting from TestWithParam satisfies that requirement because +// TestWithParam inherits from both Test and WithParamInterface. In more +// complicated hierarchies, however, it is occasionally useful to inherit +// separately from Test and WithParamInterface. For example: + +class BaseTest : public ::testing::Test { + // You can inherit all the usual members for a non-parameterized test + // fixture here. +}; + +class DerivedTest : public BaseTest, public ::testing::WithParamInterface { + // The usual test fixture members go here too. +}; + +TEST_F(BaseTest, HasFoo) { + // This is an ordinary non-parameterized test. +} + +TEST_P(DerivedTest, DoesBlah) { + // GetParam works just the same here as if you inherit from TestWithParam. + EXPECT_TRUE(foo.Blah(GetParam())); +} + +#endif // 0 + +#include +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-param-util.h" +#include "gtest/internal/gtest-port.h" + +namespace testing { + +// Functions producing parameter generators. +// +// Google Test uses these generators to produce parameters for value- +// parameterized tests. When a parameterized test suite is instantiated +// with a particular generator, Google Test creates and runs tests +// for each element in the sequence produced by the generator. +// +// In the following sample, tests from test suite FooTest are instantiated +// each three times with parameter values 3, 5, and 8: +// +// class FooTest : public TestWithParam { ... }; +// +// TEST_P(FooTest, TestThis) { +// } +// TEST_P(FooTest, TestThat) { +// } +// INSTANTIATE_TEST_SUITE_P(TestSequence, FooTest, Values(3, 5, 8)); +// + +// Range() returns generators providing sequences of values in a range. +// +// Synopsis: +// Range(start, end) +// - returns a generator producing a sequence of values {start, start+1, +// start+2, ..., }. 
+// Range(start, end, step) +// - returns a generator producing a sequence of values {start, start+step, +// start+step+step, ..., }. +// Notes: +// * The generated sequences never include end. For example, Range(1, 5) +// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) +// returns a generator producing {1, 3, 5, 7}. +// * start and end must have the same type. That type may be any integral or +// floating-point type or a user defined type satisfying these conditions: +// * It must be assignable (have operator=() defined). +// * It must have operator+() (operator+(int-compatible type) for +// two-operand version). +// * It must have operator<() defined. +// Elements in the resulting sequences will also have that type. +// * Condition start < end must be satisfied in order for resulting sequences +// to contain any elements. +// +template +internal::ParamGenerator Range(T start, T end, IncrementT step) { + return internal::ParamGenerator( + new internal::RangeGenerator(start, end, step)); +} + +template +internal::ParamGenerator Range(T start, T end) { + return Range(start, end, 1); +} + +// ValuesIn() function allows generation of tests with parameters coming from +// a container. +// +// Synopsis: +// ValuesIn(const T (&array)[N]) +// - returns a generator producing sequences with elements from +// a C-style array. +// ValuesIn(const Container& container) +// - returns a generator producing sequences with elements from +// an STL-style container. +// ValuesIn(Iterator begin, Iterator end) +// - returns a generator producing sequences with elements from +// a range [begin, end) defined by a pair of STL-style iterators. These +// iterators can also be plain C pointers. +// +// Please note that ValuesIn copies the values from the containers +// passed in and keeps them to generate tests in RUN_ALL_TESTS(). +// +// Examples: +// +// This instantiates tests from test suite StringTest +// each with C-string values of "foo", "bar", and "baz": +// +// const char* strings[] = {"foo", "bar", "baz"}; +// INSTANTIATE_TEST_SUITE_P(StringSequence, StringTest, ValuesIn(strings)); +// +// This instantiates tests from test suite StlStringTest +// each with STL strings with values "a" and "b": +// +// ::std::vector< ::std::string> GetParameterStrings() { +// ::std::vector< ::std::string> v; +// v.push_back("a"); +// v.push_back("b"); +// return v; +// } +// +// INSTANTIATE_TEST_SUITE_P(CharSequence, +// StlStringTest, +// ValuesIn(GetParameterStrings())); +// +// +// This will also instantiate tests from CharTest +// each with parameter values 'a' and 'b': +// +// ::std::list GetParameterChars() { +// ::std::list list; +// list.push_back('a'); +// list.push_back('b'); +// return list; +// } +// ::std::list l = GetParameterChars(); +// INSTANTIATE_TEST_SUITE_P(CharSequence2, +// CharTest, +// ValuesIn(l.begin(), l.end())); +// +template +internal::ParamGenerator< + typename std::iterator_traits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end) { + typedef typename std::iterator_traits::value_type ParamType; + return internal::ParamGenerator( + new internal::ValuesInIteratorRangeGenerator(begin, end)); +} + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]) { + return ValuesIn(array, array + N); +} + +template +internal::ParamGenerator ValuesIn( + const Container &container) { + return ValuesIn(container.begin(), container.end()); +} + +// Values() allows generating tests from explicitly specified list of +// parameters. 
+// +// Synopsis: +// Values(T v1, T v2, ..., T vN) +// - returns a generator producing sequences with elements v1, v2, ..., vN. +// +// For example, this instantiates tests from test suite BarTest each +// with values "one", "two", and "three": +// +// INSTANTIATE_TEST_SUITE_P(NumSequence, +// BarTest, +// Values("one", "two", "three")); +// +// This instantiates tests from test suite BazTest each with values 1, 2, 3.5. +// The exact type of values will depend on the type of parameter in BazTest. +// +// INSTANTIATE_TEST_SUITE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); +// +// +template +internal::ValueArray Values(T... v) { + return internal::ValueArray(std::move(v)...); +} + +// Bool() allows generating tests with parameters in a set of (false, true). +// +// Synopsis: +// Bool() +// - returns a generator producing sequences with elements {false, true}. +// +// It is useful when testing code that depends on Boolean flags. Combinations +// of multiple flags can be tested when several Bool()'s are combined using +// Combine() function. +// +// In the following example all tests in the test suite FlagDependentTest +// will be instantiated twice with parameters false and true. +// +// class FlagDependentTest : public testing::TestWithParam { +// virtual void SetUp() { +// external_flag = GetParam(); +// } +// } +// INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool()); +// +inline internal::ParamGenerator Bool() { return Values(false, true); } + +// Combine() allows the user to combine two or more sequences to produce +// values of a Cartesian product of those sequences' elements. +// +// Synopsis: +// Combine(gen1, gen2, ..., genN) +// - returns a generator producing sequences with elements coming from +// the Cartesian product of elements from the sequences generated by +// gen1, gen2, ..., genN. The sequence elements will have a type of +// std::tuple where T1, T2, ..., TN are the types +// of elements from sequences produces by gen1, gen2, ..., genN. +// +// Combine can have up to 10 arguments. +// +// Example: +// +// This will instantiate tests in test suite AnimalTest each one with +// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), +// tuple("dog", BLACK), and tuple("dog", WHITE): +// +// enum Color { BLACK, GRAY, WHITE }; +// class AnimalTest +// : public testing::TestWithParam > {...}; +// +// TEST_P(AnimalTest, AnimalLooksNice) {...} +// +// INSTANTIATE_TEST_SUITE_P(AnimalVariations, AnimalTest, +// Combine(Values("cat", "dog"), +// Values(BLACK, WHITE))); +// +// This will instantiate tests in FlagDependentTest with all variations of two +// Boolean flags: +// +// class FlagDependentTest +// : public testing::TestWithParam > { +// virtual void SetUp() { +// // Assigns external_flag_1 and external_flag_2 values from the tuple. +// std::tie(external_flag_1, external_flag_2) = GetParam(); +// } +// }; +// +// TEST_P(FlagDependentTest, TestFeature1) { +// // Test your code using external_flag_1 and external_flag_2 here. +// } +// INSTANTIATE_TEST_SUITE_P(TwoBoolSequence, FlagDependentTest, +// Combine(Bool(), Bool())); +// +template +internal::CartesianProductHolder Combine(const Generator &... 
g) { + return internal::CartesianProductHolder(g...); +} + +#define TEST_P(test_suite_name, test_name) \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public test_suite_name { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \ + void TestBody() override; \ + \ + private: \ + static int AddToRegistry() { \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder( \ + GTEST_STRINGIFY_(test_suite_name), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestPattern( \ + GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name), \ + new ::testing::internal::TestMetaFactory()); \ + return 0; \ + } \ + static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)); \ + }; \ + int GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::gtest_registering_dummy_ = \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::AddToRegistry(); \ + void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() + +// The last argument to INSTANTIATE_TEST_SUITE_P allows the user to specify +// generator and an optional function or functor that generates custom test name +// suffixes based on the test parameters. Such a function or functor should +// accept one argument of type testing::TestParamInfo, and +// return std::string. +// +// testing::PrintToStringParamName is a builtin test suffix generator that +// returns the value of testing::PrintToString(GetParam()). +// +// Note: test names must be non-empty, unique, and may only contain ASCII +// alphanumeric characters or underscore. Because PrintToString adds quotes +// to std::string and C strings, it won't work for these types. + +#define GTEST_EXPAND_(arg) arg +#define GTEST_GET_FIRST_(first, ...) first +#define GTEST_GET_SECOND_(first, second, ...) second + +#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \ + static ::testing::internal::ParamGenerator \ + gtest_##prefix##test_suite_name##_EvalGenerator_() { \ + return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \ + } \ + static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \ + const ::testing::TestParamInfo &info) { \ + if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))); \ + auto t = std::make_tuple(__VA_ARGS__); \ + static_assert(std::tuple_size::value <= 2, \ + "Too Many Args!"); \ + } \ + return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \ + __VA_ARGS__, \ + ::testing::internal::DefaultParamName, \ + DUMMY_PARAM_))))(info); \ + } \ + static int gtest_##prefix##test_suite_name##_dummy_ \ + GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::UnitTest::GetInstance() \ + ->parameterized_test_registry() \ + .GetTestSuitePatternHolder( \ + GTEST_STRINGIFY_(test_suite_name), \ + ::testing::internal::CodeLocation(__FILE__, __LINE__)) \ + ->AddTestSuiteInstantiation( \ + GTEST_STRINGIFY_(prefix), \ + >est_##prefix##test_suite_name##_EvalGenerator_, \ + >est_##prefix##test_suite_name##_EvalGenerateName_, \ + __FILE__, __LINE__) + +// Allow Marking a Parameterized test class as not needing to be instantiated. 
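+// For example:
+//   GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(FooTest);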
+#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \ + namespace gtest_do_not_use_outside_namespace_scope {} \ + static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \ + GTEST_STRINGIFY_(T)) + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +#define INSTANTIATE_TEST_CASE_P \ + static_assert(::testing::internal::InstantiateTestCase_P_IsDeprecated(), \ + ""); \ + INSTANTIATE_TEST_SUITE_P +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-printers.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-printers.h new file mode 100644 index 000000000..950247cf6 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-printers.h @@ -0,0 +1,925 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Google Test - The Google C++ Testing and Mocking Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); +// +// A user can teach this function how to print a class type T by +// defining either operator<<() or PrintTo() in the namespace that +// defines T. More specifically, the FIRST defined function in the +// following list will be used (assuming T is defined in namespace +// foo): +// +// 1. foo::PrintTo(const T&, ostream*) +// 2. operator<<(ostream&, const T&) defined in either foo or the +// global namespace. +// +// However if T is an STL-style container then it is printed element-wise +// unless foo::PrintTo(const T&, ostream*) is defined. Note that +// operator<<() is ignored for container types. +// +// If none of the above is defined, it will print the debug string of +// the value if it is a protocol buffer, or print the raw bytes in the +// value otherwise. 
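+//
+// For example (a sketch; foo::Bar and its 'id' field are hypothetical),
+// option 1 above could look like:
+//
+//   namespace foo {
+//   void PrintTo(const Bar& bar, ::std::ostream* os) {
+//     *os << "Bar(" << bar.id << ")";  // 'id' is a hypothetical field
+//   }
+//   }  // namespace foo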
+// +// To aid debugging: when T is a reference type, the address of the +// value is also printed; when T is a (const) char pointer, both the +// pointer value and the NUL-terminated string it points to are +// printed. +// +// We also provide some convenient wrappers: +// +// // Prints a value to a string. For a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// std::string ::testing::PrintToString(const T& value); +// +// // Prints a value tersely: for a reference type, the referenced +// // value (but not the address) is printed; for a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// void ::testing::internal::UniversalTersePrint(const T& value, ostream*); +// +// // Prints value using the type inferred by the compiler. The difference +// // from UniversalTersePrint() is that this function prints both the +// // pointer and the NUL-terminated string for a (const or not) char pointer. +// void ::testing::internal::UniversalPrint(const T& value, ostream*); +// +// // Prints the fields of a tuple tersely to a string vector, one +// // element for each field. Tuple support must be enabled in +// // gtest-port.h. +// std::vector UniversalTersePrintTupleFieldsToStrings( +// const Tuple& value); +// +// Known limitation: +// +// The print primitives print the elements of an STL-style container +// using the compiler-inferred type of *iter where iter is a +// const_iterator of the container. When const_iterator is an input +// iterator but not a forward iterator, this inferred type may not +// match value_type, and the print output may be incorrect. In +// practice, this is rarely a problem as for most containers +// const_iterator is a forward iterator. We'll fix this if there's an +// actual need for it. Note that this fix cannot rely on value_type +// being defined as many user-defined container types don't have +// value_type. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ + +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" + +#if GTEST_HAS_ABSL +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "absl/types/variant.h" +#endif // GTEST_HAS_ABSL + +namespace testing { + +// Definitions in the 'internal' and 'internal2' name spaces are +// subject to change without notice. DO NOT USE THEM IN USER CODE! +namespace internal2 { + +// Prints the given number of bytes in the given object to the given +// ostream. +GTEST_API_ void PrintBytesInObjectTo(const unsigned char *obj_bytes, + size_t count, ::std::ostream *os); + +// For selecting which printer to use when a given type has neither << +// nor PrintTo(). +enum TypeKind { + kProtobuf, // a protobuf type + kConvertibleToInteger, // a type implicitly convertible to BiggestInt + // (e.g. a named or unnamed enum type) +#if GTEST_INTERNAL_HAS_STRING_VIEW + kConvertibleToStringView, // a type implicitly convertible to + // absl::string_view or std::string_view +#endif + kOtherType // anything else +}; + +// TypeWithoutFormatter::PrintValue(value, os) is called +// by the universal printer to print a value of type T when neither +// operator<< nor PrintTo() is defined for T, where kTypeKind is the +// "kind" of T as defined by enum TypeKind. 
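+// For instance, a value of an unnamed enum type has neither operator<< nor
+// PrintTo(), but is implicitly convertible to an integer, so it falls under
+// kConvertibleToInteger and is printed as its integral value.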
+template +class TypeWithoutFormatter { + public: + // This default version is called when kTypeKind is kOtherType. + static void PrintValue(const T &value, ::std::ostream *os) { + PrintBytesInObjectTo( + static_cast( + reinterpret_cast(std::addressof(value))), + sizeof(value), os); + } +}; + +// We print a protobuf using its ShortDebugString() when the string +// doesn't exceed this many characters; otherwise we print it using +// DebugString() for better readability. +const size_t kProtobufOneLinerMaxLength = 50; + +template +class TypeWithoutFormatter { + public: + static void PrintValue(const T &value, ::std::ostream *os) { + std::string pretty_str = value.ShortDebugString(); + if (pretty_str.length() > kProtobufOneLinerMaxLength) { + pretty_str = "\n" + value.DebugString(); + } + *os << ("<" + pretty_str + ">"); + } +}; + +template +class TypeWithoutFormatter { + public: + // Since T has no << operator or PrintTo() but can be implicitly + // converted to BiggestInt, we print it as a BiggestInt. + // + // Most likely T is an enum type (either named or unnamed), in which + // case printing it as an integer is the desired behavior. In case + // T is not an enum, printing it as an integer is the best we can do + // given that it has no user-defined printer. + static void PrintValue(const T &value, ::std::ostream *os) { + const internal::BiggestInt kBigInt = value; + *os << kBigInt; + } +}; + +#if GTEST_INTERNAL_HAS_STRING_VIEW +template +class TypeWithoutFormatter { + public: + // Since T has neither operator<< nor PrintTo() but can be implicitly + // converted to absl::string_view, we print it as a absl::string_view + // (or std::string_view). + // + // Note: the implementation is further below, as it depends on + // internal::PrintTo symbol which is defined later in the file. + static void PrintValue(const T &value, ::std::ostream *os); +}; +#endif + +// Prints the given value to the given ostream. If the value is a +// protocol message, its debug string is printed; if it's an enum or +// of a type implicitly convertible to BiggestInt, it's printed as an +// integer; otherwise the bytes in the value are printed. This is +// what UniversalPrinter::Print() does when it knows nothing about +// type T and T has neither << operator nor PrintTo(). +// +// A user can override this behavior for a class type Foo by defining +// a << operator in the namespace where Foo is defined. +// +// We put this operator in namespace 'internal2' instead of 'internal' +// to simplify the implementation, as much code in 'internal' needs to +// use << in STL, which would conflict with our own << were it defined +// in 'internal'. +// +// Note that this operator<< takes a generic std::basic_ostream type instead of the more restricted std::ostream. If +// we define it to take an std::ostream instead, we'll get an +// "ambiguous overloads" compiler error when trying to print a type +// Foo that supports streaming to std::basic_ostream, as the compiler cannot tell whether +// operator<<(std::ostream&, const T&) or +// operator<<(std::basic_stream, const Foo&) is more +// specific. +template +::std::basic_ostream &operator<<( + ::std::basic_ostream &os, const T &x) { + TypeWithoutFormatter< + T, (internal::IsAProtocolMessage::value + ? kProtobuf + : std::is_convertible::value + ? kConvertibleToInteger + : +#if GTEST_INTERNAL_HAS_STRING_VIEW + std::is_convertible::value + ? 
kConvertibleToStringView + : +#endif + kOtherType)>::PrintValue(x, &os); + return os; +} + +} // namespace internal2 +} // namespace testing + +// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up +// magic needed for implementing UniversalPrinter won't work. +namespace testing_internal { + +// Used to print a value that is not an STL-style container when the +// user doesn't define PrintTo() for it. +template +void DefaultPrintNonContainerTo(const T &value, ::std::ostream *os) { + // With the following statement, during unqualified name lookup, + // testing::internal2::operator<< appears as if it was declared in + // the nearest enclosing namespace that contains both + // ::testing_internal and ::testing::internal2, i.e. the global + // namespace. For more details, refer to the C++ Standard section + // 7.3.4-1 [namespace.udir]. This allows us to fall back onto + // testing::internal2::operator<< in case T doesn't come with a << + // operator. + + using ::testing::internal2::operator<<; + + // Assuming T is defined in namespace foo, in the next statement, + // the compiler will consider all of: + // + // 1. foo::operator<< (thanks to Koenig look-up), + // 2. ::operator<< (as the current namespace is enclosed in ::), + // 3. testing::internal2::operator<< (thanks to the using statement above). + // + // The operator<< whose type matches T best will be picked. + // + // We deliberately allow #2 to be a candidate, as sometimes it's + // impossible to define #1 (e.g. when foo is ::std, defining + // anything in it is undefined behavior unless you are a compiler + // vendor.). + *os << value; +} + +} // namespace testing_internal + +namespace testing { +namespace internal { + +// FormatForComparison::Format(value) formats a +// value of type ToPrint that is an operand of a comparison assertion +// (e.g. ASSERT_EQ). OtherOperand is the type of the other operand in +// the comparison, and is used to help determine the best way to +// format the value. In particular, when the value is a C string +// (char pointer) and the other operand is an STL string object, we +// want to format the C string as a string, since we know it is +// compared by value with the string object. If the value is a char +// pointer but the other operand is not an STL string object, we don't +// know whether the pointer is supposed to point to a NUL-terminated +// string, and thus want to print it as a pointer to be safe. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + +// The default case. +template +class FormatForComparison { + public: + static ::std::string Format(const ToPrint &value) { + return ::testing::PrintToString(value); + } +}; + +// Array. +template +class FormatForComparison { + public: + static ::std::string Format(const ToPrint *value) { + return FormatForComparison::Format(value); + } +}; + +// By default, print C string as pointers to be safe, as we don't know +// whether they actually point to a NUL-terminated string. 
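+// For example, when EXPECT_EQ compares two char* operands, each side is
+// formatted as an address rather than as the text it may point to.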
+ +#define GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(CharType) \ + template \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType *value) { \ + return ::testing::PrintToString(static_cast(value)); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t); +GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t); + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_ + +// If a C string is compared with an STL string object, we know it's meant +// to point to a NUL-terminated string, and thus can print it as a string. + +#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \ + template <> \ + class FormatForComparison { \ + public: \ + static ::std::string Format(CharType *value) { \ + return ::testing::PrintToString(value); \ + } \ + } + +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string); + +#if GTEST_HAS_STD_WSTRING +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring); +GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring); +#endif + +#undef GTEST_IMPL_FORMAT_C_STRING_AS_STRING_ + +// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc) +// operand to be used in a failure message. The type (but not value) +// of the other operand may affect the format. This allows us to +// print a char* as a raw pointer when it is compared against another +// char* or void*, and print it as a C string when it is compared +// against an std::string object, for example. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +template +std::string FormatForComparisonFailureMessage(const T1 &value, + const T2 & /* other_operand */) { + return FormatForComparison::Format(value); +} + +// UniversalPrinter::Print(value, ostream_ptr) prints the given +// value to the given ostream. The caller must ensure that +// 'ostream_ptr' is not NULL, or the behavior is undefined. +// +// We define UniversalPrinter as a class template (as opposed to a +// function template), as we need to partially specialize it for +// reference types, which cannot be done with function templates. +template +class UniversalPrinter; + +template +void UniversalPrint(const T &value, ::std::ostream *os); + +enum DefaultPrinterType { + kPrintContainer, + kPrintPointer, + kPrintFunctionPointer, + kPrintOther, +}; +template +struct WrapPrinterType {}; + +// Used to print an STL-style container when the user doesn't define +// a PrintTo() for it. +template +void DefaultPrintTo(WrapPrinterType /* dummy */, + const C &container, ::std::ostream *os) { + const size_t kMaxCount = 32; // The maximum number of elements to print. + *os << '{'; + size_t count = 0; + for (typename C::const_iterator it = container.begin(); it != container.end(); + ++it, ++count) { + if (count > 0) { + *os << ','; + if (count == kMaxCount) { // Enough has been printed. + *os << " ..."; + break; + } + } + *os << ' '; + // We cannot call PrintTo(*it, os) here as PrintTo() doesn't + // handle *it being a native array. + internal::UniversalPrint(*it, os); + } + + if (count > 0) { + *os << ' '; + } + *os << '}'; +} + +// Used to print a pointer that is neither a char pointer nor a member +// pointer, when the user doesn't define PrintTo() for it. (A member +// variable pointer or member function pointer doesn't really point to +// a location in the address space. Their representation is +// implementation-defined. 
Therefore they will be printed as raw +// bytes.) +template +void DefaultPrintTo(WrapPrinterType /* dummy */, T *p, + ::std::ostream *os) { + if (p == nullptr) { + *os << "NULL"; + } else { + // T is not a function type. We just call << to print p, + // relying on ADL to pick up user-defined << for their pointer + // types, if any. + *os << p; + } +} +template +void DefaultPrintTo(WrapPrinterType /* dummy */, T *p, + ::std::ostream *os) { + if (p == nullptr) { + *os << "NULL"; + } else { + // T is a function type, so '*os << p' doesn't do what we want + // (it just prints p as bool). We want to print p as a const + // void*. + *os << reinterpret_cast(p); + } +} + +// Used to print a non-container, non-pointer value when the user +// doesn't define PrintTo() for it. +template +void DefaultPrintTo(WrapPrinterType /* dummy */, const T &value, + ::std::ostream *os) { + ::testing_internal::DefaultPrintNonContainerTo(value, os); +} + +// Prints the given value using the << operator if it has one; +// otherwise prints the bytes in it. This is what +// UniversalPrinter::Print() does when PrintTo() is not specialized +// or overloaded for type T. +// +// A user can override this behavior for a class type Foo by defining +// an overload of PrintTo() in the namespace where Foo is defined. We +// give the user this option as sometimes defining a << operator for +// Foo is not desirable (e.g. the coding style may prevent doing it, +// or there is already a << operator but it doesn't do what the user +// wants). +template +void PrintTo(const T &value, ::std::ostream *os) { + // DefaultPrintTo() is overloaded. The type of its first argument + // determines which version will be picked. + // + // Note that we check for container types here, prior to we check + // for protocol message types in our operator<<. The rationale is: + // + // For protocol messages, we want to give people a chance to + // override Google Mock's format by defining a PrintTo() or + // operator<<. For STL containers, other formats can be + // incompatible with Google Mock's format for the container + // elements; therefore we check for container types here to ensure + // that our format is used. + // + // Note that MSVC and clang-cl do allow an implicit conversion from + // pointer-to-function to pointer-to-object, but clang-cl warns on it. + // So don't use ImplicitlyConvertible if it can be helped since it will + // cause this warning, and use a separate overload of DefaultPrintTo for + // function pointers so that the `*os << p` in the object pointer overload + // doesn't cause that warning either. + DefaultPrintTo( + WrapPrinterType < + (sizeof(IsContainerTest(0)) == sizeof(IsContainer)) && + !IsRecursiveContainer::value + ? kPrintContainer + : !std::is_pointer::value + ? kPrintOther + : std::is_function::type>::value + ? kPrintFunctionPointer + : kPrintPointer > (), + value, os); +} + +// The following list of PrintTo() overloads tells +// UniversalPrinter::Print() how to print standard types (built-in +// types, strings, plain arrays, and pointers). + +// Overloads for various char types. +GTEST_API_ void PrintTo(unsigned char c, ::std::ostream *os); +GTEST_API_ void PrintTo(signed char c, ::std::ostream *os); +inline void PrintTo(char c, ::std::ostream *os) { + // When printing a plain char, we always treat it as unsigned. This + // way, the output won't be affected by whether the compiler thinks + // char is signed or not. + PrintTo(static_cast(c), os); +} + +// Overloads for other simple built-in types. 
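+// Instead of 1/0, bool values are printed as "true"/"false":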
+inline void PrintTo(bool x, ::std::ostream *os) { + *os << (x ? "true" : "false"); +} + +// Overload for wchar_t type. +// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its decimal code (except for L'\0'). +// The L'\0' char is printed as "L'\\0'". The decimal code is printed +// as signed integer when wchar_t is implemented by the compiler +// as a signed type and is printed as an unsigned integer when wchar_t +// is implemented as an unsigned type. +GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream *os); + +// Overloads for C strings. +GTEST_API_ void PrintTo(const char *s, ::std::ostream *os); +inline void PrintTo(char *s, ::std::ostream *os) { + PrintTo(ImplicitCast_(s), os); +} + +// signed/unsigned char is often used for representing binary data, so +// we print pointers to it as void* to be safe. +inline void PrintTo(const signed char *s, ::std::ostream *os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(signed char *s, ::std::ostream *os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(const unsigned char *s, ::std::ostream *os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(unsigned char *s, ::std::ostream *os) { + PrintTo(ImplicitCast_(s), os); +} + +// MSVC can be configured to define wchar_t as a typedef of unsigned +// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native +// type. When wchar_t is a typedef, defining an overload for const +// wchar_t* would cause unsigned short* be printed as a wide string, +// possibly causing invalid memory accesses. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Overloads for wide C strings +GTEST_API_ void PrintTo(const wchar_t *s, ::std::ostream *os); +inline void PrintTo(wchar_t *s, ::std::ostream *os) { + PrintTo(ImplicitCast_(s), os); +} +#endif + +// Overload for C arrays. Multi-dimensional arrays are printed +// properly. + +// Prints the given number of elements in an array, without printing +// the curly braces. +template +void PrintRawArrayTo(const T a[], size_t count, ::std::ostream *os) { + UniversalPrint(a[0], os); + for (size_t i = 1; i != count; i++) { + *os << ", "; + UniversalPrint(a[i], os); + } +} + +// Overloads for ::std::string. +GTEST_API_ void PrintStringTo(const ::std::string &s, ::std::ostream *os); +inline void PrintTo(const ::std::string &s, ::std::ostream *os) { + PrintStringTo(s, os); +} + +// Overloads for ::std::wstring. +#if GTEST_HAS_STD_WSTRING +GTEST_API_ void PrintWideStringTo(const ::std::wstring &s, ::std::ostream *os); +inline void PrintTo(const ::std::wstring &s, ::std::ostream *os) { + PrintWideStringTo(s, os); +} +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_INTERNAL_HAS_STRING_VIEW +// Overload for internal::StringView. +inline void PrintTo(internal::StringView sp, ::std::ostream *os) { + PrintTo(::std::string(sp), os); +} +#endif // GTEST_INTERNAL_HAS_STRING_VIEW + +inline void PrintTo(std::nullptr_t, ::std::ostream *os) { *os << "(nullptr)"; } + +template +void PrintTo(std::reference_wrapper ref, ::std::ostream *os) { + UniversalPrinter::Print(ref.get(), os); +} + +// Helper function for printing a tuple. T must be instantiated with +// a tuple type. 
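+// Fields are printed recursively, so std::make_tuple(1, std::make_pair(2, 3))
+// prints as (1, (2, 3)).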
+// Helper function for printing a tuple.  T must be instantiated with
+// a tuple type.
+template <typename T>
+void PrintTupleTo(const T &, std::integral_constant<size_t, 0>,
+                  ::std::ostream *) {}
+
+template <typename T, size_t I>
+void PrintTupleTo(const T &t, std::integral_constant<size_t, I>,
+                  ::std::ostream *os) {
+  PrintTupleTo(t, std::integral_constant<size_t, I - 1>(), os);
+  GTEST_INTENTIONAL_CONST_COND_PUSH_()
+  if (I > 1) {
+    GTEST_INTENTIONAL_CONST_COND_POP_()
+    *os << ", ";
+  }
+  UniversalPrinter<typename std::tuple_element<I - 1, T>::type>::Print(
+      std::get<I - 1>(t), os);
+}
+
+template <typename... Types>
+void PrintTo(const ::std::tuple<Types...> &t, ::std::ostream *os) {
+  *os << "(";
+  PrintTupleTo(t, std::integral_constant<size_t, sizeof...(Types)>(), os);
+  *os << ")";
+}
+
+// Overload for std::pair.
+template <typename T1, typename T2>
+void PrintTo(const ::std::pair<T1, T2> &value, ::std::ostream *os) {
+  *os << '(';
+  // We cannot use UniversalPrint(value.first, os) here, as T1 may be
+  // a reference type.  The same for printing value.second.
+  UniversalPrinter<T1>::Print(value.first, os);
+  *os << ", ";
+  UniversalPrinter<T2>::Print(value.second, os);
+  *os << ')';
+}
+
+// Implements printing a non-reference type T by letting the compiler
+// pick the right overload of PrintTo() for T.
+template <typename T>
+class UniversalPrinter {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  // Note: we deliberately don't call this PrintTo(), as that name
+  // conflicts with ::testing::internal::PrintTo in the body of the
+  // function.
+  static void Print(const T &value, ::std::ostream *os) {
+    // By default, ::testing::internal::PrintTo() is used for printing
+    // the value.
+    //
+    // Thanks to Koenig look-up, if T is a class and has its own
+    // PrintTo() function defined in its namespace, that function will
+    // be visible here.  Since it is more specific than the generic ones
+    // in ::testing::internal, it will be picked by the compiler in the
+    // following statement - exactly what we want.
+    PrintTo(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+#if GTEST_HAS_ABSL
+
+// Printer for absl::optional
+
+template <typename T>
+class UniversalPrinter<::absl::optional<T>> {
+ public:
+  static void Print(const ::absl::optional<T> &value, ::std::ostream *os) {
+    *os << '(';
+    if (!value) {
+      *os << "nullopt";
+    } else {
+      UniversalPrint(*value, os);
+    }
+    *os << ')';
+  }
+};
+
+// Printer for absl::variant
+
+template <typename... T>
+class UniversalPrinter<::absl::variant<T...>> {
+ public:
+  static void Print(const ::absl::variant<T...> &value, ::std::ostream *os) {
+    *os << '(';
+    absl::visit(Visitor{ os }, value);
+    *os << ')';
+  }
+
+ private:
+  struct Visitor {
+    template <typename U>
+    void operator()(const U &u) const {
+      *os << "'" << GetTypeName<U>() << "' with value ";
+      UniversalPrint(u, os);
+    }
+    ::std::ostream *os;
+  };
+};
+
+#endif  // GTEST_HAS_ABSL
+
+// UniversalPrintArray(begin, len, os) prints an array of 'len'
+// elements, starting at address 'begin'.
+template <typename T>
+void UniversalPrintArray(const T *begin, size_t len, ::std::ostream *os) {
+  if (len == 0) {
+    *os << "{}";
+  } else {
+    *os << "{ ";
+    const size_t kThreshold = 18;
+    const size_t kChunkSize = 8;
+    // If the array has more than kThreshold elements, we'll have to
+    // omit some details by printing only the first and the last
+    // kChunkSize elements.
+    if (len <= kThreshold) {
+      PrintRawArrayTo(begin, len, os);
+    } else {
+      PrintRawArrayTo(begin, kChunkSize, os);
+      *os << ", ..., ";
+      PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os);
+    }
+    *os << " }";
+  }
+}
+
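+// Worked example of the elision above: an int[20] holding 0..19 prints as
+//
+//   { 0, 1, 2, 3, 4, 5, 6, 7, ..., 12, 13, 14, 15, 16, 17, 18, 19 }
+//
+// because len (20) exceeds kThreshold (18), so only the first and last
+// kChunkSize (8) elements are shown.
+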
+// This overload prints a (const) char array compactly.
+GTEST_API_ void UniversalPrintArray(const char *begin, size_t len,
+                                    ::std::ostream *os);
+
+// This overload prints a (const) wchar_t array compactly.
+GTEST_API_ void UniversalPrintArray(const wchar_t *begin, size_t len,
+                                    ::std::ostream *os);
+
+// Implements printing an array type T[N].
+template <typename T, size_t N>
+class UniversalPrinter<T[N]> {
+ public:
+  // Prints the given array, omitting some elements when there are too
+  // many.
+  static void Print(const T (&a)[N], ::std::ostream *os) {
+    UniversalPrintArray(a, N, os);
+  }
+};
+
+// Implements printing a reference type T&.
+template <typename T>
+class UniversalPrinter<T &> {
+ public:
+  // MSVC warns about adding const to a function type, so we want to
+  // disable the warning.
+  GTEST_DISABLE_MSC_WARNINGS_PUSH_(4180)
+
+  static void Print(const T &value, ::std::ostream *os) {
+    // Prints the address of the value.  We use reinterpret_cast here
+    // as static_cast doesn't compile when T is a function type.
+    *os << "@" << reinterpret_cast<const void *>(&value) << " ";
+
+    // Then prints the value itself.
+    UniversalPrint(value, os);
+  }
+
+  GTEST_DISABLE_MSC_WARNINGS_POP_()
+};
+
+// Prints a value tersely: for a reference type, the referenced value
+// (but not the address) is printed; for a (const) char pointer, the
+// NUL-terminated string (but not the pointer) is printed.
+
+template <typename T>
+class UniversalTersePrinter {
+ public:
+  static void Print(const T &value, ::std::ostream *os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T>
+class UniversalTersePrinter<T &> {
+ public:
+  static void Print(const T &value, ::std::ostream *os) {
+    UniversalPrint(value, os);
+  }
+};
+template <typename T, size_t N>
+class UniversalTersePrinter<T[N]> {
+ public:
+  static void Print(const T (&value)[N], ::std::ostream *os) {
+    UniversalPrinter<T[N]>::Print(value, os);
+  }
+};
+template <>
+class UniversalTersePrinter<const char *> {
+ public:
+  static void Print(const char *str, ::std::ostream *os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(std::string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char *> {
+ public:
+  static void Print(char *str, ::std::ostream *os) {
+    UniversalTersePrinter<const char *>::Print(str, os);
+  }
+};
+
+#if GTEST_HAS_STD_WSTRING
+template <>
+class UniversalTersePrinter<const wchar_t *> {
+ public:
+  static void Print(const wchar_t *str, ::std::ostream *os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::wstring(str), os);
+    }
+  }
+};
+#endif
+
+template <>
+class UniversalTersePrinter<wchar_t *> {
+ public:
+  static void Print(wchar_t *str, ::std::ostream *os) {
+    UniversalTersePrinter<const wchar_t *>::Print(str, os);
+  }
+};
+
+template <typename T>
+void UniversalTersePrint(const T &value, ::std::ostream *os) {
+  UniversalTersePrinter<T>::Print(value, os);
+}
+
+// Prints a value using the type inferred by the compiler.  The
+// difference between this and UniversalTersePrint() is that for a
+// (const) char pointer, this prints both the pointer and the
+// NUL-terminated string.
+template <typename T>
+void UniversalPrint(const T &value, ::std::ostream *os) {
+  // A workaround for the bug in VC++ 7.1 that prevents us from instantiating
+  // UniversalPrinter with T directly.
+  typedef T T1;
+  UniversalPrinter<T1>::Print(value, os);
+}
+
+typedef ::std::vector<::std::string> Strings;
+
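+// For instance (a sketch): for std::tuple<const char *, int> t("foo", 5),
+// the helpers below produce the vector { "\"foo\"", "5" } -- each field
+// printed tersely, one string per field.
+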
+// Tersely prints the first N fields of a tuple to a string vector,
+// one element for each field.
+template <typename Tuple>
+void TersePrintPrefixToStrings(const Tuple &,
+                               std::integral_constant<size_t, 0>,
+                               Strings *) {}
+template <typename Tuple, size_t I>
+void TersePrintPrefixToStrings(const Tuple &t,
+                               std::integral_constant<size_t, I>,
+                               Strings *strings) {
+  TersePrintPrefixToStrings(t, std::integral_constant<size_t, I - 1>(),
+                            strings);
+  ::std::stringstream ss;
+  UniversalTersePrint(std::get<I - 1>(t), &ss);
+  strings->push_back(ss.str());
+}
+
+// Prints the fields of a tuple tersely to a string vector, one
+// element for each field.  See the comment before
+// UniversalTersePrint() for how we define "tersely".
+template <typename Tuple>
+Strings UniversalTersePrintTupleFieldsToStrings(const Tuple &value) {
+  Strings result;
+  TersePrintPrefixToStrings(
+      value, std::integral_constant<size_t, std::tuple_size<Tuple>::value>(),
+      &result);
+  return result;
+}
+
+}  // namespace internal
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+namespace internal2 {
+template <typename T>
+void TypeWithoutFormatter<T, kConvertibleToStringView>::PrintValue(
+    const T &value, ::std::ostream *os) {
+  internal::PrintTo(internal::StringView(value), os);
+}
+}  // namespace internal2
+#endif
+
+template <typename T>
+::std::string PrintToString(const T &value) {
+  ::std::stringstream ss;
+  internal::UniversalTersePrinter<T>::Print(value, &ss);
+  return ss.str();
+}
+
+}  // namespace testing
+
+// Include any custom printer added by the local installation.
+// We must include this header at the end to make sure it can use the
+// declarations from this file.
+#include "gtest/internal/custom/gtest-printers.h"
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-spi.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
new file mode 100644
index 000000000..e263b1033
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-spi.h
@@ -0,0 +1,245 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+//
+// Utilities for testing Google Test itself and code that uses Google Test
+// (e.g. frameworks built on top of Google Test).
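+//
+// A typical interception pattern, sketched (see
+// ScopedFakeTestPartResultReporter below; the failing statement is only
+// an illustration):
+//
+//   ::testing::TestPartResultArray results;
+//   {
+//     ::testing::ScopedFakeTestPartResultReporter reporter(
+//         ::testing::ScopedFakeTestPartResultReporter::
+//             INTERCEPT_ONLY_CURRENT_THREAD,
+//         &results);
+//     EXPECT_TRUE(false) << "captured, not reported";  // intercepted
+//   }
+//   // 'results' now holds the intercepted TestPartResult.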
+ +// GOOGLETEST_CM0004 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ +#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ + +#include "gtest/gtest.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// This helper class can be used to mock out Google Test failure reporting +// so that we can test Google Test or code that builds on Google Test. +// +// An object of this class appends a TestPartResult object to the +// TestPartResultArray object given in the constructor whenever a Google Test +// failure is reported. It can either intercept only failures that are +// generated in the same thread that created this object or it can intercept +// all generated failures. The scope of this mock object can be controlled with +// the second argument to the two arguments constructor. +class GTEST_API_ ScopedFakeTestPartResultReporter + : public TestPartResultReporterInterface { + public: + // The two possible mocking modes of this object. + enum InterceptMode { + INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures. + INTERCEPT_ALL_THREADS // Intercepts all failures. + }; + + // The c'tor sets this object as the test part result reporter used + // by Google Test. The 'result' parameter specifies where to report the + // results. This reporter will only catch failures generated in the current + // thread. DEPRECATED + explicit ScopedFakeTestPartResultReporter(TestPartResultArray *result); + + // Same as above, but you can choose the interception scope of this object. + ScopedFakeTestPartResultReporter(InterceptMode intercept_mode, + TestPartResultArray *result); + + // The d'tor restores the previous test part result reporter. + ~ScopedFakeTestPartResultReporter() override; + + // Appends the TestPartResult object to the TestPartResultArray + // received in the constructor. + // + // This method is from the TestPartResultReporterInterface + // interface. + void ReportTestPartResult(const TestPartResult &result) override; + + private: + void Init(); + + const InterceptMode intercept_mode_; + TestPartResultReporterInterface *old_reporter_; + TestPartResultArray *const result_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); +}; + +namespace internal { + +// A helper class for implementing EXPECT_FATAL_FAILURE() and +// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +class GTEST_API_ SingleFailureChecker { + public: + // The constructor remembers the arguments. + SingleFailureChecker(const TestPartResultArray *results, + TestPartResult::Type type, const std::string &substr); + ~SingleFailureChecker(); + + private: + const TestPartResultArray *const results_; + const TestPartResult::Type type_; + const std::string substr_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); +}; + +} // namespace internal + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +// A set of macros for testing Google Test assertions or code that's expected +// to generate Google Test fatal failures. It verifies that the given +// statement will cause exactly one fatal Google Test failure with 'substr' +// being part of the failure message. +// +// There are two different versions of this macro. 
EXPECT_FATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - 'statement' cannot reference local non-static variables or
+//     non-static members of the current object.
+//   - 'statement' cannot return a value.
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  The AcceptsMacroThatExpandsToUnprotectedComma test in
+// gtest_unittest.cc will fail to compile if we do that.
+#define EXPECT_FATAL_FAILURE(statement, substr)                               \
+  do {                                                                        \
+    class GTestExpectFatalFailureHelper {                                     \
+     public:                                                                  \
+      static void Execute() { statement; }                                    \
+    };                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::                       \
+              INTERCEPT_ONLY_CURRENT_THREAD,                                  \
+          &gtest_failures);                                                   \
+      GTestExpectFatalFailureHelper::Execute();                               \
+    }                                                                         \
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr)                \
+  do {                                                                        \
+    class GTestExpectFatalFailureHelper {                                     \
+     public:                                                                  \
+      static void Execute() { statement; }                                    \
+    };                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);                                                   \
+      GTestExpectFatalFailureHelper::Execute();                               \
+    }                                                                         \
+  } while (::testing::internal::AlwaysFalse())
+
+// A macro for testing Google Test assertions or code that's expected to
+// generate Google Test non-fatal failures.  It asserts that the given
+// statement will cause exactly one non-fatal Google Test failure with 'substr'
+// being part of the failure message.
+//
+// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
+// affects and considers failures generated in the current thread and
+// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads.
+//
+// 'statement' is allowed to reference local variables and members of
+// the current object.
+//
+// The verification of the assertion is done correctly even when the statement
+// throws an exception or aborts the current function.
+//
+// Known restrictions:
+//   - You cannot stream a failure message to this macro.
+//
+// Note that even though the implementations of the following two
+// macros are much alike, we cannot refactor them to use a common
+// helper macro, due to some peculiarity in how the preprocessor
+// works.  If we do that, the code won't compile when the user gives
+// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that
+// expands to code containing an unprotected comma.  The
+// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc
+// catches that.
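+//
+// A usage sketch (the helper and test names are illustrative):
+//
+//   void Helper() { ADD_FAILURE() << "deliberate non-fatal failure"; }
+//
+//   TEST(SelfTest, CatchesNonFatalFailure) {
+//     EXPECT_NONFATAL_FAILURE(Helper(), "deliberate");
+//   }
+//
+// The statement must produce exactly one non-fatal failure whose message
+// contains the given substring, or this assertion itself fails.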
+//
+// For the same reason (the preprocessor peculiarity noted above), we
+// have to write
+//   if (::testing::internal::AlwaysTrue()) { statement; }
+// instead of
+//   GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+// to avoid an MSVC warning on unreachable code.
#define EXPECT_NONFATAL_FAILURE(statement, substr)                    \
+  do {                                                                \
+    ::testing::TestPartResultArray gtest_failures;                    \
+    ::testing::internal::SingleFailureChecker gtest_checker(          \
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+        (substr));                                                    \
+    {                                                                 \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(     \
+          ::testing::ScopedFakeTestPartResultReporter::               \
+              INTERCEPT_ONLY_CURRENT_THREAD,                          \
+          &gtest_failures);                                           \
+      if (::testing::internal::AlwaysTrue()) {                        \
+        statement;                                                    \
+      }                                                               \
+    }                                                                 \
+  } while (::testing::internal::AlwaysFalse())
+
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr)             \
+  do {                                                                        \
+    ::testing::TestPartResultArray gtest_failures;                            \
+    ::testing::internal::SingleFailureChecker gtest_checker(                  \
+        &gtest_failures, ::testing::TestPartResult::kNonFatalFailure,         \
+        (substr));                                                            \
+    {                                                                         \
+      ::testing::ScopedFakeTestPartResultReporter gtest_reporter(             \
+          ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+          &gtest_failures);                                                   \
+      if (::testing::internal::AlwaysTrue()) {                                \
+        statement;                                                            \
+      }                                                                       \
+    }                                                                         \
+  } while (::testing::internal::AlwaysFalse())
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
new file mode 100644
index 000000000..a28afb309
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-test-part.h
@@ -0,0 +1,183 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+// +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ + +#include +#include +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// A copyable object representing the result of a test part (i.e. an +// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). +// +// Don't inherit from TestPartResult as its destructor is not virtual. +class GTEST_API_ TestPartResult { + public: + // The possible outcomes of a test part (i.e. an assertion or an + // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). + enum Type { + kSuccess, // Succeeded. + kNonFatalFailure, // Failed but the test can continue. + kFatalFailure, // Failed and the test should be terminated. + kSkip // Skipped. + }; + + // C'tor. TestPartResult does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestPartResult object. + TestPartResult(Type a_type, const char *a_file_name, int a_line_number, + const char *a_message) + : type_(a_type), file_name_(a_file_name == nullptr ? "" : a_file_name), + line_number_(a_line_number), summary_(ExtractSummary(a_message)), + message_(a_message) {} + + // Gets the outcome of the test part. + Type type() const { return type_; } + + // Gets the name of the source file where the test part took place, or + // NULL if it's unknown. + const char *file_name() const { + return file_name_.empty() ? nullptr : file_name_.c_str(); + } + + // Gets the line in the source file where the test part took place, + // or -1 if it's unknown. + int line_number() const { return line_number_; } + + // Gets the summary of the failure message. + const char *summary() const { return summary_.c_str(); } + + // Gets the message associated with the test part. + const char *message() const { return message_.c_str(); } + + // Returns true if and only if the test part was skipped. + bool skipped() const { return type_ == kSkip; } + + // Returns true if and only if the test part passed. + bool passed() const { return type_ == kSuccess; } + + // Returns true if and only if the test part non-fatally failed. + bool nonfatally_failed() const { return type_ == kNonFatalFailure; } + + // Returns true if and only if the test part fatally failed. + bool fatally_failed() const { return type_ == kFatalFailure; } + + // Returns true if and only if the test part failed. + bool failed() const { return fatally_failed() || nonfatally_failed(); } + + private: + Type type_; + + // Gets the summary of the failure message by omitting the stack + // trace in it. + static std::string ExtractSummary(const char *message); + + // The name of the source file where the test part took place, or + // "" if the source file is unknown. + std::string file_name_; + // The line in the source file where the test part took place, or -1 + // if the line number is unknown. + int line_number_; + std::string summary_; // The test failure summary. + std::string message_; // The test failure message. +}; + +// Prints a TestPartResult object. +std::ostream &operator<<(std::ostream &os, const TestPartResult &result); + +// An array of TestPartResult objects. +// +// Don't inherit from TestPartResultArray as its destructor is not +// virtual. 
+class GTEST_API_ TestPartResultArray { + public: + TestPartResultArray() {} + + // Appends the given TestPartResult to the array. + void Append(const TestPartResult &result); + + // Returns the TestPartResult at the given index (0-based). + const TestPartResult &GetTestPartResult(int index) const; + + // Returns the number of TestPartResult objects in the array. + int size() const; + + private: + std::vector array_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); +}; + +// This interface knows how to report a test part result. +class GTEST_API_ TestPartResultReporterInterface { + public: + virtual ~TestPartResultReporterInterface() {} + + virtual void ReportTestPartResult(const TestPartResult &result) = 0; +}; + +namespace internal { + +// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a +// statement generates new fatal failures. To do so it registers itself as the +// current test part result reporter. Besides checking if fatal failures were +// reported, it only delegates the reporting to the former result reporter. +// The original result reporter is restored in the destructor. +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +class GTEST_API_ HasNewFatalFailureHelper + : public TestPartResultReporterInterface { + public: + HasNewFatalFailureHelper(); + ~HasNewFatalFailureHelper() override; + void ReportTestPartResult(const TestPartResult &result) override; + bool has_new_fatal_failure() const { return has_new_fatal_failure_; } + + private: + bool has_new_fatal_failure_; + TestPartResultReporterInterface *original_reporter_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); +}; + +} // namespace internal + +} // namespace testing + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h new file mode 100644 index 000000000..f5afc4db8 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest-typed-test.h @@ -0,0 +1,337 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ + +// This header implements typed tests and type-parameterized tests. + +// Typed (aka type-driven) tests repeat the same test for types in a +// list. You must know which types you want to test with when writing +// typed tests. Here's how you do it: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + public: + ... + typedef std::list List; + static T shared_; + T value_; +}; + +// Next, associate a list of types with the test suite, which will be +// repeated for each type in the list. The typedef is necessary for +// the macro to parse correctly. +typedef testing::Types MyTypes; +TYPED_TEST_SUITE(FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// TYPED_TEST_SUITE(FooTest, int); + +// Then, use TYPED_TEST() instead of TEST_F() to define as many typed +// tests for this test suite as you want. +TYPED_TEST(FooTest, DoesBlah) { + // Inside a test, refer to the special name TypeParam to get the type + // parameter. Since we are inside a derived class template, C++ requires + // us to visit the members of FooTest via 'this'. + TypeParam n = this->value_; + + // To visit static members of the fixture, add the TestFixture:: + // prefix. + n += TestFixture::shared_; + + // To refer to typedefs in the fixture, add the "typename + // TestFixture::" prefix. + typename TestFixture::List values; + values.push_back(n); + ... +} + +TYPED_TEST(FooTest, HasPropertyA) { ... } + +// TYPED_TEST_SUITE takes an optional third argument which allows to specify a +// class that generates custom test name suffixes based on the type. This should +// be a class which has a static template function GetName(int index) returning +// a string for each type. The provided integer index equals the index of the +// type in the provided type list. In many cases the index can be ignored. +// +// For example: +// class MyTypeNames { +// public: +// template +// static std::string GetName(int) { +// if (std::is_same()) return "char"; +// if (std::is_same()) return "int"; +// if (std::is_same()) return "unsignedInt"; +// } +// }; +// TYPED_TEST_SUITE(FooTest, MyTypes, MyTypeNames); + +#endif // 0 + +// Type-parameterized tests are abstract test patterns parameterized +// by a type. Compared with typed tests, type-parameterized tests +// allow you to define the test pattern without knowing what the type +// parameters are. The defined pattern can be instantiated with +// different types any number of times, in any number of translation +// units. +// +// If you are designing an interface or concept, you can define a +// suite of type-parameterized tests to verify properties that any +// valid implementation of the interface/concept should have. 
Then, +// each implementation can easily instantiate the test suite to verify +// that it conforms to the requirements, without having to write +// similar tests repeatedly. Here's an example: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + ... +}; + +// Next, declare that you will define a type-parameterized test suite +// (the _P suffix is for "parameterized" or "pattern", whichever you +// prefer): +TYPED_TEST_SUITE_P(FooTest); + +// Then, use TYPED_TEST_P() to define as many type-parameterized tests +// for this type-parameterized test suite as you want. +TYPED_TEST_P(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + TypeParam n = 0; + ... +} + +TYPED_TEST_P(FooTest, HasPropertyA) { ... } + +// Now the tricky part: you need to register all test patterns before +// you can instantiate them. The first argument of the macro is the +// test suite name; the rest are the names of the tests in this test +// case. +REGISTER_TYPED_TEST_SUITE_P(FooTest, + DoesBlah, HasPropertyA); + +// Finally, you are free to instantiate the pattern with the types you +// want. If you put the above code in a header file, you can #include +// it in multiple C++ source files and instantiate it multiple times. +// +// To distinguish different instances of the pattern, the first +// argument to the INSTANTIATE_* macro is a prefix that will be added +// to the actual test suite name. Remember to pick unique prefixes for +// different instances. +typedef testing::Types MyTypes; +INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, int); +// +// Similar to the optional argument of TYPED_TEST_SUITE above, +// INSTANTIATE_TEST_SUITE_P takes an optional fourth argument which allows to +// generate custom names. +// INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes, MyTypeNames); + +#endif // 0 + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" +#include "gtest/internal/gtest-type-util.h" + +// Implements typed tests. + +#if GTEST_HAS_TYPED_TEST + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the typedef for the type parameters of the +// given test suite. +#define GTEST_TYPE_PARAMS_(TestSuiteName) gtest_type_params_##TestSuiteName##_ + +// Expands to the name of the typedef for the NameGenerator, responsible for +// creating the suffixes of the name. +#define GTEST_NAME_GENERATOR_(TestSuiteName) \ + gtest_type_params_##TestSuiteName##_NameGenerator + +#define TYPED_TEST_SUITE(CaseName, Types, ...) 
 \
+  typedef ::testing::internal::GenerateTypeList<Types>::type            \
+      GTEST_TYPE_PARAMS_(CaseName);                                     \
+  typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \
+      GTEST_NAME_GENERATOR_(CaseName)
+
+#define TYPED_TEST(CaseName, TestName)                                       \
+  static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1,                      \
+                "test-name must not be empty");                              \
+  template <typename gtest_TypeParam_>                                       \
+  class GTEST_TEST_CLASS_NAME_(CaseName, TestName)                           \
+      : public CaseName<gtest_TypeParam_> {                                  \
+   private:                                                                  \
+    typedef CaseName<gtest_TypeParam_> TestFixture;                          \
+    typedef gtest_TypeParam_ TypeParam;                                      \
+    void TestBody() override;                                                \
+  };                                                                         \
+  static bool gtest_##CaseName##_##TestName##_registered_                    \
+      GTEST_ATTRIBUTE_UNUSED_ = ::testing::internal::TypeParameterizedTest<  \
+          CaseName,                                                          \
+          ::testing::internal::TemplateSel<GTEST_TEST_CLASS_NAME_(           \
+              CaseName, TestName)>,                                          \
+          GTEST_TYPE_PARAMS_(                                                \
+              CaseName)>::Register("",                                       \
+                                   ::testing::internal::CodeLocation(        \
+                                       __FILE__, __LINE__),                  \
+                                   GTEST_STRINGIFY_(CaseName),               \
+                                   GTEST_STRINGIFY_(TestName), 0,            \
+                                   ::testing::internal::GenerateNames<       \
+                                       GTEST_NAME_GENERATOR_(CaseName),      \
+                                       GTEST_TYPE_PARAMS_(CaseName)>());     \
+  template <typename gtest_TypeParam_>                                       \
+  void GTEST_TEST_CLASS_NAME_(CaseName,                                      \
+                              TestName)<gtest_TypeParam_>::TestBody()
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define TYPED_TEST_CASE                                                \
+  static_assert(::testing::internal::TypedTestCaseIsDeprecated(), ""); \
+  TYPED_TEST_SUITE
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#endif  // GTEST_HAS_TYPED_TEST
+
+// Implements type-parameterized tests.
+
+#if GTEST_HAS_TYPED_TEST_P
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the namespace name that the type-parameterized tests for
+// the given type-parameterized test suite are defined in.  The exact
+// name of the namespace is subject to change without notice.
+#define GTEST_SUITE_NAMESPACE_(TestSuiteName) gtest_suite_##TestSuiteName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Expands to the name of the variable used to remember the names of
+// the defined tests in the given test suite.
+#define GTEST_TYPED_TEST_SUITE_P_STATE_(TestSuiteName) \
+  gtest_typed_test_suite_p_state_##TestSuiteName##_
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY.
+//
+// Expands to the name of the variable used to remember the names of
+// the registered tests in the given test suite.
+#define GTEST_REGISTERED_TEST_NAMES_(TestSuiteName) \
+  gtest_registered_test_names_##TestSuiteName##_
+
+// The variables defined in the type-parameterized test macros are
+// static as typically these macros are used in a .h file that can be
+// #included in multiple translation units linked together.
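+// For instance, a sketch of that multiple-translation-unit pattern
+// (file names are illustrative):
+//
+//   // foo_test.h -- defines and registers the pattern once.
+//   TYPED_TEST_SUITE_P(FooTest);
+//   TYPED_TEST_P(FooTest, DoesBlah) { ... }
+//   REGISTER_TYPED_TEST_SUITE_P(FooTest, DoesBlah);
+//
+//   // a_test.cc and b_test.cc may each #include "foo_test.h" and
+//   // instantiate it with their own prefix and type list, e.g.:
+//   INSTANTIATE_TYPED_TEST_SUITE_P(A, FooTest, testing::Types<int>);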
+#define TYPED_TEST_SUITE_P(SuiteName)              \
+  static ::testing::internal::TypedTestSuitePState \
+      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define TYPED_TEST_CASE_P                                                 \
+  static_assert(::testing::internal::TypedTestCase_P_IsDeprecated(), ""); \
+  TYPED_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#define TYPED_TEST_P(SuiteName, TestName)                           \
+  namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                     \
+  template <typename gtest_TypeParam_>                              \
+  class TestName : public SuiteName<gtest_TypeParam_> {             \
+   private:                                                         \
+    typedef SuiteName<gtest_TypeParam_> TestFixture;                \
+    typedef gtest_TypeParam_ TypeParam;                             \
+    void TestBody() override;                                       \
+  };                                                                \
+  static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \
+      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).AddTestName(       \
+          __FILE__, __LINE__, GTEST_STRINGIFY_(SuiteName),          \
+          GTEST_STRINGIFY_(TestName));                              \
+  }                                                                 \
+  template <typename gtest_TypeParam_>                              \
+  void GTEST_SUITE_NAMESPACE_(                                      \
+      SuiteName)::TestName<gtest_TypeParam_>::TestBody()
+
+// Note: this won't work correctly if the trailing arguments are macros.
+#define REGISTER_TYPED_TEST_SUITE_P(SuiteName, ...)                         \
+  namespace GTEST_SUITE_NAMESPACE_(SuiteName) {                             \
+  typedef ::testing::internal::Templates<__VA_ARGS__> gtest_AllTests_;      \
+  }                                                                         \
+  static const char *const GTEST_REGISTERED_TEST_NAMES_(                    \
+      SuiteName) GTEST_ATTRIBUTE_UNUSED_ =                                  \
+      GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName).VerifyRegisteredTestNames( \
+          GTEST_STRINGIFY_(SuiteName), __FILE__, __LINE__, #__VA_ARGS__)
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define REGISTER_TYPED_TEST_CASE_P                                           \
+  static_assert(::testing::internal::RegisterTypedTestCase_P_IsDeprecated(), \
+                "");                                                         \
+  REGISTER_TYPED_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...)     \
+  static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1,                     \
+                "test-suite-prefix must not be empty");                   \
+  static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ =      \
+      ::testing::internal::TypeParameterizedTestSuite<                    \
+          SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_,  \
+          ::testing::internal::GenerateTypeList<Types>::type>::           \
+          Register(GTEST_STRINGIFY_(Prefix),                              \
+                   ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+                   &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName),           \
+                   GTEST_STRINGIFY_(SuiteName),                           \
+                   GTEST_REGISTERED_TEST_NAMES_(SuiteName),               \
+                   ::testing::internal::GenerateNames<                    \
+                       ::testing::internal::NameGeneratorSelector<        \
+                           __VA_ARGS__>::type,                            \
+                       ::testing::internal::GenerateTypeList<Types>::type>())
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+#define INSTANTIATE_TYPED_TEST_CASE_P                                      \
+  static_assert(                                                           \
+      ::testing::internal::InstantiateTypedTestCase_P_IsDeprecated(), ""); \
+  INSTANTIATE_TYPED_TEST_SUITE_P
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+#endif  // GTEST_HAS_TYPED_TEST_P
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest.h
new file mode 100644
index 000000000..8fd7eea1e
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest.h
@@ -0,0 +1,2454 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the public API for Google Test. It should be +// included by any test program that uses Google Test. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! +// +// Acknowledgment: Google Test borrowed the idea of automatic test +// registration from Barthelemy Dagenais' (barthelemy@prologique.com) +// easyUnit framework. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_H_ + +#include +#include +#include +#include +#include +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-string.h" +#include "gtest/gtest-death-test.h" +#include "gtest/gtest-matchers.h" +#include "gtest/gtest-message.h" +#include "gtest/gtest-param-test.h" +#include "gtest/gtest-printers.h" +#include "gtest/gtest_prod.h" +#include "gtest/gtest-test-part.h" +#include "gtest/gtest-typed-test.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { + +// Silence C4100 (unreferenced formal parameter) and 4805 +// unsafe mix of type 'const int' and type 'const bool' +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4805) +#pragma warning(disable : 4100) +#endif + +// Declares the flags. + +// This flag temporary enables the disabled tests. +GTEST_DECLARE_bool_(also_run_disabled_tests); + +// This flag brings the debugger on an assertion failure. 
+GTEST_DECLARE_bool_(break_on_failure); + +// This flag controls whether Google Test catches all test-thrown exceptions +// and logs them as failures. +GTEST_DECLARE_bool_(catch_exceptions); + +// This flag enables using colors in terminal output. Available values are +// "yes" to enable colors, "no" (disable colors), or "auto" (the default) +// to let Google Test decide. +GTEST_DECLARE_string_(color); + +// This flag sets up the filter to select by name using a glob pattern +// the tests to run. If the filter is not given all tests are executed. +GTEST_DECLARE_string_(filter); + +// This flag controls whether Google Test installs a signal handler that dumps +// debugging information when fatal signals are raised. +GTEST_DECLARE_bool_(install_failure_signal_handler); + +// This flag causes the Google Test to list tests. None of the tests listed +// are actually run if the flag is provided. +GTEST_DECLARE_bool_(list_tests); + +// This flag controls whether Google Test emits a detailed XML report to a file +// in addition to its normal textual output. +GTEST_DECLARE_string_(output); + +// This flags control whether Google Test prints the elapsed time for each +// test. +GTEST_DECLARE_bool_(print_time); + +// This flags control whether Google Test prints UTF8 characters as text. +GTEST_DECLARE_bool_(print_utf8); + +// This flag specifies the random number seed. +GTEST_DECLARE_int32_(random_seed); + +// This flag sets how many times the tests are repeated. The default value +// is 1. If the value is -1 the tests are repeating forever. +GTEST_DECLARE_int32_(repeat); + +// This flag controls whether Google Test includes Google Test internal +// stack frames in failure stack traces. +GTEST_DECLARE_bool_(show_internal_stack_frames); + +// When this flag is specified, tests' order is randomized on every iteration. +GTEST_DECLARE_bool_(shuffle); + +// This flag specifies the maximum number of stack frames to be +// printed in a failure message. +GTEST_DECLARE_int32_(stack_trace_depth); + +// When this flag is specified, a failed assertion will throw an +// exception if exceptions are enabled, or exit the program with a +// non-zero code otherwise. For use with an external test framework. +GTEST_DECLARE_bool_(throw_on_failure); + +// When this flag is set with a "host:port" string, on supported +// platforms test results are streamed to the specified port on +// the specified host machine. +GTEST_DECLARE_string_(stream_result_to); + +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DECLARE_string_(flagfile); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +// The upper limit for valid stack trace depths. +const int kMaxStackTraceDepth = 100; + +namespace internal { + +class AssertHelper; +class DefaultGlobalTestPartResultReporter; +class ExecDeathTest; +class NoExecDeathTest; +class FinalSuccessChecker; +class GTestFlagSaver; +class StreamingListenerTest; +class TestResultAccessor; +class TestEventListenersAccessor; +class TestEventRepeater; +class UnitTestRecordPropertyTestHelper; +class WindowsDeathTest; +class FuchsiaDeathTest; +class UnitTestImpl *GetUnitTestImpl(); +void ReportFailureInUnknownLocation(TestPartResult::Type result_type, + const std::string &message); +std::set *GetIgnoredParameterizedTestSuites(); + +} // namespace internal + +// The friend relationship of some of these classes is cyclic. +// If we don't forward declare them the compiler might confuse the classes +// in friendship clauses with same named classes on the scope. 
+class Test; +class TestSuite; + +// Old API is still available but deprecated +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +using TestCase = TestSuite; +#endif +class TestInfo; +class UnitTest; + +// A class for indicating whether an assertion was successful. When +// the assertion wasn't successful, the AssertionResult object +// remembers a non-empty message that describes how it failed. +// +// To create an instance of this class, use one of the factory functions +// (AssertionSuccess() and AssertionFailure()). +// +// This class is useful for two purposes: +// 1. Defining predicate functions to be used with Boolean test assertions +// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts +// 2. Defining predicate-format functions to be +// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). +// +// For example, if you define IsEven predicate: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) +// will print the message +// +// Value of: IsEven(Fib(5)) +// Actual: false (5 is odd) +// Expected: true +// +// instead of a more opaque +// +// Value of: IsEven(Fib(5)) +// Actual: false +// Expected: true +// +// in case IsEven is a simple Boolean predicate. +// +// If you expect your predicate to be reused and want to support informative +// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up +// about half as often as positive ones in our tests), supply messages for +// both success and failure cases: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess() << n << " is even"; +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print +// +// Value of: IsEven(Fib(6)) +// Actual: true (8 is even) +// Expected: false +// +// NB: Predicates that support negative Boolean assertions have reduced +// performance in positive ones so be careful not to use them in tests +// that have lots (tens of thousands) of positive Boolean assertions. +// +// To use this class with EXPECT_PRED_FORMAT assertions such as: +// +// // Verifies that Foo() returns an even number. +// EXPECT_PRED_FORMAT1(IsEven, Foo()); +// +// you need to define: +// +// testing::AssertionResult IsEven(const char* expr, int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() +// << "Expected: " << expr << " is even\n Actual: it's " << n; +// } +// +// If Foo() returns 5, you will see the following message: +// +// Expected: Foo() is even +// Actual: it's 5 +// +class GTEST_API_ AssertionResult { + public: + // Copy constructor. + // Used in EXPECT_TRUE/FALSE(assertion_result). + AssertionResult(const AssertionResult &other); + +// C4800 is a level 3 warning in Visual Studio 2015 and earlier. +// This warning is not emitted in Visual Studio 2017. +// This warning is off by default starting in Visual Studio 2019 but can be +// enabled with command-line options. +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) +#endif + + // Used in the EXPECT_TRUE/FALSE(bool_expression). + // + // T must be contextually convertible to bool. 
+ // + // The second parameter prevents this overload from being considered if + // the argument is implicitly convertible to AssertionResult. In that case + // we want AssertionResult's copy constructor to be used. + template + explicit AssertionResult( + const T &success, + typename std::enable_if< + !std::is_convertible::value>::type * + /*enabler*/ + = nullptr) + : success_(success) {} + +#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920) + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + + // Assignment operator. + AssertionResult &operator=(AssertionResult other) { + swap(other); + return *this; + } + + // Returns true if and only if the assertion succeeded. + operator bool() const { return success_; } // NOLINT + + // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. + AssertionResult operator!() const; + + // Returns the text streamed into this AssertionResult. Test assertions + // use it when they fail (i.e., the predicate's outcome doesn't match the + // assertion's expectation). When nothing has been streamed into the + // object, returns an empty string. + const char *message() const { + return message_.get() != nullptr ? message_->c_str() : ""; + } + // Deprecated; please use message() instead. + const char *failure_message() const { return message(); } + + // Streams a custom failure message into this object. + template + AssertionResult &operator<<(const T &value) { + AppendMessage(Message() << value); + return *this; + } + + // Allows streaming basic output manipulators such as endl or flush into + // this object. + AssertionResult &operator<<( + ::std::ostream &(*basic_manipulator)(::std::ostream &stream)) { + AppendMessage(Message() << basic_manipulator); + return *this; + } + + private: + // Appends the contents of message to message_. + void AppendMessage(const Message &a_message) { + if (message_.get() == nullptr) message_.reset(new ::std::string); + message_->append(a_message.GetString().c_str()); + } + + // Swap the contents of this AssertionResult with other. + void swap(AssertionResult &other); + + // Stores result of the assertion predicate. + bool success_; + // Stores the message describing the condition in case the expectation + // construct is not satisfied with the predicate's outcome. + // Referenced via a pointer to avoid taking too much stack frame space + // with test assertions. + std::unique_ptr< ::std::string> message_; +}; + +// Makes a successful assertion result. +GTEST_API_ AssertionResult AssertionSuccess(); + +// Makes a failed assertion result. +GTEST_API_ AssertionResult AssertionFailure(); + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << msg. +GTEST_API_ AssertionResult AssertionFailure(const Message &msg); + +} // namespace testing + +// Includes the auto-generated header that implements a family of generic +// predicate assertion macros. This include comes late because it relies on +// APIs declared above. +#include "gtest/gtest_pred_impl.h" + +namespace testing { + +// The abstract class that all tests inherit from. +// +// In Google Test, a unit test program contains one or many TestSuites, and +// each TestSuite contains one or many Tests. +// +// When you define a test using the TEST macro, you don't need to +// explicitly derive from Test - the TEST macro automatically does +// this for you. +// +// The only time you derive from Test is when defining a test fixture +// to be used in a TEST_F. 
For example: +// +// class FooTest : public testing::Test { +// protected: +// void SetUp() override { ... } +// void TearDown() override { ... } +// ... +// }; +// +// TEST_F(FooTest, Bar) { ... } +// TEST_F(FooTest, Baz) { ... } +// +// Test is not copyable. +class GTEST_API_ Test { + public: + friend class TestInfo; + + // The d'tor is virtual as we intend to inherit from Test. + virtual ~Test(); + + // Sets up the stuff shared by all tests in this test case. + // + // Google Test will call Foo::SetUpTestSuite() before running the first + // test in test case Foo. Hence a sub-class can define its own + // SetUpTestSuite() method to shadow the one defined in the super + // class. + static void SetUpTestSuite() {} + + // Tears down the stuff shared by all tests in this test suite. + // + // Google Test will call Foo::TearDownTestSuite() after running the last + // test in test case Foo. Hence a sub-class can define its own + // TearDownTestSuite() method to shadow the one defined in the super + // class. + static void TearDownTestSuite() {} + + // Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + static void TearDownTestCase() {} + static void SetUpTestCase() {} +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + + // Returns true if and only if the current test has a fatal failure. + static bool HasFatalFailure(); + + // Returns true if and only if the current test has a non-fatal failure. + static bool HasNonfatalFailure(); + + // Returns true if and only if the current test was skipped. + static bool IsSkipped(); + + // Returns true if and only if the current test has a (either fatal or + // non-fatal) failure. + static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); } + + // Logs a property for the current test, test suite, or for the entire + // invocation of the test program when used outside of the context of a + // test suite. Only the last value for a given key is remembered. These + // are public static so they can be called from utility functions that are + // not members of the test fixture. Calls to RecordProperty made during + // lifespan of the test (from the moment its constructor starts to the + // moment its destructor finishes) will be output in XML as attributes of + // the element. Properties recorded from fixture's + // SetUpTestSuite or TearDownTestSuite are logged as attributes of the + // corresponding element. Calls to RecordProperty made in the + // global context (before or after invocation of RUN_ALL_TESTS and from + // SetUp/TearDown method of Environment objects registered with Google + // Test) will be output as attributes of the element. + static void RecordProperty(const std::string &key, const std::string &value); + static void RecordProperty(const std::string &key, int value); + + protected: + // Creates a Test object. + Test(); + + // Sets up the test fixture. + virtual void SetUp(); + + // Tears down the test fixture. + virtual void TearDown(); + + private: + // Returns true if and only if the current test has the same fixture class + // as the first test in the current test suite. + static bool HasSameFixtureClass(); + + // Runs the test after the test fixture has been set up. + // + // A sub-class must implement this to define the test logic. + // + // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM. + // Instead, use the TEST or TEST_F macro. + virtual void TestBody() = 0; + + // Sets up, executes, and tears down the test. + void Run(); + + // Deletes self. 
We deliberately pick an unusual name for this + // internal method to avoid clashing with names used in user TESTs. + void DeleteSelf_() { delete this; } + + const std::unique_ptr gtest_flag_saver_; + + // Often a user misspells SetUp() as Setup() and spends a long time + // wondering why it is never called by Google Test. The declaration of + // the following method is solely for catching such an error at + // compile time: + // + // - The return type is deliberately chosen to be not void, so it + // will be a conflict if void Setup() is declared in the user's + // test fixture. + // + // - This method is private, so it will be another compiler error + // if the method is called from the user's test fixture. + // + // DO NOT OVERRIDE THIS FUNCTION. + // + // If you see an error about overriding the following function or + // about it being private, you have mis-spelled SetUp() as Setup(). + struct Setup_should_be_spelled_SetUp {}; + virtual Setup_should_be_spelled_SetUp *Setup() { return nullptr; } + + // We disallow copying Tests. + GTEST_DISALLOW_COPY_AND_ASSIGN_(Test); +}; + +typedef internal::TimeInMillis TimeInMillis; + +// A copyable object representing a user specified test property which can be +// output as a key/value string pair. +// +// Don't inherit from TestProperty as its destructor is not virtual. +class TestProperty { + public: + // C'tor. TestProperty does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestProperty object. + TestProperty(const std::string &a_key, const std::string &a_value) + : key_(a_key), value_(a_value) {} + + // Gets the user supplied key. + const char *key() const { return key_.c_str(); } + + // Gets the user supplied value. + const char *value() const { return value_.c_str(); } + + // Sets a new value, overriding the one supplied in the constructor. + void SetValue(const std::string &new_value) { value_ = new_value; } + + private: + // The key supplied by the user. + std::string key_; + // The value supplied by the user. + std::string value_; +}; + +// The result of a single Test. This includes a list of +// TestPartResults, a list of TestProperties, a count of how many +// death tests there are in the Test, and how much time it took to run +// the Test. +// +// TestResult is not copyable. +class GTEST_API_ TestResult { + public: + // Creates an empty TestResult. + TestResult(); + + // D'tor. Do not inherit from TestResult. + ~TestResult(); + + // Gets the number of all test parts. This is the sum of the number + // of successful test parts and the number of failed test parts. + int total_part_count() const; + + // Returns the number of the test properties. + int test_property_count() const; + + // Returns true if and only if the test passed (i.e. no test part failed). + bool Passed() const { return !Skipped() && !Failed(); } + + // Returns true if and only if the test was skipped. + bool Skipped() const; + + // Returns true if and only if the test failed. + bool Failed() const; + + // Returns true if and only if the test fatally failed. + bool HasFatalFailure() const; + + // Returns true if and only if the test has a non-fatal failure. + bool HasNonfatalFailure() const; + + // Returns the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Gets the time of the test case start, in ms from the start of the + // UNIX epoch. 
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Returns the i-th test part result among all the results.  i can range
+  // from 0 to total_part_count() - 1.  If i is not in that range, aborts
+  // the program.
+  const TestPartResult &GetTestPartResult(int i) const;
+
+  // Returns the i-th test property.  i can range from 0 to
+  // test_property_count() - 1.  If i is not in that range, aborts the
+  // program.
+  const TestProperty &GetTestProperty(int i) const;
+
+ private:
+  friend class TestInfo;
+  friend class TestSuite;
+  friend class UnitTest;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::ExecDeathTest;
+  friend class internal::TestResultAccessor;
+  friend class internal::UnitTestImpl;
+  friend class internal::WindowsDeathTest;
+  friend class internal::FuchsiaDeathTest;
+
+  // Gets the vector of TestPartResults.
+  const std::vector<TestPartResult> &test_part_results() const {
+    return test_part_results_;
+  }
+
+  // Gets the vector of TestProperties.
+  const std::vector<TestProperty> &test_properties() const {
+    return test_properties_;
+  }
+
+  // Sets the start time.
+  void set_start_timestamp(TimeInMillis start) { start_timestamp_ = start; }
+
+  // Sets the elapsed time.
+  void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; }
+
+  // Adds a test property to the list.  The property is validated and may add
+  // a non-fatal failure if invalid (e.g., if it conflicts with reserved
+  // key names).  If a property is already recorded for the same key, the
+  // value will be updated, rather than storing multiple values for the same
+  // key.  xml_element specifies the element for which the property is being
+  // recorded and is used for validation.
+  void RecordProperty(const std::string &xml_element,
+                      const TestProperty &test_property);
+
+  // Adds a failure if the key is a reserved attribute of Google Test
+  // testsuite tags.  Returns true if the property is valid.
+  // FIXME: Validate attribute names are legal and human readable.
+  static bool ValidateTestProperty(const std::string &xml_element,
+                                   const TestProperty &test_property);
+
+  // Adds a test part result to the list.
+  void AddTestPartResult(const TestPartResult &test_part_result);
+
+  // Returns the death test count.
+  int death_test_count() const { return death_test_count_; }
+
+  // Increments the death test count, returning the new count.
+  int increment_death_test_count() { return ++death_test_count_; }
+
+  // Clears the test part results.
+  void ClearTestPartResults();
+
+  // Clears the object.
+  void Clear();
+
+  // Protects mutable state of the property vector and of owned
+  // properties, whose values may be updated.
+  internal::Mutex test_properites_mutex_;
+
+  // The vector of TestPartResults.
+  std::vector<TestPartResult> test_part_results_;
+  // The vector of TestProperties.
+  std::vector<TestProperty> test_properties_;
+  // Running count of death tests.
+  int death_test_count_;
+  // The start time, in milliseconds since UNIX Epoch.
+  TimeInMillis start_timestamp_;
+  // The elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+  // We disallow copying TestResult.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+};  // class TestResult
+
+// A TestInfo object stores the following information about a test:
+//
+//   Test suite name
+//   Test name
+//   Whether the test should be run
+//   A function pointer that creates the test object when invoked
+//   Test result
+//
+// The constructor of TestInfo registers itself with the UnitTest
+// singleton such that the RUN_ALL_TESTS() macro knows which tests to
+// run.
+class GTEST_API_ TestInfo {
+ public:
+  // Destructs a TestInfo object.  This function is not virtual, so
+  // don't inherit from TestInfo.
+  ~TestInfo();
+
+  // Returns the test suite name.
+  const char *test_suite_name() const { return test_suite_name_.c_str(); }
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const char *test_case_name() const { return test_suite_name(); }
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Returns the test name.
+  const char *name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a typed
+  // or a type-parameterized test.
+  const char *type_param() const {
+    if (type_param_.get() != nullptr) return type_param_->c_str();
+    return nullptr;
+  }
+
+  // Returns the text representation of the value parameter, or NULL if this
+  // is not a value-parameterized test.
+  const char *value_param() const {
+    if (value_param_.get() != nullptr) return value_param_->c_str();
+    return nullptr;
+  }
+
+  // Returns the file name where this test is defined.
+  const char *file() const { return location_.file.c_str(); }
+
+  // Returns the line where this test is defined.
+  int line() const { return location_.line; }
+
+  // Returns true if this test should not be run because it's in another shard.
+  bool is_in_another_shard() const { return is_in_another_shard_; }
+
+  // Returns true if this test should run, that is, if the test is not
+  // disabled (or it is disabled but the also_run_disabled_tests flag has
+  // been specified) and its full name matches the user-specified filter.
+  //
+  // Google Test allows the user to filter the tests by their full names.
+  // The full name of a test Bar in test suite Foo is defined as
+  // "Foo.Bar".  Only the tests that match the filter will run.
+  //
+  // A filter is a colon-separated list of glob (not regex) patterns,
+  // optionally followed by a '-' and a colon-separated list of
+  // negative patterns (tests to exclude).  A test is run if it
+  // matches one of the positive patterns and does not match any of
+  // the negative patterns.
+  //
+  // For example, *A*:Foo.* is a filter that matches any string that
+  // contains the character 'A' or starts with "Foo.".
+  bool should_run() const { return should_run_; }
+
+  // Returns true if and only if this test will appear in the XML report.
+  bool is_reportable() const {
+    // The XML report includes tests matching the filter, excluding those
+    // run in other shards.
+    return matches_filter_ && !is_in_another_shard_;
+  }
+
+  // Returns the result of the test.
+  const TestResult *result() const { return &result_; }
+
+ private:
+#if GTEST_HAS_DEATH_TEST
+  friend class internal::DefaultDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+  friend class Test;
+  friend class TestSuite;
+  friend class internal::UnitTestImpl;
+  friend class internal::StreamingListenerTest;
+  friend TestInfo *internal::MakeAndRegisterTestInfo(
+      const char *test_suite_name, const char *name, const char *type_param,
+      const char *value_param, internal::CodeLocation code_location,
+      internal::TypeId fixture_class_id, internal::SetUpTestSuiteFunc set_up_tc,
+      internal::TearDownTestSuiteFunc tear_down_tc,
+      internal::TestFactoryBase *factory);
+
+  // Constructs a TestInfo object.  The newly constructed instance assumes
+  // ownership of the factory object.
+  TestInfo(const std::string &test_suite_name, const std::string &name,
+           const char *a_type_param,   // NULL if not a type-parameterized test
+           const char *a_value_param,  // NULL if not a value-parameterized test
+           internal::CodeLocation a_code_location,
+           internal::TypeId fixture_class_id,
+           internal::TestFactoryBase *factory);
+
+  // Increments the number of death tests encountered in this test so
+  // far.
+  int increment_death_test_count() {
+    return result_.increment_death_test_count();
+  }
+
+  // Creates the test object, runs it, records its result, and then
+  // deletes it.
+  void Run();
+
+  static void ClearTestResult(TestInfo *test_info) {
+    test_info->result_.Clear();
+  }
+
+  // These fields are immutable properties of the test.
+  const std::string test_suite_name_;  // test suite name
+  const std::string name_;             // Test name
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const std::unique_ptr<const ::std::string> type_param_;
+  // Text representation of the value parameter, or NULL if this is not a
+  // value-parameterized test.
+  const std::unique_ptr<const ::std::string> value_param_;
+  internal::CodeLocation location_;
+  const internal::TypeId fixture_class_id_;  // ID of the test fixture class
+  bool should_run_;           // True if and only if this test should run
+  bool is_disabled_;          // True if and only if this test is disabled
+  bool matches_filter_;       // True if this test matches the
+                              // user-specified filter.
+  bool is_in_another_shard_;  // Will be run in another shard.
+  internal::TestFactoryBase *const factory_;  // The factory that creates
+                                              // the test object
+
+  // This field is mutable and needs to be reset before running the
+  // test for the second time.
+  TestResult result_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+};
+
+// A test suite, which consists of a vector of TestInfos.
+//
+// TestSuite is not copyable.
+class GTEST_API_ TestSuite {
+ public:
+  // Creates a TestSuite with the given name.
+  //
+  // TestSuite does NOT have a default constructor.  Always use this
+  // constructor to create a TestSuite object.
+  //
+  // Arguments:
+  //
+  //   name:          name of the test suite
+  //   a_type_param:  the name of the test's type parameter, or NULL if
+  //                  this is not a type-parameterized test.
+  //   set_up_tc:     pointer to the function that sets up the test suite
+  //   tear_down_tc:  pointer to the function that tears down the test suite
+  TestSuite(const char *name, const char *a_type_param,
+            internal::SetUpTestSuiteFunc set_up_tc,
+            internal::TearDownTestSuiteFunc tear_down_tc);
+
+  // Destructor of TestSuite.
+  virtual ~TestSuite();
+
+  // Gets the name of the TestSuite.
+  const char *name() const { return name_.c_str(); }
+
+  // Returns the name of the parameter type, or NULL if this is not a
+  // type-parameterized test suite.
+  const char *type_param() const {
+    if (type_param_.get() != nullptr) return type_param_->c_str();
+    return nullptr;
+  }
+
+  // Returns true if any test in this test suite should run.
+  bool should_run() const { return should_run_; }
+
+  // Gets the number of successful tests in this test suite.
+  int successful_test_count() const;
+
+  // Gets the number of skipped tests in this test suite.
+  int skipped_test_count() const;
+
+  // Gets the number of failed tests in this test suite.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests in this test suite.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of tests in this test suite that should run.
+  int test_to_run_count() const;
+
+  // Gets the number of all tests in this test suite.
+  int total_test_count() const;
+
+  // Returns true if and only if the test suite passed.
+  bool Passed() const { return !Failed(); }
+
+  // Returns true if and only if the test suite failed.
+  bool Failed() const {
+    return failed_test_count() > 0 || ad_hoc_test_result().Failed();
+  }
+
+  // Returns the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Gets the time of the test suite start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Returns the i-th test among all the tests.  i can range from 0 to
+  // total_test_count() - 1.  If i is not in that range, returns NULL.
+  const TestInfo *GetTestInfo(int i) const;
+
+  // Returns the TestResult that holds test properties recorded during
+  // execution of SetUpTestSuite and TearDownTestSuite.
+  const TestResult &ad_hoc_test_result() const { return ad_hoc_test_result_; }
+
+ private:
+  friend class Test;
+  friend class internal::UnitTestImpl;
+
+  // Gets the (mutable) vector of TestInfos in this TestSuite.
+  std::vector<TestInfo *> &test_info_list() { return test_info_list_; }
+
+  // Gets the (immutable) vector of TestInfos in this TestSuite.
+  const std::vector<TestInfo *> &test_info_list() const {
+    return test_info_list_;
+  }
+
+  // Returns the i-th test among all the tests.  i can range from 0 to
+  // total_test_count() - 1.  If i is not in that range, returns NULL.
+  TestInfo *GetMutableTestInfo(int i);
+
+  // Sets the should_run member.
+  void set_should_run(bool should) { should_run_ = should; }
+
+  // Adds a TestInfo to this test suite.  Will delete the TestInfo upon
+  // destruction of the TestSuite object.
+  void AddTestInfo(TestInfo *test_info);
+
+  // Clears the results of all tests in this test suite.
+  void ClearResult();
+
+  // Clears the results of all tests in the given test suite.
+  static void ClearTestSuiteResult(TestSuite *test_suite) {
+    test_suite->ClearResult();
+  }
+
+  // Runs every test in this TestSuite.
+  void Run();
+
+  // Runs SetUpTestSuite() for this TestSuite.  This wrapper is needed
+  // for catching exceptions thrown from SetUpTestSuite().
+  void RunSetUpTestSuite() {
+    if (set_up_tc_ != nullptr) {
+      (*set_up_tc_)();
+    }
+  }
+
+  // Runs TearDownTestSuite() for this TestSuite.  This wrapper is
+  // needed for catching exceptions thrown from TearDownTestSuite().
+  void RunTearDownTestSuite() {
+    if (tear_down_tc_ != nullptr) {
+      (*tear_down_tc_)();
+    }
+  }
+
+  // Returns true if and only if the test passed.
+  static bool TestPassed(const TestInfo *test_info) {
+    return test_info->should_run() && test_info->result()->Passed();
+  }
+
+  // Returns true if and only if the test was skipped.
+  static bool TestSkipped(const TestInfo *test_info) {
+    return test_info->should_run() && test_info->result()->Skipped();
+  }
+
+  // Returns true if and only if the test failed.
+  static bool TestFailed(const TestInfo *test_info) {
+    return test_info->should_run() && test_info->result()->Failed();
+  }
+
+  // Returns true if and only if the test is disabled and will be reported in
+  // the XML report.
+  static bool TestReportableDisabled(const TestInfo *test_info) {
+    return test_info->is_reportable() && test_info->is_disabled_;
+  }
+
+  // Returns true if and only if the test is disabled.
+  static bool TestDisabled(const TestInfo *test_info) {
+    return test_info->is_disabled_;
+  }
+
+  // Returns true if and only if this test will appear in the XML report.
+  static bool TestReportable(const TestInfo *test_info) {
+    return test_info->is_reportable();
+  }
+
+  // Returns true if the given test should run.
+  static bool ShouldRunTest(const TestInfo *test_info) {
+    return test_info->should_run();
+  }
+
+  // Shuffles the tests in this test suite.
+  void ShuffleTests(internal::Random *random);
+
+  // Restores the test order to before the first shuffle.
+  void UnshuffleTests();
+
+  // Name of the test suite.
+  std::string name_;
+  // Name of the parameter type, or NULL if this is not a typed or a
+  // type-parameterized test.
+  const std::unique_ptr<const ::std::string> type_param_;
+  // The vector of TestInfos in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestInfo *> test_info_list_;
+  // Provides a level of indirection for the test list to allow easy
+  // shuffling and restoring the test order.  The i-th element in this
+  // vector is the index of the i-th test in the shuffled test list.
+  std::vector<int> test_indices_;
+  // Pointer to the function that sets up the test suite.
+  internal::SetUpTestSuiteFunc set_up_tc_;
+  // Pointer to the function that tears down the test suite.
+  internal::TearDownTestSuiteFunc tear_down_tc_;
+  // True if and only if any test in this test suite should run.
+  bool should_run_;
+  // The start time, in milliseconds since UNIX Epoch.
+  TimeInMillis start_timestamp_;
+  // Elapsed time, in milliseconds.
+  TimeInMillis elapsed_time_;
+  // Holds test properties recorded during execution of SetUpTestSuite and
+  // TearDownTestSuite.
+  TestResult ad_hoc_test_result_;
+
+  // We disallow copying TestSuites.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestSuite);
+};
+
+// An Environment object is capable of setting up and tearing down an
+// environment.  You should subclass this to define your own
+// environment(s).
+//
+// An Environment object does the set-up and tear-down in virtual
+// methods SetUp() and TearDown() instead of the constructor and the
+// destructor, as:
+//
+//   1. You cannot safely throw from a destructor.  This is a problem
+//      as in some cases Google Test is used where exceptions are
+//      enabled, and we may want to implement ASSERT_* using exceptions
+//      where they are available.
+//   2. You cannot use ASSERT_* directly in a constructor or
+//      destructor.
+class Environment {
+ public:
+  // The d'tor is virtual as we need to subclass Environment.
+  virtual ~Environment() {}
+
+  // Override this to define how to set up the environment.
+  virtual void SetUp() {}
+
+  // Override this to define how to tear down the environment.
+  virtual void TearDown() {}
+
+ private:
+  // If you see an error about overriding the following function or
+  // about it being private, you have mis-spelled SetUp() as Setup().
+  struct Setup_should_be_spelled_SetUp {};
+  virtual Setup_should_be_spelled_SetUp *Setup() { return nullptr; }
+};
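+
+// For illustration, a minimal sketch of a custom environment
+// (FooEnvironment is a hypothetical name; the set-up/tear-down bodies
+// are up to the user):
+//
+//   class FooEnvironment : public testing::Environment {
+//    public:
+//     void SetUp() override { /* acquire resources shared by all tests */ }
+//     void TearDown() override { /* release them */ }
+//   };
+//
+// See AddGlobalTestEnvironment() below for how to register one.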
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Exception which can be thrown from TestEventListener::OnTestPartResult.
+class GTEST_API_ AssertionException
+    : public internal::GoogleTestFailureException {
+ public:
+  explicit AssertionException(const TestPartResult &result)
+      : GoogleTestFailureException(result) {}
+};
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// The interface for tracing execution of tests.  The methods are organized in
+// the order the corresponding events are fired.
+class TestEventListener {
+ public:
+  virtual ~TestEventListener() {}
+
+  // Fired before any test activity starts.
+  virtual void OnTestProgramStart(const UnitTest &unit_test) = 0;
+
+  // Fired before each iteration of tests starts.  There may be more than
+  // one iteration if GTEST_FLAG(repeat) is set.  iteration is the iteration
+  // index, starting from 0.
+  virtual void OnTestIterationStart(const UnitTest &unit_test,
+                                    int iteration) = 0;
+
+  // Fired before environment set-up for each iteration of tests starts.
+  virtual void OnEnvironmentsSetUpStart(const UnitTest &unit_test) = 0;
+
+  // Fired after environment set-up for each iteration of tests ends.
+  virtual void OnEnvironmentsSetUpEnd(const UnitTest &unit_test) = 0;
+
+  // Fired before the test suite starts.
+  virtual void OnTestSuiteStart(const TestSuite & /*test_suite*/) {}
+
+  // Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  virtual void OnTestCaseStart(const TestCase & /*test_case*/) {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Fired before the test starts.
+  virtual void OnTestStart(const TestInfo &test_info) = 0;
+
+  // Fired after a failed assertion or a SUCCEED() invocation.
+  // If you want to throw an exception from this function to skip to the next
+  // TEST, it must be an AssertionException as defined above, or inherited
+  // from it.
+  virtual void OnTestPartResult(const TestPartResult &test_part_result) = 0;
+
+  // Fired after the test ends.
+  virtual void OnTestEnd(const TestInfo &test_info) = 0;
+
+  // Fired after the test suite ends.
+  virtual void OnTestSuiteEnd(const TestSuite & /*test_suite*/) {}
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  virtual void OnTestCaseEnd(const TestCase & /*test_case*/) {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Fired before environment tear-down for each iteration of tests starts.
+  virtual void OnEnvironmentsTearDownStart(const UnitTest &unit_test) = 0;
+
+  // Fired after environment tear-down for each iteration of tests ends.
+  virtual void OnEnvironmentsTearDownEnd(const UnitTest &unit_test) = 0;
+
+  // Fired after each iteration of tests finishes.
+  virtual void OnTestIterationEnd(const UnitTest &unit_test, int iteration) = 0;
+
+  // Fired after all test activities have ended.
+  virtual void OnTestProgramEnd(const UnitTest &unit_test) = 0;
+};
+
+// The convenience class for users who need to override just one or two
+// methods and are not concerned that a possible change to a signature of
+// the methods they override will not be caught during the build.  For
+// comments about each method please see the definition of TestEventListener
+// above.
+class EmptyTestEventListener : public TestEventListener {
+ public:
+  void OnTestProgramStart(const UnitTest & /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest & /*unit_test*/,
+                            int /*iteration*/) override {}
+  void OnEnvironmentsSetUpStart(const UnitTest & /*unit_test*/) override {}
+  void OnEnvironmentsSetUpEnd(const UnitTest & /*unit_test*/) override {}
+  void OnTestSuiteStart(const TestSuite & /*test_suite*/) override {}
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestCase & /*test_case*/) override {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnTestStart(const TestInfo & /*test_info*/) override {}
+  void OnTestPartResult(const TestPartResult & /*test_part_result*/) override {}
+  void OnTestEnd(const TestInfo & /*test_info*/) override {}
+  void OnTestSuiteEnd(const TestSuite & /*test_suite*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase & /*test_case*/) override {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnEnvironmentsTearDownStart(const UnitTest & /*unit_test*/) override {}
+  void OnEnvironmentsTearDownEnd(const UnitTest & /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest & /*unit_test*/,
+                          int /*iteration*/) override {}
+  void OnTestProgramEnd(const UnitTest & /*unit_test*/) override {}
+};
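+
+// For illustration, a minimal sketch of a listener that only reacts to test
+// starts (MinimalistPrinter is a hypothetical name), registered via the
+// TestEventListeners class declared just below:
+//
+//   class MinimalistPrinter : public testing::EmptyTestEventListener {
+//     void OnTestStart(const testing::TestInfo &test_info) override {
+//       printf("*** Test %s.%s starting.\n",
+//              test_info.test_suite_name(), test_info.name());
+//     }
+//   };
+//
+//   testing::UnitTest::GetInstance()->listeners().Append(
+//       new MinimalistPrinter);  // Google Test takes ownership.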
+
+// TestEventListeners lets users add listeners to track events in Google Test.
+class GTEST_API_ TestEventListeners {
+ public:
+  TestEventListeners();
+  ~TestEventListeners();
+
+  // Appends an event listener to the end of the list.  Google Test assumes
+  // ownership of the listener (i.e. it will delete the listener when
+  // the test program finishes).
+  void Append(TestEventListener *listener);
+
+  // Removes the given event listener from the list and returns it.  It then
+  // becomes the caller's responsibility to delete the listener.  Returns
+  // NULL if the listener is not found in the list.
+  TestEventListener *Release(TestEventListener *listener);
+
+  // Returns the standard listener responsible for the default console
+  // output.  Can be removed from the listeners list to shut down default
+  // console output.  Note that removing this object from the listener list
+  // with Release transfers its ownership to the caller and makes this
+  // function return NULL the next time.
+  TestEventListener *default_result_printer() const {
+    return default_result_printer_;
+  }
+
+  // Returns the standard listener responsible for the default XML output
+  // controlled by the --gtest_output=xml flag.  Can be removed from the
+  // listeners list by users who want to shut down the default XML output
+  // controlled by this flag and substitute it with a custom one.  Note that
+  // removing this object from the listener list with Release transfers its
+  // ownership to the caller and makes this function return NULL the next
+  // time.
+  TestEventListener *default_xml_generator() const {
+    return default_xml_generator_;
+  }
+
+ private:
+  friend class TestSuite;
+  friend class TestInfo;
+  friend class internal::DefaultGlobalTestPartResultReporter;
+  friend class internal::NoExecDeathTest;
+  friend class internal::TestEventListenersAccessor;
+  friend class internal::UnitTestImpl;
+
+  // Returns the repeater that broadcasts the TestEventListener events to all
+  // subscribers.
+  TestEventListener *repeater();
+
+  // Sets the default_result_printer attribute to the provided listener.
+  // The listener is also added to the listener list, and the previous
+  // default_result_printer is removed from it and deleted.  The listener can
+  // also be NULL, in which case it will not be added to the list.  Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultResultPrinter(TestEventListener *listener);
+
+  // Sets the default_xml_generator attribute to the provided listener.  The
+  // listener is also added to the listener list, and the previous
+  // default_xml_generator is removed from it and deleted.  The listener can
+  // also be NULL, in which case it will not be added to the list.  Does
+  // nothing if the previous and the current listener objects are the same.
+  void SetDefaultXmlGenerator(TestEventListener *listener);
+
+  // Controls whether events will be forwarded by the repeater to the
+  // listeners in the list.
+  bool EventForwardingEnabled() const;
+  void SuppressEventForwarding();
+
+  // The actual list of listeners.
+  internal::TestEventRepeater *repeater_;
+  // Listener responsible for the standard result output.
+  TestEventListener *default_result_printer_;
+  // Listener responsible for the creation of the XML output file.
+  TestEventListener *default_xml_generator_;
+
+  // We disallow copying TestEventListeners.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+};
+
+// A UnitTest consists of a vector of TestSuites.
+//
+// This is a singleton class.  The only instance of UnitTest is
+// created when UnitTest::GetInstance() is first called.  This
+// instance is never deleted.
+//
+// UnitTest is not copyable.
+//
+// This class is thread-safe as long as the methods are called
+// according to their specification.
+class GTEST_API_ UnitTest {
+ public:
+  // Gets the singleton UnitTest object.  The first time this method
+  // is called, a UnitTest object is constructed and returned.
+  // Consecutive calls will return the same object.
+  static UnitTest *GetInstance();
+
+  // Runs all tests in this UnitTest object and prints the result.
+  // Returns 0 if successful, or 1 otherwise.
+  //
+  // This method can only be called from the main thread.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  int Run() GTEST_MUST_USE_RESULT_;
+
+  // Returns the working directory when the first TEST() or TEST_F()
+  // was executed.  The UnitTest object owns the string.
+  const char *original_working_dir() const;
+
+  // Returns the TestSuite object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestSuite *current_test_suite() const GTEST_LOCK_EXCLUDED_(mutex_);
+
+// Legacy API is still available but deprecated
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const TestCase *current_test_case() const GTEST_LOCK_EXCLUDED_(mutex_);
+#endif
+
+  // Returns the TestInfo object for the test that's currently running,
+  // or NULL if no test is running.
+  const TestInfo *current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Returns the random seed used at the start of the current test run.
+  int random_seed() const;
+
+  // Returns the ParameterizedTestSuiteRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  //
+  // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+  internal::ParameterizedTestSuiteRegistry &parameterized_test_registry()
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Gets the number of successful test suites.
+  int successful_test_suite_count() const;
+
+  // Gets the number of failed test suites.
+  int failed_test_suite_count() const;
+
+  // Gets the number of all test suites.
+  int total_test_suite_count() const;
+
+  // Gets the number of all test suites that contain at least one test
+  // that should run.
+  int test_suite_to_run_count() const;
+
+  // Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  int successful_test_case_count() const;
+  int failed_test_case_count() const;
+  int total_test_case_count() const;
+  int test_case_to_run_count() const;
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of skipped tests.
+  int skipped_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const;
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const;
+
+  // Returns true if and only if the unit test passed (i.e. all test suites
+  // passed).
+  bool Passed() const;
+
+  // Returns true if and only if the unit test failed (i.e. some test suite
+  // failed or something outside of all tests failed).
+  bool Failed() const;
+
+  // Gets the i-th test suite among all the test suites.  i can range from 0 to
+  // total_test_suite_count() - 1.  If i is not in that range, returns NULL.
+  const TestSuite *GetTestSuite(int i) const;
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const TestCase *GetTestCase(int i) const;
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Returns the TestResult containing information on test failures and
+  // properties logged outside of individual test suites.
+  const TestResult &ad_hoc_test_result() const;
+
+  // Returns the list of event listeners that can be used to track events
+  // inside Google Test.
+  TestEventListeners &listeners();
+
+ private:
+  // Registers and returns a global test environment.  When a test
+  // program is run, all global test environments will be set-up in
+  // the order they were registered.  After all tests in the program
+  // have finished, all global test environments will be torn-down in
+  // the *reverse* order they were registered.
+  //
+  // The UnitTest object takes ownership of the given environment.
+  //
+  // This method can only be called from the main thread.
+  Environment *AddEnvironment(Environment *env);
+
+  // Adds a TestPartResult to the current TestResult object.  All
+  // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc)
+  // eventually call this to report their results.  The user code
+  // should use the assertion macros instead of calling this directly.
+  void AddTestPartResult(TestPartResult::Type result_type,
+                         const char *file_name, int line_number,
+                         const std::string &message,
+                         const std::string &os_stack_trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Adds a TestProperty to the current TestResult object when invoked from
+  // inside a test, to the current TestSuite's ad_hoc_test_result_ when invoked
+  // from SetUpTestSuite or TearDownTestSuite, or to the global property set
+  // when invoked elsewhere.  If the result already contains a property with
+  // the same key, the value will be updated.
+  void RecordProperty(const std::string &key, const std::string &value);
+
+  // Gets the i-th test suite among all the test suites.  i can range from 0 to
+  // total_test_suite_count() - 1.  If i is not in that range, returns NULL.
+  TestSuite *GetMutableTestSuite(int i);
+
+  // Accessors for the implementation object.
+  internal::UnitTestImpl *impl() { return impl_; }
+  const internal::UnitTestImpl *impl() const { return impl_; }
+
+  // These classes and functions are friends as they need to access private
+  // members of UnitTest.
+  friend class ScopedTrace;
+  friend class Test;
+  friend class internal::AssertHelper;
+  friend class internal::StreamingListenerTest;
+  friend class internal::UnitTestRecordPropertyTestHelper;
+  friend Environment *AddGlobalTestEnvironment(Environment *env);
+  friend std::set<std::string> *internal::GetIgnoredParameterizedTestSuites();
+  friend internal::UnitTestImpl *internal::GetUnitTestImpl();
+  friend void internal::ReportFailureInUnknownLocation(
+      TestPartResult::Type result_type, const std::string &message);
+
+  // Creates an empty UnitTest.
+  UnitTest();
+
+  // D'tor
+  virtual ~UnitTest();
+
+  // Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+  // Google Test trace stack.
+  void PushGTestTrace(const internal::TraceInfo &trace)
+      GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Pops a trace from the per-thread Google Test trace stack.
+  void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_);
+
+  // Protects mutable state in *impl_.  This is mutable as some const
+  // methods need to lock it too.
+  mutable internal::Mutex mutex_;
+
+  // Opaque implementation object.  This field is never changed once
+  // the object is constructed.  We don't mark it as const here, as
+  // doing so will cause a warning in the constructor of UnitTest.
+  // Mutable state in *impl_ is protected by mutex_.
+  internal::UnitTestImpl *impl_;
+
+  // We disallow copying UnitTest.
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+};
+
+// A convenient wrapper for adding an environment for the test
+// program.
+//
+// You should call this before RUN_ALL_TESTS() is called, probably in
+// main().  If you use gtest_main, you need to call this before main()
+// starts for it to take effect.  For example, you can define a global
+// variable like this:
+//
+//   testing::Environment* const foo_env =
+//       testing::AddGlobalTestEnvironment(new FooEnvironment);
+//
+// However, we strongly recommend that you write your own main() and
+// call AddGlobalTestEnvironment() there, as relying on initialization
+// of global variables makes the code harder to read and may cause
+// problems when you register multiple environments from different
+// translation units and the environments have dependencies among them
+// (remember that the compiler doesn't guarantee the order in which
+// global variables from different translation units are initialized).
+inline Environment *AddGlobalTestEnvironment(Environment *env) {
+  return UnitTest::GetInstance()->AddEnvironment(env);
+}
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+GTEST_API_ void InitGoogleTest(int *argc, char **argv);
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+GTEST_API_ void InitGoogleTest(int *argc, wchar_t **argv);
+
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
+GTEST_API_ void InitGoogleTest();
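+
+// For illustration, the typical way to wire this up in a user-written
+// main() (FooEnvironment is a hypothetical Environment subclass; the
+// AddGlobalTestEnvironment call is optional):
+//
+//   int main(int argc, char **argv) {
+//     testing::InitGoogleTest(&argc, argv);
+//     testing::AddGlobalTestEnvironment(new FooEnvironment);
+//     return RUN_ALL_TESTS();
+//   }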
+
+namespace internal {
+
+// Separate the error generating code from the code path to reduce the stack
+// frame size of CmpHelperEQ.  This helps reduce the overhead of some
+// sanitizers when calling EXPECT_* in a tight loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQFailure(const char *lhs_expression,
+                                   const char *rhs_expression, const T1 &lhs,
+                                   const T2 &rhs) {
+  return EqFailure(lhs_expression, rhs_expression,
+                   FormatForComparisonFailureMessage(lhs, rhs),
+                   FormatForComparisonFailureMessage(rhs, lhs), false);
+}
+
+// This block of code defines operator==/!=
+// to block lexical scope lookup.
+// It prevents using invalid operator==/!= defined at namespace scope.
+struct faketype {};
+inline bool operator==(faketype, faketype) { return true; }
+inline bool operator!=(faketype, faketype) { return false; }
+
+// The helper function for {ASSERT|EXPECT}_EQ.
+template <typename T1, typename T2>
+AssertionResult CmpHelperEQ(const char *lhs_expression,
+                            const char *rhs_expression, const T1 &lhs,
+                            const T2 &rhs) {
+  if (lhs == rhs) {
+    return AssertionSuccess();
+  }
+
+  return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs);
+}
+
+// With this overloaded version, we allow anonymous enums to be used
+// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
+// can be implicitly cast to BiggestInt.
+GTEST_API_ AssertionResult CmpHelperEQ(const char *lhs_expression,
+                                       const char *rhs_expression,
+                                       BiggestInt lhs, BiggestInt rhs);
+
+class EqHelper {
+ public:
+  // This templatized version is for the general case.
+  template <
+      typename T1, typename T2,
+      // Disable this overload for cases where one argument is a pointer
+      // and the other is the null pointer constant.
+      typename std::enable_if<!std::is_integral<T1>::value ||
+                              !std::is_pointer<T2>::value>::type * = nullptr>
+  static AssertionResult Compare(const char *lhs_expression,
+                                 const char *rhs_expression, const T1 &lhs,
+                                 const T2 &rhs) {
+    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+  }
+
+  // With this overloaded version, we allow anonymous enums to be used
+  // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous
+  // enums can be implicitly cast to BiggestInt.
+  //
+  // Even though its body looks the same as the above version, we
+  // cannot merge the two, as it will make anonymous enums unhappy.
+  static AssertionResult Compare(const char *lhs_expression,
+                                 const char *rhs_expression, BiggestInt lhs,
+                                 BiggestInt rhs) {
+    return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
+  }
+
+  template <typename T>
+  static AssertionResult Compare(
+      const char *lhs_expression, const char *rhs_expression,
+      // Handle cases where '0' is used as a null pointer literal.
+      std::nullptr_t /* lhs */, T *rhs) {
+    // We already know that 'lhs' is a null pointer.
+    return CmpHelperEQ(lhs_expression, rhs_expression,
+                       static_cast<T *>(nullptr), rhs);
+  }
+};
+
+// Separate the error generating code from the code path to reduce the stack
+// frame size of CmpHelperOP.  This helps reduce the overhead of some
+// sanitizers when calling EXPECT_OP in a tight loop.
+template <typename T1, typename T2>
+AssertionResult CmpHelperOpFailure(const char *expr1, const char *expr2,
+                                   const T1 &val1, const T2 &val2,
+                                   const char *op) {
+  return AssertionFailure()
+         << "Expected: (" << expr1 << ") " << op << " (" << expr2
+         << "), actual: " << FormatForComparisonFailureMessage(val1, val2)
+         << " vs " << FormatForComparisonFailureMessage(val2, val1);
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_??.  It is here just to avoid copy-and-paste
+// of similar code.
+//
+// For each templatized helper function, we also define an overloaded
+// version for BiggestInt in order to reduce code bloat and allow
+// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
+// with gcc 4.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)                                 \
+  template <typename T1, typename T2>                                      \
+  AssertionResult CmpHelper##op_name(const char *expr1, const char *expr2, \
+                                     const T1 &val1, const T2 &val2) {     \
+    if (val1 op val2) {                                                    \
+      return AssertionSuccess();                                           \
+    } else {                                                               \
+      return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);            \
+    }                                                                      \
+  }                                                                        \
+  GTEST_API_ AssertionResult CmpHelper##op_name(                           \
+      const char *expr1, const char *expr2, BiggestInt val1, BiggestInt val2)
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+
+// Implements the helper function for {ASSERT|EXPECT}_NE
+GTEST_IMPL_CMP_HELPER_(NE, !=);
+// Implements the helper function for {ASSERT|EXPECT}_LE
+GTEST_IMPL_CMP_HELPER_(LE, <=);
+// Implements the helper function for {ASSERT|EXPECT}_LT
+GTEST_IMPL_CMP_HELPER_(LT, <);
+// Implements the helper function for {ASSERT|EXPECT}_GE
+GTEST_IMPL_CMP_HELPER_(GE, >=);
+// Implements the helper function for {ASSERT|EXPECT}_GT
+GTEST_IMPL_CMP_HELPER_(GT, >);
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char *s1_expression,
+                                          const char *s2_expression,
+                                          const char *s1, const char *s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char *s1_expression,
+                                              const char *s2_expression,
+                                              const char *s1, const char *s2);
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char *s1_expression,
+                                          const char *s2_expression,
+                                          const char *s1, const char *s2);
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char *s1_expression,
+                                              const char *s2_expression,
+                                              const char *s1, const char *s2);
+
+// Helper function for *_STREQ on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTREQ(const char *s1_expression,
+                                          const char *s2_expression,
+                                          const wchar_t *s1, const wchar_t *s2);
+
+// Helper function for *_STRNE on wide strings.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult CmpHelperSTRNE(const char *s1_expression,
+                                          const char *s2_expression,
+                                          const wchar_t *s1, const wchar_t *s2);
+
+}  // namespace internal
+
+// IsSubstring() and IsNotSubstring() are intended to be used as the
+// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by
+// themselves.  They check whether needle is a substring of haystack
+// (NULL is considered a substring of itself only), and return an
+// appropriate error message when they fail.
+//
+// The {needle,haystack}_expr arguments are the stringified
+// expressions that generated the two real arguments.
+GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
+                                       const char *haystack_expr,
+                                       const char *needle,
+                                       const char *haystack);
+GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
+                                       const char *haystack_expr,
+                                       const wchar_t *needle,
+                                       const wchar_t *haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
+                                          const char *haystack_expr,
+                                          const char *needle,
+                                          const char *haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
+                                          const char *haystack_expr,
+                                          const wchar_t *needle,
+                                          const wchar_t *haystack);
+GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
+                                       const char *haystack_expr,
+                                       const ::std::string &needle,
+                                       const ::std::string &haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
+                                          const char *haystack_expr,
+                                          const ::std::string &needle,
+                                          const ::std::string &haystack);
+
+#if GTEST_HAS_STD_WSTRING
+GTEST_API_ AssertionResult IsSubstring(const char *needle_expr,
+                                       const char *haystack_expr,
+                                       const ::std::wstring &needle,
+                                       const ::std::wstring &haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char *needle_expr,
+                                          const char *haystack_expr,
+                                          const ::std::wstring &needle,
+                                          const ::std::wstring &haystack);
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+// Helper template function for comparing floating-points.
+//
+// Template parameter:
+//
+//   RawType: the raw floating-point type (either float or double)
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+template <typename RawType>
+AssertionResult CmpHelperFloatingPointEQ(const char *lhs_expression,
+                                         const char *rhs_expression,
+                                         RawType lhs_value,
+                                         RawType rhs_value) {
+  const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
+
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  ::std::stringstream lhs_ss;
+  lhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << lhs_value;
+
+  ::std::stringstream rhs_ss;
+  rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+         << rhs_value;
+
+  return EqFailure(lhs_expression, rhs_expression,
+                   StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss),
+                   false);
+}
+
+// Helper function for implementing ASSERT_NEAR.
+//
+// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
+GTEST_API_ AssertionResult DoubleNearPredFormat(const char *expr1,
+                                                const char *expr2,
+                                                const char *abs_error_expr,
+                                                double val1, double val2,
+                                                double abs_error);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+// A class that enables one to stream messages to assertion macros
+class GTEST_API_ AssertHelper {
+ public:
+  // Constructor.
+  AssertHelper(TestPartResult::Type type, const char *file, int line,
+               const char *message);
+  ~AssertHelper();
+
+  // Message assignment is a semantic trick to enable assertion
+  // streaming; see the GTEST_MESSAGE_ macro below.
+  void operator=(const Message &message) const;
+
+ private:
+  // We put our data in a struct so that the size of the AssertHelper class can
+  // be as small as possible.  This is important because gcc is incapable of
+  // re-using stack space even for temporary variables, so every EXPECT_EQ
+  // reserves stack space for another AssertHelper.
+  struct AssertHelperData {
+    AssertHelperData(TestPartResult::Type t, const char *srcfile, int line_num,
+                     const char *msg)
+        : type(t), file(srcfile), line(line_num), message(msg) {}
+
+    TestPartResult::Type const type;
+    const char *const file;
+    int const line;
+    std::string const message;
+
+   private:
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+  };
+
+  AssertHelperData *const data_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+};
+
+enum GTestColor { COLOR_DEFAULT, COLOR_RED, COLOR_GREEN, COLOR_YELLOW };
+
+GTEST_API_ GTEST_ATTRIBUTE_PRINTF_(2, 3) void ColoredPrintf(GTestColor color,
+                                                            const char *fmt,
+                                                            ...);
+
+}  // namespace internal
+
+// The pure interface class that all value-parameterized tests inherit from.
+// A value-parameterized class must inherit from both ::testing::Test and
+// ::testing::WithParamInterface.  In most cases that just means inheriting
+// from ::testing::TestWithParam<T>, but more complicated test hierarchies
+// may need to inherit from Test and WithParamInterface at different levels.
+//
+// This interface has support for accessing the test parameter value via
+// the GetParam() method.
+//
+// Use it with one of the parameter generator defining functions, like Range(),
+// Values(), ValuesIn(), Bool(), and Combine().
+//
+// class FooTest : public ::testing::TestWithParam<int> {
+//  protected:
+//   FooTest() {
+//     // Can use GetParam() here.
+//   }
+//   ~FooTest() override {
+//     // Can use GetParam() here.
+//   }
+//   void SetUp() override {
+//     // Can use GetParam() here.
+//   }
+//   void TearDown() override {
+//     // Can use GetParam() here.
+//   }
+// };
+// TEST_P(FooTest, DoesBar) {
+//   // Can use GetParam() method here.
+//   Foo foo;
+//   ASSERT_TRUE(foo.DoesBar(GetParam()));
+// }
+// INSTANTIATE_TEST_SUITE_P(OneToTenRange, FooTest, ::testing::Range(1, 10));
+
+template <typename T>
+class WithParamInterface {
+ public:
+  typedef T ParamType;
+  virtual ~WithParamInterface() {}
+
+  // The current parameter value.  Is also available in the test fixture's
+  // constructor.
+  static const ParamType &GetParam() {
+    GTEST_CHECK_(parameter_ != nullptr)
+        << "GetParam() can only be called inside a value-parameterized test "
+        << "-- did you intend to write TEST_P instead of TEST_F?";
+    return *parameter_;
+  }
+
+ private:
+  // Sets parameter value.  The caller is responsible for making sure the value
+  // remains alive and unchanged throughout the current test.
+  static void SetParam(const ParamType *parameter) { parameter_ = parameter; }
+
+  // Static value used for accessing parameter during a test lifetime.
+  static const ParamType *parameter_;
+
+  // TestClass must be a subclass of WithParamInterface<T> and Test.
+  template <class TestClass>
+  friend class internal::ParameterizedTestFactory;
+};
+
+template <typename T>
+const T *WithParamInterface<T>::parameter_ = nullptr;
+
+// Most value-parameterized classes can ignore the existence of
+// WithParamInterface, and can just inherit from ::testing::TestWithParam.
+
+template <typename T>
+class TestWithParam : public Test, public WithParamInterface<T> {};
+
+// Macros for indicating success/failure in test code.
+
+// Skips the test at runtime.  Skipping aborts the current function.
+// Skipped tests are neither successful nor failed.
+#define GTEST_SKIP() GTEST_SKIP_("")
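+
+// For illustration, a minimal sketch of skipping a test when a precondition
+// does not hold (ExternalResourceAvailable() is a hypothetical check):
+//
+//   TEST(SkipTest, DoesSkip) {
+//     if (!ExternalResourceAvailable()) GTEST_SKIP();
+//     // The rest of the body does not run when the test is skipped.
+//     ...
+//   }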
+
+// ADD_FAILURE unconditionally adds a failure to the current test.
+// SUCCEED generates a success - it doesn't automatically make the
+// current test successful, as a test is only successful when it has
+// no failure.
+//
+// EXPECT_* verifies that a certain condition is satisfied.  If not,
+// it behaves like ADD_FAILURE.  In particular:
+//
+//   EXPECT_TRUE  verifies that a Boolean condition is true.
+//   EXPECT_FALSE verifies that a Boolean condition is false.
+//
+// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except
+// that they will also abort the current function on failure.  People
+// usually want the fail-fast behavior of FAIL and ASSERT_*, but those
+// writing data-driven tests often find themselves using ADD_FAILURE
+// and EXPECT_* more.
+
+// Generates a nonfatal failure with a generic message.
+#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed")
+
+// Generates a nonfatal failure at the given source file location with
+// a generic message.
+#define ADD_FAILURE_AT(file, line)        \
+  GTEST_MESSAGE_AT_(file, line, "Failed", \
+                    ::testing::TestPartResult::kNonFatalFailure)
+
+// Generates a fatal failure with a generic message.
+#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed")
+
+// Like GTEST_FAIL(), but at the given source file location.
+#define GTEST_FAIL_AT(file, line)         \
+  GTEST_MESSAGE_AT_(file, line, "Failed", \
+                    ::testing::TestPartResult::kFatalFailure)
+
+// Define this macro to 1 to omit the definition of FAIL(), which is a
+// generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_FAIL
+#define FAIL() GTEST_FAIL()
+#endif
+
+// Generates a success with a generic message.
+#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded")
+
+// Define this macro to 1 to omit the definition of SUCCEED(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_SUCCEED
+#define SUCCEED() GTEST_SUCCEED()
+#endif
+
+// Macros for testing exceptions.
+//
+//   * {ASSERT|EXPECT}_THROW(statement, expected_exception):
+//       Tests that the statement throws the expected exception.
+//   * {ASSERT|EXPECT}_NO_THROW(statement):
+//       Tests that the statement doesn't throw any exception.
+//   * {ASSERT|EXPECT}_ANY_THROW(statement):
+//       Tests that the statement throws an exception.
+
+#define EXPECT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_THROW(statement, expected_exception) \
+  GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_)
+#define ASSERT_NO_THROW(statement) \
+  GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_)
+#define ASSERT_ANY_THROW(statement) \
+  GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_)
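+
+// For illustration, a sketch of these macros, assuming an empty
+// std::vector<int> named v (at() range-checks and throws; size() doesn't):
+//
+//   EXPECT_THROW(v.at(10), std::out_of_range);  // the expected type
+//   EXPECT_ANY_THROW(v.at(10));                 // any exception type is fine
+//   EXPECT_NO_THROW(v.size());                  // must not throw at all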
+
+// Boolean assertions.  Condition can be either a Boolean expression or an
+// AssertionResult.  For more information on how to use AssertionResult with
+// these macros see comments on that class.
+#define EXPECT_TRUE(condition)                            \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
+                      GTEST_NONFATAL_FAILURE_)
+#define EXPECT_FALSE(condition)                              \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_NONFATAL_FAILURE_)
+#define ASSERT_TRUE(condition) \
+  GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_)
+#define ASSERT_FALSE(condition)                              \
+  GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
+                      GTEST_FATAL_FAILURE_)
+
+// Macros for testing equalities and inequalities.
+//
+//   * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2
+//   * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2
+//   * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2
+//   * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2
+//   * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2
+//   * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2
+//
+// When they are not, Google Test prints both the tested expressions and
+// their actual values.  The values must be compatible built-in types,
+// or you will get a compiler error.  By "compatible" we mean that the
+// values can be compared by the respective operator.
+//
+// Note:
+//
+//   1. It is possible to make a user-defined type work with
+//   {ASSERT|EXPECT}_??(), but that requires overloading the
+//   comparison operators and is thus discouraged by the Google C++
+//   Usage Guide.  Therefore, you are advised to use the
+//   {ASSERT|EXPECT}_TRUE() macro to assert that two objects are
+//   equal.
+//
+//   2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on
+//   pointers (in particular, C strings).  Therefore, if you use it
+//   with two C strings, you are testing how their locations in memory
+//   are related, not how their content is related.  To compare two C
+//   strings by content, use {ASSERT|EXPECT}_STR*().
+//
+//   3. {ASSERT|EXPECT}_EQ(v1, v2) is preferred to
+//   {ASSERT|EXPECT}_TRUE(v1 == v2), as the former tells you
+//   what the actual value is when it fails, and similarly for the
+//   other comparisons.
+//
+//   4. Do not depend on the order in which {ASSERT|EXPECT}_??()
+//   evaluate their arguments, which is undefined.
+//
+//   5. These macros evaluate their arguments exactly once.
+//
+// Examples:
+//
+//   EXPECT_NE(Foo(), 5);
+//   EXPECT_EQ(a_pointer, NULL);
+//   ASSERT_LT(i, array_size);
+//   ASSERT_GT(records.size(), 0) << "There is no record left.";
+
+#define EXPECT_EQ(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
+#define EXPECT_NE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define EXPECT_LE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define EXPECT_LT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define EXPECT_GE(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define EXPECT_GT(val1, val2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+#define GTEST_ASSERT_EQ(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::EqHelper::Compare, val1, val2)
+#define GTEST_ASSERT_NE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2)
+#define GTEST_ASSERT_LE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2)
+#define GTEST_ASSERT_LT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2)
+#define GTEST_ASSERT_GE(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2)
+#define GTEST_ASSERT_GT(val1, val2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2)
+
+// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit the definition of
+// ASSERT_XY(), which clashes with some users' own code.
+
+#if !GTEST_DONT_DEFINE_ASSERT_EQ
+#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_NE
+#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LE
+#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_LT
+#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GE
+#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_GT
+#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#endif
+
+// C-string Comparisons.  All tests treat NULL and any non-NULL string
+// as different.  Two NULLs are equal.
+//
+//   * {ASSERT|EXPECT}_STREQ(s1, s2):     Tests that s1 == s2
+//   * {ASSERT|EXPECT}_STRNE(s1, s2):     Tests that s1 != s2
+//   * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case
+//   * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case
+//
+// For wide or narrow string objects, you can use the
+// {ASSERT|EXPECT}_??() macros.
+//
+// Don't depend on the order in which the arguments are evaluated,
+// which is undefined.
+//
+// These macros evaluate their arguments exactly once.
+
+#define EXPECT_STREQ(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
+#define EXPECT_STRNE(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define EXPECT_STRCASEEQ(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define EXPECT_STRCASENE(s1, s2) \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
+
+#define ASSERT_STREQ(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, s1, s2)
+#define ASSERT_STRNE(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
+#define ASSERT_STRCASEEQ(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
+#define ASSERT_STRCASENE(s1, s2) \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
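+
+// For illustration, a sketch of comparing by content rather than by pointer
+// (ComputeGreeting() is a hypothetical function returning a C string):
+//
+//   EXPECT_STREQ("hello", ComputeGreeting());      // compares the contents
+//   EXPECT_STRCASEEQ("HELLO", ComputeGreeting());  // same, ignoring case
+//   // EXPECT_EQ would compare the pointer values instead.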
+
+// Macros for comparing floating-point numbers.
+//
+//   * {ASSERT|EXPECT}_FLOAT_EQ(val1, val2):
+//       Tests that two float values are almost equal.
+//   * {ASSERT|EXPECT}_DOUBLE_EQ(val1, val2):
+//       Tests that two double values are almost equal.
+//   * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error):
+//       Tests that v1 and v2 are within the given distance to each other.
+//
+// Google Test uses ULP-based comparison to automatically pick a default
+// error bound that is appropriate for the operands.  See the
+// FloatingPoint template class in gtest-internal.h if you are
+// interested in the implementation details.
+
+#define EXPECT_FLOAT_EQ(val1, val2)                                         \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      val1, val2)
+
+#define EXPECT_DOUBLE_EQ(val1, val2)                                         \
+  EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      val1, val2)
+
+#define ASSERT_FLOAT_EQ(val1, val2)                                         \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
+                      val1, val2)
+
+#define ASSERT_DOUBLE_EQ(val1, val2)                                         \
+  ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
+                      val1, val2)
+
+#define EXPECT_NEAR(val1, val2, abs_error)                                   \
+  EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+                      abs_error)
+
+#define ASSERT_NEAR(val1, val2, abs_error)                                   \
+  ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+                      abs_error)
+
+// These predicate format functions work on floating-point values, and
+// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
+//
+//   EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0);
+
+// Asserts that val1 is less than, or almost equal to, val2.  Fails
+// otherwise.  In particular, it fails if either val1 or val2 is NaN.
+GTEST_API_ AssertionResult FloatLE(const char *expr1, const char *expr2,
+                                   float val1, float val2);
+GTEST_API_ AssertionResult DoubleLE(const char *expr1, const char *expr2,
+                                    double val1, double val2);
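+
+// For illustration, a sketch of the floating-point assertions above
+// (pi_estimate is a hypothetical double computed by the test):
+//
+//   EXPECT_FLOAT_EQ(2.0f, std::sqrt(4.0f));             // ULP-based bound
+//   EXPECT_NEAR(3.141592653589793, pi_estimate, 1e-6);  // explicit bound
+//   EXPECT_PRED_FORMAT2(testing::DoubleLE, pi_estimate, 4.0);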
+
+#if GTEST_OS_WINDOWS
+
+// Macros that test for HRESULT failure and success. These are only useful
+// on Windows, and rely on Windows SDK macros and APIs to compile.
+//
+//    * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr)
+//
+// When expr unexpectedly fails or succeeds, Google Test prints the
+// expected result and the actual result with both a human-readable
+// string representation of the error, if available, as well as the
+// hex result code.
+#define EXPECT_HRESULT_SUCCEEDED(expr) \
+  EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+#define ASSERT_HRESULT_SUCCEEDED(expr) \
+  ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+
+#define EXPECT_HRESULT_FAILED(expr) \
+  EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+#define ASSERT_HRESULT_FAILED(expr) \
+  ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+
+#endif  // GTEST_OS_WINDOWS
+
+// Macros that execute statement and check that it doesn't generate new fatal
+// failures in the current thread.
+//
+//   * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement);
+//
+// Examples:
+//
+//   EXPECT_NO_FATAL_FAILURE(Process());
+//   ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
+//
+#define ASSERT_NO_FATAL_FAILURE(statement) \
+  GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+#define EXPECT_NO_FATAL_FAILURE(statement) \
+  GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+
+// Causes a trace (including the given source file path and line number,
+// and the given message) to be included in every test failure message
+// generated by code in the scope of the lifetime of an instance of this
+// class. The effect is undone with the destruction of the instance.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// Example:
+//   testing::ScopedTrace trace("file.cc", 123, "message");
+//
+class GTEST_API_ ScopedTrace {
+ public:
+  // The c'tor pushes the given source file location and message onto
+  // a trace stack maintained by Google Test.
+
+  // Template version. Uses Message() to convert the values into strings.
+  // Slow, but flexible.
+  template <typename T>
+  ScopedTrace(const char *file, int line, const T &message) {
+    PushTrace(file, line, (Message() << message).GetString());
+  }
+
+  // Optimize for some known types.
+  ScopedTrace(const char *file, int line, const char *message) {
+    PushTrace(file, line, message ? message : "(null)");
+  }
+
+  ScopedTrace(const char *file, int line, const std::string &message) {
+    PushTrace(file, line, message);
+  }
+
+  // The d'tor pops the info pushed by the c'tor.
+  //
+  // Note that the d'tor is not virtual in order to be efficient.
+  // Don't inherit from ScopedTrace!
+  ~ScopedTrace();
+
+ private:
+  void PushTrace(const char *file, int line, std::string message);
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+} GTEST_ATTRIBUTE_UNUSED_;  // A ScopedTrace object does its job in its
+                            // c'tor and d'tor. Therefore it doesn't
+                            // need to be used otherwise.
+
+// Causes a trace (including the source file path, the current line
+// number, and the given message) to be included in every test failure
+// message generated by code in the current scope. The effect is
+// undone when the control leaves the current scope.
+//
+// The message argument can be anything streamable to std::ostream.
+//
+// In the implementation, we include the current line number as part
+// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s
+// to appear in the same block - as long as they are on different
+// lines.
+//
+// Each thread maintains its own stack of traces, so a SCOPED_TRACE()
+// (correctly) affects only the assertions in its own thread.
+#define SCOPED_TRACE(message)                                         \
+  ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \
+      __FILE__, __LINE__, (message))
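+
+// A minimal usage sketch (Sub1() is a hypothetical helper, for illustration
+// only); any assertion failure inside the scope is reported with the trace:
+//
+//   void CheckValue(int n) {
+//     SCOPED_TRACE("checking n = " + std::to_string(n));
+//     EXPECT_GT(Sub1(n), 0);  // on failure, the message includes the trace
+//   }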
+
+// Compile-time assertion for type equality.
+// StaticAssertTypeEq<type1, type2>() compiles if and only if type1 and type2
+// are the same type. The value it returns is not interesting.
+//
+// Instead of making StaticAssertTypeEq a class template, we make it a
+// function template that invokes a helper class template. This
+// prevents a user from misusing StaticAssertTypeEq<T1, T2> by
+// defining objects of that type.
+//
+// CAVEAT:
+//
+// When used inside a method of a class template,
+// StaticAssertTypeEq<T1, T2>() is effective ONLY IF the method is
+// instantiated. For example, given:
+//
+//   template <typename T> class Foo {
+//    public:
+//     void Bar() { testing::StaticAssertTypeEq<int, T>(); }
+//   };
+//
+// the code:
+//
+//   void Test1() { Foo<bool> foo; }
+//
+// will NOT generate a compiler error, as Foo<bool>::Bar() is never
+// actually instantiated. Instead, you need:
+//
+//   void Test2() { Foo<bool> foo; foo.Bar(); }
+//
+// to cause a compiler error.
+template <typename T1, typename T2>
+constexpr bool StaticAssertTypeEq() noexcept {
+  static_assert(std::is_same<T1, T2>::value,
+                "T1 and T2 are not the same type");
+  return true;
+}
+
+// Defines a test.
+//
+// The first parameter is the name of the test suite, and the second
+// parameter is the name of the test within the test suite.
+//
+// The convention is to end the test suite name with "Test". For
+// example, a test suite for the Foo class can be named FooTest.
+//
+// Test code should appear between braces after an invocation of
+// this macro. Example:
+//
+//   TEST(FooTest, InitializesCorrectly) {
+//     Foo foo;
+//     EXPECT_TRUE(foo.StatusIsOK());
+//   }
+
+// Note that we call GetTestTypeId() instead of GetTypeId<
+// ::testing::Test>() here to get the type ID of testing::Test. This
+// is to work around a suspected linker bug when using Google Test as
+// a framework on Mac OS X. The bug causes GetTypeId<
+// ::testing::Test>() to return different values depending on whether
+// the call is from the Google Test framework itself or from user test
+// code. GetTestTypeId() is guaranteed to always return the same
+// value, as it always calls GetTypeId<>() from the Google Test
+// framework.
+#define GTEST_TEST(test_suite_name, test_name)             \
+  GTEST_TEST_(test_suite_name, test_name, ::testing::Test, \
+              ::testing::internal::GetTestTypeId())
+
+// Define this macro to 1 to omit the definition of TEST(), which
+// is a generic name and clashes with some other libraries.
+#if !GTEST_DONT_DEFINE_TEST
+#define TEST(test_suite_name, test_name) GTEST_TEST(test_suite_name, test_name)
+#endif
+
+// Defines a test that uses a test fixture.
+//
+// The first parameter is the name of the test fixture class, which
+// also doubles as the test suite name. The second parameter is the
+// name of the test within the test suite.
+//
+// A test fixture class must be declared earlier. The user should put
+// the test code between braces after using this macro. Example:
+//
+//   class FooTest : public testing::Test {
+//    protected:
+//     void SetUp() override { b_.AddElement(3); }
+//
+//     Foo a_;
+//     Foo b_;
+//   };
+//
+//   TEST_F(FooTest, InitializesCorrectly) {
+//     EXPECT_TRUE(a_.StatusIsOK());
+//   }
+//
+//   TEST_F(FooTest, ReturnsElementCountCorrectly) {
+//     EXPECT_EQ(a_.size(), 0);
+//     EXPECT_EQ(b_.size(), 1);
+//   }
+//
+// GOOGLETEST_CM0011 DO NOT DELETE
+#if !GTEST_DONT_DEFINE_TEST
+#define TEST_F(test_fixture, test_name)              \
+  GTEST_TEST_(test_fixture, test_name, test_fixture, \
+              ::testing::internal::GetTypeId<test_fixture>())
+#endif  // !GTEST_DONT_DEFINE_TEST
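+
+// For example, a project whose own headers already define a TEST macro can
+// be built with -DGTEST_DONT_DEFINE_TEST=1 and use the GTEST_TEST spelling
+// directly (a sketch; FooTest is a hypothetical suite name):
+//
+//   GTEST_TEST(FooTest, StillCompiles) { EXPECT_TRUE(true); }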
+
+// Returns a path to a temporary directory.
+// Tries to determine an appropriate directory for the platform.
+GTEST_API_ std::string TempDir();
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+// Dynamically registers a test with the framework.
+//
+// This is an advanced API only to be used when the `TEST` macros are
+// insufficient. The macros should be preferred when possible, as they avoid
+// most of the complexity of calling this function.
+//
+// The `factory` argument is a factory callable (move-constructible) object or
+// function pointer that creates a new instance of the Test object. It hands
+// ownership of the created object to the caller. The signature of the
+// callable is `Fixture*()`, where `Fixture` is the test fixture class for the
+// test. All tests registered with the same `test_suite_name` must return the
+// same fixture type. This is checked at runtime.
+//
+// The framework will infer the fixture class from the factory and will call
+// the `SetUpTestSuite` and `TearDownTestSuite` for it.
+//
+// Must be called before `RUN_ALL_TESTS()` is invoked, otherwise behavior is
+// undefined.
+//
+// Use case example:
+//
+// class MyFixture : public ::testing::Test {
+//  public:
+//   // All of these optional, just like in regular macro usage.
+//   static void SetUpTestSuite() { ... }
+//   static void TearDownTestSuite() { ... }
+//   void SetUp() override { ... }
+//   void TearDown() override { ... }
+// };
+//
+// class MyTest : public MyFixture {
+//  public:
+//   explicit MyTest(int data) : data_(data) {}
+//   void TestBody() override { ... }
+//
+//  private:
+//   int data_;
+// };
+//
+// void RegisterMyTests(const std::vector<int>& values) {
+//   for (int v : values) {
+//     ::testing::RegisterTest(
+//         "MyFixture", ("Test" + std::to_string(v)).c_str(), nullptr,
+//         std::to_string(v).c_str(),
+//         __FILE__, __LINE__,
+//         // Important to use the fixture type as the return type here.
+//         [=]() -> MyFixture* { return new MyTest(v); });
+//   }
+// }
+// ...
+// int main(int argc, char** argv) {
+//   std::vector<int> values_to_test = LoadValuesFromConfig();
+//   RegisterMyTests(values_to_test);
+//   ...
+//   return RUN_ALL_TESTS();
+// }
+//
+template <int &... ExplicitParameterBarrier, typename Factory>
+TestInfo *RegisterTest(const char *test_suite_name, const char *test_name,
+                       const char *type_param, const char *value_param,
+                       const char *file, int line, Factory factory) {
+  using TestT = typename std::remove_pointer<decltype(factory())>::type;
+
+  class FactoryImpl : public internal::TestFactoryBase {
+   public:
+    explicit FactoryImpl(Factory f) : factory_(std::move(f)) {}
+    Test *CreateTest() override { return factory_(); }
+
+   private:
+    Factory factory_;
+  };
+
+  return internal::MakeAndRegisterTestInfo(
+      test_suite_name, test_name, type_param, value_param,
+      internal::CodeLocation(file, line), internal::GetTypeId<TestT>(),
+      internal::SuiteApiResolver<TestT>::GetSetUpCaseOrSuite(file, line),
+      internal::SuiteApiResolver<TestT>::GetTearDownCaseOrSuite(file, line),
+      new FactoryImpl{ std::move(factory) });
+}
+
+}  // namespace testing
+
+// Use this function in main() to run all tests. It returns 0 if all
+// tests are successful, or 1 otherwise.
+//
+// RUN_ALL_TESTS() should be invoked after the command line has been
+// parsed by InitGoogleTest().
+//
+// This function was formerly a macro; thus, it is in the global
+// namespace and has an all-caps name.
+int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_; + +inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); } + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +#endif // GTEST_INCLUDE_GTEST_GTEST_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h new file mode 100644 index 000000000..1fc21910b --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_pred_impl.h @@ -0,0 +1,277 @@ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is AUTOMATICALLY GENERATED on 01/02/2019 by command +// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! +// +// Implements a family of generic predicate assertion macros. +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ + +#include "gtest/gtest.h" + +namespace testing { + +// This header implements a family of generic predicate assertion +// macros: +// +// ASSERT_PRED_FORMAT1(pred_format, v1) +// ASSERT_PRED_FORMAT2(pred_format, v1, v2) +// ... +// +// where pred_format is a function or functor that takes n (in the +// case of ASSERT_PRED_FORMATn) values and their source expression +// text, and returns a testing::AssertionResult. See the definition +// of ASSERT_EQ in gtest.h for an example. +// +// If you don't care about formatting, you can use the more +// restrictive version: +// +// ASSERT_PRED1(pred, v1) +// ASSERT_PRED2(pred, v1, v2) +// ... +// +// where pred is an n-ary function or functor that returns bool, +// and the values v1, v2, ..., must support the << operator for +// streaming to std::ostream. +// +// We also define the EXPECT_* variations. +// +// For now we only support predicates whose arity is at most 5. +// Please email googletestframework@googlegroups.com if you need +// support for higher arities. 
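+//
+// For example, with a plain bool-returning predicate (IsPositive() is
+// hypothetical, for illustration only):
+//
+//   bool IsPositive(int n) { return n > 0; }
+//   EXPECT_PRED1(IsPositive, 5);   // passes
+//   EXPECT_PRED1(IsPositive, -5);  // fails, printing IsPositive(-5)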
+
+// GTEST_ASSERT_ is the basic statement to which all of the assertions
+// in this file reduce. Don't use this in your code.
+
+#define GTEST_ASSERT_(expression, on_failure)                   \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                 \
+  if (const ::testing::AssertionResult gtest_ar = (expression)) \
+    ;                                                           \
+  else                                                          \
+    on_failure(gtest_ar.failure_message())
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use
+// this in your code.
+template <typename Pred, typename T1>
+AssertionResult AssertPred1Helper(const char *pred_text, const char *e1,
+                                  Pred pred, const T1 &v1) {
+  if (pred(v1)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, v1), on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use
+// this in your code.
+#define GTEST_PRED1_(pred, v1, on_failure) \
+  GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure)
+
+// Unary predicate assertion macros.
+#define EXPECT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT1(pred_format, v1) \
+  GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2>
+AssertionResult AssertPred2Helper(const char *pred_text, const char *e1,
+                                  const char *e2, Pred pred, const T1 &v1,
+                                  const T2 &v2) {
+  if (pred(v1, v2)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2
+         << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use
+// this in your code.
+#define GTEST_PRED2_(pred, v1, v2, on_failure)                               \
+  GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \
+                on_failure)
+
+// Binary predicate assertion macros.
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \
+  GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED2(pred, v1, v2) \
+  GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
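+
+// For example, a user-defined predicate-formatter for the binary case could
+// look like this (a sketch; IsDivisibleBy is hypothetical):
+//
+//   testing::AssertionResult IsDivisibleBy(const char* m_expr,
+//                                          const char* n_expr, int m, int n) {
+//     if (n != 0 && m % n == 0) return testing::AssertionSuccess();
+//     return testing::AssertionFailure()
+//            << m_expr << " is not divisible by " << n_expr;
+//   }
+//
+//   EXPECT_PRED_FORMAT2(IsDivisibleBy, 10, 5);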
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2, typename T3>
+AssertionResult AssertPred3Helper(const char *pred_text, const char *e1,
+                                  const char *e2, const char *e3, Pred pred,
+                                  const T1 &v1, const T2 &v2, const T3 &v3) {
+  if (pred(v1, v2, v3)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2 << ", " << e3
+         << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+         << e3 << " evaluates to " << ::testing::PrintToString(v3);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use
+// this in your code.
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)                          \
+  GTEST_ASSERT_(                                                            \
+      ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \
+      on_failure)
+
+// Ternary predicate assertion macros.
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \
+  GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED3(pred, v1, v2, v3) \
+  GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
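+
+// As a concrete example from this header's own users, EXPECT_NEAR in gtest.h
+// is built directly on the ternary predicate-formatter form:
+//
+//   EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1,
+//                       val2, abs_error)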
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2, typename T3, typename T4>
+AssertionResult AssertPred4Helper(const char *pred_text, const char *e1,
+                                  const char *e2, const char *e3,
+                                  const char *e4, Pred pred, const T1 &v1,
+                                  const T2 &v2, const T3 &v3, const T4 &v4) {
+  if (pred(v1, v2, v3, v4)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4
+         << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+         << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n"
+         << e4 << " evaluates to " << ::testing::PrintToString(v4);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use
+// this in your code.
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)                        \
+  GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \
+                                             v1, v2, v3, v4),                 \
+                on_failure)
+
+// 4-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
+  GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
+  GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
+
+// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use
+// this in your code.
+template <typename Pred, typename T1, typename T2, typename T3, typename T4,
+          typename T5>
+AssertionResult AssertPred5Helper(const char *pred_text, const char *e1,
+                                  const char *e2, const char *e3,
+                                  const char *e4, const char *e5, Pred pred,
+                                  const T1 &v1, const T2 &v2, const T3 &v3,
+                                  const T4 &v4, const T5 &v5) {
+  if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
+
+  return AssertionFailure()
+         << pred_text << "(" << e1 << ", " << e2 << ", " << e3 << ", " << e4
+         << ", " << e5 << ") evaluates to false, where"
+         << "\n"
+         << e1 << " evaluates to " << ::testing::PrintToString(v1) << "\n"
+         << e2 << " evaluates to " << ::testing::PrintToString(v2) << "\n"
+         << e3 << " evaluates to " << ::testing::PrintToString(v3) << "\n"
+         << e4 << " evaluates to " << ::testing::PrintToString(v4) << "\n"
+         << e5 << " evaluates to " << ::testing::PrintToString(v5);
+}
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
+// Don't use this in your code.
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)  \
+  GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
+                on_failure)
+
+// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use
+// this in your code.
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)                   \
+  GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \
+                                             pred, v1, v2, v3, v4, v5),      \
+                on_failure)
+
+// 5-ary predicate assertion macros.
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_)
+#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
+  GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
+  GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_prod.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
new file mode 100644
index 000000000..3dc5b2386
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/gtest_prod.h
@@ -0,0 +1,61 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// Google C++ Testing and Mocking Framework definitions useful in production +// code. GOOGLETEST_CM0003 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ + +// When you need to test the private or protected members of a class, +// use the FRIEND_TEST macro to declare your tests as friends of the +// class. For example: +// +// class MyClass { +// private: +// void PrivateMethod(); +// FRIEND_TEST(MyClassTest, PrivateMethodWorks); +// }; +// +// class MyClassTest : public testing::Test { +// // ... +// }; +// +// TEST_F(MyClassTest, PrivateMethodWorks) { +// // Can call MyClass::PrivateMethod() here. +// } +// +// Note: The test class must be in the same namespace as the class being tested. +// For example, putting MyClassTest in an anonymous namespace will not work. + +#define FRIEND_TEST(test_case_name, test_name) \ + friend class test_case_name##_##test_name##_Test + +#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md new file mode 100644 index 000000000..ff391fb4e --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/README.md @@ -0,0 +1,56 @@ +# Customization Points + +The custom directory is an injection point for custom user configurations. + +## Header `gtest.h` + +### The following macros can be defined: + +* `GTEST_OS_STACK_TRACE_GETTER_` - The name of an implementation of + `OsStackTraceGetterInterface`. +* `GTEST_CUSTOM_TEMPDIR_FUNCTION_` - An override for `testing::TempDir()`. See + `testing::TempDir` for semantics and signature. + +## Header `gtest-port.h` + +The following macros can be defined: + +### Flag related macros: + +* `GTEST_FLAG(flag_name)` +* `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its + own flagfile flag parsing. +* `GTEST_DECLARE_bool_(name)` +* `GTEST_DECLARE_int32_(name)` +* `GTEST_DECLARE_string_(name)` +* `GTEST_DEFINE_bool_(name, default_val, doc)` +* `GTEST_DEFINE_int32_(name, default_val, doc)` +* `GTEST_DEFINE_string_(name, default_val, doc)` + +### Logging: + +* `GTEST_LOG_(severity)` +* `GTEST_CHECK_(condition)` +* Functions `LogToStderr()` and `FlushInfoLog()` have to be provided too. + +### Threading: + +* `GTEST_HAS_NOTIFICATION_` - Enabled if Notification is already provided. +* `GTEST_HAS_MUTEX_AND_THREAD_LOCAL_` - Enabled if `Mutex` and `ThreadLocal` + are already provided. Must also provide `GTEST_DECLARE_STATIC_MUTEX_(mutex)` + and `GTEST_DEFINE_STATIC_MUTEX_(mutex)` +* `GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)` +* `GTEST_LOCK_EXCLUDED_(locks)` + +### Underlying library support features + +* `GTEST_HAS_CXXABI_H_` + +### Exporting API symbols: + +* `GTEST_API_` - Specifier for exported symbols. 
+ +## Header `gtest-printers.h` + +* See documentation at `gtest/gtest-printers.h` for details on how to define a + custom printer. diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h new file mode 100644 index 000000000..cd85d956d --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-port.h @@ -0,0 +1,37 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. See README for details +// +// ** Custom implementation starts here ** + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h new file mode 100644 index 000000000..eb4467abc --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest-printers.h @@ -0,0 +1,42 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// This file provides an injection point for custom printers in a local +// installation of gTest. +// It will be included from gtest-printers.h and the overrides in this file +// will be visible to everyone. +// +// Injection point for custom user configurations. See README for details +// +// ** Custom implementation starts here ** + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h new file mode 100644 index 000000000..4c8e07be2 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/custom/gtest.h @@ -0,0 +1,37 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Injection point for custom user configurations. 
See README for details
+//
+// ** Custom implementation starts here **
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
new file mode 100644
index 000000000..3e9497d45
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-death-test-internal.h
@@ -0,0 +1,301 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file defines internal utilities needed for implementing
+// death tests. They are subject to change without notice.
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+
+#include "gtest/gtest-matchers.h"
+#include "gtest/internal/gtest-internal.h"
+
+#include <stdio.h>
+#include <memory>
+
+namespace testing {
+namespace internal {
+
+GTEST_DECLARE_string_(internal_run_death_test);
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kDeathTestStyleFlag[] = "death_test_style";
+const char kDeathTestUseFork[] = "death_test_use_fork";
+const char kInternalRunDeathTestFlag[] = "internal_run_death_test";
+
+#if GTEST_HAS_DEATH_TEST
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+// DeathTest is a class that hides much of the complexity of the
+// GTEST_DEATH_TEST_ macro. It is abstract; its static Create method
+// returns a concrete class that depends on the prevailing death test
+// style, as defined by the --gtest_death_test_style and/or
+// --gtest_internal_run_death_test flags.
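+//
+// For reference, the public macros built on this machinery are used like
+// this (DoFatalThing() and Shutdown() are hypothetical helpers, for
+// illustration only):
+//
+//   EXPECT_DEATH(DoFatalThing(), "regex matched against stderr");
+//   EXPECT_EXIT(Shutdown(), testing::ExitedWithCode(0), "bye");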
+
+// In describing the results of death tests, these terms are used with
+// the corresponding definitions:
+//
+// exit status:  The integer exit information in the format specified
+//               by wait(2)
+// exit code:    The integer code passed to exit(3), _exit(2), or
+//               returned from main()
+class GTEST_API_ DeathTest {
+ public:
+  // Create returns false if there was an error determining the
+  // appropriate action to take for the current death test; for example,
+  // if the gtest_death_test_style flag is set to an invalid value.
+  // The LastMessage method will return a more detailed message in that
+  // case. Otherwise, the DeathTest pointer pointed to by the "test"
+  // argument is set. If the death test should be skipped, the pointer
+  // is set to NULL; otherwise, it is set to the address of a new concrete
+  // DeathTest object that controls the execution of the current test.
+  static bool Create(const char *statement,
+                     Matcher<const std::string &> matcher, const char *file,
+                     int line, DeathTest **test);
+  DeathTest();
+  virtual ~DeathTest() {}
+
+  // A helper class that aborts a death test when it's deleted.
+  class ReturnSentinel {
+   public:
+    explicit ReturnSentinel(DeathTest *test) : test_(test) {}
+    ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+
+   private:
+    DeathTest *const test_;
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+  } GTEST_ATTRIBUTE_UNUSED_;
+
+  // An enumeration of possible roles that may be taken when a death
+  // test is encountered. EXECUTE means that the death test logic should
+  // be executed immediately. OVERSEE means that the program should prepare
+  // the appropriate environment for a child process to execute the death
+  // test, then wait for it to complete.
+  enum TestRole { OVERSEE_TEST, EXECUTE_TEST };
+
+  // An enumeration of the three reasons that a test might be aborted.
+  enum AbortReason {
+    TEST_ENCOUNTERED_RETURN_STATEMENT,
+    TEST_THREW_EXCEPTION,
+    TEST_DID_NOT_DIE
+  };
+
+  // Assumes one of the above roles.
+  virtual TestRole AssumeRole() = 0;
+
+  // Waits for the death test to finish and returns its status.
+  virtual int Wait() = 0;
+
+  // Returns true if the death test passed; that is, the test process
+  // exited during the test, its exit status matches a user-supplied
+  // predicate, and its stderr output matches a user-supplied regular
+  // expression.
+  // The user-supplied predicate may be a macro expression rather
+  // than a function pointer or functor, or else Wait and Passed could
+  // be combined.
+  virtual bool Passed(bool exit_status_ok) = 0;
+
+  // Signals that the death test did not die as expected.
+  virtual void Abort(AbortReason reason) = 0;
+
+  // Returns a human-readable message regarding the outcome of
+  // the last death test.
+  static const char *LastMessage();
+
+  static void set_last_death_test_message(const std::string &message);
+
+ private:
+  // A string containing a description of the outcome of the last death test.
+  static std::string last_death_test_message_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  // 4251
+
+// Factory interface for death tests. May be mocked out for testing.
+class DeathTestFactory {
+ public:
+  virtual ~DeathTestFactory() {}
+  virtual bool Create(const char *statement,
+                      Matcher<const std::string &> matcher, const char *file,
+                      int line, DeathTest **test) = 0;
+};
+
+// A concrete DeathTestFactory implementation for normal use.
+class DefaultDeathTestFactory : public DeathTestFactory {
+ public:
+  bool Create(const char *statement, Matcher<const std::string &> matcher,
+              const char *file, int line, DeathTest **test) override;
+};
+
+// Returns true if exit_status describes a process that was terminated
+// by a signal, or exited normally with a nonzero exit code.
+GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
+
+// A string passed to EXPECT_DEATH (etc.) is caught by one of these overloads
+// and interpreted as a regex (rather than an Eq matcher) for legacy
+// compatibility.
+inline Matcher<const ::std::string &> MakeDeathTestMatcher(
+    ::testing::internal::RE regex) {
+  return ContainsRegex(regex.pattern());
+}
+inline Matcher<const ::std::string &> MakeDeathTestMatcher(const char *regex) {
+  return ContainsRegex(regex);
+}
+inline Matcher<const ::std::string &> MakeDeathTestMatcher(
+    const ::std::string &regex) {
+  return ContainsRegex(regex);
+}
+
+// If a Matcher<const ::std::string &> is passed to EXPECT_DEATH (etc.), it's
+// used directly.
+inline Matcher<const ::std::string &> MakeDeathTestMatcher(
+    Matcher<const ::std::string &> matcher) {
+  return matcher;
+}
+
+// Traps C++ exceptions escaping statement and reports them as test
+// failures. Note that trapping SEH exceptions is not implemented here.
+#if GTEST_HAS_EXCEPTIONS
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test)           \
+  try {                                                                      \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);               \
+  } catch (const ::std::exception &gtest_exception) {                        \
+    fprintf(                                                                 \
+        stderr,                                                              \
+        "\n%s: Caught std::exception-derived exception escaping the "        \
+        "death test statement. Exception message: %s\n",                     \
+        ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
+        gtest_exception.what());                                             \
+    fflush(stderr);                                                          \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  } catch (...) {                                                            \
+    death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
+  }
+
+#else
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+  GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
+
+#endif
+
+// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
+// ASSERT_EXIT*, and EXPECT_EXIT*.
+#define GTEST_DEATH_TEST_(statement, predicate, regex_or_matcher, fail)        \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                                \
+  if (::testing::internal::AlwaysTrue()) {                                     \
+    ::testing::internal::DeathTest *gtest_dt;                                  \
+    if (!::testing::internal::DeathTest::Create(                               \
+            #statement,                                                        \
+            ::testing::internal::MakeDeathTestMatcher(regex_or_matcher),       \
+            __FILE__, __LINE__, &gtest_dt)) {                                  \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__);                        \
+    }                                                                          \
+    if (gtest_dt != nullptr) {                                                 \
+      std::unique_ptr< ::testing::internal::DeathTest> gtest_dt_ptr(gtest_dt); \
+      switch (gtest_dt->AssumeRole()) {                                        \
+        case ::testing::internal::DeathTest::OVERSEE_TEST:                     \
+          if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) {                \
+            goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__);                  \
+          }                                                                    \
+          break;                                                               \
+        case ::testing::internal::DeathTest::EXECUTE_TEST: {                   \
+          ::testing::internal::DeathTest::ReturnSentinel gtest_sentinel(       \
+              gtest_dt);                                                       \
+          GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt);            \
+          gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE);   \
+          break;                                                               \
+        }                                                                      \
+        default: break;                                                        \
+      }                                                                        \
+    }                                                                          \
+  } else                                                                       \
+    GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__)                                \
+        : fail(::testing::internal::DeathTest::LastMessage())
+// The symbol "fail" here expands to something into which a message
+// can be streamed.
+
+// This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
+// NDEBUG mode.
In this case we need the statements to be executed and the macro +// must accept a streamed message even though the message is never printed. +// The regex object is not evaluated, but it is used to prevent "unused" +// warnings and to avoid an expression that doesn't compile in debug mode. +#define GTEST_EXECUTE_STATEMENT_(statement, regex_or_matcher) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } else if (!::testing::internal::AlwaysTrue()) { \ + ::testing::internal::MakeDeathTestMatcher(regex_or_matcher); \ + } else \ + ::testing::Message() + +// A class representing the parsed contents of the +// --gtest_internal_run_death_test flag, as it existed when +// RUN_ALL_TESTS was called. +class InternalRunDeathTestFlag { + public: + InternalRunDeathTestFlag(const std::string &a_file, int a_line, int an_index, + int a_write_fd) + : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {} + + ~InternalRunDeathTestFlag() { + if (write_fd_ >= 0) posix::Close(write_fd_); + } + + const std::string &file() const { return file_; } + int line() const { return line_; } + int index() const { return index_; } + int write_fd() const { return write_fd_; } + + private: + std::string file_; + int line_; + int index_; + int write_fd_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); +}; + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag *ParseInternalRunDeathTestFlag(); + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h new file mode 100644 index 000000000..b228d4734 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-filepath.h @@ -0,0 +1,208 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Google Test filepath utilities +// +// This header file declares classes and functions used internally by +// Google Test. They are subject to change without notice. +// +// This file is #included in gtest/internal/gtest-internal.h. +// Do not include this header file separately! + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ + +#include "gtest/internal/gtest-string.h" + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +namespace testing { +namespace internal { + +// FilePath - a class for file and directory pathname manipulation which +// handles platform-specific conventions (like the pathname separator). +// Used for helper functions for naming files in a directory for xml output. +// Except for Set methods, all methods are const or static, which provides an +// "immutable value object" -- useful for peace of mind. +// A FilePath with a value ending in a path separator ("like/this/") represents +// a directory, otherwise it is assumed to represent a file. In either case, +// it may or may not represent an actual file or directory in the file system. +// Names are NOT checked for syntax correctness -- no checking for illegal +// characters, malformed paths, etc. + +class GTEST_API_ FilePath { + public: + FilePath() : pathname_("") {} + FilePath(const FilePath &rhs) : pathname_(rhs.pathname_) {} + + explicit FilePath(const std::string &pathname) : pathname_(pathname) { + Normalize(); + } + + FilePath &operator=(const FilePath &rhs) { + Set(rhs); + return *this; + } + + void Set(const FilePath &rhs) { pathname_ = rhs.pathname_; } + + const std::string &string() const { return pathname_; } + const char *c_str() const { return pathname_.c_str(); } + + // Returns the current working directory, or "" if unsuccessful. + static FilePath GetCurrentDir(); + + // Given directory = "dir", base_name = "test", number = 0, + // extension = "xml", returns "dir/test.xml". If number is greater + // than zero (e.g., 12), returns "dir/test_12.xml". + // On Windows platform, uses \ as the separator rather than /. + static FilePath MakeFileName(const FilePath &directory, + const FilePath &base_name, int number, + const char *extension); + + // Given directory = "dir", relative_path = "test.xml", + // returns "dir/test.xml". + // On Windows, uses \ as the separator rather than /. + static FilePath ConcatPaths(const FilePath &directory, + const FilePath &relative_path); + + // Returns a pathname for a file that does not currently exist. The pathname + // will be directory/base_name.extension or + // directory/base_name_.extension if directory/base_name.extension + // already exists. The number will be incremented until a pathname is found + // that does not already exist. + // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. 
+ // There could be a race condition if two or more processes are calling this + // function at the same time -- they could both pick the same filename. + static FilePath GenerateUniqueFileName(const FilePath &directory, + const FilePath &base_name, + const char *extension); + + // Returns true if and only if the path is "". + bool IsEmpty() const { return pathname_.empty(); } + + // If input name has a trailing separator character, removes it and returns + // the name, otherwise return the name string unmodified. + // On Windows platform, uses \ as the separator, other platforms use /. + FilePath RemoveTrailingPathSeparator() const; + + // Returns a copy of the FilePath with the directory part removed. + // Example: FilePath("path/to/file").RemoveDirectoryName() returns + // FilePath("file"). If there is no directory part ("just_a_file"), it returns + // the FilePath unmodified. If there is no file part ("just_a_dir/") it + // returns an empty FilePath (""). + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveDirectoryName() const; + + // RemoveFileName returns the directory path with the filename removed. + // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". + // If the FilePath is "a_file" or "/a_file", RemoveFileName returns + // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does + // not have a file, like "just/a/dir/", it returns the FilePath unmodified. + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveFileName() const; + + // Returns a copy of the FilePath with the case-insensitive extension removed. + // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns + // FilePath("dir/file"). If a case-insensitive extension is not + // found, returns a copy of the original FilePath. + FilePath RemoveExtension(const char *extension) const; + + // Creates directories so that path exists. Returns true if successful or if + // the directories already exist; returns false if unable to create + // directories for any reason. Will also return false if the FilePath does + // not represent a directory (that is, it doesn't end with a path separator). + bool CreateDirectoriesRecursively() const; + + // Create the directory so that path exists. Returns true if successful or + // if the directory already exists; returns false if unable to create the + // directory for any reason, including if the parent directory does not + // exist. Not named "CreateDirectory" because that's a macro on Windows. + bool CreateFolder() const; + + // Returns true if FilePath describes something in the file-system, + // either a file, directory, or whatever, and that something exists. + bool FileOrDirectoryExists() const; + + // Returns true if pathname describes a directory in the file-system + // that exists. + bool DirectoryExists() const; + + // Returns true if FilePath ends with a path separator, which indicates that + // it is intended to represent a directory. Returns false otherwise. + // This does NOT check that a directory (or file) actually exists. + bool IsDirectory() const; + + // Returns true if pathname describes a root directory. (Windows has one + // root directory per disk drive.) + bool IsRootDirectory() const; + + // Returns true if pathname describes an absolute path. + bool IsAbsolutePath() const; + + private: + // Replaces multiple consecutive separators with a single separator. + // For example, "bar///foo" becomes "bar/foo". 
Does not eliminate other
+  // redundancies that might be in a pathname involving "." or "..".
+  //
+  // A pathname with multiple consecutive separators may occur either through
+  // user error or as a result of some scripts or APIs that generate a
+  // pathname with a trailing separator. On other platforms the same API or
+  // script may NOT generate a pathname with a trailing "/". Then elsewhere
+  // that pathname may have another "/" and pathname components added to it,
+  // without checking for the separator already being there.
+  // The script language and operating system may allow paths like "foo//bar"
+  // but some of the functions in FilePath will not handle that correctly. In
+  // particular, RemoveTrailingPathSeparator() only removes one separator, and
+  // it is called in CreateDirectoriesRecursively() assuming that it will
+  // change a pathname from directory syntax (trailing separator) to filename
+  // syntax.
+  //
+  // On Windows this method also replaces the alternate path separator '/'
+  // with the primary path separator '\\', so that for example "bar\\/\\foo"
+  // becomes "bar\\foo".
+
+  void Normalize();
+
+  // Returns a pointer to the last occurrence of a valid path separator in
+  // the FilePath. On Windows, for example, both '/' and '\' are valid path
+  // separators. Returns NULL if no path separator was found.
+  const char *FindLastPathSeparator() const;
+
+  std::string pathname_;
+};  // class FilePath
+
+}  // namespace internal
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  // 4251
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
new file mode 100644
index 000000000..9640aba83
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-internal.h
@@ -0,0 +1,1441 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED.
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This header file declares functions and macros used internally by
+// Google Test. They are subject to change without notice.
+
+// GOOGLETEST_CM0001 DO NOT DELETE
+
+#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_OS_LINUX
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_EXCEPTIONS
+#include <stdexcept>
+#endif
+
+#include <ctype.h>
+#include <float.h>
+#include <string.h>
+#include <cstdint>
+#include <iomanip>
+#include <limits>
+#include <map>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-filepath.h"
+#include "gtest/internal/gtest-string.h"
+#include "gtest/internal/gtest-type-util.h"
+
+// Due to C++ preprocessor weirdness, we need double indirection to
+// concatenate two tokens when one of them is __LINE__. Writing
+//
+// foo ## __LINE__
+//
+// will result in the token foo__LINE__, instead of foo followed by
+// the current line number. For more details, see
+// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
+#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar
+
+// Stringifies its argument.
+// Works around a bug in Visual Studio, which doesn't accept code like this:
+//
+// #define GTEST_STRINGIFY_(name) #name
+// #define MACRO(a, b, c) ... GTEST_STRINGIFY_(a) ...
+// MACRO(, x, y)
+//
+// complaining about the argument to GTEST_STRINGIFY_ being empty.
+// This is allowed by the spec.
+#define GTEST_STRINGIFY_HELPER_(name, ...) #name
+#define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, )
+
+namespace proto2 {
+class Message;
+}
+
+namespace testing {
+
+// Forward declarations.
+
+class AssertionResult;  // Result of an assertion.
+class Message;          // Represents a failure message.
+class Test;             // Represents a test.
+class TestInfo;         // Information about a test.
+class TestPartResult;   // Result of a test part.
+class UnitTest;         // A collection of test suites.
+
+template <typename T>
+::std::string PrintToString(const T &value);
+
+namespace internal {
+
+struct TraceInfo;     // Information about a trace point.
+class TestInfoImpl;   // Opaque implementation of TestInfo
+class UnitTestImpl;   // Opaque implementation of UnitTest
+
+// The text used in failure messages to indicate the start of the
+// stack trace.
+GTEST_API_ extern const char kStackTraceMarker[];
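The double-indirection trick behind GTEST_CONCAT_TOKEN_ above can be demonstrated standalone; the MY_CONCAT names below are illustrative, not gtest's:

```cpp
#include <iostream>

// Direct pasting (a##b) would glue the literal token "__LINE__" onto the
// prefix; routing through a second macro lets __LINE__ expand to a number
// before the paste happens.
#define MY_CONCAT_IMPL(a, b) a##b
#define MY_CONCAT(a, b) MY_CONCAT_IMPL(a, b)

// Expands to something like 'int unique_10 = 0;' depending on the line.
int MY_CONCAT(unique_, __LINE__) = 0;

int main() { std::cout << "token pasted after __LINE__ expansion\n"; }
```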
+
+// An IgnoredValue object can be implicitly constructed from ANY value.
+class IgnoredValue {
+  struct Sink {};
+
+ public:
+  // This constructor template allows any value to be implicitly
+  // converted to IgnoredValue. The object has no data member and
+  // doesn't try to remember anything about the argument. We
+  // deliberately omit the 'explicit' keyword in order to allow the
+  // conversion to be implicit.
+  // Disable the conversion if T already has a magical conversion operator.
+  // Otherwise we get ambiguity.
+  template <typename T,
+            typename std::enable_if<!std::is_convertible<T, Sink>::value,
+                                    int>::type = 0>
+  IgnoredValue(const T & /* ignored */) {}  // NOLINT(runtime/explicit)
+};
+
+// Appends the user-supplied message to the Google-Test-generated message.
+GTEST_API_ std::string AppendUserMessage(const std::string &gtest_msg,
+                                         const Message &user_msg);
+
+#if GTEST_HAS_EXCEPTIONS
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+    4275 /* an exported class was derived from a class that was not exported */)
+
+// This exception is thrown by (and only by) a failed Google Test
+// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
+// are enabled). We derive it from std::runtime_error, which is for
+// errors presumably detectable only at run time. Since
+// std::runtime_error inherits from std::exception, many testing
+// frameworks know how to extract and print the message inside it.
+class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error {
+ public:
+  explicit GoogleTestFailureException(const TestPartResult &failure);
+};
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  // 4275
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+namespace edit_distance {
+// Returns the optimal edits to go from 'left' to 'right'.
+// All edits cost the same, with replace having lower priority than
+// add/remove.
+// Simple implementation of the Wagner-Fischer algorithm.
+// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm
+enum EditType { kMatch, kAdd, kRemove, kReplace };
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<size_t> &left, const std::vector<size_t> &right);
+
+// Same as above, but the input is represented as strings.
+GTEST_API_ std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string> &left,
+    const std::vector<std::string> &right);
+
+// Create a diff of the input strings in Unified diff format.
+GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string> &left,
+                                         const std::vector<std::string> &right,
+                                         size_t context = 2);
+
+}  // namespace edit_distance
+
+// Calculate the diff between 'left' and 'right' and return it in unified diff
+// format.
+// If not null, stores in 'total_line_count' the total number of lines found
+// in left + right.
+GTEST_API_ std::string DiffStrings(const std::string &left,
+                                   const std::string &right,
+                                   size_t *total_line_count);
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings. For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+// expected_expression: "foo"
+// actual_expression: "bar"
+// expected_value: "5"
+// actual_value: "6"
+//
+// The ignoring_case parameter is true if and only if the assertion is a
+// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will
+// be inserted into the message.
+GTEST_API_ AssertionResult EqFailure(const char *expected_expression,
+                                     const char *actual_expression,
+                                     const std::string &expected_value,
+                                     const std::string &actual_value,
+                                     bool ignoring_case);
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+GTEST_API_ std::string GetBoolAssertionFailureMessage(
+    const AssertionResult &assertion_result, const char *expression_text,
+    const char *actual_predicate_value, const char *expected_predicate_value);
+
+// This template class represents an IEEE floating-point number
+// (either single-precision or double-precision, depending on the
+// template parameters).
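For intuition about CalculateOptimalEdits() above, the underlying Wagner-Fischer recurrence fits in a few lines. A minimal distance-only sketch (gtest additionally recovers the full edit sequence to render diffs):

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Classic Wagner-Fischer dynamic program: d[i][j] is the edit distance
// between the first i chars of 'left' and the first j chars of 'right'.
size_t EditDistance(const std::string &left, const std::string &right) {
  std::vector<std::vector<size_t> > d(left.size() + 1,
                                      std::vector<size_t>(right.size() + 1));
  for (size_t i = 0; i <= left.size(); ++i) d[i][0] = i;
  for (size_t j = 0; j <= right.size(); ++j) d[0][j] = j;
  for (size_t i = 1; i <= left.size(); ++i) {
    for (size_t j = 1; j <= right.size(); ++j) {
      const size_t replace = d[i - 1][j - 1] + (left[i - 1] != right[j - 1]);
      d[i][j] = std::min(replace, std::min(d[i - 1][j] + 1, d[i][j - 1] + 1));
    }
  }
  return d[left.size()][right.size()];
}

int main() { std::cout << EditDistance("kitten", "sitting") << "\n"; }  // 3
```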
+// +// The purpose of this class is to do more sophisticated number +// comparison. (Due to round-off error, etc, it's very unlikely that +// two floating-points will be equal exactly. Hence a naive +// comparison by the == operation often doesn't work.) +// +// Format of IEEE floating-point: +// +// The most-significant bit being the leftmost, an IEEE +// floating-point looks like +// +// sign_bit exponent_bits fraction_bits +// +// Here, sign_bit is a single bit that designates the sign of the +// number. +// +// For float, there are 8 exponent bits and 23 fraction bits. +// +// For double, there are 11 exponent bits and 52 fraction bits. +// +// More details can be found at +// http://en.wikipedia.org/wiki/IEEE_floating-point_standard. +// +// Template parameter: +// +// RawType: the raw floating-point type (either float or double) +template +class FloatingPoint { + public: + // Defines the unsigned integer type that has the same size as the + // floating point number. + typedef typename TypeWithSize::UInt Bits; + + // Constants. + + // # of bits in a number. + static const size_t kBitCount = 8 * sizeof(RawType); + + // # of fraction bits in a number. + static const size_t kFractionBitCount = + std::numeric_limits::digits - 1; + + // # of exponent bits in a number. + static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; + + // The mask for the sign bit. + static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); + + // The mask for the fraction bits. + static const Bits kFractionBitMask = ~static_cast(0) >> + (kExponentBitCount + 1); + + // The mask for the exponent bits. + static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); + + // How many ULP's (Units in the Last Place) we want to tolerate when + // comparing two numbers. The larger the value, the more error we + // allow. A 0 value means that two numbers must be exactly the same + // to be considered equal. + // + // The maximum error of a single floating-point operation is 0.5 + // units in the last place. On Intel CPU's, all floating-point + // calculations are done with 80-bit precision, while double has 64 + // bits. Therefore, 4 should be enough for ordinary use. + // + // See the following article for more details on ULP: + // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/ + static const size_t kMaxUlps = 4; + + // Constructs a FloatingPoint from a raw floating-point number. + // + // On an Intel CPU, passing a non-normalized NAN (Not a Number) + // around may change its bits, although the new value is guaranteed + // to be also a NAN. Therefore, don't expect this constructor to + // preserve the bits in x when x is a NAN. + explicit FloatingPoint(const RawType &x) { u_.value_ = x; } + + // Static methods + + // Reinterprets a bit pattern as a floating-point number. + // + // This function is needed to test the AlmostEquals() method. + static RawType ReinterpretBits(const Bits bits) { + FloatingPoint fp(0); + fp.u_.bits_ = bits; + return fp.u_.value_; + } + + // Returns the floating-point number that represent positive infinity. + static RawType Infinity() { return ReinterpretBits(kExponentBitMask); } + + // Returns the maximum representable finite floating-point number. + static RawType Max(); + + // Non-static methods + + // Returns the bits that represents this number. + const Bits &bits() const { return u_.bits_; } + + // Returns the exponent bits of this number. 
+ Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } + + // Returns the fraction bits of this number. + Bits fraction_bits() const { return kFractionBitMask & u_.bits_; } + + // Returns the sign bit of this number. + Bits sign_bit() const { return kSignBitMask & u_.bits_; } + + // Returns true if and only if this is NAN (not a number). + bool is_nan() const { + // It's a NAN if the exponent bits are all ones and the fraction + // bits are not entirely zeros. + return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); + } + + // Returns true if and only if this number is at most kMaxUlps ULP's away + // from rhs. In particular, this function: + // + // - returns false if either number is (or both are) NAN. + // - treats really large numbers as almost equal to infinity. + // - thinks +0.0 and -0.0 are 0 DLP's apart. + bool AlmostEquals(const FloatingPoint &rhs) const { + // The IEEE standard says that any comparison operation involving + // a NAN must return false. + if (is_nan() || rhs.is_nan()) return false; + + return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <= + kMaxUlps; + } + + private: + // The data type used to store the actual floating-point number. + union FloatingPointUnion { + RawType value_; // The raw floating-point number. + Bits bits_; // The bits that represent the number. + }; + + // Converts an integer from the sign-and-magnitude representation to + // the biased representation. More precisely, let N be 2 to the + // power of (kBitCount - 1), an integer x is represented by the + // unsigned number x + N. + // + // For instance, + // + // -N + 1 (the most negative number representable using + // sign-and-magnitude) is represented by 1; + // 0 is represented by N; and + // N - 1 (the biggest number representable using + // sign-and-magnitude) is represented by 2N - 1. + // + // Read http://en.wikipedia.org/wiki/Signed_number_representations + // for more details on signed number representations. + static Bits SignAndMagnitudeToBiased(const Bits &sam) { + if (kSignBitMask & sam) { + // sam represents a negative number. + return ~sam + 1; + } else { + // sam represents a positive number. + return kSignBitMask | sam; + } + } + + // Given two numbers in the sign-and-magnitude representation, + // returns the distance between them as an unsigned number. + static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, + const Bits &sam2) { + const Bits biased1 = SignAndMagnitudeToBiased(sam1); + const Bits biased2 = SignAndMagnitudeToBiased(sam2); + return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); + } + + FloatingPointUnion u_; +}; + +// We cannot use std::numeric_limits::max() as it clashes with the max() +// macro defined by . +template <> +inline float FloatingPoint::Max() { + return FLT_MAX; +} +template <> +inline double FloatingPoint::Max() { + return DBL_MAX; +} + +// Typedefs the instances of the FloatingPoint template class that we +// care to use. +typedef FloatingPoint Float; +typedef FloatingPoint Double; + +// In order to catch the mistake of putting tests that use different +// test fixture classes in the same test suite, we need to assign +// unique IDs to fixture classes and compare them. The TypeId type is +// used to hold such IDs. The user should treat TypeId as an opaque +// type: the only operation allowed on TypeId values is to compare +// them for equality using the == operator. 
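The sign-and-magnitude-to-biased mapping and the ULP distance used by AlmostEquals() above can be sketched standalone for float. This is a minimal illustration using std::memcpy for the bit reinterpretation, not the gtest class:

```cpp
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>

// Maps a sign-and-magnitude bit pattern to a biased (monotonic) integer,
// exactly as SignAndMagnitudeToBiased() above describes.
uint32_t BiasedFromSignAndMagnitude(uint32_t sam) {
  const uint32_t kSignBit = 1u << 31;
  return (sam & kSignBit) ? ~sam + 1 : kSignBit | sam;
}

uint32_t UlpDistance(float a, float b) {
  uint32_t ba, bb;
  std::memcpy(&ba, &a, sizeof(a));  // well-defined bit reinterpretation
  std::memcpy(&bb, &b, sizeof(b));
  const uint32_t x = BiasedFromSignAndMagnitude(ba);
  const uint32_t y = BiasedFromSignAndMagnitude(bb);
  return x >= y ? x - y : y - x;
}

int main() {
  const float a = 1.0f;
  const float b = std::nextafterf(a, 2.0f);       // one representable step up
  std::cout << UlpDistance(a, b) << "\n";         // prints 1
  std::cout << UlpDistance(0.0f, -0.0f) << "\n";  // prints 0
}
```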
+typedef const void *TypeId;
+
+template <typename T>
+class TypeIdHelper {
+ public:
+  // dummy_ must not have a const type. Otherwise an overly eager
+  // compiler (e.g. MSVC 7.1 & 8.0) may try to merge
+  // TypeIdHelper<T>::dummy_ for different Ts as an "optimization".
+  static bool dummy_;
+};
+
+template <typename T>
+bool TypeIdHelper<T>::dummy_ = false;
+
+// GetTypeId<T>() returns the ID of type T. Different values will be
+// returned for different types. Calling the function twice with the
+// same type argument is guaranteed to return the same ID.
+template <typename T>
+TypeId GetTypeId() {
+  // The compiler is required to allocate a different
+  // TypeIdHelper<T>::dummy_ variable for each T used to instantiate
+  // the template. Therefore, the address of dummy_ is guaranteed to
+  // be unique.
+  return &(TypeIdHelper<T>::dummy_);
+}
+
+// Returns the type ID of ::testing::Test. Always call this instead
+// of GetTypeId< ::testing::Test>() to get the type ID of
+// ::testing::Test, as the latter may give the wrong result due to a
+// suspected linker bug when compiling Google Test as a Mac OS X
+// framework.
+GTEST_API_ TypeId GetTestTypeId();
+
+// Defines the abstract factory interface that creates instances
+// of a Test object.
+class TestFactoryBase {
+ public:
+  virtual ~TestFactoryBase() {}
+
+  // Creates a test instance to run. The instance is both created and destroyed
+  // within TestInfoImpl::Run()
+  virtual Test *CreateTest() = 0;
+
+ protected:
+  TestFactoryBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+};
+
+// This class provides an implementation of the TestFactoryBase interface.
+// It is used in TEST and TEST_F macros.
+template <class TestClass>
+class TestFactoryImpl : public TestFactoryBase {
+ public:
+  Test *CreateTest() override { return new TestClass; }
+};
+
+#if GTEST_OS_WINDOWS
+
+// Predicate-formatters for implementing the HRESULT checking macros
+// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}
+// We pass a long instead of HRESULT to avoid causing an
+// include dependency for the HRESULT type.
+GTEST_API_ AssertionResult IsHRESULTSuccess(const char *expr,
+                                            long hr);  // NOLINT
+GTEST_API_ AssertionResult IsHRESULTFailure(const char *expr,
+                                            long hr);  // NOLINT
+
+#endif  // GTEST_OS_WINDOWS
+
+// Types of SetUpTestSuite() and TearDownTestSuite() functions.
+using SetUpTestSuiteFunc = void (*)();
+using TearDownTestSuiteFunc = void (*)();
+
+struct CodeLocation {
+  CodeLocation(const std::string &a_file, int a_line)
+      : file(a_file), line(a_line) {}
+
+  std::string file;
+  int line;
+};
+
+// Helper to identify which setup function for TestCase / TestSuite to call.
+// Only one function is allowed, either TestCase or TestSuite but not both.
+
+// Utility functions to help SuiteApiResolver
+using SetUpTearDownSuiteFuncType = void (*)();
+
+inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull(
+    SetUpTearDownSuiteFuncType a, SetUpTearDownSuiteFuncType def) {
+  return a == def ? nullptr : a;
+}
+
+template <typename T>
+// Note that SuiteApiResolver inherits from T because
+// SetUpTestSuite()/TearDownTestSuite() could be protected. This way
+// SuiteApiResolver can access them.
+struct SuiteApiResolver : T {
+  // testing::Test is only forward declared at this point. So we make it a
+  // dependent class for the compiler to be OK with it.
+ using Test = + typename std::conditional::type; + + static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char *filename, + int line_num) { + SetUpTearDownSuiteFuncType test_case_fp = + GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase); + SetUpTearDownSuiteFuncType test_suite_fp = + GetNotDefaultOrNull(&T::SetUpTestSuite, &Test::SetUpTestSuite); + + GTEST_CHECK_(!test_case_fp || !test_suite_fp) + << "Test can not provide both SetUpTestSuite and SetUpTestCase, please " + "make sure there is only one present at " + << filename << ":" << line_num; + + return test_case_fp != nullptr ? test_case_fp : test_suite_fp; + } + + static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char *filename, + int line_num) { + SetUpTearDownSuiteFuncType test_case_fp = + GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase); + SetUpTearDownSuiteFuncType test_suite_fp = + GetNotDefaultOrNull(&T::TearDownTestSuite, &Test::TearDownTestSuite); + + GTEST_CHECK_(!test_case_fp || !test_suite_fp) + << "Test can not provide both TearDownTestSuite and TearDownTestCase," + " please make sure there is only one present at" + << filename << ":" << line_num; + + return test_case_fp != nullptr ? test_case_fp : test_suite_fp; + } +}; + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_suite_name: name of the test suite +// name: name of the test +// type_param the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. +// value_param text representation of the test's value parameter, +// or NULL if this is not a type-parameterized test. +// code_location: code location where the test is defined +// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test suite +// tear_down_tc: pointer to the function that tears down the test suite +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +GTEST_API_ TestInfo *MakeAndRegisterTestInfo( + const char *test_suite_name, const char *name, const char *type_param, + const char *value_param, CodeLocation code_location, + TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc, + TearDownTestSuiteFunc tear_down_tc, TestFactoryBase *factory); + +// If *pstr starts with the given prefix, modifies *pstr to be right +// past the prefix and returns true; otherwise leaves *pstr unchanged +// and returns false. None of pstr, *pstr, and prefix can be NULL. +GTEST_API_ bool SkipPrefix(const char *prefix, const char **pstr); + +#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + +// State of the definition of a type-parameterized test suite. +class GTEST_API_ TypedTestSuitePState { + public: + TypedTestSuitePState() : registered_(false) {} + + // Adds the given test name to defined_test_names_ and return true + // if the test suite hasn't been registered; otherwise aborts the + // program. 
+ bool AddTestName(const char *file, int line, const char *case_name, + const char *test_name) { + if (registered_) { + fprintf(stderr, + "%s Test %s must be defined before " + "REGISTER_TYPED_TEST_SUITE_P(%s, ...).\n", + FormatFileLocation(file, line).c_str(), test_name, case_name); + fflush(stderr); + posix::Abort(); + } + registered_tests_.insert( + ::std::make_pair(test_name, CodeLocation(file, line))); + return true; + } + + bool TestExists(const std::string &test_name) const { + return registered_tests_.count(test_name) > 0; + } + + const CodeLocation &GetCodeLocation(const std::string &test_name) const { + RegisteredTestsMap::const_iterator it = registered_tests_.find(test_name); + GTEST_CHECK_(it != registered_tests_.end()); + return it->second; + } + + // Verifies that registered_tests match the test names in + // defined_test_names_; returns registered_tests if successful, or + // aborts the program otherwise. + const char *VerifyRegisteredTestNames(const char *test_suite_name, + const char *file, int line, + const char *registered_tests); + + private: + typedef ::std::map RegisteredTestsMap; + + bool registered_; + RegisteredTestsMap registered_tests_; +}; + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +using TypedTestCasePState = TypedTestSuitePState; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + +// Skips to the first non-space char after the first comma in 'str'; +// returns NULL if no comma is found in 'str'. +inline const char *SkipComma(const char *str) { + const char *comma = strchr(str, ','); + if (comma == nullptr) { + return nullptr; + } + while (IsSpace(*(++comma))) { + } + return comma; +} + +// Returns the prefix of 'str' before the first comma in it; returns +// the entire string if it contains no comma. +inline std::string GetPrefixUntilComma(const char *str) { + const char *comma = strchr(str, ','); + return comma == nullptr ? str : std::string(str, comma); +} + +// Splits a given string on a given delimiter, populating a given +// vector with the fields. +void SplitString(const ::std::string &str, char delimiter, + ::std::vector<::std::string> *dest); + +// The default argument to the template below for the case when the user does +// not provide a name generator. +struct DefaultNameGenerator { + template + static std::string GetName(int i) { + return StreamableToString(i); + } +}; + +template +struct NameGeneratorSelector { + typedef Provided type; +}; + +template +void GenerateNamesRecursively(internal::None, std::vector *, int) { +} + +template +void GenerateNamesRecursively(Types, std::vector *result, int i) { + result->push_back(NameGenerator::template GetName(i)); + GenerateNamesRecursively(typename Types::Tail(), result, + i + 1); +} + +template +std::vector GenerateNames() { + std::vector result; + GenerateNamesRecursively(Types(), &result, 0); + return result; +} + +// TypeParameterizedTest::Register() +// registers a list of type-parameterized tests with Google Test. The +// return value is insignificant - we just need to return something +// such that we can call this function in a namespace scope. +// +// Implementation note: The GTEST_TEMPLATE_ macro declares a template +// template parameter. It's defined in gtest-type-util.h. +template +class TypeParameterizedTest { + public: + // 'index' is the index of the test in the type list 'Types' + // specified in INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, TestSuite, + // Types). 
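SkipComma() and GetPrefixUntilComma() above drive how a REGISTER_TYPED_TEST_SUITE_P comma-separated name list is consumed one test at a time. A standalone sketch of that walk, using a plain space check where gtest uses IsSpace():

```cpp
#include <cstring>
#include <iostream>
#include <string>

// Stand-ins for the helpers above: peel "DoesFoo, DoesBar, DoesBaz"
// into individual names.
std::string PrefixUntilComma(const char *str) {
  const char *comma = std::strchr(str, ',');
  return comma == nullptr ? std::string(str) : std::string(str, comma);
}

const char *SkipPastComma(const char *str) {
  const char *comma = std::strchr(str, ',');
  if (comma == nullptr) return nullptr;
  while (*(++comma) == ' ') {
  }
  return comma;
}

int main() {
  for (const char *names = "DoesFoo, DoesBar, DoesBaz"; names != nullptr;
       names = SkipPastComma(names)) {
    std::cout << PrefixUntilComma(names) << "\n";
  }
}
```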
Valid values for 'index' are [0, N - 1] where N is the + // length of Types. + static bool Register(const char *prefix, const CodeLocation &code_location, + const char *case_name, const char *test_names, int index, + const std::vector &type_names = + GenerateNames()) { + typedef typename Types::Head Type; + typedef Fixture FixtureClass; + typedef typename GTEST_BIND_(TestSel, Type) TestClass; + + // First, registers the first type-parameterized test in the type + // list. + MakeAndRegisterTestInfo( + (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + + "/" + type_names[static_cast(index)]) + .c_str(), + StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(), + GetTypeName().c_str(), + nullptr, // No value parameter. + code_location, GetTypeId(), + SuiteApiResolver::GetSetUpCaseOrSuite( + code_location.file.c_str(), code_location.line), + SuiteApiResolver::GetTearDownCaseOrSuite( + code_location.file.c_str(), code_location.line), + new TestFactoryImpl); + + // Next, recurses (at compile time) with the tail of the type list. + return TypeParameterizedTest::Register(prefix, + code_location, + case_name, + test_names, + index + 1, + type_names); + } +}; + +// The base case for the compile time recursion. +template +class TypeParameterizedTest { + public: + static bool Register(const char * /*prefix*/, const CodeLocation &, + const char * /*case_name*/, const char * /*test_names*/, + int /*index*/, + const std::vector & = + std::vector() /*type_names*/) { + return true; + } +}; + +GTEST_API_ void RegisterTypeParameterizedTestSuite(const char *test_suite_name, + CodeLocation code_location); +GTEST_API_ void RegisterTypeParameterizedTestSuiteInstantiation( + const char *case_name); + +// TypeParameterizedTestSuite::Register() +// registers *all combinations* of 'Tests' and 'Types' with Google +// Test. The return value is insignificant - we just need to return +// something such that we can call this function in a namespace scope. +template +class TypeParameterizedTestSuite { + public: + static bool Register(const char *prefix, CodeLocation code_location, + const TypedTestSuitePState *state, const char *case_name, + const char *test_names, + const std::vector &type_names = + GenerateNames()) { + RegisterTypeParameterizedTestSuiteInstantiation(case_name); + std::string test_name = + StripTrailingSpaces(GetPrefixUntilComma(test_names)); + if (!state->TestExists(test_name)) { + fprintf(stderr, "Failed to get code location for test %s.%s at %s.", + case_name, test_name.c_str(), + FormatFileLocation(code_location.file.c_str(), code_location.line) + .c_str()); + fflush(stderr); + posix::Abort(); + } + const CodeLocation &test_location = state->GetCodeLocation(test_name); + + typedef typename Tests::Head Head; + + // First, register the first test in 'Test' for each type in 'Types'. + TypeParameterizedTest::Register( + prefix, test_location, case_name, test_names, 0, type_names); + + // Next, recurses (at compile time) with the tail of the test list. + return TypeParameterizedTestSuite::Register(prefix, code_location, + state, case_name, + SkipComma(test_names), + type_names); + } +}; + +// The base case for the compile time recursion. 
+template +class TypeParameterizedTestSuite { + public: + static bool Register(const char * /*prefix*/, const CodeLocation &, + const TypedTestSuitePState * /*state*/, + const char * /*case_name*/, const char * /*test_names*/, + const std::vector & = + std::vector() /*type_names*/) { + return true; + } +}; + +#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in +// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. +GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest *unit_test, + int skip_count); + +// Helpers for suppressing warnings on unreachable code or constant +// condition. + +// Always returns true. +GTEST_API_ bool AlwaysTrue(); + +// Always returns false. +inline bool AlwaysFalse() { return !AlwaysTrue(); } + +// Helper for suppressing false warning from Clang on a const char* +// variable declared in a conditional expression always being NULL in +// the else branch. +struct GTEST_API_ ConstCharPtr { + ConstCharPtr(const char *str) : value(str) {} + operator bool() const { return true; } + const char *value; +}; + +// Helper for declaring std::string within 'if' statement +// in pre C++17 build environment. +struct TrueWithString { + TrueWithString() = default; + explicit TrueWithString(const char *str) : value(str) {} + explicit TrueWithString(const std::string &str) : value(str) {} + explicit operator bool() const { return true; } + std::string value; +}; + +// A simple Linear Congruential Generator for generating random +// numbers with a uniform distribution. Unlike rand() and srand(), it +// doesn't use global state (and therefore can't interfere with user +// code). Unlike rand_r(), it's portable. An LCG isn't very random, +// but it's good enough for our purposes. +class GTEST_API_ Random { + public: + static const uint32_t kMaxRange = 1u << 31; + + explicit Random(uint32_t seed) : state_(seed) {} + + void Reseed(uint32_t seed) { state_ = seed; } + + // Generates a random number from [0, range). Crashes if 'range' is + // 0 or greater than kMaxRange. + uint32_t Generate(uint32_t range); + + private: + uint32_t state_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); +}; + +// Turns const U&, U&, const U, and U all into U. +#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ + typename std::remove_const::type>::type + +// IsAProtocolMessage::value is a compile-time bool constant that's +// true if and only if T is type proto2::Message or a subclass of it. +template +struct IsAProtocolMessage + : public std::is_convertible {}; + +// When the compiler sees expression IsContainerTest(0), if C is an +// STL-style container class, the first overload of IsContainerTest +// will be viable (since both C::iterator* and C::const_iterator* are +// valid types and NULL can be implicitly converted to them). It will +// be picked over the second overload as 'int' is a perfect match for +// the type of argument 0. If C::iterator or C::const_iterator is not +// a valid type, the first overload is not viable, and the second +// overload will be picked. 
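The Random class above is only declared here; its Generate() lives elsewhere in gtest. As a rough illustration, a linear congruential generator with the classic rand()-style constants behaves like this (the multiplier and increment below are assumptions for the sketch, not gtest's actual constants):

```cpp
#include <cstdint>
#include <iostream>

// Self-contained LCG in the same spirit as internal::Random: no global
// state, portable, "random enough" for shuffling tests.
class Lcg {
 public:
  static const uint32_t kMaxRange = 1u << 31;
  explicit Lcg(uint32_t seed) : state_(seed) {}
  // Callers must keep 0 < range <= kMaxRange, as Random::Generate demands.
  uint32_t Generate(uint32_t range) {
    state_ = 1103515245u * state_ + 12345u;  // classic LCG step (assumed)
    return (state_ % kMaxRange) % range;
  }

 private:
  uint32_t state_;
};

int main() {
  Lcg rng(42);
  for (int i = 0; i < 3; ++i) std::cout << rng.Generate(100) << " ";
  std::cout << "\n";
}
```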
Therefore, we can determine whether C is +// a container class by checking the type of IsContainerTest(0). +// The value of the expression is insignificant. +// +// In C++11 mode we check the existence of a const_iterator and that an +// iterator is properly implemented for the container. +// +// For pre-C++11 that we look for both C::iterator and C::const_iterator. +// The reason is that C++ injects the name of a class as a member of the +// class itself (e.g. you can refer to class iterator as either +// 'iterator' or 'iterator::iterator'). If we look for C::iterator +// only, for example, we would mistakenly think that a class named +// iterator is an STL container. +// +// Also note that the simpler approach of overloading +// IsContainerTest(typename C::const_iterator*) and +// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. +typedef int IsContainer; +template ().begin()), + class = decltype(::std::declval().end()), + class = decltype(++::std::declval()), + class = decltype(*::std::declval()), + class = typename C::const_iterator> +IsContainer IsContainerTest(int /* dummy */) { + return 0; +} + +typedef char IsNotContainer; +template +IsNotContainer IsContainerTest(long /* dummy */) { + return '\0'; +} + +// Trait to detect whether a type T is a hash table. +// The heuristic used is that the type contains an inner type `hasher` and does +// not contain an inner type `reverse_iterator`. +// If the container is iterable in reverse, then order might actually matter. +template +struct IsHashTable { + private: + template + static char test(typename U::hasher *, typename U::reverse_iterator *); + template + static int test(typename U::hasher *, ...); + template + static char test(...); + + public: + static const bool value = sizeof(test(nullptr, nullptr)) == sizeof(int); +}; + +template +const bool IsHashTable::value; + +template (0)) == sizeof(IsContainer)> +struct IsRecursiveContainerImpl; + +template +struct IsRecursiveContainerImpl : public std::false_type {}; + +// Since the IsRecursiveContainerImpl depends on the IsContainerTest we need to +// obey the same inconsistencies as the IsContainerTest, namely check if +// something is a container is relying on only const_iterator in C++11 and +// is relying on both const_iterator and iterator otherwise +template +struct IsRecursiveContainerImpl { + using value_type = decltype(*std::declval()); + using type = + std::is_same::type>::type, + C>; +}; + +// IsRecursiveContainer is a unary compile-time predicate that +// evaluates whether C is a recursive container type. A recursive container +// type is a container type whose value_type is equal to the container type +// itself. An example for a recursive container type is +// boost::filesystem::path, whose iterator has a value_type that is equal to +// boost::filesystem::path. +template +struct IsRecursiveContainer : public IsRecursiveContainerImpl::type {}; + +// Utilities for native arrays. + +// ArrayEq() compares two k-dimensional native arrays using the +// elements' operator==, where k can be any integer >= 0. When k is +// 0, ArrayEq() degenerates into comparing a single pair of values. + +template +bool ArrayEq(const T *lhs, size_t size, const U *rhs); + +// This generic version is used when k is 0. +template +inline bool ArrayEq(const T &lhs, const U &rhs) { + return lhs == rhs; +} + +// This overload is used when k >= 1. 
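The ArrayEq() overload set described above peels one array dimension per recursion step until it reaches scalar comparison. An isolated sketch of the same pattern:

```cpp
#include <cstddef>
#include <iostream>

// Base case: scalar comparison via operator==.
template <typename T, typename U>
bool Eq(const T &lhs, const U &rhs) {
  return lhs == rhs;
}

// Array case: the more specialized overload recurses element-wise,
// peeling one dimension per call.
template <typename T, typename U, size_t N>
bool Eq(const T (&lhs)[N], const U (&rhs)[N]) {
  for (size_t i = 0; i != N; i++) {
    if (!Eq(lhs[i], rhs[i])) return false;
  }
  return true;
}

int main() {
  int a[2][3] = {{1, 2, 3}, {4, 5, 6}};
  int b[2][3] = {{1, 2, 3}, {4, 5, 6}};
  std::cout << Eq(a, b) << "\n";  // 1
}
```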
+template +inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) { + return internal::ArrayEq(lhs, N, rhs); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous ArrayEq() function, arrays with different sizes would +// lead to different copies of the template code. +template +bool ArrayEq(const T *lhs, size_t size, const U *rhs) { + for (size_t i = 0; i != size; i++) { + if (!internal::ArrayEq(lhs[i], rhs[i])) return false; + } + return true; +} + +// Finds the first element in the iterator range [begin, end) that +// equals elem. Element may be a native array type itself. +template +Iter ArrayAwareFind(Iter begin, Iter end, const Element &elem) { + for (Iter it = begin; it != end; ++it) { + if (internal::ArrayEq(*it, elem)) return it; + } + return end; +} + +// CopyArray() copies a k-dimensional native array using the elements' +// operator=, where k can be any integer >= 0. When k is 0, +// CopyArray() degenerates into copying a single value. + +template +void CopyArray(const T *from, size_t size, U *to); + +// This generic version is used when k is 0. +template +inline void CopyArray(const T &from, U *to) { + *to = from; +} + +// This overload is used when k >= 1. +template +inline void CopyArray(const T (&from)[N], U (*to)[N]) { + internal::CopyArray(from, N, *to); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous CopyArray() function, arrays with different sizes +// would lead to different copies of the template code. +template +void CopyArray(const T *from, size_t size, U *to) { + for (size_t i = 0; i != size; i++) { + internal::CopyArray(from[i], to + i); + } +} + +// The relation between an NativeArray object (see below) and the +// native array it represents. +// We use 2 different structs to allow non-copyable types to be used, as long +// as RelationToSourceReference() is passed. +struct RelationToSourceReference {}; +struct RelationToSourceCopy {}; + +// Adapts a native array to a read-only STL-style container. Instead +// of the complete STL container concept, this adaptor only implements +// members useful for Google Mock's container matchers. New members +// should be added as needed. To simplify the implementation, we only +// support Element being a raw type (i.e. having no top-level const or +// reference modifier). It's the client's responsibility to satisfy +// this requirement. Element can be an array type itself (hence +// multi-dimensional arrays are supported). +template +class NativeArray { + public: + // STL-style container typedefs. + typedef Element value_type; + typedef Element *iterator; + typedef const Element *const_iterator; + + // Constructs from a native array. References the source. + NativeArray(const Element *array, size_t count, RelationToSourceReference) { + InitRef(array, count); + } + + // Constructs from a native array. Copies the source. + NativeArray(const Element *array, size_t count, RelationToSourceCopy) { + InitCopy(array, count); + } + + // Copy constructor. + NativeArray(const NativeArray &rhs) { + (this->*rhs.clone_)(rhs.array_, rhs.size_); + } + + ~NativeArray() { + if (clone_ != &NativeArray::InitRef) delete[] array_; + } + + // STL-style container methods. 
+ size_t size() const { return size_; } + const_iterator begin() const { return array_; } + const_iterator end() const { return array_ + size_; } + bool operator==(const NativeArray &rhs) const { + return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin()); + } + + private: + static_assert(!std::is_const::value, "Type must not be const"); + static_assert(!std::is_reference::value, + "Type must not be a reference"); + + // Initializes this object with a copy of the input. + void InitCopy(const Element *array, size_t a_size) { + Element *const copy = new Element[a_size]; + CopyArray(array, a_size, copy); + array_ = copy; + size_ = a_size; + clone_ = &NativeArray::InitCopy; + } + + // Initializes this object with a reference of the input. + void InitRef(const Element *array, size_t a_size) { + array_ = array; + size_ = a_size; + clone_ = &NativeArray::InitRef; + } + + const Element *array_; + size_t size_; + void (NativeArray::*clone_)(const Element *, size_t); + + GTEST_DISALLOW_ASSIGN_(NativeArray); +}; + +// Backport of std::index_sequence. +template +struct IndexSequence { + using type = IndexSequence; +}; + +// Double the IndexSequence, and one if plus_one is true. +template +struct DoubleSequence; +template +struct DoubleSequence, sizeofT> { + using type = IndexSequence; +}; +template +struct DoubleSequence, sizeofT> { + using type = IndexSequence; +}; + +// Backport of std::make_index_sequence. +// It uses O(ln(N)) instantiation depth. +template +struct MakeIndexSequence + : DoubleSequence::type, + N / 2>::type {}; + +template <> +struct MakeIndexSequence<0> : IndexSequence<> {}; + +template +struct Ignore { + Ignore(...); // NOLINT +}; + +template +struct ElemFromListImpl; +template +struct ElemFromListImpl> { + // We make Ignore a template to solve a problem with MSVC. + // A non-template Ignore would work fine with `decltype(Ignore(I))...`, but + // MSVC doesn't understand how to deal with that pack expansion. + // Use `0 * I` to have a single instantiation of Ignore. + template + static R Apply(Ignore<0 * I>..., R (*)(), ...); +}; + +template +struct ElemFromList { + using type = + decltype(ElemFromListImpl::type>::Apply( + static_cast(nullptr)...)); +}; + +template +class FlatTuple; + +template +struct FlatTupleElemBase; + +template +struct FlatTupleElemBase, I> { + using value_type = typename ElemFromList::type; + FlatTupleElemBase() = default; + explicit FlatTupleElemBase(value_type t) : value(std::move(t)) {} + value_type value; +}; + +template +struct FlatTupleBase; + +template +struct FlatTupleBase, IndexSequence> + : FlatTupleElemBase, Idx>... { + using Indices = IndexSequence; + FlatTupleBase() = default; + explicit FlatTupleBase(T... t) + : FlatTupleElemBase, Idx>(std::move(t))... {} +}; + +// Analog to std::tuple but with different tradeoffs. +// This class minimizes the template instantiation depth, thus allowing more +// elements than std::tuple would. std::tuple has been seen to require an +// instantiation depth of more than 10x the number of elements in some +// implementations. +// FlatTuple and ElemFromList are not recursive and have a fixed depth +// regardless of T... +// MakeIndexSequence, on the other hand, it is recursive but with an +// instantiation depth of O(ln(N)). +template +class FlatTuple + : private FlatTupleBase, + typename MakeIndexSequence::type> { + using Indices = typename FlatTupleBase< + FlatTuple, typename MakeIndexSequence::type>::Indices; + + public: + FlatTuple() = default; + explicit FlatTuple(T... 
t) : FlatTuple::FlatTupleBase(std::move(t)...) {} + + template + const typename ElemFromList::type &Get() const { + return static_cast *>(this)->value; + } + + template + typename ElemFromList::type &Get() { + return static_cast *>(this)->value; + } +}; + +// Utility functions to be called with static_assert to induce deprecation +// warnings. +GTEST_INTERNAL_DEPRECATED( + "INSTANTIATE_TEST_CASE_P is deprecated, please use " + "INSTANTIATE_TEST_SUITE_P") +constexpr bool InstantiateTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "TYPED_TEST_CASE_P is deprecated, please use " + "TYPED_TEST_SUITE_P") +constexpr bool TypedTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "TYPED_TEST_CASE is deprecated, please use " + "TYPED_TEST_SUITE") +constexpr bool TypedTestCaseIsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "REGISTER_TYPED_TEST_CASE_P is deprecated, please use " + "REGISTER_TYPED_TEST_SUITE_P") +constexpr bool RegisterTypedTestCase_P_IsDeprecated() { return true; } + +GTEST_INTERNAL_DEPRECATED( + "INSTANTIATE_TYPED_TEST_CASE_P is deprecated, please use " + "INSTANTIATE_TYPED_TEST_SUITE_P") +constexpr bool InstantiateTypedTestCase_P_IsDeprecated() { return true; } + +} // namespace internal +} // namespace testing + +#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ + ::testing::internal::AssertHelper(result_type, file, line, message) = \ + ::testing::Message() + +#define GTEST_MESSAGE_(message, result_type) \ + GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) + +#define GTEST_FATAL_FAILURE_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure) + +#define GTEST_NONFATAL_FAILURE_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure) + +#define GTEST_SUCCESS_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) + +#define GTEST_SKIP_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kSkip) + +// Suppress MSVC warning 4072 (unreachable code) for the code following +// statement if it returns or throws (or doesn't return or throw in some +// situations). +#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ + if (::testing::internal::AlwaysTrue()) { \ + statement; \ + } + +#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::ConstCharPtr gtest_msg = "") { \ + bool gtest_caught_expected = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (expected_exception const &) { \ + gtest_caught_expected = true; \ + } catch (...) 
{ \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws a different type."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + if (!gtest_caught_expected) { \ + gtest_msg.value = "Expected: " #statement \ + " throws an exception of type " #expected_exception \ + ".\n Actual: it throws nothing."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__) \ + : fail(gtest_msg.value) + +#if GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (std::exception const &e) { \ + gtest_msg.value = \ + ("it throws std::exception-derived exception with description: \""); \ + gtest_msg.value += e.what(); \ + gtest_msg.value += "\"."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } + +#else // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() + +#endif // GTEST_HAS_EXCEPTIONS + +#define GTEST_TEST_NO_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::TrueWithString gtest_msg{}) { \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \ + catch (...) { \ + gtest_msg.value = "it throws."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__) \ + : fail(("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: " + \ + gtest_msg.value) \ + .c_str()) + +#define GTEST_TEST_ANY_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + bool gtest_caught_any = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (...) { \ + gtest_caught_any = true; \ + } \ + if (!gtest_caught_any) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__) \ + : fail("Expected: " #statement \ + " throws an exception.\n" \ + " Actual: it doesn't.") + +// Implements Boolean test assertions such as EXPECT_TRUE. expression can be +// either a boolean expression or an AssertionResult. text is a textual +// represenation of expression as it was passed into the EXPECT_TRUE. +#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar_ = \ + ::testing::AssertionResult(expression)) \ + ; \ + else \ + fail(::testing::internal::GetBoolAssertionFailureMessage( \ + gtest_ar_, text, #actual, #expected) \ + .c_str()) + +#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__) \ + : fail("Expected: " #statement \ + " doesn't generate new fatal " \ + "failures in the current thread.\n" \ + " Actual: it does.") + +// Expands to the name of the class that implements the given test. 
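The naming scheme implemented by the macro defined just below is simple token pasting. A stand-in macro mirroring GTEST_TEST_CLASS_NAME_ shows the resulting identifier:

```cpp
#include <iostream>

// GTEST_TEST_CLASS_NAME_(Suite, Name)-style pasting: the two identifiers
// are joined with an underscore and given a _Test suffix.
#define TEST_CLASS_NAME(suite, name) suite##_##name##_Test

class TEST_CLASS_NAME(FooSuite, DoesBar) {};  // class FooSuite_DoesBar_Test

int main() {
  FooSuite_DoesBar_Test instance;
  (void)instance;
  std::cout << "FooSuite_DoesBar_Test defined\n";
}
```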
+#define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + test_suite_name##_##test_name##_Test + +// Helper macro for defining tests. +#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \ + static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \ + "test_suite_name must not be empty"); \ + static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \ + "test_name must not be empty"); \ + class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \ + : public parent_class { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {} \ + ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)); \ + GTEST_DISALLOW_MOVE_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)); \ + \ + private: \ + void TestBody() override; \ + static ::testing::TestInfo *const test_info_ GTEST_ATTRIBUTE_UNUSED_; \ + }; \ + \ + ::testing::TestInfo *const GTEST_TEST_CLASS_NAME_(test_suite_name, \ + test_name)::test_info_ = \ + ::testing::internal::MakeAndRegisterTestInfo( \ + #test_suite_name, #test_name, nullptr, nullptr, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \ + ::testing::internal::SuiteApiResolver< \ + parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \ + new ::testing::internal::TestFactoryImpl); \ + void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody() + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h new file mode 100644 index 000000000..0d8fc71ce --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-param-util.h @@ -0,0 +1,922 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Type and function utilities for implementing parameterized tests. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/internal/gtest-internal.h" +#include "gtest/internal/gtest-port.h" +#include "gtest/gtest-printers.h" +#include "gtest/gtest-test-part.h" + +namespace testing { +// Input to a parameterized test name generator, describing a test parameter. +// Consists of the parameter value and the integer parameter index. +template +struct TestParamInfo { + TestParamInfo(const ParamType &a_param, size_t an_index) + : param(a_param), index(an_index) {} + ParamType param; + size_t index; +}; + +// A builtin parameterized test name generator which returns the result of +// testing::PrintToString. +struct PrintToStringParamName { + template + std::string operator()(const TestParamInfo &info) const { + return PrintToString(info.param); + } +}; + +namespace internal { + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// Utility Functions + +// Outputs a message explaining invalid registration of different +// fixture class for the same test suite. This may happen when +// TEST_P macro is used to define two tests with the same name +// but in different namespaces. +GTEST_API_ void ReportInvalidTestSuiteType(const char *test_suite_name, + CodeLocation code_location); + +template +class ParamGeneratorInterface; +template +class ParamGenerator; + +// Interface for iterating over elements provided by an implementation +// of ParamGeneratorInterface. +template +class ParamIteratorInterface { + public: + virtual ~ParamIteratorInterface() {} + // A pointer to the base generator instance. + // Used only for the purposes of iterator comparison + // to make sure that two iterators belong to the same generator. + virtual const ParamGeneratorInterface *BaseGenerator() const = 0; + // Advances iterator to point to the next element + // provided by the generator. The caller is responsible + // for not calling Advance() on an iterator equal to + // BaseGenerator()->End(). + virtual void Advance() = 0; + // Clones the iterator object. Used for implementing copy semantics + // of ParamIterator. + virtual ParamIteratorInterface *Clone() const = 0; + // Dereferences the current iterator and provides (read-only) access + // to the pointed value. It is the caller's responsibility not to call + // Current() on an iterator equal to BaseGenerator()->End(). + // Used for implementing ParamGenerator::operator*(). + virtual const T *Current() const = 0; + // Determines whether the given iterator and other point to the same + // element in the sequence generated by the generator. + // Used for implementing ParamGenerator::operator==(). 
+ virtual bool Equals(const ParamIteratorInterface &other) const = 0; +}; + +// Class iterating over elements provided by an implementation of +// ParamGeneratorInterface. It wraps ParamIteratorInterface +// and implements the const forward iterator concept. +template +class ParamIterator { + public: + typedef T value_type; + typedef const T &reference; + typedef ptrdiff_t difference_type; + + // ParamIterator assumes ownership of the impl_ pointer. + ParamIterator(const ParamIterator &other) : impl_(other.impl_->Clone()) {} + ParamIterator &operator=(const ParamIterator &other) { + if (this != &other) impl_.reset(other.impl_->Clone()); + return *this; + } + + const T &operator*() const { return *impl_->Current(); } + const T *operator->() const { return impl_->Current(); } + // Prefix version of operator++. + ParamIterator &operator++() { + impl_->Advance(); + return *this; + } + // Postfix version of operator++. + ParamIterator operator++(int /*unused*/) { + ParamIteratorInterface *clone = impl_->Clone(); + impl_->Advance(); + return ParamIterator(clone); + } + bool operator==(const ParamIterator &other) const { + return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); + } + bool operator!=(const ParamIterator &other) const { + return !(*this == other); + } + + private: + friend class ParamGenerator; + explicit ParamIterator(ParamIteratorInterface *impl) : impl_(impl) {} + std::unique_ptr> impl_; +}; + +// ParamGeneratorInterface is the binary interface to access generators +// defined in other translation units. +template +class ParamGeneratorInterface { + public: + typedef T ParamType; + + virtual ~ParamGeneratorInterface() {} + + // Generator interface definition + virtual ParamIteratorInterface *Begin() const = 0; + virtual ParamIteratorInterface *End() const = 0; +}; + +// Wraps ParamGeneratorInterface and provides general generator syntax +// compatible with the STL Container concept. +// This class implements copy initialization semantics and the contained +// ParamGeneratorInterface instance is shared among all copies +// of the original object. This is possible because that instance is immutable. +template +class ParamGenerator { + public: + typedef ParamIterator iterator; + + explicit ParamGenerator(ParamGeneratorInterface *impl) : impl_(impl) {} + ParamGenerator(const ParamGenerator &other) : impl_(other.impl_) {} + + ParamGenerator &operator=(const ParamGenerator &other) { + impl_ = other.impl_; + return *this; + } + + iterator begin() const { return iterator(impl_->Begin()); } + iterator end() const { return iterator(impl_->End()); } + + private: + std::shared_ptr> impl_; +}; + +// Generates values from a range of two comparable values. Can be used to +// generate sequences of user-defined types that implement operator+() and +// operator<(). +// This class is used in the Range() function. 
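The RangeGenerator that follows precomputes its end index by stepping through the sequence once, so Range(begin, end, step) yields begin, begin+step, ... strictly below end. The counting rule in isolation, assuming int parameters:

```cpp
#include <iostream>

// Mirrors the generator's end-index computation: count how many elements
// the range produces before reaching 'end'.
int CountRangeElements(int begin, int end, int step) {
  int count = 0;
  for (int i = begin; i < end; i += step) ++count;
  return count;
}

int main() {
  std::cout << CountRangeElements(0, 10, 3) << "\n";  // 4: 0, 3, 6, 9
}
```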
+template +class RangeGenerator : public ParamGeneratorInterface { + public: + RangeGenerator(T begin, T end, IncrementT step) + : begin_(begin), end_(end), step_(step), + end_index_(CalculateEndIndex(begin, end, step)) {} + ~RangeGenerator() override {} + + ParamIteratorInterface *Begin() const override { + return new Iterator(this, begin_, 0, step_); + } + ParamIteratorInterface *End() const override { + return new Iterator(this, end_, end_index_, step_); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface *base, T value, int index, + IncrementT step) + : base_(base), value_(value), index_(index), step_(step) {} + ~Iterator() override {} + + const ParamGeneratorInterface *BaseGenerator() const override { + return base_; + } + void Advance() override { + value_ = static_cast(value_ + step_); + index_++; + } + ParamIteratorInterface *Clone() const override { + return new Iterator(*this); + } + const T *Current() const override { return &value_; } + bool Equals(const ParamIteratorInterface &other) const override { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const int other_index = + CheckedDowncastToActualType(&other)->index_; + return index_ == other_index; + } + + private: + Iterator(const Iterator &other) + : ParamIteratorInterface(), base_(other.base_), value_(other.value_), + index_(other.index_), step_(other.step_) {} + + // No implementation - assignment is unsupported. + void operator=(const Iterator &other); + + const ParamGeneratorInterface *const base_; + T value_; + int index_; + const IncrementT step_; + }; // class RangeGenerator::Iterator + + static int CalculateEndIndex(const T &begin, const T &end, + const IncrementT &step) { + int end_index = 0; + for (T i = begin; i < end; i = static_cast(i + step)) end_index++; + return end_index; + } + + // No implementation - assignment is unsupported. + void operator=(const RangeGenerator &other); + + const T begin_; + const T end_; + const IncrementT step_; + // The index for the end() iterator. All the elements in the generated + // sequence are indexed (0-based) to aid iterator comparison. + const int end_index_; +}; // class RangeGenerator + +// Generates values from a pair of STL-style iterators. Used in the +// ValuesIn() function. The elements are copied from the source range +// since the source can be located on the stack, and the generator +// is likely to persist beyond that stack frame. 
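The generator defined next copies the elements out of the iterator range precisely because the source may be a stack-local container that dies before the tests run. The same idea in miniature (plain vector copy, not the gtest class):

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

// The stack array goes out of scope here, but the returned copy is safe
// to iterate later -- the property ValuesIn() relies on.
std::vector<int> MakeGeneratorBackingStore() {
  int values[] = {2, 3, 5};  // stack storage, like a ValuesIn() argument
  return std::vector<int>(values, values + 3);
}

int main() {
  const std::vector<int> params = MakeGeneratorBackingStore();
  for (size_t i = 0; i < params.size(); ++i) std::cout << params[i] << "\n";
}
```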
+template <typename T>
+class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
+ public:
+  template <typename ForwardIterator>
+  ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end)
+      : container_(begin, end) {}
+  ~ValuesInIteratorRangeGenerator() override {}
+
+  ParamIteratorInterface<T> *Begin() const override {
+    return new Iterator(this, container_.begin());
+  }
+  ParamIteratorInterface<T> *End() const override {
+    return new Iterator(this, container_.end());
+  }
+
+ private:
+  typedef typename ::std::vector<T> ContainerType;
+
+  class Iterator : public ParamIteratorInterface<T> {
+   public:
+    Iterator(const ParamGeneratorInterface<T> *base,
+             typename ContainerType::const_iterator iterator)
+        : base_(base), iterator_(iterator) {}
+    ~Iterator() override {}
+
+    const ParamGeneratorInterface<T> *BaseGenerator() const override {
+      return base_;
+    }
+    void Advance() override {
+      ++iterator_;
+      value_.reset();
+    }
+    ParamIteratorInterface<T> *Clone() const override {
+      return new Iterator(*this);
+    }
+    // We need to use the cached value referenced by iterator_ because
+    // *iterator_ can return a temporary object (and of a type other than T),
+    // so just having "return &*iterator_;" doesn't work.
+    // value_ is updated here and not in Advance() because Advance()
+    // can advance iterator_ beyond the end of the range, and we cannot
+    // detect that fact. The client code, on the other hand, is
+    // responsible for not calling Current() on an out-of-range iterator.
+    const T *Current() const override {
+      if (value_.get() == nullptr) value_.reset(new T(*iterator_));
+      return value_.get();
+    }
+    bool Equals(const ParamIteratorInterface<T> &other) const override {
+      // Having the same base generator guarantees that the other
+      // iterator is of the same type and we can downcast.
+      GTEST_CHECK_(BaseGenerator() == other.BaseGenerator())
+          << "The program attempted to compare iterators "
+          << "from different generators." << std::endl;
+      return iterator_ ==
+             CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+    }
+
+   private:
+    Iterator(const Iterator &other)
+        // The explicit constructor call suppresses a false warning
+        // emitted by gcc when supplied with the -Wextra option.
+        : ParamIteratorInterface<T>(), base_(other.base_),
+          iterator_(other.iterator_) {}
+
+    const ParamGeneratorInterface<T> *const base_;
+    typename ContainerType::const_iterator iterator_;
+    // A cached value of *iterator_. We keep it here to allow access by
+    // pointer in the wrapping iterator's operator->().
+    // value_ needs to be mutable to be accessed in Current().
+    // Use of std::unique_ptr helps manage the cached value's lifetime,
+    // which is bound by the lifespan of the iterator itself.
+    mutable std::unique_ptr<const T> value_;
+  };  // class ValuesInIteratorRangeGenerator::Iterator
+
+  // No implementation - assignment is unsupported.
+  void operator=(const ValuesInIteratorRangeGenerator &other);
+
+  const ContainerType container_;
+};  // class ValuesInIteratorRangeGenerator
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Default parameterized test name generator, returns a string containing the
+// integer test parameter index.
+template <class ParamType>
+std::string DefaultParamName(const TestParamInfo<ParamType> &info) {
+  Message name_stream;
+  name_stream << info.index;
+  return name_stream.GetString();
+}
+
+template <typename T = void>
+void TestNotEmpty() {
+  static_assert(sizeof(T) == 0, "Empty arguments are not allowed.");
+}
+template <typename T = void>
+void TestNotEmpty(const T &) {}
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Stores a parameter value and later creates tests parameterized with that
+// value.
+template <class TestClass>
+class ParameterizedTestFactory : public TestFactoryBase {
+ public:
+  typedef typename TestClass::ParamType ParamType;
+  explicit ParameterizedTestFactory(ParamType parameter)
+      : parameter_(parameter) {}
+  Test *CreateTest() override {
+    TestClass::SetParam(&parameter_);
+    return new TestClass();
+  }
+
+ private:
+  const ParamType parameter_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactoryBase is a base class for meta-factories that create
+// test factories for passing into the MakeAndRegisterTestInfo function.
+template <class ParamType>
+class TestMetaFactoryBase {
+ public:
+  virtual ~TestMetaFactoryBase() {}
+
+  virtual TestFactoryBase *CreateTestFactory(ParamType parameter) = 0;
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// TestMetaFactory creates test factories for passing into the
+// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives
+// ownership of the test factory pointer, the same factory object cannot be
+// passed into that method twice. But ParameterizedTestSuiteInfo is going to
+// call it for each Test/Parameter value combination. Thus it needs a meta
+// factory creator class.
+template <class TestSuite>
+class TestMetaFactory
+    : public TestMetaFactoryBase<typename TestSuite::ParamType> {
+ public:
+  using ParamType = typename TestSuite::ParamType;
+
+  TestMetaFactory() {}
+
+  TestFactoryBase *CreateTestFactory(ParamType parameter) override {
+    return new ParameterizedTestFactory<TestSuite>(parameter);
+  }
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestSuiteInfoBase is a generic interface
+// to ParameterizedTestSuiteInfo classes. ParameterizedTestSuiteInfoBase
+// accumulates test information provided by TEST_P macro invocations
+// and generators provided by INSTANTIATE_TEST_SUITE_P macro invocations
+// and uses that information to register all resulting test instances
+// in the RegisterTests method. The ParameterizedTestSuiteRegistry class holds
+// a collection of pointers to the ParameterizedTestSuiteInfo objects
+// and calls RegisterTests() on each of them when asked.
+class ParameterizedTestSuiteInfoBase {
+ public:
+  virtual ~ParameterizedTestSuiteInfoBase() {}
+
+  // Base part of test suite name for display purposes.
+  virtual const std::string &GetTestSuiteName() const = 0;
+  // Test suite id to verify identity.
+  virtual TypeId GetTestSuiteTypeId() const = 0;
+  // UnitTest class invokes this method to register tests in this
+  // test suite right before running them in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestSuiteInfoBase derived class.
+  virtual void RegisterTests() = 0;
+
+ protected:
+  ParameterizedTestSuiteInfoBase() {}
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase);
+};
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Reports the name of a test suite as safe to ignore
+// as the side effect of construction of this type.
+struct MarkAsIgnored {
+  explicit MarkAsIgnored(const char *test_suite);
+};
+
+GTEST_API_ void InsertSyntheticTestCase(const std::string &name,
+                                        CodeLocation location, bool has_test_p);
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// ParameterizedTestSuiteInfo accumulates tests obtained from TEST_P
+// macro invocations for a particular test suite and generators
+// obtained from INSTANTIATE_TEST_SUITE_P macro invocations for that
+// test suite. It registers tests with all values generated by all
+// generators when asked.
+template <class TestSuite>
+class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
+ public:
+  // ParamType and GeneratorCreationFunc are private types but are required
+  // for declarations of public methods AddTestPattern() and
+  // AddTestSuiteInstantiation().
+  using ParamType = typename TestSuite::ParamType;
+  // A function that returns an instance of the appropriate generator type.
+  typedef ParamGenerator<ParamType>(GeneratorCreationFunc)();
+  using ParamNameGeneratorFunc = std::string(const TestParamInfo<ParamType> &);
+
+  explicit ParameterizedTestSuiteInfo(const char *name,
+                                      CodeLocation code_location)
+      : test_suite_name_(name), code_location_(code_location) {}
+
+  // Test suite base name for display purposes.
+  const std::string &GetTestSuiteName() const override {
+    return test_suite_name_;
+  }
+  // Test suite id to verify identity.
+  TypeId GetTestSuiteTypeId() const override { return GetTypeId<TestSuite>(); }
+  // TEST_P macro uses AddTestPattern() to record information
+  // about a single test in a LocalTestInfo structure.
+  // test_suite_name is the base name of the test suite (without invocation
+  // prefix). test_base_name is the name of an individual test without
+  // parameter index. For the test SequenceA/FooTest.DoBar/1, FooTest is
+  // the test suite base name and DoBar is the test base name.
+  void AddTestPattern(const char *test_suite_name, const char *test_base_name,
+                      TestMetaFactoryBase<ParamType> *meta_factory) {
+    tests_.push_back(std::shared_ptr<TestInfo>(
+        new TestInfo(test_suite_name, test_base_name, meta_factory)));
+  }
+  // INSTANTIATE_TEST_SUITE_P macro uses AddTestSuiteInstantiation() to record
+  // information about a generator.
+  int AddTestSuiteInstantiation(const std::string &instantiation_name,
+                                GeneratorCreationFunc *func,
+                                ParamNameGeneratorFunc *name_func,
+                                const char *file, int line) {
+    instantiations_.push_back(
+        InstantiationInfo(instantiation_name, func, name_func, file, line));
+    return 0;  // Return value used only to run this method in namespace scope.
+  }
+  // UnitTest class invokes this method to register tests in this test suite
+  // right before running tests in RUN_ALL_TESTS macro.
+  // This method should not be called more than once on any single
+  // instance of a ParameterizedTestSuiteInfoBase derived class.
+  // UnitTest has a guard to prevent from calling this method more than once.
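+  //
+  // For illustration (a sketch; FooTest is a hypothetical TEST_P fixture):
+  // given
+  //
+  //   TEST_P(FooTest, DoBar) { ... }
+  //   INSTANTIATE_TEST_SUITE_P(SequenceA, FooTest, testing::Values(1, 2));
+  //
+  // RegisterTests() registers SequenceA/FooTest.DoBar/0 and
+  // SequenceA/FooTest.DoBar/1; the default name generator appends the
+  // zero-based parameter index.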
+  void RegisterTests() override {
+    bool generated_instantiations = false;
+
+    for (typename TestInfoContainer::iterator test_it = tests_.begin();
+         test_it != tests_.end(); ++test_it) {
+      std::shared_ptr<TestInfo> test_info = *test_it;
+      for (typename InstantiationContainer::iterator gen_it =
+               instantiations_.begin();
+           gen_it != instantiations_.end(); ++gen_it) {
+        const std::string &instantiation_name = gen_it->name;
+        ParamGenerator<ParamType> generator((*gen_it->generator)());
+        ParamNameGeneratorFunc *name_func = gen_it->name_func;
+        const char *file = gen_it->file;
+        int line = gen_it->line;
+
+        std::string test_suite_name;
+        if (!instantiation_name.empty())
+          test_suite_name = instantiation_name + "/";
+        test_suite_name += test_info->test_suite_base_name;
+
+        size_t i = 0;
+        std::set<std::string> test_param_names;
+        for (typename ParamGenerator<ParamType>::iterator param_it =
+                 generator.begin();
+             param_it != generator.end(); ++param_it, ++i) {
+          generated_instantiations = true;
+
+          Message test_name_stream;
+
+          std::string param_name =
+              name_func(TestParamInfo<ParamType>(*param_it, i));
+
+          GTEST_CHECK_(IsValidParamName(param_name))
+              << "Parameterized test name '" << param_name
+              << "' is invalid, in " << file << " line " << line << std::endl;
+
+          GTEST_CHECK_(test_param_names.count(param_name) == 0)
+              << "Duplicate parameterized test name '" << param_name
+              << "', in " << file << " line " << line << std::endl;
+
+          test_param_names.insert(param_name);
+
+          if (!test_info->test_base_name.empty()) {
+            test_name_stream << test_info->test_base_name << "/";
+          }
+          test_name_stream << param_name;
+          MakeAndRegisterTestInfo(
+              test_suite_name.c_str(), test_name_stream.GetString().c_str(),
+              nullptr,  // No type parameter.
+              PrintToString(*param_it).c_str(), code_location_,
+              GetTestSuiteTypeId(),
+              SuiteApiResolver<TestSuite>::GetSetUpCaseOrSuite(file, line),
+              SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line),
+              test_info->test_meta_factory->CreateTestFactory(*param_it));
+        }  // for param_it
+      }    // for gen_it
+    }      // for test_it
+
+    if (!generated_instantiations) {
+      // There are no generators, or they all generate nothing ...
+      InsertSyntheticTestCase(GetTestSuiteName(), code_location_,
+                              !tests_.empty());
+    }
+  }  // RegisterTests
+
+ private:
+  // LocalTestInfo structure keeps information about a single test registered
+  // with TEST_P macro.
+ struct TestInfo { + TestInfo(const char *a_test_suite_base_name, const char *a_test_base_name, + TestMetaFactoryBase *a_test_meta_factory) + : test_suite_base_name(a_test_suite_base_name), + test_base_name(a_test_base_name), + test_meta_factory(a_test_meta_factory) {} + + const std::string test_suite_base_name; + const std::string test_base_name; + const std::unique_ptr> test_meta_factory; + }; + using TestInfoContainer = ::std::vector>; + // Records data received from INSTANTIATE_TEST_SUITE_P macros: + // + struct InstantiationInfo { + InstantiationInfo(const std::string &name_in, + GeneratorCreationFunc *generator_in, + ParamNameGeneratorFunc *name_func_in, const char *file_in, + int line_in) + : name(name_in), generator(generator_in), name_func(name_func_in), + file(file_in), line(line_in) {} + + std::string name; + GeneratorCreationFunc *generator; + ParamNameGeneratorFunc *name_func; + const char *file; + int line; + }; + typedef ::std::vector InstantiationContainer; + + static bool IsValidParamName(const std::string &name) { + // Check for empty string + if (name.empty()) return false; + + // Check for invalid characters + for (std::string::size_type index = 0; index < name.size(); ++index) { + if (!isalnum(name[index]) && name[index] != '_') return false; + } + + return true; + } + + const std::string test_suite_name_; + CodeLocation code_location_; + TestInfoContainer tests_; + InstantiationContainer instantiations_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfo); +}; // class ParameterizedTestSuiteInfo + +// Legacy API is deprecated but still available +#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_ +template +using ParameterizedTestCaseInfo = ParameterizedTestSuiteInfo; +#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestSuiteRegistry contains a map of +// ParameterizedTestSuiteInfoBase classes accessed by test suite names. TEST_P +// and INSTANTIATE_TEST_SUITE_P macros use it to locate their corresponding +// ParameterizedTestSuiteInfo descriptors. +class ParameterizedTestSuiteRegistry { + public: + ParameterizedTestSuiteRegistry() {} + ~ParameterizedTestSuiteRegistry() { + for (auto &test_suite_info : test_suite_infos_) { + delete test_suite_info; + } + } + + // Looks up or creates and returns a structure containing information about + // tests and instantiations of a particular test suite. + template + ParameterizedTestSuiteInfo *GetTestSuitePatternHolder( + const char *test_suite_name, CodeLocation code_location) { + ParameterizedTestSuiteInfo *typed_test_info = nullptr; + for (auto &test_suite_info : test_suite_infos_) { + if (test_suite_info->GetTestSuiteName() == test_suite_name) { + if (test_suite_info->GetTestSuiteTypeId() != GetTypeId()) { + // Complain about incorrect usage of Google Test facilities + // and terminate the program since we cannot guaranty correct + // test suite setup and tear-down in this case. + ReportInvalidTestSuiteType(test_suite_name, code_location); + posix::Abort(); + } else { + // At this point we are sure that the object we found is of the same + // type we are looking for, so we downcast it to that type + // without further checks. 
+          typed_test_info = CheckedDowncastToActualType<
+              ParameterizedTestSuiteInfo<TestSuite>>(test_suite_info);
+        }
+        break;
+      }
+    }
+    if (typed_test_info == nullptr) {
+      typed_test_info = new ParameterizedTestSuiteInfo<TestSuite>(
+          test_suite_name, code_location);
+      test_suite_infos_.push_back(typed_test_info);
+    }
+    return typed_test_info;
+  }
+  void RegisterTests() {
+    for (auto &test_suite_info : test_suite_infos_) {
+      test_suite_info->RegisterTests();
+    }
+  }
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  template <class TestCase>
+  ParameterizedTestCaseInfo<TestCase> *GetTestCasePatternHolder(
+      const char *test_case_name, CodeLocation code_location) {
+    return GetTestSuitePatternHolder<TestCase>(test_case_name, code_location);
+  }
+
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+ private:
+  using TestSuiteInfoContainer =
+      ::std::vector<ParameterizedTestSuiteInfoBase *>;
+
+  TestSuiteInfoContainer test_suite_infos_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry);
+};
+
+// Keeps track of which type-parameterized test suites are defined and
+// where, as well as which are instantiated. This allows subsequently
+// identifying suites that are defined but never used.
+class TypeParameterizedTestSuiteRegistry {
+ public:
+  // Add a suite definition.
+  void RegisterTestSuite(const char *test_suite_name,
+                         CodeLocation code_location);
+
+  // Add an instantiation of a suite.
+  void RegisterInstantiation(const char *test_suite_name);
+
+  // For each suite reported as defined but not reported as instantiated,
+  // emit a test that reports that fact (configurably, as an error).
+  void CheckForInstantiations();
+
+ private:
+  struct TypeParameterizedTestSuiteInfo {
+    explicit TypeParameterizedTestSuiteInfo(CodeLocation c)
+        : code_location(c), instantiated(false) {}
+
+    CodeLocation code_location;
+    bool instantiated;
+  };
+
+  std::map<std::string, TypeParameterizedTestSuiteInfo> suites_;
+};
+
+}  // namespace internal
+
+// Forward declaration of ValuesIn(), which is implemented in
+// include/gtest/gtest-param-test.h.
+template <class Container>
+internal::ParamGenerator<typename Container::value_type> ValuesIn(
+    const Container &container);
+
+namespace internal {
+// Used in the Values() function to provide polymorphic capabilities.
+
+template <typename... Ts>
+class ValueArray {
+ public:
+  ValueArray(Ts... v) : v_{ std::move(v)... } {}
+
+  template <typename T>
+  operator ParamGenerator<T>() const {  // NOLINT
+    return ValuesIn(MakeVector<T>(MakeIndexSequence<sizeof...(Ts)>()));
+  }
+
+ private:
+  template <typename T, size_t... I>
+  std::vector<T> MakeVector(IndexSequence<I...>) const {
+    return std::vector<T>{ static_cast<T>(v_.template Get<I>())... };
+  }
+
+  FlatTuple<Ts...> v_;
+};
+
+template <typename... T>
+class CartesianProductGenerator
+    : public ParamGeneratorInterface<::std::tuple<T...>> {
+ public:
+  typedef ::std::tuple<T...> ParamType;
+
+  CartesianProductGenerator(const std::tuple<ParamGenerator<T>...> &g)
+      : generators_(g) {}
+  ~CartesianProductGenerator() override {}
+
+  ParamIteratorInterface<ParamType> *Begin() const override {
+    return new Iterator(this, generators_, false);
+  }
+  ParamIteratorInterface<ParamType> *End() const override {
+    return new Iterator(this, generators_, true);
+  }
+
+ private:
+  template <class I>
+  class IteratorImpl;
+  template <size_t... I>
+  class IteratorImpl<IndexSequence<I...>>
+      : public ParamIteratorInterface<ParamType> {
+   public:
+    IteratorImpl(const ParamGeneratorInterface<ParamType> *base,
+                 const std::tuple<ParamGenerator<T>...> &generators,
+                 bool is_end)
+        : base_(base), begin_(std::get<I>(generators).begin()...),
+          end_(std::get<I>(generators).end()...),
+          current_(is_end ?
end_ : begin_) { + ComputeCurrentValue(); + } + ~IteratorImpl() override {} + + const ParamGeneratorInterface *BaseGenerator() const override { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + void Advance() override { + assert(!AtEnd()); + // Advance the last iterator. + ++std::get(current_); + // if that reaches end, propagate that up. + AdvanceIfEnd(); + ComputeCurrentValue(); + } + ParamIteratorInterface *Clone() const override { + return new IteratorImpl(*this); + } + + const ParamType *Current() const override { return current_value_.get(); } + + bool Equals(const ParamIteratorInterface &other) const override { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const IteratorImpl *typed_other = + CheckedDowncastToActualType(&other); + + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + if (AtEnd() && typed_other->AtEnd()) return true; + + bool same = true; + bool dummy[] = { (same = same && + std::get(current_) == + std::get(typed_other->current_))... }; + (void)dummy; + return same; + } + + private: + template + void AdvanceIfEnd() { + if (std::get(current_) != std::get(end_)) return; + + bool last = ThisI == 0; + if (last) { + // We are done. Nothing else to propagate. + return; + } + + constexpr size_t NextI = ThisI - (ThisI != 0); + std::get(current_) = std::get(begin_); + ++std::get(current_); + AdvanceIfEnd(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = std::make_shared(*std::get(current_)...); + } + bool AtEnd() const { + bool at_end = false; + bool dummy[] = { (at_end = at_end || std::get(current_) == + std::get(end_))... }; + (void)dummy; + return at_end; + } + + const ParamGeneratorInterface *const base_; + std::tuple::iterator...> begin_; + std::tuple::iterator...> end_; + std::tuple::iterator...> current_; + std::shared_ptr current_value_; + }; + + using Iterator = IteratorImpl::type>; + + std::tuple...> generators_; +}; + +template +class CartesianProductHolder { + public: + CartesianProductHolder(const Gen &... g) : generators_(g...) {} + template + operator ParamGenerator<::std::tuple>() const { + return ParamGenerator<::std::tuple>( + new CartesianProductGenerator(generators_)); + } + + private: + std::tuple generators_; +}; + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h new file mode 100644 index 000000000..f803a19be --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port-arch.h @@ -0,0 +1,111 @@ +// Copyright 2015, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file defines the GTEST_OS_* macro. +// It is separate from gtest-port.h so that custom/gtest-port.h can include it. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ + +// Determines the platform on which Google Test is compiled. +#ifdef __CYGWIN__ +#define GTEST_OS_CYGWIN 1 +#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) +#define GTEST_OS_WINDOWS_MINGW 1 +#define GTEST_OS_WINDOWS 1 +#elif defined _WIN32 +#define GTEST_OS_WINDOWS 1 +#ifdef _WIN32_WCE +#define GTEST_OS_WINDOWS_MOBILE 1 +#elif defined(WINAPI_FAMILY) +#include +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define GTEST_OS_WINDOWS_DESKTOP 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP) +#define GTEST_OS_WINDOWS_PHONE 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +#define GTEST_OS_WINDOWS_RT 1 +#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) +#define GTEST_OS_WINDOWS_PHONE 1 +#define GTEST_OS_WINDOWS_TV_TITLE 1 +#else +// WINAPI_FAMILY defined but no known partition matched. +// Default to desktop. 
+#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif +#else +#define GTEST_OS_WINDOWS_DESKTOP 1 +#endif // _WIN32_WCE +#elif defined __OS2__ +#define GTEST_OS_OS2 1 +#elif defined __APPLE__ +#define GTEST_OS_MAC 1 +#if TARGET_OS_IPHONE +#define GTEST_OS_IOS 1 +#endif +#elif defined __DragonFly__ +#define GTEST_OS_DRAGONFLY 1 +#elif defined __FreeBSD__ +#define GTEST_OS_FREEBSD 1 +#elif defined __Fuchsia__ +#define GTEST_OS_FUCHSIA 1 +#elif defined(__GLIBC__) && defined(__FreeBSD_kernel__) +#define GTEST_OS_GNU_KFREEBSD 1 +#elif defined __linux__ +#define GTEST_OS_LINUX 1 +#if defined __ANDROID__ +#define GTEST_OS_LINUX_ANDROID 1 +#endif +#elif defined __MVS__ +#define GTEST_OS_ZOS 1 +#elif defined(__sun) && defined(__SVR4) +#define GTEST_OS_SOLARIS 1 +#elif defined(_AIX) +#define GTEST_OS_AIX 1 +#elif defined(__hpux) +#define GTEST_OS_HPUX 1 +#elif defined __native_client__ +#define GTEST_OS_NACL 1 +#elif defined __NetBSD__ +#define GTEST_OS_NETBSD 1 +#elif defined __OpenBSD__ +#define GTEST_OS_OPENBSD 1 +#elif defined __QNX__ +#define GTEST_OS_QNX 1 +#elif defined(__HAIKU__) +#define GTEST_OS_HAIKU 1 +#elif defined ESP8266 +#define GTEST_OS_ESP8266 1 +#elif defined ESP32 +#define GTEST_OS_ESP32 1 +#endif // __CYGWIN__ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h new file mode 100644 index 000000000..083da569f --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-port.h @@ -0,0 +1,2232 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Low-level types and utilities for porting Google Test to various +// platforms. All macros ending with _ and symbols defined in an +// internal namespace are subject to change without notice. Code +// outside Google Test MUST NOT USE THEM DIRECTLY. 
Macros that don't +// end with _ are part of Google Test's public API and can be used by +// code outside Google Test. +// +// This file is fundamental to Google Test. All other Google Test source +// files are expected to #include this. Therefore, it cannot #include +// any other Google Test header. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ + +// Environment-describing macros +// ----------------------------- +// +// Google Test can be used in many different environments. Macros in +// this section tell Google Test what kind of environment it is being +// used in, such that Google Test can provide environment-specific +// features and implementations. +// +// Google Test tries to automatically detect the properties of its +// environment, so users usually don't need to worry about these +// macros. However, the automatic detection is not perfect. +// Sometimes it's necessary for a user to define some of the following +// macros in the build script to override Google Test's decisions. +// +// If the user doesn't define a macro in the list, Google Test will +// provide a default definition. After this header is #included, all +// macros in this list will be defined to either 1 or 0. +// +// Notes to maintainers: +// - Each macro here is a user-tweakable knob; do not grow the list +// lightly. +// - Use #if to key off these macros. Don't use #ifdef or "#if +// defined(...)", which will not work as these macros are ALWAYS +// defined. +// +// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) +// is/isn't available. +// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions +// are enabled. +// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular +// expressions are/aren't available. +// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that +// is/isn't available. +// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't +// enabled. +// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that +// std::wstring does/doesn't work (Google Test can +// be used where std::wstring is unavailable). +// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the +// compiler supports Microsoft's "Structured +// Exception Handling". +// GTEST_HAS_STREAM_REDIRECTION +// - Define it to 1/0 to indicate whether the +// platform supports I/O stream redirection using +// dup() and dup2(). +// GTEST_LINKED_AS_SHARED_LIBRARY +// - Define to 1 when compiling tests that use +// Google Test as a shared library (known as +// DLL on Windows). +// GTEST_CREATE_SHARED_LIBRARY +// - Define to 1 when compiling Google Test itself +// as a shared library. +// GTEST_DEFAULT_DEATH_TEST_STYLE +// - The default value of --gtest_death_test_style. +// The legacy default has been "fast" in the open +// source version since 2008. The recommended value +// is "threadsafe", and can be set in +// custom/gtest-port.h. + +// Platform-indicating macros +// -------------------------- +// +// Macros indicating the platform on which Google Test is being used +// (a macro is defined to 1 if compiled on the given platform; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. 
+// +// GTEST_OS_AIX - IBM AIX +// GTEST_OS_CYGWIN - Cygwin +// GTEST_OS_DRAGONFLY - DragonFlyBSD +// GTEST_OS_FREEBSD - FreeBSD +// GTEST_OS_FUCHSIA - Fuchsia +// GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD +// GTEST_OS_HAIKU - Haiku +// GTEST_OS_HPUX - HP-UX +// GTEST_OS_LINUX - Linux +// GTEST_OS_LINUX_ANDROID - Google Android +// GTEST_OS_MAC - Mac OS X +// GTEST_OS_IOS - iOS +// GTEST_OS_NACL - Google Native Client (NaCl) +// GTEST_OS_NETBSD - NetBSD +// GTEST_OS_OPENBSD - OpenBSD +// GTEST_OS_OS2 - OS/2 +// GTEST_OS_QNX - QNX +// GTEST_OS_SOLARIS - Sun Solaris +// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) +// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop +// GTEST_OS_WINDOWS_MINGW - MinGW +// GTEST_OS_WINDOWS_MOBILE - Windows Mobile +// GTEST_OS_WINDOWS_PHONE - Windows Phone +// GTEST_OS_WINDOWS_RT - Windows Store App/WinRT +// GTEST_OS_ZOS - z/OS +// +// Among the platforms, Cygwin, Linux, Mac OS X, and Windows have the +// most stable support. Since core members of the Google Test project +// don't have access to other platforms, support for them may be less +// stable. If you notice any problems on your platform, please notify +// googletestframework@googlegroups.com (patches for fixing them are +// even more welcome!). +// +// It is possible that none of the GTEST_OS_* macros are defined. + +// Feature-indicating macros +// ------------------------- +// +// Macros indicating which Google Test features are available (a macro +// is defined to 1 if the corresponding feature is supported; +// otherwise UNDEFINED -- it's never defined to 0.). Google Test +// defines these macros automatically. Code outside Google Test MUST +// NOT define them. +// +// These macros are public so that portable tests can be written. +// Such tests typically surround code using a feature with an #if +// which controls that code. For example: +// +// #if GTEST_HAS_DEATH_TEST +// EXPECT_DEATH(DoSomethingDeadly()); +// #endif +// +// GTEST_HAS_DEATH_TEST - death tests +// GTEST_HAS_TYPED_TEST - typed tests +// GTEST_HAS_TYPED_TEST_P - type-parameterized tests +// GTEST_IS_THREADSAFE - Google Test is thread-safe. +// GOOGLETEST_CM0007 DO NOT DELETE +// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with +// GTEST_HAS_POSIX_RE (see above) which users can +// define themselves. +// GTEST_USES_SIMPLE_RE - our own simple regex is used; +// the above RE\b(s) are mutually exclusive. + +// Misc public macros +// ------------------ +// +// GTEST_FLAG(flag_name) - references the variable corresponding to +// the given Google Test flag. + +// Internal utilities +// ------------------ +// +// The following macros and utilities are for Google Test's INTERNAL +// use only. Code outside Google Test MUST NOT USE THEM DIRECTLY. +// +// Macros for basic C++ coding: +// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. +// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a +// variable don't have to be used. +// GTEST_DISALLOW_ASSIGN_ - disables copy operator=. +// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. +// GTEST_DISALLOW_MOVE_ASSIGN_ - disables move operator=. +// GTEST_DISALLOW_MOVE_AND_ASSIGN_ - disables move ctor and operator=. +// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. +// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is +// suppressed (constant conditional). +// GTEST_INTENTIONAL_CONST_COND_POP_ - finish code section where MSVC C4127 +// is suppressed. 
+// GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher or +// Matcher +// specializations. +// +// Synchronization: +// Mutex, MutexLock, ThreadLocal, GetThreadCount() +// - synchronization primitives. +// +// Regular expressions: +// RE - a simple regular expression class using the POSIX +// Extended Regular Expression syntax on UNIX-like platforms +// GOOGLETEST_CM0008 DO NOT DELETE +// or a reduced regular exception syntax on other +// platforms, including Windows. +// Logging: +// GTEST_LOG_() - logs messages at the specified severity level. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. +// +// Stdout and stderr capturing: +// CaptureStdout() - starts capturing stdout. +// GetCapturedStdout() - stops capturing stdout and returns the captured +// string. +// CaptureStderr() - starts capturing stderr. +// GetCapturedStderr() - stops capturing stderr and returns the captured +// string. +// +// Integer types: +// TypeWithSize - maps an integer to a int type. +// TimeInMillis - integers of known sizes. +// BiggestInt - the biggest signed integer type. +// +// Command-line utilities: +// GTEST_DECLARE_*() - declares a flag. +// GTEST_DEFINE_*() - defines a flag. +// GetInjectableArgvs() - returns the command line as a vector of strings. +// +// Environment variable utilities: +// GetEnv() - gets the value of an environment variable. +// BoolFromGTestEnv() - parses a bool environment variable. +// Int32FromGTestEnv() - parses an int32_t environment variable. +// StringFromGTestEnv() - parses a string environment variable. +// +// Deprecation warnings: +// GTEST_INTERNAL_DEPRECATED(message) - attribute marking a function as +// deprecated; calling a marked function +// should generate a compiler warning + +#include // for isspace, etc +#include // for ptrdiff_t +#include +#include +#include +#include +#include +#include + +#ifndef _WIN32_WCE +#include +#include +#endif // !_WIN32_WCE + +#if defined __APPLE__ +#include +#include +#endif + +#include // NOLINT +#include +#include // NOLINT +#include +#include // NOLINT + +#include "gtest/internal/custom/gtest-port.h" +#include "gtest/internal/gtest-port-arch.h" + +#if !defined(GTEST_DEV_EMAIL_) +#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" +#define GTEST_FLAG_PREFIX_ "gtest_" +#define GTEST_FLAG_PREFIX_DASH_ "gtest-" +#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" +#define GTEST_NAME_ "Google Test" +#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/" +#endif // !defined(GTEST_DEV_EMAIL_) + +#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_) +#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest" +#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_) + +// Determines the version of gcc that is used to compile this. +#ifdef __GNUC__ +// 40302 means version 4.3.2. +#define GTEST_GCC_VER_ \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#endif // __GNUC__ + +// Macros for disabling Microsoft Visual C++ warnings. 
+// +// GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385) +// /* code that triggers warnings C4800 and C4385 */ +// GTEST_DISABLE_MSC_WARNINGS_POP_() +#if defined(_MSC_VER) +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ + __pragma(warning(push)) __pragma(warning(disable : warnings)) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop)) +#else +// Not all compilers are MSVC +#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) +#define GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + +// Clang on Windows does not understand MSVC's pragma warning. +// We need clang-specific way to disable function deprecation warning. +#ifdef __clang__ +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop") +#else +#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + +// Brings in definitions for functions used in the testing::internal::posix +// namespace (read, write, close, chdir, isatty, stat). We do not currently +// use them on Windows Mobile. +#if GTEST_OS_WINDOWS +#if !GTEST_OS_WINDOWS_MOBILE +#include +#include +#endif +// In order to avoid having to include , use forward declaration +#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR) +// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two +// separate (equivalent) structs, instead of using typedef +typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#else +// Assume CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION. +// This assumption is verified by +// WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION. +typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#endif +#else +// This assumes that non-Windows OSes provide unistd.h. For OSes where this +// is not the case, we need to include headers that provide the functions +// mentioned above. +#include +#include +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_LINUX_ANDROID +// Used to define __ANDROID_API__ matching the target NDK API level. +#include // NOLINT +#endif + +// Defines this to true if and only if Google Test can use POSIX regular +// expressions. +#ifndef GTEST_HAS_POSIX_RE +#if GTEST_OS_LINUX_ANDROID +// On Android, is only available starting with Gingerbread. +#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9) +#else +#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) +#endif +#endif + +#if GTEST_USES_PCRE +// The appropriate headers have already been included. + +#elif GTEST_HAS_POSIX_RE + +// On some platforms, needs someone to define size_t, and +// won't compile otherwise. We can #include it here as we already +// included , which is guaranteed to define size_t through +// . +#include // NOLINT + +#define GTEST_USES_POSIX_RE 1 + +#elif GTEST_OS_WINDOWS + +// is not available on Windows. Use our own simple regex +// implementation instead. +#define GTEST_USES_SIMPLE_RE 1 + +#else + +// may not be available on this platform. Use our own +// simple regex implementation instead. +#define GTEST_USES_SIMPLE_RE 1 + +#endif // GTEST_USES_PCRE + +#ifndef GTEST_HAS_EXCEPTIONS +// The user didn't tell us whether exceptions are enabled, so we need +// to figure it out. 
+#if defined(_MSC_VER) && defined(_CPPUNWIND)
+// MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__BORLANDC__)
+// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS
+// macro to enable exceptions, so we'll do the same.
+// Assumes that exceptions are enabled by default.
+#ifndef _HAS_EXCEPTIONS
+#define _HAS_EXCEPTIONS 1
+#endif  // _HAS_EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+#elif defined(__clang__)
+// clang defines __EXCEPTIONS if and only if exceptions are enabled before
+// clang r220714, but if and only if cleanups are enabled after that. In
+// Obj-C++ files, there can be cleanups for ObjC exceptions which also need
+// cleanups, even if C++ exceptions are disabled. clang has
+// __has_feature(cxx_exceptions) which checks for C++ exceptions starting at
+// clang r206352, but which checked for cleanups prior to that. To reliably
+// check for C++ exception availability with clang, check for
+// __EXCEPTIONS && __has_feature(cxx_exceptions).
+#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+#elif defined(__GNUC__) && __EXCEPTIONS
+// gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__SUNPRO_CC)
+// Sun Pro CC supports exceptions. However, there is no compile-time way of
+// detecting whether they are enabled or not. Therefore, we assume that
+// they are enabled unless the user tells us otherwise.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__IBMCPP__) && __EXCEPTIONS
+// xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__HP_aCC)
+// Exception handling is in effect by default in the HP aCC compiler. It has
+// to be turned off by the +noeh compiler option if desired.
+#define GTEST_HAS_EXCEPTIONS 1
+#else
+// For other compilers, we assume exceptions are disabled to be
+// conservative.
+#define GTEST_HAS_EXCEPTIONS 0
+#endif  // defined(_MSC_VER) || defined(__BORLANDC__)
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#ifndef GTEST_HAS_STD_WSTRING
+// The user didn't tell us whether ::std::wstring is available, so we need
+// to figure it out.
+// Cygwin 1.7 and below doesn't support ::std::wstring.
+// Solaris' libc++ doesn't support it either. Android has
+// no support for it at least as recent as Froyo (2.2).
+#define GTEST_HAS_STD_WSTRING                                         \
+  (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
+     GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266))
+
+#endif  // GTEST_HAS_STD_WSTRING
+
+// Determines whether RTTI is available.
+#ifndef GTEST_HAS_RTTI
+// The user didn't tell us whether RTTI is enabled, so we need to
+// figure it out.
+
+#ifdef _MSC_VER
+
+#ifdef _CPPRTTI  // MSVC defines this macro if and only if RTTI is enabled.
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
+
+// Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI
+// is enabled.
+#elif defined(__GNUC__)
+
+#ifdef __GXX_RTTI
+// When building against STLport with the Android NDK and with
+// -frtti -fno-exceptions, the build fails at link time with undefined
+// references to __cxa_bad_typeid. Not sure if this is an STL or toolchain
+// bug, so disable RTTI when detected.
+#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS) +#define GTEST_HAS_RTTI 0 +#else +#define GTEST_HAS_RTTI 1 +#endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS +#else +#define GTEST_HAS_RTTI 0 +#endif // __GXX_RTTI + +// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends +// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the +// first version with C++ support. +#elif defined(__clang__) + +#define GTEST_HAS_RTTI __has_feature(cxx_rtti) + +// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if +// both the typeid and dynamic_cast features are present. +#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) + +#ifdef __RTTI_ALL__ +#define GTEST_HAS_RTTI 1 +#else +#define GTEST_HAS_RTTI 0 +#endif + +#else + +// For all other compilers, we assume RTTI is enabled. +#define GTEST_HAS_RTTI 1 + +#endif // _MSC_VER + +#endif // GTEST_HAS_RTTI + +// It's this header's responsibility to #include when RTTI +// is enabled. +#if GTEST_HAS_RTTI +#include +#endif + +// Determines whether Google Test can use the pthreads library. +#ifndef GTEST_HAS_PTHREAD +// The user didn't tell us explicitly, so we make reasonable assumptions about +// which platforms have pthreads support. +// +// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 +// to your compiler flags. +#define GTEST_HAS_PTHREAD \ + (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD || \ + GTEST_OS_HAIKU) +#endif // GTEST_HAS_PTHREAD + +#if GTEST_HAS_PTHREAD +// gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is +// true. +#include // NOLINT + +// For timespec and nanosleep, used below. +#include // NOLINT +#endif + +// Determines whether clone(2) is supported. +// Usually it will only be available on Linux, excluding +// Linux on the Itanium architecture. +// Also see http://linux.die.net/man/2/clone. +#ifndef GTEST_HAS_CLONE +// The user didn't tell us, so we need to figure it out. + +#if GTEST_OS_LINUX && !defined(__ia64__) +#if GTEST_OS_LINUX_ANDROID +// On Android, clone() became available at different API levels for each 32-bit +// architecture. +#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \ + (defined(__mips__) && __ANDROID_API__ >= 12) || \ + (defined(__i386__) && __ANDROID_API__ >= 17) +#define GTEST_HAS_CLONE 1 +#else +#define GTEST_HAS_CLONE 0 +#endif +#else +#define GTEST_HAS_CLONE 1 +#endif +#else +#define GTEST_HAS_CLONE 0 +#endif // GTEST_OS_LINUX && !defined(__ia64__) + +#endif // GTEST_HAS_CLONE + +// Determines whether to support stream redirection. This is used to test +// output correctness and to implement death tests. +#ifndef GTEST_HAS_STREAM_REDIRECTION +// By default, we assume that stream redirection is supported on all +// platforms except known mobile ones. +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 +#define GTEST_HAS_STREAM_REDIRECTION 0 +#else +#define GTEST_HAS_STREAM_REDIRECTION 1 +#endif // !GTEST_OS_WINDOWS_MOBILE +#endif // GTEST_HAS_STREAM_REDIRECTION + +// Determines whether to support death tests. +// pops up a dialog window that cannot be suppressed programmatically. 
+#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + (GTEST_OS_MAC && !GTEST_OS_IOS) || \ + (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW || \ + GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \ + GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU) +#define GTEST_HAS_DEATH_TEST 1 +#endif + +// Determines whether to support type-driven tests. + +// Typed tests need and variadic macros, which GCC, VC++ 8.0, +// Sun Pro CC, IBM Visual Age, and HP aCC support. +#if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \ + defined(__IBMCPP__) || defined(__HP_aCC) +#define GTEST_HAS_TYPED_TEST 1 +#define GTEST_HAS_TYPED_TEST_P 1 +#endif + +// Determines whether the system compiler uses UTF-16 for encoding wide strings. +#define GTEST_WIDE_STRING_USES_UTF16_ \ + (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_AIX || GTEST_OS_OS2) + +// Determines whether test results can be streamed to a socket. +#if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \ + GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD +#define GTEST_CAN_STREAM_RESULTS_ 1 +#endif + +// Defines some utility macros. + +// The GNU compiler emits a warning if nested "if" statements are followed by +// an "else" statement and braces are not used to explicitly disambiguate the +// "else" binding. This leads to problems with code like: +// +// if (gate) +// ASSERT_*(condition) << "Some message"; +// +// The "switch (0) case 0:" idiom is used to suppress this. +#ifdef __INTEL_COMPILER +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ +#else +#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + switch (0) \ + case 0: \ + default: // NOLINT +#endif + +// Use this annotation at the end of a struct/class definition to +// prevent the compiler from optimizing away instances that are never +// used. This is useful when all interesting logic happens inside the +// c'tor and / or d'tor. Example: +// +// struct Foo { +// Foo() { ... } +// } GTEST_ATTRIBUTE_UNUSED_; +// +// Also use it after a variable or parameter declaration to tell the +// compiler the variable/parameter does not have to be used. +#if defined(__GNUC__) && !defined(COMPILER_ICC) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) +#elif defined(__clang__) +#if __has_attribute(unused) +#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused)) +#endif +#endif +#ifndef GTEST_ATTRIBUTE_UNUSED_ +#define GTEST_ATTRIBUTE_UNUSED_ +#endif + +// Use this annotation before a function that takes a printf format string. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC) +#if defined(__MINGW_PRINTF_FORMAT) +// MinGW has two different printf implementations. Ensure the format macro +// matches the selected implementation. See +// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/. +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__( \ + (__format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check))) +#else +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +#endif +#else +#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) +#endif + +// A macro to disallow copy operator= +// This should be used in the private: declarations for a class. 
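+// For example (a usage sketch; Widget is a hypothetical class):
+//
+//   class Widget {
+//    public:
+//     Widget();
+//
+//    private:
+//     GTEST_DISALLOW_ASSIGN_(Widget);  // deletes Widget's copy operator=
+//   };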
+#define GTEST_DISALLOW_ASSIGN_(type) type &operator=(type const &) = delete + +// A macro to disallow copy constructor and operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \ + type(type const &) = delete; \ + GTEST_DISALLOW_ASSIGN_(type) + +// A macro to disallow move operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_MOVE_ASSIGN_(type) \ + type &operator=(type &&) noexcept = delete + +// A macro to disallow move constructor and operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \ + type(type &&) noexcept = delete; \ + GTEST_DISALLOW_MOVE_ASSIGN_(type) + +// Tell the compiler to warn about unused return values for functions declared +// with this macro. The macro should be used on function declarations +// following the argument list: +// +// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; +#if defined(__GNUC__) && !defined(COMPILER_ICC) +#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result)) +#else +#define GTEST_MUST_USE_RESULT_ +#endif // __GNUC__ && !COMPILER_ICC + +// MS C++ compiler emits warning when a conditional expression is compile time +// constant. In some contexts this warning is false positive and needs to be +// suppressed. Use the following two macros in such cases: +// +// GTEST_INTENTIONAL_CONST_COND_PUSH_() +// while (true) { +// GTEST_INTENTIONAL_CONST_COND_POP_() +// } +#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127) +#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_() + +// Determine whether the compiler supports Microsoft's Structured Exception +// Handling. This is supported by several Windows compilers but generally +// does not exist on any other system. +#ifndef GTEST_HAS_SEH +// The user didn't tell us, so we need to figure it out. + +#if defined(_MSC_VER) || defined(__BORLANDC__) +// These two compilers are known to support SEH. +#define GTEST_HAS_SEH 1 +#else +// Assume no SEH. +#define GTEST_HAS_SEH 0 +#endif + +#endif // GTEST_HAS_SEH + +#ifndef GTEST_IS_THREADSAFE + +#define GTEST_IS_THREADSAFE \ + (GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ || \ + (GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT) || \ + GTEST_HAS_PTHREAD) + +#endif // GTEST_IS_THREADSAFE + +// GTEST_API_ qualifies all symbols that must be exported. The definitions below +// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in +// gtest/internal/custom/gtest-port.h +#ifndef GTEST_API_ + +#ifdef _MSC_VER +#if GTEST_LINKED_AS_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllimport) +#elif GTEST_CREATE_SHARED_LIBRARY +#define GTEST_API_ __declspec(dllexport) +#endif +#elif __GNUC__ >= 4 || defined(__clang__) +#define GTEST_API_ __attribute__((visibility("default"))) +#endif // _MSC_VER + +#endif // GTEST_API_ + +#ifndef GTEST_API_ +#define GTEST_API_ +#endif // GTEST_API_ + +#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE +#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" +#endif // GTEST_DEFAULT_DEATH_TEST_STYLE + +#ifdef __GNUC__ +// Ask the compiler to never inline a given function. +#define GTEST_NO_INLINE_ __attribute__((noinline)) +#else +#define GTEST_NO_INLINE_ +#endif + +// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. 
+#if !defined(GTEST_HAS_CXXABI_H_) +#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +#define GTEST_HAS_CXXABI_H_ 1 +#else +#define GTEST_HAS_CXXABI_H_ 0 +#endif +#endif + +// A function level attribute to disable checking for use of uninitialized +// memory when built with MemorySanitizer. +#if defined(__clang__) +#if __has_feature(memory_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __has_feature(memory_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ +#endif // __clang__ + +// A function level attribute to disable AddressSanitizer instrumentation. +#if defined(__clang__) +#if __has_feature(address_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \ + __attribute__((no_sanitize_address)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __has_feature(address_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ +#endif // __clang__ + +// A function level attribute to disable HWAddressSanitizer instrumentation. +#if defined(__clang__) +#if __has_feature(hwaddress_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \ + __attribute__((no_sanitize("hwaddress"))) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#endif // __has_feature(hwaddress_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ +#endif // __clang__ + +// A function level attribute to disable ThreadSanitizer instrumentation. +#if defined(__clang__) +#if __has_feature(thread_sanitizer) +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread)) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __has_feature(thread_sanitizer) +#else +#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ +#endif // __clang__ + +namespace testing { + +class Message; + +// Legacy imports for backwards compatibility. +// New code should use std:: names directly. +using std::get; +using std::make_tuple; +using std::tuple; +using std::tuple_element; +using std::tuple_size; + +namespace internal { + +// A secret type that Google Test users don't know about. It has no +// definition on purpose. Therefore it's impossible to create a +// Secret object, which is what we want. +class Secret; + +// The GTEST_COMPILE_ASSERT_ is a legacy macro used to verify that a compile +// time expression is true (in new code, use static_assert instead). For +// example, you could use it to verify the size of a static array: +// +// GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES, +// names_incorrect_size); +// +// The second argument to the macro must be a valid C++ identifier. If the +// expression is false, compiler will issue an error containing this identifier. +#define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg) + +// A helper for suppressing warnings on constant condition. It just +// returns 'condition'. +GTEST_API_ bool IsTrue(bool condition); + +// Defines RE. + +#if GTEST_USES_PCRE +// if used, PCRE is injected by custom/gtest-port.h +#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE + +// A simple C++ wrapper for . It uses the POSIX Extended +// Regular Expression syntax. +class GTEST_API_ RE { + public: + // A copy constructor is required by the Standard to initialize object + // references from r-values. + RE(const RE &other) { Init(other.pattern()); } + + // Constructs an RE from a string. 
+  RE(const ::std::string &regex) { Init(regex.c_str()); }  // NOLINT
+
+  RE(const char *regex) { Init(regex); }  // NOLINT
+  ~RE();
+
+  // Returns the string representation of the regex.
+  const char *pattern() const { return pattern_; }
+
+  // FullMatch(str, re) returns true if and only if regular expression re
+  // matches the entire str.
+  // PartialMatch(str, re) returns true if and only if regular expression re
+  // matches a substring of str (including str itself).
+  static bool FullMatch(const ::std::string &str, const RE &re) {
+    return FullMatch(str.c_str(), re);
+  }
+  static bool PartialMatch(const ::std::string &str, const RE &re) {
+    return PartialMatch(str.c_str(), re);
+  }
+
+  static bool FullMatch(const char *str, const RE &re);
+  static bool PartialMatch(const char *str, const RE &re);
+
+ private:
+  void Init(const char *regex);
+  const char *pattern_;
+  bool is_valid_;
+
+#if GTEST_USES_POSIX_RE
+
+  regex_t full_regex_;     // For FullMatch().
+  regex_t partial_regex_;  // For PartialMatch().
+
+#else  // GTEST_USES_SIMPLE_RE
+
+  const char *full_pattern_;  // For FullMatch().
+
+#endif
+
+  GTEST_DISALLOW_ASSIGN_(RE);
+};
+
+#endif  // GTEST_USES_PCRE
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char *file, int line);
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char *file,
+                                                               int line);
+
+// Defines logging utilities:
+//   GTEST_LOG_(severity) - logs messages at the specified severity level. The
+//                          message itself is streamed into the macro.
+//   LogToStderr()  - directs all log messages to stderr.
+//   FlushInfoLog() - flushes informational log messages.
+
+enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL };
+
+// Formats log entry severity, provides a stream object for streaming the
+// log message, and terminates the message with a newline when going out of
+// scope.
+class GTEST_API_ GTestLog {
+ public:
+  GTestLog(GTestLogSeverity severity, const char *file, int line);
+
+  // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+  ~GTestLog();
+
+  ::std::ostream &GetStream() { return ::std::cerr; }
+
+ private:
+  const GTestLogSeverity severity_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+};
+
+#if !defined(GTEST_LOG_)
+
+#define GTEST_LOG_(severity)                                           \
+  ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+                                __FILE__, __LINE__)                    \
+      .GetStream()
+
+inline void LogToStderr() {}
+inline void FlushInfoLog() { fflush(nullptr); }
+
+#endif  // !defined(GTEST_LOG_)
+
+#if !defined(GTEST_CHECK_)
+// INTERNAL IMPLEMENTATION - DO NOT USE.
+//
+// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
+// is not satisfied.
+//   Synopsis:
+//     GTEST_CHECK_(boolean_condition);
+//     or
+//     GTEST_CHECK_(boolean_condition) << "Additional message";
+//
+//   This checks the condition, and if the condition is not satisfied
+//   it prints a message about the condition violation, including the
+//   condition itself plus any additional message streamed into it,
+//   and then aborts the program. It aborts the program whether or not
+//   it is built in debug mode.
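+//
+//   A usage sketch (illustrative only; the file name is hypothetical):
+//     FILE *f = posix::FOpen("gtest_output.txt", "r");
+//     GTEST_CHECK_(f != nullptr) << "cannot open gtest_output.txt";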
+#define GTEST_CHECK_(condition)               \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_               \
+  if (::testing::internal::IsTrue(condition)) \
+    ;                                         \
+  else                                        \
+    GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+#endif  // !defined(GTEST_CHECK_)
+
+// An all-mode assert to verify that the given POSIX-style function
+// call returns 0 (indicating success). Known limitation: this
+// doesn't expand to a balanced 'if' statement, so enclose the macro
+// in {} if you need to use it as the only statement in an 'if'
+// branch.
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
+  if (const int gtest_error = (posix_call))    \
+  GTEST_LOG_(FATAL) << #posix_call << " failed with error " << gtest_error
+
+// Transforms "T" into "const T&" according to standard reference collapsing
+// rules (this is only needed as a backport for C++98 compilers that do not
+// support reference collapsing). Specifically, it transforms:
+//
+//   char         ==> const char&
+//   const char   ==> const char&
+//   char&        ==> char&
+//   const char&  ==> const char&
+//
+// Note that the non-const reference will not have "const" added. This is
+// standard, and necessary so that "T" can always bind to "const T&".
+template <typename T>
+struct ConstRef {
+  typedef const T &type;
+};
+template <typename T>
+struct ConstRef<T &> {
+  typedef T &type;
+};
+
+// The argument T must depend on some template parameters.
+#define GTEST_REFERENCE_TO_CONST_(T) \
+  typename ::testing::internal::ConstRef<T>::type
+
+// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
+//
+// Use ImplicitCast_ as a safe version of static_cast for upcasting in
+// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a
+// const Foo*). When you use ImplicitCast_, the compiler checks that
+// the cast is safe. Such explicit ImplicitCast_s are necessary in
+// surprisingly many situations where C++ demands an exact type match
+// instead of an argument type convertible to a target type.
+//
+// The syntax for using ImplicitCast_ is the same as for static_cast:
+//
+//   ImplicitCast_<ToType>(expr)
+//
+// ImplicitCast_ would have been part of the C++ standard library,
+// but the proposal was submitted too late. It will probably make
+// its way into the language in the future.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., implicit_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template <typename To>
+inline To ImplicitCast_(To x) {
+  return x;
+}
+
+// When you upcast (that is, cast a pointer from type Foo to type
+// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
+// always succeed. When you downcast (that is, cast a pointer from
+// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because
+// how do you know the pointer is really of type SubclassOfFoo? It
+// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus,
+// when you downcast, you should use this macro. In debug mode, we
+// use dynamic_cast<> to double-check the downcast is legal (we die
+// if it's not). In normal mode, we do the efficient static_cast<>
+// instead. Thus, it's important to test in debug mode to make sure
+// the cast is legal!
+//    This is the only place in the code we should use dynamic_cast<>.
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to
+// do RTTI (eg code like this:
+//   if (dynamic_cast<Subclass1 *>(foo)) HandleASubclass1Object(foo);
+//   if (dynamic_cast<Subclass2 *>(foo)) HandleASubclass2Object(foo);
+// You should design the code some other way not to need this.
+//
+// This relatively ugly name is intentional. It prevents clashes with
+// similar functions users may have (e.g., down_cast). The internal
+// namespace alone is not enough because the function can be found by ADL.
+template <class To, class From>  // use like this: DownCast_<T *>(foo);
+inline To DownCast_(From *f) {  // so we only accept pointers
+  // Ensures that To is a sub-type of From *. This test is here only
+  // for compile-time type checking, and has no overhead in an
+  // optimized build at run-time, as it will be optimized away
+  // completely.
+  GTEST_INTENTIONAL_CONST_COND_PUSH_()
+  if (false) {
+    GTEST_INTENTIONAL_CONST_COND_POP_()
+    const To to = nullptr;
+    ::testing::internal::ImplicitCast_<From *>(to);
+  }
+
+#if GTEST_HAS_RTTI
+  // RTTI: debug mode only!
+  GTEST_CHECK_(f == nullptr || dynamic_cast<To>(f) != nullptr);
+#endif
+  return static_cast<To>(f);
+}
+
+// Downcasts the pointer of type Base to Derived.
+// Derived must be a subclass of Base. The parameter MUST
+// point to a class of type Derived, not any subclass of it.
+// When RTTI is available, the function performs a runtime
+// check to enforce this.
+template <class Derived, class Base>
+Derived *CheckedDowncastToActualType(Base *base) {
+#if GTEST_HAS_RTTI
+  GTEST_CHECK_(typeid(*base) == typeid(Derived));
+#endif
+
+#if GTEST_HAS_DOWNCAST_
+  return ::down_cast<Derived *>(base);
+#elif GTEST_HAS_RTTI
+  return dynamic_cast<Derived *>(base);  // NOLINT
+#else
+  return static_cast<Derived *>(base);  // Poor man's downcast.
+#endif
+}
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Defines the stdout/stderr capturer:
+//   CaptureStdout     - starts capturing stdout.
+//   GetCapturedStdout - stops capturing stdout and returns the captured string.
+//   CaptureStderr     - starts capturing stderr.
+//   GetCapturedStderr - stops capturing stderr and returns the captured string.
+//
+GTEST_API_ void CaptureStdout();
+GTEST_API_ std::string GetCapturedStdout();
+GTEST_API_ void CaptureStderr();
+GTEST_API_ std::string GetCapturedStderr();
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+// Returns the size (in bytes) of a file.
+GTEST_API_ size_t GetFileSize(FILE *file);
+
+// Reads the entire content of a file as a string.
+GTEST_API_ std::string ReadEntireFile(FILE *file);
+
+// All command line arguments.
+GTEST_API_ std::vector<std::string> GetArgvs();
+
+#if GTEST_HAS_DEATH_TEST
+
+std::vector<std::string> GetInjectableArgvs();
+// Deprecated: pass the args vector by value instead.
+void SetInjectableArgvs(const std::vector<std::string> *new_argvs);
+void SetInjectableArgvs(const std::vector<std::string> &new_argvs);
+void ClearInjectableArgvs();
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Defines synchronization primitives.
+#if GTEST_IS_THREADSAFE
+#if GTEST_HAS_PTHREAD
+// Sleeps for (roughly) n milliseconds. This function is only for testing
+// Google Test's own constructs. Don't use it in user tests, either
+// directly or indirectly.
+inline void SleepMilliseconds(int n) {
+  const timespec time = {
+    0,                  // 0 seconds.
+    n * 1000L * 1000L,  // And n ms.
+  };
+  nanosleep(&time, nullptr);
+}
+#endif  // GTEST_HAS_PTHREAD
+
+#if GTEST_HAS_NOTIFICATION_
+// Notification has already been imported into the namespace.
+// Nothing to do here.
+
+#elif GTEST_HAS_PTHREAD
+// Allows a controller thread to pause execution of newly created
+// threads until notified. Instances of this class must be created
+// and destroyed in the controller thread.
+//
+// This class is only for testing Google Test's own constructs. Do not
+// use it in user tests, either directly or indirectly.
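+//
+// A usage sketch (illustrative only):
+//   Notification thread_can_start;
+//   // ... create test threads that call WaitForNotification() ...
+//   thread_can_start.Notify();  // releases every waiting thread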
+class Notification { + public: + Notification() : notified_(false) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); + } + ~Notification() { pthread_mutex_destroy(&mutex_); } + + // Notifies all threads created with this notification to start. Must + // be called from the controller thread. + void Notify() { + pthread_mutex_lock(&mutex_); + notified_ = true; + pthread_mutex_unlock(&mutex_); + } + + // Blocks until the controller thread notifies. Must be called from a test + // thread. + void WaitForNotification() { + for (;;) { + pthread_mutex_lock(&mutex_); + const bool notified = notified_; + pthread_mutex_unlock(&mutex_); + if (notified) break; + SleepMilliseconds(10); + } + } + + private: + pthread_mutex_t mutex_; + bool notified_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); +}; + +#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + +GTEST_API_ void SleepMilliseconds(int n); + +// Provides leak-safe Windows kernel handle ownership. +// Used in death tests and in threading support. +class GTEST_API_ AutoHandle { + public: + // Assume that Win32 HANDLE type is equivalent to void*. Doing so allows us to + // avoid including in this header file. Including is + // undesirable because it defines a lot of symbols and macros that tend to + // conflict with client code. This assumption is verified by + // WindowsTypesTest.HANDLEIsVoidStar. + typedef void *Handle; + AutoHandle(); + explicit AutoHandle(Handle handle); + + ~AutoHandle(); + + Handle Get() const; + void Reset(); + void Reset(Handle handle); + + private: + // Returns true if and only if the handle is a valid handle object that can be + // closed. + bool IsCloseable() const; + + Handle handle_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle); +}; + +// Allows a controller thread to pause execution of newly created +// threads until notified. Instances of this class must be created +// and destroyed in the controller thread. +// +// This class is only for testing Google Test's own constructs. Do not +// use it in user tests, either directly or indirectly. +class GTEST_API_ Notification { + public: + Notification(); + void Notify(); + void WaitForNotification(); + + private: + AutoHandle event_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); +}; +#endif // GTEST_HAS_NOTIFICATION_ + +// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD +// defined, but we don't want to use MinGW's pthreads implementation, which +// has conformance problems with some versions of the POSIX standard. +#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW + +// As a C-function, ThreadFuncWithCLinkage cannot be templated itself. +// Consequently, it cannot select a correct instantiation of ThreadWithParam +// in order to call its Run(). Introducing ThreadWithParamBase as a +// non-templated base class for ThreadWithParam allows us to bypass this +// problem. +class ThreadWithParamBase { + public: + virtual ~ThreadWithParamBase() {} + virtual void Run() = 0; +}; + +// pthread_create() accepts a pointer to a function type with the C linkage. +// According to the Standard (7.5/1), function types with different linkages +// are different even if they are otherwise identical. Some compilers (for +// example, SunStudio) treat them as different types. Since class methods +// cannot be defined with C-linkage we need to define a free C-function to +// pass into pthread_create(). 
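+// For example (illustrative; CThreadFunc and SomeClass are hypothetical):
+//   extern "C" typedef void *(*CThreadFunc)(void *);
+//   CThreadFunc f = &SomeClass::Run;  // ill-formed: a member function cannot
+//                                     // have C linkage.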
+extern "C" inline void *ThreadFuncWithCLinkage(void *thread) { + static_cast(thread)->Run(); + return nullptr; +} + +// Helper class for testing Google Test's multi-threading constructs. +// To use it, write: +// +// void ThreadFunc(int param) { /* Do things with param */ } +// Notification thread_can_start; +// ... +// // The thread_can_start parameter is optional; you can supply NULL. +// ThreadWithParam thread(&ThreadFunc, 5, &thread_can_start); +// thread_can_start.Notify(); +// +// These classes are only for testing Google Test's own constructs. Do +// not use them in user tests, either directly or indirectly. +template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc *func, T param, Notification *thread_can_start) + : func_(func), param_(param), thread_can_start_(thread_can_start), + finished_(false) { + ThreadWithParamBase *const base = this; + // The thread can be created only after all fields except thread_ + // have been initialized. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_create(&thread_, nullptr, &ThreadFuncWithCLinkage, base)); + } + ~ThreadWithParam() override { Join(); } + + void Join() { + if (!finished_) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, nullptr)); + finished_ = true; + } + } + + void Run() override { + if (thread_can_start_ != nullptr) thread_can_start_->WaitForNotification(); + func_(param_); + } + + private: + UserThreadFunc *const func_; // User-supplied thread function. + const T param_; // User-supplied parameter to the thread function. + // When non-NULL, used to block execution until the controller thread + // notifies. + Notification *const thread_can_start_; + bool finished_; // true if and only if we know that the thread function has + // finished. + pthread_t thread_; // The native thread object. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); +}; +#endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD || + // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ +// Mutex and ThreadLocal have already been imported into the namespace. +// Nothing to do here. + +#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + +// Mutex implements mutex on Windows platforms. It is used in conjunction +// with class MutexLock: +// +// Mutex mutex; +// ... +// MutexLock lock(&mutex); // Acquires the mutex and releases it at the +// // end of the current scope. +// +// A static Mutex *must* be defined or declared using one of the following +// macros: +// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex); +// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex); +// +// (A non-static Mutex is defined/declared in the usual way). +class GTEST_API_ Mutex { + public: + enum MutexType { kStatic = 0, kDynamic = 1 }; + // We rely on kStaticMutex being 0 as it is to what the linker initializes + // type_ in static mutexes. critical_section_ will be initialized lazily + // in ThreadSafeLazyInit(). + enum StaticConstructorSelector { kStaticMutex = 0 }; + + // This constructor intentionally does nothing. It relies on type_ being + // statically initialized to 0 (effectively setting it to kStatic) and on + // ThreadSafeLazyInit() to lazily initialize the rest of the members. + explicit Mutex(StaticConstructorSelector /*dummy*/) {} + + Mutex(); + ~Mutex(); + + void Lock(); + + void Unlock(); + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. 
+ void AssertHeld(); + + private: + // Initializes owner_thread_id_ and critical_section_ in static mutexes. + void ThreadSafeLazyInit(); + + // Per https://blogs.msdn.microsoft.com/oldnewthing/20040223-00/?p=40503, + // we assume that 0 is an invalid value for thread IDs. + unsigned int owner_thread_id_; + + // For static mutexes, we rely on these members being initialized to zeros + // by the linker. + MutexType type_; + long critical_section_init_phase_; // NOLINT + GTEST_CRITICAL_SECTION *critical_section_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); +}; + +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex) + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex *mutex) : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + Mutex *const mutex_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); +}; + +typedef GTestMutexLock MutexLock; + +// Base class for ValueHolder. Allows a caller to hold and delete a value +// without knowing its type. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Provides a way for a thread to send notifications to a ThreadLocal +// regardless of its parameter type. +class ThreadLocalBase { + public: + // Creates a new ValueHolder object holding a default value passed to + // this ThreadLocal's constructor and returns it. It is the caller's + // responsibility not to call this when the ThreadLocal instance already + // has a value on the current thread. + virtual ThreadLocalValueHolderBase *NewValueForCurrentThread() const = 0; + + protected: + ThreadLocalBase() {} + virtual ~ThreadLocalBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase); +}; + +// Maps a thread to a set of ThreadLocals that have values instantiated on that +// thread and notifies them when the thread exits. A ThreadLocal instance is +// expected to persist until all threads it has values on have terminated. +class GTEST_API_ ThreadLocalRegistry { + public: + // Registers thread_local_instance as having value on the current thread. + // Returns a value that can be used to identify the thread from other threads. + static ThreadLocalValueHolderBase *GetValueOnCurrentThread( + const ThreadLocalBase *thread_local_instance); + + // Invoked when a ThreadLocal instance is destroyed. + static void OnThreadLocalDestroyed( + const ThreadLocalBase *thread_local_instance); +}; + +class GTEST_API_ ThreadWithParamBase { + public: + void Join(); + + protected: + class Runnable { + public: + virtual ~Runnable() {} + virtual void Run() = 0; + }; + + ThreadWithParamBase(Runnable *runnable, Notification *thread_can_start); + virtual ~ThreadWithParamBase(); + + private: + AutoHandle thread_; +}; + +// Helper class for testing Google Test's multi-threading constructs. 
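+// Usage mirrors the pthreads version above (illustrative):
+//   ThreadWithParam<int> thread(&ThreadFunc, 5, &thread_can_start);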
+template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void UserThreadFunc(T); + + ThreadWithParam(UserThreadFunc *func, T param, Notification *thread_can_start) + : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {} + virtual ~ThreadWithParam() {} + + private: + class RunnableImpl : public Runnable { + public: + RunnableImpl(UserThreadFunc *func, T param) : func_(func), param_(param) {} + virtual ~RunnableImpl() {} + virtual void Run() { func_(param_); } + + private: + UserThreadFunc *const func_; + const T param_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl); + }; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); +}; + +// Implements thread-local storage on Windows systems. +// +// // Thread 1 +// ThreadLocal tl(100); // 100 is the default value for each thread. +// +// // Thread 2 +// tl.set(150); // Changes the value for thread 2 only. +// EXPECT_EQ(150, tl.get()); +// +// // Thread 1 +// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. +// tl.set(200); +// EXPECT_EQ(200, tl.get()); +// +// The template type argument T must have a public copy constructor. +// In addition, the default ThreadLocal constructor requires T to have +// a public default constructor. +// +// The users of a TheadLocal instance have to make sure that all but one +// threads (including the main one) using that instance have exited before +// destroying it. Otherwise, the per-thread objects managed for them by the +// ThreadLocal instance are not guaranteed to be destroyed on all platforms. +// +// Google Test only uses global ThreadLocal objects. That means they +// will die after main() has returned. Therefore, no per-thread +// object managed by Google Test will be leaked as long as all threads +// using Google Test have exited when main() returns. +template +class ThreadLocal : public ThreadLocalBase { + public: + ThreadLocal() : default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T &value) + : default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); } + + T *pointer() { return GetOrCreateValue(); } + const T *pointer() const { return GetOrCreateValue(); } + const T &get() const { return *pointer(); } + void set(const T &value) { *pointer() = value; } + + private: + // Holds a value of T. Can be deleted via its base class without the caller + // knowing the type of T. 
+ class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T &value) : value_(value) {} + + T *pointer() { return &value_; } + + private: + T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + }; + + T *GetOrCreateValue() const { + return static_cast( + ThreadLocalRegistry::GetValueOnCurrentThread(this)) + ->pointer(); + } + + virtual ThreadLocalValueHolderBase *NewValueForCurrentThread() const { + return default_factory_->MakeNewHolder(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder *MakeNewHolder() const = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + ValueHolder *MakeNewHolder() const override { return new ValueHolder(); } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T &value) : value_(value) {} + ValueHolder *MakeNewHolder() const override { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + }; + + std::unique_ptr default_factory_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); +}; + +#elif GTEST_HAS_PTHREAD + +// MutexBase and Mutex implement mutex on pthreads-based platforms. +class MutexBase { + public: + // Acquires this mutex. + void Lock() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); + owner_ = pthread_self(); + has_owner_ = true; + } + + // Releases this mutex. + void Unlock() { + // Since the lock is being released the owner_ field should no longer be + // considered valid. We don't protect writing to has_owner_ here, as it's + // the caller's responsibility to ensure that the current thread holds the + // mutex when this is called. + has_owner_ = false; + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); + } + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld() const { + GTEST_CHECK_(has_owner_ && pthread_equal(owner_, pthread_self())) + << "The current thread is not holding the mutex @" << this; + } + + // A static mutex may be used before main() is entered. It may even + // be used before the dynamic initialization stage. Therefore we + // must be able to initialize a static mutex object at link time. + // This means MutexBase has to be a POD and its member variables + // have to be public. + public: + pthread_mutex_t mutex_; // The underlying pthread mutex. + // has_owner_ indicates whether the owner_ field below contains a valid thread + // ID and is therefore safe to inspect (e.g., to use in pthread_equal()). All + // accesses to the owner_ field should be protected by a check of this field. + // An alternative might be to memset() owner_ to all zeros, but there's no + // guarantee that a zero'd pthread_t is necessarily invalid or even different + // from pthread_self(). + bool has_owner_; + pthread_t owner_; // The thread holding the mutex. +}; + +// Forward-declares a static mutex. +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::MutexBase mutex + +// Defines and statically (i.e. at link time) initializes a static mutex. 
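+// A usage sketch (illustrative; g_some_mutex is a hypothetical name):
+//   GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex);
+//   ...
+//   { MutexLock l(&g_some_mutex); /* critical section */ }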
+// The initialization list here does not explicitly initialize each field, +// instead relying on default initialization for the unspecified fields. In +// particular, the owner_ field (a pthread_t) is not explicitly initialized. +// This allows initialization to work whether pthread_t is a scalar or struct. +// The flag -Wmissing-field-initializers must not be specified for this to work. +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false, 0 } + +// The Mutex class can only be used for mutexes created at runtime. It +// shares its API with MutexBase otherwise. +class Mutex : public MutexBase { + public: + Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr)); + has_owner_ = false; + } + ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); +}; + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(MutexBase *mutex) : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + MutexBase *const mutex_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); +}; + +typedef GTestMutexLock MutexLock; + +// Helpers for ThreadLocal. + +// pthread_key_create() requires DeleteThreadLocalValue() to have +// C-linkage. Therefore it cannot be templatized to access +// ThreadLocal. Hence the need for class +// ThreadLocalValueHolderBase. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Called by pthread to delete thread-local data stored by +// pthread_setspecific(). +extern "C" inline void DeleteThreadLocalValue(void *value_holder) { + delete static_cast(value_holder); +} + +// Implements thread-local storage on pthreads-based systems. +template +class GTEST_API_ ThreadLocal { + public: + ThreadLocal() + : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {} + explicit ThreadLocal(const T &value) + : key_(CreateKey()), + default_factory_(new InstanceValueHolderFactory(value)) {} + + ~ThreadLocal() { + // Destroys the managed object for the current thread, if any. + DeleteThreadLocalValue(pthread_getspecific(key_)); + + // Releases resources associated with the key. This will *not* + // delete managed objects for other threads. + GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); + } + + T *pointer() { return GetOrCreateValue(); } + const T *pointer() const { return GetOrCreateValue(); } + const T &get() const { return *pointer(); } + void set(const T &value) { *pointer() = value; } + + private: + // Holds a value of type T. + class ValueHolder : public ThreadLocalValueHolderBase { + public: + ValueHolder() : value_() {} + explicit ValueHolder(const T &value) : value_(value) {} + + T *pointer() { return &value_; } + + private: + T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + }; + + static pthread_key_t CreateKey() { + pthread_key_t key; + // When a thread exits, DeleteThreadLocalValue() will be called on + // the object managed for that thread. 
+ GTEST_CHECK_POSIX_SUCCESS_( + pthread_key_create(&key, &DeleteThreadLocalValue)); + return key; + } + + T *GetOrCreateValue() const { + ThreadLocalValueHolderBase *const holder = + static_cast(pthread_getspecific(key_)); + if (holder != nullptr) { + return CheckedDowncastToActualType(holder)->pointer(); + } + + ValueHolder *const new_holder = default_factory_->MakeNewHolder(); + ThreadLocalValueHolderBase *const holder_base = new_holder; + GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); + return new_holder->pointer(); + } + + class ValueHolderFactory { + public: + ValueHolderFactory() {} + virtual ~ValueHolderFactory() {} + virtual ValueHolder *MakeNewHolder() const = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory); + }; + + class DefaultValueHolderFactory : public ValueHolderFactory { + public: + DefaultValueHolderFactory() {} + ValueHolder *MakeNewHolder() const override { return new ValueHolder(); } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory); + }; + + class InstanceValueHolderFactory : public ValueHolderFactory { + public: + explicit InstanceValueHolderFactory(const T &value) : value_(value) {} + ValueHolder *MakeNewHolder() const override { + return new ValueHolder(value_); + } + + private: + const T value_; // The value for each thread. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory); + }; + + // A key pthreads uses for looking up per-thread values. + const pthread_key_t key_; + std::unique_ptr default_factory_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); +}; + +#endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ + +#else // GTEST_IS_THREADSAFE + +// A dummy implementation of synchronization primitives (mutex, lock, +// and thread-local variable). Necessary for compiling Google Test where +// mutex is not supported - using Google Test in multiple threads is not +// supported on such platforms. + +class Mutex { + public: + Mutex() {} + void Lock() {} + void Unlock() {} + void AssertHeld() const {} +}; + +#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex + +// We cannot name this class MutexLock because the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. That macro is used as a defensive measure to prevent against +// inadvertent misuses of MutexLock like "MutexLock(&mu)" rather than +// "MutexLock l(&mu)". Hence the typedef trick below. +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex *) {} // NOLINT +}; + +typedef GTestMutexLock MutexLock; + +template +class GTEST_API_ ThreadLocal { + public: + ThreadLocal() : value_() {} + explicit ThreadLocal(const T &value) : value_(value) {} + T *pointer() { return &value_; } + const T *pointer() const { return &value_; } + const T &get() const { return value_; } + void set(const T &value) { value_ = value; } + + private: + T value_; +}; + +#endif // GTEST_IS_THREADSAFE + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +GTEST_API_ size_t GetThreadCount(); + +#if GTEST_OS_WINDOWS +#define GTEST_PATH_SEP_ "\\" +#define GTEST_HAS_ALT_PATH_SEP_ 1 +#else +#define GTEST_PATH_SEP_ "/" +#define GTEST_HAS_ALT_PATH_SEP_ 0 +#endif // GTEST_OS_WINDOWS + +// Utilities for char. + +// isspace(int ch) and friends accept an unsigned char or EOF. char +// may be signed, depending on the compiler (or compiler flags). 
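+// (For example, on a platform where char is signed, passing a character such
+// as '\xE9' straight to isspace() yields a negative argument other than EOF,
+// which is undefined behavior.)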
+// Therefore we need to cast a char to unsigned char before calling
+// isspace(), etc.
+
+inline bool IsAlpha(char ch) {
+  return isalpha(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsAlNum(char ch) {
+  return isalnum(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsDigit(char ch) {
+  return isdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsLower(char ch) {
+  return islower(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsSpace(char ch) {
+  return isspace(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsUpper(char ch) {
+  return isupper(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(char ch) {
+  return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+inline bool IsXDigit(wchar_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
+
+inline char ToLower(char ch) {
+  return static_cast<char>(tolower(static_cast<unsigned char>(ch)));
+}
+inline char ToUpper(char ch) {
+  return static_cast<char>(toupper(static_cast<unsigned char>(ch)));
+}
+
+inline std::string StripTrailingSpaces(std::string str) {
+  std::string::iterator it = str.end();
+  while (it != str.begin() && IsSpace(*--it)) it = str.erase(it);
+  return str;
+}
+
+// The testing::internal::posix namespace holds wrappers for common
+// POSIX functions. These wrappers hide the differences between
+// Windows/MSVC and POSIX systems. Since some compilers define these
+// standard functions as macros, the wrapper cannot have the same name
+// as the wrapped function.
+
+namespace posix {
+
+// Functions with a different name on Windows.
+
+#if GTEST_OS_WINDOWS
+
+typedef struct _stat StatStruct;
+
+#ifdef __BORLANDC__
+inline int IsATTY(int fd) { return isatty(fd); }
+inline int StrCaseCmp(const char *s1, const char *s2) {
+  return stricmp(s1, s2);
+}
+inline char *StrDup(const char *src) { return strdup(src); }
+#else  // !__BORLANDC__
+#if GTEST_OS_WINDOWS_MOBILE
+inline int IsATTY(int /* fd */) { return 0; }
+#else
+inline int IsATTY(int fd) { return _isatty(fd); }
+#endif  // GTEST_OS_WINDOWS_MOBILE
+inline int StrCaseCmp(const char *s1, const char *s2) {
+  return _stricmp(s1, s2);
+}
+inline char *StrDup(const char *src) { return _strdup(src); }
+#endif  // __BORLANDC__
+
+#if GTEST_OS_WINDOWS_MOBILE
+inline int FileNo(FILE *file) { return reinterpret_cast<int>(_fileno(file)); }
+// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
+// time and thus not defined there.
+#else +inline int FileNo(FILE *file) { return _fileno(file); } +inline int Stat(const char *path, StatStruct *buf) { return _stat(path, buf); } +inline int RmDir(const char *dir) { return _rmdir(dir); } +inline bool IsDir(const StatStruct &st) { return (_S_IFDIR & st.st_mode) != 0; } +#endif // GTEST_OS_WINDOWS_MOBILE + +#elif GTEST_OS_ESP8266 +typedef struct stat StatStruct; + +inline int FileNo(FILE *file) { return fileno(file); } +inline int IsATTY(int fd) { return isatty(fd); } +inline int Stat(const char *path, StatStruct *buf) { + // stat function not implemented on ESP8266 + return 0; +} +inline int StrCaseCmp(const char *s1, const char *s2) { + return strcasecmp(s1, s2); +} +inline char *StrDup(const char *src) { return strdup(src); } +inline int RmDir(const char *dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct &st) { return S_ISDIR(st.st_mode); } + +#else + +typedef struct stat StatStruct; + +inline int FileNo(FILE *file) { return fileno(file); } +inline int IsATTY(int fd) { return isatty(fd); } +inline int Stat(const char *path, StatStruct *buf) { return stat(path, buf); } +inline int StrCaseCmp(const char *s1, const char *s2) { + return strcasecmp(s1, s2); +} +inline char *StrDup(const char *src) { return strdup(src); } +inline int RmDir(const char *dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct &st) { return S_ISDIR(st.st_mode); } + +#endif // GTEST_OS_WINDOWS + +// Functions deprecated by MSVC 8.0. + +GTEST_DISABLE_MSC_DEPRECATED_PUSH_() + +// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and +// StrError() aren't needed on Windows CE at this time and thus not +// defined there. + +#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT +inline int ChDir(const char *dir) { return chdir(dir); } +#endif +inline FILE *FOpen(const char *path, const char *mode) { + return fopen(path, mode); +} +#if !GTEST_OS_WINDOWS_MOBILE +inline FILE *FReopen(const char *path, const char *mode, FILE *stream) { + return freopen(path, mode, stream); +} +inline FILE *FDOpen(int fd, const char *mode) { return fdopen(fd, mode); } +#endif +inline int FClose(FILE *fp) { return fclose(fp); } +#if !GTEST_OS_WINDOWS_MOBILE +inline int Read(int fd, void *buf, unsigned int count) { + return static_cast(read(fd, buf, count)); +} +inline int Write(int fd, const void *buf, unsigned int count) { + return static_cast(write(fd, buf, count)); +} +inline int Close(int fd) { return close(fd); } +inline const char *StrError(int errnum) { return strerror(errnum); } +#endif +inline const char *GetEnv(const char *name) { +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \ + GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 + // We are on an embedded platform, which has no environment variables. + static_cast(name); // To prevent 'unused argument' warning. + return nullptr; +#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) + // Environment variables which we programmatically clear will be set to the + // empty string rather than unset (NULL). Handle that case. + const char *const env = getenv(name); + return (env != nullptr && env[0] != '\0') ? env : nullptr; +#else + return getenv(name); +#endif +} + +GTEST_DISABLE_MSC_DEPRECATED_POP_() + +#if GTEST_OS_WINDOWS_MOBILE +// Windows CE has no C library. The abort() function is used in +// several places in Google Test. This implementation provides a reasonable +// imitation of standard behaviour. 
+[[noreturn]] void Abort(); +#else +[[noreturn]] inline void Abort() { + abort(); +} +#endif // GTEST_OS_WINDOWS_MOBILE + +} // namespace posix + +// MSVC "deprecates" snprintf and issues warnings wherever it is used. In +// order to avoid these warnings, we need to use _snprintf or _snprintf_s on +// MSVC-based platforms. We map the GTEST_SNPRINTF_ macro to the appropriate +// function in order to achieve that. We use macro definition here because +// snprintf is a variadic function. +#if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE +// MSVC 2005 and above support variadic macros. +#define GTEST_SNPRINTF_(buffer, size, format, ...) \ + _snprintf_s(buffer, size, size, format, __VA_ARGS__) +#elif defined(_MSC_VER) +// Windows CE does not define _snprintf_s +#define GTEST_SNPRINTF_ _snprintf +#else +#define GTEST_SNPRINTF_ snprintf +#endif + +// The biggest signed integer type the compiler supports. +// +// long long is guaranteed to be at least 64-bits in C++11. +using BiggestInt = long long; // NOLINT + +// The maximum number a BiggestInt can represent. +constexpr BiggestInt kMaxBiggestInt = (std::numeric_limits::max)(); + +// This template class serves as a compile-time function from size to +// type. It maps a size in bytes to a primitive type with that +// size. e.g. +// +// TypeWithSize<4>::UInt +// +// is typedef-ed to be unsigned int (unsigned integer made up of 4 +// bytes). +// +// Such functionality should belong to STL, but I cannot find it +// there. +// +// Google Test uses this class in the implementation of floating-point +// comparison. +// +// For now it only handles UInt (unsigned int) as that's all Google Test +// needs. Other types can be easily added in the future if need +// arises. +template +class TypeWithSize { + public: + // This prevents the user from using TypeWithSize with incorrect + // values of N. + using UInt = void; +}; + +// The specialization for size 4. +template <> +class TypeWithSize<4> { + public: + using Int = std::int32_t; + using UInt = std::uint32_t; +}; + +// The specialization for size 8. +template <> +class TypeWithSize<8> { + public: + using Int = std::int64_t; + using UInt = std::uint64_t; +}; + +// Integer types of known sizes. +using TimeInMillis = int64_t; // Represents time in milliseconds. + +// Utilities for command line flags and environment variables. + +// Macro for referencing flags. +#if !defined(GTEST_FLAG) +#define GTEST_FLAG(name) FLAGS_gtest_##name +#endif // !defined(GTEST_FLAG) + +#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) +#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1 +#endif // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_) + +#if !defined(GTEST_DECLARE_bool_) +#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver + +// Macros for declaring flags. +#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) +#define GTEST_DECLARE_int32_(name) \ + GTEST_API_ extern std::int32_t GTEST_FLAG(name) +#define GTEST_DECLARE_string_(name) \ + GTEST_API_ extern ::std::string GTEST_FLAG(name) + +// Macros for defining flags. 
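+//
+// A usage sketch (illustrative; "my_flag" is a hypothetical flag):
+//   GTEST_DECLARE_bool_(my_flag);                          // in a header
+//   GTEST_DEFINE_bool_(my_flag, false, "Enables my_flag"); // in a .cc file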
+#define GTEST_DEFINE_bool_(name, default_val, doc) \ + GTEST_API_ bool GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_string_(name, default_val, doc) \ + GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) + +#endif // !defined(GTEST_DECLARE_bool_) + +// Thread annotations +#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) +#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) +#define GTEST_LOCK_EXCLUDED_(locks) +#endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_) + +// Parses 'str' for a 32-bit signed integer. If successful, writes the result +// to *value and returns true; otherwise leaves *value unchanged and returns +// false. +bool ParseInt32(const Message &src_text, const char *str, int32_t *value); + +// Parses a bool/int32_t/string from the environment variable +// corresponding to the given Google Test flag. +bool BoolFromGTestEnv(const char *flag, bool default_val); +GTEST_API_ int32_t Int32FromGTestEnv(const char *flag, int32_t default_val); +std::string OutputFlagAlsoCheckEnvVar(); +const char *StringFromGTestEnv(const char *flag, const char *default_val); + +} // namespace internal +} // namespace testing + +#if !defined(GTEST_INTERNAL_DEPRECATED) + +// Internal Macro to mark an API deprecated, for googletest usage only +// Usage: class GTEST_INTERNAL_DEPRECATED(message) MyClass or +// GTEST_INTERNAL_DEPRECATED(message) myFunction(); Every usage of +// a deprecated entity will trigger a warning when compiled with +// `-Wdeprecated-declarations` option (clang, gcc, any __GNUC__ compiler). +// For msvc /W3 option will need to be used +// Note that for 'other' compilers this macro evaluates to nothing to prevent +// compilations errors. +#if defined(_MSC_VER) +#define GTEST_INTERNAL_DEPRECATED(message) __declspec(deprecated(message)) +#elif defined(__GNUC__) +#define GTEST_INTERNAL_DEPRECATED(message) __attribute__((deprecated(message))) +#else +#define GTEST_INTERNAL_DEPRECATED(message) +#endif + +#endif // !defined(GTEST_INTERNAL_DEPRECATED) + +#if GTEST_HAS_ABSL +// Always use absl::string_view for Matcher<> specializations if googletest +// is built with absl support. +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#include "absl/strings/string_view.h" +namespace testing { +namespace internal { +using StringView = ::absl::string_view; +} // namespace internal +} // namespace testing +#else +#ifdef __has_include +#if __has_include() && __cplusplus >= 201703L +// Otherwise for C++17 and higher use std::string_view for Matcher<> +// specializations. +#define GTEST_INTERNAL_HAS_STRING_VIEW 1 +#include +namespace testing { +namespace internal { +using StringView = ::std::string_view; +} // namespace internal +} // namespace testing + // The case where absl is configured NOT to alias std::string_view is not + // supported. +#endif // __has_include() && __cplusplus >= 201703L +#endif // __has_include +#endif // GTEST_HAS_ABSL + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h new file mode 100644 index 000000000..f1f933097 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-string.h @@ -0,0 +1,171 @@ +// Copyright 2005, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// The Google C++ Testing and Mocking Framework (Google Test) +// +// This header file declares the String class and functions used internally by +// Google Test. They are subject to change without notice. They should not used +// by code external to Google Test. +// +// This header file is #included by gtest-internal.h. +// It should not be #included by other files. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ + +#ifdef __BORLANDC__ +// string.h is not guaranteed to provide strcpy on C++ Builder. +#include +#endif + +#include +#include +#include + +#include "gtest/internal/gtest-port.h" + +namespace testing { +namespace internal { + +// String - an abstract class holding static string utilities. +class GTEST_API_ String { + public: + // Static utility methods + + // Clones a 0-terminated C string, allocating memory using new. The + // caller is responsible for deleting the return value using + // delete[]. Returns the cloned string, or NULL if the input is + // NULL. + // + // This is different from strdup() in string.h, which allocates + // memory using malloc(). + static const char *CloneCString(const char *c_str); + +#if GTEST_OS_WINDOWS_MOBILE + // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be + // able to pass strings to Win32 APIs on CE we need to convert them + // to 'Unicode', UTF-16. + + // Creates a UTF-16 wide string from the given ANSI string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the wide string, or NULL if the + // input is NULL. + // + // The wide string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static LPCWSTR AnsiToUtf16(const char *c_str); + + // Creates an ANSI string from the given wide string, allocating + // memory using new. 
The caller is responsible for deleting the return + // value using delete[]. Returns the ANSI string, or NULL if the + // input is NULL. + // + // The returned string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static const char *Utf16ToAnsi(LPCWSTR utf16_str); +#endif + + // Compares two C strings. Returns true if and only if they have the same + // content. + // + // Unlike strcmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CStringEquals(const char *lhs, const char *rhs); + + // Converts a wide C string to a String using the UTF-8 encoding. + // NULL will be converted to "(null)". If an error occurred during + // the conversion, "(failed to convert from wide string)" is + // returned. + static std::string ShowWideCString(const wchar_t *wide_c_str); + + // Compares two wide C strings. Returns true if and only if they have the + // same content. + // + // Unlike wcscmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool WideCStringEquals(const wchar_t *lhs, const wchar_t *rhs); + + // Compares two C strings, ignoring case. Returns true if and only if + // they have the same content. + // + // Unlike strcasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CaseInsensitiveCStringEquals(const char *lhs, const char *rhs); + + // Compares two wide C strings, ignoring case. Returns true if and only if + // they have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. + // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. + static bool CaseInsensitiveWideCStringEquals(const wchar_t *lhs, + const wchar_t *rhs); + + // Returns true if and only if the given string ends with the given suffix, + // ignoring case. Any string is considered to end with an empty suffix. + static bool EndsWithCaseInsensitive(const std::string &str, + const std::string &suffix); + + // Formats an int value as "%02d". + static std::string FormatIntWidth2(int value); // "%02d" for width == 2 + + // Formats an int value as "%X". + static std::string FormatHexInt(int value); + + // Formats an int value as "%X". + static std::string FormatHexUInt32(uint32_t value); + + // Formats a byte as "%02X". + static std::string FormatByte(unsigned char value); + + private: + String(); // Not meant to be instantiated. +}; // class String + +// Gets the content of the stringstream's buffer as an std::string. Each '\0' +// character in the buffer is replaced with "\\0". 
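+// For example (illustrative):
+//   ::std::stringstream ss;
+//   ss << "a" << '\0' << "b";
+//   StringStreamToString(&ss);  // returns "a\\0b"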
+GTEST_API_ std::string StringStreamToString(::std::stringstream *stream); + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ diff --git a/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h new file mode 100644 index 000000000..3b3a651dc --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/include/gtest/internal/gtest-type-util.h @@ -0,0 +1,184 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Type utilities needed for implementing typed and type-parameterized +// tests. + +// GOOGLETEST_CM0001 DO NOT DELETE + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + +#include "gtest/internal/gtest-port.h" + +// #ifdef __GNUC__ is too general here. It is possible to use gcc without using +// libstdc++ (which is where cxxabi.h comes from). +#if GTEST_HAS_CXXABI_H_ +#include +#elif defined(__HP_aCC) +#include +#endif // GTEST_HASH_CXXABI_H_ + +namespace testing { +namespace internal { + +// Canonicalizes a given name with respect to the Standard C++ Library. +// This handles removing the inline namespace within `std` that is +// used by various standard libraries (e.g., `std::__1`). Names outside +// of namespace std are returned unmodified. +inline std::string CanonicalizeForStdLibVersioning(std::string s) { + static const char prefix[] = "std::__"; + if (s.compare(0, strlen(prefix), prefix) == 0) { + std::string::size_type end = s.find("::", strlen(prefix)); + if (end != s.npos) { + // Erase everything between the initial `std` and the second `::`. + s.erase(strlen("std"), end - strlen("std")); + } + } + return s; +} + +// GetTypeName() returns a human-readable name of type T. +// NB: This function is also used in Google Mock, so don't move it inside of +// the typed-test-only section below. 
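+//
+// For example (illustrative): GetTypeName<int>() typically returns "int" when
+// RTTI is enabled, and the placeholder "<type>" when it is not.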
+template <typename T>
+std::string GetTypeName() {
+#if GTEST_HAS_RTTI
+
+  const char *const name = typeid(T).name();
+#if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+  int status = 0;
+  // gcc's implementation of typeid(T).name() mangles the type name,
+  // so we have to demangle it.
+#if GTEST_HAS_CXXABI_H_
+  using abi::__cxa_demangle;
+#endif  // GTEST_HAS_CXXABI_H_
+  char *const readable_name = __cxa_demangle(name, nullptr, nullptr, &status);
+  const std::string name_str(status == 0 ? readable_name : name);
+  free(readable_name);
+  return CanonicalizeForStdLibVersioning(name_str);
+#else
+  return name;
+#endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
+
+#else
+
+  return "<type>";
+
+#endif  // GTEST_HAS_RTTI
+}
+
+#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+// A unique type indicating an empty node
+struct None {};
+
+#define GTEST_TEMPLATE_ \
+  template <typename T> \
+  class
+
+// The template "selector" struct TemplateSel<Tmpl> is used to
+// represent Tmpl, which must be a class template with one type
+// parameter, as a type. TemplateSel<Tmpl>::Bind<T>::type is defined
+// as the type Tmpl<T>. This allows us to actually instantiate the
+// template "selected" by TemplateSel<Tmpl>.
+//
+// This trick is necessary for simulating typedef for class templates,
+// which C++ doesn't support directly.
+template <GTEST_TEMPLATE_ Tmpl>
+struct TemplateSel {
+  template <typename T>
+  struct Bind {
+    typedef Tmpl<T> type;
+  };
+};
+
+#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind<T>::type
+
+template <GTEST_TEMPLATE_ Head_, GTEST_TEMPLATE_... Tail_>
+struct Templates {
+  using Head = TemplateSel<Head_>;
+  using Tail = Templates<Tail_...>;
+};
+
+template <GTEST_TEMPLATE_ Head_>
+struct Templates<Head_> {
+  using Head = TemplateSel<Head_>;
+  using Tail = None;
+};
+
+// Tuple-like type lists
+template <typename Head_, typename... Tail_>
+struct Types {
+  using Head = Head_;
+  using Tail = Types<Tail_...>;
+};
+
+template <typename Head_>
+struct Types<Head_> {
+  using Head = Head_;
+  using Tail = None;
+};
+
+// Helper metafunctions to tell apart a single type from types
+// generated by ::testing::Types
+template <typename... Ts>
+struct ProxyTypeList {
+  using type = Types<Ts...>;
+};
+
+template <typename>
+struct is_proxy_type_list : std::false_type {};
+
+template <typename... Ts>
+struct is_proxy_type_list<ProxyTypeList<Ts...>> : std::true_type {};
+
+// Generator which conditionally creates type lists.
+// It recognizes if a requested type list should be created
+// and prevents creating a new type list nested within another one.
+template <typename T>
+struct GenerateTypeList {
+ private:
+  using proxy = typename std::conditional<is_proxy_type_list<T>::value, T,
+                                          ProxyTypeList<T>>::type;
+
+ public:
+  using type = typename proxy::type;
+};
+
+#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+
+}  // namespace internal
+
+template <typename... Ts>
+using Types = internal::ProxyTypeList<Ts...>;
+
+}  // namespace testing
+
+#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-all.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-all.cc
new file mode 100644
index 000000000..ad292905c
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-all.cc
@@ -0,0 +1,48 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// Google C++ Testing and Mocking Framework (Google Test) +// +// Sometimes it's desirable to build Google Test by compiling a single file. +// This file serves this purpose. + +// This line ensures that gtest.h can be compiled on its own, even +// when it's fused. +#include "gtest/gtest.h" + +// The following lines pull in the real gtest *.cc files. +#include "src/gtest.cc" +#include "src/gtest-death-test.cc" +#include "src/gtest-filepath.cc" +#include "src/gtest-matchers.cc" +#include "src/gtest-port.cc" +#include "src/gtest-printers.cc" +#include "src/gtest-test-part.cc" +#include "src/gtest-typed-test.cc" diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-death-test.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-death-test.cc new file mode 100644 index 000000000..c38551cda --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-death-test.cc @@ -0,0 +1,1614 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
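Aside, not part of the upstream file: the implementation below is what runs when a test uses the public death-test assertions. A minimal usage sketch; standard googletest API, with invented test and helper names:

    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include "gtest/gtest.h"

    static void Crash() {
      fprintf(stderr, "fatal: invariant violated\n");
      abort();
    }

    TEST(MyDeathTest, CrashesWithMessage) {
      // The statement runs in a child process; stderr must match the regex.
      EXPECT_DEATH(Crash(), "invariant violated");
      // Predicates give finer control over the expected exit status.
      EXPECT_EXIT(exit(2), ::testing::ExitedWithCode(2), "");
    #if !defined(_WIN32)
      EXPECT_EXIT(Crash(), ::testing::KilledBySignal(SIGABRT), "invariant");
    #endif
    }

The file that follows implements the parent/child choreography behind these assertions.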
+
+//
+// This file implements death tests.
+
+#include "gtest/gtest-death-test.h"
+
+#include <utility>
+
+#include "gtest/internal/gtest-port.h"
+#include "gtest/internal/custom/gtest.h"
+
+#if GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_MAC
+#include <crt_externs.h>
+#endif  // GTEST_OS_MAC
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#if GTEST_OS_LINUX
+#include <signal.h>
+#endif  // GTEST_OS_LINUX
+
+#include <stdarg.h>
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <sys/wait.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_QNX
+#include <spawn.h>
+#endif  // GTEST_OS_QNX
+
+#if GTEST_OS_FUCHSIA
+#include <lib/fdio/fd.h>
+#include <lib/fdio/io.h>
+#include <lib/fdio/spawn.h>
+#include <lib/zx/channel.h>
+#include <lib/zx/port.h>
+#include <lib/zx/process.h>
+#include <lib/zx/socket.h>
+#include <zircon/processargs.h>
+#include <zircon/syscalls.h>
+#include <zircon/syscalls/policy.h>
+#include <zircon/syscalls/port.h>
+#endif  // GTEST_OS_FUCHSIA
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-string.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+
+// Constants.
+
+// The default death test style.
+//
+// This is defined in internal/gtest-port.h as "fast", but can be overridden by
+// a definition in internal/custom/gtest-port.h. The recommended value, which is
+// used internally at Google, is "threadsafe".
+static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE;
+
+GTEST_DEFINE_string_(
+    death_test_style,
+    internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+    "Indicates how to run a death test in a forked child process: "
+    "\"threadsafe\" (child process re-executes the test binary "
+    "from the beginning, running only the specific death test) or "
+    "\"fast\" (child process runs the death test immediately "
+    "after forking).");
+
+GTEST_DEFINE_bool_(
+    death_test_use_fork,
+    internal::BoolFromGTestEnv("death_test_use_fork", false),
+    "Instructs to use fork()/_exit() instead of clone() in death tests. "
+    "Ignored and always uses fork() on POSIX systems where clone() is not "
+    "implemented. Useful when running under valgrind or similar tools if "
+    "those do not support clone(). Valgrind 3.3.1 will just fail if "
+    "it sees an unsupported combination of clone() flags. "
+    "It is not recommended to use this flag w/o valgrind though it will "
+    "work in 99% of the cases. Once valgrind is fixed, this flag will "
+    "most likely be removed.");
+
+namespace internal {
+GTEST_DEFINE_string_(
+    internal_run_death_test, "",
+    "Indicates the file, line number, temporal index of "
+    "the single death test to run, and a file descriptor to "
+    "which a success code may be sent, all separated by "
+    "the '|' characters. This flag is specified if and only if the "
+    "current process is a sub-process launched for running a thread-safe "
+    "death test. FOR INTERNAL USE ONLY.");
+}  // namespace internal
+
+#if GTEST_HAS_DEATH_TEST
+
+namespace internal {
+
+// Valid only for fast death tests. Indicates the code is running in the
+// child process of a fast style death test.
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+static bool g_in_fast_death_test_child = false;
+#endif
+
+// Returns a Boolean value indicating whether the caller is currently
+// executing in the context of the death test child process. Tools such as
+// Valgrind heap checkers may need this to modify their behavior in death
+// tests. IMPORTANT: This is an internal utility. Using it may break the
+// implementation of death tests. User code MUST NOT use it.
+bool InDeathTestChild() {
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+
+  // On Windows and Fuchsia, death tests are thread-safe regardless of the
+  // value of the death_test_style flag.
+ return !GTEST_FLAG(internal_run_death_test).empty(); + +#else + + if (GTEST_FLAG(death_test_style) == "threadsafe") + return !GTEST_FLAG(internal_run_death_test).empty(); + else + return g_in_fast_death_test_child; +#endif +} + +} // namespace internal + +// ExitedWithCode constructor. +ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {} + +// ExitedWithCode function-call operator. +bool ExitedWithCode::operator()(int exit_status) const { +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + return exit_status == exit_code_; + +#else + + return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; + +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA +} + +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// KilledBySignal constructor. +KilledBySignal::KilledBySignal(int signum) : signum_(signum) {} + +// KilledBySignal function-call operator. +bool KilledBySignal::operator()(int exit_status) const { +#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) + { + bool result; + if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) { + return result; + } + } +#endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) + return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; +} +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA + +namespace internal { + +// Utilities needed for death tests. + +// Generates a textual description of a given exit code, in the format +// specified by wait(2). +static std::string ExitSummary(int exit_code) { + Message m; + +#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + m << "Exited with exit status " << exit_code; + +#else + + if (WIFEXITED(exit_code)) { + m << "Exited with exit status " << WEXITSTATUS(exit_code); + } else if (WIFSIGNALED(exit_code)) { + m << "Terminated by signal " << WTERMSIG(exit_code); + } +#ifdef WCOREDUMP + if (WCOREDUMP(exit_code)) { + m << " (core dumped)"; + } +#endif +#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA + + return m.GetString(); +} + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +bool ExitedUnsuccessfully(int exit_status) { + return !ExitedWithCode(0)(exit_status); +} + +#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA +// Generates a textual failure message when a death test finds more than +// one thread running, or cannot determine the number of threads, prior +// to executing the given statement. It is the responsibility of the +// caller not to pass a thread_count of 1. +static std::string DeathTestThreadWarning(size_t thread_count) { + Message msg; + msg << "Death tests use fork(), which is unsafe particularly" + << " in a threaded context. For this test, " << GTEST_NAME_ << " "; + if (thread_count == 0) { + msg << "couldn't detect the number of threads."; + } else { + msg << "detected " << thread_count << " threads."; + } + msg << " See " + "https://github.com/google/googletest/blob/master/googletest/docs/" + "advanced.md#death-tests-and-threads" + << " for more explanation and suggested solutions, especially if" + << " this is the last message you see before your test times out."; + return msg.GetString(); +} +#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA + +// Flag characters for reporting a death test that did not die. +static const char kDeathTestLived = 'L'; +static const char kDeathTestReturned = 'R'; +static const char kDeathTestThrew = 'T'; +static const char kDeathTestInternalError = 'I'; + +#if GTEST_OS_FUCHSIA + +// File descriptor used for the pipe in the child process. 
+static const int kFuchsiaReadPipeFd = 3; + +#endif + +// An enumeration describing all of the possible ways that a death test can +// conclude. DIED means that the process died while executing the test +// code; LIVED means that process lived beyond the end of the test code; +// RETURNED means that the test statement attempted to execute a return +// statement, which is not allowed; THREW means that the test statement +// returned control by throwing an exception. IN_PROGRESS means the test +// has not yet concluded. +enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; + +// Routine for aborting the program which is safe to call from an +// exec-style death test child process, in which case the error +// message is propagated back to the parent process. Otherwise, the +// message is simply printed to stderr. In either case, the program +// then exits with status 1. +static void DeathTestAbort(const std::string &message) { + // On a POSIX system, this function may be called from a threadsafe-style + // death test child process, which operates on a very small stack. Use + // the heap for any additional non-minuscule memory requirements. + const InternalRunDeathTestFlag *const flag = + GetUnitTestImpl()->internal_run_death_test_flag(); + if (flag != nullptr) { + FILE *parent = posix::FDOpen(flag->write_fd(), "w"); + fputc(kDeathTestInternalError, parent); + fprintf(parent, "%s", message.c_str()); + fflush(parent); + _exit(1); + } else { + fprintf(stderr, "%s", message.c_str()); + fflush(stderr); + posix::Abort(); + } +} + +// A replacement for CHECK that calls DeathTestAbort if the assertion +// fails. +#define GTEST_DEATH_TEST_CHECK_(expression) \ + do { \ + if (!::testing::internal::IsTrue(expression)) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for +// evaluating any system call that fulfills two conditions: it must return +// -1 on failure, and set errno to EINTR when it is interrupted and +// should be tried again. The macro expands to a loop that repeatedly +// evaluates the expression as long as it evaluates to -1 and sets +// errno to EINTR. If the expression evaluates to -1 but errno is +// something other than EINTR, DeathTestAbort is called. +#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ + do { \ + int gtest_retval; \ + do { \ + gtest_retval = (expression); \ + } while (gtest_retval == -1 && errno == EINTR); \ + if (gtest_retval == -1) { \ + DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \ + ", line " + \ + ::testing::internal::StreamableToString(__LINE__) + \ + ": " + #expression + " != -1"); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// Returns the message describing the last system error in errno. +std::string GetLastErrnoDescription() { + return errno == 0 ? "" : posix::StrError(errno); +} + +// This is called from a death test parent process to read a failure +// message from the death test child process and log it with the FATAL +// severity. On Windows, the message is read from a pipe handle. On other +// platforms, it is read from a file descriptor. 
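Aside, not part of the upstream file: GTEST_DEATH_TEST_CHECK_SYSCALL_ above encodes the standard POSIX retry-on-EINTR idiom. Outside a macro it reduces to a loop like this; retry_read is an invented name:

    #include <errno.h>
    #include <unistd.h>

    // Retries a read() interrupted by a signal; any other failure (-1 with
    // errno != EINTR) is returned to the caller unchanged.
    static ssize_t retry_read(int fd, void *buf, size_t count) {
      ssize_t result;
      do {
        result = read(fd, buf, count);
      } while (result == -1 && errno == EINTR);
      return result;
    }

FailFromInternalError(), next, wraps the same pattern around posix::Read() while collecting a child's error message.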
+static void FailFromInternalError(int fd) {
+  Message error;
+  char buffer[256];
+  int num_read;
+
+  do {
+    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
+      buffer[num_read] = '\0';
+      error << buffer;
+    }
+  } while (num_read == -1 && errno == EINTR);
+
+  if (num_read == 0) {
+    GTEST_LOG_(FATAL) << error.GetString();
+  } else {
+    const int last_error = errno;
+    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
+                      << GetLastErrnoDescription() << " [" << last_error << "]";
+  }
+}
+
+// Death test constructor. Increments the running death test count
+// for the current test.
+DeathTest::DeathTest() {
+  TestInfo *const info = GetUnitTestImpl()->current_test_info();
+  if (info == nullptr) {
+    DeathTestAbort(
+        "Cannot run a death test outside of a TEST or "
+        "TEST_F construct");
+  }
+}
+
+// Creates and returns a death test by dispatching to the current
+// death test factory.
+bool DeathTest::Create(const char *statement,
+                       Matcher<const std::string &> matcher, const char *file,
+                       int line, DeathTest **test) {
+  return GetUnitTestImpl()->death_test_factory()->Create(
+      statement, std::move(matcher), file, line, test);
+}
+
+const char *DeathTest::LastMessage() {
+  return last_death_test_message_.c_str();
+}
+
+void DeathTest::set_last_death_test_message(const std::string &message) {
+  last_death_test_message_ = message;
+}
+
+std::string DeathTest::last_death_test_message_;
+
+// Provides cross platform implementation for some death functionality.
+class DeathTestImpl : public DeathTest {
+ protected:
+  DeathTestImpl(const char *a_statement, Matcher<const std::string &> matcher)
+      : statement_(a_statement), matcher_(std::move(matcher)), spawned_(false),
+        status_(-1), outcome_(IN_PROGRESS), read_fd_(-1), write_fd_(-1) {}
+
+  // read_fd_ is expected to be closed and cleared by a derived class.
+  ~DeathTestImpl() override { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }
+
+  void Abort(AbortReason reason) override;
+  bool Passed(bool status_ok) override;
+
+  const char *statement() const { return statement_; }
+  bool spawned() const { return spawned_; }
+  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
+  int status() const { return status_; }
+  void set_status(int a_status) { status_ = a_status; }
+  DeathTestOutcome outcome() const { return outcome_; }
+  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
+  int read_fd() const { return read_fd_; }
+  void set_read_fd(int fd) { read_fd_ = fd; }
+  int write_fd() const { return write_fd_; }
+  void set_write_fd(int fd) { write_fd_ = fd; }
+
+  // Called in the parent process only. Reads the result code of the death
+  // test child process via a pipe, interprets it to set the outcome_
+  // member, and closes read_fd_. Outputs diagnostics and terminates in
+  // case of unexpected codes.
+  void ReadAndInterpretStatusByte();
+
+  // Returns stderr output from the child process.
+  virtual std::string GetErrorLogs();
+
+ private:
+  // The textual content of the code this object is testing. This class
+  // doesn't own this string and should not attempt to delete it.
+  const char *const statement_;
+  // A matcher that's expected to match the stderr output by the child process.
+  Matcher<const std::string &> matcher_;
+  // True if the death test child process has been successfully spawned.
+  bool spawned_;
+  // The exit status of the child process.
+  int status_;
+  // How the death test concluded.
+  DeathTestOutcome outcome_;
+  // Descriptor to the read end of the pipe to the child process. It is
+  // always -1 in the child process. The child keeps its write end of the
+  // pipe in write_fd_.
+  int read_fd_;
+  // Descriptor to the child's write end of the pipe to the parent process.
+  // It is always -1 in the parent process. The parent keeps its end of the
+  // pipe in read_fd_.
+  int write_fd_;
+};
+
+// Called in the parent process only. Reads the result code of the death
+// test child process via a pipe, interprets it to set the outcome_
+// member, and closes read_fd_. Outputs diagnostics and terminates in
+// case of unexpected codes.
+void DeathTestImpl::ReadAndInterpretStatusByte() {
+  char flag;
+  int bytes_read;
+
+  // The read() here blocks until data is available (signifying the
+  // failure of the death test) or until the pipe is closed (signifying
+  // its success), so it's okay to call this in the parent before
+  // the child process has exited.
+  do {
+    bytes_read = posix::Read(read_fd(), &flag, 1);
+  } while (bytes_read == -1 && errno == EINTR);
+
+  if (bytes_read == 0) {
+    set_outcome(DIED);
+  } else if (bytes_read == 1) {
+    switch (flag) {
+      case kDeathTestReturned: set_outcome(RETURNED); break;
+      case kDeathTestThrew: set_outcome(THREW); break;
+      case kDeathTestLived: set_outcome(LIVED); break;
+      case kDeathTestInternalError:
+        FailFromInternalError(read_fd());  // Does not return.
+        break;
+      default:
+        GTEST_LOG_(FATAL) << "Death test child process reported "
+                          << "unexpected status byte ("
+                          << static_cast<unsigned int>(flag) << ")";
+    }
+  } else {
+    GTEST_LOG_(FATAL) << "Read from death test child process failed: "
+                      << GetLastErrnoDescription();
+  }
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd()));
+  set_read_fd(-1);
+}
+
+std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); }
+
+// Signals that the death test code which should have exited, didn't.
+// Should be called only in a death test child process.
+// Writes a status byte to the child's status file descriptor, then
+// calls _exit(1).
+void DeathTestImpl::Abort(AbortReason reason) {
+  // The parent process considers the death test to be a failure if
+  // it finds any data in our pipe. So, here we write a single flag byte
+  // to the pipe, then exit.
+  const char status_ch = reason == TEST_DID_NOT_DIE
+                             ? kDeathTestLived
+                             : reason == TEST_THREW_EXCEPTION
+                                   ? kDeathTestThrew
+                                   : kDeathTestReturned;
+
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
+  // We are leaking the descriptor here because on some platforms (i.e.,
+  // when built as Windows DLL), destructors of global objects will still
+  // run after calling _exit(). On such systems, write_fd_ will be
+  // indirectly closed from the destructor of UnitTestImpl, causing double
+  // close if it is also closed here. On debug configurations, double close
+  // may assert. As there are no in-process buffers to flush here, we are
+  // relying on the OS to close the descriptor after the process terminates
+  // when the destructors are not run.
+  _exit(1);  // Exits w/o any normal exit hooks (we were supposed to crash)
+}
+
+// Returns an indented copy of stderr output for a death test.
+// This makes distinguishing death test output lines from regular log lines
+// much easier.
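Aside, not part of the upstream file: ReadAndInterpretStatusByte() and Abort() above define a one-byte protocol over the pipe. Reading EOF means the child died as expected, while 'L', 'R', 'T' or 'I' reports that it lived, returned, threw, or hit an internal error. A condensed child-side sketch; write_fd and the helper name are illustrative:

    #include <unistd.h>

    // Any byte in the pipe marks a failed death test; 'L' mirrors the
    // kDeathTestLived flag character defined earlier.
    static void ReportDidNotDie(int write_fd) {
      const char status_ch = 'L';
      write(write_fd, &status_ch, 1);
      _exit(1);  // Skip normal exit hooks; the statement was supposed to die.
    }

FormatDeathTestOutput(), next, indents each line of the captured stderr for reporting.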
+static ::std::string FormatDeathTestOutput(const ::std::string &output) {
+  ::std::string ret;
+  for (size_t at = 0;;) {
+    const size_t line_end = output.find('\n', at);
+    ret += "[  DEATH   ] ";
+    if (line_end == ::std::string::npos) {
+      ret += output.substr(at);
+      break;
+    }
+    ret += output.substr(at, line_end + 1 - at);
+    at = line_end + 1;
+  }
+  return ret;
+}
+
+// Assesses the success or failure of a death test, using both private
+// members which have previously been set, and one argument:
+//
+// Private data members:
+//   outcome:  An enumeration describing how the death test
+//             concluded: DIED, LIVED, THREW, or RETURNED. The death test
+//             fails in the latter three cases.
+//   status:   The exit status of the child process. On *nix, it is in the
+//             format specified by wait(2). On Windows, this is the
+//             value supplied to the ExitProcess() API or a numeric code
+//             of the exception that terminated the program.
+//   matcher_: A matcher that's expected to match the stderr output by the
+//             child process.
+//
+// Argument:
+//   status_ok: true if exit_status is acceptable in the context of
+//              this particular death test, which fails if it is false
+//
+// Returns true if and only if all of the above conditions are met. Otherwise,
+// the first failing condition, in the order given above, is the one that is
+// reported. Also sets the last death test message string.
+bool DeathTestImpl::Passed(bool status_ok) {
+  if (!spawned()) return false;
+
+  const std::string error_message = GetErrorLogs();
+
+  bool success = false;
+  Message buffer;
+
+  buffer << "Death test: " << statement() << "\n";
+  switch (outcome()) {
+    case LIVED:
+      buffer << "    Result: failed to die.\n"
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
+      break;
+    case THREW:
+      buffer << "    Result: threw an exception.\n"
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
+      break;
+    case RETURNED:
+      buffer << "    Result: illegal return in test statement.\n"
+             << " Error msg:\n"
+             << FormatDeathTestOutput(error_message);
+      break;
+    case DIED:
+      if (status_ok) {
+        if (matcher_.Matches(error_message)) {
+          success = true;
+        } else {
+          std::ostringstream stream;
+          matcher_.DescribeTo(&stream);
+          buffer << "    Result: died but not with expected error.\n"
+                 << "  Expected: " << stream.str() << "\n"
+                 << "Actual msg:\n"
+                 << FormatDeathTestOutput(error_message);
+        }
+      } else {
+        buffer << "    Result: died but not with expected exit code:\n"
+               << "            " << ExitSummary(status()) << "\n"
+               << "Actual msg:\n"
+               << FormatDeathTestOutput(error_message);
+      }
+      break;
+    case IN_PROGRESS:
+    default:
+      GTEST_LOG_(FATAL)
+          << "DeathTest::Passed somehow called before conclusion of test";
+  }
+
+  DeathTest::set_last_death_test_message(buffer.GetString());
+  return success;
+}
+
+#if GTEST_OS_WINDOWS
+// WindowsDeathTest implements death tests on Windows. Due to the
+// specifics of starting new processes on Windows, death tests there are
+// always threadsafe, and Google Test considers the
+// --gtest_death_test_style=fast setting to be equivalent to
+// --gtest_death_test_style=threadsafe there.
+//
+// A few implementation notes: Like the Linux version, the Windows
+// implementation uses pipes for child-to-parent communication. But due to
+// the specifics of pipes on Windows, some extra steps are required:
+//
+// 1. The parent creates a communication pipe and stores handles to both
+//    ends of it.
+// 2. The parent starts the child and provides it with the information
+//    necessary to acquire the handle to the write end of the pipe.
+// 3. The child acquires the write end of the pipe and signals the parent
+//    using a Windows event.
+// 4. Now the parent can release the write end of the pipe on its side. If
+//    this is done before step 3, the object's reference count goes down to
+//    0 and it is destroyed, preventing the child from acquiring it. The
+//    parent now has to release it, or read operations on the read end of
+//    the pipe will not return when the child terminates.
+// 5. The parent reads the child's output through the pipe (outcome code and
+//    any possible error messages), and its stderr, and then determines
+//    whether to fail the test.
+//
+// Note: to distinguish Win32 API calls from the local method and function
+// calls, the former are explicitly resolved in the global namespace.
+//
+class WindowsDeathTest : public DeathTestImpl {
+ public:
+  WindowsDeathTest(const char *a_statement,
+                   Matcher<const std::string &> matcher, const char *file,
+                   int line)
+      : DeathTestImpl(a_statement, std::move(matcher)), file_(file),
+        line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  virtual int Wait();
+  virtual TestRole AssumeRole();
+
+ private:
+  // The name of the file in which the death test is located.
+  const char *const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // Handle to the write end of the pipe to the child process.
+  AutoHandle write_handle_;
+  // Child process handle.
+  AutoHandle child_handle_;
+  // Event the child process uses to signal the parent that it has
+  // acquired the handle to the write end of the pipe. After seeing this
+  // event the parent can release its own handles to make sure its
+  // ReadFile() calls return when the child terminates.
+  AutoHandle event_handle_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists. As a side effect, sets the
+// outcome data member.
+int WindowsDeathTest::Wait() {
+  if (!spawned()) return 0;
+
+  // Wait until the child either signals that it has acquired the write end
+  // of the pipe or it dies.
+  const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
+  switch (::WaitForMultipleObjects(2, wait_handles,
+                                   FALSE,  // Waits for any of the handles.
+                                   INFINITE)) {
+    case WAIT_OBJECT_0:
+    case WAIT_OBJECT_0 + 1: break;
+    default: GTEST_DEATH_TEST_CHECK_(false);  // Should not get here.
+  }
+
+  // The child has acquired the write end of the pipe or exited.
+  // We release the handle on our side and continue.
+  write_handle_.Reset();
+  event_handle_.Reset();
+
+  ReadAndInterpretStatusByte();
+
+  // Waits for the child process to exit if it hasn't already. This
+  // returns immediately if the child has already exited, regardless of
+  // whether previous calls to WaitForMultipleObjects synchronized on this
+  // handle or not.
+  GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 ==
+                          ::WaitForSingleObject(child_handle_.Get(), INFINITE));
+  DWORD status_code;
+  GTEST_DEATH_TEST_CHECK_(
+      ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
+  child_handle_.Reset();
+  set_status(static_cast<int>(status_code));
+  return status();
+}
+
+// The AssumeRole process for a Windows death test. It creates a child
+// process with the same executable as the current process to run the
+// death test. The child process is given the --gtest_filter and
+// --gtest_internal_run_death_test flags such that it knows to run the
+// current death test only.
+DeathTest::TestRole WindowsDeathTest::AssumeRole() {
+  const UnitTestImpl *const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag *const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo *const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != nullptr) {
+    // ParseInternalRunDeathTestFlag() has performed all the necessary
+    // processing.
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  // WindowsDeathTest uses an anonymous pipe to communicate results of
+  // a death test.
+  SECURITY_ATTRIBUTES handles_are_inheritable = { sizeof(SECURITY_ATTRIBUTES),
+                                                  nullptr, TRUE };
+  HANDLE read_handle, write_handle;
+  GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle,
+                                       &handles_are_inheritable,
+                                       0)  // Default buffer size.
+                          != FALSE);
+  set_read_fd(
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), O_RDONLY));
+  write_handle_.Reset(write_handle);
+  event_handle_.Reset(::CreateEvent(
+      &handles_are_inheritable,
+      TRUE,       // The event will automatically reset to non-signaled state.
+      FALSE,      // The initial state is non-signalled.
+      nullptr));  // The event is unnamed.
+  GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr);
+  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                  kFilterFlag + "=" + info->test_suite_name() +
+                                  "." + info->name();
+  const std::string internal_flag =
+      std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" +
+      file_ + "|" + StreamableToString(line_) + "|" +
+      StreamableToString(death_test_index) + "|" +
+      StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
+      // size_t has the same width as pointers on both 32-bit and 64-bit
+      // Windows platforms.
+      // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
+      "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + "|" +
+      StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+
+  char executable_path[_MAX_PATH + 1];  // NOLINT
+  GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr,
+                                                                executable_path,
+                                                                _MAX_PATH));
+
+  std::string command_line = std::string(::GetCommandLineA()) + " " +
+                             filter_flag + " \"" + internal_flag + "\"";
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // Flush the log buffers since the log streams are shared with the child.
+  FlushInfoLog();
+
+  // The child process will share the standard handles with the parent.
+  STARTUPINFOA startup_info;
+  memset(&startup_info, 0, sizeof(STARTUPINFO));
+  startup_info.dwFlags = STARTF_USESTDHANDLES;
+  startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE);
+  startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE);
+  startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE);
+
+  PROCESS_INFORMATION process_info;
+  GTEST_DEATH_TEST_CHECK_(
+      ::CreateProcessA(
+          executable_path, const_cast<char *>(command_line.c_str()),
+          nullptr,  // Returned process handle is not inheritable.
+          nullptr,  // Returned thread handle is not inheritable.
+          TRUE,  // Child inherits all inheritable handles (for write_handle_).
+          0x0,   // Default creation flags.
+          nullptr,  // Inherit the parent's environment.
+          UnitTest::GetInstance()->original_working_dir(), &startup_info,
+          &process_info) != FALSE);
+  child_handle_.Reset(process_info.hProcess);
+  ::CloseHandle(process_info.hThread);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+#elif GTEST_OS_FUCHSIA
+
+class FuchsiaDeathTest : public DeathTestImpl {
+ public:
+  FuchsiaDeathTest(const char *a_statement,
+                   Matcher<const std::string &> matcher, const char *file,
+                   int line)
+      : DeathTestImpl(a_statement, std::move(matcher)), file_(file),
+        line_(line) {}
+
+  // All of these virtual functions are inherited from DeathTest.
+  int Wait() override;
+  TestRole AssumeRole() override;
+  std::string GetErrorLogs() override;
+
+ private:
+  // The name of the file in which the death test is located.
+  const char *const file_;
+  // The line number on which the death test is located.
+  const int line_;
+  // The stderr data captured by the child process.
+  std::string captured_stderr_;
+
+  zx::process child_process_;
+  zx::channel exception_channel_;
+  zx::socket stderr_socket_;
+};
+
+// Utility class for accumulating command-line arguments.
+class Arguments {
+ public:
+  Arguments() { args_.push_back(nullptr); }
+
+  ~Arguments() {
+    for (std::vector<char *>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+  void AddArgument(const char *argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str> &arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end(); ++i) {
+      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+    }
+  }
+  char *const *Argv() { return &args_[0]; }
+
+  int size() { return args_.size() - 1; }
+
+ private:
+  std::vector<char *> args_;
+};
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists. As a side effect, sets the
+// outcome data member.
+int FuchsiaDeathTest::Wait() {
+  const int kProcessKey = 0;
+  const int kSocketKey = 1;
+  const int kExceptionKey = 2;
+
+  if (!spawned()) return 0;
+
+  // Create a port to wait for socket/task/exception events.
+  zx_status_t status_zx;
+  zx::port port;
+  status_zx = zx::port::create(0, &port);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Register to wait for the child process to terminate.
+  status_zx = child_process_.wait_async(
+      port, kProcessKey, ZX_PROCESS_TERMINATED, ZX_WAIT_ASYNC_ONCE);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Register to wait for the socket to be readable or closed.
+  status_zx = stderr_socket_.wait_async(
+      port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
+      ZX_WAIT_ASYNC_ONCE);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  // Register to wait for an exception.
+  status_zx = exception_channel_.wait_async(
+      port, kExceptionKey, ZX_CHANNEL_READABLE, ZX_WAIT_ASYNC_ONCE);
+  GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+  bool process_terminated = false;
+  bool socket_closed = false;
+  do {
+    zx_port_packet_t packet = {};
+    status_zx = port.wait(zx::time::infinite(), &packet);
+    GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+
+    if (packet.key == kExceptionKey) {
+      // Process encountered an exception. Kill it directly rather than
+      // letting other handlers process the event. We will get a kProcessKey
+      // event when the process actually terminates.
+      status_zx = child_process_.kill();
+      GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
+    } else if (packet.key == kProcessKey) {
+      // Process terminated.
+ GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); + GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED); + process_terminated = true; + } else if (packet.key == kSocketKey) { + GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); + if (packet.signal.observed & ZX_SOCKET_READABLE) { + // Read data from the socket. + constexpr size_t kBufferSize = 1024; + do { + size_t old_length = captured_stderr_.length(); + size_t bytes_read = 0; + captured_stderr_.resize(old_length + kBufferSize); + status_zx = + stderr_socket_.read(0, &captured_stderr_.front() + old_length, + kBufferSize, &bytes_read); + captured_stderr_.resize(old_length + bytes_read); + } while (status_zx == ZX_OK); + if (status_zx == ZX_ERR_PEER_CLOSED) { + socket_closed = true; + } else { + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT); + status_zx = stderr_socket_.wait_async( + port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, + ZX_WAIT_ASYNC_ONCE); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + } + } else { + GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_SOCKET_PEER_CLOSED); + socket_closed = true; + } + } + } while (!process_terminated && !socket_closed); + + ReadAndInterpretStatusByte(); + + zx_info_process_t buffer; + status_zx = child_process_.get_info(ZX_INFO_PROCESS, &buffer, sizeof(buffer), + nullptr, nullptr); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + GTEST_DEATH_TEST_CHECK_(buffer.exited); + set_status(buffer.return_code); + return status(); +} + +// The AssumeRole process for a Fuchsia death test. It creates a child +// process with the same executable as the current process to run the +// death test. The child process is given the --gtest_filter and +// --gtest_internal_run_death_test flags such that it knows to run the +// current death test only. +DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { + const UnitTestImpl *const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag *const flag = + impl->internal_run_death_test_flag(); + const TestInfo *const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != nullptr) { + // ParseInternalRunDeathTestFlag() has performed all the necessary + // processing. + set_write_fd(kFuchsiaReadPipeFd); + return EXECUTE_TEST; + } + + // Flush the log buffers since the log streams are shared with the child. + FlushInfoLog(); + + // Build the child process command line. + const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + kFilterFlag + "=" + info->test_suite_name() + + "." + info->name(); + const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ + + kInternalRunDeathTestFlag + "=" + file_ + + "|" + StreamableToString(line_) + "|" + + StreamableToString(death_test_index); + Arguments args; + args.AddArguments(GetInjectableArgvs()); + args.AddArgument(filter_flag.c_str()); + args.AddArgument(internal_flag.c_str()); + + // Build the pipe for communication with the child. + zx_status_t status; + zx_handle_t child_pipe_handle; + int child_pipe_fd; + status = fdio_pipe_half(&child_pipe_fd, &child_pipe_handle); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + set_read_fd(child_pipe_fd); + + // Set the pipe handle for the child. 
+  fdio_spawn_action_t spawn_actions[2] = {};
+  fdio_spawn_action_t *add_handle_action = &spawn_actions[0];
+  add_handle_action->action = FDIO_SPAWN_ACTION_ADD_HANDLE;
+  add_handle_action->h.id = PA_HND(PA_FD, kFuchsiaReadPipeFd);
+  add_handle_action->h.handle = child_pipe_handle;
+
+  // Create a socket pair that will be used to receive the child process'
+  // stderr.
+  zx::socket stderr_producer_socket;
+  status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
+  GTEST_DEATH_TEST_CHECK_(status >= 0);
+  int stderr_producer_fd = -1;
+  status =
+      fdio_fd_create(stderr_producer_socket.release(), &stderr_producer_fd);
+  GTEST_DEATH_TEST_CHECK_(status >= 0);
+
+  // Make the stderr socket nonblocking.
+  GTEST_DEATH_TEST_CHECK_(fcntl(stderr_producer_fd, F_SETFL, 0) == 0);
+
+  fdio_spawn_action_t *add_stderr_action = &spawn_actions[1];
+  add_stderr_action->action = FDIO_SPAWN_ACTION_CLONE_FD;
+  add_stderr_action->fd.local_fd = stderr_producer_fd;
+  add_stderr_action->fd.target_fd = STDERR_FILENO;
+
+  // Create a child job.
+  zx_handle_t child_job = ZX_HANDLE_INVALID;
+  status = zx_job_create(zx_job_default(), 0, &child_job);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+  zx_policy_basic_t policy;
+  policy.condition = ZX_POL_NEW_ANY;
+  policy.policy = ZX_POL_ACTION_ALLOW;
+  status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC,
+                             &policy, 1);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  // Create an exception channel attached to the |child_job|, to allow
+  // us to suppress the system default exception handler from firing.
+  status = zx_task_create_exception_channel(
+      child_job, 0, exception_channel_.reset_and_get_address());
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  // Spawn the child process.
+  status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0],
+                          args.Argv(), nullptr, 2, spawn_actions,
+                          child_process_.reset_and_get_address(), nullptr);
+  GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
+
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; }
+
+#else  // We are neither on Windows, nor on Fuchsia.
+
+// ForkingDeathTest provides implementations for most of the abstract
+// methods of the DeathTest interface. Only the AssumeRole method is
+// left undefined.
+class ForkingDeathTest : public DeathTestImpl {
+ public:
+  ForkingDeathTest(const char *statement, Matcher<const std::string &> matcher);
+
+  // All of these virtual functions are inherited from DeathTest.
+  int Wait() override;
+
+ protected:
+  void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; }
+
+ private:
+  // PID of child process during death test; 0 in the child process itself.
+  pid_t child_pid_;
+};
+
+// Constructs a ForkingDeathTest.
+ForkingDeathTest::ForkingDeathTest(const char *a_statement,
+                                   Matcher<const std::string &> matcher)
+    : DeathTestImpl(a_statement, std::move(matcher)), child_pid_(-1) {}
+
+// Waits for the child in a death test to exit, returning its exit
+// status, or 0 if no child process exists. As a side effect, sets the
+// outcome data member.
+int ForkingDeathTest::Wait() {
+  if (!spawned()) return 0;
+
+  ReadAndInterpretStatusByte();
+
+  int status_value;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0));
+  set_status(status_value);
+  return status_value;
+}
+
+// A concrete death test class that forks, then immediately runs the test
+// in the child process.
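Aside, not part of the upstream file: which fork strategy runs is selected by the --gtest_death_test_style flag defined near the top of this file; it can also be set per test through the standard googletest flag variable:

    #include <stdlib.h>
    #include "gtest/gtest.h"

    TEST(MyDeathTest, UsesThreadsafeStyle) {
      // "threadsafe" re-executes the whole binary for the child process;
      // "fast" forks and runs the statement immediately.
      ::testing::FLAGS_gtest_death_test_style = "threadsafe";
      EXPECT_DEATH(abort(), "");
    }

NoExecDeathTest, declared next, implements the "fast" variant.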
+class NoExecDeathTest : public ForkingDeathTest {
+ public:
+  NoExecDeathTest(const char *a_statement, Matcher<const std::string &> matcher)
+      : ForkingDeathTest(a_statement, std::move(matcher)) {}
+  TestRole AssumeRole() override;
+};
+
+// The AssumeRole process for a fork-and-run death test. It implements a
+// straightforward fork, with a simple pipe to transmit the status byte.
+DeathTest::TestRole NoExecDeathTest::AssumeRole() {
+  const size_t thread_count = GetThreadCount();
+  if (thread_count != 1) {
+    GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count);
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+
+  DeathTest::set_last_death_test_message("");
+  CaptureStderr();
+  // When we fork the process below, the log file buffers are copied, but the
+  // file descriptors are shared. We flush all log files here so that closing
+  // the file descriptors in the child process doesn't throw off the
+  // synchronization between descriptors and buffers in the parent process.
+  // This is as close to the fork as possible to avoid a race condition in case
+  // there are multiple threads running before the death test, and another
+  // thread writes to the log file.
+  FlushInfoLog();
+
+  const pid_t child_pid = fork();
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  set_child_pid(child_pid);
+  if (child_pid == 0) {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0]));
+    set_write_fd(pipe_fd[1]);
+    // Redirects all logging to stderr in the child process to prevent
+    // concurrent writes to the log files. We capture stderr in the parent
+    // process and append the child process' output to a log.
+    LogToStderr();
+    // Event forwarding to the listeners of the event listener API must be
+    // shut down in death test subprocesses.
+    GetUnitTestImpl()->listeners()->SuppressEventForwarding();
+    g_in_fast_death_test_child = true;
+    return EXECUTE_TEST;
+  } else {
+    GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+    set_read_fd(pipe_fd[0]);
+    set_spawned(true);
+    return OVERSEE_TEST;
+  }
+}
+
+// A concrete death test class that forks and re-executes the main
+// program from the beginning, with command-line flags set that cause
+// only this specific death test to be run.
+class ExecDeathTest : public ForkingDeathTest {
+ public:
+  ExecDeathTest(const char *a_statement, Matcher<const std::string &> matcher,
+                const char *file, int line)
+      : ForkingDeathTest(a_statement, std::move(matcher)), file_(file),
+        line_(line) {}
+  TestRole AssumeRole() override;
+
+ private:
+  static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() {
+    ::std::vector<std::string> args = GetInjectableArgvs();
+#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+    ::std::vector<std::string> extra_args =
+        GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
+    args.insert(args.end(), extra_args.begin(), extra_args.end());
+#endif  // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+    return args;
+  }
+  // The name of the file in which the death test is located.
+  const char *const file_;
+  // The line number on which the death test is located.
+  const int line_;
+};
+
+// Utility class for accumulating command-line arguments.
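Aside, not part of the upstream file: GetArgvsForDeathTestChildProcess() above exposes a porting hook; if an embedder defines GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_ (normally in the internal/custom/ headers), its strings are appended to the child's argv. A hypothetical definition; the helper name and flag are invented:

    #include <string>
    #include <vector>

    // Returns extra flags the re-executed death-test child should receive.
    inline std::vector<std::string> ExtraDeathTestArgs() {
      return { "--my_embedder_flag=1" };
    }
    #define GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_() ExtraDeathTestArgs()

The Arguments helper, next, copies all of these strings into the NULL-terminated argv array handed to exec.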
+class Arguments {
+ public:
+  Arguments() { args_.push_back(nullptr); }
+
+  ~Arguments() {
+    for (std::vector<char *>::iterator i = args_.begin(); i != args_.end();
+         ++i) {
+      free(*i);
+    }
+  }
+  void AddArgument(const char *argument) {
+    args_.insert(args_.end() - 1, posix::StrDup(argument));
+  }
+
+  template <typename Str>
+  void AddArguments(const ::std::vector<Str> &arguments) {
+    for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
+         i != arguments.end(); ++i) {
+      args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
+    }
+  }
+  char *const *Argv() { return &args_[0]; }
+
+ private:
+  std::vector<char *> args_;
+};
+
+// A struct that encompasses the arguments to the child process of a
+// threadsafe-style death test process.
+struct ExecDeathTestArgs {
+  char *const *argv;  // Command-line arguments for the child's call to exec
+  int close_fd;       // File descriptor to close; the read end of a pipe
+};
+
+#if GTEST_OS_MAC
+inline char **GetEnviron() {
+  // When Google Test is built as a framework on MacOS X, the environ variable
+  // is unavailable. Apple's documentation (man environ) recommends using
+  // _NSGetEnviron() instead.
+  return *_NSGetEnviron();
+}
+#else
+// Some POSIX platforms expect you to declare environ. extern "C" makes
+// it reside in the global namespace.
+extern "C" char **environ;
+inline char **GetEnviron() { return environ; }
+#endif  // GTEST_OS_MAC
+
+#if !GTEST_OS_QNX
+// The main function for a threadsafe-style death test child process.
+// This function is called in a clone()-ed process and thus must avoid
+// any potentially unsafe operations like malloc or libc functions.
+static int ExecDeathTestChildMain(void *child_arg) {
+  ExecDeathTestArgs *const args = static_cast<ExecDeathTestArgs *>(child_arg);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd));
+
+  // We need to execute the test program in the same environment where
+  // it was originally invoked. Therefore we change to the original
+  // working directory first.
+  const char *const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir +
+                   "\") failed: " + GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  // We can safely call execve() as it's a direct system call. We
+  // cannot use execvp() as it's a libc function and thus potentially
+  // unsafe. Since execve() doesn't search the PATH, the user must
+  // invoke the test program via a valid path that contains at least
+  // one path separator.
+  execve(args->argv[0], args->argv, GetEnviron());
+  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
+                 original_dir + " failed: " + GetLastErrnoDescription());
+  return EXIT_FAILURE;
+}
+#endif  // !GTEST_OS_QNX
+
+#if GTEST_HAS_CLONE
+// Two utility routines that together determine the direction the stack
+// grows.
+// This could be accomplished more elegantly by a single recursive
+// function, but we want to guard against the unlikely possibility of
+// a smart compiler optimizing the recursion away.
+//
+// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining
+// StackLowerThanAddress into StackGrowsDown, which then doesn't give
+// the correct answer.
+static void StackLowerThanAddress(const void *ptr,
+                                  bool *result) GTEST_NO_INLINE_;
+// HWAddressSanitizer adds a random tag to the MSB of the local variable
+// address, making the comparison result unpredictable.
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+static void StackLowerThanAddress(const void *ptr, bool *result) {
+  int dummy;
+  *result = (&dummy < ptr);
+}
+
+// Make sure AddressSanitizer does not tamper with the stack here.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+static bool StackGrowsDown() {
+  int dummy;
+  bool result;
+  StackLowerThanAddress(&dummy, &result);
+  return result;
+}
+#endif  // GTEST_HAS_CLONE
+
+// Spawns a child process with the same executable as the current process in
+// a thread-safe manner and instructs it to run the death test. The
+// implementation uses fork(2) + exec. On systems where clone(2) is
+// available, it is used instead, being slightly more thread-safe. On QNX,
+// fork supports only single-threaded environments, so this function uses
+// spawn(2) there instead. The function dies with an error message if
+// anything goes wrong.
+static pid_t ExecDeathTestSpawnChild(char *const *argv, int close_fd) {
+  ExecDeathTestArgs args = { argv, close_fd };
+  pid_t child_pid = -1;
+
+#if GTEST_OS_QNX
+  // Obtains the current directory and sets it to be closed in the child
+  // process.
+  const int cwd_fd = open(".", O_RDONLY);
+  GTEST_DEATH_TEST_CHECK_(cwd_fd != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(cwd_fd, F_SETFD, FD_CLOEXEC));
+  // We need to execute the test program in the same environment where
+  // it was originally invoked. Therefore we change to the original
+  // working directory first.
+  const char *const original_dir =
+      UnitTest::GetInstance()->original_working_dir();
+  // We can safely call chdir() as it's a direct system call.
+  if (chdir(original_dir) != 0) {
+    DeathTestAbort(std::string("chdir(\"") + original_dir +
+                   "\") failed: " + GetLastErrnoDescription());
+    return EXIT_FAILURE;
+  }
+
+  int fd_flags;
+  // Set close_fd to be closed after spawn.
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC));
+  struct inheritance inherit = { 0 };
+  // spawn is a system call.
+  child_pid =
+      spawn(args.argv[0], 0, nullptr, &inherit, args.argv, GetEnviron());
+  // Restores the current working directory.
+  GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
+
+#else  // GTEST_OS_QNX
+#if GTEST_OS_LINUX
+  // When a SIGPROF signal is received while fork() or clone() are executing,
+  // the process may hang. To avoid this, we ignore SIGPROF here and re-enable
+  // it after the call to fork()/clone() is complete.
+  struct sigaction saved_sigprof_action;
+  struct sigaction ignore_sigprof_action;
+  memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
+  sigemptyset(&ignore_sigprof_action.sa_mask);
+  ignore_sigprof_action.sa_handler = SIG_IGN;
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_HAS_CLONE
+  const bool use_fork = GTEST_FLAG(death_test_use_fork);
+
+  if (!use_fork) {
+    static const bool stack_grows_down = StackGrowsDown();
+    const auto stack_size = static_cast<size_t>(getpagesize() * 2);
+    // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead.
+    void *const stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE,
+                             MAP_ANON | MAP_PRIVATE, -1, 0);
+    GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED);
+
+    // Maximum stack alignment in bytes: For a downward-growing stack, this
+    // amount is subtracted from size of the stack space to get an address
+    // that is within the stack space and is aligned on all systems we care
+    // about. As far as I know there is no ABI with stack alignment greater
+    // than 64. We assume stack and stack_size already have alignment of
+    // kMaxStackAlignment.
+    const size_t kMaxStackAlignment = 64;
+    void *const stack_top =
+        static_cast<char *>(stack) +
+        (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+    GTEST_DEATH_TEST_CHECK_(
+        static_cast<intptr_t>(stack_size) > kMaxStackAlignment &&
+        reinterpret_cast<intptr_t>(stack_top) % kMaxStackAlignment == 0);
+
+    child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args);
+
+    GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
+  }
+#else
+  const bool use_fork = true;
+#endif  // GTEST_HAS_CLONE
+
+  if (use_fork && (child_pid = fork()) == 0) {
+    ExecDeathTestChildMain(&args);
+    _exit(0);
+  }
+#endif  // GTEST_OS_QNX
+#if GTEST_OS_LINUX
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(
+      sigaction(SIGPROF, &saved_sigprof_action, nullptr));
+#endif  // GTEST_OS_LINUX
+
+  GTEST_DEATH_TEST_CHECK_(child_pid != -1);
+  return child_pid;
+}
+
+// The AssumeRole process for a fork-and-exec death test. It re-executes the
+// main program from the beginning, setting the --gtest_filter
+// and --gtest_internal_run_death_test flags to cause only the current
+// death test to be re-run.
+DeathTest::TestRole ExecDeathTest::AssumeRole() {
+  const UnitTestImpl *const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag *const flag =
+      impl->internal_run_death_test_flag();
+  const TestInfo *const info = impl->current_test_info();
+  const int death_test_index = info->result()->death_test_count();
+
+  if (flag != nullptr) {
+    set_write_fd(flag->write_fd());
+    return EXECUTE_TEST;
+  }
+
+  int pipe_fd[2];
+  GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1);
+  // Clear the close-on-exec flag on the write end of the pipe, lest
+  // it be closed when the child process does an exec:
+  GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
+
+  const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                  kFilterFlag + "=" + info->test_suite_name() +
+                                  "." + info->name();
+  const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+                                    kInternalRunDeathTestFlag + "=" + file_ +
+                                    "|" + StreamableToString(line_) + "|" +
+                                    StreamableToString(death_test_index) + "|" +
+                                    StreamableToString(pipe_fd[1]);
+  Arguments args;
+  args.AddArguments(GetArgvsForDeathTestChildProcess());
+  args.AddArgument(filter_flag.c_str());
+  args.AddArgument(internal_flag.c_str());
+
+  DeathTest::set_last_death_test_message("");
+
+  CaptureStderr();
+  // See the comment in NoExecDeathTest::AssumeRole for why the next line
+  // is necessary.
+  FlushInfoLog();
+
+  const pid_t child_pid = ExecDeathTestSpawnChild(args.Argv(), pipe_fd[0]);
+  GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1]));
+  set_child_pid(child_pid);
+  set_read_fd(pipe_fd[0]);
+  set_spawned(true);
+  return OVERSEE_TEST;
+}
+
+#endif  // !GTEST_OS_WINDOWS
+
+// Creates a concrete DeathTest-derived class that depends on the
+// --gtest_death_test_style flag, and sets the pointer pointed to
+// by the "test" argument to its address. If the test should be
+// skipped, sets that pointer to NULL. Returns true, unless the
+// flag is set to an invalid value.
+bool DefaultDeathTestFactory::Create(const char *statement,
+                                     Matcher<const std::string &> matcher,
+                                     const char *file, int line,
+                                     DeathTest **test) {
+  UnitTestImpl *const impl = GetUnitTestImpl();
+  const InternalRunDeathTestFlag *const flag =
+      impl->internal_run_death_test_flag();
+  const int death_test_index =
+      impl->current_test_info()->increment_death_test_count();
+
+  if (flag != nullptr) {
+    if (death_test_index > flag->index()) {
+      DeathTest::set_last_death_test_message(
+          "Death test count (" + StreamableToString(death_test_index) +
+          ") somehow exceeded expected maximum (" +
+          StreamableToString(flag->index()) + ")");
+      return false;
+    }
+
+    if (!(flag->file() == file && flag->line() == line &&
+          flag->index() == death_test_index)) {
+      *test = nullptr;
+      return true;
+    }
+  }
+
+#if GTEST_OS_WINDOWS
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+      GTEST_FLAG(death_test_style) == "fast") {
+    *test = new WindowsDeathTest(statement, std::move(matcher), file, line);
+  }
+
+#elif GTEST_OS_FUCHSIA
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe" ||
+      GTEST_FLAG(death_test_style) == "fast") {
+    *test = new FuchsiaDeathTest(statement, std::move(matcher), file, line);
+  }
+
+#else
+
+  if (GTEST_FLAG(death_test_style) == "threadsafe") {
+    *test = new ExecDeathTest(statement, std::move(matcher), file, line);
+  } else if (GTEST_FLAG(death_test_style) == "fast") {
+    *test = new NoExecDeathTest(statement, std::move(matcher));
+  }
+
+#endif  // GTEST_OS_WINDOWS
+
+  else {  // NOLINT - this is more readable than unbalanced brackets inside #if.
+    DeathTest::set_last_death_test_message("Unknown death test style \"" +
+                                           GTEST_FLAG(death_test_style) +
+                                           "\" encountered");
+    return false;
+  }
+
+  return true;
+}
+
+#if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle. This function is called in the child process only.
+static int GetStatusFileDescriptor(unsigned int parent_process_id,
+                                   size_t write_handle_as_size_t,
+                                   size_t event_handle_as_size_t) {
+  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
+                                                 FALSE,  // Non-inheritable.
+                                                 parent_process_id));
+  if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+    DeathTestAbort("Unable to open parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+  const HANDLE write_handle = reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  HANDLE dup_write_handle;
+
+  // The newly initialized handle is accessible only in the parent
+  // process. To obtain one accessible within the child, we need to use
+  // DuplicateHandle.
+  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+                         ::GetCurrentProcess(), &dup_write_handle,
+                         0x0,    // Requested privileges ignored since
+                                 // DUPLICATE_SAME_ACCESS is used.
+                         FALSE,  // Request non-inheritable handle.
+#if GTEST_OS_WINDOWS
+// Recreates the pipe and event handles from the provided parameters,
+// signals the event, and returns a file descriptor wrapped around the pipe
+// handle.  This function is called in the child process only.
+static int GetStatusFileDescriptor(unsigned int parent_process_id,
+                                   size_t write_handle_as_size_t,
+                                   size_t event_handle_as_size_t) {
+  AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
+                                                 FALSE,  // Non-inheritable.
+                                                 parent_process_id));
+  if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
+    DeathTestAbort("Unable to open parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
+
+  const HANDLE write_handle =
+      reinterpret_cast<HANDLE>(write_handle_as_size_t);
+  HANDLE dup_write_handle;
+
+  // The newly initialized handle is accessible only in the parent
+  // process.  To obtain one accessible within the child, we need to use
+  // DuplicateHandle.
+  if (!::DuplicateHandle(parent_process_handle.Get(), write_handle,
+                         ::GetCurrentProcess(), &dup_write_handle,
+                         0x0,    // Requested privileges ignored since
+                                 // DUPLICATE_SAME_ACCESS is used.
+                         FALSE,  // Request non-inheritable handle.
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const HANDLE event_handle =
+      reinterpret_cast<HANDLE>(event_handle_as_size_t);
+  HANDLE dup_event_handle;
+
+  if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
+                         ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE,
+                         DUPLICATE_SAME_ACCESS)) {
+    DeathTestAbort("Unable to duplicate the event handle " +
+                   StreamableToString(event_handle_as_size_t) +
+                   " from the parent process " +
+                   StreamableToString(parent_process_id));
+  }
+
+  const int write_fd =
+      ::_open_osfhandle(reinterpret_cast<intptr_t>(dup_write_handle), O_APPEND);
+  if (write_fd == -1) {
+    DeathTestAbort("Unable to convert pipe handle " +
+                   StreamableToString(write_handle_as_size_t) +
+                   " to a file descriptor");
+  }
+
+  // Signals the parent that the write end of the pipe has been acquired
+  // so the parent can release its own write end.
+  ::SetEvent(dup_event_handle);
+
+  return write_fd;
+}
+#endif  // GTEST_OS_WINDOWS
+
+// Returns a newly created InternalRunDeathTestFlag object with fields
+// initialized from the GTEST_FLAG(internal_run_death_test) flag if
+// the flag is specified; otherwise returns NULL.
+InternalRunDeathTestFlag *ParseInternalRunDeathTestFlag() {
+  if (GTEST_FLAG(internal_run_death_test) == "") return nullptr;
+
+  // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
+  // can use it here.
+  int line = -1;
+  int index = -1;
+  ::std::vector< ::std::string> fields;
+  SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+  int write_fd = -1;
+
+#if GTEST_OS_WINDOWS
+
+  unsigned int parent_process_id = 0;
+  size_t write_handle_as_size_t = 0;
+  size_t event_handle_as_size_t = 0;
+
+  if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index) ||
+      !ParseNaturalNumber(fields[3], &parent_process_id) ||
+      !ParseNaturalNumber(fields[4], &write_handle_as_size_t) ||
+      !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
+  }
+  write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t,
+                                     event_handle_as_size_t);
+
+#elif GTEST_OS_FUCHSIA
+
+  if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
+  }
+
+#else
+
+  if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) ||
+      !ParseNaturalNumber(fields[2], &index) ||
+      !ParseNaturalNumber(fields[3], &write_fd)) {
+    DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+                   GTEST_FLAG(internal_run_death_test));
+  }
+
+#endif  // GTEST_OS_WINDOWS
+
+  return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
+}
+
+}  // namespace internal
+
+#endif  // GTEST_HAS_DEATH_TEST
+
+}  // namespace testing
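ParseInternalRunDeathTestFlag() above is the receiving end of the internal_flag string built in ExecDeathTest::AssumeRole(). A freestanding sketch of the POSIX decoding, with a hypothetical flag value standing in for GTEST_FLAG(internal_run_death_test):

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  const std::string flag = "my_test.cc|42|3|7";  // hypothetical flag value
  std::vector<std::string> fields;
  std::stringstream ss(flag);
  for (std::string piece; std::getline(ss, piece, '|');)
    fields.push_back(piece);
  // fields: source file, line, death test index, write end of the pipe.
  std::cout << "file=" << fields[0] << " line=" << std::stoi(fields[1])
            << " index=" << std::stoi(fields[2])
            << " fd=" << std::stoi(fields[3]) << "\n";
}
```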
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-filepath.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-filepath.cc
new file mode 100644
index 000000000..f9427e0f1
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-filepath.cc
@@ -0,0 +1,377 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "gtest/internal/gtest-filepath.h"
+
+#include <stdlib.h>
+#include "gtest/internal/gtest-port.h"
+#include "gtest/gtest-message.h"
+
+#if GTEST_OS_WINDOWS_MOBILE
+#include <windows.h>
+#elif GTEST_OS_WINDOWS
+#include <direct.h>
+#include <io.h>
+#else
+#include <limits.h>
+#include <climits>  // Some Linux distributions define PATH_MAX here.
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+#include "gtest/internal/gtest-string.h"
+
+#if GTEST_OS_WINDOWS
+#define GTEST_PATH_MAX_ _MAX_PATH
+#elif defined(PATH_MAX)
+#define GTEST_PATH_MAX_ PATH_MAX
+#elif defined(_XOPEN_PATH_MAX)
+#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#else
+#define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#endif  // GTEST_OS_WINDOWS
+
+namespace testing {
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+// On Windows, '\\' is the standard path separator, but many tools and the
+// Windows API also accept '/' as an alternate path separator.  Unless
+// otherwise noted, a file path can contain either kind of path separators,
+// or a mixture of them.
+const char kPathSeparator = '\\';
+const char kAlternatePathSeparator = '/';
+const char kAlternatePathSeparatorString[] = "/";
+#if GTEST_OS_WINDOWS_MOBILE
+// Windows CE doesn't have a current directory.  You should not use
+// the current directory in tests on Windows CE, but this at least
+// provides a reasonable fallback.
+const char kCurrentDirectoryString[] = "\\";
+// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
+const DWORD kInvalidFileAttributes = 0xffffffff;
+#else
+const char kCurrentDirectoryString[] = ".\\";
+#endif  // GTEST_OS_WINDOWS_MOBILE
+#else
+const char kPathSeparator = '/';
+const char kCurrentDirectoryString[] = "./";
+#endif  // GTEST_OS_WINDOWS
+
+// Returns whether the given character is a valid path separator.
+static bool IsPathSeparator(char c) {
+#if GTEST_HAS_ALT_PATH_SEP_
+  return (c == kPathSeparator) || (c == kAlternatePathSeparator);
+#else
+  return c == kPathSeparator;
+#endif
+}
+
+// Returns the current working directory, or "" if unsuccessful.
+FilePath FilePath::GetCurrentDir() {
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32
+  // These platforms do not have a current directory, so we just return
+  // something reasonable.
+  return FilePath(kCurrentDirectoryString);
+#elif GTEST_OS_WINDOWS
+  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+  return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd);
+#else
+  char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+  char *result = getcwd(cwd, sizeof(cwd));
+#if GTEST_OS_NACL
+  // getcwd will likely fail in NaCl due to the sandbox, so return something
+  // reasonable.  The user may have provided a shim implementation for getcwd,
+  // however, so fallback only when failure is detected.
+  return FilePath(result == nullptr ? kCurrentDirectoryString : cwd);
+#endif  // GTEST_OS_NACL
+  return FilePath(result == nullptr ? "" : cwd);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns a copy of the FilePath with the case-insensitive extension removed.
+// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns
+// FilePath("dir/file").  If a case-insensitive extension is not
+// found, returns a copy of the original FilePath.
+FilePath FilePath::RemoveExtension(const char *extension) const {
+  const std::string dot_extension = std::string(".") + extension;
+  if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
+    return FilePath(
+        pathname_.substr(0, pathname_.length() - dot_extension.length()));
+  }
+  return *this;
+}
+
+// Returns a pointer to the last occurrence of a valid path separator in
+// the FilePath.  On Windows, for example, both '/' and '\' are valid path
+// separators.  Returns NULL if no path separator was found.
+const char *FilePath::FindLastPathSeparator() const {
+  const char *const last_sep = strrchr(c_str(), kPathSeparator);
+#if GTEST_HAS_ALT_PATH_SEP_
+  const char *const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator);
+  // Comparing two pointers of which only one is NULL is undefined.
+  if (last_alt_sep != nullptr &&
+      (last_sep == nullptr || last_alt_sep > last_sep)) {
+    return last_alt_sep;
+  }
+#endif
+  return last_sep;
+}
+
+// Returns a copy of the FilePath with the directory part removed.
+// Example: FilePath("path/to/file").RemoveDirectoryName() returns
+// FilePath("file").  If there is no directory part ("just_a_file"), it returns
+// the FilePath unmodified.  If there is no file part ("just_a_dir/") it
+// returns an empty FilePath ("").
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveDirectoryName() const {
+  const char *const last_sep = FindLastPathSeparator();
+  return last_sep ? FilePath(last_sep + 1) : *this;
+}
+
+// RemoveFileName returns the directory path with the filename removed.
+// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/".
+// If the FilePath is "a_file" or "/a_file", RemoveFileName returns
+// FilePath("./") or, on Windows, FilePath(".\\").  If the filepath does
+// not have a file, like "just/a/dir/", it returns the FilePath unmodified.
+// On Windows platform, '\' is the path separator, otherwise it is '/'.
+FilePath FilePath::RemoveFileName() const {
+  const char *const last_sep = FindLastPathSeparator();
+  std::string dir;
+  if (last_sep) {
+    dir = std::string(c_str(), static_cast<size_t>(last_sep + 1 - c_str()));
+  } else {
+    dir = kCurrentDirectoryString;
+  }
+  return FilePath(dir);
+}
+
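The examples promised in the comments above, as a small driver. This assumes a build that links gtest and is willing to include the internal header, which ordinary user code should not do; it is shown purely to make the splitting semantics concrete.

```cpp
#include <iostream>
#include "gtest/internal/gtest-filepath.h"

using testing::internal::FilePath;

int main() {
  std::cout << FilePath("path/to/file").RemoveDirectoryName().string()
            << "\n";  // "file"
  std::cout << FilePath("path/to/file").RemoveFileName().string()
            << "\n";  // "path/to/"
  std::cout << FilePath("dir/file.exe").RemoveExtension("EXE").string()
            << "\n";  // "dir/file" (the extension match is case-insensitive)
}
```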
+// Helper functions for naming files in a directory for xml output.
+
+// Given directory = "dir", base_name = "test", number = 0,
+// extension = "xml", returns "dir/test.xml".  If number is greater
+// than zero (e.g., 12), returns "dir/test_12.xml".
+// On Windows platform, uses \ as the separator rather than /.
+FilePath FilePath::MakeFileName(const FilePath &directory,
+                                const FilePath &base_name, int number,
+                                const char *extension) {
+  std::string file;
+  if (number == 0) {
+    file = base_name.string() + "." + extension;
+  } else {
+    file =
+        base_name.string() + "_" + StreamableToString(number) + "." + extension;
+  }
+  return ConcatPaths(directory, FilePath(file));
+}
+
+// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml".
+// On Windows, uses \ as the separator rather than /.
+FilePath FilePath::ConcatPaths(const FilePath &directory,
+                               const FilePath &relative_path) {
+  if (directory.IsEmpty()) return relative_path;
+  const FilePath dir(directory.RemoveTrailingPathSeparator());
+  return FilePath(dir.string() + kPathSeparator + relative_path.string());
+}
+
+// Returns true if pathname describes something findable in the file-system,
+// either a file, directory, or whatever.
+bool FilePath::FileOrDirectoryExists() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete[] unicode;
+  return attributes != kInvalidFileAttributes;
+#else
+  posix::StatStruct file_stat;
+  return posix::Stat(pathname_.c_str(), &file_stat) == 0;
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Returns true if pathname describes a directory in the file-system
+// that exists.
+bool FilePath::DirectoryExists() const {
+  bool result = false;
+#if GTEST_OS_WINDOWS
+  // Don't strip off trailing separator if path is a root directory on
+  // Windows (like "C:\\").
+  const FilePath &path(IsRootDirectory() ? *this
+                                         : RemoveTrailingPathSeparator());
+#else
+  const FilePath &path(*this);
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
+  const DWORD attributes = GetFileAttributes(unicode);
+  delete[] unicode;
+  if ((attributes != kInvalidFileAttributes) &&
+      (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+    result = true;
+  }
+#else
+  posix::StatStruct file_stat;
+  result =
+      posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  return result;
+}
+
+// Returns true if pathname describes a root directory.  (Windows has one
+// root directory per disk drive.)
+bool FilePath::IsRootDirectory() const {
+#if GTEST_OS_WINDOWS
+  return pathname_.length() == 3 && IsAbsolutePath();
+#else
+  return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]);
+#endif
+}
+
+// Returns true if pathname describes an absolute path.
+bool FilePath::IsAbsolutePath() const {
+  const char *const name = pathname_.c_str();
+#if GTEST_OS_WINDOWS
+  return pathname_.length() >= 3 &&
+         ((name[0] >= 'a' && name[0] <= 'z') ||
+          (name[0] >= 'A' && name[0] <= 'Z')) &&
+         name[1] == ':' && IsPathSeparator(name[2]);
+#else
+  return IsPathSeparator(name[0]);
+#endif
+}
+
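The numbering rule in MakeFileName() is easiest to see by example; as before, the internal header is included purely for illustration.

```cpp
#include <iostream>
#include "gtest/internal/gtest-filepath.h"

using testing::internal::FilePath;

int main() {
  // number == 0 omits the suffix; any larger number inserts "_<number>".
  std::cout << FilePath::MakeFileName(FilePath("dir"), FilePath("test"), 0,
                                      "xml").string()
            << "\n";  // "dir/test.xml"
  std::cout << FilePath::MakeFileName(FilePath("dir"), FilePath("test"), 12,
                                      "xml").string()
            << "\n";  // "dir/test_12.xml"
}
```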
+// Returns a pathname for a file that does not currently exist.  The pathname
+// will be directory/base_name.extension or
+// directory/base_name_<number>.extension if directory/base_name.extension
+// already exists.  The number will be incremented until a pathname is found
+// that does not already exist.
+// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'.
+// There could be a race condition if two or more processes are calling this
+// function at the same time -- they could both pick the same filename.
+FilePath FilePath::GenerateUniqueFileName(const FilePath &directory,
+                                          const FilePath &base_name,
+                                          const char *extension) {
+  FilePath full_pathname;
+  int number = 0;
+  do {
+    full_pathname.Set(MakeFileName(directory, base_name, number++, extension));
+  } while (full_pathname.FileOrDirectoryExists());
+  return full_pathname;
+}
+
+// Returns true if FilePath ends with a path separator, which indicates that
+// it is intended to represent a directory.  Returns false otherwise.
+// This does NOT check that a directory (or file) actually exists.
+bool FilePath::IsDirectory() const {
+  return !pathname_.empty() &&
+         IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]);
+}
+
+// Create directories so that path exists.  Returns true if successful or if
+// the directories already exist; returns false if unable to create directories
+// for any reason.
+bool FilePath::CreateDirectoriesRecursively() const {
+  if (!this->IsDirectory()) {
+    return false;
+  }
+
+  if (pathname_.length() == 0 || this->DirectoryExists()) {
+    return true;
+  }
+
+  const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName());
+  return parent.CreateDirectoriesRecursively() && this->CreateFolder();
+}
+
+// Create the directory so that path exists.  Returns true if successful or
+// if the directory already exists; returns false if unable to create the
+// directory for any reason, including if the parent directory does not
+// exist.  Not named "CreateDirectory" because that's a macro on Windows.
+bool FilePath::CreateFolder() const {
+#if GTEST_OS_WINDOWS_MOBILE
+  FilePath removed_sep(this->RemoveTrailingPathSeparator());
+  LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
+  int result = CreateDirectory(unicode, nullptr) ? 0 : -1;
+  delete[] unicode;
+#elif GTEST_OS_WINDOWS
+  int result = _mkdir(pathname_.c_str());
+#elif GTEST_OS_ESP8266
+  // do nothing
+  int result = 0;
+#else
+  int result = mkdir(pathname_.c_str(), 0777);
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  if (result == -1) {
+    return this->DirectoryExists();  // An error is OK if the directory exists.
+  }
+  return true;  // No error.
+}
+
+// If input name has a trailing separator character, remove it and return the
+// name, otherwise return the name string unmodified.
+// On Windows platform, uses \ as the separator, other platforms use /.
+FilePath FilePath::RemoveTrailingPathSeparator() const {
+  return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1))
+                       : *this;
+}
+
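CreateDirectoriesRecursively() above works parent-first: recurse on the parent path, then create the leaf. A freestanding sketch of the same idea, with POSIX mkdir(2) standing in for the platform-specific CreateFolder() (and a cruder "EEXIST means fine" check than the DirectoryExists() test the real code uses):

```cpp
#include <cerrno>
#include <string>
#include <sys/stat.h>
#include <sys/types.h>

bool CreateDirsRecursively(const std::string &path) {
  if (path.empty()) return true;
  if (mkdir(path.c_str(), 0777) == 0 || errno == EEXIST) return true;
  const std::string::size_type slash = path.find_last_of('/');
  if (slash == std::string::npos) return false;
  // Create the parent first, then retry this directory.
  return CreateDirsRecursively(path.substr(0, slash)) &&
         (mkdir(path.c_str(), 0777) == 0 || errno == EEXIST);
}
```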
+// Removes any redundant separators that might be in the pathname.
+// For example, "bar///foo" becomes "bar/foo".  Does not eliminate other
+// redundancies that might be in a pathname involving "." or "..".
+void FilePath::Normalize() {
+  if (pathname_.c_str() == nullptr) {
+    pathname_ = "";
+    return;
+  }
+  const char *src = pathname_.c_str();
+  char *const dest = new char[pathname_.length() + 1];
+  char *dest_ptr = dest;
+  memset(dest_ptr, 0, pathname_.length() + 1);
+
+  while (*src != '\0') {
+    *dest_ptr = *src;
+    if (!IsPathSeparator(*src)) {
+      src++;
+    } else {
+#if GTEST_HAS_ALT_PATH_SEP_
+      if (*dest_ptr == kAlternatePathSeparator) {
+        *dest_ptr = kPathSeparator;
+      }
+#endif
+      while (IsPathSeparator(*src)) src++;
+    }
+    dest_ptr++;
+  }
+  *dest_ptr = '\0';
+  pathname_ = dest;
+  delete[] dest;
+}
+
+}  // namespace internal
+}  // namespace testing
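Since the FilePath constructor runs Normalize() on the string it is given, the separator collapsing is observable at construction time. A tiny check, again using the internal header only for illustration:

```cpp
#include <iostream>
#include "gtest/internal/gtest-filepath.h"

int main() {
  using testing::internal::FilePath;
  std::cout << FilePath("bar///foo").string() << "\n";  // "bar/foo"
  std::cout << FilePath("foo//").string() << "\n";      // "foo/"
}
```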
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-internal-inl.h b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-internal-inl.h
new file mode 100644
index 000000000..16d8cde66
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-internal-inl.h
@@ -0,0 +1,1213 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Utility functions and classes used by the Google C++ testing framework.
+//
+// This file contains purely Google Test's internal implementation.  Please
+// DO NOT #INCLUDE IT IN A USER PROGRAM.
+
+#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
+#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+
+#ifndef _WIN32_WCE
+#include <errno.h>
+#endif  // !_WIN32_WCE
+#include <stddef.h>
+#include <stdlib.h>  // For strtoll/_strtoul64/malloc/free.
+#include <string.h>  // For memmove.
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/internal/gtest-port.h"
+
+#if GTEST_CAN_STREAM_RESULTS_
+#include <arpa/inet.h>  // NOLINT
+#include <netdb.h>      // NOLINT
+#endif
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>  // NOLINT
+#endif  // GTEST_OS_WINDOWS
+
+#include "gtest/gtest.h"
+#include "gtest/gtest-spi.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// Declares the flags.
+//
+// We don't want the users to modify this flag in the code, but want
+// Google Test's own unit tests to be able to access it.  Therefore we
+// declare it here as opposed to in gtest.h.
+GTEST_DECLARE_bool_(death_test_use_fork);
+
+namespace internal {
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
+
+// Names of the flags (needed for parsing Google Test flags).
+const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
+const char kBreakOnFailureFlag[] = "break_on_failure";
+const char kCatchExceptionsFlag[] = "catch_exceptions";
+const char kColorFlag[] = "color";
+const char kFilterFlag[] = "filter";
+const char kListTestsFlag[] = "list_tests";
+const char kOutputFlag[] = "output";
+const char kPrintTimeFlag[] = "print_time";
+const char kPrintUTF8Flag[] = "print_utf8";
+const char kRandomSeedFlag[] = "random_seed";
+const char kRepeatFlag[] = "repeat";
+const char kShuffleFlag[] = "shuffle";
+const char kStackTraceDepthFlag[] = "stack_trace_depth";
+const char kStreamResultToFlag[] = "stream_result_to";
+const char kThrowOnFailureFlag[] = "throw_on_failure";
+const char kFlagfileFlag[] = "flagfile";
+
+// A valid random seed must be in [1, kMaxRandomSeed].
+const int kMaxRandomSeed = 99999;
+
+// g_help_flag is true if and only if the --help flag or an equivalent form
+// is specified on the command line.
+GTEST_API_ extern bool g_help_flag;
+
+// Returns the current time in milliseconds.
+GTEST_API_ TimeInMillis GetTimeInMillis();
+
+// Returns true if and only if Google Test should use colors in the output.
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty);
+
+// Formats the given time in milliseconds as seconds.
+GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms);
+
+// Converts the given time in milliseconds to a date string in the ISO 8601
+// format, without the timezone information.  N.B.: due to the use of the
+// non-reentrant localtime() function, this function is not thread safe.  Do
+// not use it in any code that can be called from multiple threads.
+GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
+
+// Parses a string for an Int32 flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+GTEST_API_ bool ParseInt32Flag(const char *str, const char *flag,
+                               int32_t *value);
+
+// Returns a random seed in range [1, kMaxRandomSeed] based on the
+// given --gtest_random_seed flag value.
+inline int GetRandomSeedFromFlag(int32_t random_seed_flag) {
+  const unsigned int raw_seed =
+      (random_seed_flag == 0) ? static_cast<unsigned int>(GetTimeInMillis())
+                              : static_cast<unsigned int>(random_seed_flag);
+
+  // Normalizes the actual seed to range [1, kMaxRandomSeed] such that
+  // it's easy to type.
+  const int normalized_seed =
+      static_cast<int>((raw_seed - 1U) %
+                       static_cast<unsigned int>(kMaxRandomSeed)) +
+      1;
+  return normalized_seed;
+}
+
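The normalization arithmetic above is a wrap-around into [1, kMaxRandomSeed]. The same expression applied to a few sample raw seeds:

```cpp
#include <iostream>

int main() {
  const int kMaxRandomSeed = 99999;
  // Mirror of GetRandomSeedFromFlag()'s arithmetic; note how a raw seed of
  // 100000 wraps around to 1.
  for (unsigned int raw_seed : {1u, 99999u, 100000u}) {
    const int normalized =
        static_cast<int>((raw_seed - 1U) %
                         static_cast<unsigned int>(kMaxRandomSeed)) +
        1;
    std::cout << raw_seed << " -> " << normalized << "\n";  // 1, 99999, 1
  }
}
```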
+// Returns the first valid random seed after 'seed'.  The behavior is
+// undefined if 'seed' is invalid.  The seed after kMaxRandomSeed is
+// considered to be 1.
+inline int GetNextRandomSeed(int seed) {
+  GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed)
+      << "Invalid random seed " << seed << " - must be in [1, "
+      << kMaxRandomSeed << "].";
+  const int next_seed = seed + 1;
+  return (next_seed > kMaxRandomSeed) ? 1 : next_seed;
+}
+
+// This class saves the values of all Google Test flags in its c'tor, and
+// restores them in its d'tor.
+class GTestFlagSaver {
+ public:
+  // The c'tor.
+  GTestFlagSaver() {
+    also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
+    break_on_failure_ = GTEST_FLAG(break_on_failure);
+    catch_exceptions_ = GTEST_FLAG(catch_exceptions);
+    color_ = GTEST_FLAG(color);
+    death_test_style_ = GTEST_FLAG(death_test_style);
+    death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
+    filter_ = GTEST_FLAG(filter);
+    internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
+    list_tests_ = GTEST_FLAG(list_tests);
+    output_ = GTEST_FLAG(output);
+    print_time_ = GTEST_FLAG(print_time);
+    print_utf8_ = GTEST_FLAG(print_utf8);
+    random_seed_ = GTEST_FLAG(random_seed);
+    repeat_ = GTEST_FLAG(repeat);
+    shuffle_ = GTEST_FLAG(shuffle);
+    stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
+    stream_result_to_ = GTEST_FLAG(stream_result_to);
+    throw_on_failure_ = GTEST_FLAG(throw_on_failure);
+  }
+
+  // The d'tor is not virtual.  DO NOT INHERIT FROM THIS CLASS.
+  ~GTestFlagSaver() {
+    GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
+    GTEST_FLAG(break_on_failure) = break_on_failure_;
+    GTEST_FLAG(catch_exceptions) = catch_exceptions_;
+    GTEST_FLAG(color) = color_;
+    GTEST_FLAG(death_test_style) = death_test_style_;
+    GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
+    GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
+    GTEST_FLAG(list_tests) = list_tests_;
+    GTEST_FLAG(output) = output_;
+    GTEST_FLAG(print_time) = print_time_;
+    GTEST_FLAG(print_utf8) = print_utf8_;
+    GTEST_FLAG(random_seed) = random_seed_;
+    GTEST_FLAG(repeat) = repeat_;
+    GTEST_FLAG(shuffle) = shuffle_;
+    GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
+    GTEST_FLAG(stream_result_to) = stream_result_to_;
+    GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+  }
+
+ private:
+  // Fields for saving the original values of flags.
+  bool also_run_disabled_tests_;
+  bool break_on_failure_;
+  bool catch_exceptions_;
+  std::string color_;
+  std::string death_test_style_;
+  bool death_test_use_fork_;
+  std::string filter_;
+  std::string internal_run_death_test_;
+  bool list_tests_;
+  std::string output_;
+  bool print_time_;
+  bool print_utf8_;
+  int32_t random_seed_;
+  int32_t repeat_;
+  bool shuffle_;
+  int32_t stack_trace_depth_;
+  std::string stream_result_to_;
+  bool throw_on_failure_;
+} GTEST_ATTRIBUTE_UNUSED_;
+
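GTestFlagSaver is scoped save/restore in the RAII style: snapshot in the constructor, write back in the destructor. Because the class lives in this internal header, user code cannot name it directly; the self-contained mirror below (with a plain struct standing in for the gtest flags) shows the same mechanism.

```cpp
// A self-contained mirror of GTestFlagSaver's save/restore idea.
struct Flags { int repeat = 1; bool shuffle = false; };
Flags g_flags;

class FlagSaver {
 public:
  FlagSaver() : saved_(g_flags) {}       // snapshot in the c'tor
  ~FlagSaver() { g_flags = saved_; }     // restore in the d'tor
 private:
  Flags saved_;
};

int main() {
  {
    FlagSaver saver;
    g_flags.repeat = 5;                  // scribble on the flags freely...
    g_flags.shuffle = true;
  }                                      // ...snapshot written back here
  return g_flags.repeat == 1 ? 0 : 1;
}
```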
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+GTEST_API_ std::string CodePointToUtf8(uint32_t code_point);
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed.  -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'.  If the string is in UTF-16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from the Basic
+// Multilingual Plane.
+GTEST_API_ std::string WideStringToUtf8(const wchar_t *str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present.  If a file already exists at this location, this
+// function will write over it.  If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values.  If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits.  If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process.  Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char *total_shards_str,
+                            const char *shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable var as a 32-bit integer.  If it is unset,
+// returns default_val.  If it is not a 32-bit integer, prints an error
+// and aborts.
+GTEST_API_ int32_t Int32FromEnvOrDie(const char *env_var, int32_t default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true if and only if the test should be run on this shard.  The test
+// id is some arbitrary but unique non-negative integer assigned to each test
+// method.  Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index,
+                                     int test_id);
+
+// STL container utilities.
+
+// Returns the number of elements in the given container that satisfy
+// the given predicate.
+template <class Container, typename Predicate>
+inline int CountIf(const Container &c, Predicate predicate) {
+  // Implemented as an explicit loop since std::count_if() in libCstd on
+  // Solaris has a non-standard signature.
+  int count = 0;
+  for (typename Container::const_iterator it = c.begin(); it != c.end();
+       ++it) {
+    if (predicate(*it)) ++count;
+  }
+  return count;
+}
+
+// Applies a function/functor to each element in the container.
+template <class Container, typename Functor>
+void ForEach(const Container &c, Functor functor) {
+  std::for_each(c.begin(), c.end(), functor);
+}
+
+// Returns the i-th element of the vector, or default_value if i is not
+// in range [0, v.size()).
+template <typename E>
+inline E GetElementOr(const std::vector<E> &v, int i, E default_value) {
+  return (i < 0 || i >= static_cast<int>(v.size()))
+             ? default_value
+             : v[static_cast<size_t>(i)];
+}
+
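ShouldRunTestOnShard() declared above reduces to a modulus check: each test id lands on exactly one shard. A runnable sketch of that rule:

```cpp
#include <iostream>

// Mirrors the documented contract: test ids are assigned round-robin.
bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
  return (test_id % total_shards) == shard_index;
}

int main() {
  // With GTEST_TOTAL_SHARDS=3 and GTEST_SHARD_INDEX=1, ids 1 and 4 run here.
  for (int id = 0; id < 6; ++id)
    std::cout << "test " << id << ": " << ShouldRunTestOnShard(3, 1, id)
              << "\n";
}
```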
+// Performs an in-place shuffle of a range of the vector's elements.
+// 'begin' and 'end' are element indices as an STL-style range;
+// i.e. [begin, end) are shuffled, where 'end' == size() means to
+// shuffle to the end of the vector.
+template <typename E>
+void ShuffleRange(internal::Random *random, int begin, int end,
+                  std::vector<E> *v) {
+  const int size = static_cast<int>(v->size());
+  GTEST_CHECK_(0 <= begin && begin <= size)
+      << "Invalid shuffle range start " << begin << ": must be in range [0, "
+      << size << "].";
+  GTEST_CHECK_(begin <= end && end <= size)
+      << "Invalid shuffle range finish " << end << ": must be in range ["
+      << begin << ", " << size << "].";
+
+  // Fisher-Yates shuffle, from
+  // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
+  for (int range_width = end - begin; range_width >= 2; range_width--) {
+    const int last_in_range = begin + range_width - 1;
+    const int selected =
+        begin +
+        static_cast<int>(random->Generate(static_cast<uint32_t>(range_width)));
+    std::swap((*v)[static_cast<size_t>(selected)],
+              (*v)[static_cast<size_t>(last_in_range)]);
+  }
+}
+
+// Performs an in-place shuffle of the vector's elements.
+template <typename E>
+inline void Shuffle(internal::Random *random, std::vector<E> *v) {
+  ShuffleRange(random, 0, static_cast<int>(v->size()), v);
+}
+
+// A function for deleting an object.  Handy for being used as a
+// functor.
+template <typename T>
+static void Delete(T *x) {
+  delete x;
+}
+
+// A predicate that checks the key of a TestProperty against a known key.
+//
+// TestPropertyKeyIs is copyable.
+class TestPropertyKeyIs {
+ public:
+  // Constructor.
+  //
+  // TestPropertyKeyIs has NO default constructor.
+  explicit TestPropertyKeyIs(const std::string &key) : key_(key) {}
+
+  // Returns true if and only if the key of the test property matches key_.
+  bool operator()(const TestProperty &test_property) const {
+    return test_property.key() == key_;
+  }
+
+ private:
+  std::string key_;
+};
+
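The same back-to-front Fisher-Yates pass as ShuffleRange() above, freestanding, with `<random>` standing in for gtest's internal::Random:

```cpp
#include <cstddef>
#include <iostream>
#include <random>
#include <vector>

int main() {
  std::vector<int> v = {0, 1, 2, 3, 4};
  std::mt19937 rng(12345);
  // Pick a random element in [0, width) and swap it to the back of the
  // shrinking range, exactly as the loop in ShuffleRange() does.
  for (int width = static_cast<int>(v.size()); width >= 2; --width) {
    const int last = width - 1;
    std::uniform_int_distribution<int> pick(0, last);
    std::swap(v[static_cast<size_t>(pick(rng))],
              v[static_cast<size_t>(last)]);
  }
  for (int x : v) std::cout << x << ' ';
  std::cout << "\n";
}
```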
+// Class UnitTestOptions.
+//
+// This class contains functions for processing options the user
+// specifies when running the tests.  It has only static members.
+//
+// In most cases, the user can specify an option using either an
+// environment variable or a command line flag.  E.g. you can set the
+// test filter using either GTEST_FILTER or --gtest_filter.  If both
+// the variable and the flag are present, the latter overrides the
+// former.
+class GTEST_API_ UnitTestOptions {
+ public:
+  // Functions for processing the gtest_output flag.
+
+  // Returns the output format, or "" for normal printed output.
+  static std::string GetOutputFormat();
+
+  // Returns the absolute path of the requested output file, or the
+  // default (test_detail.xml in the original working directory) if
+  // none was explicitly specified.
+  static std::string GetAbsolutePathToOutputFile();
+
+  // Functions for processing the gtest_filter flag.
+
+  // Returns true if and only if the wildcard pattern matches the string.
+  // The first ':' or '\0' character in pattern marks the end of it.
+  //
+  // This recursive algorithm isn't very efficient, but is clear and
+  // works well enough for matching test names, which are short.
+  static bool PatternMatchesString(const char *pattern, const char *str);
+
+  // Returns true if and only if the user-specified filter matches the test
+  // suite name and the test name.
+  static bool FilterMatchesTest(const std::string &test_suite_name,
+                                const std::string &test_name);
+
+#if GTEST_OS_WINDOWS
+  // Function for supporting the gtest_catch_exception flag.
+
+  // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+  // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+  // This function is useful as an __except condition.
+  static int GTestShouldProcessSEH(DWORD exception_code);
+#endif  // GTEST_OS_WINDOWS
+
+  // Returns true if "name" matches the ':' separated list of glob-style
+  // filters in "filter".
+  static bool MatchesFilter(const std::string &name, const char *filter);
+};
+
+// Returns the current application's name, removing directory path if that
+// is present.  Used by UnitTestOptions::GetOutputFile.
+GTEST_API_ FilePath GetCurrentExecutableName();
+
+// The role interface for getting the OS stack trace as a string.
+class OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetterInterface() {}
+  virtual ~OsStackTraceGetterInterface() {}
+
+  // Returns the current OS stack trace as an std::string.  Parameters:
+  //
+  //   max_depth  - the maximum number of stack frames to be included
+  //                in the trace.
+  //   skip_count - the number of top frames to be skipped; doesn't count
+  //                against max_depth.
+  virtual std::string CurrentStackTrace(int max_depth, int skip_count) = 0;
+
+  // UponLeavingGTest() should be called immediately before Google Test calls
+  // user code.  It saves some information about the current stack that
+  // CurrentStackTrace() will use to find and hide Google Test stack frames.
+  virtual void UponLeavingGTest() = 0;
+
+  // This string is inserted in place of stack frames that are part of
+  // Google Test's implementation.
+  static const char *const kElidedFramesMarker;
+
+ private:
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+};
+
+// A working implementation of the OsStackTraceGetterInterface interface.
+class OsStackTraceGetter : public OsStackTraceGetterInterface {
+ public:
+  OsStackTraceGetter() {}
+
+  std::string CurrentStackTrace(int max_depth, int skip_count) override;
+  void UponLeavingGTest() override;
+
+ private:
+#if GTEST_HAS_ABSL
+  Mutex mutex_;  // Protects all internal state.
+
+  // We save the stack frame below the frame that calls user code.
+  // We do this because the address of the frame immediately below
+  // the user code changes between the call to UponLeavingGTest()
+  // and any calls to the stack trace code from within the user code.
+  void *caller_frame_ = nullptr;
+#endif  // GTEST_HAS_ABSL
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+};
+
+// Information about a Google Test trace point.
+struct TraceInfo {
+  const char *file;
+  int line;
+  std::string message;
+};
+
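UnitTestOptions::PatternMatchesString declared above is the recursive wildcard matcher behind --gtest_filter: '?' matches one character, '*' any string, and ':' terminates a pattern. A freestanding version of the same algorithm:

```cpp
#include <iostream>

bool PatternMatches(const char *pattern, const char *str) {
  switch (*pattern) {
    case '\0':
    case ':':
      return *str == '\0';
    case '?':  // any single character
      return *str != '\0' && PatternMatches(pattern + 1, str + 1);
    case '*':  // any string, including the empty one
      return (*str != '\0' && PatternMatches(pattern, str + 1)) ||
             PatternMatches(pattern + 1, str);
    default:
      return *pattern == *str && PatternMatches(pattern + 1, str + 1);
  }
}

int main() {
  std::cout << PatternMatches("Foo*.?ar", "FooTest.Bar") << "\n";     // 1
  std::cout << PatternMatches("Foo*.?ar", "FooTest.Bazaar") << "\n";  // 0
}
```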
+// This is the default global test part result reporter used in UnitTestImpl.
+// This class should only be used by UnitTestImpl.
+class DefaultGlobalTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultGlobalTestPartResultReporter(UnitTestImpl *unit_test);
+  // Implements the TestPartResultReporterInterface.  Reports the test part
+  // result in the current test.
+  void ReportTestPartResult(const TestPartResult &result) override;
+
+ private:
+  UnitTestImpl *const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+};
+
+// This is the default per thread test part result reporter used in
+// UnitTestImpl.  This class should only be used by UnitTestImpl.
+class DefaultPerThreadTestPartResultReporter
+    : public TestPartResultReporterInterface {
+ public:
+  explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl *unit_test);
+  // Implements the TestPartResultReporterInterface.  The implementation just
+  // delegates to the current global test part result reporter of *unit_test_.
+  void ReportTestPartResult(const TestPartResult &result) override;
+
+ private:
+  UnitTestImpl *const unit_test_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+};
+
+// The private implementation of the UnitTest class.  We don't protect
+// the methods under a mutex, as this class is not accessible by a
+// user and the UnitTest class that delegates work to this class does
+// proper locking.
+class GTEST_API_ UnitTestImpl {
+ public:
+  explicit UnitTestImpl(UnitTest *parent);
+  virtual ~UnitTestImpl();
+
+  // There are two different ways to register your own TestPartResultReporter.
+  // You can register your own reporter to listen either only for test results
+  // from the current thread or for results from all threads.
+  // By default, each per-thread test result reporter just passes a new
+  // TestPartResult to the global test result reporter, which registers the
+  // test part result for the currently running test.
+
+  // Returns the global test part result reporter.
+  TestPartResultReporterInterface *GetGlobalTestPartResultReporter();
+
+  // Sets the global test part result reporter.
+  void SetGlobalTestPartResultReporter(
+      TestPartResultReporterInterface *reporter);
+
+  // Returns the test part result reporter for the current thread.
+  TestPartResultReporterInterface *GetTestPartResultReporterForCurrentThread();
+
+  // Sets the test part result reporter for the current thread.
+  void SetTestPartResultReporterForCurrentThread(
+      TestPartResultReporterInterface *reporter);
+
+  // Gets the number of successful test suites.
+  int successful_test_suite_count() const;
+
+  // Gets the number of failed test suites.
+  int failed_test_suite_count() const;
+
+  // Gets the number of all test suites.
+  int total_test_suite_count() const;
+
+  // Gets the number of all test suites that contain at least one test
+  // that should run.
+  int test_suite_to_run_count() const;
+
+  // Gets the number of successful tests.
+  int successful_test_count() const;
+
+  // Gets the number of skipped tests.
+  int skipped_test_count() const;
+
+  // Gets the number of failed tests.
+  int failed_test_count() const;
+
+  // Gets the number of disabled tests that will be reported in the XML report.
+  int reportable_disabled_test_count() const;
+
+  // Gets the number of disabled tests.
+  int disabled_test_count() const;
+
+  // Gets the number of tests to be printed in the XML report.
+  int reportable_test_count() const;
+
+  // Gets the number of all tests.
+  int total_test_count() const;
+
+  // Gets the number of tests that should run.
+  int test_to_run_count() const;
+
+  // Gets the time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp() const { return start_timestamp_; }
+
+  // Gets the elapsed time, in milliseconds.
+  TimeInMillis elapsed_time() const { return elapsed_time_; }
+
+  // Returns true if and only if the unit test passed (i.e. all test suites
+  // passed).
+  bool Passed() const { return !Failed(); }
+
+  // Returns true if and only if the unit test failed (i.e. some test suite
+  // failed or something outside of all tests failed).
+  bool Failed() const {
+    return failed_test_suite_count() > 0 || ad_hoc_test_result()->Failed();
+  }
+
+  // Gets the i-th test suite among all the test suites.  i can range from 0 to
+  // total_test_suite_count() - 1.  If i is not in that range, returns NULL.
+  const TestSuite *GetTestSuite(int i) const {
+    const int index = GetElementOr(test_suite_indices_, i, -1);
+    return index < 0 ? nullptr : test_suites_[static_cast<size_t>(i)];
+  }
+
+  //  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  const TestCase *GetTestCase(int i) const { return GetTestSuite(i); }
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Gets the i-th test suite among all the test suites.  i can range from 0 to
+  // total_test_suite_count() - 1.  If i is not in that range, returns NULL.
+  TestSuite *GetMutableSuiteCase(int i) {
+    const int index = GetElementOr(test_suite_indices_, i, -1);
+    return index < 0 ? nullptr : test_suites_[static_cast<size_t>(index)];
+  }
+
+  // Provides access to the event listener list.
+  TestEventListeners *listeners() { return &listeners_; }
+
+  // Returns the TestResult for the test that's currently running, or
+  // the TestResult for the ad hoc test if no test is running.
+  TestResult *current_test_result();
+
+  // Returns the TestResult for the ad hoc test.
+  const TestResult *ad_hoc_test_result() const { return &ad_hoc_test_result_; }
+
+  // Sets the OS stack trace getter.
+  //
+  // Does nothing if the input and the current OS stack trace getter
+  // are the same; otherwise, deletes the old getter and makes the
+  // input the current getter.
+  void set_os_stack_trace_getter(OsStackTraceGetterInterface *getter);
+
+  // Returns the current OS stack trace getter if it is not NULL;
+  // otherwise, creates an OsStackTraceGetter, makes it the current
+  // getter, and returns it.
+  OsStackTraceGetterInterface *os_stack_trace_getter();
+
+  // Returns the current OS stack trace as an std::string.
+  //
+  // The maximum number of stack frames to be included is specified by
+  // the gtest_stack_trace_depth flag.  The skip_count parameter
+  // specifies the number of top frames to be skipped, which doesn't
+  // count against the number of frames to be included.
+  //
+  // For example, if Foo() calls Bar(), which in turn calls
+  // CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+  // trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+  std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+
+  // Finds and returns a TestSuite with the given name.  If one doesn't
+  // exist, creates one and returns it.
+  //
+  // Arguments:
+  //
+  //   test_suite_name: name of the test suite
+  //   type_param:      the name of the test's type parameter, or NULL if
+  //                    this is not a typed or a type-parameterized test.
+  //   set_up_tc:       pointer to the function that sets up the test suite
+  //   tear_down_tc:    pointer to the function that tears down the test suite
+  TestSuite *GetTestSuite(const char *test_suite_name, const char *type_param,
+                          internal::SetUpTestSuiteFunc set_up_tc,
+                          internal::TearDownTestSuiteFunc tear_down_tc);
+
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  TestCase *GetTestCase(const char *test_case_name, const char *type_param,
+                        internal::SetUpTestSuiteFunc set_up_tc,
+                        internal::TearDownTestSuiteFunc tear_down_tc) {
+    return GetTestSuite(test_case_name, type_param, set_up_tc, tear_down_tc);
+  }
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  // Adds a TestInfo to the unit test.
+  //
+  // Arguments:
+  //
+  //   set_up_tc:    pointer to the function that sets up the test suite
+  //   tear_down_tc: pointer to the function that tears down the test suite
+  //   test_info:    the TestInfo object
+  void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc,
+                   internal::TearDownTestSuiteFunc tear_down_tc,
+                   TestInfo *test_info) {
+    // In order to support thread-safe death tests, we need to
+    // remember the original working directory when the test program
+    // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
+    // the user may have changed the current directory before calling
+    // RUN_ALL_TESTS().  Therefore we capture the current directory in
+    // AddTestInfo(), which is called to register a TEST or TEST_F
+    // before main() is reached.
+    if (original_working_dir_.IsEmpty()) {
+      original_working_dir_.Set(FilePath::GetCurrentDir());
+      GTEST_CHECK_(!original_working_dir_.IsEmpty())
+          << "Failed to get the current working directory.";
+    }
+
+    GetTestSuite(test_info->test_suite_name(), test_info->type_param(),
+                 set_up_tc, tear_down_tc)
+        ->AddTestInfo(test_info);
+  }
+
+  // Returns ParameterizedTestSuiteRegistry object used to keep track of
+  // value-parameterized tests and instantiate and register them.
+  internal::ParameterizedTestSuiteRegistry &parameterized_test_registry() {
+    return parameterized_test_registry_;
+  }
+
+  std::set<std::string> *ignored_parameterized_test_suites() {
+    return &ignored_parameterized_test_suites_;
+  }
+
+  // Returns TypeParameterizedTestSuiteRegistry object used to keep track of
+  // type-parameterized tests and instantiations of them.
+  internal::TypeParameterizedTestSuiteRegistry &
+  type_parameterized_test_registry() {
+    return type_parameterized_test_registry_;
+  }
+
+  // Sets the TestSuite object for the test that's currently running.
+  void set_current_test_suite(TestSuite *a_current_test_suite) {
+    current_test_suite_ = a_current_test_suite;
+  }
+
+  // Sets the TestInfo object for the test that's currently running.  If
+  // current_test_info is NULL, the assertion results will be stored in
+  // ad_hoc_test_result_.
+  void set_current_test_info(TestInfo *a_current_test_info) {
+    current_test_info_ = a_current_test_info;
+  }
+
+  // Registers all parameterized tests defined using TEST_P and
+  // INSTANTIATE_TEST_SUITE_P, creating regular tests for each test/parameter
+  // combination.  This method can be called more than once; it has guards
+  // protecting from registering the tests more than once.  If
+  // value-parameterized tests are disabled, RegisterParameterizedTests is
+  // present but does nothing.
+  void RegisterParameterizedTests();
+
+  // Runs all tests in this UnitTest object, prints the result, and
+  // returns true if all tests are successful.  If any exception is
+  // thrown during a test, this test is considered to be failed, but
+  // the rest of the tests will still be run.
+  bool RunAllTests();
+
+  // Clears the results of all tests, except the ad hoc tests.
+  void ClearNonAdHocTestResult() {
+    ForEach(test_suites_, TestSuite::ClearTestSuiteResult);
+  }
+
+  // Clears the results of ad-hoc test assertions.
+  void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); }
+
+  // Adds a TestProperty to the current TestResult object when invoked in a
+  // context of a test or a test suite, or to the global property set.  If the
+  // result already contains a property with the same key, the value will be
+  // updated.
+  void RecordProperty(const TestProperty &test_property);
+
+  enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL };
+
+  // Matches the full name of each test against the user-specified
+  // filter to decide whether the test should run, then records the
+  // result in each TestSuite and TestInfo object.
+  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+  // based on sharding variables in the environment.
+  // Returns the number of tests that should run.
+  int FilterTests(ReactionToSharding shard_tests);
+
+  // Prints the names of the tests matching the user-specified filter flag.
+  void ListTestsMatchingFilter();
+
+  const TestSuite *current_test_suite() const { return current_test_suite_; }
+  TestInfo *current_test_info() { return current_test_info_; }
+  const TestInfo *current_test_info() const { return current_test_info_; }
+
+  // Returns the vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment *> &environments() { return environments_; }
+
+  // Getters for the per-thread Google Test trace stack.
+  std::vector<TraceInfo> &gtest_trace_stack() {
+    return *(gtest_trace_stack_.pointer());
+  }
+  const std::vector<TraceInfo> &gtest_trace_stack() const {
+    return gtest_trace_stack_.get();
+  }
+
+#if GTEST_HAS_DEATH_TEST
+  void InitDeathTestSubprocessControlInfo() {
+    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+  }
+  // Returns a pointer to the parsed --gtest_internal_run_death_test
+  // flag, or NULL if that flag was not specified.
+  // This information is useful only in a death test child process.
+  // Must not be called before a call to InitGoogleTest.
+  const InternalRunDeathTestFlag *internal_run_death_test_flag() const {
+    return internal_run_death_test_flag_.get();
+  }
+
+  // Returns a pointer to the current death test factory.
+  internal::DeathTestFactory *death_test_factory() {
+    return death_test_factory_.get();
+  }
+
+  void SuppressTestEventsIfInSubprocess();
+
+  friend class ReplaceDeathTestFactory;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // Initializes the event listener performing XML output as specified by
+  // UnitTestOptions.  Must not be called before InitGoogleTest.
+  void ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+  // Initializes the event listener for streaming test results to a socket.
+  // Must not be called before InitGoogleTest.
+  void ConfigureStreamingOutput();
+#endif
+
+  // Performs initialization dependent upon flag values obtained in
+  // ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+  // ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+  // this function is also called from RunAllTests.  Since this function can be
+  // called more than once, it has to be idempotent.
+  void PostFlagParsingInit();
+
+  // Gets the random seed used at the start of the current test iteration.
+  int random_seed() const { return random_seed_; }
+
+  // Gets the random number generator.
+  internal::Random *random() { return &random_; }
+
+  // Shuffles all test suites, and the tests within each test suite,
+  // making sure that death tests are still run first.
+  void ShuffleTests();
+
+  // Restores the test suites and tests to their order before the first
+  // shuffle.
+  void UnshuffleTests();
+
+  // Returns the value of GTEST_FLAG(catch_exceptions) at the moment
+  // UnitTest::Run() starts.
+  bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+  friend class ::testing::UnitTest;
+
+  // Used by UnitTest::Run() to capture the state of
+  // GTEST_FLAG(catch_exceptions) at the moment it starts.
+  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+  // The UnitTest object that owns this implementation object.
+  UnitTest *const parent_;
+
+  // The working directory when the first TEST() or TEST_F() was
+  // executed.
+  internal::FilePath original_working_dir_;
+
+  // The default test part result reporters.
+  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+  DefaultPerThreadTestPartResultReporter
+      default_per_thread_test_part_result_reporter_;
+
+  // Points to (but doesn't own) the global test part result reporter.
+  TestPartResultReporterInterface *global_test_part_result_repoter_;
+
+  // Protects read and write access to global_test_part_result_reporter_.
+  internal::Mutex global_test_part_result_reporter_mutex_;
+
+  // Points to (but doesn't own) the per-thread test part result reporter.
+  internal::ThreadLocal<TestPartResultReporterInterface *>
+      per_thread_test_part_result_reporter_;
+
+  // The vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment *> environments_;
+
+  // The vector of TestSuites in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestSuite *> test_suites_;
+
+  // Provides a level of indirection for the test suite list to allow
+  // easy shuffling and restoring the test suite order.  The i-th
+  // element of this vector is the index of the i-th test suite in the
+  // shuffled order.
+  std::vector<int> test_suite_indices_;
+
+  // ParameterizedTestRegistry object used to register value-parameterized
+  // tests.
+  internal::ParameterizedTestSuiteRegistry parameterized_test_registry_;
+  internal::TypeParameterizedTestSuiteRegistry
+      type_parameterized_test_registry_;
+
+  // The set holding the name of parameterized
+  // test suites that may go uninstantiated.
+  std::set<std::string> ignored_parameterized_test_suites_;
+
+  // Indicates whether RegisterParameterizedTests() has been called already.
+  bool parameterized_tests_registered_;
+
+  // Index of the last death test suite registered.  Initially -1.
+  int last_death_test_suite_;
+
+  // This points to the TestSuite for the currently running test.  It
+  // changes as Google Test goes through one test suite after another.
+  // When no test is running, this is set to NULL and Google Test
+  // stores assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestSuite *current_test_suite_;
+
+  // This points to the TestInfo for the currently running test.  It
+  // changes as Google Test goes through one test after another.  When
+  // no test is running, this is set to NULL and Google Test stores
+  // assertion results in ad_hoc_test_result_.  Initially NULL.
+  TestInfo *current_test_info_;
+
+  // Normally, a user only writes assertions inside a TEST or TEST_F,
+  // or inside a function called by a TEST or TEST_F.  Since Google
+  // Test keeps track of which test is currently running, it can
+  // associate such an assertion with the test it belongs to.
+  //
+  // If an assertion is encountered when no TEST or TEST_F is running,
+  // Google Test attributes the assertion result to an imaginary "ad hoc"
+  // test, and records the result in ad_hoc_test_result_.
+  TestResult ad_hoc_test_result_;
+
+  // The list of event listeners that can be used to track events inside
+  // Google Test.
+  TestEventListeners listeners_;
+
+  // The OS stack trace getter.  Will be deleted when the UnitTest
+  // object is destructed.  By default, an OsStackTraceGetter is used,
+  // but the user can set this field to use a custom getter if that is
+  // desired.
+  OsStackTraceGetterInterface *os_stack_trace_getter_;
+
+  // True if and only if PostFlagParsingInit() has been called.
+  bool post_flag_parse_init_performed_;
+
+  // The random number seed used at the beginning of the test run.
+  int random_seed_;
+
+  // Our random number generator.
+  internal::Random random_;
+
+  // The time of the test program start, in ms from the start of the
+  // UNIX epoch.
+  TimeInMillis start_timestamp_;
+
+  // How long the test took to run, in milliseconds.
+  TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+  // The decomposed components of the gtest_internal_run_death_test flag,
+  // parsed when RUN_ALL_TESTS is called.
+  std::unique_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+  std::unique_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+  // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
+  // starts.
+  bool catch_exceptions_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+};  // class UnitTestImpl
+
+// Convenience function for accessing the global UnitTest
+// implementation object.
+inline UnitTestImpl *GetUnitTestImpl() {
+  return UnitTest::GetInstance()->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char *str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char *regex);
+GTEST_API_ bool MatchRegexAtHead(const char *regex, const char *str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch,
+                                              char repeat, const char *regex,
+                                              const char *str);
+GTEST_API_ bool MatchRegexAnywhere(const char *regex, const char *str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int *argc, char **argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int *argc, wchar_t **argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ std::string GetLastErrnoDescription();
+
+  using BiggestConvertible = unsigned long long;  // NOLINT
+
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);  // NOLINT
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed));
+
+  const Integer result = static_cast<Integer>(parsed);
+  if (parse_success && static_cast<BiggestConvertible>(result) == parsed) {
+    *number = result;
+    return true;
+  }
+  return false;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// TestResult contains some private methods that should be hidden from
+// Google Test user but are required for testing.  This class allows our tests
+// to access them.
+//
+// This class is supplied only for the purpose of testing Google Test's own
+// constructs.  Do not use it in user tests, either directly or indirectly.
+class TestResultAccessor {
+ public:
+  static void RecordProperty(TestResult *test_result,
+                             const std::string &xml_element,
+                             const TestProperty &property) {
+    test_result->RecordProperty(xml_element, property);
+  }
+
+  static void ClearTestPartResults(TestResult *test_result) {
+    test_result->ClearTestPartResults();
+  }
+
+  static const std::vector<TestPartResult> &test_part_results(
+      const TestResult &test_result) {
+    return test_result.test_part_results();
+  }
+};
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Streams test results to the given port on the given host machine.
+class StreamingListener : public EmptyTestEventListener {
+ public:
+  // Abstract base class for writing strings to a socket.
+  class AbstractSocketWriter {
+   public:
+    virtual ~AbstractSocketWriter() {}
+
+    // Sends a string to the socket.
+    virtual void Send(const std::string &message) = 0;
+
+    // Closes the socket.
+    virtual void CloseConnection() {}
+
+    // Sends a string and a newline to the socket.
+    void SendLn(const std::string &message) { Send(message + "\n"); }
+  };
+
+  // Concrete class for actually writing strings to a socket.
+  class SocketWriter : public AbstractSocketWriter {
+   public:
+    SocketWriter(const std::string &host, const std::string &port)
+        : sockfd_(-1), host_name_(host), port_num_(port) {
+      MakeConnection();
+    }
+
+    ~SocketWriter() override {
+      if (sockfd_ != -1) CloseConnection();
+    }
+
+    // Sends a string to the socket.
+    void Send(const std::string &message) override {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "Send() can be called only when there is a connection.";
+
+      const auto len = static_cast<size_t>(message.length());
+      if (write(sockfd_, message.c_str(), len) != static_cast<ssize_t>(len)) {
+        GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to "
+                            << host_name_ << ":" << port_num_;
+      }
+    }
+
+   private:
+    // Creates a client socket and connects to the server.
+    void MakeConnection();
+
+    // Closes the socket.
+    void CloseConnection() override {
+      GTEST_CHECK_(sockfd_ != -1)
+          << "CloseConnection() can be called only when there is a connection.";
+
+      close(sockfd_);
+      sockfd_ = -1;
+    }
+
+    int sockfd_;  // socket file descriptor
+    const std::string host_name_;
+    const std::string port_num_;
+
+    GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+  };  // class SocketWriter
+
+  // Escapes '=', '&', '%', and '\n' characters in str as "%xx".
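+  // For example, assuming the "%xx" encoding described above,
+  // UrlEncode("a=b\n") would yield "a%3Db%0A".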
+  static std::string UrlEncode(const char *str);
+
+  StreamingListener(const std::string &host, const std::string &port)
+      : socket_writer_(new SocketWriter(host, port)) {
+    Start();
+  }
+
+  explicit StreamingListener(AbstractSocketWriter *socket_writer)
+      : socket_writer_(socket_writer) {
+    Start();
+  }
+
+  void OnTestProgramStart(const UnitTest & /* unit_test */) override {
+    SendLn("event=TestProgramStart");
+  }
+
+  void OnTestProgramEnd(const UnitTest &unit_test) override {
+    // Note that Google Test currently only reports elapsed time for each
+    // test iteration, not for the entire test program.
+    SendLn("event=TestProgramEnd&passed=" + FormatBool(unit_test.Passed()));
+
+    // Notify the streaming server to stop.
+    socket_writer_->CloseConnection();
+  }
+
+  void OnTestIterationStart(const UnitTest & /* unit_test */,
+                            int iteration) override {
+    SendLn("event=TestIterationStart&iteration=" +
+           StreamableToString(iteration));
+  }
+
+  void OnTestIterationEnd(const UnitTest &unit_test,
+                          int /* iteration */) override {
+    SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) +
+           "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) +
+           "ms");
+  }
+
+  // Note that "event=TestCaseStart" is a wire format and has to remain
+  // "case" for compatibility
+  void OnTestCaseStart(const TestCase &test_case) override {
+    SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+  }
+
+  // Note that "event=TestCaseEnd" is a wire format and has to remain
+  // "case" for compatibility
+  void OnTestCaseEnd(const TestCase &test_case) override {
+    SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) +
+           "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) +
+           "ms");
+  }
+
+  void OnTestStart(const TestInfo &test_info) override {
+    SendLn(std::string("event=TestStart&name=") + test_info.name());
+  }
+
+  void OnTestEnd(const TestInfo &test_info) override {
+    SendLn("event=TestEnd&passed=" +
+           FormatBool((test_info.result())->Passed()) + "&elapsed_time=" +
+           StreamableToString((test_info.result())->elapsed_time()) + "ms");
+  }
+
+  void OnTestPartResult(const TestPartResult &test_part_result) override {
+    const char *file_name = test_part_result.file_name();
+    if (file_name == nullptr) file_name = "";
+    SendLn("event=TestPartResult&file=" + UrlEncode(file_name) +
+           "&line=" + StreamableToString(test_part_result.line_number()) +
+           "&message=" + UrlEncode(test_part_result.message()));
+  }
+
+ private:
+  // Sends the given message and a newline to the socket.
+  void SendLn(const std::string &message) { socket_writer_->SendLn(message); }
+
+  // Called at the start of streaming to notify the receiver what
+  // protocol we are using.
+  void Start() { SendLn("gtest_streaming_protocol_version=1.0"); }
+
+  std::string FormatBool(bool value) { return value ? "1" : "0"; }
+
+  const std::unique_ptr<AbstractSocketWriter> socket_writer_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+};  // class StreamingListener
+
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+}  // namespace internal
+}  // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_()  // 4251
+
+#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-matchers.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-matchers.cc
new file mode 100644
index 000000000..27aaa2b7c
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-matchers.cc
@@ -0,0 +1,97 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements just enough of the matcher interface to allow
+// EXPECT_DEATH and friends to accept a matcher argument.
+
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+#include "gtest/gtest-matchers.h"
+
+#include <string>
+
+namespace testing {
+
+// Constructs a matcher that matches a const std::string& whose value is
+// equal to s.
+Matcher<const std::string &>::Matcher(const std::string &s) { *this = Eq(s); }
+
+// Constructs a matcher that matches a const std::string& whose value is
+// equal to s.
+Matcher<const std::string &>::Matcher(const char *s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a std::string whose value is equal to
+// s.
+Matcher<std::string>::Matcher(const std::string &s) { *this = Eq(s); }
+
+// Constructs a matcher that matches a std::string whose value is equal to
+// s.
+Matcher<std::string>::Matcher(const char *s) { *this = Eq(std::string(s)); }
+
+#if GTEST_INTERNAL_HAS_STRING_VIEW
+// Constructs a matcher that matches a const StringView& whose value is
+// equal to s.
+Matcher<const internal::StringView &>::Matcher(const std::string &s) {
+  *this = Eq(s);
+}
+
+// Constructs a matcher that matches a const StringView& whose value is
+// equal to s.
+Matcher<const internal::StringView &>::Matcher(const char *s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a const StringView& whose value is
+// equal to s.
+Matcher<const internal::StringView &>::Matcher(internal::StringView s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a StringView whose value is equal to
+// s.
+Matcher<internal::StringView>::Matcher(const std::string &s) { *this = Eq(s); }
+
+// Constructs a matcher that matches a StringView whose value is equal to
+// s.
+Matcher<internal::StringView>::Matcher(const char *s) {
+  *this = Eq(std::string(s));
+}
+
+// Constructs a matcher that matches a StringView whose value is equal to
+// s.
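+// For example (an illustrative sketch only):
+//   Matcher<internal::StringView> m(internal::StringView("abc"));
+//   // m matches exactly the string "abc".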
+Matcher<internal::StringView>::Matcher(internal::StringView s) {
+  *this = Eq(std::string(s));
+}
+#endif  // GTEST_INTERNAL_HAS_STRING_VIEW
+
+}  // namespace testing
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-port.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-port.cc
new file mode 100644
index 000000000..adfdbef9c
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-port.cc
@@ -0,0 +1,1361 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include "gtest/internal/gtest-port.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <cstdint>
+#include <fstream>
+#include <memory>
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>
+#include <io.h>
+#include <sys/stat.h>
+#include <map>  // Used in ThreadLocal.
+#ifdef _MSC_VER
+#include <crtdbg.h>
+#endif  // _MSC_VER
+#else
+#include <unistd.h>
+#endif  // GTEST_OS_WINDOWS
+
+#if GTEST_OS_MAC
+#include <mach/mach_init.h>
+#include <mach/task.h>
+#include <mach/vm_map.h>
+#endif  // GTEST_OS_MAC
+
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
+    GTEST_OS_NETBSD || GTEST_OS_OPENBSD
+#include <sys/sysctl.h>
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#include <sys/user.h>
+#endif
+#endif
+
+#if GTEST_OS_QNX
+#include <devctl.h>
+#include <fcntl.h>
+#include <sys/procfs.h>
+#endif  // GTEST_OS_QNX
+
+#if GTEST_OS_AIX
+#include <procinfo.h>
+#include <sys/types.h>
+#endif  // GTEST_OS_AIX
+
+#if GTEST_OS_FUCHSIA
+#include <zircon/process.h>
+#include <zircon/syscalls.h>
+#endif  // GTEST_OS_FUCHSIA
+
+#include "gtest/gtest-spi.h"
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+namespace internal {
+
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif  // _MSC_VER
+
+#if GTEST_OS_LINUX
+
+namespace {
+template <typename T>
+T ReadProcFileField(const std::string &filename, int field) {
+  std::string dummy;
+  std::ifstream file(filename.c_str());
+  while (field-- > 0) {
+    file >> dummy;
+  }
+  T output = 0;
+  file >> output;
+  return output;
+}
+}  // namespace
+
+// Returns the number of active threads, or 0 when there is an error.
+size_t GetThreadCount() {
+  const std::string filename =
+      (Message() << "/proc/" << getpid() << "/stat").GetString();
+  return ReadProcFileField<size_t>(filename, 19);
+}
+
+#elif GTEST_OS_MAC
+
+size_t GetThreadCount() {
+  const task_t task = mach_task_self();
+  mach_msg_type_number_t thread_count;
+  thread_act_array_t thread_list;
+  const kern_return_t status = task_threads(task, &thread_list, &thread_count);
+  if (status == KERN_SUCCESS) {
+    // task_threads allocates resources in thread_list and we need to free them
+    // to avoid leaks.
+    vm_deallocate(task, reinterpret_cast<vm_address_t>(thread_list),
+                  sizeof(thread_t) * thread_count);
+    return static_cast<size_t>(thread_count);
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
+    GTEST_OS_NETBSD
+
+#if GTEST_OS_NETBSD
+#undef KERN_PROC
+#define KERN_PROC KERN_PROC2
+#define kinfo_proc kinfo_proc2
+#endif
+
+#if GTEST_OS_DRAGONFLY
+#define KP_NLWP(kp) (kp.kp_nthreads)
+#elif GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#define KP_NLWP(kp) (kp.ki_numthreads)
+#elif GTEST_OS_NETBSD
+#define KP_NLWP(kp) (kp.p_nlwps)
+#endif
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  int mib[] = {
+    CTL_KERN,
+    KERN_PROC,
+    KERN_PROC_PID,
+    getpid(),
+#if GTEST_OS_NETBSD
+    sizeof(struct kinfo_proc),
+    1,
+#endif
+  };
+  u_int miblen = sizeof(mib) / sizeof(mib[0]);
+  struct kinfo_proc info;
+  size_t size = sizeof(info);
+  if (sysctl(mib, miblen, &info, &size, NULL, 0)) {
+    return 0;
+  }
+  return static_cast<size_t>(KP_NLWP(info));
+}
+#elif GTEST_OS_OPENBSD
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
+size_t GetThreadCount() {
+  int mib[] = {
+    CTL_KERN,
+    KERN_PROC,
+    KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
+    getpid(),
+    sizeof(struct kinfo_proc),
+    0,
+  };
+  u_int miblen = sizeof(mib) / sizeof(mib[0]);
+
+  // get number of structs
+  size_t size;
+  if (sysctl(mib, miblen, NULL, &size, NULL, 0)) {
+    return 0;
+  }
+  mib[5] = size / mib[4];
+
+  // populate array of structs
+  struct kinfo_proc info[mib[5]];
+  if (sysctl(mib, miblen, &info, &size, NULL, 0)) {
+    return 0;
+  }
+
+  // exclude empty members
+  int nthreads = 0;
+  for (int i = 0; i < size / mib[4]; i++) {
+    if (info[i].p_tid != -1) nthreads++;
+  }
+  return nthreads;
+}
+
+#elif GTEST_OS_QNX
+
+// Returns the number of threads running in the process, or 0 to indicate that
+// we cannot detect it.
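+// On QNX the count is read from the process's own address-space entry
+// (/proc/self/as) via devctl(DCMD_PROC_INFO), as shown below.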
+size_t GetThreadCount() {
+  const int fd = open("/proc/self/as", O_RDONLY);
+  if (fd < 0) {
+    return 0;
+  }
+  procfs_info process_info;
+  const int status =
+      devctl(fd, DCMD_PROC_INFO, &process_info, sizeof(process_info), nullptr);
+  close(fd);
+  if (status == EOK) {
+    return static_cast<size_t>(process_info.num_threads);
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_AIX
+
+size_t GetThreadCount() {
+  struct procentry64 entry;
+  pid_t pid = getpid();
+  int status = getprocs64(&entry, sizeof(entry), nullptr, 0, &pid, 1);
+  if (status == 1) {
+    return entry.pi_thcount;
+  } else {
+    return 0;
+  }
+}
+
+#elif GTEST_OS_FUCHSIA
+
+size_t GetThreadCount() {
+  int dummy_buffer;
+  size_t avail;
+  zx_status_t status =
+      zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS,
+                         &dummy_buffer, 0, nullptr, &avail);
+  if (status == ZX_OK) {
+    return avail;
+  } else {
+    return 0;
+  }
+}
+
+#else
+
+size_t GetThreadCount() {
+  // There's no portable way to detect the number of threads, so we just
+  // return 0 to indicate that we cannot detect it.
+  return 0;
+}
+
+#endif  // GTEST_OS_LINUX
+
+#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+void SleepMilliseconds(int n) { ::Sleep(static_cast<DWORD>(n)); }
+
+AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
+
+AutoHandle::AutoHandle(Handle handle) : handle_(handle) {}
+
+AutoHandle::~AutoHandle() { Reset(); }
+
+AutoHandle::Handle AutoHandle::Get() const { return handle_; }
+
+void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); }
+
+void AutoHandle::Reset(HANDLE handle) {
+  // Resetting with the same handle we already own is invalid.
+  if (handle_ != handle) {
+    if (IsCloseable()) {
+      ::CloseHandle(handle_);
+    }
+    handle_ = handle;
+  } else {
+    GTEST_CHECK_(!IsCloseable())
+        << "Resetting a valid handle to itself is likely a programmer error "
+           "and thus not allowed.";
+  }
+}
+
+bool AutoHandle::IsCloseable() const {
+  // Different Windows APIs may use either of these values to represent an
+  // invalid handle.
+  return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE;
+}
+
+Notification::Notification()
+    : event_(::CreateEvent(nullptr,   // Default security attributes.
+                           TRUE,      // Do not reset automatically.
+                           FALSE,     // Initially unset.
+                           nullptr)) {  // Anonymous event.
+  GTEST_CHECK_(event_.Get() != nullptr);
+}
+
+void Notification::Notify() { GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE); }
+
+void Notification::WaitForNotification() {
+  GTEST_CHECK_(::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
+}
+
+Mutex::Mutex()
+    : owner_thread_id_(0), type_(kDynamic), critical_section_init_phase_(0),
+      critical_section_(new CRITICAL_SECTION) {
+  ::InitializeCriticalSection(critical_section_);
+}
+
+Mutex::~Mutex() {
+  // Static mutexes are leaked intentionally. It is not thread-safe to try
+  // to clean them up.
+  if (type_ == kDynamic) {
+    ::DeleteCriticalSection(critical_section_);
+    delete critical_section_;
+    critical_section_ = nullptr;
+  }
+}
+
+void Mutex::Lock() {
+  ThreadSafeLazyInit();
+  ::EnterCriticalSection(critical_section_);
+  owner_thread_id_ = ::GetCurrentThreadId();
+}
+
+void Mutex::Unlock() {
+  ThreadSafeLazyInit();
+  // We don't protect writing to owner_thread_id_ here, as it's the
+  // caller's responsibility to ensure that the current thread holds the
+  // mutex when this is called.
+  owner_thread_id_ = 0;
+  ::LeaveCriticalSection(critical_section_);
+}
+
+// Does nothing if the current thread holds the mutex. Otherwise, crashes
+// with high probability.
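+// A typical use (sketch) is to document the locking contract of a helper
+// that must only run while the lock is held:
+//
+//   static Foo *GetFooLocked() {  // hypothetical helper
+//     mutex_.AssertHeld();
+//     ...
+//   }
+//
+// GetThreadLocalsMapLocked() below uses exactly this pattern.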
+void Mutex::AssertHeld() { + ThreadSafeLazyInit(); + GTEST_CHECK_(owner_thread_id_ == ::GetCurrentThreadId()) + << "The current thread is not holding the mutex @" << this; +} + +namespace { + +#ifdef _MSC_VER +// Use the RAII idiom to flag mem allocs that are intentionally never +// deallocated. The motivation is to silence the false positive mem leaks +// that are reported by the debug version of MS's CRT which can only detect +// if an alloc is missing a matching deallocation. +// Example: +// MemoryIsNotDeallocated memory_is_not_deallocated; +// critical_section_ = new CRITICAL_SECTION; +// +class MemoryIsNotDeallocated { + public: + MemoryIsNotDeallocated() : old_crtdbg_flag_(0) { + old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG); + // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT + // doesn't report mem leak if there's no matching deallocation. + _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); + } + + ~MemoryIsNotDeallocated() { + // Restore the original _CRTDBG_ALLOC_MEM_DF flag + _CrtSetDbgFlag(old_crtdbg_flag_); + } + + private: + int old_crtdbg_flag_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated); +}; +#endif // _MSC_VER + +} // namespace + +// Initializes owner_thread_id_ and critical_section_ in static mutexes. +void Mutex::ThreadSafeLazyInit() { + // Dynamic mutexes are initialized in the constructor. + if (type_ == kStatic) { + switch ( + ::InterlockedCompareExchange(&critical_section_init_phase_, 1L, 0L)) { + case 0: + // If critical_section_init_phase_ was 0 before the exchange, we + // are the first to test it and need to perform the initialization. + owner_thread_id_ = 0; + { + // Use RAII to flag that following mem alloc is never deallocated. +#ifdef _MSC_VER + MemoryIsNotDeallocated memory_is_not_deallocated; +#endif // _MSC_VER + critical_section_ = new CRITICAL_SECTION; + } + ::InitializeCriticalSection(critical_section_); + // Updates the critical_section_init_phase_ to 2 to signal + // initialization complete. + GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_, + 2L, 1L) == 1L); + break; + case 1: + // Somebody else is already initializing the mutex; spin until they + // are done. + while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L, + 2L) != 2L) { + // Possibly yields the rest of the thread's time slice to other + // threads. + ::Sleep(0); + } + break; + + case 2: break; // The mutex is already initialized and ready for use. + + default: + GTEST_CHECK_(false) + << "Unexpected value of critical_section_init_phase_ " + << "while initializing a static mutex."; + } + } +} + +namespace { + +class ThreadWithParamSupport : public ThreadWithParamBase { + public: + static HANDLE CreateThread(Runnable *runnable, + Notification *thread_can_start) { + ThreadMainParam *param = new ThreadMainParam(runnable, thread_can_start); + DWORD thread_id; + HANDLE thread_handle = ::CreateThread( + nullptr, // Default security. + 0, // Default stack size. + &ThreadWithParamSupport::ThreadMain, + param, // Parameter to ThreadMainStatic + 0x0, // Default creation flags. + &thread_id); // Need a valid pointer for the call to work under Win98. 
+    GTEST_CHECK_(thread_handle != nullptr)
+        << "CreateThread failed with error " << ::GetLastError() << ".";
+    if (thread_handle == nullptr) {
+      delete param;
+    }
+    return thread_handle;
+  }
+
+ private:
+  struct ThreadMainParam {
+    ThreadMainParam(Runnable *runnable, Notification *thread_can_start)
+        : runnable_(runnable), thread_can_start_(thread_can_start) {}
+    std::unique_ptr<Runnable> runnable_;
+    // Does not own.
+    Notification *thread_can_start_;
+  };
+
+  static DWORD WINAPI ThreadMain(void *ptr) {
+    // Transfers ownership.
+    std::unique_ptr<ThreadMainParam> param(static_cast<ThreadMainParam *>(ptr));
+    if (param->thread_can_start_ != nullptr)
+      param->thread_can_start_->WaitForNotification();
+    param->runnable_->Run();
+    return 0;
+  }
+
+  // Prohibit instantiation.
+  ThreadWithParamSupport();
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport);
+};
+
+}  // namespace
+
+ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable,
+                                         Notification *thread_can_start)
+    : thread_(
+          ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {}
+
+ThreadWithParamBase::~ThreadWithParamBase() { Join(); }
+
+void ThreadWithParamBase::Join() {
+  GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
+      << "Failed to join the thread with error " << ::GetLastError() << ".";
+}
+
+// Maps a thread to a set of ThreadIdToThreadLocals that have values
+// instantiated on that thread and notifies them when the thread exits.  A
+// ThreadLocal instance is expected to persist until all threads it has
+// values on have terminated.
+class ThreadLocalRegistryImpl {
+ public:
+  // Registers thread_local_instance as having value on the current thread.
+  // Returns a value that can be used to identify the thread from other threads.
+  static ThreadLocalValueHolderBase *GetValueOnCurrentThread(
+      const ThreadLocalBase *thread_local_instance) {
+#ifdef _MSC_VER
+    MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif  // _MSC_VER
+    DWORD current_thread = ::GetCurrentThreadId();
+    MutexLock lock(&mutex_);
+    ThreadIdToThreadLocals *const thread_to_thread_locals =
+        GetThreadLocalsMapLocked();
+    ThreadIdToThreadLocals::iterator thread_local_pos =
+        thread_to_thread_locals->find(current_thread);
+    if (thread_local_pos == thread_to_thread_locals->end()) {
+      thread_local_pos =
+          thread_to_thread_locals
+              ->insert(std::make_pair(current_thread, ThreadLocalValues()))
+              .first;
+      StartWatcherThreadFor(current_thread);
+    }
+    ThreadLocalValues &thread_local_values = thread_local_pos->second;
+    ThreadLocalValues::iterator value_pos =
+        thread_local_values.find(thread_local_instance);
+    if (value_pos == thread_local_values.end()) {
+      value_pos =
+          thread_local_values
+              .insert(std::make_pair(
+                  thread_local_instance,
+                  std::shared_ptr<ThreadLocalValueHolderBase>(
+                      thread_local_instance->NewValueForCurrentThread())))
+              .first;
+    }
+    return value_pos->second.get();
+  }
+
+  static void OnThreadLocalDestroyed(
+      const ThreadLocalBase *thread_local_instance) {
+    std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
+    // Clean up the ThreadLocalValues data structure while holding the lock, but
+    // defer the destruction of the ThreadLocalValueHolderBases.
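+    // (Destroying a holder can run arbitrary user code via the value's
+    // destructor, which could conceivably re-enter this registry, so the
+    // destruction must happen outside the lock.)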
+    {
+      MutexLock lock(&mutex_);
+      ThreadIdToThreadLocals *const thread_to_thread_locals =
+          GetThreadLocalsMapLocked();
+      for (ThreadIdToThreadLocals::iterator it =
+               thread_to_thread_locals->begin();
+           it != thread_to_thread_locals->end(); ++it) {
+        ThreadLocalValues &thread_local_values = it->second;
+        ThreadLocalValues::iterator value_pos =
+            thread_local_values.find(thread_local_instance);
+        if (value_pos != thread_local_values.end()) {
+          value_holders.push_back(value_pos->second);
+          thread_local_values.erase(value_pos);
+          // This 'if' can only be successful at most once, so theoretically we
+          // could break out of the loop here, but we don't bother doing so.
+        }
+      }
+    }
+    // Outside the lock, let the destructor for 'value_holders' deallocate the
+    // ThreadLocalValueHolderBases.
+  }
+
+  static void OnThreadExit(DWORD thread_id) {
+    GTEST_CHECK_(thread_id != 0) << ::GetLastError();
+    std::vector<std::shared_ptr<ThreadLocalValueHolderBase> > value_holders;
+    // Clean up the ThreadIdToThreadLocals data structure while holding the
+    // lock, but defer the destruction of the ThreadLocalValueHolderBases.
+    {
+      MutexLock lock(&mutex_);
+      ThreadIdToThreadLocals *const thread_to_thread_locals =
+          GetThreadLocalsMapLocked();
+      ThreadIdToThreadLocals::iterator thread_local_pos =
+          thread_to_thread_locals->find(thread_id);
+      if (thread_local_pos != thread_to_thread_locals->end()) {
+        ThreadLocalValues &thread_local_values = thread_local_pos->second;
+        for (ThreadLocalValues::iterator value_pos =
+                 thread_local_values.begin();
+             value_pos != thread_local_values.end(); ++value_pos) {
+          value_holders.push_back(value_pos->second);
+        }
+        thread_to_thread_locals->erase(thread_local_pos);
+      }
+    }
+    // Outside the lock, let the destructor for 'value_holders' deallocate the
+    // ThreadLocalValueHolderBases.
+  }
+
+ private:
+  // In a particular thread, maps a ThreadLocal object to its value.
+  typedef std::map<const ThreadLocalBase *,
+                   std::shared_ptr<ThreadLocalValueHolderBase> >
+      ThreadLocalValues;
+  // Stores all ThreadIdToThreadLocals having values in a thread, indexed by
+  // thread's ID.
+  typedef std::map<DWORD, ThreadLocalValues> ThreadIdToThreadLocals;
+
+  // Holds the thread id and thread handle that we pass from
+  // StartWatcherThreadFor to WatcherThreadFunc.
+  typedef std::pair<DWORD, HANDLE> ThreadIdAndHandle;
+
+  static void StartWatcherThreadFor(DWORD thread_id) {
+    // The returned handle will be kept in thread_map and closed by
+    // watcher_thread in WatcherThreadFunc.
+    HANDLE thread =
+        ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id);
+    GTEST_CHECK_(thread != nullptr);
+    // We need to pass a valid thread ID pointer into CreateThread for it
+    // to work correctly under Win98.
+    DWORD watcher_thread_id;
+    HANDLE watcher_thread = ::CreateThread(
+        nullptr,  // Default security.
+        0,        // Default stack size
+        &ThreadLocalRegistryImpl::WatcherThreadFunc,
+        reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
+        CREATE_SUSPENDED, &watcher_thread_id);
+    GTEST_CHECK_(watcher_thread != nullptr);
+    // Give the watcher thread the same priority as ours to avoid being
+    // blocked by it.
+    ::SetThreadPriority(watcher_thread,
+                        ::GetThreadPriority(::GetCurrentThread()));
+    ::ResumeThread(watcher_thread);
+    ::CloseHandle(watcher_thread);
+  }
+
+  // Monitors exit from a given thread and notifies those
+  // ThreadIdToThreadLocals about thread termination.
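+  // The watcher thread (see below) simply blocks on the watched thread's
+  // handle; the wait returns when that thread terminates, at which point
+  // OnThreadExit() runs and the handle is closed.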
+  static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
+    const ThreadIdAndHandle *tah =
+        reinterpret_cast<const ThreadIdAndHandle *>(param);
+    GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+    OnThreadExit(tah->first);
+    ::CloseHandle(tah->second);
+    delete tah;
+    return 0;
+  }
+
+  // Returns map of thread local instances.
+  static ThreadIdToThreadLocals *GetThreadLocalsMapLocked() {
+    mutex_.AssertHeld();
+#ifdef _MSC_VER
+    MemoryIsNotDeallocated memory_is_not_deallocated;
+#endif  // _MSC_VER
+    static ThreadIdToThreadLocals *map = new ThreadIdToThreadLocals();
+    return map;
+  }
+
+  // Protects access to GetThreadLocalsMapLocked() and its return value.
+  static Mutex mutex_;
+  // Protects access to GetThreadMapLocked() and its return value.
+  static Mutex thread_map_mutex_;
+};
+
+Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);
+
+ThreadLocalValueHolderBase *ThreadLocalRegistry::GetValueOnCurrentThread(
+    const ThreadLocalBase *thread_local_instance) {
+  return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
+      thread_local_instance);
+}
+
+void ThreadLocalRegistry::OnThreadLocalDestroyed(
+    const ThreadLocalBase *thread_local_instance) {
+  ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
+}
+
+#endif  // GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
+
+#if GTEST_USES_POSIX_RE
+
+// Implements RE.  Currently only needed for death tests.
+
+RE::~RE() {
+  if (is_valid_) {
+    // regfree'ing an invalid regex might crash because the content
+    // of the regex is undefined.  Since the regexes are essentially
+    // the same, one cannot be valid (or invalid) without the other
+    // being so too.
+    regfree(&partial_regex_);
+    regfree(&full_regex_);
+  }
+  free(const_cast<char *>(pattern_));
+}
+
+// Returns true if and only if regular expression re matches the entire str.
+bool RE::FullMatch(const char *str, const RE &re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.full_regex_, str, 1, &match, 0) == 0;
+}
+
+// Returns true if and only if regular expression re matches a substring of
+// str (including str itself).
+bool RE::PartialMatch(const char *str, const RE &re) {
+  if (!re.is_valid_) return false;
+
+  regmatch_t match;
+  return regexec(&re.partial_regex_, str, 1, &match, 0) == 0;
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char *regex) {
+  pattern_ = posix::StrDup(regex);
+
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match.
+  const size_t full_regex_len = strlen(regex) + 10;
+  char *const full_pattern = new char[full_regex_len];
+
+  snprintf(full_pattern, full_regex_len, "^(%s)$", regex);
+  is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0;
+  // We want to call regcomp(&partial_regex_, ...) even if the
+  // previous expression returns false.  Otherwise partial_regex_ may
+  // not be properly initialized and may cause trouble when it's
+  // freed.
+  //
+  // Some implementations of POSIX regex (e.g. on at least some
+  // versions of Cygwin) don't accept the empty string as a valid
+  // regex.  We change it to an equivalent form "()" to be safe.
+  if (is_valid_) {
+    const char *const partial_regex = (*regex == '\0') ? "()" : regex;
+    is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0;
+  }
+  EXPECT_TRUE(is_valid_)
+      << "Regular expression \"" << regex
+      << "\" is not a valid POSIX Extended regular expression.";
+
+  delete[] full_pattern;
+}
+
+#elif GTEST_USES_SIMPLE_RE
+
+// Returns true if and only if ch appears anywhere in str (excluding the
+// terminating '\0' character).
+bool IsInSet(char ch, const char *str) {
+  return ch != '\0' && strchr(str, ch) != nullptr;
+}
+
+// Returns true if and only if ch belongs to the given classification.
+// Unlike similar functions in <ctype.h>, these aren't affected by the
+// current locale.
+bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; }
+bool IsAsciiPunct(char ch) {
+  return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
+}
+bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
+bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
+bool IsAsciiWordChar(char ch) {
+  return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
+         ('0' <= ch && ch <= '9') || ch == '_';
+}
+
+// Returns true if and only if "\\c" is a supported escape sequence.
+bool IsValidEscape(char c) {
+  return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW"));
+}
+
+// Returns true if and only if the given atom (specified by escaped and
+// pattern) matches ch.  The result is undefined if the atom is invalid.
+bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
+  if (escaped) {  // "\\p" where p is pattern_char.
+    switch (pattern_char) {
+      case 'd': return IsAsciiDigit(ch);
+      case 'D': return !IsAsciiDigit(ch);
+      case 'f': return ch == '\f';
+      case 'n': return ch == '\n';
+      case 'r': return ch == '\r';
+      case 's': return IsAsciiWhiteSpace(ch);
+      case 'S': return !IsAsciiWhiteSpace(ch);
+      case 't': return ch == '\t';
+      case 'v': return ch == '\v';
+      case 'w': return IsAsciiWordChar(ch);
+      case 'W': return !IsAsciiWordChar(ch);
+    }
+    return IsAsciiPunct(pattern_char) && pattern_char == ch;
+  }
+
+  return (pattern_char == '.' && ch != '\n') || pattern_char == ch;
+}
+
+// Helper function used by ValidateRegex() to format error messages.
+static std::string FormatRegexSyntaxError(const char *regex, int index) {
+  return (Message() << "Syntax error at index " << index
+                    << " in simple regular expression \"" << regex << "\": ")
+      .GetString();
+}
+
+// Generates non-fatal failures and returns false if regex is invalid;
+// otherwise returns true.
+bool ValidateRegex(const char *regex) {
+  if (regex == nullptr) {
+    ADD_FAILURE() << "NULL is not a valid simple regular expression.";
+    return false;
+  }
+
+  bool is_valid = true;
+
+  // True if and only if ?, *, or + can follow the previous atom.
+  bool prev_repeatable = false;
+  for (int i = 0; regex[i]; i++) {
+    if (regex[i] == '\\') {  // An escape sequence
+      i++;
+      if (regex[i] == '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "'\\' cannot appear at the end.";
+        return false;
+      }
+
+      if (!IsValidEscape(regex[i])) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
+                      << "invalid escape sequence \"\\" << regex[i] << "\".";
+        is_valid = false;
+      }
+      prev_repeatable = true;
+    } else {  // Not an escape sequence.
+      const char ch = regex[i];
+
+      if (ch == '^' && i > 0) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'^' can only appear at the beginning.";
+        is_valid = false;
+      } else if (ch == '$' && regex[i + 1] != '\0') {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
+                      << "'$' can only appear at the end.";
+        is_valid = false;
+      } else if (IsInSet(ch, "()[]{}|")) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+                      << "' is unsupported.";
+        is_valid = false;
+      } else if (IsRepeat(ch) && !prev_repeatable) {
+        ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+                      << "' can only follow a repeatable token.";
+        is_valid = false;
+      }
+
+      prev_repeatable = !IsInSet(ch, "^$?*+");
+    }
+  }
+
+  return is_valid;
+}
+
+// Matches a repeated regex atom followed by a valid simple regular
+// expression.  The regex atom is defined as c if escaped is false,
+// or \c otherwise.  repeat is the repetition meta character (?, *,
+// or +).  The behavior is undefined if str contains too many
+// characters to be indexable by size_t, in which case the test will
+// probably time out anyway.  We are fine with this limitation as
+// std::string has it too.
+bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat,
+                                   const char *regex, const char *str) {
+  const size_t min_count = (repeat == '+') ? 1 : 0;
+  const size_t max_count = (repeat == '?') ? 1 : static_cast<size_t>(-1) - 1;
+  // We cannot call numeric_limits<size_t>::max() as it conflicts with the
+  // max() macro on Windows.
+
+  for (size_t i = 0; i <= max_count; ++i) {
+    // We know that the atom matches each of the first i characters in str.
+    if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
+      // We have enough matches at the head, and the tail matches too.
+      // Since we only care about *whether* the pattern matches str
+      // (as opposed to *how* it matches), there is no need to find a
+      // greedy match.
+      return true;
+    }
+    if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false;
+  }
+  return false;
+}
+
+// Returns true if and only if regex matches a prefix of str.  regex must
+// be a valid simple regular expression and not start with "^", or the
+// result is undefined.
+bool MatchRegexAtHead(const char *regex, const char *str) {
+  if (*regex == '\0')  // An empty regex matches a prefix of anything.
+    return true;
+
+  // "$" only matches the end of a string.  Note that regex being
+  // valid guarantees that there's nothing after "$" in it.
+  if (*regex == '$') return *str == '\0';
+
+  // Is the first thing in regex an escape sequence?
+  const bool escaped = *regex == '\\';
+  if (escaped) ++regex;
+  if (IsRepeat(regex[1])) {
+    // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
+    // here's an indirect recursion.  It terminates as the regex gets
+    // shorter in each recursion.
+    return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2,
+                                         str);
+  } else {
+    // regex isn't empty, isn't "$", and doesn't start with a
+    // repetition.  We match the first atom of regex with the first
+    // character of str and recurse.
+    return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
+           MatchRegexAtHead(regex + 1, str + 1);
+  }
+}
+
+// Returns true if and only if regex matches any substring of str.  regex must
+// be a valid simple regular expression, or the result is undefined.
+//
+// The algorithm is recursive, but the recursion depth doesn't exceed
+// the regex length, so we won't need to worry about running out of
+// stack space normally.
+// In rare cases the time complexity can be exponential with respect to
+// the regex length + the string length, but usually it's much faster
+// (often close to linear).
+bool MatchRegexAnywhere(const char *regex, const char *str) {
+  if (regex == nullptr || str == nullptr) return false;
+
+  if (*regex == '^') return MatchRegexAtHead(regex + 1, str);
+
+  // A successful match can be anywhere in str.
+  do {
+    if (MatchRegexAtHead(regex, str)) return true;
+  } while (*str++ != '\0');
+  return false;
+}
+
+// Implements the RE class.
+
+RE::~RE() {
+  free(const_cast<char *>(pattern_));
+  free(const_cast<char *>(full_pattern_));
+}
+
+// Returns true if and only if regular expression re matches the entire str.
+bool RE::FullMatch(const char *str, const RE &re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
+}
+
+// Returns true if and only if regular expression re matches a substring of
+// str (including str itself).
+bool RE::PartialMatch(const char *str, const RE &re) {
+  return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
+}
+
+// Initializes an RE from its string representation.
+void RE::Init(const char *regex) {
+  pattern_ = full_pattern_ = nullptr;
+  if (regex != nullptr) {
+    pattern_ = posix::StrDup(regex);
+  }
+
+  is_valid_ = ValidateRegex(regex);
+  if (!is_valid_) {
+    // No need to calculate the full pattern when the regex is invalid.
+    return;
+  }
+
+  const size_t len = strlen(regex);
+  // Reserves enough bytes to hold the regular expression used for a
+  // full match: we need space to prepend a '^', append a '$', and
+  // terminate the string with '\0'.
+  char *buffer = static_cast<char *>(malloc(len + 3));
+  full_pattern_ = buffer;
+
+  if (*regex != '^')
+    *buffer++ = '^';  // Makes sure full_pattern_ starts with '^'.
+
+  // We don't use snprintf or strncpy, as they trigger a warning when
+  // compiled with VC++ 8.0.
+  memcpy(buffer, regex, len);
+  buffer += len;
+
+  if (len == 0 || regex[len - 1] != '$')
+    *buffer++ = '$';  // Makes sure full_pattern_ ends with '$'.
+
+  *buffer = '\0';
+}
+
+#endif  // GTEST_USES_POSIX_RE
+
+const char kUnknownFile[] = "unknown file";
+
+// Formats a source file path and a line number as they would appear
+// in an error message from the compiler used to compile this code.
+GTEST_API_ ::std::string FormatFileLocation(const char *file, int line) {
+  const std::string file_name(file == nullptr ? kUnknownFile : file);
+
+  if (line < 0) {
+    return file_name + ":";
+  }
+#ifdef _MSC_VER
+  return file_name + "(" + StreamableToString(line) + "):";
+#else
+  return file_name + ":" + StreamableToString(line) + ":";
+#endif  // _MSC_VER
+}
+
+// Formats a file location for compiler-independent XML output.
+// Although this function is not platform dependent, we put it next to
+// FormatFileLocation in order to contrast the two functions.
+// Note that FormatCompilerIndependentFileLocation() does NOT append colon
+// to the file location it produces, unlike FormatFileLocation().
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char *file,
+                                                               int line) {
+  const std::string file_name(file == nullptr ? kUnknownFile : file);
+
+  if (line < 0)
+    return file_name;
+  else
+    return file_name + ":" + StreamableToString(line);
+}
+
+GTestLog::GTestLog(GTestLogSeverity severity, const char *file, int line)
+    : severity_(severity) {
+  const char *const marker =
+      severity == GTEST_INFO
+          ? "[  INFO ]"
+          : severity == GTEST_WARNING
+                ? "[WARNING]"
+                : severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]";
+  GetStream() << ::std::endl
+              << marker << " " << FormatFileLocation(file, line).c_str()
+              << ": ";
+}
+
+// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
+GTestLog::~GTestLog() {
+  GetStream() << ::std::endl;
+  if (severity_ == GTEST_FATAL) {
+    fflush(stderr);
+    posix::Abort();
+  }
+}
+
+// Disable Microsoft deprecation warnings for POSIX functions called from
+// this class (creat, dup, dup2, and close)
+GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
+
+#if GTEST_HAS_STREAM_REDIRECTION
+
+// Object that captures an output stream (stdout/stderr).
+class CapturedStream {
+ public:
+  // The ctor redirects the stream to a temporary file.
+  explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
+#if GTEST_OS_WINDOWS
+    char temp_dir_path[MAX_PATH + 1] = { '\0' };   // NOLINT
+    char temp_file_path[MAX_PATH + 1] = { '\0' };  // NOLINT
+
+    ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
+    const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir",
+                                            0,  // Generate unique file name.
+                                            temp_file_path);
+    GTEST_CHECK_(success != 0)
+        << "Unable to create a temporary file in " << temp_dir_path;
+    const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
+    GTEST_CHECK_(captured_fd != -1)
+        << "Unable to open temporary file " << temp_file_path;
+    filename_ = temp_file_path;
+#else
+    // There's no guarantee that a test has write access to the current
+    // directory, so we create the temporary file in the /tmp directory
+    // instead.  We use /tmp on most systems, and /sdcard on Android.
+    // That's because Android doesn't have /tmp.
+#if GTEST_OS_LINUX_ANDROID
+    // Note: Android applications are expected to call the framework's
+    // Context.getExternalStorageDirectory() method through JNI to get
+    // the location of the world-writable SD Card directory.  However,
+    // this requires a Context handle, which cannot be retrieved
+    // globally from native code.  Doing so also precludes running the
+    // code as part of a regular standalone executable, which doesn't
+    // run in a Dalvik process (e.g. when running it through 'adb shell').
+    //
+    // The location /data/local/tmp is directly accessible from native code.
+    // '/sdcard' and other variants cannot be relied on, as they are not
+    // guaranteed to be mounted, or may have a delay in mounting.
+    char name_template[] = "/data/local/tmp/gtest_captured_stream.XXXXXX";
+#else
+    char name_template[] = "/tmp/captured_stream.XXXXXX";
+#endif  // GTEST_OS_LINUX_ANDROID
+    const int captured_fd = mkstemp(name_template);
+    if (captured_fd == -1) {
+      GTEST_LOG_(WARNING)
+          << "Failed to create tmp file " << name_template
+          << " for test; does the test have access to the /tmp directory?";
+    }
+    filename_ = name_template;
+#endif  // GTEST_OS_WINDOWS
+    fflush(nullptr);
+    dup2(captured_fd, fd_);
+    close(captured_fd);
+  }
+
+  ~CapturedStream() { remove(filename_.c_str()); }
+
+  std::string GetCapturedString() {
+    if (uncaptured_fd_ != -1) {
+      // Restores the original stream.
+      fflush(nullptr);
+      dup2(uncaptured_fd_, fd_);
+      close(uncaptured_fd_);
+      uncaptured_fd_ = -1;
+    }
+
+    FILE *const file = posix::FOpen(filename_.c_str(), "r");
+    if (file == nullptr) {
+      GTEST_LOG_(FATAL) << "Failed to open tmp file " << filename_
+                        << " for capturing stream.";
+    }
+    const std::string content = ReadEntireFile(file);
+    posix::FClose(file);
+    return content;
+  }
+
+ private:
+  const int fd_;  // A stream to capture.
+  int uncaptured_fd_;
+  // Name of the temporary file holding the captured stream.
+  ::std::string filename_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+};
+
+GTEST_DISABLE_MSC_DEPRECATED_POP_()
+
+static CapturedStream *g_captured_stderr = nullptr;
+static CapturedStream *g_captured_stdout = nullptr;
+
+// Starts capturing an output stream (stdout/stderr).
+static void CaptureStream(int fd, const char *stream_name,
+                          CapturedStream **stream) {
+  if (*stream != nullptr) {
+    GTEST_LOG_(FATAL) << "Only one " << stream_name
+                      << " capturer can exist at a time.";
+  }
+  *stream = new CapturedStream(fd);
+}
+
+// Stops capturing the output stream and returns the captured string.
+static std::string GetCapturedStream(CapturedStream **captured_stream) {
+  const std::string content = (*captured_stream)->GetCapturedString();
+
+  delete *captured_stream;
+  *captured_stream = nullptr;
+
+  return content;
+}
+
+// Starts capturing stdout.
+void CaptureStdout() {
+  CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
+}
+
+// Starts capturing stderr.
+void CaptureStderr() {
+  CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr);
+}
+
+// Stops capturing stdout and returns the captured string.
+std::string GetCapturedStdout() {
+  return GetCapturedStream(&g_captured_stdout);
+}
+
+// Stops capturing stderr and returns the captured string.
+std::string GetCapturedStderr() {
+  return GetCapturedStream(&g_captured_stderr);
+}
+
+#endif  // GTEST_HAS_STREAM_REDIRECTION
+
+size_t GetFileSize(FILE *file) {
+  fseek(file, 0, SEEK_END);
+  return static_cast<size_t>(ftell(file));
+}
+
+std::string ReadEntireFile(FILE *file) {
+  const size_t file_size = GetFileSize(file);
+  char *const buffer = new char[file_size];
+
+  size_t bytes_last_read = 0;  // # of bytes read in the last fread()
+  size_t bytes_read = 0;       // # of bytes read so far
+
+  fseek(file, 0, SEEK_SET);
+
+  // Keeps reading the file until we cannot read further or the
+  // pre-determined file size is reached.
+  do {
+    bytes_last_read =
+        fread(buffer + bytes_read, 1, file_size - bytes_read, file);
+    bytes_read += bytes_last_read;
+  } while (bytes_last_read > 0 && bytes_read < file_size);
+
+  const std::string content(buffer, bytes_read);
+  delete[] buffer;
+
+  return content;
+}
+
+#if GTEST_HAS_DEATH_TEST
+static const std::vector<std::string> *g_injected_test_argvs =
+    nullptr;  // Owned.
+
+std::vector<std::string> GetInjectableArgvs() {
+  if (g_injected_test_argvs != nullptr) {
+    return *g_injected_test_argvs;
+  }
+  return GetArgvs();
+}
+
+void SetInjectableArgvs(const std::vector<std::string> *new_argvs) {
+  if (g_injected_test_argvs != new_argvs) delete g_injected_test_argvs;
+  g_injected_test_argvs = new_argvs;
+}
+
+void SetInjectableArgvs(const std::vector<std::string> &new_argvs) {
+  SetInjectableArgvs(
+      new std::vector<std::string>(new_argvs.begin(), new_argvs.end()));
+}
+
+void ClearInjectableArgvs() {
+  delete g_injected_test_argvs;
+  g_injected_test_argvs = nullptr;
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+#if GTEST_OS_WINDOWS_MOBILE
+namespace posix {
+void Abort() {
+  DebugBreak();
+  TerminateProcess(GetCurrentProcess(), 1);
+}
+}  // namespace posix
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+// Returns the name of the environment variable corresponding to the
+// given flag.  For example, FlagToEnvVar("foo") will return
+// "GTEST_FOO" in the open-source version.
+static std::string FlagToEnvVar(const char *flag) {
+  const std::string full_flag =
+      (Message() << GTEST_FLAG_PREFIX_ << flag).GetString();
+
+  Message env_var;
+  for (size_t i = 0; i != full_flag.length(); i++) {
+    env_var << ToUpper(full_flag.c_str()[i]);
+  }
+
+  return env_var.GetString();
+}
+
+// Parses 'str' for a 32-bit signed integer.  If successful, writes
+// the result to *value and returns true; otherwise leaves *value
+// unchanged and returns false.
+bool ParseInt32(const Message &src_text, const char *str, int32_t *value) {
+  // Parses the environment variable as a decimal integer.
+  char *end = nullptr;
+  const long long_value = strtol(str, &end, 10);  // NOLINT
+
+  // Has strtol() consumed all characters in the string?
+  if (*end != '\0') {
+    // No - an invalid character was encountered.
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value \"" << str << "\".\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  // Is the parsed value in the range of an int32_t?
+  const auto result = static_cast<int32_t>(long_value);
+  if (long_value == LONG_MAX || long_value == LONG_MIN ||
+      // The parsed value overflows as a long.  (strtol() returns
+      // LONG_MAX or LONG_MIN when the input overflows.)
+      result != long_value
+      // The parsed value overflows as an int32_t.
+  ) {
+    Message msg;
+    msg << "WARNING: " << src_text
+        << " is expected to be a 32-bit integer, but actually"
+        << " has value " << str << ", which overflows.\n";
+    printf("%s", msg.GetString().c_str());
+    fflush(stdout);
+    return false;
+  }
+
+  *value = result;
+  return true;
+}
+
+// Reads and returns the Boolean environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+//
+// The value is considered true if and only if it's not "0".
+bool BoolFromGTestEnv(const char *flag, bool default_value) {
+#if defined(GTEST_GET_BOOL_FROM_ENV_)
+  return GTEST_GET_BOOL_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char *const string_value = posix::GetEnv(env_var.c_str());
+  return string_value == nullptr ? default_value
+                                 : strcmp(string_value, "0") != 0;
+#endif  // defined(GTEST_GET_BOOL_FROM_ENV_)
+}
+
+// Reads and returns a 32-bit integer stored in the environment
+// variable corresponding to the given flag; if it isn't set or
+// doesn't represent a valid 32-bit integer, returns default_value.
+int32_t Int32FromGTestEnv(const char *flag, int32_t default_value) {
+#if defined(GTEST_GET_INT32_FROM_ENV_)
+  return GTEST_GET_INT32_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char *const string_value = posix::GetEnv(env_var.c_str());
+  if (string_value == nullptr) {
+    // The environment variable is not set.
+    return default_value;
+  }
+
+  int32_t result = default_value;
+  if (!ParseInt32(Message() << "Environment variable " << env_var, string_value,
+                  &result)) {
+    printf("The default value %s is used.\n",
+           (Message() << default_value).GetString().c_str());
+    fflush(stdout);
+    return default_value;
+  }
+
+  return result;
+#endif  // defined(GTEST_GET_INT32_FROM_ENV_)
+}
+
+// As a special case for the 'output' flag, if GTEST_OUTPUT is not
+// set, we look for XML_OUTPUT_FILE, which is set by the Bazel build
+// system.  The value of XML_OUTPUT_FILE is a filename without the
+// "xml:" prefix of GTEST_OUTPUT.
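+// For example, running a test binary with XML_OUTPUT_FILE=/tmp/out.xml set
+// behaves like passing --gtest_output=xml:/tmp/out.xml.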
+// Note that this is meant to be called at the call site so it does
+// not check that the flag is 'output'.
+// In essence this checks an env variable called XML_OUTPUT_FILE
+// and if it is set we prepend "xml:" to its value; if it is not set
+// we return "".
+std::string OutputFlagAlsoCheckEnvVar() {
+  std::string default_value_for_output_flag = "";
+  const char *xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
+  if (nullptr != xml_output_file_env) {
+    default_value_for_output_flag = std::string("xml:") + xml_output_file_env;
+  }
+  return default_value_for_output_flag;
+}
+
+// Reads and returns the string environment variable corresponding to
+// the given flag; if it's not set, returns default_value.
+const char *StringFromGTestEnv(const char *flag, const char *default_value) {
+#if defined(GTEST_GET_STRING_FROM_ENV_)
+  return GTEST_GET_STRING_FROM_ENV_(flag, default_value);
+#else
+  const std::string env_var = FlagToEnvVar(flag);
+  const char *const value = posix::GetEnv(env_var.c_str());
+  return value == nullptr ? default_value : value;
+#endif  // defined(GTEST_GET_STRING_FROM_ENV_)
+}
+
+}  // namespace internal
+}  // namespace testing
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-printers.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-printers.cc
new file mode 100644
index 000000000..8399386a9
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-printers.cc
@@ -0,0 +1,400 @@
+// Copyright 2007, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Google Test - The Google C++ Testing and Mocking Framework
+//
+// This file implements a universal value printer that can print a
+// value of any type T:
+//
+//   void ::testing::internal::UniversalPrinter<T>::Print(value, ostream_ptr);
+//
+// It uses the << operator when possible, and prints the bytes in the
+// object otherwise.
+// A user can override its behavior for a class type Foo by defining
+// either operator<<(::std::ostream&, const Foo&) or
+// void PrintTo(const Foo&, ::std::ostream*) in the namespace that
+// defines Foo.
+
+#include "gtest/gtest-printers.h"
+#include <stdio.h>
+#include <cctype>
+#include <cwchar>
+#include <ostream>  // NOLINT
+#include <string>
+#include "gtest/internal/gtest-port.h"
+#include "src/gtest-internal-inl.h"
+
+namespace testing {
+
+namespace {
+
+using ::std::ostream;
+
+// Prints a segment of bytes in the given object.
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+void PrintByteSegmentInObjectTo(const unsigned char *obj_bytes, size_t start,
+                                size_t count, ostream *os) {
+  char text[5] = "";
+  for (size_t i = 0; i != count; i++) {
+    const size_t j = start + i;
+    if (i != 0) {
+      // Organizes the bytes into groups of 2 for easy parsing by
+      // humans.
+      if ((j % 2) == 0)
+        *os << ' ';
+      else
+        *os << '-';
+    }
+    GTEST_SNPRINTF_(text, sizeof(text), "%02X", obj_bytes[j]);
+    *os << text;
+  }
+}
+
+// Prints the bytes in the given value to the given ostream.
+void PrintBytesInObjectToImpl(const unsigned char *obj_bytes, size_t count,
+                              ostream *os) {
+  // Tells the user how big the object is.
+  *os << count << "-byte object <";
+
+  const size_t kThreshold = 132;
+  const size_t kChunkSize = 64;
+  // If the object size is bigger than kThreshold, we'll have to omit
+  // some details by printing only the first and the last kChunkSize
+  // bytes.
+  if (count < kThreshold) {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, count, os);
+  } else {
+    PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
+    *os << " ... ";
+    // Rounds up to 2-byte boundary.
+    const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2;
+    PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
+  }
+  *os << ">";
+}
+
+}  // namespace
+
+namespace internal2 {
+
+// Delegates to PrintBytesInObjectToImpl() to print the bytes in the
+// given object.  The delegation simplifies the implementation, which
+// uses the << operator and thus is easier done outside of the
+// ::testing::internal namespace, which contains a << operator that
+// sometimes conflicts with the one in STL.
+void PrintBytesInObjectTo(const unsigned char *obj_bytes, size_t count,
+                          ostream *os) {
+  PrintBytesInObjectToImpl(obj_bytes, count, os);
+}
+
+}  // namespace internal2
+
+namespace internal {
+
+// Depending on the value of a char (or wchar_t), we print it in one
+// of three formats:
+//   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
+//   - as a hexadecimal escape sequence (e.g. '\x7F'), or
+//   - as a special escape sequence (e.g. '\r', '\n').
+enum CharFormat { kAsIs, kHexEscape, kSpecialEscape };
+
+// Returns true if c is a printable ASCII character.  We test the
+// value of c directly instead of calling isprint(), which is buggy on
+// Windows Mobile.
+inline bool IsPrintableAscii(wchar_t c) { return 0x20 <= c && c <= 0x7E; }
+
+// Prints a wide or narrow char c as a character literal without the
+// quotes, escaping it when necessary; returns how c was formatted.
+// The template argument UnsignedChar is the unsigned version of Char,
+// which is the type of c.
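+// For example (illustrative): printing '\n' emits "\n" and returns
+// kSpecialEscape, while printing '\x7F' emits "\x7F" and returns kHexEscape.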
+template +static CharFormat PrintAsCharLiteralTo(Char c, ostream *os) { + wchar_t w_c = static_cast(c); + switch (w_c) { + case L'\0': *os << "\\0"; break; + case L'\'': *os << "\\'"; break; + case L'\\': *os << "\\\\"; break; + case L'\a': *os << "\\a"; break; + case L'\b': *os << "\\b"; break; + case L'\f': *os << "\\f"; break; + case L'\n': *os << "\\n"; break; + case L'\r': *os << "\\r"; break; + case L'\t': *os << "\\t"; break; + case L'\v': *os << "\\v"; break; + default: + if (IsPrintableAscii(w_c)) { + *os << static_cast(c); + return kAsIs; + } else { + ostream::fmtflags flags = os->flags(); + *os << "\\x" << std::hex << std::uppercase + << static_cast(static_cast(c)); + os->flags(flags); + return kHexEscape; + } + } + return kSpecialEscape; +} + +// Prints a wchar_t c as if it's part of a string literal, escaping it when +// necessary; returns how c was formatted. +static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream *os) { + switch (c) { + case L'\'': *os << "'"; return kAsIs; + case L'"': *os << "\\\""; return kSpecialEscape; + default: return PrintAsCharLiteralTo(c, os); + } +} + +// Prints a char c as if it's part of a string literal, escaping it when +// necessary; returns how c was formatted. +static CharFormat PrintAsStringLiteralTo(char c, ostream *os) { + return PrintAsStringLiteralTo( + static_cast(static_cast(c)), os); +} + +// Prints a wide or narrow character c and its code. '\0' is printed +// as "'\\0'", other unprintable characters are also properly escaped +// using the standard C++ escape sequence. The template argument +// UnsignedChar is the unsigned version of Char, which is the type of c. +template +void PrintCharAndCodeTo(Char c, ostream *os) { + // First, print c as a literal in the most readable form we can find. + *os << ((sizeof(c) > 1) ? "L'" : "'"); + const CharFormat format = PrintAsCharLiteralTo(c, os); + *os << "'"; + + // To aid user debugging, we also print c's code in decimal, unless + // it's 0 (in which case c was printed as '\\0', making the code + // obvious). + if (c == 0) return; + *os << " (" << static_cast(c); + + // For more convenience, we print c's code again in hexadecimal, + // unless c was already printed in the form '\x##' or the code is in + // [1, 9]. + if (format == kHexEscape || (1 <= c && c <= 9)) { + // Do nothing. + } else { + *os << ", 0x" << String::FormatHexInt(static_cast(c)); + } + *os << ")"; +} + +void PrintTo(unsigned char c, ::std::ostream *os) { + PrintCharAndCodeTo(c, os); +} +void PrintTo(signed char c, ::std::ostream *os) { + PrintCharAndCodeTo(c, os); +} + +// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its code. L'\0' is printed as "L'\\0'". +void PrintTo(wchar_t wc, ostream *os) { PrintCharAndCodeTo(wc, os); } + +// Prints the given array of characters to the ostream. CharType must be either +// char or wchar_t. +// The array starts at begin, the length is len, it may include '\0' characters +// and may not be NUL-terminated. +template +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat + PrintCharsAsStringTo(const CharType *begin, size_t len, ostream *os) { + const char *const kQuoteBegin = sizeof(CharType) == 1 ? 
"\"" : "L\""; + *os << kQuoteBegin; + bool is_previous_hex = false; + CharFormat print_format = kAsIs; + for (size_t index = 0; index < len; ++index) { + const CharType cur = begin[index]; + if (is_previous_hex && IsXDigit(cur)) { + // Previous character is of '\x..' form and this character can be + // interpreted as another hexadecimal digit in its number. Break string to + // disambiguate. + *os << "\" " << kQuoteBegin; + } + is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape; + // Remember if any characters required hex escaping. + if (is_previous_hex) { + print_format = kHexEscape; + } + } + *os << "\""; + return print_format; +} + +// Prints a (const) char/wchar_t array of 'len' elements, starting at address +// 'begin'. CharType must be either char or wchar_t. +template +GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ + GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void + UniversalPrintCharArray(const CharType *begin, size_t len, + ostream *os) { + // The code + // const char kFoo[] = "foo"; + // generates an array of 4, not 3, elements, with the last one being '\0'. + // + // Therefore when printing a char array, we don't print the last element if + // it's '\0', such that the output matches the string literal as it's + // written in the source code. + if (len > 0 && begin[len - 1] == '\0') { + PrintCharsAsStringTo(begin, len - 1, os); + return; + } + + // If, however, the last element in the array is not '\0', e.g. + // const char kFoo[] = { 'f', 'o', 'o' }; + // we must print the entire array. We also print a message to indicate + // that the array is not NUL-terminated. + PrintCharsAsStringTo(begin, len, os); + *os << " (no terminating NUL)"; +} + +// Prints a (const) char array of 'len' elements, starting at address 'begin'. +void UniversalPrintArray(const char *begin, size_t len, ostream *os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints a (const) wchar_t array of 'len' elements, starting at address +// 'begin'. +void UniversalPrintArray(const wchar_t *begin, size_t len, ostream *os) { + UniversalPrintCharArray(begin, len, os); +} + +// Prints the given C string to the ostream. +void PrintTo(const char *s, ostream *os) { + if (s == nullptr) { + *os << "NULL"; + } else { + *os << ImplicitCast_(s) << " pointing to "; + PrintCharsAsStringTo(s, strlen(s), os); + } +} + +// MSVC compiler can be configured to define whar_t as a typedef +// of unsigned short. Defining an overload for const wchar_t* in that case +// would cause pointers to unsigned shorts be printed as wide strings, +// possibly accessing more memory than intended and causing invalid +// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when +// wchar_t is implemented as a native type. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Prints the given wide C string to the ostream. 
+void PrintTo(const wchar_t *s, ostream *os) { + if (s == nullptr) { + *os << "NULL"; + } else { + *os << ImplicitCast_(s) << " pointing to "; + PrintCharsAsStringTo(s, wcslen(s), os); + } +} +#endif // wchar_t is native + +namespace { + +bool ContainsUnprintableControlCodes(const char *str, size_t length) { + const unsigned char *s = reinterpret_cast(str); + + for (size_t i = 0; i < length; i++) { + unsigned char ch = *s++; + if (std::iscntrl(ch)) { + switch (ch) { + case '\t': + case '\n': + case '\r': break; + default: return true; + } + } + } + return false; +} + +bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; } + +bool IsValidUTF8(const char *str, size_t length) { + const unsigned char *s = reinterpret_cast(str); + + for (size_t i = 0; i < length;) { + unsigned char lead = s[i++]; + + if (lead <= 0x7f) { + continue; // single-byte character (ASCII) 0..7F + } + if (lead < 0xc2) { + return false; // trail byte or non-shortest form + } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) { + ++i; // 2-byte character + } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && + // check for non-shortest form and surrogate + (lead != 0xe0 || s[i] >= 0xa0) && + (lead != 0xed || s[i] < 0xa0)) { + i += 2; // 3-byte character + } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length && + IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) && + IsUTF8TrailByte(s[i + 2]) && + // check for non-shortest form + (lead != 0xf0 || s[i] >= 0x90) && + (lead != 0xf4 || s[i] < 0x90)) { + i += 3; // 4-byte character + } else { + return false; + } + } + return true; +} + +void ConditionalPrintAsText(const char *str, size_t length, ostream *os) { + if (!ContainsUnprintableControlCodes(str, length) && + IsValidUTF8(str, length)) { + *os << "\n As Text: \"" << str << "\""; + } +} + +} // anonymous namespace + +void PrintStringTo(const ::std::string &s, ostream *os) { + if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { + if (GTEST_FLAG(print_utf8)) { + ConditionalPrintAsText(s.data(), s.size(), os); + } + } +} + +#if GTEST_HAS_STD_WSTRING +void PrintWideStringTo(const ::std::wstring &s, ostream *os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_STD_WSTRING + +} // namespace internal + +} // namespace testing diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-test-part.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-test-part.cc new file mode 100644 index 000000000..44b0e2b3f --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-test-part.cc @@ -0,0 +1,107 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
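IsValidUTF8() above is driven by the standard lead-byte ranges (C2..DF start 2-byte sequences, E0..EF 3-byte, F0..F4 4-byte) plus extra checks rejecting overlong forms and UTF-16 surrogates. A reduced sketch of the length bookkeeping only (the non-shortest-form and surrogate checks from the function above are intentionally omitted here):

    #include <cassert>
    #include <cstddef>

    // Trail bytes are 0x80..0xBF.
    static bool IsTrail(unsigned char b) { return (b & 0xC0) == 0x80; }

    static bool LooksLikeUtf8(const unsigned char *s, size_t n) {
      for (size_t i = 0; i < n;) {
        const unsigned char lead = s[i++];
        size_t trail;
        if (lead <= 0x7F) trail = 0;                        // ASCII
        else if (lead >= 0xC2 && lead <= 0xDF) trail = 1;   // 2-byte sequence
        else if (lead >= 0xE0 && lead <= 0xEF) trail = 2;   // 3-byte sequence
        else if (lead >= 0xF0 && lead <= 0xF4) trail = 3;   // 4-byte sequence
        else return false;                                  // 0x80..0xC1, 0xF5..0xFF
        if (i + trail > n) return false;                    // truncated sequence
        for (size_t k = 0; k < trail; ++k)
          if (!IsTrail(s[i + k])) return false;
        i += trail;
      }
      return true;
    }

    int main() {
      const unsigned char ok[] = { 'h', 0xC3, 0xA9 };  // "he" with e-acute
      const unsigned char bad[] = { 0xC0, 0xAF };      // invalid lead 0xC0
      assert(LooksLikeUtf8(ok, sizeof(ok)));
      assert(!LooksLikeUtf8(bad, sizeof(bad)));
    }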
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +// The Google C++ Testing and Mocking Framework (Google Test) + +#include "gtest/gtest-test-part.h" + +#include "gtest/internal/gtest-port.h" +#include "src/gtest-internal-inl.h" + +namespace testing { + +using internal::GetUnitTestImpl; + +// Gets the summary of the failure message by omitting the stack trace +// in it. +std::string TestPartResult::ExtractSummary(const char *message) { + const char *const stack_trace = strstr(message, internal::kStackTraceMarker); + return stack_trace == nullptr ? message : std::string(message, stack_trace); +} + +// Prints a TestPartResult object. +std::ostream &operator<<(std::ostream &os, const TestPartResult &result) { + return os << internal::FormatFileLocation(result.file_name(), + result.line_number()) + << " " + << (result.type() == TestPartResult::kSuccess + ? "Success" + : result.type() == TestPartResult::kSkip + ? "Skipped" + : result.type() == TestPartResult::kFatalFailure + ? "Fatal failure" + : "Non-fatal failure") + << ":\n" + << result.message() << std::endl; +} + +// Appends a TestPartResult to the array. +void TestPartResultArray::Append(const TestPartResult &result) { + array_.push_back(result); +} + +// Returns the TestPartResult at the given index (0-based). +const TestPartResult &TestPartResultArray::GetTestPartResult(int index) const { + if (index < 0 || index >= size()) { + printf("\nInvalid index (%d) into TestPartResultArray.\n", index); + internal::posix::Abort(); + } + + return array_[static_cast(index)]; +} + +// Returns the number of TestPartResult objects in the array. +int TestPartResultArray::size() const { + return static_cast(array_.size()); +} + +namespace internal { + +HasNewFatalFailureHelper::HasNewFatalFailureHelper() + : has_new_fatal_failure_(false), + original_reporter_( + GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); +} + +HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread( + original_reporter_); +} + +void HasNewFatalFailureHelper::ReportTestPartResult( + const TestPartResult &result) { + if (result.fatally_failed()) has_new_fatal_failure_ = true; + original_reporter_->ReportTestPartResult(result); +} + +} // namespace internal + +} // namespace testing diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-typed-test.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-typed-test.cc new file mode 100644 index 000000000..04effad17 --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest-typed-test.cc @@ -0,0 +1,117 @@ +// Copyright 2008 Google Inc. +// All Rights Reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "gtest/gtest-typed-test.h" + +#include "gtest/gtest.h" + +namespace testing { +namespace internal { + +#if GTEST_HAS_TYPED_TEST_P + +// Skips to the first non-space char in str. Returns an empty string if str +// contains only whitespace characters. +static const char *SkipSpaces(const char *str) { + while (IsSpace(*str)) str++; + return str; +} + +static std::vector SplitIntoTestNames(const char *src) { + std::vector name_vec; + src = SkipSpaces(src); + for (; src != nullptr; src = SkipComma(src)) { + name_vec.push_back(StripTrailingSpaces(GetPrefixUntilComma(src))); + } + return name_vec; +} + +// Verifies that registered_tests match the test names in +// registered_tests_; returns registered_tests if successful, or +// aborts the program otherwise. 
+const char *TypedTestSuitePState::VerifyRegisteredTestNames( + const char *test_suite_name, const char *file, int line, + const char *registered_tests) { + RegisterTypeParameterizedTestSuite(test_suite_name, CodeLocation(file, line)); + + typedef RegisteredTestsMap::const_iterator RegisteredTestIter; + registered_ = true; + + std::vector name_vec = SplitIntoTestNames(registered_tests); + + Message errors; + + std::set tests; + for (std::vector::const_iterator name_it = name_vec.begin(); + name_it != name_vec.end(); ++name_it) { + const std::string &name = *name_it; + if (tests.count(name) != 0) { + errors << "Test " << name << " is listed more than once.\n"; + continue; + } + + bool found = false; + for (RegisteredTestIter it = registered_tests_.begin(); + it != registered_tests_.end(); ++it) { + if (name == it->first) { + found = true; + break; + } + } + + if (found) { + tests.insert(name); + } else { + errors << "No test named " << name + << " can be found in this test suite.\n"; + } + } + + for (RegisteredTestIter it = registered_tests_.begin(); + it != registered_tests_.end(); ++it) { + if (tests.count(it->first) == 0) { + errors << "You forgot to list test " << it->first << ".\n"; + } + } + + const std::string &errors_str = errors.GetString(); + if (errors_str != "") { + fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), + errors_str.c_str()); + fflush(stderr); + posix::Abort(); + } + + return registered_tests; +} + +#endif // GTEST_HAS_TYPED_TEST_P + +} // namespace internal +} // namespace testing diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest.cc new file mode 100644 index 000000000..5b4037fec --- /dev/null +++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest.cc @@ -0,0 +1,6240 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
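VerifyRegisteredTestNames() above reduces to set arithmetic over two name collections: duplicates among the listed names, listed names that were never defined, and defined tests that were never listed. The core checks in isolation (the test names here are made up for the demo):

    #include <iostream>
    #include <set>
    #include <string>
    #include <vector>

    int main() {
      const std::set<std::string> defined = { "Works", "HandlesEmpty", "Overflows" };
      const std::vector<std::string> listed = { "Works", "Works", "Nope" };

      std::set<std::string> seen;
      for (const std::string &name : listed) {
        if (!seen.insert(name).second)
          std::cout << "Test " << name << " is listed more than once.\n";
        else if (defined.count(name) == 0)
          std::cout << "No test named " << name << " can be found.\n";
      }
      for (const std::string &name : defined)
        if (seen.count(name) == 0)
          std::cout << "You forgot to list test " << name << ".\n";
    }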
+ +// +// The Google C++ Testing and Mocking Framework (Google Test) + +#include "gtest/gtest.h" +#include "gtest/internal/custom/gtest.h" +#include "gtest/gtest-spi.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include + +#if GTEST_OS_LINUX + +#define GTEST_HAS_GETTIMEOFDAY_ 1 + +#include // NOLINT +#include // NOLINT +#include // NOLINT +// Declares vsnprintf(). This header is not available on Windows. +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include + +#elif GTEST_OS_ZOS +#define GTEST_HAS_GETTIMEOFDAY_ 1 +#include // NOLINT + +// On z/OS we additionally need strings.h for strcasecmp. +#include // NOLINT + +#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. + +#include // NOLINT +#undef min + +#elif GTEST_OS_WINDOWS // We are on Windows proper. + +#include // NOLINT +#undef min + +#ifdef _MSC_VER +#include // NOLINT +#include // NOLINT +#endif + +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT + +#if GTEST_OS_WINDOWS_MINGW +// MinGW has gettimeofday() but not _ftime64(). +#define GTEST_HAS_GETTIMEOFDAY_ 1 +#include // NOLINT +#endif // GTEST_OS_WINDOWS_MINGW + +#else + +// Assume other platforms have gettimeofday(). +#define GTEST_HAS_GETTIMEOFDAY_ 1 + +// cpplint thinks that the header is already included, so we want to +// silence it. +#include // NOLINT +#include // NOLINT + +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +#include +#endif + +#if GTEST_CAN_STREAM_RESULTS_ +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#endif + +#include "src/gtest-internal-inl.h" + +#if GTEST_OS_WINDOWS +#define vsnprintf _vsnprintf +#endif // GTEST_OS_WINDOWS + +#if GTEST_OS_MAC +#ifndef GTEST_OS_IOS +#include +#endif +#endif + +#if GTEST_HAS_ABSL +#include "absl/debugging/failure_signal_handler.h" +#include "absl/debugging/stacktrace.h" +#include "absl/debugging/symbolize.h" +#include "absl/strings/str_cat.h" +#endif // GTEST_HAS_ABSL + +namespace testing { + +using internal::CountIf; +using internal::ForEach; +using internal::GetElementOr; +using internal::Shuffle; + +// Constants. + +// A test whose test suite name or test name matches this filter is +// disabled and not run. +static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; + +// A test suite whose name matches this filter is considered a death +// test suite and will be run before test suites whose name doesn't +// match this filter. +static const char kDeathTestSuiteFilter[] = "*DeathTest:*DeathTest/*"; + +// A test filter that matches everything. +static const char kUniversalFilter[] = "*"; + +// The default output format. +static const char kDefaultOutputFormat[] = "xml"; +// The default output file. +static const char kDefaultOutputFile[] = "test_detail"; + +// The environment variable name for the test shard index. +static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; +// The environment variable name for the total number of test shards. +static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS"; +// The environment variable name for the test shard status file. +static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE"; + +namespace internal { + +// The text used in failure messages to indicate the start of the +// stack trace. 
+const char kStackTraceMarker[] = "\nStack trace:\n"; + +// g_help_flag is true if and only if the --help flag or an equivalent form +// is specified on the command line. +bool g_help_flag = false; + +// Utilty function to Open File for Writing +static FILE *OpenFileForWriting(const std::string &output_file) { + FILE *fileout = nullptr; + FilePath output_file_path(output_file); + FilePath output_dir(output_file_path.RemoveFileName()); + + if (output_dir.CreateDirectoriesRecursively()) { + fileout = posix::FOpen(output_file.c_str(), "w"); + } + if (fileout == nullptr) { + GTEST_LOG_(FATAL) << "Unable to open file \"" << output_file << "\""; + } + return fileout; +} + +} // namespace internal + +// Bazel passes in the argument to '--test_filter' via the TESTBRIDGE_TEST_ONLY +// environment variable. +static const char *GetDefaultFilter() { + const char *const testbridge_test_only = + internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY"); + if (testbridge_test_only != nullptr) { + return testbridge_test_only; + } + return kUniversalFilter; +} + +GTEST_DEFINE_bool_( + also_run_disabled_tests, + internal::BoolFromGTestEnv("also_run_disabled_tests", false), + "Run disabled tests too, in addition to the tests normally being run."); + +GTEST_DEFINE_bool_( + break_on_failure, internal::BoolFromGTestEnv("break_on_failure", false), + "True if and only if a failed assertion should be a debugger " + "break-point."); + +GTEST_DEFINE_bool_(catch_exceptions, + internal::BoolFromGTestEnv("catch_exceptions", true), + "True if and only if " GTEST_NAME_ + " should catch exceptions and treat them as test failures."); + +GTEST_DEFINE_string_( + color, internal::StringFromGTestEnv("color", "auto"), + "Whether to use colors in the output. Valid values: yes, no, " + "and auto. 'auto' means to use colors if the output is " + "being sent to a terminal and the TERM environment variable " + "is set to a terminal type that supports colors."); + +GTEST_DEFINE_string_( + filter, internal::StringFromGTestEnv("filter", GetDefaultFilter()), + "A colon-separated list of glob (not regex) patterns " + "for filtering the tests to run, optionally followed by a " + "'-' and a : separated list of negative patterns (tests to " + "exclude). A test is run if it matches one of the positive " + "patterns and does not match any of the negative patterns."); + +GTEST_DEFINE_bool_( + install_failure_signal_handler, + internal::BoolFromGTestEnv("install_failure_signal_handler", false), + "If true and supported on the current platform, " GTEST_NAME_ + " should " + "install a signal handler that dumps debugging information when fatal " + "signals are raised."); + +GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them."); + +// The net priority order after flag processing is thus: +// --gtest_output command line flag +// GTEST_OUTPUT environment variable +// XML_OUTPUT_FILE environment variable +// '' +GTEST_DEFINE_string_( + output, + internal::StringFromGTestEnv("output", + internal::OutputFlagAlsoCheckEnvVar().c_str()), + "A format (defaults to \"xml\" but can be specified to be \"json\"), " + "optionally followed by a colon and an output file name or directory. " + "A directory is indicated by a trailing pathname separator. " + "Examples: \"xml:filename.xml\", \"xml::directoryname/\". 
" + "If a directory is specified, output files will be created " + "within that directory, with file-names based on the test " + "executable's name and, if necessary, made unique by adding " + "digits."); + +GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true), + "True if and only if " GTEST_NAME_ + " should display elapsed time in text output."); + +GTEST_DEFINE_bool_(print_utf8, internal::BoolFromGTestEnv("print_utf8", true), + "True if and only if " GTEST_NAME_ + " prints UTF8 characters as text."); + +GTEST_DEFINE_int32_( + random_seed, internal::Int32FromGTestEnv("random_seed", 0), + "Random number seed to use when shuffling test orders. Must be in range " + "[1, 99999], or 0 to use a seed based on the current time."); + +GTEST_DEFINE_int32_( + repeat, internal::Int32FromGTestEnv("repeat", 1), + "How many times to repeat each test. Specify a negative number " + "for repeating forever. Useful for shaking out flaky tests."); + +GTEST_DEFINE_bool_(show_internal_stack_frames, false, + "True if and only if " GTEST_NAME_ + " should include internal stack frames when " + "printing test failure stack traces."); + +GTEST_DEFINE_bool_(shuffle, internal::BoolFromGTestEnv("shuffle", false), + "True if and only if " GTEST_NAME_ + " should randomize tests' order on every run."); + +GTEST_DEFINE_int32_( + stack_trace_depth, + internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), + "The maximum number of stack frames to print when an " + "assertion fails. The valid range is 0 through 100, inclusive."); + +GTEST_DEFINE_string_( + stream_result_to, internal::StringFromGTestEnv("stream_result_to", ""), + "This flag specifies the host name and the port number on which to stream " + "test results. Example: \"localhost:555\". The flag is effective only on " + "Linux."); + +GTEST_DEFINE_bool_( + throw_on_failure, internal::BoolFromGTestEnv("throw_on_failure", false), + "When this flag is specified, a failed assertion will throw an exception " + "if exceptions are enabled or exit the program with a non-zero code " + "otherwise. For use with an external test framework."); + +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DEFINE_string_( + flagfile, internal::StringFromGTestEnv("flagfile", ""), + "This flag specifies the flagfile to read command-line flags from."); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + +namespace internal { + +// Generates a random number from [0, range), using a Linear +// Congruential Generator (LCG). Crashes if 'range' is 0 or greater +// than kMaxRange. +uint32_t Random::Generate(uint32_t range) { + // These constants are the same as are used in glibc's rand(3). + // Use wider types than necessary to prevent unsigned overflow diagnostics. + state_ = static_cast(1103515245ULL * state_ + 12345U) % kMaxRange; + + GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0)."; + GTEST_CHECK_(range <= kMaxRange) + << "Generation of a number in [0, " << range << ") was requested, " + << "but this can only generate numbers in [0, " << kMaxRange << ")."; + + // Converting via modulus introduces a bit of downward bias, but + // it's simple, and a linear congruential generator isn't too good + // to begin with. + return state_ % range; +} + +// GTestIsInitialized() returns true if and only if the user has initialized +// Google Test. Useful for catching the user mistake of not initializing +// Google Test before calling RUN_ALL_TESTS(). 
+static bool GTestIsInitialized() { return GetArgvs().size() > 0; }
+
+// Iterates over a vector of TestSuites, keeping a running sum of the
+// results of calling a given int-returning method on each.
+// Returns the sum.
+static int SumOverTestSuiteList(const std::vector<TestSuite *> &case_list,
+                                int (TestSuite::*method)() const) {
+  int sum = 0;
+  for (size_t i = 0; i < case_list.size(); i++) {
+    sum += (case_list[i]->*method)();
+  }
+  return sum;
+}
+
+// Returns true if and only if the test suite passed.
+static bool TestSuitePassed(const TestSuite *test_suite) {
+  return test_suite->should_run() && test_suite->Passed();
+}
+
+// Returns true if and only if the test suite failed.
+static bool TestSuiteFailed(const TestSuite *test_suite) {
+  return test_suite->should_run() && test_suite->Failed();
+}
+
+// Returns true if and only if test_suite contains at least one test that
+// should run.
+static bool ShouldRunTestSuite(const TestSuite *test_suite) {
+  return test_suite->should_run();
+}
+
+// AssertHelper constructor.
+AssertHelper::AssertHelper(TestPartResult::Type type, const char *file,
+                           int line, const char *message)
+    : data_(new AssertHelperData(type, file, line, message)) {}
+
+AssertHelper::~AssertHelper() { delete data_; }
+
+// Message assignment, for assertion streaming support.
+void AssertHelper::operator=(const Message &message) const {
+  UnitTest::GetInstance()->AddTestPartResult(
+      data_->type, data_->file, data_->line,
+      AppendUserMessage(data_->message, message),
+      UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1)
+      // Skips the stack frame for this function itself.
+      );  // NOLINT
+}
+
+namespace {
+
+// When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P
+// to create test cases for it, a synthetic test case is
+// inserted to report either an error or a log message.
+//
+// This configuration bit will likely be removed at some point.
+constexpr bool kErrorOnUninstantiatedParameterizedTest = false;
+constexpr bool kErrorOnUninstantiatedTypeParameterizedTest = false;
+
+// A test that fails at a given file/line location with a given message.
+class FailureTest : public Test {
+ public:
+  explicit FailureTest(const CodeLocation &loc, std::string error_message,
+                       bool as_error)
+      : loc_(loc), error_message_(std::move(error_message)),
+        as_error_(as_error) {}
+
+  void TestBody() override {
+    if (as_error_) {
+      AssertHelper(TestPartResult::kNonFatalFailure, loc_.file.c_str(),
+                   loc_.line, "") = Message() << error_message_;
+    } else {
+      std::cout << error_message_ << std::endl;
+    }
+  }
+
+ private:
+  const CodeLocation loc_;
+  const std::string error_message_;
+  const bool as_error_;
+};
+
+}  // namespace
+
+std::set<std::string> *GetIgnoredParameterizedTestSuites() {
+  return UnitTest::GetInstance()->impl()->ignored_parameterized_test_suites();
+}
+
+// Adds a given test_suite to the list of those allowed to go uninstantiated.
+MarkAsIgnored::MarkAsIgnored(const char *test_suite) {
+  GetIgnoredParameterizedTestSuites()->insert(test_suite);
+}
+
+// If this parameterized test suite has no instantiations (and that
+// has not been marked as okay), emit a test case reporting that.
+void InsertSyntheticTestCase(const std::string &name, CodeLocation location,
+                             bool has_test_p) {
+  const auto &ignored = *GetIgnoredParameterizedTestSuites();
+  if (ignored.find(name) != ignored.end()) return;
+
+  const char kMissingInstantiation[] =  //
+      " is defined via TEST_P, but never instantiated. None of the test cases "
+      "will run. Either no INSTANTIATE_TEST_SUITE_P is provided or the only "
+      "ones provided expand to nothing."
+      "\n\n"
+      "Ideally, TEST_P definitions should only ever be included as part of "
+      "binaries that intend to use them. (As opposed to, for example, being "
+      "placed in a library that may be linked in to get other utilities.)";
+
+  const char kMissingTestCase[] =  //
+      " is instantiated via INSTANTIATE_TEST_SUITE_P, but no tests are "
+      "defined via TEST_P. No test cases will run."
+      "\n\n"
+      "Ideally, INSTANTIATE_TEST_SUITE_P should only ever be invoked from "
+      "code that always depends on code that provides TEST_P. Failing to do "
+      "so is often an indication of dead code, e.g. the last TEST_P was "
+      "removed but the rest got left behind.";
+
+  std::string message =
+      "Parameterized test suite " + name +
+      (has_test_p ? kMissingInstantiation : kMissingTestCase) +
+      "\n\n"
+      "To suppress this error for this test suite, insert the following line "
+      "(in a non-header) in the namespace it is defined in:"
+      "\n\n"
+      "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");";
+
+  std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">";
+  RegisterTest(  //
+      "GoogleTestVerification", full_name.c_str(),
+      nullptr,  // No type parameter.
+      nullptr,  // No value parameter.
+      location.file.c_str(), location.line, [message, location] {
+        return new FailureTest(location, message,
+                               kErrorOnUninstantiatedParameterizedTest);
+      });
+}
+
+void RegisterTypeParameterizedTestSuite(const char *test_suite_name,
+                                        CodeLocation code_location) {
+  GetUnitTestImpl()->type_parameterized_test_registry().RegisterTestSuite(
+      test_suite_name, code_location);
+}
+
+void RegisterTypeParameterizedTestSuiteInstantiation(const char *case_name) {
+  GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation(
+      case_name);
+}
+
+void TypeParameterizedTestSuiteRegistry::RegisterTestSuite(
+    const char *test_suite_name, CodeLocation code_location) {
+  suites_.emplace(std::string(test_suite_name),
+                  TypeParameterizedTestSuiteInfo(code_location));
+}
+
+void TypeParameterizedTestSuiteRegistry::RegisterInstantiation(
+    const char *test_suite_name) {
+  auto it = suites_.find(std::string(test_suite_name));
+  if (it != suites_.end()) {
+    it->second.instantiated = true;
+  } else {
+    GTEST_LOG_(ERROR) << "Unknown type parameterized test suite '"
+                      << test_suite_name << "'";
+  }
+}
+
+void TypeParameterizedTestSuiteRegistry::CheckForInstantiations() {
+  const auto &ignored = *GetIgnoredParameterizedTestSuites();
+  for (const auto &testcase : suites_) {
+    if (testcase.second.instantiated) continue;
+    if (ignored.find(testcase.first) != ignored.end()) continue;
+
+    std::string message =
+        "Type parameterized test suite " + testcase.first +
+        " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated "
+        "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run."
+        "\n\n"
+        "Ideally, TYPED_TEST_P definitions should only ever be included as "
+        "part of binaries that intend to use them. (As opposed to, for "
+        "example, being placed in a library that may be linked in to get other "
+        "utilities.)"
+        "\n\n"
+        "To suppress this error for this test suite, insert the following line "
+        "(in a non-header) in the namespace it is defined in:"
+        "\n\n"
+        "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
+        testcase.first + ");";
+
+    std::string full_name =
+        "UninstantiatedTypeParameterizedTestSuite<" + testcase.first + ">";
+    RegisterTest(  //
+        "GoogleTestVerification", full_name.c_str(),
+        nullptr,  // No type parameter.
+        nullptr,  // No value parameter.
+        testcase.second.code_location.file.c_str(),
+        testcase.second.code_location.line, [message, testcase] {
+          return new FailureTest(testcase.second.code_location, message,
+                                 kErrorOnUninstantiatedTypeParameterizedTest);
+        });
+  }
+}
+
+// A copy of all command line arguments. Set by InitGoogleTest().
+static ::std::vector<std::string> g_argvs;
+
+::std::vector<std::string> GetArgvs() {
+#if defined(GTEST_CUSTOM_GET_ARGVS_)
+  // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or
+  // ::string. This code converts it to the appropriate type.
+  const auto &custom = GTEST_CUSTOM_GET_ARGVS_();
+  return ::std::vector<std::string>(custom.begin(), custom.end());
+#else   // defined(GTEST_CUSTOM_GET_ARGVS_)
+  return g_argvs;
+#endif  // defined(GTEST_CUSTOM_GET_ARGVS_)
+}
+
+// Returns the current application's name, removing directory path if that
+// is present.
+FilePath GetCurrentExecutableName() {
+  FilePath result;
+
+#if GTEST_OS_WINDOWS || GTEST_OS_OS2
+  result.Set(FilePath(GetArgvs()[0]).RemoveExtension("exe"));
+#else
+  result.Set(FilePath(GetArgvs()[0]));
+#endif  // GTEST_OS_WINDOWS
+
+  return result.RemoveDirectoryName();
+}
+
+// Functions for processing the gtest_output flag.
+
+// Returns the output format, or "" for normal printed output.
+std::string UnitTestOptions::GetOutputFormat() {
+  const char *const gtest_output_flag = GTEST_FLAG(output).c_str();
+  const char *const colon = strchr(gtest_output_flag, ':');
+  return (colon == nullptr)
+             ? std::string(gtest_output_flag)
+             : std::string(gtest_output_flag,
+                           static_cast<size_t>(colon - gtest_output_flag));
+}
+
+// Returns the name of the requested output file, or the default if none
+// was explicitly specified.
+std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
+  const char *const gtest_output_flag = GTEST_FLAG(output).c_str();
+
+  std::string format = GetOutputFormat();
+  if (format.empty()) format = std::string(kDefaultOutputFormat);
+
+  const char *const colon = strchr(gtest_output_flag, ':');
+  if (colon == nullptr)
+    return internal::FilePath::MakeFileName(
+               internal::FilePath(
+                   UnitTest::GetInstance()->original_working_dir()),
+               internal::FilePath(kDefaultOutputFile), 0, format.c_str())
+        .string();
+
+  internal::FilePath output_name(colon + 1);
+  if (!output_name.IsAbsolutePath())
+    output_name = internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(colon + 1));
+
+  if (!output_name.IsDirectory()) return output_name.string();
+
+  internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
+      output_name, internal::GetCurrentExecutableName(),
+      GetOutputFormat().c_str()));
+  return result.string();
+}
+
+// Returns true if and only if the wildcard pattern matches the string.
+// The first ':' or '\0' character in pattern marks the end of it.
+//
+// This recursive algorithm isn't very efficient, but is clear and
+// works well enough for matching test names, which are short.
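As a standalone illustration of that recursion, here is a sketch with the same '?' and '*' semantics, written over std::string rather than the ':'-terminated C strings the real implementation below consumes:

    #include <cassert>
    #include <string>

    // Matches gtest-style glob patterns: '?' = any one char, '*' = any run.
    static bool GlobMatch(const std::string &pat, const std::string &str,
                          size_t p = 0, size_t s = 0) {
      if (p == pat.size()) return s == str.size();
      switch (pat[p]) {
        case '?':  // consume exactly one character
          return s < str.size() && GlobMatch(pat, str, p + 1, s + 1);
        case '*':  // consume one subject char, or move past the star
          return (s < str.size() && GlobMatch(pat, str, p, s + 1)) ||
                 GlobMatch(pat, str, p + 1, s);
        default:   // ordinary character matches itself
          return s < str.size() && pat[p] == str[s] &&
                 GlobMatch(pat, str, p + 1, s + 1);
      }
    }

    int main() {
      assert(GlobMatch("FooTest.*", "FooTest.Bar"));
      assert(GlobMatch("*/DISABLED_*", "Ints/DISABLED_Overflow"));
      assert(!GlobMatch("FooTest.?", "FooTest.Bar"));
    }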
+bool UnitTestOptions::PatternMatchesString(const char *pattern,
+                                           const char *str) {
+  switch (*pattern) {
+    case '\0':
+    case ':':  // Either ':' or '\0' marks the end of the pattern.
+      return *str == '\0';
+    case '?':  // Matches any single character.
+      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
+    case '*':  // Matches any string (possibly empty) of characters.
+      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
+             PatternMatchesString(pattern + 1, str);
+    default:  // Non-special character. Matches itself.
+      return *pattern == *str && PatternMatchesString(pattern + 1, str + 1);
+  }
+}
+
+bool UnitTestOptions::MatchesFilter(const std::string &name,
+                                    const char *filter) {
+  const char *cur_pattern = filter;
+  for (;;) {
+    if (PatternMatchesString(cur_pattern, name.c_str())) {
+      return true;
+    }
+
+    // Finds the next pattern in the filter.
+    cur_pattern = strchr(cur_pattern, ':');
+
+    // Returns if no more pattern can be found.
+    if (cur_pattern == nullptr) {
+      return false;
+    }
+
+    // Skips the pattern separator (the ':' character).
+    cur_pattern++;
+  }
+}
+
+// Returns true if and only if the user-specified filter matches the test
+// suite name and the test name.
+bool UnitTestOptions::FilterMatchesTest(const std::string &test_suite_name,
+                                        const std::string &test_name) {
+  const std::string &full_name = test_suite_name + "." + test_name.c_str();
+
+  // Split --gtest_filter at '-', if there is one, to separate into
+  // positive filter and negative filter portions
+  const char *const p = GTEST_FLAG(filter).c_str();
+  const char *const dash = strchr(p, '-');
+  std::string positive;
+  std::string negative;
+  if (dash == nullptr) {
+    positive = GTEST_FLAG(filter).c_str();  // Whole string is a positive filter
+    negative = "";
+  } else {
+    positive = std::string(p, dash);   // Everything up to the dash
+    negative = std::string(dash + 1);  // Everything after the dash
+    if (positive.empty()) {
+      // Treat '-test1' as the same as '*-test1'
+      positive = kUniversalFilter;
+    }
+  }
+
+  // A filter is a colon-separated list of patterns. It matches a
+  // test if any pattern in it matches the test.
+  return (MatchesFilter(full_name, positive.c_str()) &&
+          !MatchesFilter(full_name, negative.c_str()));
+}
+
+#if GTEST_HAS_SEH
+// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the
+// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise.
+// This function is useful as an __except condition.
+int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
+  // Google Test should handle a SEH exception if:
+  //   1. the user wants it to, AND
+  //   2. this is not a breakpoint exception, AND
+  //   3. this is not a C++ exception (VC++ implements them via SEH,
+  //      apparently).
+  //
+  // SEH exception code for C++ exceptions.
+  // (see http://support.microsoft.com/kb/185294 for more information).
+  const DWORD kCxxExceptionCode = 0xe06d7363;
+
+  bool should_handle = true;
+
+  if (!GTEST_FLAG(catch_exceptions))
+    should_handle = false;
+  else if (exception_code == EXCEPTION_BREAKPOINT)
+    should_handle = false;
+  else if (exception_code == kCxxExceptionCode)
+    should_handle = false;
+
+  return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH;
+}
+#endif  // GTEST_HAS_SEH
+
+}  // namespace internal
+
+// The c'tor sets this object as the test part result reporter used by
+// Google Test. The 'result' parameter specifies where to report the
+// results. Intercepts only failures from the current thread.
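The positive/negative split performed by FilterMatchesTest() above can be seen in isolation. This sketch reproduces just the '-' handling, including the rule that an empty positive half means '*':

    #include <iostream>
    #include <string>

    int main() {
      const std::string filter = "-FooTest.Crashes:BarTest.*";
      const size_t dash = filter.find('-');
      std::string positive =
          dash == std::string::npos ? filter : filter.substr(0, dash);
      std::string negative =
          dash == std::string::npos ? "" : filter.substr(dash + 1);
      if (positive.empty()) positive = "*";  // '-X' means '*-X'
      std::cout << "positive: " << positive << "\n"   // *
                << "negative: " << negative << "\n";  // FooTest.Crashes:BarTest.*
    }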
+ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + TestPartResultArray *result) + : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) { + Init(); +} + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + InterceptMode intercept_mode, TestPartResultArray *result) + : intercept_mode_(intercept_mode), result_(result) { + Init(); +} + +void ScopedFakeTestPartResultReporter::Init() { + internal::UnitTestImpl *const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + old_reporter_ = impl->GetGlobalTestPartResultReporter(); + impl->SetGlobalTestPartResultReporter(this); + } else { + old_reporter_ = impl->GetTestPartResultReporterForCurrentThread(); + impl->SetTestPartResultReporterForCurrentThread(this); + } +} + +// The d'tor restores the test part result reporter used by Google Test +// before. +ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() { + internal::UnitTestImpl *const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + impl->SetGlobalTestPartResultReporter(old_reporter_); + } else { + impl->SetTestPartResultReporterForCurrentThread(old_reporter_); + } +} + +// Increments the test part result count and remembers the result. +// This method is from the TestPartResultReporterInterface interface. +void ScopedFakeTestPartResultReporter::ReportTestPartResult( + const TestPartResult &result) { + result_->Append(result); +} + +namespace internal { + +// Returns the type ID of ::testing::Test. We should always call this +// instead of GetTypeId< ::testing::Test>() to get the type ID of +// testing::Test. This is to work around a suspected linker bug when +// using Google Test as a framework on Mac OS X. The bug causes +// GetTypeId< ::testing::Test>() to return different values depending +// on whether the call is from the Google Test framework itself or +// from user test code. GetTestTypeId() is guaranteed to always +// return the same value, as it always calls GetTypeId<>() from the +// gtest.cc, which is within the Google Test framework. +TypeId GetTestTypeId() { return GetTypeId(); } + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); + +// This predicate-formatter checks that 'results' contains a test part +// failure of the given type and that the failure message contains the +// given substring. +static AssertionResult HasOneFailure(const char * /* results_expr */, + const char * /* type_expr */, + const char * /* substr_expr */, + const TestPartResultArray &results, + TestPartResult::Type type, + const std::string &substr) { + const std::string expected(type == TestPartResult::kFatalFailure + ? 
"1 fatal failure" + : "1 non-fatal failure"); + Message msg; + if (results.size() != 1) { + msg << "Expected: " << expected << "\n" + << " Actual: " << results.size() << " failures"; + for (int i = 0; i < results.size(); i++) { + msg << "\n" << results.GetTestPartResult(i); + } + return AssertionFailure() << msg; + } + + const TestPartResult &r = results.GetTestPartResult(0); + if (r.type() != type) { + return AssertionFailure() << "Expected: " << expected << "\n" + << " Actual:\n" + << r; + } + + if (strstr(r.message(), substr.c_str()) == nullptr) { + return AssertionFailure() + << "Expected: " << expected << " containing \"" << substr << "\"\n" + << " Actual:\n" + << r; + } + + return AssertionSuccess(); +} + +// The constructor of SingleFailureChecker remembers where to look up +// test part results, what type of failure we expect, and what +// substring the failure message should contain. +SingleFailureChecker::SingleFailureChecker(const TestPartResultArray *results, + TestPartResult::Type type, + const std::string &substr) + : results_(results), type_(type), substr_(substr) {} + +// The destructor of SingleFailureChecker verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +SingleFailureChecker::~SingleFailureChecker() { + EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_); +} + +DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( + UnitTestImpl *unit_test) + : unit_test_(unit_test) {} + +void DefaultGlobalTestPartResultReporter::ReportTestPartResult( + const TestPartResult &result) { + unit_test_->current_test_result()->AddTestPartResult(result); + unit_test_->listeners()->repeater()->OnTestPartResult(result); +} + +DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( + UnitTestImpl *unit_test) + : unit_test_(unit_test) {} + +void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( + const TestPartResult &result) { + unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result); +} + +// Returns the global test part result reporter. +TestPartResultReporterInterface * +UnitTestImpl::GetGlobalTestPartResultReporter() { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + return global_test_part_result_repoter_; +} + +// Sets the global test part result reporter. +void UnitTestImpl::SetGlobalTestPartResultReporter( + TestPartResultReporterInterface *reporter) { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + global_test_part_result_repoter_ = reporter; +} + +// Returns the test part result reporter for the current thread. +TestPartResultReporterInterface * +UnitTestImpl::GetTestPartResultReporterForCurrentThread() { + return per_thread_test_part_result_reporter_.get(); +} + +// Sets the test part result reporter for the current thread. +void UnitTestImpl::SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface *reporter) { + per_thread_test_part_result_reporter_.set(reporter); +} + +// Gets the number of successful test suites. +int UnitTestImpl::successful_test_suite_count() const { + return CountIf(test_suites_, TestSuitePassed); +} + +// Gets the number of failed test suites. +int UnitTestImpl::failed_test_suite_count() const { + return CountIf(test_suites_, TestSuiteFailed); +} + +// Gets the number of all test suites. 
+int UnitTestImpl::total_test_suite_count() const { + return static_cast(test_suites_.size()); +} + +// Gets the number of all test suites that contain at least one test +// that should run. +int UnitTestImpl::test_suite_to_run_count() const { + return CountIf(test_suites_, ShouldRunTestSuite); +} + +// Gets the number of successful tests. +int UnitTestImpl::successful_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::successful_test_count); +} + +// Gets the number of skipped tests. +int UnitTestImpl::skipped_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::skipped_test_count); +} + +// Gets the number of failed tests. +int UnitTestImpl::failed_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::failed_test_count); +} + +// Gets the number of disabled tests that will be reported in the XML report. +int UnitTestImpl::reportable_disabled_test_count() const { + return SumOverTestSuiteList(test_suites_, + &TestSuite::reportable_disabled_test_count); +} + +// Gets the number of disabled tests. +int UnitTestImpl::disabled_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::disabled_test_count); +} + +// Gets the number of tests to be printed in the XML report. +int UnitTestImpl::reportable_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::reportable_test_count); +} + +// Gets the number of all tests. +int UnitTestImpl::total_test_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::total_test_count); +} + +// Gets the number of tests that should run. +int UnitTestImpl::test_to_run_count() const { + return SumOverTestSuiteList(test_suites_, &TestSuite::test_to_run_count); +} + +// Returns the current OS stack trace as an std::string. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// CurrentOsStackTraceExceptTop(1), Foo() will be included in the +// trace but Bar() and CurrentOsStackTraceExceptTop() won't. +std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { + return os_stack_trace_getter()->CurrentStackTrace( + static_cast(GTEST_FLAG(stack_trace_depth)), skip_count + 1 + // Skips the user-specified number of frames plus this function + // itself. + ); // NOLINT +} + +// Returns the current time in milliseconds. +TimeInMillis GetTimeInMillis() { +#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__) + // Difference between 1970-01-01 and 1601-01-01 in milliseconds. + // http://analogous.blogspot.com/2005/04/epoch.html + const TimeInMillis kJavaEpochToWinFileTimeDelta = + static_cast(116444736UL) * 100000UL; + const DWORD kTenthMicrosInMilliSecond = 10000; + + SYSTEMTIME now_systime; + FILETIME now_filetime; + ULARGE_INTEGER now_int64; + GetSystemTime(&now_systime); + if (SystemTimeToFileTime(&now_systime, &now_filetime)) { + now_int64.LowPart = now_filetime.dwLowDateTime; + now_int64.HighPart = now_filetime.dwHighDateTime; + now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) - + kJavaEpochToWinFileTimeDelta; + return now_int64.QuadPart; + } + return 0; +#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_ + __timeb64 now; + + // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 + // (deprecated function) there. 
+ GTEST_DISABLE_MSC_DEPRECATED_PUSH_() + _ftime64(&now); + GTEST_DISABLE_MSC_DEPRECATED_POP_() + + return static_cast(now.time) * 1000 + now.millitm; +#elif GTEST_HAS_GETTIMEOFDAY_ + struct timeval now; + gettimeofday(&now, nullptr); + return static_cast(now.tv_sec) * 1000 + now.tv_usec / 1000; +#else +#error "Don't know how to get the current time on your system." +#endif +} + +// Utilities + +// class String. + +#if GTEST_OS_WINDOWS_MOBILE +// Creates a UTF-16 wide string from the given ANSI string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the wide string, or NULL if the +// input is NULL. +LPCWSTR String::AnsiToUtf16(const char *ansi) { + if (!ansi) return nullptr; + const int length = strlen(ansi); + const int unicode_length = + MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0); + WCHAR *unicode = new WCHAR[unicode_length + 1]; + MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length); + unicode[unicode_length] = 0; + return unicode; +} + +// Creates an ANSI string from the given wide string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the ANSI string, or NULL if the +// input is NULL. +const char *String::Utf16ToAnsi(LPCWSTR utf16_str) { + if (!utf16_str) return nullptr; + const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr, + 0, nullptr, nullptr); + char *ansi = new char[ansi_length + 1]; + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, ansi, ansi_length, nullptr, + nullptr); + ansi[ansi_length] = 0; + return ansi; +} + +#endif // GTEST_OS_WINDOWS_MOBILE + +// Compares two C strings. Returns true if and only if they have the same +// content. +// +// Unlike strcmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CStringEquals(const char *lhs, const char *rhs) { + if (lhs == nullptr) return rhs == nullptr; + + if (rhs == nullptr) return false; + + return strcmp(lhs, rhs) == 0; +} + +#if GTEST_HAS_STD_WSTRING + +// Converts an array of wide chars to a narrow string using the UTF-8 +// encoding, and streams the result to the given Message object. +static void StreamWideCharsToMessage(const wchar_t *wstr, size_t length, + Message *msg) { + for (size_t i = 0; i != length;) { // NOLINT + if (wstr[i] != L'\0') { + *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); + while (i != length && wstr[i] != L'\0') i++; + } else { + *msg << '\0'; + i++; + } + } +} + +#endif // GTEST_HAS_STD_WSTRING + +void SplitString(const ::std::string &str, char delimiter, + ::std::vector< ::std::string> *dest) { + ::std::vector< ::std::string> parsed; + ::std::string::size_type pos = 0; + while (::testing::internal::AlwaysTrue()) { + const ::std::string::size_type colon = str.find(delimiter, pos); + if (colon == ::std::string::npos) { + parsed.push_back(str.substr(pos)); + break; + } else { + parsed.push_back(str.substr(pos, colon - pos)); + pos = colon + 1; + } + } + dest->swap(parsed); +} + +} // namespace internal + +// Constructs an empty Message. +// We allocate the stringstream separately because otherwise each use of +// ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's +// stack frame leading to huge stack frames in some cases; gcc does not reuse +// the stack space. 
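SplitString() above scans with find(), emitting the piece before each delimiter and always a final (possibly empty) piece, so "a:b::c" yields four parts. Equivalent logic in brief (an illustrative re-implementation, not the gtest function itself):

    #include <iostream>
    #include <string>
    #include <vector>

    static std::vector<std::string> Split(const std::string &str, char delim) {
      std::vector<std::string> out;
      std::string::size_type pos = 0;
      for (;;) {
        const std::string::size_type next = str.find(delim, pos);
        if (next == std::string::npos) {
          out.push_back(str.substr(pos));  // trailing piece, may be empty
          return out;
        }
        out.push_back(str.substr(pos, next - pos));
        pos = next + 1;
      }
    }

    int main() {
      for (const std::string &part : Split("a:b::c", ':'))
        std::cout << '[' << part << "] ";  // [a] [b] [] [c]
      std::cout << '\n';
    }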
+Message::Message() : ss_(new ::std::stringstream) { + // By default, we want there to be enough precision when printing + // a double to a Message. + *ss_ << std::setprecision(std::numeric_limits::digits10 + 2); +} + +// These two overloads allow streaming a wide C string to a Message +// using the UTF-8 encoding. +Message &Message::operator<<(const wchar_t *wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} +Message &Message::operator<<(wchar_t *wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); +} + +#if GTEST_HAS_STD_WSTRING +// Converts the given wide string to a narrow string using the UTF-8 +// encoding, and streams the result to this Message object. +Message &Message::operator<<(const ::std::wstring &wstr) { + internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); + return *this; +} +#endif // GTEST_HAS_STD_WSTRING + +// Gets the text streamed to this object so far as an std::string. +// Each '\0' character in the buffer is replaced with "\\0". +std::string Message::GetString() const { + return internal::StringStreamToString(ss_.get()); +} + +// AssertionResult constructors. +// Used in EXPECT_TRUE/FALSE(assertion_result). +AssertionResult::AssertionResult(const AssertionResult &other) + : success_(other.success_), + message_(other.message_.get() != nullptr + ? new ::std::string(*other.message_) + : static_cast< ::std::string *>(nullptr)) {} + +// Swaps two AssertionResults. +void AssertionResult::swap(AssertionResult &other) { + using std::swap; + swap(success_, other.success_); + swap(message_, other.message_); +} + +// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. +AssertionResult AssertionResult::operator!() const { + AssertionResult negation(!success_); + if (message_.get() != nullptr) negation << *message_; + return negation; +} + +// Makes a successful assertion result. +AssertionResult AssertionSuccess() { return AssertionResult(true); } + +// Makes a failed assertion result. +AssertionResult AssertionFailure() { return AssertionResult(false); } + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << message. +AssertionResult AssertionFailure(const Message &message) { + return AssertionFailure() << message; +} + +namespace internal { + +namespace edit_distance { +std::vector CalculateOptimalEdits(const std::vector &left, + const std::vector &right) { + std::vector > costs( + left.size() + 1, std::vector(right.size() + 1)); + std::vector > best_move( + left.size() + 1, std::vector(right.size() + 1)); + + // Populate for empty right. + for (size_t l_i = 0; l_i < costs.size(); ++l_i) { + costs[l_i][0] = static_cast(l_i); + best_move[l_i][0] = kRemove; + } + // Populate for empty left. + for (size_t r_i = 1; r_i < costs[0].size(); ++r_i) { + costs[0][r_i] = static_cast(r_i); + best_move[0][r_i] = kAdd; + } + + for (size_t l_i = 0; l_i < left.size(); ++l_i) { + for (size_t r_i = 0; r_i < right.size(); ++r_i) { + if (left[l_i] == right[r_i]) { + // Found a match. Consume it. 
+
+namespace {
+
+// Helper class to convert strings into ids with deduplication.
+class InternalStrings {
+ public:
+  size_t GetId(const std::string &str) {
+    IdMap::iterator it = ids_.find(str);
+    if (it != ids_.end()) return it->second;
+    size_t id = ids_.size();
+    return ids_[str] = id;
+  }
+
+ private:
+  typedef std::map<std::string, size_t> IdMap;
+  IdMap ids_;
+};
+
+}  // namespace
+
+std::vector<EditType> CalculateOptimalEdits(
+    const std::vector<std::string> &left,
+    const std::vector<std::string> &right) {
+  std::vector<size_t> left_ids, right_ids;
+  {
+    InternalStrings intern_table;
+    for (size_t i = 0; i < left.size(); ++i) {
+      left_ids.push_back(intern_table.GetId(left[i]));
+    }
+    for (size_t i = 0; i < right.size(); ++i) {
+      right_ids.push_back(intern_table.GetId(right[i]));
+    }
+  }
+  return CalculateOptimalEdits(left_ids, right_ids);
+}
+
+namespace {
+
+// Helper class that holds the state for one hunk and prints it out to the
+// stream.
+// It reorders adds/removes when possible to group all removes before all
+// adds. It also adds the hunk header before printing into the stream.
+class Hunk {
+ public:
+  Hunk(size_t left_start, size_t right_start)
+      : left_start_(left_start), right_start_(right_start), adds_(),
+        removes_(), common_() {}
+
+  void PushLine(char edit, const char *line) {
+    switch (edit) {
+      case ' ':
+        ++common_;
+        FlushEdits();
+        hunk_.push_back(std::make_pair(' ', line));
+        break;
+      case '-':
+        ++removes_;
+        hunk_removes_.push_back(std::make_pair('-', line));
+        break;
+      case '+':
+        ++adds_;
+        hunk_adds_.push_back(std::make_pair('+', line));
+        break;
+    }
+  }
+
+  void PrintTo(std::ostream *os) {
+    PrintHeader(os);
+    FlushEdits();
+    for (std::list<std::pair<char, const char *> >::const_iterator it =
+             hunk_.begin();
+         it != hunk_.end(); ++it) {
+      *os << it->first << it->second << "\n";
+    }
+  }
+
+  bool has_edits() const { return adds_ || removes_; }
+
+ private:
+  void FlushEdits() {
+    hunk_.splice(hunk_.end(), hunk_removes_);
+    hunk_.splice(hunk_.end(), hunk_adds_);
+  }
+
+  // Print a unified diff header for one hunk.
+  // The format is
+  //   "@@ -<left_start>,<left_length> +<right_start>,<right_length> @@"
+  // where the left/right parts are omitted if unnecessary.
+  void PrintHeader(std::ostream *ss) const {
+    *ss << "@@ ";
+    if (removes_) {
+      *ss << "-" << left_start_ << "," << (removes_ + common_);
+    }
+    if (removes_ && adds_) {
+      *ss << " ";
+    }
+    if (adds_) {
+      *ss << "+" << right_start_ << "," << (adds_ + common_);
+    }
+    *ss << " @@\n";
+  }
+
+  size_t left_start_, right_start_;
+  size_t adds_, removes_, common_;
+  std::list<std::pair<char, const char *> > hunk_, hunk_adds_, hunk_removes_;
+};
+
+}  // namespace
+
+// Create a list of diff hunks in Unified diff format.
+// Each hunk has a header generated by PrintHeader above plus a body with
+// lines prefixed with ' ' for no change, '-' for deletion and '+' for
+// addition.
+// 'context' represents the desired unchanged prefix/suffix around the diff.
+// If two hunks are close enough that their contexts overlap, then they are
+// joined into one hunk.
+std::string CreateUnifiedDiff(const std::vector<std::string> &left,
+                              const std::vector<std::string> &right,
+                              size_t context) {
+  const std::vector<EditType> edits = CalculateOptimalEdits(left, right);
+
+  size_t l_i = 0, r_i = 0, edit_i = 0;
+  std::stringstream ss;
+  while (edit_i < edits.size()) {
+    // Find first edit.
+    while (edit_i < edits.size() && edits[edit_i] == kMatch) {
+      ++l_i;
+      ++r_i;
+      ++edit_i;
+    }
+
+    // Find the first line to include in the hunk.
+    const size_t prefix_context = std::min(l_i, context);
+    Hunk hunk(l_i - prefix_context + 1, r_i - prefix_context + 1);
+    for (size_t i = prefix_context; i > 0; --i) {
+      hunk.PushLine(' ', left[l_i - i].c_str());
+    }
+
+    // Iterate the edits until we find enough suffix for the hunk or the input
+    // is over.
+    size_t n_suffix = 0;
+    for (; edit_i < edits.size(); ++edit_i) {
+      if (n_suffix >= context) {
+        // Continue only if the next hunk is very close.
+        auto it = edits.begin() + static_cast<ptrdiff_t>(edit_i);
+        while (it != edits.end() && *it == kMatch) ++it;
+        if (it == edits.end() ||
+            static_cast<size_t>(it - edits.begin()) - edit_i >= context) {
+          // There is no next edit or it is too far away.
+          break;
+        }
+      }
+
+      EditType edit = edits[edit_i];
+      // Reset count when a non match is found.
+      n_suffix = edit == kMatch ? n_suffix + 1 : 0;
+
+      if (edit == kMatch || edit == kRemove || edit == kReplace) {
+        hunk.PushLine(edit == kMatch ? ' ' : '-', left[l_i].c_str());
+      }
+      if (edit == kAdd || edit == kReplace) {
+        hunk.PushLine('+', right[r_i].c_str());
+      }
+
+      // Advance indices, depending on edit type.
+      l_i += edit != kAdd;
+      r_i += edit != kRemove;
+    }
+
+    if (!hunk.has_edits()) {
+      // We are done. We don't want this hunk.
+      break;
+    }
+
+    hunk.PrintTo(&ss);
+  }
+  return ss.str();
+}
+
+}  // namespace edit_distance
+
+namespace {
+
+// The string representations of the values received in EqFailure() are
+// already escaped. Split them on escaped '\n' boundaries. Leave all other
+// escaped characters the same.
+std::vector<std::string> SplitEscapedString(const std::string &str) {
+  std::vector<std::string> lines;
+  size_t start = 0, end = str.size();
+  if (end > 2 && str[0] == '"' && str[end - 1] == '"') {
+    ++start;
+    --end;
+  }
+  bool escaped = false;
+  for (size_t i = start; i + 1 < end; ++i) {
+    if (escaped) {
+      escaped = false;
+      if (str[i] == 'n') {
+        lines.push_back(str.substr(start, i - start - 1));
+        start = i + 1;
+      }
+    } else {
+      escaped = str[i] == '\\';
+    }
+  }
+  lines.push_back(str.substr(start, end - start));
+  return lines;
+}
+
+}  // namespace
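[Editorial aside.] A worked example of the output format, assuming CreateUnifiedDiff is called directly (it is internal; EXPECT_EQ failures reach it through EqFailure below):

// left = {"a", "b", "c"}, right = {"a", "B", "c"}, context = 1 yields:
//
//   @@ -1,3 +1,3 @@
//    a
//   -b
//   +B
//    c
std::vector<std::string> left = {"a", "b", "c"};
std::vector<std::string> right = {"a", "B", "c"};
std::string diff =
    testing::internal::edit_distance::CreateUnifiedDiff(left, right, 1);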
+
+// Constructs and returns the message for an equality assertion
+// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
+//
+// The first four parameters are the expressions used in the assertion
+// and their values, as strings. For example, for ASSERT_EQ(foo, bar)
+// where foo is 5 and bar is 6, we have:
+//
+//   lhs_expression: "foo"
+//   rhs_expression: "bar"
+//   lhs_value:      "5"
+//   rhs_value:      "6"
+//
+// The ignoring_case parameter is true if and only if the assertion is a
+// *_STRCASEEQ*. When it's true, the string "Ignoring case" will
+// be inserted into the message.
+AssertionResult EqFailure(const char *lhs_expression,
+                          const char *rhs_expression,
+                          const std::string &lhs_value,
+                          const std::string &rhs_value, bool ignoring_case) {
+  Message msg;
+  msg << "Expected equality of these values:";
+  msg << "\n  " << lhs_expression;
+  if (lhs_value != lhs_expression) {
+    msg << "\n    Which is: " << lhs_value;
+  }
+  msg << "\n  " << rhs_expression;
+  if (rhs_value != rhs_expression) {
+    msg << "\n    Which is: " << rhs_value;
+  }
+
+  if (ignoring_case) {
+    msg << "\nIgnoring case";
+  }
+
+  if (!lhs_value.empty() && !rhs_value.empty()) {
+    const std::vector<std::string> lhs_lines = SplitEscapedString(lhs_value);
+    const std::vector<std::string> rhs_lines = SplitEscapedString(rhs_value);
+    if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
+      msg << "\nWith diff:\n"
+          << edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
+    }
+  }
+
+  return AssertionFailure() << msg;
+}
+
+// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
+std::string GetBoolAssertionFailureMessage(
+    const AssertionResult &assertion_result, const char *expression_text,
+    const char *actual_predicate_value, const char *expected_predicate_value) {
+  const char *actual_message = assertion_result.message();
+  Message msg;
+  msg << "Value of: " << expression_text
+      << "\n  Actual: " << actual_predicate_value;
+  if (actual_message[0] != '\0') msg << " (" << actual_message << ")";
+  msg << "\nExpected: " << expected_predicate_value;
+  return msg.GetString();
+}
+
+// Helper function for implementing ASSERT_NEAR.
+AssertionResult DoubleNearPredFormat(const char *expr1, const char *expr2,
+                                     const char *abs_error_expr, double val1,
+                                     double val2, double abs_error) {
+  const double diff = fabs(val1 - val2);
+  if (diff <= abs_error) return AssertionSuccess();
+
+  return AssertionFailure()
+         << "The difference between " << expr1 << " and " << expr2 << " is "
+         << diff << ", which exceeds " << abs_error_expr << ", where\n"
+         << expr1 << " evaluates to " << val1 << ",\n"
+         << expr2 << " evaluates to " << val2 << ", and\n"
+         << abs_error_expr << " evaluates to " << abs_error << ".";
+}
+
+// Helper template for implementing FloatLE() and DoubleLE().
+template <typename RawType>
+AssertionResult FloatingPointLE(const char *expr1, const char *expr2,
+                                RawType val1, RawType val2) {
+  // Returns success if val1 is less than val2,
+  if (val1 < val2) {
+    return AssertionSuccess();
+  }
+
+  // or if val1 is almost equal to val2.
+  const FloatingPoint<RawType> lhs(val1), rhs(val2);
+  if (lhs.AlmostEquals(rhs)) {
+    return AssertionSuccess();
+  }
+
+  // Note that the above two checks will both fail if either val1 or
+  // val2 is NaN, as the IEEE floating-point standard requires that
+  // any predicate involving a NaN must return false.
+
+  ::std::stringstream val1_ss;
+  val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+          << val1;
+
+  ::std::stringstream val2_ss;
+  val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+          << val2;
+
+  return AssertionFailure()
+         << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+         << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
+         << StringStreamToString(&val2_ss);
+}
+
+}  // namespace internal
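[Editorial aside.] FloatLE below therefore accepts a val1 slightly above val2: FloatingPoint<T>::AlmostEquals treats values within a few units in the last place (4 ULPs in gtest's implementation) as equal. A sketch of that boundary case, assuming only the standard library:

#include <cmath>

void FloatLeUlpSketch() {
  const float v = 1.0f;
  // Two representable steps above v: greater than v, but within 4 ULPs.
  const float v_plus_2ulp = std::nextafter(std::nextafter(v, 2.0f), 2.0f);
  // EXPECT_PRED_FORMAT2(::testing::FloatLE, v_plus_2ulp, v) would pass:
  // v_plus_2ulp > v numerically, yet the two are almost equal in ULPs.
  (void)v_plus_2ulp;
}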
+
+// Asserts that val1 is less than, or almost equal to, val2. Fails
+// otherwise. In particular, it fails if either val1 or val2 is NaN.
+AssertionResult FloatLE(const char *expr1, const char *expr2, float val1,
+                        float val2) {
+  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
+}
+
+// Asserts that val1 is less than, or almost equal to, val2. Fails
+// otherwise. In particular, it fails if either val1 or val2 is NaN.
+AssertionResult DoubleLE(const char *expr1, const char *expr2, double val1,
+                         double val2) {
+  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
+}
+
+namespace internal {
+
+// The helper function for {ASSERT|EXPECT}_EQ with int or enum
+// arguments.
+AssertionResult CmpHelperEQ(const char *lhs_expression,
+                            const char *rhs_expression, BiggestInt lhs,
+                            BiggestInt rhs) {
+  if (lhs == rhs) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression, rhs_expression,
+                   FormatForComparisonFailureMessage(lhs, rhs),
+                   FormatForComparisonFailureMessage(rhs, lhs), false);
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_?? with integer or enum arguments. It is here
+// just to avoid copy-and-paste of similar code.
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)                                   \
+  AssertionResult CmpHelper##op_name(const char *expr1, const char *expr2,    \
+                                     BiggestInt val1, BiggestInt val2) {      \
+    if (val1 op val2) {                                                       \
+      return AssertionSuccess();                                              \
+    } else {                                                                  \
+      return AssertionFailure()                                               \
+             << "Expected: (" << expr1 << ") " #op " (" << expr2              \
+             << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
+             << " vs " << FormatForComparisonFailureMessage(val2, val1);      \
+    }                                                                         \
+  }
+
+// Implements the helper function for {ASSERT|EXPECT}_NE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(NE, !=)
+// Implements the helper function for {ASSERT|EXPECT}_LE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LE, <=)
+// Implements the helper function for {ASSERT|EXPECT}_LT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LT, <)
+// Implements the helper function for {ASSERT|EXPECT}_GE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GE, >=)
+// Implements the helper function for {ASSERT|EXPECT}_GT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GT, >)
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ.
+AssertionResult CmpHelperSTREQ(const char *lhs_expression,
+                               const char *rhs_expression, const char *lhs,
+                               const char *rhs) {
+  if (String::CStringEquals(lhs, rhs)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), false);
+}
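[Editorial aside.] For readability, this is approximately what one instantiation of the macro above expands to; GTEST_IMPL_CMP_HELPER_(LE, <=) generates:

AssertionResult CmpHelperLE(const char *expr1, const char *expr2,
                            BiggestInt val1, BiggestInt val2) {
  if (val1 <= val2) {
    return AssertionSuccess();
  } else {
    return AssertionFailure()
           << "Expected: (" << expr1 << ") <= (" << expr2
           << "), actual: " << FormatForComparisonFailureMessage(val1, val2)
           << " vs " << FormatForComparisonFailureMessage(val2, val1);
  }
}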
+
+// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
+AssertionResult CmpHelperSTRCASEEQ(const char *lhs_expression,
+                                   const char *rhs_expression, const char *lhs,
+                                   const char *rhs) {
+  if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), true);
+}
+
+// The helper function for {ASSERT|EXPECT}_STRNE.
+AssertionResult CmpHelperSTRNE(const char *s1_expression,
+                               const char *s2_expression, const char *s1,
+                               const char *s2) {
+  if (!String::CStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure()
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << "), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
+  }
+}
+
+// The helper function for {ASSERT|EXPECT}_STRCASENE.
+AssertionResult CmpHelperSTRCASENE(const char *s1_expression,
+                                   const char *s2_expression, const char *s1,
+                                   const char *s2) {
+  if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  } else {
+    return AssertionFailure()
+           << "Expected: (" << s1_expression << ") != (" << s2_expression
+           << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2
+           << "\"";
+  }
+}
+
+}  // namespace internal
+
+namespace {
+
+// Helper functions for implementing IsSubString() and IsNotSubstring().
+
+// This group of overloaded functions return true if and only if needle
+// is a substring of haystack. NULL is considered a substring of
+// itself only.
+
+bool IsSubstringPred(const char *needle, const char *haystack) {
+  if (needle == nullptr || haystack == nullptr) return needle == haystack;
+
+  return strstr(haystack, needle) != nullptr;
+}
+
+bool IsSubstringPred(const wchar_t *needle, const wchar_t *haystack) {
+  if (needle == nullptr || haystack == nullptr) return needle == haystack;
+
+  return wcsstr(haystack, needle) != nullptr;
+}
+
+// StringType here can be either ::std::string or ::std::wstring.
+template <typename StringType>
+bool IsSubstringPred(const StringType &needle, const StringType &haystack) {
+  return haystack.find(needle) != StringType::npos;
+}
+
+// This function implements either IsSubstring() or IsNotSubstring(),
+// depending on the value of the expected_to_be_substring parameter.
+// StringType here can be const char*, const wchar_t*, ::std::string,
+// or ::std::wstring.
+template <typename StringType>
+AssertionResult IsSubstringImpl(bool expected_to_be_substring,
+                                const char *needle_expr,
+                                const char *haystack_expr,
+                                const StringType &needle,
+                                const StringType &haystack) {
+  if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
+    return AssertionSuccess();
+
+  const bool is_wide_string = sizeof(needle[0]) > 1;
+  const char *const begin_string_quote = is_wide_string ? "L\"" : "\"";
+  return AssertionFailure()
+         << "Value of: " << needle_expr << "\n"
+         << "  Actual: " << begin_string_quote << needle << "\"\n"
+         << "Expected: " << (expected_to_be_substring ? "" : "not ")
+         << "a substring of " << haystack_expr << "\n"
+         << "Which is: " << begin_string_quote << haystack << "\"";
+}
+
+}  // namespace
+
+// IsSubstring() and IsNotSubstring() check whether needle is a
+// substring of haystack (NULL is considered a substring of itself
+// only), and return an appropriate error message when they fail.
+
+AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
+                            const char *needle, const char *haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
+                            const wchar_t *needle, const wchar_t *haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(const char *needle_expr,
+                               const char *haystack_expr, const char *needle,
+                               const char *haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(const char *needle_expr,
+                               const char *haystack_expr, const wchar_t *needle,
+                               const wchar_t *haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
+                            const ::std::string &needle,
+                            const ::std::string &haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(const char *needle_expr,
+                               const char *haystack_expr,
+                               const ::std::string &needle,
+                               const ::std::string &haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+
+#if GTEST_HAS_STD_WSTRING
+AssertionResult IsSubstring(const char *needle_expr, const char *haystack_expr,
+                            const ::std::wstring &needle,
+                            const ::std::wstring &haystack) {
+  return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
+}
+
+AssertionResult IsNotSubstring(const char *needle_expr,
+                               const char *haystack_expr,
+                               const ::std::wstring &needle,
+                               const ::std::wstring &haystack) {
+  return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
+}
+#endif  // GTEST_HAS_STD_WSTRING
+
+namespace internal {
+
+#if GTEST_OS_WINDOWS
+
+namespace {
+
+// Helper function for IsHRESULT{SuccessFailure} predicates
+AssertionResult HRESULTFailureHelper(const char *expr, const char *expected,
+                                     long hr) {  // NOLINT
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
+
+  // Windows CE doesn't support FormatMessage.
+  const char error_text[] = "";
+
+#else
+
+  // Looks up the human-readable system message for the HRESULT code
+  // and since we're not passing any params to FormatMessage, we don't
+  // want inserts expanded.
+  const DWORD kFlags =
+      FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
+  const DWORD kBufSize = 4096;
+  // Gets the system's human readable message string for this HRESULT.
+  char error_text[kBufSize] = { '\0' };
+  DWORD message_length = ::FormatMessageA(kFlags,
+                                          0,  // no source, we're asking system
+                                          static_cast<DWORD>(hr),  // the error
+                                          0,  // no line width restrictions
+                                          error_text,  // output buffer
+                                          kBufSize,    // buf size
+                                          nullptr);  // no arguments for inserts
+  // Trims trailing white space (FormatMessage leaves a trailing CR-LF)
+  for (; message_length && IsSpace(error_text[message_length - 1]);
+       --message_length) {
+    error_text[message_length - 1] = '\0';
+  }
+
+#endif  // GTEST_OS_WINDOWS_MOBILE
+
+  const std::string error_hex("0x" + String::FormatHexInt(hr));
+  return ::testing::AssertionFailure()
+         << "Expected: " << expr << " " << expected << ".\n"
+         << "  Actual: " << error_hex << " " << error_text << "\n";
+}
+
+}  // namespace
+
+AssertionResult IsHRESULTSuccess(const char *expr, long hr) {  // NOLINT
+  if (SUCCEEDED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "succeeds", hr);
+}
+
+AssertionResult IsHRESULTFailure(const char *expr, long hr) {  // NOLINT
+  if (FAILED(hr)) {
+    return AssertionSuccess();
+  }
+  return HRESULTFailureHelper(expr, "fails", hr);
+}
+
+#endif  // GTEST_OS_WINDOWS
+
+// Utility functions for encoding Unicode text (wide strings) in
+// UTF-8.
+
+// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8
+// like this:
+//
+// Code-point length   Encoding
+//   0 -  7 bits       0xxxxxxx
+//   8 - 11 bits       110xxxxx 10xxxxxx
+//  12 - 16 bits       1110xxxx 10xxxxxx 10xxxxxx
+//  17 - 21 bits       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// The maximum code-point a one-byte UTF-8 sequence can represent.
+constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
+
+// The maximum code-point a two-byte UTF-8 sequence can represent.
+constexpr uint32_t kMaxCodePoint2 = (static_cast<uint32_t>(1) << (5 + 6)) - 1;
+
+// The maximum code-point a three-byte UTF-8 sequence can represent.
+constexpr uint32_t kMaxCodePoint3 =
+    (static_cast<uint32_t>(1) << (4 + 2 * 6)) - 1;
+
+// The maximum code-point a four-byte UTF-8 sequence can represent.
+constexpr uint32_t kMaxCodePoint4 =
+    (static_cast<uint32_t>(1) << (3 + 3 * 6)) - 1;
+
+// Chops off the n lowest bits from a bit pattern. Returns the n
+// lowest bits. As a side effect, the original bit pattern will be
+// shifted to the right by n bits.
+inline uint32_t ChopLowBits(uint32_t *bits, int n) {
+  const uint32_t low_bits = *bits & ((static_cast<uint32_t>(1) << n) - 1);
+  *bits >>= n;
+  return low_bits;
+}
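[Editorial aside.] A worked example of the bit manipulation used below: U+00E9 ('é') needs 8 bits, so it takes the two-byte form.

uint32_t code_point = 0xE9;  // 1110 1001 in binary
const uint32_t low6 = ChopLowBits(&code_point, 6);  // low6 = 0x29, rest = 0x03
const unsigned char byte1 = static_cast<unsigned char>(0x80 | low6);       // 0xA9 (10xxxxxx)
const unsigned char byte0 = static_cast<unsigned char>(0xC0 | code_point); // 0xC3 (110xxxxx)
// Emitted leading byte first as 0xC3 0xA9 -- the UTF-8 encoding of U+00E9.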
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type uint32_t because wchar_t may not be
+// wide enough to contain a code point.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be converted
+// to "(Invalid Unicode 0xXXXXXXXX)".
+std::string CodePointToUtf8(uint32_t code_point) {
+  if (code_point > kMaxCodePoint4) {
+    return "(Invalid Unicode 0x" + String::FormatHexUInt32(code_point) + ")";
+  }
+
+  char str[5];  // Big enough for the largest valid code point.
+  if (code_point <= kMaxCodePoint1) {
+    str[1] = '\0';
+    str[0] = static_cast<char>(code_point);  // 0xxxxxxx
+  } else if (code_point <= kMaxCodePoint2) {
+    str[2] = '\0';
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xC0 | code_point);  // 110xxxxx
+  } else if (code_point <= kMaxCodePoint3) {
+    str[3] = '\0';
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xE0 | code_point);  // 1110xxxx
+  } else {  // code_point <= kMaxCodePoint4
+    str[4] = '\0';
+    str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6));  // 10xxxxxx
+    str[0] = static_cast<char>(0xF0 | code_point);  // 11110xxx
+  }
+  return str;
+}
+
+// The following two functions only make sense if the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with 16 bit wchar_t (Windows, Cygwin) do use UTF-16.
+
+// Determines if the arguments constitute UTF-16 surrogate pair
+// and thus should be combined into a single Unicode code point
+// using CreateCodePointFromUtf16SurrogatePair.
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+  return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 &&
+         (second & 0xFC00) == 0xDC00;
+}
+
+// Creates a Unicode code point from UTF16 surrogate pair.
+inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+                                                      wchar_t second) {
+  const auto first_u = static_cast<uint32_t>(first);
+  const auto second_u = static_cast<uint32_t>(second);
+  const uint32_t mask = (1 << 10) - 1;
+  return (sizeof(wchar_t) == 2)
+             ? (((first_u & mask) << 10) | (second_u & mask)) + 0x10000
+             :
+             // This function should not be called when the condition is
+             // false, but we provide a sensible default in case it is.
+             first_u;
+}
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+//   UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin)
+//   UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from the Basic
+// Multilingual Plane.
+std::string WideStringToUtf8(const wchar_t *str, int num_chars) {
+  if (num_chars == -1) num_chars = static_cast<int>(wcslen(str));
+
+  ::std::stringstream stream;
+  for (int i = 0; i < num_chars; ++i) {
+    uint32_t unicode_code_point;
+
+    if (str[i] == L'\0') {
+      break;
+    } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
+      unicode_code_point =
+          CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]);
+      i++;
+    } else {
+      unicode_code_point = static_cast<uint32_t>(str[i]);
+    }
+
+    stream << CodePointToUtf8(unicode_code_point);
+  }
+  return StringStreamToString(&stream);
+}
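[Editorial aside.] The surrogate-pair arithmetic above, worked for U+1F600 (the UTF-16 pair 0xD83D, 0xDE00):

const uint32_t mask = (1 << 10) - 1;           // 0x3FF
const uint32_t high = 0xD83D & mask;           // 0x03D (low 10 bits of lead)
const uint32_t low = 0xDE00 & mask;            // 0x200 (low 10 bits of trail)
const uint32_t cp = ((high << 10) | low) + 0x10000;  // 0xF600 + 0x10000
// cp == 0x1F600, i.e. U+1F600.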
+
+// Converts a wide C string to an std::string using the UTF-8 encoding.
+// NULL will be converted to "(null)".
+std::string String::ShowWideCString(const wchar_t *wide_c_str) {
+  if (wide_c_str == nullptr) return "(null)";
+
+  return internal::WideStringToUtf8(wide_c_str, -1);
+}
+
+// Compares two wide C strings. Returns true if and only if they have the
+// same content.
+//
+// Unlike wcscmp(), this function can handle NULL argument(s). A NULL
+// C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::WideCStringEquals(const wchar_t *lhs, const wchar_t *rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
+
+  if (rhs == nullptr) return false;
+
+  return wcscmp(lhs, rhs) == 0;
+}
+
+// Helper function for *_STREQ on wide strings.
+AssertionResult CmpHelperSTREQ(const char *lhs_expression,
+                               const char *rhs_expression, const wchar_t *lhs,
+                               const wchar_t *rhs) {
+  if (String::WideCStringEquals(lhs, rhs)) {
+    return AssertionSuccess();
+  }
+
+  return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+                   PrintToString(rhs), false);
+}
+
+// Helper function for *_STRNE on wide strings.
+AssertionResult CmpHelperSTRNE(const char *s1_expression,
+                               const char *s2_expression, const wchar_t *s1,
+                               const wchar_t *s2) {
+  if (!String::WideCStringEquals(s1, s2)) {
+    return AssertionSuccess();
+  }
+
+  return AssertionFailure()
+         << "Expected: (" << s1_expression << ") != (" << s2_expression
+         << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2);
+}
+
+// Compares two C strings, ignoring case. Returns true if and only if they have
+// the same content.
+//
+// Unlike strcasecmp(), this function can handle NULL argument(s). A
+// NULL C string is considered different to any non-NULL C string,
+// including the empty string.
+bool String::CaseInsensitiveCStringEquals(const char *lhs, const char *rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
+  if (rhs == nullptr) return false;
+  return posix::StrCaseCmp(lhs, rhs) == 0;
+}
+
+// Compares two wide C strings, ignoring case. Returns true if and only if they
+// have the same content.
+//
+// Unlike wcscasecmp(), this function can handle NULL argument(s).
+// A NULL C string is considered different to any non-NULL wide C string,
+// including the empty string.
+// NB: The implementations on different platforms slightly differ.
+// On Windows, this method uses _wcsicmp which compares according to LC_CTYPE
+// environment variable. On GNU platform this method uses wcscasecmp
+// which compares according to LC_CTYPE category of the current locale.
+// On MacOS X, it uses towlower, which also uses LC_CTYPE category of the
+// current locale.
+bool String::CaseInsensitiveWideCStringEquals(const wchar_t *lhs,
+                                              const wchar_t *rhs) {
+  if (lhs == nullptr) return rhs == nullptr;
+
+  if (rhs == nullptr) return false;
+
+#if GTEST_OS_WINDOWS
+  return _wcsicmp(lhs, rhs) == 0;
+#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID
+  return wcscasecmp(lhs, rhs) == 0;
+#else
+  // Android, Mac OS X and Cygwin don't define wcscasecmp.
+  // Other unknown OSes may not define it either.
+  wint_t left, right;
+  do {
+    left = towlower(static_cast<wint_t>(*lhs++));
+    right = towlower(static_cast<wint_t>(*rhs++));
+  } while (left && left == right);
+  return left == right;
+#endif  // OS selector
+}
+
+// Returns true if and only if str ends with the given suffix, ignoring case.
+// Any string is considered to end with an empty suffix.
+bool String::EndsWithCaseInsensitive(const std::string &str,
+                                     const std::string &suffix) {
+  const size_t str_len = str.length();
+  const size_t suffix_len = suffix.length();
+  return (str_len >= suffix_len) &&
+         CaseInsensitiveCStringEquals(str.c_str() + str_len - suffix_len,
+                                      suffix.c_str());
+}
+
+// Formats an int value as "%02d".
+std::string String::FormatIntWidth2(int value) {
+  std::stringstream ss;
+  ss << std::setfill('0') << std::setw(2) << value;
+  return ss.str();
+}
+
+// Formats an int value as "%X".
+std::string String::FormatHexUInt32(uint32_t value) {
+  std::stringstream ss;
+  ss << std::hex << std::uppercase << value;
+  return ss.str();
+}
+
+// Formats an int value as "%X".
+std::string String::FormatHexInt(int value) {
+  return FormatHexUInt32(static_cast<uint32_t>(value));
+}
+
+// Formats a byte as "%02X".
+std::string String::FormatByte(unsigned char value) {
+  std::stringstream ss;
+  ss << std::setfill('0') << std::setw(2) << std::hex << std::uppercase
+     << static_cast<int>(value);
+  return ss.str();
+}
+
+// Converts the buffer in a stringstream to an std::string, converting NUL
+// bytes to "\\0" along the way.
+std::string StringStreamToString(::std::stringstream *ss) {
+  const ::std::string &str = ss->str();
+  const char *const start = str.c_str();
+  const char *const end = start + str.length();
+
+  std::string result;
+  result.reserve(static_cast<size_t>(2 * (end - start)));
+  for (const char *ch = start; ch != end; ++ch) {
+    if (*ch == '\0') {
+      result += "\\0";  // Replaces NUL with "\\0";
+    } else {
+      result += *ch;
+    }
+  }
+
+  return result;
+}
+
+// Appends the user-supplied message to the Google-Test-generated message.
+std::string AppendUserMessage(const std::string &gtest_msg,
+                              const Message &user_msg) {
+  // Appends the user message if it's non-empty.
+  const std::string user_msg_string = user_msg.GetString();
+  if (user_msg_string.empty()) {
+    return gtest_msg;
+  }
+
+  return gtest_msg + "\n" + user_msg_string;
+}
+
+}  // namespace internal
+
+// class TestResult
+
+// Creates an empty TestResult.
+TestResult::TestResult()
+    : death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {}
+
+// D'tor.
+TestResult::~TestResult() {}
+
+// Returns the i-th test part result among all the results. i can
+// range from 0 to total_part_count() - 1. If i is not in that range,
+// aborts the program.
+const TestPartResult &TestResult::GetTestPartResult(int i) const {
+  if (i < 0 || i >= total_part_count()) internal::posix::Abort();
+  return test_part_results_.at(static_cast<size_t>(i));
+}
+
+// Returns the i-th test property. i can range from 0 to
+// test_property_count() - 1. If i is not in that range, aborts the
+// program.
+const TestProperty &TestResult::GetTestProperty(int i) const {
+  if (i < 0 || i >= test_property_count()) internal::posix::Abort();
+  return test_properties_.at(static_cast<size_t>(i));
+}
+
+// Clears the test part results.
+void TestResult::ClearTestPartResults() { test_part_results_.clear(); }
+
+// Adds a test part result to the list.
+void TestResult::AddTestPartResult(const TestPartResult &test_part_result) {
+  test_part_results_.push_back(test_part_result);
+}
+
+// Adds a test property to the list. If a property with the same key as the
+// supplied property is already represented, the value of this test_property
+// replaces the old value for that key.
+void TestResult::RecordProperty(const std::string &xml_element,
+                                const TestProperty &test_property) {
+  if (!ValidateTestProperty(xml_element, test_property)) {
+    return;
+  }
+  internal::MutexLock lock(&test_properites_mutex_);
+  const std::vector<TestProperty>::iterator property_with_matching_key =
+      std::find_if(test_properties_.begin(), test_properties_.end(),
+                   internal::TestPropertyKeyIs(test_property.key()));
+  if (property_with_matching_key == test_properties_.end()) {
+    test_properties_.push_back(test_property);
+    return;
+  }
+  property_with_matching_key->SetValue(test_property.value());
+}
+
+// The list of reserved attributes used in the <testsuites> element of XML
+// output.
+static const char *const kReservedTestSuitesAttributes[] = {
+  "disabled", "errors", "failures", "name",
+  "random_seed", "tests", "time", "timestamp"
+};
+
+// The list of reserved attributes used in the <testsuite> element of XML
+// output.
+static const char *const kReservedTestSuiteAttributes[] = {
+  "disabled", "errors", "failures", "name", "tests", "time", "timestamp"
+};
+
+// The list of reserved attributes used in the <testcase> element of XML
+// output.
+static const char *const kReservedTestCaseAttributes[] = {
+  "classname", "name", "status", "time",
+  "type_param", "value_param", "file", "line"
+};
+
+// Use a slightly different set for allowed output to ensure existing tests can
+// still RecordProperty("result") or RecordProperty("timestamp").
+static const char *const kReservedOutputTestCaseAttributes[] = {
+  "classname", "name", "status", "time", "type_param",
+  "value_param", "file", "line", "result", "timestamp"
+};
+
+template <size_t kSize>
+std::vector<std::string> ArrayAsVector(const char *const (&array)[kSize]) {
+  return std::vector<std::string>(array, array + kSize);
+}
+
+static std::vector<std::string> GetReservedAttributesForElement(
+    const std::string &xml_element) {
+  if (xml_element == "testsuites") {
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  } else if (xml_element == "testsuite") {
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  } else if (xml_element == "testcase") {
+    return ArrayAsVector(kReservedTestCaseAttributes);
+  } else {
+    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  }
+  // This code is unreachable but some compilers may not realize that.
+  return std::vector<std::string>();
+}
+
+// TODO(jdesprez): Merge the two getReserved attributes once skip is improved
+static std::vector<std::string> GetReservedOutputAttributesForElement(
+    const std::string &xml_element) {
+  if (xml_element == "testsuites") {
+    return ArrayAsVector(kReservedTestSuitesAttributes);
+  } else if (xml_element == "testsuite") {
+    return ArrayAsVector(kReservedTestSuiteAttributes);
+  } else if (xml_element == "testcase") {
+    return ArrayAsVector(kReservedOutputTestCaseAttributes);
+  } else {
+    GTEST_CHECK_(false) << "Unrecognized xml_element provided: " << xml_element;
+  }
+  // This code is unreachable but some compilers may not realize that.
+  return std::vector<std::string>();
+}
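[Editorial aside.] The practical effect of the tables above, sketched with a hypothetical test: reserved keys are rejected with an ADD_FAILURE, while arbitrary keys pass through to the XML/JSON output.

TEST(RecordPropertySketch, ReservedVsCustomKeys) {  // hypothetical test
  RecordProperty("bug_id", 12345);  // OK: "bug_id" is not reserved
  // RecordProperty("time", 1);     // would fail validation: "time" is
  //                                // reserved for the <testcase> element
}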
+
+static std::string FormatWordList(const std::vector<std::string> &words) {
+  Message word_list;
+  for (size_t i = 0; i < words.size(); ++i) {
+    if (i > 0 && words.size() > 2) {
+      word_list << ", ";
+    }
+    if (i == words.size() - 1) {
+      word_list << "and ";
+    }
+    word_list << "'" << words[i] << "'";
+  }
+  return word_list.GetString();
+}
+
+static bool ValidateTestPropertyName(
+    const std::string &property_name,
+    const std::vector<std::string> &reserved_names) {
+  if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
+      reserved_names.end()) {
+    ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
+                  << " (" << FormatWordList(reserved_names)
+                  << " are reserved by " << GTEST_NAME_ << ")";
+    return false;
+  }
+  return true;
+}
+
+// Adds a failure if the key is a reserved attribute of the element named
+// xml_element. Returns true if the property is valid.
+bool TestResult::ValidateTestProperty(const std::string &xml_element,
+                                      const TestProperty &test_property) {
+  return ValidateTestPropertyName(test_property.key(),
+                                  GetReservedAttributesForElement(xml_element));
+}
+
+// Clears the object.
+void TestResult::Clear() {
+  test_part_results_.clear();
+  test_properties_.clear();
+  death_test_count_ = 0;
+  elapsed_time_ = 0;
+}
+
+// Returns true if and only if the test part was skipped.
+static bool TestPartSkipped(const TestPartResult &result) {
+  return result.skipped();
+}
+
+// Returns true if and only if the test was skipped.
+bool TestResult::Skipped() const {
+  return !Failed() && CountIf(test_part_results_, TestPartSkipped) > 0;
+}
+
+// Returns true if and only if the test failed.
+bool TestResult::Failed() const {
+  for (int i = 0; i < total_part_count(); ++i) {
+    if (GetTestPartResult(i).failed()) return true;
+  }
+  return false;
+}
+
+// Returns true if and only if the test part fatally failed.
+static bool TestPartFatallyFailed(const TestPartResult &result) {
+  return result.fatally_failed();
+}
+
+// Returns true if and only if the test fatally failed.
+bool TestResult::HasFatalFailure() const {
+  return CountIf(test_part_results_, TestPartFatallyFailed) > 0;
+}
+
+// Returns true if and only if the test part non-fatally failed.
+static bool TestPartNonfatallyFailed(const TestPartResult &result) {
+  return result.nonfatally_failed();
+}
+
+// Returns true if and only if the test has a non-fatal failure.
+bool TestResult::HasNonfatalFailure() const {
+  return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0;
+}
+
+// Gets the number of all test parts. This is the sum of the number
+// of successful test parts and the number of failed test parts.
+int TestResult::total_part_count() const {
+  return static_cast<int>(test_part_results_.size());
+}
+
+// Returns the number of the test properties.
+int TestResult::test_property_count() const {
+  return static_cast<int>(test_properties_.size());
+}
+
+// class Test
+
+// Creates a Test object.
+
+// The c'tor saves the states of all flags.
+Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {}
+
+// The d'tor restores the states of all flags. The actual work is
+// done by the d'tor of the gtest_flag_saver_ field, and thus not
+// visible here.
+Test::~Test() {}
+
+// Sets up the test fixture.
+//
+// A sub-class may override this.
+void Test::SetUp() {}
+
+// Tears down the test fixture.
+//
+// A sub-class may override this.
+void Test::TearDown() {}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string &key, const std::string &value) {
+  UnitTest::GetInstance()->RecordProperty(key, value);
+}
+
+// Allows user supplied key value pairs to be recorded for later output.
+void Test::RecordProperty(const std::string &key, int value) {
+  Message value_message;
+  value_message << value;
+  RecordProperty(key, value_message.GetString().c_str());
+}
+
+namespace internal {
+
+void ReportFailureInUnknownLocation(TestPartResult::Type result_type,
+                                    const std::string &message) {
+  // This function is a friend of UnitTest and as such has access to
+  // AddTestPartResult.
+  UnitTest::GetInstance()->AddTestPartResult(
+      result_type,
+      nullptr,  // No info about the source file where the exception occurred.
+      -1,       // We have no info on which line caused the exception.
+      message,
+      "");  // No stack trace, either.
+}
+
+}  // namespace internal
+
+// Google Test requires all tests in the same test suite to use the same test
+// fixture class. This function checks if the current test has the
+// same fixture class as the first test in the current test suite. If
+// yes, it returns true; otherwise it generates a Google Test failure and
+// returns false.
+bool Test::HasSameFixtureClass() {
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  const TestSuite *const test_suite = impl->current_test_suite();
+
+  // Info about the first test in the current test suite.
+  const TestInfo *const first_test_info = test_suite->test_info_list()[0];
+  const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_;
+  const char *const first_test_name = first_test_info->name();
+
+  // Info about the current test.
+  const TestInfo *const this_test_info = impl->current_test_info();
+  const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_;
+  const char *const this_test_name = this_test_info->name();
+
+  if (this_fixture_id != first_fixture_id) {
+    // Is the first test defined using TEST?
+    const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId();
+    // Is this test defined using TEST?
+    const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId();
+
+    if (first_is_TEST || this_is_TEST) {
+      // Both TEST and TEST_F appear in same test suite, which is incorrect.
+      // Tell the user how to fix this.
+
+      // Gets the name of the TEST and the name of the TEST_F. Note
+      // that first_is_TEST and this_is_TEST cannot both be true, as
+      // the fixture IDs are different for the two tests.
+      const char *const TEST_name =
+          first_is_TEST ? first_test_name : this_test_name;
+      const char *const TEST_F_name =
+          first_is_TEST ? this_test_name : first_test_name;
+
+      ADD_FAILURE()
+          << "All tests in the same test suite must use the same test fixture\n"
+          << "class, so mixing TEST_F and TEST in the same test suite is\n"
+          << "illegal. In test suite " << this_test_info->test_suite_name()
+          << ",\n"
+          << "test " << TEST_F_name << " is defined using TEST_F but\n"
+          << "test " << TEST_name << " is defined using TEST. You probably\n"
+          << "want to change the TEST to TEST_F or move it to another test\n"
+          << "case.";
+    } else {
+      // Two fixture classes with the same name appear in two different
+      // namespaces, which is not allowed. Tell the user how to fix this.
+      ADD_FAILURE()
+          << "All tests in the same test suite must use the same test fixture\n"
+          << "class. However, in test suite "
+          << this_test_info->test_suite_name() << ",\n"
+          << "you defined test " << first_test_name << " and test "
+          << this_test_name << "\n"
+          << "using two different test fixture classes. This can happen if\n"
+          << "the two classes are from different namespaces or translation\n"
+          << "units and have the same name. You should probably rename one\n"
+          << "of the classes to put the tests into different test suites.";
+    }
+    return false;
+  }
+
+  return true;
+}
+
+#if GTEST_HAS_SEH
+
+// Adds an "exception thrown" fatal failure to the current test. This
+// function returns its result via an output parameter pointer because VC++
+// prohibits creation of objects with destructors on stack in functions
+// using __try (see error C2712).
+static std::string *FormatSehExceptionMessage(DWORD exception_code,
+                                              const char *location) {
+  Message message;
+  message << "SEH exception with code 0x" << std::setbase(16) << exception_code
+          << std::setbase(10) << " thrown in " << location << ".";
+
+  return new std::string(message.GetString());
+}
+
+#endif  // GTEST_HAS_SEH
+
+namespace internal {
+
+#if GTEST_HAS_EXCEPTIONS
+
+// Adds an "exception thrown" fatal failure to the current test.
+static std::string FormatCxxExceptionMessage(const char *description,
+                                             const char *location) {
+  Message message;
+  if (description != nullptr) {
+    message << "C++ exception with description \"" << description << "\"";
+  } else {
+    message << "Unknown C++ exception";
+  }
+  message << " thrown in " << location << ".";
+
+  return message.GetString();
+}
+
+static std::string PrintTestPartResultToString(
+    const TestPartResult &test_part_result);
+
+GoogleTestFailureException::GoogleTestFailureException(
+    const TestPartResult &failure)
+    : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {}
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+// We put these helper functions in the internal namespace as IBM's xlC
+// compiler rejects the code if they were declared static.
+
+// Runs the given method and handles SEH exceptions it throws, when
+// SEH is supported; returns the 0-value for type Result in case of an
+// SEH exception. (Microsoft compilers cannot handle SEH and C++
+// exceptions in the same function. Therefore, we provide a separate
+// wrapper function for handling SEH exceptions.)
+template <class T, typename Result>
+Result HandleSehExceptionsInMethodIfSupported(T *object, Result (T::*method)(),
+                                              const char *location) {
+#if GTEST_HAS_SEH
+  __try {
+    return (object->*method)();
+  } __except (internal::UnitTestOptions::GTestShouldProcessSEH(  // NOLINT
+      GetExceptionCode())) {
+    // We create the exception message on the heap because VC++ prohibits
+    // creation of objects with destructors on stack in functions using __try
+    // (see error C2712).
+    std::string *exception_message =
+        FormatSehExceptionMessage(GetExceptionCode(), location);
+    internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
+                                             *exception_message);
+    delete exception_message;
+    return static_cast<Result>(0);
+  }
+#else
+  (void)location;
+  return (object->*method)();
+#endif  // GTEST_HAS_SEH
+}
+
+// Runs the given method and catches and reports C++ and/or SEH-style
+// exceptions, if they are supported; returns the 0-value for type
+// Result in case of an SEH exception.
+template <class T, typename Result>
+Result HandleExceptionsInMethodIfSupported(T *object, Result (T::*method)(),
+                                           const char *location) {
+  // NOTE: The user code can affect the way in which Google Test handles
+  // exceptions by setting GTEST_FLAG(catch_exceptions), but only before
+  // RUN_ALL_TESTS() starts. It is technically possible to check the flag
+  // after the exception is caught and either report or re-throw the
+  // exception based on the flag's value:
+  //
+  // try {
+  //   // Perform the test method.
+  // } catch (...) {
+  //   if (GTEST_FLAG(catch_exceptions))
+  //     // Report the exception as failure.
+  //   else
+  //     throw;  // Re-throws the original exception.
+  // }
+  //
+  // However, the purpose of this flag is to allow the program to drop into
+  // the debugger when the exception is thrown. On most platforms, once the
+  // control enters the catch block, the exception origin information is
+  // lost and the debugger will stop the program at the point of the
+  // re-throw in this function -- instead of at the point of the original
+  // throw statement in the code under test. For this reason, we perform
+  // the check early, sacrificing the ability to affect Google Test's
+  // exception handling in the method where the exception is thrown.
+  if (internal::GetUnitTestImpl()->catch_exceptions()) {
+#if GTEST_HAS_EXCEPTIONS
+    try {
+      return HandleSehExceptionsInMethodIfSupported(object, method, location);
+    } catch (const AssertionException &) {  // NOLINT
+      // This failure was reported already.
+    } catch (const internal::GoogleTestFailureException &) {  // NOLINT
+      // This exception type can only be thrown by a failed Google
+      // Test assertion with the intention of letting another testing
+      // framework catch it. Therefore we just re-throw it.
+      throw;
+    } catch (const std::exception &e) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(e.what(), location));
+    } catch (...) {  // NOLINT
+      internal::ReportFailureInUnknownLocation(
+          TestPartResult::kFatalFailure,
+          FormatCxxExceptionMessage(nullptr, location));
+    }
+    return static_cast<Result>(0);
+#else
+    return HandleSehExceptionsInMethodIfSupported(object, method, location);
+#endif  // GTEST_HAS_EXCEPTIONS
+  } else {
+    return (object->*method)();
+  }
+}
+
+}  // namespace internal
+
+// Runs the test and updates the test result.
+void Test::Run() {
+  if (!HasSameFixtureClass()) return;
+
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()");
+  // We will run the test only if SetUp() was successful and didn't call
+  // GTEST_SKIP().
+  if (!HasFatalFailure() && !IsSkipped()) {
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody,
+                                                  "the test body");
+  }
+
+  // However, we want to clean up as much as possible. Hence we will
+  // always call TearDown(), even if SetUp() or the test body has
+  // failed.
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown,
+                                                "TearDown()");
+}
+
+// Returns true if and only if the current test has a fatal failure.
+bool Test::HasFatalFailure() {
+  return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure();
+}
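[Editorial aside.] The wrapper takes a pointer to any zero-argument member function, so the pattern generalizes beyond SetUp/TestBody/TearDown. A sketch under the assumption of a user-defined fixture (MyFixture and Warmup are hypothetical):

class MyFixture : public testing::Test {
 public:
  void Warmup() { /* may throw; would be reported as a fatal failure */ }
};

void RunWarmupGuarded(MyFixture *f) {
  // T = MyFixture and Result = void are deduced from the member pointer.
  testing::internal::HandleExceptionsInMethodIfSupported(
      f, &MyFixture::Warmup, "Warmup()");
}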
+
+// Returns true if and only if the current test has a non-fatal failure.
+bool Test::HasNonfatalFailure() {
+  return internal::GetUnitTestImpl()
+      ->current_test_result()
+      ->HasNonfatalFailure();
+}
+
+// Returns true if and only if the current test was skipped.
+bool Test::IsSkipped() {
+  return internal::GetUnitTestImpl()->current_test_result()->Skipped();
+}
+
+// class TestInfo
+
+// Constructs a TestInfo object. It assumes ownership of the test factory
+// object.
+TestInfo::TestInfo(const std::string &a_test_suite_name,
+                   const std::string &a_name, const char *a_type_param,
+                   const char *a_value_param,
+                   internal::CodeLocation a_code_location,
+                   internal::TypeId fixture_class_id,
+                   internal::TestFactoryBase *factory)
+    : test_suite_name_(a_test_suite_name), name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
+      value_param_(a_value_param ? new std::string(a_value_param) : nullptr),
+      location_(a_code_location), fixture_class_id_(fixture_class_id),
+      should_run_(false), is_disabled_(false), matches_filter_(false),
+      factory_(factory), result_() {}
+
+// Destructs a TestInfo object.
+TestInfo::~TestInfo() { delete factory_; }
+
+namespace internal {
+
+// Creates a new TestInfo object and registers it with Google Test;
+// returns the created object.
+//
+// Arguments:
+//
+//   test_suite_name:  name of the test suite
+//   name:             name of the test
+//   type_param:       the name of the test's type parameter, or NULL if
+//                     this is not a typed or a type-parameterized test.
+//   value_param:      text representation of the test's value parameter,
+//                     or NULL if this is not a value-parameterized test.
+//   code_location:    code location where the test is defined
+//   fixture_class_id: ID of the test fixture class
+//   set_up_tc:        pointer to the function that sets up the test suite
+//   tear_down_tc:     pointer to the function that tears down the test suite
+//   factory:          pointer to the factory that creates a test object.
+//                     The newly created TestInfo instance will assume
+//                     ownership of the factory object.
+TestInfo *MakeAndRegisterTestInfo(
+    const char *test_suite_name, const char *name, const char *type_param,
+    const char *value_param, CodeLocation code_location,
+    TypeId fixture_class_id, SetUpTestSuiteFunc set_up_tc,
+    TearDownTestSuiteFunc tear_down_tc, TestFactoryBase *factory) {
+  TestInfo *const test_info =
+      new TestInfo(test_suite_name, name, type_param, value_param,
+                   code_location, fixture_class_id, factory);
+  GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info);
+  return test_info;
+}
+
+void ReportInvalidTestSuiteType(const char *test_suite_name,
+                                CodeLocation code_location) {
+  Message errors;
+  errors
+      << "Attempted redefinition of test suite " << test_suite_name << ".\n"
+      << "All tests in the same test suite must use the same test fixture\n"
+      << "class. However, in test suite " << test_suite_name << ", you tried\n"
+      << "to define a test using a fixture class different from the one\n"
+      << "used earlier. This can happen if the two fixture classes are\n"
+      << "from different namespaces and have the same name. You should\n"
+      << "probably rename one of the classes to put the tests into different\n"
+      << "test suites.";
+
+  GTEST_LOG_(ERROR) << FormatFileLocation(code_location.file.c_str(),
+                                          code_location.line)
+                    << " " << errors.GetString();
+}
+}  // namespace internal
+
+namespace {
+
+// A predicate that checks the test name of a TestInfo against a known
+// value.
+//
+// This is used for implementation of the TestSuite class only. We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestNameIs is copyable.
+class TestNameIs {
+ public:
+  // Constructor.
+  //
+  // TestNameIs has NO default constructor.
+  explicit TestNameIs(const char *name) : name_(name) {}
+
+  // Returns true if and only if the test name of test_info matches name_.
+  bool operator()(const TestInfo *test_info) const {
+    return test_info && test_info->name() == name_;
+  }
+
+ private:
+  std::string name_;
+};
+
+}  // namespace
+
+namespace internal {
+
+// This method expands all parameterized tests registered with macros TEST_P
+// and INSTANTIATE_TEST_SUITE_P into regular tests and registers those.
+// This will be done just once during the program runtime.
+void UnitTestImpl::RegisterParameterizedTests() {
+  if (!parameterized_tests_registered_) {
+    parameterized_test_registry_.RegisterTests();
+    type_parameterized_test_registry_.CheckForInstantiations();
+    parameterized_tests_registered_ = true;
+  }
+}
+
+}  // namespace internal
+
+// Creates the test object, runs it, records its result, and then
+// deletes it.
+void TestInfo::Run() {
+  if (!should_run_) return;
+
+  // Tells UnitTest where to store test result.
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener *repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TimeInMillis start = internal::GetTimeInMillis();
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+
+  // Creates the test object.
+  Test *const test = internal::HandleExceptionsInMethodIfSupported(
+      factory_, &internal::TestFactoryBase::CreateTest,
+      "the test fixture's constructor");
+
+  // Runs the test if the constructor didn't generate a fatal failure or invoke
+  // GTEST_SKIP().
+  // Note that the object will not be null
+  if (!Test::HasFatalFailure() && !Test::IsSkipped()) {
+    // This doesn't throw as all user code that can throw is wrapped into
+    // exception handling code.
+    test->Run();
+  }
+
+  if (test != nullptr) {
+    // Deletes the test object.
+    impl->os_stack_trace_getter()->UponLeavingGTest();
+    internal::HandleExceptionsInMethodIfSupported(
+        test, &Test::DeleteSelf_, "the test fixture's destructor");
+  }
+
+  result_.set_start_timestamp(start);
+  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+
+  // Tells UnitTest to stop associating assertion results to this
+  // test.
+  impl->set_current_test_info(nullptr);
+}
+
+// class TestSuite
+
+// Gets the number of successful tests in this test suite.
+int TestSuite::successful_test_count() const {
+  return CountIf(test_info_list_, TestPassed);
+}
+
+// Gets the number of successful tests in this test suite.
+int TestSuite::skipped_test_count() const {
+  return CountIf(test_info_list_, TestSkipped);
+}
+
+// Gets the number of failed tests in this test suite.
+int TestSuite::failed_test_count() const {
+  return CountIf(test_info_list_, TestFailed);
+}
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int TestSuite::reportable_disabled_test_count() const {
+  return CountIf(test_info_list_, TestReportableDisabled);
+}
+
+// Gets the number of disabled tests in this test suite.
+int TestSuite::disabled_test_count() const {
+  return CountIf(test_info_list_, TestDisabled);
+}
+
+// Gets the number of tests to be printed in the XML report.
+int TestSuite::reportable_test_count() const {
+  return CountIf(test_info_list_, TestReportable);
+}
+
+// Get the number of tests in this test suite that should run.
+int TestSuite::test_to_run_count() const {
+  return CountIf(test_info_list_, ShouldRunTest);
+}
+
+// Gets the number of all tests.
+int TestSuite::total_test_count() const {
+  return static_cast<int>(test_info_list_.size());
+}
+
+// Creates a TestSuite with the given name.
+//
+// Arguments:
+//
+//   name:         name of the test suite
+//   a_type_param: the name of the test suite's type parameter, or NULL if
+//                 this is not a typed or a type-parameterized test suite.
+//   set_up_tc:    pointer to the function that sets up the test suite
+//   tear_down_tc: pointer to the function that tears down the test suite
+TestSuite::TestSuite(const char *a_name, const char *a_type_param,
+                     internal::SetUpTestSuiteFunc set_up_tc,
+                     internal::TearDownTestSuiteFunc tear_down_tc)
+    : name_(a_name),
+      type_param_(a_type_param ? new std::string(a_type_param) : nullptr),
+      set_up_tc_(set_up_tc), tear_down_tc_(tear_down_tc), should_run_(false),
+      start_timestamp_(0), elapsed_time_(0) {}
+
+// Destructor of TestSuite.
+TestSuite::~TestSuite() {
+  // Deletes every Test in the collection.
+  ForEach(test_info_list_, internal::Delete<TestInfo>);
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+const TestInfo *TestSuite::GetTestInfo(int i) const {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
+}
+
+// Returns the i-th test among all the tests. i can range from 0 to
+// total_test_count() - 1. If i is not in that range, returns NULL.
+TestInfo *TestSuite::GetMutableTestInfo(int i) {
+  const int index = GetElementOr(test_indices_, i, -1);
+  return index < 0 ? nullptr : test_info_list_[static_cast<size_t>(index)];
+}
+
+// Adds a test to this test suite. Will delete the test upon
+// destruction of the TestSuite object.
+void TestSuite::AddTestInfo(TestInfo *test_info) {
+  test_info_list_.push_back(test_info);
+  test_indices_.push_back(static_cast<int>(test_indices_.size()));
+}
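[Editorial aside.] The two containers above form an index indirection: test_info_list_ keeps registration order and owns the TestInfo objects, while test_indices_ is the layer that ShuffleTests()/UnshuffleTests() (below) permute, so iteration order can change without moving the owned pointers. With hypothetical values:

// After three AddTestInfo() calls: test_indices_ == {0, 1, 2}.
// After ShuffleTests(), it might be {2, 0, 1}, so GetTestInfo(0)
// returns test_info_list_[2]. UnshuffleTests() restores {0, 1, 2}.
std::vector<int> test_indices = {0, 1, 2};  // registration order
test_indices = {2, 0, 1};                   // one possible shuffle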
+void TestSuite::Run() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl *const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_suite(this);
+
+  TestEventListener *repeater =
+      UnitTest::GetInstance()->listeners().repeater();
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteStart(*this);
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseStart(*this);
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
+
+  start_timestamp_ = internal::GetTimeInMillis();
+  for (int i = 0; i < total_test_count(); i++) {
+    GetMutableTestInfo(i)->Run();
+  }
+  elapsed_time_ = internal::GetTimeInMillis() - start_timestamp_;
+
+  impl->os_stack_trace_getter()->UponLeavingGTest();
+  internal::HandleExceptionsInMethodIfSupported(
+      this, &TestSuite::RunTearDownTestSuite, "TearDownTestSuite()");
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteEnd(*this);
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseEnd(*this);
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  impl->set_current_test_suite(nullptr);
+}
+
+// Clears the results of all tests in this test suite.
+void TestSuite::ClearResult() {
+  ad_hoc_test_result_.Clear();
+  ForEach(test_info_list_, TestInfo::ClearTestResult);
+}
+
+// Shuffles the tests in this test suite.
+void TestSuite::ShuffleTests(internal::Random *random) {
+  Shuffle(random, &test_indices_);
+}
+
+// Restores the test order to before the first shuffle.
+void TestSuite::UnshuffleTests() {
+  for (size_t i = 0; i < test_indices_.size(); i++) {
+    test_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Formats a countable noun. Depending on its quantity, either the
+// singular form or the plural form is used. e.g.
+//
+// FormatCountableNoun(1, "formula", "formulae") returns "1 formula".
+// FormatCountableNoun(5, "book", "books") returns "5 books".
+static std::string FormatCountableNoun(int count, const char *singular_form,
+                                       const char *plural_form) {
+  return internal::StreamableToString(count) + " " +
+         (count == 1 ? singular_form : plural_form);
+}
+
+// Formats the count of tests.
+static std::string FormatTestCount(int test_count) {
+  return FormatCountableNoun(test_count, "test", "tests");
+}
+
+// Formats the count of test suites.
+static std::string FormatTestSuiteCount(int test_suite_count) {
+  return FormatCountableNoun(test_suite_count, "test suite", "test suites");
+}
+
+// Converts a TestPartResult::Type enum to human-friendly string
+// representation. Both kNonFatalFailure and kFatalFailure are translated
+// to "Failure", as the user usually doesn't care about the difference
+// between the two when viewing the test result.
+static const char *TestPartResultTypeToString(TestPartResult::Type type) {
+  switch (type) {
+    case TestPartResult::kSkip: return "Skipped";
+    case TestPartResult::kSuccess: return "Success";
+
+    case TestPartResult::kNonFatalFailure:
+    case TestPartResult::kFatalFailure:
+#ifdef _MSC_VER
+      return "error: ";
+#else
+      return "Failure\n";
+#endif
+    default: return "Unknown result type";
+  }
+}
+
+namespace internal {
+
+// Prints a TestPartResult to an std::string.
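+// For a rough sense of the format (a sketch, not normative): a failed
+// assertion recorded at foo_test.cc:42 produces a string along the lines of
+// "foo_test.cc:42: Failure\nExpected equality of these values: ...", with
+// "error: " substituted for "Failure\n" on MSVC builds (see
+// TestPartResultTypeToString above; the file name and message here are
+// hypothetical).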
+static std::string PrintTestPartResultToString(
+    const TestPartResult &test_part_result) {
+  return (Message() << internal::FormatFileLocation(
+                           test_part_result.file_name(),
+                           test_part_result.line_number())
+                    << " "
+                    << TestPartResultTypeToString(test_part_result.type())
+                    << test_part_result.message())
+      .GetString();
+}
+
+// Prints a TestPartResult.
+static void PrintTestPartResult(const TestPartResult &test_part_result) {
+  const std::string &result = PrintTestPartResultToString(test_part_result);
+  printf("%s\n", result.c_str());
+  fflush(stdout);
+  // If the test program runs in Visual Studio or a debugger, the
+  // following statements add the test part result message to the Output
+  // window such that the user can double-click on it to jump to the
+  // corresponding source code location; otherwise they do nothing.
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  // We don't call OutputDebugString*() on Windows Mobile, as printing
+  // to stdout is done by OutputDebugString() there already - we don't
+  // want the same message printed twice.
+  ::OutputDebugStringA(result.c_str());
+  ::OutputDebugStringA("\n");
+#endif
+}
+
+// class PrettyUnitTestResultPrinter
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+
+// Returns the character attribute for the given color.
+static WORD GetColorAttribute(GTestColor color) {
+  switch (color) {
+    case COLOR_RED: return FOREGROUND_RED;
+    case COLOR_GREEN: return FOREGROUND_GREEN;
+    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
+    default: return 0;
+  }
+}
+
+static int GetBitOffset(WORD color_mask) {
+  if (color_mask == 0) return 0;
+
+  int bitOffset = 0;
+  while ((color_mask & 1) == 0) {
+    color_mask >>= 1;
+    ++bitOffset;
+  }
+  return bitOffset;
+}
+
+static WORD GetNewColor(GTestColor color, WORD old_color_attrs) {
+  // Let's reuse the BG
+  static const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN |
+                                      BACKGROUND_RED | BACKGROUND_INTENSITY;
+  static const WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN |
+                                      FOREGROUND_RED | FOREGROUND_INTENSITY;
+  const WORD existing_bg = old_color_attrs & background_mask;
+
+  WORD new_color =
+      GetColorAttribute(color) | existing_bg | FOREGROUND_INTENSITY;
+  static const int bg_bitOffset = GetBitOffset(background_mask);
+  static const int fg_bitOffset = GetBitOffset(foreground_mask);
+
+  if (((new_color & background_mask) >> bg_bitOffset) ==
+      ((new_color & foreground_mask) >> fg_bitOffset)) {
+    new_color ^= FOREGROUND_INTENSITY;  // invert intensity
+  }
+  return new_color;
+}
+
+#else
+
+// Returns the ANSI color code for the given color. COLOR_DEFAULT is
+// an invalid input.
+static const char *GetAnsiColorCode(GTestColor color) {
+  switch (color) {
+    case COLOR_RED: return "1";
+    case COLOR_GREEN: return "2";
+    case COLOR_YELLOW: return "3";
+    default: return nullptr;
+  }
+}
+
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+
+// Returns true if and only if Google Test should use colors in the output.
+bool ShouldUseColor(bool stdout_is_tty) {
+  const char *const gtest_color = GTEST_FLAG(color).c_str();
+
+  if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+    // On Windows the TERM variable is usually not set, but the
+    // console there does support colors.
+    return stdout_is_tty;
+#else
+    // On non-Windows platforms, we rely on the TERM variable.
+    const char *const term = posix::GetEnv("TERM");
+    const bool term_supports_color =
+        String::CStringEquals(term, "xterm") ||
+        String::CStringEquals(term, "xterm-color") ||
+        String::CStringEquals(term, "xterm-256color") ||
+        String::CStringEquals(term, "screen") ||
+        String::CStringEquals(term, "screen-256color") ||
+        String::CStringEquals(term, "tmux") ||
+        String::CStringEquals(term, "tmux-256color") ||
+        String::CStringEquals(term, "rxvt-unicode") ||
+        String::CStringEquals(term, "rxvt-unicode-256color") ||
+        String::CStringEquals(term, "linux") ||
+        String::CStringEquals(term, "cygwin");
+    return stdout_is_tty && term_supports_color;
+#endif  // GTEST_OS_WINDOWS
+  }
+
+  return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
+         String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+         String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+         String::CStringEquals(gtest_color, "1");
+  // We take "yes", "true", "t", and "1" as meaning "yes". If the
+  // value is neither one of these nor "auto", we treat it as "no" to
+  // be conservative.
+}
+
+// Helpers for printing colored strings to stdout. Note that on Windows, we
+// cannot simply emit special characters and have the terminal change colors.
+// This routine must actually emit the characters rather than return a string
+// that would be colored when printed, as can be done on Linux.
+void ColoredPrintf(GTestColor color, const char *fmt, ...) {
+  va_list args;
+  va_start(args, fmt);
+
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
+    GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
+  const bool use_color = AlwaysFalse();
+#else
+  static const bool in_color_mode =
+      ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
+  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
+#endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS
+
+  if (!use_color) {
+    vprintf(fmt, args);
+    va_end(args);
+    return;
+  }
+
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+  const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
+
+  // Gets the current text color.
+  CONSOLE_SCREEN_BUFFER_INFO buffer_info;
+  GetConsoleScreenBufferInfo(stdout_handle, &buffer_info);
+  const WORD old_color_attrs = buffer_info.wAttributes;
+  const WORD new_color = GetNewColor(color, old_color_attrs);
+
+  // We need to flush the stream buffers into the console before each
+  // SetConsoleTextAttribute call lest it affect the text that is already
+  // printed but has not yet reached the console.
+  fflush(stdout);
+  SetConsoleTextAttribute(stdout_handle, new_color);
+
+  vprintf(fmt, args);
+
+  fflush(stdout);
+  // Restores the text color.
+  SetConsoleTextAttribute(stdout_handle, old_color_attrs);
+#else
+  printf("\033[0;3%sm", GetAnsiColorCode(color));
+  vprintf(fmt, args);
+  printf("\033[m");  // Resets the terminal to default.
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE
+  va_end(args);
+}
+
+// Text printed in Google Test's text output and --gtest_list_tests
+// output to label the type parameter and value parameter for a test.
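+// For example, a typed, value-parameterized test would be labeled roughly as
+// "FooTest.DoesBar, where TypeParam = int and GetParam() = 42" in the output
+// below (the suite name, test name, and values here are hypothetical).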
+static const char kTypeParamLabel[] = "TypeParam";
+static const char kValueParamLabel[] = "GetParam()";
+
+static void PrintFullTestCommentIfPresent(const TestInfo &test_info) {
+  const char *const type_param = test_info.type_param();
+  const char *const value_param = test_info.value_param();
+
+  if (type_param != nullptr || value_param != nullptr) {
+    printf(", where ");
+    if (type_param != nullptr) {
+      printf("%s = %s", kTypeParamLabel, type_param);
+      if (value_param != nullptr) printf(" and ");
+    }
+    if (value_param != nullptr) {
+      printf("%s = %s", kValueParamLabel, value_param);
+    }
+  }
+}
+
+// This class implements the TestEventListener interface.
+//
+// Class PrettyUnitTestResultPrinter is copyable.
+class PrettyUnitTestResultPrinter : public TestEventListener {
+ public:
+  PrettyUnitTestResultPrinter() {}
+  static void PrintTestName(const char *test_suite, const char *test) {
+    printf("%s.%s", test_suite, test);
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  void OnTestProgramStart(const UnitTest & /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest &unit_test, int iteration) override;
+  void OnEnvironmentsSetUpStart(const UnitTest &unit_test) override;
+  void OnEnvironmentsSetUpEnd(const UnitTest & /*unit_test*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestCase &test_case) override;
+#else
+  void OnTestSuiteStart(const TestSuite &test_suite) override;
+#endif  // OnTestCaseStart
+
+  void OnTestStart(const TestInfo &test_info) override;
+
+  void OnTestPartResult(const TestPartResult &result) override;
+  void OnTestEnd(const TestInfo &test_info) override;
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase &test_case) override;
+#else
+  void OnTestSuiteEnd(const TestSuite &test_suite) override;
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnEnvironmentsTearDownStart(const UnitTest &unit_test) override;
+  void OnEnvironmentsTearDownEnd(const UnitTest & /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest & /*unit_test*/) override {}
+
+ private:
+  static void PrintFailedTests(const UnitTest &unit_test);
+  static void PrintFailedTestSuites(const UnitTest &unit_test);
+  static void PrintSkippedTests(const UnitTest &unit_test);
+};
+
+// Fired before each iteration of tests starts.
+void PrettyUnitTestResultPrinter::OnTestIterationStart(
+    const UnitTest &unit_test, int iteration) {
+  if (GTEST_FLAG(repeat) != 1)
+    printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
+
+  const char *const filter = GTEST_FLAG(filter).c_str();
+
+  // Prints the filter if it's not *. This reminds the user that some
+  // tests may be skipped.
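+  // For example, running with --gtest_filter=FooTest.* prints approximately
+  //   "Note: Google Test filter = FooTest.*"
+  // since GTEST_NAME_ expands to "Google Test" (filter value hypothetical).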
+  if (!String::CStringEquals(filter, kUniversalFilter)) {
+    ColoredPrintf(COLOR_YELLOW, "Note: %s filter = %s\n", GTEST_NAME_, filter);
+  }
+
+  if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
+    const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
+    ColoredPrintf(COLOR_YELLOW, "Note: This is test shard %d of %s.\n",
+                  static_cast<int>(shard_index) + 1,
+                  internal::posix::GetEnv(kTestTotalShards));
+  }
+
+  if (GTEST_FLAG(shuffle)) {
+    ColoredPrintf(COLOR_YELLOW,
+                  "Note: Randomizing tests' orders with a seed of %d .\n",
+                  unit_test.random_seed());
+  }
+
+  ColoredPrintf(COLOR_GREEN, "[==========] ");
+  printf("Running %s from %s.\n",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
+  fflush(stdout);
+}
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
+    const UnitTest & /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("Global test environment set-up.\n");
+  fflush(stdout);
+}
+
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase &test_case) {
+  const std::string counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_case.name());
+  if (test_case.type_param() == nullptr) {
+    printf("\n");
+  } else {
+    printf(", where %s = %s\n", kTypeParamLabel, test_case.type_param());
+  }
+  fflush(stdout);
+}
+#else
+void PrettyUnitTestResultPrinter::OnTestSuiteStart(
+    const TestSuite &test_suite) {
+  const std::string counts =
+      FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s", counts.c_str(), test_suite.name());
+  if (test_suite.type_param() == nullptr) {
+    printf("\n");
+  } else {
+    printf(", where %s = %s\n", kTypeParamLabel, test_suite.type_param());
+  }
+  fflush(stdout);
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo &test_info) {
+  ColoredPrintf(COLOR_GREEN, "[ RUN      ] ");
+  PrintTestName(test_info.test_suite_name(), test_info.name());
+  printf("\n");
+  fflush(stdout);
+}
+
+// Called after an assertion failure.
+void PrettyUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult &result) {
+  switch (result.type()) {
+    // If the test part succeeded, we don't need to do anything.
+    case TestPartResult::kSuccess: return;
+    default:
+      // Print failure message from the assertion
+      // (e.g. expected this and got that).
+      PrintTestPartResult(result);
+      fflush(stdout);
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo &test_info) {
+  if (test_info.result()->Passed()) {
+    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
+  } else if (test_info.result()->Skipped()) {
+    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+  } else {
+    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+  }
+  PrintTestName(test_info.test_suite_name(), test_info.name());
+  if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info);
+
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms)\n",
+           internal::StreamableToString(test_info.result()->elapsed_time())
+               .c_str());
+  } else {
+    printf("\n");
+  }
+  fflush(stdout);
+}
+
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase &test_case) {
+  if (!GTEST_FLAG(print_time)) return;
+
+  const std::string counts =
+      FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_case.name(),
+         internal::StreamableToString(test_case.elapsed_time()).c_str());
+  fflush(stdout);
+}
+#else
+void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite &test_suite) {
+  if (!GTEST_FLAG(print_time)) return;
+
+  const std::string counts =
+      FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(),
+         internal::StreamableToString(test_suite.elapsed_time()).c_str());
+  fflush(stdout);
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
+    const UnitTest & /*unit_test*/) {
+  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  printf("Global test environment tear-down\n");
+  fflush(stdout);
+}
+
+// Internal helper for printing the list of failed tests.
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest &unit_test) {
+  const int failed_test_count = unit_test.failed_test_count();
+  ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+  printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
+
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite &test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run() || (test_suite.failed_test_count() == 0)) {
+      continue;
+    }
+    for (int j = 0; j < test_suite.total_test_count(); ++j) {
+      const TestInfo &test_info = *test_suite.GetTestInfo(j);
+      if (!test_info.should_run() || !test_info.result()->Failed()) {
+        continue;
+      }
+      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      printf("%s.%s", test_suite.name(), test_info.name());
+      PrintFullTestCommentIfPresent(test_info);
+      printf("\n");
+    }
+  }
+  printf("\n%2d FAILED %s\n", failed_test_count,
+         failed_test_count == 1 ? "TEST" : "TESTS");
+}
+
+// Internal helper for printing the list of test suite failures not covered by
+// PrintFailedTests.
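+// For example, a suite whose SetUpTestSuite() or TearDownTestSuite() failed
+// is reported roughly as
+//   "[  FAILED  ] MySuite: SetUpTestSuite or TearDownTestSuite"
+// even when every individual test in the suite passed (suite name
+// hypothetical).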
+void PrettyUnitTestResultPrinter::PrintFailedTestSuites(
+    const UnitTest &unit_test) {
+  int suite_failure_count = 0;
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite &test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run()) {
+      continue;
+    }
+    if (test_suite.ad_hoc_test_result().Failed()) {
+      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name());
+      ++suite_failure_count;
+    }
+  }
+  if (suite_failure_count > 0) {
+    printf("\n%2d FAILED TEST %s\n", suite_failure_count,
+           suite_failure_count == 1 ? "SUITE" : "SUITES");
+  }
+}
+
+// Internal helper for printing the list of skipped tests.
+void PrettyUnitTestResultPrinter::PrintSkippedTests(const UnitTest &unit_test) {
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count == 0) {
+    return;
+  }
+
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    const TestSuite &test_suite = *unit_test.GetTestSuite(i);
+    if (!test_suite.should_run() || (test_suite.skipped_test_count() == 0)) {
+      continue;
+    }
+    for (int j = 0; j < test_suite.total_test_count(); ++j) {
+      const TestInfo &test_info = *test_suite.GetTestInfo(j);
+      if (!test_info.should_run() || !test_info.result()->Skipped()) {
+        continue;
+      }
+      ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+      printf("%s.%s", test_suite.name(), test_info.name());
+      printf("\n");
+    }
+  }
+}
+
+void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
+                                                     int /*iteration*/) {
+  ColoredPrintf(COLOR_GREEN, "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms total)",
+           internal::StreamableToString(unit_test.elapsed_time()).c_str());
+  }
+  printf("\n");
+  ColoredPrintf(COLOR_GREEN, "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count > 0) {
+    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+    printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str());
+    PrintSkippedTests(unit_test);
+  }
+
+  if (!unit_test.Passed()) {
+    PrintFailedTests(unit_test);
+    PrintFailedTestSuites(unit_test);
+  }
+
+  int num_disabled = unit_test.reportable_disabled_test_count();
+  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+    if (unit_test.Passed()) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    ColoredPrintf(COLOR_YELLOW, "  YOU HAVE %d DISABLED %s\n\n", num_disabled,
+                  num_disabled == 1 ? "TEST" : "TESTS");
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End PrettyUnitTestResultPrinter
+
+// class TestEventRepeater
+//
+// This class forwards events to other event listeners.
+class TestEventRepeater : public TestEventListener {
+ public:
+  TestEventRepeater() : forwarding_enabled_(true) {}
+  ~TestEventRepeater() override;
+  void Append(TestEventListener *listener);
+  TestEventListener *Release(TestEventListener *listener);
+
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled() const { return forwarding_enabled_; }
+  void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; }
+
+  void OnTestProgramStart(const UnitTest &unit_test) override;
+  void OnTestIterationStart(const UnitTest &unit_test, int iteration) override;
+  void OnEnvironmentsSetUpStart(const UnitTest &unit_test) override;
+  void OnEnvironmentsSetUpEnd(const UnitTest &unit_test) override;
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestSuite &parameter) override;
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestSuiteStart(const TestSuite &parameter) override;
+  void OnTestStart(const TestInfo &test_info) override;
+  void OnTestPartResult(const TestPartResult &result) override;
+  void OnTestEnd(const TestInfo &test_info) override;
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase &parameter) override;
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestSuiteEnd(const TestSuite &parameter) override;
+  void OnEnvironmentsTearDownStart(const UnitTest &unit_test) override;
+  void OnEnvironmentsTearDownEnd(const UnitTest &unit_test) override;
+  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest &unit_test) override;
+
+ private:
+  // Controls whether events will be forwarded to listeners_. Set to false
+  // in death test child processes.
+  bool forwarding_enabled_;
+  // The list of listeners that receive events.
+  std::vector<TestEventListener *> listeners_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+};
+
+TestEventRepeater::~TestEventRepeater() {
+  ForEach(listeners_, Delete<TestEventListener>);
+}
+
+void TestEventRepeater::Append(TestEventListener *listener) {
+  listeners_.push_back(listener);
+}
+
+TestEventListener *TestEventRepeater::Release(TestEventListener *listener) {
+  for (size_t i = 0; i < listeners_.size(); ++i) {
+    if (listeners_[i] == listener) {
+      listeners_.erase(listeners_.begin() + static_cast<int>(i));
+      return listener;
+    }
+  }
+
+  return nullptr;
+}
+
+// Since most methods are very similar, use macros to reduce boilerplate.
+// This defines a member that forwards the call to all listeners.
+#define GTEST_REPEATER_METHOD_(Name, Type)              \
+  void TestEventRepeater::Name(const Type &parameter) { \
+    if (forwarding_enabled_) {                          \
+      for (size_t i = 0; i < listeners_.size(); i++) {  \
+        listeners_[i]->Name(parameter);                 \
+      }                                                 \
+    }                                                   \
+  }
+// This defines a member that forwards the call to all listeners in reverse
+// order.
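+// For example, GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo) expands
+// (roughly) to a TestEventRepeater::OnTestEnd(const TestInfo &) that walks
+// listeners_ from back to front, so "end" events unwind in the opposite
+// order of the corresponding "start" events.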
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type)      \
+  void TestEventRepeater::Name(const Type &parameter) { \
+    if (forwarding_enabled_) {                          \
+      for (size_t i = listeners_.size(); i != 0; i--) { \
+        listeners_[i - 1]->Name(parameter);             \
+      }                                                 \
+    }                                                   \
+  }
+
+GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest)
+GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest)
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite)
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite)
+GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
+GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo)
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestSuite)
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+GTEST_REVERSE_REPEATER_METHOD_(OnTestSuiteEnd, TestSuite)
+GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest)
+
+#undef GTEST_REPEATER_METHOD_
+#undef GTEST_REVERSE_REPEATER_METHOD_
+
+void TestEventRepeater::OnTestIterationStart(const UnitTest &unit_test,
+                                             int iteration) {
+  if (forwarding_enabled_) {
+    for (size_t i = 0; i < listeners_.size(); i++) {
+      listeners_[i]->OnTestIterationStart(unit_test, iteration);
+    }
+  }
+}
+
+void TestEventRepeater::OnTestIterationEnd(const UnitTest &unit_test,
+                                           int iteration) {
+  if (forwarding_enabled_) {
+    for (size_t i = listeners_.size(); i > 0; i--) {
+      listeners_[i - 1]->OnTestIterationEnd(unit_test, iteration);
+    }
+  }
+}
+
+// End TestEventRepeater
+
+// This class generates an XML output file.
+class XmlUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit XmlUnitTestResultPrinter(const char *output_file);
+
+  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+  void ListTestsMatchingFilter(const std::vector<TestSuite *> &test_suites);
+
+  // Prints an XML summary of all unit tests.
+  static void PrintXmlTestsList(std::ostream *stream,
+                                const std::vector<TestSuite *> &test_suites);
+
+ private:
+  // Is c a whitespace character that is normalized to a space character
+  // when it appears in an XML attribute value?
+  static bool IsNormalizableWhitespace(char c) {
+    return c == 0x9 || c == 0xA || c == 0xD;
+  }
+
+  // May c appear in a well-formed XML document?
+  static bool IsValidXmlCharacter(char c) {
+    return IsNormalizableWhitespace(c) || c >= 0x20;
+  }
+
+  // Returns an XML-escaped copy of the input string str. If
+  // is_attribute is true, the text is meant to appear as an attribute
+  // value, and normalizable whitespace is preserved by replacing it
+  // with character references.
+  static std::string EscapeXml(const std::string &str, bool is_attribute);
+
+  // Returns the given string with all characters invalid in XML removed.
+  static std::string RemoveInvalidXmlCharacters(const std::string &str);
+
+  // Convenience wrapper around EscapeXml when str is an attribute value.
+  static std::string EscapeXmlAttribute(const std::string &str) {
+    return EscapeXml(str, true);
+  }
+
+  // Convenience wrapper around EscapeXml when str is not an attribute value.
+  static std::string EscapeXmlText(const char *str) {
+    return EscapeXml(str, false);
+  }
+
+  // Verifies that the given attribute belongs to the given element and
+  // streams the attribute as XML.
+  static void OutputXmlAttribute(std::ostream *stream,
+                                 const std::string &element_name,
+                                 const std::string &name,
+                                 const std::string &value);
+
+  // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+  static void OutputXmlCDataSection(::std::ostream *stream, const char *data);
+
+  // Streams an XML representation of a TestInfo object.
+  static void OutputXmlTestInfo(::std::ostream *stream,
+                                const char *test_suite_name,
+                                const TestInfo &test_info);
+
+  // Prints an XML representation of a TestSuite object
+  static void PrintXmlTestSuite(::std::ostream *stream,
+                                const TestSuite &test_suite);
+
+  // Prints an XML summary of unit_test to output stream out.
+  static void PrintXmlUnitTest(::std::ostream *stream,
+                               const UnitTest &unit_test);
+
+  // Produces a string representing the test properties in a result as space
+  // delimited XML attributes based on the property key="value" pairs.
+  // When the std::string is not empty, it includes a space at the beginning,
+  // to delimit this attribute from prior attributes.
+  static std::string TestPropertiesAsXmlAttributes(const TestResult &result);
+
+  // Streams an XML representation of the test properties of a TestResult
+  // object.
+  static void OutputXmlTestProperties(std::ostream *stream,
+                                      const TestResult &result);
+
+  // The output file.
+  const std::string output_file_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+};
+
+// Creates a new XmlUnitTestResultPrinter.
+XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char *output_file)
+    : output_file_(output_file) {
+  if (output_file_.empty()) {
+    GTEST_LOG_(FATAL) << "XML output file may not be null";
+  }
+}
+
+// Called after the unit test ends.
+void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
+                                                  int /*iteration*/) {
+  FILE *xmlout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintXmlUnitTest(&stream, unit_test);
+  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
+  fclose(xmlout);
+}
+
+void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
+    const std::vector<TestSuite *> &test_suites) {
+  FILE *xmlout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintXmlTestsList(&stream, test_suites);
+  fprintf(xmlout, "%s", StringStreamToString(&stream).c_str());
+  fclose(xmlout);
+}
+
+// Returns an XML-escaped copy of the input string str. If is_attribute
+// is true, the text is meant to appear as an attribute value, and
+// normalizable whitespace is preserved by replacing it with character
+// references.
+//
+// Invalid XML characters in str, if any, are stripped from the output.
+// It is expected that most, if not all, of the text processed by this
+// module will consist of ordinary English text.
+// If this module is ever modified to produce version 1.1 XML output,
+// most invalid characters can be retained using character references.
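+// For example (illustrative input): EscapeXml("a<b & \"c\"", true) yields
+// "a&lt;b &amp; &quot;c&quot;", while with is_attribute == false the quotes
+// are passed through unchanged.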
+std::string XmlUnitTestResultPrinter::EscapeXml(const std::string &str,
+                                                bool is_attribute) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '<': m << "&lt;"; break;
+      case '>': m << "&gt;"; break;
+      case '&': m << "&amp;"; break;
+      case '\'':
+        if (is_attribute)
+          m << "&apos;";
+        else
+          m << '\'';
+        break;
+      case '"':
+        if (is_attribute)
+          m << "&quot;";
+        else
+          m << '"';
+        break;
+      default:
+        if (IsValidXmlCharacter(ch)) {
+          if (is_attribute && IsNormalizableWhitespace(ch))
+            m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
+              << ";";
+          else
+            m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// Returns the given string with all characters invalid in XML removed.
+// Currently invalid characters are dropped from the string. An
+// alternative is to replace them with certain characters such as . or ?.
+std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
+    const std::string &str) {
+  std::string output;
+  output.reserve(str.size());
+  for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
+    if (IsValidXmlCharacter(*it)) output.push_back(*it);
+
+  return output;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+// GOOGLETEST_CM0009 DO NOT DELETE
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestSuite object
+//     <testcase name="test-name">     <-- corresponds to a TestInfo object
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//                                     <-- individual assertion failures
+//     </testcase>
+//   </testsuite>
+// </testsuites>
+
+// Formats the given time in milliseconds as seconds.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << (static_cast<double>(ms) * 1e-3);
+  return ss.str();
+}
+
+static bool PortableLocaltime(time_t seconds, struct tm *out) {
+#if defined(_MSC_VER)
+  return localtime_s(out, &seconds) == 0;
+#elif defined(__MINGW32__) || defined(__MINGW64__)
+  // MINGW <time.h> provides neither localtime_r nor localtime_s, but uses
+  // Windows' localtime(), which has a thread-local tm buffer.
+  struct tm *tm_ptr = localtime(&seconds);  // NOLINT
+  if (tm_ptr == nullptr) return false;
+  *out = *tm_ptr;
+  return true;
+#else
+  return localtime_r(&seconds, out) != nullptr;
+#endif
+}
+
+// Converts the given epoch time in milliseconds to a date string in the ISO
+// 8601 format, without the timezone information.
+std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
+  struct tm time_struct;
+  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+    return "";
+  // YYYY-MM-DDThh:mm:ss
+  return StreamableToString(time_struct.tm_year + 1900) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+         String::FormatIntWidth2(time_struct.tm_min) + ":" +
+         String::FormatIntWidth2(time_struct.tm_sec);
+}
+
+// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
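+// A CDATA section cannot contain its own terminator "]]>", so any occurrence
+// in data is split across two sections; e.g. "x]]>y" is emitted roughly as
+//   <![CDATA[x]]>]]&gt;<![CDATA[y]]>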
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream *stream,
+                                                     const char *data) {
+  const char *segment = data;
+  *stream << "<![CDATA[";
+  for (;;) {
+    const char *const next_segment = strstr(segment, "]]>");
+    if (next_segment != nullptr) {
+      stream->write(segment,
+                    static_cast<std::streamsize>(next_segment - segment));
+      *stream << "]]>]]&gt;<![CDATA[";
+      segment = next_segment + strlen("]]>");
+    } else {
+      *stream << segment;
+      break;
+    }
+  }
+  *stream << "]]>";
+}
+
+void XmlUnitTestResultPrinter::OutputXmlAttribute(
+    std::ostream *stream, const std::string &element_name,
+    const std::string &name, const std::string &value) {
+  const std::vector<std::string> &allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Attribute " << name << " is not allowed for element <" << element_name
+      << ">.";
+
+  *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
+}
+
+// Prints an XML representation of a TestInfo object.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream *stream,
+                                                 const char *test_suite_name,
+                                                 const TestInfo &test_info) {
+  const TestResult &result = *test_info.result();
+  const std::string kTestsuite = "testcase";
+
+  if (test_info.is_in_another_shard()) {
+    return;
+  }
+
+  *stream << "    <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_info.name());
+
+  if (test_info.value_param() != nullptr) {
+    OutputXmlAttribute(stream, kTestsuite, "value_param",
+                       test_info.value_param());
+  }
+  if (test_info.type_param() != nullptr) {
+    OutputXmlAttribute(stream, kTestsuite, "type_param",
+                       test_info.type_param());
+  }
+  if (GTEST_FLAG(list_tests)) {
+    OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
+    OutputXmlAttribute(stream, kTestsuite, "line",
+                       StreamableToString(test_info.line()));
+    *stream << " />\n";
+    return;
+  }
+
+  OutputXmlAttribute(stream, kTestsuite, "status",
+                     test_info.should_run() ? "run" : "notrun");
+  OutputXmlAttribute(stream, kTestsuite, "result",
+                     test_info.should_run()
+                         ? (result.Skipped() ? "skipped" : "completed")
+                         : "suppressed");
+  OutputXmlAttribute(stream, kTestsuite, "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(
+      stream, kTestsuite, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+  OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult &part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      if (++failures == 1) {
+        *stream << ">\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string summary = location + "\n" + part.summary();
+      *stream << "      <failure message=\"" << EscapeXmlAttribute(summary)
+              << "\" type=\"\">";
+      const std::string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</failure>\n";
+    }
+  }
+
+  if (failures == 0 && result.test_property_count() == 0) {
+    *stream << " />\n";
+  } else {
+    if (failures == 0) {
+      *stream << ">\n";
+    }
+    OutputXmlTestProperties(stream, result);
+    *stream << "    </" << kTestsuite << ">\n";
+  }
+}
+
+// Prints an XML representation of a TestSuite object
+void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream *stream,
+                                                 const TestSuite &test_suite) {
+  const std::string kTestsuite = "testsuite";
+  *stream << "  <" << kTestsuite;
+  OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name());
+  OutputXmlAttribute(stream, kTestsuite, "tests",
+                     StreamableToString(test_suite.reportable_test_count()));
+  if (!GTEST_FLAG(list_tests)) {
+    OutputXmlAttribute(stream, kTestsuite, "failures",
+                       StreamableToString(test_suite.failed_test_count()));
+    OutputXmlAttribute(
+        stream, kTestsuite, "disabled",
+        StreamableToString(test_suite.reportable_disabled_test_count()));
+    OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+    OutputXmlAttribute(stream, kTestsuite, "time",
+                       FormatTimeInMillisAsSeconds(test_suite.elapsed_time()));
+    OutputXmlAttribute(
+        stream, kTestsuite, "timestamp",
+        FormatEpochTimeInMillisAsIso8601(test_suite.start_timestamp()));
+    *stream << TestPropertiesAsXmlAttributes(test_suite.ad_hoc_test_result());
+  }
+  *stream << ">\n";
+  for (int i = 0; i < test_suite.total_test_count(); ++i) {
+    if (test_suite.GetTestInfo(i)->is_reportable())
+      OutputXmlTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i));
+  }
+  *stream << "  </" << kTestsuite << ">\n";
+}
+
+// Prints an XML summary of unit_test to output stream out.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream *stream,
+                                                const UnitTest &unit_test) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(unit_test.reportable_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "failures",
+                     StreamableToString(unit_test.failed_test_count()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "disabled",
+      StreamableToString(unit_test.reportable_disabled_test_count()));
+  OutputXmlAttribute(stream, kTestsuites, "errors", "0");
+  OutputXmlAttribute(stream, kTestsuites, "time",
+                     FormatTimeInMillisAsSeconds(unit_test.elapsed_time()));
+  OutputXmlAttribute(
+      stream, kTestsuites, "timestamp",
+      FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
+
+  if (GTEST_FLAG(shuffle)) {
+    OutputXmlAttribute(stream, kTestsuites, "random_seed",
+                       StreamableToString(unit_test.random_seed()));
+  }
+  *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result());
+
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    if (unit_test.GetTestSuite(i)->reportable_test_count() > 0)
+      PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i));
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+void XmlUnitTestResultPrinter::PrintXmlTestsList(
+    std::ostream *stream, const std::vector<TestSuite *> &test_suites) {
+  const std::string kTestsuites = "testsuites";
+
+  *stream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+  *stream << "<" << kTestsuites;
+
+  int total_tests = 0;
+  for (auto test_suite : test_suites) {
+    total_tests += test_suite->total_test_count();
+  }
+  OutputXmlAttribute(stream, kTestsuites, "tests",
+                     StreamableToString(total_tests));
+  OutputXmlAttribute(stream, kTestsuites, "name", "AllTests");
+  *stream << ">\n";
+
+  for (auto test_suite : test_suites) {
+    PrintXmlTestSuite(stream, *test_suite);
+  }
+  *stream << "</" << kTestsuites << ">\n";
+}
+
+// Produces a string representing the test properties in a result as space
+// delimited XML attributes based on the property key="value" pairs.
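+// For example, a test that called RecordProperty("bug", "1234") contributes
+// the attribute string " bug=\"1234\"" (note the leading space; the key and
+// value here are hypothetical).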
+std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
+    const TestResult &result) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty &property = result.GetTestProperty(i);
+    attributes << " " << property.key() << "="
+               << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+void XmlUnitTestResultPrinter::OutputXmlTestProperties(
+    std::ostream *stream, const TestResult &result) {
+  const std::string kProperties = "properties";
+  const std::string kProperty = "property";
+
+  if (result.test_property_count() <= 0) {
+    return;
+  }
+
+  *stream << "<" << kProperties << ">\n";
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty &property = result.GetTestProperty(i);
+    *stream << "<" << kProperty;
+    *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\"";
+    *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\"";
+    *stream << "/>\n";
+  }
+  *stream << "</" << kProperties << ">\n";
+}
+
+// End XmlUnitTestResultPrinter
+
+// This class generates a JSON output file.
+class JsonUnitTestResultPrinter : public EmptyTestEventListener {
+ public:
+  explicit JsonUnitTestResultPrinter(const char *output_file);
+
+  void OnTestIterationEnd(const UnitTest &unit_test, int iteration) override;
+
+  // Prints a JSON summary of all unit tests.
+  static void PrintJsonTestList(::std::ostream *stream,
+                                const std::vector<TestSuite *> &test_suites);
+
+ private:
+  // Returns a JSON-escaped copy of the input string str.
+  static std::string EscapeJson(const std::string &str);
+
+  // Verifies that the given attribute belongs to the given element and
+  // streams the attribute as JSON.
+  static void OutputJsonKey(std::ostream *stream,
+                            const std::string &element_name,
+                            const std::string &name, const std::string &value,
+                            const std::string &indent, bool comma = true);
+  static void OutputJsonKey(std::ostream *stream,
+                            const std::string &element_name,
+                            const std::string &name, int value,
+                            const std::string &indent, bool comma = true);
+
+  // Streams a JSON representation of a TestInfo object.
+  static void OutputJsonTestInfo(::std::ostream *stream,
+                                 const char *test_suite_name,
+                                 const TestInfo &test_info);
+
+  // Prints a JSON representation of a TestSuite object
+  static void PrintJsonTestSuite(::std::ostream *stream,
+                                 const TestSuite &test_suite);
+
+  // Prints a JSON summary of unit_test to output stream out.
+  static void PrintJsonUnitTest(::std::ostream *stream,
+                                const UnitTest &unit_test);
+
+  // Produces a string representing the test properties in a result as
+  // a JSON dictionary.
+  static std::string TestPropertiesAsJson(const TestResult &result,
+                                          const std::string &indent);
+
+  // The output file.
+  const std::string output_file_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter);
+};
+
+// Creates a new JsonUnitTestResultPrinter.
+JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char *output_file)
+    : output_file_(output_file) {
+  if (output_file_.empty()) {
+    GTEST_LOG_(FATAL) << "JSON output file may not be null";
+  }
+}
+
+void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest &unit_test,
+                                                   int /*iteration*/) {
+  FILE *jsonout = OpenFileForWriting(output_file_);
+  std::stringstream stream;
+  PrintJsonUnitTest(&stream, unit_test);
+  fprintf(jsonout, "%s", StringStreamToString(&stream).c_str());
+  fclose(jsonout);
+}
+
+// Returns a JSON-escaped copy of the input string str.
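+// For example, a double quote becomes \", a newline becomes \n, and control
+// characters below 0x20 that lack a shorthand become \u00XX escapes.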
+std::string JsonUnitTestResultPrinter::EscapeJson(const std::string &str) {
+  Message m;
+
+  for (size_t i = 0; i < str.size(); ++i) {
+    const char ch = str[i];
+    switch (ch) {
+      case '\\':
+      case '"':
+      case '/': m << '\\' << ch; break;
+      case '\b': m << "\\b"; break;
+      case '\t': m << "\\t"; break;
+      case '\n': m << "\\n"; break;
+      case '\f': m << "\\f"; break;
+      case '\r': m << "\\r"; break;
+      default:
+        if (ch < ' ') {
+          m << "\\u00" << String::FormatByte(static_cast<unsigned char>(ch));
+        } else {
+          m << ch;
+        }
+        break;
+    }
+  }
+
+  return m.GetString();
+}
+
+// The following routines generate a JSON representation of a UnitTest
+// object.
+
+// Formats the given time in milliseconds as a duration in seconds,
+// e.g. "0.005s".
+static std::string FormatTimeInMillisAsDuration(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << (static_cast<double>(ms) * 1e-3) << "s";
+  return ss.str();
+}
+
+// Converts the given epoch time in milliseconds to a date string in the
+// RFC3339 format, without the timezone information.
+static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
+  struct tm time_struct;
+  if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
+    return "";
+  // YYYY-MM-DDThh:mm:ss
+  return StreamableToString(time_struct.tm_year + 1900) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+         String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+         String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+         String::FormatIntWidth2(time_struct.tm_min) + ":" +
+         String::FormatIntWidth2(time_struct.tm_sec) + "Z";
+}
+
+static inline std::string Indent(size_t width) {
+  return std::string(width, ' ');
+}
+
+void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream *stream,
+                                              const std::string &element_name,
+                                              const std::string &name,
+                                              const std::string &value,
+                                              const std::string &indent,
+                                              bool comma) {
+  const std::vector<std::string> &allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Key \"" << name << "\" is not allowed for value \"" << element_name
+      << "\".";
+
+  *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\"";
+  if (comma) *stream << ",\n";
+}
+
+void JsonUnitTestResultPrinter::OutputJsonKey(
+    std::ostream *stream, const std::string &element_name,
+    const std::string &name, int value, const std::string &indent, bool comma) {
+  const std::vector<std::string> &allowed_names =
+      GetReservedOutputAttributesForElement(element_name);
+
+  GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
+               allowed_names.end())
+      << "Key \"" << name << "\" is not allowed for value \"" << element_name
+      << "\".";
+
+  *stream << indent << "\"" << name << "\": " << StreamableToString(value);
+  if (comma) *stream << ",\n";
+}
+
+// Prints a JSON representation of a TestInfo object.
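+// The emitted object has roughly the shape
+//   { "name": "...", "status": "RUN", "result": "COMPLETED",
+//     "timestamp": "...", "time": "0.005s", "classname": "...",
+//     "failures": [ ... ] }
+// where "failures" is present only when the test recorded failed parts
+// (values shown are illustrative).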
+void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream *stream,
+                                                   const char *test_suite_name,
+                                                   const TestInfo &test_info) {
+  const TestResult &result = *test_info.result();
+  const std::string kTestsuite = "testcase";
+  const std::string kIndent = Indent(10);
+
+  *stream << Indent(8) << "{\n";
+  OutputJsonKey(stream, kTestsuite, "name", test_info.name(), kIndent);
+
+  if (test_info.value_param() != nullptr) {
+    OutputJsonKey(stream, kTestsuite, "value_param", test_info.value_param(),
+                  kIndent);
+  }
+  if (test_info.type_param() != nullptr) {
+    OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(),
+                  kIndent);
+  }
+  if (GTEST_FLAG(list_tests)) {
+    OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
+    OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
+    *stream << "\n" << Indent(8) << "}";
+    return;
+  }
+
+  OutputJsonKey(stream, kTestsuite, "status",
+                test_info.should_run() ? "RUN" : "NOTRUN", kIndent);
+  OutputJsonKey(stream, kTestsuite, "result",
+                test_info.should_run()
+                    ? (result.Skipped() ? "SKIPPED" : "COMPLETED")
+                    : "SUPPRESSED",
+                kIndent);
+  OutputJsonKey(stream, kTestsuite, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                kIndent);
+  OutputJsonKey(stream, kTestsuite, "time",
+                FormatTimeInMillisAsDuration(result.elapsed_time()), kIndent);
+  OutputJsonKey(stream, kTestsuite, "classname", test_suite_name, kIndent,
+                false);
+  *stream << TestPropertiesAsJson(result, kIndent);
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult &part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      *stream << ",\n";
+      if (++failures == 1) {
+        *stream << kIndent << "\""
+                << "failures"
+                << "\": [\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string message = EscapeJson(location + "\n" + part.message());
+      *stream << kIndent << "  {\n"
+              << kIndent << "    \"failure\": \"" << message << "\",\n"
+              << kIndent << "    \"type\": \"\"\n"
+              << kIndent << "  }";
+    }
+  }
+
+  if (failures > 0) *stream << "\n" << kIndent << "]";
+  *stream << "\n" << Indent(8) << "}";
+}
+
+// Prints a JSON representation of a TestSuite object
+void JsonUnitTestResultPrinter::PrintJsonTestSuite(
+    std::ostream *stream, const TestSuite &test_suite) {
+  const std::string kTestsuite = "testsuite";
+  const std::string kIndent = Indent(6);
+
+  *stream << Indent(4) << "{\n";
+  OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent);
+  OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(),
+                kIndent);
+  if (!GTEST_FLAG(list_tests)) {
+    OutputJsonKey(stream, kTestsuite, "failures",
+                  test_suite.failed_test_count(), kIndent);
+    OutputJsonKey(stream, kTestsuite, "disabled",
+                  test_suite.reportable_disabled_test_count(), kIndent);
+    OutputJsonKey(stream, kTestsuite, "errors", 0, kIndent);
+    OutputJsonKey(
+        stream, kTestsuite, "timestamp",
+        FormatEpochTimeInMillisAsRFC3339(test_suite.start_timestamp()),
+        kIndent);
+    OutputJsonKey(stream, kTestsuite, "time",
+                  FormatTimeInMillisAsDuration(test_suite.elapsed_time()),
+                  kIndent, false);
+    *stream << TestPropertiesAsJson(test_suite.ad_hoc_test_result(), kIndent)
+            << ",\n";
+  }
+
+  *stream << kIndent << "\"" << kTestsuite << "\": [\n";
+
+  bool comma = false;
+  for (int i = 0; i < test_suite.total_test_count(); ++i) {
+    if (test_suite.GetTestInfo(i)->is_reportable()) {
+      if (comma) {
+        *stream << ",\n";
+      } else {
+        comma = true;
+      }
+      OutputJsonTestInfo(stream, test_suite.name(), *test_suite.GetTestInfo(i));
+    }
+  }
+  *stream << "\n" << kIndent << "]\n" << Indent(4) << "}";
+}
+
+// Prints a JSON summary of unit_test to output stream out.
+void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream *stream,
+                                                  const UnitTest &unit_test) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+
+  OutputJsonKey(stream, kTestsuites, "tests", unit_test.reportable_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "failures", unit_test.failed_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "disabled",
+                unit_test.reportable_disabled_test_count(), kIndent);
+  OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent);
+  if (GTEST_FLAG(shuffle)) {
+    OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(),
+                  kIndent);
+  }
+  OutputJsonKey(stream, kTestsuites, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(unit_test.start_timestamp()),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "time",
+                FormatTimeInMillisAsDuration(unit_test.elapsed_time()), kIndent,
+                false);
+
+  *stream << TestPropertiesAsJson(unit_test.ad_hoc_test_result(), kIndent)
+          << ",\n";
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  bool comma = false;
+  for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
+    if (unit_test.GetTestSuite(i)->reportable_test_count() > 0) {
+      if (comma) {
+        *stream << ",\n";
+      } else {
+        comma = true;
+      }
+      PrintJsonTestSuite(stream, *unit_test.GetTestSuite(i));
+    }
+  }
+
+  *stream << "\n"
+          << kIndent << "]\n"
+          << "}\n";
+}
+
+void JsonUnitTestResultPrinter::PrintJsonTestList(
+    std::ostream *stream, const std::vector<TestSuite *> &test_suites) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+  int total_tests = 0;
+  for (auto test_suite : test_suites) {
+    total_tests += test_suite->total_test_count();
+  }
+  OutputJsonKey(stream, kTestsuites, "tests", total_tests, kIndent);
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  for (size_t i = 0; i < test_suites.size(); ++i) {
+    if (i != 0) {
+      *stream << ",\n";
+    }
+    PrintJsonTestSuite(stream, *test_suites[i]);
+  }
+
+  *stream << "\n"
+          << kIndent << "]\n"
+          << "}\n";
+}
+
+// Produces a string representing the test properties in a result as
+// a JSON dictionary.
+std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
+    const TestResult &result, const std::string &indent) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty &property = result.GetTestProperty(i);
+    attributes << ",\n"
+               << indent << "\"" << property.key() << "\": "
+               << "\"" << EscapeJson(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+// End JsonUnitTestResultPrinter
+
+#if GTEST_CAN_STREAM_RESULTS_
+
+// Checks if str contains '=', '&', '%' or '\n' characters. If yes,
+// replaces them by "%xx" where xx is their hexadecimal value. For
+// example, replaces "=" with "%3D". This algorithm is O(strlen(str))
+// in both time and space -- important as the input str may contain an
+// arbitrarily long test failure message and stack trace.
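+// For example (illustrative input): UrlEncode("a=b&c%d") returns
+// "a%3Db%26c%25d".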
+std::string StreamingListener::UrlEncode(const char *str) {
+  std::string result;
+  result.reserve(strlen(str) + 1);
+  for (char ch = *str; ch != '\0'; ch = *++str) {
+    switch (ch) {
+      case '%':
+      case '=':
+      case '&':
+      case '\n':
+        result.append("%" + String::FormatByte(static_cast<unsigned char>(ch)));
+        break;
+      default: result.push_back(ch); break;
+    }
+  }
+  return result;
+}
+
+void StreamingListener::SocketWriter::MakeConnection() {
+  GTEST_CHECK_(sockfd_ == -1)
+      << "MakeConnection() can't be called when there is already a connection.";
+
+  addrinfo hints;
+  memset(&hints, 0, sizeof(hints));
+  hints.ai_family = AF_UNSPEC;  // To allow both IPv4 and IPv6 addresses.
+  hints.ai_socktype = SOCK_STREAM;
+  addrinfo *servinfo = nullptr;
+
+  // Use getaddrinfo() to get a linked list of IP addresses for
+  // the given host name.
+  const int error_num =
+      getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+  if (error_num != 0) {
+    GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
+                        << gai_strerror(error_num);
+  }
+
+  // Loop through all the results and connect to the first we can.
+  for (addrinfo *cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr;
+       cur_addr = cur_addr->ai_next) {
+    sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype,
+                     cur_addr->ai_protocol);
+    if (sockfd_ != -1) {
+      // Connect the client socket to the server socket.
+      if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
+        close(sockfd_);
+        sockfd_ = -1;
+      }
+    }
+  }
+
+  freeaddrinfo(servinfo);  // all done with this structure
+
+  if (sockfd_ == -1) {
+    GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to "
+                        << host_name_ << ":" << port_num_;
+  }
+}
+
+// End of class Streaming Listener
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// class OsStackTraceGetter
+
+const char *const OsStackTraceGetterInterface::kElidedFramesMarker =
+    "... " GTEST_NAME_ " internal frames ...";
+
+std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+#if GTEST_HAS_ABSL
+  std::string result;
+
+  if (max_depth <= 0) {
+    return result;
+  }
+
+  max_depth = std::min(max_depth, kMaxStackTraceDepth);
+
+  std::vector<void *> raw_stack(max_depth);
+  // Skips the frames requested by the caller, plus this function.
+  const int raw_stack_size =
+      absl::GetStackTrace(&raw_stack[0], max_depth, skip_count + 1);
+
+  void *caller_frame = nullptr;
+  {
+    MutexLock lock(&mutex_);
+    caller_frame = caller_frame_;
+  }
+
+  for (int i = 0; i < raw_stack_size; ++i) {
+    if (raw_stack[i] == caller_frame &&
+        !GTEST_FLAG(show_internal_stack_frames)) {
+      // Add a marker to the trace and stop adding frames.
+      absl::StrAppend(&result, kElidedFramesMarker, "\n");
+      break;
+    }
+
+    char tmp[1024];
+    const char *symbol = "(unknown)";
+    if (absl::Symbolize(raw_stack[i], tmp, sizeof(tmp))) {
+      symbol = tmp;
+    }
+
+    char line[1024];
+    snprintf(line, sizeof(line), "  %p: %s\n", raw_stack[i], symbol);
+    result += line;
+  }
+
+  return result;
+
+#else   // !GTEST_HAS_ABSL
+  static_cast<void>(max_depth);
+  static_cast<void>(skip_count);
+  return "";
+#endif  // GTEST_HAS_ABSL
+}
+
+void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
+#if GTEST_HAS_ABSL
+  void *caller_frame = nullptr;
+  if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) {
+    caller_frame = nullptr;
+  }
+
+  MutexLock lock(&mutex_);
+  caller_frame_ = caller_frame;
+#endif  // GTEST_HAS_ABSL
+}
+
+// A helper class that creates the premature-exit file in its
+// constructor and deletes the file in its destructor.
+class ScopedPrematureExitFile {
+ public:
+  explicit ScopedPrematureExitFile(const char *premature_exit_filepath)
+      : premature_exit_filepath_(
+            premature_exit_filepath ? premature_exit_filepath : "") {
+    // If a path to the premature-exit file is specified...
+    if (!premature_exit_filepath_.empty()) {
+      // create the file with a single "0" character in it. I/O
+      // errors are ignored as there's nothing better we can do and we
+      // don't want to fail the test because of this.
+      FILE *pfile = posix::FOpen(premature_exit_filepath, "w");
+      fwrite("0", 1, 1, pfile);
+      fclose(pfile);
+    }
+  }
+
+  ~ScopedPrematureExitFile() {
+#if !defined GTEST_OS_ESP8266
+    if (!premature_exit_filepath_.empty()) {
+      int retval = remove(premature_exit_filepath_.c_str());
+      if (retval) {
+        GTEST_LOG_(ERROR) << "Failed to remove premature exit filepath \""
+                          << premature_exit_filepath_ << "\" with error "
+                          << retval;
+      }
+    }
+#endif
+  }
+
+ private:
+  const std::string premature_exit_filepath_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
+};
+
+}  // namespace internal
+
+// class TestEventListeners
+
+TestEventListeners::TestEventListeners()
+    : repeater_(new internal::TestEventRepeater()),
+      default_result_printer_(nullptr), default_xml_generator_(nullptr) {}
+
+TestEventListeners::~TestEventListeners() { delete repeater_; }
+
+// Appends an event listener to the end of the list. Google Test assumes
+// ownership of the listener (i.e. it will delete the listener when the
+// test program finishes).
+void TestEventListeners::Append(TestEventListener *listener) {
+  repeater_->Append(listener);
+}
+
+// Removes the given event listener from the list and returns it. It then
+// becomes the caller's responsibility to delete the listener. Returns
+// NULL if the listener is not found in the list.
+TestEventListener *TestEventListeners::Release(TestEventListener *listener) {
+  if (listener == default_result_printer_)
+    default_result_printer_ = nullptr;
+  else if (listener == default_xml_generator_)
+    default_xml_generator_ = nullptr;
+  return repeater_->Release(listener);
+}
+
+// Returns the repeater that broadcasts the TestEventListener events to all
+// subscribers.
+TestEventListener *TestEventListeners::repeater() { return repeater_; }
+
+// Sets the default_result_printer attribute to the provided listener.
+// The listener is also added to the listener list and previous
+// default_result_printer is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultResultPrinter(TestEventListener *listener) {
+  if (default_result_printer_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_result_printer_);
+    default_result_printer_ = listener;
+    if (listener != nullptr) Append(listener);
+  }
+}
+
+// Sets the default_xml_generator attribute to the provided listener. The
+// listener is also added to the listener list and previous
+// default_xml_generator is removed from it and deleted. The listener can
+// also be NULL in which case it will not be added to the list. Does
+// nothing if the previous and the current listener objects are the same.
+void TestEventListeners::SetDefaultXmlGenerator(TestEventListener *listener) {
+  if (default_xml_generator_ != listener) {
+    // It is an error to pass this method a listener that is already in the
+    // list.
+    delete Release(default_xml_generator_);
+    default_xml_generator_ = listener;
+    if (listener != nullptr) Append(listener);
+  }
+}
+
+// Controls whether events will be forwarded by the repeater to the
+// listeners in the list.
+bool TestEventListeners::EventForwardingEnabled() const {
+  return repeater_->forwarding_enabled();
+}
+
+void TestEventListeners::SuppressEventForwarding() {
+  repeater_->set_forwarding_enabled(false);
+}
+
+// class UnitTest
+
+// Gets the singleton UnitTest object.  The first time this method is
+// called, a UnitTest object is constructed and returned.  Consecutive
+// calls will return the same object.
+//
+// We don't protect this under mutex_ as a user is not supposed to
+// call this before main() starts, from which point on the return
+// value will never change.
+UnitTest *UnitTest::GetInstance() {
+  // CodeGear C++Builder insists on a public destructor for the
+  // default implementation.  Use this implementation to keep good OO
+  // design with private destructor.
+
+#if defined(__BORLANDC__)
+  static UnitTest *const instance = new UnitTest;
+  return instance;
+#else
+  static UnitTest instance;
+  return &instance;
+#endif  // defined(__BORLANDC__)
+}
+
+// Gets the number of successful test suites.
+int UnitTest::successful_test_suite_count() const {
+  return impl()->successful_test_suite_count();
+}
+
+// Gets the number of failed test suites.
+int UnitTest::failed_test_suite_count() const {
+  return impl()->failed_test_suite_count();
+}
+
+// Gets the number of all test suites.
+int UnitTest::total_test_suite_count() const {
+  return impl()->total_test_suite_count();
+}
+
+// Gets the number of all test suites that contain at least one test
+// that should run.
+int UnitTest::test_suite_to_run_count() const {
+  return impl()->test_suite_to_run_count();
+}
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+int UnitTest::successful_test_case_count() const {
+  return impl()->successful_test_suite_count();
+}
+int UnitTest::failed_test_case_count() const {
+  return impl()->failed_test_suite_count();
+}
+int UnitTest::total_test_case_count() const {
+  return impl()->total_test_suite_count();
+}
+int UnitTest::test_case_to_run_count() const {
+  return impl()->test_suite_to_run_count();
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// Gets the number of successful tests.
+int UnitTest::successful_test_count() const {
+  return impl()->successful_test_count();
+}
+
+// Gets the number of skipped tests.
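+// A test counts as skipped when it bails out via GTEST_SKIP(); a minimal
+// sketch (HasNetwork() is a hypothetical helper, not part of this file):
+//
+//   TEST(Connectivity, ResolvesHost) {
+//     if (!HasNetwork()) GTEST_SKIP() << "no network on this machine";
+//     // ... assertions that need the network ...
+//   }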
+int UnitTest::skipped_test_count() const {
+  return impl()->skipped_test_count();
+}
+
+// Gets the number of failed tests.
+int UnitTest::failed_test_count() const { return impl()->failed_test_count(); }
+
+// Gets the number of disabled tests that will be reported in the XML report.
+int UnitTest::reportable_disabled_test_count() const {
+  return impl()->reportable_disabled_test_count();
+}
+
+// Gets the number of disabled tests.
+int UnitTest::disabled_test_count() const {
+  return impl()->disabled_test_count();
+}
+
+// Gets the number of tests to be printed in the XML report.
+int UnitTest::reportable_test_count() const {
+  return impl()->reportable_test_count();
+}
+
+// Gets the number of all tests.
+int UnitTest::total_test_count() const { return impl()->total_test_count(); }
+
+// Gets the number of tests that should run.
+int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
+
+// Gets the time of the test program start, in ms from the start of the
+// UNIX epoch.
+internal::TimeInMillis UnitTest::start_timestamp() const {
+  return impl()->start_timestamp();
+}
+
+// Gets the elapsed time, in milliseconds.
+internal::TimeInMillis UnitTest::elapsed_time() const {
+  return impl()->elapsed_time();
+}
+
+// Returns true if and only if the unit test passed (i.e. all test suites
+// passed).
+bool UnitTest::Passed() const { return impl()->Passed(); }
+
+// Returns true if and only if the unit test failed (i.e. some test suite
+// failed or something outside of all tests failed).
+bool UnitTest::Failed() const { return impl()->Failed(); }
+
+// Gets the i-th test suite among all the test suites. i can range from 0 to
+// total_test_suite_count() - 1. If i is not in that range, returns NULL.
+const TestSuite *UnitTest::GetTestSuite(int i) const {
+  return impl()->GetTestSuite(i);
+}
+
+// Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+const TestCase *UnitTest::GetTestCase(int i) const {
+  return impl()->GetTestCase(i);
+}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+// Returns the TestResult containing information on test failures and
+// properties logged outside of individual test suites.
+const TestResult &UnitTest::ad_hoc_test_result() const {
+  return *impl()->ad_hoc_test_result();
+}
+
+// Gets the i-th test suite among all the test suites. i can range from 0 to
+// total_test_suite_count() - 1. If i is not in that range, returns NULL.
+TestSuite *UnitTest::GetMutableTestSuite(int i) {
+  return impl()->GetMutableSuiteCase(i);
+}
+
+// Returns the list of event listeners that can be used to track events
+// inside Google Test.
+TestEventListeners &UnitTest::listeners() { return *impl()->listeners(); }
+
+// Registers and returns a global test environment.  When a test
+// program is run, all global test environments will be set-up in the
+// order they were registered.  After all tests in the program have
+// finished, all global test environments will be torn-down in the
+// *reverse* order they were registered.
+//
+// The UnitTest object takes ownership of the given environment.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment *UnitTest::AddEnvironment(Environment *env) {
+  if (env == nullptr) {
+    return nullptr;
+  }
+
+  impl_->environments().push_back(env);
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object.  All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results.  The user code should use the
+// assertion macros instead of calling this directly.
+void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
+                                 const char *file_name, int line_number,
+                                 const std::string &message,
+                                 const std::string &os_stack_trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  Message msg;
+  msg << message;
+
+  internal::MutexLock lock(&mutex_);
+  if (impl_->gtest_trace_stack().size() > 0) {
+    msg << "\n" << GTEST_NAME_ << " trace:";
+
+    for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) {
+      const internal::TraceInfo &trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n"
+          << internal::FormatFileLocation(trace.file, trace.line) << " "
+          << trace.message;
+    }
+  }
+
+  if (os_stack_trace.c_str() != nullptr && !os_stack_trace.empty()) {
+    msg << internal::kStackTraceMarker << os_stack_trace;
+  }
+
+  const TestPartResult result = TestPartResult(
+      result_type, file_name, line_number, msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+      result);
+
+  if (result_type != TestPartResult::kSuccess &&
+      result_type != TestPartResult::kSkip) {
+    // gtest_break_on_failure takes precedence over
+    // gtest_throw_on_failure.  This allows a user to set the latter
+    // in the code (perhaps in order to use Google Test assertions
+    // with another testing framework) and specify the former on the
+    // command line for debugging.
+    if (GTEST_FLAG(break_on_failure)) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+      // Using DebugBreak on Windows allows gtest to still break into a debugger
+      // when a failure happens and both the --gtest_break_on_failure and
+      // the --gtest_catch_exceptions flags are specified.
+      DebugBreak();
+#elif (!defined(__native_client__)) &&            \
+    ((defined(__clang__) || defined(__GNUC__)) && \
+     (defined(__x86_64__) || defined(__i386__)))
+      // with clang/gcc we can achieve the same effect on x86 by invoking int3
+      asm("int3");
+#else
+      // Dereference nullptr through a volatile pointer to prevent the compiler
+      // from removing.  We use this rather than abort() or __builtin_trap() for
+      // portability: some debuggers don't correctly trap abort().
+      *static_cast<volatile int *>(nullptr) = 1;
+#endif  // GTEST_OS_WINDOWS
+    } else if (GTEST_FLAG(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+      throw internal::GoogleTestFailureException(result);
+#else
+      // We cannot call abort() as it generates a pop-up in debug mode
+      // that cannot be suppressed in VC 7.1 or below.
+      exit(1);
+#endif
+    }
+  }
+}
+
+// Adds a TestProperty to the current TestResult object when invoked from
+// inside a test, to current TestSuite's ad_hoc_test_result_ when invoked
+// from SetUpTestSuite or TearDownTestSuite, or to the global property set
+// when invoked elsewhere.  If the result already contains a property with
+// the same key, the value will be updated.
+void UnitTest::RecordProperty(const std::string &key,
+                              const std::string &value) {
+  impl_->RecordProperty(TestProperty(key, value));
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
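+//
+// For orientation: RUN_ALL_TESTS() boils down to a call of this method, so
+// a typical embedding (mirroring gtest_main.cc further below) is simply:
+//
+//   int main(int argc, char **argv) {
+//     testing::InitGoogleTest(&argc, argv);
+//     return RUN_ALL_TESTS();  // i.e. UnitTest::GetInstance()->Run()
+//   }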
+int UnitTest::Run() {
+  const bool in_death_test_child_process =
+      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+
+  // Google Test implements this protocol for catching that a test
+  // program exits before returning control to Google Test:
+  //
+  //   1. Upon start, Google Test creates a file whose absolute path
+  //      is specified by the environment variable
+  //      TEST_PREMATURE_EXIT_FILE.
+  //   2. When Google Test has finished its work, it deletes the file.
+  //
+  // This allows a test runner to set TEST_PREMATURE_EXIT_FILE before
+  // running a Google-Test-based test program and check the existence
+  // of the file at the end of the test execution to see if it has
+  // exited prematurely.
+
+  // If we are in the child process of a death test, don't
+  // create/delete the premature exit file, as doing so is unnecessary
+  // and will confuse the parent process.  Otherwise, create/delete
+  // the file upon entering/leaving this function.  If the program
+  // somehow exits before this function has a chance to return, the
+  // premature-exit file will be left undeleted, causing a test runner
+  // that understands the premature-exit-file protocol to report the
+  // test as having failed.
+  const internal::ScopedPrematureExitFile premature_exit_file(
+      in_death_test_child_process
+          ? nullptr
+          : internal::posix::GetEnv("TEST_PREMATURE_EXIT_FILE"));
+
+  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
+  // used for the duration of the program.
+  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+
+#if GTEST_OS_WINDOWS
+  // Either the user wants Google Test to catch exceptions thrown by the
+  // tests or this is executing in the context of death test child
+  // process. In either case the user does not want to see pop-up dialogs
+  // about crashes - they are expected.
+  if (impl()->catch_exceptions() || in_death_test_child_process) {
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+    // SetErrorMode doesn't exist on CE.
+    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
+                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
+#endif  // !GTEST_OS_WINDOWS_MOBILE
+
+#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+    // Death test children can be terminated with _abort().  On Windows,
+    // _abort() can show a dialog with a warning message.  This forces the
+    // abort message to go to stderr instead.
+    _set_error_mode(_OUT_TO_STDERR);
+#endif
+
+#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
+    // In the debug version, Visual Studio pops up a separate dialog
+    // offering a choice to debug the aborted program. We need to suppress
+    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
+    // executed. Google Test will notify the user of any unexpected
+    // failure via stderr.
+    if (!GTEST_FLAG(break_on_failure))
+      _set_abort_behavior(
+          0x0,                                    // Clear the following flags:
+          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
+
+    // In debug mode, the Windows CRT can crash with an assertion over invalid
+    // input (e.g. passing an invalid file descriptor).  The default handling
+    // for these assertions is to pop up a dialog and wait for user input.
+    // Instead ask the CRT to dump such assertions to stderr non-interactively.
+    if (!IsDebuggerPresent()) {
+      (void)_CrtSetReportMode(_CRT_ASSERT,
+                              _CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);
+      (void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
+    }
+#endif
+  }
+#endif  // GTEST_OS_WINDOWS
+
+  return internal::HandleExceptionsInMethodIfSupported(
+             impl(), &internal::UnitTestImpl::RunAllTests,
+             "auxiliary test code (environments or event listeners)")
+             ? 0
+             : 1;
+}
+
+// Returns the working directory when the first TEST() or TEST_F() was
+// executed.
+const char *UnitTest::original_working_dir() const {
+  return impl_->original_working_dir_.c_str();
+}
+
+// Returns the TestSuite object for the test that's currently running,
+// or NULL if no test is running.
+const TestSuite *UnitTest::current_test_suite() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_suite();
+}
+
+// Legacy API is still available but deprecated
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+const TestCase *UnitTest::current_test_case() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_suite();
+}
+#endif
+
+// Returns the TestInfo object for the test that's currently running,
+// or NULL if no test is running.
+const TestInfo *UnitTest::current_test_info() const
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_info();
+}
+
+// Returns the random seed used at the start of the current test run.
+int UnitTest::random_seed() const { return impl_->random_seed(); }
+
+// Returns ParameterizedTestSuiteRegistry object used to keep track of
+// value-parameterized tests and instantiate and register them.
+internal::ParameterizedTestSuiteRegistry &
+UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) {
+  return impl_->parameterized_test_registry();
+}
+
+// Creates an empty UnitTest.
+UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); }
+
+// Destructor of UnitTest.
+UnitTest::~UnitTest() { delete impl_; }
+
+// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+// Google Test trace stack.
+void UnitTest::PushGTestTrace(const internal::TraceInfo &trace)
+    GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().push_back(trace);
+}
+
+// Pops a trace from the per-thread Google Test trace stack.
+void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().pop_back();
+}
+
+namespace internal {
+
+UnitTestImpl::UnitTestImpl(UnitTest *parent)
+    : parent_(parent),
+      GTEST_DISABLE_MSC_WARNINGS_PUSH_(4355 /* using this in initializer */)
+          default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+      GTEST_DISABLE_MSC_WARNINGS_POP_() global_test_part_result_repoter_(
+          &default_global_test_part_result_reporter_),
+      per_thread_test_part_result_reporter_(
+          &default_per_thread_test_part_result_reporter_),
+      parameterized_test_registry_(), parameterized_tests_registered_(false),
+      last_death_test_suite_(-1), current_test_suite_(nullptr),
+      current_test_info_(nullptr), ad_hoc_test_result_(),
+      os_stack_trace_getter_(nullptr), post_flag_parse_init_performed_(false),
+      random_seed_(0),  // Will be overridden by the flag before first use.
+      random_(0),       // Will be reseeded before first use.
+      start_timestamp_(0), elapsed_time_(0),
+#if GTEST_HAS_DEATH_TEST
+      death_test_factory_(new DefaultDeathTestFactory),
+#endif
+      // Will be overridden by the flag before first use.
+      catch_exceptions_(false) {
+  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
+}
+
+UnitTestImpl::~UnitTestImpl() {
+  // Deletes every TestSuite.
+  ForEach(test_suites_, internal::Delete<TestSuite>);
+
+  // Deletes every Environment.
+  ForEach(environments_, internal::Delete<Environment>);
+
+  delete os_stack_trace_getter_;
+}
+
+// Adds a TestProperty to the current TestResult object when invoked in a
+// context of a test, to current test suite's ad_hoc_test_result when invoked
+// from SetUpTestSuite/TearDownTestSuite, or to the global property set
+// otherwise.  If the result already contains a property with the same key,
+// the value will be updated.
+void UnitTestImpl::RecordProperty(const TestProperty &test_property) {
+  std::string xml_element;
+  TestResult *test_result;  // TestResult appropriate for property recording.
+
+  if (current_test_info_ != nullptr) {
+    xml_element = "testcase";
+    test_result = &(current_test_info_->result_);
+  } else if (current_test_suite_ != nullptr) {
+    xml_element = "testsuite";
+    test_result = &(current_test_suite_->ad_hoc_test_result_);
+  } else {
+    xml_element = "testsuites";
+    test_result = &ad_hoc_test_result_;
+  }
+  test_result->RecordProperty(xml_element, test_property);
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Disables event forwarding if the control is currently in a death test
+// subprocess. Must not be called before InitGoogleTest.
+void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
+  if (internal_run_death_test_flag_.get() != nullptr)
+    listeners()->SuppressEventForwarding();
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Initializes event listeners performing XML output as specified by
+// UnitTestOptions. Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureXmlOutput() {
+  const std::string &output_format = UnitTestOptions::GetOutputFormat();
+  if (output_format == "xml") {
+    listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format == "json") {
+    listeners()->SetDefaultXmlGenerator(new JsonUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format != "") {
+    GTEST_LOG_(WARNING) << "WARNING: unrecognized output format \""
+                        << output_format << "\" ignored.";
+  }
+}
+
+#if GTEST_CAN_STREAM_RESULTS_
+// Initializes event listeners for streaming test results in string form.
+// Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureStreamingOutput() {
+  const std::string &target = GTEST_FLAG(stream_result_to);
+  if (!target.empty()) {
+    const size_t pos = target.find(':');
+    if (pos != std::string::npos) {
+      listeners()->Append(
+          new StreamingListener(target.substr(0, pos), target.substr(pos + 1)));
+    } else {
+      GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
+                          << "\" ignored.";
+    }
+  }
+}
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+// Performs initialization dependent upon flag values obtained in
+// ParseGoogleTestFlagsOnly.  Is called from InitGoogleTest after the call to
+// ParseGoogleTestFlagsOnly.  In case a user neglects to call InitGoogleTest
+// this function is also called from RunAllTests.  Since this function can be
+// called more than once, it has to be idempotent.
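+// Both public entry points funnel into it, e.g.:
+//
+//   testing::InitGoogleTest(&argc, argv);  // parses flags, then calls this
+//   RUN_ALL_TESTS();                       // RunAllTests() calls it again,
+//                                          // which is a no-op the second time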
+void UnitTestImpl::PostFlagParsingInit() {
+  // Ensures that this function does not execute more than once.
+  if (!post_flag_parse_init_performed_) {
+    post_flag_parse_init_performed_ = true;
+
+#if defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
+    // Register to send notifications about key process state changes.
+    listeners()->Append(new GTEST_CUSTOM_TEST_EVENT_LISTENER_());
+#endif  // defined(GTEST_CUSTOM_TEST_EVENT_LISTENER_)
+
+#if GTEST_HAS_DEATH_TEST
+    InitDeathTestSubprocessControlInfo();
+    SuppressTestEventsIfInSubprocess();
+#endif  // GTEST_HAS_DEATH_TEST
+
+    // Registers parameterized tests.  This makes parameterized tests
+    // available to the UnitTest reflection API without running
+    // RUN_ALL_TESTS.
+    RegisterParameterizedTests();
+
+    // Configures listeners for XML output.  This makes it possible for users
+    // to shut down the default XML output before invoking RUN_ALL_TESTS.
+    ConfigureXmlOutput();
+
+#if GTEST_CAN_STREAM_RESULTS_
+    // Configures listeners for streaming test results to the specified server.
+    ConfigureStreamingOutput();
+#endif  // GTEST_CAN_STREAM_RESULTS_
+
+#if GTEST_HAS_ABSL
+    if (GTEST_FLAG(install_failure_signal_handler)) {
+      absl::FailureSignalHandlerOptions options;
+      absl::InstallFailureSignalHandler(options);
+    }
+#endif  // GTEST_HAS_ABSL
+  }
+}
+
+// A predicate that checks the name of a TestSuite against a known
+// value.
+//
+// This is used for implementation of the UnitTest class only.  We put
+// it in the anonymous namespace to prevent polluting the outer
+// namespace.
+//
+// TestSuiteNameIs is copyable.
+class TestSuiteNameIs {
+ public:
+  // Constructor.
+  explicit TestSuiteNameIs(const std::string &name) : name_(name) {}
+
+  // Returns true if and only if the name of test_suite matches name_.
+  bool operator()(const TestSuite *test_suite) const {
+    return test_suite != nullptr &&
+           strcmp(test_suite->name(), name_.c_str()) == 0;
+  }
+
+ private:
+  std::string name_;
+};
+
+// Finds and returns a TestSuite with the given name.  If one doesn't
+// exist, creates one and returns it.  It's the CALLER'S
+// RESPONSIBILITY to ensure that this function is only called WHEN THE
+// TESTS ARE NOT SHUFFLED.
+//
+// Arguments:
+//
+//   test_suite_name: name of the test suite
+//   type_param:      the name of the test suite's type parameter, or NULL if
+//                    this is not a typed or a type-parameterized test suite.
+//   set_up_tc:       pointer to the function that sets up the test suite
+//   tear_down_tc:    pointer to the function that tears down the test suite
+TestSuite *UnitTestImpl::GetTestSuite(
+    const char *test_suite_name, const char *type_param,
+    internal::SetUpTestSuiteFunc set_up_tc,
+    internal::TearDownTestSuiteFunc tear_down_tc) {
+  // Can we find a TestSuite with the given name?
+  const auto test_suite =
+      std::find_if(test_suites_.rbegin(), test_suites_.rend(),
+                   TestSuiteNameIs(test_suite_name));
+
+  if (test_suite != test_suites_.rend()) return *test_suite;
+
+  // No.  Let's create one.
+  auto *const new_test_suite =
+      new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc);
+
+  // Is this a death test suite?
+  if (internal::UnitTestOptions::MatchesFilter(test_suite_name,
+                                               kDeathTestSuiteFilter)) {
+    // Yes.  Inserts the test suite after the last death test suite
+    // defined so far.  This only works when the test suites haven't
+    // been shuffled.  Otherwise we may end up running a death test
+    // after a non-death test.
+    ++last_death_test_suite_;
+    test_suites_.insert(test_suites_.begin() + last_death_test_suite_,
+                        new_test_suite);
+  } else {
+    // No.  Appends to the end of the list.
+    test_suites_.push_back(new_test_suite);
+  }
+
+  test_suite_indices_.push_back(static_cast<int>(test_suite_indices_.size()));
+  return new_test_suite;
+}
+
+// Helpers for setting up / tearing down the given environment.  They
+// are for use in the ForEach() function.
+static void SetUpEnvironment(Environment *env) { env->SetUp(); }
+static void TearDownEnvironment(Environment *env) { env->TearDown(); }
+
+// Runs all tests in this UnitTest object, prints the result, and
+// returns true if all tests are successful.  If any exception is
+// thrown during a test, the test is considered to be failed, but the
+// rest of the tests will still be run.
+//
+// When parameterized tests are enabled, it expands and registers
+// parameterized tests first in RegisterParameterizedTests().
+// All other functions called from RunAllTests() may safely assume that
+// parameterized tests are ready to be counted and run.
+bool UnitTestImpl::RunAllTests() {
+  // True if and only if Google Test is initialized before RUN_ALL_TESTS() is
+  // called.
+  const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
+
+  // Do not run any test if the --help flag was specified.
+  if (g_help_flag) return true;
+
+  // Repeats the call to the post-flag parsing initialization in case the
+  // user didn't call InitGoogleTest.
+  PostFlagParsingInit();
+
+  // Even if sharding is not on, test runners may want to use the
+  // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding
+  // protocol.
+  internal::WriteToShardStatusFileIfNeeded();
+
+  // True if and only if we are in a subprocess for running a thread-safe-style
+  // death test.
+  bool in_subprocess_for_death_test = false;
+
+#if GTEST_HAS_DEATH_TEST
+  in_subprocess_for_death_test =
+      (internal_run_death_test_flag_.get() != nullptr);
+#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+  if (in_subprocess_for_death_test) {
+    GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
+  }
+#endif  // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#endif  // GTEST_HAS_DEATH_TEST
+
+  const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
+                                        in_subprocess_for_death_test);
+
+  // Compares the full test names with the filter to decide which
+  // tests to run.
+  const bool has_tests_to_run =
+      FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL
+                               : IGNORE_SHARDING_PROTOCOL) > 0;
+
+  // Lists the tests and exits if the --gtest_list_tests flag was specified.
+  if (GTEST_FLAG(list_tests)) {
+    // This must be called *after* FilterTests() has been called.
+    ListTestsMatchingFilter();
+    return true;
+  }
+
+  random_seed_ =
+      GTEST_FLAG(shuffle) ? GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+
+  // True if and only if at least one test has failed.
+  bool failed = false;
+
+  TestEventListener *repeater = listeners()->repeater();
+
+  start_timestamp_ = GetTimeInMillis();
+  repeater->OnTestProgramStart(*parent_);
+
+  // How many times to repeat the tests?  We don't want to repeat them
+  // when we are inside the subprocess of a death test.
+  const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+  // Repeats forever if the repeat count is negative.
+  const bool gtest_repeat_forever = repeat < 0;
+  for (int i = 0; gtest_repeat_forever || i != repeat; i++) {
+    // We want to preserve failures generated by ad-hoc test
+    // assertions executed before RUN_ALL_TESTS().
+    ClearNonAdHocTestResult();
+
+    const TimeInMillis start = GetTimeInMillis();
+
+    // Shuffles test suites and tests if requested.
+    if (has_tests_to_run && GTEST_FLAG(shuffle)) {
+      random()->Reseed(static_cast<uint32_t>(random_seed_));
+      // This should be done before calling OnTestIterationStart(),
+      // such that a test event listener can see the actual test order
+      // in the event.
+      ShuffleTests();
+    }
+
+    // Tells the unit test event listeners that the tests are about to start.
+    repeater->OnTestIterationStart(*parent_, i);
+
+    // Runs each test suite if there is at least one test to run.
+    if (has_tests_to_run) {
+      // Sets up all environments beforehand.
+      repeater->OnEnvironmentsSetUpStart(*parent_);
+      ForEach(environments_, SetUpEnvironment);
+      repeater->OnEnvironmentsSetUpEnd(*parent_);
+
+      // Runs the tests only if there was no fatal failure or skip triggered
+      // during global set-up.
+      if (Test::IsSkipped()) {
+        // Emit diagnostics when global set-up calls skip, as it will not be
+        // emitted by default.
+        TestResult &test_result =
+            *internal::GetUnitTestImpl()->current_test_result();
+        for (int j = 0; j < test_result.total_part_count(); ++j) {
+          const TestPartResult &test_part_result =
+              test_result.GetTestPartResult(j);
+          if (test_part_result.type() == TestPartResult::kSkip) {
+            const std::string &result = test_part_result.message();
+            printf("%s\n", result.c_str());
+          }
+        }
+        fflush(stdout);
+      } else if (!Test::HasFatalFailure()) {
+        for (int test_index = 0; test_index < total_test_suite_count();
+             test_index++) {
+          GetMutableSuiteCase(test_index)->Run();
+        }
+      }
+
+      // Tears down all environments in reverse order afterwards.
+      repeater->OnEnvironmentsTearDownStart(*parent_);
+      std::for_each(environments_.rbegin(), environments_.rend(),
+                    TearDownEnvironment);
+      repeater->OnEnvironmentsTearDownEnd(*parent_);
+    }
+
+    elapsed_time_ = GetTimeInMillis() - start;
+
+    // Tells the unit test event listener that the tests have just finished.
+    repeater->OnTestIterationEnd(*parent_, i);
+
+    // Gets the result and clears it.
+    if (!Passed()) {
+      failed = true;
+    }
+
+    // Restores the original test order after the iteration.  This
+    // allows the user to quickly repro a failure that happens in the
+    // N-th iteration without repeating the first (N - 1) iterations.
+    // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in
+    // case the user somehow changes the value of the flag somewhere
+    // (it's always safe to unshuffle the tests).
+    UnshuffleTests();
+
+    if (GTEST_FLAG(shuffle)) {
+      // Picks a new random seed for each iteration.
+      random_seed_ = GetNextRandomSeed(random_seed_);
+    }
+  }
+
+  repeater->OnTestProgramEnd(*parent_);
+
+  if (!gtest_is_initialized_before_run_all_tests) {
+    ColoredPrintf(
+        COLOR_RED,
+        "\nIMPORTANT NOTICE - DO NOT IGNORE:\n"
+        "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_
+        "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_
+        " will start to enforce the valid usage. "
+        "Please fix it ASAP, or IT WILL START TO FAIL.\n");  // NOLINT
+#if GTEST_FOR_GOOGLE_
+    ColoredPrintf(COLOR_RED,
+                  "For more details, see http://wiki/Main/ValidGUnitMain.\n");
+#endif  // GTEST_FOR_GOOGLE_
+  }
+
+  return !failed;
+}
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present.  If a file already exists at this location, this
+// function will write over it.  If the variable is present, but the file cannot
+// be created, prints an error and exits.
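+// A runner-side sketch of that handshake (the file path is illustrative,
+// and the POSIX calls are just for brevity):
+//
+//   setenv("GTEST_SHARD_STATUS_FILE", "/tmp/gtest_shard_status", 1);
+//   /* ...spawn the test binary once and wait for it... */
+//   const bool supports_sharding =
+//       access("/tmp/gtest_shard_status", F_OK) == 0;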
+void WriteToShardStatusFileIfNeeded() {
+  const char *const test_shard_file = posix::GetEnv(kTestShardStatusFile);
+  if (test_shard_file != nullptr) {
+    FILE *const file = posix::FOpen(test_shard_file, "w");
+    if (file == nullptr) {
+      ColoredPrintf(COLOR_RED,
+                    "Could not write to the test shard status file \"%s\" "
+                    "specified by the %s environment variable.\n",
+                    test_shard_file, kTestShardStatusFile);
+      fflush(stdout);
+      exit(EXIT_FAILURE);
+    }
+    fclose(file);
+  }
+}
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values.  If the variables are present,
+// but inconsistent (i.e., shard_index >= total_shards), prints
+// an error and exits.  If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process.  Otherwise, we could filter out death tests we intended to execute.
+bool ShouldShard(const char *total_shards_env, const char *shard_index_env,
+                 bool in_subprocess_for_death_test) {
+  if (in_subprocess_for_death_test) {
+    return false;
+  }
+
+  const int32_t total_shards = Int32FromEnvOrDie(total_shards_env, -1);
+  const int32_t shard_index = Int32FromEnvOrDie(shard_index_env, -1);
+
+  if (total_shards == -1 && shard_index == -1) {
+    return false;
+  } else if (total_shards == -1 && shard_index != -1) {
+    const Message msg = Message() << "Invalid environment variables: you have "
+                                  << kTestShardIndex << " = " << shard_index
+                                  << ", but have left " << kTestTotalShards
+                                  << " unset.\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (total_shards != -1 && shard_index == -1) {
+    const Message msg = Message()
+                        << "Invalid environment variables: you have "
+                        << kTestTotalShards << " = " << total_shards
+                        << ", but have left " << kTestShardIndex << " unset.\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  } else if (shard_index < 0 || shard_index >= total_shards) {
+    const Message msg =
+        Message() << "Invalid environment variables: we require 0 <= "
+                  << kTestShardIndex << " < " << kTestTotalShards
+                  << ", but you have " << kTestShardIndex << "=" << shard_index
+                  << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    fflush(stdout);
+    exit(EXIT_FAILURE);
+  }
+
+  return total_shards > 1;
+}
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error
+// and aborts.
+int32_t Int32FromEnvOrDie(const char *var, int32_t default_val) {
+  const char *str_val = posix::GetEnv(var);
+  if (str_val == nullptr) {
+    return default_val;
+  }
+
+  int32_t result;
+  if (!ParseInt32(Message() << "The value of environment variable " << var,
+                  str_val, &result)) {
+    exit(EXIT_FAILURE);
+  }
+  return result;
+}
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true if and only if the test should be run on this shard.  The test
+// id is some arbitrary but unique non-negative integer assigned to each test
+// method.  Assumes that 0 <= shard_index < total_shards.
+bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
+  return (test_id % total_shards) == shard_index;
+}
+
+// Compares the name of each test with the user-specified filter to
+// decide whether the test should be run, then records the result in
+// each TestSuite and TestInfo object.
+// If shard_tests == true, further filters tests based on sharding
+// variables in the environment - see
+// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md
+// . Returns the number of tests that should run.
+int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
+  const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL
+                                   ? Int32FromEnvOrDie(kTestTotalShards, -1)
+                                   : -1;
+  const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL
+                                  ? Int32FromEnvOrDie(kTestShardIndex, -1)
+                                  : -1;
+
+  // num_runnable_tests are the number of tests that will
+  // run across all shards (i.e., match filter and are not disabled).
+  // num_selected_tests are the number of tests to be run on
+  // this shard.
+  int num_runnable_tests = 0;
+  int num_selected_tests = 0;
+  for (auto *test_suite : test_suites_) {
+    const std::string &test_suite_name = test_suite->name();
+    test_suite->set_should_run(false);
+
+    for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
+      TestInfo *const test_info = test_suite->test_info_list()[j];
+      const std::string test_name(test_info->name());
+      // A test is disabled if test suite name or test name matches
+      // kDisableTestFilter.
+      const bool is_disabled = internal::UnitTestOptions::MatchesFilter(
+                                   test_suite_name, kDisableTestFilter) ||
+                               internal::UnitTestOptions::MatchesFilter(
+                                   test_name, kDisableTestFilter);
+      test_info->is_disabled_ = is_disabled;
+
+      const bool matches_filter = internal::UnitTestOptions::FilterMatchesTest(
+          test_suite_name, test_name);
+      test_info->matches_filter_ = matches_filter;
+
+      const bool is_runnable =
+          (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+          matches_filter;
+
+      const bool is_in_another_shard =
+          shard_tests != IGNORE_SHARDING_PROTOCOL &&
+          !ShouldRunTestOnShard(total_shards, shard_index, num_runnable_tests);
+      test_info->is_in_another_shard_ = is_in_another_shard;
+      const bool is_selected = is_runnable && !is_in_another_shard;
+
+      num_runnable_tests += is_runnable;
+      num_selected_tests += is_selected;
+
+      test_info->should_run_ = is_selected;
+      test_suite->set_should_run(test_suite->should_run() || is_selected);
+    }
+  }
+  return num_selected_tests;
+}
+
+// Prints the given C-string on a single line by replacing all '\n'
+// characters with string "\\n".  If the output takes more than
+// max_length characters, only prints the first max_length characters
+// and "...".
+static void PrintOnOneLine(const char *str, int max_length) {
+  if (str != nullptr) {
+    for (int i = 0; *str != '\0'; ++str) {
+      if (i >= max_length) {
+        printf("...");
+        break;
+      }
+      if (*str == '\n') {
+        printf("\\n");
+        i += 2;
+      } else {
+        printf("%c", *str);
+        ++i;
+      }
+    }
+  }
+}
+
+// Prints the names of the tests matching the user-specified filter flag.
+void UnitTestImpl::ListTestsMatchingFilter() {
+  // Print at most this many characters for each type/value parameter.
+  const int kMaxParamLength = 250;
+
+  for (auto *test_suite : test_suites_) {
+    bool printed_test_suite_name = false;
+
+    for (size_t j = 0; j < test_suite->test_info_list().size(); j++) {
+      const TestInfo *const test_info = test_suite->test_info_list()[j];
+      if (test_info->matches_filter_) {
+        if (!printed_test_suite_name) {
+          printed_test_suite_name = true;
+          printf("%s.", test_suite->name());
+          if (test_suite->type_param() != nullptr) {
+            printf("  # %s = ", kTypeParamLabel);
+            // We print the type parameter on a single line to make
+            // the output easy to parse by a program.
+            PrintOnOneLine(test_suite->type_param(), kMaxParamLength);
+          }
+          printf("\n");
+        }
+        printf("  %s", test_info->name());
+        if (test_info->value_param() != nullptr) {
+          printf("  # %s = ", kValueParamLabel);
+          // We print the value parameter on a single line to make the
+          // output easy to parse by a program.
+          PrintOnOneLine(test_info->value_param(), kMaxParamLength);
+        }
+        printf("\n");
+      }
+    }
+  }
+  fflush(stdout);
+  const std::string &output_format = UnitTestOptions::GetOutputFormat();
+  if (output_format == "xml" || output_format == "json") {
+    FILE *fileout = OpenFileForWriting(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str());
+    std::stringstream stream;
+    if (output_format == "xml") {
+      XmlUnitTestResultPrinter(
+          UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
+          .PrintXmlTestsList(&stream, test_suites_);
+    } else if (output_format == "json") {
+      JsonUnitTestResultPrinter(
+          UnitTestOptions::GetAbsolutePathToOutputFile().c_str())
+          .PrintJsonTestList(&stream, test_suites_);
+    }
+    fprintf(fileout, "%s", StringStreamToString(&stream).c_str());
+    fclose(fileout);
+  }
+}
+
+// Sets the OS stack trace getter.
+//
+// Does nothing if the input and the current OS stack trace getter are
+// the same; otherwise, deletes the old getter and makes the input the
+// current getter.
+void UnitTestImpl::set_os_stack_trace_getter(
+    OsStackTraceGetterInterface *getter) {
+  if (os_stack_trace_getter_ != getter) {
+    delete os_stack_trace_getter_;
+    os_stack_trace_getter_ = getter;
+  }
+}
+
+// Returns the current OS stack trace getter if it is not NULL;
+// otherwise, creates an OsStackTraceGetter, makes it the current
+// getter, and returns it.
+OsStackTraceGetterInterface *UnitTestImpl::os_stack_trace_getter() {
+  if (os_stack_trace_getter_ == nullptr) {
+#ifdef GTEST_OS_STACK_TRACE_GETTER_
+    os_stack_trace_getter_ = new GTEST_OS_STACK_TRACE_GETTER_;
+#else
+    os_stack_trace_getter_ = new OsStackTraceGetter;
+#endif  // GTEST_OS_STACK_TRACE_GETTER_
+  }
+
+  return os_stack_trace_getter_;
+}
+
+// Returns the most specific TestResult currently running.
+TestResult *UnitTestImpl::current_test_result() {
+  if (current_test_info_ != nullptr) {
+    return &current_test_info_->result_;
+  }
+  if (current_test_suite_ != nullptr) {
+    return &current_test_suite_->ad_hoc_test_result_;
+  }
+  return &ad_hoc_test_result_;
+}
+
+// Shuffles all test suites, and the tests within each test suite,
+// making sure that death tests are still run first.
+void UnitTestImpl::ShuffleTests() {
+  // Shuffles the death test suites.
+  ShuffleRange(random(), 0, last_death_test_suite_ + 1, &test_suite_indices_);
+
+  // Shuffles the non-death test suites.
+  ShuffleRange(random(), last_death_test_suite_ + 1,
+               static_cast<int>(test_suites_.size()), &test_suite_indices_);
+
+  // Shuffles the tests inside each test suite.
+  for (auto &test_suite : test_suites_) {
+    test_suite->ShuffleTests(random());
+  }
+}
+
+// Restores the test suites and tests to their order before the first shuffle.
+void UnitTestImpl::UnshuffleTests() {
+  for (size_t i = 0; i < test_suites_.size(); i++) {
+    // Unshuffles the tests in each test suite.
+    test_suites_[i]->UnshuffleTests();
+    // Resets the index of each test suite.
+    test_suite_indices_[i] = static_cast<int>(i);
+  }
+}
+
+// Returns the current OS stack trace as an std::string.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
+// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
+std::string GetCurrentOsStackTraceExceptTop(UnitTest * /*unit_test*/,
+                                            int skip_count) {
+  // We pass skip_count + 1 to skip this wrapper function in addition
+  // to what the user really wants to skip.
+  return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
+}
+
+// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to
+// suppress unreachable code warnings.
+namespace {
+class ClassUniqueToAlwaysTrue {};
+}  // namespace
+
+bool IsTrue(bool condition) { return condition; }
+
+bool AlwaysTrue() {
+#if GTEST_HAS_EXCEPTIONS
+  // This condition is always false so AlwaysTrue() never actually throws,
+  // but it makes the compiler think that it may throw.
+  if (IsTrue(false)) throw ClassUniqueToAlwaysTrue();
+#endif  // GTEST_HAS_EXCEPTIONS
+  return true;
+}
+
+// If *pstr starts with the given prefix, modifies *pstr to be right
+// past the prefix and returns true; otherwise leaves *pstr unchanged
+// and returns false.  None of pstr, *pstr, and prefix can be NULL.
+bool SkipPrefix(const char *prefix, const char **pstr) {
+  const size_t prefix_len = strlen(prefix);
+  if (strncmp(*pstr, prefix, prefix_len) == 0) {
+    *pstr += prefix_len;
+    return true;
+  }
+  return false;
+}
+
+// Parses a string as a command line flag.  The string should have
+// the format "--flag=value".  When def_optional is true, the "=value"
+// part can be omitted.
+//
+// Returns the value of the flag, or NULL if the parsing failed.
+static const char *ParseFlagValue(const char *str, const char *flag,
+                                  bool def_optional) {
+  // str and flag must not be NULL.
+  if (str == nullptr || flag == nullptr) return nullptr;
+
+  // The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
+  const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
+  const size_t flag_len = flag_str.length();
+  if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
+
+  // Skips the flag name.
+  const char *flag_end = str + flag_len;
+
+  // When def_optional is true, it's OK to not have a "=value" part.
+  if (def_optional && (flag_end[0] == '\0')) {
+    return flag_end;
+  }
+
+  // If def_optional is true and there are more characters after the
+  // flag name, or if def_optional is false, there must be a '=' after
+  // the flag name.
+  if (flag_end[0] != '=') return nullptr;
+
+  // Returns the string after "=".
+  return flag_end + 1;
+}
+
+// Parses a string for a bool flag, in the form of either
+// "--flag=value" or "--flag".
+//
+// In the former case, the value is taken as true as long as it does
+// not start with '0', 'f', or 'F'.
+//
+// In the latter case, the value is taken as true.
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+static bool ParseBoolFlag(const char *str, const char *flag, bool *value) {
+  // Gets the value of the flag as a string.
+  const char *const value_str = ParseFlagValue(str, flag, true);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Converts the string value to a bool.
+  *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F');
+  return true;
+}
+
+// Parses a string for an int32_t flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+bool ParseInt32Flag(const char *str, const char *flag, int32_t *value) {
+  // Gets the value of the flag as a string.
+  const char *const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Sets *value to the value of the flag.
+  return ParseInt32(Message() << "The value of flag --" << flag, value_str,
+                    value);
+}
+
+// Parses a string for a string flag, in the form of "--flag=value".
+//
+// On success, stores the value of the flag in *value, and returns
+// true.  On failure, returns false without changing *value.
+template <typename String>
+static bool ParseStringFlag(const char *str, const char *flag, String *value) {
+  // Gets the value of the flag as a string.
+  const char *const value_str = ParseFlagValue(str, flag, false);
+
+  // Aborts if the parsing failed.
+  if (value_str == nullptr) return false;
+
+  // Sets *value to the value of the flag.
+  *value = value_str;
+  return true;
+}
+
+// Determines whether a string has a prefix that Google Test uses for its
+// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_.
+// If Google Test detects that a command line flag has its prefix but is not
+// recognized, it will print its help message. Flags starting with
+// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
+// internal flags and do not trigger the help message.
+static bool HasGoogleTestFlagPrefix(const char *str) {
+  return (SkipPrefix("--", &str) || SkipPrefix("-", &str) ||
+          SkipPrefix("/", &str)) &&
+         !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
+         (SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
+          SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str));
+}
+
+// Prints a string containing code-encoded text.  The following escape
+// sequences can be used in the string to control the text color:
+//
+//   @@    prints a single '@' character.
+//   @R    changes the color to red.
+//   @G    changes the color to green.
+//   @Y    changes the color to yellow.
+//   @D    changes to the default terminal text color.
+//
+static void PrintColorEncoded(const char *str) {
+  GTestColor color = COLOR_DEFAULT;  // The current color.
+
+  // Conceptually, we split the string into segments divided by escape
+  // sequences.  Then we print one segment at a time.  At the end of
+  // each iteration, the str pointer advances to the beginning of the
+  // next segment.
+  for (;;) {
+    const char *p = strchr(str, '@');
+    if (p == nullptr) {
+      ColoredPrintf(color, "%s", str);
+      return;
+    }
+
+    ColoredPrintf(color, "%s", std::string(str, p).c_str());
+
+    const char ch = p[1];
+    str = p + 2;
+    if (ch == '@') {
+      ColoredPrintf(color, "@");
+    } else if (ch == 'D') {
+      color = COLOR_DEFAULT;
+    } else if (ch == 'R') {
+      color = COLOR_RED;
+    } else if (ch == 'G') {
+      color = COLOR_GREEN;
+    } else if (ch == 'Y') {
+      color = COLOR_YELLOW;
+    } else {
+      --str;
+    }
+  }
+}
+
+static const char kColorEncodedHelpMessage[] =
+    "This program contains tests written using " GTEST_NAME_
+    ". You can use the\n"
+    "following command line flags to control its behavior:\n"
+    "\n"
+    "Test Selection:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "list_tests@D\n"
+    "      List the names of all tests instead of running them. The name of\n"
+    "      TEST(Foo, Bar) is \"Foo.Bar\".\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "filter=@YPOSITIVE_PATTERNS"
+    "[@G-@YNEGATIVE_PATTERNS]@D\n"
+    "      Run only the tests whose name matches one of the positive patterns "
+    "but\n"
+    "      none of the negative patterns. '?' matches any single character; "
+    "'*'\n"
+    "      matches any substring; ':' separates two patterns.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "also_run_disabled_tests@D\n"
+    "      Run all disabled tests too.\n"
+    "\n"
+    "Test Execution:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "repeat=@Y[COUNT]@D\n"
+    "      Run the tests repeatedly; use a negative count to repeat forever.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "shuffle@D\n"
+    "      Randomize tests' orders on every iteration.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "random_seed=@Y[NUMBER]@D\n"
+    "      Random number seed to use for shuffling test orders (between 1 and\n"
+    "      99999, or 0 to use a seed based on the current time).\n"
+    "\n"
+    "Test Output:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+    "      Enable/disable colored output. The default is @Gauto@D.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "print_time=0@D\n"
+    "      Don't print the elapsed time of each test.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_
+    "@Y|@G:@YFILE_PATH]@D\n"
+    "      Generate a JSON or XML report in the given directory or with the "
+    "given\n"
+    "      file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
+#if GTEST_CAN_STREAM_RESULTS_
+    "  @G--" GTEST_FLAG_PREFIX_
+    "stream_result_to=@YHOST@G:@YPORT@D\n"
+    "      Stream test results to the given server.\n"
+#endif  // GTEST_CAN_STREAM_RESULTS_
+    "\n"
+    "Assertion Behavior:\n"
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+    "  @G--" GTEST_FLAG_PREFIX_
+    "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+    "      Set the default death test style.\n"
+#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+    "  @G--" GTEST_FLAG_PREFIX_
+    "break_on_failure@D\n"
+    "      Turn assertion failures into debugger break-points.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "throw_on_failure@D\n"
+    "      Turn assertion failures into C++ exceptions for use by an external\n"
+    "      test framework.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "catch_exceptions=0@D\n"
+    "      Do not report exceptions as test failures. Instead, allow them\n"
+    "      to crash the program or throw a pop-up (on Windows).\n"
+    "\n"
+    "Except for @G--" GTEST_FLAG_PREFIX_
+    "list_tests@D, you can alternatively set "
+    "the corresponding\n"
+    "environment variable of a flag (all letters in upper-case). For example, "
+    "to\n"
+    "disable colored text output, you can either specify "
+    "@G--" GTEST_FLAG_PREFIX_
+    "color=no@D or set\n"
+    "the @G" GTEST_FLAG_PREFIX_UPPER_
+    "COLOR@D environment variable to @Gno@D.\n"
+    "\n"
+    "For more information, please read the " GTEST_NAME_
+    " documentation at\n"
+    "@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
+    "(not one in your own code or tests), please report it to\n"
+    "@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+
+static bool ParseGoogleTestFlag(const char *const arg) {
+  return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
+                       &GTEST_FLAG(also_run_disabled_tests)) ||
+         ParseBoolFlag(arg, kBreakOnFailureFlag,
+                       &GTEST_FLAG(break_on_failure)) ||
+         ParseBoolFlag(arg, kCatchExceptionsFlag,
+                       &GTEST_FLAG(catch_exceptions)) ||
+         ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
+         ParseStringFlag(arg, kDeathTestStyleFlag,
+                         &GTEST_FLAG(death_test_style)) ||
+         ParseBoolFlag(arg, kDeathTestUseFork,
+                       &GTEST_FLAG(death_test_use_fork)) ||
+         ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+         ParseStringFlag(arg, kInternalRunDeathTestFlag,
+                         &GTEST_FLAG(internal_run_death_test)) ||
+         ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
+         ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+         ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+         ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
+         ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
+         ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
+         ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
+         ParseInt32Flag(arg, kStackTraceDepthFlag,
+                        &GTEST_FLAG(stack_trace_depth)) ||
+         ParseStringFlag(arg, kStreamResultToFlag,
+                         &GTEST_FLAG(stream_result_to)) ||
+         ParseBoolFlag(arg, kThrowOnFailureFlag, &GTEST_FLAG(throw_on_failure));
+}
+
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+static void LoadFlagsFromFile(const std::string &path) {
+  FILE *flagfile = posix::FOpen(path.c_str(), "r");
+  if (!flagfile) {
+    GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile)
+                      << "\"";
+  }
+  std::string contents(ReadEntireFile(flagfile));
+  posix::FClose(flagfile);
+  std::vector<std::string> lines;
+  SplitString(contents, '\n', &lines);
+  for (size_t i = 0; i < lines.size(); ++i) {
+    if (lines[i].empty()) continue;
+    if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true;
+  }
+}
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.  The type parameter CharType can be
+// instantiated to either char or wchar_t.
+template <typename CharType>
+void ParseGoogleTestFlagsOnlyImpl(int *argc, CharType **argv) {
+  for (int i = 1; i < *argc; i++) {
+    const std::string arg_string = StreamableToString(argv[i]);
+    const char *const arg = arg_string.c_str();
+
+    using internal::ParseBoolFlag;
+    using internal::ParseInt32Flag;
+    using internal::ParseStringFlag;
+
+    bool remove_flag = false;
+    if (ParseGoogleTestFlag(arg)) {
+      remove_flag = true;
+#if GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (ParseStringFlag(arg, kFlagfileFlag, &GTEST_FLAG(flagfile))) {
+      LoadFlagsFromFile(GTEST_FLAG(flagfile));
+      remove_flag = true;
+#endif  // GTEST_USE_OWN_FLAGFILE_FLAG_
+    } else if (arg_string == "--help" || arg_string == "-h" ||
+               arg_string == "-?" || arg_string == "/?" ||
+               HasGoogleTestFlagPrefix(arg)) {
+      // Both help flag and unrecognized Google Test flags (excluding
+      // internal ones) trigger help display.
+      g_help_flag = true;
+    }
+
+    if (remove_flag) {
+      // Shift the remainder of the argv list left by one.  Note
+      // that argv has (*argc + 1) elements, the last one always being
+      // NULL.  The following loop moves the trailing NULL element as
+      // well.
+      for (int j = i; j != *argc; j++) {
+        argv[j] = argv[j + 1];
+      }
+
+      // Decrements the argument count.
+      (*argc)--;
+
+      // We also need to decrement the iterator as we just removed
+      // an element.
+      i--;
+    }
+  }
+
+  if (g_help_flag) {
+    // We print the help here instead of in RUN_ALL_TESTS(), as the
+    // latter may not be called at all if the user is using Google
+    // Test with another testing framework.
+    PrintColorEncoded(kColorEncodedHelpMessage);
+  }
+}
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+void ParseGoogleTestFlagsOnly(int *argc, char **argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+
+  // Fix the value of *_NSGetArgc() on macOS, but if and only if
+  // *_NSGetArgv() == argv
+  // Only applicable to char** version of argv
+#if GTEST_OS_MAC
+#ifndef GTEST_OS_IOS
+  if (*_NSGetArgv() == argv) {
+    *_NSGetArgc() = *argc;
+  }
+#endif
+#endif
+}
+void ParseGoogleTestFlagsOnly(int *argc, wchar_t **argv) {
+  ParseGoogleTestFlagsOnlyImpl(argc, argv);
+}
+
+// The internal implementation of InitGoogleTest().
+//
+// The type parameter CharType can be instantiated to either char or
+// wchar_t.
+template <typename CharType>
+void InitGoogleTestImpl(int *argc, CharType **argv) {
+  // We don't want to run the initialization code twice.
+  if (GTestIsInitialized()) return;
+
+  if (*argc <= 0) return;
+
+  g_argvs.clear();
+  for (int i = 0; i != *argc; i++) {
+    g_argvs.push_back(StreamableToString(argv[i]));
+  }
+
+#if GTEST_HAS_ABSL
+  absl::InitializeSymbolizer(g_argvs[0].c_str());
+#endif  // GTEST_HAS_ABSL
+
+  ParseGoogleTestFlagsOnly(argc, argv);
+  GetUnitTestImpl()->PostFlagParsingInit();
+}
+
+}  // namespace internal
+
+// Initializes Google Test.  This must be called before calling
+// RUN_ALL_TESTS().  In particular, it parses a command line for the
+// flags that Google Test recognizes.  Whenever a Google Test flag is
+// seen, it is removed from argv, and *argc is decremented.
+//
+// No value is returned.  Instead, the Google Test flag variables are
+// updated.
+//
+// Calling the function for the second time has no user-visible effect.
+void InitGoogleTest(int *argc, char **argv) {
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+// This overloaded version can be used in Windows programs compiled in
+// UNICODE mode.
+void InitGoogleTest(int *argc, wchar_t **argv) {
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
+#else   // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+// This overloaded version can be used on Arduino/embedded platforms where
+// there is no argc/argv.
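+// On those platforms the expected entry points look like this (these are
+// exactly what gtest_main.cc below provides):
+//
+//   void setup() { testing::InitGoogleTest(); }
+//   void loop() { RUN_ALL_TESTS(); }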
+void InitGoogleTest() {
+  // Since Arduino doesn't have a command line, fake out the argc/argv arguments
+  int argc = 1;
+  const auto arg0 = "dummy";
+  char *argv0 = const_cast<char *>(arg0);
+  char **argv = &argv0;
+
+#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv);
+#else  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+  internal::InitGoogleTestImpl(&argc, argv);
+#endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+}
+
+std::string TempDir() {
+#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+  return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  return "\\temp\\";
+#elif GTEST_OS_WINDOWS
+  const char *temp_dir = internal::posix::GetEnv("TEMP");
+  if (temp_dir == nullptr || temp_dir[0] == '\0')
+    return "\\temp\\";
+  else if (temp_dir[strlen(temp_dir) - 1] == '\\')
+    return temp_dir;
+  else
+    return std::string(temp_dir) + "\\";
+#elif GTEST_OS_LINUX_ANDROID
+  const char *temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
+  if (temp_dir == nullptr || temp_dir[0] == '\0')
+    return "/data/local/tmp/";
+  else
+    return temp_dir;
+#else
+  return "/tmp/";
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Class ScopedTrace
+
+// Pushes the given source file location and message onto a per-thread
+// trace stack maintained by Google Test.
+void ScopedTrace::PushTrace(const char *file, int line, std::string message) {
+  internal::TraceInfo trace;
+  trace.file = file;
+  trace.line = line;
+  trace.message.swap(message);
+
+  UnitTest::GetInstance()->PushGTestTrace(trace);
+}
+
+// Pops the info pushed by the c'tor.
+ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+  UnitTest::GetInstance()->PopGTestTrace();
+}
+
+}  // namespace testing
diff --git a/libs/libaom/src/third_party/googletest/src/googletest/src/gtest_main.cc b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest_main.cc
new file mode 100644
index 000000000..77c90ce61
--- /dev/null
+++ b/libs/libaom/src/third_party/googletest/src/googletest/src/gtest_main.cc
@@ -0,0 +1,52 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <stdio.h>
+#include "gtest/gtest.h"
+
+#if GTEST_OS_ESP8266 || GTEST_OS_ESP32
+#if GTEST_OS_ESP8266
+extern "C" {
+#endif
+void setup() { testing::InitGoogleTest(); }
+
+void loop() { RUN_ALL_TESTS(); }
+
+#if GTEST_OS_ESP8266
+}
+#endif
+
+#else
+
+GTEST_API_ int main(int argc, char **argv) {
+  printf("Running main() from %s\n", __FILE__);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+#endif
diff --git a/libs/libaom/src/third_party/libwebm/AUTHORS.TXT b/libs/libaom/src/third_party/libwebm/AUTHORS.TXT
new file mode 100644
index 000000000..9686ac13e
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/AUTHORS.TXT
@@ -0,0 +1,4 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
diff --git a/libs/libaom/src/third_party/libwebm/Android.mk b/libs/libaom/src/third_party/libwebm/Android.mk
new file mode 100644
index 000000000..b46ba101d
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/Android.mk
@@ -0,0 +1,17 @@
+LOCAL_PATH:= $(call my-dir)
+
+include $(CLEAR_VARS)
+LOCAL_MODULE:= libwebm
+LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS
+LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11
+LOCAL_C_INCLUDES:= $(LOCAL_PATH)
+LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH)
+
+LOCAL_SRC_FILES:= common/file_util.cc \
+                  common/hdr_util.cc \
+                  mkvparser/mkvparser.cc \
+                  mkvparser/mkvreader.cc \
+                  mkvmuxer/mkvmuxer.cc \
+                  mkvmuxer/mkvmuxerutil.cc \
+                  mkvmuxer/mkvwriter.cc
+include $(BUILD_STATIC_LIBRARY)
diff --git a/libs/libaom/src/third_party/libwebm/LICENSE.TXT b/libs/libaom/src/third_party/libwebm/LICENSE.TXT
new file mode 100644
index 000000000..7a6f99547
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/LICENSE.TXT
@@ -0,0 +1,30 @@
+Copyright (c) 2010, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of Google nor the names of its contributors may
+    be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/libs/libaom/src/third_party/libwebm/PATENTS.TXT b/libs/libaom/src/third_party/libwebm/PATENTS.TXT
new file mode 100644
index 000000000..caedf607e
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/PATENTS.TXT
@@ -0,0 +1,23 @@
+Additional IP Rights Grant (Patents)
+------------------------------------
+
+"These implementations" means the copyrightable works that implement the WebM
+codecs distributed by Google as part of the WebM Project.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
+royalty-free, irrevocable (except as stated in this section) patent license to
+make, have made, use, offer to sell, sell, import, transfer, and otherwise
+run, modify and propagate the contents of these implementations of WebM, where
+such license applies only to those patent claims, both currently owned by
+Google and acquired in the future, licensable by Google that are necessarily
+infringed by these implementations of WebM. This grant does not include claims
+that would be infringed only as a consequence of further modification of these
+implementations. If you or your agent or exclusive licensee institute or order
+or agree to the institution of patent litigation or any other patent
+enforcement activity against any entity (including a cross-claim or
+counterclaim in a lawsuit) alleging that any of these implementations of WebM
+or any code incorporated within any of these implementations of WebM
+constitute direct or contributory patent infringement, or inducement of
+patent infringement, then any patent rights granted to you under this License
+for these implementations of WebM shall terminate as of the date such
+litigation is filed.
diff --git a/libs/libaom/src/third_party/libwebm/README.libaom b/libs/libaom/src/third_party/libwebm/README.libaom
new file mode 100644
index 000000000..1e87afd3d
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/README.libaom
@@ -0,0 +1,20 @@
+URL: https://chromium.googlesource.com/webm/libwebm
+Version: 37d9b860ebbf40cb0f6dcb7a6fef452d798062da
+License: BSD
+License File: LICENSE.txt
+
+Description:
+libwebm is used to handle WebM container I/O.
+
+Local Changes:
+Only keep:
+ - Android.mk
+ - AUTHORS.TXT
+ - common/
+    file_util.cc/h
+    hdr_util.cc/h
+    webmids.h
+ - LICENSE.TXT
+ - mkvmuxer/
+ - mkvparser/
+ - PATENTS.TXT
diff --git a/libs/libaom/src/third_party/libwebm/common/file_util.cc b/libs/libaom/src/third_party/libwebm/common/file_util.cc
new file mode 100644
index 000000000..6eb6428b9
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/common/file_util.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "common/file_util.h" + +#include +#ifndef _MSC_VER +#include // close() +#endif + +#include +#include +#include +#include +#include +#include + +namespace libwebm { + +std::string GetTempFileName() { +#if !defined _MSC_VER && !defined __MINGW32__ + std::string temp_file_name_template_str = + std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") + : ".") + + "/libwebm_temp.XXXXXX"; + char* temp_file_name_template = + new char[temp_file_name_template_str.length() + 1]; + memset(temp_file_name_template, 0, temp_file_name_template_str.length() + 1); + temp_file_name_template_str.copy(temp_file_name_template, + temp_file_name_template_str.length(), 0); + int fd = mkstemp(temp_file_name_template); + std::string temp_file_name = + (fd != -1) ? std::string(temp_file_name_template) : std::string(); + delete[] temp_file_name_template; + if (fd != -1) { + close(fd); + } + return temp_file_name; +#else + char tmp_file_name[_MAX_PATH]; +#if defined _MSC_VER || defined MINGW_HAS_SECURE_API + errno_t err = tmpnam_s(tmp_file_name); +#else + char* fname_pointer = tmpnam(tmp_file_name); + int err = (fname_pointer == &tmp_file_name[0]) ? 0 : -1; +#endif + if (err == 0) { + return std::string(tmp_file_name); + } + return std::string(); +#endif +} + +uint64_t GetFileSize(const std::string& file_name) { + uint64_t file_size = 0; +#ifndef _MSC_VER + struct stat st; + st.st_size = 0; + if (stat(file_name.c_str(), &st) == 0) { +#else + struct _stat st; + st.st_size = 0; + if (_stat(file_name.c_str(), &st) == 0) { +#endif + file_size = st.st_size; + } + return file_size; +} + +bool GetFileContents(const std::string& file_name, std::string* contents) { + std::ifstream file(file_name.c_str()); + *contents = std::string(static_cast(GetFileSize(file_name)), 0); + if (file.good() && contents->size()) { + file.read(&(*contents)[0], contents->size()); + } + return !file.fail(); +} + +TempFileDeleter::TempFileDeleter() { file_name_ = GetTempFileName(); } + +TempFileDeleter::~TempFileDeleter() { + std::ifstream file(file_name_.c_str()); + if (file.good()) { + file.close(); + std::remove(file_name_.c_str()); + } +} + +} // namespace libwebm diff --git a/libs/libaom/src/third_party/libwebm/common/file_util.h b/libs/libaom/src/third_party/libwebm/common/file_util.h new file mode 100644 index 000000000..a87373464 --- /dev/null +++ b/libs/libaom/src/third_party/libwebm/common/file_util.h @@ -0,0 +1,44 @@ +// Copyright (c) 2016 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +#ifndef LIBWEBM_COMMON_FILE_UTIL_H_ +#define LIBWEBM_COMMON_FILE_UTIL_H_ + +#include + +#include + +#include "mkvmuxer/mkvmuxertypes.h" // LIBWEBM_DISALLOW_COPY_AND_ASSIGN() + +namespace libwebm { + +// Returns a temporary file name. +std::string GetTempFileName(); + +// Returns size of file specified by |file_name|, or 0 upon failure. +uint64_t GetFileSize(const std::string& file_name); + +// Gets the contents file_name as a string. Returns false on error. +bool GetFileContents(const std::string& file_name, std::string* contents); + +// Manages life of temporary file specified at time of construction. Deletes +// file upon destruction. 
+class TempFileDeleter {
+ public:
+  TempFileDeleter();
+  explicit TempFileDeleter(std::string file_name) : file_name_(file_name) {}
+  ~TempFileDeleter();
+  const std::string& name() const { return file_name_; }
+
+ private:
+  std::string file_name_;
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TempFileDeleter);
+};
+
+}  // namespace libwebm
+
+#endif  // LIBWEBM_COMMON_FILE_UTIL_H_
diff --git a/libs/libaom/src/third_party/libwebm/common/hdr_util.cc b/libs/libaom/src/third_party/libwebm/common/hdr_util.cc
new file mode 100644
index 000000000..916f7170b
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/common/hdr_util.cc
@@ -0,0 +1,220 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#include "hdr_util.h"
+
+#include <climits>
+#include <cstddef>
+#include <new>
+
+#include "mkvparser/mkvparser.h"
+
+namespace libwebm {
+const int Vp9CodecFeatures::kValueNotPresent = INT_MAX;
+
+bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
+                             PrimaryChromaticityPtr* muxer_pc) {
+  muxer_pc->reset(new (std::nothrow)
+                      mkvmuxer::PrimaryChromaticity(parser_pc.x, parser_pc.y));
+  if (!muxer_pc->get())
+    return false;
+  return true;
+}
+
+bool MasteringMetadataValuePresent(double value) {
+  return value != mkvparser::MasteringMetadata::kValueNotPresent;
+}
+
+bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
+                           mkvmuxer::MasteringMetadata* muxer_mm) {
+  if (MasteringMetadataValuePresent(parser_mm.luminance_max))
+    muxer_mm->set_luminance_max(parser_mm.luminance_max);
+  if (MasteringMetadataValuePresent(parser_mm.luminance_min))
+    muxer_mm->set_luminance_min(parser_mm.luminance_min);
+
+  PrimaryChromaticityPtr r_ptr(nullptr);
+  PrimaryChromaticityPtr g_ptr(nullptr);
+  PrimaryChromaticityPtr b_ptr(nullptr);
+  PrimaryChromaticityPtr wp_ptr(nullptr);
+
+  if (parser_mm.r) {
+    if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr))
+      return false;
+  }
+  if (parser_mm.g) {
+    if (!CopyPrimaryChromaticity(*parser_mm.g, &g_ptr))
+      return false;
+  }
+  if (parser_mm.b) {
+    if (!CopyPrimaryChromaticity(*parser_mm.b, &b_ptr))
+      return false;
+  }
+  if (parser_mm.white_point) {
+    if (!CopyPrimaryChromaticity(*parser_mm.white_point, &wp_ptr))
+      return false;
+  }
+
+  if (!muxer_mm->SetChromaticity(r_ptr.get(), g_ptr.get(), b_ptr.get(),
+                                 wp_ptr.get())) {
+    return false;
+  }
+
+  return true;
+}
+
+bool ColourValuePresent(long long value) {
+  return value != mkvparser::Colour::kValueNotPresent;
+}
+
+bool CopyColour(const mkvparser::Colour& parser_colour,
+                mkvmuxer::Colour* muxer_colour) {
+  if (!muxer_colour)
+    return false;
+
+  if (ColourValuePresent(parser_colour.matrix_coefficients))
+    muxer_colour->set_matrix_coefficients(parser_colour.matrix_coefficients);
+  if (ColourValuePresent(parser_colour.bits_per_channel))
+    muxer_colour->set_bits_per_channel(parser_colour.bits_per_channel);
+  if (ColourValuePresent(parser_colour.chroma_subsampling_horz)) {
+    muxer_colour->set_chroma_subsampling_horz(
+        parser_colour.chroma_subsampling_horz);
+  }
+  if (ColourValuePresent(parser_colour.chroma_subsampling_vert)) {
+    muxer_colour->set_chroma_subsampling_vert(
+        parser_colour.chroma_subsampling_vert);
+  }
+  if (ColourValuePresent(parser_colour.cb_subsampling_horz))
+    muxer_colour->set_cb_subsampling_horz(parser_colour.cb_subsampling_horz);
+  if (ColourValuePresent(parser_colour.cb_subsampling_vert))
+    muxer_colour->set_cb_subsampling_vert(parser_colour.cb_subsampling_vert);
+  if (ColourValuePresent(parser_colour.chroma_siting_horz))
+    muxer_colour->set_chroma_siting_horz(parser_colour.chroma_siting_horz);
+  if (ColourValuePresent(parser_colour.chroma_siting_vert))
+    muxer_colour->set_chroma_siting_vert(parser_colour.chroma_siting_vert);
+  if (ColourValuePresent(parser_colour.range))
+    muxer_colour->set_range(parser_colour.range);
+  if (ColourValuePresent(parser_colour.transfer_characteristics)) {
+    muxer_colour->set_transfer_characteristics(
+        parser_colour.transfer_characteristics);
+  }
+  if (ColourValuePresent(parser_colour.primaries))
+    muxer_colour->set_primaries(parser_colour.primaries);
+  if (ColourValuePresent(parser_colour.max_cll))
+    muxer_colour->set_max_cll(parser_colour.max_cll);
+  if (ColourValuePresent(parser_colour.max_fall))
+    muxer_colour->set_max_fall(parser_colour.max_fall);
+
+  if (parser_colour.mastering_metadata) {
+    mkvmuxer::MasteringMetadata muxer_mm;
+    if (!CopyMasteringMetadata(*parser_colour.mastering_metadata, &muxer_mm))
+      return false;
+    if (!muxer_colour->SetMasteringMetadata(muxer_mm))
+      return false;
+  }
+  return true;
+}
+
+// Format of VPx private data:
+//
+//    0                   1                   2                   3
+//    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//   |    ID Byte    |             Length            |               |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+                               |
+//   |                                                               |
+//   :               Bytes 1..Length of Codec Feature                :
+//   |                                                               |
+//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+//
+// ID Byte Format
+// ID byte is an unsigned byte.
+//   0 1 2 3 4 5 6 7
+//  +-+-+-+-+-+-+-+-+
+//  |X|    ID       |
+//  +-+-+-+-+-+-+-+-+
+//
+// The X bit is reserved.
+//
+// See the following link for more information:
+// http://www.webmproject.org/vp9/profiles/
+bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length,
+                          Vp9CodecFeatures* features) {
+  const int kVpxCodecPrivateMinLength = 3;
+  if (!private_data || !features || length < kVpxCodecPrivateMinLength)
+    return false;
+
+  const uint8_t kVp9ProfileId = 1;
+  const uint8_t kVp9LevelId = 2;
+  const uint8_t kVp9BitDepthId = 3;
+  const uint8_t kVp9ChromaSubsamplingId = 4;
+  const int kVpxFeatureLength = 1;
+  int offset = 0;
+
+  // Set features to not set.
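+  // For example, a CodecPrivate of {0x01, 0x01, 0x00, 0x03, 0x01, 0x0A}
+  // (illustrative bytes, not taken from the spec) parses as profile 0 with
+  // bit depth 10, leaving level and chroma subsampling at kValueNotPresent.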
+  features->profile = Vp9CodecFeatures::kValueNotPresent;
+  features->level = Vp9CodecFeatures::kValueNotPresent;
+  features->bit_depth = Vp9CodecFeatures::kValueNotPresent;
+  features->chroma_subsampling = Vp9CodecFeatures::kValueNotPresent;
+  do {
+    const uint8_t id_byte = private_data[offset++];
+    const uint8_t length_byte = private_data[offset++];
+    if (length_byte != kVpxFeatureLength)
+      return false;
+    if (id_byte == kVp9ProfileId) {
+      const int priv_profile = static_cast<int>(private_data[offset++]);
+      if (priv_profile < 0 || priv_profile > 3)
+        return false;
+      if (features->profile != Vp9CodecFeatures::kValueNotPresent &&
+          features->profile != priv_profile) {
+        return false;
+      }
+      features->profile = priv_profile;
+    } else if (id_byte == kVp9LevelId) {
+      const int priv_level = static_cast<int>(private_data[offset++]);
+
+      const int kNumLevels = 14;
+      const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40,
+                                      41, 50, 51, 52, 60, 61, 62};
+
+      for (int i = 0; i < kNumLevels; ++i) {
+        if (priv_level == levels[i]) {
+          if (features->level != Vp9CodecFeatures::kValueNotPresent &&
+              features->level != priv_level) {
+            return false;
+          }
+          features->level = priv_level;
+          break;
+        }
+      }
+      if (features->level == Vp9CodecFeatures::kValueNotPresent)
+        return false;
+    } else if (id_byte == kVp9BitDepthId) {
+      const int priv_profile = static_cast<int>(private_data[offset++]);
+      if (priv_profile != 8 && priv_profile != 10 && priv_profile != 12)
+        return false;
+      if (features->bit_depth != Vp9CodecFeatures::kValueNotPresent &&
+          features->bit_depth != priv_profile) {
+        return false;
+      }
+      features->bit_depth = priv_profile;
+    } else if (id_byte == kVp9ChromaSubsamplingId) {
+      const int priv_profile = static_cast<int>(private_data[offset++]);
+      if (priv_profile != 0 && priv_profile != 2 && priv_profile != 3)
+        return false;
+      if (features->chroma_subsampling != Vp9CodecFeatures::kValueNotPresent &&
+          features->chroma_subsampling != priv_profile) {
+        return false;
+      }
+      features->chroma_subsampling = priv_profile;
+    } else {
+      // Invalid ID.
+      return false;
+    }
+  } while (offset + kVpxCodecPrivateMinLength <= length);
+
+  return true;
+}
+}  // namespace libwebm
diff --git a/libs/libaom/src/third_party/libwebm/common/hdr_util.h b/libs/libaom/src/third_party/libwebm/common/hdr_util.h
new file mode 100644
index 000000000..78e2eeb70
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/common/hdr_util.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef LIBWEBM_COMMON_HDR_UTIL_H_
+#define LIBWEBM_COMMON_HDR_UTIL_H_
+
+#include <stdint.h>
+
+#include <memory>
+
+#include "mkvmuxer/mkvmuxer.h"
+
+namespace mkvparser {
+struct Colour;
+struct MasteringMetadata;
+struct PrimaryChromaticity;
+}  // namespace mkvparser
+
+namespace libwebm {
+// Utility types and functions for working with the Colour element and its
+// children. Copiers return true upon success. Presence functions return true
+// when the specified element is present.
+
+// TODO(tomfinegan): These should be moved to libwebm_utils once c++11 is
+// required by libwebm.
+
+// Features of the VP9 codec that may be set in the CodecPrivate of a VP9 video
+// stream. A value of kValueNotPresent represents that the value was not set in
+// the CodecPrivate.
+struct Vp9CodecFeatures {
+  static const int kValueNotPresent;
+
+  Vp9CodecFeatures()
+      : profile(kValueNotPresent),
+        level(kValueNotPresent),
+        bit_depth(kValueNotPresent),
+        chroma_subsampling(kValueNotPresent) {}
+  ~Vp9CodecFeatures() {}
+
+  int profile;
+  int level;
+  int bit_depth;
+  int chroma_subsampling;
+};
+
+typedef std::unique_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
+
+bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
+                             PrimaryChromaticityPtr* muxer_pc);
+
+bool MasteringMetadataValuePresent(double value);
+
+bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm,
+                           mkvmuxer::MasteringMetadata* muxer_mm);
+
+bool ColourValuePresent(long long value);
+
+bool CopyColour(const mkvparser::Colour& parser_colour,
+                mkvmuxer::Colour* muxer_colour);
+
+// Returns true if |features| is set to one or more valid values.
+bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length,
+                          Vp9CodecFeatures* features);
+
+}  // namespace libwebm
+
+#endif  // LIBWEBM_COMMON_HDR_UTIL_H_
diff --git a/libs/libaom/src/third_party/libwebm/common/webmids.h b/libs/libaom/src/third_party/libwebm/common/webmids.h
new file mode 100644
index 000000000..fc0c20814
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/common/webmids.h
@@ -0,0 +1,193 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
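+// (The MkvId values below are the EBML/Matroska element IDs shared by the
+// muxer and the parser.)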
+
+#ifndef COMMON_WEBMIDS_H_
+#define COMMON_WEBMIDS_H_
+
+namespace libwebm {
+
+enum MkvId {
+  kMkvEBML = 0x1A45DFA3,
+  kMkvEBMLVersion = 0x4286,
+  kMkvEBMLReadVersion = 0x42F7,
+  kMkvEBMLMaxIDLength = 0x42F2,
+  kMkvEBMLMaxSizeLength = 0x42F3,
+  kMkvDocType = 0x4282,
+  kMkvDocTypeVersion = 0x4287,
+  kMkvDocTypeReadVersion = 0x4285,
+  kMkvVoid = 0xEC,
+  kMkvSignatureSlot = 0x1B538667,
+  kMkvSignatureAlgo = 0x7E8A,
+  kMkvSignatureHash = 0x7E9A,
+  kMkvSignaturePublicKey = 0x7EA5,
+  kMkvSignature = 0x7EB5,
+  kMkvSignatureElements = 0x7E5B,
+  kMkvSignatureElementList = 0x7E7B,
+  kMkvSignedElement = 0x6532,
+  // segment
+  kMkvSegment = 0x18538067,
+  // Meta Seek Information
+  kMkvSeekHead = 0x114D9B74,
+  kMkvSeek = 0x4DBB,
+  kMkvSeekID = 0x53AB,
+  kMkvSeekPosition = 0x53AC,
+  // Segment Information
+  kMkvInfo = 0x1549A966,
+  kMkvTimecodeScale = 0x2AD7B1,
+  kMkvDuration = 0x4489,
+  kMkvDateUTC = 0x4461,
+  kMkvTitle = 0x7BA9,
+  kMkvMuxingApp = 0x4D80,
+  kMkvWritingApp = 0x5741,
+  // Cluster
+  kMkvCluster = 0x1F43B675,
+  kMkvTimecode = 0xE7,
+  kMkvPrevSize = 0xAB,
+  kMkvBlockGroup = 0xA0,
+  kMkvBlock = 0xA1,
+  kMkvBlockDuration = 0x9B,
+  kMkvReferenceBlock = 0xFB,
+  kMkvLaceNumber = 0xCC,
+  kMkvSimpleBlock = 0xA3,
+  kMkvBlockAdditions = 0x75A1,
+  kMkvBlockMore = 0xA6,
+  kMkvBlockAddID = 0xEE,
+  kMkvBlockAdditional = 0xA5,
+  kMkvDiscardPadding = 0x75A2,
+  // Track
+  kMkvTracks = 0x1654AE6B,
+  kMkvTrackEntry = 0xAE,
+  kMkvTrackNumber = 0xD7,
+  kMkvTrackUID = 0x73C5,
+  kMkvTrackType = 0x83,
+  kMkvFlagEnabled = 0xB9,
+  kMkvFlagDefault = 0x88,
+  kMkvFlagForced = 0x55AA,
+  kMkvFlagLacing = 0x9C,
+  kMkvDefaultDuration = 0x23E383,
+  kMkvMaxBlockAdditionID = 0x55EE,
+  kMkvName = 0x536E,
+  kMkvLanguage = 0x22B59C,
+  kMkvCodecID = 0x86,
+  kMkvCodecPrivate = 0x63A2,
+  kMkvCodecName = 0x258688,
+  kMkvCodecDelay = 0x56AA,
+  kMkvSeekPreRoll = 0x56BB,
+  // video
+  kMkvVideo = 0xE0,
+  kMkvFlagInterlaced = 0x9A,
+  kMkvStereoMode = 0x53B8,
+  kMkvAlphaMode = 0x53C0,
+  kMkvPixelWidth = 0xB0,
+  kMkvPixelHeight = 0xBA,
+  kMkvPixelCropBottom = 0x54AA,
+  kMkvPixelCropTop = 0x54BB,
+  kMkvPixelCropLeft = 0x54CC,
+  kMkvPixelCropRight = 0x54DD,
+  kMkvDisplayWidth = 0x54B0,
+  kMkvDisplayHeight = 0x54BA,
+  kMkvDisplayUnit = 0x54B2,
+  kMkvAspectRatioType = 0x54B3,
+  kMkvColourSpace = 0x2EB524,
+  kMkvFrameRate = 0x2383E3,
+  // end video
+  // colour
+  kMkvColour = 0x55B0,
+  kMkvMatrixCoefficients = 0x55B1,
+  kMkvBitsPerChannel = 0x55B2,
+  kMkvChromaSubsamplingHorz = 0x55B3,
+  kMkvChromaSubsamplingVert = 0x55B4,
+  kMkvCbSubsamplingHorz = 0x55B5,
+  kMkvCbSubsamplingVert = 0x55B6,
+  kMkvChromaSitingHorz = 0x55B7,
+  kMkvChromaSitingVert = 0x55B8,
+  kMkvRange = 0x55B9,
+  kMkvTransferCharacteristics = 0x55BA,
+  kMkvPrimaries = 0x55BB,
+  kMkvMaxCLL = 0x55BC,
+  kMkvMaxFALL = 0x55BD,
+  // mastering metadata
+  kMkvMasteringMetadata = 0x55D0,
+  kMkvPrimaryRChromaticityX = 0x55D1,
+  kMkvPrimaryRChromaticityY = 0x55D2,
+  kMkvPrimaryGChromaticityX = 0x55D3,
+  kMkvPrimaryGChromaticityY = 0x55D4,
+  kMkvPrimaryBChromaticityX = 0x55D5,
+  kMkvPrimaryBChromaticityY = 0x55D6,
+  kMkvWhitePointChromaticityX = 0x55D7,
+  kMkvWhitePointChromaticityY = 0x55D8,
+  kMkvLuminanceMax = 0x55D9,
+  kMkvLuminanceMin = 0x55DA,
+  // end mastering metadata
+  // end colour
+  // projection
+  kMkvProjection = 0x7670,
+  kMkvProjectionType = 0x7671,
+  kMkvProjectionPrivate = 0x7672,
+  kMkvProjectionPoseYaw = 0x7673,
+  kMkvProjectionPosePitch = 0x7674,
+  kMkvProjectionPoseRoll = 0x7675,
+  // end projection
+  // audio
+  kMkvAudio = 0xE1,
+  kMkvSamplingFrequency = 0xB5,
+  kMkvOutputSamplingFrequency = 0x78B5,
+  kMkvChannels = 0x9F,
+  kMkvBitDepth = 0x6264,
+  // end audio
+  // ContentEncodings
+  kMkvContentEncodings = 0x6D80,
+  kMkvContentEncoding = 0x6240,
+  kMkvContentEncodingOrder = 0x5031,
+  kMkvContentEncodingScope = 0x5032,
+  kMkvContentEncodingType = 0x5033,
+  kMkvContentCompression = 0x5034,
+  kMkvContentCompAlgo = 0x4254,
+  kMkvContentCompSettings = 0x4255,
+  kMkvContentEncryption = 0x5035,
+  kMkvContentEncAlgo = 0x47E1,
+  kMkvContentEncKeyID = 0x47E2,
+  kMkvContentSignature = 0x47E3,
+  kMkvContentSigKeyID = 0x47E4,
+  kMkvContentSigAlgo = 0x47E5,
+  kMkvContentSigHashAlgo = 0x47E6,
+  kMkvContentEncAESSettings = 0x47E7,
+  kMkvAESSettingsCipherMode = 0x47E8,
+  kMkvAESSettingsCipherInitData = 0x47E9,
+  // end ContentEncodings
+  // Cueing Data
+  kMkvCues = 0x1C53BB6B,
+  kMkvCuePoint = 0xBB,
+  kMkvCueTime = 0xB3,
+  kMkvCueTrackPositions = 0xB7,
+  kMkvCueTrack = 0xF7,
+  kMkvCueClusterPosition = 0xF1,
+  kMkvCueBlockNumber = 0x5378,
+  // Chapters
+  kMkvChapters = 0x1043A770,
+  kMkvEditionEntry = 0x45B9,
+  kMkvChapterAtom = 0xB6,
+  kMkvChapterUID = 0x73C4,
+  kMkvChapterStringUID = 0x5654,
+  kMkvChapterTimeStart = 0x91,
+  kMkvChapterTimeEnd = 0x92,
+  kMkvChapterDisplay = 0x80,
+  kMkvChapString = 0x85,
+  kMkvChapLanguage = 0x437C,
+  kMkvChapCountry = 0x437E,
+  // Tags
+  kMkvTags = 0x1254C367,
+  kMkvTag = 0x7373,
+  kMkvSimpleTag = 0x67C8,
+  kMkvTagName = 0x45A3,
+  kMkvTagString = 0x4487
+};
+
+}  // namespace libwebm
+
+#endif  // COMMON_WEBMIDS_H_
diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.cc
new file mode 100644
index 000000000..512031211
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -0,0 +1,4221 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer/mkvmuxer.h"
+
+#include <stdint.h>
+
+#include <cfloat>
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <memory>
+#include <new>
+#include <string>
+#include <vector>
+
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxerutil.h"
+#include "mkvmuxer/mkvwriter.h"
+#include "mkvparser/mkvparser.h"
+
+namespace mkvmuxer {
+
+const float PrimaryChromaticity::kChromaticityMin = 0.0f;
+const float PrimaryChromaticity::kChromaticityMax = 1.0f;
+const float MasteringMetadata::kMinLuminance = 0.0f;
+const float MasteringMetadata::kMinLuminanceMax = 999.99f;
+const float MasteringMetadata::kMaxLuminanceMax = 9999.99f;
+const float MasteringMetadata::kValueNotPresent = FLT_MAX;
+const uint64_t Colour::kValueNotPresent = UINT64_MAX;
+
+namespace {
+
+const char kDocTypeWebm[] = "webm";
+const char kDocTypeMatroska[] = "matroska";
+
+// Deallocate the string designated by |dst|, and then copy the |src|
+// string to |dst|.  The caller owns both the |src| string and the
+// |dst| copy (hence the caller is responsible for eventually
+// deallocating the strings, either directly, or indirectly via
+// StrCpy).  Returns true if the source string was successfully copied
+// to the destination.
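+// For example, StrCpy("webm", &dst) leaves |dst| pointing at a fresh heap
+// copy of "webm", while StrCpy(NULL, &dst) frees |dst| and sets it to NULL.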
+bool StrCpy(const char* src, char** dst_ptr) {
+  if (dst_ptr == NULL)
+    return false;
+
+  char*& dst = *dst_ptr;
+
+  delete[] dst;
+  dst = NULL;
+
+  if (src == NULL)
+    return true;
+
+  const size_t size = strlen(src) + 1;
+
+  dst = new (std::nothrow) char[size];  // NOLINT
+  if (dst == NULL)
+    return false;
+
+  strcpy(dst, src);  // NOLINT
+  return true;
+}
+
+typedef std::unique_ptr<PrimaryChromaticity> PrimaryChromaticityPtr;
+bool CopyChromaticity(const PrimaryChromaticity* src,
+                      PrimaryChromaticityPtr* dst) {
+  if (!dst)
+    return false;
+
+  dst->reset(new (std::nothrow) PrimaryChromaticity(src->x(), src->y()));
+  if (!dst->get())
+    return false;
+
+  return true;
+}
+
+}  // namespace
+
+///////////////////////////////////////////////////////////////
+//
+// IMkvWriter Class
+
+IMkvWriter::IMkvWriter() {}
+
+IMkvWriter::~IMkvWriter() {}
+
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version,
+                     const char* const doc_type) {
+  // Level 0
+  uint64_t size =
+      EbmlElementSize(libwebm::kMkvEBMLVersion, static_cast<uint64>(1));
+  size += EbmlElementSize(libwebm::kMkvEBMLReadVersion, static_cast<uint64>(1));
+  size += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, static_cast<uint64>(4));
+  size +=
+      EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, static_cast<uint64>(8));
+  size += EbmlElementSize(libwebm::kMkvDocType, doc_type);
+  size += EbmlElementSize(libwebm::kMkvDocTypeVersion,
+                          static_cast<uint64>(doc_type_version));
+  size +=
+      EbmlElementSize(libwebm::kMkvDocTypeReadVersion, static_cast<uint64>(2));
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvEBML, size))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvEBMLVersion,
+                        static_cast<uint64>(1))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion,
+                        static_cast<uint64>(1))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength,
+                        static_cast<uint64>(4))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength,
+                        static_cast<uint64>(8))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvDocType, doc_type))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion,
+                        static_cast<uint64>(doc_type_version))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion,
+                        static_cast<uint64>(2))) {
+    return false;
+  }
+
+  return true;
+}
+
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) {
+  return WriteEbmlHeader(writer, doc_type_version, kDocTypeWebm);
+}
+
+bool WriteEbmlHeader(IMkvWriter* writer) {
+  return WriteEbmlHeader(writer, mkvmuxer::Segment::kDefaultDocTypeVersion);
+}
+
+bool ChunkedCopy(mkvparser::IMkvReader* source, mkvmuxer::IMkvWriter* dst,
+                 int64_t start, int64_t size) {
+  // TODO(vigneshv): Check if this is a reasonable value.
+  const uint32_t kBufSize = 2048;
+  uint8_t* buf = new uint8_t[kBufSize];
+  int64_t offset = start;
+  while (size > 0) {
+    const int64_t read_len = (size > kBufSize) ? kBufSize : size;
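+    // Copy at most kBufSize bytes per pass so the scratch buffer is reused.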
+    if (source->Read(offset, static_cast<long>(read_len), buf))
+      return false;
+    dst->Write(buf, static_cast<uint32_t>(read_len));
+    offset += read_len;
+    size -= read_len;
+  }
+  delete[] buf;
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Frame Class
+
+Frame::Frame()
+    : add_id_(0),
+      additional_(NULL),
+      additional_length_(0),
+      duration_(0),
+      duration_set_(false),
+      frame_(NULL),
+      is_key_(false),
+      length_(0),
+      track_number_(0),
+      timestamp_(0),
+      discard_padding_(0),
+      reference_block_timestamp_(0),
+      reference_block_timestamp_set_(false) {}
+
+Frame::~Frame() {
+  delete[] frame_;
+  delete[] additional_;
+}
+
+bool Frame::CopyFrom(const Frame& frame) {
+  delete[] frame_;
+  frame_ = NULL;
+  length_ = 0;
+  if (frame.length() > 0 && frame.frame() != NULL &&
+      !Init(frame.frame(), frame.length())) {
+    return false;
+  }
+  add_id_ = 0;
+  delete[] additional_;
+  additional_ = NULL;
+  additional_length_ = 0;
+  if (frame.additional_length() > 0 && frame.additional() != NULL &&
+      !AddAdditionalData(frame.additional(), frame.additional_length(),
+                         frame.add_id())) {
+    return false;
+  }
+  duration_ = frame.duration();
+  duration_set_ = frame.duration_set();
+  is_key_ = frame.is_key();
+  track_number_ = frame.track_number();
+  timestamp_ = frame.timestamp();
+  discard_padding_ = frame.discard_padding();
+  reference_block_timestamp_ = frame.reference_block_timestamp();
+  reference_block_timestamp_set_ = frame.reference_block_timestamp_set();
+  return true;
+}
+
+bool Frame::Init(const uint8_t* frame, uint64_t length) {
+  uint8_t* const data =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
+  if (!data)
+    return false;
+
+  delete[] frame_;
+  frame_ = data;
+  length_ = length;
+
+  memcpy(frame_, frame, static_cast<size_t>(length_));
+  return true;
+}
+
+bool Frame::AddAdditionalData(const uint8_t* additional, uint64_t length,
+                              uint64_t add_id) {
+  uint8_t* const data =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
+  if (!data)
+    return false;
+
+  delete[] additional_;
+  additional_ = data;
+  additional_length_ = length;
+  add_id_ = add_id;
+
+  memcpy(additional_, additional, static_cast<size_t>(additional_length_));
+  return true;
+}
+
+bool Frame::IsValid() const {
+  if (length_ == 0 || !frame_) {
+    return false;
+  }
+  if ((additional_length_ != 0 && !additional_) ||
+      (additional_ != NULL && additional_length_ == 0)) {
+    return false;
+  }
+  if (track_number_ == 0 || track_number_ > kMaxTrackNumber) {
+    return false;
+  }
+  if (!CanBeSimpleBlock() && !is_key_ && !reference_block_timestamp_set_) {
+    return false;
+  }
+  return true;
+}
+
+bool Frame::CanBeSimpleBlock() const {
+  return additional_ == NULL && discard_padding_ == 0 && duration_ == 0;
+}
+
+void Frame::set_duration(uint64_t duration) {
+  duration_ = duration;
+  duration_set_ = true;
+}
+
+void Frame::set_reference_block_timestamp(int64_t reference_block_timestamp) {
+  reference_block_timestamp_ = reference_block_timestamp;
+  reference_block_timestamp_set_ = true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// CuePoint Class
+
+CuePoint::CuePoint()
+    : time_(0),
+      track_(0),
+      cluster_pos_(0),
+      block_number_(1),
+      output_block_number_(true) {}
+
+CuePoint::~CuePoint() {}
+
+bool CuePoint::Write(IMkvWriter* writer) const {
+  if (!writer || track_ < 1 || cluster_pos_ < 1)
+    return false;
+
+  uint64_t size = EbmlElementSize(libwebm::kMkvCueClusterPosition,
+                                  static_cast<uint64>(cluster_pos_));
+  size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast<uint64>(track_));
+  if (output_block_number_ && block_number_ > 1)
+    size += EbmlElementSize(libwebm::kMkvCueBlockNumber,
+                            static_cast<uint64>(block_number_));
+  const uint64_t track_pos_size =
+      EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size;
+  const uint64_t payload_size =
+      EbmlElementSize(libwebm::kMkvCueTime, static_cast<uint64>(time_)) +
+      track_pos_size;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvCuePoint, payload_size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvCueTime,
+                        static_cast<uint64>(time_))) {
+    return false;
+  }
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvCueTrackPositions, size))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvCueTrack,
+                        static_cast<uint64>(track_))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition,
+                        static_cast<uint64>(cluster_pos_))) {
+    return false;
+  }
+  if (output_block_number_ && block_number_ > 1) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber,
+                          static_cast<uint64>(block_number_))) {
+      return false;
+    }
+  }
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0)
+    return false;
+
+  if (stop_position - payload_position != static_cast<int64_t>(payload_size))
+    return false;
+
+  return true;
+}
+
+uint64_t CuePoint::PayloadSize() const {
+  uint64_t size = EbmlElementSize(libwebm::kMkvCueClusterPosition,
+                                  static_cast<uint64>(cluster_pos_));
+  size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast<uint64>(track_));
+  if (output_block_number_ && block_number_ > 1)
+    size += EbmlElementSize(libwebm::kMkvCueBlockNumber,
+                            static_cast<uint64>(block_number_));
+  const uint64_t track_pos_size =
+      EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size;
+  const uint64_t payload_size =
+      EbmlElementSize(libwebm::kMkvCueTime, static_cast<uint64>(time_)) +
+      track_pos_size;
+
+  return payload_size;
+}
+
+uint64_t CuePoint::Size() const {
+  const uint64_t payload_size = PayloadSize();
+  return EbmlMasterElementSize(libwebm::kMkvCuePoint, payload_size) +
+         payload_size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Cues Class
+
+Cues::Cues()
+    : cue_entries_capacity_(0),
+      cue_entries_size_(0),
+      cue_entries_(NULL),
+      output_block_number_(true) {}
+
+Cues::~Cues() {
+  if (cue_entries_) {
+    for (int32_t i = 0; i < cue_entries_size_; ++i) {
+      CuePoint* const cue = cue_entries_[i];
+      delete cue;
+    }
+    delete[] cue_entries_;
+  }
+}
+
+bool Cues::AddCue(CuePoint* cue) {
+  if (!cue)
+    return false;
+
+  if ((cue_entries_size_ + 1) > cue_entries_capacity_) {
+    // Add more CuePoints.
+    const int32_t new_capacity =
+        (!cue_entries_capacity_) ? 2 : cue_entries_capacity_ * 2;
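+    // The cue list grows geometrically (doubling), starting from a
+    // capacity of 2.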
+    if (new_capacity < 1)
+      return false;
+
+    CuePoint** const cues =
+        new (std::nothrow) CuePoint*[new_capacity];  // NOLINT
+    if (!cues)
+      return false;
+
+    for (int32_t i = 0; i < cue_entries_size_; ++i) {
+      cues[i] = cue_entries_[i];
+    }
+
+    delete[] cue_entries_;
+
+    cue_entries_ = cues;
+    cue_entries_capacity_ = new_capacity;
+  }
+
+  cue->set_output_block_number(output_block_number_);
+  cue_entries_[cue_entries_size_++] = cue;
+  return true;
+}
+
+CuePoint* Cues::GetCueByIndex(int32_t index) const {
+  if (cue_entries_ == NULL)
+    return NULL;
+
+  if (index >= cue_entries_size_)
+    return NULL;
+
+  return cue_entries_[index];
+}
+
+uint64_t Cues::Size() {
+  uint64_t size = 0;
+  for (int32_t i = 0; i < cue_entries_size_; ++i)
+    size += GetCueByIndex(i)->Size();
+  size += EbmlMasterElementSize(libwebm::kMkvCues, size);
+  return size;
+}
+
+bool Cues::Write(IMkvWriter* writer) const {
+  if (!writer)
+    return false;
+
+  uint64_t size = 0;
+  for (int32_t i = 0; i < cue_entries_size_; ++i) {
+    const CuePoint* const cue = GetCueByIndex(i);
+
+    if (!cue)
+      return false;
+
+    size += cue->Size();
+  }
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvCues, size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  for (int32_t i = 0; i < cue_entries_size_; ++i) {
+    const CuePoint* const cue = GetCueByIndex(i);
+
+    if (!cue->Write(writer))
+      return false;
+  }
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0)
+    return false;
+
+  if (stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// ContentEncAESSettings Class
+
+ContentEncAESSettings::ContentEncAESSettings() : cipher_mode_(kCTR) {}
+
+uint64_t ContentEncAESSettings::Size() const {
+  const uint64_t payload = PayloadSize();
+  const uint64_t size =
+      EbmlMasterElementSize(libwebm::kMkvContentEncAESSettings, payload) +
+      payload;
+  return size;
+}
+
+bool ContentEncAESSettings::Write(IMkvWriter* writer) const {
+  const uint64_t payload = PayloadSize();
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncAESSettings,
+                              payload))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvAESSettingsCipherMode,
+                        static_cast<uint64>(cipher_mode_))) {
+    return false;
+  }
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(payload))
+    return false;
+
+  return true;
+}
+
+uint64_t ContentEncAESSettings::PayloadSize() const {
+  uint64_t size = EbmlElementSize(libwebm::kMkvAESSettingsCipherMode,
+                                  static_cast<uint64>(cipher_mode_));
+  return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// ContentEncoding Class
+
+ContentEncoding::ContentEncoding()
+    : enc_algo_(5),
+      enc_key_id_(NULL),
+      encoding_order_(0),
+      encoding_scope_(1),
+      encoding_type_(1),
+      enc_key_id_length_(0) {}
+
+ContentEncoding::~ContentEncoding() { delete[] enc_key_id_; }
+
+bool ContentEncoding::SetEncryptionID(const uint8_t* id, uint64_t length) {
+  if (!id || length < 1)
+    return false;
+
+  delete[] enc_key_id_;
+
+  enc_key_id_ =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
+  if (!enc_key_id_)
+    return false;
+
+  memcpy(enc_key_id_, id, static_cast<size_t>(length));
+  enc_key_id_length_ = length;
+
+  return true;
+}
+uint64_t ContentEncoding::Size() const {
+  const uint64_t encryption_size = EncryptionSize();
+  const uint64_t encoding_size = EncodingSize(0, encryption_size);
+  const uint64_t encodings_size =
+      EbmlMasterElementSize(libwebm::kMkvContentEncoding, encoding_size) +
+      encoding_size;
+
+  return encodings_size;
+}
+
+bool ContentEncoding::Write(IMkvWriter* writer) const {
+  const uint64_t encryption_size = EncryptionSize();
+  const uint64_t encoding_size = EncodingSize(0, encryption_size);
+  const uint64_t size =
+      EbmlMasterElementSize(libwebm::kMkvContentEncoding, encoding_size) +
+      encoding_size;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncoding,
+                              encoding_size))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingOrder,
+                        static_cast<uint64>(encoding_order_)))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingScope,
+                        static_cast<uint64>(encoding_scope_)))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingType,
+                        static_cast<uint64>(encoding_type_)))
+    return false;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncryption,
+                              encryption_size))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo,
+                        static_cast<uint64>(enc_algo_))) {
+    return false;
+  }
+  if (!WriteEbmlElement(writer, libwebm::kMkvContentEncKeyID, enc_key_id_,
+                        enc_key_id_length_))
+    return false;
+
+  if (!enc_aes_settings_.Write(writer))
+    return false;
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  return true;
+}
+
+uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size,
+                                       uint64_t encryption_size) const {
+  // TODO(fgalligan): Add support for compression settings.
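+  // Until compression is supported, a nonzero compression size cannot be
+  // represented, so the element size is reported as 0.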
+  if (compresion_size != 0)
+    return 0;
+
+  uint64_t encoding_size = 0;
+
+  if (encryption_size > 0) {
+    encoding_size +=
+        EbmlMasterElementSize(libwebm::kMkvContentEncryption, encryption_size) +
+        encryption_size;
+  }
+  encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingType,
+                                   static_cast<uint64>(encoding_type_));
+  encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingScope,
+                                   static_cast<uint64>(encoding_scope_));
+  encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingOrder,
+                                   static_cast<uint64>(encoding_order_));
+
+  return encoding_size;
+}
+
+uint64_t ContentEncoding::EncryptionSize() const {
+  const uint64_t aes_size = enc_aes_settings_.Size();
+
+  uint64_t encryption_size = EbmlElementSize(libwebm::kMkvContentEncKeyID,
+                                             enc_key_id_, enc_key_id_length_);
+  encryption_size += EbmlElementSize(libwebm::kMkvContentEncAlgo,
+                                     static_cast<uint64>(enc_algo_));
+
+  return encryption_size + aes_size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Track Class
+
+Track::Track(unsigned int* seed)
+    : codec_id_(NULL),
+      codec_private_(NULL),
+      language_(NULL),
+      max_block_additional_id_(0),
+      name_(NULL),
+      number_(0),
+      type_(0),
+      uid_(MakeUID(seed)),
+      codec_delay_(0),
+      seek_pre_roll_(0),
+      default_duration_(0),
+      codec_private_length_(0),
+      content_encoding_entries_(NULL),
+      content_encoding_entries_size_(0) {}
+
+Track::~Track() {
+  delete[] codec_id_;
+  delete[] codec_private_;
+  delete[] language_;
+  delete[] name_;
+
+  if (content_encoding_entries_) {
+    for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+      ContentEncoding* const encoding = content_encoding_entries_[i];
+      delete encoding;
+    }
+    delete[] content_encoding_entries_;
+  }
+}
+
+bool Track::AddContentEncoding() {
+  const uint32_t count = content_encoding_entries_size_ + 1;
+
+  ContentEncoding** const content_encoding_entries =
+      new (std::nothrow) ContentEncoding*[count];  // NOLINT
+  if (!content_encoding_entries)
+    return false;
+
+  ContentEncoding* const content_encoding =
+      new (std::nothrow) ContentEncoding();  // NOLINT
+  if (!content_encoding) {
+    delete[] content_encoding_entries;
+    return false;
+  }
+
+  for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+    content_encoding_entries[i] = content_encoding_entries_[i];
+  }
+
+  delete[] content_encoding_entries_;
+
+  content_encoding_entries_ = content_encoding_entries;
+  content_encoding_entries_[content_encoding_entries_size_] = content_encoding;
+  content_encoding_entries_size_ = count;
+  return true;
+}
+
+ContentEncoding* Track::GetContentEncodingByIndex(uint32_t index) const {
+  if (content_encoding_entries_ == NULL)
+    return NULL;
+
+  if (index >= content_encoding_entries_size_)
+    return NULL;
+
+  return content_encoding_entries_[index];
+}
+
+uint64_t Track::PayloadSize() const {
+  uint64_t size =
+      EbmlElementSize(libwebm::kMkvTrackNumber, static_cast<uint64>(number_));
+  size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast<uint64>(uid_));
+  size += EbmlElementSize(libwebm::kMkvTrackType, static_cast<uint64>(type_));
+  if (codec_id_)
+    size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_);
+  if (codec_private_)
+    size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_,
+                            codec_private_length_);
+  if (language_)
+    size += EbmlElementSize(libwebm::kMkvLanguage, language_);
+  if (name_)
+    size += EbmlElementSize(libwebm::kMkvName, name_);
+  if (max_block_additional_id_) {
+    size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID,
+                            static_cast<uint64>(max_block_additional_id_));
+  }
+  if (codec_delay_) {
+    size += EbmlElementSize(libwebm::kMkvCodecDelay,
+                            static_cast<uint64>(codec_delay_));
+  }
+  if (seek_pre_roll_) {
+    size += EbmlElementSize(libwebm::kMkvSeekPreRoll,
+                            static_cast<uint64>(seek_pre_roll_));
+  }
+  if (default_duration_) {
+    size += EbmlElementSize(libwebm::kMkvDefaultDuration,
+                            static_cast<uint64>(default_duration_));
+  }
+
+  if (content_encoding_entries_size_ > 0) {
+    uint64_t content_encodings_size = 0;
+    for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+      ContentEncoding* const encoding = content_encoding_entries_[i];
+      content_encodings_size += encoding->Size();
+    }
+
+    size += EbmlMasterElementSize(libwebm::kMkvContentEncodings,
+                                  content_encodings_size) +
+            content_encodings_size;
+  }
+
+  return size;
+}
+
+uint64_t Track::Size() const {
+  uint64_t size = PayloadSize();
+  size += EbmlMasterElementSize(libwebm::kMkvTrackEntry, size);
+  return size;
+}
+
+bool Track::Write(IMkvWriter* writer) const {
+  if (!writer)
+    return false;
+
+  // mandatory elements without a default value.
+  if (!type_ || !codec_id_)
+    return false;
+
+  // AV1 tracks require a CodecPrivate. See
+  // https://github.com/Matroska-Org/matroska-specification/blob/av1-mappin/codec/av1.md
+  // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to
+  // point to a stable version once it is finalized, or our own WebM mappings
+  // page on webmproject.org should we decide to release them.
+  if (!strcmp(codec_id_, Tracks::kAv1CodecId) && !codec_private_)
+    return false;
+
+  // |size| may be bigger than what is written out in this function because
+  // derived classes may write out more data in the Track element.
+  const uint64_t payload_size = PayloadSize();
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvTrackEntry, payload_size))
+    return false;
+
+  uint64_t size =
+      EbmlElementSize(libwebm::kMkvTrackNumber, static_cast<uint64>(number_));
+  size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast<uint64>(uid_));
+  size += EbmlElementSize(libwebm::kMkvTrackType, static_cast<uint64>(type_));
+  if (codec_id_)
+    size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_);
+  if (codec_private_)
+    size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_,
+                            static_cast<uint64>(codec_private_length_));
+  if (language_)
+    size += EbmlElementSize(libwebm::kMkvLanguage, language_);
+  if (name_)
+    size += EbmlElementSize(libwebm::kMkvName, name_);
+  if (max_block_additional_id_)
+    size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID,
+                            static_cast<uint64>(max_block_additional_id_));
+  if (codec_delay_)
+    size += EbmlElementSize(libwebm::kMkvCodecDelay,
+                            static_cast<uint64>(codec_delay_));
+  if (seek_pre_roll_)
+    size += EbmlElementSize(libwebm::kMkvSeekPreRoll,
+                            static_cast<uint64>(seek_pre_roll_));
+  if (default_duration_)
+    size += EbmlElementSize(libwebm::kMkvDefaultDuration,
+                            static_cast<uint64>(default_duration_));
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber,
+                        static_cast<uint64>(number_)))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID,
+                        static_cast<uint64>(uid_)))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvTrackType,
+                        static_cast<uint64>(type_)))
+    return false;
+  if (max_block_additional_id_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvMaxBlockAdditionID,
+                          static_cast<uint64>(max_block_additional_id_))) {
+      return false;
+    }
+  }
+  if (codec_delay_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay,
+                          static_cast<uint64>(codec_delay_)))
+      return false;
+  }
+  if (seek_pre_roll_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll,
+                          static_cast<uint64>(seek_pre_roll_)))
+      return false;
+  }
+  if (default_duration_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvDefaultDuration,
+                          static_cast<uint64>(default_duration_)))
+      return false;
+  }
+  if (codec_id_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvCodecID, codec_id_))
+      return false;
+  }
+  if (codec_private_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvCodecPrivate, codec_private_,
+                          static_cast<uint64>(codec_private_length_)))
+      return false;
+  }
+  if (language_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvLanguage, language_))
+      return false;
+  }
+  if (name_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvName, name_))
+      return false;
+  }
+
+  int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  if (content_encoding_entries_size_ > 0) {
+    uint64_t content_encodings_size = 0;
+    for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+      ContentEncoding* const encoding = content_encoding_entries_[i];
+      content_encodings_size += encoding->Size();
+    }
+
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncodings,
+                                content_encodings_size))
+      return false;
+
+    for (uint32_t i = 0; i < content_encoding_entries_size_; ++i) {
+      ContentEncoding* const encoding = content_encoding_entries_[i];
+      if (!encoding->Write(writer))
+        return false;
+    }
+  }
+
+  stop_position = writer->Position();
+  if (stop_position < 0)
+    return false;
+  return true;
+}
+
+bool Track::SetCodecPrivate(const uint8_t* codec_private, uint64_t length) {
+  if (!codec_private || length < 1)
+    return false;
+
+  delete[] codec_private_;
+
+  codec_private_ =
+      new (std::nothrow) uint8_t[static_cast<size_t>(length)];  // NOLINT
+  if (!codec_private_)
+    return false;
+
+  memcpy(codec_private_, codec_private, static_cast<size_t>(length));
+  codec_private_length_ = length;
+
+  return true;
+}
+
+void Track::set_codec_id(const char* codec_id) {
+  if (codec_id) {
+    delete[] codec_id_;
+
+    const size_t length = strlen(codec_id) + 1;
+    codec_id_ = new (std::nothrow) char[length];  // NOLINT
+    if (codec_id_) {
+#ifdef _MSC_VER
+      strcpy_s(codec_id_, length, codec_id);
+#else
+      strcpy(codec_id_, codec_id);
+#endif
+    }
+  }
+}
+
+// TODO(fgalligan): Vet the language parameter.
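+// For example, set_language("und") replaces any previously stored value
+// with a heap-allocated copy of "und".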
+void Track::set_language(const char* language) {
+  if (language) {
+    delete[] language_;
+
+    const size_t length = strlen(language) + 1;
+    language_ = new (std::nothrow) char[length];  // NOLINT
+    if (language_) {
+#ifdef _MSC_VER
+      strcpy_s(language_, length, language);
+#else
+      strcpy(language_, language);
+#endif
+    }
+  }
+}
+
+void Track::set_name(const char* name) {
+  if (name) {
+    delete[] name_;
+
+    const size_t length = strlen(name) + 1;
+    name_ = new (std::nothrow) char[length];  // NOLINT
+    if (name_) {
+#ifdef _MSC_VER
+      strcpy_s(name_, length, name);
+#else
+      strcpy(name_, name);
+#endif
+    }
+  }
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Colour and its child elements
+
+uint64_t PrimaryChromaticity::PrimaryChromaticitySize(
+    libwebm::MkvId x_id, libwebm::MkvId y_id) const {
+  return EbmlElementSize(x_id, x_) + EbmlElementSize(y_id, y_);
+}
+
+bool PrimaryChromaticity::Write(IMkvWriter* writer, libwebm::MkvId x_id,
+                                libwebm::MkvId y_id) const {
+  if (!Valid()) {
+    return false;
+  }
+  return WriteEbmlElement(writer, x_id, x_) &&
+         WriteEbmlElement(writer, y_id, y_);
+}
+
+bool PrimaryChromaticity::Valid() const {
+  return (x_ >= kChromaticityMin && x_ <= kChromaticityMax &&
+          y_ >= kChromaticityMin && y_ <= kChromaticityMax);
+}
+
+uint64_t MasteringMetadata::MasteringMetadataSize() const {
+  uint64_t size = PayloadSize();
+
+  if (size > 0)
+    size += EbmlMasterElementSize(libwebm::kMkvMasteringMetadata, size);
+
+  return size;
+}
+
+bool MasteringMetadata::Valid() const {
+  if (luminance_min_ != kValueNotPresent) {
+    if (luminance_min_ < kMinLuminance || luminance_min_ > kMinLuminanceMax ||
+        luminance_min_ > luminance_max_) {
+      return false;
+    }
+  }
+  if (luminance_max_ != kValueNotPresent) {
+    if (luminance_max_ < kMinLuminance || luminance_max_ > kMaxLuminanceMax ||
+        luminance_max_ < luminance_min_) {
+      return false;
+    }
+  }
+  if (r_ && !r_->Valid())
+    return false;
+  if (g_ && !g_->Valid())
+    return false;
+  if (b_ && !b_->Valid())
+    return false;
+  if (white_point_ && !white_point_->Valid())
+    return false;
+
+  return true;
+}
+
+bool MasteringMetadata::Write(IMkvWriter* writer) const {
+  const uint64_t size = PayloadSize();
+
+  // Don't write an empty element.
+ if (size == 0) + return true; + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvMasteringMetadata, size)) + return false; + if (luminance_max_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max_)) { + return false; + } + if (luminance_min_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) { + return false; + } + if (r_ && !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX, + libwebm::kMkvPrimaryRChromaticityY)) { + return false; + } + if (g_ && !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX, + libwebm::kMkvPrimaryGChromaticityY)) { + return false; + } + if (b_ && !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX, + libwebm::kMkvPrimaryBChromaticityY)) { + return false; + } + if (white_point_ && + !white_point_->Write(writer, libwebm::kMkvWhitePointChromaticityX, + libwebm::kMkvWhitePointChromaticityY)) { + return false; + } + + return true; +} + +bool MasteringMetadata::SetChromaticity( + const PrimaryChromaticity* r, const PrimaryChromaticity* g, + const PrimaryChromaticity* b, const PrimaryChromaticity* white_point) { + PrimaryChromaticityPtr r_ptr(nullptr); + if (r) { + if (!CopyChromaticity(r, &r_ptr)) + return false; + } + PrimaryChromaticityPtr g_ptr(nullptr); + if (g) { + if (!CopyChromaticity(g, &g_ptr)) + return false; + } + PrimaryChromaticityPtr b_ptr(nullptr); + if (b) { + if (!CopyChromaticity(b, &b_ptr)) + return false; + } + PrimaryChromaticityPtr wp_ptr(nullptr); + if (white_point) { + if (!CopyChromaticity(white_point, &wp_ptr)) + return false; + } + + r_ = r_ptr.release(); + g_ = g_ptr.release(); + b_ = b_ptr.release(); + white_point_ = wp_ptr.release(); + return true; +} + +uint64_t MasteringMetadata::PayloadSize() const { + uint64_t size = 0; + + if (luminance_max_ != kValueNotPresent) + size += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max_); + if (luminance_min_ != kValueNotPresent) + size += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min_); + + if (r_) { + size += r_->PrimaryChromaticitySize(libwebm::kMkvPrimaryRChromaticityX, + libwebm::kMkvPrimaryRChromaticityY); + } + if (g_) { + size += g_->PrimaryChromaticitySize(libwebm::kMkvPrimaryGChromaticityX, + libwebm::kMkvPrimaryGChromaticityY); + } + if (b_) { + size += b_->PrimaryChromaticitySize(libwebm::kMkvPrimaryBChromaticityX, + libwebm::kMkvPrimaryBChromaticityY); + } + if (white_point_) { + size += white_point_->PrimaryChromaticitySize( + libwebm::kMkvWhitePointChromaticityX, + libwebm::kMkvWhitePointChromaticityY); + } + + return size; +} + +uint64_t Colour::ColourSize() const { + uint64_t size = PayloadSize(); + + if (size > 0) + size += EbmlMasterElementSize(libwebm::kMkvColour, size); + + return size; +} + +bool Colour::Valid() const { + if (mastering_metadata_ && !mastering_metadata_->Valid()) + return false; + if (matrix_coefficients_ != kValueNotPresent && + !IsMatrixCoefficientsValueValid(matrix_coefficients_)) { + return false; + } + if (chroma_siting_horz_ != kValueNotPresent && + !IsChromaSitingHorzValueValid(chroma_siting_horz_)) { + return false; + } + if (chroma_siting_vert_ != kValueNotPresent && + !IsChromaSitingVertValueValid(chroma_siting_vert_)) { + return false; + } + if (range_ != kValueNotPresent && !IsColourRangeValueValid(range_)) + return false; + if (transfer_characteristics_ != kValueNotPresent && + !IsTransferCharacteristicsValueValid(transfer_characteristics_)) { + return false; + } + if (primaries_ != kValueNotPresent && !IsPrimariesValueValid(primaries_)) + 
return false;
+
+  return true;
+}
+
+bool Colour::Write(IMkvWriter* writer) const {
+  const uint64_t size = PayloadSize();
+
+  // Don't write an empty element.
+  if (size == 0)
+    return true;
+
+  // Don't write an invalid element.
+  if (!Valid())
+    return false;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvColour, size))
+    return false;
+
+  if (matrix_coefficients_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvMatrixCoefficients,
+                        static_cast<uint64>(matrix_coefficients_))) {
+    return false;
+  }
+  if (bits_per_channel_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvBitsPerChannel,
+                        static_cast<uint64>(bits_per_channel_))) {
+    return false;
+  }
+  if (chroma_subsampling_horz_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingHorz,
+                        static_cast<uint64>(chroma_subsampling_horz_))) {
+    return false;
+  }
+  if (chroma_subsampling_vert_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingVert,
+                        static_cast<uint64>(chroma_subsampling_vert_))) {
+    return false;
+  }
+
+  if (cb_subsampling_horz_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingHorz,
+                        static_cast<uint64>(cb_subsampling_horz_))) {
+    return false;
+  }
+  if (cb_subsampling_vert_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingVert,
+                        static_cast<uint64>(cb_subsampling_vert_))) {
+    return false;
+  }
+  if (chroma_siting_horz_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvChromaSitingHorz,
+                        static_cast<uint64>(chroma_siting_horz_))) {
+    return false;
+  }
+  if (chroma_siting_vert_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvChromaSitingVert,
+                        static_cast<uint64>(chroma_siting_vert_))) {
+    return false;
+  }
+  if (range_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvRange,
+                        static_cast<uint64>(range_))) {
+    return false;
+  }
+  if (transfer_characteristics_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvTransferCharacteristics,
+                        static_cast<uint64>(transfer_characteristics_))) {
+    return false;
+  }
+  if (primaries_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvPrimaries,
+                        static_cast<uint64>(primaries_))) {
+    return false;
+  }
+  if (max_cll_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvMaxCLL,
+                        static_cast<uint64>(max_cll_))) {
+    return false;
+  }
+  if (max_fall_ != kValueNotPresent &&
+      !WriteEbmlElement(writer, libwebm::kMkvMaxFALL,
+                        static_cast<uint64>(max_fall_))) {
+    return false;
+  }
+
+  if (mastering_metadata_ && !mastering_metadata_->Write(writer))
+    return false;
+
+  return true;
+}
+
+bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) {
+  std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  if (!mm_ptr.get())
+    return false;
+
+  mm_ptr->set_luminance_max(mastering_metadata.luminance_max());
+  mm_ptr->set_luminance_min(mastering_metadata.luminance_min());
+
+  if (!mm_ptr->SetChromaticity(mastering_metadata.r(), mastering_metadata.g(),
+                               mastering_metadata.b(),
+                               mastering_metadata.white_point())) {
+    return false;
+  }
+
+  delete mastering_metadata_;
+  mastering_metadata_ = mm_ptr.release();
+  return true;
+}
+
+uint64_t Colour::PayloadSize() const {
+  uint64_t size = 0;
+
+  if (matrix_coefficients_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvMatrixCoefficients,
+                            static_cast<uint64>(matrix_coefficients_));
+  }
+  if (bits_per_channel_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvBitsPerChannel,
+                            static_cast<uint64>(bits_per_channel_));
+  }
+  if (chroma_subsampling_horz_ != kValueNotPresent) {
+    size +=
+        EbmlElementSize(libwebm::kMkvChromaSubsamplingHorz,
+                        static_cast<uint64>(chroma_subsampling_horz_));
+  }
+  if (chroma_subsampling_vert_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvChromaSubsamplingVert,
+                            static_cast<uint64>(chroma_subsampling_vert_));
+  }
+  if (cb_subsampling_horz_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvCbSubsamplingHorz,
+                            static_cast<uint64>(cb_subsampling_horz_));
+  }
+  if (cb_subsampling_vert_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvCbSubsamplingVert,
+                            static_cast<uint64>(cb_subsampling_vert_));
+  }
+  if (chroma_siting_horz_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvChromaSitingHorz,
+                            static_cast<uint64>(chroma_siting_horz_));
+  }
+  if (chroma_siting_vert_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvChromaSitingVert,
+                            static_cast<uint64>(chroma_siting_vert_));
+  }
+  if (range_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvRange, static_cast<uint64>(range_));
+  }
+  if (transfer_characteristics_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvTransferCharacteristics,
+                            static_cast<uint64>(transfer_characteristics_));
+  }
+  if (primaries_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvPrimaries,
+                            static_cast<uint64>(primaries_));
+  }
+  if (max_cll_ != kValueNotPresent) {
+    size += EbmlElementSize(libwebm::kMkvMaxCLL, static_cast<uint64>(max_cll_));
+  }
+  if (max_fall_ != kValueNotPresent) {
+    size +=
+        EbmlElementSize(libwebm::kMkvMaxFALL, static_cast<uint64>(max_fall_));
+  }
+
+  if (mastering_metadata_)
+    size += mastering_metadata_->MasteringMetadataSize();
+
+  return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Projection element
+
+uint64_t Projection::ProjectionSize() const {
+  uint64_t size = PayloadSize();
+
+  if (size > 0)
+    size += EbmlMasterElementSize(libwebm::kMkvProjection, size);
+
+  return size;
+}
+
+bool Projection::Write(IMkvWriter* writer) const {
+  const uint64_t size = PayloadSize();
+
+  // Don't write an empty element.
+  if (size == 0)
+    return true;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvProjection, size))
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvProjectionType,
+                        static_cast<uint64>(type_))) {
+    return false;
+  }
+
+  if (private_data_length_ > 0 && private_data_ != NULL &&
+      !WriteEbmlElement(writer, libwebm::kMkvProjectionPrivate, private_data_,
+                        private_data_length_)) {
+    return false;
+  }
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseYaw, pose_yaw_))
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPosePitch,
+                        pose_pitch_)) {
+    return false;
+  }
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseRoll, pose_roll_)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool Projection::SetProjectionPrivate(const uint8_t* data,
+                                      uint64_t data_length) {
+  if (data == NULL || data_length == 0) {
+    return false;
+  }
+
+  // Guards against truncation when size_t is narrower than 64 bits.
+  if (data_length != static_cast<size_t>(data_length)) {
+    return false;
+  }
+
+  uint8_t* new_private_data =
+      new (std::nothrow) uint8_t[static_cast<size_t>(data_length)];
+  if (new_private_data == NULL) {
+    return false;
+  }
+
+  delete[] private_data_;
+  private_data_ = new_private_data;
+  private_data_length_ = data_length;
+  memcpy(private_data_, data, static_cast<size_t>(data_length));
+
+  return true;
+}
+
+uint64_t Projection::PayloadSize() const {
+  uint64_t size =
+      EbmlElementSize(libwebm::kMkvProjection, static_cast<uint64>(type_));
+
+  if (private_data_length_ > 0 && private_data_ != NULL) {
+    size += EbmlElementSize(libwebm::kMkvProjectionPrivate, private_data_,
+                            private_data_length_);
+  }
+
+  size += EbmlElementSize(libwebm::kMkvProjectionPoseYaw, pose_yaw_);
+  size += EbmlElementSize(libwebm::kMkvProjectionPosePitch, pose_pitch_);
+  size += EbmlElementSize(libwebm::kMkvProjectionPoseRoll, pose_roll_);
+
+  return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// VideoTrack Class
+
+VideoTrack::VideoTrack(unsigned int* seed)
+    : Track(seed),
+      display_height_(0),
+      display_width_(0),
+      pixel_height_(0),
+      pixel_width_(0),
+      crop_left_(0),
+      crop_right_(0),
+      crop_top_(0),
+      crop_bottom_(0),
+      frame_rate_(0.0),
+      height_(0),
+      stereo_mode_(0),
+      alpha_mode_(0),
+      width_(0),
+      colour_space_(NULL),
+      colour_(NULL),
+      projection_(NULL) {}
+
+VideoTrack::~VideoTrack() {
+  delete colour_;
+  delete projection_;
+}
+
+bool VideoTrack::SetStereoMode(uint64_t stereo_mode) {
+  if (stereo_mode != kMono && stereo_mode != kSideBySideLeftIsFirst &&
+      stereo_mode != kTopBottomRightIsFirst &&
+      stereo_mode != kTopBottomLeftIsFirst &&
+      stereo_mode != kSideBySideRightIsFirst)
+    return false;
+
+  stereo_mode_ = stereo_mode;
+  return true;
+}
+
+bool VideoTrack::SetAlphaMode(uint64_t alpha_mode) {
+  if (alpha_mode != kNoAlpha && alpha_mode != kAlpha)
+    return false;
+
+  alpha_mode_ = alpha_mode;
+  return true;
+}
+
+uint64_t VideoTrack::PayloadSize() const {
+  const uint64_t parent_size = Track::PayloadSize();
+
+  uint64_t size = VideoPayloadSize();
+  size += EbmlMasterElementSize(libwebm::kMkvVideo, size);
+
+  return parent_size + size;
+}
+
+bool VideoTrack::Write(IMkvWriter* writer) const {
+  if (!Track::Write(writer))
+    return false;
+
+  const uint64_t size = VideoPayloadSize();
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvVideo, size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(
+          writer, libwebm::kMkvPixelWidth,
+          static_cast<uint64>((pixel_width_ > 0) ? pixel_width_ : width_)))
+    return false;
+  if (!WriteEbmlElement(
+          writer, libwebm::kMkvPixelHeight,
+          static_cast<uint64>((pixel_height_ > 0) ? pixel_height_ : height_)))
+    return false;
+  if (display_width_ > 0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvDisplayWidth,
+                          static_cast<uint64>(display_width_)))
+      return false;
+  }
+  if (display_height_ > 0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvDisplayHeight,
+                          static_cast<uint64>(display_height_)))
+      return false;
+  }
+  if (crop_left_ > 0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft,
+                          static_cast<uint64>(crop_left_)))
+      return false;
+  }
+  if (crop_right_ > 0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropRight,
+                          static_cast<uint64>(crop_right_)))
+      return false;
+  }
+  if (crop_top_ > 0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropTop,
+                          static_cast<uint64>(crop_top_)))
+      return false;
+  }
+  if (crop_bottom_ > 0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom,
+                          static_cast<uint64>(crop_bottom_)))
+      return false;
+  }
+  if (stereo_mode_ > kMono) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvStereoMode,
+                          static_cast<uint64>(stereo_mode_)))
+      return false;
+  }
+  if (alpha_mode_ > kNoAlpha) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvAlphaMode,
+                          static_cast<uint64>(alpha_mode_)))
+      return false;
+  }
+  if (colour_space_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvColourSpace, colour_space_))
+      return false;
+  }
+  if (frame_rate_ > 0.0) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvFrameRate,
+                          static_cast<float>(frame_rate_))) {
+      return false;
+    }
+  }
+  if (colour_) {
+    if (!colour_->Write(writer))
+      return false;
+  }
+  if (projection_) {
+    if (!projection_->Write(writer))
+      return false;
+  }
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size)) {
+    return false;
+  }
+
+  return true;
+}
+
+void VideoTrack::set_colour_space(const char* colour_space) {
+  if (colour_space) {
+    delete[] colour_space_;
+
+    const size_t length = strlen(colour_space) + 1;
+    colour_space_ = new (std::nothrow) char[length];  // NOLINT
+    if (colour_space_) {
+#ifdef _MSC_VER
+      strcpy_s(colour_space_, length, colour_space);
+#else
+      strcpy(colour_space_, colour_space);
+#endif
+    }
+  }
+}
+
+bool VideoTrack::SetColour(const Colour& colour) {
+  std::unique_ptr<Colour> colour_ptr(new Colour());
+  if (!colour_ptr.get())
+    return false;
+
+  if (colour.mastering_metadata()) {
+    if (!colour_ptr->SetMasteringMetadata(*colour.mastering_metadata()))
+      return false;
+  }
+
+  colour_ptr->set_matrix_coefficients(colour.matrix_coefficients());
+  colour_ptr->set_bits_per_channel(colour.bits_per_channel());
+  colour_ptr->set_chroma_subsampling_horz(colour.chroma_subsampling_horz());
+  colour_ptr->set_chroma_subsampling_vert(colour.chroma_subsampling_vert());
+  colour_ptr->set_cb_subsampling_horz(colour.cb_subsampling_horz());
+  colour_ptr->set_cb_subsampling_vert(colour.cb_subsampling_vert());
+  colour_ptr->set_chroma_siting_horz(colour.chroma_siting_horz());
+  colour_ptr->set_chroma_siting_vert(colour.chroma_siting_vert());
+  colour_ptr->set_range(colour.range());
+  colour_ptr->set_transfer_characteristics(colour.transfer_characteristics());
+  colour_ptr->set_primaries(colour.primaries());
+  colour_ptr->set_max_cll(colour.max_cll());
+  colour_ptr->set_max_fall(colour.max_fall());
+  delete colour_;
+  colour_ = colour_ptr.release();
+  return true;
+}
+
+bool VideoTrack::SetProjection(const Projection& projection) {
+  std::unique_ptr<Projection> projection_ptr(new Projection());
+  if (!projection_ptr.get())
+    return false;
+
+  if (projection.private_data()) {
+    if (!projection_ptr->SetProjectionPrivate(
+            projection.private_data(), projection.private_data_length())) {
+      return false;
+    }
+  }
+
+  projection_ptr->set_type(projection.type());
+  projection_ptr->set_pose_yaw(projection.pose_yaw());
+  projection_ptr->set_pose_pitch(projection.pose_pitch());
+  projection_ptr->set_pose_roll(projection.pose_roll());
+  delete projection_;
+  projection_ = projection_ptr.release();
+  return true;
+}
+
+uint64_t VideoTrack::VideoPayloadSize() const {
+  uint64_t size = EbmlElementSize(
+      libwebm::kMkvPixelWidth,
+      static_cast<uint64>((pixel_width_ > 0) ? pixel_width_ : width_));
+  size += EbmlElementSize(
+      libwebm::kMkvPixelHeight,
+      static_cast<uint64>((pixel_height_ > 0) ? pixel_height_ : height_));
+  if (display_width_ > 0)
+    size += EbmlElementSize(libwebm::kMkvDisplayWidth,
+                            static_cast<uint64>(display_width_));
+  if (display_height_ > 0)
+    size += EbmlElementSize(libwebm::kMkvDisplayHeight,
+                            static_cast<uint64>(display_height_));
+  if (crop_left_ > 0)
+    size += EbmlElementSize(libwebm::kMkvPixelCropLeft,
+                            static_cast<uint64>(crop_left_));
+  if (crop_right_ > 0)
+    size += EbmlElementSize(libwebm::kMkvPixelCropRight,
+                            static_cast<uint64>(crop_right_));
+  if (crop_top_ > 0)
+    size += EbmlElementSize(libwebm::kMkvPixelCropTop,
+                            static_cast<uint64>(crop_top_));
+  if (crop_bottom_ > 0)
+    size += EbmlElementSize(libwebm::kMkvPixelCropBottom,
+                            static_cast<uint64>(crop_bottom_));
+  if (stereo_mode_ > kMono)
+    size += EbmlElementSize(libwebm::kMkvStereoMode,
+                            static_cast<uint64>(stereo_mode_));
+  if (alpha_mode_ > kNoAlpha)
+    size += EbmlElementSize(libwebm::kMkvAlphaMode,
+                            static_cast<uint64>(alpha_mode_));
+  if (frame_rate_ > 0.0)
+    size += EbmlElementSize(libwebm::kMkvFrameRate,
+                            static_cast<float>(frame_rate_));
+  if (colour_space_)
+    size += EbmlElementSize(libwebm::kMkvColourSpace, colour_space_);
+  if (colour_)
+    size += colour_->ColourSize();
+  if (projection_)
+    size += projection_->ProjectionSize();
+
+  return size;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// AudioTrack Class
+
+AudioTrack::AudioTrack(unsigned int* seed)
+    : Track(seed), bit_depth_(0), channels_(1), sample_rate_(0.0) {}
+
+AudioTrack::~AudioTrack() {}
+
+uint64_t AudioTrack::PayloadSize() const {
+  const uint64_t parent_size = Track::PayloadSize();
+
+  uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency,
+                                  static_cast<float>(sample_rate_));
+  size +=
+      EbmlElementSize(libwebm::kMkvChannels, static_cast<uint64>(channels_));
+  if (bit_depth_ > 0)
+    size +=
+        EbmlElementSize(libwebm::kMkvBitDepth, static_cast<uint64>(bit_depth_));
+  size += EbmlMasterElementSize(libwebm::kMkvAudio, size);
+
+  return parent_size + size;
+}
+
+bool AudioTrack::Write(IMkvWriter* writer) const {
+  if (!Track::Write(writer))
+    return false;
+
+  // Calculate AudioSettings size.
+  uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency,
+                                  static_cast<float>(sample_rate_));
+  size +=
+      EbmlElementSize(libwebm::kMkvChannels, static_cast<uint64>(channels_));
+  if (bit_depth_ > 0)
+    size +=
+        EbmlElementSize(libwebm::kMkvBitDepth, static_cast<uint64>(bit_depth_));
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvAudio, size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvSamplingFrequency,
+                        static_cast<float>(sample_rate_)))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvChannels,
+                        static_cast<uint64>(channels_)))
+    return false;
+  if (bit_depth_ > 0)
+    if (!WriteEbmlElement(writer, libwebm::kMkvBitDepth,
+                          static_cast<uint64>(bit_depth_)))
+      return false;
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Tracks Class
+
+const char Tracks::kOpusCodecId[] = "A_OPUS";
+const char Tracks::kVorbisCodecId[] = "A_VORBIS";
+const char Tracks::kAv1CodecId[] = "V_AV1";
+const char Tracks::kVp8CodecId[] = "V_VP8";
+const char Tracks::kVp9CodecId[] = "V_VP9";
+const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS";
+const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS";
+const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA";
+const char Tracks::kWebVttSubtitlesId[] = "D_WEBVTT/SUBTITLES";
+
+Tracks::Tracks()
+    : track_entries_(NULL), track_entries_size_(0), wrote_tracks_(false) {}
+
+Tracks::~Tracks() {
+  if (track_entries_) {
+    for (uint32_t i = 0; i < track_entries_size_; ++i) {
+      Track* const track = track_entries_[i];
+      delete track;
+    }
+    delete[] track_entries_;
+  }
+}
+
+bool Tracks::AddTrack(Track* track, int32_t number) {
+  if (number < 0 || wrote_tracks_)
+    return false;
+
+  // This muxer only supports track numbers in the range [1, 126], in
+  // order to be able (using the Matroska integer representation) to
+  // serialize the block header (of which the track number is a part)
+  // for a frame using exactly 4 bytes.
+
+  if (number > 0x7E)
+    return false;
+
+  uint32_t track_num = number;
+
+  if (track_num > 0) {
+    // Check to make sure a track does not already have |track_num|.
+    for (uint32_t i = 0; i < track_entries_size_; ++i) {
+      if (track_entries_[i]->number() == track_num)
+        return false;
+    }
+  }
+
+  const uint32_t count = track_entries_size_ + 1;
+
+  Track** const track_entries = new (std::nothrow) Track*[count];  // NOLINT
+  if (!track_entries)
+    return false;
+
+  for (uint32_t i = 0; i < track_entries_size_; ++i) {
+    track_entries[i] = track_entries_[i];
+  }
+
+  delete[] track_entries_;
+
+  // Find the lowest available track number > 0.
+  if (track_num == 0) {
+    track_num = count;
+
+    // Check to make sure a track does not already have |track_num|.
+    bool exit = false;
+    do {
+      exit = true;
+      for (uint32_t i = 0; i < track_entries_size_; ++i) {
+        if (track_entries[i]->number() == track_num) {
+          track_num++;
+          exit = false;
+          break;
+        }
+      }
+    } while (!exit);
+  }
+  track->set_number(track_num);
+
+  track_entries_ = track_entries;
+  track_entries_[track_entries_size_] = track;
+  track_entries_size_ = count;
+  return true;
+}
+
+const Track* Tracks::GetTrackByIndex(uint32_t index) const {
+  if (track_entries_ == NULL)
+    return NULL;
+
+  if (index >= track_entries_size_)
+    return NULL;
+
+  return track_entries_[index];
+}
+
+Track* Tracks::GetTrackByNumber(uint64_t track_number) const {
+  const int32_t count = track_entries_size();
+  for (int32_t i = 0; i < count; ++i) {
+    if (track_entries_[i]->number() == track_number)
+      return track_entries_[i];
+  }
+
+  return NULL;
+}
+
+bool Tracks::TrackIsAudio(uint64_t track_number) const {
+  const Track* const track = GetTrackByNumber(track_number);
+
+  if (track->type() == kAudio)
+    return true;
+
+  return false;
+}
+
+bool Tracks::TrackIsVideo(uint64_t track_number) const {
+  const Track* const track = GetTrackByNumber(track_number);
+
+  if (track->type() == kVideo)
+    return true;
+
+  return false;
+}
+
+bool Tracks::Write(IMkvWriter* writer) const {
+  uint64_t size = 0;
+  const int32_t count = track_entries_size();
+  for (int32_t i = 0; i < count; ++i) {
+    const Track* const track = GetTrackByIndex(i);
+
+    if (!track)
+      return false;
+
+    size += track->Size();
+  }
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvTracks, size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  for (int32_t i = 0; i < count; ++i) {
+    const Track* const track = GetTrackByIndex(i);
+    if (!track->Write(writer))
+      return false;
+  }
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  wrote_tracks_ = true;
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// Chapter Class
+
+bool Chapter::set_id(const char* id) { return StrCpy(id, &id_); }
+
+void Chapter::set_time(const Segment& segment, uint64_t start_ns,
+                       uint64_t end_ns) {
+  const SegmentInfo* const info = segment.GetSegmentInfo();
+  const uint64_t timecode_scale = info->timecode_scale();
+  start_timecode_ = start_ns / timecode_scale;
+  end_timecode_ = end_ns / timecode_scale;
+}
+
+bool Chapter::add_string(const char* title, const char* language,
+                         const char* country) {
+  if (!ExpandDisplaysArray())
+    return false;
+
+  Display& d = displays_[displays_count_++];
+  d.Init();
+
+  if (!d.set_title(title))
+    return false;
+
+  if (!d.set_language(language))
+    return false;
+
+  if (!d.set_country(country))
+    return false;
+
+  return true;
+}
+
+Chapter::Chapter() {
+  // This ctor only constructs the object. Proper initialization is
+  // done in Init() (called in Chapters::AddChapter()). The only
+  // reason we bother implementing this ctor is because we had to
+  // declare it as private (along with the dtor), in order to prevent
+  // clients from creating Chapter instances (a privilege we grant
+  // only to the Chapters class). Doing no initialization here also
+  // means that creating arrays of chapter objects is more efficient,
+  // because we only initialize each new chapter object as it becomes
+  // active on the array.
+}
+
+Chapter::~Chapter() {}
+
+void Chapter::Init(unsigned int* seed) {
+  id_ = NULL;
+  start_timecode_ = 0;
+  end_timecode_ = 0;
+  displays_ = NULL;
+  displays_size_ = 0;
+  displays_count_ = 0;
+  uid_ = MakeUID(seed);
+}
+
+void Chapter::ShallowCopy(Chapter* dst) const {
+  dst->id_ = id_;
+  dst->start_timecode_ = start_timecode_;
+  dst->end_timecode_ = end_timecode_;
+  dst->uid_ = uid_;
+  dst->displays_ = displays_;
+  dst->displays_size_ = displays_size_;
+  dst->displays_count_ = displays_count_;
+}
+
+void Chapter::Clear() {
+  StrCpy(NULL, &id_);
+
+  while (displays_count_ > 0) {
+    Display& d = displays_[--displays_count_];
+    d.Clear();
+  }
+
+  delete[] displays_;
+  displays_ = NULL;
+
+  displays_size_ = 0;
+}
+
+bool Chapter::ExpandDisplaysArray() {
+  if (displays_size_ > displays_count_)
+    return true;  // nothing to do yet
+
+  const int size = (displays_size_ == 0) ? 1 : 2 * displays_size_;
+
+  Display* const displays = new (std::nothrow) Display[size];  // NOLINT
+  if (displays == NULL)
+    return false;
+
+  for (int idx = 0; idx < displays_count_; ++idx) {
+    displays[idx] = displays_[idx];  // shallow copy
+  }
+
+  delete[] displays_;
+
+  displays_ = displays;
+  displays_size_ = size;
+
+  return true;
+}
+
+uint64_t Chapter::WriteAtom(IMkvWriter* writer) const {
+  uint64_t payload_size =
+      EbmlElementSize(libwebm::kMkvChapterStringUID, id_) +
+      EbmlElementSize(libwebm::kMkvChapterUID, static_cast<uint64>(uid_)) +
+      EbmlElementSize(libwebm::kMkvChapterTimeStart,
+                      static_cast<uint64>(start_timecode_)) +
+      EbmlElementSize(libwebm::kMkvChapterTimeEnd,
+                      static_cast<uint64>(end_timecode_));
+
+  for (int idx = 0; idx < displays_count_; ++idx) {
+    const Display& d = displays_[idx];
+    payload_size += d.WriteDisplay(NULL);
+  }
+
+  const uint64_t atom_size =
+      EbmlMasterElementSize(libwebm::kMkvChapterAtom, payload_size) +
+      payload_size;
+
+  if (writer == NULL)
+    return atom_size;
+
+  const int64_t start = writer->Position();
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapterAtom, payload_size))
+    return 0;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapterStringUID, id_))
+    return 0;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapterUID,
+                        static_cast<uint64>(uid_)))
+    return 0;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart,
+                        static_cast<uint64>(start_timecode_)))
+    return 0;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd,
+                        static_cast<uint64>(end_timecode_)))
+    return 0;
+
+  for (int idx = 0; idx < displays_count_; ++idx) {
+    const Display& d = displays_[idx];
+
+    if (!d.WriteDisplay(writer))
+      return 0;
+  }
+
+  const int64_t stop = writer->Position();
+
+  if (stop >= start && uint64_t(stop - start) != atom_size)
+    return 0;
+
+  return atom_size;
+}
+
+void Chapter::Display::Init() {
+  title_ = NULL;
+  language_ = NULL;
+  country_ = NULL;
+}
+
+void Chapter::Display::Clear() {
+  StrCpy(NULL, &title_);
+  StrCpy(NULL, &language_);
+  StrCpy(NULL, &country_);
+}
+
+bool Chapter::Display::set_title(const char* title) {
+  return StrCpy(title, &title_);
+}
+
+bool Chapter::Display::set_language(const char* language) {
+  return StrCpy(language, &language_);
+}
+
+bool Chapter::Display::set_country(const char* country) {
+  return StrCpy(country, &country_);
+}
+
+uint64_t Chapter::Display::WriteDisplay(IMkvWriter* writer) const {
+  uint64_t payload_size = EbmlElementSize(libwebm::kMkvChapString, title_);
+
+  if (language_)
+    payload_size += EbmlElementSize(libwebm::kMkvChapLanguage, language_);
+
+  if (country_)
+    payload_size += EbmlElementSize(libwebm::kMkvChapCountry, country_);
+
+  const
uint64_t display_size = + EbmlMasterElementSize(libwebm::kMkvChapterDisplay, payload_size) + + payload_size; + + if (writer == NULL) + return display_size; + + const int64_t start = writer->Position(); + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapterDisplay, + payload_size)) + return 0; + + if (!WriteEbmlElement(writer, libwebm::kMkvChapString, title_)) + return 0; + + if (language_) { + if (!WriteEbmlElement(writer, libwebm::kMkvChapLanguage, language_)) + return 0; + } + + if (country_) { + if (!WriteEbmlElement(writer, libwebm::kMkvChapCountry, country_)) + return 0; + } + + const int64_t stop = writer->Position(); + + if (stop >= start && uint64_t(stop - start) != display_size) + return 0; + + return display_size; +} + +/////////////////////////////////////////////////////////////// +// +// Chapters Class + +Chapters::Chapters() : chapters_size_(0), chapters_count_(0), chapters_(NULL) {} + +Chapters::~Chapters() { + while (chapters_count_ > 0) { + Chapter& chapter = chapters_[--chapters_count_]; + chapter.Clear(); + } + + delete[] chapters_; + chapters_ = NULL; +} + +int Chapters::Count() const { return chapters_count_; } + +Chapter* Chapters::AddChapter(unsigned int* seed) { + if (!ExpandChaptersArray()) + return NULL; + + Chapter& chapter = chapters_[chapters_count_++]; + chapter.Init(seed); + + return &chapter; +} + +bool Chapters::Write(IMkvWriter* writer) const { + if (writer == NULL) + return false; + + const uint64_t payload_size = WriteEdition(NULL); // return size only + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvChapters, payload_size)) + return false; + + const int64_t start = writer->Position(); + + if (WriteEdition(writer) == 0) // error + return false; + + const int64_t stop = writer->Position(); + + if (stop >= start && uint64_t(stop - start) != payload_size) + return false; + + return true; +} + +bool Chapters::ExpandChaptersArray() { + if (chapters_size_ > chapters_count_) + return true; // nothing to do yet + + const int size = (chapters_size_ == 0) ? 
1 : 2 * chapters_size_; + + Chapter* const chapters = new (std::nothrow) Chapter[size]; // NOLINT + if (chapters == NULL) + return false; + + for (int idx = 0; idx < chapters_count_; ++idx) { + const Chapter& src = chapters_[idx]; + Chapter* const dst = chapters + idx; + src.ShallowCopy(dst); + } + + delete[] chapters_; + + chapters_ = chapters; + chapters_size_ = size; + + return true; +} + +uint64_t Chapters::WriteEdition(IMkvWriter* writer) const { + uint64_t payload_size = 0; + + for (int idx = 0; idx < chapters_count_; ++idx) { + const Chapter& chapter = chapters_[idx]; + payload_size += chapter.WriteAtom(NULL); + } + + const uint64_t edition_size = + EbmlMasterElementSize(libwebm::kMkvEditionEntry, payload_size) + + payload_size; + + if (writer == NULL) // return size only + return edition_size; + + const int64_t start = writer->Position(); + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvEditionEntry, payload_size)) + return 0; // error + + for (int idx = 0; idx < chapters_count_; ++idx) { + const Chapter& chapter = chapters_[idx]; + + const uint64_t chapter_size = chapter.WriteAtom(writer); + if (chapter_size == 0) // error + return 0; + } + + const int64_t stop = writer->Position(); + + if (stop >= start && uint64_t(stop - start) != edition_size) + return 0; + + return edition_size; +} + +// Tag Class + +bool Tag::add_simple_tag(const char* tag_name, const char* tag_string) { + if (!ExpandSimpleTagsArray()) + return false; + + SimpleTag& st = simple_tags_[simple_tags_count_++]; + st.Init(); + + if (!st.set_tag_name(tag_name)) + return false; + + if (!st.set_tag_string(tag_string)) + return false; + + return true; +} + +Tag::Tag() { + simple_tags_ = NULL; + simple_tags_size_ = 0; + simple_tags_count_ = 0; +} + +Tag::~Tag() {} + +void Tag::ShallowCopy(Tag* dst) const { + dst->simple_tags_ = simple_tags_; + dst->simple_tags_size_ = simple_tags_size_; + dst->simple_tags_count_ = simple_tags_count_; +} + +void Tag::Clear() { + while (simple_tags_count_ > 0) { + SimpleTag& st = simple_tags_[--simple_tags_count_]; + st.Clear(); + } + + delete[] simple_tags_; + simple_tags_ = NULL; + + simple_tags_size_ = 0; +} + +bool Tag::ExpandSimpleTagsArray() { + if (simple_tags_size_ > simple_tags_count_) + return true; // nothing to do yet + + const int size = (simple_tags_size_ == 0) ? 
1 : 2 * simple_tags_size_; + + SimpleTag* const simple_tags = new (std::nothrow) SimpleTag[size]; // NOLINT + if (simple_tags == NULL) + return false; + + for (int idx = 0; idx < simple_tags_count_; ++idx) { + simple_tags[idx] = simple_tags_[idx]; // shallow copy + } + + delete[] simple_tags_; + + simple_tags_ = simple_tags; + simple_tags_size_ = size; + + return true; +} + +uint64_t Tag::Write(IMkvWriter* writer) const { + uint64_t payload_size = 0; + + for (int idx = 0; idx < simple_tags_count_; ++idx) { + const SimpleTag& st = simple_tags_[idx]; + payload_size += st.Write(NULL); + } + + const uint64_t tag_size = + EbmlMasterElementSize(libwebm::kMkvTag, payload_size) + payload_size; + + if (writer == NULL) + return tag_size; + + const int64_t start = writer->Position(); + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvTag, payload_size)) + return 0; + + for (int idx = 0; idx < simple_tags_count_; ++idx) { + const SimpleTag& st = simple_tags_[idx]; + + if (!st.Write(writer)) + return 0; + } + + const int64_t stop = writer->Position(); + + if (stop >= start && uint64_t(stop - start) != tag_size) + return 0; + + return tag_size; +} + +// Tag::SimpleTag + +void Tag::SimpleTag::Init() { + tag_name_ = NULL; + tag_string_ = NULL; +} + +void Tag::SimpleTag::Clear() { + StrCpy(NULL, &tag_name_); + StrCpy(NULL, &tag_string_); +} + +bool Tag::SimpleTag::set_tag_name(const char* tag_name) { + return StrCpy(tag_name, &tag_name_); +} + +bool Tag::SimpleTag::set_tag_string(const char* tag_string) { + return StrCpy(tag_string, &tag_string_); +} + +uint64_t Tag::SimpleTag::Write(IMkvWriter* writer) const { + uint64_t payload_size = EbmlElementSize(libwebm::kMkvTagName, tag_name_); + + payload_size += EbmlElementSize(libwebm::kMkvTagString, tag_string_); + + const uint64_t simple_tag_size = + EbmlMasterElementSize(libwebm::kMkvSimpleTag, payload_size) + + payload_size; + + if (writer == NULL) + return simple_tag_size; + + const int64_t start = writer->Position(); + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvSimpleTag, payload_size)) + return 0; + + if (!WriteEbmlElement(writer, libwebm::kMkvTagName, tag_name_)) + return 0; + + if (!WriteEbmlElement(writer, libwebm::kMkvTagString, tag_string_)) + return 0; + + const int64_t stop = writer->Position(); + + if (stop >= start && uint64_t(stop - start) != simple_tag_size) + return 0; + + return simple_tag_size; +} + +// Tags Class + +Tags::Tags() : tags_size_(0), tags_count_(0), tags_(NULL) {} + +Tags::~Tags() { + while (tags_count_ > 0) { + Tag& tag = tags_[--tags_count_]; + tag.Clear(); + } + + delete[] tags_; + tags_ = NULL; +} + +int Tags::Count() const { return tags_count_; } + +Tag* Tags::AddTag() { + if (!ExpandTagsArray()) + return NULL; + + Tag& tag = tags_[tags_count_++]; + + return &tag; +} + +bool Tags::Write(IMkvWriter* writer) const { + if (writer == NULL) + return false; + + uint64_t payload_size = 0; + + for (int idx = 0; idx < tags_count_; ++idx) { + const Tag& tag = tags_[idx]; + payload_size += tag.Write(NULL); + } + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvTags, payload_size)) + return false; + + const int64_t start = writer->Position(); + + for (int idx = 0; idx < tags_count_; ++idx) { + const Tag& tag = tags_[idx]; + + const uint64_t tag_size = tag.Write(writer); + if (tag_size == 0) // error + return 0; + } + + const int64_t stop = writer->Position(); + + if (stop >= start && uint64_t(stop - start) != payload_size) + return false; + + return true; +} + +bool Tags::ExpandTagsArray() { + if (tags_size_ > tags_count_) 
+ return true; // nothing to do yet + + const int size = (tags_size_ == 0) ? 1 : 2 * tags_size_; + + Tag* const tags = new (std::nothrow) Tag[size]; // NOLINT + if (tags == NULL) + return false; + + for (int idx = 0; idx < tags_count_; ++idx) { + const Tag& src = tags_[idx]; + Tag* const dst = tags + idx; + src.ShallowCopy(dst); + } + + delete[] tags_; + + tags_ = tags; + tags_size_ = size; + + return true; +} + +/////////////////////////////////////////////////////////////// +// +// Cluster class + +Cluster::Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale, + bool write_last_frame_with_duration, bool fixed_size_timecode) + : blocks_added_(0), + finalized_(false), + fixed_size_timecode_(fixed_size_timecode), + header_written_(false), + payload_size_(0), + position_for_cues_(cues_pos), + size_position_(-1), + timecode_(timecode), + timecode_scale_(timecode_scale), + write_last_frame_with_duration_(write_last_frame_with_duration), + writer_(NULL) {} + +Cluster::~Cluster() { + // Delete any stored frames that are left behind. This will happen if the + // Cluster was not Finalized for whatever reason. + while (!stored_frames_.empty()) { + while (!stored_frames_.begin()->second.empty()) { + delete stored_frames_.begin()->second.front(); + stored_frames_.begin()->second.pop_front(); + } + stored_frames_.erase(stored_frames_.begin()->first); + } +} + +bool Cluster::Init(IMkvWriter* ptr_writer) { + if (!ptr_writer) { + return false; + } + writer_ = ptr_writer; + return true; +} + +bool Cluster::AddFrame(const Frame* const frame) { + return QueueOrWriteFrame(frame); +} + +bool Cluster::AddFrame(const uint8_t* data, uint64_t length, + uint64_t track_number, uint64_t abs_timecode, + bool is_key) { + Frame frame; + if (!frame.Init(data, length)) + return false; + frame.set_track_number(track_number); + frame.set_timestamp(abs_timecode); + frame.set_is_key(is_key); + return QueueOrWriteFrame(&frame); +} + +bool Cluster::AddFrameWithAdditional(const uint8_t* data, uint64_t length, + const uint8_t* additional, + uint64_t additional_length, + uint64_t add_id, uint64_t track_number, + uint64_t abs_timecode, bool is_key) { + if (!additional || additional_length == 0) { + return false; + } + Frame frame; + if (!frame.Init(data, length) || + !frame.AddAdditionalData(additional, additional_length, add_id)) { + return false; + } + frame.set_track_number(track_number); + frame.set_timestamp(abs_timecode); + frame.set_is_key(is_key); + return QueueOrWriteFrame(&frame); +} + +bool Cluster::AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length, + int64_t discard_padding, + uint64_t track_number, + uint64_t abs_timecode, bool is_key) { + Frame frame; + if (!frame.Init(data, length)) + return false; + frame.set_discard_padding(discard_padding); + frame.set_track_number(track_number); + frame.set_timestamp(abs_timecode); + frame.set_is_key(is_key); + return QueueOrWriteFrame(&frame); +} + +bool Cluster::AddMetadata(const uint8_t* data, uint64_t length, + uint64_t track_number, uint64_t abs_timecode, + uint64_t duration_timecode) { + Frame frame; + if (!frame.Init(data, length)) + return false; + frame.set_track_number(track_number); + frame.set_timestamp(abs_timecode); + frame.set_duration(duration_timecode); + frame.set_is_key(true); // All metadata blocks are keyframes. 
+ return QueueOrWriteFrame(&frame); +} + +void Cluster::AddPayloadSize(uint64_t size) { payload_size_ += size; } + +bool Cluster::Finalize() { + return !write_last_frame_with_duration_ && Finalize(false, 0); +} + +bool Cluster::Finalize(bool set_last_frame_duration, uint64_t duration) { + if (!writer_ || finalized_) + return false; + + if (write_last_frame_with_duration_) { + // Write out held back Frames. This essentially performs a k-way merge + // across all tracks in the increasing order of timestamps. + while (!stored_frames_.empty()) { + Frame* frame = stored_frames_.begin()->second.front(); + + // Get the next frame to write (frame with least timestamp across all + // tracks). + for (FrameMapIterator frames_iterator = ++stored_frames_.begin(); + frames_iterator != stored_frames_.end(); ++frames_iterator) { + if (frames_iterator->second.front()->timestamp() < frame->timestamp()) { + frame = frames_iterator->second.front(); + } + } + + // Set the duration if it's the last frame for the track. + if (set_last_frame_duration && + stored_frames_[frame->track_number()].size() == 1 && + !frame->duration_set()) { + frame->set_duration(duration - frame->timestamp()); + if (!frame->is_key() && !frame->reference_block_timestamp_set()) { + frame->set_reference_block_timestamp( + last_block_timestamp_[frame->track_number()]); + } + } + + // Write the frame and remove it from |stored_frames_|. + const bool wrote_frame = DoWriteFrame(frame); + stored_frames_[frame->track_number()].pop_front(); + if (stored_frames_[frame->track_number()].empty()) { + stored_frames_.erase(frame->track_number()); + } + delete frame; + if (!wrote_frame) + return false; + } + } + + if (size_position_ == -1) + return false; + + if (writer_->Seekable()) { + const int64_t pos = writer_->Position(); + + if (writer_->Position(size_position_)) + return false; + + if (WriteUIntSize(writer_, payload_size(), 8)) + return false; + + if (writer_->Position(pos)) + return false; + } + + finalized_ = true; + + return true; +} + +uint64_t Cluster::Size() const { + const uint64_t element_size = + EbmlMasterElementSize(libwebm::kMkvCluster, 0xFFFFFFFFFFFFFFFFULL) + + payload_size_; + return element_size; +} + +bool Cluster::PreWriteBlock() { + if (finalized_) + return false; + + if (!header_written_) { + if (!WriteClusterHeader()) + return false; + } + + return true; +} + +void Cluster::PostWriteBlock(uint64_t element_size) { + AddPayloadSize(element_size); + ++blocks_added_; +} + +int64_t Cluster::GetRelativeTimecode(int64_t abs_timecode) const { + const int64_t cluster_timecode = this->Cluster::timecode(); + const int64_t rel_timecode = + static_cast(abs_timecode) - cluster_timecode; + + if (rel_timecode < 0 || rel_timecode > kMaxBlockTimecode) + return -1; + + return rel_timecode; +} + +bool Cluster::DoWriteFrame(const Frame* const frame) { + if (!frame || !frame->IsValid()) + return false; + + if (!PreWriteBlock()) + return false; + + const uint64_t element_size = WriteFrame(writer_, frame, this); + if (element_size == 0) + return false; + + PostWriteBlock(element_size); + last_block_timestamp_[frame->track_number()] = frame->timestamp(); + return true; +} + +bool Cluster::QueueOrWriteFrame(const Frame* const frame) { + if (!frame || !frame->IsValid()) + return false; + + // If |write_last_frame_with_duration_| is not set, then write the frame right + // away. + if (!write_last_frame_with_duration_) { + return DoWriteFrame(frame); + } + + // Queue the current frame. 
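+  // (Illustrative note, not part of the original source: the invariant
+  // enforced below is that a queued frame may be written once every other
+  // track's oldest queued frame has a timestamp >= its own. For example, if
+  // track 1 holds frames at t=10 and t=20 while track 2 holds t=15, then
+  // t=10 may be written out (15 >= 10), but t=20 stays queued, since track 2
+  // could still supply a frame at t < 20.)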
+  uint64_t track_number = frame->track_number();
+  Frame* const frame_to_store = new Frame();
+  frame_to_store->CopyFrom(*frame);
+  stored_frames_[track_number].push_back(frame_to_store);
+
+  // Iterate through all queued frames in the current track except the last
+  // one and write each if it is okay to do so (i.e., no other track has a
+  // held-back frame with timestamp <= the timestamp of the frame in question).
+  std::vector<std::list<Frame*>::iterator> frames_to_erase;
+  for (std::list<Frame*>::iterator
+           current_track_iterator = stored_frames_[track_number].begin(),
+           end = --stored_frames_[track_number].end();
+       current_track_iterator != end; ++current_track_iterator) {
+    const Frame* const frame_to_write = *current_track_iterator;
+    bool okay_to_write = true;
+    for (FrameMapIterator track_iterator = stored_frames_.begin();
+         track_iterator != stored_frames_.end(); ++track_iterator) {
+      if (track_iterator->first == track_number) {
+        continue;
+      }
+      if (track_iterator->second.front()->timestamp() <
+          frame_to_write->timestamp()) {
+        okay_to_write = false;
+        break;
+      }
+    }
+    if (okay_to_write) {
+      const bool wrote_frame = DoWriteFrame(frame_to_write);
+      delete frame_to_write;
+      if (!wrote_frame)
+        return false;
+      frames_to_erase.push_back(current_track_iterator);
+    } else {
+      break;
+    }
+  }
+  for (std::vector<std::list<Frame*>::iterator>::iterator iterator =
+           frames_to_erase.begin();
+       iterator != frames_to_erase.end(); ++iterator) {
+    stored_frames_[track_number].erase(*iterator);
+  }
+  return true;
+}
+
+bool Cluster::WriteClusterHeader() {
+  if (finalized_)
+    return false;
+
+  if (WriteID(writer_, libwebm::kMkvCluster))
+    return false;
+
+  // Save for later.
+  size_position_ = writer_->Position();
+
+  // Write "unknown" (EBML coded -1) as cluster size value. We need to write 8
+  // bytes because we do not know how big our cluster will be.
+  if (SerializeInt(writer_, kEbmlUnknownValue, 8))
+    return false;
+
+  if (!WriteEbmlElement(writer_, libwebm::kMkvTimecode, timecode(),
+                        fixed_size_timecode_ ? 8 : 0)) {
+    return false;
+  }
+  AddPayloadSize(EbmlElementSize(libwebm::kMkvTimecode, timecode(),
+                                 fixed_size_timecode_ ? 8 : 0));
+  header_written_ = true;
+
+  return true;
+}
+
+///////////////////////////////////////////////////////////////
+//
+// SeekHead Class
+
+SeekHead::SeekHead() : start_pos_(0ULL) {
+  for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+    seek_entry_id_[i] = 0;
+    seek_entry_pos_[i] = 0;
+  }
+}
+
+SeekHead::~SeekHead() {}
+
+bool SeekHead::Finalize(IMkvWriter* writer) const {
+  if (writer->Seekable()) {
+    if (start_pos_ == -1)
+      return false;
+
+    uint64_t payload_size = 0;
+    uint64_t entry_size[kSeekEntryCount];
+
+    for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+      if (seek_entry_id_[i] != 0) {
+        entry_size[i] = EbmlElementSize(
+            libwebm::kMkvSeekID, static_cast<uint64>(seek_entry_id_[i]));
+        entry_size[i] += EbmlElementSize(
+            libwebm::kMkvSeekPosition, static_cast<uint64>(seek_entry_pos_[i]));
+
+        payload_size +=
+            EbmlMasterElementSize(libwebm::kMkvSeek, entry_size[i]) +
+            entry_size[i];
+      }
+    }
+
+    // No SeekHead elements.
+    if (payload_size == 0)
+      return true;
+
+    const int64_t pos = writer->Position();
+    if (writer->Position(start_pos_))
+      return false;
+
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvSeekHead, payload_size))
+      return false;
+
+    for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+      if (seek_entry_id_[i] != 0) {
+        if (!WriteEbmlMasterElement(writer, libwebm::kMkvSeek, entry_size[i]))
+          return false;
+
+        if (!WriteEbmlElement(writer, libwebm::kMkvSeekID,
+                              static_cast<uint64>(seek_entry_id_[i])))
+          return false;
+
+        if (!WriteEbmlElement(writer, libwebm::kMkvSeekPosition,
+                              static_cast<uint64>(seek_entry_pos_[i])))
+          return false;
+      }
+    }
+
+    const uint64_t total_entry_size = kSeekEntryCount * MaxEntrySize();
+    const uint64_t total_size =
+        EbmlMasterElementSize(libwebm::kMkvSeekHead, total_entry_size) +
+        total_entry_size;
+    const int64_t size_left = total_size - (writer->Position() - start_pos_);
+
+    const uint64_t bytes_written = WriteVoidElement(writer, size_left);
+    if (!bytes_written)
+      return false;
+
+    if (writer->Position(pos))
+      return false;
+  }
+
+  return true;
+}
+
+bool SeekHead::Write(IMkvWriter* writer) {
+  const uint64_t entry_size = kSeekEntryCount * MaxEntrySize();
+  const uint64_t size =
+      EbmlMasterElementSize(libwebm::kMkvSeekHead, entry_size);
+
+  start_pos_ = writer->Position();
+
+  const uint64_t bytes_written = WriteVoidElement(writer, size + entry_size);
+  if (!bytes_written)
+    return false;
+
+  return true;
+}
+
+bool SeekHead::AddSeekEntry(uint32_t id, uint64_t pos) {
+  for (int32_t i = 0; i < kSeekEntryCount; ++i) {
+    if (seek_entry_id_[i] == 0) {
+      seek_entry_id_[i] = id;
+      seek_entry_pos_[i] = pos;
+      return true;
+    }
+  }
+  return false;
+}
+
+uint32_t SeekHead::GetId(int index) const {
+  if (index < 0 || index >= kSeekEntryCount)
+    return UINT_MAX;
+  return seek_entry_id_[index];
+}
+
+uint64_t SeekHead::GetPosition(int index) const {
+  if (index < 0 || index >= kSeekEntryCount)
+    return ULLONG_MAX;
+  return seek_entry_pos_[index];
+}
+
+bool SeekHead::SetSeekEntry(int index, uint32_t id, uint64_t position) {
+  if (index < 0 || index >= kSeekEntryCount)
+    return false;
+  seek_entry_id_[index] = id;
+  seek_entry_pos_[index] = position;
+  return true;
+}
+
+uint64_t SeekHead::MaxEntrySize() const {
+  const uint64_t max_entry_payload_size =
+      EbmlElementSize(libwebm::kMkvSeekID,
+                      static_cast<uint64>(UINT64_C(0xffffffff))) +
+      EbmlElementSize(libwebm::kMkvSeekPosition,
+                      static_cast<uint64>(UINT64_C(0xffffffffffffffff)));
+  const uint64_t max_entry_size =
+      EbmlMasterElementSize(libwebm::kMkvSeek, max_entry_payload_size) +
+      max_entry_payload_size;
+
+  return max_entry_size;
+}
+
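SeekHead is a two-phase protocol: Write() only reserves a fixed-size hole (a Void element big enough for kSeekEntryCount worst-case entries), and Finalize() later seeks back, writes the real entries, and re-voids the slack. A rough self-contained sketch of that reserve-and-backpatch pattern, reduced to a byte buffer with invented sizes, follows:

    // Sketch only: reserve-then-backpatch over a plain byte buffer.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
      std::vector<uint8_t> file;

      // Phase 1 (cf. SeekHead::Write): reserve a worst-case hole up front.
      const size_t reserved_pos = file.size();
      const size_t reserved_size = 32;  // cf. kSeekEntryCount * MaxEntrySize()
      file.insert(file.end(), reserved_size, 0xEC);  // 0xEC is the EBML Void ID byte.

      // ...clusters and cues are appended here, so their offsets become known...
      file.insert(file.end(), {0xAA, 0xBB, 0xCC});

      // Phase 2 (cf. SeekHead::Finalize): seek back and fill in the real
      // entries, leaving the remainder of the hole voided.
      const uint8_t entries[] = {0x11, 0x22, 0x33, 0x44};
      std::memcpy(&file[reserved_pos], entries, sizeof(entries));

      std::printf("patched %zu of %zu reserved bytes; %zu stay void\n",
                  sizeof(entries), reserved_size, reserved_size - sizeof(entries));
      return 0;
    }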
+///////////////////////////////////////////////////////////////
+//
+// SegmentInfo Class
+
+SegmentInfo::SegmentInfo()
+    : duration_(-1.0),
+      muxing_app_(NULL),
+      timecode_scale_(1000000ULL),
+      writing_app_(NULL),
+      date_utc_(LLONG_MIN),
+      duration_pos_(-1) {}
+
+SegmentInfo::~SegmentInfo() {
+  delete[] muxing_app_;
+  delete[] writing_app_;
+}
+
+bool SegmentInfo::Init() {
+  int32_t major;
+  int32_t minor;
+  int32_t build;
+  int32_t revision;
+  GetVersion(&major, &minor, &build, &revision);
+  char temp[256];
+#ifdef _MSC_VER
+  sprintf_s(temp, sizeof(temp) / sizeof(temp[0]), "libwebm-%d.%d.%d.%d", major,
+            minor, build, revision);
+#else
+  snprintf(temp, sizeof(temp) / sizeof(temp[0]), "libwebm-%d.%d.%d.%d", major,
+           minor, build, revision);
+#endif
+
+  const size_t app_len = strlen(temp) + 1;
+
+  delete[] muxing_app_;
+
+  muxing_app_ = new (std::nothrow) char[app_len];  // NOLINT
+  if (!muxing_app_)
+    return false;
+
+#ifdef _MSC_VER
+  strcpy_s(muxing_app_, app_len, temp);
+#else
+  strcpy(muxing_app_, temp);
+#endif
+
+  set_writing_app(temp);
+  if (!writing_app_)
+    return false;
+  return true;
+}
+
+bool SegmentInfo::Finalize(IMkvWriter* writer) const {
+  if (!writer)
+    return false;
+
+  if (duration_ > 0.0) {
+    if (writer->Seekable()) {
+      if (duration_pos_ == -1)
+        return false;
+
+      const int64_t pos = writer->Position();
+
+      if (writer->Position(duration_pos_))
+        return false;
+
+      if (!WriteEbmlElement(writer, libwebm::kMkvDuration,
+                            static_cast<float>(duration_)))
+        return false;
+
+      if (writer->Position(pos))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+bool SegmentInfo::Write(IMkvWriter* writer) {
+  if (!writer || !muxing_app_ || !writing_app_)
+    return false;
+
+  uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale,
+                                  static_cast<uint64>(timecode_scale_));
+  if (duration_ > 0.0)
+    size +=
+        EbmlElementSize(libwebm::kMkvDuration, static_cast<float>(duration_));
+  if (date_utc_ != LLONG_MIN)
+    size += EbmlDateElementSize(libwebm::kMkvDateUTC);
+  size += EbmlElementSize(libwebm::kMkvMuxingApp, muxing_app_);
+  size += EbmlElementSize(libwebm::kMkvWritingApp, writing_app_);
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvInfo, size))
+    return false;
+
+  const int64_t payload_position = writer->Position();
+  if (payload_position < 0)
+    return false;
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale,
+                        static_cast<uint64>(timecode_scale_)))
+    return false;
+
+  if (duration_ > 0.0) {
+    // Save for later.
+    duration_pos_ = writer->Position();
+
+    if (!WriteEbmlElement(writer, libwebm::kMkvDuration,
+                          static_cast<float>(duration_)))
+      return false;
+  }
+
+  if (date_utc_ != LLONG_MIN)
+    WriteEbmlDateElement(writer, libwebm::kMkvDateUTC, date_utc_);
+
+  if (!WriteEbmlElement(writer, libwebm::kMkvMuxingApp, muxing_app_))
+    return false;
+  if (!WriteEbmlElement(writer, libwebm::kMkvWritingApp, writing_app_))
+    return false;
+
+  const int64_t stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64_t>(size))
+    return false;
+
+  return true;
+}
+
+void SegmentInfo::set_muxing_app(const char* app) {
+  if (app) {
+    const size_t length = strlen(app) + 1;
+    char* temp_str = new (std::nothrow) char[length];  // NOLINT
+    if (!temp_str)
+      return;
+
+#ifdef _MSC_VER
+    strcpy_s(temp_str, length, app);
+#else
+    strcpy(temp_str, app);
+#endif
+
+    delete[] muxing_app_;
+    muxing_app_ = temp_str;
+  }
+}
+
+void SegmentInfo::set_writing_app(const char* app) {
+  if (app) {
+    const size_t length = strlen(app) + 1;
+    char* temp_str = new (std::nothrow) char[length];  // NOLINT
+
if (!temp_str) + return; + +#ifdef _MSC_VER + strcpy_s(temp_str, length, app); +#else + strcpy(temp_str, app); +#endif + + delete[] writing_app_; + writing_app_ = temp_str; + } +} + +/////////////////////////////////////////////////////////////// +// +// Segment Class + +Segment::Segment() + : chunk_count_(0), + chunk_name_(NULL), + chunk_writer_cluster_(NULL), + chunk_writer_cues_(NULL), + chunk_writer_header_(NULL), + chunking_(false), + chunking_base_name_(NULL), + cluster_list_(NULL), + cluster_list_capacity_(0), + cluster_list_size_(0), + cues_position_(kAfterClusters), + cues_track_(0), + force_new_cluster_(false), + frames_(NULL), + frames_capacity_(0), + frames_size_(0), + has_video_(false), + header_written_(false), + last_block_duration_(0), + last_timestamp_(0), + max_cluster_duration_(kDefaultMaxClusterDuration), + max_cluster_size_(0), + mode_(kFile), + new_cuepoint_(false), + output_cues_(true), + accurate_cluster_duration_(false), + fixed_size_cluster_timecode_(false), + estimate_file_duration_(false), + payload_pos_(0), + size_position_(0), + doc_type_version_(kDefaultDocTypeVersion), + doc_type_version_written_(0), + duration_(0.0), + writer_cluster_(NULL), + writer_cues_(NULL), + writer_header_(NULL) { + const time_t curr_time = time(NULL); + seed_ = static_cast(curr_time); +#ifdef _WIN32 + srand(seed_); +#endif +} + +Segment::~Segment() { + if (cluster_list_) { + for (int32_t i = 0; i < cluster_list_size_; ++i) { + Cluster* const cluster = cluster_list_[i]; + delete cluster; + } + delete[] cluster_list_; + } + + if (frames_) { + for (int32_t i = 0; i < frames_size_; ++i) { + Frame* const frame = frames_[i]; + delete frame; + } + delete[] frames_; + } + + delete[] chunk_name_; + delete[] chunking_base_name_; + + if (chunk_writer_cluster_) { + chunk_writer_cluster_->Close(); + delete chunk_writer_cluster_; + } + if (chunk_writer_cues_) { + chunk_writer_cues_->Close(); + delete chunk_writer_cues_; + } + if (chunk_writer_header_) { + chunk_writer_header_->Close(); + delete chunk_writer_header_; + } +} + +void Segment::MoveCuesBeforeClustersHelper(uint64_t diff, int32_t index, + uint64_t* cues_size) { + CuePoint* const cue_point = cues_.GetCueByIndex(index); + if (cue_point == NULL) + return; + const uint64_t old_cue_point_size = cue_point->Size(); + const uint64_t cluster_pos = cue_point->cluster_pos() + diff; + cue_point->set_cluster_pos(cluster_pos); // update the new cluster position + // New size of the cue is computed as follows + // Let a = current sum of size of all CuePoints + // Let b = Increase in Cue Point's size due to this iteration + // Let c = Increase in size of Cues Element's length due to this iteration + // (This is computed as CodedSize(a + b) - CodedSize(a)) + // Let d = b + c. Now d is the |diff| passed to the next recursive call. + // Let e = a + b. Now e is the |cues_size| passed to the next recursive + // call. 
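+  // (Illustrative note, not part of the original source: a worked instance
+  // of the recurrence above, assuming libwebm's GetCodedUIntSize thresholds,
+  // where values up to 126 code in one byte. If a = 126 and this pass grows
+  // the cue point by b = 1, then CodedSize(127) - CodedSize(126) = 2 - 1
+  // gives c = 1, so d = b + c = 2 is the |diff| for the next pass and
+  // e = a + b = 127 is its |cues_size|.)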
+ const uint64_t cue_point_size_diff = cue_point->Size() - old_cue_point_size; + const uint64_t cue_size_diff = + GetCodedUIntSize(*cues_size + cue_point_size_diff) - + GetCodedUIntSize(*cues_size); + *cues_size += cue_point_size_diff; + diff = cue_size_diff + cue_point_size_diff; + if (diff > 0) { + for (int32_t i = 0; i < cues_.cue_entries_size(); ++i) { + MoveCuesBeforeClustersHelper(diff, i, cues_size); + } + } +} + +void Segment::MoveCuesBeforeClusters() { + const uint64_t current_cue_size = cues_.Size(); + uint64_t cue_size = 0; + for (int32_t i = 0; i < cues_.cue_entries_size(); ++i) + cue_size += cues_.GetCueByIndex(i)->Size(); + for (int32_t i = 0; i < cues_.cue_entries_size(); ++i) + MoveCuesBeforeClustersHelper(current_cue_size, i, &cue_size); + + // Adjust the Seek Entry to reflect the change in position + // of Cluster and Cues + int32_t cluster_index = 0; + int32_t cues_index = 0; + for (int32_t i = 0; i < SeekHead::kSeekEntryCount; ++i) { + if (seek_head_.GetId(i) == libwebm::kMkvCluster) + cluster_index = i; + if (seek_head_.GetId(i) == libwebm::kMkvCues) + cues_index = i; + } + seek_head_.SetSeekEntry(cues_index, libwebm::kMkvCues, + seek_head_.GetPosition(cluster_index)); + seek_head_.SetSeekEntry(cluster_index, libwebm::kMkvCluster, + cues_.Size() + seek_head_.GetPosition(cues_index)); +} + +bool Segment::Init(IMkvWriter* ptr_writer) { + if (!ptr_writer) { + return false; + } + writer_cluster_ = ptr_writer; + writer_cues_ = ptr_writer; + writer_header_ = ptr_writer; + memset(&track_frames_written_, 0, + sizeof(track_frames_written_[0]) * kMaxTrackNumber); + memset(&last_track_timestamp_, 0, + sizeof(last_track_timestamp_[0]) * kMaxTrackNumber); + return segment_info_.Init(); +} + +bool Segment::CopyAndMoveCuesBeforeClusters(mkvparser::IMkvReader* reader, + IMkvWriter* writer) { + if (!writer->Seekable() || chunking_) + return false; + const int64_t cluster_offset = + cluster_list_[0]->size_position() - GetUIntSize(libwebm::kMkvCluster); + + // Copy the headers. + if (!ChunkedCopy(reader, writer, 0, cluster_offset)) + return false; + + // Recompute cue positions and seek entries. + MoveCuesBeforeClusters(); + + // Write cues and seek entries. + // TODO(vigneshv): As of now, it's safe to call seek_head_.Finalize() for the + // second time with a different writer object. But the name Finalize() doesn't + // indicate something we want to call more than once. So consider renaming it + // to write() or some such. + if (!cues_.Write(writer) || !seek_head_.Finalize(writer)) + return false; + + // Copy the Clusters. + if (!ChunkedCopy(reader, writer, cluster_offset, + cluster_end_offset_ - cluster_offset)) + return false; + + // Update the Segment size in case the Cues size has changed. + const int64_t pos = writer->Position(); + const int64_t segment_size = writer->Position() - payload_pos_; + if (writer->Position(size_position_) || + WriteUIntSize(writer, segment_size, 8) || writer->Position(pos)) + return false; + return true; +} + +bool Segment::Finalize() { + if (WriteFramesAll() < 0) + return false; + + // In kLive mode, call Cluster::Finalize only if |accurate_cluster_duration_| + // is set. In all other modes, always call Cluster::Finalize. + if ((mode_ == kLive ? 
accurate_cluster_duration_ : true) && + cluster_list_size_ > 0) { + // Update last cluster's size + Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1]; + + // For the last frame of the last Cluster, we don't write it as a BlockGroup + // with Duration unless the frame itself has duration set explicitly. + if (!old_cluster || !old_cluster->Finalize(false, 0)) + return false; + } + + if (mode_ == kFile) { + if (chunking_ && chunk_writer_cluster_) { + chunk_writer_cluster_->Close(); + chunk_count_++; + } + + double duration = + (static_cast<double>(last_timestamp_) + last_block_duration_) / + segment_info_.timecode_scale(); + if (duration_ > 0.0) { + duration = duration_; + } else { + if (last_block_duration_ == 0 && estimate_file_duration_) { + const int num_tracks = static_cast<int>(tracks_.track_entries_size()); + for (int i = 0; i < num_tracks; ++i) { + if (track_frames_written_[i] < 2) + continue; + + // Estimate the duration for the last block of a Track. + const double nano_per_frame = + static_cast<double>(last_track_timestamp_[i]) / + (track_frames_written_[i] - 1); + const double track_duration = + (last_track_timestamp_[i] + nano_per_frame) / + segment_info_.timecode_scale(); + if (track_duration > duration) + duration = track_duration; + } + } + } + segment_info_.set_duration(duration); + if (!segment_info_.Finalize(writer_header_)) + return false; + + if (output_cues_) + if (!seek_head_.AddSeekEntry(libwebm::kMkvCues, MaxOffset())) + return false; + + if (chunking_) { + if (!chunk_writer_cues_) + return false; + + char* name = NULL; + if (!UpdateChunkName("cues", &name)) + return false; + + const bool cues_open = chunk_writer_cues_->Open(name); + delete[] name; + if (!cues_open) + return false; + } + + cluster_end_offset_ = writer_cluster_->Position(); + + // Write the seek headers and cues + if (output_cues_) + if (!cues_.Write(writer_cues_)) + return false; + + if (!seek_head_.Finalize(writer_header_)) + return false; + + if (writer_header_->Seekable()) { + if (size_position_ == -1) + return false; + + const int64_t segment_size = MaxOffset(); + if (segment_size < 1) + return false; + + const int64_t pos = writer_header_->Position(); + UpdateDocTypeVersion(); + if (doc_type_version_ != doc_type_version_written_) { + if (writer_header_->Position(0)) + return false; + + const char* const doc_type = + DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska; + if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type)) + return false; + if (writer_header_->Position() != ebml_header_size_) + return false; + + doc_type_version_written_ = doc_type_version_; + } + + if (writer_header_->Position(size_position_)) + return false; + + if (WriteUIntSize(writer_header_, segment_size, 8)) + return false; + + if (writer_header_->Position(pos)) + return false; + } + + if (chunking_) { + // Do not close any writers until the segment size has been written, + // otherwise the size may be off.
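+ // (The cluster chunk writer was already closed near the top of this + // function, once the last cluster was finalized; only the cues and + // header chunk writers remain open at this point.)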
+ if (!chunk_writer_cues_ || !chunk_writer_header_) + return false; + + chunk_writer_cues_->Close(); + chunk_writer_header_->Close(); + } + } + + return true; +} + +Track* Segment::AddTrack(int32_t number) { + Track* const track = new (std::nothrow) Track(&seed_); // NOLINT + + if (!track) + return NULL; + + if (!tracks_.AddTrack(track, number)) { + delete track; + return NULL; + } + + return track; +} + +Chapter* Segment::AddChapter() { return chapters_.AddChapter(&seed_); } + +Tag* Segment::AddTag() { return tags_.AddTag(); } + +uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) { + VideoTrack* const track = new (std::nothrow) VideoTrack(&seed_); // NOLINT + if (!track) + return 0; + + track->set_type(Tracks::kVideo); + track->set_codec_id(Tracks::kVp8CodecId); + track->set_width(width); + track->set_height(height); + + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } + has_video_ = true; + + return track->number(); +} + +bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) { + if (cluster_list_size_ < 1) + return false; + + const Cluster* const cluster = cluster_list_[cluster_list_size_ - 1]; + if (!cluster) + return false; + + CuePoint* const cue = new (std::nothrow) CuePoint(); // NOLINT + if (!cue) + return false; + + cue->set_time(timestamp / segment_info_.timecode_scale()); + cue->set_block_number(cluster->blocks_added()); + cue->set_cluster_pos(cluster->position_for_cues()); + cue->set_track(track); + if (!cues_.AddCue(cue)) { + delete cue; + return false; + } + + new_cuepoint_ = false; + return true; +} + +uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels, + int32_t number) { + AudioTrack* const track = new (std::nothrow) AudioTrack(&seed_); // NOLINT + if (!track) + return 0; + + track->set_type(Tracks::kAudio); + track->set_codec_id(Tracks::kVorbisCodecId); + track->set_sample_rate(sample_rate); + track->set_channels(channels); + + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } + + return track->number(); +} + +bool Segment::AddFrame(const uint8_t* data, uint64_t length, + uint64_t track_number, uint64_t timestamp, bool is_key) { + if (!data) + return false; + + Frame frame; + if (!frame.Init(data, length)) + return false; + frame.set_track_number(track_number); + frame.set_timestamp(timestamp); + frame.set_is_key(is_key); + return AddGenericFrame(&frame); +} + +bool Segment::AddFrameWithAdditional(const uint8_t* data, uint64_t length, + const uint8_t* additional, + uint64_t additional_length, + uint64_t add_id, uint64_t track_number, + uint64_t timestamp, bool is_key) { + if (!data || !additional) + return false; + + Frame frame; + if (!frame.Init(data, length) || + !frame.AddAdditionalData(additional, additional_length, add_id)) { + return false; + } + frame.set_track_number(track_number); + frame.set_timestamp(timestamp); + frame.set_is_key(is_key); + return AddGenericFrame(&frame); +} + +bool Segment::AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length, + int64_t discard_padding, + uint64_t track_number, + uint64_t timestamp, bool is_key) { + if (!data) + return false; + + Frame frame; + if (!frame.Init(data, length)) + return false; + frame.set_discard_padding(discard_padding); + frame.set_track_number(track_number); + frame.set_timestamp(timestamp); + frame.set_is_key(is_key); + return AddGenericFrame(&frame); +} + +bool Segment::AddMetadata(const uint8_t* data, uint64_t length, + uint64_t track_number, uint64_t timestamp_ns, + uint64_t duration_ns) { + if 
(!data) + return false; + + Frame frame; + if (!frame.Init(data, length)) + return false; + frame.set_track_number(track_number); + frame.set_timestamp(timestamp_ns); + frame.set_duration(duration_ns); + frame.set_is_key(true); // All metadata blocks are keyframes. + return AddGenericFrame(&frame); +} + +bool Segment::AddGenericFrame(const Frame* frame) { + if (!frame) + return false; + + if (!CheckHeaderInfo()) + return false; + + // Check for non-monotonically increasing timestamps. + if (frame->timestamp() < last_timestamp_) + return false; + + // Check if the track number is valid. + if (!tracks_.GetTrackByNumber(frame->track_number())) + return false; + + if (frame->discard_padding() != 0) + doc_type_version_ = 4; + + if (cluster_list_size_ > 0) { + const uint64_t timecode_scale = segment_info_.timecode_scale(); + const uint64_t frame_timecode = frame->timestamp() / timecode_scale; + + const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1]; + const uint64_t last_cluster_timecode = last_cluster->timecode(); + + const uint64_t rel_timecode = frame_timecode - last_cluster_timecode; + if (rel_timecode > kMaxBlockTimecode) { + force_new_cluster_ = true; + } + } + + // If the segment has a video track hold onto audio frames to make sure the + // audio that is associated with the start time of a video key-frame is + // muxed into the same cluster. + if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) && + !force_new_cluster_) { + Frame* const new_frame = new (std::nothrow) Frame(); + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; + return false; + } + if (!QueueFrame(new_frame)) { + delete new_frame; + return false; + } + track_frames_written_[frame->track_number() - 1]++; + return true; + } + + if (!DoNewClusterProcessing(frame->track_number(), frame->timestamp(), + frame->is_key())) { + return false; + } + + if (cluster_list_size_ < 1) + return false; + + Cluster* const cluster = cluster_list_[cluster_list_size_ - 1]; + if (!cluster) + return false; + + // If the Frame is not a SimpleBlock, then set the reference_block_timestamp + // if it is not set already. 
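+ // (A non-key frame written as a BlockGroup is expected to carry a + // ReferenceBlock element, so fall back to the last timestamp written on + // this track. The copy below is needed because |frame| is const here.)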
+ bool frame_created = false; + if (!frame->CanBeSimpleBlock() && !frame->is_key() && + !frame->reference_block_timestamp_set()) { + Frame* const new_frame = new (std::nothrow) Frame(); + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; + return false; + } + new_frame->set_reference_block_timestamp( + last_track_timestamp_[frame->track_number() - 1]); + frame = new_frame; + frame_created = true; + } + + if (!cluster->AddFrame(frame)) + return false; + + if (new_cuepoint_ && cues_track_ == frame->track_number()) { + if (!AddCuePoint(frame->timestamp(), cues_track_)) + return false; + } + + last_timestamp_ = frame->timestamp(); + last_track_timestamp_[frame->track_number() - 1] = frame->timestamp(); + last_block_duration_ = frame->duration(); + track_frames_written_[frame->track_number() - 1]++; + + if (frame_created) + delete frame; + return true; +} + +void Segment::OutputCues(bool output_cues) { output_cues_ = output_cues; } + +void Segment::AccurateClusterDuration(bool accurate_cluster_duration) { + accurate_cluster_duration_ = accurate_cluster_duration; +} + +void Segment::UseFixedSizeClusterTimecode(bool fixed_size_cluster_timecode) { + fixed_size_cluster_timecode_ = fixed_size_cluster_timecode; +} + +bool Segment::SetChunking(bool chunking, const char* filename) { + if (chunk_count_ > 0) + return false; + + if (chunking) { + if (!filename) + return false; + + // Check if we are being set to what is already set. + if (chunking_ && !strcmp(filename, chunking_base_name_)) + return true; + + const size_t name_length = strlen(filename) + 1; + char* const temp = new (std::nothrow) char[name_length]; // NOLINT + if (!temp) + return false; + +#ifdef _MSC_VER + strcpy_s(temp, name_length, filename); +#else + strcpy(temp, filename); +#endif + + delete[] chunking_base_name_; + chunking_base_name_ = temp; + + if (!UpdateChunkName("chk", &chunk_name_)) + return false; + + if (!chunk_writer_cluster_) { + chunk_writer_cluster_ = new (std::nothrow) MkvWriter(); // NOLINT + if (!chunk_writer_cluster_) + return false; + } + + if (!chunk_writer_cues_) { + chunk_writer_cues_ = new (std::nothrow) MkvWriter(); // NOLINT + if (!chunk_writer_cues_) + return false; + } + + if (!chunk_writer_header_) { + chunk_writer_header_ = new (std::nothrow) MkvWriter(); // NOLINT + if (!chunk_writer_header_) + return false; + } + + if (!chunk_writer_cluster_->Open(chunk_name_)) + return false; + + const size_t header_length = strlen(filename) + strlen(".hdr") + 1; + char* const header = new (std::nothrow) char[header_length]; // NOLINT + if (!header) + return false; + +#ifdef _MSC_VER + strcpy_s(header, header_length - strlen(".hdr"), chunking_base_name_); + strcat_s(header, header_length, ".hdr"); +#else + strcpy(header, chunking_base_name_); + strcat(header, ".hdr"); +#endif + if (!chunk_writer_header_->Open(header)) { + delete[] header; + return false; + } + + writer_cluster_ = chunk_writer_cluster_; + writer_cues_ = chunk_writer_cues_; + writer_header_ = chunk_writer_header_; + + delete[] header; + } + + chunking_ = chunking; + + return true; +} + +bool Segment::CuesTrack(uint64_t track_number) { + const Track* const track = GetTrackByNumber(track_number); + if (!track) + return false; + + cues_track_ = track_number; + return true; +} + +void Segment::ForceNewClusterOnNextFrame() { force_new_cluster_ = true; } + +Track* Segment::GetTrackByNumber(uint64_t track_number) const { + return tracks_.GetTrackByNumber(track_number); +} + +bool Segment::WriteSegmentHeader() { + UpdateDocTypeVersion(); + + 
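+ // The DocType is "webm" only when every track uses a codec from the WebM + // subset; otherwise fall back to "matroska" (see DocTypeIsWebm()).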
const char* const doc_type = + DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska; + if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type)) + return false; + doc_type_version_written_ = doc_type_version_; + ebml_header_size_ = static_cast<int32_t>(writer_header_->Position()); + + // Write "unknown" (-1) as segment size value. If mode is kFile, Segment + // will overwrite this size value when the file is finalized. + if (WriteID(writer_header_, libwebm::kMkvSegment)) + return false; + + // Save for later. + size_position_ = writer_header_->Position(); + + // Write "unknown" (EBML coded -1) as segment size value. We need to write 8 + // bytes because if we are going to overwrite the segment size later we do + // not know how big our segment will be. + if (SerializeInt(writer_header_, kEbmlUnknownValue, 8)) + return false; + + payload_pos_ = writer_header_->Position(); + + if (mode_ == kFile && writer_header_->Seekable()) { + // Set the duration > 0.0 so SegmentInfo will write out the duration. When + // the muxer is done writing we will set the correct duration and have + // SegmentInfo update it. + segment_info_.set_duration(1.0); + + if (!seek_head_.Write(writer_header_)) + return false; + } + + if (!seek_head_.AddSeekEntry(libwebm::kMkvInfo, MaxOffset())) + return false; + if (!segment_info_.Write(writer_header_)) + return false; + + if (!seek_head_.AddSeekEntry(libwebm::kMkvTracks, MaxOffset())) + return false; + if (!tracks_.Write(writer_header_)) + return false; + + if (chapters_.Count() > 0) { + if (!seek_head_.AddSeekEntry(libwebm::kMkvChapters, MaxOffset())) + return false; + if (!chapters_.Write(writer_header_)) + return false; + } + + if (tags_.Count() > 0) { + if (!seek_head_.AddSeekEntry(libwebm::kMkvTags, MaxOffset())) + return false; + if (!tags_.Write(writer_header_)) + return false; + } + + if (chunking_ && (mode_ == kLive || !writer_header_->Seekable())) { + if (!chunk_writer_header_) + return false; + + chunk_writer_header_->Close(); + } + + header_written_ = true; + + return true; +} + +// Here we are testing whether to create a new cluster, given a frame +// having time frame_timestamp_ns. +// +int Segment::TestFrame(uint64_t track_number, uint64_t frame_timestamp_ns, + bool is_key) const { + if (force_new_cluster_) + return 1; + + // If no clusters have been created yet, then create a new cluster + // and write this frame immediately, in the new cluster. This path + // should only be followed once, the first time we attempt to write + // a frame. + + if (cluster_list_size_ <= 0) + return 1; + + // There exists at least one cluster. We must compare the frame to + // the last cluster, in order to determine whether the frame is + // written to the existing cluster, or whether a new cluster should + // be created. + + const uint64_t timecode_scale = segment_info_.timecode_scale(); + const uint64_t frame_timecode = frame_timestamp_ns / timecode_scale; + + const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1]; + const uint64_t last_cluster_timecode = last_cluster->timecode(); + + // For completeness we test for the case when the frame's timecode + // is less than the cluster's timecode. Although in principle that + // is allowed, this muxer doesn't actually write clusters like that, + // so this indicates a bug somewhere in our algorithm.
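+ // (A negative return propagates up: DoNewClusterProcessing() treats it + // as an error, and AddGenericFrame() then returns false.)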
+ + if (frame_timecode < last_cluster_timecode) // should never happen + return -1; + + // If the frame has a timestamp significantly larger than the last + // cluster (in Matroska, cluster-relative timestamps are serialized + // using a 16-bit signed integer), then we cannot write this frame + // to that cluster, and so we must create a new cluster. + + const int64_t delta_timecode = frame_timecode - last_cluster_timecode; + + if (delta_timecode > kMaxBlockTimecode) + return 2; + + // We decide to create a new cluster when we have a video keyframe. + // This will flush queued (audio) frames, and write the keyframe + // immediately, in the newly-created cluster. + + if (is_key && tracks_.TrackIsVideo(track_number)) + return 1; + + // Create a new cluster if we have accumulated too many frames + // already, where "too many" is defined as "the total time of frames + // in the cluster exceeds a threshold". + + const uint64_t delta_ns = delta_timecode * timecode_scale; + + if (max_cluster_duration_ > 0 && delta_ns >= max_cluster_duration_) + return 1; + + // This is similar to the case above, with the difference that a new + // cluster is created when the size of the current cluster exceeds a + // threshold. + + const uint64_t cluster_size = last_cluster->payload_size(); + + if (max_cluster_size_ > 0 && cluster_size >= max_cluster_size_) + return 1; + + // There's no need to create a new cluster, so emit this frame now. + + return 0; +} + +bool Segment::MakeNewCluster(uint64_t frame_timestamp_ns) { + const int32_t new_size = cluster_list_size_ + 1; + + if (new_size > cluster_list_capacity_) { + // Add more clusters. + const int32_t new_capacity = + (cluster_list_capacity_ <= 0) ? 1 : cluster_list_capacity_ * 2; + Cluster** const clusters = + new (std::nothrow) Cluster*[new_capacity]; // NOLINT + if (!clusters) + return false; + + for (int32_t i = 0; i < cluster_list_size_; ++i) { + clusters[i] = cluster_list_[i]; + } + + delete[] cluster_list_; + + cluster_list_ = clusters; + cluster_list_capacity_ = new_capacity; + } + + if (!WriteFramesLessThan(frame_timestamp_ns)) + return false; + + if (cluster_list_size_ > 0) { + // Update old cluster's size + Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1]; + + if (!old_cluster || !old_cluster->Finalize(true, frame_timestamp_ns)) + return false; + } + + if (output_cues_) + new_cuepoint_ = true; + + if (chunking_ && cluster_list_size_ > 0) { + chunk_writer_cluster_->Close(); + chunk_count_++; + + if (!UpdateChunkName("chk", &chunk_name_)) + return false; + if (!chunk_writer_cluster_->Open(chunk_name_)) + return false; + } + + const uint64_t timecode_scale = segment_info_.timecode_scale(); + const uint64_t frame_timecode = frame_timestamp_ns / timecode_scale; + + uint64_t cluster_timecode = frame_timecode; + + if (frames_size_ > 0) { + const Frame* const f = frames_[0]; // earliest queued frame + const uint64_t ns = f->timestamp(); + const uint64_t tc = ns / timecode_scale; + + if (tc < cluster_timecode) + cluster_timecode = tc; + } + + Cluster*& cluster = cluster_list_[cluster_list_size_]; + const int64_t offset = MaxOffset(); + cluster = new (std::nothrow) + Cluster(cluster_timecode, offset, segment_info_.timecode_scale(), + accurate_cluster_duration_, fixed_size_cluster_timecode_); + if (!cluster) + return false; + + if (!cluster->Init(writer_cluster_)) + return false; + + cluster_list_size_ = new_size; + return true; +} + +bool Segment::DoNewClusterProcessing(uint64_t track_number, + uint64_t frame_timestamp_ns, bool is_key) { + for 
(;;) { + // Based on the characteristics of the current frame and current + // cluster, decide whether to create a new cluster. + const int result = TestFrame(track_number, frame_timestamp_ns, is_key); + if (result < 0) // error + return false; + + // Always set force_new_cluster_ to false after TestFrame. + force_new_cluster_ = false; + + // A non-zero result means create a new cluster. + if (result > 0 && !MakeNewCluster(frame_timestamp_ns)) + return false; + + // Write queued (audio) frames. + const int frame_count = WriteFramesAll(); + if (frame_count < 0) // error + return false; + + // Write the current frame to the current cluster (if TestFrame + // returns 0) or to a newly created cluster (TestFrame returns 1). + if (result <= 1) + return true; + + // TestFrame returned 2, which means there was a large time + // difference between the cluster and the frame itself. Do the + // test again, comparing the frame to the new cluster. + } +} + +bool Segment::CheckHeaderInfo() { + if (!header_written_) { + if (!WriteSegmentHeader()) + return false; + + if (!seek_head_.AddSeekEntry(libwebm::kMkvCluster, MaxOffset())) + return false; + + if (output_cues_ && cues_track_ == 0) { + // Check for a video track + for (uint32_t i = 0; i < tracks_.track_entries_size(); ++i) { + const Track* const track = tracks_.GetTrackByIndex(i); + if (!track) + return false; + + if (tracks_.TrackIsVideo(track->number())) { + cues_track_ = track->number(); + break; + } + } + + // Set first track found + if (cues_track_ == 0) { + const Track* const track = tracks_.GetTrackByIndex(0); + if (!track) + return false; + + cues_track_ = track->number(); + } + } + } + return true; +} + +void Segment::UpdateDocTypeVersion() { + for (uint32_t index = 0; index < tracks_.track_entries_size(); ++index) { + const Track* track = tracks_.GetTrackByIndex(index); + if (track == NULL) + break; + if ((track->codec_delay() || track->seek_pre_roll()) && + doc_type_version_ < 4) { + doc_type_version_ = 4; + break; + } + } +} + +bool Segment::UpdateChunkName(const char* ext, char** name) const { + if (!name || !ext) + return false; + + char ext_chk[64]; +#ifdef _MSC_VER + sprintf_s(ext_chk, sizeof(ext_chk), "_%06d.%s", chunk_count_, ext); +#else + snprintf(ext_chk, sizeof(ext_chk), "_%06d.%s", chunk_count_, ext); +#endif + + const size_t length = strlen(chunking_base_name_) + strlen(ext_chk) + 1; + char* const str = new (std::nothrow) char[length]; // NOLINT + if (!str) + return false; + +#ifdef _MSC_VER + strcpy_s(str, length - strlen(ext_chk), chunking_base_name_); + strcat_s(str, length, ext_chk); +#else + strcpy(str, chunking_base_name_); + strcat(str, ext_chk); +#endif + + delete[] * name; + *name = str; + + return true; +} + +int64_t Segment::MaxOffset() { + if (!writer_header_) + return -1; + + int64_t offset = writer_header_->Position() - payload_pos_; + + if (chunking_) { + for (int32_t i = 0; i < cluster_list_size_; ++i) { + Cluster* const cluster = cluster_list_[i]; + offset += cluster->Size(); + } + + if (writer_cues_) + offset += writer_cues_->Position(); + } + + return offset; +} + +bool Segment::QueueFrame(Frame* frame) { + const int32_t new_size = frames_size_ + 1; + + if (new_size > frames_capacity_) { + // Add more frames. + const int32_t new_capacity = (!frames_capacity_) ? 
2 : frames_capacity_ * 2; + + if (new_capacity < 1) + return false; + + Frame** const frames = new (std::nothrow) Frame*[new_capacity]; // NOLINT + if (!frames) + return false; + + for (int32_t i = 0; i < frames_size_; ++i) { + frames[i] = frames_[i]; + } + + delete[] frames_; + frames_ = frames; + frames_capacity_ = new_capacity; + } + + frames_[frames_size_++] = frame; + + return true; +} + +int Segment::WriteFramesAll() { + if (frames_ == NULL) + return 0; + + if (cluster_list_size_ < 1) + return -1; + + Cluster* const cluster = cluster_list_[cluster_list_size_ - 1]; + + if (!cluster) + return -1; + + for (int32_t i = 0; i < frames_size_; ++i) { + Frame*& frame = frames_[i]; + // TODO(jzern/vigneshv): using Segment::AddGenericFrame here would limit the + // places where |doc_type_version_| needs to be updated. + if (frame->discard_padding() != 0) + doc_type_version_ = 4; + if (!cluster->AddFrame(frame)) + return -1; + + if (new_cuepoint_ && cues_track_ == frame->track_number()) { + if (!AddCuePoint(frame->timestamp(), cues_track_)) + return -1; + } + + if (frame->timestamp() > last_timestamp_) { + last_timestamp_ = frame->timestamp(); + last_track_timestamp_[frame->track_number() - 1] = frame->timestamp(); + } + + delete frame; + frame = NULL; + } + + const int result = frames_size_; + frames_size_ = 0; + + return result; +} + +bool Segment::WriteFramesLessThan(uint64_t timestamp) { + // Check |cluster_list_size_| to see if this is the first cluster. If it is + // the first cluster, the audio frames that are less than the first video + // timestamp will be written in a later step. + if (frames_size_ > 0 && cluster_list_size_ > 0) { + if (!frames_) + return false; + + Cluster* const cluster = cluster_list_[cluster_list_size_ - 1]; + if (!cluster) + return false; + + int32_t shift_left = 0; + + // TODO(fgalligan): Change this to use the durations of frames instead of + // the next frame's start time if the duration is accurate. + for (int32_t i = 1; i < frames_size_; ++i) { + const Frame* const frame_curr = frames_[i]; + + if (frame_curr->timestamp() > timestamp) + break; + + const Frame* const frame_prev = frames_[i - 1]; + if (frame_prev->discard_padding() != 0) + doc_type_version_ = 4; + if (!cluster->AddFrame(frame_prev)) + return false; + + if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) { + if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) + return false; + } + + ++shift_left; + if (frame_prev->timestamp() > last_timestamp_) { + last_timestamp_ = frame_prev->timestamp(); + last_track_timestamp_[frame_prev->track_number() - 1] = + frame_prev->timestamp(); + } + + delete frame_prev; + } + + if (shift_left > 0) { + if (shift_left >= frames_size_) + return false; + + const int32_t new_frames_size = frames_size_ - shift_left; + for (int32_t i = 0; i < new_frames_size; ++i) { + frames_[i] = frames_[i + shift_left]; + } + + frames_size_ = new_frames_size; + } + } + + return true; +} + +bool Segment::DocTypeIsWebm() const { + const int kNumCodecIds = 9; + + // TODO(vigneshv): Tweak .clang-format.
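+ // Codec IDs permitted by the WebM subset of Matroska; a track using any + // other codec ID forces the "matroska" DocType.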
+ const char* kWebmCodecIds[kNumCodecIds] = { + Tracks::kOpusCodecId, Tracks::kVorbisCodecId, + Tracks::kAv1CodecId, Tracks::kVp8CodecId, + Tracks::kVp9CodecId, Tracks::kWebVttCaptionsId, + Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId, + Tracks::kWebVttSubtitlesId}; + + const int num_tracks = static_cast<int>(tracks_.track_entries_size()); + for (int track_index = 0; track_index < num_tracks; ++track_index) { + const Track* const track = tracks_.GetTrackByIndex(track_index); + const std::string codec_id = track->codec_id(); + + bool id_is_webm = false; + for (int id_index = 0; id_index < kNumCodecIds; ++id_index) { + if (codec_id == kWebmCodecIds[id_index]) { + id_is_webm = true; + break; + } + } + + if (!id_is_webm) + return false; + } + + return true; +} + +} // namespace mkvmuxer diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.h b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.h new file mode 100644 index 000000000..f2db37714 --- /dev/null +++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxer.h @@ -0,0 +1,1924 @@ +// Copyright (c) 2012 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. + +#ifndef MKVMUXER_MKVMUXER_H_ +#define MKVMUXER_MKVMUXER_H_ + +#include <stdint.h> + +#include <cstddef> +#include <list> +#include <map> + +#include "common/webmids.h" +#include "mkvmuxer/mkvmuxertypes.h" + +// For a description of the WebM elements see +// http://www.webmproject.org/code/specs/container/. + +namespace mkvparser { +class IMkvReader; +} // namespace mkvparser + +namespace mkvmuxer { + +class MkvWriter; +class Segment; + +const uint64_t kMaxTrackNumber = 126; + +/////////////////////////////////////////////////////////////// +// Interface used by the mkvmuxer to write out the Mkv data. +class IMkvWriter { + public: + // Writes out |len| bytes of |buf|. Returns 0 on success. + virtual int32 Write(const void* buf, uint32 len) = 0; + + // Returns the offset of the output position from the beginning of the + // output. + virtual int64 Position() const = 0; + + // Set the current File position. Returns 0 on success. + virtual int32 Position(int64 position) = 0; + + // Returns true if the writer is seekable. + virtual bool Seekable() const = 0; + + // Element start notification. Called whenever an element identifier is about + // to be written to the stream. |element_id| is the element identifier, and + // |position| is the location in the WebM stream where the first octet of the + // element identifier will be written. + // Note: the |MkvId| enumeration in webmids.hpp defines element values. + virtual void ElementStartNotify(uint64 element_id, int64 position) = 0; + + protected: + IMkvWriter(); + virtual ~IMkvWriter(); + + private: + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(IMkvWriter); +}; + +// Writes out the EBML header for a WebM file, but allows caller to specify +// DocType. This function must be called before any other libwebm writing +// functions are called. +bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version, + const char* const doc_type); + +// Writes out the EBML header for a WebM file. This function must be called +// before any other libwebm writing functions are called.
+bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version); + +// Deprecated. Writes out EBML header with doc_type_version as +// kDefaultDocTypeVersion. Exists for backward compatibility. +bool WriteEbmlHeader(IMkvWriter* writer); + +// Copies in Chunk from source to destination between the given byte positions. +bool ChunkedCopy(mkvparser::IMkvReader* source, IMkvWriter* dst, int64_t start, + int64_t size); + +/////////////////////////////////////////////////////////////// +// Class to hold data that will be written to a block. +class Frame { + public: + Frame(); + ~Frame(); + + // Sets this frame's contents based on |frame|. Returns true on success. On + // failure, this frame's existing contents may be lost. + bool CopyFrom(const Frame& frame); + + // Copies |frame| data into |frame_|. Returns true on success. + bool Init(const uint8_t* frame, uint64_t length); + + // Copies |additional| data into |additional_|. Returns true on success. + bool AddAdditionalData(const uint8_t* additional, uint64_t length, + uint64_t add_id); + + // Returns true if the frame has valid parameters. + bool IsValid() const; + + // Returns true if the frame can be written as a SimpleBlock based on current + // parameters. + bool CanBeSimpleBlock() const; + + uint64_t add_id() const { return add_id_; } + const uint8_t* additional() const { return additional_; } + uint64_t additional_length() const { return additional_length_; } + void set_duration(uint64_t duration); + uint64_t duration() const { return duration_; } + bool duration_set() const { return duration_set_; } + const uint8_t* frame() const { return frame_; } + void set_is_key(bool key) { is_key_ = key; } + bool is_key() const { return is_key_; } + uint64_t length() const { return length_; } + void set_track_number(uint64_t track_number) { track_number_ = track_number; } + uint64_t track_number() const { return track_number_; } + void set_timestamp(uint64_t timestamp) { timestamp_ = timestamp; } + uint64_t timestamp() const { return timestamp_; } + void set_discard_padding(int64_t discard_padding) { + discard_padding_ = discard_padding; + } + int64_t discard_padding() const { return discard_padding_; } + void set_reference_block_timestamp(int64_t reference_block_timestamp); + int64_t reference_block_timestamp() const { + return reference_block_timestamp_; + } + bool reference_block_timestamp_set() const { + return reference_block_timestamp_set_; + } + + private: + // Id of the Additional data. + uint64_t add_id_; + + // Pointer to additional data. Owned by this class. + uint8_t* additional_; + + // Length of the additional data. + uint64_t additional_length_; + + // Duration of the frame in nanoseconds. + uint64_t duration_; + + // Flag indicating that |duration_| has been set. Setting duration causes the + // frame to be written out as a Block with BlockDuration instead of as a + // SimpleBlock. + bool duration_set_; + + // Pointer to the data. Owned by this class. + uint8_t* frame_; + + // Flag telling if the data should set the key flag of a block. + bool is_key_; + + // Length of the data. + uint64_t length_; + + // Mkv track number the data is associated with. + uint64_t track_number_; + + // Timestamp of the data in nanoseconds. + uint64_t timestamp_; + + // Discard padding for the frame. + int64_t discard_padding_; + + // Reference block timestamp. + int64_t reference_block_timestamp_; + + // Flag indicating if |reference_block_timestamp_| has been set.
+ bool reference_block_timestamp_set_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Frame); +}; + +/////////////////////////////////////////////////////////////// +// Class to hold one cue point in a Cues element. +class CuePoint { + public: + CuePoint(); + ~CuePoint(); + + // Returns the size in bytes for the entire CuePoint element. + uint64_t Size() const; + + // Output the CuePoint element to the writer. Returns true on success. + bool Write(IMkvWriter* writer) const; + + void set_time(uint64_t time) { time_ = time; } + uint64_t time() const { return time_; } + void set_track(uint64_t track) { track_ = track; } + uint64_t track() const { return track_; } + void set_cluster_pos(uint64_t cluster_pos) { cluster_pos_ = cluster_pos; } + uint64_t cluster_pos() const { return cluster_pos_; } + void set_block_number(uint64_t block_number) { block_number_ = block_number; } + uint64_t block_number() const { return block_number_; } + void set_output_block_number(bool output_block_number) { + output_block_number_ = output_block_number; + } + bool output_block_number() const { return output_block_number_; } + + private: + // Returns the size in bytes for the payload of the CuePoint element. + uint64_t PayloadSize() const; + + // Absolute timecode according to the segment time base. + uint64_t time_; + + // The Track element associated with the CuePoint. + uint64_t track_; + + // The position of the Cluster containing the Block. + uint64_t cluster_pos_; + + // Number of the Block within the Cluster, starting from 1. + uint64_t block_number_; + + // If true the muxer will write out the block number for the cue if the + // block number is different than the default of 1. Default is set to true. + bool output_block_number_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(CuePoint); +}; + +/////////////////////////////////////////////////////////////// +// Cues element. +class Cues { + public: + Cues(); + ~Cues(); + + // Adds a cue point to the Cues element. Returns true on success. + bool AddCue(CuePoint* cue); + + // Returns the cue point by index. Returns NULL if there is no cue point + // match. + CuePoint* GetCueByIndex(int32_t index) const; + + // Returns the total size of the Cues element + uint64_t Size(); + + // Output the Cues element to the writer. Returns true on success. + bool Write(IMkvWriter* writer) const; + + int32_t cue_entries_size() const { return cue_entries_size_; } + void set_output_block_number(bool output_block_number) { + output_block_number_ = output_block_number; + } + bool output_block_number() const { return output_block_number_; } + + private: + // Number of allocated elements in |cue_entries_|. + int32_t cue_entries_capacity_; + + // Number of CuePoints in |cue_entries_|. + int32_t cue_entries_size_; + + // CuePoint list. + CuePoint** cue_entries_; + + // If true the muxer will write out the block number for the cue if the + // block number is different than the default of 1. Default is set to true. + bool output_block_number_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Cues); +}; + +/////////////////////////////////////////////////////////////// +// ContentEncAESSettings element +class ContentEncAESSettings { + public: + enum { kCTR = 1 }; + + ContentEncAESSettings(); + ~ContentEncAESSettings() {} + + // Returns the size in bytes for the ContentEncAESSettings element. + uint64_t Size() const; + + // Writes out the ContentEncAESSettings element to |writer|. Returns true on + // success. 
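+ // (kCTR, declared above, is the only cipher mode this muxer supports.)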
+ bool Write(IMkvWriter* writer) const; + + uint64_t cipher_mode() const { return cipher_mode_; } + + private: + // Returns the size in bytes for the payload of the ContentEncAESSettings + // element. + uint64_t PayloadSize() const; + + // Sub elements + uint64_t cipher_mode_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncAESSettings); +}; + +/////////////////////////////////////////////////////////////// +// ContentEncoding element +// Elements used to describe if the track data has been encrypted or +// compressed with zlib or header stripping. +// Currently only whole frames can be encrypted with AES. This dictates that +// ContentEncodingOrder will be 0, ContentEncodingScope will be 1, +// ContentEncodingType will be 1, and ContentEncAlgo will be 5. +class ContentEncoding { + public: + ContentEncoding(); + ~ContentEncoding(); + + // Sets the content encryption id. Copies |length| bytes from |id| to + // |enc_key_id_|. Returns true on success. + bool SetEncryptionID(const uint8_t* id, uint64_t length); + + // Returns the size in bytes for the ContentEncoding element. + uint64_t Size() const; + + // Writes out the ContentEncoding element to |writer|. Returns true on + // success. + bool Write(IMkvWriter* writer) const; + + uint64_t enc_algo() const { return enc_algo_; } + uint64_t encoding_order() const { return encoding_order_; } + uint64_t encoding_scope() const { return encoding_scope_; } + uint64_t encoding_type() const { return encoding_type_; } + ContentEncAESSettings* enc_aes_settings() { return &enc_aes_settings_; } + + private: + // Returns the size in bytes for the encoding elements. + uint64_t EncodingSize(uint64_t compression_size, + uint64_t encryption_size) const; + + // Returns the size in bytes for the encryption elements. + uint64_t EncryptionSize() const; + + // Track element names + uint64_t enc_algo_; + uint8_t* enc_key_id_; + uint64_t encoding_order_; + uint64_t encoding_scope_; + uint64_t encoding_type_; + + // ContentEncAESSettings element. + ContentEncAESSettings enc_aes_settings_; + + // Size of the ContentEncKeyID data in bytes. + uint64_t enc_key_id_length_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding); +}; + +/////////////////////////////////////////////////////////////// +// Colour element. +class PrimaryChromaticity { + public: + static const float kChromaticityMin; + static const float kChromaticityMax; + + PrimaryChromaticity(float x_val, float y_val) : x_(x_val), y_(y_val) {} + PrimaryChromaticity() : x_(0), y_(0) {} + ~PrimaryChromaticity() {} + + // Returns sum of |x_id| and |y_id| element id sizes and payload sizes. + uint64_t PrimaryChromaticitySize(libwebm::MkvId x_id, + libwebm::MkvId y_id) const; + bool Valid() const; + bool Write(IMkvWriter* writer, libwebm::MkvId x_id, + libwebm::MkvId y_id) const; + + float x() const { return x_; } + void set_x(float new_x) { x_ = new_x; } + float y() const { return y_; } + void set_y(float new_y) { y_ = new_y; } + + private: + float x_; + float y_; +}; + +class MasteringMetadata { + public: + static const float kValueNotPresent; + static const float kMinLuminance; + static const float kMinLuminanceMax; + static const float kMaxLuminanceMax; + + MasteringMetadata() + : luminance_max_(kValueNotPresent), + luminance_min_(kValueNotPresent), + r_(NULL), + g_(NULL), + b_(NULL), + white_point_(NULL) {} + ~MasteringMetadata() { + delete r_; + delete g_; + delete b_; + delete white_point_; + } + + // Returns total size of the MasteringMetadata element.
+ uint64_t MasteringMetadataSize() const; + bool Valid() const; + bool Write(IMkvWriter* writer) const; + + // Copies non-null chromaticity. + bool SetChromaticity(const PrimaryChromaticity* r, + const PrimaryChromaticity* g, + const PrimaryChromaticity* b, + const PrimaryChromaticity* white_point); + const PrimaryChromaticity* r() const { return r_; } + const PrimaryChromaticity* g() const { return g_; } + const PrimaryChromaticity* b() const { return b_; } + const PrimaryChromaticity* white_point() const { return white_point_; } + + float luminance_max() const { return luminance_max_; } + void set_luminance_max(float luminance_max) { + luminance_max_ = luminance_max; + } + float luminance_min() const { return luminance_min_; } + void set_luminance_min(float luminance_min) { + luminance_min_ = luminance_min; + } + + private: + // Returns size of MasteringMetadata child elements. + uint64_t PayloadSize() const; + + float luminance_max_; + float luminance_min_; + PrimaryChromaticity* r_; + PrimaryChromaticity* g_; + PrimaryChromaticity* b_; + PrimaryChromaticity* white_point_; +}; + +class Colour { + public: + enum MatrixCoefficients { + kGbr = 0, + kBt709 = 1, + kUnspecifiedMc = 2, + kReserved = 3, + kFcc = 4, + kBt470bg = 5, + kSmpte170MMc = 6, + kSmpte240MMc = 7, + kYcocg = 8, + kBt2020NonConstantLuminance = 9, + kBt2020ConstantLuminance = 10, + }; + enum ChromaSitingHorz { + kUnspecifiedCsh = 0, + kLeftCollocated = 1, + kHalfCsh = 2, + }; + enum ChromaSitingVert { + kUnspecifiedCsv = 0, + kTopCollocated = 1, + kHalfCsv = 2, + }; + enum Range { + kUnspecifiedCr = 0, + kBroadcastRange = 1, + kFullRange = 2, + kMcTcDefined = 3, // Defined by MatrixCoefficients/TransferCharacteristics. + }; + enum TransferCharacteristics { + kIturBt709Tc = 1, + kUnspecifiedTc = 2, + kReservedTc = 3, + kGamma22Curve = 4, + kGamma28Curve = 5, + kSmpte170MTc = 6, + kSmpte240MTc = 7, + kLinear = 8, + kLog = 9, + kLogSqrt = 10, + kIec6196624 = 11, + kIturBt1361ExtendedColourGamut = 12, + kIec6196621 = 13, + kIturBt202010bit = 14, + kIturBt202012bit = 15, + kSmpteSt2084 = 16, + kSmpteSt4281Tc = 17, + kAribStdB67Hlg = 18, + }; + enum Primaries { + kReservedP0 = 0, + kIturBt709P = 1, + kUnspecifiedP = 2, + kReservedP3 = 3, + kIturBt470M = 4, + kIturBt470Bg = 5, + kSmpte170MP = 6, + kSmpte240MP = 7, + kFilm = 8, + kIturBt2020 = 9, + kSmpteSt4281P = 10, + kJedecP22Phosphors = 22, + }; + static const uint64_t kValueNotPresent; + Colour() + : matrix_coefficients_(kValueNotPresent), + bits_per_channel_(kValueNotPresent), + chroma_subsampling_horz_(kValueNotPresent), + chroma_subsampling_vert_(kValueNotPresent), + cb_subsampling_horz_(kValueNotPresent), + cb_subsampling_vert_(kValueNotPresent), + chroma_siting_horz_(kValueNotPresent), + chroma_siting_vert_(kValueNotPresent), + range_(kValueNotPresent), + transfer_characteristics_(kValueNotPresent), + primaries_(kValueNotPresent), + max_cll_(kValueNotPresent), + max_fall_(kValueNotPresent), + mastering_metadata_(NULL) {} + ~Colour() { delete mastering_metadata_; } + + // Returns total size of the Colour element. + uint64_t ColourSize() const; + bool Valid() const; + bool Write(IMkvWriter* writer) const; + + // Deep copies |mastering_metadata|. 
+ bool SetMasteringMetadata(const MasteringMetadata& mastering_metadata); + + const MasteringMetadata* mastering_metadata() const { + return mastering_metadata_; + } + + uint64_t matrix_coefficients() const { return matrix_coefficients_; } + void set_matrix_coefficients(uint64_t matrix_coefficients) { + matrix_coefficients_ = matrix_coefficients; + } + uint64_t bits_per_channel() const { return bits_per_channel_; } + void set_bits_per_channel(uint64_t bits_per_channel) { + bits_per_channel_ = bits_per_channel; + } + uint64_t chroma_subsampling_horz() const { return chroma_subsampling_horz_; } + void set_chroma_subsampling_horz(uint64_t chroma_subsampling_horz) { + chroma_subsampling_horz_ = chroma_subsampling_horz; + } + uint64_t chroma_subsampling_vert() const { return chroma_subsampling_vert_; } + void set_chroma_subsampling_vert(uint64_t chroma_subsampling_vert) { + chroma_subsampling_vert_ = chroma_subsampling_vert; + } + uint64_t cb_subsampling_horz() const { return cb_subsampling_horz_; } + void set_cb_subsampling_horz(uint64_t cb_subsampling_horz) { + cb_subsampling_horz_ = cb_subsampling_horz; + } + uint64_t cb_subsampling_vert() const { return cb_subsampling_vert_; } + void set_cb_subsampling_vert(uint64_t cb_subsampling_vert) { + cb_subsampling_vert_ = cb_subsampling_vert; + } + uint64_t chroma_siting_horz() const { return chroma_siting_horz_; } + void set_chroma_siting_horz(uint64_t chroma_siting_horz) { + chroma_siting_horz_ = chroma_siting_horz; + } + uint64_t chroma_siting_vert() const { return chroma_siting_vert_; } + void set_chroma_siting_vert(uint64_t chroma_siting_vert) { + chroma_siting_vert_ = chroma_siting_vert; + } + uint64_t range() const { return range_; } + void set_range(uint64_t range) { range_ = range; } + uint64_t transfer_characteristics() const { + return transfer_characteristics_; + } + void set_transfer_characteristics(uint64_t transfer_characteristics) { + transfer_characteristics_ = transfer_characteristics; + } + uint64_t primaries() const { return primaries_; } + void set_primaries(uint64_t primaries) { primaries_ = primaries; } + uint64_t max_cll() const { return max_cll_; } + void set_max_cll(uint64_t max_cll) { max_cll_ = max_cll; } + uint64_t max_fall() const { return max_fall_; } + void set_max_fall(uint64_t max_fall) { max_fall_ = max_fall; } + + private: + // Returns size of Colour child elements. + uint64_t PayloadSize() const; + + uint64_t matrix_coefficients_; + uint64_t bits_per_channel_; + uint64_t chroma_subsampling_horz_; + uint64_t chroma_subsampling_vert_; + uint64_t cb_subsampling_horz_; + uint64_t cb_subsampling_vert_; + uint64_t chroma_siting_horz_; + uint64_t chroma_siting_vert_; + uint64_t range_; + uint64_t transfer_characteristics_; + uint64_t primaries_; + uint64_t max_cll_; + uint64_t max_fall_; + + MasteringMetadata* mastering_metadata_; +}; + +/////////////////////////////////////////////////////////////// +// Projection element. 
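+ // Holds spherical/360 video metadata: the projection type, an optional + // codec-specific ProjectionPrivate payload, and the pose yaw/pitch/roll.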
+class Projection { + public: + enum ProjectionType { + kTypeNotPresent = -1, + kRectangular = 0, + kEquirectangular = 1, + kCubeMap = 2, + kMesh = 3, + }; + static const uint64_t kValueNotPresent; + Projection() + : type_(kRectangular), + pose_yaw_(0.0), + pose_pitch_(0.0), + pose_roll_(0.0), + private_data_(NULL), + private_data_length_(0) {} + ~Projection() { delete[] private_data_; } + + uint64_t ProjectionSize() const; + bool Write(IMkvWriter* writer) const; + + bool SetProjectionPrivate(const uint8_t* private_data, + uint64_t private_data_length); + + ProjectionType type() const { return type_; } + void set_type(ProjectionType type) { type_ = type; } + float pose_yaw() const { return pose_yaw_; } + void set_pose_yaw(float pose_yaw) { pose_yaw_ = pose_yaw; } + float pose_pitch() const { return pose_pitch_; } + void set_pose_pitch(float pose_pitch) { pose_pitch_ = pose_pitch; } + float pose_roll() const { return pose_roll_; } + void set_pose_roll(float pose_roll) { pose_roll_ = pose_roll; } + uint8_t* private_data() const { return private_data_; } + uint64_t private_data_length() const { return private_data_length_; } + + private: + // Returns size of VideoProjection child elements. + uint64_t PayloadSize() const; + + ProjectionType type_; + float pose_yaw_; + float pose_pitch_; + float pose_roll_; + uint8_t* private_data_; + uint64_t private_data_length_; +}; + +/////////////////////////////////////////////////////////////// +// Track element. +class Track { + public: + // The |seed| parameter is used to synthesize a UID for the track. + explicit Track(unsigned int* seed); + virtual ~Track(); + + // Adds a ContentEncoding element to the Track. Returns true on success. + virtual bool AddContentEncoding(); + + // Returns the ContentEncoding by index. Returns NULL if there is no + // ContentEncoding match. + ContentEncoding* GetContentEncodingByIndex(uint32_t index) const; + + // Returns the size in bytes for the payload of the Track element. + virtual uint64_t PayloadSize() const; + + // Returns the size in bytes of the Track element. + virtual uint64_t Size() const; + + // Output the Track element to the writer. Returns true on success. + virtual bool Write(IMkvWriter* writer) const; + + // Sets the CodecPrivate element of the Track element. Copies |length| + // bytes from |codec_private| to |codec_private_|. Returns true on success. 
+ bool SetCodecPrivate(const uint8_t* codec_private, uint64_t length); + + void set_codec_id(const char* codec_id); + const char* codec_id() const { return codec_id_; } + const uint8_t* codec_private() const { return codec_private_; } + void set_language(const char* language); + const char* language() const { return language_; } + void set_max_block_additional_id(uint64_t max_block_additional_id) { + max_block_additional_id_ = max_block_additional_id; + } + uint64_t max_block_additional_id() const { return max_block_additional_id_; } + void set_name(const char* name); + const char* name() const { return name_; } + void set_number(uint64_t number) { number_ = number; } + uint64_t number() const { return number_; } + void set_type(uint64_t type) { type_ = type; } + uint64_t type() const { return type_; } + void set_uid(uint64_t uid) { uid_ = uid; } + uint64_t uid() const { return uid_; } + void set_codec_delay(uint64_t codec_delay) { codec_delay_ = codec_delay; } + uint64_t codec_delay() const { return codec_delay_; } + void set_seek_pre_roll(uint64_t seek_pre_roll) { + seek_pre_roll_ = seek_pre_roll; + } + uint64_t seek_pre_roll() const { return seek_pre_roll_; } + void set_default_duration(uint64_t default_duration) { + default_duration_ = default_duration; + } + uint64_t default_duration() const { return default_duration_; } + + uint64_t codec_private_length() const { return codec_private_length_; } + uint32_t content_encoding_entries_size() const { + return content_encoding_entries_size_; + } + + private: + // Track element names. + char* codec_id_; + uint8_t* codec_private_; + char* language_; + uint64_t max_block_additional_id_; + char* name_; + uint64_t number_; + uint64_t type_; + uint64_t uid_; + uint64_t codec_delay_; + uint64_t seek_pre_roll_; + uint64_t default_duration_; + + // Size of the CodecPrivate data in bytes. + uint64_t codec_private_length_; + + // ContentEncoding element list. + ContentEncoding** content_encoding_entries_; + + // Number of ContentEncoding elements added. + uint32_t content_encoding_entries_size_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Track); +}; + +/////////////////////////////////////////////////////////////// +// Track that has video specific elements. +class VideoTrack : public Track { + public: + // Supported modes for stereo 3D. + enum StereoMode { + kMono = 0, + kSideBySideLeftIsFirst = 1, + kTopBottomRightIsFirst = 2, + kTopBottomLeftIsFirst = 3, + kSideBySideRightIsFirst = 11 + }; + + enum AlphaMode { kNoAlpha = 0, kAlpha = 1 }; + + // The |seed| parameter is used to synthesize a UID for the track. + explicit VideoTrack(unsigned int* seed); + virtual ~VideoTrack(); + + // Returns the size in bytes for the payload of the Track element plus the + // video specific elements. + virtual uint64_t PayloadSize() const; + + // Output the VideoTrack element to the writer. Returns true on success. + virtual bool Write(IMkvWriter* writer) const; + + // Sets the video's stereo mode. Returns true on success. + bool SetStereoMode(uint64_t stereo_mode); + + // Sets the video's alpha mode. Returns true on success. 
+ bool SetAlphaMode(uint64_t alpha_mode); + + void set_display_height(uint64_t height) { display_height_ = height; } + uint64_t display_height() const { return display_height_; } + void set_display_width(uint64_t width) { display_width_ = width; } + uint64_t display_width() const { return display_width_; } + void set_pixel_height(uint64_t height) { pixel_height_ = height; } + uint64_t pixel_height() const { return pixel_height_; } + void set_pixel_width(uint64_t width) { pixel_width_ = width; } + uint64_t pixel_width() const { return pixel_width_; } + + void set_crop_left(uint64_t crop_left) { crop_left_ = crop_left; } + uint64_t crop_left() const { return crop_left_; } + void set_crop_right(uint64_t crop_right) { crop_right_ = crop_right; } + uint64_t crop_right() const { return crop_right_; } + void set_crop_top(uint64_t crop_top) { crop_top_ = crop_top; } + uint64_t crop_top() const { return crop_top_; } + void set_crop_bottom(uint64_t crop_bottom) { crop_bottom_ = crop_bottom; } + uint64_t crop_bottom() const { return crop_bottom_; } + + void set_frame_rate(double frame_rate) { frame_rate_ = frame_rate; } + double frame_rate() const { return frame_rate_; } + void set_height(uint64_t height) { height_ = height; } + uint64_t height() const { return height_; } + uint64_t stereo_mode() { return stereo_mode_; } + uint64_t alpha_mode() { return alpha_mode_; } + void set_width(uint64_t width) { width_ = width; } + uint64_t width() const { return width_; } + void set_colour_space(const char* colour_space); + const char* colour_space() const { return colour_space_; } + + Colour* colour() { return colour_; } + + // Deep copies |colour|. + bool SetColour(const Colour& colour); + + Projection* projection() { return projection_; } + + // Deep copies |projection|. + bool SetProjection(const Projection& projection); + + private: + // Returns the size in bytes of the Video element. + uint64_t VideoPayloadSize() const; + + // Video track element names. + uint64_t display_height_; + uint64_t display_width_; + uint64_t pixel_height_; + uint64_t pixel_width_; + uint64_t crop_left_; + uint64_t crop_right_; + uint64_t crop_top_; + uint64_t crop_bottom_; + double frame_rate_; + uint64_t height_; + uint64_t stereo_mode_; + uint64_t alpha_mode_; + uint64_t width_; + char* colour_space_; + + Colour* colour_; + Projection* projection_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(VideoTrack); +}; + +/////////////////////////////////////////////////////////////// +// Track that has audio specific elements. +class AudioTrack : public Track { + public: + // The |seed| parameter is used to synthesize a UID for the track. + explicit AudioTrack(unsigned int* seed); + virtual ~AudioTrack(); + + // Returns the size in bytes for the payload of the Track element plus the + // audio specific elements. + virtual uint64_t PayloadSize() const; + + // Output the AudioTrack element to the writer. Returns true on success. + virtual bool Write(IMkvWriter* writer) const; + + void set_bit_depth(uint64_t bit_depth) { bit_depth_ = bit_depth; } + uint64_t bit_depth() const { return bit_depth_; } + void set_channels(uint64_t channels) { channels_ = channels; } + uint64_t channels() const { return channels_; } + void set_sample_rate(double sample_rate) { sample_rate_ = sample_rate; } + double sample_rate() const { return sample_rate_; } + + private: + // Audio track element names. 
+ uint64_t bit_depth_; + uint64_t channels_; + double sample_rate_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(AudioTrack); +}; + +/////////////////////////////////////////////////////////////// +// Tracks element +class Tracks { + public: + // Audio and video type defined by the Matroska specs. + enum { kVideo = 0x1, kAudio = 0x2 }; + + static const char kOpusCodecId[]; + static const char kVorbisCodecId[]; + static const char kAv1CodecId[]; + static const char kVp8CodecId[]; + static const char kVp9CodecId[]; + static const char kWebVttCaptionsId[]; + static const char kWebVttDescriptionsId[]; + static const char kWebVttMetadataId[]; + static const char kWebVttSubtitlesId[]; + + Tracks(); + ~Tracks(); + + // Adds a Track element to the Tracks object. |track| will be owned and + // deleted by the Tracks object. Returns true on success. |number| is the + // number to use for the track. |number| must be >= 0. If |number| == 0 + // then the muxer will decide on the track number. + bool AddTrack(Track* track, int32_t number); + + // Returns the track by index. Returns NULL if there is no track match. + const Track* GetTrackByIndex(uint32_t idx) const; + + // Search the Tracks and return the track that matches |tn|. Returns NULL + // if there is no track match. + Track* GetTrackByNumber(uint64_t track_number) const; + + // Returns true if the track number is an audio track. + bool TrackIsAudio(uint64_t track_number) const; + + // Returns true if the track number is a video track. + bool TrackIsVideo(uint64_t track_number) const; + + // Output the Tracks element to the writer. Returns true on success. + bool Write(IMkvWriter* writer) const; + + uint32_t track_entries_size() const { return track_entries_size_; } + + private: + // Track element list. + Track** track_entries_; + + // Number of Track elements added. + uint32_t track_entries_size_; + + // Whether or not Tracks element has already been written via IMkvWriter. + mutable bool wrote_tracks_; + + LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tracks); +}; + +/////////////////////////////////////////////////////////////// +// Chapter element +// +class Chapter { + public: + // Set the identifier for this chapter. (This corresponds to the + // Cue Identifier line in WebVTT.) + // TODO(matthewjheaney): the actual serialization of this item in + // MKV is pending. + bool set_id(const char* id); + + // Converts the nanosecond start and stop times of this chapter to + // their corresponding timecode values, and stores them that way. + void set_time(const Segment& segment, uint64_t start_time_ns, + uint64_t end_time_ns); + + // Sets the uid for this chapter. Primarily used to enable + // deterministic output from the muxer. + void set_uid(const uint64_t uid) { uid_ = uid; } + + // Add a title string to this chapter, per the semantics described + // here: + // http://www.matroska.org/technical/specs/index.html + // + // The title ("chapter string") is a UTF-8 string. + // + // The language has ISO 639-2 representation, described here: + // http://www.loc.gov/standards/iso639-2/englangn.html + // http://www.loc.gov/standards/iso639-2/php/English_list.php + // If you specify NULL as the language value, this implies + // English ("eng"). + // + // The country value corresponds to the codes listed here: + // http://www.iana.org/domains/root/db/ + // + // The function returns false if the string could not be allocated. 
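+ // Hypothetical usage sketch, using only names declared in this header: + // Chapter* const chapter = segment.AddChapter(); + // chapter->set_id("intro"); + // chapter->set_time(segment, 0, 5000000000); // 0 ns to 5e9 ns + // chapter->add_string("Introduction", "eng", "us");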
+  bool add_string(const char* title, const char* language, const char* country);
+
+ private:
+  friend class Chapters;
+
+  // For storage of chapter titles that differ by language.
+  class Display {
+   public:
+    // Establish representation invariant for new Display object.
+    void Init();
+
+    // Reclaim resources, in anticipation of destruction.
+    void Clear();
+
+    // Copies the title to the |title_| member. Returns false on
+    // error.
+    bool set_title(const char* title);
+
+    // Copies the language to the |language_| member. Returns false
+    // on error.
+    bool set_language(const char* language);
+
+    // Copies the country to the |country_| member. Returns false on
+    // error.
+    bool set_country(const char* country);
+
+    // If |writer| is non-NULL, serialize the Display sub-element of
+    // the Atom into the stream. Returns the Display element size on
+    // success, 0 if error.
+    uint64_t WriteDisplay(IMkvWriter* writer) const;
+
+   private:
+    char* title_;
+    char* language_;
+    char* country_;
+  };
+
+  Chapter();
+  ~Chapter();
+
+  // Establish the representation invariant for a newly-created
+  // Chapter object. The |seed| parameter is used to create the UID
+  // for this chapter atom.
+  void Init(unsigned int* seed);
+
+  // Copies this Chapter object to a different one. This is used when
+  // expanding a plain array of Chapter objects (see Chapters).
+  void ShallowCopy(Chapter* dst) const;
+
+  // Reclaim resources used by this Chapter object, pending its
+  // destruction.
+  void Clear();
+
+  // If there is no storage remaining on the |displays_| array for a
+  // new display object, creates a new, longer array and copies the
+  // existing Display objects to the new array. Returns false if the
+  // array cannot be expanded.
+  bool ExpandDisplaysArray();
+
+  // If |writer| is non-NULL, serialize the Atom sub-element into the
+  // stream. Returns the total size of the element on success, 0 if
+  // error.
+  uint64_t WriteAtom(IMkvWriter* writer) const;
+
+  // The string identifier for this chapter (corresponds to WebVTT cue
+  // identifier).
+  char* id_;
+
+  // Start timecode of the chapter.
+  uint64_t start_timecode_;
+
+  // Stop timecode of the chapter.
+  uint64_t end_timecode_;
+
+  // The binary identifier for this chapter.
+  uint64_t uid_;
+
+  // The Atom element can contain multiple Display sub-elements, as
+  // the same logical title can be rendered in different languages.
+  Display* displays_;
+
+  // The physical length (total size) of the |displays_| array.
+  int displays_size_;
+
+  // The logical length (number of active elements) on the |displays_|
+  // array.
+  int displays_count_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Chapter);
+};
+
+///////////////////////////////////////////////////////////////
+// Chapters element
+//
+class Chapters {
+ public:
+  Chapters();
+  ~Chapters();
+
+  Chapter* AddChapter(unsigned int* seed);
+
+  // Returns the number of chapters that have been added.
+  int Count() const;
+
+  // Output the Chapters element to the writer. Returns true on success.
+  bool Write(IMkvWriter* writer) const;
+
+ private:
+  // Expands the chapters_ array if there is not enough space to contain
+  // another chapter object. Returns true on success.
+  bool ExpandChaptersArray();
+
+  // If |writer| is non-NULL, serialize the Edition sub-element of the
+  // Chapters element into the stream. Returns the Edition element
+  // size on success, 0 if error.
+  uint64_t WriteEdition(IMkvWriter* writer) const;
+
+  // Total length of the chapters_ array.
+  int chapters_size_;
+
+  // Number of active chapters on the chapters_ array.
+  int chapters_count_;
+
+  // Array for storage of chapter objects.
+  Chapter* chapters_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Chapters);
+};
+
+///////////////////////////////////////////////////////////////
+// Tag element
+//
+class Tag {
+ public:
+  bool add_simple_tag(const char* tag_name, const char* tag_string);
+
+ private:
+  // Tags calls Clear and the destructor of Tag
+  friend class Tags;
+
+  // For storage of simple tags
+  class SimpleTag {
+   public:
+    // Establish representation invariant for new SimpleTag object.
+    void Init();
+
+    // Reclaim resources, in anticipation of destruction.
+    void Clear();
+
+    // Copies the tag name to the |tag_name_| member. Returns false on
+    // error.
+    bool set_tag_name(const char* tag_name);
+
+    // Copies the tag string to the |tag_string_| member. Returns false
+    // on error.
+    bool set_tag_string(const char* tag_string);
+
+    // If |writer| is non-NULL, serialize the SimpleTag sub-element of
+    // the Tag into the stream. Returns the SimpleTag element size on
+    // success, 0 if error.
+    uint64_t Write(IMkvWriter* writer) const;
+
+   private:
+    char* tag_name_;
+    char* tag_string_;
+  };
+
+  Tag();
+  ~Tag();
+
+  // Copies this Tag object to a different one. This is used when
+  // expanding a plain array of Tag objects (see Tags).
+  void ShallowCopy(Tag* dst) const;
+
+  // Reclaim resources used by this Tag object, pending its
+  // destruction.
+  void Clear();
+
+  // If there is no storage remaining on the |simple_tags_| array for a
+  // new SimpleTag object, creates a new, longer array and copies the
+  // existing SimpleTag objects to the new array. Returns false if the
+  // array cannot be expanded.
+  bool ExpandSimpleTagsArray();
+
+  // If |writer| is non-NULL, serialize the Tag sub-element into the
+  // stream. Returns the total size of the element on success, 0 if
+  // error.
+  uint64_t Write(IMkvWriter* writer) const;
+
+  // The Tag element can contain multiple SimpleTag sub-elements
+  SimpleTag* simple_tags_;
+
+  // The physical length (total size) of the |simple_tags_| array.
+  int simple_tags_size_;
+
+  // The logical length (number of active elements) on the |simple_tags_|
+  // array.
+  int simple_tags_count_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tag);
+};
+
+///////////////////////////////////////////////////////////////
+// Tags element
+//
+class Tags {
+ public:
+  Tags();
+  ~Tags();
+
+  Tag* AddTag();
+
+  // Returns the number of tags that have been added.
+  int Count() const;
+
+  // Output the Tags element to the writer. Returns true on success.
+  bool Write(IMkvWriter* writer) const;
+
+ private:
+  // Expands the tags_ array if there is not enough space to contain
+  // another tag object. Returns true on success.
+  bool ExpandTagsArray();
+
+  // Total length of the tags_ array.
+  int tags_size_;
+
+  // Number of active tags on the tags_ array.
+  int tags_count_;
+
+  // Array for storage of tag objects.
+  Tag* tags_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Tags);
+};
+
+///////////////////////////////////////////////////////////////
+// Cluster element
+//
+// Notes:
+//  |Init| must be called before any other method in this class.
+class Cluster {
+ public:
+  // |timecode| is the absolute timecode of the cluster. |cues_pos| is the
+  // position for the cluster within the segment that should be written in
+  // the cues element. |timecode_scale| is the timecode scale of the segment.
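+  //
+  // A hedged construction sketch (values are illustrative; 1000000 is the
+  // default nanosecond timecode scale, and clusters are normally created by
+  // Segment rather than instantiated directly):
+  //   Cluster cluster(0 /* timecode */, 0 /* cues_pos */, 1000000);
+  //   cluster.Init(&writer);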
+  Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale,
+          bool write_last_frame_with_duration = false,
+          bool fixed_size_timecode = false);
+  ~Cluster();
+
+  bool Init(IMkvWriter* ptr_writer);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
+  bool AddFrame(const Frame* frame);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions. The range of allowed values is [1, 126].
+  //   timecode:     Absolute (not relative to cluster) timestamp of the
+  //                 frame, expressed in timecode units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrame(const uint8_t* data, uint64_t length, uint64_t track_number,
+                uint64_t timecode,  // timecode units (absolute)
+                bool is_key);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   additional: Pointer to the additional data
+  //   additional_length: Length of the additional data
+  //   add_id: Value of BlockAddID element
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions. The range of allowed values is [1, 126].
+  //   abs_timecode: Absolute (not relative to cluster) timestamp of the
+  //                 frame, expressed in timecode units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+                              const uint8_t* additional,
+                              uint64_t additional_length, uint64_t add_id,
+                              uint64_t track_number, uint64_t abs_timecode,
+                              bool is_key);
+
+  // Adds a frame to be output in the file. The frame is written out through
+  // |writer_| if successful. Returns true on success.
+  // Inputs:
+  //   data: Pointer to the data.
+  //   length: Length of the data.
+  //   discard_padding: DiscardPadding element value.
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions. The range of allowed values is [1, 126].
+  //   abs_timecode: Absolute (not relative to cluster) timestamp of the
+  //                 frame, expressed in timecode units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+                                  int64_t discard_padding,
+                                  uint64_t track_number, uint64_t abs_timecode,
+                                  bool is_key);
+
+  // Writes a frame of metadata to the output medium; returns true on
+  // success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions. The range of allowed values is [1, 126].
+  //   timecode:     Absolute (not relative to cluster) timestamp of the
+  //                 metadata frame, expressed in timecode units.
+  //   duration:     Duration of metadata frame, in timecode units.
+  //
+  // The metadata frame is written as a block group, with a duration
+  // sub-element but no reference time sub-elements (indicating that
+  // it is considered a keyframe, per Matroska semantics).
+  bool AddMetadata(const uint8_t* data, uint64_t length, uint64_t track_number,
+                   uint64_t timecode, uint64_t duration);
+
+  // Increments the size of the cluster's data in bytes.
+  void AddPayloadSize(uint64_t size);
+
+  // Closes the cluster so no more data can be written to it. Will update the
+  // cluster's size if |writer_| is seekable. Returns true on success. This
+  // variant of Finalize() fails when |write_last_frame_with_duration_| is set
+  // to true.
+  bool Finalize();
+
+  // Closes the cluster so no more data can be written to it. Will update the
+  // cluster's size if |writer_| is seekable. Returns true on success.
+  // Inputs:
+  //   set_last_frame_duration: Boolean indicating whether or not the duration
+  //                            of the last frame should be set. If set to
+  //                            false, the |duration| value is ignored and
+  //                            |write_last_frame_with_duration_| will not be
+  //                            honored.
+  //   duration: Duration of the Cluster in timecode scale.
+  bool Finalize(bool set_last_frame_duration, uint64_t duration);
+
+  // Returns the size in bytes for the entire Cluster element.
+  uint64_t Size() const;
+
+  // Given |abs_timecode|, calculates timecode relative to most recent timecode.
+  // Returns -1 on failure, or a relative timecode.
+  int64_t GetRelativeTimecode(int64_t abs_timecode) const;
+
+  int64_t size_position() const { return size_position_; }
+  int32_t blocks_added() const { return blocks_added_; }
+  uint64_t payload_size() const { return payload_size_; }
+  int64_t position_for_cues() const { return position_for_cues_; }
+  uint64_t timecode() const { return timecode_; }
+  uint64_t timecode_scale() const { return timecode_scale_; }
+  void set_write_last_frame_with_duration(bool write_last_frame_with_duration) {
+    write_last_frame_with_duration_ = write_last_frame_with_duration;
+  }
+  bool write_last_frame_with_duration() const {
+    return write_last_frame_with_duration_;
+  }
+
+ private:
+  // Iterator type for the |stored_frames_| map.
+  typedef std::map<uint64_t, std::list<Frame*> >::iterator FrameMapIterator;
+
+  // Utility method that confirms that blocks can still be added, and that the
+  // cluster header has been written. Used by |DoWriteFrame*|. Returns true
+  // when successful.
+  bool PreWriteBlock();
+
+  // Utility method used by the |DoWriteFrame*| methods that handles the book
+  // keeping required after each block is written.
+  void PostWriteBlock(uint64_t element_size);
+
+  // Does some verification and calls WriteFrame.
+  bool DoWriteFrame(const Frame* const frame);
+
+  // Either holds back the given frame, or writes it out depending on whether
+  // or not |write_last_frame_with_duration_| is set.
+  bool QueueOrWriteFrame(const Frame* const frame);
+
+  // Outputs the Cluster header to |writer_|. Returns true on success.
+  bool WriteClusterHeader();
+
+  // Number of blocks added to the cluster.
+  int32_t blocks_added_;
+
+  // Flag telling if the cluster has been closed.
+  bool finalized_;
+
+  // Flag indicating whether the cluster's timecode will always be written out
+  // using 8 bytes.
+  bool fixed_size_timecode_;
+
+  // Flag telling if the cluster's header has been written.
+  bool header_written_;
+
+  // The size of the cluster elements in bytes.
+  uint64_t payload_size_;
+
+  // The file position used for cue points.
+  const int64_t position_for_cues_;
+
+  // The file position of the cluster's size element.
+  int64_t size_position_;
+
+  // The absolute timecode of the cluster.
+  const uint64_t timecode_;
+
+  // The timecode scale of the Segment containing the cluster.
+  const uint64_t timecode_scale_;
+
+  // Flag indicating whether the last frame of the cluster should be written as
+  // a Block with Duration. If set to true, then it will result in holding back
+  // of frames and the parameterized version of Finalize() must be called to
+  // finish writing the Cluster.
+  bool write_last_frame_with_duration_;
+
+  // Map used to hold back frames, if required. Track number is the key.
+  std::map<uint64_t, std::list<Frame*> > stored_frames_;
+
+  // Map from track number to the timestamp of the last block written for that
+  // track.
+  std::map<uint64_t, int64_t> last_block_timestamp_;
+
+  // Pointer to the writer object. Not owned by this class.
+  IMkvWriter* writer_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Cluster);
+};
+
+///////////////////////////////////////////////////////////////
+// SeekHead element
+class SeekHead {
+ public:
+  SeekHead();
+  ~SeekHead();
+
+  // TODO(fgalligan): Change this to reserve a certain size. Then check how
+  // big the seek entry to be added is as not every seek entry will be the
+  // maximum size it could be.
+  // Adds a seek entry to be written out when the element is finalized. |id|
+  // must be the coded mkv element id. |pos| is the file position of the
+  // element. Returns true on success.
+  bool AddSeekEntry(uint32_t id, uint64_t pos);
+
+  // Writes out SeekHead and SeekEntry elements. Returns true on success.
+  bool Finalize(IMkvWriter* writer) const;
+
+  // Returns the id of the Seek Entry at the given index. Returns -1 if index
+  // is out of range.
+  uint32_t GetId(int index) const;
+
+  // Returns the position of the Seek Entry at the given index. Returns -1 if
+  // index is out of range.
+  uint64_t GetPosition(int index) const;
+
+  // Sets the Seek Entry id and position at given index.
+  // Returns true on success.
+  bool SetSeekEntry(int index, uint32_t id, uint64_t position);
+
+  // Reserves space by writing out a Void element which will be updated with
+  // a SeekHead element later. Returns true on success.
+  bool Write(IMkvWriter* writer);
+
+  // We are going to put a cap on the number of Seek Entries.
+  const static int32_t kSeekEntryCount = 5;
+
+ private:
+  // Returns the maximum size in bytes of one seek entry.
+  uint64_t MaxEntrySize() const;
+
+  // Seek entry id element list.
+  uint32_t seek_entry_id_[kSeekEntryCount];
+
+  // Seek entry pos element list.
+  uint64_t seek_entry_pos_[kSeekEntryCount];
+
+  // The file position of SeekHead element.
+  int64_t start_pos_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(SeekHead);
+};
+
+///////////////////////////////////////////////////////////////
+// Segment Information element
+class SegmentInfo {
+ public:
+  SegmentInfo();
+  ~SegmentInfo();
+
+  // Will update the duration if |duration_| is > 0.0. Returns true on success.
+  bool Finalize(IMkvWriter* writer) const;
+
+  // Sets |muxing_app_| and |writing_app_|.
+  bool Init();
+
+  // Output the Segment Information element to the writer. Returns true on
+  // success.
+  bool Write(IMkvWriter* writer);
+
+  void set_duration(double duration) { duration_ = duration; }
+  double duration() const { return duration_; }
+  void set_muxing_app(const char* app);
+  const char* muxing_app() const { return muxing_app_; }
+  void set_timecode_scale(uint64_t scale) { timecode_scale_ = scale; }
+  uint64_t timecode_scale() const { return timecode_scale_; }
+  void set_writing_app(const char* app);
+  const char* writing_app() const { return writing_app_; }
+  void set_date_utc(int64_t date_utc) { date_utc_ = date_utc; }
+  int64_t date_utc() const { return date_utc_; }
+
+ private:
+  // Segment Information element names.
+  // Initially set to -1 to signify that a duration has not been set and should
+  // not be written out.
+  double duration_;
+  // Set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
+  char* muxing_app_;
+  uint64_t timecode_scale_;
+  // Initially set to libwebm-%d.%d.%d.%d, major, minor, build, revision.
+  char* writing_app_;
+  // LLONG_MIN when DateUTC is not set.
+  int64_t date_utc_;
+
+  // The file position of the duration element.
+  int64_t duration_pos_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(SegmentInfo);
+};
+
+///////////////////////////////////////////////////////////////
+// This class represents the main segment in a WebM file. Currently only
+// supports one Segment element.
+//
+// Notes:
+//  |Init| must be called before any other method in this class.
+class Segment {
+ public:
+  enum Mode { kLive = 0x1, kFile = 0x2 };
+
+  enum CuesPosition {
+    kAfterClusters = 0x0,  // Position Cues after Clusters - Default
+    kBeforeClusters = 0x1  // Position Cues before Clusters
+  };
+
+  static const uint32_t kDefaultDocTypeVersion = 4;
+  static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL;
+
+  Segment();
+  ~Segment();
+
+  // Initializes |SegmentInfo| and returns result. Always returns false when
+  // |ptr_writer| is NULL.
+  bool Init(IMkvWriter* ptr_writer);
+
+  // Adds a generic track to the segment. Returns the newly-allocated
+  // track object (which is owned by the segment) on success, NULL on
+  // error. |number| is the number to use for the track. |number|
+  // must be >= 0. If |number| == 0 then the muxer will decide on the
+  // track number.
+  Track* AddTrack(int32_t number);
+
+  // Adds a Vorbis audio track to the segment. Returns the number of the track
+  // on success, 0 on error. |number| is the number to use for the audio track.
+  // |number| must be >= 0. If |number| == 0 then the muxer will decide on
+  // the track number.
+  uint64_t AddAudioTrack(int32_t sample_rate, int32_t channels, int32_t number);
+
+  // Adds an empty chapter to the chapters of this segment. Returns
+  // non-NULL on success. After adding the chapter, the caller should
+  // populate its fields via the Chapter member functions.
+  Chapter* AddChapter();
+
+  // Adds an empty tag to the tags of this segment. Returns
+  // non-NULL on success. After adding the tag, the caller should
+  // populate its fields via the Tag member functions.
+  Tag* AddTag();
+
+  // Adds a cue point to the Cues element. |timestamp| is the time in
+  // nanoseconds of the cue's time. |track| is the Track of the Cue. This
+  // function must be called after AddFrame to calculate the correct
+  // BlockNumber for the CuePoint. Returns true on success.
+  bool AddCuePoint(uint64_t timestamp, uint64_t track);
+
+  // Adds a frame to be output in the file. Returns true on success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.
+  //   timestamp:    Timestamp of the frame in nanoseconds from 0.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrame(const uint8_t* data, uint64_t length, uint64_t track_number,
+                uint64_t timestamp_ns, bool is_key);
+
+  // Writes a frame of metadata to the output medium; returns true on
+  // success.
+  // Inputs:
+  //   data: Pointer to the data
+  //   length: Length of the data
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.
+  //   timecode:     Absolute timestamp of the metadata frame, expressed
+  //                 in nanosecond units.
+  //   duration:     Duration of metadata frame, in nanosecond units.
+  //
+  // The metadata frame is written as a block group, with a duration
+  // sub-element but no reference time sub-elements (indicating that
+  // it is considered a keyframe, per Matroska semantics).
+  bool AddMetadata(const uint8_t* data, uint64_t length, uint64_t track_number,
+                   uint64_t timestamp_ns, uint64_t duration_ns);
+
+  // Writes a frame with additional data to the output medium; returns true on
+  // success.
+  // Inputs:
+  //   data: Pointer to the data.
+  //   length: Length of the data.
+  //   additional: Pointer to additional data.
+  //   additional_length: Length of additional data.
+  //   add_id: Additional ID which identifies the type of additional data.
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.
+  //   timestamp:    Absolute timestamp of the frame, expressed in nanosecond
+  //                 units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrameWithAdditional(const uint8_t* data, uint64_t length,
+                              const uint8_t* additional,
+                              uint64_t additional_length, uint64_t add_id,
+                              uint64_t track_number, uint64_t timestamp,
+                              bool is_key);
+
+  // Writes a frame with DiscardPadding to the output medium; returns true on
+  // success.
+  // Inputs:
+  //   data: Pointer to the data.
+  //   length: Length of the data.
+  //   discard_padding: DiscardPadding element value.
+  //   track_number: Track to add the data to. Value returned by Add track
+  //                 functions.
+  //   timestamp:    Absolute timestamp of the frame, expressed in nanosecond
+  //                 units.
+  //   is_key:       Flag telling whether or not this frame is a key frame.
+  bool AddFrameWithDiscardPadding(const uint8_t* data, uint64_t length,
+                                  int64_t discard_padding,
+                                  uint64_t track_number, uint64_t timestamp,
+                                  bool is_key);
+
+  // Writes a Frame to the output medium. Chooses the correct way of writing
+  // the frame (Block vs SimpleBlock) based on the parameters passed.
+  // Inputs:
+  //   frame: frame object
+  bool AddGenericFrame(const Frame* frame);
+
+  // Adds a VP8 video track to the segment. Returns the number of the track on
+  // success, 0 on error. |number| is the number to use for the video track.
+  // |number| must be >= 0. If |number| == 0 then the muxer will decide on
+  // the track number.
+  uint64_t AddVideoTrack(int32_t width, int32_t height, int32_t number);
+
+  // This function must be called after Finalize() if you need a copy of the
+  // output with Cues written before the Clusters. It will return false if the
+  // writer is not seekable or if chunking is set to true.
+  // Input parameters:
+  // reader - an IMkvReader object created with the same underlying file of the
+  //          current writer object. Make sure to close the existing writer
+  //          object before creating this so that all the data is properly
+  //          flushed and available for reading.
+  // writer - an IMkvWriter object pointing to a *different* file than the one
+  //          pointed by the current writer object. This file will contain the
+  //          Cues element before the Clusters.
+  bool CopyAndMoveCuesBeforeClusters(mkvparser::IMkvReader* reader,
+                                     IMkvWriter* writer);
+
+  // Sets which track to use for the Cues element. Must have added the track
+  // before calling this function. Returns true on success. |track_number| is
+  // returned by the Add track functions.
+  bool CuesTrack(uint64_t track_number);
+
+  // This will force the muxer to create a new Cluster when the next frame is
+  // added.
+  void ForceNewClusterOnNextFrame();
+
+  // Writes out any frames that have not been written out. Finalizes the last
+  // cluster. May update the size and duration of the segment. May output the
+  // Cues element. May finalize the SeekHead element. Returns true on success.
+  bool Finalize();
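+
+  // A minimal end-to-end sketch (|frame_data| and |frame_size| are a
+  // hypothetical encoded frame; error handling omitted):
+  //   MkvWriter writer;
+  //   writer.Open("out.webm");
+  //   Segment segment;
+  //   segment.Init(&writer);
+  //   const uint64_t video_track = segment.AddVideoTrack(640, 360, 0);
+  //   segment.AddFrame(frame_data, frame_size, video_track, 0, true);
+  //   segment.Finalize();
+  //   writer.Close();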
+
+  // Returns the Cues object.
+  Cues* GetCues() { return &cues_; }
+
+  // Returns the Segment Information object.
+  const SegmentInfo* GetSegmentInfo() const { return &segment_info_; }
+  SegmentInfo* GetSegmentInfo() { return &segment_info_; }
+
+  // Search the Tracks and return the track that matches |track_number|.
+  // Returns NULL if there is no track match.
+  Track* GetTrackByNumber(uint64_t track_number) const;
+
+  // Toggles whether to output a cues element.
+  void OutputCues(bool output_cues);
+
+  // Toggles whether to write the last frame in each Cluster with Duration.
+  void AccurateClusterDuration(bool accurate_cluster_duration);
+
+  // Toggles whether to write the Cluster Timecode using exactly 8 bytes.
+  void UseFixedSizeClusterTimecode(bool fixed_size_cluster_timecode);
+
+  // Sets if the muxer will output files in chunks or not. |chunking| is a
+  // flag telling whether or not to turn on chunking. |filename| is the base
+  // filename for the chunk files. The header chunk file will be named
+  // |filename|.hdr and the data chunks will be named
+  // |filename|_XXXXXX.chk. Chunking implies that the muxer will be writing
+  // to files so the muxer will use the default MkvWriter class to control
+  // what data is written to what files. Returns true on success.
+  // TODO: Should we change the IMkvWriter Interface to add Open and Close?
+  // That will force the interface to be dependent on files.
+  bool SetChunking(bool chunking, const char* filename);
+
+  bool chunking() const { return chunking_; }
+  uint64_t cues_track() const { return cues_track_; }
+  void set_max_cluster_duration(uint64_t max_cluster_duration) {
+    max_cluster_duration_ = max_cluster_duration;
+  }
+  uint64_t max_cluster_duration() const { return max_cluster_duration_; }
+  void set_max_cluster_size(uint64_t max_cluster_size) {
+    max_cluster_size_ = max_cluster_size;
+  }
+  uint64_t max_cluster_size() const { return max_cluster_size_; }
+  void set_mode(Mode mode) { mode_ = mode; }
+  Mode mode() const { return mode_; }
+  CuesPosition cues_position() const { return cues_position_; }
+  bool output_cues() const { return output_cues_; }
+  void set_estimate_file_duration(bool estimate_duration) {
+    estimate_file_duration_ = estimate_duration;
+  }
+  bool estimate_file_duration() const { return estimate_file_duration_; }
+  const SegmentInfo* segment_info() const { return &segment_info_; }
+  void set_duration(double duration) { duration_ = duration; }
+  double duration() const { return duration_; }
+
+  // Returns true when codec IDs are valid for WebM.
+  bool DocTypeIsWebm() const;
+
+ private:
+  // Checks if header information has been output and initialized. If not it
+  // will output the Segment element and initialize the SeekHead element and
+  // Cues elements.
+  bool CheckHeaderInfo();
+
+  // Sets |doc_type_version_| based on the current element requirements.
+  void UpdateDocTypeVersion();
+
+  // Sets |name| according to how many chunks have been written. |ext| is the
+  // file extension. |name| must be deleted by the calling app. Returns true
+  // on success.
+  bool UpdateChunkName(const char* ext, char** name) const;
+
+  // Returns the maximum offset within the segment's payload. When chunking
+  // this function is needed to determine offsets of elements within the
+  // chunked files. Returns -1 on error.
+  int64_t MaxOffset();
+
+  // Adds the frame to our frame array.
+  bool QueueFrame(Frame* frame);
+
+  // Output all frames that are queued. Returns -1 on error, otherwise
+  // it returns the number of frames written.
+  int WriteFramesAll();
+
+  // Output all frames that are queued that have an end time that is less
+  // than |timestamp|. Returns true on success and if there are no frames
+  // queued.
+  bool WriteFramesLessThan(uint64_t timestamp);
+
+  // Outputs the segment header, Segment Information element, SeekHead element,
+  // and Tracks element to |writer_|.
+  bool WriteSegmentHeader();
+
+  // Given a frame with the specified timestamp (nanosecond units) and
+  // keyframe status, determine whether a new cluster should be
+  // created, before writing enqueued frames and the frame itself. The
+  // function returns one of the following values:
+  //  -1 = error: an out-of-order frame was detected
+  //   0 = do not create a new cluster, and write frame to the existing cluster
+  //   1 = create a new cluster, and write frame to that new cluster
+  //   2 = create a new cluster, and re-run test
+  int TestFrame(uint64_t track_num, uint64_t timestamp_ns, bool key) const;
+
+  // Create a new cluster, using the earlier of the first enqueued
+  // frame, or the indicated time. Returns true on success.
+  bool MakeNewCluster(uint64_t timestamp_ns);
+
+  // Checks whether a new cluster needs to be created, and if so
+  // creates a new cluster. Returns false if creation of a new cluster
+  // was necessary but creation was not successful.
+  bool DoNewClusterProcessing(uint64_t track_num, uint64_t timestamp_ns,
+                              bool key);
+
+  // Adjusts Cue Point values (to place Cues before Clusters) so that they
+  // reflect the correct offsets.
+  void MoveCuesBeforeClusters();
+
+  // This function recursively computes the correct cluster offsets (this is
+  // done to move the Cues before Clusters). It recursively updates the change
+  // in size (which indicates a change in cluster offset) until no sizes change.
+  // Parameters:
+  // diff - indicates the difference in size of the Cues element that needs to
+  //        be accounted for.
+  // index - index in the list of Cues which is currently being adjusted.
+  // cue_size - sum of size of all the CuePoint elements.
+  void MoveCuesBeforeClustersHelper(uint64_t diff, int index,
+                                    uint64_t* cue_size);
+
+  // Seeds the random number generator used to make UIDs.
+  unsigned int seed_;
+
+  // WebM elements
+  Cues cues_;
+  SeekHead seek_head_;
+  SegmentInfo segment_info_;
+  Tracks tracks_;
+  Chapters chapters_;
+  Tags tags_;
+
+  // Number of chunks written.
+  int chunk_count_;
+
+  // Current chunk filename.
+  char* chunk_name_;
+
+  // Default MkvWriter object created by this class used for writing clusters
+  // out in separate files.
+  MkvWriter* chunk_writer_cluster_;
+
+  // Default MkvWriter object created by this class used for writing Cues
+  // element out to a file.
+  MkvWriter* chunk_writer_cues_;
+
+  // Default MkvWriter object created by this class used for writing the
+  // Matroska header out to a file.
+  MkvWriter* chunk_writer_header_;
+
+  // Flag telling whether or not the muxer is chunking output to multiple
+  // files.
+  bool chunking_;
+
+  // Base filename for the chunked files.
+  char* chunking_base_name_;
+
+  // File position offset where the Clusters end.
+  int64_t cluster_end_offset_;
+
+  // List of clusters.
+  Cluster** cluster_list_;
+
+  // Number of cluster pointers allocated in the cluster list.
+  int32_t cluster_list_capacity_;
+
+  // Number of clusters in the cluster list.
+  int32_t cluster_list_size_;
+
+  // Indicates whether Cues should be written before or after Clusters.
+  CuesPosition cues_position_;
+
+  // Track number that is associated with the cues element for this segment.
+  uint64_t cues_track_;
+
+  // Tells the muxer to force a new cluster on the next Block.
+  bool force_new_cluster_;
+
+  // List of stored audio frames. These variables are used to store frames so
+  // the muxer can follow the guideline "Audio blocks that contain the video
+  // key frame's timecode should be in the same cluster as the video key frame
+  // block."
+  Frame** frames_;
+
+  // Number of frame pointers allocated in the frame list.
+  int32_t frames_capacity_;
+
+  // Number of frames in the frame list.
+  int32_t frames_size_;
+
+  // Flag telling if a video track has been added to the segment.
+  bool has_video_;
+
+  // Flag telling if the segment's header has been written.
+  bool header_written_;
+
+  // Duration of the last block in nanoseconds.
+  uint64_t last_block_duration_;
+
+  // Last timestamp in nanoseconds added to a cluster.
+  uint64_t last_timestamp_;
+
+  // Last timestamp in nanoseconds by track number added to a cluster.
+  uint64_t last_track_timestamp_[kMaxTrackNumber];
+
+  // Number of frames written per track.
+  uint64_t track_frames_written_[kMaxTrackNumber];
+
+  // Maximum time in nanoseconds for a cluster duration. This variable is a
+  // guideline and some clusters may have a longer duration. Default is 30
+  // seconds.
+  uint64_t max_cluster_duration_;
+
+  // Maximum size in bytes for a cluster. This variable is a guideline and
+  // some clusters may have a larger size. Default is 0 which signifies that
+  // the muxer will decide the size.
+  uint64_t max_cluster_size_;
+
+  // The mode that segment is in. If set to |kLive| the writer must not
+  // seek backwards.
+  Mode mode_;
+
+  // Flag telling the muxer that a new cue point should be added.
+  bool new_cuepoint_;
+
+  // TODO(fgalligan): Should we add support for more than one Cues element?
+  // Flag whether or not the muxer should output a Cues element.
+  bool output_cues_;
+
+  // Flag whether or not the last frame in each Cluster will have a Duration
+  // element in it.
+  bool accurate_cluster_duration_;
+
+  // Flag whether or not to write the Cluster Timecode using exactly 8 bytes.
+  bool fixed_size_cluster_timecode_;
+
+  // Flag whether or not to estimate the file duration.
+  bool estimate_file_duration_;
+
+  // The size of the EBML header, used to validate the header if
+  // WriteEbmlHeader() is called more than once.
+  int32_t ebml_header_size_;
+
+  // The file position of the segment's payload.
+  int64_t payload_pos_;
+
+  // The file position of the element's size.
+  int64_t size_position_;
+
+  // Current DocTypeVersion (|doc_type_version_|) and that written in
+  // WriteSegmentHeader().
+  // WriteEbmlHeader() will be called from Finalize() if |doc_type_version_|
+  // differs from |doc_type_version_written_|.
+  uint32_t doc_type_version_;
+  uint32_t doc_type_version_written_;
+
+  // If |duration_| is > 0, then explicitly set the duration of the segment.
+  double duration_;
+
+  // Pointer to the writer objects. Not owned by this class.
+  IMkvWriter* writer_cluster_;
+  IMkvWriter* writer_cues_;
+  IMkvWriter* writer_header_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(Segment);
+};
+
+}  // namespace mkvmuxer
+
+#endif  // MKVMUXER_MKVMUXER_H_
diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxertypes.h b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxertypes.h
new file mode 100644
index 000000000..e5db12160
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxertypes.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXER_MKVMUXERTYPES_H_
+#define MKVMUXER_MKVMUXERTYPES_H_
+
+namespace mkvmuxer {
+typedef unsigned char uint8;
+typedef short int16;
+typedef int int32;
+typedef unsigned int uint32;
+typedef long long int64;
+typedef unsigned long long uint64;
+}  // namespace mkvmuxer
+
+// Copied from Chromium basictypes.h
+// A macro to disallow the copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#define LIBWEBM_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+  TypeName(const TypeName&);                       \
+  void operator=(const TypeName&)
+
+#endif  // MKVMUXER_MKVMUXERTYPES_H_
diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
new file mode 100644
index 000000000..6436817c9
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -0,0 +1,743 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer/mkvmuxerutil.h"
+
+#ifdef __ANDROID__
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <new>
+
+#include "common/webmids.h"
+#include "mkvmuxer/mkvmuxer.h"
+#include "mkvmuxer/mkvwriter.h"
+
+namespace mkvmuxer {
+
+namespace {
+
+// Date elements are always 8 octets in size.
+const int kDateElementSize = 8;
+
+uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode,
+                  uint64 timecode_scale) {
+  uint64 block_additional_elem_size = 0;
+  uint64 block_addid_elem_size = 0;
+  uint64 block_more_payload_size = 0;
+  uint64 block_more_elem_size = 0;
+  uint64 block_additions_payload_size = 0;
+  uint64 block_additions_elem_size = 0;
+  if (frame->additional()) {
+    block_additional_elem_size =
+        EbmlElementSize(libwebm::kMkvBlockAdditional, frame->additional(),
+                        frame->additional_length());
+    block_addid_elem_size = EbmlElementSize(
+        libwebm::kMkvBlockAddID, static_cast<uint64>(frame->add_id()));
+
+    block_more_payload_size =
+        block_addid_elem_size + block_additional_elem_size;
+    block_more_elem_size =
+        EbmlMasterElementSize(libwebm::kMkvBlockMore, block_more_payload_size) +
+        block_more_payload_size;
+    block_additions_payload_size = block_more_elem_size;
+    block_additions_elem_size =
+        EbmlMasterElementSize(libwebm::kMkvBlockAdditions,
+                              block_additions_payload_size) +
+        block_additions_payload_size;
+  }
+
+  uint64 discard_padding_elem_size = 0;
+  if (frame->discard_padding() != 0) {
+    discard_padding_elem_size =
+        EbmlElementSize(libwebm::kMkvDiscardPadding,
+                        static_cast<int64>(frame->discard_padding()));
+  }
+
+  const uint64 reference_block_timestamp =
+      frame->reference_block_timestamp() / timecode_scale;
+  uint64 reference_block_elem_size = 0;
+  if (!frame->is_key()) {
+    reference_block_elem_size =
+        EbmlElementSize(libwebm::kMkvReferenceBlock, reference_block_timestamp);
+  }
+
+  const uint64 duration = frame->duration() / timecode_scale;
+  uint64 block_duration_elem_size = 0;
+  if (duration > 0)
+    block_duration_elem_size =
+        EbmlElementSize(libwebm::kMkvBlockDuration, duration);
+
+  const uint64 block_payload_size = 4 + frame->length();
+  const uint64 block_elem_size =
+      EbmlMasterElementSize(libwebm::kMkvBlock, block_payload_size) +
+      block_payload_size;
+
+  const uint64 block_group_payload_size =
+      block_elem_size + block_additions_elem_size + block_duration_elem_size +
+      discard_padding_elem_size + reference_block_elem_size;
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockGroup,
+                              block_group_payload_size)) {
+    return 0;
+  }
+
+  if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlock, block_payload_size))
+    return 0;
+
+  if (WriteUInt(writer, frame->track_number()))
+    return 0;
+
+  if (SerializeInt(writer, timecode, 2))
+    return 0;
+
+  // For a Block, flags is always 0.
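+  // (The "4 +" in block_payload_size above is this fixed Block header: a
+  // 1-byte coded track number, valid for track numbers 1-126, plus the
+  // 2-byte signed relative timecode and this 1-byte flags field.)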
+  if (SerializeInt(writer, 0, 1))
+    return 0;
+
+  if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
+    return 0;
+
+  if (frame->additional()) {
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockAdditions,
+                                block_additions_payload_size)) {
+      return 0;
+    }
+
+    if (!WriteEbmlMasterElement(writer, libwebm::kMkvBlockMore,
+                                block_more_payload_size))
+      return 0;
+
+    if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID,
+                          static_cast<uint64>(frame->add_id())))
+      return 0;
+
+    if (!WriteEbmlElement(writer, libwebm::kMkvBlockAdditional,
+                          frame->additional(), frame->additional_length())) {
+      return 0;
+    }
+  }
+
+  if (frame->discard_padding() != 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvDiscardPadding,
+                        static_cast<int64>(frame->discard_padding()))) {
+    return 0;
+  }
+
+  if (!frame->is_key() && !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock,
+                                            reference_block_timestamp)) {
+    return 0;
+  }
+
+  if (duration > 0 &&
+      !WriteEbmlElement(writer, libwebm::kMkvBlockDuration, duration)) {
+    return 0;
+  }
+  return EbmlMasterElementSize(libwebm::kMkvBlockGroup,
+                               block_group_payload_size) +
+         block_group_payload_size;
+}
+
+uint64 WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame,
+                        int64 timecode) {
+  if (WriteID(writer, libwebm::kMkvSimpleBlock))
+    return 0;
+
+  const int32 size = static_cast<int32>(frame->length()) + 4;
+  if (WriteUInt(writer, size))
+    return 0;
+
+  if (WriteUInt(writer, static_cast<uint64>(frame->track_number())))
+    return 0;
+
+  if (SerializeInt(writer, timecode, 2))
+    return 0;
+
+  uint64 flags = 0;
+  if (frame->is_key())
+    flags |= 0x80;
+
+  if (SerializeInt(writer, flags, 1))
+    return 0;
+
+  if (writer->Write(frame->frame(), static_cast<uint32>(frame->length())))
+    return 0;
+
+  return GetUIntSize(libwebm::kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 +
+         frame->length();
+}
+
+}  // namespace
+
+int32 GetCodedUIntSize(uint64 value) {
+  if (value < 0x000000000000007FULL)
+    return 1;
+  else if (value < 0x0000000000003FFFULL)
+    return 2;
+  else if (value < 0x00000000001FFFFFULL)
+    return 3;
+  else if (value < 0x000000000FFFFFFFULL)
+    return 4;
+  else if (value < 0x00000007FFFFFFFFULL)
+    return 5;
+  else if (value < 0x000003FFFFFFFFFFULL)
+    return 6;
+  else if (value < 0x0001FFFFFFFFFFFFULL)
+    return 7;
+  return 8;
+}
+
+int32 GetUIntSize(uint64 value) {
+  if (value < 0x0000000000000100ULL)
+    return 1;
+  else if (value < 0x0000000000010000ULL)
+    return 2;
+  else if (value < 0x0000000001000000ULL)
+    return 3;
+  else if (value < 0x0000000100000000ULL)
+    return 4;
+  else if (value < 0x0000010000000000ULL)
+    return 5;
+  else if (value < 0x0001000000000000ULL)
+    return 6;
+  else if (value < 0x0100000000000000ULL)
+    return 7;
+  return 8;
+}
+
+int32 GetIntSize(int64 value) {
+  // Doubling the requested value ensures positive values with their high bit
+  // set are written with 0-padding to avoid flipping the signedness.
+  const uint64 v = (value < 0) ? value ^ -1LL : value;
+  return GetUIntSize(2 * v);
+}
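+
+// For illustration: the 7-bit-per-byte EBML coded form means
+// GetCodedUIntSize(126) == 1 while GetCodedUIntSize(127) == 2, whereas the
+// plain big-endian form gives GetUIntSize(255) == 1 and GetUIntSize(256) == 2.
+// GetIntSize maps both signs onto one magnitude first: value ^ -1LL is
+// ~value, so GetIntSize(-1) == GetIntSize(0) == 1.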
+
+uint64 EbmlMasterElementSize(uint64 type, uint64 value) {
+  // Size of EBML ID
+  int32 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += GetCodedUIntSize(value);
+
+  return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, int64 value) {
+  // Size of EBML ID
+  int32 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += GetIntSize(value);
+
+  // Size of Datasize
+  ebml_size++;
+
+  return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, uint64 value) {
+  return EbmlElementSize(type, value, 0);
+}
+
+uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size) {
+  // Size of EBML ID
+  uint64 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += (fixed_size > 0) ? fixed_size : GetUIntSize(value);
+
+  // Size of Datasize
+  ebml_size++;
+
+  return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, float /* value */) {
+  // Size of EBML ID
+  uint64 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += sizeof(float);
+
+  // Size of Datasize
+  ebml_size++;
+
+  return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, const char* value) {
+  if (!value)
+    return 0;
+
+  // Size of EBML ID
+  uint64 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += strlen(value);
+
+  // Size of Datasize
+  ebml_size += GetCodedUIntSize(strlen(value));
+
+  return ebml_size;
+}
+
+uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size) {
+  if (!value)
+    return 0;
+
+  // Size of EBML ID
+  uint64 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += size;
+
+  // Size of Datasize
+  ebml_size += GetCodedUIntSize(size);
+
+  return ebml_size;
+}
+
+uint64 EbmlDateElementSize(uint64 type) {
+  // Size of EBML ID
+  uint64 ebml_size = GetUIntSize(type);
+
+  // Datasize
+  ebml_size += kDateElementSize;
+
+  // Size of Datasize
+  ebml_size++;
+
+  return ebml_size;
+}
+
+int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) {
+  if (!writer || size < 1 || size > 8)
+    return -1;
+
+  for (int32 i = 1; i <= size; ++i) {
+    const int32 byte_count = size - i;
+    const int32 bit_count = byte_count * 8;
+
+    const int64 bb = value >> bit_count;
+    const uint8 b = static_cast<uint8>(bb);
+
+    const int32 status = writer->Write(&b, 1);
+
+    if (status < 0)
+      return status;
+  }
+
+  return 0;
+}
+
+int32 SerializeFloat(IMkvWriter* writer, float f) {
+  if (!writer)
+    return -1;
+
+  assert(sizeof(uint32) == sizeof(float));
+  // This union is merely used to avoid a reinterpret_cast from float& to
+  // uint32&, which would violate strict aliasing.
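+  // (Copying the bytes with memcpy into a uint32 would be an equally valid
+  // alternative to the union.)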
+  union U32 {
+    uint32 u32;
+    float f;
+  } value;
+  value.f = f;
+
+  for (int32 i = 1; i <= 4; ++i) {
+    const int32 byte_count = 4 - i;
+    const int32 bit_count = byte_count * 8;
+
+    const uint8 byte = static_cast<uint8>(value.u32 >> bit_count);
+
+    const int32 status = writer->Write(&byte, 1);
+
+    if (status < 0)
+      return status;
+  }
+
+  return 0;
+}
+
+int32 WriteUInt(IMkvWriter* writer, uint64 value) {
+  if (!writer)
+    return -1;
+
+  int32 size = GetCodedUIntSize(value);
+
+  return WriteUIntSize(writer, value, size);
+}
+
+int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) {
+  if (!writer || size < 0 || size > 8)
+    return -1;
+
+  if (size > 0) {
+    const uint64 bit = 1LL << (size * 7);
+
+    if (value > (bit - 2))
+      return -1;
+
+    value |= bit;
+  } else {
+    size = 1;
+    int64 bit;
+
+    for (;;) {
+      bit = 1LL << (size * 7);
+      const uint64 max = bit - 2;
+
+      if (value <= max)
+        break;
+
+      ++size;
+    }
+
+    if (size > 8)
+      return -1;
+
+    value |= bit;
+  }
+
+  return SerializeInt(writer, value, size);
+}
+
+int32 WriteID(IMkvWriter* writer, uint64 type) {
+  if (!writer)
+    return -1;
+
+  writer->ElementStartNotify(type, writer->Position());
+
+  const int32 size = GetUIntSize(type);
+
+  return SerializeInt(writer, type, size);
+}
+
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  if (WriteUInt(writer, size))
+    return false;
+
+  return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value) {
+  return WriteEbmlElement(writer, type, value, 0);
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value,
+                      uint64 fixed_size) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  uint64 size = GetUIntSize(value);
+  if (fixed_size > 0) {
+    if (size > fixed_size)
+      return false;
+    size = fixed_size;
+  }
+  if (WriteUInt(writer, size))
+    return false;
+
+  if (SerializeInt(writer, value, static_cast<int32>(size)))
+    return false;
+
+  return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  const uint64 size = GetIntSize(value);
+  if (WriteUInt(writer, size))
+    return false;
+
+  if (SerializeInt(writer, value, static_cast<int32>(size)))
+    return false;
+
+  return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  if (WriteUInt(writer, 4))
+    return false;
+
+  if (SerializeFloat(writer, value))
+    return false;
+
+  return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) {
+  if (!writer || !value)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  const uint64 length = strlen(value);
+  if (WriteUInt(writer, length))
+    return false;
+
+  if (writer->Write(value, static_cast<uint32>(length)))
+    return false;
+
+  return true;
+}
+
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
+                      uint64 size) {
+  if (!writer || !value || size < 1)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  if (WriteUInt(writer, size))
+    return false;
+
+  if (writer->Write(value, static_cast<uint32>(size)))
+    return false;
+
+  return true;
+}
+
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) {
+  if (!writer)
+    return false;
+
+  if (WriteID(writer, type))
+    return false;
+
+  if (WriteUInt(writer, kDateElementSize))
+    return false;
+
+  if (SerializeInt(writer, value, kDateElementSize))
+    return false;
+
+  return true;
+}
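+
+// A hedged usage sketch for the date helper (Matroska's DateUTC element
+// stores signed nanoseconds relative to 2001-01-01T00:00:00 UTC;
+// |date_utc_ns| is a hypothetical value in that epoch):
+//   WriteEbmlDateElement(writer, libwebm::kMkvDateUTC, date_utc_ns);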
+
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+                  Cluster* cluster) {
+  if (!writer || !frame || !frame->IsValid() || !cluster ||
+      !cluster->timecode_scale())
+    return 0;
+
+  // Technically the timecode for a block can be less than the
+  // timecode for the cluster itself (remember that block timecode
+  // is a signed, 16-bit integer). However, as a simplification we
+  // only permit non-negative cluster-relative timecodes for blocks.
+  const int64 relative_timecode = cluster->GetRelativeTimecode(
+      frame->timestamp() / cluster->timecode_scale());
+  if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode)
+    return 0;
+
+  return frame->CanBeSimpleBlock()
+             ? WriteSimpleBlock(writer, frame, relative_timecode)
+             : WriteBlock(writer, frame, relative_timecode,
+                          cluster->timecode_scale());
+}
+
+uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) {
+  if (!writer)
+    return 0;
+
+  // Subtract one for the void ID and the coded size.
+  uint64 void_entry_size = size - 1 - GetCodedUIntSize(size - 1);
+  uint64 void_size = EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) +
+                     void_entry_size;
+
+  if (void_size != size)
+    return 0;
+
+  const int64 payload_position = writer->Position();
+  if (payload_position < 0)
+    return 0;
+
+  if (WriteID(writer, libwebm::kMkvVoid))
+    return 0;
+
+  if (WriteUInt(writer, void_entry_size))
+    return 0;
+
+  const uint8 value = 0;
+  for (int32 i = 0; i < static_cast<int32>(void_entry_size); ++i) {
+    if (writer->Write(&value, 1))
+      return 0;
+  }
+
+  const int64 stop_position = writer->Position();
+  if (stop_position < 0 ||
+      stop_position - payload_position != static_cast<int64>(void_size))
+    return 0;
+
+  return void_size;
+}
+
+void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
+  *major = 0;
+  *minor = 2;
+  *build = 1;
+  *revision = 0;
+}
+
+uint64 MakeUID(unsigned int* seed) {
+  uint64 uid = 0;
+
+#ifdef __MINGW32__
+  srand(*seed);
+#endif
+
+  for (int i = 0; i < 7; ++i) {  // avoid problems with 8-byte values
+    uid <<= 8;
+
+// TODO(fgalligan): Move random number generation to platform specific code.
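+// (Entropy source by platform, as the branches below show: MSVC and MinGW
+// fall back to rand(), Android reads from /dev/urandom, and other POSIX
+// systems use rand_r() with the caller's |seed|.)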
+#ifdef _MSC_VER
+    (void)seed;
+    const int32 nn = rand();
+#elif __ANDROID__
+    (void)seed;
+    int32 temp_num = 1;
+    int fd = open("/dev/urandom", O_RDONLY);
+    if (fd != -1) {
+      read(fd, &temp_num, sizeof(temp_num));
+      close(fd);
+    }
+    const int32 nn = temp_num;
+#elif defined __MINGW32__
+    const int32 nn = rand();
+#else
+    const int32 nn = rand_r(seed);
+#endif
+    const int32 n = 0xFF & (nn >> 4);  // throw away low-order bits
+
+    uid |= n;
+  }
+
+  return uid;
+}
+
+bool IsMatrixCoefficientsValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kGbr:
+    case mkvmuxer::Colour::kBt709:
+    case mkvmuxer::Colour::kUnspecifiedMc:
+    case mkvmuxer::Colour::kReserved:
+    case mkvmuxer::Colour::kFcc:
+    case mkvmuxer::Colour::kBt470bg:
+    case mkvmuxer::Colour::kSmpte170MMc:
+    case mkvmuxer::Colour::kSmpte240MMc:
+    case mkvmuxer::Colour::kYcocg:
+    case mkvmuxer::Colour::kBt2020NonConstantLuminance:
+    case mkvmuxer::Colour::kBt2020ConstantLuminance:
+      return true;
+  }
+  return false;
+}
+
+bool IsChromaSitingHorzValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kUnspecifiedCsh:
+    case mkvmuxer::Colour::kLeftCollocated:
+    case mkvmuxer::Colour::kHalfCsh:
+      return true;
+  }
+  return false;
+}
+
+bool IsChromaSitingVertValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kUnspecifiedCsv:
+    case mkvmuxer::Colour::kTopCollocated:
+    case mkvmuxer::Colour::kHalfCsv:
+      return true;
+  }
+  return false;
+}
+
+bool IsColourRangeValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kUnspecifiedCr:
+    case mkvmuxer::Colour::kBroadcastRange:
+    case mkvmuxer::Colour::kFullRange:
+    case mkvmuxer::Colour::kMcTcDefined:
+      return true;
+  }
+  return false;
+}
+
+bool IsTransferCharacteristicsValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kIturBt709Tc:
+    case mkvmuxer::Colour::kUnspecifiedTc:
+    case mkvmuxer::Colour::kReservedTc:
+    case mkvmuxer::Colour::kGamma22Curve:
+    case mkvmuxer::Colour::kGamma28Curve:
+    case mkvmuxer::Colour::kSmpte170MTc:
+    case mkvmuxer::Colour::kSmpte240MTc:
+    case mkvmuxer::Colour::kLinear:
+    case mkvmuxer::Colour::kLog:
+    case mkvmuxer::Colour::kLogSqrt:
+    case mkvmuxer::Colour::kIec6196624:
+    case mkvmuxer::Colour::kIturBt1361ExtendedColourGamut:
+    case mkvmuxer::Colour::kIec6196621:
+    case mkvmuxer::Colour::kIturBt202010bit:
+    case mkvmuxer::Colour::kIturBt202012bit:
+    case mkvmuxer::Colour::kSmpteSt2084:
+    case mkvmuxer::Colour::kSmpteSt4281Tc:
+    case mkvmuxer::Colour::kAribStdB67Hlg:
+      return true;
+  }
+  return false;
+}
+
+bool IsPrimariesValueValid(uint64_t value) {
+  switch (value) {
+    case mkvmuxer::Colour::kReservedP0:
+    case mkvmuxer::Colour::kIturBt709P:
+    case mkvmuxer::Colour::kUnspecifiedP:
+    case mkvmuxer::Colour::kReservedP3:
+    case mkvmuxer::Colour::kIturBt470M:
+    case mkvmuxer::Colour::kIturBt470Bg:
+    case mkvmuxer::Colour::kSmpte170MP:
+    case mkvmuxer::Colour::kSmpte240MP:
+    case mkvmuxer::Colour::kFilm:
+    case mkvmuxer::Colour::kIturBt2020:
+    case mkvmuxer::Colour::kSmpteSt4281P:
+    case mkvmuxer::Colour::kJedecP22Phosphors:
+      return true;
+  }
+  return false;
+}
+
+}  // namespace mkvmuxer
diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
new file mode 100644
index 000000000..3355428bd
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvmuxerutil.h
@@ -0,0 +1,115 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+#ifndef MKVMUXER_MKVMUXERUTIL_H_
+#define MKVMUXER_MKVMUXERUTIL_H_
+
+#include "mkvmuxertypes.h"
+
+#include "stdint.h"
+
+namespace mkvmuxer {
+class Cluster;
+class Frame;
+class IMkvWriter;
+
+// TODO(tomfinegan): mkvmuxer:: integer types continue to be used here because
+// changing them causes pain for downstream projects. It would be nice to find
+// a solution that allows removal of the mkvmuxer:: integer types while
+// avoiding pain for downstream users of libwebm. Considering that
+// mkvmuxerutil.{cc,h} are really, for the great majority of cases, EBML size
+// calculation and writer functions, perhaps a more EBML focused utility would
+// be the way to go as a first step.
+
+const uint64 kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL;
+const int64 kMaxBlockTimecode = 0x07FFFLL;
+
+// Writes out |value| in Big Endian order. Returns 0 on success.
+int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size);
+
+// Writes out |f| in Big Endian order. Returns 0 on success.
+int32 SerializeFloat(IMkvWriter* writer, float f);
+
+// Returns the size in bytes of the element.
+int32 GetUIntSize(uint64 value);
+int32 GetIntSize(int64 value);
+int32 GetCodedUIntSize(uint64 value);
+uint64 EbmlMasterElementSize(uint64 type, uint64 value);
+uint64 EbmlElementSize(uint64 type, int64 value);
+uint64 EbmlElementSize(uint64 type, uint64 value);
+uint64 EbmlElementSize(uint64 type, float value);
+uint64 EbmlElementSize(uint64 type, const char* value);
+uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size);
+uint64 EbmlDateElementSize(uint64 type);
+
+// Returns the size in bytes of the element assuming that the element was
+// written using |fixed_size| bytes. If |fixed_size| is set to zero, then it
+// computes the necessary number of bytes based on |value|.
+uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size);
+
+// Creates an EBML coded number from |value| and writes it out. The size of
+// the coded number is determined by the value of |value|. |value| must not
+// be in a coded form. Returns 0 on success.
+int32 WriteUInt(IMkvWriter* writer, uint64 value);
+
+// Creates an EBML coded number from |value| and writes it out. The size of
+// the coded number is determined by the value of |size|. |value| must not
+// be in a coded form. Returns 0 on success.
+int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size);
+
+// Output an Mkv master element. Returns true if the element was written.
+bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 value, uint64 size);
+
+// Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the
+// ID to |SerializeInt|. Returns 0 on success.
+int32 WriteID(IMkvWriter* writer, uint64 type);
+
+// Output an Mkv non-master element. Returns true if the element was written.
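+// For example, writing the default nanosecond timecode scale might look like
+// this (a hedged sketch):
+//   WriteEbmlElement(writer, libwebm::kMkvTimecodeScale, 1000000ULL);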
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value);
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value,
+                      uint64 size);
+bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value);
+
+// Output an Mkv non-master element using fixed size. The element will be
+// written out using exactly |fixed_size| bytes. If |fixed_size| is set to
+// zero, then it computes the necessary number of bytes based on |value|.
+// Returns true if the element was written.
+bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value,
+                      uint64 fixed_size);
+
+// Output an Mkv Frame. It decides the correct element to write (Block vs
+// SimpleBlock) based on the parameters of the Frame.
+uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame,
+                  Cluster* cluster);
+
+// Output a void element. |size| must be the entire size in bytes that will be
+// void. The function will calculate the size of the void header and subtract
+// it from |size|.
+uint64 WriteVoidElement(IMkvWriter* writer, uint64 size);
+
+// Returns the version number of the muxer in |major|, |minor|, |build|,
+// and |revision|.
+void GetVersion(int32* major, int32* minor, int32* build, int32* revision);
+
+// Returns a random number to be used for UID, using |seed| to seed
+// the random-number generator (see POSIX rand_r() for semantics).
+uint64 MakeUID(unsigned int* seed);
+
+// Colour field validation helpers. All return true when |value| is valid.
+bool IsMatrixCoefficientsValueValid(uint64_t value);
+bool IsChromaSitingHorzValueValid(uint64_t value);
+bool IsChromaSitingVertValueValid(uint64_t value);
+bool IsColourRangeValueValid(uint64_t value);
+bool IsTransferCharacteristicsValueValid(uint64_t value);
+bool IsPrimariesValueValid(uint64_t value);
+
+}  // namespace mkvmuxer
+
+#endif  // MKVMUXER_MKVMUXERUTIL_H_
diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.cc b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.cc
new file mode 100644
index 000000000..d668384d8
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.cc
@@ -0,0 +1,92 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#include "mkvmuxer/mkvwriter.h"
+
+#include <sys/types.h>
+
+#ifdef _MSC_VER
+#include <share.h>  // for _SH_DENYWR
+#endif
+
+namespace mkvmuxer {
+
+MkvWriter::MkvWriter() : file_(NULL), writer_owns_file_(true) {}
+
+MkvWriter::MkvWriter(FILE* fp) : file_(fp), writer_owns_file_(false) {}
+
+MkvWriter::~MkvWriter() { Close(); }
+
+int32 MkvWriter::Write(const void* buffer, uint32 length) {
+  if (!file_)
+    return -1;
+
+  if (length == 0)
+    return 0;
+
+  if (buffer == NULL)
+    return -1;
+
+  const size_t bytes_written = fwrite(buffer, 1, length, file_);
+
+  return (bytes_written == length) ? 0 : -1;
+}
+
+bool MkvWriter::Open(const char* filename) {
+  if (filename == NULL)
+    return false;
+
+  if (file_)
+    return false;
+
+#ifdef _MSC_VER
+  file_ = _fsopen(filename, "wb", _SH_DENYWR);
+#else
+  file_ = fopen(filename, "wb");
+#endif
+  if (file_ == NULL)
+    return false;
+  return true;
+}
+
+void MkvWriter::Close() {
+  if (file_ && writer_owns_file_) {
+    fclose(file_);
+  }
+  file_ = NULL;
+}
+
+int64 MkvWriter::Position() const {
+  if (!file_)
+    return 0;
+
+#ifdef _MSC_VER
+  return _ftelli64(file_);
+#else
+  return ftell(file_);
+#endif
+}
+
+int32 MkvWriter::Position(int64 position) {
+  if (!file_)
+    return -1;
+
+#ifdef _MSC_VER
+  return _fseeki64(file_, position, SEEK_SET);
+#elif defined(_WIN32)
+  return fseeko64(file_, static_cast<off64_t>(position), SEEK_SET);
+#else
+  return fseeko(file_, static_cast<off_t>(position), SEEK_SET);
+#endif
+}
+
+bool MkvWriter::Seekable() const { return true; }
+
+void MkvWriter::ElementStartNotify(uint64, int64) {}
+
+}  // namespace mkvmuxer
diff --git a/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.h b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.h
new file mode 100644
index 000000000..4227c6374
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvmuxer/mkvwriter.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+
+#ifndef MKVMUXER_MKVWRITER_H_
+#define MKVMUXER_MKVWRITER_H_
+
+#include <stdio.h>
+
+#include "mkvmuxer/mkvmuxer.h"
+#include "mkvmuxer/mkvmuxertypes.h"
+
+namespace mkvmuxer {
+
+// Default implementation of the IMkvWriter interface on Windows.
+class MkvWriter : public IMkvWriter {
+ public:
+  MkvWriter();
+  explicit MkvWriter(FILE* fp);
+  virtual ~MkvWriter();
+
+  // IMkvWriter interface
+  virtual int64 Position() const;
+  virtual int32 Position(int64 position);
+  virtual bool Seekable() const;
+  virtual int32 Write(const void* buffer, uint32 length);
+  virtual void ElementStartNotify(uint64 element_id, int64 position);
+
+  // Creates and opens a file for writing. |filename| is the name of the file
+  // to open. This function will overwrite the contents of |filename|. Returns
+  // true on success.
+  bool Open(const char* filename);
+
+  // Closes an opened file.
+  void Close();
+
+ private:
+  // File handle to output file.
+  FILE* file_;
+  bool writer_owns_file_;
+
+  LIBWEBM_DISALLOW_COPY_AND_ASSIGN(MkvWriter);
+};
+
+}  // namespace mkvmuxer
+
+#endif  // MKVMUXER_MKVWRITER_H_
diff --git a/libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.cc b/libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.cc
new file mode 100644
index 000000000..ace65bd59
--- /dev/null
+++ b/libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.cc
@@ -0,0 +1,8071 @@
+// Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the LICENSE file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
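+
+// mkvparser implements a pull parser for Matroska/WebM files: every element
+// is read as an (ID, size, payload) triple, with the variable-length ID and
+// size fields decoded by ReadID() and ReadUInt() below.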
+#include "mkvparser/mkvparser.h"
+
+#if defined(_MSC_VER) && _MSC_VER < 1800
+#include <float.h>  // _isnan() / _finite()
+#define MSC_COMPAT
+#endif
+
+#include <cassert>
+#include <cfloat>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <memory>
+#include <new>
+
+#include "common/webmids.h"
+
+namespace mkvparser {
+const long long kStringElementSizeLimit = 20 * 1000 * 1000;
+const float MasteringMetadata::kValueNotPresent = FLT_MAX;
+const long long Colour::kValueNotPresent = LLONG_MAX;
+const float Projection::kValueNotPresent = FLT_MAX;
+
+#ifdef MSC_COMPAT
+inline bool isnan(double val) { return !!_isnan(val); }
+inline bool isinf(double val) { return !_finite(val); }
+#else
+inline bool isnan(double val) { return std::isnan(val); }
+inline bool isinf(double val) { return std::isinf(val); }
+#endif  // MSC_COMPAT
+
+template <typename Type>
+Type* SafeArrayAlloc(unsigned long long num_elements,
+                     unsigned long long element_size) {
+  if (num_elements == 0 || element_size == 0)
+    return NULL;
+
+  const size_t kMaxAllocSize = 0x80000000;  // 2GiB
+  const unsigned long long num_bytes = num_elements * element_size;
+  if (element_size > (kMaxAllocSize / num_elements))
+    return NULL;
+  if (num_bytes != static_cast<size_t>(num_bytes))
+    return NULL;
+
+  return new (std::nothrow) Type[static_cast<size_t>(num_bytes)];
+}
+
+void GetVersion(int& major, int& minor, int& build, int& revision) {
+  major = 1;
+  minor = 0;
+  build = 0;
+  revision = 30;
+}
+
+long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
+  if (!pReader || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  len = 1;
+  unsigned char b;
+  int status = pReader->Read(pos, 1, &b);
+
+  if (status < 0)  // error or underflow
+    return status;
+
+  if (status > 0)  // interpreted as "underflow"
+    return E_BUFFER_NOT_FULL;
+
+  if (b == 0)  // we can't handle u-int values larger than 8 bytes
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char m = 0x80;
+
+  while (!(b & m)) {
+    m >>= 1;
+    ++len;
+  }
+
+  long long result = b & (~m);
+  ++pos;
+
+  for (int i = 1; i < len; ++i) {
+    status = pReader->Read(pos, 1, &b);
+
+    if (status < 0) {
+      len = 1;
+      return status;
+    }
+
+    if (status > 0) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result <<= 8;
+    result |= b;
+
+    ++pos;
+  }
+
+  return result;
+}
+
+// Reads an EBML ID and returns it.
+// An ID must be at least 1 byte long, cannot exceed 4, and its value must be
+// greater than 0.
+// See known EBML values and EBMLMaxIDLength:
+// http://www.matroska.org/technical/specs/index.html
+// Returns the ID, or a value less than 0 to report an error while reading the
+// ID.
+long long ReadID(IMkvReader* pReader, long long pos, long& len) {
+  if (pReader == NULL || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  // Read the first byte. The length in bytes of the ID is determined by
+  // finding the first set bit in the first byte of the ID.
+  unsigned char temp_byte = 0;
+  int read_status = pReader->Read(pos, 1, &temp_byte);
+
+  if (read_status < 0)
+    return E_FILE_FORMAT_INVALID;
+  else if (read_status > 0)  // No data to read.
+    return E_BUFFER_NOT_FULL;
+
+  if (temp_byte == 0)  // ID length > 8 bytes; invalid file.
+    return E_FILE_FORMAT_INVALID;
+
+  int bit_pos = 0;
+  const int kMaxIdLengthInBytes = 4;
+  const int kCheckByte = 0x80;
+
+  // Find the first bit that's set.
+  bool found_bit = false;
+  for (; bit_pos < kMaxIdLengthInBytes; ++bit_pos) {
+    if ((kCheckByte >> bit_pos) & temp_byte) {
+      found_bit = true;
+      break;
+    }
+  }
+
+  if (!found_bit) {
+    // The value is too large to be a valid ID.
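+    // (Matroska/WebM caps element IDs at kMaxIdLengthInBytes == 4; a first
+    // byte with no marker bit in its top four positions would imply a
+    // longer ID.)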
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  // Read the remaining bytes of the ID (if any).
+  const int id_length = bit_pos + 1;
+  long long ebml_id = temp_byte;
+  for (int i = 1; i < id_length; ++i) {
+    ebml_id <<= 8;
+    read_status = pReader->Read(pos + i, 1, &temp_byte);
+
+    if (read_status < 0)
+      return E_FILE_FORMAT_INVALID;
+    else if (read_status > 0)
+      return E_BUFFER_NOT_FULL;
+
+    ebml_id |= temp_byte;
+  }
+
+  len = id_length;
+  return ebml_id;
+}
+
+long long GetUIntLength(IMkvReader* pReader, long long pos, long& len) {
+  if (!pReader || pos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  long long total, available;
+
+  int status = pReader->Length(&total, &available);
+  if (status < 0 || (total >= 0 && available > total))
+    return E_FILE_FORMAT_INVALID;
+
+  len = 1;
+
+  if (pos >= available)
+    return pos;  // too few bytes available
+
+  unsigned char b;
+
+  status = pReader->Read(pos, 1, &b);
+
+  if (status != 0)
+    return status;
+
+  if (b == 0)  // we can't handle u-int values larger than 8 bytes
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char m = 0x80;
+
+  while (!(b & m)) {
+    m >>= 1;
+    ++len;
+  }
+
+  return 0;  // success
+}
+
+// TODO(vigneshv): This function assumes that unsigned values never have their
+// high bit set.
+long long UnserializeUInt(IMkvReader* pReader, long long pos, long long size) {
+  if (!pReader || pos < 0 || (size <= 0) || (size > 8))
+    return E_FILE_FORMAT_INVALID;
+
+  long long result = 0;
+
+  for (long long i = 0; i < size; ++i) {
+    unsigned char b;
+
+    const long status = pReader->Read(pos, 1, &b);
+
+    if (status < 0)
+      return status;
+
+    result <<= 8;
+    result |= b;
+
+    ++pos;
+  }
+
+  return result;
+}
+
+long UnserializeFloat(IMkvReader* pReader, long long pos, long long size_,
+                      double& result) {
+  if (!pReader || pos < 0 || ((size_ != 4) && (size_ != 8)))
+    return E_FILE_FORMAT_INVALID;
+
+  const long size = static_cast<long>(size_);
+
+  unsigned char buf[8];
+
+  const int status = pReader->Read(pos, size, buf);
+
+  if (status < 0)  // error
+    return status;
+
+  if (size == 4) {
+    union {
+      float f;
+      unsigned long ff;
+    };
+
+    ff = 0;
+
+    for (int i = 0;;) {
+      ff |= buf[i];
+
+      if (++i >= 4)
+        break;
+
+      ff <<= 8;
+    }
+
+    result = f;
+  } else {
+    union {
+      double d;
+      unsigned long long dd;
+    };
+
+    dd = 0;
+
+    for (int i = 0;;) {
+      dd |= buf[i];
+
+      if (++i >= 8)
+        break;
+
+      dd <<= 8;
+    }
+
+    result = d;
+  }
+
+  if (mkvparser::isinf(result) || mkvparser::isnan(result))
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+long UnserializeInt(IMkvReader* pReader, long long pos, long long size,
+                    long long& result_ref) {
+  if (!pReader || pos < 0 || size < 1 || size > 8)
+    return E_FILE_FORMAT_INVALID;
+
+  signed char first_byte = 0;
+  const long status = pReader->Read(pos, 1, (unsigned char*)&first_byte);
+
+  if (status < 0)
+    return status;
+
+  unsigned long long result = first_byte;
+  ++pos;
+
+  for (long i = 1; i < size; ++i) {
+    unsigned char b;
+
+    const long status = pReader->Read(pos, 1, &b);
+
+    if (status < 0)
+      return status;
+
+    result <<= 8;
+    result |= b;
+
+    ++pos;
+  }
+
+  result_ref = static_cast<long long>(result);
+  return 0;
+}
+
+long UnserializeString(IMkvReader* pReader, long long pos, long long size,
+                       char*& str) {
+  delete[] str;
+  str = NULL;
+
+  if (size >= LONG_MAX || size < 0 || size > kStringElementSizeLimit)
+    return E_FILE_FORMAT_INVALID;
+
+  // +1 for '\0' terminator
+  const long required_size = static_cast<long>(size) + 1;
+
+  str = SafeArrayAlloc<char>(1, required_size);
+  if (str == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  unsigned char* const buf = reinterpret_cast<unsigned char*>(str);
+
+  const long status = pReader->Read(pos, static_cast<long>(size), buf);
+
+  if (status) {
+    delete[] str;
+    str = NULL;
+
+    return status;
+  }
+
+  str[required_size - 1] = '\0';
+  return 0;
+}
+
+long ParseElementHeader(IMkvReader* pReader, long long& pos, long long stop,
+                        long long& id, long long& size) {
+  if (stop >= 0 && pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+
+  long len;
+
+  id = ReadID(pReader, pos, len);
+
+  if (id < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume id
+
+  if (stop >= 0 && pos >= stop)
+    return E_FILE_FORMAT_INVALID;
+
+  size = ReadUInt(pReader, pos, len);
+
+  if (size < 0 || len < 1 || len > 8) {
+    // Invalid: Negative payload size, negative or 0 length integer, or integer
+    // larger than 64 bits (libwebm cannot handle them).
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  // Avoid rolling over pos when very close to LLONG_MAX.
+  const unsigned long long rollover_check =
+      static_cast<unsigned long long>(pos) + len;
+  if (rollover_check > LLONG_MAX)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume length of size
+
+  // pos now designates payload
+
+  if (stop >= 0 && pos > stop)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
+}
+
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+           long long& val) {
+  if (!pReader || pos < 0)
+    return false;
+
+  long long total = 0;
+  long long available = 0;
+
+  const long status = pReader->Length(&total, &available);
+  if (status < 0 || (total >= 0 && available > total))
+    return false;
+
+  long len = 0;
+
+  const long long id = ReadID(pReader, pos, len);
+  if (id < 0 || (available - pos) > len)
+    return false;
+
+  if (static_cast<unsigned long>(id) != expected_id)
+    return false;
+
+  pos += len;  // consume id
+
+  const long long size = ReadUInt(pReader, pos, len);
+  if (size < 0 || size > 8 || len < 1 || len > 8 || (available - pos) > len)
+    return false;
+
+  pos += len;  // consume length of size of payload
+
+  val = UnserializeUInt(pReader, pos, size);
+  if (val < 0)
+    return false;
+
+  pos += size;  // consume size of payload
+
+  return true;
+}
+
+bool Match(IMkvReader* pReader, long long& pos, unsigned long expected_id,
+           unsigned char*& buf, size_t& buflen) {
+  if (!pReader || pos < 0)
+    return false;
+
+  long long total = 0;
+  long long available = 0;
+
+  long status = pReader->Length(&total, &available);
+  if (status < 0 || (total >= 0 && available > total))
+    return false;
+
+  long len = 0;
+  const long long id = ReadID(pReader, pos, len);
+  if (id < 0 || (available - pos) > len)
+    return false;
+
+  if (static_cast<unsigned long>(id) != expected_id)
+    return false;
+
+  pos += len;  // consume id
+
+  const long long size = ReadUInt(pReader, pos, len);
+  if (size < 0 || len <= 0 || len > 8 || (available - pos) > len)
+    return false;
+
+  unsigned long long rollover_check =
+      static_cast<unsigned long long>(pos) + len;
+  if (rollover_check > LLONG_MAX)
+    return false;
+
+  pos += len;  // consume length of size of payload
+
+  rollover_check = static_cast<unsigned long long>(pos) + size;
+  if (rollover_check > LLONG_MAX)
+    return false;
+
+  if ((pos + size) > available)
+    return false;
+
+  if (size >= LONG_MAX)
+    return false;
+
+  const long buflen_ = static_cast<long>(size);
+
+  buf = SafeArrayAlloc<unsigned char>(1, buflen_);
+  if (!buf)
+    return false;
+
+  status = pReader->Read(pos, buflen_, buf);
+  if (status != 0)
+    return false;
+
+  buflen = buflen_;
+
+  pos += size;  // consume size of payload
+  return true;
+}
+
+EBMLHeader::EBMLHeader() : m_docType(NULL) { Init(); }
+
+EBMLHeader::~EBMLHeader() { delete[] m_docType; }
+
+void EBMLHeader::Init() {
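+  // Reset to the library defaults: EBML version 1, IDs of at most 4 bytes,
+  // sizes of at most 8 bytes, and no DocType.
+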
m_version = 1; + m_readVersion = 1; + m_maxIdLength = 4; + m_maxSizeLength = 8; + + if (m_docType) { + delete[] m_docType; + m_docType = NULL; + } + + m_docTypeVersion = 1; + m_docTypeReadVersion = 1; +} + +long long EBMLHeader::Parse(IMkvReader* pReader, long long& pos) { + if (!pReader) + return E_FILE_FORMAT_INVALID; + + long long total, available; + + long status = pReader->Length(&total, &available); + + if (status < 0) // error + return status; + + pos = 0; + + // Scan until we find what looks like the first byte of the EBML header. + const long long kMaxScanBytes = (available >= 1024) ? 1024 : available; + const unsigned char kEbmlByte0 = 0x1A; + unsigned char scan_byte = 0; + + while (pos < kMaxScanBytes) { + status = pReader->Read(pos, 1, &scan_byte); + + if (status < 0) // error + return status; + else if (status > 0) + return E_BUFFER_NOT_FULL; + + if (scan_byte == kEbmlByte0) + break; + + ++pos; + } + + long len = 0; + const long long ebml_id = ReadID(pReader, pos, len); + + if (ebml_id == E_BUFFER_NOT_FULL) + return E_BUFFER_NOT_FULL; + + if (len != 4 || ebml_id != libwebm::kMkvEBML) + return E_FILE_FORMAT_INVALID; + + // Move read pos forward to the EBML header size field. + pos += 4; + + // Read length of size field. + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return E_FILE_FORMAT_INVALID; + else if (result > 0) // need more data + return E_BUFFER_NOT_FULL; + + if (len < 1 || len > 8) + return E_FILE_FORMAT_INVALID; + + if ((total >= 0) && ((total - pos) < len)) + return E_FILE_FORMAT_INVALID; + + if ((available - pos) < len) + return pos + len; // try again later + + // Read the EBML header size. + result = ReadUInt(pReader, pos, len); + + if (result < 0) // error + return result; + + pos += len; // consume size field + + // pos now designates start of payload + + if ((total >= 0) && ((total - pos) < result)) + return E_FILE_FORMAT_INVALID; + + if ((available - pos) < result) + return pos + result; + + const long long end = pos + result; + + Init(); + + while (pos < end) { + long long id, size; + + status = ParseElementHeader(pReader, pos, end, id, size); + + if (status < 0) // error + return status; + + if (size == 0) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvEBMLVersion) { + m_version = UnserializeUInt(pReader, pos, size); + + if (m_version <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvEBMLReadVersion) { + m_readVersion = UnserializeUInt(pReader, pos, size); + + if (m_readVersion <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvEBMLMaxIDLength) { + m_maxIdLength = UnserializeUInt(pReader, pos, size); + + if (m_maxIdLength <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvEBMLMaxSizeLength) { + m_maxSizeLength = UnserializeUInt(pReader, pos, size); + + if (m_maxSizeLength <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvDocType) { + if (m_docType) + return E_FILE_FORMAT_INVALID; + + status = UnserializeString(pReader, pos, size, m_docType); + + if (status) // error + return status; + } else if (id == libwebm::kMkvDocTypeVersion) { + m_docTypeVersion = UnserializeUInt(pReader, pos, size); + + if (m_docTypeVersion <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvDocTypeReadVersion) { + m_docTypeReadVersion = UnserializeUInt(pReader, pos, size); + + if (m_docTypeReadVersion <= 0) + return E_FILE_FORMAT_INVALID; + } + + pos += size; + } + + if (pos != end) + return E_FILE_FORMAT_INVALID; + + // Make sure DocType, 
DocTypeReadVersion, and DocTypeVersion are valid.
+  if (m_docType == NULL || m_docTypeReadVersion <= 0 || m_docTypeVersion <= 0)
+    return E_FILE_FORMAT_INVALID;
+
+  // Make sure EBMLMaxIDLength and EBMLMaxSizeLength are valid.
+  if (m_maxIdLength <= 0 || m_maxIdLength > 4 || m_maxSizeLength <= 0 ||
+      m_maxSizeLength > 8)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;
+}
+
+Segment::Segment(IMkvReader* pReader, long long elem_start,
+                 // long long elem_size,
+                 long long start, long long size)
+    : m_pReader(pReader),
+      m_element_start(elem_start),
+      // m_element_size(elem_size),
+      m_start(start),
+      m_size(size),
+      m_pos(start),
+      m_pUnknownSize(0),
+      m_pSeekHead(NULL),
+      m_pInfo(NULL),
+      m_pTracks(NULL),
+      m_pCues(NULL),
+      m_pChapters(NULL),
+      m_pTags(NULL),
+      m_clusters(NULL),
+      m_clusterCount(0),
+      m_clusterPreloadCount(0),
+      m_clusterSize(0) {}
+
+Segment::~Segment() {
+  const long count = m_clusterCount + m_clusterPreloadCount;
+
+  Cluster** i = m_clusters;
+  Cluster** j = m_clusters + count;
+
+  while (i != j) {
+    Cluster* const p = *i++;
+    delete p;
+  }
+
+  delete[] m_clusters;
+
+  delete m_pTracks;
+  delete m_pInfo;
+  delete m_pCues;
+  delete m_pChapters;
+  delete m_pTags;
+  delete m_pSeekHead;
+}
+
+long long Segment::CreateInstance(IMkvReader* pReader, long long pos,
+                                  Segment*& pSegment) {
+  if (pReader == NULL || pos < 0)
+    return E_PARSE_FAILED;
+
+  pSegment = NULL;
+
+  long long total, available;
+
+  const long status = pReader->Length(&total, &available);
+
+  if (status < 0)  // error
+    return status;
+
+  if (available < 0)
+    return -1;
+
+  if ((total >= 0) && (available > total))
+    return -1;
+
+  // I would assume that in practice this loop would execute
+  // exactly once, but we allow for other elements (e.g. Void)
+  // to immediately follow the EBML header. This is fine for
+  // the source filter case (since the entire file is available),
+  // but in the splitter case over a network we should probably
+  // just give up early. We could for example decide only to
+  // execute this loop a maximum of, say, 10 times.
+  // TODO:
+  // There is an implied "give up early" by only parsing up
+  // to the available limit. We do do that, but only if the
+  // total file size is unknown. We could decide to always
+  // use what's available as our limit (irrespective of whether
+  // we happen to know the total file length). This would have
+  // as its sense "parse this much of the file before giving up",
+  // which is a slightly different sense from "try to parse up to
+  // 10 EBML elements before giving up".
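+  //
+  // Note on the "unknown size" checks in the loop below: an EBML size field
+  // whose value bits are all ones is reserved to mean "size unknown". For a
+  // size field of len bytes that reserved value is (1LL << (7 * len)) - 1;
+  // e.g. for len == 1 the coded byte 0xFF decodes to 127.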
+ + for (;;) { + if ((total >= 0) && (pos >= total)) + return E_FILE_FORMAT_INVALID; + + // Read ID + long len; + long long result = GetUIntLength(pReader, pos, len); + + if (result) // error, or too few available bytes + return result; + + if ((total >= 0) && ((pos + len) > total)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > available) + return pos + len; + + const long long idpos = pos; + const long long id = ReadID(pReader, pos, len); + + if (id < 0) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume ID + + // Read Size + + result = GetUIntLength(pReader, pos, len); + + if (result) // error, or too few available bytes + return result; + + if ((total >= 0) && ((pos + len) > total)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > available) + return pos + len; + + long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return size; + + pos += len; // consume length of size of element + + // Pos now points to start of payload + + // Handle "unknown size" for live streaming of webm files. + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (id == libwebm::kMkvSegment) { + if (size == unknown_size) + size = -1; + + else if (total < 0) + size = -1; + + else if ((pos + size) > total) + size = -1; + + pSegment = new (std::nothrow) Segment(pReader, idpos, pos, size); + if (pSegment == NULL) + return E_PARSE_FAILED; + + return 0; // success + } + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + if ((total >= 0) && ((pos + size) > total)) + return E_FILE_FORMAT_INVALID; + + if ((pos + size) > available) + return pos + size; + + pos += size; // consume payload + } +} + +long long Segment::ParseHeaders() { + // Outermost (level 0) segment object has been constructed, + // and pos designates start of payload. We need to find the + // inner (level 1) elements. + long long total, available; + + const int status = m_pReader->Length(&total, &available); + + if (status < 0) // error + return status; + + if (total > 0 && available > total) + return E_FILE_FORMAT_INVALID; + + const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size; + + if ((segment_stop >= 0 && total >= 0 && segment_stop > total) || + (segment_stop >= 0 && m_pos > segment_stop)) { + return E_FILE_FORMAT_INVALID; + } + + for (;;) { + if ((total >= 0) && (m_pos >= total)) + break; + + if ((segment_stop >= 0) && (m_pos >= segment_stop)) + break; + + long long pos = m_pos; + const long long element_start = pos; + + // Avoid rolling over pos when very close to LLONG_MAX. + unsigned long long rollover_check = pos + 1ULL; + if (rollover_check > LLONG_MAX) + return E_FILE_FORMAT_INVALID; + + if ((pos + 1) > available) + return (pos + 1); + + long len; + long long result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return result; + + if (result > 0) { + // MkvReader doesn't have enough data to satisfy this read attempt. + return (pos + 1); + } + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > available) + return pos + len; + + const long long idpos = pos; + const long long id = ReadID(m_pReader, idpos, len); + + if (id < 0) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvCluster) + break; + + pos += len; // consume ID + + if ((pos + 1) > available) + return (pos + 1); + + // Read Size + result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return result; + + if (result > 0) { + // MkvReader doesn't have enough data to satisfy this read attempt. 
+      return (pos + 1);
+    }
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > available)
+      return pos + len;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0 || len < 1 || len > 8) {
+      // TODO(tomfinegan): ReadUInt should return an error when len is < 1 or
+      // len > 8 is true instead of checking this _everywhere_.
+      return size;
+    }
+
+    pos += len;  // consume length of size of element
+
+    // Avoid rolling over pos when very close to LLONG_MAX.
+    rollover_check = static_cast<unsigned long long>(pos) + size;
+    if (rollover_check > LLONG_MAX)
+      return E_FILE_FORMAT_INVALID;
+
+    const long long element_size = size + pos - element_start;
+
+    // Pos now points to start of payload
+
+    if ((segment_stop >= 0) && ((pos + size) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    // We read EBML elements either in total or nothing at all.
+
+    if ((pos + size) > available)
+      return pos + size;
+
+    if (id == libwebm::kMkvInfo) {
+      if (m_pInfo)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pInfo = new (std::nothrow)
+          SegmentInfo(this, pos, size, element_start, element_size);
+
+      if (m_pInfo == NULL)
+        return -1;
+
+      const long status = m_pInfo->Parse();
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvTracks) {
+      if (m_pTracks)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pTracks = new (std::nothrow)
+          Tracks(this, pos, size, element_start, element_size);
+
+      if (m_pTracks == NULL)
+        return -1;
+
+      const long status = m_pTracks->Parse();
+
+      if (status)
+        return status;
+    } else if (id == libwebm::kMkvCues) {
+      if (m_pCues == NULL) {
+        m_pCues = new (std::nothrow)
+            Cues(this, pos, size, element_start, element_size);
+
+        if (m_pCues == NULL)
+          return -1;
+      }
+    } else if (id == libwebm::kMkvSeekHead) {
+      if (m_pSeekHead == NULL) {
+        m_pSeekHead = new (std::nothrow)
+            SeekHead(this, pos, size, element_start, element_size);
+
+        if (m_pSeekHead == NULL)
+          return -1;
+
+        const long status = m_pSeekHead->Parse();
+
+        if (status)
+          return status;
+      }
+    } else if (id == libwebm::kMkvChapters) {
+      if (m_pChapters == NULL) {
+        m_pChapters = new (std::nothrow)
+            Chapters(this, pos, size, element_start, element_size);
+
+        if (m_pChapters == NULL)
+          return -1;
+
+        const long status = m_pChapters->Parse();
+
+        if (status)
+          return status;
+      }
+    } else if (id == libwebm::kMkvTags) {
+      if (m_pTags == NULL) {
+        m_pTags = new (std::nothrow)
+            Tags(this, pos, size, element_start, element_size);
+
+        if (m_pTags == NULL)
+          return -1;
+
+        const long status = m_pTags->Parse();
+
+        if (status)
+          return status;
+      }
+    }
+
+    m_pos = pos + size;  // consume payload
+  }
+
+  if (segment_stop >= 0 && m_pos > segment_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (m_pInfo == NULL)  // TODO: liberalize this behavior
+    return E_FILE_FORMAT_INVALID;
+
+  if (m_pTracks == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  return 0;  // success
+}
+
+long Segment::LoadCluster(long long& pos, long& len) {
+  for (;;) {
+    const long result = DoLoadCluster(pos, len);
+
+    if (result <= 1)
+      return result;
+  }
+}
+
+long Segment::DoLoadCluster(long long& pos, long& len) {
+  if (m_pos < 0)
+    return DoLoadClusterUnknownSize(pos, len);
+
+  long long total, avail;
+
+  long status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  if (total >= 0 && avail > total)
+    return E_FILE_FORMAT_INVALID;
+
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  long long cluster_off = -1;  // offset relative to start of segment
+  long long cluster_size = -1;  // size of cluster payload
+
+  for (;;) {
+    if ((total >= 0) && (m_pos >= total))
+      return 1;  // no more clusters
+
+    if ((segment_stop >= 0) && (m_pos >= segment_stop))
+      return 1;  // no more clusters
+
+    pos = m_pos;
+
+    // Read ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long idpos = pos;
+    const long long id = ReadID(m_pReader, idpos, len);
+
+    if (id < 0)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume ID
+
+    // Read Size
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(m_pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(m_pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    pos += len;  // consume length of size of element
+
+    // pos now points to start of payload
+
+    if (size == 0) {
+      // Missing element payload: move on.
+      m_pos = pos;
+      continue;
+    }
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if ((segment_stop >= 0) && (size != unknown_size) &&
+        ((pos + size) > segment_stop)) {
+      return E_FILE_FORMAT_INVALID;
+    }
+
+    if (id == libwebm::kMkvCues) {
+      if (size == unknown_size) {
+        // Cues element of unknown size: Not supported.
+        return E_FILE_FORMAT_INVALID;
+      }
+
+      if (m_pCues == NULL) {
+        const long long element_size = (pos - idpos) + size;
+
+        m_pCues = new (std::nothrow) Cues(this, pos, size, idpos, element_size);
+        if (m_pCues == NULL)
+          return -1;
+      }
+
+      m_pos = pos + size;  // consume payload
+      continue;
+    }
+
+    if (id != libwebm::kMkvCluster) {
+      // Besides the Segment, Libwebm allows only cluster elements of unknown
+      // size. Fail the parse upon encountering a non-cluster element reporting
+      // unknown size.
+      if (size == unknown_size)
+        return E_FILE_FORMAT_INVALID;
+
+      m_pos = pos + size;  // consume payload
+      continue;
+    }
+
+    // We have a cluster.
+
+    cluster_off = idpos - m_start;  // relative pos
+
+    if (size != unknown_size)
+      cluster_size = size;
+
+    break;
+  }
+
+  if (cluster_off < 0) {
+    // No cluster, die.
+    return E_FILE_FORMAT_INVALID;
+  }
+
+  long long pos_;
+  long len_;
+
+  status = Cluster::HasBlockEntries(this, cluster_off, pos_, len_);
+
+  if (status < 0) {  // error, or underflow
+    pos = pos_;
+    len = len_;
+
+    return status;
+  }
+
+  // status == 0 means "no block entries found"
+  // status > 0 means "found at least one block entry"
+
+  // TODO:
+  // The issue here is that the segment increments its own
+  // pos ptr past the most recent cluster parsed, and then
+  // starts from there to parse the next cluster. If we
+  // don't know the size of the current cluster, then we
+  // must parse its payload (as we do below), looking
+  // for the cluster (or cues) ID to terminate the parse.
+  // This isn't really what we want: rather, we really need
+  // a way to create the curr cluster object immediately.
+ // The pity is that cluster::parse can determine its own + // boundary, and we largely duplicate that same logic here. + // + // Maybe we need to get rid of our look-ahead preloading + // in source::parse??? + // + // As we're parsing the blocks in the curr cluster + //(in cluster::parse), we should have some way to signal + // to the segment that we have determined the boundary, + // so it can adjust its own segment::m_pos member. + // + // The problem is that we're asserting in asyncreadinit, + // because we adjust the pos down to the curr seek pos, + // and the resulting adjusted len is > 2GB. I'm suspicious + // that this is even correct, but even if it is, we can't + // be loading that much data in the cache anyway. + + const long idx = m_clusterCount; + + if (m_clusterPreloadCount > 0) { + if (idx >= m_clusterSize) + return E_FILE_FORMAT_INVALID; + + Cluster* const pCluster = m_clusters[idx]; + if (pCluster == NULL || pCluster->m_index >= 0) + return E_FILE_FORMAT_INVALID; + + const long long off = pCluster->GetPosition(); + if (off < 0) + return E_FILE_FORMAT_INVALID; + + if (off == cluster_off) { // preloaded already + if (status == 0) // no entries found + return E_FILE_FORMAT_INVALID; + + if (cluster_size >= 0) + pos += cluster_size; + else { + const long long element_size = pCluster->GetElementSize(); + + if (element_size <= 0) + return E_FILE_FORMAT_INVALID; // TODO: handle this case + + pos = pCluster->m_element_start + element_size; + } + + pCluster->m_index = idx; // move from preloaded to loaded + ++m_clusterCount; + --m_clusterPreloadCount; + + m_pos = pos; // consume payload + if (segment_stop >= 0 && m_pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + return 0; // success + } + } + + if (status == 0) { // no entries found + if (cluster_size >= 0) + pos += cluster_size; + + if ((total >= 0) && (pos >= total)) { + m_pos = total; + return 1; // no more clusters + } + + if ((segment_stop >= 0) && (pos >= segment_stop)) { + m_pos = segment_stop; + return 1; // no more clusters + } + + m_pos = pos; + return 2; // try again + } + + // status > 0 means we have an entry + + Cluster* const pCluster = Cluster::Create(this, idx, cluster_off); + if (pCluster == NULL) + return -1; + + if (!AppendCluster(pCluster)) { + delete pCluster; + return -1; + } + + if (cluster_size >= 0) { + pos += cluster_size; + + m_pos = pos; + + if (segment_stop > 0 && m_pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + return 0; + } + + m_pUnknownSize = pCluster; + m_pos = -pos; + + return 0; // partial success, since we have a new cluster + + // status == 0 means "no block entries found" + // pos designates start of payload + // m_pos has NOT been adjusted yet (in case we need to come back here) +} + +long Segment::DoLoadClusterUnknownSize(long long& pos, long& len) { + if (m_pos >= 0 || m_pUnknownSize == NULL) + return E_PARSE_FAILED; + + const long status = m_pUnknownSize->Parse(pos, len); + + if (status < 0) // error or underflow + return status; + + if (status == 0) // parsed a block + return 2; // continue parsing + + const long long start = m_pUnknownSize->m_element_start; + const long long size = m_pUnknownSize->GetElementSize(); + + if (size < 0) + return E_FILE_FORMAT_INVALID; + + pos = start + size; + m_pos = pos; + + m_pUnknownSize = 0; + + return 2; // continue parsing +} + +bool Segment::AppendCluster(Cluster* pCluster) { + if (pCluster == NULL || pCluster->m_index < 0) + return false; + + const long count = m_clusterCount + m_clusterPreloadCount; + + long& size = m_clusterSize; 
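+  // m_clusters stores the m_clusterCount loaded clusters first, followed by
+  // the m_clusterPreloadCount preloaded (not yet indexed) clusters.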
+ const long idx = pCluster->m_index; + + if (size < count || idx != m_clusterCount) + return false; + + if (count >= size) { + const long n = (size <= 0) ? 2048 : 2 * size; + + Cluster** const qq = new (std::nothrow) Cluster*[n]; + if (qq == NULL) + return false; + + Cluster** q = qq; + Cluster** p = m_clusters; + Cluster** const pp = p + count; + + while (p != pp) + *q++ = *p++; + + delete[] m_clusters; + + m_clusters = qq; + size = n; + } + + if (m_clusterPreloadCount > 0) { + Cluster** const p = m_clusters + m_clusterCount; + if (*p == NULL || (*p)->m_index >= 0) + return false; + + Cluster** q = p + m_clusterPreloadCount; + if (q >= (m_clusters + size)) + return false; + + for (;;) { + Cluster** const qq = q - 1; + if ((*qq)->m_index >= 0) + return false; + + *q = *qq; + q = qq; + + if (q == p) + break; + } + } + + m_clusters[idx] = pCluster; + ++m_clusterCount; + return true; +} + +bool Segment::PreloadCluster(Cluster* pCluster, ptrdiff_t idx) { + if (pCluster == NULL || pCluster->m_index >= 0 || idx < m_clusterCount) + return false; + + const long count = m_clusterCount + m_clusterPreloadCount; + + long& size = m_clusterSize; + if (size < count) + return false; + + if (count >= size) { + const long n = (size <= 0) ? 2048 : 2 * size; + + Cluster** const qq = new (std::nothrow) Cluster*[n]; + if (qq == NULL) + return false; + Cluster** q = qq; + + Cluster** p = m_clusters; + Cluster** const pp = p + count; + + while (p != pp) + *q++ = *p++; + + delete[] m_clusters; + + m_clusters = qq; + size = n; + } + + if (m_clusters == NULL) + return false; + + Cluster** const p = m_clusters + idx; + + Cluster** q = m_clusters + count; + if (q < p || q >= (m_clusters + size)) + return false; + + while (q > p) { + Cluster** const qq = q - 1; + + if ((*qq)->m_index >= 0) + return false; + + *q = *qq; + q = qq; + } + + m_clusters[idx] = pCluster; + ++m_clusterPreloadCount; + return true; +} + +long Segment::Load() { + if (m_clusters != NULL || m_clusterSize != 0 || m_clusterCount != 0) + return E_PARSE_FAILED; + + // Outermost (level 0) segment object has been constructed, + // and pos designates start of payload. We need to find the + // inner (level 1) elements. 
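+  //
+  // Load() is the eager path: it parses all the level-1 headers up front and
+  // then calls LoadCluster() repeatedly until every cluster is consumed.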
+
+  const long long header_status = ParseHeaders();
+
+  if (header_status < 0)  // error
+    return static_cast<long>(header_status);
+
+  if (header_status > 0)  // underflow
+    return E_BUFFER_NOT_FULL;
+
+  if (m_pInfo == NULL || m_pTracks == NULL)
+    return E_FILE_FORMAT_INVALID;
+
+  for (;;) {
+    const long status = LoadCluster();
+
+    if (status < 0)  // error
+      return status;
+
+    if (status >= 1)  // no more clusters
+      return 0;
+  }
+}
+
+SeekHead::Entry::Entry() : id(0), pos(0), element_start(0), element_size(0) {}
+
+SeekHead::SeekHead(Segment* pSegment, long long start, long long size_,
+                   long long element_start, long long element_size)
+    : m_pSegment(pSegment),
+      m_start(start),
+      m_size(size_),
+      m_element_start(element_start),
+      m_element_size(element_size),
+      m_entries(0),
+      m_entry_count(0),
+      m_void_elements(0),
+      m_void_element_count(0) {}
+
+SeekHead::~SeekHead() {
+  delete[] m_entries;
+  delete[] m_void_elements;
+}
+
+long SeekHead::Parse() {
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = m_start;
+  const long long stop = m_start + m_size;
+
+  // first count the seek head entries
+
+  int entry_count = 0;
+  int void_element_count = 0;
+
+  while (pos < stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvSeek)
+      ++entry_count;
+    else if (id == libwebm::kMkvVoid)
+      ++void_element_count;
+
+    pos += size;  // consume payload
+
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if (entry_count > 0) {
+    m_entries = new (std::nothrow) Entry[entry_count];
+
+    if (m_entries == NULL)
+      return -1;
+  }
+
+  if (void_element_count > 0) {
+    m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+
+    if (m_void_elements == NULL)
+      return -1;
+  }
+
+  // now parse the entries and void elements
+
+  Entry* pEntry = m_entries;
+  VoidElement* pVoidElement = m_void_elements;
+
+  pos = m_start;
+
+  while (pos < stop) {
+    const long long idpos = pos;
+
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvSeek && entry_count > 0) {
+      if (ParseEntry(pReader, pos, size, pEntry)) {
+        Entry& e = *pEntry++;
+
+        e.element_start = idpos;
+        e.element_size = (pos + size) - idpos;
+      }
+    } else if (id == libwebm::kMkvVoid && void_element_count > 0) {
+      VoidElement& e = *pVoidElement++;
+
+      e.element_start = idpos;
+      e.element_size = (pos + size) - idpos;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+
+  ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
+  assert(count_ >= 0);
+  assert(count_ <= entry_count);
+
+  m_entry_count = static_cast<int>(count_);
+
+  count_ = ptrdiff_t(pVoidElement - m_void_elements);
+  assert(count_ >= 0);
+  assert(count_ <= void_element_count);
+
+  m_void_element_count = static_cast<int>(count_);
+
+  return 0;
+}
+
+int SeekHead::GetCount() const { return m_entry_count; }
+
+const SeekHead::Entry* SeekHead::GetEntry(int idx) const {
+  if (idx < 0)
+    return 0;
+
+  if (idx >= m_entry_count)
+    return 0;
+
+  return m_entries + idx;
+}
+
+int SeekHead::GetVoidElementCount() const { return m_void_element_count; }
+
+const SeekHead::VoidElement* SeekHead::GetVoidElement(int idx) const {
+  if (idx < 0)
+    return 0;
+
+  if (idx >= m_void_element_count)
+    return 0;
+
+  return m_void_elements + idx;
+}
+
+long Segment::ParseCues(long long off, long long& pos, long& len) {
+  if (m_pCues)
+    return 0;  // success
+
+  if (off < 0)
+    return -1;
+
+  long long total, avail;
+
+  const int status = m_pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  pos = m_start + off;
+
+  if ((total < 0) || (pos >= total))
+    return 1;  // don't bother parsing cues
+
+  const long long element_start = pos;
+  const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size;
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long result = GetUIntLength(m_pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // underflow (weird)
+  {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long idpos = pos;
+
+  const long long id = ReadID(m_pReader, idpos, len);
+
+  if (id != libwebm::kMkvCues)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume ID
+  assert((segment_stop < 0) || (pos <= segment_stop));
+
+  // Read Size
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  result = GetUIntLength(m_pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // underflow (weird)
+  {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long size = ReadUInt(m_pReader, pos, len);
+
+  if (size < 0)  // error
+    return static_cast<long>(size);
+
+  if (size == 0)  // weird, although technically not illegal
+    return 1;  // done
+
+  pos += len;  // consume length of size of element
+  assert((segment_stop < 0) || (pos <= segment_stop));
+
+  // Pos now points to start of payload
+
+  const long long element_stop = pos + size;
+
+  if ((segment_stop >= 0) && (element_stop > segment_stop))
+    return E_FILE_FORMAT_INVALID;
+
+  if ((total >= 0) && (element_stop > total))
+    return 1;  // don't bother parsing anymore
+
+  len = static_cast<long>(size);
+
+  if (element_stop > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long element_size = element_stop - element_start;
+
+  m_pCues =
+      new (std::nothrow) Cues(this, pos, size, element_start, element_size);
+  if (m_pCues == NULL)
+    return -1;
+
+  return 0;  // success
+}
+
+bool SeekHead::ParseEntry(IMkvReader* pReader, long long start, long long size_,
+                          Entry* pEntry) {
+  if (size_ <= 0)
+    return false;
+
+  long long pos = start;
+  const long long stop = start + size_;
+
+  long len;
+
+  // parse the container for the level-1 element ID
+
+  const long long seekIdId = ReadID(pReader, pos, len);
+  if (seekIdId < 0)
+    return false;
+
+  if (seekIdId != libwebm::kMkvSeekID)
+    return false;
+
+  if ((pos + len) > stop)
+    return false;
+
+  pos += len;  // consume SeekID id
+
+  const long long seekIdSize = ReadUInt(pReader, pos, len);
+
+  if (seekIdSize <= 0)
+    return false;
+
+  if ((pos + len) > stop)
+    return false;
+
+  pos += len;  // consume size of field
+
+  if ((pos + seekIdSize) > stop)
+    return false;
+
+  pEntry->id = ReadID(pReader, pos, len);  // payload
+
+  if (pEntry->id <= 0)
+    return false;
+
+  if (len != seekIdSize)
+    return false;
+
+  pos += seekIdSize;  // consume SeekID payload
+
+  const long long seekPosId = ReadID(pReader, pos, len);
+
+  if (seekPosId !=
libwebm::kMkvSeekPosition) + return false; + + if ((pos + len) > stop) + return false; + + pos += len; // consume id + + const long long seekPosSize = ReadUInt(pReader, pos, len); + + if (seekPosSize <= 0) + return false; + + if ((pos + len) > stop) + return false; + + pos += len; // consume size + + if ((pos + seekPosSize) > stop) + return false; + + pEntry->pos = UnserializeUInt(pReader, pos, seekPosSize); + + if (pEntry->pos < 0) + return false; + + pos += seekPosSize; // consume payload + + if (pos != stop) + return false; + + return true; +} + +Cues::Cues(Segment* pSegment, long long start_, long long size_, + long long element_start, long long element_size) + : m_pSegment(pSegment), + m_start(start_), + m_size(size_), + m_element_start(element_start), + m_element_size(element_size), + m_cue_points(NULL), + m_count(0), + m_preload_count(0), + m_pos(start_) {} + +Cues::~Cues() { + const long n = m_count + m_preload_count; + + CuePoint** p = m_cue_points; + CuePoint** const q = p + n; + + while (p != q) { + CuePoint* const pCP = *p++; + assert(pCP); + + delete pCP; + } + + delete[] m_cue_points; +} + +long Cues::GetCount() const { + if (m_cue_points == NULL) + return -1; + + return m_count; // TODO: really ignore preload count? +} + +bool Cues::DoneParsing() const { + const long long stop = m_start + m_size; + return (m_pos >= stop); +} + +bool Cues::Init() const { + if (m_cue_points) + return true; + + if (m_count != 0 || m_preload_count != 0) + return false; + + IMkvReader* const pReader = m_pSegment->m_pReader; + + const long long stop = m_start + m_size; + long long pos = m_start; + + long cue_points_size = 0; + + while (pos < stop) { + const long long idpos = pos; + + long len; + + const long long id = ReadID(pReader, pos, len); + if (id < 0 || (pos + len) > stop) { + return false; + } + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + if (size < 0 || (pos + len > stop)) { + return false; + } + + pos += len; // consume Size field + if (pos + size > stop) { + return false; + } + + if (id == libwebm::kMkvCuePoint) { + if (!PreloadCuePoint(cue_points_size, idpos)) + return false; + } + + pos += size; // skip payload + } + return true; +} + +bool Cues::PreloadCuePoint(long& cue_points_size, long long pos) const { + if (m_count != 0) + return false; + + if (m_preload_count >= cue_points_size) { + const long n = (cue_points_size <= 0) ? 
2048 : 2 * cue_points_size; + + CuePoint** const qq = new (std::nothrow) CuePoint*[n]; + if (qq == NULL) + return false; + + CuePoint** q = qq; // beginning of target + + CuePoint** p = m_cue_points; // beginning of source + CuePoint** const pp = p + m_preload_count; // end of source + + while (p != pp) + *q++ = *p++; + + delete[] m_cue_points; + + m_cue_points = qq; + cue_points_size = n; + } + + CuePoint* const pCP = new (std::nothrow) CuePoint(m_preload_count, pos); + if (pCP == NULL) + return false; + + m_cue_points[m_preload_count++] = pCP; + return true; +} + +bool Cues::LoadCuePoint() const { + const long long stop = m_start + m_size; + + if (m_pos >= stop) + return false; // nothing else to do + + if (!Init()) { + m_pos = stop; + return false; + } + + IMkvReader* const pReader = m_pSegment->m_pReader; + + while (m_pos < stop) { + const long long idpos = m_pos; + + long len; + + const long long id = ReadID(pReader, m_pos, len); + if (id < 0 || (m_pos + len) > stop) + return false; + + m_pos += len; // consume ID + + const long long size = ReadUInt(pReader, m_pos, len); + if (size < 0 || (m_pos + len) > stop) + return false; + + m_pos += len; // consume Size field + if ((m_pos + size) > stop) + return false; + + if (id != libwebm::kMkvCuePoint) { + m_pos += size; // consume payload + if (m_pos > stop) + return false; + + continue; + } + + if (m_preload_count < 1) + return false; + + CuePoint* const pCP = m_cue_points[m_count]; + if (!pCP || (pCP->GetTimeCode() < 0 && (-pCP->GetTimeCode() != idpos))) + return false; + + if (!pCP->Load(pReader)) { + m_pos = stop; + return false; + } + ++m_count; + --m_preload_count; + + m_pos += size; // consume payload + if (m_pos > stop) + return false; + + return true; // yes, we loaded a cue point + } + + return false; // no, we did not load a cue point +} + +bool Cues::Find(long long time_ns, const Track* pTrack, const CuePoint*& pCP, + const CuePoint::TrackPosition*& pTP) const { + if (time_ns < 0 || pTrack == NULL || m_cue_points == NULL || m_count == 0) + return false; + + CuePoint** const ii = m_cue_points; + CuePoint** i = ii; + + CuePoint** const jj = ii + m_count; + CuePoint** j = jj; + + pCP = *i; + if (pCP == NULL) + return false; + + if (time_ns <= pCP->GetTime(m_pSegment)) { + pTP = pCP->Find(pTrack); + return (pTP != NULL); + } + + while (i < j) { + // INVARIANT: + //[ii, i) <= time_ns + //[i, j) ? + //[j, jj) > time_ns + + CuePoint** const k = i + (j - i) / 2; + if (k >= jj) + return false; + + CuePoint* const pCP = *k; + if (pCP == NULL) + return false; + + const long long t = pCP->GetTime(m_pSegment); + + if (t <= time_ns) + i = k + 1; + else + j = k; + + if (i > j) + return false; + } + + if (i != j || i > jj || i <= ii) + return false; + + pCP = *--i; + + if (pCP == NULL || pCP->GetTime(m_pSegment) > time_ns) + return false; + + // TODO: here and elsewhere, it's probably not correct to search + // for the cue point with this time, and then search for a matching + // track. In principle, the matching track could be on some earlier + // cue point, and with our current algorithm, we'd miss it. To make + // this bullet-proof, we'd need to create a secondary structure, + // with a list of cue points that apply to a track, and then search + // that track-based structure for a matching cue point. 
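+  //
+  // To illustrate the invariant above: with cue times {0, 40, 80} and
+  // time_ns == 50, the loop narrows [i, j) until i points just past the
+  // cue with time 40, so *--i (that cue) is the closest preceding cue point.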
+ + pTP = pCP->Find(pTrack); + return (pTP != NULL); +} + +const CuePoint* Cues::GetFirst() const { + if (m_cue_points == NULL || m_count == 0) + return NULL; + + CuePoint* const* const pp = m_cue_points; + if (pp == NULL) + return NULL; + + CuePoint* const pCP = pp[0]; + if (pCP == NULL || pCP->GetTimeCode() < 0) + return NULL; + + return pCP; +} + +const CuePoint* Cues::GetLast() const { + if (m_cue_points == NULL || m_count <= 0) + return NULL; + + const long index = m_count - 1; + + CuePoint* const* const pp = m_cue_points; + if (pp == NULL) + return NULL; + + CuePoint* const pCP = pp[index]; + if (pCP == NULL || pCP->GetTimeCode() < 0) + return NULL; + + return pCP; +} + +const CuePoint* Cues::GetNext(const CuePoint* pCurr) const { + if (pCurr == NULL || pCurr->GetTimeCode() < 0 || m_cue_points == NULL || + m_count < 1) { + return NULL; + } + + long index = pCurr->m_index; + if (index >= m_count) + return NULL; + + CuePoint* const* const pp = m_cue_points; + if (pp == NULL || pp[index] != pCurr) + return NULL; + + ++index; + + if (index >= m_count) + return NULL; + + CuePoint* const pNext = pp[index]; + + if (pNext == NULL || pNext->GetTimeCode() < 0) + return NULL; + + return pNext; +} + +const BlockEntry* Cues::GetBlock(const CuePoint* pCP, + const CuePoint::TrackPosition* pTP) const { + if (pCP == NULL || pTP == NULL) + return NULL; + + return m_pSegment->GetBlock(*pCP, *pTP); +} + +const BlockEntry* Segment::GetBlock(const CuePoint& cp, + const CuePoint::TrackPosition& tp) { + Cluster** const ii = m_clusters; + Cluster** i = ii; + + const long count = m_clusterCount + m_clusterPreloadCount; + + Cluster** const jj = ii + count; + Cluster** j = jj; + + while (i < j) { + // INVARIANT: + //[ii, i) < pTP->m_pos + //[i, j) ? + //[j, jj) > pTP->m_pos + + Cluster** const k = i + (j - i) / 2; + assert(k < jj); + + Cluster* const pCluster = *k; + assert(pCluster); + + // const long long pos_ = pCluster->m_pos; + // assert(pos_); + // const long long pos = pos_ * ((pos_ < 0) ? -1 : 1); + + const long long pos = pCluster->GetPosition(); + assert(pos >= 0); + + if (pos < tp.m_pos) + i = k + 1; + else if (pos > tp.m_pos) + j = k; + else + return pCluster->GetEntry(cp, tp); + } + + assert(i == j); + // assert(Cluster::HasBlockEntries(this, tp.m_pos)); + + Cluster* const pCluster = Cluster::Create(this, -1, tp.m_pos); //, -1); + if (pCluster == NULL) + return NULL; + + const ptrdiff_t idx = i - m_clusters; + + if (!PreloadCluster(pCluster, idx)) { + delete pCluster; + return NULL; + } + assert(m_clusters); + assert(m_clusterPreloadCount > 0); + assert(m_clusters[idx] == pCluster); + + return pCluster->GetEntry(cp, tp); +} + +const Cluster* Segment::FindOrPreloadCluster(long long requested_pos) { + if (requested_pos < 0) + return 0; + + Cluster** const ii = m_clusters; + Cluster** i = ii; + + const long count = m_clusterCount + m_clusterPreloadCount; + + Cluster** const jj = ii + count; + Cluster** j = jj; + + while (i < j) { + // INVARIANT: + //[ii, i) < pTP->m_pos + //[i, j) ? + //[j, jj) > pTP->m_pos + + Cluster** const k = i + (j - i) / 2; + assert(k < jj); + + Cluster* const pCluster = *k; + assert(pCluster); + + // const long long pos_ = pCluster->m_pos; + // assert(pos_); + // const long long pos = pos_ * ((pos_ < 0) ? 
-1 : 1); + + const long long pos = pCluster->GetPosition(); + assert(pos >= 0); + + if (pos < requested_pos) + i = k + 1; + else if (pos > requested_pos) + j = k; + else + return pCluster; + } + + assert(i == j); + // assert(Cluster::HasBlockEntries(this, tp.m_pos)); + + Cluster* const pCluster = Cluster::Create(this, -1, requested_pos); + if (pCluster == NULL) + return NULL; + + const ptrdiff_t idx = i - m_clusters; + + if (!PreloadCluster(pCluster, idx)) { + delete pCluster; + return NULL; + } + assert(m_clusters); + assert(m_clusterPreloadCount > 0); + assert(m_clusters[idx] == pCluster); + + return pCluster; +} + +CuePoint::CuePoint(long idx, long long pos) + : m_element_start(0), + m_element_size(0), + m_index(idx), + m_timecode(-1 * pos), + m_track_positions(NULL), + m_track_positions_count(0) { + assert(pos > 0); +} + +CuePoint::~CuePoint() { delete[] m_track_positions; } + +bool CuePoint::Load(IMkvReader* pReader) { + // odbgstream os; + // os << "CuePoint::Load(begin): timecode=" << m_timecode << endl; + + if (m_timecode >= 0) // already loaded + return true; + + assert(m_track_positions == NULL); + assert(m_track_positions_count == 0); + + long long pos_ = -m_timecode; + const long long element_start = pos_; + + long long stop; + + { + long len; + + const long long id = ReadID(pReader, pos_, len); + if (id != libwebm::kMkvCuePoint) + return false; + + pos_ += len; // consume ID + + const long long size = ReadUInt(pReader, pos_, len); + assert(size >= 0); + + pos_ += len; // consume Size field + // pos_ now points to start of payload + + stop = pos_ + size; + } + + const long long element_size = stop - element_start; + + long long pos = pos_; + + // First count number of track positions + + while (pos < stop) { + long len; + + const long long id = ReadID(pReader, pos, len); + if ((id < 0) || (pos + len > stop)) { + return false; + } + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + if ((size < 0) || (pos + len > stop)) { + return false; + } + + pos += len; // consume Size field + if ((pos + size) > stop) { + return false; + } + + if (id == libwebm::kMkvCueTime) + m_timecode = UnserializeUInt(pReader, pos, size); + + else if (id == libwebm::kMkvCueTrackPositions) + ++m_track_positions_count; + + pos += size; // consume payload + } + + if (m_timecode < 0 || m_track_positions_count <= 0) { + return false; + } + + // os << "CuePoint::Load(cont'd): idpos=" << idpos + // << " timecode=" << m_timecode + // << endl; + + m_track_positions = new (std::nothrow) TrackPosition[m_track_positions_count]; + if (m_track_positions == NULL) + return false; + + // Now parse track positions + + TrackPosition* p = m_track_positions; + pos = pos_; + + while (pos < stop) { + long len; + + const long long id = ReadID(pReader, pos, len); + if (id < 0 || (pos + len) > stop) + return false; + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + assert(size >= 0); + assert((pos + len) <= stop); + + pos += len; // consume Size field + assert((pos + size) <= stop); + + if (id == libwebm::kMkvCueTrackPositions) { + TrackPosition& tp = *p++; + if (!tp.Parse(pReader, pos, size)) { + return false; + } + } + + pos += size; // consume payload + if (pos > stop) + return false; + } + + assert(size_t(p - m_track_positions) == m_track_positions_count); + + m_element_start = element_start; + m_element_size = element_size; + + return true; +} + +bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, + long long size_) { + const long 
long stop = start_ + size_; + long long pos = start_; + + m_track = -1; + m_pos = -1; + m_block = 1; // default + + while (pos < stop) { + long len; + + const long long id = ReadID(pReader, pos, len); + if ((id < 0) || ((pos + len) > stop)) { + return false; + } + + pos += len; // consume ID + + const long long size = ReadUInt(pReader, pos, len); + if ((size < 0) || ((pos + len) > stop)) { + return false; + } + + pos += len; // consume Size field + if ((pos + size) > stop) { + return false; + } + + if (id == libwebm::kMkvCueTrack) + m_track = UnserializeUInt(pReader, pos, size); + else if (id == libwebm::kMkvCueClusterPosition) + m_pos = UnserializeUInt(pReader, pos, size); + else if (id == libwebm::kMkvCueBlockNumber) + m_block = UnserializeUInt(pReader, pos, size); + + pos += size; // consume payload + } + + if ((m_pos < 0) || (m_track <= 0)) { + return false; + } + + return true; +} + +const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const { + if (pTrack == NULL) { + return NULL; + } + + const long long n = pTrack->GetNumber(); + + const TrackPosition* i = m_track_positions; + const TrackPosition* const j = i + m_track_positions_count; + + while (i != j) { + const TrackPosition& p = *i++; + + if (p.m_track == n) + return &p; + } + + return NULL; // no matching track number found +} + +long long CuePoint::GetTimeCode() const { return m_timecode; } + +long long CuePoint::GetTime(const Segment* pSegment) const { + assert(pSegment); + assert(m_timecode >= 0); + + const SegmentInfo* const pInfo = pSegment->GetInfo(); + assert(pInfo); + + const long long scale = pInfo->GetTimeCodeScale(); + assert(scale >= 1); + + const long long time = scale * m_timecode; + + return time; +} + +bool Segment::DoneParsing() const { + if (m_size < 0) { + long long total, avail; + + const int status = m_pReader->Length(&total, &avail); + + if (status < 0) // error + return true; // must assume done + + if (total < 0) + return false; // assume live stream + + return (m_pos >= total); + } + + const long long stop = m_start + m_size; + + return (m_pos >= stop); +} + +const Cluster* Segment::GetFirst() const { + if ((m_clusters == NULL) || (m_clusterCount <= 0)) + return &m_eos; + + Cluster* const pCluster = m_clusters[0]; + assert(pCluster); + + return pCluster; +} + +const Cluster* Segment::GetLast() const { + if ((m_clusters == NULL) || (m_clusterCount <= 0)) + return &m_eos; + + const long idx = m_clusterCount - 1; + + Cluster* const pCluster = m_clusters[idx]; + assert(pCluster); + + return pCluster; +} + +unsigned long Segment::GetCount() const { return m_clusterCount; } + +const Cluster* Segment::GetNext(const Cluster* pCurr) { + assert(pCurr); + assert(pCurr != &m_eos); + assert(m_clusters); + + long idx = pCurr->m_index; + + if (idx >= 0) { + assert(m_clusterCount > 0); + assert(idx < m_clusterCount); + assert(pCurr == m_clusters[idx]); + + ++idx; + + if (idx >= m_clusterCount) + return &m_eos; // caller will LoadCluster as desired + + Cluster* const pNext = m_clusters[idx]; + assert(pNext); + assert(pNext->m_index >= 0); + assert(pNext->m_index == idx); + + return pNext; + } + + assert(m_clusterPreloadCount > 0); + + long long pos = pCurr->m_element_start; + + assert(m_size >= 0); // TODO + const long long stop = m_start + m_size; // end of segment + + { + long len; + + long long result = GetUIntLength(m_pReader, pos, len); + assert(result == 0); + assert((pos + len) <= stop); // TODO + if (result != 0) + return NULL; + + const long long id = ReadID(m_pReader, pos, len); + if (id != 
libwebm::kMkvCluster) + return NULL; + + pos += len; // consume ID + + // Read Size + result = GetUIntLength(m_pReader, pos, len); + assert(result == 0); // TODO + assert((pos + len) <= stop); // TODO + + const long long size = ReadUInt(m_pReader, pos, len); + assert(size > 0); // TODO + // assert((pCurr->m_size <= 0) || (pCurr->m_size == size)); + + pos += len; // consume length of size of element + assert((pos + size) <= stop); // TODO + + // Pos now points to start of payload + + pos += size; // consume payload + } + + long long off_next = 0; + + while (pos < stop) { + long len; + + long long result = GetUIntLength(m_pReader, pos, len); + assert(result == 0); + assert((pos + len) <= stop); // TODO + if (result != 0) + return NULL; + + const long long idpos = pos; // pos of next (potential) cluster + + const long long id = ReadID(m_pReader, idpos, len); + if (id < 0) + return NULL; + + pos += len; // consume ID + + // Read Size + result = GetUIntLength(m_pReader, pos, len); + assert(result == 0); // TODO + assert((pos + len) <= stop); // TODO + + const long long size = ReadUInt(m_pReader, pos, len); + assert(size >= 0); // TODO + + pos += len; // consume length of size of element + assert((pos + size) <= stop); // TODO + + // Pos now points to start of payload + + if (size == 0) // weird + continue; + + if (id == libwebm::kMkvCluster) { + const long long off_next_ = idpos - m_start; + + long long pos_; + long len_; + + const long status = Cluster::HasBlockEntries(this, off_next_, pos_, len_); + + assert(status >= 0); + + if (status > 0) { + off_next = off_next_; + break; + } + } + + pos += size; // consume payload + } + + if (off_next <= 0) + return 0; + + Cluster** const ii = m_clusters + m_clusterCount; + Cluster** i = ii; + + Cluster** const jj = ii + m_clusterPreloadCount; + Cluster** j = jj; + + while (i < j) { + // INVARIANT: + //[0, i) < pos_next + //[i, j) ? + //[j, jj) > pos_next + + Cluster** const k = i + (j - i) / 2; + assert(k < jj); + + Cluster* const pNext = *k; + assert(pNext); + assert(pNext->m_index < 0); + + // const long long pos_ = pNext->m_pos; + // assert(pos_); + // pos = pos_ * ((pos_ < 0) ? 
-1 : 1); + + pos = pNext->GetPosition(); + + if (pos < off_next) + i = k + 1; + else if (pos > off_next) + j = k; + else + return pNext; + } + + assert(i == j); + + Cluster* const pNext = Cluster::Create(this, -1, off_next); + if (pNext == NULL) + return NULL; + + const ptrdiff_t idx_next = i - m_clusters; // insertion position + + if (!PreloadCluster(pNext, idx_next)) { + delete pNext; + return NULL; + } + assert(m_clusters); + assert(idx_next < m_clusterSize); + assert(m_clusters[idx_next] == pNext); + + return pNext; +} + +long Segment::ParseNext(const Cluster* pCurr, const Cluster*& pResult, + long long& pos, long& len) { + assert(pCurr); + assert(!pCurr->EOS()); + assert(m_clusters); + + pResult = 0; + + if (pCurr->m_index >= 0) { // loaded (not merely preloaded) + assert(m_clusters[pCurr->m_index] == pCurr); + + const long next_idx = pCurr->m_index + 1; + + if (next_idx < m_clusterCount) { + pResult = m_clusters[next_idx]; + return 0; // success + } + + // curr cluster is last among loaded + + const long result = LoadCluster(pos, len); + + if (result < 0) // error or underflow + return result; + + if (result > 0) // no more clusters + { + // pResult = &m_eos; + return 1; + } + + pResult = GetLast(); + return 0; // success + } + + assert(m_pos > 0); + + long long total, avail; + + long status = m_pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + assert((total < 0) || (avail <= total)); + + const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size; + + // interrogate curr cluster + + pos = pCurr->m_element_start; + + if (pCurr->m_element_size >= 0) + pos += pCurr->m_element_size; + else { + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long id = ReadUInt(m_pReader, pos, len); + + if (id != libwebm::kMkvCluster) + return -1; + + pos += len; // consume ID + + // Read Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(m_pReader, pos, len); + + if (size < 0) // error + return static_cast<long>(size); + + pos += len; // consume size field + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) // TODO: should never happen + return E_FILE_FORMAT_INVALID; // TODO: resolve this + + // assert((pCurr->m_size <= 0) || (pCurr->m_size == size)); + + if ((segment_stop >= 0) && ((pos + size) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + // Pos now points to start of payload + + pos += size; // consume payload (that is, the current cluster) + if (segment_stop >= 0 && pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + // By consuming the payload, we are assuming that the curr + // cluster isn't interesting. That is, we don't bother checking + // whether the payload of the curr cluster is less than what + // happens to be available (obtained via IMkvReader::Length).
+ // Presumably the caller has already dispensed with the current + // cluster, and really does want the next cluster. + } + + // pos now points to just beyond the last fully-loaded cluster + + for (;;) { + const long status = DoParseNext(pResult, pos, len); + + if (status <= 1) + return status; + } +} + +long Segment::DoParseNext(const Cluster*& pResult, long long& pos, long& len) { + long long total, avail; + + long status = m_pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + assert((total < 0) || (avail <= total)); + + const long long segment_stop = (m_size < 0) ? -1 : m_start + m_size; + + // Parse next cluster. This is strictly a parsing activity. + // Creation of a new cluster object happens later, after the + // parsing is done. + + long long off_next = 0; + long long cluster_size = -1; + + for (;;) { + if ((total >= 0) && (pos >= total)) + return 1; // EOF + + if ((segment_stop >= 0) && (pos >= segment_stop)) + return 1; // EOF + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long idpos = pos; // absolute + const long long idoff = pos - m_start; // relative + + const long long id = ReadID(m_pReader, idpos, len); // absolute + + if (id < 0) // error + return static_cast<long>(id); + + if (id == 0) // weird + return -1; // generic error + + pos += len; // consume ID + + // Read Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(m_pReader, pos, len); + + if (size < 0) // error + return static_cast<long>(size); + + pos += len; // consume length of size of element + + // Pos now points to start of payload + + if (size == 0) // weird + continue; + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if ((segment_stop >= 0) && (size != unknown_size) && + ((pos + size) > segment_stop)) { + return E_FILE_FORMAT_INVALID; + } + + if (id == libwebm::kMkvCues) { + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + const long long element_stop = pos + size; + + if ((segment_stop >= 0) && (element_stop > segment_stop)) + return E_FILE_FORMAT_INVALID; + + const long long element_start = idpos; + const long long element_size = element_stop - element_start; + + if (m_pCues == NULL) { + m_pCues = new (std::nothrow) + Cues(this, pos, size, element_start, element_size); + if (m_pCues == NULL) + return false; + } + + pos += size; // consume payload + if (segment_stop >= 0 && pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + continue; + } + + if (id != libwebm::kMkvCluster) { // not a Cluster ID + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + pos += size; // consume payload + if (segment_stop >= 0 && pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + continue; + } + + // We have a cluster.
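+ + // Note: idoff (computed above as pos - m_start) is relative to the start + // of the segment payload, the same convention Cluster::GetPosition() + // uses; an absolute file position would be m_start + GetPosition().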
+ off_next = idoff; + + if (size != unknown_size) + cluster_size = size; + + break; + } + + assert(off_next > 0); // have cluster + + // We have parsed the next cluster. + // We have not created a cluster object yet. What we need + // to do now is determine whether it has already been preloaded + //(in which case, an object for this cluster has already been + // created), and if not, create a new cluster object. + + Cluster** const ii = m_clusters + m_clusterCount; + Cluster** i = ii; + + Cluster** const jj = ii + m_clusterPreloadCount; + Cluster** j = jj; + + while (i < j) { + // INVARIANT: + //[0, i) < pos_next + //[i, j) ? + //[j, jj) > pos_next + + Cluster** const k = i + (j - i) / 2; + assert(k < jj); + + const Cluster* const pNext = *k; + assert(pNext); + assert(pNext->m_index < 0); + + pos = pNext->GetPosition(); + assert(pos >= 0); + + if (pos < off_next) + i = k + 1; + else if (pos > off_next) + j = k; + else { + pResult = pNext; + return 0; // success + } + } + + assert(i == j); + + long long pos_; + long len_; + + status = Cluster::HasBlockEntries(this, off_next, pos_, len_); + + if (status < 0) { // error or underflow + pos = pos_; + len = len_; + + return status; + } + + if (status > 0) { // means "found at least one block entry" + Cluster* const pNext = Cluster::Create(this, + -1, // preloaded + off_next); + if (pNext == NULL) + return -1; + + const ptrdiff_t idx_next = i - m_clusters; // insertion position + + if (!PreloadCluster(pNext, idx_next)) { + delete pNext; + return -1; + } + assert(m_clusters); + assert(idx_next < m_clusterSize); + assert(m_clusters[idx_next] == pNext); + + pResult = pNext; + return 0; // success + } + + // status == 0 means "no block entries found" + + if (cluster_size < 0) { // unknown size + const long long payload_pos = pos; // absolute pos of cluster payload + + for (;;) { // determine cluster size + if ((total >= 0) && (pos >= total)) + break; + + if ((segment_stop >= 0) && (pos >= segment_stop)) + break; // no more clusters + + // Read ID + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long idpos = pos; + const long long id = ReadID(m_pReader, idpos, len); + + if (id < 0) // error (or underflow) + return static_cast<long>(id); + + // This is the distinguished set of ID's we use to determine + // that we have exhausted the sub-elements inside the cluster + // whose ID we parsed earlier.
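+ // + // Worked example of the unknown-size convention handled here: a one-byte + // EBML size field of 0xFF decodes to VINT_DATA 0x7F, which equals + // (1LL << (7 * 1)) - 1, i.e. "unknown size". Such a cluster has no + // explicit end, so it is terminated by the first ID that cannot occur + // inside it -- hence the test against Cluster and Cues below.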
+ + if (id == libwebm::kMkvCluster || id == libwebm::kMkvCues) + break; + + pos += len; // consume ID (of sub-element) + + // Read Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(m_pReader, pos, len); + + if (result < 0) // error + return static_cast<long>(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((segment_stop >= 0) && ((pos + len) > segment_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(m_pReader, pos, len); + + if (size < 0) // error + return static_cast<long>(size); + + pos += len; // consume size field of element + + // pos now points to start of sub-element's payload + + if (size == 0) // weird + continue; + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; // not allowed for sub-elements + + if ((segment_stop >= 0) && ((pos + size) > segment_stop)) // weird + return E_FILE_FORMAT_INVALID; + + pos += size; // consume payload of sub-element + if (segment_stop >= 0 && pos > segment_stop) + return E_FILE_FORMAT_INVALID; + } // determine cluster size + + cluster_size = pos - payload_pos; + assert(cluster_size >= 0); // TODO: handle cluster_size = 0 + + pos = payload_pos; // reset and re-parse original cluster + } + + pos += cluster_size; // consume payload + if (segment_stop >= 0 && pos > segment_stop) + return E_FILE_FORMAT_INVALID; + + return 2; // try to find a cluster that follows next +} + +const Cluster* Segment::FindCluster(long long time_ns) const { + if ((m_clusters == NULL) || (m_clusterCount <= 0)) + return &m_eos; + + { + Cluster* const pCluster = m_clusters[0]; + assert(pCluster); + assert(pCluster->m_index == 0); + + if (time_ns <= pCluster->GetTime()) + return pCluster; + } + + // Binary search of cluster array + + long i = 0; + long j = m_clusterCount; + + while (i < j) { + // INVARIANT: + //[0, i) <= time_ns + //[i, j) ?
+ //[j, m_clusterCount) > time_ns + + const long k = i + (j - i) / 2; + assert(k < m_clusterCount); + + Cluster* const pCluster = m_clusters[k]; + assert(pCluster); + assert(pCluster->m_index == k); + + const long long t = pCluster->GetTime(); + + if (t <= time_ns) + i = k + 1; + else + j = k; + + assert(i <= j); + } + + assert(i == j); + assert(i > 0); + assert(i <= m_clusterCount); + + const long k = i - 1; + + Cluster* const pCluster = m_clusters[k]; + assert(pCluster); + assert(pCluster->m_index == k); + assert(pCluster->GetTime() <= time_ns); + + return pCluster; +} + +const Tracks* Segment::GetTracks() const { return m_pTracks; } +const SegmentInfo* Segment::GetInfo() const { return m_pInfo; } +const Cues* Segment::GetCues() const { return m_pCues; } +const Chapters* Segment::GetChapters() const { return m_pChapters; } +const Tags* Segment::GetTags() const { return m_pTags; } +const SeekHead* Segment::GetSeekHead() const { return m_pSeekHead; } + +long long Segment::GetDuration() const { + assert(m_pInfo); + return m_pInfo->GetDuration(); +} + +Chapters::Chapters(Segment* pSegment, long long payload_start, + long long payload_size, long long element_start, + long long element_size) + : m_pSegment(pSegment), + m_start(payload_start), + m_size(payload_size), + m_element_start(element_start), + m_element_size(element_size), + m_editions(NULL), + m_editions_size(0), + m_editions_count(0) {} + +Chapters::~Chapters() { + while (m_editions_count > 0) { + Edition& e = m_editions[--m_editions_count]; + e.Clear(); + } + delete[] m_editions; +} + +long Chapters::Parse() { + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long pos = m_start; // payload start + const long long stop = pos + m_size; // payload stop + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // weird + continue; + + if (id == libwebm::kMkvEditionEntry) { + status = ParseEdition(pos, size); + + if (status < 0) // error + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +int Chapters::GetEditionCount() const { return m_editions_count; } + +const Chapters::Edition* Chapters::GetEdition(int idx) const { + if (idx < 0) + return NULL; + + if (idx >= m_editions_count) + return NULL; + + return m_editions + idx; +} + +bool Chapters::ExpandEditionsArray() { + if (m_editions_size > m_editions_count) + return true; // nothing else to do + + const int size = (m_editions_size == 0) ? 
1 : 2 * m_editions_size; + + Edition* const editions = new (std::nothrow) Edition[size]; + + if (editions == NULL) + return false; + + for (int idx = 0; idx < m_editions_count; ++idx) { + m_editions[idx].ShallowCopy(editions[idx]); + } + + delete[] m_editions; + m_editions = editions; + + m_editions_size = size; + return true; +} + +long Chapters::ParseEdition(long long pos, long long size) { + if (!ExpandEditionsArray()) + return -1; + + Edition& e = m_editions[m_editions_count++]; + e.Init(); + + return e.Parse(m_pSegment->m_pReader, pos, size); +} + +Chapters::Edition::Edition() {} + +Chapters::Edition::~Edition() {} + +int Chapters::Edition::GetAtomCount() const { return m_atoms_count; } + +const Chapters::Atom* Chapters::Edition::GetAtom(int index) const { + if (index < 0) + return NULL; + + if (index >= m_atoms_count) + return NULL; + + return m_atoms + index; +} + +void Chapters::Edition::Init() { + m_atoms = NULL; + m_atoms_size = 0; + m_atoms_count = 0; +} + +void Chapters::Edition::ShallowCopy(Edition& rhs) const { + rhs.m_atoms = m_atoms; + rhs.m_atoms_size = m_atoms_size; + rhs.m_atoms_count = m_atoms_count; +} + +void Chapters::Edition::Clear() { + while (m_atoms_count > 0) { + Atom& a = m_atoms[--m_atoms_count]; + a.Clear(); + } + + delete[] m_atoms; + m_atoms = NULL; + + m_atoms_size = 0; +} + +long Chapters::Edition::Parse(IMkvReader* pReader, long long pos, + long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) + continue; + + if (id == libwebm::kMkvChapterAtom) { + status = ParseAtom(pReader, pos, size); + + if (status < 0) // error + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +long Chapters::Edition::ParseAtom(IMkvReader* pReader, long long pos, + long long size) { + if (!ExpandAtomsArray()) + return -1; + + Atom& a = m_atoms[m_atoms_count++]; + a.Init(); + + return a.Parse(pReader, pos, size); +} + +bool Chapters::Edition::ExpandAtomsArray() { + if (m_atoms_size > m_atoms_count) + return true; // nothing else to do + + const int size = (m_atoms_size == 0) ? 
1 : 2 * m_atoms_size; + + Atom* const atoms = new (std::nothrow) Atom[size]; + + if (atoms == NULL) + return false; + + for (int idx = 0; idx < m_atoms_count; ++idx) { + m_atoms[idx].ShallowCopy(atoms[idx]); + } + + delete[] m_atoms; + m_atoms = atoms; + + m_atoms_size = size; + return true; +} + +Chapters::Atom::Atom() {} + +Chapters::Atom::~Atom() {} + +unsigned long long Chapters::Atom::GetUID() const { return m_uid; } + +const char* Chapters::Atom::GetStringUID() const { return m_string_uid; } + +long long Chapters::Atom::GetStartTimecode() const { return m_start_timecode; } + +long long Chapters::Atom::GetStopTimecode() const { return m_stop_timecode; } + +long long Chapters::Atom::GetStartTime(const Chapters* pChapters) const { + return GetTime(pChapters, m_start_timecode); +} + +long long Chapters::Atom::GetStopTime(const Chapters* pChapters) const { + return GetTime(pChapters, m_stop_timecode); +} + +int Chapters::Atom::GetDisplayCount() const { return m_displays_count; } + +const Chapters::Display* Chapters::Atom::GetDisplay(int index) const { + if (index < 0) + return NULL; + + if (index >= m_displays_count) + return NULL; + + return m_displays + index; +} + +void Chapters::Atom::Init() { + m_string_uid = NULL; + m_uid = 0; + m_start_timecode = -1; + m_stop_timecode = -1; + + m_displays = NULL; + m_displays_size = 0; + m_displays_count = 0; +} + +void Chapters::Atom::ShallowCopy(Atom& rhs) const { + rhs.m_string_uid = m_string_uid; + rhs.m_uid = m_uid; + rhs.m_start_timecode = m_start_timecode; + rhs.m_stop_timecode = m_stop_timecode; + + rhs.m_displays = m_displays; + rhs.m_displays_size = m_displays_size; + rhs.m_displays_count = m_displays_count; +} + +void Chapters::Atom::Clear() { + delete[] m_string_uid; + m_string_uid = NULL; + + while (m_displays_count > 0) { + Display& d = m_displays[--m_displays_count]; + d.Clear(); + } + + delete[] m_displays; + m_displays = NULL; + + m_displays_size = 0; +} + +long Chapters::Atom::Parse(IMkvReader* pReader, long long pos, long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // 0 length payload, skip. 
+ continue; + + if (id == libwebm::kMkvChapterDisplay) { + status = ParseDisplay(pReader, pos, size); + + if (status < 0) // error + return status; + } else if (id == libwebm::kMkvChapterStringUID) { + status = UnserializeString(pReader, pos, size, m_string_uid); + + if (status < 0) // error + return status; + } else if (id == libwebm::kMkvChapterUID) { + long long val; + status = UnserializeInt(pReader, pos, size, val); + + if (status < 0) // error + return status; + + m_uid = static_cast<unsigned long long>(val); + } else if (id == libwebm::kMkvChapterTimeStart) { + const long long val = UnserializeUInt(pReader, pos, size); + + if (val < 0) // error + return static_cast<long>(val); + + m_start_timecode = val; + } else if (id == libwebm::kMkvChapterTimeEnd) { + const long long val = UnserializeUInt(pReader, pos, size); + + if (val < 0) // error + return static_cast<long>(val); + + m_stop_timecode = val; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +long long Chapters::Atom::GetTime(const Chapters* pChapters, + long long timecode) { + if (pChapters == NULL) + return -1; + + Segment* const pSegment = pChapters->m_pSegment; + + if (pSegment == NULL) // weird + return -1; + + const SegmentInfo* const pInfo = pSegment->GetInfo(); + + if (pInfo == NULL) + return -1; + + const long long timecode_scale = pInfo->GetTimeCodeScale(); + + if (timecode_scale < 1) // weird + return -1; + + if (timecode < 0) + return -1; + + const long long result = timecode_scale * timecode; + + return result; +} + +long Chapters::Atom::ParseDisplay(IMkvReader* pReader, long long pos, + long long size) { + if (!ExpandDisplaysArray()) + return -1; + + Display& d = m_displays[m_displays_count++]; + d.Init(); + + return d.Parse(pReader, pos, size); +} + +bool Chapters::Atom::ExpandDisplaysArray() { + if (m_displays_size > m_displays_count) + return true; // nothing else to do + + const int size = (m_displays_size == 0) ? 1 : 2 * m_displays_size; + + Display* const displays = new (std::nothrow) Display[size]; + + if (displays == NULL) + return false; + + for (int idx = 0; idx < m_displays_count; ++idx) { + m_displays[idx].ShallowCopy(displays[idx]); + } + + delete[] m_displays; + m_displays = displays; + + m_displays_size = size; + return true; +} + +Chapters::Display::Display() {} + +Chapters::Display::~Display() {} + +const char* Chapters::Display::GetString() const { return m_string; } + +const char* Chapters::Display::GetLanguage() const { return m_language; } + +const char* Chapters::Display::GetCountry() const { return m_country; } + +void Chapters::Display::Init() { + m_string = NULL; + m_language = NULL; + m_country = NULL; +} + +void Chapters::Display::ShallowCopy(Display& rhs) const { + rhs.m_string = m_string; + rhs.m_language = m_language; + rhs.m_country = m_country; +} + +void Chapters::Display::Clear() { + delete[] m_string; + m_string = NULL; + + delete[] m_language; + m_language = NULL; + + delete[] m_country; + m_country = NULL; +} + +long Chapters::Display::Parse(IMkvReader* pReader, long long pos, + long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // No payload.
+ continue; + + if (id == libwebm::kMkvChapString) { + status = UnserializeString(pReader, pos, size, m_string); + + if (status) + return status; + } else if (id == libwebm::kMkvChapLanguage) { + status = UnserializeString(pReader, pos, size, m_language); + + if (status) + return status; + } else if (id == libwebm::kMkvChapCountry) { + status = UnserializeString(pReader, pos, size, m_country); + + if (status) + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +Tags::Tags(Segment* pSegment, long long payload_start, long long payload_size, + long long element_start, long long element_size) + : m_pSegment(pSegment), + m_start(payload_start), + m_size(payload_size), + m_element_start(element_start), + m_element_size(element_size), + m_tags(NULL), + m_tags_size(0), + m_tags_count(0) {} + +Tags::~Tags() { + while (m_tags_count > 0) { + Tag& t = m_tags[--m_tags_count]; + t.Clear(); + } + delete[] m_tags; +} + +long Tags::Parse() { + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long pos = m_start; // payload start + const long long stop = pos + m_size; // payload stop + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) + return status; + + if (size == 0) // 0 length tag, read another + continue; + + if (id == libwebm::kMkvTag) { + status = ParseTag(pos, size); + + if (status < 0) + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + return 0; +} + +int Tags::GetTagCount() const { return m_tags_count; } + +const Tags::Tag* Tags::GetTag(int idx) const { + if (idx < 0) + return NULL; + + if (idx >= m_tags_count) + return NULL; + + return m_tags + idx; +} + +bool Tags::ExpandTagsArray() { + if (m_tags_size > m_tags_count) + return true; // nothing else to do + + const int size = (m_tags_size == 0) ? 
1 : 2 * m_tags_size; + + Tag* const tags = new (std::nothrow) Tag[size]; + + if (tags == NULL) + return false; + + for (int idx = 0; idx < m_tags_count; ++idx) { + m_tags[idx].ShallowCopy(tags[idx]); + } + + delete[] m_tags; + m_tags = tags; + + m_tags_size = size; + return true; +} + +long Tags::ParseTag(long long pos, long long size) { + if (!ExpandTagsArray()) + return -1; + + Tag& t = m_tags[m_tags_count++]; + t.Init(); + + return t.Parse(m_pSegment->m_pReader, pos, size); +} + +Tags::Tag::Tag() {} + +Tags::Tag::~Tag() {} + +int Tags::Tag::GetSimpleTagCount() const { return m_simple_tags_count; } + +const Tags::SimpleTag* Tags::Tag::GetSimpleTag(int index) const { + if (index < 0) + return NULL; + + if (index >= m_simple_tags_count) + return NULL; + + return m_simple_tags + index; +} + +void Tags::Tag::Init() { + m_simple_tags = NULL; + m_simple_tags_size = 0; + m_simple_tags_count = 0; +} + +void Tags::Tag::ShallowCopy(Tag& rhs) const { + rhs.m_simple_tags = m_simple_tags; + rhs.m_simple_tags_size = m_simple_tags_size; + rhs.m_simple_tags_count = m_simple_tags_count; +} + +void Tags::Tag::Clear() { + while (m_simple_tags_count > 0) { + SimpleTag& d = m_simple_tags[--m_simple_tags_count]; + d.Clear(); + } + + delete[] m_simple_tags; + m_simple_tags = NULL; + + m_simple_tags_size = 0; +} + +long Tags::Tag::Parse(IMkvReader* pReader, long long pos, long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) + return status; + + if (size == 0) // 0 length tag, read another + continue; + + if (id == libwebm::kMkvSimpleTag) { + status = ParseSimpleTag(pReader, pos, size); + + if (status < 0) + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +long Tags::Tag::ParseSimpleTag(IMkvReader* pReader, long long pos, + long long size) { + if (!ExpandSimpleTagsArray()) + return -1; + + SimpleTag& st = m_simple_tags[m_simple_tags_count++]; + st.Init(); + + return st.Parse(pReader, pos, size); +} + +bool Tags::Tag::ExpandSimpleTagsArray() { + if (m_simple_tags_size > m_simple_tags_count) + return true; // nothing else to do + + const int size = (m_simple_tags_size == 0) ? 
1 : 2 * m_simple_tags_size; + + SimpleTag* const displays = new (std::nothrow) SimpleTag[size]; + + if (displays == NULL) + return false; + + for (int idx = 0; idx < m_simple_tags_count; ++idx) { + m_simple_tags[idx].ShallowCopy(displays[idx]); + } + + delete[] m_simple_tags; + m_simple_tags = displays; + + m_simple_tags_size = size; + return true; +} + +Tags::SimpleTag::SimpleTag() {} + +Tags::SimpleTag::~SimpleTag() {} + +const char* Tags::SimpleTag::GetTagName() const { return m_tag_name; } + +const char* Tags::SimpleTag::GetTagString() const { return m_tag_string; } + +void Tags::SimpleTag::Init() { + m_tag_name = NULL; + m_tag_string = NULL; +} + +void Tags::SimpleTag::ShallowCopy(SimpleTag& rhs) const { + rhs.m_tag_name = m_tag_name; + rhs.m_tag_string = m_tag_string; +} + +void Tags::SimpleTag::Clear() { + delete[] m_tag_name; + m_tag_name = NULL; + + delete[] m_tag_string; + m_tag_string = NULL; +} + +long Tags::SimpleTag::Parse(IMkvReader* pReader, long long pos, + long long size) { + const long long stop = pos + size; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // weird + continue; + + if (id == libwebm::kMkvTagName) { + status = UnserializeString(pReader, pos, size, m_tag_name); + + if (status) + return status; + } else if (id == libwebm::kMkvTagString) { + status = UnserializeString(pReader, pos, size, m_tag_string); + + if (status) + return status; + } + + pos += size; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +SegmentInfo::SegmentInfo(Segment* pSegment, long long start, long long size_, + long long element_start, long long element_size) + : m_pSegment(pSegment), + m_start(start), + m_size(size_), + m_element_start(element_start), + m_element_size(element_size), + m_pMuxingAppAsUTF8(NULL), + m_pWritingAppAsUTF8(NULL), + m_pTitleAsUTF8(NULL) {} + +SegmentInfo::~SegmentInfo() { + delete[] m_pMuxingAppAsUTF8; + m_pMuxingAppAsUTF8 = NULL; + + delete[] m_pWritingAppAsUTF8; + m_pWritingAppAsUTF8 = NULL; + + delete[] m_pTitleAsUTF8; + m_pTitleAsUTF8 = NULL; +} + +long SegmentInfo::Parse() { + assert(m_pMuxingAppAsUTF8 == NULL); + assert(m_pWritingAppAsUTF8 == NULL); + assert(m_pTitleAsUTF8 == NULL); + + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long pos = m_start; + const long long stop = m_start + m_size; + + m_timecodeScale = 1000000; + m_duration = -1; + + while (pos < stop) { + long long id, size; + + const long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (id == libwebm::kMkvTimecodeScale) { + m_timecodeScale = UnserializeUInt(pReader, pos, size); + + if (m_timecodeScale <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvDuration) { + const long status = UnserializeFloat(pReader, pos, size, m_duration); + + if (status < 0) + return status; + + if (m_duration < 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvMuxingApp) { + const long status = + UnserializeString(pReader, pos, size, m_pMuxingAppAsUTF8); + + if (status) + return status; + } else if (id == libwebm::kMkvWritingApp) { + const long status = + UnserializeString(pReader, pos, size, m_pWritingAppAsUTF8); + + if (status) + return status; + } else if (id == libwebm::kMkvTitle) { + const long status = UnserializeString(pReader, pos, size, m_pTitleAsUTF8); + + if (status) + return status; + 
} + + pos += size; + + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + const double rollover_check = m_duration * m_timecodeScale; + if (rollover_check > static_cast<double>(LLONG_MAX)) + return E_FILE_FORMAT_INVALID; + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + return 0; +} + +long long SegmentInfo::GetTimeCodeScale() const { return m_timecodeScale; } + +long long SegmentInfo::GetDuration() const { + if (m_duration < 0) + return -1; + + assert(m_timecodeScale >= 1); + + const double dd = double(m_duration) * double(m_timecodeScale); + const long long d = static_cast<long long>(dd); + + return d; +} + +const char* SegmentInfo::GetMuxingAppAsUTF8() const { + return m_pMuxingAppAsUTF8; +} + +const char* SegmentInfo::GetWritingAppAsUTF8() const { + return m_pWritingAppAsUTF8; +} + +const char* SegmentInfo::GetTitleAsUTF8() const { return m_pTitleAsUTF8; } + +/////////////////////////////////////////////////////////////// +// ContentEncoding element +ContentEncoding::ContentCompression::ContentCompression() + : algo(0), settings(NULL), settings_len(0) {} + +ContentEncoding::ContentCompression::~ContentCompression() { + delete[] settings; +} + +ContentEncoding::ContentEncryption::ContentEncryption() + : algo(0), + key_id(NULL), + key_id_len(0), + signature(NULL), + signature_len(0), + sig_key_id(NULL), + sig_key_id_len(0), + sig_algo(0), + sig_hash_algo(0) {} + +ContentEncoding::ContentEncryption::~ContentEncryption() { + delete[] key_id; + delete[] signature; + delete[] sig_key_id; +} + +ContentEncoding::ContentEncoding() + : compression_entries_(NULL), + compression_entries_end_(NULL), + encryption_entries_(NULL), + encryption_entries_end_(NULL), + encoding_order_(0), + encoding_scope_(1), + encoding_type_(0) {} + +ContentEncoding::~ContentEncoding() { + ContentCompression** comp_i = compression_entries_; + ContentCompression** const comp_j = compression_entries_end_; + + while (comp_i != comp_j) { + ContentCompression* const comp = *comp_i++; + delete comp; + } + + delete[] compression_entries_; + + ContentEncryption** enc_i = encryption_entries_; + ContentEncryption** const enc_j = encryption_entries_end_; + + while (enc_i != enc_j) { + ContentEncryption* const enc = *enc_i++; + delete enc; + } + + delete[] encryption_entries_; +} + +const ContentEncoding::ContentCompression* +ContentEncoding::GetCompressionByIndex(unsigned long idx) const { + const ptrdiff_t count = compression_entries_end_ - compression_entries_; + assert(count >= 0); + + if (idx >= static_cast<unsigned long>(count)) + return NULL; + + return compression_entries_[idx]; +} + +unsigned long ContentEncoding::GetCompressionCount() const { + const ptrdiff_t count = compression_entries_end_ - compression_entries_; + assert(count >= 0); + + return static_cast<unsigned long>(count); +} + +const ContentEncoding::ContentEncryption* ContentEncoding::GetEncryptionByIndex( + unsigned long idx) const { + const ptrdiff_t count = encryption_entries_end_ - encryption_entries_; + assert(count >= 0); + + if (idx >= static_cast<unsigned long>(count)) + return NULL; + + return encryption_entries_[idx]; +} + +unsigned long ContentEncoding::GetEncryptionCount() const { + const ptrdiff_t count = encryption_entries_end_ - encryption_entries_; + assert(count >= 0); + + return static_cast<unsigned long>(count); +} + +long ContentEncoding::ParseContentEncAESSettingsEntry( + long long start, long long size, IMkvReader* pReader, + ContentEncAESSettings* aes) { + assert(pReader); + assert(aes); + + long long pos = start; + const long long stop = start + size; + + while (pos < stop) { + long long id, size; +
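// The only child element handled in this loop is AESSettingsCipherMode; + // the value 1 (AES-CTR, the only cipher mode the WebM encryption spec + // defines) is required, and any other value is rejected as invalid. +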
const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + if (id == libwebm::kMkvAESSettingsCipherMode) { + aes->cipher_mode = UnserializeUInt(pReader, pos, size); + if (aes->cipher_mode != 1) + return E_FILE_FORMAT_INVALID; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + return 0; +} + +long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, + IMkvReader* pReader) { + assert(pReader); + + long long pos = start; + const long long stop = start + size; + + // Count ContentCompression and ContentEncryption elements. + int compression_count = 0; + int encryption_count = 0; + + while (pos < stop) { + long long id, size; + const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + if (id == libwebm::kMkvContentCompression) + ++compression_count; + + if (id == libwebm::kMkvContentEncryption) + ++encryption_count; + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (compression_count <= 0 && encryption_count <= 0) + return -1; + + if (compression_count > 0) { + compression_entries_ = + new (std::nothrow) ContentCompression*[compression_count]; + if (!compression_entries_) + return -1; + compression_entries_end_ = compression_entries_; + } + + if (encryption_count > 0) { + encryption_entries_ = + new (std::nothrow) ContentEncryption*[encryption_count]; + if (!encryption_entries_) { + delete[] compression_entries_; + compression_entries_ = NULL; + return -1; + } + encryption_entries_end_ = encryption_entries_; + } + + pos = start; + while (pos < stop) { + long long id, size; + long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + if (id == libwebm::kMkvContentEncodingOrder) { + encoding_order_ = UnserializeUInt(pReader, pos, size); + } else if (id == libwebm::kMkvContentEncodingScope) { + encoding_scope_ = UnserializeUInt(pReader, pos, size); + if (encoding_scope_ < 1) + return -1; + } else if (id == libwebm::kMkvContentEncodingType) { + encoding_type_ = UnserializeUInt(pReader, pos, size); + } else if (id == libwebm::kMkvContentCompression) { + ContentCompression* const compression = + new (std::nothrow) ContentCompression(); + if (!compression) + return -1; + + status = ParseCompressionEntry(pos, size, pReader, compression); + if (status) { + delete compression; + return status; + } + assert(compression_count > 0); + *compression_entries_end_++ = compression; + } else if (id == libwebm::kMkvContentEncryption) { + ContentEncryption* const encryption = + new (std::nothrow) ContentEncryption(); + if (!encryption) + return -1; + + status = ParseEncryptionEntry(pos, size, pReader, encryption); + if (status) { + delete encryption; + return status; + } + assert(encryption_count > 0); + *encryption_entries_end_++ = encryption; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + return 0; +} + +long ContentEncoding::ParseCompressionEntry(long long start, long long size, + IMkvReader* pReader, + ContentCompression* compression) { + assert(pReader); + assert(compression); + + long long pos = start; + const long long stop = start + size; + + bool valid = false; + + while (pos < stop) { + long long id, size; + const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + 
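// ContentCompAlgo values defined by Matroska: 0 = zlib, 1 = bzlib, + // 2 = lzo1x, 3 = header stripping; only non-negativity is enforced here, + // and the value is recorded for the caller to interpret. +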
+ if (id == libwebm::kMkvContentCompAlgo) { + long long algo = UnserializeUInt(pReader, pos, size); + if (algo < 0) + return E_FILE_FORMAT_INVALID; + compression->algo = algo; + valid = true; + } else if (id == libwebm::kMkvContentCompSettings) { + if (size <= 0) + return E_FILE_FORMAT_INVALID; + + const size_t buflen = static_cast<size_t>(size); + unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen); + if (buf == NULL) + return -1; + + const int read_status = + pReader->Read(pos, static_cast<long>(buflen), buf); + if (read_status) { + delete[] buf; + return status; + } + + // There should be only one settings element per content compression. + if (compression->settings != NULL) { + delete[] buf; + return E_FILE_FORMAT_INVALID; + } + + compression->settings = buf; + compression->settings_len = buflen; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + // ContentCompAlgo is mandatory + if (!valid) + return E_FILE_FORMAT_INVALID; + + return 0; +} + +long ContentEncoding::ParseEncryptionEntry(long long start, long long size, + IMkvReader* pReader, + ContentEncryption* encryption) { + assert(pReader); + assert(encryption); + + long long pos = start; + const long long stop = start + size; + + while (pos < stop) { + long long id, size; + const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + if (id == libwebm::kMkvContentEncAlgo) { + encryption->algo = UnserializeUInt(pReader, pos, size); + if (encryption->algo != 5) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvContentEncKeyID) { + delete[] encryption->key_id; + encryption->key_id = NULL; + encryption->key_id_len = 0; + + if (size <= 0) + return E_FILE_FORMAT_INVALID; + + const size_t buflen = static_cast<size_t>(size); + unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen); + if (buf == NULL) + return -1; + + const int read_status = + pReader->Read(pos, static_cast<long>(buflen), buf); + if (read_status) { + delete[] buf; + return status; + } + + encryption->key_id = buf; + encryption->key_id_len = buflen; + } else if (id == libwebm::kMkvContentSignature) { + delete[] encryption->signature; + encryption->signature = NULL; + encryption->signature_len = 0; + + if (size <= 0) + return E_FILE_FORMAT_INVALID; + + const size_t buflen = static_cast<size_t>(size); + unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen); + if (buf == NULL) + return -1; + + const int read_status = + pReader->Read(pos, static_cast<long>(buflen), buf); + if (read_status) { + delete[] buf; + return status; + } + + encryption->signature = buf; + encryption->signature_len = buflen; + } else if (id == libwebm::kMkvContentSigKeyID) { + delete[] encryption->sig_key_id; + encryption->sig_key_id = NULL; + encryption->sig_key_id_len = 0; + + if (size <= 0) + return E_FILE_FORMAT_INVALID; + + const size_t buflen = static_cast<size_t>(size); + unsigned char* buf = SafeArrayAlloc<unsigned char>(1, buflen); + if (buf == NULL) + return -1; + + const int read_status = + pReader->Read(pos, static_cast<long>(buflen), buf); + if (read_status) { + delete[] buf; + return status; + } + + encryption->sig_key_id = buf; + encryption->sig_key_id_len = buflen; + } else if (id == libwebm::kMkvContentSigAlgo) { + encryption->sig_algo = UnserializeUInt(pReader, pos, size); + } else if (id == libwebm::kMkvContentSigHashAlgo) { + encryption->sig_hash_algo = UnserializeUInt(pReader, pos, size); + } else if (id == libwebm::kMkvContentEncAESSettings) { + const long status = ParseContentEncAESSettingsEntry( + pos, size, pReader, &encryption->aes_settings); + if
(status) + return status; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + return 0; +} + +Track::Track(Segment* pSegment, long long element_start, long long element_size) + : m_pSegment(pSegment), + m_element_start(element_start), + m_element_size(element_size), + content_encoding_entries_(NULL), + content_encoding_entries_end_(NULL) {} + +Track::~Track() { + Info& info = const_cast<Info&>(m_info); + info.Clear(); + + ContentEncoding** i = content_encoding_entries_; + ContentEncoding** const j = content_encoding_entries_end_; + + while (i != j) { + ContentEncoding* const encoding = *i++; + delete encoding; + } + + delete[] content_encoding_entries_; +} + +long Track::Create(Segment* pSegment, const Info& info, long long element_start, + long long element_size, Track*& pResult) { + if (pResult) + return -1; + + Track* const pTrack = + new (std::nothrow) Track(pSegment, element_start, element_size); + + if (pTrack == NULL) + return -1; // generic error + + const int status = info.Copy(pTrack->m_info); + + if (status) { // error + delete pTrack; + return status; + } + + pResult = pTrack; + return 0; // success +} + +Track::Info::Info() + : uid(0), + defaultDuration(0), + codecDelay(0), + seekPreRoll(0), + nameAsUTF8(NULL), + language(NULL), + codecId(NULL), + codecNameAsUTF8(NULL), + codecPrivate(NULL), + codecPrivateSize(0), + lacing(false) {} + +Track::Info::~Info() { Clear(); } + +void Track::Info::Clear() { + delete[] nameAsUTF8; + nameAsUTF8 = NULL; + + delete[] language; + language = NULL; + + delete[] codecId; + codecId = NULL; + + delete[] codecPrivate; + codecPrivate = NULL; + codecPrivateSize = 0; + + delete[] codecNameAsUTF8; + codecNameAsUTF8 = NULL; +} + +int Track::Info::CopyStr(char* Info::*str, Info& dst_) const { + if (str == static_cast<char* Info::*>(NULL)) + return -1; + + char*& dst = dst_.*str; + + if (dst) // should be NULL already + return -1; + + const char* const src = this->*str; + + if (src == NULL) + return 0; + + const size_t len = strlen(src); + + dst = SafeArrayAlloc<char>(1, len + 1); + + if (dst == NULL) + return -1; + + strcpy(dst, src); + + return 0; +} + +int Track::Info::Copy(Info& dst) const { + if (&dst == this) + return 0; + + dst.type = type; + dst.number = number; + dst.defaultDuration = defaultDuration; + dst.codecDelay = codecDelay; + dst.seekPreRoll = seekPreRoll; + dst.uid = uid; + dst.lacing = lacing; + dst.settings = settings; + + // We now copy the string member variables from src to dst. + // This involves memory allocation so in principle the operation + // can fail (indeed, that's why we have Info::Copy), so we must + // report this to the caller. An error return from this function + // therefore implies that the copy was only partially successful.
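+ // + // The calls below rely on CopyStr taking a pointer-to-data-member + // (char* Info::*), so one helper deep-copies any of the string fields; + // e.g. CopyStr(&Info::language, dst) reads this->*str from the source + // object and installs a freshly allocated copy in dst.language.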
+ + if (int status = CopyStr(&Info::nameAsUTF8, dst)) + return status; + + if (int status = CopyStr(&Info::language, dst)) + return status; + + if (int status = CopyStr(&Info::codecId, dst)) + return status; + + if (int status = CopyStr(&Info::codecNameAsUTF8, dst)) + return status; + + if (codecPrivateSize > 0) { + if (codecPrivate == NULL) + return -1; + + if (dst.codecPrivate) + return -1; + + if (dst.codecPrivateSize != 0) + return -1; + + dst.codecPrivate = SafeArrayAlloc<unsigned char>(1, codecPrivateSize); + + if (dst.codecPrivate == NULL) + return -1; + + memcpy(dst.codecPrivate, codecPrivate, codecPrivateSize); + dst.codecPrivateSize = codecPrivateSize; + } + + return 0; +} + +const BlockEntry* Track::GetEOS() const { return &m_eos; } + +long Track::GetType() const { return m_info.type; } + +long Track::GetNumber() const { return m_info.number; } + +unsigned long long Track::GetUid() const { return m_info.uid; } + +const char* Track::GetNameAsUTF8() const { return m_info.nameAsUTF8; } + +const char* Track::GetLanguage() const { return m_info.language; } + +const char* Track::GetCodecNameAsUTF8() const { return m_info.codecNameAsUTF8; } + +const char* Track::GetCodecId() const { return m_info.codecId; } + +const unsigned char* Track::GetCodecPrivate(size_t& size) const { + size = m_info.codecPrivateSize; + return m_info.codecPrivate; +} + +bool Track::GetLacing() const { return m_info.lacing; } + +unsigned long long Track::GetDefaultDuration() const { + return m_info.defaultDuration; +} + +unsigned long long Track::GetCodecDelay() const { return m_info.codecDelay; } + +unsigned long long Track::GetSeekPreRoll() const { return m_info.seekPreRoll; } + +long Track::GetFirst(const BlockEntry*& pBlockEntry) const { + const Cluster* pCluster = m_pSegment->GetFirst(); + + for (int i = 0;;) { + if (pCluster == NULL) { + pBlockEntry = GetEOS(); + return 1; + } + + if (pCluster->EOS()) { + if (m_pSegment->DoneParsing()) { + pBlockEntry = GetEOS(); + return 1; + } + + pBlockEntry = 0; + return E_BUFFER_NOT_FULL; + } + + long status = pCluster->GetFirst(pBlockEntry); + + if (status < 0) // error + return status; + + if (pBlockEntry == 0) { // empty cluster + pCluster = m_pSegment->GetNext(pCluster); + continue; + } + + for (;;) { + const Block* const pBlock = pBlockEntry->GetBlock(); + assert(pBlock); + + const long long tn = pBlock->GetTrackNumber(); + + if ((tn == m_info.number) && VetEntry(pBlockEntry)) + return 0; + + const BlockEntry* pNextEntry; + + status = pCluster->GetNext(pBlockEntry, pNextEntry); + + if (status < 0) // error + return status; + + if (pNextEntry == 0) + break; + + pBlockEntry = pNextEntry; + } + + ++i; + + if (i >= 100) + break; + + pCluster = m_pSegment->GetNext(pCluster); + } + + // NOTE: if we get here, it means that we didn't find a block with + // a matching track number. We interpret that as an error (which + // might be too conservative). + + pBlockEntry = GetEOS(); // so we can return a non-NULL value + return 1; +} + +long Track::GetNext(const BlockEntry* pCurrEntry, + const BlockEntry*& pNextEntry) const { + assert(pCurrEntry); + assert(!pCurrEntry->EOS()); //?
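+ + // Illustrative call pattern (a sketch, not code from this file): given a + // Track* pTrack, e.g. obtained via the Tracks object returned by + // Segment::GetTracks(), a caller walks this track's blocks as: + // + // const BlockEntry* pEntry; + // long status = pTrack->GetFirst(pEntry); + // while (status == 0 && !pEntry->EOS()) { + // const Block* const pBlock = pEntry->GetBlock(); + // // ... consume pBlock ... + // status = pTrack->GetNext(pEntry, pEntry); + // } + // + // Passing pEntry as both arguments is safe because pCurrEntry is taken + // by value.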
+ + const Block* const pCurrBlock = pCurrEntry->GetBlock(); + assert(pCurrBlock && pCurrBlock->GetTrackNumber() == m_info.number); + if (!pCurrBlock || pCurrBlock->GetTrackNumber() != m_info.number) + return -1; + + const Cluster* pCluster = pCurrEntry->GetCluster(); + assert(pCluster); + assert(!pCluster->EOS()); + + long status = pCluster->GetNext(pCurrEntry, pNextEntry); + + if (status < 0) // error + return status; + + for (int i = 0;;) { + while (pNextEntry) { + const Block* const pNextBlock = pNextEntry->GetBlock(); + assert(pNextBlock); + + if (pNextBlock->GetTrackNumber() == m_info.number) + return 0; + + pCurrEntry = pNextEntry; + + status = pCluster->GetNext(pCurrEntry, pNextEntry); + + if (status < 0) // error + return status; + } + + pCluster = m_pSegment->GetNext(pCluster); + + if (pCluster == NULL) { + pNextEntry = GetEOS(); + return 1; + } + + if (pCluster->EOS()) { + if (m_pSegment->DoneParsing()) { + pNextEntry = GetEOS(); + return 1; + } + + // TODO: there is a potential O(n^2) problem here: we tell the + // caller to (pre)load another cluster, which he does, but then he + // calls GetNext again, which repeats the same search. This is + // a pathological case, since the only way it can happen is if + // there exists a long sequence of clusters none of which contain a + // block from this track. One way around this problem is for the + // caller to be smarter when he loads another cluster: don't call + // us back until you have a cluster that contains a block from this + // track. (Of course, that's not cheap either, since our caller + // would have to scan each cluster as it's loaded, so that + // would just push back the problem.) + + pNextEntry = NULL; + return E_BUFFER_NOT_FULL; + } + + status = pCluster->GetFirst(pNextEntry); + + if (status < 0) // error + return status; + + if (pNextEntry == NULL) // empty cluster + continue; + + ++i; + + if (i >= 100) + break; + } + + // NOTE: if we get here, it means that we didn't find a block with + // a matching track number after lots of searching, so we give + // up trying. + + pNextEntry = GetEOS(); // so we can return a non-NULL value + return 1; +} + +bool Track::VetEntry(const BlockEntry* pBlockEntry) const { + assert(pBlockEntry); + const Block* const pBlock = pBlockEntry->GetBlock(); + assert(pBlock); + assert(pBlock->GetTrackNumber() == m_info.number); + if (!pBlock || pBlock->GetTrackNumber() != m_info.number) + return false; + + // This function is used during a seek to determine whether the + // frame is a valid seek target. This default function simply + // returns true, which means all frames are valid seek targets. + // It gets overridden by the VideoTrack class, because only video + // keyframes can be used as seek target.
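+ // + // For reference, the video override amounts to something like the + // following (a sketch; the actual VideoTrack implementation appears + // elsewhere in this file): + // + // return Track::VetEntry(pBlockEntry) && + // pBlockEntry->GetBlock()->IsKey();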
+ + return true; +} + +long Track::Seek(long long time_ns, const BlockEntry*& pResult) const { + const long status = GetFirst(pResult); + + if (status < 0) // buffer underflow, etc + return status; + + assert(pResult); + + if (pResult->EOS()) + return 0; + + const Cluster* pCluster = pResult->GetCluster(); + assert(pCluster); + assert(pCluster->GetIndex() >= 0); + + if (time_ns <= pResult->GetBlock()->GetTime(pCluster)) + return 0; + + Cluster** const clusters = m_pSegment->m_clusters; + assert(clusters); + + const long count = m_pSegment->GetCount(); // loaded only, not preloaded + assert(count > 0); + + Cluster** const i = clusters + pCluster->GetIndex(); + assert(i); + assert(*i == pCluster); + assert(pCluster->GetTime() <= time_ns); + + Cluster** const j = clusters + count; + + Cluster** lo = i; + Cluster** hi = j; + + while (lo < hi) { + // INVARIANT: + //[i, lo) <= time_ns + //[lo, hi) ? + //[hi, j) > time_ns + + Cluster** const mid = lo + (hi - lo) / 2; + assert(mid < hi); + + pCluster = *mid; + assert(pCluster); + assert(pCluster->GetIndex() >= 0); + assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters)); + + const long long t = pCluster->GetTime(); + + if (t <= time_ns) + lo = mid + 1; + else + hi = mid; + + assert(lo <= hi); + } + + assert(lo == hi); + assert(lo > i); + assert(lo <= j); + + while (lo > i) { + pCluster = *--lo; + assert(pCluster); + assert(pCluster->GetTime() <= time_ns); + + pResult = pCluster->GetEntry(this); + + if ((pResult != 0) && !pResult->EOS()) + return 0; + + // landed on empty cluster (no entries) + } + + pResult = GetEOS(); // weird + return 0; +} + +const ContentEncoding* Track::GetContentEncodingByIndex( + unsigned long idx) const { + const ptrdiff_t count = + content_encoding_entries_end_ - content_encoding_entries_; + assert(count >= 0); + + if (idx >= static_cast<unsigned long>(count)) + return NULL; + + return content_encoding_entries_[idx]; +} + +unsigned long Track::GetContentEncodingCount() const { + const ptrdiff_t count = + content_encoding_entries_end_ - content_encoding_entries_; + assert(count >= 0); + + return static_cast<unsigned long>(count); +} + +long Track::ParseContentEncodingsEntry(long long start, long long size) { + IMkvReader* const pReader = m_pSegment->m_pReader; + assert(pReader); + + long long pos = start; + const long long stop = start + size; + + // Count ContentEncoding elements.
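+ // (This is the same two-pass pattern used by ParseContentEncodingEntry + // above: walk the payload once just to count children, allocate an + // exactly-sized array, then walk it again to parse each child into place.)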
+ int count = 0; + while (pos < stop) { + long long id, size; + const long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + // pos now designates start of element + if (id == libwebm::kMkvContentEncoding) + ++count; + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (count <= 0) + return -1; + + content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count]; + if (!content_encoding_entries_) + return -1; + + content_encoding_entries_end_ = content_encoding_entries_; + + pos = start; + while (pos < stop) { + long long id, size; + long status = ParseElementHeader(pReader, pos, stop, id, size); + if (status < 0) // error + return status; + + // pos now designates start of element + if (id == libwebm::kMkvContentEncoding) { + ContentEncoding* const content_encoding = + new (std::nothrow) ContentEncoding(); + if (!content_encoding) + return -1; + + status = content_encoding->ParseContentEncodingEntry(pos, size, pReader); + if (status) { + delete content_encoding; + return status; + } + + *content_encoding_entries_end_++ = content_encoding; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + return 0; +} + +Track::EOSBlock::EOSBlock() : BlockEntry(NULL, LONG_MIN) {} + +BlockEntry::Kind Track::EOSBlock::GetKind() const { return kBlockEOS; } + +const Block* Track::EOSBlock::GetBlock() const { return NULL; } + +bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos, + long long value_size, bool is_x, + PrimaryChromaticity** chromaticity) { + if (!reader) + return false; + + if (!*chromaticity) + *chromaticity = new PrimaryChromaticity(); + + if (!*chromaticity) + return false; + + PrimaryChromaticity* pc = *chromaticity; + float* value = is_x ? &pc->x : &pc->y; + + double parser_value = 0; + const long long parse_status = + UnserializeFloat(reader, read_pos, value_size, parser_value); + + // Valid range is [0, 1]. Make sure the double is representable as a float + // before casting. 
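+  // (A positive value below FLT_MIN, e.g. 1e-39 when FLT_MIN is about
+  // 1.18e-38, would turn into a subnormal float when cast, so such
+  // inputs are rejected as well.)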
+  if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 ||
+      (parser_value > 0.0 && parser_value < FLT_MIN))
+    return false;
+
+  *value = static_cast<float>(parser_value);
+
+  return true;
+}
+
+bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
+                              long long mm_size, MasteringMetadata** mm) {
+  if (!reader || *mm)
+    return false;
+
+  std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  if (!mm_ptr.get())
+    return false;
+
+  const long long mm_end = mm_start + mm_size;
+  long long read_pos = mm_start;
+
+  while (read_pos < mm_end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long long status =
+        ParseElementHeader(reader, read_pos, mm_end, child_id, child_size);
+    if (status < 0)
+      return false;
+
+    if (child_id == libwebm::kMkvLuminanceMax) {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      if (value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
+      mm_ptr->luminance_max = static_cast<float>(value);
+      if (value_parse_status < 0 || mm_ptr->luminance_max < 0.0 ||
+          mm_ptr->luminance_max > 9999.99) {
+        return false;
+      }
+    } else if (child_id == libwebm::kMkvLuminanceMin) {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      if (value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
+      mm_ptr->luminance_min = static_cast<float>(value);
+      if (value_parse_status < 0 || mm_ptr->luminance_min < 0.0 ||
+          mm_ptr->luminance_min > 999.9999) {
+        return false;
+      }
+    } else {
+      bool is_x = false;
+      PrimaryChromaticity** chromaticity;
+      switch (child_id) {
+        case libwebm::kMkvPrimaryRChromaticityX:
+        case libwebm::kMkvPrimaryRChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryRChromaticityX;
+          chromaticity = &mm_ptr->r;
+          break;
+        case libwebm::kMkvPrimaryGChromaticityX:
+        case libwebm::kMkvPrimaryGChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryGChromaticityX;
+          chromaticity = &mm_ptr->g;
+          break;
+        case libwebm::kMkvPrimaryBChromaticityX:
+        case libwebm::kMkvPrimaryBChromaticityY:
+          is_x = child_id == libwebm::kMkvPrimaryBChromaticityX;
+          chromaticity = &mm_ptr->b;
+          break;
+        case libwebm::kMkvWhitePointChromaticityX:
+        case libwebm::kMkvWhitePointChromaticityY:
+          is_x = child_id == libwebm::kMkvWhitePointChromaticityX;
+          chromaticity = &mm_ptr->white_point;
+          break;
+        default:
+          return false;
+      }
+      const bool value_parse_status = PrimaryChromaticity::Parse(
+          reader, read_pos, child_size, is_x, chromaticity);
+      if (!value_parse_status)
+        return false;
+    }
+
+    read_pos += child_size;
+    if (read_pos > mm_end)
+      return false;
+  }
+
+  *mm = mm_ptr.release();
+  return true;
+}
+
+bool Colour::Parse(IMkvReader* reader, long long colour_start,
+                   long long colour_size, Colour** colour) {
+  if (!reader || *colour)
+    return false;
+
+  std::unique_ptr<Colour> colour_ptr(new Colour());
+  if (!colour_ptr.get())
+    return false;
+
+  const long long colour_end = colour_start + colour_size;
+  long long read_pos = colour_start;
+
+  while (read_pos < colour_end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long status =
+        ParseElementHeader(reader, read_pos, colour_end, child_id, child_size);
+    if (status < 0)
+      return false;
+
+    if (child_id == libwebm::kMkvMatrixCoefficients) {
+      colour_ptr->matrix_coefficients =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->matrix_coefficients < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvBitsPerChannel) {
+      colour_ptr->bits_per_channel =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->bits_per_channel < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSubsamplingHorz) {
+      colour_ptr->chroma_subsampling_horz =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_subsampling_horz < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSubsamplingVert) {
+      colour_ptr->chroma_subsampling_vert =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_subsampling_vert < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvCbSubsamplingHorz) {
+      colour_ptr->cb_subsampling_horz =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->cb_subsampling_horz < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvCbSubsamplingVert) {
+      colour_ptr->cb_subsampling_vert =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->cb_subsampling_vert < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSitingHorz) {
+      colour_ptr->chroma_siting_horz =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_siting_horz < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvChromaSitingVert) {
+      colour_ptr->chroma_siting_vert =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->chroma_siting_vert < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvRange) {
+      colour_ptr->range = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->range < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvTransferCharacteristics) {
+      colour_ptr->transfer_characteristics =
+          UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->transfer_characteristics < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvPrimaries) {
+      colour_ptr->primaries = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->primaries < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvMaxCLL) {
+      colour_ptr->max_cll = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->max_cll < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvMaxFALL) {
+      colour_ptr->max_fall = UnserializeUInt(reader, read_pos, child_size);
+      if (colour_ptr->max_fall < 0)
+        return false;
+    } else if (child_id == libwebm::kMkvMasteringMetadata) {
+      if (!MasteringMetadata::Parse(reader, read_pos, child_size,
+                                    &colour_ptr->mastering_metadata))
+        return false;
+    } else {
+      return false;
+    }
+
+    read_pos += child_size;
+    if (read_pos > colour_end)
+      return false;
+  }
+  *colour = colour_ptr.release();
+  return true;
+}
+
+bool Projection::Parse(IMkvReader* reader, long long start, long long size,
+                       Projection** projection) {
+  if (!reader || *projection)
+    return false;
+
+  std::unique_ptr<Projection> projection_ptr(new Projection());
+  if (!projection_ptr.get())
+    return false;
+
+  const long long end = start + size;
+  long long read_pos = start;
+
+  while (read_pos < end) {
+    long long child_id = 0;
+    long long child_size = 0;
+
+    const long long status =
+        ParseElementHeader(reader, read_pos, end, child_id, child_size);
+    if (status < 0)
+      return false;
+
+    if (child_id == libwebm::kMkvProjectionType) {
+      long long projection_type = kTypeNotPresent;
+      projection_type = UnserializeUInt(reader, read_pos, child_size);
+      if (projection_type < 0)
+        return false;
+
+      projection_ptr->type = static_cast<ProjectionType>(projection_type);
+    } else if (child_id == libwebm::kMkvProjectionPrivate) {
+      unsigned char* data = SafeArrayAlloc<unsigned char>(1, child_size);
+
+      if (data == NULL)
+        return false;
+
+      const int status =
+          reader->Read(read_pos, static_cast<long>(child_size), data);
+
+      if (status) {
+        delete[] data;
+        return false;
+      }
+
+      projection_ptr->private_data = data;
+      projection_ptr->private_data_length = static_cast<size_t>(child_size);
+    } else {
+      double value = 0;
+      const long long value_parse_status =
+          UnserializeFloat(reader, read_pos, child_size, value);
+      // Make sure value is representable as a float before casting.
+      if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
+
+      switch (child_id) {
+        case libwebm::kMkvProjectionPoseYaw:
+          projection_ptr->pose_yaw = static_cast<float>(value);
+          break;
+        case libwebm::kMkvProjectionPosePitch:
+          projection_ptr->pose_pitch = static_cast<float>(value);
+          break;
+        case libwebm::kMkvProjectionPoseRoll:
+          projection_ptr->pose_roll = static_cast<float>(value);
+          break;
+        default:
+          return false;
+      }
+    }
+
+    read_pos += child_size;
+    if (read_pos > end)
+      return false;
+  }
+
+  *projection = projection_ptr.release();
+  return true;
+}
+
+VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
+                       long long element_size)
+    : Track(pSegment, element_start, element_size),
+      m_colour_space(NULL),
+      m_colour(NULL),
+      m_projection(NULL) {}
+
+VideoTrack::~VideoTrack() {
+  delete m_colour;
+  delete m_projection;
+}
+
+long VideoTrack::Parse(Segment* pSegment, const Info& info,
+                       long long element_start, long long element_size,
+                       VideoTrack*& pResult) {
+  if (pResult)
+    return -1;
+
+  if (info.type != Track::kVideo)
+    return -1;
+
+  long long width = 0;
+  long long height = 0;
+  long long display_width = 0;
+  long long display_height = 0;
+  long long display_unit = 0;
+  long long stereo_mode = 0;
+
+  double rate = 0.0;
+  char* colour_space = NULL;
+
+  IMkvReader* const pReader = pSegment->m_pReader;
+
+  const Settings& s = info.settings;
+  assert(s.start >= 0);
+  assert(s.size >= 0);
+
+  long long pos = s.start;
+  assert(pos >= 0);
+
+  const long long stop = pos + s.size;
+
+  std::unique_ptr<Colour> colour_ptr;
+  std::unique_ptr<Projection> projection_ptr;
+
+  while (pos < stop) {
+    long long id, size;
+
+    const long status = ParseElementHeader(pReader, pos, stop, id, size);
+
+    if (status < 0)  // error
+      return status;
+
+    if (id == libwebm::kMkvPixelWidth) {
+      width = UnserializeUInt(pReader, pos, size);
+
+      if (width <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvPixelHeight) {
+      height = UnserializeUInt(pReader, pos, size);
+
+      if (height <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvDisplayWidth) {
+      display_width = UnserializeUInt(pReader, pos, size);
+
+      if (display_width <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvDisplayHeight) {
+      display_height = UnserializeUInt(pReader, pos, size);
+
+      if (display_height <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvDisplayUnit) {
+      display_unit = UnserializeUInt(pReader, pos, size);
+
+      if (display_unit < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvStereoMode) {
+      stereo_mode = UnserializeUInt(pReader, pos, size);
+
+      if (stereo_mode < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvFrameRate) {
+      const long status = UnserializeFloat(pReader, pos, size, rate);
+
+      if (status < 0)
+        return status;
+
+      if (rate <= 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvColour) {
+      Colour* colour = NULL;
+      if (!Colour::Parse(pReader, pos, size, &colour)) {
+        return E_FILE_FORMAT_INVALID;
+      }
else { + colour_ptr.reset(colour); + } + } else if (id == libwebm::kMkvProjection) { + Projection* projection = NULL; + if (!Projection::Parse(pReader, pos, size, &projection)) { + return E_FILE_FORMAT_INVALID; + } else { + projection_ptr.reset(projection); + } + } else if (id == libwebm::kMkvColourSpace) { + const long status = UnserializeString(pReader, pos, size, colour_space); + if (status < 0) + return status; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + VideoTrack* const pTrack = + new (std::nothrow) VideoTrack(pSegment, element_start, element_size); + + if (pTrack == NULL) + return -1; // generic error + + const int status = info.Copy(pTrack->m_info); + + if (status) { // error + delete pTrack; + return status; + } + + pTrack->m_width = width; + pTrack->m_height = height; + pTrack->m_display_width = display_width; + pTrack->m_display_height = display_height; + pTrack->m_display_unit = display_unit; + pTrack->m_stereo_mode = stereo_mode; + pTrack->m_rate = rate; + pTrack->m_colour = colour_ptr.release(); + pTrack->m_colour_space = colour_space; + pTrack->m_projection = projection_ptr.release(); + + pResult = pTrack; + return 0; // success +} + +bool VideoTrack::VetEntry(const BlockEntry* pBlockEntry) const { + return Track::VetEntry(pBlockEntry) && pBlockEntry->GetBlock()->IsKey(); +} + +long VideoTrack::Seek(long long time_ns, const BlockEntry*& pResult) const { + const long status = GetFirst(pResult); + + if (status < 0) // buffer underflow, etc + return status; + + assert(pResult); + + if (pResult->EOS()) + return 0; + + const Cluster* pCluster = pResult->GetCluster(); + assert(pCluster); + assert(pCluster->GetIndex() >= 0); + + if (time_ns <= pResult->GetBlock()->GetTime(pCluster)) + return 0; + + Cluster** const clusters = m_pSegment->m_clusters; + assert(clusters); + + const long count = m_pSegment->GetCount(); // loaded only, not pre-loaded + assert(count > 0); + + Cluster** const i = clusters + pCluster->GetIndex(); + assert(i); + assert(*i == pCluster); + assert(pCluster->GetTime() <= time_ns); + + Cluster** const j = clusters + count; + + Cluster** lo = i; + Cluster** hi = j; + + while (lo < hi) { + // INVARIANT: + //[i, lo) <= time_ns + //[lo, hi) ? 
+ //[hi, j) > time_ns + + Cluster** const mid = lo + (hi - lo) / 2; + assert(mid < hi); + + pCluster = *mid; + assert(pCluster); + assert(pCluster->GetIndex() >= 0); + assert(pCluster->GetIndex() == long(mid - m_pSegment->m_clusters)); + + const long long t = pCluster->GetTime(); + + if (t <= time_ns) + lo = mid + 1; + else + hi = mid; + + assert(lo <= hi); + } + + assert(lo == hi); + assert(lo > i); + assert(lo <= j); + + pCluster = *--lo; + assert(pCluster); + assert(pCluster->GetTime() <= time_ns); + + pResult = pCluster->GetEntry(this, time_ns); + + if ((pResult != 0) && !pResult->EOS()) // found a keyframe + return 0; + + while (lo != i) { + pCluster = *--lo; + assert(pCluster); + assert(pCluster->GetTime() <= time_ns); + + pResult = pCluster->GetEntry(this, time_ns); + + if ((pResult != 0) && !pResult->EOS()) + return 0; + } + + // weird: we're on the first cluster, but no keyframe found + // should never happen but we must return something anyway + + pResult = GetEOS(); + return 0; +} + +Colour* VideoTrack::GetColour() const { return m_colour; } + +Projection* VideoTrack::GetProjection() const { return m_projection; } + +long long VideoTrack::GetWidth() const { return m_width; } + +long long VideoTrack::GetHeight() const { return m_height; } + +long long VideoTrack::GetDisplayWidth() const { + return m_display_width > 0 ? m_display_width : GetWidth(); +} + +long long VideoTrack::GetDisplayHeight() const { + return m_display_height > 0 ? m_display_height : GetHeight(); +} + +long long VideoTrack::GetDisplayUnit() const { return m_display_unit; } + +long long VideoTrack::GetStereoMode() const { return m_stereo_mode; } + +double VideoTrack::GetFrameRate() const { return m_rate; } + +AudioTrack::AudioTrack(Segment* pSegment, long long element_start, + long long element_size) + : Track(pSegment, element_start, element_size) {} + +long AudioTrack::Parse(Segment* pSegment, const Info& info, + long long element_start, long long element_size, + AudioTrack*& pResult) { + if (pResult) + return -1; + + if (info.type != Track::kAudio) + return -1; + + IMkvReader* const pReader = pSegment->m_pReader; + + const Settings& s = info.settings; + assert(s.start >= 0); + assert(s.size >= 0); + + long long pos = s.start; + assert(pos >= 0); + + const long long stop = pos + s.size; + + double rate = 8000.0; // MKV default + long long channels = 1; + long long bit_depth = 0; + + while (pos < stop) { + long long id, size; + + long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (id == libwebm::kMkvSamplingFrequency) { + status = UnserializeFloat(pReader, pos, size, rate); + + if (status < 0) + return status; + + if (rate <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvChannels) { + channels = UnserializeUInt(pReader, pos, size); + + if (channels <= 0) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvBitDepth) { + bit_depth = UnserializeUInt(pReader, pos, size); + + if (bit_depth <= 0) + return E_FILE_FORMAT_INVALID; + } + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + AudioTrack* const pTrack = + new (std::nothrow) AudioTrack(pSegment, element_start, element_size); + + if (pTrack == NULL) + return -1; // generic error + + const int status = info.Copy(pTrack->m_info); + + if (status) { + delete pTrack; + return status; + } + + pTrack->m_rate = rate; + pTrack->m_channels = channels; + pTrack->m_bitDepth = 
bit_depth; + + pResult = pTrack; + return 0; // success +} + +double AudioTrack::GetSamplingRate() const { return m_rate; } + +long long AudioTrack::GetChannels() const { return m_channels; } + +long long AudioTrack::GetBitDepth() const { return m_bitDepth; } + +Tracks::Tracks(Segment* pSegment, long long start, long long size_, + long long element_start, long long element_size) + : m_pSegment(pSegment), + m_start(start), + m_size(size_), + m_element_start(element_start), + m_element_size(element_size), + m_trackEntries(NULL), + m_trackEntriesEnd(NULL) {} + +long Tracks::Parse() { + assert(m_trackEntries == NULL); + assert(m_trackEntriesEnd == NULL); + + const long long stop = m_start + m_size; + IMkvReader* const pReader = m_pSegment->m_pReader; + + int count = 0; + long long pos = m_start; + + while (pos < stop) { + long long id, size; + + const long status = ParseElementHeader(pReader, pos, stop, id, size); + + if (status < 0) // error + return status; + + if (size == 0) // weird + continue; + + if (id == libwebm::kMkvTrackEntry) + ++count; + + pos += size; // consume payload + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + if (count <= 0) + return 0; // success + + m_trackEntries = new (std::nothrow) Track*[count]; + + if (m_trackEntries == NULL) + return -1; + + m_trackEntriesEnd = m_trackEntries; + + pos = m_start; + + while (pos < stop) { + const long long element_start = pos; + + long long id, payload_size; + + const long status = + ParseElementHeader(pReader, pos, stop, id, payload_size); + + if (status < 0) // error + return status; + + if (payload_size == 0) // weird + continue; + + const long long payload_stop = pos + payload_size; + assert(payload_stop <= stop); // checked in ParseElement + + const long long element_size = payload_stop - element_start; + + if (id == libwebm::kMkvTrackEntry) { + Track*& pTrack = *m_trackEntriesEnd; + pTrack = NULL; + + const long status = ParseTrackEntry(pos, payload_size, element_start, + element_size, pTrack); + if (status) + return status; + + if (pTrack) + ++m_trackEntriesEnd; + } + + pos = payload_stop; + if (pos > stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + return 0; // success +} + +unsigned long Tracks::GetTracksCount() const { + const ptrdiff_t result = m_trackEntriesEnd - m_trackEntries; + assert(result >= 0); + + return static_cast(result); +} + +long Tracks::ParseTrackEntry(long long track_start, long long track_size, + long long element_start, long long element_size, + Track*& pResult) const { + if (pResult) + return -1; + + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long pos = track_start; + const long long track_stop = track_start + track_size; + + Track::Info info; + + info.type = 0; + info.number = 0; + info.uid = 0; + info.defaultDuration = 0; + + Track::Settings v; + v.start = -1; + v.size = -1; + + Track::Settings a; + a.start = -1; + a.size = -1; + + Track::Settings e; // content_encodings_settings; + e.start = -1; + e.size = -1; + + long long lacing = 1; // default is true + + while (pos < track_stop) { + long long id, size; + + const long status = ParseElementHeader(pReader, pos, track_stop, id, size); + + if (status < 0) // error + return status; + + if (size < 0) + return E_FILE_FORMAT_INVALID; + + const long long start = pos; + + if (id == libwebm::kMkvVideo) { + v.start = start; + v.size = size; + } else if (id == libwebm::kMkvAudio) { + a.start = start; + a.size = size; + } else if (id 
== libwebm::kMkvContentEncodings) { + e.start = start; + e.size = size; + } else if (id == libwebm::kMkvTrackUID) { + if (size > 8) + return E_FILE_FORMAT_INVALID; + + info.uid = 0; + + long long pos_ = start; + const long long pos_end = start + size; + + while (pos_ != pos_end) { + unsigned char b; + + const int status = pReader->Read(pos_, 1, &b); + + if (status) + return status; + + info.uid <<= 8; + info.uid |= b; + + ++pos_; + } + } else if (id == libwebm::kMkvTrackNumber) { + const long long num = UnserializeUInt(pReader, pos, size); + + if ((num <= 0) || (num > 127)) + return E_FILE_FORMAT_INVALID; + + info.number = static_cast(num); + } else if (id == libwebm::kMkvTrackType) { + const long long type = UnserializeUInt(pReader, pos, size); + + if ((type <= 0) || (type > 254)) + return E_FILE_FORMAT_INVALID; + + info.type = static_cast(type); + } else if (id == libwebm::kMkvName) { + const long status = + UnserializeString(pReader, pos, size, info.nameAsUTF8); + + if (status) + return status; + } else if (id == libwebm::kMkvLanguage) { + const long status = UnserializeString(pReader, pos, size, info.language); + + if (status) + return status; + } else if (id == libwebm::kMkvDefaultDuration) { + const long long duration = UnserializeUInt(pReader, pos, size); + + if (duration < 0) + return E_FILE_FORMAT_INVALID; + + info.defaultDuration = static_cast(duration); + } else if (id == libwebm::kMkvCodecID) { + const long status = UnserializeString(pReader, pos, size, info.codecId); + + if (status) + return status; + } else if (id == libwebm::kMkvFlagLacing) { + lacing = UnserializeUInt(pReader, pos, size); + + if ((lacing < 0) || (lacing > 1)) + return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvCodecPrivate) { + delete[] info.codecPrivate; + info.codecPrivate = NULL; + info.codecPrivateSize = 0; + + const size_t buflen = static_cast(size); + + if (buflen) { + unsigned char* buf = SafeArrayAlloc(1, buflen); + + if (buf == NULL) + return -1; + + const int status = pReader->Read(pos, static_cast(buflen), buf); + + if (status) { + delete[] buf; + return status; + } + + info.codecPrivate = buf; + info.codecPrivateSize = buflen; + } + } else if (id == libwebm::kMkvCodecName) { + const long status = + UnserializeString(pReader, pos, size, info.codecNameAsUTF8); + + if (status) + return status; + } else if (id == libwebm::kMkvCodecDelay) { + info.codecDelay = UnserializeUInt(pReader, pos, size); + } else if (id == libwebm::kMkvSeekPreRoll) { + info.seekPreRoll = UnserializeUInt(pReader, pos, size); + } + + pos += size; // consume payload + if (pos > track_stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != track_stop) + return E_FILE_FORMAT_INVALID; + + if (info.number <= 0) // not specified + return E_FILE_FORMAT_INVALID; + + if (GetTrackByNumber(info.number)) + return E_FILE_FORMAT_INVALID; + + if (info.type <= 0) // not specified + return E_FILE_FORMAT_INVALID; + + info.lacing = (lacing > 0) ? 
true : false; + + if (info.type == Track::kVideo) { + if (v.start < 0) + return E_FILE_FORMAT_INVALID; + + if (a.start >= 0) + return E_FILE_FORMAT_INVALID; + + info.settings = v; + + VideoTrack* pTrack = NULL; + + const long status = VideoTrack::Parse(m_pSegment, info, element_start, + element_size, pTrack); + + if (status) + return status; + + pResult = pTrack; + assert(pResult); + + if (e.start >= 0) + pResult->ParseContentEncodingsEntry(e.start, e.size); + } else if (info.type == Track::kAudio) { + if (a.start < 0) + return E_FILE_FORMAT_INVALID; + + if (v.start >= 0) + return E_FILE_FORMAT_INVALID; + + info.settings = a; + + AudioTrack* pTrack = NULL; + + const long status = AudioTrack::Parse(m_pSegment, info, element_start, + element_size, pTrack); + + if (status) + return status; + + pResult = pTrack; + assert(pResult); + + if (e.start >= 0) + pResult->ParseContentEncodingsEntry(e.start, e.size); + } else { + // neither video nor audio - probably metadata or subtitles + + if (a.start >= 0) + return E_FILE_FORMAT_INVALID; + + if (v.start >= 0) + return E_FILE_FORMAT_INVALID; + + if (info.type == Track::kMetadata && e.start >= 0) + return E_FILE_FORMAT_INVALID; + + info.settings.start = -1; + info.settings.size = 0; + + Track* pTrack = NULL; + + const long status = + Track::Create(m_pSegment, info, element_start, element_size, pTrack); + + if (status) + return status; + + pResult = pTrack; + assert(pResult); + } + + return 0; // success +} + +Tracks::~Tracks() { + Track** i = m_trackEntries; + Track** const j = m_trackEntriesEnd; + + while (i != j) { + Track* const pTrack = *i++; + delete pTrack; + } + + delete[] m_trackEntries; +} + +const Track* Tracks::GetTrackByNumber(long tn) const { + if (tn < 0) + return NULL; + + Track** i = m_trackEntries; + Track** const j = m_trackEntriesEnd; + + while (i != j) { + Track* const pTrack = *i++; + + if (pTrack == NULL) + continue; + + if (tn == pTrack->GetNumber()) + return pTrack; + } + + return NULL; // not found +} + +const Track* Tracks::GetTrackByIndex(unsigned long idx) const { + const ptrdiff_t count = m_trackEntriesEnd - m_trackEntries; + + if (idx >= static_cast(count)) + return NULL; + + return m_trackEntries[idx]; +} + +long Cluster::Load(long long& pos, long& len) const { + if (m_pSegment == NULL) + return E_PARSE_FAILED; + + if (m_timecode >= 0) // at least partially loaded + return 0; + + if (m_pos != m_element_start || m_element_size >= 0) + return E_PARSE_FAILED; + + IMkvReader* const pReader = m_pSegment->m_pReader; + long long total, avail; + const int status = pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + if (total >= 0 && (avail > total || m_pos > total)) + return E_FILE_FORMAT_INVALID; + + pos = m_pos; + + long long cluster_size = -1; + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error or underflow + return static_cast(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long id_ = ReadID(pReader, pos, len); + + if (id_ < 0) // error + return static_cast(id_); + + if (id_ != libwebm::kMkvCluster) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume id + + // read cluster size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + 
if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast(cluster_size); + + if (size == 0) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume length of size of element + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size != unknown_size) + cluster_size = size; + + // pos points to start of payload + long long timecode = -1; + long long new_pos = -1; + bool bBlock = false; + + long long cluster_stop = (cluster_size < 0) ? -1 : pos + cluster_size; + + for (;;) { + if ((cluster_stop >= 0) && (pos >= cluster_stop)) + break; + + // Parse ID + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long id = ReadID(pReader, pos, len); + + if (id < 0) // error + return static_cast(id); + + if (id == 0) + return E_FILE_FORMAT_INVALID; + + // This is the distinguished set of ID's we use to determine + // that we have exhausted the sub-element's inside the cluster + // whose ID we parsed earlier. + + if (id == libwebm::kMkvCluster) + break; + + if (id == libwebm::kMkvCues) + break; + + pos += len; // consume ID field + + // Parse Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast(size); + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume size field + + if ((cluster_stop >= 0) && (pos > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + // pos now points to start of payload + + if (size == 0) + continue; + + if ((cluster_stop >= 0) && ((pos + size) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvTimecode) { + len = static_cast(size); + + if ((pos + size) > avail) + return E_BUFFER_NOT_FULL; + + timecode = UnserializeUInt(pReader, pos, size); + + if (timecode < 0) // error (or underflow) + return static_cast(timecode); + + new_pos = pos + size; + + if (bBlock) + break; + } else if (id == libwebm::kMkvBlockGroup) { + bBlock = true; + break; + } else if (id == libwebm::kMkvSimpleBlock) { + bBlock = true; + break; + } + + pos += size; // consume payload + if (cluster_stop >= 0 && pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + } + + if (cluster_stop >= 0 && pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + + if (timecode < 0) // no timecode found + return E_FILE_FORMAT_INVALID; + + if (!bBlock) + return E_FILE_FORMAT_INVALID; + + m_pos = new_pos; // designates position just beyond timecode payload + m_timecode = timecode; // m_timecode >= 0 means we're partially loaded + + if (cluster_size >= 0) + m_element_size = cluster_stop - m_element_start; + + return 0; +} + +long Cluster::Parse(long long& pos, long& len) const { + long status = Load(pos, len); + + if (status < 0) + return status; 
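+  // Load() succeeded, so the cluster's ID, size, and timecode are known;
+  // Parse() now resumes at m_pos and parses at most one new block per
+  // call, returning E_BUFFER_NOT_FULL whenever the reader has too few
+  // bytes buffered.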
+ + if (m_pos < m_element_start || m_timecode < 0) + return E_PARSE_FAILED; + + const long long cluster_stop = + (m_element_size < 0) ? -1 : m_element_start + m_element_size; + + if ((cluster_stop >= 0) && (m_pos >= cluster_stop)) + return 1; // nothing else to do + + IMkvReader* const pReader = m_pSegment->m_pReader; + + long long total, avail; + + status = pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + if (total >= 0 && avail > total) + return E_FILE_FORMAT_INVALID; + + pos = m_pos; + + for (;;) { + if ((cluster_stop >= 0) && (pos >= cluster_stop)) + break; + + if ((total >= 0) && (pos >= total)) { + if (m_element_size < 0) + m_element_size = pos - m_element_start; + + break; + } + + // Parse ID + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + long long result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long id = ReadID(pReader, pos, len); + + if (id < 0) + return E_FILE_FORMAT_INVALID; + + // This is the distinguished set of ID's we use to determine + // that we have exhausted the sub-element's inside the cluster + // whose ID we parsed earlier. + + if ((id == libwebm::kMkvCluster) || (id == libwebm::kMkvCues)) { + if (m_element_size < 0) + m_element_size = pos - m_element_start; + + break; + } + + pos += len; // consume ID field + + // Parse Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast(size); + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume size field + + if ((cluster_stop >= 0) && (pos > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + // pos now points to start of payload + + if (size == 0) + continue; + + // const long long block_start = pos; + const long long block_stop = pos + size; + + if (cluster_stop >= 0) { + if (block_stop > cluster_stop) { + if (id == libwebm::kMkvBlockGroup || id == libwebm::kMkvSimpleBlock) { + return E_FILE_FORMAT_INVALID; + } + + pos = cluster_stop; + break; + } + } else if ((total >= 0) && (block_stop > total)) { + m_element_size = total - m_element_start; + pos = total; + break; + } else if (block_stop > avail) { + len = static_cast(size); + return E_BUFFER_NOT_FULL; + } + + Cluster* const this_ = const_cast(this); + + if (id == libwebm::kMkvBlockGroup) + return this_->ParseBlockGroup(size, pos, len); + + if (id == libwebm::kMkvSimpleBlock) + return this_->ParseSimpleBlock(size, pos, len); + + pos += size; // consume payload + if (cluster_stop >= 0 && pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + } + + if (m_element_size < 1) + return E_FILE_FORMAT_INVALID; + + m_pos = pos; + if (cluster_stop >= 0 && m_pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + + if (m_entries_count > 0) { + const long idx = m_entries_count - 1; + + const BlockEntry* const pLast = m_entries[idx]; + if (pLast == 
NULL)
+      return E_PARSE_FAILED;
+
+    const Block* const pBlock = pLast->GetBlock();
+    if (pBlock == NULL)
+      return E_PARSE_FAILED;
+
+    const long long start = pBlock->m_start;
+
+    if ((total >= 0) && (start > total))
+      return E_PARSE_FAILED;  // defend against truncated stream
+
+    const long long size = pBlock->m_size;
+
+    const long long stop = start + size;
+    if (cluster_stop >= 0 && stop > cluster_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && (stop > total))
+      return E_PARSE_FAILED;  // defend against truncated stream
+  }
+
+  return 1;  // no more entries
+}
+
+long Cluster::ParseSimpleBlock(long long block_size, long long& pos,
+                               long& len) {
+  const long long block_start = pos;
+  const long long block_stop = pos + block_size;
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long total, avail;
+
+  long status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  // parse track number
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long result = GetUIntLength(pReader, pos, len);
+
+  if (result < 0)  // error
+    return static_cast<long>(result);
+
+  if (result > 0)  // weird
+    return E_BUFFER_NOT_FULL;
+
+  if ((pos + len) > block_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + len) > avail)
+    return E_BUFFER_NOT_FULL;
+
+  const long long track = ReadUInt(pReader, pos, len);
+
+  if (track < 0)  // error
+    return static_cast<long>(track);
+
+  if (track == 0)
+    return E_FILE_FORMAT_INVALID;
+
+  pos += len;  // consume track number
+
+  if ((pos + 2) > block_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + 2) > avail) {
+    len = 2;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  pos += 2;  // consume timecode
+
+  if ((pos + 1) > block_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  if ((pos + 1) > avail) {
+    len = 1;
+    return E_BUFFER_NOT_FULL;
+  }
+
+  unsigned char flags;
+
+  status = pReader->Read(pos, 1, &flags);
+
+  if (status < 0) {  // error or underflow
+    len = 1;
+    return status;
+  }
+
+  ++pos;  // consume flags byte
+  assert(pos <= avail);
+
+  if (pos >= block_stop)
+    return E_FILE_FORMAT_INVALID;
+
+  const int lacing = int(flags & 0x06) >> 1;
+
+  if ((lacing != 0) && (block_stop > avail)) {
+    len = static_cast<long>(block_stop - pos);
+    return E_BUFFER_NOT_FULL;
+  }
+
+  status = CreateBlock(libwebm::kMkvSimpleBlock, block_start, block_size,
+                       0);  // DiscardPadding
+
+  if (status != 0)
+    return status;
+
+  m_pos = block_stop;
+
+  return 0;  // success
+}
+
+long Cluster::ParseBlockGroup(long long payload_size, long long& pos,
+                              long& len) {
+  const long long payload_start = pos;
+  const long long payload_stop = pos + payload_size;
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long total, avail;
+
+  long status = pReader->Length(&total, &avail);
+
+  if (status < 0)  // error
+    return status;
+
+  assert((total < 0) || (avail <= total));
+
+  if ((total >= 0) && (payload_stop > total))
+    return E_FILE_FORMAT_INVALID;
+
+  if (payload_stop > avail) {
+    len = static_cast<long>(payload_size);
+    return E_BUFFER_NOT_FULL;
+  }
+
+  long long discard_padding = 0;
+
+  while (pos < payload_stop) {
+    // parse sub-block element ID
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((pos + len) > payload_stop)
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
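+    // The ID bytes are known to be buffered at this point. Within a
+    // BlockGroup only Block and DiscardPadding are consumed here; other
+    // children are skipped, and ReferenceBlock is examined later by
+    // CreateBlockGroup.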
+ const long long id = ReadID(pReader, pos, len); + + if (id < 0) // error + return static_cast(id); + + if (id == 0) // not a valid ID + return E_FILE_FORMAT_INVALID; + + pos += len; // consume ID field + + // Parse Size + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((pos + len) > payload_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast(size); + + pos += len; // consume size field + + // pos now points to start of sub-block group payload + + if (pos > payload_stop) + return E_FILE_FORMAT_INVALID; + + if (size == 0) // weird + continue; + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvDiscardPadding) { + status = UnserializeInt(pReader, pos, size, discard_padding); + + if (status < 0) // error + return status; + } + + if (id != libwebm::kMkvBlock) { + pos += size; // consume sub-part of block group + + if (pos > payload_stop) + return E_FILE_FORMAT_INVALID; + + continue; + } + + const long long block_stop = pos + size; + + if (block_stop > payload_stop) + return E_FILE_FORMAT_INVALID; + + // parse track number + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) // weird + return E_BUFFER_NOT_FULL; + + if ((pos + len) > block_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long track = ReadUInt(pReader, pos, len); + + if (track < 0) // error + return static_cast(track); + + if (track == 0) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume track number + + if ((pos + 2) > block_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + 2) > avail) { + len = 2; + return E_BUFFER_NOT_FULL; + } + + pos += 2; // consume timecode + + if ((pos + 1) > block_stop) + return E_FILE_FORMAT_INVALID; + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + unsigned char flags; + + status = pReader->Read(pos, 1, &flags); + + if (status < 0) { // error or underflow + len = 1; + return status; + } + + ++pos; // consume flags byte + assert(pos <= avail); + + if (pos >= block_stop) + return E_FILE_FORMAT_INVALID; + + const int lacing = int(flags & 0x06) >> 1; + + if ((lacing != 0) && (block_stop > avail)) { + len = static_cast(block_stop - pos); + return E_BUFFER_NOT_FULL; + } + + pos = block_stop; // consume block-part of block group + if (pos > payload_stop) + return E_FILE_FORMAT_INVALID; + } + + if (pos != payload_stop) + return E_FILE_FORMAT_INVALID; + + status = CreateBlock(libwebm::kMkvBlockGroup, payload_start, payload_size, + discard_padding); + if (status != 0) + return status; + + m_pos = payload_stop; + + return 0; // success +} + +long Cluster::GetEntry(long index, const mkvparser::BlockEntry*& pEntry) const { + assert(m_pos >= m_element_start); + + pEntry = NULL; + + if (index < 0) + return -1; // generic error + + if (m_entries_count < 0) + return E_BUFFER_NOT_FULL; + + assert(m_entries); + assert(m_entries_size > 0); + assert(m_entries_count <= m_entries_size); + + if (index < m_entries_count) { + pEntry = m_entries[index]; + 
assert(pEntry); + + return 1; // found entry + } + + if (m_element_size < 0) // we don't know cluster end yet + return E_BUFFER_NOT_FULL; // underflow + + const long long element_stop = m_element_start + m_element_size; + + if (m_pos >= element_stop) + return 0; // nothing left to parse + + return E_BUFFER_NOT_FULL; // underflow, since more remains to be parsed +} + +Cluster* Cluster::Create(Segment* pSegment, long idx, long long off) { + if (!pSegment || off < 0) + return NULL; + + const long long element_start = pSegment->m_start + off; + + Cluster* const pCluster = + new (std::nothrow) Cluster(pSegment, idx, element_start); + + return pCluster; +} + +Cluster::Cluster() + : m_pSegment(NULL), + m_element_start(0), + m_index(0), + m_pos(0), + m_element_size(0), + m_timecode(0), + m_entries(NULL), + m_entries_size(0), + m_entries_count(0) // means "no entries" +{} + +Cluster::Cluster(Segment* pSegment, long idx, long long element_start + /* long long element_size */) + : m_pSegment(pSegment), + m_element_start(element_start), + m_index(idx), + m_pos(element_start), + m_element_size(-1 /* element_size */), + m_timecode(-1), + m_entries(NULL), + m_entries_size(0), + m_entries_count(-1) // means "has not been parsed yet" +{} + +Cluster::~Cluster() { + if (m_entries_count <= 0) { + delete[] m_entries; + return; + } + + BlockEntry** i = m_entries; + BlockEntry** const j = m_entries + m_entries_count; + + while (i != j) { + BlockEntry* p = *i++; + assert(p); + + delete p; + } + + delete[] m_entries; +} + +bool Cluster::EOS() const { return (m_pSegment == NULL); } + +long Cluster::GetIndex() const { return m_index; } + +long long Cluster::GetPosition() const { + const long long pos = m_element_start - m_pSegment->m_start; + assert(pos >= 0); + + return pos; +} + +long long Cluster::GetElementSize() const { return m_element_size; } + +long Cluster::HasBlockEntries( + const Segment* pSegment, + long long off, // relative to start of segment payload + long long& pos, long& len) { + assert(pSegment); + assert(off >= 0); // relative to segment + + IMkvReader* const pReader = pSegment->m_pReader; + + long long total, avail; + + long status = pReader->Length(&total, &avail); + + if (status < 0) // error + return status; + + assert((total < 0) || (avail <= total)); + + pos = pSegment->m_start + off; // absolute + + if ((total >= 0) && (pos >= total)) + return 0; // we don't even have a complete cluster + + const long long segment_stop = + (pSegment->m_size < 0) ? 
-1 : pSegment->m_start + pSegment->m_size;
+
+  long long cluster_stop = -1;  // interpreted later to mean "unknown size"
+
+  {
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // need more data
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && ((pos + len) > total))
+      return 0;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadID(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    if (id != libwebm::kMkvCluster)
+      return E_PARSE_FAILED;
+
+    pos += len;  // consume Cluster ID field
+
+    // read size field
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // weird
+      return E_BUFFER_NOT_FULL;
+
+    if ((segment_stop >= 0) && ((pos + len) > segment_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((total >= 0) && ((pos + len) > total))
+      return 0;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long size = ReadUInt(pReader, pos, len);
+
+    if (size < 0)  // error
+      return static_cast<long>(size);
+
+    if (size == 0)
+      return 0;  // cluster does not have entries
+
+    pos += len;  // consume size field
+
+    // pos now points to start of payload
+
+    const long long unknown_size = (1LL << (7 * len)) - 1;
+
+    if (size != unknown_size) {
+      cluster_stop = pos + size;
+      assert(cluster_stop >= 0);
+
+      if ((segment_stop >= 0) && (cluster_stop > segment_stop))
+        return E_FILE_FORMAT_INVALID;
+
+      if ((total >= 0) && (cluster_stop > total))
+        // return E_FILE_FORMAT_INVALID;  //too conservative
+        return 0;  // cluster does not have any entries
+    }
+  }
+
+  for (;;) {
+    if ((cluster_stop >= 0) && (pos >= cluster_stop))
+      return 0;  // no entries detected
+
+    if ((pos + 1) > avail) {
+      len = 1;
+      return E_BUFFER_NOT_FULL;
+    }
+
+    long long result = GetUIntLength(pReader, pos, len);
+
+    if (result < 0)  // error
+      return static_cast<long>(result);
+
+    if (result > 0)  // need more data
+      return E_BUFFER_NOT_FULL;
+
+    if ((cluster_stop >= 0) && ((pos + len) > cluster_stop))
+      return E_FILE_FORMAT_INVALID;
+
+    if ((pos + len) > avail)
+      return E_BUFFER_NOT_FULL;
+
+    const long long id = ReadID(pReader, pos, len);
+
+    if (id < 0)  // error
+      return static_cast<long>(id);
+
+    // This is the distinguished set of ID's we use to determine
+    // that we have exhausted the sub-elements inside the cluster
+    // whose ID we parsed earlier.
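+    // (When the cluster has unknown size, a following Cluster or Cues
+    // element is what terminates it, so reaching one here means this
+    // cluster held no block entries at all.)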
+ + if (id == libwebm::kMkvCluster) + return 0; // no entries found + + if (id == libwebm::kMkvCues) + return 0; // no entries found + + pos += len; // consume id field + + if ((cluster_stop >= 0) && (pos >= cluster_stop)) + return E_FILE_FORMAT_INVALID; + + // read size field + + if ((pos + 1) > avail) { + len = 1; + return E_BUFFER_NOT_FULL; + } + + result = GetUIntLength(pReader, pos, len); + + if (result < 0) // error + return static_cast(result); + + if (result > 0) // underflow + return E_BUFFER_NOT_FULL; + + if ((cluster_stop >= 0) && ((pos + len) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > avail) + return E_BUFFER_NOT_FULL; + + const long long size = ReadUInt(pReader, pos, len); + + if (size < 0) // error + return static_cast(size); + + pos += len; // consume size field + + // pos now points to start of payload + + if ((cluster_stop >= 0) && (pos > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if (size == 0) // weird + continue; + + const long long unknown_size = (1LL << (7 * len)) - 1; + + if (size == unknown_size) + return E_FILE_FORMAT_INVALID; // not supported inside cluster + + if ((cluster_stop >= 0) && ((pos + size) > cluster_stop)) + return E_FILE_FORMAT_INVALID; + + if (id == libwebm::kMkvBlockGroup) + return 1; // have at least one entry + + if (id == libwebm::kMkvSimpleBlock) + return 1; // have at least one entry + + pos += size; // consume payload + if (cluster_stop >= 0 && pos > cluster_stop) + return E_FILE_FORMAT_INVALID; + } +} + +long long Cluster::GetTimeCode() const { + long long pos; + long len; + + const long status = Load(pos, len); + + if (status < 0) // error + return status; + + return m_timecode; +} + +long long Cluster::GetTime() const { + const long long tc = GetTimeCode(); + + if (tc < 0) + return tc; + + const SegmentInfo* const pInfo = m_pSegment->GetInfo(); + assert(pInfo); + + const long long scale = pInfo->GetTimeCodeScale(); + assert(scale >= 1); + + const long long t = m_timecode * scale; + + return t; +} + +long long Cluster::GetFirstTime() const { + const BlockEntry* pEntry; + + const long status = GetFirst(pEntry); + + if (status < 0) // error + return status; + + if (pEntry == NULL) // empty cluster + return GetTime(); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + return pBlock->GetTime(this); +} + +long long Cluster::GetLastTime() const { + const BlockEntry* pEntry; + + const long status = GetLast(pEntry); + + if (status < 0) // error + return status; + + if (pEntry == NULL) // empty cluster + return GetTime(); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + return pBlock->GetTime(this); +} + +long Cluster::CreateBlock(long long id, + long long pos, // absolute pos of payload + long long size, long long discard_padding) { + if (id != libwebm::kMkvBlockGroup && id != libwebm::kMkvSimpleBlock) + return E_PARSE_FAILED; + + if (m_entries_count < 0) { // haven't parsed anything yet + assert(m_entries == NULL); + assert(m_entries_size == 0); + + m_entries_size = 1024; + m_entries = new (std::nothrow) BlockEntry*[m_entries_size]; + if (m_entries == NULL) + return -1; + + m_entries_count = 0; + } else { + assert(m_entries); + assert(m_entries_size > 0); + assert(m_entries_count <= m_entries_size); + + if (m_entries_count >= m_entries_size) { + const long entries_size = 2 * m_entries_size; + + BlockEntry** const entries = new (std::nothrow) BlockEntry*[entries_size]; + if (entries == NULL) + return -1; + + BlockEntry** src = m_entries; + BlockEntry** const src_end 
= src + m_entries_count;
+
+      BlockEntry** dst = entries;
+
+      while (src != src_end)
+        *dst++ = *src++;
+
+      delete[] m_entries;
+
+      m_entries = entries;
+      m_entries_size = entries_size;
+    }
+  }
+
+  if (id == libwebm::kMkvBlockGroup)
+    return CreateBlockGroup(pos, size, discard_padding);
+  else
+    return CreateSimpleBlock(pos, size);
+}
+
+long Cluster::CreateBlockGroup(long long start_offset, long long size,
+                               long long discard_padding) {
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count >= 0);
+  assert(m_entries_count < m_entries_size);
+
+  IMkvReader* const pReader = m_pSegment->m_pReader;
+
+  long long pos = start_offset;
+  const long long stop = start_offset + size;
+
+  // For WebM files, there is a bias towards previous reference times
+  // (in order to support alt-ref frames, which refer back to the previous
+  // keyframe).  Normally a 0 value is not possible, but here we tentatively
+  // allow 0 as the value of a reference frame, with the interpretation
+  // that this is a "previous" reference time.
+
+  long long prev = 1;  // nonce
+  long long next = 0;  // nonce
+  long long duration = -1;  // really, this is unsigned
+
+  long long bpos = -1;
+  long long bsize = -1;
+
+  while (pos < stop) {
+    long len;
+    const long long id = ReadID(pReader, pos, len);
+    if (id < 0 || (pos + len) > stop)
+      return E_FILE_FORMAT_INVALID;
+
+    pos += len;  // consume ID
+
+    const long long size = ReadUInt(pReader, pos, len);
+    assert(size >= 0);  // TODO
+    assert((pos + len) <= stop);
+
+    pos += len;  // consume size
+
+    if (id == libwebm::kMkvBlock) {
+      if (bpos < 0) {  // Block ID
+        bpos = pos;
+        bsize = size;
+      }
+    } else if (id == libwebm::kMkvBlockDuration) {
+      if (size > 8)
+        return E_FILE_FORMAT_INVALID;
+
+      duration = UnserializeUInt(pReader, pos, size);
+
+      if (duration < 0)
+        return E_FILE_FORMAT_INVALID;
+    } else if (id == libwebm::kMkvReferenceBlock) {
+      if (size > 8 || size <= 0)
+        return E_FILE_FORMAT_INVALID;
+      const long size_ = static_cast<long>(size);
+
+      long long time;
+
+      long status = UnserializeInt(pReader, pos, size_, time);
+      assert(status == 0);
+      if (status != 0)
+        return -1;
+
+      if (time <= 0)  // see note above
+        prev = time;
+      else
+        next = time;
+    }
+
+    pos += size;  // consume payload
+    if (pos > stop)
+      return E_FILE_FORMAT_INVALID;
+  }
+  if (bpos < 0)
+    return E_FILE_FORMAT_INVALID;
+
+  if (pos != stop)
+    return E_FILE_FORMAT_INVALID;
+  assert(bsize >= 0);
+
+  const long idx = m_entries_count;
+
+  BlockEntry** const ppEntry = m_entries + idx;
+  BlockEntry*& pEntry = *ppEntry;
+
+  pEntry = new (std::nothrow)
+      BlockGroup(this, idx, bpos, bsize, prev, next, duration, discard_padding);
+
+  if (pEntry == NULL)
+    return -1;  // generic error
+
+  BlockGroup* const p = static_cast<BlockGroup*>(pEntry);
+
+  const long status = p->Parse();
+
+  if (status == 0) {  // success
+    ++m_entries_count;
+    return 0;
+  }
+
+  delete pEntry;
+  pEntry = 0;
+
+  return status;
+}
+
+long Cluster::CreateSimpleBlock(long long st, long long sz) {
+  assert(m_entries);
+  assert(m_entries_size > 0);
+  assert(m_entries_count >= 0);
+  assert(m_entries_count < m_entries_size);
+
+  const long idx = m_entries_count;
+
+  BlockEntry** const ppEntry = m_entries + idx;
+  BlockEntry*& pEntry = *ppEntry;
+
+  pEntry = new (std::nothrow) SimpleBlock(this, idx, st, sz);
+
+  if (pEntry == NULL)
+    return -1;  // generic error
+
+  SimpleBlock* const p = static_cast<SimpleBlock*>(pEntry);
+
+  const long status = p->Parse();
+
+  if (status == 0) {
+    ++m_entries_count;
+    return 0;
+  }
+
+  delete pEntry;
+  pEntry = 0;
+
+  return status;
+}
+
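+// The entries array managed by CreateBlock above grows by amortized
+// doubling: capacity starts at 1024 and doubles each time it fills, so N
+// appends cost O(N) pointer copies overall. A minimal sketch of the same
+// pattern (editorial illustration only; grow_entries is not part of
+// libwebm):
+//
+//   BlockEntry** grow_entries(BlockEntry** old, long count, long& capacity) {
+//     capacity *= 2;
+//     BlockEntry** const entries = new (std::nothrow) BlockEntry*[capacity];
+//     if (entries == NULL)
+//       return NULL;
+//     for (long k = 0; k < count; ++k)  // copy the old pointers across
+//       entries[k] = old[k];
+//     delete[] old;
+//     return entries;
+//   }
+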
+long Cluster::GetFirst(const BlockEntry*& pFirst) const { + if (m_entries_count <= 0) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) { // error + pFirst = NULL; + return status; + } + + if (m_entries_count <= 0) { // empty cluster + pFirst = NULL; + return 0; + } + } + + assert(m_entries); + + pFirst = m_entries[0]; + assert(pFirst); + + return 0; // success +} + +long Cluster::GetLast(const BlockEntry*& pLast) const { + for (;;) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) { // error + pLast = NULL; + return status; + } + + if (status > 0) // no new block + break; + } + + if (m_entries_count <= 0) { + pLast = NULL; + return 0; + } + + assert(m_entries); + + const long idx = m_entries_count - 1; + + pLast = m_entries[idx]; + assert(pLast); + + return 0; +} + +long Cluster::GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const { + assert(pCurr); + assert(m_entries); + assert(m_entries_count > 0); + + size_t idx = pCurr->GetIndex(); + assert(idx < size_t(m_entries_count)); + assert(m_entries[idx] == pCurr); + + ++idx; + + if (idx >= size_t(m_entries_count)) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) { // error + pNext = NULL; + return status; + } + + if (status > 0) { + pNext = NULL; + return 0; + } + + assert(m_entries); + assert(m_entries_count > 0); + assert(idx < size_t(m_entries_count)); + } + + pNext = m_entries[idx]; + assert(pNext); + + return 0; +} + +long Cluster::GetEntryCount() const { return m_entries_count; } + +const BlockEntry* Cluster::GetEntry(const Track* pTrack, + long long time_ns) const { + assert(pTrack); + + if (m_pSegment == NULL) // this is the special EOS cluster + return pTrack->GetEOS(); + + const BlockEntry* pResult = pTrack->GetEOS(); + + long index = 0; + + for (;;) { + if (index >= m_entries_count) { + long long pos; + long len; + + const long status = Parse(pos, len); + assert(status >= 0); + + if (status > 0) // completely parsed, and no more entries + return pResult; + + if (status < 0) // should never happen + return 0; + + assert(m_entries); + assert(index < m_entries_count); + } + + const BlockEntry* const pEntry = m_entries[index]; + assert(pEntry); + assert(!pEntry->EOS()); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + if (pBlock->GetTrackNumber() != pTrack->GetNumber()) { + ++index; + continue; + } + + if (pTrack->VetEntry(pEntry)) { + if (time_ns < 0) // just want first candidate block + return pEntry; + + const long long ns = pBlock->GetTime(this); + + if (ns > time_ns) + return pResult; + + pResult = pEntry; // have a candidate + } else if (time_ns >= 0) { + const long long ns = pBlock->GetTime(this); + + if (ns > time_ns) + return pResult; + } + + ++index; + } +} + +const BlockEntry* Cluster::GetEntry(const CuePoint& cp, + const CuePoint::TrackPosition& tp) const { + assert(m_pSegment); + const long long tc = cp.GetTimeCode(); + + if (tp.m_block > 0) { + const long block = static_cast(tp.m_block); + const long index = block - 1; + + while (index >= m_entries_count) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) // TODO: can this happen? 
+ return NULL; + + if (status > 0) // nothing remains to be parsed + return NULL; + } + + const BlockEntry* const pEntry = m_entries[index]; + assert(pEntry); + assert(!pEntry->EOS()); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + if ((pBlock->GetTrackNumber() == tp.m_track) && + (pBlock->GetTimeCode(this) == tc)) { + return pEntry; + } + } + + long index = 0; + + for (;;) { + if (index >= m_entries_count) { + long long pos; + long len; + + const long status = Parse(pos, len); + + if (status < 0) // TODO: can this happen? + return NULL; + + if (status > 0) // nothing remains to be parsed + return NULL; + + assert(m_entries); + assert(index < m_entries_count); + } + + const BlockEntry* const pEntry = m_entries[index]; + assert(pEntry); + assert(!pEntry->EOS()); + + const Block* const pBlock = pEntry->GetBlock(); + assert(pBlock); + + if (pBlock->GetTrackNumber() != tp.m_track) { + ++index; + continue; + } + + const long long tc_ = pBlock->GetTimeCode(this); + + if (tc_ < tc) { + ++index; + continue; + } + + if (tc_ > tc) + return NULL; + + const Tracks* const pTracks = m_pSegment->GetTracks(); + assert(pTracks); + + const long tn = static_cast<long>(tp.m_track); + const Track* const pTrack = pTracks->GetTrackByNumber(tn); + + if (pTrack == NULL) + return NULL; + + const long long type = pTrack->GetType(); + + if (type == 2) // audio + return pEntry; + + if (type != 1) // not video + return NULL; + + if (!pBlock->IsKey()) + return NULL; + + return pEntry; + } +} + +BlockEntry::BlockEntry(Cluster* p, long idx) : m_pCluster(p), m_index(idx) {} +BlockEntry::~BlockEntry() {} +const Cluster* BlockEntry::GetCluster() const { return m_pCluster; } +long BlockEntry::GetIndex() const { return m_index; } + +SimpleBlock::SimpleBlock(Cluster* pCluster, long idx, long long start, + long long size) + : BlockEntry(pCluster, idx), m_block(start, size, 0) {} + +long SimpleBlock::Parse() { return m_block.Parse(m_pCluster); } +BlockEntry::Kind SimpleBlock::GetKind() const { return kBlockSimple; } +const Block* SimpleBlock::GetBlock() const { return &m_block; } + +BlockGroup::BlockGroup(Cluster* pCluster, long idx, long long block_start, + long long block_size, long long prev, long long next, + long long duration, long long discard_padding) + : BlockEntry(pCluster, idx), + m_block(block_start, block_size, discard_padding), + m_prev(prev), + m_next(next), + m_duration(duration) {} + +long BlockGroup::Parse() { + const long status = m_block.Parse(m_pCluster); + + if (status) + return status; + + m_block.SetKey((m_prev > 0) && (m_next <= 0)); + + return 0; +} + +BlockEntry::Kind BlockGroup::GetKind() const { return kBlockGroup; } +const Block* BlockGroup::GetBlock() const { return &m_block; } +long long BlockGroup::GetPrevTimeCode() const { return m_prev; } +long long BlockGroup::GetNextTimeCode() const { return m_next; } +long long BlockGroup::GetDurationTimeCode() const { return m_duration; } + +Block::Block(long long start, long long size_, long long discard_padding) + : m_start(start), + m_size(size_), + m_track(0), + m_timecode(-1), + m_flags(0), + m_frames(NULL), + m_frame_count(-1), + m_discard_padding(discard_padding) {} + +Block::~Block() { delete[] m_frames; } + +long Block::Parse(const Cluster* pCluster) { + if (pCluster == NULL) + return -1; + + if (pCluster->m_pSegment == NULL) + return -1; + + assert(m_start >= 0); + assert(m_size >= 0); + assert(m_track <= 0); + assert(m_frames == NULL); + assert(m_frame_count <= 0); + + long long pos = m_start; + const long long stop = m_start 
+ m_size; + + long len; + + IMkvReader* const pReader = pCluster->m_pSegment->m_pReader; + + m_track = ReadUInt(pReader, pos, len); + + if (m_track <= 0) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > stop) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume track number + + if ((stop - pos) < 2) + return E_FILE_FORMAT_INVALID; + + long status; + long long value; + + status = UnserializeInt(pReader, pos, 2, value); + + if (status) + return E_FILE_FORMAT_INVALID; + + if (value < SHRT_MIN) + return E_FILE_FORMAT_INVALID; + + if (value > SHRT_MAX) + return E_FILE_FORMAT_INVALID; + + m_timecode = static_cast<short>(value); + + pos += 2; + + if ((stop - pos) <= 0) + return E_FILE_FORMAT_INVALID; + + status = pReader->Read(pos, 1, &m_flags); + + if (status) + return E_FILE_FORMAT_INVALID; + + const int lacing = int(m_flags & 0x06) >> 1; + + ++pos; // consume flags byte + + if (lacing == 0) { // no lacing + if (pos > stop) + return E_FILE_FORMAT_INVALID; + + m_frame_count = 1; + m_frames = new (std::nothrow) Frame[m_frame_count]; + if (m_frames == NULL) + return -1; + + Frame& f = m_frames[0]; + f.pos = pos; + + const long long frame_size = stop - pos; + + if (frame_size > LONG_MAX || frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + f.len = static_cast<long>(frame_size); + + return 0; // success + } + + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + unsigned char biased_count; + + status = pReader->Read(pos, 1, &biased_count); + + if (status) + return E_FILE_FORMAT_INVALID; + + ++pos; // consume frame count + if (pos > stop) + return E_FILE_FORMAT_INVALID; + + m_frame_count = int(biased_count) + 1; + + m_frames = new (std::nothrow) Frame[m_frame_count]; + if (m_frames == NULL) + return -1; + + if (!m_frames) + return E_FILE_FORMAT_INVALID; + + if (lacing == 1) { // Xiph + Frame* pf = m_frames; + Frame* const pf_end = pf + m_frame_count; + + long long size = 0; + int frame_count = m_frame_count; + + while (frame_count > 1) { + long frame_size = 0; + + for (;;) { + unsigned char val; + + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + status = pReader->Read(pos, 1, &val); + + if (status) + return E_FILE_FORMAT_INVALID; + + ++pos; // consume xiph size byte + + frame_size += val; + + if (val < 255) + break; + } + + Frame& f = *pf++; + assert(pf < pf_end); + if (pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + f.pos = 0; // patch later + + if (frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + f.len = frame_size; + size += frame_size; // contribution of this frame + + --frame_count; + } + + if (pf >= pf_end || pos > stop) + return E_FILE_FORMAT_INVALID; + + { + Frame& f = *pf++; + + if (pf != pf_end) + return E_FILE_FORMAT_INVALID; + + f.pos = 0; // patch later + + const long long total_size = stop - pos; + + if (total_size < size) + return E_FILE_FORMAT_INVALID; + + const long long frame_size = total_size - size; + + if (frame_size > LONG_MAX || frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + f.len = static_cast<long>(frame_size); + } + + pf = m_frames; + while (pf != pf_end) { + Frame& f = *pf++; + assert((pos + f.len) <= stop); + + if ((pos + f.len) > stop) + return E_FILE_FORMAT_INVALID; + + f.pos = pos; + pos += f.len; + } + + assert(pos == stop); + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + } else if (lacing == 2) { // fixed-size lacing + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + const long long total_size = stop - pos; + + if ((total_size % m_frame_count) != 0) + return E_FILE_FORMAT_INVALID; + + const long long frame_size = total_size / m_frame_count; + + 
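+ // Editor's note (added comment): with fixed-size lacing the remaining + // payload must divide evenly among the frames; e.g. a 300-byte payload + // carrying 3 frames yields exactly 100 bytes per frame, and the modulus + // check above rejects anything else. + 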
if (frame_size > LONG_MAX || frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + Frame* pf = m_frames; + Frame* const pf_end = pf + m_frame_count; + + while (pf != pf_end) { + assert((pos + frame_size) <= stop); + if ((pos + frame_size) > stop) + return E_FILE_FORMAT_INVALID; + + Frame& f = *pf++; + + f.pos = pos; + f.len = static_cast<long>(frame_size); + + pos += frame_size; + } + + assert(pos == stop); + if (pos != stop) + return E_FILE_FORMAT_INVALID; + + } else { + assert(lacing == 3); // EBML lacing + + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + long long size = 0; + int frame_count = m_frame_count; + + long long frame_size = ReadUInt(pReader, pos, len); + + if (frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + if (frame_size > LONG_MAX) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > stop) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume length of size of first frame + + if ((pos + frame_size) > stop) + return E_FILE_FORMAT_INVALID; + + Frame* pf = m_frames; + Frame* const pf_end = pf + m_frame_count; + + { + Frame& curr = *pf; + + curr.pos = 0; // patch later + + curr.len = static_cast<long>(frame_size); + size += curr.len; // contribution of this frame + } + + --frame_count; + + while (frame_count > 1) { + if (pos >= stop) + return E_FILE_FORMAT_INVALID; + + assert(pf < pf_end); + if (pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + const Frame& prev = *pf++; + assert(prev.len == frame_size); + if (prev.len != frame_size) + return E_FILE_FORMAT_INVALID; + + assert(pf < pf_end); + if (pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + Frame& curr = *pf; + + curr.pos = 0; // patch later + + const long long delta_size_ = ReadUInt(pReader, pos, len); + + if (delta_size_ < 0) + return E_FILE_FORMAT_INVALID; + + if ((pos + len) > stop) + return E_FILE_FORMAT_INVALID; + + pos += len; // consume length of (delta) size + if (pos > stop) + return E_FILE_FORMAT_INVALID; + + const long exp = 7 * len - 1; + const long long bias = (1LL << exp) - 1LL; + const long long delta_size = delta_size_ - bias; + + frame_size += delta_size; + + if (frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + if (frame_size > LONG_MAX) + return E_FILE_FORMAT_INVALID; + + curr.len = static_cast<long>(frame_size); + // Check if size + curr.len could overflow. 
+ if (size > LLONG_MAX - curr.len) { + return E_FILE_FORMAT_INVALID; + } + size += curr.len; // contribution of this frame + + --frame_count; + } + + // parse last frame + if (frame_count > 0) { + if (pos > stop || pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + const Frame& prev = *pf++; + assert(prev.len == frame_size); + if (prev.len != frame_size) + return E_FILE_FORMAT_INVALID; + + if (pf >= pf_end) + return E_FILE_FORMAT_INVALID; + + Frame& curr = *pf++; + if (pf != pf_end) + return E_FILE_FORMAT_INVALID; + + curr.pos = 0; // patch later + + const long long total_size = stop - pos; + + if (total_size < size) + return E_FILE_FORMAT_INVALID; + + frame_size = total_size - size; + + if (frame_size > LONG_MAX || frame_size <= 0) + return E_FILE_FORMAT_INVALID; + + curr.len = static_cast<long>(frame_size); + } + + pf = m_frames; + while (pf != pf_end) { + Frame& f = *pf++; + if ((pos + f.len) > stop) + return E_FILE_FORMAT_INVALID; + + f.pos = pos; + pos += f.len; + } + + if (pos != stop) + return E_FILE_FORMAT_INVALID; + } + + return 0; // success +} + +long long Block::GetTimeCode(const Cluster* pCluster) const { + if (pCluster == 0) + return m_timecode; + + const long long tc0 = pCluster->GetTimeCode(); + assert(tc0 >= 0); + + // Check if tc0 + m_timecode would overflow. + if (tc0 < 0 || LLONG_MAX - tc0 < m_timecode) { + return -1; + } + + const long long tc = tc0 + m_timecode; + + return tc; // unscaled timecode units +} + +long long Block::GetTime(const Cluster* pCluster) const { + assert(pCluster); + + const long long tc = GetTimeCode(pCluster); + + const Segment* const pSegment = pCluster->m_pSegment; + const SegmentInfo* const pInfo = pSegment->GetInfo(); + assert(pInfo); + + const long long scale = pInfo->GetTimeCodeScale(); + assert(scale >= 1); + + // Check if tc * scale could overflow. + if (tc != 0 && scale > LLONG_MAX / tc) { + return -1; + } + const long long ns = tc * scale; + + return ns; +} + +long long Block::GetTrackNumber() const { return m_track; } + +bool Block::IsKey() const { + return ((m_flags & static_cast<unsigned char>(1 << 7)) != 0); +} + +void Block::SetKey(bool bKey) { + if (bKey) + m_flags |= static_cast<unsigned char>(1 << 7); + else + m_flags &= 0x7F; +} + +bool Block::IsInvisible() const { return bool(int(m_flags & 0x08) != 0); } + +Block::Lacing Block::GetLacing() const { + const int value = int(m_flags & 0x06) >> 1; + return static_cast<Lacing>(value); +} + +int Block::GetFrameCount() const { return m_frame_count; } + +const Block::Frame& Block::GetFrame(int idx) const { + assert(idx >= 0); + assert(idx < m_frame_count); + + const Frame& f = m_frames[idx]; + assert(f.pos > 0); + assert(f.len > 0); + + return f; +} + +long Block::Frame::Read(IMkvReader* pReader, unsigned char* buf) const { + assert(pReader); + assert(buf); + + const long status = pReader->Read(pos, len, buf); + return status; +} + +long long Block::GetDiscardPadding() const { return m_discard_padding; } + +} // namespace mkvparser diff --git a/libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.h b/libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.h new file mode 100644 index 000000000..848d01f03 --- --- /dev/null +++ b/libs/libaom/src/third_party/libwebm/mkvparser/mkvparser.h @@ -0,0 +1,1147 @@ +// Copyright (c) 2012 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. 
All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +#ifndef MKVPARSER_MKVPARSER_H_ +#define MKVPARSER_MKVPARSER_H_ + +#include <cstddef> + +namespace mkvparser { + +const int E_PARSE_FAILED = -1; +const int E_FILE_FORMAT_INVALID = -2; +const int E_BUFFER_NOT_FULL = -3; + +class IMkvReader { + public: + virtual int Read(long long pos, long len, unsigned char* buf) = 0; + virtual int Length(long long* total, long long* available) = 0; + + protected: + virtual ~IMkvReader() {} +}; + +template <typename Type> +Type* SafeArrayAlloc(unsigned long long num_elements, + unsigned long long element_size); +long long GetUIntLength(IMkvReader*, long long, long&); +long long ReadUInt(IMkvReader*, long long, long&); +long long ReadID(IMkvReader* pReader, long long pos, long& len); +long long UnserializeUInt(IMkvReader*, long long pos, long long size); + +long UnserializeFloat(IMkvReader*, long long pos, long long size, double&); +long UnserializeInt(IMkvReader*, long long pos, long long size, + long long& result); + +long UnserializeString(IMkvReader*, long long pos, long long size, char*& str); + +long ParseElementHeader(IMkvReader* pReader, + long long& pos, // consume id and size fields + long long stop, // if you know size of element's parent + long long& id, long long& size); + +bool Match(IMkvReader*, long long&, unsigned long, long long&); +bool Match(IMkvReader*, long long&, unsigned long, unsigned char*&, size_t&); + +void GetVersion(int& major, int& minor, int& build, int& revision); + +struct EBMLHeader { + EBMLHeader(); + ~EBMLHeader(); + long long m_version; + long long m_readVersion; + long long m_maxIdLength; + long long m_maxSizeLength; + char* m_docType; + long long m_docTypeVersion; + long long m_docTypeReadVersion; + + long long Parse(IMkvReader*, long long&); + void Init(); +}; + +class Segment; +class Track; +class Cluster; + +class Block { + Block(const Block&); + Block& operator=(const Block&); + + public: + const long long m_start; + const long long m_size; + + Block(long long start, long long size, long long discard_padding); + ~Block(); + + long Parse(const Cluster*); + + long long GetTrackNumber() const; + long long GetTimeCode(const Cluster*) const; // absolute, but not scaled + long long GetTime(const Cluster*) const; // absolute, and scaled (ns) + bool IsKey() const; + void SetKey(bool); + bool IsInvisible() const; + + enum Lacing { kLacingNone, kLacingXiph, kLacingFixed, kLacingEbml }; + Lacing GetLacing() const; + + int GetFrameCount() const; // to index frames: [0, count) + + struct Frame { + long long pos; // absolute offset + long len; + + long Read(IMkvReader*, unsigned char*) const; + }; + + const Frame& GetFrame(int frame_index) const; + + long long GetDiscardPadding() const; + + private: + long long m_track; // Track::Number() + short m_timecode; // relative to cluster + unsigned char m_flags; + + Frame* m_frames; + int m_frame_count; + + protected: + const long long m_discard_padding; +}; + +class BlockEntry { + BlockEntry(const BlockEntry&); + BlockEntry& operator=(const BlockEntry&); + + protected: + BlockEntry(Cluster*, long index); + + public: + virtual ~BlockEntry(); + + bool EOS() const { return (GetKind() == kBlockEOS); } + const Cluster* GetCluster() const; + long GetIndex() const; + virtual const Block* GetBlock() const = 0; + + enum Kind { kBlockEOS, kBlockSimple, kBlockGroup }; + virtual Kind GetKind() const = 0; + + protected: + Cluster* const m_pCluster; + const long m_index; +}; + +class SimpleBlock : public BlockEntry { + 
SimpleBlock(const SimpleBlock&); + SimpleBlock& operator=(const SimpleBlock&); + + public: + SimpleBlock(Cluster*, long index, long long start, long long size); + long Parse(); + + Kind GetKind() const; + const Block* GetBlock() const; + + protected: + Block m_block; +}; + +class BlockGroup : public BlockEntry { + BlockGroup(const BlockGroup&); + BlockGroup& operator=(const BlockGroup&); + + public: + BlockGroup(Cluster*, long index, + long long block_start, // absolute pos of block's payload + long long block_size, // size of block's payload + long long prev, long long next, long long duration, + long long discard_padding); + + long Parse(); + + Kind GetKind() const; + const Block* GetBlock() const; + + long long GetPrevTimeCode() const; // relative to block's time + long long GetNextTimeCode() const; // as above + long long GetDurationTimeCode() const; + + private: + Block m_block; + const long long m_prev; + const long long m_next; + const long long m_duration; +}; + +/////////////////////////////////////////////////////////////// +// ContentEncoding element +// Elements used to describe if the track data has been encrypted or +// compressed with zlib or header stripping. +class ContentEncoding { + public: + enum { kCTR = 1 }; + + ContentEncoding(); + ~ContentEncoding(); + + // ContentCompression element names + struct ContentCompression { + ContentCompression(); + ~ContentCompression(); + + unsigned long long algo; + unsigned char* settings; + long long settings_len; + }; + + // ContentEncAESSettings element names + struct ContentEncAESSettings { + ContentEncAESSettings() : cipher_mode(kCTR) {} + ~ContentEncAESSettings() {} + + unsigned long long cipher_mode; + }; + + // ContentEncryption element names + struct ContentEncryption { + ContentEncryption(); + ~ContentEncryption(); + + unsigned long long algo; + unsigned char* key_id; + long long key_id_len; + unsigned char* signature; + long long signature_len; + unsigned char* sig_key_id; + long long sig_key_id_len; + unsigned long long sig_algo; + unsigned long long sig_hash_algo; + + ContentEncAESSettings aes_settings; + }; + + // Returns ContentCompression represented by |idx|. Returns NULL if |idx| + // is out of bounds. + const ContentCompression* GetCompressionByIndex(unsigned long idx) const; + + // Returns number of ContentCompression elements in this ContentEncoding + // element. + unsigned long GetCompressionCount() const; + + // Parses the ContentCompression element from |pReader|. |start| is the + // starting offset of the ContentCompression payload. |size| is the size in + // bytes of the ContentCompression payload. |compression| is where the parsed + // values will be stored. + long ParseCompressionEntry(long long start, long long size, + IMkvReader* pReader, + ContentCompression* compression); + + // Returns ContentEncryption represented by |idx|. Returns NULL if |idx| + // is out of bounds. + const ContentEncryption* GetEncryptionByIndex(unsigned long idx) const; + + // Returns number of ContentEncryption elements in this ContentEncoding + // element. + unsigned long GetEncryptionCount() const; + + // Parses the ContentEncAESSettings element from |pReader|. |start| is the + // starting offset of the ContentEncAESSettings payload. |size| is the + // size in bytes of the ContentEncAESSettings payload. |encryption| is + // where the parsed values will be stored. 
+ long ParseContentEncAESSettingsEntry(long long start, long long size, + IMkvReader* pReader, + ContentEncAESSettings* aes); + + // Parses the ContentEncoding element from |pReader|. |start| is the + // starting offset of the ContentEncoding payload. |size| is the size in + // bytes of the ContentEncoding payload. Returns true on success. + long ParseContentEncodingEntry(long long start, long long size, + IMkvReader* pReader); + + // Parses the ContentEncryption element from |pReader|. |start| is the + // starting offset of the ContentEncryption payload. |size| is the size in + // bytes of the ContentEncryption payload. |encryption| is where the parsed + // values will be stored. + long ParseEncryptionEntry(long long start, long long size, + IMkvReader* pReader, ContentEncryption* encryption); + + unsigned long long encoding_order() const { return encoding_order_; } + unsigned long long encoding_scope() const { return encoding_scope_; } + unsigned long long encoding_type() const { return encoding_type_; } + + private: + // Member variables for list of ContentCompression elements. + ContentCompression** compression_entries_; + ContentCompression** compression_entries_end_; + + // Member variables for list of ContentEncryption elements. + ContentEncryption** encryption_entries_; + ContentEncryption** encryption_entries_end_; + + // ContentEncoding element names + unsigned long long encoding_order_; + unsigned long long encoding_scope_; + unsigned long long encoding_type_; + + // LIBWEBM_DISALLOW_COPY_AND_ASSIGN(ContentEncoding); + ContentEncoding(const ContentEncoding&); + ContentEncoding& operator=(const ContentEncoding&); +}; + +class Track { + Track(const Track&); + Track& operator=(const Track&); + + public: + class Info; + static long Create(Segment*, const Info&, long long element_start, + long long element_size, Track*&); + + enum Type { kVideo = 1, kAudio = 2, kSubtitle = 0x11, kMetadata = 0x21 }; + + Segment* const m_pSegment; + const long long m_element_start; + const long long m_element_size; + virtual ~Track(); + + long GetType() const; + long GetNumber() const; + unsigned long long GetUid() const; + const char* GetNameAsUTF8() const; + const char* GetLanguage() const; + const char* GetCodecNameAsUTF8() const; + const char* GetCodecId() const; + const unsigned char* GetCodecPrivate(size_t&) const; + bool GetLacing() const; + unsigned long long GetDefaultDuration() const; + unsigned long long GetCodecDelay() const; + unsigned long long GetSeekPreRoll() const; + + const BlockEntry* GetEOS() const; + + struct Settings { + long long start; + long long size; + }; + + class Info { + public: + Info(); + ~Info(); + int Copy(Info&) const; + void Clear(); + long type; + long number; + unsigned long long uid; + unsigned long long defaultDuration; + unsigned long long codecDelay; + unsigned long long seekPreRoll; + char* nameAsUTF8; + char* language; + char* codecId; + char* codecNameAsUTF8; + unsigned char* codecPrivate; + size_t codecPrivateSize; + bool lacing; + Settings settings; + + private: + Info(const Info&); + Info& operator=(const Info&); + int CopyStr(char* Info::*str, Info&) const; + }; + + long GetFirst(const BlockEntry*&) const; + long GetNext(const BlockEntry* pCurr, const BlockEntry*& pNext) const; + virtual bool VetEntry(const BlockEntry*) const; + virtual long Seek(long long time_ns, const BlockEntry*&) const; + + const ContentEncoding* GetContentEncodingByIndex(unsigned long idx) const; + unsigned long GetContentEncodingCount() const; + + long 
ParseContentEncodingsEntry(long long start, long long size); + + protected: + Track(Segment*, long long element_start, long long element_size); + + Info m_info; + + class EOSBlock : public BlockEntry { + public: + EOSBlock(); + + Kind GetKind() const; + const Block* GetBlock() const; + }; + + EOSBlock m_eos; + + private: + ContentEncoding** content_encoding_entries_; + ContentEncoding** content_encoding_entries_end_; +}; + +struct PrimaryChromaticity { + PrimaryChromaticity() : x(0), y(0) {} + ~PrimaryChromaticity() {} + static bool Parse(IMkvReader* reader, long long read_pos, + long long value_size, bool is_x, + PrimaryChromaticity** chromaticity); + float x; + float y; +}; + +struct MasteringMetadata { + static const float kValueNotPresent; + + MasteringMetadata() + : r(NULL), + g(NULL), + b(NULL), + white_point(NULL), + luminance_max(kValueNotPresent), + luminance_min(kValueNotPresent) {} + ~MasteringMetadata() { + delete r; + delete g; + delete b; + delete white_point; + } + + static bool Parse(IMkvReader* reader, long long element_start, + long long element_size, + MasteringMetadata** mastering_metadata); + + PrimaryChromaticity* r; + PrimaryChromaticity* g; + PrimaryChromaticity* b; + PrimaryChromaticity* white_point; + float luminance_max; + float luminance_min; +}; + +struct Colour { + static const long long kValueNotPresent; + + // Unless otherwise noted all values assigned upon construction are the + // equivalent of unspecified/default. + Colour() + : matrix_coefficients(kValueNotPresent), + bits_per_channel(kValueNotPresent), + chroma_subsampling_horz(kValueNotPresent), + chroma_subsampling_vert(kValueNotPresent), + cb_subsampling_horz(kValueNotPresent), + cb_subsampling_vert(kValueNotPresent), + chroma_siting_horz(kValueNotPresent), + chroma_siting_vert(kValueNotPresent), + range(kValueNotPresent), + transfer_characteristics(kValueNotPresent), + primaries(kValueNotPresent), + max_cll(kValueNotPresent), + max_fall(kValueNotPresent), + mastering_metadata(NULL) {} + ~Colour() { + delete mastering_metadata; + mastering_metadata = NULL; + } + + static bool Parse(IMkvReader* reader, long long element_start, + long long element_size, Colour** colour); + + long long matrix_coefficients; + long long bits_per_channel; + long long chroma_subsampling_horz; + long long chroma_subsampling_vert; + long long cb_subsampling_horz; + long long cb_subsampling_vert; + long long chroma_siting_horz; + long long chroma_siting_vert; + long long range; + long long transfer_characteristics; + long long primaries; + long long max_cll; + long long max_fall; + + MasteringMetadata* mastering_metadata; +}; + +struct Projection { + enum ProjectionType { + kTypeNotPresent = -1, + kRectangular = 0, + kEquirectangular = 1, + kCubeMap = 2, + kMesh = 3, + }; + static const float kValueNotPresent; + Projection() + : type(kTypeNotPresent), + private_data(NULL), + private_data_length(0), + pose_yaw(kValueNotPresent), + pose_pitch(kValueNotPresent), + pose_roll(kValueNotPresent) {} + ~Projection() { delete[] private_data; } + static bool Parse(IMkvReader* reader, long long element_start, + long long element_size, Projection** projection); + + ProjectionType type; + unsigned char* private_data; + size_t private_data_length; + float pose_yaw; + float pose_pitch; + float pose_roll; +}; + +class VideoTrack : public Track { + VideoTrack(const VideoTrack&); + VideoTrack& operator=(const VideoTrack&); + + VideoTrack(Segment*, long long element_start, long long element_size); + + public: + virtual ~VideoTrack(); + static 
long Parse(Segment*, const Info&, long long element_start, + long long element_size, VideoTrack*&); + + long long GetWidth() const; + long long GetHeight() const; + long long GetDisplayWidth() const; + long long GetDisplayHeight() const; + long long GetDisplayUnit() const; + long long GetStereoMode() const; + double GetFrameRate() const; + + bool VetEntry(const BlockEntry*) const; + long Seek(long long time_ns, const BlockEntry*&) const; + + Colour* GetColour() const; + + Projection* GetProjection() const; + + const char* GetColourSpace() const { return m_colour_space; } + + private: + long long m_width; + long long m_height; + long long m_display_width; + long long m_display_height; + long long m_display_unit; + long long m_stereo_mode; + char* m_colour_space; + double m_rate; + + Colour* m_colour; + Projection* m_projection; +}; + +class AudioTrack : public Track { + AudioTrack(const AudioTrack&); + AudioTrack& operator=(const AudioTrack&); + + AudioTrack(Segment*, long long element_start, long long element_size); + + public: + static long Parse(Segment*, const Info&, long long element_start, + long long element_size, AudioTrack*&); + + double GetSamplingRate() const; + long long GetChannels() const; + long long GetBitDepth() const; + + private: + double m_rate; + long long m_channels; + long long m_bitDepth; +}; + +class Tracks { + Tracks(const Tracks&); + Tracks& operator=(const Tracks&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + Tracks(Segment*, long long start, long long size, long long element_start, + long long element_size); + + ~Tracks(); + + long Parse(); + + unsigned long GetTracksCount() const; + + const Track* GetTrackByNumber(long tn) const; + const Track* GetTrackByIndex(unsigned long idx) const; + + private: + Track** m_trackEntries; + Track** m_trackEntriesEnd; + + long ParseTrackEntry(long long payload_start, long long payload_size, + long long element_start, long long element_size, + Track*&) const; +}; + +class Chapters { + Chapters(const Chapters&); + Chapters& operator=(const Chapters&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + Chapters(Segment*, long long payload_start, long long payload_size, + long long element_start, long long element_size); + + ~Chapters(); + + long Parse(); + + class Atom; + class Edition; + + class Display { + friend class Atom; + Display(); + Display(const Display&); + ~Display(); + Display& operator=(const Display&); + + public: + const char* GetString() const; + const char* GetLanguage() const; + const char* GetCountry() const; + + private: + void Init(); + void ShallowCopy(Display&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + + char* m_string; + char* m_language; + char* m_country; + }; + + class Atom { + friend class Edition; + Atom(); + Atom(const Atom&); + ~Atom(); + Atom& operator=(const Atom&); + + public: + unsigned long long GetUID() const; + const char* GetStringUID() const; + + long long GetStartTimecode() const; + long long GetStopTimecode() const; + + long long GetStartTime(const Chapters*) const; + long long GetStopTime(const Chapters*) const; + + int GetDisplayCount() const; + const Display* GetDisplay(int index) const; + + private: + void Init(); + void ShallowCopy(Atom&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long 
long size); + static long long GetTime(const Chapters*, long long timecode); + + long ParseDisplay(IMkvReader*, long long pos, long long size); + bool ExpandDisplaysArray(); + + char* m_string_uid; + unsigned long long m_uid; + long long m_start_timecode; + long long m_stop_timecode; + + Display* m_displays; + int m_displays_size; + int m_displays_count; + }; + + class Edition { + friend class Chapters; + Edition(); + Edition(const Edition&); + ~Edition(); + Edition& operator=(const Edition&); + + public: + int GetAtomCount() const; + const Atom* GetAtom(int index) const; + + private: + void Init(); + void ShallowCopy(Edition&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + + long ParseAtom(IMkvReader*, long long pos, long long size); + bool ExpandAtomsArray(); + + Atom* m_atoms; + int m_atoms_size; + int m_atoms_count; + }; + + int GetEditionCount() const; + const Edition* GetEdition(int index) const; + + private: + long ParseEdition(long long pos, long long size); + bool ExpandEditionsArray(); + + Edition* m_editions; + int m_editions_size; + int m_editions_count; +}; + +class Tags { + Tags(const Tags&); + Tags& operator=(const Tags&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + Tags(Segment*, long long payload_start, long long payload_size, + long long element_start, long long element_size); + + ~Tags(); + + long Parse(); + + class Tag; + class SimpleTag; + + class SimpleTag { + friend class Tag; + SimpleTag(); + SimpleTag(const SimpleTag&); + ~SimpleTag(); + SimpleTag& operator=(const SimpleTag&); + + public: + const char* GetTagName() const; + const char* GetTagString() const; + + private: + void Init(); + void ShallowCopy(SimpleTag&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + + char* m_tag_name; + char* m_tag_string; + }; + + class Tag { + friend class Tags; + Tag(); + Tag(const Tag&); + ~Tag(); + Tag& operator=(const Tag&); + + public: + int GetSimpleTagCount() const; + const SimpleTag* GetSimpleTag(int index) const; + + private: + void Init(); + void ShallowCopy(Tag&) const; + void Clear(); + long Parse(IMkvReader*, long long pos, long long size); + + long ParseSimpleTag(IMkvReader*, long long pos, long long size); + bool ExpandSimpleTagsArray(); + + SimpleTag* m_simple_tags; + int m_simple_tags_size; + int m_simple_tags_count; + }; + + int GetTagCount() const; + const Tag* GetTag(int index) const; + + private: + long ParseTag(long long pos, long long size); + bool ExpandTagsArray(); + + Tag* m_tags; + int m_tags_size; + int m_tags_count; +}; + +class SegmentInfo { + SegmentInfo(const SegmentInfo&); + SegmentInfo& operator=(const SegmentInfo&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + SegmentInfo(Segment*, long long start, long long size, + long long element_start, long long element_size); + + ~SegmentInfo(); + + long Parse(); + + long long GetTimeCodeScale() const; + long long GetDuration() const; // scaled + const char* GetMuxingAppAsUTF8() const; + const char* GetWritingAppAsUTF8() const; + const char* GetTitleAsUTF8() const; + + private: + long long m_timecodeScale; + double m_duration; + char* m_pMuxingAppAsUTF8; + char* m_pWritingAppAsUTF8; + char* m_pTitleAsUTF8; +}; + +class SeekHead { + SeekHead(const SeekHead&); + SeekHead& operator=(const SeekHead&); + 
+ public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + SeekHead(Segment*, long long start, long long size, long long element_start, + long long element_size); + + ~SeekHead(); + + long Parse(); + + struct Entry { + Entry(); + + // the SeekHead entry payload + long long id; + long long pos; + + // absolute pos of SeekEntry ID + long long element_start; + + // SeekEntry ID size + size size + payload + long long element_size; + }; + + int GetCount() const; + const Entry* GetEntry(int idx) const; + + struct VoidElement { + // absolute pos of Void ID + long long element_start; + + // ID size + size size + payload size + long long element_size; + }; + + int GetVoidElementCount() const; + const VoidElement* GetVoidElement(int idx) const; + + private: + Entry* m_entries; + int m_entry_count; + + VoidElement* m_void_elements; + int m_void_element_count; + + static bool ParseEntry(IMkvReader*, + long long pos, // payload + long long size, Entry*); +}; + +class Cues; +class CuePoint { + friend class Cues; + + CuePoint(long, long long); + ~CuePoint(); + + CuePoint(const CuePoint&); + CuePoint& operator=(const CuePoint&); + + public: + long long m_element_start; + long long m_element_size; + + bool Load(IMkvReader*); + + long long GetTimeCode() const; // absolute but unscaled + long long GetTime(const Segment*) const; // absolute and scaled (ns units) + + struct TrackPosition { + long long m_track; + long long m_pos; // of cluster + long long m_block; + // codec_state //defaults to 0 + // reference = clusters containing req'd referenced blocks + // reftime = timecode of the referenced block + + bool Parse(IMkvReader*, long long, long long); + }; + + const TrackPosition* Find(const Track*) const; + + private: + const long m_index; + long long m_timecode; + TrackPosition* m_track_positions; + size_t m_track_positions_count; +}; + +class Cues { + friend class Segment; + + Cues(Segment*, long long start, long long size, long long element_start, + long long element_size); + ~Cues(); + + Cues(const Cues&); + Cues& operator=(const Cues&); + + public: + Segment* const m_pSegment; + const long long m_start; + const long long m_size; + const long long m_element_start; + const long long m_element_size; + + bool Find( // lower bound of time_ns + long long time_ns, const Track*, const CuePoint*&, + const CuePoint::TrackPosition*&) const; + + const CuePoint* GetFirst() const; + const CuePoint* GetLast() const; + const CuePoint* GetNext(const CuePoint*) const; + + const BlockEntry* GetBlock(const CuePoint*, + const CuePoint::TrackPosition*) const; + + bool LoadCuePoint() const; + long GetCount() const; // loaded only + // long GetTotal() const; //loaded + preloaded + bool DoneParsing() const; + + private: + bool Init() const; + bool PreloadCuePoint(long&, long long) const; + + mutable CuePoint** m_cue_points; + mutable long m_count; + mutable long m_preload_count; + mutable long long m_pos; +}; + +class Cluster { + friend class Segment; + + Cluster(const Cluster&); + Cluster& operator=(const Cluster&); + + public: + Segment* const m_pSegment; + + public: + static Cluster* Create(Segment*, + long index, // index in segment + long long off); // offset relative to segment + // long long element_size); + + Cluster(); // EndOfStream + ~Cluster(); + + bool EOS() const; + + long long GetTimeCode() const; // absolute, but not scaled + long long GetTime() const; // absolute, and scaled (nanosecond units) + long long 
GetFirstTime() const; // time (ns) of first (earliest) block + long long GetLastTime() const; // time (ns) of last (latest) block + + long GetFirst(const BlockEntry*&) const; + long GetLast(const BlockEntry*&) const; + long GetNext(const BlockEntry* curr, const BlockEntry*& next) const; + + const BlockEntry* GetEntry(const Track*, long long ns = -1) const; + const BlockEntry* GetEntry(const CuePoint&, + const CuePoint::TrackPosition&) const; + // const BlockEntry* GetMaxKey(const VideoTrack*) const; + + // static bool HasBlockEntries(const Segment*, long long); + + static long HasBlockEntries(const Segment*, long long idoff, long long& pos, + long& size); + + long GetEntryCount() const; + + long Load(long long& pos, long& size) const; + + long Parse(long long& pos, long& size) const; + long GetEntry(long index, const mkvparser::BlockEntry*&) const; + + protected: + Cluster(Segment*, long index, long long element_start); + // long long element_size); + + public: + const long long m_element_start; + long long GetPosition() const; // offset relative to segment + + long GetIndex() const; + long long GetElementSize() const; + // long long GetPayloadSize() const; + + // long long Unparsed() const; + + private: + long m_index; + mutable long long m_pos; + // mutable long long m_size; + mutable long long m_element_size; + mutable long long m_timecode; + mutable BlockEntry** m_entries; + mutable long m_entries_size; + mutable long m_entries_count; + + long ParseSimpleBlock(long long, long long&, long&); + long ParseBlockGroup(long long, long long&, long&); + + long CreateBlock(long long id, long long pos, long long size, + long long discard_padding); + long CreateBlockGroup(long long start_offset, long long size, + long long discard_padding); + long CreateSimpleBlock(long long, long long); +}; + +class Segment { + friend class Cues; + friend class Track; + friend class VideoTrack; + + Segment(const Segment&); + Segment& operator=(const Segment&); + + private: + Segment(IMkvReader*, long long elem_start, + // long long elem_size, + long long pos, long long size); + + public: + IMkvReader* const m_pReader; + const long long m_element_start; + // const long long m_element_size; + const long long m_start; // posn of segment payload + const long long m_size; // size of segment payload + Cluster m_eos; // TODO: make private? 
+ + static long long CreateInstance(IMkvReader*, long long, Segment*&); + ~Segment(); + + long Load(); // loads headers and all clusters + + // for incremental loading + // long long Unparsed() const; + bool DoneParsing() const; + long long ParseHeaders(); // stops when first cluster is found + // long FindNextCluster(long long& pos, long& size) const; + long LoadCluster(long long& pos, long& size); // load one cluster + long LoadCluster(); + + long ParseNext(const Cluster* pCurr, const Cluster*& pNext, long long& pos, + long& size); + + const SeekHead* GetSeekHead() const; + const Tracks* GetTracks() const; + const SegmentInfo* GetInfo() const; + const Cues* GetCues() const; + const Chapters* GetChapters() const; + const Tags* GetTags() const; + + long long GetDuration() const; + + unsigned long GetCount() const; + const Cluster* GetFirst() const; + const Cluster* GetLast() const; + const Cluster* GetNext(const Cluster*); + + const Cluster* FindCluster(long long time_nanoseconds) const; + // const BlockEntry* Seek(long long time_nanoseconds, const Track*) const; + + const Cluster* FindOrPreloadCluster(long long pos); + + long ParseCues(long long cues_off, // offset relative to start of segment + long long& parse_pos, long& parse_len); + + private: + long long m_pos; // absolute file posn; what has been consumed so far + Cluster* m_pUnknownSize; + + SeekHead* m_pSeekHead; + SegmentInfo* m_pInfo; + Tracks* m_pTracks; + Cues* m_pCues; + Chapters* m_pChapters; + Tags* m_pTags; + Cluster** m_clusters; + long m_clusterCount; // number of entries for which m_index >= 0 + long m_clusterPreloadCount; // number of entries for which m_index < 0 + long m_clusterSize; // array size + + long DoLoadCluster(long long&, long&); + long DoLoadClusterUnknownSize(long long&, long&); + long DoParseNext(const Cluster*&, long long&, long&); + + bool AppendCluster(Cluster*); + bool PreloadCluster(Cluster*, ptrdiff_t); + + // void ParseSeekHead(long long pos, long long size); + // void ParseSeekEntry(long long pos, long long size); + // void ParseCues(long long); + + const BlockEntry* GetBlock(const CuePoint&, const CuePoint::TrackPosition&); +}; + +} // namespace mkvparser + +inline long mkvparser::Segment::LoadCluster() { + long long pos; + long size; + + return LoadCluster(pos, size); +} + +#endif // MKVPARSER_MKVPARSER_H_ diff --git a/libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.cc b/libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.cc new file mode 100644 index 000000000..9d19c1be5 --- /dev/null +++ b/libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2010 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. 
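+// Editor's note: a minimal, assumed usage sketch (not part of the original +// file) combining MkvReader with the parser entry points declared in +// mkvparser.h; "input.webm" is a placeholder path and error checks are +// elided: +// +// mkvparser::MkvReader reader; +// if (reader.Open("input.webm") == 0) { +// long long pos = 0; +// mkvparser::EBMLHeader ebml_header; +// ebml_header.Parse(&reader, pos); // consumes the EBML header +// mkvparser::Segment* segment = NULL; +// mkvparser::Segment::CreateInstance(&reader, pos, segment); +// segment->Load(); // parses headers and clusters +// delete segment; +// } + 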
+#include "mkvparser/mkvreader.h" + +#include <sys/types.h> + +#include <cassert> + +namespace mkvparser { + +MkvReader::MkvReader() : m_file(NULL), reader_owns_file_(true) {} + +MkvReader::MkvReader(FILE* fp) : m_file(fp), reader_owns_file_(false) { + GetFileSize(); +} + +MkvReader::~MkvReader() { + if (reader_owns_file_) + Close(); + m_file = NULL; +} + +int MkvReader::Open(const char* fileName) { + if (fileName == NULL) + return -1; + + if (m_file) + return -1; + +#ifdef _MSC_VER + const errno_t e = fopen_s(&m_file, fileName, "rb"); + + if (e) + return -1; // error +#else + m_file = fopen(fileName, "rb"); + + if (m_file == NULL) + return -1; +#endif + return !GetFileSize(); +} + +bool MkvReader::GetFileSize() { + if (m_file == NULL) + return false; +#ifdef _MSC_VER + int status = _fseeki64(m_file, 0L, SEEK_END); + + if (status) + return false; // error + + m_length = _ftelli64(m_file); +#else + fseek(m_file, 0L, SEEK_END); + m_length = ftell(m_file); +#endif + assert(m_length >= 0); + + if (m_length < 0) + return false; + +#ifdef _MSC_VER + status = _fseeki64(m_file, 0L, SEEK_SET); + + if (status) + return false; // error +#else + fseek(m_file, 0L, SEEK_SET); +#endif + + return true; +} + +void MkvReader::Close() { + if (m_file != NULL) { + fclose(m_file); + m_file = NULL; + } +} + +int MkvReader::Length(long long* total, long long* available) { + if (m_file == NULL) + return -1; + + if (total) + *total = m_length; + + if (available) + *available = m_length; + + return 0; +} + +int MkvReader::Read(long long offset, long len, unsigned char* buffer) { + if (m_file == NULL) + return -1; + + if (offset < 0) + return -1; + + if (len < 0) + return -1; + + if (len == 0) + return 0; + + if (offset >= m_length) + return -1; + +#ifdef _MSC_VER + const int status = _fseeki64(m_file, offset, SEEK_SET); + + if (status) + return -1; // error +#elif defined(_WIN32) + fseeko64(m_file, static_cast<off_t>(offset), SEEK_SET); +#else + fseeko(m_file, static_cast<off_t>(offset), SEEK_SET); +#endif + + const size_t size = fread(buffer, 1, len, m_file); + + if (size < size_t(len)) + return -1; // error + + return 0; // success +} + +} // namespace mkvparser diff --git a/libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.h b/libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.h new file mode 100644 index 000000000..9831ecf64 --- /dev/null +++ b/libs/libaom/src/third_party/libwebm/mkvparser/mkvreader.h @@ -0,0 +1,45 @@ +// Copyright (c) 2010 The WebM project authors. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the LICENSE file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +#ifndef MKVPARSER_MKVREADER_H_ +#define MKVPARSER_MKVREADER_H_ + +#include <cstdio> + +#include "mkvparser/mkvparser.h" + +namespace mkvparser { + +class MkvReader : public IMkvReader { + public: + MkvReader(); + explicit MkvReader(FILE* fp); + virtual ~MkvReader(); + + int Open(const char*); + void Close(); + + virtual int Read(long long position, long length, unsigned char* buffer); + virtual int Length(long long* total, long long* available); + + private: + MkvReader(const MkvReader&); + MkvReader& operator=(const MkvReader&); + + // Determines the size of the file. This is called either by the constructor + // or by the Open function depending on file ownership. Returns true on + // success. 
+ bool GetFileSize(); + + long long m_length; + FILE* m_file; + bool reader_owns_file_; +}; + +} // namespace mkvparser + +#endif // MKVPARSER_MKVREADER_H_ diff --git a/libs/libaom/src/third_party/libyuv/README.libaom b/libs/libaom/src/third_party/libyuv/README.libaom new file mode 100644 index 000000000..09693c1f2 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/README.libaom @@ -0,0 +1,15 @@ +Name: libyuv +URL: http://code.google.com/p/libyuv/ +Version: 1456 +License: BSD +License File: LICENSE + +Description: +libyuv is an open source project that includes YUV conversion and scaling +functionality. + +The optimized scaler in libyuv is used in the multiple resolution encoder example, +which down-samples the original input video (e.g. 1280x720) a number of times +in order to encode multiple resolution bit streams. + +Local Modifications: diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/basic_types.h b/libs/libaom/src/third_party/libyuv/include/libyuv/basic_types.h new file mode 100644 index 000000000..66e68536c --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/basic_types.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT +#define INCLUDE_LIBYUV_BASIC_TYPES_H_ + +#include <stddef.h> // for NULL, size_t + +#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600)) +#include <sys/types.h> // for uintptr_t on x86 +#else +#include <stdint.h> // for uintptr_t +#endif + +#ifndef GG_LONGLONG +#ifndef INT_TYPES_DEFINED +#define INT_TYPES_DEFINED +#ifdef COMPILER_MSVC +typedef unsigned __int64 uint64; +typedef __int64 int64; +#ifndef INT64_C +#define INT64_C(x) x ## I64 +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UI64 +#endif +#define INT64_F "I64" +#else // COMPILER_MSVC +#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) +typedef unsigned long uint64; // NOLINT +typedef long int64; // NOLINT +#ifndef INT64_C +#define INT64_C(x) x ## L +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UL +#endif +#define INT64_F "l" +#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) +typedef unsigned long long uint64; // NOLINT +typedef long long int64; // NOLINT +#ifndef INT64_C +#define INT64_C(x) x ## LL +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## ULL +#endif +#define INT64_F "ll" +#endif // __LP64__ +#endif // COMPILER_MSVC +typedef unsigned int uint32; +typedef int int32; +typedef unsigned short uint16; // NOLINT +typedef short int16; // NOLINT +typedef unsigned char uint8; +typedef signed char int8; +#endif // INT_TYPES_DEFINED +#endif // GG_LONGLONG + +// Detect compiler is for x86 or x64. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) +#define CPU_X86 1 +#endif +// Detect compiler is for ARM. 
+#if defined(__arm__) || defined(_M_ARM) +#define CPU_ARM 1 +#endif + +#ifndef ALIGNP +#ifdef __cplusplus +#define ALIGNP(p, t) \ + (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \ + ((t) - 1)) & ~((t) - 1)))) +#else +#define ALIGNP(p, t) \ + ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */ +#endif +#endif + +#if !defined(LIBYUV_API) +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(LIBYUV_BUILDING_SHARED_LIBRARY) +#define LIBYUV_API __declspec(dllexport) +#elif defined(LIBYUV_USING_SHARED_LIBRARY) +#define LIBYUV_API __declspec(dllimport) +#else +#define LIBYUV_API +#endif // LIBYUV_BUILDING_SHARED_LIBRARY +#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ + (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ + defined(LIBYUV_USING_SHARED_LIBRARY)) +#define LIBYUV_API __attribute__ ((visibility ("default"))) +#else +#define LIBYUV_API +#endif // __GNUC__ +#endif // LIBYUV_API + +#define LIBYUV_BOOL int +#define LIBYUV_FALSE 0 +#define LIBYUV_TRUE 1 + +// Visual C x86 or GCC little endian. +#if defined(__x86_64__) || defined(_M_X64) || \ + defined(__i386__) || defined(_M_IX86) || \ + defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define LIBYUV_LITTLE_ENDIAN +#endif + +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/compare.h b/libs/libaom/src/third_party/libyuv/include/libyuv/compare.h new file mode 100644 index 000000000..2a9f1560c --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/compare.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT +#define INCLUDE_LIBYUV_COMPARE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Compute a hash for specified memory. Seed of 5381 recommended. +LIBYUV_API +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); + +// Scan an opaque argb image and return fourcc based on alpha offset. +// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. +LIBYUV_API +uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height); + +// Sum Square Error - used to compute Mean Square Error or PSNR. 
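+// Editor's note (added comment): for 8-bit data the helpers below relate as +// MSE = sse / count and PSNR = 10 * log10(255^2 / MSE); i.e. +// SumSquareErrorToPsnr(sse, count) evaluates 10 * log10(255.0 * 255.0 * +// count / sse), clamped to kMaxPsnr (which is also returned when sse is 0). 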
+LIBYUV_API +uint64 ComputeSumSquareError(const uint8* src_a, + const uint8* src_b, int count); + +LIBYUV_API +uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +static const int kMaxPsnr = 128; + +LIBYUV_API +double SumSquareErrorToPsnr(uint64 sse, uint64 count); + +LIBYUV_API +double CalcFramePsnr(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +LIBYUV_API +double I420Psnr(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height); + +LIBYUV_API +double CalcFrameSsim(const uint8* src_a, int stride_a, + const uint8* src_b, int stride_b, + int width, int height); + +LIBYUV_API +double I420Ssim(const uint8* src_y_a, int stride_y_a, + const uint8* src_u_a, int stride_u_a, + const uint8* src_v_a, int stride_v_a, + const uint8* src_y_b, int stride_y_b, + const uint8* src_u_b, int stride_u_b, + const uint8* src_v_b, int stride_v_b, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/convert.h b/libs/libaom/src/third_party/libyuv/include/libyuv/convert.h new file mode 100644 index 000000000..d6f206c10 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/convert.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_H_ + +#include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes. +#include "libyuv/convert_from.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert I444 to I420. +LIBYUV_API +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I422 to I420. +LIBYUV_API +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I411 to I420. +LIBYUV_API +int I411ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy I420 to I420. 
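+// (Editor's note: in the I420 layout assumed throughout this header, the Y +// plane is width x height bytes and the U and V planes are each +// ((width + 1) / 2) x ((height + 1) / 2) bytes, i.e. 4:2:0 chroma +// subsampling.) 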
+#define I420ToI420 I420Copy +LIBYUV_API +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert I400 (grey) to I420. +LIBYUV_API +int I400ToI420(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +#define J400ToJ420 I400ToI420 + +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert NV21 to I420. +LIBYUV_API +int NV21ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert YUY2 to I420. +LIBYUV_API +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// ARGB little endian (bgra in memory) to I420. +LIBYUV_API +int ARGBToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// BGRA little endian (argb in memory) to I420. +LIBYUV_API +int BGRAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// ABGR little endian (rgba in memory) to I420. +LIBYUV_API +int ABGRToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGBA little endian (abgr in memory) to I420. +LIBYUV_API +int RGBAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB little endian (bgr in memory) to I420. +LIBYUV_API +int RGB24ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB big endian (rgb in memory) to I420. +LIBYUV_API +int RAWToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB16 (RGBP fourcc) little endian to I420. 
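+// (Editor's note: RGB565 packs each pixel into 16 bits as 5 bits red, +// 6 bits green and 5 bits blue, hence the RGB16 name.) 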
+LIBYUV_API +int RGB565ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB15 (RGBO fourcc) little endian to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// RGB12 (R444 fourcc) little endian to I420. +LIBYUV_API +int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture. +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToI420(const uint8* sample, size_t sample_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, + int dst_width, int dst_height); + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8* sample, size_t sample_size, + int* width, int* height); +#endif + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_y" number of bytes in a row of the dst_y plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "format" is a fourcc. ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. +LIBYUV_API +int ConvertToI420(const uint8* src_frame, size_t src_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/convert_argb.h b/libs/libaom/src/third_party/libyuv/include/libyuv/convert_argb.h new file mode 100644 index 000000000..ea75c0b26 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/convert_argb.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_ARGB_H_ + +#include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following headers includes +#include "libyuv/convert_from.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" + +// TODO(fbarchard): This set of functions should exactly match convert.h +// TODO(fbarchard): Add tests. Create random content of right size and convert +// with C vs Opt and or to I420 and compare. +// TODO(fbarchard): Some of these functions lack parameter setting. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Alias. +#define ARGBToARGB ARGBCopy + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I420 to ARGB. +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I411 to ARGB. +LIBYUV_API +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I400 (grey) to ARGB. Reverse of ARGBToI400. +LIBYUV_API +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert J400 (jpeg grey) to ARGB. +LIBYUV_API +int J400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Alias. +#define YToARGB I400ToARGB + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert J420 to ARGB. 
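+// J420 is I420 with full-range (JPEG) luma/chroma levels rather than
+// limited-range video levels, so a different set of YUV-to-RGB coefficients
+// is applied. A sketch of a call (buffer names are illustrative; dst_argb
+// holds width * 4 bytes per row, bgra byte order in memory):
+//   J420ToARGB(src_y, width,
+//              src_u, (width + 1) / 2,
+//              src_v, (width + 1) / 2,
+//              dst_argb, width * 4, width, height);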
+LIBYUV_API +int J420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert J422 to ARGB. +LIBYUV_API +int J422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// BGRA little endian (argb in memory) to ARGB. +LIBYUV_API +int BGRAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// ABGR little endian (rgba in memory) to ARGB. +LIBYUV_API +int ABGRToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGBA little endian (abgr in memory) to ARGB. +LIBYUV_API +int RGBAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Deprecated function name. +#define BG24ToARGB RGB24ToARGB + +// RGB little endian (bgr in memory) to ARGB. +LIBYUV_API +int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB big endian (rgb in memory) to ARGB. +LIBYUV_API +int RAWToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB16 (RGBP fourcc) little endian to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB15 (RGBO fourcc) little endian to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// RGB12 (R444 fourcc) little endian to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +#ifdef HAVE_JPEG +// src_width/height provided by capture +// dst_width/height for clipping determine final size. +LIBYUV_API +int MJPGToARGB(const uint8* sample, size_t sample_size, + uint8* dst_argb, int dst_stride_argb, + int src_width, int src_height, + int dst_width, int dst_height); +#endif + +// Convert camera sample to ARGB with cropping, rotation and vertical flip. +// "src_size" is needed to parse MJPG. +// "dst_stride_argb" number of bytes in a row of the dst_argb plane. +// Normally this would be the same as dst_width, with recommended alignment +// to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. The caller should +// allocate the I420 buffer according to rotation. +// "dst_stride_u" number of bytes in a row of the dst_u plane. +// Normally this would be the same as (dst_width + 1) / 2, with +// recommended alignment to 16 bytes for better efficiency. +// If rotation of 90 or 270 is used, stride is affected. +// "crop_x" and "crop_y" are starting position for cropping. +// To center, crop_x = (src_width - dst_width) / 2 +// crop_y = (src_height - dst_height) / 2 +// "src_width" / "src_height" is size of src_frame in pixels. +// "src_height" can be negative indicating a vertically flipped image source. +// "crop_width" / "crop_height" is the size to crop the src to. +// Must be less than or equal to src_width/src_height +// Cropping parameters are pre-rotation. +// "rotation" can be 0, 90, 180 or 270. +// "format" is a fourcc. 
ie 'I420', 'YUY2' +// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. +LIBYUV_API +int ConvertToARGB(const uint8* src_frame, size_t src_size, + uint8* dst_argb, int dst_stride_argb, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/convert_from.h b/libs/libaom/src/third_party/libyuv/include/libyuv/convert_from.h new file mode 100644 index 000000000..3591b4fd6 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/convert_from.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// See Also convert.h for conversions from formats to I420. + +// I420Copy in convert to I420ToI420. + +LIBYUV_API +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int I420ToI444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int I420ToI411(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. 
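+// All of those sources work because they share the same 8 bit Y plane
+// layout; only the Y pointer and stride are passed and chroma is ignored.
+// Sketch (names are illustrative):
+//   I400Copy(src_y, src_stride_y, dst_y, width, width, height);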
+LIBYUV_API +int I400Copy(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// TODO(fbarchard): I420ToM420 + +LIBYUV_API +int I420ToNV12(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +LIBYUV_API +int I420ToNV21(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +LIBYUV_API +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +LIBYUV_API +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +LIBYUV_API +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes). +// Values in dither matrix from 0 to 7 recommended. +// The order of the dither matrix is first byte is upper left. + +LIBYUV_API +int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + const uint8* dither4x4, int width, int height); + +LIBYUV_API +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +LIBYUV_API +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Convert I420 to specified format. 
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the +// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. +LIBYUV_API +int ConvertFromI420(const uint8* y, int y_stride, + const uint8* u, int u_stride, + const uint8* v, int v_stride, + uint8* dst_sample, int dst_sample_stride, + int width, int height, + uint32 format); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/convert_from_argb.h b/libs/libaom/src/third_party/libyuv/include/libyuv/convert_from_argb.h new file mode 100644 index 000000000..4a6226813 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB to ARGB. +#define ARGBToARGB ARGBCopy +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To BGRA. +LIBYUV_API +int ARGBToBGRA(const uint8* src_argb, int src_stride_argb, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height); + +// Convert ARGB To ABGR. +LIBYUV_API +int ARGBToABGR(const uint8* src_argb, int src_stride_argb, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + +// Convert ARGB To RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height); + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height); + +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). +// Values in dither matrix from 0 to 7 recommended. +// The order of the dither matrix is first byte is upper left. +// TODO(fbarchard): Consider pointer to 2d array for dither4x4. +// const uint8(*dither)[4][4]; +LIBYUV_API +int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + const uint8* dither4x4, int width, int height); + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height); + +// Convert ARGB To ARGB4444. 
+LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height); + +// Convert ARGB To I444. +LIBYUV_API +int ARGBToI444(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I422. +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I420. (also in convert.h) +LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J420. (JPeg full range I420). +LIBYUV_API +int ARGBToJ420(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J422. +LIBYUV_API +int ARGBToJ422(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I411. +LIBYUV_API +int ARGBToI411(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J400. (JPeg full range). +LIBYUV_API +int ARGBToJ400(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + int width, int height); + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB) +LIBYUV_API +int ARGBToG(const uint8* src_argb, int src_stride_argb, + uint8* dst_g, int dst_stride_g, + int width, int height); + +// Convert ARGB To NV12. +LIBYUV_API +int ARGBToNV12(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +// Convert ARGB To NV21. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +// Convert ARGB To YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height); + +// Convert ARGB To UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/cpu_id.h b/libs/libaom/src/third_party/libyuv/include/libyuv/cpu_id.h new file mode 100644 index 000000000..870e94e8c --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/cpu_id.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT +#define INCLUDE_LIBYUV_CPU_ID_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// TODO(fbarchard): Consider overlapping bits for different architectures. +// Internal flag to indicate cpuid requires initialization. +#define kCpuInit 0x1 + +// These flags are only valid on ARM processors. +static const int kCpuHasARM = 0x2; +static const int kCpuHasNEON = 0x4; +// 0x8 reserved for future ARM flag. + +// These flags are only valid on x86 processors. +static const int kCpuHasX86 = 0x10; +static const int kCpuHasSSE2 = 0x20; +static const int kCpuHasSSSE3 = 0x40; +static const int kCpuHasSSE41 = 0x80; +static const int kCpuHasSSE42 = 0x100; +static const int kCpuHasAVX = 0x200; +static const int kCpuHasAVX2 = 0x400; +static const int kCpuHasERMS = 0x800; +static const int kCpuHasFMA3 = 0x1000; +// 0x2000, 0x4000, 0x8000 reserved for future X86 flags. + +// These flags are only valid on MIPS processors. +static const int kCpuHasMIPS = 0x10000; +static const int kCpuHasMIPS_DSP = 0x20000; +static const int kCpuHasMIPS_DSPR2 = 0x40000; + +// Internal function used to auto-init. +LIBYUV_API +int InitCpuFlags(void); + +// Internal function for parsing /proc/cpuinfo. +LIBYUV_API +int ArmCpuCaps(const char* cpuinfo_name); + +// Detect CPU has SSE2 etc. +// Test_flag parameter should be one of kCpuHas constants above. +// returns non-zero if instruction set is detected +static __inline int TestCpuFlag(int test_flag) { + LIBYUV_API extern int cpu_info_; + return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag; +} + +// For testing, allow CPU flags to be disabled. +// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. +// MaskCpuFlags(-1) to enable all cpu specific optimizations. +// MaskCpuFlags(0) to disable all cpu specific optimizations. +LIBYUV_API +void MaskCpuFlags(int enable_flags); + +// Low level cpuid for X86. Returns zeros on other CPUs. +// eax is the info type that you want. +// ecx is typically the cpu number, and should normally be zero. +LIBYUV_API +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/libs/libaom/src/third_party/libyuv/include/libyuv/mjpeg_decoder.h new file mode 100644 index 000000000..fa1e51f9a --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/mjpeg_decoder.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT +#define INCLUDE_LIBYUV_MJPEG_DECODER_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +// NOTE: For a simplified public API use convert.h MJPGToI420(). + +struct jpeg_common_struct; +struct jpeg_decompress_struct; +struct jpeg_source_mgr; + +namespace libyuv { + +#ifdef __cplusplus +extern "C" { +#endif + +LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size); + +#ifdef __cplusplus +} // extern "C" +#endif + +static const uint32 kUnknownDataSize = 0xFFFFFFFF; + +enum JpegSubsamplingType { + kJpegYuv420, + kJpegYuv422, + kJpegYuv411, + kJpegYuv444, + kJpegYuv400, + kJpegUnknown +}; + +struct Buffer { + const uint8* data; + int len; +}; + +struct BufferVector { + Buffer* buffers; + int len; + int pos; +}; + +struct SetJmpErrorMgr; + +// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are +// simply independent JPEG images with a fixed huffman table (which is omitted). +// It is rarely used in video transmission, but is common as a camera capture +// format, especially in Logitech devices. This class implements a decoder for +// MJPEG frames. +// +// See http://tools.ietf.org/html/rfc2435 +class LIBYUV_API MJpegDecoder { + public: + typedef void (*CallbackFunction)(void* opaque, + const uint8* const* data, + const int* strides, + int rows); + + static const int kColorSpaceUnknown; + static const int kColorSpaceGrayscale; + static const int kColorSpaceRgb; + static const int kColorSpaceYCbCr; + static const int kColorSpaceCMYK; + static const int kColorSpaceYCCK; + + MJpegDecoder(); + ~MJpegDecoder(); + + // Loads a new frame, reads its headers, and determines the uncompressed + // image format. + // Returns LIBYUV_TRUE if image looks valid and format is supported. + // If return value is LIBYUV_TRUE, then the values for all the following + // getters are populated. + // src_len is the size of the compressed mjpeg frame in bytes. + LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len); + + // Returns width of the last loaded frame in pixels. + int GetWidth(); + + // Returns height of the last loaded frame in pixels. + int GetHeight(); + + // Returns format of the last loaded frame. The return value is one of the + // kColorSpace* constants. + int GetColorSpace(); + + // Number of color components in the color space. + int GetNumComponents(); + + // Sample factors of the n-th component. + int GetHorizSampFactor(int component); + + int GetVertSampFactor(int component); + + int GetHorizSubSampFactor(int component); + + int GetVertSubSampFactor(int component); + + // Public for testability. + int GetImageScanlinesPerImcuRow(); + + // Public for testability. + int GetComponentScanlinesPerImcuRow(int component); + + // Width of a component in bytes. + int GetComponentWidth(int component); + + // Height of a component. + int GetComponentHeight(int component); + + // Width of a component in bytes with padding for DCTSIZE. Public for testing. + int GetComponentStride(int component); + + // Size of a component in bytes. + int GetComponentSize(int component); + + // Call this after LoadFrame() if you decide you don't want to decode it + // after all. + LIBYUV_BOOL UnloadFrame(); + + // Decodes the entire image into a one-buffer-per-color-component format. + // dst_width must match exactly. 
dst_height must be <= to image height; if + // less, the image is cropped. "planes" must have size equal to at least + // GetNumComponents() and they must point to non-overlapping buffers of size + // at least GetComponentSize(i). The pointers in planes are incremented + // to point to after the end of the written data. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height); + + // Decodes the entire image and passes the data via repeated calls to a + // callback function. Each call will get the data for a whole number of + // image scanlines. + // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. + LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque, + int dst_width, int dst_height); + + // The helper function which recognizes the jpeg sub-sampling type. + static JpegSubsamplingType JpegSubsamplingTypeHelper( + int* subsample_x, int* subsample_y, int number_of_components); + + private: + void AllocOutputBuffers(int num_outbufs); + void DestroyOutputBuffers(); + + LIBYUV_BOOL StartDecode(); + LIBYUV_BOOL FinishDecode(); + + void SetScanlinePointers(uint8** data); + LIBYUV_BOOL DecodeImcuRow(); + + int GetComponentScanlinePadding(int component); + + // A buffer holding the input data for a frame. + Buffer buf_; + BufferVector buf_vec_; + + jpeg_decompress_struct* decompress_struct_; + jpeg_source_mgr* source_mgr_; + SetJmpErrorMgr* error_mgr_; + + // LIBYUV_TRUE iff at least one component has scanline padding. (i.e., + // GetComponentScanlinePadding() != 0.) + LIBYUV_BOOL has_scanline_padding_; + + // Temporaries used to point to scanline outputs. + int num_outbufs_; // Outermost size of all arrays below. + uint8*** scanlines_; + int* scanlines_sizes_; + // Temporary buffer used for decoding when we can't decode directly to the + // output buffers. Large enough for just one iMCU row. + uint8** databuf_; + int* databuf_strides_; +}; + +} // namespace libyuv + +#endif // __cplusplus +#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/planar_functions.h b/libs/libaom/src/third_party/libyuv/include/libyuv/planar_functions.h new file mode 100644 index 000000000..7fe4d8eed --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/planar_functions.h @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT +#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ + +#include "libyuv/basic_types.h" + +// TODO(fbarchard): Remove the following headers includes. +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy a plane of data. 
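+// Because the plane functions take a pointer plus a byte stride, a
+// rectangular sub-region can be addressed without copying the whole image.
+// Sketch of copying a w x h rect at (x, y) (names are illustrative):
+//   CopyPlane(src + y * src_stride + x, src_stride,
+//             dst, dst_stride, w, h);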
+LIBYUV_API +void CopyPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +LIBYUV_API +void CopyPlane_16(const uint16* src_y, int src_stride_y, + uint16* dst_y, int dst_stride_y, + int width, int height); + +// Set a plane of data to a 32 bit value. +LIBYUV_API +void SetPlane(uint8* dst_y, int dst_stride_y, + int width, int height, + uint32 value); + +// Copy I400. Supports inverting. +LIBYUV_API +int I400ToI400(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +#define J400ToJ400 I400ToI400 + +// Copy I422 to I422. +#define I422ToI422 I422Copy +LIBYUV_API +int I422Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Copy I444 to I444. +#define I444ToI444 I444Copy +LIBYUV_API +int I444Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +LIBYUV_API +int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +LIBYUV_API +int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +// Convert I420 to I400. (calls CopyPlane ignoring u/v). +LIBYUV_API +int I420ToI400(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Alias +#define J420ToJ400 I420ToI400 +#define I420ToI420Mirror I420Mirror + +// I420 mirror. +LIBYUV_API +int I420Mirror(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Alias +#define I400ToI400Mirror I400Mirror + +// I400 mirror. A single plane is mirrored horizontally. +// Pass negative height to achieve 180 degree rotation. +LIBYUV_API +int I400Mirror(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Alias +#define ARGBToARGBMirror ARGBMirror + +// ARGB mirror. +LIBYUV_API +int ARGBMirror(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert NV21 to RGB565. 
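+// NV21 differs from NV12 only in the byte order of the interleaved chroma
+// plane (VU instead of UV); both use one chroma stride covering a U and a V
+// byte per 2x2 block. Sketch (names are illustrative):
+//   NV21ToRGB565(src_y, width,
+//                src_vu, ((width + 1) / 2) * 2,
+//                dst_rgb565, width * 2, width, height);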
+LIBYUV_API +int NV21ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// I422ToARGB is in convert_argb.h +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height); + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + +// Convert I422 to RGBA. +LIBYUV_API +int I422ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height); + +// Draw a rectangle into I420. +LIBYUV_API +int I420Rect(uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int x, int y, int width, int height, + int value_y, int value_u, int value_v); + +// Draw a rectangle into ARGB. +LIBYUV_API +int ARGBRect(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height, uint32 value); + +// Convert ARGB to gray scale ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height); + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8* dst_argb, int dst_stride_argb, + int x, int y, int width, int height); + +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The next 4 coefficients apply to B, G, R, A and produce R of the output. +// The last 4 coefficients apply to B, G, R, A and produce A of the output. +LIBYUV_API +int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const int8* matrix_argb, + int width, int height); + +// Deprecated. Use ARGBColorMatrix instead. +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The last 4 coefficients apply to B, G, R, A and produce R of the output. +LIBYUV_API +int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_rgb, + int x, int y, int width, int height); + +// Apply a color table each ARGB pixel. +// Table contains 256 ARGB values. +LIBYUV_API +int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int x, int y, int width, int height); + +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int x, int y, int width, int height); + +// Apply a luma/color table each ARGB pixel but preserve destination alpha. 
+// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from +// RGB (YJ style) and C is an 8 bit color component (R, G or B). +LIBYUV_API +int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const uint8* luma_rgb_table, + int width, int height); + +// Apply a 3 term polynomial to ARGB values. +// poly points to a 4x4 matrix. The first row is constants. The 2nd row is +// coefficients for b, g, r and a. The 3rd row is coefficients for b squared, +// g squared, r squared and a squared. The 4th row is coefficients for b to +// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and +// result clamped to 0 to 255. +// A polynomial approximation can be derived using software such as 'R'. + +LIBYUV_API +int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const float* poly, + int width, int height); + +// Quantize a rectangle of ARGB. Alpha unaffected. +// scale is a 16 bit fractional fixed point scaler between 0 and 65535. +// interval_size should be a value between 1 and 255. +// interval_offset should be a value between 0 and 255. +LIBYUV_API +int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, + int scale, int interval_size, int interval_offset, + int x, int y, int width, int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Copy the alpha channel of ARGB to the alpha channel of ARGB. +LIBYUV_API +int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Copy a Y plane into the alpha channel of ARGB. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); + +// Get function to Alpha Blend ARGB pixels and store to destination. +LIBYUV_API +ARGBBlendRow GetARGBBlend(); + +// Alpha Blend ARGB images and store to destination. +// Alpha of destination is set to 255. +LIBYUV_API +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. +LIBYUV_API +int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Add ARGB image with ARGB image. Saturates to 255. +LIBYUV_API +int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. +LIBYUV_API +int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert I422 to YUY2. +LIBYUV_API +int I422ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Convert I422 to UYVY.
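+// YUY2 and UYVY are the same packed 4:2:2 layout with different byte order:
+// YUY2 stores Y0 U Y1 V per pixel pair, UYVY stores U Y0 V Y1; a packed row
+// is width * 2 bytes for even widths. Sketch (names are illustrative):
+//   I422ToUYVY(src_y, width,
+//              src_u, (width + 1) / 2,
+//              src_v, (width + 1) / 2,
+//              dst, width * 2, width, height);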
+LIBYUV_API +int I422ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + int width, int height); + +// Convert unattenuated ARGB to preattenuated ARGB. +LIBYUV_API +int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert preattenuated ARGB to unattenuated ARGB. +LIBYUV_API +int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert MJPG to ARGB. +LIBYUV_API +int MJPGToARGB(const uint8* sample, size_t sample_size, + uint8* argb, int argb_stride, + int w, int h, int dw, int dh); + +// Internal function - do not call directly. +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +LIBYUV_API +int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height); + +// Blur ARGB image. +// dst_cumsum table of width * (height + 1) * 16 bytes aligned to +// 16 byte boundary. +// dst_stride32_cumsum is number of ints in a row (width * 4). +// radius is number of pixels around the center. e.g. 1 = 3x3. 2 = 5x5. +// Blur is optimized for radius of 5 (11x11) or less. +LIBYUV_API +int ARGBBlur(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height, int radius); + +// Multiply ARGB image by ARGB value. +LIBYUV_API +int ARGBShade(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, uint32 value); + +// Interpolate between two ARGB images using specified amount of interpolation +// (0 to 255) and store to destination. +// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0 +// and 255 means 1% src_argb0 and 99% src_argb1. +// Internally uses ARGBScale bilinear filtering. +// Caveat: This function will write up to 16 bytes beyond the end of dst_argb. +LIBYUV_API +int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int interpolation); + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) +#define LIBYUV_DISABLE_X86 +#endif +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_ARGBAFFINEROW_SSE2 +#endif + +// Row function for copying pixels from a source with a slope to a row +// of destination. Useful for scaling, rotation, mirror, texture mapping. +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); + +// Shuffle ARGB channel order. e.g. BGRA to ARGB. +// shuffler is 16 bytes and must be aligned. +LIBYUV_API +int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + const uint8* shuffler, int width, int height); + +// Sobel ARGB effect with planar output.
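+// The planar variant writes one 8 bit edge-strength byte per pixel rather
+// than replicating it into an ARGB pixel, so the destination is sized like
+// a Y plane. Sketch (names are illustrative):
+//   ARGBSobelToPlane(src_argb, width * 4,
+//                    dst_edges, width, width, height);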
+LIBYUV_API +int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Sobel ARGB effect. +LIBYUV_API +int ARGBSobel(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB. +LIBYUV_API +int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/rotate.h b/libs/libaom/src/third_party/libyuv/include/libyuv/rotate.h new file mode 100644 index 000000000..8a9673f28 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/rotate.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Supported rotation. +typedef enum RotationMode { + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate180 = 180, // Rotate 180 degrees. + kRotate270 = 270, // Rotate 270 degrees clockwise. + + // Deprecated. + kRotateNone = 0, + kRotateClockwise = 90, + kRotateCounterClockwise = 270, +} RotationModeEnum; + +// Rotate I420 frame. +LIBYUV_API +int I420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, enum RotationMode mode); + +// Rotate NV12 input and store in I420. +LIBYUV_API +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, enum RotationMode mode); + +// Rotate a plane by 0, 90, 180, or 270. +LIBYUV_API +int RotatePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int src_width, int src_height, enum RotationMode mode); + +// Rotate planes by 90, 180, 270. Deprecated. +LIBYUV_API +void RotatePlane90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotatePlane180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotatePlane270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void RotateUV90(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +// Rotations for when U and V are interleaved. +// These functions take one input pointer and +// split the data into two buffers while +// rotating them. 
Deprecated. +LIBYUV_API +void RotateUV180(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +LIBYUV_API +void RotateUV270(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +// The 90 and 270 functions are based on transposes. +// Doing a transpose with reversing the read/write +// order will result in a rotation by +- 90 degrees. +// Deprecated. +LIBYUV_API +void TransposePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height); + +LIBYUV_API +void TransposeUV(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/rotate_argb.h b/libs/libaom/src/third_party/libyuv/include/libyuv/rotate_argb.h new file mode 100644 index 000000000..2bdc8ec6b --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/rotate_argb.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/rotate.h" // For RotationMode. + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Rotate ARGB frame +LIBYUV_API +int ARGBRotate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int src_width, int src_height, enum RotationMode mode); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/rotate_row.h b/libs/libaom/src/third_party/libyuv/include/libyuv/rotate_row.h new file mode 100644 index 000000000..d0bfbdd2b --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/rotate_row.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_ROW_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) +#define LIBYUV_DISABLE_X86 +#endif + +// Visual C 2012 required for AVX2. 
+#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// TODO(fbarchard): switch to standard form of inline; fails on clangcl. +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#if defined(__APPLE__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".private_extern _" #name " \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#else +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +#name ": \n" +#endif +#endif + +// The following are available for Visual C: +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + defined(_MSC_VER) && !defined(__clang__) +#define HAS_TRANSPOSEWX8_SSSE3 +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +// The following are available for GCC but not NaCL: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) +#define HAS_TRANSPOSEWX8_SSSE3 +#endif + +// The following are available for 32 bit GCC: +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__) +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +// The following are available for 64 bit GCC but not NaCL: +#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ + defined(__x86_64__) +#define HAS_TRANSPOSEWX8_FAST_SSSE3 +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#define HAS_TRANSPOSEWX8_NEON +#define HAS_TRANSPOSEUVWX8_NEON +#endif + +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_TRANSPOSEWX8_MIPS_DSPR2 +#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2 +#endif // defined(__mips__) + +void TransposeWxH_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width, int height); + +void TransposeWx8_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); + +void TransposeWx8_Any_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); + +void TransposeUVWxH_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +void TransposeUVWx8_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, 
int width); +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/row.h b/libs/libaom/src/third_party/libyuv/include/libyuv/row.h new file mode 100644 index 000000000..5c3187ef7 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/row.h @@ -0,0 +1,1857 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_ROW_H_ + +#include <stdlib.h> // For malloc. + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) + +#ifdef __cplusplus +#define align_buffer_64(var, size) \ + uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63)); \ + uint8* var = reinterpret_cast<uint8*> \ + ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63) +#else +#define align_buffer_64(var, size) \ + uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \ + uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ +#endif + +#define free_aligned_buffer_64(var) \ + free(var##_mem); \ + var = 0 + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) +#define LIBYUV_DISABLE_X86 +#endif +// True if compiling for SSSE3 as a requirement. +#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3)) +#define LIBYUV_SSSE3_ONLY +#endif + +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif +// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON) +#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5)) +#define LIBYUV_DISABLE_NEON +#endif // clang >= 3.5 +#endif // __clang__ + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +// Conversions: +#define HAS_ABGRTOUVROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_ARGB1555TOARGBROW_SSE2 +#define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBSETROW_X86 +#define HAS_ARGBSHUFFLEROW_SSE2 +#define HAS_ARGBSHUFFLEROW_SSSE3 +#define HAS_ARGBTOARGB1555ROW_SSE2 +#define HAS_ARGBTOARGB4444ROW_SSE2 +#define HAS_ARGBTORAWROW_SSSE3 +#define HAS_ARGBTORGB24ROW_SSSE3 +#define HAS_ARGBTORGB565ROW_SSE2 +#define HAS_ARGBTOUV422ROW_SSSE3 +#define HAS_ARGBTOUV444ROW_SSSE3 +#define HAS_ARGBTOUVJROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_ARGBTOYJROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 +#define HAS_COPYROW_ERMS +#define HAS_COPYROW_SSE2 +#define HAS_I400TOARGBROW_SSE2 +#define HAS_I411TOARGBROW_SSSE3 +#define HAS_I422TOABGRROW_SSSE3 +#define HAS_I422TOARGB1555ROW_SSSE3 +#define HAS_I422TOARGB4444ROW_SSSE3 +#define HAS_I422TOARGBROW_SSSE3 +#define HAS_I422TOBGRAROW_SSSE3 +#define HAS_I422TORAWROW_SSSE3 +#define HAS_I422TORGB24ROW_SSSE3 +#define HAS_I422TORGB565ROW_SSSE3 +#define HAS_I422TORGBAROW_SSSE3 +#define HAS_I422TOUYVYROW_SSE2 +#define HAS_I422TOYUY2ROW_SSE2 +#define HAS_I444TOARGBROW_SSSE3 +#define HAS_J400TOARGBROW_SSE2 +#define HAS_J422TOARGBROW_SSSE3 +#define HAS_MERGEUVROW_SSE2 +#define HAS_MIRRORROW_SSE2 +#define HAS_MIRRORROW_SSSE3 +#define HAS_MIRRORROW_UV_SSSE3 +#define HAS_MIRRORUVROW_SSSE3 +#define HAS_NV12TOARGBROW_SSSE3 +#define HAS_NV12TORGB565ROW_SSSE3 +#define HAS_NV21TOARGBROW_SSSE3 +#define HAS_NV21TORGB565ROW_SSSE3 +#define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 +#define HAS_RGB24TOARGBROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 +#define HAS_RGB565TOARGBROW_SSE2 +#define HAS_RGBATOUVROW_SSSE3 +#define HAS_RGBATOYROW_SSSE3 +#define HAS_SETROW_ERMS +#define HAS_SETROW_X86 +#define HAS_SPLITUVROW_SSE2 +#define HAS_UYVYTOARGBROW_SSSE3 +#define HAS_UYVYTOUV422ROW_SSE2 +#define HAS_UYVYTOUVROW_SSE2 +#define HAS_UYVYTOYROW_SSE2 +#define HAS_YUY2TOARGBROW_SSSE3 +#define HAS_YUY2TOUV422ROW_SSE2 +#define HAS_YUY2TOUVROW_SSE2 +#define HAS_YUY2TOYROW_SSE2 + +// Effects: +#define HAS_ARGBADDROW_SSE2 +#define HAS_ARGBAFFINEROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSSE3 +#define HAS_ARGBBLENDROW_SSSE3 +#define HAS_ARGBCOLORMATRIXROW_SSSE3 +#define HAS_ARGBCOLORTABLEROW_X86 +#define HAS_ARGBCOPYALPHAROW_SSE2 +#define HAS_ARGBCOPYYTOALPHAROW_SSE2 +#define HAS_ARGBGRAYROW_SSSE3 +#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 +#define HAS_ARGBMIRRORROW_SSE2 +#define HAS_ARGBMULTIPLYROW_SSE2 +#define HAS_ARGBPOLYNOMIALROW_SSE2 +#define HAS_ARGBQUANTIZEROW_SSE2 +#define HAS_ARGBSEPIAROW_SSSE3 +#define HAS_ARGBSHADEROW_SSE2 +#define HAS_ARGBSUBTRACTROW_SSE2 +#define HAS_ARGBUNATTENUATEROW_SSE2 +#define HAS_COMPUTECUMULATIVESUMROW_SSE2 +#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +#define HAS_INTERPOLATEROW_SSE2 +#define HAS_INTERPOLATEROW_SSSE3 +#define HAS_RGBCOLORTABLEROW_X86 +#define HAS_SOBELROW_SSE2 +#define HAS_SOBELTOPLANEROW_SSE2 +#define HAS_SOBELXROW_SSE2 +#define HAS_SOBELXYROW_SSE2 +#define HAS_SOBELYROW_SSE2 +#endif + +// The following are available on x64 Visual C and clangcl. 
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ + (!defined(__clang__) || defined(__SSSE3__)) +#define HAS_I422TOARGBROW_SSSE3 +#endif + +// GCC >= 4.7.0 required for AVX2. +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// clang >= 3.4.0 required for AVX2. +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ + +// Visual C 2012 required for AVX2. +#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// The following are available require VS2012. Port to GCC. +#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) +#define HAS_ARGB1555TOARGBROW_AVX2 +#define HAS_ARGB4444TOARGBROW_AVX2 +#define HAS_ARGBTOARGB1555ROW_AVX2 +#define HAS_ARGBTOARGB4444ROW_AVX2 +#define HAS_ARGBTORGB565DITHERROW_AVX2 +#define HAS_ARGBTORGB565DITHERROW_SSE2 +#define HAS_ARGBTORGB565ROW_AVX2 +#define HAS_I411TOARGBROW_AVX2 +#define HAS_I422TOARGB1555ROW_AVX2 +#define HAS_I422TOARGB4444ROW_AVX2 +#define HAS_I422TORGB565ROW_AVX2 +#define HAS_I444TOARGBROW_AVX2 +#define HAS_J400TOARGBROW_AVX2 +#define HAS_NV12TOARGBROW_AVX2 +#define HAS_NV12TORGB565ROW_AVX2 +#define HAS_NV21TOARGBROW_AVX2 +#define HAS_NV21TORGB565ROW_AVX2 +#define HAS_RGB565TOARGBROW_AVX2 +#endif + +// The following are available on all x86 platforms, but +// require VS2012, clang 3.4 or gcc 4.7. +// The code supports NaCL but requires a new compiler and validator. +#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ + defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_ARGBCOPYALPHAROW_AVX2 +#define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#define HAS_ARGBMIRRORROW_AVX2 +#define HAS_ARGBPOLYNOMIALROW_AVX2 +#define HAS_ARGBSHUFFLEROW_AVX2 +#define HAS_ARGBTOUVROW_AVX2 +#define HAS_ARGBTOYJROW_AVX2 +#define HAS_ARGBTOYROW_AVX2 +#define HAS_COPYROW_AVX +#define HAS_I400TOARGBROW_AVX2 +#define HAS_I422TOABGRROW_AVX2 +#define HAS_I422TOARGBROW_AVX2 +#define HAS_I422TOBGRAROW_AVX2 +#define HAS_I422TORAWROW_AVX2 +#define HAS_I422TORGB24ROW_AVX2 +#define HAS_I422TORGBAROW_AVX2 +#define HAS_INTERPOLATEROW_AVX2 +#define HAS_J422TOARGBROW_AVX2 +#define HAS_MERGEUVROW_AVX2 +#define HAS_MIRRORROW_AVX2 +#define HAS_SPLITUVROW_AVX2 +#define HAS_UYVYTOARGBROW_AVX2 +#define HAS_UYVYTOUV422ROW_AVX2 +#define HAS_UYVYTOUVROW_AVX2 +#define HAS_UYVYTOYROW_AVX2 +#define HAS_YUY2TOARGBROW_AVX2 +#define HAS_YUY2TOUV422ROW_AVX2 +#define HAS_YUY2TOUVROW_AVX2 +#define HAS_YUY2TOYROW_AVX2 + +// Effects: +#define HAS_ARGBADDROW_AVX2 +#define HAS_ARGBATTENUATEROW_AVX2 +#define HAS_ARGBMULTIPLYROW_AVX2 +#define HAS_ARGBSUBTRACTROW_AVX2 +#define HAS_ARGBUNATTENUATEROW_AVX2 +#endif + +// The following are disabled when SSSE3 is available: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_SSSE3_ONLY) +#define HAS_ARGBATTENUATEROW_SSE2 +#define HAS_ARGBBLENDROW_SSE2 +#define HAS_MIRRORROW_SSE2 +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_ABGRTOUVROW_NEON +#define HAS_ABGRTOYROW_NEON +#define HAS_ARGB1555TOARGBROW_NEON 
+#define HAS_ARGB1555TOUVROW_NEON +#define HAS_ARGB1555TOYROW_NEON +#define HAS_ARGB4444TOARGBROW_NEON +#define HAS_ARGB4444TOUVROW_NEON +#define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBTOARGB1555ROW_NEON +#define HAS_ARGBTOARGB4444ROW_NEON +#define HAS_ARGBTORAWROW_NEON +#define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORGB565ROW_NEON +#define HAS_ARGBTOUV411ROW_NEON +#define HAS_ARGBTOUV422ROW_NEON +#define HAS_ARGBTOUV444ROW_NEON +#define HAS_ARGBTOUVJROW_NEON +#define HAS_ARGBTOUVROW_NEON +#define HAS_ARGBTOYJROW_NEON +#define HAS_ARGBTOYROW_NEON +#define HAS_BGRATOUVROW_NEON +#define HAS_BGRATOYROW_NEON +#define HAS_COPYROW_NEON +#define HAS_J400TOARGBROW_NEON +#define HAS_I411TOARGBROW_NEON +#define HAS_I422TOABGRROW_NEON +#define HAS_I422TOARGB1555ROW_NEON +#define HAS_I422TOARGB4444ROW_NEON +#define HAS_I422TOARGBROW_NEON +#define HAS_I422TOBGRAROW_NEON +#define HAS_I422TORAWROW_NEON +#define HAS_I422TORGB24ROW_NEON +#define HAS_I422TORGB565ROW_NEON +#define HAS_I422TORGBAROW_NEON +#define HAS_I422TOUYVYROW_NEON +#define HAS_I422TOYUY2ROW_NEON +#define HAS_I444TOARGBROW_NEON +#define HAS_MERGEUVROW_NEON +#define HAS_MIRRORROW_NEON +#define HAS_MIRRORUVROW_NEON +#define HAS_NV12TOARGBROW_NEON +#define HAS_NV12TORGB565ROW_NEON +#define HAS_NV21TOARGBROW_NEON +#define HAS_NV21TORGB565ROW_NEON +#define HAS_RAWTOARGBROW_NEON +#define HAS_RAWTOUVROW_NEON +#define HAS_RAWTOYROW_NEON +#define HAS_RGB24TOARGBROW_NEON +#define HAS_RGB24TOUVROW_NEON +#define HAS_RGB24TOYROW_NEON +#define HAS_RGB565TOARGBROW_NEON +#define HAS_RGB565TOUVROW_NEON +#define HAS_RGB565TOYROW_NEON +#define HAS_RGBATOUVROW_NEON +#define HAS_RGBATOYROW_NEON +#define HAS_SETROW_NEON +#define HAS_ARGBSETROW_NEON +#define HAS_SPLITUVROW_NEON +#define HAS_UYVYTOARGBROW_NEON +#define HAS_UYVYTOUV422ROW_NEON +#define HAS_UYVYTOUVROW_NEON +#define HAS_UYVYTOYROW_NEON +#define HAS_I400TOARGBROW_NEON +#define HAS_YUY2TOARGBROW_NEON +#define HAS_YUY2TOUV422ROW_NEON +#define HAS_YUY2TOUVROW_NEON +#define HAS_YUY2TOYROW_NEON +#define HAS_ARGBTORGB565DITHERROW_NEON + +// Effects: +#define HAS_ARGBADDROW_NEON +#define HAS_ARGBATTENUATEROW_NEON +#define HAS_ARGBBLENDROW_NEON +#define HAS_ARGBGRAYROW_NEON +#define HAS_ARGBMIRRORROW_NEON +#define HAS_ARGBMULTIPLYROW_NEON +#define HAS_ARGBQUANTIZEROW_NEON +#define HAS_ARGBSEPIAROW_NEON +#define HAS_ARGBSHADEROW_NEON +#define HAS_ARGBSUBTRACTROW_NEON +#define HAS_INTERPOLATEROW_NEON +#define HAS_SOBELROW_NEON +#define HAS_SOBELTOPLANEROW_NEON +#define HAS_SOBELXROW_NEON +#define HAS_SOBELXYROW_NEON +#define HAS_SOBELYROW_NEON +#define HAS_ARGBCOLORMATRIXROW_NEON +#define HAS_ARGBSHUFFLEROW_NEON +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) +#define HAS_COPYROW_MIPS +#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_I422TOABGRROW_MIPS_DSPR2 +#define HAS_I422TOARGBROW_MIPS_DSPR2 +#define HAS_I422TOBGRAROW_MIPS_DSPR2 +#define HAS_INTERPOLATEROW_MIPS_DSPR2 +#define HAS_MIRRORROW_MIPS_DSPR2 +#define HAS_MIRRORUVROW_MIPS_DSPR2 +#define HAS_SPLITUVROW_MIPS_DSPR2 +#endif +#endif + +#if defined(_MSC_VER) && !defined(__CLR_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +#define SIMD_ALIGNED32(var) __declspec(align(64)) var +typedef __declspec(align(16)) int16 vec16[8]; +typedef __declspec(align(16)) int32 vec32[4]; +typedef __declspec(align(16)) int8 vec8[16]; +typedef __declspec(align(16)) uint16 uvec16[8]; +typedef __declspec(align(16)) 
uint32 uvec32[4]; +typedef __declspec(align(16)) uint8 uvec8[16]; +typedef __declspec(align(32)) int16 lvec16[16]; +typedef __declspec(align(32)) int32 lvec32[8]; +typedef __declspec(align(32)) int8 lvec8[32]; +typedef __declspec(align(32)) uint16 ulvec16[16]; +typedef __declspec(align(32)) uint32 ulvec32[8]; +typedef __declspec(align(32)) uint8 ulvec8[32]; +#elif defined(__GNUC__) +// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#define SIMD_ALIGNED32(var) var __attribute__((aligned(64))) +typedef int16 __attribute__((vector_size(16))) vec16; +typedef int32 __attribute__((vector_size(16))) vec32; +typedef int8 __attribute__((vector_size(16))) vec8; +typedef uint16 __attribute__((vector_size(16))) uvec16; +typedef uint32 __attribute__((vector_size(16))) uvec32; +typedef uint8 __attribute__((vector_size(16))) uvec8; +typedef int16 __attribute__((vector_size(32))) lvec16; +typedef int32 __attribute__((vector_size(32))) lvec32; +typedef int8 __attribute__((vector_size(32))) lvec8; +typedef uint16 __attribute__((vector_size(32))) ulvec16; +typedef uint32 __attribute__((vector_size(32))) ulvec32; +typedef uint8 __attribute__((vector_size(32))) ulvec8; +#else +#define SIMD_ALIGNED(var) var +#define SIMD_ALIGNED32(var) var +typedef int16 vec16[8]; +typedef int32 vec32[4]; +typedef int8 vec8[16]; +typedef uint16 uvec16[8]; +typedef uint32 uvec32[4]; +typedef uint8 uvec8[16]; +typedef int16 lvec16[16]; +typedef int32 lvec32[8]; +typedef int8 lvec8[32]; +typedef uint16 ulvec16[16]; +typedef uint32 ulvec32[8]; +typedef uint8 ulvec8[32]; +#endif + +#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) +#define OMITFP +#else +#define OMITFP __attribute__((optimize("omit-frame-pointer"))) +#endif + +// NaCL macros for GCC x86 and x64. +#if defined(__native_client__) +#define LABELALIGN ".p2align 5\n" +#else +#define LABELALIGN +#endif +#if defined(__native_client__) && defined(__x86_64__) +// r14 is used for MEMOP macros. 
+#define NACL_R14 "r14", +#define BUNDLELOCK ".bundle_lock\n" +#define BUNDLEUNLOCK ".bundle_unlock\n" +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%q" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%q" #base ",%q" #index "," #scale ")" +#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15" +#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg "\n" \ + BUNDLEUNLOCK +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " %%" #reg ",(%%r15,%%r14)\n" \ + BUNDLEUNLOCK +#define MEMOPARG(opcode, offset, base, index, scale, arg) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%" #arg "\n" \ + BUNDLEUNLOCK +#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \ + BUNDLEUNLOCK +#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ + BUNDLELOCK \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \ + BUNDLEUNLOCK +#else // defined(__native_client__) && defined(__x86_64__) +#define NACL_R14 +#define BUNDLEALIGN +#define MEMACCESS(base) "(%" #base ")" +#define MEMACCESS2(offset, base) #offset "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%" #base ",%" #index "," #scale ")" +#define MEMMOVESTRING(s, d) +#define MEMSTORESTRING(reg, d) +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#define MEMOPARG(opcode, offset, base, index, scale, arg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" +#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \ + #reg2 "\n" +#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ + #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#endif // defined(__native_client__) && defined(__x86_64__) + +#if defined(__arm__) || defined(__aarch64__) +#undef MEMACCESS +#if defined(__native_client__) +#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" +#else +#define MEMACCESS(base) +#endif +#endif + +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* 
src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); +void I422ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width); +void I422ToARGB1555Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width); +void I422ToARGB4444Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width); +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width); +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_vu, + uint8* dst_rgb565, + int width); +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width); + +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix); +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix); +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix); +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix); +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix); +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix); +void 
ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix); +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix); +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix); +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix); +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix); +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix); +void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix); +void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix); +void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix); +void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix); +void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix); +void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix); +void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix); +void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix); +void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix); +void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix); +void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix); +void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix); + +void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void 
ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); +void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix); +void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix); +void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix); +void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix); +void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix); +void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444, + int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width); +void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width); +void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int width); +void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width); +void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width); +void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width); + +void ARGBToUV444Row_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); + +void ARGBToUV422Row_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); + +void ARGBToUV444Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); 
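[Editorial reading aid, not part of the patch: the declaration lists above and below follow one libyuv convention. Each row operation has a portable _C reference, exact-width SIMD kernels (SSE2/SSSE3/AVX2/NEON/MIPS_DSPR2, gated by the HAS_* defines earlier in this header), and _Any_ wrappers that accept arbitrary widths by running the kernel on the bulk of the row and handling the ragged tail separately. A minimal C sketch of how a caller typically selects among them follows; MyRow_*, HAS_MYROW_SSSE3 and PickMyRow are hypothetical stand-ins rather than libyuv symbols, the step of 16 is illustrative, and libyuv proper additionally gates the choice on runtime CPU detection (TestCpuFlag).

#include <stdint.h>

#ifndef IS_ALIGNED
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
#endif

typedef void (*RowFn)(const uint8_t* src, uint8_t* dst, int width);

/* Portable reference row: always correct, never the fastest. */
static void MyRow_C(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) dst[x] = src[x];
}

#if defined(HAS_MYROW_SSSE3)
/* Exact-width kernel: assumes width is a multiple of its step (16 here). */
void MyRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
/* Any-width wrapper: kernel on the bulk, scalar code on the tail. */
void MyRow_Any_SSSE3(const uint8_t* src, uint8_t* dst, int width);
#endif

static RowFn PickMyRow(int width) {
  RowFn row = MyRow_C;
#if defined(HAS_MYROW_SSSE3)
  row = MyRow_Any_SSSE3;   /* tolerates any width */
  if (IS_ALIGNED(width, 16)) {
    row = MyRow_SSSE3;     /* fast path for multiple-of-16 widths */
  }
#endif
  return row;
}

End of editorial aid; the patch resumes below.]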
+void ARGBToUV422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV411Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJ422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width); + +void MirrorRow_AVX2(const uint8* src, uint8* dst, int width); +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width); +void MirrorRow_NEON(const uint8* src, uint8* dst, int width); +void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width); +void MirrorRow_C(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); +void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); + +void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); +void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width); + +void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); +void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); + +void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); +void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int pix); + +void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); +void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width); + +void CopyRow_SSE2(const uint8* src, uint8* dst, int count); +void CopyRow_AVX(const uint8* src, uint8* dst, int count); +void CopyRow_ERMS(const uint8* src, uint8* dst, int count); +void CopyRow_NEON(const uint8* src, uint8* dst, int count); +void CopyRow_MIPS(const uint8* src, uint8* dst, int count); +void CopyRow_C(const uint8* src, uint8* dst, int count); +void CopyRow_Any_SSE2(const 
uint8* src, uint8* dst, int count); +void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count); +void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count); + +void CopyRow_16_C(const uint16* src, uint16* dst, int count); + +void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); + +void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); + +void SetRow_C(uint8* dst, uint8 v8, int count); +void SetRow_X86(uint8* dst, uint8 v8, int count); +void SetRow_ERMS(uint8* dst, uint8 v8, int count); +void SetRow_NEON(uint8* dst, uint8 v8, int count); +void SetRow_Any_X86(uint8* dst, uint8 v8, int count); +void SetRow_Any_NEON(uint8* dst, uint8 v8, int count); + +void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count); + +// ARGBShufflers for BGRAToARGB etc. +void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); + +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix); +void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, + int pix); + +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix); +void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_C(const uint8* 
src_argb, uint8* dst_argb, int pix); +void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); +void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); + +void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, + int pix); +void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix); +void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb, + int pix); +void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb, + int pix); + +void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); +void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix); +void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb, + int pix); +void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix); + +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); +void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); +void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); + +void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width); + +void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); + +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix); +void 
J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix); + +void I444ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_C(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_C(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_C(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_C(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_C(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_C(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void J422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB24Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRAWRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); +void I422ToARGB4444Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width); +void I422ToARGB1555Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width); +void I422ToRGB565Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width); +void I422ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGBARow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I444ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I444ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* 
dst_argb, + int width); +void NV12ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_AVX2(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void J422ToARGBRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void J422ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB4444Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB4444Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB1555Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB1555Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB24Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRGB24Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); +void I422ToRAWRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); +void I422ToRAWRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); +void I422ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGBARow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I444ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I444ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_Any_SSSE3(const uint8* 
src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void NV12ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_Any_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_Any_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void J422ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void J422ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width); +void I422ToABGRRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I422ToRGBARow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB4444Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToARGB1555Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB565Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB565Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); +void I422ToRGB24Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB24Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRAWRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRAWRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); + +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); +void 
I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); + +// ARGB preattenuated alpha blend. +void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +// ARGB multiply images. Same API as Blend, but these require +// pointer and width alignment for SSE2. +void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +// ARGB add images. +void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +// ARGB subtract images. Same API as Blend, but these require +// pointer and width alignment for SSE2. 
+void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width); + +void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); +void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); + +void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); + +void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width); + +void I444ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I411ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGBARow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB24Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRAWRow_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB4444Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGB1555Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToRGB565Row_Any_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void 
NV12ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToARGBRow_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV12ToRGB565Row_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_Any_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width); +void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width); +void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); + +void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_C(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); +void 
UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
+                   uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+
+void I422ToYUY2Row_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_yuy2, int width);
+void I422ToUYVYRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_uyvy, int width);
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width);
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_uyvy, int width);
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width);
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_uyvy, int width);
+
+// Effects related row functions.
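+// (Attenuation premultiplies each color channel by alpha.  Roughly, as a
+// sketch of the C behavior rather than the exact rounding of each SIMD
+// version:
+//   dst_b = src_b * src_a / 255;  // likewise for g and r; alpha unchanged.)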
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, + int width); + +// Inverse table for unattenuate, shared by C and SSE2. +extern const uint32 fixed_invtbl8[256]; +void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + int width); + +void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); + +void ARGBSepiaRow_C(uint8* dst_argb, int width); +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); +void ARGBSepiaRow_NEON(uint8* dst_argb, int width); + +void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); + +void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); + +void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); + +void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width); + +void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value); + +// Used for blur. 
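+// A box blur can be built from cumulative (integral) sums: once each row of
+// running sums exists, any window average is a handful of adds.  In sketch
+// form, for a window spanning rows top..bot and columns left..right:
+//   sum = bot_right - bot_left - top_right + top_left;
+//   avg = sum / area;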
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count); +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width); + +void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count); +void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width); + +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width); + +// Used for I420Scale, ARGBScale, and ARGBInterpolate. +void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); + +void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); + +// Sobel images. 
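+// The Sobel row functions below are built on the standard 3x3 Sobel
+// kernels (shown for illustration):
+//   Gx: -1 0 +1   Gy: -1 -2 -1
+//       -2 0 +2        0  0  0
+//       -1 0 +1       +1 +2 +1
+// SobelXRow/SobelYRow emit per-pixel absolute gradient magnitudes, and
+// SobelRow adds the two, clamped to the 0..255 byte range.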
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, + uint8* dst_sobelx, int width); +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width); +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width); +void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); + +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, uint32 lumacoeff); +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/scale.h b/libs/libaom/src/third_party/libyuv/include/libyuv/scale.h new file mode 100644 index 000000000..3974aba34 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/scale.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported filtering.
+typedef enum FilterMode {
+  kFilterNone = 0,      // Point sample; Fastest.
+  kFilterLinear = 1,    // Filter horizontally only.
+  kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
+  kFilterBox = 3        // Highest quality.
+} FilterModeEnum;
+
+// Scale a YUV plane.
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+                int src_width, int src_height,
+                uint8* dst, int dst_stride,
+                int dst_width, int dst_height,
+                enum FilterMode filtering);
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+                   int src_width, int src_height,
+                   uint16* dst, int dst_stride,
+                   int dst_width, int dst_height,
+                   enum FilterMode filtering);
+
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I420Scale(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              int src_width, int src_height,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int dst_width, int dst_height,
+              enum FilterMode filtering);
+
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+                 const uint16* src_u, int src_stride_u,
+                 const uint16* src_v, int src_stride_v,
+                 int src_width, int src_height,
+                 uint16* dst_y, int dst_stride_y,
+                 uint16* dst_u, int dst_stride_u,
+                 uint16* dst_v, int dst_stride_v,
+                 int dst_width, int dst_height,
+                 enum FilterMode filtering);
+
+#ifdef __cplusplus
+// Legacy API. Deprecated.
+LIBYUV_API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+          int src_stride_y, int src_stride_u, int src_stride_v,
+          int src_width, int src_height,
+          uint8* dst_y, uint8* dst_u, uint8* dst_v,
+          int dst_stride_y, int dst_stride_u, int dst_stride_v,
+          int dst_width, int dst_height,
+          LIBYUV_BOOL interpolate);
+
+// Legacy API. Deprecated.
+LIBYUV_API
+int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
+                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
+                LIBYUV_BOOL interpolate);
+
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif  // __cplusplus
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_H_  NOLINT
diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/scale_argb.h b/libs/libaom/src/third_party/libyuv/include/libyuv/scale_argb.h
new file mode 100644
index 000000000..22563837d
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/include/libyuv/scale_argb.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media.
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_ARGB_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" // For FilterMode + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +int ARGBScale(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + enum FilterMode filtering); + +// Clipped scale takes destination rectangle coordinates for clip values. +LIBYUV_API +int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering); + +// TODO(fbarchard): Implement this. +// Scale with YUV conversion to ARGB and clipping. +LIBYUV_API +int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint32 src_fourcc, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + uint32 dst_fourcc, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/scale_row.h b/libs/libaom/src/third_party/libyuv/include/libyuv/scale_row.h new file mode 100644 index 000000000..a46b5ce69 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/include/libyuv/scale_row.h @@ -0,0 +1,479 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_ROW_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) +#define LIBYUV_DISABLE_X86 +#endif + +// Visual C 2012 required for AVX2. 
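+// (_MSC_VER 1700 below corresponds to Visual Studio 2012, the first MSVC
+// release to ship the AVX2 intrinsics these code paths rely on.)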
+#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_FIXEDDIV1_X86 +#define HAS_FIXEDDIV_X86 +#define HAS_SCALEARGBCOLS_SSE2 +#define HAS_SCALEARGBCOLSUP2_SSE2 +#define HAS_SCALEARGBFILTERCOLS_SSSE3 +#define HAS_SCALEARGBROWDOWN2_SSE2 +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +#define HAS_SCALECOLSUP2_SSE2 +#define HAS_SCALEFILTERCOLS_SSSE3 +#define HAS_SCALEROWDOWN2_SSE2 +#define HAS_SCALEROWDOWN34_SSSE3 +#define HAS_SCALEROWDOWN38_SSSE3 +#define HAS_SCALEROWDOWN4_SSE2 +#endif + +// The following are available on VS2012: +#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) +#define HAS_SCALEADDROW_AVX2 +#define HAS_SCALEROWDOWN2_AVX2 +#define HAS_SCALEROWDOWN4_AVX2 +#endif + +// The following are available on Visual C: +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__) +#define HAS_SCALEADDROW_SSE2 +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#define HAS_SCALEARGBCOLS_NEON +#define HAS_SCALEARGBROWDOWN2_NEON +#define HAS_SCALEARGBROWDOWNEVEN_NEON +#define HAS_SCALEFILTERCOLS_NEON +#define HAS_SCALEROWDOWN2_NEON +#define HAS_SCALEROWDOWN34_NEON +#define HAS_SCALEROWDOWN38_NEON +#define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEARGBFILTERCOLS_NEON +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_SCALEROWDOWN2_MIPS_DSPR2 +#define HAS_SCALEROWDOWN4_MIPS_DSPR2 +#define HAS_SCALEROWDOWN34_MIPS_DSPR2 +#define HAS_SCALEROWDOWN38_MIPS_DSPR2 +#endif + +// Scale ARGB vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int y, int dy, + int bpp, enum FilterMode filtering); + +void ScalePlaneVertical_16(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_argb, uint16* dst_argb, + int x, int y, int dy, + int wpp, enum FilterMode filtering); + +// Simplify the filtering based on scale factors. +enum FilterMode ScaleFilterReduce(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering); + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div); +int FixedDiv_X86(int num, int div); +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_C(int num, int div); +int FixedDiv1_X86(int num, int div); +#ifdef HAS_FIXEDDIV_X86 +#define FixedDiv FixedDiv_X86 +#define FixedDiv1 FixedDiv1_X86 +#else +#define FixedDiv FixedDiv_C +#define FixedDiv1 FixedDiv1_C +#endif + +// Compute slope values for stepping. 
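+// Positions and steps below are 16.16 fixed point.  A sketch of how a
+// point-sampling column loop consumes them (illustrative, not the exact
+// library code):
+//   int x = 0;                                // or seeded by ScaleSlope().
+//   int dx = FixedDiv(src_width, dst_width);  // source step per dst pixel.
+//   for (j = 0; j < dst_width; ++j) {
+//     dst_ptr[j] = src_ptr[x >> 16];
+//     x += dx;
+//   }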
+void ScaleSlope(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering, + int* x, int* y, int* dx, int* dy); + +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width); +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width); +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int, int); +void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int, int); +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx); +void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx); +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width); +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); +void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width); +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void 
ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int, int); +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); + +// Specialized scalers for x86. +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + 
uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); + +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); + + +// ARGB Column functions +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); + +// ARGB Row functions +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int 
src_stepx, uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                   int src_stepx,
+                                   uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+                                      ptrdiff_t src_stride,
+                                      int src_stepx,
+                                      uint8* dst_argb, int dst_width);
+
+// ScaleRowDown2Box also used by planar functions
+// NEON downscalers with interpolation.
+
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width);
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+
+// Downscale from 4 to 3 pixels. Uses the NEON multilane read/write to
+// load every 4th pixel into a different register (4 registers in all).
+// Point samples 32 pixels to 24 pixels.
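+// For illustration, the equivalent C pattern (assuming behavior matching
+// ScaleRowDown34_C) emits 3 pixels per group of 4:
+//   dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[3];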
+void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32 -> 12 +void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); + +void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); + +void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); + + +void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT diff 
--git a/libs/libaom/src/third_party/libyuv/include/libyuv/version.h b/libs/libaom/src/third_party/libyuv/include/libyuv/version.h
new file mode 100644
index 000000000..287b98ebf
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/include/libyuv/version.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 1456
+
+#endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/libs/libaom/src/third_party/libyuv/include/libyuv/video_common.h b/libs/libaom/src/third_party/libyuv/include/libyuv/video_common.h
new file mode 100644
index 000000000..7b0a19cc9
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/include/libyuv/video_common.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_  // NOLINT
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code.
+// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#ifdef __cplusplus
+#define FOURCC(a, b, c, d) ( \
+    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#else
+#define FOURCC(a, b, c, d) ( \
+    ((uint32)(a)) | ((uint32)(b) << 8) |  /* NOLINT */ \
+    ((uint32)(c) << 16) | ((uint32)(d) << 24))  /* NOLINT */
+#endif
+
+// Some pages discussing FourCC codes:
+// http://www.fourcc.org/yuv.php
+// http://v4l2spec.bytesex.org/spec/book1.htm
+// http://developer.apple.com/quicktime/icefloe/dispatch020.html
+// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxiliary formats call primary converters.
+enum FourCC {
+  // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
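+  // For example, FOURCC('I', '4', '2', '0') evaluates to 0x30323449: 'I'
+  // lands in the low byte, so the four bytes read "I420" in memory on a
+  // little-endian machine.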
+  FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+  FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+  FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+  FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+  FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+  FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+  // 2 Secondary YUV formats: row biplanar.
+  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),  // deprecated.
+
+  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+  FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+  FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+  FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+  FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+  FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+  FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
+  FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
+  FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
+
+  // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
+  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+  // 1 Primary Compressed YUV format.
+  FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+  // 6 Auxiliary YUV variations: 3 with U and V planes swapped, 1 alias,
+  // 2 full range.
+  FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+  FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+  FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+  FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'),  // Linux version of I420.
+  FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+  FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+
+  // 17 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
+  FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'),  // Alias for I420.
+  FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'),  // Alias for I422.
+  FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'),  // Alias for I444.
+  FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'),  // Alias for YUY2.
+  FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'),  // Alias for YUY2 on Mac.
+  FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'),  // Alias for UYVY.
+  FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'),  // Alias for UYVY on Mac.
+  FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'),  // Alias for MJPG.
+  FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'),  // Alias for MJPG on Mac.
+  FOURCC_BA81 = FOURCC('B', 'A', '8', '1'),  // Alias for BGGR.
+  FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'),  // Alias for RAW.
+  FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'),  // Alias for 24BG.
+  FOURCC_CM32 = FOURCC(0, 0, 0, 32),  // Alias for BGRA kCMPixelFormat_32ARGB
+  FOURCC_CM24 = FOURCC(0, 0, 0, 24),  // Alias for RAW kCMPixelFormat_24RGB
+  FOURCC_L555 = FOURCC('L', '5', '5', '5'),  // Alias for RGBO.
+  FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
+  FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
+
+  // 1 Auxiliary compressed YUV format set aside for capturer.
+  FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+  // Match any fourcc.
+  FOURCC_ANY = -1,
+};
+
+enum FourCCBpp {
+  // Canonical fourcc codes used in our code.
+  FOURCC_BPP_I420 = 12,
+  FOURCC_BPP_I422 = 16,
+  FOURCC_BPP_I444 = 24,
+  FOURCC_BPP_I411 = 12,
+  FOURCC_BPP_I400 = 8,
+  FOURCC_BPP_NV21 = 12,
+  FOURCC_BPP_NV12 = 12,
+  FOURCC_BPP_YUY2 = 16,
+  FOURCC_BPP_UYVY = 16,
+  FOURCC_BPP_M420 = 12,
+  FOURCC_BPP_Q420 = 12,
+  FOURCC_BPP_ARGB = 32,
+  FOURCC_BPP_BGRA = 32,
+  FOURCC_BPP_ABGR = 32,
+  FOURCC_BPP_RGBA = 32,
+  FOURCC_BPP_24BG = 24,
+  FOURCC_BPP_RAW = 24,
+  FOURCC_BPP_RGBP = 16,
+  FOURCC_BPP_RGBO = 16,
+  FOURCC_BPP_R444 = 16,
+  FOURCC_BPP_RGGB = 8,
+  FOURCC_BPP_BGGR = 8,
+  FOURCC_BPP_GRBG = 8,
+  FOURCC_BPP_GBRG = 8,
+  FOURCC_BPP_YV12 = 12,
+  FOURCC_BPP_YV16 = 16,
+  FOURCC_BPP_YV24 = 24,
+  FOURCC_BPP_YU12 = 12,
+  FOURCC_BPP_J420 = 12,
+  FOURCC_BPP_J400 = 8,
+  FOURCC_BPP_MJPG = 0,  // 0 means unknown.
+  FOURCC_BPP_H264 = 0,
+  FOURCC_BPP_IYUV = 12,
+  FOURCC_BPP_YU16 = 16,
+  FOURCC_BPP_YU24 = 24,
+  FOURCC_BPP_YUYV = 16,
+  FOURCC_BPP_YUVS = 16,
+  FOURCC_BPP_HDYC = 16,
+  FOURCC_BPP_2VUY = 16,
+  FOURCC_BPP_JPEG = 1,
+  FOURCC_BPP_DMB1 = 1,
+  FOURCC_BPP_BA81 = 8,
+  FOURCC_BPP_RGB3 = 24,
+  FOURCC_BPP_BGR3 = 24,
+  FOURCC_BPP_CM32 = 32,
+  FOURCC_BPP_CM24 = 24,
+
+  // Match any fourcc.
+  FOURCC_BPP_ANY = 0,  // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_  NOLINT
diff --git a/libs/libaom/src/third_party/libyuv/source/compare.cc b/libs/libaom/src/third_party/libyuv/source/compare.cc
new file mode 100644
index 000000000..46aa8473d
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/compare.cc
@@ -0,0 +1,373 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include <float.h>  // For DBL_MAX.
+#include <math.h>   // For log10().
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+
+// This module is for Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
+#define HAS_HASHDJB2_SSE41
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+
+#ifdef VISUALC_HAS_AVX2
+#define HAS_HASHDJB2_AVX2
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+#endif
+
+#endif  // HAS_HASHDJB2_SSE41
+
+// hash seed of 5381 recommended.
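+// The djb2 recurrence is hash = hash * 33 + byte.  As a reference sketch of
+// what the accelerated versions compute (HashDjb2_C in compare_common.cc
+// uses the equivalent hash += (hash << 5) + src[i] form):
+//   uint32 h = seed;  // 5381 for the classic djb2 hash.
+//   for (i = 0; i < count; ++i) h = h * 33 + src[i];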
+LIBYUV_API +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { + const int kBlockSize = 1 << 15; // 32768; + int remainder; + uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; +#if defined(HAS_HASHDJB2_SSE41) + if (TestCpuFlag(kCpuHasSSE41)) { + HashDjb2_SSE = HashDjb2_SSE41; + } +#endif +#if defined(HAS_HASHDJB2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HashDjb2_SSE = HashDjb2_AVX2; + } +#endif + + while (count >= (uint64)(kBlockSize)) { + seed = HashDjb2_SSE(src, kBlockSize, seed); + src += kBlockSize; + count -= kBlockSize; + } + remainder = (int)(count) & ~15; + if (remainder) { + seed = HashDjb2_SSE(src, remainder, seed); + src += remainder; + count -= remainder; + } + remainder = (int)(count) & 15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + } + return seed; +} + +static uint32 ARGBDetectRow_C(const uint8* argb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. + return FOURCC_BGRA; + } + if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + return FOURCC_ARGB; + } + if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255. + return FOURCC_BGRA; + } + if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255. + return FOURCC_ARGB; + } + argb += 8; + } + if (width & 1) { + if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. + return FOURCC_BGRA; + } + if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + return FOURCC_ARGB; + } + } + return 0; +} + +// Scan an opaque argb image and return fourcc based on alpha offset. +// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. +LIBYUV_API +uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { + uint32 fourcc = 0; + int h; + + // Coalesce rows. + if (stride_argb == width * 4) { + width *= height; + height = 1; + stride_argb = 0; + } + for (h = 0; h < height && fourcc == 0; ++h) { + fourcc = ARGBDetectRow_C(argb, width); + argb += stride_argb; + } + return fourcc; +} + +uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#define HAS_SUMSQUAREERROR_NEON +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); +#endif +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_SUMSQUAREERROR_SSE2 +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); +#endif + +#ifdef VISUALC_HAS_AVX2 +#define HAS_SUMSQUAREERROR_AVX2 +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); +#endif + +// TODO(fbarchard): Refactor into row function. +LIBYUV_API +uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, + int count) { + // SumSquareError returns values 0 to 65535 for each squared difference. + // Up to 65536 of those can be summed and remain within a uint32. + // After each block of 65536 pixels, accumulate into a uint64. 
+  const int kBlockSize = 65536;
+  int remainder = count & (kBlockSize - 1) & ~31;
+  uint64 sse = 0;
+  int i;
+  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+      SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SumSquareError = SumSquareError_NEON;
+  }
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    // Note only used for multiples of 16 so count is not checked.
+    SumSquareError = SumSquareError_SSE2;
+  }
+#endif
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    // Note only used for multiples of 32 so count is not checked.
+    SumSquareError = SumSquareError_AVX2;
+  }
+#endif
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+  }
+  src_a += count & ~(kBlockSize - 1);
+  src_b += count & ~(kBlockSize - 1);
+  if (remainder) {
+    sse += SumSquareError(src_a, src_b, remainder);
+    src_a += remainder;
+    src_b += remainder;
+  }
+  remainder = count & 31;
+  if (remainder) {
+    sse += SumSquareError_C(src_a, src_b, remainder);
+  }
+  return sse;
+}
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+                                  const uint8* src_b, int stride_b,
+                                  int width, int height) {
+  uint64 sse = 0;
+  int h;
+  // Coalesce rows.
+  if (stride_a == width &&
+      stride_b == width) {
+    width *= height;
+    height = 1;
+    stride_a = stride_b = 0;
+  }
+  for (h = 0; h < height; ++h) {
+    sse += ComputeSumSquareError(src_a, src_b, width);
+    src_a += stride_a;
+    src_b += stride_b;
+  }
+  return sse;
+}
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+  double psnr;
+  if (sse > 0) {
+    // PSNR = 10 * log10(255^2 / mse), where mse = sse / count.
+    double mse = (double)(sse) / (double)(count);
+    psnr = 10.0 * log10(255.0 * 255.0 / mse);
+  } else {
+    psnr = kMaxPsnr;  // Limit to prevent divide by 0
+  }
+
+  if (psnr > kMaxPsnr)
+    psnr = kMaxPsnr;
+
+  return psnr;
+}
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height) {
+  const uint64 samples = width * height;
+  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
+                                                src_b, stride_b,
+                                                width, height);
+  return SumSquareErrorToPsnr(sse, samples);
+}
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height) {
+  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
+                                                  src_y_b, stride_y_b,
+                                                  width, height);
+  const int width_uv = (width + 1) >> 1;
+  const int height_uv = (height + 1) >> 1;
+  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
+                                                  src_u_b, stride_u_b,
+                                                  width_uv, height_uv);
+  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
+                                                  src_v_b, stride_v_b,
+                                                  width_uv, height_uv);
+  const uint64 samples = width * height + 2 * (width_uv * height_uv);
+  const uint64 sse = sse_y + sse_u + sse_v;
+  return SumSquareErrorToPsnr(sse, samples);
+}
+
+static const int64 cc1 = 26634;   // (64^2*(.01*255)^2)
+static const int64 cc2 = 239708;  // (64^2*(.03*255)^2)
+
+static double Ssim8x8_C(const uint8* src_a, int stride_a,
+                        const uint8* src_b, int stride_b) {
+  int64 sum_a = 0;
+  int64 sum_b = 0;
+  int64 sum_sq_a = 0;
+  int64 sum_sq_b = 0;
+  int64 sum_axb = 0;
+
+  int i;
+  for (i = 0; i <
8; ++i) {
+    int j;
+    for (j = 0; j < 8; ++j) {
+      sum_a += src_a[j];
+      sum_b += src_b[j];
+      sum_sq_a += src_a[j] * src_a[j];
+      sum_sq_b += src_b[j] * src_b[j];
+      sum_axb += src_a[j] * src_b[j];
+    }
+
+    src_a += stride_a;
+    src_b += stride_b;
+  }
+
+  {
+    const int64 count = 64;
+    // scale the constants by number of pixels
+    const int64 c1 = (cc1 * count * count) >> 12;
+    const int64 c2 = (cc2 * count * count) >> 12;
+
+    const int64 sum_a_x_sum_b = sum_a * sum_b;
+
+    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
+                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+
+    const int64 sum_a_sq = sum_a*sum_a;
+    const int64 sum_b_sq = sum_b*sum_b;
+
+    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
+                         (count * sum_sq_a - sum_a_sq +
+                          count * sum_sq_b - sum_b_sq + c2);
+
+    if (ssim_d == 0) {
+      return DBL_MAX;
+    }
+    return ssim_n * 1.0 / ssim_d;
+  }
+}
+
+// We are using an 8x8 moving window whose starting locations step on a 4x4
+// pixel grid. This arrangement allows the windows to overlap block
+// boundaries and so penalize blocking artifacts.
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+                     const uint8* src_b, int stride_b,
+                     int width, int height) {
+  int samples = 0;
+  double ssim_total = 0;
+  double (*Ssim8x8)(const uint8* src_a, int stride_a,
+                    const uint8* src_b, int stride_b) = Ssim8x8_C;
+
+  // Sample points start at each 4x4 location.
+  int i;
+  for (i = 0; i < height - 8; i += 4) {
+    int j;
+    for (j = 0; j < width - 8; j += 4) {
+      ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
+      samples++;
+    }
+
+    src_a += stride_a * 4;
+    src_b += stride_b * 4;
+  }
+
+  ssim_total /= samples;
+  return ssim_total;
+}
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+                const uint8* src_u_a, int stride_u_a,
+                const uint8* src_v_a, int stride_v_a,
+                const uint8* src_y_b, int stride_y_b,
+                const uint8* src_u_b, int stride_u_b,
+                const uint8* src_v_b, int stride_v_b,
+                int width, int height) {
+  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
+                                      src_y_b, stride_y_b, width, height);
+  const int width_uv = (width + 1) >> 1;
+  const int height_uv = (height + 1) >> 1;
+  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
+                                      src_u_b, stride_u_b,
+                                      width_uv, height_uv);
+  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
+                                      src_v_b, stride_v_b,
+                                      width_uv, height_uv);
+  return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/compare_common.cc b/libs/libaom/src/third_party/libyuv/source/compare_common.cc
new file mode 100644
index 000000000..c546b5182
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/compare_common.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 sse = 0u;
+  int i;
+  for (i = 0; i < count; ++i) {
+    int diff = src_a[i] - src_b[i];
+    sse += (uint32)(diff * diff);
+  }
+  return sse;
+}
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency. +uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { + uint32 hash = seed; + int i; + for (i = 0; i < count; ++i) { + hash += (hash << 5) + src[i]; + } + return hash; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/compare_gcc.cc b/libs/libaom/src/third_party/libyuv/source/compare_gcc.cc new file mode 100644 index 000000000..247cb33bb --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/compare_gcc.cc @@ -0,0 +1,152 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + uint32 sse; + asm volatile ( // NOLINT + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "lea " MEMLEA(0x10, 1) ",%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); // NOLINT + return sse; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + uint32 hash; + asm volatile ( // NOLINT + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 
\n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); // NOLINT + return hash; +} +#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/libs/libaom/src/third_party/libyuv/source/compare_neon.cc b/libs/libaom/src/third_party/libyuv/source/compare_neon.cc new file mode 100644 index 000000000..ef006ec41 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/compare_neon.cc @@ -0,0 +1,65 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + return sse; +} + +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/compare_neon64.cc b/libs/libaom/src/third_party/libyuv/source/compare_neon64.cc new file mode 100644 index 000000000..6d1e5e1bc --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/compare_neon64.cc @@ -0,0 +1,63 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + return sse; +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/compare_win.cc b/libs/libaom/src/third_party/libyuv/source/compare_win.cc new file mode 100644 index 000000000..19806f275 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/compare_win.cc @@ -0,0 +1,229 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Visual C x86. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + defined(_MSC_VER) && !defined(__clang__) + +__declspec(naked) +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + pxor xmm0, xmm0 + pxor xmm5, xmm5 + + wloop: + movdqu xmm1, [eax] + lea eax, [eax + 16] + movdqu xmm2, [edx] + lea edx, [edx + 16] + movdqa xmm3, xmm1 // abs trick + psubusb xmm1, xmm2 + psubusb xmm2, xmm3 + por xmm1, xmm2 + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm0, xmm1 + paddd xmm0, xmm2 + sub ecx, 16 + jg wloop + + pshufd xmm1, xmm0, 0xee + paddd xmm0, xmm1 + pshufd xmm1, xmm0, 0x01 + paddd xmm0, xmm1 + movd eax, xmm0 + ret + } +} + +// Visual C 2012 required for AVX2. +#if _MSC_VER >= 1700 +// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. 
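The SSE2 loop above (and the AVX2 variant that follows) computes per-byte absolute differences without an absolute-value instruction: two saturating unsigned subtracts, OR-ed together. Since psubusb clamps negative results to zero, exactly one direction survives. The squared terms are then formed by pmaddwd, which squares and pairwise-adds 16-bit lanes in one instruction. A scalar model of the subtract trick, for reference:

#include <stdint.h>

/* Scalar model of the psubusb/psubusb/por idiom in SumSquareError_SSE2:
   unsigned saturating subtraction clamps negatives to 0, so exactly one of
   the two differences is nonzero and their OR equals |a - b|. */
static uint8_t AbsDiffU8(uint8_t a, uint8_t b) {
  uint8_t d0 = (a > b) ? (uint8_t)(a - b) : 0;  /* psubusb a, b */
  uint8_t d1 = (b > a) ? (uint8_t)(b - a) : 0;  /* psubusb b, a */
  return d0 | d1;                               /* por */
}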
+#pragma warning(disable: 4752) +__declspec(naked) +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + vpxor ymm0, ymm0, ymm0 // sum + vpxor ymm5, ymm5, ymm5 // constant 0 for unpck + sub edx, eax + + wloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + edx] + lea eax, [eax + 32] + vpsubusb ymm3, ymm1, ymm2 // abs difference trick + vpsubusb ymm2, ymm2, ymm1 + vpor ymm1, ymm2, ymm3 + vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. + vpunpckhbw ymm1, ymm1, ymm5 + vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. + vpmaddwd ymm1, ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm2 + sub ecx, 32 + jg wloop + + vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpermq ymm1, ymm0, 0x02 // high + low lane. + vpaddd ymm0, ymm0, ymm1 + vmovd eax, xmm0 + vzeroupper + ret + } +} +#endif // _MSC_VER >= 1700 + +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 +// 44: 66 0F 38 40 DD pmulld xmm3,xmm5 +// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 +// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 +// 83: 66 0F 38 40 CD pmulld xmm1,xmm5 +#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ + _asm _emit 0x40 _asm _emit reg + +__declspec(naked) +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + + pxor xmm7, xmm7 // constant 0 for unpck + movdqa xmm6, kHash16x33 + + wloop: + movdqu xmm1, [eax] // src[0-15] + lea eax, [eax + 16] + pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16 + movdqa xmm5, kHashMul0 + movdqa xmm2, xmm1 + punpcklbw xmm2, xmm7 // src[0-7] + movdqa xmm3, xmm2 + punpcklwd xmm3, xmm7 // src[0-3] + pmulld(0xdd) // pmulld xmm3, xmm5 + movdqa xmm5, kHashMul1 + movdqa xmm4, xmm2 + punpckhwd xmm4, xmm7 // src[4-7] + pmulld(0xe5) // pmulld xmm4, xmm5 + movdqa xmm5, kHashMul2 + punpckhbw xmm1, xmm7 // src[8-15] + movdqa xmm2, xmm1 + punpcklwd xmm2, xmm7 // src[8-11] + pmulld(0xd5) // pmulld xmm2, xmm5 + movdqa xmm5, kHashMul3 + punpckhwd xmm1, xmm7 // src[12-15] + pmulld(0xcd) // pmulld xmm1, xmm5 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + paddd xmm1, xmm3 + + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + sub ecx, 16 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} + +// Visual C 2012 required for AVX2. 
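The kHashMul tables above are simply powers of 33 mod 2^32, laid out so the four dword lanes of each vector weight four consecutive source bytes, and the pmulld(reg) macro hand-emits the SSE4.1 opcode bytes (66 0F 38 40 /r) because the inline assembler in older Visual C++ releases did not accept the mnemonic. A quick generator, useful as a sanity check against the tables:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Prints 33^0 .. 33^16 mod 2^32; the values should line up with the
   kHashMul tables (e.g. 33^4 = 0x00121881) and with kHash16x33 (33^16). */
int main(void) {
  uint32_t p = 1;
  int e;
  for (e = 0; e <= 16; ++e) {
    printf("33^%-2d mod 2^32 = 0x%08" PRIx32 "\n", e, p);
    p *= 33u;
  }
  return 0;
}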
+#if _MSC_VER >= 1700
+__declspec(naked)
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+  __asm {
+    mov        eax, [esp + 4]    // src
+    mov        ecx, [esp + 8]    // count
+    movd       xmm0, [esp + 12]  // seed
+    movdqa     xmm6, kHash16x33
+
+  wloop:
+    vpmovzxbd  xmm3, dword ptr [eax]  // src[0-3]
+    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
+    vpmovzxbd  xmm4, dword ptr [eax + 4]  // src[4-7]
+    pmulld     xmm3, kHashMul0
+    vpmovzxbd  xmm2, dword ptr [eax + 8]  // src[8-11]
+    pmulld     xmm4, kHashMul1
+    vpmovzxbd  xmm1, dword ptr [eax + 12]  // src[12-15]
+    pmulld     xmm2, kHashMul2
+    lea        eax, [eax + 16]
+    pmulld     xmm1, kHashMul3
+    paddd      xmm3, xmm4  // add 16 results
+    paddd      xmm1, xmm2
+    paddd      xmm1, xmm3
+    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
+    paddd      xmm1, xmm2
+    pshufd     xmm2, xmm1, 0x01
+    paddd      xmm1, xmm2
+    paddd      xmm0, xmm1
+    sub        ecx, 16
+    jg         wloop
+
+    movd       eax, xmm0  // return hash
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/convert.cc b/libs/libaom/src/third_party/libyuv/source/convert.cc
new file mode 100644
index 000000000..3ad6bd7a4
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/convert.cc
@@ -0,0 +1,1389 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// Any I4xx to I420 format with mirroring.
+static int I4xxToI420(const uint8* src_y, int src_stride_y,
+                      const uint8* src_u, int src_stride_u,
+                      const uint8* src_v, int src_stride_v,
+                      uint8* dst_y, int dst_stride_y,
+                      uint8* dst_u, int dst_stride_u,
+                      uint8* dst_v, int dst_stride_v,
+                      int src_y_width, int src_y_height,
+                      int src_uv_width, int src_uv_height) {
+  const int dst_y_width = Abs(src_y_width);
+  const int dst_y_height = Abs(src_y_height);
+  const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+  const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+  if (src_y_width == 0 || src_y_height == 0 ||
+      src_uv_width == 0 || src_uv_height == 0) {
+    return -1;
+  }
+  ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+             dst_y, dst_stride_y, dst_y_width, dst_y_height,
+             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+             kFilterBilinear);
+  return 0;
+}
+
+// Copy I420 with optional flipping.
+// TODO(fbarchard): Use ScalePlane, which supports mirroring, but ensure
+// it does row coalescing.
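The SUBSAMPLE macro defined at the top of convert.cc is round-up division by a power of two that stays symmetric for negative values, so inverted (negative-height) images get the same plane sizes as upright ones. A few spot checks, as a compilable sketch:

#include <assert.h>

/* Same definition as in convert.cc. Note the expansion is an
   unparenthesized ternary, so uses must be wrapped in parentheses. */
#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)

int main(void) {
  /* Half-size chroma dimensions round up, and the sign is preserved. */
  assert((SUBSAMPLE(5, 1, 1)) == 3);
  assert((SUBSAMPLE(4, 1, 1)) == 2);
  assert((SUBSAMPLE(-5, 1, 1)) == -3);
  /* I411 uses quarter-width chroma: SUBSAMPLE(width, 3, 2). */
  assert((SUBSAMPLE(9, 3, 2)) == 3);
  return 0;
}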
+LIBYUV_API +int I420Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; +} + +// 422 chroma is 1/2 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I422ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int src_uv_width = SUBSAMPLE(width, 1, 1); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); +} + +// 444 chroma is 1x width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I444ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + width, height); +} + +// 411 chroma is 1/4 width, 1x height +// 420 chroma is 1/2 width, 1/2 height +LIBYUV_API +int I411ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int src_uv_width = SUBSAMPLE(width, 3, 2); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); +} + +// I400 is greyscale typically used in MJPG +LIBYUV_API +int I400ToI420(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
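The "negative height means invert" convention recurs in every converter in this file: re-point the source (or destination) at its last row, negate the stride, and run the normal top-down loop. In isolation, assuming a plain byte-plane copy, the idiom looks like this:

#include <stdint.h>
#include <string.h>

/* A stripped-down model of the inversion idiom: repoint src at its last
   row and negate the stride, then copy top-down as usual. */
static void CopyPlaneMaybeFlipped(const uint8_t* src, int src_stride,
                                  uint8_t* dst, int dst_stride,
                                  int width, int height) {
  int y;
  if (height < 0) {
    height = -height;
    src = src + (height - 1) * src_stride;  /* start at the last row */
    src_stride = -src_stride;               /* walk upward */
  }
  for (y = 0; y < height; ++y) {
    memcpy(dst, src, (size_t)width);
    src += src_stride;
    dst += dst_stride;
  }
}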
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); + SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); + return 0; +} + +static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, + uint8* dst, int dst_stride, + int width, int height) { + int y; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height - 1; y += 2) { + CopyRow(src, dst, width); + CopyRow(src + src_stride_0, dst + dst_stride, width); + src += src_stride_0 + src_stride_1; + dst += dst_stride * 2; + } + if (height & 1) { + CopyRow(src, dst, width); + } +} + +// Support converting from FOURCC_M420 +// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for +// easy conversion to I420. +// M420 format description: +// M420 is row biplanar 420: 2 rows of Y and 1 row of UV. +// Chroma is half width / half height. (420) +// src_stride_m420 is row planar. Normally this will be the width in pixels. +// The UV plane is half width, but 2 values, so src_stride_m420 applies to +// this as well as the two Y planes. +static int X420ToI420(const uint8* src_y, + int src_stride_y0, int src_stride_y1, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = + SplitUVRow_C; + if (!src_y || !src_uv || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_u = dst_u + (halfheight - 1) * dst_stride_u; + dst_v = dst_v + (halfheight - 1) * dst_stride_v; + dst_stride_y = -dst_stride_y; + dst_stride_u = -dst_stride_u; + dst_stride_v = -dst_stride_v; + } + // Coalesce rows. + if (src_stride_y0 == width && + src_stride_y1 == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y0 = src_stride_y1 = dst_stride_y = 0; + } + // Coalesce rows. 
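X420ToI420 coalesces twice: the Y-plane check just above, and the UV-plane check that this second "Coalesce rows." comment introduces. Both exploit the fact that a plane whose stride equals its row width is one contiguous block, so width and height can be folded into a single pass and the strides zeroed, since they are never advanced again. The shape of the transform, factored out as a sketch:

/* Row coalescing in isolation: if every row is contiguous, fold the whole
   plane into one long "row" so the inner kernel runs exactly once. */
static void CoalesceRows(int* width, int* height,
                         int* src_stride, int* dst_stride) {
  if (*src_stride == *width && *dst_stride == *width) {
    *width *= *height;
    *height = 1;
    *src_stride = *dst_stride = 0;  /* strides are no longer needed */
  }
}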
+ if (src_stride_uv == halfwidth * 2 && + dst_stride_u == halfwidth && + dst_stride_v == halfwidth) { + halfwidth *= halfheight; + halfheight = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_SPLITUVROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) && + IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && + IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { + SplitUVRow = SplitUVRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(halfwidth, 16)) { + SplitUVRow = SplitUVRow_MIPS_DSPR2; + } + } +#endif + + if (dst_y) { + if (src_stride_y0 == src_stride_y1) { + CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height); + } else { + CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, + width, height); + } + } + + for (y = 0; y < halfheight; ++y) { + // Copy a row of UV. + SplitUVRow(src_uv, dst_u, dst_v, halfwidth); + dst_u += dst_stride_u; + dst_v += dst_stride_v; + src_uv += src_stride_uv; + } + return 0; +} + +// Convert NV12 to I420. +LIBYUV_API +int NV12ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_uv, src_stride_uv, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert NV21 to I420. Same as NV12 but u and v pointers swapped. +LIBYUV_API +int NV21ToI420(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, + src_vu, src_stride_vu, + dst_y, dst_stride_y, + dst_v, dst_stride_v, + dst_u, dst_stride_u, + width, height); +} + +// Convert M420 to I420. +LIBYUV_API +int M420ToI420(const uint8* src_m420, int src_stride_m420, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); +} + +// Convert YUY2 to I420. +LIBYUV_API +int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, + uint8* dst_y, int pix) = YUY2ToYRow_C; + // Negative height means invert the image. 
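Before the packed 4:2:2 inputs below, note how the biplanar formats are handled: NV12 carries chroma as interleaved U,V byte pairs and NV21 as V,U. Both funnel through X420ToI420, which deinterleaves with SplitUVRow, and NV21ToI420 handles the ordering simply by exchanging the dst_u and dst_v arguments. A scalar equivalent of the row kernel:

#include <stdint.h>

/* Scalar equivalent of SplitUVRow_C: split interleaved UVUV... into two
   planes. NV21ToI420 reuses the same kernel with dst_u and dst_v swapped. */
static void SplitUVRowModel(const uint8_t* src_uv,
                            uint8_t* dst_u, uint8_t* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];
    dst_v[x] = src_uv[2 * x + 1];
  }
}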
+ if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUVRow = YUY2ToUVRow_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + YUY2ToUVRow = YUY2ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUVRow = YUY2ToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + } + return 0; +} + +// Convert UYVY to I420. +LIBYUV_API +int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C; + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix) = UYVYToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + UYVYToUVRow = UYVYToUVRow_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUVRow = UYVYToUVRow_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToUVRow = UYVYToUVRow_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUVRow = UYVYToUVRow_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToYRow = UYVYToYRow_Any_NEON; + UYVYToUVRow = UYVYToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUVRow = UYVYToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); + src_uyvy += src_stride_uyvy * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + } + return 0; +} + +// Convert ARGB to I420. 
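YUY2 packs 4:2:2 pixels as Y0,U,Y1,V and UYVY as U,Y0,V,Y1. The two converters above pull luma from every row and reach 4:2:0 by averaging each chroma sample with the one a row below; the stride-0 call on an odd final row averages the row with itself. Scalar models of the two YUY2 row kernels, under those layout assumptions:

#include <stdint.h>

/* Luma sits on even bytes in YUY2 (odd bytes in UYVY). */
static void YUY2ToYRowModel(const uint8_t* src_yuy2, uint8_t* dst_y,
                            int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[2 * x];
  }
}

/* Rounded average of the chroma from a pair of rows, src_stride apart. */
static void YUY2ToUVRowModel(const uint8_t* src_yuy2, int src_stride_yuy2,
                             uint8_t* dst_u, uint8_t* dst_v, int width) {
  const uint8_t* next = src_yuy2 + src_stride_yuy2;
  int x;
  for (x = 0; x < width / 2; ++x) {
    dst_u[x] = (uint8_t)((src_yuy2[4 * x + 1] + next[4 * x + 1] + 1) >> 1);
    dst_v[x] = (uint8_t)((src_yuy2[4 * x + 3] + next[4 * x + 3] + 1) >> 1);
  }
}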
+LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + } + return 0; +} + +// Convert BGRA to I420. +LIBYUV_API +int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C; + void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) = + BGRAToYRow_C; + if (!src_bgra || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
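ARGBToI420 above, and every RGB-family converter that follows, share one loop shape: two source rows per iteration yield two Y rows but a single U and V row, and an odd trailing row passes stride 0 so the UV kernel sees the same row twice. The shared skeleton, abstracted over the row functions (a sketch, not an API in this patch):

#include <stdint.h>

typedef void (*ToYRowFn)(const uint8_t* src, uint8_t* dst_y, int width);
typedef void (*ToUVRowFn)(const uint8_t* src, int src_stride,
                          uint8_t* dst_u, uint8_t* dst_v, int width);

/* Two luma rows and one chroma row per iteration; an odd trailing row
   passes stride 0 so the UV kernel averages the last row with itself. */
static void ToI420Skeleton(const uint8_t* src, int src_stride,
                           uint8_t* dst_y, int dst_stride_y,
                           uint8_t* dst_u, int dst_stride_u,
                           uint8_t* dst_v, int dst_stride_v,
                           int width, int height,
                           ToYRowFn ToYRow, ToUVRowFn ToUVRow) {
  int y;
  for (y = 0; y < height - 1; y += 2) {
    ToUVRow(src, src_stride, dst_u, dst_v, width);
    ToYRow(src, dst_y, width);
    ToYRow(src + src_stride, dst_y + dst_stride_y, width);
    src += src_stride * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    ToUVRow(src, 0, dst_u, dst_v, width);
    ToYRow(src, dst_y, width);
  }
}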
+ if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } +#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + BGRAToUVRow = BGRAToUVRow_Any_SSSE3; + BGRAToYRow = BGRAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_SSSE3; + BGRAToYRow = BGRAToYRow_SSSE3; + } + } +#endif +#if defined(HAS_BGRATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToYRow = BGRAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + BGRAToYRow = BGRAToYRow_NEON; + } + } +#endif +#if defined(HAS_BGRATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToUVRow = BGRAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width); + src_bgra += src_stride_bgra * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width); + BGRAToYRow(src_bgra, dst_y, width); + } + return 0; +} + +// Convert ABGR to I420. +LIBYUV_API +int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) = + ABGRToYRow_C; + if (!src_abgr || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + ABGRToYRow = ABGRToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYRow = ABGRToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); + ABGRToYRow(src_abgr, dst_y, width); + } + return 0; +} + +// Convert RGBA to I420. 
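The BGRA, ABGR and RGBA converters differ from ARGBToI420 only in which byte each row kernel reads as R, G and B; the luma math itself is fixed-point BT.601 studio swing. A sketch of the weighting, assuming the coefficient set used by libyuv's C reference kernels (66/129/25, with the +16 offset and rounding folded into 0x1080):

#include <stdint.h>

/* Fixed-point BT.601 studio-swing luma: maps (0,0,0) to 16 and
   (255,255,255) to 235, matching libyuv's C reference kernels. */
static uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}

/* E.g. for ABGR, whose memory order in libyuv's naming is R,G,B,A, a pixel
   at p yields RGBToY(p[0], p[1], p[2]); the other variants just permute. */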
+LIBYUV_API +int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C; + void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) = + RGBAToYRow_C; + if (!src_rgba || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgba = src_rgba + (height - 1) * src_stride_rgba; + src_stride_rgba = -src_stride_rgba; + } +#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGBAToUVRow = RGBAToUVRow_Any_SSSE3; + RGBAToYRow = RGBAToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_SSSE3; + RGBAToYRow = RGBAToYRow_SSSE3; + } + } +#endif +#if defined(HAS_RGBATOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToYRow = RGBAToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGBAToYRow = RGBAToYRow_NEON; + } + } +#endif +#if defined(HAS_RGBATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGBAToUVRow = RGBAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width); + src_rgba += src_stride_rgba * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width); + RGBAToYRow(src_rgba, dst_y, width); + } + return 0; +} + +// Convert RGB24 to I420. +LIBYUV_API +int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RGB24TOYROW_NEON) + void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; + void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) = + RGB24ToYRow_C; +#else + void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RGB24ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +// Neon version does direct RGB24 to YUV. +#if defined(HAS_RGB24TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToUVRow = RGB24ToUVRow_Any_NEON; + RGB24ToYRow = RGB24ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToYRow = RGB24ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVRow = RGB24ToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from RGB24 to ARGB. 
+#else +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB24TOYROW_NEON) + RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); + RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RGB24TOYROW_NEON) + RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYRow(src_rgb24, dst_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RGB24TOYROW_NEON) + free_aligned_buffer_64(row); + } +#endif + return 0; +} + +// Convert RAW to I420. +LIBYUV_API +int RAWToI420(const uint8* src_raw, int src_stride_raw, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RAWTOYROW_NEON) + void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) = + RAWToYRow_C; +#else + void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RAWToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_raw || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + +// Neon version does direct RAW to YUV. +#if defined(HAS_RAWTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVRow = RAWToUVRow_Any_NEON; + RAWToYRow = RAWToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToYRow = RAWToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVRow = RAWToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from RAW to ARGB. 
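align_buffer_64 and free_aligned_buffer_64 are libyuv utility macros declared in row.h, which this hunk does not show; a typical implementation over-allocates and rounds the pointer up to the next 64-byte boundary. A plausible stand-in, offered as an assumption rather than the real macro:

#include <stdint.h>
#include <stdlib.h>

/* Over-allocate by 63 bytes and round the pointer up to 64-byte alignment.
   The caller must free() the base pointer, not the aligned one. */
static uint8_t* AllocAligned64(size_t size, void** out_base) {
  uint8_t* base = (uint8_t*)malloc(size + 63);
  *out_base = base;
  if (!base) return NULL;
  return (uint8_t*)(((uintptr_t)base + 63) & ~(uintptr_t)63);
}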
+#else +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); + RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); +#else + RAWToARGBRow(src_raw, row, width); + RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_raw += src_stride_raw * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RAWTOYROW_NEON) + RAWToUVRow(src_raw, 0, dst_u, dst_v, width); + RAWToYRow(src_raw, dst_y, width); +#else + RAWToARGBRow(src_raw, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RAWTOYROW_NEON) + free_aligned_buffer_64(row); + } +#endif + return 0; +} + +// Convert RGB565 to I420. +LIBYUV_API +int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_RGB565TOYROW_NEON) + void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C; + void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) = + RGB565ToYRow_C; +#else + void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + +// Neon version does direct RGB565 to YUV. +#if defined(HAS_RGB565TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB565ToUVRow = RGB565ToUVRow_Any_NEON; + RGB565ToYRow = RGB565ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToYRow = RGB565ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVRow = RGB565ToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from RGB565 to ARGB. 
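RGB565ToARGBRow expands 5- and 6-bit fields to 8 bits. The standard trick, which these kernels implement with shifts or equivalent multiplies, replicates each field's high bits into the vacated low bits so that 0 stays 0 and the field maximum maps to 255. A scalar model of one pixel, using libyuv's B,G,R,A memory order for ARGB:

#include <stdint.h>

/* Expand one RGB565 pixel: r5/g6/b5 fields widened by bit replication. */
static void RGB565ToARGBPixel(uint16_t p, uint8_t argb[4]) {
  uint8_t b5 = (uint8_t)(p & 0x1f);
  uint8_t g6 = (uint8_t)((p >> 5) & 0x3f);
  uint8_t r5 = (uint8_t)(p >> 11);
  argb[0] = (uint8_t)((b5 << 3) | (b5 >> 2));  /* B: 31 -> 255 */
  argb[1] = (uint8_t)((g6 << 2) | (g6 >> 4));  /* G: 63 -> 255 */
  argb[2] = (uint8_t)((r5 << 3) | (r5 >> 2));  /* R */
  argb[3] = 255;                               /* A: opaque */
}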
+#else +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_RGB565TOYROW_NEON) + RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); + RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb565 += src_stride_rgb565 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_RGB565TOYROW_NEON) + RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); + RGB565ToYRow(src_rgb565, dst_y, width); +#else + RGB565ToARGBRow(src_rgb565, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_RGB565TOYROW_NEON) + free_aligned_buffer_64(row); + } +#endif + return 0; +} + +// Convert ARGB1555 to I420. +LIBYUV_API +int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_ARGB1555TOYROW_NEON) + void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C; + void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) = + ARGB1555ToYRow_C; +#else + void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_argb1555 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + +// Neon version does direct ARGB1555 to YUV. 
+#if defined(HAS_ARGB1555TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; + ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToYRow = ARGB1555ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from ARGB1555 to ARGB. +#else +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif + { + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB1555TOYROW_NEON) + ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); + ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, + width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb1555 += src_stride_argb1555 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB1555TOYROW_NEON) + ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); + ARGB1555ToYRow(src_argb1555, dst_y, width); +#else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_ARGB1555TOYROW_NEON) + free_aligned_buffer_64(row); + } +#endif + return 0; +} + +// Convert ARGB4444 to I420. 
+LIBYUV_API +int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; +#if defined(HAS_ARGB4444TOYROW_NEON) + void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C; + void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) = + ARGB4444ToYRow_C; +#else + void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; +#endif + if (!src_argb4444 || !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + +// Neon version does direct ARGB4444 to YUV. +#if defined(HAS_ARGB4444TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; + ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToYRow = ARGB4444ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; + } + } + } +// Other platforms do intermediate conversion from ARGB4444 to ARGB. +#else +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif + { + // Allocate 2 rows of ARGB. 
+ const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); + ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, + width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, + width); + ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_argb4444 += src_stride_argb4444 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); + ARGB4444ToYRow(src_argb4444, dst_y, width); +#else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); +#endif + } +#if !defined(HAS_ARGB4444TOYROW_NEON) + free_aligned_buffer_64(row); + } +#endif + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/convert_argb.cc b/libs/libaom/src/third_party/libyuv/source/convert_argb.cc new file mode 100644 index 000000000..44756bc41 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/convert_argb.cc @@ -0,0 +1,1155 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB with optional flipping +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width * 4, height); + return 0; +} + +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I444ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I444ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_y == width && + src_stride_u == width && + src_stride_v == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I444TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToARGBRow = I444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I444ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ARGB. +LIBYUV_API +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I411 to ARGB. 
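+// (I411 is 4:1:1 sampling: the chroma planes are quarter width at full
+// height, which is why the coalesce test below checks
+// src_stride_u * 4 == width.)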
+LIBYUV_API +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I411ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I411ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 4 == width && + src_stride_v * 4 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_I411TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I411ToARGBRow = I411ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I411TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I411ToARGBRow = I411ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I411ToARGBRow = I411ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I411TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I411ToARGBRow = I411ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I411ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I400ToARGBRow)(const uint8* y_buf, + uint8* rgb_buf, + int width) = I400ToARGBRow_C; + if (!src_y || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_I400TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I400ToARGBRow = I400ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_I400TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I400ToARGBRow = I400ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I400TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I400ToARGBRow = I400ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I400ToARGBRow(src_y, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + } + return 0; +} + +// Convert J400 to ARGB. +LIBYUV_API +int J400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = + J400ToARGBRow_C; + if (!src_y || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
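+  // (The convention used throughout this file: passing height = -N requests
+  // a vertically flipped result. The affected pointer is moved to the last
+  // row and its stride negated, so the per-row loop is unchanged; e.g. for
+  // an assumed 4-row plane with stride 64: ptr += 3 * 64, stride = -64.)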
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+#if defined(HAS_J400TOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    J400ToARGBRow = J400ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      J400ToARGBRow = J400ToARGBRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    J400ToARGBRow = J400ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      J400ToARGBRow = J400ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_J400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    J400ToARGBRow = J400ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      J400ToARGBRow = J400ToARGBRow_NEON;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    J400ToARGBRow(src_y, dst_argb, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Shuffle table for converting BGRA to ARGB.
+static uvec8 kShuffleMaskBGRAToARGB = {
+  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting ABGR to ARGB.
+static uvec8 kShuffleMaskABGRToARGB = {
+  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting RGBA to ARGB.
+static uvec8 kShuffleMaskRGBAToARGB = {
+  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
+// Convert ARGB to BGRA (same as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskBGRAToARGB),
+                     width, height);
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
+// Convert ARGB to ABGR (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskABGRToARGB),
+                     width, height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  return ARGBShuffle(src_rgba, src_stride_rgba,
+                     dst_argb, dst_stride_argb,
+                     (const uint8*)(&kShuffleMaskRGBAToARGB),
+                     width, height);
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  int y;
+  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+      RGB24ToARGBRow_C;
+  if (!src_rgb24 || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+ if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + // Coalesce rows. + if (src_stride_rgb24 == width * 3 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb24 = dst_stride_argb = 0; + } +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB24ToARGBRow(src_rgb24, dst_argb, width); + src_rgb24 += src_stride_rgb24; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RAW to ARGB. +LIBYUV_API +int RAWToARGB(const uint8* src_raw, int src_stride_raw, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = + RAWToARGBRow_C; + if (!src_raw || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // Coalesce rows. + if (src_stride_raw == width * 3 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_raw = dst_stride_argb = 0; + } +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, dst_argb, width); + src_raw += src_stride_raw; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert RGB565 to ARGB. +LIBYUV_API +int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) = + RGB565ToARGBRow_C; + if (!src_rgb565 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; + src_stride_rgb565 = -src_stride_rgb565; + } + // Coalesce rows. 
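+  // (RGB565 packs one pixel into 16 bits (5 red, 6 green, 5 blue), so a
+  // contiguous source row is width * 2 bytes versus width * 4 for the ARGB
+  // output.)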
+ if (src_stride_rgb565 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_rgb565 = dst_stride_argb = 0; + } +#if defined(HAS_RGB565TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_RGB565TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB565ToARGBRow = RGB565ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + RGB565ToARGBRow(src_rgb565, dst_argb, width); + src_rgb565 += src_stride_rgb565; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB1555 to ARGB. +LIBYUV_API +int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, + int pix) = ARGB1555ToARGBRow_C; + if (!src_argb1555 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; + src_stride_argb1555 = -src_stride_argb1555; + } + // Coalesce rows. + if (src_stride_argb1555 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb1555 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB1555TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB1555TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB1555ToARGBRow(src_argb1555, dst_argb, width); + src_argb1555 += src_stride_argb1555; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB4444 to ARGB. +LIBYUV_API +int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, + int pix) = ARGB4444ToARGBRow_C; + if (!src_argb4444 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; + src_stride_argb4444 = -src_stride_argb4444; + } + // Coalesce rows. 
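+  // (ARGB4444 stores 4 bits per channel, 2 bytes per pixel, so the same
+  // width * 2 contiguity test applies as for the other 16-bit formats above.)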
+ if (src_stride_argb4444 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb4444 = dst_stride_argb = 0; + } +#if defined(HAS_ARGB4444TOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGB4444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGB4444ToARGBRow(src_argb4444, dst_argb, width); + src_argb4444 += src_stride_argb4444; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToARGBRow(src_y, src_uv, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV21ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV21ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
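+  // (As in NV12ToARGB above, only the destination is flipped; the interleaved
+  // VU plane is read top-down and advanced every second row via the (y & 1)
+  // test, because 4:2:0 chroma has half the vertical resolution of luma.)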
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV21TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV21TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToARGBRow = NV21ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV21ToARGBRow = NV21ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV21TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToARGBRow = NV21ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToARGBRow(src_y, src_uv, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert M420 to ARGB. +LIBYUV_API +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; + if (!src_m420 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToARGBRow = NV12ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, + dst_argb + dst_stride_argb, width); + dst_argb += dst_stride_argb * 2; + src_m420 += src_stride_m420 * 3; + } + if (height & 1) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + } + return 0; +} + +// Convert YUY2 to ARGB. +LIBYUV_API +int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) = + YUY2ToARGBRow_C; + if (!src_yuy2 || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. 
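+  // (YUY2 is packed 4:2:2: Y0 U Y1 V occupies 4 bytes per 2 pixels, an
+  // average of 2 bytes per pixel, hence the width * 2 contiguity test below.)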
+ if (src_stride_yuy2 == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_argb = 0; + } +#if defined(HAS_YUY2TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToARGBRow = YUY2ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + YUY2ToARGBRow(src_yuy2, dst_argb, width); + src_yuy2 += src_stride_yuy2; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert UYVY to ARGB. +LIBYUV_API +int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) = + UYVYToARGBRow_C; + if (!src_uyvy || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. + if (src_stride_uyvy == width * 2 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_argb = 0; + } +#if defined(HAS_UYVYTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + UYVYToARGBRow = UYVYToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToARGBRow = UYVYToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToARGBRow = UYVYToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToARGBRow = UYVYToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + UYVYToARGBRow(src_uyvy, dst_argb, width); + src_uyvy += src_stride_uyvy; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert J420 to ARGB. +LIBYUV_API +int J420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*J422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = J422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
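+  // (J420 carries full-range "JPEG" YUV, so the row functions selected below
+  // are the J422 variants with full-range coefficients rather than the
+  // limited-range I422 ones.)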
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_J422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + J422ToARGBRow = J422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + J422ToARGBRow = J422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_J422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + J422ToARGBRow = J422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + J422ToARGBRow = J422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_J422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + J422ToARGBRow = J422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + J422ToARGBRow = J422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_J422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + J422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert J422 to ARGB. +LIBYUV_API +int J422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*J422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = J422ToARGBRow_C; + if (!src_y || !src_u || !src_v || + !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; + } +#if defined(HAS_J422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + J422ToARGBRow = J422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + J422ToARGBRow = J422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_J422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + J422ToARGBRow = J422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + J422ToARGBRow = J422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_J422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + J422ToARGBRow = J422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + J422ToARGBRow = J422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_J422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + J422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/convert_from.cc b/libs/libaom/src/third_party/libyuv/source/convert_from.cc new file mode 100644 index 000000000..31f1ac992 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/convert_from.cc @@ -0,0 +1,1348 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_from.h" + +#include "libyuv/basic_types.h" +#include "libyuv/convert.h" // For I420Copy +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/video_common.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// I420 To any I4xx YUV format with mirroring. 
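+// SUBSAMPLE rounds away from zero so odd dimensions keep their extra sample,
+// and mirrored (negative) dimensions subsample symmetrically. Worked
+// examples: SUBSAMPLE(5, 1, 1) = (5 + 1) >> 1 = 3, and
+// SUBSAMPLE(-5, 1, 1) = -((5 + 1) >> 1) = -3.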
+static int I420ToI4xx(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_y_width, int src_y_height, + int dst_uv_width, int dst_uv_height) { + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); + const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); + if (src_y_width == 0 || src_y_height == 0 || + dst_uv_width <= 0 || dst_uv_height <= 0) { + return -1; + } + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, + dst_y, dst_stride_y, dst_y_width, dst_y_height, + kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, + dst_u, dst_stride_u, dst_uv_width, dst_uv_height, + kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, + dst_v, dst_stride_v, dst_uv_width, dst_uv_height, + kFilterBilinear); + return 0; +} + +// 420 chroma is 1/2 width, 1/2 height +// 422 chroma is 1/2 width, 1x height +LIBYUV_API +int I420ToI422(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = (Abs(width) + 1) >> 1; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 444 chroma is 1x width, 1x height +LIBYUV_API +int I420ToI444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = Abs(width); + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// 420 chroma is 1/2 width, 1/2 height +// 411 chroma is 1/4 width, 1x height +LIBYUV_API +int I420ToI411(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + const int dst_uv_width = (Abs(width) + 3) >> 2; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); +} + +// Copy to I400. Source can be I420,422,444,400,NV12,NV21 +LIBYUV_API +int I400Copy(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +LIBYUV_API +int I422ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2; + } + return 0; +} + +LIBYUV_API +int I420ToYUY2(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = + I422ToYUY2Row_C; + if (!src_y || !src_u || !src_v || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
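+  // (No row coalescing here: the loop below consumes two luma rows per
+  // chroma row, so the planes cannot be folded into one long row as in
+  // I422ToYUY2 above.)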
+ if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + I422ToYUY2Row(src_y + src_stride_y, src_u, src_v, + dst_yuy2 + dst_stride_yuy2, width); + src_y += src_stride_y * 2; + src_u += src_stride_u; + src_v += src_stride_v; + dst_yuy2 += dst_stride_yuy2 * 2; + } + if (height & 1) { + I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); + } + return 0; +} + +LIBYUV_API +int I422ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; + } +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + dst_uyvy += dst_stride_uyvy; + } + return 0; +} + +LIBYUV_API +int I420ToUYVY(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = + I422ToUYVYRow_C; + if (!src_y || !src_u || !src_v || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
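+  // (Same two-rows-per-iteration structure as I420ToYUY2 above: each U/V row
+  // is reused for a pair of luma rows when expanding 4:2:0 to packed 4:2:2
+  // UYVY.)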
+  if (height < 0) {
+    height = -height;
+    dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+    dst_stride_uyvy = -dst_stride_uyvy;
+  }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToUYVYRow = I422ToUYVYRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+    I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+                  dst_uyvy + dst_stride_uyvy, width);
+    src_y += src_stride_y * 2;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uyvy += dst_stride_uyvy * 2;
+  }
+  if (height & 1) {
+    I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                      int width) = MergeUVRow_C;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_uv = -dst_stride_uv;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Coalesce rows.
+  if (src_stride_u == halfwidth &&
+      src_stride_v == halfwidth &&
+      dst_stride_uv == halfwidth * 2) {
+    halfwidth *= halfheight;
+    halfheight = 1;
+    src_stride_u = src_stride_v = dst_stride_uv = 0;
+  }
+#if defined(HAS_MERGEUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    MergeUVRow_ = MergeUVRow_Any_SSE2;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX2;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeUVRow_ = MergeUVRow_Any_NEON;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_NEON;
+    }
+  }
+#endif
+
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  for (y = 0; y < halfheight; ++y) {
+    // Merge a row of U and V into a row of UV.
+    MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += dst_stride_uv;
+  }
+  return 0;
+}
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_vu, int dst_stride_vu,
+               int width, int height) {
+  return I420ToNV12(src_y, src_stride_y,
+                    src_v, src_stride_v,
+                    src_u, src_stride_u,
+                    dst_y, dst_stride_y,
+                    dst_vu, dst_stride_vu,
+                    width, height);
+}
+
+// Convert I420 to ARGB.
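+// A minimal usage sketch (illustrative only; the buffers and sizes here are
+// assumptions, not part of the library):
+//   uint8 y[64 * 48], u[32 * 24], v[32 * 24], argb[64 * 48 * 4];
+//   I420ToARGB(y, 64, u, 32, v, 32, argb, 64 * 4, 64, 48);
+// The call returns 0 on success and -1 on a NULL pointer or bad size.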
+LIBYUV_API +int I420ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to BGRA. +LIBYUV_API +int I420ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height) { + int y; + void (*I422ToBGRARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToBGRARow_C; + if (!src_y || !src_u || !src_v || !dst_bgra || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; + dst_stride_bgra = -dst_stride_bgra; + } +#if defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToBGRARow = I422ToBGRARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToBGRARow = I422ToBGRARow_AVX2; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_NEON; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); + dst_bgra += dst_stride_bgra; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ABGR. +LIBYUV_API +int I420ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + int y; + void (*I422ToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToABGRRow_C; + if (!src_y || !src_u || !src_v || !dst_abgr || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; + dst_stride_abgr = -dst_stride_abgr; + } +#if defined(HAS_I422TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToABGRRow = I422ToABGRRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOABGRROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToABGRRow = I422ToABGRRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToABGRRow = I422ToABGRRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToABGRRow = I422ToABGRRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + dst_abgr += dst_stride_abgr; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGBA. +LIBYUV_API +int I420ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + int y; + void (*I422ToRGBARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RGB24. +LIBYUV_API +int I420ToRGB24(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + int y; + void (*I422ToRGB24Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_I422TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to RAW. +LIBYUV_API +int I420ToRAW(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + int y; + void (*I422ToRAWRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRAWRow_C; + if (!src_y || !src_u || !src_v || !dst_raw || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
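+  // (RAW is 3 bytes per pixel in R, G, B memory order; RGB24 above is the
+  // same layout in B, G, R order, so only the row functions differ.)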
+ if (height < 0) { + height = -height; + dst_raw = dst_raw + (height - 1) * dst_stride_raw; + dst_stride_raw = -dst_stride_raw; + } +#if defined(HAS_I422TORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRAWRow = I422ToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRAWRow = I422ToRAWRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORAWROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRAWRow = I422ToRAWRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRAWRow = I422ToRAWRow_AVX2; + } + } +#endif +#if defined(HAS_I422TORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRAWRow = I422ToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRAWRow = I422ToRAWRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRAWRow(src_y, src_u, src_v, dst_raw, width); + dst_raw += dst_stride_raw; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to ARGB1555. +LIBYUV_API +int I420ToARGB1555(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height) { + int y; + void (*I422ToARGB1555Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGB1555Row_C; + if (!src_y || !src_u || !src_v || !dst_argb1555 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; + dst_stride_argb1555 = -dst_stride_argb1555; + } +#if defined(HAS_I422TOARGB1555ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width); + dst_argb1555 += dst_stride_argb1555; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + + +// Convert I420 to ARGB4444. +LIBYUV_API +int I420ToARGB4444(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height) { + int y; + void (*I422ToARGB4444Row)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGB4444Row_C; + if (!src_y || !src_u || !src_v || !dst_argb4444 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+  if (height < 0) {
+    height = -height;
+    dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+    dst_stride_argb4444 = -dst_stride_argb4444;
+  }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
+    dst_argb4444 += dst_stride_argb4444;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_u, int src_stride_u,
+                 const uint8* src_v, int src_stride_v,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* rgb_buf,
+                          int width) = I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
+static const uint8 kDither565_4x4[16] = {
+  0, 4, 1, 5,
+  6, 2, 7, 3,
+  1, 5, 0, 4,
+  7, 3, 6, 2,
+};
+
+// Convert I420 to RGB565 with dithering.
+LIBYUV_API
+int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
+                       const uint8* src_u, int src_stride_u,
+                       const uint8* src_v, int src_stride_v,
+                       uint8* dst_rgb565, int dst_stride_rgb565,
+                       const uint8* dither4x4, int width, int height) {
+  int y;
+  void (*I422ToARGBRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToARGBRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+      const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
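+  // (Each output scanline is converted to ARGB in a scratch row and then
+  // dithered down to 565. The dither constant is one 4-byte row of the 4x4
+  // matrix, selected as (y & 3) << 2, so the pattern repeats every 4 rows.)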
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif + { + // Allocate a row of argb. + align_buffer_64(row_argb, width * 4); + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, row_argb, width); + ARGBToRGB565DitherRow(row_argb, dst_rgb565, + *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + free_aligned_buffer_64(row_argb); + } + return 0; +} + +// Convert I420 to specified format +LIBYUV_API +int ConvertFromI420(const uint8* y, int y_stride, + const uint8* u, int u_stride, + const uint8* v, int v_stride, + uint8* dst_sample, int dst_sample_stride, + int width, int height, + uint32 fourcc) { + uint32 format = CanonicalFourCC(fourcc); + int r = 0; + if (!y || !u|| !v || !dst_sample || + width <= 0 || height == 0) { + return -1; + } + switch (format) { + // Single plane formats + case FOURCC_YUY2: + r = I420ToYUY2(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_UYVY: + r = I420ToUYVY(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_RGBP: + r = I420ToRGB565(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_RGBO: + r = I420ToARGB1555(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_R444: + r = I420ToARGB4444(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? 
dst_sample_stride : width * 2, + width, height); + break; + case FOURCC_24BG: + r = I420ToRGB24(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_RAW: + r = I420ToRAW(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, + width, height); + break; + case FOURCC_ARGB: + r = I420ToARGB(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_BGRA: + r = I420ToBGRA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_ABGR: + r = I420ToABGR(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_RGBA: + r = I420ToRGBA(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, + width, height); + break; + case FOURCC_I400: + r = I400Copy(y, y_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + case FOURCC_NV12: { + uint8* dst_uv = dst_sample + width * height; + r = I420ToNV12(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + dst_uv, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + } + case FOURCC_NV21: { + uint8* dst_vu = dst_sample + width * height; + r = I420ToNV21(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, + dst_sample_stride ? dst_sample_stride : width, + dst_vu, + dst_sample_stride ? dst_sample_stride : width, + width, height); + break; + } + // TODO(fbarchard): Add M420. 
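+    // Illustrative note (added sketch): in the biplanar cases above,
+    // dst_sample holds the Y plane followed directly by the interleaved
+    // chroma plane.  For a hypothetical 640x480 NV12 frame at the default
+    // stride:
+    //   dst_y  = dst_sample;               // 480 rows of 640 luma bytes
+    //   dst_uv = dst_sample + 640 * 480;   // 240 rows of 640 bytes, UVUV...
+    // which is why dst_uv is computed as dst_sample + width * height.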
+ // Triplanar formats + // TODO(fbarchard): halfstride instead of halfwidth + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + int halfwidth = (width + 1) / 2; + int halfheight = (height + 1) / 2; + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV12) { + dst_v = dst_sample + width * height; + dst_u = dst_v + halfwidth * halfheight; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + halfwidth * halfheight; + } + r = I420Copy(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + int halfwidth = (width + 1) / 2; + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV16) { + dst_v = dst_sample + width * height; + dst_u = dst_v + halfwidth * height; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + halfwidth * height; + } + r = I420ToI422(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, halfwidth, + dst_v, halfwidth, + width, height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + uint8* dst_u; + uint8* dst_v; + if (format == FOURCC_YV24) { + dst_v = dst_sample + width * height; + dst_u = dst_v + width * height; + } else { + dst_u = dst_sample + width * height; + dst_v = dst_u + width * height; + } + r = I420ToI444(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, width, + dst_v, width, + width, height); + break; + } + case FOURCC_I411: { + int quarterwidth = (width + 3) / 4; + uint8* dst_u = dst_sample + width * height; + uint8* dst_v = dst_u + quarterwidth * height; + r = I420ToI411(y, y_stride, + u, u_stride, + v, v_stride, + dst_sample, width, + dst_u, quarterwidth, + dst_v, quarterwidth, + width, height); + break; + } + + // Formats not supported - MJPG, biplanar, some rgb formats. + default: + return -1; // unknown fourcc - return failure code. + } + return r; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/convert_from_argb.cc b/libs/libaom/src/third_party/libyuv/source/convert_from_argb.cc new file mode 100644 index 000000000..8d1e97aec --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/convert_from_argb.cc @@ -0,0 +1,1301 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/convert_from_argb.h" + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// ARGB little endian (bgra in memory) to I444 +LIBYUV_API +int ARGBToI444(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV444Row_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u == width && + dst_stride_v == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV444Row = ARGBToUV444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV444Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra in memory) to I422 +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
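+  // Illustrative note (added sketch): when each plane's stride equals its
+  // packed row width, the rows are contiguous in memory and the whole image
+  // can be processed as one long row.  For a hypothetical 64x32 frame:
+  //   width  = 64 * 32;   // 2048 pixels, a single pass per row function
+  //   height = 1;         // strides no longer matter and are zeroed
+  // The output is identical; only the per-row loop overhead disappears.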
+ if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUV422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV422Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// ARGB little endian (bgra in memory) to I411 +LIBYUV_API +int ARGBToI411(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV411Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 4 == width && + dst_stride_v * 4 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUV411ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV411Row = ARGBToUV411Row_Any_NEON; + if (IS_ALIGNED(width, 32)) { + ARGBToUV411Row = ARGBToUV411Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUV411Row(src_argb, dst_u, dst_v, width); + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +LIBYUV_API +int ARGBToNV12(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) = MergeUVRow_C; + if (!src_argb || + !dst_y || !dst_uv || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
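+  // Illustrative note (added sketch): NV12 is produced in two stages per
+  // pair of rows.  ARGBToUVRow subsamples 2x2 blocks into separate planar
+  // U and V rows (halfwidth bytes each) in a temporary buffer, then
+  // MergeUVRow_ interleaves them into the semi-planar chroma plane:
+  //   ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+  //   MergeUVRow_(row_u, row_v, dst_uv, halfwidth);   // U0 V0 U1 V1 ...
+  // Two Y rows are converted for every one UV row, giving 4:2:0 sampling.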
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Same as NV12 but U and V swapped. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) = MergeUVRow_C; + if (!src_argb || + !dst_y || !dst_uv || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); + src_argb += src_stride_argb * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ARGBToUVRow(src_argb, 0, row_u, row_v, width); + MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + ARGBToYRow(src_argb, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Convert ARGB to YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; + + if (!src_argb || !dst_yuy2 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_yuy2 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yuy2 = 0; + } +#if defined(HAS_ARGBTOUV422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + +#if defined(HAS_I422TOYUY2ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_SSE2; + } + } +#endif +#if defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToYUY2Row = I422ToYUY2Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } + } +#endif + + { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + + for (y = 0; y < height; ++y) { + ARGBToUV422Row(src_argb, row_u, row_v, width); + ARGBToYRow(src_argb, row_y, width); + I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width); + src_argb += src_stride_argb; + dst_yuy2 += dst_stride_yuy2; + } + + free_aligned_buffer_64(row_y); + } + return 0; +} + +// Convert ARGB to UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height) { + int y; + void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUV422Row_C; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, + const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C; + + if (!src_argb || !dst_uyvy || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; + dst_stride_uyvy = -dst_stride_uyvy; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_uyvy == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_uyvy = 0; + } +#if defined(HAS_ARGBTOUV422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUV422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUV422Row = ARGBToUV422Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + +#if defined(HAS_I422TOUYVYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_SSE2; + } + } +#endif +#if defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToUYVYRow = I422ToUYVYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } + } +#endif + + { + // Allocate a rows of yuv. + align_buffer_64(row_y, ((width + 63) & ~63) * 2); + uint8* row_u = row_y + ((width + 63) & ~63); + uint8* row_v = row_u + ((width + 63) & ~63) / 2; + + for (y = 0; y < height; ++y) { + ARGBToUV422Row(src_argb, row_u, row_v, width); + ARGBToYRow(src_argb, row_y, width); + I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width); + src_argb += src_stride_argb; + dst_uyvy += dst_stride_uyvy; + } + + free_aligned_buffer_64(row_y); + } + return 0; +} + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYRow_C; + if (!src_argb || !dst_y || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = 0; + } +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + } + return 0; +} + +// Shuffle table for converting ARGB to RGBA. +static uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u +}; + +// Convert ARGB to RGBA. 
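+// Illustrative note (added sketch): ARGBShuffle applies the mask as a
+// per-byte gather -- output byte i of each 16-byte group is input byte
+// mask[i].  libyuv ARGB is b,g,r,a in memory, so the leading entries
+// {3, 0, 1, 2} move the alpha byte in front of b,g,r, producing RGBA byte
+// order a,b,g,r; the pattern repeats for the remaining three pixels.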
+LIBYUV_API +int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + return ARGBShuffle(src_argb, src_stride_argb, + dst_rgba, dst_stride_rgba, + (const uint8*)(&kShuffleMaskARGBToRGBA), + width, height); +} + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + int y; + void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB24Row_C; + if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_rgb24 == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb24 = 0; + } +#if defined(HAS_ARGBTORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB24Row = ARGBToRGB24Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB24Row(src_argb, dst_rgb24, width); + src_argb += src_stride_argb; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + int y; + void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRAWRow_C; + if (!src_argb || !dst_raw || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_raw == width * 3) { + width *= height; + height = 1; + src_stride_argb = dst_stride_raw = 0; + } +#if defined(HAS_ARGBTORAWROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRAWRow = ARGBToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRAWRow = ARGBToRAWRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRAWRow(src_argb, dst_raw, width); + src_argb += src_stride_argb; + dst_raw += dst_stride_raw; + } + return 0; +} + +// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. +static const uint8 kDither565_4x4[16] = { + 0, 4, 1, 5, + 6, 2, 7, 3, + 1, 5, 0, 4, + 7, 3, 6, 2, +}; + +// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). 
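+// Illustrative note (added sketch): this is ordered (Bayer-style)
+// dithering.  A small position-dependent bias from the 4x4 table is added
+// before each channel is truncated to 5 or 6 bits, trading banding for
+// high-frequency noise.  Per-pixel math, with d in 0..7:
+//   r5 = clamp(r + d, 0, 255) >> 3;   // 8 -> 5 bits
+//   g6 = clamp(g + d, 0, 255) >> 2;   // 8 -> 6 bits
+//   b5 = clamp(b + d, 0, 255) >> 3;
+//   pix = (r5 << 11) | (g6 << 5) | b5;
+// One 4-byte row of the table is chosen per scanline via (y & 3) << 2.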
+LIBYUV_API +int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + const uint8* dither4x4, int width, int height) { + int y; + void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif + for (y = 0; y < height; ++y) { + ARGBToRGB565DitherRow(src_argb, dst_rgb565, + *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To RGB565. +// TODO(fbarchard): Consider using dither function low level with zeros. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB565Row_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_rgb565 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb565 = 0; + } +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToRGB565Row(src_argb, dst_rgb565, width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + +// Convert ARGB To ARGB1555. 
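+// Illustrative note (added sketch): ARGB1555 keeps a 1-bit alpha in the top
+// bit with five bits per color channel below it:
+//   pix = (a >> 7) << 15 | (r >> 3) << 10 | (g >> 3) << 5 | (b >> 3);
+// so any source alpha of 128 or more becomes opaque.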
+LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height) { + int y; + void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB1555Row_C; + if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb1555 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb1555 = 0; + } +#if defined(HAS_ARGBTOARGB1555ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOARGB1555ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB1555Row(src_argb, dst_argb1555, width); + src_argb += src_stride_argb; + dst_argb1555 += dst_stride_argb1555; + } + return 0; +} + +// Convert ARGB To ARGB4444. +LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height) { + int y; + void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToARGB4444Row_C; + if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb4444 == width * 2) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb4444 = 0; + } +#if defined(HAS_ARGBTOARGB4444ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOARGB4444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToARGB4444Row(src_argb, dst_argb4444, width); + src_argb += src_stride_argb; + dst_argb4444 += dst_stride_argb4444; + } + return 0; +} + +// Convert ARGB to J420. (JPeg full range I420). 
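+// Illustrative note (added sketch): the J variants use full-range (JPEG)
+// BT.601 coefficients, so luma spans 0..255 rather than the studio-swing
+// 16..235 of the plain converters.  Approximately:
+//   Y  = 0.299 R + 0.587 G + 0.114 B        // ARGBToYJRow, full range
+//   Y' = 0.257 R + 0.504 G + 0.098 B + 16   // ARGBToYRow, video range
+// Chroma is likewise rescaled to cover the full 0..255 swing.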
+LIBYUV_API +int ARGBToJ420(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = + ARGBToYJRow_C; + if (!src_argb || + !dst_yj || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); + src_argb += src_stride_argb * 2; + dst_yj += dst_stride_yj * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_yj, width); + } + return 0; +} + +// ARGB little endian (bgra in memory) to J422 +LIBYUV_API +int ARGBToJ422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) = ARGBToUVJ422Row_C; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) = + ARGBToYJRow_C; + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_ARGBTOUVJ422ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUVJ422ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJ422Row = ARGBToUVJ422Row_NEON; + } + } +#endif + +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToUVJ422Row(src_argb, dst_u, dst_v, width); + ARGBToYJRow(src_argb, dst_y, width); + src_argb += src_stride_argb; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert ARGB to J400. +LIBYUV_API +int ARGBToJ400(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + int width, int height) { + int y; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = + ARGBToYJRow_C; + if (!src_argb || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = 0; + } +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToYJRow(src_argb, dst_yj, width); + src_argb += src_stride_argb; + dst_yj += dst_stride_yj; + } + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/convert_jpeg.cc b/libs/libaom/src/third_party/libyuv/source/convert_jpeg.cc new file mode 100644 index 000000000..bcb980f7f --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/convert_jpeg.cc @@ -0,0 +1,392 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert.h" + +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#ifdef HAVE_JPEG +struct I420Buffers { + uint8* y; + int y_stride; + uint8* u; + int u_stride; + uint8* v; + int v_stride; + int w; + int h; +}; + +static void JpegCopyI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I420Copy(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI422ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I422ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI444ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I444ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI411ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I411ToI420(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +static void JpegI400ToI420(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + I420Buffers* dest = (I420Buffers*)(opaque); + I400ToI420(data[0], strides[0], + dest->y, dest->y_stride, + dest->u, dest->u_stride, + dest->v, dest->v_stride, + dest->w, rows); + dest->y += rows * dest->y_stride; + dest->u += ((rows + 1) >> 1) * dest->u_stride; + dest->v += ((rows + 1) >> 1) * dest->v_stride; + dest->h -= rows; +} + +// Query size of MJPG in pixels. +LIBYUV_API +int MJPGSize(const uint8* sample, size_t sample_size, + int* width, int* height) { + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret) { + *width = mjpeg_decoder.GetWidth(); + *height = mjpeg_decoder.GetHeight(); + } + mjpeg_decoder.UnloadFrame(); + return ret ? 0 : -1; // -1 for runtime failure. +} + +// MJPG (Motion JPeg) to I420 +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. 
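+// Illustrative usage sketch (hypothetical caller and buffers, not part of
+// the library): probe the compressed frame for its dimensions, then decode
+// into preallocated I420 planes of that size:
+//   int w = 0, h = 0;
+//   if (MJPGSize(jpg, jpg_size, &w, &h) == 0) {
+//     MJPGToI420(jpg, jpg_size,
+//                dst_y, w, dst_u, (w + 1) / 2, dst_v, (w + 1) / 2,
+//                w, h, w, h);   // w,h: expected size; last pair: dest size
+//   }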
+LIBYUV_API +int MJPGToI420(const uint8* sample, + size_t sample_size, + uint8* y, int y_stride, + uint8* u, int u_stride, + uint8* v, int v_stride, + int w, int h, + int dw, int dh) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. + MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != w || + mjpeg_decoder.GetHeight() != h)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh }; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh); + // YUV411 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 4 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. 411 is supported by libjpeg + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 
0 : 1; +} + +#ifdef HAVE_JPEG +struct ARGBBuffers { + uint8* argb; + int argb_stride; + int w; + int h; +}; + +static void JpegI420ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I420ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI422ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I422ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI444ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I444ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI411ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I411ToARGB(data[0], strides[0], + data[1], strides[1], + data[2], strides[2], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +static void JpegI400ToARGB(void* opaque, + const uint8* const* data, + const int* strides, + int rows) { + ARGBBuffers* dest = (ARGBBuffers*)(opaque); + I400ToARGB(data[0], strides[0], + dest->argb, dest->argb_stride, + dest->w, rows); + dest->argb += rows * dest->argb_stride; + dest->h -= rows; +} + +// MJPG (Motion JPeg) to ARGB +// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +LIBYUV_API +int MJPGToARGB(const uint8* sample, + size_t sample_size, + uint8* argb, int argb_stride, + int w, int h, + int dw, int dh) { + if (sample_size == kUnknownDataSize) { + // ERROR: MJPEG frame size unknown + return -1; + } + + // TODO(fbarchard): Port MJpeg to C. 
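+  // Illustrative note (added sketch): the chain of tests below infers the
+  // JPEG chroma subsampling from the per-component sampling factors and
+  // picks the matching row converter, mirroring MJPGToI420 above:
+  //   component 0 horiz x vert:  2x2 -> YUV420,  2x1 -> YUV422,
+  //   1x1 -> YUV444,  4x1 -> YUV411;  one grayscale component -> YUV400.
+  // Anything else is rejected as an unsupported MJPEG format.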
+ MJpegDecoder mjpeg_decoder; + LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); + if (ret && (mjpeg_decoder.GetWidth() != w || + mjpeg_decoder.GetHeight() != h)) { + // ERROR: MJPEG frame has unexpected dimensions + mjpeg_decoder.UnloadFrame(); + return 1; // runtime failure + } + if (ret) { + ARGBBuffers bufs = { argb, argb_stride, dw, dh }; + // YUV420 + if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 2 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh); + // YUV422 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 2 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh); + // YUV444 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh); + // YUV411 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceYCbCr && + mjpeg_decoder.GetNumComponents() == 3 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 4 && + mjpeg_decoder.GetVertSampFactor(1) == 1 && + mjpeg_decoder.GetHorizSampFactor(1) == 1 && + mjpeg_decoder.GetVertSampFactor(2) == 1 && + mjpeg_decoder.GetHorizSampFactor(2) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh); + // YUV400 + } else if (mjpeg_decoder.GetColorSpace() == + MJpegDecoder::kColorSpaceGrayscale && + mjpeg_decoder.GetNumComponents() == 1 && + mjpeg_decoder.GetVertSampFactor(0) == 1 && + mjpeg_decoder.GetHorizSampFactor(0) == 1) { + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh); + } else { + // TODO(fbarchard): Implement conversion for any other colorspace/sample + // factors that occur in practice. 411 is supported by libjpeg + // ERROR: Unable to convert MJPEG frame because format is not supported + mjpeg_decoder.UnloadFrame(); + return 1; + } + } + return ret ? 0 : 1; +} +#endif + +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/convert_to_argb.cc b/libs/libaom/src/third_party/libyuv/source/convert_to_argb.cc new file mode 100644 index 000000000..af829fbd3 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/convert_to_argb.cc @@ -0,0 +1,306 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/convert_argb.h" + +#include "libyuv/cpu_id.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/rotate_argb.h" +#include "libyuv/row.h" +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Convert camera sample to I420 with cropping, rotation and vertical flip. +// src_width is used for source stride computation +// src_height is used to compute location of planes, and indicate inversion +// sample_size is measured in bytes and is the size of the frame. +// With MJPEG it is the compressed size of the frame. +LIBYUV_API +int ConvertToARGB(const uint8* sample, size_t sample_size, + uint8* crop_argb, int argb_stride, + int crop_x, int crop_y, + int src_width, int src_height, + int crop_width, int crop_height, + enum RotationMode rotation, + uint32 fourcc) { + uint32 format = CanonicalFourCC(fourcc); + int aligned_src_width = (src_width + 1) & ~1; + const uint8* src; + const uint8* src_uv; + int abs_src_height = (src_height < 0) ? -src_height : src_height; + int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; + int r = 0; + + // One pass rotation is available for some formats. For the rest, convert + // to I420 (with optional vertical flipping) into a temporary I420 buffer, + // and then rotate the I420 to the final destination buffer. + // For in-place conversion, if destination crop_argb is same as source sample, + // also enable temporary buffer. + LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) || + crop_argb == sample; + uint8* tmp_argb = crop_argb; + int tmp_argb_stride = argb_stride; + uint8* rotate_buffer = NULL; + int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; + + if (crop_argb == NULL || sample == NULL || + src_width <= 0 || crop_width <= 0 || + src_height == 0 || crop_height == 0) { + return -1; + } + if (src_height < 0) { + inv_crop_height = -inv_crop_height; + } + + if (need_buf) { + int argb_size = crop_width * abs_crop_height * 4; + rotate_buffer = (uint8*)malloc(argb_size); + if (!rotate_buffer) { + return 1; // Out of memory runtime error. 
+    }
+    crop_argb = rotate_buffer;
+    argb_stride = crop_width * 4;  // ARGB strides are in bytes: 4 per pixel.
+  }
+
+  switch (format) {
+    // Single plane formats
+    case FOURCC_YUY2:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = YUY2ToARGB(src, aligned_src_width * 2,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_UYVY:
+      src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+      r = UYVYToARGB(src, aligned_src_width * 2,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_24BG:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RGB24ToARGB(src, src_width * 3,
+                      crop_argb, argb_stride,
+                      crop_width, inv_crop_height);
+      break;
+    case FOURCC_RAW:
+      src = sample + (src_width * crop_y + crop_x) * 3;
+      r = RAWToARGB(src, src_width * 3,
+                    crop_argb, argb_stride,
+                    crop_width, inv_crop_height);
+      break;
+    case FOURCC_ARGB:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ARGBToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_BGRA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = BGRAToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_ABGR:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = ABGRToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBA:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = RGBAToARGB(src, src_width * 4,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBP:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = RGB565ToARGB(src, src_width * 2,
+                       crop_argb, argb_stride,
+                       crop_width, inv_crop_height);
+      break;
+    case FOURCC_RGBO:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB1555ToARGB(src, src_width * 2,
+                         crop_argb, argb_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_R444:
+      src = sample + (src_width * crop_y + crop_x) * 2;
+      r = ARGB4444ToARGB(src, src_width * 2,
+                         crop_argb, argb_stride,
+                         crop_width, inv_crop_height);
+      break;
+    case FOURCC_I400:
+      src = sample + src_width * crop_y + crop_x;
+      r = I400ToARGB(src, src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+
+    // Biplanar formats
+    case FOURCC_NV12:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      r = NV12ToARGB(src, src_width,
+                     src_uv, aligned_src_width,
+                     crop_argb, argb_stride,
+                     crop_width, inv_crop_height);
+      break;
+    case FOURCC_NV21:
+      src = sample + (src_width * crop_y + crop_x);
+      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      // NV21 stores its chroma plane in VU order; NV21ToARGB reads the
+      // swapped ordering directly.
+ r = NV21ToARGB(src, src_width, + src_uv, aligned_src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToARGB(src, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + + case FOURCC_J420: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = J420ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToARGB(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToARGB(src_y, src_width, + src_u, src_width, + src_v, src_width, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I411: { + int quarterwidth = (src_width + 3) / 4; + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u = sample + src_width * abs_src_height + + quarterwidth * crop_y + crop_x / 4; + const uint8* src_v = sample + src_width * abs_src_height + + quarterwidth * (abs_src_height + crop_y) + crop_x / 4; + r = I411ToARGB(src_y, src_width, + src_u, quarterwidth, + src_v, quarterwidth, + crop_argb, argb_stride, + crop_width, inv_crop_height); + break; + } 
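+    // (Illustrative note, not part of the original source: each triplanar
+    // case above offsets into the chroma planes with the same pattern.
+    // For a 640x480 I420 frame cropped at (crop_x, crop_y) = (16, 8):
+    // halfwidth = 320, the U plane begins at 640 * 480 = 307200 bytes, and
+    // the cropped U pointer is sample + 307200 + (320 * 8 + 16) / 2 =
+    // sample + 308488, i.e. crop_y / 2 rows of 320 bytes plus crop_x / 2
+    // bytes into the plane.)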
+#ifdef HAVE_JPEG
+    case FOURCC_MJPG:
+      r = MJPGToARGB(sample, sample_size,
+                     crop_argb, argb_stride,
+                     src_width, abs_src_height, crop_width, inv_crop_height);
+      break;
+#endif
+    default:
+      r = -1;  // unknown fourcc - return failure code.
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = ARGBRotate(crop_argb, argb_stride,
+                     tmp_argb, tmp_argb_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/convert_to_i420.cc b/libs/libaom/src/third_party/libyuv/source/convert_to_i420.cc
new file mode 100644
index 000000000..5e75369b5
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/convert_to_i420.cc
@@ -0,0 +1,339 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation.
+// src_height is used to compute location of planes, and indicate inversion.
+// sample_size is measured in bytes and is the size of the frame.
+//   With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+                  size_t sample_size,
+                  uint8* y, int y_stride,
+                  uint8* u, int u_stride,
+                  uint8* v, int v_stride,
+                  int crop_x, int crop_y,
+                  int src_width, int src_height,
+                  int crop_width, int crop_height,
+                  enum RotationMode rotation,
+                  uint32 fourcc) {
+  uint32 format = CanonicalFourCC(fourcc);
+  int aligned_src_width = (src_width + 1) & ~1;
+  const uint8* src;
+  const uint8* src_uv;
+  int abs_src_height = (src_height < 0) ? -src_height : src_height;
+  int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+  int r = 0;
+  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
+      format != FOURCC_NV12 && format != FOURCC_NV21 &&
+      format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+  uint8* tmp_y = y;
+  uint8* tmp_u = u;
+  uint8* tmp_v = v;
+  int tmp_y_stride = y_stride;
+  int tmp_u_stride = u_stride;
+  int tmp_v_stride = v_stride;
+  uint8* rotate_buffer = NULL;
+  int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+  if (!y || !u || !v || !sample ||
+      src_width <= 0 || crop_width <= 0 ||
+      src_height == 0 || crop_height == 0) {
+    return -1;
+  }
+  if (src_height < 0) {
+    inv_crop_height = -inv_crop_height;
+  }
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+  // and then rotate the I420 to the final destination buffer.
+  // For in-place conversion, if the destination y is the same as the source
+  // sample, also enable the temporary buffer.
+  if (need_buf) {
+    int y_size = crop_width * abs_crop_height;
+    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+    if (!rotate_buffer) {
+      return 1;  // Out of memory runtime error.
+ } + y = rotate_buffer; + u = y + y_size; + v = u + uv_size; + y_stride = crop_width; + u_stride = v_stride = ((crop_width + 1) / 2); + } + + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = YUY2ToI420(src, aligned_src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_UYVY: + src = sample + (aligned_src_width * crop_y + crop_x) * 2; + r = UYVYToI420(src, aligned_src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBP: + src = sample + (src_width * crop_y + crop_x) * 2; + r = RGB565ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBO: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB1555ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_R444: + src = sample + (src_width * crop_y + crop_x) * 2; + r = ARGB4444ToI420(src, src_width * 2, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_24BG: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RGB24ToI420(src, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RAW: + src = sample + (src_width * crop_y + crop_x) * 3; + r = RAWToI420(src, src_width * 3, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ARGB: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_BGRA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = BGRAToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_ABGR: + src = sample + (src_width * crop_y + crop_x) * 4; + r = ABGRToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_RGBA: + src = sample + (src_width * crop_y + crop_x) * 4; + r = RGBAToI420(src, src_width * 4, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + case FOURCC_I400: + src = sample + src_width * crop_y + crop_x; + r = I400ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // Biplanar formats + case FOURCC_NV12: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + (src_width * src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + r = NV12ToI420Rotate(src, src_width, + src_uv, aligned_src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height, rotation); + break; + case FOURCC_NV21: + src = sample + (src_width * crop_y + crop_x); + src_uv = sample + (src_width * src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + // Call NV12 but with u and v parameters swapped. 
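+      // (Editor's note: NV21 interleaves chroma as VU rather than UV, so
+      // handing the destination u and v pointers to the NV12 routine in
+      // swapped order routes each chroma byte to the correct plane.)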
+ r = NV12ToI420Rotate(src, src_width, + src_uv, aligned_src_width, + y, y_stride, + v, v_stride, + u, u_stride, + crop_width, inv_crop_height, rotation); + break; + case FOURCC_M420: + src = sample + (src_width * crop_y) * 12 / 8 + crop_x; + r = M420ToI420(src, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YU12: + case FOURCC_YV12: { + const uint8* src_y = sample + (src_width * crop_y + crop_x); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + int halfheight = (abs_src_height + 1) / 2; + if (format == FOURCC_YV12) { + src_v = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + (halfwidth * crop_y + crop_x) / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + } + r = I420Rotate(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height, rotation); + break; + } + case FOURCC_I422: + case FOURCC_YV16: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + int halfwidth = (src_width + 1) / 2; + if (format == FOURCC_YV16) { + src_v = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } else { + src_u = sample + src_width * abs_src_height + + halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + + halfwidth * (abs_src_height + crop_y) + crop_x / 2; + } + r = I422ToI420(src_y, src_width, + src_u, halfwidth, + src_v, halfwidth, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I444: + case FOURCC_YV24: { + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u; + const uint8* src_v; + if (format == FOURCC_YV24) { + src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } else { + src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; + src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; + } + r = I444ToI420(src_y, src_width, + src_u, src_width, + src_v, src_width, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + } + case FOURCC_I411: { + int quarterwidth = (src_width + 3) / 4; + const uint8* src_y = sample + src_width * crop_y + crop_x; + const uint8* src_u = sample + src_width * abs_src_height + + quarterwidth * crop_y + crop_x / 4; + const uint8* src_v = sample + src_width * abs_src_height + + quarterwidth * (abs_src_height + crop_y) + crop_x / 4; + r = I411ToI420(src_y, src_width, + src_u, quarterwidth, + src_v, quarterwidth, + y, y_stride, + u, u_stride, + v, v_stride, + crop_width, inv_crop_height); + break; + } +#ifdef HAVE_JPEG + case FOURCC_MJPG: + r = MJPGToI420(sample, sample_size, + y, y_stride, + u, u_stride, + v, v_stride, + src_width, abs_src_height, crop_width, inv_crop_height); + break; +#endif + default: + r = -1; // unknown fourcc - return failure code. 
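+      // (Editor's note: throughout this file, negative return values signal
+      // invalid parameters and positive values signal runtime failures such
+      // as the out-of-memory case above.)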
+  }
+
+  if (need_buf) {
+    if (!r) {
+      r = I420Rotate(y, y_stride,
+                     u, u_stride,
+                     v, v_stride,
+                     tmp_y, tmp_y_stride,
+                     tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride,
+                     crop_width, abs_crop_height, rotation);
+    }
+    free(rotate_buffer);
+  }
+
+  return r;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/cpu_id.cc b/libs/libaom/src/third_party/libyuv/source/cpu_id.cc
new file mode 100644
index 000000000..72f686e3b
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/cpu_id.cc
@@ -0,0 +1,307 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/cpu_id.h"
+
+#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+#include <intrin.h>  // For __cpuidex()
+#endif
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
+    defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h>  // For _xgetbv()
+#endif
+
+#if !defined(__native_client__)
+#include <stdlib.h>  // For getenv()
+#endif
+
+// For ArmCpuCaps() but unittested on all platforms
+#include <stdio.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h"  // For CPU_X86
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
+// Low level cpuid for X86.
+#if (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__pnacl__) && !defined(__CLR_VER)
+LIBYUV_API
+void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
+#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+// Visual C version uses intrinsic or inline x86 assembly.
+#if (_MSC_FULL_VER >= 160040219)
+  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
+#elif defined(_M_IX86)
+  __asm {
+    mov        eax, info_eax
+    mov        ecx, info_ecx
+    mov        edi, cpu_info
+    cpuid
+    mov        [edi], eax
+    mov        [edi + 4], ebx
+    mov        [edi + 8], ecx
+    mov        [edi + 12], edx
+  }
+#else
+  if (info_ecx == 0) {
+    __cpuid((int*)(cpu_info), info_eax);
+  } else {
+    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+  }
+#endif
+// GCC version uses inline x86 assembly.
+#else  // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+  uint32 info_ebx, info_edx;
+  asm volatile (  // NOLINT
+#if defined( __i386__) && defined(__PIC__)
+    // Preserve ebx for fpic 32 bit.
+    "mov %%ebx, %%edi                          \n"
+    "cpuid                                     \n"
+    "xchg %%edi, %%ebx                         \n"
+    : "=D" (info_ebx),
+#else
+    "cpuid                                     \n"
+    : "=b" (info_ebx),
+#endif  //  defined( __i386__) && defined(__PIC__)
+      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+  cpu_info[0] = info_eax;
+  cpu_info[1] = info_ebx;
+  cpu_info[2] = info_ecx;
+  cpu_info[3] = info_edx;
+#endif  // (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+}
+#else  // (defined(_M_IX86) || defined(_M_X64) ...
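+// (Illustrative note, not part of the original source: on targets without
+// cpuid, the stub below reports all-zero registers, so every x86 feature
+// bit stays clear and the portable C code paths are used.)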
+LIBYUV_API +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; +} +#endif + +// TODO(fbarchard): Enable xgetbv when validator supports it. +#if (defined(_M_IX86) || defined(_M_X64) || \ + defined(__i386__) || defined(__x86_64__)) && \ + !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) +#define HAS_XGETBV +// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. +int TestOsSaveYmm() { + uint32 xcr0 = 0u; +#if (defined(_MSC_VER) && !defined(__clang__)) && (_MSC_FULL_VER >= 160040219) + xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. +#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__) + __asm { + xor ecx, ecx // xcr 0 + _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. + mov xcr0, eax + } +#elif defined(__i386__) || defined(__x86_64__) + asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); +#endif // defined(__i386__) || defined(__x86_64__) + return((xcr0 & 6) == 6); // Is ymm saved? +} +#endif // defined(_M_IX86) || defined(_M_X64) .. + +// based on libaom arm_cpudetect.c +// For Arm, but public to allow testing on any CPU +LIBYUV_API SAFEBUFFERS +int ArmCpuCaps(const char* cpuinfo_name) { + char cpuinfo_line[512]; + FILE* f = fopen(cpuinfo_name, "r"); + if (!f) { + // Assume Neon if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasNEON; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "Features", 8) == 0) { + char* p = strstr(cpuinfo_line, " neon"); + if (p && (p[5] == ' ' || p[5] == '\n')) { + fclose(f); + return kCpuHasNEON; + } + // aarch64 uses asimd for Neon. + p = strstr(cpuinfo_line, " asimd"); + if (p && (p[6] == ' ' || p[6] == '\n')) { + fclose(f); + return kCpuHasNEON; + } + } + } + fclose(f); + return 0; +} + +#if defined(__mips__) && defined(__linux__) +static int MipsCpuCaps(const char* search_string) { + char cpuinfo_line[512]; + const char* file_name = "/proc/cpuinfo"; + FILE* f = fopen(file_name, "r"); + if (!f) { + // Assume DSP if /proc/cpuinfo is unavailable. + // This will occur for Chrome sandbox for Pepper or Render process. + return kCpuHasMIPS_DSP; + } + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) { + if (strstr(cpuinfo_line, search_string) != NULL) { + fclose(f); + return kCpuHasMIPS_DSP; + } + } + fclose(f); + return 0; +} +#endif + +// CPU detect function for SIMD instruction sets. +LIBYUV_API +int cpu_info_ = kCpuInit; // cpu_info is not initialized yet. + +// Test environment variable for disabling CPU features. Any non-zero value +// to disable. Zero ignored to make it easy to set the variable on/off. +#if !defined(__native_client__) && !defined(_M_ARM) + +static LIBYUV_BOOL TestEnv(const char* name) { + const char* var = getenv(name); + if (var) { + if (var[0] != '0') { + return LIBYUV_TRUE; + } + } + return LIBYUV_FALSE; +} +#else // nacl does not support getenv(). +static LIBYUV_BOOL TestEnv(const char*) { + return LIBYUV_FALSE; +} +#endif + +LIBYUV_API SAFEBUFFERS +int InitCpuFlags(void) { +#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86) + + uint32 cpu_info0[4] = { 0, 0, 0, 0 }; + uint32 cpu_info1[4] = { 0, 0, 0, 0 }; + uint32 cpu_info7[4] = { 0, 0, 0, 0 }; + CpuId(0, 0, cpu_info0); + CpuId(1, 0, cpu_info1); + if (cpu_info0[0] >= 7) { + CpuId(7, 0, cpu_info7); + } + cpu_info_ = ((cpu_info1[3] & 0x04000000) ? 
kCpuHasSSE2 : 0) | + ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | + ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | + ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + kCpuHasX86; + +#ifdef HAS_XGETBV + if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave + TestOsSaveYmm()) { // Saves YMM. + cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | + kCpuHasAVX; + } +#endif + // Environment variable overrides for testing. + if (TestEnv("LIBYUV_DISABLE_X86")) { + cpu_info_ &= ~kCpuHasX86; + } + if (TestEnv("LIBYUV_DISABLE_SSE2")) { + cpu_info_ &= ~kCpuHasSSE2; + } + if (TestEnv("LIBYUV_DISABLE_SSSE3")) { + cpu_info_ &= ~kCpuHasSSSE3; + } + if (TestEnv("LIBYUV_DISABLE_SSE41")) { + cpu_info_ &= ~kCpuHasSSE41; + } + if (TestEnv("LIBYUV_DISABLE_SSE42")) { + cpu_info_ &= ~kCpuHasSSE42; + } + if (TestEnv("LIBYUV_DISABLE_AVX")) { + cpu_info_ &= ~kCpuHasAVX; + } + if (TestEnv("LIBYUV_DISABLE_AVX2")) { + cpu_info_ &= ~kCpuHasAVX2; + } + if (TestEnv("LIBYUV_DISABLE_ERMS")) { + cpu_info_ &= ~kCpuHasERMS; + } + if (TestEnv("LIBYUV_DISABLE_FMA3")) { + cpu_info_ &= ~kCpuHasFMA3; + } +#endif +#if defined(__mips__) && defined(__linux__) + // Linux mips parse text file for dsp detect. + cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP. +#if defined(__mips_dspr2) + cpu_info_ |= kCpuHasMIPS_DSPR2; +#endif + cpu_info_ |= kCpuHasMIPS; + + if (getenv("LIBYUV_DISABLE_MIPS")) { + cpu_info_ &= ~kCpuHasMIPS; + } + if (getenv("LIBYUV_DISABLE_MIPS_DSP")) { + cpu_info_ &= ~kCpuHasMIPS_DSP; + } + if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) { + cpu_info_ &= ~kCpuHasMIPS_DSPR2; + } +#endif +#if defined(__arm__) || defined(__aarch64__) +// gcc -mfpu=neon defines __ARM_NEON__ +// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. +// For Linux, /proc/cpuinfo can be tested but without that assume Neon. +#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) + cpu_info_ = kCpuHasNEON; +// For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon +// flag in it. +// So for aarch64, neon enabling is hard coded here. +#endif +#if defined(__aarch64__) + cpu_info_ = kCpuHasNEON; +#else + // Linux arm parse text file for neon detect. + cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); +#endif + cpu_info_ |= kCpuHasARM; + if (TestEnv("LIBYUV_DISABLE_NEON")) { + cpu_info_ &= ~kCpuHasNEON; + } +#endif // __arm__ + if (TestEnv("LIBYUV_DISABLE_ASM")) { + cpu_info_ = 0; + } + return cpu_info_; +} + +LIBYUV_API +void MaskCpuFlags(int enable_flags) { + cpu_info_ = InitCpuFlags() & enable_flags; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/mjpeg_decoder.cc b/libs/libaom/src/third_party/libyuv/source/mjpeg_decoder.cc new file mode 100644 index 000000000..75f8a610e --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/mjpeg_decoder.cc @@ -0,0 +1,572 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+
+#if defined(_MSC_VER)
+// disable warning 4324: structure was padded due to __declspec(align())
+#pragma warning(disable:4324)
+#endif
+
+#endif
+struct FILE;  // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h"  // For CopyPlane().
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+struct SetJmpErrorMgr {
+  jpeg_error_mgr base;  // Must be at the top
+  jmp_buf setjmp_buffer;
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+// Methods that are passed to jpeglib.
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
+void init_source(jpeg_decompress_struct* cinfo);
+void skip_input_data(jpeg_decompress_struct* cinfo,
+                     long num_bytes);  // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);
+void ErrorHandler(jpeg_common_struct* cinfo);
+
+MJpegDecoder::MJpegDecoder()
+    : has_scanline_padding_(LIBYUV_FALSE),
+      num_outbufs_(0),
+      scanlines_(NULL),
+      scanlines_sizes_(NULL),
+      databuf_(NULL),
+      databuf_strides_(NULL) {
+  decompress_struct_ = new jpeg_decompress_struct;
+  source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+  error_mgr_ = new SetJmpErrorMgr;
+  decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+  // Override standard exit()-based error handler.
+  error_mgr_->base.error_exit = &ErrorHandler;
+#endif
+  decompress_struct_->client_data = NULL;
+  source_mgr_->init_source = &init_source;
+  source_mgr_->fill_input_buffer = &fill_input_buffer;
+  source_mgr_->skip_input_data = &skip_input_data;
+  source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
+  source_mgr_->term_source = &term_source;
+  jpeg_create_decompress(decompress_struct_);
+  decompress_struct_->src = source_mgr_;
+  buf_vec_.buffers = &buf_;
+  buf_vec_.len = 1;
+}
+
+MJpegDecoder::~MJpegDecoder() {
+  jpeg_destroy_decompress(decompress_struct_);
+  delete decompress_struct_;
+  delete source_mgr_;
+#ifdef HAVE_SETJMP
+  delete error_mgr_;
+#endif
+  DestroyOutputBuffers();
+}
+
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+  if (!ValidateJpeg(src, src_len)) {
+    return LIBYUV_FALSE;
+  }
+
+  buf_.data = src;
+  buf_.len = static_cast<int>(src_len);
+  buf_vec_.pos = 0;
+  decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+  if (setjmp(error_mgr_->setjmp_buffer)) {
+    // We called jpeg_read_header, it experienced an error, and we called
+    // longjmp() and rewound the stack to here. Return error.
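+    // (Editor's sketch of the recovery pattern, not part of the original
+    // source; error_mgr_ is the SetJmpErrorMgr installed in the constructor:
+    //   if (setjmp(error_mgr_->setjmp_buffer)) return LIBYUV_FALSE;
+    //   jpeg_read_header(...);  // on error, ErrorHandler() longjmp()s back
+    // Control then re-enters the setjmp() call above with a nonzero value.)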
+    return LIBYUV_FALSE;
+  }
+#endif
+  if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+    // ERROR: Bad MJPEG header
+    return LIBYUV_FALSE;
+  }
+  AllocOutputBuffers(GetNumComponents());
+  for (int i = 0; i < num_outbufs_; ++i) {
+    int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+    if (scanlines_sizes_[i] != scanlines_size) {
+      if (scanlines_[i]) {
+        delete [] scanlines_[i];
+      }
+      scanlines_[i] = new uint8* [scanlines_size];
+      scanlines_sizes_[i] = scanlines_size;
+    }
+
+    // We allocate padding for the final scanline to pad it up to DCTSIZE
+    // bytes to avoid memory errors, since jpeglib only reads full MCU
+    // blocks. For the preceding scanlines, the padding is not needed/wanted
+    // because the following addresses will already be valid (they are the
+    // initial bytes of the next scanline) and will be overwritten when
+    // jpeglib writes out that next scanline.
+    int databuf_stride = GetComponentStride(i);
+    int databuf_size = scanlines_size * databuf_stride;
+    if (databuf_strides_[i] != databuf_stride) {
+      if (databuf_[i]) {
+        delete [] databuf_[i];
+      }
+      databuf_[i] = new uint8[databuf_size];
+      databuf_strides_[i] = databuf_stride;
+    }
+
+    if (GetComponentStride(i) != GetComponentWidth(i)) {
+      has_scanline_padding_ = LIBYUV_TRUE;
+    }
+  }
+  return LIBYUV_TRUE;
+}
+
+static int DivideAndRoundUp(int numerator, int denominator) {
+  return (numerator + denominator - 1) / denominator;
+}
+
+static int DivideAndRoundDown(int numerator, int denominator) {
+  return numerator / denominator;
+}
+
+// Returns width of the last loaded frame.
+int MJpegDecoder::GetWidth() {
+  return decompress_struct_->image_width;
+}
+
+// Returns height of the last loaded frame.
+int MJpegDecoder::GetHeight() {
+  return decompress_struct_->image_height;
+}
+
+// Returns format of the last loaded frame. The return value is one of the
+// kColorSpace* constants.
+int MJpegDecoder::GetColorSpace() {
+  return decompress_struct_->jpeg_color_space;
+}
+
+// Number of color components in the color space.
+int MJpegDecoder::GetNumComponents() {
+  return decompress_struct_->num_components;
+}
+
+// Sample factors of the n-th component.
+int MJpegDecoder::GetHorizSampFactor(int component) { + return decompress_struct_->comp_info[component].h_samp_factor; +} + +int MJpegDecoder::GetVertSampFactor(int component) { + return decompress_struct_->comp_info[component].v_samp_factor; +} + +int MJpegDecoder::GetHorizSubSampFactor(int component) { + return decompress_struct_->max_h_samp_factor / + GetHorizSampFactor(component); +} + +int MJpegDecoder::GetVertSubSampFactor(int component) { + return decompress_struct_->max_v_samp_factor / + GetVertSampFactor(component); +} + +int MJpegDecoder::GetImageScanlinesPerImcuRow() { + return decompress_struct_->max_v_samp_factor * DCTSIZE; +} + +int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs); +} + +int MJpegDecoder::GetComponentWidth(int component) { + int hs = GetHorizSubSampFactor(component); + return DivideAndRoundUp(GetWidth(), hs); +} + +int MJpegDecoder::GetComponentHeight(int component) { + int vs = GetVertSubSampFactor(component); + return DivideAndRoundUp(GetHeight(), vs); +} + +// Get width in bytes padded out to a multiple of DCTSIZE +int MJpegDecoder::GetComponentStride(int component) { + return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1); +} + +int MJpegDecoder::GetComponentSize(int component) { + return GetComponentWidth(component) * GetComponentHeight(component); +} + +LIBYUV_BOOL MJpegDecoder::UnloadFrame() { +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called jpeg_abort_decompress, it experienced an error, and we called + // longjmp() and rewound the stack to here. Return error. + return LIBYUV_FALSE; + } +#endif + jpeg_abort_decompress(decompress_struct_); + return LIBYUV_TRUE; +} + +// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. +LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( + uint8** planes, int dst_width, int dst_height) { + if (dst_width != GetWidth() || + dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // Compute amount of lines to skip to implement vertical crop. + // TODO(fbarchard): Ensure skip is a multiple of maximum component + // subsample. ie 2 + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + // There is no API to skip lines in the output data, so we read them + // into the temp buffer. + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. Must read it and then + // copy the parts we want into the destination. 
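+      // (Illustrative note, not part of the original source: with 4:2:0
+      // sampling an iMCU row is 2 * DCTSIZE = 16 luma rows; a leftover skip
+      // of 6 therefore copies luma rows 6..15 of the freshly decoded
+      // buffers and, for the half-height chroma planes, chroma rows 3..7.)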
+ if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = + DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) - + rows_to_skip; + int data_to_skip = rows_to_skip * GetComponentStride(i); + CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + lines_left -= (GetImageScanlinesPerImcuRow() - skip); + } + } + + // Read full MCUs but cropped horizontally + for (; lines_left > GetImageScanlinesPerImcuRow(); + lines_left -= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); + CopyPlane(databuf_[i], GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + + if (lines_left > 0) { + // Have a partial iMCU row left over to decode. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + int scanlines_to_copy = + DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); + CopyPlane(databuf_[i], GetComponentStride(i), + planes[i], GetComponentWidth(i), + GetComponentWidth(i), scanlines_to_copy); + planes[i] += scanlines_to_copy * GetComponentWidth(i); + } + } + return FinishDecode(); +} + +LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, + int dst_width, int dst_height) { + if (dst_width != GetWidth() || + dst_height > GetHeight()) { + // ERROR: Bad dimensions + return LIBYUV_FALSE; + } +#ifdef HAVE_SETJMP + if (setjmp(error_mgr_->setjmp_buffer)) { + // We called into jpeglib, it experienced an error sometime during this + // function call, and we called longjmp() and rewound the stack to here. + // Return error. + return LIBYUV_FALSE; + } +#endif + if (!StartDecode()) { + return LIBYUV_FALSE; + } + SetScanlinePointers(databuf_); + int lines_left = dst_height; + // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop + int skip = (GetHeight() - dst_height) / 2; + if (skip > 0) { + while (skip >= GetImageScanlinesPerImcuRow()) { + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + skip -= GetImageScanlinesPerImcuRow(); + } + if (skip > 0) { + // Have a partial iMCU row left over to skip. + if (!DecodeImcuRow()) { + FinishDecode(); + return LIBYUV_FALSE; + } + for (int i = 0; i < num_outbufs_; ++i) { + // TODO(fbarchard): Compute skip to avoid this + assert(skip % GetVertSubSampFactor(i) == 0); + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int data_to_skip = rows_to_skip * GetComponentStride(i); + // Change our own data buffer pointers so we can pass them to the + // callback. + databuf_[i] += data_to_skip; + } + int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip; + (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy); + // Now change them back. 
+      for (int i = 0; i < num_outbufs_; ++i) {
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int data_to_skip = rows_to_skip * GetComponentStride(i);
+        databuf_[i] -= data_to_skip;
+      }
+      lines_left -= scanlines_to_copy;
+    }
+  }
+  // Read full MCUs until we get to the crop point.
+  for (; lines_left >= GetImageScanlinesPerImcuRow();
+         lines_left -= GetImageScanlinesPerImcuRow()) {
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+  }
+  if (lines_left > 0) {
+    // Have a partial iMCU row left over to decode.
+    if (!DecodeImcuRow()) {
+      FinishDecode();
+      return LIBYUV_FALSE;
+    }
+    (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+  }
+  return FinishDecode();
+}
+
+void init_source(j_decompress_ptr cinfo) {
+  fill_input_buffer(cinfo);
+}
+
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+  BufferVector* buf_vec = reinterpret_cast<BufferVector*>(cinfo->client_data);
+  if (buf_vec->pos >= buf_vec->len) {
+    assert(0 && "No more data");
+    // ERROR: No more data
+    return FALSE;
+  }
+  cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
+  cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
+  ++buf_vec->pos;
+  return TRUE;
+}
+
+void skip_input_data(j_decompress_ptr cinfo,
+                     long num_bytes) {  // NOLINT
+  cinfo->src->next_input_byte += num_bytes;
+}
+
+void term_source(j_decompress_ptr cinfo) {
+  // Nothing to do.
+}
+
+#ifdef HAVE_SETJMP
+void ErrorHandler(j_common_ptr cinfo) {
+  // This is called when a jpeglib command experiences an error.
+  // Unfortunately jpeglib's error handling model is not very flexible,
+  // because it expects the error handler to not return--i.e., it wants the
+  // program to terminate. To recover from errors we use setjmp() as shown
+  // in their example. setjmp() is C's implementation for the "call with
+  // current continuation" functionality seen in some functional programming
+  // languages.
+  // A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
+  char buf[JMSG_LENGTH_MAX];
+  (*cinfo->err->format_message)(cinfo, buf);
+  // ERROR: Error in jpeglib: buf
+#endif
+
+  SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
+  // This rewinds the call stack to the point of the corresponding setjmp()
+  // and causes it to return (for a second time) with value 1.
+  longjmp(mgr->setjmp_buffer, 1);
+}
+#endif
+
+void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
+  if (num_outbufs != num_outbufs_) {
+    // We could perhaps optimize this case to resize the output buffers
+    // without necessarily having to delete and recreate each one, but it's
+    // not worth it.
+    DestroyOutputBuffers();
+
+    scanlines_ = new uint8** [num_outbufs];
+    scanlines_sizes_ = new int[num_outbufs];
+    databuf_ = new uint8* [num_outbufs];
+    databuf_strides_ = new int[num_outbufs];
+
+    for (int i = 0; i < num_outbufs; ++i) {
+      scanlines_[i] = NULL;
+      scanlines_sizes_[i] = 0;
+      databuf_[i] = NULL;
+      databuf_strides_[i] = 0;
+    }
+
+    num_outbufs_ = num_outbufs;
+  }
+}
+
+void MJpegDecoder::DestroyOutputBuffers() {
+  for (int i = 0; i < num_outbufs_; ++i) {
+    delete [] scanlines_[i];
+    delete [] databuf_[i];
+  }
+  delete [] scanlines_;
+  delete [] databuf_;
+  delete [] scanlines_sizes_;
+  delete [] databuf_strides_;
+  scanlines_ = NULL;
+  databuf_ = NULL;
+  scanlines_sizes_ = NULL;
+  databuf_strides_ = NULL;
+  num_outbufs_ = 0;
+}
+
+// JDCT_IFAST and do_block_smoothing improve performance substantially.
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
+  decompress_struct_->raw_data_out = TRUE;
+  decompress_struct_->dct_method = JDCT_IFAST;  // JDCT_ISLOW is default
+  decompress_struct_->dither_mode = JDITHER_NONE;
+  // Not applicable to 'raw':
+  decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+  // Only for buffered mode:
+  decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+  // Blocky but fast:
+  decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
+
+  if (!jpeg_start_decompress(decompress_struct_)) {
+    // ERROR: Couldn't start JPEG decompressor
+    return LIBYUV_FALSE;
+  }
+  return LIBYUV_TRUE;
+}
+
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
+  // jpeglib considers it an error if we finish without decoding the whole
+  // image, so we call "abort" rather than "finish".
+  jpeg_abort_decompress(decompress_struct_);
+  return LIBYUV_TRUE;
+}
+
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+  for (int i = 0; i < num_outbufs_; ++i) {
+    uint8* data_i = data[i];
+    for (int j = 0; j < scanlines_sizes_[i]; ++j) {
+      scanlines_[i][j] = data_i;
+      data_i += GetComponentStride(i);
+    }
+  }
+}
+
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+  return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
+      jpeg_read_raw_data(decompress_struct_,
+                         scanlines_,
+                         GetImageScanlinesPerImcuRow());
+}
+
+// Helper function that recognizes the jpeg sub-sampling type.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+    int* subsample_x, int* subsample_y, int number_of_components) {
+  if (number_of_components == 3) {  // Color images.
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+        subsample_x[1] == 2 && subsample_y[1] == 2 &&
+        subsample_x[2] == 2 && subsample_y[2] == 2) {
+      return kJpegYuv420;
+    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+        subsample_x[1] == 2 && subsample_y[1] == 1 &&
+        subsample_x[2] == 2 && subsample_y[2] == 1) {
+      return kJpegYuv422;
+    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+        subsample_x[1] == 1 && subsample_y[1] == 1 &&
+        subsample_x[2] == 1 && subsample_y[2] == 1) {
+      return kJpegYuv444;
+    }
+  } else if (number_of_components == 1) {  // Grey-scale images.
+    if (subsample_x[0] == 1 && subsample_y[0] == 1) {
+      return kJpegYuv400;
+    }
+  }
+  return kJpegUnknown;
+}
+
+}  // namespace libyuv
+#endif  // HAVE_JPEG
+
diff --git a/libs/libaom/src/third_party/libyuv/source/mjpeg_validate.cc b/libs/libaom/src/third_party/libyuv/source/mjpeg_validate.cc
new file mode 100644
index 000000000..8edfbe1e7
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/mjpeg_validate.cc
@@ -0,0 +1,101 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#include <string.h>  // For memchr.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Enable this to try scasb implementation.
+// #define ENABLE_SCASB 1
+
+#ifdef ENABLE_SCASB
+
+// Multiple of 1.
+__declspec(naked)
+const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
+  __asm {
+    mov        edx, edi
+    mov        edi, [esp + 4]   // src
+    mov        eax, [esp + 8]   // val
+    mov        ecx, [esp + 12]  // count
+    repne scasb
+    jne        sr99
+    mov        eax, edi
+    sub        eax, 1
+    mov        edi, edx
+    ret
+
+  sr99:
+    mov        eax, 0
+    mov        edi, edx
+    ret
+  }
+}
+#endif
+
+// Helper function to scan for EOI marker.
+static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+  const uint8* end = sample + sample_size - 1;
+  const uint8* it = sample;
+  for (;;) {
+#ifdef ENABLE_SCASB
+    it = ScanRow_ERMS(it, 0xff, end - it);
+#else
+    it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
+#endif
+    if (it == NULL) {
+      break;
+    }
+    if (it[1] == 0xd9) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    ++it;  // Skip over current 0xff.
+  }
+  // ERROR: Invalid jpeg end code not found. Size sample_size
+  return LIBYUV_FALSE;
+}
+
+// Helper function to validate the jpeg appears intact.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+  const size_t kBackSearchSize = 1024;
+  if (sample_size < 64) {
+    // ERROR: Invalid jpeg size: sample_size
+    return LIBYUV_FALSE;
+  }
+  if (sample[0] != 0xff || sample[1] != 0xd8) {  // Start Of Image
+    // ERROR: Invalid jpeg initial start code
+    return LIBYUV_FALSE;
+  }
+  // Step over SOI marker.
+  sample += 2;
+  sample_size -= 2;
+
+  // Look for the End Of Image (EOI) marker in the end kilobyte of the buffer.
+  if (sample_size > kBackSearchSize) {
+    if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+      return LIBYUV_TRUE;  // Success: Valid jpeg.
+    }
+    // Reduce search size for forward search.
+    sample_size = sample_size - kBackSearchSize + 1;
+  }
+  return ScanEOI(sample, sample_size);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/libs/libaom/src/third_party/libyuv/source/planar_functions.cc b/libs/libaom/src/third_party/libyuv/source/planar_functions.cc
new file mode 100644
index 000000000..b96bd5020
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/planar_functions.cc
@@ -0,0 +1,2555 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/planar_functions.h"
+
+#include <string.h>  // for memset()
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int y;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+  // Nothing to do.
+  if (src_y == dst_y && src_stride_y == dst_stride_y) {
+    return;
+  }
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_AVX)
+  if (TestCpuFlag(kCpuHasAVX)) {
+    CopyRow = IS_ALIGNED(width, 64) ?
CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +LIBYUV_API +void CopyPlane_16(const uint16* src_y, int src_stride_y, + uint16* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C; + // Coalesce rows. + if (src_stride_y == width && + dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_COPYROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_16_SSE2; + } +#endif +#if defined(HAS_COPYROW_16_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_16_ERMS; + } +#endif +#if defined(HAS_COPYROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { + CopyRow = CopyRow_16_NEON; + } +#endif +#if defined(HAS_COPYROW_16_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_16_MIPS; + } +#endif + + // Copy plane + for (y = 0; y < height; ++y) { + CopyRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Copy I422. +LIBYUV_API +int I422Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int halfwidth = (width + 1) >> 1; + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height); + return 0; +} + +// Copy I444. +LIBYUV_API +int I444Copy(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (!src_y || !src_u || !src_v || + !dst_y || !dst_u || !dst_v || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; +} + +// Copy I400. 
+LIBYUV_API +int I400ToI400(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Convert I420 to I400. +LIBYUV_API +int I420ToI400(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + +// Mirror a plane of data. +void MirrorPlane(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + int y; + void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MirrorRow = MirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSE2; + } + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +// TODO(fbarchard): Mirror on mips handle unaligned memory. +#if defined(HAS_MIRRORROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) { + MirrorRow = MirrorRow_MIPS_DSPR2; + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + MirrorRow(src_y, dst_y, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert YUY2 to I422. +LIBYUV_API +int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*YUY2ToUV422Row)(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) = + YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) = + YUY2ToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. 
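+  // (Illustrative note, not part of the original source: when every stride
+  // equals the packed row width, the rows are contiguous in memory, so the
+  // whole image can be processed as a single row of width * height pixels,
+  // avoiding per-row dispatch overhead.)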
+ if (src_stride_yuy2 == width * 2 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToUV422Row = YUY2ToUV422Row_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUV422Row = YUY2ToUV422Row_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (width >= 16) { + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + YUY2ToUV422Row = YUY2ToUV422Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Convert UYVY to I422. +LIBYUV_API +int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + int y; + void (*UYVYToUV422Row)(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) = + UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8* src_uyvy, + uint8* dst_y, int pix) = UYVYToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } + // Coalesce rows. + if (src_stride_uyvy == width * 2 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + width *= height; + height = 1; + src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; + } +#if defined(HAS_UYVYTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + UYVYToUV422Row = UYVYToUV422Row_SSE2; + UYVYToYRow = UYVYToYRow_SSE2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; + UYVYToYRow = UYVYToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + UYVYToUV422Row = UYVYToUV422Row_AVX2; + UYVYToYRow = UYVYToYRow_AVX2; + } + } +#endif +#if defined(HAS_UYVYTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UYVYToYRow = UYVYToYRow_Any_NEON; + if (width >= 16) { + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + UYVYToYRow = UYVYToYRow_NEON; + UYVYToUV422Row = UYVYToUV422Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); + src_uyvy += src_stride_uyvy; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + return 0; +} + +// Mirror I400 with optional flipping +LIBYUV_API +int I400Mirror(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + int width, int height) { + if (!src_y || !dst_y || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
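+  // (Editor's note: the idiom below points src_y at the last row and
+  // negates the stride, so iterating rows 0..height-1 walks the source
+  // bottom-up.)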
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+
+  MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Mirror I420 with optional flipping
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (dst_y) {
+    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+  MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+  return 0;
+}
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+      ARGBMirrorRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMirrorRow = ARGBMirrorRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMirrorRow = ARGBMirrorRow_AVX2;
+    }
+  }
+#endif
+
+  // Mirror plane
+  for (y = 0; y < height; ++y) {
+    ARGBMirrorRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Get a blender that is optimized for the CPU and pixel count.
+// As there are 6 blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend() {
+  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBBlendRow = ARGBBlendRow_SSSE3;
+    return ARGBBlendRow;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBBlendRow = ARGBBlendRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBBlendRow = ARGBBlendRow_NEON;
+  }
+#endif
+  return ARGBBlendRow;
+}
+
+// Alpha Blend 2 ARGB images and store to destination.
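+// A minimal usage sketch (buffer names are illustrative): ARGBBlend
+// expects the foreground in premultiplied (attenuated) form, per the
+// blend formulas documented later in this file, so a straight-alpha
+// source would first pass through ARGBAttenuate:
+//   ARGBAttenuate(fg, fg_stride, fg_pre, fg_stride, width, height);
+//   ARGBBlend(fg_pre, fg_stride, bg, bg_stride, dst, dst_stride,
+//             width, height);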
+LIBYUV_API +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width) = GetARGBBlend(); + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } + + for (y = 0; y < height; ++y) { + ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Multiply 2 ARGB images and store to destination. +LIBYUV_API +int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBMultiplyRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBMULTIPLYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBMULTIPLYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBMultiplyRow = ARGBMultiplyRow_NEON; + } + } +#endif + + // Multiply plane + for (y = 0; y < height; ++y) { + ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Add 2 ARGB images and store to destination. +LIBYUV_API +int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBAddRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
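+  // Note: after the row coalesce below there are two alternative SSE2
+  // blocks. Visual C++ builds take the full-width ARGBAddRow_SSE2 kernel
+  // directly, while other compilers route odd widths through the
+  // ARGBAddRow_Any_SSE2 wrapper first. Each channel adds with unsigned
+  // saturation: dst = min(a + b, 255).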
+ if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__)) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBAddRow = ARGBAddRow_SSE2; + } +#endif +#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__)) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBAddRow = ARGBAddRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBAddRow = ARGBAddRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAddRow = ARGBAddRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBADDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAddRow = ARGBAddRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_NEON; + } + } +#endif + + // Add plane + for (y = 0; y < height; ++y) { + ARGBAddRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Subtract 2 ARGB images and store to destination. +LIBYUV_API +int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, + int width) = ARGBSubtractRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. + if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSUBTRACTROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBSubtractRow = ARGBSubtractRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSUBTRACTROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBSubtractRow = ARGBSubtractRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_NEON; + } + } +#endif + + // Subtract plane + for (y = 0; y < height; ++y) { + ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert I422 to BGRA. +LIBYUV_API +int I422ToBGRA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_bgra, int dst_stride_bgra, + int width, int height) { + int y; + void (*I422ToBGRARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToBGRARow_C; + if (!src_y || !src_u || !src_v || + !dst_bgra || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
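+  // The RGB converters implement the flip on the output side: the
+  // destination pointer starts at the last row and its stride is negated,
+  // which yields the same vertical flip as inverting the source.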
+ if (height < 0) { + height = -height; + dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; + dst_stride_bgra = -dst_stride_bgra; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_bgra == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0; + } +#if defined(HAS_I422TOBGRAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToBGRARow = I422ToBGRARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToBGRARow = I422ToBGRARow_AVX2; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_NEON; + } + } +#endif +#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); + dst_bgra += dst_stride_bgra; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to ABGR. +LIBYUV_API +int I422ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + int y; + void (*I422ToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToABGRRow_C; + if (!src_y || !src_u || !src_v || + !dst_abgr || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; + dst_stride_abgr = -dst_stride_abgr; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0; + } +#if defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_NEON; + } + } +#endif +#if defined(HAS_I422TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToABGRRow = I422ToABGRRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOABGRROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToABGRRow = I422ToABGRRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToABGRRow = I422ToABGRRow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + dst_abgr += dst_stride_abgr; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I422 to RGBA. 
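+// The I422To* converters all share this structure; only the row
+// function's output channel order differs. For reference, the row kernels
+// implement (approximately) the BT.601 studio-swing matrix:
+//   R = 1.164 * (Y - 16) + 1.596 * (V - 128)
+//   G = 1.164 * (Y - 16) - 0.813 * (V - 128) - 0.391 * (U - 128)
+//   B = 1.164 * (Y - 16) + 2.018 * (U - 128)
+// with results clamped to [0, 255]; the exact fixed-point constants live
+// in the row implementations.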
+LIBYUV_API +int I422ToRGBA(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgba, int dst_stride_rgba, + int width, int height) { + int y; + void (*I422ToRGBARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || + !dst_rgba || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; + dst_stride_rgba = -dst_stride_rgba; + } + // Coalesce rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_rgba == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0; + } +#if defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } +#endif +#if defined(HAS_I422TORGBAROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGBARow = I422ToRGBARow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGBAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGBARow = I422ToRGBARow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGBARow = I422ToRGBARow_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width); + dst_rgba += dst_stride_rgba; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert NV12 to RGB565. +LIBYUV_API +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*NV12ToRGB565Row)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToRGB565Row_C; + if (!src_y || !src_uv || !dst_rgb565 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_NV12TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB565Row = NV12ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_NV12TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to RGB565. 
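+// NV21 is NV12 with the chroma bytes swapped (VU instead of UV). As in
+// NV12ToRGB565 above, chroma is subsampled 2x2, so the chroma row pointer
+// advances only after odd luma rows (the y & 1 step below): each VU row
+// is shared by two consecutive luma rows.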
+LIBYUV_API +int NV21ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + int y; + void (*NV21ToRGB565Row)(const uint8* y_buf, + const uint8* src_vu, + uint8* rgb_buf, + int width) = NV21ToRGB565Row_C; + if (!src_y || !src_vu || !dst_rgb565 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_NV21TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB565Row = NV21ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_NV21TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV21ToRGB565Row = NV21ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_NV21TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB565Row = NV21ToRGB565Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +LIBYUV_API +void SetPlane(uint8* dst_y, int dst_stride_y, + int width, int height, + uint32 value) { + int y; + void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C; + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (dst_stride_y == width) { + width *= height; + height = 1; + dst_stride_y = 0; + } +#if defined(HAS_SETROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SetRow = SetRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SetRow = SetRow_NEON; + } + } +#endif +#if defined(HAS_SETROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + SetRow = SetRow_Any_X86; + if (IS_ALIGNED(width, 4)) { + SetRow = SetRow_X86; + } + } +#endif +#if defined(HAS_SETROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + SetRow = SetRow_ERMS; + } +#endif + + // Set plane + for (y = 0; y < height; ++y) { + SetRow(dst_y, value, width); + dst_y += dst_stride_y; + } +} + +// Draw a rectangle into I420 +LIBYUV_API +int I420Rect(uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int x, int y, + int width, int height, + int value_y, int value_u, int value_v) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + uint8* start_y = dst_y + y * dst_stride_y + x; + uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + if (!dst_y || !dst_u || !dst_v || + width <= 0 || height == 0 || + x < 0 || y < 0 || + value_y < 0 || value_y > 255 || + value_u < 0 || value_u > 255 || + value_v < 0 || value_v > 255) { + return -1; + } + + SetPlane(start_y, dst_stride_y, width, height, value_y); + SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); + SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); + return 0; +} + +// Draw a rectangle into ARGB +LIBYUV_API +int ARGBRect(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, + int width, int height, + uint32 value) { + int y; + void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C; + if (!dst_argb || + width 
<= 0 || height == 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  dst_argb += dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+
+#if defined(HAS_ARGBSETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBSetRow = ARGBSetRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSETROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBSetRow = ARGBSetRow_X86;
+  }
+#endif
+
+  // Set plane
+  for (y = 0; y < height; ++y) {
+    ARGBSetRow(dst_argb, value, width);
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+// An unattenuated ARGB alpha blend uses the formula
+// p = a * f + (1 - a) * b
+// where
+//   p is output pixel
+//   f is foreground pixel
+//   b is background pixel
+//   a is alpha value from foreground pixel
+// A preattenuated ARGB alpha blend uses the formula
+// p = f + (1 - a) * b
+// where
+//   f is foreground pixel premultiplied by alpha
+
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height) {
+  int y;
+  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBAttenuateRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height) {
+  int y;
+  void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                             int width) = ARGBUnattenuateRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+ if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBUNATTENUATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBUNATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; + } + } +#endif +// TODO(fbarchard): Neon version. + + for (y = 0; y < height; ++y) { + ARGBUnattenuateRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert ARGB to Grayed ARGB. +LIBYUV_API +int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBGrayRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#endif +#if defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBGrayRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB gray scale. +LIBYUV_API +int ARGBGray(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, + int width, int height) { + int y; + void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, + int width) = ARGBGrayRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } +#if defined(HAS_ARGBGRAYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_SSSE3; + } +#endif +#if defined(HAS_ARGBGRAYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_NEON; + } +#endif + for (y = 0; y < height; ++y) { + ARGBGrayRow(dst, dst, width); + dst += dst_stride_argb; + } + return 0; +} + +// Make a rectangle of ARGB Sepia tone. +LIBYUV_API +int ARGBSepia(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, int width, int height) { + int y; + void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C; + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. 
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSEPIAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBSEPIAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBSepiaRow(dst, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a 4x4 matrix to each ARGB pixel.
+// Note: Normally for shading, but can be used to swizzle or invert.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    const int8* matrix_argb,
+                    int width, int height) {
+  int y;
+  void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
+      const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+  if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a 4x3 matrix to each ARGB pixel.
+// Deprecated.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int dst_x, int dst_y, int width, int height) {
+  SIMD_ALIGNED(int8 matrix_argb[16]);
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+
+  // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
+  matrix_argb[0] = matrix_rgb[0] / 2;
+  matrix_argb[1] = matrix_rgb[1] / 2;
+  matrix_argb[2] = matrix_rgb[2] / 2;
+  matrix_argb[3] = matrix_rgb[3] / 2;
+  matrix_argb[4] = matrix_rgb[4] / 2;
+  matrix_argb[5] = matrix_rgb[5] / 2;
+  matrix_argb[6] = matrix_rgb[6] / 2;
+  matrix_argb[7] = matrix_rgb[7] / 2;
+  matrix_argb[8] = matrix_rgb[8] / 2;
+  matrix_argb[9] = matrix_rgb[9] / 2;
+  matrix_argb[10] = matrix_rgb[10] / 2;
+  matrix_argb[11] = matrix_rgb[11] / 2;
+  matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
+  matrix_argb[15] = 64;  // 1.0
+
+  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
+                         dst, dst_stride_argb,
+                         &matrix_argb[0], width, height);
+}
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                   const uint8* table_argb,
+                   int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+                            int width) = ARGBColorTableRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBColorTableRow = ARGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                  const uint8* table_argb,
+                  int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+                           int width) = RGBColorTableRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    RGBColorTableRow = RGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    RGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// ARGBQuantize is used to posterize art.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// But the low levels implement it efficiently with 3 parameters, and could
+// be used for other high level operations.
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value.
+// The divide is replaced with a fixed point multiply by the reciprocal.
+// Caveat - although SSE2 saturates, the C function does not and should be
+// used with care if doing anything but quantization.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+                 int scale, int interval_size, int interval_offset,
+                 int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) = ARGBQuantizeRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
+      interval_size < 1 || interval_size > 255) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBQUANTIZEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBQUANTIZEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
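+// With the cumulative sum C(x, y) = sum of all pixels at (i <= x, j <= y),
+// any axis-aligned box can be summed in constant time:
+//   sum(x0..x1, y0..y1) = C(x1, y1) - C(x0 - 1, y1)
+//                       - C(x1, y0 - 1) + C(x0 - 1, y0 - 1)
+// which is what lets ARGBBlur average a (2 * radius + 1) wide window per
+// pixel without rescanning it.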
+LIBYUV_API +int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height) { + int y; + void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; + int32* previous_cumsum = dst_cumsum; + if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + } +#endif + memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. + for (y = 0; y < height; ++y) { + ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); + previous_cumsum = dst_cumsum; + dst_cumsum += dst_stride32_cumsum; + src_argb += src_stride_argb; + } + return 0; +} + +// Blur ARGB image. +// Caller should allocate CumulativeSum table of width * height * 16 bytes +// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory +// as the buffer is treated as circular. +LIBYUV_API +int ARGBBlur(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height, int radius) { + int y; + void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum, + const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; + void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C; + int32* cumsum_bot_row; + int32* max_cumsum_bot_row; + int32* cumsum_top_row; + + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (radius > height) { + radius = height; + } + if (radius > (width / 2 - 1)) { + radius = width / 2 - 1; + } + if (radius <= 0) { + return -1; + } +#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; + } +#endif + // Compute enough CumulativeSum for first row to be blurred. After this + // one row of CumulativeSum is updated at a time. + ARGBComputeCumulativeSum(src_argb, src_stride_argb, + dst_cumsum, dst_stride32_cumsum, + width, radius); + + src_argb = src_argb + radius * src_stride_argb; + cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; + + max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum]; + cumsum_top_row = &dst_cumsum[0]; + + for (y = 0; y < height; ++y) { + int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0; + int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1); + int area = radius * (bot_y - top_y); + int boxwidth = radius * 4; + int x; + int n; + + // Increment cumsum_top_row pointer with circular buffer wrap around. + if (top_y) { + cumsum_top_row += dst_stride32_cumsum; + if (cumsum_top_row >= max_cumsum_bot_row) { + cumsum_top_row = dst_cumsum; + } + } + // Increment cumsum_bot_row pointer with circular buffer wrap around and + // then fill in a row of CumulativeSum. 
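+    // Only radius * 2 + 2 cumulative-sum rows are live at once, so both
+    // row pointers wrap back to dst_cumsum when they reach
+    // max_cumsum_bot_row; the table is reused as a circular buffer.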
+ if ((y + radius) < height) { + const int32* prev_cumsum_bot_row = cumsum_bot_row; + cumsum_bot_row += dst_stride32_cumsum; + if (cumsum_bot_row >= max_cumsum_bot_row) { + cumsum_bot_row = dst_cumsum; + } + ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row, + width); + src_argb += src_stride_argb; + } + + // Left clipped. + for (x = 0; x < radius + 1; ++x) { + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], 1); + area += (bot_y - top_y); + boxwidth += 4; + } + + // Middle unclipped. + n = (width - 1) - radius - x + 1; + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], n); + + // Right clipped. + for (x += n; x <= width - 1; ++x) { + area -= (bot_y - top_y); + boxwidth -= 4; + CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, + cumsum_bot_row + (x - radius - 1) * 4, + boxwidth, area, &dst_argb[x * 4], 1); + } + dst_argb += dst_stride_argb; + } + return 0; +} + +// Multiply ARGB image by a specified ARGB value. +LIBYUV_API +int ARGBShade(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, uint32 value) { + int y; + void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, + int width, uint32 value) = ARGBShadeRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHADEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_SSE2; + } +#endif +#if defined(HAS_ARGBSHADEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBShadeRow = ARGBShadeRow_NEON; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShadeRow(src_argb, dst_argb, width, value); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Interpolate 2 ARGB images by specified amount (0 to 255). +LIBYUV_API +int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int interpolation) { + int y; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb0 == width * 4 && + src_stride_argb1 == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; + } +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) && + IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + + for (y = 0; y < height; ++y) { + InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0, + width * 4, interpolation); + src_argb0 += src_stride_argb0; + src_argb1 += src_stride_argb1; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Shuffle ARGB channel order. e.g. BGRA to ARGB. +LIBYUV_API +int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + const uint8* shuffler, int width, int height) { + int y; + void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, + const uint8* shuffler, int pix) = ARGBShuffleRow_C; + if (!src_bgra || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } + // Coalesce rows. + if (src_stride_bgra == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_bgra = dst_stride_argb = 0; + } +#if defined(HAS_ARGBSHUFFLEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); + src_bgra += src_stride_bgra; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Sobel ARGB effect. 
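+// The shared helper below converts each row to luma (full-range Y via
+// ARGBToYJRow), keeps a 3-row window, and feeds it to the Sobel row
+// kernels. For reference, the classic 3x3 operators are
+//   Gx = [-1 0 1; -2 0 2; -1 0 1]   and   Gy = transpose(Gx),
+// and the row functions approximate gradient magnitude as |gx| + |gy|,
+// clamped to 255.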
+static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + void (*SobelRow)(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst, int width)) { + int y; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) = + ARGBToYJRow_C; + void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) = SobelYRow_C; + void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobely, int width) = + SobelXRow_C; + const int kEdge = 16; // Extra pixels at start of row for extrude/align. + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif + +#if defined(HAS_SOBELYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelYRow = SobelYRow_SSE2; + } +#endif +#if defined(HAS_SOBELYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelYRow = SobelYRow_NEON; + } +#endif +#if defined(HAS_SOBELXROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXRow = SobelXRow_SSE2; + } +#endif +#if defined(HAS_SOBELXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXRow = SobelXRow_NEON; + } +#endif + { + // 3 rows with edges before/after. + const int kRowSize = (width + kEdge + 31) & ~31; + align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); + uint8* row_sobelx = rows; + uint8* row_sobely = rows + kRowSize; + uint8* row_y = rows + kRowSize * 2; + + // Convert first row. + uint8* row_y0 = row_y + kEdge; + uint8* row_y1 = row_y0 + kRowSize; + uint8* row_y2 = row_y1 + kRowSize; + ARGBToYJRow(src_argb, row_y0, width); + row_y0[-1] = row_y0[0]; + memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. + ARGBToYJRow(src_argb, row_y1, width); + row_y1[-1] = row_y1[0]; + memset(row_y1 + width, row_y1[width - 1], 16); + memset(row_y2 + width, 0, 16); + + for (y = 0; y < height; ++y) { + // Convert next row of ARGB to G. + if (y < (height - 1)) { + src_argb += src_stride_argb; + } + ARGBToYJRow(src_argb, row_y2, width); + row_y2[-1] = row_y2[0]; + row_y2[width] = row_y2[width - 1]; + + SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width); + SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width); + SobelRow(row_sobelx, row_sobely, dst_argb, width); + + // Cycle thru circular queue of 3 row_y buffers. + { + uint8* row_yt = row_y0; + row_y0 = row_y1; + row_y1 = row_y2; + row_y2 = row_yt; + } + + dst_argb += dst_stride_argb; + } + free_aligned_buffer_64(rows); + } + return 0; +} + +// Sobel ARGB effect. 
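+// ARGBSobelize takes the final row blender as a callback, so the three
+// public variants below (ARGBSobel, ARGBSobelToPlane, ARGBSobelXY) share
+// the luma conversion and row windowing and differ only in how the X and
+// Y gradients are packed into the output.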
+LIBYUV_API +int ARGBSobel(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) = SobelRow_C; +#if defined(HAS_SOBELROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelRow = SobelRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelRow = SobelRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_NEON; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelRow); +} + +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height) { + void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_, int width) = SobelToPlaneRow_C; +#if defined(HAS_SOBELTOPLANEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELTOPLANEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelToPlaneRow = SobelToPlaneRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_NEON; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, + width, height, SobelToPlaneRow); +} + +// SobelXY ARGB effect. +// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. +LIBYUV_API +int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) = SobelXYRow_C; +#if defined(HAS_SOBELXYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXYRow = SobelXYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_SSE2; + } + } +#endif +#if defined(HAS_SOBELXYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXYRow = SobelXYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_NEON; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelXYRow); +} + +// Apply a 4x4 polynomial to each ARGB pixel. +LIBYUV_API +int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const float* poly, + int width, int height) { + int y; + void (*ARGBPolynomialRow)(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) = ARGBPolynomialRow_C; + if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_SSE2; + } +#endif +#if defined(HAS_ARGBPOLYNOMIALROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && + IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBPolynomialRow(src_argb, dst_argb, poly, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Apply a lumacolortable to each ARGB pixel. +LIBYUV_API +int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const uint8* luma, + int width, int height) { + int y; + void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb, + int width, const uint8* luma, const uint32 lumacoeff) = + ARGBLumaColorTableRow_C; + if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { + ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy Alpha from one ARGB image to another. +LIBYUV_API +int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) = + ARGBCopyAlphaRow_C; + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; + } +#endif +#if defined(HAS_ARGBCOPYALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyAlphaRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy a planar Y channel to the alpha channel of a destination ARGB image. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + int y; + void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = + ARGBCopyYToAlphaRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } +#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; + } +#endif +#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; + } +#endif + + for (y = 0; y < height; ++y) { + ARGBCopyYToAlphaRow(src_y, dst_argb, width); + src_y += src_stride_y; + dst_argb += dst_stride_argb; + } + return 0; +} + +LIBYUV_API +int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = + SplitUVRow_C; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src_yuy2 || + !dst_y || !dst_uv || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif + + { + int awidth = halfwidth * 2; + // 2 rows of uv + align_buffer_64(rows, awidth * 2); + + for (y = 0; y < height - 1; y += 2) { + // Split Y from UV. + SplitUVRow(src_yuy2, dst_y, rows, awidth); + SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, + rows + awidth, awidth); + InterpolateRow(dst_uv, rows, awidth, awidth, 128); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + // Split Y from UV. 
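+      // The loop above downsamples 4:2:2 chroma to 4:2:0 by averaging
+      // each pair of UV rows (InterpolateRow with fraction 128 is a
+      // 50/50 blend). For an odd height the last row has no partner, so
+      // its UV samples are written out directly.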
+      SplitUVRow(src_yuy2, dst_y, dst_uv, width);
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+LIBYUV_API
+int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_uv, int dst_stride_uv,
+               int width, int height) {
+  int y;
+  int halfwidth = (width + 1) >> 1;
+  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+      SplitUVRow_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src_uyvy ||
+      !dst_y || !dst_uv ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+#if defined(HAS_SPLITUVROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SplitUVRow = SplitUVRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    SplitUVRow = SplitUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitUVRow = SplitUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitUVRow = SplitUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+
+  {
+    int awidth = halfwidth * 2;
+    // 2 rows of uv
+    align_buffer_64(rows, awidth * 2);
+
+    for (y = 0; y < height - 1; y += 2) {
+      // Split Y from UV.
+      SplitUVRow(src_uyvy, rows, dst_y, awidth);
+      SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth,
+                 dst_y + dst_stride_y, awidth);
+      InterpolateRow(dst_uv, rows, awidth, awidth, 128);
+      src_uyvy += src_stride_uyvy * 2;
+      dst_y += dst_stride_y * 2;
+      dst_uv += dst_stride_uv;
+    }
+    if (height & 1) {
+      // Split UV from Y. UYVY stores chroma first, as in the loop above.
+      SplitUVRow(src_uyvy, dst_uv, dst_y, width);
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/rotate.cc b/libs/libaom/src/third_party/libyuv/source/rotate.cc
new file mode 100644
index 000000000..be3d58920
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/rotate.cc
@@ -0,0 +1,496 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "libyuv/rotate.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/convert.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate_row.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +void TransposePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i = height; + void (*TransposeWx8)(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) = TransposeWx8_C; +#if defined(HAS_TRANSPOSEWX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeWx8 = TransposeWx8_NEON; + } +#endif +#if defined(HAS_TRANSPOSEWX8_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + TransposeWx8 = TransposeWx8_SSSE3; + } + } +#endif +#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + TransposeWx8 = TransposeWx8_Fast_SSSE3; + } + } +#endif +#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + if (IS_ALIGNED(width, 4) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2; + } else { + TransposeWx8 = TransposeWx8_MIPS_DSPR2; + } + } +#endif + + // Work across the source in 8x8 tiles + while (i >= 8) { + TransposeWx8(src, src_stride, dst, dst_stride, width); + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. + i -= 8; + } + + if (i > 0) { + TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); + } +} + +LIBYUV_API +void RotatePlane90(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 90 is a transpose with the source read + // from bottom to top. So set the source pointer to the end + // of the buffer and flip the sign of the source stride. + src += src_stride * (height - 1); + src_stride = -src_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane270(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Rotate by 270 is a transpose with the destination written + // from bottom to top. So set the destination pointer to the end + // of the buffer and flip the sign of the destination stride. + dst += dst_stride * (width - 1); + dst_stride = -dst_stride; + TransposePlane(src, src_stride, dst, dst_stride, width, height); +} + +LIBYUV_API +void RotatePlane180(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + // Swap first and last row and mirror the content. Uses a temporary row. 
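+  // The loop below walks a top pointer and a bottom pointer toward the
+  // middle: mirror the top row into the scratch buffer, mirror the bottom
+  // row into the top slot, then copy the scratch row into the bottom slot.
+  // Routing the top row through the scratch buffer keeps the swap correct
+  // even if src and dst refer to the same plane.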
+ align_buffer_64(row, width); + const uint8* src_bot = src + src_stride * (height - 1); + uint8* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MirrorRow = MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_NEON; + } + } +#endif +#if defined(HAS_MIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MirrorRow = MirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSE2; + } + } +#endif +#if defined(HAS_MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MirrorRow = MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MirrorRow = MirrorRow_SSSE3; + } + } +#endif +#if defined(HAS_MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MirrorRow = MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + MirrorRow = MirrorRow_AVX2; + } + } +#endif +// TODO(fbarchard): Mirror on mips handle unaligned memory. +#if defined(HAS_MIRRORROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { + MirrorRow = MirrorRow_MIPS_DSPR2; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + MirrorRow(src, row, width); // Mirror first row into a buffer + src += src_stride; + MirrorRow(src_bot, dst, width); // Mirror last row into first row + dst += dst_stride; + CopyRow(row, dst_bot, width); // Copy first mirrored row into last + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +LIBYUV_API +void TransposeUV(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i = height; + void (*TransposeUVWx8)(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) = TransposeUVWx8_C; +#if defined(HAS_TRANSPOSEUVWX8_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeUVWx8 = TransposeUVWx8_NEON; + } +#endif +#if defined(HAS_TRANSPOSEUVWX8_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { + TransposeUVWx8 = TransposeUVWx8_SSE2; + } +#endif +#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; + } +#endif + + // Work through the source in 8x8 tiles. + while (i >= 8) { + TransposeUVWx8(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width); + src += 8 * src_stride; // Go down 8 rows. + dst_a += 8; // Move over 8 columns. + dst_b += 8; // Move over 8 columns. 
+ i -= 8; + } + + if (i > 0) { + TransposeUVWxH_C(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, i); + } +} + +LIBYUV_API +void RotateUV90(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + src += src_stride * (height - 1); + src_stride = -src_stride; + + TransposeUV(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, height); +} + +LIBYUV_API +void RotateUV270(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + dst_a += dst_stride_a * (width - 1); + dst_b += dst_stride_b * (width - 1); + dst_stride_a = -dst_stride_a; + dst_stride_b = -dst_stride_b; + + TransposeUV(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, height); +} + +// Rotate 180 is a horizontal and vertical flip. +LIBYUV_API +void RotateUV180(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i; + void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = + MirrorUVRow_C; +#if defined(HAS_MIRRORUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + MirrorRowUV = MirrorUVRow_NEON; + } +#endif +#if defined(HAS_MIRRORROW_UV_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { + MirrorRowUV = MirrorUVRow_SSSE3; + } +#endif +#if defined(HAS_MIRRORUVROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + MirrorRowUV = MirrorUVRow_MIPS_DSPR2; + } +#endif + + dst_a += dst_stride_a * (height - 1); + dst_b += dst_stride_b * (height - 1); + + for (i = 0; i < height; ++i) { + MirrorRowUV(src, dst_a, dst_b, width); + src += src_stride; + dst_a -= dst_stride_a; + dst_b -= dst_stride_b; + } +} + +LIBYUV_API +int RotatePlane(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height, + enum RotationMode mode) { + if (!src || width <= 0 || height == 0 || !dst) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src = src + (height - 1) * src_stride; + src_stride = -src_stride; + } + + switch (mode) { + case kRotate0: + // copy frame + CopyPlane(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate90: + RotatePlane90(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate270: + RotatePlane270(src, src_stride, + dst, dst_stride, + width, height); + return 0; + case kRotate180: + RotatePlane180(src, src_stride, + dst, dst_stride, + width, height); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int I420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. 
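+  // (For example, height = -480 becomes height = 480 with each source
+  // pointer advanced to its last row and each source stride negated, so the
+  // bottom-up planes are read top-down. halfheight is recomputed from the
+  // now-positive height for the 2x2-subsampled chroma planes.)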
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case kRotate0: + // copy frame + return I420Copy(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane90(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane90(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane270(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane270(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotatePlane180(src_u, src_stride_u, + dst_u, dst_stride_u, + halfwidth, halfheight); + RotatePlane180(src_v, src_stride_v, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +LIBYUV_API +int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height, + enum RotationMode mode) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_uv || width <= 0 || height == 0 || + !dst_y || !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_uv = src_uv + (halfheight - 1) * src_stride_uv; + src_stride_y = -src_stride_y; + src_stride_uv = -src_stride_uv; + } + + switch (mode) { + case kRotate0: + // copy frame + return NV12ToI420(src_y, src_stride_y, + src_uv, src_stride_uv, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height); + case kRotate90: + RotatePlane90(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV90(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate270: + RotatePlane270(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV270(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + case kRotate180: + RotatePlane180(src_y, src_stride_y, + dst_y, dst_stride_y, + width, height); + RotateUV180(src_uv, src_stride_uv, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + halfwidth, halfheight); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_any.cc b/libs/libaom/src/third_party/libyuv/source/rotate_any.cc new file mode 100644 index 000000000..4d6eb34e1 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_any.cc @@ -0,0 +1,55 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" +#include "libyuv/rotate_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK) \ + void NAMEANY(const uint8* src, int src_stride, \ + uint8* dst, int dst_stride, int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ + } \ + TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ + } + +#ifdef HAS_TRANSPOSEWX8_NEON +TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7) +#endif +#ifdef HAS_TRANSPOSEWX8_SSSE3 +TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7) +#endif +#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 +TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15) +#endif +#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2 +TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7) +#endif + +#undef TANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + + + + + diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_argb.cc b/libs/libaom/src/third_party/libyuv/source/rotate_argb.cc new file mode 100644 index 000000000..787c0ad1b --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_argb.cc @@ -0,0 +1,205 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/rotate.h" + +#include "libyuv/cpu_id.h" +#include "libyuv/convert.h" +#include "libyuv/planar_functions.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// ARGBScale has a function to copy pixels to a row, striding each source +// pixel by a constant. 
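+// As a scalar sketch of that row function's contract (the C fallback
+// declared below is assumed to behave this way), copying one source column
+// is an "even downscale" with a step of src_stride / 4 pixels:
+//
+//   for (int j = 0; j < dst_width; ++j) {
+//     ((uint32*)dst_ptr)[j] = ((const uint32*)src_ptr)[j * src_stepx];
+//   }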
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || \
+    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+                               int src_stepx, uint8* dst_ptr, int dst_width);
+#endif
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+                               int src_stepx, uint8* dst_ptr, int dst_width);
+#endif
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+                            int src_stepx, uint8* dst_ptr, int dst_width);
+
+static void ARGBTranspose(const uint8* src, int src_stride,
+                          uint8* dst, int dst_stride, int width, int height) {
+  int i;
+  int src_pixel_step = src_stride >> 2;
+  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) {  // Width of dest.
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) {  // Width of dest.
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+  }
+#endif
+
+  for (i = 0; i < width; ++i) {  // column of source to row of dest.
+    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
+    dst += dst_stride;
+    src += 4;
+  }
+}
+
+void ARGBRotate90(const uint8* src, int src_stride,
+                  uint8* dst, int dst_stride, int width, int height) {
+  // Rotate by 90 is an ARGBTranspose with the source read
+  // from bottom to top. So set the source pointer to the end
+  // of the buffer and flip the sign of the source stride.
+  src += src_stride * (height - 1);
+  src_stride = -src_stride;
+  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride, int width, int height) {
+  // Rotate by 270 is an ARGBTranspose with the destination written
+  // from bottom to top. So set the destination pointer to the end
+  // of the buffer and flip the sign of the destination stride.
+  dst += dst_stride * (width - 1);
+  dst_stride = -dst_stride;
+  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate180(const uint8* src, int src_stride,
+                   uint8* dst, int dst_stride, int width, int height) {
+  // Swap first and last row and mirror the content. Uses a temporary row.
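+  // Same top/bottom swap as RotatePlane180, but with 4 bytes per pixel:
+  // ARGBMirrorRow counts pixels while the scratch buffer and the CopyRow
+  // width are in bytes (width * 4).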
+ align_buffer_64(row, width * 4); + const uint8* src_bot = src + src_stride * (height - 1); + uint8* dst_bot = dst + dst_stride * (height - 1); + int half_height = (height + 1) >> 1; + int y; + void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + ARGBMirrorRow_C; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_ARGBMIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBMirrorRow = ARGBMirrorRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_NEON; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBMirrorRow = ARGBMirrorRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBMIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBMirrorRow = ARGBMirrorRow_AVX2; + } + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX) + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX; + } +#endif +#if defined(HAS_COPYROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + CopyRow = CopyRow_ERMS; + } +#endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif + + // Odd height will harmlessly mirror the middle row twice. + for (y = 0; y < half_height; ++y) { + ARGBMirrorRow(src, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row + CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last + src += src_stride; + dst += dst_stride; + src_bot -= src_stride; + dst_bot -= dst_stride; + } + free_aligned_buffer_64(row); +} + +LIBYUV_API +int ARGBRotate(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, int width, int height, + enum RotationMode mode) { + if (!src_argb || width <= 0 || height == 0 || !dst_argb) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + + switch (mode) { + case kRotate0: + // copy frame + return ARGBCopy(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + case kRotate90: + ARGBRotate90(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + case kRotate270: + ARGBRotate270(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + case kRotate180: + ARGBRotate180(src_argb, src_stride_argb, + dst_argb, dst_stride_argb, + width, height); + return 0; + default: + break; + } + return -1; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_common.cc b/libs/libaom/src/third_party/libyuv/source/rotate_common.cc new file mode 100644 index 000000000..b33a9a0c6 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_common.cc @@ -0,0 +1,92 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +void TransposeWx8_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +void TransposeUVWx8_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +void TransposeWxH_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +void TransposeUVWxH_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i; + for (i = 0; i < width * 2; i += 2) { + int j; + for (j = 0; j < height; ++j) { + dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; + dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; + } + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_gcc.cc b/libs/libaom/src/third_party/libyuv/source/rotate_gcc.cc new file mode 100644 index 000000000..fd385bcd3 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_gcc.cc @@ -0,0 +1,493 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. 
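+// The transpose kernels below work on 8x8 byte tiles in three rounds of
+// interleaves: punpcklbw pairs bytes from adjacent rows, punpcklwd pairs the
+// resulting 16-bit units, and punpckldq pairs 32-bit units, doubling the run
+// of consecutive output bytes each round until whole transposed rows emerge;
+// palignr peels the upper half out of each register between rounds.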
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) +void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__) +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); + asm ( + DECLARE_FUNCTION(TransposeUVWx8_SSE2) + "push %ebx \n" + "push %esi \n" + "push %edi \n" + "push %ebp \n" + "mov 0x14(%esp),%eax \n" + "mov 0x18(%esp),%edi \n" + "mov 0x1c(%esp),%edx \n" + "mov 0x20(%esp),%esi \n" + "mov 0x24(%esp),%ebx \n" + "mov 0x28(%esp),%ebp \n" + "mov %esp,%ecx \n" + "sub $0x14,%esp \n" + "and $0xfffffff0,%esp \n" + "mov %ecx,0x10(%esp) \n" + "mov 0x2c(%ecx),%ecx \n" + +"1: \n" + "movdqu (%eax),%xmm0 \n" + "movdqu (%eax,%edi,1),%xmm1 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm0,%xmm7 \n" + "punpcklbw %xmm1,%xmm0 \n" + "punpckhbw %xmm1,%xmm7 \n" + "movdqa %xmm7,%xmm1 \n" + "movdqu (%eax),%xmm2 \n" + "movdqu (%eax,%edi,1),%xmm3 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm2,%xmm7 \n" + "punpcklbw %xmm3,%xmm2 \n" + "punpckhbw 
%xmm3,%xmm7 \n" + "movdqa %xmm7,%xmm3 \n" + "movdqu (%eax),%xmm4 \n" + "movdqu (%eax,%edi,1),%xmm5 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm4,%xmm7 \n" + "punpcklbw %xmm5,%xmm4 \n" + "punpckhbw %xmm5,%xmm7 \n" + "movdqa %xmm7,%xmm5 \n" + "movdqu (%eax),%xmm6 \n" + "movdqu (%eax,%edi,1),%xmm7 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqu %xmm5,(%esp) \n" + "neg %edi \n" + "movdqa %xmm6,%xmm5 \n" + "punpcklbw %xmm7,%xmm6 \n" + "punpckhbw %xmm7,%xmm5 \n" + "movdqa %xmm5,%xmm7 \n" + "lea 0x10(%eax,%edi,8),%eax \n" + "neg %edi \n" + "movdqa %xmm0,%xmm5 \n" + "punpcklwd %xmm2,%xmm0 \n" + "punpckhwd %xmm2,%xmm5 \n" + "movdqa %xmm5,%xmm2 \n" + "movdqa %xmm1,%xmm5 \n" + "punpcklwd %xmm3,%xmm1 \n" + "punpckhwd %xmm3,%xmm5 \n" + "movdqa %xmm5,%xmm3 \n" + "movdqa %xmm4,%xmm5 \n" + "punpcklwd %xmm6,%xmm4 \n" + "punpckhwd %xmm6,%xmm5 \n" + "movdqa %xmm5,%xmm6 \n" + "movdqu (%esp),%xmm5 \n" + "movdqu %xmm6,(%esp) \n" + "movdqa %xmm5,%xmm6 \n" + "punpcklwd %xmm7,%xmm5 \n" + "punpckhwd %xmm7,%xmm6 \n" + "movdqa %xmm6,%xmm7 \n" + "movdqa %xmm0,%xmm6 \n" + "punpckldq %xmm4,%xmm0 \n" + "punpckhdq %xmm4,%xmm6 \n" + "movdqa %xmm6,%xmm4 \n" + "movdqu (%esp),%xmm6 \n" + "movlpd %xmm0,(%edx) \n" + "movhpd %xmm0,(%ebx) \n" + "movlpd %xmm4,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm4,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm2,%xmm0 \n" + "punpckldq %xmm6,%xmm2 \n" + "movlpd %xmm2,(%edx) \n" + "movhpd %xmm2,(%ebx) \n" + "punpckhdq %xmm6,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm1,%xmm0 \n" + "punpckldq %xmm5,%xmm1 \n" + "movlpd %xmm1,(%edx) \n" + "movhpd %xmm1,(%ebx) \n" + "punpckhdq %xmm5,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm3,%xmm0 \n" + "punpckldq %xmm7,%xmm3 \n" + "movlpd %xmm3,(%edx) \n" + "movhpd %xmm3,(%ebx) \n" + "punpckhdq %xmm7,%xmm0 \n" + "sub $0x8,%ecx \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "jg 1b \n" + "mov 0x10(%esp),%esp \n" + "pop %ebp \n" + "pop %edi \n" + "pop %esi \n" + "pop %ebx \n" +#if defined(__native_client__) + "pop %ecx \n" + "and $0xffffffe0,%ecx \n" + "jmp *%ecx \n" +#else + "ret \n" +#endif +); +#endif +#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ + defined(__x86_64__) +// 64 bit version has enough registers to do 16x8 to 8x16 at a time. +void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. 
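+    // With xmm8-xmm15 available on x86-64, each 16-byte load keeps both its
+    // punpcklbw half (xmm0-xmm7) and its punpckhbw half (xmm8-xmm15), so one
+    // pass turns a 16-wide by 8-tall tile into two 8x8 transposed outputs.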
+ ".p2align 2 \n" +"1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" +); +} + +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" +"1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9" +); +} +#endif +#endif + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_mips.cc b/libs/libaom/src/third_party/libyuv/source/rotate_mips.cc new file mode 100644 index 000000000..efe6bd909 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_mips.cc @@ -0,0 +1,484 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_MIPS) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "sw $s0, 0(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "sw $s1, 4(%[dst]) \n" + "bnez %[width], 1b \n" + " addu %[dst], %[dst], %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "swr $s0, 0(%[dst]) \n" + "swl $s0, 3(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "swr $s1, 4(%[dst]) \n" + "swl $s1, 7(%[dst]) \n" + "bnez %[width], 11b \n" + "addu %[dst], %[dst], %[dst_stride] \n" + "2: \n" + ".set pop \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1" + ); +} + +void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + __asm__ __volatile__ ( + ".set noat \n" + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + + "srl $AT, %[width], 0x2 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, $t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, 
$t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, %[dst_stride] \n" + + "sw $s4, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $s6, 0($s0) \n" + "sw $t8, 4($s0) \n" + "sw $s5, 0($s1) \n" + "sw $t1, 4($s1) \n" + "sw $s7, 0($s2) \n" + "sw $t9, 4($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 1b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, $t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, %[dst_stride] \n" + + "swr $s4, 0(%[dst]) \n" + "swl $s4, 3(%[dst]) \n" + "swr $t0, 4(%[dst]) \n" + "swl $t0, 7(%[dst]) \n" + "swr $s6, 0($s0) \n" + "swl $s6, 3($s0) \n" 
+ "swr $t8, 4($s0) \n" + "swl $t8, 7($s0) \n" + "swr $s5, 0($s1) \n" + "swl $s5, 3($s1) \n" + "swr $t1, 4($s1) \n" + "swl $t1, 7($s1) \n" + "swr $s7, 0($s2) \n" + "swl $s7, 3($s2) \n" + "swr $t9, 4($s2) \n" + "swl $t9, 7($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 11b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "2: \n" + ".set pop \n" + ".set at \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7" + ); +} + +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "subu $t7, $t9, %[src_stride] \n" + "srl $t1, %[width], 1 \n" + +// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b + "andi $t0, %[dst_a], 0x3 \n" + "andi $t8, %[dst_b], 0x3 \n" + "or $t0, $t0, $t8 \n" + "andi $t8, %[dst_stride_a], 0x3 \n" + "andi $s5, %[dst_stride_b], 0x3 \n" + "or $t8, $t8, $s5 \n" + "or $t0, $t0, $t8 \n" + "bnez $t0, 11f \n" + " nop \n" +// dst + dst_stride word aligned (both, a & b dst addresses) + "1: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + "addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "sw $s3, 0($s5) \n" + "sw $s4, 0($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "sw $s3, 0(%[dst_a]) \n" + "sw $s4, 0(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + "sw $s3, 4($s5) \n" + "sw $s4, 4($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "sw $s3, 4(%[dst_a]) \n" + "sw $s4, 4(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 1b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + "b 2f \n" + " nop \n" + +// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned + "11: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + 
"addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "swr $s3, 0($s5) \n" + "swl $s3, 3($s5) \n" + "swr $s4, 0($s6) \n" + "swl $s4, 3($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "swr $s3, 0(%[dst_a]) \n" + "swl $s3, 3(%[dst_a]) \n" + "swr $s4, 0(%[dst_b]) \n" + "swl $s4, 3(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + + "swr $s3, 4($s5) \n" + "swl $s3, 7($s5) \n" + "swr $s4, 4($s6) \n" + "swl $s4, 7($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "swr $s3, 4(%[dst_a]) \n" + "swl $s3, 7(%[dst_a]) \n" + "swr $s4, 4(%[dst_b]) \n" + "swl $s4, 7(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 11b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + + "2: \n" + ".set pop \n" + : [src] "+r" (src), + [dst_a] "+r" (dst_a), + [dst_b] "+r" (dst_b), + [width] "+r" (width), + [src_stride] "+r" (src_stride) + : [dst_stride_a] "r" (dst_stride_a), + [dst_stride_b] "r" (dst_stride_b) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_neon.cc b/libs/libaom/src/third_party/libyuv/source/rotate_neon.cc new file mode 100644 index 000000000..76043b3b3 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_neon.cc @@ -0,0 +1,535 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +static uvec8 kVTbl4x4Transpose = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + const uint8* src_temp = NULL; + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %5, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "vld1.8 {d0}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d1}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d3}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d4}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d5}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d6}, [%0], %2 \n" + MEMACCESS(0) + "vld1.8 {d7}, [%0] \n" + + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" + + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.8 {d1}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d3}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d5}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d7}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d6}, [%0] \n" + + "add %1, #8 \n" // src += 8 + "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride + "subs %5, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %5, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" + + "cmp %5, #4 \n" + "blt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.32 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d2[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d2[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d3[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.32 {d3[1]}, [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(6) + "vld1.8 {q3}, [%6] \n" + + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
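+    // kVTbl4x4Transpose gathers input byte (col * 4 + row) into output byte
+    // (row * 4 + col), so each pair of vtbl.8 lookups above transposes one
+    // 4x4 byte tile of the 4x8 residual block.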
+ MEMACCESS(0) + "vst1.32 {d4[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d4[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d5[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d5[1]}, [%0] \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "vst1.32 {d0[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d0[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d1[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d1[1]}, [%0] \n" + + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.16 {d0[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d0[3]}, [%0], %2 \n" + MEMACCESS(0) + "vld1.16 {d1[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.64 {d1}, [%0] \n" + + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "vld1.8 {d0[0]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[1]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[2]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[3]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[4]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[5]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[6]}, [%1], %2 \n" + MEMACCESS(1) + "vld1.8 {d0[7]}, [%1] \n" + + MEMACCESS(3) + "vst1.64 {d0}, [%3] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3" + ); +} + +static uvec8 kVTbl4x4TransposeDi = + { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; + +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + const uint8* src_temp = NULL; + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %7, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "vld2.8 {d0, d1}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d2, d3}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d4, d5}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d6, d7}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d16, d17}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d18, d19}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d20, d21}, [%0], %2 \n" + MEMACCESS(0) + "vld2.8 {d22, d23}, [%0] \n" + + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" + + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" + + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.8 {d2}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d6}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d4}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d18}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d16}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d22}, [%0], %4 \n" + MEMACCESS(0) + "vst1.8 {d20}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.8 {d3}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d1}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d7}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d5}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d19}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d17}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d23}, [%0], %6 \n" + MEMACCESS(0) + "vst1.8 {d21}, [%0] \n" + + "add %1, #8*2 \n" // src += 8*2 + "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %7, #8 \n" // w -= 8 + "bge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %7, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" + + "cmp %7, #4 \n" + "blt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "vld1.64 {d0}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d1}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d2}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d3}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d4}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d5}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d6}, [%0], %2 \n" + MEMACCESS(0) + "vld1.64 {d7}, [%0] \n" + + MEMACCESS(8) + "vld1.8 {q15}, [%8] \n" + + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" + + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.32 {d16[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d16[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d17[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d17[1]}, [%0], %4 \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "vst1.32 {d20[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d20[1]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d21[0]}, [%0], %4 \n" + MEMACCESS(0) + "vst1.32 {d21[1]}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.32 {d18[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d18[1]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d19[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d19[1]}, [%0], %6 \n" + + "add %0, %5, #4 \n" + MEMACCESS(0) + "vst1.32 {d22[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d22[1]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d23[0]}, [%0], %6 \n" + MEMACCESS(0) + "vst1.32 {d23[1]}, [%0] \n" + + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b + "subs %7, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + MEMACCESS(0) + "vld2.16 {d1[3], d3[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "vst1.64 {d0}, [%0], %4 \n" + MEMACCESS(0) + "vst1.64 {d2}, [%0] \n" + + "mov %0, %5 \n" + + MEMACCESS(0) + "vst1.64 {d1}, [%0], %6 \n" + MEMACCESS(0) + "vst1.64 {d3}, [%0] \n" + + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + MEMACCESS(1) + "vld2.8 {d0[7], d1[7]}, [%1] \n" + + MEMACCESS(3) + "vst1.64 {d0}, [%3] 
\n" + MEMACCESS(5) + "vst1.64 {d1}, [%5] \n" + + "4: \n" + + : "+r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", + "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_neon64.cc b/libs/libaom/src/third_party/libyuv/source/rotate_neon64.cc new file mode 100644 index 000000000..f52c082b3 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/rotate_neon64.cc @@ -0,0 +1,543 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +static uvec8 kVTbl4x4Transpose = + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + const uint8* src_temp = NULL; + int64 width64 = (int64) width; // Work around clang 3.4 warning. + asm volatile ( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %3, %3, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "ld1 {v0.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v3.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v4.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v5.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v6.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v7.8b}, [%0] \n" + + "trn2 v16.8b, v0.8b, v1.8b \n" + "trn1 v17.8b, v0.8b, v1.8b \n" + "trn2 v18.8b, v2.8b, v3.8b \n" + "trn1 v19.8b, v2.8b, v3.8b \n" + "trn2 v20.8b, v4.8b, v5.8b \n" + "trn1 v21.8b, v4.8b, v5.8b \n" + "trn2 v22.8b, v6.8b, v7.8b \n" + "trn1 v23.8b, v6.8b, v7.8b \n" + + "trn2 v3.4h, v17.4h, v19.4h \n" + "trn1 v1.4h, v17.4h, v19.4h \n" + "trn2 v2.4h, v16.4h, v18.4h \n" + "trn1 v0.4h, v16.4h, v18.4h \n" + "trn2 v7.4h, v21.4h, v23.4h \n" + "trn1 v5.4h, v21.4h, v23.4h \n" + "trn2 v6.4h, v20.4h, v22.4h \n" + "trn1 v4.4h, v20.4h, v22.4h \n" + + "trn2 v21.2s, v1.2s, v5.2s \n" + "trn1 v17.2s, v1.2s, v5.2s \n" + "trn2 v20.2s, v0.2s, v4.2s \n" + "trn1 v16.2s, v0.2s, v4.2s \n" + "trn2 v23.2s, v3.2s, v7.2s \n" + "trn1 v19.2s, v3.2s, v7.2s \n" + "trn2 v22.2s, v2.2s, v6.2s \n" + "trn1 v18.2s, v2.2s, v6.2s \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v17.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v19.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v21.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v20.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v23.8b}, [%0], %6 \n" + MEMACCESS(0) + "st1 {v22.8b}, [%0] \n" + + "add %1, %1, #8 \n" // src += 8 + "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride + "subs %3, %3, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %3, %3, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %3, #2 \n" + "b.lt 3f \n" + + "cmp %3, #4 \n" + "b.lt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "ld1 {v0.s}[0], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.s}[1], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.s}[2], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v0.s}[3], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[0], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[1], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[2], [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.s}[3], [%0] \n" + + "mov %0, %2 \n" + + MEMACCESS(4) + "ld1 {v2.16b}, [%4] \n" + + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
+    MEMACCESS(0)
+    "st1 {v3.s}[0], [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[1], [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[2], [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1 {v3.s}[3], [%0]                       \n"
+
+    "add         %0, %2, #4                    \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[0], [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[1], [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[2], [%0], %6                   \n"
+    MEMACCESS(0)
+    "st1 {v0.s}[3], [%0]                       \n"
+
+    "add         %1, %1, #4                    \n"  // src += 4
+    "add         %2, %2, %6, lsl #2            \n"  // dst += 4 * dst_stride
+    "subs        %3, %3, #4                    \n"  // w   -= 4
+    "b.eq        4f                            \n"
+
+    // some residual, check to see if it includes a 2x8 block,
+    // or less
+    "cmp         %3, #2                        \n"
+    "b.lt        3f                            \n"
+
+    // 2x8 block
+    "2:                                        \n"
+    "mov         %0, %1                        \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[0], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[0], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[1], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[1], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[2], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[2], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v0.h}[3], [%0], %5               \n"
+    MEMACCESS(0)
+    "ld1     {v1.h}[3], [%0]                   \n"
+
+    "trn2    v2.8b, v0.8b, v1.8b               \n"
+    "trn1    v3.8b, v0.8b, v1.8b               \n"
+
+    "mov         %0, %2                        \n"
+
+    MEMACCESS(0)
+    "st1     {v3.8b}, [%0], %6                 \n"
+    MEMACCESS(0)
+    "st1     {v2.8b}, [%0]                     \n"
+
+    "add         %1, %1, #2                    \n"  // src += 2
+    "add         %2, %2, %6, lsl #1            \n"  // dst += 2 * dst_stride
+    "subs        %3, %3, #2                    \n"  // w   -= 2
+    "b.eq        4f                            \n"
+
+    // 1x8 block
+    "3:                                        \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[0], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[1], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[2], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[3], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[4], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[5], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[6], [%1], %5           \n"
+    MEMACCESS(1)
+    "ld1         {v0.b}[7], [%1]               \n"
+
+    MEMACCESS(2)
+    "st1         {v0.8b}, [%2]                 \n"
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),                             // %0
+      "+r"(src),                                  // %1
+      "+r"(dst),                                  // %2
+      "+r"(width64)                               // %3
+    : "r"(&kVTbl4x4Transpose),                    // %4
+      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
+    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+  );
+}
+
+static uint8 kVTbl4x4TransposeDi[32] =
+  { 0, 16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,
+    1, 17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+                         uint8* dst_a, int dst_stride_a,
+                         uint8* dst_b, int dst_stride_b,
+                         int width) {
+  const uint8* src_temp = NULL;
+  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
+  asm volatile (
+    // loops are on blocks of 8. loop will stop when
+    // counter gets to or below 0. starting the counter
+    // at w-8 allows for this
+    "sub       %4, %4, #8                      \n"
+
+    // handle 8x8 blocks.
this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v2.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v3.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v4.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v5.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v6.16b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v7.16b}, [%0] \n" + + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" + + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" + + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v16.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v17.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v19.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.d}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.d}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v17.d}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v19.d}[1], [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "st1 {v20.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v22.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v21.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v23.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v20.d}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v22.d}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v21.d}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v23.d}[1], [%0] \n" + + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %4, %4, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %4, %4, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %4, #2 \n" + "b.lt 3f \n" + + "cmp %4, #4 \n" + "b.lt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v1.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v3.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v4.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v5.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v6.8b}, [%0], %5 \n" + MEMACCESS(0) + "ld1 {v7.8b}, [%0] \n" + + MEMACCESS(8) + "ld1 {v30.16b}, [%8], #16 \n" + "ld1 {v31.16b}, [%8] \n" + + "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" + "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" + "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" + "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v16.s}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.s}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.s}[2], [%0], %6 \n" + MEMACCESS(0) + "st1 {v16.s}[3], [%0], %6 \n" + + "add %0, %2, #4 \n" + MEMACCESS(0) + "st1 {v18.s}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.s}[1], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.s}[2], [%0], %6 \n" + MEMACCESS(0) + "st1 {v18.s}[3], [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "st1 {v17.s}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v17.s}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v17.s}[2], [%0], %7 \n" + MEMACCESS(0) + "st1 {v17.s}[3], [%0], %7 \n" + + "add %0, %3, #4 \n" + MEMACCESS(0) + "st1 {v19.s}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v19.s}[1], [%0], %7 \n" + MEMACCESS(0) + "st1 {v19.s}[2], [%0], %7 \n" + MEMACCESS(0) + "st1 {v19.s}[3], [%0] \n" + + "add %1, %1, #8 \n" // src += 4 * 2 + "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a + "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b + "subs %4, %4, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %4, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[0], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[0], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[1], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[1], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[2], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[2], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v0.h, v1.h}[3], [%0], %5 \n" + MEMACCESS(0) + "ld2 {v2.h, v3.h}[3], [%0] \n" + + "trn1 v4.8b, v0.8b, v2.8b \n" + "trn2 v5.8b, v0.8b, v2.8b \n" + "trn1 v6.8b, v1.8b, v3.8b \n" + "trn2 v7.8b, v1.8b, v3.8b \n" + + "mov %0, %2 \n" + + MEMACCESS(0) + "st1 {v4.d}[0], [%0], %6 \n" + MEMACCESS(0) + "st1 {v6.d}[0], [%0] \n" + + "mov %0, %3 \n" + + MEMACCESS(0) + "st1 {v5.d}[0], [%0], %7 \n" + MEMACCESS(0) + "st1 {v7.d}[0], [%0] \n" + + "add %1, %1, #4 \n" // src += 2 * 2 + "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a + "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b + "subs %4, %4, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[0], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[1], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[2], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[3], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[4], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[5], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[6], [%1], %5 \n" + MEMACCESS(1) + "ld2 {v0.b, v1.b}[7], [%1] \n" + + MEMACCESS(2) + "st1 {v0.d}[0], 
[%2]                \n"
+    MEMACCESS(3)
+    "st1 {v1.d}[0], [%3]                       \n"
+
+    "4:                                        \n"
+
+    : "+r"(src_temp),                             // %0
+      "+r"(src),                                  // %1
+      "+r"(dst_a),                                // %2
+      "+r"(dst_b),                                // %3
+      "+r"(width64)                               // %4
+    : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
+      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
+      "r"(&kVTbl4x4TransposeDi)                   // %8
+    : "memory", "cc",
+      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+      "v30", "v31"
+  );
+}
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/rotate_win.cc b/libs/libaom/src/third_party/libyuv/source/rotate_win.cc
new file mode 100644
index 000000000..2760066df
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/rotate_win.cc
@@ -0,0 +1,248 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+#include "libyuv/rotate_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+    defined(_MSC_VER) && !defined(__clang__)
+
+__declspec(naked)
+void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+                        uint8* dst, int dst_stride, int width) {
+  __asm {
+    push      edi
+    push      esi
+    push      ebp
+    mov       eax, [esp + 12 + 4]   // src
+    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       edx, [esp + 12 + 12]  // dst
+    mov       esi, [esp + 12 + 16]  // dst_stride
+    mov       ecx, [esp + 12 + 20]  // width
+
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+    align      4
+ convertloop:
+    movq      xmm0, qword ptr [eax]
+    lea       ebp, [eax + 8]
+    movq      xmm1, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm0, xmm1
+    movq      xmm2, qword ptr [eax]
+    movdqa    xmm1, xmm0
+    palignr   xmm1, xmm1, 8
+    movq      xmm3, qword ptr [eax + edi]
+    lea       eax, [eax + 2 * edi]
+    punpcklbw xmm2, xmm3
+    movdqa    xmm3, xmm2
+    movq      xmm4, qword ptr [eax]
+    palignr   xmm3, xmm3, 8
+    movq      xmm5, qword ptr [eax + edi]
+    punpcklbw xmm4, xmm5
+    lea       eax, [eax + 2 * edi]
+    movdqa    xmm5, xmm4
+    movq      xmm6, qword ptr [eax]
+    palignr   xmm5, xmm5, 8
+    movq      xmm7, qword ptr [eax + edi]
+    punpcklbw xmm6, xmm7
+    mov       eax, ebp
+    movdqa    xmm7, xmm6
+    palignr   xmm7, xmm7, 8
+    // Second round of bit swap.
+    punpcklwd xmm0, xmm2
+    punpcklwd xmm1, xmm3
+    movdqa    xmm2, xmm0
+    movdqa    xmm3, xmm1
+    palignr   xmm2, xmm2, 8
+    palignr   xmm3, xmm3, 8
+    punpcklwd xmm4, xmm6
+    punpcklwd xmm5, xmm7
+    movdqa    xmm6, xmm4
+    movdqa    xmm7, xmm5
+    palignr   xmm6, xmm6, 8
+    palignr   xmm7, xmm7, 8
+    // Third round of bit swap.
+    // Write to the destination pointer.
+ punpckldq xmm0, xmm4 + movq qword ptr [edx], xmm0 + movdqa xmm4, xmm0 + palignr xmm4, xmm4, 8 + movq qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + punpckldq xmm2, xmm6 + movdqa xmm6, xmm2 + palignr xmm6, xmm6, 8 + movq qword ptr [edx], xmm2 + punpckldq xmm1, xmm5 + movq qword ptr [edx + esi], xmm6 + lea edx, [edx + 2 * esi] + movdqa xmm5, xmm1 + movq qword ptr [edx], xmm1 + palignr xmm5, xmm5, 8 + punpckldq xmm3, xmm7 + movq qword ptr [edx + esi], xmm5 + lea edx, [edx + 2 * esi] + movq qword ptr [edx], xmm3 + movdqa xmm7, xmm3 + palignr xmm7, xmm7, 8 + sub ecx, 8 + movq qword ptr [edx + esi], xmm7 + lea edx, [edx + 2 * esi] + jg convertloop + + pop ebp + pop esi + pop edi + ret + } +} + +__declspec(naked) +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w) { + __asm { + push ebx + push esi + push edi + push ebp + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride + mov edx, [esp + 16 + 12] // dst_a + mov esi, [esp + 16 + 16] // dst_stride_a + mov ebx, [esp + 16 + 20] // dst_b + mov ebp, [esp + 16 + 24] // dst_stride_b + mov ecx, esp + sub esp, 4 + 16 + and esp, ~15 + mov [esp + 16], ecx + mov ecx, [ecx + 16 + 28] // w + + align 4 + convertloop: + // Read in the data from the source pointer. + // First round of bit swap. + movdqu xmm0, [eax] + movdqu xmm1, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm0 // use xmm7 as temp register. + punpcklbw xmm0, xmm1 + punpckhbw xmm7, xmm1 + movdqa xmm1, xmm7 + movdqu xmm2, [eax] + movdqu xmm3, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm2 + punpcklbw xmm2, xmm3 + punpckhbw xmm7, xmm3 + movdqa xmm3, xmm7 + movdqu xmm4, [eax] + movdqu xmm5, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm4 + punpcklbw xmm4, xmm5 + punpckhbw xmm7, xmm5 + movdqa xmm5, xmm7 + movdqu xmm6, [eax] + movdqu xmm7, [eax + edi] + lea eax, [eax + 2 * edi] + movdqu [esp], xmm5 // backup xmm5 + neg edi + movdqa xmm5, xmm6 // use xmm5 as temp register. + punpcklbw xmm6, xmm7 + punpckhbw xmm5, xmm7 + movdqa xmm7, xmm5 + lea eax, [eax + 8 * edi + 16] + neg edi + // Second round of bit swap. + movdqa xmm5, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm5, xmm2 + movdqa xmm2, xmm5 + movdqa xmm5, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm5, xmm3 + movdqa xmm3, xmm5 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm6 + punpckhwd xmm5, xmm6 + movdqa xmm6, xmm5 + movdqu xmm5, [esp] // restore xmm5 + movdqu [esp], xmm6 // backup xmm6 + movdqa xmm6, xmm5 // use xmm6 as temp register. + punpcklwd xmm5, xmm7 + punpckhwd xmm6, xmm7 + movdqa xmm7, xmm6 + // Third round of bit swap. + // Write to the destination pointer. + movdqa xmm6, xmm0 + punpckldq xmm0, xmm4 + punpckhdq xmm6, xmm4 + movdqa xmm4, xmm6 + movdqu xmm6, [esp] // restore xmm6 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [ebx], xmm0 + movlpd qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm4 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm2 // use xmm0 as the temp register. + punpckldq xmm2, xmm6 + movlpd qword ptr [edx], xmm2 + movhpd qword ptr [ebx], xmm2 + punpckhdq xmm0, xmm6 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm1 // use xmm0 as the temp register. 
+    punpckldq xmm1, xmm5
+    movlpd    qword ptr [edx], xmm1
+    movhpd    qword ptr [ebx], xmm1
+    punpckhdq xmm0, xmm5
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    punpckldq xmm3, xmm7
+    movlpd    qword ptr [edx], xmm3
+    movhpd    qword ptr [ebx], xmm3
+    punpckhdq xmm0, xmm7
+    sub       ecx, 8
+    movlpd    qword ptr [edx + esi], xmm0
+    lea       edx, [edx + 2 * esi]
+    movhpd    qword ptr [ebx + ebp], xmm0
+    lea       ebx, [ebx + 2 * ebp]
+    jg        convertloop
+
+    mov       esp, [esp + 16]
+    pop       ebp
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+  }
+}
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/row_any.cc b/libs/libaom/src/third_party/libyuv/source/row_any.cc
new file mode 100644
index 000000000..1cb1f6b93
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/row_any.cc
@@ -0,0 +1,680 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h>  // For memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Subsampled source needs to be increased by 1 if not even.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
+
+// Any 3 planes to 1.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \
+    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
+                 uint8* dst_ptr, int width) {                                  \
+      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
+      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
+      int r = width & MASK;                                                    \
+      int n = width & ~MASK;                                                   \
+      if (n > 0) {                                                             \
+        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                             \
+      }                                                                        \
+      memcpy(temp, y_buf + n, r);                                              \
+      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);             \
+      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
+             SS(r, DUVSHIFT) * BPP);                                           \
+    }
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I444TOARGBROW_SSSE3
+ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
+ANY31(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
+ANY31(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
+ANY31(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
+ANY31(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7)
+ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
+ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
+#endif  // HAS_I444TOARGBROW_SSSE3
+#ifdef HAS_I422TORGB24ROW_AVX2
+ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+#endif
+#ifdef 
HAS_I422TORAWROW_AVX2 +ANY31(I422ToRAWRow_Any_AVX2, I422ToRAWRow_AVX2, 1, 0, 3, 15) +#endif +#ifdef HAS_J422TOARGBROW_SSSE3 +ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_J422TOARGBROW_AVX2 +ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422TOARGBROW_AVX2 +ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422TOBGRAROW_AVX2 +ANY31(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422TORGBAROW_AVX2 +ANY31(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I422TOABGRROW_AVX2 +ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I444TOARGBROW_AVX2 +ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) +#endif +#ifdef HAS_I411TOARGBROW_AVX2 +ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15) +#endif +#ifdef HAS_I422TOARGB4444ROW_AVX2 +ANY31(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOARGB1555ROW_AVX2 +ANY31(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TORGB565ROW_AVX2 +ANY31(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOARGBROW_NEON +ANY31(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) +ANY31(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) +ANY31(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7) +ANY31(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, 1, 0, 4, 7) +ANY31(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, 1, 0, 4, 7) +ANY31(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) +ANY31(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) +ANY31(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, 1, 0, 3, 7) +ANY31(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) +ANY31(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) +ANY31(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) +#endif +#ifdef HAS_I422TOYUY2ROW_NEON +ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) +#endif +#ifdef HAS_I422TOUYVYROW_NEON +ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) +#endif +#undef ANY31 + +// Any 2 planes to 1. +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ + uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +// Biplanar to RGB. 
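+// Illustrative sketch (editor's note, not part of the upstream libyuv
+// source): what the ANY21 macro above expands to for the biplanar NV12
+// converters below, written out by hand for NV12ToARGBRow_Any_SSSE3
+// (UVSHIFT=1, SBPP=1, SBPP2=2, BPP=4, MASK=7). The SIMD kernel only ever
+// sees widths that are a multiple of 8; the odd tail is staged through a
+// zeroed, aligned scratch buffer.
+#if 0  // for illustration only; the real definition comes from ANY21 above
+void NV12ToARGBRow_Any_SSSE3(const uint8* y_buf, const uint8* uv_buf,
+                             uint8* dst_ptr, int width) {
+  SIMD_ALIGNED(uint8 temp[64 * 3]);
+  memset(temp, 0, 64 * 2);               /* zero padding keeps msan happy */
+  int r = width & 7;                     /* leftover pixels: 0..7 */
+  int n = width & ~7;                    /* largest multiple of 8 */
+  if (n > 0) {
+    NV12ToARGBRow_SSSE3(y_buf, uv_buf, dst_ptr, n);       /* fast bulk pass */
+  }
+  memcpy(temp, y_buf + n, r);                             /* stage Y tail */
+  memcpy(temp + 64, uv_buf + (n >> 1) * 2, SS(r, 1) * 2); /* stage UV tail */
+  NV12ToARGBRow_SSSE3(temp, temp + 64, temp + 128, 8);    /* padded group */
+  memcpy(dst_ptr + n * 4, temp + 128, r * 4);             /* keep r pixels */
+}
+#endif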
+#ifdef HAS_NV12TOARGBROW_SSSE3 +ANY21(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +ANY21(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TOARGBROW_AVX2 +ANY21(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) +ANY21(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) +#endif +#ifdef HAS_NV12TOARGBROW_NEON +ANY21(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) +ANY21(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_SSSE3 +ANY21(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) +ANY21(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) +#endif +#ifdef HAS_NV12TORGB565ROW_AVX2 +ANY21(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) +ANY21(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, 1, 1, 2, 2, 15) +#endif +#ifdef HAS_NV12TORGB565ROW_NEON +ANY21(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) +ANY21(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, 1, 1, 2, 2, 7) +#endif + +// Merge functions. +#ifdef HAS_MERGEUVROW_SSE2 +ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_AVX2 +ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) +#endif +#ifdef HAS_MERGEUVROW_NEON +ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) +#endif + +// Math functions. +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_SSE2 +ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_AVX2 +ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBMULTIPLYROW_NEON +ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_NEON +ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_NEON +ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_SOBELROW_SSE2 +ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELROW_NEON +ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) +#endif +#ifdef HAS_SOBELTOPLANEROW_SSE2 +ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) +#endif +#ifdef HAS_SOBELTOPLANEROW_NEON +ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) +#endif +#ifdef HAS_SOBELXYROW_SSE2 +ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELXYROW_NEON +ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) +#endif +#undef ANY21 + +// Any 1 to 1. 
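+// Editor's sketch of how callers are expected to consume these Any wrappers
+// (assumption: modeled on libyuv's planar_functions.cc dispatch; the local
+// function-pointer wiring is illustrative, not copied from this patch).
+// A caller starts from the C row function, upgrades to the Any wrapper when
+// the CPU supports the SIMD path, and upgrades again to the raw kernel only
+// when the width is already a whole number of SIMD groups:
+#if 0
+  void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1,
+                          uint8* dst, int width) = ARGBMultiplyRow_C;
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;  /* any width (MASK = 3) */
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_SSE2;    /* whole groups only */
+    }
+  }
+#endif
+#endif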
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +#ifdef HAS_COPYROW_AVX +ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) +#endif +#ifdef HAS_COPYROW_SSE2 +ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) +#endif +#ifdef HAS_COPYROW_NEON +ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) +#endif +#if defined(HAS_ARGBTORGB24ROW_SSSE3) +ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) +ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) +ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) +#endif +#if defined(HAS_ARGBTOARGB4444ROW_AVX2) +ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) +#endif +#if defined(HAS_J400TOARGBROW_SSE2) +ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) +#endif +#if defined(HAS_J400TOARGBROW_AVX2) +ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) +#endif +#if defined(HAS_I400TOARGBROW_SSE2) +ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7) +#endif +#if defined(HAS_I400TOARGBROW_AVX2) +ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15) +#endif +#if defined(HAS_YUY2TOARGBROW_SSSE3) +ANY11(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) +ANY11(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) +ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) +ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) +ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) +ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) +ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) +ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) +ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) +ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_YUY2TOARGBROW_AVX2) +ANY11(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) +ANY11(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) +#endif +#if defined(HAS_ARGBTORGB24ROW_NEON) +ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7) +ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) +ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) +ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) +ANY11(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) +ANY11(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) +#endif +#ifdef HAS_ARGBTOYROW_AVX2 
+ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYJROW_AVX2 +ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_AVX2 +ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31) +#endif +#ifdef HAS_YUY2TOYROW_AVX2 +ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYROW_SSSE3 +ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_BGRATOYROW_SSSE3 +ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) +ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) +ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) +ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) +ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYJROW_SSSE3 +ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_ARGBTOYROW_NEON +ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_ARGBTOYJROW_NEON +ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_BGRATOYROW_NEON +ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_ABGRTOYROW_NEON +ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_RGBATOYROW_NEON +ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) +#endif +#ifdef HAS_RGB24TOYROW_NEON +ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) +#endif +#ifdef HAS_RAWTOYROW_NEON +ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) +#endif +#ifdef HAS_RGB565TOYROW_NEON +ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_ARGB1555TOYROW_NEON +ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_ARGB4444TOYROW_NEON +ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) +#endif +#ifdef HAS_YUY2TOYROW_NEON +ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_UYVYTOYROW_NEON +ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15) +#endif +#ifdef HAS_RGB24TOARGBROW_NEON +ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) +#endif +#ifdef HAS_RAWTOARGBROW_NEON +ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) +#endif +#ifdef HAS_RGB565TOARGBROW_NEON +ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGB1555TOARGBROW_NEON +ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGB4444TOARGBROW_NEON +ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_SSE2 +ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_AVX2 +ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBATTENUATEROW_NEON +ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) +#endif +#undef ANY11 + +// Any 1 to 1 with parameter. 
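+// Editor's worked example (not from upstream): how the ANY11 bookkeeping
+// above plays out for YUY2ToYRow_Any_AVX2 (UVSHIFT=1, SBPP=4, BPP=1,
+// MASK=31) at width = 100 pixels:
+//   n = 100 & ~31 = 96  -> YUY2ToYRow_AVX2 converts 96 pixels directly;
+//   r = 100 &  31 =  4  -> the tail copy stages SS(4, 1) * 4 = 2 * 4 = 8
+//                          source bytes (two YUY2 pixel pairs) into temp,
+//   then one padded 32-pixel SIMD pass runs over the zeroed buffer and
+//   exactly r = 4 luma bytes are copied back. The memset at the top is what
+//   makes reading the zero-padded YUY2 bytes safe under msan.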
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ + T shuffler, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } + +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) +ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, + const uint32, 4, 2, 3) +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) +ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2, + const uint32, 4, 2, 7) +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) +ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON, + const uint32, 4, 2, 7) +#endif +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3) +#endif +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7) +#endif +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15) +#endif +#ifdef HAS_ARGBSHUFFLEROW_NEON +ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3) +#endif +#undef ANY11P + +// Any 1 to 1 interpolate. Takes 2 rows of source via stride. +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, \ + int source_y_fraction) { \ + SIMD_ALIGNED(uint8 temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +#ifdef HAS_INTERPOLATEROW_AVX2 +ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) +#endif +#ifdef HAS_INTERPOLATEROW_SSSE3 +ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_SSE2 +ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_NEON +ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2 +ANY11T(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, 1, 1, 3) +#endif +#undef ANY11T + +// Any 1 to 1 mirror. 
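+// Editor's sketch (illustration only, not upstream code): the ANY11T
+// interpolator above is the one wrapper that stages *two* source rows,
+// because the kernel reads both src_ptr and src_ptr + src_stride_ptr. Hand
+// expansion of the tail handling for InterpolateRow_SSSE3 (SBPP=1, BPP=1,
+// MASK=15):
+#if 0
+  memcpy(temp, src_ptr + n, r);                        /* row 0 tail */
+  memcpy(temp + 64, src_ptr + src_stride_ptr + n, r);  /* row 1 tail */
+  /* Note the literal stride of 64: inside temp, row 1 starts 64 bytes
+     after row 0, so the kernel blends the right vertical pairs. */
+  InterpolateRow_SSSE3(temp + 128, temp, 64, 16, source_y_fraction);
+  memcpy(dst_ptr + n, temp + 128, r);
+#endif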
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr, r * BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ + } + +#ifdef HAS_MIRRORROW_AVX2 +ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) +#endif +#ifdef HAS_MIRRORROW_SSSE3 +ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) +#endif +#ifdef HAS_MIRRORROW_SSE2 +ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15) +#endif +#ifdef HAS_MIRRORROW_NEON +ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) +#endif +#ifdef HAS_ARGBMIRRORROW_AVX2 +ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) +#endif +#ifdef HAS_ARGBMIRRORROW_SSE2 +ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) +#endif +#ifdef HAS_ARGBMIRRORROW_NEON +ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) +#endif +#undef ANY11M + +// Any 1 plane. (memset) +#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, T v32, int width) { \ + SIMD_ALIGNED(uint8 temp[64]); \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, v32, n); \ + } \ + ANY_SIMD(temp, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp, r * BPP); \ + } + +#ifdef HAS_SETROW_X86 +ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3) +#endif +#ifdef HAS_SETROW_NEON +ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15) +#endif +#ifdef HAS_ARGBSETROW_NEON +ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3) +#endif +#undef ANY1 + +// Any 1 to 2. Outputs UV planes. 
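+// Editor's worked example (not from upstream): the ANY11M mirror wrapper
+// above is the only variant that offsets the *source* before the SIMD call,
+// because mirroring reverses pixel order. For MirrorRow_Any_SSE2 (BPP=1,
+// MASK=15) at width = 20:
+//   r = 4, n = 16: the SIMD pass mirrors src[4..20) into dst[0..16);
+//   the first 4 source bytes are copied into temp, mirrored as one full
+//   16-byte group, and only the last r = 4 bytes of that mirrored group,
+//   at temp + 64 + (16 - 4) * 1, are valid output for dst[16..20).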
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\ + SIMD_ALIGNED(uint8 temp[128 * 3]); \ + memset(temp, 0, 128); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, \ + temp + SS(r, UVSHIFT) * BPP - BPP, 4); \ + } \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ + } + +#ifdef HAS_SPLITUVROW_SSE2 +ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) +#endif +#ifdef HAS_SPLITUVROW_AVX2 +ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) +#endif +#ifdef HAS_SPLITUVROW_NEON +ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) +#endif +#ifdef HAS_SPLITUVROW_MIPS_DSPR2 +ANY12(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, 0, 2, 0, 15) +#endif +#ifdef HAS_ARGBTOUV444ROW_SSSE3 +ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_AVX2 +ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOUV422ROW_SSSE3 +ANY12(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, 0, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_SSE2 +ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOUV422ROW_NEON +ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) +ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15) +ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31) +ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) +#endif +#undef ANY12 + +// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. +// 128 byte row allows for 32 avx ARGB pixels. 
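+// Editor's worked example (illustration, not upstream): the odd-width
+// fix-up in ANY12 above. For ARGBToUV422Row_Any_SSSE3 (UVSHIFT=0, BPP=4,
+// DUVSHIFT=1, MASK=15) at width = 7: n = 0, r = 7, so all 7 * 4 = 28 tail
+// bytes are staged into temp. Because width is odd, the 4-byte memcpy
+// duplicates the last ARGB pixel at temp + 28, so the final U/V sample
+// averages a real horizontal pair instead of zero padding. The outputs then
+// copy back SS(7, 1) = 4 bytes each into dst_u and dst_v.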
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, int src_stride_ptr, \ + uint8* dst_u, uint8* dst_v, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 4]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, \ + temp + SS(r, UVSHIFT) * BPP - BPP, 4); \ + memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ + temp + 128 + SS(r, UVSHIFT) * BPP - BPP, 4); \ + } \ + ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ + memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ + } + +#ifdef HAS_ARGBTOUVROW_AVX2 +ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVROW_SSSE3 +ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) +ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) +ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) +ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) +ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) +#endif +#ifdef HAS_YUY2TOUVROW_AVX2 +ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31) +ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31) +#endif +#ifdef HAS_YUY2TOUVROW_SSE2 +ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15) +ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) +#endif +#ifdef HAS_ARGBTOUVROW_NEON +ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ARGBTOUVJROW_NEON +ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_BGRATOUVROW_NEON +ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVROW_NEON +ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_RGBATOUVROW_NEON +ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) +#endif +#ifdef HAS_RGB24TOUVROW_NEON +ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_NEON +ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RGB565TOUVROW_NEON +ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_NEON +ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_ARGB4444TOUVROW_NEON +ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_YUY2TOUVROW_NEON +ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) +#endif +#ifdef HAS_UYVYTOUVROW_NEON +ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) +#endif +#undef ANY12S + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/row_common.cc b/libs/libaom/src/third_party/libyuv/source/row_common.cc new file mode 100644 index 000000000..49875894f --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/row_common.cc @@ -0,0 +1,2576 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h>  // For memcpy and memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// llvm x86 is poor at ternary operator, so use branchless min/max.
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+  return ((-(v) >> 31) & (v));
+}
+
+static __inline int32 clamp255(int32 v) {
+  return (((255 - (v)) >> 31) | (v)) & 255;
+}
+
+static __inline uint32 Clamp(int32 val) {
+  int v = clamp0(val);
+  return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+  int m = v >> 31;
+  return (v + m) ^ m;
+}
+#else  // USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+  return (v < 0) ? 0 : v;
+}
+
+static __inline int32 clamp255(int32 v) {
+  return (v > 255) ? 255 : v;
+}
+
+static __inline uint32 Clamp(int32 val) {
+  int v = clamp0(val);
+  return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+  return (v < 0) ? -v : v;
+}
+#endif  // USE_BRANCHLESS
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define WRITEWORD(p, v) *(uint32*)(p) = v
+#else
+static inline void WRITEWORD(uint8* p, uint32 v) {
+  p[0] = (uint8)(v & 255);
+  p[1] = (uint8)((v >> 8) & 255);
+  p[2] = (uint8)((v >> 16) & 255);
+  p[3] = (uint8)((v >> 24) & 255);
+}
+#endif
+
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_rgb24[0];
+    uint8 g = src_rgb24[1];
+    uint8 r = src_rgb24[2];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    src_rgb24 += 3;
+  }
+}
+
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 r = src_raw[0];
+    uint8 g = src_raw[1];
+    uint8 b = src_raw[2];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    src_raw += 3;
+  }
+}
+
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_rgb565[0] & 0x1f;
+    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r = src_rgb565[1] >> 3;
+    dst_argb[0] = (b << 3) | (b >> 2);
+    dst_argb[1] = (g << 2) | (g >> 4);
+    dst_argb[2] = (r << 3) | (r >> 2);
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    src_rgb565 += 2;
+  }
+}
+
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+                         int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb1555[0] & 0x1f;
+    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 a = src_argb1555[1] >> 7;
+    dst_argb[0] = (b << 3) | (b >> 2);
+    dst_argb[1] = (g << 3) | (g >> 2);
+    dst_argb[2] = (r << 3) | (r >> 2);
+    dst_argb[3] = -a;
+    dst_argb += 4;
+    src_argb1555 += 2;
+  }
+}
+
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+                         int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb4444[0] & 0x0f;
+    uint8 g = src_argb4444[0] >> 4;
+    uint8 r = src_argb4444[1] & 0x0f;
+    uint8 a = src_argb4444[1] >> 4;
+    dst_argb[0] = (b << 4) | b;
+    dst_argb[1] = (g << 4) | g;
+    dst_argb[2] = (r << 4) | r;
+    dst_argb[3] = (a << 4) | a;
+    dst_argb += 4;
+    src_argb4444 += 2;
+  }
+}
+
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb[0];
+    uint8 g = src_argb[1];
+    uint8 r = 
src_argb[2]; + dst_rgb[0] = b; + dst_rgb[1] = g; + dst_rgb[2] = r; + dst_rgb += 3; + src_argb += 4; + } +} + +void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb[0]; + uint8 g = src_argb[1]; + uint8 r = src_argb[2]; + dst_rgb[0] = r; + dst_rgb[1] = g; + dst_rgb[2] = b; + dst_rgb += 3; + src_argb += 4; + } +} + +void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 2; + uint8 r0 = src_argb[2] >> 3; + uint8 b1 = src_argb[4] >> 3; + uint8 g1 = src_argb[5] >> 2; + uint8 r1 = src_argb[6] >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 2; + uint8 r0 = src_argb[2] >> 3; + *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + +// dither4 is a row of 4 values from 4x4 dither matrix. +// The 4x4 matrix contains values to increase RGB. When converting to +// fewer bits (565) this provides an ordered dither. +// The order in the 4x4 matrix in first byte is upper left. +// The 4 values are passed as an int, then referenced as an array, so +// endian will not affect order of the original matrix. But the dither4 +// will containing the first pixel in the lower byte for little endian +// or the upper byte for big endian. +void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + int dither0 = ((const unsigned char*)(&dither4))[x & 3]; + int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; + uint8 b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8 g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8 r0 = clamp255(src_argb[2] + dither0) >> 3; + uint8 b1 = clamp255(src_argb[4] + dither1) >> 3; + uint8 g1 = clamp255(src_argb[5] + dither1) >> 2; + uint8 r1 = clamp255(src_argb[6] + dither1) >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; + uint8 b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8 g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8 r0 = clamp255(src_argb[2] + dither0) >> 3; + *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 3; + uint8 r0 = src_argb[2] >> 3; + uint8 a0 = src_argb[3] >> 7; + uint8 b1 = src_argb[4] >> 3; + uint8 g1 = src_argb[5] >> 3; + uint8 r1 = src_argb[6] >> 3; + uint8 a1 = src_argb[7] >> 7; + *(uint32*)(dst_rgb) = + b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 3; + uint8 g0 = src_argb[1] >> 3; + uint8 r0 = src_argb[2] >> 3; + uint8 a0 = src_argb[3] >> 7; + *(uint16*)(dst_rgb) = + b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + } +} + +void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb[0] >> 4; + uint8 g0 = src_argb[1] >> 4; + uint8 r0 = src_argb[2] >> 4; + uint8 a0 = src_argb[3] >> 4; + uint8 b1 = src_argb[4] >> 4; 
+ uint8 g1 = src_argb[5] >> 4; + uint8 r1 = src_argb[6] >> 4; + uint8 a1 = src_argb[7] >> 4; + *(uint32*)(dst_rgb) = + b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + uint8 b0 = src_argb[0] >> 4; + uint8 g0 = src_argb[1] >> 4; + uint8 r0 = src_argb[2] >> 4; + uint8 a0 = src_argb[3] >> 4; + *(uint16*)(dst_rgb) = + b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + } +} + +static __inline int RGBToY(uint8 r, uint8 g, uint8 b) { + return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +} + +static __inline int RGBToU(uint8 r, uint8 g, uint8 b) { + return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; +} +static __inline int RGBToV(uint8 r, uint8 g, uint8 b) { + return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; +} + +#define MAKEROWY(NAME, R, G, B, BPP) \ +void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ +} \ +void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \ + src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \ + uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \ + src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \ + uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \ + src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ +} + +MAKEROWY(ARGB, 2, 1, 0, 4) +MAKEROWY(BGRA, 1, 2, 3, 4) +MAKEROWY(ABGR, 0, 1, 2, 4) +MAKEROWY(RGBA, 3, 2, 1, 4) +MAKEROWY(RGB24, 2, 1, 0, 3) +MAKEROWY(RAW, 0, 1, 2, 3) +#undef MAKEROWY + +// JPeg uses a variation on BT.601-1 full range +// y = 0.29900 * r + 0.58700 * g + 0.11400 * b +// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center +// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center +// BT.601 Mpeg range uses: +// b 0.1016 * 255 = 25.908 = 25 +// g 0.5078 * 255 = 129.489 = 129 +// r 0.2578 * 255 = 65.739 = 66 +// JPeg 8 bit Y (not used): +// b 0.11400 * 256 = 29.184 = 29 +// g 0.58700 * 256 = 150.272 = 150 +// r 0.29900 * 256 = 76.544 = 77 +// JPeg 7 bit Y: +// b 0.11400 * 128 = 14.592 = 15 +// g 0.58700 * 128 = 75.136 = 75 +// r 0.29900 * 128 = 38.272 = 38 +// JPeg 8 bit U: +// b 0.50000 * 255 = 127.5 = 127 +// g -0.33126 * 255 = -84.4713 = -84 +// r -0.16874 * 255 = -43.0287 = -43 +// JPeg 8 bit V: +// b -0.08131 * 255 = -20.73405 = -20 +// g -0.41869 * 255 = -106.76595 = -107 +// r 0.50000 * 255 = 127.5 = 127 + +static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) { + return (38 * r + 75 * g + 15 * b + 64) >> 7; +} + +static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) { + return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; +} +static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) { + return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; +} + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) + +#define MAKEROWYJ(NAME, R, G, B, BPP) \ +void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + int x; \ + for 
(x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ +} \ +void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ +} + +MAKEROWYJ(ARGB, 2, 1, 0, 4) +#undef MAKEROWYJ + +void ARGBToUVJ422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 ab = (src_argb[0] + src_argb[4]) >> 1; + uint8 ag = (src_argb[1] + src_argb[5]) >> 1; + uint8 ar = (src_argb[2] + src_argb[6]) >> 1; + dst_u[0] = RGBToUJ(ar, ag, ab); + dst_v[0] = RGBToVJ(ar, ag, ab); + src_argb += 8; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToUJ(ar, ag, ab); + dst_v[0] = RGBToVJ(ar, ag, ab); + } +} + +void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_rgb565[0] & 0x1f; + uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r = src_rgb565[1] >> 3; + b = (b << 3) | (b >> 2); + g = (g << 2) | (g >> 4); + r = (r << 3) | (r >> 2); + dst_y[0] = RGBToY(r, g, b); + src_rgb565 += 2; + dst_y += 1; + } +} + +void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb1555[0] & 0x1f; + uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r = (src_argb1555[1] & 0x7c) >> 2; + b = (b << 3) | (b >> 2); + g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_y[0] = RGBToY(r, g, b); + src_argb1555 += 2; + dst_y += 1; + } +} + +void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 b = src_argb4444[0] & 0x0f; + uint8 g = src_argb4444[0] >> 4; + uint8 r = src_argb4444[1] & 0x0f; + b = (b << 4) | b; + g = (g << 4) | g; + r = (r << 4) | r; + dst_y[0] = RGBToY(r, g, b); + src_argb4444 += 2; + dst_y += 1; + } +} + +void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_rgb565[0] & 0x1f; + uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r0 = src_rgb565[1] >> 3; + uint8 b1 = src_rgb565[2] & 0x1f; + uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); + uint8 r1 = src_rgb565[3] >> 3; + uint8 b2 = next_rgb565[0] & 0x1f; + uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8 r2 = next_rgb565[1] >> 3; + uint8 b3 = next_rgb565[2] & 0x1f; + uint8 g3 = 
(next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); + uint8 r3 = next_rgb565[3] >> 3; + uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 787 -> 888. + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_rgb565 += 4; + next_rgb565 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_rgb565[0] & 0x1f; + uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8 r0 = src_rgb565[1] >> 3; + uint8 b2 = next_rgb565[0] & 0x1f; + uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8 r2 = next_rgb565[1] >> 3; + uint8 b = (b0 + b2); // 565 * 2 = 676. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 2) | (b >> 4); // 676 -> 888 + g = (g << 1) | (g >> 6); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb1555[0] & 0x1f; + uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8 b1 = src_argb1555[2] & 0x1f; + uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); + uint8 r1 = (src_argb1555[3] & 0x7c) >> 2; + uint8 b2 = next_argb1555[0] & 0x1f; + uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8 r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8 b3 = next_argb1555[2] & 0x1f; + uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); + uint8 r3 = (next_argb1555[3] & 0x7c) >> 2; + uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 777 -> 888. + g = (g << 1) | (g >> 6); + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb1555 += 4; + next_argb1555 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_argb1555[0] & 0x1f; + uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8 b2 = next_argb1555[0] & 0x1f; + uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8 r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8 b = (b0 + b2); // 555 * 2 = 666. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 2) | (b >> 4); // 666 -> 888. + g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444; + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb4444[0] & 0x0f; + uint8 g0 = src_argb4444[0] >> 4; + uint8 r0 = src_argb4444[1] & 0x0f; + uint8 b1 = src_argb4444[2] & 0x0f; + uint8 g1 = src_argb4444[2] >> 4; + uint8 r1 = src_argb4444[3] & 0x0f; + uint8 b2 = next_argb4444[0] & 0x0f; + uint8 g2 = next_argb4444[0] >> 4; + uint8 r2 = next_argb4444[1] & 0x0f; + uint8 b3 = next_argb4444[2] & 0x0f; + uint8 g3 = next_argb4444[2] >> 4; + uint8 r3 = next_argb4444[3] & 0x0f; + uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 2) | (b >> 4); // 666 -> 888.
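// Every "nnn -> 888" comment above is the same bit-replication trick: shift
// the n-bit value into the high bits and copy its top bits into the vacated
// low bits, which maps 0 to 0 and the n-bit maximum to exactly 255. A
// generic form of the idea (illustrative only, not a libyuv API; n is
// assumed to be 4..7 here):

#include <stdint.h>

static uint8_t ExpandToByte(uint8_t v, int n) {
  // v holds an n-bit channel value in its low bits.
  return (uint8_t)((v << (8 - n)) | (v >> (2 * n - 8)));
}
// ExpandToByte(0x1f, 5) == 0xff and ExpandToByte(0x10, 5) == 0x84,
// matching the (b << 3) | (b >> 2) expansions used in the rows above.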
+ g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb4444 += 4; + next_argb4444 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_argb4444[0] & 0x0f; + uint8 g0 = src_argb4444[0] >> 4; + uint8 r0 = src_argb4444[1] & 0x0f; + uint8 b2 = next_argb4444[0] & 0x0f; + uint8 g2 = next_argb4444[0] >> 4; + uint8 r2 = next_argb4444[1] & 0x0f; + uint8 b = (b0 + b2); // 444 * 2 = 555. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 3) | (b >> 2); // 555 -> 888. + g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGBToUV444Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 4; + dst_u += 1; + dst_v += 1; + } +} + +void ARGBToUV422Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 ab = (src_argb[0] + src_argb[4]) >> 1; + uint8 ag = (src_argb[1] + src_argb[5]) >> 1; + uint8 ar = (src_argb[2] + src_argb[6]) >> 1; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 8; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } +} + +void ARGBToUV411Row_C(const uint8* src_argb, + uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 3; x += 4) { + uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2; + uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2; + uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + src_argb += 16; + dst_u += 1; + dst_v += 1; + } + if ((width & 3) == 3) { + uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3; + uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3; + uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } else if ((width & 3) == 2) { + uint8 ab = (src_argb[0] + src_argb[4]) >> 1; + uint8 ag = (src_argb[1] + src_argb[5]) >> 1; + uint8 ar = (src_argb[2] + src_argb[6]) >> 1; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } else if ((width & 3) == 1) { + uint8 ab = src_argb[0]; + uint8 ag = src_argb[1]; + uint8 ar = src_argb[2]; + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); + } +} + +void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = src_argb[3]; + dst_argb += 4; + src_argb += 4; + } +} + +// Convert a row of an image to sepia tone. +void ARGBSepiaRow_C(uint8* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int sb = (b * 17 + g * 68 + r * 35) >> 7; + int sg = (b * 22 + g * 88 + r * 45) >> 7; + int sr = (b * 24 + g * 98 + r * 50) >> 7; + // b does not overflow. a is preserved from the original.
+ dst_argb[0] = sb; + dst_argb[1] = clamp255(sg); + dst_argb[2] = clamp255(sr); + dst_argb += 4; + } +} + +// Apply color matrix to a row of image. Matrix is signed. +// TODO(fbarchard): Consider adding rounding (+32). +void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = src_argb[0]; + int g = src_argb[1]; + int r = src_argb[2]; + int a = src_argb[3]; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + + r * matrix_argb[2] + a * matrix_argb[3]) >> 6; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + + r * matrix_argb[6] + a * matrix_argb[7]) >> 6; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + + r * matrix_argb[10] + a * matrix_argb[11]) >> 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + + r * matrix_argb[14] + a * matrix_argb[15]) >> 6; + dst_argb[0] = Clamp(sb); + dst_argb[1] = Clamp(sg); + dst_argb[2] = Clamp(sr); + dst_argb[3] = Clamp(sa); + src_argb += 4; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int a = dst_argb[3]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb[3] = table_argb[a * 4 + 3]; + dst_argb += 4; + } +} + +// Apply color table to a row of image. +void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb += 4; + } +} + +void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + int x; + for (x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; + dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; + dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; + dst_argb += 4; + } +} + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v * f >> 24 + +void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + const uint32 b_scale = REPEAT8(value & 0xff); + const uint32 g_scale = REPEAT8((value >> 8) & 0xff); + const uint32 r_scale = REPEAT8((value >> 16) & 0xff); + const uint32 a_scale = REPEAT8(value >> 24); + + int i; + for (i = 0; i < width; ++i) { + const uint32 b = REPEAT8(src_argb[0]); + const uint32 g = REPEAT8(src_argb[1]); + const uint32 r = REPEAT8(src_argb[2]); + const uint32 a = REPEAT8(src_argb[3]); + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define REPEAT8(v) (v) | ((v) << 8) +#define SHADE(f, v) v * f >> 16 + +void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + const uint32 b = REPEAT8(src_argb0[0]); + const uint32 g = REPEAT8(src_argb0[1]); + const uint32 r = REPEAT8(src_argb0[2]); + const uint32 a = REPEAT8(src_argb0[3]); + const uint32 b_scale = src_argb1[0]; + const uint32 g_scale = src_argb1[1]; + const uint32 
r_scale = src_argb1[2]; + const uint32 a_scale = src_argb1[3]; + dst_argb[0] = SHADE(b, b_scale); + dst_argb[1] = SHADE(g, g_scale); + dst_argb[2] = SHADE(r, r_scale); + dst_argb[3] = SHADE(a, a_scale); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef REPEAT8 +#undef SHADE + +#define SHADE(f, v) clamp255(v + f) + +void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb0[0]; + const int g = src_argb0[1]; + const int r = src_argb0[2]; + const int a = src_argb0[3]; + const int b_add = src_argb1[0]; + const int g_add = src_argb1[1]; + const int r_add = src_argb1[2]; + const int a_add = src_argb1[3]; + dst_argb[0] = SHADE(b, b_add); + dst_argb[1] = SHADE(g, g_add); + dst_argb[2] = SHADE(r, r_add); + dst_argb[3] = SHADE(a, a_add); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +#define SHADE(f, v) clamp0(f - v) + +void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + const int b = src_argb0[0]; + const int g = src_argb0[1]; + const int r = src_argb0[2]; + const int a = src_argb0[3]; + const int b_sub = src_argb1[0]; + const int g_sub = src_argb1[1]; + const int r_sub = src_argb1[2]; + const int a_sub = src_argb1[3]; + dst_argb[0] = SHADE(b, b_sub); + dst_argb[1] = SHADE(g, g_sub); + dst_argb[2] = SHADE(r, r_sub); + dst_argb[3] = SHADE(a, a_sub); + src_argb0 += 4; + src_argb1 += 4; + dst_argb += 4; + } +} +#undef SHADE + +// Sobel functions which mimic the SSSE3 versions. +void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, + uint8* dst_sobelx, int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i]; + int b = src_y1[i]; + int c = src_y2[i]; + int a_sub = src_y0[i + 2]; + int b_sub = src_y1[i + 2]; + int c_sub = src_y2[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobelx[i] = (uint8)(clamp255(sobel)); + } +} + +void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + int i; + for (i = 0; i < width; ++i) { + int a = src_y0[i + 0]; + int b = src_y0[i + 1]; + int c = src_y0[i + 2]; + int a_sub = src_y1[i + 0]; + int b_sub = src_y1[i + 1]; + int c_sub = src_y1[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = Abs(a_diff + b_diff * 2 + c_diff); + dst_sobely[i] = (uint8)(clamp255(sobel)); + } +} + +void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_argb[0] = (uint8)(s); + dst_argb[1] = (uint8)(s); + dst_argb[2] = (uint8)(s); + dst_argb[3] = (uint8)(255u); + dst_argb += 4; + } +} + +void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_y[i] = (uint8)(s); + } +} + +void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int g = clamp255(r + b); + dst_argb[0] = (uint8)(b); + dst_argb[1] = (uint8)(g); + dst_argb[2] = (uint8)(r); + dst_argb[3] = (uint8)(255u); + dst_argb += 4; + }
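// The Sobel rows above split the operator into two passes: SobelXRow_C
// differences columns i and i+2 across three rows, SobelYRow_C differences
// two adjacent rows across columns i..i+2, and SobelRow_C approximates the
// gradient magnitude as clamp255(x + y) instead of a square root. The same
// arithmetic for a single output pixel (sketch only; r0..r2 are assumed to
// be the three input rows):

#include <stdint.h>
#include <stdlib.h>

static uint8_t SobelPixelSketch(const uint8_t* r0, const uint8_t* r1,
                                const uint8_t* r2, int i) {
  int gx = (r0[i] - r0[i + 2]) + 2 * (r1[i] - r1[i + 2]) +
           (r2[i] - r2[i + 2]);                        // as SobelXRow_C
  int gy = (r0[i] - r1[i]) + 2 * (r0[i + 1] - r1[i + 1]) +
           (r0[i + 2] - r1[i + 2]);                    // as SobelYRow_C
  int cx = abs(gx); if (cx > 255) cx = 255;            // SobelXRow_C output
  int cy = abs(gy); if (cy > 255) cy = 255;            // SobelYRow_C output
  int mag = cx + cy;                                   // SobelRow_C combine
  return (uint8_t)(mag > 255 ? 255 : mag);
}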
+} + +void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { + // Copy a Y to RGB. + int x; + for (x = 0; x < width; ++x) { + uint8 y = src_y[0]; + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = 255u; + dst_argb += 4; + ++src_y; + } +} + +// BT.601 YUV to RGB reference +// R = (Y - 16) * 1.164 - V * -1.596 +// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 +// B = (Y - 16) * 1.164 - U * -2.018 + +// Y contribution to R,G,B. Scale and bias. +// TODO(fbarchard): Consider moving constants into a common header. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +// U and V contributions to R,G,B. +#define UB -128 /* max(-128, round(-2.018 * 64)) */ +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ +#define VR -102 /* round(-1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +// C reference code that mimics the YUV assembly. +static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, + uint8* b, uint8* g, uint8* r) { + uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6); + *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6); + *r = Clamp((int32)(-(v * VR)+ y1 + BR) >> 6); +} + +// C reference code that mimics the YUV assembly. +static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { + uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32)(y1 + YGB) >> 6); + *g = Clamp((int32)(y1 + YGB) >> 6); + *r = Clamp((int32)(y1 + YGB) >> 6); +} + +#undef YG +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef BB +#undef BG +#undef BR + +// JPEG YUV to RGB reference +// * R = Y - V * -1.40200 +// * G = Y - U * 0.34414 - V * 0.71414 +// * B = Y - U * -1.77200 + +// Y contribution to R,G,B. Scale and bias. +// TODO(fbarchard): Consider moving constants into a common header. +#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ +#define YGBJ 32 /* 64 / 2 */ + +// U and V contributions to R,G,B. +#define UBJ -113 /* round(-1.77200 * 64) */ +#define UGJ 22 /* round(0.34414 * 64) */ +#define VGJ 46 /* round(0.71414 * 64) */ +#define VRJ -90 /* round(-1.40200 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BBJ (UBJ * 128 + YGBJ) +#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) +#define BRJ (VRJ * 128 + YGBJ) + +// C reference code that mimics the YUV assembly. +static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v, + uint8* b, uint8* g, uint8* r) { + uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16; + *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6); + *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6); + *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6); +} + +#undef YGJ +#undef YGBJ +#undef UBJ +#undef UGJ +#undef VGJ +#undef VRJ +#undef BBJ +#undef BGJ +#undef BRJ + +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) +// C mimic assembly. +// TODO(fbarchard): Remove subsampling from Neon. 
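// A floating-point reference for the fixed-point YuvPixel() above, using
// the same BT.601 coefficients quoted in the comments (1.164, 1.596, 0.391,
// 0.813, 2.018). It should agree with the >> 6 fixed-point path to within a
// few LSBs; blue can differ slightly more because UB is clipped to fit a
// signed byte, per the max(-128, ...) comment above. Sketch only, not a
// libyuv API:

#include <stdint.h>

static uint8_t ClampToByte(float v) {
  return (uint8_t)(v < 0.0f ? 0 : (v > 255.0f ? 255 : v + 0.5f));
}

static void YuvPixelFloat(uint8_t y, uint8_t u, uint8_t v,
                          uint8_t* b, uint8_t* g, uint8_t* r) {
  float yf = 1.164f * ((int)y - 16);
  *b = ClampToByte(yf + 2.018f * ((int)u - 128));
  *g = ClampToByte(yf - 0.391f * ((int)u - 128) - 0.813f * ((int)v - 128));
  *r = ClampToByte(yf + 1.596f * ((int)v - 128));
}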
+void I444ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint8 u = (src_u[0] + src_u[1] + 1) >> 1; + uint8 v = (src_v[0] + src_v[1] + 1) >> 1; + YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_u += 2; + src_v += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + } +} +#else +void I444ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} +#endif + +// Also used for 420 +void I422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void J422ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvJPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvJPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvJPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void I422ToRGB24Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 3, rgb_buf + 4, rgb_buf + 5); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + } +} + +void I422ToRAWRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 5, rgb_buf + 4, rgb_buf + 3); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 6; // Advance 2 pixels. 
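// All of the I422 writers above and below follow one pattern: a single U/V
// pair is shared by two Y samples, so the chroma pointers advance at half
// the luma rate and an odd final pixel reuses the last chroma sample. The
// pattern in isolation (sketch; relies on the YuvPixel() helper and uint8
// typedef defined earlier in this file):

static void I422PairToRGB24Sketch(const uint8* y, const uint8* u,
                                  const uint8* v, uint8* rgb) {
  // Both pixels of the pair take their color from u[0]/v[0]; the
  // destination layout is B,G,R per pixel, as in I422ToRGB24Row_C.
  YuvPixel(y[0], u[0], v[0], rgb + 0, rgb + 1, rgb + 2);
  YuvPixel(y[1], u[0], v[0], rgb + 3, rgb + 4, rgb + 5);
}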
+ } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + } +} + +void I422ToARGB4444Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + b1 = b1 >> 4; + g1 = g1 >> 4; + r1 = r1 >> 4; + *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000; + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb4444 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + b0 = b0 >> 4; + g0 = g0 >> 4; + r0 = r0 >> 4; + *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | + 0xf000; + } +} + +void I422ToARGB1555Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 3; + r1 = r1 >> 3; + *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000; + src_y += 2; + src_u += 1; + src_v += 1; + dst_argb1555 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 3; + r0 = r0 >> 3; + *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | + 0x8000; + } +} + +void I422ToRGB565Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + src_u += 1; + src_v += 1; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void I411ToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 3; x += 4) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + YuvPixel(src_y[2], src_u[0], src_v[0], + rgb_buf + 8, rgb_buf + 9, rgb_buf + 10); + rgb_buf[11] = 255; + YuvPixel(src_y[3], src_u[0], src_v[0], + rgb_buf + 12, rgb_buf + 13, rgb_buf + 14); + rgb_buf[15] = 255; + src_y += 4; + src_u += 1; + src_v += 1; + rgb_buf += 16; // Advance 4 pixels. 
+ } + if (width & 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void NV12ToARGBRow_C(const uint8* src_y, + const uint8* src_uv, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_uv[0], src_uv[1], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + src_uv += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void NV21ToARGBRow_C(const uint8* src_y, + const uint8* src_vu, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + + YuvPixel(src_y[1], src_vu[1], src_vu[0], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + + src_y += 2; + src_vu += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void NV12ToRGB565Row_C(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0); + YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + src_uv += 2; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void NV21ToRGB565Row_C(const uint8* src_y, + const uint8* vsrc_u, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0); + YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + src_y += 2; + vsrc_u += 2; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void YUY2ToARGBRow_C(const uint8* src_yuy2, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_yuy2 += 4; + rgb_buf += 8; // Advance 2 pixels. 
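// The formats handled above differ only in where each sample lives: NV12
// interleaves chroma as U,V,U,V while NV21 stores V,U,V,U, and the packed
// YUY2/UYVY formats carry luma and chroma in one plane. Tiny accessors that
// spell out the byte offsets (illustrative only, not libyuv APIs):

#include <stdint.h>

// Semi-planar chroma: one U/V (NV12) or V/U (NV21) pair per 2x2 luma block.
static uint8_t Nv12U(const uint8_t* uv, int pair) { return uv[pair * 2 + 0]; }
static uint8_t Nv12V(const uint8_t* uv, int pair) { return uv[pair * 2 + 1]; }

// Packed 4:2:2: YUY2 is Y0,U,Y1,V per pixel pair; UYVY is U,Y0,V,Y1.
static uint8_t Yuy2Y(const uint8_t* p, int x) { return p[(x >> 1) * 4 + (x & 1) * 2]; }
static uint8_t Yuy2U(const uint8_t* p, int x) { return p[(x >> 1) * 4 + 1]; }
static uint8_t Yuy2V(const uint8_t* p, int x) { return p[(x >> 1) * 4 + 3]; }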
+ } + if (width & 1) { + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void UYVYToARGBRow_C(const uint8* src_uyvy, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], + rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_uyvy += 4; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], + rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void I422ToBGRARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 3, rgb_buf + 2, rgb_buf + 1); + rgb_buf[0] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 7, rgb_buf + 6, rgb_buf + 5); + rgb_buf[4] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 3, rgb_buf + 2, rgb_buf + 1); + rgb_buf[0] = 255; + } +} + +void I422ToABGRRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 6, rgb_buf + 5, rgb_buf + 4); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf[3] = 255; + } +} + +void I422ToRGBARow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 1, rgb_buf + 2, rgb_buf + 3); + rgb_buf[0] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], + rgb_buf + 5, rgb_buf + 6, rgb_buf + 7); + rgb_buf[4] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 1, rgb_buf + 2, rgb_buf + 3); + rgb_buf[0] = 255; + } +} + +void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + rgb_buf[7] = 255; + src_y += 2; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + rgb_buf[3] = 255; + } +} + +void MirrorRow_C(const uint8* src, uint8* dst, int width) { + int x; + src += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst[x] = src[0]; + dst[x + 1] = src[-1]; + src -= 2; + } + if (width & 1) { + dst[width - 1] = src[0]; + } +} + +void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + int x; + src_uv += (width - 1) << 1; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[-2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[-2 + 1]; + src_uv -= 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { + int x; + const uint32* src32 = (const uint32*)(src); + uint32* dst32 = (uint32*)(dst); + src32 += width - 1; + for (x = 0; x < width - 1; x += 2) { + dst32[x] = src32[0]; + dst32[x + 1] = src32[-1]; + src32 -= 2; + } + if (width & 1) { + dst32[width - 1] = src32[0]; + } +} + +void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_u[x] = src_uv[0]; + dst_u[x + 1] = src_uv[2]; + dst_v[x] = src_uv[1]; + dst_v[x + 1] = src_uv[3]; + src_uv += 4; + } + if (width & 1) { + dst_u[width - 1] = src_uv[0]; + dst_v[width - 1] = src_uv[1]; + } +} + +void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x]; + dst_uv[1] = src_v[x]; + dst_uv[2] = src_u[x + 1]; + dst_uv[3] = src_v[x + 1]; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1]; + dst_uv[1] = src_v[width - 1]; + } +} + +void CopyRow_C(const uint8* src, uint8* dst, int count) { + memcpy(dst, src, count); +} + +void CopyRow_16_C(const uint16* src, uint16* dst, int count) { + memcpy(dst, src, count * 2); +} + +void SetRow_C(uint8* dst, uint8 v8, int width) { + memset(dst, v8, width); +} + +void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { + uint32* d = (uint32*)(dst_argb); + int x; + for (x = 0; x < width; ++x) { + d[x] = v32; + } +} + +// Filter 2 rows of YUY2 UV's (422) into U and V (420). +void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values, filtering 2 rows of YUY2. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 UV's (422) into U and V (422). +void YUY2ToUV422Row_C(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_yuy2[1]; + dst_v[0] = src_yuy2[3]; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of YUY2 Y's (422) into Y (420/422). +void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_yuy2[0]; + dst_y[x + 1] = src_yuy2[2]; + src_yuy2 += 4; + } + if (width & 1) { + dst_y[width - 1] = src_yuy2[0]; + } +} + +// Filter 2 rows of UYVY UV's (422) into U and V (420). +void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. 
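// YUY2ToUVRow_C above (and UYVYToUVRow_C below) collapse 4:2:2 chroma to
// 4:2:0 by averaging the two source rows with (a + b + 1) >> 1; the +1
// makes the filter round to nearest instead of truncating toward zero.
// The helper spelled out (sketch, not a libyuv API):

#include <stdint.h>

static uint8_t AvgRound(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);  // AvgRound(1, 2) == 2, not 1.
}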
+ int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; + dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY UV's (422) into U and V (422). +void UYVYToUV422Row_C(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int width) { + // Output a row of UV values. + int x; + for (x = 0; x < width; x += 2) { + dst_u[0] = src_uyvy[0]; + dst_v[0] = src_uyvy[2]; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +// Copy row of UYVY Y's (422) into Y (420/422). +void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width - 1; x += 2) { + dst_y[x] = src_uyvy[1]; + dst_y[x + 1] = src_uyvy[3]; + src_uyvy += 4; + } + if (width & 1) { + dst_y[width - 1] = src_uyvy[1]; + } +} + +#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f + +// Blend src_argb0 over src_argb1 and store to dst_argb. +// dst_argb may be src_argb0 or src_argb1. +// This code mimics the SSSE3 version for better testability. +void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + uint32 fb = src_argb0[0]; + uint32 fg = src_argb0[1]; + uint32 fr = src_argb0[2]; + uint32 a = src_argb0[3]; + uint32 bb = src_argb1[0]; + uint32 bg = src_argb1[1]; + uint32 br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + + fb = src_argb0[4 + 0]; + fg = src_argb0[4 + 1]; + fr = src_argb0[4 + 2]; + a = src_argb0[4 + 3]; + bb = src_argb1[4 + 0]; + bg = src_argb1[4 + 1]; + br = src_argb1[4 + 2]; + dst_argb[4 + 0] = BLEND(fb, bb, a); + dst_argb[4 + 1] = BLEND(fg, bg, a); + dst_argb[4 + 2] = BLEND(fr, br, a); + dst_argb[4 + 3] = 255u; + src_argb0 += 8; + src_argb1 += 8; + dst_argb += 8; + } + + if (width & 1) { + uint32 fb = src_argb0[0]; + uint32 fg = src_argb0[1]; + uint32 fr = src_argb0[2]; + uint32 a = src_argb0[3]; + uint32 bb = src_argb1[0]; + uint32 bg = src_argb1[1]; + uint32 br = src_argb1[2]; + dst_argb[0] = BLEND(fb, bb, a); + dst_argb[1] = BLEND(fg, bg, a); + dst_argb[2] = BLEND(fr, br, a); + dst_argb[3] = 255u; + } +} +#undef BLEND +#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 + +// Multiply source RGB by alpha and store to destination. +// This code mimics the SSSE3 version for better testability. +void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + int i; + for (i = 0; i < width - 1; i += 2) { + uint32 b = src_argb[0]; + uint32 g = src_argb[1]; + uint32 r = src_argb[2]; + uint32 a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + b = src_argb[4]; + g = src_argb[5]; + r = src_argb[6]; + a = src_argb[7]; + dst_argb[4] = ATTENUATE(b, a); + dst_argb[5] = ATTENUATE(g, a); + dst_argb[6] = ATTENUATE(r, a); + dst_argb[7] = a; + src_argb += 8; + dst_argb += 8; + } + + if (width & 1) { + const uint32 b = src_argb[0]; + const uint32 g = src_argb[1]; + const uint32 r = src_argb[2]; + const uint32 a = src_argb[3]; + dst_argb[0] = ATTENUATE(b, a); + dst_argb[1] = ATTENUATE(g, a); + dst_argb[2] = ATTENUATE(r, a); + dst_argb[3] = a; + } +} +#undef ATTENUATE + +// Divide source RGB by alpha and store to destination. 
+// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. ie 125 +// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. +#define T(a) 0x01000000 + (0x10000 / a) +const uint32 fixed_invtbl8[256] = { + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), + T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), + T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), + T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), + T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), + T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), + T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), + T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), + T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), + T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), + T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), + T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), + T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), + T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), + T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), + T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), + T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), + T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), + T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), + T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), + T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), + T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), + T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), + T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), + T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), + T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), + T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), + T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 }; +#undef T + +void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + int i; + for (i = 0; i < width; ++i) { + uint32 b = src_argb[0]; + uint32 g = src_argb[1]; + uint32 r = src_argb[2]; + const uint32 a = src_argb[3]; + const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point + b = (b * ia) >> 8; + g = (g * ia) >> 8; + r = (r * ia) >> 8; + // Clamping should not be necessary but is free in assembly. 
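// fixed_invtbl8 above stores 0x10000 / a in its low 16 bits, so dividing a
// premultiplied channel by alpha becomes a multiply and a shift:
// (c * (0x10000 / a)) >> 8 == c * 256 / a. One channel of that in isolation
// (sketch only; note the table's a == 0 entry has zero low bits, so a fully
// transparent pixel unattenuates to 0):

#include <stdint.h>

static uint8_t UnattenuateSketch(uint8_t c, uint8_t a,
                                 const uint32_t* invtbl) {
  uint32_t ia = invtbl[a] & 0xffff;  // 8.8 fixed-point reciprocal of a.
  uint32_t v = (c * ia) >> 8;        // approximately c * 256 / a.
  return (uint8_t)(v > 255 ? 255 : v);
}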
+ dst_argb[0] = clamp255(b); + dst_argb[1] = clamp255(g); + dst_argb[2] = clamp255(r); + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + int32 row_sum[4] = {0, 0, 0, 0}; + int x; + for (x = 0; x < width; ++x) { + row_sum[0] += row[x * 4 + 0]; + row_sum[1] += row[x * 4 + 1]; + row_sum[2] += row[x * 4 + 2]; + row_sum[3] += row[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + } +} + +void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl, + int w, int area, uint8* dst, int count) { + float ooa = 1.0f / area; + int i; + for (i = 0; i < count; ++i) { + dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); + dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); + dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); + dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst += 4; + tl += 4; + bl += 4; + } +} + +// Copy pixels from rotated source to destination row with a slope. +LIBYUV_API +void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width) { + int i; + // Render a row of pixels from source into a buffer. + float uv[2]; + uv[0] = uv_dudv[0]; + uv[1] = uv_dudv[1]; + for (i = 0; i < width; ++i) { + int x = (int)(uv[0]); + int y = (int)(uv[1]); + *(uint32*)(dst_argb) = + *(const uint32*)(src_argb + y * src_argb_stride + + x * 4); + dst_argb += 4; + uv[0] += uv_dudv[2]; + uv[1] += uv_dudv[3]; + } +} + +// Blend 2 rows into 1. +static void HalfRow_C(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + int x; + for (x = 0; x < pix; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride, + uint16* dst_uv, int pix) { + int x; + for (x = 0; x < pix; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + +// C version 2x2 -> 2x1. 
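// ComputeCumulativeSumRow_C / CumulativeSumToAverageRow_C above form a
// summed-area table: each entry holds the sum of all pixels above and to
// the left, so the total of any rectangle costs four reads no matter how
// large the box is. The four-corner lookup those routines rely on, for one
// channel (sketch; tl/bl point into the top and bottom table rows and w is
// the box width in table elements, i.e. pixels * 4):

#include <stdint.h>

static int32_t BoxSumSketch(const int32_t* tl, const int32_t* bl, int w) {
  return bl[w] + tl[0] - bl[0] - tl[w];
}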
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, + int width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + int x; + if (source_y_fraction == 0) { + memcpy(dst_ptr, src_ptr, width); + return; + } + if (source_y_fraction == 128) { + HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width); + return; + } + for (x = 0; x < width - 1; x += 2) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; + } + if (width & 1) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + } +} + +void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride, + int width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint16* src_ptr1 = src_ptr + src_stride; + int x; + if (source_y_fraction == 0) { + memcpy(dst_ptr, src_ptr, width * 2); + return; + } + if (source_y_fraction == 128) { + HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width); + return; + } + for (x = 0; x < width - 1; x += 2) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; + } + if (width & 1) { + dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + } +} + +// Use first 4 shuffler values to reorder ARGB channels. +void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + int index0 = shuffler[0]; + int index1 = shuffler[1]; + int index2 = shuffler[2]; + int index3 = shuffler[3]; + // Shuffle a row of ARGB. + int x; + for (x = 0; x < pix; ++x) { + // To support in-place conversion. + uint8 b = src_argb[index0]; + uint8 g = src_argb[index1]; + uint8 r = src_argb[index2]; + uint8 a = src_argb[index3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + +void I422ToYUY2Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[1]; + dst_frame[3] = src_v[0]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = 0; + dst_frame[3] = src_v[0]; + } +} + +void I422ToUYVYRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[1]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = 0; + } +} + +// Maximum temporary width for wrappers to process at a time, in pixels. +#define MAXTWIDTH 2048 + +#if !(defined(_MSC_VER) && !defined(__clang__)) && \ + defined(HAS_I422TORGB565ROW_SSSE3) +// row_win.cc has asm version, but GCC uses 2 step wrapper. 
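// InterpolateRow_C above blends two rows with an 8-bit fraction:
// out = (src0 * (256 - f) + src1 * f) >> 8, short-circuiting f == 0 to a
// copy and f == 128 to the rounded HalfRow_C. The per-sample arithmetic
// (sketch; f is assumed to be in 0..256):

#include <stdint.h>

static uint8_t LerpU8(uint8_t a, uint8_t b, int f) {
  return (uint8_t)((a * (256 - f) + b * f) >> 8);
}
// e.g. LerpU8(0, 255, 64) == 63: one quarter of the way from a to b.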
+void I422ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB1555ROW_SSSE3) +void I422ToARGB1555Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB4444ROW_SSSE3) +void I422ToARGB4444Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB565ROW_SSSE3) +void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB565ROW_SSSE3) +void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_YUY2TOARGBROW_SSSE3) +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth); + YUY2ToYRow_SSE2(src_yuy2, row_y, twidth); + I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth); + src_yuy2 += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } +} +#endif + +#if defined(HAS_UYVYTOARGBROW_SSSE3) +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. 
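// Each wrapper above has the same skeleton: run the fast YUV-to-ARGB kernel
// into an aligned scratch row of at most MAXTWIDTH pixels, run a second
// kernel from ARGB to the destination format, and loop until the row is
// consumed. Generalized with hypothetical function-pointer steps
// (illustrative sketch; the real wrappers also step chroma at half rate):

#include <stdint.h>

enum { kTWidth = 2048 };  // mirrors MAXTWIDTH above

typedef void (*ToArgbFn)(const uint8_t* src, uint8_t* argb, int width);
typedef void (*FromArgbFn)(const uint8_t* argb, uint8_t* dst, int width);

static void TwoStepRowSketch(ToArgbFn first, FromArgbFn second,
                             const uint8_t* src, int src_bpp,
                             uint8_t* dst, int dst_bpp, int width) {
  uint8_t row[kTWidth * 4];  // intermediate ARGB scratch
  while (width > 0) {
    int twidth = width > kTWidth ? kTWidth : width;
    first(src, row, twidth);
    second(row, dst, twidth);
    src += twidth * src_bpp;
    dst += twidth * dst_bpp;
    width -= twidth;
  }
}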
+ SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth); + UYVYToYRow_SSE2(src_uyvy, row_y, twidth); + I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth); + src_uyvy += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } +} +#endif // !defined(LIBYUV_DISABLE_X86) + +#if defined(HAS_I422TORGB565ROW_AVX2) +void I422ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB1555ROW_AVX2) +void I422ToARGB1555Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB4444ROW_AVX2) +void I422ToARGB4444Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORGB24ROW_AVX2) +void I422ToRGB24Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + // TODO(fbarchard): ARGBToRGB24Row_AVX2 + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORAWROW_AVX2) +void I422ToRAWRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + // TODO(fbarchard): ARGBToRAWRow_AVX2 + ARGBToRAWRow_SSSE3(row, dst_raw, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_raw += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB565ROW_AVX2) +void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. 
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth); + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB565ROW_AVX2) +void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth); + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_YUY2TOARGBROW_AVX2) +void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth); + YUY2ToYRow_AVX2(src_yuy2, row_y, twidth); + I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth); + src_yuy2 += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } +} +#endif + +#if defined(HAS_UYVYTOARGBROW_AVX2) +void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth); + UYVYToYRow_AVX2(src_uyvy, row_y, twidth); + I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth); + src_uyvy += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } +} +#endif // !defined(LIBYUV_DISABLE_X86) + +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + int i; + for (i = 0; i < width; ++i) { + float b = (float)(src_argb[0]); + float g = (float)(src_argb[1]); + float r = (float)(src_argb[2]); + float a = (float)(src_argb[3]); + float b2 = b * b; + float g2 = g * g; + float r2 = r * r; + float a2 = a * a; + float db = poly[0] + poly[4] * b; + float dg = poly[1] + poly[5] * g; + float dr = poly[2] + poly[6] * r; + float da = poly[3] + poly[7] * a; + float b3 = b2 * b; + float g3 = g2 * g; + float r3 = r2 * r; + float a3 = a2 * a; + db += poly[8] * b2; + dg += poly[9] * g2; + dr += poly[10] * r2; + da += poly[11] * a2; + db += poly[12] * b3; + dg += poly[13] * g3; + dr += poly[14] * r3; + da += poly[15] * a3; + + dst_argb[0] = Clamp((int32)(db)); + dst_argb[1] = Clamp((int32)(dg)); + dst_argb[2] = Clamp((int32)(dr)); + dst_argb[3] = Clamp((int32)(da)); + src_argb += 4; + dst_argb += 4; + } +} + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, uint32 lumacoeff) { + uint32 bc = lumacoeff & 0xff; + uint32 gc = (lumacoeff >> 8) & 0xff; + uint32 rc = (lumacoeff >> 16) & 0xff; + + int i; + for (i = 0; i < width - 1; i += 2) { + // Luminance in rows, color values in columns. 
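+    // The luma table holds 128 rows of 256 bytes: the weighted B,G,R sum,
+    // masked to bits 8..14 (0x7F00), selects a row, and each of B, G and R
+    // is then remapped through that row.  Alpha is copied through unchanged.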
+    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+                           src_argb[2] * rc) & 0x7F00u) + luma;
+    const uint8* luma1;
+    dst_argb[0] = luma0[src_argb[0]];
+    dst_argb[1] = luma0[src_argb[1]];
+    dst_argb[2] = luma0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+    luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
+              src_argb[6] * rc) & 0x7F00u) + luma;
+    dst_argb[4] = luma1[src_argb[4]];
+    dst_argb[5] = luma1[src_argb[5]];
+    dst_argb[6] = luma1[src_argb[6]];
+    dst_argb[7] = src_argb[7];
+    src_argb += 8;
+    dst_argb += 8;
+  }
+  if (width & 1) {
+    // Luminance in rows, color values in columns.
+    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+                           src_argb[2] * rc) & 0x7F00u) + luma;
+    dst_argb[0] = luma0[src_argb[0]];
+    dst_argb[1] = luma0[src_argb[1]];
+    dst_argb[2] = luma0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+  }
+}
+
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    dst[3] = src[3];
+    dst[7] = src[7];
+    dst += 8;
+    src += 8;
+  }
+  if (width & 1) {
+    dst[3] = src[3];
+  }
+}
+
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    dst[3] = src[0];
+    dst[7] = src[1];
+    dst += 8;
+    src += 2;
+  }
+  if (width & 1) {
+    dst[3] = src[0];
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/row_gcc.cc b/libs/libaom/src/third_party/libyuv/source/row_gcc.cc
new file mode 100644
index 000000000..820de0a1c
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/row_gcc.cc
@@ -0,0 +1,5475 @@
+// VERSION 2
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// Constants for ARGB
+static vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
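In scalar terms, the two Y conversions driven by these tables work out as
below (a sketch for reference; the helper names are illustrative, not libyuv
APIs; the SIMD kernels later in this file compute the same values 16 pixels
at a time with pmaddubsw/phaddw/psrlw):

// Limited-range BT.601 Y: weights sum to 111/128, then a +16 bias (kAddY16).
static uint8 ScalarARGBToY(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}

// JPEG full-range Y (kARGBToYJ + kAddYJ64): weights sum to 128/128, with a
// rounding bias of 64 (0.5 in 7-bit fixed point) and no +16 offset.
static uint8 ScalarARGBToYJ(uint8 b, uint8 g, uint8 r) {
  return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);
}

+// JPEG full range.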
+static vec8 kARGBToYJ = { + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 +}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +static vec8 kARGBToU = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + +static vec8 kARGBToUJ = { + 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 +}; + +static vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +static vec8 kARGBToVJ = { + -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 +}; + +// Constants for BGRA +static vec8 kBGRAToY = { + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 +}; + +static vec8 kBGRAToU = { + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 +}; + +static vec8 kBGRAToV = { + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 +}; + +// Constants for ABGR +static vec8 kABGRToY = { + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 +}; + +static vec8 kABGRToU = { + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 +}; + +static vec8 kABGRToV = { + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 +}; + +// Constants for RGBA. +static vec8 kRGBAToY = { + 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 +}; + +static vec8 kRGBAToU = { + 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 +}; + +static vec8 kRGBAToV = { + 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 +}; + +static uvec8 kAddY16 = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u +}; + +// 7 bit fixed point 0.5. +static vec16 kAddYJ64 = { + 64, 64, 64, 64, 64, 64, 64, 64 +}; + +static uvec8 kAddUV128 = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +static uvec16 kAddUVJ128 = { + 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u +}; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +#ifdef HAS_RGB24TOARGBROW_SSSE3 + +// Shuffle table for converting RGB24 to ARGB. +static uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u +}; + +// Shuffle table for converting RAW to ARGB. +static uvec8 kShuffleMaskRAWToARGB = { + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u +}; + +// Shuffle table for converting ARGB to RGB24. +static uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGB to RAW. +static uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u +}; + +// Shuffle table for converting ARGB to RAW. 
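These masks (including the kShuffleMaskARGBToRAW_0 variant that follows) are
pshufb control vectors. A scalar model of pshufb's per-byte semantics, as a
reading aid (the helper name is illustrative, not a libyuv API):

// pshufb: a mask byte with the high bit set (here 128u) zeroes the output
// byte; otherwise its low four bits select a source byte.
static void ScalarPshufb(const uint8 mask[16], const uint8 in[16],
                         uint8 out[16]) {
  for (int i = 0; i < 16; ++i) {
    out[i] = (mask[i] & 0x80) ? 0 : in[mask[i] & 0x0f];
  }
}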
+static uvec8 kShuffleMaskARGBToRAW_0 = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u +}; +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#if defined(TESTING) && defined(__x86_64__) +void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + asm volatile ( + ".p2align 5 \n" + "mov %%eax,%%eax \n" + "mov %%ebx,%%ebx \n" + "mov %%ecx,%%ecx \n" + "mov %%edx,%%edx \n" + "mov %%esi,%%esi \n" + "mov %%edi,%%edi \n" + "mov %%ebp,%%ebp \n" + "mov %%esp,%%esp \n" + ".p2align 5 \n" + "mov %%r8d,%%r8d \n" + "mov %%r9d,%%r9d \n" + "mov %%r10d,%%r10d \n" + "mov %%r11d,%%r11d \n" + "mov %%r12d,%%r12d \n" + "mov %%r13d,%%r13d \n" + "mov %%r14d,%%r14d \n" + "mov %%r15d,%%r15d \n" + ".p2align 5 \n" + "lea (%%rax),%%eax \n" + "lea (%%rbx),%%ebx \n" + "lea (%%rcx),%%ecx \n" + "lea (%%rdx),%%edx \n" + "lea (%%rsi),%%esi \n" + "lea (%%rdi),%%edi \n" + "lea (%%rbp),%%ebp \n" + "lea (%%rsp),%%esp \n" + ".p2align 5 \n" + "lea (%%r8),%%r8d \n" + "lea (%%r9),%%r9d \n" + "lea (%%r10),%%r10d \n" + "lea (%%r11),%%r11d \n" + "lea (%%r12),%%r12d \n" + "lea (%%r13),%%r13d \n" + "lea (%%r14),%%r14d \n" + "lea (%%r15),%%r15d \n" + + ".p2align 5 \n" + "lea 0x10(%%rax),%%eax \n" + "lea 0x10(%%rbx),%%ebx \n" + "lea 0x10(%%rcx),%%ecx \n" + "lea 0x10(%%rdx),%%edx \n" + "lea 0x10(%%rsi),%%esi \n" + "lea 0x10(%%rdi),%%edi \n" + "lea 0x10(%%rbp),%%ebp \n" + "lea 0x10(%%rsp),%%esp \n" + ".p2align 5 \n" + "lea 0x10(%%r8),%%r8d \n" + "lea 0x10(%%r9),%%r9d \n" + "lea 0x10(%%r10),%%r10d \n" + "lea 0x10(%%r11),%%r11d \n" + "lea 0x10(%%r12),%%r12d \n" + "lea 0x10(%%r13),%%r13d \n" + "lea 0x10(%%r14),%%r14d \n" + "lea 0x10(%%r15),%%r15d \n" + + ".p2align 5 \n" + "add 0x10,%%eax \n" + "add 0x10,%%ebx \n" + "add 0x10,%%ecx \n" + "add 0x10,%%edx \n" + "add 0x10,%%esi \n" + "add 0x10,%%edi \n" + "add 0x10,%%ebp \n" + "add 0x10,%%esp \n" + ".p2align 5 \n" + "add 0x10,%%r8d \n" + "add 0x10,%%r9d \n" + "add 0x10,%%r10d \n" + "add 0x10,%%r11d \n" + "add 0x10,%%r12d \n" + "add 0x10,%%r13d \n" + "add 0x10,%%r14d \n" + "add 0x10,%%r15d \n" + + ".p2align 2 \n" + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5" + ); +} +#endif // TESTING + +#ifdef HAS_J400TOARGBROW_SSE2 +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_J400TOARGBROW_SSE2 + +#ifdef HAS_RGB24TOARGBROW_SSSE3 +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" + "lea " MEMLEA(0x30,0) ",%0 \n" + 
"movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" + "lea " MEMLEA(0x30,0) ",%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) + MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw 
$0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) + MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) + MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "movdqa %3,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x30,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "movdqa %3,%%xmm6 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa 
%%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x30,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1b,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x5,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pslld $0xa,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "pslld $0xf,%%xmm7 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "psrad $0x10,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x6,%%xmm2 \n" + "psrld $0x9,%%xmm3 \n" + "pand %%xmm7,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm6,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + :: "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xc,%%xmm4 \n" + "movdqa %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm3 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm3,%%xmm0 \n" + "pand %%xmm4,%%xmm1 \n" + "psrlq $0x4,%%xmm0 \n" + "psrlq $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + ); +} +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#ifdef HAS_ARGBTOYROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
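+// pmaddubsw multiplies the unsigned ARGB bytes by the signed kARGBToY
+// weights and sums adjacent pairs (B*13 + G*65 and R*33 + A*0); phaddw
+// then completes the per-pixel dot product, psrlw $7 scales the result,
+// and paddb applies the +16 bias from kAddY16.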
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBTOYROW_SSSE3 + +#ifdef HAS_ARGBTOYJROW_SSSE3 +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBTOYJROW_SSSE3 + +#ifdef HAS_ARGBTOYROW_AVX2 +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = { + 0, 4, 1, 5, 2, 6, 3, 7 +}; + +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" + "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea " MEMLEA(0x80,0) ",%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. 
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ARGBTOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" + "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea " MEMLEA(0x80,0) ",%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. + "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} +#endif // HAS_ARGBTOYJROW_AVX2 + +#ifdef HAS_ARGBTOUVROW_SSSE3 +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + 
"m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBTOUVROW_SSSE3 + +#ifdef HAS_ARGBTOUVROW_AVX2 +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +}; +void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" + "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" + VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 + VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) + VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) + VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) + "lea " MEMLEA(0x80,0) ",%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(kARGBToV), // %6 + "m"(kARGBToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBTOUVROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_SSSE3 +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 
\n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToVJ), // %5 + "m"(kARGBToUJ), // %6 + "m"(kAddUVJ128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBTOUVJROW_SSSE3 + +#ifdef HAS_ARGBTOUV444ROW_SSSE3 +void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6" + ); +} +#endif // HAS_ARGBTOUV444ROW_SSSE3 + +#ifdef HAS_ARGBTOUV422ROW_SSSE3 +void ARGBToUV422Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 
\n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBTOUV422ROW_SSSE3 + +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)), // %4 + "m"(kBGRAToV), // %5 + "m"(kBGRAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" + ); +} + +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 
\n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToV), // %5 + "m"(kABGRToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" + ); +} + +void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) 
",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + + "lea " MEMLEA(0x40,0) ",%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)), // %4 + "m"(kRGBAToV), // %5 + "m"(kRGBAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" + ); +} + +#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) + +struct YuvConstants { + lvec8 kUVToB; // 0 + lvec8 kUVToG; // 32 + lvec8 kUVToR; // 64 + lvec16 kUVBiasB; // 96 + lvec16 kUVBiasG; // 128 + lvec16 kUVBiasR; // 160 + lvec16 kYToRgb; // 192 +}; + +// BT.601 YUV to RGB reference +// R = (Y - 16) * 1.164 - V * -1.596 +// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 +// B = (Y - 16) * 1.164 - U * -2.018 + +// Y contribution to R,G,B. Scale and bias. +// TODO(fbarchard): Consider moving constants into a common header. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +// U and V contributions to R,G,B. +#define UB -128 /* max(-128, round(-2.018 * 64)) */ +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ +#define VR -102 /* round(-1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + +// BT601 constants for YUV to RGB. +static YuvConstants SIMD_ALIGNED(kYuvConstants) = { + { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, + { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, + { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, + { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, + { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, + { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, + { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } +}; + +// BT601 constants for NV21 where chroma plane is VU instead of UV. 
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// * R = Y                - V * -1.40200
+// * G = Y - U *  0.34414 - V *  0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B.  Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32   /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22   /* round(0.34414 * 64) */
+#define VGJ 46   /* round(0.71414 * 64) */
+#define VRJ -90  /* round(-1.40200 * 64) */
+
+// Bias values: round, and subtract 128 from U and V (full-range Y has no
+// 16 offset here).
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// JPEG constants for YUV to RGB.
+YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
+  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
+  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
+  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
+  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
+  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
+  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
+};
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
+// Read 8 UV from 444
+#define READYUV444 \
+    "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+    "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+    "punpcklbw %%xmm1,%%xmm0 \n"
+
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422 \
+    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+    "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
+    "punpcklbw %%xmm1,%%xmm0 \n" \
+    "punpcklwd %%xmm0,%%xmm0 \n"
+
+// Read 2 UV from 411, upsample to 8 UV
+#define READYUV411 \
+    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+    "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
+    "punpcklbw %%xmm1,%%xmm0 \n" \
+    "punpcklwd %%xmm0,%%xmm0 \n" \
+    "punpckldq %%xmm0,%%xmm0 \n"
+
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12 \
+    "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
+    "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
+    "punpcklwd %%xmm0,%%xmm0 \n"
+
+// Convert 8 pixels: 8 UV and 8 Y
+#define
YUVTORGB(YuvConstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \ + "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \ + "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \ + "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ + "punpcklbw %%xmm3,%%xmm3 \n" \ + "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + +// Store 8 ARGB values. Assumes XMM5 is zero. +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm5,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ + "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ + "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" + +// Store 8 BGRA values. Assumes XMM5 is zero. +#define STOREBGRA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm0,%%xmm1 \n" \ + "punpcklbw %%xmm2,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \ + "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \ + "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n" + +// Store 8 ABGR values. Assumes XMM5 is zero. +#define STOREABGR \ + "punpcklbw %%xmm1,%%xmm2 \n" \ + "punpcklbw %%xmm5,%%xmm0 \n" \ + "movdqa %%xmm2,%%xmm1 \n" \ + "punpcklwd %%xmm0,%%xmm2 \n" \ + "punpckhwd %%xmm0,%%xmm1 \n" \ + "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \ + "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \ + "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n" + +// Store 8 RGBA values. Assumes XMM5 is zero. +#define STORERGBA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm2,%%xmm1 \n" \ + "punpcklbw %%xmm0,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ + "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ + "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" + +void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV444 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +// TODO(fbarchard): Consider putting masks into constants. 
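In scalar terms, the STOREARGB weave above amounts to the following
(an illustrative helper, not a libyuv API; the macro performs the same
interleave with punpcklbw/punpcklwd on 8 pixels at once, with xmm5's
pcmpeqb result supplying the 0xff alpha bytes):

static void ScalarStoreARGB(const uint8 b[8], const uint8 g[8],
                            const uint8 r[8], uint8* dst_argb) {
  for (int i = 0; i < 8; ++i) {
    dst_argb[4 * i + 0] = b[i];
    dst_argb[4 * i + 1] = g[i];
    dst_argb[4 * i + 2] = r[i];
    dst_argb[4 * i + 3] = 255;  // opaque alpha
  }
}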
+void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgb24, + int width) { + asm volatile ( + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" + "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] +// TODO(fbarchard): Make width a register for 32 bit. +#if defined(__i386__) && defined(__pic__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [kYuvConstants]"r"(&kYuvConstants.kUVToB), + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" + ); +} + +void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_raw, + int width) { + asm volatile ( + "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" + "sub %[u_buf],%[v_buf] \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0," MEMACCESS([dst_raw]) " \n" + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" + "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_raw]"+r"(dst_raw), // %[dst_raw] +// TODO(fbarchard): Make width a register for 32 bit. 
+#if defined(__i386__) && defined(__pic__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [kYuvConstants]"r"(&kYuvConstants.kUVToB), + [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), + [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" + ); +} + +void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV411 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READNV12 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + // Does not use r14. + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READNV12 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants] + // Does not use r14. 
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + STOREBGRA + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + STOREABGR + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + STORERGBA + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +#endif // HAS_I422TOARGBROW_SSSE3 + +// Read 8 UV from 422, upsample to 16 UV. +#define READYUV422_AVX2 \ + "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" + +// Convert 16 pixels: 16 UV and 16 Y. 
+#define YUVTORGB_AVX2(YuvConstants) \ + "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \ + "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \ + "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ + "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ + "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ + "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ + "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ + "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ + "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ + "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ + "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ + "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ + "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \ + "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \ + "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + +#if defined(HAS_I422TOBGRAROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). +void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into BGRA + "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels + "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels + + "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" + "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_I422TOBGRAROW_AVX2 + +#if defined(HAS_I422TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
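// (Illustrative sketch, not from the original source: YUVTORGB_AVX2 leaves
// packed B, G and R vectors in ymm0/ymm1/ymm2, and the "weave" step in the
// rows below interleaves them with an all-ones alpha into little-endian
// ARGB memory order, i.e. B,G,R,A per pixel.  Scalar equivalent:)
static void WeaveARGBRef(const uint8_t* b, const uint8_t* g, const uint8_t* r,
                         uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[4 * x + 0] = b[x];  // vpunpcklbw pairs B with G ...
    dst_argb[4 * x + 1] = g[x];
    dst_argb[4 * x + 2] = r[x];  // ... vpunpcklwd then pairs BG with RA.
    dst_argb[4 * x + 3] = 255;   // alpha from the vpcmpeqb all-ones vector
  }
}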
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into ARGB + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels + + "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" + "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" + "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_I422TOARGBROW_AVX2 + +#if defined(HAS_J422TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into ARGB + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels + + "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n" + "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n" + "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_J422TOARGBROW_AVX2 + +#if defined(HAS_I422TOABGRROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). 
+void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into ABGR + "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels + "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels + "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" + "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_I422TOABGRROW_AVX2 + +#if defined(HAS_I422TORGBAROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). +void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into RGBA + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" + "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" + "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_I422TORGBAROW_AVX2 + +#ifdef HAS_I400TOARGBROW_SSE2 +void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { + asm volatile ( + "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "psubusw %%xmm3,%%xmm0 \n"
+ "psrlw $6, %%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_I400TOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+ asm volatile (
+ "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
+ "vmovd %%eax,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+ "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "vmovd %%eax,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
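// (Illustrative sketch, not from the original source: MirrorRow copies a row
// with its byte order reversed; kShuffleMirror below lets pshufb reverse 16
// bytes per instruction.  Scalar equivalent:)
static void MirrorRowRef(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];  // last source byte becomes first dest byte
  }
}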
+static uvec8 kShuffleMirror = { + 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "movdqa %3,%%xmm5 \n" + LABELALIGN + "1: \n" + MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm5" + ); +} +#endif // HAS_MIRRORROW_SSSE3 + +#ifdef HAS_MIRRORROW_AVX2 +void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "vbroadcastf128 %3,%%ymm5 \n" + LABELALIGN + "1: \n" + MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0 + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm5" + ); +} +#endif // HAS_MIRRORROW_AVX2 + +#ifdef HAS_MIRRORROW_SSE2 +void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + LABELALIGN + "1: \n" + MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 + "movdqa %%xmm0,%%xmm1 \n" + "psllw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshufd $0x4e,%%xmm0,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1)",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1" + ); +} +#endif // HAS_MIRRORROW_SSE2 + +#ifdef HAS_MIRRORROW_UV_SSSE3 +// Shuffle table for reversing the bytes of UV channels. 
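// (Illustrative sketch, not from the original source: MirrorUVRow walks an
// interleaved UV row backwards and splits it into reversed U and V planes,
// which kShuffleMirrorUV below arranges 8 pairs at a time.  Scalar form:)
static void MirrorUVRowRef(const uint8_t* src_uv, uint8_t* dst_u,
                           uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * (width - 1 - x) + 0];  // U bytes, reversed
    dst_v[x] = src_uv[2 * (width - 1 - x) + 1];  // V bytes, reversed
  }
}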
+static uvec8 kShuffleMirrorUV = { + 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u +}; +void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, + int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "movdqa %4,%%xmm1 \n" + "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1" + ); +} +#endif // HAS_MIRRORROW_UV_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSE2 + +void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc" + , "xmm0" + ); +} +#endif // HAS_ARGBMIRRORROW_SSE2 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. +static const ulvec32 kARGBShuffleMirror_AVX2 = { + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; +void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { + intptr_t temp_width = (intptr_t)(width); + asm volatile ( + "vmovdqu %3,%%ymm5 \n" + LABELALIGN + "1: \n" + VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm5" + ); +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_AVX2 +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_SPLITUVROW_AVX2 + +#ifdef HAS_SPLITUVROW_SSE2 +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand 
%%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_MERGEUVROW_AVX2 +void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 + "lea " MEMLEA(0x20,0) ",%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" + "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" + "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" + "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" + "lea " MEMLEA(0x40,2) ",%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2" + ); +} +#endif // HAS_MERGEUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + "sub %0,%1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2" + ); +} +#endif // HAS_MERGEUVROW_SSE2 + +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1" + ); +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_AVX +void CopyRow_AVX(const uint8* src, uint8* dst, int count) { + asm volatile ( + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1" + ); +} +#endif // HAS_COPYROW_AVX + +#ifdef HAS_COPYROW_ERMS +// Multiple of 1. 
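// (Illustrative sketch, not from the original source, for the SplitUVRow and
// MergeUVRow functions above: SplitUVRow separates interleaved UV into two
// planes with the pand/psrlw mask split, and MergeUVRow is its
// punpcklbw/punpckhbw inverse.  Scalar equivalents:)
static void SplitUVRowRef(const uint8_t* src_uv, uint8_t* dst_u,
                          uint8_t* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  // even bytes (the pand 0x00ff halves)
    dst_v[x] = src_uv[2 * x + 1];  // odd bytes (the psrlw 8 halves)
  }
}

static void MergeUVRowRef(const uint8_t* src_u, const uint8_t* src_v,
                          uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}
// ("Multiple of 1" below: CopyRow_ERMS copies any byte count with rep movsb.)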
+void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "rep movsb " MEMMOVESTRING(0,1) " \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc" + ); +} +#endif // HAS_COPYROW_ERMS + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1," MEMACCESS(1) " \n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm2" + ); +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm4 \n" + "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + LABELALIGN + "1: \n" + "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" + "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0," 
MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1," MEMACCESS(1) " \n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm2" + ); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +void SetRow_X86(uint8* dst, uint8 v8, int width) { + size_t width_tmp = (size_t)(width >> 2); + const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes. + asm volatile ( + "rep stosl " MEMSTORESTRING(eax,0) " \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} + +void SetRow_ERMS(uint8* dst, uint8 v8, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "rep stosb " MEMSTORESTRING(al,0) " \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); +} + +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "rep stosl " MEMSTORESTRING(eax,0) " \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_SSE2 +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm5" + ); +} + +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq 
%%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} + +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1" + ); +} + +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_YUY2TOYROW_AVX2 +void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm5" + ); +} + +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + 
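  // (Editor's note: YUY2 stores bytes as Y0,U,Y1,V.  The loop below averages
  // the chroma of two source rows with vpavgb against the stride-offset row,
  // shifts out the luma bytes with the word-wise >>8, then splits the
  // remaining U/V pairs into the separate U and V planes.)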
asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 + VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} + +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} + +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm5" + ); +} +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 + VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 
\n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} + +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_YUY2TOYROW_AVX2 + +#ifdef HAS_ARGBBLENDROW_SSE2 +// Blend 8 pixels at a time. +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + + // 4 pixel loop. + LABELALIGN + "41: \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 41b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. 
+ "91: \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBBLENDROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. +static uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; + +// Blend 8 pixels at a time +// Shuffle table for reversing the bytes. + +// Same as SSE2, but replaces +// psrlw xmm3, 8 // alpha +// pshufhw xmm3, xmm3,0F5h // 8 alpha words +// pshuflw xmm3, xmm3,0F5h +// with.. +// pshufb xmm3, kShuffleAlpha // alpha + +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. + "91: \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_SSE2 +// Attenuate 4 pixels at a time. 
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x8,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pshufhw $0xff,%%xmm0,%%xmm2 \n" + "pshuflw $0xff,%%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pshufhw $0xff,%%xmm1,%%xmm2 \n" + "pshuflw $0xff,%%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm4,%%xmm2 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBATTENUATEROW_SSE2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha +static uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u +}; +static uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u +}; +// Attenuate 4 pixels at a time. +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha_AVX2 = { + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u +}; +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "vbroadcastf128 %3,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 8 pixel loop. 
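  // (Editor's note: vpunpcklbw/vpunpckhbw widen each byte v to the word
  // v*0x0101; vpmulhuw by the duplicated alpha word a*0x0101 followed by the
  // vpsrlw 8 yields (v*0x0101 * a*0x0101) >> 24, a close approximation of
  // v * a / 255.  The vpand/vpor with ymm5 re-attaches the original alpha.)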
+ LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) + "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha_AVX2) // %3 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} +#endif // HAS_ARGBATTENUATEROW_AVX2 + +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 +// Unattenuate 4 pixels at a time. +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, + int width) { + uintptr_t alpha = 0; + asm volatile ( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movzb " MEMACCESS2(0x03,0) ",%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x07,0) ",%3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "movzb " MEMACCESS2(0x0b,0) ",%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x0f,0) ",%3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "+r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBUNATTENUATEROW_SSE2 + +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u +}; +// Unattenuate 8 pixels at a time. +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, + int width) { + uintptr_t alpha = 0; + asm volatile ( + "sub %0,%1 \n" + "vbroadcastf128 %5,%%ymm5 \n" + + // 8 pixel loop. 
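  // (Editor's note: the movzb/vmovd ladder below emulates VPGATHER, fetching
  // a fixed-point reciprocal of each pixel's alpha from fixed_invtbl8 so
  // that every channel is scaled by roughly 255 / a, undoing
  // ARGBAttenuateRow.)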
+ LABELALIGN + "1: \n" + // replace VPGATHER + "movzb " MEMACCESS2(0x03,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 + "movzb " MEMACCESS2(0x07,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 + "movzb " MEMACCESS2(0x0b,0) ",%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x0f,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 + "movzb " MEMACCESS2(0x13,0) ",%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 + "movzb " MEMACCESS2(0x17,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 + "movzb " MEMACCESS2(0x1b,0) ",%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x1f,0) ",%3 \n" + MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" + "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" + "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" + // end of VPGATHER + + "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) + "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "+r"(alpha) // %3 + : "r"(fixed_invtbl8), // %4 + "m"(kUnattenShuffleAlpha_AVX2) // %5 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBUNATTENUATEROW_AVX2 + +#ifdef HAS_ARGBGRAYROW_SSSE3 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 8 pixel loop. 
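  // (Editor's note: each pass handles 8 pixels, i.e. 32 bytes.  pmaddubsw
  // with the kARGBToYJ weights plus the kAddYJ64 rounding term computes a
  // full-range luma, nominally Y = (B*15 + G*75 + R*38 + 64) >> 7 with the
  // constants defined earlier in this file; Y is then replicated into B, G
  // and R while the source alpha is re-attached.)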
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone
+static vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
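// (Illustrative sketch, not from the original source: the row below applies
// a signed 4x4 byte matrix, one 4-byte column per output channel, using
// pmaddubsw/phaddsw and an arithmetic >>6.  A scalar reference, assuming
// the same column-per-channel matrix layout:)
static void ARGBColorMatrixRowRef(const uint8_t* src_argb, uint8_t* dst_argb,
                                  const int8_t* m, int width) {
  for (int x = 0; x < width; ++x) {
    const uint8_t* p = src_argb + 4 * x;
    for (int c = 0; c < 4; ++c) {  // c = 0..3 for output B, G, R, A
      int v = (p[0] * m[4 * c + 0] + p[1] * m[4 * c + 1] +
               p[2] * m[4 * c + 2] + p[3] * m[4 * c + 3]) >> 6;
      dst_argb[4 * x + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}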
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + asm volatile ( + "movdqu " MEMACCESS(3) ",%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" + + // 4 pixel loop. 
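  // (Editor's note: 'value' is an ARGB scale factor, one byte per channel.
  // The punpcklbw above widened each scale byte s to the word s*0x0101, so
  // the pmulhuw plus psrlw 8 in the loop computes roughly v * s / 255 per
  // channel, the same trick ARGBAttenuateRow uses with a constant scale.)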
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2"
+ );
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__AVX2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
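+// In scalar terms (a sketch of what the saturating vpaddusb below
+// computes), every byte of the row is added independently:
+//   for (int i = 0; i < width * 4; ++i) {
+//     int sum = src_argb0[i] + src_argb1[i];
+//     dst_argb[i] = (uint8)(sum > 255 ? 255 : sum);
+//   }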
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
+ MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELXROW_SSE2
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
+ MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
+ MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
+ );
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6," MEMACCESS(2) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value.
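+// A scalar sketch of the recurrence implemented below (four int32
+// channels per ARGB pixel; the running sum of this row is added to the
+// cumulative sum of the row above):
+//   int32 sum[4] = {0, 0, 0, 0};
+//   for (int x = 0; x < width; ++x) {
+//     for (int c = 0; c < 4; ++c) {
+//       sum[c] += row[x * 4 + c];
+//       cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
+//     }
+//   }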
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ asm volatile (
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop.
+ LABELALIGN
+ "10: \n"
+ "movd " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst,
+ int count) {
+ asm volatile (
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
+
+ // 4 pixel small loop.
+ LABELALIGN
+ "4: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
+ MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
+ MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
+ "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
+ MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
+ MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,1) ",%1 \n"
\n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" + + // 4 pixel loop \n" + LABELALIGN + "40: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 + MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 + MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 + "lea " MEMLEA(0x40,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" + "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" + "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 + MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 + MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 + "lea " MEMLEA(0x40,1) ",%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop \n" + LABELALIGN + "10: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + "lea " MEMLEA(0x10,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + "lea " MEMLEA(0x10,1) ",%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. 
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* src_dudv, int width) {
+ intptr_t src_argb_stride_temp = src_argb_stride;
+ intptr_t temp = 0;
+ asm volatile (
+ "movq " MEMACCESS(3) ",%%xmm2 \n"
+ "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1," MEMACCESS(2) " \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
+
+ // 1 pixel loop.
+ LABELALIGN
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x04,2) ",%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_dudv), // %3
+ "+rm"(width), // %4
+ "+r"(temp) // %5
+ :
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2)
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ LABELALIGN + "25: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) + "pavgb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm5" + ); +} +#endif // HAS_INTERPOLATEROW_SSSE3 + +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 32x2 -> 32x1 +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "sub %1,%0 \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpermd %%ymm5,%%ymm0,%%ymm5 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" + MEMOPREG(vmovdqu,0x00,1,4,1,ymm2) + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. + LABELALIGN + "25: \n" + "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" + MEMOPREG(vmovdqu,0x00,1,4,1,ymm1) + "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" + VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0 + MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "vmovdqu " MEMACCESS(1) ",%%ymm1 \n" + MEMOPREG(vmovdqu,0x00,1,4,1,ymm0) + "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" + MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ LABELALIGN + "100: \n" + "rep movsb " MEMMOVESTRING(1,0) " \n" + "jmp 999f \n" + + "99: \n" + "vzeroupper \n" + "999: \n" + : "+D"(dst_ptr), // %0 + "+S"(src_ptr), // %1 + "+c"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm5" + ); +} +#endif // HAS_INTERPOLATEROW_AVX2 + +#ifdef HAS_INTERPOLATEROW_SSE2 +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + asm volatile ( + "sub %1,%0 \n" + "shr %3 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "psubw %%xmm0,%%xmm2 \n" + "psubw %%xmm1,%%xmm3 \n" + "paddw %%xmm2,%%xmm2 \n" + "paddw %%xmm3,%%xmm3 \n" + "pmulhw %%xmm5,%%xmm2 \n" + "pmulhw %%xmm5,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 25 / 75. + LABELALIGN + "25: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 + "pavgb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + LABELALIGN + "75: \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_INTERPOLATEROW_SSE2 + +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
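+// A scalar sketch of the shuffle these rows perform: shuffler is a
+// 16-byte pshufb mask whose first four entries select, for each output
+// byte of a pixel, the source byte to copy:
+//   for (int x = 0; x < pix; ++x) {
+//     for (int c = 0; c < 4; ++c) {
+//       dst_argb[x * 4 + c] = src_argb[x * 4 + shuffler[c]];
+//     }
+//   }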
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "movdqu " MEMACCESS(3) ",%%xmm5 \n" + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" + , "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSSE3 + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" + , "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + uintptr_t pixel_temp = 0u; + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" + "mov " MEMACCESS(4) ",%k2 \n" + "cmp $0x3000102,%k2 \n" + "je 3012f \n" + "cmp $0x10203,%k2 \n" + "je 123f \n" + "cmp $0x30201,%k2 \n" + "je 321f \n" + "cmp $0x2010003,%k2 \n" + "je 2103f \n" + + LABELALIGN + "1: \n" + "movzb " MEMACCESS(4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS(1) " \n" + "movzb " MEMACCESS2(0x1,4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x1,1) " \n" + "movzb " MEMACCESS2(0x2,4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x2,1) " \n" + "movzb " MEMACCESS2(0x3,4) ",%2 \n" + MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x3,1) " \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "sub $0x1,%3 \n" + "jg 1b \n" + "jmp 99f \n" + + LABELALIGN + "123: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm1,%%xmm1 \n" + "pshuflw $0x1b,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" + "jg 123b \n" + "jmp 99f \n" + + LABELALIGN + "321: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x39,%%xmm0,%%xmm0 \n" + "pshuflw $0x39,%%xmm0,%%xmm0 \n" + "pshufhw $0x39,%%xmm1,%%xmm1 \n" + "pshuflw $0x39,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" + 
"jg 321b \n" + "jmp 99f \n" + + LABELALIGN + "2103: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x93,%%xmm0,%%xmm0 \n" + "pshuflw $0x93,%%xmm0,%%xmm0 \n" + "pshufhw $0x93,%%xmm1,%%xmm1 \n" + "pshuflw $0x93,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" + "jg 2103b \n" + "jmp 99f \n" + + LABELALIGN + "3012: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0xc6,%%xmm0,%%xmm0 \n" + "pshuflw $0xc6,%%xmm0,%%xmm0 \n" + "pshufhw $0xc6,%%xmm1,%%xmm1 \n" + "pshuflw $0xc6,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" + "jg 3012b \n" + + "99: \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+d"(pixel_temp), // %2 + "+r"(pix) // %3 + : "r"(shuffler) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm5" + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSE2 + +#ifdef HAS_I422TOYUY2ROW_SSE2 +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(1) ",%%xmm2 \n" + MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 + "lea " MEMLEA(0x8,1) ",%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(3) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n" + "lea " MEMLEA(0x20,3) ",%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" + ); +} +#endif // HAS_I422TOYUY2ROW_SSE2 + +#ifdef HAS_I422TOUYVYROW_SSE2 +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + asm volatile ( + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "movq " MEMACCESS(1) ",%%xmm2 \n" + MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 + "lea " MEMLEA(0x8,1) ",%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1," MEMACCESS(3) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n" + "lea " MEMLEA(0x20,3) ",%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" + ); +} +#endif // HAS_I422TOUYVYROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + asm volatile ( + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. 
+ LABELALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
+ "addps " MEMACCESS(3) ",%%xmm0 \n"
+ "addps " MEMACCESS(3) ",%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ asm volatile (
+ "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
+ "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
+ "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
+ "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
+ "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
+ "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
+ "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
+ "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
+ "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
+ "vcvttps2dq %%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
+ "vmovq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ uintptr_t pixel_temp = 0u;
+ asm volatile (
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
+ MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
+ MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
+ MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x1,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ uintptr_t pixel_temp = 0u;
+ asm volatile (
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
+ MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
+ MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff) {
+ uintptr_t pixel_temp = 0u;
+ uintptr_t table_temp = 0u;
+ asm volatile (
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN + "1: \n" + "movdqu " MEMACCESS(2) ",%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS(2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS(3) " \n" + "movzb " MEMACCESS2(0x1,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x1,3) " \n" + "movzb " MEMACCESS2(0x2,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x2,3) " \n" + "movzb " MEMACCESS2(0x3,2) ",%0 \n" + "mov %b0," MEMACCESS2(0x3,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS2(0x4,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x4,3) " \n" + "movzb " MEMACCESS2(0x5,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x5,3) " \n" + "movzb " MEMACCESS2(0x6,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x6,3) " \n" + "movzb " MEMACCESS2(0x7,2) ",%0 \n" + "mov %b0," MEMACCESS2(0x7,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS2(0x8,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x8,3) " \n" + "movzb " MEMACCESS2(0x9,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x9,3) " \n" + "movzb " MEMACCESS2(0xa,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xa,3) " \n" + "movzb " MEMACCESS2(0xb,2) ",%0 \n" + "mov %b0," MEMACCESS2(0xb,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb " MEMACCESS2(0xc,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xc,3) " \n" + "movzb " MEMACCESS2(0xd,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xd,3) " \n" + "movzb " MEMACCESS2(0xe,2) ",%0 \n" + MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xe,3) " \n" + "movzb " MEMACCESS2(0xf,2) ",%0 \n" + "mov %b0," MEMACCESS2(0xf,3) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "lea " MEMLEA(0x10,3) ",%3 \n" + "sub $0x4,%4 \n" + "jg 1b \n" + : "+d"(pixel_temp), // %0 + "+a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/row_mips.cc b/libs/libaom/src/third_party/libyuv/source/row_mips.cc new file mode 100644 index 000000000..cfc9ffe03 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/row_mips.cc @@ -0,0 +1,911 @@ +/* + * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+#ifdef HAS_COPYROW_MIPS
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+ __asm__ __volatile__ (
+ ".set noreorder \n"
+ ".set noat \n"
+ "slti $at, %[count], 8 \n"
+ "bne $at, $zero, $last8 \n"
+ "xor $t8, %[src], %[dst] \n"
+ "andi $t8, $t8, 0x3 \n"
+
+ "bne $t8, $zero, unaligned \n"
+ "negu $a3, %[dst] \n"
+ // make dst/src aligned
+ "andi $a3, $a3, 0x3 \n"
+ "beq $a3, $zero, $chk16w \n"
+ // word-aligned; now count is the remaining byte count
+ "subu %[count], %[count], $a3 \n"
+
+ "lwr $t8, 0(%[src]) \n"
+ "addu %[src], %[src], $a3 \n"
+ "swr $t8, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+
+ // Now the dst/src are mutually word-aligned with word-aligned addresses
+ "$chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, chk8w \n"
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the remainder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n"
+ // t0 is the "past the end" address
+
+ // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
+ // the "t0-32" address
+ // This means: for x=128 the last "safe" a1 address is "t0-160"
+ // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
+ // we will use "pref 30,128(a1)", so "t0-160" is the limit
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line of src
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $loop16w \n"
+ "nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lw $t0, 0(%[src]) \n"
+ "bgtz $v1, $skip_pref30_96 \n" // skip
+ "lw $t1, 4(%[src]) \n"
+ "pref 30, 96(%[dst]) \n" // continue
+ "$skip_pref30_96: \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lw $t0, 32(%[src]) \n"
+ "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
+ "lw $t1, 36(%[src]) \n"
+ "pref 30, 128(%[dst]) \n" // set dest, addr 128
+ "$skip_pref30_128: \n"
+ "lw $t2, 40(%[src]) \n"
+ "lw $t3, 44(%[src]) \n"
+ "lw $t4, 48(%[src]) \n"
+ "lw $t5, 52(%[src]) \n"
+ "lw $t6, 56(%[src]) \n"
+ "lw $t7, 60(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
+ "sgtu $v1, %[dst], $t9 \n"
+ "bne %[dst], $a3, $loop16w \n"
+ " addiu %[src], %[src], 64 \n" // adding 64 to src + "move %[count], $t8 \n" + + // Here we have src and dest word-aligned but less than 64-bytes to go + + "chk8w: \n" + "pref 0, 0x0(%[src]) \n" + "andi $t8, %[count], 0x1f \n" // 32-byte chunk? + // the t8 is the reminder count past 32-bytes + "beq %[count], $t8, chk1w \n" + // count=t8,no 32-byte chunk + " nop \n" + + "lw $t0, 0(%[src]) \n" + "lw $t1, 4(%[src]) \n" + "lw $t2, 8(%[src]) \n" + "lw $t3, 12(%[src]) \n" + "lw $t4, 16(%[src]) \n" + "lw $t5, 20(%[src]) \n" + "lw $t6, 24(%[src]) \n" + "lw $t7, 28(%[src]) \n" + "addiu %[src], %[src], 32 \n" + + "sw $t0, 0(%[dst]) \n" + "sw $t1, 4(%[dst]) \n" + "sw $t2, 8(%[dst]) \n" + "sw $t3, 12(%[dst]) \n" + "sw $t4, 16(%[dst]) \n" + "sw $t5, 20(%[dst]) \n" + "sw $t6, 24(%[dst]) \n" + "sw $t7, 28(%[dst]) \n" + "addiu %[dst], %[dst], 32 \n" + + "chk1w: \n" + "andi %[count], $t8, 0x3 \n" + // now count is the reminder past 1w chunks + "beq %[count], $t8, $last8 \n" + " subu $a3, $t8, %[count] \n" + // a3 is count of bytes in 1w chunks + "addu $a3, %[dst], $a3 \n" + // now a3 is the dst address past the 1w chunks + // copying in words (4-byte chunks) + "$wordCopy_loop: \n" + "lw $t3, 0(%[src]) \n" + // the first t3 may be equal t0 ... optimize? + "addiu %[src], %[src],4 \n" + "addiu %[dst], %[dst],4 \n" + "bne %[dst], $a3,$wordCopy_loop \n" + " sw $t3, -4(%[dst]) \n" + + // For the last (<8) bytes + "$last8: \n" + "blez %[count], leave \n" + " addu $a3, %[dst], %[count] \n" // a3 -last dst address + "$last8loop: \n" + "lb $v1, 0(%[src]) \n" + "addiu %[src], %[src], 1 \n" + "addiu %[dst], %[dst], 1 \n" + "bne %[dst], $a3, $last8loop \n" + " sb $v1, -1(%[dst]) \n" + + "leave: \n" + " j $ra \n" + " nop \n" + + // + // UNALIGNED case + // + + "unaligned: \n" + // got here with a3="negu a1" + "andi $a3, $a3, 0x3 \n" // a1 is word aligned? + "beqz $a3, $ua_chk16w \n" + " subu %[count], %[count], $a3 \n" + // bytes left after initial a3 bytes + "lwr $v1, 0(%[src]) \n" + "lwl $v1, 3(%[src]) \n" + "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3 + "swr $v1, 0(%[dst]) \n" + "addu %[dst], %[dst], $a3 \n" + // below the dst will be word aligned (NOTE1) + "$ua_chk16w: \n" + "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? 
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, ua_chk8w \n"
+ // if a2==t8, no 64-byte chunks
+ // There will be at most 1 32-byte chunk after it
+ "subu $a3, %[count], $t8 \n" // the remainder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n" // t0 "past the end"
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line addr 32
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // safe, as we have at least 64 bytes ahead
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $ua_loop16w \n"
+ // skip "pref 30,64(a1)" for too short arrays
+ " nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$ua_loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "bgtz $v1, $ua_skip_pref30_96 \n"
+ " lwl $t1, 7(%[src]) \n"
+ "pref 30, 96(%[dst]) \n"
+ // continue setting up the dest, addr 96
+ "$ua_skip_pref30_96: \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lwr $t0, 32(%[src]) \n"
+ "lwl $t0, 35(%[src]) \n"
+ "lwr $t1, 36(%[src]) \n"
+ "bgtz $v1, ua_skip_pref30_128 \n"
+ " lwl $t1, 39(%[src]) \n"
+ "pref 30, 128(%[dst]) \n"
+ // continue setting up the dest, addr 128
+ "ua_skip_pref30_128: \n"
+
+ "lwr $t2, 40(%[src]) \n"
+ "lwl $t2, 43(%[src]) \n"
+ "lwr $t3, 44(%[src]) \n"
+ "lwl $t3, 47(%[src]) \n"
+ "lwr $t4, 48(%[src]) \n"
+ "lwl $t4, 51(%[src]) \n"
+ "lwr $t5, 52(%[src]) \n"
+ "lwl $t5, 55(%[src]) \n"
+ "lwr $t6, 56(%[src]) \n"
+ "lwl $t6, 59(%[src]) \n"
+ "lwr $t7, 60(%[src]) \n"
+ "lwl $t7, 63(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst],%[dst],64 \n" // adding 64 to dest
+ "sgtu $v1,%[dst],$t9 \n"
+ "bne %[dst],$a3,$ua_loop16w \n"
+ " addiu %[src],%[src],64 \n" // adding 64 to src
+ "move %[count],$t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "ua_chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+ // t8 is the remainder count
+ "beq %[count], $t8, $ua_chk1w \n"
+ // when count==t8, no 32-byte chunk
+
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "lwl $t1, 7(%[src]) \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "$ua_chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+ // now count is the remainder past 1w chunks
+ "beq %[count], $t8, ua_smallCopy \n"
+ "subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+
+ // copying in words (4-byte chunks)
+ "$ua_wordCopy_loop: \n"
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addiu %[src], %[src], 4 \n"
+ "addiu %[dst], %[dst], 4 \n"
+ // note: dst=a1 is word aligned here, see NOTE1
+ "bne %[dst], $a3, $ua_wordCopy_loop \n"
+ " sw $v1,-4(%[dst]) \n"
+
+ // Now less than 4 bytes (value in count) left to copy
+ "ua_smallCopy: \n"
+ "beqz %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 = last dst address
+ "$ua_smallCopy_loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst],$a3,$ua_smallCopy_loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ "j $ra \n"
+ " nop \n"
+ ".set at \n"
+ ".set reorder \n"
+ : [dst] "+r" (dst), [src] "+r" (src)
+ : [count] "r" (count)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+ "t8", "t9", "a3", "v1", "at"
+ );
+}
+#endif // HAS_COPYROW_MIPS
+
+// MIPS DSPR2 functions
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
+
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "srl $t4, %[width], 4 \n" // multiples of 16
+ "blez $t4, 2f \n"
+ " andi %[width], %[width], 0xf \n" // residual
+
+ ".p2align 2 \n"
+ "1: \n"
+ "addiu $t4, $t4, -1 \n"
+ "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
+ "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
+ "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
+ "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
+ "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
+ "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
+ "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
+ "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
+ "addiu %[src_uv], %[src_uv], 32 \n"
+ "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
+ "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
+ "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
+ "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
+ "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
+ "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
+ "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
+ "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
+ "sw $t9, 0(%[dst_v]) \n"
+ "sw $t0, 0(%[dst_u]) \n"
+ "sw $t1, 4(%[dst_v]) \n"
+ "sw $t2, 4(%[dst_u]) \n"
+ "sw $t3, 8(%[dst_v]) \n"
+ "sw $t5, 8(%[dst_u]) \n"
+ "sw
$t6, 12(%[dst_v]) \n" + "sw $t7, 12(%[dst_u]) \n" + "addiu %[dst_v], %[dst_v], 16 \n" + "bgtz $t4, 1b \n" + " addiu %[dst_u], %[dst_u], 16 \n" + + "beqz %[width], 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, 0(%[src_uv]) \n" + "lbu $t1, 1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], 2 \n" + "addiu %[width], %[width], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[width], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r" (src_uv), + [width] "+r" (width), + [dst_u] "+r" (dst_u), + [dst_v] "+r" (dst_v) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6", "t7", "t8", "t9" + ); +} + +void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t4, %[width], 4 \n" // multiplies of 16 + "andi $t5, %[width], 0xf \n" + "blez $t4, 2f \n" + " addu %[src], %[src], %[width] \n" // src += width + + ".p2align 2 \n" + "1: \n" + "lw $t0, -16(%[src]) \n" // |3|2|1|0| + "lw $t1, -12(%[src]) \n" // |7|6|5|4| + "lw $t2, -8(%[src]) \n" // |11|10|9|8| + "lw $t3, -4(%[src]) \n" // |15|14|13|12| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t1, $t1 \n" // |6|7|4|5| + "wsbh $t2, $t2 \n" // |10|11|8|9| + "wsbh $t3, $t3 \n" // |14|15|12|13| + "rotr $t0, $t0, 16 \n" // |0|1|2|3| + "rotr $t1, $t1, 16 \n" // |4|5|6|7| + "rotr $t2, $t2, 16 \n" // |8|9|10|11| + "rotr $t3, $t3, 16 \n" // |12|13|14|15| + "addiu %[src], %[src], -16 \n" + "addiu $t4, $t4, -1 \n" + "sw $t3, 0(%[dst]) \n" // |15|14|13|12| + "sw $t2, 4(%[dst]) \n" // |11|10|9|8| + "sw $t1, 8(%[dst]) \n" // |7|6|5|4| + "sw $t0, 12(%[dst]) \n" // |3|2|1|0| + "bgtz $t4, 1b \n" + " addiu %[dst], %[dst], 16 \n" + "beqz $t5, 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, -1(%[src]) \n" + "addiu $t5, $t5, -1 \n" + "addiu %[src], %[src], -1 \n" + "sb $t0, 0(%[dst]) \n" + "bgez $t5, 2b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src] "+r" (src), [dst] "+r" (dst) + : [width] "r" (width) + : "t0", "t1", "t2", "t3", "t4", "t5" + ); +} + +void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + int x = 0; + int y = 0; + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "addu $t4, %[width], %[width] \n" + "srl %[x], %[width], 4 \n" + "andi %[y], %[width], 0xf \n" + "blez %[x], 2f \n" + " addu %[src_uv], %[src_uv], $t4 \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| + "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| + "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| + "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12| + "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16| + "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20| + "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24| + "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28| + + "rotr $t0, $t0, 16 \n" // |1|0|3|2| + "rotr $t1, $t1, 16 \n" // |5|4|7|6| + "rotr $t2, $t2, 16 \n" // |9|8|11|10| + "rotr $t3, $t3, 16 \n" // |13|12|15|14| + "rotr $t4, $t4, 16 \n" // |17|16|19|18| + "rotr $t6, $t6, 16 \n" // |21|20|23|22| + "rotr $t7, $t7, 16 \n" // |25|24|27|26| + "rotr $t8, $t8, 16 \n" // |29|28|31|30| + "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6| + "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7| + "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14| + "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15| + "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22| + "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23| + "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30| + "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31| + "addiu 
%[src_uv], %[src_uv], -32 \n" + "addiu %[x], %[x], -1 \n" + "swr $t4, 0(%[dst_u]) \n" + "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24| + "swr $t6, 0(%[dst_v]) \n" + "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25| + "swr $t2, 4(%[dst_u]) \n" + "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16| + "swr $t3, 4(%[dst_v]) \n" + "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17| + "swr $t0, 8(%[dst_u]) \n" + "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8| + "swr $t1, 8(%[dst_v]) \n" + "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9| + "swr $t9, 12(%[dst_u]) \n" + "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0| + "swr $t5, 12(%[dst_v]) \n" + "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1| + "addiu %[dst_v], %[dst_v], 16 \n" + "bgtz %[x], 1b \n" + " addiu %[dst_u], %[dst_u], 16 \n" + "beqz %[y], 3f \n" + " nop \n" + "b 2f \n" + " nop \n" + + "2: \n" + "lbu $t0, -2(%[src_uv]) \n" + "lbu $t1, -1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], -2 \n" + "addiu %[y], %[y], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[y], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r" (src_uv), + [dst_u] "+r" (dst_u), + [dst_v] "+r" (dst_v), + [x] "=&r" (x), + [y] "+r" (y) + : [width] "r" (width) + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8", "t9" + ); +} + +// Convert (4 Y, 2 U and 2 V) I422 and arrange RGB values into +// t5 = | 0 | B0 | 0 | b0 | +// t4 = | 0 | B1 | 0 | b1 | +// t9 = | 0 | G0 | 0 | g0 | +// t8 = | 0 | G1 | 0 | g1 | +// t2 = | 0 | R0 | 0 | r0 | +// t1 = | 0 | R1 | 0 | r1 | +#define I422ToTransientMipsRGB \ + "lw $t0, 0(%[y_buf]) \n" \ + "lhu $t1, 0(%[u_buf]) \n" \ + "lhu $t2, 0(%[v_buf]) \n" \ + "preceu.ph.qbr $t1, $t1 \n" \ + "preceu.ph.qbr $t2, $t2 \n" \ + "preceu.ph.qbra $t3, $t0 \n" \ + "preceu.ph.qbla $t0, $t0 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t3, $t3, $s4 \n" \ + "subu.ph $t0, $t0, $s4 \n" \ + "mul.ph $t3, $t3, $s0 \n" \ + "mul.ph $t0, $t0, $s0 \n" \ + "shll.ph $t4, $t1, 0x7 \n" \ + "subu.ph $t4, $t4, $t1 \n" \ + "mul.ph $t6, $t1, $s1 \n" \ + "mul.ph $t1, $t2, $s2 \n" \ + "addq_s.ph $t5, $t4, $t3 \n" \ + "addq_s.ph $t4, $t4, $t0 \n" \ + "shra.ph $t5, $t5, 6 \n" \ + "shra.ph $t4, $t4, 6 \n" \ + "addiu %[u_buf], 2 \n" \ + "addiu %[v_buf], 2 \n" \ + "addu.ph $t6, $t6, $t1 \n" \ + "mul.ph $t1, $t2, $s3 \n" \ + "addu.ph $t9, $t6, $t3 \n" \ + "addu.ph $t8, $t6, $t0 \n" \ + "shra.ph $t9, $t9, 6 \n" \ + "shra.ph $t8, $t8, 6 \n" \ + "addu.ph $t2, $t1, $t3 \n" \ + "addu.ph $t1, $t1, $t0 \n" \ + "shra.ph $t2, $t2, 6 \n" \ + "shra.ph $t1, $t1, 6 \n" \ + "subu.ph $t5, $t5, $s5 \n" \ + "subu.ph $t4, $t4, $s5 \n" \ + "subu.ph $t9, $t9, $s5 \n" \ + "subu.ph $t8, $t8, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "shll_s.ph $t5, $t5, 8 \n" \ + "shll_s.ph $t4, $t4, 8 \n" \ + "shll_s.ph $t9, $t9, 8 \n" \ + "shll_s.ph $t8, $t8, 8 \n" \ + "shll_s.ph $t2, $t2, 8 \n" \ + "shll_s.ph $t1, $t1, 8 \n" \ + "shra.ph $t5, $t5, 8 \n" \ + "shra.ph $t4, $t4, 8 \n" \ + "shra.ph $t9, $t9, 8 \n" \ + "shra.ph $t8, $t8, 8 \n" \ + "shra.ph $t2, $t2, 8 \n" \ + "shra.ph $t1, $t1, 8 \n" \ + "addu.ph $t5, $t5, $s5 \n" \ + "addu.ph $t4, $t4, $s5 \n" \ + "addu.ph $t9, $t9, $s5 \n" \ + "addu.ph $t8, $t8, $s5 \n" \ + "addu.ph $t2, $t2, $s5 \n" \ + "addu.ph $t1, $t1, $s5 \n" + +void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // 
|YG|YG| = |74|74| + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| // clipping + "lui $s6, 0xff00 \n" + "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| + + ".p2align 2 \n" + "1: \n" + I422ToTransientMipsRGB +// Arranging into argb format + "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| + "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0| + "addiu %[width], -4 \n" + "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0| + "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0| + "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| + + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| + "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0| + "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0| + "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1| + "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1| + "sll $t9, $t9, 16 \n" + "sll $t8, $t8, 16 \n" + "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0| + "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0| +// Store results. + "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| + "lui $s6, 0xff00 \n" + "ori $s6, 0xff00 \n" // |ff|00|ff|00| + + ".p2align 2 \n" + "1: \n" + I422ToTransientMipsRGB +// Arranging into abgr format + "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1| + "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0| + "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0| + "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0| + + "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0| + "addiu %[width], -4 \n" + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0| + "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0| + "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0| + "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1| + "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1| + "sll $t9, $t9, 16 \n" + "sll $t8, $t8, 16 \n" + "packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0| + "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0| +// Store results. 
+ "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 | + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| + "lui $s6, 0xff \n" + "ori $s6, 0xff \n" // |00|ff|00|ff| + + ".p2align 2 \n" + "1: \n" + I422ToTransientMipsRGB + // Arranging into bgra format + "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1| + "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0| + "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0| + "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0| + + "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| + "addiu %[width], -4 \n" + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| + "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 | + "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 | + "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff| + "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff| + "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff| + "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff| + "sll $t1, $t1, 16 \n" + "sll $t2, $t2, 16 \n" + "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff| + "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff| +// Store results. 
+ "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +// Bilinear filter 8x2 -> 8x1 +void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + int y0_fraction = 256 - source_y_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "replv.ph $t0, %[y0_fraction] \n" + "replv.ph $t1, %[source_y_fraction] \n" + + ".p2align 2 \n" + "1: \n" + "lw $t2, 0(%[src_ptr]) \n" + "lw $t3, 0(%[src_ptr1]) \n" + "lw $t4, 4(%[src_ptr]) \n" + "lw $t5, 4(%[src_ptr1]) \n" + "muleu_s.ph.qbl $t6, $t2, $t0 \n" + "muleu_s.ph.qbr $t7, $t2, $t0 \n" + "muleu_s.ph.qbl $t8, $t3, $t1 \n" + "muleu_s.ph.qbr $t9, $t3, $t1 \n" + "muleu_s.ph.qbl $t2, $t4, $t0 \n" + "muleu_s.ph.qbr $t3, $t4, $t0 \n" + "muleu_s.ph.qbl $t4, $t5, $t1 \n" + "muleu_s.ph.qbr $t5, $t5, $t1 \n" + "addq.ph $t6, $t6, $t8 \n" + "addq.ph $t7, $t7, $t9 \n" + "addq.ph $t2, $t2, $t4 \n" + "addq.ph $t3, $t3, $t5 \n" + "shra.ph $t6, $t6, 8 \n" + "shra.ph $t7, $t7, 8 \n" + "shra.ph $t2, $t2, 8 \n" + "shra.ph $t3, $t3, 8 \n" + "precr.qb.ph $t6, $t6, $t7 \n" + "precr.qb.ph $t2, $t2, $t3 \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[src_ptr1], %[src_ptr1], 8 \n" + "addiu %[dst_width], %[dst_width], -8 \n" + "sw $t6, 0(%[dst_ptr]) \n" + "sw $t2, 4(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[dst_ptr], %[dst_ptr], 8 \n" + + ".set pop \n" + : [dst_ptr] "+r" (dst_ptr), + [src_ptr1] "+r" (src_ptr1), + [src_ptr] "+r" (src_ptr), + [dst_width] "+r" (dst_width) + : [source_y_fraction] "r" (source_y_fraction), + [y0_fraction] "r" (y0_fraction), + [src_stride] "r" (src_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} +#endif // __mips_dsp_rev >= 2 + +#endif // defined(__mips__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/row_neon.cc b/libs/libaom/src/third_party/libyuv/source/row_neon.cc new file mode 100644 index 000000000..1a72eb903 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/row_neon.cc @@ -0,0 +1,3084 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.32 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.32 {d2[1]}, [%2]! \n" + +// Read 8 Y, 2 U and 2 V from 422 +#define READYUV411 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.16 {d2[0]}, [%1]! 
\n" \ + MEMACCESS(2) \ + "vld1.16 {d2[1]}, [%2]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d2, d3 \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + MEMACCESS(2) \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + MEMACCESS(0) \ + "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 YUY2 +#define READYUY2 \ + MEMACCESS(0) \ + "vld2.8 {d0, d2}, [%0]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 UYVY +#define READUYVY \ + MEMACCESS(0) \ + "vld2.8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +#define YUV422TORGB_SETUP_REG \ + MEMACCESS([kUVToRB]) \ + "vld1.8 {d24}, [%[kUVToRB]] \n" \ + MEMACCESS([kUVToG]) \ + "vld1.8 {d25}, [%[kUVToG]] \n" \ + MEMACCESS([kUVBiasBGR]) \ + "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ + MEMACCESS([kUVBiasBGR]) \ + "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ + MEMACCESS([kUVBiasBGR]) \ + "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ + MEMACCESS([kYToRgb]) \ + "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" + +#define YUV422TORGB \ + "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\ + "vmull.u8 q9, d2, d25 \n" /* u/v G component */\ + "vmovl.u8 q0, d0 \n" /* Y */\ + "vmovl.s16 q10, d1 \n" \ + "vmovl.s16 q0, d0 \n" \ + "vmul.s32 q10, q10, q15 \n" \ + "vmul.s32 q0, q0, q15 \n" \ + "vqshrun.s32 d0, q0, #16 \n" \ + "vqshrun.s32 d1, q10, #16 \n" /* Y */\ + "vadd.s16 d18, d19 \n" \ + "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\ + "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\ + "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\ + "vaddw.u16 q1, q1, d16 \n" \ + "vaddw.u16 q10, q10, d17 \n" \ + "vaddw.u16 q3, q3, d18 \n" \ + "vqadd.s16 q8, q0, q13 \n" /* B */ \ + "vqadd.s16 q9, q0, q14 \n" /* R */ \ + "vqadd.s16 q0, q0, q4 \n" /* G */ \ + "vqadd.s16 q8, q8, q1 \n" /* B */ \ + "vqadd.s16 q9, q9, q10 \n" /* R */ \ + "vqsub.s16 q0, q0, q3 \n" /* G */ \ + "vqshrun.s16 d20, q8, #6 \n" /* B */ \ + "vqshrun.s16 d22, q9, #6 \n" /* R */ \ + "vqshrun.s16 d21, q0, #6 \n" /* G */ + +// YUV to RGB conversion constants. +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ + +// U and V contributions to R,G,B. +#define UB -128 /* -min(128, round(2.018 * 64)) */ +#define UG 25 /* -round(-0.391 * 64) */ +#define VG 52 /* -round(-0.813 * 64) */ +#define VR -102 /* -round(1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. 
+#define BB (UB * 128 - YGB) +#define BG (UG * 128 + VG * 128 - YGB) +#define BR (VR * 128 - YGB) + +static uvec8 kUVToRB = { 128, 128, 128, 128, 102, 102, 102, 102, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 }; +static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 }; + +#undef YG +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef BB +#undef BG +#undef BR + +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV444 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV411 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d19, #255 \n" + MEMACCESS(3) + "vst4.8 {d19, d20, d21, d22}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_bgra), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(3) + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_abgr), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" + MEMACCESS(3) + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + MEMACCESS(3) + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vswp.u8 d20, d22 \n" + MEMACCESS(3) + "vst3.8 {d20, d21, d22}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#define ARGBTORGB565 \ + "vshr.u8 d20, d20, #3 \n" /* B */ \ + "vshr.u8 d21, d21, #2 \n" /* G */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #11 \n" /* R */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q0, q0, q10 \n" /* BGR */ + +void I422ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + ARGBTORGB565 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#define ARGBTOARGB1555 \ + "vshr.u8 q10, q10, #3 \n" /* B */ \ + "vshr.u8 d22, d22, #3 \n" /* R */ \ + "vshr.u8 d23, d23, #7 \n" /* A */ \ + "vmovl.u8 q8, d20 \n" /* B */ \ + "vmovl.u8 q9, d21 \n" /* G */ \ + "vmovl.u8 q10, d22 \n" /* R */ \ + "vmovl.u8 q11, d23 \n" /* A */ \ + "vshl.u16 q9, q9, #5 \n" /* G */ \ + "vshl.u16 q10, q10, #10 \n" /* R */ \ + "vshl.u16 q11, q11, #15 \n" /* A */ \ + "vorr q0, q8, q9 \n" /* BG */ \ + "vorr q1, q10, q11 \n" /* RA */ \ + "vorr q0, q0, q1 \n" /* BGRA */ + +void I422ToARGB1555Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB1555 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ + +void I422ToARGB4444Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + READYUV422 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + ARGBTOARGB4444 + MEMACCESS(3) + "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : [kUVToRB]"r"(&kUVToRB), // %5 + [kUVToG]"r"(&kUVToG), // %6 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUV400 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&kUVToRB), // %3 + [kUVToG]"r"(&kUVToG), // %4 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void J400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + "vmov.u8 d23, #255 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23" + ); +} + +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(2) + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&kUVToRB), // %4 + [kUVToG]"r"(&kUVToG), // %5 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(2) + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&kUVToRB), // %4 + [kUVToG]"r"(&kUVToG), // %5 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&kUVToRB), // %4 + [kUVToG]"r"(&kUVToG), // %5 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB]"r"(&kUVToRB), // %4 + [kUVToG]"r"(&kUVToG), // %5 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READYUY2 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&kUVToRB), // %3 + [kUVToG]"r"(&kUVToG), // %4 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + ".p2align 2 \n" + "1: \n" + READUYVY + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + MEMACCESS(1) + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&kUVToRB), // %3 + [kUVToG]"r"(&kUVToG), // %4 + [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store U + MEMACCESS(2) + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load U + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(2) + "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : + "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + MEMACCESS(1) + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SetRow writes 'count' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8* dst, uint8 v8, int count) { + asm volatile ( + "vdup.8 q0, %2 \n" // duplicate 16 bytes + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v8) // %2 + : "cc", "memory", "q0" + ); +} + +// ARGBSetRow writes 'count' pixels using an 32 bit value repeated. +void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { + asm volatile ( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #4 \n" // 4 pixels per loop + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(count) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0" + ); +} + +void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #16 \n" // 16 pixels per loop. + "vrev64.8 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0" + ); +} + +void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // dst += 8 + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0" + ); +} + +void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #16 \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #4 \n" // 4 pixels per loop. + "vrev64.32 q0, q0 \n" + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0" + ); +} + +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d4, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ \ + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vst1.8 {d1}, [%1]! \n" // store 8 U. + MEMACCESS(2) + "vst1.8 {d3}, [%2]! \n" // store 8 V. 
+ "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 U. + MEMACCESS(2) + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // stride + src_yuy2 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 U. + MEMACCESS(3) + "vst1.8 {d3}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // stride + src_uyvy + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 U. + MEMACCESS(3) + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List + ); +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + MEMACCESS(1) + "vld1.8 {d1}, [%1]! \n" // load 8 Us + MEMACCESS(2) + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} + +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + MEMACCESS(1) + "vld1.8 {d0}, [%1]! \n" // load 8 Us + MEMACCESS(2) + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} + +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width) { + asm volatile ( + ".p2align 2 \n" + "vdup.32 d2, %2 \n" // dither4 + "1: \n" + MEMACCESS(1) + "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d20, d20, d2 \n" + "vqadd.u8 d21, d21, d2 \n" + "vqadd.u8 d22, d22, d2 \n" + ARGBTORGB565 + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, + int pix) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, + int pix) { + asm volatile ( + "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11" + ); +} + +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} + +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + ); +} + +// 8x1 pixels. +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" + ); +} + +// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + + "subs %3, %3, #16 \n" // 16 processed per loop. 
+ "vmul.s16 q8, q0, q10 \n" // B + "vmls.s16 q8, q1, q11 \n" // G + "vmls.s16 q8, q2, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + + "vmul.s16 q9, q2, q10 \n" // R + "vmls.s16 q9, q1, q14 \n" // G + "vmls.s16 q9, q0, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) + "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. + MEMACCESS(0) + "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. + "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. + + "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. + "vpadd.u16 d1, d8, d9 \n" // B + "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. + "vpadd.u16 d3, d10, d11 \n" // G + "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. + "vpadd.u16 d5, d12, d13 \n" // R + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %3, %3, #32 \n" // 32 processed per loop. + "vmul.s16 q8, q0, q10 \n" // B + "vmls.s16 q8, q1, q11 \n" // G + "vmls.s16 q8, q2, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q2, q10 \n" // R + "vmls.s16 q9, q1, q14 \n" // G + "vmls.s16 q9, q0, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) + "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
+#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): Subsample match C code. +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. 
+ + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_bgra + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. + "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. + "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q1, q1, #1 \n" // 2x average + "vrshr.u16 q2, q2, #1 \n" + "vrshr.u16 q3, q3, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q3, q2, q1) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_stride_bgra), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_abgr + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. 
+ + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + MEMACCESS(1) + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_stride_rgba), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + MEMACCESS(0) + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + MEMACCESS(1) + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. 
+ + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + MEMACCESS(0) + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + MEMACCESS(1) + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 
+ MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} + +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} + +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" + ); +} + +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { + asm volatile ( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + MEMACCESS(1) + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + ); +} + +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! 
\n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" + ); +} + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" + + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" + + // Blend 1 pixels. + "1: \n" + MEMACCESS(0) + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + MEMACCESS(1) + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" + ); +} + +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12" + ); +} + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + MEMACCESS(0) + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" + ); +} + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. + + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" + "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 + "vqrdmulh.s16 q11, q11, d0[1] \n" // g + "vqrdmulh.s16 q12, q12, d0[2] \n" // r + "vqrdmulh.s16 q13, q13, d0[3] \n" // a + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + MEMACCESS(1) + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "q0", "q10", "q11", "q12", "q13" + ); +} + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. 
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+    "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+    "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+    ".p2align 2 \n"
+  "1: \n"
+    MEMACCESS(0)
+    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+    "subs %2, %2, #8 \n" // 8 processed per loop.
+    "vmull.u8 q2, d0, d24 \n" // B
+    "vmlal.u8 q2, d1, d25 \n" // G
+    "vmlal.u8 q2, d2, d26 \n" // R
+    "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
+    "vmov d1, d0 \n" // G
+    "vmov d2, d0 \n" // R
+    MEMACCESS(1)
+    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+    "bgt 1b \n"
+  : "+r"(src_argb), // %0
+    "+r"(dst_argb), // %1
+    "+r"(width) // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8 d20, #17 \n" // BB coefficient
+    "vmov.u8 d21, #68 \n" // BG coefficient
+    "vmov.u8 d22, #35 \n" // BR coefficient
+    "vmov.u8 d24, #22 \n" // GB coefficient
+    "vmov.u8 d25, #88 \n" // GG coefficient
+    "vmov.u8 d26, #45 \n" // GR coefficient
+    "vmov.u8 d28, #24 \n" // RB coefficient
+    "vmov.u8 d29, #98 \n" // RG coefficient
+    "vmov.u8 d30, #50 \n" // RR coefficient
+    ".p2align 2 \n"
+  "1: \n"
+    MEMACCESS(0)
+    "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+    "subs %1, %1, #8 \n" // 8 processed per loop.
+    "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+    "vmlal.u8 q2, d1, d21 \n" // G
+    "vmlal.u8 q2, d2, d22 \n" // R
+    "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+    "vmlal.u8 q3, d1, d25 \n" // G
+    "vmlal.u8 q3, d2, d26 \n" // R
+    "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+    "vmlal.u8 q8, d1, d29 \n" // G
+    "vmlal.u8 q8, d2, d30 \n" // R
+    "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+    "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+    "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+    MEMACCESS(0)
+    "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+    "bgt 1b \n"
+  : "+r"(dst_argb), // %0
+    "+r"(width) // %1
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "vld1.8 {q2}, [%3] \n" // load color matrix (16 x s8).
+    "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+    "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+    ".p2align 2 \n"
+  "1: \n"
+    MEMACCESS(0)
+    "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+    "subs %2, %2, #8 \n" // 8 processed per loop.
+    "vmovl.u8 q8, d16 \n" // b (0 ..
255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q11, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. +#ifdef HAS_ARGBMULTIPLYROW_NEON +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} +#endif // HAS_ARGBMULTIPLYROW_NEON + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! 
\n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3" + ); +} + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + // 16 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. 
+ "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0],%5 \n" // top + MEMACCESS(0) + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(1) + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + MEMACCESS(1) + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + MEMACCESS(2) + "vld1.8 {d2}, [%2],%5 \n" // bottom + MEMACCESS(2) + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + MEMACCESS(3) + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0],%4 \n" // left + MEMACCESS(1) + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + MEMACCESS(1) + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0],%5 \n" // right + MEMACCESS(1) + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + MEMACCESS(2) + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/row_neon64.cc b/libs/libaom/src/third_party/libyuv/source/row_neon64.cc new file mode 100644 index 000000000..5d015454b --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/row_neon64.cc @@ -0,0 +1,3087 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. 
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 8 Y, 4 U and 4 V from 422 +#define READYUV422 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + MEMACCESS(1) \ + "ld1 {v1.s}[0], [%1], #4 \n" \ + MEMACCESS(2) \ + "ld1 {v1.s}[1], [%2], #4 \n" + +// Read 8 Y, 2 U and 2 V from 422 +#define READYUV411 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + MEMACCESS(1) \ + "ld1 {v2.h}[0], [%1], #2 \n" \ + MEMACCESS(2) \ + "ld1 {v2.h}[1], [%2], #2 \n" \ + "zip1 v1.8b, v2.8b, v2.8b \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + MEMACCESS(1) \ + "ld1 {v1.d}[0], [%1], #8 \n" \ + MEMACCESS(2) \ + "ld1 {v1.d}[1], [%2], #8 \n" \ + "uaddlp v1.8h, v1.16b \n" \ + "rshrn v1.8b, v1.8h, #1 \n" + +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "movi v1.8b , #128 \n" + +// Read 8 Y and 4 UV from NV12 +#define READNV12 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + MEMACCESS(1) \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +// Read 8 Y and 4 VU from NV21 +#define READNV21 \ + MEMACCESS(0) \ + "ld1 {v0.8b}, [%0], #8 \n" \ + MEMACCESS(1) \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v3.8b, v2.8b, v2.8b \n" \ + "uzp2 v1.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +// Read 8 YUY2 +#define READYUY2 \ + MEMACCESS(0) \ + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ + "uzp2 v3.8b, v1.8b, v1.8b \n" \ + "uzp1 v1.8b, v1.8b, v1.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +// Read 8 UYVY +#define READUYVY \ + MEMACCESS(0) \ + "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ + "orr v0.8b, v3.8b, v3.8b \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +#define YUV422TORGB_SETUP_REG \ + "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ + "ld1r {v31.4s}, [%[kYToRgb]] \n" \ + "movi v27.8h, #128 \n" \ + "movi v28.8h, #102 \n" \ + "movi v29.8h, #25 \n" \ + "movi v30.8h, #52 \n" + +#define YUV422TORGB(vR, vG, vB) \ + "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ + "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ + "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ + "ushll v0.4s, v0.4h, #0 \n" \ + "mul v3.4s, v3.4s, v31.4s \n" \ + "mul v0.4s, v0.4s, v31.4s \n" \ + "sqshrun v0.4h, v0.4s, #16 \n" \ + "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ + "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ + "uxtl v2.8h, v2.8b \n" \ + "uxtl v1.8h, v1.8b \n" /* Extract U */ \ + "mul v3.8h, v1.8h, v27.8h \n" \ + "mul v5.8h, v1.8h, v29.8h \n" \ + "mul v6.8h, v2.8h, v30.8h \n" \ + "mul v7.8h, v2.8h, v28.8h \n" \ + "sqadd v6.8h, v6.8h, v5.8h \n" \ + "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ + "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ + "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ + "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ + "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ + "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ + "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ + "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ + "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ + +// YUV to RGB conversion constants. +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ + +// U and V contributions to R,G,B. 
+#define UB -128 /* -min(128, round(2.018 * 64)) */ +#define UG 25 /* -round(-0.391 * 64) */ +#define VG 52 /* -round(-0.813 * 64) */ +#define VR -102 /* -round(1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BB (UB * 128 - YGB) +#define BG (UG * 128 + VG * 128 - YGB) +#define BR (VR * 128 - YGB) + +static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 }; +static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 }; + +#undef YG +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef BB +#undef BG +#undef BR + +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ + + +#ifdef HAS_I444TOARGBROW_NEON +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV444 + YUV422TORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I444TOARGBROW_NEON + +#ifdef HAS_I422TOARGBROW_NEON +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TOARGBROW_NEON + +#ifdef HAS_I411TOARGBROW_NEON +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV411 + YUV422TORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I411TOARGBROW_NEON + +#ifdef HAS_I422TOBGRAROW_NEON +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v21, v22, v23) + "subs %w4, %w4, #8 \n" + "movi v20.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 
{v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_bgra), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TOBGRAROW_NEON + +#ifdef HAS_I422TOABGRROW_NEON +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v20, v21, v22) + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_abgr), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TOABGRROW_NEON + +#ifdef HAS_I422TORGBAROW_NEON +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v23, v22, v21) + "subs %w4, %w4, #8 \n" + "movi v20.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TORGBAROW_NEON + +#ifdef HAS_I422TORGB24ROW_NEON +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + MEMACCESS(3) + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TORGB24ROW_NEON + +#ifdef HAS_I422TORAWROW_NEON +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v20, v21, v22) + "subs %w4, %w4, #8 \n" + MEMACCESS(3) + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TORAWROW_NEON + +#define ARGBTORGB565 \ + "shll v0.8h, v22.8b, #8 \n" /* R */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "sri v0.8h, v21.8h, #5 \n" /* RG */ \ + "sri v0.8h, v20.8h, #11 \n" /* RGB */ + +#ifdef 
HAS_I422TORGB565ROW_NEON
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (
+    YUV422TORGB_SETUP_REG
+  "1: \n"
+    READYUV422
+    YUV422TORGB(v22, v21, v20)
+    "subs %w4, %w4, #8 \n"
+    ARGBTORGB565
+    MEMACCESS(3)
+    "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+    "b.gt 1b \n"
+  : "+r"(src_y),  // %0
+    "+r"(src_u),  // %1
+    "+r"(src_v),  // %2
+    "+r"(dst_rgb565),  // %3
+    "+r"(width)  // %4
+  : [kUVBiasBGR]"r"(&kUVBiasBGR),
+    [kYToRgb]"r"(&kYToRgb)
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+    "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+#endif  // HAS_I422TORGB565ROW_NEON
+
+#define ARGBTOARGB1555 \
+    "shll v0.8h, v23.8b, #8 \n" /* A */ \
+    "shll v22.8h, v22.8b, #8 \n" /* R */ \
+    "shll v20.8h, v20.8b, #8 \n" /* B */ \
+    "shll v21.8h, v21.8b, #8 \n" /* G */ \
+    "sri v0.8h, v22.8h, #1 \n" /* AR */ \
+    "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
+    "sri v0.8h, v20.8h, #11 \n" /* ARGB */
+
+#ifdef HAS_I422TOARGB1555ROW_NEON
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            int width) {
+  asm volatile (
+    YUV422TORGB_SETUP_REG
+  "1: \n"
+    READYUV422
+    YUV422TORGB(v22, v21, v20)
+    "subs %w4, %w4, #8 \n"
+    "movi v23.8b, #255 \n"
+    ARGBTOARGB1555
+    MEMACCESS(3)
+    "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB1555.
+    "b.gt 1b \n"
+  : "+r"(src_y),  // %0
+    "+r"(src_u),  // %1
+    "+r"(src_v),  // %2
+    "+r"(dst_argb1555),  // %3
+    "+r"(width)  // %4
+  : [kUVBiasBGR]"r"(&kUVBiasBGR),
+    [kYToRgb]"r"(&kYToRgb)
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+    "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+#endif  // HAS_I422TOARGB1555ROW_NEON
+
+#define ARGBTOARGB4444 \
+    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
+    "ushr v20.8b, v20.8b, #4 \n" /* B */ \
+    "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
+    "ushr v22.8b, v22.8b, #4 \n" /* R */ \
+    "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
+    "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
+    "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
+    "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
+
+#ifdef HAS_I422TOARGB4444ROW_NEON
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            int width) {
+  asm volatile (
+    YUV422TORGB_SETUP_REG
+    "movi v4.16b, #0x0f \n" // bits to clear with bic.
+  "1: \n"
+    READYUV422
+    YUV422TORGB(v22, v21, v20)
+    "subs %w4, %w4, #8 \n"
+    "movi v23.8b, #255 \n"
+    ARGBTOARGB4444
+    MEMACCESS(3)
+    "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TOARGB4444ROW_NEON + +#ifdef HAS_I400TOARGBROW_NEON +void I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + int64 width64 = (int64)(width); + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV400 + YUV422TORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(1) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width64) // %2 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I400TOARGBROW_NEON + +#ifdef HAS_J400TOARGBROW_NEON +void J400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + "movi v23.8b, #255 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v20.8b}, [%0], #8 \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %w2, %w2, #8 \n" + MEMACCESS(1) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_J400TOARGBROW_NEON + +#ifdef HAS_NV12TOARGBROW_NEON +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV12 + YUV422TORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(2) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV12TOARGBROW_NEON + +#ifdef HAS_NV21TOARGBROW_NEON +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV21 + YUV422TORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(2) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV21TOARGBROW_NEON + +#ifdef HAS_NV12TORGB565ROW_NEON +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV12 + YUV422TORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV12TORGB565ROW_NEON + +#ifdef HAS_NV21TORGB565ROW_NEON +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV21 + YUV422TORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV21TORGB565ROW_NEON + +#ifdef HAS_YUY2TOARGBROW_NEON +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + int64 width64 = (int64)(width); + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUY2 + YUV422TORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(1) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width64) // %2 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_YUY2TOARGBROW_NEON + +#ifdef HAS_UYVYTOARGBROW_NEON +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + int64 width64 = (int64)(width); + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READUYVY + YUV422TORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "movi v23.8b, #255 \n" + MEMACCESS(1) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width64) // %2 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_UYVYTOARGBROW_NEON + +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +#ifdef HAS_SPLITUVROW_NEON +void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "subs %w3, %w3, #16 \n" // 16 processed per loop + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store U + MEMACCESS(2) + "st1 {v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_SPLITUVROW_NEON + +// Reads 16 U's and V's and writes out 16 pairs of UV. 
+#ifdef HAS_MERGEUVROW_NEON
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ MEMACCESS(2)
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ :
+ "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+#endif // HAS_MERGEUVROW_NEON
+
+// Copies multiples of 32 bytes. vld4.8 allows unaligned access and is fastest on A15.
+#ifdef HAS_COPYROW_NEON
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ MEMACCESS(1)
+ "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
+#endif // HAS_COPYROW_NEON
+
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
+ asm volatile (
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "v0"
+ );
+}
+
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "v0"
+ );
+}
+
+#ifdef HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ int64 width64 = (int64) width;
+ asm volatile (
+ // Start at end of source row.
+ "add %0, %0, %2 \n"
+ "sub %0, %0, #16 \n"
+
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %2, %2, #16 \n" // 16 pixels per loop.
+ "rev64 v0.16b, v0.16b \n"
+ MEMACCESS(1)
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
+ MEMACCESS(1)
+ "st1 {v0.D}[0], [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width64) // %2
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0"
+ );
+}
+#endif // HAS_MIRRORROW_NEON
+
+#ifdef HAS_MIRRORUVROW_NEON
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ int64 width64 = (int64) width;
+ asm volatile (
+ // Start at end of source row.
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, %0, #16 \n"
+
+ "1: \n"
+ MEMACCESS(0)
+ "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
+ "subs %3, %3, #8 \n" // 8 pixels per loop.
+ "rev64 v0.8b, v0.8b \n"
+ "rev64 v1.8b, v1.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // dst += 8
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width64) // %3
+ : "r"((ptrdiff_t)-16) // %4
+ : "cc", "memory", "v0", "v1"
+ );
+}
+#endif // HAS_MIRRORUVROW_NEON
+
+#ifdef HAS_ARGBMIRRORROW_NEON
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ int64 width64 = (int64) width;
+ asm volatile (
+ // Start at end of source row.
+ "add %0, %0, %2, lsl #2 \n" + "sub %0, %0, #16 \n" + + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %2, %2, #4 \n" // 4 pixels per loop. + "rev64 v0.4s, v0.4s \n" + MEMACCESS(1) + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + MEMACCESS(1) + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width64) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0" + ); +} +#endif // HAS_ARGBMIRRORROW_NEON + +#ifdef HAS_RGB24TOARGBROW_NEON +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { + asm volatile ( + "movi v4.8b, #255 \n" // Alpha + "1: \n" + MEMACCESS(0) + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} +#endif // HAS_RGB24TOARGBROW_NEON + +#ifdef HAS_RAWTOARGBROW_NEON +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile ( + "movi v5.8b, #255 \n" // Alpha + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + MEMACCESS(1) + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} +#endif // HAS_RAWTOARGBROW_NEON + +#define RGB565TOARGB \ + "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ + "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ + "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ + "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ + "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ + "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ + "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ + "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ + "dup v2.2D, v0.D[1] \n" /* R */ + +#ifdef HAS_RGB565TOARGBROW_NEON +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { + asm volatile ( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ RGB565TOARGB + MEMACCESS(1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List + ); +} +#endif // HAS_RGB565TOARGBROW_NEON + +#define ARGB1555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ + \ + "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ + "xtn2 v3.16b, v2.8h \n" \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ + "dup v1.2D, v0.D[1] \n" \ + "dup v3.2D, v2.D[1] \n" + +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ + "dup v1.2D, v0.D[1] \n" /* G */ \ + +#ifdef HAS_ARGB1555TOARGBROW_NEON +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + asm volatile ( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + MEMACCESS(1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_ARGB1555TOARGBROW_NEON + +#define ARGB4444TOARGB \ + "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ + "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ + "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ + "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ + "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ + "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ + "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ + "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ + "dup v0.2D, v2.D[1] \n" \ + "dup v1.2D, v3.D[1] \n" + +#ifdef HAS_ARGB4444TOARGBROW_NEON +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ ARGB4444TOARGB + MEMACCESS(1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} +#endif // HAS_ARGB4444TOARGBROW_NEON + +#ifdef HAS_ARGBTORGB24ROW_NEON +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + MEMACCESS(1) + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} +#endif // HAS_ARGBTORGB24ROW_NEON + +#ifdef HAS_ARGBTORAWROW_NEON +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "orr v5.8b, v1.8b, v1.8b \n" // mov b + MEMACCESS(1) + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} +#endif // HAS_ARGBTORAWROW_NEON + +#ifdef HAS_YUY2TOYROW_NEON +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_YUY2TOYROW_NEON + +#ifdef HAS_UYVYTOYROW_NEON +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} +#endif // HAS_UYVYTOYROW_NEON + +#ifdef HAS_YUY2TOUV422ROW_NEON +void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + MEMACCESS(2) + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_YUY2TOUV422ROW_NEON + +#ifdef HAS_UYVYTOUV422ROW_NEON +void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + MEMACCESS(2) + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. 
+ "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_UYVYTOUV422ROW_NEON + +#ifdef HAS_YUY2TOUVROW_NEON +void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + MEMACCESS(3) + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v5", "v6", "v7" // Clobber List + ); +} +#endif // HAS_YUY2TOUVROW_NEON + +#ifdef HAS_UYVYTOUVROW_NEON +void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_uyvyb = src_uyvy + stride_uyvy; + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + MEMACCESS(3) + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(src_uyvyb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v5", "v6", "v7" // Clobber List + ); +} +#endif // HAS_UYVYTOUVROW_NEON + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +#ifdef HAS_ARGBSHUFFLEROW_NEON +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + MEMACCESS(3) + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "subs %w2, %w2, #4 \n" // 4 processed per loop + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store 4. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} +#endif // HAS_ARGBSHUFFLEROW_NEON + +#ifdef HAS_I422TOYUY2ROW_NEON +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "orr v2.8b, v1.8b, v1.8b \n" + MEMACCESS(1) + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + MEMACCESS(2) + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + MEMACCESS(3) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 
+ "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_I422TOYUY2ROW_NEON + +#ifdef HAS_I422TOUYVYROW_NEON +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "orr v3.8b, v2.8b, v2.8b \n" + MEMACCESS(1) + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + MEMACCESS(2) + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + MEMACCESS(3) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_I422TOUYVYROW_NEON + +#ifdef HAS_ARGBTORGB565ROW_NEON +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_ARGBTORGB565ROW_NEON + +#ifdef HAS_ARGBTORGB565DITHERROW_NEON +void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width) { + asm volatile ( + "dup v1.4s, %w2 \n" // dither4 + "1: \n" + MEMACCESS(1) + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v20.8b, v20.8b, v1.8b \n" + "uqadd v21.8b, v21.8b, v1.8b \n" + "uqadd v22.8b, v22.8b, v1.8b \n" + ARGBTORGB565 + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_ARGBTORGB565ROW_NEON + +#ifdef HAS_ARGBTOARGB1555ROW_NEON +void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, + int pix) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_ARGBTOARGB1555ROW_NEON + +#ifdef HAS_ARGBTOARGB4444ROW_NEON +void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, + int pix) { + asm volatile ( + "movi v4.16b, #0x0f \n" // bits to clear with vbic. + "1: \n" + MEMACCESS(0) + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. 
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_ARGBTOARGB4444ROW_NEON + +#ifdef HAS_ARGBTOYROW_NEON +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBTOYROW_NEON + +#ifdef HAS_ARGBTOYJROW_NEON +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #15 \n" // B * 0.11400 coefficient + "movi v5.8b, #75 \n" // G * 0.58700 coefficient + "movi v6.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" + ); +} +#endif // HAS_ARGBTOYJROW_NEON + +// 8x1 pixels. +#ifdef HAS_ARGBTOUV444ROW_NEON +void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", + "v24", "v25", "v26", "v27", "v28", "v29" + ); +} +#endif // HAS_ARGBTOUV444ROW_NEON + +// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
+#ifdef HAS_ARGBTOUV422ROW_NEON +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "mul v3.8h, v0.8h, v20.8h \n" // B + "mls v3.8h, v1.8h, v21.8h \n" // G + "mls v3.8h, v2.8h, v22.8h \n" // R + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + + "mul v4.8h, v2.8h, v20.8h \n" // R + "mls v4.8h, v1.8h, v24.8h \n" // G + "mls v4.8h, v0.8h, v23.8h \n" // B + "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUV422ROW_NEON + +// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. +#ifdef HAS_ARGBTOUV411ROW_NEON +void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. + "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. + "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. + "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w3, %w3, #32 \n" // 32 processed per loop. + "mul v3.8h, v0.8h, v20.8h \n" // B + "mls v3.8h, v1.8h, v21.8h \n" // G + "mls v3.8h, v2.8h, v22.8h \n" // R + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "mul v4.8h, v2.8h, v20.8h \n" // R + "mls v4.8h, v1.8h, v24.8h \n" // G + "mls v4.8h, v0.8h, v23.8h \n" // B + "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUV411ROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
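+// In the two-row functions below, uaddlp sums horizontal pixel pairs, uadalp
+// accumulates the matching pairs from the next row, and urshr #1 rounds the
+// 2x2 sum back to the 9-bit scale of a 2-pixel sum, so the same halved
+// coefficients apply. A sketch of the equivalent scalar averaging:
+//   b = (b00 + b01 + b10 + b11 + 1) >> 1;  // 2x2 block, 0..510
+// which then feeds the RGBTOUV formula exactly as in the 16x1 case above.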
+#define RGBTOUV(QB, QG, QR) \ + "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ + "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ + "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ + "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ + +// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. +// TODO(fbarchard): consider ptrdiff_t for all strides. + +#ifdef HAS_ARGBTOUVROW_NEON +void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb_1 = src_argb + src_stride_argb; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUVROW_NEON + +// TODO(fbarchard): Subsample match C code. +#ifdef HAS_ARGBTOUVJROW_NEON +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb_1 = src_argb + src_stride_argb; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUVJROW_NEON + +#ifdef HAS_BGRATOUVROW_NEON +void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_bgra_1 = src_bgra + src_stride_bgra; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more + "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v3.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(src_bgra_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_BGRATOUVROW_NEON + +#ifdef HAS_ABGRTOUVROW_NEON +void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_abgr_1 = src_abgr + src_stride_abgr; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v3.8h, #1 \n" // 2x average + "urshr v2.8h, v2.8h, #1 \n" + "urshr v1.8h, v1.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v2.8h, v1.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_abgr_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ABGRTOUVROW_NEON + +#ifdef HAS_RGBATOUVROW_NEON +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_rgba_1 = src_rgba + src_stride_rgba; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. 
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_rgba_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_RGBATOUVROW_NEON + +#ifdef HAS_RGB24TOUVROW_NEON +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. + "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v0.8h, v1.8h, v2.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_rgb24_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_RGB24TOUVROW_NEON + +#ifdef HAS_RAWTOUVROW_NEON +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_raw_1 = src_raw + src_stride_raw; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) + "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + + "urshr v2.8h, v2.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v0.8h, v0.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 32 processed per loop. + RGBTOUV(v2.8h, v1.8h, v0.8h) + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_raw_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_RAWTOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 
+#ifdef HAS_RGB565TOUVROW_NEON +void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; + asm volatile ( + "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 + "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 + "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 + "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 + "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 + "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v17.D[0] \n" + "ins v18.D[1], v19.D[0] \n" + "ins v20.D[1], v21.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v18.8h, #1 \n" + "urshr v6.8h, v20.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v16.8h, v4.8h, v22.8h \n" // B + "mls v16.8h, v5.8h, v23.8h \n" // G + "mls v16.8h, v6.8h, v24.8h \n" // R + "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned + "mul v17.8h, v6.8h, v22.8h \n" // R + "mls v17.8h, v5.8h, v26.8h \n" // G + "mls v17.8h, v4.8h, v25.8h \n" // B + "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_rgb565_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27" + ); +} +#endif // HAS_RGB565TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB1555TOUVROW_NEON +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_argb1555_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28" + ); +} +#endif // HAS_ARGB1555TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB4444TOUVROW_NEON +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix) { + const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. 
+ "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + MEMACCESS(3) + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_argb4444_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28" + + ); +} +#endif // HAS_ARGB4444TOUVROW_NEON + +#ifdef HAS_RGB565TOYROW_NEON +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { + asm volatile ( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", + "v24", "v25", "v26", "v27" + ); +} +#endif // HAS_RGB565TOYROW_NEON + +#ifdef HAS_ARGB1555TOYROW_NEON +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGB1555TOYROW_NEON + +#ifdef HAS_ARGB4444TOYROW_NEON +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { + asm volatile ( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" + ); +} +#endif // HAS_ARGB4444TOYROW_NEON + +#ifdef HAS_BGRATOYROW_NEON +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_BGRATOYROW_NEON + +#ifdef HAS_ABGRTOYROW_NEON +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_ABGRTOYROW_NEON + +#ifdef HAS_RGBATOYROW_NEON +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_RGBATOYROW_NEON + +#ifdef HAS_RGB24TOYROW_NEON +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_RGB24TOYROW_NEON + +#ifdef HAS_RAWTOYROW_NEON +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { + asm volatile ( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + MEMACCESS(0) + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} +#endif // HAS_RAWTOYROW_NEON + +// Bilinear filter 16x2 -> 16x1 +#ifdef HAS_INTERPOLATEROW_NEON +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + asm volatile ( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" + + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ "100: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 + : + : "cc", "memory", "v0", "v1", "v3", "v4", "v5" + ); +} +#endif // HAS_INTERPOLATEROW_NEON + +// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +#ifdef HAS_ARGBBLENDROW_NEON +void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + "subs %w3, %w3, #8 \n" + "b.lt 89f \n" + // Blend 8 pixels. + "8: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.ge 8b \n" + + "89: \n" + "adds %w3, %w3, #8-1 \n" + "b.lt 99f \n" + + // Blend 1 pixels. + "1: \n" + MEMACCESS(0) + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. + MEMACCESS(1) + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. + "subs %w3, %w3, #1 \n" // 1 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + MEMACCESS(2) + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. + "b.ge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18" + ); +} +#endif // HAS_ARGBBLENDROW_NEON + +// Attenuate 8 pixels at a time. +#ifdef HAS_ARGBATTENUATEROW_NEON +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + MEMACCESS(1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" + ); +} +#endif // HAS_ARGBATTENUATEROW_NEON + +// Quantize 8 ARGB pixels (32 bytes). +// dst = (dst * scale >> 16) * interval_size + interval_offset; +#ifdef HAS_ARGBQUANTIZEROW_NEON +void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + asm volatile ( + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add + + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + MEMACCESS(0) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" + ); +} +#endif // HAS_ARGBQUANTIZEROW_NEON + +// Shade 8 pixels at a time by specified value. +// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. +// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +#ifdef HAS_ARGBSHADEROW_NEON +void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + asm volatile ( + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + MEMACCESS(1) + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "v0", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBSHADEROW_NEON + +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels +// Similar to ARGBToYJ but stores ARGB. 
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+#ifdef HAS_ARGBGRAYROW_NEON
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "movi v24.8b, #15 \n" // B * 0.11400 coefficient
+    "movi v25.8b, #75 \n" // G * 0.58700 coefficient
+    "movi v26.8b, #38 \n" // R * 0.29900 coefficient
+    "1: \n"
+    MEMACCESS(0)
+    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+    "subs %w2, %w2, #8 \n" // 8 processed per loop.
+    "umull v4.8h, v0.8b, v24.8b \n" // B
+    "umlal v4.8h, v1.8b, v25.8b \n" // G
+    "umlal v4.8h, v2.8b, v26.8b \n" // R
+    "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
+    "orr v1.8b, v0.8b, v0.8b \n" // G
+    "orr v2.8b, v0.8b, v0.8b \n" // R
+    MEMACCESS(1)
+    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+    "b.gt 1b \n"
+  : "+r"(src_argb), // %0
+    "+r"(dst_argb), // %1
+    "+r"(width) // %2
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
+  );
+}
+#endif // HAS_ARGBGRAYROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+
+#ifdef HAS_ARGBSEPIAROW_NEON
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+  asm volatile (
+    "movi v20.8b, #17 \n" // BB coefficient
+    "movi v21.8b, #68 \n" // BG coefficient
+    "movi v22.8b, #35 \n" // BR coefficient
+    "movi v24.8b, #22 \n" // GB coefficient
+    "movi v25.8b, #88 \n" // GG coefficient
+    "movi v26.8b, #45 \n" // GR coefficient
+    "movi v28.8b, #24 \n" // RB coefficient
+    "movi v29.8b, #98 \n" // RG coefficient
+    "movi v30.8b, #50 \n" // RR coefficient
+    "1: \n"
+    MEMACCESS(0)
+    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+    "subs %w1, %w1, #8 \n" // 8 processed per loop.
+    "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+    "umlal v4.8h, v1.8b, v21.8b \n" // G
+    "umlal v4.8h, v2.8b, v22.8b \n" // R
+    "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+    "umlal v5.8h, v1.8b, v25.8b \n" // G
+    "umlal v5.8h, v2.8b, v26.8b \n" // R
+    "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+    "umlal v6.8h, v1.8b, v29.8b \n" // G
+    "umlal v6.8h, v2.8b, v30.8b \n" // R
+    "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+    "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+    "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+    MEMACCESS(0)
+    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+    "b.gt 1b \n"
+  : "+r"(dst_argb), // %0
+    "+r"(width) // %1
+  :
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
+  );
+}
+#endif // HAS_ARGBSEPIAROW_NEON
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+#ifdef HAS_ARGBCOLORMATRIXROW_NEON
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
+  asm volatile (
+    MEMACCESS(3)
+    "ld1 {v2.16b}, [%3] \n" // load 16 bytes of color matrix.
+    "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+    "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+
+    "1: \n"
+    MEMACCESS(0)
+    "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
+    "subs %w2, %w2, #8 \n" // 8 processed per loop.
+    "uxtl v16.8h, v16.8b \n" // b (0 ..
255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBCOLORMATRIXROW_NEON + +// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +#ifdef HAS_ARGBMULTIPLYROW_NEON +void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBMULTIPLYROW_NEON + +// Add 2 rows of ARGB pixels together, 8 pixels at a time. 
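+// A plain C sketch of the same saturating add (illustrative only; the names
+// below are hypothetical helpers, not libyuv entry points):
+static uint8 AddClamp255_Sketch(int v) {
+  return (uint8)(v > 255 ? 255 : v);
+}
+static void ARGBAddRow_C_Sketch(const uint8* src_argb0, const uint8* src_argb1,
+                                uint8* dst_argb, int width) {
+  int i;
+  // 4 bytes (B, G, R, A) per pixel; every byte saturates independently,
+  // matching the uqadd used by the NEON loop below.
+  for (i = 0; i < width * 4; ++i) {
+    dst_argb[i] = AddClamp255_Sketch(src_argb0[i] + src_argb1[i]);
+  }
+}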
+#ifdef HAS_ARGBADDROW_NEON +void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBADDROW_NEON + +// Subtract 2 rows of ARGB pixels, 8 pixels at a time. +#ifdef HAS_ARGBSUBTRACTROW_NEON +void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + asm volatile ( + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + MEMACCESS(1) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} +#endif // HAS_ARGBSUBTRACTROW_NEON + +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +#ifdef HAS_SOBELROW_NEON +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + MEMACCESS(1) + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "orr v1.8b, v0.8b, v0.8b \n" + "orr v2.8b, v0.8b, v0.8b \n" + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_SOBELROW_NEON + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +#ifdef HAS_SOBELTOPLANEROW_NEON +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + // 16 pixel loop. + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "uqadd v0.16b, v0.16b, v1.16b \n" // add + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1" + ); +} +#endif // HAS_SOBELTOPLANEROW_NEON + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. 
+// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +#ifdef HAS_SOBELXYROW_NEON +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + MEMACCESS(1) + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v1.8b, v0.8b, v2.8b \n" // add + MEMACCESS(2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" + ); +} +#endif // HAS_SOBELXYROW_NEON + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +#ifdef HAS_SOBELXROW_NEON +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0],%5 \n" // top + MEMACCESS(0) + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + MEMACCESS(1) + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + MEMACCESS(1) + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + MEMACCESS(2) + "ld1 {v2.8b}, [%2],%5 \n" // bottom + MEMACCESS(2) + "ld1 {v3.8b}, [%2],%6 \n" + "subs %w4, %w4, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + MEMACCESS(3) + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2LL), // %5 + "r"(6LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_SOBELXROW_NEON + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +#ifdef HAS_SOBELYROW_NEON +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0],%4 \n" // left + MEMACCESS(1) + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + MEMACCESS(1) + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0],%5 \n" // right + MEMACCESS(1) + "ld1 {v3.8b}, [%1],%5 \n" + "subs %w3, %w3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + MEMACCESS(2) + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1LL), // %4 + "r"(6LL) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} +#endif // HAS_SOBELYROW_NEON +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/row_win.cc b/libs/libaom/src/third_party/libyuv/source/row_win.cc new file mode 100644 index 000000000..71be268b4 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/row_win.cc @@ -0,0 +1,6331 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
+    defined(_MSC_VER) && !defined(__clang__)
+#include <emmintrin.h>
+#include <tmmintrin.h>  // For _mm_maddubs_epi16
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
+    defined(_MSC_VER) && !defined(__clang__)
+
+struct YuvConstants {
+  lvec8 kUVToB; // 0
+  lvec8 kUVToG; // 32
+  lvec8 kUVToR; // 64
+  lvec16 kUVBiasB; // 96
+  lvec16 kUVBiasG; // 128
+  lvec16 kUVBiasR; // 160
+  lvec16 kYToRgb; // 192
+};
+
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
+// BT601 constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
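+// (These constants are applied per pixel roughly as
+//    y1 = (y * 0x0101 * YGJ) >> 16
+//    B  = clamp((BBJ - (u * UBJ) + y1) >> 6), and similarly for G and R;
+//  the same fixed-point scheme as the BT.601 path above. This is a sketch of
+//  the math, not a verbatim copy of the C reference code.)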
+// TODO(fbarchard): Consider moving constants into a common header. +#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ +#define YGBJ 32 /* 64 / 2 */ + +// U and V contributions to R,G,B. +#define UBJ -113 /* round(-1.77200 * 64) */ +#define UGJ 22 /* round(0.34414 * 64) */ +#define VGJ 46 /* round(0.71414 * 64) */ +#define VRJ -90 /* round(-1.40200 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BBJ (UBJ * 128 + YGBJ) +#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) +#define BRJ (VRJ * 128 + YGBJ) + +// JPEG constants for YUV to RGB. +static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { + { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, + UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, + { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, + UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, + UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, + UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, + { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, + 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, + { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, + BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, + { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, + BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, + { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, + BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, + { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, + YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } +}; + +#undef YGJ +#undef YGBJ +#undef UBJ +#undef UGJ +#undef VGJ +#undef VRJ +#undef BBJ +#undef BGJ +#undef BRJ + +// 64 bit +#if defined(_M_X64) +#if defined(HAS_I422TOARGBROW_SSSE3) +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __m128i xmm0, xmm1, xmm2, xmm3; + const __m128i xmm5 = _mm_set1_epi8(-1); + const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + + while (width > 0) { + xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); + xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); + xmm1 = _mm_loadu_si128(&xmm0); + xmm2 = _mm_loadu_si128(&xmm0); + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB); + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG); + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR); + xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0); + xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1); + xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2); + xmm3 = _mm_loadl_epi64((__m128i*)y_buf); + xmm3 = _mm_unpacklo_epi8(xmm3, xmm3); + xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb); + xmm0 = _mm_adds_epi16(xmm0, xmm3); + xmm1 = _mm_adds_epi16(xmm1, xmm3); + xmm2 = _mm_adds_epi16(xmm2, xmm3); + xmm0 = _mm_srai_epi16(xmm0, 6); + xmm1 = _mm_srai_epi16(xmm1, 6); + xmm2 = _mm_srai_epi16(xmm2, 6); + xmm0 = _mm_packus_epi16(xmm0, xmm0); + xmm1 = _mm_packus_epi16(xmm1, xmm1); + xmm2 = _mm_packus_epi16(xmm2, xmm2); + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); + xmm1 = _mm_loadu_si128(&xmm0); + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); + + _mm_storeu_si128((__m128i *)dst_argb, xmm0); + _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); + + y_buf += 8; + u_buf += 4; + dst_argb += 32; + width -= 8; + } +} +#endif +// 32 bit +#else // defined(_M_X64) +#ifdef HAS_ARGBTOYROW_SSSE3 + +// Constants for ARGB. 
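+// (Each vec8 below is a pmaddubsw operand: per-byte weights for one B,G,R,A
+// pixel, repeated four times. For example, kARGBToY computes
+// Y = ((13 * B + 65 * G + 33 * R) >> 7) + 16 once kAddY16 is added.)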
+static const vec8 kARGBToY = { + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 +}; + +// JPeg full range. +static const vec8 kARGBToYJ = { + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 +}; + +static const vec8 kARGBToU = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + +static const vec8 kARGBToUJ = { + 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 +}; + +static const vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +static const vec8 kARGBToVJ = { + -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 +}; + +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +}; + +// Constants for BGRA. +static const vec8 kBGRAToY = { + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 +}; + +static const vec8 kBGRAToU = { + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 +}; + +static const vec8 kBGRAToV = { + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 +}; + +// Constants for ABGR. +static const vec8 kABGRToY = { + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 +}; + +static const vec8 kABGRToU = { + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 +}; + +static const vec8 kABGRToV = { + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 +}; + +// Constants for RGBA. +static const vec8 kRGBAToY = { + 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 +}; + +static const vec8 kRGBAToU = { + 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 +}; + +static const vec8 kRGBAToV = { + 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 +}; + +static const uvec8 kAddY16 = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u +}; + +// 7 bit fixed point 0.5. +static const vec16 kAddYJ64 = { + 64, 64, 64, 64, 64, 64, 64, 64 +}; + +static const uvec8 kAddUV128 = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; + +static const uvec16 kAddUVJ128 = { + 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u +}; + +// Shuffle table for converting RGB24 to ARGB. +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u +}; + +// Shuffle table for converting RAW to ARGB. +static const uvec8 kShuffleMaskRAWToARGB = { + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u +}; + +// Shuffle table for converting ARGB to RGB24. +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u +}; + +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u +}; + +// Shuffle table for converting ARGB to RAW. +static const uvec8 kShuffleMaskARGBToRAW_0 = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u +}; + +// Duplicates gray value 3 times and fills in alpha opaque. 
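+// A scalar sketch of this conversion (illustrative only;
+// J400ToARGBRow_C_Sketch is a hypothetical name, not a libyuv entry point):
+static void J400ToARGBRow_C_Sketch(const uint8* src_y, uint8* dst_argb,
+                                   int pix) {
+  int i;
+  for (i = 0; i < pix; ++i) {
+    uint8 y = src_y[i];
+    dst_argb[0] = y;     // B
+    dst_argb[1] = y;     // G
+    dst_argb[2] = y;     // R
+    dst_argb[3] = 255u;  // opaque alpha, as in the 0xff000000 mask below
+    dst_argb += 4;
+  }
+}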
+__declspec(naked) +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + + convertloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm5 + por xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} + +#ifdef HAS_J400TOARGBROW_AVX2 +// Duplicates gray value 3 times and fills in alpha opaque. +__declspec(naked) +void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + convertloop: + vmovdqu xmm0, [eax] + lea eax, [eax + 16] + vpermq ymm0, ymm0, 0xd8 + vpunpcklbw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + vpunpckhwd ymm1, ymm0, ymm0 + vpunpcklwd ymm0, ymm0, ymm0 + vpor ymm0, ymm0, ymm5 + vpor ymm1, ymm1, ymm5 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_J400TOARGBROW_AVX2 + +__declspec(naked) +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_rgb24 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, kShuffleMaskRGB24ToARGB + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqu [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqu [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqu [edx + 16], xmm1 + por xmm3, xmm5 + movdqu [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, + int pix) { + __asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, kShuffleMaskRAWToARGB + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqu [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqu [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqu [edx + 16], xmm1 + por xmm3, xmm5 + movdqu [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + ret + } +} + +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +// 20 instructions. 
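+// For example, widening the 5-bit blue field of one RGB565 pixel with the
+// multiply trick described above (a sketch; Expand5To8_Sketch is a
+// hypothetical helper, not a function the library exposes):
+static uint8 Expand5To8_Sketch(uint16 rgb565) {
+  uint16 b5 = rgb565 & 0x1f;  // low 5 bits hold blue
+  // (b5 << 8 | b5 << 3) >> 5 == (b5 * (256 + 8)) >> 5 == (b5 << 3) | (b5 >> 2)
+  return (uint8)((b5 * 0x0108) >> 5);
+}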
+__declspec(naked) +void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green + psllw xmm4, 10 + psrlw xmm4, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgr565 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + pand xmm1, xmm3 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + pand xmm0, xmm4 // G in middle 6 bits + pmulhuw xmm0, xmm6 // << 5 * (256 + 4) + por xmm0, xmm7 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +#ifdef HAS_RGB565TOARGBROW_AVX2 +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +__declspec(naked) +void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + vmovd xmm5, eax + vbroadcastss ymm5, xmm5 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + movd xmm6, eax + vbroadcastss ymm6, xmm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpsllw ymm3, ymm3, 11 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green + vpsllw ymm4, ymm4, 10 + vpsrlw ymm4, ymm4, 5 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsllw ymm7, ymm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 + vpand ymm1, ymm0, ymm3 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpsllw ymm1, ymm1, 8 + vpor ymm1, ymm1, ymm2 // RB + vpand ymm0, ymm0, ymm4 // G in middle 6 bits + vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) + vpor ymm0, ymm0, ymm7 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm1, ymm1, 0xd8 + vpunpckhbw ymm2, ymm1, ymm0 + vpunpcklbw ymm1, ymm1, ymm0 + vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_RGB565TOARGBROW_AVX2 + +#ifdef HAS_ARGB1555TOARGBROW_AVX2 +__declspec(naked) +void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + vmovd xmm5, eax + vbroadcastss ymm5, xmm5 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + movd xmm6, eax + vbroadcastss ymm6, xmm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + 
vpsllw ymm3, ymm3, 11 + vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsllw ymm7, ymm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 + vpsllw ymm1, ymm0, 1 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpand ymm1, ymm1, ymm3 + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpsllw ymm1, ymm1, 8 + vpor ymm1, ymm1, ymm2 // RB + vpsraw ymm2, ymm0, 8 // A + vpand ymm0, ymm0, ymm4 // G in middle 5 bits + vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) + vpand ymm2, ymm2, ymm7 + vpor ymm0, ymm0, ymm2 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm1, ymm1, 0xd8 + vpunpckhbw ymm2, ymm1, ymm0 + vpunpcklbw ymm1, ymm1, ymm0 + vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGB1555TOARGBROW_AVX2 + +#ifdef HAS_ARGB4444TOARGBROW_AVX2 +__declspec(naked) +void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + vmovd xmm4, eax + vbroadcastss ymm4, xmm4 + vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 + vpand ymm2, ymm0, ymm5 // mask high nibbles + vpand ymm0, ymm0, ymm4 // mask low nibbles + vpsrlw ymm3, ymm2, 4 + vpsllw ymm1, ymm0, 4 + vpor ymm2, ymm2, ymm3 + vpor ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm2, ymm2, 0xd8 + vpunpckhbw ymm1, ymm0, ymm2 + vpunpcklbw ymm0, ymm0, ymm2 + vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGB4444TOARGBROW_AVX2 + +// 24 instructions +__declspec(naked) +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green + psrlw xmm4, 6 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of 1555 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pand xmm1, xmm3 + pmulhuw xmm2, xmm5 // * (256 + 8) + pmulhuw xmm1, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + movdqa xmm2, xmm0 + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm2, xmm7 + por xmm0, xmm2 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqu [eax * 2 + 
edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +// 18 instructions. +__declspec(naked) +void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + movd xmm4, eax + pshufd xmm4, xmm4, 0 + movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles + pslld xmm5, 4 + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 + movdqa xmm2, xmm0 + pand xmm0, xmm4 // mask low nibbles + pand xmm2, xmm5 // mask high nibbles + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + psllw xmm1, 4 + psrlw xmm3, 4 + por xmm0, xmm1 + por xmm2, xmm3 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + jg convertloop + ret + } +} + +__declspec(naked) +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + movdqa xmm6, kShuffleMaskARGBToRGB24 + + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + movdqa xmm6, kShuffleMaskARGBToRAW + + convertloop: + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm6 + pshufb xmm2, xmm6 + pshufb xmm3, xmm6 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqu [edx], xmm0 // store 0 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 + lea edx, [edx + 48] + sub ecx, 16 + jg convertloop + ret + } +} + +// 4 pixels +__declspec(naked) +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 
27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +// 8 pixels +__declspec(naked) +void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix) { + __asm { + + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + movd xmm6, [esp + 12] // dither4 + mov ecx, [esp + 16] // pix + punpcklbw xmm6, xmm6 // make dither 16 bytes + movdqa xmm7, xmm6 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + paddusb xmm0, xmm6 // add dither + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 +__declspec(naked) +void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + vbroadcastss xmm6, [esp + 12] // dither4 + mov ecx, [esp + 16] // pix + vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes + vpermq ymm6, ymm6, 0xd8 + vpunpcklwd ymm6, ymm6, ymm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpaddusb ymm0, ymm0, ymm6 // add dither + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackusdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565DITHERROW_AVX2 + +// TODO(fbarchard): Improve sign extension/packing. 
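+// A scalar sketch of the packing done below (illustrative only;
+// ARGBToARGB1555Pixel_Sketch is a hypothetical helper):
+static uint16 ARGBToARGB1555Pixel_Sketch(const uint8* argb) {
+  return (uint16)(((argb[3] >> 7) << 15) |  // A: 1 bit
+                  ((argb[2] >> 3) << 10) |  // R: 5 bits
+                  ((argb[1] >> 3) << 5) |   // G: 5 bits
+                  (argb[0] >> 3));          // B: 5 bits
+}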
+__declspec(naked) +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm4, xmm4 // generate mask 0x0000001f + psrld xmm4, 27 + movdqa xmm5, xmm4 // generate mask 0x000003e0 + pslld xmm5, 5 + movdqa xmm6, xmm4 // generate mask 0x00007c00 + pslld xmm6, 10 + pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 + pslld xmm7, 15 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + movdqa xmm3, xmm0 // R + psrad xmm0, 16 // A + psrld xmm1, 3 // B + psrld xmm2, 6 // G + psrld xmm3, 9 // R + pand xmm0, xmm7 // A + pand xmm1, xmm4 // B + pand xmm2, xmm5 // G + pand xmm3, xmm6 // R + por xmm0, xmm1 // BA + por xmm2, xmm3 // GR + por xmm0, xmm2 // BGRA + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +__declspec(naked) +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + psllw xmm4, 12 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 + psrlw xmm3, 8 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble + psrld xmm0, 4 + psrld xmm1, 8 + por xmm0, xmm1 + packuswb xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTORGB565ROW_AVX2 +__declspec(naked) +void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackusdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565ROW_AVX2 + +#ifdef HAS_ARGBTOARGB1555ROW_AVX2 +__declspec(naked) +void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm4, ymm4, ymm4 + vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f + vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 + vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 + vpslld ymm7, ymm7, 15 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm3, ymm0, 9 // R + vpsrld ymm2, ymm0, 6 // G + vpsrld ymm1, ymm0, 3 // B + vpsrad ymm0, ymm0, 16 // A + vpand ymm3, ymm3, ymm6 // R + vpand ymm2, ymm2, ymm5 // G + vpand ymm1, ymm1, ymm4 // B + vpand ymm0, ymm0, ymm7 // A + vpor ymm0, ymm0, ymm1 // 
BA + vpor ymm2, ymm2, ymm3 // GR + vpor ymm0, ymm0, ymm2 // BGRA + vpackssdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB1555ROW_AVX2 + +#ifdef HAS_ARGBTOARGB4444ROW_AVX2 +__declspec(naked) +void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 + vpsllw ymm4, ymm4, 12 + vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpand ymm1, ymm0, ymm4 // high nibble + vpand ymm0, ymm0, ymm3 // low nibble + vpsrld ymm1, ymm1, 8 + vpsrld ymm0, ymm0, 4 + vpor ymm0, ymm0, ymm1 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB4444ROW_AVX2 + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +__declspec(naked) +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kARGBToY + movdqa xmm5, kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. +// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. +__declspec(naked) +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kARGBToYJ + movdqa xmm5, kAddYJ64 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + paddw xmm0, xmm5 // Add .5 for rounding. + paddw xmm2, xmm5 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTOYROW_AVX2 +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = { + 0, 4, 1, 5, 2, 6, 3, 7 +}; + +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +__declspec(naked) +void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + vbroadcastf128 ymm4, kARGBToY + vbroadcastf128 ymm5, kAddY16 + vmovdqu ymm6, kPermdARGBToY_AVX + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. 
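+    // (On AVX2, vphaddw and vpackuswb operate within 128-bit lanes, so the
+    // results come out interleaved; the vpermd with kPermdARGBToY_AVX below
+    // puts the 32 Y values back in order.)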
+ vphaddw ymm2, ymm2, ymm3 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. + vpaddb ymm0, ymm0, ymm5 // add 16 for Y + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYROW_AVX2 + +#ifdef HAS_ARGBTOYJROW_AVX2 +// Convert 32 ARGB pixels (128 bytes) to 32 Y values. +__declspec(naked) +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + vbroadcastf128 ymm4, kARGBToYJ + vbroadcastf128 ymm5, kAddYJ64 + vmovdqu ymm6, kPermdARGBToY_AVX + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpmaddubsw ymm0, ymm0, ymm4 + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + lea eax, [eax + 128] + vphaddw ymm0, ymm0, ymm1 // mutates. + vphaddw ymm2, ymm2, ymm3 + vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. + vpaddw ymm2, ymm2, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm2, ymm2, 7 + vpackuswb ymm0, ymm0, ymm2 // mutates. + vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBTOYJROW_AVX2 + +__declspec(naked) +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kBGRAToY + movdqa xmm5, kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kABGRToY + movdqa xmm5, kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm4, kRGBAToY + movdqa xmm5, kAddY16 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int 
src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm5, kAddUV128 + movdqa xmm6, kARGBToV + movdqa xmm7, kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm5, kAddUVJ128 + movdqa xmm6, kARGBToVJ + movdqa xmm7, kARGBToUJ + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + paddw xmm0, xmm5 // +.5 rounding -> unsigned + paddw xmm1, xmm5 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBTOUVROW_AVX2 +__declspec(naked) +void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // 
src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vbroadcastf128 ymm5, kAddUV128 + vbroadcastf128 ymm6, kARGBToV + vbroadcastf128 ymm7, kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 32x2 argb pixels to 16x1 */ + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vpavgb ymm2, ymm2, [eax + esi + 64] + vpavgb ymm3, ymm3, [eax + esi + 96] + lea eax, [eax + 128] + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd + vpavgb ymm0, ymm0, ymm4 // mutated by vshufps + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd + vpavgb ymm2, ymm2, ymm4 // mutated by vshufps + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 + vphaddw ymm1, ymm1, ymm3 // mutates + vphaddw ymm0, ymm0, ymm2 + vpsraw ymm1, ymm1, 8 + vpsraw ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 // mutates + vpermq ymm0, ymm0, 0xd8 // For vpacksswb + vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw + vpaddb ymm0, ymm0, ymm5 // -> unsigned + + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBTOUVROW_AVX2 + +__declspec(naked) +void ARGBToUV444Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm5, kAddUV128 + movdqa xmm6, kARGBToV + movdqa xmm7, kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* convert to U and V */ + movdqu xmm0, [eax] // U + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqu [edx], xmm0 + + movdqu xmm0, [eax] // V + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm6 + pmaddubsw xmm1, xmm6 + pmaddubsw xmm2, xmm6 + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psraw xmm0, 8 + psraw xmm2, 8 + packsswb xmm0, xmm2 + paddb xmm0, xmm5 + lea eax, [eax + 64] + movdqu [edx + edi], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) +void ARGBToUV422Row_SSSE3(const uint8* src_argb0, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm5, kAddUV128 + movdqa xmm6, kARGBToV + movdqa xmm7, kARGBToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps 
xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm5, kAddUV128 + movdqa xmm6, kBGRAToV + movdqa xmm7, kBGRAToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm5, kAddUV128 + movdqa xmm6, kABGRToV + movdqa xmm7, kABGRToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + 
movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm5, kAddUV128 + movdqa xmm6, kRGBAToV + movdqa xmm7, kRGBAToU + sub edi, edx // stride from u to v + + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm1, [eax + 16] + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm2, [eax + 32] + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBTOYROW_SSSE3 + +// Read 16 UV from 444 +#define READYUV444_AVX2 __asm { \ + __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpermq ymm1, ymm1, 0xd8 \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + } + +// Read 8 UV from 422, upsample to 16 UV. +#define READYUV422_AVX2 __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + } + +// Read 4 UV from 411, upsample to 16 UV. +#define READYUV411_AVX2 __asm { \ + __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \ + __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 4] \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ + } + +// Read 8 UV from NV12, upsample to 16 UV. 
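+// NV12 stores chroma as a single interleaved plane (U0 V0 U1 V1 ...), so one
+// 16-byte load already yields byte-paired UV; only the horizontal 2x upsample
+// (the vpunpcklwd against itself below) is still required. As an illustrative
+// scalar sketch (not code from this file):
+//   for (int x = 0; x < width; x += 2) {
+//     uint8 u = src_uv[x];      // even byte: U for pixels x and x + 1
+//     uint8 v = src_uv[x + 1];  // odd byte: V for pixels x and x + 1
+//   }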
+#define READNV12_AVX2 __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + } + +// Convert 16 pixels: 16 UV and 16 Y. +#define YUVTORGB_AVX2(YuvConstants) __asm { \ + /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \ + __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \ + __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \ + __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \ + __asm vmovdqu ymm3, YuvConstants.kUVBiasR \ + __asm vpsubw ymm2, ymm3, ymm2 \ + __asm vmovdqu ymm3, YuvConstants.kUVBiasG \ + __asm vpsubw ymm1, ymm3, ymm1 \ + __asm vmovdqu ymm3, YuvConstants.kUVBiasB \ + __asm vpsubw ymm0, ymm3, ymm0 \ + /* Step 2: Find Y contribution to 16 R,G,B values */ \ + __asm vmovdqu xmm3, [eax] /* NOLINT */ \ + __asm lea eax, [eax + 16] \ + __asm vpermq ymm3, ymm3, 0xd8 \ + __asm vpunpcklbw ymm3, ymm3, ymm3 \ + __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \ + __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \ + __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \ + __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \ + __asm vpsraw ymm0, ymm0, 6 \ + __asm vpsraw ymm1, ymm1, 6 \ + __asm vpsraw ymm2, ymm2, 6 \ + __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ + __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ + __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ + } + +// Store 16 ARGB values. +#define STOREARGB_AVX2 __asm { \ + /* Step 3: Weave into ARGB */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ + __asm vpermq ymm2, ymm2, 0xd8 \ + __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ + __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ + __asm vmovdqu 0[edx], ymm1 \ + __asm vmovdqu 32[edx], ymm0 \ + __asm lea edx, [edx + 64] \ + } + +#ifdef HAS_I422TOARGBROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) +void I422ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TOARGBROW_AVX2 + +#ifdef HAS_J422TOARGBROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) +void J422ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvJConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_J422TOARGBROW_AVX2 + +#ifdef HAS_I444TOARGBROW_AVX2 +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
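+// YUVTORGB_AVX2 above is a vectorized form of the following scalar
+// fixed-point conversion (a sketch; UB..VR, BB..BR and YG name the
+// kYuvConstants coefficients and biases, and Clamp saturates to 0..255):
+//   int y1 = ((uint32)(y * 0x0101) * YG) >> 16;       // the vpmulhuw step
+//   int b = Clamp((BB - (u * UB + v * VB) + y1) >> 6);
+//   int g = Clamp((BG - (u * UG + v * VG) + y1) >> 6);
+//   int r = Clamp((BR - (u * UR + v * VR) + y1) >> 6);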
+__declspec(naked) +void I444ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV444_AVX2 + YUVTORGB_AVX2(kYuvConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I444TOARGBROW_AVX2 + +#ifdef HAS_I411TOARGBROW_AVX2 +// 16 pixels +// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) +void I411ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV411_AVX2 + YUVTORGB_AVX2(kYuvConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I411TOARGBROW_AVX2 + +#ifdef HAS_NV12TOARGBROW_AVX2 +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) +void NV12ToARGBRow_AVX2(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READNV12_AVX2 + YUVTORGB_AVX2(kYuvConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_NV12TOARGBROW_AVX2 + +#ifdef HAS_NV21TOARGBROW_AVX2 +// 16 pixels. +// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) +void NV21ToARGBRow_AVX2(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READNV12_AVX2 + YUVTORGB_AVX2(kYvuConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_NV21TOARGBROW_AVX2 + +#ifdef HAS_I422TOBGRAROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). +// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 
+__declspec(naked) +void I422ToBGRARow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into BGRA + vpunpcklbw ymm1, ymm1, ymm0 // GB + vpermq ymm1, ymm1, 0xd8 + vpunpcklbw ymm2, ymm5, ymm2 // AR + vpermq ymm2, ymm2, 0xd8 + vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels + vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TOBGRAROW_AVX2 + +#ifdef HAS_I422TORGBAROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). +// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. +__declspec(naked) +void I422ToRGBARow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into RGBA + vpunpcklbw ymm1, ymm1, ymm2 // GR + vpermq ymm1, ymm1, 0xd8 + vpunpcklbw ymm2, ymm5, ymm0 // AB + vpermq ymm2, ymm2, 0xd8 + vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels + vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TORGBAROW_AVX2 + +#ifdef HAS_I422TOABGRROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). +// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. +__declspec(naked) +void I422ToABGRRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvConstants) + + // Step 3: Weave into ABGR + vpunpcklbw ymm1, ymm2, ymm1 // RG + vpermq ymm1, ymm1, 0xd8 + vpunpcklbw ymm2, ymm0, ymm5 // BA + vpermq ymm2, ymm2, 0xd8 + vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels + vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I422TOABGRROW_AVX2 + +#if defined(HAS_I422TOARGBROW_SSSE3) +// TODO(fbarchard): Read that does half size on Y and treats 420 as 444. + +// Read 8 UV from 444. 
+#define READYUV444 __asm { \ + __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ + __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + } + +// Read 4 UV from 422, upsample to 8 UV. +#define READYUV422 __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Read 2 UV from 411, upsample to 8 UV. +#define READYUV411 __asm { \ + __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ + __asm movd xmm0, ebx \ + __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm movd xmm1, ebx \ + __asm lea esi, [esi + 2] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ + } + +// Read 4 UV from NV12, upsample to 8 UV. +#define READNV12 __asm { \ + __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Convert 8 pixels: 8 UV and 8 Y. +#define YUVTORGB(YuvConstants) __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm movdqa xmm3, xmm0 \ + __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \ + __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \ + __asm psubw xmm0, xmm1 \ + __asm movdqa xmm1, YuvConstants.kUVBiasG \ + __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \ + __asm psubw xmm1, xmm2 \ + __asm movdqa xmm2, YuvConstants.kUVBiasR \ + __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \ + __asm psubw xmm2, xmm3 \ + /* Step 2: Find Y contribution to 8 R,G,B values */ \ + __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ + __asm lea eax, [eax + 8] \ + __asm punpcklbw xmm3, xmm3 \ + __asm pmulhuw xmm3, YuvConstants.kYToRgb \ + __asm paddsw xmm0, xmm3 /* B += Y */ \ + __asm paddsw xmm1, xmm3 /* G += Y */ \ + __asm paddsw xmm2, xmm3 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// Store 8 ARGB values. +#define STOREARGB __asm { \ + /* Step 3: Weave into ARGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm movdqu 0[edx], xmm0 \ + __asm movdqu 16[edx], xmm1 \ + __asm lea edx, [edx + 32] \ + } + +// Store 8 BGRA values. +#define STOREBGRA __asm { \ + /* Step 3: Weave into BGRA */ \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ + __asm movdqa xmm0, xmm5 \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ + __asm lea edx, [edx + 32] \ + } + +// Store 8 ABGR values. 
+#define STOREABGR __asm { \ + /* Step 3: Weave into ABGR */ \ + __asm punpcklbw xmm2, xmm1 /* RG */ \ + __asm punpcklbw xmm0, xmm5 /* BA */ \ + __asm movdqa xmm1, xmm2 \ + __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ + __asm movdqu 0[edx], xmm2 \ + __asm movdqu 16[edx], xmm1 \ + __asm lea edx, [edx + 32] \ + } + +// Store 8 RGBA values. +#define STORERGBA __asm { \ + /* Step 3: Weave into RGBA */ \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ + __asm movdqa xmm0, xmm5 \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ + __asm lea edx, [edx + 32] \ + } + +// Store 8 RGB24 values. +#define STORERGB24 __asm { \ + /* Step 3: Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ + /* Step 4: RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24] \ + } + +// Store 8 RAW values. +#define STORERAW __asm { \ + /* Step 3: Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ + /* Step 4: RRGB -> RAW */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24] \ + } + +// Store 8 RGB565 values. +#define STORERGB565 __asm { \ + /* Step 3: Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ + __asm movdqa xmm1, xmm0 \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ + /* Step 4: RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ + __asm packssdw xmm0, xmm1 \ + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm lea edx, [edx + 16] \ + } + +// 8 pixels. +// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). 
+__declspec(naked) +void I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV444 + YUVTORGB(kYuvConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). +__declspec(naked) +void I422ToRGB24Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgb24, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb24 + mov ecx, [esp + 8 + 20] // width + sub edi, esi + movdqa xmm5, kShuffleMaskARGBToRGB24_0 + movdqa xmm6, kShuffleMaskARGBToRGB24 + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STORERGB24 + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). +__declspec(naked) +void I422ToRAWRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_raw, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // raw + mov ecx, [esp + 8 + 20] // width + sub edi, esi + movdqa xmm5, kShuffleMaskARGBToRAW_0 + movdqa xmm6, kShuffleMaskARGBToRAW + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STORERAW + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). +__declspec(naked) +void I422ToRGB565Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb565_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb565 + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate mask 0x0000001f + psrld xmm5, 27 + pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 + psrld xmm6, 26 + pslld xmm6, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 + pslld xmm7, 11 + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STORERGB565 + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels. +// JPeg color space version of I422ToARGB +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
+__declspec(naked) +void J422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV422 + YUVTORGB(kYuvJConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +// Similar to I420 but duplicate UV once more. +__declspec(naked) +void I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ecx, [esp + 12 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV411 // modifies EBX + YUVTORGB(kYuvConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + pop ebx + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) +void NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READNV12 + YUVTORGB(kYuvConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels. +// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). 
+__declspec(naked) +void NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READNV12 + YUVTORGB(kYvuConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +__declspec(naked) +void I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_bgra, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // bgra + mov ecx, [esp + 8 + 20] // width + sub edi, esi + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STOREBGRA + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // abgr + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STOREABGR + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void I422ToRGBARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_rgba, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgba + mov ecx, [esp + 8 + 20] // width + sub edi, esi + + convertloop: + READYUV422 + YUVTORGB(kYuvConstants) + STORERGBA + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +#endif // HAS_I422TOARGBROW_SSSE3 + +#ifdef HAS_I400TOARGBROW_SSE2 +// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). +__declspec(naked) +void I400ToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width) { + __asm { + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + movd xmm2, eax + pshufd xmm2, xmm2,0 + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + movd xmm3, eax + pshufd xmm3, xmm3, 0 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width + + convertloop: + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 // Y.Y + pmulhuw xmm0, xmm2 + psubusw xmm0, xmm3 + psrlw xmm0, 6 + packuswb xmm0, xmm0 // G + + // Step 2: Weave into ARGB + punpcklbw xmm0, xmm0 // GG + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 // BGRA first 4 pixels + punpckhwd xmm1, xmm1 // BGRA next 4 pixels + por xmm0, xmm4 + por xmm1, xmm4 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_I400TOARGBROW_SSE2 + +#ifdef HAS_I400TOARGBROW_AVX2 +// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). +// note: vpunpcklbw mutates and vpackuswb unmutates. 
+__declspec(naked)
+void I400ToARGBRow_AVX2(const uint8* y_buf,
+                        uint8* rgb_buf,
+                        int width) {
+  __asm {
+    mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+    vmovd xmm2, eax
+    vbroadcastss ymm2, xmm2
+    mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+    vmovd xmm3, eax
+    vbroadcastss ymm3, xmm3
+    vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
+    vpslld ymm4, ymm4, 24
+
+    mov eax, [esp + 4] // Y
+    mov edx, [esp + 8] // rgb
+    mov ecx, [esp + 12] // width
+
+  convertloop:
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    vmovdqu xmm0, [eax]
+    lea eax, [eax + 16]
+    vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
+    vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
+    vpmulhuw ymm0, ymm0, ymm2
+    vpsubusw ymm0, ymm0, ymm3
+    vpsrlw ymm0, ymm0, 6
+    vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
+
+    // TODO(fbarchard): Weave alpha with unpack.
+    // Step 2: Weave into ARGB
+    vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
+    vpermq ymm1, ymm1, 0xd8
+    vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
+    vpor ymm0, ymm0, ymm4
+    vpor ymm1, ymm1, ymm4
+    vmovdqu [edx], ymm0
+    vmovdqu [edx + 32], ymm1
+    lea edx, [edx + 64]
+    sub ecx, 16
+    jg convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif // HAS_I400TOARGBROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+// TODO(fbarchard): Replace lea with -16 offset.
+__declspec(naked)
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov eax, [esp + 4] // src
+    mov edx, [esp + 8] // dst
+    mov ecx, [esp + 12] // width
+    movdqa xmm5, kShuffleMirror
+
+  convertloop:
+    movdqu xmm0, [eax - 16 + ecx]
+    pshufb xmm0, xmm5
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
+    sub ecx, 16
+    jg convertloop
+    ret
+  }
+}
+#endif // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+__declspec(naked)
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov eax, [esp + 4] // src
+    mov edx, [esp + 8] // dst
+    mov ecx, [esp + 12] // width
+    vbroadcastf128 ymm5, kShuffleMirror
+
+  convertloop:
+    vmovdqu ymm0, [eax - 32 + ecx]
+    vpshufb ymm0, ymm0, ymm5
+    vpermq ymm0, ymm0, 0x4e // swap high and low halves
+    vmovdqu [edx], ymm0
+    lea edx, [edx + 32]
+    sub ecx, 32
+    jg convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSE2
+__declspec(naked)
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov eax, [esp + 4] // src
+    mov edx, [esp + 8] // dst
+    mov ecx, [esp + 12] // width
+
+  convertloop:
+    movdqu xmm0, [eax - 16 + ecx]
+    movdqa xmm1, xmm0 // swap bytes
+    psllw xmm0, 8
+    psrlw xmm1, 8
+    por xmm0, xmm1
+    pshuflw xmm0, xmm0, 0x1b // swap words
+    pshufhw xmm0, xmm0, 0x1b
+    pshufd xmm0, xmm0, 0x4e // swap qwords
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
+    sub ecx, 16
+    jg convertloop
+    ret
+  }
+}
+#endif // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
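+// (One pshufb with this table both mirrors and de-interleaves: the even
+// source bytes 14,12,...,0 - the U samples, reversed - land in the low 8
+// bytes and the odd V samples in the high 8, so movlpd/movhpd can store the
+// U and V rows separately.)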
+static const uvec8 kShuffleMirrorUV = { + 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u +}; + +__declspec(naked) +void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + movdqa xmm1, kShuffleMirrorUV + lea eax, [eax + ecx * 2 - 16] + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + lea eax, [eax - 16] + pshufb xmm0, xmm1 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [edx + edi], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MIRRORROW_UV_SSSE3 + +#ifdef HAS_ARGBMIRRORROW_SSE2 +__declspec(naked) +void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + lea eax, [eax - 16 + ecx * 4] // last 4 pixels. + + convertloop: + movdqu xmm0, [eax] + lea eax, [eax - 16] + pshufd xmm0, xmm0, 0x1b + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + ret + } +} +#endif // HAS_ARGBMIRRORROW_SSE2 + +#ifdef HAS_ARGBMIRRORROW_AVX2 +// Shuffle table for reversing the bytes. +static const ulvec32 kARGBShuffleMirror_AVX2 = { + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u +}; + +__declspec(naked) +void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + vmovdqu ymm5, kARGBShuffleMirror_AVX2 + + convertloop: + vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBMIRRORROW_AVX2 + +#ifdef HAS_SPLITUVROW_SSE2 +__declspec(naked) +void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm0, xmm5 // even bytes + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm2, 8 // odd bytes + psrlw xmm3, 8 + packuswb xmm2, xmm3 + movdqu [edx], xmm0 + movdqu [edx + edi], xmm2 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +#endif // HAS_SPLITUVROW_SSE2 + +#ifdef HAS_SPLITUVROW_AVX2 +__declspec(naked) +void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm2, ymm0, 8 // odd bytes + vpsrlw ymm3, ymm1, 8 + vpand ymm0, ymm0, ymm5 // even bytes + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpackuswb ymm2, ymm2, ymm3 + vpermq ymm0, ymm0, 0xd8 + vpermq ymm2, ymm2, 0xd8 + vmovdqu [edx], ymm0 + vmovdqu [edx + edi], ymm2 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_SPLITUVROW_AVX2 + +#ifdef HAS_MERGEUVROW_SSE2 +__declspec(naked) +void MergeUVRow_SSE2(const uint8* src_u, 
const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  __asm {
+    push edi
+    mov eax, [esp + 4 + 4] // src_u
+    mov edx, [esp + 4 + 8] // src_v
+    mov edi, [esp + 4 + 12] // dst_uv
+    mov ecx, [esp + 4 + 16] // width
+    sub edx, eax
+
+  convertloop:
+    movdqu xmm0, [eax] // read 16 U's
+    movdqu xmm1, [eax + edx] // and 16 V's
+    lea eax, [eax + 16]
+    movdqa xmm2, xmm0
+    punpcklbw xmm0, xmm1 // first 8 UV pairs
+    punpckhbw xmm2, xmm1 // next 8 UV pairs
+    movdqu [edi], xmm0
+    movdqu [edi + 16], xmm2
+    lea edi, [edi + 32]
+    sub ecx, 16
+    jg convertloop
+
+    pop edi
+    ret
+  }
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+__declspec(naked)
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  __asm {
+    push edi
+    mov eax, [esp + 4 + 4] // src_u
+    mov edx, [esp + 4 + 8] // src_v
+    mov edi, [esp + 4 + 12] // dst_uv
+    mov ecx, [esp + 4 + 16] // width
+    sub edx, eax
+
+  convertloop:
+    vmovdqu ymm0, [eax] // read 32 U's
+    vmovdqu ymm1, [eax + edx] // and 32 V's
+    lea eax, [eax + 32]
+    vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
+    vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
+    vextractf128 [edi], ymm2, 0 // bytes 0..15
+    vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
+    vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
+    vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
+    lea edi, [edi + 64]
+    sub ecx, 32
+    jg convertloop
+
+    pop edi
+    vzeroupper
+    ret
+  }
+}
+#endif // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
+__declspec(naked)
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov eax, [esp + 4] // src
+    mov edx, [esp + 8] // dst
+    mov ecx, [esp + 12] // count
+
+  convertloop:
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    lea eax, [eax + 32]
+    movdqu [edx], xmm0
+    movdqu [edx + 16], xmm1
+    lea edx, [edx + 32]
+    sub ecx, 32
+    jg convertloop
+    ret
+  }
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_AVX
+// CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time.
+__declspec(naked)
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov eax, [esp + 4] // src
+    mov edx, [esp + 8] // dst
+    mov ecx, [esp + 12] // count
+
+  convertloop:
+    vmovdqu ymm0, [eax]
+    vmovdqu ymm1, [eax + 32]
+    lea eax, [eax + 64]
+    vmovdqu [edx], ymm0
+    vmovdqu [edx + 32], ymm1
+    lea edx, [edx + 64]
+    sub ecx, 64
+    jg convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif // HAS_COPYROW_AVX
+
+// Multiple of 1.
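+// (ERMS = Enhanced REP MOVSB/STOSB: on CPUs advertising the feature, a bare
+// rep movsb copies at streaming speed with no alignment or size-granularity
+// requirement, hence any byte count works here.)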
+__declspec(naked) +void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, esi + mov edx, edi + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // count + rep movsb + mov edi, edx + mov esi, eax + ret + } +} + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +__declspec(naked) +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + convertloop: + movdqu xmm2, [eax] + movdqu xmm3, [eax + 16] + lea eax, [eax + 32] + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +__declspec(naked) +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + convertloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + 32] + lea eax, [eax + 64] + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +__declspec(naked) +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + convertloop: + movq xmm2, qword ptr [eax] // 8 Y's + lea eax, [eax + 8] + punpcklbw xmm2, xmm2 + punpckhwd xmm3, xmm2 + punpcklwd xmm2, xmm2 + movdqu xmm4, [edx] + movdqu xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +__declspec(naked) +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + convertloop: + vpmovzxbd ymm1, qword ptr [eax] + vpmovzxbd ymm2, qword ptr [eax + 8] + lea eax, [eax + 16] + vpslld ymm1, ymm1, 24 + vpslld ymm2, ymm2, 24 + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + +#ifdef HAS_SETROW_X86 +// Write 'count' bytes using an 8 bit value repeated. +// Count should be multiple of 4. +__declspec(naked) +void SetRow_X86(uint8* dst, uint8 v8, int count) { + __asm { + movzx eax, byte ptr [esp + 8] // v8 + mov edx, 0x01010101 // Duplicate byte to all bytes. 
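+    // The widening multiply below leaves v8 * 0x01010101 in eax, splatting
+    // the byte to all four lanes (e.g. v8 = 0x5a gives eax = 0x5a5a5a5a),
+    // ready for rep stosd.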
+ mul edx // overwrites edx with upper part of result. + mov edx, edi + mov edi, [esp + 4] // dst + mov ecx, [esp + 12] // count + shr ecx, 2 + rep stosd + mov edi, edx + ret + } +} + +// Write 'count' bytes using an 8 bit value repeated. +__declspec(naked) +void SetRow_ERMS(uint8* dst, uint8 v8, int count) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v8 + mov ecx, [esp + 12] // count + rep stosb + mov edi, edx + ret + } +} + +// Write 'count' 32 bit values. +__declspec(naked) +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // count + rep stosd + mov edi, edx + ret + } +} +#endif // HAS_SETROW_X86 + +#ifdef HAS_YUY2TOYROW_AVX2 +__declspec(naked) +void YUY2ToYRow_AVX2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // even bytes are Y + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} + +__declspec(naked) +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. 
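+    // AVX2 pack instructions operate per 128-bit lane, so the packed bytes
+    // come out lane-interleaved; the vpermq 0xd8 below restores linear order
+    // before the two halves are extracted.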
+ vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} + +__declspec(naked) +void UYVYToYRow_AVX2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // odd bytes are Y + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg convertloop + vzeroupper + ret + } +} + +__declspec(naked) +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} + +__declspec(naked) +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. 
+ vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + vzeroupper + ret + } +} +#endif // HAS_YUY2TOYROW_AVX2 + +#ifdef HAS_YUY2TOYROW_SSE2 +__declspec(naked) +void YUY2ToYRow_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) +void UYVYToYRow_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + 
pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_YUY2TOYROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSE2 +// Blend 8 pixels at a time. +__declspec(naked) +void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 1 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + sub ecx, 4 + jl convertloop4b // less than 4 pixels? + + // 4 pixel loop. + convertloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jge convertloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. + convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + psrlw xmm3, 8 // alpha + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSE2 + +#ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for isolating alpha. 
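+// (Each pshufb index selects a pixel's alpha byte into the low byte of a
+// 16-bit lane; the 0x80 entries zero the high byte, giving 8 alpha words.)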
+static const uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; +// Same as SSE2, but replaces: +// psrlw xmm3, 8 // alpha +// pshufhw xmm3, xmm3, 0F5h // 8 alpha words +// pshuflw xmm3, xmm3, 0F5h +// with.. +// pshufb xmm3, kShuffleAlpha // alpha +// Blend 8 pixels at a time. + +__declspec(naked) +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm7, xmm7 // generate constant 0x0001 + psrlw xmm7, 15 + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + psrlw xmm6, 8 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + sub ecx, 4 + jl convertloop4b // less than 4 pixels? + + // 4 pixel loop. + convertloop4: + movdqu xmm3, [eax] // src argb + lea eax, [eax + 16] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g + lea esi, [esi + 16] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jge convertloop4 + + convertloop4b: + add ecx, 4 - 1 + jl convertloop1b + + // 1 pixel loop. + convertloop1: + movd xmm3, [eax] // src argb + lea eax, [eax + 4] + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g + lea esi, [esi + 4] + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge convertloop1 + + convertloop1b: + pop esi + ret + } +} +#endif // HAS_ARGBBLENDROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_SSE2 +// Attenuate 4 pixels at a time. 
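+// For reference, a plain-C equivalent of the attenuation below; this is an
+// illustrative sketch only (libyuv's actual C fallback is ARGBAttenuateRow_C
+// in row_common.cc; the _RefC name is hypothetical, and the block is kept
+// under #if 0 so it does not affect the build). The byte-duplicating unpack
+// plus pmulhuw/psrlw amounts to ((f * 0x101) * (a * 0x101)) >> 24, a close
+// approximation of f * a / 255 per color channel.
+#if 0
+static void ARGBAttenuateRow_RefC(const uint8* src_argb, uint8* dst_argb,
+                                  int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    uint32 a = src_argb[3];
+    dst_argb[0] = (uint8)(((src_argb[0] * 0x101) * (a * 0x101)) >> 24);
+    dst_argb[1] = (uint8)(((src_argb[1] * 0x101) * (a * 0x101)) >> 24);
+    dst_argb[2] = (uint8)(((src_argb[2] * 0x101) * (a * 0x101)) >> 24);
+    dst_argb[3] = (uint8)a;  // Alpha itself passes through unchanged.
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+#endif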
+__declspec(naked) +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff + psrld xmm5, 8 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm0 // first 2 + pshufhw xmm2, xmm0, 0FFh // 8 alpha words + pshuflw xmm2, xmm2, 0FFh + pmulhuw xmm0, xmm2 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm1 // next 2 pixels + pshufhw xmm2, xmm1, 0FFh // 8 alpha words + pshuflw xmm2, xmm2, 0FFh + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // alphas + lea eax, [eax + 16] + psrlw xmm0, 8 + pand xmm2, xmm4 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + pand xmm0, xmm5 // keep original alphas + por xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATEROW_SSE2 + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +static const uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +__declspec(naked) +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pslld xmm3, 24 + movdqa xmm4, kShuffleAlpha0 + movdqa xmm5, kShuffleAlpha1 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + pshufb xmm0, xmm4 // isolate first 2 alphas + movdqu xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixel rgbs + pmulhuw xmm0, xmm1 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // isolate next 2 alphas + movdqu xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixel rgbs + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // mask original alpha + lea eax, [eax + 16] + pand xmm2, xmm3 + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + por xmm0, xmm2 // copy original alpha + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + ret + } +} +#endif // HAS_ARGBATTENUATEROW_SSSE3 + +#ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha_AVX2 = { + 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u +}; +__declspec(naked) +void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vbroadcastf128 ymm4,kShuffleAlpha_AVX2 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpshufb ymm2, ymm0, ymm4 // low 4 alphas + vpshufb ymm3, ymm1, ymm4 // high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * a + vpmulhuw ymm1, ymm1, ymm3 // rgb * a + vpand ymm6, ymm6, ymm5 // isolate alpha + vpsrlw ymm0, ymm0, 8 + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // unmutated. 
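+ // (Both pack inputs come from in-lane unpacks of the same register, so
+ // per-lane vpackuswb already restores linear order; no vpermq is needed.)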
+ vpor ymm0, ymm0, ymm6 // copy original alpha
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+__declspec(naked)
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb0
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov ecx, [esp + 8 + 12] // width
+
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 3] // first alpha
+ movzx edi, byte ptr [eax + 7] // second alpha
+ punpcklbw xmm0, xmm0 // first 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm0, xmm2 // rgb * ia (inverse alpha)
+
+ movdqu xmm1, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 11] // third alpha
+ movzx edi, byte ptr [eax + 15] // fourth alpha
+ punpckhbw xmm1, xmm1 // next 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm1, xmm2 // rgb * ia
+ lea eax, [eax + 16]
+
+ packuswb xmm0, xmm1
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ sub ecx, 4
+ jg convertloop
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
+// USE_GATHER is not on by default, due to being a slow instruction.
+#ifdef USE_GATHER
+__declspec(naked)
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
+
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
+ vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#else // USE_GATHER
+__declspec(naked)
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
+
+ push esi
+ push edi
+
+ convertloop:
+ // replace VPGATHER
+ movzx esi, byte ptr [eax + 3] // alpha0
+ movzx edi, byte ptr [eax + 7] // alpha1
+ vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
+ vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
+ movzx esi, byte ptr [eax + 11] // alpha2
+ movzx edi, byte ptr [eax + 15] // alpha3
+ vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
+ vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
+ vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
+ movzx esi, byte ptr [eax + 19] // alpha4
+ movzx edi, byte ptr [eax + 23] // alpha5
+ vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
+ vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
+ vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
+ movzx esi, byte ptr [eax + 27] // alpha6
+ movzx edi, byte ptr [eax + 31] // alpha7
+ vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
+ vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
+ vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
+ vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
+ vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
+ vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
+ vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+ // end of VPGATHER
+
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // USE_GATHER
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+__declspec(naked)
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, kARGBToYJ
+ movdqa xmm5, kAddYJ64
+
+ convertloop:
+ movdqu xmm0, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm0, xmm1
+ paddw xmm0, xmm5 // Add .5 for rounding.
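+ // (Assuming the usual libyuv YJ weights, this computes full-range BT.601
+ // luma: (b * 15 + g * 75 + r * 38 + 64) >> 7 per pixel.)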
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 G bytes
+ movdqu xmm2, [eax] // A
+ movdqu xmm3, [eax + 16]
+ lea eax, [eax + 32]
+ psrld xmm2, 24
+ psrld xmm3, 24
+ packuswb xmm2, xmm3
+ packuswb xmm2, xmm2 // 8 A bytes
+ movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
+ punpcklbw xmm0, xmm0 // 8 GG words
+ punpcklbw xmm3, xmm2 // 8 GA words
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm3 // GGGA first 4
+ punpckhwd xmm1, xmm3 // GGGA next 4
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone.
+static const vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static const vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static const vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+__declspec(naked)
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ mov ecx, [esp + 8] /* width */
+ movdqa xmm2, kARGBToSepiaB
+ movdqa xmm3, kARGBToSepiaG
+ movdqa xmm4, kARGBToSepiaR
+
+ convertloop:
+ movdqu xmm0, [eax] // B
+ movdqu xmm6, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm6, xmm2
+ phaddw xmm0, xmm6
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 B values
+ movdqu xmm5, [eax] // G
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
+ movdqu xmm5, [eax] // R
+ movdqu xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 R values
+ movdqu xmm6, [eax] // A
+ movdqu xmm1, [eax + 16]
+ psrld xmm6, 24
+ psrld xmm1, 24
+ packuswb xmm6, xmm1
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm5, xmm6 // 8 RA values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
+ movdqu [eax], xmm0
+ movdqu [eax + 16], xmm1
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
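+// For reference, a plain-C sketch of the per-pixel math; this is
+// illustrative only (libyuv's actual C fallback is ARGBColorMatrixRow_C in
+// row_common.cc; the _RefC name is hypothetical and the block is kept under
+// #if 0 so it does not affect the build). Each output channel is a signed
+// dot product of the four input bytes with one row of matrix_argb, scaled
+// down by 6 bits and clamped to 0..255:
+#if 0
+static void ARGBColorMatrixRow_RefC(const uint8* src_argb, uint8* dst_argb,
+                                    const int8* matrix_argb, int width) {
+  int i, j;
+  for (i = 0; i < width; ++i) {
+    for (j = 0; j < 4; ++j) {  // j selects the B, G, R or A output channel.
+      int v = (src_argb[0] * matrix_argb[j * 4 + 0] +
+               src_argb[1] * matrix_argb[j * 4 + 1] +
+               src_argb[2] * matrix_argb[j * 4 + 2] +
+               src_argb[3] * matrix_argb[j * 4 + 3]) >> 6;
+      dst_argb[j] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
+    }
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+#endif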
+__declspec(naked) +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ + movdqu xmm5, [ecx] + pshufd xmm2, xmm5, 0x00 + pshufd xmm3, xmm5, 0x55 + pshufd xmm4, xmm5, 0xaa + pshufd xmm5, xmm5, 0xff + mov ecx, [esp + 16] /* width */ + + convertloop: + movdqu xmm0, [eax] // B + movdqu xmm7, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm7, xmm2 + movdqu xmm6, [eax] // G + movdqu xmm1, [eax + 16] + pmaddubsw xmm6, xmm3 + pmaddubsw xmm1, xmm3 + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values + movdqu xmm1, [eax] // R + movdqu xmm7, [eax + 16] + pmaddubsw xmm1, xmm4 + pmaddubsw xmm7, xmm4 + phaddsw xmm1, xmm7 // R + movdqu xmm6, [eax] // A + movdqu xmm7, [eax + 16] + pmaddubsw xmm6, xmm5 + pmaddubsw xmm7, xmm5 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm6 + lea eax, [eax + 32] + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBQUANTIZEROW_SSE2 +// Quantize 4 ARGB pixels (16 bytes). +__declspec(naked) +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, + int interval_offset, int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + movd xmm2, [esp + 8] /* scale */ + movd xmm3, [esp + 12] /* interval_size */ + movd xmm4, [esp + 16] /* interval_offset */ + mov ecx, [esp + 20] /* width */ + pshuflw xmm2, xmm2, 040h + pshufd xmm2, xmm2, 044h + pshuflw xmm3, xmm3, 040h + pshufd xmm3, xmm3, 044h + pshuflw xmm4, xmm4, 040h + pshufd xmm4, xmm4, 044h + pxor xmm5, xmm5 // constant 0 + pcmpeqb xmm6, xmm6 // generate mask 0xff000000 + pslld xmm6, 24 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 + movdqu xmm1, [eax] // read 4 pixels + punpckhbw xmm1, xmm5 // next 2 pixels + pmulhuw xmm1, xmm2 + pmullw xmm0, xmm3 // * interval_size + movdqu xmm7, [eax] // read 4 pixels + pmullw xmm1, xmm3 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 + paddw xmm1, xmm4 + packuswb xmm0, xmm1 + por xmm0, xmm7 + movdqu [eax], xmm0 + lea eax, [eax + 16] + sub ecx, 4 + jg convertloop + ret + } +} +#endif // HAS_ARGBQUANTIZEROW_SSE2 + +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. 
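+// (ARGBShadeRow below, like ARGBMultiplyRow further down, relies on the
+// byte-repeat trick: dst = ((s * 0x101) * (v * 0x101)) >> 24, which closely
+// approximates s * v / 255 for each channel.)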
+__declspec(naked) +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + movd xmm2, [esp + 16] // value + punpcklbw xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + ret + } +} +#endif // HAS_ARGBSHADEROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_SSE2 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. +__declspec(naked) +void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + pxor xmm5, xmm5 // constant 0 + + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm2, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, xmm0 + movdqu xmm3, xmm2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + lea eax, [eax + 16] + lea esi, [esi + 16] + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_SSE2 + +#ifdef HAS_ARGBADDROW_SSE2 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. +// TODO(fbarchard): Port this to posix, neon and other math functions. +__declspec(naked) +void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + sub ecx, 4 + jl convertloop49 + + convertloop4: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jge convertloop4 + + convertloop49: + add ecx, 4 - 1 + jl convertloop19 + + convertloop1: + movd xmm0, [eax] // read 1 pixels from src_argb0 + lea eax, [eax + 4] + movd xmm1, [esi] // read 1 pixels from src_argb1 + lea esi, [esi + 4] + paddusb xmm0, xmm1 // src_argb0 + src_argb1 + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge convertloop1 + + convertloop19: + pop esi + ret + } +} +#endif // HAS_ARGBADDROW_SSE2 + +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +// Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
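+// (psubusb saturates each byte at 0, so per-channel differences never wrap;
+// alpha is subtracted and saturated the same way.)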
+__declspec(naked) +void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] + psubusb xmm0, xmm1 // src_argb0 - src_argb1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_SSE2 + +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) +void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + vpxor ymm5, ymm5, ymm5 // constant 0 + + convertloop: + vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + lea esi, [esi + 32] + vpunpcklbw ymm0, ymm1, ymm1 // low 4 + vpunpckhbw ymm1, ymm1, ymm1 // high 4 + vpunpcklbw ymm2, ymm3, ymm5 // low 4 + vpunpckhbw ymm3, ymm3, ymm5 // high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 + vpackuswb ymm0, ymm0, ymm1 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBMULTIPLYROW_AVX2 + +#ifdef HAS_ARGBADDROW_AVX2 +// Add 2 rows of ARGB pixels together, 8 pixels at a time. +__declspec(naked) +void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBADDROW_AVX2 + +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +// Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
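+// (Same math as the SSE2 version, 8 pixels per iteration, with the second
+// source row used directly as a memory operand of vpsubusb.)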
+__declspec(naked) +void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + + convertloop: + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + lea eax, [eax + 32] + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_ARGBSUBTRACTROW_AVX2 + +#ifdef HAS_SOBELXROW_SSE2 +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +__declspec(naked) +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 + mov edi, [esp + 8 + 12] // src_y2 + mov edx, [esp + 8 + 16] // dst_sobelx + mov ecx, [esp + 8 + 20] // width + sub esi, eax + sub edi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_SOBELXROW_SSE2 + +#ifdef HAS_SOBELYROW_SSE2 +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +__declspec(naked) +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 + mov edx, [esp + 4 + 12] // dst_sobely + mov ecx, [esp + 4 + 16] // width + sub esi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELYROW_SSE2 + +#ifdef HAS_SOBELROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. 
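+// (Saturating byte add: s = min(sobelx + sobely, 255), splatted to B, G, R:)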
+// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +__declspec(naked) +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + pslld xmm5, 24 // 0xff000000 + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqa xmm2, xmm0 // GG + punpcklbw xmm2, xmm0 // First 8 + punpckhbw xmm0, xmm0 // Next 8 + movdqa xmm1, xmm2 // GGGG + punpcklwd xmm1, xmm2 // First 4 + punpckhwd xmm2, xmm2 // Next 4 + por xmm1, xmm5 // GGGA + por xmm2, xmm5 + movdqa xmm3, xmm0 // GGGG + punpcklwd xmm3, xmm0 // Next 4 + punpckhwd xmm0, xmm0 // Last 4 + por xmm3, xmm5 // GGGA + por xmm0, xmm5 + movdqu [edx], xmm1 + movdqu [edx + 16], xmm2 + movdqu [edx + 32], xmm3 + movdqu [edx + 48], xmm0 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELROW_SSE2 + +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. +__declspec(naked) +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + +#ifdef HAS_SOBELXYROW_SSE2 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +__declspec(naked) +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + pcmpeqb xmm5, xmm5 // alpha 255 + + convertloop: + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + movdqa xmm2, xmm0 + paddusb xmm2, xmm1 // sobel = sobelx + sobely + movdqa xmm3, xmm0 // XA + punpcklbw xmm3, xmm5 + punpckhbw xmm0, xmm5 + movdqa xmm4, xmm1 // YS + punpcklbw xmm4, xmm2 + punpckhbw xmm1, xmm2 + movdqa xmm6, xmm4 // YSXA + punpcklwd xmm6, xmm3 // First 4 + punpckhwd xmm4, xmm3 // Next 4 + movdqa xmm7, xmm1 // YSXA + punpcklwd xmm7, xmm0 // Next 4 + punpckhwd xmm1, xmm0 // Last 4 + movdqu [edx], xmm6 + movdqu [edx + 16], xmm4 + movdqu [edx + 32], xmm7 + movdqu [edx + 48], xmm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELXYROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +// Consider float CumulativeSum. +// Consider calling CumulativeSum one row at time as needed. +// Consider circular CumulativeSum buffer of radius * 2 + 1 height. +// Convert cumulative sum for an area to an average for 1 pixel. +// topleft is pointer to top left of CumulativeSum buffer for area. 
+// botleft is pointer to bottom left of CumulativeSum buffer. +// width is offset from left to right of area in CumulativeSum buffer measured +// in number of ints. +// area is the number of pixels in the area being averaged. +// dst points to pixel to store result to. +// count is number of averaged pixels to produce. +// Does 4 pixels at a time. +void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, + int count) { + __asm { + mov eax, topleft // eax topleft + mov esi, botleft // esi botleft + mov edx, width + movd xmm5, area + mov edi, dst + mov ecx, count + cvtdq2ps xmm5, xmm5 + rcpss xmm4, xmm5 // 1.0f / area + pshufd xmm4, xmm4, 0 + sub ecx, 4 + jl l4b + + cmp area, 128 // 128 pixels will not overflow 15 bits. + ja l4 + + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + psrld xmm6, 16 + cvtdq2ps xmm6, xmm6 + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts + + // 4 pixel loop small blocks. + s4: + // top left + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + packssdw xmm0, xmm1 // pack 4 pixels into 2 registers + packssdw xmm2, xmm3 + + pmulhuw xmm0, xmm5 + pmulhuw xmm2, xmm5 + + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge s4 + + jmp l4b + + // 4 pixel loop + l4: + // top left + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm1, xmm1 + mulps xmm0, xmm4 + mulps xmm1, xmm4 + cvtdq2ps xmm2, xmm2 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + cvtps2dq xmm0, xmm0 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + packssdw xmm0, xmm1 + packssdw xmm2, xmm3 + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + movdqu xmm0, [eax] + psubd xmm0, [eax + edx * 4] + lea eax, [eax + 16] + psubd xmm0, [esi] + paddd xmm0, [esi + edx * 4] + lea esi, [esi + 16] + cvtdq2ps xmm0, xmm0 + mulps xmm0, xmm4 + cvtps2dq xmm0, xmm0 + packssdw xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 + lea edi, [edi + 4] + sub ecx, 1 + jge l1 + l1b: + } +} +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above 
and to the left of the value. +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + const int32* previous_cumsum, int width) { + __asm { + mov eax, row + mov edx, cumsum + mov esi, previous_cumsum + mov ecx, width + pxor xmm0, xmm0 + pxor xmm1, xmm1 + + sub ecx, 4 + jl l4b + test edx, 15 + jne l4b + + // 4 pixel loop + l4: + movdqu xmm2, [eax] // 4 argb pixels 16 bytes. + lea eax, [eax + 16] + movdqa xmm4, xmm2 + + punpcklbw xmm2, xmm1 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm1 + punpckhwd xmm3, xmm1 + + punpckhbw xmm4, xmm1 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + + paddd xmm0, xmm2 + movdqu xmm2, [esi] // previous row above. + paddd xmm2, xmm0 + + paddd xmm0, xmm3 + movdqu xmm3, [esi + 16] + paddd xmm3, xmm0 + + paddd xmm0, xmm4 + movdqu xmm4, [esi + 32] + paddd xmm4, xmm0 + + paddd xmm0, xmm5 + movdqu xmm5, [esi + 48] + lea esi, [esi + 64] + paddd xmm5, xmm0 + + movdqu [edx], xmm2 + movdqu [edx + 16], xmm3 + movdqu [edx + 32], xmm4 + movdqu [edx + 48], xmm5 + + lea edx, [edx + 64] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. + lea eax, [eax + 4] + punpcklbw xmm2, xmm1 + punpcklwd xmm2, xmm1 + paddd xmm0, xmm2 + movdqu xmm2, [esi] + lea esi, [esi + 16] + paddd xmm2, xmm0 + movdqu [edx], xmm2 + lea edx, [edx + 16] + sub ecx, 1 + jge l1 + + l1b: + } +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_ARGBAFFINEROW_SSE2 +// Copy ARGB pixels from source image with slope to a row of destination. +__declspec(naked) +LIBYUV_API +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, + uint8* dst_argb, const float* uv_dudv, int width) { + __asm { + push esi + push edi + mov eax, [esp + 12] // src_argb + mov esi, [esp + 16] // stride + mov edx, [esp + 20] // dst_argb + mov ecx, [esp + 24] // pointer to uv_dudv + movq xmm2, qword ptr [ecx] // uv + movq xmm7, qword ptr [ecx + 8] // dudv + mov ecx, [esp + 28] // width + shl esi, 16 // 4, stride + add esi, 4 + movd xmm5, esi + sub ecx, 4 + jl l4b + + // setup for 4 pixel loop + pshufd xmm7, xmm7, 0x44 // dup dudv + pshufd xmm5, xmm5, 0 // dup 4, stride + movdqa xmm0, xmm2 // x0, y0, x1, y1 + addps xmm0, xmm7 + movlhps xmm2, xmm0 + movdqa xmm4, xmm7 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm3, xmm4 + addps xmm4, xmm4 // dudv *= 4 + + // 4 pixel loop + l4: + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
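+ // Extract the four packed offsets one at a time (movd plus pshufd 0x39
+ // rotates); effectively a manual gather of the four source pixels.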
+ movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd xmm1, [eax + esi] // read pixel 0 + movd xmm6, [eax + edi] // read pixel 1 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + addps xmm2, xmm4 // x, y += dx, dy first 2 + movq qword ptr [edx], xmm1 + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd edi, xmm0 + movd xmm6, [eax + esi] // read pixel 2 + movd xmm0, [eax + edi] // read pixel 3 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + addps xmm3, xmm4 // x, y += dx, dy next 2 + movq qword ptr 8[edx], xmm6 + lea edx, [edx + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + l1: + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy + movd esi, xmm0 + movd xmm0, [eax + esi] // copy a pixel + movd [edx], xmm0 + lea edx, [edx + 4] + sub ecx, 1 + jge l1 + l1b: + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBAFFINEROW_SSE2 + +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 32x2 -> 32x1 +__declspec(naked) +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + shr eax, 1 + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 128. Blend 100 / 0. + sub edi, esi + cmp eax, 32 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. + cmp eax, 64 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. + cmp eax, 96 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. + + vmovd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + vmovd xmm5, eax // low fraction 128..1 + vpunpcklbw xmm5, xmm5, xmm0 + vpunpcklwd xmm5, xmm5, xmm5 + vpxor ymm0, ymm0, ymm0 + vpermd ymm5, ymm0, ymm5 + + xloop: + vmovdqu ymm0, [esi] + vmovdqu ymm2, [esi + edx] + vpunpckhbw ymm1, ymm0, ymm2 // mutates + vpunpcklbw ymm0, ymm0, ymm2 // mutates + vpmaddubsw ymm0, ymm0, ymm5 + vpmaddubsw ymm1, ymm1, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm1, ymm1, 7 + vpackuswb ymm0, ymm0, ymm1 // unmutates + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop + jmp xloop99 + + // Blend 25 / 75. + xloop25: + vmovdqu ymm0, [esi] + vmovdqu ymm1, [esi + edx] + vpavgb ymm0, ymm0, ymm1 + vpavgb ymm0, ymm0, ymm1 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + xloop50: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + xloop75: + vmovdqu ymm1, [esi] + vmovdqu ymm0, [esi + edx] + vpavgb ymm0, ymm0, ymm1 + vpavgb ymm0, ymm0, ymm1 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + sub ecx, 32 + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. 
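+ // (This path branches out before the "sub edi, esi" above, so esi/edi
+ // still hold the real source/destination pointers and ecx the byte count
+ // that rep movsb consumes.)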
+ xloop100: + rep movsb + + xloop99: + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_INTERPOLATEROW_AVX2 + +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + shr eax, 1 + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 128. Blend 100 / 0. + cmp eax, 32 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. + cmp eax, 64 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. + cmp eax, 96 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. + + movd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + movd xmm5, eax // low fraction 128..1 + punpcklbw xmm5, xmm0 + punpcklwd xmm5, xmm5 + pshufd xmm5, xmm5, 0 + + xloop: + movdqu xmm0, [esi] + movdqu xmm2, [esi + edx] + movdqu xmm1, xmm0 + punpcklbw xmm0, xmm2 + punpckhbw xmm1, xmm2 + pmaddubsw xmm0, xmm5 + pmaddubsw xmm1, xmm5 + psrlw xmm0, 7 + psrlw xmm1, 7 + packuswb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop + jmp xloop99 + + // Blend 25 / 75. + xloop25: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + xloop50: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + xloop75: + movdqu xmm1, [esi] + movdqu xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + xloop100: + movdqu xmm0, [esi] + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} + +#ifdef HAS_INTERPOLATEROW_SSE2 +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + sub edi, esi + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 256. Blend 100 / 0. + cmp eax, 64 + je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. + cmp eax, 128 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + cmp eax, 192 + je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 
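+ // General case: dst = row0 + (row1 - row0) * source_y_fraction / 256,
+ // computed in 16-bit lanes: the fraction is expanded to (f << 7) words and
+ // applied to the doubled 9-bit row differences via pmulhw.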
+ + movd xmm5, eax // xmm5 = y fraction + punpcklbw xmm5, xmm5 + psrlw xmm5, 1 + punpcklwd xmm5, xmm5 + punpckldq xmm5, xmm5 + punpcklqdq xmm5, xmm5 + pxor xmm4, xmm4 + + xloop: + movdqu xmm0, [esi] // row0 + movdqu xmm2, [esi + edx] // row1 + movdqu xmm1, xmm0 + movdqu xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + psubw xmm2, xmm0 // row1 - row0 + psubw xmm3, xmm1 + paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 + paddw xmm3, xmm3 + pmulhw xmm2, xmm5 // scale diff + pmulhw xmm3, xmm5 + paddw xmm0, xmm2 // sum rows + paddw xmm1, xmm3 + packuswb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop + jmp xloop99 + + // Blend 25 / 75. + xloop25: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + xloop50: + movdqu xmm0, [esi] + movdqu xmm1, [esi + edx] + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + xloop75: + movdqu xmm1, [esi] + movdqu xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + xloop100: + movdqu xmm0, [esi] + movdqu [esi + edi], xmm0 + lea esi, [esi + 16] + sub ecx, 16 + jg xloop100 + + xloop99: + pop edi + pop esi + ret + } +} +#endif // HAS_INTERPOLATEROW_SSE2 + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +__declspec(naked) +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + movdqu xmm5, [ecx] + mov ecx, [esp + 16] // pix + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pshufb xmm0, xmm5 + pshufb xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg wloop + ret + } +} + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +__declspec(naked) +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. + mov ecx, [esp + 16] // pix + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpshufb ymm0, ymm0, ymm5 + vpshufb ymm1, ymm1, ymm5 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg wloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +__declspec(naked) +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + push ebx + push esi + mov eax, [esp + 8 + 4] // src_argb + mov edx, [esp + 8 + 8] // dst_argb + mov esi, [esp + 8 + 12] // shuffler + mov ecx, [esp + 8 + 16] // pix + pxor xmm5, xmm5 + + mov ebx, [esi] // shuffler + cmp ebx, 0x03000102 + je shuf_3012 + cmp ebx, 0x00010203 + je shuf_0123 + cmp ebx, 0x00030201 + je shuf_0321 + cmp ebx, 0x02010003 + je shuf_2103 + + // TODO(fbarchard): Use one source pointer and 3 offsets. 
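+ // Generic fallback: translate one pixel at a time, each output byte taken
+ // from the source pixel at the index given by the 4-byte shuffler.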
+ shuf_any1: + movzx ebx, byte ptr [esi] + movzx ebx, byte ptr [eax + ebx] + mov [edx], bl + movzx ebx, byte ptr [esi + 1] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 1], bl + movzx ebx, byte ptr [esi + 2] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 2], bl + movzx ebx, byte ptr [esi + 3] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 3], bl + lea eax, [eax + 4] + lea edx, [edx + 4] + sub ecx, 1 + jg shuf_any1 + jmp shuf99 + + shuf_0123: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB + pshuflw xmm0, xmm0, 01Bh + pshufhw xmm1, xmm1, 01Bh + pshuflw xmm1, xmm1, 01Bh + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg shuf_0123 + jmp shuf99 + + shuf_0321: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB + pshuflw xmm0, xmm0, 039h + pshufhw xmm1, xmm1, 039h + pshuflw xmm1, xmm1, 039h + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg shuf_0321 + jmp shuf99 + + shuf_2103: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA + pshuflw xmm0, xmm0, 093h + pshufhw xmm1, xmm1, 093h + pshuflw xmm1, xmm1, 093h + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg shuf_2103 + jmp shuf99 + + shuf_3012: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB + pshuflw xmm0, xmm0, 0C6h + pshufhw xmm1, xmm1, 0C6h + pshuflw xmm1, xmm1, 0C6h + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg shuf_3012 + + shuf99: + pop esi + pop ebx + ret + } +} + +// YUY2 - Macro-pixel = 2 image pixels +// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 
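+// (Each 4-byte macro-pixel carries two luma samples plus one shared U/V
+// pair, i.e. 4:2:2 subsampling.)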
+ +// UYVY - Macro-pixel = 2 image pixels +// U0Y0V0Y1 + +__declspec(naked) +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 // YUYV + punpckhbw xmm1, xmm2 + movdqu [edi], xmm0 + movdqu [edi + 16], xmm1 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y + movdqa xmm1, xmm2 + lea eax, [eax + 16] + punpcklbw xmm1, xmm0 // UYVY + punpckhbw xmm2, xmm0 + movdqu [edi], xmm1 + movdqu [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +__declspec(naked) +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ + pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. + + // 2 pixel loop. 
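+ // Evaluates dst = clamp(C0 + C1*x + C2*x^2 + C3*x^3) per channel in float,
+ // then saturates back to bytes; the four coefficient vectors live at
+ // poly[0], poly[4], poly[8] and poly[12].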
+ convertloop:
+// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
+ movq xmm0, qword ptr [eax] // BGRABGRA
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm3
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3 // pixel 0
+ punpckhwd xmm4, xmm3 // pixel 1
+ cvtdq2ps xmm0, xmm0 // 4 floats
+ cvtdq2ps xmm4, xmm4
+ movdqa xmm1, xmm0 // X
+ movdqa xmm5, xmm4
+ mulps xmm0, [esi + 16] // C1 * X
+ mulps xmm4, [esi + 16]
+ addps xmm0, [esi] // result = C0 + C1 * X
+ addps xmm4, [esi]
+ movdqa xmm2, xmm1
+ movdqa xmm6, xmm5
+ mulps xmm2, xmm1 // X * X
+ mulps xmm6, xmm5
+ mulps xmm1, xmm2 // X * X * X
+ mulps xmm5, xmm6
+ mulps xmm2, [esi + 32] // C2 * X * X
+ mulps xmm6, [esi + 32]
+ mulps xmm1, [esi + 48] // C3 * X * X * X
+ mulps xmm5, [esi + 48]
+ addps xmm0, xmm2 // result += C2 * X * X
+ addps xmm4, xmm6
+ addps xmm0, xmm1 // result += C3 * X * X * X
+ addps xmm4, xmm5
+ cvttps2dq xmm0, xmm0
+ cvttps2dq xmm4, xmm4
+ packuswb xmm0, xmm4
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 2
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+__declspec(naked)
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* poly */
+ vbroadcastf128 ymm4, [ecx] // C0
+ vbroadcastf128 ymm5, [ecx + 16] // C1
+ vbroadcastf128 ymm6, [ecx + 32] // C2
+ vbroadcastf128 ymm7, [ecx + 48] // C3
+ mov ecx, [esp + 16] /* width */
+
+ // 2 pixel loop.
+ convertloop:
+ vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
+ lea eax, [eax + 8]
+ vcvtdq2ps ymm0, ymm0 // X 8 floats
+ vmulps ymm2, ymm0, ymm0 // X * X
+ vmulps ymm3, ymm0, ymm7 // C3 * X
+ vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
+ vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
+ vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
+ vcvttps2dq ymm0, ymm0
+ vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
+ vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
+ vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
+ vmovq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ sub ecx, 2
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+__declspec(naked)
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ movzx edx, byte ptr [eax - 4 + 3]
+ movzx edx, byte ptr [esi + edx * 4 + 3]
+ mov byte ptr [eax - 4 + 3], dl
+ dec ecx
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+__declspec(naked)
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ dec ecx
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
+__declspec(naked)
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] /* src_argb */
+ mov edi, [esp + 8 + 8] /* dst_argb */
+ mov ecx, [esp + 8 + 12] /* width */
+ movd xmm2, dword ptr [esp + 8 + 16] // luma table
+ movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
+ pshufd xmm2, xmm2, 0
+ pshufd xmm3, xmm3, 0
+ pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
+ psllw xmm4, 8
+ pxor xmm5, xmm5
+
+ // 4 pixel loop.
+ convertloop:
+ movdqu xmm0, qword ptr [eax] // generate luma ptr
+ pmaddubsw xmm0, xmm3
+ phaddw xmm0, xmm0
+ pand xmm0, xmm4 // mask out low bits
+ punpcklwd xmm0, xmm5
+ paddd xmm0, xmm2 // add table base
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi], dl
+ movzx edx, byte ptr [eax + 1]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 1], dl
+ movzx edx, byte ptr [eax + 2]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 2], dl
+ movzx edx, byte ptr [eax + 3] // copy alpha.
+ mov byte ptr [edi + 3], dl
+
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 4]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 4], dl
+ movzx edx, byte ptr [eax + 5]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 5], dl
+ movzx edx, byte ptr [eax + 6]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 6], dl
+ movzx edx, byte ptr [eax + 7] // copy alpha.
+ mov byte ptr [edi + 7], dl
+
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 8]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 8], dl
+ movzx edx, byte ptr [eax + 9]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 9], dl
+ movzx edx, byte ptr [eax + 10]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 10], dl
+ movzx edx, byte ptr [eax + 11] // copy alpha.
+ mov byte ptr [edi + 11], dl
+
+ movd esi, xmm0
+
+ movzx edx, byte ptr [eax + 12]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 12], dl
+ movzx edx, byte ptr [eax + 13]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 13], dl
+ movzx edx, byte ptr [eax + 14]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 14], dl
+ movzx edx, byte ptr [eax + 15] // copy alpha.
+ mov byte ptr [edi + 15], dl + + lea eax, [eax + 16] + lea edi, [edi + 16] + sub ecx, 4 + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + +#endif // defined(_M_X64) +#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/row_x86.asm b/libs/libaom/src/third_party/libyuv/source/row_x86.asm new file mode 100644 index 000000000..0cb326f8e --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/row_x86.asm @@ -0,0 +1,146 @@ +; +; Copyright 2012 The LibYuv Project Authors. All rights reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%ifdef __YASM_VERSION_ID__ +%if __YASM_VERSION_ID__ < 01020000h +%error AVX2 is supported only by yasm 1.2.0 or later. +%endif +%endif +%include "x86inc.asm" + +SECTION .text + +; cglobal numeric constants are parameters, gpr regs, mm regs + +; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) + +%macro YUY2TOYROW 2-3 +cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix +%ifidn %1,YUY2 + pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff + psrlw m2, m2, 8 +%endif + + ALIGN 4 +.convertloop: + mov%2 m0, [src_yuy2q] + mov%2 m1, [src_yuy2q + mmsize] + lea src_yuy2q, [src_yuy2q + mmsize * 2] +%ifidn %1,YUY2 + pand m0, m0, m2 ; YUY2 even bytes are Y + pand m1, m1, m2 +%else + psrlw m0, m0, 8 ; UYVY odd bytes are Y + psrlw m1, m1, 8 +%endif + packuswb m0, m0, m1 +%if cpuflag(AVX2) + vpermq m0, m0, 0xd8 +%endif + sub pixd, mmsize + mov%2 [dst_yq], m0 + lea dst_yq, [dst_yq + mmsize] + jg .convertloop + REP_RET +%endmacro + +; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version. 
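The YUY2TOYROW macro above relies on the byte layout of the two packed 4:2:2 formats: YUY2 stores luma in the even bytes (Y0 U Y1 V) and UYVY in the odd bytes (U Y0 V Y1), which is why the loop either masks with 0x00ff or shifts right by 8 before packing. A scalar C sketch of the same extraction for comparison (hypothetical helper name; width is the number of Y samples):

/* Extract luma from YUY2 by keeping every second byte, starting at 0. */
static void YUY2ToYRow_Sketch(const unsigned char* src_yuy2,
                              unsigned char* dst_y, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[x * 2];  /* UYVY would read src_yuy2[x * 2 + 1] */
  }
}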
+INIT_MMX MMX
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_XMM SSE2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_YMM AVX2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW UYVY,a,
+
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
+
+%macro SplitUVRow 1-2
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
+ pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
+ psrlw m4, m4, 8
+ sub dst_vq, dst_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uvq]
+ mov%1 m1, [src_uvq + mmsize]
+ lea src_uvq, [src_uvq + mmsize * 2]
+ psrlw m2, m0, 8 ; odd bytes
+ psrlw m3, m1, 8
+ pand m0, m0, m4 ; even bytes
+ pand m1, m1, m4
+ packuswb m0, m0, m1
+ packuswb m2, m2, m3
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+ vpermq m2, m2, 0xd8
+%endif
+ mov%1 [dst_uq], m0
+ mov%1 [dst_uq + dst_vq], m2
+ lea dst_uq, [dst_uq + mmsize]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_XMM SSE2
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_YMM AVX2
+SplitUVRow a,
+
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+; int width);
+
+%macro MergeUVRow_ 1-2
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+ sub src_vq, src_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uq]
+ mov%1 m1, [src_vq]
+ lea src_uq, [src_uq + mmsize]
+ punpcklbw m2, m0, m1 ; first 8 UV pairs
+ punpckhbw m0, m0, m1 ; next 8 UV pairs
+%if cpuflag(AVX2)
+ vperm2i128 m1, m2, m0, 0x20 ; low 128 of ymm2 and low 128 of ymm0
+ vperm2i128 m2, m2, m0, 0x31 ; high 128 of ymm2 and high 128 of ymm0
+ mov%1 [dst_uvq], m1
+ mov%1 [dst_uvq + mmsize], m2
+%else
+ mov%1 [dst_uvq], m2
+ mov%1 [dst_uvq + mmsize], m0
+%endif
+ lea dst_uvq, [dst_uvq + mmsize * 2]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_XMM SSE2
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_YMM AVX2
+MergeUVRow_ a,
+
diff --git a/libs/libaom/src/third_party/libyuv/source/scale.cc b/libs/libaom/src/third_party/libyuv/source/scale.cc
new file mode 100644
index 000000000..0a01304c4
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/scale.cc
@@ -0,0 +1,1689 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyPlane
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+
+// Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
+// its original size.
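When box filtering is requested, the row functions selected by the dispatcher below each average a 2x2 block of source pixels per output pixel. A scalar sketch of what the ScaleRowDown2Box_C fallback computes, simplified for illustration (rounds to nearest by adding 2 before the shift):

#include <stddef.h>  /* for ptrdiff_t */

/* Average each 2x2 source block into one destination pixel. */
static void ScaleRowDown2Box_Sketch(const unsigned char* src_ptr,
                                    ptrdiff_t src_stride,
                                    unsigned char* dst_ptr, int dst_width) {
  const unsigned char* s = src_ptr;               /* top row */
  const unsigned char* t = src_ptr + src_stride;  /* bottom row */
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (unsigned char)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}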
+ +static void ScalePlaneDown2(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = + filtering == kFilterNone ? ScaleRowDown2_C : + (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C); + int row_stride = src_stride << 1; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON : + ScaleRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON : + (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON : + ScaleRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 : + ScaleRowDown2Box_Any_SSE2); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : + ScaleRowDown2Box_SSE2); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 : + ScaleRowDown2Box_Any_AVX2); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 : + ScaleRowDown2Box_AVX2); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown2 = filtering ? + ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown2_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) = + filtering == kFilterNone ? ScaleRowDown2_16_C : + (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C : + ScaleRowDown2Box_16_C); + int row_stride = src_stride << 1; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + +#if defined(HAS_SCALEROWDOWN2_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON : + ScaleRowDown2_16_NEON; + } +#endif +#if defined(HAS_SCALEROWDOWN2_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : + (filtering == kFilterLinear ? 
ScaleRowDown2Linear_16_SSE2 : + ScaleRowDown2Box_16_SSE2); + } +#endif +#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown2 = filtering ? + ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + // TODO(fbarchard): Loop through source height to allow odd height. + for (y = 0; y < dst_height; ++y) { + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane, 1/4 +// This is an optimized version for scaling down a plane to 1/4 of +// its original size. + +static void ScalePlaneDown4(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; + int row_stride = src_stride << 2; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +static void ScalePlaneDown4_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) = + filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; + int row_stride = src_stride << 2; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } +#if defined(HAS_SCALEROWDOWN4_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON : + ScaleRowDown4_16_NEON; + } +#endif +#if defined(HAS_SCALEROWDOWN4_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? 
ScaleRowDown4Box_16_SSE2 : + ScaleRowDown4_16_SSE2; + } +#endif +#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2; + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (y = 0; y < dst_height; ++y) { + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); + src_ptr += row_stride; + dst_ptr += dst_stride; + } +} + +// Scale plane down, 3/4 + +static void ScalePlaneDown34(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_C; + ScaleRowDown34_1 = ScaleRowDown34_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; + } +#if defined(HAS_SCALEROWDOWN34_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_Any_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, + dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + 
src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown34_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_C; + ScaleRowDown34_1 = ScaleRowDown34_16_C; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN34_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_16_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, + dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride; + dst_ptr += dst_stride; + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); + } +} + + +// Scale plane, 3/8 +// This is an optimized version for scaling down a plane to 3/8 +// of its original size. 
+// +// Uses box filter arranges like this +// aaabbbcc -> abc +// aaabbbcc def +// aaabbbcc ghi +// dddeeeff +// dddeeeff +// dddeeeff +// ggghhhii +// ggghhhii +// Boxes are 3x3, 2x3, 3x2 and 2x2 + +static void ScalePlaneDown38(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + int y; + void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_C; + ScaleRowDown38_2 = ScaleRowDown38_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; + } + +#if defined(HAS_SCALEROWDOWN38_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_Any_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; + } + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; + } + if (dst_width % 12 == 0 && !filtering) { + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; + } + if (dst_width % 6 == 0 && filtering) { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +static void ScalePlaneDown38_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int y; + void 
(*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width); + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + assert(dst_width % 3 == 0); + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_C; + ScaleRowDown38_2 = ScaleRowDown38_16_C; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C; + } +#if defined(HAS_SCALEROWDOWN38_16_NEON) + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_16_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON; + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; + } + } +#endif +#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2; + } + } +#endif + + for (y = 0; y < dst_height - 2; y += 3) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 2; + dst_ptr += dst_stride; + } + + // Remainder 1 or 2 rows with last row vertically unfiltered + if ((dst_height % 3) == 2) { + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); + src_ptr += src_stride * 3; + dst_ptr += dst_stride; + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } else if ((dst_height % 3) == 1) { + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); + } +} + +#define MIN1(x) ((x) < 1 ? 
1 : (x)) + +static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { + uint32 sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) { + uint32 sum = 0u; + int x; + assert(iboxwidth > 0); + for (x = 0; x < iboxwidth; ++x) { + sum += src_ptr[x]; + } + return sum; +} + +static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = dx >> 16; + int* scaleptr = scaletbl - minboxwidth; + int boxwidth; + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = MIN1((x >> 16) - ix); + *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + } +} + +static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, + const uint32* src_ptr, uint16* dst_ptr) { + int i; + int scaletbl[2]; + int minboxwidth = dx >> 16; + int* scaleptr = scaletbl - minboxwidth; + int boxwidth; + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); + for (i = 0; i < dst_width; ++i) { + int ix = x >> 16; + x += dx; + boxwidth = MIN1((x >> 16) - ix); + *dst_ptr++ = + SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + } +} + +static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int, + const uint16* src_ptr, uint8* dst_ptr) { + int scaleval = 65536 / boxheight; + int i; + src_ptr += (x >> 16); + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = src_ptr[i] * scaleval >> 16; + } +} + +static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, + const uint16* src_ptr, uint8* dst_ptr) { + int boxwidth = MIN1(dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + x >>= 16; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, + const uint32* src_ptr, uint16* dst_ptr) { + int boxwidth = MIN1(dx >> 16); + int scaleval = 65536 / (boxwidth * boxheight); + int i; + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16; + x += boxwidth; + } +} + +// Scale plane down to any dimensions, with interpolation. +// (boxfilter). +// +// Same method as SimpleScale, which is fixed point, outputting +// one pixel of destination using fixed point (16.16) to step +// through source, sampling a box of pixel with simple +// averaging. +static void ScalePlaneBox(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int j, k; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + { + // Allocate a row buffer of uint16. + align_buffer_64(row16, src_width * 2); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint16* src_ptr, uint8* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C: + ((dx != 0x10000) ? 
ScaleAddCols1_C : ScaleAddCols0_C); + void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) = + ScaleAddRow_C; +#if defined(HAS_SCALEADDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleAddRow = ScaleAddRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_SSE2; + } + } +#endif +#if defined(HAS_SCALEADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleAddRow = ScaleAddRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + ScaleAddRow = ScaleAddRow_AVX2; + } + } +#endif +#if defined(HAS_SCALEADDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleAddRow = ScaleAddRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_NEON; + } + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint8* src = src_ptr + iy * src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row16, 0, src_width * 2); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint16 *)(row16), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row16); + } +} + +static void ScalePlaneBox_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr) { + int j, k; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height << 16); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + { + // Allocate a row buffer of uint32. + align_buffer_64(row32, src_width * 4); + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, + const uint32* src_ptr, uint16* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C; + void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) = + ScaleAddRow_16_C; + +#if defined(HAS_SCALEADDROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_16_SSE2; + } +#endif + + for (j = 0; j < dst_height; ++j) { + int boxheight; + int iy = y >> 16; + const uint16* src = src_ptr + iy * src_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row32, 0, src_width * 4); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint32 *)(row32), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr); + dst_ptr += dst_stride; + } + free_aligned_buffer_64(row32); + } +} + +// Scale plane down with bilinear interpolation. +void ScalePlaneBilinearDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? 
ScaleFilterCols64_C : ScaleFilterCols_C; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(src_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + + +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleFilterCols_NEON; + } + } +#endif + if (y > max_y) { + y = max_y; + } + + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8* src = src_ptr + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); +} + +void ScalePlaneBilinearDown_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row buffer. + align_buffer_64(row, src_width * 2); + + const int max_y = (src_height - 1) << 16; + int j; + void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = + (src_width >= 32768) ? 
ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
+ void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_16_C;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ InterpolateRow = InterpolateRow_Any_16_SSE2;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_16_SSSE3;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_16_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_16_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_16_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_16_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+ if (IS_ALIGNED(src_width, 4)) {
+ InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+ }
+ }
+#endif
+
+
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+ }
+#endif
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint16* src = src_ptr + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
+ ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+}
+
+// Scale plane up with bilinear interpolation.
+void ScalePlaneBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ enum FilterMode filtering) {
+ int j;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ const int max_y = (src_height - 1) << 16;
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) =
+ filtering ?
ScaleFilterCols_C : ScaleCols_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_C; + } +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleFilterCols_NEON; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint8* src = src_ptr + yi * src_stride; + + // Allocate 2 row buffers. + const int kRowSize = (dst_width + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +void ScalePlaneBilinearUp_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr, + enum FilterMode filtering) { + int j; + // Initial source x/y coordinate and step values as 16.16 fixed point. 
+ int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + const int max_y = (src_height - 1) << 16; + void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_16_C; + void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + +#if defined(HAS_INTERPOLATEROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_16_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_16_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_16_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_16_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_16_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_16_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_16_MIPS_DSPR2; + } + } +#endif + + if (filtering && src_width >= 32768) { + ScaleFilterCols = ScaleFilterCols64_16_C; + } +#if defined(HAS_SCALEFILTERCOLS_16_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_16_SSSE3; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + { + int yi = y >> 16; + const uint16* src = src_ptr + yi * src_stride; + + // Allocate 2 row buffers. + const int kRowSize = (dst_width + 31) & ~31; + align_buffer_64(row, kRowSize * 4); + + uint16* rowptr = (uint16*)row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); + } + dst_ptr += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +// Scale Plane to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. 
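As a worked example of the 16.16 representation described above: scaling a 640-pixel row down to 320 gives dx = 65536 * 640 / 320 = 131072, a step of exactly 2.0, so each output pixel samples every second source column. A scalar sketch of the nearest-pixel column loop, equivalent in spirit to the ScaleCols_C fallback used by ScalePlaneSimple below:

/* Nearest-pixel horizontal scale: the upper 16 bits of x select the
   source column; dx is the per-output step in 16.16 fixed point. */
static void ScaleCols_Sketch(unsigned char* dst_ptr,
                             const unsigned char* src_ptr,
                             int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst_ptr[j] = src_ptr[x >> 16];
    x += dx;
  }
}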
+ +static void ScalePlaneSimple(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + int i; + void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = ScaleCols_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +static void ScalePlaneSimple_16(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_ptr, uint16* dst_ptr) { + int i; + void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) = ScaleCols_16_C; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, + &x, &y, &dx, &dy); + src_width = Abs(src_width); + + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_16_C; +#if defined(HAS_SCALECOLS_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleCols = ScaleColsUp2_16_SSE2; + } +#endif + } + + for (i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, + dst_width, x, dx); + dst_ptr += dst_stride; + y += dy; + } +} + +// Scale a plane. +// This function dispatches to a specialized scaler based on scale factor. + +LIBYUV_API +void ScalePlane(const uint8* src, int src_stride, + int src_width, int src_height, + uint8* dst, int dst_stride, + int dst_width, int dst_height, + enum FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + + // Use specialized scales to improve performance for common resolutions. + // For example, all the 1/2 scalings will use ScalePlaneDown2() + if (dst_width == src_width && dst_height == src_height) { + // Straight copy. + CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); + return; + } + if (dst_width == src_width && filtering != kFilterBox) { + int dy = FixedDiv(src_height, dst_height); + // Arbitrary scale vertically, but unscaled horizontally. + ScalePlaneVertical(src_height, + dst_width, dst_height, + src_stride, dst_stride, src, dst, + 0, 0, dy, 1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { + // Scale down. 
+ if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width &&
+ dst_height == ((src_height * 3 + 7) / 8)) {
+ // optimized, 3/8
+ ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ (filtering == kFilterBox || filtering == kFilterNone)) {
+ // optimized, 1/4
+ ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ }
+ if (filtering == kFilterBox && dst_height * 2 < src_height) {
+ ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if (filtering && dst_height > src_height) {
+ ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering) {
+ ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+}
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+ int src_width, int src_height,
+ uint16* dst, int dst_stride,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height, filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ // Use specialized scales to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2()
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ return;
+ }
+ if (dst_width == src_width) {
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical_16(src_height,
+ dst_width, dst_height,
+ src_stride, dst_stride, src, dst,
+ 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ // Scale down.
+ if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ // 3/8 rounded up for odd sized chroma height.
+ if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { + // optimized, 3/8 + ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && + filtering != kFilterBilinear) { + // optimized, 1/4 + ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + } + if (filtering == kFilterBox && dst_height * 2 < src_height) { + ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height) { + ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering) { + ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); +} + +// Scale an I420 image. +// This function in turn calls a scaling function for each plane. + +LIBYUV_API +int I420Scale(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int dst_width, int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering); + return 0; +} + +LIBYUV_API +int I420Scale_16(const uint16* src_y, int src_stride_y, + const uint16* src_u, int src_stride_u, + const uint16* src_v, int src_stride_v, + int src_width, int src_height, + uint16* dst_y, int dst_stride_y, + uint16* dst_u, int dst_stride_u, + uint16* dst_v, int dst_stride_v, + int dst_width, int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, + dst_y, dst_stride_y, dst_width, dst_height, + filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, + filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, + filtering); + return 0; +} + +// Deprecated api 
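The legacy Scale() entry point below simply forwards to I420Scale(), mapping the boolean interpolate flag to kFilterBox or kFilterNone. A minimal caller-side sketch, assuming the libyuv headers are included and using illustrative buffer sizes that are not part of the API:

/* Halve a 640x360 I420 frame via the deprecated wrapper. Strides equal
   the plane widths because the example buffers are contiguous. */
static int HalveExample(void) {
  static uint8 src_y[640 * 360], src_u[320 * 180], src_v[320 * 180];
  static uint8 dst_y[320 * 180], dst_u[160 * 90], dst_v[160 * 90];
  return Scale(src_y, src_u, src_v, 640, 320, 320, 640, 360,
               dst_y, dst_u, dst_v, 320, 160, 160, 320, 180,
               LIBYUV_TRUE);  /* interpolate -> kFilterBox */
}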
+LIBYUV_API +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, + int src_stride_y, int src_stride_u, int src_stride_v, + int src_width, int src_height, + uint8* dst_y, uint8* dst_u, uint8* dst_v, + int dst_stride_y, int dst_stride_u, int dst_stride_v, + int dst_width, int dst_height, + LIBYUV_BOOL interpolate) { + return I420Scale(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + src_width, src_height, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + dst_width, dst_height, + interpolate ? kFilterBox : kFilterNone); +} + +// Deprecated api +LIBYUV_API +int ScaleOffset(const uint8* src, int src_width, int src_height, + uint8* dst, int dst_width, int dst_height, int dst_yoffset, + LIBYUV_BOOL interpolate) { + // Chroma requires offset to multiple of 2. + int dst_yoffset_even = dst_yoffset & ~1; + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + int aheight = dst_height - dst_yoffset_even * 2; // actual output height + const uint8* src_y = src; + const uint8* src_u = src + src_width * src_height; + const uint8* src_v = src + src_width * src_height + + src_halfwidth * src_halfheight; + uint8* dst_y = dst + dst_yoffset_even * dst_width; + uint8* dst_u = dst + dst_width * dst_height + + (dst_yoffset_even >> 1) * dst_halfwidth; + uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + + (dst_yoffset_even >> 1) * dst_halfwidth; + if (!src || src_width <= 0 || src_height <= 0 || + !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 || + dst_yoffset_even >= dst_height) { + return -1; + } + return I420Scale(src_y, src_width, + src_u, src_halfwidth, + src_v, src_halfwidth, + src_width, src_height, + dst_y, dst_width, + dst_u, dst_halfwidth, + dst_v, dst_halfwidth, + dst_width, aheight, + interpolate ? kFilterBox : kFilterNone); +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/scale_any.cc b/libs/libaom/src/third_party/libyuv/source/scale_any.cc new file mode 100644 index 000000000..2f6a2c8ba --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_any.cc @@ -0,0 +1,200 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/scale.h" +#include "libyuv/scale_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols +#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ + int dst_width, int x, int dx) { \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, \ + dst_width & MASK, x + n * dx, dx); \ + } + +#ifdef HAS_SCALEFILTERCOLS_NEON +CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) +#endif +#ifdef HAS_SCALEARGBCOLS_NEON +CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_NEON +CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON, + ScaleARGBFilterCols_C, 4, 3) +#endif +#undef CANY + +// Fixed scale down. +#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ + uint8* dst_ptr, int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } + +#ifdef HAS_SCALEROWDOWN2_SSE2 +SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2, + ScaleRowDown2Linear_C, 2, 1, 15) +SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C, + 2, 1, 15) +#endif +#ifdef HAS_SCALEROWDOWN2_AVX2 +SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, 2, 1, 31) +SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C, + 2, 1, 31) +#endif +#ifdef HAS_SCALEROWDOWN2_NEON +SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON, + ScaleRowDown2Linear_C, 2, 1, 15) +SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON, + ScaleRowDown2Box_C, 2, 1, 15) +#endif +#ifdef HAS_SCALEROWDOWN4_SSE2 +SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C, + 4, 1, 7) +#endif +#ifdef HAS_SCALEROWDOWN4_AVX2 +SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C, + 4, 1, 15) +#endif +#ifdef HAS_SCALEROWDOWN4_NEON +SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C, + 4, 1, 7) +#endif +#ifdef HAS_SCALEROWDOWN34_SSSE3 +SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3, + ScaleRowDown34_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +#endif +#ifdef HAS_SCALEROWDOWN34_NEON +SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON, + ScaleRowDown34_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON, + ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_1_Box_Any_NEON, 
ScaleRowDown34_1_Box_NEON, + ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +#endif +#ifdef HAS_SCALEROWDOWN38_SSSE3 +SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3, + ScaleRowDown38_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, 8 / 3, 1, 5) +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, 8 / 3, 1, 5) +#endif +#ifdef HAS_SCALEROWDOWN38_NEON +SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON, + ScaleRowDown38_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON, + ScaleRowDown38_3_Box_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON, + ScaleRowDown38_2_Box_C, 8 / 3, 1, 11) +#endif + +#ifdef HAS_SCALEARGBROWDOWN2_SSE2 +SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2, + ScaleARGBRowDown2_C, 2, 4, 3) +SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2, + ScaleARGBRowDown2Linear_C, 2, 4, 3) +SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2, + ScaleARGBRowDown2Box_C, 2, 4, 3) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_NEON +SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON, + ScaleARGBRowDown2_C, 2, 4, 7) +SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON, + ScaleARGBRowDown2Linear_C, 2, 4, 7) +SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON, + ScaleARGBRowDown2Box_C, 2, 4, 7) +#endif +#undef SDANY + +// Scale down by even scale factor. +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8* dst_ptr, int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \ + src_stepx, dst_ptr + n * BPP, r); \ + } + +#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 +SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2, + ScaleARGBRowDownEven_C, 4, 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2, + ScaleARGBRowDownEvenBox_C, 4, 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON +SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON, + ScaleARGBRowDownEven_C, 4, 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON, + ScaleARGBRowDownEvenBox_C, 4, 3) +#endif + +// Add rows box filter scale down. +#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + } + +#ifdef HAS_SCALEADDROW_SSE2 +SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) +#endif +#ifdef HAS_SCALEADDROW_AVX2 +SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) +#endif +#ifdef HAS_SCALEADDROW_NEON +SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) +#endif +#undef SAANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + + + + + diff --git a/libs/libaom/src/third_party/libyuv/source/scale_argb.cc b/libs/libaom/src/third_party/libyuv/source/scale_argb.cc new file mode 100644 index 000000000..40a2d1ab2 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_argb.cc @@ -0,0 +1,853 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down an ARGB image to 1/2 of
+// its original size.
+static void ScaleARGBDown2(int src_width, int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint8* src_argb, uint8* dst_argb,
+                           int x, int dx, int y, int dy,
+                           enum FilterMode filtering) {
+  int j;
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+      uint8* dst_argb, int dst_width) =
+      filtering == kFilterNone ? ScaleARGBRowDown2_C :
+      (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+      ScaleARGBRowDown2Box_C);
+  assert(dx == 65536 * 2);  // Test scale factor of 2.
+  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
+  // Advance to odd row, even column.
+  if (filtering == kFilterBilinear) {
+    src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  } else {
+    src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+  }
+
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
+        ScaleARGBRowDown2Box_Any_SSE2);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+          ScaleARGBRowDown2Box_SSE2);
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
+        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
+        ScaleARGBRowDown2Box_Any_NEON);
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
+          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
+          ScaleARGBRowDown2Box_NEON);
+    }
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+}
+
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down an ARGB image to 1/4 of
+// its original size.
+static void ScaleARGBDown4Box(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_argb, uint8* dst_argb,
+                              int x, int dx, int y, int dy) {
+  int j;
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
+  align_buffer_64(row, kRowSize * 2);
+  int row_stride = src_stride * (dy >> 16);
+  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+      uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+  // Advance to odd row, even column.
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+  assert(dx == 65536 * 4);  // Test scale factor of 4.
+  assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+    }
+  }
+#endif
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
+    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
+                      row + kRowSize, dst_width * 2);
+    ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+  free_aligned_buffer_64(row);
+}
+
+// ScaleARGB ARGB Even
+// This is an optimized version for scaling down an ARGB image to an even
+// multiple of its original size.
+static void ScaleARGBDownEven(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_argb, uint8* dst_argb,
+                              int x, int dx, int y, int dy,
+                              enum FilterMode filtering) {
+  int j;
+  int col_step = dx >> 16;
+  int row_stride = (dy >> 16) * src_stride;
+  void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+      int src_step, uint8* dst_argb, int dst_width) =
+      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
+        ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+          ScaleARGBRowDownEven_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
+        ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+          ScaleARGBRowDownEven_NEON;
+    }
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
+    src_argb += row_stride;
+    dst_argb += dst_stride;
+  }
+}
+
+// Scale ARGB down with bilinear interpolation.
+static void ScaleARGBBilinearDown(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  int src_stride, int dst_stride,
+                                  const uint8* src_argb, uint8* dst_argb,
+                                  int x, int dx, int y, int dy,
+                                  enum FilterMode filtering) {
+  int j;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+  int64 xlast = x + (int64)(dst_width - 1) * dx;
+  int64 xl = (dx >= 0) ? x : xlast;
+  int64 xr = (dx >= 0) ? xlast : x;
+  int clip_src_width;
+  xl = (xl >> 16) & ~3;  // Left edge aligned.
+  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. + src_argb += xl * 4; + x -= (int)(xl << 16); +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(clip_src_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of ARGB. + { + align_buffer_64(row, clip_src_width * 4); + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8* src = src_argb + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + } + dst_argb += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); + } +} + +// Scale ARGB up with bilinear interpolation. +static void ScaleARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; + const int max_y = (src_height - 1) << 16; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + if (src_width >= 32768) { + ScaleARGBFilterCols = filtering ? + ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + } +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + { + int yi = y >> 16; + const uint8* src = src_argb + yi * src_stride; + + // Allocate 2 rows of ARGB. + const int kRowSize = (dst_width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); + + uint8* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_argb + yi * src_stride; + } + if (yi != lasty) { + ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} + +#ifdef YUVSCALEUP +// Scale YUV to ARGB up with bilinear interpolation. 
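+// Editorial note (not upstream libyuv text): like ScaleARGBBilinearUp
+// above, this routine keeps only two horizontally scaled rows alive and
+// flips between them by negating the row stride.  A minimal sketch of
+// that flip, using the same local names as the code below:
+//
+//   ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);  // refill stale row
+//   rowptr += rowstride;     // rowptr now holds the row for the new yi
+//   rowstride = -rowstride;  // so rowptr + rowstride is the row after it
+//
+// InterpolateRow() then blends rowptr with rowptr + rowstride, so the two
+// row buffers are never copied, only re-pointed.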
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int x, int dx, int y, int dy, + enum FilterMode filtering) { + int j; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = + filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; + if (src_width >= 32768) { + ScaleARGBFilterCols = filtering ? 
+        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+  }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+  }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (!filtering && TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  const int max_y = (src_height - 1) << 16;
+  if (y > max_y) {
+    y = max_y;
+  }
+  const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
+  int yi = y >> 16;
+  int uv_yi = yi >> kYShift;
+  const uint8* src_row_y = src_y + yi * src_stride_y;
+  const uint8* src_row_u = src_u + uv_yi * src_stride_u;
+  const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+
+  // Allocate 2 rows of ARGB.
+  const int kRowSize = (dst_width * 4 + 31) & ~31;
+  align_buffer_64(row, kRowSize * 2);
+
+  // Allocate 1 row of ARGB for source conversion.
+  align_buffer_64(argb_row, src_width * 4);
+
+  uint8* rowptr = row;
+  int rowstride = kRowSize;
+  int lasty = yi;
+
+  // TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
+  ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+  if (src_height > 1) {
+    src_row_y += src_stride_y;
+    if (yi & 1) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+  ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+  if (src_height > 2) {
+    src_row_y += src_stride_y;
+    if (!(yi & 1)) {
+      src_row_u += src_stride_u;
+      src_row_v += src_stride_v;
+    }
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lasty) {
+      if (y > max_y) {
+        y = max_y;
+        yi = y >> 16;
+        uv_yi = yi >> kYShift;
+        src_row_y = src_y + yi * src_stride_y;
+        src_row_u = src_u + uv_yi * src_stride_u;
+        src_row_v = src_v + uv_yi * src_stride_v;
+      }
+      if (yi != lasty) {
+        // TODO(fbarchard): Convert the clipped region of row.
+        I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+        ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+        rowptr += rowstride;
+        rowstride = -rowstride;
+        lasty = yi;
+        src_row_y += src_stride_y;
+        if (yi & 1) {
+          src_row_u += src_stride_u;
+          src_row_v += src_stride_v;
+        }
+      }
+    }
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+    }
+    dst_argb += dst_stride_argb;
+    y += dy;
+  }
+  free_aligned_buffer_64(row);
+  free_aligned_buffer_64(argb_row);  // Matches align_buffer_64(argb_row) above.
+}
+#endif
+
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
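+//
+// Editorial sketch (not upstream code): a hypothetical single-channel
+// point sampler shows the stepping pattern used by the helpers below:
+//
+//   void PointSampleRow(const uint8* src, uint8* dst, int dst_width,
+//                       int x, int dx) {
+//     int j;
+//     for (j = 0; j < dst_width; ++j) {
+//       dst[j] = src[x >> 16];  // integer part selects the source pixel
+//       x += dx;                // fraction accumulates in the low 16 bits
+//     }
+//   }
+//
+// e.g. dx = 0x20000 (2.0) halves a row, visiting every other source pixel.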
+
+static void ScaleARGBSimple(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_argb, uint8* dst_argb,
+                            int x, int dx, int y, int dy) {
+  int j;
+  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+    ScaleARGBCols = ScaleARGBCols_SSE2;
+  }
+#endif
+#if defined(HAS_SCALEARGBCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBCols = ScaleARGBCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBCols_NEON;
+    }
+  }
+#endif
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+    }
+#endif
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
+                  dst_width, x, dx);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// Scale an ARGB image.
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static void ScaleARGB(const uint8* src, int src_stride,
+                      int src_width, int src_height,
+                      uint8* dst, int dst_stride,
+                      int dst_width, int dst_height,
+                      int clip_x, int clip_y, int clip_width, int clip_height,
+                      enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // ARGB does not support box filter yet, but allow the user to pass it.
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height,
+                                filtering);
+
+  // Negative src_height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  if (clip_x) {
+    int64 clipf = (int64)(clip_x) * dx;
+    x += (clipf & 0xffff);
+    src += (clipf >> 16) * 4;
+    dst += clip_x * 4;
+  }
+  if (clip_y) {
+    int64 clipf = (int64)(clip_y) * dy;
+    y += (clipf & 0xffff);
+    src += (clipf >> 16) * src_stride;
+    dst += clip_y * dst_stride;
+  }
+
+  // Special case for integer step values.
+  if (((dx | dy) & 0xffff) == 0) {
+    if (!dx || !dy) {  // 1 pixel wide and/or tall.
+      filtering = kFilterNone;
+    } else {
+      // Optimized even scale down. i.e. 2, 4, 6, 8, 10x.
+      if (!(dx & 0x10000) && !(dy & 0x10000)) {
+        if (dx == 0x20000) {
+          // Optimized 1/2 downsample.
+          ScaleARGBDown2(src_width, src_height,
+                         clip_width, clip_height,
+                         src_stride, dst_stride, src, dst,
+                         x, dx, y, dy, filtering);
+          return;
+        }
+        if (dx == 0x40000 && filtering == kFilterBox) {
+          // Optimized 1/4 box downsample.
+          ScaleARGBDown4Box(src_width, src_height,
+                            clip_width, clip_height,
+                            src_stride, dst_stride, src, dst,
+                            x, dx, y, dy);
+          return;
+        }
+        ScaleARGBDownEven(src_width, src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src, dst,
+                          x, dx, y, dy, filtering);
+        return;
+      }
+      // Optimized odd scale down. i.e. 3, 5, 7, 9x.
+      if ((dx & 0x10000) && (dy & 0x10000)) {
+        filtering = kFilterNone;
+        if (dx == 0x10000 && dy == 0x10000) {
+          // Straight copy.
+          ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
+                   dst, dst_stride, clip_width, clip_height);
+          return;
+        }
+      }
+    }
+  }
+  if (dx == 0x10000 && (x & 0xffff) == 0) {
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height,
+                       clip_width, clip_height,
+                       src_stride, dst_stride, src, dst,
+                       x, y, dy, 4, filtering);
+    return;
+  }
+  if (filtering && dy < 65536) {
+    ScaleARGBBilinearUp(src_width, src_height,
+                        clip_width, clip_height,
+                        src_stride, dst_stride, src, dst,
+                        x, dx, y, dy, filtering);
+    return;
+  }
+  if (filtering) {
+    ScaleARGBBilinearDown(src_width, src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src, dst,
+                          x, dx, y, dy, filtering);
+    return;
+  }
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+                  src_stride, dst_stride, src, dst,
+                  x, dx, y, dy);
+}
+
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+                  int src_width, int src_height,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int dst_width, int dst_height,
+                  int clip_x, int clip_y, int clip_width, int clip_height,
+                  enum FilterMode filtering) {
+  if (!src_argb || src_width == 0 || src_height == 0 ||
+      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
+      clip_x < 0 || clip_y < 0 ||
+      clip_width > 32768 || clip_height > 32768 ||
+      (clip_x + clip_width) > dst_width ||
+      (clip_y + clip_height) > dst_height) {
+    return -1;
+  }
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+            dst_argb, dst_stride_argb, dst_width, dst_height,
+            clip_x, clip_y, clip_width, clip_height, filtering);
+  return 0;
+}
+
+// Scale an ARGB image.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+              int src_width, int src_height,
+              uint8* dst_argb, int dst_stride_argb,
+              int dst_width, int dst_height,
+              enum FilterMode filtering) {
+  if (!src_argb || src_width == 0 || src_height == 0 ||
+      src_width > 32768 || src_height > 32768 ||
+      !dst_argb || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+            dst_argb, dst_stride_argb, dst_width, dst_height,
+            0, 0, dst_width, dst_height, filtering);
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/libs/libaom/src/third_party/libyuv/source/scale_common.cc b/libs/libaom/src/third_party/libyuv/source/scale_common.cc
new file mode 100644
index 000000000..1711f3d54
--- /dev/null
+++ b/libs/libaom/src/third_party/libyuv/source/scale_common.cc
@@ -0,0 +1,1137 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"  // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+  return v >= 0 ?
v : -v; +} + +// CPU agnostic row functions +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + const uint16* s = src_ptr; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + const uint16* s = src_ptr; + const uint16* t = src_ptr + src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + + src_ptr[stride * 
2 + 6] + src_ptr[stride * 2 + 7] + + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + + 8) >> 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + } +} + +void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + intptr_t stride = src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + + 8) >> 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + } +} + +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +// Filter rows 0 and 1 together, 3 : 1 +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width) { + const uint16* s = src_ptr; + const uint16* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && 
(dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +// Filter rows 1 and 2 together, 1 : 1 +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* d, int dst_width) { + const uint16* s = src_ptr; + const uint16* t = src_ptr + src_stride; + int x; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (x = 0; x < dst_width; x += 3) { + uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. 
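+// Editorial example: no blending is needed for an exact 2x widen, so a
+// source row {10, 20, 30} simply becomes {10, 10, 20, 20, 30, 30}; the x
+// and dx arguments are unused.  For even dst_width the loop below is
+// essentially equivalent to this hypothetical scalar form:
+//
+//   for (j = 0; j < dst_width / 2; ++j) {
+//     dst_ptr[2 * j] = dst_ptr[2 * j + 1] = src_ptr[j];
+//   }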
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) +#define BLENDER(a, b, f) (uint8)((int)(a) + \ + ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +#define BLENDER(a, b, f) (uint16)((int)(a) + \ + ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + +void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x, int dx) { + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} + +void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int64 xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + int x; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst, int dst_width) { + int x; + assert(dst_width % 3 == 0); + for (x = 0; x < dst_width; x += 3) 
{ + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, + ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 8x2 -> 3x1 +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + int i; + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) { + int x; + assert(src_width > 0); + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + 
if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; + } +} + +void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { + int x; + assert(src_width > 0); + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; + } +} + +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 4; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; + dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; + dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; + dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += src_stepx * 4; + dst_argb += 4; + } +} + +// Scales a single row of pixels using point sampling. 
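+// Editorial note: an ARGB pixel is 4 bytes, so the ARGB samplers below
+// reinterpret the byte pointers as uint32 and move one whole pixel per
+// assignment, e.g. (hypothetical fragment):
+//
+//   const uint32* src = (const uint32*)(src_argb);
+//   uint32* dst = (uint32*)(dst_argb);
+//   dst[0] = src[x >> 16];  // copies B, G, R and A in a single store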
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 +#define BLENDERC(a, b, f, s) (uint32)( \ + BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ + BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x32, int dx) { + int64 x = (int64)(x32); + const uint32* src = (const uint32*)(src_argb); + uint32* dst = (uint32*)(dst_argb); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64 xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int64 xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + +// Scale plane vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int y, int dy, + int bpp, enum FilterMode filtering) { + // TODO(fbarchard): Allow higher bpp. + int dst_width_bytes = dst_width * bpp; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; + const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; + int j; + assert(bpp >= 1 && bpp <= 4); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + src_argb += (x >> 16) * bpp; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(dst_width_bytes, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + for (j = 0; j < dst_height; ++j) { + int yi; + int yf; + if (y > max_y) { + y = max_y; + } + yi = y >> 16; + yf = filtering ? ((y >> 8) & 255) : 0; + InterpolateRow(dst_argb, src_argb + yi * src_stride, + src_stride, dst_width_bytes, yf); + dst_argb += dst_stride; + y += dy; + } +} +void ScalePlaneVertical_16(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint16* src_argb, uint16* dst_argb, + int x, int y, int dy, + int wpp, enum FilterMode filtering) { + // TODO(fbarchard): Allow higher wpp. + int dst_width_words = dst_width * wpp; + void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_16_C; + const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(wpp >= 1 && wpp <= 2);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * wpp;
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width_words, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width_words, 4)) {
+      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    int yi;
+    int yf;
+    if (y > max_y) {
+      y = max_y;
+    }
+    yi = y >> 16;
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    InterpolateRow(dst_argb, src_argb + yi * src_stride,
+                   src_stride, dst_width_words, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  enum FilterMode filtering) {
+  if (src_width < 0) {
+    src_width = -src_width;
+  }
+  if (src_height < 0) {
+    src_height = -src_height;
+  }
+  if (filtering == kFilterBox) {
+    // If scaling both axes to 0.5 or larger, switch from Box to Bilinear.
+    if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+      filtering = kFilterBilinear;
+    }
+  }
+  if (filtering == kFilterBilinear) {
+    if (src_height == 1) {
+      filtering = kFilterLinear;
+    }
+    // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
+    if (dst_height == src_height || dst_height * 3 == src_height) {
+      filtering = kFilterLinear;
+    }
+    // TODO(fbarchard): Remove the 1 pixel wide restriction, which avoids
+    // reading 2 pixels horizontally and faulting past the end of the row.
+    if (src_width == 1) {
+      filtering = kFilterNone;
+    }
+  }
+  if (filtering == kFilterLinear) {
+    if (src_width == 1) {
+      filtering = kFilterNone;
+    }
+    // TODO(fbarchard): Detect any odd scale factor and reduce to None.
+    if (dst_width == src_width || dst_width * 3 == src_width) {
+      filtering = kFilterNone;
+    }
+  }
+  return filtering;
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div) {
+  return (int)(((int64)(num) << 16) / div);
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div) {
+  return (int)((((int64)(num) << 16) - 0x00010001) /
+               (div - 1));
+}
+
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
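+
+// --- Illustrative sketch (not part of upstream libyuv): how the 16.16
+// fixed-point steps produced by FixedDiv_C/FixedDiv1_C drive the scaling
+// loops above. The high 16 bits of x are the integer source index and the
+// low 16 bits the sub-pixel fraction, so src[x >> 16] selects a pixel and
+// x += dx advances by the exact scale ratio. For example,
+// FixedDiv_C(6, 3) == 0x20000 (2.0 in 16.16); stepping three times from
+// x = 0 reads source indices 0, 2 and 4. Guarded out of the build; uint8
+// and int64 come from libyuv/basic_types.h.
+#if 0
+static void FixedPointStepExample(uint8* dst, const uint8* src) {
+  int dx = FixedDiv_C(6, 3);  // 16.16 step: (6 << 16) / 3 == 0x20000
+  int x = 0;
+  int j;
+  for (j = 0; j < 3; ++j) {   // writes src[0], src[2], src[4]
+    dst[j] = src[x >> 16];
+    x += dx;
+  }
+}
+#endif
+
+// Compute slope values for stepping.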
+void ScaleSlope(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering, + int* x, int* y, int* dx, int* dy) { + assert(x != NULL); + assert(y != NULL); + assert(dx != NULL); + assert(dy != NULL); + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + // Check for 1 pixel and avoid FixedDiv overflow. + if (dst_width == 1 && src_width >= 32768) { + dst_width = src_width; + } + if (dst_height == 1 && src_height >= 32768) { + dst_height = src_height; + } + if (filtering == kFilterBox) { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = 0; + *y = 0; + } else if (filtering == kFilterBilinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + if (dst_height <= src_height) { + *dy = FixedDiv(src_height, dst_height); + *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_height > 1) { + *dy = FixedDiv1(src_height, dst_height); + *y = 0; + } + } else if (filtering == kFilterLinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FixedDiv1(Abs(src_width), dst_width); + *x = 0; + } + *dy = FixedDiv(src_height, dst_height); + *y = *dy >> 1; + } else { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = CENTERSTART(*dx, 0); + *y = CENTERSTART(*dy, 0); + } + // Negative src_width means horizontally mirror. + if (src_width < 0) { + *x += (dst_width - 1) * *dx; + *dx = -*dx; + // src_width = -src_width; // Caller must do this. + } +} +#undef CENTERSTART + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/scale_gcc.cc b/libs/libaom/src/third_party/libyuv/source/scale_gcc.cc new file mode 100644 index 000000000..8a6ac5459 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_gcc.cc @@ -0,0 +1,1089 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+static uvec8 kShuf2 =
+  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static uvec8 kShuf11 =
+  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf21 =
+  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Rounding constant for the 3/4 box filters (round half up before >> 2).
+static vec16 kRound34 =
+  { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
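+
+// --- Illustrative sketch (not part of upstream libyuv): why kScaleAc33 and
+// kScaleAb2 store 65536 / n. pmulhuw keeps the high 16 bits of an unsigned
+// 16x16 multiply, i.e. (sum * k) >> 16, so multiplying a box sum by
+// k = 65536 / 9 == 7281 divides by ~9 with no integer divide. The worst
+// case 3x3 sum is 9 * 255 == 2295, and 2295 * 7281 >> 16 == 254 versus the
+// exact average of 255, so the result stays within one code value.
+// Guarded out of the build.
+#if 0
+static int BoxAverage9_Example(int sum) {  // sum of a 3x3 box, 0..2295
+  return (sum * (65536 / 9)) >> 16;        // scalar model of the pmulhuw step
+}
+#endif
+
+// GCC versions of row functions are verbatim conversions from Visual C.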
+// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt + +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1" + ); +} + +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm5" + ); +} + +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm5" + ); +} + +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stridex3 = 0; + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + 
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 + MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4 + MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" + ); +} + +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movq %%xmm1," MEMACCESS2(0x8,1) " \n" + "movq %%xmm2," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" 
+ "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x8,1) " \n" + "lea " MEMLEA(0xc,1) ",%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" + ); +} + +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm1) // 
movdqu (%0,%3,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6 + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +// Reads 16xN bytes and produces 16 shorts at a time. 
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int tmp_height = 0; + intptr_t tmp_src = 0; + asm volatile ( + "mov %0,%3 \n" // row pointer + "mov %5,%2 \n" // height + "pxor %%xmm0,%%xmm0 \n" // clear accumulators + "pxor %%xmm1,%%xmm1 \n" + "pxor %%xmm4,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(3) ",%%xmm2 \n" + "add %6,%3 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "sub $0x1,%2 \n" + "jg 1b \n" + + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16 + "mov %0,%3 \n" // row pointer + "mov %5,%2 \n" // height + "pxor %%xmm0,%%xmm0 \n" // clear accumulators + "pxor %%xmm1,%%xmm1 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_height), // %2 + "+r"(tmp_src), // %3 + "+r"(src_width), // %4 + "+rm"(src_height) // %5 + : "rm"((intptr_t)(src_stride)) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + ); +} + +// Bilinear column filtering. SSSE3 version. +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0, temp_pixel = 0; + asm volatile ( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %w2," MEMACCESS(0) " \n" + "lea " MEMLEA(0x2,0) ",%0 \n" + "sub $0x2,%5 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %b2," MEMACCESS(0) " \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+a"(temp_pixel), // %2 + "+r"(x0), // %3 + "+r"(x1), // %4 + "+rm"(dst_width) // %5 + : "rm"(x), // %6 + "rm"(dx) // %7 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1" + ); +} + +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1" + ); +} + +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", "xmm0", "xmm1" + ); +} + +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. 
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12 = 0; + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + LABELALIGN + "1: \n" + "movd " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 + "punpckldq %%xmm1,%%xmm0 \n" + MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 + MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "+r"(src_stepx_x12) // %4 + :: "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" + ); +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, + uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); + intptr_t src_stepx_x12 = 0; + intptr_t row1 = (intptr_t)(src_stride); + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" + + LABELALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 + MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 + MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "movq " MEMACCESS(5) ",%%xmm2 \n" + MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 + MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 + MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "+r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + :: "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3" + ); +} + +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + LABELALIGN + "40: \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 + MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," 
MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" + + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x8,2) ",%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + "movd %%xmm0," MEMACCESS(2) " \n" + "99: \n" + : "+a"(x0), // %0 + "+d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + asm volatile ( + LABELALIGN + "1: \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0," MEMACCESS(0) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + :: "memory", "cc", NACL_R14 + "xmm0", "xmm1" + ); +} + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +// Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); + + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "psrlw $0x9,%%xmm1 \n" + MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(0) " \n" + + LABELALIGN + "99: \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "+r"(x0), // %3 + "+r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div) { + asm volatile ( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx" + ); + return num; +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_X86(int num, int div) { + asm volatile ( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx" + ); + return num; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/scale_mips.cc b/libs/libaom/src/third_party/libyuv/source/scale_mips.cc new file mode 100644 index 000000000..3eb4f27c4 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_mips.cc @@ -0,0 +1,654 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC MIPS DSPR2 +#if !defined(LIBYUV_DISABLE_MIPS) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ + (_MIPS_SIM == _MIPS_SIM_ABI32) + +void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 4 \n" // iterations -> by 16 + "beqz $t9, 2f \n" + " nop \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| + // TODO(fbarchard): Use odd pixels instead of even. + "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0| + "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8| + "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16| + "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu $t9, $t9, -1 \n" + "sw $t8, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $t1, 8(%[dst]) \n" + "sw $t2, 12(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 16 \n" + + "2: \n" + "andi $t9, %[dst_width], 0xf \n" // residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lbu $t0, 0(%[src_ptr]) \n" + "addiu %[src_ptr], %[src_ptr], 2 \n" + "addiu $t9, $t9, -1 \n" + "sb $t0, 0(%[dst]) \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* t = src_ptr + src_stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 3 \n" // iterations -> step 8 + "bltz $t9, 2f \n" + " nop \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 0(%[t]) \n" // |19|18|17|16| + "lw $t5, 4(%[t]) \n" // |23|22|21|20| + "lw $t6, 8(%[t]) \n" // |27|26|25|24| + "lw $t7, 12(%[t]) \n" // |31|30|29|28| + "addiu $t9, $t9, -1 \n" + "srl $t8, $t0, 16 \n" // |X|X|3|2| + "ins $t0, $t4, 16, 16 \n" // |17|16|1|0| + "ins $t4, $t8, 0, 16 \n" // |19|18|3|2| + "raddu.w.qb $t0, $t0 \n" // |17+16+1+0| + "raddu.w.qb $t4, $t4 \n" // |19+18+3+2| + "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2 + "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2 + "srl $t8, $t1, 16 \n" // |X|X|7|6| + "ins $t1, $t5, 16, 16 \n" // |21|20|5|4| + "ins $t5, $t8, 0, 16 \n" // |22|23|7|6| + "raddu.w.qb $t1, $t1 \n" // |21+20+5+4| + "raddu.w.qb $t5, $t5 \n" // |23+22+7+6| + "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2 + "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2 + "srl $t8, $t2, 16 \n" // |X|X|11|10| + "ins $t2, $t6, 16, 16 \n" // |25|24|9|8| + "ins $t6, $t8, 0, 16 \n" // |27|26|11|10| + "raddu.w.qb $t2, $t2 \n" // |25+24+9+8| + "raddu.w.qb $t6, $t6 \n" // |27+26+11+10| + "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2 + "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2 + "srl $t8, $t3, 16 \n" // |X|X|15|14| + "ins $t3, $t7, 16, 16 \n" // |29|28|13|12| + "ins $t7, $t8, 0, 16 \n" 
// |31|30|15|14| + "raddu.w.qb $t3, $t3 \n" // |29+28+13+12| + "raddu.w.qb $t7, $t7 \n" // |31+30+15+14| + "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2 + "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2 + "addiu %[src_ptr], %[src_ptr], 16 \n" + "addiu %[t], %[t], 16 \n" + "sb $t0, 0(%[dst]) \n" + "sb $t4, 1(%[dst]) \n" + "sb $t1, 2(%[dst]) \n" + "sb $t5, 3(%[dst]) \n" + "sb $t2, 4(%[dst]) \n" + "sb $t6, 5(%[dst]) \n" + "sb $t3, 6(%[dst]) \n" + "sb $t7, 7(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 8 \n" + + "2: \n" + "andi $t9, %[dst_width], 0x7 \n" // x = residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lwr $t1, 0(%[src_ptr]) \n" + "lwl $t1, 3(%[src_ptr]) \n" + "lwr $t2, 0(%[t]) \n" + "lwl $t2, 3(%[t]) \n" + "srl $t8, $t1, 16 \n" + "ins $t1, $t2, 16, 16 \n" + "ins $t2, $t8, 0, 16 \n" + "raddu.w.qb $t1, $t1 \n" + "raddu.w.qb $t2, $t2 \n" + "shra_r.w $t1, $t1, 2 \n" + "shra_r.w $t2, $t2, 2 \n" + "sb $t1, 0(%[dst]) \n" + "sb $t2, 1(%[dst]) \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "addiu $t9, $t9, -2 \n" + "addiu %[t], %[t], 4 \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 2 \n" + + "3: \n" + ".set pop \n" + + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), [t] "+r" (t) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 3 \n" + "beqz $t9, 2f \n" + " nop \n" + + ".p2align 2 \n" + "1: \n" + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| + "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0| + "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8| + "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16| + "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24| + "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0| + "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu $t9, $t9, -1 \n" + "sw $t1, 0(%[dst]) \n" + "sw $t5, 4(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 8 \n" + + "2: \n" + "andi $t9, %[dst_width], 7 \n" // residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lbu $t1, 0(%[src_ptr]) \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "addiu $t9, $t9, -1 \n" + "sb $t1, 0(%[dst]) \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst) + : [dst_width] "r" (dst_width) + : "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + const uint8* s1 = src_ptr + stride; + const uint8* s2 = s1 + stride; + const uint8* s3 = s2 + stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 1 \n" + "andi $t8, %[dst_width], 1 \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| + "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 4(%[s1]) \n" // |23|22|21|20| + "lw $t6, 4(%[s2]) \n" // |27|26|25|24| + "lw $t7, 
4(%[s3]) \n" // |31|30|29|28| + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| + "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16| + "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20| + "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24| + "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28| + "add $t0, $t0, $t1 \n" + "add $t1, $t2, $t3 \n" + "add $t0, $t0, $t1 \n" + "add $t4, $t4, $t5 \n" + "add $t6, $t6, $t7 \n" + "add $t4, $t4, $t6 \n" + "shra_r.w $t0, $t0, 4 \n" + "shra_r.w $t4, $t4, 4 \n" + "sb $t0, 0(%[dst]) \n" + "sb $t4, 1(%[dst]) \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[s1], %[s1], 8 \n" + "addiu %[s2], %[s2], 8 \n" + "addiu %[s3], %[s3], 8 \n" + "addiu $t9, $t9, -1 \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 2 \n" + "beqz $t8, 2f \n" + " nop \n" + + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| + "add $t0, $t0, $t1 \n" + "add $t1, $t2, $t3 \n" + "add $t0, $t0, $t1 \n" + "shra_r.w $t0, $t0, 4 \n" + "sb $t0, 0(%[dst]) \n" + + "2: \n" + ".set pop \n" + + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [s1] "+r" (s1), + [s2] "+r" (s2), + [s3] "+r" (s3) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6","t7", "t8", "t9" + ); +} + +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + ".p2align 2 \n" + "1: \n" + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| + "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13| + "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30| + "addiu %[dst_width], %[dst_width], -24 \n" + "ins $t1, $t1, 8, 16 \n" // |3|1|0|X| + "ins $t4, $t0, 8, 16 \n" // |X|15|13|12| + "ins $t5, $t5, 8, 16 \n" // |19|17|16|X| + "ins $t8, $t9, 8, 16 \n" // |X|31|29|28| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5| + "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21| + "prepend $t1, $t2, 8 \n" // |4|3|1|0| + "prepend $t3, $t4, 24 \n" // |15|13|12|11| + "prepend $t5, $t6, 8 \n" // |20|19|17|16| + "prepend $t7, $t8, 24 \n" // |31|29|28|27| + "sw $t1, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $t3, 8(%[dst]) \n" + "sw $t5, 12(%[dst]) \n" + "sw $t9, 16(%[dst]) \n" + "sw $t7, 20(%[dst]) \n" + "bnez %[dst_width], 1b \n" + " addiu %[dst], %[dst], 24 \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6","t7", "t8", "t9" + ); +} + +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "repl.ph $t3, 3 \n" // 0x00030003 + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| + 
"rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1| + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| + "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3| + "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3| + "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1| + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t1, $t1 \n" + "shra_r.w $t0, $t0, 1 \n" + "shra_r.w $t1, $t1, 1 \n" + "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1| + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| + "rotr $t2, $t2, 16 \n" // |0|S1|0|S2| + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| + "addu.ph $t2, $t2, $t4 \n" + "addu.ph $t6, $t6, $t5 \n" + "sll $t5, $t0, 1 \n" + "add $t0, $t5, $t0 \n" + "shra_r.ph $t2, $t2, 2 \n" + "shra_r.ph $t6, $t6, 2 \n" + "shll.ph $t4, $t2, 1 \n" + "addq.ph $t4, $t4, $t2 \n" + "addu $t0, $t0, $t1 \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "shra_r.w $t0, $t0, 2 \n" + "addu.ph $t6, $t6, $t4 \n" + "shra_r.ph $t6, $t6, 2 \n" + "srl $t1, $t6, 16 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "sb $t1, 0(%[d]) \n" + "sb $t0, 1(%[d]) \n" + "sb $t6, 2(%[d]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[d], %[d], 3 \n" + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [src_stride] "+r" (src_stride), + [d] "+r" (d), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6" + ); +} + +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "repl.ph $t2, 3 \n" // 0x00030003 + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| + "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1| + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| + "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3| + "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3| + "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1| + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t1, $t1 \n" + "shra_r.w $t0, $t0, 1 \n" + "shra_r.w $t1, $t1, 1 \n" + "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1| + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| + "rotr $t4, $t4, 16 \n" // |0|S1|0|S2| + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| + "addu.ph $t4, $t4, $t3 \n" + "addu.ph $t6, $t6, $t5 \n" + "shra_r.ph $t6, $t6, 2 \n" + "shra_r.ph $t4, $t4, 2 \n" + "addu.ph $t6, $t6, $t4 \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "shra_r.ph $t6, $t6, 1 \n" + "addu $t0, $t0, $t1 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "shra_r.w $t0, $t0, 1 \n" + "srl $t1, $t6, 16 \n" + "sb $t1, 0(%[d]) \n" + "sb $t0, 1(%[d]) \n" + "sb $t6, 2(%[d]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[d], %[d], 3 \n" + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [src_stride] "+r" (src_stride), + [d] "+r" (d), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6" + ); +} + +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t6, $t6 \n" // |26|27|24|25| + "srl $t0, $t0, 8 \n" // |X|2|3|0| + "srl $t3, 
$t3, 16 \n" // |X|X|15|14| + "srl $t5, $t5, 16 \n" // |X|X|23|22| + "srl $t7, $t7, 16 \n" // |X|X|31|30| + "ins $t1, $t2, 24, 8 \n" // |8|6|5|4| + "ins $t6, $t5, 0, 8 \n" // |26|27|24|22| + "ins $t1, $t0, 0, 16 \n" // |8|6|3|0| + "ins $t6, $t7, 24, 8 \n" // |30|27|24|22| + "prepend $t2, $t3, 24 \n" // |X|15|14|11| + "ins $t4, $t4, 16, 8 \n" // |19|16|17|X| + "ins $t4, $t2, 0, 16 \n" // |19|16|14|11| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu %[dst_width], %[dst_width], -12 \n" + "addiu $t8,%[dst_width], -12 \n" + "sw $t1, 0(%[dst]) \n" + "sw $t4, 4(%[dst]) \n" + "sw $t6, 8(%[dst]) \n" + "bgez $t8, 1b \n" + " addiu %[dst], %[dst], 12 \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8" + ); +} + +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + const uint8* t = src_ptr + stride; + const int c = 0x2AAA; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| + "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0| + "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4| + "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| + "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6| + "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4| + "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6 + "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4 + "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1| + "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3| + "srl $t4, $t4, 2 \n" // t4 / 4 + "srl $t6, $t6, 16 \n" // |0|0|S3|T3| + "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3 + "addu $t6, $t5, $t6 \n" + "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA + "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| + "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| + "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0 + "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0 + "addu $t0, $t0, $t2 \n" + "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[t], %[t], 8 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "addiu %[dst_ptr], %[dst_ptr], 3 \n" + "srl $t6, $t6, 16 \n" + "srl $t0, $t0, 16 \n" + "sb $t4, -1(%[dst_ptr]) \n" + "sb $t6, -2(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " sb $t0, -3(%[dst_ptr]) \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst_ptr] "+r" (dst_ptr), + [t] "+r" (t), + [dst_width] "+r" (dst_width) + : [c] "r" (c) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6" + ); +} + +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + const uint8* s1 = src_ptr + stride; + stride += stride; + const uint8* s2 = src_ptr + stride; + const int c1 = 0x1C71; + const int c2 = 0x2AAA; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + ".p2align 2 \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| + "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0| + "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4| + "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0| + "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4| + "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| + "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6| + "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6 + "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4| + "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4 + "sll $t8, $t5, 16 \n" // |R5|R4|0|0| + "raddu.w.qb $t8, $t8 \n" // R5+R4 + "addu $t7, $t7, $t8 \n" + "srl $t8, $t5, 16 \n" // |0|0|R7|R6| + 
"raddu.w.qb $t8, $t8 \n" // R7 + R6 + "addu $t6, $t6, $t8 \n" + "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA + "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1| + "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1| + "srl $t8, $t8, 8 \n" // |0|S3|T3|R3| + "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3 + "addu $t7, $t7, $t8 \n" + "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71 + "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| + "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| + "sll $t4, $t4, 8 \n" // |R2|R1|R0|0| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t2, $t2 \n" + "raddu.w.qb $t4, $t4 \n" + "addu $t0, $t0, $t2 \n" + "addu $t0, $t0, $t4 \n" + "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71 + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[s1], %[s1], 8 \n" + "addiu %[s2], %[s2], 8 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "addiu %[dst_ptr], %[dst_ptr], 3 \n" + "srl $t6, $t6, 16 \n" + "srl $t7, $t7, 16 \n" + "srl $t0, $t0, 16 \n" + "sb $t6, -1(%[dst_ptr]) \n" + "sb $t7, -2(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " sb $t0, -3(%[dst_ptr]) \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst_ptr] "+r" (dst_ptr), + [s1] "+r" (s1), + [s2] "+r" (s2), + [dst_width] "+r" (dst_width) + : [c1] "r" (c1), [c2] "r" (c2) + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8" + ); +} + +#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/libs/libaom/src/third_party/libyuv/source/scale_neon.cc b/libs/libaom/src/third_party/libyuv/source/scale_neon.cc new file mode 100644 index 000000000..7825878e9 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_neon.cc @@ -0,0 +1,1037 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ + !defined(__aarch64__) + +// NEON downscalers with interpolation. +// Provided by Fritz Koenig + +// Read 32x1 throw away even pixels, and write 16x1. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS(0) + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +// Read 32x1 average down and write 16x1. +void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc + "subs %2, %2, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // add adjacent + "vpaddl.u8 q1, q1 \n" + "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #1 \n" + MEMACCESS(1) + "vst1.8 {q0}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + MEMACCESS(1) + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc" + ); +} + +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride; + const uint8* src_ptr2 = src_ptr + src_stride * 2; + const uint8* src_ptr3 = src_ptr + src_stride * 3; +asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + MEMACCESS(3) + "vld1.8 {q1}, [%3]! \n" + MEMACCESS(4) + "vld1.8 {q2}, [%4]! \n" + MEMACCESS(5) + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + MEMACCESS(1) + "vst1.32 {d0[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc" + ); +} + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc" + ); +} + +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! \n" + + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" + ); +} + +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + MEMACCESS(1) + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" + ); +} + +#define HAS_SCALEROWDOWN38_NEON +static uvec8 kShuf38 = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +static uvec8 kShuf38_2 = + { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; +static vec16 kMult38_Div6 = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +static vec16 kMult38_Div9 = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(3) + "vld1.8 {q3}, [%3] \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + MEMACCESS(1) + "vst1.8 {d4}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d5[0]}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride * 2; + + asm volatile ( + MEMACCESS(5) + "vld1.16 {q13}, [%5] \n" + MEMACCESS(6) + "vld1.8 {q14}, [%6] \n" + MEMACCESS(7) + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + MEMACCESS(4) + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + MEMACCESS(1) + "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d4[0]}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(4) + "vld1.16 {q13}, [%4] \n" + MEMACCESS(5) + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + ".p2align 2 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + MEMACCESS(3) + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + MEMACCESS(1) + "vst1.8 {d3}, [%1]! \n" + MEMACCESS(1) + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" + ); +} + +void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + const uint8* src_tmp = NULL; + asm volatile ( + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + "mov r12, %5 \n" + "veor q2, q2, q2 \n" + "veor q3, q3, q3 \n" + "2: \n" + // load 16 pixels into q0 + MEMACCESS(0) + "vld1.8 {q0}, [%0], %3 \n" + "vaddw.u8 q3, q3, d1 \n" + "vaddw.u8 q2, q2, d0 \n" + "subs r12, r12, #1 \n" + "bgt 2b \n" + MEMACCESS(2) + "vst1.16 {q2, q3}, [%2]! 
\n" // store pixels + "add %1, %1, #16 \n" + "subs %4, %4, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "+r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" + +void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8* src_tmp = src_ptr; + asm volatile ( + ".p2align 2 \n" + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q3, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "vadd.s32 q1, q1, q0 \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "vadd.s32 q2, q1, q3 \n" + "vshl.i32 q0, q3, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "vmov q10, q1 \n" + "vmov q11, q2 \n" + "vuzp.16 q10, q11 \n" + "vmovl.u8 q8, d6 \n" + "vmovl.u8 q9, d7 \n" + "vsubl.s16 q11, d18, d16 \n" + "vsubl.s16 q12, d19, d17 \n" + "vmovl.u16 q13, d20 \n" + "vmovl.u16 q10, d21 \n" + "vmul.s32 q11, q11, q13 \n" + "vmul.s32 q12, q12, q10 \n" + "vshrn.s32 d18, q11, #16 \n" + "vshrn.s32 d19, q12, #16 \n" + "vadd.s16 q8, q8, q9 \n" + "vmovn.s16 d6, q8 \n" + + MEMACCESS(0) + "vst1.8 {d6}, [%0]! \n" // store pixels + "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q2, q2, q0 \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13" + ); +} + +#undef LOAD2_DATA8_LANE + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) + "vld1.8 {q0}, [%2]! 
\n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + MEMACCESS(1) + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + MEMACCESS(0) + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" + ); +} + +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS(0) + "vld2.32 {q0, q1}, [%0]! \n" + MEMACCESS(0) + "vld2.32 {q2, q3}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "vst1.8 {q1}, [%1]! \n" // store odd pixels + MEMACCESS(1) + "vst1.8 {q3}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #1 \n" + "vrshrn.u16 d2, q2, #1 \n" + "vrshrn.u16 d3, q3, #1 \n" + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + MEMACCESS(1) + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + MEMACCESS(2) + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} + +// Reads 4 pixels at a time. 
+// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %3, lsl #2 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.32 {d0[0]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d0[1]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d1[0]}, [%0], r12 \n" + MEMACCESS(0) + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + MEMACCESS(1) + "vld1.8 {d1}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d2}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d3}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d4}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d5}, [%1], r12 \n" + MEMACCESS(0) + "vld1.8 {d6}, [%0], r12 \n" + MEMACCESS(1) + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + MEMACCESS(2) + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(dn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "vld1.32 {"#dn"["#n"]}, [%6] \n" + +void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + int tmp = 0; + const uint8* src_tmp = src_argb; + asm volatile ( + ".p2align 2 \n" + "1: \n" + LOAD1_DATA32_LANE(d0, 0) + LOAD1_DATA32_LANE(d0, 1) + LOAD1_DATA32_LANE(d1, 0) + LOAD1_DATA32_LANE(d1, 1) + LOAD1_DATA32_LANE(d2, 0) + LOAD1_DATA32_LANE(d2, 1) + LOAD1_DATA32_LANE(d3, 0) + LOAD1_DATA32_LANE(d3, 1) + + MEMACCESS(0) + "vst1.32 {q0, q1}, [%0]! 
\n" // store pixels + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1" + ); +} + +#undef LOAD1_DATA32_LANE + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA32_LANE(dn1, dn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8* src_tmp = src_argb; + asm volatile ( + ".p2align 2 \n" + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q9, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + "vmov.i8 q3, #0x7f \n" // 0x7F + "vmov.i16 q15, #0x7f \n" // 0x7F + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "vadd.s32 q8, q1, q0 \n" + "1: \n" + // d0, d1: a + // d2, d3: b + LOAD2_DATA32_LANE(d0, d2, 0) + LOAD2_DATA32_LANE(d0, d2, 1) + LOAD2_DATA32_LANE(d1, d3, 0) + LOAD2_DATA32_LANE(d1, d3, 1) + "vshrn.i32 d22, q8, #9 \n" + "vand.16 d22, d22, d30 \n" + "vdup.8 d24, d22[0] \n" + "vdup.8 d25, d22[2] \n" + "vdup.8 d26, d22[4] \n" + "vdup.8 d27, d22[6] \n" + "vext.8 d4, d24, d25, #4 \n" + "vext.8 d5, d26, d27, #4 \n" // f + "veor.8 q10, q2, q3 \n" // 0x7f ^ f + "vmull.u8 q11, d0, d20 \n" + "vmull.u8 q12, d1, d21 \n" + "vmull.u8 q13, d2, d4 \n" + "vmull.u8 q14, d3, d5 \n" + "vadd.i16 q11, q11, q13 \n" + "vadd.i16 q12, q12, q14 \n" + "vshrn.i16 d0, q11, #7 \n" + "vshrn.i16 d1, q12, #7 \n" + + MEMACCESS(0) + "vst1.32 {d0, d1}, [%0]! \n" // store pixels + "vadd.s32 q8, q8, q9 \n" + "subs %2, %2, #4 \n" // 4 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#undef LOAD2_DATA32_LANE + +#endif // defined(__ARM_NEON__) && !defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/scale_neon64.cc b/libs/libaom/src/third_party/libyuv/source/scale_neon64.cc new file mode 100644 index 000000000..1d5519357 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_neon64.cc @@ -0,0 +1,1042 @@ +/* + * Copyright 2014 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC Neon armv8 64 bit. +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +// Read 32x1 throw away even pixels, and write 16x1. 
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + // load even pixels into v0, odd into v1 + MEMACCESS(0) + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + MEMACCESS(1) + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} + +// Read 32x1 average down and write 16x1. +void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc + "subs %w2, %w2, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // add adjacent + "uaddlp v1.8h, v1.16b \n" + "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack + "rshrn2 v0.16b, v1.8h, #1 \n" + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} + +// Read 32x2 average down and write 16x1. +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc + MEMACCESS(1) + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "uaddlp v1.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + MEMACCESS(1) + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride; + const uint8* src_ptr2 = src_ptr + src_stride * 2; + const uint8* src_ptr3 = src_ptr + src_stride * 3; +asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + MEMACCESS(3) + "ld1 {v1.16b}, [%2], #16 \n" + MEMACCESS(4) + "ld1 {v2.16b}, [%3], #16 \n" + MEMACCESS(5) + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v0.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" + "uadalp v0.8h, v3.16b \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + MEMACCESS(1) + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(src_ptr2), // %3 + "+r"(src_ptr3), // %4 + "+r"(dst_width) // %5 + : + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +// Down scale from 4 to 3 pixels. 
Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. +void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 + MEMACCESS(1) + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + MEMACCESS(3) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" + + // 3 * line_0 + line_1 + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" + + // (3 * line_0 + line_1) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" + + MEMACCESS(1) + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", + "v20", "memory", "cc" + ); +} + +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + MEMACCESS(3) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + // average src line 0 with src line 1 + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" + + MEMACCESS(1) + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" + ); +} + +static uvec8 kShuf38 = + { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; +static uvec8 kShuf38_2 = + { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 }; +static 
vec16 kMult38_Div6 = + { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; +static vec16 kMult38_Div9 = + { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + MEMACCESS(3) + "ld1 {v3.16b}, [%3] \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + MEMACCESS(1) + "st1 {v2.8b}, [%1], #8 \n" + MEMACCESS(1) + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "v0", "v1", "v2", "v3", "memory", "cc" + ); +} + +// 32x3 -> 12x1 +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + const uint8* src_ptr1 = src_ptr + src_stride * 2; + ptrdiff_t tmp_src_stride = src_stride; + + asm volatile ( + MEMACCESS(5) + "ld1 {v29.8h}, [%5] \n" + MEMACCESS(6) + "ld1 {v30.16b}, [%6] \n" + MEMACCESS(7) + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + MEMACCESS(3) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + MEMACCESS(4) + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" + + // combine source lines + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" + + // combine source lines + "add v0.8h, v0.8h, v16.8h \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + + MEMACCESS(1) + "st1 {v3.8b}, [%1], #8 \n" + MEMACCESS(1) + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(src_ptr1), // %3 + "+r"(dst_width) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", + "v30", "v31", "memory", "cc" + ); +} + +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + // TODO(fbarchard): use src_stride directly for clang 3.5+. + ptrdiff_t tmp_src_stride = src_stride; + asm volatile ( + MEMACCESS(4) + "ld1 {v30.8h}, [%4] \n" + MEMACCESS(5) + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + MEMACCESS(0) + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + MEMACCESS(3) + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + + // combine source lines + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "uqrshrn v2.8b, v2.8h, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + + // combine source lines + "uaddl v0.8h, v0.8b, v4.8b \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + + MEMACCESS(1) + "st1 {v3.8b}, [%1], #8 \n" + MEMACCESS(1) + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(dst_width) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v30", "v31", "memory", "cc" + ); +} + +void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + const uint8* src_tmp = NULL; + asm volatile ( + "1: \n" + "mov %0, %1 \n" + "mov w12, %w5 \n" + "eor v2.16b, v2.16b, v2.16b \n" + "eor v3.16b, v3.16b, v3.16b \n" + "2: \n" + // load 16 pixels into q0 + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %3 \n" + "uaddw2 v3.8h, v3.8h, v0.16b \n" + "uaddw v2.8h, v2.8h, v0.8b \n" + "subs w12, w12, #1 \n" + "b.gt 2b \n" + MEMACCESS(2) + "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels + "add %1, %1, #16 \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "b.gt 1b \n" + : "+r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "ld2 {v4.b, v5.b}["#n"], [%6] \n" + +void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8* src_tmp = src_ptr; + int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 
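+  // x and dx are 16.16 fixed point: x >> 16 indexes the source pixel and the
+  // low 16 bits are the blend weight used by the vector code below. A scalar
+  // sketch of one output pixel (illustrative only):
+  //   int a = src_ptr[x >> 16], b = src_ptr[(x >> 16) + 1];
+  //   *dst_ptr++ = (uint8)(a + (((b - a) * (x & 0xffff)) >> 16)); x += dx;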
+ int64 x64 = (int64) x; + int64 dx64 = (int64) dx; + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v3.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v1.4s, v1.4s, v0.4s \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "add v2.4s, v1.4s, v3.4s \n" + "shl v0.4s, v3.4s, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "mov v6.16b, v1.16b \n" + "mov v7.16b, v2.16b \n" + "uzp1 v6.8h, v6.8h, v7.8h \n" + "ushll v4.8h, v4.8b, #0 \n" + "ushll v5.8h, v5.8b, #0 \n" + "ssubl v16.4s, v5.4h, v4.4h \n" + "ssubl2 v17.4s, v5.8h, v4.8h \n" + "ushll v7.4s, v6.4h, #0 \n" + "ushll2 v6.4s, v6.8h, #0 \n" + "mul v16.4s, v16.4s, v7.4s \n" + "mul v17.4s, v17.4s, v6.4s \n" + "shrn v6.4h, v16.4s, #16 \n" + "shrn2 v6.8h, v17.4s, #16 \n" + "add v4.8h, v4.8h, v6.8h \n" + "xtn v4.8b, v4.8h \n" + + MEMACCESS(0) + "st1 {v4.8b}, [%0], #8 \n" // store pixels + "add v1.4s, v1.4s, v0.4s \n" + "add v2.4s, v2.4s, v0.4s \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width64), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v16", "v17" + ); +} + +#undef LOAD2_DATA8_LANE + +// 16x2 -> 16x1 +void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { + int y_fraction = 256 - source_y_fraction; + asm volatile ( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" + + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" + // General purpose row blend. + "1: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ "100: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + MEMACCESS(0) + "st1 {v0.b}[15], [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction),// %4 + "+r"(y_fraction) // %5 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" + ); +} + +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + // load even pixels into q0, odd into q1 + MEMACCESS (0) + "ld2 {v0.4s, v1.4s}, [%0], #32 \n" + MEMACCESS (0) + "ld2 {v2.4s, v3.4s}, [%0], #32 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + MEMACCESS (1) + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + MEMACCESS (1) + "st1 {v3.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r" (src_ptr), // %0 + "+r" (dst), // %1 + "+r" (dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS (0) + // load 8 ARGB pixels. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack + "rshrn v1.8b, v1.8h, #1 \n" + "rshrn v2.8b, v2.8h, #1 \n" + "rshrn v3.8b, v3.8h, #1 \n" + MEMACCESS (1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + MEMACCESS (0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + MEMACCESS (1) + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + MEMACCESS (2) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r" (src_ptr), // %0 + "+r" (src_stride), // %1 + "+r" (dst), // %2 + "+r" (dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. 
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.s}[0], [%0], %3 \n" + MEMACCESS(0) + "ld1 {v0.s}[1], [%0], %3 \n" + MEMACCESS(0) + "ld1 {v0.s}[2], [%0], %3 \n" + MEMACCESS(0) + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((int64)(src_stepx * 4)) // %3 + : "memory", "cc", "v0" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +// TODO(Yang Zhang): Might be worth another optimization pass in future. +// It could be upgraded to 8 pixels at a time to start with. +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "add %1, %1, %0 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 + MEMACCESS(1) + "ld1 {v1.8b}, [%1], %4 \n" + MEMACCESS(0) + "ld1 {v2.8b}, [%0], %4 \n" + MEMACCESS(1) + "ld1 {v3.8b}, [%1], %4 \n" + MEMACCESS(0) + "ld1 {v4.8b}, [%0], %4 \n" + MEMACCESS(1) + "ld1 {v5.8b}, [%1], %4 \n" + MEMACCESS(0) + "ld1 {v6.8b}, [%0], %4 \n" + MEMACCESS(1) + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. + MEMACCESS(2) + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"((int64)(src_stepx * 4)) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(vn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "ld1 {"#vn".s}["#n"], [%6] \n" + +void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint8* src_tmp = src_argb; + int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 
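+  // Point sampling in 16.16 fixed point: each LOAD1_DATA32_LANE below reads
+  // the 32-bit ARGB pixel at src_argb + (x >> 16) * 4, then steps x += dx.
+  // A scalar sketch of one output pixel (illustrative only):
+  //   ((uint32*)dst_argb)[i] = *(const uint32*)(src_argb + (x >> 16) * 4);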
+ int64 x64 = (int64) x; + int64 dx64 = (int64) dx; + int64 tmp64 = 0; + asm volatile ( + "1: \n" + LOAD1_DATA32_LANE(v0, 0) + LOAD1_DATA32_LANE(v0, 1) + LOAD1_DATA32_LANE(v0, 2) + LOAD1_DATA32_LANE(v0, 3) + LOAD1_DATA32_LANE(v1, 0) + LOAD1_DATA32_LANE(v1, 1) + LOAD1_DATA32_LANE(v1, 2) + LOAD1_DATA32_LANE(v1, 3) + + MEMACCESS(0) + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width64), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp64), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1" + ); +} + +#undef LOAD1_DATA32_LANE + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA32_LANE(vn1, vn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8* src_tmp = src_argb; + int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. + int64 x64 = (int64) x; + int64 dx64 = (int64) dx; + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v6.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + "movi v3.16b, #0x7f \n" // 0x7F + "movi v4.8h, #0x7f \n" // 0x7F + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v5.4s, v1.4s, v0.4s \n" + "1: \n" + // d0, d1: a + // d2, d3: b + LOAD2_DATA32_LANE(v0, v1, 0) + LOAD2_DATA32_LANE(v0, v1, 1) + LOAD2_DATA32_LANE(v0, v1, 2) + LOAD2_DATA32_LANE(v0, v1, 3) + "shrn v2.4h, v5.4s, #9 \n" + "and v2.8b, v2.8b, v4.8b \n" + "dup v16.8b, v2.b[0] \n" + "dup v17.8b, v2.b[2] \n" + "dup v18.8b, v2.b[4] \n" + "dup v19.8b, v2.b[6] \n" + "ext v2.8b, v16.8b, v17.8b, #4 \n" + "ext v17.8b, v18.8b, v19.8b, #4 \n" + "ins v2.d[1], v17.d[0] \n" // f + "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f + "umull v16.8h, v0.8b, v7.8b \n" + "umull2 v17.8h, v0.16b, v7.16b \n" + "umull v18.8h, v1.8b, v2.8b \n" + "umull2 v19.8h, v1.16b, v2.16b \n" + "add v16.8h, v16.8h, v18.8h \n" + "add v17.8h, v17.8h, v19.8h \n" + "shrn v0.8b, v16.8h, #7 \n" + "shrn2 v0.16b, v17.8h, #7 \n" + + MEMACCESS(0) + "st1 {v0.4s}, [%0], #16 \n" // store pixels + "add v5.4s, v5.4s, v6.4s \n" + "subs %w2, %w2, #4 \n" // 4 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width64), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v16", "v17", "v18", "v19" + ); +} + +#undef LOAD2_DATA32_LANE + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/scale_win.cc b/libs/libaom/src/third_party/libyuv/source/scale_win.cc new file mode 100644 index 000000000..c3896ebad --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/scale_win.cc @@ -0,0 +1,1354 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Visual C x86. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + defined(_MSC_VER) && !defined(__clang__) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf2 = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +static uvec8 kShuf01 = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static uvec8 kShuf11 = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf21 = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +static uvec8 kMadd01 = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +static uvec8 kMadd11 = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +static uvec8 kMadd21 = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +static vec16 kRound34 = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static uvec8 kShuf38a = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +static uvec8 kShuf38b = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +static uvec8 kShufAc = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +static uvec8 kShufAc3 = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +static uvec16 kScaleAc33 = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb0 = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb1 = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb2 = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +static uvec16 kScaleAb2 = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; + +// Reads 32 pixels, throws half away and writes 16 pixels. +__declspec(naked) +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. 
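+    // psrlw keeps the odd-numbered byte of each 16-bit pair; packuswb below
+    // re-narrows both registers, so every second pixel survives.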
+ psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +__declspec(naked) +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +__declspec(naked) +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + pop esi + ret + } +} + +#ifdef HAS_SCALEROWDOWN2_AVX2 +// Reads 64 pixels, throws half away and writes 32 pixels. +__declspec(naked) +void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // isolate odd pixels. + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x1 rectangle to 32x1. +__declspec(naked) +void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpsrlw ymm4, ymm4, 15 + vpackuswb ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 // constant 0 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + + vpmaddubsw ymm0, ymm0, ymm4 // average horizontally + vpmaddubsw ymm1, ymm1, ymm4 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x2 rectangle to 32x1. 
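+// As in the linear version above, vpmaddubsw against the 0x01 byte vector
+// sums each adjacent pixel pair, and vpavgw against zero then produces the
+// rounded average (a + b + 1) >> 1; here it runs after the row average.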
+__declspec(naked) +void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpsrlw ymm4, ymm4, 15 + vpackuswb ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 // constant 0 + + wloop: + vmovdqu ymm0, [eax] // average rows + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + + vpmaddubsw ymm0, ymm0, ymm4 // average horizontally + vpmaddubsw ymm1, ymm1, ymm4 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_SCALEROWDOWN2_AVX2 + +// Point samples 32 pixels to 8 pixels. +__declspec(naked) +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + psrld xmm5, 24 + pslld xmm5, 16 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm0, 8 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg wloop + + ret + } +} + +// Blends 32x4 rectangle to 8x1. +__declspec(naked) +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqu xmm0, [eax] // average rows + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + movdqu xmm2, [eax + esi * 2] + movdqu xmm3, [eax + esi * 2 + 16] + movdqu xmm4, [eax + edi] + movdqu xmm5, [eax + edi + 16] + lea eax, [eax + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + sub ecx, 8 + jg wloop + + pop edi + pop esi + ret + } +} + +#ifdef HAS_SCALEROWDOWN4_AVX2 +// Point samples 64 pixels to 16 pixels. 
+__declspec(naked) +void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 + vpsrld ymm5, ymm5, 24 + vpslld ymm5, ymm5, 16 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x4 rectangle to 16x1. +__declspec(naked) +void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff + vpsrlw ymm7, ymm7, 8 + + wloop: + vmovdqu ymm0, [eax] // average rows + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vmovdqu ymm2, [eax + esi * 2] + vmovdqu ymm3, [eax + esi * 2 + 32] + vpavgb ymm2, ymm2, [eax + edi] + vpavgb ymm3, ymm3, [eax + edi + 32] + lea eax, [eax + 64] + vpavgb ymm0, ymm0, ymm2 + vpavgb ymm1, ymm1, ymm3 + + vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels) + vpand ymm3, ymm1, ymm7 + vpsrlw ymm0, ymm0, 8 + vpsrlw ymm1, ymm1, 8 + vpavgw ymm0, ymm0, ymm2 + vpavgw ymm1, ymm1, ymm3 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels) + vpsrlw ymm0, ymm0, 8 + vpavgw ymm0, ymm0, ymm2 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vmovdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_SCALEROWDOWN4_AVX2 + +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +__declspec(naked) +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm3, kShuf0 + movdqa xmm4, kShuf1 + movdqa xmm5, kShuf2 + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 kRound34 + +// Note that movdqa+palign may be better than movdqu. 
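+// With the kShuf*/kMadd* tables above, kRound34 = 2 and the final psrlw 2,
+// each group of four (row-averaged) source pixels s0..s3 yields:
+//   d0 = (3*s0 + 1*s1 + 2) >> 2
+//   d1 = (2*s1 + 2*s2 + 2) >> 2
+//   d2 = (1*s2 + 3*s3 + 2) >> 2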
+__declspec(naked) +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + wloop: + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + pop esi + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +__declspec(naked) +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + wloop: + movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm1, [eax + esi] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx+24] + sub ecx, 24 + jg wloop + + pop esi + ret + } +} + +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm4, kShuf38a + movdqa xmm5, kShuf38b + + xloop: + movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + lea eax, [eax + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + paddusb xmm0, xmm1 + + movq qword ptr [edx], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edx + 8], xmm1 + lea edx, [edx + 12] + sub ecx, 12 + jg xloop + + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, 
[esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAc + movdqa xmm3, kShufAc3 + movdqa xmm4, kScaleAc33 + pxor xmm5, xmm5 + + xloop: + movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqu xmm6, [eax + esi] + movhlps xmm1, xmm0 + movhlps xmm7, xmm6 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + movdqu xmm6, [eax + esi * 2] + lea eax, [eax + 16] + movhlps xmm7, xmm6 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + pshufb xmm6, xmm2 + + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + pshufb xmm7, xmm3 + paddusw xmm6, xmm7 + + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + packuswb xmm6, xmm6 + + movd [edx], xmm6 // write 6 pixels + psrlq xmm6, 16 + movd [edx + 2], xmm6 + lea edx, [edx + 6] + sub ecx, 6 + jg xloop + + pop esi + ret + } +} + +// Scale 16x2 pixels to 6x1 with interpolation +__declspec(naked) +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAb0 + movdqa xmm3, kShufAb1 + movdqa xmm4, kShufAb2 + movdqa xmm5, kScaleAb2 + + xloop: + movdqu xmm0, [eax] // average 2 rows into xmm0 + movdqu xmm1, [eax + esi] + lea eax, [eax + 16] + pavgb xmm0, xmm1 + + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + pshufb xmm1, xmm2 + movdqa xmm6, xmm0 + pshufb xmm6, xmm3 + paddusw xmm1, xmm6 + pshufb xmm0, xmm4 + paddusw xmm1, xmm0 + + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + packuswb xmm1, xmm1 + + movd [edx], xmm1 // write 6 pixels + psrlq xmm1, 16 + movd [edx + 2], xmm1 + lea edx, [edx + 6] + sub ecx, 6 + jg xloop + + pop esi + ret + } +} + +// Reads 16 bytes and accumulates to 16 shorts at a time. +__declspec(naked) +void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { + __asm { + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + pxor xmm5, xmm5 + + // sum rows + xloop: + movdqu xmm3, [eax] // read 16 bytes + lea eax, [eax + 16] + movdqu xmm0, [edx] // read 16 words from destination + movdqu xmm1, [edx + 16] + movdqa xmm2, xmm3 + punpcklbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + paddusw xmm0, xmm2 // sum 16 words + paddusw xmm1, xmm3 + movdqu [edx], xmm0 // write 16 words to destination + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 16 + jg xloop + ret + } +} + +#ifdef HAS_SCALEADDROW_AVX2 +// Reads 32 bytes and accumulates to 32 shorts at a time. 
+__declspec(naked) +void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { + __asm { + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + vpxor ymm5, ymm5, ymm5 + + // sum rows + xloop: + vmovdqu ymm3, [eax] // read 32 bytes + lea eax, [eax + 32] + vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck + vpunpcklbw ymm2, ymm3, ymm5 + vpunpckhbw ymm3, ymm3, ymm5 + vpaddusw ymm0, ymm2, [edx] // sum 16 words + vpaddusw ymm1, ymm3, [edx + 32] + vmovdqu [edx], ymm0 // write 32 words to destination + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 32 + jg xloop + + vzeroupper + ret + } +} +#endif // HAS_SCALEADDROW_AVX2 + +// Bilinear column filtering. SSSE3 version. +__declspec(naked) +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + __asm { + push ebx + push esi + push edi + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width + movd xmm2, [esp + 12 + 16] // x + movd xmm3, [esp + 12 + 20] // dx + mov eax, 0x04040000 // shuffle to line up fractions with pixel. + movd xmm5, eax + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm1, 9 // 7 bit fractions. + movzx ebx, word ptr [esi + edx] // 2 source x1 pixels + movd xmm4, ebx + pshufb xmm1, xmm5 // 0011 + punpcklwd xmm0, xmm4 + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits, 2 pixels. + movd ebx, xmm0 + mov [edi], bx + lea edi, [edi + 2] + sub ecx, 2 // 2 pixels + jge xloop2 + + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // 16 bit + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits + movd ebx, xmm0 + mov [edi], bl + + xloop99: + + pop edi + pop esi + pop ebx + ret + } +} + +// Reads 16 pixels, duplicates them and writes 32 pixels. 
+__declspec(naked) +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + + wloop: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + ret + } +} + +// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) +__declspec(naked) +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + shufps xmm0, xmm1, 0xdd + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + ret + } +} + +// Blends 8x1 rectangle to 4x1. +__declspec(naked) +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + ret + } +} + +// Blends 8x2 rectangle to 4x1. +__declspec(naked) +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width + + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop esi + ret + } +} + +// Reads 4 pixels at a time. +__declspec(naked) +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push edi + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + wloop: + movd xmm0, [eax] + movd xmm1, [eax + ebx] + punpckldq xmm0, xmm1 + movd xmm2, [eax + ebx * 2] + movd xmm3, [eax + edi] + lea eax, [eax + ebx * 4] + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop edi + pop ebx + ret + } +} + +// Blends four 2x2 to 4x1. 
+__declspec(naked) +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + wloop: + movq xmm0, qword ptr [eax] // row0 4 pairs + movhps xmm0, qword ptr [eax + ebx] + movq xmm1, qword ptr [eax + ebx * 2] + movhps xmm1, qword ptr [eax + edi] + lea eax, [eax + ebx * 4] + movq xmm2, qword ptr [esi] // row1 4 pairs + movhps xmm2, qword ptr [esi + ebx] + movq xmm3, qword ptr [esi + ebx * 2] + movhps xmm3, qword ptr [esi + edi] + lea esi, [esi + ebx * 4] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + jg wloop + + pop edi + pop esi + pop ebx + ret + } +} + +// Column scaling unfiltered. SSE2 version. +__declspec(naked) +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push edi + push esi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + paddd xmm2, xmm0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. + + cmp ecx, 0 + jle xloop99 + sub ecx, 4 + jl xloop49 + + // 4 Pixel loop. + xloop4: + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 + + movd xmm1, [esi + eax * 4] // 1 source x2 pixels + movd xmm4, [esi + edx * 4] // 1 source x3 pixels + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 // 4 pixels + jge xloop4 + + xloop49: + test ecx, 2 + je xloop29 + + // 2 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + + xloop29: + test ecx, 1 + je xloop99 + + // 1 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x2 pixels + movd dword ptr [edi], xmm0 + xloop99: + + pop esi + pop edi + ret + } +} + +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 
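+// x and dx are 16.16 fixed point: word 1 of each 32-bit lane holds the
+// integer source index, and the top 7 bits of the fraction f (psrlw 9)
+// weight the two source pixels, so each channel is effectively
+//   dst = (src[xi] * (127 - f) + src[xi + 1] * f) >> 7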
+// TODO(fbarchard): Port to Neon + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +__declspec(naked) +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + movdqa xmm4, kShuffleColARGB + movdqa xmm5, kShuffleFractions + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + psrlw xmm1, 9 // 7 bit fractions. + movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 2 // 2 pixels + jge xloop2 + + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + psrlw xmm0, 7 + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + movd [edi], xmm0 + + xloop99: + + pop edi + pop esi + ret + } +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +__declspec(naked) +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width + + wloop: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpckldq xmm0, xmm0 + punpckhdq xmm1, xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg wloop + + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) +int FixedDiv_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + idiv dword ptr [esp + 8] + ret + } +} + +// Divide num by div and return as 16.16 fixed point result. 
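+// Equivalent C for FixedDiv1 (a sketch; int64 is libyuv's 64-bit type):
+//   return (int)((((int64)num << 16) - 0x00010001) / (div - 1));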
+__declspec(naked) +int FixedDiv1_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + mov ecx, [esp + 8] // denom + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + sub eax, 0x00010001 + sbb edx, 0 + sub ecx, 1 + idiv ecx + ret + } +} +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/libs/libaom/src/third_party/libyuv/source/video_common.cc b/libs/libaom/src/third_party/libyuv/source/video_common.cc new file mode 100644 index 000000000..379a0669a --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/video_common.cc @@ -0,0 +1,64 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "libyuv/video_common.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0])) + +struct FourCCAliasEntry { + uint32 alias; + uint32 canonical; +}; + +static const struct FourCCAliasEntry kFourCCAliases[] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, // deprecated. + {FOURCC_RGB3, FOURCC_RAW }, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +}; +// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. +// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA + +LIBYUV_API +uint32 CanonicalFourCC(uint32 fourcc) { + int i; + for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) { + if (kFourCCAliases[i].alias == fourcc) { + return kFourCCAliases[i].canonical; + } + } + // Not an alias, so return it as-is. + return fourcc; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/libs/libaom/src/third_party/libyuv/source/x86inc.asm b/libs/libaom/src/third_party/libyuv/source/x86inc.asm new file mode 100644 index 000000000..cb5c32df3 --- /dev/null +++ b/libs/libaom/src/third_party/libyuv/source/x86inc.asm @@ -0,0 +1,1136 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2012 x264 project +;* +;* Authors: Loren Merritt +;* Anton Mitrofanov +;* Jason Garrett-Glaser +;* Henrik Gramner +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. 
+;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +; Local changes for libyuv: +; remove %define program_name and references in labels +; rename cpus to uppercase + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; Name of the .rodata section. +; Kludge: Something on OS X fails to align .rodata even given an align attribute, +; so use a different read-only section. +%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,macho64 + SECTION .text align=%1 + %elifidn __OUTPUT_FORMAT__,macho + SECTION .text align=%1 + fakegot: + %elifidn __OUTPUT_FORMAT__,aout + section .text + %else + SECTION .rodata align=%1 + %endif +%endmacro + +; aout does not support align= +%macro SECTION_TEXT 0-1 16 + %ifidn __OUTPUT_FORMAT__,aout + SECTION .text + %else + SECTION .text align=%1 + %endif +%endmacro + +%if WIN64 + %define PIC +%elif ARCH_X86_64 == 0 +; x86_32 doesn't require PIC. +; Some distros prefer shared objects to be PIC, but nothing breaks if +; the code contains a few textrels, so we'll skip that complexity. + %undef PIC +%endif +%ifdef PIC + default rel +%endif + +; Always use long nops (reduces 0x90 spam in disassembly on x86_32) +CPU amdnop + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,0, dst, src, tmp +; declares a function (foo), taking two args (dst and src) and one local variable (tmp) + +; TODO Some functions can use some args directly from the stack. 
If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons +; which are slow when a normal ret follows a branch. + +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rsp + stack_offset + %3] + %define r%1mp qword r %+ %1m + %else + %define r%1m [esp + stack_offset + %3] + %define r%1mp dword r %+ %1m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 +%if ARCH_X86_64 == 0 + %define r%1 e%1 +%endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %assign stack_offset stack_offset+gprsize +%endmacro + +%macro POP 1 + pop %1 + %assign stack_offset stack_offset-gprsize +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rsp + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assert failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 + %xdefine %1q r 
%+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + %if mmsize == 8 + %assign xmm_regs_used 0 + %else + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS %4 +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 6 + SUB rsp, (xmm_regs_used-6)*16+16 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i + %endrep + %endif +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %if xmm_regs_used > 6 + %assign %%i xmm_regs_used + %rep (xmm_regs_used-6) + %assign %%i %%i-1 + movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] + %endrep + add %1, (xmm_regs_used-6)*16+16 + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS %4 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 + +%macro RET 0 + POP_IF_USED 14, 13, 12, 11, 10, 9 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [esp + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + %if regs_used > 7 + %assign regs_used 7 + %endif + ASSERT regs_used >= num_args + PUSH_IF_USED 3, 4, 5, 6 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS %4 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 + +%macro RET 0 + POP_IF_USED 6, 5, 4, 3 +%if mmsize == 32 + vzeroupper +%endif + ret +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 +%macro WIN64_SPILL_XMM 1 +%endmacro +%macro WIN64_RESTORE_XMM 1 +%endmacro +%endif + +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif +%endmacro + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +%macro cglobal 1-2+ ; name, [PROLOGUE args] +%if %0 == 1 + cglobal_internal %1 %+ SUFFIX +%else + cglobal_internal %1 %+ SUFFIX, %2 +%endif +%endmacro +%macro cglobal_internal 1-2+ + %ifndef cglobaled_%1 + %xdefine %1 mangle(%1) + %xdefine %1.skip_prologue %1 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %1, 1 + %endif + %xdefine current_function %1 + %ifidn __OUTPUT_FORMAT__,elf + global %1:function hidden + %else + global %1 + %endif + align function_align + %1: + RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer + %assign stack_offset 0 + %if %0 > 1 + PROLOGUE %2 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %xdefine %1 mangle(%1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 2+ + %xdefine %1 mangle(%1) + global %1 + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is +; executable by default. 
+%ifidn __OUTPUT_FORMAT__,elf +SECTION .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf32 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif +%ifidn __OUTPUT_FORMAT__,elf64 +section .note.GNU-stack noalloc noexec nowrite progbits +%endif + +; cpuflags + +%assign cpuflags_MMX (1<<0) +%assign cpuflags_MMX2 (1<<1) | cpuflags_MMX +%assign cpuflags_3dnow (1<<2) | cpuflags_MMX +%assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow +%assign cpuflags_SSE (1<<4) | cpuflags_MMX2 +%assign cpuflags_SSE2 (1<<5) | cpuflags_SSE +%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2 +%assign cpuflags_SSE3 (1<<7) | cpuflags_SSE2 +%assign cpuflags_SSSE3 (1<<8) | cpuflags_SSE3 +%assign cpuflags_SSE4 (1<<9) | cpuflags_SSSE3 +%assign cpuflags_SSE42 (1<<10)| cpuflags_SSE4 +%assign cpuflags_AVX (1<<11)| cpuflags_SSE42 +%assign cpuflags_xop (1<<12)| cpuflags_AVX +%assign cpuflags_fma4 (1<<13)| cpuflags_AVX +%assign cpuflags_AVX2 (1<<14)| cpuflags_AVX +%assign cpuflags_fma3 (1<<15)| cpuflags_AVX + +%assign cpuflags_cache32 (1<<16) +%assign cpuflags_cache64 (1<<17) +%assign cpuflags_slowctz (1<<18) +%assign cpuflags_lzcnt (1<<19) +%assign cpuflags_misalign (1<<20) +%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<22) +%assign cpuflags_bmi1 (1<<23) +%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1 +%assign cpuflags_tbm (1<<25)|cpuflags_bmi1 + +%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) +%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) + +; Takes up to 2 cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. 
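+; e.g. "INIT_XMM SSE2" invokes INIT_CPUFLAGS SSE2, which defines
+; SUFFIX = _SSE2 and sets cpuflags so that cpuflag(SSE2) is true (and
+; cpuflag(AVX) false) for the functions that follow.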
+%macro INIT_CPUFLAGS 0-2 + %if %0 >= 1 + %xdefine cpuname %1 + %assign cpuflags cpuflags_%1 + %if %0 >= 2 + %xdefine cpuname %1_%2 + %assign cpuflags cpuflags | cpuflags_%2 + %endif + %xdefine SUFFIX _ %+ cpuname + %if cpuflag(AVX) + %assign AVX_enabled 1 + %endif + %if mmsize == 16 && notcpuflag(SSE2) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elifidn %1, SSE3 + %define movu lddqu + %endif + %else + %xdefine SUFFIX + %undef cpuname + %undef cpuflags + %endif +%endmacro + +; merge MMX and SSE* + +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign AVX_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign AVX_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign AVX_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova vmovaps + %define movu vmovups + %undef movh + %define movnta vmovntps + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap +%rep %0/2 + %xdefine tmp%2 m%2 + %xdefine ntmp%2 nm%2 + %rotate 2 +%endrep +%rep %0/2 + %xdefine m%1 tmp%2 + %xdefine nm%1 ntmp%2 + %undef tmp%2 + %undef ntmp%2 + %rotate 2 +%endrep +%endmacro + +%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) +%rep %0-1 +%ifdef m%1 + %xdefine tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 tmp + CAT_XDEFINE n, m%1, %1 + CAT_XDEFINE n, m%2, %2 +%else + ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. + ; Be careful using this mode in nested macros though, as in some cases there may be + ; other copies of m# that have already been dereferenced and don't get updated correctly. 
+ %xdefine %%n1 n %+ %1 + %xdefine %%n2 n %+ %2 + %xdefine tmp m %+ %%n1 + CAT_XDEFINE m, %%n1, m %+ %%n2 + CAT_XDEFINE m, %%n2, tmp + CAT_XDEFINE n, m %+ %%n1, %%n1 + CAT_XDEFINE n, m %+ %%n2, %%n2 +%endif + %undef tmp + %rotate 1 +%endrep +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. +%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE n, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1, %1 %+ SUFFIX +%endmacro +%macro call_internal 2 + %xdefine %%i %1 + %ifndef cglobaled_%1 + %ifdef cglobaled_%2 + %xdefine %%i %2 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 +%assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-AVX emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) +;%4 == number of operands given +;%5+: operands +%macro RUN_AVX_INSTR 6-7+ + %ifid %6 + %define %%sizeofreg sizeof%6 + %elifid %5 + %define %%sizeofreg sizeof%5 + %else + %define %%sizeofreg mmsize + %endif + %if %%sizeofreg==32 + %if %4>=3 + v%1 %5, %6, %7 + %else + v%1 %5, %6 + %endif + %else + %if %%sizeofreg==8 + %define %%regmov movq + %elif %2 + %define %%regmov movaps + %else + %define %%regmov movdqa + %endif + + %if %4>=3+%3 + %ifnidn %5, %6 + %if AVX_enabled && %%sizeofreg==16 + v%1 %5, %6, %7 + %else + CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7 + %%regmov %5, %6 + %1 %5, %7 + %endif + %else + %1 %5, %7 + %endif + %elif %4>=3 + %1 %5, %6, %7 + %else + %1 %5, %6 + %endif + %endif +%endmacro + +; 3arg AVX ops with a memory arg can only have it in src2, +; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov). +; So, if the op is symmetric and the wrong one is memory, swap them. 
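+; e.g. emulating "paddw m0, m1, [r2]" (symmetric op, memory in src2)
+; becomes "movdqa m0, [r2]" + "paddw m0, m1", letting the mov do the load;
+; under AVX the swap goes the other way, since v-ops only take memory src2.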
+%macro RUN_AVX_INSTR1 8 + %assign %%swap 0 + %if AVX_enabled + %ifnid %6 + %assign %%swap 1 + %endif + %elifnidn %5, %6 + %ifnid %7 + %assign %%swap 1 + %endif + %endif + %if %%swap && %3 == 0 && %8 == 1 + RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6 + %else + RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7 + %endif +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) +;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 4 + %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4 + %ifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +AVX_INSTR addpd, 1, 0, 1 +AVX_INSTR addps, 1, 0, 1 +AVX_INSTR addsd, 1, 0, 1 +AVX_INSTR addss, 1, 0, 1 +AVX_INSTR addsubpd, 1, 0, 0 +AVX_INSTR addsubps, 1, 0, 0 +AVX_INSTR andpd, 1, 0, 1 +AVX_INSTR andps, 1, 0, 1 +AVX_INSTR andnpd, 1, 0, 0 +AVX_INSTR andnps, 1, 0, 0 +AVX_INSTR blendpd, 1, 0, 0 +AVX_INSTR blendps, 1, 0, 0 +AVX_INSTR blendvpd, 1, 0, 0 +AVX_INSTR blendvps, 1, 0, 0 +AVX_INSTR cmppd, 1, 0, 0 +AVX_INSTR cmpps, 1, 0, 0 +AVX_INSTR cmpsd, 1, 0, 0 +AVX_INSTR cmpss, 1, 0, 0 +AVX_INSTR cvtdq2ps, 1, 0, 0 +AVX_INSTR cvtps2dq, 1, 0, 0 +AVX_INSTR divpd, 1, 0, 0 +AVX_INSTR divps, 1, 0, 0 +AVX_INSTR divsd, 1, 0, 0 +AVX_INSTR divss, 1, 0, 0 +AVX_INSTR dppd, 1, 1, 0 +AVX_INSTR dpps, 1, 1, 0 +AVX_INSTR haddpd, 1, 0, 0 +AVX_INSTR haddps, 1, 0, 0 +AVX_INSTR hsubpd, 1, 0, 0 +AVX_INSTR hsubps, 1, 0, 0 +AVX_INSTR maxpd, 1, 0, 1 +AVX_INSTR maxps, 1, 0, 1 +AVX_INSTR maxsd, 1, 0, 1 +AVX_INSTR maxss, 1, 0, 1 +AVX_INSTR minpd, 1, 0, 1 +AVX_INSTR minps, 1, 0, 1 +AVX_INSTR minsd, 1, 0, 1 +AVX_INSTR minss, 1, 0, 1 +AVX_INSTR movhlps, 1, 0, 0 +AVX_INSTR movlhps, 1, 0, 0 +AVX_INSTR movsd, 1, 0, 0 +AVX_INSTR movss, 1, 0, 0 +AVX_INSTR mpsadbw, 0, 1, 0 +AVX_INSTR mulpd, 1, 0, 1 +AVX_INSTR mulps, 1, 0, 1 +AVX_INSTR mulsd, 1, 0, 1 +AVX_INSTR mulss, 1, 0, 1 +AVX_INSTR orpd, 1, 0, 1 +AVX_INSTR orps, 1, 0, 1 +AVX_INSTR pabsb, 0, 0, 0 +AVX_INSTR pabsw, 0, 0, 0 +AVX_INSTR pabsd, 0, 0, 0 +AVX_INSTR packsswb, 0, 0, 0 +AVX_INSTR packssdw, 0, 0, 0 +AVX_INSTR packuswb, 0, 0, 0 +AVX_INSTR packusdw, 0, 0, 0 +AVX_INSTR paddb, 0, 0, 1 +AVX_INSTR paddw, 0, 0, 1 +AVX_INSTR paddd, 0, 0, 1 +AVX_INSTR paddq, 0, 0, 1 +AVX_INSTR paddsb, 0, 0, 1 +AVX_INSTR paddsw, 0, 0, 1 +AVX_INSTR paddusb, 0, 0, 1 +AVX_INSTR paddusw, 0, 0, 1 +AVX_INSTR palignr, 0, 1, 0 +AVX_INSTR pand, 0, 0, 1 +AVX_INSTR pandn, 0, 0, 0 +AVX_INSTR pavgb, 0, 0, 1 +AVX_INSTR pavgw, 0, 0, 1 +AVX_INSTR pblendvb, 0, 0, 0 +AVX_INSTR pblendw, 0, 1, 0 +AVX_INSTR pcmpestri, 0, 0, 0 +AVX_INSTR pcmpestrm, 0, 0, 0 +AVX_INSTR pcmpistri, 0, 0, 0 +AVX_INSTR pcmpistrm, 0, 0, 0 +AVX_INSTR pcmpeqb, 0, 0, 1 +AVX_INSTR pcmpeqw, 0, 0, 1 +AVX_INSTR pcmpeqd, 0, 0, 1 +AVX_INSTR pcmpeqq, 0, 0, 1 +AVX_INSTR pcmpgtb, 0, 0, 0 +AVX_INSTR pcmpgtw, 0, 0, 0 +AVX_INSTR pcmpgtd, 0, 0, 0 +AVX_INSTR pcmpgtq, 0, 0, 0 +AVX_INSTR phaddw, 0, 0, 0 +AVX_INSTR phaddd, 0, 0, 0 +AVX_INSTR phaddsw, 0, 0, 0 +AVX_INSTR phsubw, 0, 0, 0 +AVX_INSTR phsubd, 0, 0, 0 +AVX_INSTR phsubsw, 0, 0, 0 +AVX_INSTR pmaddwd, 0, 0, 1 +AVX_INSTR pmaddubsw, 0, 0, 0 +AVX_INSTR pmaxsb, 0, 0, 1 +AVX_INSTR pmaxsw, 0, 0, 1 +AVX_INSTR pmaxsd, 0, 0, 1 +AVX_INSTR pmaxub, 0, 0, 1 +AVX_INSTR pmaxuw, 0, 0, 1 +AVX_INSTR pmaxud, 0, 0, 1 +AVX_INSTR pminsb, 0, 0, 1 +AVX_INSTR 
pminsw, 0, 0, 1 +AVX_INSTR pminsd, 0, 0, 1 +AVX_INSTR pminub, 0, 0, 1 +AVX_INSTR pminuw, 0, 0, 1 +AVX_INSTR pminud, 0, 0, 1 +AVX_INSTR pmovmskb, 0, 0, 0 +AVX_INSTR pmulhuw, 0, 0, 1 +AVX_INSTR pmulhrsw, 0, 0, 1 +AVX_INSTR pmulhw, 0, 0, 1 +AVX_INSTR pmullw, 0, 0, 1 +AVX_INSTR pmulld, 0, 0, 1 +AVX_INSTR pmuludq, 0, 0, 1 +AVX_INSTR pmuldq, 0, 0, 1 +AVX_INSTR por, 0, 0, 1 +AVX_INSTR psadbw, 0, 0, 1 +AVX_INSTR pshufb, 0, 0, 0 +AVX_INSTR pshufd, 0, 1, 0 +AVX_INSTR pshufhw, 0, 1, 0 +AVX_INSTR pshuflw, 0, 1, 0 +AVX_INSTR psignb, 0, 0, 0 +AVX_INSTR psignw, 0, 0, 0 +AVX_INSTR psignd, 0, 0, 0 +AVX_INSTR psllw, 0, 0, 0 +AVX_INSTR pslld, 0, 0, 0 +AVX_INSTR psllq, 0, 0, 0 +AVX_INSTR pslldq, 0, 0, 0 +AVX_INSTR psraw, 0, 0, 0 +AVX_INSTR psrad, 0, 0, 0 +AVX_INSTR psrlw, 0, 0, 0 +AVX_INSTR psrld, 0, 0, 0 +AVX_INSTR psrlq, 0, 0, 0 +AVX_INSTR psrldq, 0, 0, 0 +AVX_INSTR psubb, 0, 0, 0 +AVX_INSTR psubw, 0, 0, 0 +AVX_INSTR psubd, 0, 0, 0 +AVX_INSTR psubq, 0, 0, 0 +AVX_INSTR psubsb, 0, 0, 0 +AVX_INSTR psubsw, 0, 0, 0 +AVX_INSTR psubusb, 0, 0, 0 +AVX_INSTR psubusw, 0, 0, 0 +AVX_INSTR ptest, 0, 0, 0 +AVX_INSTR punpckhbw, 0, 0, 0 +AVX_INSTR punpckhwd, 0, 0, 0 +AVX_INSTR punpckhdq, 0, 0, 0 +AVX_INSTR punpckhqdq, 0, 0, 0 +AVX_INSTR punpcklbw, 0, 0, 0 +AVX_INSTR punpcklwd, 0, 0, 0 +AVX_INSTR punpckldq, 0, 0, 0 +AVX_INSTR punpcklqdq, 0, 0, 0 +AVX_INSTR pxor, 0, 0, 1 +AVX_INSTR shufps, 1, 1, 0 +AVX_INSTR subpd, 1, 0, 0 +AVX_INSTR subps, 1, 0, 0 +AVX_INSTR subsd, 1, 0, 0 +AVX_INSTR subss, 1, 0, 0 +AVX_INSTR unpckhpd, 1, 0, 0 +AVX_INSTR unpckhps, 1, 0, 0 +AVX_INSTR unpcklpd, 1, 0, 0 +AVX_INSTR unpcklps, 1, 0, 0 +AVX_INSTR xorpd, 1, 0, 1 +AVX_INSTR xorps, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 1, 0, 1 +AVX_INSTR pfsub, 1, 0, 0 +AVX_INSTR pfmul, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif +%assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %else + %6 %1, %2, %3 + %7 %1, %4 + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsdd, pmulld, paddd +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmadcswd, pmaddwd, paddd + +; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. +; This lets us use tzcnt without bumping the yasm version requirement yet. +%define tzcnt rep bsf diff --git a/libs/libaom/src/third_party/vector/LICENSE b/libs/libaom/src/third_party/vector/LICENSE new file mode 100644 index 000000000..afcb9f00a --- /dev/null +++ b/libs/libaom/src/third_party/vector/LICENSE @@ -0,0 +1,19 @@ +The MIT License (MIT) +Copyright (c) 2016 Peter Goldsborough + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/libs/libaom/src/third_party/vector/README.libaom b/libs/libaom/src/third_party/vector/README.libaom new file mode 100644 index 000000000..729446dbc --- /dev/null +++ b/libs/libaom/src/third_party/vector/README.libaom @@ -0,0 +1,16 @@ +Name: vector +URL: https://github.com/goldsborough/vector +Version: commit-id: 40efe82 +License: MIT +License File: LICENSE + +Description: +A feature-complete, generic and customizable resizable +array implementation in pure C that supports almost +the entire C++ std::vector API, including iterators. + +Local Modifications: +1. Renamed some functions to fit in with the AOMedia +naming convention. +2. Removed non-global functions from vector.h. +3. Made all non-global functions in vector.c static. diff --git a/libs/libaom/src/third_party/vector/vector.c b/libs/libaom/src/third_party/vector/vector.c new file mode 100644 index 000000000..4b8b9c6fd --- /dev/null +++ b/libs/libaom/src/third_party/vector/vector.c @@ -0,0 +1,540 @@ +/* +The MIT License (MIT) +Copyright (c) 2016 Peter Goldsborough + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#define __STDC_WANT_LIB_EXT1__ 1 + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#include "third_party/vector/vector.h" + +/***** PRIVATE *****/ +#define MAX(a, b) ((a) > (b) ?
(a) : (b)) + +static bool _vector_should_grow(Vector *vector) { + assert(vector->size <= vector->capacity); + return vector->size == vector->capacity; +} + +static bool _vector_should_shrink(Vector *vector) { + assert(vector->size <= vector->capacity); + return vector->size == vector->capacity * VECTOR_SHRINK_THRESHOLD; +} + +static void *_vector_offset(Vector *vector, size_t index) { + // return vector->data + (index * vector->element_size); + return (unsigned char *)vector->data + (index * vector->element_size); +} + +static const void *_vector_const_offset(const Vector *vector, size_t index) { + // return vector->data + (index * vector->element_size); + return (unsigned char *)vector->data + (index * vector->element_size); +} + +static void _vector_assign(Vector *vector, size_t index, void *element) { + /* Insert the element */ + void *offset = _vector_offset(vector, index); + memcpy(offset, element, vector->element_size); +} + +static int _vector_move_right(Vector *vector, size_t index) { + assert(vector->size < vector->capacity); + + /* The location where to start to move from. */ + void *offset = _vector_offset(vector, index); + + /* How many to move to the right. */ + size_t elements_in_bytes = (vector->size - index) * vector->element_size; + +#ifdef __STDC_LIB_EXT1__ + size_t right_capacity_in_bytes = + (vector->capacity - (index + 1)) * vector->element_size; + + /* clang-format off */ + int return_code = memmove_s( + offset + vector->element_size, + right_capacity_in_bytes, + offset, + elements_in_bytes); + + /* clang-format on */ + + return return_code == 0 ? VECTOR_SUCCESS : VECTOR_ERROR; + +#else + // memmove(offset + vector->element_size, offset, elements_in_bytes); + memmove((unsigned char *)offset + vector->element_size, offset, + elements_in_bytes); + return VECTOR_SUCCESS; +#endif +} + +static void _vector_move_left(Vector *vector, size_t index) { + size_t right_elements_in_bytes; + void *offset; + + /* The offset into the memory */ + offset = _vector_offset(vector, index); + + /* How many to move to the left */ + right_elements_in_bytes = (vector->size - index - 1) * vector->element_size; + + // memmove(offset, offset + vector->element_size, right_elements_in_bytes); + memmove(offset, (unsigned char *)offset + vector->element_size, + right_elements_in_bytes); +} + +static int _vector_reallocate(Vector *vector, size_t new_capacity) { + size_t new_capacity_in_bytes; + void *old; + assert(vector != NULL); + + if (new_capacity < VECTOR_MINIMUM_CAPACITY) { + if (vector->capacity > VECTOR_MINIMUM_CAPACITY) { + new_capacity = VECTOR_MINIMUM_CAPACITY; + } else { + /* NO-OP */ + return VECTOR_SUCCESS; + } + } + + new_capacity_in_bytes = new_capacity * vector->element_size; + old = vector->data; + + if ((vector->data = malloc(new_capacity_in_bytes)) == NULL) { + return VECTOR_ERROR; + } + +#ifdef __STDC_LIB_EXT1__ + /* clang-format off */ + if (memcpy_s(vector->data, + new_capacity_in_bytes, + old, + aom_vector_byte_size(vector)) != 0) { + return VECTOR_ERROR; + } +/* clang-format on */ +#else + memcpy(vector->data, old, aom_vector_byte_size(vector)); +#endif + + vector->capacity = new_capacity; + + free(old); + + return VECTOR_SUCCESS; +} + +static int _vector_adjust_capacity(Vector *vector) { + return _vector_reallocate(vector, + MAX(1, vector->size * VECTOR_GROWTH_FACTOR)); +} + +static void _vector_swap(size_t *first, size_t *second) { + size_t temp = *first; + *first = *second; + *second = temp; +} + +int aom_vector_setup(Vector *vector, size_t capacity, size_t element_size) { 
+ assert(vector != NULL); + + if (vector == NULL) return VECTOR_ERROR; + + vector->size = 0; + vector->capacity = MAX(VECTOR_MINIMUM_CAPACITY, capacity); + vector->element_size = element_size; + vector->data = malloc(vector->capacity * element_size); + + return vector->data == NULL ? VECTOR_ERROR : VECTOR_SUCCESS; +} + +int aom_vector_copy(Vector *destination, Vector *source) { + assert(destination != NULL); + assert(source != NULL); + assert(aom_vector_is_initialized(source)); + assert(!aom_vector_is_initialized(destination)); + + if (destination == NULL) return VECTOR_ERROR; + if (source == NULL) return VECTOR_ERROR; + if (aom_vector_is_initialized(destination)) return VECTOR_ERROR; + if (!aom_vector_is_initialized(source)) return VECTOR_ERROR; + + /* Copy ALL the data */ + destination->size = source->size; + destination->capacity = source->size * 2; + destination->element_size = source->element_size; + + /* Note that we are not necessarily allocating the same capacity */ + destination->data = malloc(destination->capacity * source->element_size); + if (destination->data == NULL) return VECTOR_ERROR; + + memcpy(destination->data, source->data, aom_vector_byte_size(source)); + + return VECTOR_SUCCESS; +} + +int aom_vector_copy_assign(Vector *destination, Vector *source) { + assert(destination != NULL); + assert(source != NULL); + assert(aom_vector_is_initialized(source)); + assert(aom_vector_is_initialized(destination)); + + if (destination == NULL) return VECTOR_ERROR; + if (source == NULL) return VECTOR_ERROR; + if (!aom_vector_is_initialized(destination)) return VECTOR_ERROR; + if (!aom_vector_is_initialized(source)) return VECTOR_ERROR; + + aom_vector_destroy(destination); + + return aom_vector_copy(destination, source); +} + +int aom_vector_move(Vector *destination, Vector *source) { + assert(destination != NULL); + assert(source != NULL); + + if (destination == NULL) return VECTOR_ERROR; + if (source == NULL) return VECTOR_ERROR; + + *destination = *source; + source->data = NULL; + + return VECTOR_SUCCESS; +} + +int aom_vector_move_assign(Vector *destination, Vector *source) { + aom_vector_swap(destination, source); + return aom_vector_destroy(source); +} + +int aom_vector_swap(Vector *destination, Vector *source) { + void *temp; + + assert(destination != NULL); + assert(source != NULL); + assert(aom_vector_is_initialized(source)); + assert(aom_vector_is_initialized(destination)); + + if (destination == NULL) return VECTOR_ERROR; + if (source == NULL) return VECTOR_ERROR; + if (!aom_vector_is_initialized(destination)) return VECTOR_ERROR; + if (!aom_vector_is_initialized(source)) return VECTOR_ERROR; + + _vector_swap(&destination->size, &source->size); + _vector_swap(&destination->capacity, &source->capacity); + _vector_swap(&destination->element_size, &source->element_size); + + temp = destination->data; + destination->data = source->data; + source->data = temp; + + return VECTOR_SUCCESS; +} + +int aom_vector_destroy(Vector *vector) { + assert(vector != NULL); + + if (vector == NULL) return VECTOR_ERROR; + + free(vector->data); + vector->data = NULL; + + return VECTOR_SUCCESS; +} + +/* Insertion */ +int aom_vector_push_back(Vector *vector, void *element) { + assert(vector != NULL); + assert(element != NULL); + + if (_vector_should_grow(vector)) { + if (_vector_adjust_capacity(vector) == VECTOR_ERROR) { + return VECTOR_ERROR; + } + } + + _vector_assign(vector, vector->size, element); + + ++vector->size; + + return VECTOR_SUCCESS; +} + +int aom_vector_push_front(Vector *vector, void 
*element) { + return aom_vector_insert(vector, 0, element); +} + +int aom_vector_insert(Vector *vector, size_t index, void *element) { + void *offset; + + assert(vector != NULL); + assert(element != NULL); + assert(index <= vector->size); + + if (vector == NULL) return VECTOR_ERROR; + if (element == NULL) return VECTOR_ERROR; + if (vector->element_size == 0) return VECTOR_ERROR; + if (index > vector->size) return VECTOR_ERROR; + + if (_vector_should_grow(vector)) { + if (_vector_adjust_capacity(vector) == VECTOR_ERROR) { + return VECTOR_ERROR; + } + } + + /* Move other elements to the right */ + if (_vector_move_right(vector, index) == VECTOR_ERROR) { + return VECTOR_ERROR; + } + + /* Insert the element */ + offset = _vector_offset(vector, index); + memcpy(offset, element, vector->element_size); + ++vector->size; + + return VECTOR_SUCCESS; +} + +int aom_vector_assign(Vector *vector, size_t index, void *element) { + assert(vector != NULL); + assert(element != NULL); + assert(index < vector->size); + + if (vector == NULL) return VECTOR_ERROR; + if (element == NULL) return VECTOR_ERROR; + if (vector->element_size == 0) return VECTOR_ERROR; + if (index >= vector->size) return VECTOR_ERROR; + + _vector_assign(vector, index, element); + + return VECTOR_SUCCESS; +} + +/* Deletion */ +int aom_vector_pop_back(Vector *vector) { + assert(vector != NULL); + assert(vector->size > 0); + + if (vector == NULL) return VECTOR_ERROR; + if (vector->element_size == 0) return VECTOR_ERROR; + + --vector->size; + +#ifndef VECTOR_NO_SHRINK + if (_vector_should_shrink(vector)) { + _vector_adjust_capacity(vector); + } +#endif + + return VECTOR_SUCCESS; +} + +int aom_vector_pop_front(Vector *vector) { return aom_vector_erase(vector, 0); } + +int aom_vector_erase(Vector *vector, size_t index) { + assert(vector != NULL); + assert(index < vector->size); + + if (vector == NULL) return VECTOR_ERROR; + if (vector->element_size == 0) return VECTOR_ERROR; + if (index >= vector->size) return VECTOR_ERROR; + + /* Just overwrite */ + _vector_move_left(vector, index); + +#ifndef VECTOR_NO_SHRINK + if (--vector->size == vector->capacity / 4) { + _vector_adjust_capacity(vector); + } +#endif + + return VECTOR_SUCCESS; +} + +int aom_vector_clear(Vector *vector) { return aom_vector_resize(vector, 0); } + +/* Lookup */ +void *aom_vector_get(Vector *vector, size_t index) { + assert(vector != NULL); + assert(index < vector->size); + + if (vector == NULL) return NULL; + if (vector->element_size == 0) return NULL; + if (index >= vector->size) return NULL; + + return _vector_offset(vector, index); +} + +const void *aom_vector_const_get(const Vector *vector, size_t index) { + assert(vector != NULL); + assert(index < vector->size); + + if (vector == NULL) return NULL; + if (vector->element_size == 0) return NULL; + if (index >= vector->size) return NULL; + + return _vector_const_offset(vector, index); +} + +void *aom_vector_front(Vector *vector) { return aom_vector_get(vector, 0); } + +void *aom_vector_back(Vector *vector) { + return aom_vector_get(vector, vector->size - 1); +} + +/* Information */ + +bool aom_vector_is_initialized(const Vector *vector) { + return vector->data != NULL; +} + +size_t aom_vector_byte_size(const Vector *vector) { + return vector->size * vector->element_size; +} + +size_t aom_vector_free_space(const Vector *vector) { + return vector->capacity - vector->size; +} + +bool aom_vector_is_empty(const Vector *vector) { return vector->size == 0; } + +/* Memory management */ +int aom_vector_resize(Vector *vector, size_t 
new_size) { + if (new_size <= vector->capacity * VECTOR_SHRINK_THRESHOLD) { + vector->size = new_size; + if (_vector_reallocate(vector, new_size * VECTOR_GROWTH_FACTOR) == -1) { + return VECTOR_ERROR; + } + } else if (new_size > vector->capacity) { + if (_vector_reallocate(vector, new_size * VECTOR_GROWTH_FACTOR) == -1) { + return VECTOR_ERROR; + } + } + + vector->size = new_size; + + return VECTOR_SUCCESS; +} + +int aom_vector_reserve(Vector *vector, size_t minimum_capacity) { + if (minimum_capacity > vector->capacity) { + if (_vector_reallocate(vector, minimum_capacity) == VECTOR_ERROR) { + return VECTOR_ERROR; + } + } + + return VECTOR_SUCCESS; +} + +int aom_vector_shrink_to_fit(Vector *vector) { + return _vector_reallocate(vector, vector->size); +} + +/* Iterators */ +Iterator aom_vector_begin(Vector *vector) { return aom_vector_iterator(vector, 0); } + +Iterator aom_vector_end(Vector *vector) { + return aom_vector_iterator(vector, vector->size); +} + +Iterator aom_vector_iterator(Vector *vector, size_t index) { + Iterator iterator = { NULL, 0 }; + + assert(vector != NULL); + assert(index <= vector->size); + + if (vector == NULL) return iterator; + if (index > vector->size) return iterator; + if (vector->element_size == 0) return iterator; + + iterator.pointer = _vector_offset(vector, index); + iterator.element_size = vector->element_size; + + return iterator; +} + +void *aom_iterator_get(Iterator *iterator) { return iterator->pointer; } + +int aom_iterator_erase(Vector *vector, Iterator *iterator) { + size_t index = aom_iterator_index(vector, iterator); + + if (aom_vector_erase(vector, index) == VECTOR_ERROR) { + return VECTOR_ERROR; + } + + *iterator = aom_vector_iterator(vector, index); + + return VECTOR_SUCCESS; +} + +void aom_iterator_increment(Iterator *iterator) { + assert(iterator != NULL); + // iterator->pointer += iterator->element_size; + iterator->pointer = + (unsigned char *)iterator->pointer + iterator->element_size; +} + +void aom_iterator_decrement(Iterator *iterator) { + assert(iterator != NULL); + // iterator->pointer -= iterator->element_size; + iterator->pointer = + (unsigned char *)iterator->pointer - iterator->element_size; +} + +void *aom_iterator_next(Iterator *iterator) { + void *current = iterator->pointer; + aom_iterator_increment(iterator); + + return current; +} + +void *aom_iterator_previous(Iterator *iterator) { + void *current = iterator->pointer; + aom_iterator_decrement(iterator); + + return current; +} + +bool aom_iterator_equals(Iterator *first, Iterator *second) { + assert(first->element_size == second->element_size); + return first->pointer == second->pointer; +} + +bool aom_iterator_is_before(Iterator *first, Iterator *second) { + assert(first->element_size == second->element_size); + return first->pointer < second->pointer; +} + +bool aom_iterator_is_after(Iterator *first, Iterator *second) { + assert(first->element_size == second->element_size); + return first->pointer > second->pointer; +} + +size_t aom_iterator_index(Vector *vector, Iterator *iterator) { + assert(vector != NULL); + assert(iterator != NULL); + // return (iterator->pointer - vector->data) / vector->element_size; + return ((unsigned char *)iterator->pointer - (unsigned char *)vector->data) / + vector->element_size; +} diff --git a/libs/libaom/src/third_party/vector/vector.h b/libs/libaom/src/third_party/vector/vector.h new file mode 100644 index 000000000..d09eb64c9 --- /dev/null +++ b/libs/libaom/src/third_party/vector/vector.h @@ -0,0 +1,138 @@ +/* +The MIT License (MIT)
+Copyright (c) 2016 Peter Goldsborough + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#ifndef VECTOR_H +#define VECTOR_H + +#include <stdbool.h> +#include <stddef.h> + +/***** DEFINITIONS *****/ + +#define VECTOR_MINIMUM_CAPACITY 2 +#define VECTOR_GROWTH_FACTOR 2 +/* Note: (1 / 4) is integer division, i.e. 0, so the macro-based shrink checks in vector.c only fire at size zero. */ +#define VECTOR_SHRINK_THRESHOLD (1 / 4) + +#define VECTOR_ERROR -1 +#define VECTOR_SUCCESS 0 + +#define VECTOR_UNINITIALIZED NULL +#define VECTOR_INITIALIZER \ + { 0, 0, 0, VECTOR_UNINITIALIZED } + +/***** STRUCTURES *****/ + +typedef struct Vector { + size_t size; + size_t capacity; + size_t element_size; + + void *data; +} Vector; + +typedef struct Iterator { + void *pointer; + size_t element_size; +} Iterator; + +/***** METHODS *****/ + +/* Constructor */ +int aom_vector_setup(Vector *vector, size_t capacity, size_t element_size); + +/* Copy Constructor */ +int aom_vector_copy(Vector *destination, Vector *source); + +/* Copy Assignment */ +int aom_vector_copy_assign(Vector *destination, Vector *source); + +/* Move Constructor */ +int aom_vector_move(Vector *destination, Vector *source); + +/* Move Assignment */ +int aom_vector_move_assign(Vector *destination, Vector *source); + +int aom_vector_swap(Vector *destination, Vector *source); + +/* Destructor */ +int aom_vector_destroy(Vector *vector); + +/* Insertion */ +int aom_vector_push_back(Vector *vector, void *element); +int aom_vector_push_front(Vector *vector, void *element); +int aom_vector_insert(Vector *vector, size_t index, void *element); +int aom_vector_assign(Vector *vector, size_t index, void *element); + +/* Deletion */ +int aom_vector_pop_back(Vector *vector); +int aom_vector_pop_front(Vector *vector); +int aom_vector_erase(Vector *vector, size_t index); +int aom_vector_clear(Vector *vector); + +/* Lookup */ +void *aom_vector_get(Vector *vector, size_t index); +const void *aom_vector_const_get(const Vector *vector, size_t index); +void *aom_vector_front(Vector *vector); +void *aom_vector_back(Vector *vector); +#define VECTOR_GET_AS(type, aom_vector_pointer, index) \ + *((type *)aom_vector_get((aom_vector_pointer), (index))) + +/* Information */ +bool aom_vector_is_initialized(const Vector *vector); +size_t aom_vector_byte_size(const Vector *vector); +size_t aom_vector_free_space(const Vector *vector); +bool aom_vector_is_empty(const Vector *vector); + +/* Memory management */ +int aom_vector_resize(Vector *vector, size_t new_size); +int aom_vector_reserve(Vector *vector, size_t minimum_capacity); +int aom_vector_shrink_to_fit(Vector *vector); + +/* Iterators */ +Iterator aom_vector_begin(Vector *vector);
+Iterator aom_vector_end(Vector *vector); +Iterator aom_vector_iterator(Vector *vector, size_t index); + +void *aom_iterator_get(Iterator *iterator); +#define ITERATOR_GET_AS(type, iterator) *((type *)aom_iterator_get((iterator))) + +int aom_iterator_erase(Vector *vector, Iterator *iterator); + +void aom_iterator_increment(Iterator *iterator); +void aom_iterator_decrement(Iterator *iterator); + +void *aom_iterator_next(Iterator *iterator); +void *aom_iterator_previous(Iterator *iterator); + +bool aom_iterator_equals(Iterator *first, Iterator *second); +bool aom_iterator_is_before(Iterator *first, Iterator *second); +bool aom_iterator_is_after(Iterator *first, Iterator *second); + +size_t aom_iterator_index(Vector *vector, Iterator *iterator); + +#define VECTOR_FOR_EACH(aom_vector_pointer, iterator_name) \ + for (Iterator(iterator_name) = aom_vector_begin((aom_vector_pointer)), \ + end = aom_vector_end((aom_vector_pointer)); \ + !aom_iterator_equals(&(iterator_name), &end); \ + aom_iterator_increment(&(iterator_name))) + +#endif /* VECTOR_H */ diff --git a/libs/libaom/src/third_party/x86inc/LICENSE b/libs/libaom/src/third_party/x86inc/LICENSE new file mode 100644 index 000000000..7d07645a1 --- /dev/null +++ b/libs/libaom/src/third_party/x86inc/LICENSE @@ -0,0 +1,18 @@ +Copyright (C) 2005-2012 x264 project + +Authors: Loren Merritt + Anton Mitrofanov + Jason Garrett-Glaser + Henrik Gramner + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/libs/libaom/src/third_party/x86inc/README.libaom b/libs/libaom/src/third_party/x86inc/README.libaom new file mode 100644 index 000000000..07c4dad20 --- /dev/null +++ b/libs/libaom/src/third_party/x86inc/README.libaom @@ -0,0 +1,20 @@ +URL: https://git.videolan.org/git/x264.git +Version: d23d18655249944c1ca894b451e2c82c7a584c62 +License: ISC +License File: LICENSE + +Description: +x264/libav's framework for x86 assembly. Contains a variety of macros and +defines that help automatically allow assembly to work cross-platform. + +Local Modifications: +Get configuration from aom_config.asm. +Prefix functions with aom by default. +Manage name mangling (prefixing with '_') manually because 'PREFIX' does not + exist in libaom. +Expand PIC default to macho64 and respect CONFIG_PIC from libaom +Set 'private_extern' visibility for macho targets. +Copy PIC 'GLOBAL' macros from x86_abi_support.asm +Use .text instead of .rodata on macho to avoid broken tables in PIC mode. 
+Use .text with no alignment for aout +Only use 'hidden' visibility with Chromium diff --git a/libs/libaom/src/third_party/x86inc/x86inc.asm b/libs/libaom/src/third_party/x86inc/x86inc.asm new file mode 100644 index 000000000..adaf2d99e --- /dev/null +++ b/libs/libaom/src/third_party/x86inc/x86inc.asm @@ -0,0 +1,1649 @@ +;***************************************************************************** +;* x86inc.asm: x264asm abstraction layer +;***************************************************************************** +;* Copyright (C) 2005-2016 x264 project +;* +;* Authors: Loren Merritt +;* Anton Mitrofanov +;* Fiona Glaser +;* Henrik Gramner +;* +;* Permission to use, copy, modify, and/or distribute this software for any +;* purpose with or without fee is hereby granted, provided that the above +;* copyright notice and this permission notice appear in all copies. +;* +;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +;***************************************************************************** + +; This is a header file for the x264ASM assembly language, which uses +; NASM/YASM syntax combined with a large number of macros to provide easy +; abstraction between different calling conventions (x86_32, win64, linux64). +; It also has various other useful features to simplify writing the kind of +; DSP functions that are most often used in x264. + +; Unlike the rest of x264, this file is available under an ISC license, as it +; has significant usefulness outside of x264 and we want it to be available +; to the largest audience possible. Of course, if you modify it for your own +; purposes to add a new feature, we strongly encourage contributing a patch +; as this feature might be useful for others as well. Send patches or ideas +; to x264-devel@videolan.org . + +%include "config/aom_config.asm" + +%ifndef private_prefix + %define private_prefix aom +%endif + +%ifndef public_prefix + %define public_prefix private_prefix +%endif + +%ifndef STACK_ALIGNMENT + %if ARCH_X86_64 + %define STACK_ALIGNMENT 16 + %else + %define STACK_ALIGNMENT 4 + %endif +%endif + +%define WIN64 0 +%define UNIX64 0 +%if ARCH_X86_64 + %ifidn __OUTPUT_FORMAT__,win32 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,win64 + %define WIN64 1 + %elifidn __OUTPUT_FORMAT__,x64 + %define WIN64 1 + %else + %define UNIX64 1 + %endif +%endif + +%define FORMAT_ELF 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%endif + +%define FORMAT_MACHO 0 +%ifidn __OUTPUT_FORMAT__,macho32 + %define FORMAT_MACHO 1 +%elifidn __OUTPUT_FORMAT__,macho64 + %define FORMAT_MACHO 1 +%endif + +; Set PREFIX for libaom builds. +%if FORMAT_ELF + %undef PREFIX +%elif WIN64 + %undef PREFIX +%else + %define PREFIX +%endif + +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + +; In some instances macho32 tables get misaligned when using .rodata. +; When looking at the disassembly it appears that the offset is either +; correct or consistently off by 90. 
Placing them in the .text section +; works around the issue. It appears to be specific to the way libaom +; handles the tables. +%macro SECTION_RODATA 0-1 16 + %ifidn __OUTPUT_FORMAT__,macho32 + SECTION .text align=%1 + fakegot: + %elifidn __OUTPUT_FORMAT__,aout + SECTION .text + %else + SECTION .rodata align=%1 + %endif +%endmacro + +; PIC macros are copied from aom_ports/x86_abi_support.asm. The "define PIC" +; from original code is added in for 64bit. +%ifidn __OUTPUT_FORMAT__,elf32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,macho32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,win32 +%define ABI_IS_32BIT 1 +%elifidn __OUTPUT_FORMAT__,aout +%define ABI_IS_32BIT 1 +%else +%define ABI_IS_32BIT 0 +%endif + +%if ABI_IS_32BIT + %if CONFIG_PIC=1 + %ifidn __OUTPUT_FORMAT__,elf32 + %define GET_GOT_DEFINED 1 + %define WRT_PLT wrt ..plt + %macro GET_GOT 1 + extern _GLOBAL_OFFSET_TABLE_ + push %1 + call %%get_got + %%sub_offset: + jmp %%exitGG + %%get_got: + mov %1, [esp] + add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc + ret + %%exitGG: + %undef GLOBAL + %define GLOBAL(x) x + %1 wrt ..gotoff + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %elifidn __OUTPUT_FORMAT__,macho32 + %define GET_GOT_DEFINED 1 + %macro GET_GOT 1 + push %1 + call %%get_got + %%get_got: + pop %1 + %undef GLOBAL + %define GLOBAL(x) x + %1 - %%get_got + %undef RESTORE_GOT + %define RESTORE_GOT pop %1 + %endmacro + %else + %define GET_GOT_DEFINED 0 + %endif + %endif + + %if ARCH_X86_64 == 0 + %undef PIC + %endif + +%else + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) rel x + %define WRT_PLT wrt ..plt + + %if WIN64 + %define PIC + %elifidn __OUTPUT_FORMAT__,macho64 + %define PIC + %elif CONFIG_PIC + %define PIC + %endif +%endif + +%ifnmacro GET_GOT + %macro GET_GOT 1 + %endmacro + %define GLOBAL(x) x +%endif +%ifndef RESTORE_GOT + %define RESTORE_GOT +%endif +%ifndef WRT_PLT + %define WRT_PLT +%endif + +%ifdef PIC + default rel +%endif + +%ifndef GET_GOT_DEFINED + %define GET_GOT_DEFINED 0 +%endif +; Done with PIC macros + +%ifdef __NASM_VER__ + %use smartalign +%endif + +; Macros to eliminate most code duplication between x86_32 and x86_64: +; Currently this works only for leaf functions which load all their arguments +; into registers at the start, and make no other use of the stack. Luckily that +; covers most of x264's asm. + +; PROLOGUE: +; %1 = number of arguments. loads them from stack if needed. +; %2 = number of registers used. pushes callee-saved regs if needed. +; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. +; %4 = (optional) stack size to be allocated. The stack will be aligned before +; allocating the specified stack size. If the required stack alignment is +; larger than the known stack alignment the stack will be manually aligned +; and an extra register will be allocated to hold the original stack +; pointer (to not invalidate r0m etc.). To prevent the use of an extra +; register as stack pointer, request a negative stack size. +; %4+/%5+ = list of names to define to registers +; PROLOGUE can also be invoked by adding the same options to cglobal + +; e.g. +; cglobal foo, 2,3,7,0x40, dst, src, tmp +; declares a function (foo) that automatically loads two arguments (dst and +; src) into registers, uses one additional register (tmp) plus 7 vector +; registers (m0-m6) and allocates 0x40 bytes of stack space. + +; TODO Some functions can use some args directly from the stack. 
If they're the +; last args then you can just not declare them, but if they're in the middle +; we need more flexible macro. + +; RET: +; Pops anything that was pushed by PROLOGUE, and returns. + +; REP_RET: +; Use this instead of RET if it's a branch target. + +; registers: +; rN and rNq are the native-size register holding function argument N +; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size +; rNm is the original location of arg N (a register or on the stack), dword +; rNmp is native size + +%macro DECLARE_REG 2-3 + %define r%1q %2 + %define r%1d %2d + %define r%1w %2w + %define r%1b %2b + %define r%1h %2h + %define %2q %2 + %if %0 == 2 + %define r%1m %2d + %define r%1mp %2 + %elif ARCH_X86_64 ; memory + %define r%1m [rstk + stack_offset + %3] + %define r%1mp qword r %+ %1 %+ m + %else + %define r%1m [rstk + stack_offset + %3] + %define r%1mp dword r %+ %1 %+ m + %endif + %define r%1 %2 +%endmacro + +%macro DECLARE_REG_SIZE 3 + %define r%1q r%1 + %define e%1q r%1 + %define r%1d e%1 + %define e%1d e%1 + %define r%1w %1 + %define e%1w %1 + %define r%1h %3 + %define e%1h %3 + %define r%1b %2 + %define e%1b %2 + %if ARCH_X86_64 == 0 + %define r%1 e%1 + %endif +%endmacro + +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null + +; t# defines for when per-arch register allocation is more complex than just function arguments + +%macro DECLARE_REG_TMP 1-* + %assign %%i 0 + %rep %0 + CAT_XDEFINE t, %%i, r%1 + %assign %%i %%i+1 + %rotate 1 + %endrep +%endmacro + +%macro DECLARE_REG_TMP_SIZE 0-* + %rep %0 + %define t%1q t%1 %+ q + %define t%1d t%1 %+ d + %define t%1w t%1 %+ w + %define t%1h t%1 %+ h + %define t%1b t%1 %+ b + %rotate 1 + %endrep +%endmacro + +DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + +%if ARCH_X86_64 + %define gprsize 8 +%else + %define gprsize 4 +%endif + +%macro PUSH 1 + push %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset+gprsize + %endif +%endmacro + +%macro POP 1 + pop %1 + %ifidn rstk, rsp + %assign stack_offset stack_offset-gprsize + %endif +%endmacro + +%macro PUSH_IF_USED 1-* + %rep %0 + %if %1 < regs_used + PUSH r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro POP_IF_USED 1-* + %rep %0 + %if %1 < regs_used + pop r%1 + %endif + %rotate 1 + %endrep +%endmacro + +%macro LOAD_IF_USED 1-* + %rep %0 + %if %1 < num_args + mov r%1, r %+ %1 %+ mp + %endif + %rotate 1 + %endrep +%endmacro + +%macro SUB 2 + sub %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset+(%2) + %endif +%endmacro + +%macro ADD 2 + add %1, %2 + %ifidn %1, rstk + %assign stack_offset stack_offset-(%2) + %endif +%endmacro + +%macro movifnidn 2 + %ifnidn %1, %2 + mov %1, %2 + %endif +%endmacro + +%macro movsxdifnidn 2 + %ifnidn %1, %2 + movsxd %1, %2 + %endif +%endmacro + +%macro ASSERT 1 + %if (%1) == 0 + %error assertion ``%1'' failed + %endif +%endmacro + +%macro DEFINE_ARGS 0-* + %ifdef n_arg_names + %assign %%i 0 + %rep n_arg_names + CAT_UNDEF arg_name %+ %%i, q + CAT_UNDEF arg_name %+ %%i, d + CAT_UNDEF arg_name %+ %%i, w + CAT_UNDEF arg_name %+ %%i, h + CAT_UNDEF arg_name %+ %%i, b + CAT_UNDEF arg_name %+ %%i, m + CAT_UNDEF arg_name %+ %%i, mp + CAT_UNDEF arg_name, %%i + %assign %%i %%i+1 + %endrep + %endif + + %xdefine %%stack_offset stack_offset + %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine + %assign %%i 0 + %rep %0 
+ %xdefine %1q r %+ %%i %+ q + %xdefine %1d r %+ %%i %+ d + %xdefine %1w r %+ %%i %+ w + %xdefine %1h r %+ %%i %+ h + %xdefine %1b r %+ %%i %+ b + %xdefine %1m r %+ %%i %+ m + %xdefine %1mp r %+ %%i %+ mp + CAT_XDEFINE arg_name, %%i, %1 + %assign %%i %%i+1 + %rotate 1 + %endrep + %xdefine stack_offset %%stack_offset + %assign n_arg_names %0 +%endmacro + +%define required_stack_alignment ((mmsize + 15) & ~15) + +%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) + %ifnum %1 + %if %1 != 0 + %assign %%pad 0 + %assign stack_size %1 + %if stack_size < 0 + %assign stack_size -stack_size + %endif + %if WIN64 + %assign %%pad %%pad + 32 ; shadow space + %if mmsize != 8 + %assign xmm_regs_used %2 + %if xmm_regs_used > 8 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers + %endif + %endif + %endif + %if required_stack_alignment <= STACK_ALIGNMENT + ; maintain the current stack alignment + %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %else + %assign %%reg_num (regs_used - 1) + %xdefine rstk r %+ %%reg_num + ; align stack, and save original stack location directly above + ; it, i.e. in [rsp+stack_size_padded], so we can restore the + ; stack in a single instruction (i.e. mov rsp, rstk or mov + ; rsp, [rsp+stack_size_padded]) + %if %1 < 0 ; need to store rsp on stack + %xdefine rstkm [rsp + stack_size + %%pad] + %assign %%pad %%pad + gprsize + %else ; can keep rsp in rstk during whole function + %xdefine rstkm rstk + %endif + %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) + mov rstk, rsp + and rsp, ~(required_stack_alignment-1) + sub rsp, stack_size_padded + movifnidn rstkm, rstk + %endif + WIN64_PUSH_XMM + %endif + %endif +%endmacro + +%macro SETUP_STACK_POINTER 1 + %ifnum %1 + %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT + %if %1 > 0 + %assign regs_used (regs_used + 1) + %endif + %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3 + ; Ensure that we don't clobber any registers containing arguments + %assign regs_used 5 + UNIX64 * 3 + %endif + %endif + %endif +%endmacro + +%macro DEFINE_ARGS_INTERNAL 3+ + %ifnum %2 + DEFINE_ARGS %3 + %elif %1 == 4 + DEFINE_ARGS %2 + %elif %1 > 4 + DEFINE_ARGS %2, %3 + %endif +%endmacro + +%if WIN64 ; Windows x64 ;================================================= + +DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 + +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4, %3 + %if mmsize != 8 && stack_size == 0 + WIN64_SPILL_XMM %3 + %endif + LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%macro WIN64_PUSH_XMM 0 + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
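+ ; Layout implemented below: xmm6 is saved in the shadow space at + ; [rstk + stack_offset + 8] and xmm7 at [rstk + stack_offset + 24], while + ; xmm8 and up go to 16-byte slots starting at [rsp + stack_size + 32].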
+ %if xmm_regs_used > 6 + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if xmm_regs_used > 7 + movaps [rstk + stack_offset + 24], xmm7 + %endif + %if xmm_regs_used > 8 + %assign %%i 8 + %rep xmm_regs_used-8 + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +%macro WIN64_SPILL_XMM 1 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + %if xmm_regs_used > 8 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. + %assign %%pad (xmm_regs_used-8)*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM +%endmacro + +%macro WIN64_RESTORE_XMM_INTERNAL 1 + %assign %%pad_size 0 + %if xmm_regs_used > 8 + %assign %%i xmm_regs_used + %rep xmm_regs_used-8 + %assign %%i %%i-1 + movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + %endrep + %endif + %if stack_size_padded > 0 + %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add %1, stack_size_padded + %assign %%pad_size stack_size_padded + %endif + %endif + %if xmm_regs_used > 7 + movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %endif + %if xmm_regs_used > 6 + movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %endif +%endmacro + +%macro WIN64_RESTORE_XMM 1 + WIN64_RESTORE_XMM_INTERNAL %1 + %assign stack_offset (stack_offset-stack_size_padded) + %assign xmm_regs_used 0 +%endmacro + +%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + WIN64_RESTORE_XMM_INTERNAL rsp + POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%elif ARCH_X86_64 ; *nix x64 ;============================================= + +DECLARE_REG 0, rdi +DECLARE_REG 1, rsi +DECLARE_REG 2, rdx +DECLARE_REG 3, rcx +DECLARE_REG 4, R8 +DECLARE_REG 5, R9 +DECLARE_REG 6, rax, 8 +DECLARE_REG 7, R10, 16 +DECLARE_REG 8, R11, 24 +DECLARE_REG 9, rbx, 32 +DECLARE_REG 10, rbp, 40 +DECLARE_REG 11, R12, 48 +DECLARE_REG 12, R13, 56 +DECLARE_REG 13, R14, 64 +DECLARE_REG 14, R15, 72 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... + %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 15 + PUSH_IF_USED 9, 10, 11, 12, 13, 14 + ALLOC_STACK %4 + LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 14, 13, 12, 11, 10, 9 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%else ; X86_32 ;============================================================== + +DECLARE_REG 0, eax, 4 +DECLARE_REG 1, ecx, 8 +DECLARE_REG 2, edx, 12 +DECLARE_REG 3, ebx, 16 +DECLARE_REG 4, esi, 20 +DECLARE_REG 5, edi, 24 +DECLARE_REG 6, ebp, 28 +%define rsp esp + +%macro DECLARE_ARG 1-* + %rep %0 + %define r%1m [rstk + stack_offset + 4*%1 + 4] + %define r%1mp dword r%1m + %rotate 1 + %endrep +%endmacro + +DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 + +%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
+ %assign num_args %1 + %assign regs_used %2 + ASSERT regs_used >= num_args + %if num_args > 7 + %assign num_args 7 + %endif + %if regs_used > 7 + %assign regs_used 7 + %endif + SETUP_STACK_POINTER %4 + ASSERT regs_used <= 7 + PUSH_IF_USED 3, 4, 5, 6 + ALLOC_STACK %4 + LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 + DEFINE_ARGS_INTERNAL %0, %4, %5 +%endmacro + +%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 + +%macro RET 0 + %if stack_size_padded > 0 + %if required_stack_alignment > STACK_ALIGNMENT + mov rsp, rstkm + %else + add rsp, stack_size_padded + %endif + %endif + POP_IF_USED 6, 5, 4, 3 + %if mmsize == 32 + vzeroupper + %endif + AUTO_REP_RET +%endmacro + +%endif ;====================================================================== + +%if WIN64 == 0 + %macro WIN64_SPILL_XMM 1 + %endmacro + %macro WIN64_RESTORE_XMM 1 + %endmacro + %macro WIN64_PUSH_XMM 0 + %endmacro +%endif + +; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either +; a branch or a branch target. So switch to a 2-byte form of ret in that case. +; We can automatically detect "follows a branch", but not a branch target. +; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) +%macro REP_RET 0 + %if has_epilogue + RET + %else + rep ret + %endif + annotate_function_size +%endmacro + +%define last_branch_adr $$ +%macro AUTO_REP_RET 0 + %if notcpuflag(ssse3) + times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr. + %endif + ret + annotate_function_size +%endmacro + +%macro BRANCH_INSTR 0-* + %rep %0 + %macro %1 1-2 %1 + %2 %1 + %if notcpuflag(ssse3) + %%branch_instr equ $ + %xdefine last_branch_adr %%branch_instr + %endif + %endmacro + %rotate 1 + %endrep +%endmacro + +BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp + +%macro TAIL_CALL 2 ; callee, is_nonadjacent + %if has_epilogue + call %1 + RET + %elif %2 + jmp %1 + %endif + annotate_function_size +%endmacro + +;============================================================================= +; arch-independent part +;============================================================================= + +%assign function_align 16 + +; Begin a function. +; Applies any symbol mangling needed for C linkage, and sets up a define such that +; subsequent uses of the function name automatically refer to the mangled version. +; Appends cpuflags to the function name if cpuflags has been specified. +; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX +; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2). +%macro cglobal 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 1, %1 %+ SUFFIX, %2 +%endmacro +%macro cvisible 1-2+ "" ; name, [PROLOGUE args] + cglobal_internal 0, %1 %+ SUFFIX, %2 +%endmacro +%macro cglobal_internal 2-3+ + annotate_function_size + %if %1 + %xdefine %%FUNCTION_PREFIX private_prefix + ; libaom explicitly sets visibility in shared object builds. Avoid + ; setting visibility to hidden as it may break builds that split + ; sources on e.g., directory boundaries. 
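+ ; (CHROMIUM is a build-time define; per README.libaom above, 'hidden' + ; visibility is only used with Chromium builds.)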
+ %ifdef CHROMIUM + %xdefine %%VISIBILITY hidden + %else + %xdefine %%VISIBILITY + %endif + %else + %xdefine %%FUNCTION_PREFIX public_prefix + %xdefine %%VISIBILITY + %endif + %ifndef cglobaled_%2 + %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2) + %xdefine %2.skip_prologue %2 %+ .skip_prologue + CAT_XDEFINE cglobaled_, %2, 1 + %endif + %xdefine current_function %2 + %xdefine current_function_section __SECT__ + %if FORMAT_ELF + global %2:function %%VISIBILITY + %elif FORMAT_MACHO + %ifdef __NASM_VER__ + global %2 + %else + global %2:private_extern + %endif + %else + global %2 + %endif + align function_align + %2: + RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer + %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required + %assign stack_offset 0 ; stack pointer offset relative to the return address + %assign stack_size 0 ; amount of stack space that can be freely used inside a function + %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %ifnidn %3, "" + PROLOGUE %3 + %endif +%endmacro + +%macro cextern 1 + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +; like cextern, but without the prefix +%macro cextern_naked 1 + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif + CAT_XDEFINE cglobaled_, %1, 1 + extern %1 +%endmacro + +%macro const 1-2+ + %xdefine %1 mangle(private_prefix %+ _ %+ %1) + %if FORMAT_ELF + global %1:data hidden + %else + global %1 + %endif + %1: %2 +%endmacro + +; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. +%if FORMAT_ELF + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] +%endif + +; Tell debuggers how large the function was. +; This may be invoked multiple times per function; we rely on later instances overriding earlier ones. +; This is invoked by RET and similar macros, and also cglobal does it for the previous function, +; but if the last function in a source file doesn't use any of the standard macros for its epilogue, +; then its size might be unspecified. 
+%macro annotate_function_size 0 + %ifdef __YASM_VER__ + %ifdef current_function + %if FORMAT_ELF + current_function_section + %%ecf equ $ + size current_function %%ecf - current_function + __SECT__ + %endif + %endif + %endif +%endmacro + +; cpuflags + +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 +%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 +%assign cpuflags_avx (1<<11)| cpuflags_sse42 +%assign cpuflags_xop (1<<12)| cpuflags_avx +%assign cpuflags_fma4 (1<<13)| cpuflags_avx +%assign cpuflags_fma3 (1<<14)| cpuflags_avx +%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 + +%assign cpuflags_cache32 (1<<16) +%assign cpuflags_cache64 (1<<17) +%assign cpuflags_slowctz (1<<18) +%assign cpuflags_lzcnt (1<<19) +%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<21) +%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 + +; Returns a boolean value expressing whether or not the specified cpuflag is enabled. +%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) +%define notcpuflag(x) (cpuflag(x) ^ 1) + +; Takes an arbitrary number of cpuflags from the above list. +; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu. +; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co. +%macro INIT_CPUFLAGS 0-* + %xdefine SUFFIX + %undef cpuname + %assign cpuflags 0 + + %if %0 >= 1 + %rep %0 + %ifdef cpuname + %xdefine cpuname cpuname %+ _%1 + %else + %xdefine cpuname %1 + %endif + %assign cpuflags cpuflags | cpuflags_%1 + %rotate 1 + %endrep + %xdefine SUFFIX _ %+ cpuname + + %if cpuflag(avx) + %assign avx_enabled 1 + %endif + %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2)) + %define mova movaps + %define movu movups + %define movnta movntps + %endif + %if cpuflag(aligned) + %define movu mova + %elif cpuflag(sse3) && notcpuflag(ssse3) + %define movu lddqu + %endif + %endif + + %if ARCH_X86_64 || cpuflag(sse2) + %ifdef __NASM_VER__ + ALIGNMODE k8 + %else + CPU amdnop + %endif + %else + %ifdef __NASM_VER__ + ALIGNMODE nop + %else + CPU basicnop + %endif + %endif +%endmacro + +; Merge mmx and sse* +; m# is a simd register of the currently selected size +; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# +; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# +; (All 3 remain in sync through SWAP.) 
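+; For example (an illustrative sketch, not part of the upstream header): after +; INIT_XMM sse2, m0 names xmm0 and mova expands to movdqa, while after +; INIT_YMM avx2, m0 names ymm0 and the same source emits VEX encodings: +; INIT_XMM sse2 +; mova m0, [r0] ; emits movdqa xmm0, [r0] +; INIT_YMM avx2 +; mova m0, [r0] ; emits vmovdqa ymm0, [r0]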
+ +%macro CAT_XDEFINE 3 + %xdefine %1%2 %3 +%endmacro + +%macro CAT_UNDEF 2 + %undef %1%2 +%endmacro + +%macro INIT_MMX 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_MMX %1 + %define mmsize 8 + %define num_mmregs 8 + %define mova movq + %define movu movq + %define movh movd + %define movnta movntq + %assign %%i 0 + %rep 8 + CAT_XDEFINE m, %%i, mm %+ %%i + CAT_XDEFINE nnmm, %%i, %%i + %assign %%i %%i+1 + %endrep + %rep 8 + CAT_UNDEF m, %%i + CAT_UNDEF nnmm, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_XMM 0-1+ + %assign avx_enabled 0 + %define RESET_MM_PERMUTATION INIT_XMM %1 + %define mmsize 16 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %define movh movq + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, xmm %+ %%i + CAT_XDEFINE nnxmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +%macro INIT_YMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_YMM %1 + %define mmsize 32 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 16 + %endif + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, ymm %+ %%i + CAT_XDEFINE nnymm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 +%endmacro + +INIT_XMM + +%macro DECLARE_MMCAST 1 + %define mmmm%1 mm%1 + %define mmxmm%1 mm%1 + %define mmymm%1 mm%1 + %define xmmmm%1 mm%1 + %define xmmxmm%1 xmm%1 + %define xmmymm%1 xmm%1 + %define ymmmm%1 mm%1 + %define ymmxmm%1 xmm%1 + %define ymmymm%1 ymm%1 + %define xm%1 xmm %+ m%1 + %define ym%1 ymm %+ m%1 +%endmacro + +%assign i 0 +%rep 16 + DECLARE_MMCAST i + %assign i i+1 +%endrep + +; I often want to use macros that permute their arguments. e.g. there's no +; efficient way to implement butterfly or transpose or dct without swapping some +; arguments. +; +; I would like to not have to manually keep track of the permutations: +; If I insert a permutation in the middle of a function, it should automatically +; change everything that follows. For more complex macros I may also have multiple +; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. +; +; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that +; permutes its arguments. It's equivalent to exchanging the contents of the +; registers, except that this way you exchange the register names instead, so it +; doesn't cost any cycles. + +%macro PERMUTE 2-* ; takes a list of pairs to swap + %rep %0/2 + %xdefine %%tmp%2 m%2 + %rotate 2 + %endrep + %rep %0/2 + %xdefine m%1 %%tmp%2 + CAT_XDEFINE nn, m%1, %1 + %rotate 2 + %endrep +%endmacro + +%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs) + %ifnum %1 ; SWAP 0, 1, ... + SWAP_INTERNAL_NUM %1, %2 + %else ; SWAP m0, m1, ... + SWAP_INTERNAL_NAME %1, %2 + %endif +%endmacro + +%macro SWAP_INTERNAL_NUM 2-* + %rep %0-1 + %xdefine %%tmp m%1 + %xdefine m%1 m%2 + %xdefine m%2 %%tmp + CAT_XDEFINE nn, m%1, %1 + CAT_XDEFINE nn, m%2, %2 + %rotate 1 + %endrep +%endmacro + +%macro SWAP_INTERNAL_NAME 2-* + %xdefine %%args nn %+ %1 + %rep %0-1 + %xdefine %%args %%args, nn %+ %2 + %rotate 1 + %endrep + SWAP_INTERNAL_NUM %%args +%endmacro + +; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later +; calls to that function will automatically load the permutation, so values can +; be returned in mmregs. 
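+; For example (an illustrative sketch, not from upstream): ending a function with +; SWAP 0, 1 ; the result is now named m0 (physically xmm1) +; SAVE_MM_PERMUTATION +; RET +; records the renaming under the function's name; the `call` macro further down +; then runs LOAD_MM_PERMUTATION, so the caller's m0 maps to the same physical register.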
+%macro SAVE_MM_PERMUTATION 0-1 + %if %0 + %xdefine %%f %1_m + %else + %xdefine %%f current_function %+ _m + %endif + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE %%f, %%i, m %+ %%i + %assign %%i %%i+1 + %endrep +%endmacro + +%macro LOAD_MM_PERMUTATION 1 ; name to load from + %ifdef %1_m0 + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, %1_m %+ %%i + CAT_XDEFINE nn, m %+ %%i, %%i + %assign %%i %%i+1 + %endrep + %endif +%endmacro + +; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't +%macro call 1 + call_internal %1 %+ SUFFIX, %1 +%endmacro +%macro call_internal 2 + %xdefine %%i %2 + %ifndef cglobaled_%2 + %ifdef cglobaled_%1 + %xdefine %%i %1 + %endif + %endif + call %%i + LOAD_MM_PERMUTATION %%i +%endmacro + +; Substitutions that reduce instruction size but are functionally equivalent +%macro add 2 + %ifnum %2 + %if %2==128 + sub %1, -128 + %else + add %1, %2 + %endif + %else + add %1, %2 + %endif +%endmacro + +%macro sub 2 + %ifnum %2 + %if %2==128 + add %1, -128 + %else + sub %1, %2 + %endif + %else + sub %1, %2 + %endif +%endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%assign i 0 +%rep 16 + %if i < 8 + CAT_XDEFINE sizeofmm, i, 8 + %endif + CAT_XDEFINE sizeofxmm, i, 16 + CAT_XDEFINE sizeofymm, i, 32 + %assign i i+1 +%endrep +%undef i + +%macro CHECK_AVX_INSTR_EMU 3-* + %xdefine %%opcode %1 + %xdefine %%dst %2 + %rep %0-2 + %ifidn %%dst, %3 + %error non-avx emulation of ``%%opcode'' is not supported + %endif + %rotate 1 + %endrep +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +;%6+: operands +%macro RUN_AVX_INSTR 6-9+ + %ifnum sizeof%7 + %assign __sizeofreg sizeof%7 + %elifnum sizeof%6 + %assign __sizeofreg sizeof%6 + %else + %assign __sizeofreg mmsize + %endif + %assign __emulate_avx 0 + %if avx_enabled && __sizeofreg >= 16 + %xdefine __instr v%1 + %else + %xdefine __instr %1 + %if %0 >= 8+%4 + %assign __emulate_avx 1 + %endif + %endif + %ifnidn %2, fnord + %ifdef cpuname + %if notcpuflag(%2) + %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 + %error use of ``%1'' sse2 instruction in cpuname function: current_function + %endif + %endif + %endif + + %if __emulate_avx + %xdefine __src1 %7 + %xdefine __src2 %8 + %ifnidn %6, %7 + %if %0 >= 9 + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8, %9}, %6, %8, %9 + %else + CHECK_AVX_INSTR_EMU {%1 %6, %7, %8}, %6, %8 + %endif + %if %5 && %4 == 0 + %ifnid %8 + ; 3-operand AVX instructions with a memory arg can only have it in src2, + ; whereas SSE emulation prefers to have it in src1 (i.e. the mov). + ; So, if the instruction is commutative with a memory arg, swap them. 
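+ ; e.g. with AVX disabled, `addps m0, m1, [r0]` is emulated as + ; `movaps m0, [r0]` followed by `addps m0, m1`; the swap below is what + ; makes that possible (addps is declared commutative in the table below).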
+ %xdefine __src1 %8 + %xdefine __src2 %7 + %endif + %endif + %if __sizeofreg == 8 + MOVQ %6, __src1 + %elif %3 + MOVAPS %6, __src1 + %else + MOVDQA %6, __src1 + %endif + %endif + %if %0 >= 9 + %1 %6, __src2, %9 + %else + %1 %6, __src2 + %endif + %elif %0 >= 9 + __instr %6, %7, %8, %9 + %elif %0 == 8 + __instr %6, %7, %8 + %elif %0 == 7 + __instr %6, %7 + %else + __instr %6 + %endif +%endmacro + +;%1 == instruction +;%2 == minimal instruction set +;%3 == 1 if float, 0 if int +;%4 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise +;%5 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not +%macro AVX_INSTR 1-5 fnord, 0, 1, 0 + %macro %1 1-10 fnord, fnord, fnord, fnord, %1, %2, %3, %4, %5 + %ifidn %2, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1 + %elifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, %9, %10, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +; Instructions with both VEX and non-VEX encodings +; Non-destructive instructions are written without parameters +AVX_INSTR addpd, sse2, 1, 0, 1 +AVX_INSTR addps, sse, 1, 0, 1 +AVX_INSTR addsd, sse2, 1, 0, 1 +AVX_INSTR addss, sse, 1, 0, 1 +AVX_INSTR addsubpd, sse3, 1, 0, 0 +AVX_INSTR addsubps, sse3, 1, 0, 0 +AVX_INSTR aesdec, fnord, 0, 0, 0 +AVX_INSTR aesdeclast, fnord, 0, 0, 0 +AVX_INSTR aesenc, fnord, 0, 0, 0 +AVX_INSTR aesenclast, fnord, 0, 0, 0 +AVX_INSTR aesimc +AVX_INSTR aeskeygenassist +AVX_INSTR andnpd, sse2, 1, 0, 0 +AVX_INSTR andnps, sse, 1, 0, 0 +AVX_INSTR andpd, sse2, 1, 0, 1 +AVX_INSTR andps, sse, 1, 0, 1 +AVX_INSTR blendpd, sse4, 1, 0, 0 +AVX_INSTR blendps, sse4, 1, 0, 0 +AVX_INSTR blendvpd, sse4, 1, 0, 0 +AVX_INSTR blendvps, sse4, 1, 0, 0 +AVX_INSTR cmppd, sse2, 1, 1, 0 +AVX_INSTR cmpps, sse, 1, 1, 0 +AVX_INSTR cmpsd, sse2, 1, 1, 0 +AVX_INSTR cmpss, sse, 1, 1, 0 +AVX_INSTR comisd, sse2 +AVX_INSTR comiss, sse +AVX_INSTR cvtdq2pd, sse2 +AVX_INSTR cvtdq2ps, sse2 +AVX_INSTR cvtpd2dq, sse2 +AVX_INSTR cvtpd2ps, sse2 +AVX_INSTR cvtps2dq, sse2 +AVX_INSTR cvtps2pd, sse2 +AVX_INSTR cvtsd2si, sse2 +AVX_INSTR cvtsd2ss, sse2 +AVX_INSTR cvtsi2sd, sse2 +AVX_INSTR cvtsi2ss, sse +AVX_INSTR cvtss2sd, sse2 +AVX_INSTR cvtss2si, sse +AVX_INSTR cvttpd2dq, sse2 +AVX_INSTR cvttps2dq, sse2 +AVX_INSTR cvttsd2si, sse2 +AVX_INSTR cvttss2si, sse +AVX_INSTR divpd, sse2, 1, 0, 0 +AVX_INSTR divps, sse, 1, 0, 0 +AVX_INSTR divsd, sse2, 1, 0, 0 +AVX_INSTR divss, sse, 1, 0, 0 +AVX_INSTR dppd, sse4, 1, 1, 0 +AVX_INSTR dpps, sse4, 1, 1, 0 +AVX_INSTR extractps, sse4 +AVX_INSTR haddpd, sse3, 1, 0, 0 +AVX_INSTR haddps, sse3, 1, 0, 0 +AVX_INSTR hsubpd, sse3, 1, 0, 0 +AVX_INSTR hsubps, sse3, 1, 0, 0 +AVX_INSTR insertps, sse4, 1, 1, 0 +AVX_INSTR lddqu, sse3 +AVX_INSTR ldmxcsr, sse +AVX_INSTR maskmovdqu, sse2 +AVX_INSTR maxpd, sse2, 1, 0, 1 +AVX_INSTR maxps, sse, 1, 0, 1 +AVX_INSTR maxsd, sse2, 1, 0, 1 +AVX_INSTR maxss, sse, 1, 0, 1 +AVX_INSTR minpd, sse2, 1, 0, 1 +AVX_INSTR minps, sse, 1, 0, 1 +AVX_INSTR minsd, sse2, 1, 0, 1 +AVX_INSTR minss, sse, 1, 0, 1 +AVX_INSTR movapd, sse2 +AVX_INSTR movaps, sse +AVX_INSTR movd, mmx +AVX_INSTR movddup, sse3 +AVX_INSTR movdqa, sse2 +AVX_INSTR movdqu, sse2 +AVX_INSTR movhlps, sse, 1, 0, 0 +AVX_INSTR movhpd, sse2, 1, 0, 0 +AVX_INSTR movhps, sse, 1, 0, 0 +AVX_INSTR movlhps, sse, 1, 0, 0 +AVX_INSTR movlpd, sse2, 1, 0, 0 +AVX_INSTR movlps, sse, 1, 0, 0 +AVX_INSTR movmskpd, sse2 +AVX_INSTR movmskps, sse 
+AVX_INSTR movntdq, sse2 +AVX_INSTR movntdqa, sse4 +AVX_INSTR movntpd, sse2 +AVX_INSTR movntps, sse +AVX_INSTR movq, mmx +AVX_INSTR movsd, sse2, 1, 0, 0 +AVX_INSTR movshdup, sse3 +AVX_INSTR movsldup, sse3 +AVX_INSTR movss, sse, 1, 0, 0 +AVX_INSTR movupd, sse2 +AVX_INSTR movups, sse +AVX_INSTR mpsadbw, sse4 +AVX_INSTR mulpd, sse2, 1, 0, 1 +AVX_INSTR mulps, sse, 1, 0, 1 +AVX_INSTR mulsd, sse2, 1, 0, 1 +AVX_INSTR mulss, sse, 1, 0, 1 +AVX_INSTR orpd, sse2, 1, 0, 1 +AVX_INSTR orps, sse, 1, 0, 1 +AVX_INSTR pabsb, ssse3 +AVX_INSTR pabsd, ssse3 +AVX_INSTR pabsw, ssse3 +AVX_INSTR packsswb, mmx, 0, 0, 0 +AVX_INSTR packssdw, mmx, 0, 0, 0 +AVX_INSTR packuswb, mmx, 0, 0, 0 +AVX_INSTR packusdw, sse4, 0, 0, 0 +AVX_INSTR paddb, mmx, 0, 0, 1 +AVX_INSTR paddw, mmx, 0, 0, 1 +AVX_INSTR paddd, mmx, 0, 0, 1 +AVX_INSTR paddq, sse2, 0, 0, 1 +AVX_INSTR paddsb, mmx, 0, 0, 1 +AVX_INSTR paddsw, mmx, 0, 0, 1 +AVX_INSTR paddusb, mmx, 0, 0, 1 +AVX_INSTR paddusw, mmx, 0, 0, 1 +AVX_INSTR palignr, ssse3 +AVX_INSTR pand, mmx, 0, 0, 1 +AVX_INSTR pandn, mmx, 0, 0, 0 +AVX_INSTR pavgb, mmx2, 0, 0, 1 +AVX_INSTR pavgw, mmx2, 0, 0, 1 +AVX_INSTR pblendvb, sse4, 0, 0, 0 +AVX_INSTR pblendw, sse4 +AVX_INSTR pclmulqdq +AVX_INSTR pcmpestri, sse42 +AVX_INSTR pcmpestrm, sse42 +AVX_INSTR pcmpistri, sse42 +AVX_INSTR pcmpistrm, sse42 +AVX_INSTR pcmpeqb, mmx, 0, 0, 1 +AVX_INSTR pcmpeqw, mmx, 0, 0, 1 +AVX_INSTR pcmpeqd, mmx, 0, 0, 1 +AVX_INSTR pcmpeqq, sse4, 0, 0, 1 +AVX_INSTR pcmpgtb, mmx, 0, 0, 0 +AVX_INSTR pcmpgtw, mmx, 0, 0, 0 +AVX_INSTR pcmpgtd, mmx, 0, 0, 0 +AVX_INSTR pcmpgtq, sse42, 0, 0, 0 +AVX_INSTR pextrb, sse4 +AVX_INSTR pextrd, sse4 +AVX_INSTR pextrq, sse4 +AVX_INSTR pextrw, mmx2 +AVX_INSTR phaddw, ssse3, 0, 0, 0 +AVX_INSTR phaddd, ssse3, 0, 0, 0 +AVX_INSTR phaddsw, ssse3, 0, 0, 0 +AVX_INSTR phminposuw, sse4 +AVX_INSTR phsubw, ssse3, 0, 0, 0 +AVX_INSTR phsubd, ssse3, 0, 0, 0 +AVX_INSTR phsubsw, ssse3, 0, 0, 0 +AVX_INSTR pinsrb, sse4 +AVX_INSTR pinsrd, sse4 +AVX_INSTR pinsrq, sse4 +AVX_INSTR pinsrw, mmx2 +AVX_INSTR pmaddwd, mmx, 0, 0, 1 +AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 +AVX_INSTR pmaxsb, sse4, 0, 0, 1 +AVX_INSTR pmaxsw, mmx2, 0, 0, 1 +AVX_INSTR pmaxsd, sse4, 0, 0, 1 +AVX_INSTR pmaxub, mmx2, 0, 0, 1 +AVX_INSTR pmaxuw, sse4, 0, 0, 1 +AVX_INSTR pmaxud, sse4, 0, 0, 1 +AVX_INSTR pminsb, sse4, 0, 0, 1 +AVX_INSTR pminsw, mmx2, 0, 0, 1 +AVX_INSTR pminsd, sse4, 0, 0, 1 +AVX_INSTR pminub, mmx2, 0, 0, 1 +AVX_INSTR pminuw, sse4, 0, 0, 1 +AVX_INSTR pminud, sse4, 0, 0, 1 +AVX_INSTR pmovmskb, mmx2 +AVX_INSTR pmovsxbw, sse4 +AVX_INSTR pmovsxbd, sse4 +AVX_INSTR pmovsxbq, sse4 +AVX_INSTR pmovsxwd, sse4 +AVX_INSTR pmovsxwq, sse4 +AVX_INSTR pmovsxdq, sse4 +AVX_INSTR pmovzxbw, sse4 +AVX_INSTR pmovzxbd, sse4 +AVX_INSTR pmovzxbq, sse4 +AVX_INSTR pmovzxwd, sse4 +AVX_INSTR pmovzxwq, sse4 +AVX_INSTR pmovzxdq, sse4 +AVX_INSTR pmuldq, sse4, 0, 0, 1 +AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 +AVX_INSTR pmulhuw, mmx2, 0, 0, 1 +AVX_INSTR pmulhw, mmx, 0, 0, 1 +AVX_INSTR pmullw, mmx, 0, 0, 1 +AVX_INSTR pmulld, sse4, 0, 0, 1 +AVX_INSTR pmuludq, sse2, 0, 0, 1 +AVX_INSTR por, mmx, 0, 0, 1 +AVX_INSTR psadbw, mmx2, 0, 0, 1 +AVX_INSTR pshufb, ssse3, 0, 0, 0 +AVX_INSTR pshufd, sse2 +AVX_INSTR pshufhw, sse2 +AVX_INSTR pshuflw, sse2 +AVX_INSTR psignb, ssse3, 0, 0, 0 +AVX_INSTR psignw, ssse3, 0, 0, 0 +AVX_INSTR psignd, ssse3, 0, 0, 0 +AVX_INSTR psllw, mmx, 0, 0, 0 +AVX_INSTR pslld, mmx, 0, 0, 0 +AVX_INSTR psllq, mmx, 0, 0, 0 +AVX_INSTR pslldq, sse2, 0, 0, 0 +AVX_INSTR psraw, mmx, 0, 0, 0 +AVX_INSTR psrad, mmx, 0, 0, 0 +AVX_INSTR psrlw, mmx, 0, 0, 0 +AVX_INSTR psrld, mmx, 0, 0, 
0 +AVX_INSTR psrlq, mmx, 0, 0, 0 +AVX_INSTR psrldq, sse2, 0, 0, 0 +AVX_INSTR psubb, mmx, 0, 0, 0 +AVX_INSTR psubw, mmx, 0, 0, 0 +AVX_INSTR psubd, mmx, 0, 0, 0 +AVX_INSTR psubq, sse2, 0, 0, 0 +AVX_INSTR psubsb, mmx, 0, 0, 0 +AVX_INSTR psubsw, mmx, 0, 0, 0 +AVX_INSTR psubusb, mmx, 0, 0, 0 +AVX_INSTR psubusw, mmx, 0, 0, 0 +AVX_INSTR ptest, sse4 +AVX_INSTR punpckhbw, mmx, 0, 0, 0 +AVX_INSTR punpckhwd, mmx, 0, 0, 0 +AVX_INSTR punpckhdq, mmx, 0, 0, 0 +AVX_INSTR punpckhqdq, sse2, 0, 0, 0 +AVX_INSTR punpcklbw, mmx, 0, 0, 0 +AVX_INSTR punpcklwd, mmx, 0, 0, 0 +AVX_INSTR punpckldq, mmx, 0, 0, 0 +AVX_INSTR punpcklqdq, sse2, 0, 0, 0 +AVX_INSTR pxor, mmx, 0, 0, 1 +AVX_INSTR rcpps, sse, 1, 0, 0 +AVX_INSTR rcpss, sse, 1, 0, 0 +AVX_INSTR roundpd, sse4 +AVX_INSTR roundps, sse4 +AVX_INSTR roundsd, sse4 +AVX_INSTR roundss, sse4 +AVX_INSTR rsqrtps, sse, 1, 0, 0 +AVX_INSTR rsqrtss, sse, 1, 0, 0 +AVX_INSTR shufpd, sse2, 1, 1, 0 +AVX_INSTR shufps, sse, 1, 1, 0 +AVX_INSTR sqrtpd, sse2, 1, 0, 0 +AVX_INSTR sqrtps, sse, 1, 0, 0 +AVX_INSTR sqrtsd, sse2, 1, 0, 0 +AVX_INSTR sqrtss, sse, 1, 0, 0 +AVX_INSTR stmxcsr, sse +AVX_INSTR subpd, sse2, 1, 0, 0 +AVX_INSTR subps, sse, 1, 0, 0 +AVX_INSTR subsd, sse2, 1, 0, 0 +AVX_INSTR subss, sse, 1, 0, 0 +AVX_INSTR ucomisd, sse2 +AVX_INSTR ucomiss, sse +AVX_INSTR unpckhpd, sse2, 1, 0, 0 +AVX_INSTR unpckhps, sse, 1, 0, 0 +AVX_INSTR unpcklpd, sse2, 1, 0, 0 +AVX_INSTR unpcklps, sse, 1, 0, 0 +AVX_INSTR xorpd, sse2, 1, 0, 1 +AVX_INSTR xorps, sse, 1, 0, 1 + +; 3DNow instructions, for sharing code between AVX, SSE and 3DN +AVX_INSTR pfadd, 3dnow, 1, 0, 1 +AVX_INSTR pfsub, 3dnow, 1, 0, 0 +AVX_INSTR pfmul, 3dnow, 1, 0, 1 + +; base-4 constants for shuffles +%assign i 0 +%rep 256 + %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) + %if j < 10 + CAT_XDEFINE q000, j, i + %elif j < 100 + CAT_XDEFINE q00, j, i + %elif j < 1000 + CAT_XDEFINE q0, j, i + %else + CAT_XDEFINE q, j, i + %endif + %assign i i+1 +%endrep +%undef i +%undef j + +%macro FMA_INSTR 3 + %macro %1 4-7 %1, %2, %3 + %if cpuflag(xop) + v%5 %1, %2, %3, %4 + %elifnidn %1, %4 + %6 %1, %2, %3 + %7 %1, %4 + %else + %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmadcswd, pmaddwd, paddd + +; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. +; FMA3 is only possible if dst is the same as one of the src registers. +; Either src2 or src3 can be a memory operand. +%macro FMA4_INSTR 2-* + %push fma4_instr + %xdefine %$prefix %1 + %rep %0 - 1 + %macro %$prefix%2 4-6 %$prefix, %2 + %if notcpuflag(fma3) && notcpuflag(fma4) + %error use of ``%5%6'' fma instruction in cpuname function: current_function + %elif cpuflag(fma4) + v%5%6 %1, %2, %3, %4 + %elifidn %1, %2 + ; If %3 or %4 is a memory operand it needs to be encoded as the last operand. 
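+      ; Illustrative expansions of the cases below (a sketch, assuming the
+      ; call "fmaddps m0, m0, m1, m2"):
+      ;   FMA4:                    vfmaddps    m0, m0, m1, m2
+      ;   FMA3, %3 is a register:  vfmadd213ps m0, m1, m2   ; m0 = m1*m0 + m2
+      ;   FMA3, %3 is in memory:   vfmadd132ps m0, m2, mem  ; m0 = m0*mem + m2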
+ %ifid %3 + v%{5}213%6 %2, %3, %4 + %else + v%{5}132%6 %2, %4, %3 + %endif + %elifidn %1, %3 + v%{5}213%6 %3, %2, %4 + %elifidn %1, %4 + v%{5}231%6 %4, %2, %3 + %else + %error fma3 emulation of ``%5%6 %1, %2, %3, %4'' is not supported + %endif + %endmacro + %rotate 1 + %endrep + %pop +%endmacro + +FMA4_INSTR fmadd, pd, ps, sd, ss +FMA4_INSTR fmaddsub, pd, ps +FMA4_INSTR fmsub, pd, ps, sd, ss +FMA4_INSTR fmsubadd, pd, ps +FMA4_INSTR fnmadd, pd, ps, sd, ss +FMA4_INSTR fnmsub, pd, ps, sd, ss + +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) +%ifdef __YASM_VER__ + %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 + %macro vpbroadcastq 2 + %if sizeof%1 == 16 + movddup %1, %2 + %else + vbroadcastsd %1, %2 + %endif + %endmacro + %endif +%endif diff --git a/libs/libaom/src/tools/aggregate_entropy_stats.py b/libs/libaom/src/tools/aggregate_entropy_stats.py new file mode 100644 index 000000000..7cb4d18e1 --- /dev/null +++ b/libs/libaom/src/tools/aggregate_entropy_stats.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +"""Aggregate multiple entropy stats output which is written in 32-bit int. + +python ./aggregate_entropy_stats.py [dir of stats files] [keyword of filenames] + [filename of final stats] +""" + +__author__ = "yuec@google.com" + +import os +import sys +import numpy as np + +def main(): + dir = sys.argv[1] + sum = [] + for fn in os.listdir(dir): + if sys.argv[2] in fn: + stats = np.fromfile(dir + fn, dtype=np.int32) + if len(sum) == 0: + sum = stats + else: + sum = np.add(sum, stats) + if len(sum) == 0: + print("No stats file is found. Double-check directory and keyword?") + else: + sum.tofile(dir+sys.argv[3]) + +if __name__ == '__main__': + main() diff --git a/libs/libaom/src/tools/aom_entropy_optimizer.c b/libs/libaom/src/tools/aom_entropy_optimizer.c new file mode 100644 index 000000000..9f529d9ab --- /dev/null +++ b/libs/libaom/src/tools/aom_entropy_optimizer.c @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +// This tool is a gadget for offline probability training. +// A binary executable aom_entropy_optimizer will be generated in tools/. It +// parses a binary file consisting of counts written in the format of +// FRAME_COUNTS in entropymode.h, and computes optimized probability tables +// and CDF tables, which will be written to a new c file optimized_probs.c +// according to format in the codebase. 
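+//
+// As a rough worked example (illustrative numbers, not taken from a real
+// stats file): counts_to_cdf() below applies +1 smoothing to each count and
+// scales the running sum to CDF_PROB_TOP (32768). For a 3-symbol bin with
+// counts { 6, 2, 2 }, the smoothed cumulative sums are { 7, 10, 13 }, so
+//   cdf[0] = (7 * 32768 + 6) / 13 = 17644
+//   cdf[1] = (10 * 32768 + 6) / 13 = 25206
+// and the emitted entry is AOM_CDF3(17644, 25206); the final CDF_PROB_TOP
+// value is implicit in the AOM_CDF macros, and the AOMMIN/AOMMAX clamps only
+// act to keep a gap of at least 4 between neighboring entries.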
+// +// Command line: ./aom_entropy_optimizer [directory of the count file] +// +// The input file can either be generated by encoding a single clip with the +// entropy_stats experiment turned on, or be collected at a larger scale, in +// which case the aggregate_entropy_stats.py script in this directory can be +// used to aggregate multiple stats outputs. + +#include <assert.h> +#include <stdio.h> + +#include "config/aom_config.h" + +#include "av1/encoder/encoder.h" + +#define SPACES_PER_TAB 2 +#define CDF_MAX_SIZE 16 + +typedef unsigned int aom_count_type; +// A log file recording parsed counts +static FILE *logfile; // TODO(yuec): make it a command line option + +static void counts_to_cdf(const aom_count_type *counts, aom_cdf_prob *cdf, + int modes) { + int64_t csum[CDF_MAX_SIZE]; + assert(modes <= CDF_MAX_SIZE); + + csum[0] = counts[0] + 1; + for (int i = 1; i < modes; ++i) csum[i] = counts[i] + 1 + csum[i - 1]; + + for (int i = 0; i < modes; ++i) fprintf(logfile, "%d ", counts[i]); + fprintf(logfile, "\n"); + + int64_t sum = csum[modes - 1]; + const int64_t round_shift = sum >> 1; + for (int i = 0; i < modes; ++i) { + cdf[i] = (csum[i] * CDF_PROB_TOP + round_shift) / sum; + cdf[i] = AOMMIN(cdf[i], CDF_PROB_TOP - (modes - 1 + i) * 4); + cdf[i] = (i == 0) ? AOMMAX(cdf[i], 4) : AOMMAX(cdf[i], cdf[i - 1] + 4); + } +} + +static int parse_counts_for_cdf_opt(aom_count_type **ct_ptr, + FILE *const probsfile, int tabs, + int dim_of_cts, int *cts_each_dim) { + if (dim_of_cts < 1) { + fprintf(stderr, "The dimension of a counts vector should be at least 1!\n"); + return 1; + } + const int total_modes = cts_each_dim[0]; + if (dim_of_cts == 1) { + assert(total_modes <= CDF_MAX_SIZE); + aom_cdf_prob cdfs[CDF_MAX_SIZE]; + aom_count_type *counts1d = *ct_ptr; + + counts_to_cdf(counts1d, cdfs, total_modes); + (*ct_ptr) += total_modes; + + if (tabs > 0) fprintf(probsfile, "%*c", tabs * SPACES_PER_TAB, ' '); + fprintf(probsfile, "AOM_CDF%d(", total_modes); + for (int k = 0; k < total_modes - 1; ++k) { + fprintf(probsfile, "%d", cdfs[k]); + if (k < total_modes - 2) fprintf(probsfile, ", "); + } + fprintf(probsfile, ")"); + } else { + for (int k = 0; k < total_modes; ++k) { + int tabs_next_level; + + if (dim_of_cts == 2) + fprintf(probsfile, "%*c{ ", tabs * SPACES_PER_TAB, ' '); + else + fprintf(probsfile, "%*c{\n", tabs * SPACES_PER_TAB, ' '); + tabs_next_level = dim_of_cts == 2 ?
0 : tabs + 1; + + if (parse_counts_for_cdf_opt(ct_ptr, probsfile, tabs_next_level, + dim_of_cts - 1, cts_each_dim + 1)) { + return 1; + } + + if (dim_of_cts == 2) { + if (k == total_modes - 1) + fprintf(probsfile, " }\n"); + else + fprintf(probsfile, " },\n"); + } else { + if (k == total_modes - 1) + fprintf(probsfile, "%*c}\n", tabs * SPACES_PER_TAB, ' '); + else + fprintf(probsfile, "%*c},\n", tabs * SPACES_PER_TAB, ' '); + } + } + } + return 0; +} + +static void optimize_cdf_table(aom_count_type *counts, FILE *const probsfile, + int dim_of_cts, int *cts_each_dim, + char *prefix) { + aom_count_type *ct_ptr = counts; + + fprintf(probsfile, "%s = {\n", prefix); + fprintf(logfile, "%s\n", prefix); + if (parse_counts_for_cdf_opt(&ct_ptr, probsfile, 1, dim_of_cts, + cts_each_dim)) { + fprintf(probsfile, "Optimizer failed!\n"); + } + fprintf(probsfile, "};\n\n"); + fprintf(logfile, "============================\n"); +} + +static void optimize_uv_mode(aom_count_type *counts, FILE *const probsfile, + int dim_of_cts, int *cts_each_dim, char *prefix) { + aom_count_type *ct_ptr = counts; + + fprintf(probsfile, "%s = {\n", prefix); + fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' '); + fprintf(logfile, "%s\n", prefix); + cts_each_dim[2] = UV_INTRA_MODES - 1; + for (int k = 0; k < cts_each_dim[1]; ++k) { + fprintf(probsfile, "%*c{ ", 2 * SPACES_PER_TAB, ' '); + parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, dim_of_cts - 2, + cts_each_dim + 2); + if (k + 1 == cts_each_dim[1]) { + fprintf(probsfile, " }\n"); + } else { + fprintf(probsfile, " },\n"); + } + ++ct_ptr; + } + fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' '); + fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' '); + cts_each_dim[2] = UV_INTRA_MODES; + parse_counts_for_cdf_opt(&ct_ptr, probsfile, 2, dim_of_cts - 1, + cts_each_dim + 1); + fprintf(probsfile, "%*c}\n", SPACES_PER_TAB, ' '); + fprintf(probsfile, "};\n\n"); + fprintf(logfile, "============================\n"); +} + +static void optimize_cdf_table_var_modes_2d(aom_count_type *counts, + FILE *const probsfile, + int dim_of_cts, int *cts_each_dim, + int *modes_each_ctx, char *prefix) { + aom_count_type *ct_ptr = counts; + + assert(dim_of_cts == 2); + (void)dim_of_cts; + + fprintf(probsfile, "%s = {\n", prefix); + fprintf(logfile, "%s\n", prefix); + + for (int d0_idx = 0; d0_idx < cts_each_dim[0]; ++d0_idx) { + int num_of_modes = modes_each_ctx[d0_idx]; + + if (num_of_modes > 0) { + fprintf(probsfile, "%*c{ ", SPACES_PER_TAB, ' '); + parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, 1, &num_of_modes); + ct_ptr += cts_each_dim[1] - num_of_modes; + fprintf(probsfile, " },\n"); + } else { + fprintf(probsfile, "%*c{ 0 },\n", SPACES_PER_TAB, ' '); + fprintf(logfile, "dummy cdf, no need to optimize\n"); + ct_ptr += cts_each_dim[1]; + } + } + fprintf(probsfile, "};\n\n"); + fprintf(logfile, "============================\n"); +} + +static void optimize_cdf_table_var_modes_3d(aom_count_type *counts, + FILE *const probsfile, + int dim_of_cts, int *cts_each_dim, + int *modes_each_ctx, char *prefix) { + aom_count_type *ct_ptr = counts; + + assert(dim_of_cts == 3); + (void)dim_of_cts; + + fprintf(probsfile, "%s = {\n", prefix); + fprintf(logfile, "%s\n", prefix); + + for (int d0_idx = 0; d0_idx < cts_each_dim[0]; ++d0_idx) { + fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' '); + for (int d1_idx = 0; d1_idx < cts_each_dim[1]; ++d1_idx) { + int num_of_modes = modes_each_ctx[d0_idx]; + + if (num_of_modes > 0) { + fprintf(probsfile, "%*c{ ", 2 * SPACES_PER_TAB, ' '); + parse_counts_for_cdf_opt(&ct_ptr, 
probsfile, 0, 1, &num_of_modes); + ct_ptr += cts_each_dim[2] - num_of_modes; + fprintf(probsfile, " },\n"); + } else { + fprintf(probsfile, "%*c{ 0 },\n", 2 * SPACES_PER_TAB, ' '); + fprintf(logfile, "dummy cdf, no need to optimize\n"); + ct_ptr += cts_each_dim[2]; + } + } + fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' '); + } + fprintf(probsfile, "};\n\n"); + fprintf(logfile, "============================\n"); +} + +static void optimize_cdf_table_var_modes_4d(aom_count_type *counts, + FILE *const probsfile, + int dim_of_cts, int *cts_each_dim, + int *modes_each_ctx, char *prefix) { + aom_count_type *ct_ptr = counts; + + assert(dim_of_cts == 4); + (void)dim_of_cts; + + fprintf(probsfile, "%s = {\n", prefix); + fprintf(logfile, "%s\n", prefix); + + for (int d0_idx = 0; d0_idx < cts_each_dim[0]; ++d0_idx) { + fprintf(probsfile, "%*c{\n", SPACES_PER_TAB, ' '); + for (int d1_idx = 0; d1_idx < cts_each_dim[1]; ++d1_idx) { + fprintf(probsfile, "%*c{\n", 2 * SPACES_PER_TAB, ' '); + for (int d2_idx = 0; d2_idx < cts_each_dim[2]; ++d2_idx) { + int num_of_modes = modes_each_ctx[d0_idx]; + + if (num_of_modes > 0) { + fprintf(probsfile, "%*c{ ", 3 * SPACES_PER_TAB, ' '); + parse_counts_for_cdf_opt(&ct_ptr, probsfile, 0, 1, &num_of_modes); + ct_ptr += cts_each_dim[3] - num_of_modes; + fprintf(probsfile, " },\n"); + } else { + fprintf(probsfile, "%*c{ 0 },\n", 3 * SPACES_PER_TAB, ' '); + fprintf(logfile, "dummy cdf, no need to optimize\n"); + ct_ptr += cts_each_dim[3]; + } + } + fprintf(probsfile, "%*c},\n", 2 * SPACES_PER_TAB, ' '); + } + fprintf(probsfile, "%*c},\n", SPACES_PER_TAB, ' '); + } + fprintf(probsfile, "};\n\n"); + fprintf(logfile, "============================\n"); +} + +int main(int argc, const char **argv) { + if (argc < 2) { + fprintf(stderr, "Please specify the input stats file!\n"); + exit(EXIT_FAILURE); + } + + FILE *const statsfile = fopen(argv[1], "rb"); + if (statsfile == NULL) { + fprintf(stderr, "Failed to open input file!\n"); + exit(EXIT_FAILURE); + } + + FRAME_COUNTS fc; + const size_t bytes = fread(&fc, sizeof(FRAME_COUNTS), 1, statsfile); + if (!bytes) { + fclose(statsfile); + return 1; + } + + FILE *const probsfile = fopen("optimized_probs.c", "w"); + if (probsfile == NULL) { + fprintf(stderr, + "Failed to create output file for optimized entropy tables!\n"); + exit(EXIT_FAILURE); + } + + logfile = fopen("aom_entropy_optimizer_parsed_counts.log", "w"); + if (logfile == NULL) { + fprintf(stderr, "Failed to create log file for parsed counts!\n"); + exit(EXIT_FAILURE); + } + + int cts_each_dim[10]; + + /* Intra mode (keyframe luma) */ + cts_each_dim[0] = KF_MODE_CONTEXTS; + cts_each_dim[1] = KF_MODE_CONTEXTS; + cts_each_dim[2] = INTRA_MODES; + optimize_cdf_table(&fc.kf_y_mode[0][0][0], probsfile, 3, cts_each_dim, + "const aom_cdf_prob\n" + "default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS]" + "[CDF_SIZE(INTRA_MODES)]"); + + cts_each_dim[0] = DIRECTIONAL_MODES; + cts_each_dim[1] = 2 * MAX_ANGLE_DELTA + 1; + optimize_cdf_table(&fc.angle_delta[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob default_angle_delta_cdf" + "[DIRECTIONAL_MODES][CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)]"); + + /* Intra mode (non-keyframe luma) */ + cts_each_dim[0] = BLOCK_SIZE_GROUPS; + cts_each_dim[1] = INTRA_MODES; + optimize_cdf_table( + &fc.y_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)]"); + + /* Intra mode (chroma) */ + cts_each_dim[0] = CFL_ALLOWED_TYPES; + cts_each_dim[1] = 
INTRA_MODES; + cts_each_dim[2] = UV_INTRA_MODES; + optimize_uv_mode(&fc.uv_mode[0][0][0], probsfile, 3, cts_each_dim, + "static const aom_cdf_prob\n" + "default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES]" + "[CDF_SIZE(UV_INTRA_MODES)]"); + + /* block partition */ + cts_each_dim[0] = PARTITION_CONTEXTS; + cts_each_dim[1] = EXT_PARTITION_TYPES; + int part_types_each_ctx[PARTITION_CONTEXTS] = { 4, 4, 4, 4, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, + 10, 10, 8, 8, 8, 8 }; + optimize_cdf_table_var_modes_2d( + &fc.partition[0][0], probsfile, 2, cts_each_dim, part_types_each_ctx, + "static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS]" + "[CDF_SIZE(EXT_PARTITION_TYPES)]"); + + /* tx type */ + cts_each_dim[0] = EXT_TX_SETS_INTRA; + cts_each_dim[1] = EXT_TX_SIZES; + cts_each_dim[2] = INTRA_MODES; + cts_each_dim[3] = TX_TYPES; + int intra_ext_tx_types_each_ctx[EXT_TX_SETS_INTRA] = { 0, 7, 5 }; + optimize_cdf_table_var_modes_4d( + &fc.intra_ext_tx[0][0][0][0], probsfile, 4, cts_each_dim, + intra_ext_tx_types_each_ctx, + "static const aom_cdf_prob default_intra_ext_tx_cdf[EXT_TX_SETS_INTRA]" + "[EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)]"); + + cts_each_dim[0] = EXT_TX_SETS_INTER; + cts_each_dim[1] = EXT_TX_SIZES; + cts_each_dim[2] = TX_TYPES; + int inter_ext_tx_types_each_ctx[EXT_TX_SETS_INTER] = { 0, 16, 12, 2 }; + optimize_cdf_table_var_modes_3d( + &fc.inter_ext_tx[0][0][0], probsfile, 3, cts_each_dim, + inter_ext_tx_types_each_ctx, + "static const aom_cdf_prob default_inter_ext_tx_cdf[EXT_TX_SETS_INTER]" + "[EXT_TX_SIZES][CDF_SIZE(TX_TYPES)]"); + + /* Chroma from Luma */ + cts_each_dim[0] = CFL_JOINT_SIGNS; + optimize_cdf_table(&fc.cfl_sign[0], probsfile, 1, cts_each_dim, + "static const aom_cdf_prob\n" + "default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)]"); + cts_each_dim[0] = CFL_ALPHA_CONTEXTS; + cts_each_dim[1] = CFL_ALPHABET_SIZE; + optimize_cdf_table(&fc.cfl_alpha[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS]" + "[CDF_SIZE(CFL_ALPHABET_SIZE)]"); + + /* Interpolation filter */ + cts_each_dim[0] = SWITCHABLE_FILTER_CONTEXTS; + cts_each_dim[1] = SWITCHABLE_FILTERS; + optimize_cdf_table(&fc.switchable_interp[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS]" + "[CDF_SIZE(SWITCHABLE_FILTERS)]"); + + /* Motion vector referencing */ + cts_each_dim[0] = NEWMV_MODE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.newmv_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = GLOBALMV_MODE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.zeromv_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = REFMV_MODE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.refmv_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = DRL_MODE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.drl_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)]"); + + /* ext_inter experiment */ + /* New compound mode */ + cts_each_dim[0] = INTER_MODE_CONTEXTS; + cts_each_dim[1] = INTER_COMPOUND_MODES; + optimize_cdf_table(&fc.inter_compound_mode[0][0], probsfile, 2, cts_each_dim, + 
"static const aom_cdf_prob\n" + "default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_" + "SIZE(INTER_COMPOUND_MODES)]"); + + /* Interintra */ + cts_each_dim[0] = BLOCK_SIZE_GROUPS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.interintra[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]"); + + cts_each_dim[0] = BLOCK_SIZE_GROUPS; + cts_each_dim[1] = INTERINTRA_MODES; + optimize_cdf_table(&fc.interintra_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(" + "INTERINTRA_MODES)]"); + + cts_each_dim[0] = BLOCK_SIZES_ALL; + cts_each_dim[1] = 2; + optimize_cdf_table( + &fc.wedge_interintra[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]"); + + /* Compound type */ + cts_each_dim[0] = BLOCK_SIZES_ALL; + cts_each_dim[1] = COMPOUND_TYPES - 1; + optimize_cdf_table(&fc.compound_type[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob default_compound_type_cdf" + "[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)]"); + + cts_each_dim[0] = BLOCK_SIZES_ALL; + cts_each_dim[1] = 16; + optimize_cdf_table(&fc.wedge_idx[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]"); + + /* motion_var and warped_motion experiments */ + cts_each_dim[0] = BLOCK_SIZES_ALL; + cts_each_dim[1] = MOTION_MODES; + optimize_cdf_table( + &fc.motion_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)]"); + cts_each_dim[0] = BLOCK_SIZES_ALL; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.obmc[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]"); + + /* Intra/inter flag */ + cts_each_dim[0] = INTRA_INTER_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table( + &fc.intra_inter[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)]"); + + /* Single/comp ref flag */ + cts_each_dim[0] = COMP_INTER_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table( + &fc.comp_inter[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)]"); + + /* ext_comp_refs experiment */ + cts_each_dim[0] = COMP_REF_TYPE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table( + &fc.comp_ref_type[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = UNI_COMP_REF_CONTEXTS; + cts_each_dim[1] = UNIDIR_COMP_REFS - 1; + cts_each_dim[2] = 2; + optimize_cdf_table(&fc.uni_comp_ref[0][0][0], probsfile, 3, cts_each_dim, + "static const aom_cdf_prob\n" + "default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_" + "COMP_REFS - 1][CDF_SIZE(2)]"); + + /* Reference frame (single ref) */ + cts_each_dim[0] = REF_CONTEXTS; + cts_each_dim[1] = SINGLE_REFS - 1; + cts_each_dim[2] = 2; + optimize_cdf_table( + &fc.single_ref[0][0][0], probsfile, 3, cts_each_dim, + "static const aom_cdf_prob\n" + "default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)]"); + + /* ext_refs experiment */ + cts_each_dim[0] = REF_CONTEXTS; + cts_each_dim[1] = FWD_REFS - 1; + cts_each_dim[2] = 2; + optimize_cdf_table( + &fc.comp_ref[0][0][0], probsfile, 3, cts_each_dim, + "static const 
aom_cdf_prob\n" + "default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)]"); + + cts_each_dim[0] = REF_CONTEXTS; + cts_each_dim[1] = BWD_REFS - 1; + cts_each_dim[2] = 2; + optimize_cdf_table( + &fc.comp_bwdref[0][0][0], probsfile, 3, cts_each_dim, + "static const aom_cdf_prob\n" + "default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)]"); + + /* palette */ + cts_each_dim[0] = PALATTE_BSIZE_CTXS; + cts_each_dim[1] = PALETTE_SIZES; + optimize_cdf_table(&fc.palette_y_size[0][0], probsfile, 2, cts_each_dim, + "const aom_cdf_prob default_palette_y_size_cdf" + "[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]"); + + cts_each_dim[0] = PALATTE_BSIZE_CTXS; + cts_each_dim[1] = PALETTE_SIZES; + optimize_cdf_table(&fc.palette_uv_size[0][0], probsfile, 2, cts_each_dim, + "const aom_cdf_prob default_palette_uv_size_cdf" + "[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)]"); + + cts_each_dim[0] = PALATTE_BSIZE_CTXS; + cts_each_dim[1] = PALETTE_Y_MODE_CONTEXTS; + cts_each_dim[2] = 2; + optimize_cdf_table(&fc.palette_y_mode[0][0][0], probsfile, 3, cts_each_dim, + "const aom_cdf_prob default_palette_y_mode_cdf" + "[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS]" + "[CDF_SIZE(2)]"); + + cts_each_dim[0] = PALETTE_UV_MODE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.palette_uv_mode[0][0], probsfile, 2, cts_each_dim, + "const aom_cdf_prob default_palette_uv_mode_cdf" + "[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = PALETTE_SIZES; + cts_each_dim[1] = PALETTE_COLOR_INDEX_CONTEXTS; + cts_each_dim[2] = PALETTE_COLORS; + int palette_color_indexes_each_ctx[PALETTE_SIZES] = { 2, 3, 4, 5, 6, 7, 8 }; + optimize_cdf_table_var_modes_3d( + &fc.palette_y_color_index[0][0][0], probsfile, 3, cts_each_dim, + palette_color_indexes_each_ctx, + "const aom_cdf_prob default_palette_y_color_index_cdf[PALETTE_SIZES]" + "[PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)]"); + + cts_each_dim[0] = PALETTE_SIZES; + cts_each_dim[1] = PALETTE_COLOR_INDEX_CONTEXTS; + cts_each_dim[2] = PALETTE_COLORS; + optimize_cdf_table_var_modes_3d( + &fc.palette_uv_color_index[0][0][0], probsfile, 3, cts_each_dim, + palette_color_indexes_each_ctx, + "const aom_cdf_prob default_palette_uv_color_index_cdf[PALETTE_SIZES]" + "[PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)]"); + + /* Transform size */ + cts_each_dim[0] = TXFM_PARTITION_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table( + &fc.txfm_partition[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob\n" + "default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)]"); + + /* Skip flag */ + cts_each_dim[0] = SKIP_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.skip[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)]"); + + /* Skip mode flag */ + cts_each_dim[0] = SKIP_MODE_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.skip_mode[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(2)]"); + + /* joint compound flag */ + cts_each_dim[0] = COMP_INDEX_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.compound_index[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob default_compound_idx_cdfs" + "[COMP_INDEX_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = COMP_GROUP_IDX_CONTEXTS; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.comp_group_idx[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob default_comp_group_idx_cdfs" + 
"[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)]"); + + /* intrabc */ + cts_each_dim[0] = 2; + optimize_cdf_table( + &fc.intrabc[0], probsfile, 1, cts_each_dim, + "static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)]"); + + /* filter_intra experiment */ + cts_each_dim[0] = FILTER_INTRA_MODES; + optimize_cdf_table( + &fc.filter_intra_mode[0], probsfile, 1, cts_each_dim, + "static const aom_cdf_prob " + "default_filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)]"); + + cts_each_dim[0] = BLOCK_SIZES_ALL; + cts_each_dim[1] = 2; + optimize_cdf_table(&fc.filter_intra[0][0], probsfile, 2, cts_each_dim, + "static const aom_cdf_prob " + "default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)]"); + + /* restoration type */ + cts_each_dim[0] = RESTORE_SWITCHABLE_TYPES; + optimize_cdf_table(&fc.switchable_restore[0], probsfile, 1, cts_each_dim, + "static const aom_cdf_prob default_switchable_restore_cdf" + "[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)]"); + + cts_each_dim[0] = 2; + optimize_cdf_table(&fc.wiener_restore[0], probsfile, 1, cts_each_dim, + "static const aom_cdf_prob default_wiener_restore_cdf" + "[CDF_SIZE(2)]"); + + cts_each_dim[0] = 2; + optimize_cdf_table(&fc.sgrproj_restore[0], probsfile, 1, cts_each_dim, + "static const aom_cdf_prob default_sgrproj_restore_cdf" + "[CDF_SIZE(2)]"); + + /* intra tx size */ + cts_each_dim[0] = MAX_TX_CATS; + cts_each_dim[1] = TX_SIZE_CONTEXTS; + cts_each_dim[2] = MAX_TX_DEPTH + 1; + int intra_tx_sizes_each_ctx[MAX_TX_CATS] = { 2, 3, 3, 3 }; + optimize_cdf_table_var_modes_3d( + &fc.intra_tx_size[0][0][0], probsfile, 3, cts_each_dim, + intra_tx_sizes_each_ctx, + "static const aom_cdf_prob default_tx_size_cdf" + "[MAX_TX_CATS][TX_SIZE_CONTEXTS][CDF_SIZE(MAX_TX_DEPTH + 1)]"); + + /* transform coding */ + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = TX_SIZES; + cts_each_dim[2] = TXB_SKIP_CONTEXTS; + cts_each_dim[3] = 2; + optimize_cdf_table(&fc.txb_skip[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob " + "av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES]" + "[TXB_SKIP_CONTEXTS][CDF_SIZE(2)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = TX_SIZES; + cts_each_dim[2] = PLANE_TYPES; + cts_each_dim[3] = EOB_COEF_CONTEXTS; + cts_each_dim[4] = 2; + optimize_cdf_table( + &fc.eob_extra[0][0][0][0][0], probsfile, 5, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_extra_cdfs " + "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS]" + "[CDF_SIZE(2)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 5; + optimize_cdf_table(&fc.eob_multi16[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi16_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(5)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 6; + optimize_cdf_table(&fc.eob_multi32[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi32_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(6)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 7; + optimize_cdf_table(&fc.eob_multi64[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi64_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(7)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 8; + 
optimize_cdf_table(&fc.eob_multi128[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi128_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(8)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 9; + optimize_cdf_table(&fc.eob_multi256[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi256_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(9)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 10; + optimize_cdf_table(&fc.eob_multi512[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi512_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(10)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = PLANE_TYPES; + cts_each_dim[2] = 2; + cts_each_dim[3] = 11; + optimize_cdf_table(&fc.eob_multi1024[0][0][0][0], probsfile, 4, cts_each_dim, + "static const aom_cdf_prob av1_default_eob_multi1024_cdfs" + "[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(11)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = TX_SIZES; + cts_each_dim[2] = PLANE_TYPES; + cts_each_dim[3] = LEVEL_CONTEXTS; + cts_each_dim[4] = BR_CDF_SIZE; + optimize_cdf_table(&fc.coeff_lps_multi[0][0][0][0][0], probsfile, 5, + cts_each_dim, + "static const aom_cdf_prob " + "av1_default_coeff_lps_multi_cdfs[TOKEN_CDF_Q_CTXS]" + "[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]" + "[CDF_SIZE(BR_CDF_SIZE)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = TX_SIZES; + cts_each_dim[2] = PLANE_TYPES; + cts_each_dim[3] = SIG_COEF_CONTEXTS; + cts_each_dim[4] = NUM_BASE_LEVELS + 2; + optimize_cdf_table( + &fc.coeff_base_multi[0][0][0][0][0], probsfile, 5, cts_each_dim, + "static const aom_cdf_prob av1_default_coeff_base_multi_cdfs" + "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]" + "[CDF_SIZE(NUM_BASE_LEVELS + 2)]"); + + cts_each_dim[0] = TOKEN_CDF_Q_CTXS; + cts_each_dim[1] = TX_SIZES; + cts_each_dim[2] = PLANE_TYPES; + cts_each_dim[3] = SIG_COEF_CONTEXTS_EOB; + cts_each_dim[4] = NUM_BASE_LEVELS + 1; + optimize_cdf_table( + &fc.coeff_base_eob_multi[0][0][0][0][0], probsfile, 5, cts_each_dim, + "static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs" + "[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB]" + "[CDF_SIZE(NUM_BASE_LEVELS + 1)]"); + + fclose(statsfile); + fclose(logfile); + fclose(probsfile); + + return 0; +} diff --git a/libs/libaom/src/tools/cpplint.py b/libs/libaom/src/tools/cpplint.py new file mode 100644 index 000000000..25fbef73d --- /dev/null +++ b/libs/libaom/src/tools/cpplint.py @@ -0,0 +1,4756 @@ +#!/usr/bin/python +# +# Copyright (c) 2009 Google Inc. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""Does google-lint on c++ files. + +The goal of this script is to identify places in the code that *may* +be in non-compliance with google style. It does not attempt to fix +up these problems -- the point is to educate. It also does not +attempt to find all problems, or to ensure that everything it does +find is legitimately a problem. + +In particular, we can get very confused by /* and // inside strings! +We do a small hack, which is to ignore //'s with "'s after them on the +same line, but it is far from perfect (in either direction). +""" + +import codecs +import copy +import getopt +import math # for log +import os +import re +import sre_compile +import string +import sys +import unicodedata + + +_USAGE = """ +Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] + [--counting=total|toplevel|detailed] [--root=subdir] + [--linelength=digits] + [file] ... + + The style guidelines this tries to follow are those in + http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml + + Every problem is given a confidence score from 1-5, with 5 meaning we are + certain of the problem, and 1 meaning it could be a legitimate construct. + This will miss some errors, and is not a substitute for a code review. + + To suppress false-positive errors of a certain category, add a + 'NOLINT(category)' comment to the line. NOLINT or NOLINT(*) + suppresses errors of all categories on that line. + + The files passed in will be linted; at least one file must be provided. + Default linted extensions are .cc, .cpp, .cu, .cuh and .h. Change the + extensions with the --extensions flag. + + Flags: + + output=vs7 + By default, the output is formatted to ease emacs parsing. Visual Studio + compatible output (vs7) may also be used. Other formats are unsupported. + + verbose=# + Specify a number 0-5 to restrict errors to certain verbosity levels. + + filter=-x,+y,... + Specify a comma-separated list of category-filters to apply: only + error messages whose category names pass the filters will be printed. + (Category names are printed with the message and look like + "[whitespace/indent]".) Filters are evaluated left to right. + "-FOO" and "FOO" mean "do not print categories that start with FOO". + "+FOO" means "do print categories that start with FOO". + + Examples: --filter=-whitespace,+whitespace/braces + --filter=whitespace,runtime/printf,+runtime/printf_format + --filter=-,+build/include_what_you_use + + To see a list of all the categories used in cpplint, pass no arg: + --filter= + + counting=total|toplevel|detailed + The total number of errors found is always printed. If + 'toplevel' is provided, then the count of errors in each of + the top-level categories like 'build' and 'whitespace' will + also be printed.
If 'detailed' is provided, then a count + is provided for each category like 'build/class'. + + root=subdir + The root directory used for deriving the header guard CPP variable. + By default, the header guard CPP variable is calculated as the relative + path to the directory that contains .git, .hg, or .svn. When this flag + is specified, the relative path is calculated from the specified + directory. If the specified directory does not exist, this flag is + ignored. + + Examples: + Assuming that src/.git exists, the header guard CPP variables for + src/chrome/browser/ui/browser.h are: + + No flag => CHROME_BROWSER_UI_BROWSER_H_ + --root=chrome => BROWSER_UI_BROWSER_H_ + --root=chrome/browser => UI_BROWSER_H_ + + linelength=digits + This is the allowed line length for the project. The default value is + 80 characters. + + Examples: + --linelength=120 + + extensions=extension,extension,... + The allowed file extensions that cpplint will check. + + Examples: + --extensions=hpp,cpp +""" + +# We categorize each error message we print. Here are the categories. +# We want an explicit list so we can list them all in cpplint --filter=. +# If you add a new error message with a new category, add it to the list +# here! cpplint_unittest.py should tell you if you forget to do this. +_ERROR_CATEGORIES = [ + 'build/class', + 'build/deprecated', + 'build/endif_comment', + 'build/explicit_make_pair', + 'build/forward_decl', + 'build/header_guard', + 'build/include', + 'build/include_alpha', + 'build/include_order', + 'build/include_what_you_use', + 'build/namespaces', + 'build/printf_format', + 'build/storage_class', + 'legal/copyright', + 'readability/alt_tokens', + 'readability/braces', + 'readability/casting', + 'readability/check', + 'readability/constructors', + 'readability/fn_size', + 'readability/function', + 'readability/multiline_comment', + 'readability/multiline_string', + 'readability/namespace', + 'readability/nolint', + 'readability/nul', + 'readability/streams', + 'readability/todo', + 'readability/utf8', + 'runtime/arrays', + 'runtime/casting', + 'runtime/explicit', + 'runtime/int', + 'runtime/init', + 'runtime/invalid_increment', + 'runtime/member_string_references', + 'runtime/memset', + 'runtime/operator', + 'runtime/printf', + 'runtime/printf_format', + 'runtime/references', + 'runtime/sizeof', + 'runtime/string', + 'runtime/threadsafe_fn', + 'runtime/vlog', + 'whitespace/blank_line', + 'whitespace/braces', + 'whitespace/comma', + 'whitespace/comments', + 'whitespace/empty_conditional_body', + 'whitespace/empty_loop_body', + 'whitespace/end_of_line', + 'whitespace/ending_newline', + 'whitespace/forcolon', + 'whitespace/indent', + 'whitespace/line_length', + 'whitespace/newline', + 'whitespace/operators', + 'whitespace/parens', + 'whitespace/semicolon', + 'whitespace/tab', + 'whitespace/todo' + ] + +# The default state of the category filter. This is overridden by the --filter= +# flag. By default all errors are on, so only add here categories that should be +# off by default (i.e., categories that must be enabled by the --filter= flags). +# All entries here should start with a '-' or '+', as in the --filter= flag. +_DEFAULT_FILTERS = ['-build/include_alpha'] + +# We used to check for high-bit characters, but after much discussion we +# decided those were OK, as long as they were in UTF-8 and didn't represent +# hard-coded international strings, which belong in a separate i18n file.
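+
+# A minimal sketch (an illustrative helper, not used by the linter itself) of
+# how the +/- category filters described above are applied, left to right:
+def _example_is_filtered(category, filters):
+  """Returns True if errors in `category` should be suppressed."""
+  is_filtered = False
+  for one_filter in filters:
+    if one_filter.startswith('-') and category.startswith(one_filter[1:]):
+      is_filtered = True   # '-FOO' stops printing categories starting with FOO
+    elif one_filter.startswith('+') and category.startswith(one_filter[1:]):
+      is_filtered = False  # '+FOO' resumes printing them
+  return is_filtered
+
+# For example, _example_is_filtered('whitespace/indent',
+# ['-whitespace', '+whitespace/braces']) returns True.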
+ + +# C++ headers +_CPP_HEADERS = frozenset([ + # Legacy + 'algobase.h', + 'algo.h', + 'alloc.h', + 'builtinbuf.h', + 'bvector.h', + 'complex.h', + 'defalloc.h', + 'deque.h', + 'editbuf.h', + 'fstream.h', + 'function.h', + 'hash_map', + 'hash_map.h', + 'hash_set', + 'hash_set.h', + 'hashtable.h', + 'heap.h', + 'indstream.h', + 'iomanip.h', + 'iostream.h', + 'istream.h', + 'iterator.h', + 'list.h', + 'map.h', + 'multimap.h', + 'multiset.h', + 'ostream.h', + 'pair.h', + 'parsestream.h', + 'pfstream.h', + 'procbuf.h', + 'pthread_alloc', + 'pthread_alloc.h', + 'rope', + 'rope.h', + 'ropeimpl.h', + 'set.h', + 'slist', + 'slist.h', + 'stack.h', + 'stdiostream.h', + 'stl_alloc.h', + 'stl_relops.h', + 'streambuf.h', + 'stream.h', + 'strfile.h', + 'strstream.h', + 'tempbuf.h', + 'tree.h', + 'type_traits.h', + 'vector.h', + # 17.6.1.2 C++ library headers + 'algorithm', + 'array', + 'atomic', + 'bitset', + 'chrono', + 'codecvt', + 'complex', + 'condition_variable', + 'deque', + 'exception', + 'forward_list', + 'fstream', + 'functional', + 'future', + 'initializer_list', + 'iomanip', + 'ios', + 'iosfwd', + 'iostream', + 'istream', + 'iterator', + 'limits', + 'list', + 'locale', + 'map', + 'memory', + 'mutex', + 'new', + 'numeric', + 'ostream', + 'queue', + 'random', + 'ratio', + 'regex', + 'set', + 'sstream', + 'stack', + 'stdexcept', + 'streambuf', + 'string', + 'strstream', + 'system_error', + 'thread', + 'tuple', + 'typeindex', + 'typeinfo', + 'type_traits', + 'unordered_map', + 'unordered_set', + 'utility', + 'valarray', + 'vector', + # 17.6.1.2 C++ headers for C library facilities + 'cassert', + 'ccomplex', + 'cctype', + 'cerrno', + 'cfenv', + 'cfloat', + 'cinttypes', + 'ciso646', + 'climits', + 'clocale', + 'cmath', + 'csetjmp', + 'csignal', + 'cstdalign', + 'cstdarg', + 'cstdbool', + 'cstddef', + 'cstdint', + 'cstdio', + 'cstdlib', + 'cstring', + 'ctgmath', + 'ctime', + 'cuchar', + 'cwchar', + 'cwctype', + ]) + +# Assertion macros. These are defined in base/logging.h and +# testing/base/gunit.h. Note that the _M versions need to come first +# for substring matching to work. +_CHECK_MACROS = [ + 'DCHECK', 'CHECK', + 'EXPECT_TRUE_M', 'EXPECT_TRUE', + 'ASSERT_TRUE_M', 'ASSERT_TRUE', + 'EXPECT_FALSE_M', 'EXPECT_FALSE', + 'ASSERT_FALSE_M', 'ASSERT_FALSE', + ] + +# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE +_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS]) + +for op, replacement in [('==', 'EQ'), ('!=', 'NE'), + ('>=', 'GE'), ('>', 'GT'), + ('<=', 'LE'), ('<', 'LT')]: + _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement + _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement + +for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), + ('>=', 'LT'), ('>', 'LE'), + ('<=', 'GT'), ('<', 'GE')]: + _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement + _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement + +# Alternative tokens and their replacements. For full list, see section 2.5 +# Alternative tokens [lex.digraph] in the C++ standard. 
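+# For example, "if (x and not y)" would be flagged with the suggested
+# replacement "if (x && !y)" (an illustrative case of the table below).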
+# +# Digraphs (such as '%:') are not included here since it's a mess to +# match those on a word boundary. +_ALT_TOKEN_REPLACEMENT = { + 'and': '&&', + 'bitor': '|', + 'or': '||', + 'xor': '^', + 'compl': '~', + 'bitand': '&', + 'and_eq': '&=', + 'or_eq': '|=', + 'xor_eq': '^=', + 'not': '!', + 'not_eq': '!=' + } + +# Compile regular expression that matches all the above keywords. The "[ =()]" +# bit is meant to avoid matching these keywords outside of boolean expressions. +# +# False positives include C-style multi-line comments and multi-line strings +# but those have always been troublesome for cpplint. +_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile( + r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)') + + +# These constants define types of headers for use with +# _IncludeState.CheckNextIncludeOrder(). +_C_SYS_HEADER = 1 +_CPP_SYS_HEADER = 2 +_LIKELY_MY_HEADER = 3 +_POSSIBLE_MY_HEADER = 4 +_OTHER_HEADER = 5 + +# These constants define the current inline assembly state +_NO_ASM = 0 # Outside of inline assembly block +_INSIDE_ASM = 1 # Inside inline assembly block +_END_ASM = 2 # Last line of inline assembly block +_BLOCK_ASM = 3 # The whole block is an inline assembly block + +# Match start of assembly blocks +_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)' + r'(?:\s+(volatile|__volatile__))?' + r'\s*[{(]') + + +_regexp_compile_cache = {} + +# Finds occurrences of NOLINT or NOLINT(...). +_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?') + +# {str, set(int)}: a map from error categories to sets of linenumbers +# on which those errors are expected and should be suppressed. +_error_suppressions = {} + +# The root directory used for deriving header guard CPP variable. +# This is set by --root flag. +_root = None + +# The allowed line length of files. +# This is set by --linelength flag. +_line_length = 80 + +# The allowed extensions for file names +# This is set by --extensions flag. +_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh']) + +def ParseNolintSuppressions(filename, raw_line, linenum, error): + """Updates the global list of error-suppressions. + + Parses any NOLINT comments on the current line, updating the global + error_suppressions store. Reports an error if the NOLINT comment + was malformed. + + Args: + filename: str, the name of the input file. + raw_line: str, the line of input text, with comments. + linenum: int, the number of the current line. + error: function, an error handler. + """ + # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*). + matched = _RE_SUPPRESSION.search(raw_line) + if matched: + category = matched.group(1) + if category in (None, '(*)'): # => "suppress all" + _error_suppressions.setdefault(None, set()).add(linenum) + else: + if category.startswith('(') and category.endswith(')'): + category = category[1:-1] + if category in _ERROR_CATEGORIES: + _error_suppressions.setdefault(category, set()).add(linenum) + else: + error(filename, linenum, 'readability/nolint', 5, + 'Unknown NOLINT error category: %s' % category) + + +def ResetNolintSuppressions(): + "Resets the set of NOLINT suppressions to empty." + _error_suppressions.clear() + + +def IsErrorSuppressedByNolint(category, linenum): + """Returns true if the specified error category is suppressed on this line. + + Consults the global error_suppressions map populated by + ParseNolintSuppressions/ResetNolintSuppressions. + + Args: + category: str, the category of the error. + linenum: int, the current line number. 
+ Returns: + bool, True iff the error should be suppressed due to a NOLINT comment. + """ + return (linenum in _error_suppressions.get(category, set()) or + linenum in _error_suppressions.get(None, set())) + +def Match(pattern, s): + """Matches the string with the pattern, caching the compiled regexp.""" + # The regexp compilation caching is inlined in both Match and Search for + # performance reasons; factoring it out into a separate function turns out + # to be noticeably expensive. + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].match(s) + + +def ReplaceAll(pattern, rep, s): + """Replaces instances of pattern in a string with a replacement. + + The compiled regex is kept in a cache shared by Match and Search. + + Args: + pattern: regex pattern + rep: replacement text + s: search string + + Returns: + string with replacements made (or original string if no replacements) + """ + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].sub(rep, s) + + +def Search(pattern, s): + """Searches the string for the pattern, caching the compiled regexp.""" + if pattern not in _regexp_compile_cache: + _regexp_compile_cache[pattern] = sre_compile.compile(pattern) + return _regexp_compile_cache[pattern].search(s) + + +class _IncludeState(dict): + """Tracks line numbers for includes, and the order in which includes appear. + + As a dict, an _IncludeState object serves as a mapping between include + filename and line number on which that file was included. + + Call CheckNextIncludeOrder() once for each header in the file, passing + in the type constants defined above. Calls in an illegal order will + raise an _IncludeError with an appropriate error message. + + """ + # self._section will move monotonically through this set. If it ever + # needs to move backwards, CheckNextIncludeOrder will raise an error. + _INITIAL_SECTION = 0 + _MY_H_SECTION = 1 + _C_SECTION = 2 + _CPP_SECTION = 3 + _OTHER_H_SECTION = 4 + + _TYPE_NAMES = { + _C_SYS_HEADER: 'C system header', + _CPP_SYS_HEADER: 'C++ system header', + _LIKELY_MY_HEADER: 'header this file implements', + _POSSIBLE_MY_HEADER: 'header this file may implement', + _OTHER_HEADER: 'other header', + } + _SECTION_NAMES = { + _INITIAL_SECTION: "... nothing. (This can't be an error.)", + _MY_H_SECTION: 'a header this file implements', + _C_SECTION: 'C system header', + _CPP_SECTION: 'C++ system header', + _OTHER_H_SECTION: 'other header', + } + + def __init__(self): + dict.__init__(self) + self.ResetSection() + + def ResetSection(self): + # The name of the current section. + self._section = self._INITIAL_SECTION + # The path of last found header. + self._last_header = '' + + def SetLastHeader(self, header_path): + self._last_header = header_path + + def CanonicalizeAlphabeticalOrder(self, header_path): + """Returns a path canonicalized for alphabetical comparison. + + - replaces "-" with "_" so they both cmp the same. + - removes '-inl' since we don't require them to be after the main header. + - lowercase everything, just in case. + + Args: + header_path: Path to be canonicalized. + + Returns: + Canonicalized path. + """ + return header_path.replace('-inl.h', '.h').replace('-', '_').lower() + + def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): + """Check if a header is in alphabetical order with the previous header. 
+ + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + header_path: Canonicalized header to be checked. + + Returns: + Returns true if the header is in alphabetical order. + """ + # If previous section is different from current section, _last_header will + # be reset to empty string, so it's always less than current header. + # + # If previous line was a blank line, assume that the headers are + # intentionally sorted the way they are. + if (self._last_header > header_path and + not Match(r'^\s*$', clean_lines.elided[linenum - 1])): + return False + return True + + def CheckNextIncludeOrder(self, header_type): + """Returns a non-empty error message if the next header is out of order. + + This function also updates the internal state to be ready to check + the next include. + + Args: + header_type: One of the _XXX_HEADER constants defined above. + + Returns: + The empty string if the header is in the right order, or an + error message describing what's wrong. + + """ + error_message = ('Found %s after %s' % + (self._TYPE_NAMES[header_type], + self._SECTION_NAMES[self._section])) + + last_section = self._section + + if header_type == _C_SYS_HEADER: + if self._section <= self._C_SECTION: + self._section = self._C_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _CPP_SYS_HEADER: + if self._section <= self._CPP_SECTION: + self._section = self._CPP_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _LIKELY_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + self._section = self._OTHER_H_SECTION + elif header_type == _POSSIBLE_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + # This will always be the fallback because we're not sure + # enough that the header is associated with this file. + self._section = self._OTHER_H_SECTION + else: + assert header_type == _OTHER_HEADER + self._section = self._OTHER_H_SECTION + + if last_section != self._section: + self._last_header = '' + + return '' + + +class _CppLintState(object): + """Maintains module-wide state..""" + + def __init__(self): + self.verbose_level = 1 # global setting. + self.error_count = 0 # global count of reported errors + # filters to apply when emitting error messages + self.filters = _DEFAULT_FILTERS[:] + self.counting = 'total' # In what way are we counting errors? + self.errors_by_category = {} # string to int dict storing error counts + + # output format: + # "emacs" - format that emacs can parse (default) + # "vs7" - format that Microsoft Visual Studio 7 can parse + self.output_format = 'emacs' + + def SetOutputFormat(self, output_format): + """Sets the output format for errors.""" + self.output_format = output_format + + def SetVerboseLevel(self, level): + """Sets the module's verbosity, and returns the previous setting.""" + last_verbose_level = self.verbose_level + self.verbose_level = level + return last_verbose_level + + def SetCountingStyle(self, counting_style): + """Sets the module's counting options.""" + self.counting = counting_style + + def SetFilters(self, filters): + """Sets the error-message filters. + + These filters are applied when deciding whether to emit a given + error message. + + Args: + filters: A string of comma-separated filters (eg "+whitespace/indent"). + Each filter should start with + or -; else we die. 
+ + Raises: + ValueError: The comma-separated filters did not all start with '+' or '-'. + E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter" + """ + # Default filters always have less priority than the flag ones. + self.filters = _DEFAULT_FILTERS[:] + for filt in filters.split(','): + clean_filt = filt.strip() + if clean_filt: + self.filters.append(clean_filt) + for filt in self.filters: + if not (filt.startswith('+') or filt.startswith('-')): + raise ValueError('Every filter in --filters must start with + or -' + ' (%s does not)' % filt) + + def ResetErrorCounts(self): + """Sets the module's error statistic back to zero.""" + self.error_count = 0 + self.errors_by_category = {} + + def IncrementErrorCount(self, category): + """Bumps the module's error statistic.""" + self.error_count += 1 + if self.counting in ('toplevel', 'detailed'): + if self.counting != 'detailed': + category = category.split('/')[0] + if category not in self.errors_by_category: + self.errors_by_category[category] = 0 + self.errors_by_category[category] += 1 + + def PrintErrorCounts(self): + """Print a summary of errors by category, and the total.""" + for category, count in self.errors_by_category.iteritems(): + sys.stderr.write('Category \'%s\' errors found: %d\n' % + (category, count)) + sys.stderr.write('Total errors found: %d\n' % self.error_count) + +_cpplint_state = _CppLintState() + + +def _OutputFormat(): + """Gets the module's output format.""" + return _cpplint_state.output_format + + +def _SetOutputFormat(output_format): + """Sets the module's output format.""" + _cpplint_state.SetOutputFormat(output_format) + + +def _VerboseLevel(): + """Returns the module's verbosity setting.""" + return _cpplint_state.verbose_level + + +def _SetVerboseLevel(level): + """Sets the module's verbosity, and returns the previous setting.""" + return _cpplint_state.SetVerboseLevel(level) + + +def _SetCountingStyle(level): + """Sets the module's counting options.""" + _cpplint_state.SetCountingStyle(level) + + +def _Filters(): + """Returns the module's list of output filters, as a list.""" + return _cpplint_state.filters + + +def _SetFilters(filters): + """Sets the module's error-message filters. + + These filters are applied when deciding whether to emit a given + error message. + + Args: + filters: A string of comma-separated filters (eg "whitespace/indent"). + Each filter should start with + or -; else we die. + """ + _cpplint_state.SetFilters(filters) + + +class _FunctionState(object): + """Tracks current function name and the number of lines in its body.""" + + _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc. + _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER. + + def __init__(self): + self.in_a_function = False + self.lines_in_function = 0 + self.current_function = '' + + def Begin(self, function_name): + """Start analyzing function body. + + Args: + function_name: The name of the function being tracked. + """ + self.in_a_function = True + self.lines_in_function = 0 + self.current_function = function_name + + def Count(self): + """Count line in current function body.""" + if self.in_a_function: + self.lines_in_function += 1 + + def Check(self, error, filename, linenum): + """Report if too many lines in function body. + + Args: + error: The function to call with any errors found. + filename: The name of the current file. + linenum: The number of the line to check. 
+ """ + if Match(r'T(EST|est)', self.current_function): + base_trigger = self._TEST_TRIGGER + else: + base_trigger = self._NORMAL_TRIGGER + trigger = base_trigger * 2**_VerboseLevel() + + if self.lines_in_function > trigger: + error_level = int(math.log(self.lines_in_function / base_trigger, 2)) + # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... + if error_level > 5: + error_level = 5 + error(filename, linenum, 'readability/fn_size', error_level, + 'Small and focused functions are preferred:' + ' %s has %d non-comment lines' + ' (error triggered by exceeding %d lines).' % ( + self.current_function, self.lines_in_function, trigger)) + + def End(self): + """Stop analyzing function body.""" + self.in_a_function = False + + +class _IncludeError(Exception): + """Indicates a problem with the include order in a file.""" + pass + + +class FileInfo: + """Provides utility functions for filenames. + + FileInfo provides easy access to the components of a file's path + relative to the project root. + """ + + def __init__(self, filename): + self._filename = filename + + def FullName(self): + """Make Windows paths like Unix.""" + return os.path.abspath(self._filename).replace('\\', '/') + + def RepositoryName(self): + """FullName after removing the local path to the repository. + + If we have a real absolute path name here we can try to do something smart: + detecting the root of the checkout and truncating /path/to/checkout from + the name so that we get header guards that don't include things like + "C:\Documents and Settings\..." or "/home/username/..." in them and thus + people on different computers who have checked the source out to different + locations won't see bogus errors. + """ + fullname = self.FullName() + + if os.path.exists(fullname): + project_dir = os.path.dirname(fullname) + + if os.path.exists(os.path.join(project_dir, ".svn")): + # If there's a .svn file in the current directory, we recursively look + # up the directory tree for the top of the SVN checkout + root_dir = project_dir + one_up_dir = os.path.dirname(root_dir) + while os.path.exists(os.path.join(one_up_dir, ".svn")): + root_dir = os.path.dirname(root_dir) + one_up_dir = os.path.dirname(one_up_dir) + + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by + # searching up from the current path. + root_dir = os.path.dirname(fullname) + while (root_dir != os.path.dirname(root_dir) and + not os.path.exists(os.path.join(root_dir, ".git")) and + not os.path.exists(os.path.join(root_dir, ".hg")) and + not os.path.exists(os.path.join(root_dir, ".svn"))): + root_dir = os.path.dirname(root_dir) + + if (os.path.exists(os.path.join(root_dir, ".git")) or + os.path.exists(os.path.join(root_dir, ".hg")) or + os.path.exists(os.path.join(root_dir, ".svn"))): + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Don't know what to do; header guard warnings may be wrong... + return fullname + + def Split(self): + """Splits the file into the directory, basename, and extension. + + For 'chrome/browser/browser.cc', Split() would + return ('chrome/browser', 'browser', '.cc') + + Returns: + A tuple of (directory, basename, extension). 
+ """ + + googlename = self.RepositoryName() + project, rest = os.path.split(googlename) + return (project,) + os.path.splitext(rest) + + def BaseName(self): + """File base name - text after the final slash, before the final period.""" + return self.Split()[1] + + def Extension(self): + """File extension - text following the final period.""" + return self.Split()[2] + + def NoExtension(self): + """File has no source file extension.""" + return '/'.join(self.Split()[0:2]) + + def IsSource(self): + """File has a source file extension.""" + return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') + + +def _ShouldPrintError(category, confidence, linenum): + """If confidence >= verbose, category passes filter and is not suppressed.""" + + # There are three ways we might decide not to print an error message: + # a "NOLINT(category)" comment appears in the source, + # the verbosity level isn't high enough, or the filters filter it out. + if IsErrorSuppressedByNolint(category, linenum): + return False + if confidence < _cpplint_state.verbose_level: + return False + + is_filtered = False + for one_filter in _Filters(): + if one_filter.startswith('-'): + if category.startswith(one_filter[1:]): + is_filtered = True + elif one_filter.startswith('+'): + if category.startswith(one_filter[1:]): + is_filtered = False + else: + assert False # should have been checked for in SetFilter. + if is_filtered: + return False + + return True + + +def Error(filename, linenum, category, confidence, message): + """Logs the fact we've found a lint error. + + We log where the error was found, and also our confidence in the error, + that is, how certain we are this is a legitimate style regression, and + not a misidentification or a use that's sometimes justified. + + False positives can be suppressed by the use of + "cpplint(category)" comments on the offending line. These are + parsed into _error_suppressions. + + Args: + filename: The name of the file containing the error. + linenum: The number of the line containing the error. + category: A string used to describe the "category" this bug + falls under: "whitespace", say, or "runtime". Categories + may have a hierarchy separated by slashes: "whitespace/indent". + confidence: A number from 1-5 representing a confidence score for + the error, with 5 meaning that we are certain of the problem, + and 1 meaning that it could be a legitimate construct. + message: The error message. + """ + if _ShouldPrintError(category, confidence, linenum): + _cpplint_state.IncrementErrorCount(category) + if _cpplint_state.output_format == 'vs7': + sys.stderr.write('%s(%s): %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + elif _cpplint_state.output_format == 'eclipse': + sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + else: + sys.stderr.write('%s:%s: %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + + +# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard. +_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile( + r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)') +# Matches strings. Escape codes should already be removed by ESCAPES. +_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"') +# Matches characters. Escape codes should already be removed by ESCAPES. +_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'") +# Matches multi-line C++ comments. 
+# This RE is a little bit more complicated than one might expect, because we
+# have to take care of tools that remove spaces, so that we can handle
+# comments inside statements better.
+# The current rule is: We only clear spaces from both sides when we're at the
+# end of the line. Otherwise, we try to remove spaces from the right side,
+# if this doesn't work we try on left side but only if there's a non-character
+# on the right.
+_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
+    r"""(\s*/\*.*\*/\s*$|
+            /\*.*\*/\s+|
+         \s+/\*.*\*/(?=\W)|
+            /\*.*\*/)""", re.VERBOSE)
+
+
+def IsCppString(line):
+  """Does line terminate such that the next symbol is in a string constant?
+
+  This function does not consider single-line nor multi-line comments.
+
+  Args:
+    line: a partial line of code, from position 0 through n.
+
+  Returns:
+    True, if the next character appended to 'line' would be inside a
+    string constant.
+  """
+
+  line = line.replace(r'\\', 'XX')  # after this, \\" does not match to \"
+  return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1
+
+
+def CleanseRawStrings(raw_lines):
+  """Removes C++11 raw strings from lines.
+
+    Before:
+      static const char kData[] = R"(
+          multi-line string
+          )";
+
+    After:
+      static const char kData[] = ""
+          (replaced by blank line)
+          "";
+
+  Args:
+    raw_lines: list of raw lines.
+
+  Returns:
+    list of lines with C++11 raw strings replaced by empty strings.
+  """
+
+  delimiter = None
+  lines_without_raw_strings = []
+  for line in raw_lines:
+    if delimiter:
+      # Inside a raw string, look for the end
+      end = line.find(delimiter)
+      if end >= 0:
+        # Found the end of the string, match leading space for this
+        # line and resume copying the original lines, and also insert
+        # a "" on the last line.
+        leading_space = Match(r'^(\s*)\S', line)
+        line = leading_space.group(1) + '""' + line[end + len(delimiter):]
+        delimiter = None
+      else:
+        # Haven't found the end yet, append a blank line.
+        line = ''
+
+    else:
+      # Look for beginning of a raw string.
+      # See 2.14.15 [lex.string] for syntax.
+      matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
+      if matched:
+        delimiter = ')' + matched.group(2) + '"'
+
+        end = matched.group(3).find(delimiter)
+        if end >= 0:
+          # Raw string ended on same line
+          line = (matched.group(1) + '""' +
+                  matched.group(3)[end + len(delimiter):])
+          delimiter = None
+        else:
+          # Start of a multi-line raw string
+          line = matched.group(1) + '""'
+
+    lines_without_raw_strings.append(line)
+
+  # TODO(unknown): if delimiter is not None here, we might want to
+  # emit a warning for unterminated string.
+  return lines_without_raw_strings
+
+
+def FindNextMultiLineCommentStart(lines, lineix):
+  """Find the beginning marker for a multiline comment."""
+  while lineix < len(lines):
+    if lines[lineix].strip().startswith('/*'):
+      # Only return this marker if the comment goes beyond this line
+      if lines[lineix].strip().find('*/', 2) < 0:
+        return lineix
+    lineix += 1
+  return len(lines)
+
+
+def FindNextMultiLineCommentEnd(lines, lineix):
+  """We are inside a comment, find the end marker."""
+  while lineix < len(lines):
+    if lines[lineix].strip().endswith('*/'):
+      return lineix
+    lineix += 1
+  return len(lines)
+
+
+def RemoveMultiLineCommentsFromRange(lines, begin, end):
+  """Clears a range of lines for multi-line comments."""
+  # Having // dummy comments makes the lines non-empty, so we will not get
+  # unnecessary blank line warnings later in the code.
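+  # For example (illustrative): a comment spanning three input lines,
+  #   /* a
+  #      b
+  #      c */
+  # becomes three '// dummy' lines, so later checks keep the original
+  # line numbering while no longer seeing the comment text.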
+  for i in range(begin, end):
+    lines[i] = '// dummy'
+
+
+def RemoveMultiLineComments(filename, lines, error):
+  """Removes multiline (c-style) comments from lines."""
+  lineix = 0
+  while lineix < len(lines):
+    lineix_begin = FindNextMultiLineCommentStart(lines, lineix)
+    if lineix_begin >= len(lines):
+      return
+    lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin)
+    if lineix_end >= len(lines):
+      error(filename, lineix_begin + 1, 'readability/multiline_comment', 5,
+            'Could not find end of multi-line comment')
+      return
+    RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1)
+    lineix = lineix_end + 1
+
+
+def CleanseComments(line):
+  """Removes //-comments and single-line C-style /* */ comments.
+
+  Args:
+    line: A line of C++ source.
+
+  Returns:
+    The line with single-line comments removed.
+  """
+  commentpos = line.find('//')
+  if commentpos != -1 and not IsCppString(line[:commentpos]):
+    line = line[:commentpos].rstrip()
+  # get rid of /* ... */
+  return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line)
+
+
+class CleansedLines(object):
+  """Holds 3 copies of all lines with different preprocessing applied to them.
+
+  1) elided member contains lines without strings and comments,
+  2) lines member contains lines without comments, and
+  3) raw_lines member contains all the lines without processing.
+  All these three members are of <type 'list'>, and of the same length.
+  """
+
+  def __init__(self, lines):
+    self.elided = []
+    self.lines = []
+    self.raw_lines = lines
+    self.num_lines = len(lines)
+    self.lines_without_raw_strings = CleanseRawStrings(lines)
+    for linenum in range(len(self.lines_without_raw_strings)):
+      self.lines.append(CleanseComments(
+          self.lines_without_raw_strings[linenum]))
+      elided = self._CollapseStrings(self.lines_without_raw_strings[linenum])
+      self.elided.append(CleanseComments(elided))
+
+  def NumLines(self):
+    """Returns the number of lines represented."""
+    return self.num_lines
+
+  @staticmethod
+  def _CollapseStrings(elided):
+    """Collapses strings and chars on a line to simple "" or '' blocks.
+
+    We nix strings first so we're not fooled by text like '"http://"'
+
+    Args:
+      elided: The line being processed.
+
+    Returns:
+      The line with collapsed strings.
+    """
+    if not _RE_PATTERN_INCLUDE.match(elided):
+      # Remove escaped characters first to make quote/single quote collapsing
+      # basic. Things that look like escaped characters shouldn't occur
+      # outside of strings and chars.
+      elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
+      elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided)
+      elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided)
+    return elided
+
+
+def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
+  """Find the position just after the matching endchar.
+
+  Args:
+    line: a CleansedLines line.
+    startpos: start searching at this position.
+    depth: nesting level at startpos.
+    startchar: expression opening character.
+    endchar: expression closing character.
+
+  Returns:
+    On finding matching endchar: (index just after matching endchar, 0)
+    Otherwise: (-1, new depth at end of this line)
+  """
+  for i in xrange(startpos, len(line)):
+    if line[i] == startchar:
+      depth += 1
+    elif line[i] == endchar:
+      depth -= 1
+      if depth == 0:
+        return (i + 1, 0)
+  return (-1, depth)
+
+
+def CloseExpression(clean_lines, linenum, pos):
+  """If input points to ( or { or [ or <, finds the position that closes it.
+ + If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the + linenum/pos that correspond to the closing of the expression. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: A position on the line. + + Returns: + A tuple (line, linenum, pos) pointer *past* the closing brace, or + (line, len(lines), -1) if we never find a close. Note we ignore + strings and comments when matching; and the line we return is the + 'cleansed' line at linenum. + """ + + line = clean_lines.elided[linenum] + startchar = line[pos] + if startchar not in '({[<': + return (line, clean_lines.NumLines(), -1) + if startchar == '(': endchar = ')' + if startchar == '[': endchar = ']' + if startchar == '{': endchar = '}' + if startchar == '<': endchar = '>' + + # Check first line + (end_pos, num_open) = FindEndOfExpressionInLine( + line, pos, 0, startchar, endchar) + if end_pos > -1: + return (line, linenum, end_pos) + + # Continue scanning forward + while linenum < clean_lines.NumLines() - 1: + linenum += 1 + line = clean_lines.elided[linenum] + (end_pos, num_open) = FindEndOfExpressionInLine( + line, 0, num_open, startchar, endchar) + if end_pos > -1: + return (line, linenum, end_pos) + + # Did not find endchar before end of file, give up + return (line, clean_lines.NumLines(), -1) + + +def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar): + """Find position at the matching startchar. + + This is almost the reverse of FindEndOfExpressionInLine, but note + that the input position and returned position differs by 1. + + Args: + line: a CleansedLines line. + endpos: start searching at this position. + depth: nesting level at endpos. + startchar: expression opening character. + endchar: expression closing character. + + Returns: + On finding matching startchar: (index at matching startchar, 0) + Otherwise: (-1, new depth at beginning of this line) + """ + for i in xrange(endpos, -1, -1): + if line[i] == endchar: + depth += 1 + elif line[i] == startchar: + depth -= 1 + if depth == 0: + return (i, 0) + return (-1, depth) + + +def ReverseCloseExpression(clean_lines, linenum, pos): + """If input points to ) or } or ] or >, finds the position that opens it. + + If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the + linenum/pos that correspond to the opening of the expression. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: A position on the line. + + Returns: + A tuple (line, linenum, pos) pointer *at* the opening brace, or + (line, 0, -1) if we never find the matching opening brace. Note + we ignore strings and comments when matching; and the line we + return is the 'cleansed' line at linenum. 
+  """
+  line = clean_lines.elided[linenum]
+  endchar = line[pos]
+  if endchar not in ')}]>':
+    return (line, 0, -1)
+  if endchar == ')': startchar = '('
+  if endchar == ']': startchar = '['
+  if endchar == '}': startchar = '{'
+  if endchar == '>': startchar = '<'
+
+  # Check last line
+  (start_pos, num_open) = FindStartOfExpressionInLine(
+      line, pos, 0, startchar, endchar)
+  if start_pos > -1:
+    return (line, linenum, start_pos)
+
+  # Continue scanning backward
+  while linenum > 0:
+    linenum -= 1
+    line = clean_lines.elided[linenum]
+    (start_pos, num_open) = FindStartOfExpressionInLine(
+        line, len(line) - 1, num_open, startchar, endchar)
+    if start_pos > -1:
+      return (line, linenum, start_pos)
+
+  # Did not find startchar before beginning of file, give up
+  return (line, 0, -1)
+
+
+def CheckForCopyright(filename, lines, error):
+  """Logs an error if no Copyright message appears at the top of the file."""
+
+  # We'll say it should occur by line 10. Don't forget there's a
+  # dummy line at the front.
+  for line in xrange(1, min(len(lines), 11)):
+    if re.search(r'Copyright', lines[line], re.I): break
+  else:                       # means no copyright line was found
+    error(filename, 0, 'legal/copyright', 5,
+          'No copyright message found. '
+          'You should have a line: "Copyright [year] <Copyright Owner>"')
+
+
+def GetHeaderGuardCPPVariable(filename):
+  """Returns the CPP variable that should be used as a header guard.
+
+  Args:
+    filename: The name of a C++ header file.
+
+  Returns:
+    The CPP variable that should be used as a header guard in the
+    named file.
+
+  """
+
+  # Restores original filename in case that cpplint is invoked from Emacs's
+  # flymake.
+  filename = re.sub(r'_flymake\.h$', '.h', filename)
+  filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
+
+  fileinfo = FileInfo(filename)
+  file_path_from_root = fileinfo.RepositoryName()
+  if _root:
+    file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root)
+  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
+
+
+def CheckForHeaderGuard(filename, lines, error):
+  """Checks that the file contains a header guard.
+
+  Logs an error if no #ifndef header guard is present. For other
+  headers, checks that the full pathname is used.
+
+  Args:
+    filename: The name of the C++ header file.
+    lines: An array of strings, each representing a line of the file.
+    error: The function to call with any errors found.
+  """
+
+  cppvar = GetHeaderGuardCPPVariable(filename)
+
+  ifndef = None
+  ifndef_linenum = 0
+  define = None
+  endif = None
+  endif_linenum = 0
+  for linenum, line in enumerate(lines):
+    linesplit = line.split()
+    if len(linesplit) >= 2:
+      # find the first occurrence of #ifndef and #define, save arg
+      if not ifndef and linesplit[0] == '#ifndef':
+        # set ifndef to the header guard presented on the #ifndef line.
+        ifndef = linesplit[1]
+        ifndef_linenum = linenum
+      if not define and linesplit[0] == '#define':
+        define = linesplit[1]
+    # find the last occurrence of #endif, save entire line
+    if line.startswith('#endif'):
+      endif = line
+      endif_linenum = linenum
+
+  if not ifndef:
+    error(filename, 0, 'build/header_guard', 5,
+          'No #ifndef header guard found, suggested CPP variable is: %s' %
+          cppvar)
+    return
+
+  if not define:
+    error(filename, 0, 'build/header_guard', 5,
+          'No #define header guard found, suggested CPP variable is: %s' %
+          cppvar)
+    return
+
+  # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__
+  # for backward compatibility.
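+  # Worked example (illustrative, hypothetical path): for "aom/aom_codec.h"
+  # with _root unset, GetHeaderGuardCPPVariable returns "AOM_AOM_CODEC_H_";
+  # that guard passes cleanly, and the legacy "AOM_AOM_CODEC_H__" form is
+  # only reported at error level 0.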
+ if ifndef != cppvar: + error_level = 0 + if ifndef != cppvar + '_': + error_level = 5 + + ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum, + error) + error(filename, ifndef_linenum, 'build/header_guard', error_level, + '#ifndef header guard has wrong style, please use: %s' % cppvar) + + if define != ifndef: + error(filename, 0, 'build/header_guard', 5, + '#ifndef and #define don\'t match, suggested CPP variable is: %s' % + cppvar) + return + + if endif != ('#endif // %s' % cppvar): + error_level = 0 + if endif != ('#endif // %s' % (cppvar + '_')): + error_level = 5 + + ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum, + error) + error(filename, endif_linenum, 'build/header_guard', error_level, + '#endif line should be "#endif // %s"' % cppvar) + + +def CheckForBadCharacters(filename, lines, error): + """Logs an error for each line containing bad characters. + + Two kinds of bad characters: + + 1. Unicode replacement characters: These indicate that either the file + contained invalid UTF-8 (likely) or Unicode replacement characters (which + it shouldn't). Note that it's possible for this to throw off line + numbering if the invalid UTF-8 occurred adjacent to a newline. + + 2. NUL bytes. These are problematic for some tools. + + Args: + filename: The name of the current file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + for linenum, line in enumerate(lines): + if u'\ufffd' in line: + error(filename, linenum, 'readability/utf8', 5, + 'Line contains invalid UTF-8 (or Unicode replacement character).') + if '\0' in line: + error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.') + + +def CheckForNewlineAtEOF(filename, lines, error): + """Logs an error if there is no newline char at the end of the file. + + Args: + filename: The name of the current file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + + # The array lines() was created by adding two newlines to the + # original file (go figure), then splitting on \n. + # To verify that the file ends in \n, we just have to make sure the + # last-but-two element of lines() exists and is empty. + if len(lines) < 3 or lines[-2]: + error(filename, len(lines) - 2, 'whitespace/ending_newline', 5, + 'Could not find a newline character at the end of the file.') + + +def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): + """Logs an error if we see /* ... */ or "..." that extend past one line. + + /* ... */ comments are legit inside macros, for one line. + Otherwise, we prefer // comments, so it's ok to warn about the + other. Likewise, it's ok for strings to extend across multiple + lines, as long as a line continuation character (backslash) + terminates each line. Although not currently prohibited by the C++ + style guide, it's ugly and unnecessary. We don't do well with either + in this lint program, so we warn about both. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Remove all \\ (escaped backslashes) from the line. They are OK, and the + # second (escaped) slash may trigger later \" detection erroneously. 
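+  # For instance (illustrative): in
+  #   const char *p = "ends with backslash\\";
+  # the escaped backslashes must go first, otherwise the trailing \\ would
+  # be miscounted as an escaped quote \" by the checks below.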
+ line = line.replace('\\\\', '') + + if line.count('/*') > line.count('*/'): + error(filename, linenum, 'readability/multiline_comment', 5, + 'Complex multi-line /*...*/-style comment found. ' + 'Lint may give bogus warnings. ' + 'Consider replacing these with //-style comments, ' + 'with #if 0...#endif, ' + 'or with more clearly structured multi-line comments.') + + if (line.count('"') - line.count('\\"')) % 2: + error(filename, linenum, 'readability/multiline_string', 5, + 'Multi-line string ("...") found. This lint script doesn\'t ' + 'do well with such strings, and may give bogus warnings. ' + 'Use C++11 raw strings or concatenation instead.') + + +threading_list = ( + ('asctime(', 'asctime_r('), + ('ctime(', 'ctime_r('), + ('getgrgid(', 'getgrgid_r('), + ('getgrnam(', 'getgrnam_r('), + ('getlogin(', 'getlogin_r('), + ('getpwnam(', 'getpwnam_r('), + ('getpwuid(', 'getpwuid_r('), + ('gmtime(', 'gmtime_r('), + ('localtime(', 'localtime_r('), + ('rand(', 'rand_r('), + ('strtok(', 'strtok_r('), + ('ttyname(', 'ttyname_r('), + ) + + +def CheckPosixThreading(filename, clean_lines, linenum, error): + """Checks for calls to thread-unsafe functions. + + Much code has been originally written without consideration of + multi-threading. Also, engineers are relying on their old experience; + they have learned posix before threading extensions were added. These + tests guide the engineers to use thread-safe functions (when using + posix directly). + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + for single_thread_function, multithread_safe_function in threading_list: + ix = line.find(single_thread_function) + # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison + if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and + line[ix - 1] not in ('_', '.', '>'))): + error(filename, linenum, 'runtime/threadsafe_fn', 2, + 'Consider using ' + multithread_safe_function + + '...) instead of ' + single_thread_function + + '...) for improved thread safety.') + + +def CheckVlogArguments(filename, clean_lines, linenum, error): + """Checks that VLOG() is only used for defining a logging level. + + For example, VLOG(2) is correct. VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and + VLOG(FATAL) are not. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line): + error(filename, linenum, 'runtime/vlog', 5, + 'VLOG() should be used with numeric verbosity level. ' + 'Use LOG() if you want symbolic severity levels.') + + +# Matches invalid increment: *count++, which moves pointer instead of +# incrementing a value. +_RE_PATTERN_INVALID_INCREMENT = re.compile( + r'^\s*\*\w+(\+\+|--);') + + +def CheckInvalidIncrement(filename, clean_lines, linenum, error): + """Checks for invalid increment *count++. + + For example following function: + void increment_counter(int* count) { + *count++; + } + is invalid, because it effectively does count++, moving pointer, and should + be replaced with ++*count, (*count)++ or *count += 1. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. 
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+  if _RE_PATTERN_INVALID_INCREMENT.match(line):
+    error(filename, linenum, 'runtime/invalid_increment', 5,
+          'Changing pointer instead of value (or unused value of operator*).')
+
+
+class _BlockInfo(object):
+  """Stores information about a generic block of code."""
+
+  def __init__(self, seen_open_brace):
+    self.seen_open_brace = seen_open_brace
+    self.open_parentheses = 0
+    self.inline_asm = _NO_ASM
+
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    """Run checks that apply to text up to the opening brace.
+
+    This is mostly for checking the text after the class identifier
+    and the "{", usually where the base class is specified. For other
+    blocks, there isn't much to check, so we always pass.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Run checks that apply to text after the closing brace.
+
+    This is mostly used for checking end of namespace comments.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+
+class _ClassInfo(_BlockInfo):
+  """Stores information about a class."""
+
+  def __init__(self, name, class_or_struct, clean_lines, linenum):
+    _BlockInfo.__init__(self, False)
+    self.name = name
+    self.starting_linenum = linenum
+    self.is_derived = False
+    if class_or_struct == 'struct':
+      self.access = 'public'
+      self.is_struct = True
+    else:
+      self.access = 'private'
+      self.is_struct = False
+
+    # Remember initial indentation level for this class. Using raw_lines here
+    # instead of elided to account for leading comments.
+    initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum])
+    if initial_indent:
+      self.class_indent = len(initial_indent.group(1))
+    else:
+      self.class_indent = 0
+
+    # Try to find the end of the class. This will be confused by things like:
+    #   class A {
+    #   } *x = { ...
+    #
+    # But it's still good enough for CheckSectionSpacing.
+    self.last_line = 0
+    depth = 0
+    for i in range(linenum, clean_lines.NumLines()):
+      line = clean_lines.elided[i]
+      depth += line.count('{') - line.count('}')
+      if not depth:
+        self.last_line = i
+        break
+
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    # Look for a bare ':'
+    if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]):
+      self.is_derived = True
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    # Check that closing brace is aligned with beginning of the class.
+    # Only do this if the closing brace is indented by only whitespaces.
+    # This means we will not check single-line class definitions.
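+    # Illustrative case: with class_indent == 2, a closing line '   };'
+    # (three leading spaces) triggers the warning below, while '  };' with
+    # two leading spaces passes.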
+    indent = Match(r'^( *)\}', clean_lines.elided[linenum])
+    if indent and len(indent.group(1)) != self.class_indent:
+      if self.is_struct:
+        parent = 'struct ' + self.name
+      else:
+        parent = 'class ' + self.name
+      error(filename, linenum, 'whitespace/indent', 3,
+            'Closing brace should be aligned with beginning of %s' % parent)
+
+
+class _NamespaceInfo(_BlockInfo):
+  """Stores information about a namespace."""
+
+  def __init__(self, name, linenum):
+    _BlockInfo.__init__(self, False)
+    self.name = name or ''
+    self.starting_linenum = linenum
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Check end of namespace comments."""
+    line = clean_lines.raw_lines[linenum]
+
+    # Check how many lines are enclosed in this namespace. Don't issue
+    # a warning for missing namespace comments if there aren't enough
+    # lines. However, do apply checks if there is already an end of
+    # namespace comment and it's incorrect.
+    #
+    # TODO(unknown): We always want to check end of namespace comments
+    # if a namespace is large, but sometimes we also want to apply the
+    # check if a short namespace contained nontrivial things (something
+    # other than forward declarations). There is currently no logic on
+    # deciding what these nontrivial things are, so this check is
+    # triggered by namespace size only, which works most of the time.
+    if (linenum - self.starting_linenum < 10
+        and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
+      return
+
+    # Look for matching comment at end of namespace.
+    #
+    # Note that we accept C style "/* */" comments for terminating
+    # namespaces, so that code that terminates namespaces inside
+    # preprocessor macros can be cpplint clean.
+    #
+    # We also accept stuff like "// end of namespace <name>." with the
+    # period at the end.
+    #
+    # Besides these, we don't accept anything else, otherwise we might
+    # get false negatives when existing comment is a substring of the
+    # expected namespace.
+    if self.name:
+      # Named namespace
+      if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
+                    r'[\*/\.\\\s]*$'),
+                   line):
+        error(filename, linenum, 'readability/namespace', 5,
+              'Namespace should be terminated with "// namespace %s"' %
+              self.name)
+    else:
+      # Anonymous namespace
+      if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
+        error(filename, linenum, 'readability/namespace', 5,
+              'Namespace should be terminated with "// namespace"')
+
+
+class _PreprocessorInfo(object):
+  """Stores checkpoints of nesting stacks when #if/#else is seen."""
+
+  def __init__(self, stack_before_if):
+    # The entire nesting stack before #if
+    self.stack_before_if = stack_before_if
+
+    # The entire nesting stack up to #else
+    self.stack_before_else = []
+
+    # Whether we have already seen #else or #elif
+    self.seen_else = False
+
+
+class _NestingState(object):
+  """Holds states related to parsing braces."""
+
+  def __init__(self):
+    # Stack for tracking all braces. An object is pushed whenever we
+    # see a "{", and popped when we see a "}". Only 3 types of
+    # objects are possible:
+    # - _ClassInfo: a class or struct.
+    # - _NamespaceInfo: a namespace.
+    # - _BlockInfo: some other type of block.
+    self.stack = []
+
+    # Stack of _PreprocessorInfo objects.
+    self.pp_stack = []
+
+  def SeenOpenBrace(self):
+    """Check if we have seen the opening brace for the innermost block.
+
+    Returns:
+      True if we have seen the opening brace, False if the innermost
+      block is still expecting an opening brace.
+ """ + return (not self.stack) or self.stack[-1].seen_open_brace + + def InNamespaceBody(self): + """Check if we are currently one level inside a namespace body. + + Returns: + True if top of the stack is a namespace block, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _NamespaceInfo) + + def UpdatePreprocessor(self, line): + """Update preprocessor stack. + + We need to handle preprocessors due to classes like this: + #ifdef SWIG + struct ResultDetailsPageElementExtensionPoint { + #else + struct ResultDetailsPageElementExtensionPoint : public Extension { + #endif + + We make the following assumptions (good enough for most files): + - Preprocessor condition evaluates to true from #if up to first + #else/#elif/#endif. + + - Preprocessor condition evaluates to false from #else/#elif up + to #endif. We still perform lint checks on these lines, but + these do not affect nesting stack. + + Args: + line: current line to check. + """ + if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line): + # Beginning of #if block, save the nesting stack here. The saved + # stack will allow us to restore the parsing state in the #else case. + self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack))) + elif Match(r'^\s*#\s*(else|elif)\b', line): + # Beginning of #else block + if self.pp_stack: + if not self.pp_stack[-1].seen_else: + # This is the first #else or #elif block. Remember the + # whole nesting stack up to this point. This is what we + # keep after the #endif. + self.pp_stack[-1].seen_else = True + self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack) + + # Restore the stack to how it was before the #if + self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if) + else: + # TODO(unknown): unexpected #else, issue warning? + pass + elif Match(r'^\s*#\s*endif\b', line): + # End of #if or #else blocks. + if self.pp_stack: + # If we saw an #else, we will need to restore the nesting + # stack to its former state before the #else, otherwise we + # will just continue from where we left off. + if self.pp_stack[-1].seen_else: + # Here we can just use a shallow copy since we are the last + # reference to it. + self.stack = self.pp_stack[-1].stack_before_else + # Drop the corresponding #if + self.pp_stack.pop() + else: + # TODO(unknown): unexpected #endif, issue warning? + pass + + def Update(self, filename, clean_lines, linenum, error): + """Update nesting state with current line. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Update pp_stack first + self.UpdatePreprocessor(line) + + # Count parentheses. This is to avoid adding struct arguments to + # the nesting stack. + if self.stack: + inner_block = self.stack[-1] + depth_change = line.count('(') - line.count(')') + inner_block.open_parentheses += depth_change + + # Also check if we are starting or ending an inline assembly block. + if inner_block.inline_asm in (_NO_ASM, _END_ASM): + if (depth_change != 0 and + inner_block.open_parentheses == 1 and + _MATCH_ASM.match(line)): + # Enter assembly block + inner_block.inline_asm = _INSIDE_ASM + else: + # Not entering assembly block. If previous line was _END_ASM, + # we will now shift to _NO_ASM state. 
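+          # State sketch (illustrative): _NO_ASM -> _INSIDE_ASM while an
+          # asm(...) is left unbalanced, _INSIDE_ASM -> _END_ASM once its
+          # parentheses close, then _END_ASM -> _NO_ASM right here.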
+          inner_block.inline_asm = _NO_ASM
+      elif (inner_block.inline_asm == _INSIDE_ASM and
+            inner_block.open_parentheses == 0):
+        # Exit assembly block
+        inner_block.inline_asm = _END_ASM
+
+    # Consume namespace declaration at the beginning of the line. Do
+    # this in a loop so that we catch same line declarations like this:
+    #   namespace proto2 { namespace bridge { class MessageSet; } }
+    while True:
+      # Match start of namespace. The "\b\s*" below catches namespace
+      # declarations even if it weren't followed by a whitespace, this
+      # is so that we don't confuse our namespace checker. The
+      # missing spaces will be flagged by CheckSpacing.
+      namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
+      if not namespace_decl_match:
+        break
+
+      new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum)
+      self.stack.append(new_namespace)
+
+      line = namespace_decl_match.group(2)
+      if line.find('{') != -1:
+        new_namespace.seen_open_brace = True
+        line = line[line.find('{') + 1:]
+
+    # Look for a class declaration in whatever is left of the line
+    # after parsing namespaces. The regexp accounts for decorated classes
+    # such as in:
+    #   class LOCKABLE API Object {
+    #   };
+    #
+    # Templates with class arguments may confuse the parser, for example:
+    #   template <class T, class Comparator = less<string>,
+    #             class Vector = vector<string> >
+    #   class HeapQueue {
+    #
+    # Because this parser has no nesting state about templates, by the
+    # time it saw "class Comparator", it may think that it's a new class.
+    # Nested templates have a similar problem:
+    #   template <
+    #       typename ExportedType,
+    #       typename TupleType,
+    #       template <typename SingleArgType> class ImplTemplate>
+    #
+    # To avoid these cases, we ignore classes that are followed by '=' or '>'
+    class_decl_match = Match(
+        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
+        r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
+        r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
+    if (class_decl_match and
+        (not self.stack or self.stack[-1].open_parentheses == 0)):
+      self.stack.append(_ClassInfo(
+          class_decl_match.group(4), class_decl_match.group(2),
+          clean_lines, linenum))
+      line = class_decl_match.group(5)
+
+    # If we have not yet seen the opening brace for the innermost block,
+    # run checks here.
+    if not self.SeenOpenBrace():
+      self.stack[-1].CheckBegin(filename, clean_lines, linenum, error)
+
+    # Update access control if we are inside a class/struct
+    if self.stack and isinstance(self.stack[-1], _ClassInfo):
+      classinfo = self.stack[-1]
+      access_match = Match(
+          r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?'
+          r':(?:[^:]|$)',
+          line)
+      if access_match:
+        classinfo.access = access_match.group(2)
+
+        # Check that access keywords are indented +1 space. Skip this
+        # check if the keywords are not preceded by whitespaces.
+        indent = access_match.group(1)
+        if (len(indent) != classinfo.class_indent + 1 and
+            Match(r'^\s*$', indent)):
+          if classinfo.is_struct:
+            parent = 'struct ' + classinfo.name
+          else:
+            parent = 'class ' + classinfo.name
+          slots = ''
+          if access_match.group(3):
+            slots = access_match.group(3)
+          error(filename, linenum, 'whitespace/indent', 3,
+                '%s%s: should be indented +1 space inside %s' % (
+                    access_match.group(2), slots, parent))
+
+    # Consume braces or semicolons from what's left of the line
+    while True:
+      # Match first brace, semicolon, or closed parenthesis.
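+      # Illustrative pass: for an elided line '} }' the loop runs CheckEnd
+      # and pops twice, then stops because no brace, semicolon, or ')'
+      # remains.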
+      matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line)
+      if not matched:
+        break
+
+      token = matched.group(1)
+      if token == '{':
+        # If namespace or class hasn't seen a opening brace yet, mark
+        # namespace/class head as complete. Push a new block onto the
+        # stack otherwise.
+        if not self.SeenOpenBrace():
+          self.stack[-1].seen_open_brace = True
+        else:
+          self.stack.append(_BlockInfo(True))
+          if _MATCH_ASM.match(line):
+            self.stack[-1].inline_asm = _BLOCK_ASM
+      elif token == ';' or token == ')':
+        # If we haven't seen an opening brace yet, but we already saw
+        # a semicolon, this is probably a forward declaration. Pop
+        # the stack for these.
+        #
+        # Similarly, if we haven't seen an opening brace yet, but we
+        # already saw a closing parenthesis, then these are probably
+        # function arguments with extra "class" or "struct" keywords.
+        # Also pop the stack for these.
+        if not self.SeenOpenBrace():
+          self.stack.pop()
+      else:  # token == '}'
+        # Perform end of block checks and pop the stack.
+        if self.stack:
+          self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
+          self.stack.pop()
+      line = matched.group(2)
+
+  def InnermostClass(self):
+    """Get class info on the top of the stack.
+
+    Returns:
+      A _ClassInfo object if we are inside a class, or None otherwise.
+    """
+    for i in range(len(self.stack), 0, -1):
+      classinfo = self.stack[i - 1]
+      if isinstance(classinfo, _ClassInfo):
+        return classinfo
+    return None
+
+  def CheckCompletedBlocks(self, filename, error):
+    """Checks that all classes and namespaces have been completely parsed.
+
+    Call this when all lines in a file have been processed.
+    Args:
+      filename: The name of the current file.
+      error: The function to call with any errors found.
+    """
+    # Note: This test can result in false positives if #ifdef constructs
+    # get in the way of brace matching. See the testBuildClass test in
+    # cpplint_unittest.py for an example of this.
+    for obj in self.stack:
+      if isinstance(obj, _ClassInfo):
+        error(filename, obj.starting_linenum, 'build/class', 5,
+              'Failed to find complete declaration of class %s' %
+              obj.name)
+      elif isinstance(obj, _NamespaceInfo):
+        error(filename, obj.starting_linenum, 'build/namespaces', 5,
+              'Failed to find complete declaration of namespace %s' %
+              obj.name)
+
+
+def CheckForNonStandardConstructs(filename, clean_lines, linenum,
+                                  nesting_state, error):
+  r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2.
+
+  Complain about several constructs which gcc-2 accepts, but which are
+  not standard C++. Warning about these in lint is one way to ease the
+  transition to new compilers.
+  - put storage class first (e.g. "static const" instead of "const static").
+  - "%lld" instead of "%qd" in printf-type functions.
+  - "%1$d" is non-standard in printf-type functions.
+  - "\%" is an undefined character escape sequence.
+  - text after #endif is not allowed.
+  - invalid inner-style forward declaration.
+  - >? and <? operators, and their >?= and <?= cousins.
+
+  Additionally, check for constructor/destructor style violations and
+  reference members, as it is very convenient to do so while checking
+  for gcc-2 compliance.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: A callable to which errors are reported, which takes 4 arguments:
+           filename, line number, error level, and message
+  """
+
+  # Remove comments from the line, but leave in strings for now.
+  line = clean_lines.lines[linenum]
+
+  if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line):
+    error(filename, linenum, 'runtime/printf_format', 3,
+          '%q in format strings is deprecated. Use %ll instead.')
+
+  if Search(r'printf\s*\(.*".*%\d+\$', line):
+    error(filename, linenum, 'runtime/printf_format', 2,
+          '%N$ formats are unconventional. Try rewriting to avoid them.')
+
+  # Remove escaped backslashes before looking for undefined escapes.
+  line = line.replace('\\\\', '')
+
+  if Search(r'("|\').*\\(%|\[|\(|{)', line):
+    error(filename, linenum, 'build/printf_format', 3,
+          '%, [, (, and { are undefined character escapes. Unescape them.')
+
+  # For the rest, work with both comments and strings removed.
+  line = clean_lines.elided[linenum]
+
+  if Search(r'\b(const|volatile|void|char|short|int|long'
+            r'|float|double|signed|unsigned'
+            r'|schar|u?int8|u?int16|u?int32|u?int64)'
+            r'\s+(register|static|extern|typedef)\b',
+            line):
+    error(filename, linenum, 'build/storage_class', 5,
+          'Storage class (static, extern, typedef, etc) should be first.')
+
+  if Match(r'\s*#\s*endif\s*[^/\s]+', line):
+    error(filename, linenum, 'build/endif_comment', 5,
+          'Uncommented text after #endif is non-standard. Use a comment.')
+
+  if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line):
+    error(filename, linenum, 'build/forward_decl', 5,
+          'Inner-style forward declarations are invalid. Remove this line.')
+
+  if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?',
+            line):
+    error(filename, linenum, 'build/deprecated', 3,
+          '>? and <? (max and min) operators are non-standard and deprecated.')
+
+  if Search(r'^\s*const\s*string\s*&\s*\w+\s*;', line):
+    # TODO(unknown): Could it be expanded safely to arbitrary references,
+    # without triggering too many false positives? The first
+    # attempt triggered 5 warnings for mostly benign code in the regtest, hence
+    # the restriction.
+    # Here's the original regexp, for the reference:
+    # type_name = r'\w+((\s*::\s*\w+)|(\s*<\s*\w+?\s*>))?'
+    # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;'
+    error(filename, linenum, 'runtime/member_string_references', 2,
+          'const string& members are dangerous. It is much better to use '
+          'alternatives, such as pointers or simple constants.')
+
+  # Everything else in this function operates on class declarations.
+  # Return early if the top of the nesting stack is not a class, or if
+  # the class head is not completed yet.
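+  # Illustrative C++ lines for the explicit-constructor check below (each
+  # indented inside its class body):
+  #   Foo(int x);           -> warned: should be marked explicit
+  #   explicit Foo(int x);  -> not matched (pattern is anchored at the
+  #                            start of the line)
+  #   Foo(const Foo& rhs);  -> skipped: reference-to-own-class argument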
+ classinfo = nesting_state.InnermostClass() + if not classinfo or not classinfo.seen_open_brace: + return + + # The class may have been declared with namespace or classname qualifiers. + # The constructor and destructor will not have those qualifiers. + base_classname = classinfo.name.split('::')[-1] + + # Look for single-argument constructors that aren't marked explicit. + # Technically a valid construct, but against style. + args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)' + % re.escape(base_classname), + line) + if (args and + args.group(1) != 'void' and + not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&' + % re.escape(base_classname), args.group(1).strip())): + error(filename, linenum, 'runtime/explicit', 5, + 'Single-argument constructors should be marked explicit.') + + +def CheckSpacingForFunctionCall(filename, line, linenum, error): + """Checks for the correctness of various spacing around function calls. + + Args: + filename: The name of the current file. + line: The text of the line to check. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + # Since function calls often occur inside if/for/while/switch + # expressions - which have their own, more liberal conventions - we + # first see if we should be looking inside such an expression for a + # function call, to which we can apply more strict standards. + fncall = line # if there's no control flow construct, look at whole line + for pattern in (r'\bif\s*\((.*)\)\s*{', + r'\bfor\s*\((.*)\)\s*{', + r'\bwhile\s*\((.*)\)\s*[{;]', + r'\bswitch\s*\((.*)\)\s*{'): + match = Search(pattern, line) + if match: + fncall = match.group(1) # look inside the parens for function calls + break + + # Except in if/for/while/switch, there should never be space + # immediately inside parens (eg "f( 3, 4 )"). We make an exception + # for nested parens ( (a+b) + c ). Likewise, there should never be + # a space before a ( when it's a function argument. I assume it's a + # function argument when the char before the whitespace is legal in + # a function name (alnum + _) and we're not starting a macro. Also ignore + # pointers and references to arrays and functions coz they're too tricky: + # we use a very simple way to recognize these: + # " (something)(maybe-something)" or + # " (something)(maybe-something," or + # " (something)[something]" + # Note that we assume the contents of [] to be short enough that + # they'll never need to wrap. + if ( # Ignore control structures. + not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b', + fncall) and + # Ignore pointers/references to functions. + not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and + # Ignore pointers/references to arrays. 
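+      # (e.g. declarations like 'void (*fn)(int)' or 'int (&a)[10]', which
+      # would otherwise trip the space-before-paren checks below)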
+ not Search(r' \([^)]+\)\[[^\]]+\]', fncall)): + if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space after ( in function call') + elif Search(r'\(\s+(?!(\s*\\)|\()', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space after (') + if (Search(r'\w\s+\(', fncall) and + not Search(r'#\s*define|typedef', fncall) and + not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)): + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space before ( in function call') + # If the ) is followed only by a newline or a { + newline, assume it's + # part of a control statement (if/while/etc), and don't complain + if Search(r'[^)]\s+\)\s*[^{\s]', fncall): + # If the closing parenthesis is preceded by only whitespaces, + # try to give a more descriptive error message. + if Search(r'^\s+\)', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Closing ) should be moved to the previous line') + else: + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space before )') + + +def IsBlankLine(line): + """Returns true if the given line is blank. + + We consider a line to be blank if the line is empty or consists of + only white spaces. + + Args: + line: A line of a string. + + Returns: + True, if the given line is blank. + """ + return not line or line.isspace() + + +def CheckForFunctionLengths(filename, clean_lines, linenum, + function_state, error): + """Reports for long function bodies. + + For an overview why this is done, see: + http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions + + Uses a simplistic algorithm assuming other style guidelines + (especially spacing) are followed. + Only checks unindented functions, so class members are unchecked. + Trivial bodies are unchecked, so constructors with huge initializer lists + may be missed. + Blank/comment lines are not counted so as to avoid encouraging the removal + of vertical space and comments just to get through a lint check. + NOLINT *on the last line of a function* disables this check. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + function_state: Current function name and lines in body so far. + error: The function to call with any errors found. + """ + lines = clean_lines.lines + line = lines[linenum] + raw = clean_lines.raw_lines + raw_line = raw[linenum] + joined_line = '' + + starting_func = False + regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ... + match_result = Match(regexp, line) + if match_result: + # If the name is all caps and underscores, figure it's a macro and + # ignore it, unless it's TEST or TEST_F. + function_name = match_result.group(1).split()[-1] + if function_name == 'TEST' or function_name == 'TEST_F' or ( + not Match(r'[A-Z_]+$', function_name)): + starting_func = True + + if starting_func: + body_found = False + for start_linenum in xrange(linenum, clean_lines.NumLines()): + start_line = lines[start_linenum] + joined_line += ' ' + start_line.lstrip() + if Search(r'(;|})', start_line): # Declarations and trivial functions + body_found = True + break # ... ignore + elif Search(r'{', start_line): + body_found = True + function = Search(r'((\w|:)*)\(', line).group(1) + if Match(r'TEST', function): # Handle TEST... 
macros + parameter_regexp = Search(r'(\(.*\))', joined_line) + if parameter_regexp: # Ignore bad syntax + function += parameter_regexp.group(1) + else: + function += '()' + function_state.Begin(function) + break + if not body_found: + # No body for the function (or evidence of a non-function) was found. + error(filename, linenum, 'readability/fn_size', 5, + 'Lint failed to find start of function body.') + elif Match(r'^\}\s*$', line): # function end + function_state.Check(error, filename, linenum) + function_state.End() + elif not Match(r'^\s*$', line): + function_state.Count() # Count non-blank/non-comment lines. + + +_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') + + +def CheckComment(comment, filename, linenum, error): + """Checks for common mistakes in TODO comments. + + Args: + comment: The text of the comment from the line in question. + filename: The name of the current file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + match = _RE_PATTERN_TODO.match(comment) + if match: + # One whitespace is correct; zero whitespace is handled elsewhere. + leading_whitespace = match.group(1) + if len(leading_whitespace) > 1: + error(filename, linenum, 'whitespace/todo', 2, + 'Too many spaces before TODO') + + username = match.group(2) + if not username: + error(filename, linenum, 'readability/todo', 2, + 'Missing username in TODO; it should look like ' + '"// TODO(my_username): Stuff."') + + middle_whitespace = match.group(3) + # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison + if middle_whitespace != ' ' and middle_whitespace != '': + error(filename, linenum, 'whitespace/todo', 2, + 'TODO(my_username) should be followed by a space') + +def CheckAccess(filename, clean_lines, linenum, nesting_state, error): + """Checks for improper use of DISALLOW* macros. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] # get rid of comments and strings + + matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' + r'DISALLOW_EVIL_CONSTRUCTORS|' + r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) + if not matched: + return + if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): + if nesting_state.stack[-1].access != 'private': + error(filename, linenum, 'readability/constructors', 3, + '%s must be in the private: section' % matched.group(1)) + + else: + # Found DISALLOW* macro outside a class declaration, or perhaps it + # was used inside a function when it should have been part of the + # class declaration. We could issue a warning here, but it + # probably resulted in a compiler error already. + pass + + +def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix): + """Find the corresponding > to close a template. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: Current line number. + init_suffix: Remainder of the current line after the initial <. + + Returns: + True if a matching bracket exists. + """ + line = init_suffix + nesting_stack = ['<'] + while True: + # Find the next operator that can tell us whether < is used as an + # opening bracket or as a less-than operator. We only want to + # warn on the latter case. 
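+    #
+    # Illustrative inputs: in 'set<int, less<int> > s;' the ',' seen after
+    # the initial '<' makes us assume a template argument list and return
+    # early; in 'if (a < b) return;' the ')' seen next tells us the '<'
+    # was a less-than operator.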
+    #
+    # We could also check all other operators and terminate the search
+    # early, e.g. if we got something like this "a<b+c", the "<" is
+    # most likely a less-than operator and not an opening bracket.
+
+    # Find the next operator
+    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(1)
+      line = match.group(2)
+
+      if nesting_stack[-1] == '<':
+        # Expecting closing angle bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator == '>':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma after a bracket, this is most likely a template
+          # argument. We have not seen a closing angle bracket yet, but
+          # it's probably a few lines later if we look for it, so just
+          # return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting closing parenthesis or closing bracket
+        if operator in ('<', '(', '['):
+          nesting_stack.append(operator)
+        elif operator in (')', ']'):
+          # We don't bother checking for matching () or []. If we got
+          # something like (] or [), it would have been a syntax error.
+          nesting_stack.pop()
+
+    else:
+      # Scan the next line
+      linenum += 1
+      if linenum >= len(clean_lines.elided):
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all remaining lines and still no matching angle bracket.
+  # Most likely the input was incomplete, otherwise we should have
+  # seen a semicolon and returned early.
+  return True
+
+
+def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
+  """Find the corresponding < that started a template.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: Current line number.
+    init_prefix: Part of the current line before the initial >.
+
+  Returns:
+    True if a matching bracket exists.
+  """
+  line = init_prefix
+  nesting_stack = ['>']
+  while True:
+    # Find the previous operator
+    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
+    if match:
+      # Found an operator, update nesting stack
+      operator = match.group(2)
+      line = match.group(1)
+
+      if nesting_stack[-1] == '>':
+        # Expecting opening angle bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator == '<':
+          nesting_stack.pop()
+          if not nesting_stack:
+            # Found matching angle bracket
+            return True
+        elif operator == ',':
+          # Got a comma before a bracket, this is most likely a
+          # template argument. The opening angle bracket is probably
+          # there if we look for it, so just return early here.
+          return True
+        else:
+          # Got some other operator.
+          return False
+
+      else:
+        # Expecting opening parenthesis or opening bracket
+        if operator in ('>', ')', ']'):
+          nesting_stack.append(operator)
+        elif operator in ('(', '['):
+          nesting_stack.pop()
+
+    else:
+      # Scan the previous line
+      linenum -= 1
+      if linenum < 0:
+        break
+      line = clean_lines.elided[linenum]
+
+  # Exhausted all earlier lines and still no matching angle bracket.
+  return False
+
+
+def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for the correctness of various spacing issues in the code.
+
+  Things we check for: spaces around operators, spaces after
+  if/for/while/switch, no spaces around parens in function calls, two
+  spaces between code and comment, don't start a block with a blank
+  line, don't end a function with a blank line, don't add a blank line
+  after public/protected/private, don't have too many blank lines in a row.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+
+  # Don't use "elided" lines here, otherwise we can't check commented lines.
+  # Don't want to use "raw" either, because we don't want to check inside C++11
+  # raw strings.
+  raw = clean_lines.lines_without_raw_strings
+  line = raw[linenum]
+
+  # Before nixing comments, check if the line is blank for no good
+  # reason.  This includes the first line after a block is opened, and
+  # blank lines at the end of a function (i.e., right before a line like '}').
+  #
+  # Skip all the blank line checks if we are immediately inside a
+  # namespace body.  In other words, don't issue blank line warnings
+  # for this block:
+  #   namespace {
+  #
+  #   }
+  #
+  # A warning about missing end of namespace comments will be issued instead.
+  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
+    elided = clean_lines.elided
+    prev_line = elided[linenum - 1]
+    prevbrace = prev_line.rfind('{')
+    # TODO(unknown): Don't complain if line before blank line, and line after,
+    #                both start with alnums and are indented the same amount.
+    #                This ignores whitespace at the start of a namespace block
+    #                because those are not usually indented.
+    if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1:
+      # OK, we have a blank line at the start of a code block.  Before we
+      # complain, we check if it is an exception to the rule: The previous
+      # non-empty line has the parameters of a function header that are
+      # indented 4 spaces (because they did not fit in an 80-column line
+      # when placed on the same line as the function name).  We also check
+      # for the case where the previous line is indented 6 spaces, which
+      # may happen when the initializers of a constructor do not fit into
+      # an 80-column line.
+      exception = False
+      if Match(r' {6}\w', prev_line):  # Initializer list?
+        # We are looking for the opening column of initializer list, which
+        # should be indented 4 spaces to cause 6 space indentation afterwards.
+        search_position = linenum-2
+        while (search_position >= 0
+               and Match(r' {6}\w', elided[search_position])):
+          search_position -= 1
+        exception = (search_position >= 0
+                     and elided[search_position][:5] == '    :')
+      else:
+        # Search for the function arguments or an initializer list.  We use a
+        # simple heuristic here: If the line is indented 4 spaces; and we have
+        # a closing paren, without the opening paren, followed by an opening
+        # brace or colon (for initializer lists) we assume that it is the last
+        # line of a function header.  If we have a colon indented 4 spaces, it
+        # is an initializer list.
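+        # For example (hypothetical input), the blank line after this
+        # header is exempt, because the parameter line matches the
+        # 4-space-indent heuristic below:
+        #   void MyFunction(
+        #       int some_long_parameter_name) {
+        #
+        #     ...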
+ exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', + prev_line) + or Match(r' {4}:', prev_line)) + + if not exception: + error(filename, linenum, 'whitespace/blank_line', 2, + 'Redundant blank line at the start of a code block ' + 'should be deleted.') + # Ignore blank lines at the end of a block in a long if-else + # chain, like this: + # if (condition1) { + # // Something followed by a blank line + # + # } else if (condition2) { + # // Something else + # } + if linenum + 1 < clean_lines.NumLines(): + next_line = raw[linenum + 1] + if (next_line + and Match(r'\s*}', next_line) + and next_line.find('} else ') == -1): + error(filename, linenum, 'whitespace/blank_line', 3, + 'Redundant blank line at the end of a code block ' + 'should be deleted.') + + matched = Match(r'\s*(public|protected|private):', prev_line) + if matched: + error(filename, linenum, 'whitespace/blank_line', 3, + 'Do not leave a blank line after "%s:"' % matched.group(1)) + + # Next, we complain if there's a comment too near the text + commentpos = line.find('//') + if commentpos != -1: + # Check if the // may be in quotes. If so, ignore it + # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison + if (line.count('"', 0, commentpos) - + line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes + # Allow one space for new scopes, two spaces otherwise: + if (not Match(r'^\s*{ //', line) and + ((commentpos >= 1 and + line[commentpos-1] not in string.whitespace) or + (commentpos >= 2 and + line[commentpos-2] not in string.whitespace))): + error(filename, linenum, 'whitespace/comments', 2, + 'At least two spaces is best between code and comments') + # There should always be a space between the // and the comment + commentend = commentpos + 2 + if commentend < len(line) and not line[commentend] == ' ': + # but some lines are exceptions -- e.g. if they're big + # comment delimiters like: + # //---------------------------------------------------------- + # or are an empty C++ style Doxygen comment, like: + # /// + # or C++ style Doxygen comments placed after the variable: + # ///< Header comment + # //!< Header comment + # or they begin with multiple slashes followed by a space: + # //////// Header comment + match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or + Search(r'^/$', line[commentend:]) or + Search(r'^!< ', line[commentend:]) or + Search(r'^/< ', line[commentend:]) or + Search(r'^/+ ', line[commentend:])) + if not match: + error(filename, linenum, 'whitespace/comments', 4, + 'Should have a space between // and comment') + CheckComment(line[commentpos:], filename, linenum, error) + + line = clean_lines.elided[linenum] # get rid of comments and strings + + # Don't try to do spacing checks for operator methods + line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line) + + # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". + # Otherwise not. Note we only check for non-spaces on *both* sides; + # sometimes people put non-spaces on one side when aligning ='s among + # many lines (not that this is behavior that I approve of...) + if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line): + error(filename, linenum, 'whitespace/operators', 4, + 'Missing spaces around =') + + # It's ok not to have spaces around binary operators like + - * /, but if + # there's too little whitespace, we get concerned. It's hard to tell, + # though, so we punt on this one for now. TODO. + + # You should always have whitespace around binary operators. 
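+  # For example (hypothetical inputs), "if (a==b)" and "x<=y" are flagged
+  # by the check below, while "a == b" passes.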
+  #
+  # Check <= and >= first to avoid false positives with < and >, then
+  # check non-include lines for spacing around < and >.
+  match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around %s' % match.group(1))
+  # We allow no-spaces around << when used like this: 10<<20, but
+  # not otherwise (particularly, not when used as streams)
+  # Also ignore using ns::operator<<;
+  match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
+  if (match and
+      not (match.group(1).isdigit() and match.group(2).isdigit()) and
+      not (match.group(1) == 'operator' and match.group(2) == ';')):
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around <<')
+  elif not Match(r'#.*include', line):
+    # Avoid false positives on ->
+    reduced_line = line.replace('->', '')
+
+    # Look for < that is not surrounded by spaces.  This is only
+    # triggered if both sides are missing spaces, even though
+    # technically we should flag if at least one side is missing a
+    # space.  This is done to avoid some false positives with shifts.
+    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
+    if (match and
+        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around <')
+
+    # Look for > that is not surrounded by spaces.  Similar to the
+    # above, we only trigger if both sides are missing spaces to avoid
+    # false positives with shifts.
+    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
+    if (match and
+        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
+                                             match.group(1))):
+      error(filename, linenum, 'whitespace/operators', 3,
+            'Missing spaces around >')
+
+  # We allow no-spaces around >> for almost anything.  This is because
+  # C++11 allows ">>" to close nested templates, which accounts for
+  # most cases when ">>" is not followed by a space.
+  #
+  # We still warn on ">>" followed by alpha character, because that is
+  # likely due to ">>" being used for right shifts, e.g.:
+  #   value >> alpha
+  #
+  # When ">>" is used to close templates, the alphanumeric letter that
+  # follows would be part of an identifier, and there should still be
+  # a space separating the template type and the identifier.
+  #   type<type<type>> alpha
+  match = Search(r'>>[a-zA-Z_]', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around >>')
+
+  # There shouldn't be space around unary operators
+  match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line)
+  if match:
+    error(filename, linenum, 'whitespace/operators', 4,
+          'Extra space for operator %s' % match.group(1))
+
+  # A pet peeve of mine: no spaces after an if, while, switch, or for
+  match = Search(r' (if\(|for\(|while\(|switch\()', line)
+  if match:
+    error(filename, linenum, 'whitespace/parens', 5,
+          'Missing space before ( in %s' % match.group(1))
+
+  # For if/for/while/switch, the left and right parens should be
+  # consistent about how many spaces are inside the parens, and
+  # there should either be zero or one spaces inside the parens.
+  # We don't want: "if ( foo)" or "if ( foo )".
+  # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed.
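+  # For example (hypothetical inputs), "if ( foo) {" draws the mismatching
+  # warning below and "if (  foo  ) {" the zero-or-one-space warning, while
+  # "if (foo) {" passes.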
+ match = Search(r'\b(if|for|while|switch)\s*' + r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', + line) + if match: + if len(match.group(2)) != len(match.group(4)): + if not (match.group(3) == ';' and + len(match.group(2)) == 1 + len(match.group(4)) or + not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)): + error(filename, linenum, 'whitespace/parens', 5, + 'Mismatching spaces inside () in %s' % match.group(1)) + if len(match.group(2)) not in [0, 1]: + error(filename, linenum, 'whitespace/parens', 5, + 'Should have zero or one spaces inside ( and ) in %s' % + match.group(1)) + + # You should always have a space after a comma (either as fn arg or operator) + # + # This does not apply when the non-space character following the + # comma is another comma, since the only time when that happens is + # for empty macro arguments. + # + # We run this check in two passes: first pass on elided lines to + # verify that lines contain missing whitespaces, second pass on raw + # lines to confirm that those missing whitespaces are not due to + # elided comments. + if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]): + error(filename, linenum, 'whitespace/comma', 3, + 'Missing space after ,') + + # You should always have a space after a semicolon + # except for few corner cases + # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more + # space after ; + if Search(r';[^\s};\\)/]', line): + error(filename, linenum, 'whitespace/semicolon', 3, + 'Missing space after ;') + + # Next we will look for issues with function calls. + CheckSpacingForFunctionCall(filename, line, linenum, error) + + # Except after an opening paren, or after another opening brace (in case of + # an initializer list, for instance), you should have spaces before your + # braces. And since you should never have braces at the beginning of a line, + # this is an easy test. + match = Match(r'^(.*[^ ({]){', line) + if match: + # Try a bit harder to check for brace initialization. This + # happens in one of the following forms: + # Constructor() : initializer_list_{} { ... } + # Constructor{}.MemberFunction() + # Type variable{}; + # FunctionCall(type{}, ...); + # LastArgument(..., type{}); + # LOG(INFO) << type{} << " ..."; + # map_of_type[{...}] = ...; + # + # We check for the character following the closing brace, and + # silence the warning if it's one of those listed above, i.e. + # "{.;,)<]". + # + # To account for nested initializer list, we allow any number of + # closing braces up to "{;,)<". We can't simply silence the + # warning on first sight of closing brace, because that would + # cause false negatives for things that are not initializer lists. + # Silence this: But not this: + # Outer{ if (...) { + # Inner{...} if (...){ // Missing space before { + # }; } + # + # There is a false negative with this approach if people inserted + # spurious semicolons, e.g. "if (cond){};", but we will catch the + # spurious semicolon with a separate check. + (endline, endlinenum, endpos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + trailing_text = '' + if endpos > -1: + trailing_text = endline[endpos:] + for offset in xrange(endlinenum + 1, + min(endlinenum + 3, clean_lines.NumLines() - 1)): + trailing_text += clean_lines.elided[offset] + if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before {') + + # Make sure '} else {' has spaces. 
+ if Search(r'}else', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before else') + + # You shouldn't have spaces before your brackets, except maybe after + # 'delete []' or 'new char * []'. + if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Extra space before [') + + # You shouldn't have a space before a semicolon at the end of the line. + # There's a special case for "for" since the style guide allows space before + # the semicolon there. + if Search(r':\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Semicolon defining empty statement. Use {} instead.') + elif Search(r'^\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Line contains only semicolon. If this should be an empty statement, ' + 'use {} instead.') + elif (Search(r'\s+;\s*$', line) and + not Search(r'\bfor\b', line)): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Extra space before last semicolon. If this should be an empty ' + 'statement, use {} instead.') + + # In range-based for, we wanted spaces before and after the colon, but + # not around "::" tokens that might appear. + if (Search('for *\(.*[^:]:[^: ]', line) or + Search('for *\(.*[^: ]:[^:]', line)): + error(filename, linenum, 'whitespace/forcolon', 2, + 'Missing space around colon in range-based for loop') + + +def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): + """Checks for additional blank line issues related to sections. + + Currently the only thing checked here is blank line before protected/private. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + class_info: A _ClassInfo objects. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Skip checks if the class is small, where small means 25 lines or less. + # 25 lines seems like a good cutoff since that's the usual height of + # terminals, and any class that can't fit in one screen can't really + # be considered "small". + # + # Also skip checks if we are on the first line. This accounts for + # classes that look like + # class Foo { public: ... }; + # + # If we didn't find the end of the class, last_line would be zero, + # and the check will be skipped by the first condition. + if (class_info.last_line - class_info.starting_linenum <= 24 or + linenum <= class_info.starting_linenum): + return + + matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum]) + if matched: + # Issue warning if the line before public/protected/private was + # not a blank line, but don't do this if the previous line contains + # "class" or "struct". This can happen two ways: + # - We are at the beginning of the class. + # - We are forward-declaring an inner class that is semantically + # private, but needed to be public for implementation reasons. + # Also ignores cases where the previous line ends with a backslash as can be + # common when defining classes in C macros. + prev_line = clean_lines.lines[linenum - 1] + if (not IsBlankLine(prev_line) and + not Search(r'\b(class|struct)\b', prev_line) and + not Search(r'\\$', prev_line)): + # Try a bit harder to find the beginning of the class. 
This is to + # account for multi-line base-specifier lists, e.g.: + # class Derived + # : public Base { + end_class_head = class_info.starting_linenum + for i in range(class_info.starting_linenum, linenum): + if Search(r'\{\s*$', clean_lines.lines[i]): + end_class_head = i + break + if end_class_head < linenum - 1: + error(filename, linenum, 'whitespace/blank_line', 3, + '"%s:" should be preceded by a blank line' % matched.group(1)) + + +def GetPreviousNonBlankLine(clean_lines, linenum): + """Return the most recent non-blank line and its line number. + + Args: + clean_lines: A CleansedLines instance containing the file contents. + linenum: The number of the line to check. + + Returns: + A tuple with two elements. The first element is the contents of the last + non-blank line before the current line, or the empty string if this is the + first non-blank line. The second is the line number of that line, or -1 + if this is the first non-blank line. + """ + + prevlinenum = linenum - 1 + while prevlinenum >= 0: + prevline = clean_lines.elided[prevlinenum] + if not IsBlankLine(prevline): # if not a blank line... + return (prevline, prevlinenum) + prevlinenum -= 1 + return ('', -1) + + +def CheckBraces(filename, clean_lines, linenum, error): + """Looks for misplaced braces (e.g. at the end of line). + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + line = clean_lines.elided[linenum] # get rid of comments and strings + + if Match(r'\s*{\s*$', line): + # We allow an open brace to start a line in the case where someone is using + # braces in a block to explicitly create a new scope, which is commonly used + # to control the lifetime of stack-allocated variables. Braces are also + # used for brace initializers inside function calls. We don't detect this + # perfectly: we just don't complain if the last non-whitespace character on + # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the + # previous line starts a preprocessor block. + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if (not Search(r'[,;:}{(]\s*$', prevline) and + not Match(r'\s*#', prevline)): + error(filename, linenum, 'whitespace/braces', 4, + '{ should almost always be at the end of the previous line') + + # An else clause should be on the same line as the preceding closing brace. + if Match(r'\s*else\s*', line): + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if Match(r'\s*}\s*$', prevline): + error(filename, linenum, 'whitespace/newline', 4, + 'An else should appear on the same line as the preceding }') + + # If braces come on one side of an else, they should be on both. + # However, we have to worry about "else if" that spans multiple lines! 
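+  # For example (hypothetical input), no warning should be issued for
+  #   } else if (condition_one &&
+  #              condition_two) {
+  # because the brace does appear once the condition closes.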
+ if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): + if Search(r'}\s*else if([^{]*)$', line): # could be multi-line if + # find the ( after the if + pos = line.find('else if') + pos = line.find('(', pos) + if pos > 0: + (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) + if endline[endpos:].find('{') == -1: # must be brace after if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + else: # common case: else not followed by a multi-line if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + + # Likewise, an else should never have the else clause on the same line + if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): + error(filename, linenum, 'whitespace/newline', 4, + 'Else clause should never be on same line as else (use 2 lines)') + + # In the same way, a do/while should never be on one line + if Match(r'\s*do [^\s{]', line): + error(filename, linenum, 'whitespace/newline', 4, + 'do/while clauses should not be on a single line') + + # Block bodies should not be followed by a semicolon. Due to C++11 + # brace initialization, there are more places where semicolons are + # required than not, so we use a whitelist approach to check these + # rather than a blacklist. These are the places where "};" should + # be replaced by just "}": + # 1. Some flavor of block following closing parenthesis: + # for (;;) {}; + # while (...) {}; + # switch (...) {}; + # Function(...) {}; + # if (...) {}; + # if (...) else if (...) {}; + # + # 2. else block: + # if (...) else {}; + # + # 3. const member function: + # Function(...) const {}; + # + # 4. Block following some statement: + # x = 42; + # {}; + # + # 5. Block at the beginning of a function: + # Function(...) { + # {}; + # } + # + # Note that naively checking for the preceding "{" will also match + # braces inside multi-dimensional arrays, but this is fine since + # that expression will not contain semicolons. + # + # 6. Block following another block: + # while (true) {} + # {}; + # + # 7. End of namespaces: + # namespace {}; + # + # These semicolons seems far more common than other kinds of + # redundant semicolons, possibly due to people converting classes + # to namespaces. For now we do not warn for this case. + # + # Try matching case 1 first. + match = Match(r'^(.*\)\s*)\{', line) + if match: + # Matched closing parenthesis (case 1). Check the token before the + # matching opening parenthesis, and don't warn if it looks like a + # macro. This avoids these false positives: + # - macro that defines a base class + # - multi-line macro that defines a base class + # - macro that defines the whole class-head + # + # But we still issue warnings for macros that we know are safe to + # warn, specifically: + # - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P + # - TYPED_TEST + # - INTERFACE_DEF + # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: + # + # We implement a whitelist of safe macros instead of a blacklist of + # unsafe macros, even though the latter appears less frequently in + # google code and would have been easier to implement. This is because + # the downside for getting the whitelist wrong means some extra + # semicolons, while the downside for getting the blacklist wrong + # would result in compile errors. + # + # In addition to macros, we also don't want to warn on compound + # literals. 
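+    # For example (hypothetical input), the trailing "};" in
+    #   struct Foo foo = (struct Foo) { 1, 2 };
+    # belongs to a compound literal; the "=" before the parenthesized
+    # type suppresses the warning below.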
+ closing_brace_pos = match.group(1).rfind(')') + opening_parenthesis = ReverseCloseExpression( + clean_lines, linenum, closing_brace_pos) + if opening_parenthesis[2] > -1: + line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] + macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) + if ((macro and + macro.group(1) not in ( + 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', + 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', + 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or + Search(r'\s+=\s*$', line_prefix)): + match = None + + else: + # Try matching cases 2-3. + match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line) + if not match: + # Try matching cases 4-6. These are always matched on separate lines. + # + # Note that we can't simply concatenate the previous line to the + # current line and do a single match, otherwise we may output + # duplicate warnings for the blank line case: + # if (cond) { + # // blank line + # } + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if prevline and Search(r'[;{}]\s*$', prevline): + match = Match(r'^(\s*)\{', line) + + # Check matching closing brace + if match: + (endline, endlinenum, endpos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + if endpos > -1 and Match(r'^\s*;', endline[endpos:]): + # Current {} pair is eligible for semicolon check, and we have found + # the redundant semicolon, output warning here. + # + # Note: because we are scanning forward for opening braces, and + # outputting warnings for the matching closing brace, if there are + # nested blocks with trailing semicolons, we will get the error + # messages in reversed order. + error(filename, endlinenum, 'readability/braces', 4, + "You don't need a ; after a }") + + +def CheckEmptyBlockBody(filename, clean_lines, linenum, error): + """Look for empty loop/conditional body with only a single semicolon. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + # Search for loop keywords at the beginning of the line. Because only + # whitespaces are allowed before the keywords, this will also ignore most + # do-while-loops, since those lines should start with closing brace. + # + # We also check "if" blocks here, since an empty conditional block + # is likely an error. + line = clean_lines.elided[linenum] + matched = Match(r'\s*(for|while|if)\s*\(', line) + if matched: + # Find the end of the conditional expression + (end_line, end_linenum, end_pos) = CloseExpression( + clean_lines, linenum, line.find('(')) + + # Output warning if what follows the condition expression is a semicolon. + # No warning for all other cases, including whitespace or newline, since we + # have a separate check for semicolons preceded by whitespace. + if end_pos >= 0 and Match(r';', end_line[end_pos:]): + if matched.group(1) == 'if': + error(filename, end_linenum, 'whitespace/empty_conditional_body', 5, + 'Empty conditional bodies should use {}') + else: + error(filename, end_linenum, 'whitespace/empty_loop_body', 5, + 'Empty loop bodies should use {} or continue') + + +def CheckCheck(filename, clean_lines, linenum, error): + """Checks the use of CHECK and EXPECT macros. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. 
+ """ + + # Decide the set of replacement macros that should be suggested + lines = clean_lines.elided + check_macro = None + start_pos = -1 + for macro in _CHECK_MACROS: + i = lines[linenum].find(macro) + if i >= 0: + check_macro = macro + + # Find opening parenthesis. Do a regular expression match here + # to make sure that we are matching the expected CHECK macro, as + # opposed to some other macro that happens to contain the CHECK + # substring. + matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum]) + if not matched: + continue + start_pos = len(matched.group(1)) + break + if not check_macro or start_pos < 0: + # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT' + return + + # Find end of the boolean expression by matching parentheses + (last_line, end_line, end_pos) = CloseExpression( + clean_lines, linenum, start_pos) + if end_pos < 0: + return + if linenum == end_line: + expression = lines[linenum][start_pos + 1:end_pos - 1] + else: + expression = lines[linenum][start_pos + 1:] + for i in xrange(linenum + 1, end_line): + expression += lines[i] + expression += last_line[0:end_pos - 1] + + # Parse expression so that we can take parentheses into account. + # This avoids false positives for inputs like "CHECK((a < 4) == b)", + # which is not replaceable by CHECK_LE. + lhs = '' + rhs = '' + operator = None + while expression: + matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||' + r'==|!=|>=|>|<=|<|\()(.*)$', expression) + if matched: + token = matched.group(1) + if token == '(': + # Parenthesized operand + expression = matched.group(2) + (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')') + if end < 0: + return # Unmatched parenthesis + lhs += '(' + expression[0:end] + expression = expression[end:] + elif token in ('&&', '||'): + # Logical and/or operators. This means the expression + # contains more than one term, for example: + # CHECK(42 < a && a < b); + # + # These are not replaceable with CHECK_LE, so bail out early. + return + elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'): + # Non-relational operator + lhs += token + expression = matched.group(2) + else: + # Relational operator + operator = token + rhs = matched.group(2) + break + else: + # Unparenthesized operand. Instead of appending to lhs one character + # at a time, we do another regular expression match to consume several + # characters at once if possible. Trivial benchmark shows that this + # is more efficient when the operands are longer than a single + # character, which is generally the case. + matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression) + if not matched: + matched = Match(r'^(\s*\S)(.*)$', expression) + if not matched: + break + lhs += matched.group(1) + expression = matched.group(2) + + # Only apply checks if we got all parts of the boolean expression + if not (lhs and operator and rhs): + return + + # Check that rhs do not contain logical operators. We already know + # that lhs is fine since the loop above parses out && and ||. + if rhs.find('&&') > -1 or rhs.find('||') > -1: + return + + # At least one of the operands must be a constant literal. This is + # to avoid suggesting replacements for unprintable things like + # CHECK(variable != iterator) + # + # The following pattern matches decimal, hex integers, strings, and + # characters (in that order). 
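+  # For example (hypothetical inputs), CHECK(x == 42) earns a CHECK_EQ
+  # suggestion because "42" matches the constant pattern below, while
+  # CHECK(a == b) is left alone since neither operand is a literal.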
+ lhs = lhs.strip() + rhs = rhs.strip() + match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$' + if Match(match_constant, lhs) or Match(match_constant, rhs): + # Note: since we know both lhs and rhs, we can provide a more + # descriptive error message like: + # Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42) + # Instead of: + # Consider using CHECK_EQ instead of CHECK(a == b) + # + # We are still keeping the less descriptive message because if lhs + # or rhs gets long, the error message might become unreadable. + error(filename, linenum, 'readability/check', 2, + 'Consider using %s instead of %s(a %s b)' % ( + _CHECK_REPLACEMENT[check_macro][operator], + check_macro, operator)) + + +def CheckAltTokens(filename, clean_lines, linenum, error): + """Check alternative keywords being used in boolean expressions. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Avoid preprocessor lines + if Match(r'^\s*#', line): + return + + # Last ditch effort to avoid multi-line comments. This will not help + # if the comment started before the current line or ended after the + # current line, but it catches most of the false positives. At least, + # it provides a way to workaround this warning for people who use + # multi-line comments in preprocessor macros. + # + # TODO(unknown): remove this once cpplint has better support for + # multi-line comments. + if line.find('/*') >= 0 or line.find('*/') >= 0: + return + + for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line): + error(filename, linenum, 'readability/alt_tokens', 2, + 'Use operator %s instead of %s' % ( + _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1))) + + +def GetLineWidth(line): + """Determines the width of the line in column positions. + + Args: + line: A string, which may be a Unicode string. + + Returns: + The width of the line in column positions, accounting for Unicode + combining characters and wide characters. + """ + if isinstance(line, unicode): + width = 0 + for uc in unicodedata.normalize('NFC', line): + if unicodedata.east_asian_width(uc) in ('W', 'F'): + width += 2 + elif not unicodedata.combining(uc): + width += 1 + return width + else: + return len(line) + + +def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, + error): + """Checks rules from the 'C++ style rules' section of cppguide.html. + + Most of these rules are hard to test (naming, comment style), but we + do what we can. In particular we check for 2-space indents, line lengths, + tab usage, spaces inside code, etc. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + file_extension: The extension (without the dot) of the filename. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + + # Don't use "elided" lines here, otherwise we can't check commented lines. 
+ # Don't want to use "raw" either, because we don't want to check inside C++11 + # raw strings, + raw_lines = clean_lines.lines_without_raw_strings + line = raw_lines[linenum] + + if line.find('\t') != -1: + error(filename, linenum, 'whitespace/tab', 1, + 'Tab found; better to use spaces') + + # One or three blank spaces at the beginning of the line is weird; it's + # hard to reconcile that with 2-space indents. + # NOTE: here are the conditions rob pike used for his tests. Mine aren't + # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces + # if(RLENGTH > 20) complain = 0; + # if(match($0, " +(error|private|public|protected):")) complain = 0; + # if(match(prev, "&& *$")) complain = 0; + # if(match(prev, "\\|\\| *$")) complain = 0; + # if(match(prev, "[\",=><] *$")) complain = 0; + # if(match($0, " <<")) complain = 0; + # if(match(prev, " +for \\(")) complain = 0; + # if(prevodd && match(prevprev, " +for \\(")) complain = 0; + initial_spaces = 0 + cleansed_line = clean_lines.elided[linenum] + while initial_spaces < len(line) and line[initial_spaces] == ' ': + initial_spaces += 1 + if line and line[-1].isspace(): + error(filename, linenum, 'whitespace/end_of_line', 4, + 'Line ends in whitespace. Consider deleting these extra spaces.') + # There are certain situations we allow one space, notably for section labels + elif ((initial_spaces == 1 or initial_spaces == 3) and + not Match(r'\s*\w+\s*:\s*$', cleansed_line)): + error(filename, linenum, 'whitespace/indent', 3, + 'Weird number of spaces at line-start. ' + 'Are you using a 2-space indent?') + + # Check if the line is a header guard. + is_header_guard = False + if file_extension == 'h': + cppvar = GetHeaderGuardCPPVariable(filename) + if (line.startswith('#ifndef %s' % cppvar) or + line.startswith('#define %s' % cppvar) or + line.startswith('#endif // %s' % cppvar)): + is_header_guard = True + # #include lines and header guards can be long, since there's no clean way to + # split them. + # + # URLs can be long too. It's possible to split these, but it makes them + # harder to cut&paste. + # + # The "$Id:...$" comment may also get very long without it being the + # developers fault. + if (not line.startswith('#include') and not is_header_guard and + not Match(r'^\s*//.*http(s?)://\S*$', line) and + not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): + line_width = GetLineWidth(line) + extended_length = int((_line_length * 1.25)) + if line_width > extended_length: + error(filename, linenum, 'whitespace/line_length', 4, + 'Lines should very rarely be longer than %i characters' % + extended_length) + elif line_width > _line_length: + error(filename, linenum, 'whitespace/line_length', 2, + 'Lines should be <= %i characters long' % _line_length) + + if (cleansed_line.count(';') > 1 and + # for loops are allowed two ;'s (and may run over two lines). 
+ cleansed_line.find('for') == -1 and + (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or + GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and + # It's ok to have many commands in a switch case that fits in 1 line + not ((cleansed_line.find('case ') != -1 or + cleansed_line.find('default:') != -1) and + cleansed_line.find('break;') != -1)): + error(filename, linenum, 'whitespace/newline', 0, + 'More than one command on the same line') + + # Some more style checks + CheckBraces(filename, clean_lines, linenum, error) + CheckEmptyBlockBody(filename, clean_lines, linenum, error) + CheckAccess(filename, clean_lines, linenum, nesting_state, error) + CheckSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckCheck(filename, clean_lines, linenum, error) + CheckAltTokens(filename, clean_lines, linenum, error) + classinfo = nesting_state.InnermostClass() + if classinfo: + CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) + + +_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"') +_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') +# Matches the first component of a filename delimited by -s and _s. That is: +# _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo' +_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+') + + +def _DropCommonSuffixes(filename): + """Drops common suffixes like _test.cc or -inl.h from filename. + + For example: + >>> _DropCommonSuffixes('foo/foo-inl.h') + 'foo/foo' + >>> _DropCommonSuffixes('foo/bar/foo.cc') + 'foo/bar/foo' + >>> _DropCommonSuffixes('foo/foo_internal.h') + 'foo/foo' + >>> _DropCommonSuffixes('foo/foo_unusualinternal.h') + 'foo/foo_unusualinternal' + + Args: + filename: The input filename. + + Returns: + The filename with the common suffix removed. + """ + for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', + 'inl.h', 'impl.h', 'internal.h'): + if (filename.endswith(suffix) and len(filename) > len(suffix) and + filename[-len(suffix) - 1] in ('-', '_')): + return filename[:-len(suffix) - 1] + return os.path.splitext(filename)[0] + + +def _IsTestFilename(filename): + """Determines if the given filename has a suffix that identifies it as a test. + + Args: + filename: The input filename. + + Returns: + True if 'filename' looks like a test, False otherwise. + """ + if (filename.endswith('_test.cc') or + filename.endswith('_unittest.cc') or + filename.endswith('_regtest.cc')): + return True + else: + return False + + +def _ClassifyInclude(fileinfo, include, is_system): + """Figures out what kind of header 'include' is. + + Args: + fileinfo: The current file cpplint is running over. A FileInfo instance. + include: The path to a #included file. + is_system: True if the #include used <> rather than "". + + Returns: + One of the _XXX_HEADER constants. + + For example: + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True) + _C_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True) + _CPP_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False) + _LIKELY_MY_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'), + ... 
'bar/foo_other_ext.h', False) + _POSSIBLE_MY_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False) + _OTHER_HEADER + """ + # This is a list of all standard c++ header files, except + # those already checked for above. + is_cpp_h = include in _CPP_HEADERS + + if is_system: + if is_cpp_h: + return _CPP_SYS_HEADER + else: + return _C_SYS_HEADER + + # If the target file and the include we're checking share a + # basename when we drop common extensions, and the include + # lives in . , then it's likely to be owned by the target file. + target_dir, target_base = ( + os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName()))) + include_dir, include_base = os.path.split(_DropCommonSuffixes(include)) + if target_base == include_base and ( + include_dir == target_dir or + include_dir == os.path.normpath(target_dir + '/../public')): + return _LIKELY_MY_HEADER + + # If the target and include share some initial basename + # component, it's possible the target is implementing the + # include, so it's allowed to be first, but we'll never + # complain if it's not there. + target_first_component = _RE_FIRST_COMPONENT.match(target_base) + include_first_component = _RE_FIRST_COMPONENT.match(include_base) + if (target_first_component and include_first_component and + target_first_component.group(0) == + include_first_component.group(0)): + return _POSSIBLE_MY_HEADER + + return _OTHER_HEADER + + + +def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): + """Check rules that are applicable to #include lines. + + Strings on #include lines are NOT removed from elided line, to make + certain tasks easier. However, to prevent false positives, checks + applicable to #include lines in CheckLanguage must be put here. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + include_state: An _IncludeState instance in which the headers are inserted. + error: The function to call with any errors found. + """ + fileinfo = FileInfo(filename) + + line = clean_lines.lines[linenum] + + # "include" should use the new style "foo/bar.h" instead of just "bar.h" + if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line): + error(filename, linenum, 'build/include', 4, + 'Include the directory when naming .h files') + + # we shouldn't include a file more than once. actually, there are a + # handful of instances where doing so is okay, but in general it's + # not. + match = _RE_PATTERN_INCLUDE.search(line) + if match: + include = match.group(2) + is_system = (match.group(1) == '<') + if include in include_state: + error(filename, linenum, 'build/include', 4, + '"%s" already included at %s:%s' % + (include, filename, include_state[include])) + else: + include_state[include] = linenum + + # We want to ensure that headers appear in the right order: + # 1) for foo.cc, foo.h (preferred location) + # 2) c system files + # 3) cpp system files + # 4) for foo.cc, foo.h (deprecated location) + # 5) other google headers + # + # We classify each include statement as one of those 5 types + # using a number of techniques. The include_state object keeps + # track of the highest type seen, and complains if we see a + # lower type after that. + error_message = include_state.CheckNextIncludeOrder( + _ClassifyInclude(fileinfo, include, is_system)) + if error_message: + error(filename, linenum, 'build/include_order', 4, + '%s. Should be: %s.h, c system, c++ system, other.' 
%
+              (error_message, fileinfo.BaseName()))
+    canonical_include = include_state.CanonicalizeAlphabeticalOrder(include)
+    if not include_state.IsInAlphabeticalOrder(
+        clean_lines, linenum, canonical_include):
+      error(filename, linenum, 'build/include_alpha', 4,
+            'Include "%s" not in alphabetical order' % include)
+    include_state.SetLastHeader(canonical_include)
+
+  # Look for any of the stream classes that are part of standard C++.
+  match = _RE_PATTERN_INCLUDE.match(line)
+  if match:
+    include = match.group(2)
+    if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include):
+      # Many unit tests use cout, so we exempt them.
+      if not _IsTestFilename(filename):
+        error(filename, linenum, 'readability/streams', 3,
+              'Streams are highly discouraged.')
+
+
+def _GetTextInside(text, start_pattern):
+  r"""Retrieves all the text between matching open and close parentheses.
+
+  Given a string of lines and a regular expression string, retrieve all the
+  text following the expression and between opening punctuation symbols like
+  (, [, or {, and the matching close-punctuation symbol.  This properly
+  handles nested occurrences of the punctuations, so for the text like
+    printf(a(), b(c()));
+  a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'.
+  start_pattern must match a string having an open punctuation symbol at
+  the end.
+
+  Args:
+    text: The lines to extract text.  Its comments and strings must be elided.
+          It can be a single line or span multiple lines.
+    start_pattern: The regexp string indicating where to start extracting
+                   the text.
+  Returns:
+    The extracted text.
+    None if either the opening string or ending punctuation could not be found.
+  """
+  # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably
+  # rewritten to use _GetTextInside (and use inferior regexp matching today).
+
+  # Map opening punctuations to the matching close-punctuations.
+  matching_punctuation = {'(': ')', '{': '}', '[': ']'}
+  closing_punctuation = set(matching_punctuation.itervalues())
+
+  # Find the position to start extracting text.
+  match = re.search(start_pattern, text, re.M)
+  if not match:  # start_pattern not found in text.
+    return None
+  start_position = match.end(0)
+
+  assert start_position > 0, (
+      'start_pattern must end with an opening punctuation.')
+  assert text[start_position - 1] in matching_punctuation, (
+      'start_pattern must end with an opening punctuation.')
+  # Stack of closing punctuations we expect to have in text after position.
+  punctuation_stack = [matching_punctuation[text[start_position - 1]]]
+  position = start_position
+  while punctuation_stack and position < len(text):
+    if text[position] == punctuation_stack[-1]:
+      punctuation_stack.pop()
+    elif text[position] in closing_punctuation:
+      # A closing punctuation without matching opening punctuations.
+      return None
+    elif text[position] in matching_punctuation:
+      punctuation_stack.append(matching_punctuation[text[position]])
+    position += 1
+  if punctuation_stack:
+    # Opening punctuations left without matching close-punctuations.
+    return None
+  # punctuations match.
+  return text[start_position:position - 1]
+
+
+# Patterns for matching call-by-reference parameters.
+#
+# Supports nested templates up to 2 levels deep using this messy pattern:
+#   < (?: < (?: < [^<>]*
+#             >
+#           | [^<>] )*
+#         >
+#       | [^<>] )*
+#   >
+_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*'  # =~ [[:alpha:]][[:alnum:]]*
+_RE_PATTERN_TYPE = (
+    r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?'
+ r'(?:\w|' + r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|' + r'::)+') +# A call-by-reference parameter ends with '& identifier'. +_RE_PATTERN_REF_PARAM = re.compile( + r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*' + r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]') +# A call-by-const-reference parameter either ends with 'const& identifier' +# or looks like 'const type& identifier' when 'type' is atomic. +_RE_PATTERN_CONST_REF_PARAM = ( + r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + + r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') + + +def CheckLanguage(filename, clean_lines, linenum, file_extension, + include_state, nesting_state, error): + """Checks rules from the 'C++ language rules' section of cppguide.html. + + Some of these rules are hard to test (function overloading, using + uint32 inappropriately), but we do the best we can. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + file_extension: The extension (without the dot) of the filename. + include_state: An _IncludeState instance in which the headers are inserted. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + # If the line is empty or consists of entirely a comment, no need to + # check it. + line = clean_lines.elided[linenum] + if not line: + return + + match = _RE_PATTERN_INCLUDE.search(line) + if match: + CheckIncludeLine(filename, clean_lines, linenum, include_state, error) + return + + # Reset include state across preprocessor directives. This is meant + # to silence warnings for conditional includes. + if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line): + include_state.ResetSection() + + # Make Windows paths like Unix. + fullname = os.path.abspath(filename).replace('\\', '/') + + # TODO(unknown): figure out if they're using default arguments in fn proto. + + # Check to see if they're using an conversion function cast. + # I just try to capture the most common basic types, though there are more. + # Parameterless conversion functions, such as bool(), are allowed as they are + # probably a member operator declaration or default constructor. + match = Search( + r'(\bnew\s+)?\b' # Grab 'new' operator, if it's there + r'(int|float|double|bool|char|int32|uint32|int64|uint64)' + r'(\([^)].*)', line) + if match: + matched_new = match.group(1) + matched_type = match.group(2) + matched_funcptr = match.group(3) + + # gMock methods are defined using some variant of MOCK_METHODx(name, type) + # where type may be float(), int(string), etc. Without context they are + # virtually indistinguishable from int(x) casts. Likewise, gMock's + # MockCallback takes a template parameter of the form return_type(arg_type), + # which looks much like the cast we're trying to detect. + # + # std::function<> wrapper has a similar problem. + # + # Return types for function pointers also look like casts if they + # don't have an extra space. 
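+    # For example (hypothetical inputs), "int(*func)(int);" is accepted as
+    # a function-pointer declaration by the pattern below, while a bare
+    # "int(x)" still draws the deprecated-cast warning.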
+ if (matched_new is None and # If new operator, then this isn't a cast + not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or + Search(r'\bMockCallback<.*>', line) or + Search(r'\bstd::function<.*>', line)) and + not (matched_funcptr and + Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(', + matched_funcptr))): + # Try a bit harder to catch gmock lines: the only place where + # something looks like an old-style cast is where we declare the + # return type of the mocked method, and the only time when we + # are missing context is if MOCK_METHOD was split across + # multiple lines. The missing MOCK_METHOD is usually one or two + # lines back, so scan back one or two lines. + # + # It's not possible for gmock macros to appear in the first 2 + # lines, since the class head + section name takes up 2 lines. + if (linenum < 2 or + not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$', + clean_lines.elided[linenum - 1]) or + Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$', + clean_lines.elided[linenum - 2]))): + error(filename, linenum, 'readability/casting', 4, + 'Using deprecated casting style. ' + 'Use static_cast<%s>(...) instead' % + matched_type) + + CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'static_cast', + r'\((int|float|double|bool|char|u?int(16|32|64))\)', error) + + # This doesn't catch all cases. Consider (const char * const)"hello". + # + # (char *) "foo" should always be a const_cast (reinterpret_cast won't + # compile). + if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'const_cast', r'\((char\s?\*+\s?)\)\s*"', error): + pass + else: + # Check pointer casts for other than string constants + CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum], + 'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error) + + # In addition, we look for people taking the address of a cast. This + # is dangerous -- casts can assign to temporaries, so the pointer doesn't + # point where you think. + match = Search( + r'(?:&\(([^)]+)\)[\w(])|' + r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line) + if match and match.group(1) != '*': + error(filename, linenum, 'runtime/casting', 4, + ('Are you taking an address of a cast? ' + 'This is dangerous: could be a temp var. ' + 'Take the address before doing the cast, rather than after')) + + # Create an extended_line, which is the concatenation of the current and + # next lines, for more effective checking of code that may span more than one + # line. + if linenum + 1 < clean_lines.NumLines(): + extended_line = line + clean_lines.elided[linenum + 1] + else: + extended_line = line + + # Check for people declaring static/global STL strings at the top level. + # This is dangerous because the C++ language does not guarantee that + # globals with constructors are initialized before the first access. + match = Match( + r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', + line) + # Make sure it's not a function. + # Function template specialization looks like: "string foo(...". + # Class template definitions look like: "string Foo::Method(...". + # + # Also ignore things that look like operators. These are matched separately + # because operator names cross non-word boundaries. If we change the pattern + # above, we would decrease the accuracy of matching identifiers. 
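+  # For example (hypothetical inputs), 'static const string kName = "x";'
+  # is flagged below, while 'string Foo::Method(' looks like a function
+  # declaration and is skipped.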
+ if (match and + not Search(r'\boperator\W', line) and + not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))): + error(filename, linenum, 'runtime/string', 4, + 'For a static/global string constant, use a C style string instead: ' + '"%schar %s[]".' % + (match.group(1), match.group(2))) + + if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): + error(filename, linenum, 'runtime/init', 4, + 'You seem to be initializing a member variable with itself.') + + if file_extension == 'h': + # TODO(unknown): check that 1-arg constructors are explicit. + # How to tell it's a constructor? + # (handled in CheckForNonStandardConstructs for now) + # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS + # (level 1 error) + pass + + # Check if people are using the verboten C basic types. The only exception + # we regularly allow is "unsigned short port" for port. + if Search(r'\bshort port\b', line): + if not Search(r'\bunsigned short port\b', line): + error(filename, linenum, 'runtime/int', 4, + 'Use "unsigned short" for ports, not "short"') + else: + match = Search(r'\b(short|long(?! +double)|long long)\b', line) + if match: + error(filename, linenum, 'runtime/int', 4, + 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) + + # When snprintf is used, the second argument shouldn't be a literal. + match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) + if match and match.group(2) != '0': + # If 2nd arg is zero, snprintf is used to calculate size. + error(filename, linenum, 'runtime/printf', 3, + 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' + 'to snprintf.' % (match.group(1), match.group(2))) + + # Check if some verboten C functions are being used. + if Search(r'\bsprintf\b', line): + error(filename, linenum, 'runtime/printf', 5, + 'Never use sprintf. Use snprintf instead.') + match = Search(r'\b(strcpy|strcat)\b', line) + if match: + error(filename, linenum, 'runtime/printf', 4, + 'Almost always, snprintf is better than %s' % match.group(1)) + + # Check if some verboten operator overloading is going on + # TODO(unknown): catch out-of-line unary operator&: + # class X {}; + # int operator&(const X& x) { return 42; } // unary operator& + # The trick is it's hard to tell apart from binary operator&: + # class Y { int operator&(const Y& x) { return 23; } }; // binary operator& + if Search(r'\boperator\s*&\s*\(\s*\)', line): + error(filename, linenum, 'runtime/operator', 4, + 'Unary operator& is dangerous. Do not use it.') + + # Check for suspicious usage of "if" like + # } if (a == b) { + if Search(r'\}\s*if\s*\(', line): + error(filename, linenum, 'readability/braces', 4, + 'Did you mean "else if"? If not, start a new line for "if".') + + # Check for potential format string bugs like printf(foo). + # We constrain the pattern not to pick things like DocidForPrintf(foo). + # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) + # TODO(sugawarayu): Catch the following case. Need to change the calling + # convention of the whole function to process multiple line to handle it. + # printf( + # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); + printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(') + if printf_args: + match = Match(r'([\w.\->()]+)$', printf_args) + if match and match.group(1) != '__VA_ARGS__': + function_name = re.search(r'\b((?:string)?printf)\s*\(', + line, re.I).group(1) + error(filename, linenum, 'runtime/printf', 4, + 'Potential format string bug. Do %s("%%s", %s) instead.' 
+ % (function_name, match.group(1))) + + # Check for potential memset bugs like memset(buf, sizeof(buf), 0). + match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line) + if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)): + error(filename, linenum, 'runtime/memset', 4, + 'Did you mean "memset(%s, 0, %s)"?' + % (match.group(1), match.group(2))) + + if Search(r'\busing namespace\b', line): + error(filename, linenum, 'build/namespaces', 5, + 'Do not use namespace using-directives. ' + 'Use using-declarations instead.') + + # Detect variable-length arrays. + match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line) + if (match and match.group(2) != 'return' and match.group(2) != 'delete' and + match.group(3).find(']') == -1): + # Split the size using space and arithmetic operators as delimiters. + # If any of the resulting tokens are not compile time constants then + # report the error. + tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3)) + is_const = True + skip_next = False + for tok in tokens: + if skip_next: + skip_next = False + continue + + if Search(r'sizeof\(.+\)', tok): continue + if Search(r'arraysize\(\w+\)', tok): continue + + tok = tok.lstrip('(') + tok = tok.rstrip(')') + if not tok: continue + if Match(r'\d+', tok): continue + if Match(r'0[xX][0-9a-fA-F]+', tok): continue + if Match(r'k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue + # A catch all for tricky sizeof cases, including 'sizeof expression', + # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)' + # requires skipping the next token because we split on ' ' and '*'. + if tok.startswith('sizeof'): + skip_next = True + continue + is_const = False + break + if not is_const: + error(filename, linenum, 'runtime/arrays', 1, + 'Do not use variable-length arrays. Use an appropriately named ' + "('k' followed by CamelCase) compile-time constant for the size.") + + # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or + # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing + # in the class declaration. + match = Match( + (r'\s*' + r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))' + r'\(.*\);$'), + line) + if match and linenum + 1 < clean_lines.NumLines(): + next_line = clean_lines.elided[linenum + 1] + # We allow some, but not all, declarations of variables to be present + # in the statement that defines the class. The [\w\*,\s]* fragment of + # the regular expression below allows users to declare instances of + # the class or pointers to instances, but not less common types such + # as function pointers or arrays. It's a tradeoff between allowing + # reasonable code and avoiding trying to parse more C++ using regexps. + if not Search(r'^\s*}[\w\*,\s]*;', next_line): + error(filename, linenum, 'readability/constructors', 3, + match.group(1) + ' should be the last thing in the class') + + # Check for use of unnamed namespaces in header files. Registration + # macros are typically OK, so we allow use of "namespace {" on lines + # that end with backslashes. + if (file_extension == 'h' + and Search(r'\bnamespace\s*{', line) + and line[-1] != '\\'): + error(filename, linenum, 'build/namespaces', 4, + 'Do not use unnamed namespaces in header files. 
See '
+          'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
+          ' for more information.')
+
+
+def CheckForNonConstReference(filename, clean_lines, linenum,
+                              nesting_state, error):
+  """Check for non-const references.
+
+  Separate from CheckLanguage since it scans backwards from current
+  line, instead of scanning forward.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A _NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  # Do nothing if there is no '&' on current line.
+  line = clean_lines.elided[linenum]
+  if '&' not in line:
+    return
+
+  # Long type names may be broken across multiple lines, usually in one
+  # of these forms:
+  #   LongType
+  #       ::LongTypeContinued &identifier
+  #   LongType::
+  #       LongTypeContinued &identifier
+  #   LongType<
+  #       ...>::LongTypeContinued &identifier
+  #
+  # If we detected a type split across two lines, join the previous
+  # line to current line so that we can match const references
+  # accordingly.
+  #
+  # Note that this only scans back one line, since scanning back
+  # arbitrary number of lines would be expensive. If you have a type
+  # that spans more than 2 lines, please use a typedef.
+  if linenum > 1:
+    previous = None
+    if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line):
+      # previous_line\n + ::current_line
+      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$',
+                        clean_lines.elided[linenum - 1])
+    elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line):
+      # previous_line::\n + current_line
+      previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$',
+                        clean_lines.elided[linenum - 1])
+    if previous:
+      line = previous.group(1) + line.lstrip()
+    else:
+      # Check for templated parameter that is split across multiple lines
+      endpos = line.rfind('>')
+      if endpos > -1:
+        (_, startline, startpos) = ReverseCloseExpression(
+            clean_lines, linenum, endpos)
+        if startpos > -1 and startline < linenum:
+          # Found the matching < on an earlier line, collect all
+          # pieces up to current line.
+          line = ''
+          for i in xrange(startline, linenum + 1):
+            line += clean_lines.elided[i].strip()
+
+  # Check for non-const references in function parameters. A single '&' may
+  # be found in the following places:
+  #   inside expression: binary & for bitwise AND
+  #   inside expression: unary & for taking the address of something
+  #   inside declarators: reference parameter
+  # We will exclude the first two cases by checking that we are not inside a
+  # function body, including one that was just introduced by a trailing '{'.
+  # TODO(unknown): Doesn't account for preprocessor directives.
+  # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
+  check_params = False
+  if not nesting_state.stack:
+    check_params = True  # top level
+  elif (isinstance(nesting_state.stack[-1], _ClassInfo) or
+        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
+    check_params = True  # within class or namespace
+  elif Match(r'.*{\s*$', line):
+    if (len(nesting_state.stack) == 1 or
+        isinstance(nesting_state.stack[-2], _ClassInfo) or
+        isinstance(nesting_state.stack[-2], _NamespaceInfo)):
+      check_params = True  # just opened global/class/namespace block
+  # We allow non-const references in a few standard places, like functions
+  # called "swap()" or iostream operators like "<<" or ">>". Do not check
+  # those function parameters.
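In essence, the parameter check below splits out each declarator, keeps the ones containing '&', and flags those not marked const. A deliberately simplified Python sketch of that idea (the real patterns above also handle multi-line and templated types; the function name is hypothetical):

def find_nonconst_refs(decl):
    # Naive split on ',' -- template arguments containing commas would
    # need the real parsing machinery above.
    params = decl[decl.index('(') + 1:decl.rindex(')')]
    flagged = []
    for param in (p.strip() for p in params.split(',')):
        if '&' in param and not param.startswith('const '):
            flagged.append(param)
    return flagged

print(find_nonconst_refs('void f(const string &a, int *b, Buffer &out)'))
# -> ['Buffer &out']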
+ # + # We also accept & in static_assert, which looks like a function but + # it's actually a declaration expression. + whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|' + r'operator\s*[<>][<>]|' + r'static_assert|COMPILE_ASSERT' + r')\s*\(') + if Search(whitelisted_functions, line): + check_params = False + elif not Search(r'\S+\([^)]*$', line): + # Don't see a whitelisted function on this line. Actually we + # didn't see any function name on this line, so this is likely a + # multi-line parameter list. Try a bit harder to catch this case. + for i in xrange(2): + if (linenum > i and + Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])): + check_params = False + break + + if check_params: + decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body + for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): + if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter): + error(filename, linenum, 'runtime/references', 2, + 'Is this a non-const reference? ' + 'If so, make const or use a pointer: ' + + ReplaceAll(' *<', '<', parameter)) + + +def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern, + error): + """Checks for a C-style cast by looking for the pattern. + + Args: + filename: The name of the current file. + linenum: The number of the line to check. + line: The line of code to check. + raw_line: The raw line of code to check, with comments. + cast_type: The string for the C++ cast to recommend. This is either + reinterpret_cast, static_cast, or const_cast, depending. + pattern: The regular expression used to find C-style casts. + error: The function to call with any errors found. + + Returns: + True if an error was emitted. + False otherwise. + """ + match = Search(pattern, line) + if not match: + return False + + # e.g., sizeof(int) + sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1]) + if sizeof_match: + error(filename, linenum, 'runtime/sizeof', 1, + 'Using sizeof(type). Use sizeof(varname) instead if possible') + return True + + # operator++(int) and operator--(int) + if (line[0:match.start(1) - 1].endswith(' operator++') or + line[0:match.start(1) - 1].endswith(' operator--')): + return False + + # A single unnamed argument for a function tends to look like old + # style cast. If we see those, don't issue warnings for deprecated + # casts, instead issue warnings for unnamed arguments where + # appropriate. + # + # These are things that we want warnings for, since the style guide + # explicitly require all parameters to be named: + # Function(int); + # Function(int) { + # ConstMember(int) const; + # ConstMember(int) const { + # ExceptionMember(int) throw (...); + # ExceptionMember(int) throw (...) { + # PureVirtual(int) = 0; + # + # These are functions of some sort, where the compiler would be fine + # if they had named parameters, but people often omit those + # identifiers to reduce clutter: + # (FunctionPointer)(int); + # (FunctionPointer)(int) = value; + # Function((function_pointer_arg)(int)) + # ; + # <(FunctionPointerTemplateArgument)(int)>; + remainder = line[match.end(0):] + if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder): + # Looks like an unnamed parameter. + + # Don't warn on any kind of template arguments. + if Match(r'^\s*>', remainder): + return False + + # Don't warn on assignments to function pointers, but keep warnings for + # unnamed parameters to pure virtual functions. 
Note that this pattern + # will also pass on assignments of "0" to function pointers, but the + # preferred values for those would be "nullptr" or "NULL". + matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder) + if matched_zero and matched_zero.group(1) != '0': + return False + + # Don't warn on function pointer declarations. For this we need + # to check what came before the "(type)" string. + if Match(r'.*\)\s*$', line[0:match.start(0)]): + return False + + # Don't warn if the parameter is named with block comments, e.g.: + # Function(int /*unused_param*/); + if '/*' in raw_line: + return False + + # Passed all filters, issue warning here. + error(filename, linenum, 'readability/function', 3, + 'All parameters should be named in a function') + return True + + # At this point, all that should be left is actual casts. + error(filename, linenum, 'readability/casting', 4, + 'Using C-style cast. Use %s<%s>(...) instead' % + (cast_type, match.group(1))) + + return True + + +_HEADERS_CONTAINING_TEMPLATES = ( + ('', ('deque',)), + ('', ('unary_function', 'binary_function', + 'plus', 'minus', 'multiplies', 'divides', 'modulus', + 'negate', + 'equal_to', 'not_equal_to', 'greater', 'less', + 'greater_equal', 'less_equal', + 'logical_and', 'logical_or', 'logical_not', + 'unary_negate', 'not1', 'binary_negate', 'not2', + 'bind1st', 'bind2nd', + 'pointer_to_unary_function', + 'pointer_to_binary_function', + 'ptr_fun', + 'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t', + 'mem_fun_ref_t', + 'const_mem_fun_t', 'const_mem_fun1_t', + 'const_mem_fun_ref_t', 'const_mem_fun1_ref_t', + 'mem_fun_ref', + )), + ('', ('numeric_limits',)), + ('', ('list',)), + ('', ('map', 'multimap',)), + ('', ('allocator',)), + ('', ('queue', 'priority_queue',)), + ('', ('set', 'multiset',)), + ('', ('stack',)), + ('', ('char_traits', 'basic_string',)), + ('', ('pair',)), + ('', ('vector',)), + + # gcc extensions. + # Note: std::hash is their hash, ::hash is our hash + ('', ('hash_map', 'hash_multimap',)), + ('', ('hash_set', 'hash_multiset',)), + ('', ('slist',)), + ) + +_RE_PATTERN_STRING = re.compile(r'\bstring\b') + +_re_pattern_algorithm_header = [] +for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap', + 'transform'): + # Match max(..., ...), max(..., ...), but not foo->max, foo.max or + # type::max(). + _re_pattern_algorithm_header.append( + (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'), + _template, + '')) + +_re_pattern_templates = [] +for _header, _templates in _HEADERS_CONTAINING_TEMPLATES: + for _template in _templates: + _re_pattern_templates.append( + (re.compile(r'(\<|\b)' + _template + r'\s*\<'), + _template + '<>', + _header)) + + +def FilesBelongToSameModule(filename_cc, filename_h): + """Check if these two filenames belong to the same module. + + The concept of a 'module' here is a as follows: + foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the + same 'module' if they are in the same directory. + some/path/public/xyzzy and some/path/internal/xyzzy are also considered + to belong to the same module here. + + If the filename_cc contains a longer path than the filename_h, for example, + '/absolute/path/to/base/sysinfo.cc', and this file would include + 'base/sysinfo.h', this function also produces the prefix needed to open the + header. This is used by the caller of this function to more robustly open the + header file. We don't have access to the real include paths in this context, + so we need this guesswork here. 
+
+  Known bugs: tools/base/bar.cc and base/bar.h belong to the same module
+  according to this implementation. Because of this, this function gives
+  some false positives. This should be sufficiently rare in practice.
+
+  Args:
+    filename_cc: is the path for the .cc file
+    filename_h: is the path for the header file
+
+  Returns:
+    Tuple with a bool and a string:
+    bool: True if filename_cc and filename_h belong to the same module.
+    string: the additional prefix needed to open the header file.
+  """
+
+  if not filename_cc.endswith('.cc'):
+    return (False, '')
+  filename_cc = filename_cc[:-len('.cc')]
+  if filename_cc.endswith('_unittest'):
+    filename_cc = filename_cc[:-len('_unittest')]
+  elif filename_cc.endswith('_test'):
+    filename_cc = filename_cc[:-len('_test')]
+  filename_cc = filename_cc.replace('/public/', '/')
+  filename_cc = filename_cc.replace('/internal/', '/')
+
+  if not filename_h.endswith('.h'):
+    return (False, '')
+  filename_h = filename_h[:-len('.h')]
+  if filename_h.endswith('-inl'):
+    filename_h = filename_h[:-len('-inl')]
+  filename_h = filename_h.replace('/public/', '/')
+  filename_h = filename_h.replace('/internal/', '/')
+
+  files_belong_to_same_module = filename_cc.endswith(filename_h)
+  common_path = ''
+  if files_belong_to_same_module:
+    common_path = filename_cc[:-len(filename_h)]
+  return files_belong_to_same_module, common_path
+
+
+def UpdateIncludeState(filename, include_state, io=codecs):
+  """Fill up the include_state with new includes found from the file.
+
+  Args:
+    filename: the name of the header to read.
+    include_state: an _IncludeState instance in which the headers are inserted.
+    io: The io factory to use to read the file. Provided for testability.
+
+  Returns:
+    True if a header was successfully added. False otherwise.
+  """
+  headerfile = None
+  try:
+    headerfile = io.open(filename, 'r', 'utf8', 'replace')
+  except IOError:
+    return False
+  linenum = 0
+  for line in headerfile:
+    linenum += 1
+    clean_line = CleanseComments(line)
+    match = _RE_PATTERN_INCLUDE.search(clean_line)
+    if match:
+      include = match.group(2)
+      # The value formatting is cute, but not really used right now.
+      # What matters here is that the key is in include_state.
+      include_state.setdefault(include, '%s:%d' % (filename, linenum))
+  return True
+
+
+def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
+                              io=codecs):
+  """Reports missing STL includes.
+
+  This function will output warnings to make sure you are including the headers
+  necessary for the stl containers and functions that you use. We only give one
+  reason to include a header. For example, if you use both equal_to<> and
+  less<> in a .h file, only one (the latter in the file) of these will be
+  reported as a reason to include the <functional>.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    include_state: An _IncludeState instance.
+    error: The function to call with any errors found.
+    io: The IO factory to use to read the header file. Provided for unittest
+        injection.
+  """
+  required = {}  # A map of header name to linenumber and the template entity.
+                 # Example of required: { '<functional>': (1219, 'less<>') }
+
+  for linenum in xrange(clean_lines.NumLines()):
+    line = clean_lines.elided[linenum]
+    if not line or line[0] == '#':
+      continue
+
+    # String is special -- it is a non-templatized type in STL.
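For intuition, here is how FilesBelongToSameModule (defined above) behaves on a few hypothetical paths:

print(FilesBelongToSameModule('base/sysinfo_test.cc', 'base/sysinfo.h'))
# -> (True, '')
print(FilesBelongToSameModule('/absolute/path/to/base/sysinfo.cc',
                              'base/sysinfo.h'))
# -> (True, '/absolute/path/to/')
print(FilesBelongToSameModule('foo/bar.cc', 'foo/baz.h'))
# -> (False, '')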
+    matched = _RE_PATTERN_STRING.search(line)
+    if matched:
+      # Don't warn about strings in non-STL namespaces:
+      # (We check only the first match per line; good enough.)
+      prefix = line[:matched.start()]
+      if prefix.endswith('std::') or not prefix.endswith('::'):
+        required['<string>'] = (linenum, 'string')
+
+    for pattern, template, header in _re_pattern_algorithm_header:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+    # The following function is just a speed up, no semantics are changed.
+    if not '<' in line:  # Reduces the cpu time usage by skipping lines.
+      continue
+
+    for pattern, template, header in _re_pattern_templates:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+  # The policy is that if you #include something in foo.h you don't need to
+  # include it again in foo.cc. Here, we will look at possible includes.
+  # Let's copy the include_state so it is only messed up within this function.
+  include_state = include_state.copy()
+
+  # Did we find the header for this file (if any) and successfully load it?
+  header_found = False
+
+  # Use the absolute path so that matching works properly.
+  abs_filename = FileInfo(filename).FullName()
+
+  # For Emacs's flymake.
+  # If cpplint is invoked from Emacs's flymake, a temporary file is generated
+  # by flymake and that file name might end with '_flymake.cc'. In that case,
+  # restore original file name here so that the corresponding header file can be
+  # found.
+  # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h'
+  # instead of 'foo_flymake.h'
+  abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
+
+  # include_state is modified during iteration, so we iterate over a copy of
+  # the keys.
+  header_keys = include_state.keys()
+  for header in header_keys:
+    (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
+    fullpath = common_path + header
+    if same_module and UpdateIncludeState(fullpath, include_state, io):
+      header_found = True
+
+  # If we can't find the header file for a .cc, assume it's because we don't
+  # know where to look. In that case we'll give up as we're not sure they
+  # didn't include it in the .h file.
+  # TODO(unknown): Do a better job of finding .h files so we are confident that
+  # not having the .h file means there isn't one.
+  if filename.endswith('.cc') and not header_found:
+    return
+
+  # All the lines have been processed, report the errors found.
+  for required_header_unstripped in required:
+    template = required[required_header_unstripped][1]
+    if required_header_unstripped.strip('<>"') not in include_state:
+      error(filename, required[required_header_unstripped][0],
+            'build/include_what_you_use', 4,
+            'Add #include ' + required_header_unstripped + ' for ' + template)
+
+
+_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<')
+
+
+def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
+  """Check that make_pair's template arguments are deduced.
+
+  G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are
+  specified explicitly, and such use isn't intended in any case.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+ """ + line = clean_lines.elided[linenum] + match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line) + if match: + error(filename, linenum, 'build/explicit_make_pair', + 4, # 4 = high confidence + 'For C++11-compatibility, omit template arguments from make_pair' + ' OR use pair directly OR if appropriate, construct a pair directly') + + +def ProcessLine(filename, file_extension, clean_lines, line, + include_state, function_state, nesting_state, error, + extra_check_functions=[]): + """Processes a single line in the file. + + Args: + filename: Filename of the file that is being processed. + file_extension: The extension (dot not included) of the file. + clean_lines: An array of strings, each representing a line of the file, + with comments stripped. + line: Number of line being processed. + include_state: An _IncludeState instance in which the headers are inserted. + function_state: A _FunctionState instance which counts function lines, etc. + nesting_state: A _NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: A callable to which errors are reported, which takes 4 arguments: + filename, line number, error level, and message + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + raw_lines = clean_lines.raw_lines + ParseNolintSuppressions(filename, raw_lines[line], line, error) + nesting_state.Update(filename, clean_lines, line, error) + if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM: + return + CheckForFunctionLengths(filename, clean_lines, line, function_state, error) + CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) + CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) + CheckLanguage(filename, clean_lines, line, file_extension, include_state, + nesting_state, error) + CheckForNonConstReference(filename, clean_lines, line, nesting_state, error) + CheckForNonStandardConstructs(filename, clean_lines, line, + nesting_state, error) + CheckVlogArguments(filename, clean_lines, line, error) + CheckPosixThreading(filename, clean_lines, line, error) + CheckInvalidIncrement(filename, clean_lines, line, error) + CheckMakePairUsesDeduction(filename, clean_lines, line, error) + for check_fn in extra_check_functions: + check_fn(filename, clean_lines, line, error) + +def ProcessFileData(filename, file_extension, lines, error, + extra_check_functions=[]): + """Performs lint checks and reports any errors to the given error function. + + Args: + filename: Filename of the file that is being processed. + file_extension: The extension (dot not included) of the file. + lines: An array of strings, each representing a line of the file, with the + last element being empty if the file is terminated with a newline. + error: A callable to which errors are reported, which takes 4 arguments: + filename, line number, error level, and message + extra_check_functions: An array of additional check functions that will be + run on each source line. 
Each function takes 4 + arguments: filename, clean_lines, line, error + """ + lines = (['// marker so line numbers and indices both start at 1'] + lines + + ['// marker so line numbers end in a known way']) + + include_state = _IncludeState() + function_state = _FunctionState() + nesting_state = _NestingState() + + ResetNolintSuppressions() + + CheckForCopyright(filename, lines, error) + + if file_extension == 'h': + CheckForHeaderGuard(filename, lines, error) + + RemoveMultiLineComments(filename, lines, error) + clean_lines = CleansedLines(lines) + for line in xrange(clean_lines.NumLines()): + ProcessLine(filename, file_extension, clean_lines, line, + include_state, function_state, nesting_state, error, + extra_check_functions) + nesting_state.CheckCompletedBlocks(filename, error) + + CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) + + # We check here rather than inside ProcessLine so that we see raw + # lines rather than "cleaned" lines. + CheckForBadCharacters(filename, lines, error) + + CheckForNewlineAtEOF(filename, lines, error) + +def ProcessFile(filename, vlevel, extra_check_functions=[]): + """Does google-lint on a single file. + + Args: + filename: The name of the file to parse. + + vlevel: The level of errors to report. Every error of confidence + >= verbose_level will be reported. 0 is a good default. + + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + + _SetVerboseLevel(vlevel) + + try: + # Support the UNIX convention of using "-" for stdin. Note that + # we are not opening the file with universal newline support + # (which codecs doesn't support anyway), so the resulting lines do + # contain trailing '\r' characters if we are reading a file that + # has CRLF endings. + # If after the split a trailing '\r' is present, it is removed + # below. If it is not expected to be present (i.e. os.linesep != + # '\r\n' as in Windows), a warning is issued below if this file + # is processed. + + if filename == '-': + lines = codecs.StreamReaderWriter(sys.stdin, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace').read().split('\n') + else: + lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') + + carriage_return_found = False + # Remove trailing '\r'. + for linenum in range(len(lines)): + if lines[linenum].endswith('\r'): + lines[linenum] = lines[linenum].rstrip('\r') + carriage_return_found = True + + except IOError: + sys.stderr.write( + "Skipping input '%s': Can't open for reading\n" % filename) + return + + # Note, if no dot is found, this will give the entire filename as the ext. + file_extension = filename[filename.rfind('.') + 1:] + + # When reading from stdin, the extension is unknown, so no cpplint tests + # should rely on the extension. + if filename != '-' and file_extension not in _valid_extensions: + sys.stderr.write('Ignoring %s; not a valid file name ' + '(%s)\n' % (filename, ', '.join(_valid_extensions))) + else: + ProcessFileData(filename, file_extension, lines, Error, + extra_check_functions) + if carriage_return_found and os.linesep != '\r\n': + # Use 0 for linenum since outputting only one error for potentially + # several lines. 
+    Error(filename, 0, 'whitespace/newline', 1,
+          'One or more unexpected \\r (^M) found; '
+          'better to use only a \\n')
+
+  sys.stderr.write('Done processing %s\n' % filename)
+
+
+def PrintUsage(message):
+  """Prints a brief usage string and exits, optionally with an error message.
+
+  Args:
+    message: The optional error message.
+  """
+  sys.stderr.write(_USAGE)
+  if message:
+    sys.exit('\nFATAL ERROR: ' + message)
+  else:
+    sys.exit(1)
+
+
+def PrintCategories():
+  """Prints a list of all the error-categories used by error messages.
+
+  These are the categories used to filter messages via --filter.
+  """
+  sys.stderr.write(''.join(' %s\n' % cat for cat in _ERROR_CATEGORIES))
+  sys.exit(0)
+
+
+def ParseArguments(args):
+  """Parses the command line arguments.
+
+  This may set the output format and verbosity level as side-effects.
+
+  Args:
+    args: The command line arguments.
+
+  Returns:
+    The list of filenames to lint.
+  """
+  try:
+    (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=',
+                                                 'counting=',
+                                                 'filter=',
+                                                 'root=',
+                                                 'linelength=',
+                                                 'extensions='])
+  except getopt.GetoptError:
+    PrintUsage('Invalid arguments.')
+
+  verbosity = _VerboseLevel()
+  output_format = _OutputFormat()
+  filters = ''
+  counting_style = ''
+
+  for (opt, val) in opts:
+    if opt == '--help':
+      PrintUsage(None)
+    elif opt == '--output':
+      if val not in ('emacs', 'vs7', 'eclipse'):
+        PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
+      output_format = val
+    elif opt == '--verbose':
+      verbosity = int(val)
+    elif opt == '--filter':
+      filters = val
+      if not filters:
+        PrintCategories()
+    elif opt == '--counting':
+      if val not in ('total', 'toplevel', 'detailed'):
+        PrintUsage('Valid counting options are total, toplevel, and detailed')
+      counting_style = val
+    elif opt == '--root':
+      global _root
+      _root = val
+    elif opt == '--linelength':
+      global _line_length
+      try:
+        _line_length = int(val)
+      except ValueError:
+        PrintUsage('Line length must be digits.')
+    elif opt == '--extensions':
+      global _valid_extensions
+      try:
+        _valid_extensions = set(val.split(','))
+      except ValueError:
+        PrintUsage('Extensions must be comma separated list.')
+
+  if not filenames:
+    PrintUsage('No files were specified.')
+
+  _SetOutputFormat(output_format)
+  _SetVerboseLevel(verbosity)
+  _SetFilters(filters)
+  _SetCountingStyle(counting_style)
+
+  return filenames
+
+
+def main():
+  filenames = ParseArguments(sys.argv[1:])
+
+  # Change stderr to write with replacement characters so we don't die
+  # if we try to print something containing non-ASCII characters.
+  sys.stderr = codecs.StreamReaderWriter(sys.stderr,
+                                         codecs.getreader('utf8'),
+                                         codecs.getwriter('utf8'),
+                                         'replace')
+
+  _cpplint_state.ResetErrorCounts()
+  for filename in filenames:
+    ProcessFile(filename, _cpplint_state.verbose_level)
+  _cpplint_state.PrintErrorCounts()
+
+  sys.exit(_cpplint_state.error_count > 0)
+
+
+if __name__ == '__main__':
+  main()
diff --git a/libs/libaom/src/tools/diff.py b/libs/libaom/src/tools/diff.py
new file mode 100644
index 000000000..bac6aabdc
--- /dev/null
+++ b/libs/libaom/src/tools/diff.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software.
If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +"""Classes for representing diff pieces.""" + +__author__ = "jkoleszar@google.com" + +import re + + +class DiffLines(object): + """A container for one half of a diff.""" + + def __init__(self, filename, offset, length): + self.filename = filename + self.offset = offset + self.length = length + self.lines = [] + self.delta_line_nums = [] + + def Append(self, line): + l = len(self.lines) + if line[0] != " ": + self.delta_line_nums.append(self.offset + l) + self.lines.append(line[1:]) + assert l+1 <= self.length + + def Complete(self): + return len(self.lines) == self.length + + def __contains__(self, item): + return item >= self.offset and item <= self.offset + self.length - 1 + + +class DiffHunk(object): + """A container for one diff hunk, consisting of two DiffLines.""" + + def __init__(self, header, file_a, file_b, start_a, len_a, start_b, len_b): + self.header = header + self.left = DiffLines(file_a, start_a, len_a) + self.right = DiffLines(file_b, start_b, len_b) + self.lines = [] + + def Append(self, line): + """Adds a line to the DiffHunk and its DiffLines children.""" + if line[0] == "-": + self.left.Append(line) + elif line[0] == "+": + self.right.Append(line) + elif line[0] == " ": + self.left.Append(line) + self.right.Append(line) + elif line[0] == "\\": + # Ignore newline messages from git diff. + pass + else: + assert False, ("Unrecognized character at start of diff line " + "%r" % line[0]) + self.lines.append(line) + + def Complete(self): + return self.left.Complete() and self.right.Complete() + + def __repr__(self): + return "DiffHunk(%s, %s, len %d)" % ( + self.left.filename, self.right.filename, + max(self.left.length, self.right.length)) + + +def ParseDiffHunks(stream): + """Walk a file-like object, yielding DiffHunks as they're parsed.""" + + file_regex = re.compile(r"(\+\+\+|---) (\S+)") + range_regex = re.compile(r"@@ -(\d+)(,(\d+))? \+(\d+)(,(\d+))?") + hunk = None + while True: + line = stream.readline() + if not line: + break + + if hunk is None: + # Parse file names + diff_file = file_regex.match(line) + if diff_file: + if line.startswith("---"): + a_line = line + a = diff_file.group(2) + continue + if line.startswith("+++"): + b_line = line + b = diff_file.group(2) + continue + + # Parse offset/lengths + diffrange = range_regex.match(line) + if diffrange: + if diffrange.group(2): + start_a = int(diffrange.group(1)) + len_a = int(diffrange.group(3)) + else: + start_a = 1 + len_a = int(diffrange.group(1)) + + if diffrange.group(5): + start_b = int(diffrange.group(4)) + len_b = int(diffrange.group(6)) + else: + start_b = 1 + len_b = int(diffrange.group(4)) + + header = [a_line, b_line, line] + hunk = DiffHunk(header, a, b, start_a, len_a, start_b, len_b) + else: + # Add the current line to the hunk + hunk.Append(line) + + # See if the whole hunk has been parsed. If so, yield it and prepare + # for the next hunk. + if hunk.Complete(): + yield hunk + hunk = None + + # Partial hunks are a parse error + assert hunk is None diff --git a/libs/libaom/src/tools/dump_obu.cc b/libs/libaom/src/tools/dump_obu.cc new file mode 100644 index 000000000..30ee5e7a1 --- /dev/null +++ b/libs/libaom/src/tools/dump_obu.cc @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. 
All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <string>
+
+#include "config/aom_config.h"
+
+#include "common/ivfdec.h"
+#include "common/obudec.h"
+#include "common/tools_common.h"
+#include "common/webmdec.h"
+#include "tools/obu_parser.h"
+
+namespace {
+
+const size_t kInitialBufferSize = 100 * 1024;
+
+struct InputContext {
+  InputContext() = default;
+  ~InputContext() { free(unit_buffer); }
+
+  void Init() {
+    memset(avx_ctx, 0, sizeof(*avx_ctx));
+    memset(obu_ctx, 0, sizeof(*obu_ctx));
+    obu_ctx->avx_ctx = avx_ctx;
+#if CONFIG_WEBM_IO
+    memset(webm_ctx, 0, sizeof(*webm_ctx));
+#endif
+  }
+
+  AvxInputContext *avx_ctx = nullptr;
+  ObuDecInputContext *obu_ctx = nullptr;
+#if CONFIG_WEBM_IO
+  WebmInputContext *webm_ctx = nullptr;
+#endif
+  uint8_t *unit_buffer = nullptr;
+  size_t unit_buffer_size = 0;
+};
+
+void PrintUsage() {
+  printf("Libaom OBU dump.\nUsage: dump_obu <input_file>\n");
+}
+
+VideoFileType GetFileType(InputContext *ctx) {
+  if (file_is_ivf(ctx->avx_ctx)) return FILE_TYPE_IVF;
+  if (file_is_obu(ctx->obu_ctx)) return FILE_TYPE_OBU;
+#if CONFIG_WEBM_IO
+  if (file_is_webm(ctx->webm_ctx, ctx->avx_ctx)) return FILE_TYPE_WEBM;
+#endif
+  return FILE_TYPE_RAW;
+}
+
+bool ReadTemporalUnit(InputContext *ctx, size_t *unit_size) {
+  const VideoFileType file_type = ctx->avx_ctx->file_type;
+  switch (file_type) {
+    case FILE_TYPE_IVF: {
+      if (ivf_read_frame(ctx->avx_ctx->file, &ctx->unit_buffer, unit_size,
+                         &ctx->unit_buffer_size, NULL)) {
+        return false;
+      }
+      break;
+    }
+    case FILE_TYPE_OBU: {
+      if (obudec_read_temporal_unit(ctx->obu_ctx, &ctx->unit_buffer, unit_size,
+                                    &ctx->unit_buffer_size)) {
+        return false;
+      }
+      break;
+    }
+#if CONFIG_WEBM_IO
+    case FILE_TYPE_WEBM: {
+      if (webm_read_frame(ctx->webm_ctx, &ctx->unit_buffer, unit_size,
+                          &ctx->unit_buffer_size)) {
+        return false;
+      }
+      break;
+    }
+#endif
+    default:
+      // TODO(tomfinegan): Abuse FILE_TYPE_RAW for AV1/OBU elementary streams?
+      fprintf(stderr, "Error: Unsupported file type.\n");
+      return false;
+  }
+
+  return true;
+}
+
+}  // namespace
+
+int main(int argc, const char *argv[]) {
+  // TODO(tomfinegan): Could do with some params for verbosity.
+  if (argc < 2) {
+    PrintUsage();
+    return EXIT_SUCCESS;
+  }
+
+  const std::string filename = argv[1];
+
+  using FilePtr = std::unique_ptr<FILE, decltype(&fclose)>;
+  FilePtr input_file(fopen(filename.c_str(), "rb"), &fclose);
+  if (input_file.get() == nullptr) {
+    input_file.release();
+    fprintf(stderr, "Error: Cannot open input file.\n");
+    return EXIT_FAILURE;
+  }
+
+  AvxInputContext avx_ctx;
+  InputContext input_ctx;
+  input_ctx.avx_ctx = &avx_ctx;
+  ObuDecInputContext obu_ctx;
+  input_ctx.obu_ctx = &obu_ctx;
+#if CONFIG_WEBM_IO
+  WebmInputContext webm_ctx;
+  input_ctx.webm_ctx = &webm_ctx;
+#endif
+
+  input_ctx.Init();
+  avx_ctx.file = input_file.get();
+  avx_ctx.file_type = GetFileType(&input_ctx);
+
+  // Note: the reader utilities will realloc the buffer using realloc() etc.
+  // Can't have nice things like unique_ptr wrappers with that type of
+  // behavior underneath the function calls.
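For intuition, the FILE_TYPE_IVF branch above walks a stream of length-prefixed frames. A rough Python sketch of the same read loop (layout per the IVF container: a 32-byte file header, then a 12-byte header per frame; function name hypothetical):

import struct

def read_temporal_units(path):
    with open(path, 'rb') as f:
        f.seek(32)  # skip the 32-byte IVF file header
        while True:
            hdr = f.read(12)  # uint32 payload size + uint64 timestamp
            if len(hdr) < 12:
                return
            size, _pts = struct.unpack('<IQ', hdr)
            yield f.read(size)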
+  input_ctx.unit_buffer =
+      reinterpret_cast<uint8_t *>(calloc(kInitialBufferSize, 1));
+  if (!input_ctx.unit_buffer) {
+    fprintf(stderr, "Error: No memory, can't alloc input buffer.\n");
+    return EXIT_FAILURE;
+  }
+  input_ctx.unit_buffer_size = kInitialBufferSize;
+
+  size_t unit_size = 0;
+  int unit_number = 0;
+  int64_t obu_overhead_bytes_total = 0;
+  while (ReadTemporalUnit(&input_ctx, &unit_size)) {
+    printf("Temporal unit %d\n", unit_number);
+
+    int obu_overhead_current_unit = 0;
+    if (!aom_tools::DumpObu(input_ctx.unit_buffer, static_cast<int>(unit_size),
+                            &obu_overhead_current_unit)) {
+      fprintf(stderr, "Error: Temporal Unit parse failed on unit number %d.\n",
+              unit_number);
+      return EXIT_FAILURE;
+    }
+    printf(" OBU overhead: %d\n", obu_overhead_current_unit);
+    ++unit_number;
+    obu_overhead_bytes_total += obu_overhead_current_unit;
+  }
+
+  printf("File total OBU overhead: %" PRId64 "\n", obu_overhead_bytes_total);
+  return EXIT_SUCCESS;
+}
diff --git a/libs/libaom/src/tools/gen_authors.sh b/libs/libaom/src/tools/gen_authors.sh
new file mode 100644
index 000000000..5def8bc89
--- /dev/null
+++ b/libs/libaom/src/tools/gen_authors.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Add organization names manually.
+
+cat <<EOF
+$(git log --pretty=format:"%aN <%aE>" | sort | uniq | grep -v "corp.google\|clang-format")
+EOF
diff --git a/libs/libaom/src/tools/gen_constrained_tokenset.py b/libs/libaom/src/tools/gen_constrained_tokenset.py
new file mode 100644
index 000000000..5d12ee1ef
--- /dev/null
+++ b/libs/libaom/src/tools/gen_constrained_tokenset.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python
+##
+## Copyright (c) 2016, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+"""Generate the probability model for the constrained token set.
+
+Model obtained from a 2-sided zero-centered distribution derived
+from a Pareto distribution. The cdf of the distribution is:
+cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+
+For a given beta and a given probability of the 1-node, the alpha
+is first solved, and then the {alpha, beta} pair is used to generate
+the probabilities for the rest of the nodes.
+""" + +import heapq +import sys +import numpy as np +import scipy.optimize +import scipy.stats + + +def cdf_spareto(x, xm, beta): + p = 1 - (xm / (np.abs(x) + xm))**beta + p = 0.5 + 0.5 * np.sign(x) * p + return p + + +def get_spareto(p, beta): + cdf = cdf_spareto + + def func(x): + return ((cdf(1.5, x, beta) - cdf(0.5, x, beta)) / + (1 - cdf(0.5, x, beta)) - p)**2 + + alpha = scipy.optimize.fminbound(func, 1e-12, 10000, xtol=1e-12) + parray = np.zeros(11) + parray[0] = 2 * (cdf(0.5, alpha, beta) - 0.5) + parray[1] = (2 * (cdf(1.5, alpha, beta) - cdf(0.5, alpha, beta))) + parray[2] = (2 * (cdf(2.5, alpha, beta) - cdf(1.5, alpha, beta))) + parray[3] = (2 * (cdf(3.5, alpha, beta) - cdf(2.5, alpha, beta))) + parray[4] = (2 * (cdf(4.5, alpha, beta) - cdf(3.5, alpha, beta))) + parray[5] = (2 * (cdf(6.5, alpha, beta) - cdf(4.5, alpha, beta))) + parray[6] = (2 * (cdf(10.5, alpha, beta) - cdf(6.5, alpha, beta))) + parray[7] = (2 * (cdf(18.5, alpha, beta) - cdf(10.5, alpha, beta))) + parray[8] = (2 * (cdf(34.5, alpha, beta) - cdf(18.5, alpha, beta))) + parray[9] = (2 * (cdf(66.5, alpha, beta) - cdf(34.5, alpha, beta))) + parray[10] = 2 * (1. - cdf(66.5, alpha, beta)) + return parray + + +def quantize_probs(p, save_first_bin, bits): + """Quantize probability precisely. + + Quantize probabilities minimizing dH (Kullback-Leibler divergence) + approximated by: sum (p_i-q_i)^2/p_i. + References: + https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence + https://github.com/JarekDuda/AsymmetricNumeralSystemsToolkit + """ + num_sym = p.size + p = np.clip(p, 1e-16, 1) + L = 2**bits + pL = p * L + ip = 1. / p # inverse probability + q = np.clip(np.round(pL), 1, L + 1 - num_sym) + quant_err = (pL - q)**2 * ip + sgn = np.sign(L - q.sum()) # direction of correction + if sgn != 0: # correction is needed + v = [] # heap of adjustment results (adjustment err, index) of each symbol + for i in range(1 if save_first_bin else 0, num_sym): + q_adj = q[i] + sgn + if q_adj > 0 and q_adj < L: + adj_err = (pL[i] - q_adj)**2 * ip[i] - quant_err[i] + heapq.heappush(v, (adj_err, i)) + while q.sum() != L: + # apply lowest error adjustment + (adj_err, i) = heapq.heappop(v) + quant_err[i] += adj_err + q[i] += sgn + # calculate the cost of adjusting this symbol again + q_adj = q[i] + sgn + if q_adj > 0 and q_adj < L: + adj_err = (pL[i] - q_adj)**2 * ip[i] - quant_err[i] + heapq.heappush(v, (adj_err, i)) + return q + + +def get_quantized_spareto(p, beta, bits, first_token): + parray = get_spareto(p, beta) + parray = parray[1:] / (1 - parray[0]) + # CONFIG_NEW_TOKENSET + if first_token > 1: + parray = parray[1:] / (1 - parray[0]) + qarray = quantize_probs(parray, first_token == 1, bits) + return qarray.astype(np.int) + + +def main(bits=15, first_token=1): + beta = 8 + for q in range(1, 256): + parray = get_quantized_spareto(q / 256., beta, bits, first_token) + assert parray.sum() == 2**bits + print '{', ', '.join('%d' % i for i in parray), '},' + + +if __name__ == '__main__': + if len(sys.argv) > 2: + main(int(sys.argv[1]), int(sys.argv[2])) + elif len(sys.argv) > 1: + main(int(sys.argv[1])) + else: + main() diff --git a/libs/libaom/src/tools/inspect-cli.js b/libs/libaom/src/tools/inspect-cli.js new file mode 100644 index 000000000..a14c08111 --- /dev/null +++ b/libs/libaom/src/tools/inspect-cli.js @@ -0,0 +1,39 @@ +/** + * This tool lets you test if the compiled Javascript decoder is functioning properly. You'll + * need to download a SpiderMonkey js-shell to run this script. 
+ * https://archive.mozilla.org/pub/firefox/nightly/latest-mozilla-central/ + * + * Example: + * js-shell inspect-cli.js video.ivf + */ +load("inspect.js"); +var buffer = read(scriptArgs[0], "binary"); +var Module = { + noExitRuntime: true, + noInitialRun: true, + preInit: [], + preRun: [], + postRun: [function () { + printErr(`Loaded Javascript Decoder OK`); + }], + memoryInitializerPrefixURL: "bin/", + arguments: ['input.ivf', 'output.raw'], + on_frame_decoded_json: function (jsonString) { + let json = JSON.parse("[" + Module.UTF8ToString(jsonString) + "null]"); + json.forEach(frame => { + if (frame) { + print(frame.frame); + } + }); + } +}; +DecoderModule(Module); +Module.FS.writeFile("/tmp/input.ivf", buffer, { encoding: "binary" }); +Module._open_file(); +Module._set_layers(0xFFFFFFFF); // Set this to zero if you want to benchmark decoding. +while(true) { + printErr("Decoding Frame ..."); + if (Module._read_frame()) { + break; + } +} diff --git a/libs/libaom/src/tools/inspect-post.js b/libs/libaom/src/tools/inspect-post.js new file mode 100644 index 000000000..31c40bb82 --- /dev/null +++ b/libs/libaom/src/tools/inspect-post.js @@ -0,0 +1 @@ +Module["FS"] = FS; diff --git a/libs/libaom/src/tools/intersect-diffs.py b/libs/libaom/src/tools/intersect-diffs.py new file mode 100644 index 000000000..df13c4ef7 --- /dev/null +++ b/libs/libaom/src/tools/intersect-diffs.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +## +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +"""Calculates the "intersection" of two unified diffs. + +Given two diffs, A and B, it finds all hunks in B that had non-context lines +in A and prints them to stdout. This is useful to determine the hunks in B that +are relevant to A. The resulting file can be applied with patch(1) on top of A. +""" + +__author__ = "jkoleszar@google.com" + +import sys + +import diff + + +def FormatDiffHunks(hunks): + """Re-serialize a list of DiffHunks.""" + r = [] + last_header = None + for hunk in hunks: + this_header = hunk.header[0:2] + if last_header != this_header: + r.extend(hunk.header) + last_header = this_header + else: + r.extend(hunk.header[2]) + r.extend(hunk.lines) + r.append("\n") + return "".join(r) + + +def ZipHunks(rhs_hunks, lhs_hunks): + """Join two hunk lists on filename.""" + for rhs_hunk in rhs_hunks: + rhs_file = rhs_hunk.right.filename.split("/")[1:] + + for lhs_hunk in lhs_hunks: + lhs_file = lhs_hunk.left.filename.split("/")[1:] + if lhs_file != rhs_file: + continue + yield (rhs_hunk, lhs_hunk) + + +def main(): + old_hunks = [x for x in diff.ParseDiffHunks(open(sys.argv[1], "r"))] + new_hunks = [x for x in diff.ParseDiffHunks(open(sys.argv[2], "r"))] + out_hunks = [] + + # Join the right hand side of the older diff with the left hand side of the + # newer diff. 
+ for old_hunk, new_hunk in ZipHunks(old_hunks, new_hunks): + if new_hunk in out_hunks: + continue + old_lines = old_hunk.right + new_lines = new_hunk.left + + # Determine if this hunk overlaps any non-context line from the other + for i in old_lines.delta_line_nums: + if i in new_lines: + out_hunks.append(new_hunk) + break + + if out_hunks: + print FormatDiffHunks(out_hunks) + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/libs/libaom/src/tools/lint-hunks.py b/libs/libaom/src/tools/lint-hunks.py new file mode 100644 index 000000000..d02bee16c --- /dev/null +++ b/libs/libaom/src/tools/lint-hunks.py @@ -0,0 +1,146 @@ +#!/usr/bin/python +## +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +"""Performs style checking on each diff hunk.""" +import getopt +import os +import StringIO +import subprocess +import sys + +import diff + + +SHORT_OPTIONS = "h" +LONG_OPTIONS = ["help"] + +TOPLEVEL_CMD = ["git", "rev-parse", "--show-toplevel"] +DIFF_CMD = ["git", "diff"] +DIFF_INDEX_CMD = ["git", "diff-index", "-u", "HEAD", "--"] +SHOW_CMD = ["git", "show"] +CPPLINT_FILTERS = ["-readability/casting"] + + +class Usage(Exception): + pass + + +class SubprocessException(Exception): + def __init__(self, args): + msg = "Failed to execute '%s'"%(" ".join(args)) + super(SubprocessException, self).__init__(msg) + + +class Subprocess(subprocess.Popen): + """Adds the notion of an expected returncode to Popen.""" + + def __init__(self, args, expected_returncode=0, **kwargs): + self._args = args + self._expected_returncode = expected_returncode + super(Subprocess, self).__init__(args, **kwargs) + + def communicate(self, *args, **kwargs): + result = super(Subprocess, self).communicate(*args, **kwargs) + if self._expected_returncode is not None: + try: + ok = self.returncode in self._expected_returncode + except TypeError: + ok = self.returncode == self._expected_returncode + if not ok: + raise SubprocessException(self._args) + return result + + +def main(argv=None): + if argv is None: + argv = sys.argv + try: + try: + opts, args = getopt.getopt(argv[1:], SHORT_OPTIONS, LONG_OPTIONS) + except getopt.error, msg: + raise Usage(msg) + + # process options + for o, _ in opts: + if o in ("-h", "--help"): + print __doc__ + sys.exit(0) + + if args and len(args) > 1: + print __doc__ + sys.exit(0) + + # Find the fully qualified path to the root of the tree + tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE) + tl = tl.communicate()[0].strip() + + # See if we're working on the index or not. 
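Further down, cpplint's stderr is matched back to the diff: a warning is printed only when its line number was actually touched by a hunk. A condensed sketch of that filter (function name hypothetical):

def relevant_warnings(lint_stderr, affected_lines):
    # cpplint was fed the file on stdin, so warnings start with "-:line:".
    for line in lint_stderr.split('\n'):
        fields = line.split(':')
        if len(fields) >= 3 and fields[0] == '-' and fields[1].isdigit():
            if int(fields[1]) in affected_lines:
                yield '%s:%s' % (fields[1], ':'.join(fields[2:]))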
+ if args: + diff_cmd = DIFF_CMD + [args[0] + "^!"] + else: + diff_cmd = DIFF_INDEX_CMD + + # Build the command line to execute cpplint + cpplint_cmd = [os.path.join(tl, "tools", "cpplint.py"), + "--filter=" + ",".join(CPPLINT_FILTERS), + "-"] + + # Get a list of all affected lines + file_affected_line_map = {} + p = Subprocess(diff_cmd, stdout=subprocess.PIPE) + stdout = p.communicate()[0] + for hunk in diff.ParseDiffHunks(StringIO.StringIO(stdout)): + filename = hunk.right.filename[2:] + if filename not in file_affected_line_map: + file_affected_line_map[filename] = set() + file_affected_line_map[filename].update(hunk.right.delta_line_nums) + + # Run each affected file through cpplint + lint_failed = False + for filename, affected_lines in file_affected_line_map.iteritems(): + if filename.split(".")[-1] not in ("c", "h", "cc"): + continue + + if args: + # File contents come from git + show_cmd = SHOW_CMD + [args[0] + ":" + filename] + show = Subprocess(show_cmd, stdout=subprocess.PIPE) + lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1), + stdin=show.stdout, stderr=subprocess.PIPE) + lint_out = lint.communicate()[1] + else: + # File contents come from the working tree + lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1), + stdin=subprocess.PIPE, stderr=subprocess.PIPE) + stdin = open(os.path.join(tl, filename)).read() + lint_out = lint.communicate(stdin)[1] + + for line in lint_out.split("\n"): + fields = line.split(":") + if fields[0] != "-": + continue + warning_line_num = int(fields[1]) + if warning_line_num in affected_lines: + print "%s:%d:%s"%(filename, warning_line_num, + ":".join(fields[2:])) + lint_failed = True + + # Set exit code if any relevant lint errors seen + if lint_failed: + return 1 + + except Usage, err: + print >>sys.stderr, err + print >>sys.stderr, "for help use --help" + return 2 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/libs/libaom/src/tools/obu_parser.cc b/libs/libaom/src/tools/obu_parser.cc new file mode 100644 index 000000000..7d71386ce --- /dev/null +++ b/libs/libaom/src/tools/obu_parser.cc @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */
+#include <stdio.h>
+
+#include <cstdint>
+#include <string>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem_ops.h"
+#include "av1/common/obu_util.h"
+#include "tools/obu_parser.h"
+
+namespace aom_tools {
+
+// Basic OBU syntax
+// 8 bits: Header
+//   7
+//     forbidden bit
+//   6,5,4,3
+//     type bits
+//   2
+//     extension flag bit
+//   1
+//     has size field bit
+//   0
+//     reserved bit
+const uint32_t kObuForbiddenBitMask = 0x1;
+const uint32_t kObuForbiddenBitShift = 7;
+const uint32_t kObuTypeBitsMask = 0xF;
+const uint32_t kObuTypeBitsShift = 3;
+const uint32_t kObuExtensionFlagBitMask = 0x1;
+const uint32_t kObuExtensionFlagBitShift = 2;
+const uint32_t kObuHasSizeFieldBitMask = 0x1;
+const uint32_t kObuHasSizeFieldBitShift = 1;
+
+// When extension flag bit is set:
+// 8 bits: extension header
+//   7,6,5
+//     temporal ID
+//   4,3
+//     spatial ID
+//   2,1,0
+//     reserved bits
+const uint32_t kObuExtTemporalIdBitsMask = 0x7;
+const uint32_t kObuExtTemporalIdBitsShift = 5;
+const uint32_t kObuExtSpatialIdBitsMask = 0x3;
+const uint32_t kObuExtSpatialIdBitsShift = 3;
+
+bool ValidObuType(int obu_type) {
+  switch (obu_type) {
+    case OBU_SEQUENCE_HEADER:
+    case OBU_TEMPORAL_DELIMITER:
+    case OBU_FRAME_HEADER:
+    case OBU_TILE_GROUP:
+    case OBU_METADATA:
+    case OBU_FRAME:
+    case OBU_REDUNDANT_FRAME_HEADER:
+    case OBU_TILE_LIST:
+    case OBU_PADDING: return true;
+  }
+  return false;
+}
+
+bool ParseObuHeader(uint8_t obu_header_byte, ObuHeader *obu_header) {
+  const int forbidden_bit =
+      (obu_header_byte >> kObuForbiddenBitShift) & kObuForbiddenBitMask;
+  if (forbidden_bit) {
+    fprintf(stderr, "Invalid OBU, forbidden bit set.\n");
+    return false;
+  }
+
+  obu_header->type = static_cast<OBU_TYPE>(
+      (obu_header_byte >> kObuTypeBitsShift) & kObuTypeBitsMask);
+  if (!ValidObuType(obu_header->type)) {
+    fprintf(stderr, "Invalid OBU type: %d.\n", obu_header->type);
+    return false;
+  }
+
+  obu_header->has_extension =
+      (obu_header_byte >> kObuExtensionFlagBitShift) & kObuExtensionFlagBitMask;
+  obu_header->has_size_field =
+      (obu_header_byte >> kObuHasSizeFieldBitShift) & kObuHasSizeFieldBitMask;
+  return true;
+}
+
+bool ParseObuExtensionHeader(uint8_t ext_header_byte, ObuHeader *obu_header) {
+  obu_header->temporal_layer_id =
+      (ext_header_byte >> kObuExtTemporalIdBitsShift) &
+      kObuExtTemporalIdBitsMask;
+  obu_header->spatial_layer_id =
+      (ext_header_byte >> kObuExtSpatialIdBitsShift) & kObuExtSpatialIdBitsMask;
+
+  return true;
+}
+
+void PrintObuHeader(const ObuHeader *header) {
+  printf(
+      " OBU type: %s\n"
+      " extension: %s\n",
+      aom_obu_type_to_string(static_cast<OBU_TYPE>(header->type)),
+      header->has_extension ? "yes" : "no");
+  if (header->has_extension) {
+    printf(
+        " temporal_id: %d\n"
+        " spatial_id: %d\n",
+        header->temporal_layer_id, header->spatial_layer_id);
+  }
+}
+
+bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes) {
+  const int kObuHeaderSizeBytes = 1;
+  const int kMinimumBytesRequired = 1 + kObuHeaderSizeBytes;
+  int consumed = 0;
+  int obu_overhead = 0;
+  ObuHeader obu_header;
+  while (consumed < length) {
+    const int remaining = length - consumed;
+    if (remaining < kMinimumBytesRequired) {
+      fprintf(stderr,
+              "OBU parse error. Did not consume all data, %d bytes remain.\n",
+              remaining);
+      return false;
+    }
+
+    int obu_header_size = 0;
+
+    memset(&obu_header, 0, sizeof(obu_header));
+    const uint8_t obu_header_byte = *(data + consumed);
+    if (!ParseObuHeader(obu_header_byte, &obu_header)) {
+      fprintf(stderr, "OBU parsing failed at offset %d.\n", consumed);
+      return false;
+    }
+
+    ++obu_overhead;
+    ++obu_header_size;
+
+    if (obu_header.has_extension) {
+      const uint8_t obu_ext_header_byte =
+          *(data + consumed + kObuHeaderSizeBytes);
+      if (!ParseObuExtensionHeader(obu_ext_header_byte, &obu_header)) {
+        fprintf(stderr, "OBU extension parsing failed at offset %d.\n",
+                consumed + kObuHeaderSizeBytes);
+        return false;
+      }
+
+      ++obu_overhead;
+      ++obu_header_size;
+    }
+
+    PrintObuHeader(&obu_header);
+
+    uint64_t obu_size = 0;
+    size_t length_field_size = 0;
+    if (aom_uleb_decode(data + consumed + obu_header_size,
+                        remaining - obu_header_size, &obu_size,
+                        &length_field_size) != 0) {
+      fprintf(stderr, "OBU size parsing failed at offset %d.\n",
+              consumed + obu_header_size);
+      return false;
+    }
+    int current_obu_length = static_cast<int>(obu_size);
+    if (obu_header_size + static_cast<int>(length_field_size) +
+            current_obu_length >
+        remaining) {
+      fprintf(stderr, "OBU parsing failed: not enough OBU data.\n");
+      return false;
+    }
+    consumed += obu_header_size + static_cast<int>(length_field_size) +
+                current_obu_length;
+    printf(" length: %d\n",
+           static_cast<int>(obu_header_size + length_field_size +
+                            current_obu_length));
+  }
+
+  if (obu_overhead_bytes != nullptr) *obu_overhead_bytes = obu_overhead;
+  printf(" TU size: %d\n", consumed);
+
+  return true;
+}
+
+}  // namespace aom_tools
diff --git a/libs/libaom/src/tools/obu_parser.h b/libs/libaom/src/tools/obu_parser.h
new file mode 100644
index 000000000..1d7d2d794
--- /dev/null
+++ b/libs/libaom/src/tools/obu_parser.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_TOOLS_OBU_PARSER_H_
+#define AOM_TOOLS_OBU_PARSER_H_
+
+#include <cstdint>
+
+namespace aom_tools {
+
+// Print information obtained from OBU(s) in data until data is exhausted or an
+// error occurs. Returns true when all data is consumed successfully, and
+// optionally reports OBU storage overhead via obu_overhead_bytes when the
+// pointer is non-null.
+bool DumpObu(const uint8_t *data, int length, int *obu_overhead_bytes);
+
+}  // namespace aom_tools
+
+#endif  // AOM_TOOLS_OBU_PARSER_H_
diff --git a/libs/libaom/src/tools/txfm_analyzer/txfm_gen_code.cc b/libs/libaom/src/tools/txfm_analyzer/txfm_gen_code.cc
new file mode 100644
index 000000000..7c5400b91
--- /dev/null
+++ b/libs/libaom/src/tools/txfm_analyzer/txfm_gen_code.cc
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software.
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include +#include + +#include "tools/txfm_analyzer/txfm_graph.h" + +typedef enum CODE_TYPE { + CODE_TYPE_C, + CODE_TYPE_SSE2, + CODE_TYPE_SSE4_1 +} CODE_TYPE; + +int get_cos_idx(double value, int mod) { + return round(acos(fabs(value)) / PI * mod); +} + +char *cos_text_arr(double value, int mod, char *text, int size) { + int num = get_cos_idx(value, mod); + if (value < 0) { + snprintf(text, size, "-cospi[%2d]", num); + } else { + snprintf(text, size, " cospi[%2d]", num); + } + + if (num == 0) + printf("v: %f -> %d/%d v==-1 is %d\n", value, num, mod, value == -1); + + return text; +} + +char *cos_text_sse2(double w0, double w1, int mod, char *text, int size) { + int idx0 = get_cos_idx(w0, mod); + int idx1 = get_cos_idx(w1, mod); + char p[] = "p"; + char n[] = "m"; + char *sgn0 = w0 < 0 ? n : p; + char *sgn1 = w1 < 0 ? n : p; + snprintf(text, size, "cospi_%s%02d_%s%02d", sgn0, idx0, sgn1, idx1); + return text; +} + +char *cos_text_sse4_1(double w, int mod, char *text, int size) { + int idx = get_cos_idx(w, mod); + char p[] = "p"; + char n[] = "m"; + char *sgn = w < 0 ? n : p; + snprintf(text, size, "cospi_%s%02d", sgn, idx); + return text; +} + +void node_to_code_c(Node *node, const char *buf0, const char *buf1) { + int cnt = 0; + for (int i = 0; i < 2; i++) { + if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++; + } + if (cnt == 2) { + int cnt2 = 0; + printf(" %s[%d] =", buf1, node->nodeIdx); + for (int i = 0; i < 2; i++) { + if (fabs(node->inWeight[i]) == 1) { + cnt2++; + } + } + if (cnt2 == 2) { + printf(" apply_value("); + } + int cnt1 = 0; + for (int i = 0; i < 2; i++) { + if (node->inWeight[i] == 1) { + if (cnt1 > 0) + printf(" + %s[%d]", buf0, node->inNodeIdx[i]); + else + printf(" %s[%d]", buf0, node->inNodeIdx[i]); + cnt1++; + } else if (node->inWeight[i] == -1) { + if (cnt1 > 0) + printf(" - %s[%d]", buf0, node->inNodeIdx[i]); + else + printf("-%s[%d]", buf0, node->inNodeIdx[i]); + cnt1++; + } + } + if (cnt2 == 2) { + printf(", stage_range[stage])"); + } + printf(";\n"); + } else { + char w0[100]; + char w1[100]; + printf( + " %s[%d] = half_btf(%s, %s[%d], %s, %s[%d], " + "cos_bit);\n", + buf1, node->nodeIdx, cos_text_arr(node->inWeight[0], COS_MOD, w0, 100), + buf0, node->inNodeIdx[0], + cos_text_arr(node->inWeight[1], COS_MOD, w1, 100), buf0, + node->inNodeIdx[1]); + } +} + +void gen_code_c(Node *node, int stage_num, int node_num, TYPE_TXFM type) { + char *fun_name = new char[100]; + get_fun_name(fun_name, 100, type, node_num); + + printf("\n"); + printf( + "void av1_%s(const int32_t *input, int32_t *output, int8_t cos_bit, " + "const int8_t* stage_range) " + "{\n", + fun_name); + printf(" assert(output != input);\n"); + printf(" const int32_t size = %d;\n", node_num); + printf(" const int32_t *cospi = cospi_arr(cos_bit);\n"); + printf("\n"); + + printf(" int32_t stage = 0;\n"); + printf(" int32_t *bf0, *bf1;\n"); + printf(" int32_t step[%d];\n", node_num); + + const char *buf0 = "bf0"; + const char *buf1 = "bf1"; + const char *input = "input"; + + int si = 0; + printf("\n"); + printf(" // stage %d;\n", si); + printf(" apply_range(stage, input, %s, size, stage_range[stage]);\n", input); + + si = 1; + printf("\n"); + printf(" // stage %d;\n", si); + printf(" stage++;\n"); + if (si % 2 == (stage_num - 1) % 2) { + printf(" %s = output;\n", buf1); + } 
else { + printf(" %s = step;\n", buf1); + } + + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + node_to_code_c(node + idx, input, buf1); + } + + printf(" range_check_buf(stage, input, bf1, size, stage_range[stage]);\n"); + + for (int si = 2; si < stage_num; si++) { + printf("\n"); + printf(" // stage %d\n", si); + printf(" stage++;\n"); + if (si % 2 == (stage_num - 1) % 2) { + printf(" %s = step;\n", buf0); + printf(" %s = output;\n", buf1); + } else { + printf(" %s = output;\n", buf0); + printf(" %s = step;\n", buf1); + } + + // computation code + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + node_to_code_c(node + idx, buf0, buf1); + } + + if (si != stage_num - 1) { + printf( + " range_check_buf(stage, input, bf1, size, stage_range[stage]);\n"); + } + } + printf(" apply_range(stage, input, output, size, stage_range[stage]);\n"); + printf("}\n"); +} + +void single_node_to_code_sse2(Node *node, const char *buf0, const char *buf1) { + printf(" %s[%2d] =", buf1, node->nodeIdx); + if (node->inWeight[0] == 1 && node->inWeight[1] == 1) { + printf(" _mm_adds_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0, + node->inNodeIdx[1]); + } else if (node->inWeight[0] == 1 && node->inWeight[1] == -1) { + printf(" _mm_subs_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0, + node->inNodeIdx[1]); + } else if (node->inWeight[0] == -1 && node->inWeight[1] == 1) { + printf(" _mm_subs_epi16(%s[%d], %s[%d])", buf0, node->inNodeIdx[1], buf0, + node->inNodeIdx[0]); + } else if (node->inWeight[0] == 1 && node->inWeight[1] == 0) { + printf(" %s[%d]", buf0, node->inNodeIdx[0]); + } else if (node->inWeight[0] == 0 && node->inWeight[1] == 1) { + printf(" %s[%d]", buf0, node->inNodeIdx[1]); + } else if (node->inWeight[0] == -1 && node->inWeight[1] == 0) { + printf(" _mm_subs_epi16(__zero, %s[%d])", buf0, node->inNodeIdx[0]); + } else if (node->inWeight[0] == 0 && node->inWeight[1] == -1) { + printf(" _mm_subs_epi16(__zero, %s[%d])", buf0, node->inNodeIdx[1]); + } + printf(";\n"); +} + +void pair_node_to_code_sse2(Node *node, Node *partnerNode, const char *buf0, + const char *buf1) { + char temp0[100]; + char temp1[100]; + // btf_16_sse2_type0(w0, w1, in0, in1, out0, out1) + if (node->inNodeIdx[0] != partnerNode->inNodeIdx[0]) + printf(" btf_16_sse2(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d]);\n", + cos_text_sse2(node->inWeight[0], node->inWeight[1], COS_MOD, temp0, + 100), + cos_text_sse2(partnerNode->inWeight[1], partnerNode->inWeight[0], + COS_MOD, temp1, 100), + buf0, node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1, + node->nodeIdx, buf1, partnerNode->nodeIdx); + else + printf(" btf_16_sse2(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d]);\n", + cos_text_sse2(node->inWeight[0], node->inWeight[1], COS_MOD, temp0, + 100), + cos_text_sse2(partnerNode->inWeight[0], partnerNode->inWeight[1], + COS_MOD, temp1, 100), + buf0, node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1, + node->nodeIdx, buf1, partnerNode->nodeIdx); +} + +Node *get_partner_node(Node *node) { + int diff = node->inNode[1]->nodeIdx - node->nodeIdx; + return node + diff; +} + +void node_to_code_sse2(Node *node, const char *buf0, const char *buf1) { + int cnt = 0; + int cnt1 = 0; + if (node->visited == 0) { + node->visited = 1; + for (int i = 0; i < 2; i++) { + if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++; + if (fabs(node->inWeight[i]) == 1) cnt1++; + } + if (cnt == 2) { + if (cnt1 == 2) { + // has a partner + Node *partnerNode = get_partner_node(node); + 
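/* Both weights are +/-1 here, so no multiply is needed: this node and its partner (the other output fed by the same two inputs) form an add/sub butterfly, and the partner is marked visited so the outer scan does not emit it a second time. */ +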
partnerNode->visited = 1; + single_node_to_code_sse2(node, buf0, buf1); + single_node_to_code_sse2(partnerNode, buf0, buf1); + } else { + single_node_to_code_sse2(node, buf0, buf1); + } + } else { + Node *partnerNode = get_partner_node(node); + partnerNode->visited = 1; + pair_node_to_code_sse2(node, partnerNode, buf0, buf1); + } + } +} + +void gen_cospi_list_sse2(Node *node, int stage_num, int node_num) { + int visited[65][65][2][2]; + memset(visited, 0, sizeof(visited)); + char text[100]; + char text1[100]; + char text2[100]; + int size = 100; + printf("\n"); + for (int si = 1; si < stage_num; si++) { + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + int cnt = 0; + Node *node0 = node + idx; + if (node0->visited == 0) { + node0->visited = 1; + for (int i = 0; i < 2; i++) { + if (fabs(node0->inWeight[i]) == 1 || fabs(node0->inWeight[i]) == 0) + cnt++; + } + if (cnt != 2) { + { + double w0 = node0->inWeight[0]; + double w1 = node0->inWeight[1]; + int idx0 = get_cos_idx(w0, COS_MOD); + int idx1 = get_cos_idx(w1, COS_MOD); + int sgn0 = w0 < 0 ? 1 : 0; + int sgn1 = w1 < 0 ? 1 : 0; + + if (!visited[idx0][idx1][sgn0][sgn1]) { + visited[idx0][idx1][sgn0][sgn1] = 1; + printf(" __m128i %s = pair_set_epi16(%s, %s);\n", + cos_text_sse2(w0, w1, COS_MOD, text, size), + cos_text_arr(w0, COS_MOD, text1, size), + cos_text_arr(w1, COS_MOD, text2, size)); + } + } + Node *node1 = get_partner_node(node0); + node1->visited = 1; + if (node1->inNode[0]->nodeIdx != node0->inNode[0]->nodeIdx) { + double w0 = node1->inWeight[0]; + double w1 = node1->inWeight[1]; + int idx0 = get_cos_idx(w0, COS_MOD); + int idx1 = get_cos_idx(w1, COS_MOD); + int sgn0 = w0 < 0 ? 1 : 0; + int sgn1 = w1 < 0 ? 1 : 0; + + if (!visited[idx1][idx0][sgn1][sgn0]) { + visited[idx1][idx0][sgn1][sgn0] = 1; + printf(" __m128i %s = pair_set_epi16(%s, %s);\n", + cos_text_sse2(w1, w0, COS_MOD, text, size), + cos_text_arr(w1, COS_MOD, text1, size), + cos_text_arr(w0, COS_MOD, text2, size)); + } + } else { + double w0 = node1->inWeight[0]; + double w1 = node1->inWeight[1]; + int idx0 = get_cos_idx(w0, COS_MOD); + int idx1 = get_cos_idx(w1, COS_MOD); + int sgn0 = w0 < 0 ? 1 : 0; + int sgn1 = w1 < 0 ? 
1 : 0; + + if (!visited[idx0][idx1][sgn0][sgn1]) { + visited[idx0][idx1][sgn0][sgn1] = 1; + printf(" __m128i %s = pair_set_epi16(%s, %s);\n", + cos_text_sse2(w0, w1, COS_MOD, text, size), + cos_text_arr(w0, COS_MOD, text1, size), + cos_text_arr(w1, COS_MOD, text2, size)); + } + } + } + } + } + } +} + +void gen_code_sse2(Node *node, int stage_num, int node_num, TYPE_TXFM type) { + char *fun_name = new char[100]; + get_fun_name(fun_name, 100, type, node_num); + + printf("\n"); + printf( + "void %s_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) " + "{\n", + fun_name); + + printf(" const int32_t* cospi = cospi_arr(cos_bit);\n"); + printf(" const __m128i __zero = _mm_setzero_si128();\n"); + printf(" const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));\n"); + + graph_reset_visited(node, stage_num, node_num); + gen_cospi_list_sse2(node, stage_num, node_num); + graph_reset_visited(node, stage_num, node_num); + for (int si = 1; si < stage_num; si++) { + char in[100]; + char out[100]; + printf("\n"); + printf(" // stage %d\n", si); + if (si == 1) + snprintf(in, 100, "%s", "input"); + else + snprintf(in, 100, "x%d", si - 1); + if (si == stage_num - 1) { + snprintf(out, 100, "%s", "output"); + } else { + snprintf(out, 100, "x%d", si); + printf(" __m128i %s[%d];\n", out, node_num); + } + // computation code + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + node_to_code_sse2(node + idx, in, out); + } + } + + printf("}\n"); +} +void gen_cospi_list_sse4_1(Node *node, int stage_num, int node_num) { + int visited[65][2]; + memset(visited, 0, sizeof(visited)); + char text[100]; + char text1[100]; + int size = 100; + printf("\n"); + for (int si = 1; si < stage_num; si++) { + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + Node *node0 = node + idx; + if (node0->visited == 0) { + int cnt = 0; + node0->visited = 1; + for (int i = 0; i < 2; i++) { + if (fabs(node0->inWeight[i]) == 1 || fabs(node0->inWeight[i]) == 0) + cnt++; + } + if (cnt != 2) { + for (int i = 0; i < 2; i++) { + if (fabs(node0->inWeight[i]) != 1 && + fabs(node0->inWeight[i]) != 0) { + double w = node0->inWeight[i]; + int idx = get_cos_idx(w, COS_MOD); + int sgn = w < 0 ? 
1 : 0; + + if (!visited[idx][sgn]) { + visited[idx][sgn] = 1; + printf(" __m128i %s = _mm_set1_epi32(%s);\n", + cos_text_sse4_1(w, COS_MOD, text, size), + cos_text_arr(w, COS_MOD, text1, size)); + } + } + } + Node *node1 = get_partner_node(node0); + node1->visited = 1; + } + } + } + } +} + +void single_node_to_code_sse4_1(Node *node, const char *buf0, + const char *buf1) { + printf(" %s[%2d] =", buf1, node->nodeIdx); + if (node->inWeight[0] == 1 && node->inWeight[1] == 1) { + printf(" _mm_add_epi32(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0, + node->inNodeIdx[1]); + } else if (node->inWeight[0] == 1 && node->inWeight[1] == -1) { + printf(" _mm_sub_epi32(%s[%d], %s[%d])", buf0, node->inNodeIdx[0], buf0, + node->inNodeIdx[1]); + } else if (node->inWeight[0] == -1 && node->inWeight[1] == 1) { + printf(" _mm_sub_epi32(%s[%d], %s[%d])", buf0, node->inNodeIdx[1], buf0, + node->inNodeIdx[0]); + } else if (node->inWeight[0] == 1 && node->inWeight[1] == 0) { + printf(" %s[%d]", buf0, node->inNodeIdx[0]); + } else if (node->inWeight[0] == 0 && node->inWeight[1] == 1) { + printf(" %s[%d]", buf0, node->inNodeIdx[1]); + } else if (node->inWeight[0] == -1 && node->inWeight[1] == 0) { + printf(" _mm_sub_epi32(__zero, %s[%d])", buf0, node->inNodeIdx[0]); + } else if (node->inWeight[0] == 0 && node->inWeight[1] == -1) { + printf(" _mm_sub_epi32(__zero, %s[%d])", buf0, node->inNodeIdx[1]); + } + printf(";\n"); +} + +void pair_node_to_code_sse4_1(Node *node, Node *partnerNode, const char *buf0, + const char *buf1) { + char temp0[100]; + char temp1[100]; + if (node->inWeight[0] * partnerNode->inWeight[0] < 0) { + /* type0 + * cos sin + * sin -cos + */ + // btf_32_sse2_type0(w0, w1, in0, in1, out0, out1) + // out0 = w0*in0 + w1*in1 + // out1 = -w0*in1 + w1*in0 + printf( + " btf_32_type0_sse4_1_new(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d], " + "__rounding, cos_bit);\n", + cos_text_sse4_1(node->inWeight[0], COS_MOD, temp0, 100), + cos_text_sse4_1(node->inWeight[1], COS_MOD, temp1, 100), buf0, + node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1, node->nodeIdx, buf1, + partnerNode->nodeIdx); + } else { + /* type1 + * cos sin + * -sin cos + */ + // btf_32_sse2_type1(w0, w1, in0, in1, out0, out1) + // out0 = w0*in0 + w1*in1 + // out1 = w0*in1 - w1*in0 + printf( + " btf_32_type1_sse4_1_new(%s, %s, %s[%d], %s[%d], %s[%d], %s[%d], " + "__rounding, cos_bit);\n", + cos_text_sse4_1(node->inWeight[0], COS_MOD, temp0, 100), + cos_text_sse4_1(node->inWeight[1], COS_MOD, temp1, 100), buf0, + node->inNodeIdx[0], buf0, node->inNodeIdx[1], buf1, node->nodeIdx, buf1, + partnerNode->nodeIdx); + } +} + +void node_to_code_sse4_1(Node *node, const char *buf0, const char *buf1) { + int cnt = 0; + int cnt1 = 0; + if (node->visited == 0) { + node->visited = 1; + for (int i = 0; i < 2; i++) { + if (fabs(node->inWeight[i]) == 1 || fabs(node->inWeight[i]) == 0) cnt++; + if (fabs(node->inWeight[i]) == 1) cnt1++; + } + if (cnt == 2) { + if (cnt1 == 2) { + // has a partner + Node *partnerNode = get_partner_node(node); + partnerNode->visited = 1; + single_node_to_code_sse4_1(node, buf0, buf1); + single_node_to_code_sse4_1(partnerNode, buf0, buf1); + } else { + single_node_to_code_sse2(node, buf0, buf1); + } + } else { + Node *partnerNode = get_partner_node(node); + partnerNode->visited = 1; + pair_node_to_code_sse4_1(node, partnerNode, buf0, buf1); + } + } +} + +void gen_code_sse4_1(Node *node, int stage_num, int node_num, TYPE_TXFM type) { + char *fun_name = new char[100]; + get_fun_name(fun_name, 100, type, node_num); + + printf("\n"); 
+ printf( + "void %s_sse4_1(const __m128i *input, __m128i *output, int8_t cos_bit) " + "{\n", + fun_name); + + printf(" const int32_t* cospi = cospi_arr(cos_bit);\n"); + printf(" const __m128i __zero = _mm_setzero_si128();\n"); + printf(" const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));\n"); + + graph_reset_visited(node, stage_num, node_num); + gen_cospi_list_sse4_1(node, stage_num, node_num); + graph_reset_visited(node, stage_num, node_num); + for (int si = 1; si < stage_num; si++) { + char in[100]; + char out[100]; + printf("\n"); + printf(" // stage %d\n", si); + if (si == 1) + snprintf(in, 100, "%s", "input"); + else + snprintf(in, 100, "x%d", si - 1); + if (si == stage_num - 1) { + snprintf(out, 100, "%s", "output"); + } else { + snprintf(out, 100, "x%d", si); + printf(" __m128i %s[%d];\n", out, node_num); + } + // computation code + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + node_to_code_sse4_1(node + idx, in, out); + } + } + + printf("}\n"); +} + +void gen_hybrid_code(CODE_TYPE code_type, TYPE_TXFM txfm_type, int node_num) { + int stage_num = get_hybrid_stage_num(txfm_type, node_num); + + Node *node = new Node[node_num * stage_num]; + init_graph(node, stage_num, node_num); + + gen_hybrid_graph_1d(node, stage_num, node_num, 0, 0, node_num, txfm_type); + + switch (code_type) { + case CODE_TYPE_C: gen_code_c(node, stage_num, node_num, txfm_type); break; + case CODE_TYPE_SSE2: + gen_code_sse2(node, stage_num, node_num, txfm_type); + break; + case CODE_TYPE_SSE4_1: + gen_code_sse4_1(node, stage_num, node_num, txfm_type); + break; + } + + delete[] node; +} + +int main(int argc, char **argv) { + CODE_TYPE code_type = CODE_TYPE_SSE4_1; + for (int txfm_type = TYPE_DCT; txfm_type < TYPE_LAST; txfm_type++) { + for (int node_num = 4; node_num <= 64; node_num *= 2) { + gen_hybrid_code(code_type, (TYPE_TXFM)txfm_type, node_num); + } + } + return 0; +} diff --git a/libs/libaom/src/tools/txfm_analyzer/txfm_graph.cc b/libs/libaom/src/tools/txfm_analyzer/txfm_graph.cc new file mode 100644 index 000000000..a24906100 --- /dev/null +++ b/libs/libaom/src/tools/txfm_analyzer/txfm_graph.cc @@ -0,0 +1,943 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "tools/txfm_analyzer/txfm_graph.h" + +#include +#include +#include + +typedef struct Node Node; + +void get_fun_name(char *str_fun_name, int str_buf_size, const TYPE_TXFM type, + const int txfm_size) { + if (type == TYPE_DCT) + snprintf(str_fun_name, str_buf_size, "fdct%d_new", txfm_size); + else if (type == TYPE_ADST) + snprintf(str_fun_name, str_buf_size, "fadst%d_new", txfm_size); + else if (type == TYPE_IDCT) + snprintf(str_fun_name, str_buf_size, "idct%d_new", txfm_size); + else if (type == TYPE_IADST) + snprintf(str_fun_name, str_buf_size, "iadst%d_new", txfm_size); +} + +void get_txfm_type_name(char *str_fun_name, int str_buf_size, + const TYPE_TXFM type, const int txfm_size) { + if (type == TYPE_DCT) + snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_DCT%d", txfm_size); + else if (type == TYPE_ADST) + snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_ADST%d", txfm_size); + else if (type == TYPE_IDCT) + snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_DCT%d", txfm_size); + else if (type == TYPE_IADST) + snprintf(str_fun_name, str_buf_size, "TXFM_TYPE_ADST%d", txfm_size); +} + +void get_hybrid_2d_type_name(char *buf, int buf_size, const TYPE_TXFM type0, + const TYPE_TXFM type1, const int txfm_size0, + const int txfm_size1) { + if (type0 == TYPE_DCT && type1 == TYPE_DCT) + snprintf(buf, buf_size, "_dct_dct_%dx%d", txfm_size1, txfm_size0); + else if (type0 == TYPE_DCT && type1 == TYPE_ADST) + snprintf(buf, buf_size, "_dct_adst_%dx%d", txfm_size1, txfm_size0); + else if (type0 == TYPE_ADST && type1 == TYPE_ADST) + snprintf(buf, buf_size, "_adst_adst_%dx%d", txfm_size1, txfm_size0); + else if (type0 == TYPE_ADST && type1 == TYPE_DCT) + snprintf(buf, buf_size, "_adst_dct_%dx%d", txfm_size1, txfm_size0); +} + +TYPE_TXFM get_inv_type(TYPE_TXFM type) { + if (type == TYPE_DCT) + return TYPE_IDCT; + else if (type == TYPE_ADST) + return TYPE_IADST; + else if (type == TYPE_IDCT) + return TYPE_DCT; + else if (type == TYPE_IADST) + return TYPE_ADST; + else + return TYPE_LAST; +} + +void reference_dct_1d(double *in, double *out, int size) { + const double kInvSqrt2 = 0.707106781186547524400844362104; + for (int k = 0; k < size; k++) { + out[k] = 0; // initialize out[k] + for (int n = 0; n < size; n++) { + out[k] += in[n] * cos(PI * (2 * n + 1) * k / (2 * size)); + } + if (k == 0) out[k] = out[k] * kInvSqrt2; + } +} + +void reference_dct_2d(double *in, double *out, int size) { + double *tempOut = new double[size * size]; + // dct each row: in -> out + for (int r = 0; r < size; r++) { + reference_dct_1d(in + r * size, out + r * size, size); + } + + for (int r = 0; r < size; r++) { + // out ->tempOut + for (int c = 0; c < size; c++) { + tempOut[r * size + c] = out[c * size + r]; + } + } + for (int r = 0; r < size; r++) { + reference_dct_1d(tempOut + r * size, out + r * size, size); + } + delete[] tempOut; +} + +void reference_adst_1d(double *in, double *out, int size) { + for (int k = 0; k < size; k++) { + out[k] = 0; // initialize out[k] + for (int n = 0; n < size; n++) { + out[k] += in[n] * sin(PI * (2 * n + 1) * (2 * k + 1) / (4 * size)); + } + } +} + +void reference_hybrid_2d(double *in, double *out, int size, int type0, + int type1) { + double *tempOut = new double[size * size]; + // dct each row: in -> out + for (int r = 0; r < size; r++) { + if (type0 == TYPE_DCT) + reference_dct_1d(in + r * size, out + r * size, size); + else + reference_adst_1d(in + r * size, out + r * size, size); + } + + for (int r = 0; r < size; r++) { + // out ->tempOut + for (int c = 0; c < size; c++) { + 
tempOut[r * size + c] = out[c * size + r]; + } + } + for (int r = 0; r < size; r++) { + if (type1 == TYPE_DCT) + reference_dct_1d(tempOut + r * size, out + r * size, size); + else + reference_adst_1d(tempOut + r * size, out + r * size, size); + } + delete[] tempOut; +} + +void reference_hybrid_2d_new(double *in, double *out, int size0, int size1, + int type0, int type1) { + double *tempOut = new double[size0 * size1]; + // dct each row: in -> out + for (int r = 0; r < size1; r++) { + if (type0 == TYPE_DCT) + reference_dct_1d(in + r * size0, out + r * size0, size0); + else + reference_adst_1d(in + r * size0, out + r * size0, size0); + } + + for (int r = 0; r < size1; r++) { + // out ->tempOut + for (int c = 0; c < size0; c++) { + tempOut[c * size1 + r] = out[r * size0 + c]; + } + } + for (int r = 0; r < size0; r++) { + if (type1 == TYPE_DCT) + reference_dct_1d(tempOut + r * size1, out + r * size1, size1); + else + reference_adst_1d(tempOut + r * size1, out + r * size1, size1); + } + delete[] tempOut; +} + +unsigned int get_max_bit(unsigned int x) { + int max_bit = -1; + while (x) { + x = x >> 1; + max_bit++; + } + return max_bit; +} + +unsigned int bitwise_reverse(unsigned int x, int max_bit) { + x = ((x >> 16) & 0x0000ffff) | ((x & 0x0000ffff) << 16); + x = ((x >> 8) & 0x00ff00ff) | ((x & 0x00ff00ff) << 8); + x = ((x >> 4) & 0x0f0f0f0f) | ((x & 0x0f0f0f0f) << 4); + x = ((x >> 2) & 0x33333333) | ((x & 0x33333333) << 2); + x = ((x >> 1) & 0x55555555) | ((x & 0x55555555) << 1); + x = x >> (31 - max_bit); + return x; +} + +int get_idx(int ri, int ci, int cSize) { return ri * cSize + ci; } + +void add_node(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int in, double w) { + int outIdx = get_idx(stage_idx, node_idx, node_num); + int inIdx = get_idx(stage_idx - 1, in, node_num); + int idx = node[outIdx].inNodeNum; + if (idx < 2) { + node[outIdx].inNode[idx] = &node[inIdx]; + node[outIdx].inNodeIdx[idx] = in; + node[outIdx].inWeight[idx] = w; + idx++; + node[outIdx].inNodeNum = idx; + } else { + printf("Error: inNode is full"); + } +} + +void connect_node(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int in0, double w0, int in1, double w1) { + int outIdx = get_idx(stage_idx, node_idx, node_num); + int inIdx0 = get_idx(stage_idx - 1, in0, node_num); + int inIdx1 = get_idx(stage_idx - 1, in1, node_num); + + int idx = 0; + // if(w0 != 0) { + node[outIdx].inNode[idx] = &node[inIdx0]; + node[outIdx].inNodeIdx[idx] = in0; + node[outIdx].inWeight[idx] = w0; + idx++; + //} + + // if(w1 != 0) { + node[outIdx].inNode[idx] = &node[inIdx1]; + node[outIdx].inNodeIdx[idx] = in1; + node[outIdx].inWeight[idx] = w1; + idx++; + //} + + node[outIdx].inNodeNum = idx; +} + +void propagate(Node *node, int stage_num, int node_num, int stage_idx) { + for (int ni = 0; ni < node_num; ni++) { + int outIdx = get_idx(stage_idx, ni, node_num); + node[outIdx].value = 0; + for (int k = 0; k < node[outIdx].inNodeNum; k++) { + node[outIdx].value += + node[outIdx].inNode[k]->value * node[outIdx].inWeight[k]; + } + } +} + +int64_t round_shift(int64_t value, int bit) { + if (bit > 0) { + if (value < 0) { + return -round_shift(-value, bit); + } else { + return (value + (1 << (bit - 1))) >> bit; + } + } else { + return value << (-bit); + } +} + +void round_shift_array(int32_t *arr, int size, int bit) { + if (bit == 0) { + return; + } else { + for (int i = 0; i < size; i++) { + arr[i] = round_shift(arr[i], bit); + } + } +} + +void graph_reset_visited(Node *node, int stage_num, int 
node_num) { + for (int si = 0; si < stage_num; si++) { + for (int ni = 0; ni < node_num; ni++) { + int idx = get_idx(si, ni, node_num); + node[idx].visited = 0; + } + } +} + +void estimate_value(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int estimate_bit) { + if (stage_idx > 0) { + int outIdx = get_idx(stage_idx, node_idx, node_num); + int64_t out = 0; + node[outIdx].value = 0; + for (int k = 0; k < node[outIdx].inNodeNum; k++) { + int64_t w = round(node[outIdx].inWeight[k] * (1 << estimate_bit)); + int64_t v = round(node[outIdx].inNode[k]->value); + out += v * w; + } + node[outIdx].value = round_shift(out, estimate_bit); + } +} + +void amplify_value(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int amplify_bit) { + int outIdx = get_idx(stage_idx, node_idx, node_num); + node[outIdx].value = round_shift(round(node[outIdx].value), -amplify_bit); +} + +void propagate_estimate_amlify(Node *node, int stage_num, int node_num, + int stage_idx, int amplify_bit, + int estimate_bit) { + for (int ni = 0; ni < node_num; ni++) { + estimate_value(node, stage_num, node_num, stage_idx, ni, estimate_bit); + amplify_value(node, stage_num, node_num, stage_idx, ni, amplify_bit); + } +} + +void init_graph(Node *node, int stage_num, int node_num) { + for (int si = 0; si < stage_num; si++) { + for (int ni = 0; ni < node_num; ni++) { + int outIdx = get_idx(si, ni, node_num); + node[outIdx].stageIdx = si; + node[outIdx].nodeIdx = ni; + node[outIdx].value = 0; + node[outIdx].inNodeNum = 0; + if (si >= 1) { + connect_node(node, stage_num, node_num, si, ni, ni, 1, ni, 0); + } + } + } +} + +void gen_B_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N, int star) { + for (int i = 0; i < N / 2; i++) { + int out = node_idx + i; + int in1 = node_idx + N - 1 - i; + if (star == 1) { + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, -1, in1, + 1); + } else { + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, in1, + 1); + } + } + for (int i = N / 2; i < N; i++) { + int out = node_idx + i; + int in1 = node_idx + N - 1 - i; + if (star == 1) { + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, in1, + 1); + } else { + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, -1, in1, + 1); + } + } +} + +void gen_P_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N) { + int max_bit = get_max_bit(N - 1); + for (int i = 0; i < N; i++) { + int out = node_idx + bitwise_reverse(i, max_bit); + int in = node_idx + i; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } +} + +void gen_type1_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N) { + int max_bit = get_max_bit(N); + for (int ni = 0; ni < N / 2; ni++) { + int ai = bitwise_reverse(N + ni, max_bit); + int out = node_idx + ni; + int in1 = node_idx + N - ni - 1; + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, + sin(PI * ai / (2 * 2 * N)), in1, cos(PI * ai / (2 * 2 * N))); + } + for (int ni = N / 2; ni < N; ni++) { + int ai = bitwise_reverse(N + ni, max_bit); + int out = node_idx + ni; + int in1 = node_idx + N - ni - 1; + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, + cos(PI * ai / (2 * 2 * N)), in1, -sin(PI * ai / (2 * 2 * N))); + } +} + +void gen_type2_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N) { + for (int ni = 0; ni < N / 4; ni++) { + int out = node_idx + ni; + 
connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out, 0); + } + + for (int ni = N / 4; ni < N / 2; ni++) { + int out = node_idx + ni; + int in1 = node_idx + N - ni - 1; + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, + -cos(PI / 4), in1, cos(-PI / 4)); + } + + for (int ni = N / 2; ni < N * 3 / 4; ni++) { + int out = node_idx + ni; + int in1 = node_idx + N - ni - 1; + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, + cos(-PI / 4), in1, cos(PI / 4)); + } + + for (int ni = N * 3 / 4; ni < N; ni++) { + int out = node_idx + ni; + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out, 0); + } +} + +void gen_type3_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int idx, int N) { + // TODO(angiebird): Simplify and clarify this function + + int i = 2 * N / (1 << (idx / 2)); + int max_bit = + get_max_bit(i / 2) - 1; // the max_bit counts on i/2 instead of N here + int N_over_i = 2 << (idx / 2); + + for (int nj = 0; nj < N / 2; nj += N_over_i) { + int j = nj / (N_over_i); + int kj = bitwise_reverse(i / 4 + j, max_bit); + // printf("kj = %d\n", kj); + + // I_N/2i --- 0 + int offset = nj; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in = out; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } + + // -C_Kj/i --- S_Kj/i + offset += N_over_i / 4; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in0 = out; + double w0 = -cos(kj * PI / i); + int in1 = N - (offset + ni) - 1 + node_idx; + double w1 = sin(kj * PI / i); + connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1, + w1); + } + + // S_kj/i --- -C_Kj/i + offset += N_over_i / 4; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in0 = out; + double w0 = -sin(kj * PI / i); + int in1 = N - (offset + ni) - 1 + node_idx; + double w1 = -cos(kj * PI / i); + connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1, + w1); + } + + // I_N/2i --- 0 + offset += N_over_i / 4; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in = out; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } + } + + for (int nj = N / 2; nj < N; nj += N_over_i) { + int j = nj / N_over_i; + int kj = bitwise_reverse(i / 4 + j, max_bit); + + // I_N/2i --- 0 + int offset = nj; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in = out; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } + + // C_kj/i --- -S_Kj/i + offset += N_over_i / 4; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in0 = out; + double w0 = cos(kj * PI / i); + int in1 = N - (offset + ni) - 1 + node_idx; + double w1 = -sin(kj * PI / i); + connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1, + w1); + } + + // S_kj/i --- C_Kj/i + offset += N_over_i / 4; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in0 = out; + double w0 = sin(kj * PI / i); + int in1 = N - (offset + ni) - 1 + node_idx; + double w1 = cos(kj * PI / i); + connect_node(node, stage_num, node_num, stage_idx + 1, out, in0, w0, in1, + w1); + } + + // I_N/2i --- 0 + offset += N_over_i / 4; + for (int ni = 0; ni < N_over_i / 4; ni++) { + int out = node_idx + offset + ni; + int in = out; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, 
in, 0); + } + } +} + +void gen_type4_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int idx, int N) { + int B_size = 1 << ((idx + 1) / 2); + for (int ni = 0; ni < N; ni += B_size) { + gen_B_graph(node, stage_num, node_num, stage_idx, node_idx + ni, B_size, + (ni / B_size) % 2); + } +} + +void gen_R_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N) { + int max_idx = 2 * (get_max_bit(N) + 1) - 3; + for (int idx = 0; idx < max_idx; idx++) { + int s = stage_idx + max_idx - idx - 1; + if (idx == 0) { + // type 1 + gen_type1_graph(node, stage_num, node_num, s, node_idx, N); + } else if (idx == max_idx - 1) { + // type 2 + gen_type2_graph(node, stage_num, node_num, s, node_idx, N); + } else if ((idx + 1) % 2 == 0) { + // type 4 + gen_type4_graph(node, stage_num, node_num, s, node_idx, idx, N); + } else if ((idx + 1) % 2 == 1) { + // type 3 + gen_type3_graph(node, stage_num, node_num, s, node_idx, idx, N); + } else { + printf("check gen_R_graph()\n"); + } + } +} + +void gen_DCT_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N) { + if (N > 2) { + gen_B_graph(node, stage_num, node_num, stage_idx, node_idx, N, 0); + gen_DCT_graph(node, stage_num, node_num, stage_idx + 1, node_idx, N / 2); + gen_R_graph(node, stage_num, node_num, stage_idx + 1, node_idx + N / 2, + N / 2); + } else { + // generate dct_2 + connect_node(node, stage_num, node_num, stage_idx + 1, node_idx, node_idx, + cos(PI / 4), node_idx + 1, cos(PI / 4)); + connect_node(node, stage_num, node_num, stage_idx + 1, node_idx + 1, + node_idx + 1, -cos(PI / 4), node_idx, cos(PI / 4)); + } +} + +int get_dct_stage_num(int size) { return 2 * get_max_bit(size); } + +void gen_DCT_graph_1d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int dct_node_num) { + gen_DCT_graph(node, stage_num, node_num, stage_idx, node_idx, dct_node_num); + int dct_stage_num = get_dct_stage_num(dct_node_num); + gen_P_graph(node, stage_num, node_num, stage_idx + dct_stage_num - 2, + node_idx, dct_node_num); +} + +void gen_adst_B_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx) { + int size = 1 << (adst_idx + 1); + for (int ni = 0; ni < size / 2; ni++) { + int nOut = node_idx + ni; + int nIn = nOut + size / 2; + connect_node(node, stage_num, node_num, stage_idx + 1, nOut, nOut, 1, nIn, + 1); + // printf("nOut: %d nIn: %d\n", nOut, nIn); + } + for (int ni = size / 2; ni < size; ni++) { + int nOut = node_idx + ni; + int nIn = nOut - size / 2; + connect_node(node, stage_num, node_num, stage_idx + 1, nOut, nOut, -1, nIn, + 1); + // printf("ndctOut: %d nIn: %d\n", nOut, nIn); + } +} + +void gen_adst_U_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx, int adst_node_num) { + int size = 1 << (adst_idx + 1); + for (int ni = 0; ni < adst_node_num; ni += size) { + gen_adst_B_graph(node, stage_num, node_num, stage_idx, node_idx + ni, + adst_idx); + } +} + +void gen_adst_T_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, double freq) { + connect_node(node, stage_num, node_num, stage_idx + 1, node_idx, node_idx, + cos(freq * PI), node_idx + 1, sin(freq * PI)); + connect_node(node, stage_num, node_num, stage_idx + 1, node_idx + 1, + node_idx + 1, -cos(freq * PI), node_idx, sin(freq * PI)); +} + +void gen_adst_E_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx) { + int size = 1 << (adst_idx); + for (int i = 0; i 
< size / 2; i++) { + int ni = i * 2; + double fi = (1 + 4 * i) * 1.0 / (1 << (adst_idx + 1)); + gen_adst_T_graph(node, stage_num, node_num, stage_idx, node_idx + ni, fi); + } +} + +void gen_adst_V_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx, int adst_node_num) { + int size = 1 << (adst_idx); + for (int i = 0; i < adst_node_num / size; i++) { + if (i % 2 == 1) { + int ni = i * size; + gen_adst_E_graph(node, stage_num, node_num, stage_idx, node_idx + ni, + adst_idx); + } + } +} +void gen_adst_VJ_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + for (int i = 0; i < adst_node_num / 2; i++) { + int ni = i * 2; + double fi = (1 + 4 * i) * 1.0 / (4 * adst_node_num); + gen_adst_T_graph(node, stage_num, node_num, stage_idx, node_idx + ni, fi); + } +} +void gen_adst_Q_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + // reverse order when idx is 1, 3, 5, 7 ... + // example of adst_node_num = 8: + // 0 1 2 3 4 5 6 7 + // --> 0 7 2 5 4 3 6 1 + for (int ni = 0; ni < adst_node_num; ni++) { + if (ni % 2 == 0) { + int out = node_idx + ni; + connect_node(node, stage_num, node_num, stage_idx + 1, out, out, 1, out, + 0); + } else { + int out = node_idx + ni; + int in = node_idx + adst_node_num - ni; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } + } +} +void gen_adst_Ibar_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + // reverse order + // 0 1 2 3 --> 3 2 1 0 + for (int ni = 0; ni < adst_node_num; ni++) { + int out = node_idx + ni; + int in = node_idx + adst_node_num - ni - 1; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } +} + +int get_Q_out2in(int adst_node_num, int out) { + int in; + if (out % 2 == 0) { + in = out; + } else { + in = adst_node_num - out; + } + return in; +} + +int get_Ibar_out2in(int adst_node_num, int out) { + return adst_node_num - out - 1; +} + +void gen_adst_IbarQ_graph(Node *node, int stage_num, int node_num, + int stage_idx, int node_idx, int adst_node_num) { + // in -> Ibar -> Q -> out + for (int ni = 0; ni < adst_node_num; ni++) { + int out = node_idx + ni; + int in = node_idx + + get_Ibar_out2in(adst_node_num, get_Q_out2in(adst_node_num, ni)); + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } +} + +void gen_adst_D_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + // reverse order + for (int ni = 0; ni < adst_node_num; ni++) { + int out = node_idx + ni; + int in = out; + if (ni % 2 == 0) { + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } else { + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, -1, in, + 0); + } + } +} + +int get_hadamard_idx(int x, int adst_node_num) { + int max_bit = get_max_bit(adst_node_num - 1); + x = bitwise_reverse(x, max_bit); + + // gray code + int c = x & 1; + int p = x & 1; + int y = c; + + for (int i = 1; i <= max_bit; i++) { + p = c; + c = (x >> i) & 1; + y += (c ^ p) << i; + } + return y; +} + +void gen_adst_Ht_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + for (int ni = 0; ni < adst_node_num; ni++) { + int out = node_idx + ni; + int in = node_idx + get_hadamard_idx(ni, adst_node_num); + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, 1, in, 0); + } +} + +void gen_adst_HtD_graph(Node *node, 
int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + for (int ni = 0; ni < adst_node_num; ni++) { + int out = node_idx + ni; + int in = node_idx + get_hadamard_idx(ni, adst_node_num); + double inW; + if (ni % 2 == 0) + inW = 1; + else + inW = -1; + connect_node(node, stage_num, node_num, stage_idx + 1, out, in, inW, in, 0); + } +} + +int get_adst_stage_num(int adst_node_num) { + return 2 * get_max_bit(adst_node_num) + 2; +} + +int gen_iadst_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + int max_bit = get_max_bit(adst_node_num); + int si = 0; + gen_adst_IbarQ_graph(node, stage_num, node_num, stage_idx + si, node_idx, + adst_node_num); + si++; + gen_adst_VJ_graph(node, stage_num, node_num, stage_idx + si, node_idx, + adst_node_num); + si++; + for (int adst_idx = max_bit - 1; adst_idx >= 1; adst_idx--) { + gen_adst_U_graph(node, stage_num, node_num, stage_idx + si, node_idx, + adst_idx, adst_node_num); + si++; + gen_adst_V_graph(node, stage_num, node_num, stage_idx + si, node_idx, + adst_idx, adst_node_num); + si++; + } + gen_adst_HtD_graph(node, stage_num, node_num, stage_idx + si, node_idx, + adst_node_num); + si++; + return si + 1; +} + +int gen_adst_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num) { + int hybrid_stage_num = get_hybrid_stage_num(TYPE_ADST, adst_node_num); + // generate a adst tempNode + Node *tempNode = new Node[hybrid_stage_num * adst_node_num]; + init_graph(tempNode, hybrid_stage_num, adst_node_num); + int si = gen_iadst_graph(tempNode, hybrid_stage_num, adst_node_num, 0, 0, + adst_node_num); + + // tempNode's inverse graph to node[stage_idx][node_idx] + gen_inv_graph(tempNode, hybrid_stage_num, adst_node_num, node, stage_num, + node_num, stage_idx, node_idx); + delete[] tempNode; + return si; +} + +void connect_layer_2d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int dct_node_num) { + for (int first = 0; first < dct_node_num; first++) { + for (int second = 0; second < dct_node_num; second++) { + // int sIn = stage_idx; + int sOut = stage_idx + 1; + int nIn = node_idx + first * dct_node_num + second; + int nOut = node_idx + second * dct_node_num + first; + + // printf("sIn: %d nIn: %d sOut: %d nOut: %d\n", sIn, nIn, sOut, nOut); + + connect_node(node, stage_num, node_num, sOut, nOut, nIn, 1, nIn, 0); + } + } +} + +void connect_layer_2d_new(Node *node, int stage_num, int node_num, + int stage_idx, int node_idx, int dct_node_num0, + int dct_node_num1) { + for (int i = 0; i < dct_node_num1; i++) { + for (int j = 0; j < dct_node_num0; j++) { + // int sIn = stage_idx; + int sOut = stage_idx + 1; + int nIn = node_idx + i * dct_node_num0 + j; + int nOut = node_idx + j * dct_node_num1 + i; + + // printf("sIn: %d nIn: %d sOut: %d nOut: %d\n", sIn, nIn, sOut, nOut); + + connect_node(node, stage_num, node_num, sOut, nOut, nIn, 1, nIn, 0); + } + } +} + +void gen_DCT_graph_2d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int dct_node_num) { + int dct_stage_num = get_dct_stage_num(dct_node_num); + // put 2 layers of dct_node_num DCTs on the graph + for (int ni = 0; ni < dct_node_num; ni++) { + gen_DCT_graph_1d(node, stage_num, node_num, stage_idx, + node_idx + ni * dct_node_num, dct_node_num); + gen_DCT_graph_1d(node, stage_num, node_num, stage_idx + dct_stage_num, + node_idx + ni * dct_node_num, dct_node_num); + } + // connect first layer and second layer + connect_layer_2d(node, stage_num, node_num, 
stage_idx + dct_stage_num - 1, + node_idx, dct_node_num); +} + +int get_hybrid_stage_num(int type, int hybrid_node_num) { + if (type == TYPE_DCT || type == TYPE_IDCT) { + return get_dct_stage_num(hybrid_node_num); + } else if (type == TYPE_ADST || type == TYPE_IADST) { + return get_adst_stage_num(hybrid_node_num); + } + return 0; +} + +int get_hybrid_2d_stage_num(int type0, int type1, int hybrid_node_num) { + int stage_num = 0; + stage_num += get_hybrid_stage_num(type0, hybrid_node_num); + stage_num += get_hybrid_stage_num(type1, hybrid_node_num); + return stage_num; +} + +int get_hybrid_2d_stage_num_new(int type0, int type1, int hybrid_node_num0, + int hybrid_node_num1) { + int stage_num = 0; + stage_num += get_hybrid_stage_num(type0, hybrid_node_num0); + stage_num += get_hybrid_stage_num(type1, hybrid_node_num1); + return stage_num; +} + +int get_hybrid_amplify_factor(int type, int hybrid_node_num) { + return get_max_bit(hybrid_node_num) - 1; +} + +void gen_hybrid_graph_1d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int hybrid_node_num, int type) { + if (type == TYPE_DCT) { + gen_DCT_graph_1d(node, stage_num, node_num, stage_idx, node_idx, + hybrid_node_num); + } else if (type == TYPE_ADST) { + gen_adst_graph(node, stage_num, node_num, stage_idx, node_idx, + hybrid_node_num); + } else if (type == TYPE_IDCT) { + int hybrid_stage_num = get_hybrid_stage_num(type, hybrid_node_num); + // generate a dct tempNode + Node *tempNode = new Node[hybrid_stage_num * hybrid_node_num]; + init_graph(tempNode, hybrid_stage_num, hybrid_node_num); + gen_DCT_graph_1d(tempNode, hybrid_stage_num, hybrid_node_num, 0, 0, + hybrid_node_num); + + // tempNode's inverse graph to node[stage_idx][node_idx] + gen_inv_graph(tempNode, hybrid_stage_num, hybrid_node_num, node, stage_num, + node_num, stage_idx, node_idx); + delete[] tempNode; + } else if (type == TYPE_IADST) { + int hybrid_stage_num = get_hybrid_stage_num(type, hybrid_node_num); + // generate a adst tempNode + Node *tempNode = new Node[hybrid_stage_num * hybrid_node_num]; + init_graph(tempNode, hybrid_stage_num, hybrid_node_num); + gen_adst_graph(tempNode, hybrid_stage_num, hybrid_node_num, 0, 0, + hybrid_node_num); + + // tempNode's inverse graph to node[stage_idx][node_idx] + gen_inv_graph(tempNode, hybrid_stage_num, hybrid_node_num, node, stage_num, + node_num, stage_idx, node_idx); + delete[] tempNode; + } +} + +void gen_hybrid_graph_2d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int hybrid_node_num, int type0, + int type1) { + int hybrid_stage_num = get_hybrid_stage_num(type0, hybrid_node_num); + + for (int ni = 0; ni < hybrid_node_num; ni++) { + gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx, + node_idx + ni * hybrid_node_num, hybrid_node_num, + type0); + gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx + hybrid_stage_num, + node_idx + ni * hybrid_node_num, hybrid_node_num, + type1); + } + + // connect first layer and second layer + connect_layer_2d(node, stage_num, node_num, stage_idx + hybrid_stage_num - 1, + node_idx, hybrid_node_num); +} + +void gen_hybrid_graph_2d_new(Node *node, int stage_num, int node_num, + int stage_idx, int node_idx, int hybrid_node_num0, + int hybrid_node_num1, int type0, int type1) { + int hybrid_stage_num0 = get_hybrid_stage_num(type0, hybrid_node_num0); + + for (int ni = 0; ni < hybrid_node_num1; ni++) { + gen_hybrid_graph_1d(node, stage_num, node_num, stage_idx, + node_idx + ni * hybrid_node_num0, hybrid_node_num0, + type0); + } + for (int ni = 0; 
ni < hybrid_node_num0; ni++) { + gen_hybrid_graph_1d( + node, stage_num, node_num, stage_idx + hybrid_stage_num0, + node_idx + ni * hybrid_node_num1, hybrid_node_num1, type1); + } + + // connect first layer and second layer + connect_layer_2d_new(node, stage_num, node_num, + stage_idx + hybrid_stage_num0 - 1, node_idx, + hybrid_node_num0, hybrid_node_num1); +} + +void gen_inv_graph(Node *node, int stage_num, int node_num, Node *invNode, + int inv_stage_num, int inv_node_num, int inv_stage_idx, + int inv_node_idx) { + // clean up inNodeNum in invNode because of add_node + for (int si = 1 + inv_stage_idx; si < inv_stage_idx + stage_num; si++) { + for (int ni = inv_node_idx; ni < inv_node_idx + node_num; ni++) { + int idx = get_idx(si, ni, inv_node_num); + invNode[idx].inNodeNum = 0; + } + } + // generate inverse graph of node on invNode + for (int si = 1; si < stage_num; si++) { + for (int ni = 0; ni < node_num; ni++) { + int invSi = stage_num - si; + int idx = get_idx(si, ni, node_num); + for (int k = 0; k < node[idx].inNodeNum; k++) { + int invNi = node[idx].inNodeIdx[k]; + add_node(invNode, inv_stage_num, inv_node_num, invSi + inv_stage_idx, + invNi + inv_node_idx, ni + inv_node_idx, + node[idx].inWeight[k]); + } + } + } +} diff --git a/libs/libaom/src/tools/txfm_analyzer/txfm_graph.h b/libs/libaom/src/tools/txfm_analyzer/txfm_graph.h new file mode 100644 index 000000000..8dc36146d --- /dev/null +++ b/libs/libaom/src/tools/txfm_analyzer/txfm_graph.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_ +#define AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_ + +struct Node { + Node *inNode[2]; + int inNodeNum; + int inNodeIdx[2]; + double inWeight[2]; + double value; + int nodeIdx; + int stageIdx; + int visited; +}; + +#define STAGENUM (10) +#define NODENUM (32) +#define COS_MOD (128) + +typedef enum { + TYPE_DCT = 0, + TYPE_ADST, + TYPE_IDCT, + TYPE_IADST, + TYPE_LAST +} TYPE_TXFM; + +TYPE_TXFM get_inv_type(TYPE_TXFM type); +void get_fun_name(char *str_fun_name, int str_buf_size, const TYPE_TXFM type, + const int txfm_size); + +void get_txfm_type_name(char *str_fun_name, int str_buf_size, + const TYPE_TXFM type, const int txfm_size); +void get_hybrid_2d_type_name(char *buf, int buf_size, const TYPE_TXFM type0, + const TYPE_TXFM type1, const int txfm_size0, + const int txfm_size1); +unsigned int get_max_bit(unsigned int x); +unsigned int bitwise_reverse(unsigned int x, int max_bit); +int get_idx(int ri, int ci, int cSize); + +int get_dct_stage_num(int size); +void reference_dct_1d(double *in, double *out, int size); +void reference_dct_2d(double *in, double *out, int size); +void connect_node(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int in0, double w0, int in1, double w1); +void propagate(Node *node, int stage_num, int node_num, int stage); +void init_graph(Node *node, int stage_num, int node_num); +void graph_reset_visited(Node *node, int stage_num, int node_num); +void gen_B_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N, int star); +void gen_P_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N); + +void gen_type1_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N); +void gen_type2_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N); +void gen_type3_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int idx, int N); +void gen_type4_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int idx, int N); + +void gen_R_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N); + +void gen_DCT_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int N); + +void gen_DCT_graph_1d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int dct_node_num); +void connect_layer_2d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int dct_node_num); + +void gen_DCT_graph_2d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int dct_node_num); + +void gen_adst_B_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx); + +void gen_adst_U_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx, int adst_node_num); +void gen_adst_T_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, double freq); + +void gen_adst_E_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx); + +void gen_adst_V_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_idx, int adst_node_num); + +void gen_adst_VJ_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num); +void gen_adst_Q_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num); +void gen_adst_Ibar_graph(Node *node, int stage_num, int node_num, int 
stage_idx, + int node_idx, int adst_node_num); + +void gen_adst_D_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num); + +int get_hadamard_idx(int x, int adst_node_num); +void gen_adst_Ht_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num); + +int gen_adst_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num); +int gen_iadst_graph(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int adst_node_num); +void reference_adst_1d(double *in, double *out, int size); + +int get_adst_stage_num(int adst_node_num); +int get_hybrid_stage_num(int type, int hybrid_node_num); +int get_hybrid_2d_stage_num(int type0, int type1, int hybrid_node_num); +int get_hybrid_2d_stage_num_new(int type0, int type1, int hybrid_node_num0, + int hybrid_node_num1); +int get_hybrid_amplify_factor(int type, int hybrid_node_num); +void gen_hybrid_graph_1d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int hybrid_node_num, int type); +void gen_hybrid_graph_2d(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int hybrid_node_num, int type0, + int type1); +void gen_hybrid_graph_2d_new(Node *node, int stage_num, int node_num, + int stage_idx, int node_idx, int hybrid_node_num0, + int hybrid_node_num1, int type0, int type1); + +void reference_hybrid_2d(double *in, double *out, int size, int type0, + int type1); + +void reference_hybrid_2d_new(double *in, double *out, int size0, int size1, + int type0, int type1); +void reference_adst_dct_2d(double *in, double *out, int size); + +void gen_code(Node *node, int stage_num, int node_num, TYPE_TXFM type); + +void gen_inv_graph(Node *node, int stage_num, int node_num, Node *invNode, + int inv_stage_num, int inv_node_num, int inv_stage_idx, + int inv_node_idx); + +TYPE_TXFM hybrid_char_to_int(char ctype); + +int64_t round_shift(int64_t value, int bit); +void round_shift_array(int32_t *arr, int size, int bit); +void estimate_value(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int estimate_bit); +void amplify_value(Node *node, int stage_num, int node_num, int stage_idx, + int node_idx, int estimate_bit); +void propagate_estimate_amlify(Node *node, int stage_num, int node_num, + int stage_idx, int amplify_bit, + int estimate_bit); +#endif // AOM_TOOLS_TXFM_ANALYZER_TXFM_GRAPH_H_ diff --git a/libs/libaom/src/tools/wrap-commit-msg.py b/libs/libaom/src/tools/wrap-commit-msg.py new file mode 100644 index 000000000..1c7882443 --- /dev/null +++ b/libs/libaom/src/tools/wrap-commit-msg.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python +## +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +"""Wraps paragraphs of text, preserving manual formatting + +This is like fold(1), but has the special convention of not modifying lines +that start with whitespace. This allows you to intersperse blocks with +special formatting, like code blocks, with written prose. 
The prose will +be wordwrapped, and the manual formatting will be preserved. + + * This won't handle the case of a bulleted (or ordered) list specially, so + manual wrapping must be done. + +Occasionally it's useful to put something with explicit formatting that +doesn't look at all like a block of text inline. + + indicator = has_leading_whitespace(line); + if (indicator) + preserve_formatting(line); + +The intent is that this docstring would make it through the transform +and still be legible and presented as it is in the source. If additional +cases are handled, update this doc to describe the effect. +""" + +__author__ = "jkoleszar@google.com" +import textwrap +import sys + +def wrap(text): + if text: + return textwrap.fill(text, break_long_words=False) + '\n' + return "" + + +def main(fileobj): + text = "" + output = "" + while True: + line = fileobj.readline() + if not line: + break + + if line.lstrip() == line: + text += line + else: + output += wrap(text) + text="" + output += line + output += wrap(text) + + # Replace the file or write to stdout. + if fileobj == sys.stdin: + fileobj = sys.stdout + else: + fileobj.seek(0) + fileobj.truncate(0) + fileobj.write(output) + +if __name__ == "__main__": + if len(sys.argv) > 1: + main(open(sys.argv[1], "r+")) + else: + main(sys.stdin) diff --git a/libs/libaom/src/usage.dox b/libs/libaom/src/usage.dox new file mode 100644 index 000000000..4004f4af5 --- /dev/null +++ b/libs/libaom/src/usage.dox @@ -0,0 +1,109 @@ +/*!\page usage Usage + + The aom multi-format codec SDK provides a unified interface amongst its + supported codecs. This abstraction allows applications using this SDK to + easily support multiple video formats with minimal code duplication or + "special casing." This section describes the interface common to all codecs. + For codec-specific details, see the \ref codecs page. + + The following sections are common to all codecs: + - \ref usage_types + - \ref usage_features + - \ref usage_init + - \ref usage_errors + + For more information on decoder and encoder specific usage, see the + following pages: + \if decoder + \li \subpage usage_decode + \endif + \if encoder + \li \subpage usage_encode + \endif + + \section usage_types Important Data Types + There are two important data structures to consider in this interface. + + \subsection usage_ctxs Contexts + A context is a storage area allocated by the calling application that the + codec may write into to store details about a single instance of that codec. + Most of the context is implementation specific, and thus opaque to the + application. The context structure as seen by the application is of fixed + size, and thus can be allocated with automatic storage or dynamically + on the heap. + + Most operations require an initialized codec context. Codec context + instances are codec specific. That is, the codec to be used for the encoded + video must be known at initialization time. See #aom_codec_ctx_t for further + information. + + \subsection usage_ifaces Interfaces + A codec interface is an opaque structure that controls how function calls + into the generic interface are dispatched to their codec-specific + implementations. Applications \ref MUSTNOT attempt to examine or override + this storage, as it contains internal implementation details likely to + change from release to release. + + Each supported codec will expose an interface structure to the application + as an extern reference to a structure of the incomplete type + #aom_codec_iface_t. 
+ + \section usage_features Features + Several "features" are defined that are optionally implemented by codec + algorithms. Indeed, the same algorithm may support different features on + different platforms. The purpose of defining these features is that when + they are implemented, they conform to a common interface. The features, or + capabilities, of an algorithm can be queried from its interface by using + the aom_codec_get_caps() method. Attempts to invoke features not supported + by an algorithm will generally result in #AOM_CODEC_INCAPABLE. + + \if decoder + Currently defined decoder features include: + \endif + + \section usage_init Initialization + To initialize a codec instance, the address of the codec context + and interface structures are passed to an initialization function. Depending + on the \ref usage_features that the codec supports, the codec can be + initialized in different modes. + + Because the ABI of the library may change between releases, + the ABI is versioned. The ABI version number must be passed at + initialization time to ensure the application is using a header file that + matches the library. The current ABI version number is stored in the + preprocessor macros #AOM_CODEC_ABI_VERSION, #AOM_ENCODER_ABI_VERSION, and + #AOM_DECODER_ABI_VERSION. For convenience, each initialization function has + a wrapper macro that inserts the correct version number. These macros are + named like the initialization methods, but without the _ver suffix. + + + The available initialization methods are: + \if encoder + \li #aom_codec_enc_init (calls aom_codec_enc_init_ver()) + \endif + \if decoder + \li #aom_codec_dec_init (calls aom_codec_dec_init_ver()) + \endif + + + \section usage_errors Error Handling + Almost all codec functions return an error status of type #aom_codec_err_t. + The semantics of how each error condition should be processed are clearly + defined in the definitions of each enumerated value. Error values can be + converted into ASCII strings with the aom_codec_error() and + aom_codec_err_to_string() methods. The difference between these two methods is + that aom_codec_error() returns the error state from an initialized context, + whereas aom_codec_err_to_string() can be used in cases where an error occurs + outside any context. The enumerated value returned from the last call can also + be retrieved from the err member of the decoder context. + Finally, more detailed error information may be obtained by using + the aom_codec_error_detail() method. Not all errors produce detailed error + information. + + In addition to error information, the codec library's build configuration + is available at runtime on some platforms. This information can be returned + by calling aom_codec_build_config(), and is formatted as a base64-coded string + (comprised of characters in the set [a-zA-Z0-9+/]). This information is not + useful to an application at runtime, but may be of use to aom for support. + +*/ diff --git a/libs/libaom/src/usage_cx.dox b/libs/libaom/src/usage_cx.dox new file mode 100644 index 000000000..51b4e8e3e --- /dev/null +++ b/libs/libaom/src/usage_cx.dox @@ -0,0 +1,9 @@ +/*! \page usage_encode Encoding + + The aom_codec_encode() function is at the core of the encode loop. It + processes raw images passed by the application, producing packets of + compressed data.
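  A hedged sketch of one pass through that loop follows; the helper name
  encode_frame and the file-writing policy are assumptions, not library API.

    #include <stdio.h>
    #include "aom/aom_encoder.h"

    /* Encode one raw frame (or pass raw == NULL at end of stream to flush)
     * and write out every compressed packet the encoder has finished. */
    static int encode_frame(aom_codec_ctx_t *ctx, const aom_image_t *raw,
                            aom_codec_pts_t pts, FILE *out) {
      if (aom_codec_encode(ctx, raw, pts, /*duration=*/1, /*flags=*/0) !=
          AOM_CODEC_OK) {
        fprintf(stderr, "encode failed: %s\n", aom_codec_error(ctx));
        return -1;
      }
      aom_codec_iter_t iter = NULL;
      const aom_codec_cx_pkt_t *pkt;
      while ((pkt = aom_codec_get_cx_data(ctx, &iter)) != NULL) {
        if (pkt->kind == AOM_CODEC_CX_FRAME_PKT)
          fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz, out);
      }
      return 0;
    }

  Calling it once more with raw == NULL at end of stream drains any frames
  the encoder is still holding.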
+ + \ref samples + +*/ diff --git a/libs/libaom/src/usage_dx.dox b/libs/libaom/src/usage_dx.dox new file mode 100644 index 000000000..76dc213bf --- /dev/null +++ b/libs/libaom/src/usage_dx.dox @@ -0,0 +1,22 @@ +/*! \page usage_decode Decoding + + The aom_codec_decode() function is at the core of the decode loop. It + processes packets of compressed data passed by the application, producing + decoded images. The decoder expects packets to comprise exactly one image + frame of data. Packets \ref MUST be passed in decode order. If the + application wishes to associate some data with the frame, the + user_priv member may be set. + + \ref samples + + + \section usage_frame_iter Frame Iterator Based Decoding + Decoded frames are made available to the application + through the aom_codec_get_frame() iterator. The application initializes the + iterator storage (of type #aom_codec_iter_t) to NULL, then calls + aom_codec_get_frame repeatedly until it returns NULL, indicating that all + images have been returned. This process may result in zero, one, or many + frames that are ready for display, depending on the codec. + + +*/ -- cgit v1.2.3
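To make the frame-iterator loop described in usage_dx.dox above concrete, a minimal sketch; the helper name decode_one and the empty display hook are illustrative, not part of the library.

    #include <stdio.h>
    #include "aom/aom_decoder.h"

    /* Feed one unit of compressed data to the decoder, then drain all
     * frames that are ready for display via the frame iterator. */
    static int decode_one(aom_codec_ctx_t *ctx, const uint8_t *data,
                          size_t size) {
      if (aom_codec_decode(ctx, data, size, /*user_priv=*/NULL) !=
          AOM_CODEC_OK) {
        fprintf(stderr, "decode failed: %s\n", aom_codec_error(ctx));
        return -1;
      }
      aom_codec_iter_t iter = NULL; /* must start out NULL */
      aom_image_t *img;
      while ((img = aom_codec_get_frame(ctx, &iter)) != NULL) {
        /* img->planes[] and img->stride[] describe one displayable frame;
         * zero, one, or many frames may come back, depending on the codec. */
      }
      return 0;
    }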